From c8a1192b53915d595ad8b7a37945c64ec248673f Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 25 Feb 2022 18:52:21 +0300 Subject: [PATCH 0001/1022] Optimize WAL storage in safekeeper (#1318) When several AppendRequest's can be read from socket without blocking, they are processed together and fsync() to segment file is only called once. Segment file is no longer opened for every write request, now last opened file is cached inside PhysicalStorage. New metric for WAL flushes was added to the storage, FLUSH_WAL_SECONDS. More errors were added to storage for non-sequential WAL writes, now write_lsn can be moved only with calls to truncate_lsn(new_lsn). New messages have been added to ProposerAcceptorMessage enum. They can't be deserialized directly and now are used only for optimizing flushes. Existing protocol wasn't changed and flush will be called for every AppendRequest, as it was before. --- walkeeper/src/handler.rs | 12 ++ walkeeper/src/receive_wal.rs | 136 +++++++++--- walkeeper/src/safekeeper.rs | 45 +++- walkeeper/src/wal_storage.rs | 405 ++++++++++++++++++++--------------- 4 files changed, 392 insertions(+), 206 deletions(-) diff --git a/walkeeper/src/handler.rs b/walkeeper/src/handler.rs index 5367954842..d1ead5cb37 100644 --- a/walkeeper/src/handler.rs +++ b/walkeeper/src/handler.rs @@ -3,6 +3,7 @@ use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; use crate::receive_wal::ReceiveWalConn; +use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage}; use crate::send_wal::ReplicationConn; use crate::timeline::{Timeline, TimelineTools}; use crate::SafeKeeperConf; @@ -160,6 +161,17 @@ impl SafekeeperPostgresHandler { } } + /// Shortcut for calling `process_msg` in the timeline. + pub fn process_safekeeper_msg( + &self, + msg: &ProposerAcceptorMessage, + ) -> Result> { + self.timeline + .get() + .process_msg(msg) + .context("failed to process ProposerAcceptorMessage") + } + /// /// Handle IDENTIFY_SYSTEM replication command /// diff --git a/walkeeper/src/receive_wal.rs b/walkeeper/src/receive_wal.rs index b9420714fc..e6b12a0d81 100644 --- a/walkeeper/src/receive_wal.rs +++ b/walkeeper/src/receive_wal.rs @@ -2,15 +2,21 @@ //! Gets messages from the network, passes them down to consensus module and //! sends replies back. -use anyhow::{bail, Context, Result}; -use bytes::Bytes; +use anyhow::{anyhow, bail, Result}; + use bytes::BytesMut; use tokio::sync::mpsc::UnboundedSender; use tracing::*; +use zenith_utils::sock_split::ReadStream; use crate::timeline::Timeline; + use std::net::SocketAddr; +use std::sync::mpsc::channel; +use std::sync::mpsc::Receiver; + use std::sync::Arc; +use std::thread; use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; @@ -46,21 +52,6 @@ impl<'pg> ReceiveWalConn<'pg> { } } - // Read and extract the bytes of a `CopyData` message from the postgres instance - fn read_msg_bytes(&mut self) -> Result { - match self.pg_backend.read_message()? 
{ - Some(FeMessage::CopyData(bytes)) => Ok(bytes), - Some(msg) => bail!("expected `CopyData` message, found {:?}", msg), - None => bail!("connection closed unexpectedly"), - } - } - - // Read and parse message sent from the postgres instance - fn read_msg(&mut self) -> Result { - let data = self.read_msg_bytes()?; - ProposerAcceptorMessage::parse(data) - } - // Send message to the postgres fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> Result<()> { let mut buf = BytesMut::with_capacity(128); @@ -77,18 +68,22 @@ impl<'pg> ReceiveWalConn<'pg> { self.pg_backend .write_message(&BeMessage::CopyBothResponse)?; + let r = self + .pg_backend + .take_stream_in() + .ok_or_else(|| anyhow!("failed to take read stream from pgbackend"))?; + let mut poll_reader = ProposerPollStream::new(r)?; + // Receive information about server - let mut msg = self - .read_msg() - .context("failed to receive proposer greeting")?; - match msg { + let next_msg = poll_reader.recv_msg()?; + match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( "start handshake with wal proposer {} sysid {} timeline {}", self.peer_addr, greeting.system_id, greeting.tli, ); } - _ => bail!("unexpected message {:?} instead of greeting", msg), + _ => bail!("unexpected message {:?} instead of greeting", next_msg), } // Register the connection and defer unregister. @@ -100,16 +95,97 @@ impl<'pg> ReceiveWalConn<'pg> { callmemaybe_tx: spg.tx.clone(), }; + let mut next_msg = Some(next_msg); + loop { - let reply = spg - .timeline - .get() - .process_msg(&msg) - .context("failed to process ProposerAcceptorMessage")?; - if let Some(reply) = reply { - self.write_msg(&reply)?; + if matches!(next_msg, Some(ProposerAcceptorMessage::AppendRequest(_))) { + // poll AppendRequest's without blocking and write WAL to disk without flushing, + // while it's readily available + while let Some(ProposerAcceptorMessage::AppendRequest(append_request)) = next_msg { + let msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); + + let reply = spg.process_safekeeper_msg(&msg)?; + if let Some(reply) = reply { + self.write_msg(&reply)?; + } + + next_msg = poll_reader.poll_msg(); + } + + // flush all written WAL to the disk + let reply = spg.process_safekeeper_msg(&ProposerAcceptorMessage::FlushWAL)?; + if let Some(reply) = reply { + self.write_msg(&reply)?; + } + } else if let Some(msg) = next_msg.take() { + // process other message + let reply = spg.process_safekeeper_msg(&msg)?; + if let Some(reply) = reply { + self.write_msg(&reply)?; + } } - msg = self.read_msg()?; + + // blocking wait for the next message + if next_msg.is_none() { + next_msg = Some(poll_reader.recv_msg()?); + } + } + } +} + +struct ProposerPollStream { + msg_rx: Receiver, + read_thread: Option>>, +} + +impl ProposerPollStream { + fn new(mut r: ReadStream) -> Result { + let (msg_tx, msg_rx) = channel(); + + let read_thread = thread::Builder::new() + .name("Read WAL thread".into()) + .spawn(move || -> Result<()> { + loop { + let copy_data = match FeMessage::read(&mut r)? 
{ + Some(FeMessage::CopyData(bytes)) => bytes, + Some(msg) => bail!("expected `CopyData` message, found {:?}", msg), + None => bail!("connection closed unexpectedly"), + }; + + let msg = ProposerAcceptorMessage::parse(copy_data)?; + msg_tx.send(msg)?; + } + // msg_tx will be dropped here, this will also close msg_rx + })?; + + Ok(Self { + msg_rx, + read_thread: Some(read_thread), + }) + } + + fn recv_msg(&mut self) -> Result { + self.msg_rx.recv().map_err(|_| { + // return error from the read thread + let res = match self.read_thread.take() { + Some(thread) => thread.join(), + None => return anyhow!("read thread is gone"), + }; + + match res { + Ok(Ok(())) => anyhow!("unexpected result from read thread"), + Err(err) => anyhow!("read thread panicked: {:?}", err), + Ok(Err(err)) => err, + } + }) + } + + fn poll_msg(&mut self) -> Option { + let res = self.msg_rx.try_recv(); + + match res { + Err(_) => None, + Ok(msg) => Some(msg), } } } diff --git a/walkeeper/src/safekeeper.rs b/walkeeper/src/safekeeper.rs index 981a0f4d57..fa624bb18f 100644 --- a/walkeeper/src/safekeeper.rs +++ b/walkeeper/src/safekeeper.rs @@ -301,6 +301,8 @@ pub enum ProposerAcceptorMessage { VoteRequest(VoteRequest), Elected(ProposerElected), AppendRequest(AppendRequest), + NoFlushAppendRequest(AppendRequest), + FlushWAL, } impl ProposerAcceptorMessage { @@ -499,7 +501,11 @@ where ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg), ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg), ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg), - ProposerAcceptorMessage::AppendRequest(msg) => self.handle_append_request(msg), + ProposerAcceptorMessage::AppendRequest(msg) => self.handle_append_request(msg, true), + ProposerAcceptorMessage::NoFlushAppendRequest(msg) => { + self.handle_append_request(msg, false) + } + ProposerAcceptorMessage::FlushWAL => self.handle_flush(), } } @@ -605,7 +611,10 @@ where return Ok(None); } - // truncate wal, update the lsns + // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to + // intersection of our history and history from msg + + // truncate wal, update the LSNs self.wal_store.truncate_wal(msg.start_streaming_at)?; // and now adopt term history from proposer @@ -622,6 +631,7 @@ where fn handle_append_request( &mut self, msg: &AppendRequest, + mut require_flush: bool, ) -> Result> { if self.s.acceptor_state.term < msg.h.term { bail!("got AppendRequest before ProposerElected"); @@ -650,9 +660,15 @@ where if self.s.wal_start_lsn == Lsn(0) { self.s.wal_start_lsn = msg.h.begin_lsn; sync_control_file = true; + require_flush = true; } } + // flush wal to the disk, if required + if require_flush { + self.wal_store.flush_wal()?; + } + // Advance commit_lsn taking into account what we have locally. // commit_lsn can be 0, being unknown to new walproposer while he hasn't // collected majority of its epoch acks yet, ignore it in this case. @@ -670,11 +686,9 @@ where } self.truncate_lsn = msg.h.truncate_lsn; - /* - * Update truncate and commit LSN in control file. - * To avoid negative impact on performance of extra fsync, do it only - * when truncate_lsn delta exceeds WAL segment size. - */ + // Update truncate and commit LSN in control file. + // To avoid negative impact on performance of extra fsync, do it only + // when truncate_lsn delta exceeds WAL segment size. 
sync_control_file |= self.s.truncate_lsn + (self.s.server.wal_seg_size as u64) < self.truncate_lsn; if sync_control_file { @@ -686,6 +700,11 @@ where self.control_store.persist(&self.s)?; } + // If flush_lsn hasn't updated, AppendResponse is not very useful. + if !require_flush { + return Ok(None); + } + let resp = self.append_response(); trace!( "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, resp {:?}", @@ -697,6 +716,14 @@ where ); Ok(Some(AcceptorProposerMessage::AppendResponse(resp))) } + + /// Flush WAL to disk. Return AppendResponse with latest LSNs. + fn handle_flush(&mut self) -> Result> { + self.wal_store.flush_wal()?; + Ok(Some(AcceptorProposerMessage::AppendResponse( + self.append_response(), + ))) + } } #[cfg(test)] @@ -738,6 +765,10 @@ mod tests { self.lsn = end_pos; Ok(()) } + + fn flush_wal(&mut self) -> Result<()> { + Ok(()) + } } #[test] diff --git a/walkeeper/src/wal_storage.rs b/walkeeper/src/wal_storage.rs index f8abc26af9..73eccd0ae8 100644 --- a/walkeeper/src/wal_storage.rs +++ b/walkeeper/src/wal_storage.rs @@ -7,7 +7,7 @@ //! //! Note that last file has `.partial` suffix, that's different from postgres. -use anyhow::{anyhow, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use std::io::{Read, Seek, SeekFrom}; use lazy_static::lazy_static; @@ -58,12 +58,20 @@ lazy_static! { DISK_WRITE_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_write_wal_seconds histogram vec"); + static ref FLUSH_WAL_SECONDS: HistogramVec = register_histogram_vec!( + "safekeeper_flush_wal_seconds", + "Seconds spent syncing WAL to a disk, grouped by timeline", + &["tenant_id", "timeline_id"], + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_flush_wal_seconds histogram vec"); } struct WalStorageMetrics { flush_lsn: Gauge, write_wal_bytes: Histogram, write_wal_seconds: Histogram, + flush_wal_seconds: Histogram, } impl WalStorageMetrics { @@ -74,24 +82,38 @@ impl WalStorageMetrics { flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&[&tenant_id, &timeline_id]), write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&tenant_id, &timeline_id]), write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), + flush_wal_seconds: FLUSH_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), } } } pub trait Storage { - /// lsn of last durably stored WAL record. + /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; - /// Init storage with wal_seg_size and read WAL from disk to get latest lsn. + /// Init storage with wal_seg_size and read WAL from disk to get latest LSN. fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()>; - /// Write piece of wal in buf to disk and sync it. + /// Write piece of WAL from buf to disk, but not necessarily sync it. fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; - // Truncate WAL at specified LSN. + /// Truncate WAL at specified LSN, which must be the end of WAL record. fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>; + + /// Durably store WAL on disk, up to the last written WAL record. + fn flush_wal(&mut self) -> Result<()>; } +/// PhysicalStorage is a storage that stores WAL on disk. Writes are separated from flushes +/// for better performance. Storage must be initialized before use. +/// +/// WAL is stored in segments, each segment is a file. Last segment has ".partial" suffix in +/// its filename and may be not fully flushed. 
+/// +/// Relationship of LSNs: +/// `write_lsn` >= `write_record_lsn` >= `flush_record_lsn` +/// +/// When storage is just created, all LSNs are zeroes and there are no segments on disk. pub struct PhysicalStorage { metrics: WalStorageMetrics, zttid: ZTenantTimelineId, @@ -99,27 +121,29 @@ pub struct PhysicalStorage { conf: SafeKeeperConf, // fields below are filled upon initialization - - // None if unitialized, Some(lsn) if storage is initialized + /// None if unitialized, Some(usize) if storage is initialized. wal_seg_size: Option, - // Relationship of lsns: - // `write_lsn` >= `write_record_lsn` >= `flush_record_lsn` - // - // All lsns are zeroes, if storage is just created, and there are no segments on disk. - - // Written to disk, but possibly still in the cache and not fully persisted. - // Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. + /// Written to disk, but possibly still in the cache and not fully persisted. + /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. write_lsn: Lsn, - // The LSN of the last WAL record written to disk. Still can be not fully flushed. + /// The LSN of the last WAL record written to disk. Still can be not fully flushed. write_record_lsn: Lsn, - // The LSN of the last WAL record flushed to disk. + /// The LSN of the last WAL record flushed to disk. flush_record_lsn: Lsn, - // Decoder is required for detecting boundaries of WAL records. + /// Decoder is required for detecting boundaries of WAL records. decoder: WalStreamDecoder, + + /// Cached open file for the last segment. + /// + /// If Some(file) is open, then it always: + /// - has ".partial" suffix + /// - points to write_lsn, so no seek is needed for writing + /// - doesn't point to the end of the segment + file: Option, } impl PhysicalStorage { @@ -135,128 +159,146 @@ impl PhysicalStorage { write_record_lsn: Lsn(0), flush_record_lsn: Lsn(0), decoder: WalStreamDecoder::new(Lsn(0)), + file: None, } } - // wrapper for flush_lsn updates that also updates metrics + /// Wrapper for flush_lsn updates that also updates metrics. fn update_flush_lsn(&mut self) { self.flush_record_lsn = self.write_record_lsn; self.metrics.flush_lsn.set(self.flush_record_lsn.0 as f64); } - /// Helper returning full path to WAL segment file and its .partial brother. - fn wal_file_paths(&self, segno: XLogSegNo) -> Result<(PathBuf, PathBuf)> { - let wal_seg_size = self - .wal_seg_size - .ok_or_else(|| anyhow!("wal_seg_size is not initialized"))?; - - let wal_file_name = XLogFileName(PG_TLI, segno, wal_seg_size); - let wal_file_path = self.timeline_dir.join(wal_file_name.clone()); - let wal_file_partial_path = self.timeline_dir.join(wal_file_name + ".partial"); - Ok((wal_file_path, wal_file_partial_path)) + /// Call fdatasync if config requires so. + fn fdatasync_file(&self, file: &mut File) -> Result<()> { + if !self.conf.no_sync { + self.metrics + .flush_wal_seconds + .observe_closure_duration(|| file.sync_data())?; + } + Ok(()) } - // TODO: this function is going to be refactored soon, what will change: - // - flush will be called separately from write_wal, this function - // will only write bytes to disk - // - File will be cached in PhysicalStorage, to remove extra syscalls, - // such as open(), seek(), close() - fn write_and_flush(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { + /// Call fsync if config requires so. 
+ fn fsync_file(&self, file: &mut File) -> Result<()> { + if !self.conf.no_sync { + self.metrics + .flush_wal_seconds + .observe_closure_duration(|| file.sync_all())?; + } + Ok(()) + } + + /// Open or create WAL segment file. Caller must call seek to the wanted position. + /// Returns `file` and `is_partial`. + fn open_or_create(&self, segno: XLogSegNo, wal_seg_size: usize) -> Result<(File, bool)> { + let (wal_file_path, wal_file_partial_path) = + wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; + + // Try to open already completed segment + if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { + Ok((file, false)) + } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) { + // Try to open existing partial file + Ok((file, true)) + } else { + // Create and fill new partial file + let mut file = OpenOptions::new() + .create(true) + .write(true) + .open(&wal_file_partial_path) + .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?; + + write_zeroes(&mut file, wal_seg_size)?; + self.fsync_file(&mut file)?; + Ok((file, true)) + } + } + + /// Write WAL bytes, which are known to be located in a single WAL segment. + fn write_in_segment( + &mut self, + segno: u64, + xlogoff: usize, + buf: &[u8], + wal_seg_size: usize, + ) -> Result<()> { + let mut file = if let Some(file) = self.file.take() { + file + } else { + let (mut file, is_partial) = self.open_or_create(segno, wal_seg_size)?; + assert!(is_partial, "unexpected write into non-partial segment file"); + file.seek(SeekFrom::Start(xlogoff as u64))?; + file + }; + + file.write_all(buf)?; + + if xlogoff + buf.len() == wal_seg_size { + // If we reached the end of a WAL segment, flush and close it. + self.fdatasync_file(&mut file)?; + + // Rename partial file to completed file + let (wal_file_path, wal_file_partial_path) = + wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; + fs::rename(&wal_file_partial_path, &wal_file_path)?; + } else { + // otherwise, file can be reused later + self.file = Some(file); + } + + Ok(()) + } + + /// Writes WAL to the segment files, until everything is writed. If some segments + /// are fully written, they are flushed to disk. The last (partial) segment can + /// be flushed separately later. + /// + /// Updates `write_lsn`. + fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> { let wal_seg_size = self .wal_seg_size .ok_or_else(|| anyhow!("wal_seg_size is not initialized"))?; - let mut bytes_left: usize = buf.len(); - let mut bytes_written: usize = 0; - let mut partial; - let mut start_pos = startpos; - const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ]; - - /* Extract WAL location for this block */ - let mut xlogoff = start_pos.segment_offset(wal_seg_size) as usize; - - while bytes_left != 0 { - let bytes_to_write; - - /* - * If crossing a WAL boundary, only write up until we reach wal - * segment size. 
- */ - if xlogoff + bytes_left > wal_seg_size { - bytes_to_write = wal_seg_size - xlogoff; - } else { - bytes_to_write = bytes_left; + if self.write_lsn != pos { + // need to flush the file before discarding it + if let Some(mut file) = self.file.take() { + self.fdatasync_file(&mut file)?; } - /* Open file */ - let segno = start_pos.segment_number(wal_seg_size); - let (wal_file_path, wal_file_partial_path) = self.wal_file_paths(segno)?; - { - let mut wal_file: File; - /* Try to open already completed segment */ - if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { - wal_file = file; - partial = false; - } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) - { - /* Try to open existed partial file */ - wal_file = file; - partial = true; - } else { - /* Create and fill new partial file */ - partial = true; - match OpenOptions::new() - .create(true) - .write(true) - .open(&wal_file_partial_path) - { - Ok(mut file) => { - for _ in 0..(wal_seg_size / XLOG_BLCKSZ) { - file.write_all(ZERO_BLOCK)?; - } - wal_file = file; - } - Err(e) => { - error!("Failed to open log file {:?}: {}", &wal_file_path, e); - return Err(e.into()); - } - } - } - wal_file.seek(SeekFrom::Start(xlogoff as u64))?; - wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?; - - // Flush file, if not said otherwise - if !self.conf.no_sync { - wal_file.sync_all()?; - } - } - /* Write was successful, advance our position */ - bytes_written += bytes_to_write; - bytes_left -= bytes_to_write; - start_pos += bytes_to_write as u64; - xlogoff += bytes_to_write; - - /* Did we reach the end of a WAL segment? */ - if start_pos.segment_offset(wal_seg_size) == 0 { - xlogoff = 0; - if partial { - fs::rename(&wal_file_partial_path, &wal_file_path)?; - } - } + self.write_lsn = pos; } + + while !buf.is_empty() { + // Extract WAL location for this block + let xlogoff = self.write_lsn.segment_offset(wal_seg_size) as usize; + let segno = self.write_lsn.segment_number(wal_seg_size); + + // If crossing a WAL boundary, only write up until we reach wal segment size. + let bytes_write = if xlogoff + buf.len() > wal_seg_size { + wal_seg_size - xlogoff + } else { + buf.len() + }; + + self.write_in_segment(segno, xlogoff, &buf[..bytes_write], wal_seg_size)?; + self.write_lsn += bytes_write as u64; + buf = &buf[bytes_write..]; + } + Ok(()) } } impl Storage for PhysicalStorage { - // flush_lsn returns lsn of last durably stored WAL record. + /// flush_lsn returns LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn { self.flush_record_lsn } - // Storage needs to know wal_seg_size to know which segment to read/write, but - // wal_seg_size is not always known at the moment of storage creation. This method - // allows to postpone its initialization. + /// Storage needs to know wal_seg_size to know which segment to read/write, but + /// wal_seg_size is not always known at the moment of storage creation. This method + /// allows to postpone its initialization. fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()> { if state.server.wal_seg_size == 0 { // wal_seg_size is still unknown @@ -294,29 +336,31 @@ impl Storage for PhysicalStorage { Ok(()) } - // Write and flush WAL to disk. + /// Write WAL to disk. fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { + // Disallow any non-sequential writes, which can result in gaps or overwrites. + // If we need to move the pointer, use truncate_wal() instead. 
if self.write_lsn > startpos { - warn!( + bail!( "write_wal rewrites WAL written before, write_lsn={}, startpos={}", - self.write_lsn, startpos + self.write_lsn, + startpos ); } - if self.write_lsn < startpos { - warn!( + if self.write_lsn < startpos && self.write_lsn != Lsn(0) { + bail!( "write_wal creates gap in written WAL, write_lsn={}, startpos={}", - self.write_lsn, startpos + self.write_lsn, + startpos ); - // TODO: return error if write_lsn is not zero } { let _timer = self.metrics.write_wal_seconds.start_timer(); - self.write_and_flush(startpos, buf)?; + self.write_exact(startpos, buf)?; } - // WAL is written and flushed, updating lsns - self.write_lsn = startpos + buf.len() as u64; + // WAL is written, updating write metrics self.metrics.write_wal_bytes.observe(buf.len() as f64); // figure out last record's end lsn for reporting (if we got the @@ -339,69 +383,67 @@ impl Storage for PhysicalStorage { } } + Ok(()) + } + + fn flush_wal(&mut self) -> Result<()> { + if self.flush_record_lsn == self.write_record_lsn { + // no need to do extra flush + return Ok(()); + } + + if let Some(mut unflushed_file) = self.file.take() { + self.fdatasync_file(&mut unflushed_file)?; + self.file = Some(unflushed_file); + } else { + // We have unflushed data (write_lsn != flush_lsn), but no file. + // This should only happen if last file was fully written and flushed, + // but haven't updated flush_lsn yet. + assert!(self.write_lsn.segment_offset(self.wal_seg_size.unwrap()) == 0); + } + + // everything is flushed now, let's update flush_lsn self.update_flush_lsn(); Ok(()) } - // Truncate written WAL by removing all WAL segments after the given LSN. - // end_pos must point to the end of the WAL record. + /// Truncate written WAL by removing all WAL segments after the given LSN. + /// end_pos must point to the end of the WAL record. 
fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { let wal_seg_size = self .wal_seg_size .ok_or_else(|| anyhow!("wal_seg_size is not initialized"))?; - // TODO: cross check divergence point - - // nothing to truncate - if self.write_lsn == Lsn(0) { - return Ok(()); - } - // Streaming must not create a hole, so truncate cannot be called on non-written lsn - assert!(self.write_lsn >= end_pos); + assert!(self.write_lsn == Lsn(0) || self.write_lsn >= end_pos); - // open segment files and delete or fill end with zeroes - - let partial; - const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ]; - - /* Extract WAL location for this block */ - let mut xlogoff = end_pos.segment_offset(wal_seg_size) as usize; - - /* Open file */ - let mut segno = end_pos.segment_number(wal_seg_size); - let (wal_file_path, wal_file_partial_path) = self.wal_file_paths(segno)?; - { - let mut wal_file: File; - /* Try to open already completed segment */ - if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { - wal_file = file; - partial = false; - } else { - wal_file = OpenOptions::new() - .write(true) - .open(&wal_file_partial_path)?; - partial = true; - } - wal_file.seek(SeekFrom::Start(xlogoff as u64))?; - while xlogoff < wal_seg_size { - let bytes_to_write = min(XLOG_BLCKSZ, wal_seg_size - xlogoff); - wal_file.write_all(&ZERO_BLOCK[0..bytes_to_write])?; - xlogoff += bytes_to_write; - } - // Flush file, if not said otherwise - if !self.conf.no_sync { - wal_file.sync_all()?; - } + // Close previously opened file, if any + if let Some(mut unflushed_file) = self.file.take() { + self.fdatasync_file(&mut unflushed_file)?; } - if !partial { + + let xlogoff = end_pos.segment_offset(wal_seg_size) as usize; + let segno = end_pos.segment_number(wal_seg_size); + let (mut file, is_partial) = self.open_or_create(segno, wal_seg_size)?; + + // Fill end with zeroes + file.seek(SeekFrom::Start(xlogoff as u64))?; + write_zeroes(&mut file, wal_seg_size - xlogoff)?; + self.fdatasync_file(&mut file)?; + + if !is_partial { // Make segment partial once again + let (wal_file_path, wal_file_partial_path) = + wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; fs::rename(&wal_file_path, &wal_file_partial_path)?; } + // Remove all subsequent segments + let mut segno = segno; loop { segno += 1; - let (wal_file_path, wal_file_partial_path) = self.wal_file_paths(segno)?; + let (wal_file_path, wal_file_partial_path) = + wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; // TODO: better use fs::try_exists which is currenty avaialble only in nightly build if wal_file_path.exists() { fs::remove_file(&wal_file_path)?; @@ -412,7 +454,7 @@ impl Storage for PhysicalStorage { } } - // Update lsns + // Update LSNs self.write_lsn = end_pos; self.write_record_lsn = end_pos; self.update_flush_lsn(); @@ -491,3 +533,28 @@ impl WalReader { }) } } + +/// Zero block for filling created WAL segments. +const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ]; + +/// Helper for filling file with zeroes. +fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { + while count >= XLOG_BLCKSZ { + file.write_all(ZERO_BLOCK)?; + count -= XLOG_BLCKSZ; + } + file.write_all(&ZERO_BLOCK[0..count])?; + Ok(()) +} + +/// Helper returning full path to WAL segment file and its .partial brother. 
+fn wal_file_paths( + timeline_dir: &Path, + segno: XLogSegNo, + wal_seg_size: usize, +) -> Result<(PathBuf, PathBuf)> { + let wal_file_name = XLogFileName(PG_TLI, segno, wal_seg_size); + let wal_file_path = timeline_dir.join(wal_file_name.clone()); + let wal_file_partial_path = timeline_dir.join(wal_file_name + ".partial"); + Ok((wal_file_path, wal_file_partial_path)) +} From 850dfd02df7f32eb9e2ab0bd647124f803ca57e5 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Sat, 26 Feb 2022 23:33:16 +0300 Subject: [PATCH 0002/1022] Release deployment (#1331) * new deployment flow for staging and production * ansible playbooks and circleci config fixes * cleanup before merge * additional cleanup before merge * debug deployment to staging env * debug deployment to staging env * debug deployment to staging env * debug deployment to staging env * debug deployment to staging env * debug deployment to staging env * bianries artifacts path fix for ansible playbooks * deployment flow refactored * base64 decode fix for ssh key * fix for console notification and production deploy settings * cleanup after deployment tests * fix - trigger release binaries download for production deploy --- .circleci/ansible/ansible.cfg | 10 + .circleci/ansible/ansible.ssh.cfg | 11 ++ .circleci/ansible/deploy.yaml | 174 +++++++++++++++++ .circleci/ansible/get_binaries.sh | 52 +++++ .circleci/ansible/production.hosts | 7 + .circleci/ansible/staging.hosts | 7 + .circleci/ansible/systemd/pageserver.service | 18 ++ .circleci/ansible/systemd/safekeeper.service | 18 ++ .circleci/config.yml | 181 +++++++++++++++--- .circleci/helm-values/production.proxy.yaml | 35 ++++ .../staging.proxy.yaml} | 0 .circleci/storage-redeploy.playbook.yml | 138 ------------- 12 files changed, 484 insertions(+), 167 deletions(-) create mode 100644 .circleci/ansible/ansible.cfg create mode 100644 .circleci/ansible/ansible.ssh.cfg create mode 100644 .circleci/ansible/deploy.yaml create mode 100755 .circleci/ansible/get_binaries.sh create mode 100644 .circleci/ansible/production.hosts create mode 100644 .circleci/ansible/staging.hosts create mode 100644 .circleci/ansible/systemd/pageserver.service create mode 100644 .circleci/ansible/systemd/safekeeper.service create mode 100644 .circleci/helm-values/production.proxy.yaml rename .circleci/{proxy.staging.yaml => helm-values/staging.proxy.yaml} (100%) delete mode 100644 .circleci/storage-redeploy.playbook.yml diff --git a/.circleci/ansible/ansible.cfg b/.circleci/ansible/ansible.cfg new file mode 100644 index 0000000000..e3daf3abe3 --- /dev/null +++ b/.circleci/ansible/ansible.cfg @@ -0,0 +1,10 @@ +[defaults] + +localhost_warning = False +host_key_checking = False +timeout = 30 + +[ssh_connection] +ssh_args = -F ./ansible.ssh.cfg +scp_if_ssh = True +pipelining = True diff --git a/.circleci/ansible/ansible.ssh.cfg b/.circleci/ansible/ansible.ssh.cfg new file mode 100644 index 0000000000..91f673718e --- /dev/null +++ b/.circleci/ansible/ansible.ssh.cfg @@ -0,0 +1,11 @@ +Host tele.zenith.tech + User admin + Port 3023 + StrictHostKeyChecking no + UserKnownHostsFile /dev/null + +Host * !tele.zenith.tech + User admin + StrictHostKeyChecking no + UserKnownHostsFile /dev/null + ProxyJump tele.zenith.tech diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml new file mode 100644 index 0000000000..06385aa0d3 --- /dev/null +++ b/.circleci/ansible/deploy.yaml @@ -0,0 +1,174 @@ +- name: Upload Zenith binaries + hosts: pageservers:safekeepers + gather_facts: False + remote_user: admin + vars: + 
force_deploy: false + + tasks: + + - name: get latest version of Zenith binaries + ignore_errors: true + register: current_version_file + set_fact: + current_version: "{{ lookup('file', '.zenith_current_version') | trim }}" + tags: + - pageserver + - safekeeper + + - name: set zero value for current_version + when: current_version_file is failed + set_fact: + current_version: "0" + tags: + - pageserver + - safekeeper + + - name: get deployed version from content of remote file + ignore_errors: true + ansible.builtin.slurp: + src: /usr/local/.zenith_current_version + register: remote_version_file + tags: + - pageserver + - safekeeper + + - name: decode remote file content + when: remote_version_file is succeeded + set_fact: + remote_version: "{{ remote_version_file['content'] | b64decode | trim }}" + tags: + - pageserver + - safekeeper + + - name: set zero value for remote_version + when: remote_version_file is failed + set_fact: + remote_version: "0" + tags: + - pageserver + - safekeeper + + - name: inform about versions + debug: msg="Version to deploy - {{ current_version }}, version on storage node - {{ remote_version }}" + tags: + - pageserver + - safekeeper + + + - name: upload and extract Zenith binaries to /usr/local + when: current_version > remote_version or force_deploy + ansible.builtin.unarchive: + owner: root + group: root + src: zenith_install.tar.gz + dest: /usr/local + become: true + tags: + - pageserver + - safekeeper + - binaries + - putbinaries + +- name: Deploy pageserver + hosts: pageservers + gather_facts: False + remote_user: admin + vars: + force_deploy: false + + tasks: + - name: init pageserver + when: current_version > remote_version or force_deploy + shell: + cmd: sudo -u pageserver /usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data + args: + creates: "/storage/pageserver/data/tenants" + environment: + ZENITH_REPO_DIR: "/storage/pageserver/data" + LD_LIBRARY_PATH: "/usr/local/lib" + become: true + tags: + - pageserver + + - name: upload systemd service definition + when: current_version > remote_version or force_deploy + ansible.builtin.template: + src: systemd/pageserver.service + dest: /etc/systemd/system/pageserver.service + owner: root + group: root + mode: '0644' + become: true + tags: + - pageserver + + - name: start systemd service + when: current_version > remote_version or force_deploy + ansible.builtin.systemd: + daemon_reload: yes + name: pageserver + enabled: yes + state: restarted + become: true + tags: + - pageserver + + - name: post version to console + when: (current_version > remote_version or force_deploy) and console_mgmt_base_url is defined + shell: + cmd: | + INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) + curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID + tags: + - pageserver + +- name: Deploy safekeeper + hosts: safekeepers + gather_facts: False + remote_user: admin + vars: + force_deploy: false + + tasks: + + # in the future safekeepers should discover pageservers byself + # but currently use first pageserver that was discovered + - name: set first pageserver var for safekeepers + when: current_version > remote_version or force_deploy + set_fact: + first_pageserver: "{{ hostvars[groups['pageservers'][0]]['inventory_hostname'] }}" + tags: + - safekeeper + + - name: upload systemd service definition + when: current_version > remote_version or force_deploy + ansible.builtin.template: + src: 
systemd/safekeeper.service + dest: /etc/systemd/system/safekeeper.service + owner: root + group: root + mode: '0644' + become: true + tags: + - safekeeper + + - name: start systemd service + when: current_version > remote_version or force_deploy + ansible.builtin.systemd: + daemon_reload: yes + name: safekeeper + enabled: yes + state: restarted + become: true + tags: + - safekeeper + + - name: post version to console + when: (current_version > remote_version or force_deploy) and console_mgmt_base_url is defined + shell: + cmd: | + INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) + curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ hostvars.localhost.zenith.console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + tags: + - safekeeper diff --git a/.circleci/ansible/get_binaries.sh b/.circleci/ansible/get_binaries.sh new file mode 100755 index 0000000000..242a9e87e2 --- /dev/null +++ b/.circleci/ansible/get_binaries.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +set -e + +RELEASE=${RELEASE:-false} + +# look at docker hub for latest tag fo zenith docker image +if [ "${RELEASE}" = "true" ]; then + echo "search latest relase tag" + VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1) + if [ -z "${VERSION}" ]; then + echo "no any docker tags found, exiting..." + exit 1 + else + TAG="release-${VERSION}" + fi +else + echo "search latest dev tag" + VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep -v release | tail -1) + if [ -z "${VERSION}" ]; then + echo "no any docker tags found, exiting..." + exit 1 + else + TAG="${VERSION}" + fi +fi + +echo "found ${VERSION}" + +# do initial cleanup +rm -rf zenith_install postgres_install.tar.gz zenith_install.tar.gz .zenith_current_version +mkdir zenith_install + +# retrive binaries from docker image +echo "getting binaries from docker image" +docker pull --quiet zenithdb/zenith:${TAG} +ID=$(docker create zenithdb/zenith:${TAG}) +docker cp ${ID}:/data/postgres_install.tar.gz . +tar -xzf postgres_install.tar.gz -C zenith_install +docker cp ${ID}:/usr/local/bin/pageserver zenith_install/bin/ +docker cp ${ID}:/usr/local/bin/safekeeper zenith_install/bin/ +docker cp ${ID}:/usr/local/bin/proxy zenith_install/bin/ +docker cp ${ID}:/usr/local/bin/postgres zenith_install/bin/ +docker rm -vf ${ID} + +# store version to file (for ansible playbooks) and create binaries tarball +echo ${VERSION} > zenith_install/.zenith_current_version +echo ${VERSION} > .zenith_current_version +tar -czf zenith_install.tar.gz -C zenith_install . 
+ +# do final cleaup +rm -rf zenith_install postgres_install.tar.gz diff --git a/.circleci/ansible/production.hosts b/.circleci/ansible/production.hosts new file mode 100644 index 0000000000..c5b4f664a6 --- /dev/null +++ b/.circleci/ansible/production.hosts @@ -0,0 +1,7 @@ +[pageservers] +zenith-1-ps-1 + +[safekeepers] +zenith-1-sk-1 +zenith-1-sk-2 +zenith-1-sk-3 diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts new file mode 100644 index 0000000000..e625120bf3 --- /dev/null +++ b/.circleci/ansible/staging.hosts @@ -0,0 +1,7 @@ +[pageservers] +zenith-us-stage-ps-1 + +[safekeepers] +zenith-us-stage-sk-1 +zenith-us-stage-sk-2 +zenith-us-stage-sk-3 diff --git a/.circleci/ansible/systemd/pageserver.service b/.circleci/ansible/systemd/pageserver.service new file mode 100644 index 0000000000..d346643e58 --- /dev/null +++ b/.circleci/ansible/systemd/pageserver.service @@ -0,0 +1,18 @@ +[Unit] +Description=Zenith pageserver +After=network.target auditd.service + +[Service] +Type=simple +User=pageserver +Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib +ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /storage/pageserver/data +ExecReload=/bin/kill -HUP $MAINPID +KillMode=mixed +KillSignal=SIGINT +Restart=on-failure +TimeoutSec=10 +LimitNOFILE=30000000 + +[Install] +WantedBy=multi-user.target diff --git a/.circleci/ansible/systemd/safekeeper.service b/.circleci/ansible/systemd/safekeeper.service new file mode 100644 index 0000000000..e75602b609 --- /dev/null +++ b/.circleci/ansible/systemd/safekeeper.service @@ -0,0 +1,18 @@ +[Unit] +Description=Zenith safekeeper +After=network.target auditd.service + +[Service] +Type=simple +User=safekeeper +Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib +ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data +ExecReload=/bin/kill -HUP $MAINPID +KillMode=mixed +KillSignal=SIGINT +Restart=on-failure +TimeoutSec=10 +LimitNOFILE=30000000 + +[Install] +WantedBy=multi-user.target diff --git a/.circleci/config.yml b/.circleci/config.yml index 73c487c301..db9fc31334 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -471,46 +471,78 @@ jobs: docker build -t zenithdb/compute-node:latest vendor/postgres && docker push zenithdb/compute-node:latest docker tag zenithdb/compute-node:latest zenithdb/compute-node:${DOCKER_TAG} && docker push zenithdb/compute-node:${DOCKER_TAG} + # Build production zenithdb/zenith:release image and push it to Docker hub + docker-image-release: + docker: + - image: cimg/base:2021.04 + steps: + - checkout + - setup_remote_docker: + docker_layer_caching: true + - run: + name: Init postgres submodule + command: git submodule update --init --depth 1 + - run: + name: Build and push Docker image + command: | + echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin + DOCKER_TAG="release-$(git log --oneline|wc -l)" + docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:release . 
&& docker push zenithdb/zenith:release + docker tag zenithdb/zenith:release zenithdb/zenith:${DOCKER_TAG} && docker push zenithdb/zenith:${DOCKER_TAG} + + # Build production zenithdb/compute-node:release image and push it to Docker hub + docker-image-compute-release: + docker: + - image: cimg/base:2021.04 + steps: + - checkout + - setup_remote_docker: + docker_layer_caching: true + # Build zenithdb/compute-tools:release image and push it to Docker hub + # TODO: this should probably also use versioned tag, not just :latest. + # XXX: but should it? We build and use it only locally now. + - run: + name: Build and push compute-tools Docker image + command: | + echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin + docker build -t zenithdb/compute-tools:release -f Dockerfile.compute-tools . + docker push zenithdb/compute-tools:release + - run: + name: Init postgres submodule + command: git submodule update --init --depth 1 + - run: + name: Build and push compute-node Docker image + command: | + echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin + DOCKER_TAG="release-$(git log --oneline|wc -l)" + docker build -t zenithdb/compute-node:release vendor/postgres && docker push zenithdb/compute-node:release + docker tag zenithdb/compute-node:release zenithdb/compute-node:${DOCKER_TAG} && docker push zenithdb/compute-node:${DOCKER_TAG} + deploy-staging: docker: - image: cimg/python:3.10 steps: - checkout - setup_remote_docker - - run: - name: Get Zenith binaries - command: | - rm -rf zenith_install postgres_install.tar.gz zenith_install.tar.gz - mkdir zenith_install - DOCKER_TAG=$(git log --oneline|wc -l) - docker pull --quiet zenithdb/zenith:${DOCKER_TAG} - ID=$(docker create zenithdb/zenith:${DOCKER_TAG}) - docker cp $ID:/data/postgres_install.tar.gz . - tar -xzf postgres_install.tar.gz -C zenith_install && rm postgres_install.tar.gz - docker cp $ID:/usr/local/bin/pageserver zenith_install/bin/ - docker cp $ID:/usr/local/bin/safekeeper zenith_install/bin/ - docker cp $ID:/usr/local/bin/proxy zenith_install/bin/ - docker cp $ID:/usr/local/bin/postgres zenith_install/bin/ - docker rm -v $ID - echo ${DOCKER_TAG} | tee zenith_install/.zenith_current_version - tar -czf zenith_install.tar.gz -C zenith_install . 
- ls -la zenith_install.tar.gz - run: name: Setup ansible command: | pip install --progress-bar off --user ansible boto3 - ansible-galaxy collection install amazon.aws - run: - name: Apply re-deploy playbook - environment: - ANSIBLE_HOST_KEY_CHECKING: false + name: Redeploy command: | - echo "${STAGING_SSH_KEY}" | base64 --decode | ssh-add - - export AWS_REGION=${STAGING_AWS_REGION} - export AWS_ACCESS_KEY_ID=${STAGING_AWS_ACCESS_KEY_ID} - export AWS_SECRET_ACCESS_KEY=${STAGING_AWS_SECRET_ACCESS_KEY} - ansible-playbook .circleci/storage-redeploy.playbook.yml - rm -f zenith_install.tar.gz + cd "$(pwd)/.circleci/ansible" + + ./get_binaries.sh + + echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key + echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub + chmod 0600 ssh-key + ssh-add ssh-key + rm -f ssh-key ssh-key-cert.pub + + ansible-playbook deploy.yaml -i staging.hosts + rm -f zenith_install.tar.gz .zenith_current_version deploy-staging-proxy: docker: @@ -533,7 +565,57 @@ jobs: name: Re-deploy proxy command: | DOCKER_TAG=$(git log --oneline|wc -l) - helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/proxy.staging.yaml --set image.tag=${DOCKER_TAG} --wait + helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait + + + deploy-release: + docker: + - image: cimg/python:3.10 + steps: + - checkout + - setup_remote_docker + - run: + name: Setup ansible + command: | + pip install --progress-bar off --user ansible boto3 + - run: + name: Redeploy + command: | + cd "$(pwd)/.circleci/ansible" + + RELEASE=true ./get_binaries.sh + + echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key + echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub + chmod 0600 ssh-key + ssh-add ssh-key + rm -f ssh-key ssh-key-cert.pub + + ansible-playbook deploy.yaml -i production.hosts -e console_mgmt_base_url=http://console-release.local + rm -f zenith_install.tar.gz .zenith_current_version + + deploy-release-proxy: + docker: + - image: cimg/base:2021.04 + environment: + KUBECONFIG: .kubeconfig + steps: + - checkout + - run: + name: Store kubeconfig file + command: | + echo "${PRODUCTION_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + - run: + name: Setup helm v3 + command: | + curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + helm repo add zenithdb https://zenithdb.github.io/helm-charts + - run: + name: Re-deploy proxy + command: | + DOCKER_TAG="release-$(git log --oneline|wc -l)" + helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait # Trigger a new remote CI job remote-ci-trigger: @@ -669,6 +751,47 @@ workflows: - main requires: - docker-image + + - docker-image-release: + # Context gives an ability to login + context: Docker Hub + # Build image only for commits to main + filters: + branches: + only: + - release + requires: + - pg_regress-tests-release + - other-tests-release + - docker-image-compute-release: + # Context gives an ability to login + context: Docker Hub + # Build image only for commits to main + filters: + branches: + only: + - release + requires: + - pg_regress-tests-release + - other-tests-release + - deploy-release: + # Context gives an ability to login + context: Docker Hub + # deploy only for commits to main + filters: + branches: + only: + - release + requires: + - 
docker-image-release + - deploy-release-proxy: + # deploy only for commits to main + filters: + branches: + only: + - release + requires: + - docker-image-release - remote-ci-trigger: # Context passes credentials for gh api context: CI_ACCESS_TOKEN diff --git a/.circleci/helm-values/production.proxy.yaml b/.circleci/helm-values/production.proxy.yaml new file mode 100644 index 0000000000..27aa169c79 --- /dev/null +++ b/.circleci/helm-values/production.proxy.yaml @@ -0,0 +1,35 @@ +# Helm chart values for zenith-proxy. +# This is a YAML-formatted file. + +settings: + authEndpoint: "https://console.zenith.tech/authenticate_proxy_request/" + uri: "https://console.zenith.tech/psql_session/" + +# -- Additional labels for zenith-proxy pods +podLabels: + zenith_service: proxy + zenith_env: production + zenith_region: us-west-2 + zenith_region_slug: oregon + +service: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal + external-dns.alpha.kubernetes.io/hostname: proxy-release.local + type: LoadBalancer + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: start.zenith.tech + +metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack diff --git a/.circleci/proxy.staging.yaml b/.circleci/helm-values/staging.proxy.yaml similarity index 100% rename from .circleci/proxy.staging.yaml rename to .circleci/helm-values/staging.proxy.yaml diff --git a/.circleci/storage-redeploy.playbook.yml b/.circleci/storage-redeploy.playbook.yml deleted file mode 100644 index 8173d81521..0000000000 --- a/.circleci/storage-redeploy.playbook.yml +++ /dev/null @@ -1,138 +0,0 @@ -- name: discover storage nodes - hosts: localhost - connection: local - gather_facts: False - - tasks: - - - name: discover safekeepers - no_log: true - ec2_instance_info: - filters: - "tag:zenith_env": "staging" - "tag:zenith_service": "safekeeper" - register: ec2_safekeepers - - - name: discover pageservers - no_log: true - ec2_instance_info: - filters: - "tag:zenith_env": "staging" - "tag:zenith_service": "pageserver" - register: ec2_pageservers - - - name: add safekeepers to host group - no_log: true - add_host: - name: safekeeper-{{ ansible_loop.index }} - ansible_host: "{{ item.public_ip_address }}" - groups: - - storage - - safekeepers - with_items: "{{ ec2_safekeepers.instances }}" - loop_control: - extended: yes - - - name: add pageservers to host group - no_log: true - add_host: - name: pageserver-{{ ansible_loop.index }} - ansible_host: "{{ item.public_ip_address }}" - groups: - - storage - - pageservers - with_items: "{{ ec2_pageservers.instances }}" - loop_control: - extended: yes - -- name: Retrive versions - hosts: storage - gather_facts: False - remote_user: admin - - tasks: - - - name: Get current version of binaries - set_fact: - current_version: "{{lookup('file', '../zenith_install/.zenith_current_version') }}" - - - name: Check that file with version exists on host - stat: - path: /usr/local/.zenith_current_version - register: version_file - - - name: Try to get current version from the host - when: version_file.stat.exists - ansible.builtin.fetch: - src: /usr/local/.zenith_current_version - dest: 
.remote_version.{{ inventory_hostname }} - fail_on_missing: no - flat: yes - - - name: Store remote version to variable - when: version_file.stat.exists - set_fact: - remote_version: "{{ lookup('file', '.remote_version.{{ inventory_hostname }}') }}" - - - name: Store default value of remote version to variable in case when remote version file not found - when: not version_file.stat.exists - set_fact: - remote_version: "000" - -- name: Extract Zenith binaries - hosts: storage - gather_facts: False - remote_user: admin - - tasks: - - - name: Inform about version conflict - when: current_version <= remote_version - debug: msg="Current version {{ current_version }} LE than remote {{ remote_version }}" - - - name: Extract Zenith binaries to /usr/local - when: current_version > remote_version - ansible.builtin.unarchive: - src: ../zenith_install.tar.gz - dest: /usr/local - become: true - -- name: Restart safekeepers - hosts: safekeepers - gather_facts: False - remote_user: admin - - tasks: - - - name: Inform about version conflict - when: current_version <= remote_version - debug: msg="Current version {{ current_version }} LE than remote {{ remote_version }}" - - - name: Restart systemd service - when: current_version > remote_version - ansible.builtin.systemd: - daemon_reload: yes - name: safekeeper - enabled: yes - state: restarted - become: true - -- name: Restart pageservers - hosts: pageservers - gather_facts: False - remote_user: admin - - tasks: - - - name: Inform about version conflict - when: current_version <= remote_version - debug: msg="Current version {{ current_version }} LE than remote {{ remote_version }}" - - - name: Restart systemd service - when: current_version > remote_version - ansible.builtin.systemd: - daemon_reload: yes - name: pageserver - enabled: yes - state: restarted - become: true From 26a68612d92608fd1768bdf65dc9cb4725d87edc Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Sun, 27 Feb 2022 01:36:40 +0300 Subject: [PATCH 0003/1022] safekeeper to cosnole call fix (#1333) --- .circleci/ansible/deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index 06385aa0d3..c95524a8a5 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -169,6 +169,6 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ hostvars.localhost.zenith.console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID tags: - safekeeper From a0f9a0d350d5abe5070d34f6f5b6bc7044ef071f Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Sun, 27 Feb 2022 01:52:33 +0300 Subject: [PATCH 0004/1022] safekeeper to cosnole call fix (#1333) (#1334) --- .circleci/ansible/deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index 06385aa0d3..c95524a8a5 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -169,6 +169,6 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ hostvars.localhost.zenith.console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID 
tags: - safekeeper From 949f8b463330e318d72c3944fe78958b38305cb3 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 1 Mar 2022 22:42:57 +0200 Subject: [PATCH 0005/1022] Fix 1.59 rustc clippy warnings --- compute_tools/src/pg_helpers.rs | 2 +- compute_tools/src/spec.rs | 2 +- pageserver/src/layered_repository.rs | 9 ++++----- pageserver/src/layered_repository/inmemory_layer.rs | 9 ++++----- pageserver/src/walrecord.rs | 12 +++++------- proxy/src/main.rs | 2 +- 6 files changed, 16 insertions(+), 20 deletions(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 8b6dc04069..6a22b865fa 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -171,7 +171,7 @@ impl PgQuote for PgIdent { /// always quotes provided string with `""` and escapes every `"`. Not idempotent, /// i.e. if string is already escaped it will be escaped again. fn quote(&self) -> String { - let result = format!("\"{}\"", self.replace("\"", "\"\"")); + let result = format!("\"{}\"", self.replace('"', "\"\"")); result } } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 41e4174bf0..1dd7c0044e 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -215,7 +215,7 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> { if let Some(r) = pg_db { // XXX: db owner name is returned as quoted string from Postgres, // when quoting is needed. - let new_owner = if r.owner.starts_with('\"') { + let new_owner = if r.owner.starts_with('"') { db.owner.quote() } else { db.owner.clone() diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 5dae1902c1..975b2f5d2b 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -893,12 +893,11 @@ impl Timeline for LayeredTimeline { let seg = SegmentTag { rel, segno: 0 }; - let result; - if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - result = layer.get_seg_exists(lsn)?; + let result = if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { + layer.get_seg_exists(lsn)? 
} else { - result = false; - } + false + }; trace!("get_rel_exists: {} at {} -> {}", rel, lsn, result); Ok(result) diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 17b061b20e..6e24bf6022 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -170,12 +170,11 @@ impl Layer for InMemoryLayer { fn filename(&self) -> PathBuf { let inner = self.inner.read().unwrap(); - let end_lsn; - if let Some(drop_lsn) = inner.end_lsn { - end_lsn = drop_lsn; + let end_lsn = if let Some(drop_lsn) = inner.end_lsn { + drop_lsn } else { - end_lsn = Lsn(u64::MAX); - } + Lsn(u64::MAX) + }; let delta_filename = DeltaFileName { seg: self.seg, diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 378a015d4a..ca9107cdbf 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -268,12 +268,11 @@ impl XlXactParsedRecord { let info = xl_info & pg_constants::XLOG_XACT_OPMASK; // The record starts with time of commit/abort let xact_time = buf.get_i64_le(); - let xinfo; - if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 { - xinfo = buf.get_u32_le(); + let xinfo = if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 { + buf.get_u32_le() } else { - xinfo = 0; - } + 0 + }; let db_id; let ts_id; if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 { @@ -502,7 +501,6 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { 0..=pg_constants::XLR_MAX_BLOCK_ID => { /* XLogRecordBlockHeader */ let mut blk = DecodedBkpBlock::new(); - let fork_flags: u8; if block_id <= max_block_id { // TODO @@ -515,7 +513,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { } max_block_id = block_id; - fork_flags = buf.get_u8(); + let fork_flags: u8 = buf.get_u8(); blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK; blk.flags = fork_flags; blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0; diff --git a/proxy/src/main.rs b/proxy/src/main.rs index fb3bf725b8..de618ccde9 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -122,7 +122,7 @@ async fn main() -> anyhow::Result<()> { None => RouterConfig::Dynamic(auth_method), Some(addr) => { if let ClientAuthMethod::Password = auth_method { - let (host, port) = addr.split_once(":").unwrap(); + let (host, port) = addr.split_once(':').unwrap(); RouterConfig::Static { host: host.to_string(), port: port.parse().unwrap(), From 1d90b1b205023c3bd404de8b361dda69cef6a502 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 4 Mar 2022 01:10:42 +0300 Subject: [PATCH 0006/1022] add node id to pageserver (#1310) * Add --id argument to safekeeper setting its unique u64 id. In preparation for storage node messaging. IDs are supposed to be monotonically assigned by the console. In tests it is issued by ZenithEnv; at the zenith cli level and fixtures, string name is completely replaced by integer id. Example TOML configs are adjusted accordingly. Sequential ids are chosen over Zid mainly because they are compact and easy to type/remember. * add node id to pageserver This adds node id parameter to pageserver configuration. Also I use a simple builder to construct pageserver config struct to avoid setting node id to some temporary invalid value. Some of the changes in test fixtures are needed to split init and start operations for envrionment. 
Co-authored-by: Arseny Sher --- control_plane/safekeepers.conf | 6 +- control_plane/simple.conf | 2 +- control_plane/src/local_env.rs | 17 +- control_plane/src/safekeeper.rs | 16 +- control_plane/src/storage.rs | 3 + docker-entrypoint.sh | 2 +- pageserver/src/bin/pageserver.rs | 9 +- pageserver/src/config.rs | 281 +++++++++++++++--- pageserver/src/http/models.rs | 6 + pageserver/src/http/openapi_spec.yml | 5 + pageserver/src/http/routes.rs | 14 +- test_runner/README.md | 2 +- test_runner/batch_others/test_auth.py | 4 +- test_runner/batch_others/test_backpressure.py | 2 +- .../batch_others/test_branch_behind.py | 2 +- test_runner/batch_others/test_next_xid.py | 2 +- .../batch_others/test_pageserver_api.py | 17 +- .../batch_others/test_pageserver_catchup.py | 2 +- .../batch_others/test_pageserver_restart.py | 2 +- .../batch_others/test_remote_storage.py | 2 +- .../batch_others/test_restart_compute.py | 2 +- .../batch_others/test_tenant_relocation.py | 2 +- test_runner/batch_others/test_tenants.py | 2 +- .../batch_others/test_timeline_size.py | 2 +- test_runner/batch_others/test_wal_acceptor.py | 38 ++- .../batch_others/test_wal_acceptor_async.py | 2 +- test_runner/batch_others/test_zenith_cli.py | 4 +- test_runner/fixtures/zenith_fixtures.py | 70 +++-- .../performance/test_bulk_tenant_create.py | 2 +- walkeeper/src/bin/safekeeper.rs | 81 ++++- walkeeper/src/http/routes.rs | 12 +- walkeeper/src/lib.rs | 4 +- zenith/src/main.rs | 41 +-- zenith_utils/src/zid.rs | 12 + 34 files changed, 501 insertions(+), 169 deletions(-) diff --git a/control_plane/safekeepers.conf b/control_plane/safekeepers.conf index 828d5a5a1e..df7dd2adca 100644 --- a/control_plane/safekeepers.conf +++ b/control_plane/safekeepers.conf @@ -5,16 +5,16 @@ listen_http_addr = '127.0.0.1:9898' auth_type = 'Trust' [[safekeepers]] -name = 'sk1' +id = 1 pg_port = 5454 http_port = 7676 [[safekeepers]] -name = 'sk2' +id = 2 pg_port = 5455 http_port = 7677 [[safekeepers]] -name = 'sk3' +id = 3 pg_port = 5456 http_port = 7678 diff --git a/control_plane/simple.conf b/control_plane/simple.conf index 796c6adbd9..2243a0a5f8 100644 --- a/control_plane/simple.conf +++ b/control_plane/simple.conf @@ -6,6 +6,6 @@ listen_http_addr = '127.0.0.1:9898' auth_type = 'Trust' [[safekeepers]] -name = 'single' +id = 1 pg_port = 5454 http_port = 7676 diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index b80e137cb9..55d0b00496 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -12,7 +12,9 @@ use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use zenith_utils::auth::{encode_from_key_file, Claims, Scope}; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{opt_display_serde, ZTenantId}; +use zenith_utils::zid::{opt_display_serde, ZNodeId, ZTenantId}; + +use crate::safekeeper::SafekeeperNode; // // This data structures represents zenith CLI config @@ -62,6 +64,8 @@ pub struct LocalEnv { #[derive(Serialize, Deserialize, Clone, Debug)] #[serde(default)] pub struct PageServerConf { + // node id + pub id: ZNodeId, // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, @@ -76,6 +80,7 @@ pub struct PageServerConf { impl Default for PageServerConf { fn default() -> Self { Self { + id: ZNodeId(0), listen_pg_addr: String::new(), listen_http_addr: String::new(), auth_type: AuthType::Trust, @@ -87,7 +92,7 @@ impl Default for PageServerConf { #[derive(Serialize, Deserialize, Clone, Debug)] #[serde(default)] pub struct SafekeeperConf { - 
pub name: String, + pub id: ZNodeId, pub pg_port: u16, pub http_port: u16, pub sync: bool, @@ -96,7 +101,7 @@ pub struct SafekeeperConf { impl Default for SafekeeperConf { fn default() -> Self { Self { - name: String::new(), + id: ZNodeId(0), pg_port: 0, http_port: 0, sync: true, @@ -136,8 +141,8 @@ impl LocalEnv { self.base_data_dir.clone() } - pub fn safekeeper_data_dir(&self, node_name: &str) -> PathBuf { - self.base_data_dir.join("safekeepers").join(node_name) + pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf { + self.base_data_dir.join("safekeepers").join(data_dir_name) } /// Create a LocalEnv from a config file. @@ -285,7 +290,7 @@ impl LocalEnv { fs::create_dir_all(self.pg_data_dirs_path())?; for safekeeper in &self.safekeepers { - fs::create_dir_all(self.safekeeper_data_dir(&safekeeper.name))?; + fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?; } let mut conf_content = String::new(); diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index f5478b5922..351d1efbbc 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -15,6 +15,7 @@ use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; use zenith_utils::http::error::HttpErrorBody; +use zenith_utils::zid::ZNodeId; use crate::local_env::{LocalEnv, SafekeeperConf}; use crate::storage::PageServerNode; @@ -61,7 +62,7 @@ impl ResponseErrorMessageExt for Response { // #[derive(Debug)] pub struct SafekeeperNode { - pub name: String, + pub id: ZNodeId, pub conf: SafekeeperConf, @@ -77,10 +78,10 @@ impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { let pageserver = Arc::new(PageServerNode::from_env(env)); - println!("initializing for {} for {}", conf.name, conf.http_port); + println!("initializing for sk {} for {}", conf.id, conf.http_port); SafekeeperNode { - name: conf.name.clone(), + id: conf.id, conf: conf.clone(), pg_connection_config: Self::safekeeper_connection_config(conf.pg_port), env: env.clone(), @@ -98,8 +99,12 @@ impl SafekeeperNode { .unwrap() } + pub fn datadir_path_by_id(env: &LocalEnv, sk_id: ZNodeId) -> PathBuf { + env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref()) + } + pub fn datadir_path(&self) -> PathBuf { - self.env.safekeeper_data_dir(&self.name) + SafekeeperNode::datadir_path_by_id(&self.env, self.id) } pub fn pid_file(&self) -> PathBuf { @@ -120,6 +125,7 @@ impl SafekeeperNode { let mut cmd = Command::new(self.env.safekeeper_bin()?); fill_rust_env_vars( cmd.args(&["-D", self.datadir_path().to_str().unwrap()]) + .args(&["--id", self.id.to_string().as_ref()]) .args(&["--listen-pg", &listen_pg]) .args(&["--listen-http", &listen_http]) .args(&["--recall", "1 second"]) @@ -183,7 +189,7 @@ impl SafekeeperNode { pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { let pid_file = self.pid_file(); if !pid_file.exists() { - println!("Safekeeper {} is already stopped", self.name); + println!("Safekeeper {} is already stopped", self.id); return Ok(()); } let pid = read_pidfile(&pid_file)?; diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index be594889ab..cd429e3f7a 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -103,6 +103,8 @@ impl PageServerNode { ) -> anyhow::Result<()> { let mut cmd = Command::new(self.env.pageserver_bin()?); + let id = format!("id={}", self.env.pageserver.id); + // FIXME: the paths should be shell-escaped to handle paths with 
spaces, quotas etc. let base_data_dir_param = self.env.base_data_dir.display().to_string(); let pg_distrib_dir_param = @@ -122,6 +124,7 @@ impl PageServerNode { args.extend(["-c", &authg_type_param]); args.extend(["-c", &listen_http_addr_param]); args.extend(["-c", &listen_pg_addr_param]); + args.extend(["-c", &id]); for config_override in config_overrides { args.extend(["-c", config_override]); diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 45c41b4c19..93bb5f9cd7 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -4,7 +4,7 @@ set -eux if [ "$1" = 'pageserver' ]; then if [ ! -d "/data/tenants" ]; then echo "Initializing pageserver data directory" - pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" + pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" fi echo "Staring pageserver at 0.0.0.0:6400" pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index fb8baa28f6..d8d4033340 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -61,7 +61,7 @@ fn main() -> Result<()> { .number_of_values(1) .multiple_occurrences(true) .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). - Any option has to be a valid toml document, example: `-c \"foo='hey'\"` `-c \"foo={value=1}\"`"), + Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), ) .get_matches(); @@ -115,7 +115,14 @@ fn main() -> Result<()> { option_line ) })?; + for (key, item) in doc.iter() { + if key == "id" { + anyhow::ensure!( + init, + "node id can only be set during pageserver init and cannot be overridden" + ); + } toml.insert(key, item.clone()); } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8b65e7e2e6..3deabb7521 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -8,7 +8,7 @@ use anyhow::{bail, ensure, Context, Result}; use toml_edit; use toml_edit::{Document, Item}; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; use std::convert::TryInto; use std::env; @@ -78,6 +78,10 @@ pub mod defaults { #[derive(Debug, Clone, PartialEq, Eq)] pub struct PageServerConf { + // Identifier of that particular pageserver so e g safekeepers + // can safely distinguish different pageservers + pub id: ZNodeId, + /// Example (default): 127.0.0.1:64000 pub listen_pg_addr: String, /// Example (default): 127.0.0.1:9898 @@ -118,6 +122,206 @@ pub struct PageServerConf { pub remote_storage_config: Option, } +// use dedicated enum for builder to better indicate the intention +// and avoid possible confusion with nested options +pub enum BuilderValue { + Set(T), + NotSet, +} + +impl BuilderValue { + pub fn ok_or(self, err: E) -> Result { + match self { + Self::Set(v) => Ok(v), + Self::NotSet => Err(err), + } + } +} + +// needed to simplify config construction +struct PageServerConfigBuilder { + listen_pg_addr: BuilderValue, + + listen_http_addr: BuilderValue, + + checkpoint_distance: BuilderValue, + checkpoint_period: BuilderValue, + + gc_horizon: BuilderValue, + gc_period: BuilderValue, + + wait_lsn_timeout: BuilderValue, + wal_redo_timeout: BuilderValue, + + superuser: BuilderValue, + + page_cache_size: BuilderValue, + max_file_descriptors: BuilderValue, + + workdir: BuilderValue, + + pg_distrib_dir: 
BuilderValue, + + auth_type: BuilderValue, + + // + auth_validation_public_key_path: BuilderValue>, + remote_storage_config: BuilderValue>, + + id: BuilderValue, +} + +impl Default for PageServerConfigBuilder { + fn default() -> Self { + use self::BuilderValue::*; + use defaults::*; + Self { + listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()), + listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()), + checkpoint_distance: Set(DEFAULT_CHECKPOINT_DISTANCE), + checkpoint_period: Set(humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD) + .expect("cannot parse default checkpoint period")), + gc_horizon: Set(DEFAULT_GC_HORIZON), + gc_period: Set(humantime::parse_duration(DEFAULT_GC_PERIOD) + .expect("cannot parse default gc period")), + wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) + .expect("cannot parse default wait lsn timeout")), + wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) + .expect("cannot parse default wal redo timeout")), + superuser: Set(DEFAULT_SUPERUSER.to_string()), + page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE), + max_file_descriptors: Set(DEFAULT_MAX_FILE_DESCRIPTORS), + workdir: Set(PathBuf::new()), + pg_distrib_dir: Set(env::current_dir() + .expect("cannot access current directory") + .join("tmp_install")), + auth_type: Set(AuthType::Trust), + auth_validation_public_key_path: Set(None), + remote_storage_config: Set(None), + id: NotSet, + } + } +} + +impl PageServerConfigBuilder { + pub fn listen_pg_addr(&mut self, listen_pg_addr: String) { + self.listen_pg_addr = BuilderValue::Set(listen_pg_addr) + } + + pub fn listen_http_addr(&mut self, listen_http_addr: String) { + self.listen_http_addr = BuilderValue::Set(listen_http_addr) + } + + pub fn checkpoint_distance(&mut self, checkpoint_distance: u64) { + self.checkpoint_distance = BuilderValue::Set(checkpoint_distance) + } + + pub fn checkpoint_period(&mut self, checkpoint_period: Duration) { + self.checkpoint_period = BuilderValue::Set(checkpoint_period) + } + + pub fn gc_horizon(&mut self, gc_horizon: u64) { + self.gc_horizon = BuilderValue::Set(gc_horizon) + } + + pub fn gc_period(&mut self, gc_period: Duration) { + self.gc_period = BuilderValue::Set(gc_period) + } + + pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) { + self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout) + } + + pub fn wal_redo_timeout(&mut self, wal_redo_timeout: Duration) { + self.wal_redo_timeout = BuilderValue::Set(wal_redo_timeout) + } + + pub fn superuser(&mut self, superuser: String) { + self.superuser = BuilderValue::Set(superuser) + } + + pub fn page_cache_size(&mut self, page_cache_size: usize) { + self.page_cache_size = BuilderValue::Set(page_cache_size) + } + + pub fn max_file_descriptors(&mut self, max_file_descriptors: usize) { + self.max_file_descriptors = BuilderValue::Set(max_file_descriptors) + } + + pub fn workdir(&mut self, workdir: PathBuf) { + self.workdir = BuilderValue::Set(workdir) + } + + pub fn pg_distrib_dir(&mut self, pg_distrib_dir: PathBuf) { + self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir) + } + + pub fn auth_type(&mut self, auth_type: AuthType) { + self.auth_type = BuilderValue::Set(auth_type) + } + + pub fn auth_validation_public_key_path( + &mut self, + auth_validation_public_key_path: Option, + ) { + self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path) + } + + pub fn remote_storage_config(&mut self, remote_storage_config: Option) { + self.remote_storage_config = 
BuilderValue::Set(remote_storage_config) + } + + pub fn id(&mut self, node_id: ZNodeId) { + self.id = BuilderValue::Set(node_id) + } + + pub fn build(self) -> Result { + Ok(PageServerConf { + listen_pg_addr: self + .listen_pg_addr + .ok_or(anyhow::anyhow!("missing listen_pg_addr"))?, + listen_http_addr: self + .listen_http_addr + .ok_or(anyhow::anyhow!("missing listen_http_addr"))?, + checkpoint_distance: self + .checkpoint_distance + .ok_or(anyhow::anyhow!("missing checkpoint_distance"))?, + checkpoint_period: self + .checkpoint_period + .ok_or(anyhow::anyhow!("missing checkpoint_period"))?, + gc_horizon: self + .gc_horizon + .ok_or(anyhow::anyhow!("missing gc_horizon"))?, + gc_period: self.gc_period.ok_or(anyhow::anyhow!("missing gc_period"))?, + wait_lsn_timeout: self + .wait_lsn_timeout + .ok_or(anyhow::anyhow!("missing wait_lsn_timeout"))?, + wal_redo_timeout: self + .wal_redo_timeout + .ok_or(anyhow::anyhow!("missing wal_redo_timeout"))?, + superuser: self.superuser.ok_or(anyhow::anyhow!("missing superuser"))?, + page_cache_size: self + .page_cache_size + .ok_or(anyhow::anyhow!("missing page_cache_size"))?, + max_file_descriptors: self + .max_file_descriptors + .ok_or(anyhow::anyhow!("missing max_file_descriptors"))?, + workdir: self.workdir.ok_or(anyhow::anyhow!("missing workdir"))?, + pg_distrib_dir: self + .pg_distrib_dir + .ok_or(anyhow::anyhow!("missing pg_distrib_dir"))?, + auth_type: self.auth_type.ok_or(anyhow::anyhow!("missing auth_type"))?, + auth_validation_public_key_path: self + .auth_validation_public_key_path + .ok_or(anyhow::anyhow!("missing auth_validation_public_key_path"))?, + remote_storage_config: self + .remote_storage_config + .ok_or(anyhow::anyhow!("missing remote_storage_config"))?, + id: self.id.ok_or(anyhow::anyhow!("missing id"))?, + }) + } +} + /// External backup storage configuration, enough for creating a client for that storage. #[derive(Debug, Clone, PartialEq, Eq)] pub struct RemoteStorageConfig { @@ -233,61 +437,41 @@ impl PageServerConf { /// /// This leaves any options not present in the file in the built-in defaults. 
pub fn parse_and_validate(toml: &Document, workdir: &Path) -> Result { - use defaults::*; - - let mut conf = PageServerConf { - workdir: workdir.to_path_buf(), - - listen_pg_addr: DEFAULT_PG_LISTEN_ADDR.to_string(), - listen_http_addr: DEFAULT_HTTP_LISTEN_ADDR.to_string(), - checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_period: humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD)?, - gc_horizon: DEFAULT_GC_HORIZON, - gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)?, - wait_lsn_timeout: humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)?, - wal_redo_timeout: humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)?, - page_cache_size: DEFAULT_PAGE_CACHE_SIZE, - max_file_descriptors: DEFAULT_MAX_FILE_DESCRIPTORS, - - pg_distrib_dir: PathBuf::new(), - auth_validation_public_key_path: None, - auth_type: AuthType::Trust, - - remote_storage_config: None, - - superuser: DEFAULT_SUPERUSER.to_string(), - }; + let mut builder = PageServerConfigBuilder::default(); + builder.workdir(workdir.to_owned()); for (key, item) in toml.iter() { match key { - "listen_pg_addr" => conf.listen_pg_addr = parse_toml_string(key, item)?, - "listen_http_addr" => conf.listen_http_addr = parse_toml_string(key, item)?, - "checkpoint_distance" => conf.checkpoint_distance = parse_toml_u64(key, item)?, - "checkpoint_period" => conf.checkpoint_period = parse_toml_duration(key, item)?, - "gc_horizon" => conf.gc_horizon = parse_toml_u64(key, item)?, - "gc_period" => conf.gc_period = parse_toml_duration(key, item)?, - "wait_lsn_timeout" => conf.wait_lsn_timeout = parse_toml_duration(key, item)?, - "wal_redo_timeout" => conf.wal_redo_timeout = parse_toml_duration(key, item)?, - "initial_superuser_name" => conf.superuser = parse_toml_string(key, item)?, - "page_cache_size" => conf.page_cache_size = parse_toml_u64(key, item)? as usize, + "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?), + "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?), + "checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?), + "checkpoint_period" => builder.checkpoint_period(parse_toml_duration(key, item)?), + "gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?), + "gc_period" => builder.gc_period(parse_toml_duration(key, item)?), + "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?), + "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?), + "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?), + "page_cache_size" => builder.page_cache_size(parse_toml_u64(key, item)? as usize), "max_file_descriptors" => { - conf.max_file_descriptors = parse_toml_u64(key, item)? as usize + builder.max_file_descriptors(parse_toml_u64(key, item)? as usize) } "pg_distrib_dir" => { - conf.pg_distrib_dir = PathBuf::from(parse_toml_string(key, item)?) + builder.pg_distrib_dir(PathBuf::from(parse_toml_string(key, item)?)) } - "auth_validation_public_key_path" => { - conf.auth_validation_public_key_path = - Some(PathBuf::from(parse_toml_string(key, item)?)) - } - "auth_type" => conf.auth_type = parse_toml_auth_type(key, item)?, + "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some( + PathBuf::from(parse_toml_string(key, item)?), + )), + "auth_type" => builder.auth_type(parse_toml_auth_type(key, item)?), "remote_storage" => { - conf.remote_storage_config = Some(Self::parse_remote_storage_config(item)?) 
+ builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?)) } + "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)), _ => bail!("unrecognized pageserver option '{}'", key), } } + let mut conf = builder.build().context("invalid config")?; + if conf.auth_type == AuthType::ZenithJWT { let auth_validation_public_key_path = conf .auth_validation_public_key_path @@ -301,9 +485,6 @@ impl PageServerConf { ); } - if conf.pg_distrib_dir == PathBuf::new() { - conf.pg_distrib_dir = env::current_dir()?.join("tmp_install") - }; if !conf.pg_distrib_dir.join("bin/postgres").exists() { bail!( "Can't find postgres binary at {}", @@ -398,6 +579,7 @@ impl PageServerConf { #[cfg(test)] pub fn dummy_conf(repo_dir: PathBuf) -> Self { PageServerConf { + id: ZNodeId(0), checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, checkpoint_period: Duration::from_secs(10), gc_horizon: defaults::DEFAULT_GC_HORIZON, @@ -482,15 +664,16 @@ max_file_descriptors = 333 # initial superuser role name to use when creating a new tenant initial_superuser_name = 'zzzz' +id = 10 - "#; +"#; #[test] fn parse_defaults() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; // we have to create dummy pathes to overcome the validation errors - let config_string = format!("pg_distrib_dir='{}'", pg_distrib_dir.display()); + let config_string = format!("pg_distrib_dir='{}'\nid=10", pg_distrib_dir.display()); let toml = config_string.parse()?; let parsed_config = @@ -501,6 +684,7 @@ initial_superuser_name = 'zzzz' assert_eq!( parsed_config, PageServerConf { + id: ZNodeId(10), listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, @@ -544,6 +728,7 @@ initial_superuser_name = 'zzzz' assert_eq!( parsed_config, PageServerConf { + id: ZNodeId(10), listen_pg_addr: "127.0.0.1:64000".to_string(), listen_http_addr: "127.0.0.1:9898".to_string(), checkpoint_distance: 111, diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 6ce377c535..5d7398ef03 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -1,6 +1,7 @@ use serde::{Deserialize, Serialize}; use crate::ZTenantId; +use zenith_utils::zid::ZNodeId; #[derive(Serialize, Deserialize)] pub struct BranchCreateRequest { @@ -15,3 +16,8 @@ pub struct TenantCreateRequest { #[serde(with = "hex")] pub tenant_id: ZTenantId, } + +#[derive(Serialize)] +pub struct StatusResponse { + pub id: ZNodeId, +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index dcb81849e0..baf81fcf21 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -17,6 +17,11 @@ paths: application/json: schema: type: object + required: + - id + properties: + id: + type: integer /v1/timeline/{tenant_id}: parameters: - name: tenant_id diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b13a45750e..4fc41d6e82 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,7 +1,6 @@ use std::sync::Arc; use anyhow::{Context, Result}; -use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use serde::Serialize; @@ -23,6 +22,7 @@ use zenith_utils::lsn::Lsn; use zenith_utils::zid::{opt_display_serde, ZTimelineId}; use super::models::BranchCreateRequest; +use super::models::StatusResponse; use super::models::TenantCreateRequest; 
use crate::branches::BranchInfo; use crate::repository::RepositoryTimeline; @@ -64,12 +64,12 @@ fn get_config(request: &Request) -> &'static PageServerConf { } // healthcheck handler -async fn status_handler(_: Request) -> Result, ApiError> { - Ok(Response::builder() - .status(StatusCode::OK) - .header(header::CONTENT_TYPE, "application/json") - .body(Body::from("{}")) - .map_err(ApiError::from_err)?) +async fn status_handler(request: Request) -> Result, ApiError> { + let config = get_config(&request); + Ok(json_response( + StatusCode::OK, + StatusResponse { id: config.id }, + )?) } async fn branch_create_handler(mut request: Request) -> Result, ApiError> { diff --git a/test_runner/README.md b/test_runner/README.md index 514c5f1e3a..a56c2df2c0 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -89,7 +89,7 @@ def test_foobar(zenith_env_builder: ZenithEnvBuilder): # Now create the environment. This initializes the repository, and starts # up the page server and the safekeepers - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Run the test ... diff --git a/test_runner/batch_others/test_auth.py b/test_runner/batch_others/test_auth.py index 7f86986e2e..ee1a09c917 100644 --- a/test_runner/batch_others/test_auth.py +++ b/test_runner/batch_others/test_auth.py @@ -8,7 +8,7 @@ import pytest def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.pageserver_auth_enabled = True - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() ps = env.pageserver @@ -51,7 +51,7 @@ def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_w zenith_env_builder.pageserver_auth_enabled = True if with_wal_acceptors: zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}" env.zenith_cli.create_branch(branch, "main") diff --git a/test_runner/batch_others/test_backpressure.py b/test_runner/batch_others/test_backpressure.py index 23af5b90ed..2b064c9fa8 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/batch_others/test_backpressure.py @@ -93,7 +93,7 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv def test_backpressure_received_lsn_lag(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Create a branch for us env.zenith_cli.create_branch("test_backpressure", "main") diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 860db51c8a..509c46975e 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -19,7 +19,7 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): # # See https://github.com/zenithdb/zenith/issues/1068 zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Branch at the point where only 100 rows were inserted env.zenith_cli.create_branch("test_branch_behind", "main") diff --git a/test_runner/batch_others/test_next_xid.py b/test_runner/batch_others/test_next_xid.py index 625abc39d3..fd0f761409 100644 --- a/test_runner/batch_others/test_next_xid.py +++ b/test_runner/batch_others/test_next_xid.py @@ -11,7 +11,7 @@ from fixtures.log_helper import log def test_next_xid(zenith_env_builder: ZenithEnvBuilder): # One safekeeper is enough for 
this test. zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() pg = env.postgres.create_start('main') diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index eccffc4d69..ba1f106c4b 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,8 +1,15 @@ -import json from uuid import uuid4, UUID -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient -from typing import cast -import pytest, psycopg2 +import pytest +from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient, zenith_binpath + + +# test that we cannot override node id +def test_pageserver_init_node_id(zenith_env_builder: ZenithEnvBuilder): + env = zenith_env_builder.init() + with pytest.raises( + Exception, + match="node id can only be set during pageserver init and cannot be overridden"): + env.pageserver.start(overrides=['--pageserver-config-override=id=10']) def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): @@ -41,7 +48,7 @@ def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): def test_pageserver_http_api_client_auth_enabled(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.pageserver_auth_enabled = True - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() management_token = env.auth_keys.generate_management_token() diff --git a/test_runner/batch_others/test_pageserver_catchup.py b/test_runner/batch_others/test_pageserver_catchup.py index 97dc0f3260..985d1a3af0 100644 --- a/test_runner/batch_others/test_pageserver_catchup.py +++ b/test_runner/batch_others/test_pageserver_catchup.py @@ -14,7 +14,7 @@ from fixtures.log_helper import log # and new compute node contains all data. def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() env.zenith_cli.create_branch("test_pageserver_catchup_while_compute_down", "main") pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down') diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/batch_others/test_pageserver_restart.py index 0cfc50f0ff..ec93c2cf5b 100644 --- a/test_runner/batch_others/test_pageserver_restart.py +++ b/test_runner/batch_others/test_pageserver_restart.py @@ -13,7 +13,7 @@ from fixtures.log_helper import log def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder): # One safekeeper is enough for this test. 
zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() env.zenith_cli.create_branch("test_pageserver_restart", "main") pg = env.postgres.create_start('test_pageserver_restart') diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index fa6feaf412..61feb1a5bd 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -42,7 +42,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, data_secret = 'very secret secret' ##### First start, insert secret data and upload it to the remote storage - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() pg = env.postgres.create_start() tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] diff --git a/test_runner/batch_others/test_restart_compute.py b/test_runner/batch_others/test_restart_compute.py index f7810be555..d4dd3fb9e2 100644 --- a/test_runner/batch_others/test_restart_compute.py +++ b/test_runner/batch_others/test_restart_compute.py @@ -13,7 +13,7 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor zenith_env_builder.pageserver_auth_enabled = True if with_wal_acceptors: zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() env.zenith_cli.create_branch("test_restart_compute", "main") diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 5c6d78e730..acff3ef62c 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -122,7 +122,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, zenith_env_builder.num_safekeepers = 1 zenith_env_builder.enable_local_fs_remote_storage() - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # create folder for remote storage mock remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage' diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py index 232c724870..b665ae9022 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/batch_others/test_tenants.py @@ -10,7 +10,7 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acce if with_wal_acceptors: zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() """Tests tenants with and without wal acceptors""" tenant_1 = env.create_tenant() tenant_2 = env.create_tenant() diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index b48f830528..2c31267922 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -67,7 +67,7 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() env.zenith_cli.create_branch("test_timeline_size_quota", "main") client = env.pageserver.http_client() diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 4d9e18bb58..c375c9626a 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ 
-22,7 +22,7 @@ from typing import List, Optional, Any # succeed and data is written def test_normal_work(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() env.zenith_cli.create_branch("test_wal_acceptors_normal_work", "main") @@ -51,7 +51,7 @@ class BranchMetrics: # against different timelines. def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() n_timelines = 3 @@ -181,7 +181,7 @@ def test_restarts(zenith_env_builder: ZenithEnvBuilder): n_acceptors = 3 zenith_env_builder.num_safekeepers = n_acceptors - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() env.zenith_cli.create_branch("test_wal_acceptors_restarts", "main") pg = env.postgres.create_start('test_wal_acceptors_restarts') @@ -218,7 +218,7 @@ def delayed_wal_acceptor_start(wa): # When majority of acceptors is offline, commits are expected to be frozen def test_unavailability(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 2 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() env.zenith_cli.create_branch("test_wal_acceptors_unavailability", "main") pg = env.postgres.create_start('test_wal_acceptors_unavailability') @@ -289,7 +289,7 @@ def stop_value(): def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value): zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() env.zenith_cli.create_branch("test_wal_acceptors_race_conditions", "main") pg = env.postgres.create_start('test_wal_acceptors_race_conditions') @@ -404,7 +404,7 @@ def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, # We don't really need the full environment for this test, just the # safekeepers would be enough. 
zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() timeline_id = uuid.uuid4() tenant_id = uuid.uuid4() @@ -454,7 +454,7 @@ def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() env.zenith_cli.create_branch("test_timeline_status", "main") pg = env.postgres.create_start('test_timeline_status') @@ -521,12 +521,7 @@ class SafekeeperEnv: http=self.port_distributor.get_port(), ) - if self.num_safekeepers == 1: - name = "single" - else: - name = f"sk{i}" - - safekeeper_dir = os.path.join(self.repo_dir, name) + safekeeper_dir = os.path.join(self.repo_dir, f"sk{i}") mkdir_if_needed(safekeeper_dir) args = [ @@ -537,6 +532,8 @@ class SafekeeperEnv: f"127.0.0.1:{port.http}", "-D", safekeeper_dir, + "--id", + str(i), "--daemonize" ] @@ -604,9 +601,8 @@ def test_safekeeper_without_pageserver(test_output_dir: str, def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): - def safekeepers_guc(env: ZenithEnv, sk_names: List[str]) -> str: - return ','.join( - [f'localhost:{sk.port.pg}' for sk in env.safekeepers if sk.name in sk_names]) + def safekeepers_guc(env: ZenithEnv, sk_names: List[int]) -> str: + return ','.join([f'localhost:{sk.port.pg}' for sk in env.safekeepers if sk.id in sk_names]) def execute_payload(pg: Postgres): with closing(pg.connect()) as conn: @@ -628,17 +624,17 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): http_cli = sk.http_client() try: status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"Safekeeper {sk.name} status: {status}") + log.info(f"Safekeeper {sk.id} status: {status}") except Exception as e: - log.info(f"Safekeeper {sk.name} status error: {e}") + log.info(f"Safekeeper {sk.id} status error: {e}") zenith_env_builder.num_safekeepers = 4 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() env.zenith_cli.create_branch("test_replace_safekeeper", "main") log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() - active_safekeepers = ['sk1', 'sk2', 'sk3'] + active_safekeepers = [1, 2, 3] pg = env.postgres.create('test_replace_safekeeper') pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers)) pg.start() @@ -678,7 +674,7 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): log.info("Recreate postgres to replace failed sk1 with new sk4") pg.stop_and_destroy().create('test_replace_safekeeper') - active_safekeepers = ['sk2', 'sk3', 'sk4'] + active_safekeepers = [2, 3, 4] env.safekeepers[3].start() pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers)) pg.start() diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index 1d2a186eb7..4b6a27f73d 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -200,7 +200,7 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_w # restart acceptors one by one, while executing and validating bank transactions def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() env.zenith_cli.create_branch("test_wal_acceptors_restarts_under_load", "main") pg = 
env.postgres.create_start('test_wal_acceptors_restarts_under_load') diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_zenith_cli.py index ce051dfd6e..f1897e4b6f 100644 --- a/test_runner/batch_others/test_zenith_cli.py +++ b/test_runner/batch_others/test_zenith_cli.py @@ -97,7 +97,7 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv): def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): # Start with single sk zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Connect to sk port on v4 loopback res = requests.get(f'http://127.0.0.1:{env.safekeepers[0].port.http}/v1/status') @@ -114,7 +114,7 @@ def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): def test_cli_start_stop(zenith_env_builder: ZenithEnvBuilder): # Start with single sk zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Stop default ps/sk env.zenith_cli.pageserver_stop() diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index b4b3de1db3..252ca9b3c1 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -27,7 +27,7 @@ from dataclasses import dataclass # Type-related stuff from psycopg2.extensions import connection as PgConnection -from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple +from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, TypeVar, cast, Union, Tuple from typing_extensions import Literal import pytest @@ -434,6 +434,14 @@ class ZenithEnvBuilder: self.env = ZenithEnv(self) return self.env + def start(self): + self.env.start() + + def init_start(self) -> ZenithEnv: + env = self.init() + self.start() + return env + """ Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path. Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. @@ -549,6 +557,7 @@ class ZenithEnv: toml += textwrap.dedent(f""" [pageserver] + id=1 listen_pg_addr = 'localhost:{pageserver_port.pg}' listen_http_addr = 'localhost:{pageserver_port.http}' auth_type = '{pageserver_auth_type}' @@ -566,25 +575,22 @@ class ZenithEnv: pg=self.port_distributor.get_port(), http=self.port_distributor.get_port(), ) - - if config.num_safekeepers == 1: - name = "single" - else: - name = f"sk{i}" - toml += f""" -[[safekeepers]] -name = '{name}' -pg_port = {port.pg} -http_port = {port.http} -sync = false # Disable fsyncs to make the tests go faster - """ - safekeeper = Safekeeper(env=self, name=name, port=port) + id = i # assign ids sequentially + toml += textwrap.dedent(f""" + [[safekeepers]] + id = {id} + pg_port = {port.pg} + http_port = {port.http} + sync = false # Disable fsyncs to make the tests go faster + """) + safekeeper = Safekeeper(env=self, id=id, port=port) self.safekeepers.append(safekeeper) log.info(f"Config: {toml}") self.zenith_cli.init(toml) + def start(self): # Start up the page server and all the safekeepers self.pageserver.start() @@ -625,7 +631,7 @@ def _shared_simple_env(request: Any, port_distributor) -> Iterator[ZenithEnv]: with ZenithEnvBuilder(Path(repo_dir), port_distributor) as builder: - env = builder.init() + env = builder.init_start() # For convenience in tests, create a branch from the freshly-initialized cluster. 
env.zenith_cli.create_branch("empty", "main") @@ -659,7 +665,7 @@ def zenith_env_builder(test_output_dir, port_distributor) -> Iterator[ZenithEnvB To use, define 'zenith_env_builder' fixture in your test to get access to the builder object. Set properties on it to describe the environment. Finally, initialize and start up the environment by calling - zenith_env_builder.init(). + zenith_env_builder.init_start(). After the initialization, you can launch compute nodes by calling the functions in the 'env.postgres' factory object, stop/start the @@ -847,8 +853,8 @@ class ZenithCli: return self.raw_cli(cmd) - def pageserver_start(self) -> 'subprocess.CompletedProcess[str]': - start_args = ['pageserver', 'start'] + def pageserver_start(self, overrides=()) -> 'subprocess.CompletedProcess[str]': + start_args = ['pageserver', 'start', *overrides] append_pageserver_param_overrides(start_args, self.env.pageserver.remote_storage, self.env.pageserver.config_override) @@ -862,17 +868,17 @@ class ZenithCli: log.info(f"Stopping pageserver with {cmd}") return self.raw_cli(cmd) - def safekeeper_start(self, name: str) -> 'subprocess.CompletedProcess[str]': - return self.raw_cli(['safekeeper', 'start', name]) + def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]': + return self.raw_cli(['safekeeper', 'start', str(id)]) def safekeeper_stop(self, - name: Optional[str] = None, + id: Optional[int] = None, immediate=False) -> 'subprocess.CompletedProcess[str]': args = ['safekeeper', 'stop'] + if id is not None: + args.extend(str(id)) if immediate: args.extend(['-m', 'immediate']) - if name is not None: - args.append(name) return self.raw_cli(args) def pg_create( @@ -1005,14 +1011,15 @@ class ZenithPageserver(PgProtocol): self.remote_storage = remote_storage self.config_override = config_override - def start(self) -> 'ZenithPageserver': + def start(self, overrides=()) -> 'ZenithPageserver': """ Start the page server. + `overrides` allows to add some config to this pageserver start. Returns self. """ assert self.running == False - self.env.zenith_cli.pageserver_start() + self.env.zenith_cli.pageserver_start(overrides=overrides) self.running = True return self @@ -1466,12 +1473,14 @@ class Safekeeper: """ An object representing a running safekeeper daemon. 
""" env: ZenithEnv port: SafekeeperPort - name: str # identifier for logging + id: int auth_token: Optional[str] = None + running: bool = False def start(self) -> 'Safekeeper': - self.env.zenith_cli.safekeeper_start(self.name) - + assert self.running == False + self.env.zenith_cli.safekeeper_start(self.id) + self.running = True # wait for wal acceptor start by checking its status started_at = time.time() while True: @@ -1489,8 +1498,9 @@ class Safekeeper: return self def stop(self, immediate=False) -> 'Safekeeper': - log.info('Stopping safekeeper {}'.format(self.name)) - self.env.zenith_cli.safekeeper_stop(self.name, immediate) + log.info('Stopping safekeeper {}'.format(self.id)) + self.env.zenith_cli.safekeeper_stop(self.id, immediate) + self.running = False return self def append_logical_message(self, diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index 6fd77f3020..0247385211 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -23,7 +23,7 @@ def test_bulk_tenant_create( """Measure tenant creation time (with and without wal acceptors)""" if use_wal_acceptors == 'with_wa': zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() time_slices = [] diff --git a/walkeeper/src/bin/safekeeper.rs b/walkeeper/src/bin/safekeeper.rs index ea5d0cba14..48de1481d4 100644 --- a/walkeeper/src/bin/safekeeper.rs +++ b/walkeeper/src/bin/safekeeper.rs @@ -1,17 +1,19 @@ // // Main entry point for the safekeeper executable // -use anyhow::{Context, Result}; +use anyhow::{bail, Context, Result}; use clap::{App, Arg}; use const_format::formatcp; use daemonize::Daemonize; use fs2::FileExt; -use std::fs::File; +use std::fs::{self, File}; +use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; use std::thread; use tracing::*; use walkeeper::control_file::{self, CreateControlFile}; use zenith_utils::http::endpoint; +use zenith_utils::zid::ZNodeId; use zenith_utils::{logging, tcp_listener, GIT_VERSION}; use tokio::sync::mpsc; @@ -25,6 +27,7 @@ use zenith_utils::shutdown::exit_now; use zenith_utils::signals; const LOCK_FILE_NAME: &str = "safekeeper.lock"; +const ID_FILE_NAME: &str = "safekeeper.id"; fn main() -> Result<()> { zenith_metrics::set_common_metrics_prefix("safekeeper"); @@ -38,6 +41,12 @@ fn main() -> Result<()> { .takes_value(true) .help("Path to the safekeeper data directory"), ) + .arg( + Arg::new("init") + .long("init") + .takes_value(false) + .help("Initialize safekeeper with ID"), + ) .arg( Arg::new("listen-pg") .short('l') @@ -93,6 +102,9 @@ fn main() -> Result<()> { .takes_value(true) .help("Dump control file at path specifed by this argument and exit"), ) + .arg( + Arg::new("id").long("id").takes_value(true).help("safekeeper node id: integer") + ) .get_matches(); if let Some(addr) = arg_matches.value_of("dump-control-file") { @@ -136,10 +148,19 @@ fn main() -> Result<()> { conf.recall_period = humantime::parse_duration(recall)?; } - start_safekeeper(conf) + let mut given_id = None; + if let Some(given_id_str) = arg_matches.value_of("id") { + given_id = Some(ZNodeId( + given_id_str + .parse() + .context("failed to parse safekeeper id")?, + )); + } + + start_safekeeper(conf, given_id, arg_matches.is_present("init")) } -fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { +fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { let log_file = 
logging::init("safekeeper.log", conf.daemonize)?; info!("version: {}", GIT_VERSION); @@ -154,6 +175,12 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { ) })?; + // Set or read our ID. + set_id(&mut conf, given_id)?; + if init { + return Ok(()); + } + let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| { error!("failed to bind to address {}: {}", conf.listen_http_addr, e); e @@ -260,3 +287,49 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { std::process::exit(111); }) } + +/// Determine safekeeper id and set it in config. +fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { + let id_file_path = conf.workdir.join(ID_FILE_NAME); + + let my_id: ZNodeId; + // If ID exists, read it in; otherwise set one passed + match fs::read(&id_file_path) { + Ok(id_serialized) => { + my_id = ZNodeId( + std::str::from_utf8(&id_serialized) + .context("failed to parse safekeeper id")? + .parse() + .context("failed to parse safekeeper id")?, + ); + if let Some(given_id) = given_id { + if given_id != my_id { + bail!( + "safekeeper already initialized with id {}, can't set {}", + my_id, + given_id + ); + } + } + info!("safekeeper ID {}", my_id); + } + Err(error) => match error.kind() { + ErrorKind::NotFound => { + my_id = if let Some(given_id) = given_id { + given_id + } else { + bail!("safekeeper id is not specified"); + }; + let mut f = File::create(&id_file_path)?; + f.write_all(my_id.to_string().as_bytes())?; + f.sync_all()?; + info!("initialized safekeeper ID {}", my_id); + } + _ => { + return Err(error.into()); + } + }, + } + conf.my_id = my_id; + Ok(()) +} diff --git a/walkeeper/src/http/routes.rs b/walkeeper/src/http/routes.rs index 11a29ac6d3..bc992c6a6f 100644 --- a/walkeeper/src/http/routes.rs +++ b/walkeeper/src/http/routes.rs @@ -5,6 +5,7 @@ use std::fmt::Display; use std::sync::Arc; use zenith_utils::http::{RequestExt, RouterBuilder}; use zenith_utils::lsn::Lsn; +use zenith_utils::zid::ZNodeId; use zenith_utils::zid::ZTenantTimelineId; use crate::control_file::CreateControlFile; @@ -18,9 +19,16 @@ use zenith_utils::http::json::json_response; use zenith_utils::http::request::parse_request_param; use zenith_utils::zid::{ZTenantId, ZTimelineId}; +#[derive(Debug, Serialize)] +struct SafekeeperStatus { + id: ZNodeId, +} + /// Healthcheck handler. -async fn status_handler(_: Request) -> Result, ApiError> { - Ok(json_response(StatusCode::OK, "")?) +async fn status_handler(request: Request) -> Result, ApiError> { + let conf = get_conf(&request); + let status = SafekeeperStatus { id: conf.my_id }; + Ok(json_response(StatusCode::OK, status)?) 
} fn get_conf(request: &Request) -> &SafeKeeperConf { diff --git a/walkeeper/src/lib.rs b/walkeeper/src/lib.rs index 6c3e0b264e..dfd71e4de2 100644 --- a/walkeeper/src/lib.rs +++ b/walkeeper/src/lib.rs @@ -2,7 +2,7 @@ use std::path::PathBuf; use std::time::Duration; -use zenith_utils::zid::ZTenantTimelineId; +use zenith_utils::zid::{ZNodeId, ZTenantTimelineId}; pub mod callmemaybe; pub mod control_file; @@ -46,6 +46,7 @@ pub struct SafeKeeperConf { pub listen_http_addr: String, pub ttl: Option, pub recall_period: Duration, + pub my_id: ZNodeId, } impl SafeKeeperConf { @@ -69,6 +70,7 @@ impl Default for SafeKeeperConf { listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), ttl: None, recall_period: defaults::DEFAULT_RECALL_PERIOD, + my_id: ZNodeId(0), } } } diff --git a/zenith/src/main.rs b/zenith/src/main.rs index a2a762f5be..5500d924ea 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -18,32 +18,35 @@ use walkeeper::defaults::{ }; use zenith_utils::auth::{Claims, Scope}; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; use zenith_utils::GIT_VERSION; use pageserver::branches::BranchInfo; -// Default name of a safekeeper node, if not specified on the command line. -const DEFAULT_SAFEKEEPER_NAME: &str = "single"; +// Default id of a safekeeper node, if not specified on the command line. +const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1); +const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1); fn default_conf() -> String { format!( r#" # Default built-in configuration, defined in main.rs [pageserver] +id = {pageserver_id} listen_pg_addr = '{pageserver_pg_addr}' listen_http_addr = '{pageserver_http_addr}' auth_type = '{pageserver_auth_type}' [[safekeepers]] -name = '{safekeeper_name}' +id = {safekeeper_id} pg_port = {safekeeper_pg_port} http_port = {safekeeper_http_port} "#, + pageserver_id = DEFAULT_PAGESERVER_ID, pageserver_pg_addr = DEFAULT_PAGESERVER_PG_ADDR, pageserver_http_addr = DEFAULT_PAGESERVER_HTTP_ADDR, pageserver_auth_type = AuthType::Trust, - safekeeper_name = DEFAULT_SAFEKEEPER_NAME, + safekeeper_id = DEFAULT_SAFEKEEPER_ID, safekeeper_pg_port = DEFAULT_SAFEKEEPER_PG_PORT, safekeeper_http_port = DEFAULT_SAFEKEEPER_HTTP_PORT, ) @@ -74,9 +77,9 @@ fn main() -> Result<()> { .required(true); #[rustfmt::skip] - let safekeeper_node_arg = Arg::new("node") + let safekeeper_id_arg = Arg::new("id") .index(1) - .help("Node name") + .help("safekeeper id") .required(false); let timeline_arg = Arg::new("timeline") @@ -154,16 +157,16 @@ fn main() -> Result<()> { .about("Manage safekeepers") .subcommand(App::new("start") .about("Start local safekeeper") - .arg(safekeeper_node_arg.clone()) + .arg(safekeeper_id_arg.clone()) ) .subcommand(App::new("stop") .about("Stop local safekeeper") - .arg(safekeeper_node_arg.clone()) + .arg(safekeeper_id_arg.clone()) .arg(stop_mode_arg.clone()) ) .subcommand(App::new("restart") .about("Restart local safekeeper") - .arg(safekeeper_node_arg.clone()) + .arg(safekeeper_id_arg.clone()) .arg(stop_mode_arg.clone()) ) ) @@ -628,11 +631,11 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn get_safekeeper(env: &local_env::LocalEnv, name: &str) -> Result { - if let Some(node) = env.safekeepers.iter().find(|node| node.name == name) { +fn get_safekeeper(env: &local_env::LocalEnv, id: ZNodeId) -> Result { + if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) { 
Ok(SafekeeperNode::from_env(env, node)) } else { - bail!("could not find safekeeper '{}'", name) + bail!("could not find safekeeper '{}'", id) } } @@ -643,8 +646,12 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul }; // All the commands take an optional safekeeper name argument - let node_name = sub_args.value_of("node").unwrap_or(DEFAULT_SAFEKEEPER_NAME); - let safekeeper = get_safekeeper(env, node_name)?; + let sk_id = if let Some(id_str) = sub_args.value_of("id") { + ZNodeId(id_str.parse().context("while parsing safekeeper id")?) + } else { + DEFAULT_SAFEKEEPER_ID + }; + let safekeeper = get_safekeeper(env, sk_id)?; match sub_name { "start" => { @@ -697,7 +704,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.start() { - eprintln!("safekeeper '{}' start failed: {}", safekeeper.name, e); + eprintln!("safekeeper '{}' start failed: {}", safekeeper.id, e); exit(1); } } @@ -724,7 +731,7 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result< for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.stop(immediate) { - eprintln!("safekeeper '{}' stop failed: {}", safekeeper.name, e); + eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e); } } Ok(()) diff --git a/zenith_utils/src/zid.rs b/zenith_utils/src/zid.rs index 2e93ab596c..7dfffd96d7 100644 --- a/zenith_utils/src/zid.rs +++ b/zenith_utils/src/zid.rs @@ -221,6 +221,18 @@ impl fmt::Display for ZTenantTimelineId { } } +// Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued +// by the console. +#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Debug, Serialize, Deserialize)] +#[serde(transparent)] +pub struct ZNodeId(pub u64); + +impl fmt::Display for ZNodeId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + #[cfg(test)] mod tests { use std::fmt::Display; From 9424bfae22d6a808371959c87aa1106701a34ad5 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 1 Mar 2022 22:34:42 +0200 Subject: [PATCH 0007/1022] Use a separate newtype for ZId that (de)serialize as hex strings --- control_plane/src/local_env.rs | 7 +- pageserver/src/branches.rs | 3 +- pageserver/src/http/routes.rs | 10 +- zenith/src/main.rs | 4 +- zenith_utils/src/auth.rs | 44 ++------ zenith_utils/src/zid.rs | 198 ++++++++++++++++++++++++++------- 6 files changed, 179 insertions(+), 87 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 55d0b00496..238c78821e 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -12,7 +12,7 @@ use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use zenith_utils::auth::{encode_from_key_file, Claims, Scope}; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{opt_display_serde, ZNodeId, ZTenantId}; +use zenith_utils::zid::{HexZTenantId, ZNodeId, ZTenantId}; use crate::safekeeper::SafekeeperNode; @@ -47,9 +47,8 @@ pub struct LocalEnv { // Default tenant ID to use with the 'zenith' command line utility, when // --tenantid is not explicitly specified. 
- #[serde(with = "opt_display_serde")] #[serde(default)] - pub default_tenantid: Option, + pub default_tenantid: Option, // used to issue tokens during e.g pg start #[serde(default)] @@ -185,7 +184,7 @@ impl LocalEnv { // If no initial tenant ID was given, generate it. if env.default_tenantid.is_none() { - env.default_tenantid = Some(ZTenantId::generate()); + env.default_tenantid = Some(HexZTenantId::from(ZTenantId::generate())); } env.base_data_dir = base_path(); diff --git a/pageserver/src/branches.rs b/pageserver/src/branches.rs index 8a411060de..43f27af5ea 100644 --- a/pageserver/src/branches.rs +++ b/pageserver/src/branches.rs @@ -16,10 +16,9 @@ use std::{ }; use tracing::*; -use zenith_utils::crashsafe_dir; -use zenith_utils::logging; use zenith_utils::lsn::Lsn; use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use zenith_utils::{crashsafe_dir, logging}; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 4fc41d6e82..26d473efaf 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -19,7 +19,8 @@ use zenith_utils::http::{ }; use zenith_utils::http::{RequestExt, RouterBuilder}; use zenith_utils::lsn::Lsn; -use zenith_utils::zid::{opt_display_serde, ZTimelineId}; +use zenith_utils::zid::HexZTimelineId; +use zenith_utils::zid::ZTimelineId; use super::models::BranchCreateRequest; use super::models::StatusResponse; @@ -198,8 +199,7 @@ enum TimelineInfo { timeline_id: ZTimelineId, #[serde(with = "hex")] tenant_id: ZTenantId, - #[serde(with = "opt_display_serde")] - ancestor_timeline_id: Option, + ancestor_timeline_id: Option, last_record_lsn: Lsn, prev_record_lsn: Lsn, disk_consistent_lsn: Lsn, @@ -232,7 +232,9 @@ async fn timeline_detail_handler(request: Request) -> Result TimelineInfo::Local { timeline_id, tenant_id, - ancestor_timeline_id: timeline.get_ancestor_timeline_id(), + ancestor_timeline_id: timeline + .get_ancestor_timeline_id() + .map(HexZTimelineId::from), disk_consistent_lsn: timeline.get_disk_consistent_lsn(), last_record_lsn: timeline.get_last_record_lsn(), prev_record_lsn: timeline.get_prev_record_lsn(), diff --git a/zenith/src/main.rs b/zenith/src/main.rs index 5500d924ea..bc42af5943 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -392,7 +392,7 @@ fn get_tenantid(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result Result<()> { let pageserver = PageServerNode::from_env(&env); if let Err(e) = pageserver.init( // default_tenantid was generated by the `env.init()` call above - Some(&env.default_tenantid.unwrap().to_string()), + Some(&ZTenantId::from(env.default_tenantid.unwrap()).to_string()), &pageserver_config_overrides(init_match), ) { eprintln!("pageserver init failed: {}", e); diff --git a/zenith_utils/src/auth.rs b/zenith_utils/src/auth.rs index 274dd13bee..cbc4fcee61 100644 --- a/zenith_utils/src/auth.rs +++ b/zenith_utils/src/auth.rs @@ -5,9 +5,7 @@ // The second one is that we wanted to use ed25519 keys, but they are also not supported until next version. So we go with RSA keys for now. 
// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162 -use hex::{self, FromHex}; -use serde::de::Error; -use serde::{self, Deserializer, Serializer}; +use serde; use std::fs; use std::path::Path; @@ -17,7 +15,7 @@ use jsonwebtoken::{ }; use serde::{Deserialize, Serialize}; -use crate::zid::ZTenantId; +use crate::zid::{HexZTenantId, ZTenantId}; const JWT_ALGORITHM: Algorithm = Algorithm::RS256; @@ -28,44 +26,18 @@ pub enum Scope { PageServerApi, } -pub fn to_hex_option(value: &Option, serializer: S) -> Result -where - S: Serializer, -{ - match value { - Some(tid) => hex::serialize(tid, serializer), - None => Option::serialize(value, serializer), - } -} - -fn from_hex_option<'de, D>(deserializer: D) -> Result, D::Error> -where - D: Deserializer<'de>, -{ - let opt: Option = Option::deserialize(deserializer)?; - match opt { - Some(tid) => Ok(Some(ZTenantId::from_hex(tid).map_err(Error::custom)?)), - None => Ok(None), - } -} - #[derive(Debug, Serialize, Deserialize, Clone)] pub struct Claims { - // this custom serialize/deserialize_with is needed because Option is not transparent to serde - // so clearest option is serde(with = "hex") but it is not working, for details see https://github.com/serde-rs/serde/issues/1301 - #[serde( - default, - skip_serializing_if = "Option::is_none", - serialize_with = "to_hex_option", - deserialize_with = "from_hex_option" - )] - pub tenant_id: Option, + pub tenant_id: Option, pub scope: Scope, } impl Claims { pub fn new(tenant_id: Option, scope: Scope) -> Self { - Self { tenant_id, scope } + Self { + tenant_id: tenant_id.map(HexZTenantId::from), + scope, + } } } @@ -75,7 +47,7 @@ pub fn check_permission(claims: &Claims, tenantid: Option) -> Result< bail!("Attempt to access management api with tenant scope. Permission denied") } (Scope::Tenant, Some(tenantid)) => { - if claims.tenant_id.unwrap() != tenantid { + if ZTenantId::from(claims.tenant_id.unwrap()) != tenantid { bail!("Tenant id mismatch. Permission denied") } Ok(()) diff --git a/zenith_utils/src/zid.rs b/zenith_utils/src/zid.rs index 7dfffd96d7..813eb3f8f4 100644 --- a/zenith_utils/src/zid.rs +++ b/zenith_utils/src/zid.rs @@ -2,13 +2,100 @@ use std::{fmt, str::FromStr}; use hex::FromHex; use rand::Rng; -use serde::{Deserialize, Serialize}; +use serde::{ + de::{self, Visitor}, + Deserialize, Serialize, +}; -// Zenith ID is a 128-bit random ID. -// Used to represent various identifiers. Provides handy utility methods and impls. +macro_rules! mutual_from { + ($id1:ident, $id2:ident) => { + impl From<$id1> for $id2 { + fn from(id1: $id1) -> Self { + Self(id1.0.into()) + } + } + + impl From<$id2> for $id1 { + fn from(id2: $id2) -> Self { + Self(id2.0.into()) + } + } + }; +} + +/// Zenith ID is a 128-bit random ID. +/// Used to represent various identifiers. Provides handy utility methods and impls. +/// +/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look +/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. +/// Use [`HexZId`] to serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] struct ZId([u8; 16]); +/// [`ZId`] version that serializes and deserializes as a hex string. +/// Useful for various json serializations, where hex byte array from original id is not convenient. +/// +/// Plain `ZId` could be (de)serialized into hex string with `#[serde(with = "hex")]` attribute. 
+/// This however won't work on nested types like `Option` or `Vec`, see https://github.com/serde-rs/serde/issues/723 for the details. +/// Every separate type currently needs a new (de)serializing method for every type separately. +/// +/// To provide a generic way to serialize the ZId as a hex string where `#[serde(with = "hex")]` is not enough, this wrapper is created. +/// The default wrapper serialization is left unchanged due to +/// * byte array (de)serialization being faster and simpler +/// * byte deserialization being used in Safekeeper already, with those bytes coming from compute (see `ProposerGreeting` in safekeeper) +/// * current `HexZId`'s deserialization impl breaks on compute byte array deserialization, having it by default is dangerous +#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +struct HexZId([u8; 16]); + +impl Serialize for HexZId { + fn serialize(&self, ser: S) -> Result + where + S: serde::Serializer, + { + hex::encode(self.0).serialize(ser) + } +} + +impl<'de> Deserialize<'de> for HexZId { + fn deserialize(de: D) -> Result + where + D: serde::Deserializer<'de>, + { + de.deserialize_bytes(HexVisitor) + } +} + +struct HexVisitor; + +impl<'de> Visitor<'de> for HexVisitor { + type Value = HexZId; + + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "A hexadecimal representation of a 128-bit random Zenith ID" + ) + } + + fn visit_bytes(self, hex_bytes: &[u8]) -> Result + where + E: de::Error, + { + ZId::from_hex(hex_bytes) + .map(HexZId::from) + .map_err(de::Error::custom) + } + + fn visit_str(self, hex_bytes_str: &str) -> Result + where + E: de::Error, + { + Self::visit_bytes(self, hex_bytes_str.as_bytes()) + } +} + +mutual_from!(ZId, HexZId); + impl ZId { pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZId { let mut arr = [0u8; 16]; @@ -155,46 +242,80 @@ macro_rules! zid_newtype { /// is separate from PostgreSQL timelines, and doesn't have those /// limitations. A zenith timeline is identified by a 128-bit ID, which /// is usually printed out as a hex string. +/// +/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look +/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. +/// Use [`HexZTimelineId`] to serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. #[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] pub struct ZTimelineId(ZId); -zid_newtype!(ZTimelineId); +/// A [`ZTimelineId`] version that gets (de)serialized as a hex string. +/// Use in complex types, where `#[serde(with = "hex")]` does not work. +/// See [`HexZId`] for more details. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] +pub struct HexZTimelineId(HexZId); -// Zenith Tenant Id represents identifiar of a particular tenant. -// Is used for distinguishing requests and data belonging to different users. +impl std::fmt::Debug for HexZTimelineId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + ZTimelineId::from(*self).fmt(f) + } +} + +impl std::fmt::Display for HexZTimelineId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + ZTimelineId::from(*self).fmt(f) + } +} + +impl FromStr for HexZTimelineId { + type Err = ::Err; + + fn from_str(s: &str) -> Result { + Ok(HexZTimelineId::from(ZTimelineId::from_str(s)?)) + } +} + +zid_newtype!(ZTimelineId); +mutual_from!(ZTimelineId, HexZTimelineId); + +/// Zenith Tenant Id represents identifiar of a particular tenant. 
+/// Is used for distinguishing requests and data belonging to different users. +/// +/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look +/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. +/// Use [`HexZTenantId`] to serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] pub struct ZTenantId(ZId); -zid_newtype!(ZTenantId); +/// A [`ZTenantId`] version that gets (de)serialized as a hex string. +/// Use in complex types, where `#[serde(with = "hex")]` does not work. +/// See [`HexZId`] for more details. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] +pub struct HexZTenantId(HexZId); -/// Serde routines for Option (de)serialization, using `T:Display` representations for inner values. -/// Useful for Option and Option to get their hex representations into serialized string and deserialize them back. -pub mod opt_display_serde { - use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; - use std::{fmt::Display, str::FromStr}; - - pub fn serialize(id: &Option, ser: S) -> Result - where - S: Serializer, - Id: Display, - { - id.as_ref().map(ToString::to_string).serialize(ser) - } - - pub fn deserialize<'de, D, Id>(des: D) -> Result, D::Error> - where - D: Deserializer<'de>, - Id: FromStr, - ::Err: Display, - { - Ok(if let Some(s) = Option::::deserialize(des)? { - Some(Id::from_str(&s).map_err(de::Error::custom)?) - } else { - None - }) +impl std::fmt::Debug for HexZTenantId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + ZTenantId::from(*self).fmt(f) } } +impl std::fmt::Display for HexZTenantId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + ZTenantId::from(*self).fmt(f) + } +} + +impl FromStr for HexZTenantId { + type Err = ::Err; + + fn from_str(s: &str) -> Result { + Ok(HexZTenantId::from(ZTenantId::from_str(s)?)) + } +} + +zid_newtype!(ZTenantId); +mutual_from!(ZTenantId, HexZTenantId); + // A pair uniquely identifying Zenith instance. 
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)] pub struct ZTenantTimelineId { @@ -243,16 +364,15 @@ mod tests { #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] struct TestStruct + Display> { - #[serde(with = "opt_display_serde")] field: Option, } #[test] fn test_hex_serializations_tenant_id() { let original_struct = TestStruct { - field: Some(ZTenantId::from_array(hex!( + field: Some(HexZTenantId::from(ZTenantId::from_array(hex!( "11223344556677881122334455667788" - ))), + )))), }; let serialized_string = serde_json::to_string(&original_struct).unwrap(); @@ -261,7 +381,7 @@ mod tests { r#"{"field":"11223344556677881122334455667788"}"# ); - let deserialized_struct: TestStruct = + let deserialized_struct: TestStruct = serde_json::from_str(&serialized_string).unwrap(); assert_eq!(original_struct, deserialized_struct); } @@ -269,9 +389,9 @@ mod tests { #[test] fn test_hex_serializations_timeline_id() { let original_struct = TestStruct { - field: Some(ZTimelineId::from_array(hex!( + field: Some(HexZTimelineId::from(ZTimelineId::from_array(hex!( "AA223344556677881122334455667788" - ))), + )))), }; let serialized_string = serde_json::to_string(&original_struct).unwrap(); @@ -280,7 +400,7 @@ mod tests { r#"{"field":"aa223344556677881122334455667788"}"# ); - let deserialized_struct: TestStruct = + let deserialized_struct: TestStruct = serde_json::from_str(&serialized_string).unwrap(); assert_eq!(original_struct, deserialized_struct); } From 66eb2a1dd32403405414e0986c457588d8d45609 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 2 Mar 2022 13:27:39 +0200 Subject: [PATCH 0008/1022] Replace zenith/build build image with zimg/* ones --- Dockerfile | 6 +++--- Dockerfile.build | 16 ---------------- docs/docker.md | 24 +++--------------------- 3 files changed, 6 insertions(+), 40 deletions(-) delete mode 100644 Dockerfile.build diff --git a/Dockerfile b/Dockerfile index dd0dba60ca..18abae5327 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,7 @@ # Build Postgres separately --- this layer will be rebuilt only if one of # mentioned paths will get any changes. # -FROM zenithdb/build:buster AS pg-build +FROM zimg/rust:1.56 AS pg-build WORKDIR /zenith COPY ./vendor/postgres vendor/postgres COPY ./Makefile Makefile @@ -20,7 +20,7 @@ RUN rm -rf postgres_install/build # TODO: build cargo deps as separate layer. We used cargo-chef before but that was # net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work. # -FROM zenithdb/build:buster AS build +FROM zimg/rust:1.56 AS build ARG GIT_VERSION RUN if [ -z "$GIT_VERSION" ]; then echo "GIT_VERSION is reqired, use build_arg to pass it"; exit 1; fi @@ -34,7 +34,7 @@ RUN GIT_VERSION=$GIT_VERSION cargo build --release # # Copy binaries to resulting image. # -FROM debian:buster-slim +FROM debian:bullseye-slim WORKDIR /data RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl ca-certificates && \ diff --git a/Dockerfile.build b/Dockerfile.build deleted file mode 100644 index a9fd2cb0af..0000000000 --- a/Dockerfile.build +++ /dev/null @@ -1,16 +0,0 @@ -# -# Image with all the required dependencies to build https://github.com/zenithdb/zenith -# and Postgres from https://github.com/zenithdb/postgres -# Also includes some rust development and build tools. 
-# NB: keep in sync with rust image version in .circle/config.yml -# -FROM rust:1.56.1-slim-buster -WORKDIR /zenith - -# Install postgres and zenith build dependencies -# clang is for rocksdb -RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libseccomp-dev pkg-config libssl-dev clang - -# Install rust tools -RUN rustup component add clippy && cargo install cargo-audit diff --git a/docs/docker.md b/docs/docker.md index 14ba2146cb..cc54d012dd 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -7,32 +7,14 @@ Currently we build two main images: - [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). - [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres). -And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos: +And additional intermediate images: -- [zenithdb/build](https://hub.docker.com/repository/docker/zenithdb/build) — image with all the dependencies required to build Zenith and compute node images. This image is based on `rust:slim-buster`, so it also has a proper `rust` environment. Built from [/Dockerfile.build](/Dockerfile.build). - [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools. ## Building pipeline 1. Image `zenithdb/compute-tools` is re-built automatically. -2. Image `zenithdb/build` is built manually. If you want to introduce any new compile time dependencies to Zenith or compute node you have to update this image as well, build it and push to Docker Hub. +2. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo. -Build: -```sh -docker build -t zenithdb/build:buster -f Dockerfile.build . -``` - -Login: -```sh -docker login -``` - -Push to Docker Hub: -```sh -docker push zenithdb/build:buster -``` - -3. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo. - -4. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically. +3. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically. From f86cf93435133ee11f8c4bc53b1470e2dada3ce0 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 15 Feb 2022 20:10:10 +0300 Subject: [PATCH 0009/1022] Refactor timeline creation on safekeepers, allowing storing peer ids. Have separate routine and http endpoint to create timeline on safekeepers. It is not used yet, i.e. timeline is still created implicitly, but we'll change that once infrastructure for learning which tlis are assigned to which safekeepers will be ready, preventing accidental creation by compute. Changes format of safekeeper control file, allowing to store set of peers. Knowing peers provides a part of foundation for peer recovery (calculating min horizons like truncate_lsn for WAL truncation and commit_lsn for sync-safekeepers replacement) and proper membership change; similarly, we don't yet use it for now. 
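For illustration only (not part of this patch): once peer information is actually
populated, a horizon could be derived from the stored peer set roughly as in the
sketch below, using the Peers/PeerInfo/Lsn types introduced here. The field access
and the choice of reduction are assumptions, not the eventual implementation.

    // Hypothetical sketch: take the minimum commit_lsn reported by known peers.
    // Returns None while no peer info has been recorded yet.
    fn min_peer_commit_lsn(peers: &Peers) -> Option<Lsn> {
        peers.0.iter().map(|(_, info)| info.commit_lsn).min()
    }

Whether such a reduction should run over commit_lsn or flush_lsn, and how stale
entries are treated, is intentionally left open at this point.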
Employing cf file version bump, extracts tenant_id and timeline_id to top level where it is more suitable. Also adds a bunch of LSNs there and rename truncate_lsn to more specific peer_horizon_lsn. --- Cargo.lock | 1 + control_plane/Cargo.toml | 1 + control_plane/src/safekeeper.rs | 24 ++++- walkeeper/src/bin/safekeeper.rs | 7 +- walkeeper/src/control_file.rs | 104 ++++++++---------- walkeeper/src/control_file_upgrade.rs | 82 +++++++++++++-- walkeeper/src/handler.rs | 38 +++---- walkeeper/src/http/mod.rs | 1 + walkeeper/src/http/models.rs | 9 ++ walkeeper/src/http/routes.rs | 32 +++++- walkeeper/src/safekeeper.rs | 145 ++++++++++++++++++-------- walkeeper/src/timeline.rs | 128 ++++++++++++++++------- walkeeper/src/wal_storage.rs | 23 ++-- zenith_utils/src/zid.rs | 4 + 14 files changed, 404 insertions(+), 195 deletions(-) create mode 100644 walkeeper/src/http/models.rs diff --git a/Cargo.lock b/Cargo.lock index ba3c6729d6..ad38a41d91 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -424,6 +424,7 @@ dependencies = [ "thiserror", "toml", "url", + "walkeeper", "workspace_hack", "zenith_utils", ] diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 5e972200c2..eff6b3ef2d 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -17,5 +17,6 @@ url = "2.2.2" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } pageserver = { path = "../pageserver" } +walkeeper = { path = "../walkeeper" } zenith_utils = { path = "../zenith_utils" } workspace_hack = { path = "../workspace_hack" } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 351d1efbbc..969e2cd531 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -14,8 +14,9 @@ use postgres::Config; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; +use walkeeper::http::models::TimelineCreateRequest; use zenith_utils::http::error::HttpErrorBody; -use zenith_utils::zid::ZNodeId; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; use crate::local_env::{LocalEnv, SafekeeperConf}; use crate::storage::PageServerNode; @@ -261,4 +262,25 @@ impl SafekeeperNode { .error_from_body()?; Ok(()) } + + pub fn timeline_create( + &self, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + peer_ids: Vec, + ) -> Result<()> { + Ok(self + .http_request( + Method::POST, + format!("{}/{}", self.http_base_url, "timeline"), + ) + .json(&TimelineCreateRequest { + tenant_id, + timeline_id, + peer_ids, + }) + .send()? + .error_from_body()? + .json()?) 
+ } } diff --git a/walkeeper/src/bin/safekeeper.rs b/walkeeper/src/bin/safekeeper.rs index 48de1481d4..6c45115e5f 100644 --- a/walkeeper/src/bin/safekeeper.rs +++ b/walkeeper/src/bin/safekeeper.rs @@ -11,7 +11,7 @@ use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; use std::thread; use tracing::*; -use walkeeper::control_file::{self, CreateControlFile}; +use walkeeper::control_file::{self}; use zenith_utils::http::endpoint; use zenith_utils::zid::ZNodeId; use zenith_utils::{logging, tcp_listener, GIT_VERSION}; @@ -108,10 +108,7 @@ fn main() -> Result<()> { .get_matches(); if let Some(addr) = arg_matches.value_of("dump-control-file") { - let state = control_file::FileStorage::load_control_file( - Path::new(addr), - CreateControlFile::False, - )?; + let state = control_file::FileStorage::load_control_file(Path::new(addr))?; let json = serde_json::to_string(&state)?; print!("{}", json); return Ok(()); diff --git a/walkeeper/src/control_file.rs b/walkeeper/src/control_file.rs index 6016e00d1d..8b4e618661 100644 --- a/walkeeper/src/control_file.rs +++ b/walkeeper/src/control_file.rs @@ -27,13 +27,6 @@ const CONTROL_FILE_NAME: &str = "safekeeper.control"; const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); -// A named boolean. -#[derive(Debug)] -pub enum CreateControlFile { - True, - False, -} - lazy_static! { static ref PERSIST_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!( "safekeeper_persist_control_file_seconds", @@ -94,28 +87,22 @@ impl FileStorage { pub fn load_control_file_conf( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, - create: CreateControlFile, ) -> Result { let path = conf.timeline_dir(zttid).join(CONTROL_FILE_NAME); - Self::load_control_file(path, create) + Self::load_control_file(path) } /// Read in the control file. /// If create=false and file doesn't exist, bails out. - pub fn load_control_file>( - control_file_path: P, - create: CreateControlFile, - ) -> Result { + pub fn load_control_file>(control_file_path: P) -> Result { info!( - "loading control file {}, create={:?}", + "loading control file {}", control_file_path.as_ref().display(), - create, ); let mut control_file = OpenOptions::new() .read(true) .write(true) - .create(matches!(create, CreateControlFile::True)) .open(&control_file_path) .with_context(|| { format!( @@ -124,41 +111,32 @@ impl FileStorage { ) })?; - // Empty file is legit on 'create', don't try to deser from it. 
- let state = if control_file.metadata().unwrap().len() == 0 { - if let CreateControlFile::False = create { - bail!("control file is empty"); - } - SafeKeeperState::new() - } else { - let mut buf = Vec::new(); - control_file - .read_to_end(&mut buf) - .context("failed to read control file")?; + let mut buf = Vec::new(); + control_file + .read_to_end(&mut buf) + .context("failed to read control file")?; - let calculated_checksum = crc32c::crc32c(&buf[..buf.len() - CHECKSUM_SIZE]); + let calculated_checksum = crc32c::crc32c(&buf[..buf.len() - CHECKSUM_SIZE]); - let expected_checksum_bytes: &[u8; CHECKSUM_SIZE] = - buf[buf.len() - CHECKSUM_SIZE..].try_into()?; - let expected_checksum = u32::from_le_bytes(*expected_checksum_bytes); + let expected_checksum_bytes: &[u8; CHECKSUM_SIZE] = + buf[buf.len() - CHECKSUM_SIZE..].try_into()?; + let expected_checksum = u32::from_le_bytes(*expected_checksum_bytes); - ensure!( - calculated_checksum == expected_checksum, + ensure!( + calculated_checksum == expected_checksum, + format!( + "safekeeper control file checksum mismatch: expected {} got {}", + expected_checksum, calculated_checksum + ) + ); + + let state = FileStorage::deser_sk_state(&mut &buf[..buf.len() - CHECKSUM_SIZE]) + .with_context(|| { format!( - "safekeeper control file checksum mismatch: expected {} got {}", - expected_checksum, calculated_checksum + "while reading control file {}", + control_file_path.as_ref().display(), ) - ); - - FileStorage::deser_sk_state(&mut &buf[..buf.len() - CHECKSUM_SIZE]).with_context( - || { - format!( - "while reading control file {}", - control_file_path.as_ref().display(), - ) - }, - )? - }; + })?; Ok(state) } } @@ -247,31 +225,38 @@ mod test { fn load_from_control_file( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, - create: CreateControlFile, ) -> Result<(FileStorage, SafeKeeperState)> { fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); Ok(( FileStorage::new(zttid, conf), - FileStorage::load_control_file_conf(conf, zttid, create)?, + FileStorage::load_control_file_conf(conf, zttid)?, )) } + fn create( + conf: &SafeKeeperConf, + zttid: &ZTenantTimelineId, + ) -> Result<(FileStorage, SafeKeeperState)> { + fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); + let state = SafeKeeperState::empty(); + let mut storage = FileStorage::new(zttid, conf); + storage.persist(&state)?; + Ok((storage, state)) + } + #[test] fn test_read_write_safekeeper_state() { let conf = stub_conf(); let zttid = ZTenantTimelineId::generate(); { - let (mut storage, mut state) = - load_from_control_file(&conf, &zttid, CreateControlFile::True) - .expect("failed to read state"); + let (mut storage, mut state) = create(&conf, &zttid).expect("failed to create state"); // change something - state.wal_start_lsn = Lsn(42); + state.commit_lsn = Lsn(42); storage.persist(&state).expect("failed to persist state"); } - let (_, state) = load_from_control_file(&conf, &zttid, CreateControlFile::False) - .expect("failed to read state"); - assert_eq!(state.wal_start_lsn, Lsn(42)); + let (_, state) = load_from_control_file(&conf, &zttid).expect("failed to read state"); + assert_eq!(state.commit_lsn, Lsn(42)); } #[test] @@ -279,11 +264,10 @@ mod test { let conf = stub_conf(); let zttid = ZTenantTimelineId::generate(); { - let (mut storage, mut state) = - load_from_control_file(&conf, &zttid, CreateControlFile::True) - .expect("failed to read state"); + let (mut storage, mut state) = create(&conf, &zttid).expect("failed to read 
state"); + // change something - state.wal_start_lsn = Lsn(42); + state.commit_lsn = Lsn(42); storage.persist(&state).expect("failed to persist state"); } let control_path = conf.timeline_dir(&zttid).join(CONTROL_FILE_NAME); @@ -291,7 +275,7 @@ mod test { data[0] += 1; // change the first byte of the file to fail checksum validation fs::write(&control_path, &data).expect("failed to write control file"); - match load_from_control_file(&conf, &zttid, CreateControlFile::False) { + match load_from_control_file(&conf, &zttid) { Err(err) => assert!(err .to_string() .contains("safekeeper control file checksum mismatch")), diff --git a/walkeeper/src/control_file_upgrade.rs b/walkeeper/src/control_file_upgrade.rs index 913bd02c1e..9effe42f8d 100644 --- a/walkeeper/src/control_file_upgrade.rs +++ b/walkeeper/src/control_file_upgrade.rs @@ -1,6 +1,6 @@ //! Code to deal with safekeeper control file upgrades use crate::safekeeper::{ - AcceptorState, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry, + AcceptorState, Peers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry, }; use anyhow::{bail, Result}; use serde::{Deserialize, Serialize}; @@ -26,7 +26,7 @@ struct SafeKeeperStateV1 { /// persistent acceptor state acceptor_state: AcceptorStateV1, /// information about server - server: ServerInfo, + server: ServerInfoV2, /// Unique id of the last *elected* proposer we dealed with. Not needed /// for correctness, exists for monitoring purposes. proposer_uuid: PgUuid, @@ -70,6 +70,39 @@ pub struct SafeKeeperStateV2 { pub wal_start_lsn: Lsn, } +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct ServerInfoV3 { + /// Postgres server version + pub pg_version: u32, + pub system_id: SystemId, + #[serde(with = "hex")] + pub tenant_id: ZTenantId, + /// Zenith timelineid + #[serde(with = "hex")] + pub timeline_id: ZTimelineId, + pub wal_seg_size: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SafeKeeperStateV3 { + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfoV3, + /// Unique id of the last *elected* proposer we dealed with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// part of WAL acknowledged by quorum and available locally + pub commit_lsn: Lsn, + /// minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone) + pub truncate_lsn: Lsn, + // Safekeeper starts receiving WAL from this LSN, zeros before it ought to + // be skipped during decoding. 
+ pub wal_start_lsn: Lsn, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -83,12 +116,20 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result }]), }; return Ok(SafeKeeperState { + tenant_id: oldstate.server.tenant_id, + timeline_id: oldstate.server.ztli, acceptor_state: ac, - server: oldstate.server.clone(), + server: ServerInfo { + pg_version: oldstate.server.pg_version, + system_id: oldstate.server.system_id, + wal_seg_size: oldstate.server.wal_seg_size, + }, proposer_uuid: oldstate.proposer_uuid, commit_lsn: oldstate.commit_lsn, - truncate_lsn: oldstate.truncate_lsn, - wal_start_lsn: oldstate.wal_start_lsn, + s3_wal_lsn: Lsn(0), + peer_horizon_lsn: oldstate.truncate_lsn, + remote_consistent_lsn: Lsn(0), + peers: Peers(vec![]), }); // migrate to hexing some zids } else if version == 2 { @@ -97,17 +138,40 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result let server = ServerInfo { pg_version: oldstate.server.pg_version, system_id: oldstate.server.system_id, - tenant_id: oldstate.server.tenant_id, - timeline_id: oldstate.server.ztli, wal_seg_size: oldstate.server.wal_seg_size, }; return Ok(SafeKeeperState { + tenant_id: oldstate.server.tenant_id, + timeline_id: oldstate.server.ztli, acceptor_state: oldstate.acceptor_state, server, proposer_uuid: oldstate.proposer_uuid, commit_lsn: oldstate.commit_lsn, - truncate_lsn: oldstate.truncate_lsn, - wal_start_lsn: oldstate.wal_start_lsn, + s3_wal_lsn: Lsn(0), + peer_horizon_lsn: oldstate.truncate_lsn, + remote_consistent_lsn: Lsn(0), + peers: Peers(vec![]), + }); + // migrate to moving ztenantid/ztli to the top and adding some lsns + } else if version == 3 { + info!("reading safekeeper control file version {}", version); + let oldstate = SafeKeeperStateV3::des(&buf[..buf.len()])?; + let server = ServerInfo { + pg_version: oldstate.server.pg_version, + system_id: oldstate.server.system_id, + wal_seg_size: oldstate.server.wal_seg_size, + }; + return Ok(SafeKeeperState { + tenant_id: oldstate.server.tenant_id, + timeline_id: oldstate.server.timeline_id, + acceptor_state: oldstate.acceptor_state, + server, + proposer_uuid: oldstate.proposer_uuid, + commit_lsn: oldstate.commit_lsn, + s3_wal_lsn: Lsn(0), + peer_horizon_lsn: oldstate.truncate_lsn, + remote_consistent_lsn: Lsn(0), + peers: Peers(vec![]), }); } bail!("unsupported safekeeper control file version {}", version) diff --git a/walkeeper/src/handler.rs b/walkeeper/src/handler.rs index d1ead5cb37..ead6fab9fb 100644 --- a/walkeeper/src/handler.rs +++ b/walkeeper/src/handler.rs @@ -13,6 +13,7 @@ use postgres_ffi::xlog_utils::PG_TLI; use regex::Regex; use std::str::FromStr; use std::sync::Arc; +use tracing::info; use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend; use zenith_utils::postgres_backend::PostgresBackend; @@ -20,7 +21,6 @@ use zenith_utils::pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; use crate::callmemaybe::CallmeEvent; -use crate::control_file::CreateControlFile; use tokio::sync::mpsc::UnboundedSender; /// Safekeeper handler of postgres commands @@ -101,29 +101,19 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> { let cmd = parse_cmd(query_string)?; - // Is this command is ztimeline scoped? - match cmd { - SafekeeperPostgresCommand::StartWalPush { .. 
} - | SafekeeperPostgresCommand::StartReplication { .. } - | SafekeeperPostgresCommand::IdentifySystem - | SafekeeperPostgresCommand::JSONCtrl { .. } => { - let tenantid = self.ztenantid.context("tenantid is required")?; - let timelineid = self.ztimelineid.context("timelineid is required")?; - if self.timeline.is_none() { - // START_WAL_PUSH is the only command that initializes the timeline in production. - // There is also JSON_CTRL command, which should initialize the timeline for testing. - let create_control_file = match cmd { - SafekeeperPostgresCommand::StartWalPush { .. } - | SafekeeperPostgresCommand::JSONCtrl { .. } => CreateControlFile::True, - _ => CreateControlFile::False, - }; - self.timeline.set( - &self.conf, - ZTenantTimelineId::new(tenantid, timelineid), - create_control_file, - )?; - } - } + info!("got query {:?}", query_string); + + let create = !(matches!(cmd, SafekeeperPostgresCommand::StartReplication { .. }) + || matches!(cmd, SafekeeperPostgresCommand::IdentifySystem)); + + let tenantid = self.ztenantid.context("tenantid is required")?; + let timelineid = self.ztimelineid.context("timelineid is required")?; + if self.timeline.is_none() { + self.timeline.set( + &self.conf, + ZTenantTimelineId::new(tenantid, timelineid), + create, + )?; } match cmd { diff --git a/walkeeper/src/http/mod.rs b/walkeeper/src/http/mod.rs index c82d1c0362..4c0be17ecd 100644 --- a/walkeeper/src/http/mod.rs +++ b/walkeeper/src/http/mod.rs @@ -1,2 +1,3 @@ +pub mod models; pub mod routes; pub use routes::make_router; diff --git a/walkeeper/src/http/models.rs b/walkeeper/src/http/models.rs new file mode 100644 index 0000000000..8a6ed7a812 --- /dev/null +++ b/walkeeper/src/http/models.rs @@ -0,0 +1,9 @@ +use serde::{Deserialize, Serialize}; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; + +#[derive(Serialize, Deserialize)] +pub struct TimelineCreateRequest { + pub tenant_id: ZTenantId, + pub timeline_id: ZTimelineId, + pub peer_ids: Vec, +} diff --git a/walkeeper/src/http/routes.rs b/walkeeper/src/http/routes.rs index bc992c6a6f..74f7f4a735 100644 --- a/walkeeper/src/http/routes.rs +++ b/walkeeper/src/http/routes.rs @@ -1,14 +1,15 @@ use hyper::{Body, Request, Response, StatusCode}; + use serde::Serialize; use serde::Serializer; use std::fmt::Display; use std::sync::Arc; +use zenith_utils::http::json::json_request; use zenith_utils::http::{RequestExt, RouterBuilder}; use zenith_utils::lsn::Lsn; use zenith_utils::zid::ZNodeId; use zenith_utils::zid::ZTenantTimelineId; -use crate::control_file::CreateControlFile; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; use crate::timeline::GlobalTimelines; @@ -19,6 +20,8 @@ use zenith_utils::http::json::json_response; use zenith_utils::http::request::parse_request_param; use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use super::models::TimelineCreateRequest; + #[derive(Debug, Serialize)] struct SafekeeperStatus { id: ZNodeId, @@ -66,7 +69,11 @@ struct TimelineStatus { #[serde(serialize_with = "display_serialize")] commit_lsn: Lsn, #[serde(serialize_with = "display_serialize")] - truncate_lsn: Lsn, + s3_wal_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + peer_horizon_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + remote_consistent_lsn: Lsn, #[serde(serialize_with = "display_serialize")] flush_lsn: Lsn, } @@ -78,8 +85,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { + let request_data: TimelineCreateRequest = json_request(&mut request).await?; + + 
let zttid = ZTenantTimelineId { + tenant_id: request_data.tenant_id, + timeline_id: request_data.timeline_id, + }; + GlobalTimelines::create(get_conf(&request), zttid, request_data.peer_ids) + .map_err(ApiError::from_err)?; + + Ok(json_response(StatusCode::CREATED, ())?) +} + /// Safekeeper http router. pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder { let router = endpoint::make_router(); @@ -110,4 +131,5 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/timeline/:tenant_id/:timeline_id", timeline_status_handler, ) + .post("/v1/timeline", timeline_create_handler) } diff --git a/walkeeper/src/safekeeper.rs b/walkeeper/src/safekeeper.rs index fa624bb18f..f8b12530d8 100644 --- a/walkeeper/src/safekeeper.rs +++ b/walkeeper/src/safekeeper.rs @@ -10,6 +10,8 @@ use std::cmp::min; use std::fmt; use std::io::Read; use tracing::*; +use zenith_utils::zid::ZNodeId; +use zenith_utils::zid::ZTenantTimelineId; use lazy_static::lazy_static; @@ -25,12 +27,13 @@ use zenith_utils::pq_proto::ZenithFeedback; use zenith_utils::zid::{ZTenantId, ZTimelineId}; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 3; +pub const SK_FORMAT_VERSION: u32 = 4; const SK_PROTOCOL_VERSION: u32 = 1; const UNKNOWN_SERVER_VERSION: u32 = 0; /// Consensus logical timestamp. pub type Term = u64; +const INVALID_TERM: Term = 0; #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct TermSwitchEntry { @@ -128,18 +131,47 @@ pub struct ServerInfo { /// Postgres server version pub pg_version: u32, pub system_id: SystemId, - #[serde(with = "hex")] - pub tenant_id: ZTenantId, - /// Zenith timelineid - #[serde(with = "hex")] - pub timeline_id: ZTimelineId, pub wal_seg_size: u32, } +/// Data published by safekeeper to the peers +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PeerInfo { + /// LSN up to which safekeeper offloaded WAL to s3. + s3_wal_lsn: Lsn, + /// Term of the last entry. + term: Term, + /// LSN of the last record. + flush_lsn: Lsn, + /// Up to which LSN safekeeper regards its WAL as committed. + commit_lsn: Lsn, +} + +impl PeerInfo { + fn new() -> Self { + Self { + s3_wal_lsn: Lsn(0), + term: INVALID_TERM, + flush_lsn: Lsn(0), + commit_lsn: Lsn(0), + } + } +} + +// vector-based node id -> peer state map with very limited functionality we +// need/ +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Peers(pub Vec<(ZNodeId, PeerInfo)>); + /// Persistent information stored on safekeeper node /// On disk data is prefixed by magic and format version and followed by checksum. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SafeKeeperState { + #[serde(with = "hex")] + pub tenant_id: ZTenantId, + /// Zenith timelineid + #[serde(with = "hex")] + pub timeline_id: ZTimelineId, /// persistent acceptor state pub acceptor_state: AcceptorState, /// information about server @@ -148,19 +180,33 @@ pub struct SafeKeeperState { /// for correctness, exists for monitoring purposes. #[serde(with = "hex")] pub proposer_uuid: PgUuid, - /// part of WAL acknowledged by quorum and available locally + /// Part of WAL acknowledged by quorum and available locally. Always points + /// to record boundary. pub commit_lsn: Lsn, - /// minimal LSN which may be needed for recovery of some safekeeper (end_lsn - /// of last record streamed to everyone) - pub truncate_lsn: Lsn, - // Safekeeper starts receiving WAL from this LSN, zeros before it ought to - // be skipped during decoding. - pub wal_start_lsn: Lsn, + /// First LSN not yet offloaded to s3. 
Useful to persist to avoid finding + /// out offloading progress on boot. + pub s3_wal_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver. + pub remote_consistent_lsn: Lsn, + // Peers and their state as we remember it. Knowing peers themselves is + // fundamental; but state is saved here only for informational purposes and + // obviously can be stale. (Currently not saved at all, but let's provision + // place to have less file version upgrades). + pub peers: Peers, } impl SafeKeeperState { - pub fn new() -> SafeKeeperState { + pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { SafeKeeperState { + tenant_id: zttid.tenant_id, + timeline_id: zttid.timeline_id, acceptor_state: AcceptorState { term: 0, term_history: TermHistory::empty(), @@ -168,21 +214,20 @@ impl SafeKeeperState { server: ServerInfo { pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ system_id: 0, /* Postgres system identifier */ - tenant_id: ZTenantId::from([0u8; 16]), - timeline_id: ZTimelineId::from([0u8; 16]), wal_seg_size: 0, }, proposer_uuid: [0; 16], - commit_lsn: Lsn(0), /* part of WAL acknowledged by quorum */ - truncate_lsn: Lsn(0), /* minimal LSN which may be needed for recovery of some safekeeper */ - wal_start_lsn: Lsn(0), + commit_lsn: Lsn(0), + s3_wal_lsn: Lsn(0), + peer_horizon_lsn: Lsn(0), + remote_consistent_lsn: Lsn(0), + peers: Peers(peers.iter().map(|p| (*p, PeerInfo::new())).collect()), } } -} -impl Default for SafeKeeperState { - fn default() -> Self { - Self::new() + #[cfg(test)] + pub fn empty() -> Self { + SafeKeeperState::new(&ZTenantTimelineId::empty(), vec![]) } } @@ -421,6 +466,7 @@ lazy_static! 
{ struct SafeKeeperMetrics { commit_lsn: Gauge, + // WAL-related metrics are in WalStorageMetrics } impl SafeKeeperMetrics { @@ -443,7 +489,7 @@ pub struct SafeKeeper { /// not-yet-flushed pairs of same named fields in s.* pub commit_lsn: Lsn, - pub truncate_lsn: Lsn, + pub peer_horizon_lsn: Lsn, pub s: SafeKeeperState, // persistent part pub control_store: CTRL, @@ -462,16 +508,14 @@ where wal_store: WAL, state: SafeKeeperState, ) -> SafeKeeper { - if state.server.timeline_id != ZTimelineId::from([0u8; 16]) - && ztli != state.server.timeline_id - { - panic!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.server.timeline_id); + if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id { + panic!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id); } SafeKeeper { - metrics: SafeKeeperMetrics::new(state.server.tenant_id, ztli, state.commit_lsn), + metrics: SafeKeeperMetrics::new(state.tenant_id, ztli, state.commit_lsn), commit_lsn: state.commit_lsn, - truncate_lsn: state.truncate_lsn, + peer_horizon_lsn: state.peer_horizon_lsn, s: state, control_store, wal_store, @@ -532,12 +576,24 @@ where msg.pg_version, self.s.server.pg_version ); } + if msg.tenant_id != self.s.tenant_id { + bail!( + "invalid tenant ID, got {}, expected {}", + msg.tenant_id, + self.s.tenant_id + ); + } + if msg.ztli != self.s.timeline_id { + bail!( + "invalid timeline ID, got {}, expected {}", + msg.ztli, + self.s.timeline_id + ); + } // set basic info about server, if not yet // TODO: verify that is doesn't change after self.s.server.system_id = msg.system_id; - self.s.server.tenant_id = msg.tenant_id; - self.s.server.timeline_id = msg.ztli; self.s.server.wal_seg_size = msg.wal_seg_size; self.control_store .persist(&self.s) @@ -568,7 +624,7 @@ where term: self.s.acceptor_state.term, vote_given: false as u64, flush_lsn: self.wal_store.flush_lsn(), - truncate_lsn: self.s.truncate_lsn, + truncate_lsn: self.s.peer_horizon_lsn, term_history: self.get_term_history(), }; if self.s.acceptor_state.term < msg.term { @@ -655,10 +711,11 @@ where if !msg.wal_data.is_empty() { self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?; - // If this was the first record we ever receieved, remember LSN to help - // find_end_of_wal skip the hole in the beginning. - if self.s.wal_start_lsn == Lsn(0) { - self.s.wal_start_lsn = msg.h.begin_lsn; + // If this was the first record we ever receieved, initialize + // commit_lsn to help find_end_of_wal skip the hole in the + // beginning. + if self.s.commit_lsn == Lsn(0) { + self.s.commit_lsn = msg.h.begin_lsn; sync_control_file = true; require_flush = true; } @@ -685,15 +742,15 @@ where .set(u64::from(self.commit_lsn) as f64); } - self.truncate_lsn = msg.h.truncate_lsn; + self.peer_horizon_lsn = msg.h.truncate_lsn; // Update truncate and commit LSN in control file. // To avoid negative impact on performance of extra fsync, do it only // when truncate_lsn delta exceeds WAL segment size. 
sync_control_file |= - self.s.truncate_lsn + (self.s.server.wal_seg_size as u64) < self.truncate_lsn; + self.s.peer_horizon_lsn + (self.s.server.wal_seg_size as u64) < self.peer_horizon_lsn; if sync_control_file { self.s.commit_lsn = self.commit_lsn; - self.s.truncate_lsn = self.truncate_lsn; + self.s.peer_horizon_lsn = self.peer_horizon_lsn; } if sync_control_file { @@ -774,11 +831,11 @@ mod tests { #[test] fn test_voting() { let storage = InMemoryState { - persisted_state: SafeKeeperState::new(), + persisted_state: SafeKeeperState::empty(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::new()); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::empty()); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -806,11 +863,11 @@ mod tests { #[test] fn test_epoch_switch() { let storage = InMemoryState { - persisted_state: SafeKeeperState::new(), + persisted_state: SafeKeeperState::empty(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::new()); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::empty()); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/walkeeper/src/timeline.rs b/walkeeper/src/timeline.rs index c639e81b79..ea8308b95e 100644 --- a/walkeeper/src/timeline.rs +++ b/walkeeper/src/timeline.rs @@ -1,7 +1,7 @@ //! This module contains timeline id -> safekeeper state map with file-backed //! persistence and support for interaction between sending and receiving wal. -use anyhow::{Context, Result}; +use anyhow::{bail, Context, Result}; use lazy_static::lazy_static; @@ -9,22 +9,24 @@ use std::cmp::{max, min}; use std::collections::HashMap; use std::fs::{self}; -use std::sync::{Arc, Condvar, Mutex}; +use std::sync::{Arc, Condvar, Mutex, MutexGuard}; use std::time::Duration; use tokio::sync::mpsc::UnboundedSender; use tracing::*; use zenith_utils::lsn::Lsn; -use zenith_utils::zid::ZTenantTimelineId; +use zenith_utils::zid::{ZNodeId, ZTenantTimelineId}; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; -use crate::control_file::{self, CreateControlFile}; +use crate::control_file; +use crate::control_file::Storage as cf_storage; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, }; use crate::send_wal::HotStandbyFeedback; -use crate::wal_storage::{self, Storage}; +use crate::wal_storage; +use crate::wal_storage::Storage as wal_storage_iface; use crate::SafeKeeperConf; use zenith_utils::pq_proto::ZenithFeedback; @@ -87,21 +89,39 @@ struct SharedState { } impl SharedState { - /// Restore SharedState from control file. - /// If create=false and file doesn't exist, bails out. 
- fn create_restore( + /// Initialize timeline state, creating control file + fn create( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, - create: CreateControlFile, + peer_ids: Vec, ) -> Result { - let state = control_file::FileStorage::load_control_file_conf(conf, zttid, create) + let state = SafeKeeperState::new(zttid, peer_ids); + let control_store = control_file::FileStorage::new(zttid, conf); + let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); + let mut sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, state); + sk.control_store.persist(&sk.s)?; + + Ok(Self { + notified_commit_lsn: Lsn(0), + sk, + replicas: Vec::new(), + active: false, + num_computes: 0, + pageserver_connstr: None, + }) + } + + /// Restore SharedState from control file. + /// If file doesn't exist, bails out. + fn restore(conf: &SafeKeeperConf, zttid: &ZTenantTimelineId) -> Result { + let state = control_file::FileStorage::load_control_file_conf(conf, zttid) .context("failed to load from control file")?; let control_store = control_file::FileStorage::new(zttid, conf); let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); - info!("timeline {} created or restored", zttid.timeline_id); + info!("timeline {} restored", zttid.timeline_id); Ok(Self { notified_commit_lsn: Lsn(0), @@ -418,26 +438,13 @@ impl Timeline { // Utilities needed by various Connection-like objects pub trait TimelineTools { - fn set( - &mut self, - conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, - create: CreateControlFile, - ) -> Result<()>; + fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()>; fn get(&self) -> &Arc; } impl TimelineTools for Option> { - fn set( - &mut self, - conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, - create: CreateControlFile, - ) -> Result<()> { - // We will only set the timeline once. If it were to ever change, - // anyone who cloned the Arc would be out of date. - assert!(self.is_none()); + fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()> { *self = Some(GlobalTimelines::get(conf, zttid, create)?); Ok(()) } @@ -456,30 +463,73 @@ lazy_static! { pub struct GlobalTimelines; impl GlobalTimelines { + fn create_internal( + mut timelines: MutexGuard>>, + conf: &SafeKeeperConf, + zttid: ZTenantTimelineId, + peer_ids: Vec, + ) -> Result> { + match timelines.get(&zttid) { + Some(_) => bail!("timeline {} already exists", zttid), + None => { + // TODO: check directory existence + let dir = conf.timeline_dir(&zttid); + fs::create_dir_all(dir)?; + let shared_state = SharedState::create(conf, &zttid, peer_ids) + .context("failed to create shared state")?; + + let new_tli = Arc::new(Timeline::new(zttid, shared_state)); + timelines.insert(zttid, Arc::clone(&new_tli)); + Ok(new_tli) + } + } + } + + pub fn create( + conf: &SafeKeeperConf, + zttid: ZTenantTimelineId, + peer_ids: Vec, + ) -> Result> { + let timelines = TIMELINES.lock().unwrap(); + GlobalTimelines::create_internal(timelines, conf, zttid, peer_ids) + } + /// Get a timeline with control file loaded from the global TIMELINES map. - /// If control file doesn't exist and create=false, bails out. + /// If control file doesn't exist, bails out. 
pub fn get( conf: &SafeKeeperConf, zttid: ZTenantTimelineId, - create: CreateControlFile, + create: bool, ) -> Result> { let mut timelines = TIMELINES.lock().unwrap(); match timelines.get(&zttid) { Some(result) => Ok(Arc::clone(result)), None => { - if let CreateControlFile::True = create { - let dir = conf.timeline_dir(&zttid); - info!( - "creating timeline dir {}, create is {:?}", - dir.display(), - create - ); - fs::create_dir_all(dir)?; - } + let shared_state = + SharedState::restore(conf, &zttid).context("failed to restore shared state"); - let shared_state = SharedState::create_restore(conf, &zttid, create) - .context("failed to restore shared state")?; + let shared_state = match shared_state { + Ok(shared_state) => shared_state, + Err(error) => { + // TODO: always create timeline explicitly + if error + .root_cause() + .to_string() + .contains("No such file or directory") + && create + { + return GlobalTimelines::create_internal( + timelines, + conf, + zttid, + vec![], + ); + } else { + return Err(error); + } + } + }; let new_tli = Arc::new(Timeline::new(zttid, shared_state)); timelines.insert(zttid, Arc::clone(&new_tli)); diff --git a/walkeeper/src/wal_storage.rs b/walkeeper/src/wal_storage.rs index 73eccd0ae8..7cef525bee 100644 --- a/walkeeper/src/wal_storage.rs +++ b/walkeeper/src/wal_storage.rs @@ -301,7 +301,8 @@ impl Storage for PhysicalStorage { /// allows to postpone its initialization. fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()> { if state.server.wal_seg_size == 0 { - // wal_seg_size is still unknown + // wal_seg_size is still unknown. This is dead path normally, should + // be used only in tests. return Ok(()); } @@ -315,9 +316,13 @@ impl Storage for PhysicalStorage { let wal_seg_size = state.server.wal_seg_size as usize; self.wal_seg_size = Some(wal_seg_size); - // we need to read WAL from disk to know which LSNs are stored on disk - self.write_lsn = - Lsn(find_end_of_wal(&self.timeline_dir, wal_seg_size, true, state.wal_start_lsn)?.0); + // Find out where stored WAL ends, starting at commit_lsn which is a + // known recent record boundary (unless we don't have WAL at all). 
+ self.write_lsn = if state.commit_lsn == Lsn(0) { + Lsn(0) + } else { + Lsn(find_end_of_wal(&self.timeline_dir, wal_seg_size, true, state.commit_lsn)?.0) + }; self.write_record_lsn = self.write_lsn; @@ -326,11 +331,13 @@ impl Storage for PhysicalStorage { self.update_flush_lsn(); info!( - "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, truncate_lsn={}", - self.zttid.timeline_id, self.flush_record_lsn, state.commit_lsn, state.truncate_lsn, + "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", + self.zttid.timeline_id, self.flush_record_lsn, state.commit_lsn, state.peer_horizon_lsn, ); - if self.flush_record_lsn < state.commit_lsn || self.flush_record_lsn < state.truncate_lsn { - warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or truncate_lsn from control file", self.zttid.timeline_id); + if self.flush_record_lsn < state.commit_lsn + || self.flush_record_lsn < state.peer_horizon_lsn + { + warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", self.zttid.timeline_id); } Ok(()) diff --git a/zenith_utils/src/zid.rs b/zenith_utils/src/zid.rs index 813eb3f8f4..a740d4fb48 100644 --- a/zenith_utils/src/zid.rs +++ b/zenith_utils/src/zid.rs @@ -334,6 +334,10 @@ impl ZTenantTimelineId { pub fn generate() -> Self { Self::new(ZTenantId::generate(), ZTimelineId::generate()) } + + pub fn empty() -> Self { + Self::new(ZTenantId::from([0u8; 16]), ZTimelineId::from([0u8; 16])) + } } impl fmt::Display for ZTenantTimelineId { From 8e37d345a8fd4e7e1b25fe1e88af95de5e163ee3 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 8 Mar 2022 08:07:00 +0300 Subject: [PATCH 0010/1022] Adjust safekeeper detailed logging to batch fsyncing. --- walkeeper/src/safekeeper.rs | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/walkeeper/src/safekeeper.rs b/walkeeper/src/safekeeper.rs index f8b12530d8..53fd6f5588 100644 --- a/walkeeper/src/safekeeper.rs +++ b/walkeeper/src/safekeeper.rs @@ -649,14 +649,16 @@ where /// Form AppendResponse from current state. fn append_response(&self) -> AppendResponse { - AppendResponse { + let ar = AppendResponse { term: self.s.acceptor_state.term, flush_lsn: self.wal_store.flush_lsn(), commit_lsn: self.s.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), zenith_feedback: ZenithFeedback::empty(), - } + }; + trace!("formed AppendResponse {:?}", ar); + ar } fn handle_elected(&mut self, msg: &ProposerElected) -> Result> { @@ -757,20 +759,21 @@ where self.control_store.persist(&self.s)?; } + trace!( + "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}", + msg.wal_data.len(), + msg.h.end_lsn, + msg.h.commit_lsn, + msg.h.truncate_lsn, + require_flush, + ); + // If flush_lsn hasn't updated, AppendResponse is not very useful. 
if !require_flush { return Ok(None); } let resp = self.append_response(); - trace!( - "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, resp {:?}", - msg.wal_data.len(), - msg.h.end_lsn, - msg.h.commit_lsn, - msg.h.truncate_lsn, - &resp, - ); Ok(Some(AcceptorProposerMessage::AppendResponse(resp))) } From cffac59a41e59f2f92e68b183bd64bdaf7f17fa7 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Tue, 8 Mar 2022 23:19:49 +0300 Subject: [PATCH 0011/1022] Docker improvement (#1345) * dockerfile fix, rust cache in docker build flow * check rust cachepot * another check rust cachepot * cleanup --- .circleci/config.yml | 30 +++++++++++----- Dockerfile | 81 +++++++++++++++++++++----------------------- 2 files changed, 61 insertions(+), 50 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index db9fc31334..d342e7c9f4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -440,8 +440,14 @@ jobs: command: | echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin DOCKER_TAG=$(git log --oneline|wc -l) - docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest - docker tag zenithdb/zenith:latest zenithdb/zenith:${DOCKER_TAG} && docker push zenithdb/zenith:${DOCKER_TAG} + docker build \ + --pull \ + --build-arg GIT_VERSION=${CIRCLE_SHA1} \ + --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ + --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ + --tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:latest . + docker push zenithdb/zenith:${DOCKER_TAG} + docker push zenithdb/zenith:latest # Build zenithdb/compute-node:latest image and push it to Docker hub docker-image-compute: @@ -468,8 +474,9 @@ jobs: command: | echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin DOCKER_TAG=$(git log --oneline|wc -l) - docker build -t zenithdb/compute-node:latest vendor/postgres && docker push zenithdb/compute-node:latest - docker tag zenithdb/compute-node:latest zenithdb/compute-node:${DOCKER_TAG} && docker push zenithdb/compute-node:${DOCKER_TAG} + docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:latest vendor/postgres + docker push zenithdb/compute-node:${DOCKER_TAG} + docker push zenithdb/compute-node:latest # Build production zenithdb/zenith:release image and push it to Docker hub docker-image-release: @@ -487,8 +494,14 @@ jobs: command: | echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:release . && docker push zenithdb/zenith:release - docker tag zenithdb/zenith:release zenithdb/zenith:${DOCKER_TAG} && docker push zenithdb/zenith:${DOCKER_TAG} + docker build \ + --pull \ + --build-arg GIT_VERSION=${CIRCLE_SHA1} \ + --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ + --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ + --tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:release . 
+ docker push zenithdb/zenith:${DOCKER_TAG} + docker push zenithdb/zenith:release # Build production zenithdb/compute-node:release image and push it to Docker hub docker-image-compute-release: @@ -515,8 +528,9 @@ jobs: command: | echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build -t zenithdb/compute-node:release vendor/postgres && docker push zenithdb/compute-node:release - docker tag zenithdb/compute-node:release zenithdb/compute-node:${DOCKER_TAG} && docker push zenithdb/compute-node:${DOCKER_TAG} + docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:release vendor/postgres + docker push zenithdb/compute-node:${DOCKER_TAG} + docker push zenithdb/compute-node:release deploy-staging: docker: diff --git a/Dockerfile b/Dockerfile index 18abae5327..c568cb27b0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,62 +1,59 @@ -# -# Docker image for console integration testing. -# - -# -# Build Postgres separately --- this layer will be rebuilt only if one of -# mentioned paths will get any changes. +# Build Postgres # FROM zimg/rust:1.56 AS pg-build -WORKDIR /zenith -COPY ./vendor/postgres vendor/postgres -COPY ./Makefile Makefile -ENV BUILD_TYPE release -RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres -RUN rm -rf postgres_install/build +WORKDIR /pg + +USER root + +COPY vendor/postgres vendor/postgres +COPY Makefile Makefile + +ENV BUILD_TYPE release +RUN set -e \ + && make -j $(nproc) -s postgres \ + && rm -rf tmp_install/build \ + && tar -C tmp_install -czf /postgres_install.tar.gz . -# # Build zenith binaries # -# TODO: build cargo deps as separate layer. We used cargo-chef before but that was -# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work. -# FROM zimg/rust:1.56 AS build +ARG GIT_VERSION=local -ARG GIT_VERSION -RUN if [ -z "$GIT_VERSION" ]; then echo "GIT_VERSION is reqired, use build_arg to pass it"; exit 1; fi - -WORKDIR /zenith -COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server +ARG CACHEPOT_BUCKET=zenith-rust-cachepot +ARG AWS_ACCESS_KEY_ID +ARG AWS_SECRET_ACCESS_KEY +ENV RUSTC_WRAPPER cachepot +COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server COPY . . -RUN GIT_VERSION=$GIT_VERSION cargo build --release -# -# Copy binaries to resulting image. 
+RUN cargo build --release + +# Build final image # FROM debian:bullseye-slim WORKDIR /data -RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl ca-certificates && \ - mkdir zenith_install +RUN set -e \ + && apt-get update \ + && apt-get install -y \ + libreadline-dev \ + libseccomp-dev \ + openssl \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ + && useradd -d /data zenith \ + && chown -R zenith:zenith /data + +COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy /usr/local/bin + +COPY --from=pg-build /pg/tmp_install/ /usr/local/ +COPY --from=pg-build /postgres_install.tar.gz /data/ -COPY --from=build /zenith/target/release/pageserver /usr/local/bin -COPY --from=build /zenith/target/release/safekeeper /usr/local/bin -COPY --from=build /zenith/target/release/proxy /usr/local/bin -COPY --from=pg-build /zenith/tmp_install postgres_install COPY docker-entrypoint.sh /docker-entrypoint.sh -# Remove build artifacts (~ 500 MB) -RUN rm -rf postgres_install/build && \ - # 'Install' Postgres binaries locally - cp -r postgres_install/* /usr/local/ && \ - # Prepare an archive of Postgres binaries (should be around 11 MB) - # and keep it inside container for an ease of deploy pipeline. - cd postgres_install && tar -czf /data/postgres_install.tar.gz . && cd .. && \ - rm -rf postgres_install - -RUN useradd -d /data zenith && chown -R zenith:zenith /data - VOLUME ["/data"] USER zenith EXPOSE 6400 From 934bbcba0fd8eb5c654109acf8934c2b62ee12e1 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Wed, 9 Mar 2022 10:13:46 +0300 Subject: [PATCH 0012/1022] revert docker build to debian:buster based rust (#1347) * dockerfile fix, rust cache in docker build flow * check rust cachepot * another check rust cachepot * cleanup * revert docker build to debian:buster based rust to avoid libc6 version mismatch --- Dockerfile | 9 ++++++--- Dockerfile.build | 23 +++++++++++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) create mode 100644 Dockerfile.build diff --git a/Dockerfile b/Dockerfile index c568cb27b0..9ee6abaa8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ # Build Postgres # -FROM zimg/rust:1.56 AS pg-build +#FROM zimg/rust:1.56 AS pg-build +FROM zenithdb/build:buster-20220309 AS pg-build WORKDIR /pg USER root @@ -16,13 +17,15 @@ RUN set -e \ # Build zenith binaries # -FROM zimg/rust:1.56 AS build +#FROM zimg/rust:1.56 AS build +FROM zenithdb/build:buster-20220309 AS build ARG GIT_VERSION=local ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY -ENV RUSTC_WRAPPER cachepot +#ENV RUSTC_WRAPPER cachepot +ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server COPY . . 
diff --git a/Dockerfile.build b/Dockerfile.build new file mode 100644 index 0000000000..44a2aaafb9 --- /dev/null +++ b/Dockerfile.build @@ -0,0 +1,23 @@ +FROM rust:1.56.1-slim-buster +WORKDIR /home/circleci/project + +RUN set -e \ + && apt-get update \ + && apt-get -yq install \ + automake \ + libtool \ + build-essential \ + bison \ + flex \ + libreadline-dev \ + zlib1g-dev \ + libxml2-dev \ + libseccomp-dev \ + pkg-config \ + libssl-dev \ + clang + +RUN set -e \ + && rustup component add clippy \ + && cargo install cargo-audit \ + && cargo install --git https://github.com/paritytech/cachepot From 15b19a0a5713eec7cc740f5725df971f407d73e4 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 9 Mar 2022 14:47:06 -0500 Subject: [PATCH 0013/1022] [proxy] Test connstr options (#1344) * Add proxy test * Fix typo --- test_runner/batch_others/test_proxy.py | 13 +++++++++++ test_runner/fixtures/zenith_fixtures.py | 29 ++++++++++++++++++------- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py index 9510e880b2..d2039f9758 100644 --- a/test_runner/batch_others/test_proxy.py +++ b/test_runner/batch_others/test_proxy.py @@ -1,2 +1,15 @@ +import pytest + + def test_proxy_select_1(static_proxy): static_proxy.safe_psql("select 1;") + + +@pytest.mark.xfail # Proxy eats the extra connection options +def test_proxy_options(static_proxy): + schema_name = "tmp_schema_1" + with static_proxy.connect(schema=schema_name) as conn: + with conn.cursor() as cur: + cur.execute("SHOW search_path;") + search_path = cur.fetchall()[0][0] + assert schema_name == search_path diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 252ca9b3c1..4d6e84048c 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -242,15 +242,20 @@ class PgProtocol: host: str, port: int, username: Optional[str] = None, - password: Optional[str] = None): + password: Optional[str] = None, + dbname: Optional[str] = None, + schema: Optional[str] = None): self.host = host self.port = port self.username = username self.password = password + self.dbname = dbname + self.schema = schema def connstr(self, *, - dbname: str = 'postgres', + dbname: Optional[str] = None, + schema: Optional[str] = None, username: Optional[str] = None, password: Optional[str] = None) -> str: """ @@ -259,6 +264,8 @@ class PgProtocol: username = username or self.username password = password or self.password + dbname = dbname or self.dbname or "postgres" + schema = schema or self.schema res = f'host={self.host} port={self.port} dbname={dbname}' if username: @@ -267,13 +274,17 @@ class PgProtocol: if password: res = f'{res} password={password}' + if schema: + res = f"{res} options='-c search_path={schema}'" + return res # autocommit=True here by default because that's what we need most of the time def connect(self, *, autocommit=True, - dbname: str = 'postgres', + dbname: Optional[str] = None, + schema: Optional[str] = None, username: Optional[str] = None, password: Optional[str] = None) -> PgConnection: """ @@ -282,11 +293,13 @@ class PgProtocol: This method passes all extra params to connstr. """ - conn = psycopg2.connect(self.connstr( - dbname=dbname, - username=username, - password=password, - )) + conn = psycopg2.connect( + self.connstr( + dbname=dbname, + schema=schema, + username=username, + password=password, + )) # WARNING: this setting affects *all* tests! 
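        # (Editorial note, not part of the patch: autocommit defaults to True in this
        # helper, so every connection opened through it commits each statement
        # immediately unless a test explicitly passes autocommit=False.)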
conn.autocommit = autocommit return conn From 5b34afe89326e337a256eef109990621e73dfd83 Mon Sep 17 00:00:00 2001 From: anastasia Date: Wed, 23 Feb 2022 21:37:10 +0300 Subject: [PATCH 0014/1022] Bump vendor/postgres to use local relation cache for smgr_exists --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 31dc24ab29..bf6797aab5 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 31dc24ab29e6bdd5cfb85920a9c728f759c01b29 +Subproject commit bf6797aab54f1a7b865491262328598ae1869c1f From 87f306c516e8fa1a8b43778971d570ac3201ad19 Mon Sep 17 00:00:00 2001 From: anastasia Date: Wed, 23 Feb 2022 21:58:32 +0300 Subject: [PATCH 0015/1022] Tune backpressure in python tests to make them more stable --- test_runner/fixtures/zenith_fixtures.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 4d6e84048c..06f75aa604 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1285,6 +1285,10 @@ class Postgres(PgProtocol): if config_lines is None: config_lines = [] + + # set small 'max_replication_write_lag' to enable backpressure + # and make tests more stable. + config_lines = ['max_replication_write_lag=15MB'] + config_lines self.config(config_lines) return self From 2883a25761431eeb4ec53945c3ea4f6fbf2aa6b2 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 10 Mar 2022 17:31:43 +0400 Subject: [PATCH 0016/1022] Bump vendor/postgres to use local relation cache for smgr_exists --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index bf6797aab5..093aa160e5 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit bf6797aab54f1a7b865491262328598ae1869c1f +Subproject commit 093aa160e5df19814ff19b995d36dd5ee03c7f8b From 10f811e886292e258adec931945f7f6bdce4b412 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 4 Feb 2022 10:37:39 -0500 Subject: [PATCH 0017/1022] Use `timeline` instead of `branch` in pageserver's API --- control_plane/src/compute.rs | 38 ++-- control_plane/src/storage.rs | 45 ++-- pageserver/src/bin/pageserver.rs | 5 +- pageserver/src/config.rs | 8 - pageserver/src/http/models.rs | 6 +- pageserver/src/http/openapi_spec.yml | 141 +++---------- pageserver/src/http/routes.rs | 123 +++-------- pageserver/src/layered_repository.rs | 45 ++-- pageserver/src/lib.rs | 2 +- pageserver/src/remote_storage/README.md | 8 - pageserver/src/remote_storage/storage_sync.rs | 62 +----- .../remote_storage/storage_sync/download.rs | 100 +-------- .../src/remote_storage/storage_sync/index.rs | 37 +--- .../src/remote_storage/storage_sync/upload.rs | 94 +-------- pageserver/src/repository.rs | 21 +- pageserver/src/tenant_mgr.rs | 4 +- pageserver/src/{branches.rs => timelines.rs} | 194 ++++++------------ zenith/src/main.rs | 178 ++++++++-------- 18 files changed, 311 insertions(+), 800 deletions(-) rename pageserver/src/{branches.rs => timelines.rs} (70%) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index a61191e7a4..3569cc1dbb 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -73,39 +73,43 @@ impl ComputeControlPlane { .unwrap_or(self.base_port) } - // FIXME: see also parse_point_in_time in branches.rs. + // FIXME: see also parse_point_in_time in timelines.rs. 
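(Aside, not part of the patch: the spec string this function is meant to handle has the
shape "<timeline_id>[@<lsn>]", e.g. "bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8", as
documented for parse_point_in_time in timelines.rs further down. A minimal stand-alone
sketch of that split, using plain strings instead of the crate's ZTimelineId and Lsn
types; the function name and error handling here are illustrative only.)

    fn split_point_in_time(spec: &str) -> Result<(String, Option<String>), String> {
        // "bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8" -> (timeline id, Some(lsn))
        let mut parts = spec.splitn(2, '@');
        let id = parts.next().unwrap_or_default().to_string();
        let lsn = parts.next().map(str::to_string);
        if id.is_empty() {
            return Err(format!("invalid point-in-time spec '{}'", spec));
        }
        Ok((id, lsn))
    }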
fn parse_point_in_time( &self, - tenantid: ZTenantId, + tenant_id: ZTenantId, s: &str, ) -> Result<(ZTimelineId, Option)> { - let mut strings = s.split('@'); - let name = strings.next().unwrap(); + let _strings = s.split('@'); + // let name = strings.next().unwrap(); - let lsn = strings - .next() - .map(Lsn::from_str) - .transpose() - .context("invalid LSN in point-in-time specification")?; + // let lsn = strings + // .next() + // .map(Lsn::from_str) + // .transpose() + // .context("invalid LSN in point-in-time specification")?; - // Resolve the timeline ID, given the human-readable branch name - let timeline_id = self - .pageserver - .branch_get_by_name(&tenantid, name)? - .timeline_id; + // // Resolve the timeline ID, given the human-readable branch name + // let timeline_id = self + // .pageserver + // .branch_get_by_name(&tenant_id, name)? + // .timeline_id; - Ok((timeline_id, lsn)) + // Ok((timeline_id, lsn)) + todo!("TODO kb check more about the '@name' format") } pub fn new_node( &mut self, tenantid: ZTenantId, name: &str, - timeline_spec: &str, + timeline_spec: Option<&str>, port: Option, ) -> Result> { // Resolve the human-readable timeline spec into timeline ID and LSN - let (timelineid, lsn) = self.parse_point_in_time(tenantid, timeline_spec)?; + let (timelineid, lsn) = match timeline_spec { + Some(timeline_spec) => self.parse_point_in_time(tenantid, timeline_spec)?, + None => (ZTimelineId::generate(), None), + }; let port = port.unwrap_or_else(|| self.get_port()); let node = Arc::new(PostgresNode { diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index cd429e3f7a..aed9a757d4 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -9,18 +9,18 @@ use anyhow::bail; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; -use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest}; +use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest}; +use pageserver::timelines::TimelineInfo; use postgres::{Config, NoTls}; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; use zenith_utils::http::error::HttpErrorBody; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::ZTenantId; +use zenith_utils::zid::{ZTenantId, ZTimelineId}; use crate::local_env::LocalEnv; use crate::{fill_rust_env_vars, read_pidfile}; -use pageserver::branches::BranchInfo; use pageserver::tenant_mgr::TenantInfo; use zenith_utils::connstring::connection_address; @@ -335,47 +335,32 @@ impl PageServerNode { .json()?) } - pub fn branch_list(&self, tenantid: &ZTenantId) -> Result> { + pub fn timeline_list(&self, tenantid: &ZTenantId) -> Result> { Ok(self .http_request( Method::GET, - format!("{}/branch/{}", self.http_base_url, tenantid), + format!("{}/timeline/{}", self.http_base_url, tenantid), ) .send()? .error_from_body()? .json()?) } - pub fn branch_create( + pub fn timeline_create( &self, - branch_name: &str, - startpoint: &str, - tenantid: &ZTenantId, - ) -> Result { + timeline_id: ZTimelineId, + start_point: String, + tenant_id: ZTenantId, + ) -> Result { Ok(self - .http_request(Method::POST, format!("{}/branch", self.http_base_url)) - .json(&BranchCreateRequest { - tenant_id: tenantid.to_owned(), - name: branch_name.to_owned(), - start_point: startpoint.to_owned(), + .http_request(Method::POST, format!("{}/timeline", self.http_base_url)) + .json(&TimelineCreateRequest { + tenant_id, + timeline_id, + start_point, }) .send()? 
.error_from_body()? .json()?) } - - pub fn branch_get_by_name( - &self, - tenantid: &ZTenantId, - branch_name: &str, - ) -> Result { - Ok(self - .http_request( - Method::GET, - format!("{}/branch/{}/{}", self.http_base_url, tenantid, branch_name), - ) - .send()? - .error_for_status()? - .json()?) - } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d8d4033340..2fa772af58 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -10,11 +10,10 @@ use clap::{App, Arg}; use daemonize::Daemonize; use pageserver::{ - branches, config::{defaults::*, PageServerConf}, http, page_cache, page_service, remote_storage, tenant_mgr, thread_mgr, thread_mgr::ThreadKind, - virtual_file, LOG_FILE_NAME, + timelines, virtual_file, LOG_FILE_NAME, }; use zenith_utils::http::endpoint; use zenith_utils::postgres_backend; @@ -143,7 +142,7 @@ fn main() -> Result<()> { // Create repo and exit if init was requested if init { - branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?; + timelines::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?; // write the config file std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| { format!( diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 3deabb7521..5a9c7557cc 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -400,14 +400,6 @@ impl PageServerConf { self.tags_path(tenantid).join(tag_name) } - pub fn branches_path(&self, tenantid: &ZTenantId) -> PathBuf { - self.tenant_path(tenantid).join("refs").join("branches") - } - - pub fn branch_path(&self, branch_name: &str, tenantid: &ZTenantId) -> PathBuf { - self.branches_path(tenantid).join(branch_name) - } - pub fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf { self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME) } diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 5d7398ef03..a6dce33c03 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -1,13 +1,15 @@ use serde::{Deserialize, Serialize}; +use zenith_utils::zid::ZTimelineId; use crate::ZTenantId; use zenith_utils::zid::ZNodeId; #[derive(Serialize, Deserialize)] -pub struct BranchCreateRequest { +pub struct TimelineCreateRequest { #[serde(with = "hex")] pub tenant_id: ZTenantId, - pub name: String, + #[serde(with = "hex")] + pub timeline_id: ZTimelineId, pub start_point: String, } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index baf81fcf21..7f3bf97bfe 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -30,19 +30,22 @@ paths: schema: type: string format: hex + - name: include-non-incremental-logical-size + in: query + schema: + type: string + description: Controls calculation of current_logical_size_non_incremental get: - description: List tenant timelines + description: Get timelines for tenant responses: "200": - description: array of brief timeline descriptions + description: TimelineInfo content: application/json: schema: type: array items: - # currently, just a timeline id string, but when remote index gets to be accessed - # remote/local timeline field would be added at least - type: string + $ref: "#/components/schemas/TimelineInfo" "400": description: Error when no tenant id found in path content: @@ -81,8 +84,13 @@ paths: schema: type: string format: hex + - name: include-non-incremental-logical-size + in: query + schema: + type: string 
+ description: Controls calculation of current_logical_size_non_incremental get: - description: Get timeline info for tenant's remote timeline + description: Get timelines for tenant responses: "200": description: TimelineInfo @@ -91,7 +99,7 @@ paths: schema: $ref: "#/components/schemas/TimelineInfo" "400": - description: Error when no tenant id found in path or no branch name + description: Error when no tenant id found in path or no timeline id content: application/json: schema: @@ -114,108 +122,9 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - /v1/branch/{tenant_id}: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - format: hex - - name: include-non-incremental-logical-size - in: query - schema: - type: string - description: Controls calculation of current_logical_size_non_incremental - get: - description: Get branches for tenant - responses: - "200": - description: BranchInfo - content: - application/json: - schema: - type: array - items: - $ref: "#/components/schemas/BranchInfo" - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - /v1/branch/{tenant_id}/{branch_name}: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - format: hex - - name: branch_name - in: path - required: true - schema: - type: string - - name: include-non-incremental-logical-size - in: query - schema: - type: string - description: Controls calculation of current_logical_size_non_incremental - get: - description: Get branches for tenant - responses: - "200": - description: BranchInfo - content: - application/json: - schema: - $ref: "#/components/schemas/BranchInfo" - "400": - description: Error when no tenant id found in path or no branch name - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - /v1/branch/: + /v1/timeline/: post: - description: Create branch + description: Create timeline requestBody: content: application/json: @@ -223,25 +132,26 @@ paths: type: object required: - "tenant_id" - - "name" + - "timeline_id" - "start_point" properties: tenant_id: type: string format: hex - name: + timeline_id: type: string + format: hex start_point: type: string responses: "201": - description: BranchInfo + description: TimelineInfo content: application/json: schema: - $ref: "#/components/schemas/BranchInfo" + $ref: "#/components/schemas/TImelineInfo" "400": - description: Malformed branch create request + description: Malformed timeline create request content: application/json: schema: @@ -358,16 +268,13 @@ components: type: string state: type: string - BranchInfo: + TimelineInfo: type: object required: - - name - timeline_id - latest_valid_lsn - 
current_logical_size properties: - name: - type: string timeline_id: type: string format: hex diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 26d473efaf..5ab1576aa6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use anyhow::{Context, Result}; +use anyhow::Result; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use serde::Serialize; @@ -14,7 +14,6 @@ use zenith_utils::http::{ endpoint, error::HttpErrorBody, json::{json_request, json_response}, - request::get_request_param, request::parse_request_param, }; use zenith_utils::http::{RequestExt, RouterBuilder}; @@ -22,13 +21,12 @@ use zenith_utils::lsn::Lsn; use zenith_utils::zid::HexZTimelineId; use zenith_utils::zid::ZTimelineId; -use super::models::BranchCreateRequest; use super::models::StatusResponse; use super::models::TenantCreateRequest; -use crate::branches::BranchInfo; +use super::models::TimelineCreateRequest; use crate::repository::RepositoryTimeline; use crate::repository::TimelineSyncState; -use crate::{branches, config::PageServerConf, tenant_mgr, ZTenantId}; +use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId}; #[derive(Debug)] struct State { @@ -73,18 +71,18 @@ async fn status_handler(request: Request) -> Result, ApiErr )?) } -async fn branch_create_handler(mut request: Request) -> Result, ApiError> { - let request_data: BranchCreateRequest = json_request(&mut request).await?; +async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { + let request_data: TimelineCreateRequest = json_request(&mut request).await?; check_permission(&request, Some(request_data.tenant_id))?; let response_data = tokio::task::spawn_blocking(move || { - let _enter = info_span!("/branch_create", name = %request_data.name, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered(); - branches::create_branch( + let _enter = info_span!("/timeline_create", timeline = %request_data.timeline_id, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered(); + timelines::create_timeline( get_config(&request), - &request_data.name, &request_data.start_point, - &request_data.tenant_id, + request_data.tenant_id, + request_data.timeline_id, ) }) .await @@ -92,6 +90,19 @@ async fn branch_create_handler(mut request: Request) -> Result) -> Result, ApiError> { + let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); + let response_data = tokio::task::spawn_blocking(move || { + let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); + crate::timelines::get_timelines(tenant_id, include_non_incremental_logical_size) + }) + .await + .map_err(ApiError::from_err)??; + Ok(json_response(StatusCode::OK, response_data)?) +} + // Gate non incremental logical size calculation behind a flag // after pgbench -i -s100 calculation took 28ms so if multiplied by the number of timelines // and tenants it can take noticeable amount of time. 
Also the value currently used only in tests @@ -107,90 +118,6 @@ fn get_include_non_incremental_logical_size(request: &Request) -> bool { .unwrap_or(false) } -async fn branch_list_handler(request: Request) -> Result, ApiError> { - let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?; - - let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); - - check_permission(&request, Some(tenantid))?; - - let response_data = tokio::task::spawn_blocking(move || { - let _enter = info_span!("branch_list", tenant = %tenantid).entered(); - crate::branches::get_branches( - get_config(&request), - &tenantid, - include_non_incremental_logical_size, - ) - }) - .await - .map_err(ApiError::from_err)??; - Ok(json_response(StatusCode::OK, response_data)?) -} - -async fn branch_detail_handler(request: Request) -> Result, ApiError> { - let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?; - let branch_name: String = get_request_param(&request, "branch_name")?.to_string(); - let conf = get_state(&request).conf; - let path = conf.branch_path(&branch_name, &tenantid); - - let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); - - let response_data = tokio::task::spawn_blocking(move || { - let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered(); - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - BranchInfo::from_path(path, &repo, include_non_incremental_logical_size) - }) - .await - .map_err(ApiError::from_err)??; - - Ok(json_response(StatusCode::OK, response_data)?) -} - -async fn timeline_list_handler(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let conf = get_state(&request).conf; - let timelines_dir = conf.timelines_path(&tenant_id); - - let mut timelines_dir_contents = - tokio::fs::read_dir(&timelines_dir).await.with_context(|| { - format!( - "Failed to list timelines dir '{}' contents", - timelines_dir.display() - ) - })?; - - let mut local_timelines = Vec::new(); - while let Some(entry) = timelines_dir_contents.next_entry().await.with_context(|| { - format!( - "Failed to list timelines dir '{}' contents", - timelines_dir.display() - ) - })? { - let entry_path = entry.path(); - let entry_type = entry.file_type().await.with_context(|| { - format!( - "Failed to get file type of timeline dirs' entry '{}'", - entry_path.display() - ) - })?; - - if entry_type.is_dir() { - match entry.file_name().to_string_lossy().parse::() { - Ok(timeline_id) => local_timelines.push(timeline_id.to_string()), - Err(e) => error!( - "Failed to get parse timeline id from timeline dirs' entry '{}': {}", - entry_path.display(), - e - ), - } - } - } - - Ok(json_response(StatusCode::OK, local_timelines)?) -} - #[derive(Debug, Serialize)] #[serde(tag = "type")] enum TimelineInfo { @@ -260,7 +187,7 @@ async fn timeline_attach_handler(request: Request) -> Result { + RepositoryTimeline::Local { .. 
} => { anyhow::bail!("Timeline with id {} is already local", timeline_id) } RepositoryTimeline::Remote { @@ -369,9 +296,7 @@ pub fn make_router( "/v1/timeline/:tenant_id/:timeline_id/detach", timeline_detach_handler, ) - .get("/v1/branch/:tenant_id", branch_list_handler) - .get("/v1/branch/:tenant_id/:branch_name", branch_detail_handler) - .post("/v1/branch", branch_create_handler) + .post("/v1/timeline", timeline_create_handler) .get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) .any(handler_404) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 975b2f5d2b..c3d42d1829 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -137,19 +137,20 @@ pub struct LayeredRepository { /// Public interface impl Repository for LayeredRepository { fn get_timeline(&self, timelineid: ZTimelineId) -> Result { - let mut timelines = self.timelines.lock().unwrap(); - Ok( - match self.get_or_init_timeline(timelineid, &mut timelines)? { - LayeredTimelineEntry::Local(local) => RepositoryTimeline::Local(local), - LayeredTimelineEntry::Remote { - id, - disk_consistent_lsn, - } => RepositoryTimeline::Remote { - id, - disk_consistent_lsn, - }, - }, - ) + Ok(RepositoryTimeline::from(self.get_or_init_timeline( + timelineid, + &mut self.timelines.lock().unwrap(), + )?)) + } + + fn list_timelines(&self) -> Result> { + Ok(self + .timelines + .lock() + .unwrap() + .values() + .map(|timeline_entry| RepositoryTimeline::from(timeline_entry.clone())) + .collect()) } fn create_empty_timeline( @@ -428,6 +429,24 @@ impl LayeredTimelineEntry { } } +impl From for RepositoryTimeline { + fn from(layered_timeline: LayeredTimelineEntry) -> Self { + match layered_timeline { + LayeredTimelineEntry::Local(timeline) => RepositoryTimeline::Local { + id: timeline.timelineid, + timeline, + }, + LayeredTimelineEntry::Remote { + id, + disk_consistent_lsn, + } => RepositoryTimeline::Remote { + id, + disk_consistent_lsn, + }, + } + } +} + /// Private functions impl LayeredRepository { // Implementation of the public `get_timeline` function. This differs from the public diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 3a68f56187..3d66192c80 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,5 +1,4 @@ pub mod basebackup; -pub mod branches; pub mod config; pub mod http; pub mod import_datadir; @@ -12,6 +11,7 @@ pub mod repository; pub mod tenant_mgr; pub mod tenant_threads; pub mod thread_mgr; +pub mod timelines; pub mod virtual_file; pub mod walingest; pub mod walreceiver; diff --git a/pageserver/src/remote_storage/README.md b/pageserver/src/remote_storage/README.md index 1c718acf06..3c77275da8 100644 --- a/pageserver/src/remote_storage/README.md +++ b/pageserver/src/remote_storage/README.md @@ -62,11 +62,3 @@ Based on previous evaluation, even `rusoto-s3` could be a better choice over thi So far, we don't adjust the remote storage based on GC thread loop results, only checkpointer loop affects the remote storage. Index module could be used as a base to implement a deferred GC mechanism, a "defragmentation" that repacks archives into new ones after GC is done removing the files from the archives. - -* bracnhes implementaion could be improved - -Currently, there's a code to sync the branches along with the timeline files: on upload, every local branch files that are missing remotely are uploaded, -on the timeline download, missing remote branch files are downlaoded. 
- -A branch is a per-tenant entity, yet a current implementaion requires synchronizing a timeline first to get the branch files locally. -Currently, there's no other way to know about the remote branch files, neither the file contents is verified and updated. diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index 6b588c8e5f..d14f849e15 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -14,13 +14,6 @@ //! Only GC removes local timeline files, the GC support is not added to sync currently, //! yet downloading extra files is not critically bad at this stage, GC can remove those again. //! -//! Along the timeline files, branch files are uploaded and downloaded every time a corresponding sync task is processed. -//! For simplicity, branch files are also treated as immutable: only missing files are uploaded or downloaded, no removals, amendments or file contents checks are done. -//! Also, the branches are copied as separate files, with no extra compressions done. -//! Despite branches information currently belonging to tenants, a tenants' timeline sync is required to upload or download the branch files, also, there's no way to know -//! the branch sync state outside of the sync loop. -//! This implementation is currently considered as temporary and is a subjec to change later. -//! //! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via listing the remote storage contents. //! It's enough to poll the remote state once on startup only, due to agreement that the pageserver has //! an exclusive write access to the remote storage: new files appear in the storage only after the same @@ -66,7 +59,6 @@ //! NOTE: No real contents or checksum check happens right now and is a subject to improve later. //! //! After the whole timeline is downloaded, [`crate::tenant_mgr::set_timeline_states`] function is used to update pageserver memory stage for the timeline processed. -//! No extra branch registration is done. //! //! When pageserver signals shutdown, current sync task gets finished and the loop exists. @@ -77,7 +69,7 @@ pub mod index; mod upload; use std::{ - collections::{BTreeSet, HashMap, HashSet, VecDeque}, + collections::{BTreeSet, HashMap, VecDeque}, num::{NonZeroU32, NonZeroUsize}, path::{Path, PathBuf}, sync::Arc, @@ -87,7 +79,6 @@ use anyhow::{bail, Context}; use futures::stream::{FuturesUnordered, StreamExt}; use lazy_static::lazy_static; use tokio::{ - fs, runtime::Runtime, sync::{ mpsc::{self, UnboundedReceiver}, @@ -101,8 +92,7 @@ use self::{ compression::ArchiveHeader, download::{download_timeline, DownloadedTimeline}, index::{ - ArchiveDescription, ArchiveId, RelativePath, RemoteTimeline, RemoteTimelineIndex, - TimelineIndexEntry, + ArchiveDescription, ArchiveId, RemoteTimeline, RemoteTimelineIndex, TimelineIndexEntry, }, upload::upload_timeline_checkpoint, }; @@ -843,28 +833,6 @@ async fn download_archive_header< Ok(header) } -async fn tenant_branch_files( - conf: &'static PageServerConf, - tenant_id: ZTenantId, -) -> anyhow::Result> { - let branches_dir = conf.branches_path(&tenant_id); - if !branches_dir.exists() { - return Ok(HashSet::new()); - } - - let mut branch_entries = fs::read_dir(&branches_dir) - .await - .context("Failed to list tenant branches dir contents")?; - - let mut branch_files = HashSet::new(); - while let Some(branch_entry) = branch_entries.next_entry().await? 
{ - if branch_entry.file_type().await?.is_file() { - branch_files.insert(RelativePath::new(&branches_dir, branch_entry.path())?); - } - } - Ok(branch_files) -} - #[cfg(test)] mod test_utils { use std::{ @@ -971,30 +939,9 @@ mod test_utils { "Index contains unexpected sync ids" ); - let mut actual_branches = BTreeMap::new(); - let mut expected_branches = BTreeMap::new(); let mut actual_timeline_entries = BTreeMap::new(); let mut expected_timeline_entries = BTreeMap::new(); for sync_id in actual_sync_ids { - actual_branches.insert( - sync_id.tenant_id, - index_read - .branch_files(sync_id.tenant_id) - .into_iter() - .flat_map(|branch_paths| branch_paths.iter()) - .cloned() - .collect::>(), - ); - expected_branches.insert( - sync_id.tenant_id, - expected_index_with_descriptions - .branch_files(sync_id.tenant_id) - .into_iter() - .flat_map(|branch_paths| branch_paths.iter()) - .cloned() - .collect::>(), - ); - actual_timeline_entries.insert( sync_id, index_read.timeline_entry(&sync_id).unwrap().clone(), @@ -1009,11 +956,6 @@ mod test_utils { } drop(index_read); - assert_eq!( - actual_branches, expected_branches, - "Index contains unexpected branches" - ); - for (sync_id, actual_timeline_entry) in actual_timeline_entries { let expected_timeline_description = expected_timeline_entries .remove(&sync_id) diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/remote_storage/storage_sync/download.rs index f268fc442a..00115ba8d5 100644 --- a/pageserver/src/remote_storage/storage_sync/download.rs +++ b/pageserver/src/remote_storage/storage_sync/download.rs @@ -1,10 +1,8 @@ //! Timeline synchrnonization logic to put files from archives on remote storage into pageserver's local directory. -//! Currently, tenant branch files are also downloaded, but this does not appear final. use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc}; use anyhow::{ensure, Context}; -use futures::{stream::FuturesUnordered, StreamExt}; use tokio::{fs, sync::RwLock}; use tracing::{debug, error, trace, warn}; use zenith_utils::{lsn::Lsn, zid::ZTenantId}; @@ -14,8 +12,8 @@ use crate::{ layered_repository::metadata::{metadata_path, TimelineMetadata}, remote_storage::{ storage_sync::{ - compression, index::TimelineIndexEntry, sync_queue, tenant_branch_files, - update_index_description, SyncKind, SyncTask, + compression, index::TimelineIndexEntry, sync_queue, update_index_description, SyncKind, + SyncTask, }, RemoteStorage, ZTenantTimelineId, }, @@ -42,8 +40,6 @@ pub(super) enum DownloadedTimeline { /// Timeline files that already exist locally are skipped during the download, but the local metadata file is /// updated in the end of every checkpoint archive extraction. /// -/// Before any archives are considered, the branch files are checked locally and remotely, all remote-only files are downloaded. -/// /// On an error, bumps the retries count and reschedules the download, with updated archive skip list /// (for any new successful archive downloads and extractions). 
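(Aside, not part of the patch: the "bump the retry counter and put the task back on the
queue" behaviour described above is a generic pattern; a minimal self-contained sketch
is shown below. The task and queue types are simplified stand-ins, not the actual
SyncTask / sync_queue API.)

    use std::collections::VecDeque;

    struct RetryTask {
        retries: u32,
        // in the real task this would also carry the updated archive skip list
        skipped_archives: Vec<String>,
    }

    fn reschedule(queue: &mut VecDeque<RetryTask>, mut task: RetryTask, max_retries: u32) -> bool {
        task.retries += 1;
        if task.retries > max_retries {
            return false; // permanent failure; the caller decides how to report it
        }
        queue.push_back(task);
        true
    }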
pub(super) async fn download_timeline< @@ -113,22 +109,6 @@ pub(super) async fn download_timeline< } }; - if let Err(e) = download_missing_branches(conf, remote_assets.as_ref(), sync_id.tenant_id).await - { - error!( - "Failed to download missing branches for sync id {}: {:?}", - sync_id, e - ); - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Download(download), - )); - return DownloadedTimeline::FailedAndRescheduled { - disk_consistent_lsn, - }; - } - debug!("Downloading timeline archives"); let archives_to_download = remote_timeline .checkpoints() @@ -250,82 +230,6 @@ async fn read_local_metadata( .context("Failed to read local metadata files bytes")?) } -async fn download_missing_branches< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - conf: &'static PageServerConf, - (storage, index): &(S, RwLock), - tenant_id: ZTenantId, -) -> anyhow::Result<()> { - let local_branches = tenant_branch_files(conf, tenant_id) - .await - .context("Failed to list local branch files for the tenant")?; - let local_branches_dir = conf.branches_path(&tenant_id); - if !local_branches_dir.exists() { - fs::create_dir_all(&local_branches_dir) - .await - .with_context(|| { - format!( - "Failed to create local branches directory at path '{}'", - local_branches_dir.display() - ) - })?; - } - - if let Some(remote_branches) = index.read().await.branch_files(tenant_id) { - let mut remote_only_branches_downloads = remote_branches - .difference(&local_branches) - .map(|remote_only_branch| async move { - let branches_dir = conf.branches_path(&tenant_id); - let remote_branch_path = remote_only_branch.as_path(&branches_dir); - let storage_path = - storage.storage_path(&remote_branch_path).with_context(|| { - format!( - "Failed to derive a storage path for branch with local path '{}'", - remote_branch_path.display() - ) - })?; - let mut target_file = fs::OpenOptions::new() - .write(true) - .create_new(true) - .open(&remote_branch_path) - .await - .with_context(|| { - format!( - "Failed to create local branch file at '{}'", - remote_branch_path.display() - ) - })?; - storage - .download(&storage_path, &mut target_file) - .await - .with_context(|| { - format!( - "Failed to download branch file from the remote path {:?}", - storage_path - ) - })?; - Ok::<_, anyhow::Error>(()) - }) - .collect::>(); - - let mut branch_downloads_failed = false; - while let Some(download_result) = remote_only_branches_downloads.next().await { - if let Err(e) = download_result { - branch_downloads_failed = true; - error!("Failed to download a branch file: {:?}", e); - } - } - ensure!( - !branch_downloads_failed, - "Failed to download all branch files" - ); - } - - Ok(()) -} - #[cfg(test)] mod tests { use std::collections::BTreeSet; diff --git a/pageserver/src/remote_storage/storage_sync/index.rs b/pageserver/src/remote_storage/storage_sync/index.rs index 3d2680948d..8ff92ed55e 100644 --- a/pageserver/src/remote_storage/storage_sync/index.rs +++ b/pageserver/src/remote_storage/storage_sync/index.rs @@ -5,7 +5,7 @@ //! This way in the future, the index could be restored fast from its serialized stored form. use std::{ - collections::{BTreeMap, BTreeSet, HashMap, HashSet}, + collections::{BTreeMap, BTreeSet, HashMap}, path::{Path, PathBuf}, }; @@ -49,10 +49,9 @@ impl RelativePath { } /// An index to track tenant files that exist on the remote storage. -/// Currently, timeline archives and branch files are tracked. +/// Currently, timeline archives files are tracked only. 
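(Aside, not part of the patch: with branch_files gone, the index boils down to a single
map keyed by the tenant+timeline pair. A stripped-down, compilable sketch of that shape
and of the accessors kept in this file follows; the type and method names are simplified
placeholders, not the crate's real ZTenantTimelineId or TimelineIndexEntry.)

    use std::collections::HashMap;

    #[derive(Clone, Copy, PartialEq, Eq, Hash)]
    struct SyncId {
        tenant: [u8; 16],
        timeline: [u8; 16],
    }

    struct Index<Entry> {
        timeline_files: HashMap<SyncId, Entry>,
    }

    impl<Entry> Index<Entry> {
        fn new() -> Self {
            Index { timeline_files: HashMap::new() }
        }
        fn add_timeline_entry(&mut self, id: SyncId, entry: Entry) {
            self.timeline_files.insert(id, entry);
        }
        fn timeline_entry(&self, id: &SyncId) -> Option<&Entry> {
            self.timeline_files.get(id)
        }
        fn all_sync_ids(&self) -> impl Iterator<Item = SyncId> + '_ {
            self.timeline_files.keys().copied()
        }
    }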
#[derive(Debug, Clone)] pub struct RemoteTimelineIndex { - branch_files: HashMap>, timeline_files: HashMap, } @@ -65,7 +64,6 @@ impl RemoteTimelineIndex { paths: impl Iterator, ) -> Self { let mut index = Self { - branch_files: HashMap::new(), timeline_files: HashMap::new(), }; for path in paths { @@ -98,17 +96,6 @@ impl RemoteTimelineIndex { pub fn all_sync_ids(&self) -> impl Iterator + '_ { self.timeline_files.keys().copied() } - - pub fn add_branch_file(&mut self, tenant_id: ZTenantId, path: RelativePath) { - self.branch_files - .entry(tenant_id) - .or_insert_with(HashSet::new) - .insert(path); - } - - pub fn branch_files(&self, tenant_id: ZTenantId) -> Option<&HashSet> { - self.branch_files.get(&tenant_id) - } } #[derive(Debug, Clone, PartialEq, Eq)] @@ -306,20 +293,9 @@ fn try_parse_index_entry( .parse::() .with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?; - let branches_path = conf.branches_path(&tenant_id); let timelines_path = conf.timelines_path(&tenant_id); - match ( - RelativePath::new(&branches_path, &path), - path.strip_prefix(&timelines_path), - ) { - (Ok(_), Ok(_)) => bail!( - "Path '{}' cannot start with both branches '{}' and the timelines '{}' prefixes", - path.display(), - branches_path.display(), - timelines_path.display() - ), - (Ok(branches_entry), Err(_)) => index.add_branch_file(tenant_id, branches_entry), - (Err(_), Ok(timelines_subpath)) => { + match path.strip_prefix(&timelines_path) { + Ok(timelines_subpath) => { let mut segments = timelines_subpath.iter(); let timeline_id = segments .next() @@ -375,11 +351,10 @@ fn try_parse_index_entry( } } } - (Err(branches_error), Err(timelines_strip_error)) => { + Err(timelines_strip_error) => { bail!( - "Path '{}' is not an index entry: it's neither parsable as a branch entry '{:#}' nor as an archive entry '{}'", + "Path '{}' is not an archive entry '{}'", path.display(), - branches_error, timelines_strip_error, ) } diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/remote_storage/storage_sync/upload.rs index 0f57d714dd..d064039ecc 100644 --- a/pageserver/src/remote_storage/storage_sync/upload.rs +++ b/pageserver/src/remote_storage/storage_sync/upload.rs @@ -1,13 +1,10 @@ //! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints. -//! Currently, tenant branch files are also uploaded, but this does not appear final. use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc}; -use anyhow::{ensure, Context}; -use futures::{stream::FuturesUnordered, StreamExt}; -use tokio::{fs, sync::RwLock}; +use anyhow::ensure; +use tokio::sync::RwLock; use tracing::{debug, error, warn}; -use zenith_utils::zid::ZTenantId; use crate::{ config::PageServerConf, @@ -15,7 +12,7 @@ use crate::{ storage_sync::{ compression, index::{RemoteTimeline, TimelineIndexEntry}, - sync_queue, tenant_branch_files, update_index_description, SyncKind, SyncTask, + sync_queue, update_index_description, SyncKind, SyncTask, }, RemoteStorage, ZTenantTimelineId, }, @@ -26,8 +23,6 @@ use super::{compression::ArchiveHeader, index::RemoteTimelineIndex, NewCheckpoin /// Attempts to compress and upload given checkpoint files. /// No extra checks for overlapping files is made: download takes care of that, ensuring no non-metadata local timeline files are overwritten. /// -/// Before the checkpoint files are uploaded, branch files are uploaded, if any local ones are missing remotely. 
-/// /// On an error, bumps the retries count and reschedules the entire task. /// On success, populates index data with new downloads. pub(super) async fn upload_timeline_checkpoint< @@ -41,19 +36,6 @@ pub(super) async fn upload_timeline_checkpoint< retries: u32, ) -> Option { debug!("Uploading checkpoint for sync id {}", sync_id); - if let Err(e) = upload_missing_branches(config, remote_assets.as_ref(), sync_id.tenant_id).await - { - error!( - "Failed to upload missing branches for sync id {}: {:?}", - sync_id, e - ); - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Upload(new_checkpoint), - )); - return Some(false); - } let new_upload_lsn = new_checkpoint.metadata.disk_consistent_lsn(); let index = &remote_assets.1; @@ -201,76 +183,6 @@ async fn try_upload_checkpoint< .map(|(header, header_size, _)| (header, header_size)) } -async fn upload_missing_branches< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - config: &'static PageServerConf, - (storage, index): &(S, RwLock), - tenant_id: ZTenantId, -) -> anyhow::Result<()> { - let local_branches = tenant_branch_files(config, tenant_id) - .await - .context("Failed to list local branch files for the tenant")?; - let index_read = index.read().await; - let remote_branches = index_read - .branch_files(tenant_id) - .cloned() - .unwrap_or_default(); - drop(index_read); - - let mut branch_uploads = local_branches - .difference(&remote_branches) - .map(|local_only_branch| async move { - let local_branch_path = local_only_branch.as_path(&config.branches_path(&tenant_id)); - let storage_path = storage.storage_path(&local_branch_path).with_context(|| { - format!( - "Failed to derive a storage path for branch with local path '{}'", - local_branch_path.display() - ) - })?; - let local_branch_file = fs::OpenOptions::new() - .read(true) - .open(&local_branch_path) - .await - .with_context(|| { - format!( - "Failed to open local branch file {} for reading", - local_branch_path.display() - ) - })?; - storage - .upload(local_branch_file, &storage_path) - .await - .with_context(|| { - format!( - "Failed to upload branch file to the remote path {:?}", - storage_path - ) - })?; - Ok::<_, anyhow::Error>(local_only_branch) - }) - .collect::>(); - - let mut branch_uploads_failed = false; - while let Some(upload_result) = branch_uploads.next().await { - match upload_result { - Ok(local_only_branch) => index - .write() - .await - .add_branch_file(tenant_id, local_only_branch.clone()), - Err(e) => { - error!("Failed to upload branch file: {:?}", e); - branch_uploads_failed = true; - } - } - } - - ensure!(!branch_uploads_failed, "Failed to upload all branch files"); - - Ok(()) -} - #[cfg(test)] mod tests { use tempfile::tempdir; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 6142953a58..674d447624 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -36,6 +36,10 @@ pub trait Repository: Send + Sync { /// Get Timeline handle for given zenith timeline ID. fn get_timeline(&self, timelineid: ZTimelineId) -> Result; + /// Lists timelines the repository contains. + /// Up to repository's implementation to omit certain timelines that ar not considered ready for use. + fn list_timelines(&self) -> Result>; + /// Create a new, empty timeline. The caller is responsible for loading data into it /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. 
fn create_empty_timeline( @@ -72,7 +76,10 @@ pub trait Repository: Send + Sync { pub enum RepositoryTimeline { /// Timeline, with its files present locally in pageserver's working directory. /// Loaded into pageserver's memory and ready to be used. - Local(Arc), + Local { + id: ZTimelineId, + timeline: Arc, + }, /// Timeline, found on the pageserver's remote storage, but not yet downloaded locally. Remote { id: ZTimelineId, @@ -83,12 +90,19 @@ pub enum RepositoryTimeline { impl RepositoryTimeline { pub fn local_timeline(&self) -> Option> { - if let Self::Local(local_timeline) = self { - Some(Arc::clone(local_timeline)) + if let Self::Local { timeline, .. } = self { + Some(Arc::clone(timeline)) } else { None } } + + pub fn id(&self) -> ZTimelineId { + match self { + Self::Local { id, .. } => *id, + Self::Remote { id, .. } => *id, + } + } } /// A state of the timeline synchronization with the remote storage. @@ -390,7 +404,6 @@ pub mod repo_harness { let tenant_id = ZTenantId::generate(); fs::create_dir_all(conf.tenant_path(&tenant_id))?; - fs::create_dir_all(conf.branches_path(&tenant_id))?; Ok(Self { conf, tenant_id }) } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index d60b5fefd3..98777e5e4b 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -1,12 +1,12 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. -use crate::branches; use crate::config::PageServerConf; use crate::layered_repository::LayeredRepository; use crate::repository::{Repository, Timeline, TimelineSyncState}; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; +use crate::timelines; use crate::walredo::PostgresRedoManager; use crate::CheckpointConfig; use anyhow::{bail, Context, Result}; @@ -182,7 +182,7 @@ pub fn create_repository_for_tenant( tenantid: ZTenantId, ) -> Result<()> { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid)); - let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?; + let repo = timelines::create_repo(conf, tenantid, wal_redo_manager)?; match access_tenants().entry(tenantid) { hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenantid), diff --git a/pageserver/src/branches.rs b/pageserver/src/timelines.rs similarity index 70% rename from pageserver/src/branches.rs rename to pageserver/src/timelines.rs index 43f27af5ea..4a84b434a9 100644 --- a/pageserver/src/branches.rs +++ b/pageserver/src/timelines.rs @@ -1,5 +1,5 @@ //! -//! Branch management code +//! Timeline management code //! // TODO: move all paths construction to conf impl // @@ -27,8 +27,7 @@ use crate::{import_datadir, LOG_FILE_NAME}; use crate::{repository::RepositoryTimeline, tenant_mgr}; #[derive(Serialize, Deserialize, Clone)] -pub struct BranchInfo { - pub name: String, +pub struct TimelineInfo { #[serde(with = "hex")] pub timeline_id: ZTimelineId, pub latest_valid_lsn: Lsn, @@ -38,59 +37,6 @@ pub struct BranchInfo { pub current_logical_size_non_incremental: Option, } -impl BranchInfo { - pub fn from_path>( - path: T, - repo: &Arc, - include_non_incremental_logical_size: bool, - ) -> Result { - let path = path.as_ref(); - let name = path.file_name().unwrap().to_string_lossy().to_string(); - let timeline_id = std::fs::read_to_string(path) - .with_context(|| { - format!( - "Failed to read branch file contents at path '{}'", - path.display() - ) - })? - .parse::()?; - - let timeline = match repo.get_timeline(timeline_id)? 
{ - RepositoryTimeline::Local(local_entry) => local_entry, - RepositoryTimeline::Remote { .. } => { - bail!("Timeline {} is remote, no branches to display", timeline_id) - } - }; - - // we use ancestor lsn zero if we don't have an ancestor, so turn this into an option based on timeline id - let (ancestor_id, ancestor_lsn) = match timeline.get_ancestor_timeline_id() { - Some(ancestor_id) => ( - Some(ancestor_id.to_string()), - Some(timeline.get_ancestor_lsn().to_string()), - ), - None => (None, None), - }; - - // non incremental size calculation can be heavy, so let it be optional - // needed for tests to check size calculation - let current_logical_size_non_incremental = include_non_incremental_logical_size - .then(|| { - timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) - }) - .transpose()?; - - Ok(BranchInfo { - name, - timeline_id, - latest_valid_lsn: timeline.get_last_record_lsn(), - ancestor_id, - ancestor_lsn, - current_logical_size: timeline.get_current_logical_size(), - current_logical_size_non_incremental, - }) - } -} - #[derive(Debug, Clone, Copy)] pub struct PointInTime { pub timelineid: ZTimelineId, @@ -140,7 +86,6 @@ pub fn create_repo( .with_context(|| format!("could not create directory {}", repo_dir.display()))?; crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?; - crashsafe_dir::create_dir_all(conf.branches_path(&tenantid))?; crashsafe_dir::create_dir_all(conf.tags_path(&tenantid))?; info!("created directory structure in {}", repo_dir.display()); @@ -198,7 +143,7 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { .output() .context("failed to execute initdb")?; if !initdb_output.status.success() { - anyhow::bail!( + bail!( "initdb failed: '{}'", String::from_utf8_lossy(&initdb_output.stderr) ); @@ -245,65 +190,80 @@ fn bootstrap_timeline( timeline.get_last_record_lsn() ); - let data = tli.to_string(); - fs::write(conf.branch_path("main", &tenantid), data)?; - println!("created main branch"); - // Remove temp dir. We don't need it anymore fs::remove_dir_all(pgdata_path)?; Ok(()) } -pub(crate) fn get_branches( - conf: &PageServerConf, - tenantid: &ZTenantId, +pub(crate) fn get_timelines( + tenant_id: ZTenantId, include_non_incremental_logical_size: bool, -) -> Result> { - let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?; +) -> Result> { + let repo = tenant_mgr::get_repository_for_tenant(tenant_id) + .with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?; - // Each branch has a corresponding record (text file) in the refs/branches - // with timeline_id. - let branches_dir = conf.branches_path(tenantid); - - std::fs::read_dir(&branches_dir) - .with_context(|| { - format!( - "Found no branches directory '{}' for tenant {}", - branches_dir.display(), - tenantid - ) - })? - .map(|dir_entry_res| { - let dir_entry = dir_entry_res.with_context(|| { - format!( - "Failed to list branches directory '{}' content for tenant {}", - branches_dir.display(), - tenantid - ) - })?; - BranchInfo::from_path( - dir_entry.path(), - &repo, - include_non_incremental_logical_size, - ) + Ok(repo + .list_timelines() + .with_context(|| format!("Failed to list timelines for tenant {}", tenant_id))? + .into_iter() + .filter_map(|timeline| match timeline { + RepositoryTimeline::Local { timeline, id } => Some((id, timeline)), + RepositoryTimeline::Remote { .. 
} => None, }) - .collect() + .map(|(timeline_id, timeline)| { + let (ancestor_id, ancestor_lsn) = match timeline.get_ancestor_timeline_id() { + Some(ancestor_id) => ( + Some(ancestor_id.to_string()), + Some(timeline.get_ancestor_lsn().to_string()), + ), + None => (None, None), + }; + + let current_logical_size_non_incremental = if include_non_incremental_logical_size { + match timeline + .get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) + { + Ok(size) => Some(size), + Err(e) => { + error!( + "Failed to get current logical size for timeline {}: {:?}", + timeline_id, e + ); + None + } + } + } else { + None + }; + + TimelineInfo { + timeline_id, + latest_valid_lsn: timeline.get_last_record_lsn(), + ancestor_id, + ancestor_lsn, + current_logical_size: timeline.get_current_logical_size(), + // non incremental size calculation can be heavy, so let it be optional + // needed for tests to check size calculation + current_logical_size_non_incremental, + } + }) + .collect()) } -pub(crate) fn create_branch( +pub(crate) fn create_timeline( conf: &PageServerConf, - branchname: &str, startpoint_str: &str, - tenantid: &ZTenantId, -) -> Result { - let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?; + tenant_id: ZTenantId, + timeline_id: ZTimelineId, +) -> Result { + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - if conf.branch_path(branchname, tenantid).exists() { - anyhow::bail!("branch {} already exists", branchname); + if conf.timeline_path(&timeline_id, &tenant_id).exists() { + bail!("timeline {} already exists", timeline_id); } - let mut startpoint = parse_point_in_time(conf, startpoint_str, tenantid)?; + let mut startpoint = parse_point_in_time(conf, startpoint_str, &tenant_id)?; let timeline = repo .get_timeline(startpoint.timelineid)? .local_timeline() @@ -325,10 +285,10 @@ pub(crate) fn create_branch( startpoint.lsn = startpoint.lsn.align(); if timeline.get_ancestor_lsn() > startpoint.lsn { // can we safely just branch from the ancestor instead? - anyhow::bail!( - "invalid startpoint {} for the branch {}: less than timeline ancestor lsn {:?}", + bail!( + "invalid startpoint {} for the timeline {}: less than timeline ancestor lsn {:?}", startpoint.lsn, - branchname, + timeline_id, timeline.get_ancestor_lsn() ); } @@ -342,11 +302,11 @@ pub(crate) fn create_branch( // Remember the human-readable branch name for the new timeline. // FIXME: there's a race condition, if you create a branch with the same // name concurrently. 
+ // TODO kb timeline creation needs more let data = new_timeline_id.to_string(); - fs::write(conf.branch_path(branchname, tenantid), data)?; + fs::write(conf.timeline_path(&timeline_id, &tenant_id), data)?; - Ok(BranchInfo { - name: branchname.to_string(), + Ok(TimelineInfo { timeline_id: new_timeline_id, latest_valid_lsn: startpoint.lsn, ancestor_id: Some(startpoint.timelineid.to_string()), @@ -367,14 +327,6 @@ pub(crate) fn create_branch( // A specific LSN on a timeline: // bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8 // -// Same, with a human-friendly branch name: -// main -// main@2/15D3DD8 -// -// Human-friendly tag name: -// mytag -// -// fn parse_point_in_time( conf: &PageServerConf, s: &str, @@ -399,18 +351,6 @@ fn parse_point_in_time( } } - // Check if it's a branch - // Check if it's branch @ LSN - let branchpath = conf.branch_path(name, tenantid); - if branchpath.exists() { - let pointstr = fs::read_to_string(branchpath)?; - - let mut result = parse_point_in_time(conf, &pointstr, tenantid)?; - - result.lsn = lsn.unwrap_or(Lsn(0)); - return Ok(result); - } - // Check if it's a timelineid // Check if it's timelineid @ LSN if let Ok(timelineid) = ZTimelineId::from_str(name) { diff --git a/zenith/src/main.rs b/zenith/src/main.rs index bc42af5943..9f8996a540 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -21,7 +21,7 @@ use zenith_utils::postgres_backend::AuthType; use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; use zenith_utils::GIT_VERSION; -use pageserver::branches::BranchInfo; +use pageserver::timelines::TimelineInfo; // Default id of a safekeeper node, if not specified on the command line. const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1); @@ -53,12 +53,12 @@ http_port = {safekeeper_http_port} } /// -/// Branches tree element used as a value in the HashMap. +/// Timelines tree element used as a value in the HashMap. /// -struct BranchTreeEl { - /// `BranchInfo` received from the `pageserver` via the `branch_list` libpq API call. - pub info: BranchInfo, - /// Holds all direct children of this branch referenced using `timeline_id`. +struct TimelineTreeEl { + /// `TimelineInfo` received from the `pageserver` via the `timeline_list` libpq API call. + pub info: TimelineInfo, + /// Holds all direct children of this timeline referenced using `timeline_id`. pub children: Vec, } @@ -84,7 +84,7 @@ fn main() -> Result<()> { let timeline_arg = Arg::new("timeline") .index(2) - .help("Branch name or a point-in time specification") + .help("Timeline id or a point-in time specification") .required(false); let tenantid_arg = Arg::new("tenantid") @@ -129,9 +129,9 @@ fn main() -> Result<()> { ) ) .subcommand( - App::new("branch") - .about("Create a new branch") - .arg(Arg::new("branchname").required(false).index(1)) + App::new("timeline") + .about("Create a new timeline") + .arg(Arg::new("timeline-name").required(false).index(1)) .arg(Arg::new("start-point").required(false).index(2)) .arg(tenantid_arg.clone()), ).subcommand( @@ -239,7 +239,7 @@ fn main() -> Result<()> { match sub_name { "tenant" => handle_tenant(sub_args, &env), - "branch" => handle_branch(sub_args, &env), + "timeline" => handle_timeline(sub_args, &env), "start" => handle_start_all(sub_args, &env), "stop" => handle_stop_all(sub_args, &env), "pageserver" => handle_pageserver(sub_args, &env), @@ -257,43 +257,42 @@ fn main() -> Result<()> { } /// -/// Prints branches list as a tree-like structure. +/// Prints timelines list as a tree-like structure. 
/// -fn print_branches_tree(branches: Vec) -> Result<()> { - let mut branches_hash: HashMap = HashMap::new(); +fn print_timelines_tree(timelines: Vec) -> Result<()> { + let mut timelines_hash: HashMap = timelines + .iter() + .map(|t| { + ( + t.timeline_id.to_string(), + TimelineTreeEl { + info: t.clone(), + children: Vec::new(), + }, + ) + }) + .collect(); - // Form a hash table of branch timeline_id -> BranchTreeEl. - for branch in &branches { - branches_hash.insert( - branch.timeline_id.to_string(), - BranchTreeEl { - info: branch.clone(), - children: Vec::new(), - }, - ); - } - - // Memorize all direct children of each branch. - for branch in &branches { - if let Some(tid) = &branch.ancestor_id { - branches_hash + // Memorize all direct children of each timeline. + for timeline in &timelines { + if let Some(tid) = &timeline.ancestor_id { + timelines_hash .get_mut(tid) - .context("missing branch info in the HashMap")? + .context("missing timeline info in the HashMap")? .children - .push(branch.timeline_id.to_string()); + .push(timeline.timeline_id.to_string()); } } // Sort children by tid to bring some minimal order. - for branch in &mut branches_hash.values_mut() { - branch.children.sort(); + for timeline in &mut timelines_hash.values_mut() { + timeline.children.sort(); } - for branch in branches_hash.values() { - // Start with root branches (no ancestors) first. - // Now there is 'main' branch only, but things may change. - if branch.info.ancestor_id.is_none() { - print_branch(0, &Vec::from([true]), branch, &branches_hash)?; + for timeline in timelines_hash.values() { + // Start with root timelines (no ancestors) first. + if timeline.info.ancestor_id.is_none() { + print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?; } } @@ -301,27 +300,27 @@ fn print_branches_tree(branches: Vec) -> Result<()> { } /// -/// Recursively prints branch info with all its children. +/// Recursively prints timeline info with all its children. /// -fn print_branch( +fn print_timeline( nesting_level: usize, is_last: &[bool], - branch: &BranchTreeEl, - branches: &HashMap, + timeline: &TimelineTreeEl, + timelines: &HashMap, ) -> Result<()> { // Draw main padding print!(" "); if nesting_level > 0 { - let lsn = branch + let lsn = timeline .info .ancestor_lsn .as_ref() - .context("missing branch info in the HashMap")?; + .context("missing timeline info in the HashMap")?; let mut br_sym = "┣━"; // Draw each nesting padding with proper style - // depending on whether its branch ended or not. + // depending on whether its timeline ended or not. 
if nesting_level > 1 { for l in &is_last[1..is_last.len() - 1] { if *l { @@ -332,7 +331,7 @@ fn print_branch( } } - // We are the last in this sub-branch + // We are the last in this sub-timeline if *is_last.last().unwrap() { br_sym = "┗━"; } @@ -340,51 +339,51 @@ fn print_branch( print!("{} @{}: ", br_sym, lsn); } - // Finally print a branch name with new line - println!("{}", branch.info.name); + // Finally print a timeline name with new line + println!("{}", timeline.info.timeline_id); - let len = branch.children.len(); + let len = timeline.children.len(); let mut i: usize = 0; let mut is_last_new = Vec::from(is_last); is_last_new.push(false); - for child in &branch.children { + for child in &timeline.children { i += 1; - // Mark that the last padding is the end of the branch + // Mark that the last padding is the end of the timeline if i == len { if let Some(last) = is_last_new.last_mut() { *last = true; } } - print_branch( + print_timeline( nesting_level + 1, &is_last_new, - branches + timelines .get(child) - .context("missing branch info in the HashMap")?, - branches, + .context("missing timeline info in the HashMap")?, + timelines, )?; } Ok(()) } -/// Returns a map of timeline IDs to branch_name@lsn strings. +/// Returns a map of timeline IDs to timeline_id@lsn strings. /// Connects to the pageserver to query this information. -fn get_branch_infos( +fn get_timeline_infos( env: &local_env::LocalEnv, tenantid: &ZTenantId, -) -> Result> { +) -> Result> { let page_server = PageServerNode::from_env(env); - let branch_infos: Vec = page_server.branch_list(tenantid)?; - let branch_infos: HashMap = branch_infos + let timeline_infos: Vec = page_server.timeline_list(tenantid)?; + let timeline_infos: HashMap = timeline_infos .into_iter() - .map(|branch_info| (branch_info.timeline_id, branch_info)) + .map(|timeline_info| (timeline_info.timeline_id, timeline_info)) .collect(); - Ok(branch_infos) + Ok(timeline_infos) } // Helper function to parse --tenantid option, or get the default from config file @@ -459,24 +458,28 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result Ok(()) } -fn handle_branch(branch_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +fn handle_timeline(timeline_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let pageserver = PageServerNode::from_env(env); - let tenantid = get_tenantid(branch_match, env)?; + let tenant_id = get_tenantid(timeline_match, env)?; - if let Some(branchname) = branch_match.value_of("branchname") { - let startpoint_str = branch_match + if let Some(timeline_id) = timeline_match.value_of("timeline-id") { + let startpoint_str = timeline_match .value_of("start-point") .context("Missing start-point")?; - let branch = pageserver.branch_create(branchname, startpoint_str, &tenantid)?; + let timeline_id = timeline_id + .parse::() + .context("Failed to parse timeline id from the request")?; + let timeline = + pageserver.timeline_create(timeline_id, startpoint_str.to_owned(), tenant_id)?; println!( - "Created branch '{}' at {:?} for tenant: {}", - branch.name, branch.latest_valid_lsn, tenantid, + "Created timeline '{}' at {:?} for tenant: {}", + timeline.timeline_id, timeline.latest_valid_lsn, tenant_id, ); } else { - // No arguments, list branches for tenant - let branches = pageserver.branch_list(&tenantid)?; - print_branches_tree(branches)?; + // No arguments, list timelines for tenant + let timelines = pageserver.timeline_list(&tenant_id)?; + print_timelines_tree(timelines)?; } Ok(()) @@ -495,12 
+498,12 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { match sub_name { "list" => { - let branch_infos = get_branch_infos(env, &tenantid).unwrap_or_else(|e| { - eprintln!("Failed to load branch info: {}", e); + let timeline_infos = get_timeline_infos(env, &tenantid).unwrap_or_else(|e| { + eprintln!("Failed to load timeline info: {}", e); HashMap::new() }); - println!("NODE\tADDRESS\t\tBRANCH\tLSN\t\tSTATUS"); + println!("NODE\tADDRESS\t\tTIMELINE\tLSN\t\tSTATUS"); for ((_, node_name), node) in cplane .nodes .iter() @@ -509,7 +512,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { // FIXME: This shows the LSN at the end of the timeline. It's not the // right thing to do for read-only nodes that might be anchored at an // older point in time, or following but lagging behind the primary. - let lsn_str = branch_infos + let lsn_str = timeline_infos .get(&node.timelineid) .map(|bi| bi.latest_valid_lsn.to_string()) .unwrap_or_else(|| "?".to_string()); @@ -518,7 +521,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { "{}\t{}\t{}\t{}\t{}", node_name, node.address, - node.timelineid, // FIXME: resolve human-friendly branch name + node.timelineid, lsn_str, node.status(), ); @@ -526,17 +529,17 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { } "create" => { let node_name = sub_args.value_of("node").unwrap_or("main"); - let timeline_name = sub_args.value_of("timeline").unwrap_or(node_name); + let timeline_spec = sub_args.value_of("timeline"); let port: Option = match sub_args.value_of("port") { Some(p) => Some(p.parse()?), None => None, }; - cplane.new_node(tenantid, node_name, timeline_name, port)?; + cplane.new_node(tenantid, node_name, timeline_spec, port)?; } "start" => { let node_name = sub_args.value_of("node").unwrap_or("main"); - let timeline_name = sub_args.value_of("timeline"); + let timeline_spec = sub_args.value_of("timeline"); let port: Option = match sub_args.value_of("port") { Some(p) => Some(p.parse()?), @@ -554,8 +557,8 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { }; if let Some(node) = node { - if timeline_name.is_some() { - println!("timeline name ignored because node exists already"); + if timeline_spec.is_some() { + println!("timeline spec ignored because its node exists already"); } println!("Starting existing postgres {}...", node_name); node.start(&auth_token)?; @@ -565,12 +568,11 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { // start --port X // stop // start <-- will also use port X even without explicit port argument - let timeline_name = timeline_name.unwrap_or(node_name); println!( - "Starting new postgres {} on {}...", - node_name, timeline_name + "Starting new postgres {} on timeline {:?} ...", + node_name, timeline_spec ); - let node = cplane.new_node(tenantid, node_name, timeline_name, port)?; + let node = cplane.new_node(tenantid, node_name, timeline_spec, port)?; node.start(&auth_token)?; } } @@ -585,9 +587,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { node.stop(destroy)?; } - _ => { - bail!("Unexpected pg subcommand '{}'", sub_name) - } + _ => bail!("Unexpected pg subcommand '{}'", sub_name), } Ok(()) From 0c91091c637d167e65b21e7456c344cafaeb6016 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sun, 13 Feb 2022 23:49:32 +0200 Subject: [PATCH 0018/1022] Avoid point in time concept on pageserver level --- 
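Illustrative sketch of the resulting call flow (not part of the diff below): after this change the pageserver no longer parses "<timeline_id>[@<lsn>]" strings; the control plane resolves the spec with the parse_point_in_time helper added in this patch and the HTTP API carries an explicit optional start LSN. Assuming that helper and the reworked PageServerNode::timeline_create signature introduced below:

    // Resolve the user-facing spec (raw hex timeline id, optionally "@<lsn>")
    // on the control-plane side.
    let (timeline_id, lsn) =
        parse_point_in_time("bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8")?;
    // Hand the pageserver an explicit Option<Lsn> instead of a raw string.
    let timeline = pageserver.timeline_create(tenant_id, timeline_id, lsn)?;
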
control_plane/src/compute.rs | 67 ++++++++++++++++++++------------- control_plane/src/storage.rs | 7 ++-- pageserver/src/config.rs | 12 ------ pageserver/src/http/models.rs | 4 +- pageserver/src/http/routes.rs | 4 +- pageserver/src/timelines.rs | 71 ++++++----------------------------- zenith/src/main.rs | 12 +++--- 7 files changed, 67 insertions(+), 110 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 3569cc1dbb..3381ca4a04 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -10,7 +10,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{Context, Result}; +use anyhow::{bail, Context, Result}; use zenith_utils::connstring::connection_host_port; use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; @@ -73,31 +73,6 @@ impl ComputeControlPlane { .unwrap_or(self.base_port) } - // FIXME: see also parse_point_in_time in timelines.rs. - fn parse_point_in_time( - &self, - tenant_id: ZTenantId, - s: &str, - ) -> Result<(ZTimelineId, Option)> { - let _strings = s.split('@'); - // let name = strings.next().unwrap(); - - // let lsn = strings - // .next() - // .map(Lsn::from_str) - // .transpose() - // .context("invalid LSN in point-in-time specification")?; - - // // Resolve the timeline ID, given the human-readable branch name - // let timeline_id = self - // .pageserver - // .branch_get_by_name(&tenant_id, name)? - // .timeline_id; - - // Ok((timeline_id, lsn)) - todo!("TODO kb check more about the '@name' format") - } - pub fn new_node( &mut self, tenantid: ZTenantId, @@ -107,7 +82,7 @@ impl ComputeControlPlane { ) -> Result> { // Resolve the human-readable timeline spec into timeline ID and LSN let (timelineid, lsn) = match timeline_spec { - Some(timeline_spec) => self.parse_point_in_time(tenantid, timeline_spec)?, + Some(timeline_spec) => parse_point_in_time(timeline_spec)?, None => (ZTimelineId::generate(), None), }; @@ -134,6 +109,44 @@ impl ComputeControlPlane { } } +// Parse user-given string that represents a point-in-time. 
+// +// Variants suported: +// +// Raw timeline id in hex, meaning the end of that timeline: +// bc62e7d612d0e6fe8f99a6dd2f281f9d +// +// A specific LSN on a timeline: +// bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8 +// +fn parse_point_in_time(timeline_spec: &str) -> anyhow::Result<(ZTimelineId, Option)> { + let mut strings = timeline_spec.split('@'); + + let name = match strings.next() { + Some(n) => n, + None => bail!("invalid timeline specification: {}", timeline_spec), + }; + let timeline_id = ZTimelineId::from_str(name).with_context(|| { + format!( + "failed to parse the timeline id from specification: {}", + timeline_spec + ) + })?; + + let lsn = strings + .next() + .map(Lsn::from_str) + .transpose() + .with_context(|| { + format!( + "failed to parse the Lsn from timeline specification: {}", + timeline_spec + ) + })?; + + Ok((timeline_id, lsn)) +} + /////////////////////////////////////////////////////////////////////////////// #[derive(Debug)] diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index aed9a757d4..d550bfc064 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -16,6 +16,7 @@ use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; use zenith_utils::http::error::HttpErrorBody; +use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; use zenith_utils::zid::{ZTenantId, ZTimelineId}; @@ -348,16 +349,16 @@ impl PageServerNode { pub fn timeline_create( &self, - timeline_id: ZTimelineId, - start_point: String, tenant_id: ZTenantId, + timeline_id: ZTimelineId, + start_lsn: Option, ) -> Result { Ok(self .http_request(Method::POST, format!("{}/timeline", self.http_base_url)) .json(&TimelineCreateRequest { tenant_id, timeline_id, - start_point, + start_lsn, }) .send()? .error_from_body()? 
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 5a9c7557cc..dc85c83c17 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -392,14 +392,6 @@ impl PageServerConf { self.tenants_path().join(tenantid.to_string()) } - pub fn tags_path(&self, tenantid: &ZTenantId) -> PathBuf { - self.tenant_path(tenantid).join("refs").join("tags") - } - - pub fn tag_path(&self, tag_name: &str, tenantid: &ZTenantId) -> PathBuf { - self.tags_path(tenantid).join(tag_name) - } - pub fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf { self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME) } @@ -408,10 +400,6 @@ impl PageServerConf { self.timelines_path(tenantid).join(timelineid.to_string()) } - pub fn ancestor_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf { - self.timeline_path(timelineid, tenantid).join("ancestor") - } - // // Postgres distribution paths // diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index a6dce33c03..bc0d46a96c 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -1,5 +1,5 @@ use serde::{Deserialize, Serialize}; -use zenith_utils::zid::ZTimelineId; +use zenith_utils::{lsn::Lsn, zid::ZTimelineId}; use crate::ZTenantId; use zenith_utils::zid::ZNodeId; @@ -10,7 +10,7 @@ pub struct TimelineCreateRequest { pub tenant_id: ZTenantId, #[serde(with = "hex")] pub timeline_id: ZTimelineId, - pub start_point: String, + pub start_lsn: Option, } #[derive(Serialize, Deserialize)] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5ab1576aa6..34a61cab9c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -77,12 +77,12 @@ async fn timeline_create_handler(mut request: Request) -> Result, ) -> Result { - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - if conf.timeline_path(&timeline_id, &tenant_id).exists() { bail!("timeline {} already exists", timeline_id); } - let mut startpoint = parse_point_in_time(conf, startpoint_str, &tenant_id)?; + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + + let mut startpoint = PointInTime { + timeline_id, + lsn: start_lsn.unwrap_or(Lsn(0)), + }; + let timeline = repo - .get_timeline(startpoint.timelineid)? + .get_timeline(startpoint.timeline_id)? .local_timeline() .context("Cannot branch off the timeline that's not present locally")?; if startpoint.lsn == Lsn(0) { @@ -297,7 +300,7 @@ pub(crate) fn create_timeline( // Forward entire timeline creation routine to repository // backend, so it can do all needed initialization - repo.branch_timeline(startpoint.timelineid, new_timeline_id, startpoint.lsn)?; + repo.branch_timeline(startpoint.timeline_id, new_timeline_id, startpoint.lsn)?; // Remember the human-readable branch name for the new timeline. // FIXME: there's a race condition, if you create a branch with the same @@ -309,59 +312,9 @@ pub(crate) fn create_timeline( Ok(TimelineInfo { timeline_id: new_timeline_id, latest_valid_lsn: startpoint.lsn, - ancestor_id: Some(startpoint.timelineid.to_string()), + ancestor_id: Some(startpoint.timeline_id.to_string()), ancestor_lsn: Some(startpoint.lsn.to_string()), current_logical_size: 0, current_logical_size_non_incremental: Some(0), }) } - -// -// Parse user-given string that represents a point-in-time. 
-// -// We support multiple variants: -// -// Raw timeline id in hex, meaning the end of that timeline: -// bc62e7d612d0e6fe8f99a6dd2f281f9d -// -// A specific LSN on a timeline: -// bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8 -// -fn parse_point_in_time( - conf: &PageServerConf, - s: &str, - tenantid: &ZTenantId, -) -> Result { - let mut strings = s.split('@'); - let name = strings.next().unwrap(); - - let lsn = strings - .next() - .map(Lsn::from_str) - .transpose() - .context("invalid LSN in point-in-time specification")?; - - // Check if it's a tag - if lsn.is_none() { - let tagpath = conf.tag_path(name, tenantid); - if tagpath.exists() { - let pointstr = fs::read_to_string(tagpath)?; - - return parse_point_in_time(conf, &pointstr, tenantid); - } - } - - // Check if it's a timelineid - // Check if it's timelineid @ LSN - if let Ok(timelineid) = ZTimelineId::from_str(name) { - let tlipath = conf.timeline_path(&timelineid, tenantid); - if tlipath.exists() { - return Ok(PointInTime { - timelineid, - lsn: lsn.unwrap_or(Lsn(0)), - }); - } - } - - bail!("could not parse point-in-time {}", s); -} diff --git a/zenith/src/main.rs b/zenith/src/main.rs index 9f8996a540..7170653754 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -17,6 +17,7 @@ use walkeeper::defaults::{ DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; use zenith_utils::auth::{Claims, Scope}; +use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; use zenith_utils::GIT_VERSION; @@ -464,14 +465,15 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &local_env::LocalEnv) -> Re let tenant_id = get_tenantid(timeline_match, env)?; if let Some(timeline_id) = timeline_match.value_of("timeline-id") { - let startpoint_str = timeline_match - .value_of("start-point") - .context("Missing start-point")?; + let start_lsn = timeline_match + .value_of("start-lsn") + .map(|lsn| lsn.parse::()) + .transpose() + .context("Failed to parse start Lsn from the request")?; let timeline_id = timeline_id .parse::() .context("Failed to parse timeline id from the request")?; - let timeline = - pageserver.timeline_create(timeline_id, startpoint_str.to_owned(), tenant_id)?; + let timeline = pageserver.timeline_create(tenant_id, timeline_id, start_lsn)?; println!( "Created timeline '{}' at {:?} for tenant: {}", timeline.timeline_id, timeline.latest_valid_lsn, tenant_id, From f49990ed433616270a7db33c3d554d9ed4cf4135 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 14 Feb 2022 00:53:00 +0200 Subject: [PATCH 0019/1022] Allow creating timelines by branching off ancestors --- control_plane/src/compute.rs | 81 +---- control_plane/src/local_env.rs | 16 +- control_plane/src/storage.rs | 4 +- pageserver/src/http/models.rs | 9 +- pageserver/src/http/routes.rs | 58 +--- .../src/remote_storage/storage_sync/index.rs | 2 +- pageserver/src/repository.rs | 2 +- pageserver/src/tenant_mgr.rs | 6 +- pageserver/src/timelines.rs | 297 ++++++++++------ test_runner/batch_others/test_auth.py | 19 +- .../batch_others/test_branch_behind.py | 33 +- .../batch_others/test_clog_truncate.py | 15 +- test_runner/batch_others/test_config.py | 8 +- test_runner/batch_others/test_createdropdb.py | 27 +- test_runner/batch_others/test_createuser.py | 11 +- .../batch_others/test_gc_aggressive.py | 5 +- test_runner/batch_others/test_multixact.py | 10 +- .../batch_others/test_old_request_lsn.py | 4 +- .../batch_others/test_pageserver_api.py | 14 +- .../batch_others/test_pageserver_catchup.py | 8 +- 
.../batch_others/test_pageserver_restart.py | 4 +- .../batch_others/test_parallel_copy.py | 6 +- test_runner/batch_others/test_pgbench.py | 4 +- .../batch_others/test_readonly_node.py | 18 +- .../batch_others/test_restart_compute.py | 11 +- test_runner/batch_others/test_snapfiles_gc.py | 4 +- test_runner/batch_others/test_subxacts.py | 4 +- .../batch_others/test_tenant_relocation.py | 12 +- test_runner/batch_others/test_tenants.py | 22 +- .../batch_others/test_timeline_size.py | 17 +- test_runner/batch_others/test_twophase.py | 11 +- test_runner/batch_others/test_vm_bits.py | 10 +- test_runner/batch_others/test_wal_acceptor.py | 86 ++--- .../batch_others/test_wal_acceptor_async.py | 5 +- test_runner/batch_others/test_zenith_cli.py | 60 ++-- .../batch_pg_regress/test_isolation.py | 6 +- .../batch_pg_regress/test_pg_regress.py | 4 +- .../batch_pg_regress/test_zenith_regress.py | 4 +- test_runner/fixtures/compare_fixtures.py | 5 +- test_runner/fixtures/zenith_fixtures.py | 243 +++++++------ .../performance/test_bulk_tenant_create.py | 11 +- .../performance/test_parallel_copy_to.py | 1 - test_runner/test_broken.py | 4 +- zenith/src/main.rs | 327 ++++++++++++------ 44 files changed, 855 insertions(+), 653 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 3381ca4a04..5d225a67fa 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -10,7 +10,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{bail, Context, Result}; +use anyhow::{Context, Result}; use zenith_utils::connstring::connection_host_port; use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; @@ -37,7 +37,7 @@ impl ComputeControlPlane { // pgdatadirs // |- tenants // | |- - // | | |- + // | | |- pub fn load(env: LocalEnv) -> Result { let pageserver = Arc::new(PageServerNode::from_env(&env)); @@ -52,7 +52,7 @@ impl ComputeControlPlane { .with_context(|| format!("failed to list {}", tenant_dir.path().display()))? { let node = PostgresNode::from_dir_entry(timeline_dir?, &env, &pageserver)?; - nodes.insert((node.tenantid, node.name.clone()), Arc::new(node)); + nodes.insert((node.tenant_id, node.name.clone()), Arc::new(node)); } } @@ -75,17 +75,12 @@ impl ComputeControlPlane { pub fn new_node( &mut self, - tenantid: ZTenantId, + tenant_id: ZTenantId, name: &str, - timeline_spec: Option<&str>, + timeline_id: ZTimelineId, + lsn: Option, port: Option, ) -> Result> { - // Resolve the human-readable timeline spec into timeline ID and LSN - let (timelineid, lsn) = match timeline_spec { - Some(timeline_spec) => parse_point_in_time(timeline_spec)?, - None => (ZTimelineId::generate(), None), - }; - let port = port.unwrap_or_else(|| self.get_port()); let node = Arc::new(PostgresNode { name: name.to_owned(), @@ -93,9 +88,9 @@ impl ComputeControlPlane { env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), is_test: false, - timelineid, + timeline_id, lsn, - tenantid, + tenant_id, uses_wal_proposer: false, }); @@ -103,50 +98,12 @@ impl ComputeControlPlane { node.setup_pg_conf(self.env.pageserver.auth_type)?; self.nodes - .insert((tenantid, node.name.clone()), Arc::clone(&node)); + .insert((tenant_id, node.name.clone()), Arc::clone(&node)); Ok(node) } } -// Parse user-given string that represents a point-in-time. 
-// -// Variants suported: -// -// Raw timeline id in hex, meaning the end of that timeline: -// bc62e7d612d0e6fe8f99a6dd2f281f9d -// -// A specific LSN on a timeline: -// bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8 -// -fn parse_point_in_time(timeline_spec: &str) -> anyhow::Result<(ZTimelineId, Option)> { - let mut strings = timeline_spec.split('@'); - - let name = match strings.next() { - Some(n) => n, - None => bail!("invalid timeline specification: {}", timeline_spec), - }; - let timeline_id = ZTimelineId::from_str(name).with_context(|| { - format!( - "failed to parse the timeline id from specification: {}", - timeline_spec - ) - })?; - - let lsn = strings - .next() - .map(Lsn::from_str) - .transpose() - .with_context(|| { - format!( - "failed to parse the Lsn from timeline specification: {}", - timeline_spec - ) - })?; - - Ok((timeline_id, lsn)) -} - /////////////////////////////////////////////////////////////////////////////// #[derive(Debug)] @@ -156,9 +113,9 @@ pub struct PostgresNode { pub env: LocalEnv, pageserver: Arc, is_test: bool, - pub timelineid: ZTimelineId, + pub timeline_id: ZTimelineId, pub lsn: Option, // if it's a read-only node. None for primary - pub tenantid: ZTenantId, + pub tenant_id: ZTenantId, uses_wal_proposer: bool, } @@ -191,7 +148,7 @@ impl PostgresNode { let context = format!("in config file {}", cfg_path_str); let port: u16 = conf.parse_field("port", &context)?; let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?; - let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?; + let tenant_id: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?; let uses_wal_proposer = conf.get("wal_acceptors").is_some(); // parse recovery_target_lsn, if any @@ -205,9 +162,9 @@ impl PostgresNode { env: env.clone(), pageserver: Arc::clone(pageserver), is_test: false, - timelineid, + timeline_id: timelineid, lsn: recovery_target_lsn, - tenantid, + tenant_id, uses_wal_proposer, }) } @@ -258,9 +215,9 @@ impl PostgresNode { ); let sql = if let Some(lsn) = lsn { - format!("basebackup {} {} {}", self.tenantid, self.timelineid, lsn) + format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn) } else { - format!("basebackup {} {}", self.tenantid, self.timelineid) + format!("basebackup {} {}", self.tenant_id, self.timeline_id) }; let mut client = self @@ -346,8 +303,8 @@ impl PostgresNode { conf.append("shared_preload_libraries", "zenith"); conf.append_line(""); conf.append("zenith.page_server_connstring", &pageserver_connstr); - conf.append("zenith.zenith_tenant", &self.tenantid.to_string()); - conf.append("zenith.zenith_timeline", &self.timelineid.to_string()); + conf.append("zenith.zenith_tenant", &self.tenant_id.to_string()); + conf.append("zenith.zenith_timeline", &self.timeline_id.to_string()); if let Some(lsn) = self.lsn { conf.append("recovery_target_lsn", &lsn.to_string()); } @@ -425,7 +382,7 @@ impl PostgresNode { } pub fn pgdata(&self) -> PathBuf { - self.env.pg_data_dir(&self.tenantid, &self.name) + self.env.pg_data_dir(&self.tenant_id, &self.name) } pub fn status(&self) -> &str { diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 238c78821e..98b6379106 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -5,6 +5,7 @@ use anyhow::{bail, Context}; use serde::{Deserialize, Serialize}; +use std::collections::HashMap; use std::env; use std::fmt::Write; use std::fs; @@ -12,7 +13,7 @@ use std::path::{Path, PathBuf}; use std::process::{Command, 
Stdio}; use zenith_utils::auth::{encode_from_key_file, Claims, Scope}; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{HexZTenantId, ZNodeId, ZTenantId}; +use zenith_utils::zid::{HexZTenantId, ZNodeId, ZTenantId, ZTimelineId}; use crate::safekeeper::SafekeeperNode; @@ -48,7 +49,7 @@ pub struct LocalEnv { // Default tenant ID to use with the 'zenith' command line utility, when // --tenantid is not explicitly specified. #[serde(default)] - pub default_tenantid: Option, + pub default_tenant_id: Option, // used to issue tokens during e.g pg start #[serde(default)] @@ -58,6 +59,13 @@ pub struct LocalEnv { #[serde(default)] pub safekeepers: Vec, + + /// Every tenant has a first timeline created for it, currently the only one ancestor-less for this tenant. + /// It is used as a default timeline for branching, if no ancestor timeline is specified. + #[serde(default)] + // TODO kb this does not survive calls between invocations, so will have to persist it. + // Then it comes back to names again? + pub initial_timelines: HashMap, } #[derive(Serialize, Deserialize, Clone, Debug)] @@ -183,8 +191,8 @@ impl LocalEnv { } // If no initial tenant ID was given, generate it. - if env.default_tenantid.is_none() { - env.default_tenantid = Some(HexZTenantId::from(ZTenantId::generate())); + if env.default_tenant_id.is_none() { + env.default_tenant_id = Some(HexZTenantId::from(ZTenantId::generate())); } env.base_data_dir = base_path(); diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index d550bfc064..9d5a88784d 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -325,7 +325,7 @@ impl PageServerNode { .json()?) } - pub fn tenant_create(&self, tenantid: ZTenantId) -> Result<()> { + pub fn tenant_create(&self, tenantid: ZTenantId) -> Result { Ok(self .http_request(Method::POST, format!("{}/{}", self.http_base_url, "tenant")) .json(&TenantCreateRequest { @@ -352,6 +352,7 @@ impl PageServerNode { tenant_id: ZTenantId, timeline_id: ZTimelineId, start_lsn: Option, + ancestor_timeline_id: Option, ) -> Result { Ok(self .http_request(Method::POST, format!("{}/timeline", self.http_base_url)) @@ -359,6 +360,7 @@ impl PageServerNode { tenant_id, timeline_id, start_lsn, + ancestor_timeline_id, }) .send()? .error_from_body()? 
diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index bc0d46a96c..7f95c64527 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -1,8 +1,9 @@ use serde::{Deserialize, Serialize}; -use zenith_utils::{lsn::Lsn, zid::ZTimelineId}; - -use crate::ZTenantId; use zenith_utils::zid::ZNodeId; +use zenith_utils::{ + lsn::Lsn, + zid::{opt_display_serde, ZTenantId, ZTimelineId}, +}; #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { @@ -10,6 +11,8 @@ pub struct TimelineCreateRequest { pub tenant_id: ZTenantId, #[serde(with = "hex")] pub timeline_id: ZTimelineId, + #[serde(with = "opt_display_serde")] + pub ancestor_timeline_id: Option, pub start_lsn: Option, } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 34a61cab9c..f332e59135 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3,7 +3,6 @@ use std::sync::Arc; use anyhow::Result; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; -use serde::Serialize; use tracing::*; use zenith_utils::auth::JwtAuth; use zenith_utils::http::endpoint::attach_openapi_ui; @@ -17,15 +16,13 @@ use zenith_utils::http::{ request::parse_request_param, }; use zenith_utils::http::{RequestExt, RouterBuilder}; -use zenith_utils::lsn::Lsn; -use zenith_utils::zid::HexZTimelineId; -use zenith_utils::zid::ZTimelineId; +use zenith_utils::zid::{HexZTimelineId, ZTimelineId}; use super::models::StatusResponse; use super::models::TenantCreateRequest; use super::models::TimelineCreateRequest; use crate::repository::RepositoryTimeline; -use crate::repository::TimelineSyncState; +use crate::timelines::TimelineInfo; use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId}; #[derive(Debug)] @@ -82,6 +79,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> bool { .unwrap_or(false) } -#[derive(Debug, Serialize)] -#[serde(tag = "type")] -enum TimelineInfo { - Local { - #[serde(with = "hex")] - timeline_id: ZTimelineId, - #[serde(with = "hex")] - tenant_id: ZTenantId, - ancestor_timeline_id: Option, - last_record_lsn: Lsn, - prev_record_lsn: Lsn, - disk_consistent_lsn: Lsn, - timeline_state: Option, - }, - Remote { - #[serde(with = "hex")] - timeline_id: ZTimelineId, - #[serde(with = "hex")] - tenant_id: ZTenantId, - }, -} - async fn timeline_detail_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -151,23 +127,13 @@ async fn timeline_detail_handler(request: Request) -> Result(match repo.get_timeline(timeline_id)?.local_timeline() { - None => TimelineInfo::Remote { - timeline_id, - tenant_id, - }, - Some(timeline) => TimelineInfo::Local { - timeline_id, - tenant_id, - ancestor_timeline_id: timeline - .get_ancestor_timeline_id() - .map(HexZTimelineId::from), - disk_consistent_lsn: timeline.get_disk_consistent_lsn(), - last_record_lsn: timeline.get_last_record_lsn(), - prev_record_lsn: timeline.get_prev_record_lsn(), - timeline_state: repo.get_timeline_state(timeline_id), - }, - }) + let include_non_incremental_logical_size = + get_include_non_incremental_logical_size(&request); + Ok::<_, anyhow::Error>(TimelineInfo::from_repo_timeline( + tenant_id, + repo.get_timeline(timeline_id)?, + include_non_incremental_logical_size, + )) }) .await .map_err(ApiError::from_err)??; @@ -247,13 +213,13 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result, ApiError> { diff --git 
a/pageserver/src/remote_storage/storage_sync/index.rs b/pageserver/src/remote_storage/storage_sync/index.rs index 8ff92ed55e..81c99754c9 100644 --- a/pageserver/src/remote_storage/storage_sync/index.rs +++ b/pageserver/src/remote_storage/storage_sync/index.rs @@ -49,7 +49,7 @@ impl RelativePath { } /// An index to track tenant files that exist on the remote storage. -/// Currently, timeline archives files are tracked only. +/// Currently, timeline archive files are tracked only. #[derive(Debug, Clone)] pub struct RemoteTimelineIndex { timeline_files: HashMap, diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 674d447624..be937b8d26 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -107,7 +107,7 @@ impl RepositoryTimeline { /// A state of the timeline synchronization with the remote storage. /// Contains `disk_consistent_lsn` of the corresponding remote timeline (latest checkpoint's disk_consistent_lsn). -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum TimelineSyncState { /// No further downloads from the remote storage are needed. /// The timeline state is up-to-date or ahead of the remote storage one, diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 98777e5e4b..f7f694d833 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -180,9 +180,9 @@ pub fn shutdown_all_tenants() { pub fn create_repository_for_tenant( conf: &'static PageServerConf, tenantid: ZTenantId, -) -> Result<()> { +) -> Result { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid)); - let repo = timelines::create_repo(conf, tenantid, wal_redo_manager)?; + let (initial_timeline_id, repo) = timelines::create_repo(conf, tenantid, wal_redo_manager)?; match access_tenants().entry(tenantid) { hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenantid), @@ -194,7 +194,7 @@ pub fn create_repository_for_tenant( } } - Ok(()) + Ok(initial_timeline_id) } pub fn get_tenant_state(tenantid: ZTenantId) -> Option { diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 1e54fe3897..fc29767ddd 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -17,24 +17,133 @@ use std::{ use tracing::*; use zenith_utils::lsn::Lsn; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use zenith_utils::zid::{opt_display_serde, ZTenantId, ZTimelineId}; use zenith_utils::{crashsafe_dir, logging}; use crate::walredo::WalRedoManager; -use crate::CheckpointConfig; use crate::{config::PageServerConf, repository::Repository}; use crate::{import_datadir, LOG_FILE_NAME}; use crate::{repository::RepositoryTimeline, tenant_mgr}; +use crate::{repository::Timeline, CheckpointConfig}; #[derive(Serialize, Deserialize, Clone)] -pub struct TimelineInfo { - #[serde(with = "hex")] - pub timeline_id: ZTimelineId, - pub latest_valid_lsn: Lsn, - pub ancestor_id: Option, - pub ancestor_lsn: Option, - pub current_logical_size: usize, - pub current_logical_size_non_incremental: Option, +#[serde(tag = "type")] +pub enum TimelineInfo { + Local { + #[serde(with = "hex")] + timeline_id: ZTimelineId, + #[serde(with = "hex")] + tenant_id: ZTenantId, + last_record_lsn: Lsn, + prev_record_lsn: Lsn, + #[serde(with = "opt_display_serde")] + ancestor_timeline_id: Option, + ancestor_lsn: Option, + disk_consistent_lsn: Lsn, + current_logical_size: usize, + current_logical_size_non_incremental: Option, + }, + Remote { + 
#[serde(with = "hex")] + timeline_id: ZTimelineId, + #[serde(with = "hex")] + tenant_id: ZTenantId, + disk_consistent_lsn: Lsn, + }, +} + +impl TimelineInfo { + pub fn from_repo_timeline( + tenant_id: ZTenantId, + repo_timeline: RepositoryTimeline, + include_non_incremental_logical_size: bool, + ) -> Self { + match repo_timeline { + RepositoryTimeline::Local { id, timeline } => { + let ancestor_timeline_id = timeline.get_ancestor_timeline_id(); + let ancestor_lsn = if ancestor_timeline_id.is_some() { + Some(timeline.get_ancestor_lsn()) + } else { + None + }; + + Self::Local { + timeline_id: id, + tenant_id, + last_record_lsn: timeline.get_last_record_lsn(), + prev_record_lsn: timeline.get_prev_record_lsn(), + ancestor_timeline_id, + ancestor_lsn, + disk_consistent_lsn: timeline.get_disk_consistent_lsn(), + current_logical_size: timeline.get_current_logical_size(), + current_logical_size_non_incremental: get_current_logical_size_non_incremental( + include_non_incremental_logical_size, + timeline.as_ref(), + ), + } + } + RepositoryTimeline::Remote { + id, + disk_consistent_lsn, + } => Self::Remote { + timeline_id: id, + tenant_id, + disk_consistent_lsn, + }, + } + } + + pub fn from_dyn_timeline( + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + timeline: &dyn Timeline, + include_non_incremental_logical_size: bool, + ) -> Self { + let ancestor_timeline_id = timeline.get_ancestor_timeline_id(); + let ancestor_lsn = if ancestor_timeline_id.is_some() { + Some(timeline.get_ancestor_lsn()) + } else { + None + }; + + Self::Local { + timeline_id, + tenant_id, + last_record_lsn: timeline.get_last_record_lsn(), + prev_record_lsn: timeline.get_prev_record_lsn(), + ancestor_timeline_id, + ancestor_lsn, + disk_consistent_lsn: timeline.get_disk_consistent_lsn(), + current_logical_size: timeline.get_current_logical_size(), + current_logical_size_non_incremental: get_current_logical_size_non_incremental( + include_non_incremental_logical_size, + timeline, + ), + } + } + + pub fn timeline_id(&self) -> ZTimelineId { + match *self { + TimelineInfo::Local { timeline_id, .. } => timeline_id, + TimelineInfo::Remote { timeline_id, .. } => timeline_id, + } + } +} + +fn get_current_logical_size_non_incremental( + include_non_incremental_logical_size: bool, + timeline: &dyn Timeline, +) -> Option { + if !include_non_incremental_logical_size { + return None; + } + match timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) { + Ok(size) => Some(size), + Err(e) => { + error!("Failed to get non-incremental logical size: {:?}", e); + None + } + } } #[derive(Debug, Clone, Copy)] @@ -75,7 +184,7 @@ pub fn create_repo( conf: &'static PageServerConf, tenantid: ZTenantId, wal_redo_manager: Arc, -) -> Result> { +) -> Result<(ZTimelineId, Arc)> { let repo_dir = conf.tenant_path(&tenantid); if repo_dir.exists() { bail!("repo for {} already exists", tenantid) @@ -107,7 +216,7 @@ pub fn create_repo( // move data loading out of create_repo() bootstrap_timeline(conf, tenantid, timeline_id, repo.as_ref())?; - Ok(repo) + Ok((timeline_id, repo)) } // Returns checkpoint LSN from controlfile @@ -160,7 +269,7 @@ fn bootstrap_timeline( tenantid: ZTenantId, tli: ZTimelineId, repo: &dyn Repository, -) -> Result<()> { +) -> Result> { let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered(); let initdb_path = conf.tenant_path(&tenantid).join("tmp"); @@ -192,7 +301,7 @@ fn bootstrap_timeline( // Remove temp dir. 
We don't need it anymore fs::remove_dir_all(pgdata_path)?; - Ok(()) + Ok(timeline) } pub(crate) fn get_timelines( @@ -211,110 +320,86 @@ pub(crate) fn get_timelines( RepositoryTimeline::Remote { .. } => None, }) .map(|(timeline_id, timeline)| { - let (ancestor_id, ancestor_lsn) = match timeline.get_ancestor_timeline_id() { - Some(ancestor_id) => ( - Some(ancestor_id.to_string()), - Some(timeline.get_ancestor_lsn().to_string()), - ), - None => (None, None), - }; - - let current_logical_size_non_incremental = if include_non_incremental_logical_size { - match timeline - .get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) - { - Ok(size) => Some(size), - Err(e) => { - error!( - "Failed to get current logical size for timeline {}: {:?}", - timeline_id, e - ); - None - } - } - } else { - None - }; - - TimelineInfo { + TimelineInfo::from_dyn_timeline( + tenant_id, timeline_id, - latest_valid_lsn: timeline.get_last_record_lsn(), - ancestor_id, - ancestor_lsn, - current_logical_size: timeline.get_current_logical_size(), - // non incremental size calculation can be heavy, so let it be optional - // needed for tests to check size calculation - current_logical_size_non_incremental, - } + timeline.as_ref(), + include_non_incremental_logical_size, + ) }) .collect()) } pub(crate) fn create_timeline( - conf: &PageServerConf, + conf: &'static PageServerConf, tenant_id: ZTenantId, - timeline_id: ZTimelineId, - start_lsn: Option, + new_timeline_id: ZTimelineId, + ancestor_timeline_id: Option, + ancestor_start_lsn: Option, ) -> Result { - if conf.timeline_path(&timeline_id, &tenant_id).exists() { - bail!("timeline {} already exists", timeline_id); + if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { + bail!("timeline {} already exists", new_timeline_id); } let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0)); - let mut startpoint = PointInTime { - timeline_id, - lsn: start_lsn.unwrap_or(Lsn(0)), - }; + match ancestor_timeline_id { + Some(ancestor_timeline_id) => { + let ancestor_timeline = repo + .get_timeline(ancestor_timeline_id) + .with_context(|| format!("Cannot get ancestor timeline {}", ancestor_timeline_id))? + .local_timeline() + .with_context(|| { + format!( + "Cannot branch off the timeline {} that's not present locally", + ancestor_timeline_id + ) + })?; - let timeline = repo - .get_timeline(startpoint.timeline_id)? - .local_timeline() - .context("Cannot branch off the timeline that's not present locally")?; - if startpoint.lsn == Lsn(0) { - // Find end of WAL on the old timeline - let end_of_wal = timeline.get_last_record_lsn(); - info!("branching at end of WAL: {}", end_of_wal); - startpoint.lsn = end_of_wal; - } else { - // Wait for the WAL to arrive and be processed on the parent branch up - // to the requested branch point. The repository code itself doesn't - // require it, but if we start to receive WAL on the new timeline, - // decoding the new WAL might need to look up previous pages, relation - // sizes etc. and that would get confused if the previous page versions - // are not in the repository yet. - timeline.wait_lsn(startpoint.lsn)?; + if start_lsn == Lsn(0) { + // Find end of WAL on the old timeline + let end_of_wal = ancestor_timeline.get_last_record_lsn(); + info!("branching at end of WAL: {}", end_of_wal); + start_lsn = end_of_wal; + } else { + // Wait for the WAL to arrive and be processed on the parent branch up + // to the requested branch point. 
The repository code itself doesn't + // require it, but if we start to receive WAL on the new timeline, + // decoding the new WAL might need to look up previous pages, relation + // sizes etc. and that would get confused if the previous page versions + // are not in the repository yet. + ancestor_timeline.wait_lsn(start_lsn)?; + } + start_lsn = start_lsn.align(); + + let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); + if ancestor_ancestor_lsn > start_lsn { + // can we safely just branch from the ancestor instead? + anyhow::bail!( + "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", + start_lsn, + ancestor_timeline_id, + ancestor_ancestor_lsn, + ); + } + repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?; + // load the timeline into memory + let loaded_timeline = repo.get_timeline(new_timeline_id)?; + Ok(TimelineInfo::from_repo_timeline( + tenant_id, + loaded_timeline, + false, + )) + } + None => { + let new_timeline = bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?; + Ok(TimelineInfo::from_dyn_timeline( + tenant_id, + new_timeline_id, + new_timeline.as_ref(), + false, + )) + } } - startpoint.lsn = startpoint.lsn.align(); - if timeline.get_ancestor_lsn() > startpoint.lsn { - // can we safely just branch from the ancestor instead? - bail!( - "invalid startpoint {} for the timeline {}: less than timeline ancestor lsn {:?}", - startpoint.lsn, - timeline_id, - timeline.get_ancestor_lsn() - ); - } - - let new_timeline_id = ZTimelineId::generate(); - - // Forward entire timeline creation routine to repository - // backend, so it can do all needed initialization - repo.branch_timeline(startpoint.timeline_id, new_timeline_id, startpoint.lsn)?; - - // Remember the human-readable branch name for the new timeline. - // FIXME: there's a race condition, if you create a branch with the same - // name concurrently. 
- // TODO kb timeline creation needs more - let data = new_timeline_id.to_string(); - fs::write(conf.timeline_path(&timeline_id, &tenant_id), data)?; - - Ok(TimelineInfo { - timeline_id: new_timeline_id, - latest_valid_lsn: startpoint.lsn, - ancestor_id: Some(startpoint.timeline_id.to_string()), - ancestor_lsn: Some(startpoint.lsn.to_string()), - current_logical_size: 0, - current_logical_size_non_incremental: Some(0), - }) } diff --git a/test_runner/batch_others/test_auth.py b/test_runner/batch_others/test_auth.py index ee1a09c917..4d1d0847ed 100644 --- a/test_runner/batch_others/test_auth.py +++ b/test_runner/batch_others/test_auth.py @@ -1,8 +1,8 @@ from contextlib import closing from typing import Iterator from uuid import UUID, uuid4 -import psycopg2 from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithPageserverApiException +from requests.exceptions import HTTPError import pytest @@ -26,14 +26,20 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): ps.safe_psql("set FOO", password=management_token) # tenant can create branches - tenant_http_client.branch_create(env.initial_tenant, 'new1', 'main') + tenant_http_client.timeline_create(timeline_id=uuid4(), + tenant_id=env.initial_tenant, + ancestor_timeline_id=env.initial_timeline) # console can create branches for tenant - management_http_client.branch_create(env.initial_tenant, 'new2', 'main') + management_http_client.timeline_create(timeline_id=uuid4(), + tenant_id=env.initial_tenant, + ancestor_timeline_id=env.initial_timeline) # fail to create branch using token with different tenant_id with pytest.raises(ZenithPageserverApiException, match='Forbidden: Tenant id mismatch. Permission denied'): - invalid_tenant_http_client.branch_create(env.initial_tenant, "new3", "main") + invalid_tenant_http_client.timeline_create(timeline_id=uuid4(), + tenant_id=env.initial_tenant, + ancestor_timeline_id=env.initial_timeline) # create tenant using management token management_http_client.tenant_create(uuid4()) @@ -54,9 +60,8 @@ def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_w env = zenith_env_builder.init_start() branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}" - env.zenith_cli.create_branch(branch, "main") - - pg = env.postgres.create_start(branch) + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start(branch, timeline_id=new_timeline_id) with closing(pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 509c46975e..f8ff1741b4 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -22,9 +22,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): env = zenith_env_builder.init_start() # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch("test_branch_behind", "main") - - pgmain = env.postgres.create_start('test_branch_behind') + test_branch_behind_timeline_id = env.zenith_cli.branch_timeline() + pgmain = env.postgres.create_start('test_branch_behind', + timeline_id=test_branch_behind_timeline_id) log.info("postgres is running on 'test_branch_behind' branch") main_pg_conn = pgmain.connect() @@ -60,7 +60,8 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 200100 rows: {lsn_b}') # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch("test_branch_behind_hundred", 
"test_branch_behind@" + lsn_a) + test_branch_behind_hundred_timeline_id = env.zenith_cli.branch_timeline( + ancestor_timeline_id=test_branch_behind_timeline_id, ancestor_start_lsn=lsn_a) # Insert many more rows. This generates enough WAL to fill a few segments. main_cur.execute(''' @@ -75,10 +76,13 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 400100 rows: {lsn_c}') # Branch at the point where only 200100 rows were inserted - env.zenith_cli.create_branch("test_branch_behind_more", "test_branch_behind@" + lsn_b) + test_branch_behind_more_timeline_id = env.zenith_cli.branch_timeline( + ancestor_timeline_id=test_branch_behind_timeline_id, ancestor_start_lsn=lsn_b) - pg_hundred = env.postgres.create_start("test_branch_behind_hundred") - pg_more = env.postgres.create_start("test_branch_behind_more") + pg_hundred = env.postgres.create_start("test_branch_behind_hundred", + timeline_id=test_branch_behind_hundred_timeline_id) + pg_more = env.postgres.create_start("test_branch_behind_more", + timeline_id=test_branch_behind_more_timeline_id) # On the 'hundred' branch, we should see only 100 rows hundred_pg_conn = pg_hundred.connect() @@ -99,19 +103,23 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): # Check bad lsn's for branching # branch at segment boundary - env.zenith_cli.create_branch("test_branch_segment_boundary", "test_branch_behind@0/3000000") - pg = env.postgres.create_start("test_branch_segment_boundary") + test_branch_segment_boundary_timeline_id = env.zenith_cli.branch_timeline( + ancestor_timeline_id=test_branch_behind_timeline_id, ancestor_start_lsn="0/3000000") + pg = env.postgres.create_start("test_branch_segment_boundary", + timeline_id=test_branch_segment_boundary_timeline_id) cur = pg.connect().cursor() cur.execute('SELECT 1') assert cur.fetchone() == (1, ) # branch at pre-initdb lsn with pytest.raises(Exception, match="invalid branch start lsn"): - env.zenith_cli.create_branch("test_branch_preinitdb", "main@0/42") + env.zenith_cli.branch_timeline(ancestor_timeline_id=env.initial_timeline, + ancestor_start_lsn="0/42") # branch at pre-ancestor lsn with pytest.raises(Exception, match="less than timeline ancestor lsn"): - env.zenith_cli.create_branch("test_branch_preinitdb", "test_branch_behind@0/42") + env.zenith_cli.branch_timeline(ancestor_timeline_id=test_branch_behind_timeline_id, + ancestor_start_lsn="0/42") # check that we cannot create branch based on garbage collected data with closing(env.pageserver.connect()) as psconn: @@ -123,7 +131,8 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): with pytest.raises(Exception, match="invalid branch start lsn"): # this gced_lsn is pretty random, so if gc is disabled this woudln't fail - env.zenith_cli.create_branch("test_branch_create_fail", f"test_branch_behind@{gced_lsn}") + env.zenith_cli.branch_timeline(ancestor_timeline_id=test_branch_behind_timeline_id, + ancestor_start_lsn=gced_lsn) # check that after gc everything is still there hundred_cur.execute('SELECT count(*) FROM foo') diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py index 504f455936..9d3927aa84 100644 --- a/test_runner/batch_others/test_clog_truncate.py +++ b/test_runner/batch_others/test_clog_truncate.py @@ -12,7 +12,7 @@ from fixtures.log_helper import log # def test_clog_truncate(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_clog_truncate", "empty") + test_clog_truncate_timeline_id = 
env.zenith_cli.branch_timeline() # set agressive autovacuum to make sure that truncation will happen config = [ @@ -25,7 +25,9 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv): 'autovacuum_freeze_max_age=100000' ] - pg = env.postgres.create_start('test_clog_truncate', config_lines=config) + pg = env.postgres.create_start('test_clog_truncate', + config_lines=config, + timeline_id=test_clog_truncate_timeline_id) log.info('postgres is running on test_clog_truncate branch') # Install extension containing function needed for test @@ -62,10 +64,11 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv): # create new branch after clog truncation and start a compute node on it log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}') - env.zenith_cli.create_branch("test_clog_truncate_new", - "test_clog_truncate@" + lsn_after_truncation) - - pg2 = env.postgres.create_start('test_clog_truncate_new') + test_clog_truncate_new_timeline_id = env.zenith_cli.branch_timeline( + ancestor_timeline_id=test_clog_truncate_timeline_id, + ancestor_start_lsn=lsn_after_truncation) + pg2 = env.postgres.create_start('test_clog_truncate_new', + timeline_id=test_clog_truncate_new_timeline_id) log.info('postgres is running on test_clog_truncate_new branch') # check that new node doesn't contain truncated segment diff --git a/test_runner/batch_others/test_config.py b/test_runner/batch_others/test_config.py index fd2b3b4e99..bd1f8b487f 100644 --- a/test_runner/batch_others/test_config.py +++ b/test_runner/batch_others/test_config.py @@ -9,10 +9,10 @@ from fixtures.log_helper import log # def test_config(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_config", "empty") - - # change config - pg = env.postgres.create_start('test_config', config_lines=['log_min_messages=debug1']) + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_config', + config_lines=['log_min_messages=debug1'], + timeline_id=new_timeline_id) log.info('postgres is running on test_config branch') with closing(pg.connect()) as conn: diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/batch_others/test_createdropdb.py index 38243b298b..e77e1928b8 100644 --- a/test_runner/batch_others/test_createdropdb.py +++ b/test_runner/batch_others/test_createdropdb.py @@ -11,9 +11,9 @@ from fixtures.log_helper import log # def test_createdb(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_createdb", "empty") + test_createdb_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_createdb') + pg = env.postgres.create_start('test_createdb', timeline_id=test_createdb_timeline_id) log.info("postgres is running on 'test_createdb' branch") with closing(pg.connect()) as conn: @@ -27,9 +27,9 @@ def test_createdb(zenith_simple_env: ZenithEnv): lsn = cur.fetchone()[0] # Create a branch - env.zenith_cli.create_branch("test_createdb2", "test_createdb@" + lsn) - - pg2 = env.postgres.create_start('test_createdb2') + test_createdb2_timeline_id = env.zenith_cli.branch_timeline( + ancestor_timeline_id=test_createdb_timeline_id, ancestor_start_lsn=lsn) + pg2 = env.postgres.create_start('test_createdb2', timeline_id=test_createdb2_timeline_id) # Test that you can connect to the new database on both branches for db in (pg, pg2): @@ -41,9 +41,8 @@ def test_createdb(zenith_simple_env: ZenithEnv): # def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir): env = zenith_simple_env - 
env.zenith_cli.create_branch("test_dropdb", "empty") - - pg = env.postgres.create_start('test_dropdb') + test_dropdb_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_dropdb', timeline_id=test_dropdb_timeline_id) log.info("postgres is running on 'test_dropdb' branch") with closing(pg.connect()) as conn: @@ -66,11 +65,15 @@ def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir): lsn_after_drop = cur.fetchone()[0] # Create two branches before and after database drop. - env.zenith_cli.create_branch("test_before_dropdb", "test_dropdb@" + lsn_before_drop) - pg_before = env.postgres.create_start('test_before_dropdb') + test_before_dropdb_timeline_db = env.zenith_cli.branch_timeline( + ancestor_timeline_id=test_dropdb_timeline_id, ancestor_start_lsn=lsn_before_drop) + pg_before = env.postgres.create_start('test_before_dropdb', + timeline_id=test_before_dropdb_timeline_db) - env.zenith_cli.create_branch("test_after_dropdb", "test_dropdb@" + lsn_after_drop) - pg_after = env.postgres.create_start('test_after_dropdb') + test_after_dropdb_timeline_id = env.zenith_cli.branch_timeline( + ancestor_timeline_id=test_dropdb_timeline_id, ancestor_start_lsn=lsn_after_drop) + pg_after = env.postgres.create_start('test_after_dropdb', + timeline_id=test_after_dropdb_timeline_id) # Test that database exists on the branch before drop pg_before.connect(dbname='foodb').close() diff --git a/test_runner/batch_others/test_createuser.py b/test_runner/batch_others/test_createuser.py index 1959b47dcc..8f825a0a1a 100644 --- a/test_runner/batch_others/test_createuser.py +++ b/test_runner/batch_others/test_createuser.py @@ -9,9 +9,8 @@ from fixtures.log_helper import log # def test_createuser(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_createuser", "empty") - - pg = env.postgres.create_start('test_createuser') + test_createuser_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_createuser', timeline_id=test_createuser_timeline_id) log.info("postgres is running on 'test_createuser' branch") with closing(pg.connect()) as conn: @@ -25,9 +24,9 @@ def test_createuser(zenith_simple_env: ZenithEnv): lsn = cur.fetchone()[0] # Create a branch - env.zenith_cli.create_branch("test_createuser2", "test_createuser@" + lsn) - - pg2 = env.postgres.create_start('test_createuser2') + test_createuser2_timeline_id = env.zenith_cli.branch_timeline( + ancestor_timeline_id=test_createuser_timeline_id, ancestor_start_lsn=lsn) + pg2 = env.postgres.create_start('test_createuser2', timeline_id=test_createuser2_timeline_id) # Test that you can connect to new branch as a new user assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )] diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py index 9de6ba9f59..7dd38a5799 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/batch_others/test_gc_aggressive.py @@ -1,7 +1,6 @@ from contextlib import closing import asyncio -import asyncpg import random from fixtures.zenith_fixtures import ZenithEnv, Postgres, Safekeeper @@ -55,8 +54,8 @@ async def update_and_gc(env: ZenithEnv, pg: Postgres, timeline: str): # def test_gc_aggressive(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_gc_aggressive", "empty") - pg = env.postgres.create_start('test_gc_aggressive') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = 
env.postgres.create_start('test_gc_aggressive', timeline_id=new_timeline_id) log.info('postgres is running on test_gc_aggressive branch') conn = pg.connect() diff --git a/test_runner/batch_others/test_multixact.py b/test_runner/batch_others/test_multixact.py index 6a2afd2ede..11f8000226 100644 --- a/test_runner/batch_others/test_multixact.py +++ b/test_runner/batch_others/test_multixact.py @@ -10,8 +10,8 @@ from fixtures.log_helper import log # def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): env = zenith_simple_env - env.zenith_cli.create_branch("test_multixact", "empty") - pg = env.postgres.create_start('test_multixact') + test_multixact_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_multixact', timeline_id=test_multixact_timeline_id) log.info("postgres is running on 'test_multixact' branch") pg_conn = pg.connect() @@ -60,8 +60,10 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): assert int(next_multixact_id) > int(next_multixact_id_old) # Branch at this point - env.zenith_cli.create_branch("test_multixact_new", "test_multixact@" + lsn) - pg_new = env.postgres.create_start('test_multixact_new') + test_multixact_new_timeline_id = env.zenith_cli.branch_timeline( + ancestor_timeline_id=test_multixact_timeline_id, ancestor_start_lsn=lsn) + pg_new = env.postgres.create_start('test_multixact_new', + timeline_id=test_multixact_new_timeline_id) log.info("postgres is running on 'test_multixact_new' branch") pg_new_conn = pg_new.connect() diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py index d09fb24913..f0701dfe4f 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -16,8 +16,8 @@ from fixtures.log_helper import log # def test_old_request_lsn(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_old_request_lsn", "empty") - pg = env.postgres.create_start('test_old_request_lsn') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_old_request_lsn', timeline_id=new_timeline_id) log.info('postgres is running on test_old_request_lsn branch') pg_conn = pg.connect() diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index ba1f106c4b..4c3b98e838 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -26,18 +26,20 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): # check its timelines timelines = client.timeline_list(tenant_id) assert len(timelines) > 0 - for timeline_id_str in timelines: - timeline_details = client.timeline_detail(tenant_id, UUID(timeline_id_str)) + for timeline in timelines: + timeline_id_str = str(timeline['timeline_id']) + timeline_details = client.timeline_detail(tenant_id=tenant_id, + timeline_id=UUID(timeline_id_str)) assert timeline_details['type'] == 'Local' assert timeline_details['tenant_id'] == tenant_id.hex assert timeline_details['timeline_id'] == timeline_id_str - # create branch - branch_name = uuid4().hex - client.branch_create(tenant_id, branch_name, "main") + # create timeline + timeline_id = uuid4() + client.timeline_create(tenant_id=tenant_id, timeline_id=timeline_id) # check it is there - assert branch_name in {b['name'] for b in client.branch_list(tenant_id)} + assert timeline_id.hex in {b['timeline_id'] for b in 
client.timeline_list(tenant_id)} def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): diff --git a/test_runner/batch_others/test_pageserver_catchup.py b/test_runner/batch_others/test_pageserver_catchup.py index 985d1a3af0..ba77a4a321 100644 --- a/test_runner/batch_others/test_pageserver_catchup.py +++ b/test_runner/batch_others/test_pageserver_catchup.py @@ -16,8 +16,9 @@ def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuil zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_pageserver_catchup_while_compute_down", "main") - pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down', + timeline_id=new_timeline_id) pg_conn = pg.connect() cur = pg_conn.cursor() @@ -59,7 +60,8 @@ def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuil env.safekeepers[2].start() # restart compute node - pg.stop_and_destroy().create_start('test_pageserver_catchup_while_compute_down') + pg.stop_and_destroy().create_start('test_pageserver_catchup_while_compute_down', + timeline_id=new_timeline_id) # Ensure that basebackup went correct and pageserver returned all data pg_conn = pg.connect() diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/batch_others/test_pageserver_restart.py index ec93c2cf5b..f1d154408c 100644 --- a/test_runner/batch_others/test_pageserver_restart.py +++ b/test_runner/batch_others/test_pageserver_restart.py @@ -15,8 +15,8 @@ def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_pageserver_restart", "main") - pg = env.postgres.create_start('test_pageserver_restart') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_pageserver_restart', timeline_id=new_timeline_id) pg_conn = pg.connect() cur = pg_conn.cursor() diff --git a/test_runner/batch_others/test_parallel_copy.py b/test_runner/batch_others/test_parallel_copy.py index 6f87bc4a36..8e954a8e51 100644 --- a/test_runner/batch_others/test_parallel_copy.py +++ b/test_runner/batch_others/test_parallel_copy.py @@ -1,7 +1,5 @@ from io import BytesIO import asyncio -import asyncpg -import subprocess from fixtures.zenith_fixtures import ZenithEnv, Postgres from fixtures.log_helper import log @@ -37,8 +35,8 @@ async def parallel_load_same_table(pg: Postgres, n_parallel: int): # Load data into one table with COPY TO from 5 parallel connections def test_parallel_copy(zenith_simple_env: ZenithEnv, n_parallel=5): env = zenith_simple_env - env.zenith_cli.create_branch("test_parallel_copy", "empty") - pg = env.postgres.create_start('test_parallel_copy') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_parallel_copy', timeline_id=new_timeline_id) log.info("postgres is running on 'test_parallel_copy' branch") # Create test table diff --git a/test_runner/batch_others/test_pgbench.py b/test_runner/batch_others/test_pgbench.py index 09713023bc..207f1e1e2c 100644 --- a/test_runner/batch_others/test_pgbench.py +++ b/test_runner/batch_others/test_pgbench.py @@ -4,8 +4,8 @@ from fixtures.log_helper import log def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin): env = zenith_simple_env - env.zenith_cli.create_branch("test_pgbench", "empty") - pg = 
env.postgres.create_start('test_pgbench') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_pgbench', timeline_id=new_timeline_id) log.info("postgres is running on 'test_pgbench' branch") connstr = pg.connstr() diff --git a/test_runner/batch_others/test_readonly_node.py b/test_runner/batch_others/test_readonly_node.py index ba256e71f7..2998ea7528 100644 --- a/test_runner/batch_others/test_readonly_node.py +++ b/test_runner/batch_others/test_readonly_node.py @@ -11,9 +11,9 @@ from fixtures.zenith_fixtures import ZenithEnv # def test_readonly_node(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_readonly_node", "empty") - - pgmain = env.postgres.create_start('test_readonly_node') + test_readonly_node_timeline_id = env.zenith_cli.branch_timeline() + pgmain = env.postgres.create_start('test_readonly_node', + timeline_id=test_readonly_node_timeline_id) log.info("postgres is running on 'test_readonly_node' branch") main_pg_conn = pgmain.connect() @@ -54,11 +54,13 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): # Create first read-only node at the point where only 100 rows were inserted pg_hundred = env.postgres.create_start("test_readonly_node_hundred", - branch=f'test_readonly_node@{lsn_a}') + timeline_id=test_readonly_node_timeline_id, + lsn=lsn_a) # And another at the point where 200100 rows were inserted pg_more = env.postgres.create_start("test_readonly_node_more", - branch=f'test_readonly_node@{lsn_b}') + timeline_id=test_readonly_node_timeline_id, + lsn=lsn_b) # On the 'hundred' node, we should see only 100 rows hundred_pg_conn = pg_hundred.connect() @@ -78,7 +80,8 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): # Check creating a node at segment boundary pg = env.postgres.create_start("test_branch_segment_boundary", - branch="test_readonly_node@0/3000000") + timeline_id=test_readonly_node_timeline_id, + lsn='0/3000000') cur = pg.connect().cursor() cur.execute('SELECT 1') assert cur.fetchone() == (1, ) @@ -87,4 +90,5 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): with pytest.raises(Exception, match="invalid basebackup lsn"): # compute node startup with invalid LSN should fail env.zenith_cli.pg_start("test_readonly_node_preinitdb", - timeline_spec="test_readonly_node@0/42") + timeline_id=test_readonly_node_timeline_id, + lsn="0/42") diff --git a/test_runner/batch_others/test_restart_compute.py b/test_runner/batch_others/test_restart_compute.py index d4dd3fb9e2..baa1f787df 100644 --- a/test_runner/batch_others/test_restart_compute.py +++ b/test_runner/batch_others/test_restart_compute.py @@ -15,9 +15,8 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_restart_compute", "main") - - pg = env.postgres.create_start('test_restart_compute') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_restart_compute', timeline_id=new_timeline_id) log.info("postgres is running on 'test_restart_compute' branch") with closing(pg.connect()) as conn: @@ -30,7 +29,7 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor log.info(f"res = {r}") # Remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute') + pg.stop_and_destroy().create_start('test_restart_compute', timeline_id=new_timeline_id) with closing(pg.connect()) as conn: with conn.cursor() as cur: 
@@ -49,7 +48,7 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor log.info(f"res = {r}") # Again remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute') + pg.stop_and_destroy().create_start('test_restart_compute', timeline_id=new_timeline_id) # That select causes lots of FPI's and increases probability of wakeepers # lagging behind after query completion @@ -63,7 +62,7 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor log.info(f"res = {r}") # And again remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute') + pg.stop_and_destroy().create_start('test_restart_compute', timeline_id=new_timeline_id) with closing(pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/batch_others/test_snapfiles_gc.py b/test_runner/batch_others/test_snapfiles_gc.py index c6d4512bc9..fb02e54be2 100644 --- a/test_runner/batch_others/test_snapfiles_gc.py +++ b/test_runner/batch_others/test_snapfiles_gc.py @@ -14,8 +14,8 @@ from fixtures.log_helper import log # def test_layerfiles_gc(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_layerfiles_gc", "empty") - pg = env.postgres.create_start('test_layerfiles_gc') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_layerfiles_gc', timeline_id=new_timeline_id) with closing(pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/batch_others/test_subxacts.py b/test_runner/batch_others/test_subxacts.py index bed1c4be63..6153bd1fe2 100644 --- a/test_runner/batch_others/test_subxacts.py +++ b/test_runner/batch_others/test_subxacts.py @@ -10,8 +10,8 @@ from fixtures.log_helper import log # CLOG. 
def test_subxacts(zenith_simple_env: ZenithEnv, test_output_dir): env = zenith_simple_env - env.zenith_cli.create_branch("test_subxacts", "empty") - pg = env.postgres.create_start('test_subxacts') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_subxacts', timeline_id=new_timeline_id) log.info("postgres is running on 'test_subxacts' branch") pg_conn = pg.connect() diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index acff3ef62c..429aee8488 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -127,16 +127,14 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, # create folder for remote storage mock remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage' - tenant = env.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) + (tenant, _) = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) log.info("tenant to relocate %s", tenant) - env.zenith_cli.create_branch("test_tenant_relocation", "main", tenant_id=tenant) + new_timeline_id = env.zenith_cli.branch_timeline(tenant_id=tenant) - tenant_pg = env.postgres.create_start( - "test_tenant_relocation", - "main", # branch name, None means same as node name - tenant_id=tenant, - ) + tenant_pg = env.postgres.create_start("test_tenant_relocation", + tenant_id=tenant, + timeline_id=new_timeline_id) # insert some data with closing(tenant_pg.connect()) as conn: diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py index b665ae9022..20a910e9ce 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/batch_others/test_tenants.py @@ -12,25 +12,23 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acce env = zenith_env_builder.init_start() """Tests tenants with and without wal acceptors""" - tenant_1 = env.create_tenant() - tenant_2 = env.create_tenant() + (tenant_1, initial_timeline_1) = env.zenith_cli.create_tenant() + (tenant_2, initial_timeline_2) = env.zenith_cli.create_tenant() - env.zenith_cli.create_branch(f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - "main", - tenant_id=tenant_1) - env.zenith_cli.create_branch(f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - "main", - tenant_id=tenant_2) + new_timeline_tenant_1 = env.zenith_cli.branch_timeline(tenant_id=tenant_1, + ancestor_timeline_id=initial_timeline_1) + new_timeline_tenant_2 = env.zenith_cli.branch_timeline(tenant_id=tenant_2, + ancestor_timeline_id=initial_timeline_2) pg_tenant1 = env.postgres.create_start( f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - None, # branch name, None means same as node name - tenant_1, + tenant_id=tenant_1, + timeline_id=new_timeline_tenant_1, ) pg_tenant2 = env.postgres.create_start( f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - None, # branch name, None means same as node name - tenant_2, + tenant_id=tenant_2, + timeline_id=new_timeline_tenant_2, ) for pg in [pg_tenant1, pg_tenant2]: diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 2c31267922..49143d0000 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -10,13 +10,14 @@ import time def test_timeline_size(zenith_simple_env: ZenithEnv): env = zenith_simple_env # Branch at the point 
where only 100 rows were inserted - env.zenith_cli.create_branch("test_timeline_size", "empty") + new_timeline_id = env.zenith_cli.branch_timeline() client = env.pageserver.http_client() - res = client.branch_detail(env.initial_tenant, "test_timeline_size") + res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) + print(f'@@@@@@@@@@\n{res}\n@@@@@@@@@@@') assert res["current_logical_size"] == res["current_logical_size_non_incremental"] - pgmain = env.postgres.create_start("test_timeline_size") + pgmain = env.postgres.create_start("test_timeline_size", timeline_id=new_timeline_id) log.info("postgres is running on 'test_timeline_size' branch") with closing(pgmain.connect()) as conn: @@ -31,11 +32,11 @@ def test_timeline_size(zenith_simple_env: ZenithEnv): FROM generate_series(1, 10) g """) - res = client.branch_detail(env.initial_tenant, "test_timeline_size") + res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) assert res["current_logical_size"] == res["current_logical_size_non_incremental"] cur.execute("TRUNCATE foo") - res = client.branch_detail(env.initial_tenant, "test_timeline_size") + res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) assert res["current_logical_size"] == res["current_logical_size_non_incremental"] @@ -68,17 +69,17 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_timeline_size_quota", "main") + new_timeline_id = env.zenith_cli.branch_timeline() client = env.pageserver.http_client() - res = client.branch_detail(env.initial_tenant, "test_timeline_size_quota") + res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) assert res["current_logical_size"] == res["current_logical_size_non_incremental"] pgmain = env.postgres.create_start( "test_timeline_size_quota", # Set small limit for the test config_lines=['zenith.max_cluster_size=30MB'], - ) + timeline_id=new_timeline_id) log.info("postgres is running on 'test_timeline_size_quota' branch") with closing(pgmain.connect()) as conn: diff --git a/test_runner/batch_others/test_twophase.py b/test_runner/batch_others/test_twophase.py index d6a1cd01e8..b479e9de22 100644 --- a/test_runner/batch_others/test_twophase.py +++ b/test_runner/batch_others/test_twophase.py @@ -9,9 +9,10 @@ from fixtures.log_helper import log # def test_twophase(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_twophase", "empty") - - pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5']) + test_twophase_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_twophase', + config_lines=['max_prepared_transactions=5'], + timeline_id=test_twophase_timeline_id) log.info("postgres is running on 'test_twophase' branch") conn = pg.connect() @@ -56,12 +57,14 @@ def test_twophase(zenith_simple_env: ZenithEnv): assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - env.zenith_cli.create_branch("test_twophase_prepared", "test_twophase") + test_twophase_prepared_timeline_id = env.zenith_cli.branch_timeline( + ancestor_timeline_id=test_twophase_timeline_id) # Start compute on the new branch pg2 = env.postgres.create_start( 'test_twophase_prepared', 
config_lines=['max_prepared_transactions=5'], + timeline_id=test_twophase_prepared_timeline_id, ) # Check that we restored only needed twophase files diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/batch_others/test_vm_bits.py index 49e48dd450..a657b3e3fd 100644 --- a/test_runner/batch_others/test_vm_bits.py +++ b/test_runner/batch_others/test_vm_bits.py @@ -9,8 +9,8 @@ from fixtures.log_helper import log def test_vm_bit_clear(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_vm_bit_clear", "empty") - pg = env.postgres.create_start('test_vm_bit_clear') + test_vm_bit_clear_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_vm_bit_clear', timeline_id=test_vm_bit_clear_timeline_id) log.info("postgres is running on 'test_vm_bit_clear' branch") pg_conn = pg.connect() @@ -33,7 +33,8 @@ def test_vm_bit_clear(zenith_simple_env: ZenithEnv): cur.execute('UPDATE vmtest_update SET id = 5000 WHERE id = 1') # Branch at this point, to test that later - env.zenith_cli.create_branch("test_vm_bit_clear_new", "test_vm_bit_clear") + test_vm_bit_clear_new_timeline_id = env.zenith_cli.branch_timeline( + ancestor_timeline_id=test_vm_bit_clear_timeline_id) # Clear the buffer cache, to force the VM page to be re-fetched from # the page server @@ -61,7 +62,8 @@ def test_vm_bit_clear(zenith_simple_env: ZenithEnv): # a dirty VM page is evicted. If the VM bit was not correctly cleared by the # earlier WAL record, the full-page image hides the problem. Starting a new # server at the right point-in-time avoids that full-page image. - pg_new = env.postgres.create_start('test_vm_bit_clear_new') + pg_new = env.postgres.create_start('test_vm_bit_clear_new', + timeline_id=test_vm_bit_clear_new_timeline_id) log.info("postgres is running on 'test_vm_bit_clear_new' branch") pg_new_conn = pg_new.connect() diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index c375c9626a..3e39228494 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -24,9 +24,8 @@ def test_normal_work(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_acceptors_normal_work", "main") - - pg = env.postgres.create_start('test_wal_acceptors_normal_work') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_wal_acceptors_normal_work', timeline_id=new_timeline_id) with closing(pg.connect()) as conn: with conn.cursor() as cur: @@ -39,9 +38,9 @@ def test_normal_work(zenith_env_builder: ZenithEnvBuilder): @dataclass -class BranchMetrics: - name: str - latest_valid_lsn: int +class TimelineMetrics: + timeline_id: str + last_record_lsn: int # One entry per each Safekeeper, order is the same flush_lsns: List[int] = field(default_factory=list) commit_lsns: List[int] = field(default_factory=list) @@ -55,21 +54,26 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): n_timelines = 3 - branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)] + branch_names = [ + "test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines) + ] + branch_names_to_timeline_ids = {} # start postgres on each timeline pgs = [] - for branch in branches: - env.zenith_cli.create_branch(branch, "main") - pgs.append(env.postgres.create_start(branch)) + for branch_name in 
branch_names: + new_timeline_id = env.zenith_cli.branch_timeline() + pgs.append(env.postgres.create_start(branch_name, timeline_id=new_timeline_id)) + branch_names_to_timeline_ids[branch_name] = new_timeline_id tenant_id = env.initial_tenant - def collect_metrics(message: str) -> List[BranchMetrics]: + def collect_metrics(message: str) -> List[TimelineMetrics]: with env.pageserver.http_client() as pageserver_http: - branch_details = [ - pageserver_http.branch_detail(tenant_id=tenant_id, name=branch) - for branch in branches + timeline_details = [ + pageserver_http.timeline_detail( + tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name]) + for branch_name in branch_names ] # All changes visible to pageserver (latest_valid_lsn) should be # confirmed by safekeepers first. As we cannot atomically get @@ -80,14 +84,15 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): # safekeepers' state, it will look contradictory. sk_metrics = [sk.http_client().get_metrics() for sk in env.safekeepers] - branch_metrics = [] + timeline_metrics = [] with env.pageserver.http_client() as pageserver_http: - for branch_detail in branch_details: - timeline_id: str = branch_detail["timeline_id"] + for timeline_detail in timeline_details: + print(f"@@@@@@@@@@@\n{timeline_detail}\n@@@@@@@@@@@") + timeline_id: str = timeline_detail["timeline_id"] - m = BranchMetrics( - name=branch_detail["name"], - latest_valid_lsn=branch_detail["latest_valid_lsn"], + m = TimelineMetrics( + timeline_id=timeline_id, + last_record_lsn=timeline_detail["last_record_lsn"], ) for sk_m in sk_metrics: m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)]) @@ -99,13 +104,13 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): # We only call collect_metrics() after a transaction is confirmed by # the compute node, which only happens after a consensus of safekeepers # has confirmed the transaction. We assume majority consensus here. - assert (2 * sum(m.latest_valid_lsn <= lsn + assert (2 * sum(m.last_record_lsn <= lsn for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers) - assert (2 * sum(m.latest_valid_lsn <= lsn + assert (2 * sum(m.last_record_lsn <= lsn for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers) - branch_metrics.append(m) - log.info(f"{message}: {branch_metrics}") - return branch_metrics + timeline_metrics.append(m) + log.info(f"{message}: {timeline_metrics}") + return timeline_metrics # TODO: https://github.com/zenithdb/zenith/issues/809 # collect_metrics("before CREATE TABLE") @@ -117,7 +122,7 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): pg.safe_psql("CREATE TABLE t(key int primary key, value text)") init_m = collect_metrics("after CREATE TABLE") - # Populate data for 2/3 branches + # Populate data for 2/3 timelines class MetricsChecker(threading.Thread): def __init__(self) -> None: super().__init__(daemon=True) @@ -155,15 +160,15 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): collect_metrics("after INSERT INTO") - # Check data for 2/3 branches + # Check data for 2/3 timelines for pg in pgs[:-1]: res = pg.safe_psql("SELECT sum(key) FROM t") assert res[0] == (5000050000, ) final_m = collect_metrics("after SELECT") - # Assume that LSNs (a) behave similarly in all branches; and (b) INSERT INTO alters LSN significantly. + # Assume that LSNs (a) behave similarly in all timelines; and (b) INSERT INTO alters LSN significantly. # Also assume that safekeepers will not be significantly out of sync in this test. 
- middle_lsn = (init_m[0].latest_valid_lsn + final_m[0].latest_valid_lsn) // 2 + middle_lsn = (init_m[0].last_record_lsn + final_m[0].last_record_lsn) // 2 assert max(init_m[0].flush_lsns) < middle_lsn < min(final_m[0].flush_lsns) assert max(init_m[0].commit_lsns) < middle_lsn < min(final_m[0].commit_lsns) assert max(init_m[1].flush_lsns) < middle_lsn < min(final_m[1].flush_lsns) @@ -183,8 +188,8 @@ def test_restarts(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = n_acceptors env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_acceptors_restarts", "main") - pg = env.postgres.create_start('test_wal_acceptors_restarts') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_wal_acceptors_restarts', timeline_id=new_timeline_id) # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -220,8 +225,8 @@ def test_unavailability(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 2 env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_acceptors_unavailability", "main") - pg = env.postgres.create_start('test_wal_acceptors_unavailability') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_wal_acceptors_unavailability', timeline_id=new_timeline_id) # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -291,8 +296,9 @@ def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value): zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_acceptors_race_conditions", "main") - pg = env.postgres.create_start('test_wal_acceptors_race_conditions') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_wal_acceptors_race_conditions', + timeline_id=new_timeline_id) # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -456,8 +462,8 @@ def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_timeline_status", "main") - pg = env.postgres.create_start('test_timeline_status') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_timeline_status', timeline_id=new_timeline_id) wa = env.safekeepers[0] wa_http_cli = wa.http_client() @@ -630,12 +636,12 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 4 env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_replace_safekeeper", "main") + new_timeline_id = env.zenith_cli.branch_timeline() log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() active_safekeepers = [1, 2, 3] - pg = env.postgres.create('test_replace_safekeeper') + pg = env.postgres.create('test_replace_safekeeper', timeline_id=new_timeline_id) pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers)) pg.start() @@ -673,7 +679,7 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) log.info("Recreate postgres to replace failed sk1 with new sk4") - pg.stop_and_destroy().create('test_replace_safekeeper') + pg.stop_and_destroy().create('test_replace_safekeeper', timeline_id=uuid.UUID(timeline_id)) active_safekeepers = [2, 3, 4] env.safekeepers[3].start() 
pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers)) diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index 4b6a27f73d..719e8c163f 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -202,8 +202,9 @@ def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_acceptors_restarts_under_load", "main") - pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load') + new_timeline_id = env.zenith_cli.branch_timeline() + pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load', + timeline_id=new_timeline_id) asyncio.run(run_restarts_under_load(pg, env.safekeepers)) diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_zenith_cli.py index f1897e4b6f..4f089d4354 100644 --- a/test_runner/batch_others/test_zenith_cli.py +++ b/test_runner/batch_others/test_zenith_cli.py @@ -7,52 +7,47 @@ from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserv from typing import cast -def helper_compare_branch_list(pageserver_http_client: ZenithPageserverHttpClient, - env: ZenithEnv, - initial_tenant: uuid.UUID): +def helper_compare_timeline_list(pageserver_http_client: ZenithPageserverHttpClient, + env: ZenithEnv, + initial_tenant: uuid.UUID): """ - Compare branches list returned by CLI and directly via API. - Filters out branches created by other tests. + Compare timelines list returned by CLI and directly via API. + Filters out timelines created by other tests. """ - branches = pageserver_http_client.branch_list(initial_tenant) - branches_api = sorted(map(lambda b: cast(str, b['name']), branches)) - branches_api = [b for b in branches_api if b.startswith('test_cli_') or b in ('empty', 'main')] - res = env.zenith_cli.list_branches() - branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n"))) - branches_cli = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')] - - res = env.zenith_cli.list_branches(tenant_id=initial_tenant) - branches_cli_with_tenant_arg = sorted( - map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n"))) - branches_cli_with_tenant_arg = [ - b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main') + timelines_cli = env.zenith_cli.list_timelines() + timelines_cli = [ + b for b in timelines_cli if b.startswith('test_cli_') or b in ('empty', 'main') ] - assert branches_api == branches_cli == branches_cli_with_tenant_arg + timelines_cli_with_tenant_arg = env.zenith_cli.list_timelines(initial_tenant) + timelines_cli_with_tenant_arg = [ + b for b in timelines_cli if b.startswith('test_cli_') or b in ('empty', 'main') + ] + + assert timelines_cli == timelines_cli_with_tenant_arg -def test_cli_branch_list(zenith_simple_env: ZenithEnv): +def test_cli_timeline_list(zenith_simple_env: ZenithEnv): env = zenith_simple_env pageserver_http_client = env.pageserver.http_client() # Initial sanity check - helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant) - env.zenith_cli.create_branch("test_cli_branch_list_main", "empty") - helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant) + helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) + + # Create a branch for us + main_timeline_id = 
env.zenith_cli.branch_timeline() + helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a nested branch - res = env.zenith_cli.create_branch("test_cli_branch_list_nested", "test_cli_branch_list_main") - assert res.stderr == '' - helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant) + nested_timeline_id = env.zenith_cli.branch_timeline(ancestor_timeline_id=main_timeline_id) + helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Check that all new branches are visible via CLI - res = env.zenith_cli.list_branches() - assert res.stderr == '' - branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n"))) + timelines_cli = env.zenith_cli.list_timelines() - assert 'test_cli_branch_list_main' in branches_cli - assert 'test_cli_branch_list_nested' in branches_cli + assert main_timeline_id.hex in timelines_cli + assert nested_timeline_id.hex in timelines_cli def helper_compare_tenant_list(pageserver_http_client: ZenithPageserverHttpClient, env: ZenithEnv): @@ -60,7 +55,6 @@ def helper_compare_tenant_list(pageserver_http_client: ZenithPageserverHttpClien tenants_api = sorted(map(lambda t: cast(str, t['id']), tenants)) res = env.zenith_cli.list_tenants() - assert res.stderr == '' tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) assert tenants_api == tenants_cli @@ -74,14 +68,14 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv): # Create new tenant tenant1 = uuid.uuid4() - env.zenith_cli.create_tenant(tenant1) + env.zenith_cli.create_tenant(tenant_id=tenant1) # check tenant1 appeared helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant tenant2 = uuid.uuid4() - env.zenith_cli.create_tenant(tenant2) + env.zenith_cli.create_tenant(tenant_id=tenant2) # check tenant2 appeared helper_compare_tenant_list(pageserver_http_client, env) diff --git a/test_runner/batch_pg_regress/test_isolation.py b/test_runner/batch_pg_regress/test_isolation.py index ddafc3815b..8dce020dc0 100644 --- a/test_runner/batch_pg_regress/test_isolation.py +++ b/test_runner/batch_pg_regress/test_isolation.py @@ -7,10 +7,12 @@ from fixtures.zenith_fixtures import ZenithEnv, base_dir, pg_distrib_dir def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys): env = zenith_simple_env - env.zenith_cli.create_branch("test_isolation", "empty") + new_timeline_id = env.zenith_cli.branch_timeline() # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them - pg = env.postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100']) + pg = env.postgres.create_start('test_isolation', + config_lines=['max_prepared_transactions=100'], + timeline_id=new_timeline_id) pg.safe_psql('CREATE DATABASE isolation_regression') # Create some local directories for pg_isolation_regress to run in. 
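
Illustrative sketch, not part of the patch: the fixture flow the tests in this series are migrated to, shown only as a reading aid for the diffs. It uses names that appear in these diffs (a ZenithEnv from the zenith_simple_env fixture, ZenithCli.branch_timeline, PostgresFactory.create_start); the node name is a made-up example.

# Assumes `env` is a ZenithEnv provided by the zenith_simple_env fixture.
from uuid import UUID

def start_compute_on_new_timeline(env, node_name: str = 'example_node'):
    # Branch from the tenant's initial timeline; the CLI wrapper runs
    # `zenith timeline branch`, parses the created timeline id out of its
    # stdout and returns it as a UUID.
    new_timeline_id: UUID = env.zenith_cli.branch_timeline()

    # Compute nodes are now addressed by an explicit timeline id (optionally
    # plus an `lsn=` kwarg) instead of a "branch@lsn" spec string.
    return env.postgres.create_start(node_name, timeline_id=new_timeline_id)
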
diff --git a/test_runner/batch_pg_regress/test_pg_regress.py b/test_runner/batch_pg_regress/test_pg_regress.py index 5199f65216..efeb63fce3 100644 --- a/test_runner/batch_pg_regress/test_pg_regress.py +++ b/test_runner/batch_pg_regress/test_pg_regress.py @@ -7,9 +7,9 @@ from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content, def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin, capsys): env = zenith_simple_env - env.zenith_cli.create_branch("test_pg_regress", "empty") + new_timeline_id = env.zenith_cli.branch_timeline() # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start('test_pg_regress') + pg = env.postgres.create_start('test_pg_regress', timeline_id=new_timeline_id) pg.safe_psql('CREATE DATABASE regression') # Create some local directories for pg_regress to run in. diff --git a/test_runner/batch_pg_regress/test_zenith_regress.py b/test_runner/batch_pg_regress/test_zenith_regress.py index 31d5b07093..2ccbafccfd 100644 --- a/test_runner/batch_pg_regress/test_zenith_regress.py +++ b/test_runner/batch_pg_regress/test_zenith_regress.py @@ -11,9 +11,9 @@ from fixtures.log_helper import log def test_zenith_regress(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys): env = zenith_simple_env - env.zenith_cli.create_branch("test_zenith_regress", "empty") + new_timeline_id = env.zenith_cli.branch_timeline() # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start('test_zenith_regress') + pg = env.postgres.create_start('test_zenith_regress', timeline_id=new_timeline_id) pg.safe_psql('CREATE DATABASE regression') # Create some local directories for pg_regress to run in. diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 570c787184..66b9fe54ea 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -64,9 +64,8 @@ class ZenithCompare(PgCompare): self._pg_bin = pg_bin # We only use one branch and one timeline - self.branch = branch_name - self.env.zenith_cli.create_branch(self.branch, "empty") - self._pg = self.env.postgres.create_start(self.branch) + timeline_id = self.env.zenith_cli.branch_timeline() + self._pg = self.env.postgres.create_start("branch", timeline_id=timeline_id) self.timeline = self.pg.safe_psql("SHOW zenith.zenith_timeline")[0][0] # Long-lived cursor, useful for flushing diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 06f75aa604..7c4d178a3f 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -548,8 +548,7 @@ class ZenithEnv: self.s3_mock_server = config.s3_mock_server self.zenith_cli = ZenithCli(env=self) - self.postgres = PostgresFactory(self) - + self.zenith_cli = ZenithCli(env=self) self.safekeepers: List[Safekeeper] = [] # generate initial tenant ID here instead of letting 'zenith init' generate it, @@ -558,7 +557,7 @@ class ZenithEnv: # Create a config file corresponding to the options toml = textwrap.dedent(f""" - default_tenantid = '{self.initial_tenant.hex}' + default_tenant_id = '{self.initial_tenant.hex}' """) # Create config for pageserver @@ -600,8 +599,9 @@ class ZenithEnv: self.safekeepers.append(safekeeper) log.info(f"Config: {toml}") - - self.zenith_cli.init(toml) + # TODO kb is this a wrong concept? 
will break for multiple tenant tests + self.initial_timeline = self.zenith_cli.init(toml) + self.postgres = PostgresFactory(self) def start(self): # Start up the page server and all the safekeepers @@ -614,12 +614,6 @@ class ZenithEnv: """ Get list of safekeeper endpoints suitable for wal_acceptors GUC """ return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers]) - def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: - if tenant_id is None: - tenant_id = uuid.uuid4() - self.zenith_cli.create_tenant(tenant_id) - return tenant_id - @cached_property def auth_keys(self) -> AuthKeys: pub = (Path(self.repo_dir) / 'auth_public_key.pem').read_bytes() @@ -643,14 +637,7 @@ def _shared_simple_env(request: Any, port_distributor) -> Iterator[ZenithEnv]: shutil.rmtree(repo_dir, ignore_errors=True) with ZenithEnvBuilder(Path(repo_dir), port_distributor) as builder: - - env = builder.init_start() - - # For convenience in tests, create a branch from the freshly-initialized cluster. - env.zenith_cli.create_branch("empty", "main") - - # Return the builder to the caller - yield env + yield builder.init_start() @pytest.fixture(scope='function') @@ -729,34 +716,27 @@ class ZenithPageserverHttpClient(requests.Session): f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/detach", ) self.verbose_error(res) - def branch_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def branch_create(self, tenant_id: uuid.UUID, name: str, start_point: str) -> Dict[Any, Any]: - res = self.post(f"http://localhost:{self.port}/v1/branch", + def timeline_create(self, + tenant_id: uuid.UUID, + timeline_id: uuid.UUID, + start_lsn: Optional[str] = None, + ancestor_timeline_id: Optional[uuid.UUID] = None) -> Dict[Any, Any]: + res = self.post(f"http://localhost:{self.port}/v1/timeline", json={ - 'tenant_id': tenant_id.hex, - 'name': name, - 'start_point': start_point, + 'tenant_id': + tenant_id.hex, + 'timeline_id': + timeline_id.hex, + 'start_lsn': + start_lsn, + 'ancestor_timeline_id': + ancestor_timeline_id.hex if ancestor_timeline_id else None, }) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) return res_json - def branch_detail(self, tenant_id: uuid.UUID, name: str) -> Dict[Any, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}/{name}?include-non-incremental-logical-size=1", - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - def tenant_list(self) -> List[Dict[Any, Any]]: res = self.get(f"http://localhost:{self.port}/v1/tenant") self.verbose_error(res) @@ -774,7 +754,7 @@ class ZenithPageserverHttpClient(requests.Session): self.verbose_error(res) return res.json() - def timeline_list(self, tenant_id: uuid.UUID) -> List[str]: + def timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]: res = self.get(f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}") self.verbose_error(res) res_json = res.json() @@ -783,7 +763,8 @@ class ZenithPageserverHttpClient(requests.Session): def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: res = self.get( - f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}") + 
f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}?include-non-incremental-logical-size=1" + ) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) @@ -827,34 +808,76 @@ class ZenithCli: self.env = env pass - def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: + def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> tuple[uuid.UUID, uuid.UUID]: + """ + Creates a new tenant, returns its id and its initial timeline's id. + """ if tenant_id is None: tenant_id = uuid.uuid4() - self.raw_cli(['tenant', 'create', tenant_id.hex]) - return tenant_id + res = self.raw_cli(['tenant', 'create', '--tenant-id', tenant_id.hex]) + + initial_timeline_id_extractor = re.compile(r"initial timeline: '(?P[^']+)'", + re.MULTILINE) + matches = initial_timeline_id_extractor.search(res.stdout) + + created_timeline_id = None + if matches is not None: + created_timeline_id = matches.group('timeline_id') + + if created_timeline_id is None: + raise Exception('could not find timeline id after `zenith tenant create` invocation') + else: + return (tenant_id, uuid.UUID(created_timeline_id)) def list_tenants(self) -> 'subprocess.CompletedProcess[str]': - return self.raw_cli(['tenant', 'list']) + res = self.raw_cli(['tenant', 'list']) + res.check_returncode() + return res - def create_branch(self, - branch_name: str, - starting_point: str, - tenant_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]': - args = ['branch'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) - args.extend([branch_name, starting_point]) + def branch_timeline(self, + tenant_id: Optional[uuid.UUID] = None, + new_timeline_id: Optional[uuid.UUID] = None, + ancestor_timeline_id: Optional[uuid.UUID] = None, + ancestor_start_lsn: Optional[str] = None) -> uuid.UUID: + cmd = [ + 'timeline', + 'branch', + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + '--ancestor-timeline-id', + (ancestor_timeline_id or self.env.initial_timeline).hex, + ] + if ancestor_start_lsn is not None: + cmd.extend(['--ancestor-start-lsn', ancestor_start_lsn]) + if new_timeline_id is not None: + cmd.extend(['--timeline-id', new_timeline_id.hex]) - return self.raw_cli(args) + completed_process = self.raw_cli(cmd) + completed_process.check_returncode() + create_timeline_id_extractor = re.compile(r"^Created timeline '(?P[^']+)'", + re.MULTILINE) + matches = create_timeline_id_extractor.search(completed_process.stdout) - def list_branches(self, - tenant_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]': - args = ['branch'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) - return self.raw_cli(args) + created_timeline_id = None + if matches is not None: + created_timeline_id = matches.group('timeline_id') + + if created_timeline_id is None: + raise Exception('could not find timeline id after `zenith timeline create` invocation') + else: + return uuid.UUID(created_timeline_id) + + def list_timelines(self, tenant_id: Optional[uuid.UUID] = None) -> List[str]: + res = self.raw_cli( + ['timeline', 'list', '--tenant-id', (tenant_id or self.env.initial_tenant).hex]) + branches_cli = sorted( + map(lambda b: b.split(') ')[-1].strip().split(':')[-1].strip(), + res.stdout.strip().split("\n"))) + return branches_cli + + def init(self, config_toml: str) -> uuid.UUID: + initial_timeline = None - def init(self, config_toml: str) -> 'subprocess.CompletedProcess[str]': with tempfile.NamedTemporaryFile(mode='w+') as tmp: 
tmp.write(config_toml) tmp.flush() @@ -864,7 +887,18 @@ class ZenithCli: self.env.pageserver.remote_storage, self.env.pageserver.config_override) - return self.raw_cli(cmd) + completed_process = self.raw_cli(cmd) + completed_process.check_returncode() + init_timeline_id_extractor = re.compile( + r'^created initial timeline (?P[^\s]+)\s', re.MULTILINE) + matches = init_timeline_id_extractor.search(completed_process.stdout) + if matches is not None: + initial_timeline = matches.group('timeline_id') + + if initial_timeline is None: + raise Exception('could not find timeline id after `zenith init` invocation') + else: + return uuid.UUID(initial_timeline) def pageserver_start(self, overrides=()) -> 'subprocess.CompletedProcess[str]': start_args = ['pageserver', 'start', *overrides] @@ -898,36 +932,50 @@ class ZenithCli: self, node_name: str, tenant_id: Optional[uuid.UUID] = None, - timeline_spec: Optional[str] = None, + timeline_id: Optional[uuid.UUID] = None, + lsn: Optional[str] = None, port: Optional[int] = None, ) -> 'subprocess.CompletedProcess[str]': - args = ['pg', 'create'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) + args = [ + 'pg', + 'create', + '--tenant-id', (tenant_id or self.env.initial_tenant).hex, + '--timeline-id', (timeline_id or self.env.initial_timeline).hex + ] + if lsn is not None: + args.append(f'--lsn={lsn}') if port is not None: args.append(f'--port={port}') args.append(node_name) - if timeline_spec is not None: - args.append(timeline_spec) - return self.raw_cli(args) + res = self.raw_cli(args) + res.check_returncode() + return res def pg_start( self, node_name: str, tenant_id: Optional[uuid.UUID] = None, - timeline_spec: Optional[str] = None, + timeline_id: Optional[uuid.UUID] = None, + lsn: Optional[str] = None, port: Optional[int] = None, ) -> 'subprocess.CompletedProcess[str]': - args = ['pg', 'start'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) + args = [ + 'pg', + 'start', + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + '--timeline-id', + (timeline_id or self.env.initial_timeline).hex, + ] + if lsn is not None: + args.append(f'--lsn={lsn}') if port is not None: args.append(f'--port={port}') args.append(node_name) - if timeline_spec is not None: - args.append(timeline_spec) - return self.raw_cli(args) + res = self.raw_cli(args) + res.check_returncode() + return res def pg_stop( self, @@ -935,9 +983,7 @@ class ZenithCli: tenant_id: Optional[uuid.UUID] = None, destroy=False, ) -> 'subprocess.CompletedProcess[str]': - args = ['pg', 'stop'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) + args = ['pg', 'stop', f'--tenant-id={(tenant_id or self.env.initial_tenant).hex}'] if destroy: args.append('--destroy') args.append(node_name) @@ -1044,7 +1090,6 @@ class ZenithPageserver(PgProtocol): if self.running: self.env.zenith_cli.pageserver_stop(immediate) self.running = False - return self def __enter__(self): @@ -1261,7 +1306,8 @@ class Postgres(PgProtocol): def create( self, node_name: str, - branch: Optional[str] = None, + timeline_id: uuid.UUID, + lsn: Optional[str] = None, config_lines: Optional[List[str]] = None, ) -> 'Postgres': """ @@ -1272,13 +1318,11 @@ class Postgres(PgProtocol): if not config_lines: config_lines = [] - if branch is None: - branch = node_name - self.env.zenith_cli.pg_create(node_name, + timeline_id=timeline_id, tenant_id=self.tenant_id, - port=self.port, - timeline_spec=branch) + lsn=lsn, + port=self.port) self.node_name = node_name path = 
pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name self.pgdata_dir = os.path.join(self.env.repo_dir, path) @@ -1375,7 +1419,7 @@ class Postgres(PgProtocol): if self.running: assert self.node_name is not None - self.env.zenith_cli.pg_stop(self.node_name, tenant_id=self.tenant_id) + self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id) self.running = False return self @@ -1387,7 +1431,7 @@ class Postgres(PgProtocol): """ assert self.node_name is not None - self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, destroy=True) + self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, True) self.node_name = None return self @@ -1395,7 +1439,8 @@ class Postgres(PgProtocol): def create_start( self, node_name: str, - branch: Optional[str] = None, + timeline_id: uuid.UUID, + lsn: Optional[str] = None, config_lines: Optional[List[str]] = None, ) -> 'Postgres': """ @@ -1406,8 +1451,9 @@ class Postgres(PgProtocol): self.create( node_name=node_name, - branch=branch, + timeline_id=timeline_id, config_lines=config_lines, + lsn=lsn, ).start() return self @@ -1428,8 +1474,9 @@ class PostgresFactory: def create_start(self, node_name: str = "main", - branch: Optional[str] = None, tenant_id: Optional[uuid.UUID] = None, + timeline_id: Optional[uuid.UUID] = None, + lsn: Optional[str] = None, config_lines: Optional[List[str]] = None) -> Postgres: pg = Postgres( @@ -1442,14 +1489,16 @@ class PostgresFactory: return pg.create_start( node_name=node_name, - branch=branch, + timeline_id=timeline_id or self.env.initial_timeline, config_lines=config_lines, + lsn=lsn, ) def create(self, node_name: str = "main", - branch: Optional[str] = None, tenant_id: Optional[uuid.UUID] = None, + timeline_id: Optional[uuid.UUID] = None, + lsn: Optional[str] = None, config_lines: Optional[List[str]] = None) -> Postgres: pg = Postgres( @@ -1463,7 +1512,8 @@ class PostgresFactory: return pg.create( node_name=node_name, - branch=branch, + timeline_id=timeline_id or self.env.initial_timeline, + lsn=lsn, config_lines=config_lines, ) @@ -1683,8 +1733,7 @@ def list_files_to_compare(pgdata_dir: str): # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Postgres): - - # Get the timeline ID of our branch. We need it for the 'basebackup' command + # Get the timeline ID. We need it for the 'basebackup' command with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute("SHOW zenith.zenith_timeline") diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index 0247385211..dda31ba692 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -30,11 +30,9 @@ def test_bulk_tenant_create( for i in range(tenants_count): start = timeit.default_timer() - tenant = env.create_tenant() - env.zenith_cli.create_branch( - f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", - "main", - tenant_id=tenant) + (tenant, tenant_initial_timeline_id) = env.zenith_cli.create_tenant() + new_timeline_id = env.zenith_cli.branch_timeline( + tenant_id=tenant, ancestor_timeline_id=tenant_initial_timeline_id) # FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now? 
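Taken together, the updated fixtures change the canonical test setup at this point in the series: create a tenant (which also yields its initial timeline id), branch a new timeline off it, then start a compute node pinned to that timeline. A rough sketch of that flow as a test, assuming the usual zenith_simple_env fixture provides the environment; the test and node names are illustrative only.

    from fixtures.zenith_fixtures import ZenithEnv

    def test_branch_and_start_sketch(zenith_simple_env: ZenithEnv):
        env = zenith_simple_env

        # Create a tenant; the CLI also creates and reports its initial timeline.
        (tenant_id, initial_timeline_id) = env.zenith_cli.create_tenant()

        # Branch a new timeline off the tenant's initial timeline.
        new_timeline_id = env.zenith_cli.branch_timeline(
            tenant_id=tenant_id, ancestor_timeline_id=initial_timeline_id)

        # Start a compute node on the new timeline and run a trivial query.
        pg = env.postgres.create_start('sketch_node',
                                       tenant_id=tenant_id,
                                       timeline_id=new_timeline_id)
        assert pg.safe_psql('SELECT 1') == [(1, )]
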
#if use_wal_acceptors == 'with_wa': @@ -42,9 +40,8 @@ def test_bulk_tenant_create( pg_tenant = env.postgres.create_start( f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", - None, # branch name, None means same as node name tenant, - ) + timeline_id=new_timeline_id) end = timeit.default_timer() time_slices.append(end - start) diff --git a/test_runner/performance/test_parallel_copy_to.py b/test_runner/performance/test_parallel_copy_to.py index e4388ce8e2..0ee0a37ebb 100644 --- a/test_runner/performance/test_parallel_copy_to.py +++ b/test_runner/performance/test_parallel_copy_to.py @@ -1,6 +1,5 @@ from io import BytesIO import asyncio -import asyncpg from fixtures.zenith_fixtures import ZenithEnv, Postgres, PgProtocol from fixtures.log_helper import log from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py index 56c735e87c..994544666b 100644 --- a/test_runner/test_broken.py +++ b/test_runner/test_broken.py @@ -21,8 +21,8 @@ run_broken = pytest.mark.skipif(os.environ.get('RUN_BROKEN') is None, def test_broken(zenith_simple_env: ZenithEnv, pg_bin): env = zenith_simple_env - env.zenith_cli.create_branch("test_broken", "empty") - env.postgres.create_start("test_broken") + new_timeline_id = env.zenith_cli.branch_timeline() + env.postgres.create_start("test_broken", timeline_id=new_timeline_id) log.info('postgres is running') log.info('THIS NEXT COMMAND WILL FAIL:') diff --git a/zenith/src/main.rs b/zenith/src/main.rs index 7170653754..dcfeb63309 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -9,7 +9,7 @@ use pageserver::config::defaults::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, }; -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::process::exit; use std::str::FromStr; use walkeeper::defaults::{ @@ -60,7 +60,7 @@ struct TimelineTreeEl { /// `TimelineInfo` received from the `pageserver` via the `timeline_list` libpq API call. pub info: TimelineInfo, /// Holds all direct children of this timeline referenced using `timeline_id`. - pub children: Vec, + pub children: BTreeSet, } // Main entry point for the 'zenith' CLI utility @@ -71,25 +71,18 @@ struct TimelineTreeEl { // * Providing CLI api to the pageserver // * TODO: export/import to/from usual postgres fn main() -> Result<()> { - #[rustfmt::skip] // rustfmt squashes these into a single line otherwise - let pg_node_arg = Arg::new("node") - .index(1) - .help("Node name") - .required(true); + let pg_node_arg = Arg::new("node").help("Node name").required(true); - #[rustfmt::skip] - let safekeeper_id_arg = Arg::new("id") - .index(1) - .help("safekeeper id") + let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false); + + let timeline_id_arg = Arg::new("timeline-id") + .long("timeline-id") + .help("Timeline id. Represented as a hexadecimal string 32 symbols length") + .takes_value(true) .required(false); - let timeline_arg = Arg::new("timeline") - .index(2) - .help("Timeline id or a point-in time specification") - .required(false); - - let tenantid_arg = Arg::new("tenantid") - .long("tenantid") + let tenant_id_arg = Arg::new("tenant-id") + .long("tenant-id") .help("Tenant id. 
Represented as a hexadecimal string 32 symbols length") .takes_value(true) .required(false); @@ -115,6 +108,12 @@ fn main() -> Result<()> { .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more") .required(false); + let lsn_arg = Arg::new("lsn") + .long("lsn") + .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.") + .takes_value(true) + .required(false); + let matches = App::new("Zenith CLI") .setting(AppSettings::ArgRequiredElseHelp) .version(GIT_VERSION) @@ -131,16 +130,28 @@ fn main() -> Result<()> { ) .subcommand( App::new("timeline") - .about("Create a new timeline") - .arg(Arg::new("timeline-name").required(false).index(1)) - .arg(Arg::new("start-point").required(false).index(2)) - .arg(tenantid_arg.clone()), + .about("Manage timelines") + .subcommand(App::new("list") + .about("List all timelines, available to this pageserver") + .arg(tenant_id_arg.clone())) + .subcommand(App::new("branch") + .about("Create a new timeline, using another timeline as a base, copying its data") + .arg(tenant_id_arg.clone()) + .arg(timeline_id_arg.clone().help("Id of the new timeline, optional. If not specified, it will be generated randomly")) + .arg(Arg::new("ancestor-timeline-id").long("ancestor-timeline-id").takes_value(true) + .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline").required(false)) + .arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn").takes_value(true) + .help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false))) + .subcommand(App::new("create") + .about("Create a new blank timeline") + .arg(tenant_id_arg.clone()) + .arg(timeline_id_arg.clone().help("Id of the new timeline, optional. 
If not specified, it will be generated randomly"))) ).subcommand( App::new("tenant") .setting(AppSettings::ArgRequiredElseHelp) .about("Manage tenants") .subcommand(App::new("list")) - .subcommand(App::new("create").arg(Arg::new("tenantid").required(false).index(1))) + .subcommand(App::new("create").arg(tenant_id_arg.clone())) ) .subcommand( App::new("pageserver") @@ -175,12 +186,13 @@ fn main() -> Result<()> { App::new("pg") .setting(AppSettings::ArgRequiredElseHelp) .about("Manage postgres instances") - .subcommand(App::new("list").arg(tenantid_arg.clone())) + .subcommand(App::new("list").arg(tenant_id_arg.clone())) .subcommand(App::new("create") .about("Create a postgres compute node") .arg(pg_node_arg.clone()) - .arg(timeline_arg.clone()) - .arg(tenantid_arg.clone()) + .arg(timeline_id_arg.clone()) + .arg(tenant_id_arg.clone()) + .arg(lsn_arg.clone()) .arg(port_arg.clone()) .arg( Arg::new("config-only") @@ -191,14 +203,14 @@ fn main() -> Result<()> { .subcommand(App::new("start") .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") .arg(pg_node_arg.clone()) - .arg(timeline_arg.clone()) - .arg(tenantid_arg.clone()) + .arg(timeline_id_arg.clone()) + .arg(tenant_id_arg.clone()) + .arg(lsn_arg.clone()) .arg(port_arg.clone())) .subcommand( App::new("stop") .arg(pg_node_arg.clone()) - .arg(timeline_arg.clone()) - .arg(tenantid_arg.clone()) + .arg(tenant_id_arg.clone()) .arg( Arg::new("destroy") .help("Also delete data directory (now optional, should be default in future)") @@ -230,7 +242,7 @@ fn main() -> Result<()> { handle_init(sub_args) } else { // all other commands need an existing config - let env = match LocalEnv::load_config() { + let mut env = match LocalEnv::load_config() { Ok(conf) => conf, Err(e) => { eprintln!("Error loading config: {}", e); @@ -239,7 +251,7 @@ fn main() -> Result<()> { }; match sub_name { - "tenant" => handle_tenant(sub_args, &env), + "tenant" => handle_tenant(sub_args, &mut env), "timeline" => handle_timeline(sub_args, &env), "start" => handle_start_all(sub_args, &env), "stop" => handle_stop_all(sub_args, &env), @@ -261,39 +273,44 @@ fn main() -> Result<()> { /// Prints timelines list as a tree-like structure. /// fn print_timelines_tree(timelines: Vec) -> Result<()> { - let mut timelines_hash: HashMap = timelines + let mut timelines_hash = timelines .iter() .map(|t| { ( - t.timeline_id.to_string(), + t.timeline_id(), TimelineTreeEl { info: t.clone(), - children: Vec::new(), + children: BTreeSet::new(), }, ) }) - .collect(); + .collect::>(); // Memorize all direct children of each timeline. for timeline in &timelines { - if let Some(tid) = &timeline.ancestor_id { + if let TimelineInfo::Local { + ancestor_timeline_id: Some(tid), + .. + } = timeline + { timelines_hash .get_mut(tid) .context("missing timeline info in the HashMap")? .children - .push(timeline.timeline_id.to_string()); + .insert(timeline.timeline_id()); } } - // Sort children by tid to bring some minimal order. - for timeline in &mut timelines_hash.values_mut() { - timeline.children.sort(); - } - for timeline in timelines_hash.values() { - // Start with root timelines (no ancestors) first. - if timeline.info.ancestor_id.is_none() { - print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?; + // Start with root local timelines (no ancestors) first. + if let TimelineInfo::Local { + ancestor_timeline_id, + .. 
+ } = &timeline.info + { + if ancestor_timeline_id.is_none() { + print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?; + } } } @@ -307,17 +324,22 @@ fn print_timeline( nesting_level: usize, is_last: &[bool], timeline: &TimelineTreeEl, - timelines: &HashMap, + timelines: &HashMap, ) -> Result<()> { + let local_or_remote = match timeline.info { + TimelineInfo::Local { .. } => "(L)", + TimelineInfo::Remote { .. } => "(R)", + }; // Draw main padding - print!(" "); + print!("{} ", local_or_remote); if nesting_level > 0 { - let lsn = timeline - .info - .ancestor_lsn - .as_ref() - .context("missing timeline info in the HashMap")?; + let lsn_string = match timeline.info { + TimelineInfo::Local { ancestor_lsn, .. } => ancestor_lsn + .map(|lsn| lsn.to_string()) + .unwrap_or_else(|| "Unknown local Lsn".to_string()), + TimelineInfo::Remote { .. } => "unknown Lsn (remote)".to_string(), + }; let mut br_sym = "┣━"; // Draw each nesting padding with proper style @@ -337,11 +359,11 @@ fn print_timeline( br_sym = "┗━"; } - print!("{} @{}: ", br_sym, lsn); + print!("{} @{}: ", br_sym, lsn_string); } // Finally print a timeline name with new line - println!("{}", timeline.info.timeline_id); + println!("{}", timeline.info.timeline_id()); let len = timeline.children.len(); let mut i: usize = 0; @@ -375,26 +397,44 @@ fn print_timeline( /// Connects to the pageserver to query this information. fn get_timeline_infos( env: &local_env::LocalEnv, - tenantid: &ZTenantId, + tenant_id: &ZTenantId, ) -> Result> { let page_server = PageServerNode::from_env(env); - let timeline_infos: Vec = page_server.timeline_list(tenantid)?; + let timeline_infos: Vec = page_server.timeline_list(tenant_id)?; let timeline_infos: HashMap = timeline_infos .into_iter() - .map(|timeline_info| (timeline_info.timeline_id, timeline_info)) + .map(|timeline_info| (timeline_info.timeline_id(), timeline_info)) .collect(); Ok(timeline_infos) } -// Helper function to parse --tenantid option, or get the default from config file -fn get_tenantid(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result { - if let Some(tenantid_cmd) = sub_match.value_of("tenantid") { - Ok(ZTenantId::from_str(tenantid_cmd)?) - } else if let Some(tenantid_conf) = env.default_tenantid { +// Helper function to parse --tenant_id option, or get the default from config file +fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result { + if let Some(tenantid_cmd) = sub_match.value_of("tenant-id") { + Ok( + ZTenantId::from_str(tenantid_cmd) + .context("Failed to parse tenant id from arguments")?, + ) + } else if let Some(tenantid_conf) = env.default_tenant_id { Ok(ZTenantId::from(tenantid_conf)) } else { - bail!("No tenantid. Use --tenantid, or set 'default_tenantid' in the config file"); + bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file"); + } +} + +fn get_timeline_id( + sub_match: &ArgMatches, + tenant_id: ZTenantId, + env: &local_env::LocalEnv, +) -> anyhow::Result { + if let Some(timeline_id) = sub_match.value_of("timeline-id") { + Ok(ZTimelineId::from_str(timeline_id) + .context("Failed to parse timeline id from arguments")?) 
+ } else if let Some(&initial_timeline_id) = env.initial_timelines.get(&tenant_id) { + Ok(initial_timeline_id) + } else { + bail!("No timeline id, specify one in the subcommand's arguments"); } } @@ -418,7 +458,7 @@ fn handle_init(init_match: &ArgMatches) -> Result<()> { let pageserver = PageServerNode::from_env(&env); if let Err(e) = pageserver.init( // default_tenantid was generated by the `env.init()` call above - Some(&ZTenantId::from(env.default_tenantid.unwrap()).to_string()), + Some(&ZTenantId::from(env.default_tenant_id.unwrap()).to_string()), &pageserver_config_overrides(init_match), ) { eprintln!("pageserver init failed: {}", e); @@ -436,7 +476,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { .collect() } -fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { let pageserver = PageServerNode::from_env(env); match tenant_match.subcommand() { Some(("list", _)) => { @@ -445,13 +485,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result } } Some(("create", create_match)) => { - let tenantid = match create_match.value_of("tenantid") { - Some(tenantid) => ZTenantId::from_str(tenantid)?, + let tenant_id = match create_match.value_of("tenant-id") { + Some(id) => ZTenantId::from_str(id)?, None => ZTenantId::generate(), }; - println!("using tenant id {}", tenantid); - pageserver.tenant_create(tenantid)?; - println!("tenant successfully created on the pageserver"); + println!("using tenant id {}", tenant_id); + let initial_timeline_id = pageserver.tenant_create(tenant_id)?; + env.initial_timelines.insert(tenant_id, initial_timeline_id); + println!( + "tenant {} successfully created on the pageserver, initial timeline: '{}'", + tenant_id, initial_timeline_id + ); } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), None => bail!("no tenant subcommand provided"), @@ -462,26 +506,77 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result fn handle_timeline(timeline_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let pageserver = PageServerNode::from_env(env); - let tenant_id = get_tenantid(timeline_match, env)?; + match timeline_match.subcommand() { + Some(("list", list_match)) => { + let tenant_id = get_tenant_id(list_match, env)?; + let timelines = pageserver.timeline_list(&tenant_id)?; + print_timelines_tree(timelines)?; + } + Some(("create", create_match)) => { + let tenant_id = get_tenant_id(create_match, env)?; + let timeline_id = get_timeline_id(create_match, tenant_id, env) + .unwrap_or_else(|_| ZTimelineId::generate()); + let timeline = pageserver.timeline_create(tenant_id, timeline_id, None, None)?; - if let Some(timeline_id) = timeline_match.value_of("timeline-id") { - let start_lsn = timeline_match - .value_of("start-lsn") - .map(|lsn| lsn.parse::()) - .transpose() - .context("Failed to parse start Lsn from the request")?; - let timeline_id = timeline_id - .parse::() - .context("Failed to parse timeline id from the request")?; - let timeline = pageserver.timeline_create(tenant_id, timeline_id, start_lsn)?; - println!( - "Created timeline '{}' at {:?} for tenant: {}", - timeline.timeline_id, timeline.latest_valid_lsn, tenant_id, - ); - } else { - // No arguments, list timelines for tenant - let timelines = pageserver.timeline_list(&tenant_id)?; - print_timelines_tree(timelines)?; + let last_record_lsn = match timeline { + 
TimelineInfo::Local { + last_record_lsn, .. + } => last_record_lsn, + TimelineInfo::Remote { .. } => { + bail!("Timeline {} was created as remote, not local", timeline_id) + } + }; + println!( + "Created timeline '{}' at Lsn {} for tenant: {}", + timeline.timeline_id(), + last_record_lsn, + tenant_id, + ); + } + Some(("branch", branch_match)) => { + let tenant_id = get_tenant_id(branch_match, env)?; + let timeline_id = get_timeline_id(branch_match, tenant_id, env) + .unwrap_or_else(|_| ZTimelineId::generate()); + let ancestor_timeline_id = match branch_match + .value_of("ancestor-timeline-id") + .map(ZTimelineId::from_str) + .transpose() + .context("Failed to parse ancestor timeline id from the request")? + .or_else(|| env.initial_timelines.get(&tenant_id).copied()) + { + Some(id) => id, + None => bail!("No ancestor timeline id provided"), + }; + let start_lsn = branch_match + .value_of("ancestor-start-lsn") + .map(Lsn::from_str) + .transpose() + .context("Failed to parse ancestor start Lsn from the request")?; + let timeline = pageserver.timeline_create( + tenant_id, + timeline_id, + start_lsn, + Some(ancestor_timeline_id), + )?; + + let last_record_lsn = match timeline { + TimelineInfo::Local { + last_record_lsn, .. + } => last_record_lsn, + TimelineInfo::Remote { .. } => { + bail!("Timeline {} was created as remote, not local", timeline_id) + } + }; + println!( + "Created timeline '{}' at Lsn {} for tenant: {}. Ancestor timeline: '{}'", + timeline.timeline_id(), + last_record_lsn, + tenant_id, + ancestor_timeline_id, + ); + } + Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), + None => bail!("no tenant subcommand provided"), } Ok(()) @@ -495,12 +590,12 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let mut cplane = ComputeControlPlane::load(env.clone())?; - // All subcommands take an optional --tenantid option - let tenantid = get_tenantid(sub_args, env)?; + // All subcommands take an optional --tenant-id option + let tenant_id = get_tenant_id(sub_args, env)?; match sub_name { "list" => { - let timeline_infos = get_timeline_infos(env, &tenantid).unwrap_or_else(|e| { + let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| { eprintln!("Failed to load timeline info: {}", e); HashMap::new() }); @@ -509,21 +604,26 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { for ((_, node_name), node) in cplane .nodes .iter() - .filter(|((node_tenantid, _), _)| node_tenantid == &tenantid) + .filter(|((node_tenant_id, _), _)| node_tenant_id == &tenant_id) { // FIXME: This shows the LSN at the end of the timeline. It's not the // right thing to do for read-only nodes that might be anchored at an // older point in time, or following but lagging behind the primary. let lsn_str = timeline_infos - .get(&node.timelineid) - .map(|bi| bi.latest_valid_lsn.to_string()) - .unwrap_or_else(|| "?".to_string()); + .get(&node.timeline_id) + .map(|bi| match bi { + TimelineInfo::Local { + last_record_lsn, .. + } => last_record_lsn.to_string(), + TimelineInfo::Remote { .. } => "? 
(remote)".to_string(), + }) + .unwrap_or_else(|| '?'.to_string()); println!( "{}\t{}\t{}\t{}\t{}", node_name, node.address, - node.timelineid, + node.timeline_id, lsn_str, node.status(), ); @@ -531,27 +631,31 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { } "create" => { let node_name = sub_args.value_of("node").unwrap_or("main"); - let timeline_spec = sub_args.value_of("timeline"); + let lsn = sub_args + .value_of("lsn") + .map(Lsn::from_str) + .transpose() + .context("Failed to parse Lsn from the request")?; + let timeline_id = get_timeline_id(sub_args, tenant_id, env)?; let port: Option = match sub_args.value_of("port") { Some(p) => Some(p.parse()?), None => None, }; - cplane.new_node(tenantid, node_name, timeline_spec, port)?; + cplane.new_node(tenant_id, node_name, timeline_id, lsn, port)?; } "start" => { let node_name = sub_args.value_of("node").unwrap_or("main"); - let timeline_spec = sub_args.value_of("timeline"); let port: Option = match sub_args.value_of("port") { Some(p) => Some(p.parse()?), None => None, }; - let node = cplane.nodes.get(&(tenantid, node_name.to_owned())); + let node = cplane.nodes.get(&(tenant_id, node_name.to_owned())); let auth_token = if matches!(env.pageserver.auth_type, AuthType::ZenithJWT) { - let claims = Claims::new(Some(tenantid), Scope::Tenant); + let claims = Claims::new(Some(tenant_id), Scope::Tenant); Some(env.generate_auth_token(&claims)?) } else { @@ -559,22 +663,25 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { }; if let Some(node) = node { - if timeline_spec.is_some() { - println!("timeline spec ignored because its node exists already"); - } println!("Starting existing postgres {}...", node_name); node.start(&auth_token)?; } else { + let timeline_id = get_timeline_id(sub_args, tenant_id, env)?; + let lsn = sub_args + .value_of("lsn") + .map(Lsn::from_str) + .transpose() + .context("Failed to parse Lsn from the request")?; // when used with custom port this results in non obvious behaviour // port is remembered from first start command, i e // start --port X // stop // start <-- will also use port X even without explicit port argument println!( - "Starting new postgres {} on timeline {:?} ...", - node_name, timeline_spec + "Starting new postgres {} on timeline {} ...", + node_name, timeline_id ); - let node = cplane.new_node(tenantid, node_name, timeline_spec, port)?; + let node = cplane.new_node(tenant_id, node_name, timeline_id, lsn, port)?; node.start(&auth_token)?; } } @@ -584,7 +691,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let node = cplane .nodes - .get(&(tenantid, node_name.to_owned())) + .get(&(tenant_id, node_name.to_owned())) .with_context(|| format!("postgres {} is not found", node_name))?; node.stop(destroy)?; } From 4d0f7fd1e4306f387b1606377ba8f574ed40cf4f Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 24 Feb 2022 13:40:32 +0300 Subject: [PATCH 0020/1022] Update Zenith CLI config between runs --- .github/workflows/benchmarking.yml | 2 +- control_plane/src/compute.rs | 4 +- control_plane/src/local_env.rs | 97 ++++----- pageserver/src/timelines.rs | 2 - test_runner/batch_others/test_auth.py | 15 +- test_runner/batch_others/test_backpressure.py | 2 +- .../batch_others/test_branch_behind.py | 42 ++-- .../batch_others/test_clog_truncate.py | 15 +- test_runner/batch_others/test_config.py | 8 +- test_runner/batch_others/test_createdropdb.py | 29 ++- test_runner/batch_others/test_createuser.py | 9 +- 
.../batch_others/test_gc_aggressive.py | 5 +- test_runner/batch_others/test_multixact.py | 10 +- .../batch_others/test_old_request_lsn.py | 4 +- .../batch_others/test_pageserver_catchup.py | 8 +- .../batch_others/test_pageserver_restart.py | 4 +- .../batch_others/test_parallel_copy.py | 4 +- test_runner/batch_others/test_pgbench.py | 4 +- .../batch_others/test_readonly_node.py | 21 +- .../batch_others/test_restart_compute.py | 10 +- test_runner/batch_others/test_snapfiles_gc.py | 4 +- test_runner/batch_others/test_subxacts.py | 4 +- .../batch_others/test_tenant_relocation.py | 8 +- test_runner/batch_others/test_tenants.py | 18 +- .../batch_others/test_timeline_size.py | 10 +- test_runner/batch_others/test_twophase.py | 10 +- test_runner/batch_others/test_vm_bits.py | 10 +- test_runner/batch_others/test_wal_acceptor.py | 32 ++- .../batch_others/test_wal_acceptor_async.py | 5 +- test_runner/batch_others/test_zenith_cli.py | 11 +- .../batch_pg_regress/test_isolation.py | 6 +- .../batch_pg_regress/test_pg_regress.py | 4 +- .../batch_pg_regress/test_zenith_regress.py | 4 +- test_runner/fixtures/compare_fixtures.py | 4 +- test_runner/fixtures/zenith_fixtures.py | 186 ++++++++---------- .../performance/test_bulk_tenant_create.py | 10 +- .../performance/test_parallel_copy_to.py | 1 + test_runner/test_broken.py | 4 +- zenith/src/main.rs | 158 ++++++++------- zenith_utils/src/zid.rs | 2 +- 40 files changed, 371 insertions(+), 415 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index dd23440afb..36df35297d 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -48,7 +48,7 @@ jobs: echo Python python3 --version poetry run python3 --version - echo Pipenv + echo Poetry poetry --version echo Pgbench $PG_BIN/pgbench --version diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 5d225a67fa..e8baffdc74 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -147,7 +147,7 @@ impl PostgresNode { // Read a few options from the config file let context = format!("in config file {}", cfg_path_str); let port: u16 = conf.parse_field("port", &context)?; - let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?; + let timeline_id: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?; let tenant_id: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?; let uses_wal_proposer = conf.get("wal_acceptors").is_some(); @@ -162,7 +162,7 @@ impl PostgresNode { env: env.clone(), pageserver: Arc::clone(pageserver), is_test: false, - timeline_id: timelineid, + timeline_id, lsn: recovery_target_lsn, tenant_id, uses_wal_proposer, diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 98b6379106..9b50a6b9e4 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,17 +3,16 @@ //! Now it also provides init method which acts like a stub for proper installation //! script which will use local paths. 
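The local_env.rs changes in this commit are about keeping CLI state on disk between invocations: LocalEnv gains a branch_name_mappings table (branch name to tenant/timeline ids, assuming ZTenantTimelineId pairs the two ids as its name suggests) and a persist_config helper that writes the environment back to .zenith/config. Below is a rough in-memory Python sketch of what such a name registry does; the helper names are hypothetical, and only the shape of the mapping comes from the field introduced further down. The test updates later in the series, which branch by name and start nodes by name, appear to lean on exactly this kind of resolution.

    from typing import Dict, List, Tuple
    from uuid import UUID

    # Branch name -> list of (tenant_id, timeline_id) pairs, one entry per tenant.
    BranchMappings = Dict[str, List[Tuple[UUID, UUID]]]

    def register_branch(mappings: BranchMappings, name: str,
                        tenant_id: UUID, timeline_id: UUID) -> None:
        # Remember which timeline a human-readable branch name refers to.
        mappings.setdefault(name, []).append((tenant_id, timeline_id))

    def lookup_branch(mappings: BranchMappings, name: str, tenant_id: UUID) -> UUID:
        # Resolve a branch name back to the timeline id for the given tenant.
        for mapped_tenant, timeline_id in mappings.get(name, []):
            if mapped_tenant == tenant_id:
                return timeline_id
        raise KeyError(f"branch '{name}' is unknown for tenant {tenant_id.hex}")
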
-use anyhow::{bail, Context}; +use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::env; -use std::fmt::Write; use std::fs; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use zenith_utils::auth::{encode_from_key_file, Claims, Scope}; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{HexZTenantId, ZNodeId, ZTenantId, ZTimelineId}; +use zenith_utils::zid::{HexZTenantId, ZNodeId, ZTenantId, ZTenantTimelineId}; use crate::safekeeper::SafekeeperNode; @@ -24,7 +23,7 @@ use crate::safekeeper::SafekeeperNode; // to 'zenith init --config=' option. See control_plane/simple.conf for // an example. // -#[derive(Serialize, Deserialize, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct LocalEnv { // Base directory for all the nodes (the pageserver, safekeepers and // compute nodes). @@ -63,12 +62,10 @@ pub struct LocalEnv { /// Every tenant has a first timeline created for it, currently the only one ancestor-less for this tenant. /// It is used as a default timeline for branching, if no ancestor timeline is specified. #[serde(default)] - // TODO kb this does not survive calls between invocations, so will have to persist it. - // Then it comes back to names again? - pub initial_timelines: HashMap, + pub branch_name_mappings: HashMap, } -#[derive(Serialize, Deserialize, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct PageServerConf { // node id @@ -96,7 +93,7 @@ impl Default for PageServerConf { } } -#[derive(Serialize, Deserialize, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct SafekeeperConf { pub id: ZNodeId, @@ -222,6 +219,39 @@ impl LocalEnv { Ok(env) } + pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { + // Currently, the user first passes a config file with 'zenith init --config=' + // We read that in, in `create_config`, and fill any missing defaults. Then it's saved + // to .zenith/config. TODO: We lose any formatting and comments along the way, which is + // a bit sad. + let mut conf_content = r#"# This file describes a locale deployment of the page server +# and safekeeeper node. It is read by the 'zenith' command-line +# utility. +"# + .to_string(); + + // Convert the LocalEnv to a toml file. + // + // This could be as simple as this: + // + // conf_content += &toml::to_string_pretty(env)?; + // + // But it results in a "values must be emitted before tables". I'm not sure + // why, AFAICS the table, i.e. 'safekeepers: Vec' is last. + // Maybe rust reorders the fields to squeeze avoid padding or something? + // In any case, converting to toml::Value first, and serializing that, works. 
+ // See https://github.com/alexcrichton/toml-rs/issues/142 + conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?; + + let target_config_path = base_path.join("config"); + fs::write(&target_config_path, conf_content).with_context(|| { + format!( + "Failed to write config file into path '{}'", + target_config_path.display() + ) + }) + } + // this function is used only for testing purposes in CLI e g generate tokens during init pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result { let private_key_path = if self.private_key_path.is_absolute() { @@ -240,15 +270,15 @@ impl LocalEnv { pub fn init(&mut self) -> anyhow::Result<()> { // check if config already exists let base_path = &self.base_data_dir; - if base_path == Path::new("") { - bail!("repository base path is missing"); - } - if base_path.exists() { - bail!( - "directory '{}' already exists. Perhaps already initialized?", - base_path.to_str().unwrap() - ); - } + ensure!( + base_path != Path::new(""), + "repository base path is missing" + ); + ensure!( + !base_path.exists(), + "directory '{}' already exists. Perhaps already initialized?", + base_path.display() + ); fs::create_dir(&base_path)?; @@ -300,36 +330,7 @@ impl LocalEnv { fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?; } - let mut conf_content = String::new(); - - // Currently, the user first passes a config file with 'zenith init --config=' - // We read that in, in `create_config`, and fill any missing defaults. Then it's saved - // to .zenith/config. TODO: We lose any formatting and comments along the way, which is - // a bit sad. - write!( - &mut conf_content, - r#"# This file describes a locale deployment of the page server -# and safekeeeper node. It is read by the 'zenith' command-line -# utility. -"# - )?; - - // Convert the LocalEnv to a toml file. - // - // This could be as simple as this: - // - // conf_content += &toml::to_string_pretty(env)?; - // - // But it results in a "values must be emitted before tables". I'm not sure - // why, AFAICS the table, i.e. 'safekeepers: Vec' is last. - // Maybe rust reorders the fields to squeeze avoid padding or something? - // In any case, converting to toml::Value first, and serializing that, works. - // See https://github.com/alexcrichton/toml-rs/issues/142 - conf_content += &toml::to_string_pretty(&toml::Value::try_from(&self)?)?; - - fs::write(base_path.join("config"), conf_content)?; - - Ok(()) + self.persist_config(base_path) } } diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index fc29767ddd..8b4dc57342 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -1,7 +1,5 @@ //! //! Timeline management code -//! 
-// TODO: move all paths construction to conf impl // use anyhow::{bail, Context, Result}; diff --git a/test_runner/batch_others/test_auth.py b/test_runner/batch_others/test_auth.py index 4d1d0847ed..e92eb2e044 100644 --- a/test_runner/batch_others/test_auth.py +++ b/test_runner/batch_others/test_auth.py @@ -25,21 +25,24 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): ps.safe_psql("set FOO", password=tenant_token) ps.safe_psql("set FOO", password=management_token) + new_timeline_id = env.zenith_cli.create_branch('test_pageserver_auth', + tenant_id=env.initial_tenant) + # tenant can create branches tenant_http_client.timeline_create(timeline_id=uuid4(), tenant_id=env.initial_tenant, - ancestor_timeline_id=env.initial_timeline) + ancestor_timeline_id=new_timeline_id) # console can create branches for tenant management_http_client.timeline_create(timeline_id=uuid4(), tenant_id=env.initial_tenant, - ancestor_timeline_id=env.initial_timeline) + ancestor_timeline_id=new_timeline_id) # fail to create branch using token with different tenant_id with pytest.raises(ZenithPageserverApiException, match='Forbidden: Tenant id mismatch. Permission denied'): invalid_tenant_http_client.timeline_create(timeline_id=uuid4(), tenant_id=env.initial_tenant, - ancestor_timeline_id=env.initial_timeline) + ancestor_timeline_id=new_timeline_id) # create tenant using management token management_http_client.tenant_create(uuid4()) @@ -59,9 +62,9 @@ def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_w zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() - branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}" - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start(branch, timeline_id=new_timeline_id) + branch = f'test_compute_auth_to_pageserver{with_wal_acceptors}' + env.zenith_cli.create_branch(branch) + pg = env.postgres.create_start(branch) with closing(pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/batch_others/test_backpressure.py b/test_runner/batch_others/test_backpressure.py index 2b064c9fa8..ff34121327 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/batch_others/test_backpressure.py @@ -95,7 +95,7 @@ def test_backpressure_received_lsn_lag(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() # Create a branch for us - env.zenith_cli.create_branch("test_backpressure", "main") + env.zenith_cli.create_branch('test_backpressure') pg = env.postgres.create_start('test_backpressure', config_lines=['max_replication_write_lag=30MB']) diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index f8ff1741b4..4e2be352f4 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -22,9 +22,8 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): env = zenith_env_builder.init_start() # Branch at the point where only 100 rows were inserted - test_branch_behind_timeline_id = env.zenith_cli.branch_timeline() - pgmain = env.postgres.create_start('test_branch_behind', - timeline_id=test_branch_behind_timeline_id) + env.zenith_cli.create_branch('test_branch_behind') + pgmain = env.postgres.create_start('test_branch_behind') log.info("postgres is running on 'test_branch_behind' branch") main_pg_conn = pgmain.connect() @@ -60,8 +59,9 @@ def test_branch_behind(zenith_env_builder: 
ZenithEnvBuilder): log.info(f'LSN after 200100 rows: {lsn_b}') # Branch at the point where only 100 rows were inserted - test_branch_behind_hundred_timeline_id = env.zenith_cli.branch_timeline( - ancestor_timeline_id=test_branch_behind_timeline_id, ancestor_start_lsn=lsn_a) + env.zenith_cli.create_branch('test_branch_behind_hundred', + 'test_branch_behind', + ancestor_start_lsn=lsn_a) # Insert many more rows. This generates enough WAL to fill a few segments. main_cur.execute(''' @@ -76,13 +76,12 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 400100 rows: {lsn_c}') # Branch at the point where only 200100 rows were inserted - test_branch_behind_more_timeline_id = env.zenith_cli.branch_timeline( - ancestor_timeline_id=test_branch_behind_timeline_id, ancestor_start_lsn=lsn_b) + env.zenith_cli.create_branch('test_branch_behind_more', + 'test_branch_behind', + ancestor_start_lsn=lsn_b) - pg_hundred = env.postgres.create_start("test_branch_behind_hundred", - timeline_id=test_branch_behind_hundred_timeline_id) - pg_more = env.postgres.create_start("test_branch_behind_more", - timeline_id=test_branch_behind_more_timeline_id) + pg_hundred = env.postgres.create_start('test_branch_behind_hundred') + pg_more = env.postgres.create_start('test_branch_behind_more') # On the 'hundred' branch, we should see only 100 rows hundred_pg_conn = pg_hundred.connect() @@ -103,23 +102,23 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): # Check bad lsn's for branching # branch at segment boundary - test_branch_segment_boundary_timeline_id = env.zenith_cli.branch_timeline( - ancestor_timeline_id=test_branch_behind_timeline_id, ancestor_start_lsn="0/3000000") - pg = env.postgres.create_start("test_branch_segment_boundary", - timeline_id=test_branch_segment_boundary_timeline_id) + env.zenith_cli.create_branch('test_branch_segment_boundary', + 'test_branch_behind', + ancestor_start_lsn="0/3000000") + pg = env.postgres.create_start('test_branch_segment_boundary') cur = pg.connect().cursor() cur.execute('SELECT 1') assert cur.fetchone() == (1, ) # branch at pre-initdb lsn with pytest.raises(Exception, match="invalid branch start lsn"): - env.zenith_cli.branch_timeline(ancestor_timeline_id=env.initial_timeline, - ancestor_start_lsn="0/42") + env.zenith_cli.create_branch('test_branch_preinitdb', ancestor_start_lsn="0/42") # branch at pre-ancestor lsn with pytest.raises(Exception, match="less than timeline ancestor lsn"): - env.zenith_cli.branch_timeline(ancestor_timeline_id=test_branch_behind_timeline_id, - ancestor_start_lsn="0/42") + env.zenith_cli.create_branch('test_branch_preinitdb', + 'test_branch_behind', + ancestor_start_lsn="0/42") # check that we cannot create branch based on garbage collected data with closing(env.pageserver.connect()) as psconn: @@ -131,8 +130,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): with pytest.raises(Exception, match="invalid branch start lsn"): # this gced_lsn is pretty random, so if gc is disabled this woudln't fail - env.zenith_cli.branch_timeline(ancestor_timeline_id=test_branch_behind_timeline_id, - ancestor_start_lsn=gced_lsn) + env.zenith_cli.create_branch('test_branch_create_fail', + 'test_branch_behind', + ancestor_start_lsn=gced_lsn) # check that after gc everything is still there hundred_cur.execute('SELECT count(*) FROM foo') diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py index 9d3927aa84..b7eeedb23e 100644 --- 
a/test_runner/batch_others/test_clog_truncate.py +++ b/test_runner/batch_others/test_clog_truncate.py @@ -12,7 +12,7 @@ from fixtures.log_helper import log # def test_clog_truncate(zenith_simple_env: ZenithEnv): env = zenith_simple_env - test_clog_truncate_timeline_id = env.zenith_cli.branch_timeline() + env.zenith_cli.create_branch('test_clog_truncate', 'empty') # set agressive autovacuum to make sure that truncation will happen config = [ @@ -25,9 +25,7 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv): 'autovacuum_freeze_max_age=100000' ] - pg = env.postgres.create_start('test_clog_truncate', - config_lines=config, - timeline_id=test_clog_truncate_timeline_id) + pg = env.postgres.create_start('test_clog_truncate', config_lines=config) log.info('postgres is running on test_clog_truncate branch') # Install extension containing function needed for test @@ -64,11 +62,10 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv): # create new branch after clog truncation and start a compute node on it log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}') - test_clog_truncate_new_timeline_id = env.zenith_cli.branch_timeline( - ancestor_timeline_id=test_clog_truncate_timeline_id, - ancestor_start_lsn=lsn_after_truncation) - pg2 = env.postgres.create_start('test_clog_truncate_new', - timeline_id=test_clog_truncate_new_timeline_id) + env.zenith_cli.create_branch('test_clog_truncate_new', + 'test_clog_truncate', + ancestor_start_lsn=lsn_after_truncation) + pg2 = env.postgres.create_start('test_clog_truncate_new') log.info('postgres is running on test_clog_truncate_new branch') # check that new node doesn't contain truncated segment diff --git a/test_runner/batch_others/test_config.py b/test_runner/batch_others/test_config.py index bd1f8b487f..fd2b3b4e99 100644 --- a/test_runner/batch_others/test_config.py +++ b/test_runner/batch_others/test_config.py @@ -9,10 +9,10 @@ from fixtures.log_helper import log # def test_config(zenith_simple_env: ZenithEnv): env = zenith_simple_env - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_config', - config_lines=['log_min_messages=debug1'], - timeline_id=new_timeline_id) + env.zenith_cli.create_branch("test_config", "empty") + + # change config + pg = env.postgres.create_start('test_config', config_lines=['log_min_messages=debug1']) log.info('postgres is running on test_config branch') with closing(pg.connect()) as conn: diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/batch_others/test_createdropdb.py index e77e1928b8..88937fa0dc 100644 --- a/test_runner/batch_others/test_createdropdb.py +++ b/test_runner/batch_others/test_createdropdb.py @@ -11,9 +11,9 @@ from fixtures.log_helper import log # def test_createdb(zenith_simple_env: ZenithEnv): env = zenith_simple_env - test_createdb_timeline_id = env.zenith_cli.branch_timeline() + env.zenith_cli.create_branch('test_createdb', 'empty') - pg = env.postgres.create_start('test_createdb', timeline_id=test_createdb_timeline_id) + pg = env.postgres.create_start('test_createdb') log.info("postgres is running on 'test_createdb' branch") with closing(pg.connect()) as conn: @@ -27,9 +27,8 @@ def test_createdb(zenith_simple_env: ZenithEnv): lsn = cur.fetchone()[0] # Create a branch - test_createdb2_timeline_id = env.zenith_cli.branch_timeline( - ancestor_timeline_id=test_createdb_timeline_id, ancestor_start_lsn=lsn) - pg2 = env.postgres.create_start('test_createdb2', timeline_id=test_createdb2_timeline_id) + 
env.zenith_cli.create_branch('test_createdb2', 'test_createdb', ancestor_start_lsn=lsn) + pg2 = env.postgres.create_start('test_createdb2') # Test that you can connect to the new database on both branches for db in (pg, pg2): @@ -41,8 +40,8 @@ def test_createdb(zenith_simple_env: ZenithEnv): # def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir): env = zenith_simple_env - test_dropdb_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_dropdb', timeline_id=test_dropdb_timeline_id) + env.zenith_cli.create_branch('test_dropdb', 'empty') + pg = env.postgres.create_start('test_dropdb') log.info("postgres is running on 'test_dropdb' branch") with closing(pg.connect()) as conn: @@ -65,15 +64,15 @@ def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir): lsn_after_drop = cur.fetchone()[0] # Create two branches before and after database drop. - test_before_dropdb_timeline_db = env.zenith_cli.branch_timeline( - ancestor_timeline_id=test_dropdb_timeline_id, ancestor_start_lsn=lsn_before_drop) - pg_before = env.postgres.create_start('test_before_dropdb', - timeline_id=test_before_dropdb_timeline_db) + env.zenith_cli.create_branch('test_before_dropdb', + 'test_dropdb', + ancestor_start_lsn=lsn_before_drop) + pg_before = env.postgres.create_start('test_before_dropdb') - test_after_dropdb_timeline_id = env.zenith_cli.branch_timeline( - ancestor_timeline_id=test_dropdb_timeline_id, ancestor_start_lsn=lsn_after_drop) - pg_after = env.postgres.create_start('test_after_dropdb', - timeline_id=test_after_dropdb_timeline_id) + env.zenith_cli.create_branch('test_after_dropdb', + 'test_dropdb', + ancestor_start_lsn=lsn_after_drop) + pg_after = env.postgres.create_start('test_after_dropdb') # Test that database exists on the branch before drop pg_before.connect(dbname='foodb').close() diff --git a/test_runner/batch_others/test_createuser.py b/test_runner/batch_others/test_createuser.py index 8f825a0a1a..efb2af3f07 100644 --- a/test_runner/batch_others/test_createuser.py +++ b/test_runner/batch_others/test_createuser.py @@ -9,8 +9,8 @@ from fixtures.log_helper import log # def test_createuser(zenith_simple_env: ZenithEnv): env = zenith_simple_env - test_createuser_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_createuser', timeline_id=test_createuser_timeline_id) + env.zenith_cli.create_branch('test_createuser', 'empty') + pg = env.postgres.create_start('test_createuser') log.info("postgres is running on 'test_createuser' branch") with closing(pg.connect()) as conn: @@ -24,9 +24,8 @@ def test_createuser(zenith_simple_env: ZenithEnv): lsn = cur.fetchone()[0] # Create a branch - test_createuser2_timeline_id = env.zenith_cli.branch_timeline( - ancestor_timeline_id=test_createuser_timeline_id, ancestor_start_lsn=lsn) - pg2 = env.postgres.create_start('test_createuser2', timeline_id=test_createuser2_timeline_id) + env.zenith_cli.create_branch('test_createuser2', 'test_createuser', ancestor_start_lsn=lsn) + pg2 = env.postgres.create_start('test_createuser2') # Test that you can connect to new branch as a new user assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )] diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py index 7dd38a5799..9de6ba9f59 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/batch_others/test_gc_aggressive.py @@ -1,6 +1,7 @@ from contextlib import closing import asyncio +import asyncpg import random 
from fixtures.zenith_fixtures import ZenithEnv, Postgres, Safekeeper @@ -54,8 +55,8 @@ async def update_and_gc(env: ZenithEnv, pg: Postgres, timeline: str): # def test_gc_aggressive(zenith_simple_env: ZenithEnv): env = zenith_simple_env - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_gc_aggressive', timeline_id=new_timeline_id) + env.zenith_cli.create_branch("test_gc_aggressive", "empty") + pg = env.postgres.create_start('test_gc_aggressive') log.info('postgres is running on test_gc_aggressive branch') conn = pg.connect() diff --git a/test_runner/batch_others/test_multixact.py b/test_runner/batch_others/test_multixact.py index 11f8000226..7a508a67fb 100644 --- a/test_runner/batch_others/test_multixact.py +++ b/test_runner/batch_others/test_multixact.py @@ -10,8 +10,8 @@ from fixtures.log_helper import log # def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): env = zenith_simple_env - test_multixact_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_multixact', timeline_id=test_multixact_timeline_id) + env.zenith_cli.create_branch('test_multixact', 'empty') + pg = env.postgres.create_start('test_multixact') log.info("postgres is running on 'test_multixact' branch") pg_conn = pg.connect() @@ -60,10 +60,8 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): assert int(next_multixact_id) > int(next_multixact_id_old) # Branch at this point - test_multixact_new_timeline_id = env.zenith_cli.branch_timeline( - ancestor_timeline_id=test_multixact_timeline_id, ancestor_start_lsn=lsn) - pg_new = env.postgres.create_start('test_multixact_new', - timeline_id=test_multixact_new_timeline_id) + env.zenith_cli.create_branch('test_multixact_new', 'test_multixact', ancestor_start_lsn=lsn) + pg_new = env.postgres.create_start('test_multixact_new') log.info("postgres is running on 'test_multixact_new' branch") pg_new_conn = pg_new.connect() diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py index f0701dfe4f..d09fb24913 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -16,8 +16,8 @@ from fixtures.log_helper import log # def test_old_request_lsn(zenith_simple_env: ZenithEnv): env = zenith_simple_env - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_old_request_lsn', timeline_id=new_timeline_id) + env.zenith_cli.create_branch("test_old_request_lsn", "empty") + pg = env.postgres.create_start('test_old_request_lsn') log.info('postgres is running on test_old_request_lsn branch') pg_conn = pg.connect() diff --git a/test_runner/batch_others/test_pageserver_catchup.py b/test_runner/batch_others/test_pageserver_catchup.py index ba77a4a321..7093a1bdb3 100644 --- a/test_runner/batch_others/test_pageserver_catchup.py +++ b/test_runner/batch_others/test_pageserver_catchup.py @@ -16,9 +16,8 @@ def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuil zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down', - timeline_id=new_timeline_id) + env.zenith_cli.create_branch('test_pageserver_catchup_while_compute_down') + pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down') pg_conn = pg.connect() cur = pg_conn.cursor() @@ -60,8 +59,7 @@ def 
test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuil env.safekeepers[2].start() # restart compute node - pg.stop_and_destroy().create_start('test_pageserver_catchup_while_compute_down', - timeline_id=new_timeline_id) + pg.stop_and_destroy().create_start('test_pageserver_catchup_while_compute_down') # Ensure that basebackup went correct and pageserver returned all data pg_conn = pg.connect() diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/batch_others/test_pageserver_restart.py index f1d154408c..57f9db8f96 100644 --- a/test_runner/batch_others/test_pageserver_restart.py +++ b/test_runner/batch_others/test_pageserver_restart.py @@ -15,8 +15,8 @@ def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_pageserver_restart', timeline_id=new_timeline_id) + env.zenith_cli.create_branch('test_pageserver_restart') + pg = env.postgres.create_start('test_pageserver_restart') pg_conn = pg.connect() cur = pg_conn.cursor() diff --git a/test_runner/batch_others/test_parallel_copy.py b/test_runner/batch_others/test_parallel_copy.py index 8e954a8e51..4b7cc58d42 100644 --- a/test_runner/batch_others/test_parallel_copy.py +++ b/test_runner/batch_others/test_parallel_copy.py @@ -35,8 +35,8 @@ async def parallel_load_same_table(pg: Postgres, n_parallel: int): # Load data into one table with COPY TO from 5 parallel connections def test_parallel_copy(zenith_simple_env: ZenithEnv, n_parallel=5): env = zenith_simple_env - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_parallel_copy', timeline_id=new_timeline_id) + env.zenith_cli.create_branch("test_parallel_copy", "empty") + pg = env.postgres.create_start('test_parallel_copy') log.info("postgres is running on 'test_parallel_copy' branch") # Create test table diff --git a/test_runner/batch_others/test_pgbench.py b/test_runner/batch_others/test_pgbench.py index 207f1e1e2c..09713023bc 100644 --- a/test_runner/batch_others/test_pgbench.py +++ b/test_runner/batch_others/test_pgbench.py @@ -4,8 +4,8 @@ from fixtures.log_helper import log def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin): env = zenith_simple_env - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_pgbench', timeline_id=new_timeline_id) + env.zenith_cli.create_branch("test_pgbench", "empty") + pg = env.postgres.create_start('test_pgbench') log.info("postgres is running on 'test_pgbench' branch") connstr = pg.connstr() diff --git a/test_runner/batch_others/test_readonly_node.py b/test_runner/batch_others/test_readonly_node.py index 2998ea7528..5d5949add6 100644 --- a/test_runner/batch_others/test_readonly_node.py +++ b/test_runner/batch_others/test_readonly_node.py @@ -11,9 +11,8 @@ from fixtures.zenith_fixtures import ZenithEnv # def test_readonly_node(zenith_simple_env: ZenithEnv): env = zenith_simple_env - test_readonly_node_timeline_id = env.zenith_cli.branch_timeline() - pgmain = env.postgres.create_start('test_readonly_node', - timeline_id=test_readonly_node_timeline_id) + env.zenith_cli.create_branch('test_readonly_node', 'empty') + pgmain = env.postgres.create_start('test_readonly_node') log.info("postgres is running on 'test_readonly_node' branch") main_pg_conn = pgmain.connect() @@ -53,14 +52,10 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): log.info('LSN after 400100 
rows: ' + lsn_c) # Create first read-only node at the point where only 100 rows were inserted - pg_hundred = env.postgres.create_start("test_readonly_node_hundred", - timeline_id=test_readonly_node_timeline_id, - lsn=lsn_a) + pg_hundred = env.postgres.create_start("test_readonly_node_hundred", lsn=lsn_a) # And another at the point where 200100 rows were inserted - pg_more = env.postgres.create_start("test_readonly_node_more", - timeline_id=test_readonly_node_timeline_id, - lsn=lsn_b) + pg_more = env.postgres.create_start("test_readonly_node_more", lsn=lsn_b) # On the 'hundred' node, we should see only 100 rows hundred_pg_conn = pg_hundred.connect() @@ -79,9 +74,7 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): assert main_cur.fetchone() == (400100, ) # Check creating a node at segment boundary - pg = env.postgres.create_start("test_branch_segment_boundary", - timeline_id=test_readonly_node_timeline_id, - lsn='0/3000000') + pg = env.postgres.create_start("test_branch_segment_boundary", lsn='0/3000000') cur = pg.connect().cursor() cur.execute('SELECT 1') assert cur.fetchone() == (1, ) @@ -89,6 +82,4 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): # Create node at pre-initdb lsn with pytest.raises(Exception, match="invalid basebackup lsn"): # compute node startup with invalid LSN should fail - env.zenith_cli.pg_start("test_readonly_node_preinitdb", - timeline_id=test_readonly_node_timeline_id, - lsn="0/42") + env.zenith_cli.pg_start("test_readonly_node_preinitdb", lsn="0/42") diff --git a/test_runner/batch_others/test_restart_compute.py b/test_runner/batch_others/test_restart_compute.py index baa1f787df..fd06561c00 100644 --- a/test_runner/batch_others/test_restart_compute.py +++ b/test_runner/batch_others/test_restart_compute.py @@ -15,8 +15,8 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_restart_compute', timeline_id=new_timeline_id) + env.zenith_cli.create_branch('test_restart_compute') + pg = env.postgres.create_start('test_restart_compute') log.info("postgres is running on 'test_restart_compute' branch") with closing(pg.connect()) as conn: @@ -29,7 +29,7 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor log.info(f"res = {r}") # Remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute', timeline_id=new_timeline_id) + pg.stop_and_destroy().create_start('test_restart_compute') with closing(pg.connect()) as conn: with conn.cursor() as cur: @@ -48,7 +48,7 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor log.info(f"res = {r}") # Again remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute', timeline_id=new_timeline_id) + pg.stop_and_destroy().create_start('test_restart_compute') # That select causes lots of FPI's and increases probability of wakeepers # lagging behind after query completion @@ -62,7 +62,7 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor log.info(f"res = {r}") # And again remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute', timeline_id=new_timeline_id) + pg.stop_and_destroy().create_start('test_restart_compute') with closing(pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/batch_others/test_snapfiles_gc.py 
b/test_runner/batch_others/test_snapfiles_gc.py index fb02e54be2..c6d4512bc9 100644 --- a/test_runner/batch_others/test_snapfiles_gc.py +++ b/test_runner/batch_others/test_snapfiles_gc.py @@ -14,8 +14,8 @@ from fixtures.log_helper import log # def test_layerfiles_gc(zenith_simple_env: ZenithEnv): env = zenith_simple_env - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_layerfiles_gc', timeline_id=new_timeline_id) + env.zenith_cli.create_branch("test_layerfiles_gc", "empty") + pg = env.postgres.create_start('test_layerfiles_gc') with closing(pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/batch_others/test_subxacts.py b/test_runner/batch_others/test_subxacts.py index 6153bd1fe2..bed1c4be63 100644 --- a/test_runner/batch_others/test_subxacts.py +++ b/test_runner/batch_others/test_subxacts.py @@ -10,8 +10,8 @@ from fixtures.log_helper import log # CLOG. def test_subxacts(zenith_simple_env: ZenithEnv, test_output_dir): env = zenith_simple_env - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_subxacts', timeline_id=new_timeline_id) + env.zenith_cli.create_branch("test_subxacts", "empty") + pg = env.postgres.create_start('test_subxacts') log.info("postgres is running on 'test_subxacts' branch") pg_conn = pg.connect() diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 429aee8488..a1286adfb0 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -127,14 +127,12 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, # create folder for remote storage mock remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage' - (tenant, _) = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) + tenant = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) log.info("tenant to relocate %s", tenant) - new_timeline_id = env.zenith_cli.branch_timeline(tenant_id=tenant) + env.zenith_cli.create_branch('test_tenant_relocation', tenant_id=tenant) - tenant_pg = env.postgres.create_start("test_tenant_relocation", - tenant_id=tenant, - timeline_id=new_timeline_id) + tenant_pg = env.postgres.create_start("test_tenant_relocation", tenant_id=tenant) # insert some data with closing(tenant_pg.connect()) as conn: diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py index 20a910e9ce..87acf2086d 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/batch_others/test_tenants.py @@ -12,23 +12,21 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acce env = zenith_env_builder.init_start() """Tests tenants with and without wal acceptors""" - (tenant_1, initial_timeline_1) = env.zenith_cli.create_tenant() - (tenant_2, initial_timeline_2) = env.zenith_cli.create_tenant() + tenant_1 = env.zenith_cli.create_tenant() + tenant_2 = env.zenith_cli.create_tenant() - new_timeline_tenant_1 = env.zenith_cli.branch_timeline(tenant_id=tenant_1, - ancestor_timeline_id=initial_timeline_1) - new_timeline_tenant_2 = env.zenith_cli.branch_timeline(tenant_id=tenant_2, - ancestor_timeline_id=initial_timeline_2) + env.zenith_cli.create_branch(f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', + tenant_id=tenant_1) + env.zenith_cli.create_branch(f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', + tenant_id=tenant_2) pg_tenant1 = 
env.postgres.create_start( - f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", + f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', tenant_id=tenant_1, - timeline_id=new_timeline_tenant_1, ) pg_tenant2 = env.postgres.create_start( - f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", + f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', tenant_id=tenant_2, - timeline_id=new_timeline_tenant_2, ) for pg in [pg_tenant1, pg_tenant2]: diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 49143d0000..7d8ab551b0 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -10,14 +10,13 @@ import time def test_timeline_size(zenith_simple_env: ZenithEnv): env = zenith_simple_env # Branch at the point where only 100 rows were inserted - new_timeline_id = env.zenith_cli.branch_timeline() + new_timeline_id = env.zenith_cli.create_branch('test_timeline_size', 'empty') client = env.pageserver.http_client() res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) - print(f'@@@@@@@@@@\n{res}\n@@@@@@@@@@@') assert res["current_logical_size"] == res["current_logical_size_non_incremental"] - pgmain = env.postgres.create_start("test_timeline_size", timeline_id=new_timeline_id) + pgmain = env.postgres.create_start("test_timeline_size") log.info("postgres is running on 'test_timeline_size' branch") with closing(pgmain.connect()) as conn: @@ -69,7 +68,7 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() - new_timeline_id = env.zenith_cli.branch_timeline() + new_timeline_id = env.zenith_cli.create_branch('test_timeline_size_quota') client = env.pageserver.http_client() res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) @@ -78,8 +77,7 @@ def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): pgmain = env.postgres.create_start( "test_timeline_size_quota", # Set small limit for the test - config_lines=['zenith.max_cluster_size=30MB'], - timeline_id=new_timeline_id) + config_lines=['zenith.max_cluster_size=30MB']) log.info("postgres is running on 'test_timeline_size_quota' branch") with closing(pgmain.connect()) as conn: diff --git a/test_runner/batch_others/test_twophase.py b/test_runner/batch_others/test_twophase.py index b479e9de22..4afdc7e0be 100644 --- a/test_runner/batch_others/test_twophase.py +++ b/test_runner/batch_others/test_twophase.py @@ -9,10 +9,8 @@ from fixtures.log_helper import log # def test_twophase(zenith_simple_env: ZenithEnv): env = zenith_simple_env - test_twophase_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_twophase', - config_lines=['max_prepared_transactions=5'], - timeline_id=test_twophase_timeline_id) + env.zenith_cli.create_branch("test_twophase", "empty") + pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5']) log.info("postgres is running on 'test_twophase' branch") conn = pg.connect() @@ -57,14 +55,12 @@ def test_twophase(zenith_simple_env: ZenithEnv): assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - test_twophase_prepared_timeline_id = env.zenith_cli.branch_timeline( - ancestor_timeline_id=test_twophase_timeline_id) + 
env.zenith_cli.create_branch("test_twophase_prepared", "test_twophase") # Start compute on the new branch pg2 = env.postgres.create_start( 'test_twophase_prepared', config_lines=['max_prepared_transactions=5'], - timeline_id=test_twophase_prepared_timeline_id, ) # Check that we restored only needed twophase files diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/batch_others/test_vm_bits.py index a657b3e3fd..49e48dd450 100644 --- a/test_runner/batch_others/test_vm_bits.py +++ b/test_runner/batch_others/test_vm_bits.py @@ -9,8 +9,8 @@ from fixtures.log_helper import log def test_vm_bit_clear(zenith_simple_env: ZenithEnv): env = zenith_simple_env - test_vm_bit_clear_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_vm_bit_clear', timeline_id=test_vm_bit_clear_timeline_id) + env.zenith_cli.create_branch("test_vm_bit_clear", "empty") + pg = env.postgres.create_start('test_vm_bit_clear') log.info("postgres is running on 'test_vm_bit_clear' branch") pg_conn = pg.connect() @@ -33,8 +33,7 @@ def test_vm_bit_clear(zenith_simple_env: ZenithEnv): cur.execute('UPDATE vmtest_update SET id = 5000 WHERE id = 1') # Branch at this point, to test that later - test_vm_bit_clear_new_timeline_id = env.zenith_cli.branch_timeline( - ancestor_timeline_id=test_vm_bit_clear_timeline_id) + env.zenith_cli.create_branch("test_vm_bit_clear_new", "test_vm_bit_clear") # Clear the buffer cache, to force the VM page to be re-fetched from # the page server @@ -62,8 +61,7 @@ def test_vm_bit_clear(zenith_simple_env: ZenithEnv): # a dirty VM page is evicted. If the VM bit was not correctly cleared by the # earlier WAL record, the full-page image hides the problem. Starting a new # server at the right point-in-time avoids that full-page image. 
- pg_new = env.postgres.create_start('test_vm_bit_clear_new', - timeline_id=test_vm_bit_clear_new_timeline_id) + pg_new = env.postgres.create_start('test_vm_bit_clear_new') log.info("postgres is running on 'test_vm_bit_clear_new' branch") pg_new_conn = pg_new.connect() diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 3e39228494..9518a14b75 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -24,8 +24,8 @@ def test_normal_work(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_wal_acceptors_normal_work', timeline_id=new_timeline_id) + env.zenith_cli.create_branch('test_wal_acceptors_normal_work') + pg = env.postgres.create_start('test_wal_acceptors_normal_work') with closing(pg.connect()) as conn: with conn.cursor() as cur: @@ -62,8 +62,8 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): # start postgres on each timeline pgs = [] for branch_name in branch_names: - new_timeline_id = env.zenith_cli.branch_timeline() - pgs.append(env.postgres.create_start(branch_name, timeline_id=new_timeline_id)) + new_timeline_id = env.zenith_cli.create_branch(branch_name) + pgs.append(env.postgres.create_start(branch_name)) branch_names_to_timeline_ids[branch_name] = new_timeline_id tenant_id = env.initial_tenant @@ -87,7 +87,6 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): timeline_metrics = [] with env.pageserver.http_client() as pageserver_http: for timeline_detail in timeline_details: - print(f"@@@@@@@@@@@\n{timeline_detail}\n@@@@@@@@@@@") timeline_id: str = timeline_detail["timeline_id"] m = TimelineMetrics( @@ -188,8 +187,8 @@ def test_restarts(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = n_acceptors env = zenith_env_builder.init_start() - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_wal_acceptors_restarts', timeline_id=new_timeline_id) + env.zenith_cli.create_branch('test_wal_acceptors_restarts') + pg = env.postgres.create_start('test_wal_acceptors_restarts') # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -225,8 +224,8 @@ def test_unavailability(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 2 env = zenith_env_builder.init_start() - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_wal_acceptors_unavailability', timeline_id=new_timeline_id) + env.zenith_cli.create_branch('test_wal_acceptors_unavailability') + pg = env.postgres.create_start('test_wal_acceptors_unavailability') # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -296,9 +295,8 @@ def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value): zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_wal_acceptors_race_conditions', - timeline_id=new_timeline_id) + env.zenith_cli.create_branch('test_wal_acceptors_race_conditions') + pg = env.postgres.create_start('test_wal_acceptors_race_conditions') # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -462,8 +460,8 @@ def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): 
zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_timeline_status', timeline_id=new_timeline_id) + env.zenith_cli.create_branch('test_timeline_status') + pg = env.postgres.create_start('test_timeline_status') wa = env.safekeepers[0] wa_http_cli = wa.http_client() @@ -636,12 +634,12 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 4 env = zenith_env_builder.init_start() - new_timeline_id = env.zenith_cli.branch_timeline() + env.zenith_cli.create_branch('test_replace_safekeeper') log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() active_safekeepers = [1, 2, 3] - pg = env.postgres.create('test_replace_safekeeper', timeline_id=new_timeline_id) + pg = env.postgres.create('test_replace_safekeeper') pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers)) pg.start() @@ -679,7 +677,7 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) log.info("Recreate postgres to replace failed sk1 with new sk4") - pg.stop_and_destroy().create('test_replace_safekeeper', timeline_id=uuid.UUID(timeline_id)) + pg.stop_and_destroy().create('test_replace_safekeeper') active_safekeepers = [2, 3, 4] env.safekeepers[3].start() pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers)) diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index 719e8c163f..31ace7eab3 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -202,9 +202,8 @@ def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() - new_timeline_id = env.zenith_cli.branch_timeline() - pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load', - timeline_id=new_timeline_id) + env.zenith_cli.create_branch('test_wal_acceptors_restarts_under_load') + pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load') asyncio.run(run_restarts_under_load(pg, env.safekeepers)) diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_zenith_cli.py index 4f089d4354..8777a653b3 100644 --- a/test_runner/batch_others/test_zenith_cli.py +++ b/test_runner/batch_others/test_zenith_cli.py @@ -36,11 +36,12 @@ def test_cli_timeline_list(zenith_simple_env: ZenithEnv): helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a branch for us - main_timeline_id = env.zenith_cli.branch_timeline() + main_timeline_id = env.zenith_cli.create_branch('test_cli_branch_list_main') helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a nested branch - nested_timeline_id = env.zenith_cli.branch_timeline(ancestor_timeline_id=main_timeline_id) + nested_timeline_id = env.zenith_cli.create_branch('test_cli_branch_list_nested', + 'test_cli_branch_list_main') helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Check that all new branches are visible via CLI @@ -67,15 +68,13 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv): helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant - tenant1 = uuid.uuid4() - env.zenith_cli.create_tenant(tenant_id=tenant1) + tenant1 = env.zenith_cli.create_tenant() # check tenant1 appeared 
helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant - tenant2 = uuid.uuid4() - env.zenith_cli.create_tenant(tenant_id=tenant2) + tenant2 = env.zenith_cli.create_tenant() # check tenant2 appeared helper_compare_tenant_list(pageserver_http_client, env) diff --git a/test_runner/batch_pg_regress/test_isolation.py b/test_runner/batch_pg_regress/test_isolation.py index 8dce020dc0..ddafc3815b 100644 --- a/test_runner/batch_pg_regress/test_isolation.py +++ b/test_runner/batch_pg_regress/test_isolation.py @@ -7,12 +7,10 @@ from fixtures.zenith_fixtures import ZenithEnv, base_dir, pg_distrib_dir def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys): env = zenith_simple_env - new_timeline_id = env.zenith_cli.branch_timeline() + env.zenith_cli.create_branch("test_isolation", "empty") # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them - pg = env.postgres.create_start('test_isolation', - config_lines=['max_prepared_transactions=100'], - timeline_id=new_timeline_id) + pg = env.postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100']) pg.safe_psql('CREATE DATABASE isolation_regression') # Create some local directories for pg_isolation_regress to run in. diff --git a/test_runner/batch_pg_regress/test_pg_regress.py b/test_runner/batch_pg_regress/test_pg_regress.py index efeb63fce3..5199f65216 100644 --- a/test_runner/batch_pg_regress/test_pg_regress.py +++ b/test_runner/batch_pg_regress/test_pg_regress.py @@ -7,9 +7,9 @@ from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content, def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin, capsys): env = zenith_simple_env - new_timeline_id = env.zenith_cli.branch_timeline() + env.zenith_cli.create_branch("test_pg_regress", "empty") # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start('test_pg_regress', timeline_id=new_timeline_id) + pg = env.postgres.create_start('test_pg_regress') pg.safe_psql('CREATE DATABASE regression') # Create some local directories for pg_regress to run in. diff --git a/test_runner/batch_pg_regress/test_zenith_regress.py b/test_runner/batch_pg_regress/test_zenith_regress.py index 2ccbafccfd..31d5b07093 100644 --- a/test_runner/batch_pg_regress/test_zenith_regress.py +++ b/test_runner/batch_pg_regress/test_zenith_regress.py @@ -11,9 +11,9 @@ from fixtures.log_helper import log def test_zenith_regress(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys): env = zenith_simple_env - new_timeline_id = env.zenith_cli.branch_timeline() + env.zenith_cli.create_branch("test_zenith_regress", "empty") # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start('test_zenith_regress', timeline_id=new_timeline_id) + pg = env.postgres.create_start('test_zenith_regress') pg.safe_psql('CREATE DATABASE regression') # Create some local directories for pg_regress to run in. 
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 66b9fe54ea..750b02c894 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -64,8 +64,8 @@ class ZenithCompare(PgCompare): self._pg_bin = pg_bin # We only use one branch and one timeline - timeline_id = self.env.zenith_cli.branch_timeline() - self._pg = self.env.postgres.create_start("branch", timeline_id=timeline_id) + self.env.zenith_cli.create_branch(branch_name, 'empty') + self._pg = self.env.postgres.create_start(branch_name) self.timeline = self.pg.safe_psql("SHOW zenith.zenith_timeline")[0][0] # Long-lived cursor, useful for flushing diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 7c4d178a3f..9345c7f238 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1,6 +1,6 @@ from __future__ import annotations -from dataclasses import dataclass, field +from dataclasses import field import textwrap from cached_property import cached_property import asyncpg @@ -29,7 +29,6 @@ from dataclasses import dataclass from psycopg2.extensions import connection as PgConnection from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, TypeVar, cast, Union, Tuple from typing_extensions import Literal -import pytest import requests import backoff # type: ignore @@ -219,7 +218,7 @@ def can_bind(host: str, port: int) -> bool: class PortDistributor: - def __init__(self, base_port: int, port_number: int) -> None: + def __init__(self, base_port: int, port_number: int): self.iterator = iter(range(base_port, base_port + port_number)) def get_port(self) -> int: @@ -424,7 +423,8 @@ class ZenithEnvBuilder: pageserver_config_override: Optional[str] = None, num_safekeepers: int = 0, pageserver_auth_enabled: bool = False, - rust_log_override: Optional[str] = None): + rust_log_override: Optional[str] = None, + default_branch_name='main'): self.repo_dir = repo_dir self.rust_log_override = rust_log_override self.port_distributor = port_distributor @@ -432,6 +432,7 @@ class ZenithEnvBuilder: self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers self.pageserver_auth_enabled = pageserver_auth_enabled + self.default_branch_name = default_branch_name self.env: Optional[ZenithEnv] = None self.s3_mock_server: Optional[MockS3Server] = None @@ -536,7 +537,7 @@ class ZenithEnv: initial_tenant - tenant ID of the initial tenant created in the repository - zenith_cli() - zenith_cli() can be used to run the 'zenith' CLI tool + zenith_cli - can be used to run the 'zenith' CLI tool create_tenant() - initializes a new tenant in the page server, returns the tenant id @@ -546,9 +547,9 @@ class ZenithEnv: self.rust_log_override = config.rust_log_override self.port_distributor = config.port_distributor self.s3_mock_server = config.s3_mock_server + self.default_branch_name = config.default_branch_name self.zenith_cli = ZenithCli(env=self) - - self.zenith_cli = ZenithCli(env=self) + self.postgres = PostgresFactory(self) self.safekeepers: List[Safekeeper] = [] # generate initial tenant ID here instead of letting 'zenith init' generate it, @@ -599,9 +600,7 @@ class ZenithEnv: self.safekeepers.append(safekeeper) log.info(f"Config: {toml}") - # TODO kb is this a wrong concept? 
will break for multiple tenant tests - self.initial_timeline = self.zenith_cli.init(toml) - self.postgres = PostgresFactory(self) + self.zenith_cli.init(toml) def start(self): # Start up the page server and all the safekeepers @@ -637,7 +636,12 @@ def _shared_simple_env(request: Any, port_distributor) -> Iterator[ZenithEnv]: shutil.rmtree(repo_dir, ignore_errors=True) with ZenithEnvBuilder(Path(repo_dir), port_distributor) as builder: - yield builder.init_start() + env = builder.init_start() + + # For convenience in tests, create a branch from the freshly-initialized cluster. + env.zenith_cli.create_branch("empty") + + yield env @pytest.fixture(scope='function') @@ -685,7 +689,7 @@ class ZenithPageserverApiException(Exception): class ZenithPageserverHttpClient(requests.Session): - def __init__(self, port: int, auth_token: Optional[str] = None) -> None: + def __init__(self, port: int, auth_token: Optional[str] = None): super().__init__() self.port = port self.auth_token = auth_token @@ -804,59 +808,49 @@ class ZenithCli: A typed wrapper around the `zenith` CLI tool. Supports main commands via typed methods and a way to run arbitrary command directly via CLI. """ - def __init__(self, env: ZenithEnv) -> None: + def __init__(self, env: ZenithEnv): self.env = env pass - def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> tuple[uuid.UUID, uuid.UUID]: + def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: """ Creates a new tenant, returns its id and its initial timeline's id. """ if tenant_id is None: tenant_id = uuid.uuid4() res = self.raw_cli(['tenant', 'create', '--tenant-id', tenant_id.hex]) - - initial_timeline_id_extractor = re.compile(r"initial timeline: '(?P<timeline_id>[^']+)'", - re.MULTILINE) - matches = initial_timeline_id_extractor.search(res.stdout) - - created_timeline_id = None - if matches is not None: - created_timeline_id = matches.group('timeline_id') - - if created_timeline_id is None: - raise Exception('could not find timeline id after `zenith tenant create` invocation') - else: - return (tenant_id, uuid.UUID(created_timeline_id)) + res.check_returncode() + return tenant_id def list_tenants(self) -> 'subprocess.CompletedProcess[str]': res = self.raw_cli(['tenant', 'list']) res.check_returncode() return res - def branch_timeline(self, - tenant_id: Optional[uuid.UUID] = None, - new_timeline_id: Optional[uuid.UUID] = None, - ancestor_timeline_id: Optional[uuid.UUID] = None, - ancestor_start_lsn: Optional[str] = None) -> uuid.UUID: + def create_branch(self, + new_branch_name: str, + ancestor_branch_name: Optional[str] = None, + tenant_id: Optional[uuid.UUID] = None, + ancestor_start_lsn: Optional[str] = None) -> uuid.UUID: cmd = [ 'timeline', 'branch', + '--name', + new_branch_name, '--tenant-id', (tenant_id or self.env.initial_tenant).hex, - '--ancestor-timeline-id', - (ancestor_timeline_id or self.env.initial_timeline).hex, + '--ancestor-branch-name', + ancestor_branch_name or self.env.default_branch_name, ] if ancestor_start_lsn is not None: cmd.extend(['--ancestor-start-lsn', ancestor_start_lsn]) - if new_timeline_id is not None: - cmd.extend(['--timeline-id', new_timeline_id.hex]) - completed_process = self.raw_cli(cmd) - completed_process.check_returncode() + res = self.raw_cli(cmd) + res.check_returncode() + create_timeline_id_extractor = re.compile(r"^Created timeline '(?P<timeline_id>[^']+)'", re.MULTILINE) - matches = create_timeline_id_extractor.search(completed_process.stdout) + matches = create_timeline_id_extractor.search(res.stdout) created_timeline_id
= None if matches is not None: @@ -875,9 +869,7 @@ class ZenithCli: res.stdout.strip().split("\n"))) return branches_cli - def init(self, config_toml: str) -> uuid.UUID: - initial_timeline = None - + def init(self, config_toml: str) -> 'subprocess.CompletedProcess[str]': with tempfile.NamedTemporaryFile(mode='w+') as tmp: tmp.write(config_toml) tmp.flush() @@ -887,18 +879,9 @@ class ZenithCli: self.env.pageserver.remote_storage, self.env.pageserver.config_override) - completed_process = self.raw_cli(cmd) - completed_process.check_returncode() - init_timeline_id_extractor = re.compile( - r'^created initial timeline (?P[^\s]+)\s', re.MULTILINE) - matches = init_timeline_id_extractor.search(completed_process.stdout) - if matches is not None: - initial_timeline = matches.group('timeline_id') - - if initial_timeline is None: - raise Exception('could not find timeline id after `zenith init` invocation') - else: - return uuid.UUID(initial_timeline) + res = self.raw_cli(cmd) + res.check_returncode() + return res def pageserver_start(self, overrides=()) -> 'subprocess.CompletedProcess[str]': start_args = ['pageserver', 'start', *overrides] @@ -930,9 +913,8 @@ class ZenithCli: def pg_create( self, - node_name: str, + branch_name: str, tenant_id: Optional[uuid.UUID] = None, - timeline_id: Optional[uuid.UUID] = None, lsn: Optional[str] = None, port: Optional[int] = None, ) -> 'subprocess.CompletedProcess[str]': @@ -940,22 +922,21 @@ class ZenithCli: 'pg', 'create', '--tenant-id', (tenant_id or self.env.initial_tenant).hex, - '--timeline-id', (timeline_id or self.env.initial_timeline).hex + '--name', + branch_name ] if lsn is not None: args.append(f'--lsn={lsn}') if port is not None: args.append(f'--port={port}') - args.append(node_name) res = self.raw_cli(args) res.check_returncode() return res def pg_start( self, - node_name: str, + branch_name: str, tenant_id: Optional[uuid.UUID] = None, - timeline_id: Optional[uuid.UUID] = None, lsn: Optional[str] = None, port: Optional[int] = None, ) -> 'subprocess.CompletedProcess[str]': @@ -964,14 +945,13 @@ class ZenithCli: 'start', '--tenant-id', (tenant_id or self.env.initial_tenant).hex, - '--timeline-id', - (timeline_id or self.env.initial_timeline).hex, + '--name', + branch_name, ] if lsn is not None: args.append(f'--lsn={lsn}') if port is not None: args.append(f'--port={port}') - args.append(node_name) res = self.raw_cli(args) res.check_returncode() @@ -979,14 +959,19 @@ class ZenithCli: def pg_stop( self, - node_name: str, + branch_name: str, tenant_id: Optional[uuid.UUID] = None, destroy=False, ) -> 'subprocess.CompletedProcess[str]': - args = ['pg', 'stop', f'--tenant-id={(tenant_id or self.env.initial_tenant).hex}'] + args = [ + 'pg', + 'stop', + f'--tenant-id={(tenant_id or self.env.initial_tenant).hex}', + '--name', + branch_name + ] if destroy: args.append('--destroy') - args.append(node_name) return self.raw_cli(args) @@ -1061,8 +1046,7 @@ class ZenithPageserver(PgProtocol): env: ZenithEnv, port: PageserverPort, remote_storage: Optional[RemoteStorage] = None, - config_override: Optional[str] = None, - enable_auth=False): + config_override: Optional[str] = None): super().__init__(host='localhost', port=port.pg, username='zenith_admin') self.env = env self.running = False @@ -1150,7 +1134,7 @@ class PgBin: self.env = os.environ.copy() self.env['LD_LIBRARY_PATH'] = os.path.join(str(pg_distrib_dir), 'lib') - def _fixpath(self, command: List[str]) -> None: + def _fixpath(self, command: List[str]): if '/' not in command[0]: command[0] = 
os.path.join(self.pg_bin_path, command[0]) @@ -1161,7 +1145,7 @@ class PgBin: env.update(env_add) return env - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None) -> None: + def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): """ Run one of the postgres binaries. @@ -1211,18 +1195,18 @@ class VanillaPostgres(PgProtocol): self.running = False self.pg_bin.run_capture(['initdb', '-D', pgdatadir]) - def configure(self, options: List[str]) -> None: + def configure(self, options: List[str]): """Append lines into postgresql.conf file.""" assert not self.running with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file: conf_file.writelines(options) - def start(self) -> None: + def start(self): assert not self.running self.running = True self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'start']) - def stop(self) -> None: + def stop(self): assert self.running self.running = False self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'stop']) @@ -1298,15 +1282,14 @@ class Postgres(PgProtocol): self.env = env self.running = False - self.node_name: Optional[str] = None # dubious, see asserts below + self.branch_name: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA self.tenant_id = tenant_id - # path to conf is /pgdatadirs/tenants///postgresql.conf + # path to conf is /pgdatadirs/tenants///postgresql.conf def create( self, - node_name: str, - timeline_id: uuid.UUID, + branch_name: str, lsn: Optional[str] = None, config_lines: Optional[List[str]] = None, ) -> 'Postgres': @@ -1318,13 +1301,12 @@ class Postgres(PgProtocol): if not config_lines: config_lines = [] - self.env.zenith_cli.pg_create(node_name, - timeline_id=timeline_id, + self.env.zenith_cli.pg_create(branch_name, tenant_id=self.tenant_id, lsn=lsn, port=self.port) - self.node_name = node_name - path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name + self.branch_name = branch_name + path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.branch_name self.pgdata_dir = os.path.join(self.env.repo_dir, path) if config_lines is None: @@ -1343,11 +1325,11 @@ class Postgres(PgProtocol): Returns self. """ - assert self.node_name is not None + assert self.branch_name is not None - log.info(f"Starting postgres node {self.node_name}") + log.info(f"Starting postgres node {self.branch_name}") - run_result = self.env.zenith_cli.pg_start(self.node_name, + run_result = self.env.zenith_cli.pg_start(self.branch_name, tenant_id=self.tenant_id, port=self.port) self.running = True @@ -1358,8 +1340,8 @@ class Postgres(PgProtocol): def pg_data_dir_path(self) -> str: """ Path to data directory """ - assert self.node_name - path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name + assert self.branch_name + path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.branch_name return os.path.join(self.env.repo_dir, path) def pg_xact_dir_path(self) -> str: @@ -1418,8 +1400,8 @@ class Postgres(PgProtocol): """ if self.running: - assert self.node_name is not None - self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id) + assert self.branch_name is not None + self.env.zenith_cli.pg_stop(self.branch_name, self.tenant_id) self.running = False return self @@ -1430,16 +1412,15 @@ class Postgres(PgProtocol): Returns self. 
""" - assert self.node_name is not None - self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, True) - self.node_name = None + assert self.branch_name is not None + self.env.zenith_cli.pg_stop(self.branch_name, self.tenant_id, True) + self.branch_name = None return self def create_start( self, - node_name: str, - timeline_id: uuid.UUID, + branch_name: str, lsn: Optional[str] = None, config_lines: Optional[List[str]] = None, ) -> 'Postgres': @@ -1450,8 +1431,7 @@ class Postgres(PgProtocol): """ self.create( - node_name=node_name, - timeline_id=timeline_id, + branch_name=branch_name, config_lines=config_lines, lsn=lsn, ).start() @@ -1473,9 +1453,8 @@ class PostgresFactory: self.instances: List[Postgres] = [] def create_start(self, - node_name: str = "main", + branch_name: Optional[str] = None, tenant_id: Optional[uuid.UUID] = None, - timeline_id: Optional[uuid.UUID] = None, lsn: Optional[str] = None, config_lines: Optional[List[str]] = None) -> Postgres: @@ -1488,16 +1467,14 @@ class PostgresFactory: self.instances.append(pg) return pg.create_start( - node_name=node_name, - timeline_id=timeline_id or self.env.initial_timeline, + branch_name=branch_name or self.env.default_branch_name, config_lines=config_lines, lsn=lsn, ) def create(self, - node_name: str = "main", + branch_name: Optional[str] = None, tenant_id: Optional[uuid.UUID] = None, - timeline_id: Optional[uuid.UUID] = None, lsn: Optional[str] = None, config_lines: Optional[List[str]] = None) -> Postgres: @@ -1511,8 +1488,7 @@ class PostgresFactory: self.instances.append(pg) return pg.create( - node_name=node_name, - timeline_id=timeline_id or self.env.initial_timeline, + branch_name=branch_name or self.env.default_branch_name, lsn=lsn, config_lines=config_lines, ) @@ -1616,7 +1592,7 @@ class SafekeeperMetrics: class SafekeeperHttpClient(requests.Session): - def __init__(self, port: int) -> None: + def __init__(self, port: int): super().__init__() self.port = port @@ -1743,7 +1719,7 @@ def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Pos pg.stop() # Take a basebackup from pageserver - restored_dir_path = os.path.join(env.repo_dir, f"{pg.node_name}_restored_datadir") + restored_dir_path = os.path.join(env.repo_dir, f"{pg.branch_name}_restored_datadir") mkdir_if_needed(restored_dir_path) pg_bin = PgBin(test_output_dir) diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index dda31ba692..def9753347 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -30,18 +30,16 @@ def test_bulk_tenant_create( for i in range(tenants_count): start = timeit.default_timer() - (tenant, tenant_initial_timeline_id) = env.zenith_cli.create_tenant() - new_timeline_id = env.zenith_cli.branch_timeline( - tenant_id=tenant, ancestor_timeline_id=tenant_initial_timeline_id) + tenant = env.zenith_cli.create_tenant() + env.zenith_cli.create_branch( + f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant_id=tenant) # FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now? 
#if use_wal_acceptors == 'with_wa': # wa_factory.start_n_new(3) pg_tenant = env.postgres.create_start( - f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", - tenant, - timeline_id=new_timeline_id) + f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant) end = timeit.default_timer() time_slices.append(end - start) diff --git a/test_runner/performance/test_parallel_copy_to.py b/test_runner/performance/test_parallel_copy_to.py index 0ee0a37ebb..e4388ce8e2 100644 --- a/test_runner/performance/test_parallel_copy_to.py +++ b/test_runner/performance/test_parallel_copy_to.py @@ -1,5 +1,6 @@ from io import BytesIO import asyncio +import asyncpg from fixtures.zenith_fixtures import ZenithEnv, Postgres, PgProtocol from fixtures.log_helper import log from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py index 994544666b..56c735e87c 100644 --- a/test_runner/test_broken.py +++ b/test_runner/test_broken.py @@ -21,8 +21,8 @@ run_broken = pytest.mark.skipif(os.environ.get('RUN_BROKEN') is None, def test_broken(zenith_simple_env: ZenithEnv, pg_bin): env = zenith_simple_env - new_timeline_id = env.zenith_cli.branch_timeline() - env.postgres.create_start("test_broken", timeline_id=new_timeline_id) + env.zenith_cli.create_branch("test_broken", "empty") + env.postgres.create_start("test_broken") log.info('postgres is running') log.info('THIS NEXT COMMAND WILL FAIL:') diff --git a/zenith/src/main.rs b/zenith/src/main.rs index dcfeb63309..fb0b230c2c 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -1,4 +1,4 @@ -use anyhow::{bail, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use clap::{App, AppSettings, Arg, ArgMatches}; use control_plane::compute::ComputeControlPlane; use control_plane::local_env; @@ -19,7 +19,7 @@ use walkeeper::defaults::{ use zenith_utils::auth::{Claims, Scope}; use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}; use zenith_utils::GIT_VERSION; use pageserver::timelines::TimelineInfo; @@ -27,6 +27,7 @@ use pageserver::timelines::TimelineInfo; // Default id of a safekeeper node, if not specified on the command line. const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1); const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1); +const DEFAULT_BRANCH_NAME: &str = "main"; fn default_conf() -> String { format!( @@ -57,7 +58,7 @@ http_port = {safekeeper_http_port} /// Timelines tree element used as a value in the HashMap. /// struct TimelineTreeEl { - /// `TimelineInfo` received from the `pageserver` via the `timeline_list` libpq API call. + /// `TimelineInfo` received from the `pageserver` via the `timeline_list` http API call. pub info: TimelineInfo, /// Holds all direct children of this timeline referenced using `timeline_id`. 
pub children: BTreeSet, @@ -71,16 +72,15 @@ struct TimelineTreeEl { // * Providing CLI api to the pageserver // * TODO: export/import to/from usual postgres fn main() -> Result<()> { - let pg_node_arg = Arg::new("node").help("Node name").required(true); + let branch_name_arg = Arg::new("name") + .long("name") + .short('n') + .takes_value(true) + .help("Name of the branch to be created or used as an alias for other services") + .required(false); let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false); - let timeline_id_arg = Arg::new("timeline-id") - .long("timeline-id") - .help("Timeline id. Represented as a hexadecimal string 32 symbols length") - .takes_value(true) - .required(false); - let tenant_id_arg = Arg::new("tenant-id") .long("tenant-id") .help("Tenant id. Represented as a hexadecimal string 32 symbols length") @@ -137,15 +137,15 @@ fn main() -> Result<()> { .subcommand(App::new("branch") .about("Create a new timeline, using another timeline as a base, copying its data") .arg(tenant_id_arg.clone()) - .arg(timeline_id_arg.clone().help("Id of the new timeline, optional. If not specified, it will be generated randomly")) - .arg(Arg::new("ancestor-timeline-id").long("ancestor-timeline-id").takes_value(true) - .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline").required(false)) + .arg(branch_name_arg.clone()) + .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name").takes_value(true) + .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(true)) .arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn").takes_value(true) .help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false))) .subcommand(App::new("create") .about("Create a new blank timeline") .arg(tenant_id_arg.clone()) - .arg(timeline_id_arg.clone().help("Id of the new timeline, optional. 
If not specified, it will be generated randomly"))) + .arg(branch_name_arg.clone())) ).subcommand( App::new("tenant") .setting(AppSettings::ArgRequiredElseHelp) @@ -189,8 +189,7 @@ fn main() -> Result<()> { .subcommand(App::new("list").arg(tenant_id_arg.clone())) .subcommand(App::new("create") .about("Create a postgres compute node") - .arg(pg_node_arg.clone()) - .arg(timeline_id_arg.clone()) + .arg(branch_name_arg.clone()) .arg(tenant_id_arg.clone()) .arg(lsn_arg.clone()) .arg(port_arg.clone()) @@ -202,14 +201,13 @@ fn main() -> Result<()> { )) .subcommand(App::new("start") .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") - .arg(pg_node_arg.clone()) - .arg(timeline_id_arg.clone()) + .arg(branch_name_arg.clone()) .arg(tenant_id_arg.clone()) .arg(lsn_arg.clone()) .arg(port_arg.clone())) .subcommand( App::new("stop") - .arg(pg_node_arg.clone()) + .arg(branch_name_arg.clone()) .arg(tenant_id_arg.clone()) .arg( Arg::new("destroy") @@ -242,24 +240,26 @@ fn main() -> Result<()> { handle_init(sub_args) } else { // all other commands need an existing config - let mut env = match LocalEnv::load_config() { - Ok(conf) => conf, - Err(e) => { - eprintln!("Error loading config: {}", e); - exit(1); - } - }; + let mut env = LocalEnv::load_config().context("Error loading config")?; + let original_env = env.clone(); - match sub_name { + let subcommand_result = match sub_name { "tenant" => handle_tenant(sub_args, &mut env), - "timeline" => handle_timeline(sub_args, &env), + "timeline" => handle_timeline(sub_args, &mut env), "start" => handle_start_all(sub_args, &env), "stop" => handle_stop_all(sub_args, &env), "pageserver" => handle_pageserver(sub_args, &env), "pg" => handle_pg(sub_args, &env), "safekeeper" => handle_safekeeper(sub_args, &env), _ => bail!("unexpected subcommand {}", sub_name), + }; + + if subcommand_result.is_ok() && original_env != env { + eprintln!("Subcommand had changed the config, updating"); + env.persist_config(&env.base_data_dir)?; } + + subcommand_result }; if let Err(e) = subcmd_result { eprintln!("command failed: {:#}", e); @@ -423,21 +423,6 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R } } -fn get_timeline_id( - sub_match: &ArgMatches, - tenant_id: ZTenantId, - env: &local_env::LocalEnv, -) -> anyhow::Result { - if let Some(timeline_id) = sub_match.value_of("timeline-id") { - Ok(ZTimelineId::from_str(timeline_id) - .context("Failed to parse timeline id from arguments")?) 
- } else if let Some(&initial_timeline_id) = env.initial_timelines.get(&tenant_id) { - Ok(initial_timeline_id) - } else { - bail!("No timeline id, specify one in the subcommand's arguments"); - } -} - fn handle_init(init_match: &ArgMatches) -> Result<()> { // Create config file let toml_file: String = if let Some(config_path) = init_match.value_of("config") { @@ -491,7 +476,10 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re }; println!("using tenant id {}", tenant_id); let initial_timeline_id = pageserver.tenant_create(tenant_id)?; - env.initial_timelines.insert(tenant_id, initial_timeline_id); + env.branch_name_mappings.insert( + DEFAULT_BRANCH_NAME.to_owned(), + ZTenantTimelineId::new(tenant_id, initial_timeline_id), + ); println!( "tenant {} successfully created on the pageserver, initial timeline: '{}'", tenant_id, initial_timeline_id @@ -503,7 +491,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re Ok(()) } -fn handle_timeline(timeline_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { let pageserver = PageServerNode::from_env(env); match timeline_match.subcommand() { @@ -514,18 +502,28 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &local_env::LocalEnv) -> Re } Some(("create", create_match)) => { let tenant_id = get_tenant_id(create_match, env)?; - let timeline_id = get_timeline_id(create_match, tenant_id, env) - .unwrap_or_else(|_| ZTimelineId::generate()); - let timeline = pageserver.timeline_create(tenant_id, timeline_id, None, None)?; + let new_timeline_id = ZTimelineId::generate(); + let new_branch_name = create_match + .value_of("name") + .ok_or(anyhow!("No branch name provided"))?; + let timeline = pageserver.timeline_create(tenant_id, new_timeline_id, None, None)?; let last_record_lsn = match timeline { TimelineInfo::Local { last_record_lsn, .. } => last_record_lsn, TimelineInfo::Remote { .. } => { - bail!("Timeline {} was created as remote, not local", timeline_id) + bail!( + "Timeline {} was created as remote, not local", + new_timeline_id + ) } }; + env.branch_name_mappings.insert( + new_branch_name.to_string(), + ZTenantTimelineId::new(tenant_id, new_timeline_id), + ); + println!( "Created timeline '{}' at Lsn {} for tenant: {}", timeline.timeline_id(), @@ -535,18 +533,22 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &local_env::LocalEnv) -> Re } Some(("branch", branch_match)) => { let tenant_id = get_tenant_id(branch_match, env)?; - let timeline_id = get_timeline_id(branch_match, tenant_id, env) - .unwrap_or_else(|_| ZTimelineId::generate()); - let ancestor_timeline_id = match branch_match - .value_of("ancestor-timeline-id") - .map(ZTimelineId::from_str) - .transpose() - .context("Failed to parse ancestor timeline id from the request")? - .or_else(|| env.initial_timelines.get(&tenant_id).copied()) - { - Some(id) => id, - None => bail!("No ancestor timeline id provided"), - }; + let new_timeline_id = ZTimelineId::generate(); + let new_branch_name = branch_match + .value_of("name") + .ok_or(anyhow!("No branch name provided"))?; + let ancestor_branch_name = branch_match + .value_of("ancestor-branch-name") + .ok_or(anyhow!("No ancestor branch name provided"))?; + let ancestor_timeline_id = env + .branch_name_mappings + .get(ancestor_branch_name) + .ok_or(anyhow!( + "Found no timeline id for branch name '{}'", + ancestor_branch_name + ))? 
+ .timeline_id; + let start_lsn = branch_match .value_of("ancestor-start-lsn") .map(Lsn::from_str) @@ -554,7 +556,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .context("Failed to parse ancestor start Lsn from the request")?; let timeline = pageserver.timeline_create( tenant_id, - timeline_id, + new_timeline_id, start_lsn, Some(ancestor_timeline_id), )?; @@ -563,16 +565,23 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &local_env::LocalEnv) -> Re TimelineInfo::Local { last_record_lsn, .. } => last_record_lsn, - TimelineInfo::Remote { .. } => { - bail!("Timeline {} was created as remote, not local", timeline_id) - } + TimelineInfo::Remote { .. } => bail!( + "Timeline {} was created as remote, not local", + new_timeline_id + ), }; + + env.branch_name_mappings.insert( + new_branch_name.to_string(), + ZTenantTimelineId::new(tenant_id, new_timeline_id), + ); + println!( "Created timeline '{}' at Lsn {} for tenant: {}. Ancestor timeline: '{}'", timeline.timeline_id(), last_record_lsn, tenant_id, - ancestor_timeline_id, + ancestor_branch_name, ); } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), @@ -592,6 +601,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { // All subcommands take an optional --tenant-id option let tenant_id = get_tenant_id(sub_args, env)?; + let node_name = sub_args.value_of("name").unwrap_or(DEFAULT_BRANCH_NAME); match sub_name { "list" => { @@ -630,13 +640,16 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { } } "create" => { - let node_name = sub_args.value_of("node").unwrap_or("main"); let lsn = sub_args .value_of("lsn") .map(Lsn::from_str) .transpose() .context("Failed to parse Lsn from the request")?; - let timeline_id = get_timeline_id(sub_args, tenant_id, env)?; + let timeline_id = env + .branch_name_mappings + .get(node_name) + .ok_or(anyhow!("Found no timeline id for node name {}", node_name))? + .timeline_id; let port: Option = match sub_args.value_of("port") { Some(p) => Some(p.parse()?), @@ -645,8 +658,6 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { cplane.new_node(tenant_id, node_name, timeline_id, lsn, port)?; } "start" => { - let node_name = sub_args.value_of("node").unwrap_or("main"); - let port: Option = match sub_args.value_of("port") { Some(p) => Some(p.parse()?), None => None, @@ -666,7 +677,11 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { println!("Starting existing postgres {}...", node_name); node.start(&auth_token)?; } else { - let timeline_id = get_timeline_id(sub_args, tenant_id, env)?; + let timeline_id = env + .branch_name_mappings + .get(node_name) + .ok_or(anyhow!("Found no timeline id for node name {}", node_name))? + .timeline_id; let lsn = sub_args .value_of("lsn") .map(Lsn::from_str) @@ -686,7 +701,6 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { } } "stop" => { - let node_name = sub_args.value_of("node").unwrap_or("main"); let destroy = sub_args.is_present("destroy"); let node = cplane diff --git a/zenith_utils/src/zid.rs b/zenith_utils/src/zid.rs index a740d4fb48..89708ee0df 100644 --- a/zenith_utils/src/zid.rs +++ b/zenith_utils/src/zid.rs @@ -317,7 +317,7 @@ zid_newtype!(ZTenantId); mutual_from!(ZTenantId, HexZTenantId); // A pair uniquely identifying Zenith instance. 
-#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct ZTenantTimelineId { pub tenant_id: ZTenantId, pub timeline_id: ZTimelineId, From c7569dce472182016e7e2925c5fc8a9e93c407f0 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 24 Feb 2022 20:35:41 +0200 Subject: [PATCH 0021/1022] Allow passing initial timeline id into zenith CLI commands --- control_plane/src/storage.rs | 68 +++++++++++++--- pageserver/src/bin/pageserver.rs | 30 ++++++- pageserver/src/http/models.rs | 4 + pageserver/src/http/routes.rs | 8 +- pageserver/src/tenant_mgr.rs | 12 +-- pageserver/src/timelines.rs | 32 ++++---- test_runner/fixtures/zenith_fixtures.py | 6 +- zenith/src/main.rs | 101 ++++++++++++++++-------- 8 files changed, 192 insertions(+), 69 deletions(-) diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 9d5a88784d..e18be05cea 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -5,7 +5,7 @@ use std::process::Command; use std::time::Duration; use std::{io, result, thread}; -use anyhow::bail; +use anyhow::{bail, Context}; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; @@ -99,9 +99,10 @@ impl PageServerNode { pub fn init( &self, - create_tenant: Option<&str>, + create_tenant: Option<ZTenantId>, + initial_timeline_id: Option<ZTimelineId>, config_overrides: &[&str], - ) -> anyhow::Result<()> { + ) -> anyhow::Result<ZTimelineId> { let mut cmd = Command::new(self.env.pageserver_bin()?); let id = format!("id={}", self.env.pageserver.id); @@ -138,19 +139,29 @@ impl PageServerNode { ]); } - if let Some(tenantid) = create_tenant { - args.extend(["--create-tenant", tenantid]) + let create_tenant = create_tenant.map(|id| id.to_string()); + if let Some(tenant_id) = create_tenant.as_deref() { + args.extend(["--create-tenant", tenant_id]) } - let status = fill_rust_env_vars(cmd.args(args)) - .status() - .expect("pageserver init failed"); + let initial_timeline_id_str = initial_timeline_id.map(|id| id.to_string()); + if let Some(timeline_id) = initial_timeline_id_str.as_deref() { + args.extend(["--initial-timeline-id", timeline_id]) + } - if !status.success() { + let init_output = fill_rust_env_vars(cmd.args(args)) + .output() + .context("pageserver init failed")?; + + if !init_output.status.success() { bail!("pageserver init failed"); } - Ok(()) + if let Some(initial_timeline_id) = initial_timeline_id { + Ok(initial_timeline_id) + } else { + extract_initial_timeline_id(init_output.stdout) + } } pub fn repo_path(&self) -> PathBuf { @@ -325,11 +336,16 @@ impl PageServerNode { .json()?) } - pub fn tenant_create(&self, tenantid: ZTenantId) -> Result<ZTimelineId> { + pub fn tenant_create( + &self, + tenant_id: ZTenantId, + initial_timeline_id: Option<ZTimelineId>, + ) -> Result<ZTimelineId> { Ok(self .http_request(Method::POST, format!("{}/{}", self.http_base_url, "tenant")) .json(&TenantCreateRequest { - tenant_id: tenantid, + tenant_id, + initial_timeline_id, }) .send()? .error_from_body()? @@ -367,3 +383,31 @@ impl PageServerNode { .json()?)
} } + +fn extract_initial_timeline_id(init_stdout: Vec<u8>) -> anyhow::Result<ZTimelineId> { + let output_string = + String::from_utf8(init_stdout).context("Init stdout is not a valid unicode")?; + + let string_with_timeline_id = match output_string.split_once("created initial timeline ") { + Some((_, string_with_timeline_id)) => string_with_timeline_id, + None => bail!( + "Found no line with timeline id in the init output: '{}'", + output_string + ), + }; + + let timeline_id_str = match string_with_timeline_id.split_once(' ') { + Some((timeline_id_str, _)) => timeline_id_str, + None => bail!( + "Found no timeline id in the init output: '{}'", + output_string + ), + }; + + timeline_id_str.parse().with_context(|| { + format!( + "Failed to parse timeline id from string, extracted from the init output: '{}'", + timeline_id_str + ) + }) +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2fa772af58..83b128dd74 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -2,7 +2,14 @@ use std::{env, path::Path, str::FromStr}; use tracing::*; -use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener, GIT_VERSION}; +use zenith_utils::{ + auth::JwtAuth, + logging, + postgres_backend::AuthType, + tcp_listener, + zid::{ZTenantId, ZTimelineId}, + GIT_VERSION, +}; use anyhow::{bail, Context, Result}; @@ -52,6 +59,13 @@ fn main() -> Result<()> { .help("Create tenant during init") .requires("init"), ) + .arg( + Arg::new("initial-timeline-id") + .long("initial-timeline-id") + .takes_value(true) + .help("Use a specific timeline id during init and tenant creation") + .requires("create-tenant"), + ) // See `settings.md` for more details on the extra configuration patameters pageserver can process .arg( Arg::new("config-override") @@ -71,7 +85,16 @@ fn main() -> Result<()> { let cfg_file_path = workdir.join("pageserver.toml"); let init = arg_matches.is_present("init"); - let create_tenant = arg_matches.value_of("create-tenant"); + let create_tenant = arg_matches + .value_of("create-tenant") + .map(ZTenantId::from_str) + .transpose() + .context("Failed to parse tenant id from the arguments")?; + let initial_timeline_id = arg_matches + .value_of("initial-timeline-id") + .map(ZTimelineId::from_str) + .transpose() + .context("Failed to parse timeline id from the arguments")?; // Set CWD to workdir for non-daemon modes env::set_current_dir(&workdir).with_context(|| { @@ -142,7 +165,8 @@ fn main() -> Result<()> { // Create repo and exit if init was requested if init { - timelines::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?; + timelines::init_pageserver(conf, create_tenant, initial_timeline_id) + .context("Failed to init pageserver")?; // write the config file std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| { format!( diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 7f95c64527..04ccb9708e 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -11,6 +11,7 @@ pub struct TimelineCreateRequest { pub tenant_id: ZTenantId, #[serde(with = "hex")] pub timeline_id: ZTimelineId, + #[serde(default)] #[serde(with = "opt_display_serde")] pub ancestor_timeline_id: Option<ZTimelineId>, pub start_lsn: Option<Lsn>, } @@ -20,6 +21,9 @@ pub struct TenantCreateRequest { #[serde(with = "hex")] pub tenant_id: ZTenantId, + #[serde(default)] + #[serde(with = "opt_display_serde")] + pub initial_timeline_id: Option<ZTimelineId>, } #[derive(Serialize)] diff
--git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f332e59135..45b0c3d4be 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -214,8 +214,12 @@ async fn tenant_create_handler(mut request: Request) -> Result, ) -> Result { - let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid)); - let (initial_timeline_id, repo) = timelines::create_repo(conf, tenantid, wal_redo_manager)?; + let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); + let (initial_timeline_id, repo) = + timelines::create_repo(conf, tenant_id, initial_timeline_id, wal_redo_manager)?; - match access_tenants().entry(tenantid) { - hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenantid), + match access_tenants().entry(tenant_id) { + hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenant_id), hash_map::Entry::Vacant(v) => { v.insert(Tenant { state: TenantState::Idle, diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 8b4dc57342..b97ab045c7 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -9,7 +9,6 @@ use std::{ fs, path::Path, process::{Command, Stdio}, - str::FromStr, sync::Arc, }; use tracing::*; @@ -150,7 +149,11 @@ pub struct PointInTime { pub lsn: Lsn, } -pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> { +pub fn init_pageserver( + conf: &'static PageServerConf, + create_tenant: Option, + initial_timeline_id: Option, +) -> Result<()> { // Initialize logger // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages let _log_file = logging::init(LOG_FILE_NAME, true)?; @@ -167,10 +170,10 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str // anymore, but I think that could still happen. 
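When neither id is supplied, `create_repo` below falls back to generating a fresh `ZTimelineId`, and the control plane recovers the generated id by scanning the `pageserver init` stdout for the `created initial timeline <id> ` marker (the `extract_initial_timeline_id` helper earlier in this patch). A minimal std-only sketch of that parsing contract, with a made-up stdout line, might look like this:

```rust
/// Sketch of the stdout contract: the timeline id is the whitespace-delimited
/// token immediately after the marker phrase, and something must follow it.
fn parse_initial_timeline_id(stdout: &str) -> Option<&str> {
    let (_, rest) = stdout.split_once("created initial timeline ")?;
    let (id, _) = rest.split_once(' ')?;
    Some(id)
}

fn main() {
    // Hypothetical output; a real ZTimelineId is a 32-character hex string.
    let out = "created initial timeline 5b014a9e41b4b63ce1a1febc04503636 at startup\n";
    assert_eq!(
        parse_initial_timeline_id(out),
        Some("5b014a9e41b4b63ce1a1febc04503636")
    );
}
```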
let dummy_redo_mgr = Arc::new(crate::walredo::DummyRedoManager {}); - if let Some(tenantid) = create_tenant { - let tenantid = ZTenantId::from_str(tenantid)?; - println!("initializing tenantid {}", tenantid); - create_repo(conf, tenantid, dummy_redo_mgr).context("failed to create repo")?; + if let Some(tenant_id) = create_tenant { + println!("initializing tenantid {}", tenant_id); + create_repo(conf, tenant_id, initial_timeline_id, dummy_redo_mgr) + .context("failed to create repo")?; } crashsafe_dir::create_dir_all(conf.tenants_path())?; @@ -180,39 +183,40 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str pub fn create_repo( conf: &'static PageServerConf, - tenantid: ZTenantId, + tenant_id: ZTenantId, + init_timeline_id: Option, wal_redo_manager: Arc, ) -> Result<(ZTimelineId, Arc)> { - let repo_dir = conf.tenant_path(&tenantid); + let repo_dir = conf.tenant_path(&tenant_id); if repo_dir.exists() { - bail!("repo for {} already exists", tenantid) + bail!("repo for {} already exists", tenant_id) } // top-level dir may exist if we are creating it through CLI crashsafe_dir::create_dir_all(&repo_dir) .with_context(|| format!("could not create directory {}", repo_dir.display()))?; - crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?; + crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; info!("created directory structure in {}", repo_dir.display()); // create a new timeline directory - let timeline_id = ZTimelineId::generate(); - let timelinedir = conf.timeline_path(&timeline_id, &tenantid); + let timeline_id = init_timeline_id.unwrap_or_else(|| ZTimelineId::generate()); + let timelinedir = conf.timeline_path(&timeline_id, &tenant_id); crashsafe_dir::create_dir(&timelinedir)?; let repo = Arc::new(crate::layered_repository::LayeredRepository::new( conf, wal_redo_manager, - tenantid, + tenant_id, conf.remote_storage_config.is_some(), )); // Load data into pageserver // TODO To implement zenith import we need to // move data loading out of create_repo() - bootstrap_timeline(conf, tenantid, timeline_id, repo.as_ref())?; + bootstrap_timeline(conf, tenant_id, timeline_id, repo.as_ref())?; Ok((timeline_id, repo)) } diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 9345c7f238..c283bea48e 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -869,12 +869,16 @@ class ZenithCli: res.stdout.strip().split("\n"))) return branches_cli - def init(self, config_toml: str) -> 'subprocess.CompletedProcess[str]': + def init(self, + config_toml: str, + initial_timeline_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]': with tempfile.NamedTemporaryFile(mode='w+') as tmp: tmp.write(config_toml) tmp.flush() cmd = ['init', f'--config={tmp.name}'] + if initial_timeline_id: + cmd.extend(['--timeline-id', initial_timeline_id.hex]) append_pageserver_param_overrides(cmd, self.env.pageserver.remote_storage, self.env.pageserver.config_override) diff --git a/zenith/src/main.rs b/zenith/src/main.rs index fb0b230c2c..34cab4b381 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -87,6 +87,12 @@ fn main() -> Result<()> { .takes_value(true) .required(false); + let timeline_id_arg = Arg::new("timeline-id") + .long("timeline-id") + .help("Timeline id. 
Represented as a hexadecimal string 32 symbols length") + .takes_value(true) + .required(false); + let port_arg = Arg::new("port") .long("port") .required(false) @@ -121,6 +127,7 @@ fn main() -> Result<()> { App::new("init") .about("Initialize a new Zenith repository") .arg(pageserver_config_args.clone()) + .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) .arg( Arg::new("config") .long("config") @@ -151,7 +158,10 @@ fn main() -> Result<()> { .setting(AppSettings::ArgRequiredElseHelp) .about("Manage tenants") .subcommand(App::new("list")) - .subcommand(App::new("create").arg(tenant_id_arg.clone())) + .subcommand(App::new("create") + .arg(tenant_id_arg.clone()) + .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) + ) ) .subcommand( App::new("pageserver") @@ -236,8 +246,8 @@ fn main() -> Result<()> { }; // Check for 'zenith init' command first. - let subcmd_result = if sub_name == "init" { - handle_init(sub_args) + let subcommand_result = if sub_name == "init" { + handle_init(sub_args).map(Some) } else { // all other commands need an existing config let mut env = LocalEnv::load_config().context("Error loading config")?; @@ -254,18 +264,21 @@ fn main() -> Result<()> { _ => bail!("unexpected subcommand {}", sub_name), }; - if subcommand_result.is_ok() && original_env != env { - eprintln!("Subcommand had changed the config, updating"); - env.persist_config(&env.base_data_dir)?; + if original_env != env { + subcommand_result.map(|()| Some(env)) + } else { + subcommand_result.map(|()| None) } - - subcommand_result }; - if let Err(e) = subcmd_result { - eprintln!("command failed: {:#}", e); - exit(1); - } + match subcommand_result { + Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?, + Ok(None) => (), + Err(e) => { + eprintln!("command failed: {:?}", e); + exit(1); + } + } Ok(()) } @@ -411,11 +424,8 @@ fn get_timeline_infos( // Helper function to parse --tenant_id option, or get the default from config file fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result { - if let Some(tenantid_cmd) = sub_match.value_of("tenant-id") { - Ok( - ZTenantId::from_str(tenantid_cmd) - .context("Failed to parse tenant id from arguments")?, - ) + if let Some(tenant_id_from_arguments) = parse_tenant_id(sub_match).transpose() { + tenant_id_from_arguments } else if let Some(tenantid_conf) = env.default_tenant_id { Ok(ZTenantId::from(tenantid_conf)) } else { @@ -423,7 +433,25 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R } } -fn handle_init(init_match: &ArgMatches) -> Result<()> { +fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { + sub_match + .value_of("tenant-id") + .map(ZTenantId::from_str) + .transpose() + .context("Failed to parse tenant id from the argument string") +} + +fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { + sub_match + .value_of("timeline-id") + .map(ZTimelineId::from_str) + .transpose() + .context("Failed to parse timeline id from the argument string") +} + +fn handle_init(init_match: &ArgMatches) -> Result { + let initial_timeline_id_arg = parse_timeline_id(init_match)?; + // Create config file let toml_file: String = if let Some(config_path) = init_match.value_of("config") { // load and parse the file @@ -439,18 +467,28 @@ fn handle_init(init_match: &ArgMatches) -> Result<()> { env.init() .context("Failed to initialize zenith repository")?; + 
// default_tenantid was generated by the `env.init()` call above + let initial_tenant_id = env.default_tenant_id.unwrap(); + // Call 'pageserver init'. let pageserver = PageServerNode::from_env(&env); - if let Err(e) = pageserver.init( - // default_tenantid was generated by the `env.init()` call above - Some(&ZTenantId::from(env.default_tenant_id.unwrap()).to_string()), - &pageserver_config_overrides(init_match), - ) { - eprintln!("pageserver init failed: {}", e); - exit(1); - } + let initial_timeline_id = pageserver + .init( + Some(initial_tenant_id), + initial_timeline_id_arg, + &pageserver_config_overrides(init_match), + ) + .unwrap_or_else(|e| { + eprintln!("pageserver init failed: {}", e); + exit(1); + }); - Ok(()) + env.branch_name_mappings.insert( + DEFAULT_BRANCH_NAME.to_owned(), + ZTenantTimelineId::new(initial_tenant_id, initial_timeline_id), + ); + + Ok(env) } fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { @@ -470,12 +508,11 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re } } Some(("create", create_match)) => { - let tenant_id = match create_match.value_of("tenant-id") { - Some(id) => ZTenantId::from_str(id)?, - None => ZTenantId::generate(), - }; + let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(|| ZTenantId::generate()); println!("using tenant id {}", tenant_id); - let initial_timeline_id = pageserver.tenant_create(tenant_id)?; + let initial_timeline_id_argument = parse_timeline_id(create_match)?; + let initial_timeline_id = + pageserver.tenant_create(tenant_id, initial_timeline_id_argument)?; env.branch_name_mappings.insert( DEFAULT_BRANCH_NAME.to_owned(), ZTenantTimelineId::new(tenant_id, initial_timeline_id), From 7b5482bac0f052913a056649d839792f457f1019 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 24 Feb 2022 23:28:30 +0200 Subject: [PATCH 0022/1022] Properly store the branch name mappings --- control_plane/src/compute.rs | 2 +- control_plane/src/local_env.rs | 32 +++++- pageserver/src/timelines.rs | 2 +- .../batch_others/test_readonly_node.py | 16 ++- .../batch_others/test_remote_storage.py | 4 +- .../batch_others/test_tenant_relocation.py | 4 +- test_runner/fixtures/zenith_fixtures.py | 79 +++++++------ .../performance/test_bulk_tenant_create.py | 2 +- zenith/src/main.rs | 105 ++++++++++-------- zenith_utils/src/zid.rs | 2 +- 10 files changed, 158 insertions(+), 90 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index e8baffdc74..64cd46fef6 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -37,7 +37,7 @@ impl ComputeControlPlane { // pgdatadirs // |- tenants // | |- - // | | |- + // | | |- pub fn load(env: LocalEnv) -> Result { let pageserver = Arc::new(PageServerNode::from_env(&env)); diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 9b50a6b9e4..a9352bdfcc 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -12,7 +12,8 @@ use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use zenith_utils::auth::{encode_from_key_file, Claims, Scope}; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{HexZTenantId, ZNodeId, ZTenantId, ZTenantTimelineId}; +use zenith_utils::zid::ZTimelineId; +use zenith_utils::zid::{HexZTenantId, ZNodeId, ZTenantId}; use crate::safekeeper::SafekeeperNode; @@ -62,7 +63,10 @@ pub struct LocalEnv { /// Every tenant has a first timeline created for it, currently the only one ancestor-less for this tenant. 
/// It is used as a default timeline for branching, if no ancestor timeline is specified. #[serde(default)] - pub branch_name_mappings: HashMap, + // A `HashMap>` would be more appropriate here, + // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. + // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". + branch_name_mappings: HashMap>, } #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] @@ -149,6 +153,30 @@ impl LocalEnv { self.base_data_dir.join("safekeepers").join(data_dir_name) } + pub fn register_branch_mapping( + &mut self, + branch_name: String, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + ) { + self.branch_name_mappings + .entry(branch_name) + .or_default() + .push((tenant_id, timeline_id)); + } + + pub fn get_branch_timeline_id( + &self, + branch_name: &str, + tenant_id: ZTenantId, + ) -> Option { + self.branch_name_mappings + .get(branch_name)? + .iter() + .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) + .map(|&(_, timeline_id)| timeline_id) + } + /// Create a LocalEnv from a config file. /// /// Unlike 'load_config', this function fills in any defaults that are missing diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index b97ab045c7..786e102747 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -201,7 +201,7 @@ pub fn create_repo( info!("created directory structure in {}", repo_dir.display()); // create a new timeline directory - let timeline_id = init_timeline_id.unwrap_or_else(|| ZTimelineId::generate()); + let timeline_id = init_timeline_id.unwrap_or_else(ZTimelineId::generate); let timelinedir = conf.timeline_path(&timeline_id, &tenant_id); crashsafe_dir::create_dir(&timelinedir)?; diff --git a/test_runner/batch_others/test_readonly_node.py b/test_runner/batch_others/test_readonly_node.py index 5d5949add6..808ee62def 100644 --- a/test_runner/batch_others/test_readonly_node.py +++ b/test_runner/batch_others/test_readonly_node.py @@ -52,10 +52,14 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): log.info('LSN after 400100 rows: ' + lsn_c) # Create first read-only node at the point where only 100 rows were inserted - pg_hundred = env.postgres.create_start("test_readonly_node_hundred", lsn=lsn_a) + pg_hundred = env.postgres.create_start(branch_name='test_readonly_node', + node_name='test_readonly_node_hundred', + lsn=lsn_a) # And another at the point where 200100 rows were inserted - pg_more = env.postgres.create_start("test_readonly_node_more", lsn=lsn_b) + pg_more = env.postgres.create_start(branch_name='test_readonly_node', + node_name='test_readonly_node_more', + lsn=lsn_b) # On the 'hundred' node, we should see only 100 rows hundred_pg_conn = pg_hundred.connect() @@ -74,7 +78,9 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): assert main_cur.fetchone() == (400100, ) # Check creating a node at segment boundary - pg = env.postgres.create_start("test_branch_segment_boundary", lsn='0/3000000') + pg = env.postgres.create_start(branch_name='test_readonly_node', + node_name='test_branch_segment_boundary', + lsn='0/3000000') cur = pg.connect().cursor() cur.execute('SELECT 1') assert cur.fetchone() == (1, ) @@ -82,4 +88,6 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): # Create node at pre-initdb lsn with pytest.raises(Exception, match="invalid basebackup lsn"): # compute node startup with invalid LSN should fail - env.zenith_cli.pg_start("test_readonly_node_preinitdb", lsn="0/42") + 
env.postgres.create_start(branch_name='test_readonly_node', + node_name='test_readonly_node_preinitdb', + lsn='0/42') diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 61feb1a5bd..abd06bf5e9 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -43,7 +43,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, ##### First start, insert secret data and upload it to the remote storage env = zenith_env_builder.init_start() - pg = env.postgres.create_start() + pg = env.postgres.create_start('main') tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] @@ -94,7 +94,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, log.debug("still waiting") time.sleep(1) - pg = env.postgres.create_start() + pg = env.postgres.create_start('main') with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute(f'SELECT secret FROM t1 WHERE id = {data_id};') diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index a1286adfb0..7a9d478f16 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -132,7 +132,9 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, env.zenith_cli.create_branch('test_tenant_relocation', tenant_id=tenant) - tenant_pg = env.postgres.create_start("test_tenant_relocation", tenant_id=tenant) + tenant_pg = env.postgres.create_start(branch_name='main', + node_name='test_tenant_relocation', + tenant_id=tenant) # insert some data with closing(tenant_pg.connect()) as conn: diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index c283bea48e..04b795b244 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -835,7 +835,7 @@ class ZenithCli: cmd = [ 'timeline', 'branch', - '--name', + '--branch-name', new_branch_name, '--tenant-id', (tenant_id or self.env.initial_tenant).hex, @@ -918,6 +918,7 @@ class ZenithCli: def pg_create( self, branch_name: str, + node_name: Optional[str] = None, tenant_id: Optional[uuid.UUID] = None, lsn: Optional[str] = None, port: Optional[int] = None, @@ -925,21 +926,25 @@ class ZenithCli: args = [ 'pg', 'create', - '--tenant-id', (tenant_id or self.env.initial_tenant).hex, - '--name', - branch_name + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + '--branch-name', + branch_name, ] if lsn is not None: - args.append(f'--lsn={lsn}') + args.extend(['--lsn', lsn]) if port is not None: - args.append(f'--port={port}') + args.extend(['--port', str(port)]) + if node_name is not None: + args.append(node_name) + res = self.raw_cli(args) res.check_returncode() return res def pg_start( self, - branch_name: str, + node_name: str, tenant_id: Optional[uuid.UUID] = None, lsn: Optional[str] = None, port: Optional[int] = None, @@ -949,13 +954,13 @@ class ZenithCli: 'start', '--tenant-id', (tenant_id or self.env.initial_tenant).hex, - '--name', - branch_name, ] if lsn is not None: args.append(f'--lsn={lsn}') if port is not None: args.append(f'--port={port}') + if node_name is not None: + args.append(node_name) res = self.raw_cli(args) res.check_returncode() @@ -963,19 +968,20 @@ class ZenithCli: def pg_stop( self, - branch_name: str, + node_name: str, tenant_id: 
Optional[uuid.UUID] = None, destroy=False, ) -> 'subprocess.CompletedProcess[str]': args = [ 'pg', 'stop', - f'--tenant-id={(tenant_id or self.env.initial_tenant).hex}', - '--name', - branch_name + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, ] if destroy: args.append('--destroy') + if node_name is not None: + args.append(node_name) return self.raw_cli(args) @@ -1286,14 +1292,15 @@ class Postgres(PgProtocol): self.env = env self.running = False - self.branch_name: Optional[str] = None # dubious, see asserts below + self.node_name: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA self.tenant_id = tenant_id - # path to conf is /pgdatadirs/tenants///postgresql.conf + # path to conf is /pgdatadirs/tenants///postgresql.conf def create( self, branch_name: str, + node_name: Optional[str] = None, lsn: Optional[str] = None, config_lines: Optional[List[str]] = None, ) -> 'Postgres': @@ -1305,12 +1312,13 @@ class Postgres(PgProtocol): if not config_lines: config_lines = [] + self.node_name = node_name or f'{branch_name}_pg_node' self.env.zenith_cli.pg_create(branch_name, + node_name=self.node_name, tenant_id=self.tenant_id, lsn=lsn, port=self.port) - self.branch_name = branch_name - path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.branch_name + path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name self.pgdata_dir = os.path.join(self.env.repo_dir, path) if config_lines is None: @@ -1329,11 +1337,11 @@ class Postgres(PgProtocol): Returns self. """ - assert self.branch_name is not None + assert self.node_name is not None - log.info(f"Starting postgres node {self.branch_name}") + log.info(f"Starting postgres node {self.node_name}") - run_result = self.env.zenith_cli.pg_start(self.branch_name, + run_result = self.env.zenith_cli.pg_start(self.node_name, tenant_id=self.tenant_id, port=self.port) self.running = True @@ -1344,8 +1352,8 @@ class Postgres(PgProtocol): def pg_data_dir_path(self) -> str: """ Path to data directory """ - assert self.branch_name - path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.branch_name + assert self.node_name + path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name return os.path.join(self.env.repo_dir, path) def pg_xact_dir_path(self) -> str: @@ -1404,8 +1412,8 @@ class Postgres(PgProtocol): """ if self.running: - assert self.branch_name is not None - self.env.zenith_cli.pg_stop(self.branch_name, self.tenant_id) + assert self.node_name is not None + self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id) self.running = False return self @@ -1416,15 +1424,16 @@ class Postgres(PgProtocol): Returns self. 
""" - assert self.branch_name is not None - self.env.zenith_cli.pg_stop(self.branch_name, self.tenant_id, True) - self.branch_name = None + assert self.node_name is not None + self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, True) + self.node_name = None return self def create_start( self, branch_name: str, + node_name: Optional[str] = None, lsn: Optional[str] = None, config_lines: Optional[List[str]] = None, ) -> 'Postgres': @@ -1436,6 +1445,7 @@ class Postgres(PgProtocol): self.create( branch_name=branch_name, + node_name=node_name, config_lines=config_lines, lsn=lsn, ).start() @@ -1457,7 +1467,8 @@ class PostgresFactory: self.instances: List[Postgres] = [] def create_start(self, - branch_name: Optional[str] = None, + branch_name: str, + node_name: Optional[str] = None, tenant_id: Optional[uuid.UUID] = None, lsn: Optional[str] = None, config_lines: Optional[List[str]] = None) -> Postgres: @@ -1471,13 +1482,15 @@ class PostgresFactory: self.instances.append(pg) return pg.create_start( - branch_name=branch_name or self.env.default_branch_name, + branch_name=branch_name, + node_name=node_name, config_lines=config_lines, lsn=lsn, ) def create(self, - branch_name: Optional[str] = None, + branch_name: str, + node_name: Optional[str] = None, tenant_id: Optional[uuid.UUID] = None, lsn: Optional[str] = None, config_lines: Optional[List[str]] = None) -> Postgres: @@ -1492,7 +1505,8 @@ class PostgresFactory: self.instances.append(pg) return pg.create( - branch_name=branch_name or self.env.default_branch_name, + branch_name=branch_name, + node_name=node_name, lsn=lsn, config_lines=config_lines, ) @@ -1713,6 +1727,7 @@ def list_files_to_compare(pgdata_dir: str): # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Postgres): + # Get the timeline ID. 
We need it for the 'basebackup' command with closing(pg.connect()) as conn: with conn.cursor() as cur: @@ -1723,7 +1738,7 @@ def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Pos pg.stop() # Take a basebackup from pageserver - restored_dir_path = os.path.join(env.repo_dir, f"{pg.branch_name}_restored_datadir") + restored_dir_path = os.path.join(env.repo_dir, f"{pg.node_name}_restored_datadir") mkdir_if_needed(restored_dir_path) pg_bin = PgBin(test_output_dir) diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index def9753347..2430eec33e 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -39,7 +39,7 @@ def test_bulk_tenant_create( # wa_factory.start_n_new(3) pg_tenant = env.postgres.create_start( - f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant) + f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant_id=tenant) end = timeit.default_timer() time_slices.append(end - start) diff --git a/zenith/src/main.rs b/zenith/src/main.rs index 34cab4b381..9a578f79f1 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -19,7 +19,7 @@ use walkeeper::defaults::{ use zenith_utils::auth::{Claims, Scope}; use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; use zenith_utils::GIT_VERSION; use pageserver::timelines::TimelineInfo; @@ -72,13 +72,17 @@ struct TimelineTreeEl { // * Providing CLI api to the pageserver // * TODO: export/import to/from usual postgres fn main() -> Result<()> { - let branch_name_arg = Arg::new("name") - .long("name") - .short('n') + let branch_name_arg = Arg::new("branch-name") + .long("branch-name") .takes_value(true) .help("Name of the branch to be created or used as an alias for other services") .required(false); + let pg_node_arg = Arg::new("node").help("Postgres node name").required(false); + let safekeeper_node_arg = Arg::new("node") + .help("Safekeeper node name") + .required(false); + let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false); let tenant_id_arg = Arg::new("tenant-id") @@ -199,6 +203,7 @@ fn main() -> Result<()> { .subcommand(App::new("list").arg(tenant_id_arg.clone())) .subcommand(App::new("create") .about("Create a postgres compute node") + .arg(pg_node_arg.clone()) .arg(branch_name_arg.clone()) .arg(tenant_id_arg.clone()) .arg(lsn_arg.clone()) @@ -211,20 +216,20 @@ fn main() -> Result<()> { )) .subcommand(App::new("start") .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") - .arg(branch_name_arg.clone()) + .arg(pg_node_arg.clone()) .arg(tenant_id_arg.clone()) .arg(lsn_arg.clone()) .arg(port_arg.clone())) .subcommand( App::new("stop") - .arg(branch_name_arg.clone()) - .arg(tenant_id_arg.clone()) - .arg( - Arg::new("destroy") - .help("Also delete data directory (now optional, should be default in future)") - .long("destroy") - .required(false) - ) + .arg(pg_node_arg.clone()) + .arg(tenant_id_arg.clone()) + .arg( + Arg::new("destroy") + .help("Also delete data directory (now optional, should be default in future)") + .long("destroy") + .required(false) + ) ) ) @@ -483,9 +488,10 @@ fn handle_init(init_match: &ArgMatches) -> Result { exit(1); }); - env.branch_name_mappings.insert( + 
env.register_branch_mapping( DEFAULT_BRANCH_NAME.to_owned(), - ZTenantTimelineId::new(initial_tenant_id, initial_timeline_id), + initial_tenant_id, + initial_timeline_id, ); Ok(env) @@ -508,14 +514,15 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re } } Some(("create", create_match)) => { - let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(|| ZTenantId::generate()); + let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(ZTenantId::generate); println!("using tenant id {}", tenant_id); let initial_timeline_id_argument = parse_timeline_id(create_match)?; let initial_timeline_id = pageserver.tenant_create(tenant_id, initial_timeline_id_argument)?; - env.branch_name_mappings.insert( + env.register_branch_mapping( DEFAULT_BRANCH_NAME.to_owned(), - ZTenantTimelineId::new(tenant_id, initial_timeline_id), + tenant_id, + initial_timeline_id, ); println!( "tenant {} successfully created on the pageserver, initial timeline: '{}'", @@ -541,7 +548,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let tenant_id = get_tenant_id(create_match, env)?; let new_timeline_id = ZTimelineId::generate(); let new_branch_name = create_match - .value_of("name") + .value_of("branch-name") .ok_or(anyhow!("No branch name provided"))?; let timeline = pageserver.timeline_create(tenant_id, new_timeline_id, None, None)?; @@ -556,10 +563,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - ) } }; - env.branch_name_mappings.insert( - new_branch_name.to_string(), - ZTenantTimelineId::new(tenant_id, new_timeline_id), - ); + env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id); println!( "Created timeline '{}' at Lsn {} for tenant: {}", @@ -572,19 +576,19 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let tenant_id = get_tenant_id(branch_match, env)?; let new_timeline_id = ZTimelineId::generate(); let new_branch_name = branch_match - .value_of("name") + .value_of("branch-name") .ok_or(anyhow!("No branch name provided"))?; let ancestor_branch_name = branch_match .value_of("ancestor-branch-name") .ok_or(anyhow!("No ancestor branch name provided"))?; let ancestor_timeline_id = env - .branch_name_mappings - .get(ancestor_branch_name) - .ok_or(anyhow!( - "Found no timeline id for branch name '{}'", - ancestor_branch_name - ))? - .timeline_id; + .get_branch_timeline_id(ancestor_branch_name, tenant_id) + .ok_or_else(|| { + anyhow!( + "Found no timeline id for branch name '{}'", + ancestor_branch_name + ) + })?; let start_lsn = branch_match .value_of("ancestor-start-lsn") @@ -608,10 +612,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - ), }; - env.branch_name_mappings.insert( - new_branch_name.to_string(), - ZTenantTimelineId::new(tenant_id, new_timeline_id), - ); + env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id); println!( "Created timeline '{}' at Lsn {} for tenant: {}. 
Ancestor timeline: '{}'", @@ -638,7 +639,6 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { // All subcommands take an optional --tenant-id option let tenant_id = get_tenant_id(sub_args, env)?; - let node_name = sub_args.value_of("name").unwrap_or(DEFAULT_BRANCH_NAME); match sub_name { "list" => { @@ -677,28 +677,37 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { } } "create" => { + let branch_name = sub_args + .value_of("branch-name") + .unwrap_or(DEFAULT_BRANCH_NAME); + let node_name = sub_args + .value_of("node") + .map(ToString::to_string) + .unwrap_or_else(|| format!("{}_node", branch_name)); + let lsn = sub_args .value_of("lsn") .map(Lsn::from_str) .transpose() .context("Failed to parse Lsn from the request")?; let timeline_id = env - .branch_name_mappings - .get(node_name) - .ok_or(anyhow!("Found no timeline id for node name {}", node_name))? - .timeline_id; + .get_branch_timeline_id(branch_name, tenant_id) + .ok_or_else(|| anyhow!("Found no timeline id for branch name '{}'", branch_name))?; let port: Option = match sub_args.value_of("port") { Some(p) => Some(p.parse()?), None => None, }; - cplane.new_node(tenant_id, node_name, timeline_id, lsn, port)?; + cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port)?; } "start" => { let port: Option = match sub_args.value_of("port") { Some(p) => Some(p.parse()?), None => None, }; + let node_name = sub_args + .value_of("node") + .ok_or_else(|| anyhow!("No node name was provided to start"))?; let node = cplane.nodes.get(&(tenant_id, node_name.to_owned())); @@ -714,11 +723,14 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { println!("Starting existing postgres {}...", node_name); node.start(&auth_token)?; } else { + let branch_name = sub_args + .value_of("branch-name") + .unwrap_or(DEFAULT_BRANCH_NAME); let timeline_id = env - .branch_name_mappings - .get(node_name) - .ok_or(anyhow!("Found no timeline id for node name {}", node_name))? - .timeline_id; + .get_branch_timeline_id(branch_name, tenant_id) + .ok_or_else(|| { + anyhow!("Found no timeline id for branch name '{}'", branch_name) + })?; let lsn = sub_args .value_of("lsn") .map(Lsn::from_str) @@ -738,6 +750,9 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { } } "stop" => { + let node_name = sub_args + .value_of("node") + .ok_or_else(|| anyhow!("No node name was provided to stop"))?; let destroy = sub_args.is_present("destroy"); let node = cplane diff --git a/zenith_utils/src/zid.rs b/zenith_utils/src/zid.rs index 89708ee0df..a740d4fb48 100644 --- a/zenith_utils/src/zid.rs +++ b/zenith_utils/src/zid.rs @@ -317,7 +317,7 @@ zid_newtype!(ZTenantId); mutual_from!(ZTenantId, HexZTenantId); // A pair uniquely identifying Zenith instance. 
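The `Serialize`/`Deserialize` derives added a few patches back are dropped again just below: the branch mappings introduced above persist plain `(ZTenantId, ZTimelineId)` tuples, so nothing serializes the pair directly anymore, while the remaining `Hash`/`Eq` derives still let it act as an in-memory map key. A simplified sketch of that key role, with `u128` standing in for the id types:

```rust
use std::collections::HashMap;

// Stand-in for ZTenantTimelineId: hashable and comparable, no serde required.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct TenantTimelineId {
    tenant_id: u128,   // stand-in for ZTenantId
    timeline_id: u128, // stand-in for ZTimelineId
}

fn main() {
    // Map a (tenant, timeline) pair to a human-readable branch name, the same
    // shape the CLI uses later to label `timeline list` output.
    let mut names: HashMap<TenantTimelineId, String> = HashMap::new();
    let id = TenantTimelineId { tenant_id: 1, timeline_id: 42 };
    names.insert(id, "main".to_owned());
    assert_eq!(names.get(&id).map(String::as_str), Some("main"));
}
```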
-#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)] pub struct ZTenantTimelineId { pub tenant_id: ZTenantId, pub timeline_id: ZTimelineId, From a5e10c4f64d87d286e46c2425699e90f2a5d0baa Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sun, 27 Feb 2022 23:57:00 +0200 Subject: [PATCH 0023/1022] Tidy up pageserver's endpoints --- README.md | 18 ++--- control_plane/src/local_env.rs | 46 ++++++++++-- control_plane/src/storage.rs | 32 ++++---- pageserver/src/http/models.rs | 22 ++++-- pageserver/src/http/openapi_spec.yml | 83 +++++++++------------ pageserver/src/http/routes.rs | 35 +++++---- pageserver/src/tenant_mgr.rs | 9 ++- pageserver/src/timelines.rs | 11 ++- test_runner/batch_others/test_auth.py | 13 ++-- test_runner/batch_others/test_zenith_cli.py | 18 ++--- test_runner/fixtures/zenith_fixtures.py | 63 ++++++++++------ zenith/src/main.rs | 68 +++++++++++------ 12 files changed, 246 insertions(+), 172 deletions(-) diff --git a/README.md b/README.md index 8dd407f41a..c8acf526b9 100644 --- a/README.md +++ b/README.md @@ -57,12 +57,12 @@ pageserver init succeeded Starting pageserver at 'localhost:64000' in '.zenith' Pageserver started initializing for single for 7676 -Starting safekeeper at 'localhost:5454' in '.zenith/safekeepers/single' +Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single' Safekeeper started # start postgres compute node > ./target/debug/zenith pg start main -Starting new postgres main on main... +Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ... Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432 Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres' waiting for server to start.... done @@ -70,8 +70,8 @@ server started # check list of running postgres instances > ./target/debug/zenith pg list -BRANCH ADDRESS LSN STATUS -main 127.0.0.1:55432 0/1609610 running +NODE ADDRESS TIMELINES BRANCH NAME LSN STATUS +main 127.0.0.1:55432 5b014a9e41b4b63ce1a1febc04503636 main 0/1609610 running ``` 4. Now it is possible to connect to postgres and run some queries: @@ -91,13 +91,13 @@ postgres=# select * from t; 5. And create branches and run postgres on them: ```sh # create branch named migration_check -> ./target/debug/zenith branch migration_check main -Created branch 'migration_check' at 0/1609610 +> ./target/debug/zenith timeline branch --branch-name migration_check +Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. 
Ancestor timeline: 'main' # check branches tree -> ./target/debug/zenith branch - main - ┗━ @0/1609610: migration_check +> ./target/debug/zenith timeline list + main [5b014a9e41b4b63ce1a1febc04503636] + ┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9] # start postgres on that branch > ./target/debug/zenith pg start migration_check diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index a9352bdfcc..9278a9df5a 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -12,6 +12,7 @@ use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use zenith_utils::auth::{encode_from_key_file, Claims, Scope}; use zenith_utils::postgres_backend::AuthType; +use zenith_utils::zid::ZTenantTimelineId; use zenith_utils::zid::ZTimelineId; use zenith_utils::zid::{HexZTenantId, ZNodeId, ZTenantId}; @@ -60,8 +61,7 @@ pub struct LocalEnv { #[serde(default)] pub safekeepers: Vec, - /// Every tenant has a first timeline created for it, currently the only one ancestor-less for this tenant. - /// It is used as a default timeline for branching, if no ancestor timeline is specified. + /// Keep human-readable aliases in memory (and persist them to config), to hind ZId hex strings from the user. #[serde(default)] // A `HashMap>` would be more appropriate here, // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. @@ -158,11 +158,31 @@ impl LocalEnv { branch_name: String, tenant_id: ZTenantId, timeline_id: ZTimelineId, - ) { - self.branch_name_mappings - .entry(branch_name) - .or_default() - .push((tenant_id, timeline_id)); + ) -> anyhow::Result<()> { + let existing_values = self + .branch_name_mappings + .entry(branch_name.clone()) + .or_default(); + + let existing_ids = existing_values + .iter() + .find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id); + + if let Some((_, old_timeline_id)) = existing_ids { + if old_timeline_id == &timeline_id { + Ok(()) + } else { + bail!( + "branch '{}' is already mapped to timeline {}, cannot map to another timeline {}", + branch_name, + old_timeline_id, + timeline_id + ); + } + } else { + existing_values.push((tenant_id, timeline_id)); + Ok(()) + } } pub fn get_branch_timeline_id( @@ -177,6 +197,18 @@ impl LocalEnv { .map(|&(_, timeline_id)| timeline_id) } + pub fn timeline_name_mappings(&self) -> HashMap { + self.branch_name_mappings + .iter() + .map(|(name, tenant_timelines)| { + tenant_timelines.iter().map(|&(tenant_id, timeline_id)| { + (ZTenantTimelineId::new(tenant_id, timeline_id), name.clone()) + }) + }) + .flatten() + .collect() + } + /// Create a LocalEnv from a config file. 
/// /// Unlike 'load_config', this function fills in any defaults that are missing diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index e18be05cea..c2b99972e7 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -9,7 +9,7 @@ use anyhow::{bail, Context}; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; -use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest}; +use pageserver::http::models::{TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest}; use pageserver::timelines::TimelineInfo; use postgres::{Config, NoTls}; use reqwest::blocking::{Client, RequestBuilder, Response}; @@ -322,7 +322,7 @@ impl PageServerNode { } pub fn check_status(&self) -> Result<()> { - self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status")) + self.http_request(Method::GET, format!("{}/status", self.http_base_url)) .send()? .error_from_body()?; Ok(()) @@ -330,7 +330,7 @@ impl PageServerNode { pub fn tenant_list(&self) -> Result> { Ok(self - .http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant")) + .http_request(Method::GET, format!("{}/tenant", self.http_base_url)) .send()? .error_from_body()? .json()?) @@ -338,13 +338,13 @@ impl PageServerNode { pub fn tenant_create( &self, - tenant_id: ZTenantId, + new_tenant_id: Option, initial_timeline_id: Option, - ) -> Result { + ) -> Result { Ok(self - .http_request(Method::POST, format!("{}/{}", self.http_base_url, "tenant")) + .http_request(Method::POST, format!("{}/tenant", self.http_base_url)) .json(&TenantCreateRequest { - tenant_id, + new_tenant_id, initial_timeline_id, }) .send()? @@ -352,11 +352,11 @@ impl PageServerNode { .json()?) } - pub fn timeline_list(&self, tenantid: &ZTenantId) -> Result> { + pub fn timeline_list(&self, tenant_id: &ZTenantId) -> Result> { Ok(self .http_request( Method::GET, - format!("{}/timeline/{}", self.http_base_url, tenantid), + format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), ) .send()? .error_from_body()? @@ -366,16 +366,18 @@ impl PageServerNode { pub fn timeline_create( &self, tenant_id: ZTenantId, - timeline_id: ZTimelineId, - start_lsn: Option, + new_timeline_id: Option, + ancestor_start_lsn: Option, ancestor_timeline_id: Option, ) -> Result { Ok(self - .http_request(Method::POST, format!("{}/timeline", self.http_base_url)) + .http_request( + Method::POST, + format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), + ) .json(&TimelineCreateRequest { - tenant_id, - timeline_id, - start_lsn, + new_timeline_id, + ancestor_start_lsn, ancestor_timeline_id, }) .send()? 
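The models and handlers for these endpoints follow below; to make the tidied shape concrete, here is a rough, hypothetical sketch of driving them with a bare HTTP client (assuming the `reqwest` crate with its `blocking` and `json` features plus `serde_json`; the address, port and field values are placeholders, and omitted optional ids are generated server-side):

```rust
use serde_json::json;

// Placeholder address; the real port comes from the pageserver config.
const PAGESERVER_HTTP: &str = "http://127.0.0.1:9898";

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let client = reqwest::blocking::Client::new();

    // POST /v1/tenant: both ids are optional, so an empty body is enough and
    // the response reports which tenant/timeline ids were actually created.
    let created: serde_json::Value = client
        .post(format!("{}/v1/tenant", PAGESERVER_HTTP))
        .json(&json!({}))
        .send()?
        .error_for_status()?
        .json()?;
    let tenant_id = created["tenant_id"].as_str().unwrap_or_default().to_owned();

    // POST /v1/tenant/{tenant_id}/timeline: branch a new timeline off the
    // initial one; the new timeline id is again generated when omitted.
    let _timeline: serde_json::Value = client
        .post(format!("{}/v1/tenant/{}/timeline", PAGESERVER_HTTP, tenant_id))
        .json(&json!({ "ancestor_timeline_id": created["timeline_id"].clone() }))
        .send()?
        .error_for_status()?
        .json()?;

    Ok(())
}
```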
diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 04ccb9708e..9b321744eb 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -7,25 +7,33 @@ use zenith_utils::{ #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { - #[serde(with = "hex")] - pub tenant_id: ZTenantId, - #[serde(with = "hex")] - pub timeline_id: ZTimelineId, + #[serde(default)] + #[serde(with = "opt_display_serde")] + pub new_timeline_id: Option, #[serde(default)] #[serde(with = "opt_display_serde")] pub ancestor_timeline_id: Option, - pub start_lsn: Option, + pub ancestor_start_lsn: Option, } #[derive(Serialize, Deserialize)] pub struct TenantCreateRequest { - #[serde(with = "hex")] - pub tenant_id: ZTenantId, + #[serde(default)] + #[serde(with = "opt_display_serde")] + pub new_tenant_id: Option, #[serde(default)] #[serde(with = "opt_display_serde")] pub initial_timeline_id: Option, } +#[derive(Deserialize, Serialize)] +pub struct TenantCreateResponse { + #[serde(with = "hex")] + pub tenant_id: ZTenantId, + #[serde(with = "hex")] + pub timeline_id: ZTimelineId, +} + #[derive(Serialize)] pub struct StatusResponse { pub id: ZNodeId, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 7f3bf97bfe..f276e01227 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -22,7 +22,7 @@ paths: properties: id: type: integer - /v1/timeline/{tenant_id}: + /v1/tenant/{tenant_id}/timeline: parameters: - name: tenant_id in: path @@ -70,7 +70,7 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - /v1/timeline/{tenant_id}/{timeline_id}: + /v1/tenant/{tenant_id}/timeline/{timeline_id}: parameters: - name: tenant_id in: path @@ -90,7 +90,7 @@ paths: type: string description: Controls calculation of current_logical_size_non_incremental get: - description: Get timelines for tenant + description: Get info about the timeline responses: "200": description: TimelineInfo @@ -122,7 +122,14 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - /v1/timeline/: + /v1/tenant/{tenant_id}/timeline/: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex post: description: Create timeline requestBody: @@ -130,18 +137,14 @@ paths: application/json: schema: type: object - required: - - "tenant_id" - - "timeline_id" - - "start_point" properties: - tenant_id: + new_timeline_id: type: string format: hex - timeline_id: + ancestor_timeline_id: type: string format: hex - start_point: + ancestor_start_lsn: type: string responses: "201": @@ -149,7 +152,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/TImelineInfo" + $ref: "#/components/schemas/TimelineInfo" "400": description: Malformed timeline create request content: @@ -211,10 +214,11 @@ paths: application/json: schema: type: object - required: - - "tenant_id" properties: - tenant_id: + new_tenant_id: + type: string + format: hex + initial_timeline_id: type: string format: hex responses: @@ -223,9 +227,14 @@ paths: content: application/json: schema: - type: array - items: - type: string + type: object + properties: + tenant_id: + type: string + format: hex + timeline_id: + type: string + format: hex "400": description: Malformed tenant create request content: @@ -268,35 +277,11 @@ components: type: string state: type: string - TimelineInfo: - type: object - required: - - timeline_id - - latest_valid_lsn - - current_logical_size - properties: - 
timeline_id: - type: string - format: hex - ancestor_id: - type: string - format: hex - ancestor_lsn: - type: string - current_logical_size: - type: integer - current_logical_size_non_incremental: - type: integer - latest_valid_lsn: - type: integer TimelineInfo: type: object required: - timeline_id - tenant_id - - last_record_lsn - - prev_record_lsn - - start_lsn - disk_consistent_lsn properties: timeline_id: @@ -305,19 +290,21 @@ components: tenant_id: type: string format: hex - ancestor_timeline_id: - type: string - format: hex last_record_lsn: type: string prev_record_lsn: type: string - start_lsn: + ancestor_timeline_id: + type: string + format: hex + ancestor_lsn: type: string disk_consistent_lsn: type: string - timeline_state: - type: string + current_logical_size: + type: integer + current_logical_size_non_incremental: + type: integer Error: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 45b0c3d4be..ddb52e209b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -20,6 +20,7 @@ use zenith_utils::zid::{HexZTimelineId, ZTimelineId}; use super::models::StatusResponse; use super::models::TenantCreateRequest; +use super::models::TenantCreateResponse; use super::models::TimelineCreateRequest; use crate::repository::RepositoryTimeline; use crate::timelines::TimelineInfo; @@ -69,18 +70,19 @@ async fn status_handler(request: Request) -> Result, ApiErr } async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { + let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; let request_data: TimelineCreateRequest = json_request(&mut request).await?; - check_permission(&request, Some(request_data.tenant_id))?; + check_permission(&request, Some(tenant_id))?; let response_data = tokio::task::spawn_blocking(move || { - let _enter = info_span!("/timeline_create", timeline = %request_data.timeline_id, tenant = %request_data.tenant_id, lsn=?request_data.start_lsn).entered(); + let _enter = info_span!("/timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn).entered(); timelines::create_timeline( get_config(&request), - request_data.tenant_id, - request_data.timeline_id, + tenant_id, + request_data.new_timeline_id, request_data.ancestor_timeline_id, - request_data.start_lsn, + request_data.ancestor_start_lsn, ) }) .await @@ -214,12 +216,15 @@ async fn tenant_create_handler(mut request: Request) -> Result> = Mutex::new(HashMap::new()); @@ -179,9 +179,10 @@ pub fn shutdown_all_tenants() { pub fn create_repository_for_tenant( conf: &'static PageServerConf, - tenant_id: ZTenantId, + new_tenant_id: Option, initial_timeline_id: Option, -) -> Result { +) -> Result { + let tenant_id = new_tenant_id.unwrap_or_else(ZTenantId::generate); let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); let (initial_timeline_id, repo) = timelines::create_repo(conf, tenant_id, initial_timeline_id, wal_redo_manager)?; @@ -196,7 +197,7 @@ pub fn create_repository_for_tenant( } } - Ok(initial_timeline_id) + Ok(ZTenantTimelineId::new(tenant_id, initial_timeline_id)) } pub fn get_tenant_state(tenantid: ZTenantId) -> Option { diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 786e102747..c6b2e81abc 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -125,6 +125,13 @@ impl TimelineInfo { TimelineInfo::Remote { timeline_id, .. 
} => timeline_id, } } + + pub fn tenant_id(&self) -> ZTenantId { + match *self { + TimelineInfo::Local { tenant_id, .. } => tenant_id, + TimelineInfo::Remote { tenant_id, .. } => tenant_id, + } + } } fn get_current_logical_size_non_incremental( @@ -335,10 +342,12 @@ pub(crate) fn get_timelines( pub(crate) fn create_timeline( conf: &'static PageServerConf, tenant_id: ZTenantId, - new_timeline_id: ZTimelineId, + new_timeline_id: Option, ancestor_timeline_id: Option, ancestor_start_lsn: Option, ) -> Result { + let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); + if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { bail!("timeline {} already exists", new_timeline_id); } diff --git a/test_runner/batch_others/test_auth.py b/test_runner/batch_others/test_auth.py index e92eb2e044..bda6349ef9 100644 --- a/test_runner/batch_others/test_auth.py +++ b/test_runner/batch_others/test_auth.py @@ -29,30 +29,27 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): tenant_id=env.initial_tenant) # tenant can create branches - tenant_http_client.timeline_create(timeline_id=uuid4(), - tenant_id=env.initial_tenant, + tenant_http_client.timeline_create(tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id) # console can create branches for tenant - management_http_client.timeline_create(timeline_id=uuid4(), - tenant_id=env.initial_tenant, + management_http_client.timeline_create(tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id) # fail to create branch using token with different tenant_id with pytest.raises(ZenithPageserverApiException, match='Forbidden: Tenant id mismatch. Permission denied'): - invalid_tenant_http_client.timeline_create(timeline_id=uuid4(), - tenant_id=env.initial_tenant, + invalid_tenant_http_client.timeline_create(tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id) # create tenant using management token - management_http_client.tenant_create(uuid4()) + management_http_client.tenant_create() # fail to create tenant using tenant token with pytest.raises( ZenithPageserverApiException, match='Forbidden: Attempt to access management api with tenant scope. Permission denied' ): - tenant_http_client.tenant_create(uuid4()) + tenant_http_client.tenant_create() @pytest.mark.parametrize('with_wal_acceptors', [False, True]) diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_zenith_cli.py index 8777a653b3..4a62a1430a 100644 --- a/test_runner/batch_others/test_zenith_cli.py +++ b/test_runner/batch_others/test_zenith_cli.py @@ -15,17 +15,15 @@ def helper_compare_timeline_list(pageserver_http_client: ZenithPageserverHttpCli Filters out timelines created by other tests. 
""" + timelines_api = sorted( + map(lambda t: cast(str, t['timeline_id']), + pageserver_http_client.timeline_list(initial_tenant))) + timelines_cli = env.zenith_cli.list_timelines() - timelines_cli = [ - b for b in timelines_cli if b.startswith('test_cli_') or b in ('empty', 'main') - ] + assert timelines_cli == env.zenith_cli.list_timelines(initial_tenant) - timelines_cli_with_tenant_arg = env.zenith_cli.list_timelines(initial_tenant) - timelines_cli_with_tenant_arg = [ - b for b in timelines_cli if b.startswith('test_cli_') or b in ('empty', 'main') - ] - - assert timelines_cli == timelines_cli_with_tenant_arg + cli_timeline_ids = sorted([timeline_id for (_, timeline_id) in timelines_cli]) + assert timelines_api == cli_timeline_ids def test_cli_timeline_list(zenith_simple_env: ZenithEnv): @@ -45,7 +43,7 @@ def test_cli_timeline_list(zenith_simple_env: ZenithEnv): helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Check that all new branches are visible via CLI - timelines_cli = env.zenith_cli.list_timelines() + timelines_cli = [timeline_id for (_, timeline_id) in env.zenith_cli.list_timelines()] assert main_timeline_id.hex in timelines_cli assert nested_timeline_id.hex in timelines_cli diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 04b795b244..bb2e690cb3 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -712,27 +712,29 @@ class ZenithPageserverHttpClient(requests.Session): def timeline_attach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): res = self.post( - f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/attach", ) + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/attach", + ) self.verbose_error(res) def timeline_detach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): res = self.post( - f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/detach", ) + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/detach", + ) self.verbose_error(res) - def timeline_create(self, - tenant_id: uuid.UUID, - timeline_id: uuid.UUID, - start_lsn: Optional[str] = None, - ancestor_timeline_id: Optional[uuid.UUID] = None) -> Dict[Any, Any]: - res = self.post(f"http://localhost:{self.port}/v1/timeline", + def timeline_create( + self, + tenant_id: uuid.UUID, + timeline_id: Optional[uuid.UUID] = None, + ancestor_timeline_id: Optional[uuid.UUID] = None, + ancestor_start_lsn: Optional[str] = None, + ) -> Dict[Any, Any]: + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline", json={ - 'tenant_id': - tenant_id.hex, - 'timeline_id': - timeline_id.hex, - 'start_lsn': - start_lsn, + 'new_timeline_id': + timeline_id.hex if timeline_id else None, + 'ancestor_start_lsn': + ancestor_start_lsn, 'ancestor_timeline_id': ancestor_timeline_id.hex if ancestor_timeline_id else None, }) @@ -748,18 +750,23 @@ class ZenithPageserverHttpClient(requests.Session): assert isinstance(res_json, list) return res_json - def tenant_create(self, tenant_id: uuid.UUID): + def tenant_create(self, + tenant_id: Optional[uuid.UUID] = None, + new_timeline_id: Optional[uuid.UUID] = None) -> Dict[Any, Any]: res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ - 'tenant_id': tenant_id.hex, + 'new_tenant_id': tenant_id.hex if tenant_id else None, + 'initial_timeline_id': new_timeline_id.hex if new_timeline_id else None, }, ) self.verbose_error(res) 
- return res.json() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json def timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}") + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline") self.verbose_error(res) res_json = res.json() assert isinstance(res_json, list) @@ -767,7 +774,7 @@ class ZenithPageserverHttpClient(requests.Session): def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: res = self.get( - f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}?include-non-incremental-logical-size=1" + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1" ) self.verbose_error(res) res_json = res.json() @@ -861,13 +868,21 @@ class ZenithCli: else: return uuid.UUID(created_timeline_id) - def list_timelines(self, tenant_id: Optional[uuid.UUID] = None) -> List[str]: + def list_timelines(self, tenant_id: Optional[uuid.UUID] = None) -> List[Tuple[str, str]]: + """ + Returns a list of (branch_name, timeline_id) tuples out of parsed `zenith timeline list` CLI output. + """ + + # (L) main [b49f7954224a0ad25cc0013ea107b54b] + # (L) ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] + timeline_data_extractor = re.compile( + r"\s(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE) res = self.raw_cli( ['timeline', 'list', '--tenant-id', (tenant_id or self.env.initial_tenant).hex]) - branches_cli = sorted( - map(lambda b: b.split(') ')[-1].strip().split(':')[-1].strip(), - res.stdout.strip().split("\n"))) - return branches_cli + timelines_cli = sorted( + map(lambda branch_and_id: (branch_and_id[0], branch_and_id[1]), + timeline_data_extractor.findall(res.stdout))) + return timelines_cli def init(self, config_toml: str, diff --git a/zenith/src/main.rs b/zenith/src/main.rs index 9a578f79f1..5f2489a41d 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -19,7 +19,7 @@ use walkeeper::defaults::{ use zenith_utils::auth::{Claims, Scope}; use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}; use zenith_utils::GIT_VERSION; use pageserver::timelines::TimelineInfo; @@ -60,6 +60,8 @@ http_port = {safekeeper_http_port} struct TimelineTreeEl { /// `TimelineInfo` received from the `pageserver` via the `timeline_list` http API call. pub info: TimelineInfo, + /// Name, recovered from zenith config mappings + pub name: Option, /// Holds all direct children of this timeline referenced using `timeline_id`. pub children: BTreeSet, } @@ -150,7 +152,7 @@ fn main() -> Result<()> { .arg(tenant_id_arg.clone()) .arg(branch_name_arg.clone()) .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name").takes_value(true) - .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(true)) + .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. 
The timeline gets resolved by its branch name.").required(false)) .arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn").takes_value(true) .help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false))) .subcommand(App::new("create") @@ -218,6 +220,8 @@ fn main() -> Result<()> { .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") .arg(pg_node_arg.clone()) .arg(tenant_id_arg.clone()) + .arg(branch_name_arg.clone()) + .arg(timeline_id_arg.clone()) .arg(lsn_arg.clone()) .arg(port_arg.clone())) .subcommand( @@ -290,7 +294,10 @@ fn main() -> Result<()> { /// /// Prints timelines list as a tree-like structure. /// -fn print_timelines_tree(timelines: Vec) -> Result<()> { +fn print_timelines_tree( + timelines: Vec, + mut timeline_name_mappings: HashMap, +) -> Result<()> { let mut timelines_hash = timelines .iter() .map(|t| { @@ -299,6 +306,8 @@ fn print_timelines_tree(timelines: Vec) -> Result<()> { TimelineTreeEl { info: t.clone(), children: BTreeSet::new(), + name: timeline_name_mappings + .remove(&ZTenantTimelineId::new(t.tenant_id(), t.timeline_id())), }, ) }) @@ -380,8 +389,12 @@ fn print_timeline( print!("{} @{}: ", br_sym, lsn_string); } - // Finally print a timeline name with new line - println!("{}", timeline.info.timeline_id()); + // Finally print a timeline id and name with new line + println!( + "{} [{}]", + timeline.name.as_deref().unwrap_or("_no_name_"), + timeline.info.timeline_id() + ); let len = timeline.children.len(); let mut i: usize = 0; @@ -492,7 +505,7 @@ fn handle_init(init_match: &ArgMatches) -> Result { DEFAULT_BRANCH_NAME.to_owned(), initial_tenant_id, initial_timeline_id, - ); + )?; Ok(env) } @@ -514,19 +527,18 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re } } Some(("create", create_match)) => { - let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(ZTenantId::generate); - println!("using tenant id {}", tenant_id); + let initial_tenant_id = parse_tenant_id(create_match)?; let initial_timeline_id_argument = parse_timeline_id(create_match)?; - let initial_timeline_id = - pageserver.tenant_create(tenant_id, initial_timeline_id_argument)?; + let new_ds = + pageserver.tenant_create(initial_tenant_id, initial_timeline_id_argument)?; env.register_branch_mapping( DEFAULT_BRANCH_NAME.to_owned(), - tenant_id, - initial_timeline_id, - ); + new_ds.tenant_id, + new_ds.timeline_id, + )?; println!( "tenant {} successfully created on the pageserver, initial timeline: '{}'", - tenant_id, initial_timeline_id + new_ds.tenant_id, new_ds.timeline_id ); } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), @@ -542,15 +554,15 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - Some(("list", list_match)) => { let tenant_id = get_tenant_id(list_match, env)?; let timelines = pageserver.timeline_list(&tenant_id)?; - print_timelines_tree(timelines)?; + print_timelines_tree(timelines, env.timeline_name_mappings())?; } Some(("create", create_match)) => { let tenant_id = get_tenant_id(create_match, env)?; - let new_timeline_id = ZTimelineId::generate(); let new_branch_name = create_match .value_of("branch-name") .ok_or(anyhow!("No branch name provided"))?; - let timeline = pageserver.timeline_create(tenant_id, new_timeline_id, None, None)?; + let timeline = pageserver.timeline_create(tenant_id, None, None, None)?; + let new_timeline_id = timeline.timeline_id(); 
let last_record_lsn = match timeline { TimelineInfo::Local { @@ -563,7 +575,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - ) } }; - env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id); + env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( "Created timeline '{}' at Lsn {} for tenant: {}", @@ -574,13 +586,12 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - } Some(("branch", branch_match)) => { let tenant_id = get_tenant_id(branch_match, env)?; - let new_timeline_id = ZTimelineId::generate(); let new_branch_name = branch_match .value_of("branch-name") .ok_or(anyhow!("No branch name provided"))?; let ancestor_branch_name = branch_match .value_of("ancestor-branch-name") - .ok_or(anyhow!("No ancestor branch name provided"))?; + .unwrap_or(DEFAULT_BRANCH_NAME); let ancestor_timeline_id = env .get_branch_timeline_id(ancestor_branch_name, tenant_id) .ok_or_else(|| { @@ -597,10 +608,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - .context("Failed to parse ancestor start Lsn from the request")?; let timeline = pageserver.timeline_create( tenant_id, - new_timeline_id, + None, start_lsn, Some(ancestor_timeline_id), )?; + let new_timeline_id = timeline.timeline_id(); let last_record_lsn = match timeline { TimelineInfo::Local { @@ -612,7 +624,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - ), }; - env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id); + env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( "Created timeline '{}' at Lsn {} for tenant: {}. Ancestor timeline: '{}'", @@ -647,7 +659,9 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { HashMap::new() }); - println!("NODE\tADDRESS\t\tTIMELINE\tLSN\t\tSTATUS"); + let timeline_name_mappings = env.timeline_name_mappings(); + + println!("NODE\tADDRESS\tTIMELINE\tBRANCH NAME\tLSN\t\tSTATUS"); for ((_, node_name), node) in cplane .nodes .iter() @@ -666,11 +680,17 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { }) .unwrap_or_else(|| '?'.to_string()); + let branch_name = timeline_name_mappings + .get(&ZTenantTimelineId::new(tenant_id, node.timeline_id)) + .map(|name| name.as_str()) + .unwrap_or("?"); + println!( - "{}\t{}\t{}\t{}\t{}", + "{}\t{}\t{}\t{}\t{}\t{}", node_name, node.address, node.timeline_id, + branch_name, lsn_str, node.status(), ); From dd74c66ef0d8b3bd2c5636549a04f17823748cb5 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 3 Mar 2022 19:21:35 +0200 Subject: [PATCH 0024/1022] Do not create timeline along with tenant --- control_plane/src/storage.rs | 23 ++++----- pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/http/models.rs | 11 ---- pageserver/src/http/openapi_spec.yml | 13 +---- pageserver/src/http/routes.rs | 20 +++----- pageserver/src/tenant_mgr.rs | 18 +++---- pageserver/src/timelines.rs | 31 ++++------- .../batch_others/test_pageserver_api.py | 18 ++++--- test_runner/batch_others/test_tenants.py | 8 +-- test_runner/fixtures/zenith_fixtures.py | 51 ++++++++++++++----- .../performance/test_bulk_tenant_create.py | 2 +- zenith/src/main.rs | 13 ++--- 12 files changed, 97 insertions(+), 113 deletions(-) diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index c2b99972e7..3c68823760 100644 --- a/control_plane/src/storage.rs +++ 
b/control_plane/src/storage.rs @@ -9,7 +9,7 @@ use anyhow::{bail, Context}; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; -use pageserver::http::models::{TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest}; +use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest}; use pageserver::timelines::TimelineInfo; use postgres::{Config, NoTls}; use reqwest::blocking::{Client, RequestBuilder, Response}; @@ -336,20 +336,19 @@ impl PageServerNode { .json()?) } - pub fn tenant_create( - &self, - new_tenant_id: Option, - initial_timeline_id: Option, - ) -> Result { - Ok(self + pub fn tenant_create(&self, new_tenant_id: Option) -> anyhow::Result { + let tenant_id_string = self .http_request(Method::POST, format!("{}/tenant", self.http_base_url)) - .json(&TenantCreateRequest { - new_tenant_id, - initial_timeline_id, - }) + .json(&TenantCreateRequest { new_tenant_id }) .send()? .error_from_body()? - .json()?) + .json::()?; + tenant_id_string.parse().with_context(|| { + format!( + "Failed to parse tennat creation response as tenant id: {}", + tenant_id_string + ) + }) } pub fn timeline_list(&self, tenant_id: &ZTenantId) -> Result> { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 83b128dd74..d37ba0cece 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -43,7 +43,7 @@ fn main() -> Result<()> { Arg::new("init") .long("init") .takes_value(false) - .help("Initialize pageserver repo"), + .help("Initialize pageserver service: creates an initial config, tenant and timeline, if specified"), ) .arg( Arg::new("workdir") diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 9b321744eb..28d9791438 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -21,17 +21,6 @@ pub struct TenantCreateRequest { #[serde(default)] #[serde(with = "opt_display_serde")] pub new_tenant_id: Option, - #[serde(default)] - #[serde(with = "opt_display_serde")] - pub initial_timeline_id: Option, -} - -#[derive(Deserialize, Serialize)] -pub struct TenantCreateResponse { - #[serde(with = "hex")] - pub tenant_id: ZTenantId, - #[serde(with = "hex")] - pub timeline_id: ZTimelineId, } #[derive(Serialize)] diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index f276e01227..823f927796 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -218,23 +218,14 @@ paths: new_tenant_id: type: string format: hex - initial_timeline_id: - type: string - format: hex responses: "201": description: CREATED content: application/json: schema: - type: object - properties: - tenant_id: - type: string - format: hex - timeline_id: - type: string - format: hex + type: string + format: hex "400": description: Malformed tenant create request content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ddb52e209b..efcc7ae2f3 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -20,7 +20,6 @@ use zenith_utils::zid::{HexZTimelineId, ZTimelineId}; use super::models::StatusResponse; use super::models::TenantCreateRequest; -use super::models::TenantCreateResponse; use super::models::TimelineCreateRequest; use crate::repository::RepositoryTimeline; use crate::timelines::TimelineInfo; @@ -215,20 +214,17 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result, ApiError> { diff --git a/pageserver/src/tenant_mgr.rs 
b/pageserver/src/tenant_mgr.rs index b40c1c6f2c..7076962830 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -16,7 +16,7 @@ use serde::{Deserialize, Serialize}; use std::collections::{hash_map, HashMap}; use std::fmt; use std::sync::{Arc, Mutex, MutexGuard}; -use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +use zenith_utils::zid::{ZTenantId, ZTimelineId}; lazy_static! { static ref TENANTS: Mutex> = Mutex::new(HashMap::new()); @@ -180,15 +180,13 @@ pub fn shutdown_all_tenants() { pub fn create_repository_for_tenant( conf: &'static PageServerConf, new_tenant_id: Option, - initial_timeline_id: Option, -) -> Result { - let tenant_id = new_tenant_id.unwrap_or_else(ZTenantId::generate); - let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); - let (initial_timeline_id, repo) = - timelines::create_repo(conf, tenant_id, initial_timeline_id, wal_redo_manager)?; +) -> Result { + let new_tenant_id = new_tenant_id.unwrap_or_else(ZTenantId::generate); + let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, new_tenant_id)); + let repo = timelines::create_repo(conf, new_tenant_id, wal_redo_manager)?; - match access_tenants().entry(tenant_id) { - hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenant_id), + match access_tenants().entry(new_tenant_id) { + hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", new_tenant_id), hash_map::Entry::Vacant(v) => { v.insert(Tenant { state: TenantState::Idle, @@ -197,7 +195,7 @@ pub fn create_repository_for_tenant( } } - Ok(ZTenantTimelineId::new(tenant_id, initial_timeline_id)) + Ok(new_tenant_id) } pub fn get_tenant_state(tenantid: ZTenantId) -> Option { diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index c6b2e81abc..54f0a302f4 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -17,9 +17,9 @@ use zenith_utils::lsn::Lsn; use zenith_utils::zid::{opt_display_serde, ZTenantId, ZTimelineId}; use zenith_utils::{crashsafe_dir, logging}; -use crate::walredo::WalRedoManager; use crate::{config::PageServerConf, repository::Repository}; use crate::{import_datadir, LOG_FILE_NAME}; +use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager}; use crate::{repository::RepositoryTimeline, tenant_mgr}; use crate::{repository::Timeline, CheckpointConfig}; @@ -179,8 +179,13 @@ pub fn init_pageserver( if let Some(tenant_id) = create_tenant { println!("initializing tenantid {}", tenant_id); - create_repo(conf, tenant_id, initial_timeline_id, dummy_redo_mgr) - .context("failed to create repo")?; + let repo = create_repo(conf, tenant_id, dummy_redo_mgr).context("failed to create repo")?; + let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); + bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref()) + .context("failed to create initial timeline")?; + println!("initial timeline {} created", new_timeline_id) + } else if initial_timeline_id.is_some() { + println!("Ignoring initial timeline parameter, due to no tenant id to create given"); } crashsafe_dir::create_dir_all(conf.tenants_path())?; @@ -191,9 +196,8 @@ pub fn init_pageserver( pub fn create_repo( conf: &'static PageServerConf, tenant_id: ZTenantId, - init_timeline_id: Option, wal_redo_manager: Arc, -) -> Result<(ZTimelineId, Arc)> { +) -> Result> { let repo_dir = conf.tenant_path(&tenant_id); if repo_dir.exists() { bail!("repo for {} already exists", tenant_id) @@ -207,25 +211,12 @@ pub fn create_repo( info!("created 
directory structure in {}", repo_dir.display()); - // create a new timeline directory - let timeline_id = init_timeline_id.unwrap_or_else(ZTimelineId::generate); - let timelinedir = conf.timeline_path(&timeline_id, &tenant_id); - - crashsafe_dir::create_dir(&timelinedir)?; - - let repo = Arc::new(crate::layered_repository::LayeredRepository::new( + Ok(Arc::new(LayeredRepository::new( conf, wal_redo_manager, tenant_id, conf.remote_storage_config.is_some(), - )); - - // Load data into pageserver - // TODO To implement zenith import we need to - // move data loading out of create_repo() - bootstrap_timeline(conf, tenant_id, timeline_id, repo.as_ref())?; - - Ok((timeline_id, repo)) + ))) } // Returns checkpoint LSN from controlfile diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 4c3b98e838..7d2c0800a2 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -23,9 +23,18 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): client.tenant_create(tenant_id) assert tenant_id.hex in {t['id'] for t in client.tenant_list()} - # check its timelines + timelines = client.timeline_list(tenant_id) + assert len(timelines) == 0, "initial tenant should not have any timelines" + + # create timeline + timeline_id = uuid4() + client.timeline_create(tenant_id=tenant_id, timeline_id=timeline_id) + timelines = client.timeline_list(tenant_id) assert len(timelines) > 0 + + # check it is there + assert timeline_id.hex in {b['timeline_id'] for b in client.timeline_list(tenant_id)} for timeline in timelines: timeline_id_str = str(timeline['timeline_id']) timeline_details = client.timeline_detail(tenant_id=tenant_id, @@ -34,13 +43,6 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): assert timeline_details['tenant_id'] == tenant_id.hex assert timeline_details['timeline_id'] == timeline_id_str - # create timeline - timeline_id = uuid4() - client.timeline_create(tenant_id=tenant_id, timeline_id=timeline_id) - - # check it is there - assert timeline_id.hex in {b['timeline_id'] for b in client.timeline_list(tenant_id)} - def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): env = zenith_simple_env diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py index 87acf2086d..e883018628 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/batch_others/test_tenants.py @@ -15,10 +15,10 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acce tenant_1 = env.zenith_cli.create_tenant() tenant_2 = env.zenith_cli.create_tenant() - env.zenith_cli.create_branch(f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', - tenant_id=tenant_1) - env.zenith_cli.create_branch(f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', - tenant_id=tenant_2) + env.zenith_cli.create_timeline( + f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', tenant_id=tenant_1) + env.zenith_cli.create_timeline( + f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', tenant_id=tenant_2) pg_tenant1 = env.postgres.create_start( f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index bb2e690cb3..e2c9f16630 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -57,6 +57,7 @@ Fn = 
TypeVar('Fn', bound=Callable[..., Any]) DEFAULT_OUTPUT_DIR = 'test_output' DEFAULT_POSTGRES_DIR = 'tmp_install' +DEFAULT_BRANCH_NAME = 'main' BASE_PORT = 15000 WORKER_PORT_NUM = 100 @@ -424,7 +425,7 @@ class ZenithEnvBuilder: num_safekeepers: int = 0, pageserver_auth_enabled: bool = False, rust_log_override: Optional[str] = None, - default_branch_name='main'): + default_branch_name=DEFAULT_BRANCH_NAME): self.repo_dir = repo_dir self.rust_log_override = rust_log_override self.port_distributor = port_distributor @@ -547,7 +548,6 @@ class ZenithEnv: self.rust_log_override = config.rust_log_override self.port_distributor = config.port_distributor self.s3_mock_server = config.s3_mock_server - self.default_branch_name = config.default_branch_name self.zenith_cli = ZenithCli(env=self) self.postgres = PostgresFactory(self) self.safekeepers: List[Safekeeper] = [] @@ -639,7 +639,7 @@ def _shared_simple_env(request: Any, port_distributor) -> Iterator[ZenithEnv]: env = builder.init_start() # For convenience in tests, create a branch from the freshly-initialized cluster. - env.zenith_cli.create_branch("empty") + env.zenith_cli.create_branch('empty', ancestor_branch_name=DEFAULT_BRANCH_NAME) yield env @@ -750,20 +750,17 @@ class ZenithPageserverHttpClient(requests.Session): assert isinstance(res_json, list) return res_json - def tenant_create(self, - tenant_id: Optional[uuid.UUID] = None, - new_timeline_id: Optional[uuid.UUID] = None) -> Dict[Any, Any]: + def tenant_create(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ 'new_tenant_id': tenant_id.hex if tenant_id else None, - 'initial_timeline_id': new_timeline_id.hex if new_timeline_id else None, }, ) self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json + new_tenant_id = res.json() + assert isinstance(new_tenant_id, str) + return uuid.UUID(new_tenant_id) def timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline") @@ -834,8 +831,36 @@ class ZenithCli: res.check_returncode() return res + def create_timeline(self, + new_branch_name: str, + tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: + cmd = [ + 'timeline', + 'create', + '--branch-name', + new_branch_name, + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + ] + + res = self.raw_cli(cmd) + res.check_returncode() + + create_timeline_id_extractor = re.compile(r"^Created timeline '(?P[^']+)'", + re.MULTILINE) + matches = create_timeline_id_extractor.search(res.stdout) + + created_timeline_id = None + if matches is not None: + created_timeline_id = matches.group('timeline_id') + + if created_timeline_id is None: + raise Exception('could not find timeline id after `zenith timeline create` invocation') + else: + return uuid.UUID(created_timeline_id) + def create_branch(self, - new_branch_name: str, + new_branch_name: str = DEFAULT_BRANCH_NAME, ancestor_branch_name: Optional[str] = None, tenant_id: Optional[uuid.UUID] = None, ancestor_start_lsn: Optional[str] = None) -> uuid.UUID: @@ -846,9 +871,9 @@ class ZenithCli: new_branch_name, '--tenant-id', (tenant_id or self.env.initial_tenant).hex, - '--ancestor-branch-name', - ancestor_branch_name or self.env.default_branch_name, ] + if ancestor_branch_name is not None: + cmd.extend(['--ancestor-branch-name', ancestor_branch_name]) if ancestor_start_lsn is not None: cmd.extend(['--ancestor-start-lsn', ancestor_start_lsn]) diff 
--git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index 2430eec33e..fbef131ffd 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -31,7 +31,7 @@ def test_bulk_tenant_create( start = timeit.default_timer() tenant = env.zenith_cli.create_tenant() - env.zenith_cli.create_branch( + env.zenith_cli.create_timeline( f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant_id=tenant) # FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now? diff --git a/zenith/src/main.rs b/zenith/src/main.rs index 5f2489a41d..c4636fa1a6 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -528,17 +528,10 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re } Some(("create", create_match)) => { let initial_tenant_id = parse_tenant_id(create_match)?; - let initial_timeline_id_argument = parse_timeline_id(create_match)?; - let new_ds = - pageserver.tenant_create(initial_tenant_id, initial_timeline_id_argument)?; - env.register_branch_mapping( - DEFAULT_BRANCH_NAME.to_owned(), - new_ds.tenant_id, - new_ds.timeline_id, - )?; + let new_tenant_id = pageserver.tenant_create(initial_tenant_id)?; println!( - "tenant {} successfully created on the pageserver, initial timeline: '{}'", - new_ds.tenant_id, new_ds.timeline_id + "tenant {} successfully created on the pageserver", + new_tenant_id ); } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), From fe6fccfdae3e968cf207c41ae7218adced458764 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 3 Mar 2022 23:16:54 +0200 Subject: [PATCH 0025/1022] Allow already existing repo when creating a tenant --- pageserver/src/http/openapi_spec.yml | 2 +- pageserver/src/tenant_mgr.rs | 22 +++++++++++----------- pageserver/src/timelines.rs | 22 ++++++++++++---------- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 823f927796..25d5ceae4e 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -220,7 +220,7 @@ paths: format: hex responses: "201": - description: CREATED + description: Already exists or created content: application/json: schema: diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 7076962830..77ef865ec5 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -9,11 +9,11 @@ use crate::thread_mgr::ThreadKind; use crate::timelines; use crate::walredo::PostgresRedoManager; use crate::CheckpointConfig; -use anyhow::{bail, Context, Result}; +use anyhow::{Context, Result}; use lazy_static::lazy_static; use log::*; use serde::{Deserialize, Serialize}; -use std::collections::{hash_map, HashMap}; +use std::collections::HashMap; use std::fmt; use std::sync::{Arc, Mutex, MutexGuard}; use zenith_utils::zid::{ZTenantId, ZTimelineId}; @@ -183,16 +183,16 @@ pub fn create_repository_for_tenant( ) -> Result { let new_tenant_id = new_tenant_id.unwrap_or_else(ZTenantId::generate); let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, new_tenant_id)); - let repo = timelines::create_repo(conf, new_tenant_id, wal_redo_manager)?; - - match access_tenants().entry(new_tenant_id) { - hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", new_tenant_id), - hash_map::Entry::Vacant(v) => { - v.insert(Tenant { - state: TenantState::Idle, - repo, - }); + 
match timelines::create_repo(conf, new_tenant_id, wal_redo_manager)? { + Some(repo) => { + access_tenants() + .entry(new_tenant_id) + .or_insert_with(|| Tenant { + state: TenantState::Idle, + repo, + }); } + None => debug!("repository already exists for tenant {}", new_tenant_id), } Ok(new_tenant_id) diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 54f0a302f4..587b9a2cf2 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -2,7 +2,7 @@ //! Timeline management code // -use anyhow::{bail, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use postgres_ffi::ControlFileData; use serde::{Deserialize, Serialize}; use std::{ @@ -160,7 +160,7 @@ pub fn init_pageserver( conf: &'static PageServerConf, create_tenant: Option, initial_timeline_id: Option, -) -> Result<()> { +) -> anyhow::Result<()> { // Initialize logger // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages let _log_file = logging::init(LOG_FILE_NAME, true)?; @@ -177,9 +177,13 @@ pub fn init_pageserver( // anymore, but I think that could still happen. let dummy_redo_mgr = Arc::new(crate::walredo::DummyRedoManager {}); + crashsafe_dir::create_dir_all(conf.tenants_path())?; + if let Some(tenant_id) = create_tenant { println!("initializing tenantid {}", tenant_id); - let repo = create_repo(conf, tenant_id, dummy_redo_mgr).context("failed to create repo")?; + let repo = create_repo(conf, tenant_id, dummy_redo_mgr) + .context("failed to create repo")? + .ok_or_else(|| anyhow!("For newely created pageserver, found already existing repository for tenant {}", tenant_id))?; let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref()) .context("failed to create initial timeline")?; @@ -187,7 +191,6 @@ pub fn init_pageserver( } else if initial_timeline_id.is_some() { println!("Ignoring initial timeline parameter, due to no tenant id to create given"); } - crashsafe_dir::create_dir_all(conf.tenants_path())?; println!("pageserver init succeeded"); Ok(()) @@ -197,26 +200,25 @@ pub fn create_repo( conf: &'static PageServerConf, tenant_id: ZTenantId, wal_redo_manager: Arc, -) -> Result> { +) -> Result>> { let repo_dir = conf.tenant_path(&tenant_id); if repo_dir.exists() { - bail!("repo for {} already exists", tenant_id) + debug!("repo for {} already exists", tenant_id); + return Ok(None); } // top-level dir may exist if we are creating it through CLI crashsafe_dir::create_dir_all(&repo_dir) .with_context(|| format!("could not create directory {}", repo_dir.display()))?; - crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; - info!("created directory structure in {}", repo_dir.display()); - Ok(Arc::new(LayeredRepository::new( + Ok(Some(Arc::new(LayeredRepository::new( conf, wal_redo_manager, tenant_id, conf.remote_storage_config.is_some(), - ))) + )))) } // Returns checkpoint LSN from controlfile From c51d545fd974385c104799b9e18d67d6a8047afa Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 4 Mar 2022 11:11:42 +0200 Subject: [PATCH 0026/1022] Serialize Lsn as strings in http api --- control_plane/src/local_env.rs | 26 ++-- control_plane/src/storage.rs | 32 +++-- pageserver/src/http/models.rs | 117 ++++++++++++++++-- pageserver/src/http/routes.rs | 32 +++-- pageserver/src/timelines.rs | 11 +- .../batch_others/test_pageserver_api.py | 2 +- .../batch_others/test_remote_storage.py | 2 +- 
test_runner/batch_others/test_wal_acceptor.py | 4 +- zenith/src/main.rs | 16 +-- 9 files changed, 175 insertions(+), 67 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 9278a9df5a..2a1d51fe08 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -12,9 +12,9 @@ use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use zenith_utils::auth::{encode_from_key_file, Claims, Scope}; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::ZTenantTimelineId; -use zenith_utils::zid::ZTimelineId; -use zenith_utils::zid::{HexZTenantId, ZNodeId, ZTenantId}; +use zenith_utils::zid::{ + HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId, +}; use crate::safekeeper::SafekeeperNode; @@ -61,12 +61,12 @@ pub struct LocalEnv { #[serde(default)] pub safekeepers: Vec, - /// Keep human-readable aliases in memory (and persist them to config), to hind ZId hex strings from the user. + /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. #[serde(default)] // A `HashMap>` would be more appropriate here, // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". - branch_name_mappings: HashMap>, + branch_name_mappings: HashMap>, } #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] @@ -164,6 +164,9 @@ impl LocalEnv { .entry(branch_name.clone()) .or_default(); + let tenant_id = HexZTenantId::from(tenant_id); + let timeline_id = HexZTimelineId::from(timeline_id); + let existing_ids = existing_values .iter() .find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id); @@ -190,22 +193,29 @@ impl LocalEnv { branch_name: &str, tenant_id: ZTenantId, ) -> Option { + let tenant_id = HexZTenantId::from(tenant_id); self.branch_name_mappings .get(branch_name)? 
.iter() .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) .map(|&(_, timeline_id)| timeline_id) + .map(ZTimelineId::from) } pub fn timeline_name_mappings(&self) -> HashMap { self.branch_name_mappings .iter() - .map(|(name, tenant_timelines)| { + .flat_map(|(name, tenant_timelines)| { tenant_timelines.iter().map(|&(tenant_id, timeline_id)| { - (ZTenantTimelineId::new(tenant_id, timeline_id), name.clone()) + ( + ZTenantTimelineId::new( + ZTenantId::from(tenant_id), + ZTimelineId::from(timeline_id), + ), + name.clone(), + ) }) }) - .flatten() .collect() } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 3c68823760..259fc79708 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,3 +1,4 @@ +use std::convert::TryFrom; use std::io::Write; use std::net::TcpStream; use std::path::PathBuf; @@ -9,7 +10,7 @@ use anyhow::{bail, Context}; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; -use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest}; +use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse}; use pageserver::timelines::TimelineInfo; use postgres::{Config, NoTls}; use reqwest::blocking::{Client, RequestBuilder, Response}; @@ -18,7 +19,7 @@ use thiserror::Error; use zenith_utils::http::error::HttpErrorBody; use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use zenith_utils::zid::{HexZTenantId, HexZTimelineId, ZTenantId, ZTimelineId}; use crate::local_env::LocalEnv; use crate::{fill_rust_env_vars, read_pidfile}; @@ -339,7 +340,9 @@ impl PageServerNode { pub fn tenant_create(&self, new_tenant_id: Option) -> anyhow::Result { let tenant_id_string = self .http_request(Method::POST, format!("{}/tenant", self.http_base_url)) - .json(&TenantCreateRequest { new_tenant_id }) + .json(&TenantCreateRequest { + new_tenant_id: new_tenant_id.map(HexZTenantId::from), + }) .send()? .error_from_body()? .json::()?; @@ -351,15 +354,20 @@ impl PageServerNode { }) } - pub fn timeline_list(&self, tenant_id: &ZTenantId) -> Result> { - Ok(self + pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result> { + let timeline_infos: Vec = self .http_request( Method::GET, format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), ) .send()? .error_from_body()? - .json()?) + .json()?; + + timeline_infos + .into_iter() + .map(TimelineInfo::try_from) + .collect() } pub fn timeline_create( @@ -368,20 +376,22 @@ impl PageServerNode { new_timeline_id: Option, ancestor_start_lsn: Option, ancestor_timeline_id: Option, - ) -> Result { - Ok(self + ) -> anyhow::Result { + let timeline_info_response = self .http_request( Method::POST, format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), ) .json(&TimelineCreateRequest { - new_timeline_id, + new_timeline_id: new_timeline_id.map(HexZTimelineId::from), ancestor_start_lsn, - ancestor_timeline_id, + ancestor_timeline_id: ancestor_timeline_id.map(HexZTimelineId::from), }) .send()? .error_from_body()? - .json()?) 
+ .json::()?; + + TimelineInfo::try_from(timeline_info_response) } } diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 28d9791438..9844e7ea82 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -1,26 +1,121 @@ +use crate::timelines::TimelineInfo; +use anyhow::{anyhow, bail, Context}; use serde::{Deserialize, Serialize}; -use zenith_utils::zid::ZNodeId; use zenith_utils::{ lsn::Lsn, - zid::{opt_display_serde, ZTenantId, ZTimelineId}, + zid::{HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTimelineId}, }; #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { - #[serde(default)] - #[serde(with = "opt_display_serde")] - pub new_timeline_id: Option, - #[serde(default)] - #[serde(with = "opt_display_serde")] - pub ancestor_timeline_id: Option, + pub new_timeline_id: Option, + pub ancestor_timeline_id: Option, pub ancestor_start_lsn: Option, } #[derive(Serialize, Deserialize)] pub struct TenantCreateRequest { - #[serde(default)] - #[serde(with = "opt_display_serde")] - pub new_tenant_id: Option, + pub new_tenant_id: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct TimelineInfoResponse { + pub kind: String, + #[serde(with = "hex")] + timeline_id: ZTimelineId, + #[serde(with = "hex")] + tenant_id: ZTenantId, + disk_consistent_lsn: String, + last_record_lsn: Option, + prev_record_lsn: Option, + ancestor_timeline_id: Option, + ancestor_lsn: Option, + current_logical_size: Option, + current_logical_size_non_incremental: Option, +} + +impl From for TimelineInfoResponse { + fn from(other: TimelineInfo) -> Self { + match other { + TimelineInfo::Local { + timeline_id, + tenant_id, + last_record_lsn, + prev_record_lsn, + ancestor_timeline_id, + ancestor_lsn, + disk_consistent_lsn, + current_logical_size, + current_logical_size_non_incremental, + } => TimelineInfoResponse { + kind: "Local".to_owned(), + timeline_id, + tenant_id, + disk_consistent_lsn: disk_consistent_lsn.to_string(), + last_record_lsn: Some(last_record_lsn.to_string()), + prev_record_lsn: Some(prev_record_lsn.to_string()), + ancestor_timeline_id: ancestor_timeline_id.map(HexZTimelineId::from), + ancestor_lsn: ancestor_lsn.map(|lsn| lsn.to_string()), + current_logical_size: Some(current_logical_size), + current_logical_size_non_incremental, + }, + TimelineInfo::Remote { + timeline_id, + tenant_id, + disk_consistent_lsn, + } => TimelineInfoResponse { + kind: "Remote".to_owned(), + timeline_id, + tenant_id, + disk_consistent_lsn: disk_consistent_lsn.to_string(), + last_record_lsn: None, + prev_record_lsn: None, + ancestor_timeline_id: None, + ancestor_lsn: None, + current_logical_size: None, + current_logical_size_non_incremental: None, + }, + } + } +} + +impl TryFrom for TimelineInfo { + type Error = anyhow::Error; + + fn try_from(other: TimelineInfoResponse) -> anyhow::Result { + let parse_lsn_hex_string = |lsn_string: String| { + lsn_string + .parse::() + .with_context(|| format!("Failed to parse Lsn as hex string from '{}'", lsn_string)) + }; + + let disk_consistent_lsn = parse_lsn_hex_string(other.disk_consistent_lsn)?; + Ok(match other.kind.as_str() { + "Local" => TimelineInfo::Local { + timeline_id: other.timeline_id, + tenant_id: other.tenant_id, + last_record_lsn: other + .last_record_lsn + .ok_or(anyhow!("Local timeline should have last_record_lsn")) + .and_then(parse_lsn_hex_string)?, + prev_record_lsn: other + .prev_record_lsn + .ok_or(anyhow!("Local timeline should have prev_record_lsn")) + .and_then(parse_lsn_hex_string)?, + 
ancestor_timeline_id: other.ancestor_timeline_id.map(ZTimelineId::from), + ancestor_lsn: other.ancestor_lsn.map(parse_lsn_hex_string).transpose()?, + disk_consistent_lsn, + current_logical_size: other.current_logical_size.ok_or(anyhow!("No "))?, + current_logical_size_non_incremental: other.current_logical_size_non_incremental, + }, + "Remote" => TimelineInfo::Remote { + timeline_id: other.timeline_id, + tenant_id: other.tenant_id, + disk_consistent_lsn, + }, + unknown => bail!("Unknown timeline kind: {}", unknown), + }) + } } #[derive(Serialize)] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index efcc7ae2f3..abc4043bdd 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -16,11 +16,11 @@ use zenith_utils::http::{ request::parse_request_param, }; use zenith_utils::http::{RequestExt, RouterBuilder}; -use zenith_utils::zid::{HexZTimelineId, ZTimelineId}; +use zenith_utils::zid::{HexZTenantId, ZTimelineId}; -use super::models::StatusResponse; -use super::models::TenantCreateRequest; -use super::models::TimelineCreateRequest; +use super::models::{ + StatusResponse, TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse, +}; use crate::repository::RepositoryTimeline; use crate::timelines::TimelineInfo; use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId}; @@ -79,13 +79,13 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); - let response_data = tokio::task::spawn_blocking(move || { + let response_data: Vec = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); crate::timelines::get_timelines(tenant_id, include_non_incremental_logical_size) }) .await - .map_err(ApiError::from_err)??; + .map_err(ApiError::from_err)?? + .into_iter() + .map(TimelineInfoResponse::from) + .collect(); Ok(json_response(StatusCode::OK, response_data)?) 
} @@ -137,7 +140,8 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result, ancestor_lsn: Option, disk_consistent_lsn: Lsn, @@ -41,9 +36,7 @@ pub enum TimelineInfo { current_logical_size_non_incremental: Option, }, Remote { - #[serde(with = "hex")] timeline_id: ZTimelineId, - #[serde(with = "hex")] tenant_id: ZTenantId, disk_consistent_lsn: Lsn, }, diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 7d2c0800a2..41b1899882 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -39,7 +39,7 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): timeline_id_str = str(timeline['timeline_id']) timeline_details = client.timeline_detail(tenant_id=tenant_id, timeline_id=UUID(timeline_id_str)) - assert timeline_details['type'] == 'Local' + assert timeline_details['kind'] == 'Local' assert timeline_details['tenant_id'] == tenant_id.hex assert timeline_details['timeline_id'] == timeline_id_str diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index abd06bf5e9..edcc768819 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -85,7 +85,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, timeline_details = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) assert timeline_details['timeline_id'] == timeline_id assert timeline_details['tenant_id'] == tenant_id - if timeline_details['type'] == 'Local': + if timeline_details['kind'] == 'Local': log.info("timeline downloaded, checking its data") break attempts += 1 diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 9518a14b75..02da7ee749 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -13,7 +13,7 @@ from dataclasses import dataclass, field from multiprocessing import Process, Value from pathlib import Path from fixtures.zenith_fixtures import PgBin, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol -from fixtures.utils import lsn_to_hex, mkdir_if_needed +from fixtures.utils import lsn_to_hex, mkdir_if_needed, lsn_from_hex from fixtures.log_helper import log from typing import List, Optional, Any @@ -91,7 +91,7 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): m = TimelineMetrics( timeline_id=timeline_id, - last_record_lsn=timeline_detail["last_record_lsn"], + last_record_lsn=lsn_from_hex(timeline_detail["last_record_lsn"]), ) for sk_m in sk_metrics: m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)]) diff --git a/zenith/src/main.rs b/zenith/src/main.rs index c4636fa1a6..165a7d7950 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -81,9 +81,6 @@ fn main() -> Result<()> { .required(false); let pg_node_arg = Arg::new("node").help("Postgres node name").required(false); - let safekeeper_node_arg = Arg::new("node") - .help("Safekeeper node name") - .required(false); let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false); @@ -361,7 +358,7 @@ fn print_timeline( print!("{} ", local_or_remote); if nesting_level > 0 { - let lsn_string = match timeline.info { + let lsn_string = match &timeline.info { TimelineInfo::Local { ancestor_lsn, .. 
} => ancestor_lsn .map(|lsn| lsn.to_string()) .unwrap_or_else(|| "Unknown local Lsn".to_string()), @@ -430,14 +427,11 @@ fn get_timeline_infos( env: &local_env::LocalEnv, tenant_id: &ZTenantId, ) -> Result> { - let page_server = PageServerNode::from_env(env); - let timeline_infos: Vec = page_server.timeline_list(tenant_id)?; - let timeline_infos: HashMap = timeline_infos + Ok(PageServerNode::from_env(env) + .timeline_list(tenant_id)? .into_iter() .map(|timeline_info| (timeline_info.timeline_id(), timeline_info)) - .collect(); - - Ok(timeline_infos) + .collect()) } // Helper function to parse --tenant_id option, or get the default from config file @@ -486,7 +480,7 @@ fn handle_init(init_match: &ArgMatches) -> Result { .context("Failed to initialize zenith repository")?; // default_tenantid was generated by the `env.init()` call above - let initial_tenant_id = env.default_tenant_id.unwrap(); + let initial_tenant_id = ZTenantId::from(env.default_tenant_id.unwrap()); // Call 'pageserver init'. let pageserver = PageServerNode::from_env(&env); From 093ad8ab59c9a385183df31b595abbbe037fd5bd Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 7 Mar 2022 23:12:36 +0200 Subject: [PATCH 0027/1022] Send 409 HTTP responses on timeline and tenant creation for existing entity --- control_plane/src/storage.rs | 73 +++++++------------ pageserver/src/http/openapi_spec.yml | 29 +++++++- pageserver/src/http/routes.rs | 21 ++++-- pageserver/src/tenant_mgr.rs | 12 +-- pageserver/src/timelines.rs | 30 +++++--- .../batch_others/test_pageserver_api.py | 2 +- test_runner/batch_others/test_wal_acceptor.py | 6 +- test_runner/fixtures/zenith_fixtures.py | 37 +++++----- zenith/src/main.rs | 19 +++-- 9 files changed, 126 insertions(+), 103 deletions(-) diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 259fc79708..f6b7173067 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -145,10 +145,9 @@ impl PageServerNode { args.extend(["--create-tenant", tenant_id]) } - let initial_timeline_id_str = initial_timeline_id.map(|id| id.to_string()); - if let Some(timeline_id) = initial_timeline_id_str.as_deref() { - args.extend(["--initial-timeline-id", timeline_id]) - } + let initial_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); + let initial_timeline_id_string = initial_timeline_id.to_string(); + args.extend(["--initial-timeline-id", &initial_timeline_id_string]); let init_output = fill_rust_env_vars(cmd.args(args)) .output() @@ -158,11 +157,7 @@ impl PageServerNode { bail!("pageserver init failed"); } - if let Some(initial_timeline_id) = initial_timeline_id { - Ok(initial_timeline_id) - } else { - extract_initial_timeline_id(init_output.stdout) - } + Ok(initial_timeline_id) } pub fn repo_path(&self) -> PathBuf { @@ -337,7 +332,10 @@ impl PageServerNode { .json()?) } - pub fn tenant_create(&self, new_tenant_id: Option) -> anyhow::Result { + pub fn tenant_create( + &self, + new_tenant_id: Option, + ) -> anyhow::Result> { let tenant_id_string = self .http_request(Method::POST, format!("{}/tenant", self.http_base_url)) .json(&TenantCreateRequest { @@ -345,13 +343,18 @@ impl PageServerNode { }) .send()? .error_from_body()? 
- .json::()?; - tenant_id_string.parse().with_context(|| { - format!( - "Failed to parse tennat creation response as tenant id: {}", - tenant_id_string - ) - }) + .json::>()?; + + tenant_id_string + .map(|id| { + id.parse().with_context(|| { + format!( + "Failed to parse tennat creation response as tenant id: {}", + id + ) + }) + }) + .transpose() } pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result> { @@ -376,7 +379,7 @@ impl PageServerNode { new_timeline_id: Option, ancestor_start_lsn: Option, ancestor_timeline_id: Option, - ) -> anyhow::Result { + ) -> anyhow::Result> { let timeline_info_response = self .http_request( Method::POST, @@ -389,36 +392,10 @@ impl PageServerNode { }) .send()? .error_from_body()? - .json::()?; + .json::>()?; - TimelineInfo::try_from(timeline_info_response) + timeline_info_response + .map(TimelineInfo::try_from) + .transpose() } } - -fn extract_initial_timeline_id(init_stdout: Vec) -> anyhow::Result { - let output_string = - String::from_utf8(init_stdout).context("Init stdout is not a valid unicode")?; - - let string_with_timeline_id = match output_string.split_once("created initial timeline ") { - Some((_, string_with_timeline_id)) => string_with_timeline_id, - None => bail!( - "Found no line with timeline id in the init output: '{}'", - output_string - ), - }; - - let timeline_id_str = match string_with_timeline_id.split_once(' ') { - Some((timeline_id_str, _)) => timeline_id_str, - None => bail!( - "Found no timeline id in the init output: '{}'", - output_string - ), - }; - - timeline_id_str.parse().with_context(|| { - format!( - "Failed to parse timeline id from string, extracted from the init output: '{}'", - timeline_id_str - ) - }) -} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 25d5ceae4e..d322b051a6 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -131,7 +131,9 @@ paths: type: string format: hex post: - description: Create timeline + description: | + Create a timeline. Returns new timeline id on success.\ + If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline. requestBody: content: application/json: @@ -171,6 +173,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ForbiddenError" + "409": + description: Timeline already exists, creation skipped + content: + application/json: + schema: + $ref: "#/components/schemas/AlreadyExistsError" "500": description: Generic operation error content: @@ -208,7 +216,9 @@ paths: schema: $ref: "#/components/schemas/Error" post: - description: Create tenant + description: | + Create a tenant. Returns new tenant id on success.\ + If no new tenant id is specified in parameters, it would be generated. It's an error to recreate the same tenant. 
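# Illustrative sketch, not part of the spec: expected behaviour of repeated tenant
# creation after this change, assuming a locally running pageserver (host and port are
# placeholders); the endpoint, field name and status codes are taken from this patch.
import uuid
import requests

PAGESERVER = "http://localhost:9898"
tenant_id = uuid.uuid4().hex

first = requests.post(f"{PAGESERVER}/v1/tenant", json={"new_tenant_id": tenant_id})
assert first.status_code == 201                     # created; body is the tenant id as hex
assert uuid.UUID(first.json()) == uuid.UUID(tenant_id)

second = requests.post(f"{PAGESERVER}/v1/tenant", json={"new_tenant_id": tenant_id})
assert second.status_code == 409                    # id already exists, creation skipped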
requestBody: content: application/json: @@ -220,7 +230,7 @@ paths: format: hex responses: "201": - description: Already exists or created + description: New tenant created successfully content: application/json: schema: @@ -244,6 +254,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ForbiddenError" + "409": + description: Tenant already exists, creation skipped + content: + application/json: + schema: + $ref: "#/components/schemas/AlreadyExistsError" "500": description: Generic operation error content: @@ -311,6 +327,13 @@ components: properties: msg: type: string + AlreadyExistsError: + type: object + required: + - msg + properties: + msg: + type: string ForbiddenError: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index abc4043bdd..8365601042 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -74,7 +74,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result json_response(StatusCode::CREATED, TimelineInfoResponse::from(info))?, + None => json_response(StatusCode::CONFLICT, ())?, + }) } async fn timeline_list_handler(request: Request) -> Result, ApiError> { @@ -220,17 +224,18 @@ async fn tenant_create_handler(mut request: Request) -> Result json_response(StatusCode::CREATED, HexZTenantId::from(id))?, + None => json_response(StatusCode::CONFLICT, ())?, + }) } async fn handler_404(_: Request) -> Result, ApiError> { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 77ef865ec5..568088fc1d 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -177,10 +177,10 @@ pub fn shutdown_all_tenants() { } } -pub fn create_repository_for_tenant( +pub fn create_tenant_repository( conf: &'static PageServerConf, new_tenant_id: Option, -) -> Result { +) -> Result> { let new_tenant_id = new_tenant_id.unwrap_or_else(ZTenantId::generate); let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, new_tenant_id)); match timelines::create_repo(conf, new_tenant_id, wal_redo_manager)? { @@ -191,11 +191,13 @@ pub fn create_repository_for_tenant( state: TenantState::Idle, repo, }); + Ok(Some(new_tenant_id)) + } + None => { + debug!("repository already exists for tenant {}", new_tenant_id); + Ok(None) } - None => debug!("repository already exists for tenant {}", new_tenant_id), } - - Ok(new_tenant_id) } pub fn get_tenant_state(tenantid: ZTenantId) -> Option { diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 924c25804a..4de131ef70 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -331,17 +331,26 @@ pub(crate) fn create_timeline( new_timeline_id: Option, ancestor_timeline_id: Option, ancestor_start_lsn: Option, -) -> Result { +) -> Result> { let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { - bail!("timeline {} already exists", new_timeline_id); + match repo.get_timeline(new_timeline_id)? { + RepositoryTimeline::Local { id, .. } => { + debug!("timeline {} already exists", id); + return Ok(None); + } + RepositoryTimeline::Remote { id, .. 
} => bail!( + "timeline {} already exists in pageserver's remote storage", + id + ), + } } - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0)); - match ancestor_timeline_id { + let new_timeline_info = match ancestor_timeline_id { Some(ancestor_timeline_id) => { let ancestor_timeline = repo .get_timeline(ancestor_timeline_id) @@ -383,20 +392,17 @@ pub(crate) fn create_timeline( repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?; // load the timeline into memory let loaded_timeline = repo.get_timeline(new_timeline_id)?; - Ok(TimelineInfo::from_repo_timeline( - tenant_id, - loaded_timeline, - false, - )) + TimelineInfo::from_repo_timeline(tenant_id, loaded_timeline, false) } None => { let new_timeline = bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?; - Ok(TimelineInfo::from_dyn_timeline( + TimelineInfo::from_dyn_timeline( tenant_id, new_timeline_id, new_timeline.as_ref(), false, - )) + ) } - } + }; + Ok(Some(new_timeline_info)) } diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 41b1899882..2aa3686904 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -28,7 +28,7 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): # create timeline timeline_id = uuid4() - client.timeline_create(tenant_id=tenant_id, timeline_id=timeline_id) + client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id) timelines = client.timeline_list(tenant_id) assert len(timelines) > 0 diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 02da7ee749..bdc4c4f63c 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -57,6 +57,10 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): branch_names = [ "test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines) ] + # pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418') + # that's not really human readable, so the branch names are introduced in Zenith CLI. + # Zenith CLI stores its branch <-> timeline mapping in its internals, + # but we need this to collect metrics from other servers, related to the timeline. branch_names_to_timeline_ids = {} # start postgres on each timeline @@ -75,7 +79,7 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name]) for branch_name in branch_names ] - # All changes visible to pageserver (latest_valid_lsn) should be + # All changes visible to pageserver (last_record_lsn) should be # confirmed by safekeepers first. As we cannot atomically get # state of both pageserver and safekeepers, we should start with # pageserver. Looking at outdated data from pageserver is ok. 
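Since the HTTP API now returns LSNs as "hi/lo" hex strings, tests such as the one above
go through fixtures.utils.lsn_from_hex before comparing pageserver and safekeeper
positions. A small self-contained sketch of why that parse step matters; the helper here
is a stand-in using the usual PostgreSQL LSN encoding, not a copy of the fixture
implementation:

    def lsn_from_hex(lsn: str) -> int:
        """Parse an 'XX/YYYYYYYY' LSN string into a comparable integer (assumed encoding)."""
        hi, lo = lsn.split("/")
        return (int(hi, 16) << 32) + int(lo, 16)

    # Comparing the raw strings orders LSNs incorrectly...
    assert "0/9000000" > "0/10000000"
    # ...so checks like "safekeeper flush_lsn >= pageserver last_record_lsn"
    # must be done on the parsed integers instead.
    assert lsn_from_hex("0/9000000") < lsn_from_hex("0/10000000")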
diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index e2c9f16630..ec570a7dac 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -725,20 +725,23 @@ class ZenithPageserverHttpClient(requests.Session): def timeline_create( self, tenant_id: uuid.UUID, - timeline_id: Optional[uuid.UUID] = None, + new_timeline_id: Optional[uuid.UUID] = None, ancestor_timeline_id: Optional[uuid.UUID] = None, ancestor_start_lsn: Optional[str] = None, ) -> Dict[Any, Any]: res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline", json={ 'new_timeline_id': - timeline_id.hex if timeline_id else None, + new_timeline_id.hex if new_timeline_id else None, 'ancestor_start_lsn': ancestor_start_lsn, 'ancestor_timeline_id': ancestor_timeline_id.hex if ancestor_timeline_id else None, }) self.verbose_error(res) + if res.status_code == 409: + raise Exception(f'could not create timeline: already exists for id {new_timeline_id}') + res_json = res.json() assert isinstance(res_json, dict) return res_json @@ -750,14 +753,16 @@ class ZenithPageserverHttpClient(requests.Session): assert isinstance(res_json, list) return res_json - def tenant_create(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: + def tenant_create(self, new_tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ - 'new_tenant_id': tenant_id.hex if tenant_id else None, + 'new_tenant_id': new_tenant_id.hex if new_tenant_id else None, }, ) self.verbose_error(res) + if res.status_code == 409: + raise Exception(f'could not create tenant: already exists for id {new_tenant_id}') new_tenant_id = res.json() assert isinstance(new_tenant_id, str) return uuid.UUID(new_tenant_id) @@ -806,6 +811,13 @@ class S3Storage: RemoteStorage = Union[LocalFsStorage, S3Storage] +CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P[^']+)'", + re.MULTILINE) +CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P[^']+)'", + re.MULTILINE) +TIMELINE_DATA_EXTRACTOR = re.compile(r"\s(?P[^\s]+)\s\[(?P[^\]]+)\]", + re.MULTILINE) + class ZenithCli: """ @@ -846,18 +858,13 @@ class ZenithCli: res = self.raw_cli(cmd) res.check_returncode() - create_timeline_id_extractor = re.compile(r"^Created timeline '(?P[^']+)'", - re.MULTILINE) - matches = create_timeline_id_extractor.search(res.stdout) + matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) created_timeline_id = None if matches is not None: created_timeline_id = matches.group('timeline_id') - if created_timeline_id is None: - raise Exception('could not find timeline id after `zenith timeline create` invocation') - else: - return uuid.UUID(created_timeline_id) + return uuid.UUID(created_timeline_id) def create_branch(self, new_branch_name: str = DEFAULT_BRANCH_NAME, @@ -880,9 +887,7 @@ class ZenithCli: res = self.raw_cli(cmd) res.check_returncode() - create_timeline_id_extractor = re.compile(r"^Created timeline '(?P[^']+)'", - re.MULTILINE) - matches = create_timeline_id_extractor.search(res.stdout) + matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) created_timeline_id = None if matches is not None: @@ -900,13 +905,11 @@ class ZenithCli: # (L) main [b49f7954224a0ad25cc0013ea107b54b] # (L) ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] - timeline_data_extractor = re.compile( - r"\s(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE) res = self.raw_cli( ['timeline', 'list', '--tenant-id', (tenant_id or 
self.env.initial_tenant).hex]) timelines_cli = sorted( map(lambda branch_and_id: (branch_and_id[0], branch_and_id[1]), - timeline_data_extractor.findall(res.stdout))) + TIMELINE_DATA_EXTRACTOR.findall(res.stdout))) return timelines_cli def init(self, diff --git a/zenith/src/main.rs b/zenith/src/main.rs index 165a7d7950..dd35427d5d 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -522,7 +522,11 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re } Some(("create", create_match)) => { let initial_tenant_id = parse_tenant_id(create_match)?; - let new_tenant_id = pageserver.tenant_create(initial_tenant_id)?; + let new_tenant_id = pageserver + .tenant_create(initial_tenant_id)? + .ok_or_else(|| { + anyhow!("Tenant with id {:?} was already created", initial_tenant_id) + })?; println!( "tenant {} successfully created on the pageserver", new_tenant_id @@ -548,7 +552,9 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let new_branch_name = create_match .value_of("branch-name") .ok_or(anyhow!("No branch name provided"))?; - let timeline = pageserver.timeline_create(tenant_id, None, None, None)?; + let timeline = pageserver + .timeline_create(tenant_id, None, None, None)? + .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; let new_timeline_id = timeline.timeline_id(); let last_record_lsn = match timeline { @@ -593,12 +599,9 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - .map(Lsn::from_str) .transpose() .context("Failed to parse ancestor start Lsn from the request")?; - let timeline = pageserver.timeline_create( - tenant_id, - None, - start_lsn, - Some(ancestor_timeline_id), - )?; + let timeline = pageserver + .timeline_create(tenant_id, None, start_lsn, Some(ancestor_timeline_id))? + .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; let new_timeline_id = timeline.timeline_id(); let last_record_lsn = match timeline { From f67d010d1bf3678eb8a287d47bac3fe1eed3e8cc Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Mon, 21 Feb 2022 13:40:25 -0800 Subject: [PATCH 0028/1022] Add ps smgr/storage metrics tenant tags Signed-off-by: Dhammika Pathirana Add tenant_id,timeline_id in smgr/storage metrics (#1234) --- pageserver/src/layered_repository.rs | 42 ++++++++++++++++++---------- pageserver/src/page_service.rs | 10 ++++--- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index c3d42d1829..63ade9bb37 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -47,10 +47,8 @@ use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::{ZTenantId, ZTimelineId}; -use zenith_metrics::{ - register_histogram, register_int_gauge_vec, Histogram, IntGauge, IntGaugeVec, -}; use zenith_metrics::{register_histogram_vec, HistogramVec}; +use zenith_metrics::{register_int_gauge_vec, IntGauge, IntGaugeVec}; use zenith_utils::crashsafe_dir; use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; use zenith_utils::seqwait::SeqWait; @@ -87,16 +85,17 @@ lazy_static! { static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( "pageserver_storage_time", "Time spent on storage operations", - &["operation"] + &["operation", "tenant_id", "timeline_id"] ) .expect("failed to define a metric"); } // Metrics collected on operations on the storage repository. lazy_static! 
{ - static ref RECONSTRUCT_TIME: Histogram = register_histogram!( + static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( "pageserver_getpage_reconstruct_time", - "FIXME Time spent on storage operations" + "FIXME Time spent on storage operations", + &["tenant_id", "timeline_id"] ) .expect("failed to define a metric"); } @@ -248,11 +247,19 @@ impl Repository for LayeredRepository { horizon: u64, checkpoint_before_gc: bool, ) -> Result { - STORAGE_TIME - .with_label_values(&["gc"]) - .observe_closure_duration(|| { - self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) - }) + if let Some(timeline_id) = target_timelineid { + STORAGE_TIME + .with_label_values(&["gc", &self.tenantid.to_string(), &timeline_id.to_string()]) + .observe_closure_duration(|| { + self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) + }) + } else { + STORAGE_TIME + .with_label_values(&["gc", &self.tenantid.to_string(), "-"]) + .observe_closure_duration(|| { + self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) + }) + } } fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()> { @@ -859,7 +866,11 @@ impl Timeline for LayeredTimeline { let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { + let tenant_id = self.tenantid.to_string(); + let timeline_id = self.timelineid.to_string(); + RECONSTRUCT_TIME + .with_label_values(&[&tenant_id, &timeline_id]) .observe_closure_duration(|| self.materialize_page(seg, seg_blknum, lsn, &*layer)) } else { // FIXME: This can happen if PostgreSQL extends a relation but never writes @@ -1009,15 +1020,18 @@ impl Timeline for LayeredTimeline { /// checkpoint_internal function, this public facade just wraps it for /// metrics collection. fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> { + let tenant_id = self.tenantid.to_string(); + let timeline_id = self.timelineid.to_string(); + match cconf { CheckpointConfig::Flush => STORAGE_TIME - .with_label_values(&["flush checkpoint"]) + .with_label_values(&["flush checkpoint", &tenant_id, &timeline_id]) .observe_closure_duration(|| self.checkpoint_internal(0, false)), CheckpointConfig::Forced => STORAGE_TIME - .with_label_values(&["forced checkpoint"]) + .with_label_values(&["forced checkpoint", &tenant_id, &timeline_id]) .observe_closure_duration(|| self.checkpoint_internal(0, true)), CheckpointConfig::Distance(distance) => STORAGE_TIME - .with_label_values(&["checkpoint"]) + .with_label_values(&["checkpoint", &tenant_id, &timeline_id]) .observe_closure_duration(|| self.checkpoint_internal(distance, true)), } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7dc3c8c752..42a099cca5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -298,7 +298,7 @@ lazy_static! 
{ static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!( "pageserver_smgr_query_time", "Time spent on smgr query handling", - &["smgr_query_type"], + &["smgr_query_type", "tenant_id", "timeline_id"], TIME_BUCKETS.into() ) .expect("failed to define a metric"); @@ -340,20 +340,22 @@ impl PageServerHandler { }; let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; + let tenant_id = tenantid.to_string(); + let timeline_id = timelineid.to_string(); let response = match zenith_fe_msg { PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_exists"]) + .with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]) .observe_closure_duration(|| { self.handle_get_rel_exists_request(timeline.as_ref(), &req) }), PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_size"]) + .with_label_values(&["get_rel_size", &tenant_id, &timeline_id]) .observe_closure_duration(|| { self.handle_get_nblocks_request(timeline.as_ref(), &req) }), PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME - .with_label_values(&["get_page_at_lsn"]) + .with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]) .observe_closure_duration(|| { self.handle_get_page_at_lsn_request(timeline.as_ref(), &req) }), From 27dadba52c7543b9bd49b8c506fa74a1587df543 Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Fri, 25 Feb 2022 14:22:48 -0800 Subject: [PATCH 0029/1022] Fix retain references to layer histograms Signed-off-by: Dhammika Pathirana --- pageserver/src/layered_repository.rs | 80 ++++++++++++++++++---------- 1 file changed, 52 insertions(+), 28 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 63ade9bb37..a6e61cb9e0 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -47,7 +47,7 @@ use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::{ZTenantId, ZTimelineId}; -use zenith_metrics::{register_histogram_vec, HistogramVec}; +use zenith_metrics::{register_histogram_vec, Histogram, HistogramVec}; use zenith_metrics::{register_int_gauge_vec, IntGauge, IntGaugeVec}; use zenith_utils::crashsafe_dir; use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; @@ -247,19 +247,15 @@ impl Repository for LayeredRepository { horizon: u64, checkpoint_before_gc: bool, ) -> Result { - if let Some(timeline_id) = target_timelineid { - STORAGE_TIME - .with_label_values(&["gc", &self.tenantid.to_string(), &timeline_id.to_string()]) - .observe_closure_duration(|| { - self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) - }) - } else { - STORAGE_TIME - .with_label_values(&["gc", &self.tenantid.to_string(), "-"]) - .observe_closure_duration(|| { - self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) - }) - } + let timeline_str = target_timelineid + .map(|x| x.to_string()) + .unwrap_or_else(|| "-".to_string()); + + STORAGE_TIME + .with_label_values(&["gc", &self.tenantid.to_string(), &timeline_str]) + .observe_closure_duration(|| { + self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) + }) } fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()> { @@ -788,6 +784,12 @@ pub struct LayeredTimeline { // ordering for its operations, but involves private modules, and macro trickery current_logical_size_gauge: IntGauge, + // Metrics histograms + reconstruct_time_histo: Histogram, + checkpoint_time_histo: Histogram, + flush_checkpoint_time_histo: Histogram, + 
forced_checkpoint_time_histo: Histogram, + /// If `true`, will backup its files that appear after each checkpointing to the remote storage. upload_relishes: AtomicBool, @@ -866,11 +868,7 @@ impl Timeline for LayeredTimeline { let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - let tenant_id = self.tenantid.to_string(); - let timeline_id = self.timelineid.to_string(); - - RECONSTRUCT_TIME - .with_label_values(&[&tenant_id, &timeline_id]) + self.reconstruct_time_histo .observe_closure_duration(|| self.materialize_page(seg, seg_blknum, lsn, &*layer)) } else { // FIXME: This can happen if PostgreSQL extends a relation but never writes @@ -1020,18 +1018,15 @@ impl Timeline for LayeredTimeline { /// checkpoint_internal function, this public facade just wraps it for /// metrics collection. fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> { - let tenant_id = self.tenantid.to_string(); - let timeline_id = self.timelineid.to_string(); - match cconf { - CheckpointConfig::Flush => STORAGE_TIME - .with_label_values(&["flush checkpoint", &tenant_id, &timeline_id]) + CheckpointConfig::Flush => self + .flush_checkpoint_time_histo .observe_closure_duration(|| self.checkpoint_internal(0, false)), - CheckpointConfig::Forced => STORAGE_TIME - .with_label_values(&["forced checkpoint", &tenant_id, &timeline_id]) + CheckpointConfig::Forced => self + .forced_checkpoint_time_histo .observe_closure_duration(|| self.checkpoint_internal(0, true)), - CheckpointConfig::Distance(distance) => STORAGE_TIME - .with_label_values(&["checkpoint", &tenant_id, &timeline_id]) + CheckpointConfig::Distance(distance) => self + .checkpoint_time_histo .observe_closure_duration(|| self.checkpoint_internal(distance, true)), } } @@ -1130,6 +1125,31 @@ impl LayeredTimeline { let current_logical_size_gauge = LOGICAL_TIMELINE_SIZE .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) .unwrap(); + let reconstruct_time_histo = RECONSTRUCT_TIME + .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .unwrap(); + let checkpoint_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "checkpoint", + &tenantid.to_string(), + &timelineid.to_string(), + ]) + .unwrap(); + let flush_checkpoint_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "flush checkpoint", + &tenantid.to_string(), + &timelineid.to_string(), + ]) + .unwrap(); + let forced_checkpoint_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "forced checkpoint", + &tenantid.to_string(), + &timelineid.to_string(), + ]) + .unwrap(); + LayeredTimeline { conf, timelineid, @@ -1149,6 +1169,10 @@ impl LayeredTimeline { ancestor_lsn: metadata.ancestor_lsn(), current_logical_size: AtomicUsize::new(current_logical_size), current_logical_size_gauge, + reconstruct_time_histo, + checkpoint_time_histo, + flush_checkpoint_time_histo, + forced_checkpoint_time_histo, upload_relishes: AtomicBool::new(upload_relishes), write_lock: Mutex::new(()), From b2ad8342d21521226160416efe8e330cf1655852 Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Mon, 28 Feb 2022 16:37:09 -0800 Subject: [PATCH 0030/1022] Add zid stringify bench test Signed-off-by: Dhammika Pathirana --- zenith_utils/Cargo.toml | 5 +++++ zenith_utils/benches/benchmarks.rs | 22 ++++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 zenith_utils/benches/benchmarks.rs diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml index 
b22fcbf748..daaf345f8f 100644 --- a/zenith_utils/Cargo.toml +++ b/zenith_utils/Cargo.toml @@ -37,3 +37,8 @@ bytes = "1.0.1" hex-literal = "0.3" tempfile = "3.2" webpki = "0.21" +criterion = "0.3" + +[[bench]] +name = "benchmarks" +harness = false diff --git a/zenith_utils/benches/benchmarks.rs b/zenith_utils/benches/benchmarks.rs new file mode 100644 index 0000000000..c945d5021c --- /dev/null +++ b/zenith_utils/benches/benchmarks.rs @@ -0,0 +1,22 @@ +#![allow(unused)] + +use criterion::{criterion_group, criterion_main, Criterion}; +use zenith_utils::zid; + +pub fn bench_zid_stringify(c: &mut Criterion) { + // Can only use public methods. + let ztl = zid::ZTenantTimelineId::generate(); + + c.bench_function("zid.to_string", |b| { + b.iter(|| { + // FIXME measurement overhead? + //for _ in 0..1000 { + // ztl.tenant_id.to_string(); + //} + ztl.tenant_id.to_string(); + }) + }); +} + +criterion_group!(benches, bench_zid_stringify); +criterion_main!(benches); From a8a7dc9ca65352ad738e55a3a26a7171a89db17b Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Tue, 1 Mar 2022 14:28:25 -0800 Subject: [PATCH 0031/1022] Fix zid encoding Signed-off-by: Dhammika Pathirana --- zenith_utils/src/zid.rs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/zenith_utils/src/zid.rs b/zenith_utils/src/zid.rs index a740d4fb48..e047e38da7 100644 --- a/zenith_utils/src/zid.rs +++ b/zenith_utils/src/zid.rs @@ -112,6 +112,17 @@ impl ZId { rand::thread_rng().fill(&mut tli_buf); ZId::from(tli_buf) } + + fn hex_encode(&self) -> String { + static HEX: &[u8] = b"0123456789abcdef"; + + let mut buf = vec![0u8; self.0.len() * 2]; + for (&b, chunk) in self.0.as_ref().iter().zip(buf.chunks_exact_mut(2)) { + chunk[0] = HEX[((b >> 4) & 0xf) as usize]; + chunk[1] = HEX[(b & 0xf) as usize]; + } + unsafe { String::from_utf8_unchecked(buf) } + } } impl FromStr for ZId { @@ -147,13 +158,13 @@ impl From<[u8; 16]> for ZId { impl fmt::Display for ZId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(&hex::encode(self.0)) + f.write_str(&self.hex_encode()) } } impl fmt::Debug for ZId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(&hex::encode(self.0)) + f.write_str(&self.hex_encode()) } } From 5d7bd8643ade07d0e8a1f2ee8c9b535336b65e90 Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Wed, 2 Mar 2022 14:50:22 -0800 Subject: [PATCH 0032/1022] Fix page reconstruct time histo Signed-off-by: Dhammika Pathirana --- pageserver/src/layered_repository.rs | 29 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index a6e61cb9e0..9e0df5dab2 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -94,7 +94,7 @@ lazy_static! { lazy_static! { static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( "pageserver_getpage_reconstruct_time", - "FIXME Time spent on storage operations", + "Time spent on storage operations", &["tenant_id", "timeline_id"] ) .expect("failed to define a metric"); @@ -868,8 +868,7 @@ impl Timeline for LayeredTimeline { let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? 
{ - self.reconstruct_time_histo - .observe_closure_duration(|| self.materialize_page(seg, seg_blknum, lsn, &*layer)) + self.materialize_page(seg, seg_blknum, lsn, &*layer) } else { // FIXME: This can happen if PostgreSQL extends a relation but never writes // the page. See https://github.com/zenithdb/zenith/issues/841 @@ -2022,17 +2021,19 @@ impl LayeredTimeline { let mut layer_ref = layer; let mut curr_lsn = lsn; loop { - let result = layer_ref - .get_page_reconstruct_data(seg_blknum, curr_lsn, &mut data) - .with_context(|| { - format!( - "Failed to get reconstruct data {} {:?} {} {}", - layer_ref.get_seg_tag(), - layer_ref.filename(), - seg_blknum, - curr_lsn, - ) - })?; + let result = self.reconstruct_time_histo.observe_closure_duration(|| { + layer_ref + .get_page_reconstruct_data(seg_blknum, curr_lsn, &mut data) + .with_context(|| { + format!( + "Failed to get reconstruct data {} {:?} {} {}", + layer_ref.get_seg_tag(), + layer_ref.filename(), + seg_blknum, + curr_lsn, + ) + }) + })?; match result { PageReconstructResult::Complete => break, PageReconstructResult::Continue(cont_lsn) => { From d93fc371f348919ed728fd8539f34e4d0a270f9d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 11 Mar 2022 18:49:36 +0200 Subject: [PATCH 0033/1022] Import all existing RFCs documents from the separate 'rfcs' repository. --- docs/rfcs/002-storage.md | 186 ++++++++++++ docs/rfcs/003-laptop-cli.md | 267 ++++++++++++++++++ docs/rfcs/004-durability.md | 218 ++++++++++++++ docs/rfcs/005-zenith_local.md | 103 +++++++ docs/rfcs/006-laptop-cli-v2-CLI.md | 64 +++++ .../006-laptop-cli-v2-repository-structure.md | 140 +++++++++ docs/rfcs/007-serverless-on-laptop.md | 93 ++++++ docs/rfcs/008-push-pull.md | 66 +++++ docs/rfcs/009-snapshot-first-storage-cli.md | 56 ++++ docs/rfcs/009-snapshot-first-storage-pitr.md | 227 +++++++++++++++ docs/rfcs/009-snapshot-first-storage.md | 148 ++++++++++ docs/rfcs/010-storage_details.md | 144 ++++++++++ docs/rfcs/011-retention-policy.md | 91 ++++++ docs/rfcs/012-background-tasks.md | 38 +++ docs/rfcs/013-term-history.md | 147 ++++++++++ docs/rfcs/README.md | 95 +++++++ docs/rfcs/images/storage.jpeg | Bin 0 -> 431075 bytes 17 files changed, 2083 insertions(+) create mode 100644 docs/rfcs/002-storage.md create mode 100644 docs/rfcs/003-laptop-cli.md create mode 100644 docs/rfcs/004-durability.md create mode 100644 docs/rfcs/005-zenith_local.md create mode 100644 docs/rfcs/006-laptop-cli-v2-CLI.md create mode 100644 docs/rfcs/006-laptop-cli-v2-repository-structure.md create mode 100644 docs/rfcs/007-serverless-on-laptop.md create mode 100644 docs/rfcs/008-push-pull.md create mode 100644 docs/rfcs/009-snapshot-first-storage-cli.md create mode 100644 docs/rfcs/009-snapshot-first-storage-pitr.md create mode 100644 docs/rfcs/009-snapshot-first-storage.md create mode 100644 docs/rfcs/010-storage_details.md create mode 100644 docs/rfcs/011-retention-policy.md create mode 100644 docs/rfcs/012-background-tasks.md create mode 100644 docs/rfcs/013-term-history.md create mode 100644 docs/rfcs/README.md create mode 100644 docs/rfcs/images/storage.jpeg diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md new file mode 100644 index 0000000000..5cac377272 --- /dev/null +++ b/docs/rfcs/002-storage.md @@ -0,0 +1,186 @@ +# Zenith storage node — alternative + +## **Design considerations** + +Simplify storage operations for people => Gain adoption/installs on laptops and small private installation => Attract customers to DBaaS by seamless integration between our tooling and 
cloud. + +Proposed architecture addresses: + +- High availability -- tolerates n/2 - 1 failures +- Multi-tenancy -- one storage for all databases +- Elasticity -- increase storage size on the go by adding nodes +- Snapshots / backups / PITR with S3 offload +- Compression + +Minuses are: + +- Quite a lot of work +- Single page access may touch few disk pages +- Some bloat in data — may slowdown sequential scans + +## **Summary** + +Storage cluster is sharded key-value store with ordered keys. Key (****page_key****) is a tuple of `(pg_id, db_id, timeline_id, rel_id, forkno, segno, pageno, lsn)`. Value is either page or page diff/wal. Each chunk (chunk == shard) stores approx 50-100GB ~~and automatically splits in half when grows bigger then soft 100GB limit~~. by having a fixed range of pageno's it is responsible for. Chunks placement on storage nodes is stored in a separate metadata service, so chunk can be freely moved around the cluster if it is need. Chunk itself is a filesystem directory with following sub directories: + +``` + +|-chunk_42/ + |-store/ -- contains lsm with pages/pagediffs ranging from + | page_key_lo to page_key_hi + |-wal/ + | |- db_1234/ db-specific wal files with pages from page_key_lo + | to page_key_hi + | + |-chunk.meta -- small file with snapshot references + (page_key_prefix+lsn+name) + and PITR regions (page_key_start, page_key_end) +``` + +## **Chunk** + +Chunk is responsible for storing pages potentially from different databases and relations. Each page is addressed by a lexicographically ordered tuple (****page_key****) with following fields: + +- `pg_id` -- unique id of given postgres instance (or postgres cluster as it is called in postgres docs) +- `db_id` -- database that was created by 'CREATE DATABASE' in a given postgres instance +- `db_timeline` -- used to create Copy-on-Write instances from snapshots, described later +- `rel_id` -- tuple of (relation_id, 0) for tables and (indexed_relation_id, rel_id) for indices. Done this way so table indices were closer to table itself on our global key space. +- `(forkno, segno, pageno)` -- page coordinates in postgres data files +- `lsn_timeline` -- postgres feature, increments when PITR was done. +- `lsn` -- lsn of current page version. + +Chunk stores pages and page diffs ranging from page_key_lo to page_key_hi. Processing node looks at page in wal record and sends record to a chunk responsible for this page range. When wal record arrives to a chunk it is initially stored in `chunk_id/wal/db_id/wal_segno.wal`. Then background process moves records from that wal files to the lsm tree in `chunk_id/store`. Or, more precisely, wal records would be materialized into lsm memtable and when that memtable is flushed to SSTable on disk we may trim the wal. That way some not durably (in the distributed sense) committed pages may enter the tree -- here we rely on processing node behavior: page request from processing node should contain proper lsm horizons so that storage node may respond with proper page version. + +LSM here is a usual LSM for variable-length values: at first data is stored in memory (we hold incoming wal records to be able to regenerate it after restart) at some balanced tree. When this tree grows big enough we dump it into disk file (SSTable) sorting records by key. Then SStables are mergesorted in the background to a different files. All file operation are sequential and do not require WAL for durability. + +Content of SSTable can be following: + +```jsx +(pg_id, db_id, ... 
, pageno=42, lsn=100) (full 8k page data) +(pg_id, db_id, ... , pageno=42, lsn=150) (per-page diff) +(pg_id, db_id, ... , pageno=42, lsn=180) (per-page diff) +(pg_id, db_id, ... , pageno=42, lsn=200) (per-page diff) +(pg_id, db_id, ... , pageno=42, lsn=220) (full 8k page data) +(pg_id, db_id, ... , pageno=42, lsn=250) (per-page diff) +(pg_id, db_id, ... , pageno=42, lsn=270) (per-page diff) +(pg_id, db_id, ... , pageno=5000, lsn=100) (full 8k page data) +``` + +So query for `pageno=42 up to lsn=260` would need to find closest entry less then this key, iterate back to the latest full page and iterate forward to apply diffs. How often page is materialized in lsn-version sequence is up to us -- let's say each 5th version should be a full page. + +### **Page deletion** + +To delete old pages we insert blind deletion marker `(pg_id, db_id, #trim_lsn < 150)` into a lsm tree. During merges such marker would indicate that all pages with smaller lsn should be discarded. Delete marker will travel down the tree levels hierarchy until it reaches last level. In non-PITR scenario where old page version are not needed at all such deletion marker would (in average) prevent old page versions propagation down the tree -- so all bloat would concentrate at higher tree layers without affecting bigger bottom layers. + +### **Recovery** + +Upon storage node restart recent WAL files are applied to appropriate pages and resulting pages stored in lsm memtable. So this should be fast since we are not writing anything to disk. + +### **Checkpointing** + +No such mechanism is needed. Or we may look at the storage node as at kind of continuous chekpointer. + +### **Full page writes (torn page protection)** + +Storage node never updates individual pages, only merges SSTable, so torn pages is not an issue. + +### **Snapshot** + +That is the part that I like about this design -- snapshot creation is instant and cheap operation that can have flexible granularity level: whole instance, database, table. Snapshot creation inserts a record in `chunk.meta` file with lsn of this snapshot and key prefix `(pg_id, db_id, db_timeline, rel_id, *)` that prohibits pages deletion within this range. Storage node may not know anything about page internals, but by changing number of fields in our prefix we may change snapshot granularity. + +It is again useful to remap `rel_id` to `(indexed_relation_id, rel_id)` so that snapshot of relation would include it's indices. Also table snapshot would trickily interact with catalog. Probably all table snapshots should hold also a catalog snapshot. And when node is started with such snapshot it should check that only tables from snapshot are queried. I assume here that for snapshot reading one need to start a new postgres instance. + +Storage consumed by snapshot is proportional to the amount of data changed. We may have some heuristic (calculated based on cost of different storages) about when to offload old snapshot to s3. For example, if current database has more then 40% of changed pages with respect to previous snapshot then we may offload that snapshot to s3, and release this space. + +**Starting db from snapshot** + +When we are starting database from snapshot it can be done in two ways. First, we may create new db_id, move all the data from snapshot to a new db and start a database. Second option is to create Copy-on-Write (CoW) instance out of snapshot and read old pages from old snapshot and store new pages separately. 
That is why there is `db_timeline` key field near `db_id` -- CoW (🐮) database should create new `db_timeline` and remember old `db_timeline`. Such a database can have hashmap of pages that it is changed to query pages from proper snapshot on the first try. `db_timeline` is located near `db_id` so that new page versions generated by new instance would not bloat data of initial snapshot. It is not clear for whether it is possibly to effectively support "stacked" CoW snapshot, so we may disallow them. (Well, one way to support them is to move `db_timeline` close to `lsn` -- so we may scan neighboring pages and find right one. But again that way we bloat snapshot with unrelated data and may slowdown full scans that are happening in different database). + +**Snapshot export/import** + +Once we may start CoW instances it is easy to run auxiliary postgres instance on this snapshot and run `COPY FROM (...) TO stdout` or `pg_dump` and export data from the snapshot to some portable formats. Also we may start postgres on a new empty database and run `COPY FROM stdin`. This way we can initialize new non-CoW databases and transfer snapshots via network. + +### **PITR area** + +In described scheme PITR is just a prohibition to delete any versions within some key prefix, either it is a database or a table key prefix. So PITR may have different settings for different tables, databases, etc. + +PITR is quite bloaty, so we may aggressively offload it to s3 -- we may push same (or bigger) SSTables to s3 and maintain lsm structure there. + +### **Compression** + +Since we are storing page diffs of variable sizes there is no structural dependency on a page size and we may compress it. Again that could be enabled only on pages with some key prefixes, so we may have this with db/table granularity. + +### **Chunk metadata** + +Chunk metadata is a file lies in chunk directory that stores info about current snapshots and PITR regions. Chunck should always consult this data when merging SSTables and applying delete markers. + +### **Chunk splitting** + +*(NB: following paragraph is about how to avoid page splitting)* + +When chunks hits some soft storage limit (let's say 100Gb) it should be split in half and global matadata about chunk boundaries should be updated. Here i assume that chunk split is a local operation happening on single node. Process of chink splitting should look like following: + +1. Find separation key and spawn two new chunks with [lo, mid) [mid, hi) boundaries. + +2. Prohibit WAL deletion and old SSTables deletion on original chunk. + +3. On each lsm layer we would need to split only one SSTable, all other would fit within left or right range. Symlink/split that files to new chunks. + +4. Start WAL replay on new chunks. + +5. Update global metadata about new chunk boundaries. + +6. Eventually (metadata update should be pushed to processing node by metadata service) storage node will start sending WAL and page requests to the new nodes. + +7. New chunk may start serving read queries when following conditions are met: + +a) it receives at least on WAL record from processing node + +b) it replayed all WAL up to the new received one + +c) checked by downlinks that there were no WAL gaps. + +Chunk split as it is described here is quite fast operation when it is happening on the local disk -- vast majority of files will be just moved without copying anything. I suggest to keep split always local and not to mix it with chunk moving around cluster. 
So if we want to split some chunk but there is small amount of free space left on the device, we should first move some chunks away from the node and then proceed with splitting. + +### Fixed chunks + +Alternative strategy is to not to split at all and have pageno-fixed chunk boundaries. When table is created we first materialize this chunk by storing first new pages only and chunks is small. Then chunk is growing while table is filled, but it can't grow substantially bigger then allowed pageno range, so at max it would be 1GB or whatever limit we want + some bloat due to snapshots and old page versions. + +### **Chunk lsm internals** + +So how to implement chunk's lsm? + +- Write from scratch and use RocksDB to prototype/benchmark, then switch to own lsm implementation. RocksDB can provide some sanity check for performance of home-brewed implementation and it would be easier to prototype. +- Use postgres as lego constructor. We may model memtable with postgres B-tree referencing some in-memory log of incoming records. SSTable merging may reuse postgres external merging algorithm, etc. One thing that would definitely not fit (or I didn't came up with idea how to fit that) -- is multi-tenancy. If we are storing pages from different databases we can't use postgres buffer pool, since there is no db_id in the page header. We can add new field there but IMO it would be no go for committing that to vanilla. + +Other possibility is to not to try to fit few databases in one storage node. But that way it is no go for multi-tenant cloud installation: we would need to run a lot of storage node instances on one physical storage node, all with it own local page cache. So that would be much closer to ordinary managed RDS. + +Multi-tenant storage makes sense even on a laptop, when you work with different databases, running tests with temp database, etc. And when installation grows bigger it start to make more and more sense, so it seems important. + +# Storage fleet + +# **Storage fleet** + +- When database is smaller then a chunk size we naturally can store them in one chunk (since their page_key would fit in some chunk's [hi, lo) range). + +Screenshot_2021-02-22_at_16 49 17 + +Few databases are stored in one chunk, replicated three times + +- When database can't fit into one storage node it can occupy lots of chunks that were split while database was growing. Chunk placement on nodes is controlled by us with some automatization, but we alway may manually move chunks around the cluster. + +Screenshot_2021-02-22_at_16 49 10 + +Here one big database occupies two set of nodes. Also some chunks were moved around to restore replication factor after disk failure. In this case we also have "sharded" storage for a big database and issue wal writes to different chunks in parallel. + +## **Chunk placement strategies** + +There are few scenarios where we may want to move chunks around the cluster: + +- disk usage on some node is big +- some disk experienced a failure +- some node experienced a failure or need maintenance + +## **Chunk replication** + +Chunk replication may be done by cloning page ranges with respect to some lsn from peer nodes, updating global metadata, waiting for WAL to come, replaying previous WAL and becoming online -- more or less like during chunk split. 
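+
+## **Read path sketch**
+
+To make the page lookup described in the **Chunk** section more concrete, here is a minimal sketch of the read path: find the newest materialized page at or below the requested lsn, then replay the later per-page diffs on top of it. This is an illustration only -- it assumes an ordered map keyed by `(pageno, lsn)`, while the real key is the full page_key tuple and "apply diff" is WAL redo.
+
+```rust
+// Sketch of "give me pageno at lsn": walk back to the latest full image,
+// then apply the skipped diffs oldest-first. Hypothetical types, not the
+// actual storage node code.
+use std::collections::BTreeMap;
+use std::ops::Bound;
+
+enum PageVersion {
+    Full(Vec<u8>), // materialized 8k page image
+    Diff(Vec<u8>), // per-page diff / wal record
+}
+
+type Key = (u32, u64); // (pageno, lsn); the real key carries pg_id, db_id, rel_id, ...
+
+fn get_page_at_lsn(store: &BTreeMap<Key, PageVersion>, pageno: u32, lsn: u64) -> Option<Vec<u8>> {
+    // All versions of this page up to the requested lsn, newest first.
+    let versions: Vec<_> = store
+        .range((Bound::Included((pageno, 0)), Bound::Included((pageno, lsn))))
+        .rev()
+        .collect();
+
+    // Walk back until the newest full image, remembering the diffs we skip over.
+    let mut diffs = Vec::new();
+    let mut page = None;
+    for (_, version) in versions {
+        match version {
+            PageVersion::Full(img) => {
+                page = Some(img.clone());
+                break;
+            }
+            PageVersion::Diff(d) => diffs.push(d),
+        }
+    }
+
+    // Replay the skipped diffs oldest-first on top of the full image.
+    let mut page = page?;
+    for d in diffs.into_iter().rev() {
+        apply_diff(&mut page, d); // stands in for WAL redo of a per-page record
+    }
+    Some(page)
+}
+
+fn apply_diff(_page: &mut Vec<u8>, _diff: &[u8]) {
+    // Elided: in the real system this is WAL redo.
+}
+```
+
+Because (say) every 5th version in the tree is a full page, the loop above never has to replay more than a handful of diffs per lookup, which bounds read amplification.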
+ diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md new file mode 100644 index 0000000000..4d1f0a68f0 --- /dev/null +++ b/docs/rfcs/003-laptop-cli.md @@ -0,0 +1,267 @@ +# Command line interface (end-user) + +Zenith CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside zenith distribution at least at the start. + +This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots. + +The most important concept here is a snapshot, which can be created/pushed/pulled/exported. Also, we may start temporary read-only postgres instance over any local snapshot. A more complex scenario would consist of several basic operations over snapshots. + +# Possible usage scenarios + +## Install zenith, run a postgres + +``` +> brew install pg-zenith +> zenith pg create # creates pgdata with default pattern pgdata$i +> zenith pg list +ID PGDATA USED STORAGE ENDPOINT +primary1 pgdata1 0G zenith-local localhost:5432 +``` + +## Import standalone postgres to zenith + +``` +> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg +[====================------------] 60% | 20MB/s +> zenith snapshot list +ID SIZE PARENT +oldpg 5G - + +> zenith pg create --snapshot oldpg +Started postgres on localhost:5432 + +> zenith pg list +ID PGDATA USED STORAGE ENDPOINT +primary1 pgdata1 5G zenith-local localhost:5432 + +> zenith snapshot destroy oldpg +Ok +``` + +Also, we may start snapshot import implicitly by looking at snapshot schema + +``` +> zenith pg create --snapshot basebackup://replication@localhost:5432/ +Downloading snapshot... Done. +Started postgres on localhost:5432 +Destroying snapshot... Done. +``` + +## Pull snapshot with some publicly shared database + +Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage). + +``` +> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies +``` + +## Create snapshot and push it to the cloud + +``` +> zenith snapshot create pgdata1@snap1 +> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1 +``` + +## Rollback database to the snapshot + +One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `zenith pg checkout`. + +``` +> zenith pg list +ID PGDATA USED STORAGE ENDPOINT +primary1 pgdata1 5G zenith-local localhost:5432 + +> zenith snapshot create pgdata1@snap1 + +> zenith snapshot list +ID SIZE PARENT +oldpg 5G - +pgdata1@snap1 6G - +pgdata1@CURRENT 6G - + +> zenith pg checkout pgdata1@snap1 +Stopping postgres on pgdata1. +Rolling back pgdata1@CURRENT to pgdata1@snap1. 
+Starting postgres on pgdata1. + +> zenith snapshot list +ID SIZE PARENT +oldpg 5G - +pgdata1@snap1 6G - +pgdata1@HEAD{0} 6G - +pgdata1@CURRENT 6G - +``` + +Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state of the database in the data directory. When we are checking out some snapshot CURRENT will be set to this snapshot and the old CURRENT state will be named HEAD{0} (0 is the number of postgres timeline, it would be incremented after each such checkout). + +## Configure PITR area (Point In Time Recovery). + +PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite). + +``` +> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month +``` + +Resetting the database to some state in past would require creating a snapshot on some lsn / time in this pirt area. + +# Manual + +## storage + +Storage is either zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. + +**zenith storage attach** -t [native|s3] -c key=value -n name + +Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'. + + +**zenith storage list** + +Show currently attached storages. For example: + +``` +> zenith storage list +NAME USED TYPE OPTIONS PATH +local 5.1G zenith-local /opt/zenith/store/local +local.compr 20.4G zenith-local comression=on /opt/zenith/store/local.compr +zcloud 60G zenith-remote zenith.tech/stas/mystore +s3tank 80G S3 +``` + +**zenith storage detach** + +**zenith storage show** + + + +## pg + +Manages postgres data directories and can start postgreses with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themself. + +Pg is a term for a single postgres running on some data. I'm trying to avoid here separation of datadir management and postgres instance management -- both that concepts bundled here together. + +**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata + +Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr. + +--no-start: just init datadir without creating + +--snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1) + +--cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of currently running a database) + +**zenith pg destroy** + +**zenith pg start** [--replica] pgdata + +Start postgres with proper extensions preloaded/installed. + +**zenith pg checkout** + +Rollback data directory to some previous snapshot. 
+ +**zenith pg stop** pg_id + +**zenith pg list** + +``` +ROLE PGDATA USED STORAGE ENDPOINT +primary my_pg 5.1G local localhost:5432 +replica-1 localhost:5433 +replica-2 localhost:5434 +primary my_pg2 3.2G local.compr localhost:5435 +- my_pg3 9.2G local.compr - +``` + +**zenith pg show** + +``` +my_pg: + storage: local + space used on local: 5.1G + space used on all storages: 15.1G + snapshots: + on local: + snap1: 1G + snap2: 1G + on zcloud: + snap2: 1G + on s3tank: + snap5: 2G + pitr: + on s3tank: + pitr_one_month: 45G + +``` + +**zenith pg start-rest/graphql** pgdata + +Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea. + + +## snapshot + +Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout. + +**zenith snapshot create** pgdata_name@snap_name + +Creates a new snapshot in the same storage where pgdata_name exists. + +**zenith snapshot push** --to url pgdata_name@snap_name + +Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `zenith snapshot recv` before push happens. If url has some special schema like zenith:// receiving side may require auth start `zenith snapshot recv` on the go. + +**zenith snapshot recv** + +Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket. + +**zenith snapshot pull** --from url or path + +Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format. + +**zenith snapshot import** --from basebackup://<...> or path + +Creates a new snapshot out of running postgres via basebackup protocol or basebackup files. + +**zenith snapshot export** + +Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith own format which is handy for us (but I think just tar of basebackup would be okay). + +**zenith snapshot diff** snap1 snap2 + +Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses. + +**zenith snapshot destroy** + +## pitr + +Pitr represents wal stream and ttl policy for that stream + +XXX: any suggestions on a better name? + +**zenith pitr create** name + +--ttl = inf | period + +--size-limit = inf | limit + +--storage = storage_name + +**zenith pitr extract-snapshot** pitr_name --lsn xxx + +Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export) + +**zenith pitr gc** pitr_name + +Force garbage collection on some PITR area. + +**zenith pitr list** + +**zenith pitr destroy** + + +## console + +**zenith console** + +Opens browser targeted at web console with the more or less same functionality as described here. 
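+
+## Example: from PITR area to a running database
+
+To illustrate how the pitr commands above compose with the snapshot and pg commands, a possible session is sketched below. This is illustrative only: the `--name` argument to `extract-snapshot`, the example lsn value, and the exact output lines are assumptions, not settled syntax.
+
+```
+> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month
+> zenith pitr extract-snapshot pitr_last_month --lsn 0/16B5A50 --name before_deploy  # --name is assumed syntax
+> zenith pg create --snapshot before_deploy
+Started postgres on localhost:5432
+```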
diff --git a/docs/rfcs/004-durability.md b/docs/rfcs/004-durability.md new file mode 100644 index 0000000000..4543be3dae --- /dev/null +++ b/docs/rfcs/004-durability.md @@ -0,0 +1,218 @@ +Durability & Consensus +====================== + +When a transaction commits, a commit record is generated in the WAL. +When do we consider the WAL record as durable, so that we can +acknowledge the commit to the client and be reasonably certain that we +will not lose the transaction? + +Zenith uses a group of WAL safekeeper nodes to hold the generated WAL. +A WAL record is considered durable, when it has been written to a +majority of WAL safekeeper nodes. In this document, I use 5 +safekeepers, because I have five fingers. A WAL record is durable, +when at least 3 safekeepers have written it to disk. + +First, assume that only one primary node can be running at a +time. This can be achieved by Kubernetes or etcd or some +cloud-provider specific facility, or we can implement it +ourselves. These options are discussed in later chapters. For now, +assume that there is a Magic STONITH Fairy that ensures that. + +In addition to the WAL safekeeper nodes, the WAL is archived in +S3. WAL that has been archived to S3 can be removed from the +safekeepers, so the safekeepers don't need a lot of disk space. + + + +----------------+ + +-----> | WAL safekeeper | + | +----------------+ + | +----------------+ + +-----> | WAL safekeeper | ++------------+ | +----------------+ +| Primary | | +----------------+ +| Processing | ---------+-----> | WAL safekeeper | +| Node | | +----------------+ ++------------+ | +----------------+ + \ +-----> | WAL safekeeper | + \ | +----------------+ + \ | +----------------+ + \ +-----> | WAL safekeeper | + \ +----------------+ + \ + \ + \ + \ + \ +--------+ + \ | | + +--> | S3 | + | | + +--------+ + + +Every WAL safekeeper holds a section of WAL, and a VCL value. +The WAL can be divided into three portions: + + + VCL LSN + | | + V V +.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX +Archived WAL Completed WAL In-flight WAL + + +Note that all this WAL kept in a safekeeper is a contiguous section. +This is different from Aurora: In Aurora, there can be holes in the +WAL, and there is a Gossip protocol to fill the holes. That could be +implemented in the future, but let's keep it simple for now. WAL needs +to be written to a safekeeper in order. However, during crash +recovery, In-flight WAL that has already been stored in a safekeeper +can be truncated or overwritten. + +The Archived WAL has already been stored in S3, and can be removed from +the safekeeper. + +The Completed WAL has been written to at least three safekeepers. The +algorithm ensures that it is not lost, when at most two nodes fail at +the same time. + +The In-flight WAL has been persisted in the safekeeper, but if a crash +happens, it may still be overwritten or truncated. + + +The VCL point is determined in the Primary. It is not strictly +necessary to store it in the safekeepers, but it allows some +optimizations and sanity checks and is probably generally useful for +the system as whole. The VCL values stored in the safekeepers can lag +behind the VCL computed by the primary. + + +Primary node Normal operation +----------------------------- + +1. Generate some WAL. + +2. Send the WAL to all the safekeepers that you can reach. + +3. 
As soon as a quorum of safekeepers have acknowledged that they have + received and durably stored the WAL up to that LSN, update local VCL + value in memory, and acknowledge commits to the clients. + +4. Send the new VCL to all the safekeepers that were part of the quorum. + (Optional) + + +Primary Crash recovery +---------------------- + +When a new Primary node starts up, before it can generate any new WAL +it needs to contact a majority of the WAL safekeepers to compute the +VCL. Remember that there is a Magic STONITH fairy that ensures that +only node process can be doing this at a time. + +1. Contact all WAL safekeepers. Find the Max((Epoch, LSN)) tuple among the ones you + can reach. This is the Winner safekeeper, and its LSN becomes the new VCL. + +2. Update the other safekeepers you can reach, by copying all the WAL + from the Winner, starting from each safekeeper's old VCL point. Any old + In-Flight WAL from previous Epoch is truncated away. + +3. Increment Epoch, and send the new Epoch to the quorum of + safekeepers. (This ensures that if any of the safekeepers that we + could not reach later come back online, they will be considered as + older than this in any future recovery) + +You can now start generating new WAL, starting from the newly-computed +VCL. + +Optimizations +------------- + +As described, the Primary node sends all the WAL to all the WAL safekeepers. That +can be a lot of network traffic. Instead of sending the WAL directly from Primary, +some safekeepers can be daisy-chained off other safekeepers, or there can be a +broadcast mechanism among them. There should still be a direct connection from the +each safekeeper to the Primary for the acknowledgments though. + +Similarly, the responsibility for archiving WAL to S3 can be delegated to one of +the safekeepers, to reduce the load on the primary. + + +Magic STONITH fairy +------------------- + +Now that we have a system that works as long as only one primary node is running at a time, how +do we ensure that? + +1. Use etcd to grant a lease on a key. The primary node is only allowed to operate as primary + when it's holding a valid lease. If the primary node dies, the lease expires after a timeout + period, and a new node is allowed to become the primary. + +2. Use S3 to store the lease. S3's consistency guarantees are more lenient, so in theory you + cannot do this safely. In practice, it would probably be OK if you make the lease times and + timeouts long enough. This has the advantage that we don't need to introduce a new + component to the architecture. + +3. Use Raft or Paxos, with the WAL safekeepers acting as the Acceptors to form the quorum. The + next chapter describes this option. + + +Built-in Paxos +-------------- + +The WAL safekeepers act as PAXOS Acceptors, and the Processing nodes +as both Proposers and Learners. + +Each WAL safekeeper holds an Epoch value in addition to the VCL and +the WAL. Each request by the primary to safekeep WAL is accompanied by +an Epoch value. If a safekeeper receives a request with Epoch that +doesn't match its current Accepted Epoch, it must ignore (NACK) it. +(In different Paxos papers, Epochs are called "terms" or "round +numbers") + +When a node wants to become the primary, it generates a new Epoch +value that is higher than any previously observed Epoch value, and +globally unique. + + +Accepted Epoch: 555 VCL LSN + | | + V V +.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX +Archived WAL Completed WAL In-flight WAL + + +Primary node startup: + +1. 
Contact all WAL safekeepers that you can reach (if you cannot + connect to a quorum of them, you can give up immediately). Find the + latest Epoch among them. + +2. Generate a new globally unique Epoch, greater than the latest Epoch + found in previous step. + +2. Send the new Epoch in a Prepare message to a quorum of + safekeepers. (PAXOS Prepare message) + +3. Each safekeeper responds with a Promise. If a safekeeper has + already made a promise with a higher Epoch, it doesn't respond (or + responds with a NACK). After making a promise, the safekeeper stops + responding to any write requests with earlier Epoch. + +4. Once you have received a majority of promises, you know that the + VCL cannot advance on the old Epoch anymore. This effectively kills + any old primary server. + +5. Find the highest written LSN among the quorum of safekeepers (these + can be included in the Promise messages already). This is the new + VCL. If a new node starts the election process after this point, + it will compute the same or higher VCL. + +6. Copy the WAL from the safekeeper with the highest LSN to the other + safekeepers in the quorum, using the new Epoch. (PAXOS Accept + phase) + +7. You can now start generating new WAL starting from the VCL. If + another process starts the election process after this point and + gains control of a majority of the safekeepers, we will no longer + be able to advance the VCL. + diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md new file mode 100644 index 0000000000..7b078e9ec0 --- /dev/null +++ b/docs/rfcs/005-zenith_local.md @@ -0,0 +1,103 @@ +# Zenith local + +Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together. Your comments on both parts are very welcome. + +#### Why do we need it? +- For distribution - this easy to use binary will help us to build adoption among developers. +- For internal use - to test all components together. + +In my understanding, we consider it to be just a mock-up version of zenith-cloud. +> Question: How much should we care about durability and security issues for a local setup? + + +#### Why is it better than a simple local postgres? + +- Easy one-line setup. As simple as `cargo install zenith && zenith start` + +- Quick and cheap creation of compute nodes over the same storage. +> Question: How can we describe a use-case for this feature? + +- Zenith-local can work with S3 directly. + +- Push and pull images (snapshots) to remote S3 to exchange data with other users. + +- Quick and cheap snapshot checkouts to switch back and forth in the database history. +> Question: Do we want it in the very first release? This feature seems quite complicated. + +#### Distribution: + +Ideally, just one binary that incorporates all elements we need. +> Question: Let's discuss pros and cons of having a separate package with modified PostgreSQL. + +#### Components: + +- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responces to show them in a user-friendly way. +CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md +WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli + +- **zenith-console** - WEB UI with same functionality as CLI. +>Note: not for the first release. + +- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. 
+ > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local. + +- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). +> Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server? + +WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src + +- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith. +> Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)? +> Question: Do we use it together with local page store or they are interchangeable? + +WIP code is ??? + +- **zenith-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed. +> Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system. + +WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper + +- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. + + WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node + +#### REST API: + +Service endpoint: `http://localhost:3000` + +Resources: +- /storages - Where data lives: zenith-pageserver or zenith-s3 +- /pgs - Postgres - zenith-computenode +- /snapshots - snapshots **TODO** + +>Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all? + +Methods and their mapping to CLI: + +- /storages - zenith-pageserver or zenith-s3 + +CLI | REST API +------------- | ------------- +storage attach -n name --type [native\s3] --path=[datadir\URL] | PUT -d { "name": "name", "type": "native", "path": "/tmp" } /storages +storage detach -n name | DELETE /storages/:storage_name +storage list | GET /storages +storage show -n name | GET /storages/:storage_name + + +- /pgs - zenith-computenode + +CLI | REST API +------------- | ------------- +pg create -n name --s storage_name | PUT -d { "name": "name", "storage_name": "storage_name" } /pgs +pg destroy -n name | DELETE /pgs/:pg_name +pg start -n name --replica | POST -d {"action": "start", "is_replica":"replica"} /pgs/:pg_name /actions +pg stop -n name | POST -d {"action": "stop"} /pgs/:pg_name /actions +pg promote -n name | POST -d {"action": "promote"} /pgs/:pg_name /actions +pg list | GET /pgs +pg show -n name | GET /pgs/:pg_name + +- /snapshots **TODO** + +CLI | REST API +------------- | ------------- + diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md new file mode 100644 index 0000000000..a04536922a --- /dev/null +++ b/docs/rfcs/006-laptop-cli-v2-CLI.md @@ -0,0 +1,64 @@ +Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". 
+ +# CLI v2 (after chatting with Carl) + +Zenith introduces the notion of a repository. + +```bash +zenith init +zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory +``` + +Once you have a cluster catalog you can explore it + +```bash +zenith log -- returns a list of commits +zenith status -- returns if there are changes in the catalog that can be committed +zenith commit -- commits the changes and generates a new commit hash +zenith branch experimental -- creates a branch called testdb based on a given commit hash +``` + +To make changes in the catalog you need to run compute nodes + +```bash +-- here is how you a compute node +zenith start /home/pipedpiper/northwind:main -- starts a compute instance +zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud +-- you can start a compute node against any hash or branch +zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start anothe compute instance (on different port) +-- you can start a compute node against any hash or branch +zenith start /home/pipedpiper/northwind: --port 8009 -- start anothe compute instance (on different port) + +-- After running some DML you can run +-- zenith status and see how there are two WAL streams one on top of +-- the main branch +zenith status +-- and another on top of the experimental branch +zenith status -b experimental + +-- you can commit each branch separately +zenith commit main +-- or +zenith commit -c /home/pipedpiper/northwind:experimental +``` + +Starting compute instances against cloud environments + +```bash +-- you can start a compute instance against the cloud environment +-- in this case all of the changes will be streamed into the cloud +zenith start https://zenith:tech/pipedpiper/northwind:main +zenith start https://zenith:tech/pipedpiper/northwind:main +zenith status -c https://zenith:tech/pipedpiper/northwind:main +zenith commit -c https://zenith:tech/pipedpiper/northwind:main +zenith branch -c https://zenith:tech/pipedpiper/northwind: experimental +``` + +Pushing data into the cloud + +```bash +-- pull all the commits from the cloud +zenith pull +-- push all the commits to the cloud +zenith push +``` diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md b/docs/rfcs/006-laptop-cli-v2-repository-structure.md new file mode 100644 index 0000000000..ee4e432182 --- /dev/null +++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md @@ -0,0 +1,140 @@ +# Repository format + +A Zenith repository is similar to a traditional PostgreSQL backup +archive, like a WAL-G bucket or pgbarman backup catalogue. It holds +multiple versions of a PostgreSQL database cluster. + +The distinguishing feature is that you can launch a Zenith Postgres +server directly against a branch in the repository, without having to +"restore" it first. Also, Zenith manages the storage automatically, +there is no separation between full and incremental backups nor WAL +archive. Zenith relies heavily on the WAL, and uses concepts similar +to incremental backups and WAL archiving internally, but it is hidden +from the user. + +## Directory structure, version 1 + +This first version is pretty straightforward but not very +efficient. Just something to get us started. 
+ +The repository directory looks like this: + + .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ + .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// + .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history + + .zenith/refs/branches/mybranch + .zenith/refs/tags/foo + .zenith/refs/tags/bar + + .zenith/datadirs/ + +### Timelines + +A timeline is similar to PostgeSQL's timeline, but is identified by a +UUID instead of a 32-bit timeline Id. For user convenience, it can be +given a name that refers to the UUID (called a branch). + +All WAL is generated on a timeline. You can launch a read-only node +against a tag or arbitrary LSN on a timeline, but in order to write, +you need to create a timeline. + +Each timeline is stored in a directory under .zenith/timelines. It +consists of a WAL archive, containing all the WAL in the standard +PostgreSQL format, under the wal/ subdirectory. + +The 'snapshots/' subdirectory, contains "base backups" of the data +directory at a different LSNs. Each snapshot is simply a copy of the +Postgres data directory. + +When a new timeline is forked from a previous timeline, the ancestor +timeline's UUID is stored in the 'history' file. + +### Refs + +There are two kinds of named objects in the repository: branches and +tags. A branch is a human-friendly name for a timeline UUID, and a +tag is a human-friendly name for a specific LSN on a timeline +(timeline UUID + LSN). Like in git, these are just for user +convenience; you can also use timeline UUIDs and LSNs directly. + +Refs do have one additional purpose though: naming a timeline or LSN +prevents it from being automatically garbage collected. + +The refs directory contains a small text file for each tag/branch. It +contains the UUID of the timeline (and LSN, for tags). + +### Datadirs + +.zenith/datadirs contains PostgreSQL data directories. You can launch +a Postgres instance on one of them with: + +``` + postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c +``` + +All the actual data is kept in the timeline directories, under +.zenith/timelines. The data directories are only needed for active +PostgreQSL instances. After an instance is stopped, the data directory +can be safely removed. "zenith start" will recreate it quickly from +the data in .zenith/timelines, if it's missing. + +## Version 2 + +The format described above isn't very different from a traditional +daily base backup + WAL archive configuration. The main difference is +the nicer naming of branches and tags. + +That's not very efficient. For performance, we need something like +incremental backups that don't require making a full copy of all +data. So only store modified files or pages. And instead of having to +replay all WAL from the last snapshot, "slice" the WAL into +per-relation WAL files and only recover what's needed when a table is +accessed. + +In version 2, the file format in the "snapshots" subdirectory gets +more advanced. The exact format is TODO. But it should support: +- storing WAL records of individual relations/pages +- storing a delta from an older snapshot +- compression + + +## Operations + +### Garbage collection + +When you run "zenith gc", old timelines that are no longer needed are +removed. That involves collecting the list of "unreachable" objects, +starting from the named branches and tags. + +Also, if enough WAL has been generated on a timeline since last +snapshot, a new snapshot or delta is created. 
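+
+To make the garbage collection pass a bit more concrete, here is a rough sketch of
+the reachability walk. This is only an illustration under the assumptions of this
+document (a ref is a small text file whose first token is a timeline UUID, and each
+timeline records its ancestor's UUID in its 'history' file); the function name and
+error handling are made up, not existing code:
+
+```rust
+use std::collections::HashSet;
+use std::fs;
+use std::path::Path;
+
+use anyhow::Result;
+
+/// Collect the timeline UUIDs reachable from named refs (branches and tags),
+/// following ancestor links through the 'history' files. Timelines that are
+/// not in the returned set are candidates for removal.
+fn reachable_timelines(zenith_dir: &Path) -> Result<HashSet<String>> {
+    let mut reachable = HashSet::new();
+    for kind in ["branches", "tags"] {
+        for entry in fs::read_dir(zenith_dir.join("refs").join(kind))? {
+            // A ref file contains a timeline UUID (and an LSN, for tags).
+            let content = fs::read_to_string(entry?.path())?;
+            let mut timeline = content.split_whitespace().next().unwrap_or("").to_owned();
+            // Walk the ancestry chain until we reach a timeline we have
+            // already visited, or one without a 'history' file.
+            while !timeline.is_empty() && reachable.insert(timeline.clone()) {
+                let history = zenith_dir.join("timelines").join(&timeline).join("history");
+                timeline = fs::read_to_string(history).unwrap_or_default().trim().to_owned();
+            }
+        }
+    }
+    Ok(reachable)
+}
+```
+
+The LSNs recorded in tags would additionally bound how much WAL inside a reachable
+timeline can be trimmed, but that is orthogonal to the reachability walk itself.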
+ +### zenith push/pull + +Compare the tags and branches on both servers, and copy missing ones. +For each branch, compare the timeline it points to in both servers. If +one is behind the other, copy the missing parts. + +FIXME: how do you prevent confusion if you have to clones of the same +repository, launch an instance on the same branch in both clones, and +later try to push/pull between them? Perhaps create a new timeline +every time you start up an instance? Then you would detect that the +timelines have diverged. That would match with the "epoch" concept +that we have in the WAL safekeepr + +### zenith checkout/commit + +In this format, there is no concept of a "working tree", and hence no +concept of checking out or committing. All modifications are done on +a branch or a timeline. As soon as you launch a server, the changes are +appended to the timeline. + +You can easily fork off a temporary timeline to emulate a "working tree". +You can later remove it and have it garbage collected, or to "commit", +re-point the branch to the new timeline. + +If we want to have a worktree and "zenith checkout/commit" concept, we can +emulate that with a temporary timeline. Create the temporary timeline at +"zenith checkout", and have "zenith commit" modify the branch to point to +the new timeline. diff --git a/docs/rfcs/007-serverless-on-laptop.md b/docs/rfcs/007-serverless-on-laptop.md new file mode 100644 index 0000000000..e6355f4a03 --- /dev/null +++ b/docs/rfcs/007-serverless-on-laptop.md @@ -0,0 +1,93 @@ +How it works now +---------------- + +1. Create repository, start page server on it + +``` +$ zenith init +... +created main branch +new zenith repository was created in .zenith + +$ zenith pageserver start +Starting pageserver at '127.0.0.1:64000' in .zenith +Page server started +``` + +2. Create a branch, and start a Postgres instance on it + +``` +$ zenith branch heikki main +branching at end of WAL: 0/15ECF68 + +$ zenith pg create heikki +Initializing Postgres on timeline 76cf9279915be7797095241638e64644... +Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432 + +$ zenith pg start pg1 +Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki' +waiting for server to start.... done +server started +``` + + +3. Connect to it and run queries + +``` +$ psql "dbname=postgres port=55432" +psql (14devel) +Type "help" for help. + +postgres=# +``` + + +Proposal: Serverless on your Laptop +----------------------------------- + +We've been talking about doing the "pg create" step automatically at +"pg start", to eliminate that step. What if we go further, go +serverless on your laptop, so that the workflow becomes just: + +1. Create repository, start page server on it (same as before) + +``` +$ zenith init +... +created main branch +new zenith repository was created in .zenith + +$ zenith pageserver start +Starting pageserver at '127.0.0.1:64000' in .zenith +Page server started +``` + +2. Create branch + +``` +$ zenith branch heikki main +branching at end of WAL: 0/15ECF68 +``` + +3. Connect to it: + +``` +$ psql "dbname=postgres port=5432 branch=heikki" +psql (14devel) +Type "help" for help. + +postgres=# +``` + + +The trick behind the scenes is that when you launch the page server, +it starts to listen on port 5432. When you connect to it with psql, it +looks at the 'branch' parameter that you passed in the connection +string. 
It automatically performs the "pg create" and "pg start" steps +for that branch, and then forwards the connection to the Postgres +instance that it launched. After you disconnect, if there are no more +active connections to the server running on the branch, it can +automatically shut it down again. + +This is how serverless would work in the cloud. We can do it on your +laptop, too. diff --git a/docs/rfcs/008-push-pull.md b/docs/rfcs/008-push-pull.md new file mode 100644 index 0000000000..272628e1ce --- /dev/null +++ b/docs/rfcs/008-push-pull.md @@ -0,0 +1,66 @@ +# Push and pull between pageservers + +Here is a proposal about implementing push/pull mechanics between pageservers. We also want to be able to push/pull to S3 but that would depend on the exact storage format so we don't touch that in this proposal. + +## Origin management + +The origin represents connection info for some remote pageserver. Let's use here same commands as git uses except using explicit list subcommand (git uses `origin -v` for that). + +``` +zenith origin add +zenith origin list +zenith origin remove +``` + +Connection URI a string of form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs or require ssh as transport or invent some other kind of transport. + +Behind the scenes, this commands may update toml file inside .zenith directory. + +## Push + +### Pushing branch + +``` +zenith push mybranch cloudserver # push to eponymous branch in cloudserver +zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver +``` + +Exact mechanics would be slightly different in the following situations: + +1) Destination branch does not exist. + + That is the simplest scenario. We can just create an empty branch (or timeline in internal terminology) and transfer all the pages/records that we have in our timeline. Right now each timeline is quite independent of other timelines so I suggest skipping any checks that there is a common ancestor and just fill it with data. Later when CoW timelines will land to the pageserver we may add that check and decide whether this timeline belongs to this pageserver repository or not [*]. + + The exact mechanics may be the following: + + * CLI asks local pageserver to perform push and hands over connection uri: `perform_push `. + * local pageserver connects to the remote pageserver and runs `branch_push ` + Handler for branch_create would create destination timeline and switch connection to copyboth mode. + * Sending pageserver may start iterator on that timeline and send all the records as copy messages. + +2) Destination branch exists and latest_valid_lsn is less than ours. + + In this case, we need to send missing records. To do that we need to find all pages that were changed since that remote LSN. Right now we don't have any tracking mechanism for that, so let's just iterate over all records and send ones that are newer than remote LSN. Later we probably should add a sparse bitmap that would track changed pages to avoid full scan. + +3) Destination branch exists and latest_valid_lsn is bigger than ours. + + In this case, we can't push to that branch. We can only pull. + +### Pulling branch + +Here we need to handle the same three cases, but also keep in mind that local pageserver can be behind NAT and we can't trivially re-use pushing by asking remote to 'perform_push' to our address. 
So we would need a new set of commands:
+
+* CLI calls `perform_pull ` on the local pageserver.
+* local pageserver calls `branch_pull ` on the remote pageserver.
+* remote pageserver sends records in our direction
+
+But despite the different set of commands, the code that iterates over records and the receiving code that inserts those records can be the same for both pull and push.
+
+
+
+[*] It looks to me that there are two different possible approaches to handling unrelated timelines:
+
+1) Allow storing unrelated timelines in one repo. Some timelines may have parents and some may not.
+2) Transparently create and manage several repositories in one pageserver.
+
+But that is the topic for a separate RFC/discussion.
diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md
new file mode 100644
index 0000000000..3f5386c165
--- /dev/null
+++ b/docs/rfcs/009-snapshot-first-storage-cli.md
@@ -0,0 +1,56 @@
+While working on export/import commands, I understood that they fit really well into "snapshot-first design".
+
+We may think about backups as snapshots in a different format (i.e. plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use the same storage API, the only difference is the code that packs/unpacks files.
+
+Even if zenith aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to zenith.
+
+So here is an attempt to design a consistent CLI for different usage scenarios:
+
+#### 1. Start empty pageserver.
+That is what we have now.
+Init an empty pageserver using `initdb` in a temporary directory.
+
+`--storage_dest=FILE_PREFIX | S3_PREFIX |...` option defines the object storage type, all other parameters are passed via env variables. Inspired by WAL-G style naming: https://wal-g.readthedocs.io/STORAGES/.
+
+Save `storage_dest` and other parameters in the config.
+Push snapshots to `storage_dest` in the background.
+
+```
+zenith init --storage_dest=S3_PREFIX
+zenith start
+```
+
+#### 2. Restart pageserver (manually or crash-recovery).
+Take `storage_dest` from the pageserver config, start the pageserver from the latest snapshot in `storage_dest`.
+Push snapshots to `storage_dest` in the background.
+
+```
+zenith start
+```
+
+#### 3. Import.
+Start the pageserver from an existing snapshot.
+The path to the snapshot is provided via `--snapshot_path=FILE_PREFIX | S3_PREFIX | ...`
+Do not save `snapshot_path` and `snapshot_format` in the config, as it is a one-time operation.
+Save `storage_dest` parameters in the config.
+Push snapshots to `storage_dest` in the background.
+```
+//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage.
+zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX
+zenith start
+```
+How to pass credentials needed for `snapshot_path`?
+
+#### 4. Export.
+Manually push a snapshot to `snapshot_path`, which differs from `storage_dest`.
+Optionally set `snapshot_format`, which can be plain pgdata format or zenith format.
+```
+zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
+```
+
+#### Notes and questions
+- walkeeper s3_offload should use the same (or similar) syntax for storage. How to set it in the UI?
+- Why do we need `zenith init` as a separate command? Can't we init everything at first start?
+- We can think of better names for all options.
+- Export to plain postgres format will be useless if we are not 100% compatible on the page level.
+I can recall at least one such difference - PD_WAL_LOGGED flag in pages. \ No newline at end of file diff --git a/docs/rfcs/009-snapshot-first-storage-pitr.md b/docs/rfcs/009-snapshot-first-storage-pitr.md new file mode 100644 index 0000000000..801613e2c9 --- /dev/null +++ b/docs/rfcs/009-snapshot-first-storage-pitr.md @@ -0,0 +1,227 @@ +# Preface + +GetPage@LSN can be called with older LSNs, and the page server needs +to be able to reconstruct older page versions. That's needed for +having read-only replicas that lag behind the primary, or that are +"anchored" at an older LSN, and internally in the page server whne you +branch at an older point in time. How do you do that? + +For now, I'm not considering incremental snapshots at all. I don't +think that changes things. So whenever you create a snapshot or a +snapshot file, it contains an image of all the pages, there is no need +to look at an older snapshot file. + +Also, I'm imagining that this works on a per-relation basis, so that +each snapshot file contains data for one relation. A "relation" is a +fuzzy concept - it could actually be one 1 GB relation segment. Or it +could include all the different "forks" of a relation, or you could +treat each fork as a separate relation for storage purpose. And once +we have the "non-relational" work is finished, a "relation" could +actually mean some other versioned object kept in the PostgreSQL data +directory. Let's ignore that for now. + +# Eric's RFC: + +Every now and then, you create a "snapshot". It means that you create +a new snapshot file for each relation that was modified after the last +snapshot, and write out the contents the relation as it is/was at the +snapshot LSN. Write-ahead log is stored separately in S3 by the WAL +safekeeping service, in the original PostgreSQL WAL file format. + + SNAPSHOT @100 WAL + . | + . | + . | + . | + SNAPSHOT @200 | + . | + . | + . | + . | + SNAPSHOT @300 | + . | + . V + IN-MEMORY @400 + +If a GetPage@LSN request comes from the primary, you return the latest +page from the in-memory layer. If there is no trace of the page in +memory, it means that it hasn't been modified since the last snapshot, +so you return the page from the latest snapshot, at LSN 300 in the +above example. + +PITR is implemented using the original WAL files: + +If a GetPage@LSN request comes from a read replica with LSN 250, you +read the image of the page from the snapshot at LSN 200, and you also +scan the WAL between 200 and 250, and apply all WAL records for the +requested page, to reconstruct it at LSN 250. + +Scanning the WAL naively for every GetPage@LSN request would be +expensive, so in practice you'd construct an in-memory data structure +of all the WAL between 200 and 250 once that allows quickly looking up +records for a given page. + +## Problems/questions + +I think you'll need to store the list of snapshot LSNs on each +timeline somewhere. + +If the latest snapshot of a relation is at LSN 100, and you request a +page at LSN 1000000, how do you know if there are some modifications +to it between 100 and 1000000 that you need to replay? You can scan +all the WAL between 100 and 1000000, but that would be expensive. + +You can skip that, if you know that a snapshot was taken e.g. at LSN +999900. Then you know that the fact that there is no snapshot file at +999900 means that the relation hasn't been modified between +100-999900. Then you only need to scan the WAL between 999900 and +1000000. 
However, there is no trace of a snapshot happening at LSN +999900 in the snapshot file for this relation, so you need to get +that information from somewhere else. + +Where do you get that information from? Perhaps you can scan all the +other relations, and if you see a snapshot file for *any* relation at +LSN 999900, you know that if there were modifications to this +relation, there would be a newer snapshot file for it, too. In other +words, the list of snapshots that have been taken can be constructed +by scanning all relations and computing the union of all snapshot LSNs +that you see for any relation. But that's expensive so at least you +should keep that in memory, after computing it once. Also, if you rely +on that, it's not possible to have snapshots at different intervals +for different files. That seems limiting. + +Another option is to explicitly store a list of snapshot LSNs in a +separate metadata file. + + +# Current implementation in the 'layered_repo' branch: + +We store snapshot files like in the RFC, but each snapshot file also +contains all the WAL in the range of LSNs, so that you don't need to +fetch the WAL separately from S3. So you have "layers" like this: + + SNAPSHOT+WAL 100-200 + | + | + | + | + SNAPSHOT+WAL 200-300 + | + | + | + | + IN-MEMORY 300- + +Each "snapshot+WAL" is a file that contains a snapshot - i.e. full +copy of each page in the relation, at the *start* LSN. In addition to +that, it contains all the WAL applicable to the relation from the +start LSN to the end LSN. With that, you can reconstruct any page +version in the range that the file covers. + + +## Problems/questions + +I can see one potential performance issue here, compared to the RFC. +Let's focus on a single relation for now. Imagine that you start from +an empty relation, and you receive WAL from 100 to 200, containing +a bunch of inserts and updates to the relation. You now have all that +WAL in memory: + + memory: WAL from 100-200 + +We decide that it's time to materialize that to a snapshot file on +disk. We materialize full image of the relation as it was at LSN 100 +to the snapshot file, and include all of the WAL. Since the relation +was initially empty, the "image" at the beginning of th range is empty +too. + +So now you have one file on on disk: + + SNAPSHOT+WAL 100-200 + +It contains a full image of the relation at LSN 100 and all WAL +between 100-200. (It's actually stored as a serialized BTreeMap of +page versions, with the page images and WAL records all stored +together in the same BtreeMap. But for this story, that's not +important.) + +We now receive more WAL updating the relation, up to LSN 300. We +decide it's time to materialize a new snapshot file, and we now have +two files: + + SNAPSHOT+WAL 100-200 + SNAPSHOT+WAL 200-300 + +Note that the latest "full snapshot" that we store on disk always lags +behind by one snapshot cycle. The first file contains a full image of +the relation at LSN 100, the second at LSN 200. When we have received +WAL up to LSN 300, we write a materialized image at LSN 200. That +seems a bit silly. In the design per your RFC, you would write a +snapshots at LSNs 200 and 300, instead. That seems better. + + + +# Third option (not implemented yet) + +Store snapshot files like in the RFC, but also store per-relation +WAL files that contain WAL in a range of LSNs for that relation. + + SNAPSHOT @100 WAL 100-200 + . | + . | + . | + . | + SNAPSHOT @200 WAL 200-300 + . | + . | + . | + . | + SNAPSHOT @300 + . + . 
+ IN-MEMORY 300- + + +This could be the best of both worlds. The snapshot files would be +independent of the PostgreSQL WAL format. When it's time to write +snapshot file @300, you write a full image of the relation at LSN 300, +and you write the WAL that you had accumulated between 200 and 300 to +a separate file. That way, you don't "lag behind" for one snapshot +cycle like in the current implementation. But you still have the WAL +for a particular relation readily available alongside the snapshot +files, and you don't need to track what snapshot LSNs exist +separately. + +(If we wanted to minize the number of files, you could include the +snapshot @300 and the WAL between 200 and 300 in the same file, but I +feel it's probably better to keep them separate) + + + +# Further thoughts + +There's no fundamental reason why the LSNs of the snapshot files and the +ranges of the WAL files would need to line up. So this would be possible +too: + + SNAPSHOT @100 WAL 100-150 + . | + . | + . WAL 150-250 + . | + SNAPSHOT @200 | + . | + . WAL 250-400 + . | + . | + SNAPSHOT @300 | + . | + . | + IN-MEMORY 300- + +I'm not sure what the benefit of this would be. You could materialize +additional snapshot files in the middle of a range covered by a WAL +file, maybe? Might be useful to speed up access when you create a new +branch in the middle of an LSN range or if there's some other reason +to believe that a particular LSN is "interesting" and there will be +a lot of requests using it. diff --git a/docs/rfcs/009-snapshot-first-storage.md b/docs/rfcs/009-snapshot-first-storage.md new file mode 100644 index 0000000000..aeef54898a --- /dev/null +++ b/docs/rfcs/009-snapshot-first-storage.md @@ -0,0 +1,148 @@ +# Snapshot-first storage architecture + +Goals: +- Long-term storage of database pages. +- Easy snapshots; simple snapshot and branch management. +- Allow cloud-based snapshot/branch management. +- Allow cloud-centric branching; decouple branch state from running pageserver. +- Allow customer ownership of data via s3 permissions. +- Provide same or better performance for typical workloads, vs plain postgres. + +Non-goals: +- Service database reads from s3 (reads should be serviced from the pageserver cache). +- Keep every version of every page / Implement point-in-time recovery (possibly a future paid feature, based on WAL replay from an existing snapshot). + +## Principle of operation + +The database “lives in s3”. This means that all of the long term page storage is in s3, and the “live database”-- the version that lives in the pageserver-- is a set of “dirty pages” that haven’t yet been written back to s3. + +In practice, this is mostly similar to storing frequent snapshots to s3 of a database that lives primarily elsewhere. + +The main difference is that s3 is authoritative about which branches exist; pageservers consume branches, snapshots, and related metadata by reading them from s3. This allows cloud-based management of branches and snapshots, regardless of whether a pageserver is running or not. + +It’s expected that a pageserver should keep a copy of all pages, to shield users from s3 latency. A cheap/slow pageserver that falls back to s3 for some reads would be possible, but doesn’t seem very useful right now. 
+ +Because s3 keeps all history, and the safekeeper(s) preserve any WAL records needed to reconstruct the most recent changes, the pageserver can store dirty pages in RAM or using non-durable local storage; this should allow very good write performance, since there is no need for fsync or journaling. + +Objects in s3 are immutable snapshots, never to be modified once written (only deleted). + +Objects in s3 are files, each containing a set of pages for some branch/relation/segment as of a specific time (LSN). A snapshot could be complete (meaning it has a copy of every page), or it could be incremental (containing only the pages that were modified since the previous snapshot). It’s expected that most snapshots are incremental to keep storage costs low. + +It’s expected that the pageserver would upload new snapshot objects frequently, e.g. somewhere between 30 seconds and 15 minutes, depending on cost/performance balance. + +No-longer needed snapshots can be “squashed”-- meaning snapshot N and snapshot N+1 can be read by some cloud agent software, which writes out a new object containing the combined set of pages (keeping only the newest version of each page) and then deletes the original snapshots. + +A pageserver only needs to store the set of pages needed to satisfy operations in flight: if a snapshot is still being written, the pageserver needs to hold historical pages so that snapshot captures a consistent moment in time (similar to what is needed to satisfy a slow replica). + +WAL records can be discarded once a snapshot has been stored to s3. (Unless we want to keep them longer as part of a point-in-time recovery feature.) + +## Pageserver operation + +To start a pageserver from a stored snapshot, the pageserver downloads a set of snapshots sufficient to start handling requests. We assume this includes the latest copy of every page, though it might be possible to start handling requests early, and retrieve pages for the first time only when needed. + +To halt a pageserver, one final snapshot should be written containing all pending WAL updates; then the pageserver and safekeepers can shut down. + +It’s assumed there is some cloud management service that ensures only one pageserver is active and servicing writes to a given branch. + +The pageserver needs to be able to track whether a given page has been modified since the last snapshot, and should be able to produce the set of dirty pages efficiently to create a new snapshot. + +The pageserver need only store pages that are “reachable” from a particular LSN. For example, a page may be written four times, at LSN 100, 200, 300, and 400. If no snapshot is being created when LSN 200 is written, the page at LSN 100 can be discarded. If a snapshot is triggered when the pageserver is at LSN 299, the pageserver must preserve the page from LSN 200 until that snapshot is complete. As before, the page at LSN 300 can be discarded when the LSN 400 pages is written (regardless of whether the LSN 200 snapshot has completed.) + +If the pageserver is servicing multiple branches, those branches may contain common history. While it would be possible to serve branches with zero knowledge of their common history, a pageserver could save a lot of space using an awareness of branch history to share the common set of pages. Computing the “liveness” of a historical page may be tricky in the face of multiple branches. 
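+
+Stated as code, the retention rule from the previous paragraph could look roughly
+like the following sketch (the function and parameter names are invented for
+illustration, not taken from any existing implementation):
+
+```rust
+/// Decide whether a page version may still be needed.
+///
+/// `pending_snapshot_lsns` holds the LSNs of snapshots that are still being
+/// written; a version that is the newest one at or below such an LSN must be
+/// kept until that snapshot finishes uploading.
+fn is_version_needed(
+    version_lsn: u64,              // LSN at which this page version was written
+    next_version_lsn: Option<u64>, // next newer version of the same page, if any
+    pending_snapshot_lsns: &[u64],
+) -> bool {
+    match next_version_lsn {
+        // The newest version of a page is always needed to serve reads.
+        None => true,
+        // An older version is needed only if some in-progress snapshot "sees"
+        // it, i.e. the snapshot LSN falls between this version and the next.
+        Some(next) => pending_snapshot_lsns
+            .iter()
+            .any(|&snap| version_lsn <= snap && snap < next),
+    }
+}
+```
+
+In the example above, the LSN 100 version is dropped as soon as LSN 200 is written
+(no snapshot sees it), while the LSN 200 version survives until the snapshot
+triggered at LSN 299 completes and its LSN is removed from the pending set.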
+ +The pageserver may store dirty pages to memory or to local block storage; any local block storage format is only temporary “overflow” storage, and is not expected to be readable by future software versions. + +The pageserver may store clean pages (those that are captured in a snapshot) any way it likes: in memory, in a local filesystem (possibly keeping a local copy of the snapshot file), or using some custom storage format. Reading pages from s3 would be functional, but is expected to be prohibitively slow. + +The mechanism for recovery after a pageserver failure is WAL redo. If we find that too slow in some situations (e.g. write-heavy workload causes long startup), we can write more frequent snapshots to keep the number of outstanding WAL records low. If that’s still not good enough, we could look at other options (e.g. redundant pageserver or an EBS page journal). + +A read-only pageserver is possible; such a pageserver could be a read-only cache of a specific snapshot, or could auto-update to the latest snapshot on some branch. Either way, no safekeeper is required. Multiple read-only pageservers could exist for a single branch or snapshot. + +## Cloud snapshot manager operation + +Cloud software may wish to do the following operations (commanded by a user, or based on some pre-programmed policy or other cloud agent): +Create/delete/clone/rename a database +Create a new branch (possibly from a historical snapshot) +Start/stop the pageserver/safekeeper on a branch +List databases/branches/snapshots that are visible to this user account + +Some metadata operations (e.g. list branches/snapshots of a particular db) could be performed by scanning the contents of a bucket and inspecting the file headers of each snapshot object. This might not be fast enough; it might be necessary to build a metadata service that can respond more quickly to some queries. + +This is especially true if there are public databases: there may be many thousands of buckets that are public, and scanning all of them is not a practical strategy for answering metadata queries. + +## Snapshot names, deletion and concurrency + +There may be race conditions between operations-- in particular, a “squash” operation may replace two snapshot objects (A, B) with some combined object (C). Since C is logically equivalent to B, anything that attempts to access B should be able to seamlessly switch over to C. It’s assumed that concurrent delete won’t disrupt a read in flight, but it may be possible for some process to read B’s header, and then discover on the next operation that B is gone. + +For this reason, any attempted read should attempt a fallback procedure (list objects; search list for an equivalent object) if an attempted read fails. This requires a predictable naming scheme, e.g. `XXXX_YYYY_ZZZZ_DDDD`, where `XXXX` is the branch unique id, and `YYYY` and `ZZZZ` are the starting/ending LSN values. `DDDD` is a timestamp indicating when the object was created; this is used to disambiguate a series of empty snapshots, or to help a snapshot policy engine understand which snapshots should be kept or discarded. + +## Branching + +A user may request a new branch from the cloud user interface. There is a sequence of things that needs to happen: +- If the branch is supposed to be based on the latest contents, the pageserver should perform an immediate snapshot. This is the parent snapshot for the new branch. 
+- Cloud software should create the new branch, by generating a new (random) unique branch identifier, and creating a placeholder snapshot object. + - The placeholder object is an empty snapshot containing only metadata (which anchors it to the right parent history) and no pages. + - The placeholder can be discarded when the first snapshot (containing data) is completed. Discarding is equivalent to squashing, when the snapshot contains no data. +- If the branch needs to be started immediately, a pageserver should be notified that it needs to start servicing the branch. This may not be the same pageserver that services the parent branch, though the common history may make it the best choice. + +Some of these steps could be combined into the pageserver, but that process would not be possible under all cases (e.g. if no pageserver is currently running, or if the branch is based on an older snapshot, or if a different pageserver will be serving the new branch). Regardless of which software drives the process, the result should look the same. + +## Long-term file format + +Snapshot files (and any other object stored in s3) must be readable by future software versions. + +It should be possible to build multiple tools (in addition to the pageserver) that can read and write this file format-- for example, to allow cloud snapshot management. + +Files should contain the following metadata, in addition to the set of pages: +- The version of the file format. +- A unique identifier for this branch (should be worldwide-unique and unchanging). +- Optionally, any human-readable names assigned to this branch (for management UI/debugging/logging). +- For incremental snapshots, the identifier of the predecessor snapshot. For new branches, this will be the parent snapshot (the point at which history diverges). +- The location of the predecessor branch snapshot, if different from this branch’s location. +- The LSN range `(parent, latest]` for this snapshot. For complete snapshots, the parent LSN can be 0. +- The UTC timestamp of the snapshot creation (which may be different from the time of its highest LSN, if the database is idle). +- A SHA2 checksum over the entire file (excluding the checksum itself), to preserve file integrity. + +A file may contain no pages, and an empty LSN range (probably `(latest, latest]`?), which serves as a placeholder for either a newly-created branch, or a snapshot of an idle database. + +Any human-readable names stored in the file may fall out of date if database/branch renames are allowed; there may need to be a cloud metadata service to query (current name -> unique identifier). We may choose instead to not store human-readable names in the database, or treat them as debugging information only. + +## S3 semantics, and other kinds of storage + +For development and testing, it may be easier to use other kinds of storage in place of s3. For example, a directory full of files can substitute for an s3 bucket with multiple objects. This mode is expected to match the s3 semantics (e.g. don’t edit existing files or use symlinks). Unit tests may omit files entirely and use an in-memory mock bucket. + +Some users may want to use a local or network filesystem in place of s3. This isn’t prohibited but it’s not a priority, either. + +Alternate implementations of s3 should be supported, including Google Cloud Storage. + +Azure Blob Storage should be supported. We assume (without evidence) that it’s semantically equivalent to s3 for this purpose. 
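+
+Since S3, alternate S3 implementations, Azure Blob Storage, and a plain directory
+used for testing all have to satisfy only the handful of operations listed in the
+next paragraph, one way to keep them interchangeable is a narrow storage interface.
+The Rust sketch below is purely illustrative; the trait and method names are
+invented, and streaming is simplified to byte buffers to keep it short:
+
+```rust
+use anyhow::Result;
+
+/// Minimal object-store interface that S3, GCS, Azure Blob Storage, or a local
+/// directory (for tests) could implement. Illustrative sketch only.
+trait SnapshotStore {
+    /// List object names under a prefix (e.g. all snapshots of one branch).
+    fn list(&self, prefix: &str) -> Result<Vec<String>>;
+    /// Read a whole object.
+    fn get(&self, name: &str) -> Result<Vec<u8>>;
+    /// Read a byte range from an object (e.g. just the metadata header).
+    fn get_range(&self, name: &str, offset: u64, len: u64) -> Result<Vec<u8>>;
+    /// Write a new, immutable object; existing objects are never modified.
+    fn put(&self, name: &str, data: &[u8]) -> Result<()>;
+    /// Delete an object. An already-started read must not be disrupted, so a
+    /// reader retries via `list` if a name disappears underneath it.
+    fn delete(&self, name: &str) -> Result<()>;
+}
+```
+
+A real implementation would expose streaming readers and writers for large snapshot
+objects (with multipart upload on the write path), but the required surface area
+stays this small.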
+ +The properties of s3 that we depend on are: +list objects +streaming read of entire object +read byte range from object +streaming write new object (may use multipart upload for better relialibity) +delete object (that should not disrupt an already-started read). + +Uploaded files, restored backups, or s3 buckets controlled by users could contain malicious content. We should always validate that objects contain the content they’re supposed to. Incorrect, Corrupt or malicious-looking contents should cause software (cloud tools, pageserver) to fail gracefully. + +## Notes + +Possible simplifications, for a first draft implementation: +- Assume that dirty pages fit in pageserver RAM. Can use kernel virtual memory to page out to disk if needed. Can improve this later. +- Don’t worry about the details of the squashing process yet. +- Don’t implement cloud metadata service; try to make everything work using basic s3 list-objects and reads. +- Don’t implement rename, delete at first. +- Don’t implement public/private, just use s3 permissions. +- Don’t worry about sharing history yet-- each user has their own bucket and a full copy of all data. +- Don’t worry about history that spans multiple buckets. +- Don’t worry about s3 regions. +- Don’t support user-writeable s3 buckets; users get only read-only access at most. + +Open questions: +- How important is point-in-time recovery? When should we add this? How should it work? +- Should snapshot files use compression? +- Should we use snapshots for async replication? A spare pageserver could stay mostly warmed up by consuming snapshots as they’re created. +- Should manual snapshots, or snapshots triggered by branch creation, be named differently from snapshots that are triggered by a snapshot policy? +- When a new branch is created, should it always be served by the same pageserver that owns its parent branch? When should we start a new pageserver? +- How can pageserver software upgrade be done with minimal downtime? diff --git a/docs/rfcs/010-storage_details.md b/docs/rfcs/010-storage_details.md new file mode 100644 index 0000000000..8429a2d9e3 --- /dev/null +++ b/docs/rfcs/010-storage_details.md @@ -0,0 +1,144 @@ +# Storage details + +Here I tried to describe the current state of thinking about our storage subsystem as I understand it. Feel free to correct me. Also, I tried to address items from Heikki's TODO and be specific on some of the details. + +## Overview + +![storage](images/storage.jpeg) + +### MemStore + +MemStore holds the data between `latest_snapshot_lsn` and `latest_lsn`. It consists of PageIndex that holds references to WAL records or pages, PageStore that stores recently materialized pages, and WalStore that stores recently received WAL. + +### PageIndex + +PageIndex is an ordered collection that maps `(BufferTag, LSN)` to one of the following references (by reference I mean some information that is needed to access that data, e.g. file_id and offset): + +* PageStoreRef -- page offset in the PageStore +* LocalStoreRef -- snapshot_id and page offset inside of that snapshot +* WalStoreRef -- offset (and size optionally) of WalRecord in WalStore + +PageIndex holds information about all the pages in all incremental snapshots and in the latest full snapshot. If we aren't using page compression inside snapshots we actually can avoid storing references to the full snapshot and calculate page offsets based on relation sizes metadata in the full snapshot (assuming that full snapshot stores pages sorted by page number). 
However, I would suggest embracing page compression from the beginning and treating all pages as variable-sized.
+
+We assume that PageIndex is a few orders of magnitude smaller than the addressed data, hence it should fit in memory. We also don't care about crash tolerance as we can rebuild it from snapshot metadata and WAL records from WalStore and/or Safekeeper.
+
+### WalStore
+
+WalStore is a queue of recent WalRecords. I imagine that we can store recent WAL the same way as Postgres does -- as 16MB files on disk. On top of that, we can add some fixed-size cache that would keep some amount of segments in memory.
+
+For now, we may rely on the Safekeeper to safely store that recent WAL. But generally, I think we can pack all S3 operations into the page server so that it would also be responsible for the recent WAL pushdown to S3 (and Safekeeper may just delete WAL that was confirmed as S3-durable by the page server).
+
+### PageStore
+
+PageStore is storage for recently materialized pages (or in other words a cache of getPage results). It can also be implemented as a file-based queue with some memory cache on top of it.
+
+There are a few possible options for PageStore:
+
+a) we just add all recently materialized pages there (so several versions of the same page can be stored there) -- that is more or less how it happens now with the current RocksDB implementation.
+
+b) overwrite older pages with the newer pages -- if there is no replica we probably don't need older pages. During page overwrite, we would also need to change PageStoreRef back to WalStoreRef in PageIndex.
+
+I imagine that newly created pages would just be added to the back of PageStore (again in queue-like fashion) and this way there wouldn't be any meaningful ordering inside of that queue. When we are forming a new incremental snapshot we may prohibit any updates to the current set of pages in PageStore (giving up on the single page version rule) and cut off that whole set when snapshot creation is complete.
+
+With option b) we can also treat PageStore as an incomplete incremental snapshot.
+
+### LocalStore
+
+LocalStore keeps the latest full snapshot and a set of incremental snapshots on top of it. We add new snapshots when the number of changed pages grows bigger than a certain threshold.
+
+## Granularity
+
+By granularity, I mean the set of pages that goes into a certain full snapshot. The following things should be taken into account:
+
+* can we shard big databases between page servers?
+* how much time will we spend applying WAL to access certain pages with older LSNs?
+* how many files do we create for a single database?
+
+I can think of the following options here:
+
+1. whole database goes to one full snapshot.
+   * +: we never create a lot of files for one database
+   * +: the approach is quite straightforward, moving data around is simple
+   * -: cannot be sharded
+   * -: long recovery -- we always need to recover the whole database
+2. table segment is the unit of snapshotting
+   * +: straightforward for sharding
+   * +: individual segment can be quickly recovered with sliced WAL
+   * -: full snapshot can be really small (e.g. when the corresponding segment consists of a single page) and we can blow up the number of files. Then we would spend eternity in directory scans and the amount of metadata for sharding can also be quite big.
+3. range-partitioned snapshots -- snapshot includes all pages between [BuffTagLo, BuffTagHi] mixing different relations, databases, and potentially clusters (albeit from one tenant only).
When full snapshot outgrows a certain limit (could be also a few gigabytes) we split the snapshot in two during the next full snapshot write. That approach would also require pages sorted by BuffTag inside our snapshots. + * +: addresses all mentioned issues + * -: harder to implement + +I think it is okay to start with table segments granularity and just check how we will perform in cases of lots of small tables and check is there any way besides c) to deal with it. + +Both PageStore and WalStore should be "sharded" by this granularity level. + +## Security + +We can generate different IAM keys for each tenant and potentially share them with users (in read-only mode?) or even allow users to provide their S3 buckets credentials. + +Also, S3 backups are usually encrypted by per-tenant privates keys. I'm not sure in what threat model such encryption would improve something (taking into account per-tenant IAM keys), but it seems that everybody is doing that (both AMZN and YNDX). Most likely that comes as a requirement about "cold backups" by some certification procedure. + +## Dynamics + +### WAL stream handling + +When a new WAL record is received we need to parse BufferTags in that record and insert them in PageIndex with WalStoreRef as a value. + +### getPage queries + +Look up the page in PageIndex. If the value is a page reference then just respond with that page. If the referenced value is WAL record then find the most recent page with the same BuffTag (that is why we need ordering in PageIndex); recover it by applying WAL records; save it in PageStore; respond with that page. + +### Starting page server without local data + +* build set of latest full snapshots and incremental snapshots on top of them +* load all their metadata into PageIndex +* Safekeeper should connect soon and we can ask for a WAL stream starting from the latest incremental snapshot +* for databases that are connected to us through the Safekeeper we can start loading the set of the latest snapshots or we can do that lazily based on getPage request (I'd better avoid doing that lazily for now without some access stats from the previous run and just transfer all data for active database from S3 to LocalStore). + +### Starting page server with local data (aka restart or reboot) + +* check that local snapshot files are consistent with S3 + +### Snapshot creation + +Track size of future snapshots based on info in MemStore and when it exceeds some threshold (taking into account our granularity level) create a new incremental snapshot. Always emit incremental snapshots from MemStore. + +To create a new snapshot we need to walk through WalStore to get the list of all changed pages, sort it, and get the latest versions of that pages from PageStore or by WAL replay. It makes sense to maintain that set in memory while we are receiving the WAL stream to avoid parsing WAL during snapshot creation. + +Full snapshot creation can be done by GC (or we can call that entity differently -- e.g. merger?) by merging the previous full snapshot with several incremental snapshots. + +### S3 pushdown + +When we have several full snapshots GC can push the old one with its increments to S3. + +### Branch creation + +Create a new timeline and replay sliced WAL up to a requested point. When the page is not in PageIndex ask the parent timeline about a page. Relation sizes are tricky. + +## File formats + +As far as I understand Bookfile/Aversion addresses versioning and serialization parts. 
+ +As for exact data that should go to snapshots I think it is the following for each snapshot: + +* format version number +* set of key/values to interpret content (e.g. is page compression enabled, is that a full or incremental snapshot, previous snapshot id, is there WAL at the end on file, etc) -- it is up to a reader to decide what to do if some keys are missing or some unknow key are present. If we add something backward compatible to the file we can keep the version number. +* array of [BuffTag, corresponding offset in file] for pages -- IIUC that is analogous to ToC in Bookfile +* array of [(BuffTag, LSN), corresponding offset in file] for the WAL records +* pages, one by one +* WAL records, one by one + +It is also important to be able to load metadata quickly since it would be one of the main factors impacting the time of page server start. E.g. if would store/cache about 10TB of data per page server, the size of uncompressed page references would be about 30GB (10TB / ( 8192 bytes page size / ( ~18 bytes per ObjectTag + 8 bytes offset in the file))). + +1) Since our ToC/array of entries can be sorted by ObjectTag we can store the whole BufferTag only when realtion_id is changed and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset delatas would be small). +2) It makes sense to keep ToC at the beginning of the file to avoid extra seeks to locate it. Doesn't matter too much with the local files but matters on S3 -- if we are accessing a lot of ~1Gb files with the size of metadata ~ 1Mb then the time to transfer this metadata would be comparable with access latency itself (which is about a half of a second). So by slurping metadata with one read of file header instead of N reads we can improve the speed of page server start by this N factor. + +I think both of that optimizations can be done later, but that is something to keep in mind when we are designing our storage serialization routines. + +Also, there were some discussions about how to embed WAL in incremental snapshots. So far following ideas were mentioned: +1. snapshot lsn=200, includes WAL in range 200-300 +2. snapshot lsn=200, includes WAL in range 100-200 +3. data snapshots are separated from WAL snapshots + +Both options 2 and 3 look good. I'm inclined towards option 3 as it would allow us to apply different S3 pushdown strategies for data and WAL files (e.g. we may keep data snapshot until the next full snapshot, but we may push WAL snapshot to S3 just when they appeared if there are no replicas). diff --git a/docs/rfcs/011-retention-policy.md b/docs/rfcs/011-retention-policy.md new file mode 100644 index 0000000000..fde36c8108 --- /dev/null +++ b/docs/rfcs/011-retention-policy.md @@ -0,0 +1,91 @@ +# User-visible timeline history + +The user can specify a retention policy. The retention policy is +presented to the user as a PITR period and snapshots. The PITR period +is the amount of recent history that needs to be retained, as minutes, +hours, or days. Within that period, you can create a branch or +snapshot at any point in time, open a compute node, and start running +queries. Internally, a PITR period is represented as a range of LSNs + +The user can also create snapshots. A snapshot is a point in time, +internally represented by an LSN. The user gives the snapshot a name. 
+ +The user can also specify an interval, at which the system creates +snapshots automatically. For example, create a snapshot every night at +2 AM. After some user-specified time, old automatically created +snapshots are removed. + + Snapshot Snapshot + PITR "Monday" "Tuesday" PITR + ----######----------+-------------+-------------######> + +If there are multiple branches, you can specify different policies or +different branches. + +The PITR period and user-visible snapshots together define the +retention policy. + +NOTE: As presented here, this is probably overly flexible. In reality, +we want to keep the user interface simple. Only allow a PITR period at +the tip of a branch, for example. But that doesn't make much +difference to the internals. + + +# Retention policy behind the scenes + +The retention policy consists of points (for snapshots) and ranges +(for PITR periods). + +The system must be able to reconstruct any page within the retention +policy. Other page versions can be garbage collected away. We have a +lot of flexibility on when to perform the garbage collection and how +aggressive it is. + + +# Base images and WAL slices + +The page versions are stored in two kinds of files: base images and +WAL slices. A base image contains a dump of all the pages of one +relation at a specific LSN. A WAL slice contains all the WAL in an LSN +range. + + + | + | + | + | --Base img @100 + + | | + | | WAL slice + | | 100-200 + | | + | --Base img @200 + + | | + | | WAL slice + | | 200-300 + | | + | + + | + V + + +To recover a page e.g. at LSN 150, you need the base image at LSN 100, +and the WAL slice 100-200. + +All of this works at a per-relation or per-relation-segment basis. If +a relation is updated very frequently, we create base images and WAL +slices for it more quickly. For a relation that's updated +infrequently, we hold the recent WAL for that relation longer, and +only write it out when we need to release the disk space occupied by +the original WAL. (We need a backstop like that, because until all the +WAL/base images have been been durably copied to S3, we must keep the +original WAL for that period somewhere, in the WAL service or in S3.) + + +# Branching + +Internally, branch points are also "retention points", in addition to +the user-visible snapshots. If a branch has been forked off at LSN +100, we need to be able to reconstruct any page on the parent branch +at that LSN, because it is needed by the child branch. If a page is +modified in the child, we don't need to keep that in the parent +anymore, though. diff --git a/docs/rfcs/012-background-tasks.md b/docs/rfcs/012-background-tasks.md new file mode 100644 index 0000000000..8692b187e6 --- /dev/null +++ b/docs/rfcs/012-background-tasks.md @@ -0,0 +1,38 @@ +# Eviction + + Write out in-memory layer to disk, into a delta layer. + +- To release memory +- To make it possible to advance disk_consistent_lsn and allow the WAL + service to release some WAL. + +- Triggered if we are short on memory +- Or if the oldest in-memory layer is so old that it's holding back + the WAL service from removing old WAL + +# Materialization + +Create a new image layer of a segment, by performing WAL redo + +- To reduce the amount of WAL that needs to be replayed on a GetPage request. +- To allow garbage collection of old layers + +- Triggered by distance to last full image of a page + +# Coalescing + +Replace N consecutive layers of a segment with one larger layer. 
+
+- To reduce the number of small files that need to be uploaded to S3
+
+
+# Bundling
+
+Zip together multiple small files belonging to different segments.
+
+- To reduce the number of small files that need to be uploaded to S3
+
+
+# Garbage collection
+
+Remove a layer that's older than the GC horizon, and isn't needed anymore.
diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md
new file mode 100644
index 0000000000..0c359028ed
--- /dev/null
+++ b/docs/rfcs/013-term-history.md
@@ -0,0 +1,147 @@
+# What
+
+Currently, apart from the WAL, a safekeeper persistently stores only two
+logical clock counter (aka term) values, sourced from the same sequence. The
+first is bumped whenever the safekeeper gives its vote to a proposer (or
+acknowledges an already elected one) and e.g. prevents electing two proposers
+with the same term -- it is actually called `term` in the code. The second,
+called `epoch`, reflects the progress of log receipt and might lag behind
+`term`; a safekeeper switches to epoch `n` when it has received all committed
+log records from all `< n` terms. This roughly corresponds to what was
+proposed in
+
+https://github.com/zenithdb/rfcs/pull/3/files
+
+
+This is our biggest difference from Raft. In Raft, every log record is
+stamped with the term in which it was generated, while we essentially store in
+`epoch` only the term of the highest record on this safekeeper -- when we know
+it -- because during recovery we generally don't, and `epoch` is bumped directly
+to the term of the proposer who performs the recovery when it is finished. It is
+not immediately obvious that this simplification is safe. I thought and I still
+think it is; model checking confirmed that. However, some details now make me
+believe it is better to keep the full term switching history (which is
+equivalent to knowing the term of each record).
+
+# Why
+
+Without knowing the full history (list of pairs) of terms it is hard to
+determine the exact divergence point, and if we don't perform truncation at
+that point, safety becomes questionable. Consider the following history, with
+safekeepers A, B, C, D, E. n_m means a record created by a proposer in term n
+with LSN m; (t=x, e=y) means the safekeeper currently has term x and epoch y.
+
+1) P1 in term 1 writes 1.1 everywhere, which is committed, and some more only
+on A.
+
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=1, e=1) 1.1
+D(t=1, e=1) 1.1
+E(t=1, e=1) 1.1
+
+ +2) P2 is elected by CDE in term 2, epochStartLsn is 2, and writes 2.2, 2.3 on CD: + +
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=2, e=2) 1.1 2.2 2.3
+D(t=2, e=2) 1.1 2.2 2.3
+E(t=2, e=1) 1.1
+
+ + +3) P3 is elected by CDE in term 3, epochStartLsn is 4, and writes 3.4 on D: + +
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=3, e=2) 1.1 2.2 2.3
+D(t=3, e=3) 1.1 2.2 2.3 3.4
+E(t=3, e=1) 1.1
+
+
+Now, A gets back and P3 starts recovering it. How should it proceed? There are
+two options.
+
+## Don't try to find divergence point at all
+
+...start sending WAL conservatively from the horizon (1.1), and truncate the
+obsolete part of WAL only when recovery is finished, i.e. epochStartLsn (4) is
+reached, i.e. 2.3 is transferred -- that's what https://github.com/zenithdb/zenith/pull/505 proposes.
+
+Then the following is possible:
+
+4) P3 moves one record 2.2 to A.
+
+
+A(t=1, e=1) 1.1 2.2 1.3 1.4
+B(t=1, e=1) 1.1 1.2
+C(t=3, e=2) 1.1 2.2 2.3
+D(t=3, e=3) 1.1 2.2 2.3 3.4
+E(t=3, e=1) 1.1
+
+
+Now the log of A is basically corrupted. Moreover, since ABE are all in epoch 1
+and A's log is the longest one, they can elect P4, who will commit such a log.
+
+Note that this particular history couldn't happen if we forbade *creating* new
+records in term n until a majority of safekeepers has switched to it. That would
+force CDE to switch to 2 before 2.2 is created, and A could never become the
+donor while its log is corrupted. Generally, with this additional barrier I
+believe the algorithm becomes safe, but
+ - I don't like this kind of artificial barrier;
+ - I also feel somewhat uncomfortable about even temporarily having
+   intentionally corrupted WAL;
+ - I'd still model check the idea.
+
+## Find divergence point and truncate at it
+
+Then step 4 would delete 1.3 1.4 on A, and we are ok. The question is, how do we
+do that? Without the term switching history we have to resort to sending again
+from the horizon and memcmp'ing records, which is inefficient and ugly. Or we
+can maintain the full history and determine the truncation point by comparing
+the 'wrong' and 'right' histories -- much like pg_rewind does -- and perform
+truncation + start streaming right there (see the sketch below).
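+
+A rough sketch of that comparison, assuming the history is kept as hypothetical
+(term, begin LSN) pairs (not the actual safekeeper code):
+
+```rust
+// Hypothetical representation: term `term` starts at `begin_lsn`.
+struct TermSwitch { term: u64, begin_lsn: u64 }
+
+/// Find the LSN at which our history diverges from the donor's ("right")
+/// history. WAL before this point is identical on both sides; everything at
+/// or after it must be truncated before streaming resumes from there.
+fn divergence_point(mine: &[TermSwitch], right: &[TermSwitch]) -> Option<u64> {
+    for (a, b) in mine.iter().zip(right.iter()) {
+        if (a.term, a.begin_lsn) != (b.term, b.begin_lsn) {
+            // Both logs are still on the common previous term until the
+            // earlier of the two switch points; records after that differ.
+            return Some(a.begin_lsn.min(b.begin_lsn));
+        }
+    }
+    // One history is a prefix of the other: the first extra switch on either
+    // side bounds the WAL that is known to be common.
+    match mine.len().cmp(&right.len()) {
+        std::cmp::Ordering::Less => Some(right[mine.len()].begin_lsn),
+        std::cmp::Ordering::Greater => Some(mine[right.len()].begin_lsn),
+        std::cmp::Ordering::Equal => None, // identical histories
+    }
+}
+```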
+
+# Proposal
+
+- Add the term history as an array of (term, start LSN) pairs to the safekeeper
+  controlfile.
+- Return it to the proposer with VoteResponse so that 1) the proposer can tell
+  it to other nodes and 2) it can determine its personal streaming starting
+  point. However, since we don't append WAL and update the controlfile
+  atomically, let's always update the controlfile first but send only the
+  history of what we really have (up to the highest term in the history where
+  begin_lsn >= end of wal; this highest term replaces the current `epoch`). We
+  also send the end of wal, as we do now, to determine the donor.
+- Create a ProposerAnnouncement message which the proposer sends before starting
+  streaming. It announces the proposer as elected and
+  1) Truncates the wrong part of WAL on the safekeeper
+     (the divergence point is already calculated at the proposer, but can be
+     cross-verified here).
+  2) Communicates the 'right' history of its term (taken from the donor). It
+     seems better to immediately put the history in the controlfile, though the
+     safekeeper might not have full WAL for the previous terms in it -- this way
+     is simpler, and we can't update WAL and the controlfile atomically anyway.
+
+  This also constitutes an analogue of the current epoch bump for those
+  safekeepers which don't need recovery, which is important for
+  sync-safekeepers (bump the epoch without waiting for records from the new
+  term).
+- After ProposerAnnouncement the proposer streams WAL from the calculated
+  starting point -- only what is missing.
+
+
+pros/cons:
++ (more) clear safety of WAL truncation -- we get very close to Raft
++ no unnecessary data sending (faster recovery for not-oldest safekeepers;
+  matters only for 5+ nodes)
++ adds some observability at safekeepers
+
+- complexity, but not that much
+
+
+# Misc
+
+- During model checking I did truncation on the first locally non-existent or
+  different record -- an analogue of the 'memcmp' variant described above.
diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md
new file mode 100644
index 0000000000..fdf6885929
--- /dev/null
+++ b/docs/rfcs/README.md
@@ -0,0 +1,95 @@
+This directory contains Request for Comments documents, or RFCs, for
+features or concepts that have been proposed. Alternative names:
+technical design doc, ERD, one-pager.
+
+To make a new proposal, create a new text file in this directory and
+open a Pull Request with it. That gives others a chance and a forum
+to comment on and discuss the design.
+
+When a feature is implemented and the code changes are committed, also
+include the corresponding RFC in this directory.
+
+Some of the RFCs in this directory have been implemented in some form
+or another, others are on the roadmap, and still others are obsolete
+and forgotten about. So read them with a grain of salt, but hopefully
+even the ones that don't reflect reality give useful context
+information.
+
+## What
+
+We use Tech Design RFCs to summarize what we are planning to
+implement in our system. These RFCs should be created for large or
+non-obvious technical tasks, e.g. changes of the architecture or bigger
+tasks that could take over a week, or changes that touch multiple
+components or their interaction. RFCs should fit into a couple of
+pages, but could be longer on occasion.
+
+## Why
+
+We’re using RFCs to enable early review and collaboration, reduce
+uncertainty and risk, and save time during the implementation phase
+that follows the Tech Design RFC.
+
+Tech Design RFCs also aim to reduce the bus factor and are an additional
+measure to keep more peers up to date & familiar with our design and
+architecture.
+
+This is crucial for ensuring collaboration across timezones and for
+setting a distributed team that works on complex topics up for success.
+
+## Prior art
+
+- Rust: [https://github.com/rust-lang/rfcs/blob/master/0000-template.md](https://github.com/rust-lang/rfcs/blob/master/0000-template.md)
+- React.js: [https://github.com/reactjs/rfcs/blob/main/0000-template.md](https://github.com/reactjs/rfcs/blob/main/0000-template.md)
+- Google fuchsia: [https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE](https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE)
+- Apache: [https://cwiki.apache.org/confluence/display/GEODE/RFC+Template](https://cwiki.apache.org/confluence/display/GEODE/RFC+Template) / [https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process](https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process)
+
+## How
+
+RFC lifecycle:
+
+- An RFC should be submitted in a pull request, with the full RFC text in a committed markdown file and a copy of the Summary and Motivation sections also included in the PR body.
+- An RFC should be published for review before most of the actual code is written. This isn’t a strict rule; don’t hesitate to experiment and build a POC in parallel with writing an RFC.
+- Add labels to the PR in the same manner as you do for Issues. Example TBD
+- Request the review from your peers. Reviewing the RFCs from your peers is a priority, same as reviewing the actual code.
+- The Tech Design RFC should evolve based on the feedback received, and further during the development phase if problems are discovered with the chosen approach.
+- RFCs stop evolving once consensus is found or the proposal is implemented and merged.
+- RFCs are not intended as documentation that’s kept up to date **after** the implementation is finished. Do not update the Tech Design RFC when the merged functionality evolves later on. In such a situation a new RFC may be appropriate.
+
+### RFC template
+
+Note: a lot of the sections are marked as ‘if relevant’. They are included in the template as a reminder and to help inspiration.
+
+```
+# Name
+Created on ..
+Implemented on ..
+
+## Summary
+
+## Motivation
+
+## Non Goals (if relevant)
+
+## Impacted components (e.g. pageserver, safekeeper, console, etc)
+
+## Proposed implementation
+
+### Reliability, failure modes and corner cases (if relevant)
+
+### Interaction/Sequence diagram (if relevant)
+
+### Scalability (if relevant)
+
+### Security implications (if relevant)
+
+### Unresolved questions (if relevant)
+
+## Alternative implementation (if relevant)
+
+## Pros/cons of proposed approaches (if relevant)
+
+## Definition of Done (if relevant)
+
+```
diff --git a/docs/rfcs/images/storage.jpeg b/docs/rfcs/images/storage.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..1d72a018dc462a74ad01bb17561c98efd0745bca
GIT binary patch
literal 431075
zg7UF3b6no;ZjF7UZM=%t_od07ck%p2Xb0VA}mhQMod%VfWoEc>sgcwTtAN|(|Mo)$M z&cW9ZCnAvY3V(VsV7owdUJK*VkLU1pZw3dXUMM|Y9NGH(bQkzmKO2QMF;b!xGQ`#x z?4L=5ItizH|88(sCr*eO2a+J*Ld?df!cBK@9YfET%j6+Fd;b z3*=}#HKgy;T}{;$rNheHME*JU3qi$hc#<_+H=24Lmx=>Wy7c4Jg$Alk$%GE4s$oLY zs-J{Xa~hb9oEhIts*yJlzVzzSl?kTfBp@;p8d}n5QyA*2cLyF4Q#1zi0-s8SbUHTjCkrgr6u(;wXco5 z&!u1F+G)4;CZ1ugOifj$<3c7E;yymT6Riv<2hF)JHm7czU`HeR(=bYsUyjFdMk<;6 zx*a%3JgYiRd1->dm%%2A65Mz$6q92(wQrkYGm-eX< zw?dTgwQV_k$^Damvd`44eZ$UYGqU@t+$h^KY%fC&dU+&BAfxih#8ekSDk^a?QuOHS z-|q<9cPB1{YkXArah(5Rtf#_Zm9S3MsG}CW1q#qxE`+%-x=ov?IWb12&zJ2qW;^|h z?2rC%_$Z^MEVm!1T_!CaVU0q49sT93)a`hvkUY%VB_XACx+&@H+bN+rN#2upOEW*1Xr@+GmbHNqY51OJG=>N=xzW~T~+EDj3^5*1>8e&i}xyDK%Yj6|k~VA}tVFo~bA$iw91Kbc0LH+jzVDV_ACrt%)DJX+?-)oW~ldWEY3_!io3N4sB9U zWfWJwV*xsq7E*Z*Gfz&pIca3eH^4^M&M}68AnZ(Go#;+=usWw7lG=bH3!=oMe@GR6 zAOvok7QC1IGLq~?R43}MrWqSr&4_IHBI{eaIY;8*eTCHQof?NiJ3Yb-(QR(&1Q*sd zXKCP3xc+?G;BR*U{}E7$K-aPBnL{QJsV!)-$hL4YYx)Ly1<}0rbE&{pYiRq7 z{YYVIo7Mcg@D5-TJyS+1qiYd5BjIEOU4cN&z+GvY!};u_uM~t921D}Ea?3zVP=3qW zqty|y4&YD2Mh*l9UWi%wMbv9rC_}`R4yE=vUmpS#LDbZpnt`8cPhxh0W=rM^RYT`R z-|6@Hz=?r4FEl3@vvIk0C#T)Bnn?87nAM5PzW%~qW%knP0G9*&9=7X3JfQWeVc!w` zB1DRjIg}WDGZrk>@1l&JH+c;m_H*R%IQ7fdJ)bACOMNWL038RN&omw4J3Zn#y~tz{ zhJ1Z8*Opgq8P2;h1s(fE^Gz?}T6yZ%t|-(FwLi`XO(GzC7X+x)*+I{I#VuPMXJ+QA zKadkmnC{RAK0V6L@_~Os%vLbiOUcMtwthWVfp+=Da3fg?_Zd~cYXg@LRjDi3aMZK# zqzYuPQaD-rLZYD%5$RM16+ZWOrzg6Ig2@zIDjyNaxjYroqpK^fkv@{Q;BjB5H2kdK zJ%Fqg(_`( z$yET?DPGM}cCdg4{3ibG-Jc{WtCMQl=3mE5l^qZ-Pi|SW)nMP@s1`M`y8xS!mHT3i zbl8){e#eDem5=rv8~(mmf;1mh#;Ls+q;j~A4MTrpTncUNh+iPt-?sTbT*pSNs+DF zFY^|B<_2FjJYo{|@mNlS$-k7z=!iZBfbu=i{WN)zR&ua_3;ddVdFYJwa^3;o5~&z- z$>K}fcU? zq@(Q!{B^?rI^lnv@c$QWRm;e#+|>swKG{^IeyY&^_4tZ+Yx8(fK)s8Frane51XdoI_EEtn`?Ngu9rQ_d9Ltwy(3XVt*i2VN$mawl<7iZ zf)NSu1lblhdxL}+6DVy^z8V9Y`h{i#gRJx%aVcf%*x~19;xfe!C47f7PJcdbHe{6Z z2Cdt{#u#-)VpZt-!DK42OPXn?u!$)B%|&AZJ5b>_#cHK)Zl$R+cr|s>%r5+pEsi0LPO|P$z4bL4GLs9Bpi`!fvjA=b+-K6-T-=Bi7*V*B^Oa*v(2cNdk5f*c;sO-c=2 zr-rVK2EA898N9d^LNl%_JQvlwv{N|z{qvu&b@cU?AtGU^Zx#qzb*F7pYC4=%yIP2x zSeU=7z|yqN3GdrH`AgZJqCoV*#q+$EvVOxuHV=NDJ6ZZ9(*&9|M^$(+dP`TA-yBDo`f})@)BhzZS5izZ0J&#=i9Enzu z>Q?5tq&s6K{&Dv1p(-2-7{qIptgudCt&A4<8qxK~=oWq-A=}XsSnlrjm)XAH%!$)m zk*!PD!u@DSe?8X(T-yce34K-xI7x1GH+E^WnD>m zo!1%7->C^lkrldnHRQz}I$-KU?@r&y*&b^JgzxjJ5Fn3qII4%~% zM+3~Zh0yA|gq;>40;M2ZRX^#r^MN3D+WYlWPqXrG+rJO`acD-J=`O8@E*_bN>&?!6k4Kjh=V2W!nE4yt8>SN*vjQ>l;Z#!tjzaVcle0r==?; z@WsuF&a&ZWzE93?Pz}t64G(I(`4o66<;pd2Yjct+5W4jxS!YL!98$bkfn;`F_#TqS z+}eneFh%+md$ z9<&I?4n&h{5>SR<w~1@JXlhN~zx1ODAu(sX5#1|^NY zbUym=_Y8UL$n08oz7_lO^_g#W5s4Yq6%yLTNo%QaA-)pflvX~?^^QZp2tOSQR+V5U z84@7k867tD8_Dur+}*_)oJH%cG^c-l`$56P4d$qwQeh^Vd-2ZZi72O?D!e`fg)Ag) z^zR};mW^9U}JSF*y3fb1qKFY!R2Nul=VZd$wSfGVZ;P`!x%yf({IyS$ZF~7SOtvI z1Vie(o`_b**3n(yp@Pr*+J^Es3Crp4;D^E#`1dKm3&5jL0xax7r@Qi_gu`m~;@64` zMA=!&_iVbIW3A-BB7SdP)MWi)?!A&`-Yc)P0gMN@O}juQ_6NlgIooCTFxNZY$ld7r zcj^62>)4T}s+OM{^zCOv2G0!5&D?0`2`}PF-gpEyEfS(}>^cZ(x;}W}1^vWSRkcj? zQRxn|G3LzLam}8zl>~j+i@(KWn2gz+$VlPu3>olB3HZgHYEoSC-7zkn^~Fu{%q58m zr<^FWDUVv$$M*LISw7tm`~)EMce1d1bGuFAzR-&k#P7{5SCfRATL5aI`<%o5>Y|G# z{N=-vfrsC{&u;PcnPbzp0jkI&=>#F2^x5tn9IS=``yL|A@%~ri>qc68flpIwM!&#i z0nvmwU+KdYvX`&8nJ(;ufY)yEOD{T_Aq(DqiqQvQ)pXNlTHuTQ4=RG6E&|!^23spC2|bT+Om~ zmu`K6%@X^GNd~T~SBfz#jUc>w0aU*@wy?3-&zFlD2#`rK?L@iW18$imHQw1g{S?t! 
zmnUvpcSd<^$;-+(r`@?!7ABs!f3dj3d&Pd9gFwIW)DkVDB+uJ&a!S^mQc)ooQfz3IjXB|ShBK(OqlM|QSit*bKjyHHV5HwzfORzI1?!CN6(4p6T^x-((((0 z&AhJOuqqoVCjiOOi>a}7o`S8P(z%pL#uqdJL@XQ`c;Mhj^VLU+*Yulpdt_te_|gU%4FMkKt5Q7&l@Q$WX`$@wfm)-FQ}nHE;gTsCE>-;% zyEn9aU0P7?2qA#sj}6*XzD+ zb{OL$MIdVYmv*KLVI@z*11@37%6$9Uvng@VYuSMdjo(Hky%X2e9(&od=w4YovSYQ^ z8p3a12Z8owe`Z{Oo`)#dXI;bt@jnpZpYts+{%oCnyLsHDx)1c~oyF>5=gpG>=X|In zP};7gC5zgr3_zS1k;wCG%oO9C-v-(w<@s~8&}@D{Uo%crsh_xHbLX~8!TH`m z>TxdU4aZ%6S1pH^i9k$UiF5!K)AWs{lDrnvG0t>%$1#HRQ*#qwk<`INBV!pmLm&eh^PM4W5+F z>f5HlIw2mz02wO=)>4n?jNccO+yvvup%rclX|rJTH2l<$lB)@jd-QvAiA&!>tkBWoi7GhyBB7`K0OkvCbHV``nQXi*kVqLINc{GJ zF-bww0F2~*WUj_bcN?7qR{KS9f2w}6B_(OJmcvYX)K8mts@Dwyd-O4=oxp-&+N3&h zxrx!!9xkXJl3>39JLf=W#W{5)uI7EEr^xnut3nDcng+*xp+>TON*;Nh?cP+x#EU%% zet%Y-f-m^gvg=>}mR0*D0|fyP3n5iFehf`)Gld-o1dY$FwV#Vswy$P>V_xO$L;h}e zVTeO|cZIDSrS>*8QTHftVZ|4@(E~gbHCAtKFsy;5Z9B5TyRu*n!djsT5iF9^adYva zgbO{m7s?g(@nVQ{xBOfXRmUvTk}qz!R8Az>`fP0Pz-0d6s!drNXk@iO0caWksEqv}ZQ0Z>AB3y#feg_Fb(Gs3;d(;V{4vP8)t)#(|$SVOWkNC0e z^K=;S4)9;Ziui(vEdr1`iby8(L-*)0OoX#w93V?iwcFu+G&m}HaG|58B*3B+|r` zwcj=M7qD4tNp}QQQcUah+rBpd`nBhSa*EcG?k!I(4H_@jG(nt4SS0p>xu!rOWQnC|ge zUqH2y&;Uoeoo^ocErLKAdZJA;gKq(_!2I~<@D=`fQ5-!K3#hMi*-yWNQo&eOnoW7A z&mjQ9EjR#f_^RM30(}Q?R8;;4uHmLX@N`%4FE&f8v!>v5v+7amii|1Mxsc$-C*Vj1 zPaMxYe~+^I86S6GPW6{8TSXCs8=ycpNKo%5ZJh1eMbP_dX59CX{nFH+lOmhwUKvm-qP6K{ljJA2*&sWJb%ci^c2C{`R)Yc_ z>kTjV<=EbW_!BNMo&Q-|fzd!*MFw9!1aiE zBYx1M1kHmah*zyd^6n^ zKy;d?k80-?R|vyoGRGY8Y;#~^<-W0?%JF*PszmhPMD5gtI*z4rN4qWV+DBFoOsqPw zJafq9st0;0P9hlOo63^CRKhCYGom8 z`pvZ~JJ}gBgJgWnGRvntOupfuW^10Vz3ysK{4=xoROEPTdjm87$>kBv(&EAs0VOB0 zPI%OKO%PY}#C&by6v=3rEuE%E`J(Q|Lipl2@&%A#rv5jfN-};$#l~j+;*{YP!~4^1 z!%k*_l)w><#%Q!5XbD_c@j0rl2vM9u!8(<2ukRB}$?6YD0;^1k)06sL?-N0sSCL6I z575jpj7)SHg#w9jnW;WelZ~q-+q(jY@|OZ$SEY0v3tV1oz)2vqju22IhAA3nUpl=; z5X3n3nmdS1b?VNZYpm&Dyv}{9E0K&ynN%#uMRs7-YC(f0JTz9oVMyBt@KXyWEVL*s zc_owYX_{XBf(C~oXe42D0UFpBCUeW;@@MvawM`XcQmmFp8CxSnkd(nRvYs#D-6YmlH18A!xk@qh|Rq z)H!dq&8$arO~hj2^28nyFIzo^Nj;s{b_y*F$%_c%QaJKltHx2ESvV?`)d^8MxC&wI zV8NFM@%QVySP1ke?vV=KS6P7yto3IAnHY5R$uMrw9zX!R*R^!n2cJ9uW-*X`v?UZ! 
zK=G&4&Dn7eRq+04OQC|;0s^BxcG-(50$~pi%i^2hrnTD;5lS_hm)^SI9-{4R{dDnw zWyA4yoG5L=!e!lYKd79`@l_NP3Htj zk<5oScXQTZa!ERcuq!!d&35@#ZM_K^deIb2vS^*H_^Vhjsx0z6p`U|pB;dllvAkSy z_lQN_z)bqr%hieVwgz85#=kIOX7iNXvh~e8Kj^4UN9}7@{IXg{!)r1>uE|1OgN0AW zBB(|=9(NH|?Sf~Dr2r-@ufVZUUF$d~&2s%~e54F|vQ-YsA z2fQN#h1++^iaa1@?g-3x)6H|{Xw&al*UL~obekZMmB#G5=_aEkz8TFUJ^|ESA0+_5 zw#*weGP0QuC*IuAoKxT%XK_QCIe6HtXQ$}xnh|+~LRjRwv74@MI`eH$$(UCuxC!kJ ze1B$#N9#-LN<$Wm@{%Z15P+efk{8ZTaN`Rv+49o7X?04tjj9*?nXm?-!R=|AJ~FqY zi@+F1uDp|ESYACu)Q7YC%QL7Usvkt)uB3~S2aO@mH;ESBpVg+DHgjlrjmr} zaTpc@_;hBS0G}Dr2cUMYX;}(HSzPi^?84DYY3SioznpcNFUtf?7z&1u8V4U=HrlqG zZ>Tp-_ zbWL7Hc`gEb=#PHoh#%hCG2p7k>D&e2`7}QomTzzX?pNsgAVEec7=ZRef%ZOfzcK<% z85H}C5d=h%cL_iz-6g<9f;0jCda^O`AKXmdr!Bj}XTxkBIu+dkuLRtt5@)&WGgxn= zW!?CN4k>FMaS6G5#vTbzGTEmat>}$u?*Ef_{->5n1O(%!+wgH?WAVX~QlJq1^@-<@ zuJbn0Kl+j5pCD<1&S4!y$pa+Nso5wzFR$1m#)P@-D*|2X zfq+bRU9Q|B`hRqlEY4T-KpP+3WObr}yMfQ-#0r;~e#H`Nj_k8|*@)CesIE>wO@h#`b0qXs!Rhy-E!X|B5_e=Z@M$AL>5i zKb_-vSa)y5?V|KCtI)~gJ?-EifCB;L5?+D>TCZCVSV?0YG*s+u89PMW(yYe5&RIWC zm0nEeVqgzmc^CY*Q1h4U*l*^KJO9D_@mHF>#e?aR00N75V#9x)O4rGRz|>RFbQ7wX2HSRQ%p)$#>>66U$HKIdbrZh!heXP*=3&;41_%!N)OaB^fnwH(s{0povc z5z6-K|ItZ6%kfV-iHQP&0$y#)kCx+GKxFpw`>r!Nl#e)gi%*2!ADr|-k!*Ji=*`U{ z<|+II8y%G^YA}+c)VBxUUpu%DB4ckHa;@C*TZEx+!@exp4cp-;-c`tk>q@SFQUuB- z!k2-Y0b%DKe8n%yi1!zr**aJUZ+riPMS)flh67u|F})&;fo=bxBP0K&aDRLy7)X$O z898vpvJ?ZE0tKv#BWcyEV!znU_KDhap)EVdmGAIhe5OVky+^I&Upci(|LfxP zxtS0B-j!;$XEj{Q%cGW1Ig$HhB|+*c4qDjPT1ZInlw%kH}w|H}{VrDIL|_ZJh>gve&-`l!Wx%a9Y2B_kR5yf1m*WasB*%ZpEBK<-{21M{D+N zB-8F?z5CNfjW}OoJ{>e!se5QgA-|>Xt#OlXs025g(PPLrtK#8e_pU9hv6vG=3>kBu@jFsK$9z9W8X2`OZv5}!Ke}1gapr06BonabqfJjYa z@Lr#rF&(NZE7153GR^A1h#q*OY1#*z!3-7}@9eTJhLNzA#=8q|v5`y>Cl*tav3z{! zmgxqQO2tr%3g@CR>Ym8S#D*g$f(i+xlPT?@q1#3`KurCKz>a2$;=*LkiU9p>)xP(9 zFJ%~yo?bsM-R4B3qO+!Hc&}Zue2;@gFYMyXR8;lmF8jlW>o+HNa+?x{R^VrG3a^q-iRQ`E_UnNz%{A*^ zdVM3clI=b`L!2it0BwGG`3ctZ7Y3APB%IWRb&0JxFWKKY$8eZq9c1Jn>ce^CFIUwrCZ9T;Q1{LF zlu8{pTZocv+LClkX+>FhO7`4@P2_7!-$%P6epwdQCl`IseKm=v^X=;l;u)zrBA&KS z&wtA?=6Z$5;JSZygHdQ)%GO|Xxp&(8ytD49J2753+@ZRWJ|M&h#=8g?v~A~qhSu}z9i=g%GS z0)^b8%$Fizw@|(JIotq}>0#l^=Jl|VmQ9l;N)Bc*hTt-lw7%hxE2WYEZ@XkidDMfQwf5dX#VpvA|Du2)m-v+h?~GaFRhW6xoD&Lo|fr47Ot2D1D{*{g@{byEi7$6UO8w%ICa8E|`=_+w$pZQcB9QZ*f7o zL@`Qufya}e7|}7>W}%{gtPq0?M^@H8Ca^i~_}J5h(X|k zUEq6qMnwtm?m-%YG{IJj20zd7_PFf;*tsO5A6+E(t0M?e3@4KZD&+t;`!Cx$4yqH0 z03-Zs2sbu+-sT0Sac;+#?>2h23P`cn8{5RJr9R20T;Awan`Bnl^reT<1eFvKtg1Fe+KPCk~1=8@d+UKXp^ms~(H zOC}-3_pG)l=veetwLd#!Yp=o7!9n@0z7+!y*WN3Ns-MHmzw-;o`e0|8&*N?ZY|Uca7ZhG7`!%utAp(N&J;k2_{eqT&*WQRa45MQQti4iRU%;7m`d z?5d4fDMm*&$Cr37CMPR5jwW=G0j!K;f5V;sdYi~a%Px)-mr7D35;Uu{B{I%6I^9ug zjY)Ksr3RRm!5fGZU7S3=xQVV1yN&vkQTpY}s%cgFEsP`MP#b@ORowOUZ#|qT`~u%W zjEdbCT~*2cOA_^a5&R!Y%3|enJciz@> zCk2CgWQvOnUiu{#>CEanA&iQ6bv=@jYpr|xpS?PLkv8-?hfYaQFA}uBsjg{wg4%EL z=OPSP#7x#BrJnp}fF>CG{dKe=xQD`xnTk1L1 zdjId_{`aE#FGgT@H^BtT3{R{J67y#A^?l>C7I|lN zl0tWONR_WEc>gyvn*OPV`Sn5iB~tv07pe77&9Bjwx}Qt=ur~uy8YL_)1sW0PuQh`X zqGFUS1NDbyHhkW8w|A-7M!o`;D7jF)T=}pXL>d_s@~L9okR$m+7r?U06cSyQYFYEf3sUYMj&xKClA?oq&m$jyMsPWrok zpUH;we}9ADtBPNgypngVsH2!kZ^BoJhnzkZy=C#q315lhE)y@qzJM1dV&QhM;O%*2 zq_`m*hXK_%YZo#H2szf$%jf-=q?)8-#OPN0&QS@gN)>|v z5rfqoQt=7ybkKgoKkaY)`N99+!lV12>ac%pSs;63{e(9_2LGZO!Li6T>nukdF5*uNQ(9s=0Nybx@j)un4 zH%GN@I7#nX=fcLi7`?~0h#T%PeQlJs@(ZU=(F7yT76XK%ENGurCy4P9jB@pgiKe_` zwX+ClM*gEWQcxA3gmMU$Tl?J?{h^xr>7qQ-!8`m*+|3ikuQ*{%KG;arI+z%{tc@`& zV3?|RtP`GIisO8R>aASxCRJU$L-qkk{Qh!OertbyRa(K56TmS%QNdYZ7vB2D^&Sk) zTiL^{Dx&yI9(EMpj^VaybMEHiNG96a`=`$%JrN2~nY}Ry-M|9~>Zs$6wCLdzMQr4} z4~8c^CRLed_(9U6=Vzo6cz3!%;MvXHFFZip{F6|aJuiYni#!~NR+0E(FmRXNHvvCK*_bG 
zYl*6pkdc=O-T9oVNq5_LnUjKvcGEgUFJMK19BfEh00~ApZ61GqR~bgK9y1l&{F*-X zbwKo|+wVHAVn368eirlvQMpxyfnCS)&kb@+`XV~AE~Y++L$>V*Z!kvEYYSDFn_t>_ zCeVqzeE;a<^Z>};;E#3JZ(sOd`i-mwTH}Mu>T1zGVcLh|859C007z$ddM&;YXgt0( z*Z*_l@z4iY6qxrf-T5bh{MT|cfDay^VVi0w1&?`2E7=Zn=%A@;)D=e}~gliHcTy<18)BWGep zmU%^C<A7l@9T6# zXlwd;d$c%By_dfzr9!*mi0H6(9tdEP`BNSE*9G_Ig$KF9TNInY_ONsChcv)s$qDDJ zh4*6tSN-WjhG7EwS2A8G8ZEQ%C@#E$R4=%ZUdy*g({CSE>oE&7tkBq(ajUkz5mvxH zh7gYifIK2tO#qq<2f%!Dqi_tvQQ7hGb2xxZ24ndi;|pOG0Q!x&5haBu!2!j@P#qvm ztV5_@0s~k&G6Ff=!DyoyfF}R$F7)uyM-e~3dO*Q01U!{;A_@SUqnqjApJ5#ctRxtx z1YXa&_Y)uxJ{NJ27t{x!lkabbA@`UtE(9u-?;u*0UzTy{->dldUi@1w{yWadLMpc4 zgcm1{E^C9+`KD=($=^OSwMp^tIXhTL7So_?OlpxC6Q|_!Ex;(Q!^-|n<~@5gKD+gz z{cCMYlJ5*+SPh*#p|=qC`}Ay2F5`qFbCHd0sl&_6A6;SK@Qomb~e=RHF%2@(3STL zU>4@vo^Q}c4$0tHOITBvz;S7?ws#kDx!ZMmyDMgC(D*PnafGzp0N z$P3sJz~u+Z*A5r7&tnm$V|xQ+T^jkTjA>J0wdd?HT;}0%Gwe~V1r#aQ$$Aw?K!F~A z=63&q$>H!ixryT}$70YLozOF)`RCxRD*0+IE;8KY&S~e(y4`vVUU1!)3s&(1?ZB{n z@J=V~`Rt8DYg9aQWygX`c&)ClT)TH=HnK~ytcR{{P2(1W21kX8C_sW;u<}O^NBGv( zRO`6ode86$f8QyS2Odvu>7?Ed1fBj&mKi{d&w}_dpHJ5ib^r#?-pX z#Hm{^7+_nfL;^jWS0GsFi@`*zRexg1|FN|FIwT<*MAi&jKN#u?or{N3Ay72<6mCr3K=X7Nn`5od%HqA)V&>}D zpvS0}5hr^@hf4qUhyOvKBfpD+Zjzw=7h<;~J%NgL(6yS0E%{4+68L+ZA$H9JS4Ege zZ{1nD?G9S4*XnEmGDbIllkX>%YPY-q)GDF@z6wfg+0FOk60yo$Br~!54cablENf*K zLW&ihzM`n9sTnu?JZWOq;o{=L+Ci^#@m4JTJp+ALpiEDu4odwC$69{k29{_JMibx* zs0;?HYj0--Fj@(u(SC_v1x@psEDe#u*PN9h>b6T4ciaX*et#Yik-flO!e=+pp^e(% z=j63fax<3xh)9l$8SUu}$U(M9_R*M)K5~8C@+IYu_qz%th*tlSTT2R#Xrh@zoA$a=J>+S_4o_=g?5w;@^IN5Q~7bBqn^;E<=$a{Po zd~uAx06>yT?g1-!5_Axlu;KY%(2`_K=3sfiGY^4xI2A64Ny20`6niC ze-xD7QFgPmpje|=4U&6D!WpAp(=1@5G25Xk?}Q|Q0l;D7WR*<$UcsV$L*&jG+K-u~B;^9N7(mrn`s55)gB zvpoNEUGOhNC(xW+3F^fJ4ftiZvnxX*)3>8(;w4kYUPwxUOwK)!28+9R?`j=EEL!Nk zF)#i$Z6u$A65Z%SSI!N_$-7{L@Uv&02T{rw}$yjuz|q z+J%=^cV_ULtqcIX{W{p;FNPbwKAUpDh;#pgxX{+)DodUZ=!lpmrXt0{6-&O6`$mnG z6r5G%pmtKDv*d$k*J4AYxIq2^4n3?ns>2L-C~CyIM(2pKfn8$%I}ckipf=XpwZSui zXEL&#N-R|=iOl{kIkznfXzvvb13uv)+#wcMEdvaU5`VfMfZ8M4o!Gp2a(5M0dF=zrh1wh^GZA1CIA2$qR~@15 zwae7aU*~i&l^f!_kzY!`Y^Z+D$mPwI_W8GW3uUn{-YPmT4r`2kpasCPyV2Rlk~EQs z!-eml>Iv-wZ*f8(fV(4tFZv?aoq+?aI0{EUfc673f|#It4q!U?b{D_{m~$e>@*M=p z`@!VCL$hCd5orEhgJQ+NO8_GI*S8$PzXU)Z*}6bK3hapNc={Rqvlkf0LIVsEc+@e7 zXN_n_o&e1mQ!9i|o$$dz_?##%6z|jxU6G9^P(~a@;N<6*39&clUGPAuV$&0DYXw~f z#se`C6bUtp;5~LA&?3EwYa?*Dd3*;czs5)raPWf$Xk~f!?ivhB@f`%L%*>tUm^uf@ zU0osoCXq(E9^h2DgU$F33WA({A{}0rv4;%z=XHbbR#CJT0&<>*E9UQcyr4p0-5=lnZ<{e-*-p}oH z&uDIkn(y0lG{{EQ#QlPfU%f~nm`Ed)Hb2p`rib1b|NyCXqY-U=rvCYz*i_*WceYhkXZeIv@|W2`O};;sEJyTHmhn ze$eHB=YQu7ap+)t7J6|)7?DVnh6}O7EcdA7mj|g+>JvIS7-3g(H!{F++_T@;=5pStd zsx%bh>PuH$Q%v7`v=}`x^NgOrH6J+aOsYw#J6OgXKD7y7V%`$|rglH&W=;+%zyuD= zU?ks$@m;Q~WpR#|DYEMt{(DmOUqZWoDftvBKFt~MZv2ao9Pv}}8lQ_dlZ>;w&j8R2Y$}-otv4N`eHf|=BU}rb#{<#f`T=*qXxsAHxo)#C{ zO|3g4fx)ePdLw0YeRWk$k9otbKY2Gu3$#6Mp(hf_F8E@9jd98_#mdFn+WM^7w3%6Z zc?YHHYaLp8(7RV45HkpL`BcZBr8dJ(T>+S!DMd-JG%ZkS2_^I(keN7R&%ri)-smN8 zCZso=##Ll&Xgg1ZXq%q}&Hkl81BmHPl@@w=Smj&HPTG$+FfpRlgbbfGaHmnfQ|~NZ zyl8U~WE^~E@$mo%Xu$n&VvVIccqW_nL}V{|kR(7Ey_cv2Fd!Q_)uwh_aFn-Bx&4-! 
znEmy$OtE17_hg?vf4gTyB>NLz9+rv*4Jvd^&OZuxOBaynbb3>1tBaYq{wGf!ZeIv> z$d2&Kj(>o4e1{cUZJ6nd6WU(hjhQu7Cfsjs{|?H^F!9C}7mK-F=UDi=Na5En2}}g| zn>P@P3qSnI2`V7Db3N*Vc@mzGU4(mq;oye%h(_cc8xMthBW;u9^gKw+=X|(nIx9;|vpA^pj2<++tPiq?8gQd+y*sz8YgA#@D2Qk= zue=v%`_6cJ%`dQ3+tKH|_UX6o70MqO?rkWNzN3p(E$o;PKf5Pz&!P8;wY^f+$c)6s zM9&L9pI97U5jOm=HQl+<(XD8;*K}=&dHd-pBEA(9#pCfi9Xf>uLt#mYGeAF3(v_CE z)g^{i>7hb&&&DA<(K?&fH|+*GHI27=UP0>u1@jA?rR0~d%0IICrN0|nMWHqY8o4MN zg*uk7PoA{sd1N$42TEE`YZUpf8@o91oL6*X?W2o}i4miLyc$nPOxHCNPc7m9Tqt0` zkjFS;kv{aolqObQc;0_gEV1c%3QI(b_ItZ7E<=XrD| z%)lg95?N?jd>Pv{R{4>n3Jvd$;c88&*@#nhAn&Tk!)-2;S1%yzuNP_4`A1`o3tMKc z-bYuzms}2SS4nmAw%>Dn&4`$%y3l%j`*{1QBzf^eVeg}no|lF7A*F6>{bUR3)egSC zvMCmcYxZg1u$d#YY`oOeUOfIhjTKIWGnI)E3xh4|k&N8f(A;pVV9Oh!7$Fctdu0HH z_rcp^el*khk$CZz!NoExpGR%whssf>`^_U;sQQ|hH#;Om*x9vj6iaihT83G(W6ZN} zno7L%OUPqi96sp>N6g9(5xBAUX&CJcC!U#_t8qJ|*%e*9+@W=&k~(L+{=ILV)!TQY zBWj0)7wc}+HzQHS0TmnOZ|`YgjlPMJ%`Y8=zZNl*^eR7j zewq7}E47%Ci{YCbEd!nkO?=nOkXCJQWI2M0Mca73?#H*J(YvCY3x$8{urZ5Wwi`a>`kgWB$y6ss9UDRz@@G| zo7(pw?pVe~ls2p|E>E*Te3cV#D2)Un99!DUGWg^Y$#%bUDY zdE@KRQDbyCTqLH*xYYF6+Vu|D)=1q}&P240Ssgb5sR$ZiY7i7BzrHT%yvdP$- z*s{&ZK$m(vYtiq&vJ+;RHdJ)b-~{VkIw&bx^`m|XDJgL{XJTR|vTFOXm6JL%X}iNy zzwqo;kGC6XdflyeG0ew9ZK%7;G7l3Q;ao4!$WCZDL}ZLK8nw-0JcATxP=(k{-5}K( zm~)w<+gW~Hs?L&1M+dc4f8$b%gDf|`aCP=izH>Y-MRj>(ZO`6svN2%8t5)jTxZ%#s ztTx-Z8{uj^2_bJTvk0jH{ZJ7LW*lsN7z>bf=+L3e4oGqwXftvFUXDl|R zG`xqMa+`*}PP__u1z2%VD>!&Y(e2wTHhdgPH5Ib}pXR3RM_{RKp=ef6 zG~rARK*Y?bz2P(`Q;^;N0sWQE*LlTO4$^Z;6=*;)?xOEdM_VFvz)cB`RZxf98nS$ol5;K zs_`>yDtmsSb<+c7m^W8)p+ ze;pquP|Pl^yjd?|A#?)t2j6v+aMGn>*cLw6nS7jr#VaWAMuf#B5{B2g(K!cbsLwnxK&9d^yRv#^tTu zZ0!p^e4I^1*v0qHDl*?E1$B3tai6;qP@Ke@7u7YE6BYE0hj()#WvUB8N5Pts#|luw z9I-KVYE$C^v6|?u<$0>?u1yua^p77UDeOI0bj6oc%R`ySm~AbucK395(v&34i^WVc zzjtqWTi_3|4W#qi9AcL?L6u7y@TpHs8~T{0wOg3Y9e$+28gpF2C=5ndUs!ovKeE_V zt>M1y{`7uiD&$mIe^=%+@=tD-20{&4ZW(y3SV?t()yZ0Pt&h*xhKF*Pn{l=^NpAx# z&KgI1V$g%ShJ7sB85Uu|n2+=I@_ZiDPb!7a>>2Tfm?z6)8nH$7R_-KEkJDRcUv3We zZ%-X;PQcS{x;TX_A6O2~OdU>)?n57E(;-mK0oO(XtR>%>S*lvj9x(5gJ99}p)pKvk zUpbvfY?W!##I`F$=;<7!d^x^)1OcSBE4mGDT2%0Pd%auc#_!5-|JoW`7(nP*UGN0q zZ~@6-f^}?_-Fg~url>SU&O!B+`IZ={ulC8^`+-B5DJ0^DZ;5B#f?kokhzjkktCj6j z)Hyjh4dtcV=bmA+Bh?V+01+nwU2aUlaL2OAllkn9Xk;kBCw_6tH@;mqFl_2c+bX81 zMm?d)uDkZ)BVEKcQTxL-qWT4TqK#XmB*v%6TgX0uY_KkE92d~e(T*qtITo%pB$2_x z?{u%gH19*x#Y9!r&m`uF+Qfyt!4fO~d70$5YG5Z2b`XtjPC8LrfmL1{lGOMBH$(3= zb4k_Ai#u97#|k`#7+(>(f6rVK2UP^*FZm< zgWJZlnGT>ik%pto7Xri?2CAoRx41zK_^Y()yA=<0AGv2DTRaJ-1Vcz;`|Y?~=^A4{ z3OuS+XV>EphLjkj!`kweqqJGDQVYB#LLoHlYsXLWwtR1$S7lJ!4&@RLE4d#Y_|E!3kpQ}nK%ixL9=mnxxsmuE`LFo$iUT>m(8ka92rr|Us(6uuqh#J>SfjiWj9fEFurWs zWeHbN0~t&0lOhvnJ@m!ycAM}8JuLr%Hj$%g({5l|*RhA`nzF8Tnh$r@1GCD{s5wjc z>g)Zo^pCn(MwYQ^n+7JwwO&VM?fCdeOqKMsAmG70s7LGRK51wbOf}pYcG2D{?W)K5 z0`E^@9&thj-DTKgsXd8>-Q0--0?EWQGJ@oB5{@4DQ4J+}a(WtI(m7Qm_#Nb)N1(Uv zMDFNV!PgfN;uHtjz@M_fWmAkEtut#Aqm z^E#NiibWEBYK7W%sN_Kw{Cx*o*mM2L7xbiLg~FL*_jYhR=0Y;7UB)F38-2tE>WUOv zmW`i;cZ?bqNu)XPfa#zd_t2D&}SJnLnT9y#`bl{j^`8v zfUyr}xyy%nuN}&!bnKYjITNd&9T}BZ#eE z7&4|fb9eclO)*%|RQ#l-kXhpIXF6qi*kGa3iip03qz%u@L)z7{f}cvL-7<~KkaND8 zs7hJ#xZaoRRm;J}tR}=&<5@d(`O3J}EG7omWUR-eQJ#Te+qhBKc-oGm88id&&fddx zunT&AZI*vDfCn9q+Gys>Q1xKg->B6fsd5hud7S~;A@8}D$##m=F|pLiFwb76MR(b4 zX0{hd2fWyvt79S#`>5OR8_)cHt@M3Jbn?{frm&G#+=P++iq|Y9rXWfLC-=?~ejdl2 zMK7x-ljQZ~1qb)_&C3_>US4$7a&@ewh$<1#EsT!f6Meb26|m%$ICpu$4&^SdG9H*$ zvb9sqyOC37v{K^fS_5W-T=wnd_Zo}9Y@mCe%MHz`yCb|AG`J3vP7`9}W>M8sq|INa$0fN|~JlvfT zk-P})y5W5b5m@D#sZ-Z+pZ%i+>C-Vql+UR3Lr4Wb_wABf%}tqC#Kl5ljay}>9HeRT^=tP~LUp7(G#KfZd!Jc!Ez0ahBdwM}9IBJ}O zqkbG`kG5RU1~{S{$?9tmYRr}=F 
zw%PMtxa56!8x5X^eYw5th|dyeX@AdiNOh5`qhHz1x0-n~u+jN!BgNObIW zMW0wo4-?o3{eIBsYTvw63h4q0dSj$etPkhA7_^w`^821LLD7yjlH}H^10Zx5esO4S zlrS$HAGiMPi%oL=LeV6TmN23|U^Z+hKo@($lWe*Za3)Ig+5P%Js%SkfyyM zJah?e4^wpXake=sF<7v5CkM+|gjvO$7+KzX-bBJ|knCeGIjw;(;{cy^P4+DSKm9mV z;$nM7U1UmqSj}rl4p|as`h%FdV>hp79?v%yHh`*|cT7oL*S*z{_|=_zC!!xADn0d; zZ86RWCy_XHzbZgg|+?xU?M{u3pk9XBs1OTMmz&^;BqtwOydDYl`Z zb!V-{z9|2L-qAw~45U^xLPGF^1N((LV_9PcPhN&uTDMQ$y%PO3W_g9m-m)l1b5zH@ zR;j)(^={7ddDN2>;}P8xJ?|)>{Yh)K;1d-Fcs64!I!g%1guoW~!yms1j6M-lkglF- z8ZLLac5AS^IPN-M8N&`xX+K}|{KUly>V2)cT5})Hbx_@!>>9ucd{Xw=a;i+K%A<;h zxyZd>tWenB!5k}z5!l)8Y8Ju>;N&m{Q`j=okTK`E3Q?+L9b8l;#Ptbr$VbmpWkOx* z#_838$=ULWAF9;s%kuVRu!G@+L%mmwnPPL8!t?mu>!Lm6a>sIJQmgw0BX)yE z(su)tlWq_O-lK6>l+)`X--wq-c8(>5Oj{g#ggC7AQSRE~9n(^j0t~D70<{9D*yh>J zgqv^{ZrYk^HU#wSXSYRdWuVCFYDrj^RcmW@Ilsz~1jnMB-4Q+O))Udjmd?g;`le9}H zChUrwaYx}o#Ru)nkT-semr1Efa$LTQNbo-h!&;+^AuKc4h&Df}@Pv0*kKU#VK(89J zpOW1&f5qm+9j^=;fAQ&5RI6;&o1Dug`6&plq_$6d{d`$%B>x|K?;X`tyKawypeRyA zL^=^^QUs9>fuMYVfPnN)RHO?C(g_4ndJ_d;{gZy(c}odA3}BR7VuUf=z!z~h&R zMX#9O8ec}Hq#`QBERS=BI`qR@D4K}&C2N6Z;oca9=VJV6ZT@8w_6g?0l_@tkvjjA@ zQW}L6Wai>~aieY8**8*jG~L{rjt*7KBK=YFdAXyVSV`v0?9oiO>O~=kdS6rhnM%`j zJGFVn=v|l%E~WjXwm>*}n>R=+C+np`IQmXUZvN*Im8Hpgm7C_-*MyV_r`7T#)!ub` zF7@d4Vbj+xTD~jVx)S^tou3CH2Z`~t6o4QG-g{3Pa(bm4dEi}xYtknlgK!lCh_P8f zgz_`2;w&Vcob>Qaup%2`^Vb79L>xX}hqwyMVP2?cx@n5l$RazZr z_)XE`bWVbJleHuM#IFPShSg}XCwHtPP13RaQ1<9{qKEh}pszYf|A+hn&^l29Q3=4f zzR)+E5xMgWpbpjbp)$Zg!Jc5*Zwi3$S~rlFf*e`EkkTGz*vOvnzo&R8_a(L|7iL&H z5!H!pLyZ2h`U$6c&9#kM+e`dnp%)%ATvS$tga>!sizXmCq$0pY*UZKuI}V!3oYLkgl{*( zh}rUU^%5FfO8AVgi#o`0{b5Tn1~!3&A0V&0Tvhw*GA!F;7j4Uru5?@iHS^~}BKhg3 zj)+$0HOj;^#0z8$jVBgaYYu~aF!~59?s}hF+ESkv`?1x~fE8qQ1+e>t!5Ko>2b9wq zi(NuQkwN|k>k={<;tp>wu@t1{(KtiJUjBIy~P^QEIQ6HRy5p8O(OvkEA6?YUmQVWXf1JV}H!sKc*}r0|x2iO>Z~4;3JA{q~SKnW3{Q%`I@qqOfKM z4G;PjVP>Goyz8UCD+vSPKmq~2~k=fXM zXa9Ko4Sp^*{Hx*DP`Ru{$@kNmDhig)0#_kX8dJv?rQdzEJmZ(L%~HUuT{B*I^pOPLJ*Oc zYZ2?$iDXzqJ>vBp8~Bw=e6{X+eLZph9pyTQUTKkQgGpL_-mIqIlVVJ_pQ|T(uMS-d zJGcHv!;2@wGzsL9Kdq6%ATF0kGOl>WP8HGGVQ+T9wB0{OYcy-CC;Hy^UM^s#=8fC7 zM=u6F38(Nw(5W!O#EB9M2RLLK^4vNP$Jjf1{4Y#+K-AjkPp>_|9j$-%fb!SyKa%SH z2j2tTBC!xn@U**xHR2Q=*#DWEN2OZ})@NC9*>&Q{0?Th{#|D`0zbM#F!i79pz2fD)W_CKJoi2@+W!rZ@}GvHh^5%{E)NVV z!H}4a1#$ZCw~69k&Zm|7AAPg-4;*+r0haLS;&Y0C{T!j}QxNHJzuTduO>vUG0P6I( zK=L34k?miljBQQ|F)> zV-W~D8DxMsm%@%nVu|ZtN-yb0A-{bOD<5|rLPHc?dXQSSnqOh|L=Wz|EnMdLu$9F& z(=^oE2Tk0=;sUUQWxBG%3J<9t#ml*aN@H4Q7&0qFc79MGRZq&lcaA~OFxJ*6WaC&% zJ&+68ZWBaiW$ccJbcz3wMrOx#e&Yuvc?iaNK#QFVLFQ+rDO6UcI+F1rZ^KT}sC~HS zuBSNFncjj(hRdjV%--SBo9dAG`x4|?@sN?B3YZ$cC{l&NU)`;+UOm|1(48&n!tftY z@84DZ0gHRtEV5r`#A%y*A>lU#bIYt5xgihH9W5C=}#1fd0Kw*?#F=DFFq+ac+EUYP^QEL&&jW=JBw;p4}i1ofFY19 z_!)(nuNiq+hji;WR~Eo=VmX0^O5pfCv>Hene>-&bHw6YkPE)f!J|e|#z>o*|fwQcD zcdFT&-xS{wCoYc3#L{{3;<@wNFoT6%4*#r2^)}*YvW@r(uE%}&eQN%LVG505ue@AV z?gtf|Oz-qJXlq20-Fv2*Z@U?4u4U+ho}ieADC62j-`Hri8ppW_h&y~*!UQ$XW;SjG z9<>KswFj=Yf@(<0W4901ep6JH^Id8{32REgy~2D@lN!h-wuk@11__iE{1q(C%`&1X|5vjy4Q9u@;01#U{SyntQK!lN2r>s zlUhFZLeFYu!xU}-0&q}=E}J^ZmO}Aq2WM-Wc6S> zX7Q%EQr2je!o8;4__MjkIZelpkrnR|)rGAH z_Og6mc0WYb#4Hn-SI|Ry5cwYE3Lqfu7c6A-dR>UU-5}DQ@%ZM6<5{o`fQe34NJJv1 z`2bQ=Y8wXl3Mc1Q5CN5VeqIwX6#!Q@JD~*X7C;5gXq%vPHd#c3E)69uR2*|Z24eZP zA!oOFJaDnCnUD^~DTF32?K}VV^x^4qcv)2Z(FPmU5Y5M*f=x|zm+2f#Xk}yp zx^y&yK!O4dC_JHbMPBD+Yqw^VUbc~bPa(>uVY~~8SWN%HEe6#so+2)`p1d$g8N48h zF@O1LNQmlHc2A-@Rh-~HT`t$a=XdEn1!sY|X*>v8&oj6^n+{D}&bCKjb^&ts(6?G* z<{QL0-b@c4+0IDqDjT!XXFmC*NZm7HZ#hgYO!>tNry&xu2)+s_dy z{dXo*6pcR%xmB=tIxm0R&5UzC%ka70TJ_ThlOS1*hib~~$Sj4ZN2U;PsfP`up_^yP zmmoB5uaqTrLVhsncTv$bX=e6ha|wouM*K2w3c1-NdH1AW;c>~H 
zib>xylq^~lX~SqaLFRy|;4hR4)|dNBX(EvsCj7Z8Bm9oEcchYfC@6EkQ}|Xm6w=TG z*0TlDbH0g#IGAqsGSMfz?P(FM%l56F{EE^VL~yVVSO=^5#v2q4XC?EvA{e%fv>Q+| zyibDcofmOF>qGj2XXN}MMS1I&bmiSQ{&picA&QCu>_xznzq^^t2)#w~|2)!O&i&kosxqp^o}L<^oIHVL248)=gS{H;DE z`k^1dP{^x8zRA+4<>w$F3+bfvH^$agN6~@za};`7qMeY?A4B-EN}?knh6pX0Nlx2$ zPH`6HJuSS)Ra|;FqZsnEs#Eiq!~1|fS9#LbZ;Ao#00FZMoUOH=5XQ4^+LQ;kRR*@}zsv_;V7*T$M1Pmv?n z>IHcn0uW4(*OYp{xOg4!?RQ!M8Jy671BrwhonX#ZOG25?4f8T@Oolna2}3oV(!B`P zjV&&Dl)XFx`AKrU**3(P`2=?`FV?!q^l2t18NBrTxYMxH+N zFC4v`sMyzby*8JtR$=X_zN|f=vtvK8E%YYvHY*ZvSPb_$eMZ#2w7U1D$?7~RVkf~9 zp`whdXmRJQ1I1H>ocz*sm2=Cc~0SoCH96*dmj8Frfa8h(B9D_VD)}tFbn}kXX za83|E?mhWGAdHs{$oAcfuCt<}V2@H~o{*XkrR4~p)!vo|FVtW$t(*!Fe~zuZ_fMUk ztcDkf-VZ9h?Y8QA*7dDS{@z&Z$d0#3*Rz+oio}5VxoAuIVVK)TwUVhXVa8?-n(;rn zmb`i2%_p_8bMc7QQN4fah0aA?Rxvw1Daw<{vn{yv(#Y2l^eLntMCE;%bMrFjoN8Z* z>Llu}z7M$aJ*L>qaRxL&iwHb$#jChRWjiV&FE>CgqCaTOLbK?#(-Zv+e`i9j?Rs~ElRVN2eZ?uKMCJ~Hp8&<=~{u9;iO!h z4cZDyKDXiX*eKh@B&JypQIfHk!${y6A#n%uU-b_RNE?})@GjHViLDYJt8$;ZG?VT1 zn?mB~`+(c4icXntlKL5yDl=@_QF}rT!0k z)3d`%O&8vR<7^gdn!iWf)c?d6Zm{)%R@l?QLz^q6=FHgY6y13I=B1diNj;J}5E=o# zy#P#TIr+|V;#$(L=M+vwT)}kal-9>e_V6cb@h5v}RWP%}l*I$F?CH|ETT3N|CeK#o zd`qmDUd6Gq=v3!yWFta>gq-+IR&i--m(& zqHi+tPJfN;+t0s6@#FSf0R_MX(<0{)&`QlV;hW9FAy=F;%Zsjao@#cSuTh9iNVx7b z(lv|!1UO|fb#qC6G0-TE`epde8EhkAs6?T-J`ZTsJ?g#YT#h4psz>DDB4 z>s5tKqM2&tAIIIiaX>3i1sG`RpK6lFa%s=rOWQDZkRdq0j=e(K#_Ex*oZN{jP-qHC%90T{2{)Gm!>R0KD&@D`>2ht_EXMWa zu2+as#k&_|zMJ7nr+TjXnbV)O1nZygqCQXLrUy8C6k7~E@ncKA{2tBz=-}CtFbJRR zmrq5k?9YcH^DO-Xlgf0_2F^t*HJUlFYeZ2;oY#;^*zM;VRW2zS+mbbV+8UwZ4?UjG zS8?G{1pxuJW?YqbwXFwjO))D&)r;=3C0!;?gT~F-&V^+I1HRjSm9>4kzSGSMjNz}_ z5m#Wkcvd46q(+7w_T@!l)W0`XYQA3-F<)^qG>a>SyiOHD^2mtg{aV;qcy~6nT{xG} ztOWXG_x;#z|M~ToM)arXOTs|_J6#8joS8tb%+AocNz5~L(zEKBZ zad}2W4=gk)ougnS@2=JvkTf^I!9Vhoxo=+q)X1k4IXm>+%;sT3!E>&!H7@&dgtC=f z)ZUK2yt{boGD%HG>BoGk{5_ei@dLLm1Eicdf`!Zz8L+T@!%2l_Oqv;jzK~;2kI1U> z$XNcMi}CWzc4U`X8!6gy*2vj?mOc$jvx_|;-z^uPZrXcS7KF*Db6pRD2636SSG)n# zt6-A~mq1Wd^srCee!7P2dn=2VZ+iqukI%Jp(s15`NPyYI>NaGG6upiF2yRt}$n|58 zK?m#!xpJB4!qx!UzGl)~A=l9;zqeb3l$zX`#lI?sQA~IzUV#!{FX(r;?p}gB*Jo~Y zSY3{Fc$=O4+59|L_BjRs2g24Dr3~h1nY2#4XxTt%Rz%{d7~@*UTJa3D@(71A_I>#Q z@qv8I{sh#|uVfOY2$)Eag#Cg`{OhnPdDz&}@dpzQu{~iF`H18@9p_&5unnH~ef%}` z>-J!cnJif=kZ<*SqDtSsi}#xSFSdon3$uxthR6U5CGl9Jbd<_D7vB~UBpwbh5m9Ej zR~shGdxD?Wwqa+TFZDbw2<1u+pbyiN%r+CS{^VIELewtLO4~p^Jo*X_zF!X2zdiKT zv!gBdZMsxkG8&~-hta4tYfflvwWXe{4yG|01^X>tPGoKQGO$`Ru43)VEZBav5PL2u zdy!k81ry$8T%`A?Lm&OpZyVh`xEip-${t$~-TR`Cwgf$0zOz1~)hIaUHh6Vxo-w}Z z_Uby6q2=y|CC_Vc+@_sO@^nqhVltmsJ1Ep>Iys<1ML~m)(f^6-_tt}&?X{sVW!=ZN zu6;uZpR~A9?Tq0KLTzUv*fG8O@5q*Mfm4l-#Wq|6J`!Mr z>fIL#8^CeEb@<0);rSbjtiFeCJ5zfwgHHMKTOAq&PQ^xH)x4kx(Y_&dzrx-g{v_7+ z#0V&^;t0>{9CshKxf#=$y{Q%x0BJC1G{(-`t^3R^|*lGeu|MT#jaNTrs^I;?X? zwl4YW6P&kly3oXWF*K7jsRgxnt3{0u1EX!5_lSsvXqZ$*G}Gni54mCiZy!=pqQKt# zOw(6It$k^8RZ3WLG-DrD^CV`8VwIuz7u%HU=}h$kxG-meopQlh;J zI#+kbr?cN#ZL)dgBU~+KGNwtlh{3>C+}`RtdxeZ5kDW?a_7M`Rj+N@_pSw-}>hhdy zrbE&k($YrCYzMo!k@uJVtD@kOaian0RRmbCXc`G4>S2G(IV%A03k$rsozf4%c|cEv z_BEP1Zj?I?d1f!?_H$4w`yg-l8;JT}sn6?Pa&{unAH_Z+jq*l1@RRfm~ z@^J7BbZpa^;agBnZ`YOUI+~P|rL(-kLj#6k9?bkaLD7R7jB&1)XyauF zX*nKqhbnn2o#WrK)r`Ou;c<(~+zKl8xLdouHKXeojh7seKX#*Mg{#S}`0j4bzBBq? 
z0fb4k^$RUUs<(8ZC!qm!^6SETxX_}tvk43}i37(CdsoiR(ONG)@- z%l=!ZK{%U=$%V|(Q&B5#e=g-N887R0miMTlZAV+F*rR%%EJLp;0!H6xSp4Rxdec{T z%nF~NXZEwF&BCIEiHSuJM;Vntz~LfsVTdF$M575fTOqa~;#dpv2m#Kg^iv*F-(Xkn z8Y|ukcN{hjs3@HTz>Cb!!%xmq)rcI znCct#dn3b=ogp-bjNxzni?iY%_MJ(GnWRee4G5g4$@G$ZP;85yc0uES2u+0%0JFW=pSnSv_sXf2g-NR!={cMq%GTAfN zdBLl~-fB}mCLwD1r=`e9x}}xb0JVRkzA8G|`NVlqSW!mSv27_}Rw5FQ>%ZeM zml$lM?@dC?O}GRYBdUsGYh0D!bi~p{m9i4z)2Y0!+}qS1H+eY?@@KWEsMu-7DP!=n zI=1;HA`_Ux2CKvR6Ka@7-`7<8gucD+MxEL~-Ij8eYcyE4I;^rOA95L!#&C1$){IK` z`P5ZahToo#4&-;DFafEa4;o+KFnAPJY00-oG8HVBZNkee4=B`Pb})_b0GHw)|4pdm zmMeNTXQ7PI0MAn)AJMK|ok`eGBVFkTpFy5CbfP+bjbDwe0fC5%9V$u_J#w>r6Yhtu z+1X?Q)*-e2r2fMNf1ysx#8+U)55zf5c_fZd5t$r)x zR1v|HXaVZL>|Rm#hKUwcn*x|hQ-;F@-uUzjlJ5F99``c%Z+ZS1{`3Y0AX{Ox>juZQ z$crj8Gt-VLCP&dSERi0Co31a8NkRZ-1uMl1k!e#;pOOC9e_TflcV6HWuF8Vauj4Ux zXH!f2;;cNQ!!-SX+6@^G4l3bx>{gMOB&mnI%qp{bl(u7+iv)ie0N8rB(>wlo3rDYh zQ#gUn`D3uzW8Djhj4pVWy28tLFP1Q`#kBcA{?a9EDjk^=1#`ie3dPpVV8_-#X~DKW(K4n?y!Fon;=3Y(mdW8PNmhnzhum*0i4 z^(Wsk=)3Y+k47+5#SoB7eGTbA92guzNkSe-;siyo{`}l^r(9)RMC2lZZo0WE5qnz} zV>-6tkCshFV+GO#MSk2(PAH;2r^rFkMr_7sEmZvQxX^>A!Xl#~Uk@(2->eBjew((& zT3ou=ctbPf0eJhzt}2gkKG0q~L_WR!(xhrc8X2D(yz|kMyQ~_Di{IF6v0xJsw;ZQT zR_(bF2n0f1qg(;8{iblPcm=OI987n_M`BruA_^z-(yXX+(E+f-xMDAGF@{aN*>f`V z4w*d!4XOfWl-MkZGz&$YS@bWV&05duDkBCrWPRc<*U#{4H;JV5Y^(C1fIkE;M;JbP z-$GAp+x!8!N*4R&6nK`bF3|aI($Da&SQPgqC#?OlIS*qX#WeBu=|d93*~e9s8=2mJ z4|KIIqO)YYU)Jo-7m(9F7?J5Ttg&;^9*sD!HMqW~YD;<<0|NSQvsAZP=0 zwdO*@m^**<_dSwwh_QB@$qzx^>^pqQ@{4?#6i51DNX`~+@~|yUELNgz+cH?^e&&kn zAGxktw}169QFirExL?|gp+FNO7p){BdCsF?K_$+Xg+;C(%yo-(9SeR4HhzCF{_TOC z*vIKbitFskYg9|S#DLw1iea4vZR`Y|j;#s9FABKXxpyfaB_ zp-ia5(AliP$|dcNZB6`vt<6A5vJ70rn9R=VJINh&GgMw5%?RvSTrk%8EE^pHoTzV) zr|GELFtE}ooY@#&$G1A!rRL7}h~E50msB|OgS-!fMNoBuFE4r!g^Qv{t{)`gRp7Jp5)GAn@b_E(CI=sA=c-n>lyThzZ+tVV!-E0Hv7F_{ z)9>Vnr^>8G`=mAr&T8FE8W-t!E$|^WSO4cNsL@- zZ!7HIyn=O~x{~~e(YTh?1^xEK*MZ?>74c*NVi!icE-34d`UGX=JL37zo(!`;?qTEA zxuS84YQ6U#%u|9j3C(ADD`)8{%<6a%RGTo?HRCW#`6j*0ZhTSZe%fmPMdA%!i1#o} z-G8j8`HyF#{yHheWR0=1#ppN&cyJN+^VG!*KRiofdy@JNcaJ)Zp28pT8g9opb+jPf-+lwQAI34A80zW2Xx{NLO2*A?`?i{gKung6X6e<`N_`zZW> z=26ID+)BbTw$3V^H63u9wKa5O%Al=vwKw&v`gj~olkWreDn|-~k{AVSi@tK_NY_&n zZHCQM@zTc^bADmP_W79;PGGNyI8Um30zOC z!IgG`=-isr#ovI9inI#!;n&VN#-KftUwpQ>=MeStLrSPQ7uVxZm=8XSd};kughYFm zsv@wN%SQB<(Pwg2VH$UQZ9Q3@-`L2UPiJak_=n_oz8l>f*icTKCi&9tsVL0%>|F@Q zX09;Nx4|Q-272k&fJ%Lyjh3^oqyH;si+hp}mi9CFTqsUi=Zo6265oJm!IHQJIFn9! 
zMR@hW==Sb%=>0v+AKE|I)vmBVW%_tyKJ_P+JOy{nKR#6cPsc9)<`{a7a>zdg%NW+g z3omlE6~Nz4Km?U)^H$cS>CftQhQ)tY=z74uo-)~ZXDURL^^);Nnp@qbCw!LPE+%@9 z%pR<{#{Jx%yBabKxd0O=QAw8l6?FcGubXT?W9p}>+~&@O4rO9Gu=KK;S(L!I!(k0xohX-k8n z_G(^%t@vFVo(Se55wGRUDg^{9>;+M0UI#S>Gg)lDJGdtnkk-2`J+gY*!{z|(F!N>R z@0T1|Z69^>n+o3P8vmNQDcW{B588^nLSn+x4(jZlCuVk=#$2WTH2bjWkAvAry86~G z)j`oL_H=CJ@vWyHuyY|TiFS(BB9DBFzx+k^;XB7fVhUIkCW9>f6DB>bpu6ldTjf~e zRjHR2o#gH12LsBd?0*$MNxS>VC2K^=@&2cxZOfye905-@VCdjK6Ak9L_1eS3@AMW- zqNXuS;bVGRnlQiDEqfnHx-dp+vt7q2cKB8J6VMbNwjxU+CgcLq>vFx9<^F^4>iRx- zaPD|_6W}P@I44cHSJ8nW-t=h8u^Olc1os{tAzZ1IeQu@b&JtC|Ba@UKAD*Rpp0it2 z%g|^D?g1P<*Q(;y4Y(aCEXMxPCp;=66A|1}#xfj>AQ&Jp(v-_{=MRuEbKmB+`Q|~< zpzA7}#G^>tFP#@>m!!BmVJ9pGx)nk@2#tXhL@W=o0pr+d{er?%%6I$ zef#>%n~(Qh!o?d)OsWjY@+O9#E^{jqfA{)>Ab7(RvGA?eRUsr}LXue(ARh7Z@U)HhuM zz2&q&ynS#^U4HrA$*Xl$_7)x@1ee~X_$6>RT!qzNQ+@oSsrjz-_Ej|6o9^Oc{&*^i zdpG9y=qY^0DSr|p=K+lyl+~ruwMCR@v%u|Ym|^m)Y1~`KHDme~<-O!T#8fFLQ+`~d zEL#}5KvZkRYtg)E63tsn(k@OG%&}vsZ1})S$yEIFk)AT(>Bf1DG8xRbpw0%RbLPv& z+k*g%Se3(+Kp?y=sQJRwPG@(C%Z0@_kV5_%rNIALC1_rU^ntD-ogq<)iywjT6`@L; z;X21u4Gv$c#k^gMP%+<4I@(6RPxUR`l{csjRbKgTODe!dozN<4o;!daKliEDriys7 zIhg6gjB}u4Wc*vnZa;+=s9fLQ6is%i@5+bfyBJ!l}5gJxfVel_87z&jGwR?#L3~k~kmi+Qph_EGlLx7<(y= zaFD^`+Yx&80i8L{-$)e%CR&kceNJz?B0aP|$fEfY@!b969u3#alWJnDVwcfUYTANx~* z+RpV3P1Qf_@o83fN!PTydiM!&L}u=pe5w3Nbnk;o4K~H_xX^nWl>B6w^`#NRZX-i8 zOY%b^^2)||Ub}q5yZrIwgpw(e=Y|&?7wvCG4$lz+%r5c(tpAL8;{cu%{FAIjM)gBh zNxvy@!hq9io4^9}!ca91odLvhP9L7Lx9#X{rK&yKfIL#M(+dv&T(tE+y~{dH;g4;by^<-N31V2=)4Um> zX(b=^I1m^VD*Sq2*${J@bag=LmR6TZVJ7(i*&oKu`ooXa>zCEKB6Zy(y1sMwE}NhzQ4usB zX}Drs8P!XRE9#lV#_qn!wzqVy*wrEZyzLF8oYV0P>N1BzB0qdWi z3B!YH+VK~~oz(@_V)EXvypR6u{dxQyLS0zyNYEex5{|knCW|LpTCgn`oNv4u9QtLn zO>4@FOk=!^Fie!5dDe71pilgxN%%L#{lOGK%WD8gL0u06AwOG|UI0sb1+-6w6R#lw zOZcm}inGg#`eaH?^0i>w#65|68bZqx018?CP4P+xaAjwb_{&ZKpvZr26`B(OcXjjh zE&TtV*GO1wS7EFP(x=jd$3Jl8@xuB(sp9#eM{Y@Gx+D{fLwc1mu%62+O6gz z)QUavAxR3~70Vh0lgM9}Yk>8yrNAAc_Ih8myMA!Lc<|-xZwlj1BNL}i`4f1iGToG8 zJ&*AGw?iS?n1S-3o${KvOp?w?TKIeppI*a64Ty354tnbT5pxgWnp@JRG>0FCdK5ww z4%Ing%JTKSB7J5byCb?C-Zu=lkCe$N)Lg5dEo&4gtzCxFTags;5s|J~2!j(^cQ(hL ztyPZT)!ih@0ulZ2WM7yoVdC0#R%Tt3f@@e3a-N$#=i#PMr~di)Qu*=l@_J0osH=%! zt*e7Gu*3eFIwA@!9U-e9@kO3O?*I`1`#=Oh!Jh{F7t_PxYZjw$94!Qw_b=55QnSx^ z46GXxEffe4_}hx5QlWXwI{o_p@ryjWy2e8jvB4=^Z9}UR8~X53O*QY*j~;aXvw5FR zoxDh0^-h~(KJyj!FYAEtd)zZ&H`CM%y>mhxNE0T|kk)mZSuNoC!G(FhE#NQts0gmd&E#NjG{^e0fO8>Xd zOK)^O36gG*q8a-Tw?Eg>xKv~QSn4Le=b<}Q(x6gIxIW$T&vzaeot&$kt4yE1-7PUO z3KX`rE+go>$EEGJlP`{y&F()MkydizTVBpYUsFFxz9Ukg$#{O?R(>;64p+NOhpy3| zwUD4~=&|##q`;CSg{u2a@dQJ(GuS(ZR|+&?LK8;)}k&;$z-ciD5b=VEUXpyu^jOwE0o*G=5g%1{iX|VX64*ZR$L$cn==}R>(X+eo2W>0!0ipP%x#r;2PU;I*GsRmm2a}0hDUb z@+P~UuXoN(+|MK}_LfyVLz2wS2Rf^U87Nr>DH~O zdqAT16ZvxIGsTLQ@mC5T^n;-SnC8{{(s)k;(<8H$p>+t&H@0}ytS74f${l2orwLf z$5Sn~Nca;xB0vl^2=04@f*%FnCEckFc=%LzpC9f|73VReH?eNdc0ObA$s4cbm6pb> ztTTo<8PFFK<6YNWE63A&BwKuLTCNZgisM-{8D^n$?HkZm^80QNYP!<#qMyqBQM<42 z0nMwOp~$^0ZcQ~%tU<|?dZTY6#?7rDp*!rOwsq=fOIt+78vKHaq|+QbSWXu3*cvKy zQH1@bcs723d7W|Ffl7Xm(jGkjmXZ3(#hXpARiV-pl=0GWo`8SPbJX$Ks<`N7F|lK- zh9Id-yig}&!T?rTUlS+vL(AiNbq|`U*hTS+X^AflzN4&8q@9g7(e}kXmYImYyA^h{ z@0)B21cEefV+F z#8^_lN+l;;?56gkW**3SHaDf0y*`wiqwe45KBM9kXUn=V>FF0&>nOq6jfl!sL>-nX z%_G}6gI{V7i(3vZnvDAh)FuRslX|Wu4HYY_-Q)F=n%(2#xe%|g;xjEEoVag6h5}98Y+|SI+*{#h=C%FP#p73HLG6H(Bg{N)W9U72Bsad*6 zkk9iPT>E0}M{FYdpMh=B^CeAu*iCbl3GUKqG2BdhsE+!L`J|p7(|5`u=3nJHCaGPH zNKm3NLyx$lYBEDy$(&W(2q{jVIb=++JRYyv0lIVmi9iLC>z+>b6m&X! 
zU9n*ew@)f?u)ZI~S)s7|Dp~RCw2WTux!fnF4R5%Q%WJEgt!%80VFSYR*Vek4ik|Jl zH%qu9x9RaT?JCm4Bw>8sf+}tR87*|ZmX**DW}_h>#$rHl=eD?~^oo5@@~4k{5KVNu zafki;M5p!AYQV9?tZsRqv(>Of&!~MU6H{uw+ad{jGNQt`W{Ns$2p%jc*+@A{(HY%_ zqihD&6yJVGr4H4zP^Eq=+S9WaSl?`Rv4*YF)-XumEHE(pm$le8LuVy{#;t(C0^o^x zhEuN&5g=ya7=q>6C2n3S-|~5|BUr5=Plt~?>Yn{|S9Q?8!U4g!{C!H1jSDJc8{L_I z20FIhNT3Zj>(MA%UMYJy-lUJWYPaRUsi${P*N)f3i$2=3oI4f4Y+r5YEl90>=J{%< zz+e9P=N2IrF-{k$_}%`k>V5h3g;m46f={mXzyw#jwLd5M4KxZw1m=OZHUWa%_#1Se zfFRQVjl?>6poBDXI;4uD>9A44A|iKvJQ&wc29uT^5?-GgcBtJ*dHU7oez#ZrB9foU zRs$lQYi23EDc`$6-fXBLAb&2G-CJ176GY^x*b*ynE~`H{ie7=AV0T$;h*@16t8vdq zPYqIIy1rfL?YtJuqCqo8am%1Hc+w=h8>G(^(LQFp*7mN9r*3ENaMd@6?YUtY``DX+ zr=PYGtIBULG}fg}u1NA5@oC=ViOYZdnrOC?l=9jzVGU0qXl?yCKktvl>foI;vU1pz;3xtk&bWcSw z7Ur-=#JWPFnio+6V-zhmV*dW>wVtWF}BeFypmelRIn*Sa=+EcNXn&=JB#l z(%slufn=(zJ+zFD>0eAbSPjJ-`4VyuNPS(32O2!pm#VGI6>|Q zGjIulHkpsAVnVbKUF~A{sLWSdIIPibz0pEP3gNOT6{@rq9q#_!Az=`TwUGl-RM(O zn*gK+|HVN4Dm9SQz+$2a1cFj{f4JoIpPwwD>P9*yqVpCax8(q@Oh=Na|KU5}XrN@o z0SpxrQa`n>D9qG8sBLuD`1Nb1=KHcHV6!Uy@8TQ({EG&YH3O{efzKO;qH1EfCGKMDP4(QeJw3<1^ol|b8*6(=$lw?(q98F;9j9sdDd#Zl~J*+fB;FThQ!I1m>u?wk393uEJDDoAepdMg-O?y>#o}N~&)B zV3}t6mP8U74OK$_8~F!g4q&mQA2-MPo5C^iwcVmzTd;)ML>`Xt@!aN#ZDT z5n}s}^3bf^+)(+oj%<272sYBv0^rm)Yb8ZOAmJ83!^wk+vH;_eAx@3}y`fm;*@YD5^p#t-on%Ig3NAPc;Q;nW?o|*k=bJ)Zvx_ym2nskX@k;#vA3_sYt(Sajv!_}(!Sho zF!mraPBw?!NGatoH{Vr&Lo;wZEJhIwwQ}Ik&C|+)WcS^@Sg6!^lOy)F(!x;aA-Df0 zftL8Eu&C%C4isVrWHA#s$06P%FRZ<{=s_9e4;X`Px+xYso5paIYOz7eYC`nbAQWIs zibw4bkiW}m2g*ws*~zV_wH$CqGGL{uL%&1>PnJSxAIFuVAA3qKTdZ3^SA#0Xb|Ef{ ziJv`H3}OpHV+{KWlXZ?2jP3Vyb(S9<{TUXk*#wlw$E!dj$!l4_SE+O@#NY>X&k(@E zyO4(PzcCUqv4`&%BirJ>W60WK9%r!hl_w!%fJc}b7#L7=ho4x`zg(T$lPCcWdZS63 z{(#~|z?PYeg|C;qS7ATPBs2O){%E+pjf$|3Ae`#dW?B_{6fv%VwJm;#8!7NFx+|CF zMrgh&i+qzU@b$km#89>2suuyg3k?P!{4aWH6Mm%%kxWYUfDRJSGRx&4(p@*>^k z8X7My(uBS2^ukKT7`rrF%M@BFE#eINztyZS6!$Io2)KJl*VegK&#qe; z|A3)3mG_-ThQ`1MK^}Q?XU?L6_59TCQ9v%=0+$}20=cVC zM5E~o0t~Hp6`VO7fkKt9H*lGtws{L|*7kMFU=X}}biT_SuqnD`=GCe2>>FMOZc2U% zGe4;Ie45{-dkyaj*qZYv0WrR<r9v9v*}!rx-uuFM8!xA$>`zkj`Btu^z>@ZN^{b&6dpSL4XSus)1YNhNR( z0hlFT9Rh62{-hPigB^7JU5;qH6Y(a%6m*+TI~#qXaIW zv1#238X?b&=KRu0Z9yS_)M1JV@w8PUkXkcOufy#*;3{qpj`l!-J9xy&g;=bTXWyZV zbacJwyb%B?kwD0wnZ&^Lpq0(k@5^$7i5ic+NVQXoxA!g)T))cMNv!ID5KH-+lvM(Vg7D|Z;L$rdUOLJx60Sc^tZ&{k~H&( zDi50Q{7niS&uxqMp@@tIrPUykDokJLikI@xHkHX1YFP`(%+b2;Dy<(?0gY#5YR}Y;CoSc~fT8uq& zRevrnA>^QO6Q+a=%I?KOvC=smI@Fq_)ljD~S5~w26sJqDjFi)2(9)~1#joF%qK+hK zo!zA@wU!Pvn;?=HWzM3`$v5u3d!z9!dX_xrn_AI9;!3_x@Zzt@Z0P1Yr}uSOqIR=4 zEykLq`SXh;ulVXneeb#;cdRzv*Kd`!NmYXo_ zyH9S)ngM67!0)HLxPrYD^n};3Za}I!=V5kguuECGQO)6ct8zbLk)Fd}d|{yMfRG}8 zP-(dlOSA1Ly4X8;%o{<)b3R{NRpt_E& z{IS<4Cl5Al7c*@iQvD6mM}_*MZ)0hPPQgm)Whz%P%{)dXpfY2G@yMw}-&5rgyL%FQ|L^3DW#LbLU%_2~UD0 zj(+!=QN0lVEo*TpFDc>JYM!UIe1G}%OF{DB{N6h)dUB^Q@{CIqF3td;j?_rP`}PM% zuYnTjdI>;X1A5na`VjS!ReJQlWs1pm%KVYfe6-&siFb zU$98ggR|jO9G7XKznF!5->H4Lq08?v?+*~a>i4_&(y(`rR#Rt#A(SGPyv^WanC zTiM_;xN{+p5s|}`R!Ot@XdXMMKC1V;2}MTFcnog@GT9`4=VHsJ<#q}0SgfdNiJ5VE z(NLahkk~kPawLA0+9ch*%*Z0P#3g9DXahpg%1#{ZnA(jd^T8iQjSN}glYe+-$xP@9 znLfy<&6PVgZFP2k+#Z^kKoy$3V&eQrNRE?fR$l#dNc+Xfblvfm$D;6P=5y7QVx^$$ zTR5Eu#HLdJeQZn!1u;6`-#*U$o}PBU5$TpL2^v zX9lnc>p6-t=e%FDUVR`{6l5Y(R#3YY7#@KGV>9+H}I{)vas5r<%l$N>jtMlwJ#&L@~0qu&p4u-h?*CZ~Nibr;^=%AMVWa1{p3D zHAk=)!OftfWcEeCE8j4_Jbyqg52EU(8s#DCFH%j&1jjh|wZ0&Eou+(a+s(9@XGLcA zG0r_#ncN)$4{*xC8h2gYdb)CvkWw4&(FD34X5gL<%bz zJ6c*HzL!b)9Iye{{PM`eU0AUY6v$hZ>_foZ4(DkhTb@;9pIuz`FVx8)Pn9T8lmP9F zpFVq3AN;{x$S7rgJnAhUkp-s;l{8IaMAuXc^bRj5%140r&=ow%iu5@e_2rhm(fPZrpGPWcG20^*SjnB z7fo|4!~Ntybl)hZngKaPS%$S0f9&)ifm;g9|C2&A+4peyi-Qa89nPy@93e) 
zBSJWk5DGa@d`;eMw-T>S5AY6ZP~5GMGo?>sBVSD#41CAR`R?M0(_wBWo05gE&?UpX z5j``g3wZw6WH&QKmPTltrG>j?&PhIANB1R9i`kD*92 zdupn>`g{;`MgW=)TvgwmXB*PeF)(Q>P7O2&_Y&sqrh5bC?NxKe$Ju>uB1_Po$wS>| zz?jw%2IU6736U^e#9_X&->Gn`li2lIy@=1sTvYC^iBs3IjlXmlX`W3&ANiU=>nE-T z>rd{N?FOv{rxwiCekwGZZz-)E@S=Y5Jk1@?{M7gBma2PjR|rgxd(idL+#q_#U(cfK z^T7QZ_c#T5;*W360UxHXmv_lgPKTc2$85l6s|WmJy$w)isf>e7LV6&3PJny|=*1fW zmG0(NHuWe$jnz)Owk`%|=2MRJ` zTOB`>5`c#}jdH$tsXPW4d@ploRSzGOJl=LLeHJ%0b+P-J;QI%k1-C#T`h1Y--^^@_Y_$>mNjfq6YkWZeu-0-b`PWZCmCY|*CLjt3m5oqxoC(!)+ z(Tt&;c7hBvy&)RpjCxiCXB|T)tR|=g|?=A#K(B;ps1s;N#(Yl(pLGye|XG{-P1Q@S+KmbD8*gbc>rG zN609d<|MC>M-(D|gLYHTJ-uN3x|IbkO>~|L1(2l(%sdSs#*x>AOSwpJ@HKd!6MlTt zkXChNX}|X`3QA?+RPrUS1qC+l)pl@oYLCkWd+Ygn{KFs$?h*M_;NL+ieJ-b(d7g6u! zKIU06*sa;l5=WkgKO-9LBmzDwHRun@eWdSB2MJQv5N+@ri{h_+K71lV79lJS0CPEq zm;L-eT&I3H340KJJK%q0ZF2BPByR37}}(BgoMC|?5PY2Qz6@<64^ z{JE!8`lq&5aJ3fV%GC)_gMbi+7b>L6}V^=!@ydjfl^~%wtg8YTfnlYwD)|-Y{ zU+zgi1o0wL|IW8YQ(GHwR94Oqa@7kfr}EY&Zc$?#wyDFsdLhRiC*G#EqKliW19$CC zvD`yjvsL2e!=IyUIX}7r?{Aa8Uka(0}7VKbg)-%QfA%V%Oaj z9PZiN?Vg)<>KCb<5&9-oGI4j_XW#w{aktBNPVOv2T8C+Bo94?}E37#4P3rZ>IXD(t z2=yV}KPBo;m)B7=eo9~GDl@MpL=i7ve!{%nCYRLHP?pPeHpP%GSxjQAnAVCZ1v zDr3e66ZYGKsPyf-GW7ucJf^+1_S_<=!drK5X{qlrmo{WiZ`uZ~b|~mjt>v88&c}6A znZsh0gT@^>rFH?~_nT7rdb=ri5QqOF)Alo)2Pu>}7pomn3e(6OmHlk~k~Ew8++X%X zb1{otOrJmbxEhkm6psAR`n|yF+8-Zl1f|z=0|9+jzrf(}rM+H{V8v(;-iE1Z!q+tr z5w>z1Izu(CqBUgz}+Wz}7vUuA&q4qoS^!j}l*^V<~%Xe>mMAb|`%ni4cM%9rR>R*_AYYifZICG$CHE*_P%mC+y~Gc;qq!8)#YevkjcOuk(@7%DuXf5!qWzrm(0!s` zkO`0XbHd-N<%f*sT&<4fzr&p+oy6iLA)v-z1>*?G&H?yn zSVu+ZA9ntkDlLz0J(5KUc|kpO;}`Cq-gRgUB0VDJV%e^%wZvOxkKDSrBKKpj3WS)t z;YD8i((CvLTOn{-wpmynoy_0I#qQInP1k1(xjjQe}iP~Fqzq4iVW=9x)EdW+@^Xhdhm{t zSC3h6T>e56}S zx~6?m5&iyEyNItY`(+?=lssp@#;KlTfGJllOM9R=Q!Z<6DX*(z6*ss!IQ>oyFMvI2 zgJ}kdM`bfaTj0*h`#s9n#Zw-h7Tc=CWQVR?N>JlHOpfbb_R#HCNvF|vAfra4QZKu& zvKu5@fji~p-db68e;#g`7bbao)!BxJjI)^eMFB332ZZ(oEt}%almrzaeB-U_Nuk?q z`JLBd;i9)5t=|%FD1e3;l-LE#$s{{W~4DXA#BsQ$aH4HcW#K6C#!v& z%~{d%CjkbaU&bYn`*_b=9#T`A7emxO6X(0wBi>X;9*WAg2Np)!1%RcI<93GnWN1~Y5R4rl|?t%5z)#w z;ZNMcLvEnLPfRcYH}4tiXqrUYCO2Jaq6xHawWs+o7Uo1m+41LK`fBaBj#n>YD%gBY z--`qb%JGl5nZ@NLXBxlQNe#*p-|7N%4E-JH_7P%m3S>ZF%K(7f*z7!?Teb7-c=P)s zi<9(??=UP~P&ZXr>F za}gZ!i=ch&yRA$wbQt{l`&}B-3>HK-EWosB50od3R_i3{MAPS8o4D+|9FfX0qz5%b z;oN8iOZo0WC40TG7j@SxR7v{Nd7_9^H^*jMin!=+R@0sn!W;@rYzNu=oQaMl)BJ8%Ur?K|o<+RgfL!xN1W!siohj{vSmH|ffP!TpWOF4*j_!IcHjdqtSzfuj zQQO_!EkO(Bs0Ouk8!yN{bBlcuE*9d9nPyuZ$_fHB{f0T&)(+|*y3wB z39BK;9!nq%rQpahn|K+z+q2pL!94@ag*Z_R@;){^9}5F9u|V3Ami|Fz5j6;pm{uce2>|Eq z>}?~GoY(?VJ7qxh=E83P^ko$-V9zLgk9+bzVV3`9Yx6(1EK)2XYT`*5tI?>Pmjq@7U1V`ELctiy_5ja1CKVs4aS$P8`V{ZF2Bg|m>X$bIgMiRm8V|resSm@i@|wC#Wa-p)HRQodFvXP zH>HhtX>#?dmne`!tklxsf<)m(aFmQOiru+Y7)rd=?C2$Z(C8LHTg3WYxJcqo>WX#S zhu3l<0c_t0`xcv*ll$)4e%(M;s(WdpCtV66w>q&!n5m9>FVA&9K#J{cb#a``3%_&r z*&7);HPl&x;&5c2OBG$L*$efs-biRnarx14_LQ^s;yaIbmpv987OE7}1CoWPlQ6v1 zQ)c2-JWnZ*#Y9F9H*aI1CEsIJB^aP!|Azj!_WX8n*7mN|uJ%Bg@*d1u zaF8=;{mb#rfDSE*mz#^&hW!f|2KO#FmwXN8AllkS(8{sj8<8GLYu+77&2(WMf*Y}B zPH(cgaH10C2I_)#eNW7_PAi411J+byTe&~s6EOlL(Wc86ddS!4w~$*})?DU(lZwAV z3jIdhR#ABA2;0HDS=hM5@;mSCd{GRz?lx^QJqEL{ENtULo1upDth6xD!w| zzcr5M=$as$(09&HQ5vJ+ch5zRIN}`FosaGZ{GqOr5U{#J|7+>WMFOVtg9wk$S2?-% z+>P1owNexf7YwYCxRR@W%dcf9U_>fQ;r>YJf!57oQD^7tnlS&T@QUszCWBxQ)1FW9 zV2irpmtFpy`3%KL^+9dD*_2I*2QL**vGeWE@#-Y8RVeufplNe6jWF(+gzr`dAohlK z%Yi=y<>UvkX~WJ^ip#a#99g)wVTY6RKQ}t8Ly!acJEcBkzEN>$6x~LnOjdDk{_Omu z4($AahUvDge_v@a1)I?0XA<&0T(E%-!&vs8C35?`qp(IDx9o1lrBBlQ<*6#GZkNd4 z0X_K-CLq8?<8>A7n16B5L`vkIs+lYh=C-xj3N)t?4McWqh@*~@BNq=qz`g^@P 
zdIfeGRn6;Az<>lgK!ndzlllD>I_-_|Zat!!UA6qP81hj4=$nh2@88M>lzneo7n{3|r zi?Un7o55w9(W_8aCl&n`_Ogwc0PUvQ9_BHN>vJ=;tBQ7U0H;axKbvL$n~bu*Zql(C zb2H*vRwK6+pZj21#9RW~scLG*d8=c~7haJ`>UTKCS@#S$xv?*$|E|cBvYUh89_}_Z z0C6(UVQH?skDIjQ3Q4@7$3)1cqoY&6zErd!s1S|u^B9(nV6~2%YL+e}(S-eMu+Ae0 zh9IQvC`QXZZdX}(QoOoi3U-w#KP!IB_}}2c;p3+v{w#}#uAx_wBrYNc=7TiNE~{0C zOUc~Kd>4!^r|@CBv9)EWq(Yv@_9L5`yv0}dz}HAJ09TL4zi+>sa#A1j6df-p2d+Umvu_~b&T;@OC~oFSDaken@~G0ow~OP zc5Xdopy2xpIpGMp5cXfcK`%&(IB-O>q|-g4=*OeatBt_h7sp(cS6<4Sfr#{EY9^ox z1mjr9^)STAIm3;moZW*nAb=+l&qG#+L-P;^^&fmr?vs3h4;)pL|LFqLZj7>&1L_K< z>8lq1^w!9R%wE;of|Lt)NeZS2fNAw$$!41+u;sOo+;<*lB01rA z*raj=%q%vp87*r#TJ5YCt7Pd59qJTP1ASnca(DSvFdAYs0$1TLth-I3oFS6 zvbWL`?U5!VQ#=G1<1x*d=-pP;xh0m*cVSItdu#9YJLag)ir{xX{UQcw;N4!O3~9e|Sui$DdoB2_ckk|(+^~T) zeS=SOrjCGyDYHj$fpe5apChuvnsa3)tHB7_Kg9)p+k76c(gVIwgE;SBW%Ba{p8J!O zs&FLjTc?}q*moC{t+p^6pR*sA7ZaaYiBk$O<#YSF7;f8+ZZH4BOEd!wqwmD@aL3fq zFJFIZU=U(Hu?PC)p?$N#7(Vfrx9{j*UovyeO1N~^3E*1O{lxkzUA&%yXMVY&E~RNHsk z(qqcJ3a6_pIWp|!I5d)`43K@=`&m&Qy+~QH{*478?MsE~8z}%2e)H$aju^I-iV<`B zSHvYXj`RI@O+D59v^0)%40lSrq=T=9!f+ATx(-ou-PDDlx863NHuP&o9p&4VrvvCo z`+{`$K(x#LAg<0A4T~v@w4auMck4DY1aKCG!6y>V0SqJS5(9j*HJg#%IKI=G3R2d0Fa3!Q&I+J!86Blrfq2`QZzzZ0X+J}$UOsj*v(GFurM;X07nnOpjRoR9w;G|+ib$s_xGPjtZtHNppx>Y+FFZ;gseEbkRd;aYZ z+W`oqu(yTU(9kaubL=-7O2d+cEglJ(z(vLF!ap}Sb{vNIYPK}hPCFB|aEsyle1|&_MVhGh8@p)D0z_5cYUb~uH`#wAw!Ifj(3ci@%${x?F)jK zT;4Ukr>PPe+k#~xbLl=+d*W~?5emcWbkO8+tb834DP#!@#=d?c2~v~0zI}Z^bS?0e z*$b2NA9R;m@@jOd?)pQbgxt%OumN}12RsLrfXyaNd5-iLHXjGsX2G_IHiH)-Y(yut zYBr>5-fI2lcXZv;MOu`^8;*!`FI#lZx%*o&NOr3HGA?3b$#pt{?bH^IsVOcr7dQT> z{2^*;q?3L%LnG5}cx@)DP7`Wy2xJ{vQI3;2;L26Ff(9>prP=1u??Vn-!|!%Xo>dmD z1q;^+x9d=b8#qZ#wa$+y!(#WhWUsAC{PF2b40dqGwVi&Q{Mg*4dAx#c56>hNoY;G zc<;EkvJr#vl_|+ph#QSWOy7W#j;Bw}+%=?6-FAtsc8 zv!6nvMWbTkPDWHzhzsmDC_j(`b&Y&^gPR8@{)WVYQRshcgBw50Yue58nLl?&!d=%~ z%#;3}{GMjMtm$b;z%FSDOh!LhxFcGHIQbOcgpSKZ zztt2)YKmtAlqch+i767Y?){0jLiFB@8xv+f;?kK})aw`*?x6dYjstGdr_-?3hDtORynUbnQ?#+0_?)i--fb4px5VbvBM;$Pm z~4UIYuzbIf2-I-j2kXY8+$9r&N%zT6Nn74I%;hxTX{hQWYTKUtN1Z6 zV+f0XRR0rn*w?D2T9hgbR{x6g&eO*W6fs^kK+dJM$&c2iP7$kX>e>^ITj;|xo~8&Q zQnWCs|Hjalz5;MYx?#M{aulHgu{3?zj~K8 zQuJev9T#~yxS`DZ6dH}7QR}qi(UCX$;vMVtpq?dAYeW3X_d9!;Ue()HnXiijy6Hn~ zUY>06d_M-X@2x%kxOm3jppp;iIVOeM4n`I=*H6kf2P}6RT^t)RHPA2!M+jd{b_CwI zYl=~st0v2?AwtcV!76#ZzVapHo#OU)rq!{2JaJRMTU#+Y#C(OF@})~8IvK0gc`=jk0UnMKy0X3Ac} zS?RtFD2Ki&-rQJZlRK5`Z>(~it@d3@tk|~V{OP?*;~lU2cJb6 zLr>(*h*ko>LGXCM06Db2d;T|Q>Ao=;rTH85xF96aco1A3_OF{QZt+tGVDp?u0z4fR zvcLp+#P8!pz*XmVEamnFLJ)LBo$xtOP7D5-M@;$BSruxDpiGVUg(?ALVMUuO^(gYg zan$zwnUKH*d-A4u>PqYe>FM!pK+832`y{e|B z32}q`!ICdC8PN>}neR*%1bJz)q(I?xSF>_8oWM^QK-2tIduglu>8 zkv!1-8#I)LSP&NM zi>Z58MkOW`&53R0d27#x#bS4NkOMg*b97HjBECCyms&t;Qa0wTxrRnIju#JK{;O=3 z5)c!*fRCkeq5-=d{|4$r`+x$x1&}{hE$71pU@SCiw?`UofkuJ(|KkAQzv+Ab%|rfAsH+iMUcg}^ zaY%9(jQ)&K_^4_t-sbO!ga6xlRS`@uIQyu~2M?&h15&Z8AyiHP-ZrBnaN4m}^9!3~ zF!v?iOFSwKx0|Atweg{CAL;%7k(Z$c9jA8YR3YM8Z;+>_SbCt5EP<9oFO|V-=M`@h zc1S9glxTQ}?05Gunb<`NJw6+6SSSJlX2<{jrJ%salNhYY`(0n*fy87FKdD;sO=4B) z*?VqP!FL6ds^9!6&#uQonzrwO?f))Z^8aYA0$e?c^M~678Fn6D<_F)In|1CF2xkhZ zeoV310`b38oF@dI<&v30MnkT&;8E?tdOQ1Cqcv<&B{?-yO4V=WZ$f5Wi}fBpQ!AId z{*q$-@6Jppjwb=_xF{|=3_BE$I(HBpGF2(0Yh_DOK6%2J7$z0(-%o!ZlUkv>UKOeC}UEg;S=0j{S|VI=u3)a5KR z6)03t-Ov?;ETU5ni#nkOwmpD1Y7dat(l+5B#7OjjV?-Jl zM%(!SdVWE2TmNs+>iptgP-9UR#y}ASJ?4Xu??EuBAyh!wl>Qq;lxPGh_awx#<=>$F zU3@A)Cwn9&{2TNUNlM58DljmvA4Li5t+Ws%HcP;T0{Q9*9MuuAAD}JXOl1O|#F(tE z^BeSzo@h7!3J_uNqsX-7DB?2s*IfkZlw|i80^e0lU_ibGJS%S@@D-*aN$h_64LV@} zwtp9lp(M}vElw0Um@M%d1UQN#^SclS!1E>mqVXllzd5IB1Ule9DgOqs0SwW9p3#4v 
z(SLqM|M{)|=Y915j3cI@5q#)i}%>T~Qt1hOWaxl-0c?*{pffXgHmF z*m{`u(sr)J(OfI~X=G7!7!36~g3?xPbwpN1eJJfKR2)}h+)9bM?OUy9UK>(i92FAv zZ#CWjd%(&+ZBhPpH%;y@h@3~wZU7>OWB!bBd}9f^C)oik>5#P1(BB~ch+^fhXK{$7 zaWb#~{wf1j&d*^Xgy8pE-v5B~gDM@)2$V{fLx!18meK9>tOYGB5#s0LJ{wIqC^pb( zT;Y2(7sC`)xx@)DnGPEN0c#(Htl+b-aO@>Y#dm;O`_4U@KUI5nz`2RSxWxt8Fg~1G z`}U=SEOyv>4fCs=l?EHd`nhiPw9Hk9jh=jvpVU9_j{kFkDLu@q8uy5{^r9Wl^>~$g zoTv`OdH5Yb0s_EH8T@TJ(wU?AW=4BmK2CLxj;waI&dgpu3OeF(x zIK`S<_!4zxAb=Ri6^^M4OIZDt$s{)=WE@4_YFW!)%W74NhTkF%VQ2joEuu@E{3QIL zovAE@bVbjrOSunKZde^p{gI)6)wb{=a#J27+ay%|acv;H(27H4q#XLa%5aMYxi63gd?1t_Rh z*nipxzm)D5?el& zHc#eWebKT(xDcR1Se^(iw65^8(~_&i3AQfBpKaRtvMeTQsM)s|X&okQRc`JRVQ*WKE0*4y znB#7~i_PBb+chd3HAb5&mHUu+fI%7LqD5Ki+h+3uN3O9RUm@fuQ{^g))5I=b}cxRjR06nW4Yz+!1ss{WD# zqE|M}^-HLS^$T{IatFE=@T}BB^iiR2CN|UyTA%#Di^n%JElfNn>0)DLpCmk*dvBp# z!h?MB=Cm^iTZxSM6h2Zta%dl7`tC(qpBqI)*28KS{*mB}X}pNPS63@5J~(&YA8J)O ze*JCjW5OuauU~_#bOwtP#o;$o_!Y-&^O|EMMRLK3V+IumcG^0IKhhibcK0L>$OyJ(v^+L>tjzP1bv>80;u+AX37ojr{d|$-u9m5S zQXt__N!~&!9bKj!rl>fxp_VP2yvagHJ%STN*T^EcIn-zHHAnXGvhEbhL|^b#P}+

}UUJ zwdU3!(-gUzjd~-pDLt2lhI)>?EQW5kPs9|Be#xlXTnaKE8Y}c~=yhVhR>evS&ek`H z8lAQJ-i@DBAwOn(>M^x!9~KX=k2?ht@a|8=}aRbM!9Bjth3KFNwC{$BigNMEhO|!@0a0)E!F2{v76eZ z&o7KN>uG2qEp-Jt+*mf0d5Jmr^E)-Wl8;Brx2tDm#`>C=A3fj~Qb_27?B?oFzD*4uti<#NW)oM0a#2>l7K2uZ`O7`{$+_GLDS(b~!t zaB@;Qm^d5NS#p;L@@wN?g-GD)$3m_H6yuKU8?Bb|SguE2Qgoe{H#!r$u^s0qUte5< z4Hk(Q>09MU8_Q`;&U*)kTuV{!33ws?Yhf|G^Ji$Lg)$p&cC159P3@-lk=oC^u=5{Y zQt@Vw9~3S+LLb5#7Tsd9Zv(Q=VCvnr#$X>C1lA1) zfD>`!&WZGIlP&f#H|`5}-|=m3(zJI|JK73Rivz@cp_rU-*JeaM+-Y1oI!i>fYc}hE za%*4h9jg%Yvu5tB(<>fUGU7C`-$wjD7<+n*7gXD?-DURafNRBt>qPcpG^E8@`{?JQ zYg2u-PR`a#vd0(VuhFymtEl4dy}ak6DOhfH-IM3VQ`7r96yFDeegWz!(@)eBbrMfV z9SZ%3ihUs>&HQAaF3LMQekHbXX7B%J{HFi2e)+$b<5ZXMi{!_3u>cu_G6>k9q_~I2F8@iRJaiGs<*$j! zd2CvY-zLkt(v#ua} z_q1uLYajh*>@%631sr~)8T$v`(0@JyhGPdrX&0GYjfMa{j$gcW??1ik##m}3l-|`3 zZQ~5R$9?})kle3Kf0hsec7o6=vSCHtOsmIg-qyUeALo6rvCHZ4#xT|pbcxz_QN&Y+ z!|x;v~YTBy!nQoTgek0TIHXZCB5Lh+sI)j zyouVI{x1_zM@2Q_C`5OUvRaP*P<_*#{!PszlP`Th!@S<{OAWxM-G>-?Tk)Z4h*_r( z{pocoZU##}80m@RJdt{-&ek(#4kj}B7^nBFc-wHW<~@Eq*LTyIKc4u$Pqufa3$PC5>c$v%`sZuXeG#yi zC5rwAspPbaB$&*M6xa#wtG;nf4d|6-LMG2ARw<&f@m+|p?LvP6U83Wc8JV($j1PJf zw?u6t)twkGs9N9lw0W{x1t|+KA$45pRW(uD)pwebE<8X>onGYKpH3a`nBv zMUSRjnxqoR`^Wjeju6pPEh#7**a|H*sn~C+qRMi+u2{|p{0aH_N3m+HrTn|e?$xCk z@SBoUyf4QkOEdS9zB9v7X+V|L%Zp1rdkwfW4k*l){{{9NM+cZY5~4YP%8l>%0~FA- zlHLtn0~pKzBUHuW_=ps}0Y@F=14%Lz8)(kp7-(}}jNOcG8cR%LxM+%u1C|r6I(-NgysXlim<*PP5vXIR-R5dkb z^FhPVV)d53?lOYGQ$x?lcW=FD4PZ()bAdbPBhry?%YEv7TGWi=EPeL~S5J%LQ!F32 zU$U@W1ts*n`o>*g8E!%=RovjEVVoGuo%)klE;1^HF+Gn_dk_JJ=yf&y+lj{2E(Ca8CVj53cq5_f znpHP{)WxJ*?YVY=wZre*zfiJC?Kz7odS**9Fx^sbIaCk4iskl+!9;xs;qKkgh!yh2 zPDd{%Wwe%+D5nN)Orum##8l>ZTm(iSs>Y2Hcw(t|*5`!Bug_;2J%0F-j)Ein&bJnAGUpqM4-WpDEV z`}RiHGDPXGE`*;?uya-4fCSaI|Ha;SMm5!L>jpuw(Tj9Yno^Y_orp*m5RhJ?0wN$? zdLUTnO}ccI5~-2i3BC8;A@oiH0s%t2>)U6aefR#(J??V%IOC2xzMmm0i?!ah-uIpJ zna_NlmqgzP{7#&PWmH5XjN?%8w3I2-DbaRiK8I4+_Hf}utGEqR5mlB(>m%cF1HDK9LT_T31_C7O95 zPuukx70&W>S1L~M#PGHQ{2VGD8MLLqvu~d=HJsF^B6jfJ%g(KaX}l2-7+{T%<-$E2 zmDVRdib&7EzT4Uc+_#KoW>iFyvpi1kT@AS;$8>gex%=)@j6wB5eqjc;<7)h~Yu{qj2RPAh6 zv)?sU=N8C+%H=vIj-^MBEj2JiBxHHWKmJ}-QWUj~;NKY)V2>k@HNSl_|53Oy<+!j% zPxpg|^+R%%tPC|{giL=*2X;#uo}%6I;+}AdrAQX-bg24dA;kaHkK*AkN=HwCi7%K9 z8y}s=huif3XU2wTjXb%t3B=2+r>au@!H|88T3rAgRogo!KU`lz&f38I@sCef7=D7H z#>nvXfP~c0;3A$GjR=yqK4UZVfHjk(KJ>7B6YAS9qkS@&(f{}(sc@q>F=(}xhtBI$ zew21r4YGgEixs{qQAoy#xs@%KA781voyF@ZhIm?L6j#f*80mMT{C?U6tUr2K z6P;alUB!XVD=n#L5BjXKvcYvm(Om+Pv!xTzVp$y0{KERSnw|a@a?UPg1^WD9{sAqIC^H4DT33|P_r9XZw?2Lr?lFX0n6#5cD`O49j?SbZKw@<2*wJsEU`O>DfFDk5*GH`v~uw$!C{A$R$m%ICrj`bs;<~Qzp-@sL& zoqFi~Fa-HSRzG0l<$ilxp$hF*qlCcIWVM&uyVcXWBPZ{hjKJHIu1oXu-HTU5uu!R2 zV}q|gik>$G-wo_@eRh`22e6O#oFLqR@HjOp-O-9$6#|I0k@F?7Hm6p;3<4&Gm=?9f zQRv5E-L9mS;$oj0>gveJ<|d#IP&AKUXwT4?UfNtBi+?2!F>OWJP;`HNgzZ^DsiN^J zF?*pcG=?l;G|W=U+qgV#cC>tJ#ibj=EwY$TV>E)E>Av)ZuFE8Wxjm zdG6SukJu8tWe|08sP?M%+QLRlLYhTaXfEWs-+S)Wa7U$VF_@?FcAmCGx!|POS*`OX99Y`8ZQgw=c$JSWIVWB4g9irC@w9WtO??8)tfErUF?n%bT0J89KPM zSfDK@a(%)(`d2?1HoNOBpKJL;xwy<&b+SUl0B^F>*g?8Q*bMJ61%Zv+pG4Z7|W01*Wgt*hR~1fnKotl2cIoRE7HfZ z2&Xs!Fzrb>k4|)hmRoFv4?w#lKAmN;y09GkeZ z`(AIJ72P(}vS0hdvHBag5agp$wW1%oWus-w553ib;6cm8;mvn&3O>>nnFv~6LG9S} z?&wS-pQIvGIvrFKhEX#Fgehszhb{elL31}p5- zCk2J6cFg%~DXBqU6g6GiMRy0Y7^s&t7&#p(am}pZd5JG_&3Tc&zftPl1;+x}o?K*| zoW^Tk`2rMw{9S*7GO`fI#03C!OZy0-drWUu{W2j>ub>yGP*?mrPyz==HzJH#yO!EVT2ug-- zHm)zLMWX8it+QrkjCS`eqi;9Gt2f*qQlbY95P$x=DV47yu&Bt@N-ZKN@e{09MF1_q zdI-tPtj7yLR-NuMRb9=ODo^Ey-Km{rlE+7DZHrfbI8M5%mTjCV=b4F>ov)0WR!BIN zgiBJ!F7=cSuR6idUwx%vgCm>s*qfSoFa!v2w9`=agP9QKGK#K9F z0@Bb3xE^o?S7De_aD{XFDJT+u2_UDCpRO|Ep$H@%1ZxV$0$_}iP9AK*AB;U`ovzY6 
z2Pypo5yEgmJ}$sXW%cKQmU}Sj5zYe`@0?L2`e8cGH*#kNj`w2+iWWdSfoxR(K+NIY z1IWaze|vpON4o$?VQ+Vuj6OCVCxsaSRj;Mmf!m1aI#+yka-Az0tWX|`#G~l0nkKE z4KRCt08#;e8Nr)>4B#IF_{SUgS4+(Q{S;=ZXa&EA(3ipj@LZ#=kyG+IquX{Od5ML5 ztcnI>o1$UyVHGKE0ZdlJ1@4xfeTK%2s;SbHVF1vdbmTsdTDJ>qeZHc1FPtNFtO8hK ze1q_#yt*>znK(P>g1f|wT4&>n8*f8E$ADhxf6HV23o`e+LkLsRV$H}4P@y5K z0a+HTnksrX*~wEB7w=_P+<2ujc2#NOGf0D2&JRQgB6z?|^LKDGV9CbOIGCkBJ`b;z z`#fbagM}Lmz44&Xpr4t|JX0Dmj{4CH@-al2UL^zc!IaRwh0>|Fn&z6M-%L~6y#HYY z8u?d7>A&3Eg~IIW(2WY~Ocf&c9in%mYea1&c2d{gcA8G~Nj)JB!*?XH?#O+HiXuzS z^QCpOZR`1bY!=ldwzbJA8#29VlIWHmLK}q%DsCFe$I^^JY3OUu52sm(ZrafvZJTD7|7ZJEZ%>^nuZkQ2Ukfjs?tsua+e z6=%0$^}Tafuop^zzhS{c(FLK+OBJZgt}emJ-TyL4zee9}*dp-y3kih>G+4Tti<`M}c5-s|a*y>+&UTg_j?AF}F;`I}qwhe{d^l9)^W5|+Jbf9y zXRviAuPuSpomY(oO^7&MikciwiyKt*f2?OEtzb|_$E&yn6P-uQMY7-R`Ci3+3AJBK z@~W%+me~-3FUCo=S5ePh6E%3KJ#Yx{7u4i2EP>x%0p8dbNFGk9S<}CG(#5RYArsUm zcSy6X*F22Sc)DC_)84RhQZFdbI^3q6y!s(F9+6R(9<%8I_Al*chHoD(2?m_b2e`s^ z_{sW9zW2t$cfOabK%@1KKi`Gg^>^uv==Y&E7HJLjuW6GdiK!{&?(dD;$AgV}oeUpW z_7kR}Y?~+bR$|ZB&!ScP(;@C^4tAcQvV_>iCGcerF-3G;dxHbhj7rvIyUkE``fYul zdmpbub)O6KU~S01qKBcm)8PGk{gqODeZrDh>`SaiZnSer%9|R1h+Bn`(!;Q}>`flG z2u*tsn+?0($*u_D=kZoy*l03+&^mUvCa6u5@N3Le^}R!$rTxh=4ONXPk-h45PZPlp zjgIYfs0?XK$=K)^SL%dYbrN|JW}S20zTfuVz^)nD0?~&^|7DNpHaU{9Ks#$!q7dCB z2Tr+#{la|V&C5p!#dsFc?@nKIUoFO*A%&OkB^EqJ2Jx&K(C4+~uh5oh&N-|morv@i zE-0jjL1jp5&<^bk0$H2^pDay`6_+tt~6CHPVPJ{`nchA zzWLm?rJhACM?j=kO*yd7GCXqDm5@+InS)%VZ9%T^HUDFc$7w8C+~Q^%?U33d;aV>B zMMiiM$|)9&4W!O}n6aX-_Ga$9R9)0|a+U>--dm)tTk6g`fT6bx(7UPj!AG8lcg>?$ zI;H%=zBPbIon4X+(FR?gDg1%9iRH0s(~@7epuszJRRH^0}IDrv?&Wr|y-6k?2Iy^f*hTT>Wz z$(|jKC_e;0=taeOL_Q37e%(@%uCn!^)3+0yvf1H7`?30nzFwAh31aiivdq}fM*sqz z>tV(_dVef^>Xn`n8?j2rGjbw7rIc=6iW1|cWAVlVs&dl?s@`hy7;`fP9 zFPLODyv5#BXo%I+IQ=L)5=UIbX=2|3^7`fVK({~>n94@~RQg3<-$94pD|={luZtS= zHhp*+Ymv@d`o_%*%$?(Y^bUO^2xs_p9e)#TVJlFFGL>^>R5oU<1HJ1Kj!eG&;rc;o zM5bP18)f0G6rKU>!fphF%mqiQ6M}fxGy5I;NnGWqZpfjl7d~}7DNL+|t-%ntGhFQd zP%DFWZ>e^x`8HjaInMCm5~GyRR$xr;GqGos57?;MiNCC+nFqJ(ys>@~ovc@*r+RMh zHQ`I%xi4+u!t~NgHS4Ogw$}UA94V=+uC#>BVYmI6b?{3&MuP!gO6ybmnk+NxacdXn zUI%wV`CISru*E&Q+)TaWxq3o}stt$-=edIDx;F2f3mx%CL0r$cd(@u<`98ObP{ zQEP>#@9oy3we{_OC?6mL+|Bs0urt|p=jH_;C9To?+D3ust^t#6+ng&>hAJI8mDgR` z3p_pO^%_^c+axOBm;#Z`I_@-HF>%W%V$0 z`yjeaLGAs)}_LX)f_~Ctp_v8&G2-yDKAo3 z3!9x}REn?>-7_c-x*qLlyKa?lcp`c!Ly!L$lDGmZJ_^nURQI1UxYcEv zZB)ECMfKcp+q?c&g-*(V@V%L$ylNH`L%4G3?hl*xd2B{R(<+l3&xC4J4ykfpYYu-+ z_Ta`;VM6BS7}9F&O^xZp%hSCSCJ#yk7RYWIusAB?Y{zmmXlSxy%xu=0o&y92P7SE2 z(iItj3mRlX0hOa1~mcYqpzE-YI+uG6`Eq*{>b)P<^<6?K7!u=6fo)E2_!@e~&aA7Q#I zs>pp1USU^`ohSJBcX7NuhWunZ#OlMo{Rw@5=7oSymL^3BXVdrAb{f_~Z)4F{F3yJM zp?2Q34xf-LWtyy8vOMom=Wsu$uIoa$aEF*(reV)cj_-~MTkIXvTxAoNt|AFG(1F*d zq_TJhk>w3uNF3N(ZO^WnszLm5I&ng`curUbldFb3#QJuY=9~?#LSx6)N5qJur58cRIy>9)A%irn>oZU>VWv?WY+Yd95sTNp`CHw`<45VpMp*$e<`V?RN{tIl&0{gp7X^K`$$IPnSxH!`eSKqtE?|Bi(u74|GGnzCn86K5LL76Rv{%f`Nd@iQez&7L zas2PZXsiSrThKH`eV4a-bWM;XOtKK-f1hh3dT}WMQJ+Q`J6k|RnXmK1uy@ZHCTFr&$wae z=ncB$z|GxZS0;j^!x|%dZxQ1;#?Y$C)yKszw#Eh z&hFbp%Ya0J?NK^5coFgeSu)>37R*9g8nI-AKJs_*w%}0TV9KqeEdB|i%vOsU_;AZl z)(Fda#)ZvTf;9?6a+S-3G~2@ZLpW5J8hNfWK4zpoP=!FRIsPy-BY^5VI9do`IqW6S;16u`Cq3kTp}not=%^NvC&lm#YiGj1;)foCOr*cYPdQ+fkR`tTKS3-85Npfk~KT zdi-4=8_UiwpUF&C+1U+#?DS^zdk%fPlRHHkx$1z+w zUH6dp&K}Fu6)t0sOBSDUqbZ2p#2b!92$sOb+&k>Kz9V1~2Rxh2{54n23u=}HCf2;q zeWcXa@~s&?J9elo!aAOWS6&Yja%v#1{3V!9+yFyAL7a&HrdM&;3_z3vw-EXq0HqWT zlt#IrO2MrAYK!Lp9-Ysp(3l_-& z64IE8sxUx$;lErfX2v>_Q4!Y7r#NS_?WTLv*-&mOmmJzaD0UQ&O@(2Sk8ICl>hEkO zz4r1s4zB0JO5!)?_)eU924?pr)7)r@3RR?(mk&Ra1|A{M_=>YTIQt=BxeAoDYi$oi z)P@r;W=GnHSABPT8me_|pqqXo6VufqPIiY4C@rJa>s*vEqVS!Ii<)FMcQ+Z#3NY{% 
zu`w~n`|G>>E9AA#H+S>%FUtbA+VRVgCwm8by>}uL`7ZrP5O{K@JYFY-f&kzz>Tu=` zND2ox0yN?3?v;}aFC?|>G!ISF#ZYIBdpH?_&QV&tr&~B7dIlkxH`%Q258%K^ z**K}DDdh8V41S0r>y3WC5O#jJR0r}p{AX|Pzu6TOTwNV3?L=3WH?ET_4t2Aak3Q0h zZb|b4X}lr~#mZx9P>^;6$qcpz#TsS@Rmz;4>x64M5ox!t%&_TM@bcvOj6Wp3D}Uh% zHNnilf93oAZ^k3>8W=|nG}{FR)k{NZhPI(f-Wi?VmU#*^qZt(ZHG=UyU2;s{Z)$2> zbhj+cg3w%biV5}6+mc`PXlK^s@4(RIj+pD+Q$;5<3wrXu@kO$?B2jW4)A$IgHYAoD zz-OK7yU*Bx%9k8q4sCVNe+t+@0n1i0*|8x05%_n9Ms9~GofGw+o&^4J*q_Gkj{*7P z4f$g({Kq=-$6WYhF8oV#VGCwHPqbx5ab7De1Gh1g7=_B+!YUr^?79-ajLL8jC6eIt zeKG+n^w@(4rlR2fEw?r7?K@{r>&T|1#e3JS40RtXvXWev)MZ$CS8S(9mi(y9y`qyf zH*P8Y^KVJUfo3agLcRe>Bg?-z&tQ&i$2c_l-WpdgAJk_Bne#=iL zebIe;7QM6>=#L{_m|m_Q_yG%KxjBSW`ZD#ZfSRIRAT5%)_Bw(wl8xg7%@a82dBAPL z0h|JAhpM&uQWnaAa^s_oqz1y4Km0JYX*Xi=3Ux+Xb0v>5*a)=0HyL_lgmpjT_K?y; zgPZL%A$7xLK1S*X?%W+lIk_WA*$bR=LS{H$b1EqT}S`A4}9_Ax)^ zzaP~TchwVNbum|A*xP_IRJboEx^5KVZ&yq69Mw;1BNTgYHsSt(hB+wyqJ>KAGAX?1 zGgHv->AB$ft;ADxh`4X{ zpZ|d~lE(L@b@~;+xjlW+t<2KZ^Et{+U`ZFSRhV8=W4dh`Cq`Zq#ANaI0SQ(Ub3+YV zgC1@MQ~gk5%#$Lcw*F`nKXz;kUC)Rkyy%lzxIiY(@r2m$@4X2Byvx?#4iUi`nA%sy zgBW$`%kylHx4l5S+*Udav_Y| z*YY00)S4ez)xT+i{)5w;XVcBuM$&FBoo4&U!G-8!K?^bELqlcKz>@3B-XS1N`Xtqn z{V3RSF94t~ksUgO7{Mb}>c$F1Es^zazwFaK3Wvrj#y`AdE}*eZ>n2XWOgQs@UNHWA zJ%kHj>}`Eq3akNgiUL@S>lFHV(JuD0XUEf=l$?4u!cNF_e}WK0z(2l|o;i9Dxb@{f<{WV- zehR#A4F=}>w)!aSM=HR{oJhv2Gor}OLl_+J!A78?u3xZz{8rFNA8EM}>$csr@93ImXiYbs zzea3#Claa$WjsXg(48^v+xAY4Z5@>P$DL`~SKZ&5*ebQga|4Z11W<82-5|d!_(tp3 zjlpD7RddR{du@dl9PbPXI0F_`bow00QjNqJCd+iqG(=3Y@^w#ZjK{MBsGeE|CrFjr zA4$)*&B>hS%95|=VF=rbJXj3}c2nptzpy-(q=b z?8M*M23licz#vQRyrfTRSfKt|4jr>II`(=a;fr9Hh%{H+8pnv6110R~3y^E+!A|RD z*FrUD+i)H}(Jo$gMU_?+i6tox#OEovvdwdyKpNP$suAb9svj%$^RBphw~_TD0~)Zo zyAL8O@;O!u!wOhAFQ|*PO_`9`SLAk!*4$qr>QJq=B4ndzZ0KNHu%bCgEHstQyCKc5 zE%eGMZpd=~s@Soig4HreNialPTDw`vpxOg? zIyJ!1PPW3H+qmU@_wrswSO7)qoa*rzP^>@2)qJp7mAJ$% z8*cDz&~3#V=&JjHjZ1j)51ozoZ%-z)OOf|!nJ(;9^hJikblfhUacg){&s1IyTVtMy zGa9O=R{3h$>6B}f99OHyNVO8>HysugG|is|mZlM=8*XdAw7AFVeGX9XrMWNbwW4?I zZ|c6t&@id3Gb=Lw;k#eFb>Jh*q+g|C+h1%ugvb89^eDuQUT>sm9V)_{tcKD_8IDdpd)}?oQ1t#nBu? zB|>xFc49n2^j1T|5xLvmbPUlM{yE~i`Cft_z?P7A^`xWE$N8m&xxV<=H!db?S9$wI zkjlRXb1f*!@wVGwil91)OLwZZ)A=CEdA?G3Zhn(?3O(t9Nl$uYsa4A!J=RzsZPnaQ z0GYBjf`>aK4Qy6u1sXphVIawE$a!JO>=Ir{$ai@<(7ES6Pb%~!zl35ml4q;ttQ4Pb zMogxWX{GnII%K+P6;hhsY--yGyDZ9Y-U?%e8Wzr(sM*?)rn^YWTY27zqYwNjocgVB z!Rnpg5yBkTHUUSZr*?TQ%)W8@eRH&K9t&HZYKo4Ii?-&8{WL7F z-@f8rld2ly?N-zK!#}FHau&c-N=pv!x21V8Hw@7B!Kf{agVVR}Y`)@olUs37`b3}J z&QM3^UqVGiH9jpm;Zh5ZXTM0_mua8TYe_RazPGV@d#vd>kHnCAr=1V7C+4UhyrS+DtBm6t$IIu_F2#_keJF|2G)W~LJqb@2C=YUfW|s8H1aW84{lx{ zQyz8kqb1eUMWP}Sj^uac4?dCu?!)wKESO&|t$!Q3vY4K!`VPpFo+s8*+OksPp0mZh z9`6`_P1Dr zVc}{CkBv9JSZhYKBSRg%}JU}0Q(W%F9sWej-h2% z(R8{jCM?O5hDG>MklWLL)7Gc2Gr~6XX1Y#qh128p=jcU+0Acnx1-^&hRw9T&&gvjf zvhA!hZ4dBK8P*eqxzXs$I`+m}UCJO`DSrOxBT9Pq2nj-7#hy&+L?MC>3Ff=X#HGKB zasE2P2TUIS?S*-B`D>~rrOFl$9#&~p$!!wsD8%g=rNo+04gRSKqWzjKE3 z`w_yaP~I8p<*y5Ak)P?F3+1nJ2<67<^N4}G{CsW~*R}R@vZ!rL_z?a7y~%`OSep=} z09HaAD9gTiBPq-)J%I9QX$U>}gT)7O2h?B8N0zSh4mH0l`(7zod7HCad%SKBZfA!T z{z@0Dlf|QE&7A59muSKnV8s^p!i>}llNV#F1MFmVMB@%)zeNX2hj=GUwIsC|Czh5|ZN|TkDH8@=GP&_bpV6pY$h<8J9)@|9WUx~fL`{Als zw9@Q@D}9$pc^`9Azns{9WbJ*sZ%?=D=&(KwVV|Ylb5Aroqbofw0-zJvSC09zK|#ap z*pHEUBoC#M^~D-$OsB5UKkd-uZ)jWU1TjUsm}ye9#Y2*N7wYDWWhA8_^2tmYN|fjE z@#VAh(EuEOX;ON?!)lQ%QiS=wyOob*yZ$v$zvaa%?Psl#b#Gt=*u>1OLAZoq=@Me? 
zYQ0-^g(~S-lI5gAy`g>DKIJ%1z1c@!*ExZfP$}y^t(cS&Mqw3t|p^^v({EJe2uC=2}oYZ4WF<%zG#-xpUk z*0dyi9BtKfRL^eV_SYlry6-siSaVn&G#_HUYSG`?g&Z1p=|J3b$#j|Tyvl{S6J-6Q zeL8kLiCH;38DdAowAiAkqLV*qh|5Um+Wesi7h*#FWN4EnfX{bAC8vdZ0 z)H}q&Lt`%5!DzbSH3@tdleb+Wqr5!4n%u74>hG&cRE(WEu5y~!f4)weGxT>Crcr`z zBAl0TFgOGU4d?BcHtOy=ARJNhEtI;Ua5B~rb&A{J+?k_kWk3^}h`UBQ2pSyCY!kxP zmudzS+60ES7AvoX`l;;yC}&lAUwal_pe&mv*vJ1_{y>cJ9cCHM3m=aVG=|U~bw+x3 zg6^W<-ehE!Ty*yl-RC6eYV_p8k@~I|!(}Rxf{l4ad&2uf*oYV?&KD*)4Okh?keVb1 zWZfEzdt}2<$1mcoYW+dW>tB{>SpjOjMyI@6A;Ea^8RXO?ffY~JU%?BYEC^~UP86N= zuAO)i>1W-=KD7_kcRQ+N4bJvJ4@+P8bnA8uKRfE!GuHdx8*78#*BCNdG{R7#+%gY- zJL>;wAYAE|`pKXSQQ?`c1-5u${c5&%5GK$69xOBL2`*qThNZ`R+QwPhxsh1&y!yqP z@#;bwW_yChA+7s5ANIZ;$T$LIL5mBdLu<#$8HrDkB0Bo$NB(}=ojG`^Q&n-ZS0o^x z|M&Phe@z?wkN@{iFuFt?RUT zBGutR;a96_tA|sWxAGV<(m>Z-&=M&1n}S^?(9b{Z2*L22o_uD$r#qNwvTMHOga9xV z;DMA=n%>X!;>qVQcUXdylg8+*VxQyGiRqiCZ4K@4ZVD-_N5A&&i$>k$zrhYc$JznE zbH-lV&ZYtlS{7Z8c7~DVSGAU(`$TVd+|ozmJV}oXi#U9m!QYE^R)~;)|TP;E{d! zeQ3gcw%2a=>B1(`I1=s>F#=Yx24_aG|uRJUC7_d;#u_a3!)$S7Iw3v>R3%`Wxi6c z%A5IbXJAE~r7yP%d9Z*@5L(*@e+kHOVsmz=jO>_}tHHLp{1Ik>bsDZr(-clLgV@n* zDN)o#GO`oSLvwD&*1BgMNPO-?&7}&JCRGjDawP{HlnI>2wB71gwfY((OjW1Z(9OdA zsMWMYU|=2CPFV1V_DZ;ocQujmGaq*$8^?r{?WNMXvZx9DV!s|yARm}dT8=&kKm++K zGTnvAY;d$G^4%7i6217qJEv;D=D~GTQ;u=A1Y{fZS@(~BUPPBhnOncwXUbmnr0YtC z!^zf|3fKuIxAY41J~?>vBBovayF$V7bz{NRpjgbCH>lNOG6dEUqXiCZ#E+{ie&FZV z>I^L3mkasez%q9hfKBb7mxwl&oxx4syhzKUNc_;c?-ObDxrs0tnf#1rz0#BUb)R+z z>cd$9OU?mc%>cxb(}TV6`cJif6lbd2sG%r9yR|l_(SoAbxb(5%X~++XknYX%PwuQQ z=m9J90NSKhGn{OFUa7!5LV;=01%f`;Ois|bFEzhTd}pGLA*xvOoBgYSA--Yu{;r(|2Hy?Q*vc@L#T zTcq2d2XGBK35cthX;1gW9<&SvHGF}F=`uf|_)Zg%1C6NMoee#BHtWDZxySn@lX=MR z%sO~b4AX9k@W#W|w1;Oe*zz^hpOAfjs@DCZdXIGbRII)L*iJkh9yd3fJi}<-#WvVr!aacxs!{OO^)O#7r*^E#g2zf578zI5? z&29_Q)NgTVsg##YK8Ig35s06YxzZ?2ZR&CHXUOOVaBif zqlD>x=wS@Z@Th1Gf%g5%>kP3Bx!c7p9$B|%d_hCq_#r=Qbh8eQv$$#d z2Z@-DR*YS=gWCa!8S{B0B$+<{Wns&2RCd6)It+*B5>&p1GycvypJcSS3&>c>|)S`(P;jVjt9 z{VAUQrLG&u6LmzuKJ5YtLmp}Jl6M)a542PCXuYjyx#fAIC?QDUW^pm>#(Nk65NFoJ zXjcI5$C_4Gd8>ZB$r7{Csynhr_L0NBYTfnycjMOiAW?(soP;O9MWfEP^YfN#_#$g$ zbjI;*u++dUt+Y=cqOv;ZyP<`JFgEZqcU)X)7S!heB9W&>%8Mpx9K*l7*rHIGr%iUh za6fIaZ2xMoaZ%qFI}XtNV1s1=n_6S_awE3pQ-OB&?GC%!o)eJ${(y9t!^nkv7*6z*G23K|+f$!VTy0(#-ywesho*)M z7KW!&Dl}c;#Z&JanlJ8rS9&R42&?-k{lBE0585T%pwmf)KlR-{iPKK=@HiWZlTL?o zPi736EpgqCySwWK*Y6W6kMz5qA5*7w*m_Ta6lKqboTAsgix0Fs3R0VSZ;#%M-ZYU> zS*YQsVW}`?14EMR*)gv(9+H`W5Vlj*4|sO)?(|2M29KCR*_^BAL1&p4Iu-{e#Kj@f z=#u1OOuJifbIT1xv!O!IOb|4u_P&4-W^wKhhzo@7*&8-7?R}eJR_-pckD7mO_D7zs zT`KRs{;hXfY`^@-2w<5FqbYLs_^^U6pw3JVVK>a6z5!pNu8P8M7sA>Qm#t?&N-mD>GY&LC?xO5j_nFpW@l+~l%ZyodFF*}pTy zi9h?dVuAk=Wy|j7>{ve>{Pv@#{e3gvOM!|XzGD)!96n$`BiL~4`nlf(MFaMonhFsx z9EL{ER<>PQf37c3_L{zs$p_x}zq zfoVUB;#|ZgVIU~I;&5nt{zh^mm*FM<5Vhx-r8ZHT$5m@Y`uc_Hckfxz5lnk24H^M} zi&^we%Q+(cb452=gj|!*=;rPEs$tkkTF16No49&HrpV{}J_YvAFN%-6TR3P`1|7nA zJ&|-KZ?6-`lgB5wDn1@{cuDTT{-eP<^+)9)afS4lD<0vpF~DK{=OR9F-l!EUo)VM} zE#4crzF~OJhhGj?Z|vLpmfyi`lUq_Gv!Ib%>0;S4GBr?BR>+HKhOKW?ywB3&7HX!k z)W~|b6Fn^Gx!5&pwN2!OlJxytqq2ydso|l+-d2ZO%63ELFoAzSrg!a3s|i~(@1W(t z;bBL=93CLFYHc%1Qo$+MlD#|X{^gRBkvlT`iDSDZsR^H1X{XLDWn z^C~tfd{gvT;AQ<)K9(Ucz`0Teyf##k(@6MuG~4W8UzH2=0<$+cbHLE+sHFC_Au*jN zK~vIpVTd(+pt~R#>tc0GW+UF@45 zGKSeUXCw7FzM;>8(=PLik=N0OSKe~Kmrk{n2%J50=rCa8Rv*D_!VI!fv8e6N-_>rs zIv9IhilrxAm6nh|xPMj%5X!?;!U~pqm!-v7h~R`*mz& z*JF_vbrIcMITYV~%JFvT;Uej4vMk%`Y`@b%2a+*e9XzwQ41*6GgNW6FEslK>sNVS_ z4jZL?Daz&^k>7{JPAJJ5a`C@4@tXlzgUigf8pnu80r(?fCr*9bn*D5_o7IpI&tNmA z%qRTe7wh8<()NHm$Lp`gsx1s{(GqrizcjCy0VLJ7;yu!D4mJnez9$#%d;IHIQul?0 z$?&zE9xPig}FuF(q@v@_m7pg 
zS@%(Qi6B;-#j&)+QzT5N8)mD+xto5wio5d{m6P}Red{G2Z{sjtR^Z%ahCVd5DsIm@ zp^l`V^t}{kG6N)NCBoY%;18Zp(ySIm{XLYM=c6lNSb+jYBuxHgp|? zttY`*cxf-9`{L^fv){x^KCPFt@t)2xQ3^I)=Kk#6$u@Y8h2+8DjUSteF6kKXA!6%0 zBxt4$5=a7h?<-%&O|u*KsHWSddeR@94wua3=dAcH{&@{F8sm#mjd*Y>4w6Q4Nm5Pt zBK^q9Q(ZUv1^c`cLl$l%sVBsqyCW=l@3EiuiQH?O`0~yD_Y{G&y%|}C+6UNqCvg@u z{g=}h&C~BtsqcL=dA>rI9jMda1gK`lZ%U%o-yY_dPF%NO(caq3;K!fr(=eLA4$$jU z#VezF%zE97FMezy8ge-DHxxTO6iqmL%KuYl%yyv}ZSlzdg(|wKr$v2p?T9opjN#xz z-Z~;>U!Gvz&+DM-vEv9co#Wh*>E&g9!);QLV|Nj1qcj;XDt(;HHm>&+IO1@u2w(HW z=tIG}4az_!7xm3;FUTiqbCHo;v8;^!c`@-Cgew(aUeJtUx4vYL;sf*B@uBBt6S^yn zT+#(p#XvJ8CX7>Ad(1;e%fYOm|F`VLpMS>?K6I3QC>M&`&1FS;>idqsp;z`JDC=oQYrMB6mSY7xWRQim#vpR66Yy(8i3$GpNG zX?`go#1IKpkB!Cx=E;K6IDU|QBBVZiG$eI1&3i^QWQ9{FGm7H#WY|cwNDXu}=5be4 zNba+YEDZ83#Kqcv2i*vB)(><>%R?MS~xNDe(+ zTB8MW7Wc0+PIlij-mkEKWMuE>V~g$c_l7G9Ta^m;JOs&(4TbUYW+-x$UyF^_->e9C zh?ECZI%TlGRYq~q{6C$^I&5_yhOl(C%qddp(FE&$zgsH^a+^B514%B&gwW-hux~mV zo~%wc)r$vJwVu#O&3!*~_tnCE&@SBPFN+hP8Pc`=x(FpB@pKEn&XFC?o2FCw%MQ`( z>P@E>)csvhPa;@fnkN_ve?9lK4w24=5cgQNEApE@`koxfeN?2J87B7jFl1ado=jUGZ*xY%2I zEyc-6esXOe);fo!SzaMyalS-=a-QE4y#cmsW)K zZ~E?2_1)uigg?iF-5NnB7iC!|qXY3g5FkZ;!59;0$O+UyPXnI2^4N7B4Op}frzw^v zDjjv{#WXR-$-{|mbt}}GBE;>_HAS_j9V3@dC@d)X(69M*R_~0jodYpRbWo-0z1QW3 zc1JY5o~=1_KFZL`&D=}+06zlZ@7w!diaK1-qLng42!#_P69RC;zSpnkdq!&$x^EqH zw(j0`{cQk|Yd-ALrlt1d@y#c+h495}b&eeY5NPBLsOFu$d$HxEr;lM2`&#MM&eEZH z>2E^*r`Gdn(wp>;^0CY+|M$lo|N0RaTm2~MEMdE6ZUbHfy5Y0LMYA*Sl`kRi4IBBY zJbo0Pl_OepDEM*APX7n43C%l3g8j9||Mlt=|9Qv4|95@k*XRDv#5rUfAY@`fZPZX@ z*!dxnM-43BtDJ0YK4H;^({*yV+tuuAx7kQi|8QUkX2mBUcyZWO8+Gx2U5esg)>N>3 zODdQr1HnFOL>J`X^A&3W?VQ$qXP102Jv5ROLWi8dl;(5s);FM-lXIs&5la?E8G8i znX>;9OR%fDvaIKIz6yKGNq1_YM)$kvmuU%T|E>O7ssL=Cy;Aq!v(+lg|(<6#JoLXm$x~o}@~`{*wSK?$->>uWU$ZY7ywm_Dp5Z}PrRCJuNgV4__L*zb$5@|| z>CWGqu$wz}f+eD`>BNmAW19{E>M)aahCty0X} zu1>C^y2^*s?!^jSdHjf^vl+~h6G40;g#_YcfH|!1tL6c0`IWD%CkqdQIZ=Z&MxKp%KuNz9NYhkmV>RCKr%tGd*X?x zkZLX*D9s(qxoPg*-zY9z>acKNGD*%#cT`k)KJB%`)6a{yknsfo5!d}nWBrGi=0DjH zQ(DrVwSE!|O)@Zptv@2&a$wbRM!bI)|x4Zkhb4prP%3XGTn3#!orKyH?VeDAIF z&-a!FJj~vfy|pHvN|oGV?W?~b@73#ydSdOVj_yAr#tgf%7F9F3ZDBP@DPttqfV)8Zu+t=(%!V%zeBj`fD#pU3q-jueCEUiE?x5 zofBADIbB0_72l_A2d-~Z6*jgOAdJ7$0xEZ1$)3?a5Nseih|NLw9ms{D<|V=YH$M>V zfv{-&gaT8zJQRd1IP|(m^lJEiAT~y(*Ux={rgT8JQwuo}ju>iyXj&*nr2a zl+Y1{Zpr>iMfzSx?*#m@>rvm4JGbiXZ?t5k+B9&}ERr9#9}+qh_1HV-u(jy1*{d(R z6Aw(rD+9^cKdx9V@~=zM@@Jk3{Wli?lMF433f2LErc$da1iptN4daEO>9$IJ5a@v@ z?!#y3^#uzCDDIA+Y`{BGL(*XU~ro2V6w zb|$Z7^BpRr9{tVXC_w*yI+jtty+$^Ms!S7hL&=#NSWOy4Q9;cKm;q}xvG$K&ye=aX z;C%x<*XoNLD4yk1i@tkSa`fq20<%+CQ2|3PkZ*{-eTL;IHUHT`r~6GE1LL!=jC>9@ z_B3?otY48HphkLco$4TzlrLC7bTloaZiKhnPw*10iQahVQX9ven^${d>MZuNJ)##0 zrDh1_?K4U7%DLA&G<|o%l-bfwRY($K)FTN4J3 zj;^nCr2IhG(467G-D@EX)zL=uhO#DLYO+lLGiDFBfbG}u0yQ@Zs@I*@w7@Y5rqi;4 zEea`s#u&hRpnkhZJ=15Lu)z^Bf+CGVfS57exw^zaMZ>xobQwS|)_MXJ;6P?fCx%`R zi!KrIOc4n{*b5|sV_5=Sn}DWzkgH3fc}6Xn#lF=Xk{YNR4t& zD{&_`%PtOl(MAkfq&?p{EQ8G?>qy+}y|K3yQlA`~*@GI7yC$?~VzD3Sg0=82i36mB zhs3OsBN-QZ@=94)d|`_ep6a$QqJ=wi$i~T;-~d+6={^Y^~4gA=1d5KR_wjkU?d0$B)gdLk-+w!M)y~PrAhnX z3yjkrFM=*|AuP{?Y^>EoE5gW1^dDZGKJ|<$?0e|H@Zy%s!SpCw&{iu9%UxpZBeT`! zc5EFPn(m&HxZeKy;Dy-hk(=fzrozkH+o6zbm6nS^r+@#{ zj(Pd-QK!viJV8+jl|Uew6|5iB5&)r&i30P$yENn%V^vtEkFZ*cUg6dj_?685zmEI< z6NtDRJ&1vgM~M*g;a~?G+X}kMe;@#cpM2&C!IkkF{S)t&7!i(8C|o-Rt;^hj#`G`P z7_RF4B{RU51-7UfdNQ5i23l z>pVkL(Wu^#i6=!dnA;JCETjZ*+? 
z5}eG)DkvVqt_!y^kjoEeL8I>yvI`EGHuHP$t4LN@C;*q1CmF$2D$z^S62Hyg{?yN-(S%$vU;+MUuA2>=m0}UJc zozo&0Aeg5?H3!*>rS0!$B%;`%FRujQgiD=l^LvwlxcVG+V+_BsEsg!p^GiH!@N~VK zN!bgrIex8FKcP}zv_9#g=kp~CRgS%eNuCosy*vkXc=cSLt%CAs_dhev{-0mB`cG0* ztnFwA^tvka@CSk_N1a{-QNgMyjEMokeYig~aUoegZA$&^~a8WO+m zCDZ$RO-UjV9PMfFfe>;(Oen#A^T8Hg!&2lkrlDOf{ftMR1Tvuob;7TYKyd=92J_Yx z;9SY|X`Ln!0w9W>2J#2uGCPXNQ%c+G1;Q5%ecCrL)yV}~;$4NF4);HD{r!8yvggHt znPQz-Vn%{oih3Ax4)o^5-as+98PP@Ht}5!gQ0Cwe=;m3$m39E?IN20-<95?UK!X!u z8of&P0lW3E1(9J3zQ#KhoZ#+rl&9JB&(+x2g;L5?D&Jq2PIu779t=FPfk5XwKvQ!fRb=MT>A%D~%RN z)Mwz~3@~PG1I65TidGF`@z`rn6Jei(xeH^<+=fzjE=B44U(fyyW&vSRmqcJji8xad zssXKQ(oTVdZkNf8|ADAP>u;da%{FFMni~jZ1je)ST7`|3KJDa2jWU@$M@1T#xeNYvK^&70!1-F zZ>%9dQ%|i$xBmVd9O!`_7?W%Ivv*kreyoFKZNM)P_H5vLC6G|du^$MRJ?kgwO_i*x zhkzYI%R^(iSjr`wJYWH5VmB@TuFIPwXGUNiGpvY}$?9 zIHhH(`~%^g*h;tWDc zg07_aBK&OqJ`yOW7om9-FGe4r=Y*h#t8c;PoVv?X@V@?v)|wE1vt=b-wGJ~j<8X55Mlm&I2c=T z(4LjbYy1Nd<5s}DgJEtyyXDUo#y8(MZ?afxre`RF*nwCF`XN1J4n!Cw&KuHH+F zGQP1028nGv^1fGObz*yPosvkifTS)5{{y_z^>?*0tO(TEKMi*NcTo9%nVDZ^=9iiI zLy)rLV&AjD>PRRvt7k}Mp`?s>P1rs>-MYp@>KH>7WC3oV{gW)_+emi0H1&+R;<>Z- zl&_zhrvk3TxuT`B>JO!6MnKh&f*nk=m3yiUddlm$Zf zK$wqhp&Vp#%j@v8W59bwpMzgqK~GH*kQ5{>;%PbKXN@NwqUikXRbPUxzC-wB3N~b& zjiGmX2*v@p(Lz|?1xB5-(49cmMV^!mb6o{vDnCQ7o?zD373vv*u+(1gCoPcYm>gxq zKz07iGEnVFpn=b_DMI8~^yg193`XKO48D;&9v~Mys3K9EVS_cy7uPYUap}SP{)Q9} zVtasX7KZq)q$SU!o2njUyY1bw)`a3EuUGx~zPU3bPs_2F`bai6ekaVQVqE(R*Qng+ zoVdsO?g^uEsY7&a+6iki$bzFqXma&OT3mAGE^0QWJi|yf70BiD30y^NB+Y2_Orp2p z)R`4H(QzlKI(O7IoS;fs^mWchCdRK{tQmC5%0H}WVSQ5&*+?=+3Wugjx=BAv_O{Be ziawZccIx%;aO(S{pZNoL6t$vP$qvd~L0~ObJK*%o&=dd`NQx5Nl)>mAA!z`EI7e?t z*=zy#RQCfx-avmofMhW6U>TT^`s?^H)E_Ui0==|Et*@iOlF~(@m(exoaTE~Zn2z)W z0E`WMSu+4!XWyWgDG+!+R29^B$RDah({WIj2tc}A;D1GT{rjCD=Wn7v{Xnn+__DXT3QRv30OPsG85SMIaRH#Qal<#LW_B{@G;5L_b%JI(R)OAZPuQWc z+r!}+<-L*H>!xeA(h*bYH&}BedMWR+w~xDCI$daHC;hdow8VVg>q|sgs$|Biy^HO| zF!v9{&;%UX9{`c=5Ey=yKMLlgF<|fSc8MXb6NgNmN8D)<)t9iw?aBc0Jum2YDEL90> z_W0Xp%Im{*-A-A)YrVrOcUGi=U%YCZ_I0%hct@UEkR!QLgL@$dPK61!+734R!LNDQ zLAG7j;I7Y70VfIeDZ@?&U*E}pqIpc~@%I}8)*7S8)th-)_f|$Rrg+|vsJ#;HXPN;* z{^NsT75{xFXn+Xlx4rOBTqHJz6qpUu%;YZE1j4s*h{ik*q z3^^@@`w2;K_gC}NPJQhBT7R#5&{5ILzsZpG`;!RQ8Jh&ty0^D7xL1=*0Y8DPUD`68)cXh)4b+o4_u2NrRvu^EzE~@% zo69}%Hs&smy4&p!XgVmzS0_v-&CR-Aebd!=BJRxKaKqnjp8{~w|9aZQ{;nJ7Ppr+K zuph1xnzTK1N1Rz%R|%SragdZ16ck+S_^PtcyTQh8@!@7%&mk!;%xRL`=Z{CRobgvB zBqJujCiQ)F8t@W7daF8B07rJhrsv5QffaJbWLpsU``GP^hkqS%IF8^;G)lSAMgoeGwRnJ~+;Yz%0Ri zAg=Ay#WE5>kc_06kCxjpc@(xnW8cv~PkGlv*Z@XLA(FY}L3MU7~vX?y%jWgU@EU>y`c0%-pR8CZH zTX@GDar$EfG0u{I`$q*qk@PWN){^{CDIh$yZdsk)x}GgG?SsAL-iGlk%U34X5(i+938w2O zV;&3J$Cm~Vt`R#Lm$o^GXg(VeO4L7b{Mx?;HAW`cV5?C)IXyc=~LXS#Te6Esa^ zGh#D&+jj4liftM;r~=aHVF+el4>Tzb0)tDuAnM>_x$!6ABWo-OfEw{35PLRhwICX2 zyfs!KY}N9frnq#NzN$y}GyNU&=Cv{vm%Wn^b1(l6DC+MD$^N@9Mk0vA4XC22h7N&Q z8#Qus-CH+J4~s{}KJIQ-w1|8@S#_zDvv<<&u$}rjus+1quL-@Pdl`f~BU!-70LAnA zcLZD1hU6P)Y{>+8k1*gphDQsdncS9WlE*8Eq6@sjqVtz#IP^7&rv90*E(zR_hw3~C ztWKIE8RD~meGA-+dNLJ+w9n<(V6IhAwEj7`pp6bVvR(XvAlmf?&Pw{X0SDii5zurP zxRt;vBVbs6>qP%`Q@>W~*Pi-cdHUG8xtn5|3iaNeJe#A5n9=6?v*9$Y_-C&CVRUo! 
zfI_{mZga!SRmtk-eMaUTFj^bs@QZWsPmDl+kAwg3ez1tW(NR31yl*!MFR!yXa!qu0 zZvJP6&;NIK{jajR|M2_t5)4Dz=K(p862L>h(SkeXUoDb0IL~my--8E7-J*edQf*YVjHPVM8e8AMVbo{ z;_y^z8aWVX)K_`^-hmQz!`+Vusjkv44@|<$&aB&_6j{G-bZ%mtp>>d2$9nUh`jC|x zW9;wDsX|u@y{;r+)Npo}DZPcwt^wNOc?dq1$FDpK>Ne!-F3d?tcfYoae-QSaax|qn zAat)k(9~gnVnQ6emauQ@xvoGth>JZv9s>0opqqyxXJmt(XP6y{}id zp{Md*&+YfSX^_TV!t(T`C%*oo@x0pYS48o{?TZsk7uB)5V z_1kL}iQGDyPq?Rw^NI^k+mG1o!hO|2Y)Jme6w3b^uZj!;T#PqpiW4jd zDBUAHfQ_&4a#J5Dx^GTnCNA9zmm63`ig_gpbe*F0rn9=r%Ano?Why%oTRjv2y|Jr9 za^K{3+$c*K-?r3a_Ib%oa;-@BK(%qv_~mdflanV{wzo)=J6ZcH#fhN64FZh1_ZE?r zQZJbV-M$6es>1h_R0TCbVeMwz)f}7~%_0MLsoQTenCMQyx#ka)!uTxT z;BS@F{Cn@hcbl$Y6Gm)<@A%m~i}1(A(tV+p0l$wZs_Ka!h-`T^e^ATbas|}`v zl++J|?pd}GWtQdU1v1$dxZtgX8(1FCf|^8=3y!KvRt_;({k8c%uT7P?(?hEJ%gt_> z9OGjdgIEY-NrcT>k_>6mR0BTDW)B(jkYO~Td2qq3uEiS(UMSJIbP$jdxVv5jPJ+B|>yj3O84b_5gYV?bX3 zpCHwM-2ZSFDLDkr;C)6eUjg)VO$JcP3kFVilSvjy=>$o}5!c?E* zP}Zli~9 z03{pR1#;kcc2fr@QV=Ik_Du{`F5De#=(HwPzY8&+Aj2+ge4nfT>>ZtHd)Cx`=qA!X zRgfGKrm=;HeW>R5IFEL)P99@bUKwU@(l?^BJ@Ucp`^VUQUm$pn+}NwFm@3N}L>guA z$x)A!EkZL$P03{MK~W$!v@2olOhrlgpJ7o>V~0c6NV*mdUeh{rGK! zTlfLOM8ex@0h$MSf8Uaon}S4&hok1BWyE%^vnhP1v^U2Dg(BiWuk`hI@5+kG6DSno z(Om%|hSwk0W!i5wQq({oTK4*Dx%5F}&%vSW3&}eY;!5@teJB;15l3t_s(M#;oa^wO zD4u-ejpOwr{+b=N*qs`@WYN05ks~D4hbv6y6BXv31$ldssMfVW4|AJ{*R1=-{OwtZ zKV?*E_5H-_>6h%yMWDuW$U07ilkY+-D*}y^Je^hac;@%>2t0l4vW+9qK|RXgyT4XF z4NYzy_h|0mvi2+&bpg=Won!4T_ZBSLWFj-MU{aAj*4E4B>=339^t7 z(V5N)8vCb1(vXkPWAP}8qacc^JPLl3>y21}#K3KUlGrsu$i`}Oup2(C)B2`BoKU{t z={N7HqV}?F7QPypA-z*)M?=J2pbe$P1$T~ag|!B3u(& z<;%%Wo4+M?dEuxsOQ0+LvR_--MeM`U@E?eYgZ z5woh(#2s}|9Bm7Pc z-0v-dz1U+c_0k1}hY|bWP9VCu8}wdzduD@lcL_94&vWd9#eHqWtfE-k&-+&&}qg!=<@4{J)w~F&LHA%?~rg@r=`WQam zUh>3B=&BbV%Tmd|LC{|T3t{@GJc#c$~NYH!bgyMca(Wl6GZ9%RdY zEZxYy?-;torRsRZ5smi4$I>_z21A-qd<<6rla^=!1&pgSAJlz@9%Z0$@1uCH^sCk; zol=V}Ku4>DWf3ZDW+V~nI_UiK(28XmG8qPhxsR8sd7&>EXgGHEGs15aOL4~*g)k$* z7N2Q7lg0+Npd~bEO2YsOOem=NoM%~!M7aDw2xNSFn)=Qr73%&2Oi$a)WU&JOPRbpM1%;qL2Urxl(` zoWFl6cVErr!&&}XIy7;TYAqy87|TEjFKO+vFWYF39XCEjKWnTtqdL^#zP3Ns>!qjL z*=tUjI}L=#c4}+l;T@(Vx$)CbQG3-yq)d&s#N*dAlL<^x-4f=gTn>IbRguEa-&rfvExkN2F!z3jD5pk$DCC}o1RbXLooL~;iP%z9 zQ!Da8s|(PnGZ|3PJlt+Mh}vU5wXOOBS{9?Fi=cp*F_wr9^y8<2R*NlmWanUpbM}jj z^ci<^wZ^Xz_}RtE6v-Cb$n|d;EmG3=E@xpjcd~sREe)B^BDn&ZD@&!B;|qp5i5mOb z-dndl(s1yaV_PO(Rx zVtIbxYP)-a##38G#)Cub_bV@R+>vD+L(-%HQ+$~;)a1uc#xx6PpIHsJWIa9wlR~ zjypMQJFXut5ne?^J?Lbs!f7z$wGcpi*vF1U85(yER*|z@FlV1jIdLkO=zHAwt=A1N z{-;l)&L{z53w*~kZc9+;qv z;qvsUKT=>jUd4W<+bJ{VBk9Xv=C*e2En#VhX9%VK0BvE=;ewvT(~{wue!|A?Y1RoO zZEs&ljwhZf3ieZpvCeBZaVwMl15O7Ympze5XqPr*5kFGQ3TEF^6b9soYc*iz(A{OQ_qqXv};Mh#9&0io1)?fvGARL#XsugGf z<8#LD{k7GSl>+X9Zyq{D_VzOS2CPND-Oar{Cb#JwZeo8X!rP9)FFICGZ!I&nC%Xa9 zK|>pKR-Q<6GARH4Tp=*tuNiY@j)w-dhqzVKcgWtKBbBT-0*rmEW8YFR|mnH-6;?>HWHfrHRGvf zG*dctaF@3COsuiYI7F7 zOL_CJZ>9T$CnrDL>~ex<)2GGeptCH}so-$0k9&ZMe60c6Ay3kY=i{tG0Dgyp#z4V~ zh#W_Y_I>d*rjnn z(#X4U>tpv?Th%|@7@p(aA9wQHwwS|Pj$VqnyKUNlEz%P?aR4Pj7<;9) znHHHbBNB-ZUcBX;r-4>kEw1bDJ3Es&efzxXd54BE+ZV%Phb|*Dt*9FS`4ll9q33Uh z{6L(!=!QVUJL&4z;5@tCE1U&0S4+X~GZ-V;Hu7-H+FLYI^$c%w@q%ls>*k;)E_RmN9eWS+XE$M# zSxWz?TR(>;nK-W>CWI2!K%-MnX$rz~t!d6#z#6KRtvpoCYn4D@$d9kPsg!e8evu7r zUGo||ltsVAnX05Mh`ymPSWXq8sna!ScPwdvhMlWzX#H}LZ_gb7XTag>&EvbL;_5U$ zT{IrLq^P}3`0OFX(E^$bAwmTT#g^GMMN7_VOme5Y6 zCnA!f8JmRe=pyc7d8(CkNWAcllnHd`MpGi>H9lu$tkrK5*Z%c|kI}x8WFfdyd-|1< z&a>6_u7vA+EQ>!7rC{v@$9B>75K%~+EV>TLAehuCpVOk1|ZF~E@sg6Onf$)YjrmB zP_0zVj!}N_gh-kUwrOb-=mq4(*^Nhfx5$5KD7G9vn0?#e2O@#BRgjxNe>+vc$I=MZ z$}dzJ1NrQUfo$V*I4P}C_Yq-{kWs~LE=n?_g|8(%c!ei>ns>k4w8r3Jg^jZt2vm&` zXr{W(A9R1#S`a2L{p@?R^#>sKsy|TV-AAY`5MZ39?Rnu@UX!3S>S)8#t5-mEyr*y= 
zA!x^^u9w0$j!WW@4c#kOT&e2e;YSX{wiRM;7znFdVEr3(%{X#Im+%}OIzM*J zRa^W!fo|vcl7c7rrd0!@aEk%%A6g`4IerhM;9 zR^YhqX9Ln&=1l-bdY~1DG~^XhNIe>XGh^cYBs^V-=m+;YxE*4EwDF|-`v;pWyWNV+ zE+Ehi{KIx#kqA>TA6Ak17(E?8Qw8|ARa;!fpJ0av(G`6d1(QC%F$|8B zysvt>B7o9cW5W?0$D@YWuo&-H0Oo1)2F>_4u~QaWJXid42SqP)D%sQ3TsrmjV$;yU z`MaMQK+3ao%hFI{jrq?*HSdQrA~k0BjdWi0Y}T>d6)Um9ExMImR5sqFJk|XCexah5 zP4=v!V`~h)VyYW~b}S2o|2&mW2P279N@v12u|{!q#0yjJBaalL|r19LMOii+la*SRT!$^1ns4<%hb^f%U6o^6oh>VSEPpvl<+|C8ds{NFN^KsjH$y*f zGTepmu~=B6YhY>hRHMR99IK;;>r2MrBNu~{oo>BL@^Q33%_AAma)nnIjNA!j>EFBw z9RiU|od+sRY5KNAg1Bd$wOpe>*)|nT_IchG*FD61;mi9|t#XULS=~V#8ZCZ?xQi#9 zL2siqH&v_AvguZ|e!aInro=tV&FT6_cPDjaJ6w~CQ9b##Jkp9*`2wh;SJ?l3Dl4p$ zqXTrJU=+@Xei!b7Ir?xHa%@2UOwpb_b0&Vcc*wG(EbYV9h-bTRY<}jZ%(D9U?^4wK zn`v_Yy4e&Vn?od2K=tmKLi-Uk=|&LINo#xKXY-_{6wdj5hs@7OX6AeplY2Y7^-hC_ zGE2FbOQT35&wfcnPR=1dE`P2%e{pRI=UljrP&`5?K!>Ri?rUwW?!6Jyrt-BZ@FvB0 z_hjXR?Dmm`cx~~cG&d5>F$wdqIRK}jlCt&7fWyUT>NS+ZoAUi1uim$=AR6aS+}wHS zlVVF6nW7!pmO2Lo%4XxOX^o;5gSg%vo6$ z+=VgJCK^@F2oj^o>5%!%d3|4WsW=@GEiF$JT9)F;y>$!PS)0Z=i|FXDR3HC=;G2n? zu9oR6VPY|Orc)L(=hAU7kX7n_c7Y3cAmsd0`k`7k>IWc#vJ?W_WqA7a#>2FIVV=Rev1&_*WlOGA4u}m^PEtS2k3t zsn=l~ESfv1rw>GubtoyQkM4T|1$zaF!)@)32s7d9ef=Kk72#6QRX4j%_g zBjS!bwirFXknM;j#e89~lf*LUM`@?G7pAWBINKw~Q9^Sr9-v zo(|?=sTYDK_SIL~zVSD@*3YuN)UmzpbS=rP^cnX>eW9RT26at^0siUw2-}EzX&iVq z8WM!cNadD$ScIq2aZ)>8dBIKoMRfLw>+GM*&VOj($w}NF%Fg|4e^i}6Z=)XzNg-OQ zIHXyEy%VP^fma=L5;X6*QX#Mmbr>|{42d$+1GPmzvwR;U%=gnZJp(VrH@u&!MY3XN zeZ6#TKZTS!q|O06OKODKE!XPy#q0g zXhdA~93AiQ*V;1^J3YFS!T;Pdp-E=>YM|Mhuy2w$TW>Zt$IG#?f!b^Vexfwi@un~J!hGyK)4M`Wp@Kroub^U+X`|^0G_x=BoqEby|O_)wdk)}ejjuwP8 zl_cv_6cQ?l81oTXvM-@1qZCOcd$NpOlC0SoGq&tAmNCrCr{CK--Fxmi-+LeT-0wY) zd+zV|JAdiXV?MLI-mmxT^?Ys5*M&1SgZ*5MYZ2sk`T2$UvCUJfhkb19hJv)$h21@$ z&Rw=rY$Nby6%s3H7#~Gbb>;pBp06G}k#a>*=j=Uh9?1JVg}Aj(=Lzr-CsYuZy$^ae zbQ&-&lUlc)DP*qd-?GOtxDYyTj+i*1cWF&dlt+|G5>mHGQJ%lT%(5haQZG@X-{Qz$ zm!F-Le{7o{^gc3aZF0w2#Ah1WTYXLcILE}8769(YQ*R3%-!i1{(gqqclJpV8**)f*4&$2r^?PF$`@g*sV0I`!i zp)oPB374Ex?b{-{y38sg4z1jDH*9As;ziIwV*~=x&!xo=x<(|fyRnqu&v6SzI2?Wj z^SX}gTe;so+k&~P zKlI4Cc&w^J-Q%@gdeYIXDSiAT++x$nn`CBYX3u!r(P38M=;$n3wQg_H(NKX4#%Hb} zxDjg+)pVk_`g;0SJ-D0>I``+CH;q!*Re4`_TNi}lvSr`>U=GM?;xr?^!J#Au4+ z)o3IBMf2SXqaHQ~8g_oZHW9RCX~VNQ0j{=_ypOH`3fqarbf!O6aE!FF2zfibPUf+6 z&(iwrTB(Ph86WWtiEd&A7e=WZ)Z1pznu${>#Uj*{3cp-*Bl&<(6-7f#z zEC6yZ69R!1no?6W9SDiuG{l+_EtvP)4AT#}DH=NPc3SU3b$+onvG7;RFP!1orzJuB zn@*Phdp+uZ2psUD?P(-RZ$P*+VV9}9f?hyXSIoQI!nUYR?H-z+_UdL=$*ZTrpQ4{5 zT>rRMHr|Rp1rS_3Wp3?{tNw-%#cyWL_iNZ15-Z*lV=knB4a`5;)2Sbx{euXtq!SAbOM}OWA~>K_=A*s5 zX5_96LnoWAR8>+(+B&ZMzCuU@+HpWA6U`!hofv@M+ZH3dGXFF%e|rpE%sQ5_d-njN zkmcZBK?$gEzL&TsH)RYzAwd2X;&bb0?$D^9EamvV&DD~(BD=a?9!w9$ zoo1mE$2VR&A{Wy$DH_F_V#ht`;CXv$YI{!Zy`gy9a|59~rX7zEyf+LH1YdVg1{hiP zxyQ6Ul1YR8-Y}yL-@d(bK0hai-dwRxmbdq)f%i)d?<#~HRG|o!=|f+y)7@Y((4h6> zob~_n|HWkudIY9xRzpsx^=w6EFkP-H{$7i-QoWi{ONQ%saG{GvacIXS+2;M4osu8a zLtaScA_x`#?wky-GF_HRZJ;y+y_uAWTGQWzein4DBRx4#%>T5?%1sg42!aw`tpK#c zqIpJPsS?hU5}899BMI*h7U{X<-un1;U!2U-Am*#?d-N@Y{qA;$<#oErFR`SDEZsFs z+c-K8R(D^9+4cL2Z{p;0?k9h_EiS_luAE*vI2{BrPg_8A<6*Go+C5`&VVN8bbif8G z#XybP*q?aZCeZiDbJxkB-IDeU)Gw$n06hX7H#_wcIozvsKn@4}$PIS^e*&z@3|po) z^^>moh7ij2TDL?i=uZOWbRMF1ys^V4}_XUGJQ==I4jQp)r1!Lzhap? 
zrUUUV=o;L;l8tmsB~HLSu};6~s#Kon=QO3Zn#h=k-nV|he$vqrTg$RY-l|^1yY6`N z36t*H+$B`V^~;6^0kc7!u4`0u#PXcl+xEp_jplTUB}e|ZG5LS?xqq;G==Vwe|E}-C z4SHkkPyqwf)H0M9UgO-+>~RnA-r|>$k1TIrN^&kYKEed7Lnaf3v^ll`Xu%a{IytyV z=a#&Qf87-sGjp}ICo4bu_90BamRk~%Z1xXUz;Vy87#F`G?!N^6&mEKj6lc?mle`oE zHo4Ockj)0_`+!M;OH~8l=wbvxlg$UZuT7qC_CRw__1V$)NP`Y0FyLBjKE<)7WU!-{ zI2Hs7{l}-}$~C`!@0OUyGG&z6k9n)UAy{1Y#E+^R2Y<|sr%o_ok9QUTx5!3>tm zH-up+M>NBosg;COWpe0+6~yeC3i%^cV$fW%!O51K(btqHGWJ~8Ed~9|v=utWrMuUR zJV#i=^_ZLnPyoJySw(hlDQ)wJx$EJS)aCjm_F3lZN~cZY=K^~H<8lAU%eRwS5`|hQ zbOtDbQ&xf=UoFm~n;h;+xA@w2Kz*xve_waPrkfku2kS_yDG}_)XZl4EOFqjpJ>2{i zuo5ji7_H=msnC>cnKyX6f}X=UEoa)MlU-nysn6Scy+^{1+iCg5p!Ggsx!iFyBl_4?Jl64khbh155?+(C$?!#{q6@DX~(-6F%6+PK3-n=QlEEc<%(-wJGrm?^dbM*OZPwT#UISA_pcDSu-Tj- z$hLv01vxf^n^opn8OQlOKVWt-i*j1y1TMEKzt?fVShLx)k@u_kKd9t!Eqbx`K_88s zK?lw?dN>@|x|5snZ!ce0fr6l!<9HlcS|)zv8$z$o8-pUO0#;tG5st$wosuT#Rg!0v z(!U{`K*3KH5*S6=;TA(d-Ta<;@c?z=1o(t6^&5iYozpNRPROl5&B#6n z!w1mlaTp8s8tOkX;S_vBr1TR)O^Wc_PV#=l(TvE83Ju_vx5IH@J4fV6-p?d(XGqz~ zlb0^h$|vhdegs8#HG_7=*Oh+4;a6l%cp#)$H@CU3aa!rS)OX0VU6;;Kwqj(0z% zCXP6kV7;1>;xx0B5QIcRD=sJXF4&zq?SI34q4L)0F_SojU~w4(f30xOo5l}zSwD_^ zaoUu2O?AbwmE6Oi=Pew#>eG|_<)`shUqD$6G}z1dJ5Gjjs)JHarD5n~9-#ac*TMymW?DMb*2u*OYL!$6}QV zVllM_gM#{>6W8F;%2iktTZHMX>_0bTnrpsZ)5J5_03Z5UzgI=%t0}&e4d7-DIl zKcgCn#5*vLfVyLdsQTra6Kxam!^cu1pr^H46N#m7 zy9UwmF6W8cG-5@*#75d2wGX?|J>s449^@))g6JE{hY;Rt&rO4z5D0hWNlP$9@(mHD zxeYJPyaXxNwyNj zv*Dgt442fd-kKg@!Ru%(jBtAXBsXh6GMKnh zoj(OTpd!`*NXM_VSIM6*3Nto!(?1Y*XzT5lnx_vLOMzL|N>XNi2+1*xtw(n|3mRCF zi1GKa&#AZ^W(szrq3ojD#B9{t+isiK5BIzFHeIvhp5vo}g~O`u!u^_$sgsfv?bKjc zQYrVYS*&l0MbRy~i7><$u15s>X_|x$^M!Eb#ZhB zS2A$M%!JBQcF)El6f?P`9CO&ORxkPt&Q`m>N^c93xSx)-OCi@0Ad_boXhfK8(F7M?BGzFFz)As5 zX21kI&HMrM1RY_(0DqH!@mQb^htH4ihD|wj0Inb>Lol`rP#KbGhRR=HbJ#N95F}wm zD8tltreO*P76@Fuz`_*5tD*8n4v91}4dyzT1<4$bI!r7CK+vEs%Np6qA=C6#N2;Oj zmWri4Pcm?pcsU2h18wF7cW*67f7J0yusc{=dT==RN%g~9+h5sh>(AKB3tOtitZTbF z_%ynGY068DrHDUD=dB{%N#th+P@h~;vb@q|`bPVe&cXcMGuyhJMIAiUSMqMI0mI2G zQO2KQYLy^b2!iYRS=egP7JI3&w~CP~4zI+>tw?s6xjm43;a=sDH9D?ijso5JHgB}` zUS4zYl*jG`@yHVv1T~OBizVF!C-YZX=_dRqg>?iL3Yg>oQhjI*Xn?4yM~%0cva?yh z%}AlTMF_|@Dbwag#Jc~zmHE@H* z(NwlPvYQ1HV1BSBH#nZz@Ut4JXZ&7z^jk!O`o*2F1M8)-8it5HU^52XQNe)R*r3q- zx96<8G#?A?hKB%j;AK>!vRR~lu%bc?Z%v14a6Yjm82bB;J!uD?xLqm#$Sq(ld0u#E zchBrN?y^VfyOX?Ss_01>mJM60y~v_uMI5Ea%p{;l+w*BstD}r5HVQB(tLgC$$u)?L z1kn_%B+CZpjl~Yfbd}uG*wgOXL9HA}yI}J~E#X%e8C<@b!{s;!lx$#MaOSDCeKl(05j!c`b+XJ_??C93yIS(#d3s+lJ5{-2 zMDe-#hR4x)FMH9w3u@OoUR-=Sv>~8VX-u5ARC5F8Imr=q&*Alz623S}rH9kQSH1T7 zK%{yun|Lw1&g-;WxqzU*;c}6I??<_pB7SnYUYk9x6uzkul0+HsY3lWj0Py!q16D-- zS@l%DA^w}}pkc!uR1WW?a5|ysEQzONzw%uPz6@l;t8}}(X5X2T%B?+1={AmY!zI45 z^>(%bmUz`$C%Q&oM^CXHzq@*T-^mgGZq)hql$$7|doUw}^Gf{?=xs#bAqds>ADknt z8VkOhdN%ig{;7(qixCku3y#rux{55w8s&pU!J89?i(f@|^*I#O-m5k$H&0w*Nd596 zSSkN}6U0b=u{6KKb^7Nj^-p}%c6TF25GP#`8%o{)qwvPjrCIg_MN(pKSfsieTxAJFWmlmVJJj~ztL689Z%O(Edw>rxI&?Qf- zVu`SA=_55Lf7P0MCEMRTP;ykLjPt;-4U+7h?yC}5ckr^*<7SNJ18Pb6qc6vHy-rL` z6aEyf4c5Q#0bCnQ;9+lS1>4u9ubQ99E6&M3_dvin5OPj2isu8aATA5;64uS5TQb$>VT@w>>BzYDU)g(NC4M`<^i zfzBk zD5)O5sBW_55>tQn$tiuhqnh;lmV)8d`kh7zz9+`9<3>im65oZtLYAZdZ5o%r6306+ z)95##Kvhf?G=Z5!^ImBj4Z4}&7PkHRj@j6Y=PvPV;B79+;JfF)|L#5kMwB7$(_C7; zvCpGhw#|Z#rZgR2mLA-R+n7rf%=aE*x(UT4x%)3@A9~SsB8w;u8_h%^Ki2(?r;YhL zo|f$5@GF_!hJMCZAd~hR5$*4yRR0`_#FIcu!w%hbVwMrw_@lGs)w53@6*fzm6zV!z zs*`nfYY)v$=f+<5X)D+$fifF(B2QdR#?kn_%#Y6Y!B5UsaM{^P@GU!A%)Pf`OEtdI zOlP5l)UgHCi^1c1%@&oq2TzxV?4M06pq6+lQY|D-Sze&8HMz1RwLprWxPEcp+}oDH z$5S_PzW`Sb^*@`qe+j2t9_`!1DY*IC#JFvOpCV?`)stxm?e!VIIVona+4gH7;_t)| 
zhUkAc3Na*mlTbjMuWcDY0sZ7^9a?Z1CLpCPCdb@ffWD)0RnJnuhwjmOuMNiWjh}a_ zGH*^i&TX+ox%Es|P%i`ps2}KHttggEV!Git$ruOJMz%FGxF7OscHf|ZF7tSsiJNpV zz55(RXn4409YLqIbkb^%Gx5R=lU4T&XRX@wCY_af1}g`4QRExEJU9Q+S!8VeIcf^S z{deJITKJ?O%lIoo}le(mcP9JZqVNPS(U9PO!t*dTeY1^ypa9I z&EN1w*Mgoiq}(xHVZDnwJu#i?GTFsAkpx(~Flf?c^rHT2kZcQ+*av2hMEwwi@_)NB zhNfd-z3^a=VdQ=RjjA>B4osJl7NtE<%_H9s`cfIS2NPhOTxgQakw&#?iUO>U_5(Yw zz;2_#&of1mQ$MDvLG~GxiF{n3+Q#L);#;AI?BtBLch<1>8Peo5XBV*+mIhB6fnrrY zX08X>^l~BmE?5<70I&4~3;k|WiC}228N6qfEP@Voqf14La9MGs#L|zTGLzP0yRypb^f|?(%Z}(1d0~Cc_1p4&M5=`jHV?&|*-(n$rD%ze zXlFK2K@<^&%kGP$kSFE};*1{HW7BLp6BdfHPK|gYr^+@?^|PPRwIt$~z;x^3qjrOI zblF+mRi?*{A`Xoqr|h_sfG`L!m_-u+b9t-W)62o+jq{vA_@viqIhJ(u)U$^s8}CMB zy0@&HJn?W8H2Q*U14!RTVyO9?Fy0^OonFIjnB3v#G`UJE5vCEXyA%sX?qJ2T7RBNF zH7hq|KWNrlJ+z?ucI{g4O`VUmwjgGZp>M%>nvN^>4y@LuDS3s+lY?3h?#jLwC7CIv z&s%yEjIP30Pwg}1ub0(No89*=%(Ic3bUESi;cj@10ToOtUM~nPhxhD@xQ$DkmGZn3 zrSX-!5$cb-C2CBZ@TIUqHFvgAS!Y^DN@SCSA6ev_&rh344s_{2>LX^zOj-HZcugtz z^HSI*=k7}7y(dkCns{` zC3dad{mPHKFe_b*CsulOvP-u2zB&YQPircW<0EE5d0bat4q>hP{!bnCN}q|>_B>DA znYa04)zQ&HCzpUK#7Xe7!%@&>NBrg!>W&_~`AdSK%URhnG&HD2pM`6HD`=d0!QDY} zaPFcGFz`jR8gDkT|F*g{ox3_~QB^Z;ScIx&p_6q?tEcu5qI|jU>kb(E6Tn9d=2IAS4}i^fY|(@4;YZd;M|u(a#p4LDt<#;e+{o89Ix5L z-m--GFe#kLlXkcOKzs+IyC`l*;K85& z$&_(bvB8{Ll-fWBOK2X=*%4K&9tZ23gXlHD9O>!^GimdrCeEEZBZg zqx{`F}V2SY=1WOeIDFxBCyV+~Fj{_5V}Wl+af`-YI2rF}!d zRdU}DLoT@VYr?EhEFBJ9Qcqy+pXVS89Hzq0vjK*O1`X*DC+s~D_FYfV1v|(U!kNM& zlEW_H3w~co2RxtSO^3dBSW7t27GRKy)cuCY@=}KH8{l5kWZ1#BFlYvSR=H$BWFp(= zbi3ynsju+-pdNF=oB?u-Aah6{&giNG>u)`k21x!L z)0brU*sDFzA6P-S6?15)Ob;4tQwnFTwI?qfJUbNYNE+xuF9F5sV z>)njf*@nd}R7?%Jp136#R_EY`-2{goQGRNb<&|Ucd_#O#_|YK1F74B|h1tQ&Up#!! zD;CM=lL%abi;bSKS)90%=}J&I-l>`aw`9~Z7hoMwt|}$XrJCtJBfziK)9CRK$b^I z<2=@fum!69Kxl**`?{gYDuM$QS+%SXq`%Pzm7n#Y2$U0L%vl(y!vK8Erc=Mv2Td|i zhnZz{SgtA8O?L*vr_&*DssSO;oX!(1)Z6?2X6rdu9UJIjlr^~gEW{XU>BJltlLc10 z>RAQMfm&hnJ-L|^JHT&!jbpii@}%y3xIbyefC5bdGo_+Xl(Cxi7lZT#)BlJ(a}y;D z(95b6Xy^F+8xioJF`5NXg}&_!bPt+2LnTc%aFn^{t`rYYR(NZAaSBU>+yNEoM0W<0 zmy+kl$;`0o?J!uC+Vv#AFS94PfUNLzoosa7x`8k|?))ZLsEUg2tUJzB9;idhOeSu~ zqrqvbU0v1@tX)V$MCgv3JEW zi;PccJ6fgVq|FqsiV1F$kbAnk-Pq^(@>E5+N5jN4#aS5FrSn68rE1K3&WjBzqwc=f z-VODJYgFG(LZ*OXrhN&<(w)bAy6bTJ(Hy#E+mqc*oujLcABepqIYX5;SW$NOp&Hpo z)4GFufVhG!%gH8L9q}0d>zOP}yPV3YpguuRss4Ag{;E(5i;AHPhY1mn zlJB@xukP)~y}WsF6`2QbL>p%qHNl(kI9lZ`GvD>6*JF=+K5!CDdROpZ63v&jvFU^A zzHwhL_NU>_O_7-_X7M|`2Tk~ztKB7ana^otlM)A($dvQb{KRG*F2eblojiy_r6KEnX)+Yj;+!dNVe&_o0?- zsHE7Qy#hNC3%KqE#uX-!?t??HjhIr58_X6;Nm6@Wp77~shbNCm>idZv=J(KB5%I?O zoZ6*3h>aUCMkU}Z!{A1On6qDrH1h%l=VzvqMZD0?))OByRPH=?@lxI$C#h?ud|d4+ zH4ljH2oGU4e}2DPWq2o0&b;FO@F3C6qaKg?L&h?T+(uzpSe67>v6iPAh!!VT3Dt zTnnY$__u=s|EBT!UpOf%fo9|@As<|{3LPpXCbPp* zNZguxzxy02tOV7H)7v@09&vEAo>ij3H-zMF408Z9YDgyYvE4l~J~5z<1=ui{LgIKo zfiIK1G(XKcGj}<&6T}-h0XKn-UX!MAC;kh&AGCXX7d#J+&5wl*Z&kn*psCAWZb^%k zJr<7u9WoC{?aR&}%P$>L%=mb4fddnn!f=e_@GNmaw`JZo-=so9>$?DkU~zy77MZ{q zU5o*XZ3{BKyyN3kr!oN1p`I96EG8VVg zu=!>qXH{xg<#V_MjpmZwGd|xtr<9n$U&w&gO2A8{nE+%bozceRhqbc-GZJt}7rpji z*}UURl9Nj;qc+VoOb764dDda5qs*55MsC)5qgw???*eyb-#=nrF|s64fqNwJ7dSJh z3C=z`l|#f#$OB;X^_|=t5t&yBbrV_Fr)`KcvqMY7+wEUR-s8!DZ~_Tlt4ELH+-kwh zQJl3z;4Q$RIqQ8xWQbtq%-U!3WSFz_&_mk1rt+A5+sg0oQ3RqHHY`deq5&pcSa#|M zzHs>c;FTJ}gI-nWJ08M-bA&YGAX>xmQ=I0A$=#?sK&DR$vx3>_i`6)OO~I(v!0#&X z0bE7^z-!;t0hQSSB$6|}q>5UrIeOJ2U=qZsJL>5&aMgGQhv$WD03PU7ltMqm5}v@! 
z9X9E|J;Dy5qXy!J8$zl8P$Fc^ss+t_zV;o7#&CS0K~Lr)cw?V$2$k`LqF`A20YuZC z(X8Wzr;$g!Agi=yf)`vzcB4X^`{E|c;BSx(4o7k=)Q?*_6Q1xH3P)kTB$9C(Zhu!U z-Usg1Fcu^&7&16WEuMq(CiTcd;ed@)x7mhV7MJlgyg+UWYr&r9L_mYubX`uEYTQf^ zY5T8rB5f*{9&rDAZ`1Akitt4YM-8r>BQKqvUm95feaOQNK=-jJwP0T&2RiCijI;Ad zEyJuM4Gv=LFf5Tt5`#)oqVrOhm4W=Xx{&x+U9bXpKKeC>NScs-34MK${tH!bUe*K? zpb6kR@D43Z01@soCV?ZZsKzWwo=ie^=>LOmOvGP;obQnIAO0DFcY(9r2f`9>5CqA1 zAlP`O$e53`||dQ0olyH+AJ4NeVYd}ci$FG!X4s-a*w z{8}H8jQbqC_n-aCWf`VDLv=yaMP5l3-*Na<`?$IS2rFLV7z^~~Jf`X} zWIIq;z{DQ_2tiaLeO6;D1K`W()64MF3V6iPGha9>z0!us`8DiNA{S}G4ZxVg{zXO8 zu=cxfn!${%CYs4V2Sv8F_uE!~$9`hC>WoFn5>D10dM}Rlw!RK2Sfd8BVF~bujnYRjNq(?|loB1w& zfC7E&!IN(~pbp!U;8#~c$56MtjI>ZhVzw}-+5|z2lex~rF^jt314u1^&m4S*XZSuB zw%rop6=RHP{o(tYsgo*5GY|&IZ{jowHc?v22Z^#)kSO~ap8R8;tdSL4@|A&XaPAa_ zkCH)Br(-+~0Oq~zgUd{Fk_SPum~cbW7W{R$3ZAunZ&H=`m*8lagoFkGLV;K^Ezs!E z+$HidUV{dprQ_J=wci0733tV#rVO(l8XO|oVOb)(zae6(2#(YZfZPKF6^`m+xx zh$9^iv%ckj`O*P3#2FCPWRGzCaxom?RfK2?qa%EIa>PkvA{w^BlZ>WSIbsXg1Q1|9 z0Y`4HxgX4Wb=ks?!`NnS!R*n^knRzRZm)d!NzNP0U}uRGkZLiIX402+wCOvf z867-0kBgiDI8dLB1c6Fqq9pqUyOaKaG|=z`w!Lm;`uRHyyFq&E%npZ8n2Bq+5e>pZ zB=jL039p8AJI{YZ^isZtXN-SQ#Ws*SJ*So<)hyqa-w>0kY+^VxcmcaS!=oQLd%J-? zFuIi8Jx*!Z_Fchdk?WvZ4hZ@x$EQa?Hk~HS^25@>qc4_@|CmqTv>@$OJ5Ed_Wg*#t zB|Cwd*PMn=nG9O|Po0?8Xe-(K*Qo9Q6ND=us!N4_3hJy#7X8>op*ptn%)Plz3=IV$ zgZ%VKod^a=*cn$+_5$Z_E|w!G z_)DFbWCN9u1uAhEgqQ3R5aSNX=7WTo+P*mw8YlI;;kODroR*B54aY?F~mQ(ByItLiAqMj7$xZ@O}zZ{dp!`4fHL?Z*}x z_C`TxaYJKn5bd*SK1a-?ip<^x;NHCqIgS_b1P$+?LGrK*bhQmO1}V%3P{qM9Fb0kj zhG@^6;T_@g3#e}hO)zE^4L$gSUi9!Q?yW2{p@N)~Ulkq@Ppm`=xW^ebDOw&dFnT?9 z!%{Wpsfq|CtntR64Ogk)(gxk|wD|M8sAJEb%T4A*Im~GkN!wrCS^X^Ejom;>Z&O`% zh<_BU|II0~o&P8R0HFN}v2)gnlf>4byjHGARLG}&0d&Fh&)O~&#((Nr@ipY-iYe|N z0@VkvH$Q!Su+2AB>E;(=vcS`4kE}k;V3{^pKvW!7-tY~v<^fb)gqk-3=!7UY9k=)h z*ZB@N$i^)Ou=mSM8iPK73$~yD^$9beyOb6UbR}k?j$Of0=3^6szac=)+W#o$CE$Vr z2$uFRfDNJw#Et!H#Ee@gAumI|96P0#>iQ}lw?8?K^0f87tCcgm;ko7t z{JsLF5-qp_TK8t&eO@m|?)DE3@2LBB7pC0aFA$|muCe+uEgpbm8Nr`Rq?jDuyVA6b zFh@4}g}Z@~*9pF~SD~M6$|rtsz!P-Y>r~~p3hoE2m zTzBZ_-wOhU>}#Z2CJ9g){PGOKe?Ec$hLi9jMi{;TmTBB$8ADASreO9^0m*EFU)p>_ z%no_Y0LjDke8B1z>81@eEj40;IaUzV*D8UV*{*|-bpOVQEf?HJK zXwN4g?VhU9zf+r!&^wc`y~o=vVw1rN0Xqx0=9v}@*YQGcyB_=TS}@H6UP(HvmWwiCYK z=57f=%;OcXBJ}bhhP{ZxlRjboh~r44u=cQ##hk!>pkT4``$YUv8S%bCcO`xXp|dCwmaFKkuNeOoXAw1F=HB*s_-LiWp_D?)Kr% zCLKR@)uEJE;puWY9n4^DjvuVye%SIfawVbpfT|`j?TU$Lv^hGU%|dV27U{?LKCcKr zp=uHnTDx~kDAqA3wO-C_0f^7?^OX*LrJ?%M%8-J=udUM?%+I&=;qPtc{<(C1gYB?2 zq#a=xFym;f0SlB!$#(XI9J*_LN%(NEH4;AM+crhF@{jMtQ)-HZp97O(=QM~;P@v~L@{cYLr z<(KLb*KtBh_IBb;X}C&aAW3338!POaA@x2#S9E)K=H{IyCqIB%Nt-*=UZO9mJR93HJL@_aL^nw1VeA@dOi-2PZXFW~KYtJj} zCKy(_h$U@%6i9C`M&MbFED1tuY%JZ(9R3n(FR(MSnnVk!y!AqaJNm0!X&m1r{^|LdU6KequI`3!UB7Ko+JMuQ1JuQd0l^*3D7EN}_$96zna_mcKB*QE3nOyjg- zVj_E7i6ZP(m@1TCi44=tF-eN zM&D0PpWpK@r#Z10TMR#bg~a|sx^37b`_v02>16P! 
z#Nn;z?2FpScQHMOdT{FJ*DmZlmFk}=su+_jXj2r~xmxx*(8q=N)?{k1bBEhNMu01E zW1l-eBP6qFw-V>__&Lu*NxQpke)7^MfA-Qb+9rb7cs=u5Ig3g?Bd_l!Th5*d?R5Dc zUIzWMmwE6F@jIjGf39M`#myX|xiGYuPx^_WUD#?&AZg2Mb@Kx;sTTH4rn; zh)Y+`D`XDZ#?S9nxwTc^H`Omd)`??9I`BDOaTldsUdhJG#4;tdQWZ5JPaiL zos_nIX5ZzXtohV2uA**_)cughpDRbWbW;U~tZ%6ESQ~X;OqM zhiYzPt1%ZYQC_H9r+Jll9(nS*Vx3dR^T%P=3kndf8EUBe{j?z$z-WWtv2Q$Z#BYD%-UnbWpZ_ac;={3oI0 zqvp4^AMG03BC_FHY6M}@)-m9`!G_-@r5b+CiuK@D4w*#Ep}6YdIW4J^~y@kuaha5T?PMEUTVcL*P(wH8X3 zy_%Dvi6k^FX?#wn6(>1QX3H(`oGAstRrt` zMjxR;0l8>F_k_~{*yfwTrf9TN%2lbo)z2S>H*$k{B`}j=i|OJ&%z3UZ>^krbv6iV@ zsd^w-d@lBEw!O`ub z&Nr%lhHL5G@Al^Zu}-T0S1#~O99v)kK0aH8nFp<};5=;3aE+6)75DbQpNMN6r?%@| zj4O@btg7mN)+n`n^)}j}BxEqW@oS$b4Uql2RcWG_8jh>-13(>g50v-%CY#UPe*G)# zenhI>DYv0Z9&%Ah>&_#*Ff3g>mh)0wp6a!=jGP@;%2BA2G0IU^7M)6WKl`!1?eOb$ z_e!>H=nX+8#P(<%$rS0Z;K!601%-kMU$c?AnIkGi8P-5jpCh9lLcaxUt?_)e~! z&q%Q!o|mdw^!Ia*hIb`{5oqWVNGL9#HqCp<&zq;cNp8t0R*{~8+ugQj%VzgaXhnV) zdh2en;aCzfQ7Jr_L^BGNqYTwlZy^Zd!c|#0Cfvc;dGv*lj&({%&5nb$g6})Vq6p7}>P-23mM1f4bF0v(EqTmOiQ^&VNZ<>tS%QNVDc6Zgs%rjku zfvbpqP#&fl!-sbQbE38xU9*(W;S?|D$_qTcukB`tZ+i=EbVA?jam|WdUH5ABjh82# zJ|I|oyfr~nqt8p`thp~G!?%Rj-3rHih&O0I-FK`^CAP13V7o%toz(aa)`Vo6w`MjC zeU~jy!%{9sQntzkUd=)Cj;`VE`FL2iYTfOq+eobwTRN7a?6?O)l>x8WYsik{W&(Bc z^~VOm>zm(b)^duemv^=t<8lAg^fYSq4yDam{->+lK&ST+xL#r}CpDL&P)VeZ5G($0)GHRsoVqFd`D;(Gp!;~;k#c9O`_Cp20qw&O9ZEqKW$CFD9L z!NuN%3T;+7XB)AZyXu~`@w&YqiA7$|6l84?+zyoR$z-~3IYwlT=nck1gv%$@$^k`% z<@Uzh;_ZiTvL1>|YF}*ruz}H)5IOVa;}!ui?+q-*ASb2BoF&iXt&mWpC5Aq`qI-re zV0LNFEM$Leu+h3KmEgK4bv@Qwu8Sto5q8{lWJVwm=pDR4lQ|$5#dWgE^9Vc-!MKA% z#&MkSm))87<1Y4W+uFYFCE~MATg1j=Ji@+E|5K6>tY#;Q_0dG&k-;aK={w^MZ^$1$ ztQfkd?*QM(^nS^WRdGYNEML#Sw8)6{@+d|nXF02RnHE=p>KdVB`0t*BwpzSSeEj`6HplLPbyeN7Aqg;%)a3~9&T)IYDIdOl(F8g3l8 zk+Y^Zf$aLE@C*Fz5}IRT!PD^d6}O*tb(+_6QoYhYmtDx8 z<-UKe_Ve{kP6vCgJ>$;eB;xH^x0r=Bjbyev)0%==%aoY;=;;^K^d;j-^w5mt1xDQY zV^Vo0q0cr31Rclg@9L;T93WJ5$!x1~O_zxNoSY z3tb#K_tMd_az#TqsT(x_0#jvsD7$^~Hf=wBmz=Qg&(sW(efr&r2*G3)@( z*_A5BJC$t^pamvZdD42I!5_D|2+hLa*D^irsSWou<#UwV=wqAj=j8eBN1Wp^91N7P zezKC>Ca6EPPq;0o4U9-*109Djnf^FF z#Rvz&wzh|%&rT=b_wus|6*Mk8Y5L&95y#_us9duRlkM~xM$kK^E@(IFa-keZNE#P@ z|2!}K>3ICQlFN_1#JJk*Iuyf3C}@b6#ScAVESsF~`{--Xe47HC~reY+WIj zRwNYh(bYb8&7~s$K9O^JLwD`Zdv6-bzR(j|bET12X#g1pc1L2@xL+BkCZ^IPDf9UO zH(4%TZqIgSXiNwNDXYCRe_a*h@;F(NcUKvho+Dj`!p+c=(zF0QX7V%pj1;|FGhJl+ zX_*I(d!AlZzd%#Y=)H7xUHTg>6$JMPNfhpaww6e?NdQ9CM}x z?Dq59MG37nyIQwhaMc~Wz)gz-)hkb~1dIkaeztRS3EveDle=^BTQtSV=AEg5N&$r! 
zXMT+iK4$?f?~O45F6_BV~5_!4Zq#wBw?lE#Cv{&FeB>-O9k#Ftp+wvo?j?WLd~_2eH2l7Lss$Z?6E;t z&7-upyv)~k=1Y0M8vX;e+kbhqii=4fXjs$-b=i(l5G+2UpX42r{=MG(U;Z}#+Xwz* zB)@7T8|gAPA8A-q6ejrDxahhFROb+_?bvzt8I%#@@~e!`R~jaXfhB3+2fI z>TBK#Gm`(!x^zRy6WNamwf6F_uErKdjj6wsT={vg)@YZvW)deJV}tV}XY&y8eVbUf z(lf@!($mJ@C#3gUAl$B6CaoUmaml%HY*nz}O2lpDN_6+u0p$=_#Q}J-iHEcb-S&Bi zPeJ$1c;B|iZPj0zeDt4eilB(y?Xk$-do44fFp#$%M>|=JVN~Wa)t*?uO@tjx2iGU{ z)N_OyYs!nMRjqE&rGg3t1JUr^G7na-yKTcpXvPC;6T+7GLfB3CnAi0t6tZ7M_IU<2 z=DqGMFOTkiURbj~;D*4#%{*JwC%RNFXo)$7vjvb;WWfgR%)e&Nwp~B0pnzE z*J#F`dn4~3#P?fpMQ^;C*NhO6wK$)P@R**kr^!@9*|rmf%6_?n;ef(-xTM|5QS!mD zy!O*a)5W;jWvkALR0w{YlK0VBgJ5RBYZg-o9ql+i_7RAsJO;YFCvQGf*P@Ssg_*DX z=~JlHWa z5qt189PtAoA!18_QaBVrfZ$%HbN3O3RTyB^w1&w!yJ{qC8ISW^CL<-hUT}{hyFCU@ z=5nG5I}K8H*(Yel9zc9U)X7{j#d26^EfEy$ME81$8ipFvRH@9*C zGZnPVinHn*mm=BMWaqx!R(s12juo71;2q-~0$Zfng3+*{8JZ=k2%5o#H4t9=Xqx2X zIZXNft&W0%S0!oHv3XClxIeOXd%8<>Bq2kU>@Ku`ubJ`w2in{|-xu!oWU~&$A}L#@ z?5gX*u)WMLM44Z%UI4AQBu=V~R_2B#k7VjF}?|x^X6{7rL zx_+QnzDR1L@zw=T@#_tJvpc!l;2nfIR6eN&Ddx^wMG{ayL@P)rjek&*<+cBI|20uN zsc6i*#qcFn$1Nf%^GV1XoYy29anAv_du$bY00kZ1T}I?r|3Nr1m0U8teR$*NQLzy3 z9Cv%EMW1(alIfBw_Hqp_nM`J|vndCR-p$a%7aSZ1G&Y%hgqG~a?pDbx zJT&^~YK6~o-zL$(GKXtt2+bSaZL9lM(my5g)5*R)Jzvd=E`^+UWn5l>eSGcXtE@e@ zhDy%%mhuM5O!~6U5YM!ah3B&A{! zx?+dZY6O27BWZ&Syqe=+EmQ7{8o%)}FZ(t%QgHB9@+bGGt^?uM#}|ek4YUF-olano zicIaZ3H9;o=Nq-(=h6M1c9k@}i_0nO^Jc8vGXVDk9GCGc@W%^oaoHhfrLCxraq5{R zGogpC950;5jD*C!l}e?BK5AS=>d>sy%j86si0UTmZa%Uj;&NOa!s+pFWQvS9_5n_j zYqp&&a9QLP$sXAmVA^y6JISE`5GZ)6xLIvdBk8Qh^%A{nm@ena6)2{xMDPVFHi(>U z65x95thu~Vr82{0-`fxF&kLHH$|NYjy4e>}LdgE~ z-4;R5Mnm2?9DCi@9#m_>yO7}eWW)REuu>T4PCyhxOhAuyaAltW2~E6~v&Ri6lEv?j zojuLI%x!J+AWYjRd@U(!sh^<_-Dq|7W*m0dVx^L0M$Iy7-^(B-Cz0F5vvRw%%Z2Xc z=RvN}hpUA%CjqVZOzpt^7r95{rrZ=7wf7%#w?j`_XG5RhoS|DsvA1y~Nmspb(Ua_yaHyG<>7o1s+A#)l*C|mxXBBhUfFgB3CQ0n}vU7df6Q}@^8 zvz6(r0?)~*ADR&C8#s1ywGDyd5u#MxckSBE!r?=CM+`*@MDE7V9D{tm!aCb(8E{qG zsK?4X4t?WwrghEr`-3ODjsVQ!22qA4ift*O9*`;l1>V6WFX}dMnQ;lDm0N9D0W#4? 
z2heeldyiZRs=7Pxqf3S+x4r=noFV(L+yAz-`75IsQAdG~?u5M(KCec^}k z4XG3P1$hVEbS&T=J<()(kj~3}j*fFg&-MSXrddwMy~lC>{sMAoA%K_P^^A^~g^jAB zQ$qHe#L;&Jlr+`z4AdR>1f@31R8Qrzuj>+|h&wZ^b7LMcMm*xK?72M??<~o6Hol;> zH`)GHt-nfM#-~U9c9uokwIrnh)nGKqksw}L;+~eCLhHQ0tomteS*170=fJMa7-x0J zja#fVmCY{DM-v{pAim!gdpk>%uo+)D$&r5tb@t|BZGyzMvD;}gmci#-wIVT)&(5#b z_ndqgVD#!v;j0$hYrFRq=QFhC=t3Wwu^Ttg{5V2(GVbVagOLUOH;l$UKy3Rm zZL)m~5r-t38tQM;5&-+~+z;!ZD!@FGrRx6f_c7ln0Ox?`{zOf^`U!AUAHWf0nd!;h z0RqhqvSqZ~5ZO-eTsJF!qQ2rC>7iBL(^IxPMY&(8zo=stxl9bGxe#a`YxdKKs^(@` z6qp_MwyD6B0-Ysxb8~>TxWJ-Es>+{@BPZ=$U$$#^_IJvcmCDF}Y)*xL9{j3b7r090 z)ol*hN5Cz8@p(>mbm+es)KfIns&G{yYr?dsGiBnosQ8S^VJ*ksT{lklw{gQB`@b*^ z|L(w>01P!eAOO=0KS;~C>g&a`IFeX`8mU|p$)M!Fy8p~O)V^I^EXw1X`Fz*Y1B~m6 z4dQ!vM-k4k6iNR34l?qXB;3J|8n_tDJ@d&GA zr+e2|uMFyc4!l4`5mI<1?$x{oyU3rfFRQLUtd!Pt<9Zo-X9Hqx1}(_IND<2-i<7!H@t?qQXBQ;B8v z+!{~C%tcAbVKG}GU%^hZLYEEr&^%(%bz4F8 zXq?TbyjSNArrbr{UdHPk_RcyJT+QM9&F7I+Dc|LLW*#O|Sfu$t!?aWE(>ZC+tHH|- zTO%Rf3vgi~wgodHGqeApS}PVd&+i~IC=vT{&~P9xR@(67L(|)LvjZI=4E=-ey^o@i zk~dQX3w+kSLQnwJZ)`KFFd@IaQNsICgzS~JH^B6OQL+XhvzCuJ?B1usVo_w z(V)hPtz8109m$nA5o|8sH~q&b3gwo^$Xpc83CF((7aj1bC_b_K<`my}N>iiCrIKvR zk$K%c;l~?4b01)wvzH`=5aBs_*2oVO1L_5<*VT#67Q~$ocHQS@oKi`Ine%plsZTY9b{dX1_l8F)ec9^COOcu3I!Sg|EuM`btoJ#2re{w70v`Z@t3TF$bZG|%uHsch5Ay4c} zI}c%}%8k;N>D_*k?ZTyO$!8g~7uT;gC3pvD+1qKkwLEKo@kXpRiybvmESnr2+f#89 z@n6DT+n+{H>F0R;AD&8oJ00Zd2ILI3RUUS$>CGsV?u>R&vo|@Y`S6f+SK(y4k8JDX zON}@D=3He2g-UvrmQI&GFbaq%Ja!84M*JIVV)22_Z~B9uM%+W3P?`jn1iIoP96sQy zh=FmdBs~piGV8lo)%R^URXRpg3D51FKyH|UShWykcrGj1opv<~cWd3AthI+FeQ_kN zzddgnYhs+4|GUi%5b6JvaZUXn7%qUEFjkZZP&H{~S_kpAm^g72qC(0u@{LW?R51_B z)G4io@&jADN4#HhB|RD(CQNu+X*3yCUU<@9oJWxv?EiLMu;4O&PTE6xbzxib6}5q^ zjYo^3kiFF7)1J07tIcnY8hdde-H44&C8s60axCqbWMC-FX$B-$v)#En?WG=2BQCcV zZLd%DUGLVshi?-q110Wtui}1xCXj3VEuVXY{3);hyFvV4%sT$gEC!wkpoSw50423n zkkNrelUBG)Z$81d^{L}PrS&ylBj+Lb2|of)MEsh^@++UCg^#AhC5p`4&$xZQx43X? 
zNc42+`LWN%#h5p8lbR+Uppv~5)!ovG{?q%AJLTG#FAVye?r}KsM zA(<;K(3OGF8o#(?zZd`7sMSDT`u+=#?Q@;|UcxWE+q=Q{4W*2=?BcVsk=1%si!7#W zi>Cq$0mY1mHUl)19{26s9?(ekzh^<|585Kuyu`V;k?5~izme1UfsE&DHLdoCHtUww z2igK!u^niBsbBx41z!+Z>QBwWg3~C4!F6(bKP#>LHV}i=r%wR(+2>$qYx*J#JKOcf z<0}Ut8;V6=TaC=pB6m`bF47suo;m2N9)s0>+>LIeXP%oy{s&h4JLV%;pR6^h&!j^( zFk#}t19d_K#ME2_)!2!ojXXhkn)GympMKf!^H`G$?~SJ$(zYB$N>3ucYTF-nft-r`| zn4z8Ci!H}3FH$a$?NL@UldSU^qV=#$o)ZLxi~=18FO@Lu{y`|>-s|%UW3ScacjRj} zXy*b6*%=e4VgpT$*cRlgVJmaVD5I`_* z0u7XcT27!5D|}Lf%Iq@bbtjAUxc1ml`H(Ix&9_Hb?MW$$0v#{rr*+=z;6npXtu75x z&WE%k`KYI6a6^={-Bc?)j{oB|FO>8%V^Nsn#S70%9!dr7e$#w4nQ6qY>}ak6i3L9k$@2=xzR57uFaPTD%awNcHQ62BY7i5ei*XtLJI5$U3(^XG-P7CmQ4ujO%uYE{aUs5 zat3{0tc`7z464Ngt@A!EWeZ)RA`bxfP%UYKBk3b8M$52d9#=jXc@q=!9m1~qVGdW- zSE4WzzVKF$;kak7-h}g~lk&Bn-ZIJF#zQ<@k>zR2$0+Ae?u4!u^vJ3~kJiB{4ln6I~tN=MW?2dX_R500a3bzL(O{gZVGLGJwfB*{YT{@zeCQ#NTAMk20i5^i4)#4%G z-cIsTkL-$PZmQ`|EAY_b@6{7O;1ZLc@G`7wMiAoIA<4@yMsGuxRkt8mQN~35WmSsd z0>2fK3w1tW=TXa*q!@4>tJ{)jyfD(B)yIFu^MuXR!3DO}m$!&iSXE|GLIpR&Bnm+WPjnYQakUu=wI!F}OKzif z;pcr>qz6{6)FuW!EM~+URd2)Y+w-ATFy06Az?9jRa*|w48YGxBYl=DIp-m+m0W*zC zU%lIc@*PZg)l4U1q!lHUjd?ES+};xy@x|-DZ#I;fTmbY|@wBRYC&<1;-6^YRIv<(4 z%{P0x!&1jv8p4!i|Eyat?pWmo1!3ZXUUTl0Y?bVT~iyi zXz@v*^1zCd*TIMb7KTDZ?1KZxle2|*UG~BZP=2&~b#lZE(#@CTJ-+;9UL3>2r?TCX z+h(fex`c(o)vxZM1#+)5CW7bppecQNU&-ez_PCZ4^Ee9=@%Hm(!R>cWrMo* zoOn-6hgXGO$Bs}&L*Zkhr84_N;mNdgFR<*!9ZwEaVn)195lC^*#>e$WEYqGSy{f2Y z@h^%m*(+6r_>vnUc7So#wq%$>dQMBCtB?ypL@UEa0-95RNNb-+?JE1YUE4L1$?M_s z%*o1YLZW&-E2zA!YP5J4-_4gEiHRv zb;O5MkJ0a4(K`IRUWAZ^l`s)2%sIpx|8C_u*VsF_zu(;*M$cvMa@-@0lDUW=n{V+7 z4Ur}XPI*87TF*-4-5gn(yjB&lbXWCb))eW3Lkh7i zq|~o9*2{Sb0(+feIhjme1BN!xFNE!c%TRZc#fr&@>9#1n=)#GB7jx!OqsvvBw#8Y! zy{f?kg9RnRyTfM#zJ?!Q{4h()0`21HVP?RQs`I4CxC$c@>UPcSrhf{Y@)*4zZhTNp z6Ou6j=LwS4I3{iDkuxM5Dlr(cGh`4~m6lKkny=W8O%;nw4K{hPkLPW5FL`lK&zwU| zg&ldnBzCvV%kNBV^Kf+V%irUPZ)JE)y^L8_fw7 zV$WLl-VXIEesS4?6lW1S7k{jCz${)Ws0V&^Y3ebfzsHo}pbe3`U`rxVFEQ{OP0-f1z;5%^Ip89E z;@LmmWdFC{32noOqu*P?=xxR@noMjc;v9fDph-A<4t5Pn2I1L>6|g>zyW=|%V;(=m zY>zafnL;pY8b8msWyI&O=98quiO{)mjjoAl82&5>bP z!CxrV-e_Um=S1=%z2yzSmJOejm`DQ=rBoQz8>D(kJ&|-RZL8TJ#8MDYPHiDf(i=>H zcs{Zo>l|23<*lZlNt~lM)u8FO`?DH4-RalioWEi@jEeN#`QSsfm%c+TW$FSvoR79Q zc=+2HNJt21l34s93iOwbysXCxkCtT+!f>&&kqY1#~0wovzw4f#6fEO41L$u zyRAnvw5eN)ompRz{DH@RrlvEeQvsds9uS-bqF&fDU<|nj7g4)`wkIepgDv&z5qO#m zo$(uui$n`9ScL7Qs)Dzt(sB?>7t85Zd|TCbG^YSN5Mlp!a|*z4{3#hsDFT4|!~j@=yb}aUJAaSdniki#vX0xgf~<=cq|tKbk`*K$11X zogM>P{C9}Qt-+_$8s4x0TM6WrYJwyEZMr`_k?*xrh8LL>M{A*5P-hflu&nLwCx4j= zSI~{B=+9$-XI*;l+;_-`FHH;@3AqwASf*0T|+kb(hx5z+M_*wUyZt=%W6M^6`6mBW_Kb3be^e8_|!xQ6^4-5$-^c zCAVNs5#J&H(U0M{lJj2?@E;!5wL}E{JxAdW;oh|n@N|+Q|Kw)T_QTD>t0ekQZWbL{ zM}VBcCLT%QDZyoShLkiQSNITZfMb^XRmltg0eDV~rEJQ)2)RXnhgg#!=hEB6jJHl9 zlf3`StmxmgAveAMRWogL3K$~LGeM@l?L|9yi{=xC_-w04#J!j4136Yh2<|a_qNGCG z9FL7cuHArDz-aM)lN4`cIyr&}raN#9`m@~|B{gaL-o%a=JZXVuD!4}ezVQ6p&JK_g}@H=|wn-@qe(~{c^$~Abq?TFnDlZw%%C>;SEA67<%aMU}dY#h{%b< zL~I)(F-zv=Q z11_gT=ZE|`xr>%%))S+Ky#|ji#(!geC3#mza;GVC7QGWuPDPZvgyxaG^T=~nMDfJ5 zGW)NK1^U8oV%tA7M?!WPJ8eTBJ^)ITHot2qWBf8nKG5~HVL55NekdgTO1nMQ?iPh$jv^bQ@jB!-zrgi^J!j;(lp38fT z9!Ur^?)5Y1hwz#*zy2To%3tsP`9qQI4@I^=6xsezWcz=w$d+H^W@)o>FX*xRk+h2b zLr*F*En-s>PsuiLy~Ag@GJe64eK(haff}wJ$mJszadd?rI>rCVm$cRJE=y?y6Ozv- zUu};mhp?dhaS#5l9)5$u@q_zvEyp0-M;NoFL$wos*Khkzk+gnI`v3N+=wJNmKMqv~ z{y0>zLjE{Z{gjn4{BfxI5l`0sUmvPe$0hZW^;u)tl#et(e5JaXOsMtbt2^xPBNwt93z8u)Fx0ujYv?{A}ZMFAY;XaYu zzbwPc;P*!~|D#0qM~Uo@64}p10>(c|WWSIa{-HlNfaNvt_k94o*BXG;+r2lC5G^V@ znSoHe^+T#+z^OMjUVa}}=|V_6md3gXBOd{k)#=3^dP4*HCso!h 
z1Ua_O96$qX-yslC5te`v0LOGEaFMVA?`%u}rA+~S5)yw~lCv-S^Lv9R>Zw2p-%cU?f{3_pjr?3UF!HFZVpx0zTn~OSC?s0$&DU1AO3d3Pj zS(WLMUG65e4^q zbG2o*XAND8stjl%CF;R%e)4H)XPd@_e!dGK)&BTkK6er7?bc1UmgSL4q~nXz1f|cLSl?3 z_i4%CHxqRp?uLu@zf^;6k$^XLsv^Z0nm>x6-%lXm+M=IRyWDBOabu_+)XIlgVY42a zO-o%z0XzZ-VeO@WB{t-50`j2rdqYBbzkYkkoVe8x*_3h;wmyQO%%6KtfdoH>ZY{%T z8wb7whyFU$yf^9_{>Cs10UFO4#FpwPl(Gl7`U;lJudiU9yE3lNk(^Rhu+t9$`An$t zVpiea<8o_>T4!dyj&A+1dYn`K+3K;1_RDX|O>H3^-~Drg)KRB04`jUBn${8p(Ql>E z-E)5<#kP$~bCYoChTT#8t{N|I-G@I6ynaUqdH9I_~mT7+mmO1=-9aeK%C8bl* zzg^~ko#}r*l_0-<3Os3s&mkJgYBX^FE}eK@2@{+DOysx@7GBOru3g* z$6uXsjv#ES9V4qvuOa4x0qk?85`FR#It+Sh3+82vcZggiA5^5Kx0V8w^bA`7Qz}`}~7xySnpko8h_t@mOpkN}m^w-AYuU?1q&*&a_O249eA^_dH@t4rO zw{&I`7P2b-y0eac^pX-AbPJzC-tR9!%e&LJ`~%_xw6&igKHP85tFGQ^Xzrhz^ABke zEM8(Llrah2OU~a~*$zsVDgiChL&;tj*hz4YI9Q=7D z{xDr_`(e8J>+1h$CAR-_SJhD`|6?T%h8d{?NRP#LQ#Wh#hd28EmwbvKW>egML7q z@e99R?EfHCiF*@1f&IX1Sv{8tMksmuSrjy_e*nleYx)N#^c*nx{F7%?;}4V12EfLU zNkb~o7T5sElx#Vw;^5E>H2j{rDc zI1&=__FsjGm=%d8bVCd{Krb;qLx0umX8yAJ!<1A?tqEb=MUzX%>AgdLMw-k4++TT% ztr;*2WQl0f96cj!!D11+;}_h?jot|ClKgSPh^1Xm(DSc6){7`~Jw7;c+Xzvjr2rgc z%|W*2joA3y)onxU!U6tH-;CO_i$B4Olt}g`m|GF#&0W!)JGpeH8K3bUwrkha{jvqO zW{*0|?Vl)30);;6{FFF z+ltYnXUguG_0Fly`qft(rXC%8s=79`ThjCOwPM;>H*+mDm|X2hki&1r!sRB1iWdSx zI?nJFp83KSGd`gHcrv2;W@6kUQJ0)-R&+VKCDU;R+lb|$S`JY6XNhDq?^YpxUG%%= zccNU&yM8Sub0MrvOwg41DR>qz8G0)MRxg8cB$p5u-`2wivg`Zm*rpHMq;ab+P2X;0 zv|C`Yc*rZT0*P;Shd z`ttP0%NJno1+OeV&WsN7GDHKyYIQ3D`pwt?;p#Q4+*;4r`(l>!X@0%o`Rd?870#ju zqIb)9;)y!2s{tO(=>qX6?fihkCgDfIiT4U^^#h%J*_`%A_A9;Ij_7+f@5*@gzZFEb z5TF69US@uhU^>YV(zP8@xg$Wvp0F_z`7NnD=1H<|Wxv#({c^L4on8WvnJ@6Cq41ZW zv3*9r1;>VBC%e3$ni6jpsnGbx-t9Y~^`YRwaxBX__1c&7enRaHJG+^nOFcx#R)i>= z#|_Ivb;Jvz_WHKEZH)5bJf5buF@DHwPdK`&85=M9w6&`767$x?HBRgaL*2gY(Gk@y;OJ8{d#z&VU!|?fGB|zK# zM^=rBG!uP!7L>dN@Iwze)!FP(BCT%<6+((USWmM-s& zcjiY5Uz(jWY>;u=EBd2BpN*D+bEkNXh$Hy|teUf+Xr_R<#cEjwc= zR(@jkf(&bS>!wme;bH$wVvi8l3ItZdz|l+$Xh3DjLTD$!*J!QJm^64rBViSqP!6ho zX>Cne-!=N^wW`50jJy4muB0p+Zi|HwSIK@|OXUD~S$PeBmxs7kP;L89y#fyd#6l=L zkl=Gj!^2n_RW$T4BBwk=dmq|cWIp(Dc%}~YNZ)<8C^*k1 zZa7yAxq`AIpTd_UOhD0$0o%Q^bB4*&a!-weq9Up9+GYI})jtz;#9@G@HB3uI&)W|o z2tI1$Zcyj^_rJjWiYiSu#N)z4IN}=WcT~Kt?!A;{n_?Un^>FjujW!>jkx)$y$X31` zI5Py$JLnPv-=LiCx0uQ{(=+?4)ArQVJ%4H~TFc3NJXOnd7wkVa&YI9jM5%uRHbPvb zlDHXmGuUj>syz0><>X)^!@Kf#&XpB6;?p6UMvyJcIdG{y^bU=uIX(sBjG{@^BZd^uV0q?m)z!SHyfJ9y#a|%r z+N);9dz~53ftl%IaMQ(W-p)s#n1|Hc&F$FY-g{Xxgm9t#0NhbW;`D3A-)e+eH=aq^m&w z&HpnjPqKtS$ufBKZT&DRgPV_M9@|3&lAWDO*Q{XzSXG$&-Z~-fZZ7y-`Q)F`5?i^f(nN9S+`2S# zNo^K=bP>1Bs+WsOEvntGYXd*1TnfimpB3z8&g*9G6b9lZO(bU`Q#rZ>blwkSt(w)O zu1%#`r5sPzmv3h1zLIrdM;cYrg=7tiV*Vg%GCDyZ`IEen#{h-NyBjN0*1w}qJXhl_MzsaF@V@Ionw)9mP zmirmj+Y-?iyM{-nW#YbhfdiNBr!Sv;3sY-r11hZ@^^$7t=7wrNV`Td`toHvFF*u^9 zK>992(MBl~Q&VQTSY2;bXjCPrjT{oUH?s!qnGWX(0YGB>c=r!f^nXph`NQOJ9NoYx zDkKfNZKWELa2IfO>#=}@ZRD~h5Ce`27T}Bwc)n_k{e^?@oxkuS`e+fLP`etUf*xWs z0H>VmgLBrK)OcAGZsR`MA;2CSO`ZzSr_o>AvDuX~*brU@0uWcnG*{L;|xM6V*D|*s@FaRz0;+8NGKUD(k?GEWmby?|n9?A>9@yLb^*x zsjuAQ>CGIyv{CE0djW-2E%r-YQoAO!K;C-B_z8VUam2hCR)K45r|Qlai+FhPxC%c? zhd=eqU7LC3bTVl_SF#^>*{(}{H;zDdKp;dcS%#UKh6PS_t&3>dp-<0tlr()P^e`dI zVOow|dm#}Cjm%+qay=Jush8ZI4gInAMx!jqr=Em#OdKYp#b}+%byyVsQfBL>VK%0J z=)?GK({~92mPIYIhBVNRVO_SjV=3-ubb^^mEJk`2v--?nV&Vy zH#+E_ObmT<;{+ISUUJ|Ja&(c@T3ScWCk(c2GrvXdF!Ytu5PMX0WZWe1%Mr6yy~yDP zC)v&vv&PU9yCDA9vg&13ViH|>ak4CJ`M6hp`%JO;c!|om`&;S6LuQ=MKl7bBFV*_? 
zl+#HOi`!3RAO1$0|-|zHYRP zk4R+cgt*ZMag|oh2*vff{m&H!=S$mh;d?Kw?f2IauYD)lII1-Up>#JW9{Q4fGQ3FuOX2 z>A=iFI$>) zAV%+zG6-Wh))3)^8pnXDiEWQ%$JUz6l!~L<*B-PBkG?H!*==&5j&YgkbkVO#U1_yd;gm0}(G{zFAi>+D@}+Fl*7crNWCMh- z8hH=hXozrAceP7o2Dzvj^H|&G@+lu%O?^g$Q8A8?- zqr{Jr%V(hEgR4!Qh?%G`$2{`j+ZqdcL_uZTEL(57bB>C?fyRO4V?8g`S0g{RKxih} z&`=yc&uS#P85fJ#iFTsF7hIQ77R7n==SiZlLr;wZo^(Byv@c!k7Vl>uMJJ%Fr=a2J zJ-!J3h4-oAVuZx|PHQ=W!+Q?!t2>J4v&L#B?}DI?6INS+W+bS)u${q9M>kE1kzJ-t z!mi~fYwsU0cAZPb>$U8*a+OMIaB_0luPV>c-v(j)iVZhM`H@Ws)h!wzl=%|{P7KU- zKYdk?0ZIZV6SWlt`GzO2oV#{425$fGZK5HNRzrru_tVEGtL~@gV?sCsaL37K2n%fa z<$WQ2ZepFEW@OA?KTdiOAKLL$h53tV%Y|-ceZmJ)8trjCGZo!~b1@lZ3Kp@6Zuw%> zRIkuCZ6KMfCoQSZa;HdGr}VC)zP^*pjl=Gcc{Dy}vZAE9Yq$U|NRF7IKP$CCE?P0$ zyz~eVb+>%ErQq3-thdZ^@nh{w6Z`Pe0mg@)P@rvNmktpTB~32Pr&v^k3yt-z{xRP;=m+6Lb>$hWj857C+6bt$gx+( z;ZD(V&BKw0rd-GL?9V+&DDtV@6UyEok|jnSq}YUX;41RN=g|dnBiWF+X+_`I<(d(Ml#|kg$=I6| zr`txIuk=Ok^zfymD~q!-TS-c%L_k25t|F~%bp(o}rtG$oalFY>Zn2(<%EvpDjBxd<)K?6FBu$KoQUMM>i#eKUf_71Z+* z`)jsJgo{`^PF!o?E>WKmg9wl+X^-J*Unj*-JQP`aSDMYyh-oju3urgvX?7dcWX*sN zXLsnmc#$1hc|ZAFyL@c!v250(lPq9Q%$wjIs2JBhOUpySrfY>1a_vv_ne|HCkw4v_ z`HXW%<8c}H4?^kI6reQPtJv+`wu}{wW(f??}|W zhPwm<=5AP-*Rtl5kY2>jzG=^Il~tH$ZZ|rerrDf)KHH3)3>)srz#huIa`#GRZEUhp zvJTvt5Z&4}sz@3krp5Rw4$rh|=r@@})x+wJa=Hsxo*Of{ga?~2S9;HM~f zQHOc46XFdMu9pw+$NRrs<)oSs^&HxcS17aaRlVuxIH9{Et7A=RuQtcxOu zL8Cs-9R^iwndCs72aghH$Su4W6y`|2IX@eCX=kM0p`DnQGpm}Yv6t}&?Lz%wd`^*uj*RPU7UF-Qxo67A6y zizr7gbK$imo0#manrK0Z zf~mIgG%T7fLA_3{o~niJ`M751p2hL98q*QJKa%>2QGVw&>(ha>by3FkQhaDIXru>j zl!!{qvH)R_)QrKmy)Cpwavd*mJz7Q zX>>FW{}j8sj;{daR71Pw1eRr z^U@TN)HLI@M9HCE9k>W~%tg)UPkny-D$|F@6-2oWxn__0wo6Iwc?4QrtO#kXB|v0% z2nheEVu$CNM--c&Q8kAazYc7tEI;uVuv&TLH4`&ra4zZnvjkI(7jws?7|ok-75<>} zUXkif7<}tBLOI`~Wj=)#Ien-CBXHqFD*fa`w)NYIhu@b-8SlNhOE;RMfm7!gNaI2> zh_RE6=?<@bXA)FU>aPZtmOM9D4zZ4_5-VTb>q@^2_Z2@MdSEZ=y@AvfNH8A-LDc{f zF`Cl}Y8S}uloPISc|sZ~=wR<56%h|>RBL?Tnb{{SlLr^qeL!C}4>4SkYziZomqgbi zdev+|v!JI$udfBNR=cB^bmEC0!@yhf#G>8WYYf|5KQ|Hny(XDoV1!@({Na=HTf^3W zC3gG;qpSa|gVrppVy>x#Y>O!0#Gn|&US-Xh_b9L@b3K|kskMlvCWzi-*m9Yg6db4b z!gk}FF~c^Rt*;N03r(Z#PNB0jFF2X=5Z?{EdHZ0l@G@*hQ+YF{>>LoZ`>G!jG}_Nh zhUY~z^Sj0+Bt7SN;2q_;He$gFp;8{UKom@Z5n+x=Wu zDBXp64gT&G{653C>g7g?nFq2Q5vF2}FJ>LIB%sqFmxw2)YS%`nqP}Zx>Y&HbpEObi|0lc^q2EIeKZ3|7AAChbF zGGPeUF7m<6=2ZwvSL1H)RL>z*KFY3!D3hbwh_eAeQFC?4rJD>KLnDkDFJ$5*rP?Hj6+rRXoU~e#hNQ*8_bxc_ zyos6>>D>QuP2`R(%Evn6nN-sKL>5>7ySD()(TOO_Ujk_CnSez+Gmw)3fw8NWFc-bz zdBxUKoSjR?mdQHvN;EoMNjK`F4%j)12_+-}a^aN82m;u`TQy@x z*w~wGq;tnp;t_~qX6;HAHUZ*|+?gNMMrUmzhL1KY>nW~KHQMu6_g?F`b@k43sC50u zC&Xc!P=pTPiEMvFFRha0RrAx1|W#u0k=g^*KTs`Bye!8#1 zgQjbfdZ2>h;pSW$Xu$IFIro0QXz00h3lelJG30He~uH2;W{ zhjgh;Uoyjzm*fEXfDdI->87yBufMArQ-(;8s``!5*TpG!!-=VW~kIiiQ zrlZ*9bwMcgj$<|yP0L!T$mj~iaa_B(UpK!xQYFdjWjNoiCl?jPS`P;U1Bk>r4|t{` zl^@!0WT@t04q>A`fMI9RP(Nxbo=F`V2iAN zt}QSXCqm&MHw!;$dmPpTy;I&MTv#2aRi;Ddt3HuS7@N$5Xwq3#Ahc|>lcgAXN+APF z3GRKiBcIB@!#JQ`n0hRAjYod>w6*Ys#>SicbCU2G6^706yw1umMQVQL`l@xrq9jA1 zUG{=b7r6R4frlGGph51AstH-Px-|pfCuc<52mv7%6or+9PQSZ3^u;bwZFkR_w#Bm@ z=x{&FK_tNv5r40X)4!Zhn#GL{d+%ld3QM82(AH3RGkK1wz?Wb$&0ZBX>MMjgR4Dy$ z=3y-UB<-);b|Zz$~A>IVnzNMAq5E`&ulousu*}aC#)0>SX>VH%0C6$97w{Z0KXO zyVuZSBdlV{=4_>Ji94L17#)rN%r(ufIh)K?d+SL|%Ea^U5RK(4zCg$$btmI&r^}4V zat39$yUhtI`b{Y)%CwS*@;vsDrD&$(`%d4M+p4v|UVW=WF^bVQWzeUy-mFP2C1OZ^ zZ~9Oc;t`I|9Nx*RK8@G-+Nb&{NzLg(=n1(KWGNRHV6lXx3+a-;ZRdXL2Z+rhBwbsF zgM5Y}Nyd;YX%Fct0cL}3#DL`WW*h!o-^)a|W%omK_V5dB@2>mDEQ$UZ?LPC6`p& zzJp<3o87e|rp)uw5EQ5~?m8E3o<=eRvJ6MU#eB{LjqM48J~;2v9at2M_-~@-A+K|!}(4XIeLk)-aRgMMXlx~4y)1M{5UQ`6z&9 z({uhFK=w4!zC$E`0I^V3S`vExxgoegF=N2)od;Fk=*WpJ2)R`f*$008S6ES_1ce 
z%FKkyxqzf%0C@7pMg1{Sf6S>rR^J~X;g8_?|D$aNmCWocE(5A30=k*~X!A z8o}qzue^0N%d>8vdQFdVfyu6rF`Pl@Goucytl}^6@P1zq`kVhTe4v^Gf~g}wlPM6; zm2h^0VJnDo1l`U~=;~buaPw?QPZ?}Wpl=I^Rh_1v0NP1Lbx0bZx;@3;O1U-H5c#v8 zoG|(&nwq)VilH#L!L}8Isf$2sDJjw8I>|(=xz7I;HVQN8iAM5NjM`&MdABhb*FGGlb8vPUX;wur+pXddk$4E zqz&A@G_%vTHCyR3&+FY?`@;5y-tu2E0sgUZBpovGgbk4q>DuL?l6<<# zC__R($B;eVDUSVJ0Ju|V7P1rq^QnV8LrD&CBHr#i0?UfLt z$o;YnM;-AX$^jIVTu2zjTMRa=jsU*c+X-lzLc17uS?{r5#?vGNAl%Mj( zCWR9?uP`=kpob<&Z)h2n31)*NL-OQfLwu)^&kYp>??IPv-8R_kB({r?E9P7mAx@g7 zAVJJaTZB`Dpsvmdon5{rN02oVH&en1{$ zmpqP`9}R8BCW!B-B-E!pw;5Yl%(GP-@s+!fXPL|uXVLZ`yP!#b6R@n8&>qO4?-05G zjUGW1X@D!DnNY@rRHl?-J&`M=g=q$P2iIQ=%D-RCtR3UKI0gxX#h_if$OkrC?HTC8 zR7>(Q-d^Tyb++xopoR)Nsdmw)=>_+Mc0>FZwbmjIj^p46R85v6px+|E+vg|uXz}sp zp6Qjm=!|9Zc$gVysB#;^R?ZzD$OyRzw5Qr|fFW}aXs1p#VK@U2Z6oy&&m7!qX538; zbGuYZ<@H!{n}jaMM|F5M>~s|E1~f)fviDR;91dE!8dZO|I`Afz)A~(bTUwaZm9^xY zy`l3{_J%u|Os@mGW|JNCP9O%04jq6JCh>!9upJBHyA$0w9&9gh3oSW1OHHxYWhkBX zldHLs3~7JIgaM=@pfuaGIs!(%!Xzd{_5!(Pn$30*BVg~)F6HkpoX}F-&qgvBkjOew z^zs15f{xxkosm$+6%7d4n9dr99}hZNO7;Y0HM7CaH)OXL&4ZyFNm@O|JL7|7yI4$$ z8?vDK+9^G*uf^nT^4bAl#Ff1}FutS@qDq4mw%>g>hGs6)| z-#wsUvr1uZVeRmg+6y_qWhTc7VLEjrl{F`&DndsL-Don@G<^jpyqA`Xc3+LDa!@6v z-p^67W!F$rq3mH|ZMx}=xm(M7DnSSRDAn;U!zcPMtQ?VubM>^EirftU7R>Y>vCE_L zxjW?LlgYX&!rX>^b!%wbf6ndozNgBgIr zE)kfQaQTSDrH)cPT_+O=SG?KC^Bj*sTS|_+ZZHQ>XqJ@{zvkJxYw0b_M zobh(r%gHi*y|ZfItYCJ#L!cBS5K85%c|U1JPz#qD=!+WO3gYX;_K~R%=x)Ez8?A<77Ez~jRK`L zwM42I#S?Xk_M}qV_IO$oT%j9RE?>Mi%MmS#n-_hbz!|T&E%m4@LW-dZB8<^7s?Z4^y2XNweclbDK1F&M@0jvNjpJcsVK9KGgouBtZxflGwv~f9hMy_w0Lry4K78d)GSa-o5wP z`#azFoyG2p(OPMbeEQZrN>bmQG4?djhSsrg{^})L=p5_KI}Z3X%HmVo@`U|dCzOM5 z=e(ppZ;ZKNz#x;Y9@MUS(}WTrAfb-WW^*=LQ{RLxc>&izbic%9Nk_6@Fip;AaJR0z zCwNK9Z@8bl#m%iblbvsqanx5d;VlUx9v@dHaEixkUbffOkV(brvTfqzVBz!ZTe0O4 zt80hsvU5*g-IE?(_k9s%7I!MFZ((yN;`VXURltH{S}DZf5cu#=(GwkP%@2PKB zFk)hH(PypjhXn7K$hE!UY37abE=|3alZ83j?zRZ+i)JfTSFPr(&u%9=QAKv*Rb2Am z495IMG6^($VtQVGhH=p#^Zm2Cp;|==Q|s z8Lc;hEZX0xhMBG`h^qidpA#`<#G7ygBQ|{;SejS+x)rBTQD9mH9aX=0wC2aSoiJ(D zVddY3@`j0so-W9r&8W;qj+{)YX!3d@ZP=$c;yVdB{hI6%ZGU^saLl3w*WI|O?7P=0 zqXuJ2Z2s}#u`B07AW7Nu8~lA*u}LbMa$G%b9VZ9xONcpY;}luCi4wlzVdbVm+M|2b zwXlSF-R!=^(X0Kilg6gn2FIQsfOnKHR5Vv2sij`YBCYs|T%N-Fs0mU>FpuKs7_8XT zmpE;`8#mSliLckYjp<3UHKE^DJX2Z9&w5fe#zc)bti7}MS<*FTWi#FF6SyJDd-)5d!l7~mg)P5rLBaX zJ|PfeD-{qVnE=|`u~8HTn9{en_A)r&`oCWX0Ki1_n}@)@#{WUQL>thkXN>_$I7JxD zlbxZG#>KYP07>)!NMaAzEH+EwxU~#HbR@tVe+GD?e0B0~CyzUMR>&)gyc)~*iod+i zCZiPqakLN^zROlfzJ7`_PqD`dDc!W|=T66O_>~Z6eE&obU^XA1yP}OMT(yp8H=4ls zM?PwiK268^L=&P4dA@Ak|Ik`UkP?#+(#rnn zx#*$7{neiJv?+$*UXwb3?JAg&tv(EtWb^160pXnpeG+9Q_2O<^TyYC99XuuJGEL1^ z@}kL@9%yFH<9K^TR}r$*j+tGuQXxVL{RX)38OwHkfUG( zN0rcpk`IvmDQ9Iwz!KoB`vB1gSQ}IDt1QbNi0wl)#cXLAI8866E%gw}8+LzyIDbPB ztR!3@v~tBjI^v?!#t>VdlSIjSTYze2KstqfHf8`!l*|}n1xDvvG^bd*JCBPPs^%f7 z*Z6+@qWx2W{`m><`nmQ)G}AUUH5EfX9FI)$)1dVh*(Xl_Hn(?P`E0-zS6o{_>^4ia z^}!40^Dj5L+a*ehSZwWCR_*8iE*mGzMA54DRJ~^no4Kd9>g{#!n+~gPy!*?tJpPuc z1mWAs-(I5T&!M*9l<+WthXf^n@k3p*_(3-f7>zM`F!wZmMiYO7Vr{m)4e|xiap*S$@fJl9jLx~!Ull*2$&?ZV2Oat`ni>RkVyUIhZ3=h7I zjDBlrtPjDuP7;-2C6AC5?vS~xkCyGUbg{)}rCV1O?2fw{VeAeZTfJLftEYj0PpyDd-gWD9- z`=4zy{mMHb`t%RnGD%wO2gqw4xX8fH7Nu`1IdLK(aw7BcXqWBzoC0M9Qzy`T^Y*)A z1BonUvA0Ham9#w5tP z0alMi@Z>9|2{9%8Yr23;s<|T7Ni;uh;7_t1ayCO?AK1Uzqf@@&hVu1(htIM*`?qHg z|Kric-|2Oz;&gr~B|M=FH%tM@CuTL>o(^jfOCvk%I@jGg>72_{T9>*3mFPLHdg)UhG_jqp_6DU?nfBlA$* zMemw`z^Il^Kf{~KmmzPX{*@Q|$wK_+QTOu;!+(5I?MR4{L_B7JKN0clD<8@DmyRm0 z=IzRMOg@Ju6)DLUn1K=1$^~)L&Ih5fUrqv=ZlHTt+>0Bjn)w@YqNzlZcj< zFek$JCX&00Z8Ju9N>ira%Sp$~E-)=;^=LV14&1DLKFQx66sD4sE2rWPg@vx0+55g* 
zNzv?>Z~lPx$*^@n(X|CniAa;RI>>9zb{3-}Y932%SOa#_rFpXILmdixlV`I2F>ef3 zg=JW3{*6P|wqF# Date: Mon, 14 Mar 2022 17:55:30 +0300 Subject: [PATCH 0034/1022] Fix finding end of WAL on safekeepers after f86cf93435133ee11. That commit dropped wal_start_lsn, now we're looking since commit_lsn, which is the real end of WAL if no records follow it. ref #1351 --- postgres_ffi/src/xlog_utils.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/postgres_ffi/src/xlog_utils.rs b/postgres_ffi/src/xlog_utils.rs index caf1940a9c..d2b2b5c122 100644 --- a/postgres_ffi/src/xlog_utils.rs +++ b/postgres_ffi/src/xlog_utils.rs @@ -132,6 +132,8 @@ pub fn get_current_timestamp() -> TimestampTz { } } +/// Return offset of the last valid record in the segment segno, starting +/// looking at start_offset. Returns start_offset if no records found. fn find_end_of_wal_segment( data_dir: &Path, segno: XLogSegNo, @@ -147,7 +149,7 @@ fn find_end_of_wal_segment( let mut rec_offs: usize = 0; let mut buf = [0u8; XLOG_BLCKSZ]; let file_name = XLogFileName(tli, segno, wal_seg_size); - let mut last_valid_rec_pos: usize = 0; + let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap(); file.seek(SeekFrom::Start(offs as u64))?; let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS]; From 9c1a9a1d9f315adac161e5490b314dde63e3e292 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 14 Mar 2022 20:06:25 +0200 Subject: [PATCH 0035/1022] Update Cargo.lock for new dependencies (#1354) Commit b2ad8342d2 added dependency on 'criterion', which pulled along some other crates. --- Cargo.lock | 183 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index ad38a41d91..b1ebe6c07a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -260,6 +260,18 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426" +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "bumpalo" version = "3.9.1" @@ -281,6 +293,15 @@ dependencies = [ "serde", ] +[[package]] +name = "cast" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" +dependencies = [ + "rustc_version", +] + [[package]] name = "cc" version = "1.0.72" @@ -447,6 +468,76 @@ dependencies = [ "rustc_version", ] +[[package]] +name = "criterion" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" +dependencies = [ + "atty", + "cast", + "clap 2.34.0", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = 
"crossbeam-channel" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e54ea8bc3fb1ee042f5aace6e3c6e025d3874866da222930f70ce62aceba0bfa" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00d6d2ea26e8b151d99093005cb442fb9a37aeaca582a03ec70946f49ab5ed9" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "lazy_static", + "memoffset", + "scopeguard", +] + [[package]] name = "crossbeam-utils" version = "0.8.7" @@ -477,6 +568,28 @@ dependencies = [ "subtle", ] +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa 0.4.8", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + [[package]] name = "daemonize" version = "0.4.1" @@ -1260,6 +1373,12 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + [[package]] name = "opaque-debug" version = "0.3.0" @@ -1444,6 +1563,34 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "plotters" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" + +[[package]] +name = "plotters-svg" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" +dependencies = [ + "plotters-backend", +] + [[package]] name = "postgres" version = "0.19.1" @@ -1664,6 +1811,31 @@ dependencies = [ "rand_core", ] +[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "lazy_static", + 
"num_cpus", +] + [[package]] name = "rcgen" version = "0.8.14" @@ -2233,6 +2405,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "1.5.1" @@ -2855,6 +3037,7 @@ dependencies = [ "bincode", "byteorder", "bytes", + "criterion", "git-version", "hex", "hex-literal", From 705f51db2777228e3e61db77573625cc9929585c Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 16 Mar 2022 21:20:04 +0300 Subject: [PATCH 0036/1022] [proxy] Propagate some errors to user (#1329) * [proxy] Propagate most errors to user This change enables propagation of most errors to the user (e.g. auth and connectivity errors). Some of them will be stripped of sensitive information. As a side effect, most occurrences of `anyhow::Error` were replaced with concrete error types. * [proxy] Box weighty errors --- Cargo.lock | 2 + proxy/Cargo.toml | 2 + proxy/src/auth.rs | 124 ++++++++++++++++++++------- proxy/src/cancellation.rs | 2 +- proxy/src/compute.rs | 61 +++++++++++-- proxy/src/config.rs | 6 +- proxy/src/cplane_api.rs | 111 ++++++++++++++++++------ proxy/src/error.rs | 17 ++++ proxy/src/http.rs | 2 +- proxy/src/main.rs | 3 +- proxy/src/mgmt.rs | 20 +++-- proxy/src/proxy.rs | 176 +++++++++++++++++++++++--------------- proxy/src/stream.rs | 84 +++++++++++++++--- proxy/src/waiters.rs | 37 ++++++-- 14 files changed, 481 insertions(+), 166 deletions(-) create mode 100644 proxy/src/error.rs diff --git a/Cargo.lock b/Cargo.lock index b1ebe6c07a..750ac0edc2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1739,6 +1739,7 @@ dependencies = [ "anyhow", "bytes", "clap 3.0.14", + "fail", "futures", "hashbrown 0.11.2", "hex", @@ -1754,6 +1755,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "thiserror", "tokio", "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", "tokio-postgres-rustls", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index d8d5cbe5bf..dda018a1d8 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" anyhow = "1.0" bytes = { version = "1.0.1", features = ['serde'] } clap = "3.0" +fail = "0.5.0" futures = "0.3.13" hashbrown = "0.11.2" hex = "0.4.3" @@ -21,6 +22,7 @@ rustls = "0.19.1" scopeguard = "1.1.0" serde = "1" serde_json = "1" +thiserror = "1.0" tokio = { version = "1.11", features = ["macros"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } tokio-rustls = "0.22.0" diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index a5bdaeaeca..5e6357fe80 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,11 +1,79 @@ use crate::compute::DatabaseInfo; use crate::config::ProxyConfig; use crate::cplane_api::{self, CPlaneApi}; +use crate::error::UserFacingError; use crate::stream::PqStream; -use anyhow::{anyhow, bail, Context}; +use crate::waiters; use std::collections::HashMap; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage, FeMessage as Fe}; +use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; + +/// Common authentication error. +#[derive(Debug, Error)] +pub enum AuthErrorImpl { + /// Authentication error reported by the console. 
+ #[error(transparent)] + Console(#[from] cplane_api::AuthError), + + /// For passwords that couldn't be processed by [`parse_password`]. + #[error("Malformed password message")] + MalformedPassword, + + /// Errors produced by [`PqStream`]. + #[error(transparent)] + Io(#[from] std::io::Error), +} + +impl AuthErrorImpl { + pub fn auth_failed(msg: impl Into) -> Self { + AuthErrorImpl::Console(cplane_api::AuthError::auth_failed(msg)) + } +} + +impl From for AuthErrorImpl { + fn from(e: waiters::RegisterError) -> Self { + AuthErrorImpl::Console(cplane_api::AuthError::from(e)) + } +} + +impl From for AuthErrorImpl { + fn from(e: waiters::WaitError) -> Self { + AuthErrorImpl::Console(cplane_api::AuthError::from(e)) + } +} + +#[derive(Debug, Error)] +#[error(transparent)] +pub struct AuthError(Box); + +impl From for AuthError +where + AuthErrorImpl: From, +{ + fn from(e: T) -> Self { + AuthError(Box::new(e.into())) + } +} + +impl UserFacingError for AuthError { + fn to_string_client(&self) -> String { + use AuthErrorImpl::*; + match self.0.as_ref() { + Console(e) => e.to_string_client(), + MalformedPassword => self.to_string(), + _ => "Internal error".to_string(), + } + } +} + +#[derive(Debug, Error)] +pub enum ClientCredsParseError { + #[error("Parameter `{0}` is missing in startup packet")] + MissingKey(&'static str), +} + +impl UserFacingError for ClientCredsParseError {} /// Various client credentials which we use for authentication. #[derive(Debug, PartialEq, Eq)] @@ -15,13 +83,13 @@ pub struct ClientCredentials { } impl TryFrom> for ClientCredentials { - type Error = anyhow::Error; + type Error = ClientCredsParseError; fn try_from(mut value: HashMap) -> Result { let mut get_param = |key| { value .remove(key) - .with_context(|| format!("{} is missing in startup packet", key)) + .ok_or(ClientCredsParseError::MissingKey(key)) }; let user = get_param("user")?; @@ -37,10 +105,14 @@ impl ClientCredentials { self, config: &ProxyConfig, client: &mut PqStream, - ) -> anyhow::Result { + ) -> Result { + fail::fail_point!("proxy-authenticate", |_| { + Err(AuthError::auth_failed("failpoint triggered")) + }); + use crate::config::ClientAuthMethod::*; use crate::config::RouterConfig::*; - let db_info = match &config.router_config { + match &config.router_config { Static { host, port } => handle_static(host.clone(), *port, client, self).await, Dynamic(Mixed) => { if self.user.ends_with("@zenith") { @@ -51,9 +123,7 @@ impl ClientCredentials { } Dynamic(Password) => handle_existing_user(config, client, self).await, Dynamic(Link) => handle_new_user(config, client).await, - }; - - db_info.context("failed to authenticate client") + } } } @@ -66,18 +136,14 @@ async fn handle_static( port: u16, client: &mut PqStream, creds: ClientCredentials, -) -> anyhow::Result { +) -> Result { client .write_message(&Be::AuthenticationCleartextPassword) .await?; // Read client's password bytes - let msg = match client.read_message().await? 
{ - Fe::PasswordMessage(msg) => msg, - bad => bail!("unexpected message type: {:?}", bad), - }; - - let cleartext_password = std::str::from_utf8(&msg)?.split('\0').next().unwrap(); + let msg = client.read_password_message().await?; + let cleartext_password = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; let db_info = DatabaseInfo { host, @@ -98,7 +164,7 @@ async fn handle_existing_user( config: &ProxyConfig, client: &mut PqStream, creds: ClientCredentials, -) -> anyhow::Result { +) -> Result { let psql_session_id = new_psql_session_id(); let md5_salt = rand::random(); @@ -107,18 +173,12 @@ async fn handle_existing_user( .await?; // Read client's password hash - let msg = match client.read_message().await? { - Fe::PasswordMessage(msg) => msg, - bad => bail!("unexpected message type: {:?}", bad), - }; + let msg = client.read_password_message().await?; + let md5_response = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; - let (_trailing_null, md5_response) = msg - .split_last() - .ok_or_else(|| anyhow!("unexpected password message"))?; - - let cplane = CPlaneApi::new(&config.auth_endpoint); + let cplane = CPlaneApi::new(config.auth_endpoint.clone()); let db_info = cplane - .authenticate_proxy_request(creds, md5_response, &md5_salt, &psql_session_id) + .authenticate_proxy_client(creds, md5_response, &md5_salt, &psql_session_id) .await?; client @@ -131,7 +191,7 @@ async fn handle_existing_user( async fn handle_new_user( config: &ProxyConfig, client: &mut PqStream, -) -> anyhow::Result { +) -> Result { let psql_session_id = new_psql_session_id(); let greeting = hello_message(&config.redirect_uri, &psql_session_id); @@ -143,8 +203,8 @@ async fn handle_new_user( .write_message(&Be::NoticeResponse(greeting)) .await?; - // Wait for web console response - waiter.await?.map_err(|e| anyhow!(e)) + // Wait for web console response (see `mgmt`) + waiter.await?.map_err(AuthErrorImpl::auth_failed) }) .await?; @@ -153,6 +213,10 @@ async fn handle_new_user( Ok(db_info) } +fn parse_password(bytes: &[u8]) -> Option<&str> { + std::str::from_utf8(bytes).ok()?.strip_suffix('\0') +} + fn hello_message(redirect_uri: &str, session_id: &str) -> String { format!( concat![ diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index c1a7e81be9..07d3bcc71a 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -6,7 +6,7 @@ use tokio::net::TcpStream; use tokio_postgres::{CancelToken, NoTls}; use zenith_utils::pq_proto::CancelKeyData; -/// Enables serving CancelRequests. +/// Enables serving `CancelRequest`s. #[derive(Default)] pub struct CancelMap(Mutex>>); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 7c294bd488..64ce5d0a5a 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,6 +1,27 @@ -use anyhow::Context; +use crate::cancellation::CancelClosure; +use crate::error::UserFacingError; use serde::{Deserialize, Serialize}; -use std::net::{SocketAddr, ToSocketAddrs}; +use std::io; +use std::net::SocketAddr; +use thiserror::Error; +use tokio::net::TcpStream; +use tokio_postgres::NoTls; + +#[derive(Debug, Error)] +pub enum ConnectionError { + /// This error doesn't seem to reveal any secrets; for instance, + /// [`tokio_postgres::error::Kind`] doesn't contain ip addresses and such. 
+ #[error("Failed to connect to the compute node: {0}")] + Postgres(#[from] tokio_postgres::Error), + + #[error("Failed to connect to the compute node")] + FailedToConnectToCompute, + + #[error("Failed to fetch compute node version")] + FailedToFetchPgVersion, +} + +impl UserFacingError for ConnectionError {} /// Compute node connection params. #[derive(Serialize, Deserialize, Debug, Default)] @@ -12,14 +33,38 @@ pub struct DatabaseInfo { pub password: Option, } +/// PostgreSQL version as [`String`]. +pub type Version = String; + impl DatabaseInfo { - pub fn socket_addr(&self) -> anyhow::Result { + async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { let host_port = format!("{}:{}", self.host, self.port); - host_port - .to_socket_addrs() - .with_context(|| format!("cannot resolve {} to SocketAddr", host_port))? - .next() - .context("cannot resolve at least one SocketAddr") + let socket = TcpStream::connect(host_port).await?; + let socket_addr = socket.peer_addr()?; + + Ok((socket_addr, socket)) + } + + /// Connect to a corresponding compute node. + pub async fn connect(self) -> Result<(TcpStream, Version, CancelClosure), ConnectionError> { + let (socket_addr, mut socket) = self + .connect_raw() + .await + .map_err(|_| ConnectionError::FailedToConnectToCompute)?; + + // TODO: establish a secure connection to the DB + let (client, conn) = tokio_postgres::Config::from(self) + .connect_raw(&mut socket, NoTls) + .await?; + + let version = conn + .parameter("server_version") + .ok_or(ConnectionError::FailedToFetchPgVersion)? + .into(); + + let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); + + Ok((socket, version, cancel_closure)) } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 9ab64db795..077ff02898 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, ensure, Context}; +use anyhow::{anyhow, bail, ensure, Context}; use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig}; use std::net::SocketAddr; use std::str::FromStr; @@ -29,7 +29,7 @@ impl FromStr for ClientAuthMethod { "password" => Ok(Password), "link" => Ok(Link), "mixed" => Ok(Mixed), - _ => Err(anyhow::anyhow!("Invlid option for router")), + _ => bail!("Invalid option for router: `{}`", s), } } } @@ -53,7 +53,7 @@ pub struct ProxyConfig { pub redirect_uri: String, /// control plane address where we would check auth. - pub auth_endpoint: String, + pub auth_endpoint: reqwest::Url, pub tls_config: Option, } diff --git a/proxy/src/cplane_api.rs b/proxy/src/cplane_api.rs index 187809717f..21fce79df3 100644 --- a/proxy/src/cplane_api.rs +++ b/proxy/src/cplane_api.rs @@ -1,52 +1,113 @@ use crate::auth::ClientCredentials; use crate::compute::DatabaseInfo; -use crate::waiters::{Waiter, Waiters}; -use anyhow::{anyhow, bail}; +use crate::error::UserFacingError; +use crate::mgmt; +use crate::waiters::{self, Waiter, Waiters}; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; +use thiserror::Error; lazy_static! { - static ref CPLANE_WAITERS: Waiters> = Default::default(); + static ref CPLANE_WAITERS: Waiters = Default::default(); } /// Give caller an opportunity to wait for cplane's reply. 
-pub async fn with_waiter(psql_session_id: impl Into, f: F) -> anyhow::Result +pub async fn with_waiter( + psql_session_id: impl Into, + action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, +) -> Result where - F: FnOnce(Waiter<'static, Result>) -> R, - R: std::future::Future>, + R: std::future::Future>, + E: From, { let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; - f(waiter).await + action(waiter).await } -pub fn notify(psql_session_id: &str, msg: Result) -> anyhow::Result<()> { +pub fn notify( + psql_session_id: &str, + msg: Result, +) -> Result<(), waiters::NotifyError> { CPLANE_WAITERS.notify(psql_session_id, msg) } /// Zenith console API wrapper. -pub struct CPlaneApi<'a> { - auth_endpoint: &'a str, +pub struct CPlaneApi { + auth_endpoint: reqwest::Url, } -impl<'a> CPlaneApi<'a> { - pub fn new(auth_endpoint: &'a str) -> Self { +impl CPlaneApi { + pub fn new(auth_endpoint: reqwest::Url) -> Self { Self { auth_endpoint } } } -impl CPlaneApi<'_> { - pub async fn authenticate_proxy_request( +#[derive(Debug, Error)] +pub enum AuthErrorImpl { + /// Authentication error reported by the console. + #[error("Authentication failed: {0}")] + AuthFailed(String), + + /// HTTP status (other than 200) returned by the console. + #[error("Console responded with an HTTP status: {0}")] + HttpStatus(reqwest::StatusCode), + + #[error("Console responded with a malformed JSON: {0}")] + MalformedResponse(#[from] serde_json::Error), + + #[error(transparent)] + Transport(#[from] reqwest::Error), + + #[error(transparent)] + WaiterRegister(#[from] waiters::RegisterError), + + #[error(transparent)] + WaiterWait(#[from] waiters::WaitError), +} + +#[derive(Debug, Error)] +#[error(transparent)] +pub struct AuthError(Box); + +impl AuthError { + /// Smart constructor for authentication error reported by `mgmt`. + pub fn auth_failed(msg: impl Into) -> Self { + AuthError(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) + } +} + +impl From for AuthError +where + AuthErrorImpl: From, +{ + fn from(e: T) -> Self { + AuthError(Box::new(e.into())) + } +} + +impl UserFacingError for AuthError { + fn to_string_client(&self) -> String { + use AuthErrorImpl::*; + match self.0.as_ref() { + AuthFailed(_) | HttpStatus(_) => self.to_string(), + _ => "Internal error".to_string(), + } + } +} + +impl CPlaneApi { + pub async fn authenticate_proxy_client( &self, creds: ClientCredentials, - md5_response: &[u8], + md5_response: &str, salt: &[u8; 4], psql_session_id: &str, - ) -> anyhow::Result { - let mut url = reqwest::Url::parse(self.auth_endpoint)?; + ) -> Result { + let mut url = self.auth_endpoint.clone(); url.query_pairs_mut() .append_pair("login", &creds.user) .append_pair("database", &creds.dbname) - .append_pair("md5response", std::str::from_utf8(md5_response)?) + .append_pair("md5response", md5_response) .append_pair("salt", &hex::encode(salt)) .append_pair("psql_session_id", psql_session_id); @@ -55,18 +116,20 @@ impl CPlaneApi<'_> { // TODO: leverage `reqwest::Client` to reuse connections let resp = reqwest::get(url).await?; if !resp.status().is_success() { - bail!("Auth failed: {}", resp.status()) + return Err(AuthErrorImpl::HttpStatus(resp.status()).into()); } let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?; println!("got auth info: #{:?}", auth_info); use ProxyAuthResponse::*; - match auth_info { - Ready { conn_info } => Ok(conn_info), - Error { error } => bail!(error), - NotReady { .. 
} => waiter.await?.map_err(|e| anyhow!(e)), - } + let db_info = match auth_info { + Ready { conn_info } => conn_info, + Error { error } => return Err(AuthErrorImpl::AuthFailed(error).into()), + NotReady { .. } => waiter.await?.map_err(AuthErrorImpl::AuthFailed)?, + }; + + Ok(db_info) }) .await } diff --git a/proxy/src/error.rs b/proxy/src/error.rs new file mode 100644 index 0000000000..e98e553f83 --- /dev/null +++ b/proxy/src/error.rs @@ -0,0 +1,17 @@ +/// Marks errors that may be safely shown to a client. +/// This trait can be seen as a specialized version of [`ToString`]. +/// +/// NOTE: This trait should not be implemented for [`anyhow::Error`], since it +/// is way too convenient and tends to proliferate all across the codebase, +/// ultimately leading to accidental leaks of sensitive data. +pub trait UserFacingError: ToString { + /// Format the error for client, stripping all sensitive info. + /// + /// Although this might be a no-op for many types, it's highly + /// recommended to override the default impl in case error type + /// contains anything sensitive: various IDs, IP addresses etc. + #[inline(always)] + fn to_string_client(&self) -> String { + self.to_string() + } +} diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 0b693d88dd..33d134678f 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -7,7 +7,7 @@ use zenith_utils::http::json::json_response; use zenith_utils::http::{RouterBuilder, RouterService}; async fn status_handler(_: Request) -> Result, ApiError> { - Ok(json_response(StatusCode::OK, "")?) + json_response(StatusCode::OK, "") } fn make_router() -> RouterBuilder { diff --git a/proxy/src/main.rs b/proxy/src/main.rs index de618ccde9..bd99d0a639 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -20,13 +20,14 @@ mod cancellation; mod compute; mod config; mod cplane_api; +mod error; mod http; mod mgmt; mod proxy; mod stream; mod waiters; -/// Flattens Result> into Result. +/// Flattens `Result>` into `Result`. async fn flatten_err( f: impl Future, JoinError>>, ) -> anyhow::Result<()> { diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 55b49b441f..e53542dfd2 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -79,6 +79,18 @@ enum PsqlSessionResult { Failure(String), } +/// A message received by `mgmt` when a compute node is ready. +pub type ComputeReady = Result; + +impl PsqlSessionResult { + fn into_compute_ready(self) -> ComputeReady { + match self { + Self::Success(db_info) => Ok(db_info), + Self::Failure(message) => Err(message), + } + } +} + impl postgres_backend::Handler for MgmtHandler { fn process_query( &mut self, @@ -99,13 +111,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::R let resp: PsqlSessionResponse = serde_json::from_str(query_string)?; - use PsqlSessionResult::*; - let msg = match resp.result { - Success(db_info) => Ok(db_info), - Failure(message) => Err(message), - }; - - match cplane_api::notify(&resp.session_id, msg) { + match cplane_api::notify(&resp.session_id, resp.result.into_compute_ready()) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? 
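To make the control flow introduced above easier to follow, here is a minimal, self-contained sketch (not code from this patch) of the waiter mechanism that ties `cplane_api::with_waiter`, `mgmt::ComputeReady` and `proxy/src/waiters.rs` together: the proxy registers a oneshot waiter keyed by the psql session id, sends the user a login link, and the web console later calls back through the mgmt endpoint, which resolves the waiter with either connection info or an error message. Names such as `SessionMap` and `DbInfo` are simplified stand-ins rather than identifiers from the codebase, and the sketch assumes a `tokio` dependency with the `sync`, `rt-multi-thread` and `macros` features enabled.

use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use tokio::sync::oneshot;

// Simplified stand-in for `compute::DatabaseInfo`.
#[derive(Debug)]
struct DbInfo {
    host: String,
    port: u16,
}

// Mirrors `mgmt::ComputeReady = Result<DatabaseInfo, String>`: the console
// reports either connection info or a failure message.
type ComputeReady = Result<DbInfo, String>;

// Simplified stand-in for `waiters::Waiters`: psql session id -> oneshot sender.
#[derive(Default)]
struct SessionMap(Mutex<HashMap<String, oneshot::Sender<ComputeReady>>>);

impl SessionMap {
    // Like `Waiters::register`: reserve a slot and hand the receiver to the caller.
    fn register(&self, session_id: &str) -> oneshot::Receiver<ComputeReady> {
        let (tx, rx) = oneshot::channel();
        self.0.lock().unwrap().insert(session_id.to_owned(), tx);
        rx
    }

    // Like `cplane_api::notify`: called from the mgmt handler once the web
    // console reports that the compute node is ready (or that auth failed).
    fn notify(&self, session_id: &str, msg: ComputeReady) -> Result<(), String> {
        let tx = self
            .0
            .lock()
            .unwrap()
            .remove(session_id)
            .ok_or_else(|| format!("waiter `{}` not registered", session_id))?;
        tx.send(msg).map_err(|_| "channel hangup".to_owned())
    }
}

#[tokio::main]
async fn main() {
    let sessions = Arc::new(SessionMap::default());

    // Proxy side: register a waiter, then point the client at the login link.
    let waiter = sessions.register("psql-session-42");

    // Console side: normally this arrives as a JSON query on the mgmt port.
    let notifier = Arc::clone(&sessions);
    tokio::spawn(async move {
        let ready: ComputeReady = Ok(DbInfo {
            host: "127.0.0.1".into(),
            port: 5432,
        });
        notifier.notify("psql-session-42", ready).unwrap();
    });

    // Proxy side: wait for the console's answer, then connect to the compute node.
    match waiter.await.expect("notifier dropped") {
        Ok(db) => println!("compute ready at {}:{}", db.host, db.port),
        Err(e) => println!("authentication failed: {}", e),
    }
}

In the patch itself the same failure modes are expressed with the typed `RegisterError`, `NotifyError` and `WaitError` added to `proxy/src/waiters.rs`; the proxy wraps them into `AuthError`, and only the sanitized `UserFacingError::to_string_client` text is sent back to the client.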
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 1dc301b792..3c7f59bc26 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,17 +1,18 @@ use crate::auth; -use crate::cancellation::{self, CancelClosure, CancelMap}; -use crate::compute::DatabaseInfo; +use crate::cancellation::{self, CancelMap}; use crate::config::{ProxyConfig, TlsConfig}; use crate::stream::{MetricsStream, PqStream, Stream}; use anyhow::{bail, Context}; +use futures::TryFutureExt; use lazy_static::lazy_static; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::net::TcpStream; -use tokio_postgres::NoTls; use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter}; use zenith_utils::pq_proto::{BeMessage as Be, *}; +const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; +const ERR_PROTO_VIOLATION: &str = "protocol violation"; + lazy_static! { static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!( new_common_metric_name("num_connections_accepted"), @@ -30,6 +31,7 @@ lazy_static! { .unwrap(); } +/// A small combinator for pluggable error logging. async fn log_error(future: F) -> F::Output where F: std::future::Future>, @@ -76,20 +78,21 @@ async fn handle_client( } let tls = config.tls_config.clone(); - if let Some((client, creds)) = handshake(stream, tls, cancel_map).await? { - cancel_map - .with_session(|session| async { - connect_client_to_db(config, session, client, creds).await - }) - .await?; - } + let (stream, creds) = match handshake(stream, tls, cancel_map).await? { + Some(x) => x, + None => return Ok(()), // it's a cancellation request + }; - Ok(()) + let client = Client::new(stream, creds); + cancel_map + .with_session(|session| client.connect_to_db(config, session)) + .await } -/// Handle a connection from one client. -/// For better testing experience, `stream` can be -/// any object satisfying the traits. +/// Establish a (most probably, secure) connection with the client. +/// For better testing experience, `stream` can be any object satisfying the traits. +/// It's easier to work with owned `stream` here as we need to updgrade it to TLS; +/// we also take an extra care of propagating only the select handshake errors to client. async fn handshake( stream: S, mut tls: Option, @@ -119,7 +122,7 @@ async fn handshake( stream = PqStream::new(stream.into_inner().upgrade(tls).await?); } } - _ => bail!("protocol violation"), + _ => bail!(ERR_PROTO_VIOLATION), }, GssEncRequest => match stream.get_ref() { Stream::Raw { .. } if !tried_gss => { @@ -128,18 +131,21 @@ async fn handshake( // Currently, we don't support GSSAPI stream.write_message(&Be::EncryptionResponse(false)).await?; } - _ => bail!("protocol violation"), + _ => bail!(ERR_PROTO_VIOLATION), }, StartupMessage { params, .. } => { // Check that the config has been consumed during upgrade // OR we didn't provide it at all (for dev purposes). 
if tls.is_some() { - let msg = "connection is insecure (try using `sslmode=require`)"; - stream.write_message(&Be::ErrorResponse(msg)).await?; - bail!(msg); + stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; } - break Ok(Some((stream, params.try_into()?))); + // Here and forth: `or_else` demands that we use a future here + let creds = async { params.try_into() } + .or_else(|e| stream.throw_error(e)) + .await?; + + break Ok(Some((stream, creds))); } CancelRequest(cancel_key_data) => { cancel_map.cancel_session(cancel_key_data).await?; @@ -150,58 +156,60 @@ async fn handshake( } } -async fn connect_client_to_db( - config: &ProxyConfig, - session: cancellation::Session<'_>, - mut client: PqStream, +/// Thin connection context. +struct Client { + /// The underlying libpq protocol stream. + stream: PqStream, + /// Client credentials that we care about. creds: auth::ClientCredentials, -) -> anyhow::Result<()> { - let db_info = creds.authenticate(config, &mut client).await?; - let (db, version, cancel_closure) = connect_to_db(db_info).await?; - let cancel_key_data = session.enable_cancellation(cancel_closure); - - client - .write_message_noflush(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion(&version), - ))? - .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? - .write_message(&BeMessage::ReadyForQuery) - .await?; - - // This function will be called for writes to either direction. - fn inc_proxied(cnt: usize) { - // Consider inventing something more sophisticated - // if this ever becomes a bottleneck (cacheline bouncing). - NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64); - } - - let mut db = MetricsStream::new(db, inc_proxied); - let mut client = MetricsStream::new(client.into_inner(), inc_proxied); - let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; - - Ok(()) } -/// Connect to a corresponding compute node. -async fn connect_to_db( - db_info: DatabaseInfo, -) -> anyhow::Result<(TcpStream, String, CancelClosure)> { - // TODO: establish a secure connection to the DB - let socket_addr = db_info.socket_addr()?; - let mut socket = TcpStream::connect(socket_addr).await?; +impl Client { + /// Construct a new connection context. + fn new(stream: PqStream, creds: auth::ClientCredentials) -> Self { + Self { stream, creds } + } +} - let (client, conn) = tokio_postgres::Config::from(db_info) - .connect_raw(&mut socket, NoTls) - .await?; +impl Client { + /// Let the client authenticate and connect to the designated compute node. + async fn connect_to_db( + self, + config: &ProxyConfig, + session: cancellation::Session<'_>, + ) -> anyhow::Result<()> { + let Self { mut stream, creds } = self; - let version = conn - .parameter("server_version") - .context("failed to fetch postgres server version")? - .into(); + // Authenticate and connect to a compute node. + let auth = creds.authenticate(config, &mut stream).await; + let db_info = async { auth }.or_else(|e| stream.throw_error(e)).await?; - let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); + let (db, version, cancel_closure) = + db_info.connect().or_else(|e| stream.throw_error(e)).await?; + let cancel_key_data = session.enable_cancellation(cancel_closure); - Ok((socket, version, cancel_closure)) + stream + .write_message_noflush(&BeMessage::ParameterStatus( + BeParameterStatusMessage::ServerVersion(&version), + ))? + .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? 
+ .write_message(&BeMessage::ReadyForQuery) + .await?; + + /// This function will be called for writes to either direction. + fn inc_proxied(cnt: usize) { + // Consider inventing something more sophisticated + // if this ever becomes a bottleneck (cacheline bouncing). + NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64); + } + + // Starting from here we only proxy the client's traffic. + let mut db = MetricsStream::new(db, inc_proxied); + let mut client = MetricsStream::new(stream.into_inner(), inc_proxied); + let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; + + Ok(()) + } } #[cfg(test)] @@ -210,7 +218,7 @@ mod tests { use tokio::io::DuplexStream; use tokio_postgres::config::SslMode; - use tokio_postgres::tls::MakeTlsConnect; + use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use tokio_postgres_rustls::MakeRustlsConnect; async fn dummy_proxy( @@ -264,7 +272,7 @@ mod tests { let proxy = tokio::spawn(dummy_proxy(client, Some(server_config.into()))); - tokio_postgres::Config::new() + let client_err = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) @@ -273,11 +281,15 @@ mod tests { .err() // -> Option .context("client shouldn't be able to connect")?; - proxy + assert!(client_err.to_string().contains(ERR_INSECURE_CONNECTION)); + + let server_err = proxy .await? .err() // -> Option .context("server shouldn't accept client")?; + assert!(client_err.to_string().contains(&server_err.to_string())); + Ok(()) } @@ -329,4 +341,30 @@ mod tests { proxy.await? } + + #[tokio::test] + async fn give_user_an_error_for_bad_creds() -> anyhow::Result<()> { + let (client, server) = tokio::io::duplex(1024); + + let proxy = tokio::spawn(dummy_proxy(client, None)); + + let client_err = tokio_postgres::Config::new() + .ssl_mode(SslMode::Disable) + .connect_raw(server, NoTls) + .await + .err() // -> Option + .context("client shouldn't be able to connect")?; + + // TODO: this is ugly, but `format!` won't allow us to extract fmt string + assert!(client_err.to_string().contains("missing in startup packet")); + + let server_err = proxy + .await? + .err() // -> Option + .context("server shouldn't accept client")?; + + assert!(client_err.to_string().contains(&server_err.to_string())); + + Ok(()) + } } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 8fd5bef388..fb0be84584 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,10 +1,12 @@ -use anyhow::Context; +use crate::error::UserFacingError; +use anyhow::bail; use bytes::BytesMut; use pin_project_lite::pin_project; use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; use std::{io, task}; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, ReadBuf}; use tokio_rustls::server::TlsStream; use zenith_utils::pq_proto::{BeMessage, FeMessage, FeStartupPacket}; @@ -35,38 +37,63 @@ impl PqStream { self.stream } - /// Get a reference to the underlying stream. + /// Get a shared reference to the underlying stream. pub fn get_ref(&self) -> &S { &self.stream } } +fn err_connection() -> io::Error { + io::Error::new(io::ErrorKind::ConnectionAborted, "connection is lost") +} + +// TODO: change error type of `FeMessage::read_fut` +fn from_anyhow(e: anyhow::Error) -> io::Error { + io::Error::new(io::ErrorKind::Other, e.to_string()) +} + impl PqStream { /// Receive [`FeStartupPacket`], which is a first packet sent by a client. - pub async fn read_startup_packet(&mut self) -> anyhow::Result { - match FeStartupPacket::read_fut(&mut self.stream).await? 
{ - Some(FeMessage::StartupPacket(packet)) => Ok(packet), - None => anyhow::bail!("connection is lost"), - other => anyhow::bail!("bad message type: {:?}", other), + pub async fn read_startup_packet(&mut self) -> io::Result { + // TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket` + let msg = FeStartupPacket::read_fut(&mut self.stream) + .await + .map_err(from_anyhow)? + .ok_or_else(err_connection)?; + + match msg { + FeMessage::StartupPacket(packet) => Ok(packet), + _ => panic!("unreachable state"), } } - pub async fn read_message(&mut self) -> anyhow::Result { + pub async fn read_password_message(&mut self) -> io::Result { + match self.read_message().await? { + FeMessage::PasswordMessage(msg) => Ok(msg), + bad => Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unexpected message type: {:?}", bad), + )), + } + } + + async fn read_message(&mut self) -> io::Result { FeMessage::read_fut(&mut self.stream) - .await? - .context("connection is lost") + .await + .map_err(from_anyhow)? + .ok_or_else(err_connection) } } impl PqStream { /// Write the message into an internal buffer, but don't flush the underlying stream. - pub fn write_message_noflush<'a>(&mut self, message: &BeMessage<'a>) -> io::Result<&mut Self> { + pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { BeMessage::write(&mut self.buffer, message)?; Ok(self) } /// Write the message into an internal buffer and flush it. - pub async fn write_message<'a>(&mut self, message: &BeMessage<'a>) -> io::Result<&mut Self> { + pub async fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { self.write_message_noflush(message)?; self.flush().await?; Ok(self) @@ -79,6 +106,25 @@ impl PqStream { self.stream.flush().await?; Ok(self) } + + /// Write the error message using [`Self::write_message`], then re-throw it. + /// Allowing string literals is safe under the assumption they might not contain any runtime info. + pub async fn throw_error_str(&mut self, error: &'static str) -> anyhow::Result { + // This method exists due to `&str` not implementing `Into` + self.write_message(&BeMessage::ErrorResponse(error)).await?; + bail!(error) + } + + /// Write the error message using [`Self::write_message`], then re-throw it. + /// Trait [`UserFacingError`] acts as an allowlist for error types. + pub async fn throw_error(&mut self, error: E) -> anyhow::Result + where + E: UserFacingError + Into, + { + let msg = error.to_string_client(); + self.write_message(&BeMessage::ErrorResponse(&msg)).await?; + bail!(error) + } } pin_project! { @@ -101,15 +147,25 @@ impl Stream { } } +#[derive(Debug, Error)] +#[error("Can't upgrade TLS stream")] +pub enum StreamUpgradeError { + #[error("Bad state reached: can't upgrade TLS stream")] + AlreadyTls, + + #[error("Can't upgrade stream: IO error: {0}")] + Io(#[from] io::Error), +} + impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. - pub async fn upgrade(self, cfg: Arc) -> anyhow::Result { + pub async fn upgrade(self, cfg: Arc) -> Result { match self { Stream::Raw { raw } => { let tls = Box::new(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?); Ok(Stream::Tls { tls }) } - Stream::Tls { .. } => anyhow::bail!("can't upgrade TLS stream"), + Stream::Tls { .. 
} => Err(StreamUpgradeError::AlreadyTls), } } } diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 9fda3ed94f..799d45a165 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -1,11 +1,32 @@ -use anyhow::{anyhow, Context}; use hashbrown::HashMap; use parking_lot::Mutex; use pin_project_lite::pin_project; use std::pin::Pin; use std::task; +use thiserror::Error; use tokio::sync::oneshot; +#[derive(Debug, Error)] +pub enum RegisterError { + #[error("Waiter `{0}` already registered")] + Occupied(String), +} + +#[derive(Debug, Error)] +pub enum NotifyError { + #[error("Notify failed: waiter `{0}` not registered")] + NotFound(String), + + #[error("Notify failed: channel hangup")] + Hangup, +} + +#[derive(Debug, Error)] +pub enum WaitError { + #[error("Wait failed: channel hangup")] + Hangup, +} + pub struct Waiters(pub(self) Mutex>>); impl Default for Waiters { @@ -15,13 +36,13 @@ impl Default for Waiters { } impl Waiters { - pub fn register(&self, key: String) -> anyhow::Result> { + pub fn register(&self, key: String) -> Result, RegisterError> { let (tx, rx) = oneshot::channel(); self.0 .lock() .try_insert(key.clone(), tx) - .map_err(|_| anyhow!("waiter already registered"))?; + .map_err(|e| RegisterError::Occupied(e.entry.key().clone()))?; Ok(Waiter { receiver: rx, @@ -32,7 +53,7 @@ impl Waiters { }) } - pub fn notify(&self, key: &str, value: T) -> anyhow::Result<()> + pub fn notify(&self, key: &str, value: T) -> Result<(), NotifyError> where T: Send + Sync, { @@ -40,9 +61,9 @@ impl Waiters { .0 .lock() .remove(key) - .with_context(|| format!("key {} not found", key))?; + .ok_or_else(|| NotifyError::NotFound(key.to_string()))?; - tx.send(value).map_err(|_| anyhow!("waiter channel hangup")) + tx.send(value).map_err(|_| NotifyError::Hangup) } } @@ -66,13 +87,13 @@ pin_project! 
{ } impl std::future::Future for Waiter<'_, T> { - type Output = anyhow::Result; + type Output = Result; fn poll(self: Pin<&mut Self>, cx: &mut task::Context<'_>) -> task::Poll { self.project() .receiver .poll(cx) - .map_err(|_| anyhow!("channel hangup")) + .map_err(|_| WaitError::Hangup) } } From 15a2a2bf0446653e6a737b932b3c11f616ec20ec Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Wed, 16 Mar 2022 23:00:01 +0300 Subject: [PATCH 0037/1022] release 2202-03-16 (#1373) production deploy --- .circleci/config.yml | 30 +- .github/workflows/benchmarking.yml | 2 +- Cargo.lock | 186 ++++++ Dockerfile | 86 +-- Dockerfile.build | 33 +- README.md | 18 +- compute_tools/src/pg_helpers.rs | 2 +- compute_tools/src/spec.rs | 2 +- control_plane/Cargo.toml | 1 + control_plane/safekeepers.conf | 6 +- control_plane/simple.conf | 2 +- control_plane/src/compute.rs | 64 +- control_plane/src/local_env.rs | 191 ++++-- control_plane/src/safekeeper.rs | 38 +- control_plane/src/storage.rs | 123 ++-- docker-entrypoint.sh | 2 +- docs/docker.md | 24 +- docs/rfcs/002-storage.md | 186 ++++++ docs/rfcs/003-laptop-cli.md | 267 ++++++++ docs/rfcs/004-durability.md | 218 +++++++ docs/rfcs/005-zenith_local.md | 103 +++ docs/rfcs/006-laptop-cli-v2-CLI.md | 64 ++ .../006-laptop-cli-v2-repository-structure.md | 140 ++++ docs/rfcs/007-serverless-on-laptop.md | 93 +++ docs/rfcs/008-push-pull.md | 66 ++ docs/rfcs/009-snapshot-first-storage-cli.md | 56 ++ docs/rfcs/009-snapshot-first-storage-pitr.md | 227 +++++++ docs/rfcs/009-snapshot-first-storage.md | 148 +++++ docs/rfcs/010-storage_details.md | 144 ++++ docs/rfcs/011-retention-policy.md | 91 +++ docs/rfcs/012-background-tasks.md | 38 ++ docs/rfcs/013-term-history.md | 147 +++++ docs/rfcs/README.md | 95 +++ docs/rfcs/images/storage.jpeg | Bin 0 -> 431075 bytes pageserver/src/bin/pageserver.rs | 44 +- pageserver/src/branches.rs | 428 ------------ pageserver/src/config.rs | 301 +++++++-- pageserver/src/http/models.rs | 125 +++- pageserver/src/http/openapi_spec.yml | 221 ++----- pageserver/src/http/routes.rs | 237 +++---- pageserver/src/layered_repository.rs | 147 +++-- .../src/layered_repository/inmemory_layer.rs | 9 +- pageserver/src/lib.rs | 2 +- pageserver/src/page_service.rs | 10 +- pageserver/src/remote_storage/README.md | 8 - pageserver/src/remote_storage/storage_sync.rs | 62 +- .../remote_storage/storage_sync/download.rs | 100 +-- .../src/remote_storage/storage_sync/index.rs | 37 +- .../src/remote_storage/storage_sync/upload.rs | 94 +-- pageserver/src/repository.rs | 23 +- pageserver/src/tenant_mgr.rs | 39 +- pageserver/src/timelines.rs | 408 ++++++++++++ pageserver/src/walrecord.rs | 12 +- postgres_ffi/src/xlog_utils.rs | 4 +- proxy/Cargo.toml | 2 + proxy/src/auth.rs | 124 +++- proxy/src/cancellation.rs | 2 +- proxy/src/compute.rs | 61 +- proxy/src/config.rs | 6 +- proxy/src/cplane_api.rs | 111 +++- proxy/src/error.rs | 17 + proxy/src/http.rs | 2 +- proxy/src/main.rs | 5 +- proxy/src/mgmt.rs | 20 +- proxy/src/proxy.rs | 176 +++-- proxy/src/stream.rs | 84 ++- proxy/src/waiters.rs | 37 +- test_runner/README.md | 2 +- test_runner/batch_others/test_auth.py | 27 +- test_runner/batch_others/test_backpressure.py | 4 +- .../batch_others/test_branch_behind.py | 33 +- .../batch_others/test_clog_truncate.py | 8 +- test_runner/batch_others/test_createdropdb.py | 16 +- test_runner/batch_others/test_createuser.py | 6 +- test_runner/batch_others/test_multixact.py | 4 +- test_runner/batch_others/test_next_xid.py | 2 +- .../batch_others/test_pageserver_api.py | 43 +- 
.../batch_others/test_pageserver_catchup.py | 4 +- .../batch_others/test_pageserver_restart.py | 4 +- .../batch_others/test_parallel_copy.py | 2 - test_runner/batch_others/test_proxy.py | 13 + .../batch_others/test_readonly_node.py | 23 +- .../batch_others/test_remote_storage.py | 8 +- .../batch_others/test_restart_compute.py | 5 +- .../batch_others/test_tenant_relocation.py | 14 +- test_runner/batch_others/test_tenants.py | 26 +- .../batch_others/test_timeline_size.py | 17 +- test_runner/batch_others/test_twophase.py | 1 - test_runner/batch_others/test_wal_acceptor.py | 116 ++-- .../batch_others/test_wal_acceptor_async.py | 4 +- test_runner/batch_others/test_zenith_cli.py | 65 +- test_runner/fixtures/compare_fixtures.py | 5 +- test_runner/fixtures/zenith_fixtures.py | 434 ++++++++----- .../performance/test_bulk_tenant_create.py | 15 +- vendor/postgres | 2 +- walkeeper/src/bin/safekeeper.rs | 88 ++- walkeeper/src/control_file.rs | 104 ++- walkeeper/src/control_file_upgrade.rs | 82 ++- walkeeper/src/handler.rs | 38 +- walkeeper/src/http/mod.rs | 1 + walkeeper/src/http/models.rs | 9 + walkeeper/src/http/routes.rs | 44 +- walkeeper/src/lib.rs | 4 +- walkeeper/src/safekeeper.rs | 168 +++-- walkeeper/src/timeline.rs | 128 ++-- walkeeper/src/wal_storage.rs | 23 +- zenith/src/main.rs | 614 ++++++++++++------ zenith_utils/Cargo.toml | 5 + zenith_utils/benches/benchmarks.rs | 22 + zenith_utils/src/auth.rs | 44 +- zenith_utils/src/zid.rs | 229 +++++-- 111 files changed, 5846 insertions(+), 2511 deletions(-) create mode 100644 docs/rfcs/002-storage.md create mode 100644 docs/rfcs/003-laptop-cli.md create mode 100644 docs/rfcs/004-durability.md create mode 100644 docs/rfcs/005-zenith_local.md create mode 100644 docs/rfcs/006-laptop-cli-v2-CLI.md create mode 100644 docs/rfcs/006-laptop-cli-v2-repository-structure.md create mode 100644 docs/rfcs/007-serverless-on-laptop.md create mode 100644 docs/rfcs/008-push-pull.md create mode 100644 docs/rfcs/009-snapshot-first-storage-cli.md create mode 100644 docs/rfcs/009-snapshot-first-storage-pitr.md create mode 100644 docs/rfcs/009-snapshot-first-storage.md create mode 100644 docs/rfcs/010-storage_details.md create mode 100644 docs/rfcs/011-retention-policy.md create mode 100644 docs/rfcs/012-background-tasks.md create mode 100644 docs/rfcs/013-term-history.md create mode 100644 docs/rfcs/README.md create mode 100644 docs/rfcs/images/storage.jpeg delete mode 100644 pageserver/src/branches.rs create mode 100644 pageserver/src/timelines.rs create mode 100644 proxy/src/error.rs create mode 100644 walkeeper/src/http/models.rs create mode 100644 zenith_utils/benches/benchmarks.rs diff --git a/.circleci/config.yml b/.circleci/config.yml index db9fc31334..d342e7c9f4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -440,8 +440,14 @@ jobs: command: | echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin DOCKER_TAG=$(git log --oneline|wc -l) - docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest - docker tag zenithdb/zenith:latest zenithdb/zenith:${DOCKER_TAG} && docker push zenithdb/zenith:${DOCKER_TAG} + docker build \ + --pull \ + --build-arg GIT_VERSION=${CIRCLE_SHA1} \ + --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ + --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ + --tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:latest . 
+ docker push zenithdb/zenith:${DOCKER_TAG} + docker push zenithdb/zenith:latest # Build zenithdb/compute-node:latest image and push it to Docker hub docker-image-compute: @@ -468,8 +474,9 @@ jobs: command: | echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin DOCKER_TAG=$(git log --oneline|wc -l) - docker build -t zenithdb/compute-node:latest vendor/postgres && docker push zenithdb/compute-node:latest - docker tag zenithdb/compute-node:latest zenithdb/compute-node:${DOCKER_TAG} && docker push zenithdb/compute-node:${DOCKER_TAG} + docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:latest vendor/postgres + docker push zenithdb/compute-node:${DOCKER_TAG} + docker push zenithdb/compute-node:latest # Build production zenithdb/zenith:release image and push it to Docker hub docker-image-release: @@ -487,8 +494,14 @@ jobs: command: | echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:release . && docker push zenithdb/zenith:release - docker tag zenithdb/zenith:release zenithdb/zenith:${DOCKER_TAG} && docker push zenithdb/zenith:${DOCKER_TAG} + docker build \ + --pull \ + --build-arg GIT_VERSION=${CIRCLE_SHA1} \ + --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ + --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ + --tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:release . + docker push zenithdb/zenith:${DOCKER_TAG} + docker push zenithdb/zenith:release # Build production zenithdb/compute-node:release image and push it to Docker hub docker-image-compute-release: @@ -515,8 +528,9 @@ jobs: command: | echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build -t zenithdb/compute-node:release vendor/postgres && docker push zenithdb/compute-node:release - docker tag zenithdb/compute-node:release zenithdb/compute-node:${DOCKER_TAG} && docker push zenithdb/compute-node:${DOCKER_TAG} + docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:release vendor/postgres + docker push zenithdb/compute-node:${DOCKER_TAG} + docker push zenithdb/compute-node:release deploy-staging: docker: diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index dd23440afb..36df35297d 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -48,7 +48,7 @@ jobs: echo Python python3 --version poetry run python3 --version - echo Pipenv + echo Poetry poetry --version echo Pgbench $PG_BIN/pgbench --version diff --git a/Cargo.lock b/Cargo.lock index ba3c6729d6..750ac0edc2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -260,6 +260,18 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426" +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "bumpalo" version = "3.9.1" @@ -281,6 +293,15 @@ dependencies = [ "serde", ] +[[package]] +name = "cast" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" 
+dependencies = [ + "rustc_version", +] + [[package]] name = "cc" version = "1.0.72" @@ -424,6 +445,7 @@ dependencies = [ "thiserror", "toml", "url", + "walkeeper", "workspace_hack", "zenith_utils", ] @@ -446,6 +468,76 @@ dependencies = [ "rustc_version", ] +[[package]] +name = "criterion" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" +dependencies = [ + "atty", + "cast", + "clap 2.34.0", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e54ea8bc3fb1ee042f5aace6e3c6e025d3874866da222930f70ce62aceba0bfa" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00d6d2ea26e8b151d99093005cb442fb9a37aeaca582a03ec70946f49ab5ed9" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "lazy_static", + "memoffset", + "scopeguard", +] + [[package]] name = "crossbeam-utils" version = "0.8.7" @@ -476,6 +568,28 @@ dependencies = [ "subtle", ] +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa 0.4.8", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + [[package]] name = "daemonize" version = "0.4.1" @@ -1259,6 +1373,12 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + [[package]] name = "opaque-debug" version = "0.3.0" @@ -1443,6 +1563,34 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "plotters" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" + +[[package]] +name = "plotters-svg" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" +dependencies = [ + "plotters-backend", +] + [[package]] name = "postgres" version = "0.19.1" @@ -1591,6 +1739,7 @@ dependencies = [ "anyhow", "bytes", "clap 3.0.14", + "fail", "futures", "hashbrown 0.11.2", "hex", @@ -1606,6 +1755,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "thiserror", "tokio", "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", "tokio-postgres-rustls", @@ -1663,6 +1813,31 @@ dependencies = [ "rand_core", ] +[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + [[package]] name = "rcgen" version = "0.8.14" @@ -2232,6 +2407,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "1.5.1" @@ -2854,6 +3039,7 @@ dependencies = [ "bincode", "byteorder", "bytes", + "criterion", "git-version", "hex", "hex-literal", diff --git a/Dockerfile b/Dockerfile index dd0dba60ca..9ee6abaa8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,62 +1,62 @@ +# Build Postgres # -# Docker image for console integration testing. -# +#FROM zimg/rust:1.56 AS pg-build +FROM zenithdb/build:buster-20220309 AS pg-build +WORKDIR /pg + +USER root + +COPY vendor/postgres vendor/postgres +COPY Makefile Makefile -# -# Build Postgres separately --- this layer will be rebuilt only if one of -# mentioned paths will get any changes. -# -FROM zenithdb/build:buster AS pg-build -WORKDIR /zenith -COPY ./vendor/postgres vendor/postgres -COPY ./Makefile Makefile ENV BUILD_TYPE release -RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres -RUN rm -rf postgres_install/build +RUN set -e \ + && make -j $(nproc) -s postgres \ + && rm -rf tmp_install/build \ + && tar -C tmp_install -czf /postgres_install.tar.gz . -# # Build zenith binaries # -# TODO: build cargo deps as separate layer. We used cargo-chef before but that was -# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work. 
-# -FROM zenithdb/build:buster AS build +#FROM zimg/rust:1.56 AS build +FROM zenithdb/build:buster-20220309 AS build +ARG GIT_VERSION=local -ARG GIT_VERSION -RUN if [ -z "$GIT_VERSION" ]; then echo "GIT_VERSION is reqired, use build_arg to pass it"; exit 1; fi - -WORKDIR /zenith -COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server +ARG CACHEPOT_BUCKET=zenith-rust-cachepot +ARG AWS_ACCESS_KEY_ID +ARG AWS_SECRET_ACCESS_KEY +#ENV RUSTC_WRAPPER cachepot +ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot +COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server COPY . . -RUN GIT_VERSION=$GIT_VERSION cargo build --release +RUN cargo build --release + +# Build final image # -# Copy binaries to resulting image. -# -FROM debian:buster-slim +FROM debian:bullseye-slim WORKDIR /data -RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl ca-certificates && \ - mkdir zenith_install +RUN set -e \ + && apt-get update \ + && apt-get install -y \ + libreadline-dev \ + libseccomp-dev \ + openssl \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ + && useradd -d /data zenith \ + && chown -R zenith:zenith /data + +COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy /usr/local/bin + +COPY --from=pg-build /pg/tmp_install/ /usr/local/ +COPY --from=pg-build /postgres_install.tar.gz /data/ -COPY --from=build /zenith/target/release/pageserver /usr/local/bin -COPY --from=build /zenith/target/release/safekeeper /usr/local/bin -COPY --from=build /zenith/target/release/proxy /usr/local/bin -COPY --from=pg-build /zenith/tmp_install postgres_install COPY docker-entrypoint.sh /docker-entrypoint.sh -# Remove build artifacts (~ 500 MB) -RUN rm -rf postgres_install/build && \ - # 'Install' Postgres binaries locally - cp -r postgres_install/* /usr/local/ && \ - # Prepare an archive of Postgres binaries (should be around 11 MB) - # and keep it inside container for an ease of deploy pipeline. - cd postgres_install && tar -czf /data/postgres_install.tar.gz . && cd .. && \ - rm -rf postgres_install - -RUN useradd -d /data zenith && chown -R zenith:zenith /data - VOLUME ["/data"] USER zenith EXPOSE 6400 diff --git a/Dockerfile.build b/Dockerfile.build index a9fd2cb0af..44a2aaafb9 100644 --- a/Dockerfile.build +++ b/Dockerfile.build @@ -1,16 +1,23 @@ -# -# Image with all the required dependencies to build https://github.com/zenithdb/zenith -# and Postgres from https://github.com/zenithdb/postgres -# Also includes some rust development and build tools. 
-# NB: keep in sync with rust image version in .circle/config.yml -# FROM rust:1.56.1-slim-buster -WORKDIR /zenith +WORKDIR /home/circleci/project -# Install postgres and zenith build dependencies -# clang is for rocksdb -RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libseccomp-dev pkg-config libssl-dev clang +RUN set -e \ + && apt-get update \ + && apt-get -yq install \ + automake \ + libtool \ + build-essential \ + bison \ + flex \ + libreadline-dev \ + zlib1g-dev \ + libxml2-dev \ + libseccomp-dev \ + pkg-config \ + libssl-dev \ + clang -# Install rust tools -RUN rustup component add clippy && cargo install cargo-audit +RUN set -e \ + && rustup component add clippy \ + && cargo install cargo-audit \ + && cargo install --git https://github.com/paritytech/cachepot diff --git a/README.md b/README.md index 8dd407f41a..c8acf526b9 100644 --- a/README.md +++ b/README.md @@ -57,12 +57,12 @@ pageserver init succeeded Starting pageserver at 'localhost:64000' in '.zenith' Pageserver started initializing for single for 7676 -Starting safekeeper at 'localhost:5454' in '.zenith/safekeepers/single' +Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single' Safekeeper started # start postgres compute node > ./target/debug/zenith pg start main -Starting new postgres main on main... +Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ... Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432 Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres' waiting for server to start.... done @@ -70,8 +70,8 @@ server started # check list of running postgres instances > ./target/debug/zenith pg list -BRANCH ADDRESS LSN STATUS -main 127.0.0.1:55432 0/1609610 running +NODE ADDRESS TIMELINES BRANCH NAME LSN STATUS +main 127.0.0.1:55432 5b014a9e41b4b63ce1a1febc04503636 main 0/1609610 running ``` 4. Now it is possible to connect to postgres and run some queries: @@ -91,13 +91,13 @@ postgres=# select * from t; 5. And create branches and run postgres on them: ```sh # create branch named migration_check -> ./target/debug/zenith branch migration_check main -Created branch 'migration_check' at 0/1609610 +> ./target/debug/zenith timeline branch --branch-name migration_check +Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main' # check branches tree -> ./target/debug/zenith branch - main - ┗━ @0/1609610: migration_check +> ./target/debug/zenith timeline list + main [5b014a9e41b4b63ce1a1febc04503636] + ┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9] # start postgres on that branch > ./target/debug/zenith pg start migration_check diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 8b6dc04069..6a22b865fa 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -171,7 +171,7 @@ impl PgQuote for PgIdent { /// always quotes provided string with `""` and escapes every `"`. Not idempotent, /// i.e. if string is already escaped it will be escaped again. 
fn quote(&self) -> String { - let result = format!("\"{}\"", self.replace("\"", "\"\"")); + let result = format!("\"{}\"", self.replace('"', "\"\"")); result } } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 41e4174bf0..1dd7c0044e 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -215,7 +215,7 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> { if let Some(r) = pg_db { // XXX: db owner name is returned as quoted string from Postgres, // when quoting is needed. - let new_owner = if r.owner.starts_with('\"') { + let new_owner = if r.owner.starts_with('"') { db.owner.quote() } else { db.owner.clone() diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 5e972200c2..eff6b3ef2d 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -17,5 +17,6 @@ url = "2.2.2" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } pageserver = { path = "../pageserver" } +walkeeper = { path = "../walkeeper" } zenith_utils = { path = "../zenith_utils" } workspace_hack = { path = "../workspace_hack" } diff --git a/control_plane/safekeepers.conf b/control_plane/safekeepers.conf index 828d5a5a1e..df7dd2adca 100644 --- a/control_plane/safekeepers.conf +++ b/control_plane/safekeepers.conf @@ -5,16 +5,16 @@ listen_http_addr = '127.0.0.1:9898' auth_type = 'Trust' [[safekeepers]] -name = 'sk1' +id = 1 pg_port = 5454 http_port = 7676 [[safekeepers]] -name = 'sk2' +id = 2 pg_port = 5455 http_port = 7677 [[safekeepers]] -name = 'sk3' +id = 3 pg_port = 5456 http_port = 7678 diff --git a/control_plane/simple.conf b/control_plane/simple.conf index 796c6adbd9..2243a0a5f8 100644 --- a/control_plane/simple.conf +++ b/control_plane/simple.conf @@ -6,6 +6,6 @@ listen_http_addr = '127.0.0.1:9898' auth_type = 'Trust' [[safekeepers]] -name = 'single' +id = 1 pg_port = 5454 http_port = 7676 diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index a61191e7a4..64cd46fef6 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -37,7 +37,7 @@ impl ComputeControlPlane { // pgdatadirs // |- tenants // | |- - // | | |- + // | | |- pub fn load(env: LocalEnv) -> Result { let pageserver = Arc::new(PageServerNode::from_env(&env)); @@ -52,7 +52,7 @@ impl ComputeControlPlane { .with_context(|| format!("failed to list {}", tenant_dir.path().display()))? { let node = PostgresNode::from_dir_entry(timeline_dir?, &env, &pageserver)?; - nodes.insert((node.tenantid, node.name.clone()), Arc::new(node)); + nodes.insert((node.tenant_id, node.name.clone()), Arc::new(node)); } } @@ -73,40 +73,14 @@ impl ComputeControlPlane { .unwrap_or(self.base_port) } - // FIXME: see also parse_point_in_time in branches.rs. - fn parse_point_in_time( - &self, - tenantid: ZTenantId, - s: &str, - ) -> Result<(ZTimelineId, Option)> { - let mut strings = s.split('@'); - let name = strings.next().unwrap(); - - let lsn = strings - .next() - .map(Lsn::from_str) - .transpose() - .context("invalid LSN in point-in-time specification")?; - - // Resolve the timeline ID, given the human-readable branch name - let timeline_id = self - .pageserver - .branch_get_by_name(&tenantid, name)? 
- .timeline_id; - - Ok((timeline_id, lsn)) - } - pub fn new_node( &mut self, - tenantid: ZTenantId, + tenant_id: ZTenantId, name: &str, - timeline_spec: &str, + timeline_id: ZTimelineId, + lsn: Option, port: Option, ) -> Result> { - // Resolve the human-readable timeline spec into timeline ID and LSN - let (timelineid, lsn) = self.parse_point_in_time(tenantid, timeline_spec)?; - let port = port.unwrap_or_else(|| self.get_port()); let node = Arc::new(PostgresNode { name: name.to_owned(), @@ -114,9 +88,9 @@ impl ComputeControlPlane { env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), is_test: false, - timelineid, + timeline_id, lsn, - tenantid, + tenant_id, uses_wal_proposer: false, }); @@ -124,7 +98,7 @@ impl ComputeControlPlane { node.setup_pg_conf(self.env.pageserver.auth_type)?; self.nodes - .insert((tenantid, node.name.clone()), Arc::clone(&node)); + .insert((tenant_id, node.name.clone()), Arc::clone(&node)); Ok(node) } @@ -139,9 +113,9 @@ pub struct PostgresNode { pub env: LocalEnv, pageserver: Arc, is_test: bool, - pub timelineid: ZTimelineId, + pub timeline_id: ZTimelineId, pub lsn: Option, // if it's a read-only node. None for primary - pub tenantid: ZTenantId, + pub tenant_id: ZTenantId, uses_wal_proposer: bool, } @@ -173,8 +147,8 @@ impl PostgresNode { // Read a few options from the config file let context = format!("in config file {}", cfg_path_str); let port: u16 = conf.parse_field("port", &context)?; - let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?; - let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?; + let timeline_id: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?; + let tenant_id: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?; let uses_wal_proposer = conf.get("wal_acceptors").is_some(); // parse recovery_target_lsn, if any @@ -188,9 +162,9 @@ impl PostgresNode { env: env.clone(), pageserver: Arc::clone(pageserver), is_test: false, - timelineid, + timeline_id, lsn: recovery_target_lsn, - tenantid, + tenant_id, uses_wal_proposer, }) } @@ -241,9 +215,9 @@ impl PostgresNode { ); let sql = if let Some(lsn) = lsn { - format!("basebackup {} {} {}", self.tenantid, self.timelineid, lsn) + format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn) } else { - format!("basebackup {} {}", self.tenantid, self.timelineid) + format!("basebackup {} {}", self.tenant_id, self.timeline_id) }; let mut client = self @@ -329,8 +303,8 @@ impl PostgresNode { conf.append("shared_preload_libraries", "zenith"); conf.append_line(""); conf.append("zenith.page_server_connstring", &pageserver_connstr); - conf.append("zenith.zenith_tenant", &self.tenantid.to_string()); - conf.append("zenith.zenith_timeline", &self.timelineid.to_string()); + conf.append("zenith.zenith_tenant", &self.tenant_id.to_string()); + conf.append("zenith.zenith_timeline", &self.timeline_id.to_string()); if let Some(lsn) = self.lsn { conf.append("recovery_target_lsn", &lsn.to_string()); } @@ -408,7 +382,7 @@ impl PostgresNode { } pub fn pgdata(&self) -> PathBuf { - self.env.pg_data_dir(&self.tenantid, &self.name) + self.env.pg_data_dir(&self.tenant_id, &self.name) } pub fn status(&self) -> &str { diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index b80e137cb9..2a1d51fe08 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,16 +3,20 @@ //! Now it also provides init method which acts like a stub for proper installation //! 
script which will use local paths. -use anyhow::{bail, Context}; +use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; +use std::collections::HashMap; use std::env; -use std::fmt::Write; use std::fs; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use zenith_utils::auth::{encode_from_key_file, Claims, Scope}; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{opt_display_serde, ZTenantId}; +use zenith_utils::zid::{ + HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId, +}; + +use crate::safekeeper::SafekeeperNode; // // This data structures represents zenith CLI config @@ -21,7 +25,7 @@ use zenith_utils::zid::{opt_display_serde, ZTenantId}; // to 'zenith init --config=' option. See control_plane/simple.conf for // an example. // -#[derive(Serialize, Deserialize, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct LocalEnv { // Base directory for all the nodes (the pageserver, safekeepers and // compute nodes). @@ -45,9 +49,8 @@ pub struct LocalEnv { // Default tenant ID to use with the 'zenith' command line utility, when // --tenantid is not explicitly specified. - #[serde(with = "opt_display_serde")] #[serde(default)] - pub default_tenantid: Option, + pub default_tenant_id: Option, // used to issue tokens during e.g pg start #[serde(default)] @@ -57,11 +60,20 @@ pub struct LocalEnv { #[serde(default)] pub safekeepers: Vec, + + /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. + #[serde(default)] + // A `HashMap>` would be more appropriate here, + // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. + // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". 
+ branch_name_mappings: HashMap>, } -#[derive(Serialize, Deserialize, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct PageServerConf { + // node id + pub id: ZNodeId, // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, @@ -76,6 +88,7 @@ pub struct PageServerConf { impl Default for PageServerConf { fn default() -> Self { Self { + id: ZNodeId(0), listen_pg_addr: String::new(), listen_http_addr: String::new(), auth_type: AuthType::Trust, @@ -84,10 +97,10 @@ impl Default for PageServerConf { } } -#[derive(Serialize, Deserialize, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct SafekeeperConf { - pub name: String, + pub id: ZNodeId, pub pg_port: u16, pub http_port: u16, pub sync: bool, @@ -96,7 +109,7 @@ pub struct SafekeeperConf { impl Default for SafekeeperConf { fn default() -> Self { Self { - name: String::new(), + id: ZNodeId(0), pg_port: 0, http_port: 0, sync: true, @@ -136,8 +149,74 @@ impl LocalEnv { self.base_data_dir.clone() } - pub fn safekeeper_data_dir(&self, node_name: &str) -> PathBuf { - self.base_data_dir.join("safekeepers").join(node_name) + pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf { + self.base_data_dir.join("safekeepers").join(data_dir_name) + } + + pub fn register_branch_mapping( + &mut self, + branch_name: String, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + ) -> anyhow::Result<()> { + let existing_values = self + .branch_name_mappings + .entry(branch_name.clone()) + .or_default(); + + let tenant_id = HexZTenantId::from(tenant_id); + let timeline_id = HexZTimelineId::from(timeline_id); + + let existing_ids = existing_values + .iter() + .find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id); + + if let Some((_, old_timeline_id)) = existing_ids { + if old_timeline_id == &timeline_id { + Ok(()) + } else { + bail!( + "branch '{}' is already mapped to timeline {}, cannot map to another timeline {}", + branch_name, + old_timeline_id, + timeline_id + ); + } + } else { + existing_values.push((tenant_id, timeline_id)); + Ok(()) + } + } + + pub fn get_branch_timeline_id( + &self, + branch_name: &str, + tenant_id: ZTenantId, + ) -> Option { + let tenant_id = HexZTenantId::from(tenant_id); + self.branch_name_mappings + .get(branch_name)? + .iter() + .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) + .map(|&(_, timeline_id)| timeline_id) + .map(ZTimelineId::from) + } + + pub fn timeline_name_mappings(&self) -> HashMap { + self.branch_name_mappings + .iter() + .flat_map(|(name, tenant_timelines)| { + tenant_timelines.iter().map(|&(tenant_id, timeline_id)| { + ( + ZTenantTimelineId::new( + ZTenantId::from(tenant_id), + ZTimelineId::from(timeline_id), + ), + name.clone(), + ) + }) + }) + .collect() } /// Create a LocalEnv from a config file. @@ -179,8 +258,8 @@ impl LocalEnv { } // If no initial tenant ID was given, generate it. - if env.default_tenantid.is_none() { - env.default_tenantid = Some(ZTenantId::generate()); + if env.default_tenant_id.is_none() { + env.default_tenant_id = Some(HexZTenantId::from(ZTenantId::generate())); } env.base_data_dir = base_path(); @@ -210,6 +289,39 @@ impl LocalEnv { Ok(env) } + pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { + // Currently, the user first passes a config file with 'zenith init --config=' + // We read that in, in `create_config`, and fill any missing defaults. Then it's saved + // to .zenith/config. 
TODO: We lose any formatting and comments along the way, which is + // a bit sad. + let mut conf_content = r#"# This file describes a locale deployment of the page server +# and safekeeeper node. It is read by the 'zenith' command-line +# utility. +"# + .to_string(); + + // Convert the LocalEnv to a toml file. + // + // This could be as simple as this: + // + // conf_content += &toml::to_string_pretty(env)?; + // + // But it results in a "values must be emitted before tables". I'm not sure + // why, AFAICS the table, i.e. 'safekeepers: Vec' is last. + // Maybe rust reorders the fields to squeeze avoid padding or something? + // In any case, converting to toml::Value first, and serializing that, works. + // See https://github.com/alexcrichton/toml-rs/issues/142 + conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?; + + let target_config_path = base_path.join("config"); + fs::write(&target_config_path, conf_content).with_context(|| { + format!( + "Failed to write config file into path '{}'", + target_config_path.display() + ) + }) + } + // this function is used only for testing purposes in CLI e g generate tokens during init pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result { let private_key_path = if self.private_key_path.is_absolute() { @@ -228,15 +340,15 @@ impl LocalEnv { pub fn init(&mut self) -> anyhow::Result<()> { // check if config already exists let base_path = &self.base_data_dir; - if base_path == Path::new("") { - bail!("repository base path is missing"); - } - if base_path.exists() { - bail!( - "directory '{}' already exists. Perhaps already initialized?", - base_path.to_str().unwrap() - ); - } + ensure!( + base_path != Path::new(""), + "repository base path is missing" + ); + ensure!( + !base_path.exists(), + "directory '{}' already exists. Perhaps already initialized?", + base_path.display() + ); fs::create_dir(&base_path)?; @@ -285,39 +397,10 @@ impl LocalEnv { fs::create_dir_all(self.pg_data_dirs_path())?; for safekeeper in &self.safekeepers { - fs::create_dir_all(self.safekeeper_data_dir(&safekeeper.name))?; + fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?; } - let mut conf_content = String::new(); - - // Currently, the user first passes a config file with 'zenith init --config=' - // We read that in, in `create_config`, and fill any missing defaults. Then it's saved - // to .zenith/config. TODO: We lose any formatting and comments along the way, which is - // a bit sad. - write!( - &mut conf_content, - r#"# This file describes a locale deployment of the page server -# and safekeeeper node. It is read by the 'zenith' command-line -# utility. -"# - )?; - - // Convert the LocalEnv to a toml file. - // - // This could be as simple as this: - // - // conf_content += &toml::to_string_pretty(env)?; - // - // But it results in a "values must be emitted before tables". I'm not sure - // why, AFAICS the table, i.e. 'safekeepers: Vec' is last. - // Maybe rust reorders the fields to squeeze avoid padding or something? - // In any case, converting to toml::Value first, and serializing that, works. 
- // See https://github.com/alexcrichton/toml-rs/issues/142 - conf_content += &toml::to_string_pretty(&toml::Value::try_from(&self)?)?; - - fs::write(base_path.join("config"), conf_content)?; - - Ok(()) + self.persist_config(base_path) } } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index f5478b5922..969e2cd531 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -14,7 +14,9 @@ use postgres::Config; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; +use walkeeper::http::models::TimelineCreateRequest; use zenith_utils::http::error::HttpErrorBody; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; use crate::local_env::{LocalEnv, SafekeeperConf}; use crate::storage::PageServerNode; @@ -61,7 +63,7 @@ impl ResponseErrorMessageExt for Response { // #[derive(Debug)] pub struct SafekeeperNode { - pub name: String, + pub id: ZNodeId, pub conf: SafekeeperConf, @@ -77,10 +79,10 @@ impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { let pageserver = Arc::new(PageServerNode::from_env(env)); - println!("initializing for {} for {}", conf.name, conf.http_port); + println!("initializing for sk {} for {}", conf.id, conf.http_port); SafekeeperNode { - name: conf.name.clone(), + id: conf.id, conf: conf.clone(), pg_connection_config: Self::safekeeper_connection_config(conf.pg_port), env: env.clone(), @@ -98,8 +100,12 @@ impl SafekeeperNode { .unwrap() } + pub fn datadir_path_by_id(env: &LocalEnv, sk_id: ZNodeId) -> PathBuf { + env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref()) + } + pub fn datadir_path(&self) -> PathBuf { - self.env.safekeeper_data_dir(&self.name) + SafekeeperNode::datadir_path_by_id(&self.env, self.id) } pub fn pid_file(&self) -> PathBuf { @@ -120,6 +126,7 @@ impl SafekeeperNode { let mut cmd = Command::new(self.env.safekeeper_bin()?); fill_rust_env_vars( cmd.args(&["-D", self.datadir_path().to_str().unwrap()]) + .args(&["--id", self.id.to_string().as_ref()]) .args(&["--listen-pg", &listen_pg]) .args(&["--listen-http", &listen_http]) .args(&["--recall", "1 second"]) @@ -183,7 +190,7 @@ impl SafekeeperNode { pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { let pid_file = self.pid_file(); if !pid_file.exists() { - println!("Safekeeper {} is already stopped", self.name); + println!("Safekeeper {} is already stopped", self.id); return Ok(()); } let pid = read_pidfile(&pid_file)?; @@ -255,4 +262,25 @@ impl SafekeeperNode { .error_from_body()?; Ok(()) } + + pub fn timeline_create( + &self, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + peer_ids: Vec, + ) -> Result<()> { + Ok(self + .http_request( + Method::POST, + format!("{}/{}", self.http_base_url, "timeline"), + ) + .json(&TimelineCreateRequest { + tenant_id, + timeline_id, + peer_ids, + }) + .send()? + .error_from_body()? + .json()?) 
+ } } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index be594889ab..f6b7173067 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,3 +1,4 @@ +use std::convert::TryFrom; use std::io::Write; use std::net::TcpStream; use std::path::PathBuf; @@ -5,22 +6,23 @@ use std::process::Command; use std::time::Duration; use std::{io, result, thread}; -use anyhow::bail; +use anyhow::{bail, Context}; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; -use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest}; +use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse}; +use pageserver::timelines::TimelineInfo; use postgres::{Config, NoTls}; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; use zenith_utils::http::error::HttpErrorBody; +use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::ZTenantId; +use zenith_utils::zid::{HexZTenantId, HexZTimelineId, ZTenantId, ZTimelineId}; use crate::local_env::LocalEnv; use crate::{fill_rust_env_vars, read_pidfile}; -use pageserver::branches::BranchInfo; use pageserver::tenant_mgr::TenantInfo; use zenith_utils::connstring::connection_address; @@ -98,11 +100,14 @@ impl PageServerNode { pub fn init( &self, - create_tenant: Option<&str>, + create_tenant: Option, + initial_timeline_id: Option, config_overrides: &[&str], - ) -> anyhow::Result<()> { + ) -> anyhow::Result { let mut cmd = Command::new(self.env.pageserver_bin()?); + let id = format!("id={}", self.env.pageserver.id); + // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. let base_data_dir_param = self.env.base_data_dir.display().to_string(); let pg_distrib_dir_param = @@ -122,6 +127,7 @@ impl PageServerNode { args.extend(["-c", &authg_type_param]); args.extend(["-c", &listen_http_addr_param]); args.extend(["-c", &listen_pg_addr_param]); + args.extend(["-c", &id]); for config_override in config_overrides { args.extend(["-c", config_override]); @@ -134,19 +140,24 @@ impl PageServerNode { ]); } - if let Some(tenantid) = create_tenant { - args.extend(["--create-tenant", tenantid]) + let create_tenant = create_tenant.map(|id| id.to_string()); + if let Some(tenant_id) = create_tenant.as_deref() { + args.extend(["--create-tenant", tenant_id]) } - let status = fill_rust_env_vars(cmd.args(args)) - .status() - .expect("pageserver init failed"); + let initial_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); + let initial_timeline_id_string = initial_timeline_id.to_string(); + args.extend(["--initial-timeline-id", &initial_timeline_id_string]); - if !status.success() { + let init_output = fill_rust_env_vars(cmd.args(args)) + .output() + .context("pageserver init failed")?; + + if !init_output.status.success() { bail!("pageserver init failed"); } - Ok(()) + Ok(initial_timeline_id) } pub fn repo_path(&self) -> PathBuf { @@ -307,7 +318,7 @@ impl PageServerNode { } pub fn check_status(&self) -> Result<()> { - self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status")) + self.http_request(Method::GET, format!("{}/status", self.http_base_url)) .send()? 
.error_from_body()?; Ok(()) @@ -315,64 +326,76 @@ impl PageServerNode { pub fn tenant_list(&self) -> Result> { Ok(self - .http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant")) + .http_request(Method::GET, format!("{}/tenant", self.http_base_url)) .send()? .error_from_body()? .json()?) } - pub fn tenant_create(&self, tenantid: ZTenantId) -> Result<()> { - Ok(self - .http_request(Method::POST, format!("{}/{}", self.http_base_url, "tenant")) + pub fn tenant_create( + &self, + new_tenant_id: Option, + ) -> anyhow::Result> { + let tenant_id_string = self + .http_request(Method::POST, format!("{}/tenant", self.http_base_url)) .json(&TenantCreateRequest { - tenant_id: tenantid, + new_tenant_id: new_tenant_id.map(HexZTenantId::from), }) .send()? .error_from_body()? - .json()?) + .json::>()?; + + tenant_id_string + .map(|id| { + id.parse().with_context(|| { + format!( + "Failed to parse tennat creation response as tenant id: {}", + id + ) + }) + }) + .transpose() } - pub fn branch_list(&self, tenantid: &ZTenantId) -> Result> { - Ok(self + pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result> { + let timeline_infos: Vec = self .http_request( Method::GET, - format!("{}/branch/{}", self.http_base_url, tenantid), + format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), ) .send()? .error_from_body()? - .json()?) + .json()?; + + timeline_infos + .into_iter() + .map(TimelineInfo::try_from) + .collect() } - pub fn branch_create( + pub fn timeline_create( &self, - branch_name: &str, - startpoint: &str, - tenantid: &ZTenantId, - ) -> Result { - Ok(self - .http_request(Method::POST, format!("{}/branch", self.http_base_url)) - .json(&BranchCreateRequest { - tenant_id: tenantid.to_owned(), - name: branch_name.to_owned(), - start_point: startpoint.to_owned(), + tenant_id: ZTenantId, + new_timeline_id: Option, + ancestor_start_lsn: Option, + ancestor_timeline_id: Option, + ) -> anyhow::Result> { + let timeline_info_response = self + .http_request( + Method::POST, + format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), + ) + .json(&TimelineCreateRequest { + new_timeline_id: new_timeline_id.map(HexZTimelineId::from), + ancestor_start_lsn, + ancestor_timeline_id: ancestor_timeline_id.map(HexZTimelineId::from), }) .send()? .error_from_body()? - .json()?) - } + .json::>()?; - pub fn branch_get_by_name( - &self, - tenantid: &ZTenantId, - branch_name: &str, - ) -> Result { - Ok(self - .http_request( - Method::GET, - format!("{}/branch/{}/{}", self.http_base_url, tenantid, branch_name), - ) - .send()? - .error_for_status()? - .json()?) + timeline_info_response + .map(TimelineInfo::try_from) + .transpose() } } diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 45c41b4c19..93bb5f9cd7 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -4,7 +4,7 @@ set -eux if [ "$1" = 'pageserver' ]; then if [ ! 
-d "/data/tenants" ]; then echo "Initializing pageserver data directory" - pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" + pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" fi echo "Staring pageserver at 0.0.0.0:6400" pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data diff --git a/docs/docker.md b/docs/docker.md index 14ba2146cb..cc54d012dd 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -7,32 +7,14 @@ Currently we build two main images: - [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). - [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres). -And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos: +And additional intermediate images: -- [zenithdb/build](https://hub.docker.com/repository/docker/zenithdb/build) — image with all the dependencies required to build Zenith and compute node images. This image is based on `rust:slim-buster`, so it also has a proper `rust` environment. Built from [/Dockerfile.build](/Dockerfile.build). - [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools. ## Building pipeline 1. Image `zenithdb/compute-tools` is re-built automatically. -2. Image `zenithdb/build` is built manually. If you want to introduce any new compile time dependencies to Zenith or compute node you have to update this image as well, build it and push to Docker Hub. +2. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo. -Build: -```sh -docker build -t zenithdb/build:buster -f Dockerfile.build . -``` - -Login: -```sh -docker login -``` - -Push to Docker Hub: -```sh -docker push zenithdb/build:buster -``` - -3. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo. - -4. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically. +3. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically. diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md new file mode 100644 index 0000000000..5cac377272 --- /dev/null +++ b/docs/rfcs/002-storage.md @@ -0,0 +1,186 @@ +# Zenith storage node — alternative + +## **Design considerations** + +Simplify storage operations for people => Gain adoption/installs on laptops and small private installation => Attract customers to DBaaS by seamless integration between our tooling and cloud. + +Proposed architecture addresses: + +- High availability -- tolerates n/2 - 1 failures +- Multi-tenancy -- one storage for all databases +- Elasticity -- increase storage size on the go by adding nodes +- Snapshots / backups / PITR with S3 offload +- Compression + +Minuses are: + +- Quite a lot of work +- Single page access may touch few disk pages +- Some bloat in data — may slowdown sequential scans + +## **Summary** + +Storage cluster is sharded key-value store with ordered keys. 
Key (**page_key**) is a tuple of `(pg_id, db_id, timeline_id, rel_id, forkno, segno, pageno, lsn)`. The value is either a page or a page diff/wal. Each chunk (chunk == shard) stores approx 50-100GB ~~and automatically splits in half when it grows bigger than a soft 100GB limit~~ by having a fixed range of pageno's it is responsible for. Chunk placement on storage nodes is stored in a separate metadata service, so a chunk can be freely moved around the cluster if needed. The chunk itself is a filesystem directory with the following subdirectories:
+
+```
+
+|-chunk_42/
+  |-store/ -- contains lsm with pages/pagediffs ranging from
+  |          page_key_lo to page_key_hi
+  |-wal/
+  | |- db_1234/ db-specific wal files with pages from page_key_lo
+  |             to page_key_hi
+  |
+  |-chunk.meta -- small file with snapshot references
+                  (page_key_prefix+lsn+name)
+                  and PITR regions (page_key_start, page_key_end)
+```
+
+## **Chunk**
+
+A chunk is responsible for storing pages potentially from different databases and relations. Each page is addressed by a lexicographically ordered tuple (**page_key**) with the following fields:
+
+- `pg_id` -- unique id of a given postgres instance (or postgres cluster as it is called in the postgres docs)
+- `db_id` -- database that was created by 'CREATE DATABASE' in a given postgres instance
+- `db_timeline` -- used to create Copy-on-Write instances from snapshots, described later
+- `rel_id` -- tuple of (relation_id, 0) for tables and (indexed_relation_id, rel_id) for indices. Done this way so that table indices are close to the table itself in our global key space.
+- `(forkno, segno, pageno)` -- page coordinates in postgres data files
+- `lsn_timeline` -- postgres feature, increments when PITR was done.
+- `lsn` -- lsn of the current page version.
+
+A chunk stores pages and page diffs ranging from page_key_lo to page_key_hi. The processing node looks at the page in a wal record and sends the record to the chunk responsible for this page range. When a wal record arrives at a chunk it is initially stored in `chunk_id/wal/db_id/wal_segno.wal`. Then a background process moves records from those wal files to the lsm tree in `chunk_id/store`. Or, more precisely, wal records would be materialized into the lsm memtable, and when that memtable is flushed to an SSTable on disk we may trim the wal. That way some pages that are not durably committed (in the distributed sense) may enter the tree -- here we rely on processing node behavior: a page request from the processing node should contain proper lsm horizons so that the storage node may respond with the proper page version.
+
+LSM here is a usual LSM for variable-length values: at first data is stored in memory (we hold the incoming wal records to be able to regenerate it after restart) in some balanced tree. When this tree grows big enough we dump it into a disk file (SSTable), sorting records by key. Then SSTables are merge-sorted in the background into different files. All file operations are sequential and do not require WAL for durability.
+
+The content of an SSTable can look like the following:
+
+```jsx
+(pg_id, db_id, ... , pageno=42, lsn=100) (full 8k page data)
+(pg_id, db_id, ... , pageno=42, lsn=150) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=180) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=200) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=220) (full 8k page data)
+(pg_id, db_id, ... , pageno=42, lsn=250) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=270) (per-page diff)
+(pg_id, db_id, ... , pageno=5000, lsn=100) (full 8k page data)
+```
+
+So a query for `pageno=42 up to lsn=260` would need to find the closest entry less than this key, iterate back to the latest full page and iterate forward applying the diffs. How often a page is materialized in the lsn-version sequence is up to us -- let's say every 5th version should be a full page.
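A minimal Rust sketch of that read path, assuming an in-memory `BTreeMap` stands in for the SSTable levels; `PageKey`, `PageVersion`, `get_page` and the stubbed `apply_diff` are illustrative names invented here, not part of any existing crate:

```rust
use std::collections::BTreeMap;

/// Lexicographically ordered page key: the derived `Ord` compares fields left to
/// right, so all versions of one page sort next to each other, ordered by `lsn`.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct PageKey {
    pg_id: u64,
    db_id: u32,
    db_timeline: u32,
    rel_id: (u32, u32),
    fork_seg_page: (u8, u32, u32),
    lsn_timeline: u32,
    lsn: u64,
}

/// An SSTable entry is either a materialized page or a per-page diff.
enum PageVersion {
    Full(Vec<u8>),
    Diff(Vec<u8>),
}

fn same_page(a: &PageKey, b: &PageKey) -> bool {
    (a.pg_id, a.db_id, a.db_timeline, a.rel_id, a.fork_seg_page)
        == (b.pg_id, b.db_id, b.db_timeline, b.rel_id, b.fork_seg_page)
}

/// Reconstruct a page as of `at_lsn`: scan versions newest-first, remember the
/// diffs, stop at the latest full image, then re-apply the diffs in LSN order.
fn get_page(store: &BTreeMap<PageKey, PageVersion>, mut key: PageKey, at_lsn: u64) -> Option<Vec<u8>> {
    key.lsn = at_lsn;
    let mut diffs = Vec::new();
    let mut page: Option<Vec<u8>> = None;
    for (k, v) in store.range(..=key).rev() {
        if !same_page(k, &key) {
            break; // ran past this page's versions
        }
        match v {
            PageVersion::Diff(d) => diffs.push(d),
            PageVersion::Full(img) => {
                page = Some(img.clone());
                break; // latest full image at or below `at_lsn`
            }
        }
    }
    let mut page = page?;
    for d in diffs.into_iter().rev() {
        apply_diff(&mut page, d); // WAL redo / diff application is out of scope here
    }
    Some(page)
}

fn apply_diff(_page: &mut Vec<u8>, _diff: &[u8]) {}
```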
+
+### **Page deletion**
+
+To delete old pages we insert a blind deletion marker `(pg_id, db_id, #trim_lsn < 150)` into the lsm tree. During merges such a marker indicates that all pages with a smaller lsn should be discarded. The delete marker will travel down the tree level hierarchy until it reaches the last level. In a non-PITR scenario where old page versions are not needed at all, such a deletion marker would (on average) prevent old page versions from propagating down the tree -- so all bloat would concentrate at the higher tree layers without affecting the bigger bottom layers.
+
+### **Recovery**
+
+Upon storage node restart, recent WAL files are applied to the appropriate pages and the resulting pages are stored in the lsm memtable. So this should be fast since we are not writing anything to disk.
+
+### **Checkpointing**
+
+No such mechanism is needed. Or we may look at the storage node as a kind of continuous checkpointer.
+
+### **Full page writes (torn page protection)**
+
+The storage node never updates individual pages, only merges SSTables, so torn pages are not an issue.
+
+### **Snapshot**
+
+That is the part that I like about this design -- snapshot creation is an instant and cheap operation that can have a flexible granularity level: whole instance, database, table. Snapshot creation inserts a record in the `chunk.meta` file with the lsn of this snapshot and a key prefix `(pg_id, db_id, db_timeline, rel_id, *)` that prohibits page deletion within this range. The storage node may not know anything about page internals, but by changing the number of fields in our prefix we may change the snapshot granularity.
+
+It is again useful to remap `rel_id` to `(indexed_relation_id, rel_id)` so that a snapshot of a relation would include its indices. Also a table snapshot would interact trickily with the catalog. Probably all table snapshots should also hold a catalog snapshot. And when a node is started with such a snapshot it should check that only tables from the snapshot are queried. I assume here that for snapshot reading one needs to start a new postgres instance.
+
+Storage consumed by a snapshot is proportional to the amount of data changed. We may have some heuristic (calculated based on the cost of different storages) about when to offload an old snapshot to s3. For example, if the current database has more than 40% of changed pages with respect to the previous snapshot then we may offload that snapshot to s3, and release this space.
+
+**Starting db from snapshot**
+
+When we are starting a database from a snapshot it can be done in two ways. First, we may create a new db_id, move all the data from the snapshot to a new db and start a database. The second option is to create a Copy-on-Write (CoW) instance out of the snapshot, read old pages from the old snapshot and store new pages separately. That is why there is a `db_timeline` key field near `db_id` -- a CoW (🐮) database should create a new `db_timeline` and remember the old `db_timeline`. Such a database can keep a hashmap of the pages it has changed, to query pages from the proper snapshot on the first try. `db_timeline` is located near `db_id` so that new page versions generated by the new instance would not bloat the data of the initial snapshot. It is not clear whether it is possible to effectively support "stacked" CoW snapshots, so we may disallow them. (Well, one way to support them is to move `db_timeline` close to `lsn` -- so we may scan neighboring pages and find the right one. But again, that way we bloat the snapshot with unrelated data and may slow down full scans that are happening in a different database).
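A rough sketch of that copy-on-write read path, with hypothetical types (`CowTimeline`, `PageStore`) invented purely for illustration: the branched timeline keeps the set of pages it has rewritten and falls back to the parent snapshot for everything else.

```rust
use std::collections::HashSet;

/// Hypothetical lookup interface into the versioned page store sketched above.
trait PageStore {
    fn get(&self, db_timeline: u32, page: (u32, u32, u32), at_lsn: u64) -> Option<Vec<u8>>;
}

/// A CoW database branched off a snapshot: pages rewritten on the new
/// `db_timeline` are read from it, everything else comes from the parent.
struct CowTimeline {
    db_timeline: u32,
    parent_timeline: u32,
    parent_lsn: u64,                     // LSN the snapshot was taken at
    rewritten: HashSet<(u32, u32, u32)>, // (forkno, segno, pageno) changed since branching
}

impl CowTimeline {
    fn read_page(&self, store: &dyn PageStore, page: (u32, u32, u32), at_lsn: u64) -> Option<Vec<u8>> {
        if self.rewritten.contains(&page) {
            // A newer version exists under the new timeline.
            store.get(self.db_timeline, page, at_lsn)
        } else {
            // Unchanged page: serve it from the parent snapshot, frozen at parent_lsn.
            store.get(self.parent_timeline, page, self.parent_lsn)
        }
    }
}
```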
+
+**Snapshot export/import**
+
+Once we can start CoW instances it is easy to run an auxiliary postgres instance on a snapshot and run `COPY FROM (...) TO stdout` or `pg_dump` to export data from the snapshot to some portable format. Also we may start postgres on a new empty database and run `COPY FROM stdin`. This way we can initialize new non-CoW databases and transfer snapshots via the network.
+
+### **PITR area**
+
+In the described scheme PITR is just a prohibition to delete any versions within some key prefix, whether it is a database or a table key prefix. So PITR may have different settings for different tables, databases, etc.
+
+PITR is quite bloaty, so we may aggressively offload it to s3 -- we may push the same (or bigger) SSTables to s3 and maintain the lsm structure there.
+
+### **Compression**
+
+Since we are storing page diffs of variable sizes there is no structural dependency on the page size and we may compress it. Again, that could be enabled only on pages with certain key prefixes, so we may have this with db/table granularity.
+
+### **Chunk metadata**
+
+Chunk metadata is a file that lies in the chunk directory and stores info about current snapshots and PITR regions. The chunk should always consult this data when merging SSTables and applying delete markers.
+
+### **Chunk splitting**
+
+*(NB: the following paragraph is about how to avoid page splitting)*
+
+When a chunk hits some soft storage limit (let's say 100GB) it should be split in half and the global metadata about chunk boundaries should be updated. Here I assume that a chunk split is a local operation happening on a single node. The process of chunk splitting should look like the following:
+
+1. Find the separation key and spawn two new chunks with [lo, mid) [mid, hi) boundaries.
+
+2. Prohibit WAL deletion and old SSTable deletion on the original chunk.
+
+3. On each lsm layer we would need to split only one SSTable; all others would fit within the left or right range. Symlink/split those files to the new chunks.
+
+4. Start WAL replay on the new chunks.
+
+5. Update the global metadata about the new chunk boundaries.
+
+6. Eventually (the metadata update should be pushed to the processing node by the metadata service) the storage node will start sending WAL and page requests to the new nodes.
+
+7. A new chunk may start serving read queries when the following conditions are met:
+
+a) it has received at least one WAL record from the processing node
+
+b) it has replayed all WAL up to the newly received one
+
+c) it has checked via downlinks that there were no WAL gaps.
+
+A chunk split as described here is quite a fast operation when it happens on a local disk -- the vast majority of files will just be moved without copying anything. I suggest keeping splits always local and not mixing them with moving chunks around the cluster. So if we want to split some chunk but there is only a small amount of free space left on the device, we should first move some chunks away from the node and then proceed with splitting.
+
+### Fixed chunks
+
+An alternative strategy is not to split at all and have pageno-fixed chunk boundaries. When a table is created we first materialize this chunk by storing only the first new pages, and the chunk is small.
Then the chunk grows while the table is filled, but it can't grow substantially bigger than the allowed pageno range, so at max it would be 1GB or whatever limit we want, plus some bloat due to snapshots and old page versions.
+
+### **Chunk lsm internals**
+
+So how to implement the chunk's lsm?
+
+- Write it from scratch and use RocksDB to prototype/benchmark, then switch to our own lsm implementation. RocksDB can provide a sanity check for the performance of the home-brewed implementation and it would be easier to prototype.
+- Use postgres as a lego constructor. We may model the memtable with a postgres B-tree referencing some in-memory log of incoming records. SSTable merging may reuse the postgres external merging algorithm, etc. One thing that would definitely not fit (or I didn't come up with an idea how to fit it) is multi-tenancy. If we are storing pages from different databases we can't use the postgres buffer pool, since there is no db_id in the page header. We could add a new field there, but IMO it would be a no-go for committing that to vanilla.
+
+Another possibility is not to try to fit several databases into one storage node. But that way is a no-go for a multi-tenant cloud installation: we would need to run a lot of storage node instances on one physical storage node, each with its own local page cache. So that would be much closer to an ordinary managed RDS.
+
+Multi-tenant storage makes sense even on a laptop, when you work with different databases, run tests with a temp database, etc. And when an installation grows bigger it starts to make more and more sense, so it seems important.
+
+# **Storage fleet**
+
+- When a database is smaller than the chunk size we naturally can store it in one chunk (since its page_keys would fit in some chunk's [lo, hi) range).
+
+A few databases are stored in one chunk, replicated three times.
+
+- When a database can't fit into one storage node it can occupy lots of chunks that were split while the database was growing. Chunk placement on nodes is controlled by us with some automation, but we may always move chunks around the cluster manually.
+
+Here one big database occupies two sets of nodes. Also some chunks were moved around to restore the replication factor after a disk failure. In this case we also have "sharded" storage for a big database and issue wal writes to different chunks in parallel.
+
+## **Chunk placement strategies**
+
+There are a few scenarios where we may want to move chunks around the cluster:
+
+- disk usage on some node is high
+- some disk experienced a failure
+- some node experienced a failure or needs maintenance
+
+## **Chunk replication**
+
+Chunk replication may be done by cloning page ranges with respect to some lsn from peer nodes, updating the global metadata, waiting for WAL to come, replaying the previous WAL and becoming online -- more or less like during a chunk split.
+
diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md
new file mode 100644
index 0000000000..4d1f0a68f0
--- /dev/null
+++ b/docs/rfcs/003-laptop-cli.md
@@ -0,0 +1,267 @@
+# Command line interface (end-user)
+
+Zenith CLI as described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside the zenith distribution, at least at the start.
+
+This proposal is focused on managing local installations. For cluster operations, different tooling would be needed.
The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots. + +The most important concept here is a snapshot, which can be created/pushed/pulled/exported. Also, we may start temporary read-only postgres instance over any local snapshot. A more complex scenario would consist of several basic operations over snapshots. + +# Possible usage scenarios + +## Install zenith, run a postgres + +``` +> brew install pg-zenith +> zenith pg create # creates pgdata with default pattern pgdata$i +> zenith pg list +ID PGDATA USED STORAGE ENDPOINT +primary1 pgdata1 0G zenith-local localhost:5432 +``` + +## Import standalone postgres to zenith + +``` +> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg +[====================------------] 60% | 20MB/s +> zenith snapshot list +ID SIZE PARENT +oldpg 5G - + +> zenith pg create --snapshot oldpg +Started postgres on localhost:5432 + +> zenith pg list +ID PGDATA USED STORAGE ENDPOINT +primary1 pgdata1 5G zenith-local localhost:5432 + +> zenith snapshot destroy oldpg +Ok +``` + +Also, we may start snapshot import implicitly by looking at snapshot schema + +``` +> zenith pg create --snapshot basebackup://replication@localhost:5432/ +Downloading snapshot... Done. +Started postgres on localhost:5432 +Destroying snapshot... Done. +``` + +## Pull snapshot with some publicly shared database + +Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage). + +``` +> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies +``` + +## Create snapshot and push it to the cloud + +``` +> zenith snapshot create pgdata1@snap1 +> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1 +``` + +## Rollback database to the snapshot + +One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `zenith pg checkout`. + +``` +> zenith pg list +ID PGDATA USED STORAGE ENDPOINT +primary1 pgdata1 5G zenith-local localhost:5432 + +> zenith snapshot create pgdata1@snap1 + +> zenith snapshot list +ID SIZE PARENT +oldpg 5G - +pgdata1@snap1 6G - +pgdata1@CURRENT 6G - + +> zenith pg checkout pgdata1@snap1 +Stopping postgres on pgdata1. +Rolling back pgdata1@CURRENT to pgdata1@snap1. +Starting postgres on pgdata1. + +> zenith snapshot list +ID SIZE PARENT +oldpg 5G - +pgdata1@snap1 6G - +pgdata1@HEAD{0} 6G - +pgdata1@CURRENT 6G - +``` + +Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state of the database in the data directory. When we are checking out some snapshot CURRENT will be set to this snapshot and the old CURRENT state will be named HEAD{0} (0 is the number of postgres timeline, it would be incremented after each such checkout). + +## Configure PITR area (Point In Time Recovery). 
+ +PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite). + +``` +> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month +``` + +Resetting the database to some state in past would require creating a snapshot on some lsn / time in this pirt area. + +# Manual + +## storage + +Storage is either zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. + +**zenith storage attach** -t [native|s3] -c key=value -n name + +Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'. + + +**zenith storage list** + +Show currently attached storages. For example: + +``` +> zenith storage list +NAME USED TYPE OPTIONS PATH +local 5.1G zenith-local /opt/zenith/store/local +local.compr 20.4G zenith-local comression=on /opt/zenith/store/local.compr +zcloud 60G zenith-remote zenith.tech/stas/mystore +s3tank 80G S3 +``` + +**zenith storage detach** + +**zenith storage show** + + + +## pg + +Manages postgres data directories and can start postgreses with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themself. + +Pg is a term for a single postgres running on some data. I'm trying to avoid here separation of datadir management and postgres instance management -- both that concepts bundled here together. + +**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata + +Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr. + +--no-start: just init datadir without creating + +--snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1) + +--cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of currently running a database) + +**zenith pg destroy** + +**zenith pg start** [--replica] pgdata + +Start postgres with proper extensions preloaded/installed. + +**zenith pg checkout** + +Rollback data directory to some previous snapshot. + +**zenith pg stop** pg_id + +**zenith pg list** + +``` +ROLE PGDATA USED STORAGE ENDPOINT +primary my_pg 5.1G local localhost:5432 +replica-1 localhost:5433 +replica-2 localhost:5434 +primary my_pg2 3.2G local.compr localhost:5435 +- my_pg3 9.2G local.compr - +``` + +**zenith pg show** + +``` +my_pg: + storage: local + space used on local: 5.1G + space used on all storages: 15.1G + snapshots: + on local: + snap1: 1G + snap2: 1G + on zcloud: + snap2: 1G + on s3tank: + snap5: 2G + pitr: + on s3tank: + pitr_one_month: 45G + +``` + +**zenith pg start-rest/graphql** pgdata + +Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea. + + +## snapshot + +Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. 
There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout. + +**zenith snapshot create** pgdata_name@snap_name + +Creates a new snapshot in the same storage where pgdata_name exists. + +**zenith snapshot push** --to url pgdata_name@snap_name + +Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `zenith snapshot recv` before push happens. If url has some special schema like zenith:// receiving side may require auth start `zenith snapshot recv` on the go. + +**zenith snapshot recv** + +Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket. + +**zenith snapshot pull** --from url or path + +Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format. + +**zenith snapshot import** --from basebackup://<...> or path + +Creates a new snapshot out of running postgres via basebackup protocol or basebackup files. + +**zenith snapshot export** + +Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith own format which is handy for us (but I think just tar of basebackup would be okay). + +**zenith snapshot diff** snap1 snap2 + +Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses. + +**zenith snapshot destroy** + +## pitr + +Pitr represents wal stream and ttl policy for that stream + +XXX: any suggestions on a better name? + +**zenith pitr create** name + +--ttl = inf | period + +--size-limit = inf | limit + +--storage = storage_name + +**zenith pitr extract-snapshot** pitr_name --lsn xxx + +Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export) + +**zenith pitr gc** pitr_name + +Force garbage collection on some PITR area. + +**zenith pitr list** + +**zenith pitr destroy** + + +## console + +**zenith console** + +Opens browser targeted at web console with the more or less same functionality as described here. diff --git a/docs/rfcs/004-durability.md b/docs/rfcs/004-durability.md new file mode 100644 index 0000000000..4543be3dae --- /dev/null +++ b/docs/rfcs/004-durability.md @@ -0,0 +1,218 @@ +Durability & Consensus +====================== + +When a transaction commits, a commit record is generated in the WAL. +When do we consider the WAL record as durable, so that we can +acknowledge the commit to the client and be reasonably certain that we +will not lose the transaction? + +Zenith uses a group of WAL safekeeper nodes to hold the generated WAL. +A WAL record is considered durable, when it has been written to a +majority of WAL safekeeper nodes. In this document, I use 5 +safekeepers, because I have five fingers. A WAL record is durable, +when at least 3 safekeepers have written it to disk. + +First, assume that only one primary node can be running at a +time. This can be achieved by Kubernetes or etcd or some +cloud-provider specific facility, or we can implement it +ourselves. These options are discussed in later chapters. For now, +assume that there is a Magic STONITH Fairy that ensures that. 
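As a small illustration of the majority rule above (a sketch, not code from this repository): given the flush position reported by each safekeeper, the durable LSN is the highest LSN that a majority of the five has already written to disk.

```rust
/// Largest LSN acknowledged by a majority of safekeepers; with five acks this
/// is simply the third-highest reported flush position.
fn durable_lsn(mut flushed: Vec<u64>) -> Option<u64> {
    let quorum = flushed.len() / 2 + 1;
    flushed.sort_unstable_by(|a, b| b.cmp(a)); // descending
    flushed.get(quorum - 1).copied()
}

fn main() {
    // Flush positions reported by the five safekeepers.
    let acks = vec![0x2A00, 0x2800, 0x1F00, 0x2A00, 0x1000];
    // Three of the five have durably written everything up to 0x2800.
    assert_eq!(durable_lsn(acks), Some(0x2800));
}
```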
+ +In addition to the WAL safekeeper nodes, the WAL is archived in +S3. WAL that has been archived to S3 can be removed from the +safekeepers, so the safekeepers don't need a lot of disk space. + + + +----------------+ + +-----> | WAL safekeeper | + | +----------------+ + | +----------------+ + +-----> | WAL safekeeper | ++------------+ | +----------------+ +| Primary | | +----------------+ +| Processing | ---------+-----> | WAL safekeeper | +| Node | | +----------------+ ++------------+ | +----------------+ + \ +-----> | WAL safekeeper | + \ | +----------------+ + \ | +----------------+ + \ +-----> | WAL safekeeper | + \ +----------------+ + \ + \ + \ + \ + \ +--------+ + \ | | + +--> | S3 | + | | + +--------+ + + +Every WAL safekeeper holds a section of WAL, and a VCL value. +The WAL can be divided into three portions: + + + VCL LSN + | | + V V +.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX +Archived WAL Completed WAL In-flight WAL + + +Note that all this WAL kept in a safekeeper is a contiguous section. +This is different from Aurora: In Aurora, there can be holes in the +WAL, and there is a Gossip protocol to fill the holes. That could be +implemented in the future, but let's keep it simple for now. WAL needs +to be written to a safekeeper in order. However, during crash +recovery, In-flight WAL that has already been stored in a safekeeper +can be truncated or overwritten. + +The Archived WAL has already been stored in S3, and can be removed from +the safekeeper. + +The Completed WAL has been written to at least three safekeepers. The +algorithm ensures that it is not lost, when at most two nodes fail at +the same time. + +The In-flight WAL has been persisted in the safekeeper, but if a crash +happens, it may still be overwritten or truncated. + + +The VCL point is determined in the Primary. It is not strictly +necessary to store it in the safekeepers, but it allows some +optimizations and sanity checks and is probably generally useful for +the system as whole. The VCL values stored in the safekeepers can lag +behind the VCL computed by the primary. + + +Primary node Normal operation +----------------------------- + +1. Generate some WAL. + +2. Send the WAL to all the safekeepers that you can reach. + +3. As soon as a quorum of safekeepers have acknowledged that they have + received and durably stored the WAL up to that LSN, update local VCL + value in memory, and acknowledge commits to the clients. + +4. Send the new VCL to all the safekeepers that were part of the quorum. + (Optional) + + +Primary Crash recovery +---------------------- + +When a new Primary node starts up, before it can generate any new WAL +it needs to contact a majority of the WAL safekeepers to compute the +VCL. Remember that there is a Magic STONITH fairy that ensures that +only node process can be doing this at a time. + +1. Contact all WAL safekeepers. Find the Max((Epoch, LSN)) tuple among the ones you + can reach. This is the Winner safekeeper, and its LSN becomes the new VCL. + +2. Update the other safekeepers you can reach, by copying all the WAL + from the Winner, starting from each safekeeper's old VCL point. Any old + In-Flight WAL from previous Epoch is truncated away. + +3. Increment Epoch, and send the new Epoch to the quorum of + safekeepers. (This ensures that if any of the safekeepers that we + could not reach later come back online, they will be considered as + older than this in any future recovery) + +You can now start generating new WAL, starting from the newly-computed +VCL. 
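A sketch of step 1 of this recovery procedure, with illustrative types only: comparing `(epoch, lsn)` tuples makes the epoch the primary criterion and the LSN the tie-breaker, and the winner's LSN becomes the new VCL.

```rust
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct SafekeeperState {
    id: u64,
    epoch: u64, // last epoch this safekeeper accepted
    lsn: u64,   // end of WAL durably stored on this safekeeper
}

/// Step 1: among the reachable safekeepers, the winner is the one with the
/// highest (epoch, lsn) tuple; its LSN becomes the new VCL.
fn pick_winner(reachable: &[SafekeeperState]) -> Option<&SafekeeperState> {
    reachable.iter().max_by_key(|sk| (sk.epoch, sk.lsn))
}

fn main() {
    let reachable = [
        SafekeeperState { id: 1, epoch: 4, lsn: 0x3000 },
        SafekeeperState { id: 2, epoch: 5, lsn: 0x2800 }, // newer epoch wins despite smaller LSN
        SafekeeperState { id: 3, epoch: 5, lsn: 0x2600 },
    ];
    let winner = pick_winner(&reachable).unwrap();
    assert_eq!((winner.id, winner.lsn), (2, 0x2800)); // new VCL = 0x2800
}
```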
+ +Optimizations +------------- + +As described, the Primary node sends all the WAL to all the WAL safekeepers. That +can be a lot of network traffic. Instead of sending the WAL directly from Primary, +some safekeepers can be daisy-chained off other safekeepers, or there can be a +broadcast mechanism among them. There should still be a direct connection from the +each safekeeper to the Primary for the acknowledgments though. + +Similarly, the responsibility for archiving WAL to S3 can be delegated to one of +the safekeepers, to reduce the load on the primary. + + +Magic STONITH fairy +------------------- + +Now that we have a system that works as long as only one primary node is running at a time, how +do we ensure that? + +1. Use etcd to grant a lease on a key. The primary node is only allowed to operate as primary + when it's holding a valid lease. If the primary node dies, the lease expires after a timeout + period, and a new node is allowed to become the primary. + +2. Use S3 to store the lease. S3's consistency guarantees are more lenient, so in theory you + cannot do this safely. In practice, it would probably be OK if you make the lease times and + timeouts long enough. This has the advantage that we don't need to introduce a new + component to the architecture. + +3. Use Raft or Paxos, with the WAL safekeepers acting as the Acceptors to form the quorum. The + next chapter describes this option. + + +Built-in Paxos +-------------- + +The WAL safekeepers act as PAXOS Acceptors, and the Processing nodes +as both Proposers and Learners. + +Each WAL safekeeper holds an Epoch value in addition to the VCL and +the WAL. Each request by the primary to safekeep WAL is accompanied by +an Epoch value. If a safekeeper receives a request with Epoch that +doesn't match its current Accepted Epoch, it must ignore (NACK) it. +(In different Paxos papers, Epochs are called "terms" or "round +numbers") + +When a node wants to become the primary, it generates a new Epoch +value that is higher than any previously observed Epoch value, and +globally unique. + + +Accepted Epoch: 555 VCL LSN + | | + V V +.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX +Archived WAL Completed WAL In-flight WAL + + +Primary node startup: + +1. Contact all WAL safekeepers that you can reach (if you cannot + connect to a quorum of them, you can give up immediately). Find the + latest Epoch among them. + +2. Generate a new globally unique Epoch, greater than the latest Epoch + found in previous step. + +2. Send the new Epoch in a Prepare message to a quorum of + safekeepers. (PAXOS Prepare message) + +3. Each safekeeper responds with a Promise. If a safekeeper has + already made a promise with a higher Epoch, it doesn't respond (or + responds with a NACK). After making a promise, the safekeeper stops + responding to any write requests with earlier Epoch. + +4. Once you have received a majority of promises, you know that the + VCL cannot advance on the old Epoch anymore. This effectively kills + any old primary server. + +5. Find the highest written LSN among the quorum of safekeepers (these + can be included in the Promise messages already). This is the new + VCL. If a new node starts the election process after this point, + it will compute the same or higher VCL. + +6. Copy the WAL from the safekeeper with the highest LSN to the other + safekeepers in the quorum, using the new Epoch. (PAXOS Accept + phase) + +7. You can now start generating new WAL starting from the VCL. 
If + another process starts the election process after this point and + gains control of a majority of the safekeepers, we will no longer + be able to advance the VCL. + diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md new file mode 100644 index 0000000000..7b078e9ec0 --- /dev/null +++ b/docs/rfcs/005-zenith_local.md @@ -0,0 +1,103 @@ +# Zenith local + +Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together. Your comments on both parts are very welcome. + +#### Why do we need it? +- For distribution - this easy to use binary will help us to build adoption among developers. +- For internal use - to test all components together. + +In my understanding, we consider it to be just a mock-up version of zenith-cloud. +> Question: How much should we care about durability and security issues for a local setup? + + +#### Why is it better than a simple local postgres? + +- Easy one-line setup. As simple as `cargo install zenith && zenith start` + +- Quick and cheap creation of compute nodes over the same storage. +> Question: How can we describe a use-case for this feature? + +- Zenith-local can work with S3 directly. + +- Push and pull images (snapshots) to remote S3 to exchange data with other users. + +- Quick and cheap snapshot checkouts to switch back and forth in the database history. +> Question: Do we want it in the very first release? This feature seems quite complicated. + +#### Distribution: + +Ideally, just one binary that incorporates all elements we need. +> Question: Let's discuss pros and cons of having a separate package with modified PostgreSQL. + +#### Components: + +- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responces to show them in a user-friendly way. +CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md +WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli + +- **zenith-console** - WEB UI with same functionality as CLI. +>Note: not for the first release. + +- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. + > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local. + +- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). +> Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server? + +WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src + +- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith. +> Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)? +> Question: Do we use it together with local page store or they are interchangeable? + +WIP code is ??? + +- **zenith-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed. +> Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system. 
+ +WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper + +- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. + + WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node + +#### REST API: + +Service endpoint: `http://localhost:3000` + +Resources: +- /storages - Where data lives: zenith-pageserver or zenith-s3 +- /pgs - Postgres - zenith-computenode +- /snapshots - snapshots **TODO** + +>Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all? + +Methods and their mapping to CLI: + +- /storages - zenith-pageserver or zenith-s3 + +CLI | REST API +------------- | ------------- +storage attach -n name --type [native\s3] --path=[datadir\URL] | PUT -d { "name": "name", "type": "native", "path": "/tmp" } /storages +storage detach -n name | DELETE /storages/:storage_name +storage list | GET /storages +storage show -n name | GET /storages/:storage_name + + +- /pgs - zenith-computenode + +CLI | REST API +------------- | ------------- +pg create -n name --s storage_name | PUT -d { "name": "name", "storage_name": "storage_name" } /pgs +pg destroy -n name | DELETE /pgs/:pg_name +pg start -n name --replica | POST -d {"action": "start", "is_replica":"replica"} /pgs/:pg_name /actions +pg stop -n name | POST -d {"action": "stop"} /pgs/:pg_name /actions +pg promote -n name | POST -d {"action": "promote"} /pgs/:pg_name /actions +pg list | GET /pgs +pg show -n name | GET /pgs/:pg_name + +- /snapshots **TODO** + +CLI | REST API +------------- | ------------- + diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md new file mode 100644 index 0000000000..a04536922a --- /dev/null +++ b/docs/rfcs/006-laptop-cli-v2-CLI.md @@ -0,0 +1,64 @@ +Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". + +# CLI v2 (after chatting with Carl) + +Zenith introduces the notion of a repository. 
+
+```bash
+zenith init
+zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory
+```
+
+Once you have a cluster catalog you can explore it
+
+```bash
+zenith log -- returns a list of commits
+zenith status -- reports whether there are changes in the catalog that can be committed
+zenith commit -- commits the changes and generates a new commit hash
+zenith branch experimental -- creates a branch called experimental based on a given commit hash
+```
+
+To make changes in the catalog you need to run compute nodes
+
+```bash
+-- here is how you start a compute node
+zenith start /home/pipedpiper/northwind:main -- starts a compute instance
+zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud
+-- you can start a compute node against any hash or branch
+zenith start /home/pipedpiper/northwind:experimental --port 8008 -- starts another compute instance (on a different port)
+-- you can start a compute node against any hash or branch
+zenith start /home/pipedpiper/northwind: --port 8009 -- starts another compute instance (on a different port)
+
+-- After running some DML you can run
+-- zenith status and see how there are two WAL streams one on top of
+-- the main branch
+zenith status
+-- and another on top of the experimental branch
+zenith status -b experimental
+
+-- you can commit each branch separately
+zenith commit main
+-- or
+zenith commit -c /home/pipedpiper/northwind:experimental
+```
+
+Starting compute instances against cloud environments
+
+```bash
+-- you can start a compute instance against the cloud environment
+-- in this case all of the changes will be streamed into the cloud
+zenith start https://zenith.tech/pipedpiper/northwind:main
+zenith start https://zenith.tech/pipedpiper/northwind:main
+zenith status -c https://zenith.tech/pipedpiper/northwind:main
+zenith commit -c https://zenith.tech/pipedpiper/northwind:main
+zenith branch -c https://zenith.tech/pipedpiper/northwind: experimental
+```
+
+Pushing data into the cloud
+
+```bash
+-- pull all the commits from the cloud
+zenith pull
+-- push all the commits to the cloud
+zenith push
+```
diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md b/docs/rfcs/006-laptop-cli-v2-repository-structure.md
new file mode 100644
index 0000000000..ee4e432182
--- /dev/null
+++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md
@@ -0,0 +1,140 @@
+# Repository format
+
+A Zenith repository is similar to a traditional PostgreSQL backup
+archive, like a WAL-G bucket or pgbarman backup catalogue. It holds
+multiple versions of a PostgreSQL database cluster.
+
+The distinguishing feature is that you can launch a Zenith Postgres
+server directly against a branch in the repository, without having to
+"restore" it first. Also, Zenith manages the storage automatically:
+there is no separation between full and incremental backups, nor a
+WAL archive. Zenith relies heavily on the WAL, and uses concepts
+similar to incremental backups and WAL archiving internally, but this
+is hidden from the user.
+
+## Directory structure, version 1
+
+This first version is pretty straightforward but not very
+efficient. Just something to get us started.
+ +The repository directory looks like this: + + .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ + .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// + .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history + + .zenith/refs/branches/mybranch + .zenith/refs/tags/foo + .zenith/refs/tags/bar + + .zenith/datadirs/ + +### Timelines + +A timeline is similar to PostgeSQL's timeline, but is identified by a +UUID instead of a 32-bit timeline Id. For user convenience, it can be +given a name that refers to the UUID (called a branch). + +All WAL is generated on a timeline. You can launch a read-only node +against a tag or arbitrary LSN on a timeline, but in order to write, +you need to create a timeline. + +Each timeline is stored in a directory under .zenith/timelines. It +consists of a WAL archive, containing all the WAL in the standard +PostgreSQL format, under the wal/ subdirectory. + +The 'snapshots/' subdirectory, contains "base backups" of the data +directory at a different LSNs. Each snapshot is simply a copy of the +Postgres data directory. + +When a new timeline is forked from a previous timeline, the ancestor +timeline's UUID is stored in the 'history' file. + +### Refs + +There are two kinds of named objects in the repository: branches and +tags. A branch is a human-friendly name for a timeline UUID, and a +tag is a human-friendly name for a specific LSN on a timeline +(timeline UUID + LSN). Like in git, these are just for user +convenience; you can also use timeline UUIDs and LSNs directly. + +Refs do have one additional purpose though: naming a timeline or LSN +prevents it from being automatically garbage collected. + +The refs directory contains a small text file for each tag/branch. It +contains the UUID of the timeline (and LSN, for tags). + +### Datadirs + +.zenith/datadirs contains PostgreSQL data directories. You can launch +a Postgres instance on one of them with: + +``` + postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c +``` + +All the actual data is kept in the timeline directories, under +.zenith/timelines. The data directories are only needed for active +PostgreQSL instances. After an instance is stopped, the data directory +can be safely removed. "zenith start" will recreate it quickly from +the data in .zenith/timelines, if it's missing. + +## Version 2 + +The format described above isn't very different from a traditional +daily base backup + WAL archive configuration. The main difference is +the nicer naming of branches and tags. + +That's not very efficient. For performance, we need something like +incremental backups that don't require making a full copy of all +data. So only store modified files or pages. And instead of having to +replay all WAL from the last snapshot, "slice" the WAL into +per-relation WAL files and only recover what's needed when a table is +accessed. + +In version 2, the file format in the "snapshots" subdirectory gets +more advanced. The exact format is TODO. But it should support: +- storing WAL records of individual relations/pages +- storing a delta from an older snapshot +- compression + + +## Operations + +### Garbage collection + +When you run "zenith gc", old timelines that are no longer needed are +removed. That involves collecting the list of "unreachable" objects, +starting from the named branches and tags. + +Also, if enough WAL has been generated on a timeline since last +snapshot, a new snapshot or delta is created. 
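+
+To make the refs and garbage collection description above concrete, here is a
+minimal sketch of the reachability walk in Rust. It assumes only what the text
+says -- a ref is a small text file whose first token is a timeline UUID (tags
+also carry an LSN) -- and every path and helper name here is illustrative, not
+actual zenith code:
+
+```rust
+use std::collections::HashSet;
+use std::fs;
+use std::io;
+use std::path::Path;
+
+/// Collect the timeline UUIDs that are still reachable from named refs.
+/// Timelines not in this set (and not ancestors of anything in it, per the
+/// per-timeline 'history' files) are candidates for removal by "zenith gc".
+fn reachable_timelines(repo: &Path) -> io::Result<HashSet<String>> {
+    let mut keep = HashSet::new();
+    for kind in ["branches", "tags"] {
+        let dir = repo.join(".zenith/refs").join(kind);
+        if !dir.is_dir() {
+            continue;
+        }
+        for entry in fs::read_dir(dir)? {
+            let text = fs::read_to_string(entry?.path())?;
+            // The first whitespace-separated token is the timeline UUID;
+            // tags additionally carry an LSN after it.
+            if let Some(uuid) = text.split_whitespace().next() {
+                keep.insert(uuid.to_string());
+            }
+        }
+    }
+    Ok(keep)
+}
+
+fn main() {
+    let keep = reachable_timelines(Path::new(".")).unwrap_or_default();
+    println!("{} reachable timelines", keep.len());
+}
+```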
+ +### zenith push/pull + +Compare the tags and branches on both servers, and copy missing ones. +For each branch, compare the timeline it points to in both servers. If +one is behind the other, copy the missing parts. + +FIXME: how do you prevent confusion if you have to clones of the same +repository, launch an instance on the same branch in both clones, and +later try to push/pull between them? Perhaps create a new timeline +every time you start up an instance? Then you would detect that the +timelines have diverged. That would match with the "epoch" concept +that we have in the WAL safekeepr + +### zenith checkout/commit + +In this format, there is no concept of a "working tree", and hence no +concept of checking out or committing. All modifications are done on +a branch or a timeline. As soon as you launch a server, the changes are +appended to the timeline. + +You can easily fork off a temporary timeline to emulate a "working tree". +You can later remove it and have it garbage collected, or to "commit", +re-point the branch to the new timeline. + +If we want to have a worktree and "zenith checkout/commit" concept, we can +emulate that with a temporary timeline. Create the temporary timeline at +"zenith checkout", and have "zenith commit" modify the branch to point to +the new timeline. diff --git a/docs/rfcs/007-serverless-on-laptop.md b/docs/rfcs/007-serverless-on-laptop.md new file mode 100644 index 0000000000..e6355f4a03 --- /dev/null +++ b/docs/rfcs/007-serverless-on-laptop.md @@ -0,0 +1,93 @@ +How it works now +---------------- + +1. Create repository, start page server on it + +``` +$ zenith init +... +created main branch +new zenith repository was created in .zenith + +$ zenith pageserver start +Starting pageserver at '127.0.0.1:64000' in .zenith +Page server started +``` + +2. Create a branch, and start a Postgres instance on it + +``` +$ zenith branch heikki main +branching at end of WAL: 0/15ECF68 + +$ zenith pg create heikki +Initializing Postgres on timeline 76cf9279915be7797095241638e64644... +Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432 + +$ zenith pg start pg1 +Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki' +waiting for server to start.... done +server started +``` + + +3. Connect to it and run queries + +``` +$ psql "dbname=postgres port=55432" +psql (14devel) +Type "help" for help. + +postgres=# +``` + + +Proposal: Serverless on your Laptop +----------------------------------- + +We've been talking about doing the "pg create" step automatically at +"pg start", to eliminate that step. What if we go further, go +serverless on your laptop, so that the workflow becomes just: + +1. Create repository, start page server on it (same as before) + +``` +$ zenith init +... +created main branch +new zenith repository was created in .zenith + +$ zenith pageserver start +Starting pageserver at '127.0.0.1:64000' in .zenith +Page server started +``` + +2. Create branch + +``` +$ zenith branch heikki main +branching at end of WAL: 0/15ECF68 +``` + +3. Connect to it: + +``` +$ psql "dbname=postgres port=5432 branch=heikki" +psql (14devel) +Type "help" for help. + +postgres=# +``` + + +The trick behind the scenes is that when you launch the page server, +it starts to listen on port 5432. When you connect to it with psql, it +looks at the 'branch' parameter that you passed in the connection +string. 
It automatically performs the "pg create" and "pg start" steps +for that branch, and then forwards the connection to the Postgres +instance that it launched. After you disconnect, if there are no more +active connections to the server running on the branch, it can +automatically shut it down again. + +This is how serverless would work in the cloud. We can do it on your +laptop, too. diff --git a/docs/rfcs/008-push-pull.md b/docs/rfcs/008-push-pull.md new file mode 100644 index 0000000000..272628e1ce --- /dev/null +++ b/docs/rfcs/008-push-pull.md @@ -0,0 +1,66 @@ +# Push and pull between pageservers + +Here is a proposal about implementing push/pull mechanics between pageservers. We also want to be able to push/pull to S3 but that would depend on the exact storage format so we don't touch that in this proposal. + +## Origin management + +The origin represents connection info for some remote pageserver. Let's use here same commands as git uses except using explicit list subcommand (git uses `origin -v` for that). + +``` +zenith origin add +zenith origin list +zenith origin remove +``` + +Connection URI a string of form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs or require ssh as transport or invent some other kind of transport. + +Behind the scenes, this commands may update toml file inside .zenith directory. + +## Push + +### Pushing branch + +``` +zenith push mybranch cloudserver # push to eponymous branch in cloudserver +zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver +``` + +Exact mechanics would be slightly different in the following situations: + +1) Destination branch does not exist. + + That is the simplest scenario. We can just create an empty branch (or timeline in internal terminology) and transfer all the pages/records that we have in our timeline. Right now each timeline is quite independent of other timelines so I suggest skipping any checks that there is a common ancestor and just fill it with data. Later when CoW timelines will land to the pageserver we may add that check and decide whether this timeline belongs to this pageserver repository or not [*]. + + The exact mechanics may be the following: + + * CLI asks local pageserver to perform push and hands over connection uri: `perform_push `. + * local pageserver connects to the remote pageserver and runs `branch_push ` + Handler for branch_create would create destination timeline and switch connection to copyboth mode. + * Sending pageserver may start iterator on that timeline and send all the records as copy messages. + +2) Destination branch exists and latest_valid_lsn is less than ours. + + In this case, we need to send missing records. To do that we need to find all pages that were changed since that remote LSN. Right now we don't have any tracking mechanism for that, so let's just iterate over all records and send ones that are newer than remote LSN. Later we probably should add a sparse bitmap that would track changed pages to avoid full scan. + +3) Destination branch exists and latest_valid_lsn is bigger than ours. + + In this case, we can't push to that branch. We can only pull. + +### Pulling branch + +Here we need to handle the same three cases, but also keep in mind that local pageserver can be behind NAT and we can't trivially re-use pushing by asking remote to 'perform_push' to our address. 
So we would need a new set of commands:
+
+* CLI calls `perform_pull ` on local pageserver.
+* local pageserver calls `branch_pull ` on remote pageserver.
+* remote pageserver sends records in our direction
+
+But despite the different set of commands, the code that iterates over records and the receiving code that inserts those records can be the same for both pull and push.
+
+
+
+[*] It looks to me that there are two different possible approaches to handling unrelated timelines:
+
+1) Allow storing unrelated timelines in one repo. Some timelines may have parents and some may not.
+2) Transparently create and manage several repositories in one pageserver.
+
+But that is the topic for a separate RFC/discussion.
diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md
new file mode 100644
index 0000000000..3f5386c165
--- /dev/null
+++ b/docs/rfcs/009-snapshot-first-storage-cli.md
@@ -0,0 +1,56 @@
+While working on export/import commands, I understood that they fit really well into the "snapshot-first design".
+
+We may think about backups as snapshots in a different format (i.e. plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use the same storage API; the only difference is the code that packs/unpacks files.
+
+Even if zenith aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to zenith.
+
+So here is an attempt to design a consistent CLI for different usage scenarios:
+
+#### 1. Start empty pageserver.
+That is what we have now.
+Init an empty pageserver using `initdb` in a temporary directory.
+
+The `--storage_dest=FILE_PREFIX | S3_PREFIX |...` option defines the object storage type; all other parameters are passed via env variables. Inspired by WAL-G-style naming: https://wal-g.readthedocs.io/STORAGES/.
+
+Save `storage_dest` and other parameters in the config.
+Push snapshots to `storage_dest` in the background.
+
+```
+zenith init --storage_dest=S3_PREFIX
+zenith start
+```
+
+#### 2. Restart pageserver (manually or crash-recovery).
+Take `storage_dest` from the pageserver config, start the pageserver from the latest snapshot in `storage_dest`.
+Push snapshots to `storage_dest` in the background.
+
+```
+zenith start
+```
+
+#### 3. Import.
+Start the pageserver from an existing snapshot.
+The path to the snapshot is provided via `--snapshot_path=FILE_PREFIX | S3_PREFIX | ...`
+Do not save `snapshot_path` and `snapshot_format` in the config, as it is a one-time operation.
+Save `storage_dest` parameters in the config.
+Push snapshots to `storage_dest` in the background.
+```
+//I.e. we want to start zenith on top of an existing $PGDATA and use s3 as the persistent storage.
+zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX
+zenith start
+```
+How do we pass the credentials needed for `snapshot_path`?
+
+#### 4. Export.
+Manually push a snapshot to `snapshot_path`, which differs from `storage_dest`.
+Optionally set `snapshot_format`, which can be plain pgdata format or zenith format.
+```
+zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
+```
+
+#### Notes and questions
+- walkeeper s3_offload should use the same (or similar) syntax for storage. How do we set it in the UI?
+- Why do we need `zenith init` as a separate command? Can't we init everything at first start?
+- We can think of better names for all options.
+- Export to plain postgres format will be useless if we are not 100% compatible on the page level.
+I can recall at least one such difference - PD_WAL_LOGGED flag in pages. \ No newline at end of file diff --git a/docs/rfcs/009-snapshot-first-storage-pitr.md b/docs/rfcs/009-snapshot-first-storage-pitr.md new file mode 100644 index 0000000000..801613e2c9 --- /dev/null +++ b/docs/rfcs/009-snapshot-first-storage-pitr.md @@ -0,0 +1,227 @@ +# Preface + +GetPage@LSN can be called with older LSNs, and the page server needs +to be able to reconstruct older page versions. That's needed for +having read-only replicas that lag behind the primary, or that are +"anchored" at an older LSN, and internally in the page server whne you +branch at an older point in time. How do you do that? + +For now, I'm not considering incremental snapshots at all. I don't +think that changes things. So whenever you create a snapshot or a +snapshot file, it contains an image of all the pages, there is no need +to look at an older snapshot file. + +Also, I'm imagining that this works on a per-relation basis, so that +each snapshot file contains data for one relation. A "relation" is a +fuzzy concept - it could actually be one 1 GB relation segment. Or it +could include all the different "forks" of a relation, or you could +treat each fork as a separate relation for storage purpose. And once +we have the "non-relational" work is finished, a "relation" could +actually mean some other versioned object kept in the PostgreSQL data +directory. Let's ignore that for now. + +# Eric's RFC: + +Every now and then, you create a "snapshot". It means that you create +a new snapshot file for each relation that was modified after the last +snapshot, and write out the contents the relation as it is/was at the +snapshot LSN. Write-ahead log is stored separately in S3 by the WAL +safekeeping service, in the original PostgreSQL WAL file format. + + SNAPSHOT @100 WAL + . | + . | + . | + . | + SNAPSHOT @200 | + . | + . | + . | + . | + SNAPSHOT @300 | + . | + . V + IN-MEMORY @400 + +If a GetPage@LSN request comes from the primary, you return the latest +page from the in-memory layer. If there is no trace of the page in +memory, it means that it hasn't been modified since the last snapshot, +so you return the page from the latest snapshot, at LSN 300 in the +above example. + +PITR is implemented using the original WAL files: + +If a GetPage@LSN request comes from a read replica with LSN 250, you +read the image of the page from the snapshot at LSN 200, and you also +scan the WAL between 200 and 250, and apply all WAL records for the +requested page, to reconstruct it at LSN 250. + +Scanning the WAL naively for every GetPage@LSN request would be +expensive, so in practice you'd construct an in-memory data structure +of all the WAL between 200 and 250 once that allows quickly looking up +records for a given page. + +## Problems/questions + +I think you'll need to store the list of snapshot LSNs on each +timeline somewhere. + +If the latest snapshot of a relation is at LSN 100, and you request a +page at LSN 1000000, how do you know if there are some modifications +to it between 100 and 1000000 that you need to replay? You can scan +all the WAL between 100 and 1000000, but that would be expensive. + +You can skip that, if you know that a snapshot was taken e.g. at LSN +999900. Then you know that the fact that there is no snapshot file at +999900 means that the relation hasn't been modified between +100-999900. Then you only need to scan the WAL between 999900 and +1000000. 
However, there is no trace of a snapshot happening at LSN +999900 in the snapshot file for this relation, so you need to get +that information from somewhere else. + +Where do you get that information from? Perhaps you can scan all the +other relations, and if you see a snapshot file for *any* relation at +LSN 999900, you know that if there were modifications to this +relation, there would be a newer snapshot file for it, too. In other +words, the list of snapshots that have been taken can be constructed +by scanning all relations and computing the union of all snapshot LSNs +that you see for any relation. But that's expensive so at least you +should keep that in memory, after computing it once. Also, if you rely +on that, it's not possible to have snapshots at different intervals +for different files. That seems limiting. + +Another option is to explicitly store a list of snapshot LSNs in a +separate metadata file. + + +# Current implementation in the 'layered_repo' branch: + +We store snapshot files like in the RFC, but each snapshot file also +contains all the WAL in the range of LSNs, so that you don't need to +fetch the WAL separately from S3. So you have "layers" like this: + + SNAPSHOT+WAL 100-200 + | + | + | + | + SNAPSHOT+WAL 200-300 + | + | + | + | + IN-MEMORY 300- + +Each "snapshot+WAL" is a file that contains a snapshot - i.e. full +copy of each page in the relation, at the *start* LSN. In addition to +that, it contains all the WAL applicable to the relation from the +start LSN to the end LSN. With that, you can reconstruct any page +version in the range that the file covers. + + +## Problems/questions + +I can see one potential performance issue here, compared to the RFC. +Let's focus on a single relation for now. Imagine that you start from +an empty relation, and you receive WAL from 100 to 200, containing +a bunch of inserts and updates to the relation. You now have all that +WAL in memory: + + memory: WAL from 100-200 + +We decide that it's time to materialize that to a snapshot file on +disk. We materialize full image of the relation as it was at LSN 100 +to the snapshot file, and include all of the WAL. Since the relation +was initially empty, the "image" at the beginning of th range is empty +too. + +So now you have one file on on disk: + + SNAPSHOT+WAL 100-200 + +It contains a full image of the relation at LSN 100 and all WAL +between 100-200. (It's actually stored as a serialized BTreeMap of +page versions, with the page images and WAL records all stored +together in the same BtreeMap. But for this story, that's not +important.) + +We now receive more WAL updating the relation, up to LSN 300. We +decide it's time to materialize a new snapshot file, and we now have +two files: + + SNAPSHOT+WAL 100-200 + SNAPSHOT+WAL 200-300 + +Note that the latest "full snapshot" that we store on disk always lags +behind by one snapshot cycle. The first file contains a full image of +the relation at LSN 100, the second at LSN 200. When we have received +WAL up to LSN 300, we write a materialized image at LSN 200. That +seems a bit silly. In the design per your RFC, you would write a +snapshots at LSNs 200 and 300, instead. That seems better. + + + +# Third option (not implemented yet) + +Store snapshot files like in the RFC, but also store per-relation +WAL files that contain WAL in a range of LSNs for that relation. + + SNAPSHOT @100 WAL 100-200 + . | + . | + . | + . | + SNAPSHOT @200 WAL 200-300 + . | + . | + . | + . | + SNAPSHOT @300 + . + . 
+ IN-MEMORY 300- + + +This could be the best of both worlds. The snapshot files would be +independent of the PostgreSQL WAL format. When it's time to write +snapshot file @300, you write a full image of the relation at LSN 300, +and you write the WAL that you had accumulated between 200 and 300 to +a separate file. That way, you don't "lag behind" for one snapshot +cycle like in the current implementation. But you still have the WAL +for a particular relation readily available alongside the snapshot +files, and you don't need to track what snapshot LSNs exist +separately. + +(If we wanted to minize the number of files, you could include the +snapshot @300 and the WAL between 200 and 300 in the same file, but I +feel it's probably better to keep them separate) + + + +# Further thoughts + +There's no fundamental reason why the LSNs of the snapshot files and the +ranges of the WAL files would need to line up. So this would be possible +too: + + SNAPSHOT @100 WAL 100-150 + . | + . | + . WAL 150-250 + . | + SNAPSHOT @200 | + . | + . WAL 250-400 + . | + . | + SNAPSHOT @300 | + . | + . | + IN-MEMORY 300- + +I'm not sure what the benefit of this would be. You could materialize +additional snapshot files in the middle of a range covered by a WAL +file, maybe? Might be useful to speed up access when you create a new +branch in the middle of an LSN range or if there's some other reason +to believe that a particular LSN is "interesting" and there will be +a lot of requests using it. diff --git a/docs/rfcs/009-snapshot-first-storage.md b/docs/rfcs/009-snapshot-first-storage.md new file mode 100644 index 0000000000..aeef54898a --- /dev/null +++ b/docs/rfcs/009-snapshot-first-storage.md @@ -0,0 +1,148 @@ +# Snapshot-first storage architecture + +Goals: +- Long-term storage of database pages. +- Easy snapshots; simple snapshot and branch management. +- Allow cloud-based snapshot/branch management. +- Allow cloud-centric branching; decouple branch state from running pageserver. +- Allow customer ownership of data via s3 permissions. +- Provide same or better performance for typical workloads, vs plain postgres. + +Non-goals: +- Service database reads from s3 (reads should be serviced from the pageserver cache). +- Keep every version of every page / Implement point-in-time recovery (possibly a future paid feature, based on WAL replay from an existing snapshot). + +## Principle of operation + +The database “lives in s3”. This means that all of the long term page storage is in s3, and the “live database”-- the version that lives in the pageserver-- is a set of “dirty pages” that haven’t yet been written back to s3. + +In practice, this is mostly similar to storing frequent snapshots to s3 of a database that lives primarily elsewhere. + +The main difference is that s3 is authoritative about which branches exist; pageservers consume branches, snapshots, and related metadata by reading them from s3. This allows cloud-based management of branches and snapshots, regardless of whether a pageserver is running or not. + +It’s expected that a pageserver should keep a copy of all pages, to shield users from s3 latency. A cheap/slow pageserver that falls back to s3 for some reads would be possible, but doesn’t seem very useful right now. 
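+
+A minimal sketch of the read path this implies, with made-up types (this is
+not the actual pageserver code): dirty pages are the live database, the local
+clean copy serves most other reads, and s3 is only the slow, authoritative
+fallback.
+
+```rust
+use std::collections::HashMap;
+
+type PageId = (u32, u32); // simplified stand-in for a real buffer tag: (relation, block)
+type Page = Vec<u8>;
+
+struct PageServerCache {
+    dirty: HashMap<PageId, Page>, // the "live database": not yet written back to s3
+    clean: HashMap<PageId, Page>, // local copies of pages already captured in snapshots
+}
+
+impl PageServerCache {
+    fn get_page(&self, id: &PageId) -> Option<&Page> {
+        // Serve from dirty pages first, then from the local clean copy.
+        // A miss here would mean falling back to the latest snapshot object
+        // in s3, which would work but is expected to be too slow to rely on.
+        self.dirty.get(id).or_else(|| self.clean.get(id))
+    }
+}
+
+fn main() {
+    let mut cache = PageServerCache { dirty: HashMap::new(), clean: HashMap::new() };
+    cache.clean.insert((1, 0), vec![0u8; 8192]);
+    assert!(cache.get_page(&(1, 0)).is_some());
+    assert!(cache.get_page(&(1, 1)).is_none()); // would have to go to s3
+}
+```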
+ +Because s3 keeps all history, and the safekeeper(s) preserve any WAL records needed to reconstruct the most recent changes, the pageserver can store dirty pages in RAM or using non-durable local storage; this should allow very good write performance, since there is no need for fsync or journaling. + +Objects in s3 are immutable snapshots, never to be modified once written (only deleted). + +Objects in s3 are files, each containing a set of pages for some branch/relation/segment as of a specific time (LSN). A snapshot could be complete (meaning it has a copy of every page), or it could be incremental (containing only the pages that were modified since the previous snapshot). It’s expected that most snapshots are incremental to keep storage costs low. + +It’s expected that the pageserver would upload new snapshot objects frequently, e.g. somewhere between 30 seconds and 15 minutes, depending on cost/performance balance. + +No-longer needed snapshots can be “squashed”-- meaning snapshot N and snapshot N+1 can be read by some cloud agent software, which writes out a new object containing the combined set of pages (keeping only the newest version of each page) and then deletes the original snapshots. + +A pageserver only needs to store the set of pages needed to satisfy operations in flight: if a snapshot is still being written, the pageserver needs to hold historical pages so that snapshot captures a consistent moment in time (similar to what is needed to satisfy a slow replica). + +WAL records can be discarded once a snapshot has been stored to s3. (Unless we want to keep them longer as part of a point-in-time recovery feature.) + +## Pageserver operation + +To start a pageserver from a stored snapshot, the pageserver downloads a set of snapshots sufficient to start handling requests. We assume this includes the latest copy of every page, though it might be possible to start handling requests early, and retrieve pages for the first time only when needed. + +To halt a pageserver, one final snapshot should be written containing all pending WAL updates; then the pageserver and safekeepers can shut down. + +It’s assumed there is some cloud management service that ensures only one pageserver is active and servicing writes to a given branch. + +The pageserver needs to be able to track whether a given page has been modified since the last snapshot, and should be able to produce the set of dirty pages efficiently to create a new snapshot. + +The pageserver need only store pages that are “reachable” from a particular LSN. For example, a page may be written four times, at LSN 100, 200, 300, and 400. If no snapshot is being created when LSN 200 is written, the page at LSN 100 can be discarded. If a snapshot is triggered when the pageserver is at LSN 299, the pageserver must preserve the page from LSN 200 until that snapshot is complete. As before, the page at LSN 300 can be discarded when the LSN 400 pages is written (regardless of whether the LSN 200 snapshot has completed.) + +If the pageserver is servicing multiple branches, those branches may contain common history. While it would be possible to serve branches with zero knowledge of their common history, a pageserver could save a lot of space using an awareness of branch history to share the common set of pages. Computing the “liveness” of a historical page may be tricky in the face of multiple branches. 
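+
+The retention rule in the example above can be stated compactly. The helper
+below is hypothetical -- just a restatement of the rule in code, not the
+pageserver's tracking mechanism:
+
+```rust
+/// A page version written at `lsn` can be dropped as soon as a newer version
+/// at `next_lsn` exists, unless an in-progress snapshot LSN falls in between
+/// (that snapshot still needs the older version).
+fn can_discard(lsn: u64, next_lsn: u64, snapshots_in_progress: &[u64]) -> bool {
+    !snapshots_in_progress
+        .iter()
+        .any(|&snap| lsn <= snap && snap < next_lsn)
+}
+
+fn main() {
+    // No snapshot in progress: the version at 100 dies once 200 is written.
+    assert!(can_discard(100, 200, &[]));
+    // Snapshot triggered at LSN 299: the version at 200 must be kept until
+    // that snapshot completes...
+    assert!(!can_discard(200, 300, &[299]));
+    // ...but the version at 300 can still be dropped once 400 is written.
+    assert!(can_discard(300, 400, &[299]));
+}
+```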
+ +The pageserver may store dirty pages to memory or to local block storage; any local block storage format is only temporary “overflow” storage, and is not expected to be readable by future software versions. + +The pageserver may store clean pages (those that are captured in a snapshot) any way it likes: in memory, in a local filesystem (possibly keeping a local copy of the snapshot file), or using some custom storage format. Reading pages from s3 would be functional, but is expected to be prohibitively slow. + +The mechanism for recovery after a pageserver failure is WAL redo. If we find that too slow in some situations (e.g. write-heavy workload causes long startup), we can write more frequent snapshots to keep the number of outstanding WAL records low. If that’s still not good enough, we could look at other options (e.g. redundant pageserver or an EBS page journal). + +A read-only pageserver is possible; such a pageserver could be a read-only cache of a specific snapshot, or could auto-update to the latest snapshot on some branch. Either way, no safekeeper is required. Multiple read-only pageservers could exist for a single branch or snapshot. + +## Cloud snapshot manager operation + +Cloud software may wish to do the following operations (commanded by a user, or based on some pre-programmed policy or other cloud agent): +Create/delete/clone/rename a database +Create a new branch (possibly from a historical snapshot) +Start/stop the pageserver/safekeeper on a branch +List databases/branches/snapshots that are visible to this user account + +Some metadata operations (e.g. list branches/snapshots of a particular db) could be performed by scanning the contents of a bucket and inspecting the file headers of each snapshot object. This might not be fast enough; it might be necessary to build a metadata service that can respond more quickly to some queries. + +This is especially true if there are public databases: there may be many thousands of buckets that are public, and scanning all of them is not a practical strategy for answering metadata queries. + +## Snapshot names, deletion and concurrency + +There may be race conditions between operations-- in particular, a “squash” operation may replace two snapshot objects (A, B) with some combined object (C). Since C is logically equivalent to B, anything that attempts to access B should be able to seamlessly switch over to C. It’s assumed that concurrent delete won’t disrupt a read in flight, but it may be possible for some process to read B’s header, and then discover on the next operation that B is gone. + +For this reason, any attempted read should attempt a fallback procedure (list objects; search list for an equivalent object) if an attempted read fails. This requires a predictable naming scheme, e.g. `XXXX_YYYY_ZZZZ_DDDD`, where `XXXX` is the branch unique id, and `YYYY` and `ZZZZ` are the starting/ending LSN values. `DDDD` is a timestamp indicating when the object was created; this is used to disambiguate a series of empty snapshots, or to help a snapshot policy engine understand which snapshots should be kept or discarded. + +## Branching + +A user may request a new branch from the cloud user interface. There is a sequence of things that needs to happen: +- If the branch is supposed to be based on the latest contents, the pageserver should perform an immediate snapshot. This is the parent snapshot for the new branch. 
+- Cloud software should create the new branch, by generating a new (random) unique branch identifier, and creating a placeholder snapshot object. + - The placeholder object is an empty snapshot containing only metadata (which anchors it to the right parent history) and no pages. + - The placeholder can be discarded when the first snapshot (containing data) is completed. Discarding is equivalent to squashing, when the snapshot contains no data. +- If the branch needs to be started immediately, a pageserver should be notified that it needs to start servicing the branch. This may not be the same pageserver that services the parent branch, though the common history may make it the best choice. + +Some of these steps could be combined into the pageserver, but that process would not be possible under all cases (e.g. if no pageserver is currently running, or if the branch is based on an older snapshot, or if a different pageserver will be serving the new branch). Regardless of which software drives the process, the result should look the same. + +## Long-term file format + +Snapshot files (and any other object stored in s3) must be readable by future software versions. + +It should be possible to build multiple tools (in addition to the pageserver) that can read and write this file format-- for example, to allow cloud snapshot management. + +Files should contain the following metadata, in addition to the set of pages: +- The version of the file format. +- A unique identifier for this branch (should be worldwide-unique and unchanging). +- Optionally, any human-readable names assigned to this branch (for management UI/debugging/logging). +- For incremental snapshots, the identifier of the predecessor snapshot. For new branches, this will be the parent snapshot (the point at which history diverges). +- The location of the predecessor branch snapshot, if different from this branch’s location. +- The LSN range `(parent, latest]` for this snapshot. For complete snapshots, the parent LSN can be 0. +- The UTC timestamp of the snapshot creation (which may be different from the time of its highest LSN, if the database is idle). +- A SHA2 checksum over the entire file (excluding the checksum itself), to preserve file integrity. + +A file may contain no pages, and an empty LSN range (probably `(latest, latest]`?), which serves as a placeholder for either a newly-created branch, or a snapshot of an idle database. + +Any human-readable names stored in the file may fall out of date if database/branch renames are allowed; there may need to be a cloud metadata service to query (current name -> unique identifier). We may choose instead to not store human-readable names in the database, or treat them as debugging information only. + +## S3 semantics, and other kinds of storage + +For development and testing, it may be easier to use other kinds of storage in place of s3. For example, a directory full of files can substitute for an s3 bucket with multiple objects. This mode is expected to match the s3 semantics (e.g. don’t edit existing files or use symlinks). Unit tests may omit files entirely and use an in-memory mock bucket. + +Some users may want to use a local or network filesystem in place of s3. This isn’t prohibited but it’s not a priority, either. + +Alternate implementations of s3 should be supported, including Google Cloud Storage. + +Azure Blob Storage should be supported. We assume (without evidence) that it’s semantically equivalent to s3 for this purpose. 
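+
+A rough sketch of what an abstraction over these interchangeable backends (s3,
+Google Cloud Storage, Azure Blob Storage, a plain directory, an in-memory mock
+bucket) might look like, mirroring approximately the operations listed below.
+The trait and its names are illustrative, not an actual pageserver interface:
+
+```rust
+use std::io::{self, Read, Write};
+
+trait SnapshotStore {
+    type Reader: Read;
+    type Writer: Write;
+
+    /// List object names under a prefix (e.g. all snapshots of one branch).
+    fn list(&self, prefix: &str) -> io::Result<Vec<String>>;
+    /// Streaming read of an entire object.
+    fn read(&self, name: &str) -> io::Result<Self::Reader>;
+    /// Read a byte range, e.g. just the header/ToC of a snapshot file.
+    fn read_range(&self, name: &str, offset: u64, len: u64) -> io::Result<Vec<u8>>;
+    /// Streaming write of a new, immutable object (never modified in place).
+    fn create(&self, name: &str) -> io::Result<Self::Writer>;
+    /// Delete an object; this must not disrupt an already-started read.
+    fn delete(&self, name: &str) -> io::Result<()>;
+}
+```
+
+A directory-backed implementation for development and an in-memory mock for
+unit tests would both fit behind the same interface, matching the substitution
+described above.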
+ +The properties of s3 that we depend on are: +list objects +streaming read of entire object +read byte range from object +streaming write new object (may use multipart upload for better relialibity) +delete object (that should not disrupt an already-started read). + +Uploaded files, restored backups, or s3 buckets controlled by users could contain malicious content. We should always validate that objects contain the content they’re supposed to. Incorrect, Corrupt or malicious-looking contents should cause software (cloud tools, pageserver) to fail gracefully. + +## Notes + +Possible simplifications, for a first draft implementation: +- Assume that dirty pages fit in pageserver RAM. Can use kernel virtual memory to page out to disk if needed. Can improve this later. +- Don’t worry about the details of the squashing process yet. +- Don’t implement cloud metadata service; try to make everything work using basic s3 list-objects and reads. +- Don’t implement rename, delete at first. +- Don’t implement public/private, just use s3 permissions. +- Don’t worry about sharing history yet-- each user has their own bucket and a full copy of all data. +- Don’t worry about history that spans multiple buckets. +- Don’t worry about s3 regions. +- Don’t support user-writeable s3 buckets; users get only read-only access at most. + +Open questions: +- How important is point-in-time recovery? When should we add this? How should it work? +- Should snapshot files use compression? +- Should we use snapshots for async replication? A spare pageserver could stay mostly warmed up by consuming snapshots as they’re created. +- Should manual snapshots, or snapshots triggered by branch creation, be named differently from snapshots that are triggered by a snapshot policy? +- When a new branch is created, should it always be served by the same pageserver that owns its parent branch? When should we start a new pageserver? +- How can pageserver software upgrade be done with minimal downtime? diff --git a/docs/rfcs/010-storage_details.md b/docs/rfcs/010-storage_details.md new file mode 100644 index 0000000000..8429a2d9e3 --- /dev/null +++ b/docs/rfcs/010-storage_details.md @@ -0,0 +1,144 @@ +# Storage details + +Here I tried to describe the current state of thinking about our storage subsystem as I understand it. Feel free to correct me. Also, I tried to address items from Heikki's TODO and be specific on some of the details. + +## Overview + +![storage](images/storage.jpeg) + +### MemStore + +MemStore holds the data between `latest_snapshot_lsn` and `latest_lsn`. It consists of PageIndex that holds references to WAL records or pages, PageStore that stores recently materialized pages, and WalStore that stores recently received WAL. + +### PageIndex + +PageIndex is an ordered collection that maps `(BufferTag, LSN)` to one of the following references (by reference I mean some information that is needed to access that data, e.g. file_id and offset): + +* PageStoreRef -- page offset in the PageStore +* LocalStoreRef -- snapshot_id and page offset inside of that snapshot +* WalStoreRef -- offset (and size optionally) of WalRecord in WalStore + +PageIndex holds information about all the pages in all incremental snapshots and in the latest full snapshot. If we aren't using page compression inside snapshots we actually can avoid storing references to the full snapshot and calculate page offsets based on relation sizes metadata in the full snapshot (assuming that full snapshot stores pages sorted by page number). 
However, I would suggest embracing page compression from the beginning and treating all pages as variable-sized.
+
+We assume that PageIndex is a few orders of magnitude smaller than the addressed data, hence it should fit in memory. We also don't care about crash tolerance, as we can rebuild it from snapshot metadata and WAL records from WalStore and/or the Safekeeper.
+
+### WalStore
+
+WalStore is a queue of recent WalRecords. I imagine that we can store recent WAL the same way as Postgres does -- as 16MB files on disk. On top of that, we can add some fixed-size cache that would keep some amount of segments in memory.
+
+For now, we may rely on the Safekeeper to safely store that recent WAL. But generally, I think we can pack all S3 operations into the page server so that it would also be responsible for the recent WAL pushdown to S3 (and the Safekeeper may just delete WAL that was confirmed as S3-durable by the page server).
+
+### PageStore
+
+PageStore is storage for recently materialized pages (or, in other words, a cache of getPage results). It can also be implemented as a file-based queue with some memory cache on top of it.
+
+There are a few possible options for PageStore:
+
+a) we just add all recently materialized pages there (so several versions of the same page can be stored there) -- that is more or less how it happens now with the current RocksDB implementation.
+
+b) overwrite older pages with the newer pages -- if there is no replica we probably don't need older pages. During a page overwrite, we would also need to change the PageStoreRef back to a WalStoreRef in PageIndex.
+
+I imagine that newly created pages would just be added to the back of PageStore (again in queue-like fashion), and this way there wouldn't be any meaningful ordering inside of that queue. When we are forming a new incremental snapshot we may prohibit any updates to the current set of pages in PageStore (giving up on the single-page-version rule) and cut off that whole set when snapshot creation is complete.
+
+With option b) we can also treat PageStore as an incomplete incremental snapshot.
+
+### LocalStore
+
+LocalStore keeps the latest full snapshot and a set of incremental snapshots on top of it. We add new snapshots when the number of changed pages grows bigger than a certain threshold.
+
+## Granularity
+
+By granularity, I mean the set of pages that goes into a certain full snapshot. The following things should be taken into account:
+
+* can we shard big databases between page servers?
+* how much time will we spend applying WAL to access certain pages with older LSNs?
+* how many files do we create for a single database?
+
+I can think of the following options here:
+
+1. whole database goes to one full snapshot.
+   * +: we never create a lot of files for one database
+   * +: the approach is quite straightforward, moving data around is simple
+   * -: cannot be sharded
+   * -: long recovery -- we always need to recover the whole database
+2. table segment is the unit of snapshotting
+   * +: straightforward for sharding
+   * +: an individual segment can be quickly recovered with sliced WAL
+   * -: a full snapshot can be really small (e.g. when the corresponding segment consists of a single page) and we can blow up the number of files. Then we would spend an eternity in directory scans, and the amount of metadata for sharding can also be quite big.
+3. range-partitioned snapshots -- a snapshot includes all pages between [BuffTagLo, BuffTagHi], mixing different relations, databases, and potentially clusters (albeit from one tenant only).
When full snapshot outgrows a certain limit (could be also a few gigabytes) we split the snapshot in two during the next full snapshot write. That approach would also require pages sorted by BuffTag inside our snapshots. + * +: addresses all mentioned issues + * -: harder to implement + +I think it is okay to start with table segments granularity and just check how we will perform in cases of lots of small tables and check is there any way besides c) to deal with it. + +Both PageStore and WalStore should be "sharded" by this granularity level. + +## Security + +We can generate different IAM keys for each tenant and potentially share them with users (in read-only mode?) or even allow users to provide their S3 buckets credentials. + +Also, S3 backups are usually encrypted by per-tenant privates keys. I'm not sure in what threat model such encryption would improve something (taking into account per-tenant IAM keys), but it seems that everybody is doing that (both AMZN and YNDX). Most likely that comes as a requirement about "cold backups" by some certification procedure. + +## Dynamics + +### WAL stream handling + +When a new WAL record is received we need to parse BufferTags in that record and insert them in PageIndex with WalStoreRef as a value. + +### getPage queries + +Look up the page in PageIndex. If the value is a page reference then just respond with that page. If the referenced value is WAL record then find the most recent page with the same BuffTag (that is why we need ordering in PageIndex); recover it by applying WAL records; save it in PageStore; respond with that page. + +### Starting page server without local data + +* build set of latest full snapshots and incremental snapshots on top of them +* load all their metadata into PageIndex +* Safekeeper should connect soon and we can ask for a WAL stream starting from the latest incremental snapshot +* for databases that are connected to us through the Safekeeper we can start loading the set of the latest snapshots or we can do that lazily based on getPage request (I'd better avoid doing that lazily for now without some access stats from the previous run and just transfer all data for active database from S3 to LocalStore). + +### Starting page server with local data (aka restart or reboot) + +* check that local snapshot files are consistent with S3 + +### Snapshot creation + +Track size of future snapshots based on info in MemStore and when it exceeds some threshold (taking into account our granularity level) create a new incremental snapshot. Always emit incremental snapshots from MemStore. + +To create a new snapshot we need to walk through WalStore to get the list of all changed pages, sort it, and get the latest versions of that pages from PageStore or by WAL replay. It makes sense to maintain that set in memory while we are receiving the WAL stream to avoid parsing WAL during snapshot creation. + +Full snapshot creation can be done by GC (or we can call that entity differently -- e.g. merger?) by merging the previous full snapshot with several incremental snapshots. + +### S3 pushdown + +When we have several full snapshots GC can push the old one with its increments to S3. + +### Branch creation + +Create a new timeline and replay sliced WAL up to a requested point. When the page is not in PageIndex ask the parent timeline about a page. Relation sizes are tricky. + +## File formats + +As far as I understand Bookfile/Aversion addresses versioning and serialization parts. 
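+
+To make the list that follows concrete, here is a rough sketch of such a
+snapshot file header/ToC as a Rust struct. The types and field names are
+hypothetical, not the actual Bookfile format:
+
+```rust
+/// Simplified stand-in for a buffer tag identifying one page.
+struct BufTag {
+    rel: u32,   // relation (or segment) identifier
+    blkno: u32, // block number within the relation
+}
+
+/// Everything a reader needs before touching the payload. Keeping this whole
+/// header at the start of the file is what allows slurping the metadata with
+/// a single read, as discussed in point 2) below.
+struct SnapshotHeader {
+    format_version: u32,
+    // Free-form key/values, e.g. "compression" => "lz4", "kind" => "incremental",
+    // "prev" => previous snapshot id.
+    options: Vec<(String, String)>,
+    // ToC: file offset of each page image.
+    page_toc: Vec<(BufTag, u64)>,
+    // ToC: file offset of each WAL record, keyed by (BufTag, LSN).
+    wal_toc: Vec<((BufTag, u64), u64)>,
+}
+```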
+
+As for the exact data that should go into snapshots, I think it is the following for each snapshot:
+
+* format version number
+* set of key/values to interpret the content (e.g. is page compression enabled, is that a full or incremental snapshot, previous snapshot id, is there WAL at the end of the file, etc.) -- it is up to a reader to decide what to do if some keys are missing or some unknown keys are present. If we add something backward compatible to the file we can keep the version number.
+* array of [BuffTag, corresponding offset in file] for pages -- IIUC that is analogous to the ToC in Bookfile
+* array of [(BuffTag, LSN), corresponding offset in file] for the WAL records
+* pages, one by one
+* WAL records, one by one
+
+It is also important to be able to load metadata quickly, since it would be one of the main factors impacting the time of page server start. E.g. if we store/cache about 10TB of data per page server, the size of uncompressed page references would be about 30GB (10TB / ( 8192 bytes page size / ( ~18 bytes per ObjectTag + 8 bytes offset in the file))).
+
+1) Since our ToC/array of entries can be sorted by ObjectTag, we can store the whole BufferTag only when relation_id changes and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset deltas would be small).
+2) It makes sense to keep the ToC at the beginning of the file to avoid extra seeks to locate it. This doesn't matter too much with local files but matters on S3 -- if we are accessing a lot of ~1GB files with ~1MB of metadata each, then the time to transfer this metadata would be comparable with the access latency itself (which is about half a second). So by slurping the metadata with one read of the file header instead of N reads, we can improve the speed of page server start by this factor of N.
+
+I think both of these optimizations can be done later, but that is something to keep in mind when we are designing our storage serialization routines.
+
+Also, there were some discussions about how to embed WAL in incremental snapshots. So far the following ideas were mentioned:
+1. snapshot lsn=200, includes WAL in range 200-300
+2. snapshot lsn=200, includes WAL in range 100-200
+3. data snapshots are separated from WAL snapshots
+
+Both options 2 and 3 look good. I'm inclined towards option 3, as it would allow us to apply different S3 pushdown strategies for data and WAL files (e.g. we may keep data snapshots until the next full snapshot, but we may push WAL snapshots to S3 as soon as they appear if there are no replicas).
diff --git a/docs/rfcs/011-retention-policy.md b/docs/rfcs/011-retention-policy.md
new file mode 100644
index 0000000000..fde36c8108
--- /dev/null
+++ b/docs/rfcs/011-retention-policy.md
@@ -0,0 +1,91 @@
+# User-visible timeline history
+
+The user can specify a retention policy. The retention policy is
+presented to the user as a PITR period and snapshots. The PITR period
+is the amount of recent history that needs to be retained, as minutes,
+hours, or days. Within that period, you can create a branch or
+snapshot at any point in time, open a compute node, and start running
+queries. Internally, a PITR period is represented as a range of LSNs.
+
+The user can also create snapshots. A snapshot is a point in time,
+internally represented by an LSN. The user gives the snapshot a name.
+
+The user can also specify an interval at which the system creates
+snapshots automatically. For example, create a snapshot every night at
+2 AM. After some user-specified time, old automatically created
+snapshots are removed.
+
+                 Snapshot      Snapshot
+   PITR          "Monday"      "Tuesday"         PITR
+ ----######----------+-------------+-------------######>
+
+If there are multiple branches, you can specify different policies for
+different branches.
+
+The PITR period and user-visible snapshots together define the
+retention policy.
+
+NOTE: As presented here, this is probably overly flexible. In reality,
+we want to keep the user interface simple. Only allow a PITR period at
+the tip of a branch, for example. But that doesn't make much
+difference to the internals.
+
+
+# Retention policy behind the scenes
+
+The retention policy consists of points (for snapshots) and ranges
+(for PITR periods).
+
+The system must be able to reconstruct any page within the retention
+policy. Other page versions can be garbage collected away. We have a
+lot of flexibility on when to perform the garbage collection and how
+aggressive it is.
+
+
+# Base images and WAL slices
+
+The page versions are stored in two kinds of files: base images and
+WAL slices. A base image contains a dump of all the pages of one
+relation at a specific LSN. A WAL slice contains all the WAL in an LSN
+range.
+
+    |
+    |
+    |
+    | --Base img @100
+    |     |
+    |     |  WAL slice
+    |     |  100-200
+    |     |
+    | --Base img @200
+    |     |
+    |     |  WAL slice
+    |     |  200-300
+    |     |
+    |
+    |
+    V
+
+To recover a page e.g. at LSN 150, you need the base image at LSN 100,
+and the WAL slice 100-200.
+
+All of this works on a per-relation or per-relation-segment basis. If
+a relation is updated very frequently, we create base images and WAL
+slices for it more quickly. For a relation that's updated
+infrequently, we hold the recent WAL for that relation longer, and
+only write it out when we need to release the disk space occupied by
+the original WAL. (We need a backstop like that, because until all the
+WAL/base images have been durably copied to S3, we must keep the
+original WAL for that period somewhere, in the WAL service or in S3.)
+
+
+# Branching
+
+Internally, branch points are also "retention points", in addition to
+the user-visible snapshots. If a branch has been forked off at LSN
+100, we need to be able to reconstruct any page on the parent branch
+at that LSN, because it is needed by the child branch. If a page is
+modified in the child, we don't need to keep that page in the parent
+anymore, though.
diff --git a/docs/rfcs/012-background-tasks.md b/docs/rfcs/012-background-tasks.md
new file mode 100644
index 0000000000..8692b187e6
--- /dev/null
+++ b/docs/rfcs/012-background-tasks.md
@@ -0,0 +1,38 @@
+# Eviction
+
+Write out an in-memory layer to disk, into a delta layer.
+
+- To release memory
+- To make it possible to advance disk_consistent_lsn and allow the WAL
+  service to release some WAL.
+
+- Triggered if we are short on memory
+- Or if the oldest in-memory layer is so old that it's holding back
+  the WAL service from removing old WAL
+
+# Materialization
+
+Create a new image layer of a segment, by performing WAL redo.
+
+- To reduce the amount of WAL that needs to be replayed on a GetPage request.
+- To allow garbage collection of old layers
+
+- Triggered by distance to the last full image of a page
+
+# Coalescing
+
+Replace N consecutive layers of a segment with one larger layer.
+
+- To reduce the number of small files that need to be uploaded to S3
+
+
+# Bundling
+
+Zip together multiple small files belonging to different segments.
+
+- To reduce the number of small files that need to be uploaded to S3
+
+
+# Garbage collection
+
+Remove a layer that's older than the GC horizon and isn't needed anymore.
diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md
new file mode 100644
index 0000000000..0c359028ed
--- /dev/null
+++ b/docs/rfcs/013-term-history.md
@@ -0,0 +1,147 @@
+# What
+
+Currently, apart from WAL, the safekeeper persistently stores only two logical
+clock counter (aka term) values, sourced from the same sequence. The first is
+bumped whenever the safekeeper gives its vote to a proposer (or acknowledges an
+already elected one) and e.g. prevents electing two proposers with the same term
+-- it is actually called `term` in the code. The second, called `epoch`, reflects
+the progress of log receipt and might lag behind `term`; the safekeeper switches
+to epoch `n` when it has received all committed log records from all `< n` terms.
+This roughly corresponds to what is proposed in
+
+https://github.com/zenithdb/rfcs/pull/3/files
+
+
+This is our biggest difference from Raft. In Raft, every log record is
+stamped with the term in which it was generated, while we essentially store in
+`epoch` only the term of the highest record on this safekeeper -- when we know
+it -- because during recovery we generally don't, and `epoch` is bumped directly
+to the term of the proposer who performs the recovery when it is finished. It is
+not immediately obvious that this simplification is safe. I thought and I still
+think it is; model checking confirmed that. However, some details now make me
+believe it is better to keep the full term switching history (which is equivalent
+to knowing the term of each record).
+
+# Why
+
+Without knowing the full history of terms (a list of (term, start LSN) pairs) it
+is hard to determine the exact divergence point, and if we don't perform
+truncation at that point safety becomes questionable. Consider the following
+history, with safekeepers A, B, C, D, E. n.m means a record created by the
+proposer in term n with LSN m; (t=x, e=y) means the safekeeper currently has
+term x and epoch y.
+
+1) P1 in term 1 writes 1.1 everywhere, which is committed, and some more only
+on A.
+
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=1, e=1) 1.1
+D(t=1, e=1) 1.1
+E(t=1, e=1) 1.1
+
+
+2) P2 is elected by CDE in term 2, epochStartLsn is 2, and writes 2.2, 2.3 on CD:
+
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=2, e=2) 1.1 2.2 2.3
+D(t=2, e=2) 1.1 2.2 2.3
+E(t=2, e=1) 1.1
+
+
+
+3) P3 is elected by CDE in term 3, epochStartLsn is 4, and writes 3.4 on D:
+
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=3, e=2) 1.1 2.2 2.3
+D(t=3, e=3) 1.1 2.2 2.3 3.4
+E(t=3, e=1) 1.1
+
+
+
+Now, A gets back and P3 starts recovering it. How should it proceed? There are
+two options.
+
+## Don't try to find divergence point at all
+
+...start sending WAL conservatively from the horizon (1.1), and truncate the
+obsolete part of WAL only when recovery is finished, i.e. epochStartLsn (4) is
+reached, i.e. 2.3 has been transferred -- that's what https://github.com/zenithdb/zenith/pull/505 proposes.
+
+Then the following is possible:
+
+4) P3 moves one record 2.2 to A.
+
+A(t=1, e=1) 1.1 2.2 1.3 1.4
+B(t=1, e=1) 1.1 1.2
+C(t=3, e=2) 1.1 2.2 2.3
+D(t=3, e=3) 1.1 2.2 2.3 3.4
+E(t=3, e=1) 1.1
+
+
+Now the log of A is basically corrupted. Moreover, since ABE are all in epoch 1 and
+A's log is the longest one, they can elect P4, who will commit such a log.
+
+Note that this particular history couldn't happen if we forbade *creating* new
+records in term n until a majority of safekeepers has switched to it. It would force
+CDE to switch to 2 before 2.2 is created, and A could never become the donor while
+its log is corrupted. Generally, with this additional barrier I believe the algorithm
+becomes safe, but
+ - I don't like this kind of artificial barrier;
+ - I also feel somewhat uncomfortable about even temporarily having intentionally
+   corrupted WAL;
+ - I'd still model check the idea.
+
+## Find divergence point and truncate at it
+
+Then step 4 would delete 1.3 and 1.4 on A, and we are ok. The question is, how do we
+do that? Without the term switching history we have to resort to sending again from
+the horizon and memcmp'ing records, which is inefficient and ugly. Or we can
+maintain the full history and determine the truncation point by comparing the
+'wrong' and 'right' histories -- much like pg_rewind does -- and perform truncation
++ start streaming right there.
+
+# Proposal
+
+- Add the term history as an array of (term, start LSN) pairs to the safekeeper
+  controlfile.
+- Return it to the proposer with VoteResponse so that 1) the proposer can tell it to
+  other nodes and 2) determine its personal streaming starting point. However, since
+  we don't append WAL and update the controlfile atomically, let's always update the
+  controlfile first but send only the history of what we really have (up to the
+  highest term in history where begin_lsn >= end of WAL; this highest term replaces
+  the current `epoch`). We also send the end of WAL, as we do now, to determine the
+  donor.
+- Create a ProposerAnnouncement message which the proposer sends before starting
+  streaming. It announces the proposer as elected and
+  1) Truncates the wrong part of WAL on the safekeeper
+     (the divergence point is already calculated at the proposer, but can be
+     cross-verified here).
+  2) Communicates the 'right' history of its term (taken from the donor). It seems
+     better to immediately put the history in the controlfile, even though the
+     safekeeper might not have full WAL for the previous terms in it -- this way is
+     simpler, and we can't update WAL and the controlfile atomically anyway.
+
+  This also constitutes an analogue of the current epoch bump for those safekeepers
+  which don't need recovery, which is important for sync-safekeepers (bump the
+  epoch without waiting for records from the new term).
+- After the ProposerAnnouncement the proposer streams WAL from the calculated
+  starting point -- only what is missing.
+
+
+pros/cons:
++ (more) clear safety of WAL truncation -- we get very close to Raft
++ no unnecessary data sending (faster recovery for not-the-oldest safekeepers;
+  matters only for 5+ nodes)
++ adds some observability at safekeepers
+
+- complexity, but not that much
+
+
+# Misc
+
+- During model checking I did truncation on the first locally non-existent or
+  different record -- an analogue of the 'memcmp' variant described above.
diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md
new file mode 100644
index 0000000000..fdf6885929
--- /dev/null
+++ b/docs/rfcs/README.md
@@ -0,0 +1,95 @@
+This directory contains Request for Comments documents, or RFCs, for
+features or concepts that have been proposed. Alternative names:
+technical design doc, ERD, one-pager.
+
+To make a new proposal, create a new text file in this directory and
+open a Pull Request with it. That gives others a chance and a forum
+to comment on and discuss the design.
+
+When a feature is implemented and the code changes are committed, also
+include the corresponding RFC in this directory.
+
+Some of the RFCs in this directory have been implemented in some form
+or another, while others are on the roadmap, and still others are
+just obsolete and forgotten about. So read them with a grain of salt,
+but hopefully even the ones that don't reflect reality give useful
+context information.
+
+## What
+
+We use Tech Design RFCs to summarize what we are planning to
+implement in our system. These RFCs should be created for large or
+non-obvious technical tasks, e.g. changes to the architecture, bigger
+tasks that could take over a week, or changes that touch multiple
+components or their interaction. RFCs should fit into a couple of
+pages, but could be longer on occasion.
+
+## Why
+
+We’re using RFCs to enable early review and collaboration, reduce
+uncertainty and risk, and save time during the implementation phase
+that follows the Tech Design RFC.
+
+Tech Design RFCs also aim to reduce the bus factor and are an additional
+measure to keep more peers up to date & familiar with our design and
+architecture.
+
+This is a crucial part of ensuring collaboration across timezones and
+setting up for success a distributed team that works on complex
+topics.
+
+## Prior art
+
+- Rust: [https://github.com/rust-lang/rfcs/blob/master/0000-template.md](https://github.com/rust-lang/rfcs/blob/master/0000-template.md)
+- React.js: [https://github.com/reactjs/rfcs/blob/main/0000-template.md](https://github.com/reactjs/rfcs/blob/main/0000-template.md)
+- Google fuchsia: [https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE](https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE)
+- Apache: [https://cwiki.apache.org/confluence/display/GEODE/RFC+Template](https://cwiki.apache.org/confluence/display/GEODE/RFC+Template) / [https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process](https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process)
+
+## How
+
+RFC lifecycle:
+
+- Should be submitted in a pull request, with the full RFC text in a committed markdown file and a copy of the Summary and Motivation sections included in the PR body.
+- The RFC should be published for review before most of the actual code is written. This isn’t a strict rule; don’t hesitate to experiment and build a POC in parallel with writing an RFC.
+- Add labels to the PR in the same manner as you do for Issues. Example TBD.
+- Request review from your peers. Reviewing RFCs from your peers is a priority, the same as reviewing the actual code.
+- The Tech Design RFC should evolve based on the feedback received, and further during the development phase if problems are discovered with the chosen approach.
+- RFCs stop evolving once consensus is found or the proposal is implemented and merged.
+- RFCs are not intended as documentation that’s kept up to date **after** the implementation is finished. Do not update the Tech Design RFC when the merged functionality evolves later on. In such a situation a new RFC may be appropriate.
+
+### RFC template
+
+Note, a lot of the sections are marked as ‘if relevant’. They are included in the template as a reminder and to help inspiration.
+
+```
+# Name
+Created on ..
+Implemented on ..
+
+## Summary
+
+## Motivation
+
+## Non Goals (if relevant)
+
+## Impacted components (e.g.
pageserver, safekeeper, console, etc) + +## Proposed implementation + +### Reliability, failure modes and corner cases (if relevant) + +### Interaction/Sequence diagram (if relevant) + +### Scalability (if relevant) + +### Security implications (if relevant) + +### Unresolved questions (if relevant) + +## Alternative implementation (if relevant) + +## Pros/cons of proposed approaches (if relevant) + +## Definition of Done (if relevant) + +``` diff --git a/docs/rfcs/images/storage.jpeg b/docs/rfcs/images/storage.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..1d72a018dc462a74ad01bb17561c98efd0745bca GIT binary patch literal 431075 zcmeFZcT`i~zb(2$C{jc35SoA>B3+6JHj0Q~p$jOz2-v8G1VMTS6#=D)h|)U>n9vac zL8^d|f}$cwA_`jw;cdRZ^Ugi@-7(&H=lyZ-x#PMWf;oKc6}0XYJh&yCi`3 zw28S10D}Pl4Eh7?&H*O?IMd#-_Z1Eu%&g3N$39k87FPCs931TX*x5O_5L_IbJe=(8 z+2blR- z_+>SYu?n2K!X|fJQ1eb^@jm(Eb)7=z-%}N|+-^j(a|nxwiiz)6R8l^mqOGH=cSPUd z#K}`8re@}+FI=>-wX=6{bocP|^7irdyLl@(Bs45MBIa&vTzo=eQdai8`wwy+=H@*q zd0P6c?D>oG`i91)=9ZVQTD!V?di(m{47?p1pO~DQ{xCC3rYwH?y!7Skw`JPT^p`ueYd{XfP9O#+OG8JbeIy>Y>qLZQEKK4unK4OaeR=h&`X7m(Au zvrq7NW^r97yS&zUs*u}__Z-3s+T{JTy`lXxvj1-bi~j#LvVR}ge;d~t3uekBSnj?0Z7 zs_}T)jIT{|S8jTlWP7|VfjPQZ^7{K;AR?~ph5gs9fQ6bDA4^|9PT{}w@yr)aNwd=2 z^jaVfpm#tYA3XrpIu9L0?ekR&SklY#n*7W!y8iPAlP0WBD<)->Z5~-4-|w^w%o>uK zb7&XNQ-9+Vuj7J41E;Q?9hdu~rO6}~Wpt833&FHmY?uo8qA+)!&IeIvc$C=93VEk!d%;uyoOjp%rsZ%sUs zT1VF+zxFq2w3yM}jK7d}*wJTFY*x`k&Wv$Q@A})?j$XhTtl7TIc()5Mx4R>vu=~Sy zfo)aiyamj&uai6dlVWJw{_)G7(4 zMEla2KNw}yiAcN!?)1&wE?MiJLCYH>Pvbhxu&TJO+MNeRr~G|ye)e$)nUFm{|Zd`-n3uKg$IvJ!}F8T&cE&1$!*za3rvZ#@p zM1QxU&sWVeCR+db1c34OhnC2#oo7!7+cy1i4CF|l)o{R0?yYjx@Y${}*HE7di;EDi zb^-VFxsonq(@8s&c>DYO?gE{0HSA#VUY9NY!`sE;RVFU>WRHtKuV z-=r9ZA6yX)S&Bl7?gAdGhNQkK+FI9aX#dt|PhsJ+(Ugy0XaKgAXYjT3G`&pj`fldBK?a}MTB6MGh zYdoee{sFo^M;&;HVe6vs=%9gh1eV!=qg|Vuxc1ju$y9) ziW3edBp2+D+t(!JwR(u9{ZxuAt*<=u^OsE=RrFgGMVngyp_69tkE>4 zo$Sr2^Q*%X@7`rn!tyaDB?_*kl7V=K=7sK|Gr^~A$NUR*^Ddq5>4*3s+n){}FTD|Y z|4B>P{YN?BG_Zl2>1S{Z22e)$9m3EXQPk#~BJShom{Brgjwc)9Os>kjjI#|%)n7U% z3;euz$iW!(73^FzgpwIyU%Nkz)3l}e)obc>ypELDlxkF;m}g6K)Ib4jwUjL4x z+Tmt(H*Ib|Ms#6W$FIVdR2>dh2O@(?-grCr&)Z<{!G0lEs-P8YRtLUV z!*1J7Rm(Sy2Ro}hvL7&{)>%kaj_`SP*z%Ab0I}OW@U_{$)!P4}YE7%%bQh4@c(Dt( zT-XJ^vPPlVcY!NMb^9H6DllwMmP5WDeNd|N`~HySn3-qoOVbN0HVJee`0b6*$=`^X z3mc1LzbpgEi!mlNo&MNhy`cLZO?f8G>-jr(TJ__%i$>XM_izyfC7Bbnpl%%d)xc0k z(9}|IIspn3^_H;9l}Bww$(M288n80bq1!?9gJ>3pIAdfNKzO(O45~6Z5E}kMc3HjJ z+o6nkbM}YzZJA$12m+3jL-^^XGUAy(c8kY2|95Mf0{6d|5+JE(+BI zExy5B;LRYlfzciuv?CT^EJ7e<;~Lhw3q?SSUYw|rKiOH!l@MbmE}WNlzx-*`QLjiQqR6MENpN*CL48% zH-opR{e~7VETBU&Rulr5jZa`OxhB@N37tk4BdzL6p-rNfrhbf#@iiY|8Gfo&FPA0+ ze@ni24FIkw!#DIcT+EEDwhv&m8>XwC;9Fi(ZS%cva($IjieE_-vtx_nDq%Z%!w0^e zvlmFwZkW?l*18vA-6Y$5^<<;{2h*?RS-w(#@$LByOtIL~XOXB<_KXzIvz)vLPLO;1 z5p;9J8j!S=(J(9ifd18VgT!|5SZ?cjNu4K0Jh^0aX}iW`-)=Xs4Wk z@Wmc&7+no)Ts@W&+G<-O7>aanPaspS9Xo%Ke7{z+GWE_QYc_Idualu+P#WX#`0DDF zq$za~Ki+;O8T&DfFp|u<4(lWRXRO(N!LLw9$bI+2c}hy@XzKjq`L;ya0-|}ezWkN< z_wU2lK3cPVGNlVS^-bgDcV1Lu)c60YcMP*Kt*=k|ST`e>ZF0yt`WIK1&(DaAazhSa z+inBC^ALsjhHkKnGqRx|;~CP^zt==&bxWVSEJxh3JL{e~HyCR)I>+a}kE0r{yCkm^ zg&45{m6bXRIWfFDr8+nsP{}IYYP88;HNfv(-(gIf>k-WXaSv+2)f+>R0~G)`U_~cWYTqLJh=*K=H9~BoM%fkjIz!PDQqf?vV9imK*wI72ZH@% zbnG>kd<}}!v4EvlHXlbj$~|%?La&JH1GEgSUI06dG1`IHmgg=Ir(UuE29Z5umh+@~ z^d7WUt<-;X;e7be;EZ#A92fkG@ugqDWox#jX}Xa6zh)-TNNxYGEC&#pnTDi~Ed&3| zOw?DgyYD~!mbDwTX0zNrfL*{b;k4Fe-JUJUa+i3ntDQ{P~ z4;L*$5OFasiiLfJ`;533C7vt=-0qI>DwDnWK<#Md4avewO+bkDe@D~)r`7HM$Dc@S zL4Fs=ybY0pnghGQFSs!>4q3}aFA(Xr+TM2w_f6p!Y`KtX14W%0i78V>|>cGOB8@ur?(Cjvo`1>Xp72dz{2D?aEN7Ln{IQ0 
zg7UF3b6no;ZjF7UZM=%t_od07ck%p2Xb0VA}mhQMod%VfWoEc>sgcwTtAN|(|Mo)$M z&cW9ZCnAvY3V(VsV7owdUJK*VkLU1pZw3dXUMM|Y9NGH(bQkzmKO2QMF;b!xGQ`#x z?4L=5ItizH|88(sCr*eO2a+J*Ld?df!cBK@9YfET%j6+Fd;b z3*=}#HKgy;T}{;$rNheHME*JU3qi$hc#<_+H=24Lmx=>Wy7c4Jg$Alk$%GE4s$oLY zs-J{Xa~hb9oEhIts*yJlzVzzSl?kTfBp@;p8d}n5QyA*2cLyF4Q#1zi0-s8SbUHTjCkrgr6u(;wXco5 z&!u1F+G)4;CZ1ugOifj$<3c7E;yymT6Riv<2hF)JHm7czU`HeR(=bYsUyjFdMk<;6 zx*a%3JgYiRd1->dm%%2A65Mz$6q92(wQrkYGm-eX< zw?dTgwQV_k$^Damvd`44eZ$UYGqU@t+$h^KY%fC&dU+&BAfxih#8ekSDk^a?QuOHS z-|q<9cPB1{YkXArah(5Rtf#_Zm9S3MsG}CW1q#qxE`+%-x=ov?IWb12&zJ2qW;^|h z?2rC%_$Z^MEVm!1T_!CaVU0q49sT93)a`hvkUY%VB_XACx+&@H+bN+rN#2upOEW*1Xr@+GmbHNqY51OJG=>N=xzW~T~+EDj3^5*1>8e&i}xyDK%Yj6|k~VA}tVFo~bA$iw91Kbc0LH+jzVDV_ACrt%)DJX+?-)oW~ldWEY3_!io3N4sB9U zWfWJwV*xsq7E*Z*Gfz&pIca3eH^4^M&M}68AnZ(Go#;+=usWw7lG=bH3!=oMe@GR6 zAOvok7QC1IGLq~?R43}MrWqSr&4_IHBI{eaIY;8*eTCHQof?NiJ3Yb-(QR(&1Q*sd zXKCP3xc+?G;BR*U{}E7$K-aPBnL{QJsV!)-$hL4YYx)Ly1<}0rbE&{pYiRq7 z{YYVIo7Mcg@D5-TJyS+1qiYd5BjIEOU4cN&z+GvY!};u_uM~t921D}Ea?3zVP=3qW zqty|y4&YD2Mh*l9UWi%wMbv9rC_}`R4yE=vUmpS#LDbZpnt`8cPhxh0W=rM^RYT`R z-|6@Hz=?r4FEl3@vvIk0C#T)Bnn?87nAM5PzW%~qW%knP0G9*&9=7X3JfQWeVc!w` zB1DRjIg}WDGZrk>@1l&JH+c;m_H*R%IQ7fdJ)bACOMNWL038RN&omw4J3Zn#y~tz{ zhJ1Z8*Opgq8P2;h1s(fE^Gz?}T6yZ%t|-(FwLi`XO(GzC7X+x)*+I{I#VuPMXJ+QA zKadkmnC{RAK0V6L@_~Os%vLbiOUcMtwthWVfp+=Da3fg?_Zd~cYXg@LRjDi3aMZK# zqzYuPQaD-rLZYD%5$RM16+ZWOrzg6Ig2@zIDjyNaxjYroqpK^fkv@{Q;BjB5H2kdK zJ%Fqg(_`( z$yET?DPGM}cCdg4{3ibG-Jc{WtCMQl=3mE5l^qZ-Pi|SW)nMP@s1`M`y8xS!mHT3i zbl8){e#eDem5=rv8~(mmf;1mh#;Ls+q;j~A4MTrpTncUNh+iPt-?sTbT*pSNs+DF zFY^|B<_2FjJYo{|@mNlS$-k7z=!iZBfbu=i{WN)zR&ua_3;ddVdFYJwa^3;o5~&z- z$>K}fcU? zq@(Q!{B^?rI^lnv@c$QWRm;e#+|>swKG{^IeyY&^_4tZ+Yx8(fK)s8Frane51XdoI_EEtn`?Ngu9rQ_d9Ltwy(3XVt*i2VN$mawl<7iZ zf)NSu1lblhdxL}+6DVy^z8V9Y`h{i#gRJx%aVcf%*x~19;xfe!C47f7PJcdbHe{6Z z2Cdt{#u#-)VpZt-!DK42OPXn?u!$)B%|&AZJ5b>_#cHK)Zl$R+cr|s>%r5+pEsi0LPO|P$z4bL4GLs9Bpi`!fvjA=b+-K6-T-=Bi7*V*B^Oa*v(2cNdk5f*c;sO-c=2 zr-rVK2EA898N9d^LNl%_JQvlwv{N|z{qvu&b@cU?AtGU^Zx#qzb*F7pYC4=%yIP2x zSeU=7z|yqN3GdrH`AgZJqCoV*#q+$EvVOxuHV=NDJ6ZZ9(*&9|M^$(+dP`TA-yBDo`f})@)BhzZS5izZ0J&#=i9Enzu z>Q?5tq&s6K{&Dv1p(-2-7{qIptgudCt&A4<8qxK~=oWq-A=}XsSnlrjm)XAH%!$)m zk*!PD!u@DSe?8X(T-yce34K-xI7x1GH+E^WnD>m zo!1%7->C^lkrldnHRQz}I$-KU?@r&y*&b^JgzxjJ5Fn3qII4%~% zM+3~Zh0yA|gq;>40;M2ZRX^#r^MN3D+WYlWPqXrG+rJO`acD-J=`O8@E*_bN>&?!6k4Kjh=V2W!nE4yt8>SN*vjQ>l;Z#!tjzaVcle0r==?; z@WsuF&a&ZWzE93?Pz}t64G(I(`4o66<;pd2Yjct+5W4jxS!YL!98$bkfn;`F_#TqS z+}eneFh%+md$ z9<&I?4n&h{5>SR<w~1@JXlhN~zx1ODAu(sX5#1|^NY zbUym=_Y8UL$n08oz7_lO^_g#W5s4Yq6%yLTNo%QaA-)pflvX~?^^QZp2tOSQR+V5U z84@7k867tD8_Dur+}*_)oJH%cG^c-l`$56P4d$qwQeh^Vd-2ZZi72O?D!e`fg)Ag) z^zR};mW^9U}JSF*y3fb1qKFY!R2Nul=VZd$wSfGVZ;P`!x%yf({IyS$ZF~7SOtvI z1Vie(o`_b**3n(yp@Pr*+J^Es3Crp4;D^E#`1dKm3&5jL0xax7r@Qi_gu`m~;@64` zMA=!&_iVbIW3A-BB7SdP)MWi)?!A&`-Yc)P0gMN@O}juQ_6NlgIooCTFxNZY$ld7r zcj^62>)4T}s+OM{^zCOv2G0!5&D?0`2`}PF-gpEyEfS(}>^cZ(x;}W}1^vWSRkcj? zQRxn|G3LzLam}8zl>~j+i@(KWn2gz+$VlPu3>olB3HZgHYEoSC-7zkn^~Fu{%q58m zr<^FWDUVv$$M*LISw7tm`~)EMce1d1bGuFAzR-&k#P7{5SCfRATL5aI`<%o5>Y|G# z{N=-vfrsC{&u;PcnPbzp0jkI&=>#F2^x5tn9IS=``yL|A@%~ri>qc68flpIwM!&#i z0nvmwU+KdYvX`&8nJ(;ufY)yEOD{T_Aq(DqiqQvQ)pXNlTHuTQ4=RG6E&|!^23spC2|bT+Om~ zmu`K6%@X^GNd~T~SBfz#jUc>w0aU*@wy?3-&zFlD2#`rK?L@iW18$imHQw1g{S?t! 
z4MRhP(l{lw!-~a+p#VBlRUN=W-1B9c{U!Tvcwc{e2m3!MvPpJ-9v-~BIxE(9_F5(E zE8d6}lwhPrCAiS~?vV0#Zaa?sg3QtJ?oquh%g;U6a)l>P#~IE4kZndiE;k4<%3 zgKke7=_AB!+4Ue22%Jp_z(2HpM6mVf@|$XYJ(UKPw+YK>tj3;O{D;lh^r)_T*qg8m zNTT-BHEHP`YB#&2k<{S9-jcAi6^o5XXJL2o`kd5r?LADMKWCyYMBj`ymZG`t-9i|4 z6pcU$gt?Mf+bD&cVB>+wb~_)BZQU$KTN6aH?x~-Xwzdo3UtMDPWX(FgYC*$M*dUYm z?lgobzSR74U}9zO?I1txd_Gl!Q?iC-o{3U3b^ve1WTyh)18;=zJn8q2D;b-=m=8g( zy?Kf|E zg@&m>?XS1yPbr$byu+{SaL-aQDC$4NZv`2VUMNUEv3(w_AqCbzrQ(>8RIzVGxcj2R znL-bIHXm=%!=97+S;oa4UaZ1zwmSidZBu$A_=0+~uaFyKNZp2Fg$6%MOMAZIh%dgf zbZ^WZ!4bW1DNykK{ZI3aj zVYpRY(`1NARiCe<@vchNNssFjz$H*=u}}bG0bU>%N`ZqNEqLy!rhK7SuSeHH!>7)B zq*jXA#2ItJl`o4jY2^XyYct+Z9&t^>QS)$J@Ak+?9GaTum*`!F^Akf;+y%n(o7Yk` z-9NPD>dCI#`I1KA>R2d?H1AX&M8Qr_*_`q_HXWT@EEbEGcCrqzv<(jYq*sO~)dpy? zEs%b7$nayQb^&R|0XmHOET)>S3L+}$R_&9)Hg&Cy<0pp#GwaNqixb|N za~L00r?@7!pCZ-7xyaC3;^+t*CBWZ&>>0k1x>JFB>hdnZAz-2iQ)h%JPGs=*15q~% z8@zkfB`{v#u@Wdl$5Ok`QCVA_HxBA1N4A?KWobSwGD{$PSc|e+Ukm$qxCfpMA|UK- z=~Yk031DE0=}`v6=te7+WrY7z|CR!uc)L`uu!(ALQnFQLi0jO<(1gclhbTlMwAk^g zVM-gBR7B?pH$?B~=47+~yN40UgKf~h30MbqfGCbvz=VPB*B7mlkgQEs zakGsEO}%#`yowrBFB$)fqV|z34Ex_M-a2#a3jjClz(I%=R@||3~jQ(+i)TGsxk{Vg)bh6Q7Xzm*E5-btv8${Pg2s02v9x0MnS;&%y@J z>qg)!E_?j^m9W{@_V8(7c}RPfmkAWRH#c(u1LCY6#XALyz=+dCdaMA<5dv+h?dem+ z>67(#zKTRmlOHGj-r4GeWJbR@a$?iO!B`j13l8=`)to5k-^Wm+TCUyIN%gl%`muhZ z{h(KEX1oUevn?AQir;I-`hu_}Vm%H5(?aQ6BSH)Gb5QM0*T^|BdFpv1yR-`wn*cI|`+|2x25b~qm?aCMq9ROt z38G*;RQ7cv`9WeAp5rymtM19S`UQutAs&A=$I3sQY_oZo{k6PjG3qP}aE0*>DS+{% z#gnGv8FEm_21SxALv0*}-z7rD!Et`?RE3_Tw2FVqe90qyOjKt{khCrh@H$dGlkbk3-(RLI*zrz*yQkrSLA=7sred z8N~wrtq_)AEIkhCAVueLQG?IkxY(%ly8BF+$eRTGR7L8Cq~&*L1>#4RcH%>bzw>v2;K#3ThZON}Zkz|-(*za?UEhoXXojSX3M$b){o>17 zXH{$Sb4zhL$6NO8KY8tGo0970ldIR0fMM|F-WXEQzE+ZSF&d(SJG2`wK<&+aIc{|2 znul&{exjU>jiQFwy0EF|zLkr|JL7IP!U)p%J$h) z5u4EkCZsYM05bB6XtJwLra#Q^^HwmSic!A{PBUSuJVU_Xw}0k z=3gVCU{W*RjZ;cbPi{NNlh2y279Vgh&OMhhMR%sUl$G{y!* zc#ooa3v%q9iBN$@+hk@<+j@mtT+W1lWl=~M&;7@3j@7|J?Kq+8Y!JD3o2x#mqI=IBv z(swnig}S-V5-sLBzR7E^)b`6~z7p1m?eIOO6jcc$`lHlnJddgM+x;O}Rzll!00w=i zMkFrFIPh7Ule=;q7fkxff8eJWHxRN3$tO_;U@kTNJQ!Trw9{L*VMD#|VxJv27wd3N z%BDrAo9DTi0-v!cj$@B)(WY0$Xl!J*Lt?nu1 z6i<}9-2LOQ<+rdg>^K@~1f)8VHtL{red*>l?VtUel3~i4>Q=8+KOk4}c2K-Ame1Tx z$b9i}$w9!NFy7@LjV+lW2IBe_W75ed1D%QD3+&-0XtpOoCC+AQEHyf(%X zrF0Bf7P)P|OoRoZ0RQF%_S&(r4TO|NhQsiII|?!a$a}{^eU4YgZkqI@cqqehe&}?f zAcJiojGss@gGx%ud9cvt_EGa+?+%M(8ILiM-JMw+baYdIfmJR{P(18-iS3;9_93Jiq zQS5n(&e^_)(=B*CE4`HmLG`wsa$39`@1OAgR29y>hWeVpG_vYX8>F~)pyOw~lTjiV zfk}pyaDP6Zfe`A`%zomBnqGVj*(x2058NlBo1+e7R*BYL zGN?PS&LXq(584t}9@9oB-u(plGD8tUTP+9h%5&eXp4Q0ManBUFgE`6zbxyuNxeKUj znVg+3%^-bIDH6gQg*1t0RonJpGU-q;M6zqCF4?7}tA(k=;cc~vIORfE3ice=n9&v6 znuW2C==9UY_QPaN6m zKzxJNc7e8&(^C@_LF^7|S1s@NEFFCQ=jEiFyRuuH*{H@babb^-PtQF3EyG>|slXMb z*eNU$WcQ?UI=!Zzt$1c09=w=9ANLQCi8Jk#Ln{Ncr>7i^d*1#-P5n3Oz;93vSBh@L z@na6{iB?vGiQurxLkDjWwVNsrR$}^idsXQ}!iPgvxfcIXRjec-Js0wgovOpWpP(LP zysGxiliVt;iN755E5Get)SKocYhCAuSO1*g$9{E>GS;`6N#2VQU5v~)N(N)H7bEDY z8pVPG+PB}!L?&$8dP1;!u3tuTJNg)N)J+`A9_oVW(3-OAMvCqg>9XVzwoB&1=@bis z-kZruZ;cAsut|v_E28>?A)zq+HwsX*moQW=*iaFxFzPzgFN%Zwi(VH|THwfPkkG6c zYWV3|Zb`srcMsPC7f;X4hpGbYqpZaLEH!PQt|c3tsKgu^H@f*eOm6$^wE8X4V25_S zgN5I8H-iHA`zyM!y{kSGuA(n_2ilXfws9IsV$ zS-LWE4ZKOK@+{|1_stuOvJj$O^l9iGLaekHjhI5`yiJ-9a;O&(X_X=e84PUlHMOYJ zzJWz~GP43HF~-uk?L*MkjxyQ+-4CjHA@CKeiPy1d7(e^mV;4xgXSw82lPm7gUl*$R z8Y<4an?t;DnEJdZc6>!3*#BL?x45OKBHS29Ws){l^L2v zA5jd?>qF$l%3=;rU|?0tYMR4Oh(Y{vPg7nf^UED^5M+hjla$uu(f~f=S)oN(LbIT( z^^C+bG$DZqR6Rj;-iCIltnLDNa+B{p!@3p0Nr?#MhoX-sRBMz}hJZrBJu8N4x{8Ff z57Js37WN&xpIVkwHRWvA+sp`>X>9iOFtJq?OT{$(LrfpoaE@4W*UT0!BGVbH7?tdP 
zS8hC?AC9jnc0z3xU4M(FYIF@xS+9j^{E9|(&dG`FC?9ysIL5BV6{Fci>tV1Hny0F@ z1t$x;gJeX`IqST<_u+vI;;{bDdSi+B(*OzWiA=@u;f*kQZM5{Bwdh$w_fcD--OF*x z*zf)KUBdX}Js+QhBG+?P>wc%t!5yk3fn#fO;;He*AHnb>Lm3j$1mx(2x5M5gZ2}@Xa_76H54o! z%t)v_t~biPpQ+pRac9YxXJjJ|%v?m?#j&-YfD{!p_ZmI4`kO}ARI3mq7wRrz1|;x& z&TsqD#(2xk_eF5k0+Zm;MJqncNys^^kqm3;6j?w_=~V3dQKPL&S%|!wI;HoqykoOl zrIs0zfr}7&JNK}A+=EsYR{x!n-f=XDoHEN(ez4MQqJ1>ziEN_kl1Z-eMOM$ZkGrL~ z0fD`!6Ly}%zv=<&ZJ{4*qs#~4P*YjDpwkR0K+

z7l(=RCuMO8-2aITdx{aMA&ZLpREMSeS2w15purz3EVkPKPU(k@+Hy z1617Xjd+*(%<{H(v167J{=SHu(1&VsR8r;H!$prcfUccPm#Yb47JW&*_xY{e4z+oRL9`TxmD>X;5@cRDLCPMh5gM{D@UqnO z#LAwnoRol}LeKF_?d6>bsU=K0sbVp2Di?0ViuLUor-Vh*o}fEOP71B13)&&wS_m1Z z&bAAkzuH*t7wl7T5UUn*Oa1uV<0KjTtK7T;GEUKCH5waTWeq$F_3f|`7;}sc2Bb*c z^&dj|ZM)9UiM2oG7!8np+ER05)mRb<5wsG>^~c5j1m`I6o%2?Z>n~h!{c@i3*4r8* zzRIgD9QA!^f(@-V2oZ`hBog37H-Kb$yFO%NOB#wREM^1RQE-%uK@JM7uQSq4fC(~ZtD_s|7oA+ z$^A(tZ)4BRp7-=PUspTA^#(SL9YYJy{i!!eORs4N@>;hA6c7G+m*ZEq_Ge@aD2Jbu zJ}z$>Vn3N&Yoo+*snX3SXgpDw82Jyu}~0{-tL-xSNH5xWO&iM-qxvmm9Gvr zQ}60Eu*e^|ybt21BT&=B8l|wm3f>b@(+jOAV%PV_$cwtR54Li1^y)}iZ}%tZ?&lGD z$n>-T!3?k-gLJe9|NO@@G@EsPd-KJSUI5t? zx+mAbeyy?3L&X9l*PP9FnY6tB%P}a=GR61+3uHdA4DwcfIM?{)c@e@4giLZij3Ic1 z;wl+t0j;{jQ`%1Khbn)m@-IZvlgG3nzkjo(AFTn(c$PC z6MW3GWH}=0o??|)md)F>RGYL9V`qBa<$JkczaN1u)<8wBk^1&IP>+O7glfa7b}5S5 zk@;n5&3LFdyqlY^Vh@>5%;$a}cm;@k1ZB~ zp28{}yq<@Qij3v2e>0Y}T4|sqQ`q{^u^M6DDRy0IR^)fJo14Nh<^=xuEWx?@UxUBtaH+}t+B7=vhvD^wZAwJVX$#4S~OTTnqWzfBz6PuZx6mxL#+ z$*qLU)*Uwb1%&n^AZmsZ#asrxsq38tpou8<6|0T6aCUdCU&lO_61Oq&o)M6{XX2aw z@GD0#kmg`)kUncg$w|d?(PgHvGW4TR=?(4zPUCq(@u%t*Jc7f8T+Yjz-l%(V5HkCr z7M07eF^uk>Z{bZH>0Z#Gi9`p{b*lWf&WLPAu#CG=-jWx&rdBQ;64jVnXF)G;+yL0} zumfmmyfQtK`k<`_x|af#^uzeEChywjhKUEBdgoZUU!Qz$GIRX224`dKk~RWhIMJvd`Sq+uMiCF>nC|t2h%-2l z+z^E&2#EAVR5&b~T4>^Wn)m;tIt`{$Jr?F1nu~tsUxyTxB1`~nB#|KwktV&pZDvkH zG5-ji%$N0}D}&+FH7~g#{zoMJav3OSBLO^H!LTBDm-gHiZZoKwz_TNh5YnE3-2M|$4m(xYBlI&M8+F~17TUEI9nsP=e0tp>=IqPy#?QHV zvux&~LxWs5^jR|9*#bw$4*kn^+D~&KJfR<+G`jvQOlAAR)XJ>`O-?wT9%vtio}PK2 zhtvMch^-JFbt)uHS3=3= zK3~`2BY*DQs5CL*2ChKF6oT4*Vk2ZU-({%m*%ch1dlzJb*JJELAP=dk-ZM2s81kJa z-BqCAEkAM7mCA zwD(yLcJ#3WQcA0aH1B>KGlSO;#|!b=gA*tjf7_eK1S5`SWZ?TFtF#W9{dg0TB@{UT zKvb3h4}*A(66jP6e!m3qn)qtJ2ubnM>3DC5*I4ZNrYV;WN!hRbiO#@M_Z*Zjjw2Oqj)nB0FRcxO^L>V>%l_)$NLVB1zuc~M0*wWYGece_G9 zM^A#0O}V*uxm8*C%W-+zghOkG^f)e=ebTybEPM3}#7m*(hW{Kf#c)7f8%fYA+)tfO zhg7I7h?$Bn)%lG>ePX@s-*x#lx##ZNv)MQRo?H+$jWL!KS?S%gzWe6gDXfJwB;$!r~#9%2H{gj=I+2jD}nb9O<=_ z#qGHfiL(3+CkN*9c+P$FBB2tqw~cp_&%8tiUt*euj8lmGcUZx{Nh?CeDMbDYpZkgT zI7aryA1lLpitS(>L2G+X}@CKOEP-a_$(D!b;9%(L7?mgWibSXh6*dfJl z01rtk0-J+%4_Qb*kIC?WO?yIA6jbFKg6GIqkS3T!*P>8j>}Zz7*(0>n~xCOOyh~-Zw)HD=?=n zsG(5VpU`BN$YA!v0ZsE-owJ7?wlurw<_rtx)CVN2GWW>I=9!-QcV>zmbKUFL5u@pK z4@;20Kr$3%e_?X4Y5io4_M+mv;(nj|7FSa{4l0Ds%bsKs{*q-4Z-#OV4be;FKq0^! 
z3gQk5=>je~zpq++Fmk+}xHSJDmoH&oWsjVp+3^?q-DM(1Sqh_3kc`+v+~cKRQt0fr zNyGofOMk2xytN!^vp1;Vih{gDbFtT%VgW^N!0;j@rJe2Hs2jsF%|SFa5F>yqeZ9gkxFFjAvC=oNJyGRr z?R{VPk4!Vsj>~M=DWWu1na)Iwi>;v_gkS+<^?Fk0R_&{X@w-D)bwX;d?`oLs0{Rky ztFxjhpE4?8-*9{w2MS^zMvKOl+OAKkSmXO{a5nS$bBC|O!^z1rV{+fGmY$Frf{1D5 z&CL^%Ot3-h5D^lLX=S91bQ&9Eb#R+@7<~Sf-H%Ge)I%bXyvpecxyn0I#+Sv8eM#Lb z^9j-Dy*qIR?4_wgd#IpXi02$_mx$QdLMN!7DKM!MA9`OQ>oY0Zs(df}>atLWQpsUB z_Q0M5MGBfVgq*R?U4R<{pQ2lk(Lx>det5NJsAHwH!9q^6Zm(nIqPfKF>d*I$zdVOL z|8r19gXkc}3WR-!9Mhu~ltc9`8ykf)wsN*%Pa@}_8s9X;W>$p5;H5$oHxSZll^nct zf*hQaR&B)h-0^#;0dquSfkbB4{Ha@0lnZV{#!SXv-YTR7LWY0T&OMYa#tdAiE^YQ9 z1yLN3*TvOj5#SPulF~ZQAODZvKk1dG2fx|qts4EFAF=-;Tn}O?d)9B^##5-*VaFdz zAthmk-Lqk>wX{bIK{G*5ccw-~`{XV&+*hIC{pl)0p92uHdVAf#F$?;TX__8Sgxnb9 zw*`%f*%8}`N(@o`v@uQjpnK}1xr4Ojn4oiQ3pzXl-n3JUR|xATBRki4p)G|_-|8+P z^}Y4XwWjJ=cf#;X$TnTOYj>mWAH)B+bj(3O2gMUM-2)+t2nbnF>QuGOnBd1=y@&SL z>6r~peyl0PPD!i~J>`M{&}O}1f+tkV3E_?G!R9ZN9Ef+2Q}TG8H-~5}y95T6$hHKm z%b$7pHfreAiFWtmBBn)X5IzL|eAg%qI&6yAvF;lXu9ViFzi~3dsqr<2pe}uxH-JdE z4Iht&sjptyV-Fq9@b0{tE+WJpdLZ|A0DgqfBZJ;e}BN(-!4a-p!C`%ay|%+rqKvPFs&AELD-u1-!T5)C5)jPi}SZafZsQs(P~y+`I1sY zM$S`R@>)}7+sWABYv}8*j;H6swoePXcSp@1dBfH|`d>Ux!Mf=x;5@Zt5&QW0GLge5 zs0@5CZr<+CIdgT>O|0YH_)D&6o)cUzq}YpiD1V=!_0Kc3;8|mkMW8v2lJ!S(H3o@& zZNX}eu9m!-)D8iPONcl$0?p6S!}I^6r}{xPUwGoV8{|G#?O&KAQMKsR_pd+ogb zFZSL%9_qf|AD@Y`lqJa$naZ9-_K=|xl2F-Wie$@L7>pT_rR)l&LZJ}Zx5(I+B-t~z znL)D8SjLB0{65|1ocnv7bD#4)=en=^T#tU=$Nld-jQPxa-mm@ndOnNx0jb(#c-t8V zBJYRgDv3OfFIfjJxH~v9-QYO!pqAU}8#kM#RHGQv#6Jmd{)O;HQ>Xz3Mav*9C|Te# zZ@2%{o1#LbY75Q>H&@S{oI^eHpeB-Y|BIGd<}16U9<_h5H=daI|3UaisxHP8GjLfy zA=%>XOJ;a?kZ$o~eTA6%dFue9(vfgMHG^B9B0J2*_=NE!{a-)kK}=Xe(E}Nksv&Sk zHce$%oTe&~!xDI5r(8w3=B1hCae}@JWs2J6RrNs_h z3<>=}gMO`ew%)@IoNG-NuM9N`Gl`Qbyzwlk+)ZZd0j53*IX4k<#)fj2UXNyD^r6KO z%t-5=mO#CzbU{hO!{;DKomwpFJF>+-}wW%mVJ>^*zO`A-K z%|y7JkJXykp1!cD=6JPO1gj{uod#L_@sA8(B+p1J58Nf#jq-~86YwrXlj>hD3Rl1I zq73kF@h%9p8QDC)D^-Cvp*Wz^?7Z}hl-M*MWHaqI2lsoJEVV4?uQ2#mSaysP?4c@b zAIb*5!sluoTanH0uJ!U@*;#Y5u}nb;^Nz#FOirP*KaZNxjeEIj?aJUIPsATc&>B<@ zFE5eIyD7W~Y)fZy71yy%O|sC8NQIHHwB5wLMsDW@zJ4s6DdUNcINlHnH8bK?!Y}yZW zv@*RbN?De_w#vpVlJ)9{$ms8jj!-TY-!`VI+&kCfP#kb`743)W7 z;{&nJj&pk--yeeP2FW^`Fb}P{+T9kgV56o;*a^t2rCD=eY0KPWslE$>D{?A_1LE%#_$sC&K-TvRJiDk zLy&E()nLKB!y;7MftmeQb3=XOWBQm-sev+B14ulr!swo?8g52_+g8kzT$%^)@qFLx zY+TA*A77t8WpiSGvrt>4#OY_H*4LQg)}iYg5MJmScBzwU0KB8v07ls~Z4x>>dw%#s zWK?Ev`eA*2d&}&~nkU7pBbOH&vfj%ZL0lMZSQt&0s*gGlP4tg0kCnUlG5F$g=d0p1 z=A*}CBIr}6dNkLxMb44+80;0)Gf1u$RkSdvEt$wA*m>*wX}R)&M>2c~Jc3uOEEVzK zN|nj6)<`;3IaQL>RfV=KA$L#QkOekr(`c`vY%h&LWMQQ@vN zYS!(%#2v+VN?r?vl&G%aWypp}5s6YY~ta@(`-cOV*v zAoFM;^e0&9$|LyA!;(bF_&U)ybL#H^^C^>R^0Dih{QIxVJuAK9?|wHSQ*@Wm{jf9$ zDFYC@T~vTW_gR05zqO{4k&85+Td_CSvRqx3{p?dbmwfv;m#1>%U2WJE#<$6GFe$uC z8YV7fG{xV?u-zDx-QduYJ==4os&CoXFgjT*JIvl*%}-x|yRrUC%*BXS2xR3cy1lRAJEO&mPuTEBb*C!yOC-z zitEX9vA{XzP#)j1)Ri5m(m_R8t`*NP?ndAac5uXSb48%!s1BIM3tDQHaYugp84q;yy`Kh#H3iF<}p=omu5(krhUFe@o@G+OmT&azuul# zSrzx6ol1EmJ*5_>&FMA&StBo#D;uoEqY_cU;IA0~@^n}<)w~xg8B(IkP@}HhcFuf6 zk*jE;1`1T_!-VZs=Q$uKJ&@vc6HAYKer320PQB=&Zn%*4r1*VwP|>b?gOM&=B1Avk zqYf`uZ`^0=E+>{QZQqgVUhPBGY}04V z;r!mS`u~j)#NDJT`P6)ZY-{gg`EF{Q((HMLpwjs0{Fku`9 z6GUk^yheUZ@+%2?RNhl$|wl2Du*nGTgCALCS`wO_VMYrI}Jr6l&gxF#B@3ui=2UNc-;#dVdYh zEW~!&K{(^`Bq(uEd=g}McpVD~mZM_OpD?8$5k~w7)`Ywgp?UNT(zsRSu9aSP#mnY4 z&+H`UA-VDs;#g6tC`xD$2ao&tIz0~F9&gEI5Jo6^zrst1 z1yn#Y21UfK!s12Pff5(eFeQVOTxw$wz!C9!>5WQ8o=dER%4agE;Hj^DE%dm*G|1t# z(Y{nhy*`TmB(l3-nM31zm?athP%>pwzq~r3fc2P}LeI%D#bcf^?pN4Oo)%7n05~cI z3)+5?B~31p5?)Js6U`DLdHl)(_2lTMZ+!u--{n-7j;Z+Yo))^Srx~h!#*HFO&&Rs; 
z;4FyyRMSPj*2&2=e^fA7+8Cwk^RVHcj%sK>Dm<02{J5)=^@6I86w`BH0lY%TfviH* z^;G=i;wyZY>}{l+WB*l?q~4Xc1Mj&LmfU#c%^v#IOpG>5jj!fTGch&8NTw?|1U{qX zIgFVxgyp;$(L``?OFeYg3emW?a`)#TgGcuSQ2Du?SD1FOW^(y~Um?qtPrYgpg?s95{M z>i5D7ss}2>o>d7+R8CY$*D`tOJ<=9p@`8jYK7!EZhcNiJ85I^SD^Wgk8M`!hJD)gT zBpwm%_S!Iivd1Iqu~w78nVyt<{v)@hynix#7lBJ&@$qn;LQSiwB9^L%I@7n#Bxl@U ze5#lsCD-vus%7>Taknk>f>@}~V^A!n2XzP7A{dWU;ayRe$k*lkj#O4u6!#Mouh>v) zxbK9s;k^XE;R*Y{aLQ|I?d}DmRS_TsZ|yR=+MdCV<{vR4#2$0eoaslw#Ro2CdaS1D z`K*3;b!Ebi>&O!1zBbH&(VS6*%BWy+)q;>KAK!e1oH8m?P3XL3v-lwfvg9Hf+bQ+K zO7-aN4vqT|?o1IuA{_kxOL8PG$|I0_#DS{qf)=j+s#IunbKp&XdEU{aS>wmcb~YKO zN>$_&PI6v+qISB0)D)Lkl{Co?5LUIq&BAPX*`iHt4m(Nl`w~ z+EjQRcZ`Z5=e?jC@rzTrG{+$Zk26TYM*46^L}qYc($LYtd1%3bDa1Z}pzMGY;@S4i z7e%D$kU-x*Iu-HB;6fRZs#^Kf!-ty+gPOK;UVfw%yK-e-AMFcYcj{4O=o!Chacr{<{h~|6N{xLly$n^e!hWDY#7dTpfFQqNmSS}zcKCMV%r zQrljAjDW?1`llF@9xQS~FS7m!?OJuD#Oms(cDc`KQ}%%VkYKj7Nd}O~R)2~ygsIw? zYy&^=B%n*OS361?U&FC2Dj8JpkcE}5o@vRbF%`QLw`Z{74(O(4wDw~fFQGM&GQ`%m zU=@IAGVgR3pH?{(UHKG;&UvHsbX2jusr2UHqebStv1}6SUj)n$JCJD{N*R=$?9O03 zU-`lGf*z{`Dnztw@wCNC>1_ z&(R0!FcE+7@td#diOvsUODGp0wX#%2dT!mn!cd^8M!5t& z7CcruRCih{Y53GW52|MM)dbe9s05DYBcPP+s|jIxX1UzM$+Fuy>+Kqj)e*;*opxFuF@%u(_2d9x$W zeo35Ijj6=!XjJ9##H3jP7Wi&L+d3_XX)8RH~Pyx+1p)w-4tEUwjQlo zT^6fGhjgf9Th1e=z*Z6qNNyS$%^1oMsq@AVCPzF8_fE}Ga&JF6`Zl&Df172*X? zi8EuUS4c_c+nNcSn-qKS9kI0YgZ1(@G*WU#TGRvci z>Zy3s!R!a;-|?&-3Dta=xICR?U=LM;YD2d8iK#SRIOQ=&vGHLFKfG9xZRtYTIK5tY z98r4iZVu{C=Gdz9iSV?nE{nZ2oc?#eXtR6;)f5mT2qGo~Wm|yq0AdMUE1bCNKUT8mR8+}N zJv>k&4vF8LpMO8U>&PDbUe68z!6UF^C<_=57)CsLc;ALy+K(Kp7qgX!@YpBBH|H5u z2A_j{ICHFx9M*m~m+1#48V&)e3Gya|X(vIcgFL>M?|+bO+fgY?-mkRH@0puTqEi3^50Ku>o7+{p2U<)s6hC4u&3Zn+)~K`x;Q&n`H+cYmjS3?yrd z9HfG~FpW~0;zZM2WECZ75K9ml6wm~972+mJg4iY>m$Dgmdp`cEvfc@~V;3TF4lKaR z!5u;tuRqPuFzPlw>4o?*J?P|5jJq=PMj#kA3 zQsy6w>C7~Z9Z0syH7NB=L08b8_+YIU_m`(Clkrz4w6k+*ph3qVNQP1OzzlE2;JNi` zc5S>q7=}N$@TuCFvt8ftl=>X({a71jh-tj#eoqk(kK})R0zSI;-9+o1%bLx1Zl04r zOj+S}8jH7vd?p3Cg7pKl?_e7J+2h6G(18-MEm7+>t>5?f~ zIiq9q+v+Zr>i1SJ2o_$!lqSn>{~mfyASS)*+;tzq!Mg)V*vfO!uRw0T>}GCc@9t_S8sSJG>#tSuOWuO z&HlAJ`1|Xbe=+nKC1*6BT)c8k9h^U$CH4?&L^;LHot}Ghs*7J79#r;boObW`=Kz0& z@S#s&8bKY<>7@;0>aIELw$xXa%sDLSA{Wl0%{shdxCIey$Ny+}W)FVS4itfE^L`mc zK9;T#5cU=fN1KyPE8&=ZI}rPfkid${7>~DoG!4f9B8MwO_o^>O%$q;y>>hp%QPwCm z=(SrflAt6^IM&^G6iX=sq5}x+(PV6=h+1~ocTh)O1rWsA)5*l$xx&~>>s;}H!8yD?SU*2N#oe=k_W zF5^O+WkEI-9qVduz@7^GF{8T66ixB|athQc@}pPQsS*)jv#?RMPKdxIc(!;UO&EQ+ z-*bwm;FUxKOg1$zNl3^$LH~?1)78JJVzd$Ef|uiEdS*}*{pN{8lnJkTwg(R z^xlVbiS38SE{K3yZ{H9kOE*I&;2?gK5^mI@C98kstd^r*S@j9%x`dJ>y%$_;ZQ9z9 z0v%mK`bl*^YZK3}G3((K0xej8a95ifd_2L$b=7(xFIT+<*JA3z-D1fxcc<+~{RD6_ zaT#Ptlw#_H;|qoYm5Y{)nQ`{#t5nf^#m`NpAhKTK)!H@#Thgqv#k|mEILSiL_Y^6f zAiQ6aW|+QASQa`X!B@f>Nnhfb1|suOin6Ai5?4gEeeNKoQxzRh>OBsEGIu$Lg0OGSPp8L-{bvL%D&uCS-a%U^NKIK!j zNOqK`HmGS>4qf9bB-gvGe4}au9O82>cQm~N3Rnm0XP#E*94cN{ynbDN@!~d^y}U!W zr`=AbYNDIfxgOE9hy{{^j*TGAhW&F#jPEgLVmkW@ok)+s@@+8FoTzdvJa`G$7 z&&KEbbRo>hah%)3B&qf&Wr)=`DM9CkX=)neGyM$~{}j|Ckeff08p{xH%^-y%^S36x zBQ?t;QUf%6tg>KA``X{>CC{Tu4V0kd;l-=utHk!jUaHwQ8r!hk>>=|!q9P&IUsrLf zvdeEZK-!z%q@eVU#_&pJyxj52Xj8wc#{w4V#G3-9^N>ugIynBF`3K?2(kK1% z(OkGCn$FwGw4T@|pX>4nj9B*zKNgU<3&UWWz&%DcWe71ZwsqbfYONN-!zt%4XrEz` z1|DmR$bg5DPEw7a*;4iA)r+FaYlA4DI=bY*z4dhYg@Ys2q2f=O+GY&I+J*U;Z-CTG z>w6jO;wpJ8i=Je`^<-J;suF`!1KPtBA{e(&d_9$*46!Ojx|m+JTu8H#TTINgvd#?> zACAcwK|2T=Q7{0_+#S16zr4rbi?KG>g<*{S3LwSOA{ULrv`$7%9O~cTKx-@rMaSy+ zo;kVa*m!~{@BKsrGhM;GkIO-=$>HB=F8}Rx6jY1^q^m&k|1!#hYT&M96W8t?i0Avz zwBi9c(b2_i`dRGTWN_t{r@dC|T86*c=obYA=z9EO4&IZOrc8fX$A&jPG=~|%eUHtF z4q8BHovzDMEg7v>x#ht+WBCZO0k(qTfmc*ku#nlqfZL%A=`YJu2i^IvRS&G6xoTWn 
zVX*iB3_W*GsP3yd{=P%s{DNI|rwLT9HRD?tWdm3sv;c)<8`DToc_fY~v=^+fmSPli zZkS&PIG5G#!UyZ3<9ND1K8uoxfNaXe9-6pMJ%edfFNcL@2=!m-Lzx!+@O(Y$^5IUR zv081Mu~Nys$6fNTP6a$T-4UE6p35{HKho0(Cz;A|076udExlLM<_!{I{g|q&5^Z_2 z_O`C6rprB@8yycLO(URQU{numU)+IEy%~e`5PFu9acD-lV{2Ou=eu_}=TwUNo0q#k zE@z1MA8%DK;E{IxTv9Xs(MoGsfddlwb_x8N%&TNfD@>$|+>93=e_?mDZuQon(nAle zPyJ)q_Mo+m%rm*+M5keR9bzy;8kwa%MM>^t0f_jt10kMF^mfa&o0ym{gX6^GtgaJho|G9x zURS(v3-M*-DH2__)-E{EXBy_&78_rwvddkqv~W&( zI71#}YbgPBb)MtDKje?%H!H2G8oak&8?pDPVi#L1ty-YuM)Z*h8g2Zt5d=B81TthW z&jXYAp6FqiAly+Vqg>Y;jVr8(E!?(3%&DGi`Owm*kezs|QtL*e%}sNryAbb5QVXUk ze(B7Vk*W#(m9I_g@wVQ5FkITuD|avVn_nz-T<*+%mppuSPeT`|U9`Ls4}*9_E;_wO z_h5I^;FKDCZ{fXhiHIJGAh{G6k=X>7DVaUOTYjaC%A&Ot2=~;1cn!-GnP^zC^H7|5a5uY&vW3V5kDx%H~c-HTAzI@W-m^++m zV^r7ww)|t&i2}B`rFzq3md(U>yExTOAK6O|W3Y8ooq!ZdGm)=AmKIE$YYhjV7?DANHcYhup% z%^r-45$2b8M(|9rz$9o`)4&p`i#Uq1rYFi}B~32%A?7G5RlSyWKbE(~d6Yhvba4tk z;b?f`9#f!Txod9`6X;Fs&?hd`3y>u~zH<(|H?5~k7`XRQ)>LnEqU%6u`U9yuo1jK- z1^uHICruydp{lBeXbJs*kES1C^cPpF>_F~BO#0~7e)d%G*PS}^#W)Az@UxJpX8}2R zSV%SqQbudWKFm$m;d-eI)6mJK%1xUkT;Gs9H8XJl)vMNcdMofoObBX*s?2BunKZ#X zaMZ~ds3A^9!fukD9Ga-|m&l@*+&p(AgKP=ipeLe!&_h;l@%i%vL4*^bJMzTr6_qe&GiN)D28&L{t~4}z<%5Q zsURk-srb{hUYZY^l=6FBS7`KEsz}kd?e`OqKU&Iq7##0Joxt2&8SSBfg}QGrN!+Q< zV$#L4$a(u^PMyEsXPsTnC&hNFOCKjZk>9c^J`^GffQ>NbS$Y{wFVAH{eV`I`mJE+t z!mS_@<|!pw8D2uJ+PA8oF=u^Z3lsQOdl6c3t(fJ<%90RO`3Ly@8+QW^MdKWbS1JsH z3KdQFKQxM6bgAD zj8L6>0ZI|L@!@Uw#u?XHlm3$y*8Q1#1jo9H^-lbl=dLk+rZ?6jngrcWBn1)N<3qG| z12zPaEH~jqmvoV^ivywpGD;ELC%J@gzRZS`jzee(7(fEPWUqM(r;bu*(P8wfG+Qu6 z`@Jb}RZkoWTON>ccXqVO|Be%=ZyHX{XpTM524mkSdQf29WiYrdxV{@?r>zRr-*@~I zm!OHx&r*GcKNNEd14u1rlv@Tb`ox=4cij}Z?>~1fF~4=R@kYBJFH>6ub_D$1aH`&pnzV$6`!EVT%<71vEhg>pK*(a)x>K*$4p$H=g`TGJ%1`J@lC=HcH)kYwlG)<)oXg=!B(6u30;Oq@L5K8!)(h^+E#_} zZXf zoMcAd3EE=aicrXo@zN1J4S}fvA{ao%zZuKs0_NZAT#yDzpz>>%?cmmqzS}`T$N@3u zGQqk^_Dl%EyFpMQM7?OFmcj(|=hM8(=-F5ht#`oY`hs6-@BG}GoKIup#=(4(N8cu$ znr;`?<<38EiC(gpF%5-oOa3vx`Crb5k*dw{k=XJtu~cIuh|IIo1Tx;bX0ZFZF-JCQ z!5)rvJ8J#V_d6$GNy-0`9qG=nf85Xh$FcKJ693W`l^bOVRFNew0ShFRAh&Lhp}vWU z1;eB{S*HUwHO?MKlFo`sGVdzrihZQx-U8t~i{JrMl$&+B*)Wb~g2274OZI}k%0?m+ zgvhg_6Sg6VkIQh;JkaD(7%I&JRh5enN40Lj^4FE1%j^GJ5A{EP{9ocPMc6})!MYs^XN?)KW@YhS#eEBAiw z)8p{h@6ApO%@4JD*uXUKl_eS7gDGEWLg0eV4QZYuJvaMUSN?V7%S%6x#fkB%rLp&~ zK4i%q+1P-9^S`41C2#QG-sWH8I{cr9fPPQsre~o!ffX>4G4>#BTIYhMj9D5fu{Oqh zWk((|9Xo9$c;2%l=b??8FhA2UL?rcBKRFg@(n@2-UKIaZS3U%Log(2SU)y(v)ynIy z?l(fE<5r)tmSb zdY^?k4A|e1$+e%JjJegr#OSm;G)z;(WA?jk*NGe%yqu?$R^D6N=Er;uA@T#A;IiHgD-ZE*6jg!}5T)k#spd#bQ~j|j`a2rfnhLW@wN5fRW~;9DMDLav zQtO)c`#$QUyxmW{F3Uq$YX8Yv@-OZS32)}57p7940O4T5u4T)?;8M=dU=4m^6CM5> z6{IstU8(uLmF_82c?TTIul(W>k;CtJ%%|`U=5T>diXgZ$DMQhxzwwR7fx=Zs$s!e` zXe$ykj;}%@Nmf}p?W)(ogH@ru7tV2vTFIYH+>Ap)Ho?*M-{Q*t8~618;v2YFftyrw z7t8^S^A02mQ&xx#QSMe)AkaUI8NM4!qy!b)9)Nm38M`(1OfvAD$f-xa{LwLsNu6kc z>@hf#%M=Fal1B-W?Jp%whqd-2C%Rfv)Xk<`Hj_rhqK>`Tx76||JM{EEsJU>cF2uk5 zR}S-kVpbxIiL5_KlP5dHRjfoz93s~z<^peT@M@%g?`h} zQDdg2d;dfmi@9Y~_|*`MxjU&=$g3B=Zd5DhiS`{ZLh7e<+0$Bd#iGl#@YxB|oO)-q z&rJN<#nqpdy)ZR04kFhmnZt1Bq9}02nM!KE*vk-b*+1L#VT^fev-|rIqwep;pp5G6 zCW)q30o{*R)la!K%p1FT1Ut=& z@DXENS)Veo>dv_;6qU3Rk$AmDpVNQO>FEfT1f5LI_y2Sff9WYPC4ZHx=Qo;?MfWsq zrt@L=&L2}>H|Mc=sSAbGs$R_aC>%VgNXq2Q|FzExTB;}9TYjbOeO!)}bDhi(z(sUqSKK#+Qi4@pi_k&T|LK$tEtOMbsVypWUUrhu><2MJSz zZ0z$@_B>6o{k%88ZS|YagA?aJ?Hc@79|Sfkj7VLScv8<26B*VH+s`;T z0&dH5ZtOs|4$|NU^l$ZJknGFmu~*LI3To-OZ;gFB7V@j-WYG6krjJg3ha@P*WL$)h zibjNpJ?$&e>72tG@wmPCS(;3@b+b)cmvQ;(?dT3&-*(e1&rn}b;7K|jSKdK1|lnw1*b z%|6b*{MiYKJ>2JhBIe+J!&7HM=AD^3u}*MY7{Ox6fG~~#w>{ZCHUtaSZO_kD9bi-`5rO(Gmh2NZ6){Q=o7jO$Oh+_cB>E 
z%8Iip7iZm;YN40AcAaC4)IPB46OhUI;n%hvXz7L0w%$`W4qr?vxQ5@}-=PH#L2M}2 zpxhk{-D!7l2$bifS`BY#1pAx0UkUJyj+#jnvVYxHZ+#4ZJkGm1$v~|EA~O7|S69F9 z2mZTn5|jXE#oWT~1(*5O$dXeNsYJTJ=|H_`$E}*;55CkhZ|2Sy@C(-q+*I7;UJ@yX zE)V>{SpjslXbGfU+|3V&Qni4%sfMP}D`d}y1z$$HWTo1Qt>CcQG=DX&uuYTGED<u%cyk^S-WzBV9Ac8cLi2;)%6m1_#%V6+=`!M835cn_nN7yWI`5g_rjt~P? zY-Nahg%mDq6)Nj{WFxZgB!=xkg4rQde}W|Fm?J@SJL;gz)D9#9w;uTB@jYt=uPZhI z7Av|ZoSu%|@(K)P?EocJzx;y?Q&2wnwKvo2i`O1FUHK#=%mWn^TuR*dOG_B`0PBii zMJn2s((~&Aa*$cqGptkGPr7lu?JPTMo&AMab!)mB@I zby@2Em{;Lp@9bPp+1eNcFSkFbqVW#5EiVpXOD1D*vh5CrxlhO9TJkE@v__P_m43F_ zfu#C?yotscsfTWv_#r>Rfm84fIN`gU)(#{~jqJ(b04Y?C_0m(LcOV%)RFL+~wp4C= zU(OCBjDL%TrrFg#x$Yz)3nC1`MXG6QcqtjPwv6z?ydUAG@3{iL98C*jh&WEIY%9j| zflDOcb|9;=un{D7Yg4WoK71#d5hjhnw=#5gAY2OMO}b!vG=_8sv&9#`ScjldQ^#Nn zmAX3+ohMilNF8zn{GFd6sf?JZRxF?br$y^g?Bp21UCUe>qsMk2nV%5ADNF*U`Ewn} zvUkCoI7t(NF?6%2a{N0Gt(i@h?WIJ_S4TOT0QUd#9mD8;9oQcVsf>MyX&7lu4uFI!Jda3iDk7Ghn{5PV&3l?w<+jmQ4@q6wa7n8OaFqa5Q1GOEpv z)q{BZe;(LB5A5Gt(LZn4KTYzVKJ1@P|DRFvKaG;ClccN0BDp8P#i3}@8RaX+GnAi+ zg&V`RYx&UB^@p{=K7R?${`I@S-SZUGp5YA{3>QtWug;j%WA*~V-x3&LXzmlp)~^Oz z(l06c=wtTkN_2&OIc+fBpXJ%jyR1Q%Wc3H)kp>26mcg{!Z{I9f2iF&7&GQXYR1*t^ zHdXoxN{oc|z01lw`Xo+=WlyQbRU=zIXMta*6h=xmcFEz0lT))TqZu)Ql@Gm1A6y=i zuJPr0NQP_nJQn9QWV~D1%BF#|1)vDZ@*m53IqwqTQOn@c=~}CZY?*Grx)I-|=|js~ z{NPY(m&WSr5V#na+C7Is>$s@8G%GUZk(RtwRTSZ?tiwkYe;-BhnVEs2XHKU-Km9iA zqIRO;#S}BsI+h_orC5VS2^8;1J8+J{3&AwQKnnJCZA}jFXLZIjg>V55D7zr$f{@Z6crRhZ7hwjpnBMrSG zYyirHkH>5}MASUXyOb+xDih*U&Lc@aZ&_H5dxx@uvxR0jNiT!nO$8%>@+56}2+vtZ zMsgON&@FvU#LcrNhAdtjR}*tk6a2CV0=IPj$8@uxrHW#A7KHT83|OnXU^+>A(Ag#| z;ph%*xqmd)#hAs7AwiC8e2E|oeu*zM%xQJWH4w{ii8ZxBIz1AtG333$8}nXgIzJ@n z^#9?mkj>$xq}wbe51@>vqN`!J*)%-g}M`@3O!5d_4z~ zDok468h^R_o`~9?Vwi&4E0OEQ2j-G~Fpl;WR_v9or>X9byiO54!@8&^TH$0Jt4{LD ze_!|I^PZcJJ}nen=FEC9)U5#rWIhBcrUNi?WDBB1GX6w4htt)%K`nOV>viopjc2gt z)GNXdO)|Ee28v+;=Y;tY8ThrB9K3jsu_e5bvI7y_j&5dX#`BsY@0evh46QYEDf(`h zg>XR)#dU2Q5`10#P%`vdk~FWny8Q7^7j&3?nJpfm)3pAW$^Qe9`%H~m`_O}MQWZlJ zWegkzt+v?)v_gW-m;k);WcK);R}U`*>@5zkDXBkIv-!p*?#UlsLh{0jz^5Zo=RuOj zs}?(uzSAHzMGu3u7d%Y8R@gSF54-LNvY%!BC-(+@7SnhXJ3XrjvgO$8ZZck+kYd{S zgZ}fchk0l@G0Q-7w;6NUfe@CwOim{JaQox~OgUdJ_it>^wH{MODo zyu7BO@;R#YV)4<2)vdY+R9!~!RuYI)Cg`Je46$FL{&NlpMPzvGM76jWUFx7ad zC;Yvrp6XCsLW;0h!)-~Lury63RVdQ3PV-0oJMtWmKFN8zzhli>0MPI;Kzz|KKuz5X z1}SNow}9wikPX_HWS;fTRw(wIUW7Y4p|kVY(1keR^w|#X>2!n6qAtW;YTzLmKdqQ- zlmFyG{M~KAKhoqAHhxgAj5u%{Gas_4^w00J^k7x6Ajxb=@AVOc;R(%j5hW zS5&KMny|S4u}&<@Uh)(}-@nC}dQcfd7flI)gPfT7SlFr(%5(=(#32JVQ?8q^%q#!9 zxbT;Yz%ngK;}K$9zS|0?h#@d@O-&(iyMNGF5z4xcy_k7_C|4kaSbvbl2Uw8K>D7PL zIsk}XCVk1sv#WpQyF01a%IOBwp*m>~FC=TmL>`jKiTjP~V`_$b9MfA&Cc~NIfkcCT z`uf|$pXX6<<*lqKu5gH4(a(^kn>rGx7Bu2p|rpJs01mAoyLixpYd|X=!hEAj5a5?W`+%v2t{A*sUR_U*^7w z_~Q{HoA}LUM3l%0AO|1MJnDj$BG9bw@Nf^##QrdJG{CQ2Sh-6VHwIfFaaezsS|t;=rQmJU6zvq=1C=C^xd%c1JzmeA8#=S(i-Z|8@kp3!KAkqi+-56RepoV(~f z_T_E!8X`?g9smQBSZ(7Z1=7C*S$`?BA&jN>r2-CHqCi$Od>Ru6TJ9ibz2>l)Z-&;N z=ft3MAVb1M%$AD#IYt{DY>(DcCG1>3m`Cmw$ zp`d_OWULk+NKX5b>|_f}76NY#$V>g#Es^MDNy~c)jtaZYT`Xa;`VUp|QDHC0AzT5! 
z{fYi7$jM)${@?hGT@fE&e*+$$8$fB~9}SYd(&r|VeArsrBr7iX=~ouwa5&0>+|FRj zATL<=S8(mq%^g9s4aKE;*O{k{KkED-^E@DO{N}>ios;uj=^-#3O!MpxWPTL0O5TAi z9|B9FwH?TYJ9Y~+kWCJ-*4f&DY*$Xw>M+y@u%trmfHdl!3_Ccz87!8xFpNU5w=D-_ z5Ppt?;5!iWQE(;zHvPT6g5?$$c_7%9cOWz1pk+)6EZP2V zA8ikSQglMZSchhdZKtq$R-ChX#Q+yaeb!wf-$$F>f6Mw63qyO=U-$Fdut(9+Bn5wC1eD}L|&#HXd4&60b;4o1W4&yD;I zKJfFYlEk~~8ZSo1MqDF=h=zR)(F?pcf42OfzQ)^pK1CJ;G%EeTei7>g4_VLKFVlyO z2c5SC(>cg159*Itj=9=#4;?VAtavecae+5+DFNagX`TE7h$Ie+a<4_U73E8s$gMkSkYVm&MAM<$bAaNF@elqX;i_*KP$Afksa?Yj<}?E zz#V6}H1fCxZ9CHVMY{dKMYHt7@Bo>L_Y<{(gPkSiKEUndqppgNV;p_oS=6U~@9VL# z`HU;FBzt#hu`he-BnlB#*L zl@|AMWcgMArzrfT^Udkh(znOs)3$SQb5>`3`yIcVldD?eA*j`z4g4YbkraOZlh%qxlo5uwlkm}()4cS`n~`3T z!D96uvN>xS>`PkC|LSd;OUdc?bCG4HB%{L5vF}{W$=AnP>~G{JL9Ru#ahdiHFIegw zyzHA3X*}`Bh}L6XZy3`&Gf;Lhkv@1~ir+V=Jy_M&=+oHh@AJd|mX@f1y%z^GI0Sr+ zFY3(qr2E6`yfDpF?D~`7$5tb|IRy zcCk(x-+g&}pgV{w?NfMEV%2igiP+>Taial0AZLhwG%zXD84%!mkzJS7^`dFYIc_N@ z{Do{Ue$jYpurYE(bUl+3#t;mnfTo{?1e|st50WEyAcsKT`#*h=i<6#N2WC9Xi}dHm zRM2MtV^Z5nivpD5$uTFw-M_qH@QvO(a{?KqdOJDM*xr~Iig%-`(Ol@I0vJd17dQvz zc29)A`XQYGCLBms2nQrnUQWr%GzGRdyerxT7`BRV zRUkiGOtAHf^;c{OI_%r*NO168UFAxduYK++4T7G;deJ9P^1$KzQKTGYksd~4!9NpE zuo(sVwMmZ2lyKzu)5V8r3fi4!P96cVyCK2Bc+3M>Noq?EmktpdImrgHQAF$2>_996 z-T-%v(vJn@$q3?_HS-z|v+_c*VB@cm3JQw!)}a(ZT<^u^9Z2k7+JS%j>h2{W3L4l; z==!l?%!S%d;!F+Nzegilj>Agl`2LQgee!y<+K1_XTgm*Uvi?0`q%)hlWlaIp$Za{6 z;&A^fv{P#UN`#dnmJkGE+*D)Qa12eFVC1P2d46zGKaIom>fPd4v3&&(-urbZ)O<>> zx0Yj#bwPP#olmtxoVE?Fe^2KdR%_qepe(tZ_R#r{Z`6DwaoDt^fkb*-4_!n~4aq)3g(MW&L-s8R5gA5Gc8U;T2$3~gc4Oa{?0d}E z_ZiC=%uL_+{rQ~l`P}Ee&wb8)-{v=qbd-QxeNf+WGV%>Z@C~;Zu^|EeAq{$U@ zTy|MjqnULzia%*<_=t&fRd%4*q*`5?z-A^qq9A}1!<4@;LEA^&9eV1l?jq%^)j1WM zT5|lHI$u3q1r^A8kC%|ksd8Ip-3Ib`?NW6|=10>HJM!B}+!jp=itmNV*erd{(v^-i z!AQJo#N>l=&5izDxh)O=Xtqp5HQy%~VGnmS2Om4)-wV*)uaf-ThDL zqPH54R7~_oQ;t@AL=fDBAE4Gc@E~AXCa(K8hh->IxNWAoQ@Igb`8~%66};x zTzY(eeZrKhY^SIQyM+lX4<<7-;b@ILLN$bBk@sn?t;d1jeeqvCH!1rAodeo(2p2;> z_sbvV_sIuWn0+#7*6H|?v+5R_HG)g7^jLEmNAbI!CEDUzW3e%+IMsYR2M>%3>#Dg; zzcV{blvPJ{k{<5X37Sp{cvh(8T3@1|qQE4iDWH>HYM|+f+8a6y!P@GDVgw1N6*4(R z&v6dh_;gg{c8-ouPdbSjR9fi*C=T+PJuW1hFxOlK4&GHXJ5+?O8<}`BjCBZ1KW#Yp zW>IW{{G_6HM_-KMk_Np@FN3G2K2UBhQM_?Jftu-V^WDzu`)OwHIOlpq?R(Uo?ha_J zjtp*!5C$=j=J$SLh{h7EWdZr0+!x-N3VNEY#Y>_L5Rz`? 
z3aP)xRRsesN&vtobd)$k@Y1X1hob*OH&Fvu^AC`^+PGB0&>u03KR0Us+n=Kjh4YZ| z>bQvZgeDxzY}V_iQD-OC5Oj}I2JfXD_Va$Nv`$EKrxhxye|ttcXH$pGP0}Ci^m}IF zx1IL?<`-~~32J;Nf%52@(#Q|c*%}+piJk0gWUhgCrmHbA`)biy`xez#Gnw#wKz|5Ec#C`ts zz16a^q_$jrHW{qvHN|YPt^$Jl&LIf`&}6jB*(vyKQly`N6NE|u2)HW5=|`8fhQQU% z^M8G`q2psQ$MrGwbTSP8ov?5uXV!p5PG-#}fB%r>9p(aVhMc-SnyAvy%ls+A%~GI% z;*8IA0KGlKE@9%EkYRP4VJ)cBH4Yhs81;C{4R+toc3w-#VwI!*ip@yMGZ=rlKidfX zSER{L^DnC%WX=F(BWe$bmZ1Xc#J|<3eVR1ua&mXgFjZ5yH+@<%or#ZH{-yiSmB-4* z{r{%ZA2-{PE21M44s7)=j6Xoj4qEaFLj*sk9W|gpEMh+1OP*ZMjv&Wm`lSQNqzce3 zo<{H?m|X3MQMj4C!$NXAZ%#qUQseLO51>XkwLq5z(#J{kx+`Seg?@DeDwN>k9B{ej zn{V>|%gu9jYV<)dJ$hqSwD z?|TJj0eup5qoawUWR+>ib)3Kpl&$p_Y=$I%<+;$uLpLj9ZR7Wgow+o!Jt;XCROR*G zEe|)^D3tm^O%A44$#&NyApw0}D=?`cb2>nNn_5=)`aadpodss5T==35H8ouv_q zJ=nDg&1GkUJBpgfxxPy4*;Ki>B`tEU_R!w5P7I(&gs@iRrsrUC@bfJoSEa%N)&lN8 zioHX0+)DvS@H}mDJYzN?(8A`kUq1{iBE=yin(7PF5n<{Nx)Gcd{<6(@t@XNO_p<8k z)d6A@Wl-*L(-Sj0o$w^AZlTv=FU*-95xuiaGRt2n=yzl}J-PY9va}Mi zS^(m_)T-V_y1Lm|(tPn9X5@n4&{9UV01!Sjv$NHeb^Wfadnxar=)F^K%=5~ojq4Dz;ejhXA|CzuQblV|>TD7g6~s@M@DzN;ENdCObk%wih_wKKx8yu|OD|1OFrBpmy^O5?0Xp|sLE5{}OKaFxQ|H^wHHpdF>DDb&tS4lx+UG^R6!e`= zT5@qZy^NLeDfsH^Ja5;)8m?+^x+%!Q?5eBV!qWwh&Lx&S?=rLPewUGnNGu#x9G8iy zzr&2b1MUJbBm+eaGvGf8TS5>z8o%|C)5eq)~7c;9?g%`Pb(8Y&) zlEwvBWsPGOG`?%AMn_Cm*{P$&4rIlE-V9?9;5zuGUj@|3-ImHHv!-wZWGA1hDS*rz z$&a?TxE!h$U(T7Ig`e)Xh?9o|CzhpTISD=ynf2sfC)2UcB91}s3U#zJX0q>et!(J4 z7xIzCx;CNv7n;$o2WA{$;+k^r+1<1F$U`5aGYJ}qMY}Y=Z{I0RkhETAKR}w;$<+z- zSA`#)Y6a;g7rxNstiJG;Ihj6Yy-QUqymYEZH7V_k{h;I8QC2I#8|fqLCmQjb+*!a+ zr z;WeEEW_s&k{BfRY!@l?V{;rD;XI+)JH2+l=!7FQn^2m~SjFgzq2IckY6&M+(H1YDe zb8PK?g}~+u8szlL@r+AZN8_pBl(t})3u^L`GqxZZFimsV&MmT~LCydnnoLQ&m(}@g z{$aScOdrFoOO%r+yR9*oS@1eeV(pMR2xv|jHXU9Ly^Ny_>rqvY5?=1U)~GJQs=*(N|!X74(S8 zls6t@w_ZLMVKo~SNoC&sEjcpwse5va(Y#=q49DHz`VR3jkQ*Rh`O2~(btKxy9dzUj zXOB0)u0KAn9b88f)>mSYL|y3*;^jNK`2=d1=npL==;CtG+HGL4SSXaqYwI?S9~rpA z-s|#xGN#`)q=-+1+?9iL1Z`yhdg1;qT=*w1QpDU~Zq`1I<9mB6lZ#7PU+0Q4b@l~+ zg~3^|^%-mw-VsaALL*;6x#DjZ4R1~AR5MI>xous(o9gm5geE6dkfh$h_Xe!>b6ELr z57$XqMX)9C>WR8FpJwn~E>@AftWz^QM5 zUUxnLmg7FX4mdog=gjUMYJB`&xq?=t*eWX}x*l8ys-B*u0Kw(c^7+I_%$Af*Yk zEnC`;1QQU?$k9>7PLkO_KZ}36dJ67y@f%K9HT=`z-5xL|E7;}gTCLf5!G;cw#W*4}N~r{NYF95EGR6dcvtADI+Ao z$n5GXCkFl_XG*POYr2(v)04-LZ@Bx;--LY!Obs{`;Sx0|v~Nb!-?zR5Selo)tAYbw z|7B^^wo7GMEuw@987fl2jwIk%Te$d}muw6WVs2rH-}!OxB5f;h2Ej>Yt6=ek6p5xs ze9_)2ra`JA94=-e!?U zk*B~>aE`~GK(L?@dsBHrrVTeef!2E-UUat5W(RzgVo$oEaDs7mbb|tjWw{XfaO8x- z#!z<5_sc+o@1@d_&ErMxZQZHd&ENi2smg!b8KM?DPMaL;k{kcSAnouLGGRbbj>IsGLw%nvYU)u z^lke2yVECz$aecdlADn6=XNB@e8g5KARXr($=ZR{$ReMRy9Qy)Tuwhgeg?!#_%c`m z3Eh1Zgf~n00a98sKiWp2)yh5-`?k*5O~HzH`wpoA=4B=p5Wgz^GzKqsD4G=i3Gq#C zluRrn+}m(ONFFsM?1R^9CCM*FQ2Psr#waJ<1LSwk|BUtjg?UB$$*oMl5mazzKF@Qj zU1?@#8!sM*T&~R>YG2h{e9GuD7q$4*f{$E8_dIs0$H(SYBS zQB?Zl{ht2nmnk^S?QcB8GUAo_Lt=>{$OraQFUwQJ;{=N5 z3c3~_r@RVViz^eCTFm&~n9bC*#d&qneTjD8;W?w+Bnhyx}Z zp`ZhkIWZz*$tor_Z{ka+l2SJWUio;M1H*JhoZk*%GbBcBs0-9ag%_25rvGVw^ z0Yn*;5r~dy4=>yVyJtyyOXKV6oU1V^Y;&G83^KqMXa1RW{rN3S-#d`Opg@1h`u@eU zU#EK_seJ54{1v3}Dum=C6}S#ot2~!H06XOjKap?bBr!f_JlP_A0n>l);D7^BqfA-&5-M*QHPgvhucx}c0oYZ0)%81 zH5muK?`v_y32-dAL_ls`WDF=ARA@-+N4#Upf5O`ZQvvxVxpU^tkO&yF!U-JhZ1Ps1 zg4pba(geZXERJnJ(UM*$r>e%EuX^tOJ66J7v`G@fpKVaYt%%#fml0@>q*fMz)Gyhx zuR|#?iOHKKK-g%gzgB+UP<;Hsv@JL~i&2qJSPVD-6xwb&6skd{^7qgzI4pSjx9#7c!!r zb)4$;v*Ex#>7BNxT>hsDUT}h+EYNHcAWR+(Bhb>{M8y0!y|&rtTFOHeE((l2KIb=} z&ThdcJO73DJeiOR7KSjwT(rkQBqWf`|1xH7zbV%YhUUl4o9S2EfyhE(Phj4G$CF-j zO*{uW+|CMYxMIQh=wc%%3t@9)PWS$d>qmg)OLz1n98Km=E%(48 zPr>I01wdHitrQR^_yM{d(g2{us-N)ba}*Qw&aZyLZ?66SeJABB0-*KfGXZpMc#2wp 
zRMj*qPyv#g6kvq8gJc=lelUO{z`sBdcH`WC3PpI^H9;n<&@8}Z7+fb!Fq(Dt^g*2^ zLaUq<9oa;<`Haq~g%HavLXLHCoae0*2g;O!b^p)~=iqzW#GxS^8XRhr5P9syP|Vm} zN|DF;h0bZU#TB8L?-;4L%a%`8E?ns>{xTbIk|(m1LImVZ-PSkJHmkNi`iTP->L*)P zih3(OX+MHoVUHdC)a4=XuP6;%Ir>0LL{=LTAoUHz&^9YWm?R ziTYTqRXa~W8ns8#H|j4$N>Tv%81d8)by#n#$DBE5m%xN5aN&4EC!Y_d+<8lNa9*~T zO)cSNz=je;?e~Vb;TyOu{8<78ZgsCl*C(|P4P)m$z>_ayy8~KD0-Payg4gc2M^C5onnV5n|G;X_`Jn_e5X~2=f1e)i>a3_AfIHwU=2TYtz^A&Jp5&f zaswn8B=@I|BxO7S@rfugcr~}8j(N<>a`2ANO5z3_#$5A+V=C%uI}ZhXYR=Q|aYE{u zk3V|=1)JVbdbW)Pz)4>4hg~M~c!|1<&CNw-VUG+0^U3U9Bb%DKjfo%Tm95v+5^UGK zOkd?}Klgs@cSOHidB%bBYBEi4e_S76Q~3R)F+nHvdL4xOD7YPmhLiMnf>f(JVLs-3 zBnl%#DjfjgMPlk^$qo)#wj#%V`;`9EWA;Z3FIWhtf(nM&YX{3A*1NC;x>CJaf*lxN zefl~s0c}+(oq5;lRGzod`mh=Tc#w7Xe_iGW=@U<6bs<%r$vBsSzxUY&WO zbT013eB*C@&HufE{^q;?>qQyuD9E--jmrQC~)zQxLCU+d^v8F_04k-KIp_+JY&* zI-csm{PUrUn~%=vS6WjfMB-gOM4NudWl)m)r|{rsQ`duo1n+`R+8dBB;j~KZGFI3h z(7Aj2cuTa<)!dfo-JAklqQt$#?*X!z_W%dL^&QA(g*C`mPcxzy7X$D_TaF&@=VzO? zR33RZ!zEwt%tC}gej#Jz2}DU-BY3Yc84@eOc_gGxU3ve2@!tD5j1TBB3TlIaBb+U4eD7F?O+5gT((4Cv z8}hAn=b}ieFEHN6rN4Ge$Si?WXU`mcq>OsAS7=V1Z284ORieg6(Ql~|```w8L<8BW zmoxYzhdd>6ZpIT?MVv0^y}s0dDnEm~TwYCzRaoNO`ikR93RWAowm2NksrZy-?6ma* zvWpJ};AF6|@xilN9LL^DGZ;+r)J3;2tVApdXzV3<=D zKvgkwf*24I1rt<%fSeJBS0_$mM-XygBLH;l%iITQwt@dMrT>}I|LoHL9Q^;>rT@80|6`v1|7P)nn$WOG zw^fz$m-xF0eC~k~gxCFNA#S}O;;l5?U*Qy;7w_x33INFfxGP#t8|>Exzz?++pXPEx zNZ~Ss1}@j;+-+=HUpLIBebas{wNJBAuMHyP2Bs2&wOFzUn+tBw0HB0-RRTsUq44Q; zzMU%suWT0dxFekL^RW8UtxDj z8xochk+pQWoFuAzy8DTt^=lou+3?Wi`uiR36DZv{j&Us$na;z>ezAslRl_^hwJ||L zK3Dq?-)h{`9KTZyDj>YQT{2e^Bau|817=!98MGIuCN^)~RehqJUHG=}MXm<30-x@? zj}!f7c+Y-r>*Y=~s;es=6kZ&X5^5I_asJkEb*G&{X2`?Vs-AnXW?47Z z=#sK&ZS+)wfPnRrkfZBx`e=hdze`?T5*{L6X^3N9u)ybqK|41K*DvmK!E~AtymEbX zgu*5>@xjPKw5eN{UNOT}e$0Ty1!@pOC09M~P9cmju<#|aq*=MuWIUJkewLheoMZP= z24Mypz-4|!qa9+#W%7+|6t#hyg}Ro0^?cpO+Urzgoe-9_p?Avd5Jqq}&oOXaocG+7 zf2a(|))WGSx@22^#bx|^8HU$zdQvptcuQiInrFhkbK_J|+2%wol=S}BXC*?)jy+dn zuG|5cEF}cQ@#Wxp)p@v=ezhNeYok*`a8{TZfE)?nrW%4?Al|_-*F}7E=-TNz-_>=V z`jI%#LB`uJo*x6W6<_dBVo~@}QXTZX-^la1Wl#R4Ce)ccyO}I|4IZ#tr`BXxV-{Fc z{8h&-pgeH^{W~GUE~>ogG^H-y`F_e1(wTYHM^WxkwdnK=!1D1w`{JI z4O_*$Dc4nJJA?0c7GR*#e@O2cHY9(eHJ_I8&LOSGD0_qvuk-?qBqGUXu3 zE^0ypgIsT$%dYXjwwYX!r{E$dL(Tf`y@ai1h0|AfG@X`6xgQE5O_Bh2?hz^wSyuik zG7GKoBGW6b@v2-~kmx5mciT3$r!w46`LGx7%2UzIh=6<+{-}db-&>-li`6Wbg;b(LDIjAY5uK+14ak4pSe0cKn@k(TC39 zhZUapPi+UtE;;)9l>O0%|MFr4_7Zg~gjFCJF`JE|a2|0uu01>bq?k%xeOpYHH!T%h zvixgO?sqNVZ*zJypnsJ%Ja~qGJ1V73NB)L9?NQ6p6fd{8k(obvBg>?&L-f?tN!^#> z$F#ME_u;bFQv?KB26ClNmUDEhM_Eyx9BLd2PmW8B@xpB3LkOgg<^DW``zgo2?Xg51 zsofje8gV#%8~rusg#Ph2XO1P!alB*HeeqiJjo#s0vyVe7u_&~-HXo=Db|8Mw@BwFS zAxzSQFG>ry5!5h4LFD^_MK&LWxubodv>L4GRif4vUdP@y<{{)|x3{nF9h5Xkp&ts` z8m_k*hST_sXx55kT{~Kit3^mghZP!ECqqJ}NlQHd^QWMVm25Vr6{Gftf>$R@hjUGO zEDB5r4v#j{yR*+6w4Tz}e<*Rru+YzG!%acdDX6V&?R_lYW)kN?N}tUEVN_0Bo&;e7=~W@>be z$(&2{TIY0bJF;vcwuz$d51Fh&lk^WKw95gAgWU%>_$vEOrP-vs%P0I~)>|*;(8vpWF3v8#kx}@d z;XOzoP$P#qG9hz(a(N=M1TP}GYgqp!FcvN5>hBLSyt%5Kc4)u?zW3TLrPG5W$*Sk# zGhfEiXB$tuzuYqnw{ja6szp$~W=E~hL7G~z*Wom{ms7s4>ZMe%jlQLQ<)mjwl$Z0& zHOca?;w{fpy5zk2@)aJ4_-r3(drs#&Iq$4h_IHKi>jo!8@QZ_+icxr#A)qfvUIdRALvMV_km|&5@zD*`Gig{&%|hLO}#_1fsFg521}0D zeYz7ui`ojm*!BO`n3@-?zLo=DJV?F**R3VYVImmsV(CUQOzQDp-(D|Keq`fTc`0=E zBs|kXNL=(3HJkA7Z>i3$a-`u0+)1=38ze=?oO$p>f@CkZqR_3cHqwbnv*7590)cEN zlF;|*W1>_2{Vu_62vJ8yhz2EvrwW?R*rOJ?&3Gr?mJIueGYPJ)Sq`qz6dc31<7GG& z+)gRLbC~Z_<`cXDF?Rx~2>!(KgAw*?$n2!ilbcnW`P>_Ib$aE=JkahO_hq0c#3v{O z(Qx+Zt#AVfv!9%~=`onM9eP8AXOg_pynaBmV4~O^1#I9?!Pj-0S%{!5!Hj&dCJeFy zGe6J~elA1&+Y+r_H%=dYK!BHBY2K2Xn{Po(V~!JqsAryp-;O 
zKx+OVE)WyYej^Go8w_QtVCca%&3($ZRK24=fsg0Yd+f9zVE zbi_Hhd(%9*A(oW$t?KQYc6EK78^`ECer0}5Fzh|l3BN&o&pHt9pcFgW+-^yWH=Pub zN&+WpTyG1Sjq^{gcgw7IPKDV2-T>uaV2YYT#A${*UurVXeu)sTZ0Dl7#Ri?fOC9u6 z30Z+il>N-HHK+(~slnF--7@kaU-Og6xFg+pr}uB^YZ?|?7;TIauN`=9-gub=)R4{p z-!OMANLIwcWdy*;uHPAj?W!z*e6#+{3jY=i`^zSvZh(D#MoejI&^}_>Ly(vi;iCZe z+MZZW`iq$K2WZ-6o7i7#9tz?A)pt?90%TS*!DWyM1GHyt*0NkgZ$eg+r|6bN*|UJD zSh#mIMVaK=@VAMFHHD0-De7UFvYD4<#;+CN#~=4;fOW(h^}?Q4b8nen^3Ljml$pO$ zF!wF+qQtNS)d)$43d1jED;&O#Ic;mir|Mhjr*ERxIRBaT>rgiw^v7Fmh%rbodeU=PU&f7 z-kj0!d$bIdbuy5nUT^v6B`Tt$Dczhn@;1u>e`ptbzPe`OtuTI#J)nMew;sXS1@S-L6z+Mt>F*r%P_ zavsH{RY~;J6(P)i4zvo~59cn1U3)&iCp8n=Al&*m9{(|3l3>=?rEW6N^=%^aqG<)m za({&+&z&&WBd>X9o$&3~as~!5*5SSG!O$Xm+Fs zhL-YbJ-!k<>i)4nlIIt|{0!q<8}Y3R2M|Z40zv^QQqL<{h~d~Dpr2Hvvp+!J&#y-1 z!K{!6QiwnVUdRx!oMv8&wDtimuRk1XrI?*LhH&Ikn^S86q9o=tn`HX66qYD--N#3k z!!9;r=du;XdCxGl)|sKX_b4(>hjZ{&P}lrVPT8+uEQPkw$kjfu{r&{G+=n?O`ICu; z_cQm)jW(tOk@{IwwG6&LRV#o)LkK2V{9~mw{X7?|mVrEb26( z;f8`5rjMXS)WF8K7>@?)Uwy5pt&5e)Txb^m+?k;*Qo{OSdIO{8~wzfk==qCL*NaFW+F5z8vz|5&ZZhw?}srX8`op(F^8h zC%U+AQT-(0AqZ)cD9QXlq+JXQFjC)ukRB}~nFfG5N@;V4;MhYQ!73f?AFhWvhsxvy+1Lcl&a$FnSjJ7QremWY{?IhH!XR9>EF_fDt^w7r8`q)Vj zcjZ4{TuS~#A+Hqgg3PFdDqUj6V7SVuEwyo;w|SCcqCp zrUFKMJR?2sj*Uz3G7EM#_&4-QzEJ}08vjgm_%ofwFEIE&_Futa&?`ScojPP-rpFQr z5X)hmpd+n6reyfXPx#Bo{&(biU>3q0cC@Vu6<))OGePG)_a&F0B_*!KBLw2&@7=oM zDSKY;i=ms^UH9!-(lxaNi%aNS8^UDv{qmZ!64Ozawa?2ILIV-9{kb3G@7p<&#Yi~{ z2A`(v*$o;@d7Bu`R7bWx&kzD=vu97A6y}XuKT?jrHH52m-=MxC2mXR zbu5mJWAd$cH99!Q^3+BgtAgCk?sfki>%l3|7IkppJMfe*bX(+5$2~W(12*&zP#dIs zrQrvt=Eg9@(60@)4Wy6L05DGf3&7}JNBKb8@p6A+OHjVT3besqp%~!?7z8B&+$QH& zQb3ScT2EYf$I&$2$@S*5a4M9GP5pZPS!LPOGykY?JD8sI_3`GTA0TVMu1Okq_g8y8 z_X-nx>PFv&$y1wCf(QXp0glpqSvWloVsg&ax6V|*bGYDI1o?L3I-{a%<=EVaKTIY2 zms{!#CH_TG=@6hYRxPS4S5N)Wm{?pT(+*pX_DkJXJPFh(-Xicyb*~i=^-36!{2H1U z^tD=IH~nET(o0H^?7uU|zcQgeGvW|Mz;&~ZhK=oDbXC_g#!M~?x;E7K(-Q)KpTNxm zIBH%4rOS0E34;$JdoQA@aQ8|RXD@xnwm49#zS?Un9e4wqgqBk5_xutXP|PFCdLllh zABn;B%Yo7|>Dnh(V{$6m8PpzctIp^>)#skQOWlUTs)q0ZdgpX{&r9WpCMWtp?>8k} zw_Y3obRyR;P|M#r_dhG~Jr89i@&Fw+tyHS8TaBpWD}HQ)E-u_;%tWI|$i*gqT4ck0 zifgK=4dKn>{OY%2jZn%E`EjX$#~f4KpE`9SC?QVD7v z52Znjp>hEw$>+7j3KaR~Flu9lBq%^-M_v<)UyN8K-aYsS^1a{s$^TX;SWlpyg*Czg zzl-1uB5NpTeMBH;xU$hnkI6AJRi}<}$+)H>!v;HB!57Mw>THYt@~i*gj4Mdu0gw=L zn}7rqg93oP9`J|S0j669(Q%akATWzbbCSSSibIxPnW-m0)&v0VLg30kjK~0xFAk^! z4IaAwyTOBWi`ZWqJbe5IF!X0<=&$Dsz603W6TqP439&%2muo0sM}xe|iS#>U&7Bq$ zURL`DDC62YihbW-ot6benH}*1q$TQzVGsFJ4SncU;I~{ww3p$u$*58!iN#$BlD_-* zp8B^ZUQOY;7oUIo%Hny=_DLf*@i#t@=OFCNI164MfI8MQ$JhMm28oC~LcT|>t*OPP z6chcyo#l|SyOMNmwnjTUjNgFn%|9=J|IfDPZ&AcR!2Fq^{k!a{EL`x?pi_#nLczvB@H5EQNMn&B*W?qm`pt}w!FN~ z<3ewGz+xb%cws8=)Sj$4KEn4=aZ)_BqgS`C`&xTJd@mrIMZR58xW*7)t)Dp)*Jgpe zF{e!waq_j&7=bH_zQCB%P0^YfXgw394|&LC_@d#`9G5cJEgq!2dV;1 z-&+kA_cG5d#cZ*Z$da6ZR_!9snyWs%Jt9rC|@GE%(icOJQp)3}ht$WFyEw0Amh z{5WGcrfPXXR*&XvepK_?ML$EK(wgZ~zt8X1&;U#3Qb4v^%N050JJ3iXkKoDsW#+<)2hSI4-C z=ne$RFXE`lEv6(a3H3j11;T!HC;jaq{>5JuJAFhZ7V@1fi~-3D z5J|2)VxqvzVAc8#r2EbRV0HmUelu~Gdiqz_19dEU7`gojaSpbwF@o5QGzJj{|LdB& zFxmn;Bo82VO?yCoQUCm!qu?YX-U^HduziIH%btH`6+V|_>#rb>SNVW1MAlbz1Gc^( z!UZFluSrVaPI2xPzQ#X2W5=&py}!}1BfftCWDxwn&gi+r7&k?Q$?Sx$n*-iFDReaC zTbU7|RWjvnY`5mmswy*2$Ej#@sY)1Vxf1)oatRop+BZ+wiK+)^f)p_V8asFgr`-$- zGZPEwJ|*(BT2qvGU7;ewMxwhr&&Hj{BBq^2b{*O$-*e@FU(O58%}uvFKVO(@P(0B& z0=TooEttz_AsL+jlu2Rx8kC9<)cXA-p&A48hf3it&ICK><+XnPxM~npWpzsMwj_fg zPT;zi_*a<=HN+ioU@K7D8!pVm0WS`nI9!{|>RtQ%T9j$^TO@yC*{=PH-z6gdAn_i? 
zJ02{meJmFO-2ydpET*RkglFJ(sSAq1S}AL0EkyjizO1zw3L5nDzN1 zYFL?!FmlgCUANx5)9oAWrgQT(s~eBmlt`7qa_8Nbg&%>H8V91l2uDC8%}g#0h6cA? z<%5D%dj$dFLIP*#iorunGf4rOCk(_*$DmHX}cJTpR934JZ6 zq+b;_T^#MDN^_vG8~e4w_Vzygus5*FkV7OE5<#HZAY8(dg;+)z#p(ubwoPmzuSGt; z6($;Zp2e1TEv}VWK``_+#HO z2*T#J5ar8+D_^$1*n0BZSg#^p3OViJaQ{n%3&_YYtxNRekj=m0({$vcySxPnXe${R z@kvo*x#IaC@lEuHz&Hx~`puGV?Vx16w1w?#EQfAQfSDa7-_$Kbt7r0(??3O$-G(SD zR#?(mP%*WRYOAR&tgO(jOu)4G3y>*-gsI^yFUPjI+q4%`kPI`0I?JoK^g=+V?;hbO zhF;IQiQM#-bv(hV^&aJ+>be!BRH#Z%wGr39R3^43KYqHjEAH~n&aXBh^39J>;lUD4 zhe)M}0*$3%1S+TN4WdT2Eb@e{9viDQasYcn5H6q4k*pweG{p5qUe3C|0l-xjmlT>y z(t0*@vlAc2DI90w>P^;4I|5=O;L`|Q^~HG{Ut7Rrz?TW( zz1nz+N?@*{c+-b~pP&yJXkmh?S?4b;MR>?@e1j>r#0=+&F)+kNbhTPy%nFZ|HAf4t zE=RK&ah!KBd79Tuglp*uzqE*DydNKLp43Lks!|zHn{_Y^4bsOqVRc%So31^2w5nf^ z{II%ku}KC`hI7LVJsf_HCC&C(viBwbQ5m*2`c2Tqn@>`CS5!~qGWq77l)l8aWfGfx z;&P_mZN3LKN~SinryfX*>{WO;+l@vGeq(+MUtpSg`Pw$aViX8ZmT=45&7%b8K6v!N zTjZjE1M|x9v;K8|%P56r?$_bEPGnR|rN?DOA9JPXyf-nMz#wrJzkCy#fkV@TMO_>n zPy${&dL3(k?P>>KUND2Nb%<4*q_x`Y%Fecfe{3&Hqi=yyomwOBP)y2qiwQgnWFRBS zd!!$_YTQ(G{1eo6@T_C8V7yXR<7?y$fooyfYB}MRI2l$u$n9cZsZTL3@KnDnLNMv| z?D6egZ6)c58pZJh+UM~8JDJAh&aKp8lSSJnv5Dqi8g&wgg5^>Ac4H{IXU2mwwpRQ|=>8CT@37s%OeeoIEhG42- zGnSJ}&b9`qDG~UK^$6jq1!Tsm+%v1k=0W0_ZHWib4Pd^*NW^SWVO_`*JD*2WvnF3p zIkZq!7%;bSI%PSXYHUk4~g4R^;OhQ3|UU0XmT2~fGx{|t~J@1VRDyANf5 z6`=V~D@=B!x|X^^w>Wt^wLcHh5}{#bDqXNJ2%9|b(p8tGq2{wk=v=h*E~F+U1OpjLMFqu8bM7M7k%+P`DvwIl6I*ua4cWsV)fbcOS}1?br{ z#rgY-Sx1SLAwe~sV^P^4%~M7=4S^1|m5Zrv)h3G;S7fPZsEuOeb#g`7Bx%>gSPeCT zyp?{n@8kc&eSZf|M+9gi6z=;-?ih@5?H0Xw60_>jBKJ&5po=*66(z{CYis9J6(M?- zCr{(Hyc3Y%v1e0d{H{)~6=vd-ZvBIrx|=_?MeU&Y^C>Za7F07xRdNQb&RPZr4lqwL6INjXbe~sG$?10Ke7w5USZP($2j5yGw z=APKkqMk8jdQK{FKA7e1`4`y*ovD zx*qRdRC>ADZd!OtxO`;4d>iz-g|tqaNzOLXyCnnM0KptitoF5>h^~K`*IeuWXv6sx z*Xg{~e#xEH+dz_m>@I1lwQ|qqJB|k) zk1+Lb!l>yigbb|k=F5G1pK!6QU-}Tuq_u%Y?9te>Wfo9ZmC23I?cWUVKd#+qGdn5i z-gkD88Y*Lk8Z<94=i-Zaxz_vBn)Jox!5CCCis}j)0ePPtW(wkd+s9eLewE1T=}S>K ztA5RWQpkCvISXv^KzeVGXP_p!V^yzq%w1oN|5)jILWJT)ggEwW=P)A#FLLRq{!U6_ zb__WqRZ92^DKEryhgE1VCb7q5B4EMCV8wN%!7lyfnXvh*;$`qR8#Tnc*y`TS5`SL^ zP6IukA{2|J7ScS<4j;>0_&Fs{_MLS;Vv50qgzi--!`WxYwLK& zc*n+gPkvC2ke8>ERquiDCt6B3Oi~NzOJ=6dZ~p`t3-wsn(II3OGcLi^H}x7Zhe?NO zmGwD|O^&iEdpVpjQ|{M1pS3Q!jFf)}$P9FjfRY;AVqJ z?u%079dqJa!>!iH@~)O0ux(F~%{*1uwumII;WQ*yb#{!S_VE#k|LyhAz4hl)%wmUc zb4!3|Rk8Z&68Du=X^()C5JZWJb*JD-O%`FT3mUKW8JsyeNiSb|F}~_3 zb2#vnZC6bwwVtc5tg4GY$G0D76RfCTL4~z!ignHo3puPN-}D(yEdX1R*7B~vDV$dy zoX#Ti_ZdrQ;F)x4!|^wIzJGnjh-&07iB+*#0;>HcomK8$$dHN;o{r|u3Gu@rS-~oN z=jc%GeW|4KYzhVS)Ne9FLnHKD(4t~gH@!t0=W!Ka@_EG$;OpaLH#Dn zuhChh6*`v+{0kLs7Y7y>4hf0AwAFBZhEr*2V!_;qtRo9>UHCl38Nm}a81q=nO)ksQ zapH5EdA|9zt?vcG85>?kO)il`rNT^!+Uc2UW_incQipWcG&J`7+z0r7tl zc@P@zt`cTkfu-XzE@;S_b*%R)A8JrJmB(MlYUL`Hf5K@hZI49=?6$awD?TgbO8oTs zaf})_lT3+Zlcs;W=2)tPPYNvmP{uJBp%V8Xk^vlibUj9`Q?$&OGvyZx8bF=L^WuCk z2lD{RZaLHIdE1~yV7FiQ0492;pN#mGQ5+=U|?DUR3F9C?#+>UXtTQW!$nc?SlPF332=>6x9Sd{w9+X4pn5!JZnfZn3v}yDTD$z{P#S;t9W;)MZttNwFfcStvHtz{`5Pm- z!xCq*7dKhrW(8=3^lG~m+u3fjz8!wp3 z0~OC9xG{mn$v=Y7JeiFFM{S&wSB6$H^sb12BD7*4nDqNTXR zx_k+f*r4&!c7Tgl-)d(>Hci`yOjm^Q>Gi8@*%k4Dp z?V(2V17owYq4j|gVg|RW&o&^?o^)@_W(%xhD3HKv?S*X9W`^tU z+!?Fr^X?NXy312P+JK8!R^llv{4_W)Y4U>$~3EvRU?VHTSB9}(MIaL@^LIH zG7cJYtd|zy(@j1CZ8TchjJtQ@Y8mX@@bq(64rD~6yUemI&RblKZozS*xwoO15{-=J z^z+!L*W8$+G=X@J6z?vU`YaHeI|t)zRFz*thA8)Yknbv(-ZWMs;3G&=~8 z3;yWi>7v9vBZ2bfkzyX;S?l8n7w#pnc8|(&+DY|Kr^KIwuoLpD6 zbzj01acQU6fb(inr92^>g)AtyhjT__Yr5iN?1CNao>g#r7YnoN2Nv<|Z!16xM(YkK z!W(OU6s#zOzlk+VS{z{LVi>XIweY1U*j$M2T%9AoAjQCbPMZD6D@w2995LQj<#f}i zoQ=WIhjmB#eWx?~wd27BfjOCXDjuimRg50UMG@mw0398m0mQ1MBrM@XvQA=K(PWlV 
zMLVe1m8VRS_58!|yX+i}I$2T=&CKM4`kXwU?+MhZvqo^!?=)5|0BqyC>6H|8FMl+W zjHl=S1esZ&=n<5-AoN0TaulE|rXwktAUsyXQ|Lj@RYzP?Iv04o zywxgEUYSaobtWYysEr4&(Wl?2AZx8DZ^}EO>m*_?nEds76T}8u;hN4Hric5mguJNq zv5hv=b%2@dhRuc6a`}+jJL&Le=}R@`6t#Ts*B5z*(q`vlr@go*zw}moPEDqr&Zb?qg&fD;5^gye)&6Uz<_4C9cspe4+N8*LWGlP4rpIpyS zkeu~0(LbCx^FIIho>gD~U0n^r7iX{p4o$WSe9@rbnQPJ%0W^iirc7TYz8Bj3W-227 zIFP9MD3g7F=bIj1Z<%8<72m|%tas)6Cs^vXS{XNeZX6n8;VWP>W4anwBnga1q?+@o zRvC-)xRBKwQz*O5b$P{^;&peoSd0~P`cbN_O8UF;Y_`o4#~peax&zCxxTPZ%qGG(* zS$-3j4M&R^6pMNz-gwSAKkuszwfdxo6O}p|P-tA1=H2IDKiL$?So|4R_btboRWAs$!pUs8zaMyB9 z_C76@5Q+O1{Aq_*u6!V0N6&IiaJuO63MG6*Y4$olSec1-)XqF!gzoSnvR^hV_ez0m@u>W?;YpT4mDuMhhYR9 z-HMAwZJ)bmLUUAr>}2aHgk~1!?<2aTY1PgkTU1(Z_3~}^CpyDAA(n44#2oXD{YMhn zNrUh5a^A6j+?j)wdtHV1-*ci8k{agpa-0@KuFVLaE*b<42(58VuL7)e?LwU6vPz`y z^@V{PwH$an|Gl!n8!qz7oO<>f5BOFhB}?)AUprSTN@Xk|=Shz9=XWye*Nc9FNZiKT zm2=o=<91lm)9vlmIA`yv{RBPv|3_wkkXMEvtxX(5H-+NG4H1g2Rn-Cu)?$v9Ul}Y1 zYRm_>P7P{VCt`Z$VOE1{1HSQPdyNDYy1#r!Zif`16_{iHg^m z#ba*jJ{`w9B6GdkOlO}_L~4Sc9-ZU=wc>-B4i?Q zz&d#YRFoqi|8l@YWINp;YVAz835kD&Z2dk75WyD+{75t~5M^2AWH^AJ<_lScV9&dh z07uf;$@mFkhhXX?07@xSWvZeSt56i<1bSsaHPOQux9{mdenJkSktaxrxkr#0R zd(?)!)c9#*o(pqunDBg3hxU!8!2PhsSDZqsJ)kolqKxvWwlfd3L9-(fLJ})C(^P~m zIya=*XlHckZPLeZ{ufV)_@i6cACP5!`7L_$@00*>hS1|l0CiEq)lD{%C|y&TO5G5p z;h+KPtvJrplmwaS$jl-y87&2i#O|kGJ!>7ax9jYDyi?ExK4zct?WU$Mf&K_Vv;s{V zrK^xubgqdZlmCl7L|&YfeNU(R8k6*p@oLB9$;9<+ZLGGyTY%oXQiBFJL(=|Q=(V;F!r@he3msXn6Ic->qj-j;Bmz4wp(D8@tii~MyqYjFc0n&tZ2W1k zm8A+GKy#~^TV5v8N8aJ;lkbmo^SOQX#*dp&9{F*IX9Zx33wd2-3OWtM!T)K|@ZWy; z=jy$4V>Z!IZNdR{7C&M!^)1PeAOT0m=MD8i>XoWGCb4bhZ&*J!KXwNZy%M=~)#A5H zc@SRTX|v0K!n!Ojr4^{h!`cQEciJ+j8Xl1(q)TB7b$I!$XkMMT+ERV
~jwu<*@ zPkyBsf^)&_0JT_hJar~s4nrSC75DzTexIF{{pHh7G6G!%av}o#o|aFw)WnHKepA}! zUyF)l5vIN)^bZN+Rmifrs5fg3bqTEzODS`fa-TWMv}KkrH^@1wwOhIG9NRWLAS?UL z74xr0{;jp8J6_ReZ>QckymLRp3RN950~733?9wT!3}RnQr5<@DpK)k>mVlr z;XD?vI+~3OxKSG0Keee03toArj_qO_r~Y_7PxTSbSxZg28T5}0x&HgF_z!%O?0?K8 zMA9DSK9+kOuiW_5OCP=duEKsA3c~D6iyc0|3~j}tuX?vCGGcwJmoB(%n;&>ujP(+; zFizb3beSkEPYS9p{T&#q`NQ?!Cwu?#FQz(=5QJR8Msj*cw%v@+YUW2ZR^zzr#$Gjv z*lvH-iRU`TqTP2v&CgjwK_h>6&Hn!-fX5z-2`y0}^pL8AF5xH?VBlwIxpw--W&voX zrtwP)Hlxe8xSy2Pi{EOj5UgyidW^jVdH6e4)Nk8;?4wxq#wexq3vOi9(e|8a8p^SB za!k!Dc`vSz{a;xfhBjzLSAZ|$yv;(Tc6i0k6?mDZ<&vF7nBW_wCprqz<(y9WicgY) z`daJ4v6Z(Jn*M1S@W1u5e@l?)xAW+KJ<1=PYxZv$7ys#L|HO7O%Gmzh*-6xdADXiMF$@5D)Zi1!ld#5dVTHen#W&r>P4cj+EEoSjO6#UO|^z~}Z#)!zaUY`mY5(Wqt8|tv6pXNDm5lhE`xyxYc{#7+cN5*)O%qPhCWDUDe-#+BN?}M@@yuVfUnqY$S1moLAa)DO;iXcoZjgzxV|GZo*j!|yFOgB^L#X?CUV~Tk_Yvta^)UT zj@OmLe*FH7msQ1y=KhI6*xxqbe`MVdVKTOd7I(r*PVeExaA`o~%!XRF69!vXA09ni zk4_bc^$e!1(Vua;@gsGqp?_J}vzP*|BxGi1XSX#vtJS5*7c^lb>j4xi`}c-$+EAO4 z6F`&+XlGdf5XnwEph~m^86x;I5Qu?xBsP^pegd+MZ~@usd8&yVPQdLDeu5wbF%|q> z0Ezr^=O?HcdHRIVfgHjiTfqO}Z~zY}7zsVO0KGtbWI-vayxDpa5TK+5t^}P@8@e_oM0_aznglhl?`1>{F zBJ}Jt@+oQ!4_yqxb7xdco-rYT=A|8gGrKH~Y=>r?kqG0-fB|mcEd$^eLHNr6=DK&7 z4B)TA*o89!6mJ=Y3j)Yp?*V=>pon+aauy1}1pq(IAA(8u9fj>%5afP>P+dPk;llus zOF-^lMqwa?DQGEdj{w2kCD07b*b%}}UC1M#haZ~^{Wp(|?60Tx*Hin;)_&&-r6KD& z$=#M&b|kTDx~x65*QFm(xV|?{-p4@Abh&uD>A|4b==C}bE44g5Upm($ieKw%{q#xl zaIYy}bcW*KDY8ZJ^ei5)Pz={Tbj7)#ORlVYFgq0Fz}*LDc1O^X8YBgbm%`;PdmDTw ztMH2Zr-wCU#Ms$oDZrgr{R9OV15INYjCVa1(>~mSY#l~3an&SEaXu6xrL8Y3UTrOQ zRt2C2os=;^g~;(#qj$OD_K&#?PF7Z{(}F7`EF|c zs=gdZvHADznZTq{>!InUuEUTz9K3EQbZyji=p3$iv_out)A~k~t~I#tV$8+^I#D5X zDmnP~%})CDfcuwNP|3~C(c4}kZG~K%rgK<>rTCkxxqlYhdw*pf;uxGB#si(v3^XwJ zEbaubT`t3{MoFwB>>AG`@Eu(_PkBwKBZMzlM*jSRn*hDJ_rGik_M17>uf#3?n#lcg zjzX$=Ns3yYfts_lo1#uOP@nA$TLtLkR|CZV^b$fZrr2e>SJgzCvs7q3d)Z>D#iS*B z8O~7&(g0rTKR2lPZ8Q2`em+uIsJ!-jIDea0ZWvzg)7_l!ClXoJlSi>vu3X^I8C+w< z5G^`;KOpNs65;P+Qx2zbSo2ero&vjhGTgOAll&!&Z}g=hM`%d#&EXywXIk{q1|w_U zPmpe{qRU%w-G6ETzxv_1oRUvd*2a22sy{szpf~nuNH7qsR$YqFkb}G@`9jtO!)TZa zKb&Zy!VN7~)zcJbW9_<_OM*2-HbL(?3vl;^JdfDk5gi8nWk`SaTB3eDYhpI5Vdd$_ ziqVRoEmil6xn~}z|EPuJ$^~&E(x}XJMaW3uz98BC%F&8BMUtwD^2&gxf%CLPI^s%1 zdbdCzQUG6c{{6m~*Un2aZb^X|=uDg04mYTC$hQ-W`&N<3D}Uvh8i)B;(NeOhFXzD| zaE?|aC7wSQ&e1$`wJGX@jYwi)p?%9P`c6kM6X$M!SBV$s>+jtJ|3nA;&vE1b)sBJ- z>sVk>U<|oP208KjTZKui{Z7nRz{)7_;$GX z2I$oN_Z!MD5>|uwhuCEx1$^Us69-+iEOPOh=f^e{&8;So(08h~>wh2I=6`+qS&y@Y zgY22JbT`?@-w*KoiBQ#_y~zKJ^GOVIDn}|^D!gF9!Tx&c3z-CF51r5kys{afzX4tS z#&vT4>-?_3&Lu& z&ohj_*arvAQ+~!w1_|xFSPA5>6^csXoY-0v2rWA7?&IZHW!k<4dGeQi{HakE?O3u)!{e&oZ@vU!o%gAKjGEuoaN>S>F}C!gMXwNTIZb3DXvg{=ye$l5*5xpC z5@qS=uhPXvHL@iErM2~%$oOdOF4et5InY1i;A)|)53P-SVIIh0bI8uNnUi+AVpD%S z{(RHTYqgiH`mSuVv|6Rrmxu3#g3ABOasIUJku1~~DBn~Z0nmvB0M&B0CmaqG;4%!~ zt`itA`%HEb#%G`D=nPhO$I23(NR#mNiSmLr2t*5rCu0O^xZMNXLGHSwqD*bYm5r*I zC)XL1LU;mK zXb`Ei^X6?daSm6+0j)P0v$2AK=`A6r_b-1w``Rvc>2-BXiwAy;XjAu3-46eZfb4%H zMnVb=!<(W@KscB#HaFZY#h&}#aFN;W$sxLwRfHYr(ouND#Q7G)>354w^Lcm1|S7U%N~PFkfrsE_eb{V|vAFCnC+XUKf5`n_BvsV2yW<{x%ff0^0udvru-WTn(r3uIfx z3Jt!9d);+X9F3-g7%?XVyUIxrNqm3k!Ya&uZPd=spPbQMdOok0Xq#CvIvb8uEBjKR zAUoftHCnV>Sy53Hu0nhHV}REc3yEtoqGn-g;S;^E^h!2OOtW-&f7X;kqe{b0j*+9P zI)JQ?RrQ0O%4ZO`^_9SX*>d*(Fel8W_K`3Oh?8ra-nCsQ_Xy$`JJ7yt&8&TxUWBx9 z;2hn8yo6XJKkzmB-0S2oGo$)M7Z|qjj~mJ_PmdKAs3?M$!X~ss=Y_;;AYMuB&|f2rr;|&98;z0h!J3TO$9%-q4@uAz%IkvAoBXxL`8dCl-iT zM?PTtC%|kGLOI`_gg$!(yQVtN;H24Rc~;-__zM~JDVJ}5GGwa@GkZ$Bjha;ynzSU0 zVy%L?1ok$P!&d4U91}YO64@0CyH?#fiD$q6jbP~S!VTG*8eVIa){F^x97~qUkG#0E zU3r_ELWZ08{R)8HqcHb@IeDAR)qcL$felq))Is22Cv09~%*`qiO`%4YN~6Yl^Q|Wf 
zm)(0ZbHABaiamc|7yV~2Apfp={_lSxDuX3R2Q&yqUQsV-w`36RilQ>KmBOsMS2Mp^ z;*8(WxuZ;eazzcTXO#Gg9P~Tp$N!0I;(ug}`iC=R2sK<7?a{71({@-m)+B-Vz_{n4 zcY;tC4tQNl&cRLe-ekgHndeSYyWtkyDmtQt_il+@NUR~7t@wl9nP_pI{VY?W9yv?% zOQQrhP+sO)-i#^y>00|d$C#PbmGx3m*H?T|@8YAMkw{7>T~G)nYxpae_=8XVPy0F1 zURR6u^jfDc_$)_Jl<);fEg{HuvetFQ$cDgbLjK&nY(#LBI;) zKXVS$xy{sd%Qp^z7p;K!94Q+27zShsOF3yXM|JK zVNim0U2Lb5wI+J%2WOp!pWr2b?U?K8nXhXsoZ6$_sttg==>Ayd`d8Tb+nG2iGFW_b z4t9I2B2CJuW<+Sm{zi56nBcIPiP}|)!tP><_&Y$4UbtU64{*A}6AK(2pls~WP35aB|^9AUcc7dM97=?fi)&KBfCQiXcU>RB#$PqLJ z2qm0fVR0OK`wF$f!ScObPgTnxsq;CkaNBwdQYhhwmVS?-KP}mV1qO;c$t-2 zQJ%#$D?gNI$YYb5X};O}S(=^A^V}=(zg)&2elx$04*;F)`MK6!7ORzoW^GpHxh-7d zG)0P>pY3nAo(lG6^H_rMzV<1DIvchJR zxN8{WDNgj|FY3-e9)^>J`vThG5DD;f$>Q8X$5r9B_>+~x$%(3`Bi?}l7vc`%zrIgb zd`GkbDK*$eFM(&uyJps2OCW>1kU`5M;x74xhm$*Us;$<`Q+L^$&sJU1$A}Ik{^Q0^ zwzhz$WBH|r4C-IE(>4nWNLLt>m!mJR$?Nf|@)uhZcLd$>;Qh{ok_8<=aW`^1kii*@ zO`wf*9$eB=0qU~tXL!z!PPu}&eiP@bD&4f-5xp9b0TdJ;kUDOg0CX=1&gWDf7rKUC z3usW_4o8?;Rd1IHcKi2s1P}Ib5hZ1V{2v++OAB1+03o{Ufggn9J|}E*t+fi1YdrVe z5_$Z!i|3@<$nPhp={(>xPT76Gvay{NI!I_<=|dluqqkvq%QfnkcDr1j5;t||rrP^# zd0Z}HM4JF?ql1j_1&AW8@6gQEs9=HsF3&n%Hf~|6=&`Z3^KM^h>Z`YzRp;{isQuoO z_b`FJ;+(K7Ex=@z28JBO9Sl{Wn*y<+5!>MR*&d>|oDnRDm``=+;UIR`A$8EY^?xl7 zK?Pe%h;l^*WKcFd#Bv*RY?(2R2vr)rOZD_;Olk$Yv-4c-p(W}f^rC1R?6KvG;MW9x z#BI!vj1QZNTov9E<)y+ag*OZ?ch~nk;f&C^MJ`PRR5ZOh{-eSCVbJWqLHLLW0_;#5 z5QtSBhi*v$V|Bid-hy!d1hue1S8CxygbqnSGq%qQboS!?r6C$z!d+nQmjsj^m+~;Q zO>KCatFf`}9sYA0bG*C4AV{7{`}QqJB8(ojSVEAgofLY}M1g<~zi_qA;-&o2RQA>M zao%@=u0seFvhIF{*-Mb9A{%fj7~!T(a>Yfg82TtsVu1ZK%~e6m()tJ50)@N(hr0I; zYU=H}h7Zzf=pBMI3j#`&8f+910w`UYNC&|J1PO#*q=SHf5EP{&(xph~pi}`xKuQ9F z2uOk&4wA(8{GR##`R03O?tAWe=9~BVUo(eu_O-9Q*V=n66EWA@yQkYBwHR;uUn_Xc zK@20>pD0*Bg5zoBfe#`qSubuk+TFh1LdZqLe~e~`l*+syB}spDo3h%S$@7@OwnNFsbq-Ge$Vy;0MlJSg^XR@gYpeM<{qB@>?>_# zr+3#L@{x;I>VSyxBUaj{ZY^q!t|mbOIRu9_Q@Ozq$MEZwTL&s)ep~A{UY8Th#zn5& zIH7|0^RW~rL#@ZHzM$!YoFp9^&~$Yru9X$@;>nZmg}f_fju z!4|_(3M=-lE@{+|67AoD$?CHKVFbs5;CX46m8$S^O}PtjOn#3{ z9S}K+vL)+K)1ZSv(!}pyrhPPx<=_r?SJ64llh0}eRg5o~uP1#J<+6qn&ty6`c(O zH;Z%>LeC$uT1nHN!T5+o3WMAIB^)zdLOz7iG`62Nj9*z1mOm&t3ltYuoYcVEPmH;0 z06TQ=(9l9(Ob3dF6k6W;3(%t#N*6)jPd1Mh{sQ)`g?=?t=XrN{r3WgqxfA@|WyLFj zqXScLz5@EPepB5#XqUHbGg)1V6@CuAtx54he7$1A&n}wCz08zWGB=^u`$qp0M>s(d zI3kc=)?UML=+fisU`DpmeAUofNq<}L$T+J?rudno$k1A9CRZ#Z1VDk-qX$jv9X_M) zftVL2WmhlX`>rpMZud$N+Gkc26}cV!_+k9*^Xpz>#>`toiFI50avy4Suati0Uhye> z*&Lb|YSH>_GvNwHdc~X57vK)S+kf4M{x{j0ZXY&0nS$d*2Ol|8RH*rp%wXf*L*(tCpqnE7`ekL^$TGp`q02imX7f-)WK`wsZtZ?cPfzbwdj4QUNf zLaC*=)e5XQB-B;@V;Il$dYPSAD+H6sm=E|+zb7*xXtEx(D4ZT*)T|U_n$;1dJO3B( z%~&kLFP2x;P_BZB6=x|xQJuCcHIds85G3x zmG42w`vLgb*6VK6SFR~J3SXzs&TJ@tkK#-X5x++V?b?$xX>6QFo@fKo;9jCGqRI)p z=>1BrQBq>6dFqZ^wubAsAK!j9moaFtrB?OT0e72dA2Dqw`kZW*k-U&HcJzxc6OkVT za7nxC!37sKd6u3rh6kM+2O^))daJFHt0Qz)m2N~Qj@FI&Fw1@4!p z*(cjj7ZqL^Gy5i^eF@OvA9w8&YJS@gh@X;DnuiLmq*JI07IKD*QfXyXM&+Nku>MD<3LZ6$Y?x)VSaSbt-oJ78=@jGSJc1mL zS-6i^!|<&Wv%$Mq%l6kt+}%fZk~S;YvHi8}&hq7RpQ7~g{-afw|4ZsWSSHkYLki~m zKB5&=ND-SV>P^4?7XW33Df!AGe}T{CD}U?*p@jdKjwD18Xd7yyNm8FS*K_#HH5#Ci zg!76&K8Q&M*Pb0!Jx=~<1F1*QmBI?3-+YXt>xNUTNo}bVKN53)p6D-q?$(BczF~`K zuhye0GuN(-i+MO@1&mnqMBN_wdI6Ap1{F>A)MCh~!wWwYOG@ma!KRxz9VC49PFRY1 zJa;q-W%zLQ`@S^r@&f_TP`Q+8!d41+Lh&Wm?=PeAmydn#5W?X=XOCXF{nFDbwd z9$$n}@AK0U4~AhpzBoagH%53n+xhL%WRk9But3fG`2oY#;X6@~V2lh;=`U-3x}&?R z%JT~3a3Va)A5F*H7s%!t(n@|O@NuH7toOd&uTr=2f^{2DPc7Z2NU_ruG0^{*EKhZ*DwYIh_%ZOVlu7@n}; z(i2{+LFwZC_UFa^BGb&eiFHAiC97B&R~-GweFG5$Bu;o#gj~q}u^@6zVlO@rUx4X) z?iUIN_t}77lmA8q!=*}wuIQLMeh6OO>y?ySAXA<~t*^(cU@Xi`s!b@LS?-nMIzlRI zf?X!pY7rY^!a%zfTk8Kg~Lm%hK@|&i|}i7n8pfn;?~X 
zui`Bb*%3K_YZWXwvPUUfHqQMja#9@SK+d)n*+T%_8^lF|x3$7h#QprXE|7^@clv}SG#pP}pN(PVXcT)0feLcIU z9QZ@=*`bVrZ{{#tF1$~F3Y1`4aWfE~qwt24A41CLfiP4PVKW`AP>zZrk6t$ux|U+q z>X;OuaWQ*Tdg#zx@Bhf z6N`MVfaIymK5@2EBf?RdZYJxz26}-smcunCQ(Mqz4N328KD~&#?~w3)N(1>g|E|Oc zVt$2QL(eJWAwZc0f3SjINgQ<9X!{yh%;I`8-Jflu4vn?gvaJ`0+a>e%~eh6>p#d$IEf zKYD-iRy51^8)3UqsP~(d{B%E=cTA2fDaWb#uhjQ=P{Vkr(N5Mit?aV2^VQhn@(*|{ z@vX$`^u+KPBa3RCdvp_#^BBo){>Tr`Iy2}9NUWx1CGIy<3rfB9NgpF6I{J~7^y(X{ z_hVsFQZya|_{$Zd=?b3{m`FC7+S!xKzVytuFyTp>jG(?<{)7eV4#Sbb4j=)w?_U2| z#j>%&h4WfTbm7pK27eMN5_ap?NkRjTZ(jXQsaY%wvK{4YJ!#4@ENE9;AmwJzR~<#S zrS_jgFaJ%=^sjzQ+-O*t)0d_So`R4R7H^;!d6PI)RbhYf{F3gcMp<{g#xX`9k`v7Y zCI=8z*D~U1$HDx*y6QoE1aiD(arSarGNc*Y+#6 zq{TO0U^xDyfmIZ!W6m_d&WFaR^^XcBu6awV70W-8RRzDf^0!XUiMvx_=z?EAV(N zYGm}I84#IA8^*OEdJxI_ti;LwnzXCKz4|Aoe;oy!d*Cc+6wen+;weH5-1dScgFG7j}TAy=m`Kgmfy<> zBd^Ztax3WrlO%Or1#qYdCSg|NY57jgtO?LlBNLSp55 z&VV*Nk)R#l{Zh+sd;Chb-Kv*=j!e6S-6s7R6jK!94BG9;0a7k%u1`laQnUywf{KHb zT7O+FJX6V%OzN?axHjtgGeeUwc5W3DAc)RcIRoB0xid)E_6y`i4G;g3_-0-30LS-E zZ?JAmr(IgCc;tQ1jSkmz16LyeUWaXj@2Z#4X0dFyNVS5NTOG?-t(o8BTEdNvON<<% zDq-h3GDn}TZCIIDdDH(4C%>n@r0G($sr3}Ea>!6*F2(;rgwTz~yhW2lyEo}0rxy)2 z5OR*5L$W`Fd%xWO6s3j9z_f1e!j6nlxk1M zOUAO#i9+QY0F@r(Bi-3vh40Wmquc=dBWhfHA9`ZL@1J@2Vqa`=v>>qJNWH=59zOkB zrkBnQFJT9Ovj3L)92bxQb#w$BFubSx1W9MviB(DQ7o1~Rqu9si&Wqcf3ZxJdYCn#QManc}*<8zTuNUwLUnTwc+SEtsAZZ&L*+|*C>r;o48^(^9un--zYqVXe+|slORn%(+4?kSK+g8}l>~~y? z!HZc62mnuzUlVb=oWy!UTe3dK=kL$D;1x@~?!ghlamzQU*(NfmDc<*;kKIGWK0hdh z4c5oDMjrbVsfe}|C?d)1(6=iWoOrq8LwL~e-qkmBhi!L z#K4`yDzpwLR)$ueMPIKkatzn%()94TFZa}PTs#}8K5LM<$Xz;`e`a+xY5;&NGk7sH z^IE7|XsX0~3z}lC#`Hofed`{eJ>-p~-9}4oErq#4;c;y#B87Z|B=BP#`c$NlXG&kz zLKYQNQ)&Ek%6*fd0%C_s=)9alaL%m*g)FR@Z2UrI{onmEk-C2B_u(`6bpuH#)))n; zRTvNDGz4=%m(FT{Crkhw()o5`MmgZu)^lxt$EI^(9+wZDm}W$?Lj`0xOQ)D2nL7s5 zR7+<@T8SD-y+f0HB{9Cc|H0wfF5glQG4BXY|CZW!5*YsK!Y5jV*W#JSk+;}pY|DRnFdwUND3Yxj<9*2fy-H`z zIe`du=yha+fwd^)s(I&Hn!9CfF(rpVx?yo56GVTv%a00_e7am;%n=uwhd$Yc7D2Fdpkcph~+&MJmFX$rp3@DiA79@p zfzP}OHhqWaWZ-E|$L!^*(fw4W@Lh$8`(c1!Sjq}_aGK%X4X~IHaed%}d`&0B?jEpVZe#@a?aTHC0{keGWGaFpXT)wlAF75iP| z2+MqUr1$W+cp|;Z31?H8bMjFLWqk#-Fo-{rr(9@gA$ip8CAK69FKvlV6g*hIfj8G( z^o#2q8F^p)sx#rMSb_lps?k=l@aAqDWR)yhvN<%7gR1D8i@i4cD*J5AZ#Jfc)k?fm z!dF*7EHe{05AZ}T!W*0*UC6-uc(EFIk_}BnUnu>LN|R6B110?_H<>KyfZ?2P_d6G& z5V{!ez0Wdhu$^bm9nm&zBZ}MvjNf zpbANf6y)l+byjM7c^ORRZXdOrJ}>b^YRpzP6D9K`{Wl12kz3gfTuZw?)DSs=;a2Y~ z_A>8kEDYrOvNYoHra6y6MX!~t<(+COb5iTGJAkeL=!IzP@V|hZCVr0^bqDk7d6YhM z4aQu@ysKD*@^C*S^0ccBtRq%m7q%?5_0%(e`=G$0+;yYr>RS1&=BKBz^-qu#rYPMx z8p~v^oFp-J zF3spF=^FRKn9=0d@jyh8uzsq0TE;(6%bE+(aNLlp>qAm7Hq6<%?h_)U!cqMEM&6Z6 z=>>~2v>S9kLn-`1G$bFvmWqKwVvC=j?$78E4+JFU_wCI$_0-^(Bo`JA2DAmz+iIN7 zxKDI;w8wk8%lvlj_8er6ss>775XuzO=O?7Ga;yvfY(V%5hVZh)MgkG+bDk}-{3aw$ z>P^22n~z<~AI^voT9%saWRXO>}QkU(gY=ep0Ey+$%K`Mv`u^Cr^qgVgt@%z644%~+?V96MG^KhR| zeSG2|mY5z3B-{R`O2 zL46sd&=P9Fyo6!U=d_26D&sBJn5kt4HI7#8b4k&~-POkkZmMh16GQn?}0Dpm?n-GDRVZl8A z7idh}d4><1NA^Cp+l8Pf5 z*K?ri2is`!Ad)1DavrrPs0a`|Fgv;?_+!kJ#ly&|A(c{TokSy_1(@b~;h^kfzcT&G zO+UCG+DSS6M0l8MTf?K#osb|A?usWpoz1V>-#@Lc>ODPh4A9P)N6=Vjag7jtD&K7%6j7jPdg90@(d`s9>P6opM%J_5dD{3k!BM$T#H$G5;hdmQl*oby5` z*W`{%TSVEUcl!O!!KT3cy{F`1DGAI@@zL4AqqCHgv)zKDYnKRdvn_tbW_wJNah?I$kxgx{J01BcxLYJ7|_3l=D+|(AZ&9t@JJjbLvEOA{@jx0x!F%UK1Jy* z-xegES>(MI_{cliMMJkzipi=XGsx~N^N1)0G6He(BdWV(gs|Qg_o$=j?<$=5P(g~j@LX9$N4u5`=P9*G*a8^hy+^PS%9~oO)ygswagBQV04hH4^~qSG zQ3jiD$Jn3zKJ25phm5IX6CIv4Zsd<~Ihj@3BWbT6(B&3M*p+xgs0Jj2yqXco{o!|I zFd_xP|ElYXJ3F(sP5v!^D}M|}PJsluBu=`_y^N~|Ub-Nk`~u}?*gzylmoIMp@hk|G zcv<|i%S}2)c=ojP$<>nVxsuv z{sHy%ouQpO3#zHBP_yuX4HzS7COm+NpfDE1`bW(v0oRQ`N`?E4>O5W%|8Ta~3Q=zn 
zWTZxSct}#Fv5@c*wsu8Upf$<)r$J+G3a6^|>!W*vB_S`ir8PcGG1>SB-+CBze_0Gl z*y>K4hFZEVq)#%gbp_Hh4^-r{BIw^K&FX)3go^9u2u@wPGPDASfgqMdNy#@|STo|+ zIPKutw)A?PVbN99Tx+Z0Zhn}Xfy93^hV-v6Hx(b?3-l!091Z+(SuLemX=9 z<#;JtpO}!m>cTc3kz+)pVVMftmL!JpLlifErS)XG1_-~Ed(0#?WD5rLr z5D&;mBY2%!81{}J4Z2lLD9hCQH363-NZ z&vjpFllj{S6&s29sTY;wD_>dk;sMm&?z5wyW!k5pZ=!M)QO}5<((s30J5o);$TdaD zFD%-fAJ^Q)ekEFTUB^2s=PLZ5g^~92v_fy85M#Gkt!cI_M zL+7VV2-1|vkB0T_v7tD-G)r%O>Azoz3`Hp^-Jz5(XT@8yizc!S=gC&CL%Dnj(<{@> zhAKQD$TyMZEN3=biR6Ov@fSV^Ttznf`?g3`H$<5G71x2g9als~q@!eT&|^HKJOE8e zm?0VuNh$3)YSLi3%e2~l;mTe3=U1ySI#s;qKcNz)t4m=en(o8n1_JteQ1xW36sbg! zo{JRqhBM22P|`KM{U>7g>v98mAV!FAL6pV)%iukFUX7BVy9VNu6yBOu2w+D$9$i|b z*qDkXb?L2!JGSQidL^B=Pgq(t4MN(?tB%~AlHL@{my#@lK2kQZq*5J|K+{V9WII#1 zaAcAfq;I^*kv$YgK2v^C?{dJFfu#MBIP@f!>JXlY_%^7$f%?EiA!q4SUsYd+h{0E- zd0){V4W>p!#G*#*&c8bx83v}9=djJniNV;#HnQM!R|F)iQ3T_;jVdzzF#YX|sxGIR zN>3w$%eg|C~tDSm@5F={cibEN_z5)me7SJxe&Y z)%f$9{!8u=p_THik`8$qUYmr*omv zX2i4{XH-hRZNqj)a5b!Cd(_C}nTyRibW zIqX=@Z$E_SNcmuIr__7TT<^+;e-(2C5?zhYyV8*>k`b#7Sa!u<^YA`63&k3PX2|IQOS88#_Ac&+nd=om z<#*Wws(!yoW9+r*+JK#%-OMYDZ@Wyz?@E2Et_J?;qEK@Klb@T&u3&rB=quC=UNA?# zLZ`XOd23Gzt2V7^n_E|``$@n4%@i{ql;3*Q5bkyI+q?JBsvWZ;E)V54+8&N{FWOjh z=oP8a{lX-4DdTZW-&&W&iFvUV-xjP3ciI|Xw;CT$dU#q2uwaD4(Yi#VL1hjnP?aE* z;2*pz_zR{n{X=i`&4XYWj&#-jDh?fpiPWi}@ClK%95*@fquymM;&4ACT)su)B;HA5%6-0N8Kw~u3>2K5v51gKtypnt!Q zUr9pgG~s!Z+6hy$%FkwsX#=~dcXCTTcw2$5{gRA;D~g%MN`j2obU)(fnsM=&zP98s zQ4#f&cY9gF_h2~+!pRdXa;(D8(9177Is-!`>c*39JNICB4dBKo$G%8mqJFFo zK9C8-AU~*u3n&D*yZL8c;67vD`W&D;W6C$1{ZIYU^1mXXF{jWVj1}I6JUUO&XeMpO z>%+DYJa=BHhev3bX7Bg_%i;EdHwq~J38az^h$ z@gVh8w0v_n{6f*ZyraA3=N^Fe>-9d!ya5dcKBDH*v`|Ilc+e6y;39}V^)V9eVuHEr zMa``={xK~Zepg#{Sd2el;9l%ww-FI3VH`WgM_Cwsky_R*h35Jg$%KmS@6e2u^6{rw zhx`a=QE42Cw!U_?W`k8^ZBgc{%&ogq09+j74GEP%t3L0Dqxi(cNCq%=NHj2g&j#8X zaB2*-Q_JTefuXy4-?BiM!MyCi%zxK48AI`(3RR(n&P zacoh@8NLmxkgvJO>Z%byGT=y#Hbv8Hx8ealKhsDpW(JXWBja=M|zpmguL z>^Y4a#i<%=_lOUxqg2hP&%1&njvRb(O;7&<$JCE7oOV7Ym6z7;?UA)U_lm~_oMTw* zHcXVZT3kaYpYT2wIqTAl9DqfIIx{8qK;uff7f8~|0>1SXEAL+F#vZ9(NL5Lf6A66^6s$=%06ubw&Dx!GR|GQok*KIic{YkXzAQLO`oaZqhY zRZl4!uIVo@Fv$%~&gen?pYv%DgIO)Uely18j+(-TXW^Xxq-eH1Pix2Xk#>ua5u5{n z$9ZBG6bQ=0Fn;vqDc!q7b{8-_V0GN=O!LvwmGx8A3TmahEBx;FlCmcHr3oJE=$Ab( z0on%_9yH>}2IWG=yapNTzdH-gJy;97zh+m0Y;-tlVi&_;sCz1nr|oK8fi3&N%eCtlq3Fp0=r(%TXUQ(vb0(bDHBR-j05X zVz$N^dJ%N2Mje39hD}I}?7(KEUl$H&6h*u36vI0Hem7|MjQ!ttI zlW?NXg^xD5e4L0)?2?>LQy$bzKsNc}<$0>iKAeWDn-pm1xY02e0uW%*3>#QUSP}0Q zLn{;8dL%*zmnBIr$AWyj<(qsWC(9CxMm>YFv@;YD=)t861_s}Zfo~X}y_Ayk;BjJ# zB^XUSzK1RM~IOXfz^ z+{m81fi?emx7@If+iCgc9sW02BVMPYF3wLt`T=_xlKP~;zP-3eInqHinl5}TAk?VE zzP4FGama5-MOvJ}<`$3ri{lq=aYw)72hhxAq4ht2I8Cb1wP6&O>fR#>kKyj z{9gE3wi<_pk(kiMk;|j8U!p`0hOsGyw!7}7X!h42UF`}=$a+Td$L28W1~u=KC4V-z zozYi7L?*SySiHH~0CD;?!%9orFyax04$V^r=HQKbmRtyT$}0VM>~7BcY^xU8+1nka zz8;pLj0vjige2U}d8z@Z39 z!`CK8XR&NnAarmt9V@KMN;<$#_U$pFkA2d;jzIJ6HOxp&>zgy8V&V3PllOrBJm~Frm zr32?gl?`-E_U$SU6=GwRnpe1;zn%RaDk+&uQh&R)m@GQ}B>Ja9j-;`8x&hP{hC>XF z2iQcy;eV77FsUvKzYz%!2TK|>uw2FH9K$({?^%r#RVuPkcS5eMR-Ma+$)03XW~1B# zC+i|VI?IEKwP?DJ4<>;v(V&jkq}mwEz)+=;^r^49?PViYOPP01kownb z`;hz;-4ERY(0_?MzC}AZd%JZ`*+0+5NnQ?RZJ+8rDv6g(>$a-YeXjcMG~E|8hP0U; zDT0QRPNa8>QuI1W!^saSrL+4lwAlItij=3peTUdv2SPKeW5%G^EoDbApuN`MNfICL{n0UW%DVxi{I$?t46C=pWY1po0gkst z(a6SBXVKJq@^di6nL1b^!HVwkdlK1@8 zPOeLOEKLvDEA}hhGj&C`pJ+8dN&kdes4q(~0h@@i#w6TeeS%OVKvG=Kc{!01EZ)#@ z+W38T*u7|3w^{m@A)_}^Q_QMF#2R840f%m=0*Jmv98*Ta@fmC@y804xUwK)6qb`p< z>V3+X*>N)efxR_nOM9;rE6H@tXc)l~`C$b{69QFN&GwRYSqa91W}$R__ab$t5z;Icz!a+sGD^8Xw%nI%=qvkO85^eik$d7fJB#_3s(FZy<_e?n161 
zMbT%8bC0^ELG`Xv1Sj)8oitNJuQUD|de=*ggmsV#PoYE$J5PN~JBvnxZEl2lnhqM4 zk01D>U#+NCxy}n~?YSiAwh=*nW4Uo+m?96U3+GFop|^ETXl_ijf^ zk6G4&NoZ$n4f_brtIzdKUj?c(LGZ+2K4EQl;hZf}>dcR<)_1HAmPE?}+@XaCEkx9c zAjTUmNwEYA@PR3i!6zO@98L&^DE^81qh4waPoFKv* z!9|-cVFLYDCu3x8f_Gmzu(s?8>|wzC_s$mYGDGACjj>#;Ib(gRkAN9!b+;_VAG9aF z)kPBd@ODu5lfhRWMH{XwLHGDqR7?hmPCDou2Ut`nL@9>dDq%1_?*V?-%JZ zB^}`%Q1k=I)n6P2%cVz30-o1;YLSh{MoiUGV#4+hIGBz_sYFUpY^l%DSB}h3Yj|90 zH=EHjit2|msU1R(?#pi<#tsOC3Z9bx9p>nJ^4uZnmI-uEM}Rd)hA1%tWTMW2mH?HR zQz)tgJ);qldCjRDnp|dBL)G!znEp7gh<$~s#b%BvW=Ck6ZjONKUx|&BB8B!U(;2NP z3xBR*b+R}4LD&a?RffbfB$J;-u z#ofE|a#YFTOt$f6gKX+*_7xxuw}T}akyF94XQ)ERSuq(@4J7U#AU~$H<^=NT-w9~_ zn#{H1=+pDlmgVr!ptqE1kYq~E#4Hd3)Vx*d9^g~`loQNSy$C0 zAelx=bcQ}{JWUvtKY(D@RbWm=+q=2R{xQ$hjpzMw_X8^f-;nv#1%G!LEkFd(giE2F z1`k#@6Vdjh`kvPyV#t?Z;wY8I!%-l;1btoN6_G|Y{WDHG(gwNdL*QARo<-kD9952SFo}2bzS)ZsmfQ+qKkw!JNVgJ(CXxu`()i{VuJkbyRK0Xg^=5Y^=*X4sHU^uX!tQT;<9(*}xa zoqa=^X)li1er`0J7O*m(`i^wI8$&mZ*7(#FUPn1cB;!c*edGnrls&a1rC>8@ktTt+ z+lK6YaK$LG^t$u{|JzYvZ9DlS82Jg9j5m7-1?p4qdO1qWuIZ!0idkF37eT_DtK9qM z>0kbpK|N>J=~6S9d>|)Ks1`WH`I(vGm-hP1P9tKueSGvP9>umT+-^TB!;LIzrh#3N zc1hg)`a8h@asf*a+P$=Tf{3K3di6!f4AoS9uVYT?Uf$s>e!WlTUpy18>3MS`l;N}f z^^Pb#*qY5-l0rY6jbc(yffMU0l65`YJw2w?YT6V8W)~BzUyqpcH!O&lax)p;H{ky- zuR$KP9m%VAj}w)+mJt`Adu}adL@vPbYBQoEI&PvNQ?uRb1Iw4HTQ}OBXX*NBUR&Wi8rX4Yyc_rVbSgkBmLrTfCOPrk~X`5 z*)_ul(xYtghRG4ckiUSrs-M1G-Yuno3U>KbSmCkEQI;F~MW}nBTvq`{YAq$;$OMG8 zFN$j;$KagJ^`fY3vs*06wZD(qyEU9T?jykPK~VkzFE9Knm-)=-N#4YcYa+@@GJfphaeme^?{hQX9Z$;`W6MW85a;|12t!bN=#jM_t{Fby!UjzRJ3jZu zay-DQ(s<17e7bl?xj`kVE%JBUUR-A@kA_P8rW&x9CaFcO?n03I@%0Hb82Zcy^}WQx zw_|<;>a8C)k#0Os&b&mRi!?4?dlaHi?u#B`VZL%?4r15Z;`F(FA8zh^U44rf)K`Dx zEx6qG>#J_eU*K)l^|6exVCR&s;0ZBkIn#bs?>d32iio@LN%uZ}kyCkSPFwEnP|eF; z{eZgP@^K<=wv5D>YxNu;d;t7npu0)FM#|W|u;#+J?U1;OB(50}L*8;VoY4+wEnM?& zy}A0*y&Q>;bZy?LJ*4@LNxMY4(*fnL9F^@f&)LCVRd>sKzYFOn*N;PaIQ!{ z{CuALbrt@7Ti@r!1L$c$8S>v$UTErb$d7tXEsPZLc2;bf4K|S`nSQ@t8{p;7;p{v% zE;-dRVTva|naL<(pzi`VYbdr;k$k8eTTlZ37U4n~Z>et0sBn2>d?J6$;wbq6L(=GL z#bXuG^z3ZjvbQ=C*zp7|PLX=J!bgv9{wTeqo<;I$86U5Burr9_x|R92wFOH$Z}= z{000Jpb@tIWnLm=H5PGfb2V+~>9p^%RJOHH=9e&)r?8a-hO4hnTCyK|HLPb3$WrUO zJdThc9KV6BjT}Y5eq{*y^Gy)ug0%w{G^=SE|J=mzPT`DvdXB8PE_+RD&I zYzI>Vv5)!z?N+sHo_ycC?i}x9CJ(npaqR%3FY8sMT#4k8IfOSHhL-D|g=|yUe4LMy z8o%^p*?(7h6!W=l&x!t>XjA17{dwU@4g)K?FW|AgAx#=1ad&UPOli`LWIdGYqtxc* zb)tar?a}3O6cmA$x6jah`2y=&GG?OwuYhp>)wsa_n{YaQ+|Vw1LaUUql&S1RQlnvS zZq(_Xb)N4h0b9c})jf~TyTUG|uF@DH9{}I~M@B*aGa(?hKWWWv9TDSCg_`U~b(MK< z_VK@CSILwJJ=PhmCBks~9w!jyveSN~1%{Guf=Fg)nHsh&B-%@x>YS}t*Lf0m-+59u zE-~wZ%)Oxoqo+T=#tsDmdjEmr-+%V_|1~v1-;47=#vz)S`}{D%6~h^6`{*nA4j*my zj6GF&+yTyZqbE$`>lHupPZsGrmq!V1;+covA^iN1soa#HTz$d8mtrPe0tFf-i1~SX ze`qF+){^IBH9^oK?qEk*`1Xq;%!>O9qj&BX|3@PMJw5!rLntSv5 zq1QSZpEL+OG#W&5f}?|24qX$x3my2jON-!yG%cJl`xGO-aI=xSkIUd^QB=3^h|~`(R5H1 zL3t=XiV(GLOGXNUuYDsoQ4y+12wNPW;TR< zJsW~gxpzRZJL?_UC;6@-5{FwG0^)QAA`o)GLAg&{`N!)yMe-E58~%!-KUb?iSq5Ib zq?}uE%|Tkk0((m1%sr)bG3>ZpJN)c1GzL_Fx+s@{`h*e%(2cZd0TD@fv&&ylu_mo5 z@gJY57!KOWFyx<1FOp+qqV_+Hq5DMp0KF8N4Ae!c6X5g|c`yYL-DO!Ndu-G(2HK6# zxR(B1HYr(V`O&*}eYZ~BRYM}6At6Q$L;k{2S|{9#4zr-o#mUYz}Acq2h%L@F9u4Q^`dUx7m-R(hoKBxDufxejE>5~7%t7R`L$jZX z76;Eq@eYN7%hm4%-t`{lMrqeCxZp$6yH(KQpJ;H(d9Vnxsu*r@wpnz%$oN%twu0`p z4~-*L(lIY;=wnCRa-}3r^dnfnj9v@}T5q+U54C(Wu&|u-@{)U4{gTohWiR8tmVRc% zyc5_b4Tu4|^-P!sHJiqWwuegc0L&$@g|uc1V#AY04b}Z_dD|DD2UbW#ZGJYwzC1{`$W)SvhW{)TF%h&s< z*}%#|#VUNTKHY^FkLjxgsdW@pXcsf|M)Hojt{wyBY2Qw(sHR+PI*& zRUX38qw)#1eONveUoTCon;6nk`?;khul=^2nTPxZj+gd!^w8hTM3>HTzLd?SX|KOt z2s+4mcwAz)E|?s#Rvc=SVE5zq;u0<3Ls-cbE2n~yGzoXcC)qO0=~48_)TjD%=o_Hc 
z8ZsFoR$@s%9l_`d)bI_{B^e~{=T)C`i3Le6Dn@Kq-)HTnO7F&>N1QIDTZ7FX5im(O z-wakt$aJLsb_X&}O+syc1*N2{xsyMA%hfJgDk<@&MFVO+IzA|JEA`mkt`|`gBJTDQ zH6&r67pdM3^r~|3>o46FsJ42#(gqZY(&ck9>Pi_swB1PJPJ{f9Kh% z0#e*pLv;K{v{IH(@~xXri%WV9NiHXCT$XFHPSPcgorjcy?qVBPdM72&?4V`fs&W!m z@HZl`xKU10vHJ2LJwzYJk}w0^wL zFw_gl#$F>9f6m|Ounfz)a;n4T!dGeGoLfpZhLcja=_1u=lNjD^d5SL>NYd=XHVY-8 z#h!sG-%tHUa@o};tk~UnV*LKd;|i5P&mNYv=C&HlK{p*a183FsfQSS6myn1thW|Gr zapCPc`KM9s7R~EPm3VZa%-45fyFq7z`oHjG8fa$#ppFwUqi>hF$6+q5g)`iC`hjVL z(&m6BJym|6aQd+L??k04(whPjKxraUlp+X7lMX6fLPC+=1O!x6 zx}btcuM#=}Do7`Y5KuvC0vaAAiL-z2%zD>*=bm-voSAdieeas{rz;pX&$G*Se?Ki9 zJy(u>c)uNeaC;E0^)UMAN33UHQk(OZcQs~g289ec7-jK!!y%N)X*kXYIX5wg6Z?sza}JkY8Nd<0qK~~8!bb8v$X1-pO*1&>m#3{6 zagnF#Yu{+rI^7ylL`Q4`a2J7h7hy309ab5xa(U(y?S0@YJ4Thcrk}35wuZ-I?_U?Z zG~>sqdju34LCdJSg@-%Th^zg$BOwUkb-ZMK_pMipFPUHMU)U~62=hB$H_+-DI?#Hi zhyzLXranPO5*7N8M;bE)(RWCQJ}s=EPcqiU_l0;&`HP0Kc)3&2^~{!bMFxE*c7L$I z7P&DWbTntMK8x&_v1w@T4BP>ozO~OQ#QUTX4nmv8J{6mYy3A9}=f`x|Oe;x29MGe+r#BP!9)egu%jO zq<|VOqw^@~QqIc5vb@3Gu&lOu?_=NgcupwSHRs{l%qkyTTs<3&NW<29rgchFBEZCr zI}dFvBUrt47$2Pp?9lKOZfvUJ3RH^YJo(~R>OsY)%=-gMpWvm`3NWgimHi%vokTHS zsNw|$)!Eh>yX1op_S+s3cJEowHtZKpBMcr55u}G#6ANR1I&+}q$1!4`p@}O~YN8tY ziN^VTD)$uRrHttfx9ZGySlKCgXV(APaJ8s8D9=J(%=^!WrnH5ZN7L|&eGxl9lN)6M z8&5B=R|O<>USP_z`BEIJ_4?`u9+5m)T(5I|SjrCt{7bM{J!S@xoIXPGt+-OedRwd;)WC_|J_~vvq1mD6MAJ!w^9%ohmS#)#$0pD%d zLc^hV`jvubz2MQ*2dEo0T$oV}jAko|X<+9FMbBBl=XvG4(Tjn%s#DG?Ckx7Rf|?(| zPtP7s*8w^MGdHp)iSOls-P&Tq3rYYDmSU8}3{@N?eXS z+X66{>480XrpKtyw1rC`+g_Yt<6`1vZ_PTNz=-m`UEzC`!{Q+(u4fpUWdn|KKB-89 z-7(MX;0{@FPPnt*J7Uhe{zz@0zVw>6(uU0_?V;bUz8gP_;ineqFouXV+e+A1+>OlG zwu((EG1U}utD+cAY2ON~7}2h&^5JZ_Z7gx`wSi%_eo|31UFgman8*ysSID^r8IdZM zZTqXwt=J|}E_;&>ZEY8g)!eJ@%x`m(;`xoxLCS<-h$<45qk~3U`f6lsO6+vAkT8N z=@gn>b!K`$@$$s7a{a0E-=y^lJzncIehY6Qf4#eN@#opGCaTV}D_N zN#mK}^WGB2f&_#1Zf~!`Rq2enarGP(1Qfosf3cx#Q2>4JxqW!4spA=KInBa4Jm*(* z_q)5w3imG@O}~&I_~6SII;!;^`e@Ox_22Xvr4nJ2A4aki{SE{XG1V`H^!xrZ2y5wE0HmNVLqihRH@m z3Z9d#9=ri#aTe7`4Y8iQvLW#YRvf?B^L0Kr9k+VVf1y^#GDBB5;y~VM z0W-sgYCH$iYvoeVe&t;_R@RVDkEK^wS2LM zP9a%KAa|Phl!x69hOtYp_OKPF`Do_&i-mi$RO;gAZ_%gC=S<1Bo#!G6xr4oTp}4B> zZqKRh5X-3NuLe`!+szCce44k2h-};4UIo4gDmqBo@b@JYK5zin5Dk?spEc`nbM?a! 
zkga}rdyf+T?o`q4i>D3n?eN#%qPfYj)ZC-cPm(mDq7T%%%P2NnZ?)-xo?pW^>cakx zYYzRkm;-I|Ia=KsT=GKzq0O_~3+rVb_w*=agh~xjV3%sUcVe=N@w@BpHPcPN$%eQucqU}GOMsaJ+;hv~1oWUTtnMVvmRPgK%n!FsTt!TAC@IWk#C zLc_QG59~%JM?TU8xkfX(wmLGnquqh6{B){Ogpk#YFjBP=G+x zmjK#bv7&o-#u%kHSBX{=HX*EuoSagHN4=fyfd#5MHo;_f{jE>51#DT_-m zkw>rb>#wXj^FEBl@b1MeVtL{`mNT0akpJizCPF+X8jl3!XsD#onOGlIn8Tm8!n z=9X)^zyljC4+Xga(@a3nYi$hG2PqE}gw?ZM|Ls-*1JW%GY7WkQ zkpnaIRD^^`L~E@C9)Ys7&jD{Y?-BW$Kq5Qq1J9%$& zh2Ot-yDawb>29XjvWy>eg$HLT{Nq^9!Z?~Jbhdb-EwA2#u=dS(CxnsF`(loX5+U50 zQU670N%WaB67Y2D6O=oah4w*A6-r-mMnI!|ccw=B{0-DtTS82-vS(kuTauq*m{)1} z{Ao#MC0qCG=l??KEnKa`gfK;d5es~O-<%PebY@pvFe2CcUA=GiWfhy>DiAQBT;>_4 zD|F?bBhR4Q8^(O3{pJ1OKZm;E%<5(DxSANOaF{)^51;n*dRtw4DD|E3Ilp6h4zAYu z62^U;4PW4EKL3T?>c2xI_uv0V;L8egmJTbKTcGhm&T3`!7w@@kL|o~;gmIszW84}e zkDPynD4z&br6X_slYkTSAK1Yeuu4GXPO@E5=wB2KPrNkUI0~4{rtN3^c?aiIE6+te zU1*Lzr|`fcUV=A84h#QM^i1qw9%Sp23fa{j@O zgVS5nk%=(v4@Y1h^eazAXc?OG(7pUS)kOcC&EsF+@^A6Jf8bwqU;e;=kfMea%{omJ z3lDSnsAlG131uzYX=^-l@a|DLuKj@V2A6WC+|_0FxO>R6mGJ5RH-98{#w7zy!(INvBTeIa_f%f5 zoK~g#8S+os{%_f%xKq22HyCu`S71SAO1g};s0i2%OLLC6AB|Dy7nlBQ0XgCS-qy#=EiJ-|Cc8nLuH4H zj08OW+-q5w@l`-XF8Y|;D2P6cGhpr{ljB?K*M^2yqV>q)$D<|w^}_$nWBn)X#{cs> z+02EZNAulItTPmkJ9N!s!R^(TEG{v=J?<@xihTszMa%{+XZ97QJJ&1R?UbR|wAL3o z&Ak6oHQiLQfKJ*jPdIKN;$G!Yr>Nu74~zW!93B3j4sfI(ZQe|eH>Y@7{t^OYnu9v1r?fquWN@j*My4Zh>4eVcAu$bXTX@WdD6wo98hWLAG>zEy7^ z@@EmN>VNch=tTSb`0{oa94};@V^IZUYL*m66Y1B7qR*JeBY8Wx0T-E!;>3vgSS_yE zr@j~M*q!fS8#p1P@k@kJz}@9-5l1Juy&UACxU`V90e(kAjF1=$%7@VE} zhb@a6SFEi|p_Qh+7wQ3Uqz zr}uBj^$b2m*F(}3b?(lb=U&04Vq}5088b#6Gb|=S^#9QO?U&H&Y?}r>5p(f+!Q
mAu>1d~c1!p1@=t5HH3Nj}PS42i^y}ELW9Cjj|I81j`aX2eh&&CRz>bv_%sHGwt z|F+IaXorC7$((brB7MRzOiPak?5HjndLAvSGpso-SsxMk=!7COD)=`r5+K?F!}n6! zaf;c;4&fEt>|P&+d90E9ru@(O)>W@6vYte8%h|RQpVlgsd-Fx_STI##KH?h8V^BA+ zi813Ra-BnmwyR5!3c_xBAurI(Lw#0n)^PVn**J8sUVU`r(36pdw&S%&0|ej0qMi`j0eKNPF*6xJIZKh>+CPSj zV09K2Lfek?J83wDwrSQW4)vb+0(`t4=!jz$F%q3n=}WX2HJ`@pjJBNmAxc_*lya>> zJ#<1GFr$xsN?BBR@FB(SNPB`nbRgz~b2I%SkB;O5ni=XR4jd40ea7P6{tX8Y1(!Rm z&6Q#Ut=+K-@=WilQtUK<0{>O?X>0}x5rRy0X6o=D_L`6mdFAWKetgvsgCTzliIGkl z_)_F?ibVy{p-JO`Eyg&rl|m}S!0tkp@m;NEVrQ!Fv-d$lJf_LxijuikcHt&(qP5e; zm_q(_fmj15I7AC`{9>1oZ3b8hz_A9c zM4YWp8Iu26m$AHetvw#yUhrBwkJE6k>Vfqvzp(C2jUyZdETmK!?rRrR?30hb`&9aM#`r?v z<;JE+k%5|%N=dnouCwrdO)!Upha^wjq3~Q@4om6KAh`ir6j#R?AgKTBWAo@PjV)!4 zefT@(x@Sys%-gKxm2dZ7%Z=t4j?tk8|G@CJD~V|s9qG-|1F2WJzROyy9&bqe$$=|Z zOD2v~y2gl#!Ym~m9=y!nJ;^}NrF$Bv0IjGkDA#mTebV4Sjp*7UFGgfNY{=z#)T17) z$bu8dQy1#`g=KkdX6-uMpIaRPKBU{IW~M;|KSh~EtN&}y zczWAf;`%#S7v5+njf<`i_=MMD0e>?^vwVNEzBoMWTqD$8?h_XJyC!AZO098XvwG3J zu7f3A@YXRFQS00|DRGHyaJ=#ANMjaIwdyYkg-G-@sI(NA6o^$nmnwYjU{G<6C+lrj z`|&KEfDAX2i+&={u53F+M@Q9=+E+XibPl7D)13+%h2L%L{Y&2a24vROc;>LupS*eN z&77W}Xz3|Kx*|k2nlli*_LI~iiX>vd46zI(MxCPH@qNJfr=A zUmD;@d%y&P_iKYAj5dug-0~!elLH{*;A+?P$i5~brR_R+=;nV96VBdZB2cK`|$X#-p$USuF<{M z_c9C9dj$qzNGIAj8P}Llfk^xjIG1Qee3nE0E^X#(Gwp_ujHY9g<9Z!(JxIpo>zTLL zu5;@mPf>1=j3_+9WZjPd(oUr*LhlJXNi+cm$iFdN@%zm#abw!2n%P z-v{<{k^4LQGJ<^_&+@^=|H43|l8kn7wz}D30x*dyK)%LqA(j?Fgc~2`9Y71MAJbT; zOG)(2AAQ2qH~q^zQ)D_?1#M5>)vpmL6PR=Va0O-u|54FPI}Yf%sj8{|*q+}&VBnpf zB+>llb*pKTl(Tgfvkmf`E|MREFKomP6hTl#YuXwBF5$xqM4u(eop^+*b6)~hanFg0@1dIoykU)P zTJyXSiid?Cd?8A#dBHznT{H3O2jnM^c;L#Lpi514Gr=3b8lvpAJY?aY44)s(Rx~+g zf5z>!biyYlm<5v{3Dvzww8#`go6h)Y!%0TnSVRYtx1!m}buFIOj?VZjhoGvg;o)J^ z6Ivzouk`N$2LSG+{Xug)7IZtdY|CgPiw&d6!8HQ*;dR~HKg@2AkDnI3ncd(ileKMi zJx|@piZFa{9z$dtrSPbbi=m1G6~Ln6D<+bE37bvrsGSLUf1aW)~9d-!p&-bAyQj<#Th;lT54~a zu6+7cX){vgK~GP5OH8Bj{b)O`G8Wh>Dj*YIzr4Tsom5E3djjSUP3qC5df(ug&teDK z66-k+>2DU@9~dwGv=JjHIG{DgAQwiMq43J@_wGStg^W(liJuRad|NssU7pQ1_$al^ zCyOoQei?)ZDUOQW&;c_YH=9a+IF0izjJ*uu`s~d*#H7b+o{cxkR9_infA?I-M5MAT zJ0{zJo_l|a2a3R-|gVXM}K zE~P&EwDEhPkFUx83fE`z5cNazeMuLyxo7FwbH0Nhqtq1*3J*8gkGK>2(-aVw2WAGH zj!*iR)laAbot`}VLf;EZ&QJ3F`tRtz)C$tsk#vt>yH#nd;j2ry%{{e>K2h~3*~=vP zX7#2={$2BGW$bK9hLYw>S$8FepXz$hGt$El4^bWp_{F7skngdGR9j3D`cUkTT|o#8 z1^J4;DwOQbme~#z$i8yjr9zt;ed(|`2bprw8Kcy7EtvwJ)#~Le;c%z-H3FhGdZ`QWn6`@7`e`K0Jg30 zjo*Cv%TcVB#OY?Qe9h38zH+&f9CTNYWwB`9i@K7fi!5{|niq=s>cFTEXj*HTl|!cr z*|r-U4$dyP%zKL8)PhUatDn94#tS}%TFSv7UVp^i7#Zb1<*bKb`Q$GT3BRk;pdt8sqM0i`7xrr=2D1USNc#{z<*7{ktl>AFVz5{oRt2+YUDs@#3h?6bl`Sf z+uxvGScmIrDKdNUnZNdaCi$u2J1ke<7}NVgP7KIk#r14;1Q4GL+*+idsR>1`-dtYSV${EKP-t)R;Rs#kvIj6w7J@-cG!}0+-{%w$>Vx3< zwYyFG80n5!_p^dR8yDCRlVG}IEttM4^zI}&^LIoYgrP%;`_ULpTyNl!SH;+f)xqoX zPQSYro@XeQ&j?#CNrSKsKPc{i9y5XCMQcdk)j3AP+iHGGKUMpI9Ht{8EI9Vn;p=0O zI9Q0>QcBv?0DRBqU?LB^vZqp#b?HW%i4#qoEx}(smb! 
z;b<(=bkw}fWY|i79(^Qc`CGISYN>3@CkNB}lsmh-mR>V2S#rMv>7HvuGw`Z}K^_OH zAh`B8bc^60)`M*h+3HNimR|kjP3$H72u@fizAWO>@mc>lD`deGaiD_f@6?2jkw*HE zBHFh|ufyjrlcaLgpP)Ii{oV779x>BbefEXNFe){KY^>9*V(Zn)xUk(9F(_x{Z(hL_ z$%vAUX#?C3h9bmm@E~Gfi?yqLMP0}rtZt)Jj2;ki$NYhP<<~%wO&SBKN1sNkf_BM6 zwrXH^tSx)UX2+U|srUdr_&WvmMMo6_WHAj2edc(dJ+c`!zhaDXHllb=?%SK?C@bC; zMfta7L0rmh43?_5mImGd;e&xyAAArOfQrR67)68aGJ>QqBt(uebzKf;wWERo zfZM=dY+Z+yvsQauSY9>ZNZD!*^7RAy8CJh@G11@DbH&cwM8b+sY4E{*pzuxOtfP?3 z%;pYzPk?n(h6M_5`f$N~X&5;9Z|WY6rDKX^hhd_y-O@j)sGh=lWM3f9`u31pX?Vhp zMVrf%YpQv3m^fD%}5XsuD`q}g^s1=^BY5W?>A(N8Xfs_*+Ol%<5XWqc&tT(3b4WHpN%G1LPX z9*mm2E%(zLc(P_qb!Z(|PYHO%Wtf?py|jpY%-PM59rW}USiPWcr(yaBcE5E4NtUjm zW%cTcRUBNT^|m1g5dw1*C(!>>3rt+(QeDeC9a4R|)-*k}QG3e9)|T}?mv~$QAuhTI z7NKz$>y5y_1P)Kl1cf+MV|CucCZ`t|4}xOrSm}YH@t4X6G?@V1i|>Sucs!*)KxG;+F20cz}qn#)iX^Vsf{R1hxqe zt?0}FGMKWc0PQ& z_6rWf?b%-LRUy5Fb z`}uilIbJmfu5Ms6r0;-oR-z*Hp&VjCf2UL%iMOKnjVr5}v{g%1*^TD=pQP`<>dOH{ zvEi~OCD-6-Fgmg#Q95RNdz}j85v&EP%8nOR*U&?setP=1x<-NEhrJCN3o~)Qdy%12 zg~AWD5)%y?iTi14s3QWNjpLU73B_NB=s4VT8CSkF1l-!IDpChnjq_HlKZWxk3iAOt z1F3qQQs`Op54QcnChw?d?ibZ+@`Fj0ZXZ*O^(?o$w*Dx-N6*)j^%F3KId!Wf?}kp- zA@FN6Y0O)M$ekpx!Q;HBuCgW`se{e7zdbrUBC-wZAEr>T6Gt$rbQ{jZ3nEaeyGl@ zEWprG@#O8p^uS=#{eSJ${_P&pf7CDkzdoZ|a&B_o+*-!sBMIMPh!ch+wT!INn&cl; z(>~MN5(|;fdftE26V$!=2c{VE7S`aSTq+fPc3r@wDM2eTJX@y3A}OipWTgO%Zj4SB z@&9u~OecWafx59MLYqV}2M5wYaO<%K>bXlK9-)ppU+(ujQanwqNiQChWHoO;;b4i0 zj#xX)WPV^l;UO6fjLsE0LrMh4a~B$>8%?9sWdhWg-A%21BikbigN|!O+wj2;M~I*P z63u~IGLKRFtDs>-^Ab0lTkBgUJ2Z;%-py8X?W%}VmhoQV&l%> zXkuQdZ_|m8g2)ixiqux*+V#9a^a*LGeT=CLGY!s$OpG?g0FndE9%f%n_VS9}CjpM8 z)@BzXO7p^>MxIcDWk|ElHs&PAD36R@gGFiRBBe1C&b5I_oyXB8uc<|yFp9ny9C~s+ zt6%b6ib47BlKVFIOX`_s&->?QIr3hqxdZzm1p$zS;UdZPS==ii48{@`&SvHZReIZW zEog?h_ZJ=ACzR@}l*1yWx3EM6ph6=ye<}22GH=*lwEvW^&Ry`{{;F%NxnxYJn~-r= z6EocVua^8(D3CC+$VAa1;_il=xSqFn{-a;;Mq*5IdRYQ}dI}#bSMriF-JpxEm>^Mw z>_wUZt(?T=#ew&cHe&&*D6z-io5(ruW^Q4ApmuUgt~fCf+kx-!R{z1-@KNBPqQul& zbI9kBTtYctv;T9!5byF7IM1!&Dh&gX^mb)7bj#A7u2|MG_AX#QBoYIh<2q4|rtj4c zj1n()?3$lkOX0J7n8=awVy*9{^(8F}rnjBEKu8j}IJZ3=$bg1J8w5GGZl!ly&+y?1 z+^cw5u094YiNng5^i3MF9j;4^F?IgclL5hVJ}BVc58@yH$DQQ2bINw3T~W%Q*?94s zcg0=LQzJ;n2v*jzjgk~b`#cy=l*l@NXY3OTxNfS$J_eq`m037OG zoQours0Hk!y^D8o+?`^J5Z|h`M(xN81DTd&VXv&a7*m0{Tt*A6SB8enQa~-5iK=57 zw8MFUST%3loWrtCdQo_eO=HH4-Rr5bUzZuu#r+e9vqVK*Q*SFjXN&Ls5?v&c*sm~~ zPj01_QIsN4gQMv*!5o{3>R!n0+_+n|=t<$GkVUnLBsllih$O{~u9jXH%WI}F9TY^j zg5<^YkI_Ed!BmtQEgMrL%hsgUy~}~$R-@@H@j%Xp>fEf?e?@$8=jak0+}t3pIzR@* zbKN+0ZIckUpyyEJMN`@5RgRyTZkk96PT6HdR_aXBO@(h50Q&V(;SngT_xEO-)bkhL zZ9{D|9~;EYT{C5s2zoUxEb~s_dR#6r>w-S?5&|i_wmnDJe%lUWVNgO3YN0#i`T1%^ zr+1svkt5iS(Vu`MP2r{{(vDzmh5NU0ldIqfnStSY5cInaxTm{`0%q zGUMV8o3%w8y(j^nX?HFi=v*z53w3-9rIagM$xm z{z`rV%}7i?s?IZ22ZrV+VS8#)@ELJ0GI`c*xlO3)w^Xx)uT~$A$$Dg0dMsmbt?YV6 zBn%-5MI1=c7Jyy1-|vc8oabVi6H`0U;OCe8(C0OMG(Qf4Ue-N_x0b{ghmG7N0`Sbo zqtyVmdWdoz)ahQx#J`I5G@8O}IIb@=Y|j}QSSAi}Y@QcyP8m>5dTwpiwWE~AfG+}Z z0s9nQ^IjVrRVWKsJ7`^b8t&J)5M(tWFVTGTLYBEszHl8MeE?muAibxE5bXPQCj++o z!Vd(ZFrIdVTB0o&MQBKxo!iL&IK)xSUL<@)D>ckT>NA`vjH&dmNV=QUER=^o6WNZ) zJ{`yS3cr{XG9UkDM(swXXW_|Ld@_TUTpr|YiR|q`;ihOrD9C#|g}Z?jb-*m}gFj(@ z{Wml5&2+44Lj(7FGD)#HL;L=2$D3CDpl_Z6%hAffqfV3h#;@~~FGdAoCdKt+L#DQh zz=)7U@181cES}!y-UHdVhiSQAazA4gQN&^13(|E^z)KRlymPnn1Z@V(O*s=)0SRq| zunYHo7K!gDJ?)PDtYR3W7hWPqd;Qw1)lye1{y-P=*OIRWVk$AHm$2oa*Zi>)>6rZU zw1VeD`Dd1zCP~)E-QD8FzlD5@*2()TwayhaS32li@07R_r82156o*wv4%vHQFDeyL zekJk7CzEgeV!~$=jNZVa^Ez2+*H9@hB%hNY9N3y?tDbgK|$;?YCV$dBF z$u^O=hoBp$aRR85%@oEh}%zzKb&9++&ZGV_@Un@qTgn>PeYmZBSOrHMZwaH$DTs;KPBz-PgZ3xpko zdeq8JLC^%`uk^_*UB`!vuWPDTX6>jqpDe#Qa?f7f=2)~iWOo}B{@zo#8->sbWFO+N 
zM)#r^ZR{yWY7bZ6shSYhkT6_FgMhGL-ke0vQTFaK{iS=GU5}%q;)p6sxVtD%&}qD5 z-ix3w8|XE&iIA5*`abODYH*IRE<8wS?KJFU2H-2?Q5+6LFdwnZItt;b%Qc(6BrR~# zq206`eZ|D4$Ec4le7LsBc!D!O@{AMbn36@m2@l;HqAXbv$}1GICEugI#E8#A69dzn zFTAE7-Oh4ziT7ZS6LI^>7FuHXLZm+X-EEjoUl9kqZxIfHs@j(q@9L;;gz{TK64pWp>0{_mz(`$(?Fr@K7v&xb2`0a#~f(I_s61tmp;7EtkU@XCf?9UVR)I z5)k!*Ygm9SKhl&K-`pp9Zn@%aF|mK6!KSTy+W-BN-j_V?Dc27qt`86EXX{yM3Bl%w zf$5ZEB!pgKzYYUQ{z;8DKihZSt@xhztaVSS;3ve7#vb1Eg5RG)c_Zfb&d&(<+7ANS zz@a&xq;DtWJV^{MOoZcNGgv9X{c&P ztv$Rv=8Xi~A6UE2dm!ZVQ)6Yf8D=*(eG|L;VO{MJYib0a@mE7Z8yl;0htAO7(iNbo z0BV~;qULR%#Ut9j#ZbVo;R%u7O~t$=#=X#$x4pN>f~s`3T{vEhAjQ1-(FS3(4}l1O zq~(wEq76?Lw<-p#NnCNYcq{_kR#ml)R-753GYvmTVX486?2W#;b+3hTDK-P!gw9Q+ zh&1iNv$?m;%V9xIa=?L;0Ycb@c|I1BKNj9bgaX*qt#qSJ zIw`my3adMPDnw3s8>RV~{DB}9>-7iLk$vps$To)z{A;xl`76;Mw-v(Q!vMuC6_~^9 zpit9vbBFdxmkiX_^)E?WU&}bzpTFk)vi<2aczWM6nZsAQolxTijzsBH5GCmST*aaB zTUMn(1T0tgRD$ut{F`iLX<6M)gc8B&SOeQfQqes?NBpa09CDG4tY&BZ4eo3G05|`E`Z`b_kmb~ON1M%=D{lQ^5j%>WVgxLNH|Hv6YYB4{K(oS5kPXb?=4Fv*^>{ z4wy%Ms%=u;Pk0YsNsI|$zeMP>aB$clzuIeEs|DTZ4!5;dRk}5yrxnlE+WbG)SFQ6+ z3q{@Ky0+kWZMgPaAj=8QY$I1*-YZ;o_h9soU|&K4Gmwm`3hZ;?gLY8;(t#Q4Wm{i% zL&tzw2Pb;}>Muh$zn1HIv*y|sj1{q+5@;PL$3?1wxIZX_NcML*=8=D4@_3uwgCDlL zvs>C?90(q|k7G;&D58z|FEuaRrq={WYrgh_?kB2uXsHqMAQTilEKLw-kd@9h5ja_r z8Z40=4&(SiH-y@c~ zvmm6oL-*!I1svBN_1zf59~c~c3A#m6|KNX)Fq5f@;C7f-G+8v6bZt9QE66IH*H)Y#HExJdCO;xc}s;_gVb5;q8$z0WD>ZuV{2rtGFx?WT9{&X+2T z3Uc%+9ixY#XN2zt)9oWR3tsv>+S&X_oUohzDO;^i+v^Rze9h7E;`n>j1f{$%*Q?1t zr9r6jB88{4&q>YEc`3g)5Dx7)5F1z1DV9=&lG8bnavkxk4(D>dw&k+~=;1G^qyQp{ z=oBfCmf5_VNZkkU#?XDbpJGqP&H7XZD2NnD*JA$|lic|JZ>xx_sL6Z=88 z7W|)Fi~YMI^*{ahp|28+Xi`|>6$D<|5~=(h!#}3_GtgXldR!l+hRTISRs8$P`5zt> z-6&d*$aEKT0@ARCqE;3}eA*bB5pz46@5c3|GH?0?rsvM&GM(%q>GF3d5#I23cM6ZA z#R(&l_2a=>0;(IV9ui6Zgft_eLlLK~;ZMVh*fsw#gq_ih_RM_We76V@hg!-+?&?#( zwam>I`(5-6ECSJ+8I5<|Vm-i6yos6VYV1a|BX3mf+NaghPvUot#iyh)Ry`W5HpNe} z)v4bWxE?QlOBGmQG_P*@6G3d(=C4Xnb677y6v-M$XS>I1b>fwzv(A}-k~rT4w>{Yl zuS8s8F13ILejM-s=Uv7zg4LsQO6TVzw^qLs{QJpl<~NJj0?tWf$E11a zYFjWBV&49Ny;kT(aiWFCbYv(#M9OCkQecLk+svzgM&X0l*Zskrem=#bVM^&)4;O>@ z`sT+qS9KKR0D=&AWXgQc6E;_}v$VbY1$#uF*OY z0GExR4T%X-i)hG-K2%1%GYDVO5pHYws4YjBZqF9B1JJEk>ywU>s`>H)Sb*Tk`DR#* z?%6^zM3OHAEu!Jy<~^A)p;%&yLwKQe&ZQ*0;{}O^M%R6Q!!v@ZUl(GzpYj>PMXtkB z{tC#3Bm**u`%BS34&ywPrjtb}MvMj;6TP5J7U~D!N z(8TcU9BXe%P7uBG# zb(z#NzkIP7pBUF6w_Gs)tAVE@PuO=s-(jOH)q1!gFaqw#rDfC3o z&0bVAMw#{@FW-rA`rD-EgrtzcyO;Tj+1W;UlAxU_pui^J9`crH;xlLs#Rkyck)1gv zdd1|4K^vXNIsnZfxgu$`^YQ1AJ0lOJ)0+u8w=Yx*bO|?$WYYn>saxh}y3YhKspvD0 z$3HEX{8|&+VMQhzbZ+{}Twjl?dwAS)@MSG4}*1#3#B zM^Fn1NcMH1yS4Me*SFyW<6qX|`npH8v+ET%b!9MPIfvK(m3YhJrVBO`?80OnOeZ8oLHPx838k?AkHi}}T20BeDY}>DIlLl_&3K(m5 zw7RM<{TRA2as+VJxjyg95cQvZ2Fth=WlTyF88aI;EHBDV1fO*5EzRkf|4xZj`d(1 z+T>NZ&^F1sdVBQ^N#)u)3)FqeHD9sihrva$Njd=Q&{t1yfmg^)xd}NE3OR?WAQjyi zmlooWulwRXy9)-ZHE^omO^0T^{h$mlxGmWYK+`#xF)SH$@9|Cd>GsoM=*RLeG7*-Db5DX?l@eep9Uxnn;W){& zx8?{w98MmB!8*G`S<~8>Moo`Khs6!s%YRkwNjjd+=9d9LrsTTf~vST9jA{Pd?#WJTYEgHIogk zH}390J>ltnoKf#)x`4cCgsbuO`-j4>>-H>JFwNo`D!L0}u>urf7pB3~mMY3w!h&AN zdCk9uw|=kw4rIL!Ujk3V|6lti!xP1i?6nSUImRDO< zzVfxUw(a<&u&d{!2ze55u?UgqD1P8>x}kFh%qsVFkO0N-rTI?$V9y&L!EGu3@rz?~ z*Ur5ucjrtMwyAc17;|cl*z&z{4Va@`*W|1l7)1E@@~b=aYo2H-3)Ec^p2SuJA2K{AXFNlaGMcx_EIhl10G zso>N?P9raYdT(E^`P7)9^c#vQp>$zlJY$T3U4aRO#EUg}c=w&a1npx4GtQ9wMuUQj zlOfDoA$|E?pTi_u zwzPNeFncQXN@qs^uL&g5>!93fz{KXouzbhyW7NTdRBGwS`_`*pJE-YqNzWGak zRxF5)P+qji#~1)ZGp6j9>S@?r&czfh39J@r={&h)P4aX3#dVJP-jdf+7nDWlrGsXOIHh zMa`!!F%1@`{DE=maH91gY0o0ZPZaG*=}G!K)id|AKK+^=@7t^pUU~nT{=~uryal#Q z2_i+(SR9DrMlDd{%FZFQ76}{wqnrH1=B#*lHhwN5lwPwfS|xw(s8iW>A=nm8l@d0F 
z0VLDzv)X~A%iWyj0gp10$orwU)!pkKn+aXhAB2aFIk3S#m5Hz|M{D6`ImYoLWEAAh zUTKm3?7h<9{^xa1EF~fx7H+ilUU3}dkDLp$YnUg$Ob=x}p|qc7R_P#xI#D*bqRURi z4`IQJxkI{&yPU4kQ(GX~6|5Om1dJ6us9x|*T$*|)KC{PM`1^jb*_)X}H5m^!Wp%ZN zn7XryW-#=7u*H>3O247r&&FGX&W7h$G^bL>kLLgiIp9@pru<{|)PDmyqDnQm&26wd z?e+0#@;wQB;d3%MB8vBgWdCqc53>Y3C|*<tO{{-PW6LT@eAG)=SN_CQeM9Vg^ z%~p`ia}Mi5)tPf1b8OrvRsC1}cZZJKeOQ?9^jm@5qg(h}4e9@|tRBKm4$;2TuFw%Z zDV!@bW9Kx`-S7{jiZ>TAquj8+z?BT|M#BLR{sKPwtS5~HG%loUN&GYd@^@%`5kd{h zB(B&)V3(q=C2l7Z+W9)BPH77lZ}1W2!;G6o*s|l{*`JN`eKJhCc60$CYJOmi79&+` z?k{a2u8?T)rQvZs__IyP&%N~>JZup z;w33q5t&kd6mnku{#&fZB`D~bWBEI!PjWe0MsJv{y503}Vv{VG(y-lN#eNxA9X()Q zcmAzt!K4LdxC@9t3a?m7h+El0T;yNr`aO!@S&_wgJRYk3R6~-o1S2a%ld*Ng z)|qXrryQzlom^+&S#=F^{md!o$CoZ-moPJ6m&U~imJ0we>6iY%viAYycPugpCfB5-TI%O3VaAqZ7mc}obCw>y@!q)yNxbC9wcTqz zseP7s(a9%Y$^NQ+Z=j3z-iY{J1LGC8)l7Y7t_X0=qO7%#5g&5uT9aa^i882FZKasAP}{klfZwFi@L*{pU#@^6U` zoo6r~<`IJJ1wxXS+RK0)-pnA5+e=9M&ZYd9pXU~5zmxQ{B+EORHEKjoChrzmdsw{e zg2Pi_8@L;&UnUw`OrWgUp}zD6O!YCQ$3BY0+m!>8VJ2|L0Pcla#wF+Ph$a7Gjr*3WabtsOIEbB#rnGtYElXpZyj^mKx{c62b6w z()Ryg@7?2}47>i(kt8XTbB02)ce{u>y1V+zna=S|Y;fz9FB z5;hFv$W?{ED2evI$JDGg^5;^2Uz1|}&4KY0nWja#7)_gn&F3qwd^gzCLoJn`J44H2 z1az0F(q=1(Td&a5zHGGe(`;8#`sJB!32G3;owvze%gd%TT}ef`B%t)TDJra&@0ds< zF&w~YrzTJ!p^RpR>sO&XCsmpfK;O+eajP(rtM>Ee+5++De!Hkr(_Ry4xm&{F@vpW9 zCZNvxUPD!~3-x!x?u>NyD}6pf3znzQ4lA&Za-Nl2u`9`Ye9A-QXY_M??bqU@jjB3S zd5K{5h!KhqZ}Fd?c}rZywdgP$FEC;-XW^t#s%mwScy0{w^}vsrL*eQEp#?+yudw^B zINeU(>D7~o!sjus;5pe5%cM}pU2tB8Z(>y%KmB~jqn|v)A{<;kKei zTVvD>BLU}MgMqG`b`{d9^x;_YNOIb4x=M}NY$@9c$L-#_huWvC*Hw1hmlEH*P=A@@ z(?X@8fN3LY$rF}qVLbsxI5TP;HQm1us7qu3=R5Hi`w4k=8CuF!;G9y4!R8fL5c}n8 z4-*c-FVbg3OEP29o;n}0o3XTeQ*0v}xA)vm!4hk&*%nYX*u&DEWgG$goLHceE2%BO z5HpbO14&AFkb|yAa`ZR4)m*A1`ZBchZyr%H>kws8IxagUW<8m9Ro))5cCJ zb=fTT>7=dtQr-*nd0K74z*uG z~m#0!_59*zzxnef#X_%IU;N?HXf0#d?y$hy|YE5949g+7i1dzry}1dpnc( zf|GVqNsj&I^V<_pAbimcUH@HNGkKrv5-Wt+$~r;&`8gNPFeRR7^SG0HLZZ>x|8YMS zd;PkXxT2~kcK|Z_q7u zXVA{TJ7#KnAxQm0@{kpe%<7@~6dM_CjL);yO)-GCkt>V+ICt(B(G(DY`wzVNurtz* zSHbD=vqduX=4MEgdy~%AbRxonG|01R$q-HX&ooqu8+pQ-+%R! 
z_dWT0le-Ls&1JTPr{4?H`2bRy-2PEeaIll{D!3iN4y06qKsY=FNYAevu%u!M< z>q%)0R^JlwrAg-^>QGJMkEbX)21BSIeuQeG{U&SHfa*&14>--RRR7k0Ww|->+hp?9 zNHG!J#Kw#138wPE8+(D*2OO*XcqRDW8{kpO-yiMJRn6f6uA)-+%suh(mhF*N_Z+^Z zJ`uLw6@=RAv>{MkUzoKA4xm|99rP8Xq=W%Ku0T^hnW9YS(yNn=n;5`<|8>j%MtvsA z+ed`vP`M|!^I-GV2B!oqp+hC{!LOt@%RIE;Zbp9NnYWSpA^EP%?i&PwdMDilwS@XC zP}AOfuIur*EPXlgaI1ux~6e&I*Mo=+FjOr4)sRJh5;Klu?< z(Wsfe8IiSGu%8BP<9>DX# zUNq2EQisyAVF2X&Hl>tx7Eq76d7hndF=;IodlIf0s^n3@#j!)Tm7C}a(N5!fXAQaq zZHr(dsm{0t*+^ch6=P=5L~rZa6IMrNBoBZz)mdaKtU_y|%tw`-8Y6A;wNh5Gqu)2^ zZ?3TxJoMwS;byv^R*dc@&*f{MTeQ+Q1BM1L!TSVRIP7<$C1nOy1b&BAwC*Gf+?MaR zRPdV64VC?=#*F-;U%)YB!5zu6XGcRW=ooEQxe^>r8|59mV=!59o6WY8@F`X}hmfSvnrqF}d^lr!=_Ywpaj8qHkf>5D-aen*; z(j8--T)r-&Di4;;IeqI-BCT!e*>?-d3m0ESHAj@Zhx;4|?~bHzF2}%}RajoU)N5bv zil^p`tM@bN+H7p^Ei_Lk>S#$!T)n&cB=%c?8m;EZZ!WEyV6y{R@gwS^X%&x>7jC7a=S!#suU?1X54dfx&5=z^R{QH zMDQs=#SqPn@9!pTI#Q9&hpia?&@)#^`_*(lpEg;Qs@LEB+$+=Z$Ka6tkME59y0&i- zvI!z#w^aAK7D);=qwL}85fvWR0d`Ee49r{658gT?1Lnqse(h*U@Z7sCshYps%Ej|Z z#Oh;^0-lP!+|uViVol{@V&L8Lqs4*B-u|<^(N;tKYg#`GKgXy-s@E4UdAnGcZO=Io ze41M>Q2#xw$~?&0#wH>i@>N7szw3%mUif=YN4Fa{G7WAQG^wh-OAJ15Vjon$AyDsl zpk_-HmYX1nzeRfk>Uvc8Adjh+GdgV^jL%duJ}9^dmcj%!*Lk&^K;_Z}!79c^Nplx~ z7%*>TcYjM*!J?q8t(n@JZmQ%}HD`c@=kP!A+4P;KQA>?uY)zge=+4k-Q`am!247eq z@kYo*u3RX4y8TGQpje7~tjk7ea0FKzXqX4OrX&6>q5bbeZs1j+ti$n#Q*YSztK=0B zY(jKjzE8Y6KsAf| z9{silBqwXIVdZEzdt}fIcs{WOvD8zX*2iKP9aMqJpYKdQDhvxMz84c>f|aTly?6JiFz}83B+E!5)SY?Qota$&Jk8F;DWq zeG37V#;LuJ7~c*xT)yrt5eMqYB4gAdL=@uBZ09o&HsrBM`Q~8Iw zKD2Hg@6Qby8bunn`%G^g-*jbrNltX&rk`9NP|YYGA`imDQiP>mm{G_fj$1#{*wHn? zHZ^<3lYKZjUL_xQlihjeN4=OPh<}@nidI7#Sk|=KmizjVFgWoTZ5X6@)( zM=dC$Ftiy^$uoe262md@!F-@BY=N@q0(EP}>}}ZXSelc|I;x=n1yyGQ5(v*Val%e$ zqz=Hb@vbNUAzLHghnLh?1^}at2C=c-t-!G1Be9>8r`+J6b<}+cEMk<-`LEae*XsRC z_Wt)d8l02l0V0C~_N1AQoT>*QCENN)mHkT|r`uE#&y@tbhP?WAS#~q!s6?}`mh3~p z`kjI3mQ&nDow}y)-G-#vFD-f&nti&Sn}M)CemaJUI*^X;&Bnre%;y$8-8Kq@ZMe{E`_>bHo=soOrAb1Vk$8FRH)iZ#VxNu(d$IMg9 zzMO;CM2p%Ds_fpp^!OSlgGn}gcwq83HF6t{X40~HSjrBmmzZbo0cjjX0N(hMe1j$; zN}y@dZki9(l$OZ2O@qRgsAqAKcGtv{zbH4Ee4J)%e{rx`%(7VN(&rr)kL*j|ehL-v zpA|^{tK8`XGlVOV9(jI!wW2 zRAIE-qVZ>1VHSYIU3eRM4x4OIuY7cf>#T$9zVk81er?_X+Hv{)e=djY@92HO@}*C* zGB1JTtUGyt_|EqY=d!>***@o_5PN+XpmA1+6Y~txTEZ&QH!4bb_dYt~`4{CI!_->t zb{P<-Lj^SdJtB^$fJiP#a!xd%6WMn=uuBwA17Y$=JaOT=b}>6fV|dNX#Q@{+Hz&0H za}V4<@ht!M9?b#M$68iTfL!X{X^^b(if+L@vj-)_QH#Gp7s;cb@9XQ(UBW!|8{bV0 ziFX$Aim?h4zosd=No(|vWM6tDd0^*Q9N@+3sm2swsBcF-Hq_(pj}1ONIDAy@!f^G| zd;8FrjlSStt42)kRHjE`Z5o7EiRSahS%A;nMQR;&#De)4s`|+;B2SHg9C(%-_<Fo z2gEe0AYCc_O~5zozkggCQUZj1vV++Q)ri$Yt>^TAVrZYq9;7u*Wh@{Z$^}2-fg9*> zU>&tM&7Q)JZXF{}?Zezg!}h~PJ#0x4;C4Oe20t2>p!biV)g@#JL*L{zyd;eNoJM8x z;IF{-bVGYMHJxQof9V^E<2BE$zOvhm(fuyXwPbPg_PW&kiTaiN=rSjH?D^nYr5n;a z!n5}(5|bvioENSK*mIfg)xObVr}~5J3W>NQT(Hy_v;(#RC)Bn)_T>+UKKpy7DoOj^ zpD)tf*SoYcaz?j$9KFdmn=wHrJ}7`h1_U(r$E?FLvd$;Kvv)G@*b{; z3qN!Z@>`uX9q3$sd+z+j*CoUg?!~Hou71%I`EyTsO*5lE@QGQ(-SpLsoKeH-B7&@w z>|#X8SGuWQ9HI)ozHr2Ead)LgXpUG(X5MhbR9JExSz@;VUGB=mWr^03C%0;V;3lo};M&g4OhE z+ttL+=f31|PLMos@YutfANhk#7^K{naI8IZ(5iH~B^#=STAl<g2}cpF z!I)b|Wu&%2v=ne7jr(DHSQ@O=l%e~~Ys7^Qi_ndhHLWk8qKU^J^ZU^M`8M_+|FR}I z3MUMCkRRv^Nvm{WT^%&e6pP07Ki)~}3O;y9KsC`brfbHTWo@ik!?qRr;gDMD{-Q19 zn^WZ5xz6i^SnIB|c)AOflWxw4O=J*1!G^Th(M(cEL9gz?O&`yW85fkC{dp>O(Ns|X z^u&COV#D`*U*}%$@V6k}!OGE%-1@V2)JjB>in9}w{tu^dVIB47L>WK-Uo2IPtYdIC z(~Tux(B@5r7=ayyc8b&+9VZip(0mt;*POVJ_-SCNioagBCpXdSl;q@g+S2r=09(H=BEZ11VsTP` zJ%9J0f3j}+ua4)l`WiBZ@B_O|7yS+%WilSN&tGhhml}zxKD=eV#OO3nalKUw#i7r0 zN9(sdi>j)`(+)RE?fXOBwk!*)i@XmQr1e;L>K>uZR+TVFUX#PuZ1+SDc9sg`8=Zs1 z2CfBex{w%%S|a7?8D^xTzhgswi!U)MO@dhKXH4aJJzdlU={>E)hR#WaB^zsru(e1@ 
z-!X1_!Qb_7JlnFCLsSp-0{Uq8==&~W{1nAYQP{j6D7XIQ(}^{J@0JuHeGoPSOnXuF z1dcB2Q{8iNd1n5}#W6R7%|2y72}W16tS&Xo&w^ql8(+}QVB$-DFdgfO9V+gQMc#3~#;2LuQQ--4T8xUTC-hl<59Az2)@9{yGazZs$ z81OCPVAE`ndvv$_X&vd(s9m%24S{wEukm`Jjr)7!t><+>wiMI)+jrdruQ@{V(RZX1 z>el_H?3u|opmu40;YiDUX>Gbs*#6r0*)L-ZTjbsslpMRMQqc)z7+2WRkRy;{?^ zw4VFr%E-XbAzJy9(NyzTZRIlU=OsU6&-&hN_{iBiIxI$dQGJtYe`-g!rKvh0YrL;M z>cZAkj5UWPTI3^qum=BpF6@8t$69RJ;pO{Tm*Hp<%beCzJHO+|C~xfM z!RhYqM8iv5??{jLpdvo-`8CC#PiPU3qytp&e0&la}&34 zPf(Sal!HZ7z7I>dg*QTV#$G+S_ ze{)~IyN_Z4lh1%Vh5SGTf9VbWPTQ}MkFH5C!m7KE4B#X0ZyXp4?t6dohM_grk)B5# zCkEF_r6;$+FM>x;eoeDG{$OnHuvJ$qQmwG?cTGr7Hj8U5!Tl&<|nbjQS%SG5=_BDB3mXo5UYa-0xl9w+B%ATPkxzHe|va! zM>nY4WZ$mCv&?m&%(Fi<$4o@L<+srRGGsz}=sHcoRE zIiZV#hsR@Iklk^shhrmfjQxW^Qwh&vEQl?_F@n=Bb zuse4h<){iqDI#5%*53}2LdS=fFl#U7HNGBWw!+bF}0o5f6t0z0yGdPt7M*r&V zmOl`aCZ?|xF%KKXu_HxrYr=(umP+!1Le{KGGx2;9;FwY&{tU>%o%NHi8I}M>JhG(1 zls5#me!twuHHQ%+bC%h%?zaK<@Xy#|r25#DKTp*aWpskqSAymaUIC$0^#GrZ;O^^! zLXEQ?0nQ)RlQj%34`YEpLYG0z*HK^|9|O?Hqf9(b3TpU>9m73}Rn$Z_EVIR?&##It z?)nRU#2rHD8D7{8z*yz@G>^o~tW9I$y$m#qM zYqsAkc1^C_AJpQGo7=TP(C(Fd6v*HoX0D_LQa_ za(qb-ha`OeQOcgd^cc?0-zP4Zl%dyAmxrDJSS7FL7k7_T0E%#rP={GC*^iym3t)AV zrN^LEz&tmQy{wNrypHl+l^IeTM<4hT@Lwd}$1OllQ`m_`V3<$t3_c(a@i=xF2rW-9 z#I8W8`U9j7dWJs%1AhwZDK3gUI|l5c_9+F>G#Uw1Tz-nJj(v%}PhUsP zpNDEynp0*CMIL!dUO0va%j#4IfR>jY*$aN_sM?flZ0W-MiZ+fCJ09!8ehpUS z2O=ln5my!HcQ(Dgl=HhVU}`l#!#=euJl z4ClrjtIm~?hj_%!mhmUXp^yGO062fR#f3CfV>((7R-`Xde5gDW=DZ+lZ_h+m0acsU zYj3vaQFrat&7|V%?_ROqXYF(X?U()wl9>&1rG<7$`R2ywu&#?!df&Hi$+P<>(BHc=Z?|Y^3URc;@^y~VfDlGXre0t0i~AP|`+}IYy1NmhhmMZz ztCig3sIz?=>LSY;9;RiC+zn6L1!s>1V5=DkMZFtqU!~oZI%ZKPZhA2O`8%&+A&xP9 zF>pI15*i5dJx;AoaO~lyi(hv~&RTZtSI)hV_5d_4R|f>Jr_z0DQ2|<8TYnr)`9FyS z_x>Rg1Pw60Fz^1(A?DIGy*)B43jZ=)o|QXa`k_PVYN`J%f0LPV+Yi4|SNML{^@F3b5F(~p(7L>Ot|Ir zipK5D$~V+cX}jCmJX?}>-`~49D_1%2Rt&Y~p8NkmG5g;Z2P5eEE+S_?0QXJ%0#IPO z=Q!3PfEe7Xr^6cHoOF$)%m_ByJE5yc|8GnAqbqk;hRI3M^^LaZpFu_`m=+24bl z${4j~M-@YBTz>V9X`(b0#H2n5s80@nyS}s{Z*0MiyM(W!B*};IH&oc81k$3;(}kFe ze+GKQzE2)c#|MClqay627w(HQZb1nE-@({;#s%z(ub<4!un^Qxm<{zjTt}@bXLoCZ zBr_-6ss@`IDOZ7(UY}s$9zx@umhITJ6Z)2F>nOjJrUkvCa{I-qzgR1L+>9F_fI@)* zvBu?)6ePq02ii`uBKNK}ExL4Ffqpejt`gmPI@2QP8o*Xw#eU&M8e74IQ~Gq3PXk;{ zmY@!lZLSJ#GN09qHbwf_#Gfalx!8r+@hA*)s|_@D3g=LXT~NnS;-@FD^H@1TSkIhs zCuTbySRIM$s7`2(2uy~|y*SZ&FvVE7XF}%dcWmGn=(yi`!f|}VUm`Lb6kOp7E4&;a zf_c@akh48~G!@pv350vKX89Jh5(d0C!=-&>2mQxz2cr}`IR_R78(PwWM3B-($*Y3h zOSrYOnzvO}Y*=KqyIpY5!jV6ALn6`zEqEd-paYaIcIoxLH9nEwR?%;6%2)Zt`@zD6 zEw`>QMb^j&{l!vNUV5_ zAb3qWfLW(fud&j|^eZzhzgVU0?)F@j5HpAJPC#r zEOV0)D|_ijHv4PJOKcQ832atRHa>pPIvt<>XCpM`K*ii1^Ran$$P#J72lwRX@O)u_ z`P?sH#3gugBO3#Vj$X@I)x^SjR+@Y@nkk|}eqQUZi$y?kK5@L|I}4ao33mLmX!ia+ zGwkh%$zK<}LCt6X(9)H?=_(yO979E`!g zYNK1YZ3IPd#aF)^grg`~k-6$r?P`nE6Q9$5X!1MM7Jmy5LydD=c5FN#v$K`k?ti#O z{@YH}Advw007 z)-7m6V?2LluRd5Ger0%fL*=oZb`Lg(iN!rF;CWY_gU_kPZlQuRKW25VH zjj_QJ^G59$TM?Ocr z%X!R?dN+1^>)VUybi6Xt(B2J~Kfv+^{b8lNXq|mL%}CN6=Vafb_B$?f*a$bBIQqz; zY~J#jWGiG?k9*g7hZ?1m2efImv(su>+_cDEbQN#gZ#y7MllwShxM^UBWPBU?BBLUDgr z1~>144$wn2s$6o4$XRxTZEMznDr1Z*@-CP8%3cKpUB!$YrhrMEe!H}n~J)@v71slp9(2e^-B##Uluhc~H{ zLiG1_Z;?yz%u?+iHksTK^i=b!sAtmVq0AtM$`6~l(+OSV!sDq-rRg;O?!wqDQdg^M zp0-c>oQ)d0aeWu)xZb}zuBq_W-EE)iZBcY4aEVq%=Oac-=o-MU?wzl5JV0~EN!a1t zmzg}kJSrt$Srv>Z|>Z71HhNk7t1IF2}U|2r*Jhx;;o)bgPzqnySrFewTL}6 zhYqP4PkgAjb1UXu%`T;fTOQuxpbPg@TwP8Yth2>`$6Pb_9qwRhqw9)wtQG zF;8e+*nGkHiCLWBj*A*npLb#cSjuqa7+ILL7xblLnZYveeI?Ge^;7G^PPGpWIaF>` z-02%(66D6?d|p!`W&U;rcZ&>TiT$F??2<;Lz9dpDt=h>_-L7gJaL-3p)xltskFmhO z;nUbp&KO(Nib@}jkp*UOW~6kOAND^{(&l|ie8NLLK>C+af|;q9kBnlVMqTg4ob#p% 
             Arg::new("init")
                 .long("init")
                 .takes_value(false)
-                .help("Initialize pageserver repo"),
+                .help("Initialize pageserver service: creates an initial config, tenant and timeline, if specified"),
         )
         .arg(
             Arg::new("workdir")
@@ -53,6 +59,13 @@ fn main() -> Result<()> {
                 .help("Create tenant during init")
                 .requires("init"),
         )
+        .arg(
+            Arg::new("initial-timeline-id")
+                .long("initial-timeline-id")
+                .takes_value(true)
+                .help("Use a specific timeline id during init and tenant creation")
+                .requires("create-tenant"),
+        )
         // See `settings.md` for more details on the extra configuration patameters pageserver can process
         .arg(
             Arg::new("config-override")
@@ -61,7 +74,7 @@ fn main() -> Result<()> {
                 .number_of_values(1)
                 .multiple_occurrences(true)
                 .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there).
-                Any option has to be a valid toml document, example: `-c \"foo='hey'\"` `-c \"foo={value=1}\"`"),
+                Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
         )
         .get_matches();
 
@@ -72,7 +85,16 @@ fn main() -> Result<()> {
     let cfg_file_path = workdir.join("pageserver.toml");
 
     let init = arg_matches.is_present("init");
-    let create_tenant = arg_matches.value_of("create-tenant");
+    let create_tenant = arg_matches
+        .value_of("create-tenant")
+        .map(ZTenantId::from_str)
+        .transpose()
+        .context("Failed to parse tenant id from the arguments")?;
+    let initial_timeline_id = arg_matches
+        .value_of("initial-timeline-id")
+        .map(ZTimelineId::from_str)
+        .transpose()
+        .context("Failed to parse timeline id from the arguments")?;
 
     // Set CWD to workdir for non-daemon modes
     env::set_current_dir(&workdir).with_context(|| {
@@ -115,7 +137,14 @@ fn main() -> Result<()> {
                     option_line
                 )
             })?;
+
             for (key, item) in doc.iter() {
+                if key == "id" {
+                    anyhow::ensure!(
+                        init,
+                        "node id can only be set during pageserver init and cannot be overridden"
+                    );
+                }
                 toml.insert(key, item.clone());
             }
         }
@@ -136,7 +165,8 @@ fn main() -> Result<()> {
 
     // Create repo and exit if init was requested
     if init {
-        branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
+        timelines::init_pageserver(conf, create_tenant, initial_timeline_id)
+            .context("Failed to init pageserver")?;
         // write the config file
         std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| {
             format!(
diff --git a/pageserver/src/branches.rs b/pageserver/src/branches.rs
deleted file mode 100644
index 8a411060de..0000000000
--- a/pageserver/src/branches.rs
+++ /dev/null
@@ -1,428 +0,0 @@
-//!
-//! Branch management code
-//!
-// TODO: move all paths construction to conf impl -// - -use anyhow::{bail, Context, Result}; -use postgres_ffi::ControlFileData; -use serde::{Deserialize, Serialize}; -use std::{ - fs, - path::Path, - process::{Command, Stdio}, - str::FromStr, - sync::Arc, -}; -use tracing::*; - -use zenith_utils::crashsafe_dir; -use zenith_utils::logging; -use zenith_utils::lsn::Lsn; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; - -use crate::walredo::WalRedoManager; -use crate::CheckpointConfig; -use crate::{config::PageServerConf, repository::Repository}; -use crate::{import_datadir, LOG_FILE_NAME}; -use crate::{repository::RepositoryTimeline, tenant_mgr}; - -#[derive(Serialize, Deserialize, Clone)] -pub struct BranchInfo { - pub name: String, - #[serde(with = "hex")] - pub timeline_id: ZTimelineId, - pub latest_valid_lsn: Lsn, - pub ancestor_id: Option, - pub ancestor_lsn: Option, - pub current_logical_size: usize, - pub current_logical_size_non_incremental: Option, -} - -impl BranchInfo { - pub fn from_path>( - path: T, - repo: &Arc, - include_non_incremental_logical_size: bool, - ) -> Result { - let path = path.as_ref(); - let name = path.file_name().unwrap().to_string_lossy().to_string(); - let timeline_id = std::fs::read_to_string(path) - .with_context(|| { - format!( - "Failed to read branch file contents at path '{}'", - path.display() - ) - })? - .parse::()?; - - let timeline = match repo.get_timeline(timeline_id)? { - RepositoryTimeline::Local(local_entry) => local_entry, - RepositoryTimeline::Remote { .. } => { - bail!("Timeline {} is remote, no branches to display", timeline_id) - } - }; - - // we use ancestor lsn zero if we don't have an ancestor, so turn this into an option based on timeline id - let (ancestor_id, ancestor_lsn) = match timeline.get_ancestor_timeline_id() { - Some(ancestor_id) => ( - Some(ancestor_id.to_string()), - Some(timeline.get_ancestor_lsn().to_string()), - ), - None => (None, None), - }; - - // non incremental size calculation can be heavy, so let it be optional - // needed for tests to check size calculation - let current_logical_size_non_incremental = include_non_incremental_logical_size - .then(|| { - timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) - }) - .transpose()?; - - Ok(BranchInfo { - name, - timeline_id, - latest_valid_lsn: timeline.get_last_record_lsn(), - ancestor_id, - ancestor_lsn, - current_logical_size: timeline.get_current_logical_size(), - current_logical_size_non_incremental, - }) - } -} - -#[derive(Debug, Clone, Copy)] -pub struct PointInTime { - pub timelineid: ZTimelineId, - pub lsn: Lsn, -} - -pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> { - // Initialize logger - // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages - let _log_file = logging::init(LOG_FILE_NAME, true)?; - - // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo - // process during repository initialization. - // - // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched - // initdb in the background, and it kept running even after the "zenith init" had exited. - // In tests, we started the page server immediately after that, so that initdb was still - // running in the background, and we failed to run initdb again in the same directory. 
This - // has been solved for the rapid init+start case now, but the general race condition remains - // if you restart the server quickly. The WAL redo manager doesn't use a separate thread - // anymore, but I think that could still happen. - let dummy_redo_mgr = Arc::new(crate::walredo::DummyRedoManager {}); - - if let Some(tenantid) = create_tenant { - let tenantid = ZTenantId::from_str(tenantid)?; - println!("initializing tenantid {}", tenantid); - create_repo(conf, tenantid, dummy_redo_mgr).context("failed to create repo")?; - } - crashsafe_dir::create_dir_all(conf.tenants_path())?; - - println!("pageserver init succeeded"); - Ok(()) -} - -pub fn create_repo( - conf: &'static PageServerConf, - tenantid: ZTenantId, - wal_redo_manager: Arc, -) -> Result> { - let repo_dir = conf.tenant_path(&tenantid); - if repo_dir.exists() { - bail!("repo for {} already exists", tenantid) - } - - // top-level dir may exist if we are creating it through CLI - crashsafe_dir::create_dir_all(&repo_dir) - .with_context(|| format!("could not create directory {}", repo_dir.display()))?; - - crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?; - crashsafe_dir::create_dir_all(conf.branches_path(&tenantid))?; - crashsafe_dir::create_dir_all(conf.tags_path(&tenantid))?; - - info!("created directory structure in {}", repo_dir.display()); - - // create a new timeline directory - let timeline_id = ZTimelineId::generate(); - let timelinedir = conf.timeline_path(&timeline_id, &tenantid); - - crashsafe_dir::create_dir(&timelinedir)?; - - let repo = Arc::new(crate::layered_repository::LayeredRepository::new( - conf, - wal_redo_manager, - tenantid, - conf.remote_storage_config.is_some(), - )); - - // Load data into pageserver - // TODO To implement zenith import we need to - // move data loading out of create_repo() - bootstrap_timeline(conf, tenantid, timeline_id, repo.as_ref())?; - - Ok(repo) -} - -// Returns checkpoint LSN from controlfile -fn get_lsn_from_controlfile(path: &Path) -> Result { - // Read control file to extract the LSN - let controlfile_path = path.join("global").join("pg_control"); - let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?; - let lsn = controlfile.checkPoint; - - Ok(Lsn(lsn)) -} - -// Create the cluster temporarily in 'initdbpath' directory inside the repository -// to get bootstrap data for timeline initialization. -// -fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { - info!("running initdb in {}... ", initdbpath.display()); - - let initdb_path = conf.pg_bin_dir().join("initdb"); - let initdb_output = Command::new(initdb_path) - .args(&["-D", initdbpath.to_str().unwrap()]) - .args(&["-U", &conf.superuser]) - .args(&["-E", "utf8"]) - .arg("--no-instructions") - // This is only used for a temporary installation that is deleted shortly after, - // so no need to fsync it - .arg("--no-sync") - .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) - .stdout(Stdio::null()) - .output() - .context("failed to execute initdb")?; - if !initdb_output.status.success() { - anyhow::bail!( - "initdb failed: '{}'", - String::from_utf8_lossy(&initdb_output.stderr) - ); - } - - Ok(()) -} - -// -// - run initdb to init temporary instance and get bootstrap data -// - after initialization complete, remove the temp dir. 
-// -fn bootstrap_timeline( - conf: &'static PageServerConf, - tenantid: ZTenantId, - tli: ZTimelineId, - repo: &dyn Repository, -) -> Result<()> { - let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered(); - - let initdb_path = conf.tenant_path(&tenantid).join("tmp"); - - // Init temporarily repo to get bootstrap data - run_initdb(conf, &initdb_path)?; - let pgdata_path = initdb_path; - - let lsn = get_lsn_from_controlfile(&pgdata_path)?.align(); - - // Import the contents of the data directory at the initial checkpoint - // LSN, and any WAL after that. - // Initdb lsn will be equal to last_record_lsn which will be set after import. - // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. - let timeline = repo.create_empty_timeline(tli, lsn)?; - import_datadir::import_timeline_from_postgres_datadir( - &pgdata_path, - timeline.writer().as_ref(), - lsn, - )?; - timeline.checkpoint(CheckpointConfig::Forced)?; - - println!( - "created initial timeline {} timeline.lsn {}", - tli, - timeline.get_last_record_lsn() - ); - - let data = tli.to_string(); - fs::write(conf.branch_path("main", &tenantid), data)?; - println!("created main branch"); - - // Remove temp dir. We don't need it anymore - fs::remove_dir_all(pgdata_path)?; - - Ok(()) -} - -pub(crate) fn get_branches( - conf: &PageServerConf, - tenantid: &ZTenantId, - include_non_incremental_logical_size: bool, -) -> Result> { - let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?; - - // Each branch has a corresponding record (text file) in the refs/branches - // with timeline_id. - let branches_dir = conf.branches_path(tenantid); - - std::fs::read_dir(&branches_dir) - .with_context(|| { - format!( - "Found no branches directory '{}' for tenant {}", - branches_dir.display(), - tenantid - ) - })? - .map(|dir_entry_res| { - let dir_entry = dir_entry_res.with_context(|| { - format!( - "Failed to list branches directory '{}' content for tenant {}", - branches_dir.display(), - tenantid - ) - })?; - BranchInfo::from_path( - dir_entry.path(), - &repo, - include_non_incremental_logical_size, - ) - }) - .collect() -} - -pub(crate) fn create_branch( - conf: &PageServerConf, - branchname: &str, - startpoint_str: &str, - tenantid: &ZTenantId, -) -> Result { - let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?; - - if conf.branch_path(branchname, tenantid).exists() { - anyhow::bail!("branch {} already exists", branchname); - } - - let mut startpoint = parse_point_in_time(conf, startpoint_str, tenantid)?; - let timeline = repo - .get_timeline(startpoint.timelineid)? - .local_timeline() - .context("Cannot branch off the timeline that's not present locally")?; - if startpoint.lsn == Lsn(0) { - // Find end of WAL on the old timeline - let end_of_wal = timeline.get_last_record_lsn(); - info!("branching at end of WAL: {}", end_of_wal); - startpoint.lsn = end_of_wal; - } else { - // Wait for the WAL to arrive and be processed on the parent branch up - // to the requested branch point. The repository code itself doesn't - // require it, but if we start to receive WAL on the new timeline, - // decoding the new WAL might need to look up previous pages, relation - // sizes etc. and that would get confused if the previous page versions - // are not in the repository yet. - timeline.wait_lsn(startpoint.lsn)?; - } - startpoint.lsn = startpoint.lsn.align(); - if timeline.get_ancestor_lsn() > startpoint.lsn { - // can we safely just branch from the ancestor instead? 
- anyhow::bail!( - "invalid startpoint {} for the branch {}: less than timeline ancestor lsn {:?}", - startpoint.lsn, - branchname, - timeline.get_ancestor_lsn() - ); - } - - let new_timeline_id = ZTimelineId::generate(); - - // Forward entire timeline creation routine to repository - // backend, so it can do all needed initialization - repo.branch_timeline(startpoint.timelineid, new_timeline_id, startpoint.lsn)?; - - // Remember the human-readable branch name for the new timeline. - // FIXME: there's a race condition, if you create a branch with the same - // name concurrently. - let data = new_timeline_id.to_string(); - fs::write(conf.branch_path(branchname, tenantid), data)?; - - Ok(BranchInfo { - name: branchname.to_string(), - timeline_id: new_timeline_id, - latest_valid_lsn: startpoint.lsn, - ancestor_id: Some(startpoint.timelineid.to_string()), - ancestor_lsn: Some(startpoint.lsn.to_string()), - current_logical_size: 0, - current_logical_size_non_incremental: Some(0), - }) -} - -// -// Parse user-given string that represents a point-in-time. -// -// We support multiple variants: -// -// Raw timeline id in hex, meaning the end of that timeline: -// bc62e7d612d0e6fe8f99a6dd2f281f9d -// -// A specific LSN on a timeline: -// bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8 -// -// Same, with a human-friendly branch name: -// main -// main@2/15D3DD8 -// -// Human-friendly tag name: -// mytag -// -// -fn parse_point_in_time( - conf: &PageServerConf, - s: &str, - tenantid: &ZTenantId, -) -> Result { - let mut strings = s.split('@'); - let name = strings.next().unwrap(); - - let lsn = strings - .next() - .map(Lsn::from_str) - .transpose() - .context("invalid LSN in point-in-time specification")?; - - // Check if it's a tag - if lsn.is_none() { - let tagpath = conf.tag_path(name, tenantid); - if tagpath.exists() { - let pointstr = fs::read_to_string(tagpath)?; - - return parse_point_in_time(conf, &pointstr, tenantid); - } - } - - // Check if it's a branch - // Check if it's branch @ LSN - let branchpath = conf.branch_path(name, tenantid); - if branchpath.exists() { - let pointstr = fs::read_to_string(branchpath)?; - - let mut result = parse_point_in_time(conf, &pointstr, tenantid)?; - - result.lsn = lsn.unwrap_or(Lsn(0)); - return Ok(result); - } - - // Check if it's a timelineid - // Check if it's timelineid @ LSN - if let Ok(timelineid) = ZTimelineId::from_str(name) { - let tlipath = conf.timeline_path(&timelineid, tenantid); - if tlipath.exists() { - return Ok(PointInTime { - timelineid, - lsn: lsn.unwrap_or(Lsn(0)), - }); - } - } - - bail!("could not parse point-in-time {}", s); -} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8b65e7e2e6..dc85c83c17 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -8,7 +8,7 @@ use anyhow::{bail, ensure, Context, Result}; use toml_edit; use toml_edit::{Document, Item}; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; use std::convert::TryInto; use std::env; @@ -78,6 +78,10 @@ pub mod defaults { #[derive(Debug, Clone, PartialEq, Eq)] pub struct PageServerConf { + // Identifier of that particular pageserver so e g safekeepers + // can safely distinguish different pageservers + pub id: ZNodeId, + /// Example (default): 127.0.0.1:64000 pub listen_pg_addr: String, /// Example (default): 127.0.0.1:9898 @@ -118,6 +122,206 @@ pub struct PageServerConf { pub remote_storage_config: Option, } +// use dedicated enum for 
builder to better indicate the intention +// and avoid possible confusion with nested options +pub enum BuilderValue { + Set(T), + NotSet, +} + +impl BuilderValue { + pub fn ok_or(self, err: E) -> Result { + match self { + Self::Set(v) => Ok(v), + Self::NotSet => Err(err), + } + } +} + +// needed to simplify config construction +struct PageServerConfigBuilder { + listen_pg_addr: BuilderValue, + + listen_http_addr: BuilderValue, + + checkpoint_distance: BuilderValue, + checkpoint_period: BuilderValue, + + gc_horizon: BuilderValue, + gc_period: BuilderValue, + + wait_lsn_timeout: BuilderValue, + wal_redo_timeout: BuilderValue, + + superuser: BuilderValue, + + page_cache_size: BuilderValue, + max_file_descriptors: BuilderValue, + + workdir: BuilderValue, + + pg_distrib_dir: BuilderValue, + + auth_type: BuilderValue, + + // + auth_validation_public_key_path: BuilderValue>, + remote_storage_config: BuilderValue>, + + id: BuilderValue, +} + +impl Default for PageServerConfigBuilder { + fn default() -> Self { + use self::BuilderValue::*; + use defaults::*; + Self { + listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()), + listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()), + checkpoint_distance: Set(DEFAULT_CHECKPOINT_DISTANCE), + checkpoint_period: Set(humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD) + .expect("cannot parse default checkpoint period")), + gc_horizon: Set(DEFAULT_GC_HORIZON), + gc_period: Set(humantime::parse_duration(DEFAULT_GC_PERIOD) + .expect("cannot parse default gc period")), + wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) + .expect("cannot parse default wait lsn timeout")), + wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) + .expect("cannot parse default wal redo timeout")), + superuser: Set(DEFAULT_SUPERUSER.to_string()), + page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE), + max_file_descriptors: Set(DEFAULT_MAX_FILE_DESCRIPTORS), + workdir: Set(PathBuf::new()), + pg_distrib_dir: Set(env::current_dir() + .expect("cannot access current directory") + .join("tmp_install")), + auth_type: Set(AuthType::Trust), + auth_validation_public_key_path: Set(None), + remote_storage_config: Set(None), + id: NotSet, + } + } +} + +impl PageServerConfigBuilder { + pub fn listen_pg_addr(&mut self, listen_pg_addr: String) { + self.listen_pg_addr = BuilderValue::Set(listen_pg_addr) + } + + pub fn listen_http_addr(&mut self, listen_http_addr: String) { + self.listen_http_addr = BuilderValue::Set(listen_http_addr) + } + + pub fn checkpoint_distance(&mut self, checkpoint_distance: u64) { + self.checkpoint_distance = BuilderValue::Set(checkpoint_distance) + } + + pub fn checkpoint_period(&mut self, checkpoint_period: Duration) { + self.checkpoint_period = BuilderValue::Set(checkpoint_period) + } + + pub fn gc_horizon(&mut self, gc_horizon: u64) { + self.gc_horizon = BuilderValue::Set(gc_horizon) + } + + pub fn gc_period(&mut self, gc_period: Duration) { + self.gc_period = BuilderValue::Set(gc_period) + } + + pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) { + self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout) + } + + pub fn wal_redo_timeout(&mut self, wal_redo_timeout: Duration) { + self.wal_redo_timeout = BuilderValue::Set(wal_redo_timeout) + } + + pub fn superuser(&mut self, superuser: String) { + self.superuser = BuilderValue::Set(superuser) + } + + pub fn page_cache_size(&mut self, page_cache_size: usize) { + self.page_cache_size = BuilderValue::Set(page_cache_size) + } + + pub fn 
max_file_descriptors(&mut self, max_file_descriptors: usize) { + self.max_file_descriptors = BuilderValue::Set(max_file_descriptors) + } + + pub fn workdir(&mut self, workdir: PathBuf) { + self.workdir = BuilderValue::Set(workdir) + } + + pub fn pg_distrib_dir(&mut self, pg_distrib_dir: PathBuf) { + self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir) + } + + pub fn auth_type(&mut self, auth_type: AuthType) { + self.auth_type = BuilderValue::Set(auth_type) + } + + pub fn auth_validation_public_key_path( + &mut self, + auth_validation_public_key_path: Option, + ) { + self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path) + } + + pub fn remote_storage_config(&mut self, remote_storage_config: Option) { + self.remote_storage_config = BuilderValue::Set(remote_storage_config) + } + + pub fn id(&mut self, node_id: ZNodeId) { + self.id = BuilderValue::Set(node_id) + } + + pub fn build(self) -> Result { + Ok(PageServerConf { + listen_pg_addr: self + .listen_pg_addr + .ok_or(anyhow::anyhow!("missing listen_pg_addr"))?, + listen_http_addr: self + .listen_http_addr + .ok_or(anyhow::anyhow!("missing listen_http_addr"))?, + checkpoint_distance: self + .checkpoint_distance + .ok_or(anyhow::anyhow!("missing checkpoint_distance"))?, + checkpoint_period: self + .checkpoint_period + .ok_or(anyhow::anyhow!("missing checkpoint_period"))?, + gc_horizon: self + .gc_horizon + .ok_or(anyhow::anyhow!("missing gc_horizon"))?, + gc_period: self.gc_period.ok_or(anyhow::anyhow!("missing gc_period"))?, + wait_lsn_timeout: self + .wait_lsn_timeout + .ok_or(anyhow::anyhow!("missing wait_lsn_timeout"))?, + wal_redo_timeout: self + .wal_redo_timeout + .ok_or(anyhow::anyhow!("missing wal_redo_timeout"))?, + superuser: self.superuser.ok_or(anyhow::anyhow!("missing superuser"))?, + page_cache_size: self + .page_cache_size + .ok_or(anyhow::anyhow!("missing page_cache_size"))?, + max_file_descriptors: self + .max_file_descriptors + .ok_or(anyhow::anyhow!("missing max_file_descriptors"))?, + workdir: self.workdir.ok_or(anyhow::anyhow!("missing workdir"))?, + pg_distrib_dir: self + .pg_distrib_dir + .ok_or(anyhow::anyhow!("missing pg_distrib_dir"))?, + auth_type: self.auth_type.ok_or(anyhow::anyhow!("missing auth_type"))?, + auth_validation_public_key_path: self + .auth_validation_public_key_path + .ok_or(anyhow::anyhow!("missing auth_validation_public_key_path"))?, + remote_storage_config: self + .remote_storage_config + .ok_or(anyhow::anyhow!("missing remote_storage_config"))?, + id: self.id.ok_or(anyhow::anyhow!("missing id"))?, + }) + } +} + /// External backup storage configuration, enough for creating a client for that storage. 
#[derive(Debug, Clone, PartialEq, Eq)] pub struct RemoteStorageConfig { @@ -188,22 +392,6 @@ impl PageServerConf { self.tenants_path().join(tenantid.to_string()) } - pub fn tags_path(&self, tenantid: &ZTenantId) -> PathBuf { - self.tenant_path(tenantid).join("refs").join("tags") - } - - pub fn tag_path(&self, tag_name: &str, tenantid: &ZTenantId) -> PathBuf { - self.tags_path(tenantid).join(tag_name) - } - - pub fn branches_path(&self, tenantid: &ZTenantId) -> PathBuf { - self.tenant_path(tenantid).join("refs").join("branches") - } - - pub fn branch_path(&self, branch_name: &str, tenantid: &ZTenantId) -> PathBuf { - self.branches_path(tenantid).join(branch_name) - } - pub fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf { self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME) } @@ -212,10 +400,6 @@ impl PageServerConf { self.timelines_path(tenantid).join(timelineid.to_string()) } - pub fn ancestor_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf { - self.timeline_path(timelineid, tenantid).join("ancestor") - } - // // Postgres distribution paths // @@ -233,61 +417,41 @@ impl PageServerConf { /// /// This leaves any options not present in the file in the built-in defaults. pub fn parse_and_validate(toml: &Document, workdir: &Path) -> Result { - use defaults::*; - - let mut conf = PageServerConf { - workdir: workdir.to_path_buf(), - - listen_pg_addr: DEFAULT_PG_LISTEN_ADDR.to_string(), - listen_http_addr: DEFAULT_HTTP_LISTEN_ADDR.to_string(), - checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_period: humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD)?, - gc_horizon: DEFAULT_GC_HORIZON, - gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)?, - wait_lsn_timeout: humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)?, - wal_redo_timeout: humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)?, - page_cache_size: DEFAULT_PAGE_CACHE_SIZE, - max_file_descriptors: DEFAULT_MAX_FILE_DESCRIPTORS, - - pg_distrib_dir: PathBuf::new(), - auth_validation_public_key_path: None, - auth_type: AuthType::Trust, - - remote_storage_config: None, - - superuser: DEFAULT_SUPERUSER.to_string(), - }; + let mut builder = PageServerConfigBuilder::default(); + builder.workdir(workdir.to_owned()); for (key, item) in toml.iter() { match key { - "listen_pg_addr" => conf.listen_pg_addr = parse_toml_string(key, item)?, - "listen_http_addr" => conf.listen_http_addr = parse_toml_string(key, item)?, - "checkpoint_distance" => conf.checkpoint_distance = parse_toml_u64(key, item)?, - "checkpoint_period" => conf.checkpoint_period = parse_toml_duration(key, item)?, - "gc_horizon" => conf.gc_horizon = parse_toml_u64(key, item)?, - "gc_period" => conf.gc_period = parse_toml_duration(key, item)?, - "wait_lsn_timeout" => conf.wait_lsn_timeout = parse_toml_duration(key, item)?, - "wal_redo_timeout" => conf.wal_redo_timeout = parse_toml_duration(key, item)?, - "initial_superuser_name" => conf.superuser = parse_toml_string(key, item)?, - "page_cache_size" => conf.page_cache_size = parse_toml_u64(key, item)? 
as usize, + "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?), + "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?), + "checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?), + "checkpoint_period" => builder.checkpoint_period(parse_toml_duration(key, item)?), + "gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?), + "gc_period" => builder.gc_period(parse_toml_duration(key, item)?), + "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?), + "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?), + "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?), + "page_cache_size" => builder.page_cache_size(parse_toml_u64(key, item)? as usize), "max_file_descriptors" => { - conf.max_file_descriptors = parse_toml_u64(key, item)? as usize + builder.max_file_descriptors(parse_toml_u64(key, item)? as usize) } "pg_distrib_dir" => { - conf.pg_distrib_dir = PathBuf::from(parse_toml_string(key, item)?) + builder.pg_distrib_dir(PathBuf::from(parse_toml_string(key, item)?)) } - "auth_validation_public_key_path" => { - conf.auth_validation_public_key_path = - Some(PathBuf::from(parse_toml_string(key, item)?)) - } - "auth_type" => conf.auth_type = parse_toml_auth_type(key, item)?, + "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some( + PathBuf::from(parse_toml_string(key, item)?), + )), + "auth_type" => builder.auth_type(parse_toml_auth_type(key, item)?), "remote_storage" => { - conf.remote_storage_config = Some(Self::parse_remote_storage_config(item)?) + builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?)) } + "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)), _ => bail!("unrecognized pageserver option '{}'", key), } } + let mut conf = builder.build().context("invalid config")?; + if conf.auth_type == AuthType::ZenithJWT { let auth_validation_public_key_path = conf .auth_validation_public_key_path @@ -301,9 +465,6 @@ impl PageServerConf { ); } - if conf.pg_distrib_dir == PathBuf::new() { - conf.pg_distrib_dir = env::current_dir()?.join("tmp_install") - }; if !conf.pg_distrib_dir.join("bin/postgres").exists() { bail!( "Can't find postgres binary at {}", @@ -398,6 +559,7 @@ impl PageServerConf { #[cfg(test)] pub fn dummy_conf(repo_dir: PathBuf) -> Self { PageServerConf { + id: ZNodeId(0), checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, checkpoint_period: Duration::from_secs(10), gc_horizon: defaults::DEFAULT_GC_HORIZON, @@ -482,15 +644,16 @@ max_file_descriptors = 333 # initial superuser role name to use when creating a new tenant initial_superuser_name = 'zzzz' +id = 10 - "#; +"#; #[test] fn parse_defaults() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; // we have to create dummy pathes to overcome the validation errors - let config_string = format!("pg_distrib_dir='{}'", pg_distrib_dir.display()); + let config_string = format!("pg_distrib_dir='{}'\nid=10", pg_distrib_dir.display()); let toml = config_string.parse()?; let parsed_config = @@ -501,6 +664,7 @@ initial_superuser_name = 'zzzz' assert_eq!( parsed_config, PageServerConf { + id: ZNodeId(10), listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, @@ -544,6 +708,7 @@ initial_superuser_name = 'zzzz' assert_eq!( parsed_config, 
PageServerConf { + id: ZNodeId(10), listen_pg_addr: "127.0.0.1:64000".to_string(), listen_http_addr: "127.0.0.1:9898".to_string(), checkpoint_distance: 111, diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 6ce377c535..9844e7ea82 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -1,17 +1,124 @@ +use crate::timelines::TimelineInfo; +use anyhow::{anyhow, bail, Context}; use serde::{Deserialize, Serialize}; - -use crate::ZTenantId; +use zenith_utils::{ + lsn::Lsn, + zid::{HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTimelineId}, +}; #[derive(Serialize, Deserialize)] -pub struct BranchCreateRequest { - #[serde(with = "hex")] - pub tenant_id: ZTenantId, - pub name: String, - pub start_point: String, +pub struct TimelineCreateRequest { + pub new_timeline_id: Option, + pub ancestor_timeline_id: Option, + pub ancestor_start_lsn: Option, } #[derive(Serialize, Deserialize)] pub struct TenantCreateRequest { - #[serde(with = "hex")] - pub tenant_id: ZTenantId, + pub new_tenant_id: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct TimelineInfoResponse { + pub kind: String, + #[serde(with = "hex")] + timeline_id: ZTimelineId, + #[serde(with = "hex")] + tenant_id: ZTenantId, + disk_consistent_lsn: String, + last_record_lsn: Option, + prev_record_lsn: Option, + ancestor_timeline_id: Option, + ancestor_lsn: Option, + current_logical_size: Option, + current_logical_size_non_incremental: Option, +} + +impl From for TimelineInfoResponse { + fn from(other: TimelineInfo) -> Self { + match other { + TimelineInfo::Local { + timeline_id, + tenant_id, + last_record_lsn, + prev_record_lsn, + ancestor_timeline_id, + ancestor_lsn, + disk_consistent_lsn, + current_logical_size, + current_logical_size_non_incremental, + } => TimelineInfoResponse { + kind: "Local".to_owned(), + timeline_id, + tenant_id, + disk_consistent_lsn: disk_consistent_lsn.to_string(), + last_record_lsn: Some(last_record_lsn.to_string()), + prev_record_lsn: Some(prev_record_lsn.to_string()), + ancestor_timeline_id: ancestor_timeline_id.map(HexZTimelineId::from), + ancestor_lsn: ancestor_lsn.map(|lsn| lsn.to_string()), + current_logical_size: Some(current_logical_size), + current_logical_size_non_incremental, + }, + TimelineInfo::Remote { + timeline_id, + tenant_id, + disk_consistent_lsn, + } => TimelineInfoResponse { + kind: "Remote".to_owned(), + timeline_id, + tenant_id, + disk_consistent_lsn: disk_consistent_lsn.to_string(), + last_record_lsn: None, + prev_record_lsn: None, + ancestor_timeline_id: None, + ancestor_lsn: None, + current_logical_size: None, + current_logical_size_non_incremental: None, + }, + } + } +} + +impl TryFrom for TimelineInfo { + type Error = anyhow::Error; + + fn try_from(other: TimelineInfoResponse) -> anyhow::Result { + let parse_lsn_hex_string = |lsn_string: String| { + lsn_string + .parse::() + .with_context(|| format!("Failed to parse Lsn as hex string from '{}'", lsn_string)) + }; + + let disk_consistent_lsn = parse_lsn_hex_string(other.disk_consistent_lsn)?; + Ok(match other.kind.as_str() { + "Local" => TimelineInfo::Local { + timeline_id: other.timeline_id, + tenant_id: other.tenant_id, + last_record_lsn: other + .last_record_lsn + .ok_or(anyhow!("Local timeline should have last_record_lsn")) + .and_then(parse_lsn_hex_string)?, + prev_record_lsn: other + .prev_record_lsn + .ok_or(anyhow!("Local timeline should have prev_record_lsn")) + .and_then(parse_lsn_hex_string)?, + ancestor_timeline_id: 
other.ancestor_timeline_id.map(ZTimelineId::from), + ancestor_lsn: other.ancestor_lsn.map(parse_lsn_hex_string).transpose()?, + disk_consistent_lsn, + current_logical_size: other.current_logical_size.ok_or(anyhow!("No "))?, + current_logical_size_non_incremental: other.current_logical_size_non_incremental, + }, + "Remote" => TimelineInfo::Remote { + timeline_id: other.timeline_id, + tenant_id: other.tenant_id, + disk_consistent_lsn, + }, + unknown => bail!("Unknown timeline kind: {}", unknown), + }) + } +} + +#[derive(Serialize)] +pub struct StatusResponse { + pub id: ZNodeId, } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index dcb81849e0..d322b051a6 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -17,7 +17,12 @@ paths: application/json: schema: type: object - /v1/timeline/{tenant_id}: + required: + - id + properties: + id: + type: integer + /v1/tenant/{tenant_id}/timeline: parameters: - name: tenant_id in: path @@ -25,19 +30,22 @@ paths: schema: type: string format: hex + - name: include-non-incremental-logical-size + in: query + schema: + type: string + description: Controls calculation of current_logical_size_non_incremental get: - description: List tenant timelines + description: Get timelines for tenant responses: "200": - description: array of brief timeline descriptions + description: TimelineInfo content: application/json: schema: type: array items: - # currently, just a timeline id string, but when remote index gets to be accessed - # remote/local timeline field would be added at least - type: string + $ref: "#/components/schemas/TimelineInfo" "400": description: Error when no tenant id found in path content: @@ -62,7 +70,7 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - /v1/timeline/{tenant_id}/{timeline_id}: + /v1/tenant/{tenant_id}/timeline/{timeline_id}: parameters: - name: tenant_id in: path @@ -76,8 +84,13 @@ paths: schema: type: string format: hex + - name: include-non-incremental-logical-size + in: query + schema: + type: string + description: Controls calculation of current_logical_size_non_incremental get: - description: Get timeline info for tenant's remote timeline + description: Get info about the timeline responses: "200": description: TimelineInfo @@ -86,7 +99,7 @@ paths: schema: $ref: "#/components/schemas/TimelineInfo" "400": - description: Error when no tenant id found in path or no branch name + description: Error when no tenant id found in path or no timeline id content: application/json: schema: @@ -109,7 +122,7 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - /v1/branch/{tenant_id}: + /v1/tenant/{tenant_id}/timeline/: parameters: - name: tenant_id in: path @@ -117,126 +130,33 @@ paths: schema: type: string format: hex - - name: include-non-incremental-logical-size - in: query - schema: - type: string - description: Controls calculation of current_logical_size_non_incremental - get: - description: Get branches for tenant - responses: - "200": - description: BranchInfo - content: - application/json: - schema: - type: array - items: - $ref: "#/components/schemas/BranchInfo" - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - 
$ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - /v1/branch/{tenant_id}/{branch_name}: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - format: hex - - name: branch_name - in: path - required: true - schema: - type: string - - name: include-non-incremental-logical-size - in: query - schema: - type: string - description: Controls calculation of current_logical_size_non_incremental - get: - description: Get branches for tenant - responses: - "200": - description: BranchInfo - content: - application/json: - schema: - $ref: "#/components/schemas/BranchInfo" - "400": - description: Error when no tenant id found in path or no branch name - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - /v1/branch/: post: - description: Create branch + description: | + Create a timeline. Returns new timeline id on success.\ + If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline. requestBody: content: application/json: schema: type: object - required: - - "tenant_id" - - "name" - - "start_point" properties: - tenant_id: + new_timeline_id: type: string format: hex - name: + ancestor_timeline_id: type: string - start_point: + format: hex + ancestor_start_lsn: type: string responses: "201": - description: BranchInfo + description: TimelineInfo content: application/json: schema: - $ref: "#/components/schemas/BranchInfo" + $ref: "#/components/schemas/TimelineInfo" "400": - description: Malformed branch create request + description: Malformed timeline create request content: application/json: schema: @@ -253,6 +173,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ForbiddenError" + "409": + description: Timeline already exists, creation skipped + content: + application/json: + schema: + $ref: "#/components/schemas/AlreadyExistsError" "500": description: Generic operation error content: @@ -290,27 +216,26 @@ paths: schema: $ref: "#/components/schemas/Error" post: - description: Create tenant + description: | + Create a tenant. Returns new tenant id on success.\ + If no new tenant id is specified in parameters, it would be generated. It's an error to recreate the same tenant. 
requestBody: content: application/json: schema: type: object - required: - - "tenant_id" properties: - tenant_id: + new_tenant_id: type: string format: hex responses: "201": - description: CREATED + description: New tenant created successfully content: application/json: schema: - type: array - items: - type: string + type: string + format: hex "400": description: Malformed tenant create request content: @@ -329,6 +254,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ForbiddenError" + "409": + description: Tenant already exists, creation skipped + content: + application/json: + schema: + $ref: "#/components/schemas/AlreadyExistsError" "500": description: Generic operation error content: @@ -353,38 +284,11 @@ components: type: string state: type: string - BranchInfo: - type: object - required: - - name - - timeline_id - - latest_valid_lsn - - current_logical_size - properties: - name: - type: string - timeline_id: - type: string - format: hex - ancestor_id: - type: string - format: hex - ancestor_lsn: - type: string - current_logical_size: - type: integer - current_logical_size_non_incremental: - type: integer - latest_valid_lsn: - type: integer TimelineInfo: type: object required: - timeline_id - tenant_id - - last_record_lsn - - prev_record_lsn - - start_lsn - disk_consistent_lsn properties: timeline_id: @@ -393,19 +297,21 @@ components: tenant_id: type: string format: hex - ancestor_timeline_id: - type: string - format: hex last_record_lsn: type: string prev_record_lsn: type: string - start_lsn: + ancestor_timeline_id: + type: string + format: hex + ancestor_lsn: type: string disk_consistent_lsn: type: string - timeline_state: - type: string + current_logical_size: + type: integer + current_logical_size_non_incremental: + type: integer Error: type: object @@ -421,6 +327,13 @@ components: properties: msg: type: string + AlreadyExistsError: + type: object + required: + - msg + properties: + msg: + type: string ForbiddenError: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b13a45750e..8365601042 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,10 +1,8 @@ use std::sync::Arc; -use anyhow::{Context, Result}; -use hyper::header; +use anyhow::Result; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; -use serde::Serialize; use tracing::*; use zenith_utils::auth::JwtAuth; use zenith_utils::http::endpoint::attach_openapi_ui; @@ -15,19 +13,17 @@ use zenith_utils::http::{ endpoint, error::HttpErrorBody, json::{json_request, json_response}, - request::get_request_param, request::parse_request_param, }; use zenith_utils::http::{RequestExt, RouterBuilder}; -use zenith_utils::lsn::Lsn; -use zenith_utils::zid::{opt_display_serde, ZTimelineId}; +use zenith_utils::zid::{HexZTenantId, ZTimelineId}; -use super::models::BranchCreateRequest; -use super::models::TenantCreateRequest; -use crate::branches::BranchInfo; +use super::models::{ + StatusResponse, TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse, +}; use crate::repository::RepositoryTimeline; -use crate::repository::TimelineSyncState; -use crate::{branches, config::PageServerConf, tenant_mgr, ZTenantId}; +use crate::timelines::TimelineInfo; +use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId}; #[derive(Debug)] struct State { @@ -64,31 +60,53 @@ fn get_config(request: &Request) -> &'static PageServerConf { } // healthcheck handler -async fn status_handler(_: Request) -> Result, ApiError> { - 
Ok(Response::builder() - .status(StatusCode::OK) - .header(header::CONTENT_TYPE, "application/json") - .body(Body::from("{}")) - .map_err(ApiError::from_err)?) +async fn status_handler(request: Request) -> Result, ApiError> { + let config = get_config(&request); + Ok(json_response( + StatusCode::OK, + StatusResponse { id: config.id }, + )?) } -async fn branch_create_handler(mut request: Request) -> Result, ApiError> { - let request_data: BranchCreateRequest = json_request(&mut request).await?; +async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { + let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let request_data: TimelineCreateRequest = json_request(&mut request).await?; - check_permission(&request, Some(request_data.tenant_id))?; + check_permission(&request, Some(tenant_id))?; - let response_data = tokio::task::spawn_blocking(move || { - let _enter = info_span!("/branch_create", name = %request_data.name, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered(); - branches::create_branch( + let new_timeline_info = tokio::task::spawn_blocking(move || { + let _enter = info_span!("/timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn).entered(); + timelines::create_timeline( get_config(&request), - &request_data.name, - &request_data.start_point, - &request_data.tenant_id, + tenant_id, + request_data.new_timeline_id.map(ZTimelineId::from), + request_data.ancestor_timeline_id.map(ZTimelineId::from), + request_data.ancestor_start_lsn, ) }) .await .map_err(ApiError::from_err)??; - Ok(json_response(StatusCode::CREATED, response_data)?) + + Ok(match new_timeline_info { + Some(info) => json_response(StatusCode::CREATED, TimelineInfoResponse::from(info))?, + None => json_response(StatusCode::CONFLICT, ())?, + }) +} + +async fn timeline_list_handler(request: Request) -> Result, ApiError> { + let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); + let response_data: Vec = tokio::task::spawn_blocking(move || { + let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); + crate::timelines::get_timelines(tenant_id, include_non_incremental_logical_size) + }) + .await + .map_err(ApiError::from_err)?? + .into_iter() + .map(TimelineInfoResponse::from) + .collect(); + Ok(json_response(StatusCode::OK, response_data)?) } // Gate non incremental logical size calculation behind a flag @@ -106,113 +124,6 @@ fn get_include_non_incremental_logical_size(request: &Request) -> bool { .unwrap_or(false) } -async fn branch_list_handler(request: Request) -> Result, ApiError> { - let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?; - - let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); - - check_permission(&request, Some(tenantid))?; - - let response_data = tokio::task::spawn_blocking(move || { - let _enter = info_span!("branch_list", tenant = %tenantid).entered(); - crate::branches::get_branches( - get_config(&request), - &tenantid, - include_non_incremental_logical_size, - ) - }) - .await - .map_err(ApiError::from_err)??; - Ok(json_response(StatusCode::OK, response_data)?) 
-} - -async fn branch_detail_handler(request: Request) -> Result, ApiError> { - let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?; - let branch_name: String = get_request_param(&request, "branch_name")?.to_string(); - let conf = get_state(&request).conf; - let path = conf.branch_path(&branch_name, &tenantid); - - let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); - - let response_data = tokio::task::spawn_blocking(move || { - let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered(); - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - BranchInfo::from_path(path, &repo, include_non_incremental_logical_size) - }) - .await - .map_err(ApiError::from_err)??; - - Ok(json_response(StatusCode::OK, response_data)?) -} - -async fn timeline_list_handler(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let conf = get_state(&request).conf; - let timelines_dir = conf.timelines_path(&tenant_id); - - let mut timelines_dir_contents = - tokio::fs::read_dir(&timelines_dir).await.with_context(|| { - format!( - "Failed to list timelines dir '{}' contents", - timelines_dir.display() - ) - })?; - - let mut local_timelines = Vec::new(); - while let Some(entry) = timelines_dir_contents.next_entry().await.with_context(|| { - format!( - "Failed to list timelines dir '{}' contents", - timelines_dir.display() - ) - })? { - let entry_path = entry.path(); - let entry_type = entry.file_type().await.with_context(|| { - format!( - "Failed to get file type of timeline dirs' entry '{}'", - entry_path.display() - ) - })?; - - if entry_type.is_dir() { - match entry.file_name().to_string_lossy().parse::() { - Ok(timeline_id) => local_timelines.push(timeline_id.to_string()), - Err(e) => error!( - "Failed to get parse timeline id from timeline dirs' entry '{}': {}", - entry_path.display(), - e - ), - } - } - } - - Ok(json_response(StatusCode::OK, local_timelines)?) 
-} - -#[derive(Debug, Serialize)] -#[serde(tag = "type")] -enum TimelineInfo { - Local { - #[serde(with = "hex")] - timeline_id: ZTimelineId, - #[serde(with = "hex")] - tenant_id: ZTenantId, - #[serde(with = "opt_display_serde")] - ancestor_timeline_id: Option, - last_record_lsn: Lsn, - prev_record_lsn: Lsn, - disk_consistent_lsn: Lsn, - timeline_state: Option, - }, - Remote { - #[serde(with = "hex")] - timeline_id: ZTimelineId, - #[serde(with = "hex")] - tenant_id: ZTenantId, - }, -} - async fn timeline_detail_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -224,24 +135,17 @@ async fn timeline_detail_handler(request: Request) -> Result(match repo.get_timeline(timeline_id)?.local_timeline() { - None => TimelineInfo::Remote { - timeline_id, - tenant_id, - }, - Some(timeline) => TimelineInfo::Local { - timeline_id, - tenant_id, - ancestor_timeline_id: timeline.get_ancestor_timeline_id(), - disk_consistent_lsn: timeline.get_disk_consistent_lsn(), - last_record_lsn: timeline.get_last_record_lsn(), - prev_record_lsn: timeline.get_prev_record_lsn(), - timeline_state: repo.get_timeline_state(timeline_id), - }, - }) + let include_non_incremental_logical_size = + get_include_non_incremental_logical_size(&request); + Ok::<_, anyhow::Error>(TimelineInfo::from_repo_timeline( + tenant_id, + repo.get_timeline(timeline_id)?, + include_non_incremental_logical_size, + )) }) .await - .map_err(ApiError::from_err)??; + .map_err(ApiError::from_err)? + .map(TimelineInfoResponse::from)?; Ok(json_response(StatusCode::OK, response_data)?) } @@ -258,7 +162,7 @@ async fn timeline_attach_handler(request: Request) -> Result { + RepositoryTimeline::Local { .. 
} => { anyhow::bail!("Timeline with id {} is already local", timeline_id) } RepositoryTimeline::Remote { @@ -318,13 +222,20 @@ async fn tenant_create_handler(mut request: Request) -> Result json_response(StatusCode::CREATED, HexZTenantId::from(id))?, + None => json_response(StatusCode::CONFLICT, ())?, + }) } async fn handler_404(_: Request) -> Result, ApiError> { @@ -354,23 +265,21 @@ pub fn make_router( router .data(Arc::new(State::new(conf, auth))) .get("/v1/status", status_handler) - .get("/v1/timeline/:tenant_id", timeline_list_handler) + .get("/v1/tenant", tenant_list_handler) + .post("/v1/tenant", tenant_create_handler) + .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) + .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) .get( - "/v1/timeline/:tenant_id/:timeline_id", + "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler, ) .post( - "/v1/timeline/:tenant_id/:timeline_id/attach", + "/v1/tenant/:tenant_id/timeline/:timeline_id/attach", timeline_attach_handler, ) .post( - "/v1/timeline/:tenant_id/:timeline_id/detach", + "/v1/tenant/:tenant_id/timeline/:timeline_id/detach", timeline_detach_handler, ) - .get("/v1/branch/:tenant_id", branch_list_handler) - .get("/v1/branch/:tenant_id/:branch_name", branch_detail_handler) - .post("/v1/branch", branch_create_handler) - .get("/v1/tenant", tenant_list_handler) - .post("/v1/tenant", tenant_create_handler) .any(handler_404) } diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 5dae1902c1..9e0df5dab2 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -47,10 +47,8 @@ use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::{ZTenantId, ZTimelineId}; -use zenith_metrics::{ - register_histogram, register_int_gauge_vec, Histogram, IntGauge, IntGaugeVec, -}; -use zenith_metrics::{register_histogram_vec, HistogramVec}; +use zenith_metrics::{register_histogram_vec, Histogram, HistogramVec}; +use zenith_metrics::{register_int_gauge_vec, IntGauge, IntGaugeVec}; use zenith_utils::crashsafe_dir; use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; use zenith_utils::seqwait::SeqWait; @@ -87,16 +85,17 @@ lazy_static! { static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( "pageserver_storage_time", "Time spent on storage operations", - &["operation"] + &["operation", "tenant_id", "timeline_id"] ) .expect("failed to define a metric"); } // Metrics collected on operations on the storage repository. lazy_static! { - static ref RECONSTRUCT_TIME: Histogram = register_histogram!( + static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( "pageserver_getpage_reconstruct_time", - "FIXME Time spent on storage operations" + "Time spent on storage operations", + &["tenant_id", "timeline_id"] ) .expect("failed to define a metric"); } @@ -137,19 +136,20 @@ pub struct LayeredRepository { /// Public interface impl Repository for LayeredRepository { fn get_timeline(&self, timelineid: ZTimelineId) -> Result { - let mut timelines = self.timelines.lock().unwrap(); - Ok( - match self.get_or_init_timeline(timelineid, &mut timelines)? 
{ - LayeredTimelineEntry::Local(local) => RepositoryTimeline::Local(local), - LayeredTimelineEntry::Remote { - id, - disk_consistent_lsn, - } => RepositoryTimeline::Remote { - id, - disk_consistent_lsn, - }, - }, - ) + Ok(RepositoryTimeline::from(self.get_or_init_timeline( + timelineid, + &mut self.timelines.lock().unwrap(), + )?)) + } + + fn list_timelines(&self) -> Result> { + Ok(self + .timelines + .lock() + .unwrap() + .values() + .map(|timeline_entry| RepositoryTimeline::from(timeline_entry.clone())) + .collect()) } fn create_empty_timeline( @@ -247,8 +247,12 @@ impl Repository for LayeredRepository { horizon: u64, checkpoint_before_gc: bool, ) -> Result { + let timeline_str = target_timelineid + .map(|x| x.to_string()) + .unwrap_or_else(|| "-".to_string()); + STORAGE_TIME - .with_label_values(&["gc"]) + .with_label_values(&["gc", &self.tenantid.to_string(), &timeline_str]) .observe_closure_duration(|| { self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) }) @@ -428,6 +432,24 @@ impl LayeredTimelineEntry { } } +impl From for RepositoryTimeline { + fn from(layered_timeline: LayeredTimelineEntry) -> Self { + match layered_timeline { + LayeredTimelineEntry::Local(timeline) => RepositoryTimeline::Local { + id: timeline.timelineid, + timeline, + }, + LayeredTimelineEntry::Remote { + id, + disk_consistent_lsn, + } => RepositoryTimeline::Remote { + id, + disk_consistent_lsn, + }, + } + } +} + /// Private functions impl LayeredRepository { // Implementation of the public `get_timeline` function. This differs from the public @@ -762,6 +784,12 @@ pub struct LayeredTimeline { // ordering for its operations, but involves private modules, and macro trickery current_logical_size_gauge: IntGauge, + // Metrics histograms + reconstruct_time_histo: Histogram, + checkpoint_time_histo: Histogram, + flush_checkpoint_time_histo: Histogram, + forced_checkpoint_time_histo: Histogram, + /// If `true`, will backup its files that appear after each checkpointing to the remote storage. upload_relishes: AtomicBool, @@ -840,8 +868,7 @@ impl Timeline for LayeredTimeline { let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - RECONSTRUCT_TIME - .observe_closure_duration(|| self.materialize_page(seg, seg_blknum, lsn, &*layer)) + self.materialize_page(seg, seg_blknum, lsn, &*layer) } else { // FIXME: This can happen if PostgreSQL extends a relation but never writes // the page. See https://github.com/zenithdb/zenith/issues/841 @@ -893,12 +920,11 @@ impl Timeline for LayeredTimeline { let seg = SegmentTag { rel, segno: 0 }; - let result; - if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - result = layer.get_seg_exists(lsn)?; + let result = if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { + layer.get_seg_exists(lsn)? } else { - result = false; - } + false + }; trace!("get_rel_exists: {} at {} -> {}", rel, lsn, result); Ok(result) @@ -992,14 +1018,14 @@ impl Timeline for LayeredTimeline { /// metrics collection. 
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> { match cconf { - CheckpointConfig::Flush => STORAGE_TIME - .with_label_values(&["flush checkpoint"]) + CheckpointConfig::Flush => self + .flush_checkpoint_time_histo .observe_closure_duration(|| self.checkpoint_internal(0, false)), - CheckpointConfig::Forced => STORAGE_TIME - .with_label_values(&["forced checkpoint"]) + CheckpointConfig::Forced => self + .forced_checkpoint_time_histo .observe_closure_duration(|| self.checkpoint_internal(0, true)), - CheckpointConfig::Distance(distance) => STORAGE_TIME - .with_label_values(&["checkpoint"]) + CheckpointConfig::Distance(distance) => self + .checkpoint_time_histo .observe_closure_duration(|| self.checkpoint_internal(distance, true)), } } @@ -1098,6 +1124,31 @@ impl LayeredTimeline { let current_logical_size_gauge = LOGICAL_TIMELINE_SIZE .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) .unwrap(); + let reconstruct_time_histo = RECONSTRUCT_TIME + .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .unwrap(); + let checkpoint_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "checkpoint", + &tenantid.to_string(), + &timelineid.to_string(), + ]) + .unwrap(); + let flush_checkpoint_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "flush checkpoint", + &tenantid.to_string(), + &timelineid.to_string(), + ]) + .unwrap(); + let forced_checkpoint_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "forced checkpoint", + &tenantid.to_string(), + &timelineid.to_string(), + ]) + .unwrap(); + LayeredTimeline { conf, timelineid, @@ -1117,6 +1168,10 @@ impl LayeredTimeline { ancestor_lsn: metadata.ancestor_lsn(), current_logical_size: AtomicUsize::new(current_logical_size), current_logical_size_gauge, + reconstruct_time_histo, + checkpoint_time_histo, + flush_checkpoint_time_histo, + forced_checkpoint_time_histo, upload_relishes: AtomicBool::new(upload_relishes), write_lock: Mutex::new(()), @@ -1966,17 +2021,19 @@ impl LayeredTimeline { let mut layer_ref = layer; let mut curr_lsn = lsn; loop { - let result = layer_ref - .get_page_reconstruct_data(seg_blknum, curr_lsn, &mut data) - .with_context(|| { - format!( - "Failed to get reconstruct data {} {:?} {} {}", - layer_ref.get_seg_tag(), - layer_ref.filename(), - seg_blknum, - curr_lsn, - ) - })?; + let result = self.reconstruct_time_histo.observe_closure_duration(|| { + layer_ref + .get_page_reconstruct_data(seg_blknum, curr_lsn, &mut data) + .with_context(|| { + format!( + "Failed to get reconstruct data {} {:?} {} {}", + layer_ref.get_seg_tag(), + layer_ref.filename(), + seg_blknum, + curr_lsn, + ) + }) + })?; match result { PageReconstructResult::Complete => break, PageReconstructResult::Continue(cont_lsn) => { diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 17b061b20e..6e24bf6022 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -170,12 +170,11 @@ impl Layer for InMemoryLayer { fn filename(&self) -> PathBuf { let inner = self.inner.read().unwrap(); - let end_lsn; - if let Some(drop_lsn) = inner.end_lsn { - end_lsn = drop_lsn; + let end_lsn = if let Some(drop_lsn) = inner.end_lsn { + drop_lsn } else { - end_lsn = Lsn(u64::MAX); - } + Lsn(u64::MAX) + }; let delta_filename = DeltaFileName { seg: self.seg, diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 3a68f56187..3d66192c80 100644 
--- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,5 +1,4 @@ pub mod basebackup; -pub mod branches; pub mod config; pub mod http; pub mod import_datadir; @@ -12,6 +11,7 @@ pub mod repository; pub mod tenant_mgr; pub mod tenant_threads; pub mod thread_mgr; +pub mod timelines; pub mod virtual_file; pub mod walingest; pub mod walreceiver; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7dc3c8c752..42a099cca5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -298,7 +298,7 @@ lazy_static! { static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!( "pageserver_smgr_query_time", "Time spent on smgr query handling", - &["smgr_query_type"], + &["smgr_query_type", "tenant_id", "timeline_id"], TIME_BUCKETS.into() ) .expect("failed to define a metric"); @@ -340,20 +340,22 @@ impl PageServerHandler { }; let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; + let tenant_id = tenantid.to_string(); + let timeline_id = timelineid.to_string(); let response = match zenith_fe_msg { PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_exists"]) + .with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]) .observe_closure_duration(|| { self.handle_get_rel_exists_request(timeline.as_ref(), &req) }), PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_size"]) + .with_label_values(&["get_rel_size", &tenant_id, &timeline_id]) .observe_closure_duration(|| { self.handle_get_nblocks_request(timeline.as_ref(), &req) }), PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME - .with_label_values(&["get_page_at_lsn"]) + .with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]) .observe_closure_duration(|| { self.handle_get_page_at_lsn_request(timeline.as_ref(), &req) }), diff --git a/pageserver/src/remote_storage/README.md b/pageserver/src/remote_storage/README.md index 1c718acf06..3c77275da8 100644 --- a/pageserver/src/remote_storage/README.md +++ b/pageserver/src/remote_storage/README.md @@ -62,11 +62,3 @@ Based on previous evaluation, even `rusoto-s3` could be a better choice over thi So far, we don't adjust the remote storage based on GC thread loop results, only checkpointer loop affects the remote storage. Index module could be used as a base to implement a deferred GC mechanism, a "defragmentation" that repacks archives into new ones after GC is done removing the files from the archives. - -* bracnhes implementaion could be improved - -Currently, there's a code to sync the branches along with the timeline files: on upload, every local branch files that are missing remotely are uploaded, -on the timeline download, missing remote branch files are downlaoded. - -A branch is a per-tenant entity, yet a current implementaion requires synchronizing a timeline first to get the branch files locally. -Currently, there's no other way to know about the remote branch files, neither the file contents is verified and updated. diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index 6b588c8e5f..d14f849e15 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -14,13 +14,6 @@ //! Only GC removes local timeline files, the GC support is not added to sync currently, //! yet downloading extra files is not critically bad at this stage, GC can remove those again. //! -//! 
Along the timeline files, branch files are uploaded and downloaded every time a corresponding sync task is processed. -//! For simplicity, branch files are also treated as immutable: only missing files are uploaded or downloaded, no removals, amendments or file contents checks are done. -//! Also, the branches are copied as separate files, with no extra compressions done. -//! Despite branches information currently belonging to tenants, a tenants' timeline sync is required to upload or download the branch files, also, there's no way to know -//! the branch sync state outside of the sync loop. -//! This implementation is currently considered as temporary and is a subjec to change later. -//! //! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via listing the remote storage contents. //! It's enough to poll the remote state once on startup only, due to agreement that the pageserver has //! an exclusive write access to the remote storage: new files appear in the storage only after the same @@ -66,7 +59,6 @@ //! NOTE: No real contents or checksum check happens right now and is a subject to improve later. //! //! After the whole timeline is downloaded, [`crate::tenant_mgr::set_timeline_states`] function is used to update pageserver memory stage for the timeline processed. -//! No extra branch registration is done. //! //! When pageserver signals shutdown, current sync task gets finished and the loop exists. @@ -77,7 +69,7 @@ pub mod index; mod upload; use std::{ - collections::{BTreeSet, HashMap, HashSet, VecDeque}, + collections::{BTreeSet, HashMap, VecDeque}, num::{NonZeroU32, NonZeroUsize}, path::{Path, PathBuf}, sync::Arc, @@ -87,7 +79,6 @@ use anyhow::{bail, Context}; use futures::stream::{FuturesUnordered, StreamExt}; use lazy_static::lazy_static; use tokio::{ - fs, runtime::Runtime, sync::{ mpsc::{self, UnboundedReceiver}, @@ -101,8 +92,7 @@ use self::{ compression::ArchiveHeader, download::{download_timeline, DownloadedTimeline}, index::{ - ArchiveDescription, ArchiveId, RelativePath, RemoteTimeline, RemoteTimelineIndex, - TimelineIndexEntry, + ArchiveDescription, ArchiveId, RemoteTimeline, RemoteTimelineIndex, TimelineIndexEntry, }, upload::upload_timeline_checkpoint, }; @@ -843,28 +833,6 @@ async fn download_archive_header< Ok(header) } -async fn tenant_branch_files( - conf: &'static PageServerConf, - tenant_id: ZTenantId, -) -> anyhow::Result> { - let branches_dir = conf.branches_path(&tenant_id); - if !branches_dir.exists() { - return Ok(HashSet::new()); - } - - let mut branch_entries = fs::read_dir(&branches_dir) - .await - .context("Failed to list tenant branches dir contents")?; - - let mut branch_files = HashSet::new(); - while let Some(branch_entry) = branch_entries.next_entry().await? 
{ - if branch_entry.file_type().await?.is_file() { - branch_files.insert(RelativePath::new(&branches_dir, branch_entry.path())?); - } - } - Ok(branch_files) -} - #[cfg(test)] mod test_utils { use std::{ @@ -971,30 +939,9 @@ mod test_utils { "Index contains unexpected sync ids" ); - let mut actual_branches = BTreeMap::new(); - let mut expected_branches = BTreeMap::new(); let mut actual_timeline_entries = BTreeMap::new(); let mut expected_timeline_entries = BTreeMap::new(); for sync_id in actual_sync_ids { - actual_branches.insert( - sync_id.tenant_id, - index_read - .branch_files(sync_id.tenant_id) - .into_iter() - .flat_map(|branch_paths| branch_paths.iter()) - .cloned() - .collect::>(), - ); - expected_branches.insert( - sync_id.tenant_id, - expected_index_with_descriptions - .branch_files(sync_id.tenant_id) - .into_iter() - .flat_map(|branch_paths| branch_paths.iter()) - .cloned() - .collect::>(), - ); - actual_timeline_entries.insert( sync_id, index_read.timeline_entry(&sync_id).unwrap().clone(), @@ -1009,11 +956,6 @@ mod test_utils { } drop(index_read); - assert_eq!( - actual_branches, expected_branches, - "Index contains unexpected branches" - ); - for (sync_id, actual_timeline_entry) in actual_timeline_entries { let expected_timeline_description = expected_timeline_entries .remove(&sync_id) diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/remote_storage/storage_sync/download.rs index f268fc442a..00115ba8d5 100644 --- a/pageserver/src/remote_storage/storage_sync/download.rs +++ b/pageserver/src/remote_storage/storage_sync/download.rs @@ -1,10 +1,8 @@ //! Timeline synchrnonization logic to put files from archives on remote storage into pageserver's local directory. -//! Currently, tenant branch files are also downloaded, but this does not appear final. use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc}; use anyhow::{ensure, Context}; -use futures::{stream::FuturesUnordered, StreamExt}; use tokio::{fs, sync::RwLock}; use tracing::{debug, error, trace, warn}; use zenith_utils::{lsn::Lsn, zid::ZTenantId}; @@ -14,8 +12,8 @@ use crate::{ layered_repository::metadata::{metadata_path, TimelineMetadata}, remote_storage::{ storage_sync::{ - compression, index::TimelineIndexEntry, sync_queue, tenant_branch_files, - update_index_description, SyncKind, SyncTask, + compression, index::TimelineIndexEntry, sync_queue, update_index_description, SyncKind, + SyncTask, }, RemoteStorage, ZTenantTimelineId, }, @@ -42,8 +40,6 @@ pub(super) enum DownloadedTimeline { /// Timeline files that already exist locally are skipped during the download, but the local metadata file is /// updated in the end of every checkpoint archive extraction. /// -/// Before any archives are considered, the branch files are checked locally and remotely, all remote-only files are downloaded. -/// /// On an error, bumps the retries count and reschedules the download, with updated archive skip list /// (for any new successful archive downloads and extractions). 
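Editor's note: the storage sync loop documented above keeps a single failure convention: on error, bump the retry counter and reschedule the same task. Below is a standalone sketch of that convention, with an invented SyncTask and a plain VecDeque standing in for sync_queue::push and SyncKind.

use std::collections::VecDeque;

#[derive(Debug)]
struct SyncTask {
    sync_id: u64,
    retries: u32,
}

const MAX_RETRIES: u32 = 10;

// Pretend to process a task: even sync ids fail until they have been retried twice.
fn process(task: &SyncTask) -> Result<(), String> {
    if task.sync_id % 2 == 0 && task.retries < 2 {
        Err(format!("transient failure for sync id {}", task.sync_id))
    } else {
        Ok(())
    }
}

fn main() {
    let mut queue: VecDeque<SyncTask> =
        (0u64..4).map(|sync_id| SyncTask { sync_id, retries: 0 }).collect();

    while let Some(task) = queue.pop_front() {
        match process(&task) {
            Ok(()) => println!("sync id {} done after {} retries", task.sync_id, task.retries),
            Err(e) if task.retries < MAX_RETRIES => {
                eprintln!("{}, rescheduling", e);
                // On error: bump the retry counter and put the task back on the queue,
                // mirroring sync_queue::push(SyncTask::new(sync_id, retries, ...)).
                queue.push_back(SyncTask {
                    retries: task.retries + 1,
                    ..task
                });
            }
            Err(e) => eprintln!(
                "sync id {} dropped after {} retries: {}",
                task.sync_id, task.retries, e
            ),
        }
    }
}
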
pub(super) async fn download_timeline< @@ -113,22 +109,6 @@ pub(super) async fn download_timeline< } }; - if let Err(e) = download_missing_branches(conf, remote_assets.as_ref(), sync_id.tenant_id).await - { - error!( - "Failed to download missing branches for sync id {}: {:?}", - sync_id, e - ); - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Download(download), - )); - return DownloadedTimeline::FailedAndRescheduled { - disk_consistent_lsn, - }; - } - debug!("Downloading timeline archives"); let archives_to_download = remote_timeline .checkpoints() @@ -250,82 +230,6 @@ async fn read_local_metadata( .context("Failed to read local metadata files bytes")?) } -async fn download_missing_branches< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - conf: &'static PageServerConf, - (storage, index): &(S, RwLock), - tenant_id: ZTenantId, -) -> anyhow::Result<()> { - let local_branches = tenant_branch_files(conf, tenant_id) - .await - .context("Failed to list local branch files for the tenant")?; - let local_branches_dir = conf.branches_path(&tenant_id); - if !local_branches_dir.exists() { - fs::create_dir_all(&local_branches_dir) - .await - .with_context(|| { - format!( - "Failed to create local branches directory at path '{}'", - local_branches_dir.display() - ) - })?; - } - - if let Some(remote_branches) = index.read().await.branch_files(tenant_id) { - let mut remote_only_branches_downloads = remote_branches - .difference(&local_branches) - .map(|remote_only_branch| async move { - let branches_dir = conf.branches_path(&tenant_id); - let remote_branch_path = remote_only_branch.as_path(&branches_dir); - let storage_path = - storage.storage_path(&remote_branch_path).with_context(|| { - format!( - "Failed to derive a storage path for branch with local path '{}'", - remote_branch_path.display() - ) - })?; - let mut target_file = fs::OpenOptions::new() - .write(true) - .create_new(true) - .open(&remote_branch_path) - .await - .with_context(|| { - format!( - "Failed to create local branch file at '{}'", - remote_branch_path.display() - ) - })?; - storage - .download(&storage_path, &mut target_file) - .await - .with_context(|| { - format!( - "Failed to download branch file from the remote path {:?}", - storage_path - ) - })?; - Ok::<_, anyhow::Error>(()) - }) - .collect::>(); - - let mut branch_downloads_failed = false; - while let Some(download_result) = remote_only_branches_downloads.next().await { - if let Err(e) = download_result { - branch_downloads_failed = true; - error!("Failed to download a branch file: {:?}", e); - } - } - ensure!( - !branch_downloads_failed, - "Failed to download all branch files" - ); - } - - Ok(()) -} - #[cfg(test)] mod tests { use std::collections::BTreeSet; diff --git a/pageserver/src/remote_storage/storage_sync/index.rs b/pageserver/src/remote_storage/storage_sync/index.rs index 3d2680948d..81c99754c9 100644 --- a/pageserver/src/remote_storage/storage_sync/index.rs +++ b/pageserver/src/remote_storage/storage_sync/index.rs @@ -5,7 +5,7 @@ //! This way in the future, the index could be restored fast from its serialized stored form. use std::{ - collections::{BTreeMap, BTreeSet, HashMap, HashSet}, + collections::{BTreeMap, BTreeSet, HashMap}, path::{Path, PathBuf}, }; @@ -49,10 +49,9 @@ impl RelativePath { } /// An index to track tenant files that exist on the remote storage. -/// Currently, timeline archives and branch files are tracked. +/// Currently, timeline archive files are tracked only. 
#[derive(Debug, Clone)] pub struct RemoteTimelineIndex { - branch_files: HashMap>, timeline_files: HashMap, } @@ -65,7 +64,6 @@ impl RemoteTimelineIndex { paths: impl Iterator, ) -> Self { let mut index = Self { - branch_files: HashMap::new(), timeline_files: HashMap::new(), }; for path in paths { @@ -98,17 +96,6 @@ impl RemoteTimelineIndex { pub fn all_sync_ids(&self) -> impl Iterator + '_ { self.timeline_files.keys().copied() } - - pub fn add_branch_file(&mut self, tenant_id: ZTenantId, path: RelativePath) { - self.branch_files - .entry(tenant_id) - .or_insert_with(HashSet::new) - .insert(path); - } - - pub fn branch_files(&self, tenant_id: ZTenantId) -> Option<&HashSet> { - self.branch_files.get(&tenant_id) - } } #[derive(Debug, Clone, PartialEq, Eq)] @@ -306,20 +293,9 @@ fn try_parse_index_entry( .parse::() .with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?; - let branches_path = conf.branches_path(&tenant_id); let timelines_path = conf.timelines_path(&tenant_id); - match ( - RelativePath::new(&branches_path, &path), - path.strip_prefix(&timelines_path), - ) { - (Ok(_), Ok(_)) => bail!( - "Path '{}' cannot start with both branches '{}' and the timelines '{}' prefixes", - path.display(), - branches_path.display(), - timelines_path.display() - ), - (Ok(branches_entry), Err(_)) => index.add_branch_file(tenant_id, branches_entry), - (Err(_), Ok(timelines_subpath)) => { + match path.strip_prefix(&timelines_path) { + Ok(timelines_subpath) => { let mut segments = timelines_subpath.iter(); let timeline_id = segments .next() @@ -375,11 +351,10 @@ fn try_parse_index_entry( } } } - (Err(branches_error), Err(timelines_strip_error)) => { + Err(timelines_strip_error) => { bail!( - "Path '{}' is not an index entry: it's neither parsable as a branch entry '{:#}' nor as an archive entry '{}'", + "Path '{}' is not an archive entry '{}'", path.display(), - branches_error, timelines_strip_error, ) } diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/remote_storage/storage_sync/upload.rs index 0f57d714dd..d064039ecc 100644 --- a/pageserver/src/remote_storage/storage_sync/upload.rs +++ b/pageserver/src/remote_storage/storage_sync/upload.rs @@ -1,13 +1,10 @@ //! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints. -//! Currently, tenant branch files are also uploaded, but this does not appear final. use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc}; -use anyhow::{ensure, Context}; -use futures::{stream::FuturesUnordered, StreamExt}; -use tokio::{fs, sync::RwLock}; +use anyhow::ensure; +use tokio::sync::RwLock; use tracing::{debug, error, warn}; -use zenith_utils::zid::ZTenantId; use crate::{ config::PageServerConf, @@ -15,7 +12,7 @@ use crate::{ storage_sync::{ compression, index::{RemoteTimeline, TimelineIndexEntry}, - sync_queue, tenant_branch_files, update_index_description, SyncKind, SyncTask, + sync_queue, update_index_description, SyncKind, SyncTask, }, RemoteStorage, ZTenantTimelineId, }, @@ -26,8 +23,6 @@ use super::{compression::ArchiveHeader, index::RemoteTimelineIndex, NewCheckpoin /// Attempts to compress and upload given checkpoint files. /// No extra checks for overlapping files is made: download takes care of that, ensuring no non-metadata local timeline files are overwritten. /// -/// Before the checkpoint files are uploaded, branch files are uploaded, if any local ones are missing remotely. 
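Editor's note: with the branch prefix gone, try_parse_index_entry above reduces to stripping the timelines directory prefix and walking the remaining path segments. Below is a toy version of that step, using string ids instead of ZTenantId/ZTimelineId and skipping the archive-name parsing.

use std::path::Path;

// Extract (timeline_id, archive_file_name) from a path expected to live under
// <timelines_dir>/<timeline_id>/<archive>.
fn parse_timeline_entry(timelines_dir: &Path, path: &Path) -> Result<(String, String), String> {
    let relative = path
        .strip_prefix(timelines_dir)
        .map_err(|e| format!("'{}' is not an archive entry: {}", path.display(), e))?;

    let mut segments = relative.iter();
    let timeline_id = segments
        .next()
        .ok_or_else(|| "missing timeline id segment".to_string())?
        .to_string_lossy()
        .into_owned();
    let archive = segments
        .next()
        .ok_or_else(|| "missing archive segment".to_string())?
        .to_string_lossy()
        .into_owned();

    Ok((timeline_id, archive))
}

fn main() {
    let timelines_dir = Path::new("/data/tenants/t1/timelines");
    let entry = Path::new("/data/tenants/t1/timelines/abc123/000000-0000001F.zst");
    println!("{:?}", parse_timeline_entry(timelines_dir, entry));
}
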
-/// /// On an error, bumps the retries count and reschedules the entire task. /// On success, populates index data with new downloads. pub(super) async fn upload_timeline_checkpoint< @@ -41,19 +36,6 @@ pub(super) async fn upload_timeline_checkpoint< retries: u32, ) -> Option { debug!("Uploading checkpoint for sync id {}", sync_id); - if let Err(e) = upload_missing_branches(config, remote_assets.as_ref(), sync_id.tenant_id).await - { - error!( - "Failed to upload missing branches for sync id {}: {:?}", - sync_id, e - ); - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Upload(new_checkpoint), - )); - return Some(false); - } let new_upload_lsn = new_checkpoint.metadata.disk_consistent_lsn(); let index = &remote_assets.1; @@ -201,76 +183,6 @@ async fn try_upload_checkpoint< .map(|(header, header_size, _)| (header, header_size)) } -async fn upload_missing_branches< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - config: &'static PageServerConf, - (storage, index): &(S, RwLock), - tenant_id: ZTenantId, -) -> anyhow::Result<()> { - let local_branches = tenant_branch_files(config, tenant_id) - .await - .context("Failed to list local branch files for the tenant")?; - let index_read = index.read().await; - let remote_branches = index_read - .branch_files(tenant_id) - .cloned() - .unwrap_or_default(); - drop(index_read); - - let mut branch_uploads = local_branches - .difference(&remote_branches) - .map(|local_only_branch| async move { - let local_branch_path = local_only_branch.as_path(&config.branches_path(&tenant_id)); - let storage_path = storage.storage_path(&local_branch_path).with_context(|| { - format!( - "Failed to derive a storage path for branch with local path '{}'", - local_branch_path.display() - ) - })?; - let local_branch_file = fs::OpenOptions::new() - .read(true) - .open(&local_branch_path) - .await - .with_context(|| { - format!( - "Failed to open local branch file {} for reading", - local_branch_path.display() - ) - })?; - storage - .upload(local_branch_file, &storage_path) - .await - .with_context(|| { - format!( - "Failed to upload branch file to the remote path {:?}", - storage_path - ) - })?; - Ok::<_, anyhow::Error>(local_only_branch) - }) - .collect::>(); - - let mut branch_uploads_failed = false; - while let Some(upload_result) = branch_uploads.next().await { - match upload_result { - Ok(local_only_branch) => index - .write() - .await - .add_branch_file(tenant_id, local_only_branch.clone()), - Err(e) => { - error!("Failed to upload branch file: {:?}", e); - branch_uploads_failed = true; - } - } - } - - ensure!(!branch_uploads_failed, "Failed to upload all branch files"); - - Ok(()) -} - #[cfg(test)] mod tests { use tempfile::tempdir; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 6142953a58..be937b8d26 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -36,6 +36,10 @@ pub trait Repository: Send + Sync { /// Get Timeline handle for given zenith timeline ID. fn get_timeline(&self, timelineid: ZTimelineId) -> Result; + /// Lists timelines the repository contains. + /// Up to repository's implementation to omit certain timelines that ar not considered ready for use. + fn list_timelines(&self) -> Result>; + /// Create a new, empty timeline. The caller is responsible for loading data into it /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. 
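Editor's note: the new list_timelines() method added to the Repository trait above pairs with the struct-variant RepositoryTimeline so that callers such as timelines::get_timelines later in this patch can keep only the locally loaded timelines. Below is a reduced sketch with stand-in types (ListedTimeline and DummyRepo are invented for illustration).

// Stand-ins for the real types: the actual method returns Vec<RepositoryTimeline>.
#[derive(Debug)]
enum ListedTimeline {
    Local { id: u64, last_record_lsn: u64 },
    Remote { id: u64, disk_consistent_lsn: u64 },
}

trait Repository {
    // Lists every timeline the repository knows about, local or remote.
    fn list_timelines(&self) -> Vec<ListedTimeline>;
}

struct DummyRepo;

impl Repository for DummyRepo {
    fn list_timelines(&self) -> Vec<ListedTimeline> {
        vec![
            ListedTimeline::Local { id: 1, last_record_lsn: 0x16B_9188 },
            ListedTimeline::Remote { id: 2, disk_consistent_lsn: 0x169_AD58 },
        ]
    }
}

// Consumer side, mirroring timelines::get_timelines(): remote-only timelines are
// skipped, local ones are turned into (id, lsn) pairs.
fn local_lsns(repo: &dyn Repository) -> Vec<(u64, u64)> {
    repo.list_timelines()
        .into_iter()
        .filter_map(|t| match t {
            ListedTimeline::Local { id, last_record_lsn } => Some((id, last_record_lsn)),
            ListedTimeline::Remote { .. } => None,
        })
        .collect()
}

fn main() {
    println!("local timelines: {:?}", local_lsns(&DummyRepo));
}
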
fn create_empty_timeline( @@ -72,7 +76,10 @@ pub trait Repository: Send + Sync { pub enum RepositoryTimeline { /// Timeline, with its files present locally in pageserver's working directory. /// Loaded into pageserver's memory and ready to be used. - Local(Arc), + Local { + id: ZTimelineId, + timeline: Arc, + }, /// Timeline, found on the pageserver's remote storage, but not yet downloaded locally. Remote { id: ZTimelineId, @@ -83,17 +90,24 @@ pub enum RepositoryTimeline { impl RepositoryTimeline { pub fn local_timeline(&self) -> Option> { - if let Self::Local(local_timeline) = self { - Some(Arc::clone(local_timeline)) + if let Self::Local { timeline, .. } = self { + Some(Arc::clone(timeline)) } else { None } } + + pub fn id(&self) -> ZTimelineId { + match self { + Self::Local { id, .. } => *id, + Self::Remote { id, .. } => *id, + } + } } /// A state of the timeline synchronization with the remote storage. /// Contains `disk_consistent_lsn` of the corresponding remote timeline (latest checkpoint's disk_consistent_lsn). -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum TimelineSyncState { /// No further downloads from the remote storage are needed. /// The timeline state is up-to-date or ahead of the remote storage one, @@ -390,7 +404,6 @@ pub mod repo_harness { let tenant_id = ZTenantId::generate(); fs::create_dir_all(conf.tenant_path(&tenant_id))?; - fs::create_dir_all(conf.branches_path(&tenant_id))?; Ok(Self { conf, tenant_id }) } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index d60b5fefd3..568088fc1d 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -1,19 +1,19 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. -use crate::branches; use crate::config::PageServerConf; use crate::layered_repository::LayeredRepository; use crate::repository::{Repository, Timeline, TimelineSyncState}; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; +use crate::timelines; use crate::walredo::PostgresRedoManager; use crate::CheckpointConfig; -use anyhow::{bail, Context, Result}; +use anyhow::{Context, Result}; use lazy_static::lazy_static; use log::*; use serde::{Deserialize, Serialize}; -use std::collections::{hash_map, HashMap}; +use std::collections::HashMap; use std::fmt; use std::sync::{Arc, Mutex, MutexGuard}; use zenith_utils::zid::{ZTenantId, ZTimelineId}; @@ -177,24 +177,27 @@ pub fn shutdown_all_tenants() { } } -pub fn create_repository_for_tenant( +pub fn create_tenant_repository( conf: &'static PageServerConf, - tenantid: ZTenantId, -) -> Result<()> { - let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid)); - let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?; - - match access_tenants().entry(tenantid) { - hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenantid), - hash_map::Entry::Vacant(v) => { - v.insert(Tenant { - state: TenantState::Idle, - repo, - }); + new_tenant_id: Option, +) -> Result> { + let new_tenant_id = new_tenant_id.unwrap_or_else(ZTenantId::generate); + let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, new_tenant_id)); + match timelines::create_repo(conf, new_tenant_id, wal_redo_manager)? 
{ + Some(repo) => { + access_tenants() + .entry(new_tenant_id) + .or_insert_with(|| Tenant { + state: TenantState::Idle, + repo, + }); + Ok(Some(new_tenant_id)) + } + None => { + debug!("repository already exists for tenant {}", new_tenant_id); + Ok(None) } } - - Ok(()) } pub fn get_tenant_state(tenantid: ZTenantId) -> Option { diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs new file mode 100644 index 0000000000..4de131ef70 --- /dev/null +++ b/pageserver/src/timelines.rs @@ -0,0 +1,408 @@ +//! +//! Timeline management code +// + +use anyhow::{anyhow, bail, Context, Result}; +use postgres_ffi::ControlFileData; +use std::{ + fs, + path::Path, + process::{Command, Stdio}, + sync::Arc, +}; +use tracing::*; + +use zenith_utils::lsn::Lsn; +use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use zenith_utils::{crashsafe_dir, logging}; + +use crate::{config::PageServerConf, repository::Repository}; +use crate::{import_datadir, LOG_FILE_NAME}; +use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager}; +use crate::{repository::RepositoryTimeline, tenant_mgr}; +use crate::{repository::Timeline, CheckpointConfig}; + +#[derive(Clone)] +pub enum TimelineInfo { + Local { + timeline_id: ZTimelineId, + tenant_id: ZTenantId, + last_record_lsn: Lsn, + prev_record_lsn: Lsn, + ancestor_timeline_id: Option, + ancestor_lsn: Option, + disk_consistent_lsn: Lsn, + current_logical_size: usize, + current_logical_size_non_incremental: Option, + }, + Remote { + timeline_id: ZTimelineId, + tenant_id: ZTenantId, + disk_consistent_lsn: Lsn, + }, +} + +impl TimelineInfo { + pub fn from_repo_timeline( + tenant_id: ZTenantId, + repo_timeline: RepositoryTimeline, + include_non_incremental_logical_size: bool, + ) -> Self { + match repo_timeline { + RepositoryTimeline::Local { id, timeline } => { + let ancestor_timeline_id = timeline.get_ancestor_timeline_id(); + let ancestor_lsn = if ancestor_timeline_id.is_some() { + Some(timeline.get_ancestor_lsn()) + } else { + None + }; + + Self::Local { + timeline_id: id, + tenant_id, + last_record_lsn: timeline.get_last_record_lsn(), + prev_record_lsn: timeline.get_prev_record_lsn(), + ancestor_timeline_id, + ancestor_lsn, + disk_consistent_lsn: timeline.get_disk_consistent_lsn(), + current_logical_size: timeline.get_current_logical_size(), + current_logical_size_non_incremental: get_current_logical_size_non_incremental( + include_non_incremental_logical_size, + timeline.as_ref(), + ), + } + } + RepositoryTimeline::Remote { + id, + disk_consistent_lsn, + } => Self::Remote { + timeline_id: id, + tenant_id, + disk_consistent_lsn, + }, + } + } + + pub fn from_dyn_timeline( + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + timeline: &dyn Timeline, + include_non_incremental_logical_size: bool, + ) -> Self { + let ancestor_timeline_id = timeline.get_ancestor_timeline_id(); + let ancestor_lsn = if ancestor_timeline_id.is_some() { + Some(timeline.get_ancestor_lsn()) + } else { + None + }; + + Self::Local { + timeline_id, + tenant_id, + last_record_lsn: timeline.get_last_record_lsn(), + prev_record_lsn: timeline.get_prev_record_lsn(), + ancestor_timeline_id, + ancestor_lsn, + disk_consistent_lsn: timeline.get_disk_consistent_lsn(), + current_logical_size: timeline.get_current_logical_size(), + current_logical_size_non_incremental: get_current_logical_size_non_incremental( + include_non_incremental_logical_size, + timeline, + ), + } + } + + pub fn timeline_id(&self) -> ZTimelineId { + match *self { + TimelineInfo::Local { timeline_id, .. 
} => timeline_id, + TimelineInfo::Remote { timeline_id, .. } => timeline_id, + } + } + + pub fn tenant_id(&self) -> ZTenantId { + match *self { + TimelineInfo::Local { tenant_id, .. } => tenant_id, + TimelineInfo::Remote { tenant_id, .. } => tenant_id, + } + } +} + +fn get_current_logical_size_non_incremental( + include_non_incremental_logical_size: bool, + timeline: &dyn Timeline, +) -> Option { + if !include_non_incremental_logical_size { + return None; + } + match timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) { + Ok(size) => Some(size), + Err(e) => { + error!("Failed to get non-incremental logical size: {:?}", e); + None + } + } +} + +#[derive(Debug, Clone, Copy)] +pub struct PointInTime { + pub timeline_id: ZTimelineId, + pub lsn: Lsn, +} + +pub fn init_pageserver( + conf: &'static PageServerConf, + create_tenant: Option, + initial_timeline_id: Option, +) -> anyhow::Result<()> { + // Initialize logger + // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages + let _log_file = logging::init(LOG_FILE_NAME, true)?; + + // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo + // process during repository initialization. + // + // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched + // initdb in the background, and it kept running even after the "zenith init" had exited. + // In tests, we started the page server immediately after that, so that initdb was still + // running in the background, and we failed to run initdb again in the same directory. This + // has been solved for the rapid init+start case now, but the general race condition remains + // if you restart the server quickly. The WAL redo manager doesn't use a separate thread + // anymore, but I think that could still happen. + let dummy_redo_mgr = Arc::new(crate::walredo::DummyRedoManager {}); + + crashsafe_dir::create_dir_all(conf.tenants_path())?; + + if let Some(tenant_id) = create_tenant { + println!("initializing tenantid {}", tenant_id); + let repo = create_repo(conf, tenant_id, dummy_redo_mgr) + .context("failed to create repo")? 
+ .ok_or_else(|| anyhow!("For newely created pageserver, found already existing repository for tenant {}", tenant_id))?; + let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); + bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref()) + .context("failed to create initial timeline")?; + println!("initial timeline {} created", new_timeline_id) + } else if initial_timeline_id.is_some() { + println!("Ignoring initial timeline parameter, due to no tenant id to create given"); + } + + println!("pageserver init succeeded"); + Ok(()) +} + +pub fn create_repo( + conf: &'static PageServerConf, + tenant_id: ZTenantId, + wal_redo_manager: Arc, +) -> Result>> { + let repo_dir = conf.tenant_path(&tenant_id); + if repo_dir.exists() { + debug!("repo for {} already exists", tenant_id); + return Ok(None); + } + + // top-level dir may exist if we are creating it through CLI + crashsafe_dir::create_dir_all(&repo_dir) + .with_context(|| format!("could not create directory {}", repo_dir.display()))?; + crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; + info!("created directory structure in {}", repo_dir.display()); + + Ok(Some(Arc::new(LayeredRepository::new( + conf, + wal_redo_manager, + tenant_id, + conf.remote_storage_config.is_some(), + )))) +} + +// Returns checkpoint LSN from controlfile +fn get_lsn_from_controlfile(path: &Path) -> Result { + // Read control file to extract the LSN + let controlfile_path = path.join("global").join("pg_control"); + let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?; + let lsn = controlfile.checkPoint; + + Ok(Lsn(lsn)) +} + +// Create the cluster temporarily in 'initdbpath' directory inside the repository +// to get bootstrap data for timeline initialization. +// +fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { + info!("running initdb in {}... ", initdbpath.display()); + + let initdb_path = conf.pg_bin_dir().join("initdb"); + let initdb_output = Command::new(initdb_path) + .args(&["-D", initdbpath.to_str().unwrap()]) + .args(&["-U", &conf.superuser]) + .args(&["-E", "utf8"]) + .arg("--no-instructions") + // This is only used for a temporary installation that is deleted shortly after, + // so no need to fsync it + .arg("--no-sync") + .env_clear() + .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) + .stdout(Stdio::null()) + .output() + .context("failed to execute initdb")?; + if !initdb_output.status.success() { + bail!( + "initdb failed: '{}'", + String::from_utf8_lossy(&initdb_output.stderr) + ); + } + + Ok(()) +} + +// +// - run initdb to init temporary instance and get bootstrap data +// - after initialization complete, remove the temp dir. +// +fn bootstrap_timeline( + conf: &'static PageServerConf, + tenantid: ZTenantId, + tli: ZTimelineId, + repo: &dyn Repository, +) -> Result> { + let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered(); + + let initdb_path = conf.tenant_path(&tenantid).join("tmp"); + + // Init temporarily repo to get bootstrap data + run_initdb(conf, &initdb_path)?; + let pgdata_path = initdb_path; + + let lsn = get_lsn_from_controlfile(&pgdata_path)?.align(); + + // Import the contents of the data directory at the initial checkpoint + // LSN, and any WAL after that. + // Initdb lsn will be equal to last_record_lsn which will be set after import. 
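Editor's note: run_initdb() above follows the usual pattern for shelling out to a helper binary: explicit arguments, a cleared environment, stdout silenced, and stderr surfaced in the error message on failure. Below is a generic sketch of that pattern; the binary path and arguments are placeholders, not the pageserver's initdb invocation.

use std::process::{Command, Stdio};

// Run an external tool and report stderr if it exits non-zero.
fn run_tool(bin: &str, args: &[&str]) -> Result<(), String> {
    let output = Command::new(bin)
        .args(args)
        .env_clear()
        .stdout(Stdio::null())
        .output()
        .map_err(|e| format!("failed to execute {}: {}", bin, e))?;

    if !output.status.success() {
        return Err(format!(
            "{} failed: '{}'",
            bin,
            String::from_utf8_lossy(&output.stderr)
        ));
    }
    Ok(())
}

fn main() {
    // Placeholder invocation; substitute the real binary and arguments.
    match run_tool("/bin/ls", &["/"]) {
        Ok(()) => println!("command succeeded"),
        Err(e) => eprintln!("{}", e),
    }
}
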
+ // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. + let timeline = repo.create_empty_timeline(tli, lsn)?; + import_datadir::import_timeline_from_postgres_datadir( + &pgdata_path, + timeline.writer().as_ref(), + lsn, + )?; + timeline.checkpoint(CheckpointConfig::Forced)?; + + println!( + "created initial timeline {} timeline.lsn {}", + tli, + timeline.get_last_record_lsn() + ); + + // Remove temp dir. We don't need it anymore + fs::remove_dir_all(pgdata_path)?; + + Ok(timeline) +} + +pub(crate) fn get_timelines( + tenant_id: ZTenantId, + include_non_incremental_logical_size: bool, +) -> Result> { + let repo = tenant_mgr::get_repository_for_tenant(tenant_id) + .with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?; + + Ok(repo + .list_timelines() + .with_context(|| format!("Failed to list timelines for tenant {}", tenant_id))? + .into_iter() + .filter_map(|timeline| match timeline { + RepositoryTimeline::Local { timeline, id } => Some((id, timeline)), + RepositoryTimeline::Remote { .. } => None, + }) + .map(|(timeline_id, timeline)| { + TimelineInfo::from_dyn_timeline( + tenant_id, + timeline_id, + timeline.as_ref(), + include_non_incremental_logical_size, + ) + }) + .collect()) +} + +pub(crate) fn create_timeline( + conf: &'static PageServerConf, + tenant_id: ZTenantId, + new_timeline_id: Option, + ancestor_timeline_id: Option, + ancestor_start_lsn: Option, +) -> Result> { + let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + + if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { + match repo.get_timeline(new_timeline_id)? { + RepositoryTimeline::Local { id, .. } => { + debug!("timeline {} already exists", id); + return Ok(None); + } + RepositoryTimeline::Remote { id, .. } => bail!( + "timeline {} already exists in pageserver's remote storage", + id + ), + } + } + + let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0)); + + let new_timeline_info = match ancestor_timeline_id { + Some(ancestor_timeline_id) => { + let ancestor_timeline = repo + .get_timeline(ancestor_timeline_id) + .with_context(|| format!("Cannot get ancestor timeline {}", ancestor_timeline_id))? + .local_timeline() + .with_context(|| { + format!( + "Cannot branch off the timeline {} that's not present locally", + ancestor_timeline_id + ) + })?; + + if start_lsn == Lsn(0) { + // Find end of WAL on the old timeline + let end_of_wal = ancestor_timeline.get_last_record_lsn(); + info!("branching at end of WAL: {}", end_of_wal); + start_lsn = end_of_wal; + } else { + // Wait for the WAL to arrive and be processed on the parent branch up + // to the requested branch point. The repository code itself doesn't + // require it, but if we start to receive WAL on the new timeline, + // decoding the new WAL might need to look up previous pages, relation + // sizes etc. and that would get confused if the previous page versions + // are not in the repository yet. + ancestor_timeline.wait_lsn(start_lsn)?; + } + start_lsn = start_lsn.align(); + + let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); + if ancestor_ancestor_lsn > start_lsn { + // can we safely just branch from the ancestor instead? 
+ anyhow::bail!( + "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", + start_lsn, + ancestor_timeline_id, + ancestor_ancestor_lsn, + ); + } + repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?; + // load the timeline into memory + let loaded_timeline = repo.get_timeline(new_timeline_id)?; + TimelineInfo::from_repo_timeline(tenant_id, loaded_timeline, false) + } + None => { + let new_timeline = bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?; + TimelineInfo::from_dyn_timeline( + tenant_id, + new_timeline_id, + new_timeline.as_ref(), + false, + ) + } + }; + Ok(Some(new_timeline_info)) +} diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 378a015d4a..ca9107cdbf 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -268,12 +268,11 @@ impl XlXactParsedRecord { let info = xl_info & pg_constants::XLOG_XACT_OPMASK; // The record starts with time of commit/abort let xact_time = buf.get_i64_le(); - let xinfo; - if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 { - xinfo = buf.get_u32_le(); + let xinfo = if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 { + buf.get_u32_le() } else { - xinfo = 0; - } + 0 + }; let db_id; let ts_id; if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 { @@ -502,7 +501,6 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { 0..=pg_constants::XLR_MAX_BLOCK_ID => { /* XLogRecordBlockHeader */ let mut blk = DecodedBkpBlock::new(); - let fork_flags: u8; if block_id <= max_block_id { // TODO @@ -515,7 +513,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { } max_block_id = block_id; - fork_flags = buf.get_u8(); + let fork_flags: u8 = buf.get_u8(); blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK; blk.flags = fork_flags; blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0; diff --git a/postgres_ffi/src/xlog_utils.rs b/postgres_ffi/src/xlog_utils.rs index caf1940a9c..d2b2b5c122 100644 --- a/postgres_ffi/src/xlog_utils.rs +++ b/postgres_ffi/src/xlog_utils.rs @@ -132,6 +132,8 @@ pub fn get_current_timestamp() -> TimestampTz { } } +/// Return offset of the last valid record in the segment segno, starting +/// looking at start_offset. Returns start_offset if no records found. 
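Editor's note: several hunks in this patch (get_rel_exists, InMemoryLayer::filename, the xinfo parsing above) apply the same mechanical cleanup: a "declare, then assign in every branch" variable becomes a single if-expression. A tiny before/after illustration on made-up values:

fn xinfo_from_flags(has_info: bool, raw: u32) -> u32 {
    // Before: declare the binding first, assign it in every branch.
    //
    //     let xinfo;
    //     if has_info {
    //         xinfo = raw;
    //     } else {
    //         xinfo = 0;
    //     }
    //
    // After: bind the if-expression directly; the compiler still checks that
    // both branches produce a value of the same type.
    if has_info { raw } else { 0 }
}

fn main() {
    assert_eq!(xinfo_from_flags(true, 7), 7);
    assert_eq!(xinfo_from_flags(false, 7), 0);
    println!("ok");
}
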
fn find_end_of_wal_segment( data_dir: &Path, segno: XLogSegNo, @@ -147,7 +149,7 @@ fn find_end_of_wal_segment( let mut rec_offs: usize = 0; let mut buf = [0u8; XLOG_BLCKSZ]; let file_name = XLogFileName(tli, segno, wal_seg_size); - let mut last_valid_rec_pos: usize = 0; + let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap(); file.seek(SeekFrom::Start(offs as u64))?; let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS]; diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index d8d5cbe5bf..dda018a1d8 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" anyhow = "1.0" bytes = { version = "1.0.1", features = ['serde'] } clap = "3.0" +fail = "0.5.0" futures = "0.3.13" hashbrown = "0.11.2" hex = "0.4.3" @@ -21,6 +22,7 @@ rustls = "0.19.1" scopeguard = "1.1.0" serde = "1" serde_json = "1" +thiserror = "1.0" tokio = { version = "1.11", features = ["macros"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } tokio-rustls = "0.22.0" diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index a5bdaeaeca..5e6357fe80 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,11 +1,79 @@ use crate::compute::DatabaseInfo; use crate::config::ProxyConfig; use crate::cplane_api::{self, CPlaneApi}; +use crate::error::UserFacingError; use crate::stream::PqStream; -use anyhow::{anyhow, bail, Context}; +use crate::waiters; use std::collections::HashMap; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage, FeMessage as Fe}; +use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; + +/// Common authentication error. +#[derive(Debug, Error)] +pub enum AuthErrorImpl { + /// Authentication error reported by the console. + #[error(transparent)] + Console(#[from] cplane_api::AuthError), + + /// For passwords that couldn't be processed by [`parse_password`]. + #[error("Malformed password message")] + MalformedPassword, + + /// Errors produced by [`PqStream`]. + #[error(transparent)] + Io(#[from] std::io::Error), +} + +impl AuthErrorImpl { + pub fn auth_failed(msg: impl Into) -> Self { + AuthErrorImpl::Console(cplane_api::AuthError::auth_failed(msg)) + } +} + +impl From for AuthErrorImpl { + fn from(e: waiters::RegisterError) -> Self { + AuthErrorImpl::Console(cplane_api::AuthError::from(e)) + } +} + +impl From for AuthErrorImpl { + fn from(e: waiters::WaitError) -> Self { + AuthErrorImpl::Console(cplane_api::AuthError::from(e)) + } +} + +#[derive(Debug, Error)] +#[error(transparent)] +pub struct AuthError(Box); + +impl From for AuthError +where + AuthErrorImpl: From, +{ + fn from(e: T) -> Self { + AuthError(Box::new(e.into())) + } +} + +impl UserFacingError for AuthError { + fn to_string_client(&self) -> String { + use AuthErrorImpl::*; + match self.0.as_ref() { + Console(e) => e.to_string_client(), + MalformedPassword => self.to_string(), + _ => "Internal error".to_string(), + } + } +} + +#[derive(Debug, Error)] +pub enum ClientCredsParseError { + #[error("Parameter `{0}` is missing in startup packet")] + MissingKey(&'static str), +} + +impl UserFacingError for ClientCredsParseError {} /// Various client credentials which we use for authentication. 
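Editor's note: the proxy's new error handling above (AuthError/AuthErrorImpl, mirrored in cplane_api later in the patch) keeps the variants in an inner thiserror enum, exposes a boxed newtype, and adds a generic From impl so that `?` converts anything the inner enum accepts while the public error stays one pointer wide. Below is a compilable sketch of that shape with invented ExampleError/ExampleErrorImpl names; it assumes the thiserror crate, which this patch adds to proxy/Cargo.toml.

use thiserror::Error;

// Inner enum: all the concrete failure cases live here.
#[derive(Debug, Error)]
enum ExampleErrorImpl {
    #[error("Malformed password message")]
    MalformedPassword,

    #[error(transparent)]
    Io(#[from] std::io::Error),
}

// Public type: a boxed newtype that forwards Display/source to the inner enum.
#[derive(Debug, Error)]
#[error(transparent)]
struct ExampleError(Box<ExampleErrorImpl>);

// Anything the inner enum can be built from converts into the public type,
// so `?` works for io::Error and friends without hand-written From impls.
impl<T> From<T> for ExampleError
where
    ExampleErrorImpl: From<T>,
{
    fn from(e: T) -> Self {
        ExampleError(Box::new(e.into()))
    }
}

fn read_config() -> Result<String, ExampleError> {
    // The io::Error goes through ExampleErrorImpl::Io and then into the boxed wrapper.
    let text = std::fs::read_to_string("/definitely/missing/file")?;
    Ok(text)
}

fn main() {
    match read_config() {
        Ok(text) => println!("{}", text),
        Err(e) => eprintln!("error: {}", e),
    }
}
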
#[derive(Debug, PartialEq, Eq)] @@ -15,13 +83,13 @@ pub struct ClientCredentials { } impl TryFrom> for ClientCredentials { - type Error = anyhow::Error; + type Error = ClientCredsParseError; fn try_from(mut value: HashMap) -> Result { let mut get_param = |key| { value .remove(key) - .with_context(|| format!("{} is missing in startup packet", key)) + .ok_or(ClientCredsParseError::MissingKey(key)) }; let user = get_param("user")?; @@ -37,10 +105,14 @@ impl ClientCredentials { self, config: &ProxyConfig, client: &mut PqStream, - ) -> anyhow::Result { + ) -> Result { + fail::fail_point!("proxy-authenticate", |_| { + Err(AuthError::auth_failed("failpoint triggered")) + }); + use crate::config::ClientAuthMethod::*; use crate::config::RouterConfig::*; - let db_info = match &config.router_config { + match &config.router_config { Static { host, port } => handle_static(host.clone(), *port, client, self).await, Dynamic(Mixed) => { if self.user.ends_with("@zenith") { @@ -51,9 +123,7 @@ impl ClientCredentials { } Dynamic(Password) => handle_existing_user(config, client, self).await, Dynamic(Link) => handle_new_user(config, client).await, - }; - - db_info.context("failed to authenticate client") + } } } @@ -66,18 +136,14 @@ async fn handle_static( port: u16, client: &mut PqStream, creds: ClientCredentials, -) -> anyhow::Result { +) -> Result { client .write_message(&Be::AuthenticationCleartextPassword) .await?; // Read client's password bytes - let msg = match client.read_message().await? { - Fe::PasswordMessage(msg) => msg, - bad => bail!("unexpected message type: {:?}", bad), - }; - - let cleartext_password = std::str::from_utf8(&msg)?.split('\0').next().unwrap(); + let msg = client.read_password_message().await?; + let cleartext_password = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; let db_info = DatabaseInfo { host, @@ -98,7 +164,7 @@ async fn handle_existing_user( config: &ProxyConfig, client: &mut PqStream, creds: ClientCredentials, -) -> anyhow::Result { +) -> Result { let psql_session_id = new_psql_session_id(); let md5_salt = rand::random(); @@ -107,18 +173,12 @@ async fn handle_existing_user( .await?; // Read client's password hash - let msg = match client.read_message().await? 
{ - Fe::PasswordMessage(msg) => msg, - bad => bail!("unexpected message type: {:?}", bad), - }; + let msg = client.read_password_message().await?; + let md5_response = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; - let (_trailing_null, md5_response) = msg - .split_last() - .ok_or_else(|| anyhow!("unexpected password message"))?; - - let cplane = CPlaneApi::new(&config.auth_endpoint); + let cplane = CPlaneApi::new(config.auth_endpoint.clone()); let db_info = cplane - .authenticate_proxy_request(creds, md5_response, &md5_salt, &psql_session_id) + .authenticate_proxy_client(creds, md5_response, &md5_salt, &psql_session_id) .await?; client @@ -131,7 +191,7 @@ async fn handle_existing_user( async fn handle_new_user( config: &ProxyConfig, client: &mut PqStream, -) -> anyhow::Result { +) -> Result { let psql_session_id = new_psql_session_id(); let greeting = hello_message(&config.redirect_uri, &psql_session_id); @@ -143,8 +203,8 @@ async fn handle_new_user( .write_message(&Be::NoticeResponse(greeting)) .await?; - // Wait for web console response - waiter.await?.map_err(|e| anyhow!(e)) + // Wait for web console response (see `mgmt`) + waiter.await?.map_err(AuthErrorImpl::auth_failed) }) .await?; @@ -153,6 +213,10 @@ async fn handle_new_user( Ok(db_info) } +fn parse_password(bytes: &[u8]) -> Option<&str> { + std::str::from_utf8(bytes).ok()?.strip_suffix('\0') +} + fn hello_message(redirect_uri: &str, session_id: &str) -> String { format!( concat![ diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index c1a7e81be9..07d3bcc71a 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -6,7 +6,7 @@ use tokio::net::TcpStream; use tokio_postgres::{CancelToken, NoTls}; use zenith_utils::pq_proto::CancelKeyData; -/// Enables serving CancelRequests. +/// Enables serving `CancelRequest`s. #[derive(Default)] pub struct CancelMap(Mutex>>); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 7c294bd488..64ce5d0a5a 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,6 +1,27 @@ -use anyhow::Context; +use crate::cancellation::CancelClosure; +use crate::error::UserFacingError; use serde::{Deserialize, Serialize}; -use std::net::{SocketAddr, ToSocketAddrs}; +use std::io; +use std::net::SocketAddr; +use thiserror::Error; +use tokio::net::TcpStream; +use tokio_postgres::NoTls; + +#[derive(Debug, Error)] +pub enum ConnectionError { + /// This error doesn't seem to reveal any secrets; for instance, + /// [`tokio_postgres::error::Kind`] doesn't contain ip addresses and such. + #[error("Failed to connect to the compute node: {0}")] + Postgres(#[from] tokio_postgres::Error), + + #[error("Failed to connect to the compute node")] + FailedToConnectToCompute, + + #[error("Failed to fetch compute node version")] + FailedToFetchPgVersion, +} + +impl UserFacingError for ConnectionError {} /// Compute node connection params. #[derive(Serialize, Deserialize, Debug, Default)] @@ -12,14 +33,38 @@ pub struct DatabaseInfo { pub password: Option, } +/// PostgreSQL version as [`String`]. +pub type Version = String; + impl DatabaseInfo { - pub fn socket_addr(&self) -> anyhow::Result { + async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { let host_port = format!("{}:{}", self.host, self.port); - host_port - .to_socket_addrs() - .with_context(|| format!("cannot resolve {} to SocketAddr", host_port))? 
- .next() - .context("cannot resolve at least one SocketAddr") + let socket = TcpStream::connect(host_port).await?; + let socket_addr = socket.peer_addr()?; + + Ok((socket_addr, socket)) + } + + /// Connect to a corresponding compute node. + pub async fn connect(self) -> Result<(TcpStream, Version, CancelClosure), ConnectionError> { + let (socket_addr, mut socket) = self + .connect_raw() + .await + .map_err(|_| ConnectionError::FailedToConnectToCompute)?; + + // TODO: establish a secure connection to the DB + let (client, conn) = tokio_postgres::Config::from(self) + .connect_raw(&mut socket, NoTls) + .await?; + + let version = conn + .parameter("server_version") + .ok_or(ConnectionError::FailedToFetchPgVersion)? + .into(); + + let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); + + Ok((socket, version, cancel_closure)) } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 9ab64db795..077ff02898 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, ensure, Context}; +use anyhow::{anyhow, bail, ensure, Context}; use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig}; use std::net::SocketAddr; use std::str::FromStr; @@ -29,7 +29,7 @@ impl FromStr for ClientAuthMethod { "password" => Ok(Password), "link" => Ok(Link), "mixed" => Ok(Mixed), - _ => Err(anyhow::anyhow!("Invlid option for router")), + _ => bail!("Invalid option for router: `{}`", s), } } } @@ -53,7 +53,7 @@ pub struct ProxyConfig { pub redirect_uri: String, /// control plane address where we would check auth. - pub auth_endpoint: String, + pub auth_endpoint: reqwest::Url, pub tls_config: Option, } diff --git a/proxy/src/cplane_api.rs b/proxy/src/cplane_api.rs index 187809717f..21fce79df3 100644 --- a/proxy/src/cplane_api.rs +++ b/proxy/src/cplane_api.rs @@ -1,52 +1,113 @@ use crate::auth::ClientCredentials; use crate::compute::DatabaseInfo; -use crate::waiters::{Waiter, Waiters}; -use anyhow::{anyhow, bail}; +use crate::error::UserFacingError; +use crate::mgmt; +use crate::waiters::{self, Waiter, Waiters}; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; +use thiserror::Error; lazy_static! { - static ref CPLANE_WAITERS: Waiters> = Default::default(); + static ref CPLANE_WAITERS: Waiters = Default::default(); } /// Give caller an opportunity to wait for cplane's reply. -pub async fn with_waiter(psql_session_id: impl Into, f: F) -> anyhow::Result +pub async fn with_waiter( + psql_session_id: impl Into, + action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, +) -> Result where - F: FnOnce(Waiter<'static, Result>) -> R, - R: std::future::Future>, + R: std::future::Future>, + E: From, { let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; - f(waiter).await + action(waiter).await } -pub fn notify(psql_session_id: &str, msg: Result) -> anyhow::Result<()> { +pub fn notify( + psql_session_id: &str, + msg: Result, +) -> Result<(), waiters::NotifyError> { CPLANE_WAITERS.notify(psql_session_id, msg) } /// Zenith console API wrapper. -pub struct CPlaneApi<'a> { - auth_endpoint: &'a str, +pub struct CPlaneApi { + auth_endpoint: reqwest::Url, } -impl<'a> CPlaneApi<'a> { - pub fn new(auth_endpoint: &'a str) -> Self { +impl CPlaneApi { + pub fn new(auth_endpoint: reqwest::Url) -> Self { Self { auth_endpoint } } } -impl CPlaneApi<'_> { - pub async fn authenticate_proxy_request( +#[derive(Debug, Error)] +pub enum AuthErrorImpl { + /// Authentication error reported by the console. 
+ #[error("Authentication failed: {0}")] + AuthFailed(String), + + /// HTTP status (other than 200) returned by the console. + #[error("Console responded with an HTTP status: {0}")] + HttpStatus(reqwest::StatusCode), + + #[error("Console responded with a malformed JSON: {0}")] + MalformedResponse(#[from] serde_json::Error), + + #[error(transparent)] + Transport(#[from] reqwest::Error), + + #[error(transparent)] + WaiterRegister(#[from] waiters::RegisterError), + + #[error(transparent)] + WaiterWait(#[from] waiters::WaitError), +} + +#[derive(Debug, Error)] +#[error(transparent)] +pub struct AuthError(Box); + +impl AuthError { + /// Smart constructor for authentication error reported by `mgmt`. + pub fn auth_failed(msg: impl Into) -> Self { + AuthError(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) + } +} + +impl From for AuthError +where + AuthErrorImpl: From, +{ + fn from(e: T) -> Self { + AuthError(Box::new(e.into())) + } +} + +impl UserFacingError for AuthError { + fn to_string_client(&self) -> String { + use AuthErrorImpl::*; + match self.0.as_ref() { + AuthFailed(_) | HttpStatus(_) => self.to_string(), + _ => "Internal error".to_string(), + } + } +} + +impl CPlaneApi { + pub async fn authenticate_proxy_client( &self, creds: ClientCredentials, - md5_response: &[u8], + md5_response: &str, salt: &[u8; 4], psql_session_id: &str, - ) -> anyhow::Result { - let mut url = reqwest::Url::parse(self.auth_endpoint)?; + ) -> Result { + let mut url = self.auth_endpoint.clone(); url.query_pairs_mut() .append_pair("login", &creds.user) .append_pair("database", &creds.dbname) - .append_pair("md5response", std::str::from_utf8(md5_response)?) + .append_pair("md5response", md5_response) .append_pair("salt", &hex::encode(salt)) .append_pair("psql_session_id", psql_session_id); @@ -55,18 +116,20 @@ impl CPlaneApi<'_> { // TODO: leverage `reqwest::Client` to reuse connections let resp = reqwest::get(url).await?; if !resp.status().is_success() { - bail!("Auth failed: {}", resp.status()) + return Err(AuthErrorImpl::HttpStatus(resp.status()).into()); } let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?; println!("got auth info: #{:?}", auth_info); use ProxyAuthResponse::*; - match auth_info { - Ready { conn_info } => Ok(conn_info), - Error { error } => bail!(error), - NotReady { .. } => waiter.await?.map_err(|e| anyhow!(e)), - } + let db_info = match auth_info { + Ready { conn_info } => conn_info, + Error { error } => return Err(AuthErrorImpl::AuthFailed(error).into()), + NotReady { .. } => waiter.await?.map_err(AuthErrorImpl::AuthFailed)?, + }; + + Ok(db_info) }) .await } diff --git a/proxy/src/error.rs b/proxy/src/error.rs new file mode 100644 index 0000000000..e98e553f83 --- /dev/null +++ b/proxy/src/error.rs @@ -0,0 +1,17 @@ +/// Marks errors that may be safely shown to a client. +/// This trait can be seen as a specialized version of [`ToString`]. +/// +/// NOTE: This trait should not be implemented for [`anyhow::Error`], since it +/// is way too convenient and tends to proliferate all across the codebase, +/// ultimately leading to accidental leaks of sensitive data. +pub trait UserFacingError: ToString { + /// Format the error for client, stripping all sensitive info. + /// + /// Although this might be a no-op for many types, it's highly + /// recommended to override the default impl in case error type + /// contains anything sensitive: various IDs, IP addresses etc. 
+ #[inline(always)] + fn to_string_client(&self) -> String { + self.to_string() + } +} diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 0b693d88dd..33d134678f 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -7,7 +7,7 @@ use zenith_utils::http::json::json_response; use zenith_utils::http::{RouterBuilder, RouterService}; async fn status_handler(_: Request) -> Result, ApiError> { - Ok(json_response(StatusCode::OK, "")?) + json_response(StatusCode::OK, "") } fn make_router() -> RouterBuilder { diff --git a/proxy/src/main.rs b/proxy/src/main.rs index fb3bf725b8..bd99d0a639 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -20,13 +20,14 @@ mod cancellation; mod compute; mod config; mod cplane_api; +mod error; mod http; mod mgmt; mod proxy; mod stream; mod waiters; -/// Flattens Result> into Result. +/// Flattens `Result>` into `Result`. async fn flatten_err( f: impl Future, JoinError>>, ) -> anyhow::Result<()> { @@ -122,7 +123,7 @@ async fn main() -> anyhow::Result<()> { None => RouterConfig::Dynamic(auth_method), Some(addr) => { if let ClientAuthMethod::Password = auth_method { - let (host, port) = addr.split_once(":").unwrap(); + let (host, port) = addr.split_once(':').unwrap(); RouterConfig::Static { host: host.to_string(), port: port.parse().unwrap(), diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 55b49b441f..e53542dfd2 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -79,6 +79,18 @@ enum PsqlSessionResult { Failure(String), } +/// A message received by `mgmt` when a compute node is ready. +pub type ComputeReady = Result; + +impl PsqlSessionResult { + fn into_compute_ready(self) -> ComputeReady { + match self { + Self::Success(db_info) => Ok(db_info), + Self::Failure(message) => Err(message), + } + } +} + impl postgres_backend::Handler for MgmtHandler { fn process_query( &mut self, @@ -99,13 +111,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::R let resp: PsqlSessionResponse = serde_json::from_str(query_string)?; - use PsqlSessionResult::*; - let msg = match resp.result { - Success(db_info) => Ok(db_info), - Failure(message) => Err(message), - }; - - match cplane_api::notify(&resp.session_id, msg) { + match cplane_api::notify(&resp.session_id, resp.result.into_compute_ready()) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 1dc301b792..3c7f59bc26 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,17 +1,18 @@ use crate::auth; -use crate::cancellation::{self, CancelClosure, CancelMap}; -use crate::compute::DatabaseInfo; +use crate::cancellation::{self, CancelMap}; use crate::config::{ProxyConfig, TlsConfig}; use crate::stream::{MetricsStream, PqStream, Stream}; use anyhow::{bail, Context}; +use futures::TryFutureExt; use lazy_static::lazy_static; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::net::TcpStream; -use tokio_postgres::NoTls; use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter}; use zenith_utils::pq_proto::{BeMessage as Be, *}; +const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; +const ERR_PROTO_VIOLATION: &str = "protocol violation"; + lazy_static! { static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!( new_common_metric_name("num_connections_accepted"), @@ -30,6 +31,7 @@ lazy_static! 
{ .unwrap(); } +/// A small combinator for pluggable error logging. async fn log_error(future: F) -> F::Output where F: std::future::Future>, @@ -76,20 +78,21 @@ async fn handle_client( } let tls = config.tls_config.clone(); - if let Some((client, creds)) = handshake(stream, tls, cancel_map).await? { - cancel_map - .with_session(|session| async { - connect_client_to_db(config, session, client, creds).await - }) - .await?; - } + let (stream, creds) = match handshake(stream, tls, cancel_map).await? { + Some(x) => x, + None => return Ok(()), // it's a cancellation request + }; - Ok(()) + let client = Client::new(stream, creds); + cancel_map + .with_session(|session| client.connect_to_db(config, session)) + .await } -/// Handle a connection from one client. -/// For better testing experience, `stream` can be -/// any object satisfying the traits. +/// Establish a (most probably, secure) connection with the client. +/// For better testing experience, `stream` can be any object satisfying the traits. +/// It's easier to work with owned `stream` here as we need to updgrade it to TLS; +/// we also take an extra care of propagating only the select handshake errors to client. async fn handshake( stream: S, mut tls: Option, @@ -119,7 +122,7 @@ async fn handshake( stream = PqStream::new(stream.into_inner().upgrade(tls).await?); } } - _ => bail!("protocol violation"), + _ => bail!(ERR_PROTO_VIOLATION), }, GssEncRequest => match stream.get_ref() { Stream::Raw { .. } if !tried_gss => { @@ -128,18 +131,21 @@ async fn handshake( // Currently, we don't support GSSAPI stream.write_message(&Be::EncryptionResponse(false)).await?; } - _ => bail!("protocol violation"), + _ => bail!(ERR_PROTO_VIOLATION), }, StartupMessage { params, .. } => { // Check that the config has been consumed during upgrade // OR we didn't provide it at all (for dev purposes). if tls.is_some() { - let msg = "connection is insecure (try using `sslmode=require`)"; - stream.write_message(&Be::ErrorResponse(msg)).await?; - bail!(msg); + stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; } - break Ok(Some((stream, params.try_into()?))); + // Here and forth: `or_else` demands that we use a future here + let creds = async { params.try_into() } + .or_else(|e| stream.throw_error(e)) + .await?; + + break Ok(Some((stream, creds))); } CancelRequest(cancel_key_data) => { cancel_map.cancel_session(cancel_key_data).await?; @@ -150,58 +156,60 @@ async fn handshake( } } -async fn connect_client_to_db( - config: &ProxyConfig, - session: cancellation::Session<'_>, - mut client: PqStream, +/// Thin connection context. +struct Client { + /// The underlying libpq protocol stream. + stream: PqStream, + /// Client credentials that we care about. creds: auth::ClientCredentials, -) -> anyhow::Result<()> { - let db_info = creds.authenticate(config, &mut client).await?; - let (db, version, cancel_closure) = connect_to_db(db_info).await?; - let cancel_key_data = session.enable_cancellation(cancel_closure); - - client - .write_message_noflush(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion(&version), - ))? - .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? - .write_message(&BeMessage::ReadyForQuery) - .await?; - - // This function will be called for writes to either direction. - fn inc_proxied(cnt: usize) { - // Consider inventing something more sophisticated - // if this ever becomes a bottleneck (cacheline bouncing). 
- NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64); - } - - let mut db = MetricsStream::new(db, inc_proxied); - let mut client = MetricsStream::new(client.into_inner(), inc_proxied); - let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; - - Ok(()) } -/// Connect to a corresponding compute node. -async fn connect_to_db( - db_info: DatabaseInfo, -) -> anyhow::Result<(TcpStream, String, CancelClosure)> { - // TODO: establish a secure connection to the DB - let socket_addr = db_info.socket_addr()?; - let mut socket = TcpStream::connect(socket_addr).await?; +impl Client { + /// Construct a new connection context. + fn new(stream: PqStream, creds: auth::ClientCredentials) -> Self { + Self { stream, creds } + } +} - let (client, conn) = tokio_postgres::Config::from(db_info) - .connect_raw(&mut socket, NoTls) - .await?; +impl Client { + /// Let the client authenticate and connect to the designated compute node. + async fn connect_to_db( + self, + config: &ProxyConfig, + session: cancellation::Session<'_>, + ) -> anyhow::Result<()> { + let Self { mut stream, creds } = self; - let version = conn - .parameter("server_version") - .context("failed to fetch postgres server version")? - .into(); + // Authenticate and connect to a compute node. + let auth = creds.authenticate(config, &mut stream).await; + let db_info = async { auth }.or_else(|e| stream.throw_error(e)).await?; - let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); + let (db, version, cancel_closure) = + db_info.connect().or_else(|e| stream.throw_error(e)).await?; + let cancel_key_data = session.enable_cancellation(cancel_closure); - Ok((socket, version, cancel_closure)) + stream + .write_message_noflush(&BeMessage::ParameterStatus( + BeParameterStatusMessage::ServerVersion(&version), + ))? + .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? + .write_message(&BeMessage::ReadyForQuery) + .await?; + + /// This function will be called for writes to either direction. + fn inc_proxied(cnt: usize) { + // Consider inventing something more sophisticated + // if this ever becomes a bottleneck (cacheline bouncing). + NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64); + } + + // Starting from here we only proxy the client's traffic. + let mut db = MetricsStream::new(db, inc_proxied); + let mut client = MetricsStream::new(stream.into_inner(), inc_proxied); + let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; + + Ok(()) + } } #[cfg(test)] @@ -210,7 +218,7 @@ mod tests { use tokio::io::DuplexStream; use tokio_postgres::config::SslMode; - use tokio_postgres::tls::MakeTlsConnect; + use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use tokio_postgres_rustls::MakeRustlsConnect; async fn dummy_proxy( @@ -264,7 +272,7 @@ mod tests { let proxy = tokio::spawn(dummy_proxy(client, Some(server_config.into()))); - tokio_postgres::Config::new() + let client_err = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) @@ -273,11 +281,15 @@ mod tests { .err() // -> Option .context("client shouldn't be able to connect")?; - proxy + assert!(client_err.to_string().contains(ERR_INSECURE_CONNECTION)); + + let server_err = proxy .await? .err() // -> Option .context("server shouldn't accept client")?; + assert!(client_err.to_string().contains(&server_err.to_string())); + Ok(()) } @@ -329,4 +341,30 @@ mod tests { proxy.await? 
} + + #[tokio::test] + async fn give_user_an_error_for_bad_creds() -> anyhow::Result<()> { + let (client, server) = tokio::io::duplex(1024); + + let proxy = tokio::spawn(dummy_proxy(client, None)); + + let client_err = tokio_postgres::Config::new() + .ssl_mode(SslMode::Disable) + .connect_raw(server, NoTls) + .await + .err() // -> Option + .context("client shouldn't be able to connect")?; + + // TODO: this is ugly, but `format!` won't allow us to extract fmt string + assert!(client_err.to_string().contains("missing in startup packet")); + + let server_err = proxy + .await? + .err() // -> Option + .context("server shouldn't accept client")?; + + assert!(client_err.to_string().contains(&server_err.to_string())); + + Ok(()) + } } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 8fd5bef388..fb0be84584 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,10 +1,12 @@ -use anyhow::Context; +use crate::error::UserFacingError; +use anyhow::bail; use bytes::BytesMut; use pin_project_lite::pin_project; use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; use std::{io, task}; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, ReadBuf}; use tokio_rustls::server::TlsStream; use zenith_utils::pq_proto::{BeMessage, FeMessage, FeStartupPacket}; @@ -35,38 +37,63 @@ impl PqStream { self.stream } - /// Get a reference to the underlying stream. + /// Get a shared reference to the underlying stream. pub fn get_ref(&self) -> &S { &self.stream } } +fn err_connection() -> io::Error { + io::Error::new(io::ErrorKind::ConnectionAborted, "connection is lost") +} + +// TODO: change error type of `FeMessage::read_fut` +fn from_anyhow(e: anyhow::Error) -> io::Error { + io::Error::new(io::ErrorKind::Other, e.to_string()) +} + impl PqStream { /// Receive [`FeStartupPacket`], which is a first packet sent by a client. - pub async fn read_startup_packet(&mut self) -> anyhow::Result { - match FeStartupPacket::read_fut(&mut self.stream).await? { - Some(FeMessage::StartupPacket(packet)) => Ok(packet), - None => anyhow::bail!("connection is lost"), - other => anyhow::bail!("bad message type: {:?}", other), + pub async fn read_startup_packet(&mut self) -> io::Result { + // TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket` + let msg = FeStartupPacket::read_fut(&mut self.stream) + .await + .map_err(from_anyhow)? + .ok_or_else(err_connection)?; + + match msg { + FeMessage::StartupPacket(packet) => Ok(packet), + _ => panic!("unreachable state"), } } - pub async fn read_message(&mut self) -> anyhow::Result { + pub async fn read_password_message(&mut self) -> io::Result { + match self.read_message().await? { + FeMessage::PasswordMessage(msg) => Ok(msg), + bad => Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unexpected message type: {:?}", bad), + )), + } + } + + async fn read_message(&mut self) -> io::Result { FeMessage::read_fut(&mut self.stream) - .await? - .context("connection is lost") + .await + .map_err(from_anyhow)? + .ok_or_else(err_connection) } } impl PqStream { /// Write the message into an internal buffer, but don't flush the underlying stream. - pub fn write_message_noflush<'a>(&mut self, message: &BeMessage<'a>) -> io::Result<&mut Self> { + pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { BeMessage::write(&mut self.buffer, message)?; Ok(self) } /// Write the message into an internal buffer and flush it. 
- pub async fn write_message<'a>(&mut self, message: &BeMessage<'a>) -> io::Result<&mut Self> { + pub async fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { self.write_message_noflush(message)?; self.flush().await?; Ok(self) @@ -79,6 +106,25 @@ impl PqStream { self.stream.flush().await?; Ok(self) } + + /// Write the error message using [`Self::write_message`], then re-throw it. + /// Allowing string literals is safe under the assumption they might not contain any runtime info. + pub async fn throw_error_str(&mut self, error: &'static str) -> anyhow::Result { + // This method exists due to `&str` not implementing `Into` + self.write_message(&BeMessage::ErrorResponse(error)).await?; + bail!(error) + } + + /// Write the error message using [`Self::write_message`], then re-throw it. + /// Trait [`UserFacingError`] acts as an allowlist for error types. + pub async fn throw_error(&mut self, error: E) -> anyhow::Result + where + E: UserFacingError + Into, + { + let msg = error.to_string_client(); + self.write_message(&BeMessage::ErrorResponse(&msg)).await?; + bail!(error) + } } pin_project! { @@ -101,15 +147,25 @@ impl Stream { } } +#[derive(Debug, Error)] +#[error("Can't upgrade TLS stream")] +pub enum StreamUpgradeError { + #[error("Bad state reached: can't upgrade TLS stream")] + AlreadyTls, + + #[error("Can't upgrade stream: IO error: {0}")] + Io(#[from] io::Error), +} + impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. - pub async fn upgrade(self, cfg: Arc) -> anyhow::Result { + pub async fn upgrade(self, cfg: Arc) -> Result { match self { Stream::Raw { raw } => { let tls = Box::new(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?); Ok(Stream::Tls { tls }) } - Stream::Tls { .. } => anyhow::bail!("can't upgrade TLS stream"), + Stream::Tls { .. 
} => Err(StreamUpgradeError::AlreadyTls), } } } diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 9fda3ed94f..799d45a165 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -1,11 +1,32 @@ -use anyhow::{anyhow, Context}; use hashbrown::HashMap; use parking_lot::Mutex; use pin_project_lite::pin_project; use std::pin::Pin; use std::task; +use thiserror::Error; use tokio::sync::oneshot; +#[derive(Debug, Error)] +pub enum RegisterError { + #[error("Waiter `{0}` already registered")] + Occupied(String), +} + +#[derive(Debug, Error)] +pub enum NotifyError { + #[error("Notify failed: waiter `{0}` not registered")] + NotFound(String), + + #[error("Notify failed: channel hangup")] + Hangup, +} + +#[derive(Debug, Error)] +pub enum WaitError { + #[error("Wait failed: channel hangup")] + Hangup, +} + pub struct Waiters(pub(self) Mutex>>); impl Default for Waiters { @@ -15,13 +36,13 @@ impl Default for Waiters { } impl Waiters { - pub fn register(&self, key: String) -> anyhow::Result> { + pub fn register(&self, key: String) -> Result, RegisterError> { let (tx, rx) = oneshot::channel(); self.0 .lock() .try_insert(key.clone(), tx) - .map_err(|_| anyhow!("waiter already registered"))?; + .map_err(|e| RegisterError::Occupied(e.entry.key().clone()))?; Ok(Waiter { receiver: rx, @@ -32,7 +53,7 @@ impl Waiters { }) } - pub fn notify(&self, key: &str, value: T) -> anyhow::Result<()> + pub fn notify(&self, key: &str, value: T) -> Result<(), NotifyError> where T: Send + Sync, { @@ -40,9 +61,9 @@ impl Waiters { .0 .lock() .remove(key) - .with_context(|| format!("key {} not found", key))?; + .ok_or_else(|| NotifyError::NotFound(key.to_string()))?; - tx.send(value).map_err(|_| anyhow!("waiter channel hangup")) + tx.send(value).map_err(|_| NotifyError::Hangup) } } @@ -66,13 +87,13 @@ pin_project! { } impl std::future::Future for Waiter<'_, T> { - type Output = anyhow::Result; + type Output = Result; fn poll(self: Pin<&mut Self>, cx: &mut task::Context<'_>) -> task::Poll { self.project() .receiver .poll(cx) - .map_err(|_| anyhow!("channel hangup")) + .map_err(|_| WaitError::Hangup) } } diff --git a/test_runner/README.md b/test_runner/README.md index 514c5f1e3a..a56c2df2c0 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -89,7 +89,7 @@ def test_foobar(zenith_env_builder: ZenithEnvBuilder): # Now create the environment. This initializes the repository, and starts # up the page server and the safekeepers - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Run the test ... 
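
The README hunk above switches the example from `zenith_env_builder.init()` to `init_start()`: as the fixture changes later in this patch show, `init()` now only prepares the repository, while `init_start()` also launches the pageserver and safekeepers. Branches are created by name (the ancestor appears to default to `main`, per `DEFAULT_BRANCH_NAME`), and `create_branch()` returns the new timeline id. Below is a minimal sketch of a test written against this flow; the test name is illustrative, and helper signatures follow the fixtures as they appear in this patch:

from fixtures.log_helper import log
from fixtures.zenith_fixtures import ZenithEnvBuilder


def test_fixture_flow_sketch(zenith_env_builder: ZenithEnvBuilder):
    # init_start() = init() + start(): create the repo, then launch the
    # pageserver and the requested number of safekeepers.
    zenith_env_builder.num_safekeepers = 1
    env = zenith_env_builder.init_start()

    # create_branch() takes a branch name (the ancestor defaults to 'main')
    # and returns the id of the newly created timeline.
    timeline_id = env.zenith_cli.create_branch('test_fixture_flow_sketch')
    log.info(f"created timeline {timeline_id}")

    # Compute nodes are created and started against a branch name.
    pg = env.postgres.create_start('test_fixture_flow_sketch')
    assert pg.safe_psql('SELECT 1')[0] == (1, )
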
diff --git a/test_runner/batch_others/test_auth.py b/test_runner/batch_others/test_auth.py index 7f86986e2e..bda6349ef9 100644 --- a/test_runner/batch_others/test_auth.py +++ b/test_runner/batch_others/test_auth.py @@ -1,14 +1,14 @@ from contextlib import closing from typing import Iterator from uuid import UUID, uuid4 -import psycopg2 from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithPageserverApiException +from requests.exceptions import HTTPError import pytest def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.pageserver_auth_enabled = True - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() ps = env.pageserver @@ -25,25 +25,31 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): ps.safe_psql("set FOO", password=tenant_token) ps.safe_psql("set FOO", password=management_token) + new_timeline_id = env.zenith_cli.create_branch('test_pageserver_auth', + tenant_id=env.initial_tenant) + # tenant can create branches - tenant_http_client.branch_create(env.initial_tenant, 'new1', 'main') + tenant_http_client.timeline_create(tenant_id=env.initial_tenant, + ancestor_timeline_id=new_timeline_id) # console can create branches for tenant - management_http_client.branch_create(env.initial_tenant, 'new2', 'main') + management_http_client.timeline_create(tenant_id=env.initial_tenant, + ancestor_timeline_id=new_timeline_id) # fail to create branch using token with different tenant_id with pytest.raises(ZenithPageserverApiException, match='Forbidden: Tenant id mismatch. Permission denied'): - invalid_tenant_http_client.branch_create(env.initial_tenant, "new3", "main") + invalid_tenant_http_client.timeline_create(tenant_id=env.initial_tenant, + ancestor_timeline_id=new_timeline_id) # create tenant using management token - management_http_client.tenant_create(uuid4()) + management_http_client.tenant_create() # fail to create tenant using tenant token with pytest.raises( ZenithPageserverApiException, match='Forbidden: Attempt to access management api with tenant scope. 
Permission denied' ): - tenant_http_client.tenant_create(uuid4()) + tenant_http_client.tenant_create() @pytest.mark.parametrize('with_wal_acceptors', [False, True]) @@ -51,11 +57,10 @@ def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_w zenith_env_builder.pageserver_auth_enabled = True if with_wal_acceptors: zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() - - branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}" - env.zenith_cli.create_branch(branch, "main") + env = zenith_env_builder.init_start() + branch = f'test_compute_auth_to_pageserver{with_wal_acceptors}' + env.zenith_cli.create_branch(branch) pg = env.postgres.create_start(branch) with closing(pg.connect()) as conn: diff --git a/test_runner/batch_others/test_backpressure.py b/test_runner/batch_others/test_backpressure.py index 23af5b90ed..ff34121327 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/batch_others/test_backpressure.py @@ -93,9 +93,9 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv def test_backpressure_received_lsn_lag(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Create a branch for us - env.zenith_cli.create_branch("test_backpressure", "main") + env.zenith_cli.create_branch('test_backpressure') pg = env.postgres.create_start('test_backpressure', config_lines=['max_replication_write_lag=30MB']) diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 860db51c8a..4e2be352f4 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -19,11 +19,10 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): # # See https://github.com/zenithdb/zenith/issues/1068 zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch("test_branch_behind", "main") - + env.zenith_cli.create_branch('test_branch_behind') pgmain = env.postgres.create_start('test_branch_behind') log.info("postgres is running on 'test_branch_behind' branch") @@ -60,7 +59,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 200100 rows: {lsn_b}') # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch("test_branch_behind_hundred", "test_branch_behind@" + lsn_a) + env.zenith_cli.create_branch('test_branch_behind_hundred', + 'test_branch_behind', + ancestor_start_lsn=lsn_a) # Insert many more rows. This generates enough WAL to fill a few segments. 
main_cur.execute(''' @@ -75,10 +76,12 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 400100 rows: {lsn_c}') # Branch at the point where only 200100 rows were inserted - env.zenith_cli.create_branch("test_branch_behind_more", "test_branch_behind@" + lsn_b) + env.zenith_cli.create_branch('test_branch_behind_more', + 'test_branch_behind', + ancestor_start_lsn=lsn_b) - pg_hundred = env.postgres.create_start("test_branch_behind_hundred") - pg_more = env.postgres.create_start("test_branch_behind_more") + pg_hundred = env.postgres.create_start('test_branch_behind_hundred') + pg_more = env.postgres.create_start('test_branch_behind_more') # On the 'hundred' branch, we should see only 100 rows hundred_pg_conn = pg_hundred.connect() @@ -99,19 +102,23 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): # Check bad lsn's for branching # branch at segment boundary - env.zenith_cli.create_branch("test_branch_segment_boundary", "test_branch_behind@0/3000000") - pg = env.postgres.create_start("test_branch_segment_boundary") + env.zenith_cli.create_branch('test_branch_segment_boundary', + 'test_branch_behind', + ancestor_start_lsn="0/3000000") + pg = env.postgres.create_start('test_branch_segment_boundary') cur = pg.connect().cursor() cur.execute('SELECT 1') assert cur.fetchone() == (1, ) # branch at pre-initdb lsn with pytest.raises(Exception, match="invalid branch start lsn"): - env.zenith_cli.create_branch("test_branch_preinitdb", "main@0/42") + env.zenith_cli.create_branch('test_branch_preinitdb', ancestor_start_lsn="0/42") # branch at pre-ancestor lsn with pytest.raises(Exception, match="less than timeline ancestor lsn"): - env.zenith_cli.create_branch("test_branch_preinitdb", "test_branch_behind@0/42") + env.zenith_cli.create_branch('test_branch_preinitdb', + 'test_branch_behind', + ancestor_start_lsn="0/42") # check that we cannot create branch based on garbage collected data with closing(env.pageserver.connect()) as psconn: @@ -123,7 +130,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): with pytest.raises(Exception, match="invalid branch start lsn"): # this gced_lsn is pretty random, so if gc is disabled this woudln't fail - env.zenith_cli.create_branch("test_branch_create_fail", f"test_branch_behind@{gced_lsn}") + env.zenith_cli.create_branch('test_branch_create_fail', + 'test_branch_behind', + ancestor_start_lsn=gced_lsn) # check that after gc everything is still there hundred_cur.execute('SELECT count(*) FROM foo') diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py index 504f455936..b7eeedb23e 100644 --- a/test_runner/batch_others/test_clog_truncate.py +++ b/test_runner/batch_others/test_clog_truncate.py @@ -12,7 +12,7 @@ from fixtures.log_helper import log # def test_clog_truncate(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_clog_truncate", "empty") + env.zenith_cli.create_branch('test_clog_truncate', 'empty') # set agressive autovacuum to make sure that truncation will happen config = [ @@ -62,9 +62,9 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv): # create new branch after clog truncation and start a compute node on it log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}') - env.zenith_cli.create_branch("test_clog_truncate_new", - "test_clog_truncate@" + lsn_after_truncation) - + env.zenith_cli.create_branch('test_clog_truncate_new', + 'test_clog_truncate', + 
ancestor_start_lsn=lsn_after_truncation) pg2 = env.postgres.create_start('test_clog_truncate_new') log.info('postgres is running on test_clog_truncate_new branch') diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/batch_others/test_createdropdb.py index 38243b298b..88937fa0dc 100644 --- a/test_runner/batch_others/test_createdropdb.py +++ b/test_runner/batch_others/test_createdropdb.py @@ -11,7 +11,7 @@ from fixtures.log_helper import log # def test_createdb(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_createdb", "empty") + env.zenith_cli.create_branch('test_createdb', 'empty') pg = env.postgres.create_start('test_createdb') log.info("postgres is running on 'test_createdb' branch") @@ -27,8 +27,7 @@ def test_createdb(zenith_simple_env: ZenithEnv): lsn = cur.fetchone()[0] # Create a branch - env.zenith_cli.create_branch("test_createdb2", "test_createdb@" + lsn) - + env.zenith_cli.create_branch('test_createdb2', 'test_createdb', ancestor_start_lsn=lsn) pg2 = env.postgres.create_start('test_createdb2') # Test that you can connect to the new database on both branches @@ -41,8 +40,7 @@ def test_createdb(zenith_simple_env: ZenithEnv): # def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir): env = zenith_simple_env - env.zenith_cli.create_branch("test_dropdb", "empty") - + env.zenith_cli.create_branch('test_dropdb', 'empty') pg = env.postgres.create_start('test_dropdb') log.info("postgres is running on 'test_dropdb' branch") @@ -66,10 +64,14 @@ def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir): lsn_after_drop = cur.fetchone()[0] # Create two branches before and after database drop. - env.zenith_cli.create_branch("test_before_dropdb", "test_dropdb@" + lsn_before_drop) + env.zenith_cli.create_branch('test_before_dropdb', + 'test_dropdb', + ancestor_start_lsn=lsn_before_drop) pg_before = env.postgres.create_start('test_before_dropdb') - env.zenith_cli.create_branch("test_after_dropdb", "test_dropdb@" + lsn_after_drop) + env.zenith_cli.create_branch('test_after_dropdb', + 'test_dropdb', + ancestor_start_lsn=lsn_after_drop) pg_after = env.postgres.create_start('test_after_dropdb') # Test that database exists on the branch before drop diff --git a/test_runner/batch_others/test_createuser.py b/test_runner/batch_others/test_createuser.py index 1959b47dcc..efb2af3f07 100644 --- a/test_runner/batch_others/test_createuser.py +++ b/test_runner/batch_others/test_createuser.py @@ -9,8 +9,7 @@ from fixtures.log_helper import log # def test_createuser(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_createuser", "empty") - + env.zenith_cli.create_branch('test_createuser', 'empty') pg = env.postgres.create_start('test_createuser') log.info("postgres is running on 'test_createuser' branch") @@ -25,8 +24,7 @@ def test_createuser(zenith_simple_env: ZenithEnv): lsn = cur.fetchone()[0] # Create a branch - env.zenith_cli.create_branch("test_createuser2", "test_createuser@" + lsn) - + env.zenith_cli.create_branch('test_createuser2', 'test_createuser', ancestor_start_lsn=lsn) pg2 = env.postgres.create_start('test_createuser2') # Test that you can connect to new branch as a new user diff --git a/test_runner/batch_others/test_multixact.py b/test_runner/batch_others/test_multixact.py index 6a2afd2ede..7a508a67fb 100644 --- a/test_runner/batch_others/test_multixact.py +++ b/test_runner/batch_others/test_multixact.py @@ -10,7 +10,7 @@ from fixtures.log_helper import log # def 
test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): env = zenith_simple_env - env.zenith_cli.create_branch("test_multixact", "empty") + env.zenith_cli.create_branch('test_multixact', 'empty') pg = env.postgres.create_start('test_multixact') log.info("postgres is running on 'test_multixact' branch") @@ -60,7 +60,7 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): assert int(next_multixact_id) > int(next_multixact_id_old) # Branch at this point - env.zenith_cli.create_branch("test_multixact_new", "test_multixact@" + lsn) + env.zenith_cli.create_branch('test_multixact_new', 'test_multixact', ancestor_start_lsn=lsn) pg_new = env.postgres.create_start('test_multixact_new') log.info("postgres is running on 'test_multixact_new' branch") diff --git a/test_runner/batch_others/test_next_xid.py b/test_runner/batch_others/test_next_xid.py index 625abc39d3..fd0f761409 100644 --- a/test_runner/batch_others/test_next_xid.py +++ b/test_runner/batch_others/test_next_xid.py @@ -11,7 +11,7 @@ from fixtures.log_helper import log def test_next_xid(zenith_env_builder: ZenithEnvBuilder): # One safekeeper is enough for this test. zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() pg = env.postgres.create_start('main') diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index eccffc4d69..2aa3686904 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,8 +1,15 @@ -import json from uuid import uuid4, UUID -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient -from typing import cast -import pytest, psycopg2 +import pytest +from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient, zenith_binpath + + +# test that we cannot override node id +def test_pageserver_init_node_id(zenith_env_builder: ZenithEnvBuilder): + env = zenith_env_builder.init() + with pytest.raises( + Exception, + match="node id can only be set during pageserver init and cannot be overridden"): + env.pageserver.start(overrides=['--pageserver-config-override=id=10']) def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): @@ -16,21 +23,25 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): client.tenant_create(tenant_id) assert tenant_id.hex in {t['id'] for t in client.tenant_list()} - # check its timelines + timelines = client.timeline_list(tenant_id) + assert len(timelines) == 0, "initial tenant should not have any timelines" + + # create timeline + timeline_id = uuid4() + client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id) + timelines = client.timeline_list(tenant_id) assert len(timelines) > 0 - for timeline_id_str in timelines: - timeline_details = client.timeline_detail(tenant_id, UUID(timeline_id_str)) - assert timeline_details['type'] == 'Local' - assert timeline_details['tenant_id'] == tenant_id.hex - assert timeline_details['timeline_id'] == timeline_id_str - - # create branch - branch_name = uuid4().hex - client.branch_create(tenant_id, branch_name, "main") # check it is there - assert branch_name in {b['name'] for b in client.branch_list(tenant_id)} + assert timeline_id.hex in {b['timeline_id'] for b in client.timeline_list(tenant_id)} + for timeline in timelines: + timeline_id_str = str(timeline['timeline_id']) + timeline_details = client.timeline_detail(tenant_id=tenant_id, + 
timeline_id=UUID(timeline_id_str)) + assert timeline_details['kind'] == 'Local' + assert timeline_details['tenant_id'] == tenant_id.hex + assert timeline_details['timeline_id'] == timeline_id_str def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): @@ -41,7 +52,7 @@ def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): def test_pageserver_http_api_client_auth_enabled(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.pageserver_auth_enabled = True - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() management_token = env.auth_keys.generate_management_token() diff --git a/test_runner/batch_others/test_pageserver_catchup.py b/test_runner/batch_others/test_pageserver_catchup.py index 97dc0f3260..7093a1bdb3 100644 --- a/test_runner/batch_others/test_pageserver_catchup.py +++ b/test_runner/batch_others/test_pageserver_catchup.py @@ -14,9 +14,9 @@ from fixtures.log_helper import log # and new compute node contains all data. def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_pageserver_catchup_while_compute_down", "main") + env.zenith_cli.create_branch('test_pageserver_catchup_while_compute_down') pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down') pg_conn = pg.connect() diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/batch_others/test_pageserver_restart.py index 0cfc50f0ff..57f9db8f96 100644 --- a/test_runner/batch_others/test_pageserver_restart.py +++ b/test_runner/batch_others/test_pageserver_restart.py @@ -13,9 +13,9 @@ from fixtures.log_helper import log def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder): # One safekeeper is enough for this test. 
zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_pageserver_restart", "main") + env.zenith_cli.create_branch('test_pageserver_restart') pg = env.postgres.create_start('test_pageserver_restart') pg_conn = pg.connect() diff --git a/test_runner/batch_others/test_parallel_copy.py b/test_runner/batch_others/test_parallel_copy.py index 6f87bc4a36..4b7cc58d42 100644 --- a/test_runner/batch_others/test_parallel_copy.py +++ b/test_runner/batch_others/test_parallel_copy.py @@ -1,7 +1,5 @@ from io import BytesIO import asyncio -import asyncpg -import subprocess from fixtures.zenith_fixtures import ZenithEnv, Postgres from fixtures.log_helper import log diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py index 9510e880b2..d2039f9758 100644 --- a/test_runner/batch_others/test_proxy.py +++ b/test_runner/batch_others/test_proxy.py @@ -1,2 +1,15 @@ +import pytest + + def test_proxy_select_1(static_proxy): static_proxy.safe_psql("select 1;") + + +@pytest.mark.xfail # Proxy eats the extra connection options +def test_proxy_options(static_proxy): + schema_name = "tmp_schema_1" + with static_proxy.connect(schema=schema_name) as conn: + with conn.cursor() as cur: + cur.execute("SHOW search_path;") + search_path = cur.fetchall()[0][0] + assert schema_name == search_path diff --git a/test_runner/batch_others/test_readonly_node.py b/test_runner/batch_others/test_readonly_node.py index ba256e71f7..808ee62def 100644 --- a/test_runner/batch_others/test_readonly_node.py +++ b/test_runner/batch_others/test_readonly_node.py @@ -11,8 +11,7 @@ from fixtures.zenith_fixtures import ZenithEnv # def test_readonly_node(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_readonly_node", "empty") - + env.zenith_cli.create_branch('test_readonly_node', 'empty') pgmain = env.postgres.create_start('test_readonly_node') log.info("postgres is running on 'test_readonly_node' branch") @@ -53,12 +52,14 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): log.info('LSN after 400100 rows: ' + lsn_c) # Create first read-only node at the point where only 100 rows were inserted - pg_hundred = env.postgres.create_start("test_readonly_node_hundred", - branch=f'test_readonly_node@{lsn_a}') + pg_hundred = env.postgres.create_start(branch_name='test_readonly_node', + node_name='test_readonly_node_hundred', + lsn=lsn_a) # And another at the point where 200100 rows were inserted - pg_more = env.postgres.create_start("test_readonly_node_more", - branch=f'test_readonly_node@{lsn_b}') + pg_more = env.postgres.create_start(branch_name='test_readonly_node', + node_name='test_readonly_node_more', + lsn=lsn_b) # On the 'hundred' node, we should see only 100 rows hundred_pg_conn = pg_hundred.connect() @@ -77,8 +78,9 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): assert main_cur.fetchone() == (400100, ) # Check creating a node at segment boundary - pg = env.postgres.create_start("test_branch_segment_boundary", - branch="test_readonly_node@0/3000000") + pg = env.postgres.create_start(branch_name='test_readonly_node', + node_name='test_branch_segment_boundary', + lsn='0/3000000') cur = pg.connect().cursor() cur.execute('SELECT 1') assert cur.fetchone() == (1, ) @@ -86,5 +88,6 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): # Create node at pre-initdb lsn with pytest.raises(Exception, match="invalid basebackup lsn"): # compute node startup with invalid LSN 
should fail - env.zenith_cli.pg_start("test_readonly_node_preinitdb", - timeline_spec="test_readonly_node@0/42") + env.postgres.create_start(branch_name='test_readonly_node', + node_name='test_readonly_node_preinitdb', + lsn='0/42') diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index fa6feaf412..edcc768819 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -42,8 +42,8 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, data_secret = 'very secret secret' ##### First start, insert secret data and upload it to the remote storage - env = zenith_env_builder.init() - pg = env.postgres.create_start() + env = zenith_env_builder.init_start() + pg = env.postgres.create_start('main') tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] @@ -85,7 +85,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, timeline_details = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) assert timeline_details['timeline_id'] == timeline_id assert timeline_details['tenant_id'] == tenant_id - if timeline_details['type'] == 'Local': + if timeline_details['kind'] == 'Local': log.info("timeline downloaded, checking its data") break attempts += 1 @@ -94,7 +94,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, log.debug("still waiting") time.sleep(1) - pg = env.postgres.create_start() + pg = env.postgres.create_start('main') with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute(f'SELECT secret FROM t1 WHERE id = {data_id};') diff --git a/test_runner/batch_others/test_restart_compute.py b/test_runner/batch_others/test_restart_compute.py index f7810be555..fd06561c00 100644 --- a/test_runner/batch_others/test_restart_compute.py +++ b/test_runner/batch_others/test_restart_compute.py @@ -13,10 +13,9 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor zenith_env_builder.pageserver_auth_enabled = True if with_wal_acceptors: zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() - - env.zenith_cli.create_branch("test_restart_compute", "main") + env = zenith_env_builder.init_start() + env.zenith_cli.create_branch('test_restart_compute') pg = env.postgres.create_start('test_restart_compute') log.info("postgres is running on 'test_restart_compute' branch") diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 5c6d78e730..7a9d478f16 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -122,21 +122,19 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, zenith_env_builder.num_safekeepers = 1 zenith_env_builder.enable_local_fs_remote_storage() - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # create folder for remote storage mock remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage' - tenant = env.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) + tenant = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) log.info("tenant to relocate %s", tenant) - env.zenith_cli.create_branch("test_tenant_relocation", "main", tenant_id=tenant) + env.zenith_cli.create_branch('test_tenant_relocation', tenant_id=tenant) - tenant_pg = env.postgres.create_start( - 
"test_tenant_relocation", - "main", # branch name, None means same as node name - tenant_id=tenant, - ) + tenant_pg = env.postgres.create_start(branch_name='main', + node_name='test_tenant_relocation', + tenant_id=tenant) # insert some data with closing(tenant_pg.connect()) as conn: diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py index 232c724870..e883018628 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/batch_others/test_tenants.py @@ -10,27 +10,23 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acce if with_wal_acceptors: zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() """Tests tenants with and without wal acceptors""" - tenant_1 = env.create_tenant() - tenant_2 = env.create_tenant() + tenant_1 = env.zenith_cli.create_tenant() + tenant_2 = env.zenith_cli.create_tenant() - env.zenith_cli.create_branch(f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - "main", - tenant_id=tenant_1) - env.zenith_cli.create_branch(f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - "main", - tenant_id=tenant_2) + env.zenith_cli.create_timeline( + f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', tenant_id=tenant_1) + env.zenith_cli.create_timeline( + f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', tenant_id=tenant_2) pg_tenant1 = env.postgres.create_start( - f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - None, # branch name, None means same as node name - tenant_1, + f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', + tenant_id=tenant_1, ) pg_tenant2 = env.postgres.create_start( - f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - None, # branch name, None means same as node name - tenant_2, + f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', + tenant_id=tenant_2, ) for pg in [pg_tenant1, pg_tenant2]: diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index b48f830528..7d8ab551b0 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -10,10 +10,10 @@ import time def test_timeline_size(zenith_simple_env: ZenithEnv): env = zenith_simple_env # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch("test_timeline_size", "empty") + new_timeline_id = env.zenith_cli.create_branch('test_timeline_size', 'empty') client = env.pageserver.http_client() - res = client.branch_detail(env.initial_tenant, "test_timeline_size") + res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) assert res["current_logical_size"] == res["current_logical_size_non_incremental"] pgmain = env.postgres.create_start("test_timeline_size") @@ -31,11 +31,11 @@ def test_timeline_size(zenith_simple_env: ZenithEnv): FROM generate_series(1, 10) g """) - res = client.branch_detail(env.initial_tenant, "test_timeline_size") + res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) assert res["current_logical_size"] == res["current_logical_size_non_incremental"] cur.execute("TRUNCATE foo") - res = client.branch_detail(env.initial_tenant, "test_timeline_size") + res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) assert res["current_logical_size"] == res["current_logical_size_non_incremental"] 
@@ -67,18 +67,17 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() - env.zenith_cli.create_branch("test_timeline_size_quota", "main") + env = zenith_env_builder.init_start() + new_timeline_id = env.zenith_cli.create_branch('test_timeline_size_quota') client = env.pageserver.http_client() - res = client.branch_detail(env.initial_tenant, "test_timeline_size_quota") + res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) assert res["current_logical_size"] == res["current_logical_size_non_incremental"] pgmain = env.postgres.create_start( "test_timeline_size_quota", # Set small limit for the test - config_lines=['zenith.max_cluster_size=30MB'], - ) + config_lines=['zenith.max_cluster_size=30MB']) log.info("postgres is running on 'test_timeline_size_quota' branch") with closing(pgmain.connect()) as conn: diff --git a/test_runner/batch_others/test_twophase.py b/test_runner/batch_others/test_twophase.py index d6a1cd01e8..4afdc7e0be 100644 --- a/test_runner/batch_others/test_twophase.py +++ b/test_runner/batch_others/test_twophase.py @@ -10,7 +10,6 @@ from fixtures.log_helper import log def test_twophase(zenith_simple_env: ZenithEnv): env = zenith_simple_env env.zenith_cli.create_branch("test_twophase", "empty") - pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5']) log.info("postgres is running on 'test_twophase' branch") diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 4d9e18bb58..bdc4c4f63c 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -13,7 +13,7 @@ from dataclasses import dataclass, field from multiprocessing import Process, Value from pathlib import Path from fixtures.zenith_fixtures import PgBin, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol -from fixtures.utils import lsn_to_hex, mkdir_if_needed +from fixtures.utils import lsn_to_hex, mkdir_if_needed, lsn_from_hex from fixtures.log_helper import log from typing import List, Optional, Any @@ -22,10 +22,9 @@ from typing import List, Optional, Any # succeed and data is written def test_normal_work(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() - - env.zenith_cli.create_branch("test_wal_acceptors_normal_work", "main") + env = zenith_env_builder.init_start() + env.zenith_cli.create_branch('test_wal_acceptors_normal_work') pg = env.postgres.create_start('test_wal_acceptors_normal_work') with closing(pg.connect()) as conn: @@ -39,9 +38,9 @@ def test_normal_work(zenith_env_builder: ZenithEnvBuilder): @dataclass -class BranchMetrics: - name: str - latest_valid_lsn: int +class TimelineMetrics: + timeline_id: str + last_record_lsn: int # One entry per each Safekeeper, order is the same flush_lsns: List[int] = field(default_factory=list) commit_lsns: List[int] = field(default_factory=list) @@ -51,27 +50,36 @@ class BranchMetrics: # against different timelines. 
def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() n_timelines = 3 - branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)] + branch_names = [ + "test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines) + ] + # pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418') + # that's not really human readable, so the branch names are introduced in Zenith CLI. + # Zenith CLI stores its branch <-> timeline mapping in its internals, + # but we need this to collect metrics from other servers, related to the timeline. + branch_names_to_timeline_ids = {} # start postgres on each timeline pgs = [] - for branch in branches: - env.zenith_cli.create_branch(branch, "main") - pgs.append(env.postgres.create_start(branch)) + for branch_name in branch_names: + new_timeline_id = env.zenith_cli.create_branch(branch_name) + pgs.append(env.postgres.create_start(branch_name)) + branch_names_to_timeline_ids[branch_name] = new_timeline_id tenant_id = env.initial_tenant - def collect_metrics(message: str) -> List[BranchMetrics]: + def collect_metrics(message: str) -> List[TimelineMetrics]: with env.pageserver.http_client() as pageserver_http: - branch_details = [ - pageserver_http.branch_detail(tenant_id=tenant_id, name=branch) - for branch in branches + timeline_details = [ + pageserver_http.timeline_detail( + tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name]) + for branch_name in branch_names ] - # All changes visible to pageserver (latest_valid_lsn) should be + # All changes visible to pageserver (last_record_lsn) should be # confirmed by safekeepers first. As we cannot atomically get # state of both pageserver and safekeepers, we should start with # pageserver. Looking at outdated data from pageserver is ok. @@ -80,14 +88,14 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): # safekeepers' state, it will look contradictory. sk_metrics = [sk.http_client().get_metrics() for sk in env.safekeepers] - branch_metrics = [] + timeline_metrics = [] with env.pageserver.http_client() as pageserver_http: - for branch_detail in branch_details: - timeline_id: str = branch_detail["timeline_id"] + for timeline_detail in timeline_details: + timeline_id: str = timeline_detail["timeline_id"] - m = BranchMetrics( - name=branch_detail["name"], - latest_valid_lsn=branch_detail["latest_valid_lsn"], + m = TimelineMetrics( + timeline_id=timeline_id, + last_record_lsn=lsn_from_hex(timeline_detail["last_record_lsn"]), ) for sk_m in sk_metrics: m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)]) @@ -99,13 +107,13 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): # We only call collect_metrics() after a transaction is confirmed by # the compute node, which only happens after a consensus of safekeepers # has confirmed the transaction. We assume majority consensus here. 
- assert (2 * sum(m.latest_valid_lsn <= lsn + assert (2 * sum(m.last_record_lsn <= lsn for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers) - assert (2 * sum(m.latest_valid_lsn <= lsn + assert (2 * sum(m.last_record_lsn <= lsn for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers) - branch_metrics.append(m) - log.info(f"{message}: {branch_metrics}") - return branch_metrics + timeline_metrics.append(m) + log.info(f"{message}: {timeline_metrics}") + return timeline_metrics # TODO: https://github.com/zenithdb/zenith/issues/809 # collect_metrics("before CREATE TABLE") @@ -117,7 +125,7 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): pg.safe_psql("CREATE TABLE t(key int primary key, value text)") init_m = collect_metrics("after CREATE TABLE") - # Populate data for 2/3 branches + # Populate data for 2/3 timelines class MetricsChecker(threading.Thread): def __init__(self) -> None: super().__init__(daemon=True) @@ -155,15 +163,15 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): collect_metrics("after INSERT INTO") - # Check data for 2/3 branches + # Check data for 2/3 timelines for pg in pgs[:-1]: res = pg.safe_psql("SELECT sum(key) FROM t") assert res[0] == (5000050000, ) final_m = collect_metrics("after SELECT") - # Assume that LSNs (a) behave similarly in all branches; and (b) INSERT INTO alters LSN significantly. + # Assume that LSNs (a) behave similarly in all timelines; and (b) INSERT INTO alters LSN significantly. # Also assume that safekeepers will not be significantly out of sync in this test. - middle_lsn = (init_m[0].latest_valid_lsn + final_m[0].latest_valid_lsn) // 2 + middle_lsn = (init_m[0].last_record_lsn + final_m[0].last_record_lsn) // 2 assert max(init_m[0].flush_lsns) < middle_lsn < min(final_m[0].flush_lsns) assert max(init_m[0].commit_lsns) < middle_lsn < min(final_m[0].commit_lsns) assert max(init_m[1].flush_lsns) < middle_lsn < min(final_m[1].flush_lsns) @@ -181,9 +189,9 @@ def test_restarts(zenith_env_builder: ZenithEnvBuilder): n_acceptors = 3 zenith_env_builder.num_safekeepers = n_acceptors - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_acceptors_restarts", "main") + env.zenith_cli.create_branch('test_wal_acceptors_restarts') pg = env.postgres.create_start('test_wal_acceptors_restarts') # we rely upon autocommit after each statement @@ -218,9 +226,9 @@ def delayed_wal_acceptor_start(wa): # When majority of acceptors is offline, commits are expected to be frozen def test_unavailability(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 2 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_acceptors_unavailability", "main") + env.zenith_cli.create_branch('test_wal_acceptors_unavailability') pg = env.postgres.create_start('test_wal_acceptors_unavailability') # we rely upon autocommit after each statement @@ -289,9 +297,9 @@ def stop_value(): def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value): zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_acceptors_race_conditions", "main") + env.zenith_cli.create_branch('test_wal_acceptors_race_conditions') pg = env.postgres.create_start('test_wal_acceptors_race_conditions') # we rely upon autocommit after each statement @@ -404,7 +412,7 @@ def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, # 
We don't really need the full environment for this test, just the # safekeepers would be enough. zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() timeline_id = uuid.uuid4() tenant_id = uuid.uuid4() @@ -454,9 +462,9 @@ def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_timeline_status", "main") + env.zenith_cli.create_branch('test_timeline_status') pg = env.postgres.create_start('test_timeline_status') wa = env.safekeepers[0] @@ -521,12 +529,7 @@ class SafekeeperEnv: http=self.port_distributor.get_port(), ) - if self.num_safekeepers == 1: - name = "single" - else: - name = f"sk{i}" - - safekeeper_dir = os.path.join(self.repo_dir, name) + safekeeper_dir = os.path.join(self.repo_dir, f"sk{i}") mkdir_if_needed(safekeeper_dir) args = [ @@ -537,6 +540,8 @@ class SafekeeperEnv: f"127.0.0.1:{port.http}", "-D", safekeeper_dir, + "--id", + str(i), "--daemonize" ] @@ -604,9 +609,8 @@ def test_safekeeper_without_pageserver(test_output_dir: str, def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): - def safekeepers_guc(env: ZenithEnv, sk_names: List[str]) -> str: - return ','.join( - [f'localhost:{sk.port.pg}' for sk in env.safekeepers if sk.name in sk_names]) + def safekeepers_guc(env: ZenithEnv, sk_names: List[int]) -> str: + return ','.join([f'localhost:{sk.port.pg}' for sk in env.safekeepers if sk.id in sk_names]) def execute_payload(pg: Postgres): with closing(pg.connect()) as conn: @@ -628,17 +632,17 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): http_cli = sk.http_client() try: status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"Safekeeper {sk.name} status: {status}") + log.info(f"Safekeeper {sk.id} status: {status}") except Exception as e: - log.info(f"Safekeeper {sk.name} status error: {e}") + log.info(f"Safekeeper {sk.id} status error: {e}") zenith_env_builder.num_safekeepers = 4 - env = zenith_env_builder.init() - env.zenith_cli.create_branch("test_replace_safekeeper", "main") + env = zenith_env_builder.init_start() + env.zenith_cli.create_branch('test_replace_safekeeper') log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() - active_safekeepers = ['sk1', 'sk2', 'sk3'] + active_safekeepers = [1, 2, 3] pg = env.postgres.create('test_replace_safekeeper') pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers)) pg.start() @@ -678,7 +682,7 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): log.info("Recreate postgres to replace failed sk1 with new sk4") pg.stop_and_destroy().create('test_replace_safekeeper') - active_safekeepers = ['sk2', 'sk3', 'sk4'] + active_safekeepers = [2, 3, 4] env.safekeepers[3].start() pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers)) pg.start() diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index 1d2a186eb7..31ace7eab3 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -200,9 +200,9 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_w # restart acceptors one by one, while executing and validating bank transactions def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder): 
zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_acceptors_restarts_under_load", "main") + env.zenith_cli.create_branch('test_wal_acceptors_restarts_under_load') pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load') asyncio.run(run_restarts_under_load(pg, env.safekeepers)) diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_zenith_cli.py index ce051dfd6e..4a62a1430a 100644 --- a/test_runner/batch_others/test_zenith_cli.py +++ b/test_runner/batch_others/test_zenith_cli.py @@ -7,52 +7,46 @@ from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserv from typing import cast -def helper_compare_branch_list(pageserver_http_client: ZenithPageserverHttpClient, - env: ZenithEnv, - initial_tenant: uuid.UUID): +def helper_compare_timeline_list(pageserver_http_client: ZenithPageserverHttpClient, + env: ZenithEnv, + initial_tenant: uuid.UUID): """ - Compare branches list returned by CLI and directly via API. - Filters out branches created by other tests. + Compare timelines list returned by CLI and directly via API. + Filters out timelines created by other tests. """ - branches = pageserver_http_client.branch_list(initial_tenant) - branches_api = sorted(map(lambda b: cast(str, b['name']), branches)) - branches_api = [b for b in branches_api if b.startswith('test_cli_') or b in ('empty', 'main')] - res = env.zenith_cli.list_branches() - branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n"))) - branches_cli = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')] + timelines_api = sorted( + map(lambda t: cast(str, t['timeline_id']), + pageserver_http_client.timeline_list(initial_tenant))) - res = env.zenith_cli.list_branches(tenant_id=initial_tenant) - branches_cli_with_tenant_arg = sorted( - map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n"))) - branches_cli_with_tenant_arg = [ - b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main') - ] + timelines_cli = env.zenith_cli.list_timelines() + assert timelines_cli == env.zenith_cli.list_timelines(initial_tenant) - assert branches_api == branches_cli == branches_cli_with_tenant_arg + cli_timeline_ids = sorted([timeline_id for (_, timeline_id) in timelines_cli]) + assert timelines_api == cli_timeline_ids -def test_cli_branch_list(zenith_simple_env: ZenithEnv): +def test_cli_timeline_list(zenith_simple_env: ZenithEnv): env = zenith_simple_env pageserver_http_client = env.pageserver.http_client() # Initial sanity check - helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant) - env.zenith_cli.create_branch("test_cli_branch_list_main", "empty") - helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant) + helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) + + # Create a branch for us + main_timeline_id = env.zenith_cli.create_branch('test_cli_branch_list_main') + helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a nested branch - res = env.zenith_cli.create_branch("test_cli_branch_list_nested", "test_cli_branch_list_main") - assert res.stderr == '' - helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant) + nested_timeline_id = env.zenith_cli.create_branch('test_cli_branch_list_nested', + 'test_cli_branch_list_main') + 
helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Check that all new branches are visible via CLI - res = env.zenith_cli.list_branches() - assert res.stderr == '' - branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n"))) + timelines_cli = [timeline_id for (_, timeline_id) in env.zenith_cli.list_timelines()] - assert 'test_cli_branch_list_main' in branches_cli - assert 'test_cli_branch_list_nested' in branches_cli + assert main_timeline_id.hex in timelines_cli + assert nested_timeline_id.hex in timelines_cli def helper_compare_tenant_list(pageserver_http_client: ZenithPageserverHttpClient, env: ZenithEnv): @@ -60,7 +54,6 @@ def helper_compare_tenant_list(pageserver_http_client: ZenithPageserverHttpClien tenants_api = sorted(map(lambda t: cast(str, t['id']), tenants)) res = env.zenith_cli.list_tenants() - assert res.stderr == '' tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) assert tenants_api == tenants_cli @@ -73,15 +66,13 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv): helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant - tenant1 = uuid.uuid4() - env.zenith_cli.create_tenant(tenant1) + tenant1 = env.zenith_cli.create_tenant() # check tenant1 appeared helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant - tenant2 = uuid.uuid4() - env.zenith_cli.create_tenant(tenant2) + tenant2 = env.zenith_cli.create_tenant() # check tenant2 appeared helper_compare_tenant_list(pageserver_http_client, env) @@ -97,7 +88,7 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv): def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): # Start with single sk zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Connect to sk port on v4 loopback res = requests.get(f'http://127.0.0.1:{env.safekeepers[0].port.http}/v1/status') @@ -114,7 +105,7 @@ def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): def test_cli_start_stop(zenith_env_builder: ZenithEnvBuilder): # Start with single sk zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Stop default ps/sk env.zenith_cli.pageserver_stop() diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 570c787184..750b02c894 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -64,9 +64,8 @@ class ZenithCompare(PgCompare): self._pg_bin = pg_bin # We only use one branch and one timeline - self.branch = branch_name - self.env.zenith_cli.create_branch(self.branch, "empty") - self._pg = self.env.postgres.create_start(self.branch) + self.env.zenith_cli.create_branch(branch_name, 'empty') + self._pg = self.env.postgres.create_start(branch_name) self.timeline = self.pg.safe_psql("SHOW zenith.zenith_timeline")[0][0] # Long-lived cursor, useful for flushing diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index b4b3de1db3..ec570a7dac 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1,6 +1,6 @@ from __future__ import annotations -from dataclasses import dataclass, field +from dataclasses import field import textwrap from cached_property import cached_property import asyncpg @@ -27,9 +27,8 @@ from dataclasses import dataclass # Type-related stuff from psycopg2.extensions import connection as 
PgConnection -from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple +from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, TypeVar, cast, Union, Tuple from typing_extensions import Literal -import pytest import requests import backoff # type: ignore @@ -58,6 +57,7 @@ Fn = TypeVar('Fn', bound=Callable[..., Any]) DEFAULT_OUTPUT_DIR = 'test_output' DEFAULT_POSTGRES_DIR = 'tmp_install' +DEFAULT_BRANCH_NAME = 'main' BASE_PORT = 15000 WORKER_PORT_NUM = 100 @@ -219,7 +219,7 @@ def can_bind(host: str, port: int) -> bool: class PortDistributor: - def __init__(self, base_port: int, port_number: int) -> None: + def __init__(self, base_port: int, port_number: int): self.iterator = iter(range(base_port, base_port + port_number)) def get_port(self) -> int: @@ -242,15 +242,20 @@ class PgProtocol: host: str, port: int, username: Optional[str] = None, - password: Optional[str] = None): + password: Optional[str] = None, + dbname: Optional[str] = None, + schema: Optional[str] = None): self.host = host self.port = port self.username = username self.password = password + self.dbname = dbname + self.schema = schema def connstr(self, *, - dbname: str = 'postgres', + dbname: Optional[str] = None, + schema: Optional[str] = None, username: Optional[str] = None, password: Optional[str] = None) -> str: """ @@ -259,6 +264,8 @@ class PgProtocol: username = username or self.username password = password or self.password + dbname = dbname or self.dbname or "postgres" + schema = schema or self.schema res = f'host={self.host} port={self.port} dbname={dbname}' if username: @@ -267,13 +274,17 @@ class PgProtocol: if password: res = f'{res} password={password}' + if schema: + res = f"{res} options='-c search_path={schema}'" + return res # autocommit=True here by default because that's what we need most of the time def connect(self, *, autocommit=True, - dbname: str = 'postgres', + dbname: Optional[str] = None, + schema: Optional[str] = None, username: Optional[str] = None, password: Optional[str] = None) -> PgConnection: """ @@ -282,11 +293,13 @@ class PgProtocol: This method passes all extra params to connstr. """ - conn = psycopg2.connect(self.connstr( - dbname=dbname, - username=username, - password=password, - )) + conn = psycopg2.connect( + self.connstr( + dbname=dbname, + schema=schema, + username=username, + password=password, + )) # WARNING: this setting affects *all* tests! conn.autocommit = autocommit return conn @@ -411,7 +424,8 @@ class ZenithEnvBuilder: pageserver_config_override: Optional[str] = None, num_safekeepers: int = 0, pageserver_auth_enabled: bool = False, - rust_log_override: Optional[str] = None): + rust_log_override: Optional[str] = None, + default_branch_name=DEFAULT_BRANCH_NAME): self.repo_dir = repo_dir self.rust_log_override = rust_log_override self.port_distributor = port_distributor @@ -419,6 +433,7 @@ class ZenithEnvBuilder: self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers self.pageserver_auth_enabled = pageserver_auth_enabled + self.default_branch_name = default_branch_name self.env: Optional[ZenithEnv] = None self.s3_mock_server: Optional[MockS3Server] = None @@ -434,6 +449,14 @@ class ZenithEnvBuilder: self.env = ZenithEnv(self) return self.env + def start(self): + self.env.start() + + def init_start(self) -> ZenithEnv: + env = self.init() + self.start() + return env + """ Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path. 
Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. @@ -515,7 +538,7 @@ class ZenithEnv: initial_tenant - tenant ID of the initial tenant created in the repository - zenith_cli() - zenith_cli() can be used to run the 'zenith' CLI tool + zenith_cli - can be used to run the 'zenith' CLI tool create_tenant() - initializes a new tenant in the page server, returns the tenant id @@ -526,9 +549,7 @@ class ZenithEnv: self.port_distributor = config.port_distributor self.s3_mock_server = config.s3_mock_server self.zenith_cli = ZenithCli(env=self) - self.postgres = PostgresFactory(self) - self.safekeepers: List[Safekeeper] = [] # generate initial tenant ID here instead of letting 'zenith init' generate it, @@ -537,7 +558,7 @@ class ZenithEnv: # Create a config file corresponding to the options toml = textwrap.dedent(f""" - default_tenantid = '{self.initial_tenant.hex}' + default_tenant_id = '{self.initial_tenant.hex}' """) # Create config for pageserver @@ -549,6 +570,7 @@ class ZenithEnv: toml += textwrap.dedent(f""" [pageserver] + id=1 listen_pg_addr = 'localhost:{pageserver_port.pg}' listen_http_addr = 'localhost:{pageserver_port.http}' auth_type = '{pageserver_auth_type}' @@ -566,25 +588,21 @@ class ZenithEnv: pg=self.port_distributor.get_port(), http=self.port_distributor.get_port(), ) - - if config.num_safekeepers == 1: - name = "single" - else: - name = f"sk{i}" - toml += f""" -[[safekeepers]] -name = '{name}' -pg_port = {port.pg} -http_port = {port.http} -sync = false # Disable fsyncs to make the tests go faster - """ - safekeeper = Safekeeper(env=self, name=name, port=port) + id = i # assign ids sequentially + toml += textwrap.dedent(f""" + [[safekeepers]] + id = {id} + pg_port = {port.pg} + http_port = {port.http} + sync = false # Disable fsyncs to make the tests go faster + """) + safekeeper = Safekeeper(env=self, id=id, port=port) self.safekeepers.append(safekeeper) log.info(f"Config: {toml}") - self.zenith_cli.init(toml) + def start(self): # Start up the page server and all the safekeepers self.pageserver.start() @@ -595,12 +613,6 @@ sync = false # Disable fsyncs to make the tests go faster """ Get list of safekeeper endpoints suitable for wal_acceptors GUC """ return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers]) - def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: - if tenant_id is None: - tenant_id = uuid.uuid4() - self.zenith_cli.create_tenant(tenant_id) - return tenant_id - @cached_property def auth_keys(self) -> AuthKeys: pub = (Path(self.repo_dir) / 'auth_public_key.pem').read_bytes() @@ -624,13 +636,11 @@ def _shared_simple_env(request: Any, port_distributor) -> Iterator[ZenithEnv]: shutil.rmtree(repo_dir, ignore_errors=True) with ZenithEnvBuilder(Path(repo_dir), port_distributor) as builder: - - env = builder.init() + env = builder.init_start() # For convenience in tests, create a branch from the freshly-initialized cluster. - env.zenith_cli.create_branch("empty", "main") + env.zenith_cli.create_branch('empty', ancestor_branch_name=DEFAULT_BRANCH_NAME) - # Return the builder to the caller yield env @@ -659,7 +669,7 @@ def zenith_env_builder(test_output_dir, port_distributor) -> Iterator[ZenithEnvB To use, define 'zenith_env_builder' fixture in your test to get access to the builder object. Set properties on it to describe the environment. Finally, initialize and start up the environment by calling - zenith_env_builder.init(). + zenith_env_builder.init_start(). 
After the initialization, you can launch compute nodes by calling the functions in the 'env.postgres' factory object, stop/start the @@ -679,7 +689,7 @@ class ZenithPageserverApiException(Exception): class ZenithPageserverHttpClient(requests.Session): - def __init__(self, port: int, auth_token: Optional[str] = None) -> None: + def __init__(self, port: int, auth_token: Optional[str] = None): super().__init__() self.port = port self.auth_token = auth_token @@ -702,38 +712,36 @@ class ZenithPageserverHttpClient(requests.Session): def timeline_attach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): res = self.post( - f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/attach", ) + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/attach", + ) self.verbose_error(res) def timeline_detach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): res = self.post( - f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/detach", ) - self.verbose_error(res) - - def branch_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def branch_create(self, tenant_id: uuid.UUID, name: str, start_point: str) -> Dict[Any, Any]: - res = self.post(f"http://localhost:{self.port}/v1/branch", - json={ - 'tenant_id': tenant_id.hex, - 'name': name, - 'start_point': start_point, - }) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def branch_detail(self, tenant_id: uuid.UUID, name: str) -> Dict[Any, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}/{name}?include-non-incremental-logical-size=1", + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/detach", ) self.verbose_error(res) + + def timeline_create( + self, + tenant_id: uuid.UUID, + new_timeline_id: Optional[uuid.UUID] = None, + ancestor_timeline_id: Optional[uuid.UUID] = None, + ancestor_start_lsn: Optional[str] = None, + ) -> Dict[Any, Any]: + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline", + json={ + 'new_timeline_id': + new_timeline_id.hex if new_timeline_id else None, + 'ancestor_start_lsn': + ancestor_start_lsn, + 'ancestor_timeline_id': + ancestor_timeline_id.hex if ancestor_timeline_id else None, + }) + self.verbose_error(res) + if res.status_code == 409: + raise Exception(f'could not create timeline: already exists for id {new_timeline_id}') + res_json = res.json() assert isinstance(res_json, dict) return res_json @@ -745,18 +753,22 @@ class ZenithPageserverHttpClient(requests.Session): assert isinstance(res_json, list) return res_json - def tenant_create(self, tenant_id: uuid.UUID): + def tenant_create(self, new_tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ - 'tenant_id': tenant_id.hex, + 'new_tenant_id': new_tenant_id.hex if new_tenant_id else None, }, ) self.verbose_error(res) - return res.json() + if res.status_code == 409: + raise Exception(f'could not create tenant: already exists for id {new_tenant_id}') + new_tenant_id = res.json() + assert isinstance(new_tenant_id, str) + return uuid.UUID(new_tenant_id) - def timeline_list(self, tenant_id: uuid.UUID) -> List[str]: - res = self.get(f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}") + def 
timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline") self.verbose_error(res) res_json = res.json() assert isinstance(res_json, list) @@ -764,7 +776,8 @@ class ZenithPageserverHttpClient(requests.Session): def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: res = self.get( - f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}") + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1" + ) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) @@ -798,57 +811,127 @@ class S3Storage: RemoteStorage = Union[LocalFsStorage, S3Storage] +CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P<timeline_id>[^']+)'", + re.MULTILINE) +TIMELINE_DATA_EXTRACTOR = re.compile(r"\s(?P<branch_name>[^\s]+)\s\[(?P<timeline_id>[^\]]+)\]", + re.MULTILINE) + class ZenithCli: """ A typed wrapper around the `zenith` CLI tool. Supports main commands via typed methods and a way to run arbitrary command directly via CLI. """ - def __init__(self, env: ZenithEnv) -> None: + def __init__(self, env: ZenithEnv): self.env = env pass def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: + """ + Creates a new tenant, returns its id and its initial timeline's id. + """ if tenant_id is None: tenant_id = uuid.uuid4() - self.raw_cli(['tenant', 'create', tenant_id.hex]) + res = self.raw_cli(['tenant', 'create', '--tenant-id', tenant_id.hex]) + res.check_returncode() return tenant_id def list_tenants(self) -> 'subprocess.CompletedProcess[str]': - return self.raw_cli(['tenant', 'list']) + res = self.raw_cli(['tenant', 'list']) + res.check_returncode() + return res + + def create_timeline(self, + new_branch_name: str, + tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: + cmd = [ + 'timeline', + 'create', + '--branch-name', + new_branch_name, + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + ] + + res = self.raw_cli(cmd) + res.check_returncode() + + matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) + + created_timeline_id = None + if matches is not None: + created_timeline_id = matches.group('timeline_id') + + return uuid.UUID(created_timeline_id) def create_branch(self, - branch_name: str, - starting_point: str, - tenant_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]': - args = ['branch'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) - args.extend([branch_name, starting_point]) + new_branch_name: str = DEFAULT_BRANCH_NAME, + ancestor_branch_name: Optional[str] = None, + tenant_id: Optional[uuid.UUID] = None, + ancestor_start_lsn: Optional[str] = None) -> uuid.UUID: + cmd = [ + 'timeline', + 'branch', + '--branch-name', + new_branch_name, + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + ] + if ancestor_branch_name is not None: + cmd.extend(['--ancestor-branch-name', ancestor_branch_name]) + if ancestor_start_lsn is not None: + cmd.extend(['--ancestor-start-lsn', ancestor_start_lsn]) - return self.raw_cli(args) + res = self.raw_cli(cmd) + res.check_returncode() - def list_branches(self, - tenant_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]': - args = ['branch'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) - return self.raw_cli(args) + matches =
CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) - def init(self, config_toml: str) -> 'subprocess.CompletedProcess[str]': + created_timeline_id = None + if matches is not None: + created_timeline_id = matches.group('timeline_id') + + if created_timeline_id is None: + raise Exception('could not find timeline id after `zenith timeline create` invocation') + else: + return uuid.UUID(created_timeline_id) + + def list_timelines(self, tenant_id: Optional[uuid.UUID] = None) -> List[Tuple[str, str]]: + """ + Returns a list of (branch_name, timeline_id) tuples out of parsed `zenith timeline list` CLI output. + """ + + # (L) main [b49f7954224a0ad25cc0013ea107b54b] + # (L) ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] + res = self.raw_cli( + ['timeline', 'list', '--tenant-id', (tenant_id or self.env.initial_tenant).hex]) + timelines_cli = sorted( + map(lambda branch_and_id: (branch_and_id[0], branch_and_id[1]), + TIMELINE_DATA_EXTRACTOR.findall(res.stdout))) + return timelines_cli + + def init(self, + config_toml: str, + initial_timeline_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]': with tempfile.NamedTemporaryFile(mode='w+') as tmp: tmp.write(config_toml) tmp.flush() cmd = ['init', f'--config={tmp.name}'] + if initial_timeline_id: + cmd.extend(['--timeline-id', initial_timeline_id.hex]) append_pageserver_param_overrides(cmd, self.env.pageserver.remote_storage, self.env.pageserver.config_override) - return self.raw_cli(cmd) + res = self.raw_cli(cmd) + res.check_returncode() + return res - def pageserver_start(self) -> 'subprocess.CompletedProcess[str]': - start_args = ['pageserver', 'start'] + def pageserver_start(self, overrides=()) -> 'subprocess.CompletedProcess[str]': + start_args = ['pageserver', 'start', *overrides] append_pageserver_param_overrides(start_args, self.env.pageserver.remote_storage, self.env.pageserver.config_override) @@ -862,53 +945,69 @@ class ZenithCli: log.info(f"Stopping pageserver with {cmd}") return self.raw_cli(cmd) - def safekeeper_start(self, name: str) -> 'subprocess.CompletedProcess[str]': - return self.raw_cli(['safekeeper', 'start', name]) + def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]': + return self.raw_cli(['safekeeper', 'start', str(id)]) def safekeeper_stop(self, - name: Optional[str] = None, + id: Optional[int] = None, immediate=False) -> 'subprocess.CompletedProcess[str]': args = ['safekeeper', 'stop'] + if id is not None: + args.extend(str(id)) if immediate: args.extend(['-m', 'immediate']) - if name is not None: - args.append(name) return self.raw_cli(args) def pg_create( self, - node_name: str, + branch_name: str, + node_name: Optional[str] = None, tenant_id: Optional[uuid.UUID] = None, - timeline_spec: Optional[str] = None, + lsn: Optional[str] = None, port: Optional[int] = None, ) -> 'subprocess.CompletedProcess[str]': - args = ['pg', 'create'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) + args = [ + 'pg', + 'create', + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + '--branch-name', + branch_name, + ] + if lsn is not None: + args.extend(['--lsn', lsn]) if port is not None: - args.append(f'--port={port}') - args.append(node_name) - if timeline_spec is not None: - args.append(timeline_spec) - return self.raw_cli(args) + args.extend(['--port', str(port)]) + if node_name is not None: + args.append(node_name) + + res = self.raw_cli(args) + res.check_returncode() + return res def pg_start( self, node_name: str, tenant_id: 
Optional[uuid.UUID] = None, - timeline_spec: Optional[str] = None, + lsn: Optional[str] = None, port: Optional[int] = None, ) -> 'subprocess.CompletedProcess[str]': - args = ['pg', 'start'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) + args = [ + 'pg', + 'start', + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + ] + if lsn is not None: + args.append(f'--lsn={lsn}') if port is not None: args.append(f'--port={port}') - args.append(node_name) - if timeline_spec is not None: - args.append(timeline_spec) + if node_name is not None: + args.append(node_name) - return self.raw_cli(args) + res = self.raw_cli(args) + res.check_returncode() + return res def pg_stop( self, @@ -916,12 +1015,16 @@ class ZenithCli: tenant_id: Optional[uuid.UUID] = None, destroy=False, ) -> 'subprocess.CompletedProcess[str]': - args = ['pg', 'stop'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) + args = [ + 'pg', + 'stop', + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + ] if destroy: args.append('--destroy') - args.append(node_name) + if node_name is not None: + args.append(node_name) return self.raw_cli(args) @@ -996,8 +1099,7 @@ class ZenithPageserver(PgProtocol): env: ZenithEnv, port: PageserverPort, remote_storage: Optional[RemoteStorage] = None, - config_override: Optional[str] = None, - enable_auth=False): + config_override: Optional[str] = None): super().__init__(host='localhost', port=port.pg, username='zenith_admin') self.env = env self.running = False @@ -1005,14 +1107,15 @@ class ZenithPageserver(PgProtocol): self.remote_storage = remote_storage self.config_override = config_override - def start(self) -> 'ZenithPageserver': + def start(self, overrides=()) -> 'ZenithPageserver': """ Start the page server. + `overrides` allows to add some config to this pageserver start. Returns self. """ assert self.running == False - self.env.zenith_cli.pageserver_start() + self.env.zenith_cli.pageserver_start(overrides=overrides) self.running = True return self @@ -1024,7 +1127,6 @@ class ZenithPageserver(PgProtocol): if self.running: self.env.zenith_cli.pageserver_stop(immediate) self.running = False - return self def __enter__(self): @@ -1085,7 +1187,7 @@ class PgBin: self.env = os.environ.copy() self.env['LD_LIBRARY_PATH'] = os.path.join(str(pg_distrib_dir), 'lib') - def _fixpath(self, command: List[str]) -> None: + def _fixpath(self, command: List[str]): if '/' not in command[0]: command[0] = os.path.join(self.pg_bin_path, command[0]) @@ -1096,7 +1198,7 @@ class PgBin: env.update(env_add) return env - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None) -> None: + def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): """ Run one of the postgres binaries. 
@@ -1146,18 +1248,18 @@ class VanillaPostgres(PgProtocol): self.running = False self.pg_bin.run_capture(['initdb', '-D', pgdatadir]) - def configure(self, options: List[str]) -> None: + def configure(self, options: List[str]): """Append lines into postgresql.conf file.""" assert not self.running with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file: conf_file.writelines(options) - def start(self) -> None: + def start(self): assert not self.running self.running = True self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'start']) - def stop(self) -> None: + def stop(self): assert self.running self.running = False self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'stop']) @@ -1240,8 +1342,9 @@ class Postgres(PgProtocol): def create( self, - node_name: str, - branch: Optional[str] = None, + branch_name: str, + node_name: Optional[str] = None, + lsn: Optional[str] = None, config_lines: Optional[List[str]] = None, ) -> 'Postgres': """ @@ -1252,19 +1355,21 @@ class Postgres(PgProtocol): if not config_lines: config_lines = [] - if branch is None: - branch = node_name - - self.env.zenith_cli.pg_create(node_name, + self.node_name = node_name or f'{branch_name}_pg_node' + self.env.zenith_cli.pg_create(branch_name, + node_name=self.node_name, tenant_id=self.tenant_id, - port=self.port, - timeline_spec=branch) - self.node_name = node_name + lsn=lsn, + port=self.port) path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name self.pgdata_dir = os.path.join(self.env.repo_dir, path) if config_lines is None: config_lines = [] + + # set small 'max_replication_write_lag' to enable backpressure + # and make tests more stable. + config_lines = ['max_replication_write_lag=15MB'] + config_lines self.config(config_lines) return self @@ -1351,7 +1456,7 @@ class Postgres(PgProtocol): if self.running: assert self.node_name is not None - self.env.zenith_cli.pg_stop(self.node_name, tenant_id=self.tenant_id) + self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id) self.running = False return self @@ -1363,15 +1468,16 @@ class Postgres(PgProtocol): """ assert self.node_name is not None - self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, destroy=True) + self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, True) self.node_name = None return self def create_start( self, - node_name: str, - branch: Optional[str] = None, + branch_name: str, + node_name: Optional[str] = None, + lsn: Optional[str] = None, config_lines: Optional[List[str]] = None, ) -> 'Postgres': """ @@ -1381,9 +1487,10 @@ class Postgres(PgProtocol): """ self.create( + branch_name=branch_name, node_name=node_name, - branch=branch, config_lines=config_lines, + lsn=lsn, ).start() return self @@ -1403,9 +1510,10 @@ class PostgresFactory: self.instances: List[Postgres] = [] def create_start(self, - node_name: str = "main", - branch: Optional[str] = None, + branch_name: str, + node_name: Optional[str] = None, tenant_id: Optional[uuid.UUID] = None, + lsn: Optional[str] = None, config_lines: Optional[List[str]] = None) -> Postgres: pg = Postgres( @@ -1417,15 +1525,17 @@ class PostgresFactory: self.instances.append(pg) return pg.create_start( + branch_name=branch_name, node_name=node_name, - branch=branch, config_lines=config_lines, + lsn=lsn, ) def create(self, - node_name: str = "main", - branch: Optional[str] = None, + branch_name: str, + node_name: Optional[str] = None, tenant_id: Optional[uuid.UUID] = None, + lsn: Optional[str] = None, config_lines: Optional[List[str]] = None) -> 
Postgres: pg = Postgres( @@ -1438,8 +1548,9 @@ class PostgresFactory: self.instances.append(pg) return pg.create( + branch_name=branch_name, node_name=node_name, - branch=branch, + lsn=lsn, config_lines=config_lines, ) @@ -1466,12 +1577,14 @@ class Safekeeper: """ An object representing a running safekeeper daemon. """ env: ZenithEnv port: SafekeeperPort - name: str # identifier for logging + id: int auth_token: Optional[str] = None + running: bool = False def start(self) -> 'Safekeeper': - self.env.zenith_cli.safekeeper_start(self.name) - + assert self.running == False + self.env.zenith_cli.safekeeper_start(self.id) + self.running = True # wait for wal acceptor start by checking its status started_at = time.time() while True: @@ -1489,8 +1602,9 @@ class Safekeeper: return self def stop(self, immediate=False) -> 'Safekeeper': - log.info('Stopping safekeeper {}'.format(self.name)) - self.env.zenith_cli.safekeeper_stop(self.name, immediate) + log.info('Stopping safekeeper {}'.format(self.id)) + self.env.zenith_cli.safekeeper_stop(self.id, immediate) + self.running = False return self def append_logical_message(self, @@ -1539,7 +1653,7 @@ class SafekeeperMetrics: class SafekeeperHttpClient(requests.Session): - def __init__(self, port: int) -> None: + def __init__(self, port: int): super().__init__() self.port = port @@ -1657,7 +1771,7 @@ def list_files_to_compare(pgdata_dir: str): # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Postgres): - # Get the timeline ID of our branch. We need it for the 'basebackup' command + # Get the timeline ID. We need it for the 'basebackup' command with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute("SHOW zenith.zenith_timeline") diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index 6fd77f3020..fbef131ffd 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -23,28 +23,23 @@ def test_bulk_tenant_create( """Measure tenant creation time (with and without wal acceptors)""" if use_wal_acceptors == 'with_wa': zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() time_slices = [] for i in range(tenants_count): start = timeit.default_timer() - tenant = env.create_tenant() - env.zenith_cli.create_branch( - f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", - "main", - tenant_id=tenant) + tenant = env.zenith_cli.create_tenant() + env.zenith_cli.create_timeline( + f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant_id=tenant) # FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now? 
#if use_wal_acceptors == 'with_wa': # wa_factory.start_n_new(3) pg_tenant = env.postgres.create_start( - f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", - None, # branch name, None means same as node name - tenant, - ) + f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant_id=tenant) end = timeit.default_timer() time_slices.append(end - start) diff --git a/vendor/postgres b/vendor/postgres index 31dc24ab29..093aa160e5 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 31dc24ab29e6bdd5cfb85920a9c728f759c01b29 +Subproject commit 093aa160e5df19814ff19b995d36dd5ee03c7f8b diff --git a/walkeeper/src/bin/safekeeper.rs b/walkeeper/src/bin/safekeeper.rs index ea5d0cba14..6c45115e5f 100644 --- a/walkeeper/src/bin/safekeeper.rs +++ b/walkeeper/src/bin/safekeeper.rs @@ -1,17 +1,19 @@ // // Main entry point for the safekeeper executable // -use anyhow::{Context, Result}; +use anyhow::{bail, Context, Result}; use clap::{App, Arg}; use const_format::formatcp; use daemonize::Daemonize; use fs2::FileExt; -use std::fs::File; +use std::fs::{self, File}; +use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; use std::thread; use tracing::*; -use walkeeper::control_file::{self, CreateControlFile}; +use walkeeper::control_file::{self}; use zenith_utils::http::endpoint; +use zenith_utils::zid::ZNodeId; use zenith_utils::{logging, tcp_listener, GIT_VERSION}; use tokio::sync::mpsc; @@ -25,6 +27,7 @@ use zenith_utils::shutdown::exit_now; use zenith_utils::signals; const LOCK_FILE_NAME: &str = "safekeeper.lock"; +const ID_FILE_NAME: &str = "safekeeper.id"; fn main() -> Result<()> { zenith_metrics::set_common_metrics_prefix("safekeeper"); @@ -38,6 +41,12 @@ fn main() -> Result<()> { .takes_value(true) .help("Path to the safekeeper data directory"), ) + .arg( + Arg::new("init") + .long("init") + .takes_value(false) + .help("Initialize safekeeper with ID"), + ) .arg( Arg::new("listen-pg") .short('l') @@ -93,13 +102,13 @@ fn main() -> Result<()> { .takes_value(true) .help("Dump control file at path specifed by this argument and exit"), ) + .arg( + Arg::new("id").long("id").takes_value(true).help("safekeeper node id: integer") + ) .get_matches(); if let Some(addr) = arg_matches.value_of("dump-control-file") { - let state = control_file::FileStorage::load_control_file( - Path::new(addr), - CreateControlFile::False, - )?; + let state = control_file::FileStorage::load_control_file(Path::new(addr))?; let json = serde_json::to_string(&state)?; print!("{}", json); return Ok(()); @@ -136,10 +145,19 @@ fn main() -> Result<()> { conf.recall_period = humantime::parse_duration(recall)?; } - start_safekeeper(conf) + let mut given_id = None; + if let Some(given_id_str) = arg_matches.value_of("id") { + given_id = Some(ZNodeId( + given_id_str + .parse() + .context("failed to parse safekeeper id")?, + )); + } + + start_safekeeper(conf, given_id, arg_matches.is_present("init")) } -fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { +fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { let log_file = logging::init("safekeeper.log", conf.daemonize)?; info!("version: {}", GIT_VERSION); @@ -154,6 +172,12 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { ) })?; + // Set or read our ID. 
+ set_id(&mut conf, given_id)?; + if init { + return Ok(()); + } + let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| { error!("failed to bind to address {}: {}", conf.listen_http_addr, e); e @@ -260,3 +284,49 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { std::process::exit(111); }) } + +/// Determine safekeeper id and set it in config. +fn set_id(conf: &mut SafeKeeperConf, given_id: Option<ZNodeId>) -> Result<()> { + let id_file_path = conf.workdir.join(ID_FILE_NAME); + + let my_id: ZNodeId; + // If ID exists, read it in; otherwise set one passed + match fs::read(&id_file_path) { + Ok(id_serialized) => { + my_id = ZNodeId( + std::str::from_utf8(&id_serialized) + .context("failed to parse safekeeper id")? + .parse() + .context("failed to parse safekeeper id")?, + ); + if let Some(given_id) = given_id { + if given_id != my_id { + bail!( + "safekeeper already initialized with id {}, can't set {}", + my_id, + given_id + ); + } + } + info!("safekeeper ID {}", my_id); + } + Err(error) => match error.kind() { + ErrorKind::NotFound => { + my_id = if let Some(given_id) = given_id { + given_id + } else { + bail!("safekeeper id is not specified"); + }; + let mut f = File::create(&id_file_path)?; + f.write_all(my_id.to_string().as_bytes())?; + f.sync_all()?; + info!("initialized safekeeper ID {}", my_id); + } + _ => { + return Err(error.into()); + } + }, + } + conf.my_id = my_id; + Ok(()) +} diff --git a/walkeeper/src/control_file.rs b/walkeeper/src/control_file.rs index 6016e00d1d..8b4e618661 100644 --- a/walkeeper/src/control_file.rs +++ b/walkeeper/src/control_file.rs @@ -27,13 +27,6 @@ const CONTROL_FILE_NAME: &str = "safekeeper.control"; const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>(); -// A named boolean. -#[derive(Debug)] -pub enum CreateControlFile { - True, - False, -} - lazy_static! { static ref PERSIST_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!( "safekeeper_persist_control_file_seconds", @@ -94,28 +87,22 @@ impl FileStorage { pub fn load_control_file_conf( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, - create: CreateControlFile, ) -> Result<SafeKeeperState> { let path = conf.timeline_dir(zttid).join(CONTROL_FILE_NAME); - Self::load_control_file(path, create) + Self::load_control_file(path) } /// Read in the control file. /// If create=false and file doesn't exist, bails out. - pub fn load_control_file<P: AsRef<Path>>( - control_file_path: P, - create: CreateControlFile, - ) -> Result<SafeKeeperState> { + pub fn load_control_file<P: AsRef<Path>>(control_file_path: P) -> Result<SafeKeeperState> { info!( - "loading control file {}, create={:?}", + "loading control file {}", control_file_path.as_ref().display(), - create, ); let mut control_file = OpenOptions::new() .read(true) .write(true) - .create(matches!(create, CreateControlFile::True)) .open(&control_file_path) .with_context(|| { format!( @@ -124,41 +111,32 @@ impl FileStorage { ) })?; - // Empty file is legit on 'create', don't try to deser from it.
- let state = if control_file.metadata().unwrap().len() == 0 { - if let CreateControlFile::False = create { - bail!("control file is empty"); - } - SafeKeeperState::new() - } else { - let mut buf = Vec::new(); - control_file - .read_to_end(&mut buf) - .context("failed to read control file")?; + let mut buf = Vec::new(); + control_file + .read_to_end(&mut buf) + .context("failed to read control file")?; - let calculated_checksum = crc32c::crc32c(&buf[..buf.len() - CHECKSUM_SIZE]); + let calculated_checksum = crc32c::crc32c(&buf[..buf.len() - CHECKSUM_SIZE]); - let expected_checksum_bytes: &[u8; CHECKSUM_SIZE] = - buf[buf.len() - CHECKSUM_SIZE..].try_into()?; - let expected_checksum = u32::from_le_bytes(*expected_checksum_bytes); + let expected_checksum_bytes: &[u8; CHECKSUM_SIZE] = + buf[buf.len() - CHECKSUM_SIZE..].try_into()?; + let expected_checksum = u32::from_le_bytes(*expected_checksum_bytes); - ensure!( - calculated_checksum == expected_checksum, + ensure!( + calculated_checksum == expected_checksum, + format!( + "safekeeper control file checksum mismatch: expected {} got {}", + expected_checksum, calculated_checksum + ) + ); + + let state = FileStorage::deser_sk_state(&mut &buf[..buf.len() - CHECKSUM_SIZE]) + .with_context(|| { format!( - "safekeeper control file checksum mismatch: expected {} got {}", - expected_checksum, calculated_checksum + "while reading control file {}", + control_file_path.as_ref().display(), ) - ); - - FileStorage::deser_sk_state(&mut &buf[..buf.len() - CHECKSUM_SIZE]).with_context( - || { - format!( - "while reading control file {}", - control_file_path.as_ref().display(), - ) - }, - )? - }; + })?; Ok(state) } } @@ -247,31 +225,38 @@ mod test { fn load_from_control_file( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, - create: CreateControlFile, ) -> Result<(FileStorage, SafeKeeperState)> { fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); Ok(( FileStorage::new(zttid, conf), - FileStorage::load_control_file_conf(conf, zttid, create)?, + FileStorage::load_control_file_conf(conf, zttid)?, )) } + fn create( + conf: &SafeKeeperConf, + zttid: &ZTenantTimelineId, + ) -> Result<(FileStorage, SafeKeeperState)> { + fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); + let state = SafeKeeperState::empty(); + let mut storage = FileStorage::new(zttid, conf); + storage.persist(&state)?; + Ok((storage, state)) + } + #[test] fn test_read_write_safekeeper_state() { let conf = stub_conf(); let zttid = ZTenantTimelineId::generate(); { - let (mut storage, mut state) = - load_from_control_file(&conf, &zttid, CreateControlFile::True) - .expect("failed to read state"); + let (mut storage, mut state) = create(&conf, &zttid).expect("failed to create state"); // change something - state.wal_start_lsn = Lsn(42); + state.commit_lsn = Lsn(42); storage.persist(&state).expect("failed to persist state"); } - let (_, state) = load_from_control_file(&conf, &zttid, CreateControlFile::False) - .expect("failed to read state"); - assert_eq!(state.wal_start_lsn, Lsn(42)); + let (_, state) = load_from_control_file(&conf, &zttid).expect("failed to read state"); + assert_eq!(state.commit_lsn, Lsn(42)); } #[test] @@ -279,11 +264,10 @@ mod test { let conf = stub_conf(); let zttid = ZTenantTimelineId::generate(); { - let (mut storage, mut state) = - load_from_control_file(&conf, &zttid, CreateControlFile::True) - .expect("failed to read state"); + let (mut storage, mut state) = create(&conf, &zttid).expect("failed to read 
state"); + // change something - state.wal_start_lsn = Lsn(42); + state.commit_lsn = Lsn(42); storage.persist(&state).expect("failed to persist state"); } let control_path = conf.timeline_dir(&zttid).join(CONTROL_FILE_NAME); @@ -291,7 +275,7 @@ mod test { data[0] += 1; // change the first byte of the file to fail checksum validation fs::write(&control_path, &data).expect("failed to write control file"); - match load_from_control_file(&conf, &zttid, CreateControlFile::False) { + match load_from_control_file(&conf, &zttid) { Err(err) => assert!(err .to_string() .contains("safekeeper control file checksum mismatch")), diff --git a/walkeeper/src/control_file_upgrade.rs b/walkeeper/src/control_file_upgrade.rs index 913bd02c1e..9effe42f8d 100644 --- a/walkeeper/src/control_file_upgrade.rs +++ b/walkeeper/src/control_file_upgrade.rs @@ -1,6 +1,6 @@ //! Code to deal with safekeeper control file upgrades use crate::safekeeper::{ - AcceptorState, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry, + AcceptorState, Peers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry, }; use anyhow::{bail, Result}; use serde::{Deserialize, Serialize}; @@ -26,7 +26,7 @@ struct SafeKeeperStateV1 { /// persistent acceptor state acceptor_state: AcceptorStateV1, /// information about server - server: ServerInfo, + server: ServerInfoV2, /// Unique id of the last *elected* proposer we dealed with. Not needed /// for correctness, exists for monitoring purposes. proposer_uuid: PgUuid, @@ -70,6 +70,39 @@ pub struct SafeKeeperStateV2 { pub wal_start_lsn: Lsn, } +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct ServerInfoV3 { + /// Postgres server version + pub pg_version: u32, + pub system_id: SystemId, + #[serde(with = "hex")] + pub tenant_id: ZTenantId, + /// Zenith timelineid + #[serde(with = "hex")] + pub timeline_id: ZTimelineId, + pub wal_seg_size: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SafeKeeperStateV3 { + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfoV3, + /// Unique id of the last *elected* proposer we dealed with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// part of WAL acknowledged by quorum and available locally + pub commit_lsn: Lsn, + /// minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone) + pub truncate_lsn: Lsn, + // Safekeeper starts receiving WAL from this LSN, zeros before it ought to + // be skipped during decoding. 
+ pub wal_start_lsn: Lsn, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -83,12 +116,20 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result }]), }; return Ok(SafeKeeperState { + tenant_id: oldstate.server.tenant_id, + timeline_id: oldstate.server.ztli, acceptor_state: ac, - server: oldstate.server.clone(), + server: ServerInfo { + pg_version: oldstate.server.pg_version, + system_id: oldstate.server.system_id, + wal_seg_size: oldstate.server.wal_seg_size, + }, proposer_uuid: oldstate.proposer_uuid, commit_lsn: oldstate.commit_lsn, - truncate_lsn: oldstate.truncate_lsn, - wal_start_lsn: oldstate.wal_start_lsn, + s3_wal_lsn: Lsn(0), + peer_horizon_lsn: oldstate.truncate_lsn, + remote_consistent_lsn: Lsn(0), + peers: Peers(vec![]), }); // migrate to hexing some zids } else if version == 2 { @@ -97,17 +138,40 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result let server = ServerInfo { pg_version: oldstate.server.pg_version, system_id: oldstate.server.system_id, - tenant_id: oldstate.server.tenant_id, - timeline_id: oldstate.server.ztli, wal_seg_size: oldstate.server.wal_seg_size, }; return Ok(SafeKeeperState { + tenant_id: oldstate.server.tenant_id, + timeline_id: oldstate.server.ztli, acceptor_state: oldstate.acceptor_state, server, proposer_uuid: oldstate.proposer_uuid, commit_lsn: oldstate.commit_lsn, - truncate_lsn: oldstate.truncate_lsn, - wal_start_lsn: oldstate.wal_start_lsn, + s3_wal_lsn: Lsn(0), + peer_horizon_lsn: oldstate.truncate_lsn, + remote_consistent_lsn: Lsn(0), + peers: Peers(vec![]), + }); + // migrate to moving ztenantid/ztli to the top and adding some lsns + } else if version == 3 { + info!("reading safekeeper control file version {}", version); + let oldstate = SafeKeeperStateV3::des(&buf[..buf.len()])?; + let server = ServerInfo { + pg_version: oldstate.server.pg_version, + system_id: oldstate.server.system_id, + wal_seg_size: oldstate.server.wal_seg_size, + }; + return Ok(SafeKeeperState { + tenant_id: oldstate.server.tenant_id, + timeline_id: oldstate.server.timeline_id, + acceptor_state: oldstate.acceptor_state, + server, + proposer_uuid: oldstate.proposer_uuid, + commit_lsn: oldstate.commit_lsn, + s3_wal_lsn: Lsn(0), + peer_horizon_lsn: oldstate.truncate_lsn, + remote_consistent_lsn: Lsn(0), + peers: Peers(vec![]), }); } bail!("unsupported safekeeper control file version {}", version) diff --git a/walkeeper/src/handler.rs b/walkeeper/src/handler.rs index d1ead5cb37..ead6fab9fb 100644 --- a/walkeeper/src/handler.rs +++ b/walkeeper/src/handler.rs @@ -13,6 +13,7 @@ use postgres_ffi::xlog_utils::PG_TLI; use regex::Regex; use std::str::FromStr; use std::sync::Arc; +use tracing::info; use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend; use zenith_utils::postgres_backend::PostgresBackend; @@ -20,7 +21,6 @@ use zenith_utils::pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; use crate::callmemaybe::CallmeEvent; -use crate::control_file::CreateControlFile; use tokio::sync::mpsc::UnboundedSender; /// Safekeeper handler of postgres commands @@ -101,29 +101,19 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> { let cmd = parse_cmd(query_string)?; - // Is this command is ztimeline scoped? - match cmd { - SafekeeperPostgresCommand::StartWalPush { .. 
} - | SafekeeperPostgresCommand::StartReplication { .. } - | SafekeeperPostgresCommand::IdentifySystem - | SafekeeperPostgresCommand::JSONCtrl { .. } => { - let tenantid = self.ztenantid.context("tenantid is required")?; - let timelineid = self.ztimelineid.context("timelineid is required")?; - if self.timeline.is_none() { - // START_WAL_PUSH is the only command that initializes the timeline in production. - // There is also JSON_CTRL command, which should initialize the timeline for testing. - let create_control_file = match cmd { - SafekeeperPostgresCommand::StartWalPush { .. } - | SafekeeperPostgresCommand::JSONCtrl { .. } => CreateControlFile::True, - _ => CreateControlFile::False, - }; - self.timeline.set( - &self.conf, - ZTenantTimelineId::new(tenantid, timelineid), - create_control_file, - )?; - } - } + info!("got query {:?}", query_string); + + let create = !(matches!(cmd, SafekeeperPostgresCommand::StartReplication { .. }) + || matches!(cmd, SafekeeperPostgresCommand::IdentifySystem)); + + let tenantid = self.ztenantid.context("tenantid is required")?; + let timelineid = self.ztimelineid.context("timelineid is required")?; + if self.timeline.is_none() { + self.timeline.set( + &self.conf, + ZTenantTimelineId::new(tenantid, timelineid), + create, + )?; } match cmd { diff --git a/walkeeper/src/http/mod.rs b/walkeeper/src/http/mod.rs index c82d1c0362..4c0be17ecd 100644 --- a/walkeeper/src/http/mod.rs +++ b/walkeeper/src/http/mod.rs @@ -1,2 +1,3 @@ +pub mod models; pub mod routes; pub use routes::make_router; diff --git a/walkeeper/src/http/models.rs b/walkeeper/src/http/models.rs new file mode 100644 index 0000000000..8a6ed7a812 --- /dev/null +++ b/walkeeper/src/http/models.rs @@ -0,0 +1,9 @@ +use serde::{Deserialize, Serialize}; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; + +#[derive(Serialize, Deserialize)] +pub struct TimelineCreateRequest { + pub tenant_id: ZTenantId, + pub timeline_id: ZTimelineId, + pub peer_ids: Vec<ZNodeId>, +} diff --git a/walkeeper/src/http/routes.rs b/walkeeper/src/http/routes.rs index 11a29ac6d3..74f7f4a735 100644 --- a/walkeeper/src/http/routes.rs +++ b/walkeeper/src/http/routes.rs @@ -1,13 +1,15 @@ use hyper::{Body, Request, Response, StatusCode}; + use serde::Serialize; use serde::Serializer; use std::fmt::Display; use std::sync::Arc; +use zenith_utils::http::json::json_request; use zenith_utils::http::{RequestExt, RouterBuilder}; use zenith_utils::lsn::Lsn; +use zenith_utils::zid::ZNodeId; use zenith_utils::zid::ZTenantTimelineId; -use crate::control_file::CreateControlFile; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; use crate::timeline::GlobalTimelines; @@ -18,9 +20,18 @@ use zenith_utils::http::json::json_response; use zenith_utils::http::request::parse_request_param; use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use super::models::TimelineCreateRequest; + +#[derive(Debug, Serialize)] +struct SafekeeperStatus { + id: ZNodeId, +} + /// Healthcheck handler. -async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> { - Ok(json_response(StatusCode::OK, "")?) +async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> { + let conf = get_conf(&request); + let status = SafekeeperStatus { id: conf.my_id }; + Ok(json_response(StatusCode::OK, status)?)
} fn get_conf(request: &Request) -> &SafeKeeperConf { @@ -58,7 +69,11 @@ struct TimelineStatus { #[serde(serialize_with = "display_serialize")] commit_lsn: Lsn, #[serde(serialize_with = "display_serialize")] - truncate_lsn: Lsn, + s3_wal_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + peer_horizon_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + remote_consistent_lsn: Lsn, #[serde(serialize_with = "display_serialize")] flush_lsn: Lsn, } @@ -70,8 +85,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { + let request_data: TimelineCreateRequest = json_request(&mut request).await?; + + let zttid = ZTenantTimelineId { + tenant_id: request_data.tenant_id, + timeline_id: request_data.timeline_id, + }; + GlobalTimelines::create(get_conf(&request), zttid, request_data.peer_ids) + .map_err(ApiError::from_err)?; + + Ok(json_response(StatusCode::CREATED, ())?) +} + /// Safekeeper http router. pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder { let router = endpoint::make_router(); @@ -102,4 +131,5 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/timeline/:tenant_id/:timeline_id", timeline_status_handler, ) + .post("/v1/timeline", timeline_create_handler) } diff --git a/walkeeper/src/lib.rs b/walkeeper/src/lib.rs index 6c3e0b264e..dfd71e4de2 100644 --- a/walkeeper/src/lib.rs +++ b/walkeeper/src/lib.rs @@ -2,7 +2,7 @@ use std::path::PathBuf; use std::time::Duration; -use zenith_utils::zid::ZTenantTimelineId; +use zenith_utils::zid::{ZNodeId, ZTenantTimelineId}; pub mod callmemaybe; pub mod control_file; @@ -46,6 +46,7 @@ pub struct SafeKeeperConf { pub listen_http_addr: String, pub ttl: Option, pub recall_period: Duration, + pub my_id: ZNodeId, } impl SafeKeeperConf { @@ -69,6 +70,7 @@ impl Default for SafeKeeperConf { listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), ttl: None, recall_period: defaults::DEFAULT_RECALL_PERIOD, + my_id: ZNodeId(0), } } } diff --git a/walkeeper/src/safekeeper.rs b/walkeeper/src/safekeeper.rs index fa624bb18f..53fd6f5588 100644 --- a/walkeeper/src/safekeeper.rs +++ b/walkeeper/src/safekeeper.rs @@ -10,6 +10,8 @@ use std::cmp::min; use std::fmt; use std::io::Read; use tracing::*; +use zenith_utils::zid::ZNodeId; +use zenith_utils::zid::ZTenantTimelineId; use lazy_static::lazy_static; @@ -25,12 +27,13 @@ use zenith_utils::pq_proto::ZenithFeedback; use zenith_utils::zid::{ZTenantId, ZTimelineId}; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 3; +pub const SK_FORMAT_VERSION: u32 = 4; const SK_PROTOCOL_VERSION: u32 = 1; const UNKNOWN_SERVER_VERSION: u32 = 0; /// Consensus logical timestamp. pub type Term = u64; +const INVALID_TERM: Term = 0; #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct TermSwitchEntry { @@ -128,18 +131,47 @@ pub struct ServerInfo { /// Postgres server version pub pg_version: u32, pub system_id: SystemId, - #[serde(with = "hex")] - pub tenant_id: ZTenantId, - /// Zenith timelineid - #[serde(with = "hex")] - pub timeline_id: ZTimelineId, pub wal_seg_size: u32, } +/// Data published by safekeeper to the peers +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PeerInfo { + /// LSN up to which safekeeper offloaded WAL to s3. + s3_wal_lsn: Lsn, + /// Term of the last entry. + term: Term, + /// LSN of the last record. + flush_lsn: Lsn, + /// Up to which LSN safekeeper regards its WAL as committed. 
+ commit_lsn: Lsn, +} + +impl PeerInfo { + fn new() -> Self { + Self { + s3_wal_lsn: Lsn(0), + term: INVALID_TERM, + flush_lsn: Lsn(0), + commit_lsn: Lsn(0), + } + } +} + +// vector-based node id -> peer state map with very limited functionality we +// need/ +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Peers(pub Vec<(ZNodeId, PeerInfo)>); + /// Persistent information stored on safekeeper node /// On disk data is prefixed by magic and format version and followed by checksum. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SafeKeeperState { + #[serde(with = "hex")] + pub tenant_id: ZTenantId, + /// Zenith timelineid + #[serde(with = "hex")] + pub timeline_id: ZTimelineId, /// persistent acceptor state pub acceptor_state: AcceptorState, /// information about server @@ -148,19 +180,33 @@ pub struct SafeKeeperState { /// for correctness, exists for monitoring purposes. #[serde(with = "hex")] pub proposer_uuid: PgUuid, - /// part of WAL acknowledged by quorum and available locally + /// Part of WAL acknowledged by quorum and available locally. Always points + /// to record boundary. pub commit_lsn: Lsn, - /// minimal LSN which may be needed for recovery of some safekeeper (end_lsn - /// of last record streamed to everyone) - pub truncate_lsn: Lsn, - // Safekeeper starts receiving WAL from this LSN, zeros before it ought to - // be skipped during decoding. - pub wal_start_lsn: Lsn, + /// First LSN not yet offloaded to s3. Useful to persist to avoid finding + /// out offloading progress on boot. + pub s3_wal_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver. + pub remote_consistent_lsn: Lsn, + // Peers and their state as we remember it. Knowing peers themselves is + // fundamental; but state is saved here only for informational purposes and + // obviously can be stale. (Currently not saved at all, but let's provision + // place to have less file version upgrades). 
+ pub peers: Peers, } impl SafeKeeperState { - pub fn new() -> SafeKeeperState { + pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { SafeKeeperState { + tenant_id: zttid.tenant_id, + timeline_id: zttid.timeline_id, acceptor_state: AcceptorState { term: 0, term_history: TermHistory::empty(), @@ -168,21 +214,20 @@ impl SafeKeeperState { server: ServerInfo { pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ system_id: 0, /* Postgres system identifier */ - tenant_id: ZTenantId::from([0u8; 16]), - timeline_id: ZTimelineId::from([0u8; 16]), wal_seg_size: 0, }, proposer_uuid: [0; 16], - commit_lsn: Lsn(0), /* part of WAL acknowledged by quorum */ - truncate_lsn: Lsn(0), /* minimal LSN which may be needed for recovery of some safekeeper */ - wal_start_lsn: Lsn(0), + commit_lsn: Lsn(0), + s3_wal_lsn: Lsn(0), + peer_horizon_lsn: Lsn(0), + remote_consistent_lsn: Lsn(0), + peers: Peers(peers.iter().map(|p| (*p, PeerInfo::new())).collect()), } } -} -impl Default for SafeKeeperState { - fn default() -> Self { - Self::new() + #[cfg(test)] + pub fn empty() -> Self { + SafeKeeperState::new(&ZTenantTimelineId::empty(), vec![]) } } @@ -421,6 +466,7 @@ lazy_static! { struct SafeKeeperMetrics { commit_lsn: Gauge, + // WAL-related metrics are in WalStorageMetrics } impl SafeKeeperMetrics { @@ -443,7 +489,7 @@ pub struct SafeKeeper { /// not-yet-flushed pairs of same named fields in s.* pub commit_lsn: Lsn, - pub truncate_lsn: Lsn, + pub peer_horizon_lsn: Lsn, pub s: SafeKeeperState, // persistent part pub control_store: CTRL, @@ -462,16 +508,14 @@ where wal_store: WAL, state: SafeKeeperState, ) -> SafeKeeper { - if state.server.timeline_id != ZTimelineId::from([0u8; 16]) - && ztli != state.server.timeline_id - { - panic!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.server.timeline_id); + if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id { + panic!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id); } SafeKeeper { - metrics: SafeKeeperMetrics::new(state.server.tenant_id, ztli, state.commit_lsn), + metrics: SafeKeeperMetrics::new(state.tenant_id, ztli, state.commit_lsn), commit_lsn: state.commit_lsn, - truncate_lsn: state.truncate_lsn, + peer_horizon_lsn: state.peer_horizon_lsn, s: state, control_store, wal_store, @@ -532,12 +576,24 @@ where msg.pg_version, self.s.server.pg_version ); } + if msg.tenant_id != self.s.tenant_id { + bail!( + "invalid tenant ID, got {}, expected {}", + msg.tenant_id, + self.s.tenant_id + ); + } + if msg.ztli != self.s.timeline_id { + bail!( + "invalid timeline ID, got {}, expected {}", + msg.ztli, + self.s.timeline_id + ); + } // set basic info about server, if not yet // TODO: verify that is doesn't change after self.s.server.system_id = msg.system_id; - self.s.server.tenant_id = msg.tenant_id; - self.s.server.timeline_id = msg.ztli; self.s.server.wal_seg_size = msg.wal_seg_size; self.control_store .persist(&self.s) @@ -568,7 +624,7 @@ where term: self.s.acceptor_state.term, vote_given: false as u64, flush_lsn: self.wal_store.flush_lsn(), - truncate_lsn: self.s.truncate_lsn, + truncate_lsn: self.s.peer_horizon_lsn, term_history: self.get_term_history(), }; if self.s.acceptor_state.term < msg.term { @@ -593,14 +649,16 @@ where /// Form AppendResponse from current state. 
fn append_response(&self) -> AppendResponse { - AppendResponse { + let ar = AppendResponse { term: self.s.acceptor_state.term, flush_lsn: self.wal_store.flush_lsn(), commit_lsn: self.s.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), zenith_feedback: ZenithFeedback::empty(), - } + }; + trace!("formed AppendResponse {:?}", ar); + ar } fn handle_elected(&mut self, msg: &ProposerElected) -> Result> { @@ -655,10 +713,11 @@ where if !msg.wal_data.is_empty() { self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?; - // If this was the first record we ever receieved, remember LSN to help - // find_end_of_wal skip the hole in the beginning. - if self.s.wal_start_lsn == Lsn(0) { - self.s.wal_start_lsn = msg.h.begin_lsn; + // If this was the first record we ever receieved, initialize + // commit_lsn to help find_end_of_wal skip the hole in the + // beginning. + if self.s.commit_lsn == Lsn(0) { + self.s.commit_lsn = msg.h.begin_lsn; sync_control_file = true; require_flush = true; } @@ -685,35 +744,36 @@ where .set(u64::from(self.commit_lsn) as f64); } - self.truncate_lsn = msg.h.truncate_lsn; + self.peer_horizon_lsn = msg.h.truncate_lsn; // Update truncate and commit LSN in control file. // To avoid negative impact on performance of extra fsync, do it only // when truncate_lsn delta exceeds WAL segment size. sync_control_file |= - self.s.truncate_lsn + (self.s.server.wal_seg_size as u64) < self.truncate_lsn; + self.s.peer_horizon_lsn + (self.s.server.wal_seg_size as u64) < self.peer_horizon_lsn; if sync_control_file { self.s.commit_lsn = self.commit_lsn; - self.s.truncate_lsn = self.truncate_lsn; + self.s.peer_horizon_lsn = self.peer_horizon_lsn; } if sync_control_file { self.control_store.persist(&self.s)?; } + trace!( + "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}", + msg.wal_data.len(), + msg.h.end_lsn, + msg.h.commit_lsn, + msg.h.truncate_lsn, + require_flush, + ); + // If flush_lsn hasn't updated, AppendResponse is not very useful. 
if !require_flush { return Ok(None); } let resp = self.append_response(); - trace!( - "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, resp {:?}", - msg.wal_data.len(), - msg.h.end_lsn, - msg.h.commit_lsn, - msg.h.truncate_lsn, - &resp, - ); Ok(Some(AcceptorProposerMessage::AppendResponse(resp))) } @@ -774,11 +834,11 @@ mod tests { #[test] fn test_voting() { let storage = InMemoryState { - persisted_state: SafeKeeperState::new(), + persisted_state: SafeKeeperState::empty(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::new()); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::empty()); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -806,11 +866,11 @@ mod tests { #[test] fn test_epoch_switch() { let storage = InMemoryState { - persisted_state: SafeKeeperState::new(), + persisted_state: SafeKeeperState::empty(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::new()); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::empty()); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/walkeeper/src/timeline.rs b/walkeeper/src/timeline.rs index c639e81b79..ea8308b95e 100644 --- a/walkeeper/src/timeline.rs +++ b/walkeeper/src/timeline.rs @@ -1,7 +1,7 @@ //! This module contains timeline id -> safekeeper state map with file-backed //! persistence and support for interaction between sending and receiving wal. -use anyhow::{Context, Result}; +use anyhow::{bail, Context, Result}; use lazy_static::lazy_static; @@ -9,22 +9,24 @@ use std::cmp::{max, min}; use std::collections::HashMap; use std::fs::{self}; -use std::sync::{Arc, Condvar, Mutex}; +use std::sync::{Arc, Condvar, Mutex, MutexGuard}; use std::time::Duration; use tokio::sync::mpsc::UnboundedSender; use tracing::*; use zenith_utils::lsn::Lsn; -use zenith_utils::zid::ZTenantTimelineId; +use zenith_utils::zid::{ZNodeId, ZTenantTimelineId}; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; -use crate::control_file::{self, CreateControlFile}; +use crate::control_file; +use crate::control_file::Storage as cf_storage; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, }; use crate::send_wal::HotStandbyFeedback; -use crate::wal_storage::{self, Storage}; +use crate::wal_storage; +use crate::wal_storage::Storage as wal_storage_iface; use crate::SafeKeeperConf; use zenith_utils::pq_proto::ZenithFeedback; @@ -87,21 +89,39 @@ struct SharedState { } impl SharedState { - /// Restore SharedState from control file. - /// If create=false and file doesn't exist, bails out. 
- fn create_restore( + /// Initialize timeline state, creating control file + fn create( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, - create: CreateControlFile, + peer_ids: Vec, ) -> Result { - let state = control_file::FileStorage::load_control_file_conf(conf, zttid, create) + let state = SafeKeeperState::new(zttid, peer_ids); + let control_store = control_file::FileStorage::new(zttid, conf); + let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); + let mut sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, state); + sk.control_store.persist(&sk.s)?; + + Ok(Self { + notified_commit_lsn: Lsn(0), + sk, + replicas: Vec::new(), + active: false, + num_computes: 0, + pageserver_connstr: None, + }) + } + + /// Restore SharedState from control file. + /// If file doesn't exist, bails out. + fn restore(conf: &SafeKeeperConf, zttid: &ZTenantTimelineId) -> Result { + let state = control_file::FileStorage::load_control_file_conf(conf, zttid) .context("failed to load from control file")?; let control_store = control_file::FileStorage::new(zttid, conf); let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); - info!("timeline {} created or restored", zttid.timeline_id); + info!("timeline {} restored", zttid.timeline_id); Ok(Self { notified_commit_lsn: Lsn(0), @@ -418,26 +438,13 @@ impl Timeline { // Utilities needed by various Connection-like objects pub trait TimelineTools { - fn set( - &mut self, - conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, - create: CreateControlFile, - ) -> Result<()>; + fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()>; fn get(&self) -> &Arc; } impl TimelineTools for Option> { - fn set( - &mut self, - conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, - create: CreateControlFile, - ) -> Result<()> { - // We will only set the timeline once. If it were to ever change, - // anyone who cloned the Arc would be out of date. - assert!(self.is_none()); + fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()> { *self = Some(GlobalTimelines::get(conf, zttid, create)?); Ok(()) } @@ -456,30 +463,73 @@ lazy_static! { pub struct GlobalTimelines; impl GlobalTimelines { + fn create_internal( + mut timelines: MutexGuard>>, + conf: &SafeKeeperConf, + zttid: ZTenantTimelineId, + peer_ids: Vec, + ) -> Result> { + match timelines.get(&zttid) { + Some(_) => bail!("timeline {} already exists", zttid), + None => { + // TODO: check directory existence + let dir = conf.timeline_dir(&zttid); + fs::create_dir_all(dir)?; + let shared_state = SharedState::create(conf, &zttid, peer_ids) + .context("failed to create shared state")?; + + let new_tli = Arc::new(Timeline::new(zttid, shared_state)); + timelines.insert(zttid, Arc::clone(&new_tli)); + Ok(new_tli) + } + } + } + + pub fn create( + conf: &SafeKeeperConf, + zttid: ZTenantTimelineId, + peer_ids: Vec, + ) -> Result> { + let timelines = TIMELINES.lock().unwrap(); + GlobalTimelines::create_internal(timelines, conf, zttid, peer_ids) + } + /// Get a timeline with control file loaded from the global TIMELINES map. - /// If control file doesn't exist and create=false, bails out. + /// If control file doesn't exist, bails out. 
pub fn get( conf: &SafeKeeperConf, zttid: ZTenantTimelineId, - create: CreateControlFile, + create: bool, ) -> Result> { let mut timelines = TIMELINES.lock().unwrap(); match timelines.get(&zttid) { Some(result) => Ok(Arc::clone(result)), None => { - if let CreateControlFile::True = create { - let dir = conf.timeline_dir(&zttid); - info!( - "creating timeline dir {}, create is {:?}", - dir.display(), - create - ); - fs::create_dir_all(dir)?; - } + let shared_state = + SharedState::restore(conf, &zttid).context("failed to restore shared state"); - let shared_state = SharedState::create_restore(conf, &zttid, create) - .context("failed to restore shared state")?; + let shared_state = match shared_state { + Ok(shared_state) => shared_state, + Err(error) => { + // TODO: always create timeline explicitly + if error + .root_cause() + .to_string() + .contains("No such file or directory") + && create + { + return GlobalTimelines::create_internal( + timelines, + conf, + zttid, + vec![], + ); + } else { + return Err(error); + } + } + }; let new_tli = Arc::new(Timeline::new(zttid, shared_state)); timelines.insert(zttid, Arc::clone(&new_tli)); diff --git a/walkeeper/src/wal_storage.rs b/walkeeper/src/wal_storage.rs index 73eccd0ae8..7cef525bee 100644 --- a/walkeeper/src/wal_storage.rs +++ b/walkeeper/src/wal_storage.rs @@ -301,7 +301,8 @@ impl Storage for PhysicalStorage { /// allows to postpone its initialization. fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()> { if state.server.wal_seg_size == 0 { - // wal_seg_size is still unknown + // wal_seg_size is still unknown. This is dead path normally, should + // be used only in tests. return Ok(()); } @@ -315,9 +316,13 @@ impl Storage for PhysicalStorage { let wal_seg_size = state.server.wal_seg_size as usize; self.wal_seg_size = Some(wal_seg_size); - // we need to read WAL from disk to know which LSNs are stored on disk - self.write_lsn = - Lsn(find_end_of_wal(&self.timeline_dir, wal_seg_size, true, state.wal_start_lsn)?.0); + // Find out where stored WAL ends, starting at commit_lsn which is a + // known recent record boundary (unless we don't have WAL at all). 
+ self.write_lsn = if state.commit_lsn == Lsn(0) { + Lsn(0) + } else { + Lsn(find_end_of_wal(&self.timeline_dir, wal_seg_size, true, state.commit_lsn)?.0) + }; self.write_record_lsn = self.write_lsn; @@ -326,11 +331,13 @@ impl Storage for PhysicalStorage { self.update_flush_lsn(); info!( - "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, truncate_lsn={}", - self.zttid.timeline_id, self.flush_record_lsn, state.commit_lsn, state.truncate_lsn, + "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", + self.zttid.timeline_id, self.flush_record_lsn, state.commit_lsn, state.peer_horizon_lsn, ); - if self.flush_record_lsn < state.commit_lsn || self.flush_record_lsn < state.truncate_lsn { - warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or truncate_lsn from control file", self.zttid.timeline_id); + if self.flush_record_lsn < state.commit_lsn + || self.flush_record_lsn < state.peer_horizon_lsn + { + warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", self.zttid.timeline_id); } Ok(()) diff --git a/zenith/src/main.rs b/zenith/src/main.rs index a2a762f5be..dd35427d5d 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -1,4 +1,4 @@ -use anyhow::{bail, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use clap::{App, AppSettings, Arg, ArgMatches}; use control_plane::compute::ComputeControlPlane; use control_plane::local_env; @@ -9,7 +9,7 @@ use pageserver::config::defaults::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, }; -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::process::exit; use std::str::FromStr; use walkeeper::defaults::{ @@ -17,46 +17,53 @@ use walkeeper::defaults::{ DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; use zenith_utils::auth::{Claims, Scope}; +use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}; use zenith_utils::GIT_VERSION; -use pageserver::branches::BranchInfo; +use pageserver::timelines::TimelineInfo; -// Default name of a safekeeper node, if not specified on the command line. -const DEFAULT_SAFEKEEPER_NAME: &str = "single"; +// Default id of a safekeeper node, if not specified on the command line. +const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1); +const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1); +const DEFAULT_BRANCH_NAME: &str = "main"; fn default_conf() -> String { format!( r#" # Default built-in configuration, defined in main.rs [pageserver] +id = {pageserver_id} listen_pg_addr = '{pageserver_pg_addr}' listen_http_addr = '{pageserver_http_addr}' auth_type = '{pageserver_auth_type}' [[safekeepers]] -name = '{safekeeper_name}' +id = {safekeeper_id} pg_port = {safekeeper_pg_port} http_port = {safekeeper_http_port} "#, + pageserver_id = DEFAULT_PAGESERVER_ID, pageserver_pg_addr = DEFAULT_PAGESERVER_PG_ADDR, pageserver_http_addr = DEFAULT_PAGESERVER_HTTP_ADDR, pageserver_auth_type = AuthType::Trust, - safekeeper_name = DEFAULT_SAFEKEEPER_NAME, + safekeeper_id = DEFAULT_SAFEKEEPER_ID, safekeeper_pg_port = DEFAULT_SAFEKEEPER_PG_PORT, safekeeper_http_port = DEFAULT_SAFEKEEPER_HTTP_PORT, ) } /// -/// Branches tree element used as a value in the HashMap. 
+/// Timelines tree element used as a value in the HashMap. /// -struct BranchTreeEl { - /// `BranchInfo` received from the `pageserver` via the `branch_list` libpq API call. - pub info: BranchInfo, - /// Holds all direct children of this branch referenced using `timeline_id`. - pub children: Vec, +struct TimelineTreeEl { + /// `TimelineInfo` received from the `pageserver` via the `timeline_list` http API call. + pub info: TimelineInfo, + /// Name, recovered from zenith config mappings + pub name: Option, + /// Holds all direct children of this timeline referenced using `timeline_id`. + pub children: BTreeSet, } // Main entry point for the 'zenith' CLI utility @@ -67,29 +74,28 @@ struct BranchTreeEl { // * Providing CLI api to the pageserver // * TODO: export/import to/from usual postgres fn main() -> Result<()> { - #[rustfmt::skip] // rustfmt squashes these into a single line otherwise - let pg_node_arg = Arg::new("node") - .index(1) - .help("Node name") - .required(true); - - #[rustfmt::skip] - let safekeeper_node_arg = Arg::new("node") - .index(1) - .help("Node name") + let branch_name_arg = Arg::new("branch-name") + .long("branch-name") + .takes_value(true) + .help("Name of the branch to be created or used as an alias for other services") .required(false); - let timeline_arg = Arg::new("timeline") - .index(2) - .help("Branch name or a point-in time specification") - .required(false); + let pg_node_arg = Arg::new("node").help("Postgres node name").required(false); - let tenantid_arg = Arg::new("tenantid") - .long("tenantid") + let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false); + + let tenant_id_arg = Arg::new("tenant-id") + .long("tenant-id") .help("Tenant id. Represented as a hexadecimal string 32 symbols length") .takes_value(true) .required(false); + let timeline_id_arg = Arg::new("timeline-id") + .long("timeline-id") + .help("Timeline id. Represented as a hexadecimal string 32 symbols length") + .takes_value(true) + .required(false); + let port_arg = Arg::new("port") .long("port") .required(false) @@ -111,6 +117,12 @@ fn main() -> Result<()> { .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more") .required(false); + let lsn_arg = Arg::new("lsn") + .long("lsn") + .help("Specify Lsn on the timeline to start from. 
By default, end of the timeline would be used.") + .takes_value(true) + .required(false); + let matches = App::new("Zenith CLI") .setting(AppSettings::ArgRequiredElseHelp) .version(GIT_VERSION) @@ -118,6 +130,7 @@ fn main() -> Result<()> { App::new("init") .about("Initialize a new Zenith repository") .arg(pageserver_config_args.clone()) + .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) .arg( Arg::new("config") .long("config") @@ -126,17 +139,32 @@ fn main() -> Result<()> { ) ) .subcommand( - App::new("branch") - .about("Create a new branch") - .arg(Arg::new("branchname").required(false).index(1)) - .arg(Arg::new("start-point").required(false).index(2)) - .arg(tenantid_arg.clone()), + App::new("timeline") + .about("Manage timelines") + .subcommand(App::new("list") + .about("List all timelines, available to this pageserver") + .arg(tenant_id_arg.clone())) + .subcommand(App::new("branch") + .about("Create a new timeline, using another timeline as a base, copying its data") + .arg(tenant_id_arg.clone()) + .arg(branch_name_arg.clone()) + .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name").takes_value(true) + .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false)) + .arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn").takes_value(true) + .help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false))) + .subcommand(App::new("create") + .about("Create a new blank timeline") + .arg(tenant_id_arg.clone()) + .arg(branch_name_arg.clone())) ).subcommand( App::new("tenant") .setting(AppSettings::ArgRequiredElseHelp) .about("Manage tenants") .subcommand(App::new("list")) - .subcommand(App::new("create").arg(Arg::new("tenantid").required(false).index(1))) + .subcommand(App::new("create") + .arg(tenant_id_arg.clone()) + .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) + ) ) .subcommand( App::new("pageserver") @@ -154,16 +182,16 @@ fn main() -> Result<()> { .about("Manage safekeepers") .subcommand(App::new("start") .about("Start local safekeeper") - .arg(safekeeper_node_arg.clone()) + .arg(safekeeper_id_arg.clone()) ) .subcommand(App::new("stop") .about("Stop local safekeeper") - .arg(safekeeper_node_arg.clone()) + .arg(safekeeper_id_arg.clone()) .arg(stop_mode_arg.clone()) ) .subcommand(App::new("restart") .about("Restart local safekeeper") - .arg(safekeeper_node_arg.clone()) + .arg(safekeeper_id_arg.clone()) .arg(stop_mode_arg.clone()) ) ) @@ -171,12 +199,13 @@ fn main() -> Result<()> { App::new("pg") .setting(AppSettings::ArgRequiredElseHelp) .about("Manage postgres instances") - .subcommand(App::new("list").arg(tenantid_arg.clone())) + .subcommand(App::new("list").arg(tenant_id_arg.clone())) .subcommand(App::new("create") .about("Create a postgres compute node") .arg(pg_node_arg.clone()) - .arg(timeline_arg.clone()) - .arg(tenantid_arg.clone()) + .arg(branch_name_arg.clone()) + .arg(tenant_id_arg.clone()) + .arg(lsn_arg.clone()) .arg(port_arg.clone()) .arg( Arg::new("config-only") @@ -187,20 +216,21 @@ fn main() -> Result<()> { .subcommand(App::new("start") .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") .arg(pg_node_arg.clone()) - .arg(timeline_arg.clone()) - .arg(tenantid_arg.clone()) + .arg(tenant_id_arg.clone()) 
+ .arg(branch_name_arg.clone()) + .arg(timeline_id_arg.clone()) + .arg(lsn_arg.clone()) .arg(port_arg.clone())) .subcommand( App::new("stop") - .arg(pg_node_arg.clone()) - .arg(timeline_arg.clone()) - .arg(tenantid_arg.clone()) - .arg( - Arg::new("destroy") - .help("Also delete data directory (now optional, should be default in future)") - .long("destroy") - .required(false) - ) + .arg(pg_node_arg.clone()) + .arg(tenant_id_arg.clone()) + .arg( + Arg::new("destroy") + .help("Also delete data directory (now optional, should be default in future)") + .long("destroy") + .required(false) + ) ) ) @@ -222,75 +252,89 @@ fn main() -> Result<()> { }; // Check for 'zenith init' command first. - let subcmd_result = if sub_name == "init" { - handle_init(sub_args) + let subcommand_result = if sub_name == "init" { + handle_init(sub_args).map(Some) } else { // all other commands need an existing config - let env = match LocalEnv::load_config() { - Ok(conf) => conf, - Err(e) => { - eprintln!("Error loading config: {}", e); - exit(1); - } - }; + let mut env = LocalEnv::load_config().context("Error loading config")?; + let original_env = env.clone(); - match sub_name { - "tenant" => handle_tenant(sub_args, &env), - "branch" => handle_branch(sub_args, &env), + let subcommand_result = match sub_name { + "tenant" => handle_tenant(sub_args, &mut env), + "timeline" => handle_timeline(sub_args, &mut env), "start" => handle_start_all(sub_args, &env), "stop" => handle_stop_all(sub_args, &env), "pageserver" => handle_pageserver(sub_args, &env), "pg" => handle_pg(sub_args, &env), "safekeeper" => handle_safekeeper(sub_args, &env), _ => bail!("unexpected subcommand {}", sub_name), + }; + + if original_env != env { + subcommand_result.map(|()| Some(env)) + } else { + subcommand_result.map(|()| None) } }; - if let Err(e) = subcmd_result { - eprintln!("command failed: {:#}", e); - exit(1); - } + match subcommand_result { + Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?, + Ok(None) => (), + Err(e) => { + eprintln!("command failed: {:?}", e); + exit(1); + } + } Ok(()) } /// -/// Prints branches list as a tree-like structure. +/// Prints timelines list as a tree-like structure. /// -fn print_branches_tree(branches: Vec) -> Result<()> { - let mut branches_hash: HashMap = HashMap::new(); +fn print_timelines_tree( + timelines: Vec, + mut timeline_name_mappings: HashMap, +) -> Result<()> { + let mut timelines_hash = timelines + .iter() + .map(|t| { + ( + t.timeline_id(), + TimelineTreeEl { + info: t.clone(), + children: BTreeSet::new(), + name: timeline_name_mappings + .remove(&ZTenantTimelineId::new(t.tenant_id(), t.timeline_id())), + }, + ) + }) + .collect::>(); - // Form a hash table of branch timeline_id -> BranchTreeEl. - for branch in &branches { - branches_hash.insert( - branch.timeline_id.to_string(), - BranchTreeEl { - info: branch.clone(), - children: Vec::new(), - }, - ); - } - - // Memorize all direct children of each branch. - for branch in &branches { - if let Some(tid) = &branch.ancestor_id { - branches_hash + // Memorize all direct children of each timeline. + for timeline in &timelines { + if let TimelineInfo::Local { + ancestor_timeline_id: Some(tid), + .. + } = timeline + { + timelines_hash .get_mut(tid) - .context("missing branch info in the HashMap")? + .context("missing timeline info in the HashMap")? .children - .push(branch.timeline_id.to_string()); + .insert(timeline.timeline_id()); } } - // Sort children by tid to bring some minimal order. 
- for branch in &mut branches_hash.values_mut() { - branch.children.sort(); - } - - for branch in branches_hash.values() { - // Start with root branches (no ancestors) first. - // Now there is 'main' branch only, but things may change. - if branch.info.ancestor_id.is_none() { - print_branch(0, &Vec::from([true]), branch, &branches_hash)?; + for timeline in timelines_hash.values() { + // Start with root local timelines (no ancestors) first. + if let TimelineInfo::Local { + ancestor_timeline_id, + .. + } = &timeline.info + { + if ancestor_timeline_id.is_none() { + print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?; + } } } @@ -298,27 +342,32 @@ fn print_branches_tree(branches: Vec) -> Result<()> { } /// -/// Recursively prints branch info with all its children. +/// Recursively prints timeline info with all its children. /// -fn print_branch( +fn print_timeline( nesting_level: usize, is_last: &[bool], - branch: &BranchTreeEl, - branches: &HashMap, + timeline: &TimelineTreeEl, + timelines: &HashMap, ) -> Result<()> { + let local_or_remote = match timeline.info { + TimelineInfo::Local { .. } => "(L)", + TimelineInfo::Remote { .. } => "(R)", + }; // Draw main padding - print!(" "); + print!("{} ", local_or_remote); if nesting_level > 0 { - let lsn = branch - .info - .ancestor_lsn - .as_ref() - .context("missing branch info in the HashMap")?; + let lsn_string = match &timeline.info { + TimelineInfo::Local { ancestor_lsn, .. } => ancestor_lsn + .map(|lsn| lsn.to_string()) + .unwrap_or_else(|| "Unknown local Lsn".to_string()), + TimelineInfo::Remote { .. } => "unknown Lsn (remote)".to_string(), + }; let mut br_sym = "┣━"; // Draw each nesting padding with proper style - // depending on whether its branch ended or not. + // depending on whether its timeline ended or not. if nesting_level > 1 { for l in &is_last[1..is_last.len() - 1] { if *l { @@ -329,73 +378,92 @@ fn print_branch( } } - // We are the last in this sub-branch + // We are the last in this sub-timeline if *is_last.last().unwrap() { br_sym = "┗━"; } - print!("{} @{}: ", br_sym, lsn); + print!("{} @{}: ", br_sym, lsn_string); } - // Finally print a branch name with new line - println!("{}", branch.info.name); + // Finally print a timeline id and name with new line + println!( + "{} [{}]", + timeline.name.as_deref().unwrap_or("_no_name_"), + timeline.info.timeline_id() + ); - let len = branch.children.len(); + let len = timeline.children.len(); let mut i: usize = 0; let mut is_last_new = Vec::from(is_last); is_last_new.push(false); - for child in &branch.children { + for child in &timeline.children { i += 1; - // Mark that the last padding is the end of the branch + // Mark that the last padding is the end of the timeline if i == len { if let Some(last) = is_last_new.last_mut() { *last = true; } } - print_branch( + print_timeline( nesting_level + 1, &is_last_new, - branches + timelines .get(child) - .context("missing branch info in the HashMap")?, - branches, + .context("missing timeline info in the HashMap")?, + timelines, )?; } Ok(()) } -/// Returns a map of timeline IDs to branch_name@lsn strings. +/// Returns a map of timeline IDs to timeline_id@lsn strings. /// Connects to the pageserver to query this information. 
-fn get_branch_infos( +fn get_timeline_infos( env: &local_env::LocalEnv, - tenantid: &ZTenantId, -) -> Result> { - let page_server = PageServerNode::from_env(env); - let branch_infos: Vec = page_server.branch_list(tenantid)?; - let branch_infos: HashMap = branch_infos + tenant_id: &ZTenantId, +) -> Result> { + Ok(PageServerNode::from_env(env) + .timeline_list(tenant_id)? .into_iter() - .map(|branch_info| (branch_info.timeline_id, branch_info)) - .collect(); - - Ok(branch_infos) + .map(|timeline_info| (timeline_info.timeline_id(), timeline_info)) + .collect()) } -// Helper function to parse --tenantid option, or get the default from config file -fn get_tenantid(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result { - if let Some(tenantid_cmd) = sub_match.value_of("tenantid") { - Ok(ZTenantId::from_str(tenantid_cmd)?) - } else if let Some(tenantid_conf) = env.default_tenantid { - Ok(tenantid_conf) +// Helper function to parse --tenant_id option, or get the default from config file +fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result { + if let Some(tenant_id_from_arguments) = parse_tenant_id(sub_match).transpose() { + tenant_id_from_arguments + } else if let Some(tenantid_conf) = env.default_tenant_id { + Ok(ZTenantId::from(tenantid_conf)) } else { - bail!("No tenantid. Use --tenantid, or set 'default_tenantid' in the config file"); + bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file"); } } -fn handle_init(init_match: &ArgMatches) -> Result<()> { +fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { + sub_match + .value_of("tenant-id") + .map(ZTenantId::from_str) + .transpose() + .context("Failed to parse tenant id from the argument string") +} + +fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { + sub_match + .value_of("timeline-id") + .map(ZTimelineId::from_str) + .transpose() + .context("Failed to parse timeline id from the argument string") +} + +fn handle_init(init_match: &ArgMatches) -> Result { + let initial_timeline_id_arg = parse_timeline_id(init_match)?; + // Create config file let toml_file: String = if let Some(config_path) = init_match.value_of("config") { // load and parse the file @@ -411,18 +479,29 @@ fn handle_init(init_match: &ArgMatches) -> Result<()> { env.init() .context("Failed to initialize zenith repository")?; + // default_tenantid was generated by the `env.init()` call above + let initial_tenant_id = ZTenantId::from(env.default_tenant_id.unwrap()); + // Call 'pageserver init'. 
let pageserver = PageServerNode::from_env(&env); - if let Err(e) = pageserver.init( - // default_tenantid was generated by the `env.init()` call above - Some(&env.default_tenantid.unwrap().to_string()), - &pageserver_config_overrides(init_match), - ) { - eprintln!("pageserver init failed: {}", e); - exit(1); - } + let initial_timeline_id = pageserver + .init( + Some(initial_tenant_id), + initial_timeline_id_arg, + &pageserver_config_overrides(init_match), + ) + .unwrap_or_else(|e| { + eprintln!("pageserver init failed: {}", e); + exit(1); + }); - Ok(()) + env.register_branch_mapping( + DEFAULT_BRANCH_NAME.to_owned(), + initial_tenant_id, + initial_timeline_id, + )?; + + Ok(env) } fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { @@ -433,7 +512,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { .collect() } -fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { let pageserver = PageServerNode::from_env(env); match tenant_match.subcommand() { Some(("list", _)) => { @@ -442,13 +521,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result } } Some(("create", create_match)) => { - let tenantid = match create_match.value_of("tenantid") { - Some(tenantid) => ZTenantId::from_str(tenantid)?, - None => ZTenantId::generate(), - }; - println!("using tenant id {}", tenantid); - pageserver.tenant_create(tenantid)?; - println!("tenant successfully created on the pageserver"); + let initial_tenant_id = parse_tenant_id(create_match)?; + let new_tenant_id = pageserver + .tenant_create(initial_tenant_id)? + .ok_or_else(|| { + anyhow!("Tenant with id {:?} was already created", initial_tenant_id) + })?; + println!( + "tenant {} successfully created on the pageserver", + new_tenant_id + ); } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), None => bail!("no tenant subcommand provided"), @@ -456,24 +538,94 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result Ok(()) } -fn handle_branch(branch_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { let pageserver = PageServerNode::from_env(env); - let tenantid = get_tenantid(branch_match, env)?; + match timeline_match.subcommand() { + Some(("list", list_match)) => { + let tenant_id = get_tenant_id(list_match, env)?; + let timelines = pageserver.timeline_list(&tenant_id)?; + print_timelines_tree(timelines, env.timeline_name_mappings())?; + } + Some(("create", create_match)) => { + let tenant_id = get_tenant_id(create_match, env)?; + let new_branch_name = create_match + .value_of("branch-name") + .ok_or(anyhow!("No branch name provided"))?; + let timeline = pageserver + .timeline_create(tenant_id, None, None, None)? 
+ .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; + let new_timeline_id = timeline.timeline_id(); - if let Some(branchname) = branch_match.value_of("branchname") { - let startpoint_str = branch_match - .value_of("start-point") - .context("Missing start-point")?; - let branch = pageserver.branch_create(branchname, startpoint_str, &tenantid)?; - println!( - "Created branch '{}' at {:?} for tenant: {}", - branch.name, branch.latest_valid_lsn, tenantid, - ); - } else { - // No arguments, list branches for tenant - let branches = pageserver.branch_list(&tenantid)?; - print_branches_tree(branches)?; + let last_record_lsn = match timeline { + TimelineInfo::Local { + last_record_lsn, .. + } => last_record_lsn, + TimelineInfo::Remote { .. } => { + bail!( + "Timeline {} was created as remote, not local", + new_timeline_id + ) + } + }; + env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; + + println!( + "Created timeline '{}' at Lsn {} for tenant: {}", + timeline.timeline_id(), + last_record_lsn, + tenant_id, + ); + } + Some(("branch", branch_match)) => { + let tenant_id = get_tenant_id(branch_match, env)?; + let new_branch_name = branch_match + .value_of("branch-name") + .ok_or(anyhow!("No branch name provided"))?; + let ancestor_branch_name = branch_match + .value_of("ancestor-branch-name") + .unwrap_or(DEFAULT_BRANCH_NAME); + let ancestor_timeline_id = env + .get_branch_timeline_id(ancestor_branch_name, tenant_id) + .ok_or_else(|| { + anyhow!( + "Found no timeline id for branch name '{}'", + ancestor_branch_name + ) + })?; + + let start_lsn = branch_match + .value_of("ancestor-start-lsn") + .map(Lsn::from_str) + .transpose() + .context("Failed to parse ancestor start Lsn from the request")?; + let timeline = pageserver + .timeline_create(tenant_id, None, start_lsn, Some(ancestor_timeline_id))? + .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; + let new_timeline_id = timeline.timeline_id(); + + let last_record_lsn = match timeline { + TimelineInfo::Local { + last_record_lsn, .. + } => last_record_lsn, + TimelineInfo::Remote { .. } => bail!( + "Timeline {} was created as remote, not local", + new_timeline_id + ), + }; + + env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; + + println!( + "Created timeline '{}' at Lsn {} for tenant: {}. 
Ancestor timeline: '{}'", + timeline.timeline_id(), + last_record_lsn, + tenant_id, + ancestor_branch_name, + ); + } + Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), + None => bail!("no tenant subcommand provided"), } Ok(()) @@ -487,63 +639,90 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let mut cplane = ComputeControlPlane::load(env.clone())?; - // All subcommands take an optional --tenantid option - let tenantid = get_tenantid(sub_args, env)?; + // All subcommands take an optional --tenant-id option + let tenant_id = get_tenant_id(sub_args, env)?; match sub_name { "list" => { - let branch_infos = get_branch_infos(env, &tenantid).unwrap_or_else(|e| { - eprintln!("Failed to load branch info: {}", e); + let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| { + eprintln!("Failed to load timeline info: {}", e); HashMap::new() }); - println!("NODE\tADDRESS\t\tBRANCH\tLSN\t\tSTATUS"); + let timeline_name_mappings = env.timeline_name_mappings(); + + println!("NODE\tADDRESS\tTIMELINE\tBRANCH NAME\tLSN\t\tSTATUS"); for ((_, node_name), node) in cplane .nodes .iter() - .filter(|((node_tenantid, _), _)| node_tenantid == &tenantid) + .filter(|((node_tenant_id, _), _)| node_tenant_id == &tenant_id) { // FIXME: This shows the LSN at the end of the timeline. It's not the // right thing to do for read-only nodes that might be anchored at an // older point in time, or following but lagging behind the primary. - let lsn_str = branch_infos - .get(&node.timelineid) - .map(|bi| bi.latest_valid_lsn.to_string()) - .unwrap_or_else(|| "?".to_string()); + let lsn_str = timeline_infos + .get(&node.timeline_id) + .map(|bi| match bi { + TimelineInfo::Local { + last_record_lsn, .. + } => last_record_lsn.to_string(), + TimelineInfo::Remote { .. } => "? 
(remote)".to_string(), + }) + .unwrap_or_else(|| '?'.to_string()); + + let branch_name = timeline_name_mappings + .get(&ZTenantTimelineId::new(tenant_id, node.timeline_id)) + .map(|name| name.as_str()) + .unwrap_or("?"); println!( - "{}\t{}\t{}\t{}\t{}", + "{}\t{}\t{}\t{}\t{}\t{}", node_name, node.address, - node.timelineid, // FIXME: resolve human-friendly branch name + node.timeline_id, + branch_name, lsn_str, node.status(), ); } } "create" => { - let node_name = sub_args.value_of("node").unwrap_or("main"); - let timeline_name = sub_args.value_of("timeline").unwrap_or(node_name); + let branch_name = sub_args + .value_of("branch-name") + .unwrap_or(DEFAULT_BRANCH_NAME); + let node_name = sub_args + .value_of("node") + .map(ToString::to_string) + .unwrap_or_else(|| format!("{}_node", branch_name)); + + let lsn = sub_args + .value_of("lsn") + .map(Lsn::from_str) + .transpose() + .context("Failed to parse Lsn from the request")?; + let timeline_id = env + .get_branch_timeline_id(branch_name, tenant_id) + .ok_or_else(|| anyhow!("Found no timeline id for branch name '{}'", branch_name))?; let port: Option = match sub_args.value_of("port") { Some(p) => Some(p.parse()?), None => None, }; - cplane.new_node(tenantid, node_name, timeline_name, port)?; + cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port)?; } "start" => { - let node_name = sub_args.value_of("node").unwrap_or("main"); - let timeline_name = sub_args.value_of("timeline"); - let port: Option = match sub_args.value_of("port") { Some(p) => Some(p.parse()?), None => None, }; + let node_name = sub_args + .value_of("node") + .ok_or_else(|| anyhow!("No node name was provided to start"))?; - let node = cplane.nodes.get(&(tenantid, node_name.to_owned())); + let node = cplane.nodes.get(&(tenant_id, node_name.to_owned())); let auth_token = if matches!(env.pageserver.auth_type, AuthType::ZenithJWT) { - let claims = Claims::new(Some(tenantid), Scope::Tenant); + let claims = Claims::new(Some(tenant_id), Scope::Tenant); Some(env.generate_auth_token(&claims)?) 
} else { @@ -551,40 +730,49 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { }; if let Some(node) = node { - if timeline_name.is_some() { - println!("timeline name ignored because node exists already"); - } println!("Starting existing postgres {}...", node_name); node.start(&auth_token)?; } else { + let branch_name = sub_args + .value_of("branch-name") + .unwrap_or(DEFAULT_BRANCH_NAME); + let timeline_id = env + .get_branch_timeline_id(branch_name, tenant_id) + .ok_or_else(|| { + anyhow!("Found no timeline id for branch name '{}'", branch_name) + })?; + let lsn = sub_args + .value_of("lsn") + .map(Lsn::from_str) + .transpose() + .context("Failed to parse Lsn from the request")?; // when used with custom port this results in non obvious behaviour // port is remembered from first start command, i e // start --port X // stop // start <-- will also use port X even without explicit port argument - let timeline_name = timeline_name.unwrap_or(node_name); println!( - "Starting new postgres {} on {}...", - node_name, timeline_name + "Starting new postgres {} on timeline {} ...", + node_name, timeline_id ); - let node = cplane.new_node(tenantid, node_name, timeline_name, port)?; + let node = cplane.new_node(tenant_id, node_name, timeline_id, lsn, port)?; node.start(&auth_token)?; } } "stop" => { - let node_name = sub_args.value_of("node").unwrap_or("main"); + let node_name = sub_args + .value_of("node") + .ok_or_else(|| anyhow!("No node name was provided to stop"))?; let destroy = sub_args.is_present("destroy"); let node = cplane .nodes - .get(&(tenantid, node_name.to_owned())) + .get(&(tenant_id, node_name.to_owned())) .with_context(|| format!("postgres {} is not found", node_name))?; node.stop(destroy)?; } - _ => { - bail!("Unexpected pg subcommand '{}'", sub_name) - } + _ => bail!("Unexpected pg subcommand '{}'", sub_name), } Ok(()) @@ -628,11 +816,11 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn get_safekeeper(env: &local_env::LocalEnv, name: &str) -> Result { - if let Some(node) = env.safekeepers.iter().find(|node| node.name == name) { +fn get_safekeeper(env: &local_env::LocalEnv, id: ZNodeId) -> Result { + if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) { Ok(SafekeeperNode::from_env(env, node)) } else { - bail!("could not find safekeeper '{}'", name) + bail!("could not find safekeeper '{}'", id) } } @@ -643,8 +831,12 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul }; // All the commands take an optional safekeeper name argument - let node_name = sub_args.value_of("node").unwrap_or(DEFAULT_SAFEKEEPER_NAME); - let safekeeper = get_safekeeper(env, node_name)?; + let sk_id = if let Some(id_str) = sub_args.value_of("id") { + ZNodeId(id_str.parse().context("while parsing safekeeper id")?) 
+ } else { + DEFAULT_SAFEKEEPER_ID + }; + let safekeeper = get_safekeeper(env, sk_id)?; match sub_name { "start" => { @@ -697,7 +889,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.start() { - eprintln!("safekeeper '{}' start failed: {}", safekeeper.name, e); + eprintln!("safekeeper '{}' start failed: {}", safekeeper.id, e); exit(1); } } @@ -724,7 +916,7 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result< for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.stop(immediate) { - eprintln!("safekeeper '{}' stop failed: {}", safekeeper.name, e); + eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e); } } Ok(()) diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml index b22fcbf748..daaf345f8f 100644 --- a/zenith_utils/Cargo.toml +++ b/zenith_utils/Cargo.toml @@ -37,3 +37,8 @@ bytes = "1.0.1" hex-literal = "0.3" tempfile = "3.2" webpki = "0.21" +criterion = "0.3" + +[[bench]] +name = "benchmarks" +harness = false diff --git a/zenith_utils/benches/benchmarks.rs b/zenith_utils/benches/benchmarks.rs new file mode 100644 index 0000000000..c945d5021c --- /dev/null +++ b/zenith_utils/benches/benchmarks.rs @@ -0,0 +1,22 @@ +#![allow(unused)] + +use criterion::{criterion_group, criterion_main, Criterion}; +use zenith_utils::zid; + +pub fn bench_zid_stringify(c: &mut Criterion) { + // Can only use public methods. + let ztl = zid::ZTenantTimelineId::generate(); + + c.bench_function("zid.to_string", |b| { + b.iter(|| { + // FIXME measurement overhead? + //for _ in 0..1000 { + // ztl.tenant_id.to_string(); + //} + ztl.tenant_id.to_string(); + }) + }); +} + +criterion_group!(benches, bench_zid_stringify); +criterion_main!(benches); diff --git a/zenith_utils/src/auth.rs b/zenith_utils/src/auth.rs index 274dd13bee..cbc4fcee61 100644 --- a/zenith_utils/src/auth.rs +++ b/zenith_utils/src/auth.rs @@ -5,9 +5,7 @@ // The second one is that we wanted to use ed25519 keys, but they are also not supported until next version. So we go with RSA keys for now. 
// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162 -use hex::{self, FromHex}; -use serde::de::Error; -use serde::{self, Deserializer, Serializer}; +use serde; use std::fs; use std::path::Path; @@ -17,7 +15,7 @@ use jsonwebtoken::{ }; use serde::{Deserialize, Serialize}; -use crate::zid::ZTenantId; +use crate::zid::{HexZTenantId, ZTenantId}; const JWT_ALGORITHM: Algorithm = Algorithm::RS256; @@ -28,44 +26,18 @@ pub enum Scope { PageServerApi, } -pub fn to_hex_option(value: &Option, serializer: S) -> Result -where - S: Serializer, -{ - match value { - Some(tid) => hex::serialize(tid, serializer), - None => Option::serialize(value, serializer), - } -} - -fn from_hex_option<'de, D>(deserializer: D) -> Result, D::Error> -where - D: Deserializer<'de>, -{ - let opt: Option = Option::deserialize(deserializer)?; - match opt { - Some(tid) => Ok(Some(ZTenantId::from_hex(tid).map_err(Error::custom)?)), - None => Ok(None), - } -} - #[derive(Debug, Serialize, Deserialize, Clone)] pub struct Claims { - // this custom serialize/deserialize_with is needed because Option is not transparent to serde - // so clearest option is serde(with = "hex") but it is not working, for details see https://github.com/serde-rs/serde/issues/1301 - #[serde( - default, - skip_serializing_if = "Option::is_none", - serialize_with = "to_hex_option", - deserialize_with = "from_hex_option" - )] - pub tenant_id: Option, + pub tenant_id: Option, pub scope: Scope, } impl Claims { pub fn new(tenant_id: Option, scope: Scope) -> Self { - Self { tenant_id, scope } + Self { + tenant_id: tenant_id.map(HexZTenantId::from), + scope, + } } } @@ -75,7 +47,7 @@ pub fn check_permission(claims: &Claims, tenantid: Option) -> Result< bail!("Attempt to access management api with tenant scope. Permission denied") } (Scope::Tenant, Some(tenantid)) => { - if claims.tenant_id.unwrap() != tenantid { + if ZTenantId::from(claims.tenant_id.unwrap()) != tenantid { bail!("Tenant id mismatch. Permission denied") } Ok(()) diff --git a/zenith_utils/src/zid.rs b/zenith_utils/src/zid.rs index 2e93ab596c..e047e38da7 100644 --- a/zenith_utils/src/zid.rs +++ b/zenith_utils/src/zid.rs @@ -2,13 +2,100 @@ use std::{fmt, str::FromStr}; use hex::FromHex; use rand::Rng; -use serde::{Deserialize, Serialize}; +use serde::{ + de::{self, Visitor}, + Deserialize, Serialize, +}; -// Zenith ID is a 128-bit random ID. -// Used to represent various identifiers. Provides handy utility methods and impls. +macro_rules! mutual_from { + ($id1:ident, $id2:ident) => { + impl From<$id1> for $id2 { + fn from(id1: $id1) -> Self { + Self(id1.0.into()) + } + } + + impl From<$id2> for $id1 { + fn from(id2: $id2) -> Self { + Self(id2.0.into()) + } + } + }; +} + +/// Zenith ID is a 128-bit random ID. +/// Used to represent various identifiers. Provides handy utility methods and impls. +/// +/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look +/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. +/// Use [`HexZId`] to serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] struct ZId([u8; 16]); +/// [`ZId`] version that serializes and deserializes as a hex string. +/// Useful for various json serializations, where hex byte array from original id is not convenient. +/// +/// Plain `ZId` could be (de)serialized into hex string with `#[serde(with = "hex")]` attribute. 
+/// This however won't work on nested types like `Option` or `Vec`, see https://github.com/serde-rs/serde/issues/723 for the details. +/// Every separate type currently needs a new (de)serializing method for every type separately. +/// +/// To provide a generic way to serialize the ZId as a hex string where `#[serde(with = "hex")]` is not enough, this wrapper is created. +/// The default wrapper serialization is left unchanged due to +/// * byte array (de)serialization being faster and simpler +/// * byte deserialization being used in Safekeeper already, with those bytes coming from compute (see `ProposerGreeting` in safekeeper) +/// * current `HexZId`'s deserialization impl breaks on compute byte array deserialization, having it by default is dangerous +#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +struct HexZId([u8; 16]); + +impl Serialize for HexZId { + fn serialize(&self, ser: S) -> Result + where + S: serde::Serializer, + { + hex::encode(self.0).serialize(ser) + } +} + +impl<'de> Deserialize<'de> for HexZId { + fn deserialize(de: D) -> Result + where + D: serde::Deserializer<'de>, + { + de.deserialize_bytes(HexVisitor) + } +} + +struct HexVisitor; + +impl<'de> Visitor<'de> for HexVisitor { + type Value = HexZId; + + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "A hexadecimal representation of a 128-bit random Zenith ID" + ) + } + + fn visit_bytes(self, hex_bytes: &[u8]) -> Result + where + E: de::Error, + { + ZId::from_hex(hex_bytes) + .map(HexZId::from) + .map_err(de::Error::custom) + } + + fn visit_str(self, hex_bytes_str: &str) -> Result + where + E: de::Error, + { + Self::visit_bytes(self, hex_bytes_str.as_bytes()) + } +} + +mutual_from!(ZId, HexZId); + impl ZId { pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZId { let mut arr = [0u8; 16]; @@ -25,6 +112,17 @@ impl ZId { rand::thread_rng().fill(&mut tli_buf); ZId::from(tli_buf) } + + fn hex_encode(&self) -> String { + static HEX: &[u8] = b"0123456789abcdef"; + + let mut buf = vec![0u8; self.0.len() * 2]; + for (&b, chunk) in self.0.as_ref().iter().zip(buf.chunks_exact_mut(2)) { + chunk[0] = HEX[((b >> 4) & 0xf) as usize]; + chunk[1] = HEX[(b & 0xf) as usize]; + } + unsafe { String::from_utf8_unchecked(buf) } + } } impl FromStr for ZId { @@ -60,13 +158,13 @@ impl From<[u8; 16]> for ZId { impl fmt::Display for ZId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(&hex::encode(self.0)) + f.write_str(&self.hex_encode()) } } impl fmt::Debug for ZId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(&hex::encode(self.0)) + f.write_str(&self.hex_encode()) } } @@ -155,46 +253,80 @@ macro_rules! zid_newtype { /// is separate from PostgreSQL timelines, and doesn't have those /// limitations. A zenith timeline is identified by a 128-bit ID, which /// is usually printed out as a hex string. +/// +/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look +/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. +/// Use [`HexZTimelineId`] to serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. #[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] pub struct ZTimelineId(ZId); -zid_newtype!(ZTimelineId); +/// A [`ZTimelineId`] version that gets (de)serialized as a hex string. +/// Use in complex types, where `#[serde(with = "hex")]` does not work. +/// See [`HexZId`] for more details. 
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] +pub struct HexZTimelineId(HexZId); -// Zenith Tenant Id represents identifiar of a particular tenant. -// Is used for distinguishing requests and data belonging to different users. +impl std::fmt::Debug for HexZTimelineId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + ZTimelineId::from(*self).fmt(f) + } +} + +impl std::fmt::Display for HexZTimelineId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + ZTimelineId::from(*self).fmt(f) + } +} + +impl FromStr for HexZTimelineId { + type Err = ::Err; + + fn from_str(s: &str) -> Result { + Ok(HexZTimelineId::from(ZTimelineId::from_str(s)?)) + } +} + +zid_newtype!(ZTimelineId); +mutual_from!(ZTimelineId, HexZTimelineId); + +/// Zenith Tenant Id represents identifiar of a particular tenant. +/// Is used for distinguishing requests and data belonging to different users. +/// +/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look +/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. +/// Use [`HexZTenantId`] to serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] pub struct ZTenantId(ZId); -zid_newtype!(ZTenantId); +/// A [`ZTenantId`] version that gets (de)serialized as a hex string. +/// Use in complex types, where `#[serde(with = "hex")]` does not work. +/// See [`HexZId`] for more details. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] +pub struct HexZTenantId(HexZId); -/// Serde routines for Option (de)serialization, using `T:Display` representations for inner values. -/// Useful for Option and Option to get their hex representations into serialized string and deserialize them back. -pub mod opt_display_serde { - use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; - use std::{fmt::Display, str::FromStr}; - - pub fn serialize(id: &Option, ser: S) -> Result - where - S: Serializer, - Id: Display, - { - id.as_ref().map(ToString::to_string).serialize(ser) - } - - pub fn deserialize<'de, D, Id>(des: D) -> Result, D::Error> - where - D: Deserializer<'de>, - Id: FromStr, - ::Err: Display, - { - Ok(if let Some(s) = Option::::deserialize(des)? { - Some(Id::from_str(&s).map_err(de::Error::custom)?) - } else { - None - }) +impl std::fmt::Debug for HexZTenantId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + ZTenantId::from(*self).fmt(f) } } +impl std::fmt::Display for HexZTenantId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + ZTenantId::from(*self).fmt(f) + } +} + +impl FromStr for HexZTenantId { + type Err = ::Err; + + fn from_str(s: &str) -> Result { + Ok(HexZTenantId::from(ZTenantId::from_str(s)?)) + } +} + +zid_newtype!(ZTenantId); +mutual_from!(ZTenantId, HexZTenantId); + // A pair uniquely identifying Zenith instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)] pub struct ZTenantTimelineId { @@ -213,6 +345,10 @@ impl ZTenantTimelineId { pub fn generate() -> Self { Self::new(ZTenantId::generate(), ZTimelineId::generate()) } + + pub fn empty() -> Self { + Self::new(ZTenantId::from([0u8; 16]), ZTimelineId::from([0u8; 16])) + } } impl fmt::Display for ZTenantTimelineId { @@ -221,6 +357,18 @@ impl fmt::Display for ZTenantTimelineId { } } +// Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued +// by the console. 
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Debug, Serialize, Deserialize)] +#[serde(transparent)] +pub struct ZNodeId(pub u64); + +impl fmt::Display for ZNodeId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + #[cfg(test)] mod tests { use std::fmt::Display; @@ -231,16 +379,15 @@ mod tests { #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] struct TestStruct + Display> { - #[serde(with = "opt_display_serde")] field: Option, } #[test] fn test_hex_serializations_tenant_id() { let original_struct = TestStruct { - field: Some(ZTenantId::from_array(hex!( + field: Some(HexZTenantId::from(ZTenantId::from_array(hex!( "11223344556677881122334455667788" - ))), + )))), }; let serialized_string = serde_json::to_string(&original_struct).unwrap(); @@ -249,7 +396,7 @@ mod tests { r#"{"field":"11223344556677881122334455667788"}"# ); - let deserialized_struct: TestStruct = + let deserialized_struct: TestStruct = serde_json::from_str(&serialized_string).unwrap(); assert_eq!(original_struct, deserialized_struct); } @@ -257,9 +404,9 @@ mod tests { #[test] fn test_hex_serializations_timeline_id() { let original_struct = TestStruct { - field: Some(ZTimelineId::from_array(hex!( + field: Some(HexZTimelineId::from(ZTimelineId::from_array(hex!( "AA223344556677881122334455667788" - ))), + )))), }; let serialized_string = serde_json::to_string(&original_struct).unwrap(); @@ -268,7 +415,7 @@ mod tests { r#"{"field":"aa223344556677881122334455667788"}"# ); - let deserialized_struct: TestStruct = + let deserialized_struct: TestStruct = serde_json::from_str(&serialized_string).unwrap(); assert_eq!(original_struct, deserialized_struct); } From 1fddb0556f9e3dea86857f62fdc42b2d0db3d6d0 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Thu, 17 Mar 2022 00:01:17 +0300 Subject: [PATCH 0038/1022] deploy playbook fix - interaction with console (#1374) --- .circleci/ansible/deploy.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index c95524a8a5..2dd109f99a 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -119,7 +119,7 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID + curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID tags: - pageserver @@ -169,6 +169,6 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID tags: - safekeeper From a7544eead59b4039ce18fcfc226b9e175f6521ed Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Thu, 17 Mar 2022 16:46:58 +0300 Subject: [PATCH 0039/1022] Remove the last non-borrowed string from `BeMessage` (#1376) --- proxy/src/auth.rs | 4 ++-- zenith_utils/src/pq_proto.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 5e6357fe80..e8fe65c081 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -200,7 +200,7 @@ async fn handle_new_user( client .write_message_noflush(&Be::AuthenticationOk)? 
.write_message_noflush(&BeParameterStatusMessage::encoding())? - .write_message(&Be::NoticeResponse(greeting)) + .write_message(&Be::NoticeResponse(&greeting)) .await?; // Wait for web console response (see `mgmt`) @@ -208,7 +208,7 @@ async fn handle_new_user( }) .await?; - client.write_message_noflush(&Be::NoticeResponse("Connecting to database.".into()))?; + client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; Ok(db_info) } diff --git a/zenith_utils/src/pq_proto.rs b/zenith_utils/src/pq_proto.rs index 355b38fc95..cb69418c07 100644 --- a/zenith_utils/src/pq_proto.rs +++ b/zenith_utils/src/pq_proto.rs @@ -425,7 +425,7 @@ pub enum BeMessage<'a> { ReadyForQuery, RowDescription(&'a [RowDescriptor<'a>]), XLogData(XLogDataBody<'a>), - NoticeResponse(String), + NoticeResponse(&'a str), KeepAlive(WalSndKeepAlive), } From 7738254f83c86e46795b34db834d18af97197d8d Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 17 Mar 2022 13:21:00 +0400 Subject: [PATCH 0040/1022] refactor timeline memory state management --- control_plane/src/storage.rs | 16 +- pageserver/src/bin/pageserver.rs | 49 +- pageserver/src/http/models.rs | 96 +++- pageserver/src/http/routes.rs | 188 +++++-- pageserver/src/layered_repository.rs | 471 ++++++++---------- pageserver/src/page_service.rs | 16 +- pageserver/src/remote_storage.rs | 37 +- pageserver/src/remote_storage/storage_sync.rs | 274 +++++----- .../remote_storage/storage_sync/download.rs | 73 +-- .../src/remote_storage/storage_sync/index.rs | 126 ++++- .../src/remote_storage/storage_sync/upload.rs | 110 ++-- pageserver/src/repository.rs | 257 ++++++---- pageserver/src/tenant_mgr.rs | 169 +++---- pageserver/src/timelines.rs | 348 +++++++------ pageserver/src/walreceiver.rs | 56 ++- .../batch_others/test_remote_storage.py | 39 +- .../batch_others/test_tenant_relocation.py | 81 ++- test_runner/fixtures/zenith_fixtures.py | 89 ++++ zenith/src/main.rs | 105 ++-- zenith_utils/src/http/error.rs | 6 + 20 files changed, 1484 insertions(+), 1122 deletions(-) diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index f6b7173067..ef43ba3c1e 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,4 +1,3 @@ -use std::convert::TryFrom; use std::io::Write; use std::net::TcpStream; use std::path::PathBuf; @@ -10,7 +9,7 @@ use anyhow::{bail, Context}; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; -use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse}; +use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest}; use pageserver::timelines::TimelineInfo; use postgres::{Config, NoTls}; use reqwest::blocking::{Client, RequestBuilder, Response}; @@ -358,7 +357,7 @@ impl PageServerNode { } pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result> { - let timeline_infos: Vec = self + let timeline_infos: Vec = self .http_request( Method::GET, format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), @@ -367,10 +366,7 @@ impl PageServerNode { .error_from_body()? .json()?; - timeline_infos - .into_iter() - .map(TimelineInfo::try_from) - .collect() + Ok(timeline_infos) } pub fn timeline_create( @@ -392,10 +388,8 @@ impl PageServerNode { }) .send()? .error_from_body()? 
- .json::>()?; + .json::>()?; - timeline_info_response - .map(TimelineInfo::try_from) - .transpose() + Ok(timeline_info_response) } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d37ba0cece..05fb14daca 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -18,7 +18,10 @@ use daemonize::Daemonize; use pageserver::{ config::{defaults::*, PageServerConf}, - http, page_cache, page_service, remote_storage, tenant_mgr, thread_mgr, + http, page_cache, page_service, + remote_storage::{self, SyncStartupData}, + repository::TimelineSyncStatusUpdate, + tenant_mgr, thread_mgr, thread_mgr::ThreadKind, timelines, virtual_file, LOG_FILE_NAME, }; @@ -227,11 +230,47 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() } let signals = signals::install_shutdown_handlers()?; - let sync_startup = remote_storage::start_local_timeline_sync(conf) + + // Initialize repositories with locally available timelines. + // Timelines that are only partially available locally (remote storage has more data than this pageserver) + // are scheduled for download and added to the repository once download is completed. + let SyncStartupData { + remote_index, + local_timeline_init_statuses, + } = remote_storage::start_local_timeline_sync(conf) .context("Failed to set up local files sync with external storage")?; - // Initialize tenant manager. - tenant_mgr::set_timeline_states(conf, sync_startup.initial_timeline_states); + for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses { + // initialize local tenant + let repo = tenant_mgr::load_local_repo(conf, tenant_id, &remote_index); + for (timeline_id, init_status) in local_timeline_init_statuses { + match init_status { + remote_storage::LocalTimelineInitStatus::LocallyComplete => { + debug!("timeline {} for tenant {} is locally complete, registering it in repository", tenant_id, timeline_id); + // Lets fail here loudly to be on the safe side. + // XXX: It may be a better api to actually distinguish between repository startup + // and processing of newly downloaded timelines. + repo.apply_timeline_remote_sync_status_update( + timeline_id, + TimelineSyncStatusUpdate::Downloaded, + ) + .with_context(|| { + format!( + "Failed to bootstrap timeline {} for tenant {}", + timeline_id, tenant_id + ) + })? 
+ } + remote_storage::LocalTimelineInitStatus::NeedsSync => { + debug!( + "timeline {} for tenant {} needs sync, \ + so skipped for adding into repository until sync is finished", + tenant_id, timeline_id + ); + } + } + } + } // initialize authentication for incoming connections let auth = match &conf.auth_type { @@ -253,7 +292,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() None, "http_endpoint_thread", move || { - let router = http::make_router(conf, auth_cloned); + let router = http::make_router(conf, auth_cloned, remote_index); endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher()) }, )?; diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 9844e7ea82..8827713f11 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -1,11 +1,12 @@ -use crate::timelines::TimelineInfo; -use anyhow::{anyhow, bail, Context}; +use anyhow::Context; use serde::{Deserialize, Serialize}; use zenith_utils::{ lsn::Lsn, zid::{HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTimelineId}, }; +use crate::timelines::{LocalTimelineInfo, TimelineInfo}; + #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { pub new_timeline_id: Option, @@ -18,8 +19,28 @@ pub struct TenantCreateRequest { pub new_tenant_id: Option, } +#[derive(Clone)] +pub enum TimelineInfoV1 { + Local { + timeline_id: ZTimelineId, + tenant_id: ZTenantId, + last_record_lsn: Lsn, + prev_record_lsn: Option, + ancestor_timeline_id: Option, + ancestor_lsn: Option, + disk_consistent_lsn: Lsn, + current_logical_size: Option, + current_logical_size_non_incremental: Option, + }, + Remote { + timeline_id: ZTimelineId, + tenant_id: ZTenantId, + disk_consistent_lsn: Lsn, + }, +} + #[derive(Serialize, Deserialize)] -pub struct TimelineInfoResponse { +pub struct TimelineInfoResponseV1 { pub kind: String, #[serde(with = "hex")] timeline_id: ZTimelineId, @@ -34,10 +55,10 @@ pub struct TimelineInfoResponse { current_logical_size_non_incremental: Option, } -impl From for TimelineInfoResponse { - fn from(other: TimelineInfo) -> Self { +impl From for TimelineInfoResponseV1 { + fn from(other: TimelineInfoV1) -> Self { match other { - TimelineInfo::Local { + TimelineInfoV1::Local { timeline_id, tenant_id, last_record_lsn, @@ -47,23 +68,23 @@ impl From for TimelineInfoResponse { disk_consistent_lsn, current_logical_size, current_logical_size_non_incremental, - } => TimelineInfoResponse { + } => TimelineInfoResponseV1 { kind: "Local".to_owned(), timeline_id, tenant_id, disk_consistent_lsn: disk_consistent_lsn.to_string(), last_record_lsn: Some(last_record_lsn.to_string()), - prev_record_lsn: Some(prev_record_lsn.to_string()), + prev_record_lsn: prev_record_lsn.map(|lsn| lsn.to_string()), ancestor_timeline_id: ancestor_timeline_id.map(HexZTimelineId::from), ancestor_lsn: ancestor_lsn.map(|lsn| lsn.to_string()), - current_logical_size: Some(current_logical_size), + current_logical_size, current_logical_size_non_incremental, }, - TimelineInfo::Remote { + TimelineInfoV1::Remote { timeline_id, tenant_id, disk_consistent_lsn, - } => TimelineInfoResponse { + } => TimelineInfoResponseV1 { kind: "Remote".to_owned(), timeline_id, tenant_id, @@ -79,10 +100,10 @@ impl From for TimelineInfoResponse { } } -impl TryFrom for TimelineInfo { +impl TryFrom for TimelineInfoV1 { type Error = anyhow::Error; - fn try_from(other: TimelineInfoResponse) -> anyhow::Result { + fn try_from(other: TimelineInfoResponseV1) -> anyhow::Result { let parse_lsn_hex_string = 
|lsn_string: String| { lsn_string .parse::() @@ -91,33 +112,68 @@ impl TryFrom for TimelineInfo { let disk_consistent_lsn = parse_lsn_hex_string(other.disk_consistent_lsn)?; Ok(match other.kind.as_str() { - "Local" => TimelineInfo::Local { + "Local" => TimelineInfoV1::Local { timeline_id: other.timeline_id, tenant_id: other.tenant_id, last_record_lsn: other .last_record_lsn - .ok_or(anyhow!("Local timeline should have last_record_lsn")) + .ok_or(anyhow::anyhow!( + "Local timeline should have last_record_lsn" + )) .and_then(parse_lsn_hex_string)?, prev_record_lsn: other .prev_record_lsn - .ok_or(anyhow!("Local timeline should have prev_record_lsn")) - .and_then(parse_lsn_hex_string)?, + .map(parse_lsn_hex_string) + .transpose()?, ancestor_timeline_id: other.ancestor_timeline_id.map(ZTimelineId::from), ancestor_lsn: other.ancestor_lsn.map(parse_lsn_hex_string).transpose()?, disk_consistent_lsn, - current_logical_size: other.current_logical_size.ok_or(anyhow!("No "))?, + current_logical_size: other.current_logical_size, current_logical_size_non_incremental: other.current_logical_size_non_incremental, }, - "Remote" => TimelineInfo::Remote { + "Remote" => TimelineInfoV1::Remote { timeline_id: other.timeline_id, tenant_id: other.tenant_id, disk_consistent_lsn, }, - unknown => bail!("Unknown timeline kind: {}", unknown), + unknown => anyhow::bail!("Unknown timeline kind: {}", unknown), }) } } +fn from_local( + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + local: &LocalTimelineInfo, +) -> TimelineInfoV1 { + TimelineInfoV1::Local { + timeline_id, + tenant_id, + last_record_lsn: local.last_record_lsn, + prev_record_lsn: local.prev_record_lsn, + ancestor_timeline_id: local.ancestor_timeline_id.map(ZTimelineId::from), + ancestor_lsn: local.ancestor_lsn, + disk_consistent_lsn: local.disk_consistent_lsn, + current_logical_size: local.current_logical_size, + current_logical_size_non_incremental: local.current_logical_size_non_incremental, + } +} + +impl From for TimelineInfoV1 { + fn from(t: TimelineInfo) -> Self { + match (t.local.as_ref(), t.remote.as_ref()) { + (None, None) => unreachable!(), + (None, Some(remote)) => TimelineInfoV1::Remote { + timeline_id: t.timeline_id, + tenant_id: t.tenant_id, + disk_consistent_lsn: remote.remote_consistent_lsn.unwrap_or(Lsn(0)), + }, + (Some(local), None) => from_local(t.tenant_id, t.timeline_id, local), + (Some(local), Some(_)) => from_local(t.tenant_id, t.timeline_id, local), + } + } +} + #[derive(Serialize)] pub struct StatusResponse { pub id: ZNodeId, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8365601042..2d913afe4e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use anyhow::Result; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; +use tokio::sync::RwLock; use tracing::*; use zenith_utils::auth::JwtAuth; use zenith_utils::http::endpoint::attach_openapi_ui; @@ -16,24 +17,32 @@ use zenith_utils::http::{ request::parse_request_param, }; use zenith_utils::http::{RequestExt, RouterBuilder}; -use zenith_utils::zid::{HexZTenantId, ZTimelineId}; +use zenith_utils::zid::{HexZTenantId, ZTenantTimelineId, ZTimelineId}; use super::models::{ - StatusResponse, TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse, + StatusResponse, TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponseV1, + TimelineInfoV1, +}; +use crate::remote_storage::{schedule_timeline_download, RemoteTimelineIndex}; +use crate::timelines::{ + 
extract_remote_timeline_info, LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo, }; -use crate::repository::RepositoryTimeline; -use crate::timelines::TimelineInfo; use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId}; #[derive(Debug)] struct State { conf: &'static PageServerConf, auth: Option>, + remote_index: Arc>, allowlist_routes: Vec, } impl State { - fn new(conf: &'static PageServerConf, auth: Option>) -> Self { + fn new( + conf: &'static PageServerConf, + auth: Option>, + remote_index: Arc>, + ) -> Self { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() .map(|v| v.parse().unwrap()) @@ -42,6 +51,7 @@ impl State { conf, auth, allowlist_routes, + remote_index, } } } @@ -88,7 +98,7 @@ async fn timeline_create_handler(mut request: Request) -> Result json_response(StatusCode::CREATED, TimelineInfoResponse::from(info))?, + Some(info) => json_response(StatusCode::CREATED, info)?, None => json_response(StatusCode::CONFLICT, ())?, }) } @@ -97,15 +107,24 @@ async fn timeline_list_handler(request: Request) -> Result, let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); - let response_data: Vec = tokio::task::spawn_blocking(move || { + let local_timeline_infos = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); - crate::timelines::get_timelines(tenant_id, include_non_incremental_logical_size) + crate::timelines::get_local_timelines(tenant_id, include_non_incremental_logical_size) }) .await - .map_err(ApiError::from_err)?? - .into_iter() - .map(TimelineInfoResponse::from) - .collect(); + .map_err(ApiError::from_err)??; + + let remote_index = get_state(&request).remote_index.read().await; + let mut response_data = Vec::with_capacity(local_timeline_infos.len()); + for (timeline_id, local_timeline_info) in local_timeline_infos { + response_data.push(TimelineInfo { + tenant_id, + timeline_id, + local: Some(local_timeline_info), + remote: extract_remote_timeline_info(tenant_id, timeline_id, &remote_index), + }) + } + Ok(json_response(StatusCode::OK, response_data)?) 
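Editor's note: the new `timeline_list_handler` splits the work into a blocking local scan plus a short read of the shared `RemoteTimelineIndex`. Below is a minimal sketch of that shape, assuming `tokio` and `anyhow`; the `LocalInfo`/`RemoteInfo`/`TimelineView` types are illustrative placeholders, not the pageserver's models.

```rust
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;

#[derive(Clone, Debug)]
struct LocalInfo {
    disk_consistent_lsn: u64,
}

#[derive(Clone, Debug)]
struct RemoteInfo {
    remote_consistent_lsn: u64,
    awaits_download: bool,
}

#[derive(Debug)]
struct TimelineView {
    timeline_id: u64,
    local: Option<LocalInfo>,
    remote: Option<RemoteInfo>,
}

async fn list_timelines(
    remote_index: Arc<RwLock<HashMap<u64, RemoteInfo>>>,
) -> anyhow::Result<Vec<TimelineView>> {
    // Blocking filesystem/repository work stays on the blocking pool.
    let local = tokio::task::spawn_blocking(|| {
        HashMap::from([(1u64, LocalInfo { disk_consistent_lsn: 100 })])
    })
    .await?;

    // The read lock is taken only after the blocking part finished and is
    // dropped when the function returns.
    let index = remote_index.read().await;
    Ok(local
        .into_iter()
        .map(|(timeline_id, l)| TimelineView {
            timeline_id,
            local: Some(l),
            remote: index.get(&timeline_id).cloned(),
        })
        .collect())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let index = Arc::new(RwLock::new(HashMap::from([(
        1u64,
        RemoteInfo { remote_consistent_lsn: 90, awaits_download: false },
    )])));
    println!("{:?}", list_timelines(index).await?);
    Ok(())
}
```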
} @@ -124,30 +143,76 @@ fn get_include_non_incremental_logical_size(request: &Request) -> bool { .unwrap_or(false) } -async fn timeline_detail_handler(request: Request) -> Result, ApiError> { +// common part for v1 and v2 handlers +async fn timeline_detail_common(request: Request) -> Result { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); - let response_data = tokio::task::spawn_blocking(move || { - let _enter = - info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id) - .entered(); + let span = info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id); + + let (local_timeline_info, span) = tokio::task::spawn_blocking(move || { + let entered = span.entered(); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - let include_non_incremental_logical_size = - get_include_non_incremental_logical_size(&request); - Ok::<_, anyhow::Error>(TimelineInfo::from_repo_timeline( - tenant_id, - repo.get_timeline(timeline_id)?, - include_non_incremental_logical_size, - )) + let local_timeline = { + repo.get_timeline(timeline_id) + .map(|timeline| { + LocalTimelineInfo::from_repo_timeline( + timeline, + include_non_incremental_logical_size, + ) + }) + .transpose()? + }; + Ok::<_, anyhow::Error>((local_timeline, entered.exit())) }) .await - .map_err(ApiError::from_err)? - .map(TimelineInfoResponse::from)?; + .map_err(ApiError::from_err)??; - Ok(json_response(StatusCode::OK, response_data)?) + let remote_timeline_info = { + let remote_index_read = get_state(&request).remote_index.read().await; + remote_index_read + .timeline_entry(&ZTenantTimelineId { + tenant_id, + timeline_id, + }) + .map(|remote_entry| RemoteTimelineInfo { + remote_consistent_lsn: remote_entry.disk_consistent_lsn(), + awaits_download: remote_entry.get_awaits_download(), + }) + }; + + let _enter = span.entered(); + + if local_timeline_info.is_none() && remote_timeline_info.is_none() { + return Err(ApiError::NotFound( + "Timeline is not found neither locally nor remotely".to_string(), + )); + } + + Ok(TimelineInfo { + tenant_id, + timeline_id, + local: local_timeline_info, + remote: remote_timeline_info, + }) +} + +// TODO remove when console adopts v2 +async fn timeline_detail_handler_v1(request: Request) -> Result, ApiError> { + let timeline_info = timeline_detail_common(request).await?; + Ok(json_response( + StatusCode::OK, + TimelineInfoResponseV1::from(TimelineInfoV1::from(timeline_info)), + )?) +} + +async fn timeline_detail_handler_v2(request: Request) -> Result, ApiError> { + let timeline_info = timeline_detail_common(request).await?; + + Ok(json_response(StatusCode::OK, timeline_info)?) 
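Editor's note: `timeline_detail_common` has to carry a tracing span across `spawn_blocking` and `.await` points, and an `entered()` guard is not `Send`. The sketch below shows the trick used in the hunk above, assuming `tokio`, `tracing` and `anyhow` (the function body is a placeholder): enter the span only inside the blocking closure, hand the exited span back, and re-enter it after the awaits.

```rust
use tracing::info_span;

async fn detail(tenant: u64, timeline: u64) -> anyhow::Result<()> {
    let span = info_span!("timeline_detail", tenant, timeline);

    // Enter the span only inside the blocking closure and return the exited
    // span together with the result, so nothing !Send crosses an .await.
    let (local_value, span) = tokio::task::spawn_blocking(move || {
        let entered = span.entered();
        tracing::debug!("loading local timeline state");
        let value = 42u64; // placeholder for the repository lookup
        Ok::<_, anyhow::Error>((value, entered.exit()))
    })
    .await??;

    // Awaiting here is fine: no span guard is held at this point.
    tokio::task::yield_now().await;

    // Re-enter the span for the remaining synchronous work.
    let _enter = span.entered();
    tracing::debug!(local_value, "combining local and remote info");
    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    detail(1, 2).await
}
```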
} async fn timeline_attach_handler(request: Request) -> Result, ApiError> { @@ -155,31 +220,37 @@ async fn timeline_attach_handler(request: Request) -> Result { - anyhow::bail!("Timeline with id {} is already local", timeline_id) - } - RepositoryTimeline::Remote { - id: _, - disk_consistent_lsn: _, - } => { - // FIXME (rodionov) get timeline already schedules timeline for download, and duplicate tasks can cause errors - // first should be fixed in https://github.com/zenithdb/zenith/issues/997 - // TODO (rodionov) change timeline state to awaits download (incapsulate it somewhere in the repo) - // TODO (rodionov) can we safely request replication on the timeline before sync is completed? (can be implemented on top of the #997) - Ok(()) - } - } + let span = tokio::task::spawn_blocking(move || { + let entered = span.entered(); + if tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id).is_ok() { + anyhow::bail!("Timeline is already present locally") + }; + Ok(entered.exit()) }) .await .map_err(ApiError::from_err)??; + let mut remote_index_write = get_state(&request).remote_index.write().await; + + let _enter = span.entered(); // entered guard cannot live across awaits (non Send) + let index_entry = remote_index_write + .timeline_entry_mut(&ZTenantTimelineId { + tenant_id, + timeline_id, + }) + .ok_or_else(|| ApiError::BadRequest("Unknown remote timeline".to_string()))?; + + if index_entry.get_awaits_download() { + return Err(ApiError::NotFound( + "Timeline download is already in progress".to_string(), + )); + } + + index_entry.set_awaits_download(true); + schedule_timeline_download(tenant_id, timeline_id); + Ok(json_response(StatusCode::ACCEPTED, ())?) } @@ -221,13 +292,17 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result, ApiError> { pub fn make_router( conf: &'static PageServerConf, auth: Option>, + remote_index: Arc>, ) -> RouterBuilder { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); @@ -263,7 +339,7 @@ pub fn make_router( } router - .data(Arc::new(State::new(conf, auth))) + .data(Arc::new(State::new(conf, auth, remote_index))) .get("/v1/status", status_handler) .get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) @@ -271,7 +347,11 @@ pub fn make_router( .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id", - timeline_detail_handler, + timeline_detail_handler_v1, + ) + .get( + "/v2/tenant/:tenant_id/timeline/:timeline_id", + timeline_detail_handler_v2, ) .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/attach", diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 9e0df5dab2..c17df84689 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -35,9 +35,9 @@ use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; use crate::config::PageServerConf; use crate::page_cache; use crate::relish::*; -use crate::remote_storage::{schedule_timeline_checkpoint_upload, schedule_timeline_download}; +use crate::remote_storage::{schedule_timeline_checkpoint_upload, RemoteTimelineIndex}; use crate::repository::{ - BlockNumber, GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncState, + BlockNumber, GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, TimelineWriter, ZenithWalRecord, }; use crate::thread_mgr; @@ -129,27 +129,46 @@ pub struct 
LayeredRepository { // timeout... gc_cs: Mutex<()>, walredo_mgr: Arc, + + // provides access to timeline data sitting in the remote storage + // supposed to be used for retrieval of remote consistent lsn in walreceiver + remote_index: Arc>, + /// Makes every timeline to backup their files to remote storage. upload_relishes: bool, } /// Public interface impl Repository for LayeredRepository { - fn get_timeline(&self, timelineid: ZTimelineId) -> Result { - Ok(RepositoryTimeline::from(self.get_or_init_timeline( - timelineid, - &mut self.timelines.lock().unwrap(), - )?)) + fn get_timeline(&self, timelineid: ZTimelineId) -> Option { + let timelines = self.timelines.lock().unwrap(); + self.get_timeline_internal(timelineid, &timelines) + .map(RepositoryTimeline::from) } - fn list_timelines(&self) -> Result> { - Ok(self - .timelines + fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result> { + let mut timelines = self.timelines.lock().unwrap(); + match self.get_timeline_load_internal(timelineid, &mut timelines)? { + Some(local_loaded_timeline) => Ok(local_loaded_timeline as _), + None => anyhow::bail!( + "cannot get local timeline: unknown timeline id: {}", + timelineid + ), + } + } + + fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)> { + self.timelines .lock() .unwrap() - .values() - .map(|timeline_entry| RepositoryTimeline::from(timeline_entry.clone())) - .collect()) + .iter() + .map(|(timeline_id, timeline_entry)| { + ( + *timeline_id, + RepositoryTimeline::from(timeline_entry.clone()), + ) + }) + .collect() } fn create_empty_timeline( @@ -176,10 +195,16 @@ impl Repository for LayeredRepository { self.upload_relishes, ); - let timeline_rc = Arc::new(timeline); - let r = timelines.insert(timelineid, LayeredTimelineEntry::Local(timeline_rc.clone())); - assert!(r.is_none()); - Ok(timeline_rc) + let timeline = Arc::new(timeline); + let r = timelines.insert( + timelineid, + LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), + ); + ensure!( + r.is_none(), + "assertion failure, inserted duplicate timeline" + ); + Ok(timeline) } /// Branch a timeline @@ -190,14 +215,12 @@ impl Repository for LayeredRepository { let _gc_cs = self.gc_cs.lock().unwrap(); let mut timelines = self.timelines.lock().unwrap(); - let src_timeline = match self.get_or_init_timeline(src, &mut timelines)? { - LayeredTimelineEntry::Local(timeline) => timeline, - LayeredTimelineEntry::Remote { .. } => { - bail!("Cannot branch off the timeline {} that's not local", src) - } - }; + let src_timeline = self + .get_timeline_load_internal(src, &mut timelines) + // message about timeline being remote is one .context up in the stack + .context("failed to load timeline for branching")? + .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {}", &src))?; let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); - src_timeline .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) .context("invalid branch start lsn")?; @@ -232,6 +255,7 @@ impl Repository for LayeredRepository { ); crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?; Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?; + timelines.insert(dst, LayeredTimelineEntry::Unloaded { id: dst, metadata }); info!("branched timeline {} from {} at {}", dst, src, start_lsn); @@ -261,11 +285,19 @@ impl Repository for LayeredRepository { fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()> { // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. 
Then drop the lock and actually perform the - // checkpoints. We don't want to block everything else while the + // checkpoints. We don't want to block everything else while the // checkpoint runs. let timelines = self.timelines.lock().unwrap(); let timelines_to_checkpoint = timelines .iter() + // filter to get only loaded timelines + .filter_map(|(timelineid, entry)| match entry { + LayeredTimelineEntry::Loaded(timeline) => Some((timelineid, timeline)), + LayeredTimelineEntry::Unloaded { .. } => { + debug!("Skipping checkpoint for unloaded timeline {}", timelineid); + None + } + }) .map(|(timelineid, timeline)| (*timelineid, timeline.clone())) .collect::>(); drop(timelines); @@ -273,13 +305,7 @@ impl Repository for LayeredRepository { for (timelineid, timeline) in &timelines_to_checkpoint { let _entered = info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid).entered(); - match timeline { - LayeredTimelineEntry::Local(timeline) => timeline.checkpoint(cconf)?, - LayeredTimelineEntry::Remote { .. } => debug!( - "Cannot run the checkpoint for remote timeline {}", - timelineid - ), - } + timeline.checkpoint(cconf)?; } Ok(()) @@ -288,32 +314,10 @@ impl Repository for LayeredRepository { // Detaches the timeline from the repository. fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> { let mut timelines = self.timelines.lock().unwrap(); - match timelines.entry(timeline_id) { - Entry::Vacant(_) => { - bail!("cannot detach non existing timeline"); - } - Entry::Occupied(mut entry) => { - let timeline_entry = entry.get_mut(); + if timelines.remove(&timeline_id).is_none() { + bail!("cannot detach timeline that is not available locally"); + } - let timeline = match timeline_entry { - LayeredTimelineEntry::Remote { .. } => { - bail!("cannot detach remote timeline {}", timeline_id); - } - LayeredTimelineEntry::Local(timeline) => timeline, - }; - - // TODO (rodionov) keep local state in timeline itself (refactoring related to https://github.com/zenithdb/zenith/issues/997 and #1104) - - // FIXME this is local disk consistent lsn, need to keep the latest succesfully uploaded checkpoint lsn in timeline (metadata?) - // https://github.com/zenithdb/zenith/issues/1104 - let remote_disk_consistent_lsn = timeline.disk_consistent_lsn.load(); - // reference to timeline is dropped here - entry.insert(LayeredTimelineEntry::Remote { - id: timeline_id, - disk_consistent_lsn: remote_disk_consistent_lsn, - }); - } - }; // Release the lock to shutdown and remove the files without holding it drop(timelines); // shutdown the timeline (this shuts down the walreceiver) @@ -324,158 +328,142 @@ impl Repository for LayeredRepository { Ok(()) } - // TODO this method currentlly does not do anything to prevent (or react to) state updates between a sync task schedule and a sync task end (that causes this update). - // Sync task is enqueued and can error and be rescheduled, so some significant time may pass between the events. - // - /// Reacts on the timeline sync state change, changing pageserver's memory state for this timeline (unload or load of the timeline files). 
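Editor's note: the reworked `checkpoint_iteration` keeps the old "collect under the lock, checkpoint after dropping it" discipline, but now also skips `Unloaded` entries. A compact, std-only sketch of that filtering (the `Entry`/`Timeline` types are simplified stand-ins):

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

struct Timeline {
    id: u64,
}

impl Timeline {
    fn checkpoint(&self) {
        println!("checkpointing timeline {}", self.id);
    }
}

enum Entry {
    Loaded(Arc<Timeline>),
    Unloaded { id: u64 },
}

fn checkpoint_iteration(timelines: &Mutex<HashMap<u64, Entry>>) {
    // Hold the lock only long enough to clone Arc handles of the loaded entries.
    let to_checkpoint: Vec<Arc<Timeline>> = {
        let guard = timelines.lock().unwrap();
        guard
            .values()
            .filter_map(|entry| match entry {
                Entry::Loaded(t) => Some(Arc::clone(t)),
                Entry::Unloaded { id } => {
                    println!("skipping checkpoint for unloaded timeline {id}");
                    None
                }
            })
            .collect()
    }; // lock released here, before the slow work

    for timeline in to_checkpoint {
        timeline.checkpoint();
    }
}

fn main() {
    let map = Mutex::new(HashMap::from([
        (1, Entry::Loaded(Arc::new(Timeline { id: 1 }))),
        (2, Entry::Unloaded { id: 2 }),
    ]));
    checkpoint_iteration(&map);
}
```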
- fn set_timeline_state( + fn apply_timeline_remote_sync_status_update( &self, timeline_id: ZTimelineId, - new_state: TimelineSyncState, + timeline_sync_status_update: TimelineSyncStatusUpdate, ) -> Result<()> { debug!( - "set_timeline_state: timeline_id: {}, new_state: {:?}", - timeline_id, new_state + "apply_timeline_remote_sync_status_update timeline_id: {} update: {:?}", + timeline_id, timeline_sync_status_update ); - let mut timelines_accessor = self.timelines.lock().unwrap(); - - match new_state { - TimelineSyncState::Ready(_) => { - let reloaded_timeline = - self.init_local_timeline(timeline_id, &mut timelines_accessor)?; - timelines_accessor - .insert(timeline_id, LayeredTimelineEntry::Local(reloaded_timeline)); - None + match timeline_sync_status_update { + TimelineSyncStatusUpdate::Uploaded => { /* nothing to do, remote consistent lsn is managed by the remote storage */ } - TimelineSyncState::Evicted(_) => timelines_accessor.remove(&timeline_id), - TimelineSyncState::AwaitsDownload(disk_consistent_lsn) - | TimelineSyncState::CloudOnly(disk_consistent_lsn) => timelines_accessor.insert( - timeline_id, - LayeredTimelineEntry::Remote { - id: timeline_id, - disk_consistent_lsn, - }, - ), - }; - // NOTE we do not delete local data in case timeline became cloud only, this is performed in detach_timeline - drop(timelines_accessor); - + TimelineSyncStatusUpdate::Downloaded => { + match self.timelines.lock().unwrap().entry(timeline_id) { + Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."), + Entry::Vacant(entry) => { + // we need to get metadata of a timeline, another option is to pass it along with Downloaded status + let metadata = Self::load_metadata(self.conf, timeline_id, self.tenantid).context("failed to load local metadata")?; + // finally we make newly downloaded timeline visible to repository + entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, }) + }, + }; + } + } Ok(()) } - /// Layered repo does not store anything but - /// * local, fully loaded timelines, ready for usage - /// * remote timelines, that need a download task scheduled first before they can be used - /// - /// [`TimelineSyncState::Evicted`] and other non-local and non-remote states are not stored in the layered repo at all, - /// hence their statuses cannot be returned by the repo. - fn get_timeline_state(&self, timeline_id: ZTimelineId) -> Option { - let timelines_accessor = self.timelines.lock().unwrap(); - let timeline_entry = timelines_accessor.get(&timeline_id)?; - Some( - if timeline_entry - .local_or_schedule_download(self.tenantid) - .is_some() - { - TimelineSyncState::Ready(timeline_entry.disk_consistent_lsn()) - } else { - TimelineSyncState::CloudOnly(timeline_entry.disk_consistent_lsn()) - }, - ) + fn get_remote_index(&self) -> &tokio::sync::RwLock { + self.remote_index.as_ref() } } #[derive(Clone)] enum LayeredTimelineEntry { - Local(Arc), - Remote { + Loaded(Arc), + Unloaded { id: ZTimelineId, - /// metadata contents of the latest successfully uploaded checkpoint - disk_consistent_lsn: Lsn, + metadata: TimelineMetadata, }, } impl LayeredTimelineEntry { fn timeline_id(&self) -> ZTimelineId { match self { - LayeredTimelineEntry::Local(timeline) => timeline.timelineid, - LayeredTimelineEntry::Remote { id, .. } => *id, + LayeredTimelineEntry::Loaded(timeline) => timeline.timelineid, + LayeredTimelineEntry::Unloaded { id, .. } => *id, } } - /// Gets local timeline data, if it's present. 
Otherwise schedules a download fot the remote timeline and returns `None`. - fn local_or_schedule_download(&self, tenant_id: ZTenantId) -> Option<&LayeredTimeline> { + fn ancestor_timeline_id(&self) -> Option { match self { - Self::Local(local) => Some(local.as_ref()), - Self::Remote { - id: timeline_id, .. - } => { - debug!( - "Accessed a remote timeline {} for tenant {}, scheduling a timeline download", - timeline_id, tenant_id - ); - schedule_timeline_download(tenant_id, *timeline_id); - None + LayeredTimelineEntry::Loaded(timeline) => { + timeline.ancestor_timeline.as_ref().map(|t| t.timeline_id()) } + LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_timeline(), } } - /// Gets a current (latest for the remote case) disk consistent Lsn for the timeline. - fn disk_consistent_lsn(&self) -> Lsn { + fn ancestor_lsn(&self) -> Lsn { match self { - Self::Local(local) => local.disk_consistent_lsn.load(), - Self::Remote { - disk_consistent_lsn, - .. - } => *disk_consistent_lsn, + LayeredTimelineEntry::Loaded(timeline) => timeline.ancestor_lsn, + LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_lsn(), + } + } + + fn ensure_loaded(&self) -> anyhow::Result<&Arc> { + match self { + LayeredTimelineEntry::Loaded(timeline) => Ok(timeline), + LayeredTimelineEntry::Unloaded { .. } => { + anyhow::bail!("timeline is unloaded") + } } } } impl From for RepositoryTimeline { - fn from(layered_timeline: LayeredTimelineEntry) -> Self { - match layered_timeline { - LayeredTimelineEntry::Local(timeline) => RepositoryTimeline::Local { - id: timeline.timelineid, - timeline, - }, - LayeredTimelineEntry::Remote { - id, - disk_consistent_lsn, - } => RepositoryTimeline::Remote { - id, - disk_consistent_lsn, - }, + fn from(entry: LayeredTimelineEntry) -> Self { + match entry { + LayeredTimelineEntry::Loaded(timeline) => RepositoryTimeline::Loaded(timeline as _), + LayeredTimelineEntry::Unloaded { metadata, .. } => { + RepositoryTimeline::Unloaded { metadata } + } } } } /// Private functions impl LayeredRepository { - // Implementation of the public `get_timeline` function. This differs from the public - // interface in that the caller must already hold the mutex on the 'timelines' hashmap. - fn get_or_init_timeline( + // Implementation of the public `get_timeline` function. + // Differences from the public: + // * interface in that the caller must already hold the mutex on the 'timelines' hashmap. + fn get_timeline_internal( + &self, + timelineid: ZTimelineId, + timelines: &HashMap, + ) -> Option { + timelines.get(&timelineid).cloned() + } + + // Implementation of the public `get_timeline_load` function. + // Differences from the public: + // * interface in that the caller must already hold the mutex on the 'timelines' hashmap. + fn get_timeline_load_internal( &self, timelineid: ZTimelineId, timelines: &mut HashMap, - ) -> Result { + ) -> anyhow::Result>> { match timelines.get(&timelineid) { - Some(timeline_entry) => { - let _ = timeline_entry.local_or_schedule_download(self.tenantid); - Ok(timeline_entry.clone()) - } + Some(entry) => match entry { + LayeredTimelineEntry::Loaded(local_timeline) => { + trace!("timeline {} found loaded", &timelineid); + return Ok(Some(Arc::clone(local_timeline))); + } + LayeredTimelineEntry::Unloaded { .. 
} => { + trace!("timeline {} found unloaded", &timelineid) + } + }, None => { - let timeline = self.init_local_timeline(timelineid, timelines)?; - timelines.insert( - timelineid, - LayeredTimelineEntry::Local(Arc::clone(&timeline)), - ); - Ok(LayeredTimelineEntry::Local(timeline)) + trace!("timeline {} not found", &timelineid); + return Ok(None); } - } + }; + let timeline = self.load_local_timeline(timelineid, timelines)?; + let was_loaded = timelines.insert( + timelineid, + LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), + ); + ensure!( + was_loaded.is_none() + || matches!(was_loaded, Some(LayeredTimelineEntry::Unloaded { .. })), + "assertion failure, inserted wrong timeline in an incorrect state" + ); + Ok(Some(timeline)) } - fn init_local_timeline( + fn load_local_timeline( &self, timelineid: ZTimelineId, timelines: &mut HashMap, @@ -486,8 +474,18 @@ impl LayeredRepository { let ancestor = metadata .ancestor_timeline() - .map(|ancestor_timelineid| self.get_or_init_timeline(ancestor_timelineid, timelines)) - .transpose()?; + .map(|ancestor_timeline_id| { + trace!( + "loading {}'s ancestor {}", + timelineid, + &ancestor_timeline_id + ); + self.get_timeline_load_internal(ancestor_timeline_id, timelines) + }) + .transpose() + .context("cannot load ancestor timeline")? + .flatten() + .map(LayeredTimelineEntry::Loaded); let _enter = info_span!("loading timeline", timeline = %timelineid, tenant = %self.tenantid) .entered(); @@ -513,6 +511,7 @@ impl LayeredRepository { conf: &'static PageServerConf, walredo_mgr: Arc, tenantid: ZTenantId, + remote_index: Arc>, upload_relishes: bool, ) -> LayeredRepository { LayeredRepository { @@ -521,6 +520,7 @@ impl LayeredRepository { timelines: Mutex::new(HashMap::new()), gc_cs: Mutex::new(()), walredo_mgr, + remote_index, upload_relishes, } } @@ -608,86 +608,46 @@ impl LayeredRepository { // grab mutex to prevent new timelines from being created here. let _gc_cs = self.gc_cs.lock().unwrap(); - let mut timelines = self.timelines.lock().unwrap(); - // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. - // - let mut timelineids: Vec = Vec::new(); - - // We scan the directory, not the in-memory hash table, because the hash - // table only contains entries for timelines that have been accessed. We - // need to take all timelines into account, not only the active ones. - let timelines_path = self.conf.timelines_path(&self.tenantid); - - for direntry in fs::read_dir(timelines_path)? { - let direntry = direntry?; - if let Some(fname) = direntry.file_name().to_str() { - if let Ok(timelineid) = fname.parse::() { - timelineids.push(timelineid); - } - } - } - - // Now collect info about branchpoints let mut all_branchpoints: BTreeSet<(ZTimelineId, Lsn)> = BTreeSet::new(); - for &timelineid in &timelineids { - let timeline = match self.get_or_init_timeline(timelineid, &mut timelines)? { - LayeredTimelineEntry::Local(timeline) => timeline, - LayeredTimelineEntry::Remote { .. 
} => { - warn!( - "Timeline {} is not local, cannot proceed with gc", - timelineid - ); - return Ok(totals); - } - }; + let mut timeline_ids = Vec::new(); + let mut timelines = self.timelines.lock().unwrap(); - if let Some(ancestor_timeline) = &timeline.ancestor_timeline { - let ancestor_timeline = - match ancestor_timeline.local_or_schedule_download(self.tenantid) { - Some(timeline) => timeline, - None => { - warn!( - "Timeline {} has ancestor {} is not local, cannot proceed with gc", - timelineid, - ancestor_timeline.timeline_id() - ); - return Ok(totals); - } - }; + for (timeline_id, timeline_entry) in timelines.iter() { + timeline_ids.push(*timeline_id); + + // This is unresolved question for now, how to do gc in presense of remote timelines + // especially when this is combined with branching. + // Somewhat related: https://github.com/zenithdb/zenith/issues/999 + if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() { // If target_timeline is specified, we only need to know branchpoints of its children if let Some(timelineid) = target_timelineid { - if ancestor_timeline.timelineid == timelineid { + if ancestor_timeline_id == &timelineid { all_branchpoints - .insert((ancestor_timeline.timelineid, timeline.ancestor_lsn)); + .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); } } // Collect branchpoints for all timelines else { - all_branchpoints.insert((ancestor_timeline.timelineid, timeline.ancestor_lsn)); + all_branchpoints.insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); } } } // Ok, we now know all the branch points. // Perform GC for each timeline. - for timelineid in timelineids { + for timelineid in timeline_ids.into_iter() { if thread_mgr::is_shutdown_requested() { // We were requested to shut down. Stop and return with the progress we // made. break; } - // We have already loaded all timelines above - // so this operation is just a quick map lookup. - let timeline = match self.get_or_init_timeline(timelineid, &mut *timelines)? { - LayeredTimelineEntry::Local(timeline) => timeline, - LayeredTimelineEntry::Remote { .. } => { - debug!("Skipping GC for non-local timeline {}", timelineid); - continue; - } - }; + // Timeline is known to be local and loaded. + let timeline = self + .get_timeline_load_internal(timelineid, &mut *timelines)? 
+ .expect("checked above that timeline is local and loaded"); // If target_timeline is specified, only GC it if let Some(target_timelineid) = target_timelineid { @@ -989,13 +949,13 @@ impl Timeline for LayeredTimeline { match &timeline.ancestor_timeline { None => break, Some(ancestor_entry) => { - match ancestor_entry.local_or_schedule_download(self.tenantid) { - Some(ancestor) => { - timeline = ancestor; - continue; - } - None => bail!("Cannot list relishes for timeline {} tenant {} due to its ancestor being remote only", self.timelineid, self.tenantid), - } + timeline = ancestor_entry.ensure_loaded().with_context( + || format!( + "cannot list relishes for timeline {} tenant {} due to its ancestor {} being either unloaded", + self.timelineid, self.tenantid, ancestor_entry.timeline_id(), + ) + )?; + continue; } } } @@ -1313,19 +1273,15 @@ impl LayeredTimeline { while lsn < timeline.ancestor_lsn { trace!("going into ancestor {} ", timeline.ancestor_lsn); - timeline = match timeline - .ancestor_timeline - .as_ref() - .and_then(|ancestor_entry| ancestor_entry.local_or_schedule_download(self.tenantid)) - { - Some(timeline) => timeline, - None => { - bail!( - "Cannot get the whole layer for read locked: timeline {} is not present locally", - self.timelineid - ) - } - }; + timeline = timeline + .ancestor_timeline + .as_ref() + .expect("there should be an ancestor") + .ensure_loaded() + .with_context(|| format!( + "Cannot get the whole layer for read locked: timeline {} is not present locally", + self.get_ancestor_timeline_id().unwrap()) + )?; } // Now we have the right starting timeline for our search. @@ -1366,18 +1322,13 @@ impl LayeredTimeline { // If not, check if there's a layer on the ancestor timeline match &timeline.ancestor_timeline { Some(ancestor_entry) => { - match ancestor_entry.local_or_schedule_download(self.tenantid) { - Some(ancestor) => { - lsn = timeline.ancestor_lsn; - timeline = ancestor; - trace!("recursing into ancestor at {}/{}", timeline.timelineid, lsn); - continue; - } - None => bail!( - "Cannot get a layer for read from remote ancestor timeline {}", - self.timelineid - ), - } + let ancestor = ancestor_entry + .ensure_loaded() + .context("cannot get a layer for read from ancestor because it is either remote or unloaded")?; + lsn = timeline.ancestor_lsn; + timeline = ancestor; + trace!("recursing into ancestor at {}/{}", timeline.timelineid, lsn); + continue; } None => return Ok(None), } @@ -1501,7 +1452,6 @@ impl LayeredTimeline { fn checkpoint_internal(&self, checkpoint_distance: u64, reconstruct_pages: bool) -> Result<()> { // Prevent concurrent checkpoints let _checkpoint_cs = self.checkpoint_cs.lock().unwrap(); - let write_guard = self.write_lock.lock().unwrap(); let mut layers = self.layers.lock().unwrap(); @@ -1862,10 +1812,10 @@ impl LayeredTimeline { ); } // Now check ancestor timelines, if any are present locally - else if let Some(ancestor) = - self.ancestor_timeline.as_ref().and_then(|timeline_entry| { - timeline_entry.local_or_schedule_download(self.tenantid) - }) + else if let Some(ancestor) = self + .ancestor_timeline + .as_ref() + .and_then(|timeline_entry| timeline_entry.ensure_loaded().ok()) { let prior_lsn = ancestor.get_last_record_lsn(); if seg.rel.is_blocky() { @@ -2435,9 +2385,8 @@ mod tests { metadata_bytes[512 - 4 - 2] ^= 1; std::fs::write(metadata_path, metadata_bytes)?; - let new_repo = harness.load(); - let err = new_repo.get_timeline(TIMELINE_ID).err().unwrap(); - assert_eq!(err.to_string(), "failed to load metadata"); + let err = 
harness.try_load().err().expect("should fail"); + assert_eq!(err.to_string(), "failed to load local metadata"); assert_eq!( err.source().unwrap().to_string(), "metadata checksum mismatch" @@ -2527,7 +2476,7 @@ mod tests { // Load the timeline. This will cause the files in the "future" to be renamed // away. let new_repo = harness.load(); - new_repo.get_timeline(TIMELINE_ID).unwrap(); + new_repo.get_timeline_load(TIMELINE_ID).unwrap(); drop(new_repo); for filename in future_filenames.iter() { @@ -2544,7 +2493,7 @@ mod tests { } let new_repo = harness.load(); - new_repo.get_timeline(TIMELINE_ID).unwrap(); + new_repo.get_timeline_load(TIMELINE_ID).unwrap(); drop(new_repo); for filename in future_filenames.iter() { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 42a099cca5..6e6b6415f3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -322,8 +322,8 @@ impl PageServerHandler { let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered(); // Check that the timeline exists - let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid) - .context("Cannot handle pagerequests for a remote timeline")?; + let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid) + .context("Cannot load local timeline")?; /* switch client to COPYBOTH */ pgb.write_message(&BeMessage::CopyBothResponse)?; @@ -520,8 +520,8 @@ impl PageServerHandler { let _enter = span.enter(); // check that the timeline exists - let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid) - .context("Cannot handle basebackup request for a remote timeline")?; + let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid) + .context("Cannot load local timeline")?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { timeline @@ -655,8 +655,8 @@ impl postgres_backend::Handler for PageServerHandler { info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered(); // Check that the timeline exists - tenant_mgr::get_timeline_for_tenant(tenantid, timelineid) - .context("Failed to fetch local timeline for callmemaybe requests")?; + tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid) + .context("Cannot load local timeline")?; walreceiver::launch_wal_receiver(self.conf, tenantid, timelineid, &connstr)?; @@ -778,8 +778,8 @@ impl postgres_backend::Handler for PageServerHandler { let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid) - .context("Failed to fetch local timeline for checkpoint request")?; + let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid) + .context("Cannot load local timeline")?; timeline.checkpoint(CheckpointConfig::Forced)?; pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? 
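Editor's note: the `page_service.rs` callers below now go through `get_timeline_for_tenant_load`, matching the repository's split between a cheap `get_timeline` lookup and a loading `get_timeline_load`. A simplified sketch of that load-on-demand lookup, assuming `anyhow`; the `Repo`/`Entry` types are illustrative and the "load" step is a stub rather than the real layer-file scan.

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

#[derive(Clone)]
struct Metadata {
    disk_consistent_lsn: u64,
}

struct Timeline {
    id: u64,
    metadata: Metadata,
}

#[derive(Clone)]
enum Entry {
    Loaded(Arc<Timeline>),
    Unloaded { id: u64, metadata: Metadata },
}

struct Repo {
    timelines: Mutex<HashMap<u64, Entry>>,
}

impl Repo {
    /// Cheap lookup: never touches the disk, may return an Unloaded entry.
    fn get_timeline(&self, id: u64) -> Option<Entry> {
        self.timelines.lock().unwrap().get(&id).cloned()
    }

    /// Loading lookup: replaces an Unloaded entry with a Loaded one.
    fn get_timeline_load(&self, id: u64) -> anyhow::Result<Arc<Timeline>> {
        let mut map = self.timelines.lock().unwrap();
        let metadata = match map.get(&id) {
            Some(Entry::Loaded(t)) => return Ok(Arc::clone(t)),
            Some(Entry::Unloaded { metadata, .. }) => metadata.clone(),
            None => anyhow::bail!("unknown timeline id: {id}"),
        };
        // Pretend this is the expensive "read layer files from disk" step.
        let loaded = Arc::new(Timeline { id, metadata });
        map.insert(id, Entry::Loaded(Arc::clone(&loaded)));
        Ok(loaded)
    }
}

fn main() -> anyhow::Result<()> {
    let repo = Repo {
        timelines: Mutex::new(HashMap::from([(
            7,
            Entry::Unloaded { id: 7, metadata: Metadata { disk_consistent_lsn: 100 } },
        )])),
    };
    assert!(matches!(repo.get_timeline(7), Some(Entry::Unloaded { .. })));
    let t = repo.get_timeline_load(7)?;
    assert_eq!(t.id, 7);
    assert_eq!(t.metadata.disk_consistent_lsn, 100);
    assert!(matches!(repo.get_timeline(7), Some(Entry::Loaded(_))));
    Ok(())
}
```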
diff --git a/pageserver/src/remote_storage.rs b/pageserver/src/remote_storage.rs index 4af1f8ed56..08fb16a679 100644 --- a/pageserver/src/remote_storage.rs +++ b/pageserver/src/remote_storage.rs @@ -89,32 +89,38 @@ use std::{ collections::HashMap, ffi, fs, path::{Path, PathBuf}, + sync::Arc, }; use anyhow::{bail, Context}; -use tokio::io; +use tokio::{io, sync::RwLock}; use tracing::{error, info}; use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +pub use self::storage_sync::index::{RemoteTimelineIndex, TimelineIndexEntry}; pub use self::storage_sync::{schedule_timeline_checkpoint_upload, schedule_timeline_download}; use self::{local_fs::LocalFs, rust_s3::S3}; use crate::{ config::{PageServerConf, RemoteStorageKind}, layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}, - repository::TimelineSyncState, }; pub use storage_sync::compression; +#[derive(Clone, Copy, Debug)] +pub enum LocalTimelineInitStatus { + LocallyComplete, + NeedsSync, +} + +type LocalTimelineInitStatuses = HashMap>; + /// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization. /// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still, /// to simplify the received code. pub struct SyncStartupData { - /// A sync state, derived from initial comparison of local timeline files and the remote archives, - /// before any sync tasks are executed. - /// To reuse the local file scan logic, the timeline states are returned even if no sync loop get started during init: - /// in this case, no remote files exist and all local timelines with correct metadata files are considered ready. - pub initial_timeline_states: HashMap>, + pub remote_index: Arc>, + pub local_timeline_init_statuses: LocalTimelineInitStatuses, } /// Based on the config, initiates the remote storage connection and starts a separate thread @@ -154,23 +160,18 @@ pub fn start_local_timeline_sync( .context("Failed to spawn the storage sync thread"), None => { info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); - let mut initial_timeline_states: HashMap< - ZTenantId, - HashMap, - > = HashMap::new(); - for (ZTenantTimelineId{tenant_id, timeline_id}, (timeline_metadata, _)) in + let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); + for (ZTenantTimelineId { tenant_id, timeline_id }, _) in local_timeline_files { - initial_timeline_states + local_timeline_init_statuses .entry(tenant_id) .or_default() - .insert( - timeline_id, - TimelineSyncState::Ready(timeline_metadata.disk_consistent_lsn()), - ); + .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete); } Ok(SyncStartupData { - initial_timeline_states, + local_timeline_init_statuses, + remote_index: Arc::new(RwLock::new(RemoteTimelineIndex::empty())), }) } } diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index d14f849e15..f1483375cb 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -58,7 +58,7 @@ //! Synchronization never removes any local from pageserver workdir or remote files from the remote storage, yet there could be overwrites of the same files (metadata file updates; future checksum mismatch fixes). //! NOTE: No real contents or checksum check happens right now and is a subject to improve later. //! -//! 
After the whole timeline is downloaded, [`crate::tenant_mgr::set_timeline_states`] function is used to update pageserver memory stage for the timeline processed. +//! After the whole timeline is downloaded, [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function is used to update pageserver memory stage for the timeline processed. //! //! When pageserver signals shutdown, current sync task gets finished and the loop exists. @@ -93,17 +93,25 @@ use self::{ download::{download_timeline, DownloadedTimeline}, index::{ ArchiveDescription, ArchiveId, RemoteTimeline, RemoteTimelineIndex, TimelineIndexEntry, + TimelineIndexEntryInner, }, upload::upload_timeline_checkpoint, }; -use super::{RemoteStorage, SyncStartupData, ZTenantTimelineId}; +use super::{ + LocalTimelineInitStatus, LocalTimelineInitStatuses, RemoteStorage, SyncStartupData, + ZTenantTimelineId, +}; use crate::{ config::PageServerConf, layered_repository::metadata::TimelineMetadata, - remote_storage::storage_sync::compression::read_archive_header, repository::TimelineSyncState, - tenant_mgr::set_timeline_states, thread_mgr, thread_mgr::ThreadKind, + remote_storage::storage_sync::compression::read_archive_header, + repository::TimelineSyncStatusUpdate, tenant_mgr::apply_timeline_sync_status_updates, + thread_mgr, thread_mgr::ThreadKind, }; -use zenith_metrics::{register_histogram_vec, register_int_gauge, HistogramVec, IntGauge}; +use zenith_metrics::{ + register_histogram_vec, register_int_counter, register_int_gauge, HistogramVec, IntCounter, + IntGauge, +}; use zenith_utils::zid::{ZTenantId, ZTimelineId}; lazy_static! { @@ -112,6 +120,11 @@ lazy_static! { "Number of storage sync items left in the queue" ) .expect("failed to register pageserver remote storage remaining sync items int gauge"); + static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!( + "pageserver_remote_storage_fatal_task_failures", + "Number of critically failed tasks" + ) + .expect("failed to register pageserver remote storage remaining sync items int gauge"); static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!( "pageserver_remote_storage_image_sync_time", "Time took to synchronize (download or upload) a whole pageserver image. 
\ @@ -379,10 +392,13 @@ pub(super) fn spawn_storage_sync_thread< None } }); - let remote_index = RemoteTimelineIndex::try_parse_descriptions_from_paths(conf, download_paths); - - let initial_timeline_states = schedule_first_sync_tasks(&remote_index, local_timeline_files); + let mut remote_index = + RemoteTimelineIndex::try_parse_descriptions_from_paths(conf, download_paths); + let local_timeline_init_statuses = + schedule_first_sync_tasks(&mut remote_index, local_timeline_files); + let remote_index = Arc::new(RwLock::new(remote_index)); + let remote_index_cloned = Arc::clone(&remote_index); thread_mgr::spawn( ThreadKind::StorageSync, None, @@ -393,7 +409,7 @@ pub(super) fn spawn_storage_sync_thread< runtime, conf, receiver, - remote_index, + remote_index_cloned, storage, max_concurrent_sync, max_sync_errors, @@ -402,12 +418,13 @@ pub(super) fn spawn_storage_sync_thread< ) .context("Failed to spawn remote storage sync thread")?; Ok(SyncStartupData { - initial_timeline_states, + remote_index, + local_timeline_init_statuses, }) } enum LoopStep { - NewStates(HashMap>), + SyncStatusUpdates(HashMap>), Shutdown, } @@ -419,13 +436,14 @@ fn storage_sync_loop< runtime: Runtime, conf: &'static PageServerConf, mut receiver: UnboundedReceiver, - index: RemoteTimelineIndex, + index: Arc>, storage: S, max_concurrent_sync: NonZeroUsize, max_sync_errors: NonZeroU32, ) -> anyhow::Result<()> { - let remote_assets = Arc::new((storage, RwLock::new(index))); + let remote_assets = Arc::new((storage, Arc::clone(&index))); loop { + let index = Arc::clone(&index); let loop_step = runtime.block_on(async { tokio::select! { new_timeline_states = loop_step( @@ -435,15 +453,15 @@ fn storage_sync_loop< max_concurrent_sync, max_sync_errors, ) - .instrument(debug_span!("storage_sync_loop_step")) => LoopStep::NewStates(new_timeline_states), + .instrument(debug_span!("storage_sync_loop_step")) => LoopStep::SyncStatusUpdates(new_timeline_states), _ = thread_mgr::shutdown_watcher() => LoopStep::Shutdown, } }); match loop_step { - LoopStep::NewStates(new_timeline_states) => { + LoopStep::SyncStatusUpdates(new_timeline_states) => { // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. 
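Editor's note: `storage_sync_loop` runs on a plain thread and uses `block_on` plus `select!` to race one batch of sync work against the shutdown watcher, applying the collected status updates between iterations. A minimal sketch of that loop shape, assuming `tokio`; a `watch` channel stands in for `thread_mgr::shutdown_watcher`, and the batch body is a placeholder.

```rust
use std::time::Duration;
use tokio::sync::watch;

enum LoopStep {
    SyncStatusUpdates(Vec<u64>), // stand-in for the per-tenant status map
    Shutdown,
}

async fn one_batch() -> Vec<u64> {
    // Placeholder for draining the sync queue and running upload/download tasks.
    tokio::time::sleep(Duration::from_millis(10)).await;
    vec![1, 2, 3]
}

fn sync_loop(runtime: tokio::runtime::Runtime, mut shutdown: watch::Receiver<bool>) {
    loop {
        let step = runtime.block_on(async {
            tokio::select! {
                updates = one_batch() => LoopStep::SyncStatusUpdates(updates),
                _ = shutdown.changed() => LoopStep::Shutdown,
            }
        });
        match step {
            LoopStep::SyncStatusUpdates(updates) => {
                // apply_timeline_sync_status_updates(...) in the real code.
                println!("applying {} status updates", updates.len());
            }
            LoopStep::Shutdown => {
                println!("shutdown requested, exiting sync loop");
                break;
            }
        }
    }
}

fn main() {
    let runtime = tokio::runtime::Runtime::new().unwrap();
    let (shutdown_tx, shutdown_rx) = watch::channel(false);
    std::thread::spawn(move || {
        std::thread::sleep(Duration::from_millis(50));
        let _ = shutdown_tx.send(true);
    });
    sync_loop(runtime, shutdown_rx);
}
```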
- set_timeline_states(conf, new_timeline_states); + apply_timeline_sync_status_updates(conf, index, new_timeline_states); debug!("Sync loop step completed"); } LoopStep::Shutdown => { @@ -462,10 +480,10 @@ async fn loop_step< >( conf: &'static PageServerConf, receiver: &mut UnboundedReceiver, - remote_assets: Arc<(S, RwLock)>, + remote_assets: Arc<(S, Arc>)>, max_concurrent_sync: NonZeroUsize, max_sync_errors: NonZeroU32, -) -> HashMap> { +) -> HashMap> { let max_concurrent_sync = max_concurrent_sync.get(); let mut next_tasks = BTreeSet::new(); @@ -516,8 +534,10 @@ async fn loop_step< }) .collect::>(); - let mut new_timeline_states: HashMap> = - HashMap::with_capacity(max_concurrent_sync); + let mut new_timeline_states: HashMap< + ZTenantId, + HashMap, + > = HashMap::with_capacity(max_concurrent_sync); while let Some((sync_id, state_update)) = task_batch.next().await { debug!("Finished storage sync task for sync id {}", sync_id); if let Some(state_update) = state_update { @@ -540,24 +560,19 @@ async fn process_task< S: RemoteStorage + Send + Sync + 'static, >( conf: &'static PageServerConf, - remote_assets: Arc<(S, RwLock)>, + remote_assets: Arc<(S, Arc>)>, task: SyncTask, max_sync_errors: NonZeroU32, -) -> Option { +) -> Option { if task.retries > max_sync_errors.get() { error!( "Evicting task {:?} that failed {} times, exceeding the error threshold", task.kind, task.retries ); - return Some(TimelineSyncState::Evicted( - remote_assets - .as_ref() - .1 - .read() - .await - .timeline_entry(&task.sync_id) - .and_then(TimelineIndexEntry::disk_consistent_lsn), - )); + FATAL_TASK_FAILURES.inc(); + // FIXME (rodionov) this can potentially leave holes in timeline uploads + // planneed to be fixed as part of https://github.com/zenithdb/zenith/issues/977 + return None; } if task.retries > 0 { @@ -569,6 +584,8 @@ async fn process_task< tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; } + let remote_index = Arc::clone(&remote_assets.1); + let sync_start = Instant::now(); let sync_name = task.kind.sync_name(); match task.kind { @@ -585,19 +602,25 @@ async fn process_task< match download_result { DownloadedTimeline::Abort => { register_sync_status(sync_start, sync_name, None); + remote_index + .write() + .await + .set_awaits_download(&task.sync_id, false) + .expect("timeline should be present in remote index"); None } - DownloadedTimeline::FailedAndRescheduled { - disk_consistent_lsn, - } => { + DownloadedTimeline::FailedAndRescheduled => { register_sync_status(sync_start, sync_name, Some(false)); - Some(TimelineSyncState::AwaitsDownload(disk_consistent_lsn)) + None } - DownloadedTimeline::Successful { - disk_consistent_lsn, - } => { + DownloadedTimeline::Successful => { register_sync_status(sync_start, sync_name, Some(true)); - Some(TimelineSyncState::Ready(disk_consistent_lsn)) + remote_index + .write() + .await + .set_awaits_download(&task.sync_id, false) + .expect("timeline should be present in remote index"); + Some(TimelineSyncStatusUpdate::Downloaded) } } } @@ -617,45 +640,45 @@ async fn process_task< } fn schedule_first_sync_tasks( - index: &RemoteTimelineIndex, + index: &mut RemoteTimelineIndex, local_timeline_files: HashMap)>, -) -> HashMap> { - let mut initial_timeline_statuses: HashMap> = - HashMap::new(); +) -> LocalTimelineInitStatuses { + let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); let mut new_sync_tasks = VecDeque::with_capacity(local_timeline_files.len().max(local_timeline_files.len())); for (sync_id, (local_metadata, local_files)) in 
local_timeline_files { - let local_disk_consistent_lsn = local_metadata.disk_consistent_lsn(); - let ZTenantTimelineId { tenant_id, timeline_id, } = sync_id; - match index.timeline_entry(&sync_id) { + match index.timeline_entry_mut(&sync_id) { Some(index_entry) => { - let timeline_status = compare_local_and_remote_timeline( + let (timeline_status, awaits_download) = compare_local_and_remote_timeline( &mut new_sync_tasks, sync_id, local_metadata, local_files, index_entry, ); - match timeline_status { - Some(timeline_status) => { - initial_timeline_statuses - .entry(tenant_id) - .or_default() - .insert(timeline_id, timeline_status); - } - None => error!( - "Failed to compare local and remote timeline for task {}", - sync_id - ), + let was_there = local_timeline_init_statuses + .entry(tenant_id) + .or_default() + .insert(timeline_id, timeline_status); + + if was_there.is_some() { + // defensive check + warn!( + "Overwriting timeline init sync status. Status {:?} Timeline {}", + timeline_status, timeline_id + ); } + index_entry.set_awaits_download(awaits_download); } None => { + // TODO (rodionov) does this mean that we've crashed during tenant creation? + // is it safe to upload this checkpoint? could it be half broken? new_sync_tasks.push_back(SyncTask::new( sync_id, 0, @@ -664,56 +687,18 @@ fn schedule_first_sync_tasks( metadata: local_metadata, }), )); - initial_timeline_statuses + local_timeline_init_statuses .entry(tenant_id) .or_default() - .insert( - timeline_id, - TimelineSyncState::Ready(local_disk_consistent_lsn), - ); + .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete); } } } - let unprocessed_remote_ids = |remote_id: &ZTenantTimelineId| { - initial_timeline_statuses - .get(&remote_id.tenant_id) - .and_then(|timelines| timelines.get(&remote_id.timeline_id)) - .is_none() - }; - for unprocessed_remote_id in index - .all_sync_ids() - .filter(unprocessed_remote_ids) - .collect::>() - { - let ZTenantTimelineId { - tenant_id: cloud_only_tenant_id, - timeline_id: cloud_only_timeline_id, - } = unprocessed_remote_id; - match index - .timeline_entry(&unprocessed_remote_id) - .and_then(TimelineIndexEntry::disk_consistent_lsn) - { - Some(remote_disk_consistent_lsn) => { - initial_timeline_statuses - .entry(cloud_only_tenant_id) - .or_default() - .insert( - cloud_only_timeline_id, - TimelineSyncState::CloudOnly(remote_disk_consistent_lsn), - ); - } - None => error!( - "Failed to find disk consistent LSN for remote timeline {}", - unprocessed_remote_id - ), - } - } - new_sync_tasks.into_iter().for_each(|task| { sync_queue::push(task); }); - initial_timeline_statuses + local_timeline_init_statuses } fn compare_local_and_remote_timeline( @@ -722,10 +707,21 @@ fn compare_local_and_remote_timeline( local_metadata: TimelineMetadata, local_files: Vec, remote_entry: &TimelineIndexEntry, -) -> Option { +) -> (LocalTimelineInitStatus, bool) { let local_lsn = local_metadata.disk_consistent_lsn(); let uploads = remote_entry.uploaded_checkpoints(); + let mut initial_timeline_status = LocalTimelineInitStatus::LocallyComplete; + + let mut awaits_download = false; + // TODO probably here we need more sophisticated logic, + // if more data is available remotely can we just download whats there? + // without trying to upload something. It may be tricky, needs further investigation. + // For now looks strange that we can request upload + // and dowload for the same timeline simultaneously. + // (upload needs to be only for previously unsynced files, not whole timeline dir). 
+ // If one of the tasks fails they will be reordered in the queue which can lead + // to timeline being stuck in evicted state if !uploads.contains(&local_lsn) { new_sync_tasks.push_back(SyncTask::new( sync_id, @@ -735,6 +731,7 @@ fn compare_local_and_remote_timeline( metadata: local_metadata, }), )); + // Note that status here doesnt change. } let uploads_count = uploads.len(); @@ -743,7 +740,7 @@ fn compare_local_and_remote_timeline( .filter(|upload_lsn| upload_lsn <= &local_lsn) .map(ArchiveId) .collect(); - Some(if archives_to_skip.len() != uploads_count { + if archives_to_skip.len() != uploads_count { new_sync_tasks.push_back(SyncTask::new( sync_id, 0, @@ -752,10 +749,12 @@ fn compare_local_and_remote_timeline( archives_to_skip, }), )); - TimelineSyncState::AwaitsDownload(remote_entry.disk_consistent_lsn()?) - } else { - TimelineSyncState::Ready(remote_entry.disk_consistent_lsn().unwrap_or(local_lsn)) - }) + initial_timeline_status = LocalTimelineInitStatus::NeedsSync; + awaits_download = true; + // we do not need to manupulate with remote consistent lsn here + // because it will be updated when sync will be completed + } + (initial_timeline_status, awaits_download) } fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Option) { @@ -769,21 +768,23 @@ fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Optio .observe(secs_elapsed) } -async fn update_index_description< +async fn fetch_full_index< P: Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, >( - (storage, index): &(S, RwLock), + (storage, index): &(S, Arc>), timeline_dir: &Path, id: ZTenantTimelineId, ) -> anyhow::Result { - let mut index_write = index.write().await; - let full_index = match index_write.timeline_entry(&id) { + let index_read = index.read().await; + let full_index = match index_read.timeline_entry(&id).map(|e| e.inner()) { None => bail!("Timeline not found for sync id {}", id), - Some(TimelineIndexEntry::Full(_)) => bail!("Index is already populated for sync id {}", id), - Some(TimelineIndexEntry::Description(description)) => { + Some(TimelineIndexEntryInner::Full(_)) => { + bail!("Index is already populated for sync id {}", id) + } + Some(TimelineIndexEntryInner::Description(description)) => { let mut archive_header_downloads = FuturesUnordered::new(); - for (&archive_id, description) in description { + for (archive_id, description) in description { archive_header_downloads.push(async move { let header = download_archive_header(storage, timeline_dir, description) .await @@ -795,18 +796,22 @@ async fn update_index_description< let mut full_index = RemoteTimeline::empty(); while let Some(header_data) = archive_header_downloads.next().await { match header_data { - Ok((archive_id, header_size, header)) => full_index.update_archive_contents(archive_id.0, header, header_size), - Err((e, archive_id)) => bail!( - "Failed to download archive header for tenant {}, timeline {}, archive for Lsn {}: {}", - id.tenant_id, id.timeline_id, archive_id.0, - e - ), - } + Ok((archive_id, header_size, header)) => full_index.update_archive_contents(archive_id.0, header, header_size), + Err((e, archive_id)) => bail!( + "Failed to download archive header for tenant {}, timeline {}, archive for Lsn {}: {}", + id.tenant_id, id.timeline_id, archive_id.0, + e + ), + } } full_index } }; - index_write.add_timeline_entry(id, TimelineIndexEntry::Full(full_index.clone())); + drop(index_read); // tokio rw lock is not upgradeable + let mut index_write = index.write().await; + 
index_write + .upgrade_timeline_entry(&id, full_index.clone()) + .context("cannot upgrade timeline entry in remote index")?; Ok(full_index) } @@ -850,7 +855,7 @@ mod test_utils { #[track_caller] pub async fn ensure_correct_timeline_upload( harness: &RepoHarness, - remote_assets: Arc<(LocalFs, RwLock)>, + remote_assets: Arc<(LocalFs, Arc>)>, timeline_id: ZTimelineId, new_upload: NewCheckpoint, ) { @@ -909,11 +914,14 @@ mod test_utils { } pub async fn expect_timeline( - index: &RwLock, + index: &Arc>, sync_id: ZTenantTimelineId, ) -> RemoteTimeline { - if let Some(TimelineIndexEntry::Full(remote_timeline)) = - index.read().await.timeline_entry(&sync_id) + if let Some(TimelineIndexEntryInner::Full(remote_timeline)) = index + .read() + .await + .timeline_entry(&sync_id) + .map(|e| e.inner()) { remote_timeline.clone() } else { @@ -926,7 +934,7 @@ mod test_utils { #[track_caller] pub async fn assert_index_descriptions( - index: &RwLock, + index: &Arc>, expected_index_with_descriptions: RemoteTimelineIndex, ) { let index_read = index.read().await; @@ -965,26 +973,26 @@ mod test_utils { sync_id ) }); - let expected_timeline_description = match expected_timeline_description { - TimelineIndexEntry::Description(description) => description, - TimelineIndexEntry::Full(_) => panic!("Expected index entry for sync id {} is a full entry, while a description was expected", sync_id), + let expected_timeline_description = match expected_timeline_description.inner() { + TimelineIndexEntryInner::Description(description) => description, + TimelineIndexEntryInner::Full(_) => panic!("Expected index entry for sync id {} is a full entry, while a description was expected", sync_id), }; - match actual_timeline_entry { - TimelineIndexEntry::Description(actual_descriptions) => { + match actual_timeline_entry.inner() { + TimelineIndexEntryInner::Description(description) => { assert_eq!( - actual_descriptions, expected_timeline_description, + description, expected_timeline_description, "Index contains unexpected descriptions entry for sync id {}", sync_id ) } - TimelineIndexEntry::Full(actual_full_entry) => { + TimelineIndexEntryInner::Full(remote_timeline) => { let expected_lsns = expected_timeline_description .values() .map(|description| description.disk_consistent_lsn) .collect::>(); assert_eq!( - actual_full_entry.checkpoints().collect::>(), + remote_timeline.checkpoints().collect::>(), expected_lsns, "Timeline {} should have the same checkpoints uploaded", sync_id, diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/remote_storage/storage_sync/download.rs index 00115ba8d5..e5362b2973 100644 --- a/pageserver/src/remote_storage/storage_sync/download.rs +++ b/pageserver/src/remote_storage/storage_sync/download.rs @@ -5,14 +5,14 @@ use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc}; use anyhow::{ensure, Context}; use tokio::{fs, sync::RwLock}; use tracing::{debug, error, trace, warn}; -use zenith_utils::{lsn::Lsn, zid::ZTenantId}; +use zenith_utils::zid::ZTenantId; use crate::{ config::PageServerConf, layered_repository::metadata::{metadata_path, TimelineMetadata}, remote_storage::{ storage_sync::{ - compression, index::TimelineIndexEntry, sync_queue, update_index_description, SyncKind, + compression, fetch_full_index, index::TimelineIndexEntryInner, sync_queue, SyncKind, SyncTask, }, RemoteStorage, ZTenantTimelineId, @@ -30,10 +30,10 @@ pub(super) enum DownloadedTimeline { Abort, /// Remote timeline data is found, its latest checkpoint's metadata contents 
(disk_consistent_lsn) is known. /// Initial download failed due to some error, the download task is rescheduled for another retry. - FailedAndRescheduled { disk_consistent_lsn: Lsn }, + FailedAndRescheduled, /// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known. /// Initial download successful. - Successful { disk_consistent_lsn: Lsn }, + Successful, } /// Attempts to download and uncompress files from all remote archives for the timeline given. @@ -47,7 +47,7 @@ pub(super) async fn download_timeline< S: RemoteStorage + Send + Sync + 'static, >( conf: &'static PageServerConf, - remote_assets: Arc<(S, RwLock)>, + remote_assets: Arc<(S, Arc>)>, sync_id: ZTenantTimelineId, mut download: TimelineDownload, retries: u32, @@ -58,19 +58,26 @@ pub(super) async fn download_timeline< tenant_id, timeline_id, } = sync_id; - let index_read = remote_assets.1.read().await; + let index = &remote_assets.1; + + let index_read = index.read().await; let remote_timeline = match index_read.timeline_entry(&sync_id) { None => { - error!("Cannot download: no timeline is present in the index for given ids"); + error!("Cannot download: no timeline is present in the index for given id"); return DownloadedTimeline::Abort; } - Some(index_entry) => match index_entry { - TimelineIndexEntry::Full(remote_timeline) => Cow::Borrowed(remote_timeline), - TimelineIndexEntry::Description(_) => { + + Some(index_entry) => match index_entry.inner() { + TimelineIndexEntryInner::Full(remote_timeline) => Cow::Borrowed(remote_timeline), + TimelineIndexEntryInner::Description(_) => { + // we do not check here for awaits_download because it is ok + // to call this function while the download is in progress + // so it is not a concurrent download, it is the same one + let remote_disk_consistent_lsn = index_entry.disk_consistent_lsn(); drop(index_read); debug!("Found timeline description for the given ids, downloading the full index"); - match update_index_description( + match fetch_full_index( remote_assets.as_ref(), &conf.timeline_path(&timeline_id, &tenant_id), sync_id, @@ -80,16 +87,15 @@ pub(super) async fn download_timeline< Ok(remote_timeline) => Cow::Owned(remote_timeline), Err(e) => { error!("Failed to download full timeline index: {:?}", e); + return match remote_disk_consistent_lsn { - Some(disk_consistent_lsn) => { + Some(_) => { sync_queue::push(SyncTask::new( sync_id, retries, SyncKind::Download(download), )); - DownloadedTimeline::FailedAndRescheduled { - disk_consistent_lsn, - } + DownloadedTimeline::FailedAndRescheduled } None => { error!("Cannot download: no disk consistent Lsn is present for the index entry"); @@ -101,12 +107,9 @@ pub(super) async fn download_timeline< } }, }; - let disk_consistent_lsn = match remote_timeline.checkpoints().max() { - Some(lsn) => lsn, - None => { - debug!("Cannot download: no disk consistent Lsn is present for the remote timeline"); - return DownloadedTimeline::Abort; - } + if remote_timeline.checkpoints().max().is_none() { + debug!("Cannot download: no disk consistent Lsn is present for the remote timeline"); + return DownloadedTimeline::Abort; }; debug!("Downloading timeline archives"); @@ -125,7 +128,7 @@ pub(super) async fn download_timeline< conf, sync_id, Arc::clone(&remote_assets), - remote_timeline.as_ref(), + &remote_timeline, archive_id, Arc::clone(&download.files_to_skip), ) @@ -142,9 +145,7 @@ pub(super) async fn download_timeline< retries, SyncKind::Download(download), )); - return DownloadedTimeline::FailedAndRescheduled { 
- disk_consistent_lsn, - }; + return DownloadedTimeline::FailedAndRescheduled; } Ok(()) => { debug!("Successfully downloaded archive {:?}", archive_id); @@ -154,9 +155,7 @@ pub(super) async fn download_timeline< } debug!("Finished downloading all timeline's archives"); - DownloadedTimeline::Successful { - disk_consistent_lsn, - } + DownloadedTimeline::Successful } async fn try_download_archive< @@ -168,7 +167,7 @@ async fn try_download_archive< tenant_id, timeline_id, }: ZTenantTimelineId, - remote_assets: Arc<(S, RwLock)>, + remote_assets: Arc<(S, Arc>)>, remote_timeline: &RemoteTimeline, archive_id: ArchiveId, files_to_skip: Arc>, @@ -256,13 +255,15 @@ mod tests { let repo_harness = RepoHarness::create("test_download_timeline")?; let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID); let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?; - let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - storage - .list() - .await? - .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), + let index = Arc::new(RwLock::new( + RemoteTimelineIndex::try_parse_descriptions_from_paths( + repo_harness.conf, + storage + .list() + .await? + .into_iter() + .map(|storage_path| storage.local_path(&storage_path).unwrap()), + ), )); let remote_assets = Arc::new((storage, index)); let storage = &remote_assets.0; diff --git a/pageserver/src/remote_storage/storage_sync/index.rs b/pageserver/src/remote_storage/storage_sync/index.rs index 81c99754c9..7d6b4881f7 100644 --- a/pageserver/src/remote_storage/storage_sync/index.rs +++ b/pageserver/src/remote_storage/storage_sync/index.rs @@ -11,7 +11,7 @@ use std::{ use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; -use tracing::debug; +use tracing::*; use zenith_utils::{ lsn::Lsn, zid::{ZTenantId, ZTimelineId}, @@ -52,10 +52,16 @@ impl RelativePath { /// Currently, timeline archive files are tracked only. #[derive(Debug, Clone)] pub struct RemoteTimelineIndex { - timeline_files: HashMap, + timeline_entries: HashMap, } impl RemoteTimelineIndex { + pub fn empty() -> Self { + Self { + timeline_entries: HashMap::new(), + } + } + /// Attempts to parse file paths (not checking the file contents) and find files /// that can be tracked wiht the index. /// On parse falures, logs the error and continues, so empty index can be created from not suitable paths. 
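The index now keys TimelineIndexEntry values by sync id and can start out empty; the following hunks add upgrade_timeline_entry and an awaits_download flag on every entry. A condensed sketch (not the patch code) of that shape and the Description-to-Full upgrade it permits, using integer ids and plain vectors as stand-ins for the real archive types:

    use std::collections::HashMap;

    enum EntryInner {
        Description(Vec<u64>), // archive ids only, for brevity
        Full(Vec<u64>),        // checkpoint lsns, for brevity
    }

    struct Entry {
        inner: EntryInner,
        awaits_download: bool,
    }

    #[derive(Default)]
    struct Index {
        entries: HashMap<u64, Entry>, // keyed by a sync-id stand-in
    }

    impl Index {
        // Mirrors upgrade_timeline_entry: only a Description entry may be
        // replaced by a Full one; the awaits_download flag is left untouched.
        fn upgrade(&mut self, id: u64, full: Vec<u64>) -> Result<(), String> {
            let entry = self
                .entries
                .get_mut(&id)
                .ok_or_else(|| "timeline is missing from the index".to_string())?;
            if !matches!(entry.inner, EntryInner::Description(_)) {
                return Err("entry is already full".to_string());
            }
            entry.inner = EntryInner::Full(full);
            Ok(())
        }
    }

    fn main() {
        let mut index = Index::default();
        index.entries.insert(
            1,
            Entry {
                inner: EntryInner::Description(vec![10]),
                awaits_download: true,
            },
        );
        index.upgrade(1, vec![10, 20]).unwrap();
        assert!(index.entries[&1].awaits_download); // flag survives the upgrade
    }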
@@ -63,9 +69,7 @@ impl RemoteTimelineIndex { conf: &'static PageServerConf, paths: impl Iterator, ) -> Self { - let mut index = Self { - timeline_files: HashMap::new(), - }; + let mut index = Self::empty(); for path in paths { if let Err(e) = try_parse_index_entry(&mut index, conf, path.as_ref()) { debug!( @@ -79,40 +83,100 @@ impl RemoteTimelineIndex { } pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&TimelineIndexEntry> { - self.timeline_files.get(id) + self.timeline_entries.get(id) } pub fn timeline_entry_mut( &mut self, id: &ZTenantTimelineId, ) -> Option<&mut TimelineIndexEntry> { - self.timeline_files.get_mut(id) + self.timeline_entries.get_mut(id) } pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: TimelineIndexEntry) { - self.timeline_files.insert(id, entry); + self.timeline_entries.insert(id, entry); + } + + pub fn upgrade_timeline_entry( + &mut self, + id: &ZTenantTimelineId, + remote_timeline: RemoteTimeline, + ) -> anyhow::Result<()> { + let mut entry = self.timeline_entries.get_mut(id).ok_or(anyhow::anyhow!( + "timeline is unexpectedly missing from remote index" + ))?; + + if !matches!(entry.inner, TimelineIndexEntryInner::Description(_)) { + anyhow::bail!("timeline entry is not a description entry") + }; + + entry.inner = TimelineIndexEntryInner::Full(remote_timeline); + + Ok(()) } pub fn all_sync_ids(&self) -> impl Iterator + '_ { - self.timeline_files.keys().copied() + self.timeline_entries.keys().copied() + } + + pub fn set_awaits_download( + &mut self, + id: &ZTenantTimelineId, + awaits_download: bool, + ) -> anyhow::Result<()> { + self.timeline_entry_mut(id) + .ok_or_else(|| anyhow::anyhow!("unknown timeline sync {}", id))? + .set_awaits_download(awaits_download); + Ok(()) } } +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct DescriptionTimelineIndexEntry { + pub description: BTreeMap, + pub awaits_download: bool, +} + #[derive(Debug, Clone, PartialEq, Eq)] -pub enum TimelineIndexEntry { - /// An archive found on the remote storage, but not yet downloaded, only a metadata from its storage path is available, without archive contents. +pub struct FullTimelineIndexEntry { + pub remote_timeline: RemoteTimeline, + pub awaits_download: bool, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum TimelineIndexEntryInner { Description(BTreeMap), - /// Full archive metadata, including the file list, parsed from the archive header. Full(RemoteTimeline), } +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TimelineIndexEntry { + inner: TimelineIndexEntryInner, + awaits_download: bool, +} + impl TimelineIndexEntry { + pub fn new(inner: TimelineIndexEntryInner, awaits_download: bool) -> Self { + Self { + inner, + awaits_download, + } + } + + pub fn inner(&self) -> &TimelineIndexEntryInner { + &self.inner + } + + pub fn inner_mut(&mut self) -> &mut TimelineIndexEntryInner { + &mut self.inner + } + pub fn uploaded_checkpoints(&self) -> BTreeSet { - match self { - Self::Description(description) => { + match &self.inner { + TimelineIndexEntryInner::Description(description) => { description.keys().map(|archive_id| archive_id.0).collect() } - Self::Full(remote_timeline) => remote_timeline + TimelineIndexEntryInner::Full(remote_timeline) => remote_timeline .checkpoint_archives .keys() .map(|archive_id| archive_id.0) @@ -122,17 +186,25 @@ impl TimelineIndexEntry { /// Gets latest uploaded checkpoint's disk consisten Lsn for the corresponding timeline. 
pub fn disk_consistent_lsn(&self) -> Option { - match self { - Self::Description(description) => { + match &self.inner { + TimelineIndexEntryInner::Description(description) => { description.keys().map(|archive_id| archive_id.0).max() } - Self::Full(remote_timeline) => remote_timeline + TimelineIndexEntryInner::Full(remote_timeline) => remote_timeline .checkpoint_archives .keys() .map(|archive_id| archive_id.0) .max(), } } + + pub fn get_awaits_download(&self) -> bool { + self.awaits_download + } + + pub fn set_awaits_download(&mut self, awaits_download: bool) { + self.awaits_download = awaits_download; + } } /// Checkpoint archive's id, corresponding to the `disk_consistent_lsn` from the timeline's metadata file during checkpointing. @@ -331,13 +403,15 @@ fn try_parse_index_entry( tenant_id, timeline_id, }; - let timeline_index_entry = index - .timeline_files - .entry(sync_id) - .or_insert_with(|| TimelineIndexEntry::Description(BTreeMap::new())); - match timeline_index_entry { - TimelineIndexEntry::Description(descriptions) => { - descriptions.insert( + let timeline_index_entry = index.timeline_entries.entry(sync_id).or_insert_with(|| { + TimelineIndexEntry::new( + TimelineIndexEntryInner::Description(BTreeMap::default()), + false, + ) + }); + match timeline_index_entry.inner_mut() { + TimelineIndexEntryInner::Description(description) => { + description.insert( ArchiveId(disk_consistent_lsn), ArchiveDescription { header_size, @@ -346,7 +420,7 @@ fn try_parse_index_entry( }, ); } - TimelineIndexEntry::Full(_) => { + TimelineIndexEntryInner::Full(_) => { bail!("Cannot add parsed archive description to its full context in index with sync id {}", sync_id) } } diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/remote_storage/storage_sync/upload.rs index d064039ecc..8fdd91dd18 100644 --- a/pageserver/src/remote_storage/storage_sync/upload.rs +++ b/pageserver/src/remote_storage/storage_sync/upload.rs @@ -10,9 +10,9 @@ use crate::{ config::PageServerConf, remote_storage::{ storage_sync::{ - compression, - index::{RemoteTimeline, TimelineIndexEntry}, - sync_queue, update_index_description, SyncKind, SyncTask, + compression, fetch_full_index, + index::{RemoteTimeline, TimelineIndexEntry, TimelineIndexEntryInner}, + sync_queue, SyncKind, SyncTask, }, RemoteStorage, ZTenantTimelineId, }, @@ -30,7 +30,7 @@ pub(super) async fn upload_timeline_checkpoint< S: RemoteStorage + Send + Sync + 'static, >( config: &'static PageServerConf, - remote_assets: Arc<(S, RwLock)>, + remote_assets: Arc<(S, Arc>)>, sync_id: ZTenantTimelineId, new_checkpoint: NewCheckpoint, retries: u32, @@ -49,22 +49,24 @@ pub(super) async fn upload_timeline_checkpoint< let index_read = index.read().await; let remote_timeline = match index_read.timeline_entry(&sync_id) { None => None, - Some(TimelineIndexEntry::Full(remote_timeline)) => Some(Cow::Borrowed(remote_timeline)), - Some(TimelineIndexEntry::Description(_)) => { - debug!("Found timeline description for the given ids, downloading the full index"); - match update_index_description(remote_assets.as_ref(), &timeline_dir, sync_id).await { - Ok(remote_timeline) => Some(Cow::Owned(remote_timeline)), - Err(e) => { - error!("Failed to download full timeline index: {:?}", e); - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Upload(new_checkpoint), - )); - return Some(false); + Some(entry) => match entry.inner() { + TimelineIndexEntryInner::Full(remote_timeline) => Some(Cow::Borrowed(remote_timeline)), + 
TimelineIndexEntryInner::Description(_) => { + debug!("Found timeline description for the given ids, downloading the full index"); + match fetch_full_index(remote_assets.as_ref(), &timeline_dir, sync_id).await { + Ok(remote_timeline) => Some(Cow::Owned(remote_timeline)), + Err(e) => { + error!("Failed to download full timeline index: {:?}", e); + sync_queue::push(SyncTask::new( + sync_id, + retries, + SyncKind::Upload(new_checkpoint), + )); + return Some(false); + } } } - } + }, }; let already_contains_upload_lsn = remote_timeline @@ -95,22 +97,40 @@ pub(super) async fn upload_timeline_checkpoint< { Ok((archive_header, header_size)) => { let mut index_write = index.write().await; - match index_write.timeline_entry_mut(&sync_id) { - Some(TimelineIndexEntry::Full(remote_timeline)) => { - remote_timeline.update_archive_contents( - new_checkpoint.metadata.disk_consistent_lsn(), - archive_header, - header_size, - ); - } - None | Some(TimelineIndexEntry::Description(_)) => { + match index_write + .timeline_entry_mut(&sync_id) + .map(|e| e.inner_mut()) + { + None => { let mut new_timeline = RemoteTimeline::empty(); new_timeline.update_archive_contents( new_checkpoint.metadata.disk_consistent_lsn(), archive_header, header_size, ); - index_write.add_timeline_entry(sync_id, TimelineIndexEntry::Full(new_timeline)); + index_write.add_timeline_entry( + sync_id, + TimelineIndexEntry::new(TimelineIndexEntryInner::Full(new_timeline), false), + ) + } + Some(TimelineIndexEntryInner::Full(remote_timeline)) => { + remote_timeline.update_archive_contents( + new_checkpoint.metadata.disk_consistent_lsn(), + archive_header, + header_size, + ); + } + Some(TimelineIndexEntryInner::Description(_)) => { + let mut new_timeline = RemoteTimeline::empty(); + new_timeline.update_archive_contents( + new_checkpoint.metadata.disk_consistent_lsn(), + archive_header, + header_size, + ); + index_write.add_timeline_entry( + sync_id, + TimelineIndexEntry::new(TimelineIndexEntryInner::Full(new_timeline), false), + ) } } debug!("Checkpoint uploaded successfully"); @@ -136,7 +156,7 @@ async fn try_upload_checkpoint< S: RemoteStorage + Send + Sync + 'static, >( config: &'static PageServerConf, - remote_assets: Arc<(S, RwLock)>, + remote_assets: Arc<(S, Arc>)>, sync_id: ZTenantTimelineId, new_checkpoint: &NewCheckpoint, files_to_skip: BTreeSet, @@ -209,13 +229,15 @@ mod tests { let repo_harness = RepoHarness::create("reupload_timeline")?; let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID); let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?; - let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - storage - .list() - .await? - .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), + let index = Arc::new(RwLock::new( + RemoteTimelineIndex::try_parse_descriptions_from_paths( + repo_harness.conf, + storage + .list() + .await? + .into_iter() + .map(|storage_path| storage.local_path(&storage_path).unwrap()), + ), )); let remote_assets = Arc::new((storage, index)); let index = &remote_assets.1; @@ -405,13 +427,15 @@ mod tests { let repo_harness = RepoHarness::create("reupload_timeline_rejected")?; let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID); let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?; - let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - storage - .list() - .await? 
- .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), + let index = Arc::new(RwLock::new( + RemoteTimelineIndex::try_parse_descriptions_from_paths( + repo_harness.conf, + storage + .list() + .await? + .into_iter() + .map(|storage_path| storage.local_path(&storage_path).unwrap()), + ), )); let remote_assets = Arc::new((storage, index)); let storage = &remote_assets.0; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index be937b8d26..e335f42519 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,4 +1,6 @@ +use crate::layered_repository::metadata::TimelineMetadata; use crate::relish::*; +use crate::remote_storage::RemoteTimelineIndex; use crate::walrecord::MultiXactMember; use crate::CheckpointConfig; use anyhow::Result; @@ -6,6 +8,7 @@ use bytes::Bytes; use postgres_ffi::{MultiXactId, MultiXactOffset, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::HashSet; +use std::fmt::Display; use std::ops::{AddAssign, Deref}; use std::sync::{Arc, RwLockReadGuard}; use std::time::Duration; @@ -15,30 +18,43 @@ use zenith_utils::zid::ZTimelineId; /// Block number within a relish. This matches PostgreSQL's BlockNumber type. pub type BlockNumber = u32; +#[derive(Clone, Copy, Debug)] +pub enum TimelineSyncStatusUpdate { + Uploaded, + Downloaded, +} + +impl Display for TimelineSyncStatusUpdate { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + TimelineSyncStatusUpdate::Uploaded => "Uploaded", + TimelineSyncStatusUpdate::Downloaded => "Downloaded", + }; + f.write_str(s) + } +} /// /// A repository corresponds to one .zenith directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. pub trait Repository: Send + Sync { - fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>; - - /// Updates timeline based on the new sync state, received from the remote storage synchronization. + /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization. /// See [`crate::remote_storage`] for more details about the synchronization. - fn set_timeline_state( + fn apply_timeline_remote_sync_status_update( &self, timeline_id: ZTimelineId, - new_state: TimelineSyncState, + timeline_sync_status_update: TimelineSyncStatusUpdate, ) -> Result<()>; - /// Gets current synchronization state of the timeline. - /// See [`crate::remote_storage`] for more details about the synchronization. - fn get_timeline_state(&self, timeline_id: ZTimelineId) -> Option; - /// Get Timeline handle for given zenith timeline ID. - fn get_timeline(&self, timelineid: ZTimelineId) -> Result; + /// This function is idempotent. It doesnt change internal state in any way. + fn get_timeline(&self, timelineid: ZTimelineId) -> Option; + + /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded. + fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result>; /// Lists timelines the repository contains. /// Up to repository's implementation to omit certain timelines that ar not considered ready for use. - fn list_timelines(&self) -> Result>; + fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)>; /// Create a new, empty timeline. The caller is responsible for loading data into it /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. 
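The reworked Repository trait separates cheap inspection (get_timeline, which reports the timeline's state without loading anything) from loading it into memory on demand (get_timeline_load). A minimal caller-side sketch of that split, with stand-in types in place of Timeline and TimelineMetadata and a two-variant enum like the RepositoryTimeline introduced in the next hunk:

    use std::sync::Arc;

    struct Timeline; // stand-in for the loaded timeline handle
    struct Metadata; // stand-in for TimelineMetadata

    enum RepoTimeline {
        Loaded(Arc<Timeline>),
        Unloaded { metadata: Metadata },
    }

    // get_timeline()-style inspection: cheap and side-effect free, suitable
    // for listings and status endpoints.
    fn describe(entry: &RepoTimeline) -> &'static str {
        match entry {
            RepoTimeline::Loaded(_) => "loaded",
            RepoTimeline::Unloaded { .. } => "on disk, not loaded",
        }
    }

    fn main() {
        let entry = RepoTimeline::Unloaded { metadata: Metadata };
        assert_eq!(describe(&entry), "on disk, not loaded");
        // A caller that needs page data would go through a
        // get_timeline_load()-style call instead, which loads the timeline
        // (and, per the tests later in this patch, its ancestors) first.
    }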
@@ -70,72 +86,44 @@ pub trait Repository: Send + Sync { /// perform one checkpoint iteration, flushing in-memory data on disk. /// this function is periodically called by checkponter thread. fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()>; + + /// detaches locally available timeline by stopping all threads and removing all the data. + fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>; + + // Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn. + fn get_remote_index(&self) -> &tokio::sync::RwLock; } /// A timeline, that belongs to the current repository. pub enum RepositoryTimeline { /// Timeline, with its files present locally in pageserver's working directory. /// Loaded into pageserver's memory and ready to be used. - Local { - id: ZTimelineId, - timeline: Arc, - }, - /// Timeline, found on the pageserver's remote storage, but not yet downloaded locally. - Remote { - id: ZTimelineId, - /// metadata contents of the latest successfully uploaded checkpoint - disk_consistent_lsn: Lsn, + Loaded(Arc), + + /// All the data is available locally, but not loaded into memory, so loading have to be done before actually using the timeline + Unloaded { + // It is ok to keep metadata here, because it is not changed when timeline is unloaded. + // FIXME can s3 sync actually change it? It can change it when timeline is in awaiting download state. + // but we currently do not download something for the timeline once it is local (even if there are new checkpoints) is it correct? + // also it is not that good to keep TimelineMetadata here, because it is layered repo implementation detail + metadata: TimelineMetadata, }, } -impl RepositoryTimeline { - pub fn local_timeline(&self) -> Option> { - if let Self::Local { timeline, .. } = self { - Some(Arc::clone(timeline)) - } else { - None - } - } - - pub fn id(&self) -> ZTimelineId { - match self { - Self::Local { id, .. } => *id, - Self::Remote { id, .. } => *id, - } - } -} - -/// A state of the timeline synchronization with the remote storage. -/// Contains `disk_consistent_lsn` of the corresponding remote timeline (latest checkpoint's disk_consistent_lsn). #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum TimelineSyncState { - /// No further downloads from the remote storage are needed. - /// The timeline state is up-to-date or ahead of the remote storage one, - /// ready to be used in any pageserver operation. - Ready(Lsn), - /// Timeline is scheduled for downloading, but its current local state is not up to date with the remote storage. - /// The timeline is not ready to be used in any pageserver operations, otherwise it might diverge its local state from the remote version, - /// making it impossible to sync it further. - AwaitsDownload(Lsn), - /// Timeline was not in the pageserver's local working directory, but was found on the remote storage, ready to be downloaded. - /// Cannot be used in any pageserver operations due to complete absence locally. - CloudOnly(Lsn), - /// Timeline was evicted from the pageserver's local working directory due to conflicting remote and local states or too many errors during the synchronization. - /// Such timelines cannot have their state synchronized further and may not have the data about remote timeline's disk_consistent_lsn, since eviction may happen - /// due to errors before the remote timeline contents is known. 
- Evicted(Option), +pub enum LocalTimelineState { + // timeline is loaded into memory (with layer map and all the bits), + Loaded, + // timeline is on disk locally and ready to be loaded into memory. + Unloaded, } -impl TimelineSyncState { - pub fn remote_disk_consistent_lsn(&self) -> Option { - Some(match self { - TimelineSyncState::Evicted(None) => return None, - TimelineSyncState::Ready(lsn) => lsn, - TimelineSyncState::AwaitsDownload(lsn) => lsn, - TimelineSyncState::CloudOnly(lsn) => lsn, - TimelineSyncState::Evicted(Some(lsn)) => lsn, - }) - .copied() +impl<'a> From<&'a RepositoryTimeline> for LocalTimelineState { + fn from(local_timeline_entry: &'a RepositoryTimeline) -> Self { + match local_timeline_entry { + RepositoryTimeline::Loaded(_) => LocalTimelineState::Loaded, + RepositoryTimeline::Unloaded { .. } => LocalTimelineState::Unloaded, + } } } @@ -362,7 +350,7 @@ pub mod repo_harness { use crate::{ config::PageServerConf, - layered_repository::{LayeredRepository, TIMELINES_SEGMENT_NAME}, + layered_repository::LayeredRepository, walredo::{WalRedoError, WalRedoManager}, }; @@ -395,7 +383,6 @@ pub mod repo_harness { let repo_dir = PageServerConf::test_repo_dir(test_name); let _ = fs::remove_dir_all(&repo_dir); fs::create_dir_all(&repo_dir)?; - fs::create_dir_all(&repo_dir.join(TIMELINES_SEGMENT_NAME))?; let conf = PageServerConf::dummy_conf(repo_dir); // Make a static copy of the config. This can never be free'd, but that's @@ -404,19 +391,45 @@ pub mod repo_harness { let tenant_id = ZTenantId::generate(); fs::create_dir_all(conf.tenant_path(&tenant_id))?; + fs::create_dir_all(conf.timelines_path(&tenant_id))?; Ok(Self { conf, tenant_id }) } pub fn load(&self) -> Box { + self.try_load().expect("failed to load test repo") + } + + pub fn try_load(&self) -> Result> { let walredo_mgr = Arc::new(TestRedoManager); - Box::new(LayeredRepository::new( + let repo = Box::new(LayeredRepository::new( self.conf, walredo_mgr, self.tenant_id, + Arc::new(tokio::sync::RwLock::new(RemoteTimelineIndex::empty())), false, - )) + )); + // populate repo with locally available timelines + for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) + .expect("should be able to read timelines dir") + { + let timeline_dir_entry = timeline_dir_entry.unwrap(); + let timeline_id: ZTimelineId = timeline_dir_entry + .path() + .file_name() + .unwrap() + .to_string_lossy() + .parse() + .unwrap(); + + repo.apply_timeline_remote_sync_status_update( + timeline_id, + TimelineSyncStatusUpdate::Downloaded, + )?; + } + + Ok(repo) } pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { @@ -835,10 +848,9 @@ mod tests { // Create a branch, check that the relation is visible there repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; - let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { - Some(timeline) => timeline, - None => panic!("Should have a local timeline"), - }; + let newtline = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("Should have a local timeline"); let new_writer = newtline.writer(); assert!(newtline @@ -896,10 +908,9 @@ mod tests { // Branch the history, modify relation differently on the new timeline repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; - let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { - Some(timeline) => timeline, - None => panic!("Should have a local timeline"), - }; + let newtline = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("Should have a local timeline"); let 
new_writer = newtline.writer(); new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?; @@ -1046,11 +1057,9 @@ mod tests { make_some_layers(&tline, Lsn(0x20))?; repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; - let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { - Some(timeline) => timeline, - None => panic!("Should have a local timeline"), - }; - + let newtline = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; assert!(newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)).is_ok()); @@ -1067,10 +1076,9 @@ mod tests { make_some_layers(&tline, Lsn(0x20))?; repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; - let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { - Some(timeline) => timeline, - None => panic!("Should have a local timeline"), - }; + let newtline = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("Should have a local timeline"); make_some_layers(&newtline, Lsn(0x60))?; @@ -1143,4 +1151,81 @@ mod tests { Ok(()) } + + #[test] + fn timeline_load() -> Result<()> { + const TEST_NAME: &str = "timeline_load"; + let harness = RepoHarness::create(TEST_NAME)?; + { + let repo = harness.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; + make_some_layers(&tline, Lsn(0x8000))?; + tline.checkpoint(CheckpointConfig::Forced)?; + } + + let repo = harness.load(); + let tline = repo + .get_timeline(TIMELINE_ID) + .expect("cannot load timeline"); + assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); + + assert!(repo.get_timeline_load(TIMELINE_ID).is_ok()); + + let tline = repo + .get_timeline(TIMELINE_ID) + .expect("cannot load timeline"); + assert!(matches!(tline, RepositoryTimeline::Loaded(_))); + + Ok(()) + } + + #[test] + fn timeline_load_with_ancestor() -> Result<()> { + const TEST_NAME: &str = "timeline_load"; + let harness = RepoHarness::create(TEST_NAME)?; + // create two timelines + { + let repo = harness.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + make_some_layers(&tline, Lsn(0x20))?; + tline.checkpoint(CheckpointConfig::Forced)?; + + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; + + let newtline = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("Should have a local timeline"); + + make_some_layers(&newtline, Lsn(0x60))?; + tline.checkpoint(CheckpointConfig::Forced)?; + } + + // check that both of them are initially unloaded + let repo = harness.load(); + { + let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline"); + assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); + + let tline = repo + .get_timeline(NEW_TIMELINE_ID) + .expect("cannot get timeline"); + assert!(matches!(tline, RepositoryTimeline::Unloaded { .. 
})); + } + // load only child timeline + let _ = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("cannot load timeline"); + + // check that both, child and ancestor are loaded + let tline = repo + .get_timeline(NEW_TIMELINE_ID) + .expect("cannot get timeline"); + assert!(matches!(tline, RepositoryTimeline::Loaded(_))); + + let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline"); + assert!(matches!(tline, RepositoryTimeline::Loaded(_))); + + Ok(()) + } } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 568088fc1d..8584bdd424 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,16 +3,19 @@ use crate::config::PageServerConf; use crate::layered_repository::LayeredRepository; -use crate::repository::{Repository, Timeline, TimelineSyncState}; +use crate::remote_storage::RemoteTimelineIndex; +use crate::repository::{Repository, Timeline, TimelineSyncStatusUpdate}; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; use crate::timelines; +use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; use crate::CheckpointConfig; use anyhow::{Context, Result}; use lazy_static::lazy_static; use log::*; use serde::{Deserialize, Serialize}; +use std::collections::hash_map::Entry; use std::collections::HashMap; use std::fmt; use std::sync::{Arc, Mutex, MutexGuard}; @@ -57,79 +60,67 @@ fn access_tenants() -> MutexGuard<'static, HashMap> { TENANTS.lock().unwrap() } -/// Updates tenants' repositories, changing their timelines state in memory. -pub fn set_timeline_states( +// Sets up wal redo manager and repository for tenant. Reduces code duplocation. +// Used during pageserver startup, or when new tenant is attached to pageserver. +pub fn load_local_repo( conf: &'static PageServerConf, - timeline_states: HashMap>, -) { - if timeline_states.is_empty() { - debug!("no timeline state updates to perform"); - return; - } - - info!("Updating states for {} timelines", timeline_states.len()); - trace!("States: {:?}", timeline_states); - + tenant_id: ZTenantId, + remote_index: &Arc>, +) -> Arc { let mut m = access_tenants(); - for (tenant_id, timeline_states) in timeline_states { - let tenant = m.entry(tenant_id).or_insert_with(|| { - // TODO (rodionov) reuse one of the initialisation routines - // Set up a WAL redo manager, for applying WAL records. - let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); + let tenant = m.entry(tenant_id).or_insert_with(|| { + // Set up a WAL redo manager, for applying WAL records. + let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); - // Set up an object repository, for actual data storage. - let repo: Arc = Arc::new(LayeredRepository::new( - conf, - Arc::new(walredo_mgr), - tenant_id, - conf.remote_storage_config.is_some(), - )); - Tenant { - state: TenantState::Idle, - repo, - } - }); - if let Err(e) = put_timelines_into_tenant(tenant, tenant_id, timeline_states) { - error!( - "Failed to update timeline states for tenant {}: {:?}", - tenant_id, e - ); + // Set up an object repository, for actual data storage. 
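load_local_repo, used both at startup (when applying sync status updates) and when a tenant is attached, follows a get-or-create pattern over the global tenants map, as the repository construction just below shows. A minimal sketch of that pattern with stand-in types (not the actual Tenant/LayeredRepository code):

    use std::collections::HashMap;
    use std::sync::{Arc, Mutex};

    // Stand-in for the per-tenant repository object.
    struct Repo {
        tenant_id: u32,
    }

    fn load_repo(tenants: &Mutex<HashMap<u32, Arc<Repo>>>, tenant_id: u32) -> Arc<Repo> {
        let mut map = tenants.lock().unwrap();
        // Create the repository only on first access; later callers reuse it,
        // so startup and tenant attach can share one code path.
        let repo = map
            .entry(tenant_id)
            .or_insert_with(|| Arc::new(Repo { tenant_id }));
        Arc::clone(repo)
    }

    fn main() {
        let tenants = Mutex::new(HashMap::new());
        let a = load_repo(&tenants, 7);
        let b = load_repo(&tenants, 7);
        assert!(Arc::ptr_eq(&a, &b)); // both calls see the same repository
        assert_eq!(a.tenant_id, 7);
    }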
+ let repo: Arc = Arc::new(LayeredRepository::new( + conf, + Arc::new(walredo_mgr), + tenant_id, + Arc::clone(remote_index), + conf.remote_storage_config.is_some(), + )); + Tenant { + state: TenantState::Idle, + repo, } - } + }); + Arc::clone(&tenant.repo) } -fn put_timelines_into_tenant( - tenant: &mut Tenant, - tenant_id: ZTenantId, - timeline_states: HashMap, -) -> anyhow::Result<()> { - for (timeline_id, timeline_state) in timeline_states { - // If the timeline is being put into any other state than Ready, - // stop any threads operating on it. - // - // FIXME: This is racy. A page service thread could just get - // handle on the Timeline, before we call set_timeline_state() - if !matches!(timeline_state, TimelineSyncState::Ready(_)) { - thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id)); - - // Should we run a final checkpoint to flush all the data to - // disk? Doesn't seem necessary; all of the states other than - // Ready imply that the data on local disk is corrupt or incomplete, - // and we don't want to flush that to disk. - } - - tenant - .repo - .set_timeline_state(timeline_id, timeline_state) - .with_context(|| { - format!( - "Failed to update timeline {} state to {:?}", - timeline_id, timeline_state - ) - })?; +/// Updates tenants' repositories, changing their timelines state in memory. +pub fn apply_timeline_sync_status_updates( + conf: &'static PageServerConf, + remote_index: Arc>, + sync_status_updates: HashMap>, +) { + if sync_status_updates.is_empty() { + debug!("no sync status updates to apply"); + return; } + info!( + "Applying sync status updates for {} timelines", + sync_status_updates.len() + ); + trace!("Sync status updates: {:?}", sync_status_updates); - Ok(()) + for (tenant_id, tenant_timelines_sync_status_updates) in sync_status_updates { + let repo = load_local_repo(conf, tenant_id, &remote_index); + + for (timeline_id, timeline_sync_status_update) in tenant_timelines_sync_status_updates { + match repo.apply_timeline_remote_sync_status_update(timeline_id, timeline_sync_status_update) + { + Ok(_) => debug!( + "successfully applied timeline sync status update: {} -> {}", + timeline_id, timeline_sync_status_update + ), + Err(e) => error!( + "Failed to apply timeline sync status update for tenant {}. timeline {} update {} Error: {:#}", + tenant_id, timeline_id, timeline_sync_status_update, e + ), + } + } + } } /// @@ -179,24 +170,30 @@ pub fn shutdown_all_tenants() { pub fn create_tenant_repository( conf: &'static PageServerConf, - new_tenant_id: Option, + tenantid: ZTenantId, + remote_index: Arc>, ) -> Result> { - let new_tenant_id = new_tenant_id.unwrap_or_else(ZTenantId::generate); - let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, new_tenant_id)); - match timelines::create_repo(conf, new_tenant_id, wal_redo_manager)? 
{ - Some(repo) => { - access_tenants() - .entry(new_tenant_id) - .or_insert_with(|| Tenant { - state: TenantState::Idle, - repo, - }); - Ok(Some(new_tenant_id)) - } - None => { - debug!("repository already exists for tenant {}", new_tenant_id); + match access_tenants().entry(tenantid) { + Entry::Occupied(_) => { + debug!("tenant {} already exists", tenantid); Ok(None) } + Entry::Vacant(v) => { + let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid)); + let repo = timelines::create_repo( + conf, + tenantid, + CreateRepo::Real { + wal_redo_manager, + remote_index, + }, + )?; + v.insert(Tenant { + state: TenantState::Idle, + repo, + }); + Ok(Some(tenantid)) + } } } @@ -255,19 +252,19 @@ pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result Result> { get_repository_for_tenant(tenantid)? - .get_timeline(timelineid)? - .local_timeline() - .with_context(|| format!("cannot fetch timeline {}", timelineid)) + .get_timeline_load(timelineid) + .with_context(|| format!("Timeline {} not found for tenant {}", timelineid, tenantid)) } #[derive(Serialize, Deserialize, Clone)] diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 4de131ef70..9cfc21b413 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -2,8 +2,9 @@ //! Timeline management code // -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{bail, Context, Result}; use postgres_ffi::ControlFileData; +use serde::{Deserialize, Serialize}; use std::{ fs, path::Path, @@ -12,135 +13,126 @@ use std::{ }; use tracing::*; -use zenith_utils::lsn::Lsn; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; use zenith_utils::{crashsafe_dir, logging}; +use zenith_utils::{lsn::Lsn, zid::HexZTimelineId}; -use crate::{config::PageServerConf, repository::Repository}; +use crate::{ + config::PageServerConf, + layered_repository::metadata::TimelineMetadata, + remote_storage::RemoteTimelineIndex, + repository::{LocalTimelineState, Repository}, +}; use crate::{import_datadir, LOG_FILE_NAME}; use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager}; use crate::{repository::RepositoryTimeline, tenant_mgr}; use crate::{repository::Timeline, CheckpointConfig}; -#[derive(Clone)] -pub enum TimelineInfo { - Local { - timeline_id: ZTimelineId, - tenant_id: ZTenantId, - last_record_lsn: Lsn, - prev_record_lsn: Lsn, - ancestor_timeline_id: Option, - ancestor_lsn: Option, - disk_consistent_lsn: Lsn, - current_logical_size: usize, - current_logical_size_non_incremental: Option, - }, - Remote { - timeline_id: ZTimelineId, - tenant_id: ZTenantId, - disk_consistent_lsn: Lsn, - }, +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct LocalTimelineInfo { + pub ancestor_timeline_id: Option, + pub ancestor_lsn: Option, + pub last_record_lsn: Lsn, + pub prev_record_lsn: Option, + pub disk_consistent_lsn: Lsn, + pub current_logical_size: Option, // is None when timeline is Unloaded + pub current_logical_size_non_incremental: Option, + pub timeline_state: LocalTimelineState, } -impl TimelineInfo { - pub fn from_repo_timeline( - tenant_id: ZTenantId, - repo_timeline: RepositoryTimeline, - include_non_incremental_logical_size: bool, - ) -> Self { - match repo_timeline { - RepositoryTimeline::Local { id, timeline } => { - let ancestor_timeline_id = timeline.get_ancestor_timeline_id(); - let ancestor_lsn = if ancestor_timeline_id.is_some() { - Some(timeline.get_ancestor_lsn()) - } else { - None - }; - - Self::Local { - 
timeline_id: id, - tenant_id, - last_record_lsn: timeline.get_last_record_lsn(), - prev_record_lsn: timeline.get_prev_record_lsn(), - ancestor_timeline_id, - ancestor_lsn, - disk_consistent_lsn: timeline.get_disk_consistent_lsn(), - current_logical_size: timeline.get_current_logical_size(), - current_logical_size_non_incremental: get_current_logical_size_non_incremental( - include_non_incremental_logical_size, - timeline.as_ref(), - ), - } - } - RepositoryTimeline::Remote { - id, - disk_consistent_lsn, - } => Self::Remote { - timeline_id: id, - tenant_id, - disk_consistent_lsn, - }, - } - } - - pub fn from_dyn_timeline( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, +impl LocalTimelineInfo { + pub fn from_loaded_timeline( timeline: &dyn Timeline, include_non_incremental_logical_size: bool, - ) -> Self { - let ancestor_timeline_id = timeline.get_ancestor_timeline_id(); - let ancestor_lsn = if ancestor_timeline_id.is_some() { - Some(timeline.get_ancestor_lsn()) - } else { - None - }; - - Self::Local { - timeline_id, - tenant_id, - last_record_lsn: timeline.get_last_record_lsn(), - prev_record_lsn: timeline.get_prev_record_lsn(), - ancestor_timeline_id, - ancestor_lsn, + ) -> anyhow::Result { + let last_record_lsn = timeline.get_last_record_lsn(); + let info = LocalTimelineInfo { + ancestor_timeline_id: timeline + .get_ancestor_timeline_id() + .map(HexZTimelineId::from), + ancestor_lsn: { + match timeline.get_ancestor_lsn() { + Lsn(0) => None, + lsn @ Lsn(_) => Some(lsn), + } + }, disk_consistent_lsn: timeline.get_disk_consistent_lsn(), - current_logical_size: timeline.get_current_logical_size(), - current_logical_size_non_incremental: get_current_logical_size_non_incremental( - include_non_incremental_logical_size, - timeline, - ), + last_record_lsn, + prev_record_lsn: Some(timeline.get_prev_record_lsn()), + timeline_state: LocalTimelineState::Loaded, + current_logical_size: Some(timeline.get_current_logical_size()), + current_logical_size_non_incremental: if include_non_incremental_logical_size { + Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?) + } else { + None + }, + }; + Ok(info) + } + + pub fn from_unloaded_timeline(metadata: &TimelineMetadata) -> Self { + LocalTimelineInfo { + ancestor_timeline_id: metadata.ancestor_timeline().map(HexZTimelineId::from), + ancestor_lsn: { + match metadata.ancestor_lsn() { + Lsn(0) => None, + lsn @ Lsn(_) => Some(lsn), + } + }, + disk_consistent_lsn: metadata.disk_consistent_lsn(), + last_record_lsn: metadata.disk_consistent_lsn(), + prev_record_lsn: metadata.prev_record_lsn(), + timeline_state: LocalTimelineState::Unloaded, + current_logical_size: None, + current_logical_size_non_incremental: None, } } - pub fn timeline_id(&self) -> ZTimelineId { - match *self { - TimelineInfo::Local { timeline_id, .. } => timeline_id, - TimelineInfo::Remote { timeline_id, .. } => timeline_id, - } - } - - pub fn tenant_id(&self) -> ZTenantId { - match *self { - TimelineInfo::Local { tenant_id, .. } => tenant_id, - TimelineInfo::Remote { tenant_id, .. 
} => tenant_id, + pub fn from_repo_timeline( + repo_timeline: RepositoryTimeline, + include_non_incremental_logical_size: bool, + ) -> anyhow::Result { + match repo_timeline { + RepositoryTimeline::Loaded(timeline) => { + Self::from_loaded_timeline(timeline.as_ref(), include_non_incremental_logical_size) + } + RepositoryTimeline::Unloaded { metadata } => { + Ok(Self::from_unloaded_timeline(&metadata)) + } } } } -fn get_current_logical_size_non_incremental( - include_non_incremental_logical_size: bool, - timeline: &dyn Timeline, -) -> Option { - if !include_non_incremental_logical_size { - return None; - } - match timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) { - Ok(size) => Some(size), - Err(e) => { - error!("Failed to get non-incremental logical size: {:?}", e); - None - } - } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct RemoteTimelineInfo { + pub remote_consistent_lsn: Option, + pub awaits_download: bool, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct TimelineInfo { + #[serde(with = "hex")] + pub tenant_id: ZTenantId, + #[serde(with = "hex")] + pub timeline_id: ZTimelineId, + pub local: Option, + pub remote: Option, +} + +pub fn extract_remote_timeline_info( + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + remote_index: &RemoteTimelineIndex, +) -> Option { + remote_index + .timeline_entry(&ZTenantTimelineId { + tenant_id, + timeline_id, + }) + .map(|remote_entry| RemoteTimelineInfo { + remote_consistent_lsn: remote_entry.disk_consistent_lsn(), + awaits_download: remote_entry.get_awaits_download(), + }) } #[derive(Debug, Clone, Copy)] @@ -158,25 +150,12 @@ pub fn init_pageserver( // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages let _log_file = logging::init(LOG_FILE_NAME, true)?; - // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo - // process during repository initialization. - // - // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched - // initdb in the background, and it kept running even after the "zenith init" had exited. - // In tests, we started the page server immediately after that, so that initdb was still - // running in the background, and we failed to run initdb again in the same directory. This - // has been solved for the rapid init+start case now, but the general race condition remains - // if you restart the server quickly. The WAL redo manager doesn't use a separate thread - // anymore, but I think that could still happen. - let dummy_redo_mgr = Arc::new(crate::walredo::DummyRedoManager {}); - crashsafe_dir::create_dir_all(conf.tenants_path())?; if let Some(tenant_id) = create_tenant { println!("initializing tenantid {}", tenant_id); - let repo = create_repo(conf, tenant_id, dummy_redo_mgr) - .context("failed to create repo")? 
- .ok_or_else(|| anyhow!("For newely created pageserver, found already existing repository for tenant {}", tenant_id))?; + let repo = + create_repo(conf, tenant_id, CreateRepo::Dummy).context("failed to create repo")?; let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref()) .context("failed to create initial timeline")?; @@ -189,15 +168,45 @@ pub fn init_pageserver( Ok(()) } +pub enum CreateRepo { + Real { + wal_redo_manager: Arc, + remote_index: Arc>, + }, + Dummy, +} + pub fn create_repo( conf: &'static PageServerConf, tenant_id: ZTenantId, - wal_redo_manager: Arc, -) -> Result>> { + create_repo: CreateRepo, +) -> Result> { + let (wal_redo_manager, remote_index) = match create_repo { + CreateRepo::Real { + wal_redo_manager, + remote_index, + } => (wal_redo_manager, remote_index), + CreateRepo::Dummy => { + // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo + // process during repository initialization. + // + // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched + // initdb in the background, and it kept running even after the "zenith init" had exited. + // In tests, we started the page server immediately after that, so that initdb was still + // running in the background, and we failed to run initdb again in the same directory. This + // has been solved for the rapid init+start case now, but the general race condition remains + // if you restart the server quickly. The WAL redo manager doesn't use a separate thread + // anymore, but I think that could still happen. + let wal_redo_manager = Arc::new(crate::walredo::DummyRedoManager {}); + + let remote_index = Arc::new(tokio::sync::RwLock::new(RemoteTimelineIndex::empty())); + (wal_redo_manager as _, remote_index) + } + }; + let repo_dir = conf.tenant_path(&tenant_id); if repo_dir.exists() { - debug!("repo for {} already exists", tenant_id); - return Ok(None); + bail!("tenant {} directory already exists", tenant_id); } // top-level dir may exist if we are creating it through CLI @@ -206,12 +215,13 @@ pub fn create_repo( crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; info!("created directory structure in {}", repo_dir.display()); - Ok(Some(Arc::new(LayeredRepository::new( + Ok(Arc::new(LayeredRepository::new( conf, wal_redo_manager, tenant_id, + remote_index, conf.remote_storage_config.is_some(), - )))) + ))) } // Returns checkpoint LSN from controlfile @@ -299,30 +309,25 @@ fn bootstrap_timeline( Ok(timeline) } -pub(crate) fn get_timelines( +pub(crate) fn get_local_timelines( tenant_id: ZTenantId, include_non_incremental_logical_size: bool, -) -> Result> { +) -> Result> { let repo = tenant_mgr::get_repository_for_tenant(tenant_id) .with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?; + let repo_timelines = repo.list_timelines(); - Ok(repo - .list_timelines() - .with_context(|| format!("Failed to list timelines for tenant {}", tenant_id))? - .into_iter() - .filter_map(|timeline| match timeline { - RepositoryTimeline::Local { timeline, id } => Some((id, timeline)), - RepositoryTimeline::Remote { .. 
} => None, - }) - .map(|(timeline_id, timeline)| { - TimelineInfo::from_dyn_timeline( - tenant_id, - timeline_id, - timeline.as_ref(), + let mut local_timeline_info = Vec::with_capacity(repo_timelines.len()); + for (timeline_id, repository_timeline) in repo_timelines { + local_timeline_info.push(( + timeline_id, + LocalTimelineInfo::from_repo_timeline( + repository_timeline, include_non_incremental_logical_size, - ) - }) - .collect()) + )?, + )) + } + Ok(local_timeline_info) } pub(crate) fn create_timeline( @@ -336,16 +341,8 @@ pub(crate) fn create_timeline( let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { - match repo.get_timeline(new_timeline_id)? { - RepositoryTimeline::Local { id, .. } => { - debug!("timeline {} already exists", id); - return Ok(None); - } - RepositoryTimeline::Remote { id, .. } => bail!( - "timeline {} already exists in pageserver's remote storage", - id - ), - } + debug!("timeline {} already exists", new_timeline_id); + return Ok(None); } let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0)); @@ -353,15 +350,8 @@ pub(crate) fn create_timeline( let new_timeline_info = match ancestor_timeline_id { Some(ancestor_timeline_id) => { let ancestor_timeline = repo - .get_timeline(ancestor_timeline_id) - .with_context(|| format!("Cannot get ancestor timeline {}", ancestor_timeline_id))? - .local_timeline() - .with_context(|| { - format!( - "Cannot branch off the timeline {} that's not present locally", - ancestor_timeline_id - ) - })?; + .get_timeline_load(ancestor_timeline_id) + .context("Cannot branch off the timeline that's not present locally")?; if start_lsn == Lsn(0) { // Find end of WAL on the old timeline @@ -391,18 +381,20 @@ pub(crate) fn create_timeline( } repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?; // load the timeline into memory - let loaded_timeline = repo.get_timeline(new_timeline_id)?; - TimelineInfo::from_repo_timeline(tenant_id, loaded_timeline, false) + let loaded_timeline = repo.get_timeline_load(new_timeline_id)?; + LocalTimelineInfo::from_loaded_timeline(loaded_timeline.as_ref(), false) + .context("cannot fill timeline info")? } None => { let new_timeline = bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?; - TimelineInfo::from_dyn_timeline( - tenant_id, - new_timeline_id, - new_timeline.as_ref(), - false, - ) + LocalTimelineInfo::from_loaded_timeline(new_timeline.as_ref(), false) + .context("cannot fill timeline info")? 
} }; - Ok(Some(new_timeline_info)) + Ok(Some(TimelineInfo { + tenant_id, + timeline_id: new_timeline_id, + local: Some(new_timeline_info), + remote: None, + })) } diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 6fff1d062d..305dd4b3a2 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -31,6 +31,7 @@ use tracing::*; use zenith_utils::lsn::Lsn; use zenith_utils::pq_proto::ZenithFeedback; use zenith_utils::zid::ZTenantId; +use zenith_utils::zid::ZTenantTimelineId; use zenith_utils::zid::ZTimelineId; // @@ -111,18 +112,18 @@ fn get_wal_producer_connstr(tenantid: ZTenantId, timelineid: ZTimelineId) -> Str // fn thread_main( conf: &'static PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, ) -> Result<()> { - let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered(); + let _enter = info_span!("WAL receiver", timeline = %timeline_id, tenant = %tenant_id).entered(); info!("WAL receiver thread started"); // Look up the current WAL producer address - let wal_producer_connstr = get_wal_producer_connstr(tenantid, timelineid); + let wal_producer_connstr = get_wal_producer_connstr(tenant_id, timeline_id); // Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server, // and start streaming WAL from it. - let res = walreceiver_main(conf, tenantid, timelineid, &wal_producer_connstr); + let res = walreceiver_main(conf, tenant_id, timeline_id, &wal_producer_connstr); // TODO cleanup info messages if let Err(e) = res { @@ -130,20 +131,20 @@ fn thread_main( } else { info!( "walreceiver disconnected tenant {}, timelineid {}", - tenantid, timelineid + tenant_id, timeline_id ); } // Drop it from list of active WAL_RECEIVERS // so that next callmemaybe request launched a new thread - drop_wal_receiver(tenantid, timelineid); + drop_wal_receiver(tenant_id, timeline_id); Ok(()) } fn walreceiver_main( _conf: &PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, wal_producer_connstr: &str, ) -> Result<(), Error> { // Connect to the database in replication mode. @@ -182,13 +183,16 @@ fn walreceiver_main( let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); let mut caught_up = false; - let timeline = - tenant_mgr::get_timeline_for_tenant(tenantid, timelineid).with_context(|| { - format!( - "Can not start the walrecever for a remote tenant {}, timeline {}", - tenantid, timelineid, - ) - })?; + let repo = tenant_mgr::get_repository_for_tenant(tenant_id) + .with_context(|| format!("no repository found for tenant {}", tenant_id))?; + let timeline = repo.get_timeline_load(timeline_id).with_context(|| { + format!( + "local timeline {} not found for tenant {}", + timeline_id, tenant_id + ) + })?; + + let remote_index = repo.get_remote_index(); // // Start streaming the WAL, from where we left off previously. @@ -292,11 +296,19 @@ fn walreceiver_main( }; if let Some(last_lsn) = status_update { - let timeline_synced_disk_consistent_lsn = - tenant_mgr::get_repository_for_tenant(tenantid)? 
- .get_timeline_state(timelineid) - .and_then(|state| state.remote_disk_consistent_lsn()) - .unwrap_or(Lsn(0)); + let timeline_remote_consistent_lsn = runtime.block_on(async { + remote_index + .read() + .await + // here we either do not have this timeline in remote index + // or there were no checkpoints for it yet + .timeline_entry(&ZTenantTimelineId { + tenant_id, + timeline_id, + }) + .and_then(|e| e.disk_consistent_lsn()) + .unwrap_or(Lsn(0)) // no checkpoint was uploaded + }); // The last LSN we processed. It is not guaranteed to survive pageserver crash. let write_lsn = u64::from(last_lsn); @@ -304,7 +316,7 @@ fn walreceiver_main( let flush_lsn = u64::from(timeline.get_disk_consistent_lsn()); // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. - let apply_lsn = u64::from(timeline_synced_disk_consistent_lsn); + let apply_lsn = u64::from(timeline_remote_consistent_lsn); let ts = SystemTime::now(); // Send zenith feedback message. diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index edcc768819..8689838089 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -5,7 +5,7 @@ import time, shutil, os from contextlib import closing from pathlib import Path from uuid import UUID -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.zenith_fixtures import ZenithEnvBuilder, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload from fixtures.log_helper import log import pytest @@ -26,7 +26,6 @@ import pytest # * queries the specific data, ensuring that it matches the one stored before # # The tests are done for all types of remote storage pageserver supports. 
-@pytest.mark.skip(reason="will be fixed with https://github.com/zenithdb/zenith/issues/1193") @pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3']) def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, storage_type: str): zenith_env_builder.rust_log_override = 'debug' @@ -45,6 +44,8 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, env = zenith_env_builder.init_start() pg = env.postgres.create_start('main') + client = env.pageserver.http_client() + tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] @@ -54,13 +55,21 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, CREATE TABLE t1(id int primary key, secret text); INSERT INTO t1 VALUES ({data_id}, '{data_secret}'); ''') + cur.execute("SELECT pg_current_wal_flush_lsn()") + current_lsn = int(cur.fetchone()[0].split('/')[1], base=16) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, UUID(tenant_id), UUID(timeline_id), current_lsn) # run checkpoint manually to be sure that data landed in remote storage with closing(env.pageserver.connect()) as psconn: with psconn.cursor() as pscur: - pscur.execute(f"do_gc {tenant_id} {timeline_id}") - log.info("waiting for upload") # TODO api to check if upload is done - time.sleep(2) + pscur.execute(f"checkpoint {tenant_id} {timeline_id}") + + log.info("waiting for upload") + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(client, UUID(tenant_id), UUID(timeline_id), current_lsn) + log.info("upload is done") ##### Stop the first pageserver instance, erase all its data env.postgres.stop_all() @@ -73,26 +82,12 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, ##### Second start, restore the data and ensure it's the same env.pageserver.start() - client = env.pageserver.http_client() client.timeline_attach(UUID(tenant_id), UUID(timeline_id)) - # FIXME cannot handle duplicate download requests (which might be caused by repeated timeline detail calls) - # subject to fix in https://github.com/zenithdb/zenith/issues/997 - time.sleep(5) log.info("waiting for timeline redownload") - attempts = 0 - while True: - timeline_details = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) - assert timeline_details['timeline_id'] == timeline_id - assert timeline_details['tenant_id'] == tenant_id - if timeline_details['kind'] == 'Local': - log.info("timeline downloaded, checking its data") - break - attempts += 1 - if attempts > 10: - raise Exception("timeline redownload failed") - log.debug("still waiting") - time.sleep(1) + wait_for(number_of_iterations=10, + interval=1, + func=lambda: assert_local(client, UUID(tenant_id), UUID(timeline_id))) pg = env.postgres.create_start('main') with closing(pg.connect()) as conn: diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 7a9d478f16..e4492e5393 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -3,17 +3,19 @@ import os import pathlib import subprocess import threading +from typing import Dict from uuid import UUID from fixtures.log_helper import log import time import signal import pytest -from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, ZenithPageserverHttpClient, zenith_binpath, pg_distrib_dir +from 
fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, ZenithPageserverHttpClient, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload, zenith_binpath, pg_distrib_dir def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): - assert abs(a - b) / a < margin_ratio, (a, b, margin_ratio) + print("!" * 100, abs(a - b) / a) + assert abs(a - b) / a < margin_ratio, abs(a - b) / a @contextmanager @@ -34,6 +36,7 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, f"-c listen_pg_addr='localhost:{pg_port}'", f"-c listen_http_addr='localhost:{http_port}'", f"-c pg_distrib_dir='{pg_distrib_dir}'", + f"-c id=2", f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", ] @@ -57,20 +60,6 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, os.kill(pid, signal.SIGQUIT) -def wait_for(number_of_iterations: int, interval: int, func): - last_exception = None - for i in range(number_of_iterations): - try: - res = func() - except Exception as e: - log.info("waiting for %s iteration %s failed", func, i + 1) - last_exception = e - time.sleep(interval) - continue - return res - raise Exception("timed out while waiting for %s" % func) from last_exception - - @contextmanager def pg_cur(pg): with closing(pg.connect()) as conn: @@ -108,13 +97,6 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve log.info('load thread stopped') -def assert_local(pageserver_http_client: ZenithPageserverHttpClient, tenant: UUID, timeline: str): - timeline_detail = pageserver_http_client.timeline_detail(tenant, UUID(timeline)) - assert timeline_detail.get('type') == "Local", timeline_detail - return timeline_detail - - -@pytest.mark.skip(reason="will be fixed with https://github.com/zenithdb/zenith/issues/1193") @pytest.mark.parametrize('with_load', ['with_load', 'without_load']) def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, port_distributor: PortDistributor, @@ -129,7 +111,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, tenant = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) log.info("tenant to relocate %s", tenant) - + env.zenith_cli.create_root_branch('main', tenant_id=tenant) env.zenith_cli.create_branch('test_tenant_relocation', tenant_id=tenant) tenant_pg = env.postgres.create_start(branch_name='main', @@ -141,8 +123,8 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, with conn.cursor() as cur: # save timeline for later gc call cur.execute("SHOW zenith.zenith_timeline") - timeline = cur.fetchone()[0] - log.info("timeline to relocate %s", timeline) + timeline = UUID(cur.fetchone()[0]) + log.info("timeline to relocate %s", timeline.hex) # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -150,6 +132,15 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'some payload'") cur.execute("SELECT sum(key) FROM t") assert cur.fetchone() == (500500, ) + cur.execute("SELECT pg_current_wal_flush_lsn()") + + current_lsn = int(cur.fetchone()[0].split('/')[1], base=16) + + pageserver_http = env.pageserver.http_client() + + # wait until pageserver receives that data + wait_for_last_record_lsn(pageserver_http, tenant, timeline, current_lsn) + timeline_detail = pageserver_http.timeline_detail_v2(tenant, timeline) if with_load == 'with_load': # create load table @@ -165,12 +156,10 @@ def test_tenant_relocation(zenith_env_builder: 
ZenithEnvBuilder, # run checkpoint manually to be sure that data landed in remote storage with closing(env.pageserver.connect()) as psconn: with psconn.cursor() as pscur: - pscur.execute(f"do_gc {tenant.hex} {timeline}") + pscur.execute(f"checkpoint {tenant.hex} {timeline.hex}") - # ensure upload is completed - pageserver_http_client = env.pageserver.http_client() - timeline_detail = pageserver_http_client.timeline_detail(tenant, UUID(timeline)) - assert timeline_detail['disk_consistent_lsn'] == timeline_detail['timeline_state']['Ready'] + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(pageserver_http, tenant, timeline, current_lsn) log.info("inititalizing new pageserver") # bootstrap second pageserver @@ -182,8 +171,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port) pageserver_bin = pathlib.Path(zenith_binpath) / 'pageserver' - new_pageserver_http_client = ZenithPageserverHttpClient(port=new_pageserver_http_port, - auth_token=None) + new_pageserver_http = ZenithPageserverHttpClient(port=new_pageserver_http_port, auth_token=None) with new_pageserver_helper(new_pageserver_dir, pageserver_bin, @@ -192,25 +180,18 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, new_pageserver_http_port): # call to attach timeline to new pageserver - new_pageserver_http_client.timeline_attach(tenant, UUID(timeline)) - # FIXME cannot handle duplicate download requests, subject to fix in https://github.com/zenithdb/zenith/issues/997 - time.sleep(5) - # new pageserver should in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint + new_pageserver_http.timeline_attach(tenant, timeline) + # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint new_timeline_detail = wait_for( number_of_iterations=5, interval=1, - func=lambda: assert_local(new_pageserver_http_client, tenant, timeline)) - assert new_timeline_detail['timeline_state'].get('Ready'), new_timeline_detail + func=lambda: assert_local(new_pageserver_http, tenant, timeline)) + # when load is active these checks can break because lsns are not static # so lets check with some margin - if with_load == 'without_load': - # TODO revisit this once https://github.com/zenithdb/zenith/issues/1049 is fixed - assert_abs_margin_ratio(new_timeline_detail['disk_consistent_lsn'], - timeline_detail['disk_consistent_lsn'], - 0.01) - assert_abs_margin_ratio(new_timeline_detail['timeline_state']['Ready'], - timeline_detail['timeline_state']['Ready'], - 0.01) + assert_abs_margin_ratio(new_timeline_detail['local']['disk_consistent_lsn'], + timeline_detail['local']['disk_consistent_lsn'], + 0.03) # callmemaybe to start replication from safekeeper to the new pageserver # when there is no load there is a clean checkpoint and no wal delta @@ -219,7 +200,9 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, with pg_cur(PgProtocol(host='localhost', port=new_pageserver_pg_port)) as cur: # "callmemaybe {} {} host={} port={} options='-c ztimelineid={} ztenantid={}'" safekeeper_connstring = f"host=localhost port={env.safekeepers[0].port.pg} options='-c ztimelineid={timeline} ztenantid={tenant} pageserver_connstr=postgresql://no_user:@localhost:{new_pageserver_pg_port}'" - cur.execute("callmemaybe {} {} {}".format(tenant, timeline, safekeeper_connstring)) + 
cur.execute("callmemaybe {} {} {}".format(tenant.hex, + timeline.hex, + safekeeper_connstring)) tenant_pg.stop() @@ -239,7 +222,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, # detach tenant from old pageserver before we check # that all the data is there to be sure that old pageserver # is no longer involved, and if it is, we will see the errors - pageserver_http_client.timeline_detach(tenant, UUID(timeline)) + pageserver_http.timeline_detach(tenant, timeline) with pg_cur(tenant_pg) as cur: # check that data is still there diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index ec570a7dac..c44a6e431f 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -783,6 +783,15 @@ class ZenithPageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json + def timeline_detail_v2(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: + res = self.get( + f"http://localhost:{self.port}/v2/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1" + ) + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) @@ -866,6 +875,30 @@ class ZenithCli: return uuid.UUID(created_timeline_id) + def create_root_branch(self, branch_name: str, tenant_id: Optional[uuid.UUID] = None): + cmd = [ + 'timeline', + 'create', + '--branch-name', + branch_name, + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + ] + + res = self.raw_cli(cmd) + res.check_returncode() + + matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) + + created_timeline_id = None + if matches is not None: + created_timeline_id = matches.group('timeline_id') + + if created_timeline_id is None: + raise Exception('could not find timeline id after `zenith timeline create` invocation') + else: + return uuid.UUID(created_timeline_id) + def create_branch(self, new_branch_name: str = DEFAULT_BRANCH_NAME, ancestor_branch_name: Optional[str] = None, @@ -1839,3 +1872,59 @@ def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Pos subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, error) == ([], []) + + +def wait_for(number_of_iterations: int, interval: int, func): + last_exception = None + for i in range(number_of_iterations): + try: + res = func() + except Exception as e: + log.info("waiting for %s iteration %s failed", func, i + 1) + last_exception = e + time.sleep(interval) + continue + return res + raise Exception("timed out while waiting for %s" % func) from last_exception + + +def assert_local(pageserver_http_client: ZenithPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID): + timeline_detail = pageserver_http_client.timeline_detail_v2(tenant, timeline) + assert timeline_detail.get('local', {}).get("disk_consistent_lsn"), timeline_detail + return timeline_detail + + +def remote_consistent_lsn(pageserver_http_client: ZenithPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID) -> int: + detail = pageserver_http_client.timeline_detail_v2(tenant, timeline) + assert isinstance(detail['remote']['remote_consistent_lsn'], int) + return detail['remote']['remote_consistent_lsn'] + + +def wait_for_upload(pageserver_http_client: ZenithPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID, + lsn: int): + """waits for local timeline 
upload up to specified lsn""" + + wait_for(10, 1, lambda: remote_consistent_lsn(pageserver_http_client, tenant, timeline) >= lsn) + + +def last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID) -> int: + detail = pageserver_http_client.timeline_detail_v2(tenant, timeline) + assert isinstance(detail['local']['last_record_lsn'], int) + return detail['local']['last_record_lsn'] + + +def wait_for_last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID, + lsn: int): + """waits for pageserver to catch up to a certain lsn""" + + wait_for(10, 1, lambda: last_record_lsn(pageserver_http_client, tenant, timeline) >= lsn) diff --git a/zenith/src/main.rs b/zenith/src/main.rs index dd35427d5d..389c394103 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -299,42 +299,40 @@ fn print_timelines_tree( .iter() .map(|t| { ( - t.timeline_id(), + t.timeline_id, TimelineTreeEl { info: t.clone(), children: BTreeSet::new(), name: timeline_name_mappings - .remove(&ZTenantTimelineId::new(t.tenant_id(), t.timeline_id())), + .remove(&ZTenantTimelineId::new(t.tenant_id, t.timeline_id)), }, ) }) .collect::>(); // Memorize all direct children of each timeline. - for timeline in &timelines { - if let TimelineInfo::Local { - ancestor_timeline_id: Some(tid), - .. - } = timeline + for timeline in timelines.iter() { + if let Some(ancestor_timeline_id) = + timeline.local.as_ref().and_then(|l| l.ancestor_timeline_id) { timelines_hash - .get_mut(tid) + .get_mut(&ZTimelineId::from(ancestor_timeline_id)) .context("missing timeline info in the HashMap")? .children - .insert(timeline.timeline_id()); + .insert(timeline.timeline_id); } } for timeline in timelines_hash.values() { // Start with root local timelines (no ancestors) first. - if let TimelineInfo::Local { - ancestor_timeline_id, - .. - } = &timeline.info + if timeline + .info + .local + .as_ref() + .and_then(|l| l.ancestor_timeline_id) + .is_none() { - if ancestor_timeline_id.is_none() { - print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?; - } + print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?; } } @@ -350,20 +348,21 @@ fn print_timeline( timeline: &TimelineTreeEl, timelines: &HashMap, ) -> Result<()> { - let local_or_remote = match timeline.info { - TimelineInfo::Local { .. } => "(L)", - TimelineInfo::Remote { .. } => "(R)", + let local_remote = match (timeline.info.local.as_ref(), timeline.info.remote.as_ref()) { + (None, None) => unreachable!("in this case no info for a timeline is found"), + (None, Some(_)) => "(R)", + (Some(_), None) => "(L)", + (Some(_), Some(_)) => "(L+R)", }; // Draw main padding - print!("{} ", local_or_remote); + print!("{} ", local_remote); if nesting_level > 0 { - let lsn_string = match &timeline.info { - TimelineInfo::Local { ancestor_lsn, .. } => ancestor_lsn - .map(|lsn| lsn.to_string()) - .unwrap_or_else(|| "Unknown local Lsn".to_string()), - TimelineInfo::Remote { .. 
} => "unknown Lsn (remote)".to_string(), + let ancestor_lsn = match timeline.info.local.as_ref().and_then(|i| i.ancestor_lsn) { + Some(lsn) => lsn.to_string(), + None => "Unknown Lsn".to_string(), }; + let mut br_sym = "┣━"; // Draw each nesting padding with proper style @@ -383,14 +382,14 @@ fn print_timeline( br_sym = "┗━"; } - print!("{} @{}: ", br_sym, lsn_string); + print!("{} @{}: ", br_sym, ancestor_lsn); } // Finally print a timeline id and name with new line println!( "{} [{}]", timeline.name.as_deref().unwrap_or("_no_name_"), - timeline.info.timeline_id() + timeline.info.timeline_id ); let len = timeline.children.len(); @@ -430,7 +429,7 @@ fn get_timeline_infos( Ok(PageServerNode::from_env(env) .timeline_list(tenant_id)? .into_iter() - .map(|timeline_info| (timeline_info.timeline_id(), timeline_info)) + .map(|timeline_info| (timeline_info.timeline_id, timeline_info)) .collect()) } @@ -555,26 +554,17 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let timeline = pageserver .timeline_create(tenant_id, None, None, None)? .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; - let new_timeline_id = timeline.timeline_id(); + let new_timeline_id = timeline.timeline_id; - let last_record_lsn = match timeline { - TimelineInfo::Local { - last_record_lsn, .. - } => last_record_lsn, - TimelineInfo::Remote { .. } => { - bail!( - "Timeline {} was created as remote, not local", - new_timeline_id - ) - } - }; + let last_record_lsn = timeline + .local + .expect("no local timeline info") + .last_record_lsn; env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( "Created timeline '{}' at Lsn {} for tenant: {}", - timeline.timeline_id(), - last_record_lsn, - tenant_id, + timeline.timeline_id, last_record_lsn, tenant_id, ); } Some(("branch", branch_match)) => { @@ -602,26 +592,18 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let timeline = pageserver .timeline_create(tenant_id, None, start_lsn, Some(ancestor_timeline_id))? .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; - let new_timeline_id = timeline.timeline_id(); + let new_timeline_id = timeline.timeline_id; - let last_record_lsn = match timeline { - TimelineInfo::Local { - last_record_lsn, .. - } => last_record_lsn, - TimelineInfo::Remote { .. } => bail!( - "Timeline {} was created as remote, not local", - new_timeline_id - ), - }; + let last_record_lsn = timeline + .local + .expect("no local timeline info") + .last_record_lsn; env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( "Created timeline '{}' at Lsn {} for tenant: {}. Ancestor timeline: '{}'", - timeline.timeline_id(), - last_record_lsn, - tenant_id, - ancestor_branch_name, + timeline.timeline_id, last_record_lsn, tenant_id, ancestor_branch_name, ); } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), @@ -662,13 +644,8 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { // older point in time, or following but lagging behind the primary. let lsn_str = timeline_infos .get(&node.timeline_id) - .map(|bi| match bi { - TimelineInfo::Local { - last_record_lsn, .. - } => last_record_lsn.to_string(), - TimelineInfo::Remote { .. } => "? 
(remote)".to_string(), - }) - .unwrap_or_else(|| '?'.to_string()); + .and_then(|bi| bi.local.as_ref().map(|l| l.last_record_lsn.to_string())) + .unwrap_or_else(|| "?".to_string()); let branch_name = timeline_name_mappings .get(&ZTenantTimelineId::new(tenant_id, node.timeline_id)) diff --git a/zenith_utils/src/http/error.rs b/zenith_utils/src/http/error.rs index 3262c33a51..b23fa029d4 100644 --- a/zenith_utils/src/http/error.rs +++ b/zenith_utils/src/http/error.rs @@ -14,6 +14,9 @@ pub enum ApiError { #[error("Unauthorized: {0}")] Unauthorized(String), + #[error("NotFound: {0}")] + NotFound(String), + #[error(transparent)] InternalServerError(#[from] anyhow::Error), } @@ -36,6 +39,9 @@ impl ApiError { self.to_string(), StatusCode::UNAUTHORIZED, ), + ApiError::NotFound(_) => { + HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::NOT_FOUND) + } ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, From b19870cd88ed125101f928ddf533f393a7236f2f Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 17 Mar 2022 21:36:17 +0400 Subject: [PATCH 0041/1022] guard against partial uploads to local storage --- pageserver/src/remote_storage/local_fs.rs | 37 ++++++++++++++++++----- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/pageserver/src/remote_storage/local_fs.rs b/pageserver/src/remote_storage/local_fs.rs index 01f6028d17..6cce127a7c 100644 --- a/pageserver/src/remote_storage/local_fs.rs +++ b/pageserver/src/remote_storage/local_fs.rs @@ -5,6 +5,7 @@ //! volume is mounted to the local FS. use std::{ + ffi::OsString, future::Future, path::{Path, PathBuf}, pin::Pin, @@ -83,11 +84,21 @@ impl RemoteStorage for LocalFs { ) -> anyhow::Result<()> { let target_file_path = self.resolve_in_storage(to)?; create_target_directory(&target_file_path).await?; + // We need this dance with sort of durable rename (without fsyncs) + // to prevent partial uploads. This was really hit when pageserver shutdown + // cancelled the upload and partial file was left on the fs + let mut temp_extension = target_file_path + .extension() + .unwrap_or_default() + .to_os_string(); + + temp_extension.push(OsString::from(".temp")); + let temp_file_path = target_file_path.with_extension(temp_extension); let mut destination = io::BufWriter::new( fs::OpenOptions::new() .write(true) .create(true) - .open(&target_file_path) + .open(&temp_file_path) .await .with_context(|| { format!( @@ -101,16 +112,26 @@ impl RemoteStorage for LocalFs { .await .with_context(|| { format!( - "Failed to upload file to the local storage at '{}'", + "Failed to upload file (write temp) to the local storage at '{}'", + temp_file_path.display() + ) + })?; + + destination.flush().await.with_context(|| { + format!( + "Failed to upload (flush temp) file to the local storage at '{}'", + temp_file_path.display() + ) + })?; + + fs::rename(temp_file_path, &target_file_path) + .await + .with_context(|| { + format!( + "Failed to upload (rename) file to the local storage at '{}'", target_file_path.display() ) })?; - destination.flush().await.with_context(|| { - format!( - "Failed to upload file to the local storage at '{}'", - target_file_path.display() - ) - })?; Ok(()) } From 3b069f5aef3fbcfc370814f825767f17d6997f67 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 18 Mar 2022 21:27:48 +0200 Subject: [PATCH 0042/1022] Fix name of directory used in unit test. There's another test called 'timeline_load'. 
If the two tests run in parallel, they would conflict and fail. --- pageserver/src/repository.rs | 2 +- vendor/postgres | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index e335f42519..074bdf4d01 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1181,7 +1181,7 @@ mod tests { #[test] fn timeline_load_with_ancestor() -> Result<()> { - const TEST_NAME: &str = "timeline_load"; + const TEST_NAME: &str = "timeline_load_with_ancestor"; let harness = RepoHarness::create(TEST_NAME)?; // create two timelines { diff --git a/vendor/postgres b/vendor/postgres index 093aa160e5..5e9bc37322 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 093aa160e5df19814ff19b995d36dd5ee03c7f8b +Subproject commit 5e9bc3732266c072151df20d6772b47ca51e233f From 063f9ba81dfaa8f6c9b0b8797d41532715a40669 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 19 Mar 2022 02:38:29 +0200 Subject: [PATCH 0043/1022] Use serde_with to (de)serialize ZId and Lsn to hex --- Cargo.lock | 74 ++++++- control_plane/Cargo.toml | 1 + control_plane/src/local_env.rs | 26 +-- control_plane/src/storage.rs | 10 +- pageserver/Cargo.toml | 3 +- pageserver/src/http/models.rs | 84 ++++---- pageserver/src/http/routes.rs | 8 +- pageserver/src/tenant_mgr.rs | 4 +- pageserver/src/timelines.rs | 24 ++- .../batch_others/test_remote_storage.py | 3 +- .../batch_others/test_tenant_relocation.py | 7 +- test_runner/fixtures/zenith_fixtures.py | 14 +- zenith/src/main.rs | 8 +- zenith_utils/Cargo.toml | 1 + zenith_utils/src/auth.rs | 15 +- zenith_utils/src/zid.rs | 199 +----------------- 16 files changed, 192 insertions(+), 289 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 750ac0edc2..a9de71420b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -441,6 +441,7 @@ dependencies = [ "regex", "reqwest", "serde", + "serde_with", "tar", "thiserror", "toml", @@ -600,6 +601,41 @@ dependencies = [ "libc", ] +[[package]] +name = "darling" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0d720b8683f8dd83c65155f0530560cba68cd2bf395f6513a483caee57ff7f4" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a340f241d2ceed1deb47ae36c4144b2707ec7dd0b649f894cb39bb595986324" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.10.0", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72c41b3b7352feb3211a0d743dc5700a4e3b60f51bd2b368892d1e0f9a95f44b" +dependencies = [ + "darling_core", + "quote", + "syn", +] + [[package]] name = "digest" version = "0.9.0" @@ -1038,6 +1074,12 @@ dependencies = [ "tokio-rustls 0.23.2", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "0.2.3" @@ -1422,7 +1464,6 @@ dependencies = [ "daemonize", "fail", "futures", - "hex", "hex-literal", "humantime", "hyper", @@ -1440,6 +1481,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "serde_with", "signal-hook", "tar", "tempfile", @@ -2075,6 +2117,12 @@ dependencies = [ "rustls 0.19.1", ] +[[package]] +name = "rustversion" +version = "1.0.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f" + [[package]] name = "ryu" version = "1.0.9" @@ -2187,6 +2235,29 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_with" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec1e6ec4d8950e5b1e894eac0d360742f3b1407a6078a604a731c4b3f49cefbc" +dependencies = [ + "rustversion", + "serde", + "serde_with_macros", +] + +[[package]] +name = "serde_with_macros" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12e47be9471c72889ebafb5e14d5ff930d89ae7a67bbdb5f8abb564f845a927e" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "sha2" version = "0.9.9" @@ -3056,6 +3127,7 @@ dependencies = [ "rustls-split", "serde", "serde_json", + "serde_with", "signal-hook", "tempfile", "thiserror", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index eff6b3ef2d..b52c7ad5a9 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" tar = "0.4.33" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } serde = { version = "1.0", features = ["derive"] } +serde_with = "1.12.0" toml = "0.5" lazy_static = "1.4" regex = "1" diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2a1d51fe08..00ace431e6 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -5,6 +5,7 @@ use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use std::collections::HashMap; use std::env; use std::fs; @@ -12,9 +13,7 @@ use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use zenith_utils::auth::{encode_from_key_file, Claims, Scope}; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ - HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId, -}; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}; use crate::safekeeper::SafekeeperNode; @@ -25,6 +24,7 @@ use crate::safekeeper::SafekeeperNode; // to 'zenith init --config=' option. See control_plane/simple.conf for // an example. // +#[serde_as] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct LocalEnv { // Base directory for all the nodes (the pageserver, safekeepers and @@ -50,7 +50,8 @@ pub struct LocalEnv { // Default tenant ID to use with the 'zenith' command line utility, when // --tenantid is not explicitly specified. #[serde(default)] - pub default_tenant_id: Option, + #[serde_as(as = "Option")] + pub default_tenant_id: Option, // used to issue tokens during e.g pg start #[serde(default)] @@ -66,7 +67,8 @@ pub struct LocalEnv { // A `HashMap>` would be more appropriate here, // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". 
- branch_name_mappings: HashMap>, + #[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")] + branch_name_mappings: HashMap>, } #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] @@ -164,9 +166,6 @@ impl LocalEnv { .entry(branch_name.clone()) .or_default(); - let tenant_id = HexZTenantId::from(tenant_id); - let timeline_id = HexZTimelineId::from(timeline_id); - let existing_ids = existing_values .iter() .find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id); @@ -193,7 +192,6 @@ impl LocalEnv { branch_name: &str, tenant_id: ZTenantId, ) -> Option { - let tenant_id = HexZTenantId::from(tenant_id); self.branch_name_mappings .get(branch_name)? .iter() @@ -207,13 +205,7 @@ impl LocalEnv { .iter() .flat_map(|(name, tenant_timelines)| { tenant_timelines.iter().map(|&(tenant_id, timeline_id)| { - ( - ZTenantTimelineId::new( - ZTenantId::from(tenant_id), - ZTimelineId::from(timeline_id), - ), - name.clone(), - ) + (ZTenantTimelineId::new(tenant_id, timeline_id), name.clone()) }) }) .collect() @@ -259,7 +251,7 @@ impl LocalEnv { // If no initial tenant ID was given, generate it. if env.default_tenant_id.is_none() { - env.default_tenant_id = Some(HexZTenantId::from(ZTenantId::generate())); + env.default_tenant_id = Some(ZTenantId::generate()); } env.base_data_dir = base_path(); diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index ef43ba3c1e..835c93bf1d 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -18,7 +18,7 @@ use thiserror::Error; use zenith_utils::http::error::HttpErrorBody; use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{HexZTenantId, HexZTimelineId, ZTenantId, ZTimelineId}; +use zenith_utils::zid::{ZTenantId, ZTimelineId}; use crate::local_env::LocalEnv; use crate::{fill_rust_env_vars, read_pidfile}; @@ -337,9 +337,7 @@ impl PageServerNode { ) -> anyhow::Result> { let tenant_id_string = self .http_request(Method::POST, format!("{}/tenant", self.http_base_url)) - .json(&TenantCreateRequest { - new_tenant_id: new_tenant_id.map(HexZTenantId::from), - }) + .json(&TenantCreateRequest { new_tenant_id }) .send()? .error_from_body()? .json::>()?; @@ -382,9 +380,9 @@ impl PageServerNode { format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), ) .json(&TimelineCreateRequest { - new_timeline_id: new_timeline_id.map(HexZTimelineId::from), + new_timeline_id, ancestor_start_lsn, - ancestor_timeline_id: ancestor_timeline_id.map(HexZTimelineId::from), + ancestor_timeline_id, }) .send()? .error_from_body()? 
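The control_plane/src/storage.rs change above and the pageserver/zenith_utils changes that follow all rely on the same serde_with mechanism: a field whose type implements Display and FromStr is annotated with `#[serde_as(as = "DisplayFromStr")]` (or the `Option<DisplayFromStr>` form), so it travels as its human-readable hex string rather than as a 16-element byte array. Below is a minimal, self-contained sketch of that mechanism; `HexId` and `CreateRequest` are illustrative stand-ins, not code from this patch, and the sketch assumes the serde, serde_with 1.x, serde_json and hex crates.

    // A stand-in for ZTenantId/ZTimelineId: 16 bytes that print and parse
    // as a 32-character hex string.
    use std::{fmt, str::FromStr};

    use serde::{Deserialize, Serialize};
    use serde_with::{serde_as, DisplayFromStr};

    struct HexId([u8; 16]);

    impl fmt::Display for HexId {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            write!(f, "{}", hex::encode(self.0))
        }
    }

    impl FromStr for HexId {
        type Err = hex::FromHexError;
        fn from_str(s: &str) -> Result<Self, Self::Err> {
            let mut buf = [0u8; 16];
            hex::decode_to_slice(s, &mut buf)?;
            Ok(HexId(buf))
        }
    }

    // With `serde_as`, the field is serialized via Display and deserialized via
    // FromStr, so the JSON carries a 32-char hex string instead of [173, 80, ...].
    #[serde_as]
    #[derive(Serialize, Deserialize)]
    struct CreateRequest {
        #[serde(default)]
        #[serde_as(as = "Option<DisplayFromStr>")]
        new_tenant_id: Option<HexId>,
    }

    fn main() -> Result<(), serde_json::Error> {
        let req = CreateRequest {
            new_tenant_id: Some("ad50847381e248feaac9876cc71ae418".parse().unwrap()),
        };
        let json = serde_json::to_string(&req)?;
        assert_eq!(json, r#"{"new_tenant_id":"ad50847381e248feaac9876cc71ae418"}"#);

        let back: CreateRequest = serde_json::from_str(&json)?;
        assert_eq!(
            back.new_tenant_id.unwrap().to_string(),
            "ad50847381e248feaac9876cc71ae418"
        );
        Ok(())
    }

Compared with the HexZTenantId/HexZTimelineId wrappers that this patch removes from zenith_utils/src/zid.rs further down, this keeps a single id type and moves the choice of representation to each field that needs it, which is why the wrapper types and their mutual_from! conversions can be deleted.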
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index cfcb453732..efd2fa4a38 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -25,11 +25,12 @@ tokio-stream = "0.1.8" anyhow = { version = "1.0", features = ["backtrace"] } crc32c = "0.6.0" thiserror = "1.0" -hex = { version = "0.4.3", features = ["serde"] } tar = "0.4.33" humantime = "2.1.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1" +serde_with = "1.12.0" + toml_edit = { version = "0.13", features = ["easy"] } scopeguard = "1.1.0" async-trait = "0.1" diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 8827713f11..c28cd0def7 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -1,24 +1,39 @@ -use anyhow::Context; use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use zenith_utils::{ lsn::Lsn, - zid::{HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTimelineId}, + zid::{ZNodeId, ZTenantId, ZTimelineId}, }; use crate::timelines::{LocalTimelineInfo, TimelineInfo}; +#[serde_as] #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { - pub new_timeline_id: Option, - pub ancestor_timeline_id: Option, + #[serde(default)] + #[serde_as(as = "Option")] + pub new_timeline_id: Option, + #[serde(default)] + #[serde_as(as = "Option")] + pub ancestor_timeline_id: Option, + #[serde(default)] + #[serde_as(as = "Option")] pub ancestor_start_lsn: Option, } +#[serde_as] #[derive(Serialize, Deserialize)] pub struct TenantCreateRequest { - pub new_tenant_id: Option, + #[serde(default)] + #[serde_as(as = "Option")] + pub new_tenant_id: Option, } +#[serde_as] +#[derive(Serialize, Deserialize)] +#[serde(transparent)] +pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId); + #[derive(Clone)] pub enum TimelineInfoV1 { Local { @@ -39,18 +54,24 @@ pub enum TimelineInfoV1 { }, } +#[serde_as] #[derive(Serialize, Deserialize)] pub struct TimelineInfoResponseV1 { pub kind: String, - #[serde(with = "hex")] + #[serde_as(as = "DisplayFromStr")] timeline_id: ZTimelineId, - #[serde(with = "hex")] + #[serde_as(as = "DisplayFromStr")] tenant_id: ZTenantId, - disk_consistent_lsn: String, - last_record_lsn: Option, - prev_record_lsn: Option, - ancestor_timeline_id: Option, - ancestor_lsn: Option, + #[serde_as(as = "DisplayFromStr")] + disk_consistent_lsn: Lsn, + #[serde_as(as = "Option")] + last_record_lsn: Option, + #[serde_as(as = "Option")] + prev_record_lsn: Option, + #[serde_as(as = "Option")] + ancestor_timeline_id: Option, + #[serde_as(as = "Option")] + ancestor_lsn: Option, current_logical_size: Option, current_logical_size_non_incremental: Option, } @@ -72,11 +93,11 @@ impl From for TimelineInfoResponseV1 { kind: "Local".to_owned(), timeline_id, tenant_id, - disk_consistent_lsn: disk_consistent_lsn.to_string(), - last_record_lsn: Some(last_record_lsn.to_string()), - prev_record_lsn: prev_record_lsn.map(|lsn| lsn.to_string()), - ancestor_timeline_id: ancestor_timeline_id.map(HexZTimelineId::from), - ancestor_lsn: ancestor_lsn.map(|lsn| lsn.to_string()), + disk_consistent_lsn, + last_record_lsn: Some(last_record_lsn), + prev_record_lsn, + ancestor_timeline_id, + ancestor_lsn, current_logical_size, current_logical_size_non_incremental, }, @@ -88,7 +109,7 @@ impl From for TimelineInfoResponseV1 { kind: "Remote".to_owned(), timeline_id, tenant_id, - disk_consistent_lsn: disk_consistent_lsn.to_string(), + disk_consistent_lsn, last_record_lsn: None, prev_record_lsn: None, ancestor_timeline_id: None, 
@@ -104,37 +125,24 @@ impl TryFrom for TimelineInfoV1 { type Error = anyhow::Error; fn try_from(other: TimelineInfoResponseV1) -> anyhow::Result { - let parse_lsn_hex_string = |lsn_string: String| { - lsn_string - .parse::() - .with_context(|| format!("Failed to parse Lsn as hex string from '{}'", lsn_string)) - }; - - let disk_consistent_lsn = parse_lsn_hex_string(other.disk_consistent_lsn)?; Ok(match other.kind.as_str() { "Local" => TimelineInfoV1::Local { timeline_id: other.timeline_id, tenant_id: other.tenant_id, - last_record_lsn: other - .last_record_lsn - .ok_or(anyhow::anyhow!( - "Local timeline should have last_record_lsn" - )) - .and_then(parse_lsn_hex_string)?, - prev_record_lsn: other - .prev_record_lsn - .map(parse_lsn_hex_string) - .transpose()?, + last_record_lsn: other.last_record_lsn.ok_or(anyhow::anyhow!( + "Local timeline should have last_record_lsn" + ))?, + prev_record_lsn: other.prev_record_lsn, ancestor_timeline_id: other.ancestor_timeline_id.map(ZTimelineId::from), - ancestor_lsn: other.ancestor_lsn.map(parse_lsn_hex_string).transpose()?, - disk_consistent_lsn, + ancestor_lsn: other.ancestor_lsn, + disk_consistent_lsn: other.disk_consistent_lsn, current_logical_size: other.current_logical_size, current_logical_size_non_incremental: other.current_logical_size_non_incremental, }, "Remote" => TimelineInfoV1::Remote { timeline_id: other.timeline_id, tenant_id: other.tenant_id, - disk_consistent_lsn, + disk_consistent_lsn: other.disk_consistent_lsn, }, unknown => anyhow::bail!("Unknown timeline kind: {}", unknown), }) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2d913afe4e..a1249f463a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -17,11 +17,11 @@ use zenith_utils::http::{ request::parse_request_param, }; use zenith_utils::http::{RequestExt, RouterBuilder}; -use zenith_utils::zid::{HexZTenantId, ZTenantTimelineId, ZTimelineId}; +use zenith_utils::zid::{ZTenantTimelineId, ZTimelineId}; use super::models::{ - StatusResponse, TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponseV1, - TimelineInfoV1, + StatusResponse, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest, + TimelineInfoResponseV1, TimelineInfoV1, }; use crate::remote_storage::{schedule_timeline_download, RemoteTimelineIndex}; use crate::timelines::{ @@ -308,7 +308,7 @@ async fn tenant_create_handler(mut request: Request) -> Result json_response(StatusCode::CREATED, HexZTenantId::from(id))?, + Some(id) => json_response(StatusCode::CREATED, TenantCreateResponse(id))?, None => json_response(StatusCode::CONFLICT, ())?, }) } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 8584bdd424..4d6dfd7488 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -15,6 +15,7 @@ use anyhow::{Context, Result}; use lazy_static::lazy_static; use log::*; use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::fmt; @@ -267,9 +268,10 @@ pub fn get_timeline_for_tenant_load( .with_context(|| format!("Timeline {} not found for tenant {}", timelineid, tenantid)) } +#[serde_as] #[derive(Serialize, Deserialize, Clone)] pub struct TenantInfo { - #[serde(with = "hex")] + #[serde_as(as = "DisplayFromStr")] pub id: ZTenantId, pub state: TenantState, } diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 9cfc21b413..00dd0f8f9c 100644 --- a/pageserver/src/timelines.rs +++ 
b/pageserver/src/timelines.rs @@ -5,6 +5,7 @@ use anyhow::{bail, Context, Result}; use postgres_ffi::ControlFileData; use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use std::{ fs, path::Path, @@ -13,9 +14,9 @@ use std::{ }; use tracing::*; +use zenith_utils::lsn::Lsn; use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; use zenith_utils::{crashsafe_dir, logging}; -use zenith_utils::{lsn::Lsn, zid::HexZTimelineId}; use crate::{ config::PageServerConf, @@ -28,12 +29,18 @@ use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager}; use crate::{repository::RepositoryTimeline, tenant_mgr}; use crate::{repository::Timeline, CheckpointConfig}; +#[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] pub struct LocalTimelineInfo { - pub ancestor_timeline_id: Option, + #[serde_as(as = "Option")] + pub ancestor_timeline_id: Option, + #[serde_as(as = "Option")] pub ancestor_lsn: Option, + #[serde_as(as = "DisplayFromStr")] pub last_record_lsn: Lsn, + #[serde_as(as = "Option")] pub prev_record_lsn: Option, + #[serde_as(as = "DisplayFromStr")] pub disk_consistent_lsn: Lsn, pub current_logical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, @@ -47,9 +54,7 @@ impl LocalTimelineInfo { ) -> anyhow::Result { let last_record_lsn = timeline.get_last_record_lsn(); let info = LocalTimelineInfo { - ancestor_timeline_id: timeline - .get_ancestor_timeline_id() - .map(HexZTimelineId::from), + ancestor_timeline_id: timeline.get_ancestor_timeline_id(), ancestor_lsn: { match timeline.get_ancestor_lsn() { Lsn(0) => None, @@ -72,7 +77,7 @@ impl LocalTimelineInfo { pub fn from_unloaded_timeline(metadata: &TimelineMetadata) -> Self { LocalTimelineInfo { - ancestor_timeline_id: metadata.ancestor_timeline().map(HexZTimelineId::from), + ancestor_timeline_id: metadata.ancestor_timeline(), ancestor_lsn: { match metadata.ancestor_lsn() { Lsn(0) => None, @@ -103,17 +108,20 @@ impl LocalTimelineInfo { } } +#[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] pub struct RemoteTimelineInfo { + #[serde_as(as = "Option")] pub remote_consistent_lsn: Option, pub awaits_download: bool, } +#[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { - #[serde(with = "hex")] + #[serde_as(as = "DisplayFromStr")] pub tenant_id: ZTenantId, - #[serde(with = "hex")] + #[serde_as(as = "DisplayFromStr")] pub timeline_id: ZTimelineId, pub local: Option, pub remote: Option, diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 8689838089..07a122ede9 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -7,6 +7,7 @@ from pathlib import Path from uuid import UUID from fixtures.zenith_fixtures import ZenithEnvBuilder, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload from fixtures.log_helper import log +from fixtures.utils import lsn_from_hex import pytest @@ -56,7 +57,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, INSERT INTO t1 VALUES ({data_id}, '{data_secret}'); ''') cur.execute("SELECT pg_current_wal_flush_lsn()") - current_lsn = int(cur.fetchone()[0].split('/')[1], base=16) + current_lsn = lsn_from_hex(cur.fetchone()[0]) # wait until pageserver receives that data wait_for_last_record_lsn(client, UUID(tenant_id), UUID(timeline_id), current_lsn) diff --git a/test_runner/batch_others/test_tenant_relocation.py 
b/test_runner/batch_others/test_tenant_relocation.py index e4492e5393..12ce3eb760 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -11,6 +11,7 @@ import signal import pytest from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, ZenithPageserverHttpClient, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload, zenith_binpath, pg_distrib_dir +from fixtures.utils import lsn_from_hex def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @@ -134,7 +135,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, assert cur.fetchone() == (500500, ) cur.execute("SELECT pg_current_wal_flush_lsn()") - current_lsn = int(cur.fetchone()[0].split('/')[1], base=16) + current_lsn = lsn_from_hex(cur.fetchone()[0]) pageserver_http = env.pageserver.http_client() @@ -189,8 +190,8 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, # when load is active these checks can break because lsns are not static # so lets check with some margin - assert_abs_margin_ratio(new_timeline_detail['local']['disk_consistent_lsn'], - timeline_detail['local']['disk_consistent_lsn'], + assert_abs_margin_ratio(lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']), + lsn_from_hex(timeline_detail['local']['disk_consistent_lsn']), 0.03) # callmemaybe to start replication from safekeeper to the new pageserver diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index c44a6e431f..fa68c4f476 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -33,7 +33,7 @@ from typing_extensions import Literal import requests import backoff # type: ignore -from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture) +from .utils import (get_self_dir, lsn_from_hex, mkdir_if_needed, subprocess_capture) from fixtures.log_helper import log """ This file contains pytest fixtures. A fixture is a test resource that can be @@ -1900,8 +1900,10 @@ def remote_consistent_lsn(pageserver_http_client: ZenithPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID) -> int: detail = pageserver_http_client.timeline_detail_v2(tenant, timeline) - assert isinstance(detail['remote']['remote_consistent_lsn'], int) - return detail['remote']['remote_consistent_lsn'] + + lsn_str = detail['remote']['remote_consistent_lsn'] + assert isinstance(lsn_str, str) + return lsn_from_hex(lsn_str) def wait_for_upload(pageserver_http_client: ZenithPageserverHttpClient, @@ -1917,8 +1919,10 @@ def last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID) -> int: detail = pageserver_http_client.timeline_detail_v2(tenant, timeline) - assert isinstance(detail['local']['last_record_lsn'], int) - return detail['local']['last_record_lsn'] + + lsn_str = detail['local']['last_record_lsn'] + assert isinstance(lsn_str, str) + return lsn_from_hex(lsn_str) def wait_for_last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, diff --git a/zenith/src/main.rs b/zenith/src/main.rs index 389c394103..f5d4184e63 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -316,7 +316,7 @@ fn print_timelines_tree( timeline.local.as_ref().and_then(|l| l.ancestor_timeline_id) { timelines_hash - .get_mut(&ZTimelineId::from(ancestor_timeline_id)) + .get_mut(&ancestor_timeline_id) .context("missing timeline info in the HashMap")? 
.children .insert(timeline.timeline_id); @@ -437,8 +437,8 @@ fn get_timeline_infos( fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result { if let Some(tenant_id_from_arguments) = parse_tenant_id(sub_match).transpose() { tenant_id_from_arguments - } else if let Some(tenantid_conf) = env.default_tenant_id { - Ok(ZTenantId::from(tenantid_conf)) + } else if let Some(default_id) = env.default_tenant_id { + Ok(default_id) } else { bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file"); } @@ -479,7 +479,7 @@ fn handle_init(init_match: &ArgMatches) -> Result { .context("Failed to initialize zenith repository")?; // default_tenantid was generated by the `env.init()` call above - let initial_tenant_id = ZTenantId::from(env.default_tenant_id.unwrap()); + let initial_tenant_id = env.default_tenant_id.unwrap(); // Call 'pageserver init'. let pageserver = PageServerNode::from_env(&env); diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml index daaf345f8f..8e7f5f233c 100644 --- a/zenith_utils/Cargo.toml +++ b/zenith_utils/Cargo.toml @@ -27,6 +27,7 @@ hex = { version = "0.4.3", features = ["serde"] } rustls = "0.19.1" rustls-split = "0.2.1" git-version = "0.3.5" +serde_with = "1.12.0" zenith_metrics = { path = "../zenith_metrics" } workspace_hack = { path = "../workspace_hack" } diff --git a/zenith_utils/src/auth.rs b/zenith_utils/src/auth.rs index cbc4fcee61..8271121c63 100644 --- a/zenith_utils/src/auth.rs +++ b/zenith_utils/src/auth.rs @@ -14,8 +14,9 @@ use jsonwebtoken::{ decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, }; use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; -use crate::zid::{HexZTenantId, ZTenantId}; +use crate::zid::ZTenantId; const JWT_ALGORITHM: Algorithm = Algorithm::RS256; @@ -26,18 +27,18 @@ pub enum Scope { PageServerApi, } +#[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] pub struct Claims { - pub tenant_id: Option, + #[serde(default)] + #[serde_as(as = "Option")] + pub tenant_id: Option, pub scope: Scope, } impl Claims { pub fn new(tenant_id: Option, scope: Scope) -> Self { - Self { - tenant_id: tenant_id.map(HexZTenantId::from), - scope, - } + Self { tenant_id, scope } } } @@ -47,7 +48,7 @@ pub fn check_permission(claims: &Claims, tenantid: Option) -> Result< bail!("Attempt to access management api with tenant scope. Permission denied") } (Scope::Tenant, Some(tenantid)) => { - if ZTenantId::from(claims.tenant_id.unwrap()) != tenantid { + if claims.tenant_id.unwrap() != tenantid { bail!("Tenant id mismatch. Permission denied") } Ok(()) diff --git a/zenith_utils/src/zid.rs b/zenith_utils/src/zid.rs index e047e38da7..fce5ed97c1 100644 --- a/zenith_utils/src/zid.rs +++ b/zenith_utils/src/zid.rs @@ -2,100 +2,19 @@ use std::{fmt, str::FromStr}; use hex::FromHex; use rand::Rng; -use serde::{ - de::{self, Visitor}, - Deserialize, Serialize, -}; - -macro_rules! mutual_from { - ($id1:ident, $id2:ident) => { - impl From<$id1> for $id2 { - fn from(id1: $id1) -> Self { - Self(id1.0.into()) - } - } - - impl From<$id2> for $id1 { - fn from(id2: $id2) -> Self { - Self(id2.0.into()) - } - } - }; -} +use serde::{Deserialize, Serialize}; /// Zenith ID is a 128-bit random ID. /// Used to represent various identifiers. Provides handy utility methods and impls. /// /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. 
-/// Use [`HexZId`] to serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. +/// +/// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. +/// Check the `serde_with::serde_as` documentation for options for more complex types. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] struct ZId([u8; 16]); -/// [`ZId`] version that serializes and deserializes as a hex string. -/// Useful for various json serializations, where hex byte array from original id is not convenient. -/// -/// Plain `ZId` could be (de)serialized into hex string with `#[serde(with = "hex")]` attribute. -/// This however won't work on nested types like `Option` or `Vec`, see https://github.com/serde-rs/serde/issues/723 for the details. -/// Every separate type currently needs a new (de)serializing method for every type separately. -/// -/// To provide a generic way to serialize the ZId as a hex string where `#[serde(with = "hex")]` is not enough, this wrapper is created. -/// The default wrapper serialization is left unchanged due to -/// * byte array (de)serialization being faster and simpler -/// * byte deserialization being used in Safekeeper already, with those bytes coming from compute (see `ProposerGreeting` in safekeeper) -/// * current `HexZId`'s deserialization impl breaks on compute byte array deserialization, having it by default is dangerous -#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] -struct HexZId([u8; 16]); - -impl Serialize for HexZId { - fn serialize(&self, ser: S) -> Result - where - S: serde::Serializer, - { - hex::encode(self.0).serialize(ser) - } -} - -impl<'de> Deserialize<'de> for HexZId { - fn deserialize(de: D) -> Result - where - D: serde::Deserializer<'de>, - { - de.deserialize_bytes(HexVisitor) - } -} - -struct HexVisitor; - -impl<'de> Visitor<'de> for HexVisitor { - type Value = HexZId; - - fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "A hexadecimal representation of a 128-bit random Zenith ID" - ) - } - - fn visit_bytes(self, hex_bytes: &[u8]) -> Result - where - E: de::Error, - { - ZId::from_hex(hex_bytes) - .map(HexZId::from) - .map_err(de::Error::custom) - } - - fn visit_str(self, hex_bytes_str: &str) -> Result - where - E: de::Error, - { - Self::visit_bytes(self, hex_bytes_str.as_bytes()) - } -} - -mutual_from!(ZId, HexZId); - impl ZId { pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZId { let mut arr = [0u8; 16]; @@ -256,76 +175,22 @@ macro_rules! zid_newtype { /// /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. -/// Use [`HexZTimelineId`] to serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. +/// See [`ZId`] for alternative ways to serialize it. #[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] pub struct ZTimelineId(ZId); -/// A [`ZTimelineId`] version that gets (de)serialized as a hex string. -/// Use in complex types, where `#[serde(with = "hex")]` does not work. -/// See [`HexZId`] for more details. 
-#[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] -pub struct HexZTimelineId(HexZId); - -impl std::fmt::Debug for HexZTimelineId { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - ZTimelineId::from(*self).fmt(f) - } -} - -impl std::fmt::Display for HexZTimelineId { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - ZTimelineId::from(*self).fmt(f) - } -} - -impl FromStr for HexZTimelineId { - type Err = ::Err; - - fn from_str(s: &str) -> Result { - Ok(HexZTimelineId::from(ZTimelineId::from_str(s)?)) - } -} - zid_newtype!(ZTimelineId); -mutual_from!(ZTimelineId, HexZTimelineId); /// Zenith Tenant Id represents identifiar of a particular tenant. /// Is used for distinguishing requests and data belonging to different users. /// /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. -/// Use [`HexZTenantId`] to serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. +/// See [`ZId`] for alternative ways to serialize it. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] pub struct ZTenantId(ZId); -/// A [`ZTenantId`] version that gets (de)serialized as a hex string. -/// Use in complex types, where `#[serde(with = "hex")]` does not work. -/// See [`HexZId`] for more details. -#[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] -pub struct HexZTenantId(HexZId); - -impl std::fmt::Debug for HexZTenantId { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - ZTenantId::from(*self).fmt(f) - } -} - -impl std::fmt::Display for HexZTenantId { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - ZTenantId::from(*self).fmt(f) - } -} - -impl FromStr for HexZTenantId { - type Err = ::Err; - - fn from_str(s: &str) -> Result { - Ok(HexZTenantId::from(ZTenantId::from_str(s)?)) - } -} - zid_newtype!(ZTenantId); -mutual_from!(ZTenantId, HexZTenantId); // A pair uniquely identifying Zenith instance. 
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)] @@ -368,55 +233,3 @@ impl fmt::Display for ZNodeId { write!(f, "{}", self.0) } } - -#[cfg(test)] -mod tests { - use std::fmt::Display; - - use super::*; - use hex::FromHexError; - use hex_literal::hex; - - #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] - struct TestStruct + Display> { - field: Option, - } - - #[test] - fn test_hex_serializations_tenant_id() { - let original_struct = TestStruct { - field: Some(HexZTenantId::from(ZTenantId::from_array(hex!( - "11223344556677881122334455667788" - )))), - }; - - let serialized_string = serde_json::to_string(&original_struct).unwrap(); - assert_eq!( - serialized_string, - r#"{"field":"11223344556677881122334455667788"}"# - ); - - let deserialized_struct: TestStruct = - serde_json::from_str(&serialized_string).unwrap(); - assert_eq!(original_struct, deserialized_struct); - } - - #[test] - fn test_hex_serializations_timeline_id() { - let original_struct = TestStruct { - field: Some(HexZTimelineId::from(ZTimelineId::from_array(hex!( - "AA223344556677881122334455667788" - )))), - }; - - let serialized_string = serde_json::to_string(&original_struct).unwrap(); - assert_eq!( - serialized_string, - r#"{"field":"aa223344556677881122334455667788"}"# - ); - - let deserialized_struct: TestStruct = - serde_json::from_str(&serialized_string).unwrap(); - assert_eq!(original_struct, deserialized_struct); - } -} From 37ebbb598d625341db904e276d5ff5185ad311b2 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 15 Mar 2022 10:46:27 +0200 Subject: [PATCH 0044/1022] Add a macOs build --- .github/workflows/testing.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 218783387b..27e2962712 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -13,7 +13,7 @@ jobs: # If we want to duplicate this job for different # Rust toolchains (e.g. nightly or 1.37.0), add them here. 
rust_toolchain: [stable] - os: [ubuntu-latest] + os: [ubuntu-latest, macos-latest] timeout-minutes: 30 name: run regression test suite runs-on: ${{ matrix.os }} @@ -32,11 +32,17 @@ jobs: toolchain: ${{ matrix.rust_toolchain }} override: true - - name: Install postgres dependencies + - name: Install Ubuntu postgres dependencies + if: matrix.os == 'ubuntu-latest' run: | sudo apt update sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev + - name: Install macOs postgres dependencies + if: matrix.os == 'macos-latest' + run: | + brew install flex bison + - name: Set pg revision for caching id: pg_ver run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres) From 77ed2a0fa039fcb20e2617a597b4db39ee20155a Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 17 Mar 2022 10:06:42 +0200 Subject: [PATCH 0045/1022] Run GitHub testing workflow on every push --- .github/workflows/testing.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 27e2962712..83e46ce6be 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -1,10 +1,6 @@ name: Build and Test -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] +on: push jobs: regression-check: From bd6bef468c2a619ac8c39c04355c517334847b24 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sun, 20 Mar 2022 21:13:23 +0200 Subject: [PATCH 0046/1022] Provide single list timelines HTTP API handle --- pageserver/src/http/models.rs | 150 ------------------ pageserver/src/http/openapi_spec.yml | 33 +++- pageserver/src/http/routes.rs | 27 +--- .../batch_others/test_pageserver_api.py | 6 +- .../batch_others/test_tenant_relocation.py | 2 +- .../batch_others/test_timeline_size.py | 24 +-- test_runner/batch_others/test_wal_acceptor.py | 46 +++--- test_runner/fixtures/zenith_fixtures.py | 15 +- 8 files changed, 83 insertions(+), 220 deletions(-) diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index c28cd0def7..d1dfb911ba 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -5,8 +5,6 @@ use zenith_utils::{ zid::{ZNodeId, ZTenantId, ZTimelineId}, }; -use crate::timelines::{LocalTimelineInfo, TimelineInfo}; - #[serde_as] #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { @@ -34,154 +32,6 @@ pub struct TenantCreateRequest { #[serde(transparent)] pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId); -#[derive(Clone)] -pub enum TimelineInfoV1 { - Local { - timeline_id: ZTimelineId, - tenant_id: ZTenantId, - last_record_lsn: Lsn, - prev_record_lsn: Option, - ancestor_timeline_id: Option, - ancestor_lsn: Option, - disk_consistent_lsn: Lsn, - current_logical_size: Option, - current_logical_size_non_incremental: Option, - }, - Remote { - timeline_id: ZTimelineId, - tenant_id: ZTenantId, - disk_consistent_lsn: Lsn, - }, -} - -#[serde_as] -#[derive(Serialize, Deserialize)] -pub struct TimelineInfoResponseV1 { - pub kind: String, - #[serde_as(as = "DisplayFromStr")] - timeline_id: ZTimelineId, - #[serde_as(as = "DisplayFromStr")] - tenant_id: ZTenantId, - #[serde_as(as = "DisplayFromStr")] - disk_consistent_lsn: Lsn, - #[serde_as(as = "Option")] - last_record_lsn: Option, - #[serde_as(as = "Option")] - prev_record_lsn: Option, - #[serde_as(as = "Option")] - ancestor_timeline_id: Option, - #[serde_as(as = "Option")] - ancestor_lsn: Option, - current_logical_size: Option, - 
current_logical_size_non_incremental: Option, -} - -impl From for TimelineInfoResponseV1 { - fn from(other: TimelineInfoV1) -> Self { - match other { - TimelineInfoV1::Local { - timeline_id, - tenant_id, - last_record_lsn, - prev_record_lsn, - ancestor_timeline_id, - ancestor_lsn, - disk_consistent_lsn, - current_logical_size, - current_logical_size_non_incremental, - } => TimelineInfoResponseV1 { - kind: "Local".to_owned(), - timeline_id, - tenant_id, - disk_consistent_lsn, - last_record_lsn: Some(last_record_lsn), - prev_record_lsn, - ancestor_timeline_id, - ancestor_lsn, - current_logical_size, - current_logical_size_non_incremental, - }, - TimelineInfoV1::Remote { - timeline_id, - tenant_id, - disk_consistent_lsn, - } => TimelineInfoResponseV1 { - kind: "Remote".to_owned(), - timeline_id, - tenant_id, - disk_consistent_lsn, - last_record_lsn: None, - prev_record_lsn: None, - ancestor_timeline_id: None, - ancestor_lsn: None, - current_logical_size: None, - current_logical_size_non_incremental: None, - }, - } - } -} - -impl TryFrom for TimelineInfoV1 { - type Error = anyhow::Error; - - fn try_from(other: TimelineInfoResponseV1) -> anyhow::Result { - Ok(match other.kind.as_str() { - "Local" => TimelineInfoV1::Local { - timeline_id: other.timeline_id, - tenant_id: other.tenant_id, - last_record_lsn: other.last_record_lsn.ok_or(anyhow::anyhow!( - "Local timeline should have last_record_lsn" - ))?, - prev_record_lsn: other.prev_record_lsn, - ancestor_timeline_id: other.ancestor_timeline_id.map(ZTimelineId::from), - ancestor_lsn: other.ancestor_lsn, - disk_consistent_lsn: other.disk_consistent_lsn, - current_logical_size: other.current_logical_size, - current_logical_size_non_incremental: other.current_logical_size_non_incremental, - }, - "Remote" => TimelineInfoV1::Remote { - timeline_id: other.timeline_id, - tenant_id: other.tenant_id, - disk_consistent_lsn: other.disk_consistent_lsn, - }, - unknown => anyhow::bail!("Unknown timeline kind: {}", unknown), - }) - } -} - -fn from_local( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - local: &LocalTimelineInfo, -) -> TimelineInfoV1 { - TimelineInfoV1::Local { - timeline_id, - tenant_id, - last_record_lsn: local.last_record_lsn, - prev_record_lsn: local.prev_record_lsn, - ancestor_timeline_id: local.ancestor_timeline_id.map(ZTimelineId::from), - ancestor_lsn: local.ancestor_lsn, - disk_consistent_lsn: local.disk_consistent_lsn, - current_logical_size: local.current_logical_size, - current_logical_size_non_incremental: local.current_logical_size_non_incremental, - } -} - -impl From for TimelineInfoV1 { - fn from(t: TimelineInfo) -> Self { - match (t.local.as_ref(), t.remote.as_ref()) { - (None, None) => unreachable!(), - (None, Some(remote)) => TimelineInfoV1::Remote { - timeline_id: t.timeline_id, - tenant_id: t.tenant_id, - disk_consistent_lsn: remote.remote_consistent_lsn.unwrap_or(Lsn(0)), - }, - (Some(local), None) => from_local(t.tenant_id, t.timeline_id, local), - (Some(local), Some(_)) => from_local(t.tenant_id, t.timeline_id, local), - } - } -} - #[derive(Serialize)] pub struct StatusResponse { pub id: ZNodeId, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index d322b051a6..a9101d4bd6 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -148,6 +148,7 @@ paths: format: hex ancestor_start_lsn: type: string + format: hex responses: "201": description: TimelineInfo @@ -289,7 +290,6 @@ components: required: - timeline_id - tenant_id - - 
disk_consistent_lsn properties: timeline_id: type: string @@ -297,17 +297,44 @@ components: tenant_id: type: string format: hex + local: + $ref: "#/components/schemas/LocalTimelineInfo" + remote: + $ref: "#/components/schemas/RemoteTimelineInfo" + RemoteTimelineInfo: + type: object + required: + - awaits_download + properties: + awaits_download: + type: boolean + remote_consistent_lsn: + type: string + format: hex + LocalTimelineInfo: + type: object + required: + - last_record_lsn + - disk_consistent_lsn + - timeline_state + properties: last_record_lsn: type: string - prev_record_lsn: + format: hex + disk_consistent_lsn: + type: string + format: hex + timeline_state: type: string ancestor_timeline_id: type: string format: hex ancestor_lsn: type: string - disk_consistent_lsn: + format: hex + prev_record_lsn: type: string + format: hex current_logical_size: type: integer current_logical_size_non_incremental: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a1249f463a..3ca8b6334a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -21,7 +21,6 @@ use zenith_utils::zid::{ZTenantTimelineId, ZTimelineId}; use super::models::{ StatusResponse, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest, - TimelineInfoResponseV1, TimelineInfoV1, }; use crate::remote_storage::{schedule_timeline_download, RemoteTimelineIndex}; use crate::timelines::{ @@ -143,8 +142,7 @@ fn get_include_non_incremental_logical_size(request: &Request) -> bool { .unwrap_or(false) } -// common part for v1 and v2 handlers -async fn timeline_detail_common(request: Request) -> Result { +async fn timeline_detail_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -192,25 +190,12 @@ async fn timeline_detail_common(request: Request) -> Result) -> Result, ApiError> { - let timeline_info = timeline_detail_common(request).await?; - Ok(json_response( - StatusCode::OK, - TimelineInfoResponseV1::from(TimelineInfoV1::from(timeline_info)), - )?) -} - -async fn timeline_detail_handler_v2(request: Request) -> Result, ApiError> { - let timeline_info = timeline_detail_common(request).await?; + }; Ok(json_response(StatusCode::OK, timeline_info)?) 
} @@ -347,11 +332,7 @@ pub fn make_router( .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id", - timeline_detail_handler_v1, - ) - .get( - "/v2/tenant/:tenant_id/timeline/:timeline_id", - timeline_detail_handler_v2, + timeline_detail_handler, ) .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/attach", diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 2aa3686904..965ba9bcc3 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -39,10 +39,14 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): timeline_id_str = str(timeline['timeline_id']) timeline_details = client.timeline_detail(tenant_id=tenant_id, timeline_id=UUID(timeline_id_str)) - assert timeline_details['kind'] == 'Local' + assert timeline_details['tenant_id'] == tenant_id.hex assert timeline_details['timeline_id'] == timeline_id_str + local_timeline_details = timeline_details.get('local') + assert local_timeline_details is not None + assert local_timeline_details['timeline_state'] == 'Loaded' + def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): env = zenith_simple_env diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 12ce3eb760..32fbc8f872 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -141,7 +141,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, # wait until pageserver receives that data wait_for_last_record_lsn(pageserver_http, tenant, timeline, current_lsn) - timeline_detail = pageserver_http.timeline_detail_v2(tenant, timeline) + timeline_detail = assert_local(pageserver_http, tenant, timeline) if with_load == 'with_load': # create load table diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 7d8ab551b0..0b341746ee 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -2,7 +2,7 @@ from contextlib import closing from uuid import UUID import psycopg2.extras import psycopg2.errors -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres +from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres, assert_local from fixtures.log_helper import log import time @@ -13,8 +13,9 @@ def test_timeline_size(zenith_simple_env: ZenithEnv): new_timeline_id = env.zenith_cli.create_branch('test_timeline_size', 'empty') client = env.pageserver.http_client() - res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) - assert res["current_logical_size"] == res["current_logical_size_non_incremental"] + timeline_details = assert_local(client, env.initial_tenant, new_timeline_id) + assert timeline_details['local']['current_logical_size'] == timeline_details['local'][ + 'current_logical_size_non_incremental'] pgmain = env.postgres.create_start("test_timeline_size") log.info("postgres is running on 'test_timeline_size' branch") @@ -31,12 +32,16 @@ def test_timeline_size(zenith_simple_env: ZenithEnv): FROM generate_series(1, 10) g """) - res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) - assert res["current_logical_size"] == res["current_logical_size_non_incremental"] + res = assert_local(client, env.initial_tenant, 
new_timeline_id) + local_details = res['local'] + assert local_details["current_logical_size"] == local_details[ + "current_logical_size_non_incremental"] cur.execute("TRUNCATE foo") - res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) - assert res["current_logical_size"] == res["current_logical_size_non_incremental"] + res = assert_local(client, env.initial_tenant, new_timeline_id) + local_details = res['local'] + assert local_details["current_logical_size"] == local_details[ + "current_logical_size_non_incremental"] # wait until received_lsn_lag is 0 @@ -71,8 +76,9 @@ def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): new_timeline_id = env.zenith_cli.create_branch('test_timeline_size_quota') client = env.pageserver.http_client() - res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) - assert res["current_logical_size"] == res["current_logical_size_non_incremental"] + res = assert_local(client, env.initial_tenant, new_timeline_id) + assert res['local']["current_logical_size"] == res['local'][ + "current_logical_size_non_incremental"] pgmain = env.postgres.create_start( "test_timeline_size_quota", diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index bdc4c4f63c..37ce1a8bca 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -89,29 +89,33 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): sk_metrics = [sk.http_client().get_metrics() for sk in env.safekeepers] timeline_metrics = [] - with env.pageserver.http_client() as pageserver_http: - for timeline_detail in timeline_details: - timeline_id: str = timeline_detail["timeline_id"] + for timeline_detail in timeline_details: + timeline_id: str = timeline_detail["timeline_id"] - m = TimelineMetrics( - timeline_id=timeline_id, - last_record_lsn=lsn_from_hex(timeline_detail["last_record_lsn"]), - ) - for sk_m in sk_metrics: - m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)]) - m.commit_lsns.append(sk_m.commit_lsn_inexact[(tenant_id.hex, timeline_id)]) + local_timeline_detail = timeline_detail.get('local') + if local_timeline_detail is None: + log.debug(f"Timeline {timeline_id} is not present locally, skipping") + continue - for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): - # Invariant. May be < when transaction is in progress. - assert commit_lsn <= flush_lsn - # We only call collect_metrics() after a transaction is confirmed by - # the compute node, which only happens after a consensus of safekeepers - # has confirmed the transaction. We assume majority consensus here. - assert (2 * sum(m.last_record_lsn <= lsn - for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers) - assert (2 * sum(m.last_record_lsn <= lsn - for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers) - timeline_metrics.append(m) + m = TimelineMetrics( + timeline_id=timeline_id, + last_record_lsn=lsn_from_hex(local_timeline_detail['last_record_lsn']), + ) + for sk_m in sk_metrics: + m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)]) + m.commit_lsns.append(sk_m.commit_lsn_inexact[(tenant_id.hex, timeline_id)]) + + for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): + # Invariant. May be < when transaction is in progress. 
+ assert commit_lsn <= flush_lsn + # We only call collect_metrics() after a transaction is confirmed by + # the compute node, which only happens after a consensus of safekeepers + # has confirmed the transaction. We assume majority consensus here. + assert (2 * sum(m.last_record_lsn <= lsn + for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers) + assert (2 * sum(m.last_record_lsn <= lsn + for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers) + timeline_metrics.append(m) log.info(f"{message}: {timeline_metrics}") return timeline_metrics diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index fa68c4f476..08ac09ee4c 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -783,15 +783,6 @@ class ZenithPageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def timeline_detail_v2(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: - res = self.get( - f"http://localhost:{self.port}/v2/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1" - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) @@ -1891,7 +1882,7 @@ def wait_for(number_of_iterations: int, interval: int, func): def assert_local(pageserver_http_client: ZenithPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID): - timeline_detail = pageserver_http_client.timeline_detail_v2(tenant, timeline) + timeline_detail = pageserver_http_client.timeline_detail(tenant, timeline) assert timeline_detail.get('local', {}).get("disk_consistent_lsn"), timeline_detail return timeline_detail @@ -1899,7 +1890,7 @@ def assert_local(pageserver_http_client: ZenithPageserverHttpClient, def remote_consistent_lsn(pageserver_http_client: ZenithPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID) -> int: - detail = pageserver_http_client.timeline_detail_v2(tenant, timeline) + detail = pageserver_http_client.timeline_detail(tenant, timeline) lsn_str = detail['remote']['remote_consistent_lsn'] assert isinstance(lsn_str, str) @@ -1918,7 +1909,7 @@ def wait_for_upload(pageserver_http_client: ZenithPageserverHttpClient, def last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID) -> int: - detail = pageserver_http_client.timeline_detail_v2(tenant, timeline) + detail = pageserver_http_client.timeline_detail(tenant, timeline) lsn_str = detail['local']['last_record_lsn'] assert isinstance(lsn_str, str) From e13bdd77fe97e0c081218639ca55668aac23aeaa Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 21 Mar 2022 14:42:24 +0400 Subject: [PATCH 0047/1022] add safekepeers gossip annd storage messaging rfcs they were in prs during rfc repo import in addition to just import I've added sequence diagrams to storage messaging rfc --- docs/rfcs/014-safekeepers-gossip.md | 69 +++++++ docs/rfcs/015-storage-messaging.md | 295 ++++++++++++++++++++++++++++ 2 files changed, 364 insertions(+) create mode 100644 docs/rfcs/014-safekeepers-gossip.md create mode 100644 docs/rfcs/015-storage-messaging.md diff --git a/docs/rfcs/014-safekeepers-gossip.md b/docs/rfcs/014-safekeepers-gossip.md new file mode 100644 index 0000000000..3d6cc04b94 --- /dev/null +++ b/docs/rfcs/014-safekeepers-gossip.md @@ -0,0 +1,69 @@ +# Safekeeper gossip + +Extracted from this 
[PR](https://github.com/zenithdb/rfcs/pull/13) + +## Motivation + +In some situations, safekeeper (SK) needs coordination with other SK's that serve the same tenant: + +1. WAL deletion. SK needs to know what WAL was already safely replicated to delete it. Now we keep WAL indefinitely. +2. Deciding on who is sending WAL to the pageserver. Now sending SK crash may lead to a livelock where nobody sends WAL to the pageserver. +3. To enable SK to SK direct recovery without involving the compute + +## Summary + +Compute node has connection strings to each safekeeper. During each compute->safekeeper connection establishment, the compute node should pass down all that connection strings to each safekeeper. With that info, safekeepers may establish Postgres connections to each other and periodically send ping messages with LSN payload. + +## Components + +safekeeper, compute, compute<->safekeeper protocol, possibly console (group SK addresses) + +## Proposed implementation + +Each safekeeper can periodically ping all its peers and share connectivity and liveness info. If the ping was not receiver for, let's say, four ping periods, we may consider sending safekeeper as dead. That would mean some of the alive safekeepers should connect to the pageserver. One way to decide which one exactly: `make_connection = my_node_id == min(alive_nodes)` + +Since safekeepers are multi-tenant, we may establish either per-tenant physical connections or per-safekeeper ones. So it makes sense to group "logical" connections between corresponding tenants on different nodes into a single physical connection. That means that we should implement an interconnect thread that maintains physical connections and periodically broadcasts info about all tenants. + +Right now console may assign any 3 SK addresses to a given compute node. That may lead to a high number of gossip connections between SK's. Instead, we can assign safekeeper triples to the compute node. But if we want to "break"/" change" group by an ad-hoc action, we can do it. + +### Corner cases + +- Current safekeeper may be alive but may not have connectivity to the pageserver + + To address that, we need to gossip visibility info. Based on that info, we may define SK as alive only when it can connect to the pageserver. + +- Current safekeeper may be alive but may not have connectivity with the compute node. + + We may broadcast last_received_lsn and presence of compute connection and decide who is alive based on that. + +- It is tricky to decide when to shut down gossip connections because we need to be sure that pageserver got all the committed (in the distributed sense, so local SK info is not enough) records, and it may never lose them. It is not a strict requirement since `--sync-safekeepers` that happen before the compute start will allow the pageserver to consume missing WAL, but it is better to do that in the background. So the condition may look like that: `majority_max(flush_lsn) == pageserver_s3_lsn` Here we rely on the two facts: + - that `--sync-safekeepers` happened after the compute shutdown, and it advanced local commit_lsn's allowing pageserver to consume that WAL. + + - we wait for the `pageserver_s3_lsn` advancement to avoid pageserver's last_received_lsn/disk_consistent_lsn going backward due to the disk/hardware failure and subsequent S3 recovery + + If those conditions are not met, we will have some gossip activity (but that may be okay). 
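+For illustration, here is a minimal sketch of the liveness bookkeeping and the `make_connection = my_node_id == min(alive_nodes)` rule from the proposed implementation above. This is not existing safekeeper code; the node-id type and the data structures are assumptions:
+
+```rust
+use std::collections::HashMap;
+use std::time::{Duration, Instant};
+
+/// Tracks when the last ping was received from each peer safekeeper.
+struct PeerLiveness {
+    last_ping: HashMap<u64, Instant>, // peer node id -> time of last ping
+    ping_period: Duration,
+}
+
+impl PeerLiveness {
+    /// A peer is considered dead once four ping periods pass without a ping.
+    fn alive_nodes(&self, now: Instant) -> Vec<u64> {
+        self.last_ping
+            .iter()
+            .filter(|(_, last)| now.saturating_duration_since(**last) < self.ping_period * 4)
+            .map(|(id, _)| *id)
+            .collect()
+    }
+
+    /// Stream WAL to the pageserver only if this node has the minimal id
+    /// among the alive nodes (itself included).
+    fn should_connect_to_pageserver(&self, my_node_id: u64, now: Instant) -> bool {
+        let mut alive = self.alive_nodes(now);
+        alive.push(my_node_id);
+        alive.into_iter().min() == Some(my_node_id)
+    }
+}
+```
+
+Since the pings carry an LSN payload, the same bookkeeping also provides the peer LSNs needed for WAL trimming.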
+ +## Pros/cons + +Pros: + +- distributed, does not introduce new services (like etcd), does not add console as a storage dependency +- lays the foundation for gossip-based recovery + +Cons: + +- Only compute knows a set of safekeepers, but they should communicate even without compute node. In case of safekeepers restart, we will lose that info and can't gossip anymore. Hence we can't trim some WAL tail until the compute node start. Also, it is ugly. + +- If the console assigns a random set of safekeepers to each Postgres, we may end up in a situation where each safekeeper needs to have a connection with all other safekeepers. We can group safekeepers into isolated triples in the console to avoid that. Then "mixing" would happen only if we do rebalancing. + +## Alternative implementation + +We can have a selected node (e.g., console) with everybody reporting to it. + +## Security implications + +We don't increase the attack surface here. Communication can happen in a private network that is not exposed to users. + +## Scalability implications + +The only thing that may grow as we grow the number of computes is the number of gossip connections. But if we group safekeepers and assign a compute node to the random SK triple, the number of connections would be constant. diff --git a/docs/rfcs/015-storage-messaging.md b/docs/rfcs/015-storage-messaging.md new file mode 100644 index 0000000000..47bc9eb89c --- /dev/null +++ b/docs/rfcs/015-storage-messaging.md @@ -0,0 +1,295 @@ +# Storage messaging + +Created on 19.01.22 + +Initially created [here](https://github.com/zenithdb/rfcs/pull/16) by @kelvich. + +That it is an alternative to (014-safekeeper-gossip)[] + +## Motivation + +As in 014-safekeeper-gossip we need to solve the following problems: + +* Trim WAL on safekeepers +* Decide on which SK should push WAL to the S3 +* Decide on which SK should forward WAL to the pageserver +* Decide on when to shut down SK<->pageserver connection + +This RFC suggests a more generic and hopefully more manageable way to address those problems. However, unlike 014-safekeeper-gossip, it does not bring us any closer to safekeeper-to-safekeeper recovery but rather unties two sets of different issues we previously wanted to solve with gossip. + +Also, with this approach, we would not need "call me maybe" anymore, and the pageserver will have all the data required to understand that it needs to reconnect to another safekeeper. + +## Summary + +Instead of p2p gossip, let's have a centralized broker where all the storage nodes report per-timeline state. Each storage node should have a `--broker-url=1.2.3.4` CLI param. + +Here I propose two ways to do that. After a lot of arguing with myself, I'm leaning towards the etcd approach. My arguments for it are in the pros/cons section. Both options require adding a Grpc client in our codebase either directly or as an etcd dependency. + +## Non-goals + +That RFC does *not* suggest moving the compute to pageserver and compute to safekeeper mappings out of the console. The console is still the only place in the cluster responsible for the persistency of that info. So I'm implying that each pageserver and safekeeper exactly knows what timelines he serves, as it currently is. We need some mechanism for a new pageserver to discover mapping info, but that is out of the scope of this RFC. 
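+Before looking at the affected components, here is an illustrative sketch of the "per-timeline state" mentioned in the summary, i.e. the record each safekeeper could report for a timeline. It is not existing code; the field names simply mirror the etcd layout proposed later in this RFC:
+
+```rust
+/// Hypothetical per-timeline record published by one safekeeper.
+struct TimelineStateReport {
+    tenant_id: String,
+    timeline_id: String,
+    sk_id: u64,
+    write_lsn: u64,
+    commit_lsn: u64,
+    compute_connected: bool,
+    last_updated: u64, // unix timestamp, seconds
+}
+
+impl TimelineStateReport {
+    /// Key prefix under which the individual fields would be published,
+    /// e.g. "compute_<tenant>_<timeline>/safekeepers/sk_<id>/write_lsn".
+    fn key_prefix(&self) -> String {
+        format!(
+            "compute_{}_{}/safekeepers/sk_{}",
+            self.tenant_id, self.timeline_id, self.sk_id
+        )
+    }
+}
+```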
+ +## Impacted components + +pageserver, safekeeper +adds either etcd or console as a storage dependency + +## Possible implementation: custom message broker in the console + +We've decided to go with an etcd approach instead of the message broker. + +### Original suggestion +
+We can add a Grpc service in the console that acts as a message broker since the console knows the addresses of all the components. The broker can ignore the payload and only redirect messages. So, for example, each safekeeper may send a message to the peering safekeepers or to the pageserver responsible for a given timeline. + +Message format could be `{sender, destination, payload}`. + +The destination is either: +1. `sk_#{tenant}_#{timeline}` -- to be broadcasted on all safekeepers, responsible for that timeline, or +2. `pserver_#{tenant}_#{timeline}` -- to be broadcasted on all pageservers, responsible for that timeline + +Sender is either: +1. `sk_#{sk_id}`, or +2. `pserver_#{pserver_id}` + +I can think of the following behavior to address our original problems: + +* WAL trimming + Each safekeeper periodically broadcasts `(write_lsn, commit_lsn)` to all peering (peering == responsible for that timeline) safekeepers + +* Decide on which SK should push WAL to the S3 + + Each safekeeper periodically broadcasts `i_am_alive_#{current_timestamp}` message to all peering safekeepers. That way, safekeepers may maintain the vector of alive peers (loose one, with false negatives). Alive safekeeper with the minimal id pushes data to S3. + +* Decide on which SK should forward WAL to the pageserver + + Each safekeeper periodically sends (write_lsn, commit_lsn, compute_connected) to the relevant pageservers. With that info, pageserver can maintain a view of the safekeepers state, connect to a random one, and detect the moments (e.g., one the safekeepers is not making progress or down) when it needs to reconnect to another safekeeper. Pageserver should resolve exact IP addresses through the console, e.g., exchange `#sk_#{sk_id}` to `4.5.6.7:6400`. + + Pageserver connection to the safekeeper triggered by the state change `compute_connected: false -> true`. With that, we don't need "call me maybe" anymore. + + Also, we don't have a "peer address amnesia" problem as in the gossip approach (with gossip, after a simultaneous reboot, safekeepers wouldn't know each other addresses until the next compute connection). + +* Decide on when to shutdown sk<->pageserver connection + + Again, pageserver would have all the info to understand when to shut down the safekeeper connection. + +### Scalability + +One node is enough (c) No, seriously, it is enough. + +### High Availability + +Broker lives in the console, so we can rely on k8s maintaining the console app alive. + +If the console is down, we won't trim WAL and reconnect the pageserver to another safekeeper. But, at the same, if the console is down, we already can't accept new compute connections and start stopped computes, so we are making things a bit worse, but not dramatically. + +### Interactions + +``` + .________________. +sk_1 <-> | | <-> pserver_1 +... | Console broker | ... +sk_n <-> |________________| <-> pserver_m +``` +
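+As a recap of the message format sketched above (illustrative only; since the etcd approach was chosen instead, there is no real broker API to match):
+
+```rust
+/// Hypothetical broker message; the broker ignores the payload and only routes it.
+struct BrokerMessage {
+    /// e.g. "sk_<sk_id>" or "pserver_<pserver_id>"
+    sender: String,
+    /// e.g. "sk_<tenant>_<timeline>" or "pserver_<tenant>_<timeline>"
+    destination: String,
+    /// Opaque bytes, interpreted only by the receiving storage node.
+    payload: Vec<u8>,
+}
+```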
+ + +## Implementation: etcd state store + +Alternatively, we can set up `etcd` and maintain the following data structure in it: + +```ruby +"compute_#{tenant}_#{timeline}" => { + safekeepers => { + "sk_#{sk_id}" => { + write_lsn: "0/AEDF130", + commit_lsn: "0/AEDF100", + compute_connected: true, + last_updated: 1642621138, + }, + } +} +``` + +As etcd doesn't support field updates in the nested objects that translates to the following set of keys: + +```ruby +"compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/write_lsn", +"compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/commit_lsn", +... +``` + +Each storage node can subscribe to the relevant sets of keys and maintain a local view of that structure. So in terms of the data flow, everything is the same as in the previous approach. Still, we can avoid implementing the message broker and prevent runtime storage dependency on a console. + +### Safekeeper address discovery + +During the startup safekeeper should publish the address he is listening on as the part of `{"sk_#{sk_id}" => ip_address}`. Then the pageserver can resolve `sk_#{sk_id}` to the actual address. This way it would work both locally and in the cloud setup. Safekeeper should have `--advertised-address` CLI option so that we can listen on e.g. 0.0.0.0 but advertize something more useful. + +### Safekeeper behavior + +For each timeline safekeeper periodically broadcasts `compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/*` fields. It subscribes to changes of `compute_#{tenant}_#{timeline}` -- that way safekeeper will have an information about peering safekeepers. +That amount of information is enough to properly trim WAL. To decide on who is pushing the data to S3 safekeeper may use etcd leases or broadcast a timestamp and hence track who is alive. + +### Pageserver behavior + +Pageserver subscribes to `compute_#{tenant}_#{timeline}` for each tenant it owns. With that info, pageserver can maintain a view of the safekeepers state, connect to a random one, and detect the moments (e.g., one the safekeepers is not making progress or down) when it needs to reconnect to another safekeeper. Pageserver should resolve exact IP addresses through the console, e.g., exchange `#sk_#{sk_id}` to `4.5.6.7:6400`. + +Pageserver connection to the safekeeper can be triggered by the state change `compute_connected: false -> true`. With that, we don't need "call me maybe" anymore. + +As an alternative to compute_connected, we can track timestamp of the latest message arrived to safekeeper from compute. Usually compute broadcasts KeepAlive to all safekeepers every second, so it'll be updated every second when connection is ok. Then the connection can be considered down when this timestamp isn't updated for a several seconds. + +This will help to faster detect issues with safekeeper (and switch to another) in the following cases: + + when compute failed but TCP connection stays alive until timeout (usually about a minute) + when safekeeper failed and didn't set compute_connected to false + +Another way to deal with [2] is to process (write_lsn, commit_lsn, compute_connected) as a KeepAlive on the pageserver side and detect issues when sk_id don't send anything for some time. This way is fully compliant to this RFC. + +Also, we don't have a "peer address amnesia" problem as in the gossip approach (with gossip, after a simultaneous reboot, safekeepers wouldn't know each other addresses until the next compute connection). + +### Interactions + +``` + .________________. 
+sk_1 <-> | | <-> pserver_1 +... | etcd | ... +sk_n <-> |________________| <-> pserver_m +``` + +### Sequence diagrams for different workflows + +#### Cluster startup + +```mermaid +sequenceDiagram + autonumber + participant C as Compute + participant SK1 + participant SK2 + participant SK3 + participant PS1 + participant PS2 + participant O as Orchestrator + participant M as Metadata Service + + PS1->>M: subscribe to updates to state of timeline N + C->>+SK1: WAL push + loop constantly update current lsns + SK1->>-M: I'm at lsn A + end + C->>+SK2: WAL push + loop constantly update current lsns + SK2->>-M: I'm at lsn B + end + C->>+SK3: WAL push + loop constantly update current lsns + SK3->>-M: I'm at lsn C + end + loop request pages + C->>+PS1: get_page@lsn + PS1->>-C: page image + end + M->>PS1: New compute appeared for timeline N. SK1 at A, SK2 at B, SK3 at C + note over PS1: Say SK1 at A=200, SK2 at B=150 SK3 at C=100
so connect to SK1 because it is the most up to date one + PS1->>SK1: start replication +``` + +#### Behavour of services during typical operations + +```mermaid +sequenceDiagram + autonumber + participant C as Compute + participant SK1 + participant SK2 + participant SK3 + participant PS1 + participant PS2 + participant O as Orchestrator + participant M as Metadata Service + + note over C,M: Scenario 1: Pageserver checkpoint + note over PS1: Upload data to S3 + PS1->>M: Update remote consistent lsn + M->>SK1: propagate remote consistent lsn update + note over SK1: truncate WAL up to remote consistent lsn + M->>SK2: propagate remote consistent lsn update + note over SK2: truncate WAL up to remote consistent lsn + M->>SK3: propagate remote consistent lsn update + note over SK3: truncate WAL up to remote consistent lsn + note over C,M: Scenario 2: SK1 finds itself lagging behind MAX(150 (SK2), 200 (SK2)) - 100 (SK1) > THRESHOLD + SK1->>SK2: Fetch WAL delta between 100 (SK1) and 200 (SK2) + note over C,M: Scenario 3: PS1 detects that SK1 is lagging behind: Connection from SK1 is broken or there is no messages from it in 30 seconds. + note over PS1: e.g. SK2 is at 150, SK3 is at 100, chose SK2 as a new replication source + PS1->>SK2: start replication +``` + +#### Behaviour during timeline relocation + +```mermaid +sequenceDiagram + autonumber + participant C as Compute + participant SK1 + participant SK2 + participant SK3 + participant PS1 + participant PS2 + participant O as Orchestrator + participant M as Metadata Service + + note over C,M: Timeline is being relocated from PS1 to PS2 + O->>+PS2: Attach timeline + PS2->>-O: 202 Accepted if timeline exists in S3 + note over PS2: Download timeline from S3 + note over O: Poll for timeline download (or subscribe to metadata service) + loop wait for attach to complete + O->>PS2: timeline detail should answer that timeline is ready + end + PS2->>M: Register downloaded timeline + PS2->>M: Get safekeepers for timeline, subscribe to changes + PS2->>SK1: Start replication to catch up + note over O: PS2 catched up, time to switch compute + O->>C: Restart compute with new pageserver url in config + note over C: Wal push is restarted + loop request pages + C->>+PS2: get_page@lsn + PS2->>-C: page image + end + O->>PS1: detach timeline + note over C,M: Scenario 1: Attach call failed + O--xPS2: Attach timeline + note over O: The operation can be safely retried,
if we hit some threshold we can try another pageserver + note over C,M: Scenario 2: Attach succeeded but pageserver failed to download the data or start replication + loop wait for attach to complete + O--xPS2: timeline detail should answer that timeline is ready + end + note over O: Can wait for a timeout, and then try another pageserver
there should be a limit on number of different pageservers to try + note over C,M: Scenario 3: Detach fails + O--xPS1: Detach timeline + note over O: can be retried, if continues to fail might lead to data duplication in s3 +``` + +# Pros/cons + +## Console broker/etcd vs gossip: + +Gossip pros: +* gossip allows running storage without the console or etcd + +Console broker/etcd pros: +* simpler +* solves "call me maybe" as well +* avoid possible N-to-N connection issues with gossip without grouping safekeepers in pre-defined triples + +## Console broker vs. etcd: + +Initially, I wanted to avoid etcd as a dependency mostly because I've seen how painful for Clickhouse was their ZooKeeper dependency: in each chat, at each conference, people were complaining about configuration and maintenance barriers with ZooKeeper. It was that bad that ClickHouse re-implemented ZooKeeper to embed it: https://clickhouse.com/docs/en/operations/clickhouse-keeper/. + +But with an etcd we are in a bit different situation: + +1. We don't need persistency and strong consistency guarantees for the data we store in the etcd +2. etcd uses Grpc as a protocol, and messages are pretty simple + +So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local zenith installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres). From a4d0d78e9ec82b3cc848f8b467b865b0507fcdad Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Wed, 23 Mar 2022 13:39:55 +0300 Subject: [PATCH 0048/1022] s3 settings for pageserver (#1388) --- .circleci/ansible/deploy.yaml | 14 ++++++++++++++ .circleci/ansible/production.hosts | 2 +- .circleci/ansible/staging.hosts | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index 2dd109f99a..2379ef8510 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -91,6 +91,20 @@ tags: - pageserver + - name: update config + when: current_version > remote_version or force_deploy + lineinfile: + path: /storage/pageserver/data/pageserver.toml + line: "{{ item }}" + loop: + - "[remote_storage]" + - "bucket_name = '{{ bucket_name }}'" + - "bucket_region = '{{ bucket_region }}'" + - "prefix_in_bucket = '{{ inventory_hostname }}'" + become: true + tags: + - pageserver + - name: upload systemd service definition when: current_version > remote_version or force_deploy ansible.builtin.template: diff --git a/.circleci/ansible/production.hosts b/.circleci/ansible/production.hosts index c5b4f664a6..3a0543f39a 100644 --- a/.circleci/ansible/production.hosts +++ b/.circleci/ansible/production.hosts @@ -1,5 +1,5 @@ [pageservers] -zenith-1-ps-1 +zenith-1-ps-1 bucket_name=zenith-storage-oregon bucket_region=us-west-2 [safekeepers] zenith-1-sk-1 diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index e625120bf3..2987e2c6fa 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -1,5 +1,5 @@ [pageservers] -zenith-us-stage-ps-1 +zenith-us-stage-ps-1 bucket_name=zenith-staging-storage-us-east-1 bucket_region=us-east-1 [safekeepers] zenith-us-stage-sk-1 From 15434ba7e0f870683abe83d3e9994f00e5599f3f Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 22 Mar 2022 13:05:14 +0200 Subject: [PATCH 0049/1022] Show cachepot build stats --- Dockerfile | 2 
++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 9ee6abaa8a..3bc1039129 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,6 +31,8 @@ COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/inclu COPY . . RUN cargo build --release +# Show build caching stats to check if it was used +RUN /usr/local/cargo/bin/cachepot -s # Build final image # From 123fcd5d0dbeb6712d51fbd574e0dc16a7cb853d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 23 Mar 2022 09:08:56 +0200 Subject: [PATCH 0050/1022] Revert accidental bump of vendor/postgres submodule I accidentally bumped it in commit 3b069f5aef. It didn't seem to cause any harm, but it was not intentional. --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 5e9bc37322..093aa160e5 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 5e9bc3732266c072151df20d6772b47ca51e233f +Subproject commit 093aa160e5df19814ff19b995d36dd5ee03c7f8b From e80ae4306aa009ce8154bf12269c49275551a582 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 23 Mar 2022 16:47:05 +0400 Subject: [PATCH 0051/1022] change log level from info to debug for timeline gc messages --- pageserver/src/layered_repository.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index c17df84689..64ac00ab56 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1734,7 +1734,7 @@ impl LayeredTimeline { // 1. Is it newer than cutoff point? if l.get_end_lsn() > cutoff { - info!( + debug!( "keeping {} {}-{} because it's newer than cutoff {}", seg, l.get_start_lsn(), @@ -1757,7 +1757,7 @@ impl LayeredTimeline { for retain_lsn in &retain_lsns { // start_lsn is inclusive if &l.get_start_lsn() <= retain_lsn { - info!( + debug!( "keeping {} {}-{} because it's still might be referenced by child branch forked at {} is_dropped: {} is_incremental: {}", seg, l.get_start_lsn(), @@ -1783,7 +1783,7 @@ impl LayeredTimeline { disk_consistent_lsn, ) { - info!( + debug!( "keeping {} {}-{} because it is the latest layer", seg, l.get_start_lsn(), @@ -1806,7 +1806,7 @@ impl LayeredTimeline { // because LayerMap of this timeline is already locked. 
let mut is_tombstone = layers.layer_exists_at_lsn(l.get_seg_tag(), prior_lsn)?; if is_tombstone { - info!( + debug!( "earlier layer exists at {} in {}", prior_lsn, self.timelineid ); @@ -1819,7 +1819,7 @@ impl LayeredTimeline { { let prior_lsn = ancestor.get_last_record_lsn(); if seg.rel.is_blocky() { - info!( + debug!( "check blocky relish size {} at {} in {} for layer {}-{}", seg, prior_lsn, @@ -1831,7 +1831,7 @@ impl LayeredTimeline { Some(size) => { let (last_live_seg, _rel_blknum) = SegmentTag::from_blknum(seg.rel, size - 1); - info!( + debug!( "blocky rel size is {} last_live_seg.segno {} seg.segno {}", size, last_live_seg.segno, seg.segno ); @@ -1840,11 +1840,11 @@ impl LayeredTimeline { } } _ => { - info!("blocky rel doesn't exist"); + debug!("blocky rel doesn't exist"); } } } else { - info!( + debug!( "check non-blocky relish existence {} at {} in {} for layer {}-{}", seg, prior_lsn, @@ -1857,7 +1857,7 @@ impl LayeredTimeline { } if is_tombstone { - info!( + debug!( "keeping {} {}-{} because this layer serves as a tombstone for older layer", seg, l.get_start_lsn(), @@ -1874,7 +1874,7 @@ impl LayeredTimeline { } // We didn't find any reason to keep this file, so remove it. - info!( + debug!( "garbage collecting {} {}-{} is_dropped: {} is_incremental: {}", l.get_seg_tag(), l.get_start_lsn(), From 0be7ed0cb5c1ee0e52c67d28a2ebb3113b7d3c54 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 23 Mar 2022 17:13:01 +0400 Subject: [PATCH 0052/1022] decrease log message severity for timeline checkpoint internals --- pageserver/src/layered_repository.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 64ac00ab56..2c4393481d 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1529,7 +1529,7 @@ impl LayeredTimeline { && oldest_lsn >= freeze_end_lsn // this layer intersects with evicted layer and so also need to be evicted { - info!( + debug!( "the oldest layer is now {} which is {} bytes behind last_record_lsn", oldest_layer.filename().display(), distance From 8a86276a6ef6a8f79e11a264087e6f22790d67c5 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 23 Mar 2022 17:40:29 +0400 Subject: [PATCH 0053/1022] add more context to error --- pageserver/src/remote_storage/storage_sync/upload.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/remote_storage/storage_sync/upload.rs index 8fdd91dd18..431b5ec484 100644 --- a/pageserver/src/remote_storage/storage_sync/upload.rs +++ b/pageserver/src/remote_storage/storage_sync/upload.rs @@ -182,7 +182,13 @@ async fn try_upload_checkpoint< } }) .collect::>(); - ensure!(!files_to_upload.is_empty(), "No files to upload"); + + ensure!( + !files_to_upload.is_empty(), + "No files to upload. 
Upload request was: {:?}, already uploaded files: {:?}", + new_checkpoint.layers, + files_to_skip, + ); compression::archive_files_as_stream( &timeline_dir, From 8b8d78a3a01fddcd0ba3e6ad5af782f4a147e26f Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 23 Mar 2022 19:13:44 +0400 Subject: [PATCH 0054/1022] use main branch of our bookfile crate --- Cargo.lock | 2 +- pageserver/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a9de71420b..923f14e06e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -246,7 +246,7 @@ dependencies = [ [[package]] name = "bookfile" version = "0.3.0" -source = "git+https://github.com/zenithdb/bookfile.git?branch=generic-readext#d51a99c7a0be48c3d9cc7cb85c9b7fb05ce1100c" +source = "git+https://github.com/zenithdb/bookfile.git?rev=bf6e43825dfb6e749ae9b80e8372c8fea76cec2f#bf6e43825dfb6e749ae9b80e8372c8fea76cec2f" dependencies = [ "aversion", "byteorder", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index efd2fa4a38..46e6e2a8f1 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -bookfile = { git = "https://github.com/zenithdb/bookfile.git", branch="generic-readext" } +bookfile = { git = "https://github.com/zenithdb/bookfile.git", rev="bf6e43825dfb6e749ae9b80e8372c8fea76cec2f" } chrono = "0.4.19" rand = "0.8.3" regex = "1.4.5" From 8437fc056e9c95c3a925df4dd4317f4454b8198c Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 23 Mar 2022 22:03:12 +0400 Subject: [PATCH 0055/1022] some follow ups after s3 integration was enabled on staging * do not error out when upload file list is empty * ignore ephemeral files during sync initialization --- pageserver/src/layered_repository.rs | 2 +- pageserver/src/remote_storage.rs | 8 ++++- .../src/remote_storage/storage_sync/upload.rs | 29 ++++++++++--------- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 2c4393481d..9cb0a17e66 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -54,7 +54,7 @@ use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; use zenith_utils::seqwait::SeqWait; mod delta_layer; -mod ephemeral_file; +pub(crate) mod ephemeral_file; mod filename; mod global_layer_map; mod image_layer; diff --git a/pageserver/src/remote_storage.rs b/pageserver/src/remote_storage.rs index 08fb16a679..6eb7bd910b 100644 --- a/pageserver/src/remote_storage.rs +++ b/pageserver/src/remote_storage.rs @@ -94,12 +94,13 @@ use std::{ use anyhow::{bail, Context}; use tokio::{io, sync::RwLock}; -use tracing::{error, info}; +use tracing::{debug, error, info}; use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; pub use self::storage_sync::index::{RemoteTimelineIndex, TimelineIndexEntry}; pub use self::storage_sync::{schedule_timeline_checkpoint_upload, schedule_timeline_download}; use self::{local_fs::LocalFs, rust_s3::S3}; +use crate::layered_repository::ephemeral_file::is_ephemeral_file; use crate::{ config::{PageServerConf, RemoteStorageKind}, layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}, @@ -261,6 +262,8 @@ fn collect_timelines_for_tenant( Ok(timelines) } +// discover timeline files and extract timeline metadata +// NOTE: ephemeral files are excluded from the list fn collect_timeline_files( timeline_dir: &Path, ) -> anyhow::Result<(ZTimelineId, TimelineMetadata, Vec)> { @@ -280,6 +283,9 @@ fn 
collect_timeline_files( if entry_path.is_file() { if entry_path.file_name().and_then(ffi::OsStr::to_str) == Some(METADATA_FILE_NAME) { timeline_metadata_path = Some(entry_path); + } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) { + debug!("skipping ephemeral file {}", entry_path.display()); + continue; } else { timeline_files.push(entry_path); } diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/remote_storage/storage_sync/upload.rs index 431b5ec484..dfc4433694 100644 --- a/pageserver/src/remote_storage/storage_sync/upload.rs +++ b/pageserver/src/remote_storage/storage_sync/upload.rs @@ -2,7 +2,6 @@ use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc}; -use anyhow::ensure; use tokio::sync::RwLock; use tracing::{debug, error, warn}; @@ -95,7 +94,7 @@ pub(super) async fn upload_timeline_checkpoint< ) .await { - Ok((archive_header, header_size)) => { + Some(Ok((archive_header, header_size))) => { let mut index_write = index.write().await; match index_write .timeline_entry_mut(&sync_id) @@ -136,7 +135,7 @@ pub(super) async fn upload_timeline_checkpoint< debug!("Checkpoint uploaded successfully"); Some(true) } - Err(e) => { + Some(Err(e)) => { error!( "Failed to upload checkpoint: {:?}, requeueing the upload", e @@ -148,6 +147,7 @@ pub(super) async fn upload_timeline_checkpoint< )); Some(false) } + None => Some(true), } } @@ -160,7 +160,7 @@ async fn try_upload_checkpoint< sync_id: ZTenantTimelineId, new_checkpoint: &NewCheckpoint, files_to_skip: BTreeSet, -) -> anyhow::Result<(ArchiveHeader, u64)> { +) -> Option> { let ZTenantTimelineId { tenant_id, timeline_id, @@ -172,7 +172,7 @@ async fn try_upload_checkpoint< .iter() .filter(|&path_to_upload| { if files_to_skip.contains(path_to_upload) { - error!( + warn!( "Skipping file upload '{}', since it was already uploaded", path_to_upload.display() ); @@ -183,14 +183,15 @@ async fn try_upload_checkpoint< }) .collect::>(); - ensure!( - !files_to_upload.is_empty(), - "No files to upload. Upload request was: {:?}, already uploaded files: {:?}", - new_checkpoint.layers, - files_to_skip, - ); + if files_to_upload.is_empty() { + warn!( + "No files to upload. Upload request was: {:?}, already uploaded files: {:?}", + new_checkpoint.layers, files_to_skip + ); + return None; + } - compression::archive_files_as_stream( + let upload_result = compression::archive_files_as_stream( &timeline_dir, files_to_upload.into_iter(), &new_checkpoint.metadata, @@ -206,7 +207,9 @@ async fn try_upload_checkpoint< }, ) .await - .map(|(header, header_size, _)| (header, header_size)) + .map(|(header, header_size, _)| (header, header_size)); + + Some(upload_result) } #[cfg(test)] From c7188705173e41ac742dd9738b5a99699552a8eb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 24 Mar 2022 09:46:07 +0200 Subject: [PATCH 0056/1022] Tiny refactoring of page_cache::init function. The init function only needs the 'page_cache_size' from the config, so seems slightly nicer to pass just that. 
--- pageserver/src/bin/pageserver.rs | 3 +-- pageserver/src/page_cache.rs | 9 +++------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 05fb14daca..a2564d51d7 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -163,8 +163,7 @@ fn main() -> Result<()> { // Basic initialization of things that don't change after startup virtual_file::init(conf.max_file_descriptors); - - page_cache::init(conf); + page_cache::init(conf.page_cache_size); // Create repo and exit if init was requested if init { diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index b0c8d3a5d7..2992d9477b 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -53,7 +53,7 @@ use zenith_utils::{ }; use crate::layered_repository::writeback_ephemeral_file; -use crate::{config::PageServerConf, relish::RelTag}; +use crate::relish::RelTag; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 10; @@ -61,11 +61,8 @@ const TEST_PAGE_CACHE_SIZE: usize = 10; /// /// Initialize the page cache. This must be called once at page server startup. /// -pub fn init(conf: &'static PageServerConf) { - if PAGE_CACHE - .set(PageCache::new(conf.page_cache_size)) - .is_err() - { +pub fn init(size: usize) { + if PAGE_CACHE.set(PageCache::new(size)).is_err() { panic!("page cache already initialized"); } } From d3a9cb44a659b11d0df7f7e2fbded9e388fbe917 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 24 Mar 2022 02:05:35 +0400 Subject: [PATCH 0057/1022] tweak timeouts for tenant relocation test --- test_runner/batch_others/test_tenant_relocation.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 32fbc8f872..8213d2526b 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -3,10 +3,8 @@ import os import pathlib import subprocess import threading -from typing import Dict from uuid import UUID from fixtures.log_helper import log -import time import signal import pytest @@ -15,7 +13,6 @@ from fixtures.utils import lsn_from_hex def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): - print("!" 
* 100, abs(a - b) / a) assert abs(a - b) / a < margin_ratio, abs(a - b) / a @@ -235,10 +232,10 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, assert cur.fetchone() == (2001000, ) if with_load == 'with_load': - assert load_ok_event.wait(1) + assert load_ok_event.wait(3) log.info('stopping load thread') load_stop_event.set() - load_thread.join() + load_thread.join(timeout=10) log.info('load thread stopped') # bring old pageserver back for clean shutdown via zenith cli From b9a1a75b0d21fee7818777f91d2f297273d9d631 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 24 Mar 2022 11:48:50 +0400 Subject: [PATCH 0058/1022] clean up unused imports in python tests --- test_runner/batch_others/test_gc_aggressive.py | 7 ++----- test_runner/batch_others/test_next_xid.py | 3 --- test_runner/batch_others/test_old_request_lsn.py | 2 -- test_runner/batch_others/test_pageserver_api.py | 2 +- test_runner/batch_others/test_pageserver_catchup.py | 7 ------- test_runner/batch_others/test_pageserver_restart.py | 6 ------ test_runner/batch_others/test_remote_storage.py | 2 +- test_runner/batch_others/test_snapfiles_gc.py | 1 - test_runner/batch_others/test_timeline_size.py | 1 - test_runner/batch_others/test_zenith_cli.py | 2 -- 10 files changed, 4 insertions(+), 29 deletions(-) diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py index 9de6ba9f59..e4e4aa9f4a 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/batch_others/test_gc_aggressive.py @@ -1,10 +1,7 @@ -from contextlib import closing - import asyncio -import asyncpg import random -from fixtures.zenith_fixtures import ZenithEnv, Postgres, Safekeeper +from fixtures.zenith_fixtures import ZenithEnv, Postgres from fixtures.log_helper import log # Test configuration @@ -76,5 +73,5 @@ def test_gc_aggressive(zenith_simple_env: ZenithEnv): asyncio.run(update_and_gc(env, pg, timeline)) - row = cur.execute('SELECT COUNT(*), SUM(counter) FROM foo') + cur.execute('SELECT COUNT(*), SUM(counter) FROM foo') assert cur.fetchone() == (num_rows, updates_to_perform) diff --git a/test_runner/batch_others/test_next_xid.py b/test_runner/batch_others/test_next_xid.py index fd0f761409..03c27bcd70 100644 --- a/test_runner/batch_others/test_next_xid.py +++ b/test_runner/batch_others/test_next_xid.py @@ -1,9 +1,6 @@ -import pytest -import random import time from fixtures.zenith_fixtures import ZenithEnvBuilder -from fixtures.log_helper import log # Test restarting page server, while safekeeper and compute node keep diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py index d09fb24913..e7400cff96 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -1,5 +1,3 @@ -from contextlib import closing - from fixtures.zenith_fixtures import ZenithEnv from fixtures.log_helper import log diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 965ba9bcc3..13f6ef358e 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,6 +1,6 @@ from uuid import uuid4, UUID import pytest -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient, zenith_binpath +from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient # test that we cannot override node id diff --git 
a/test_runner/batch_others/test_pageserver_catchup.py b/test_runner/batch_others/test_pageserver_catchup.py index 7093a1bdb3..3c4b7f9569 100644 --- a/test_runner/batch_others/test_pageserver_catchup.py +++ b/test_runner/batch_others/test_pageserver_catchup.py @@ -1,11 +1,4 @@ -import pytest -import random -import time - -from contextlib import closing -from multiprocessing import Process, Value from fixtures.zenith_fixtures import ZenithEnvBuilder -from fixtures.log_helper import log # Test safekeeper sync and pageserver catch up diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/batch_others/test_pageserver_restart.py index 57f9db8f96..20e6f4467e 100644 --- a/test_runner/batch_others/test_pageserver_restart.py +++ b/test_runner/batch_others/test_pageserver_restart.py @@ -1,9 +1,3 @@ -import pytest -import random -import time - -from contextlib import closing -from multiprocessing import Process, Value from fixtures.zenith_fixtures import ZenithEnvBuilder from fixtures.log_helper import log diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 07a122ede9..e762f8589a 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -1,7 +1,7 @@ # It's possible to run any regular test with the local fs remote storage via # env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/zenith_zzz/'}" poetry ...... -import time, shutil, os +import shutil, os from contextlib import closing from pathlib import Path from uuid import UUID diff --git a/test_runner/batch_others/test_snapfiles_gc.py b/test_runner/batch_others/test_snapfiles_gc.py index c6d4512bc9..d00af53864 100644 --- a/test_runner/batch_others/test_snapfiles_gc.py +++ b/test_runner/batch_others/test_snapfiles_gc.py @@ -1,6 +1,5 @@ from contextlib import closing import psycopg2.extras -import time from fixtures.utils import print_gc_result from fixtures.zenith_fixtures import ZenithEnv from fixtures.log_helper import log diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 0b341746ee..db33493d61 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -1,5 +1,4 @@ from contextlib import closing -from uuid import UUID import psycopg2.extras import psycopg2.errors from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres, assert_local diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_zenith_cli.py index 4a62a1430a..091d9ac8ba 100644 --- a/test_runner/batch_others/test_zenith_cli.py +++ b/test_runner/batch_others/test_zenith_cli.py @@ -1,8 +1,6 @@ -import json import uuid import requests -from psycopg2.extensions import cursor as PgCursor from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient from typing import cast From 825d3631707016717f05ae5bcb7c112af9feba8f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 24 Mar 2022 12:17:56 +0200 Subject: [PATCH 0059/1022] Remove some unnecessary Ord etc. trait implementations. It doesn't make much sense to compare TimelineMetadata structs with < or >. But we depended on that in the remote storage upload code, so replace BTreeSets with Vecs there. 
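For readers unfamiliar with the trait bound involved, the sketch below, which uses a hypothetical Task type rather than the real sync task struct, illustrates why dropping the Ord derive forces the collection change: BTreeSet<T> requires T: Ord so it can keep its elements sorted, whereas Vec<T> compiles with only the traits the type genuinely supports and simply preserves insertion order.

    // Sketch only: `Task` is a stand-in for the sync task type.
    #[derive(Debug, Clone, PartialEq, Eq)]
    struct Task {
        retries: u32,
    }

    // Collecting into a BTreeSet<Task> would fail to compile once the Ord
    // derive is gone; collecting into a Vec needs no ordering at all.
    fn batch(tasks: impl IntoIterator<Item = Task>) -> Vec<Task> {
        tasks.into_iter().collect()
    }

    fn main() {
        let tasks = batch(vec![Task { retries: 0 }, Task { retries: 1 }]);
        assert_eq!(tasks.len(), 2);
    }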
--- pageserver/src/layered_repository/metadata.rs | 2 +- pageserver/src/remote_storage/storage_sync.rs | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pageserver/src/layered_repository/metadata.rs b/pageserver/src/layered_repository/metadata.rs index 960a1b7fe3..99d786c4cd 100644 --- a/pageserver/src/layered_repository/metadata.rs +++ b/pageserver/src/layered_repository/metadata.rs @@ -28,7 +28,7 @@ pub const METADATA_FILE_NAME: &str = "metadata"; /// Metadata stored on disk for each timeline /// /// The fields correspond to the values we hold in memory, in LayeredTimeline. -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct TimelineMetadata { disk_consistent_lsn: Lsn, // This is only set if we know it. We track it in memory when the page diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index f1483375cb..4ad28e6f8f 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -142,7 +142,7 @@ lazy_static! { /// mpsc approach was picked to allow blocking the sync loop if no tasks are present, to avoid meaningless spinning. mod sync_queue { use std::{ - collections::{BTreeSet, HashMap}, + collections::HashMap, sync::atomic::{AtomicUsize, Ordering}, }; @@ -205,9 +205,9 @@ mod sync_queue { pub async fn next_task_batch( receiver: &mut UnboundedReceiver, mut max_batch_size: usize, - ) -> BTreeSet { + ) -> Vec { if max_batch_size == 0 { - return BTreeSet::new(); + return Vec::new(); } let mut tasks = HashMap::with_capacity(max_batch_size); @@ -244,7 +244,7 @@ mod sync_queue { /// A task to run in the async download/upload loop. /// Limited by the number of retries, after certain threshold the failing task gets evicted and the timeline disabled. -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] +#[derive(Debug, Clone)] pub struct SyncTask { sync_id: ZTenantTimelineId, retries: u32, @@ -261,7 +261,7 @@ impl SyncTask { } } -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] +#[derive(Debug, Clone)] enum SyncKind { /// A certain amount of images (archive files) to download. Download(TimelineDownload), @@ -281,7 +281,7 @@ impl SyncKind { /// Local timeline files for upload, appeared after the new checkpoint. /// Current checkpoint design assumes new files are added only, no deletions or amendment happens. -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] +#[derive(Debug, Clone)] pub struct NewCheckpoint { /// Relish file paths in the pageserver workdir, that were added for the corresponding checkpoint. layers: Vec, @@ -289,7 +289,7 @@ pub struct NewCheckpoint { } /// Info about the remote image files. 
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] +#[derive(Debug, Clone)] struct TimelineDownload { files_to_skip: Arc>, archives_to_skip: BTreeSet, @@ -485,11 +485,11 @@ async fn loop_step< max_sync_errors: NonZeroU32, ) -> HashMap> { let max_concurrent_sync = max_concurrent_sync.get(); - let mut next_tasks = BTreeSet::new(); + let mut next_tasks = Vec::new(); // request the first task in blocking fashion to do less meaningless work if let Some(first_task) = sync_queue::next_task(receiver).await { - next_tasks.insert(first_task); + next_tasks.push(first_task); } else { debug!("Shutdown requested, stopping"); return HashMap::new(); From a201d33edceacf8c1687f4dce9e94230f25be064 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 24 Mar 2022 13:27:14 +0200 Subject: [PATCH 0060/1022] Properly print cachepot stats --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3bc1039129..5e55cd834f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,9 +30,9 @@ ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server COPY . . -RUN cargo build --release -# Show build caching stats to check if it was used -RUN /usr/local/cargo/bin/cachepot -s +# Show build caching stats to check if it was used in the end. +# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, loosing the compilation stats. +RUN cargo build --release && /usr/local/cargo/bin/cachepot -s # Build final image # From edc7bebcb5a452ad84c5c3cfd46b727c6e6f1c48 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 17 Mar 2022 18:52:27 +0200 Subject: [PATCH 0061/1022] Remove obvious panic sources --- pageserver/src/basebackup.rs | 21 +++++----- pageserver/src/bin/pageserver.rs | 8 ++-- pageserver/src/import_datadir.rs | 21 +++++----- pageserver/src/layered_repository.rs | 21 ++++++---- .../src/layered_repository/inmemory_layer.rs | 10 ++--- pageserver/src/page_cache.rs | 7 ++-- pageserver/src/page_service.rs | 1 - pageserver/src/tenant_threads.rs | 2 +- pageserver/src/thread_mgr.rs | 2 +- pageserver/src/timelines.rs | 6 +-- pageserver/src/virtual_file.rs | 3 +- pageserver/src/walingest.rs | 2 +- pageserver/src/walredo.rs | 42 ++++++++++++------- 13 files changed, 84 insertions(+), 62 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 1ee48eb2fc..c316fc43d1 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -145,16 +145,17 @@ impl<'a> Basebackup<'a> { .timeline .get_relish_size(RelishTag::Slru { slru, segno }, self.lsn)?; - if seg_size == None { - trace!( - "SLRU segment {}/{:>04X} was truncated", - slru.to_str(), - segno - ); - return Ok(()); - } - - let nblocks = seg_size.unwrap(); + let nblocks = match seg_size { + Some(seg_size) => seg_size, + None => { + trace!( + "SLRU segment {}/{:>04X} was truncated", + slru.to_str(), + segno + ); + return Ok(()); + } + }; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index a2564d51d7..5a1b5e5e2c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -30,7 +30,7 @@ use zenith_utils::postgres_backend; use zenith_utils::shutdown::exit_now; use zenith_utils::signals::{self, Signal}; -fn main() -> Result<()> { +fn main() -> anyhow::Result<()> { 
zenith_metrics::set_common_metrics_prefix("pageserver"); let arg_matches = App::new("Zenith page server") .about("Materializes WAL stream to pages and serves them to the postgres") @@ -116,7 +116,7 @@ fn main() -> Result<()> { // We're initializing the repo, so there's no config file yet DEFAULT_CONFIG_FILE .parse::() - .expect("could not parse built-in config file") + .context("could not parse built-in config file")? } else { // Supplement the CLI arguments with the config file let cfg_file_contents = std::fs::read_to_string(&cfg_file_path) @@ -209,7 +209,9 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // There shouldn't be any logging to stdin/stdout. Redirect it to the main log so // that we will see any accidental manual fprintf's or backtraces. - let stdout = log_file.try_clone().unwrap(); + let stdout = log_file + .try_clone() + .with_context(|| format!("Failed to clone log file '{:?}'", log_file))?; let stderr = log_file; let daemonize = Daemonize::new() diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index e317118bb5..1e691fb2fe 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -70,11 +70,11 @@ pub fn import_timeline_from_postgres_datadir( let direntry = direntry?; //skip all temporary files - if direntry.file_name().to_str().unwrap() == "pgsql_tmp" { + if direntry.file_name().to_string_lossy() == "pgsql_tmp" { continue; } - let dboid = direntry.file_name().to_str().unwrap().parse::()?; + let dboid = direntry.file_name().to_string_lossy().parse::()?; for direntry in fs::read_dir(direntry.path())? { let direntry = direntry?; @@ -117,7 +117,7 @@ pub fn import_timeline_from_postgres_datadir( } for entry in fs::read_dir(path.join("pg_twophase"))? { let entry = entry?; - let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?; + let xid = u32::from_str_radix(&entry.path().to_string_lossy(), 16)?; import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?; } // TODO: Scan pg_tblspc @@ -156,16 +156,15 @@ fn import_relfile( lsn: Lsn, spcoid: Oid, dboid: Oid, -) -> Result<()> { +) -> anyhow::Result<()> { // Does it look like a relation file? trace!("importing rel file {}", path.display()); - let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap()); - if let Err(e) = p { - warn!("unrecognized file in postgres datadir: {:?} ({})", path, e); - return Err(e.into()); - } - let (relnode, forknum, segno) = p.unwrap(); + let (relnode, forknum, segno) = parse_relfilename(&path.file_name().unwrap().to_string_lossy()) + .map_err(|e| { + warn!("unrecognized file in postgres datadir: {:?} ({})", path, e); + e + })?; let mut file = File::open(path)?; let mut buf: [u8; 8192] = [0u8; 8192]; @@ -271,7 +270,7 @@ fn import_slru_file( // Does it look like an SLRU file? let mut file = File::open(path)?; let mut buf: [u8; 8192] = [0u8; 8192]; - let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?; + let segno = u32::from_str_radix(&path.file_name().unwrap().to_string_lossy(), 16)?; trace!("importing slru file {}", path.display()); diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 9cb0a17e66..4d8d0ada24 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -11,7 +11,7 @@ //! parent timeline, and the last LSN that has been written to disk. //! 
-use anyhow::{bail, ensure, Context, Result}; +use anyhow::{anyhow, bail, ensure, Context, Result}; use bookfile::Book; use bytes::Bytes; use lazy_static::lazy_static; @@ -1157,9 +1157,9 @@ impl LayeredTimeline { for direntry in fs::read_dir(timeline_path)? { let direntry = direntry?; let fname = direntry.file_name(); - let fname = fname.to_str().unwrap(); + let fname = fname.to_string_lossy(); - if let Some(imgfilename) = ImageFileName::parse_str(fname) { + if let Some(imgfilename) = ImageFileName::parse_str(&fname) { // create an ImageLayer struct for each image file. if imgfilename.lsn > disk_consistent_lsn { warn!( @@ -1177,7 +1177,7 @@ impl LayeredTimeline { trace!("found layer {}", layer.filename().display()); layers.insert_historic(Arc::new(layer)); num_layers += 1; - } else if let Some(deltafilename) = DeltaFileName::parse_str(fname) { + } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. ensure!(deltafilename.start_lsn < deltafilename.end_lsn); // The end-LSN is exclusive, while disk_consistent_lsn is @@ -1203,7 +1203,7 @@ impl LayeredTimeline { num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these - } else if is_ephemeral_file(fname) { + } else if is_ephemeral_file(&fname) { // Delete any old ephemeral files trace!("deleting old ephemeral file in timeline dir: {}", fname); fs::remove_file(direntry.path())?; @@ -1938,7 +1938,7 @@ impl LayeredTimeline { seg_blknum: SegmentBlk, lsn: Lsn, layer: &dyn Layer, - ) -> Result { + ) -> anyhow::Result { // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image // and requested LSN. The cached image can also be used to reduce the amount of WAL needed @@ -1950,7 +1950,9 @@ impl LayeredTimeline { match cached_lsn.cmp(&lsn) { cmp::Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check cmp::Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image - cmp::Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn + cmp::Ordering::Greater => { + bail!("the returned lsn should never be after the requested lsn") + } } Some((cached_lsn, cached_img)) } @@ -2341,7 +2343,10 @@ pub fn dump_layerfile_from_path(path: &Path) -> Result<()> { /// Add a suffix to a layer file's name: .{num}.old /// Uses the first available num (starts at 0) fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { - let filename = path.file_name().unwrap().to_str().unwrap(); + let filename = path + .file_name() + .ok_or_else(|| anyhow!("Path {} don't have a file name", path.display()))? + .to_string_lossy(); let mut new_path = path.clone(); for i in 0u32.. 
{ diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 6e24bf6022..239fb341a5 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -17,7 +17,7 @@ use crate::layered_repository::LayeredTimeline; use crate::layered_repository::ZERO_PAGE; use crate::repository::ZenithWalRecord; use crate::{ZTenantId, ZTimelineId}; -use anyhow::{ensure, Result}; +use anyhow::{ensure, Result, bail}; use bytes::Bytes; use log::*; use std::collections::HashMap; @@ -150,9 +150,9 @@ impl InMemoryLayerInner { let pos = self.file.stream_position()?; // make room for the 'length' field by writing zeros as a placeholder. - self.file.seek(std::io::SeekFrom::Start(pos + 4)).unwrap(); + self.file.seek(std::io::SeekFrom::Start(pos + 4))?; - pv.ser_into(&mut self.file).unwrap(); + pv.ser_into(&mut self.file)?; // write the 'length' field. let len = self.file.stream_position()? - pos - 4; @@ -315,7 +315,7 @@ impl Layer for InMemoryLayer { return Ok(false); } } else { - panic!("dropped in-memory layer with no end LSN"); + bail!("dropped in-memory layer with no end LSN"); } } @@ -333,7 +333,7 @@ impl Layer for InMemoryLayer { /// Nothing to do here. When you drop the last reference to the layer, it will /// be deallocated. fn delete(&self) -> Result<()> { - panic!("can't delete an InMemoryLayer") + bail!("can't delete an InMemoryLayer") } fn is_incremental(&self) -> bool { diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 2992d9477b..ef802ba0e2 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -732,9 +732,10 @@ impl PageCache { CacheKey::MaterializedPage { hash_key: _, lsn: _, - } => { - panic!("unexpected dirty materialized page"); - } + } => Err(std::io::Error::new( + std::io::ErrorKind::Other, + "unexpected dirty materialized page", + )), CacheKey::EphemeralPage { file_id, blkno } => { writeback_ephemeral_file(*file_id, *blkno, buf) } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 6e6b6415f3..6acdc8e93d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -574,7 +574,6 @@ impl postgres_backend::Handler for PageServerHandler { let data = self .auth .as_ref() - .as_ref() .unwrap() .decode(str::from_utf8(jwt_response)?)?; diff --git a/pageserver/src/tenant_threads.rs b/pageserver/src/tenant_threads.rs index 062af9f1ad..c370eb61c8 100644 --- a/pageserver/src/tenant_threads.rs +++ b/pageserver/src/tenant_threads.rs @@ -49,7 +49,7 @@ pub fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> // Garbage collect old files that are not needed for PITR anymore if conf.gc_horizon > 0 { let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - repo.gc_iteration(None, conf.gc_horizon, false).unwrap(); + repo.gc_iteration(None, conf.gc_horizon, false)?; } // TODO Write it in more adequate way using diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index a51f0909ca..d24d6bf016 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -250,7 +250,7 @@ pub fn shutdown_threads( let _ = join_handle.join(); } else { // The thread had not even fully started yet. 
Or it was shut down - // concurrently and alrady exited + // concurrently and already exited } } } diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 00dd0f8f9c..8c018ce70f 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -250,7 +250,7 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { let initdb_path = conf.pg_bin_dir().join("initdb"); let initdb_output = Command::new(initdb_path) - .args(&["-D", initdbpath.to_str().unwrap()]) + .args(&["-D", &initdbpath.to_string_lossy()]) .args(&["-U", &conf.superuser]) .args(&["-E", "utf8"]) .arg("--no-instructions") @@ -258,8 +258,8 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { // so no need to fsync it .arg("--no-sync") .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) + .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) .stdout(Stdio::null()) .output() .context("failed to execute initdb")?; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 73671dcf4e..858cff29cb 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -226,7 +226,8 @@ impl VirtualFile { path: &Path, open_options: &OpenOptions, ) -> Result { - let parts = path.to_str().unwrap().split('/').collect::>(); + let path_str = path.to_string_lossy(); + let parts = path_str.split('/').collect::>(); let tenantid; let timelineid; if parts.len() > 5 && parts[parts.len() - 5] == "tenants" { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 1962c9bbd3..506890476f 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -249,7 +249,7 @@ impl WalIngest { { let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT]; buf.copy_to_slice(&mut checkpoint_bytes); - let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes).unwrap(); + let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!( "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", xlog_checkpoint.oldestXid, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 877b81b8d5..704b8f2583 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -375,7 +375,10 @@ impl PostgresRedoManager { ZenithWalRecord::Postgres { will_init: _, rec: _, - } => panic!("tried to pass postgres wal record to zenith WAL redo"), + } => { + error!("tried to pass postgres wal record to zenith WAL redo"); + return Err(WalRedoError::InvalidRequest); + } ZenithWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno, @@ -541,20 +544,23 @@ impl PostgresRedoProcess { } info!("running initdb in {:?}", datadir.display()); let initdb = Command::new(conf.pg_bin_dir().join("initdb")) - .args(&["-D", datadir.to_str().unwrap()]) + .args(&["-D", &datadir.to_string_lossy()]) .arg("-N") .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) + .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) .output() - .expect("failed to execute initdb"); + .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {}", e)))?; if !initdb.status.success() { - panic!( - "initdb failed: {}\nstderr:\n{}", - std::str::from_utf8(&initdb.stdout).unwrap(), - std::str::from_utf8(&initdb.stderr).unwrap() - ); + return Err(Error::new( + ErrorKind::Other, + format!( + 
"initdb failed\nstdout: {}\nstderr:\n{}", + String::from_utf8_lossy(&initdb.stdout), + String::from_utf8_lossy(&initdb.stderr) + ), + )); } else { // Limit shared cache for wal-redo-postres let mut config = OpenOptions::new() @@ -572,11 +578,16 @@ impl PostgresRedoProcess { .stderr(Stdio::piped()) .stdout(Stdio::piped()) .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) + .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) .env("PGDATA", &datadir) .spawn() - .expect("postgres --wal-redo command failed to start"); + .map_err(|e| { + Error::new( + e.kind(), + format!("postgres --wal-redo command failed to start: {}", e), + ) + })?; info!( "launched WAL redo postgres process on {:?}", @@ -636,7 +647,10 @@ impl PostgresRedoProcess { { build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); } else { - panic!("tried to pass zenith wal record to postgres WAL redo"); + return Err(Error::new( + ErrorKind::Other, + "tried to pass zenith wal record to postgres WAL redo", + )); } } build_get_page_msg(tag, &mut writebuf); From f6b1d76c3097c61b89b47849a52fb714b1f45cbf Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 18 Mar 2022 20:59:55 +0200 Subject: [PATCH 0062/1022] Replace assert! with ensure! for anyhow::Result functions --- pageserver/src/basebackup.rs | 10 ++++---- pageserver/src/layered_repository.rs | 16 ++++++------ .../src/layered_repository/delta_layer.rs | 12 ++++----- .../src/layered_repository/image_layer.rs | 20 +++++++-------- .../src/layered_repository/inmemory_layer.rs | 25 +++++++++++-------- pageserver/src/layered_repository/metadata.rs | 4 +-- pageserver/src/walreceiver.rs | 4 +-- 7 files changed, 48 insertions(+), 43 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index c316fc43d1..5711f1807d 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,7 +10,7 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! 
-use anyhow::{Context, Result}; +use anyhow::{ensure, Context, Result}; use bytes::{BufMut, BytesMut}; use log::*; use std::fmt::Write as FmtWrite; @@ -163,7 +163,7 @@ impl<'a> Basebackup<'a> { let img = self.timeline .get_page_at_lsn(RelishTag::Slru { slru, segno }, blknum, self.lsn)?; - assert!(img.len() == pg_constants::BLCKSZ as usize); + ensure!(img.len() == pg_constants::BLCKSZ as usize); slru_buf.extend_from_slice(&img); } @@ -197,7 +197,7 @@ impl<'a> Basebackup<'a> { String::from("global/pg_filenode.map") // filenode map for global tablespace } else { // User defined tablespaces are not supported - assert!(spcnode == pg_constants::DEFAULTTABLESPACE_OID); + ensure!(spcnode == pg_constants::DEFAULTTABLESPACE_OID); // Append dir path for each database let path = format!("base/{}", dbnode); @@ -211,7 +211,7 @@ impl<'a> Basebackup<'a> { format!("base/{}/pg_filenode.map", dbnode) }; - assert!(img.len() == 512); + ensure!(img.len() == 512); let header = new_tar_header(&path, img.len() as u64)?; self.ar.append(&header, &img[..])?; Ok(()) @@ -292,7 +292,7 @@ impl<'a> Basebackup<'a> { let wal_file_path = format!("pg_wal/{}", wal_file_name); let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?; let wal_seg = generate_wal_segment(segno, pg_control.system_identifier); - assert!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE); + ensure!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE); self.ar.append(&header, &wal_seg[..])?; Ok(()) } diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 4d8d0ada24..7ec11add9c 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -791,10 +791,10 @@ impl Timeline for LayeredTimeline { } /// Wait until WAL has been received up to the given LSN. - fn wait_lsn(&self, lsn: Lsn) -> Result<()> { + fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { // This should never be called from the WAL receiver thread, because that could lead // to a deadlock. - assert!( + ensure!( !IS_WAL_RECEIVER.with(|c| c.get()), "wait_lsn called by WAL receiver thread" ); @@ -1262,7 +1262,7 @@ impl LayeredTimeline { seg: SegmentTag, lsn: Lsn, self_layers: &MutexGuard, - ) -> Result, Lsn)>> { + ) -> anyhow::Result, Lsn)>> { trace!("get_layer_for_read called for {} at {}", seg, lsn); // If you requested a page at an older LSN, before the branch point, dig into @@ -1310,7 +1310,7 @@ impl LayeredTimeline { layer.get_end_lsn() ); - assert!(layer.get_start_lsn() <= lsn); + ensure!(layer.get_start_lsn() <= lsn); if layer.is_dropped() && layer.get_end_lsn() <= lsn { return Ok(None); @@ -1338,13 +1338,13 @@ impl LayeredTimeline { /// /// Get a handle to the latest layer for appending. /// - fn get_layer_for_write(&self, seg: SegmentTag, lsn: Lsn) -> Result> { + fn get_layer_for_write(&self, seg: SegmentTag, lsn: Lsn) -> anyhow::Result> { let mut layers = self.layers.lock().unwrap(); - assert!(lsn.is_aligned()); + ensure!(lsn.is_aligned()); let last_record_lsn = self.get_last_record_lsn(); - assert!( + ensure!( lsn > last_record_lsn, "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", lsn, @@ -1360,7 +1360,7 @@ impl LayeredTimeline { // Open layer exists, but it is dropped, so create a new one. if open_layer.is_dropped() { - assert!(!open_layer.is_writeable()); + ensure!(!open_layer.is_writeable()); // Layer that is created after dropped one represents a new relish segment. 
trace!( "creating layer for write for new relish segment after dropped layer {} at {}/{}", diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 7434b8de11..f6e5510339 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -209,10 +209,10 @@ impl Layer for DeltaLayer { blknum: SegmentBlk, lsn: Lsn, reconstruct_data: &mut PageReconstructData, - ) -> Result { + ) -> anyhow::Result { let mut need_image = true; - assert!((0..RELISH_SEG_SIZE).contains(&blknum)); + ensure!((0..RELISH_SEG_SIZE).contains(&blknum)); match &reconstruct_data.page_img { Some((cached_lsn, _)) if &self.end_lsn <= cached_lsn => { @@ -289,8 +289,8 @@ impl Layer for DeltaLayer { } /// Get size of the relation at given LSN - fn get_seg_size(&self, lsn: Lsn) -> Result { - assert!(lsn >= self.start_lsn); + fn get_seg_size(&self, lsn: Lsn) -> anyhow::Result { + ensure!(lsn >= self.start_lsn); ensure!( self.seg.rel.is_blocky(), "get_seg_size() called on a non-blocky rel" @@ -641,7 +641,7 @@ impl DeltaLayerWriter { /// /// 'seg_sizes' is a list of size changes to store with the actual data. /// - pub fn finish(self, seg_sizes: VecMap) -> Result { + pub fn finish(self, seg_sizes: VecMap) -> anyhow::Result { // Close the page-versions chapter let book = self.page_version_writer.close()?; @@ -652,7 +652,7 @@ impl DeltaLayerWriter { let book = chapter.close()?; if self.seg.rel.is_blocky() { - assert!(!seg_sizes.is_empty()); + ensure!(!seg_sizes.is_empty()); } // and seg_sizes to separate chapter diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 24445ff7e9..c706f58e39 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -146,9 +146,9 @@ impl Layer for ImageLayer { blknum: SegmentBlk, lsn: Lsn, reconstruct_data: &mut PageReconstructData, - ) -> Result { - assert!((0..RELISH_SEG_SIZE).contains(&blknum)); - assert!(lsn >= self.lsn); + ) -> anyhow::Result { + ensure!((0..RELISH_SEG_SIZE).contains(&blknum)); + ensure!(lsn >= self.lsn); match reconstruct_data.page_img { Some((cached_lsn, _)) if self.lsn <= cached_lsn => { @@ -432,7 +432,7 @@ impl ImageLayerWriter { seg: SegmentTag, lsn: Lsn, num_blocks: SegmentBlk, - ) -> Result { + ) -> anyhow::Result { // Create the file // // Note: This overwrites any existing file. There shouldn't be any. @@ -452,7 +452,7 @@ impl ImageLayerWriter { let chapter = if seg.rel.is_blocky() { book.new_chapter(BLOCKY_IMAGES_CHAPTER) } else { - assert_eq!(num_blocks, 1); + ensure!(num_blocks == 1); book.new_chapter(NONBLOCKY_IMAGE_CHAPTER) }; @@ -475,19 +475,19 @@ impl ImageLayerWriter { /// /// The page versions must be appended in blknum order. /// - pub fn put_page_image(&mut self, block_bytes: &[u8]) -> Result<()> { - assert!(self.num_blocks_written < self.num_blocks); + pub fn put_page_image(&mut self, block_bytes: &[u8]) -> anyhow::Result<()> { + ensure!(self.num_blocks_written < self.num_blocks); if self.seg.rel.is_blocky() { - assert_eq!(block_bytes.len(), BLOCK_SIZE); + ensure!(block_bytes.len() == BLOCK_SIZE); } self.page_image_writer.write_all(block_bytes)?; self.num_blocks_written += 1; Ok(()) } - pub fn finish(self) -> Result { + pub fn finish(self) -> anyhow::Result { // Check that the `put_page_image' was called for every block. 
- assert!(self.num_blocks_written == self.num_blocks); + ensure!(self.num_blocks_written == self.num_blocks); // Close the page-images chapter let book = self.page_image_writer.close()?; diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 239fb341a5..fed1fb6469 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -17,7 +17,7 @@ use crate::layered_repository::LayeredTimeline; use crate::layered_repository::ZERO_PAGE; use crate::repository::ZenithWalRecord; use crate::{ZTenantId, ZTimelineId}; -use anyhow::{ensure, Result, bail}; +use anyhow::{bail, ensure, Result}; use bytes::Bytes; use log::*; use std::collections::HashMap; @@ -224,10 +224,10 @@ impl Layer for InMemoryLayer { blknum: SegmentBlk, lsn: Lsn, reconstruct_data: &mut PageReconstructData, - ) -> Result { + ) -> anyhow::Result { let mut need_image = true; - assert!((0..RELISH_SEG_SIZE).contains(&blknum)); + ensure!((0..RELISH_SEG_SIZE).contains(&blknum)); { let inner = self.inner.read().unwrap(); @@ -288,8 +288,8 @@ impl Layer for InMemoryLayer { } /// Get size of the relation at given LSN - fn get_seg_size(&self, lsn: Lsn) -> Result { - assert!(lsn >= self.start_lsn); + fn get_seg_size(&self, lsn: Lsn) -> anyhow::Result { + ensure!(lsn >= self.start_lsn); ensure!( self.seg.rel.is_blocky(), "get_seg_size() called on a non-blocky rel" @@ -300,13 +300,13 @@ impl Layer for InMemoryLayer { } /// Does this segment exist at given LSN? - fn get_seg_exists(&self, lsn: Lsn) -> Result { + fn get_seg_exists(&self, lsn: Lsn) -> anyhow::Result { let inner = self.inner.read().unwrap(); // If the segment created after requested LSN, // it doesn't exist in the layer. But we shouldn't // have requested it in the first place. - assert!(lsn >= self.start_lsn); + ensure!(lsn >= self.start_lsn); // Is the requested LSN after the segment was dropped? if inner.dropped { @@ -466,8 +466,13 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. 
/// Adds the page version to the in-memory tree - pub fn put_page_version(&self, blknum: SegmentBlk, lsn: Lsn, pv: PageVersion) -> Result { - assert!((0..RELISH_SEG_SIZE).contains(&blknum)); + pub fn put_page_version( + &self, + blknum: SegmentBlk, + lsn: Lsn, + pv: PageVersion, + ) -> anyhow::Result { + ensure!((0..RELISH_SEG_SIZE).contains(&blknum)); trace!( "put_page_version blk {} of {} at {}/{}", @@ -479,7 +484,7 @@ impl InMemoryLayer { let mut inner = self.inner.write().unwrap(); inner.assert_writeable(); - assert!(lsn >= inner.latest_lsn); + ensure!(lsn >= inner.latest_lsn); inner.latest_lsn = lsn; // Write the page version to the file, and remember its offset in 'page_versions' diff --git a/pageserver/src/layered_repository/metadata.rs b/pageserver/src/layered_repository/metadata.rs index 99d786c4cd..17e0485093 100644 --- a/pageserver/src/layered_repository/metadata.rs +++ b/pageserver/src/layered_repository/metadata.rs @@ -96,7 +96,7 @@ impl TimelineMetadata { ); let data = TimelineMetadata::from(serialize::DeTimelineMetadata::des_prefix(data)?); - assert!(data.disk_consistent_lsn.is_aligned()); + ensure!(data.disk_consistent_lsn.is_aligned()); Ok(data) } @@ -104,7 +104,7 @@ impl TimelineMetadata { pub fn to_bytes(&self) -> anyhow::Result> { let serializeable_metadata = serialize::SeTimelineMetadata::from(self); let mut metadata_bytes = serialize::SeTimelineMetadata::ser(&serializeable_metadata)?; - assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE); + ensure!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE); metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8); let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]); diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 305dd4b3a2..43fb7db4b0 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -146,7 +146,7 @@ fn walreceiver_main( tenant_id: ZTenantId, timeline_id: ZTimelineId, wal_producer_connstr: &str, -) -> Result<(), Error> { +) -> anyhow::Result<(), Error> { // Connect to the database in replication mode. info!("connecting to {:?}", wal_producer_connstr); let connect_cfg = format!( @@ -255,7 +255,7 @@ fn walreceiver_main( // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are // at risk of hittind a deadlock. 
- assert!(lsn.is_aligned()); + anyhow::ensure!(lsn.is_aligned()); let writer = timeline.writer(); walingest.ingest_record(writer.as_ref(), recdata, lsn)?; From 6244fd9e7eb78cd056cc92e67ca2fc6bf67eca22 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 23 Mar 2022 00:57:20 +0200 Subject: [PATCH 0063/1022] Better error messages on zenith cli subcommand invocations --- control_plane/src/storage.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 835c93bf1d..c49d5743a9 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -148,12 +148,20 @@ impl PageServerNode { let initial_timeline_id_string = initial_timeline_id.to_string(); args.extend(["--initial-timeline-id", &initial_timeline_id_string]); - let init_output = fill_rust_env_vars(cmd.args(args)) + let cmd_with_args = cmd.args(args); + let init_output = fill_rust_env_vars(cmd_with_args) .output() - .context("pageserver init failed")?; + .with_context(|| { + format!("failed to init pageserver with command {:?}", cmd_with_args) + })?; if !init_output.status.success() { - bail!("pageserver init failed"); + bail!( + "init invocation failed, {}\nStdout: {}\nStderr: {}", + init_output.status, + String::from_utf8_lossy(&init_output.stdout), + String::from_utf8_lossy(&init_output.stderr) + ); } Ok(initial_timeline_id) From 28bc8e3f5c961532f4177fb3e803b73f6a2adb5a Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 23 Mar 2022 19:33:06 +0200 Subject: [PATCH 0064/1022] Log pageserver threads better and shut down on errors in them --- pageserver/src/bin/pageserver.rs | 33 +----------------------- pageserver/src/layered_repository.rs | 2 +- pageserver/src/lib.rs | 38 +++++++++++++++++++++++++++- pageserver/src/thread_mgr.rs | 38 +++++++++++++++++++++------- 4 files changed, 68 insertions(+), 43 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5a1b5e5e2c..14249963de 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -26,7 +26,6 @@ use pageserver::{ timelines, virtual_file, LOG_FILE_NAME, }; use zenith_utils::http::endpoint; -use zenith_utils::postgres_backend; use zenith_utils::shutdown::exit_now; use zenith_utils::signals::{self, Signal}; @@ -322,38 +321,8 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() "Got {}. Terminating gracefully in fast shutdown mode", signal.name() ); - shutdown_pageserver(); + pageserver::shutdown_pageserver(); unreachable!() } }) } - -fn shutdown_pageserver() { - // Shut down the libpq endpoint thread. This prevents new connections from - // being accepted. - thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None); - - // Shut down any page service threads. - postgres_backend::set_pgbackend_shutdown_requested(); - thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None); - - // Shut down all the tenants. This flushes everything to disk and kills - // the checkpoint and GC threads. - tenant_mgr::shutdown_all_tenants(); - - // Stop syncing with remote storage. - // - // FIXME: Does this wait for the sync thread to finish syncing what's queued up? - // Should it? - thread_mgr::shutdown_threads(Some(ThreadKind::StorageSync), None, None); - - // Shut down the HTTP endpoint last, so that you can still check the server's - // status while it's shutting down. 
- thread_mgr::shutdown_threads(Some(ThreadKind::HttpEndpointListener), None, None); - - // There should be nothing left, but let's be sure - thread_mgr::shutdown_threads(None, None, None); - - info!("Shut down successfully completed"); - std::process::exit(0); -} diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 7ec11add9c..ac0afcb275 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -976,7 +976,7 @@ impl Timeline for LayeredTimeline { /// Public entry point for checkpoint(). All the logic is in the private /// checkpoint_internal function, this public facade just wraps it for /// metrics collection. - fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> { + fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { match cconf { CheckpointConfig::Flush => self .flush_checkpoint_time_histo diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 3d66192c80..060fa54b23 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -19,8 +19,14 @@ pub mod walrecord; pub mod walredo; use lazy_static::lazy_static; +use tracing::info; use zenith_metrics::{register_int_gauge_vec, IntGaugeVec}; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use zenith_utils::{ + postgres_backend, + zid::{ZTenantId, ZTimelineId}, +}; + +use crate::thread_mgr::ThreadKind; lazy_static! { static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!( @@ -43,3 +49,33 @@ pub enum CheckpointConfig { // Flush all in-memory data and reconstruct all page images Forced, } + +pub fn shutdown_pageserver() { + // Shut down the libpq endpoint thread. This prevents new connections from + // being accepted. + thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None); + + // Shut down any page service threads. + postgres_backend::set_pgbackend_shutdown_requested(); + thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None); + + // Shut down all the tenants. This flushes everything to disk and kills + // the checkpoint and GC threads. + tenant_mgr::shutdown_all_tenants(); + + // Stop syncing with remote storage. + // + // FIXME: Does this wait for the sync thread to finish syncing what's queued up? + // Should it? + thread_mgr::shutdown_threads(Some(ThreadKind::StorageSync), None, None); + + // Shut down the HTTP endpoint last, so that you can still check the server's + // status while it's shutting down. + thread_mgr::shutdown_threads(Some(ThreadKind::HttpEndpointListener), None, None); + + // There should be nothing left, but let's be sure + thread_mgr::shutdown_threads(None, None, None); + + info!("Shut down successfully completed"); + std::process::exit(0); +} diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index d24d6bf016..c4202e80be 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -43,12 +43,14 @@ use std::thread::JoinHandle; use tokio::sync::watch; -use tracing::{info, warn}; +use tracing::{error, info, warn}; use lazy_static::lazy_static; use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use crate::shutdown_pageserver; + lazy_static! { /// Each thread that we track is associated with a "thread ID". 
It's just /// an increasing number that we assign, not related to any system thread @@ -125,7 +127,7 @@ struct PageServerThread { } /// Launch a new thread -pub fn spawn( +pub fn spawn( kind: ThreadKind, tenant_id: Option, timeline_id: Option, @@ -133,7 +135,7 @@ pub fn spawn( f: F, ) -> std::io::Result<()> where - F: FnOnce() -> Result<(), E> + Send + 'static, + F: FnOnce() -> anyhow::Result<()> + Send + 'static, { let (shutdown_tx, shutdown_rx) = watch::channel(()); let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); @@ -160,12 +162,14 @@ where .insert(thread_id, Arc::clone(&thread_rc)); let thread_rc2 = Arc::clone(&thread_rc); + let thread_name = name.to_string(); let join_handle = match thread::Builder::new() .name(name.to_string()) - .spawn(move || thread_wrapper(thread_id, thread_rc2, shutdown_rx, f)) + .spawn(move || thread_wrapper(thread_name, thread_id, thread_rc2, shutdown_rx, f)) { Ok(handle) => handle, Err(err) => { + error!("Failed to spawn thread '{}': {}", name, err); // Could not spawn the thread. Remove the entry THREADS.lock().unwrap().remove(&thread_id); return Err(err); @@ -180,13 +184,14 @@ where /// This wrapper function runs in a newly-spawned thread. It initializes the /// thread-local variables and calls the payload function -fn thread_wrapper( +fn thread_wrapper( + thread_name: String, thread_id: u64, thread: Arc, shutdown_rx: watch::Receiver<()>, f: F, ) where - F: FnOnce() -> Result<(), E> + Send + 'static, + F: FnOnce() -> anyhow::Result<()> + Send + 'static, { SHUTDOWN_RX.with(|rx| { *rx.borrow_mut() = Some(shutdown_rx); @@ -195,6 +200,8 @@ fn thread_wrapper( *ct.borrow_mut() = Some(thread); }); + info!("Starting thread '{}'", thread_name); + // We use AssertUnwindSafe here so that the payload function // doesn't need to be UnwindSafe. We don't do anything after the // unwinding that would expose us to unwind-unsafe behavior. @@ -203,9 +210,22 @@ fn thread_wrapper( // Remove our entry from the global hashmap. THREADS.lock().unwrap().remove(&thread_id); - // If the thread payload panic'd, exit with the panic. 
- if let Err(err) = result { - panic::resume_unwind(err); + match result { + Ok(Ok(())) => info!("Thread '{}' exited normally", thread_name), + Ok(Err(err)) => { + error!( + "Shutting down: thread '{}' exited with error: {:?}", + thread_name, err + ); + shutdown_pageserver(); + } + Err(err) => { + error!( + "Shutting down: thread '{}' panicked: {:?}", + thread_name, err + ); + shutdown_pageserver(); + } } } From b39d1b17177eb6fe9509b87cb8908f8128ab78bc Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 24 Mar 2022 14:05:15 +0200 Subject: [PATCH 0065/1022] Exit only on important thread failures --- pageserver/src/bin/pageserver.rs | 2 ++ pageserver/src/page_service.rs | 1 + pageserver/src/remote_storage/storage_sync.rs | 8 ++--- pageserver/src/tenant_mgr.rs | 35 ++++++++++++------- pageserver/src/thread_mgr.rs | 34 ++++++++++++------ pageserver/src/walreceiver.rs | 11 +++--- 6 files changed, 57 insertions(+), 34 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 14249963de..e217806147 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -291,6 +291,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() None, None, "http_endpoint_thread", + false, move || { let router = http::make_router(conf, auth_cloned, remote_index); endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher()) @@ -304,6 +305,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() None, None, "libpq endpoint thread", + false, move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type), )?; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 6acdc8e93d..4744f0fe52 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -228,6 +228,7 @@ pub fn thread_main( None, None, "serving Page Service thread", + false, move || page_service_conn_main(conf, local_auth, socket, auth_type), ) { // Thread creation failed. Log the error and continue. diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index 4ad28e6f8f..b01b152e0a 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -404,6 +404,7 @@ pub(super) fn spawn_storage_sync_thread< None, None, "Remote storage sync thread", + false, move || { storage_sync_loop( runtime, @@ -413,7 +414,8 @@ pub(super) fn spawn_storage_sync_thread< storage, max_concurrent_sync, max_sync_errors, - ) + ); + Ok(()) }, ) .context("Failed to spawn remote storage sync thread")?; @@ -440,7 +442,7 @@ fn storage_sync_loop< storage: S, max_concurrent_sync: NonZeroUsize, max_sync_errors: NonZeroU32, -) -> anyhow::Result<()> { +) { let remote_assets = Arc::new((storage, Arc::clone(&index))); loop { let index = Arc::clone(&index); @@ -470,8 +472,6 @@ fn storage_sync_loop< } } } - - Ok(()) } async fn loop_step< diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 4d6dfd7488..0bc18231c9 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -206,13 +206,13 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option { /// Change the state of a tenant to Active and launch its checkpointer and GC /// threads. If the tenant was already in Active state or Stopping, does nothing. 
/// -pub fn activate_tenant(conf: &'static PageServerConf, tenantid: ZTenantId) -> Result<()> { +pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Result<()> { let mut m = access_tenants(); let tenant = m - .get_mut(&tenantid) - .with_context(|| format!("Tenant not found for id {}", tenantid))?; + .get_mut(&tenant_id) + .with_context(|| format!("Tenant not found for id {}", tenant_id))?; - info!("activating tenant {}", tenantid); + info!("activating tenant {}", tenant_id); match tenant.state { // If the tenant is already active, nothing to do. @@ -222,22 +222,31 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenantid: ZTenantId) -> Re TenantState::Idle => { thread_mgr::spawn( ThreadKind::Checkpointer, - Some(tenantid), + Some(tenant_id), None, "Checkpointer thread", - move || crate::tenant_threads::checkpoint_loop(tenantid, conf), + true, + move || crate::tenant_threads::checkpoint_loop(tenant_id, conf), )?; - // FIXME: if we fail to launch the GC thread, but already launched the - // checkpointer, we're in a strange state. - - thread_mgr::spawn( + let gc_spawn_result = thread_mgr::spawn( ThreadKind::GarbageCollector, - Some(tenantid), + Some(tenant_id), None, "GC thread", - move || crate::tenant_threads::gc_loop(tenantid, conf), - )?; + true, + move || crate::tenant_threads::gc_loop(tenant_id, conf), + ) + .with_context(|| format!("Failed to launch GC thread for tenant {}", tenant_id)); + + if let Err(e) = &gc_spawn_result { + error!( + "Failed to start GC thread for tenant {}, stopping its checkpointer thread: {:?}", + tenant_id, e + ); + thread_mgr::shutdown_threads(Some(ThreadKind::Checkpointer), Some(tenant_id), None); + return gc_spawn_result; + } tenant.state = TenantState::Active; } diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index c4202e80be..cafdc5e700 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -43,7 +43,7 @@ use std::thread::JoinHandle; use tokio::sync::watch; -use tracing::{error, info, warn}; +use tracing::{debug, error, info, warn}; use lazy_static::lazy_static; @@ -132,6 +132,7 @@ pub fn spawn( tenant_id: Option, timeline_id: Option, name: &str, + fail_on_error: bool, f: F, ) -> std::io::Result<()> where @@ -165,8 +166,16 @@ where let thread_name = name.to_string(); let join_handle = match thread::Builder::new() .name(name.to_string()) - .spawn(move || thread_wrapper(thread_name, thread_id, thread_rc2, shutdown_rx, f)) - { + .spawn(move || { + thread_wrapper( + thread_name, + thread_id, + thread_rc2, + shutdown_rx, + fail_on_error, + f, + ) + }) { Ok(handle) => handle, Err(err) => { error!("Failed to spawn thread '{}': {}", name, err); @@ -189,6 +198,7 @@ fn thread_wrapper( thread_id: u64, thread: Arc, shutdown_rx: watch::Receiver<()>, + fail_on_error: bool, f: F, ) where F: FnOnce() -> anyhow::Result<()> + Send + 'static, @@ -200,7 +210,7 @@ fn thread_wrapper( *ct.borrow_mut() = Some(thread); }); - info!("Starting thread '{}'", thread_name); + debug!("Starting thread '{}'", thread_name); // We use AssertUnwindSafe here so that the payload function // doesn't need to be UnwindSafe. 
We don't do anything after the @@ -211,13 +221,17 @@ fn thread_wrapper( THREADS.lock().unwrap().remove(&thread_id); match result { - Ok(Ok(())) => info!("Thread '{}' exited normally", thread_name), + Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name), Ok(Err(err)) => { - error!( - "Shutting down: thread '{}' exited with error: {:?}", - thread_name, err - ); - shutdown_pageserver(); + if fail_on_error { + error!( + "Shutting down: thread '{}' exited with error: {:?}", + thread_name, err + ); + shutdown_pageserver(); + } else { + error!("Thread '{}' exited with error: {:?}", thread_name, err); + } } Err(err) => { error!( diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 43fb7db4b0..2c10ad315b 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -78,9 +78,11 @@ pub fn launch_wal_receiver( Some(tenantid), Some(timelineid), "WAL receiver thread", + false, move || { IS_WAL_RECEIVER.with(|c| c.set(true)); - thread_main(conf, tenantid, timelineid) + thread_main(conf, tenantid, timelineid); + Ok(()) }, )?; @@ -110,11 +112,7 @@ fn get_wal_producer_connstr(tenantid: ZTenantId, timelineid: ZTimelineId) -> Str // // This is the entry point for the WAL receiver thread. // -fn thread_main( - conf: &'static PageServerConf, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, -) -> Result<()> { +fn thread_main(conf: &'static PageServerConf, tenant_id: ZTenantId, timeline_id: ZTimelineId) { let _enter = info_span!("WAL receiver", timeline = %timeline_id, tenant = %tenant_id).entered(); info!("WAL receiver thread started"); @@ -138,7 +136,6 @@ fn thread_main( // Drop it from list of active WAL_RECEIVERS // so that next callmemaybe request launched a new thread drop_wal_receiver(tenant_id, timeline_id); - Ok(()) } fn walreceiver_main( From e3fa00972e4987f2a3653ab7d547c357a94129fc Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 25 Mar 2022 15:34:38 +0200 Subject: [PATCH 0066/1022] Use RwLocks in image and delta layers for more concurrency. With a Mutex, only one thread could read from the layer at a time. I did some ad hoc profiling with pgbench and saw that a fair amout of time was spent blocked on these Mutexes. --- .../src/layered_repository/delta_layer.rs | 51 ++++++++++++++----- .../src/layered_repository/image_layer.rs | 46 ++++++++++++----- 2 files changed, 72 insertions(+), 25 deletions(-) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index f6e5510339..1a6e941fbe 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -58,7 +58,7 @@ use std::io::{BufWriter, Write}; use std::ops::Bound::Included; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; -use std::sync::{Mutex, MutexGuard}; +use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError}; use bookfile::{Book, BookWriter, BoundedReader, ChapterWriter}; @@ -142,7 +142,7 @@ pub struct DeltaLayer { dropped: bool, - inner: Mutex, + inner: RwLock, } pub struct DeltaLayerInner { @@ -316,7 +316,11 @@ impl Layer for DeltaLayer { /// it will need to be loaded back. 
/// fn unload(&self) -> Result<()> { - let mut inner = self.inner.lock().unwrap(); + let mut inner = match self.inner.try_write() { + Ok(inner) => inner, + Err(TryLockError::WouldBlock) => return Ok(()), + Err(TryLockError::Poisoned(_)) => panic!("DeltaLayer lock was poisoned"), + }; inner.page_version_metas = VecMap::default(); inner.seg_sizes = VecMap::default(); inner.loaded = false; @@ -406,16 +410,37 @@ impl DeltaLayer { } /// - /// Load the contents of the file into memory + /// Open the underlying file and read the metadata into memory, if it's + /// not loaded already. /// - fn load(&self) -> Result> { - // quick exit if already loaded - let mut inner = self.inner.lock().unwrap(); + fn load(&self) -> Result> { + loop { + // Quick exit if already loaded + let inner = self.inner.read().unwrap(); + if inner.loaded { + return Ok(inner); + } - if inner.loaded { - return Ok(inner); + // Need to open the file and load the metadata. Upgrade our lock to + // a write lock. (Or rather, release and re-lock in write mode.) + drop(inner); + let inner = self.inner.write().unwrap(); + if !inner.loaded { + self.load_inner(inner)?; + } else { + // Another thread loaded it while we were not holding the lock. + } + + // We now have the file open and loaded. There's no function to do + // that in the std library RwLock, so we have to release and re-lock + // in read mode. (To be precise, the lock guard was moved in the + // above call to `load_inner`, so it's already been released). And + // while we do that, another thread could unload again, so we have + // to re-check and retry if that happens. } + } + fn load_inner(&self, mut inner: RwLockWriteGuard) -> Result<()> { let path = self.path(); // Open the file if it's not open already. @@ -462,7 +487,7 @@ impl DeltaLayer { inner.seg_sizes = seg_sizes; inner.loaded = true; - Ok(inner) + Ok(()) } /// Create a DeltaLayer struct representing an existing file on disk. 
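The `load()` rewrite above has to release the read lock, take the write lock, and then loop, because std's `RwLock` offers no guard upgrade or downgrade. A simplified, self-contained sketch of that double-checked pattern, with an illustrative `Lazy` type standing in for the layer's inner state:

```rust
use std::sync::{RwLock, RwLockReadGuard};

struct Lazy {
    inner: RwLock<Option<String>>,
}

impl Lazy {
    fn new() -> Self {
        Lazy {
            inner: RwLock::new(None),
        }
    }

    fn get(&self) -> RwLockReadGuard<'_, Option<String>> {
        loop {
            // Fast path: already loaded, return while still holding the read lock.
            let guard = self.inner.read().unwrap();
            if guard.is_some() {
                return guard;
            }

            // Not loaded yet: release the read lock and take the write lock.
            drop(guard);
            let mut write_guard = self.inner.write().unwrap();
            if write_guard.is_none() {
                *write_guard = Some("expensive-to-load data".to_string());
            } else {
                // Another thread loaded it while we were between the two locks.
            }

            // std's RwLock cannot downgrade a write guard to a read guard, so drop
            // it and retry from the top (the real code must additionally tolerate a
            // concurrent unload() happening in this window).
            drop(write_guard);
        }
    }
}

fn main() {
    let lazy = Lazy::new();
    assert!(lazy.get().is_some());
}
```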
@@ -480,7 +505,7 @@ impl DeltaLayer { start_lsn: filename.start_lsn, end_lsn: filename.end_lsn, dropped: filename.dropped, - inner: Mutex::new(DeltaLayerInner { + inner: RwLock::new(DeltaLayerInner { loaded: false, book: None, page_version_metas: VecMap::default(), @@ -507,7 +532,7 @@ impl DeltaLayer { start_lsn: summary.start_lsn, end_lsn: summary.end_lsn, dropped: summary.dropped, - inner: Mutex::new(DeltaLayerInner { + inner: RwLock::new(DeltaLayerInner { loaded: false, book: None, page_version_metas: VecMap::default(), @@ -689,7 +714,7 @@ impl DeltaLayerWriter { start_lsn: self.start_lsn, end_lsn: self.end_lsn, dropped: self.dropped, - inner: Mutex::new(DeltaLayerInner { + inner: RwLock::new(DeltaLayerInner { loaded: false, book: None, page_version_metas: VecMap::default(), diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index c706f58e39..5b8ec46452 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -37,7 +37,7 @@ use std::convert::TryInto; use std::fs; use std::io::{BufWriter, Write}; use std::path::{Path, PathBuf}; -use std::sync::{Mutex, MutexGuard}; +use std::sync::{RwLock, RwLockReadGuard}; use bookfile::{Book, BookWriter, ChapterWriter}; @@ -93,7 +93,7 @@ pub struct ImageLayer { // This entry contains an image of all pages as of this LSN pub lsn: Lsn, - inner: Mutex, + inner: RwLock, } #[derive(Clone)] @@ -273,16 +273,38 @@ impl ImageLayer { } /// - /// Load the contents of the file into memory + /// Open the underlying file and read the metadata into memory, if it's + /// not loaded already. /// - fn load(&self) -> Result> { - // quick exit if already loaded - let mut inner = self.inner.lock().unwrap(); + fn load(&self) -> Result> { + loop { + // Quick exit if already loaded + let inner = self.inner.read().unwrap(); + if inner.book.is_some() { + return Ok(inner); + } - if inner.book.is_some() { - return Ok(inner); + // Need to open the file and load the metadata. Upgrade our lock to + // a write lock. (Or rather, release and re-lock in write mode.) + drop(inner); + let mut inner = self.inner.write().unwrap(); + if inner.book.is_none() { + self.load_inner(&mut inner)?; + } else { + // Another thread loaded it while we were not holding the lock. + } + + // We now have the file open and loaded. There's no function to do + // that in the std library RwLock, so we have to release and re-lock + // in read mode. (To be precise, the lock guard was moved in the + // above call to `load_inner`, so it's already been released). And + // while we do that, another thread could unload again, so we have + // to re-check and retry if that happens. 
+ drop(inner); } + } + fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> { let path = self.path(); let file = VirtualFile::open(&path) .with_context(|| format!("Failed to open virtual file '{}'", path.display()))?; @@ -336,7 +358,7 @@ impl ImageLayer { image_type, }; - Ok(inner) + Ok(()) } /// Create an ImageLayer struct representing an existing file on disk @@ -352,7 +374,7 @@ impl ImageLayer { tenantid, seg: filename.seg, lsn: filename.lsn, - inner: Mutex::new(ImageLayerInner { + inner: RwLock::new(ImageLayerInner { book: None, image_type: ImageType::Blocky { num_blocks: 0 }, }), @@ -375,7 +397,7 @@ impl ImageLayer { tenantid: summary.tenantid, seg: summary.seg, lsn: summary.lsn, - inner: Mutex::new(ImageLayerInner { + inner: RwLock::new(ImageLayerInner { book: None, image_type: ImageType::Blocky { num_blocks: 0 }, }), @@ -522,7 +544,7 @@ impl ImageLayerWriter { tenantid: self.tenantid, seg: self.seg, lsn: self.lsn, - inner: Mutex::new(ImageLayerInner { + inner: RwLock::new(ImageLayerInner { book: None, image_type, }), From b8cba059a59f1c5e74cd8160af6aee4658c9744e Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 25 Mar 2022 20:52:58 +0200 Subject: [PATCH 0067/1022] temporary disable s3 integration on staging until LSM storge rewrite lands --- .circleci/ansible/deploy.yaml | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index 2379ef8510..1f43adf950 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -91,19 +91,20 @@ tags: - pageserver - - name: update config - when: current_version > remote_version or force_deploy - lineinfile: - path: /storage/pageserver/data/pageserver.toml - line: "{{ item }}" - loop: - - "[remote_storage]" - - "bucket_name = '{{ bucket_name }}'" - - "bucket_region = '{{ bucket_region }}'" - - "prefix_in_bucket = '{{ inventory_hostname }}'" - become: true - tags: - - pageserver + # Temporary disabled until LSM storage rewrite lands + # - name: update config + # when: current_version > remote_version or force_deploy + # lineinfile: + # path: /storage/pageserver/data/pageserver.toml + # line: "{{ item }}" + # loop: + # - "[remote_storage]" + # - "bucket_name = '{{ bucket_name }}'" + # - "bucket_region = '{{ bucket_region }}'" + # - "prefix_in_bucket = '{{ inventory_hostname }}'" + # become: true + # tags: + # - pageserver - name: upload systemd service definition when: current_version > remote_version or force_deploy From 5e04dad3604ddc6da58558425f44c9e6b3f05def Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 25 Mar 2022 23:42:13 +0200 Subject: [PATCH 0068/1022] Add more variants of the sequential scan performance tests. More rows, and test with serial and parallel plans. But fewer iterations, so that the tests run in < 1 minutes, and we don't need to mark them as "slow". 
--- ...est_small_seqscans.py => test_seqscans.py} | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) rename test_runner/performance/{test_small_seqscans.py => test_seqscans.py} (65%) diff --git a/test_runner/performance/test_small_seqscans.py b/test_runner/performance/test_seqscans.py similarity index 65% rename from test_runner/performance/test_small_seqscans.py rename to test_runner/performance/test_seqscans.py index b98018ad97..85d0a24510 100644 --- a/test_runner/performance/test_small_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -1,8 +1,5 @@ # Test sequential scan speed # -# The test table is large enough (3-4 MB) that it doesn't fit in the compute node -# cache, so the seqscans go to the page server. But small enough that it fits -# into memory in the page server. from contextlib import closing from dataclasses import dataclass from fixtures.zenith_fixtures import ZenithEnv @@ -12,11 +9,18 @@ from fixtures.compare_fixtures import PgCompare import pytest -@pytest.mark.parametrize('rows', [ - pytest.param(100000), - pytest.param(1000000, marks=pytest.mark.slow), -]) -def test_small_seqscans(zenith_with_baseline: PgCompare, rows: int): +@pytest.mark.parametrize( + 'rows,iters,workers', + [ + # The test table is large enough (3-4 MB) that it doesn't fit in the compute node + # cache, so the seqscans go to the page server. But small enough that it fits + # into memory in the page server. + pytest.param(100000, 100, 0), + # Also test with a larger table, with and without parallelism + pytest.param(10000000, 1, 0), + pytest.param(10000000, 1, 4) + ]) +def test_seqscans(zenith_with_baseline: PgCompare, rows: int, iters: int, workers: int): env = zenith_with_baseline with closing(env.pg.connect()) as conn: @@ -36,6 +40,8 @@ def test_small_seqscans(zenith_with_baseline: PgCompare, rows: int): assert int(shared_buffers) < int(table_size) env.zenbenchmark.record("table_size", table_size, 'bytes', MetricReport.TEST_PARAM) + cur.execute(f"set max_parallel_workers_per_gather = {workers}") + with env.record_duration('run'): - for i in range(1000): + for i in range(iters): cur.execute('select count(*) from t;') From 18dfc769d814f9753eb611a85d1ebeb81de0dafe Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 25 Mar 2022 11:27:21 +0200 Subject: [PATCH 0069/1022] Use cachepot to cache more rustc builds --- .circleci/config.yml | 15 +++++++++++++-- Dockerfile | 1 - Dockerfile.compute-tools | 9 +++++++-- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index d342e7c9f4..f05ad3e816 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -111,7 +111,12 @@ jobs: fi export CARGO_INCREMENTAL=0 + export CACHEPOT_BUCKET=zenith-rust-cachepot + export RUSTC_WRAPPER=cachepot + export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" + export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests + cachepot -s - save_cache: name: Save rust cache @@ -464,7 +469,10 @@ jobs: name: Build and push compute-tools Docker image command: | echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin - docker build -t zenithdb/compute-tools:latest -f Dockerfile.compute-tools . + docker build \ + --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ + --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ + --tag zenithdb/compute-tools:latest -f Dockerfile.compute-tools . 
docker push zenithdb/compute-tools:latest - run: name: Init postgres submodule @@ -518,7 +526,10 @@ jobs: name: Build and push compute-tools Docker image command: | echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin - docker build -t zenithdb/compute-tools:release -f Dockerfile.compute-tools . + docker build \ + --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ + --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ + --tag zenithdb/compute-tools:release -f Dockerfile.compute-tools . docker push zenithdb/compute-tools:release - run: name: Init postgres submodule diff --git a/Dockerfile b/Dockerfile index 5e55cd834f..babc3b8e1d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,6 @@ ARG GIT_VERSION=local ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY -#ENV RUSTC_WRAPPER cachepot ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index a1f7582ee4..f7672251e6 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,12 +1,17 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .circle/config.yml -FROM rust:1.56.1-slim-buster AS rust-build +FROM zenithdb/build:buster-20220309 AS rust-build WORKDIR /zenith +ARG CACHEPOT_BUCKET=zenith-rust-cachepot +ARG AWS_ACCESS_KEY_ID +ARG AWS_SECRET_ACCESS_KEY +ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot + COPY . . -RUN cargo build -p compute_tools --release +RUN cargo build -p compute_tools --release && /usr/local/cargo/bin/cachepot -s # Final image that only has one binary FROM debian:buster-slim From d56a0ee19aeec715f9c839a9bcdc91c650000f1e Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 25 Mar 2022 11:48:30 +0200 Subject: [PATCH 0070/1022] Avoid recompiling tests for release profile --- .circleci/config.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index f05ad3e816..513d305b5d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -146,11 +146,13 @@ jobs: command: | if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) + CARGO_FLAGS= elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix=() + CARGO_FLAGS=--release fi - "${cov_prefix[@]}" cargo test + "${cov_prefix[@]}" cargo test $CARGO_FLAGS # Install the rust binaries, for use by test jobs - run: From 55de0b88f5b02fe4a77d7b78640b51ca9f236baa Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 25 Mar 2022 23:53:37 +0200 Subject: [PATCH 0071/1022] Hide remote timeline index access details --- pageserver/src/http/routes.rs | 30 ++++++---- pageserver/src/layered_repository.rs | 10 ++-- pageserver/src/remote_storage.rs | 9 ++- pageserver/src/remote_storage/storage_sync.rs | 58 ++++++++++--------- .../remote_storage/storage_sync/download.rs | 30 +++++----- .../src/remote_storage/storage_sync/index.rs | 34 +++++++++-- .../src/remote_storage/storage_sync/upload.rs | 49 +++++++--------- pageserver/src/repository.rs | 6 +- pageserver/src/tenant_mgr.rs | 10 ++-- pageserver/src/timelines.rs | 25 ++------ 10 files changed, 134 insertions(+), 127 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3ca8b6334a..13e79f8f55 100644 --- a/pageserver/src/http/routes.rs +++ 
b/pageserver/src/http/routes.rs @@ -3,7 +3,6 @@ use std::sync::Arc; use anyhow::Result; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; -use tokio::sync::RwLock; use tracing::*; use zenith_utils::auth::JwtAuth; use zenith_utils::http::endpoint::attach_openapi_ui; @@ -22,17 +21,14 @@ use zenith_utils::zid::{ZTenantTimelineId, ZTimelineId}; use super::models::{ StatusResponse, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest, }; -use crate::remote_storage::{schedule_timeline_download, RemoteTimelineIndex}; -use crate::timelines::{ - extract_remote_timeline_info, LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo, -}; +use crate::remote_storage::{schedule_timeline_download, RemoteIndex}; +use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId}; -#[derive(Debug)] struct State { conf: &'static PageServerConf, auth: Option>, - remote_index: Arc>, + remote_index: RemoteIndex, allowlist_routes: Vec, } @@ -40,7 +36,7 @@ impl State { fn new( conf: &'static PageServerConf, auth: Option>, - remote_index: Arc>, + remote_index: RemoteIndex, ) -> Self { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() @@ -113,14 +109,24 @@ async fn timeline_list_handler(request: Request) -> Result, .await .map_err(ApiError::from_err)??; - let remote_index = get_state(&request).remote_index.read().await; let mut response_data = Vec::with_capacity(local_timeline_infos.len()); for (timeline_id, local_timeline_info) in local_timeline_infos { response_data.push(TimelineInfo { tenant_id, timeline_id, local: Some(local_timeline_info), - remote: extract_remote_timeline_info(tenant_id, timeline_id, &remote_index), + remote: get_state(&request) + .remote_index + .read() + .await + .timeline_entry(&ZTenantTimelineId { + tenant_id, + timeline_id, + }) + .map(|remote_entry| RemoteTimelineInfo { + remote_consistent_lsn: remote_entry.disk_consistent_lsn(), + awaits_download: remote_entry.get_awaits_download(), + }), }) } @@ -277,7 +283,7 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result, ApiError> { pub fn make_router( conf: &'static PageServerConf, auth: Option>, - remote_index: Arc>, + remote_index: RemoteIndex, ) -> RouterBuilder { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index ac0afcb275..bf5f52b18d 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -35,7 +35,7 @@ use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; use crate::config::PageServerConf; use crate::page_cache; use crate::relish::*; -use crate::remote_storage::{schedule_timeline_checkpoint_upload, RemoteTimelineIndex}; +use crate::remote_storage::{schedule_timeline_checkpoint_upload, RemoteIndex}; use crate::repository::{ BlockNumber, GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, TimelineWriter, ZenithWalRecord, @@ -132,7 +132,7 @@ pub struct LayeredRepository { // provides access to timeline data sitting in the remote storage // supposed to be used for retrieval of remote consistent lsn in walreceiver - remote_index: Arc>, + remote_index: RemoteIndex, /// Makes every timeline to backup their files to remote storage. 
upload_relishes: bool, @@ -355,8 +355,8 @@ impl Repository for LayeredRepository { Ok(()) } - fn get_remote_index(&self) -> &tokio::sync::RwLock { - self.remote_index.as_ref() + fn get_remote_index(&self) -> &RemoteIndex { + &self.remote_index } } @@ -511,7 +511,7 @@ impl LayeredRepository { conf: &'static PageServerConf, walredo_mgr: Arc, tenantid: ZTenantId, - remote_index: Arc>, + remote_index: RemoteIndex, upload_relishes: bool, ) -> LayeredRepository { LayeredRepository { diff --git a/pageserver/src/remote_storage.rs b/pageserver/src/remote_storage.rs index 6eb7bd910b..bdd6086b94 100644 --- a/pageserver/src/remote_storage.rs +++ b/pageserver/src/remote_storage.rs @@ -89,15 +89,14 @@ use std::{ collections::HashMap, ffi, fs, path::{Path, PathBuf}, - sync::Arc, }; use anyhow::{bail, Context}; -use tokio::{io, sync::RwLock}; +use tokio::io; use tracing::{debug, error, info}; use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; -pub use self::storage_sync::index::{RemoteTimelineIndex, TimelineIndexEntry}; +pub use self::storage_sync::index::{RemoteIndex, TimelineIndexEntry}; pub use self::storage_sync::{schedule_timeline_checkpoint_upload, schedule_timeline_download}; use self::{local_fs::LocalFs, rust_s3::S3}; use crate::layered_repository::ephemeral_file::is_ephemeral_file; @@ -120,7 +119,7 @@ type LocalTimelineInitStatuses = HashMap>, + pub remote_index: RemoteIndex, pub local_timeline_init_statuses: LocalTimelineInitStatuses, } @@ -172,7 +171,7 @@ pub fn start_local_timeline_sync( } Ok(SyncStartupData { local_timeline_init_statuses, - remote_index: Arc::new(RwLock::new(RemoteTimelineIndex::empty())), + remote_index: RemoteIndex::empty(), }) } } diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index b01b152e0a..9fe2ab2847 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -25,6 +25,7 @@ //! * all never local state gets scheduled for upload, such timelines are "local" and fully operational //! * the rest of the remote timelines are reported to pageserver, but not downloaded before they are actually accessed in pageserver, //! it may schedule the download on such occasions. +//! Then, the index is shared across pageserver under [`RemoteIndex`] guard to ensure proper synchronization. //! //! The synchronization unit is an archive: a set of timeline files (or relishes) and a special metadata file, all compressed into a blob. //! Currently, there's no way to process an archive partially, if the archive processing fails, it has to be started from zero next time again. 
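Later in this patch the shared index becomes a `RemoteIndex` newtype around `Arc<tokio::sync::RwLock<..>>` (see the index.rs hunk below), so the locking discipline lives in one place and callers just clone a cheap handle. A stripped-down sketch of that wrapper pattern, assuming tokio with its `sync`, `rt` and `macros` features and illustrative `Index`/`SharedIndex` names:

```rust
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};

#[derive(Default)]
struct Index {
    entries: HashMap<String, u64>,
}

#[derive(Clone, Default)]
struct SharedIndex(Arc<RwLock<Index>>);

impl SharedIndex {
    async fn read(&self) -> RwLockReadGuard<'_, Index> {
        self.0.read().await
    }

    async fn write(&self) -> RwLockWriteGuard<'_, Index> {
        self.0.write().await
    }
}

#[tokio::main]
async fn main() {
    let index = SharedIndex::default();

    // Clones share the same underlying map; readers and writers meet at the lock.
    let writer = index.clone();
    writer.write().await.entries.insert("timeline-1".to_owned(), 42);

    assert_eq!(index.read().await.entries.get("timeline-1"), Some(&42));
}
```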
@@ -80,10 +81,7 @@ use futures::stream::{FuturesUnordered, StreamExt}; use lazy_static::lazy_static; use tokio::{ runtime::Runtime, - sync::{ - mpsc::{self, UnboundedReceiver}, - RwLock, - }, + sync::mpsc::{self, UnboundedReceiver}, time::{Duration, Instant}, }; use tracing::*; @@ -92,8 +90,8 @@ use self::{ compression::ArchiveHeader, download::{download_timeline, DownloadedTimeline}, index::{ - ArchiveDescription, ArchiveId, RemoteTimeline, RemoteTimelineIndex, TimelineIndexEntry, - TimelineIndexEntryInner, + ArchiveDescription, ArchiveId, RemoteIndex, RemoteTimeline, RemoteTimelineIndex, + TimelineIndexEntry, TimelineIndexEntryInner, }, upload::upload_timeline_checkpoint, }; @@ -392,13 +390,14 @@ pub(super) fn spawn_storage_sync_thread< None } }); - let mut remote_index = - RemoteTimelineIndex::try_parse_descriptions_from_paths(conf, download_paths); + let remote_index = RemoteIndex::try_parse_descriptions_from_paths(conf, download_paths); - let local_timeline_init_statuses = - schedule_first_sync_tasks(&mut remote_index, local_timeline_files); - let remote_index = Arc::new(RwLock::new(remote_index)); - let remote_index_cloned = Arc::clone(&remote_index); + let local_timeline_init_statuses = schedule_first_sync_tasks( + &mut runtime.block_on(remote_index.write()), + local_timeline_files, + ); + + let loop_index = remote_index.clone(); thread_mgr::spawn( ThreadKind::StorageSync, None, @@ -410,7 +409,7 @@ pub(super) fn spawn_storage_sync_thread< runtime, conf, receiver, - remote_index_cloned, + loop_index, storage, max_concurrent_sync, max_sync_errors, @@ -438,14 +437,14 @@ fn storage_sync_loop< runtime: Runtime, conf: &'static PageServerConf, mut receiver: UnboundedReceiver, - index: Arc>, + index: RemoteIndex, storage: S, max_concurrent_sync: NonZeroUsize, max_sync_errors: NonZeroU32, ) { - let remote_assets = Arc::new((storage, Arc::clone(&index))); + let remote_assets = Arc::new((storage, index.clone())); loop { - let index = Arc::clone(&index); + let index = index.clone(); let loop_step = runtime.block_on(async { tokio::select! 
{ new_timeline_states = loop_step( @@ -480,7 +479,7 @@ async fn loop_step< >( conf: &'static PageServerConf, receiver: &mut UnboundedReceiver, - remote_assets: Arc<(S, Arc>)>, + remote_assets: Arc<(S, RemoteIndex)>, max_concurrent_sync: NonZeroUsize, max_sync_errors: NonZeroU32, ) -> HashMap> { @@ -560,7 +559,7 @@ async fn process_task< S: RemoteStorage + Send + Sync + 'static, >( conf: &'static PageServerConf, - remote_assets: Arc<(S, Arc>)>, + remote_assets: Arc<(S, RemoteIndex)>, task: SyncTask, max_sync_errors: NonZeroU32, ) -> Option { @@ -584,7 +583,7 @@ async fn process_task< tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; } - let remote_index = Arc::clone(&remote_assets.1); + let remote_index = &remote_assets.1; let sync_start = Instant::now(); let sync_name = task.kind.sync_name(); @@ -592,7 +591,7 @@ async fn process_task< SyncKind::Download(download_data) => { let download_result = download_timeline( conf, - remote_assets, + remote_assets.clone(), task.sync_id, download_data, task.retries + 1, @@ -772,7 +771,7 @@ async fn fetch_full_index< P: Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, >( - (storage, index): &(S, Arc>), + (storage, index): &(S, RemoteIndex), timeline_dir: &Path, id: ZTenantTimelineId, ) -> anyhow::Result { @@ -808,8 +807,9 @@ async fn fetch_full_index< } }; drop(index_read); // tokio rw lock is not upgradeable - let mut index_write = index.write().await; - index_write + index + .write() + .await .upgrade_timeline_entry(&id, full_index.clone()) .context("cannot upgrade timeline entry in remote index")?; Ok(full_index) @@ -855,7 +855,7 @@ mod test_utils { #[track_caller] pub async fn ensure_correct_timeline_upload( harness: &RepoHarness, - remote_assets: Arc<(LocalFs, Arc>)>, + remote_assets: Arc<(LocalFs, RemoteIndex)>, timeline_id: ZTimelineId, new_upload: NewCheckpoint, ) { @@ -872,7 +872,7 @@ mod test_utils { let (storage, index) = remote_assets.as_ref(); assert_index_descriptions( index, - RemoteTimelineIndex::try_parse_descriptions_from_paths( + &RemoteIndex::try_parse_descriptions_from_paths( harness.conf, remote_assets .0 @@ -914,7 +914,7 @@ mod test_utils { } pub async fn expect_timeline( - index: &Arc>, + index: &RemoteIndex, sync_id: ZTenantTimelineId, ) -> RemoteTimeline { if let Some(TimelineIndexEntryInner::Full(remote_timeline)) = index @@ -934,9 +934,11 @@ mod test_utils { #[track_caller] pub async fn assert_index_descriptions( - index: &Arc>, - expected_index_with_descriptions: RemoteTimelineIndex, + index: &RemoteIndex, + expected_index_with_descriptions: &RemoteIndex, ) { + let expected_index_with_descriptions = expected_index_with_descriptions.read().await; + let index_read = index.read().await; let actual_sync_ids = index_read.all_sync_ids().collect::>(); let expected_sync_ids = expected_index_with_descriptions diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/remote_storage/storage_sync/download.rs index e5362b2973..32549c8650 100644 --- a/pageserver/src/remote_storage/storage_sync/download.rs +++ b/pageserver/src/remote_storage/storage_sync/download.rs @@ -3,7 +3,7 @@ use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc}; use anyhow::{ensure, Context}; -use tokio::{fs, sync::RwLock}; +use tokio::fs; use tracing::{debug, error, trace, warn}; use zenith_utils::zid::ZTenantId; @@ -20,8 +20,8 @@ use crate::{ }; use super::{ - index::{ArchiveId, RemoteTimeline, RemoteTimelineIndex}, - TimelineDownload, + index::{ArchiveId, RemoteTimeline}, + RemoteIndex, 
TimelineDownload, }; /// Timeline download result, with extra data, needed for downloading. @@ -47,7 +47,7 @@ pub(super) async fn download_timeline< S: RemoteStorage + Send + Sync + 'static, >( conf: &'static PageServerConf, - remote_assets: Arc<(S, Arc>)>, + remote_assets: Arc<(S, RemoteIndex)>, sync_id: ZTenantTimelineId, mut download: TimelineDownload, retries: u32, @@ -167,7 +167,7 @@ async fn try_download_archive< tenant_id, timeline_id, }: ZTenantTimelineId, - remote_assets: Arc<(S, Arc>)>, + remote_assets: Arc<(S, RemoteIndex)>, remote_timeline: &RemoteTimeline, archive_id: ArchiveId, files_to_skip: Arc>, @@ -255,16 +255,14 @@ mod tests { let repo_harness = RepoHarness::create("test_download_timeline")?; let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID); let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?; - let index = Arc::new(RwLock::new( - RemoteTimelineIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - storage - .list() - .await? - .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), - ), - )); + let index = RemoteIndex::try_parse_descriptions_from_paths( + repo_harness.conf, + storage + .list() + .await? + .into_iter() + .map(|storage_path| storage.local_path(&storage_path).unwrap()), + ); let remote_assets = Arc::new((storage, index)); let storage = &remote_assets.0; let index = &remote_assets.1; @@ -314,7 +312,7 @@ mod tests { .await; assert_index_descriptions( index, - RemoteTimelineIndex::try_parse_descriptions_from_paths( + &RemoteIndex::try_parse_descriptions_from_paths( repo_harness.conf, remote_assets .0 diff --git a/pageserver/src/remote_storage/storage_sync/index.rs b/pageserver/src/remote_storage/storage_sync/index.rs index 7d6b4881f7..d7bd1f1657 100644 --- a/pageserver/src/remote_storage/storage_sync/index.rs +++ b/pageserver/src/remote_storage/storage_sync/index.rs @@ -7,10 +7,12 @@ use std::{ collections::{BTreeMap, BTreeSet, HashMap}, path::{Path, PathBuf}, + sync::Arc, }; use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; +use tokio::sync::RwLock; use tracing::*; use zenith_utils::{ lsn::Lsn, @@ -55,11 +57,14 @@ pub struct RemoteTimelineIndex { timeline_entries: HashMap, } -impl RemoteTimelineIndex { +/// A wrapper to synchrnize access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`]. 
+pub struct RemoteIndex(Arc>); + +impl RemoteIndex { pub fn empty() -> Self { - Self { + Self(Arc::new(RwLock::new(RemoteTimelineIndex { timeline_entries: HashMap::new(), - } + }))) } /// Attempts to parse file paths (not checking the file contents) and find files @@ -69,7 +74,9 @@ impl RemoteTimelineIndex { conf: &'static PageServerConf, paths: impl Iterator, ) -> Self { - let mut index = Self::empty(); + let mut index = RemoteTimelineIndex { + timeline_entries: HashMap::new(), + }; for path in paths { if let Err(e) = try_parse_index_entry(&mut index, conf, path.as_ref()) { debug!( @@ -79,9 +86,26 @@ impl RemoteTimelineIndex { ); } } - index + + Self(Arc::new(RwLock::new(index))) } + pub async fn read(&self) -> tokio::sync::RwLockReadGuard<'_, RemoteTimelineIndex> { + self.0.read().await + } + + pub async fn write(&self) -> tokio::sync::RwLockWriteGuard<'_, RemoteTimelineIndex> { + self.0.write().await + } +} + +impl Clone for RemoteIndex { + fn clone(&self) -> Self { + Self(Arc::clone(&self.0)) + } +} + +impl RemoteTimelineIndex { pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&TimelineIndexEntry> { self.timeline_entries.get(id) } diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/remote_storage/storage_sync/upload.rs index dfc4433694..76e92c2781 100644 --- a/pageserver/src/remote_storage/storage_sync/upload.rs +++ b/pageserver/src/remote_storage/storage_sync/upload.rs @@ -2,7 +2,6 @@ use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc}; -use tokio::sync::RwLock; use tracing::{debug, error, warn}; use crate::{ @@ -17,7 +16,7 @@ use crate::{ }, }; -use super::{compression::ArchiveHeader, index::RemoteTimelineIndex, NewCheckpoint}; +use super::{compression::ArchiveHeader, NewCheckpoint, RemoteIndex}; /// Attempts to compress and upload given checkpoint files. /// No extra checks for overlapping files is made: download takes care of that, ensuring no non-metadata local timeline files are overwritten. @@ -29,7 +28,7 @@ pub(super) async fn upload_timeline_checkpoint< S: RemoteStorage + Send + Sync + 'static, >( config: &'static PageServerConf, - remote_assets: Arc<(S, Arc>)>, + remote_assets: Arc<(S, RemoteIndex)>, sync_id: ZTenantTimelineId, new_checkpoint: NewCheckpoint, retries: u32, @@ -156,7 +155,7 @@ async fn try_upload_checkpoint< S: RemoteStorage + Send + Sync + 'static, >( config: &'static PageServerConf, - remote_assets: Arc<(S, Arc>)>, + remote_assets: Arc<(S, RemoteIndex)>, sync_id: ZTenantTimelineId, new_checkpoint: &NewCheckpoint, files_to_skip: BTreeSet, @@ -238,16 +237,14 @@ mod tests { let repo_harness = RepoHarness::create("reupload_timeline")?; let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID); let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?; - let index = Arc::new(RwLock::new( - RemoteTimelineIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - storage - .list() - .await? - .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), - ), - )); + let index = RemoteIndex::try_parse_descriptions_from_paths( + repo_harness.conf, + storage + .list() + .await? 
+ .into_iter() + .map(|storage_path| storage.local_path(&storage_path).unwrap()), + ); let remote_assets = Arc::new((storage, index)); let index = &remote_assets.1; @@ -436,16 +433,14 @@ mod tests { let repo_harness = RepoHarness::create("reupload_timeline_rejected")?; let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID); let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?; - let index = Arc::new(RwLock::new( - RemoteTimelineIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - storage - .list() - .await? - .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), - ), - )); + let index = RemoteIndex::try_parse_descriptions_from_paths( + repo_harness.conf, + storage + .list() + .await? + .into_iter() + .map(|storage_path| storage.local_path(&storage_path).unwrap()), + ); let remote_assets = Arc::new((storage, index)); let storage = &remote_assets.0; let index = &remote_assets.1; @@ -464,7 +459,7 @@ mod tests { first_checkpoint, ) .await; - let after_first_uploads = RemoteTimelineIndex::try_parse_descriptions_from_paths( + let after_first_uploads = RemoteIndex::try_parse_descriptions_from_paths( repo_harness.conf, remote_assets .0 @@ -495,7 +490,7 @@ mod tests { 0, ) .await; - assert_index_descriptions(index, after_first_uploads.clone()).await; + assert_index_descriptions(index, &after_first_uploads).await; let checkpoint_with_uploaded_lsn = create_local_timeline( &repo_harness, @@ -511,7 +506,7 @@ mod tests { 0, ) .await; - assert_index_descriptions(index, after_first_uploads.clone()).await; + assert_index_descriptions(index, &after_first_uploads).await; Ok(()) } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 074bdf4d01..36273e6d6c 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,6 +1,6 @@ use crate::layered_repository::metadata::TimelineMetadata; use crate::relish::*; -use crate::remote_storage::RemoteTimelineIndex; +use crate::remote_storage::RemoteIndex; use crate::walrecord::MultiXactMember; use crate::CheckpointConfig; use anyhow::Result; @@ -91,7 +91,7 @@ pub trait Repository: Send + Sync { fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>; // Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn. - fn get_remote_index(&self) -> &tokio::sync::RwLock; + fn get_remote_index(&self) -> &RemoteIndex; } /// A timeline, that belongs to the current repository. 
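`get_remote_index()` exposes the async-locked index to synchronous callers such as the WAL receiver thread, and the storage sync code bridges the two worlds with `runtime.block_on(..)`. A small self-contained sketch of that bridge, assuming tokio and `anyhow`, with a plain `u64` standing in for the remote consistent LSN:

```rust
use std::sync::Arc;
use tokio::runtime::Runtime;
use tokio::sync::RwLock;

fn main() -> anyhow::Result<()> {
    let runtime = Runtime::new()?;
    let remote_consistent_lsn = Arc::new(RwLock::new(0u64));

    // An async task updates the value under the lock...
    let writer = Arc::clone(&remote_consistent_lsn);
    runtime.block_on(async move {
        *writer.write().await = 0x0169_B188;
    });

    // ...and synchronous code reads it by blocking on the same async lock.
    let lsn = runtime.block_on(remote_consistent_lsn.read());
    println!("remote consistent lsn: {:#x}", *lsn);

    Ok(())
}
```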
@@ -407,7 +407,7 @@ pub mod repo_harness { self.conf, walredo_mgr, self.tenant_id, - Arc::new(tokio::sync::RwLock::new(RemoteTimelineIndex::empty())), + RemoteIndex::empty(), false, )); // populate repo with locally available timelines diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 0bc18231c9..e7cc4ecbaf 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,7 +3,7 @@ use crate::config::PageServerConf; use crate::layered_repository::LayeredRepository; -use crate::remote_storage::RemoteTimelineIndex; +use crate::remote_storage::RemoteIndex; use crate::repository::{Repository, Timeline, TimelineSyncStatusUpdate}; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; @@ -66,7 +66,7 @@ fn access_tenants() -> MutexGuard<'static, HashMap> { pub fn load_local_repo( conf: &'static PageServerConf, tenant_id: ZTenantId, - remote_index: &Arc>, + remote_index: &RemoteIndex, ) -> Arc { let mut m = access_tenants(); let tenant = m.entry(tenant_id).or_insert_with(|| { @@ -78,7 +78,7 @@ pub fn load_local_repo( conf, Arc::new(walredo_mgr), tenant_id, - Arc::clone(remote_index), + remote_index.clone(), conf.remote_storage_config.is_some(), )); Tenant { @@ -92,7 +92,7 @@ pub fn load_local_repo( /// Updates tenants' repositories, changing their timelines state in memory. pub fn apply_timeline_sync_status_updates( conf: &'static PageServerConf, - remote_index: Arc>, + remote_index: RemoteIndex, sync_status_updates: HashMap>, ) { if sync_status_updates.is_empty() { @@ -172,7 +172,7 @@ pub fn shutdown_all_tenants() { pub fn create_tenant_repository( conf: &'static PageServerConf, tenantid: ZTenantId, - remote_index: Arc>, + remote_index: RemoteIndex, ) -> Result> { match access_tenants().entry(tenantid) { Entry::Occupied(_) => { diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 8c018ce70f..53c4124701 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -15,13 +15,13 @@ use std::{ use tracing::*; use zenith_utils::lsn::Lsn; -use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +use zenith_utils::zid::{ZTenantId, ZTimelineId}; use zenith_utils::{crashsafe_dir, logging}; use crate::{ config::PageServerConf, layered_repository::metadata::TimelineMetadata, - remote_storage::RemoteTimelineIndex, + remote_storage::RemoteIndex, repository::{LocalTimelineState, Repository}, }; use crate::{import_datadir, LOG_FILE_NAME}; @@ -127,22 +127,6 @@ pub struct TimelineInfo { pub remote: Option, } -pub fn extract_remote_timeline_info( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - remote_index: &RemoteTimelineIndex, -) -> Option { - remote_index - .timeline_entry(&ZTenantTimelineId { - tenant_id, - timeline_id, - }) - .map(|remote_entry| RemoteTimelineInfo { - remote_consistent_lsn: remote_entry.disk_consistent_lsn(), - awaits_download: remote_entry.get_awaits_download(), - }) -} - #[derive(Debug, Clone, Copy)] pub struct PointInTime { pub timeline_id: ZTimelineId, @@ -179,7 +163,7 @@ pub fn init_pageserver( pub enum CreateRepo { Real { wal_redo_manager: Arc, - remote_index: Arc>, + remote_index: RemoteIndex, }, Dummy, } @@ -207,8 +191,7 @@ pub fn create_repo( // anymore, but I think that could still happen. 
let wal_redo_manager = Arc::new(crate::walredo::DummyRedoManager {}); - let remote_index = Arc::new(tokio::sync::RwLock::new(RemoteTimelineIndex::empty())); - (wal_redo_manager as _, remote_index) + (wal_redo_manager as _, RemoteIndex::empty()) } }; From 07342f751902b06b253847065f24ddca735e00b3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 28 Mar 2022 13:03:46 +0300 Subject: [PATCH 0072/1022] Major storage format rewrite. This is a backwards-incompatible change. The new pageserver cannot read repositories created with an old pageserver binary, or vice versa. Simplify Repository to a value-store ------------------------------------ Move the responsibility of tracking relation metadata, like which relations exist and what are their sizes, from Repository to a new module, pgdatadir_mapping.rs. The interface to Repository is now a simple key-value PUT/GET operations. It's still not any old key-value store though. A Repository is still responsible from handling branching, and every GET operation comes with an LSN. Mapping from Postgres data directory to keys/values --------------------------------------------------- All the data is now stored in the key-value store. The 'pgdatadir_mapping.rs' module handles mapping from PostgreSQL objects like relation pages and SLRUs, to key-value pairs. The key to the Repository key-value store is a Key struct, which consists of a few integer fields. It's wide enough to store a full RelFileNode, fork and block number, and to distinguish those from metadata keys. 'pgdatadir_mapping.rs' is also responsible for maintaining a "partitioning" of the keyspace. Partitioning means splitting the keyspace so that each partition holds a roughly equal number of keys. The partitioning is used when new image layer files are created, so that each image layer file is roughly the same size. The partitioning is also responsible for reclaiming space used by deleted keys. The Repository implementation doesn't have any explicit support for deleting keys. Instead, the deleted keys are simply omitted from the partitioning, and when a new image layer is created, the omitted keys are not copied over to the new image layer. We might want to implement tombstone keys in the future, to reclaim space faster, but this will work for now. Changes to low-level layer file code ------------------------------------ The concept of a "segment" is gone. Each layer file can now store an arbitrary range of Keys. Checkpointing, compaction ------------------------- The background tasks are somewhat different now. Whenever checkpoint_distance is reached, the WAL receiver thread "freezes" the current in-memory layer, and creates a new one. This is a quick operation and doesn't perform any I/O yet. It then launches a background "layer flushing thread" to write the frozen layer to disk, as a new L0 delta layer. This mechanism takes care of durability. It replaces the checkpointing thread. Compaction is a new background operation that takes a bunch of L0 delta layers, and reshuffles the data in them. It runs in a separate compaction thread. Deployment ---------- This also contains changes to the ansible scripts that enable having multiple different pageservers running at the same time in the staging environment. We will use that to keep an old version of the pageserver running, for clusters created with the old version, at the same time with a new pageserver with the new binary. 
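A toy model of the GET/PUT-at-LSN semantics described above: this is not the pageserver's `Repository` trait, and the `Key` fields here are illustrative only (the real key packs relation, fork and block identifiers). A PUT records a value at an LSN; a GET at an LSN returns the most recent value at or below it, so every read is against a point in history:

```rust
use std::collections::BTreeMap;

type Lsn = u64;

#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Key {
    // Illustrative fields only.
    rel: u32,
    blknum: u32,
}

#[derive(Default)]
struct ValueStore {
    // For each key, every version of the value keyed by the LSN it was written at.
    versions: BTreeMap<Key, BTreeMap<Lsn, Vec<u8>>>,
}

impl ValueStore {
    fn put(&mut self, key: Key, lsn: Lsn, value: Vec<u8>) {
        self.versions.entry(key).or_default().insert(lsn, value);
    }

    /// Latest value written at or before `lsn`, if any.
    fn get(&self, key: Key, lsn: Lsn) -> Option<&[u8]> {
        self.versions
            .get(&key)?
            .range(..=lsn)
            .next_back()
            .map(|(_, value)| value.as_slice())
    }
}

fn main() {
    let mut store = ValueStore::default();
    let key = Key { rel: 1663, blknum: 0 };
    store.put(key, 0x10, b"v1".to_vec());
    store.put(key, 0x20, b"v2".to_vec());
    assert_eq!(store.get(key, 0x15), Some(&b"v1"[..]));
    assert_eq!(store.get(key, 0x20), Some(&b"v2"[..]));
    assert_eq!(store.get(key, 0x05), None);
}
```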
Author: Heikki Linnakangas Author: Konstantin Knizhnik Author: Andrey Taranik Reviewed-by: Matthias Van De Meent Reviewed-by: Bojan Serafimov Reviewed-by: Konstantin Knizhnik Reviewed-by: Anton Shyrabokau Reviewed-by: Dhammika Pathirana Reviewed-by: Kirill Bulatov Reviewed-by: Anastasia Lubennikova Reviewed-by: Alexey Kondratov --- .circleci/ansible/.gitignore | 2 + .circleci/ansible/deploy.yaml | 71 +- .circleci/ansible/production.hosts | 17 +- .circleci/ansible/scripts/init_pageserver.sh | 30 + .circleci/ansible/staging.hosts | 18 +- .circleci/config.yml | 2 +- Cargo.lock | 1 + docs/glossary.md | 55 +- docs/rfcs/014-storage-lsm.md | 145 ++ docs/settings.md | 8 +- pageserver/Cargo.toml | 1 + pageserver/src/basebackup.rs | 143 +- pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/config.rs | 43 +- pageserver/src/http/routes.rs | 4 + pageserver/src/import_datadir.rs | 210 +- pageserver/src/keyspace.rs | 134 + pageserver/src/layered_repository.rs | 2242 ++++++++--------- pageserver/src/layered_repository/README.md | 188 +- .../src/layered_repository/delta_layer.rs | 615 +++-- pageserver/src/layered_repository/filename.rs | 300 +-- .../layered_repository/global_layer_map.rs | 142 -- .../src/layered_repository/image_layer.rs | 370 ++- .../src/layered_repository/inmemory_layer.rs | 747 ++---- .../src/layered_repository/interval_tree.rs | 468 ---- .../src/layered_repository/layer_map.rs | 711 +++--- pageserver/src/layered_repository/metadata.rs | 183 +- .../src/layered_repository/storage_layer.rs | 183 +- pageserver/src/lib.rs | 24 +- pageserver/src/page_cache.rs | 17 +- pageserver/src/page_service.rs | 122 +- pageserver/src/pgdatadir_mapping.rs | 1350 ++++++++++ pageserver/src/relish.rs | 226 -- pageserver/src/reltag.rs | 105 + pageserver/src/remote_storage/README.md | 2 +- pageserver/src/remote_storage/local_fs.rs | 2 +- pageserver/src/remote_storage/storage_sync.rs | 6 +- .../storage_sync/compression.rs | 2 +- .../src/remote_storage/storage_sync/index.rs | 2 +- pageserver/src/repository.rs | 1042 +++----- pageserver/src/tenant_mgr.rs | 55 +- pageserver/src/tenant_threads.rs | 28 +- pageserver/src/thread_mgr.rs | 9 +- pageserver/src/timelines.rs | 72 +- pageserver/src/walingest.rs | 965 +++++-- pageserver/src/walreceiver.rs | 24 +- pageserver/src/walrecord.rs | 64 +- pageserver/src/walredo.rs | 170 +- postgres_ffi/src/pg_constants.rs | 4 +- test_runner/batch_others/test_snapfiles_gc.py | 130 - test_runner/fixtures/utils.py | 5 +- vendor/postgres | 2 +- 52 files changed, 5878 insertions(+), 5585 deletions(-) create mode 100644 .circleci/ansible/.gitignore create mode 100644 .circleci/ansible/scripts/init_pageserver.sh create mode 100644 docs/rfcs/014-storage-lsm.md create mode 100644 pageserver/src/keyspace.rs delete mode 100644 pageserver/src/layered_repository/global_layer_map.rs delete mode 100644 pageserver/src/layered_repository/interval_tree.rs create mode 100644 pageserver/src/pgdatadir_mapping.rs delete mode 100644 pageserver/src/relish.rs create mode 100644 pageserver/src/reltag.rs delete mode 100644 test_runner/batch_others/test_snapfiles_gc.py diff --git a/.circleci/ansible/.gitignore b/.circleci/ansible/.gitignore new file mode 100644 index 0000000000..14a1c155ae --- /dev/null +++ b/.circleci/ansible/.gitignore @@ -0,0 +1,2 @@ +zenith_install.tar.gz +.zenith_current_version diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index 1f43adf950..020a852a00 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -1,14 +1,11 @@ - name: 
Upload Zenith binaries - hosts: pageservers:safekeepers + hosts: storage gather_facts: False remote_user: admin - vars: - force_deploy: false tasks: - name: get latest version of Zenith binaries - ignore_errors: true register: current_version_file set_fact: current_version: "{{ lookup('file', '.zenith_current_version') | trim }}" @@ -16,48 +13,13 @@ - pageserver - safekeeper - - name: set zero value for current_version - when: current_version_file is failed - set_fact: - current_version: "0" - tags: - - pageserver - - safekeeper - - - name: get deployed version from content of remote file - ignore_errors: true - ansible.builtin.slurp: - src: /usr/local/.zenith_current_version - register: remote_version_file - tags: - - pageserver - - safekeeper - - - name: decode remote file content - when: remote_version_file is succeeded - set_fact: - remote_version: "{{ remote_version_file['content'] | b64decode | trim }}" - tags: - - pageserver - - safekeeper - - - name: set zero value for remote_version - when: remote_version_file is failed - set_fact: - remote_version: "0" - tags: - - pageserver - - safekeeper - - name: inform about versions - debug: msg="Version to deploy - {{ current_version }}, version on storage node - {{ remote_version }}" + debug: msg="Version to deploy - {{ current_version }}" tags: - pageserver - safekeeper - - name: upload and extract Zenith binaries to /usr/local - when: current_version > remote_version or force_deploy ansible.builtin.unarchive: owner: root group: root @@ -74,14 +36,24 @@ hosts: pageservers gather_facts: False remote_user: admin - vars: - force_deploy: false tasks: + + - name: upload init script + when: console_mgmt_base_url is defined + ansible.builtin.template: + src: scripts/init_pageserver.sh + dest: /tmp/init_pageserver.sh + owner: root + group: root + mode: '0755' + become: true + tags: + - pageserver + - name: init pageserver - when: current_version > remote_version or force_deploy shell: - cmd: sudo -u pageserver /usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data + cmd: /tmp/init_pageserver.sh args: creates: "/storage/pageserver/data/tenants" environment: @@ -107,7 +79,6 @@ # - pageserver - name: upload systemd service definition - when: current_version > remote_version or force_deploy ansible.builtin.template: src: systemd/pageserver.service dest: /etc/systemd/system/pageserver.service @@ -119,7 +90,6 @@ - pageserver - name: start systemd service - when: current_version > remote_version or force_deploy ansible.builtin.systemd: daemon_reload: yes name: pageserver @@ -130,7 +100,7 @@ - pageserver - name: post version to console - when: (current_version > remote_version or force_deploy) and console_mgmt_base_url is defined + when: console_mgmt_base_url is defined shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) @@ -142,22 +112,18 @@ hosts: safekeepers gather_facts: False remote_user: admin - vars: - force_deploy: false tasks: # in the future safekeepers should discover pageservers byself # but currently use first pageserver that was discovered - name: set first pageserver var for safekeepers - when: current_version > remote_version or force_deploy set_fact: first_pageserver: "{{ hostvars[groups['pageservers'][0]]['inventory_hostname'] }}" tags: - safekeeper - name: upload systemd service definition - when: current_version > remote_version or force_deploy ansible.builtin.template: src: systemd/safekeeper.service dest: /etc/systemd/system/safekeeper.service @@ -169,7 
+135,6 @@ - safekeeper - name: start systemd service - when: current_version > remote_version or force_deploy ansible.builtin.systemd: daemon_reload: yes name: safekeeper @@ -180,7 +145,7 @@ - safekeeper - name: post version to console - when: (current_version > remote_version or force_deploy) and console_mgmt_base_url is defined + when: console_mgmt_base_url is defined shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) diff --git a/.circleci/ansible/production.hosts b/.circleci/ansible/production.hosts index 3a0543f39a..13224b7cf5 100644 --- a/.circleci/ansible/production.hosts +++ b/.circleci/ansible/production.hosts @@ -1,7 +1,16 @@ [pageservers] -zenith-1-ps-1 bucket_name=zenith-storage-oregon bucket_region=us-west-2 +zenith-1-ps-1 console_region_id=1 [safekeepers] -zenith-1-sk-1 -zenith-1-sk-2 -zenith-1-sk-3 +zenith-1-sk-1 console_region_id=1 +zenith-1-sk-2 console_region_id=1 +zenith-1-sk-3 console_region_id=1 + +[storage:children] +pageservers +safekeepers + +[storage:vars] +console_mgmt_base_url = http://console-release.local +bucket_name = zenith-storage-oregon +bucket_region = us-west-2 diff --git a/.circleci/ansible/scripts/init_pageserver.sh b/.circleci/ansible/scripts/init_pageserver.sh new file mode 100644 index 0000000000..1cbdd0db94 --- /dev/null +++ b/.circleci/ansible/scripts/init_pageserver.sh @@ -0,0 +1,30 @@ +#!/bin/sh + +# get instance id from meta-data service +INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) + +# store fqdn hostname in var +HOST=$(hostname -f) + + +cat < Page ID + + ++---+ +| | Layer file ++---+ +``` + + +# Memtable + +When new WAL arrives, it is first put into the Memtable. Despite the +name, the Memtable is not a purely in-memory data structure. It can +spill to a temporary file on disk if the system is low on memory, and +is accessed through a buffer cache. + +If the page server crashes, the Memtable is lost. It is rebuilt by +processing again the WAL that's newer than the latest layer in L0. + +The size of the Memtable is configured by the "checkpoint distance" +setting. Because anything that hasn't been flushed to disk and +uploaded to S3 yet needs to be kept in the safekeeper, the "checkpoint +distance" also determines the amount of WAL that needs to kept in the +safekeeper. + +# L0 + +When the Memtable fills up, it is written out to a new file in L0. The +files are immutable; when a file is created, it is never +modified. Each file in L0 is roughly 1 GB in size (*). Like the +Memtable, each file in L0 covers the whole key range. + +When enough files have been accumulated in L0, compaction +starts. Compaction processes all the files in L0 and reshuffles the +data to create a new set of files in L1. + + +(*) except in corner cases like if we want to shut down the page +server and want to flush out the memtable to disk even though it's not +full yet. + + +# L1 + +L1 consists of ~ 1 GB files like L0. But each file covers only part of +the overall key space, and a larger range of LSNs. This speeds up +searches. When you're looking for a given page, you need to check all +the files in L0, to see if they contain a page version for the requested +page. But in L1, you only need to check the files whose key range covers +the requested page. This is particularly important at cold start, when +checking a file means downloading it from S3. + +Partitioning by key range also helps with garbage collection. 
If only a +part of the database is updated, we will accumulate more files for +the hot part in L1, and old files can be removed without affecting the +cold part. + + +# Image layers + +So far, we've only talked about delta layers. In addition to the delta +layers, we create image layers, when "enough" WAL has been accumulated +for some part of the database. Each image layer covers a 1 GB range of +key space. It contains images of the pages at a single LSN, a snapshot +if you will. + +The exact heuristic for what "enough" means is not clear yet. Maybe +create a new image layer when 10 GB of WAL has been accumulated for a +1 GB segment. + +The image layers limit the number of layers that a search needs to +check. That put a cap on read latency, and it also allows garbage +collecting layers that are older than the GC horizon. + + +# Partitioning scheme + +When compaction happens and creates a new set of files in L1, how do +we partition the data into the files? + +- Goal is that each file is ~ 1 GB in size +- Try to match partition boundaries at relation boundaries. (See [1] + for how PebblesDB does this, and for why that's important) +- Greedy algorithm + +# Additional Reading + +[1] Paper on PebblesDB and how it does partitioning. +https://www.cs.utexas.edu/~rak/papers/sosp17-pebblesdb.pdf diff --git a/docs/settings.md b/docs/settings.md index 571cfba8df..69aadc602f 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -68,11 +68,11 @@ S3. The unit is # of bytes. -#### checkpoint_period +#### compaction_period -The pageserver checks whether `checkpoint_distance` has been reached -every `checkpoint_period` seconds. Default is 1 s, which should be -fine. +Every `compaction_period` seconds, the page server checks if +maintenance operations, like compaction, are needed on the layer +files. Default is 1 s, which should be fine. #### gc_horizon diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 46e6e2a8f1..de22d0dd77 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -12,6 +12,7 @@ bytes = { version = "1.0.1", features = ['serde'] } byteorder = "1.4.3" futures = "0.3.13" hyper = "0.14" +itertools = "0.10.3" lazy_static = "1.4.0" log = "0.4.14" clap = "3.0" diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 5711f1807d..e2a56f17d6 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -20,8 +20,9 @@ use std::sync::Arc; use std::time::SystemTime; use tar::{Builder, EntryType, Header}; -use crate::relish::*; +use crate::reltag::SlruKind; use crate::repository::Timeline; +use crate::DatadirTimelineImpl; use postgres_ffi::xlog_utils::*; use postgres_ffi::*; use zenith_utils::lsn::Lsn; @@ -31,7 +32,7 @@ use zenith_utils::lsn::Lsn; /// used for constructing tarball. pub struct Basebackup<'a> { ar: Builder<&'a mut dyn Write>, - timeline: &'a Arc, + timeline: &'a Arc, pub lsn: Lsn, prev_record_lsn: Lsn, } @@ -46,7 +47,7 @@ pub struct Basebackup<'a> { impl<'a> Basebackup<'a> { pub fn new( write: &'a mut dyn Write, - timeline: &'a Arc, + timeline: &'a Arc, req_lsn: Option, ) -> Result> { // Compute postgres doesn't have any previous WAL files, but the first @@ -64,13 +65,13 @@ impl<'a> Basebackup<'a> { // prev_lsn to Lsn(0) if we cannot provide the correct value. let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { // Backup was requested at a particular LSN. Wait for it to arrive. 
- timeline.wait_lsn(req_lsn)?; + timeline.tline.wait_lsn(req_lsn)?; // If the requested point is the end of the timeline, we can // provide prev_lsn. (get_last_record_rlsn() might return it as // zero, though, if no WAL has been generated on this timeline // yet.) - let end_of_timeline = timeline.get_last_record_rlsn(); + let end_of_timeline = timeline.tline.get_last_record_rlsn(); if req_lsn == end_of_timeline.last { (end_of_timeline.prev, req_lsn) } else { @@ -78,7 +79,7 @@ impl<'a> Basebackup<'a> { } } else { // Backup was requested at end of the timeline. - let end_of_timeline = timeline.get_last_record_rlsn(); + let end_of_timeline = timeline.tline.get_last_record_rlsn(); (end_of_timeline.prev, end_of_timeline.last) }; @@ -115,21 +116,24 @@ impl<'a> Basebackup<'a> { } // Gather non-relational files from object storage pages. - for obj in self.timeline.list_nonrels(self.lsn)? { - match obj { - RelishTag::Slru { slru, segno } => { - self.add_slru_segment(slru, segno)?; - } - RelishTag::FileNodeMap { spcnode, dbnode } => { - self.add_relmap_file(spcnode, dbnode)?; - } - RelishTag::TwoPhase { xid } => { - self.add_twophase_file(xid)?; - } - _ => {} + for kind in [ + SlruKind::Clog, + SlruKind::MultiXactOffsets, + SlruKind::MultiXactMembers, + ] { + for segno in self.timeline.list_slru_segments(kind, self.lsn)? { + self.add_slru_segment(kind, segno)?; } } + // Create tablespace directories + for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? { + self.add_dbdir(spcnode, dbnode, has_relmap_file)?; + } + for xid in self.timeline.list_twophase_files(self.lsn)? { + self.add_twophase_file(xid)?; + } + // Generate pg_control and bootstrap WAL segment. self.add_pgcontrol_file()?; self.ar.finish()?; @@ -141,28 +145,14 @@ impl<'a> Basebackup<'a> { // Generate SLRU segment files from repository. // fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { - let seg_size = self - .timeline - .get_relish_size(RelishTag::Slru { slru, segno }, self.lsn)?; - - let nblocks = match seg_size { - Some(seg_size) => seg_size, - None => { - trace!( - "SLRU segment {}/{:>04X} was truncated", - slru.to_str(), - segno - ); - return Ok(()); - } - }; + let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize); for blknum in 0..nblocks { - let img = - self.timeline - .get_page_at_lsn(RelishTag::Slru { slru, segno }, blknum, self.lsn)?; + let img = self + .timeline + .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?; ensure!(img.len() == pg_constants::BLCKSZ as usize); slru_buf.extend_from_slice(&img); @@ -177,16 +167,26 @@ impl<'a> Basebackup<'a> { } // - // Extract pg_filenode.map files from repository - // Along with them also send PG_VERSION for each database. + // Include database/tablespace directories. // - fn add_relmap_file(&mut self, spcnode: u32, dbnode: u32) -> anyhow::Result<()> { - let img = self.timeline.get_page_at_lsn( - RelishTag::FileNodeMap { spcnode, dbnode }, - 0, - self.lsn, - )?; - let path = if spcnode == pg_constants::GLOBALTABLESPACE_OID { + // Each directory contains a PG_VERSION file, and the default database + // directories also contain pg_filenode.map files. 
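+    // Concretely: PG_VERSION, global/PG_VERSION and global/pg_filenode.map for
+    // the global tablespace, and base/<dboid>/PG_VERSION plus
+    // base/<dboid>/pg_filenode.map for databases in the default tablespace;
+    // pg_filenode.map entries are emitted only when the relmap file is present.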
+ // + fn add_dbdir( + &mut self, + spcnode: u32, + dbnode: u32, + has_relmap_file: bool, + ) -> anyhow::Result<()> { + let relmap_img = if has_relmap_file { + let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?; + ensure!(img.len() == 512); + Some(img) + } else { + None + }; + + if spcnode == pg_constants::GLOBALTABLESPACE_OID { let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes(); let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?; self.ar.append(&header, version_bytes)?; @@ -194,8 +194,32 @@ impl<'a> Basebackup<'a> { let header = new_tar_header("global/PG_VERSION", version_bytes.len() as u64)?; self.ar.append(&header, version_bytes)?; - String::from("global/pg_filenode.map") // filenode map for global tablespace + if let Some(img) = relmap_img { + // filenode map for global tablespace + let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?; + self.ar.append(&header, &img[..])?; + } else { + warn!("global/pg_filenode.map is missing"); + } } else { + // User defined tablespaces are not supported. However, as + // a special case, if a tablespace/db directory is + // completely empty, we can leave it out altogether. This + // makes taking a base backup after the 'tablespace' + // regression test pass, because the test drops the + // created tablespaces after the tests. + // + // FIXME: this wouldn't be necessary, if we handled + // XLOG_TBLSPC_DROP records. But we probably should just + // throw an error on CREATE TABLESPACE in the first place. + if !has_relmap_file + && self + .timeline + .list_rels(spcnode, dbnode, self.lsn)? + .is_empty() + { + return Ok(()); + } // User defined tablespaces are not supported ensure!(spcnode == pg_constants::DEFAULTTABLESPACE_OID); @@ -204,16 +228,17 @@ impl<'a> Basebackup<'a> { let header = new_tar_header_dir(&path)?; self.ar.append(&header, &mut io::empty())?; - let dst_path = format!("base/{}/PG_VERSION", dbnode); - let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes(); - let header = new_tar_header(&dst_path, version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; + if let Some(img) = relmap_img { + let dst_path = format!("base/{}/PG_VERSION", dbnode); + let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes(); + let header = new_tar_header(&dst_path, version_bytes.len() as u64)?; + self.ar.append(&header, version_bytes)?; - format!("base/{}/pg_filenode.map", dbnode) + let relmap_path = format!("base/{}/pg_filenode.map", dbnode); + let header = new_tar_header(&relmap_path, img.len() as u64)?; + self.ar.append(&header, &img[..])?; + } }; - ensure!(img.len() == 512); - let header = new_tar_header(&path, img.len() as u64)?; - self.ar.append(&header, &img[..])?; Ok(()) } @@ -221,9 +246,7 @@ impl<'a> Basebackup<'a> { // Extract twophase state files // fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { - let img = self - .timeline - .get_page_at_lsn(RelishTag::TwoPhase { xid }, 0, self.lsn)?; + let img = self.timeline.get_twophase_file(xid, self.lsn)?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -243,11 +266,11 @@ impl<'a> Basebackup<'a> { fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { let checkpoint_bytes = self .timeline - .get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn) + .get_checkpoint(self.lsn) .context("failed to get checkpoint bytes")?; let pg_control_bytes = self .timeline - .get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn) + .get_control_file(self.lsn) .context("failed get control bytes")?; 
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?; let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?; @@ -268,7 +291,7 @@ impl<'a> Basebackup<'a> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { - if self.lsn == self.timeline.get_ancestor_lsn() { + if self.lsn == self.timeline.tline.get_ancestor_lsn() { write!(zenith_signal, "PREV LSN: none")?; } else { write!(zenith_signal, "PREV LSN: invalid")?; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index e217806147..0af96cff66 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -20,7 +20,7 @@ use pageserver::{ config::{defaults::*, PageServerConf}, http, page_cache, page_service, remote_storage::{self, SyncStartupData}, - repository::TimelineSyncStatusUpdate, + repository::{Repository, TimelineSyncStatusUpdate}, tenant_mgr, thread_mgr, thread_mgr::ThreadKind, timelines, virtual_file, LOG_FILE_NAME, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index dc85c83c17..0fdfb4ceed 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -31,7 +31,8 @@ pub mod defaults { // would be more appropriate. But a low value forces the code to be exercised more, // which is good for now to trigger bugs. pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; - pub const DEFAULT_CHECKPOINT_PERIOD: &str = "1 s"; + + pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s"; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; pub const DEFAULT_GC_PERIOD: &str = "100 s"; @@ -57,7 +58,7 @@ pub mod defaults { #listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}' #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes -#checkpoint_period = '{DEFAULT_CHECKPOINT_PERIOD}' +#compaction_period = '{DEFAULT_COMPACTION_PERIOD}' #gc_period = '{DEFAULT_GC_PERIOD}' #gc_horizon = {DEFAULT_GC_HORIZON} @@ -91,7 +92,9 @@ pub struct PageServerConf { // This puts a backstop on how much WAL needs to be re-digested if the // page server crashes. pub checkpoint_distance: u64, - pub checkpoint_period: Duration, + + // How often to check if there's compaction work to be done. 
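+    // The value is given in humantime format, e.g. `compaction_period = '20 s'`
+    // in pageserver.toml (hypothetical value; the built-in default is '1 s').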
+ pub compaction_period: Duration, pub gc_horizon: u64, pub gc_period: Duration, @@ -145,7 +148,8 @@ struct PageServerConfigBuilder { listen_http_addr: BuilderValue, checkpoint_distance: BuilderValue, - checkpoint_period: BuilderValue, + + compaction_period: BuilderValue, gc_horizon: BuilderValue, gc_period: BuilderValue, @@ -179,8 +183,8 @@ impl Default for PageServerConfigBuilder { listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()), listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()), checkpoint_distance: Set(DEFAULT_CHECKPOINT_DISTANCE), - checkpoint_period: Set(humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD) - .expect("cannot parse default checkpoint period")), + compaction_period: Set(humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) + .expect("cannot parse default compaction period")), gc_horizon: Set(DEFAULT_GC_HORIZON), gc_period: Set(humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period")), @@ -216,8 +220,8 @@ impl PageServerConfigBuilder { self.checkpoint_distance = BuilderValue::Set(checkpoint_distance) } - pub fn checkpoint_period(&mut self, checkpoint_period: Duration) { - self.checkpoint_period = BuilderValue::Set(checkpoint_period) + pub fn compaction_period(&mut self, compaction_period: Duration) { + self.compaction_period = BuilderValue::Set(compaction_period) } pub fn gc_horizon(&mut self, gc_horizon: u64) { @@ -286,9 +290,9 @@ impl PageServerConfigBuilder { checkpoint_distance: self .checkpoint_distance .ok_or(anyhow::anyhow!("missing checkpoint_distance"))?, - checkpoint_period: self - .checkpoint_period - .ok_or(anyhow::anyhow!("missing checkpoint_period"))?, + compaction_period: self + .compaction_period + .ok_or(anyhow::anyhow!("missing compaction_period"))?, gc_horizon: self .gc_horizon .ok_or(anyhow::anyhow!("missing gc_horizon"))?, @@ -337,10 +341,10 @@ pub struct RemoteStorageConfig { #[derive(Debug, Clone, PartialEq, Eq)] pub enum RemoteStorageKind { /// Storage based on local file system. - /// Specify a root folder to place all stored relish data into. + /// Specify a root folder to place all stored files into. LocalFs(PathBuf), - /// AWS S3 based storage, storing all relishes into the root - /// of the S3 bucket from the config. 
+ /// AWS S3 based storage, storing all files in the S3 bucket + /// specified by the config AwsS3(S3Config), } @@ -425,7 +429,7 @@ impl PageServerConf { "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?), "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?), "checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?), - "checkpoint_period" => builder.checkpoint_period(parse_toml_duration(key, item)?), + "compaction_period" => builder.compaction_period(parse_toml_duration(key, item)?), "gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?), "gc_period" => builder.gc_period(parse_toml_duration(key, item)?), "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?), @@ -561,7 +565,7 @@ impl PageServerConf { PageServerConf { id: ZNodeId(0), checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_period: Duration::from_secs(10), + compaction_period: Duration::from_secs(10), gc_horizon: defaults::DEFAULT_GC_HORIZON, gc_period: Duration::from_secs(10), wait_lsn_timeout: Duration::from_secs(60), @@ -631,7 +635,8 @@ listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' checkpoint_distance = 111 # in bytes -checkpoint_period = '111 s' + +compaction_period = '111 s' gc_period = '222 s' gc_horizon = 222 @@ -668,7 +673,7 @@ id = 10 listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_period: humantime::parse_duration(defaults::DEFAULT_CHECKPOINT_PERIOD)?, + compaction_period: humantime::parse_duration(defaults::DEFAULT_COMPACTION_PERIOD)?, gc_horizon: defaults::DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?, wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, @@ -712,7 +717,7 @@ id = 10 listen_pg_addr: "127.0.0.1:64000".to_string(), listen_http_addr: "127.0.0.1:9898".to_string(), checkpoint_distance: 111, - checkpoint_period: Duration::from_secs(111), + compaction_period: Duration::from_secs(111), gc_horizon: 222, gc_period: Duration::from_secs(222), wait_lsn_timeout: Duration::from_secs(111), diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 13e79f8f55..82e818a47b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -22,6 +22,7 @@ use super::models::{ StatusResponse, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest, }; use crate::remote_storage::{schedule_timeline_download, RemoteIndex}; +use crate::repository::Repository; use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId}; @@ -162,8 +163,11 @@ async fn timeline_detail_handler(request: Request) -> Result( path: &Path, - writer: &dyn TimelineWriter, + tline: &mut DatadirTimeline, lsn: Lsn, ) -> Result<()> { let mut pg_control: Option = None; + let mut modification = tline.begin_modification(lsn); + modification.init_empty()?; + // Scan 'global' + let mut relfiles: Vec = Vec::new(); for direntry in fs::read_dir(path.join("global"))? 
{ let direntry = direntry?; match direntry.file_name().to_str() { None => continue, Some("pg_control") => { - pg_control = Some(import_control_file(writer, lsn, &direntry.path())?); + pg_control = Some(import_control_file(&mut modification, &direntry.path())?); + } + Some("pg_filenode.map") => { + import_relmap_file( + &mut modification, + pg_constants::GLOBALTABLESPACE_OID, + 0, + &direntry.path(), + )?; } - Some("pg_filenode.map") => import_nonrel_file( - writer, - lsn, - RelishTag::FileNodeMap { - spcnode: pg_constants::GLOBALTABLESPACE_OID, - dbnode: 0, - }, - &direntry.path(), - )?, - // Load any relation files into the page server - _ => import_relfile( - &direntry.path(), - writer, - lsn, - pg_constants::GLOBALTABLESPACE_OID, - 0, - )?, + // Load any relation files into the page server (but only after the other files) + _ => relfiles.push(direntry.path()), } } + for relfile in relfiles { + import_relfile( + &mut modification, + &relfile, + pg_constants::GLOBALTABLESPACE_OID, + 0, + )?; + } // Scan 'base'. It contains database dirs, the database OID is the filename. // E.g. 'base/12345', where 12345 is the database OID. @@ -76,54 +82,56 @@ pub fn import_timeline_from_postgres_datadir( let dboid = direntry.file_name().to_string_lossy().parse::()?; + let mut relfiles: Vec = Vec::new(); for direntry in fs::read_dir(direntry.path())? { let direntry = direntry?; match direntry.file_name().to_str() { None => continue, - Some("PG_VERSION") => continue, - Some("pg_filenode.map") => import_nonrel_file( - writer, - lsn, - RelishTag::FileNodeMap { - spcnode: pg_constants::DEFAULTTABLESPACE_OID, - dbnode: dboid, - }, + Some("PG_VERSION") => { + //modification.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?; + } + Some("pg_filenode.map") => import_relmap_file( + &mut modification, + pg_constants::DEFAULTTABLESPACE_OID, + dboid, &direntry.path(), )?, // Load any relation files into the page server - _ => import_relfile( - &direntry.path(), - writer, - lsn, - pg_constants::DEFAULTTABLESPACE_OID, - dboid, - )?, + _ => relfiles.push(direntry.path()), } } + for relfile in relfiles { + import_relfile( + &mut modification, + &relfile, + pg_constants::DEFAULTTABLESPACE_OID, + dboid, + )?; + } } for entry in fs::read_dir(path.join("pg_xact"))? { let entry = entry?; - import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?; + import_slru_file(&mut modification, SlruKind::Clog, &entry.path())?; } for entry in fs::read_dir(path.join("pg_multixact").join("members"))? { let entry = entry?; - import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?; + import_slru_file(&mut modification, SlruKind::MultiXactMembers, &entry.path())?; } for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? { let entry = entry?; - import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?; + import_slru_file(&mut modification, SlruKind::MultiXactOffsets, &entry.path())?; } for entry in fs::read_dir(path.join("pg_twophase"))? { let entry = entry?; let xid = u32::from_str_radix(&entry.path().to_string_lossy(), 16)?; - import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?; + import_twophase_file(&mut modification, xid, &entry.path())?; } // TODO: Scan pg_tblspc // We're done importing all the data files. - writer.advance_last_record_lsn(lsn); + modification.commit()?; // We expect the Postgres server to be shut down cleanly. 
let pg_control = pg_control.context("pg_control file not found")?; @@ -141,7 +149,7 @@ pub fn import_timeline_from_postgres_datadir( // *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'. import_wal( &path.join("pg_wal"), - writer, + tline, Lsn(pg_control.checkPointCopy.redo), lsn, )?; @@ -150,10 +158,9 @@ pub fn import_timeline_from_postgres_datadir( } // subroutine of import_timeline_from_postgres_datadir(), to load one relation file. -fn import_relfile( +fn import_relfile( + modification: &mut DatadirModification, path: &Path, - timeline: &dyn TimelineWriter, - lsn: Lsn, spcoid: Oid, dboid: Oid, ) -> anyhow::Result<()> { @@ -169,26 +176,35 @@ fn import_relfile( let mut file = File::open(path)?; let mut buf: [u8; 8192] = [0u8; 8192]; + let len = file.metadata().unwrap().len(); + ensure!(len % pg_constants::BLCKSZ as u64 == 0); + let nblocks = len / pg_constants::BLCKSZ as u64; + + if segno != 0 { + todo!(); + } + + let rel = RelTag { + spcnode: spcoid, + dbnode: dboid, + relnode, + forknum, + }; + modification.put_rel_creation(rel, nblocks as u32)?; + let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32); loop { let r = file.read_exact(&mut buf); match r { Ok(_) => { - let rel = RelTag { - spcnode: spcoid, - dbnode: dboid, - relnode, - forknum, - }; - let tag = RelishTag::Relation(rel); - timeline.put_page_image(tag, blknum, lsn, Bytes::copy_from_slice(&buf))?; + modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; } // TODO: UnexpectedEof is expected Err(err) => match err.kind() { std::io::ErrorKind::UnexpectedEof => { // reached EOF. That's expected. - // FIXME: maybe check that we read the full length of the file? + ensure!(blknum == nblocks as u32, "unexpected EOF"); break; } _ => { @@ -202,16 +218,28 @@ fn import_relfile( Ok(()) } -/// -/// Import a "non-blocky" file into the repository -/// -/// This is used for small files like the control file, twophase files etc. that -/// are just slurped into the repository as one blob. -/// -fn import_nonrel_file( - timeline: &dyn TimelineWriter, - lsn: Lsn, - tag: RelishTag, +/// Import a relmapper (pg_filenode.map) file into the repository +fn import_relmap_file( + modification: &mut DatadirModification, + spcnode: Oid, + dbnode: Oid, + path: &Path, +) -> Result<()> { + let mut file = File::open(path)?; + let mut buffer = Vec::new(); + // read the whole file + file.read_to_end(&mut buffer)?; + + trace!("importing relmap file {}", path.display()); + + modification.put_relmap_file(spcnode, dbnode, Bytes::copy_from_slice(&buffer[..]))?; + Ok(()) +} + +/// Import a twophase state file (pg_twophase/) into the repository +fn import_twophase_file( + modification: &mut DatadirModification, + xid: TransactionId, path: &Path, ) -> Result<()> { let mut file = File::open(path)?; @@ -221,7 +249,7 @@ fn import_nonrel_file( trace!("importing non-rel file {}", path.display()); - timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?; + modification.put_twophase_file(xid, Bytes::copy_from_slice(&buffer[..]))?; Ok(()) } @@ -230,9 +258,8 @@ fn import_nonrel_file( /// /// The control file is imported as is, but we also extract the checkpoint record /// from it and store it separated. 
-fn import_control_file( - timeline: &dyn TimelineWriter, - lsn: Lsn, +fn import_control_file( + modification: &mut DatadirModification, path: &Path, ) -> Result { let mut file = File::open(path)?; @@ -243,17 +270,12 @@ fn import_control_file( trace!("importing control file {}", path.display()); // Import it as ControlFile - timeline.put_page_image( - RelishTag::ControlFile, - 0, - lsn, - Bytes::copy_from_slice(&buffer[..]), - )?; + modification.put_control_file(Bytes::copy_from_slice(&buffer[..]))?; // Extract the checkpoint record and import it separately. let pg_control = ControlFileData::decode(&buffer)?; let checkpoint_bytes = pg_control.checkPointCopy.encode(); - timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes)?; + modification.put_checkpoint(checkpoint_bytes)?; Ok(pg_control) } @@ -261,28 +283,34 @@ fn import_control_file( /// /// Import an SLRU segment file /// -fn import_slru_file( - timeline: &dyn TimelineWriter, - lsn: Lsn, +fn import_slru_file( + modification: &mut DatadirModification, slru: SlruKind, path: &Path, ) -> Result<()> { - // Does it look like an SLRU file? + trace!("importing slru file {}", path.display()); + let mut file = File::open(path)?; let mut buf: [u8; 8192] = [0u8; 8192]; let segno = u32::from_str_radix(&path.file_name().unwrap().to_string_lossy(), 16)?; - trace!("importing slru file {}", path.display()); + let len = file.metadata().unwrap().len(); + ensure!(len % pg_constants::BLCKSZ as u64 == 0); // we assume SLRU block size is the same as BLCKSZ + let nblocks = len / pg_constants::BLCKSZ as u64; + + ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as u64); + + modification.put_slru_segment_creation(slru, segno, nblocks as u32)?; let mut rpageno = 0; loop { let r = file.read_exact(&mut buf); match r { Ok(_) => { - timeline.put_page_image( - RelishTag::Slru { slru, segno }, + modification.put_slru_page_image( + slru, + segno, rpageno, - lsn, Bytes::copy_from_slice(&buf), )?; } @@ -291,7 +319,7 @@ fn import_slru_file( Err(err) => match err.kind() { std::io::ErrorKind::UnexpectedEof => { // reached EOF. That's expected. - // FIXME: maybe check that we read the full length of the file? + ensure!(rpageno == nblocks as u32, "unexpected EOF"); break; } _ => { @@ -300,8 +328,6 @@ fn import_slru_file( }, }; rpageno += 1; - - // TODO: Check that the file isn't unexpectedly large, not larger than SLRU_PAGES_PER_SEGMENT pages } Ok(()) @@ -309,9 +335,9 @@ fn import_slru_file( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. -fn import_wal( +fn import_wal( walpath: &Path, - writer: &dyn TimelineWriter, + tline: &mut DatadirTimeline, startpoint: Lsn, endpoint: Lsn, ) -> Result<()> { @@ -321,7 +347,7 @@ fn import_wal( let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; - let mut walingest = WalIngest::new(writer.deref(), startpoint)?; + let mut walingest = WalIngest::new(tline, startpoint)?; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now @@ -354,7 +380,7 @@ fn import_wal( let mut nrecords = 0; while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ - walingest.ingest_record(writer, recdata, lsn)?; + walingest.ingest_record(tline, recdata, lsn)?; last_lsn = lsn; nrecords += 1; diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs new file mode 100644 index 0000000000..9973568b07 --- /dev/null +++ b/pageserver/src/keyspace.rs @@ -0,0 +1,134 @@ +use crate::repository::{key_range_size, singleton_range, Key}; +use postgres_ffi::pg_constants; +use std::ops::Range; + +// Target file size, when creating image and delta layers +pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024; // 128 MB + +/// +/// Represents a set of Keys, in a compact form. +/// +#[derive(Clone, Debug)] +pub struct KeySpace { + /// Contiguous ranges of keys that belong to the key space. In key order, + /// and with no overlap. + pub ranges: Vec>, +} + +impl KeySpace { + /// + /// Partition a key space into roughly chunks of roughly 'target_size' bytes + /// in each patition. + /// + pub fn partition(&self, target_size: u64) -> KeyPartitioning { + // Assume that each value is 8k in size. + let target_nblocks = (target_size / pg_constants::BLCKSZ as u64) as usize; + + let mut parts = Vec::new(); + let mut current_part = Vec::new(); + let mut current_part_size: usize = 0; + for range in &self.ranges { + // If appending the next contiguous range in the keyspace to the current + // partition would cause it to be too large, start a new partition. + let this_size = key_range_size(range) as usize; + if current_part_size + this_size > target_nblocks && !current_part.is_empty() { + parts.push(KeySpace { + ranges: current_part, + }); + current_part = Vec::new(); + current_part_size = 0; + } + + // If the next range is larger than 'target_size', split it into + // 'target_size' chunks. + let mut remain_size = this_size; + let mut start = range.start; + while remain_size > target_nblocks { + let next = start.add(target_nblocks as u32); + parts.push(KeySpace { + ranges: vec![start..next], + }); + start = next; + remain_size -= target_nblocks + } + current_part.push(start..range.end); + current_part_size += remain_size; + } + + // add last partition that wasn't full yet. + if !current_part.is_empty() { + parts.push(KeySpace { + ranges: current_part, + }); + } + + KeyPartitioning { parts } + } +} + +/// +/// Represents a partitioning of the key space. +/// +/// The only kind of partitioning we do is to partition the key space into +/// partitions that are roughly equal in physical size (see KeySpace::partition). +/// But this data structure could represent any partitioning. +/// +#[derive(Clone, Debug, Default)] +pub struct KeyPartitioning { + pub parts: Vec, +} + +impl KeyPartitioning { + pub fn new() -> Self { + KeyPartitioning { parts: Vec::new() } + } +} + +/// +/// A helper object, to collect a set of keys and key ranges into a KeySpace +/// object. This takes care of merging adjacent keys and key ranges into +/// contiguous ranges. 
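+///
+/// For example (an illustrative sketch, not a doctest shipped in this patch;
+/// keys and ranges must be added in ascending key order):
+/// ```ignore
+/// let mut accum = KeySpaceAccum::new();
+/// accum.add_key(some_key);              // a single key becomes a singleton range
+/// accum.add_range(next_key..later_key); // adjacent ranges are merged into one
+/// let keyspace = accum.to_keyspace();
+/// ```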
+/// +#[derive(Clone, Debug, Default)] +pub struct KeySpaceAccum { + accum: Option>, + + ranges: Vec>, +} + +impl KeySpaceAccum { + pub fn new() -> Self { + Self { + accum: None, + ranges: Vec::new(), + } + } + + pub fn add_key(&mut self, key: Key) { + self.add_range(singleton_range(key)) + } + + pub fn add_range(&mut self, range: Range) { + match self.accum.as_mut() { + Some(accum) => { + if range.start == accum.end { + accum.end = range.end; + } else { + assert!(range.start > accum.end); + self.ranges.push(accum.clone()); + *accum = range; + } + } + None => self.accum = Some(range), + } + } + + pub fn to_keyspace(mut self) -> KeySpace { + if let Some(accum) = self.accum.take() { + self.ranges.push(accum); + } + KeySpace { + ranges: self.ranges, + } + } +} diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index bf5f52b18d..837298a10e 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -14,32 +14,33 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use bookfile::Book; use bytes::Bytes; +use fail::fail_point; +use itertools::Itertools; use lazy_static::lazy_static; -use postgres_ffi::pg_constants::BLCKSZ; use tracing::*; -use std::cmp; +use std::cmp::{max, min, Ordering}; use std::collections::hash_map::Entry; +use std::collections::BTreeSet; use std::collections::HashMap; -use std::collections::{BTreeSet, HashSet}; use std::fs; use std::fs::{File, OpenOptions}; use std::io::Write; -use std::ops::{Bound::Included, Deref}; +use std::ops::{Bound::Included, Deref, Range}; use std::path::{Path, PathBuf}; -use std::sync::atomic::{self, AtomicBool, AtomicUsize}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard}; +use std::sync::atomic::{self, AtomicBool}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError}; use std::time::Instant; use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; use crate::config::PageServerConf; +use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::page_cache; -use crate::relish::*; use crate::remote_storage::{schedule_timeline_checkpoint_upload, RemoteIndex}; use crate::repository::{ - BlockNumber, GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, - TimelineWriter, ZenithWalRecord, + GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, TimelineWriter, }; +use crate::repository::{Key, Value}; use crate::thread_mgr; use crate::virtual_file::VirtualFile; use crate::walreceiver::IS_WAL_RECEIVER; @@ -48,7 +49,6 @@ use crate::CheckpointConfig; use crate::{ZTenantId, ZTimelineId}; use zenith_metrics::{register_histogram_vec, Histogram, HistogramVec}; -use zenith_metrics::{register_int_gauge_vec, IntGauge, IntGaugeVec}; use zenith_utils::crashsafe_dir; use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; use zenith_utils::seqwait::SeqWait; @@ -56,30 +56,25 @@ use zenith_utils::seqwait::SeqWait; mod delta_layer; pub(crate) mod ephemeral_file; mod filename; -mod global_layer_map; mod image_layer; mod inmemory_layer; -mod interval_tree; mod layer_map; pub mod metadata; mod par_fsync; mod storage_layer; -use delta_layer::DeltaLayer; +use delta_layer::{DeltaLayer, DeltaLayerWriter}; use ephemeral_file::is_ephemeral_file; use filename::{DeltaFileName, ImageFileName}; -use image_layer::ImageLayer; +use image_layer::{ImageLayer, ImageLayerWriter}; use inmemory_layer::InMemoryLayer; use layer_map::LayerMap; -use storage_layer::{ - Layer, PageReconstructData, 
PageReconstructResult, SegmentBlk, SegmentTag, RELISH_SEG_SIZE, -}; +use layer_map::SearchResult; +use storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; // re-export this function so that page_cache.rs can use it. pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; -static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); - // Metrics collected on operations on the storage repository. lazy_static! { static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( @@ -100,17 +95,6 @@ lazy_static! { .expect("failed to define a metric"); } -lazy_static! { - // NOTE: can be zero if pageserver was restarted and there hasn't been any - // activity yet. - static ref LOGICAL_TIMELINE_SIZE: IntGaugeVec = register_int_gauge_vec!( - "pageserver_logical_timeline_size", - "Logical timeline size (bytes)", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - /// Parts of the `.zenith/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; @@ -118,7 +102,7 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; /// Repository consists of multiple timelines. Keep them in a hash table. /// pub struct LayeredRepository { - conf: &'static PageServerConf, + pub conf: &'static PageServerConf, tenantid: ZTenantId, timelines: Mutex>, // This mutex prevents creation of new timelines during GC. @@ -135,21 +119,23 @@ pub struct LayeredRepository { remote_index: RemoteIndex, /// Makes every timeline to backup their files to remote storage. - upload_relishes: bool, + upload_layers: bool, } /// Public interface impl Repository for LayeredRepository { - fn get_timeline(&self, timelineid: ZTimelineId) -> Option { + type Timeline = LayeredTimeline; + + fn get_timeline(&self, timelineid: ZTimelineId) -> Option> { let timelines = self.timelines.lock().unwrap(); self.get_timeline_internal(timelineid, &timelines) .map(RepositoryTimeline::from) } - fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result> { + fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result> { let mut timelines = self.timelines.lock().unwrap(); match self.get_timeline_load_internal(timelineid, &mut timelines)? { - Some(local_loaded_timeline) => Ok(local_loaded_timeline as _), + Some(local_loaded_timeline) => Ok(local_loaded_timeline), None => anyhow::bail!( "cannot get local timeline: unknown timeline id: {}", timelineid @@ -157,7 +143,7 @@ impl Repository for LayeredRepository { } } - fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)> { + fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)> { self.timelines .lock() .unwrap() @@ -175,7 +161,7 @@ impl Repository for LayeredRepository { &self, timelineid: ZTimelineId, initdb_lsn: Lsn, - ) -> Result> { + ) -> Result> { let mut timelines = self.timelines.lock().unwrap(); // Create the timeline directory, and write initial metadata to file. @@ -191,9 +177,9 @@ impl Repository for LayeredRepository { timelineid, self.tenantid, Arc::clone(&self.walredo_mgr), - 0, - self.upload_relishes, + self.upload_layers, ); + timeline.layers.lock().unwrap().next_open_layer_at = Some(initdb_lsn); let timeline = Arc::new(timeline); let r = timelines.insert( @@ -282,13 +268,46 @@ impl Repository for LayeredRepository { }) } - fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()> { + fn compaction_iteration(&self) -> Result<()> { + // Scan through the hashmap and collect a list of all the timelines, + // while holding the lock. 
Then drop the lock and actually perform the + // compactions. We don't want to block everything else while the + // compaction runs. + let timelines = self.timelines.lock().unwrap(); + let timelines_to_compact = timelines + .iter() + .map(|(timelineid, timeline)| (*timelineid, timeline.clone())) + .collect::>(); + drop(timelines); + + for (timelineid, timeline) in &timelines_to_compact { + let _entered = + info_span!("compact", timeline = %timelineid, tenant = %self.tenantid).entered(); + match timeline { + LayeredTimelineEntry::Loaded(timeline) => { + timeline.compact()?; + } + LayeredTimelineEntry::Unloaded { .. } => { + debug!("Cannot compact remote timeline {}", timelineid) + } + } + } + + Ok(()) + } + + /// + /// Flush all in-memory data to disk. + /// + /// Used at shutdown. + /// + fn checkpoint(&self) -> Result<()> { // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // checkpoints. We don't want to block everything else while the // checkpoint runs. let timelines = self.timelines.lock().unwrap(); - let timelines_to_checkpoint = timelines + let timelines_to_compact = timelines .iter() // filter to get only loaded timelines .filter_map(|(timelineid, entry)| match entry { @@ -302,10 +321,10 @@ impl Repository for LayeredRepository { .collect::>(); drop(timelines); - for (timelineid, timeline) in &timelines_to_checkpoint { + for (timelineid, timeline) in &timelines_to_compact { let _entered = info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid).entered(); - timeline.checkpoint(cconf)?; + timeline.checkpoint(CheckpointConfig::Flush)?; } Ok(()) @@ -403,7 +422,7 @@ impl LayeredTimelineEntry { } } -impl From for RepositoryTimeline { +impl From for RepositoryTimeline { fn from(entry: LayeredTimelineEntry) -> Self { match entry { LayeredTimelineEntry::Loaded(timeline) => RepositoryTimeline::Loaded(timeline as _), @@ -489,20 +508,18 @@ impl LayeredRepository { let _enter = info_span!("loading timeline", timeline = %timelineid, tenant = %self.tenantid) .entered(); - let mut timeline = LayeredTimeline::new( + let timeline = LayeredTimeline::new( self.conf, metadata, ancestor, timelineid, self.tenantid, Arc::clone(&self.walredo_mgr), - 0, // init with 0 and update after layers are loaded, - self.upload_relishes, + self.upload_layers, ); timeline .load_layer_map(disk_consistent_lsn) .context("failed to load layermap")?; - timeline.init_current_logical_size()?; Ok(Arc::new(timeline)) } @@ -512,7 +529,7 @@ impl LayeredRepository { walredo_mgr: Arc, tenantid: ZTenantId, remote_index: RemoteIndex, - upload_relishes: bool, + upload_layers: bool, ) -> LayeredRepository { LayeredRepository { tenantid, @@ -521,7 +538,7 @@ impl LayeredRepository { gc_cs: Mutex::new(()), walredo_mgr, remote_index, - upload_relishes, + upload_layers, } } @@ -673,7 +690,8 @@ impl LayeredRepository { timeline.checkpoint(CheckpointConfig::Forced)?; info!("timeline {} checkpoint_before_gc done", timelineid); } - let result = timeline.gc_timeline(branchpoints, cutoff)?; + timeline.update_gc_info(branchpoints, cutoff); + let result = timeline.gc()?; totals += result; timelines = self.timelines.lock().unwrap(); @@ -693,6 +711,8 @@ pub struct LayeredTimeline { layers: Mutex, + last_freeze_at: AtomicLsn, + // WAL redo manager walredo_mgr: Arc, @@ -725,33 +745,14 @@ pub struct LayeredTimeline { ancestor_timeline: Option, ancestor_lsn: Lsn, - // this variable indicates how much space is used from user's point of view, - // 
e.g. we do not account here for multiple versions of data and so on. - // this is counted incrementally based on physical relishes (excluding FileNodeMap) - // current_logical_size is not stored no disk and initialized on timeline creation using - // get_current_logical_size_non_incremental in init_current_logical_size - // this is needed because when we save it in metadata it can become out of sync - // because current_logical_size is consistent on last_record_lsn, not ondisk_consistent_lsn - // NOTE: current_logical_size also includes size of the ancestor - current_logical_size: AtomicUsize, // bytes - - // To avoid calling .with_label_values and formatting the tenant and timeline IDs to strings - // every time the logical size is updated, keep a direct reference to the Gauge here. - // unfortunately it doesnt forward atomic methods like .fetch_add - // so use two fields: actual size and metric - // see https://github.com/zenithdb/zenith/issues/622 for discussion - // TODO: it is possible to combine these two fields into single one using custom metric which uses SeqCst - // ordering for its operations, but involves private modules, and macro trickery - current_logical_size_gauge: IntGauge, - // Metrics histograms reconstruct_time_histo: Histogram, - checkpoint_time_histo: Histogram, - flush_checkpoint_time_histo: Histogram, - forced_checkpoint_time_histo: Histogram, + flush_time_histo: Histogram, + compact_time_histo: Histogram, + create_images_time_histo: Histogram, /// If `true`, will backup its files that appear after each checkpointing to the remote storage. - upload_relishes: AtomicBool, + upload_layers: AtomicBool, /// Ensures layers aren't frozen by checkpointer between /// [`LayeredTimeline::get_layer_for_write`] and layer reads. @@ -760,15 +761,24 @@ pub struct LayeredTimeline { /// to avoid deadlock. write_lock: Mutex<()>, - // Prevent concurrent checkpoints. - // Checkpoints are normally performed by one thread. But checkpoint can also be manually requested by admin - // (that's used in tests), and shutdown also forces a checkpoint. These forced checkpoints run in a different thread - // and could be triggered at the same time as a normal checkpoint. - checkpoint_cs: Mutex<()>, + /// Used to ensure that there is only one thread + layer_flush_lock: Mutex<()>, + + // Prevent concurrent compactions. + // Compactions are normally performed by one thread. But compaction can also be manually + // requested by admin (that's used in tests). These forced compactions run in a different + // thread and could be triggered at the same time as a normal, timed compaction. + compaction_cs: Mutex<()>, // Needed to ensure that we can't create a branch at a point that was already garbage collected latest_gc_cutoff_lsn: RwLock, + // List of child timelines and their branch points. This is needed to avoid + // garbage collecting data that is still needed by the child timelines. + gc_info: RwLock, + + partitioning: RwLock>, + // It may change across major versions so for simplicity // keep it after running initdb for a timeline. // It is needed in checks when we want to error on some operations @@ -778,6 +788,28 @@ pub struct LayeredTimeline { initdb_lsn: Lsn, } +/// +/// Information about how much history needs to be retained, needed by +/// Garbage Collection. +/// +struct GcInfo { + /// Specific LSNs that are needed. + /// + /// Currently, this includes all points where child branches have + /// been forked off from. In the future, could also include + /// explicit user-defined snapshot points. 
+ retain_lsns: Vec, + + /// In addition to 'retain_lsns', keep everything newer than this + /// point. + /// + /// This is calculated by subtracting 'gc_horizon' setting from + /// last-record LSN + /// + /// FIXME: is this inclusive or exclusive? + cutoff: Lsn, +} + /// Public interface functions impl Timeline for LayeredTimeline { fn get_ancestor_lsn(&self) -> Lsn { @@ -815,162 +847,35 @@ impl Timeline for LayeredTimeline { self.latest_gc_cutoff_lsn.read().unwrap() } - /// Look up given page version. - fn get_page_at_lsn(&self, rel: RelishTag, rel_blknum: BlockNumber, lsn: Lsn) -> Result { - if !rel.is_blocky() && rel_blknum != 0 { - bail!( - "invalid request for block {} for non-blocky relish {}", - rel_blknum, - rel - ); - } - debug_assert!(lsn <= self.get_last_record_lsn()); - let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); - - if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - self.materialize_page(seg, seg_blknum, lsn, &*layer) - } else { - // FIXME: This can happen if PostgreSQL extends a relation but never writes - // the page. See https://github.com/zenithdb/zenith/issues/841 - // - // Would be nice to detect that situation better. - if seg.segno > 0 && self.get_rel_exists(rel, lsn)? { - warn!("Page {} blk {} at {} not found", rel, rel_blknum, lsn); - return Ok(ZERO_PAGE.clone()); - } - - bail!("segment {} not found at {}", rel, lsn); - } - } - - fn get_relish_size(&self, rel: RelishTag, lsn: Lsn) -> Result> { - if !rel.is_blocky() { - bail!( - "invalid get_relish_size request for non-blocky relish {}", - rel - ); - } + /// Look up the value with the given a key + fn get(&self, key: Key, lsn: Lsn) -> Result { debug_assert!(lsn <= self.get_last_record_lsn()); - let mut segno = 0; - loop { - let seg = SegmentTag { rel, segno }; - - let segsize; - if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - segsize = layer.get_seg_size(lsn)?; - trace!("get_seg_size: {} at {} -> {}", seg, lsn, segsize); - } else { - if segno == 0 { - return Ok(None); + // Check the page cache. We will get back the most recent page with lsn <= `lsn`. + // The cached image can be returned directly if there is no WAL between the cached image + // and requested LSN. The cached image can also be used to reduce the amount of WAL needed + // for redo. + let cached_page_img = match self.lookup_cached_page(&key, lsn) { + Some((cached_lsn, cached_img)) => { + match cached_lsn.cmp(&lsn) { + Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check + Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn } - segsize = 0; + Some((cached_lsn, cached_img)) } - - if segsize != RELISH_SEG_SIZE { - let result = segno * RELISH_SEG_SIZE + segsize; - return Ok(Some(result)); - } - segno += 1; - } - } - - fn get_rel_exists(&self, rel: RelishTag, lsn: Lsn) -> Result { - debug_assert!(lsn <= self.get_last_record_lsn()); - - let seg = SegmentTag { rel, segno: 0 }; - - let result = if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - layer.get_seg_exists(lsn)? 
- } else { - false + None => None, }; - trace!("get_rel_exists: {} at {} -> {}", rel, lsn, result); - Ok(result) - } - - fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result> { - let request_tag = RelTag { - spcnode, - dbnode, - relnode: 0, - forknum: 0, + let mut reconstruct_state = ValueReconstructState { + records: Vec::new(), + img: cached_page_img, }; - self.list_relishes(Some(request_tag), lsn) - } + self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; - fn list_nonrels(&self, lsn: Lsn) -> Result> { - info!("list_nonrels called at {}", lsn); - - self.list_relishes(None, lsn) - } - - fn list_relishes(&self, tag: Option, lsn: Lsn) -> Result> { - trace!("list_relishes called at {}", lsn); - debug_assert!(lsn <= self.get_last_record_lsn()); - - // List of all relishes along with a flag that marks if they exist at the given lsn. - let mut all_relishes_map: HashMap = HashMap::new(); - let mut result = HashSet::new(); - let mut timeline = self; - - // Iterate through layers back in time and find the most - // recent state of the relish. Don't add relish to the list - // if newer version is already there. - // - // This most recent version can represent dropped or existing relish. - // We will filter dropped relishes below. - // - loop { - let rels = timeline.layers.lock().unwrap().list_relishes(tag, lsn)?; - - for (&new_relish, &new_relish_exists) in rels.iter() { - match all_relishes_map.entry(new_relish) { - Entry::Occupied(o) => { - trace!( - "Newer version of the object {} is already found: exists {}", - new_relish, - o.get(), - ); - } - Entry::Vacant(v) => { - v.insert(new_relish_exists); - trace!( - "Newer version of the object {} NOT found. Insert NEW: exists {}", - new_relish, - new_relish_exists - ); - } - } - } - - match &timeline.ancestor_timeline { - None => break, - Some(ancestor_entry) => { - timeline = ancestor_entry.ensure_loaded().with_context( - || format!( - "cannot list relishes for timeline {} tenant {} due to its ancestor {} being either unloaded", - self.timelineid, self.tenantid, ancestor_entry.timeline_id(), - ) - )?; - continue; - } - } - } - - // Filter out dropped relishes - for (&new_relish, &new_relish_exists) in all_relishes_map.iter() { - if new_relish_exists { - result.insert(new_relish); - trace!("List object {}", new_relish); - } else { - trace!("Filtered out dropped object {}", new_relish); - } - } - - Ok(result) + self.reconstruct_time_histo + .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) } /// Public entry point for checkpoint(). All the logic is in the private @@ -978,15 +883,15 @@ impl Timeline for LayeredTimeline { /// metrics collection. 
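    /// (With this patch the internal logic changes: `CheckpointConfig::Flush`
    /// freezes the open in-memory layer and flushes the frozen layers to disk,
    /// while `CheckpointConfig::Forced` additionally runs compaction; see the
    /// match arms below.)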
fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { match cconf { - CheckpointConfig::Flush => self - .flush_checkpoint_time_histo - .observe_closure_duration(|| self.checkpoint_internal(0, false)), - CheckpointConfig::Forced => self - .forced_checkpoint_time_histo - .observe_closure_duration(|| self.checkpoint_internal(0, true)), - CheckpointConfig::Distance(distance) => self - .checkpoint_time_histo - .observe_closure_duration(|| self.checkpoint_internal(distance, true)), + CheckpointConfig::Flush => { + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true) + } + CheckpointConfig::Forced => { + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true)?; + self.compact() + } } } @@ -1019,51 +924,24 @@ impl Timeline for LayeredTimeline { self.last_record_lsn.load() } - fn get_current_logical_size(&self) -> usize { - self.current_logical_size.load(atomic::Ordering::Acquire) as usize - } - - fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { - let mut total_blocks: usize = 0; - - let _enter = info_span!("calc logical size", %lsn).entered(); - - // list of all relations in this timeline, including ancestor timelines - let all_rels = self.list_rels(0, 0, lsn)?; - - for rel in all_rels { - if let Some(size) = self.get_relish_size(rel, lsn)? { - total_blocks += size as usize; - } - } - - let non_rels = self.list_nonrels(lsn)?; - for non_rel in non_rels { - // TODO support TwoPhase - if matches!(non_rel, RelishTag::Slru { slru: _, segno: _ }) { - if let Some(size) = self.get_relish_size(non_rel, lsn)? { - total_blocks += size as usize; - } - } - } - - Ok(total_blocks * BLCKSZ as usize) - } - fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() } + fn hint_partitioning(&self, partitioning: KeyPartitioning, lsn: Lsn) -> Result<()> { + self.partitioning + .write() + .unwrap() + .replace((partitioning, lsn)); + Ok(()) + } + fn writer<'a>(&'a self) -> Box { Box::new(LayeredTimelineWriter { tl: self, _write_guard: self.write_lock.lock().unwrap(), }) } - - fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline { - self - } } impl LayeredTimeline { @@ -1078,32 +956,28 @@ impl LayeredTimeline { timelineid: ZTimelineId, tenantid: ZTenantId, walredo_mgr: Arc, - current_logical_size: usize, - upload_relishes: bool, + upload_layers: bool, ) -> LayeredTimeline { - let current_logical_size_gauge = LOGICAL_TIMELINE_SIZE - .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) - .unwrap(); let reconstruct_time_histo = RECONSTRUCT_TIME .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) .unwrap(); - let checkpoint_time_histo = STORAGE_TIME + let flush_time_histo = STORAGE_TIME .get_metric_with_label_values(&[ - "checkpoint", + "layer flush", &tenantid.to_string(), &timelineid.to_string(), ]) .unwrap(); - let flush_checkpoint_time_histo = STORAGE_TIME + let compact_time_histo = STORAGE_TIME .get_metric_with_label_values(&[ - "flush checkpoint", + "compact", &tenantid.to_string(), &timelineid.to_string(), ]) .unwrap(); - let forced_checkpoint_time_histo = STORAGE_TIME + let create_images_time_histo = STORAGE_TIME .get_metric_with_label_values(&[ - "forced checkpoint", + "create images", &tenantid.to_string(), &timelineid.to_string(), ]) @@ -1124,18 +998,27 @@ impl LayeredTimeline { }), disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), + last_freeze_at: AtomicLsn::new(0), + ancestor_timeline: ancestor, ancestor_lsn: 
metadata.ancestor_lsn(), - current_logical_size: AtomicUsize::new(current_logical_size), - current_logical_size_gauge, + reconstruct_time_histo, - checkpoint_time_histo, - flush_checkpoint_time_histo, - forced_checkpoint_time_histo, - upload_relishes: AtomicBool::new(upload_relishes), + flush_time_histo, + compact_time_histo, + create_images_time_histo, + + upload_layers: AtomicBool::new(upload_layers), write_lock: Mutex::new(()), - checkpoint_cs: Mutex::new(()), + layer_flush_lock: Mutex::new(()), + compaction_cs: Mutex::new(()), + + gc_info: RwLock::new(GcInfo { + retain_lsns: Vec::new(), + cutoff: Lsn(0), + }), + partitioning: RwLock::new(None), latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -1179,13 +1062,12 @@ impl LayeredTimeline { num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. - ensure!(deltafilename.start_lsn < deltafilename.end_lsn); // The end-LSN is exclusive, while disk_consistent_lsn is // inclusive. For example, if disk_consistent_lsn is 100, it is // OK for a delta layer to have end LSN 101, but if the end LSN // is 102, then it might not have been fully flushed to disk // before crash. - if deltafilename.end_lsn > disk_consistent_lsn + 1 { + if deltafilename.lsn_range.end > disk_consistent_lsn + 1 { warn!( "found future delta layer {} on timeline {} disk_consistent_lsn is {}", deltafilename, self.timelineid, disk_consistent_lsn @@ -1212,41 +1094,14 @@ impl LayeredTimeline { } } - info!("loaded layer map with {} layers", num_layers); + layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); - Ok(()) - } - - /// - /// Used to init current logical size on startup - /// - fn init_current_logical_size(&mut self) -> Result<()> { - if self.current_logical_size.load(atomic::Ordering::Relaxed) != 0 { - bail!("cannot init already initialized current logical size") - }; - let lsn = self.get_last_record_lsn(); - self.current_logical_size = - AtomicUsize::new(self.get_current_logical_size_non_incremental(lsn)?); - trace!( - "current_logical_size initialized to {}", - self.current_logical_size.load(atomic::Ordering::Relaxed) + info!( + "loaded layer map with {} layers at {}", + num_layers, disk_consistent_lsn ); - Ok(()) - } - /// - /// Get a handle to a Layer for reading. - /// - /// The returned Layer might be from an ancestor timeline, if the - /// segment hasn't been updated on this timeline yet. - /// - fn get_layer_for_read( - &self, - seg: SegmentTag, - lsn: Lsn, - ) -> Result, Lsn)>> { - let self_layers = self.layers.lock().unwrap(); - self.get_layer_for_read_locked(seg, lsn, &self_layers) + Ok(()) } /// @@ -1257,88 +1112,160 @@ impl LayeredTimeline { /// /// This function takes the current timeline's locked LayerMap as an argument, /// so callers can avoid potential race conditions. - fn get_layer_for_read_locked( + fn get_reconstruct_data( &self, - seg: SegmentTag, - lsn: Lsn, - self_layers: &MutexGuard, - ) -> anyhow::Result, Lsn)>> { - trace!("get_layer_for_read called for {} at {}", seg, lsn); - - // If you requested a page at an older LSN, before the branch point, dig into - // the right ancestor timeline. This can only happen if you launch a read-only - // node with an old LSN, a primary always uses a recent LSN in its requests. + key: Key, + request_lsn: Lsn, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result<()> { + // Start from the current timeline. 
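+        // (Overview of the search below: walk backwards through the in-memory
+        // and on-disk layers, lowering 'cont_lsn' at each step, and recurse
+        // into the ancestor timeline once 'cont_lsn' drops to the branch
+        // point; 'prev_lsn' only exists to detect lack of progress and break
+        // infinite loops.)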
+ let mut timeline_owned; let mut timeline = self; - let mut lsn = lsn; - while lsn < timeline.ancestor_lsn { - trace!("going into ancestor {} ", timeline.ancestor_lsn); - timeline = timeline - .ancestor_timeline - .as_ref() - .expect("there should be an ancestor") - .ensure_loaded() - .with_context(|| format!( - "Cannot get the whole layer for read locked: timeline {} is not present locally", - self.get_ancestor_timeline_id().unwrap()) - )?; - } + let mut path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); - // Now we have the right starting timeline for our search. - loop { - let layers_owned: MutexGuard; - let layers = if self as *const LayeredTimeline != timeline as *const LayeredTimeline { - layers_owned = timeline.layers.lock().unwrap(); - &layers_owned - } else { - self_layers - }; + // 'prev_lsn' tracks the last LSN that we were at in our search. It's used + // to check that each iteration make some progress, to break infinite + // looping if something goes wrong. + let mut prev_lsn = Lsn(u64::MAX); - // - // FIXME: If the relation has been dropped, does this return the right - // thing? The compute node should not normally request dropped relations, - // but if OID wraparound happens the same relfilenode might get reused - // for an unrelated relation. - // + let mut result = ValueReconstructResult::Continue; + let mut cont_lsn = Lsn(request_lsn.0 + 1); - // Do we have a layer on this timeline? - if let Some(layer) = layers.get(&seg, lsn) { - trace!( - "found layer in cache: {} {}-{}", - timeline.timelineid, - layer.get_start_lsn(), - layer.get_end_lsn() - ); + 'outer: loop { + // The function should have updated 'state' + //info!("CALLED for {} at {}: {:?} with {} records", reconstruct_state.key, reconstruct_state.lsn, result, reconstruct_state.records.len()); + match result { + ValueReconstructResult::Complete => return Ok(()), + ValueReconstructResult::Continue => { + if prev_lsn <= cont_lsn { + // Didn't make any progress in last iteration. Error out to avoid + // getting stuck in the loop. - ensure!(layer.get_start_lsn() <= lsn); - - if layer.is_dropped() && layer.get_end_lsn() <= lsn { - return Ok(None); + // For debugging purposes, print the path of layers that we traversed + // through. 
+ for (r, c, l) in path { + error!( + "PATH: result {:?}, cont_lsn {}, layer: {}", + r, + c, + l.filename().display() + ); + } + bail!("could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", + key, + Lsn(cont_lsn.0 - 1), + request_lsn, + timeline.ancestor_lsn) + } + prev_lsn = cont_lsn; + } + ValueReconstructResult::Missing => { + bail!( + "could not find data for key {} at LSN {}, for request at LSN {}", + key, + cont_lsn, + request_lsn + ) } - - return Ok(Some((layer.clone(), lsn))); } - // If not, check if there's a layer on the ancestor timeline - match &timeline.ancestor_timeline { - Some(ancestor_entry) => { - let ancestor = ancestor_entry - .ensure_loaded() - .context("cannot get a layer for read from ancestor because it is either remote or unloaded")?; - lsn = timeline.ancestor_lsn; - timeline = ancestor; - trace!("recursing into ancestor at {}/{}", timeline.timelineid, lsn); + // Recurse into ancestor if needed + if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { + trace!( + "going into ancestor {}, cont_lsn is {}", + timeline.ancestor_lsn, + cont_lsn + ); + let ancestor = timeline.get_ancestor_timeline()?; + timeline_owned = ancestor; + timeline = &*timeline_owned; + prev_lsn = Lsn(u64::MAX); + continue; + } + + let layers = timeline.layers.lock().unwrap(); + + // Check the open and frozen in-memory layers first + if let Some(open_layer) = &layers.open_layer { + let start_lsn = open_layer.get_lsn_range().start; + if cont_lsn > start_lsn { + //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); + result = open_layer.get_value_reconstruct_data( + key, + open_layer.get_lsn_range().start..cont_lsn, + reconstruct_state, + )?; + cont_lsn = start_lsn; + path.push((result, cont_lsn, open_layer.clone())); continue; } - None => return Ok(None), + } + for frozen_layer in layers.frozen_layers.iter() { + let start_lsn = frozen_layer.get_lsn_range().start; + if cont_lsn > start_lsn { + //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); + result = frozen_layer.get_value_reconstruct_data( + key, + frozen_layer.get_lsn_range().start..cont_lsn, + reconstruct_state, + )?; + cont_lsn = start_lsn; + path.push((result, cont_lsn, frozen_layer.clone())); + continue 'outer; + } + } + + if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? { + //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); + + result = layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + )?; + cont_lsn = lsn_floor; + path.push((result, cont_lsn, layer)); + } else if self.ancestor_timeline.is_some() { + // Nothing on this timeline. Traverse to parent + result = ValueReconstructResult::Continue; + cont_lsn = Lsn(self.ancestor_lsn.0 + 1); + } else { + // Nothing found + result = ValueReconstructResult::Missing; } } } + fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> { + let cache = page_cache::get(); + + // FIXME: It's pointless to check the cache for things that are not 8kB pages. 
+ // We should look at the key to determine if it's a cacheable object + let (lsn, read_guard) = + cache.lookup_materialized_page(self.tenantid, self.timelineid, key, lsn)?; + let img = Bytes::from(read_guard.to_vec()); + Some((lsn, img)) + } + + fn get_ancestor_timeline(&self) -> Result> { + let ancestor = self + .ancestor_timeline + .as_ref() + .expect("there should be an ancestor") + .ensure_loaded() + .with_context(|| { + format!( + "Cannot get the whole layer for read locked: timeline {} is not present locally", + self.get_ancestor_timeline_id().unwrap()) + })?; + Ok(Arc::clone(ancestor)) + } + /// /// Get a handle to the latest layer for appending. /// - fn get_layer_for_write(&self, seg: SegmentTag, lsn: Lsn) -> anyhow::Result> { + fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { let mut layers = self.layers.lock().unwrap(); ensure!(lsn.is_aligned()); @@ -1353,235 +1280,191 @@ impl LayeredTimeline { // Do we have a layer open for writing already? let layer; - if let Some(open_layer) = layers.get_open(&seg) { - if open_layer.get_start_lsn() > lsn { + if let Some(open_layer) = &layers.open_layer { + if open_layer.get_lsn_range().start > lsn { bail!("unexpected open layer in the future"); } - // Open layer exists, but it is dropped, so create a new one. - if open_layer.is_dropped() { - ensure!(!open_layer.is_writeable()); - // Layer that is created after dropped one represents a new relish segment. - trace!( - "creating layer for write for new relish segment after dropped layer {} at {}/{}", - seg, - self.timelineid, - lsn - ); - - layer = InMemoryLayer::create( - self.conf, - self.timelineid, - self.tenantid, - seg, - lsn, - last_record_lsn, - )?; - } else { - return Ok(open_layer); - } - } - // No writeable layer for this relation. Create one. - // - // Is this a completely new relation? Or the first modification after branching? - // - else if let Some((prev_layer, _prev_lsn)) = - self.get_layer_for_read_locked(seg, lsn, &layers)? - { - // Create new entry after the previous one. - let start_lsn; - if prev_layer.get_timeline_id() != self.timelineid { - // First modification on this timeline - start_lsn = self.ancestor_lsn + 1; - trace!( - "creating layer for write for {} at branch point {}", - seg, - start_lsn - ); - } else { - start_lsn = prev_layer.get_end_lsn(); - trace!( - "creating layer for write for {} after previous layer {}", - seg, - start_lsn - ); - } - trace!( - "prev layer is at {}/{} - {}", - prev_layer.get_timeline_id(), - prev_layer.get_start_lsn(), - prev_layer.get_end_lsn() - ); - layer = InMemoryLayer::create_successor_layer( - self.conf, - prev_layer, - self.timelineid, - self.tenantid, - start_lsn, - last_record_lsn, - )?; + layer = Arc::clone(open_layer); } else { - // New relation. + // No writeable layer yet. Create one. 
+ let start_lsn = layers.next_open_layer_at.unwrap(); + trace!( - "creating layer for write for new rel {} at {}/{}", - seg, + "creating layer for write at {}/{} for record at {}", self.timelineid, + start_lsn, lsn ); + let new_layer = + InMemoryLayer::create(self.conf, self.timelineid, self.tenantid, start_lsn)?; + let layer_rc = Arc::new(new_layer); - layer = InMemoryLayer::create( - self.conf, - self.timelineid, - self.tenantid, - seg, - lsn, - last_record_lsn, - )?; + layers.open_layer = Some(Arc::clone(&layer_rc)); + layers.next_open_layer_at = None; + + layer = layer_rc; } + Ok(layer) + } - let layer_rc: Arc = Arc::new(layer); - layers.insert_open(Arc::clone(&layer_rc)); + fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> { + //info!("PUT: key {} at {}", key, lsn); + let layer = self.get_layer_for_write(lsn)?; + layer.put_value(key, lsn, val)?; + Ok(()) + } - Ok(layer_rc) + fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> Result<()> { + let layer = self.get_layer_for_write(lsn)?; + layer.put_tombstone(key_range, lsn)?; + + Ok(()) + } + + fn finish_write(&self, new_lsn: Lsn) { + assert!(new_lsn.is_aligned()); + + self.last_record_lsn.advance(new_lsn); + } + + fn freeze_inmem_layer(&self, write_lock_held: bool) { + // Freeze the current open in-memory layer. It will be written to disk on next + // iteration. + let _write_guard = if write_lock_held { + None + } else { + Some(self.write_lock.lock().unwrap()) + }; + let mut layers = self.layers.lock().unwrap(); + if let Some(open_layer) = &layers.open_layer { + let open_layer_rc = Arc::clone(open_layer); + // Does this layer need freezing? + let end_lsn = Lsn(self.get_last_record_lsn().0 + 1); + open_layer.freeze(end_lsn); + + // The layer is no longer open, update the layer map to reflect this. + // We will replace it with on-disk historics below. + layers.frozen_layers.push_back(open_layer_rc); + layers.open_layer = None; + layers.next_open_layer_at = Some(end_lsn); + self.last_freeze_at.store(end_lsn); + } + drop(layers); } /// - /// Flush to disk all data that was written with the put_* functions + /// Check if more than 'checkpoint_distance' of WAL has been accumulated + /// in the in-memory layer, and initiate flushing it if so. /// - /// NOTE: This has nothing to do with checkpoint in PostgreSQL. 
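
The open/frozen bookkeeping maintained by get_layer_for_write and freeze_inmem_layer can be sketched with a small toy model. The names and types below are invented for illustration; this is not the real LayerMap.

    struct OpenLayerState {
        open_start_lsn: Option<u64>,     // start LSN of the currently open layer, if any
        frozen: Vec<(u64, u64)>,         // frozen layers as (start_lsn, end_lsn) ranges
        next_open_layer_at: Option<u64>, // where the next open layer will begin
    }

    impl OpenLayerState {
        // Reuse the open layer if there is one, otherwise open a new one at the
        // remembered start LSN.
        fn layer_start_for_write(&mut self, lsn: u64) -> Result<u64, String> {
            match (self.open_start_lsn, self.next_open_layer_at) {
                (Some(start), _) if start <= lsn => Ok(start),
                (Some(start), _) => Err(format!("open layer at {:#x} is in the future of {:#x}", start, lsn)),
                (None, Some(next)) => {
                    self.open_start_lsn = Some(next);
                    self.next_open_layer_at = None;
                    Ok(next)
                }
                (None, None) => Err("no open layer and no next_open_layer_at".to_string()),
            }
        }

        // Freeze the open layer at 'end_lsn'; the next open layer starts there.
        fn freeze(&mut self, end_lsn: u64) {
            if let Some(start) = self.open_start_lsn.take() {
                self.frozen.push((start, end_lsn));
                self.next_open_layer_at = Some(end_lsn);
            }
        }
    }

    fn main() {
        let mut state = OpenLayerState { open_start_lsn: None, frozen: Vec::new(), next_open_layer_at: Some(0x10) };
        assert_eq!(state.layer_start_for_write(0x18), Ok(0x10)); // opens a new layer at 0x10
        state.freeze(0x20);                                      // [0x10, 0x20) becomes frozen
        assert_eq!(state.layer_start_for_write(0x28), Ok(0x20)); // next open layer starts at 0x20
    }

In the patch itself, freezing records the frozen layer's end LSN in next_open_layer_at, so the next write opens a layer that starts exactly where the previous one ended.
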
- fn checkpoint_internal(&self, checkpoint_distance: u64, reconstruct_pages: bool) -> Result<()> { - // Prevent concurrent checkpoints - let _checkpoint_cs = self.checkpoint_cs.lock().unwrap(); - let write_guard = self.write_lock.lock().unwrap(); - let mut layers = self.layers.lock().unwrap(); + pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { + let last_lsn = self.get_last_record_lsn(); - // Bump the generation number in the layer map, so that we can distinguish - // entries inserted after the checkpoint started - let current_generation = layers.increment_generation(); + let distance = last_lsn.widening_sub(self.last_freeze_at.load()); + if distance >= self.conf.checkpoint_distance.into() { + self.freeze_inmem_layer(true); + self.last_freeze_at.store(last_lsn); + } + if let Ok(guard) = self.layer_flush_lock.try_lock() { + drop(guard); + let self_clone = Arc::clone(self); + thread_mgr::spawn( + thread_mgr::ThreadKind::LayerFlushThread, + Some(self.tenantid), + Some(self.timelineid), + "layer flush thread", + false, + move || self_clone.flush_frozen_layers(false), + )?; + } + Ok(()) + } - let RecordLsn { - last: last_record_lsn, - prev: prev_record_lsn, - } = self.last_record_lsn.load(); + /// Flush all frozen layers to disk. + /// + /// Only one thread at a time can be doing layer-flushing for a + /// given timeline. If 'wait' is true, and another thread is + /// currently doing the flushing, this function will wait for it + /// to finish. If 'wait' is false, this function will return + /// immediately instead. + fn flush_frozen_layers(&self, wait: bool) -> Result<()> { + let flush_lock_guard = if wait { + self.layer_flush_lock.lock().unwrap() + } else { + match self.layer_flush_lock.try_lock() { + Ok(guard) => guard, + Err(TryLockError::WouldBlock) => return Ok(()), + Err(TryLockError::Poisoned(err)) => panic!("{:?}", err), + } + }; - trace!("checkpoint starting at {}", last_record_lsn); + let timer = self.flush_time_histo.start_timer(); - // Take the in-memory layer with the oldest WAL record. If it's older - // than the threshold, write it out to disk as a new image and delta file. - // Repeat until all remaining in-memory layers are within the threshold. - // - // That's necessary to limit the amount of WAL that needs to be kept - // in the safekeepers, and that needs to be reprocessed on page server - // crash. TODO: It's not a great policy for keeping memory usage in - // check, though. We should also aim at flushing layers that consume - // a lot of memory and/or aren't receiving much updates anymore. - let mut disk_consistent_lsn = last_record_lsn; - - let mut layer_paths = Vec::new(); - let mut freeze_end_lsn = Lsn(0); - let mut evicted_layers = Vec::new(); - - // - // Determine which layers we need to evict and calculate max(latest_lsn) - // among those layers. - // - while let Some((oldest_layer_id, oldest_layer, oldest_generation)) = - layers.peek_oldest_open() - { - let oldest_lsn = oldest_layer.get_oldest_lsn(); - // Does this layer need freezing? - // - // Write out all in-memory layers that contain WAL older than CHECKPOINT_DISTANCE. - // If we reach a layer with the same - // generation number, we know that we have cycled through all layers that were open - // when we started. We don't want to process layers inserted after we started, to - // avoid getting into an infinite loop trying to process again entries that we - // inserted ourselves. 
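
The wait/try_lock handling in flush_frozen_layers follows a common "skip the work if another thread is already doing it" pattern. A minimal self-contained version, with a plain Mutex<()> standing in for layer_flush_lock and an invented function name, looks like this:

    use std::sync::{Mutex, TryLockError};

    fn flush_if_idle(flush_lock: &Mutex<()>, wait: bool) -> Result<(), String> {
        let _guard = if wait {
            flush_lock.lock().map_err(|e| e.to_string())?
        } else {
            match flush_lock.try_lock() {
                Ok(guard) => guard,
                Err(TryLockError::WouldBlock) => return Ok(()), // another thread is already flushing
                Err(TryLockError::Poisoned(e)) => return Err(e.to_string()),
            }
        };
        // ... flush frozen layers while holding the guard ...
        Ok(())
    }

    fn main() {
        let lock = Mutex::new(());
        assert!(flush_if_idle(&lock, false).is_ok());
    }

With wait set to false, a caller such as check_checkpoint_distance can request a flush opportunistically without ever blocking the WAL ingestion path.
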
- // - // Once we have decided to write out at least one layer, we must also write out - // any other layers that contain WAL older than the end LSN of the layers we have - // already decided to write out. In other words, we must write out all layers - // whose [oldest_lsn, latest_lsn) range overlaps with any of the other layers - // that we are writing out. Otherwise, when we advance 'disk_consistent_lsn', it's - // ambiguous whether those layers are already durable on disk or not. For example, - // imagine that there are two layers in memory that contain page versions in the - // following LSN ranges: - // - // A: 100-150 - // B: 110-200 - // - // If we flush layer A, we must also flush layer B, because they overlap. If we - // flushed only A, and advanced 'disk_consistent_lsn' to 150, we would break the - // rule that all WAL older than 'disk_consistent_lsn' are durable on disk, because - // B contains some WAL older than 150. On the other hand, if we flushed out A and - // advanced 'disk_consistent_lsn' only up to 110, after crash and restart we would - // delete the first layer because its end LSN is larger than 110. If we changed - // the deletion logic to not delete it, then we would start streaming at 110, and - // process again the WAL records in the range 110-150 that are already in layer A, - // and the WAL processing code does not cope with that. We solve that dilemma by - // insisting that if we write out the first layer, we also write out the second - // layer, and advance disk_consistent_lsn all the way up to 200. - // - let distance = last_record_lsn.widening_sub(oldest_lsn); - if (distance < 0 - || distance < checkpoint_distance.into() - || oldest_generation == current_generation) - && oldest_lsn >= freeze_end_lsn - // this layer intersects with evicted layer and so also need to be evicted - { - debug!( - "the oldest layer is now {} which is {} bytes behind last_record_lsn", - oldest_layer.filename().display(), - distance - ); - disk_consistent_lsn = oldest_lsn; + loop { + let layers = self.layers.lock().unwrap(); + if let Some(frozen_layer) = layers.frozen_layers.front() { + let frozen_layer = Arc::clone(frozen_layer); + drop(layers); // to allow concurrent reads and writes + self.flush_frozen_layer(frozen_layer)?; + } else { + // Drop the 'layer_flush_lock' *before* 'layers'. That + // way, if you freeze a layer, and then call + // flush_frozen_layers(false), it is guaranteed that + // if another thread was busy flushing layers and the + // call therefore returns immediately, the other + // thread will have seen the newly-frozen layer and + // will flush that too (assuming no errors). + drop(flush_lock_guard); + drop(layers); break; } - let latest_lsn = oldest_layer.get_latest_lsn(); - if latest_lsn > freeze_end_lsn { - freeze_end_lsn = latest_lsn; // calculate max of latest_lsn of the layers we're about to evict - } - layers.remove_open(oldest_layer_id); - evicted_layers.push((oldest_layer_id, oldest_layer)); } - // Freeze evicted layers - for (_evicted_layer_id, evicted_layer) in evicted_layers.iter() { - // Mark the layer as no longer accepting writes and record the end_lsn. - // This happens in-place, no new layers are created now. - evicted_layer.freeze(freeze_end_lsn); - layers.insert_historic(evicted_layer.clone()); + timer.stop_and_record(); + + Ok(()) + } + + /// Flush one frozen in-memory layer to disk, as a new delta layer. 
+ fn flush_frozen_layer(&self, frozen_layer: Arc) -> Result<()> { + let new_delta = frozen_layer.write_to_disk()?; + let new_delta_path = new_delta.path(); + + // Sync the new layer to disk. + // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // TODO: If we're running inside 'flush_frozen_layers' and there are multiple + // files to flush, it might be better to first write them all, and then fsync + // them all in parallel. + par_fsync::par_fsync(&[ + new_delta_path.clone(), + self.conf.timeline_path(&self.timelineid, &self.tenantid), + ])?; + + // Finally, replace the frozen in-memory layer with the new on-disk layers + { + let mut layers = self.layers.lock().unwrap(); + let l = layers.frozen_layers.pop_front(); + + // Only one thread may call this function at a time (for this + // timeline). If two threads tried to flush the same frozen + // layer to disk at the same time, that would not work. + assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer)); + + // Add the new delta layer to the LayerMap + layers.insert_historic(Arc::new(new_delta)); + + // release lock on 'layers' } - // Call unload() on all frozen layers, to release memory. - // This shouldn't be much memory, as only metadata is slurped - // into memory. - for layer in layers.iter_historic_layers() { - layer.unload()?; - } - - drop(layers); - drop(write_guard); - - // Create delta/image layers for evicted layers - for (_evicted_layer_id, evicted_layer) in evicted_layers.iter() { - let mut this_layer_paths = - self.evict_layer(evicted_layer.clone(), reconstruct_pages)?; - layer_paths.append(&mut this_layer_paths); - } - - // Sync layers - if !layer_paths.is_empty() { - // We must fsync the timeline dir to ensure the directory entries for - // new layer files are durable - layer_paths.push(self.conf.timeline_path(&self.timelineid, &self.tenantid)); - - // Fsync all the layer files and directory using multiple threads to - // minimize latency. - par_fsync::par_fsync(&layer_paths)?; - - layer_paths.pop().unwrap(); - } + // Update the metadata file, with new 'disk_consistent_lsn' + // + // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing + // *all* the layers, to avoid fsyncing the file multiple times. + let disk_consistent_lsn; + disk_consistent_lsn = Lsn(frozen_layer.get_lsn_range().end.0 - 1); // If we were able to advance 'disk_consistent_lsn', save it the metadata file. // After crash, we will restart WAL streaming and processing from that point. @@ -1595,6 +1478,10 @@ impl LayeredTimeline { // don't remember what the correct value that corresponds to some old // LSN is. But if we flush everything, then the value corresponding // current 'last_record_lsn' is correct and we can store it on disk. 
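
The durability rule enforced here is that both the new layer file and the timeline directory entry must reach disk before the metadata may advance disk_consistent_lsn past the layer's end LSN. Reduced to a sketch, with sync_layer_and_dir as an invented helper (the patch itself uses par_fsync::par_fsync):

    use std::fs::File;
    use std::path::Path;

    fn sync_layer_and_dir(layer_path: &Path, timeline_dir: &Path) -> std::io::Result<()> {
        File::open(layer_path)?.sync_all()?;   // flush the layer file contents
        File::open(timeline_dir)?.sync_all()?; // flush the directory entry (works on Linux)
        Ok(())
    }
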
+ let RecordLsn { + last: last_record_lsn, + prev: prev_record_lsn, + } = self.last_record_lsn.load(); let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn { Some(prev_record_lsn) } else { @@ -1615,6 +1502,11 @@ impl LayeredTimeline { self.initdb_lsn, ); + fail_point!("checkpoint-before-saving-metadata", |x| bail!( + "{}", + x.unwrap() + )); + LayeredRepository::save_metadata( self.conf, self.timelineid, @@ -1622,11 +1514,11 @@ impl LayeredTimeline { &metadata, false, )?; - if self.upload_relishes.load(atomic::Ordering::Relaxed) { + if self.upload_layers.load(atomic::Ordering::Relaxed) { schedule_timeline_checkpoint_upload( self.tenantid, self.timelineid, - layer_paths, + vec![new_delta_path], metadata, ); } @@ -1638,34 +1530,273 @@ impl LayeredTimeline { Ok(()) } - fn evict_layer( - &self, - layer: Arc, - reconstruct_pages: bool, - ) -> Result> { - let new_historics = layer.write_to_disk(self, reconstruct_pages)?; + pub fn compact(&self) -> Result<()> { + // + // High level strategy for compaction / image creation: + // + // 1. First, calculate the desired "partitioning" of the + // currently in-use key space. The goal is to partition the + // key space into roughly fixed-size chunks, but also take into + // account any existing image layers, and try to align the + // chunk boundaries with the existing image layers to avoid + // too much churn. Also try to align chunk boundaries with + // relation boundaries. In principle, we don't know about + // relation boundaries here, we just deal with key-value + // pairs, and the code in pgdatadir_mapping.rs knows how to + // map relations into key-value pairs. But in practice we know + // that 'field6' is the block number, and the fields 1-5 + // identify a relation. This is just an optimization, + // though. + // + // 2. Once we know the partitioning, for each partition, + // decide if it's time to create a new image layer. The + // criteria is: there has been too much "churn" since the last + // image layer? The "churn" is fuzzy concept, it's a + // combination of too many delta files, or too much WAL in + // total in the delta file. Or perhaps: if creating an image + // file would allow to delete some older files. + // + // 3. After that, we compact all level0 delta files if there + // are too many of them. While compacting, we also garbage + // collect any page versions that are no longer needed because + // of the new image layers we created in step 2. + // + // TODO: This hight level strategy hasn't been implemented yet. + // Below are functions compact_level0() and create_image_layers() + // but they are a bit ad hoc and don't quite work like it's explained + // above. Rewrite it. + let _compaction_cs = self.compaction_cs.lock().unwrap(); - let mut layer_paths = Vec::new(); - let _write_guard = self.write_lock.lock().unwrap(); - let mut layers = self.layers.lock().unwrap(); + let target_file_size = self.conf.checkpoint_distance; - // Finally, replace the frozen in-memory layer with the new on-disk layers - layers.remove_historic(layer); + // 1. The partitioning was already done by the code in + // pgdatadir_mapping.rs. We just use it here. + let partitioning_guard = self.partitioning.read().unwrap(); + if let Some((partitioning, lsn)) = partitioning_guard.as_ref() { + let timer = self.create_images_time_histo.start_timer(); + // Make a copy of the partitioning, so that we can release + // the lock. Otherwise we could block the WAL receiver. 
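
Condensed, the strategy in the comment above comes down to two thresholds: re-image a partition once enough deltas have piled up on top of its latest image, and reshuffle the level-0 deltas once enough of them have accumulated. The sketch below uses placeholder closures rather than the patch's real interfaces; the thresholds 3 and 10 match the values used by compact().

    fn compact_sketch(
        partitions: &[std::ops::Range<u32>],
        deltas_on_top_of_image: impl Fn(&std::ops::Range<u32>) -> usize,
        mut create_image_layer: impl FnMut(&std::ops::Range<u32>),
        level0_delta_count: usize,
        mut compact_level0: impl FnMut(),
    ) {
        // Step 2: re-image partitions with too much churn since their last image.
        for part in partitions {
            if deltas_on_top_of_image(part) >= 3 {
                create_image_layer(part);
            }
        }
        // Step 3: reshuffle level-0 deltas once too many have accumulated.
        if level0_delta_count >= 10 {
            compact_level0();
        }
    }
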
+ let lsn = *lsn; + let parts = partitioning.parts.clone(); + drop(partitioning_guard); - // Add the historics to the LayerMap - for delta_layer in new_historics.delta_layers { - layer_paths.push(delta_layer.path()); - layers.insert_historic(Arc::new(delta_layer)); + // 2. Create new image layers for partitions that have been modified + // "enough". + for part in parts.iter() { + if self.time_for_new_image_layer(part, lsn, 3)? { + self.create_image_layer(part, lsn)?; + } + } + timer.stop_and_record(); + + // 3. Compact + let timer = self.compact_time_histo.start_timer(); + self.compact_level0(target_file_size)?; + timer.stop_and_record(); + } else { + info!("Could not compact because no partitioning specified yet"); } - for image_layer in new_historics.image_layers { - layer_paths.push(image_layer.path()); - layers.insert_historic(Arc::new(image_layer)); + + // Call unload() on all frozen layers, to release memory. + // This shouldn't be much memory, as only metadata is slurped + // into memory. + let layers = self.layers.lock().unwrap(); + for layer in layers.iter_historic_layers() { + layer.unload()?; } - Ok(layer_paths) + drop(layers); + + Ok(()) } + // Is it time to create a new image layer for the given partition? + fn time_for_new_image_layer( + &self, + partition: &KeySpace, + lsn: Lsn, + threshold: usize, + ) -> Result { + let layers = self.layers.lock().unwrap(); + + for part_range in &partition.ranges { + let image_coverage = layers.image_coverage(part_range, lsn)?; + for (img_range, last_img) in image_coverage { + let img_lsn = if let Some(ref last_img) = last_img { + last_img.get_lsn_range().end + } else { + Lsn(0) + }; + + let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; + + info!( + "range {}-{}, has {} deltas on this timeline", + img_range.start, img_range.end, num_deltas + ); + if num_deltas >= threshold { + return Ok(true); + } + } + } + + Ok(false) + } + + fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result<()> { + let img_range = + partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; + let mut image_layer_writer = + ImageLayerWriter::new(self.conf, self.timelineid, self.tenantid, &img_range, lsn)?; + + for range in &partition.ranges { + let mut key = range.start; + while key < range.end { + let img = self.get(key, lsn)?; + image_layer_writer.put_image(key, &img)?; + key = key.next(); + } + } + let image_layer = image_layer_writer.finish()?; + + // Sync the new layer to disk before adding it to the layer map, to make sure + // we don't garbage collect something based on the new layer, before it has + // reached the disk. + // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // Compaction creates multiple image layers. It would be better to create them all + // and fsync them all in parallel. + par_fsync::par_fsync(&[ + image_layer.path(), + self.conf.timeline_path(&self.timelineid, &self.tenantid), + ])?; + + // FIXME: Do we need to do something to upload it to remote storage here? + + let mut layers = self.layers.lock().unwrap(); + layers.insert_historic(Arc::new(image_layer)); + drop(layers); + + Ok(()) + } + + fn compact_level0(&self, target_file_size: u64) -> Result<()> { + let layers = self.layers.lock().unwrap(); + + // We compact or "shuffle" the level-0 delta layers when 10 have + // accumulated. 
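
The inner loop of create_image_layer simply materializes every key of the partition at the chosen LSN and feeds it to the image writer. Roughly, with u32 keys and closures standing in for get() and the image layer writer:

    fn write_partition_image(
        ranges: &[std::ops::Range<u32>],
        lsn: u64,
        get: impl Fn(u32, u64) -> Vec<u8>,
        mut put_image: impl FnMut(u32, Vec<u8>),
    ) {
        for range in ranges {
            for key in range.clone() {
                put_image(key, get(key, lsn));
            }
        }
    }
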
+ static COMPACT_THRESHOLD: usize = 10; + + let level0_deltas = layers.get_level0_deltas()?; + + if level0_deltas.len() < COMPACT_THRESHOLD { + return Ok(()); + } + drop(layers); + + // FIXME: this function probably won't work correctly if there's overlap + // in the deltas. + let lsn_range = level0_deltas + .iter() + .map(|l| l.get_lsn_range()) + .reduce(|a, b| min(a.start, b.start)..max(a.end, b.end)) + .unwrap(); + + let all_values_iter = level0_deltas.iter().map(|l| l.iter()).kmerge_by(|a, b| { + if let Ok((a_key, a_lsn, _)) = a { + if let Ok((b_key, b_lsn, _)) = b { + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + } else { + false + } + } else { + true + } + }); + + // Merge the contents of all the input delta layers into a new set + // of delta layers, based on the current partitioning. + // + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. + let mut new_layers = Vec::new(); + let mut prev_key: Option = None; + let mut writer: Option = None; + for x in all_values_iter { + let (key, lsn, value) = x?; + + if let Some(prev_key) = prev_key { + if key != prev_key && writer.is_some() { + let size = writer.as_mut().unwrap().size(); + if size > target_file_size { + new_layers.push(writer.take().unwrap().finish(prev_key.next())?); + writer = None; + } + } + } + + if writer.is_none() { + writer = Some(DeltaLayerWriter::new( + self.conf, + self.timelineid, + self.tenantid, + key, + lsn_range.clone(), + )?); + } + + writer.as_mut().unwrap().put_value(key, lsn, value)?; + prev_key = Some(key); + } + if let Some(writer) = writer { + new_layers.push(writer.finish(prev_key.unwrap().next())?); + } + + // Sync layers + if !new_layers.is_empty() { + let mut layer_paths: Vec = new_layers.iter().map(|l| l.path()).collect(); + + // also sync the directory + layer_paths.push(self.conf.timeline_path(&self.timelineid, &self.tenantid)); + + // Fsync all the layer files and directory using multiple threads to + // minimize latency. + par_fsync::par_fsync(&layer_paths)?; + + layer_paths.pop().unwrap(); + } + + let mut layers = self.layers.lock().unwrap(); + for l in new_layers { + layers.insert_historic(Arc::new(l)); + } + + // Now that we have reshuffled the data to set of new delta layers, we can + // delete the old ones + for l in level0_deltas { + l.delete()?; + layers.remove_historic(l.clone()); + } + drop(layers); + + Ok(()) + } + + /// Update information about which layer files need to be retained on + /// garbage collection. This is separate from actually performing the GC, + /// and is updated more frequently, so that compaction can remove obsolete + /// page versions more aggressively. /// - /// Garbage collect layer files on a timeline that are no longer needed. + /// TODO: that's wishful thinking, compaction doesn't actually do that + /// currently. /// /// The caller specifies how much history is needed with the two arguments: /// @@ -1682,15 +1813,29 @@ impl LayeredTimeline { /// the latest LSN subtracted by a constant, and doesn't do anything smart /// to figure out what read-only nodes might actually need.) /// + fn update_gc_info(&self, retain_lsns: Vec, cutoff: Lsn) { + let mut gc_info = self.gc_info.write().unwrap(); + gc_info.retain_lsns = retain_lsns; + gc_info.cutoff = cutoff; + } + + /// + /// Garbage collect layer files on a timeline that are no longer needed. 
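
The k-merge in compact_level0 consumes the level-0 deltas in key-major, LSN-minor order, so the full history of one key is written out before the writer moves on to the next key; that is also what lets it roll over to a new delta file at a key boundary. A toy version that sorts instead of streaming makes the ordering explicit (invented names, plain integers for keys and LSNs):

    fn merge_deltas(inputs: Vec<Vec<(u32, u64)>>) -> Vec<(u32, u64)> {
        let mut all: Vec<(u32, u64)> = inputs.into_iter().flatten().collect();
        all.sort_by(|a, b| a.0.cmp(&b.0).then(a.1.cmp(&b.1))); // key first, then LSN
        all
    }

    fn main() {
        let merged = merge_deltas(vec![
            vec![(1, 0x10), (2, 0x10)],
            vec![(1, 0x20), (3, 0x10)],
        ]);
        assert_eq!(merged, vec![(1, 0x10), (1, 0x20), (2, 0x10), (3, 0x10)]);
    }
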
+ /// /// Currently, we don't make any attempt at removing unneeded page versions /// within a layer file. We can only remove the whole file if it's fully /// obsolete. /// - pub fn gc_timeline(&self, retain_lsns: Vec, cutoff: Lsn) -> Result { + fn gc(&self) -> Result { let now = Instant::now(); let mut result: GcResult = Default::default(); let disk_consistent_lsn = self.get_disk_consistent_lsn(); - let _checkpoint_cs = self.checkpoint_cs.lock().unwrap(); + + let _compaction_cs = self.compaction_cs.lock().unwrap(); + + let gc_info = self.gc_info.read().unwrap(); + let retain_lsns = &gc_info.retain_lsns; + let cutoff = gc_info.cutoff; let _enter = info_span!("garbage collection", timeline = %self.timelineid, tenant = %self.tenantid, cutoff = %cutoff).entered(); @@ -1709,8 +1854,7 @@ impl LayeredTimeline { // Garbage collect the layer if all conditions are satisfied: // 1. it is older than cutoff LSN; // 2. it doesn't need to be retained for 'retain_lsns'; - // 3. newer on-disk layer exists (only for non-dropped segments); - // 4. this layer doesn't serve as a tombstone for some older layer; + // 3. newer on-disk image layers cover the layer's whole key range // let mut layers = self.layers.lock().unwrap(); 'outer: for l in layers.iter_historic_layers() { @@ -1724,28 +1868,16 @@ impl LayeredTimeline { continue; } - let seg = l.get_seg_tag(); - - if seg.rel.is_relation() { - result.ondisk_relfiles_total += 1; - } else { - result.ondisk_nonrelfiles_total += 1; - } + result.layers_total += 1; // 1. Is it newer than cutoff point? - if l.get_end_lsn() > cutoff { + if l.get_lsn_range().end > cutoff { debug!( - "keeping {} {}-{} because it's newer than cutoff {}", - seg, - l.get_start_lsn(), - l.get_end_lsn(), + "keeping {} because it's newer than cutoff {}", + l.filename().display(), cutoff ); - if seg.rel.is_relation() { - result.ondisk_relfiles_needed_by_cutoff += 1; - } else { - result.ondisk_nonrelfiles_needed_by_cutoff += 1; - } + result.layers_needed_by_cutoff += 1; continue 'outer; } @@ -1754,135 +1886,49 @@ impl LayeredTimeline { // might be referenced by child branches forever. // We can track this in child timeline GC and delete parent layers when // they are no longer needed. This might be complicated with long inheritance chains. - for retain_lsn in &retain_lsns { + for retain_lsn in retain_lsns { // start_lsn is inclusive - if &l.get_start_lsn() <= retain_lsn { + if &l.get_lsn_range().start <= retain_lsn { debug!( - "keeping {} {}-{} because it's still might be referenced by child branch forked at {} is_dropped: {} is_incremental: {}", - seg, - l.get_start_lsn(), - l.get_end_lsn(), + "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", + l.filename().display(), retain_lsn, - l.is_dropped(), l.is_incremental(), ); - if seg.rel.is_relation() { - result.ondisk_relfiles_needed_by_branches += 1; - } else { - result.ondisk_nonrelfiles_needed_by_branches += 1; - } + result.layers_needed_by_branches += 1; continue 'outer; } } // 3. Is there a later on-disk layer for this relation? - if !l.is_dropped() - && !layers.newer_image_layer_exists( - l.get_seg_tag(), - l.get_end_lsn(), - disk_consistent_lsn, - ) - { + // + // The end-LSN is exclusive, while disk_consistent_lsn is + // inclusive. For example, if disk_consistent_lsn is 100, it is + // OK for a delta layer to have end LSN 101, but if the end LSN + // is 102, then it might not have been fully flushed to disk + // before crash. + // + // FIXME: This logic is wrong. 
See https://github.com/zenithdb/zenith/issues/707 + if !layers.newer_image_layer_exists( + &l.get_key_range(), + l.get_lsn_range().end, + disk_consistent_lsn + 1, + )? { debug!( - "keeping {} {}-{} because it is the latest layer", - seg, - l.get_start_lsn(), - l.get_end_lsn() + "keeping {} because it is the latest layer", + l.filename().display() ); - if seg.rel.is_relation() { - result.ondisk_relfiles_not_updated += 1; - } else { - result.ondisk_nonrelfiles_not_updated += 1; - } + result.layers_not_updated += 1; continue 'outer; } - // 4. Does this layer serve as a tombstone for some older layer? - if l.is_dropped() { - let prior_lsn = l.get_start_lsn().checked_sub(1u64).unwrap(); - - // Check if this layer serves as a tombstone for this timeline - // We have to do this separately from timeline check below, - // because LayerMap of this timeline is already locked. - let mut is_tombstone = layers.layer_exists_at_lsn(l.get_seg_tag(), prior_lsn)?; - if is_tombstone { - debug!( - "earlier layer exists at {} in {}", - prior_lsn, self.timelineid - ); - } - // Now check ancestor timelines, if any are present locally - else if let Some(ancestor) = self - .ancestor_timeline - .as_ref() - .and_then(|timeline_entry| timeline_entry.ensure_loaded().ok()) - { - let prior_lsn = ancestor.get_last_record_lsn(); - if seg.rel.is_blocky() { - debug!( - "check blocky relish size {} at {} in {} for layer {}-{}", - seg, - prior_lsn, - ancestor.timelineid, - l.get_start_lsn(), - l.get_end_lsn() - ); - match ancestor.get_relish_size(seg.rel, prior_lsn).unwrap() { - Some(size) => { - let (last_live_seg, _rel_blknum) = - SegmentTag::from_blknum(seg.rel, size - 1); - debug!( - "blocky rel size is {} last_live_seg.segno {} seg.segno {}", - size, last_live_seg.segno, seg.segno - ); - if last_live_seg.segno >= seg.segno { - is_tombstone = true; - } - } - _ => { - debug!("blocky rel doesn't exist"); - } - } - } else { - debug!( - "check non-blocky relish existence {} at {} in {} for layer {}-{}", - seg, - prior_lsn, - ancestor.timelineid, - l.get_start_lsn(), - l.get_end_lsn() - ); - is_tombstone = ancestor.get_rel_exists(seg.rel, prior_lsn).unwrap_or(false); - } - } - - if is_tombstone { - debug!( - "keeping {} {}-{} because this layer serves as a tombstone for older layer", - seg, - l.get_start_lsn(), - l.get_end_lsn() - ); - - if seg.rel.is_relation() { - result.ondisk_relfiles_needed_as_tombstone += 1; - } else { - result.ondisk_nonrelfiles_needed_as_tombstone += 1; - } - continue 'outer; - } - } - // We didn't find any reason to keep this file, so remove it. debug!( - "garbage collecting {} {}-{} is_dropped: {} is_incremental: {}", - l.get_seg_tag(), - l.get_start_lsn(), - l.get_end_lsn(), - l.is_dropped(), + "garbage collecting {} is_dropped: xx is_incremental: {}", + l.filename().display(), l.is_incremental(), ); - layers_to_remove.push(Arc::clone(&l)); + layers_to_remove.push(Arc::clone(l)); } // Actually delete the layers from disk and remove them from the map. 
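
Stripped of the logging and counters, the per-layer keep/remove decision above comes down to three checks. Here is a simplified model, with plain u64 LSNs and a boolean standing in for the newer_image_layer_exists query; the names are invented for illustration.

    struct LayerInfo {
        start_lsn: u64,
        end_lsn: u64, // exclusive
        covered_by_newer_image: bool,
    }

    fn can_gc(layer: &LayerInfo, cutoff: u64, retain_lsns: &[u64]) -> bool {
        // 1. keep anything newer than the GC cutoff
        if layer.end_lsn > cutoff {
            return false;
        }
        // 2. keep layers that a child branch forked at 'retain_lsn' may still need
        if retain_lsns.iter().any(|retain| layer.start_lsn <= *retain) {
            return false;
        }
        // 3. only remove the layer if newer image layers cover its whole key range
        layer.covered_by_newer_image
    }

    fn main() {
        let old_covered = LayerInfo { start_lsn: 0x10, end_lsn: 0x20, covered_by_newer_image: true };
        assert!(can_gc(&old_covered, 0x100, &[]));      // old, covered, no branches: removable
        assert!(!can_gc(&old_covered, 0x100, &[0x18])); // a branch forked at 0x18 still needs it
    }
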
@@ -1892,222 +1938,75 @@ impl LayeredTimeline { doomed_layer.delete()?; layers.remove_historic(doomed_layer.clone()); - match ( - doomed_layer.is_dropped(), - doomed_layer.get_seg_tag().rel.is_relation(), - ) { - (true, true) => result.ondisk_relfiles_dropped += 1, - (true, false) => result.ondisk_nonrelfiles_dropped += 1, - (false, true) => result.ondisk_relfiles_removed += 1, - (false, false) => result.ondisk_nonrelfiles_removed += 1, - } + result.layers_removed += 1; } result.elapsed = now.elapsed(); Ok(result) } - fn lookup_cached_page( + /// + /// Reconstruct a value, using the given base image and WAL records in 'data'. + /// + fn reconstruct_value( &self, - rel: &RelishTag, - rel_blknum: BlockNumber, - lsn: Lsn, - ) -> Option<(Lsn, Bytes)> { - let cache = page_cache::get(); - if let RelishTag::Relation(rel_tag) = &rel { - let (lsn, read_guard) = cache.lookup_materialized_page( - self.tenantid, - self.timelineid, - *rel_tag, - rel_blknum, - lsn, - )?; - let img = Bytes::from(read_guard.to_vec()); - Some((lsn, img)) - } else { - None - } - } - - /// - /// Reconstruct a page version from given Layer - /// - fn materialize_page( - &self, - seg: SegmentTag, - seg_blknum: SegmentBlk, - lsn: Lsn, - layer: &dyn Layer, - ) -> anyhow::Result { - // Check the page cache. We will get back the most recent page with lsn <= `lsn`. - // The cached image can be returned directly if there is no WAL between the cached image - // and requested LSN. The cached image can also be used to reduce the amount of WAL needed - // for redo. - let rel = seg.rel; - let rel_blknum = seg.segno * RELISH_SEG_SIZE + seg_blknum; - let cached_page_img = match self.lookup_cached_page(&rel, rel_blknum, lsn) { - Some((cached_lsn, cached_img)) => { - match cached_lsn.cmp(&lsn) { - cmp::Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - cmp::Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image - cmp::Ordering::Greater => { - bail!("the returned lsn should never be after the requested lsn") - } - } - Some((cached_lsn, cached_img)) - } - None => None, - }; - - let mut data = PageReconstructData { - records: Vec::new(), - page_img: cached_page_img, - }; - - // Holds an Arc reference to 'layer_ref' when iterating in the loop below. - let mut layer_arc: Arc; - - // Call the layer's get_page_reconstruct_data function to get the base image - // and WAL records needed to materialize the page. If it returns 'Continue', - // call it again on the predecessor layer until we have all the required data. - let mut layer_ref = layer; - let mut curr_lsn = lsn; - loop { - let result = self.reconstruct_time_histo.observe_closure_duration(|| { - layer_ref - .get_page_reconstruct_data(seg_blknum, curr_lsn, &mut data) - .with_context(|| { - format!( - "Failed to get reconstruct data {} {:?} {} {}", - layer_ref.get_seg_tag(), - layer_ref.filename(), - seg_blknum, - curr_lsn, - ) - }) - })?; - match result { - PageReconstructResult::Complete => break, - PageReconstructResult::Continue(cont_lsn) => { - // Fetch base image / more WAL from the returned predecessor layer - if let Some((cont_layer, cont_lsn)) = self.get_layer_for_read(seg, cont_lsn)? { - if cont_lsn == curr_lsn { - // We landed on the same layer again. Shouldn't happen, but if it does, - // don't get stuck in an infinite loop. 
- bail!( - "could not find predecessor of layer {} at {}, layer returned its own LSN", - layer_ref.filename().display(), - cont_lsn - ); - } - layer_arc = cont_layer; - layer_ref = &*layer_arc; - curr_lsn = cont_lsn; - continue; - } else { - bail!( - "could not find predecessor of layer {} at {}", - layer_ref.filename().display(), - cont_lsn - ); - } - } - PageReconstructResult::Missing(lsn) => { - // Oops, we could not reconstruct the page. - if data.records.is_empty() { - // no records, and no base image. This can happen if PostgreSQL extends a relation - // but never writes the page. - // - // Would be nice to detect that situation better. - warn!("Page {} blk {} at {} not found", rel, rel_blknum, lsn); - return Ok(ZERO_PAGE.clone()); - } - bail!( - "No base image found for page {} blk {} at {}/{}", - rel, - rel_blknum, - self.timelineid, - lsn, - ); - } - } - } - - self.reconstruct_page(rel, rel_blknum, lsn, data) - } - - /// - /// Reconstruct a page version, using the given base image and WAL records in 'data'. - /// - fn reconstruct_page( - &self, - rel: RelishTag, - rel_blknum: BlockNumber, + key: Key, request_lsn: Lsn, - mut data: PageReconstructData, + mut data: ValueReconstructState, ) -> Result { // Perform WAL redo if needed data.records.reverse(); // If we have a page image, and no WAL, we're all set if data.records.is_empty() { - if let Some((img_lsn, img)) = &data.page_img { + if let Some((img_lsn, img)) = &data.img { trace!( - "found page image for blk {} in {} at {}, no WAL redo required", - rel_blknum, - rel, + "found page image for key {} at {}, no WAL redo required", + key, img_lsn ); Ok(img.clone()) } else { - // FIXME: this ought to be an error? - warn!( - "Page {} blk {} at {} not found", - rel, rel_blknum, request_lsn - ); - Ok(ZERO_PAGE.clone()) + bail!("base image for {} at {} not found", key, request_lsn); } } else { // We need to do WAL redo. // // If we don't have a base image, then the oldest WAL record better initialize // the page - if data.page_img.is_none() && !data.records.first().unwrap().1.will_init() { - // FIXME: this ought to be an error? 
- warn!( - "Base image for page {}/{} at {} not found, but got {} WAL records", - rel, - rel_blknum, + if data.img.is_none() && !data.records.first().unwrap().1.will_init() { + bail!( + "Base image for {} at {} not found, but got {} WAL records", + key, request_lsn, data.records.len() ); - Ok(ZERO_PAGE.clone()) } else { - let base_img = if let Some((_lsn, img)) = data.page_img { - trace!("found {} WAL records and a base image for blk {} in {} at {}, performing WAL redo", data.records.len(), rel_blknum, rel, request_lsn); + let base_img = if let Some((_lsn, img)) = data.img { + trace!( + "found {} WAL records and a base image for {} at {}, performing WAL redo", + data.records.len(), + key, + request_lsn + ); Some(img) } else { - trace!("found {} WAL records that will init the page for blk {} in {} at {}, performing WAL redo", data.records.len(), rel_blknum, rel, request_lsn); + trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); None }; let last_rec_lsn = data.records.last().unwrap().0; - let img = self.walredo_mgr.request_redo( - rel, - rel_blknum, - request_lsn, - base_img, - data.records, - )?; + let img = + self.walredo_mgr + .request_redo(key, request_lsn, base_img, data.records)?; - if let RelishTag::Relation(rel_tag) = &rel { + if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); cache.memorize_materialized_page( self.tenantid, self.timelineid, - *rel_tag, - rel_blknum, + key, last_rec_lsn, &img, ); @@ -2117,40 +2016,6 @@ impl LayeredTimeline { } } } - - /// - /// This is a helper function to increase current_total_relation_size - /// - fn increase_current_logical_size(&self, diff: u32) { - let val = self - .current_logical_size - .fetch_add(diff as usize, atomic::Ordering::SeqCst); - trace!( - "increase_current_logical_size: {} + {} = {}", - val, - diff, - val + diff as usize, - ); - self.current_logical_size_gauge - .set(val as i64 + diff as i64); - } - - /// - /// This is a helper function to decrease current_total_relation_size - /// - fn decrease_current_logical_size(&self, diff: u32) { - let val = self - .current_logical_size - .fetch_sub(diff as usize, atomic::Ordering::SeqCst); - trace!( - "decrease_current_logical_size: {} - {} = {}", - val, - diff, - val - diff as usize, - ); - self.current_logical_size_gauge - .set(val as i64 - diff as i64); - } } struct LayeredTimelineWriter<'a> { @@ -2166,159 +2031,20 @@ impl Deref for LayeredTimelineWriter<'_> { } } -impl<'a> TimelineWriter for LayeredTimelineWriter<'a> { - fn put_wal_record( - &self, - lsn: Lsn, - rel: RelishTag, - rel_blknum: u32, - rec: ZenithWalRecord, - ) -> Result<()> { - if !rel.is_blocky() && rel_blknum != 0 { - bail!( - "invalid request for block {} for non-blocky relish {}", - rel_blknum, - rel - ); - } - ensure!(lsn.is_aligned(), "unaligned record LSN"); - - let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); - let layer = self.tl.get_layer_for_write(seg, lsn)?; - let delta_size = layer.put_wal_record(lsn, seg_blknum, rec)?; - self.tl - .increase_current_logical_size(delta_size * BLCKSZ as u32); - Ok(()) +impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { + fn put(&self, key: Key, lsn: Lsn, value: Value) -> Result<()> { + self.tl.put_value(key, lsn, value) } - fn put_page_image( - &self, - rel: RelishTag, - rel_blknum: BlockNumber, - lsn: Lsn, - img: Bytes, - ) -> Result<()> { - if !rel.is_blocky() && rel_blknum != 0 { - bail!( - "invalid request for block {} for non-blocky relish {}", - 
rel_blknum, - rel - ); - } - ensure!(lsn.is_aligned(), "unaligned record LSN"); - - let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); - - let layer = self.tl.get_layer_for_write(seg, lsn)?; - let delta_size = layer.put_page_image(seg_blknum, lsn, img)?; - - self.tl - .increase_current_logical_size(delta_size * BLCKSZ as u32); - Ok(()) - } - - fn put_truncation(&self, rel: RelishTag, lsn: Lsn, relsize: BlockNumber) -> Result<()> { - if !rel.is_blocky() { - bail!("invalid truncation for non-blocky relish {}", rel); - } - ensure!(lsn.is_aligned(), "unaligned record LSN"); - - debug!("put_truncation: {} to {} blocks at {}", rel, relsize, lsn); - - let oldsize = self - .tl - .get_relish_size(rel, self.tl.get_last_record_lsn())? - .with_context(|| { - format!( - "attempted to truncate non-existent relish {} at {}", - rel, lsn - ) - })?; - - if oldsize <= relsize { - return Ok(()); - } - let old_last_seg = (oldsize - 1) / RELISH_SEG_SIZE; - - let last_remain_seg = if relsize == 0 { - 0 - } else { - (relsize - 1) / RELISH_SEG_SIZE - }; - - // Drop segments beyond the last remaining segment. - for remove_segno in (last_remain_seg + 1)..=old_last_seg { - let seg = SegmentTag { - rel, - segno: remove_segno, - }; - - let layer = self.tl.get_layer_for_write(seg, lsn)?; - layer.drop_segment(lsn); - } - - // Truncate the last remaining segment to the specified size - if relsize == 0 || relsize % RELISH_SEG_SIZE != 0 { - let seg = SegmentTag { - rel, - segno: last_remain_seg, - }; - let layer = self.tl.get_layer_for_write(seg, lsn)?; - layer.put_truncation(lsn, relsize % RELISH_SEG_SIZE) - } - self.tl - .decrease_current_logical_size((oldsize - relsize) * BLCKSZ as u32); - Ok(()) - } - - fn drop_relish(&self, rel: RelishTag, lsn: Lsn) -> Result<()> { - trace!("drop_segment: {} at {}", rel, lsn); - - if rel.is_blocky() { - if let Some(oldsize) = self - .tl - .get_relish_size(rel, self.tl.get_last_record_lsn())? - { - let old_last_seg = if oldsize == 0 { - 0 - } else { - (oldsize - 1) / RELISH_SEG_SIZE - }; - - // Drop all segments of the relish - for remove_segno in 0..=old_last_seg { - let seg = SegmentTag { - rel, - segno: remove_segno, - }; - let layer = self.tl.get_layer_for_write(seg, lsn)?; - layer.drop_segment(lsn); - } - self.tl - .decrease_current_logical_size(oldsize * BLCKSZ as u32); - } else { - warn!( - "drop_segment called on non-existent relish {} at {}", - rel, lsn - ); - } - } else { - // TODO handle TwoPhase relishes - let (seg, _seg_blknum) = SegmentTag::from_blknum(rel, 0); - let layer = self.tl.get_layer_for_write(seg, lsn)?; - layer.drop_segment(lsn); - } - - Ok(()) + fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()> { + self.tl.put_tombstone(key_range, lsn) } /// /// Remember the (end of) last valid WAL record remembered in the timeline. 
/// - fn advance_last_record_lsn(&self, new_lsn: Lsn) { - assert!(new_lsn.is_aligned()); - - self.tl.last_record_lsn.advance(new_lsn); + fn finish_write(&self, new_lsn: Lsn) { + self.tl.finish_write(new_lsn); } } @@ -2328,10 +2054,10 @@ pub fn dump_layerfile_from_path(path: &Path) -> Result<()> { let book = Book::new(file)?; match book.magic() { - delta_layer::DELTA_FILE_MAGIC => { + crate::DELTA_FILE_MAGIC => { DeltaLayer::new_for_path(path, &book)?.dump()?; } - image_layer::IMAGE_FILE_MAGIC => { + crate::IMAGE_FILE_MAGIC => { ImageLayer::new_for_path(path, &book)?.dump()?; } magic => bail!("unrecognized magic identifier: {:?}", magic), @@ -2368,9 +2094,11 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { /// file format and directory layout. The test here are more low level. /// #[cfg(test)] -mod tests { +pub mod tests { use super::*; + use crate::keyspace::KeySpaceAccum; use crate::repository::repo_harness::*; + use rand::{thread_rng, Rng}; #[test] fn corrupt_metadata() -> Result<()> { @@ -2387,7 +2115,7 @@ mod tests { let mut metadata_bytes = std::fs::read(&metadata_path)?; assert_eq!(metadata_bytes.len(), 512); - metadata_bytes[512 - 4 - 2] ^= 1; + metadata_bytes[8] ^= 1; std::fs::write(metadata_path, metadata_bytes)?; let err = harness.try_load().err().expect("should fail"); @@ -2400,113 +2128,259 @@ mod tests { Ok(()) } - /// - /// Test the logic in 'load_layer_map' that removes layer files that are - /// newer than 'disk_consistent_lsn'. - /// + // Target file size in the unit tests. In production, the target + // file size is much larger, maybe 1 GB. But a small size makes it + // much faster to exercise all the logic for creating the files, + // garbage collection, compaction etc. + pub const TEST_FILE_SIZE: u64 = 4 * 1024 * 1024; + #[test] - fn future_layerfiles() -> Result<()> { - const TEST_NAME: &str = "future_layerfiles"; - let harness = RepoHarness::create(TEST_NAME)?; - let repo = harness.load(); + fn test_images() -> Result<()> { + let repo = RepoHarness::create("test_images")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + #[allow(non_snake_case)] + let TEST_KEY: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); - // Create a timeline with disk_consistent_lsn = 8000 - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; let writer = tline.writer(); - writer.advance_last_record_lsn(Lsn(0x8000)); + writer.put(TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?; + writer.finish_write(Lsn(0x10)); drop(writer); - repo.checkpoint_iteration(CheckpointConfig::Forced)?; - drop(repo); - let timeline_path = harness.timeline_path(&TIMELINE_ID); + tline.checkpoint(CheckpointConfig::Forced)?; + tline.compact()?; - let make_empty_file = |filename: &str| -> std::io::Result<()> { - let path = timeline_path.join(filename); + let writer = tline.writer(); + writer.put(TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?; + writer.finish_write(Lsn(0x20)); + drop(writer); - assert!(!path.exists()); - std::fs::write(&path, &[])?; + tline.checkpoint(CheckpointConfig::Forced)?; + tline.compact()?; - Ok(()) - }; + let writer = tline.writer(); + writer.put(TEST_KEY, Lsn(0x30), Value::Image(TEST_IMG("foo at 0x30")))?; + writer.finish_write(Lsn(0x30)); + drop(writer); - // Helper function to check that a relation file exists, and a corresponding - // .0.old file does not. 
- let assert_exists = |filename: &str| { - let path = timeline_path.join(filename); - assert!(path.exists(), "file {} was removed", filename); + tline.checkpoint(CheckpointConfig::Forced)?; + tline.compact()?; - // Check that there is no .old file - let backup_path = timeline_path.join(format!("{}.0.old", filename)); - assert!( - !backup_path.exists(), - "unexpected backup file {}", - backup_path.display() - ); - }; + let writer = tline.writer(); + writer.put(TEST_KEY, Lsn(0x40), Value::Image(TEST_IMG("foo at 0x40")))?; + writer.finish_write(Lsn(0x40)); + drop(writer); - // Helper function to check that a relation file does *not* exists, and a corresponding - // ..old file does. - let assert_is_renamed = |filename: &str, num: u32| { - let path = timeline_path.join(filename); - assert!( - !path.exists(), - "file {} was not removed as expected", - filename - ); + tline.checkpoint(CheckpointConfig::Forced)?; + tline.compact()?; - let backup_path = timeline_path.join(format!("{}.{}.old", filename, num)); - assert!( - backup_path.exists(), - "backup file {} was not created", - backup_path.display() - ); - }; + assert_eq!(tline.get(TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); + assert_eq!(tline.get(TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30")); + assert_eq!(tline.get(TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40")); - // These files are considered to be in the future and will be renamed out - // of the way - let future_filenames = vec![ - format!("pg_control_0_{:016X}", 0x8001), - format!("pg_control_0_{:016X}_{:016X}", 0x8001, 0x8008), - ]; - // But these are not: - let past_filenames = vec![ - format!("pg_control_0_{:016X}", 0x8000), - format!("pg_control_0_{:016X}_{:016X}", 0x7000, 0x8001), - ]; + Ok(()) + } - for filename in future_filenames.iter().chain(past_filenames.iter()) { - make_empty_file(filename)?; + // + // Insert 1000 key-value pairs with increasing keys, checkpoint, + // repeat 50 times. + // + #[test] + fn test_bulk_insert() -> Result<()> { + let repo = RepoHarness::create("test_bulk_insert")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + let mut lsn = Lsn(0x10); + + let mut keyspace = KeySpaceAccum::new(); + + let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); + let mut blknum = 0; + for _ in 0..50 { + for _ in 0..1000 { + test_key.field6 = blknum; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + writer.finish_write(lsn); + drop(writer); + + keyspace.add_key(test_key); + + lsn = Lsn(lsn.0 + 0x10); + blknum += 1; + } + + let cutoff = tline.get_last_record_lsn(); + let parts = keyspace + .clone() + .to_keyspace() + .partition(TEST_FILE_SIZE as u64); + tline.hint_partitioning(parts.clone(), lsn)?; + + tline.update_gc_info(Vec::new(), cutoff); + tline.checkpoint(CheckpointConfig::Forced)?; + tline.compact()?; + tline.gc()?; } - // Load the timeline. This will cause the files in the "future" to be renamed - // away. 
- let new_repo = harness.load(); - new_repo.get_timeline_load(TIMELINE_ID).unwrap(); - drop(new_repo); + Ok(()) + } - for filename in future_filenames.iter() { - assert_is_renamed(filename, 0); - } - for filename in past_filenames.iter() { - assert_exists(filename); + #[test] + fn test_random_updates() -> Result<()> { + let repo = RepoHarness::create("test_random_updates")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + const NUM_KEYS: usize = 1000; + + let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); + + let mut keyspace = KeySpaceAccum::new(); + + // Track when each page was last modified. Used to assert that + // a read sees the latest page version. + let mut updated = [Lsn(0); NUM_KEYS]; + + let mut lsn = Lsn(0); + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = blknum as u32; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + writer.finish_write(lsn); + updated[blknum] = lsn; + drop(writer); + + keyspace.add_key(test_key); } - // Create the future files again, and load again. They should be renamed to - // *.1.old this time. - for filename in future_filenames.iter() { - make_empty_file(filename)?; + let parts = keyspace.to_keyspace().partition(TEST_FILE_SIZE as u64); + tline.hint_partitioning(parts, lsn)?; + + for _ in 0..50 { + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = blknum as u32; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + println!("updating {} at {}", blknum, lsn); + writer.finish_write(lsn); + drop(writer); + updated[blknum] = lsn; + } + + // Read all the blocks + for (blknum, last_lsn) in updated.iter().enumerate() { + test_key.field6 = blknum as u32; + assert_eq!( + tline.get(test_key, lsn)?, + TEST_IMG(&format!("{} at {}", blknum, last_lsn)) + ); + } + + // Perform a cycle of checkpoint, compaction, and GC + println!("checkpointing {}", lsn); + let cutoff = tline.get_last_record_lsn(); + tline.update_gc_info(Vec::new(), cutoff); + tline.checkpoint(CheckpointConfig::Forced)?; + tline.compact()?; + tline.gc()?; } - let new_repo = harness.load(); - new_repo.get_timeline_load(TIMELINE_ID).unwrap(); - drop(new_repo); + Ok(()) + } - for filename in future_filenames.iter() { - assert_is_renamed(filename, 0); - assert_is_renamed(filename, 1); + #[test] + fn test_traverse_branches() -> Result<()> { + let repo = RepoHarness::create("test_traverse_branches")?.load(); + let mut tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + const NUM_KEYS: usize = 1000; + + let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); + + let mut keyspace = KeySpaceAccum::new(); + + // Track when each page was last modified. Used to assert that + // a read sees the latest page version. 
+ let mut updated = [Lsn(0); NUM_KEYS]; + + let mut lsn = Lsn(0); + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = blknum as u32; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + writer.finish_write(lsn); + updated[blknum] = lsn; + drop(writer); + + keyspace.add_key(test_key); } - for filename in past_filenames.iter() { - assert_exists(filename); + + let parts = keyspace.to_keyspace().partition(TEST_FILE_SIZE as u64); + tline.hint_partitioning(parts, lsn)?; + + let mut tline_id = TIMELINE_ID; + for _ in 0..50 { + let new_tline_id = ZTimelineId::generate(); + repo.branch_timeline(tline_id, new_tline_id, lsn)?; + tline = repo.get_timeline_load(new_tline_id)?; + tline_id = new_tline_id; + + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = blknum as u32; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + println!("updating {} at {}", blknum, lsn); + writer.finish_write(lsn); + drop(writer); + updated[blknum] = lsn; + } + + // Read all the blocks + for (blknum, last_lsn) in updated.iter().enumerate() { + test_key.field6 = blknum as u32; + assert_eq!( + tline.get(test_key, lsn)?, + TEST_IMG(&format!("{} at {}", blknum, last_lsn)) + ); + } + + // Perform a cycle of checkpoint, compaction, and GC + println!("checkpointing {}", lsn); + let cutoff = tline.get_last_record_lsn(); + tline.update_gc_info(Vec::new(), cutoff); + tline.checkpoint(CheckpointConfig::Forced)?; + tline.compact()?; + tline.gc()?; } Ok(()) diff --git a/pageserver/src/layered_repository/README.md b/pageserver/src/layered_repository/README.md index 20f89ddc70..519478e417 100644 --- a/pageserver/src/layered_repository/README.md +++ b/pageserver/src/layered_repository/README.md @@ -1,40 +1,42 @@ # Overview -The on-disk format is based on immutable files. The page server receives a -stream of incoming WAL, parses the WAL records to determine which pages they -apply to, and accumulates the incoming changes in memory. Every now and then, -the accumulated changes are written out to new immutable files. This process is -called checkpointing. Old versions of on-disk files that are not needed by any -timeline are removed by GC process. - The main responsibility of the Page Server is to process the incoming WAL, and reprocess it into a format that allows reasonably quick access to any page -version. +version. The page server slices the incoming WAL per relation and page, and +packages the sliced WAL into suitably-sized "layer files". The layer files +contain all the history of the database, back to some reasonable retention +period. This system replaces the base backups and the WAL archive used in a +traditional PostgreSQL installation. The layer files are immutable, they are not +modified in-place after creation. New layer files are created for new incoming +WAL, and old layer files are removed when they are no longer needed. + +The on-disk format is based on immutable files. The page server receives a +stream of incoming WAL, parses the WAL records to determine which pages they +apply to, and accumulates the incoming changes in memory. Whenever enough WAL +has been accumulated in memory, it is written out to a new immutable file. That +process accumulates "L0 delta files" on disk. 
When enough L0 files have been +accumulated, they are merged and re-partitioned into L1 files, and old files +that are no longer needed are removed by Garbage Collection (GC). The incoming WAL contains updates to arbitrary pages in the system. The distribution depends on the workload: the updates could be totally random, or there could be a long stream of updates to a single relation when data is bulk -loaded, for example, or something in between. The page server slices the -incoming WAL per relation and page, and packages the sliced WAL into -suitably-sized "layer files". The layer files contain all the history of the -database, back to some reasonable retention period. This system replaces the -base backups and the WAL archive used in a traditional PostgreSQL -installation. The layer files are immutable, they are not modified in-place -after creation. New layer files are created for new incoming WAL, and old layer -files are removed when they are no longer needed. We could also replace layer -files with new files that contain the same information, merging small files for -example, but that hasn't been implemented yet. +loaded, for example, or something in between. +Cloud Storage Page Server Safekeeper + L1 L0 Memory WAL -Cloud Storage Page Server Safekeeper - Local disk Memory WAL - -|AAAA| |AAAA|AAAA| |AA -|BBBB| |BBBB|BBBB| | -|CCCC|CCCC| <---- |CCCC|CCCC|CCCC| <--- |CC <---- ADEBAABED -|DDDD|DDDD| |DDDD|DDDD| |DDD -|EEEE| |EEEE|EEEE|EEEE| |E - ++----+ +----+----+ +|AAAA| |AAAA|AAAA| +---+-----+ | ++----+ +----+----+ | | | |AA +|BBBB| |BBBB|BBBB| |BB | AA | |BB ++----+----+ +----+----+ |C | BB | |CC +|CCCC|CCCC| <---- |CCCC|CCCC| <--- |D | CC | <--- |DDD <---- ADEBAABED ++----+----+ +----+----+ | | DDD | |E +|DDDD|DDDD| |DDDD|DDDD| |E | | | ++----+----+ +----+----+ | | | +|EEEE| |EEEE|EEEE| +---+-----+ ++----+ +----+----+ In this illustration, WAL is received as a stream from the Safekeeper, from the right. It is immediately captured by the page server and stored quickly in @@ -42,39 +44,29 @@ memory. The page server memory can be thought of as a quick "reorder buffer", used to hold the incoming WAL and reorder it so that we keep the WAL records for the same page and relation close to each other. -From the page server memory, whenever enough WAL has been accumulated for one -relation segment, it is moved to local disk, as a new layer file, and the memory -is released. +From the page server memory, whenever enough WAL has been accumulated, it is flushed +to disk into a new L0 layer file, and the memory is released. + +When enough L0 files have been accumulated, they are merged together rand sliced +per key-space, producing a new set of files where each file contains a more +narrow key range, but larger LSN range. From the local disk, the layers are further copied to Cloud Storage, for long-term archival. After a layer has been copied to Cloud Storage, it can be removed from local disk, although we currently keep everything locally for fast access. If a layer is needed that isn't found locally, it is fetched from Cloud -Storage and stored in local disk. - -# Terms used in layered repository - -- Relish - one PostgreSQL relation or similarly treated file. -- Segment - one slice of a Relish that is stored in a LayeredTimeline. -- Layer - specific version of a relish Segment in a range of LSNs. +Storage and stored in local disk. L0 and L1 files are both uploaded to Cloud +Storage. # Layer map -The LayerMap tracks what layers exist for all the relishes in a timeline. 
- -LayerMap consists of two data structures: -- segs - All the layers keyed by segment tag -- open_layers - data structure that hold all open layers ordered by oldest_pending_lsn for quick access during checkpointing. oldest_pending_lsn is the LSN of the oldest page version stored in this layer. - -All operations that update InMemory Layers should update both structures to keep them up-to-date. - -- LayeredTimeline - implements Timeline interface. - -All methods of LayeredTimeline are aware of its ancestors and return data taking them into account. -TODO: Are there any exceptions to this? -For example, timeline.list_rels(lsn) will return all segments that are visible in this timeline at the LSN, -including ones that were not modified in this timeline and thus don't have a layer in the timeline's LayerMap. +The LayerMap tracks what layers exist in a timeline. +Currently, the layer map is just a resizeable array (Vec). On a GetPage@LSN or +other read request, the layer map scans through the array to find the right layer +that contains the data for the requested page. The read-code in LayeredTimeline +is aware of the ancestor, and returns data from the ancestor timeline if it's +not found on the current timeline. # Different kinds of layers @@ -92,11 +84,11 @@ To avoid OOM errors, InMemory layers can be spilled to disk into ephemeral file. TODO: Clarify the difference between Closed, Historic and Frozen. There are two kinds of OnDisk layers: -- ImageLayer represents an image or a snapshot of a 10 MB relish segment, at one particular LSN. -- DeltaLayer represents a collection of WAL records or page images in a range of LSNs, for one - relish segment. - -Dropped segments are always represented on disk by DeltaLayer. +- ImageLayer represents a snapshot of all the keys in a particular range, at one + particular LSN. Any keys that are not present in the ImageLayer are known not + to exist at that LSN. +- DeltaLayer represents a collection of WAL records or page images in a range of + LSNs, for a range of keys. # Layer life cycle @@ -109,71 +101,71 @@ layer or a delta layer, it is a valid end bound. An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1 Every layer starts its life as an Open In-Memory layer. When the page server -receives the first WAL record for a segment, it creates a new In-Memory layer -for it, and puts it to the layer map. Later, the layer is old enough, its -contents are written to disk, as On-Disk layers. This process is called -"evicting" a layer. +receives the first WAL record for a timeline, it creates a new In-Memory layer +for it, and puts it to the layer map. Later, when the layer becomes full, its +contents are written to disk, as an on-disk layers. -Layer eviction is a two-step process: First, the layer is marked as closed, so -that it no longer accepts new WAL records, and the layer map is updated -accordingly. If a new WAL record for that segment arrives after this step, a new -Open layer is created to hold it. After this first step, the layer is a Closed +Flushing a layer is a two-step process: First, the layer is marked as closed, so +that it no longer accepts new WAL records, and a new in-memory layer is created +to hold any WAL after that point. After this first step, the layer is a Closed InMemory state. This first step is called "freezing" the layer. -In the second step, new Delta and Image layers are created, containing all the -data in the Frozen InMemory layer. 
When the new layers are ready, the original -frozen layer is replaced with the new layers in the layer map, and the original -frozen layer is dropped, releasing the memory. +In the second step, a new Delta layers is created, containing all the data from +the Frozen InMemory layer. When it has been created and flushed to disk, the +original frozen layer is replaced with the new layers in the layer map, and the +original frozen layer is dropped, releasing the memory. # Layer files (On-disk layers) -The files are called "layer files". Each layer file corresponds -to one RELISH_SEG_SIZE slice of a PostgreSQL relation fork or -non-rel file in a range of LSNs. The layer files -for each timeline are stored in the timeline's subdirectory under +The files are called "layer files". Each layer file covers a range of keys, and +a range of LSNs (or a single LSN, in case of image layers). You can think of it +as a rectangle in the two-dimensional key-LSN space. The layer files for each +timeline are stored in the timeline's subdirectory under .zenith/tenants//timelines. -There are two kind of layer file: base images, and deltas. A base -image file contains a layer of a segment as it was at one LSN, -whereas a delta file contains modifications to a segment - mostly in -the form of WAL records - in a range of LSN +There are two kind of layer files: images, and delta layers. An image file +contains a snapshot of all keys at a particular LSN, whereas a delta file +contains modifications to a segment - mostly in the form of WAL records - in a +range of LSN. -base image file: +image file: - rel______ + 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568 + start key end key LSN + +The first parts define the key range that the layer covers. See +pgdatadir_mapping.rs for how the key space is used. The last part is the LSN. delta file: - rel_______ +Delta files are named similarly, but they cover a range of LSNs: -For example: + 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 + start key end key start LSN end LSN - rel_1663_13990_2609_0_10_000000000169C348 - rel_1663_13990_2609_0_10_000000000169C348_0000000001702000 +A delta file contains all the key-values in the key-range that were updated in +the LSN range. If a key has not been modified, there is no trace of it in the +delta layer. -In addition to the relations, with "rel_*" prefix, we use the same -format for storing various smaller files from the PostgreSQL data -directory. They will use different suffixes and the naming scheme up -to the LSNs vary. The Zenith source code uses the term "relish" to -mean "a relation, or other file that's treated like a relation in the -storage" For example, a base image of a CLOG segment would be named -like this: - pg_xact_0000_0_00000000198B06B0 +A delta layer file can cover a part of the overall key space, as in the previous +example, or the whole key range like this: -There is no difference in how the relation and non-relation files are -managed, except that the first part of file names is different. -Internally, the relations and non-relation files that are managed in -the versioned store are together called "relishes". + 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000578C6B29-0000000057A50051 -If a file has been dropped, the last layer file for it is created -with the _DROPPED suffix, e.g. 
- - rel_1663_13990_2609_0_10_000000000169C348_0000000001702000_DROPPED +A file that covers the whole key range is called a L0 file (Level 0), while a +file that covers only part of the key range is called a L1 file. The "level" of +a file is not explicitly stored anywhere, you can only distinguish them by +looking at the key range that a file covers. The read-path doesn't need to +treat L0 and L1 files any differently. ## Notation used in this document +FIXME: This is somewhat obsolete, the layer files cover a key-range rather than +a particular relation nowadays. However, the description on how you find a page +version, and how branching and GC works is still valid. + The full path of a delta file looks like this: .zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000 diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 1a6e941fbe..bb5fa02be1 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -1,6 +1,5 @@ -//! //! A DeltaLayer represents a collection of WAL records or page images in a range of -//! LSNs, for one segment. It is stored on a file on disk. +//! LSNs, and in a range of Keys. It is stored on a file on disk. //! //! Usually a delta layer only contains differences - in the form of WAL records against //! a base LSN. However, if a segment is newly created, by creating a new relation or @@ -11,84 +10,74 @@ //! can happen when you create a new branch in the middle of a delta layer, and the WAL //! records on the new branch are put in a new delta layer. //! -//! When a delta file needs to be accessed, we slurp the metadata and segsize chapters +//! When a delta file needs to be accessed, we slurp the 'index' metadata //! into memory, into the DeltaLayerInner struct. See load() and unload() functions. -//! To access a page/WAL record, we search `page_version_metas` for the block # and LSN. -//! The byte ranges in the metadata can be used to find the page/WAL record in -//! PAGE_VERSIONS_CHAPTER. +//! To access a particular value, we search `index` for the given key. +//! The byte offset in the index can be used to find the value in +//! VALUES_CHAPTER. //! //! On disk, the delta files are stored in timelines/ directory. //! Currently, there are no subdirectories, and each delta file is named like this: //! -//! ______ +//! 
-__- page/WAL record +/// byte ranges in VALUES_CHAPTER +static INDEX_CHAPTER: u64 = 1; -/// Mapping from (block #, lsn) -> page/WAL record -/// byte ranges in PAGE_VERSIONS_CHAPTER -static PAGE_VERSION_METAS_CHAPTER: u64 = 1; /// Page/WAL bytes - cannot be interpreted -/// without PAGE_VERSION_METAS_CHAPTER -static PAGE_VERSIONS_CHAPTER: u64 = 2; -static SEG_SIZES_CHAPTER: u64 = 3; +/// without the page versions from the INDEX_CHAPTER +static VALUES_CHAPTER: u64 = 2; /// Contains the [`Summary`] struct -static SUMMARY_CHAPTER: u64 = 4; +static SUMMARY_CHAPTER: u64 = 3; #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] struct Summary { tenantid: ZTenantId, timelineid: ZTimelineId, - seg: SegmentTag, - - start_lsn: Lsn, - end_lsn: Lsn, - - dropped: bool, + key_range: Range, + lsn_range: Range, } impl From<&DeltaLayer> for Summary { @@ -96,33 +85,17 @@ impl From<&DeltaLayer> for Summary { Self { tenantid: layer.tenantid, timelineid: layer.timelineid, - seg: layer.seg, - - start_lsn: layer.start_lsn, - end_lsn: layer.end_lsn, - - dropped: layer.dropped, + key_range: layer.key_range.clone(), + lsn_range: layer.lsn_range.clone(), } } } -#[derive(Serialize, Deserialize)] -struct BlobRange { - offset: u64, - size: usize, -} - -fn read_blob(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result> { - let mut buf = vec![0u8; range.size]; - reader.read_exact_at(&mut buf, range.offset)?; - Ok(buf) -} - /// /// DeltaLayer is the in-memory data structure associated with an /// on-disk delta file. We keep a DeltaLayer in memory for each /// file, in the LayerMap. If a layer is in "loaded" state, we have a -/// copy of the file in memory, in 'inner'. Otherwise the struct is +/// copy of the index in memory, in 'inner'. Otherwise the struct is /// just a placeholder for a file that exists on disk, and it needs to /// be loaded before using it in queries. /// @@ -131,47 +104,24 @@ pub struct DeltaLayer { pub tenantid: ZTenantId, pub timelineid: ZTimelineId, - pub seg: SegmentTag, - - // - // This entry contains all the changes from 'start_lsn' to 'end_lsn'. The - // start is inclusive, and end is exclusive. - // - pub start_lsn: Lsn, - pub end_lsn: Lsn, - - dropped: bool, + pub key_range: Range, + pub lsn_range: Range, inner: RwLock, } pub struct DeltaLayerInner { - /// If false, the 'page_version_metas' and 'seg_sizes' have not been - /// loaded into memory yet. + /// If false, the 'index' has not been loaded into memory yet. loaded: bool, + /// + /// All versions of all pages in the layer are kept here. + /// Indexed by block number and LSN. The value is an offset into the + /// chapter where the page version is stored. + /// + index: HashMap>, + book: Option>, - - /// All versions of all pages in the file are are kept here. - /// Indexed by block number and LSN. - page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>, - - /// `seg_sizes` tracks the size of the segment at different points in time. - seg_sizes: VecMap, -} - -impl DeltaLayerInner { - fn get_seg_size(&self, lsn: Lsn) -> Result { - // Scan the VecMap backwards, starting from the given entry. 
- let slice = self - .seg_sizes - .slice_range((Included(&Lsn(0)), Included(&lsn))); - if let Some((_entry_lsn, entry)) = slice.last() { - Ok(*entry) - } else { - bail!("could not find seg size in delta layer") - } - } } impl Layer for DeltaLayer { @@ -183,132 +133,93 @@ impl Layer for DeltaLayer { self.timelineid } - fn get_seg_tag(&self) -> SegmentTag { - self.seg + fn get_key_range(&self) -> Range { + self.key_range.clone() } - fn is_dropped(&self) -> bool { - self.dropped - } - - fn get_start_lsn(&self) -> Lsn { - self.start_lsn - } - - fn get_end_lsn(&self) -> Lsn { - self.end_lsn + fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() } fn filename(&self) -> PathBuf { PathBuf::from(self.layer_name().to_string()) } - /// Look up given page in the cache. - fn get_page_reconstruct_data( + fn get_value_reconstruct_data( &self, - blknum: SegmentBlk, - lsn: Lsn, - reconstruct_data: &mut PageReconstructData, - ) -> anyhow::Result { + key: Key, + lsn_range: Range, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result { let mut need_image = true; - ensure!((0..RELISH_SEG_SIZE).contains(&blknum)); - - match &reconstruct_data.page_img { - Some((cached_lsn, _)) if &self.end_lsn <= cached_lsn => { - return Ok(PageReconstructResult::Complete) - } - _ => {} - } + ensure!(self.key_range.contains(&key)); { // Open the file and lock the metadata in memory let inner = self.load()?; - let page_version_reader = inner + let values_reader = inner .book .as_ref() .expect("should be loaded in load call above") - .chapter_reader(PAGE_VERSIONS_CHAPTER)?; + .chapter_reader(VALUES_CHAPTER)?; - // Scan the metadata VecMap backwards, starting from the given entry. - let minkey = (blknum, Lsn(0)); - let maxkey = (blknum, lsn); - let iter = inner - .page_version_metas - .slice_range((Included(&minkey), Included(&maxkey))) - .iter() - .rev(); - for ((_blknum, pv_lsn), blob_range) in iter { - match &reconstruct_data.page_img { - Some((cached_lsn, _)) if pv_lsn <= cached_lsn => { - return Ok(PageReconstructResult::Complete) - } - _ => {} - } - - let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?; - - match pv { - PageVersion::Page(img) => { - // Found a page image, return it - reconstruct_data.page_img = Some((*pv_lsn, img)); - need_image = false; + // Scan the page versions backwards, starting from `lsn`. 
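+            // The index entry for this key is a VecMap ordered by LSN, so we
+            // walk the requested LSN range from newest to oldest, adding up
+            // how many bytes are needed and remembering where the oldest
+            // needed blob starts. The walk stops as soon as it sees a value
+            // that fully initializes the page (an image, or a WAL record with
+            // will_init); the whole byte range is then read from the values
+            // chapter in one call and the individual values are deserialized
+            // out of that buffer.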
+ if let Some(vec_map) = inner.index.get(&key) { + let slice = vec_map.slice_range(lsn_range); + let mut size = 0usize; + let mut first_pos = 0u64; + for (_entry_lsn, blob_ref) in slice.iter().rev() { + size += blob_ref.size(); + first_pos = blob_ref.pos(); + if blob_ref.will_init() { break; } - PageVersion::Wal(rec) => { - let will_init = rec.will_init(); - reconstruct_data.records.push((*pv_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; + } + if size != 0 { + let mut buf = vec![0u8; size]; + values_reader.read_exact_at(&mut buf, first_pos)?; + for (entry_lsn, blob_ref) in slice.iter().rev() { + let offs = (blob_ref.pos() - first_pos) as usize; + let val = Value::des(&buf[offs..offs + blob_ref.size()])?; + match val { + Value::Image(img) => { + reconstruct_state.img = Some((*entry_lsn, img)); + need_image = false; + break; + } + Value::WalRecord(rec) => { + let will_init = rec.will_init(); + reconstruct_state.records.push((*entry_lsn, rec)); + if will_init { + // This WAL record initializes the page, so no need to go further back + need_image = false; + break; + } + } } } } } - - // If we didn't find any records for this, check if the request is beyond EOF - if need_image - && reconstruct_data.records.is_empty() - && self.seg.rel.is_blocky() - && blknum >= inner.get_seg_size(lsn)? - { - return Ok(PageReconstructResult::Missing(self.start_lsn)); - } - // release metadata lock and close the file } // If an older page image is needed to reconstruct the page, let the // caller know. if need_image { - Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1))) + Ok(ValueReconstructResult::Continue) } else { - Ok(PageReconstructResult::Complete) + Ok(ValueReconstructResult::Complete) } } - /// Get size of the relation at given LSN - fn get_seg_size(&self, lsn: Lsn) -> anyhow::Result { - ensure!(lsn >= self.start_lsn); - ensure!( - self.seg.rel.is_blocky(), - "get_seg_size() called on a non-blocky rel" - ); + fn iter(&self) -> Box> + '_> { + let inner = self.load().unwrap(); - let inner = self.load()?; - inner.get_seg_size(lsn) - } - - /// Does this segment exist at given LSN? - fn get_seg_exists(&self, lsn: Lsn) -> Result { - // Is the requested LSN after the rel was dropped? - if self.dropped && lsn >= self.end_lsn { - return Ok(false); + match DeltaValueIter::new(inner) { + Ok(iter) => Box::new(iter), + Err(err) => Box::new(std::iter::once(Err(err))), } - - // Otherwise, it exists. - Ok(true) } /// @@ -316,13 +227,22 @@ impl Layer for DeltaLayer { /// it will need to be loaded back. /// fn unload(&self) -> Result<()> { + // FIXME: In debug mode, loading and unloading the index slows + // things down so much that you get timeout errors. At least + // with the test_parallel_copy test. So as an even more ad hoc + // stopgap fix for that, only unload every on average 10 + // checkpoint cycles. + use rand::RngCore; + if rand::thread_rng().next_u32() > (u32::MAX / 10) { + return Ok(()); + } + let mut inner = match self.inner.try_write() { Ok(inner) => inner, Err(TryLockError::WouldBlock) => return Ok(()), Err(TryLockError::Poisoned(_)) => panic!("DeltaLayer lock was poisoned"), }; - inner.page_version_metas = VecMap::default(); - inner.seg_sizes = VecMap::default(); + inner.index = HashMap::default(); inner.loaded = false; // Note: we keep the Book open. Is that a good idea? 
The virtual file @@ -349,45 +269,52 @@ impl Layer for DeltaLayer { /// debugging function to print out the contents of the layer fn dump(&self) -> Result<()> { println!( - "----- delta layer for ten {} tli {} seg {} {}-{} ----", - self.tenantid, self.timelineid, self.seg, self.start_lsn, self.end_lsn + "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----", + self.tenantid, + self.timelineid, + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end ); - println!("--- seg sizes ---"); let inner = self.load()?; - for (k, v) in inner.seg_sizes.as_slice() { - println!(" {}: {}", k, v); - } - println!("--- page versions ---"); let path = self.path(); let file = std::fs::File::open(&path)?; let book = Book::new(file)?; + let chapter = book.chapter_reader(VALUES_CHAPTER)?; - let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?; - for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() { - let mut desc = String::new(); + let mut values: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); + values.sort_by_key(|k| k.0); - let buf = read_blob(&chapter, blob_range)?; - let pv = PageVersion::des(&buf)?; + for (key, versions) in values { + for (lsn, blob_ref) in versions.as_slice() { + let mut desc = String::new(); + let mut buf = vec![0u8; blob_ref.size()]; + chapter.read_exact_at(&mut buf, blob_ref.pos())?; + let val = Value::des(&buf); - match pv { - PageVersion::Page(img) => { - write!(&mut desc, " img {} bytes", img.len())?; - } - PageVersion::Wal(rec) => { - let wal_desc = walrecord::describe_wal_record(&rec); - write!( - &mut desc, - " rec {} bytes will_init: {} {}", - blob_range.size, - rec.will_init(), - wal_desc - )?; + match val { + Ok(Value::Image(img)) => { + write!(&mut desc, " img {} bytes", img.len())?; + } + Ok(Value::WalRecord(rec)) => { + let wal_desc = walrecord::describe_wal_record(&rec); + write!( + &mut desc, + " rec {} bytes will_init: {} {}", + buf.len(), + rec.will_init(), + wal_desc + )?; + } + Err(err) => { + write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?; + } } + println!(" key {} at {}: {}", key, lsn, desc); } - - println!(" blk {} at {}: {}", blk, lsn, desc); } Ok(()) @@ -475,18 +402,13 @@ impl DeltaLayer { } } - let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?; - let page_version_metas = VecMap::des(&chapter)?; - - let chapter = book.read_chapter(SEG_SIZES_CHAPTER)?; - let seg_sizes = VecMap::des(&chapter)?; + let chapter = book.read_chapter(INDEX_CHAPTER)?; + let index = HashMap::des(&chapter)?; debug!("loaded from {}", &path.display()); - inner.page_version_metas = page_version_metas; - inner.seg_sizes = seg_sizes; + inner.index = index; inner.loaded = true; - Ok(()) } @@ -501,15 +423,12 @@ impl DeltaLayer { path_or_conf: PathOrConf::Conf(conf), timelineid, tenantid, - seg: filename.seg, - start_lsn: filename.start_lsn, - end_lsn: filename.end_lsn, - dropped: filename.dropped, + key_range: filename.key_range.clone(), + lsn_range: filename.lsn_range.clone(), inner: RwLock::new(DeltaLayerInner { loaded: false, book: None, - page_version_metas: VecMap::default(), - seg_sizes: VecMap::default(), + index: HashMap::default(), }), } } @@ -519,7 +438,7 @@ impl DeltaLayer { /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary. 
pub fn new_for_path(path: &Path, book: &Book) -> Result where - F: std::os::unix::prelude::FileExt, + F: FileExt, { let chapter = book.read_chapter(SUMMARY_CHAPTER)?; let summary = Summary::des(&chapter)?; @@ -528,25 +447,20 @@ impl DeltaLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), timelineid: summary.timelineid, tenantid: summary.tenantid, - seg: summary.seg, - start_lsn: summary.start_lsn, - end_lsn: summary.end_lsn, - dropped: summary.dropped, + key_range: summary.key_range, + lsn_range: summary.lsn_range, inner: RwLock::new(DeltaLayerInner { loaded: false, book: None, - page_version_metas: VecMap::default(), - seg_sizes: VecMap::default(), + index: HashMap::default(), }), }) } fn layer_name(&self) -> DeltaFileName { DeltaFileName { - seg: self.seg, - start_lsn: self.start_lsn, - end_lsn: self.end_lsn, - dropped: self.dropped, + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), } } @@ -567,24 +481,24 @@ impl DeltaLayer { /// /// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...) /// -/// 2. Write the contents by calling `put_page_version` for every page +/// 2. Write the contents by calling `put_value` for every page /// version to store in the layer. /// /// 3. Call `finish`. /// pub struct DeltaLayerWriter { conf: &'static PageServerConf, + path: PathBuf, timelineid: ZTimelineId, tenantid: ZTenantId, - seg: SegmentTag, - start_lsn: Lsn, - end_lsn: Lsn, - dropped: bool, - page_version_writer: ChapterWriter>, - pv_offset: u64, + key_start: Key, + lsn_range: Range, - page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>, + index: HashMap>, + + values_writer: ChapterWriter>, + end_offset: u64, } impl DeltaLayerWriter { @@ -595,94 +509,86 @@ impl DeltaLayerWriter { conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId, - seg: SegmentTag, - start_lsn: Lsn, - end_lsn: Lsn, - dropped: bool, + key_start: Key, + lsn_range: Range, ) -> Result { - // Create the file + // Create the file initially with a temporary filename. We don't know + // the end key yet, so we cannot form the final filename yet. We will + // rename it when we're done. // // Note: This overwrites any existing file. There shouldn't be any. // FIXME: throw an error instead? - let path = DeltaLayer::path_for( - &PathOrConf::Conf(conf), - timelineid, - tenantid, - &DeltaFileName { - seg, - start_lsn, - end_lsn, - dropped, - }, - ); + let path = conf.timeline_path(&timelineid, &tenantid).join(format!( + "{}-XXX__{:016X}-{:016X}.temp", + key_start, + u64::from(lsn_range.start), + u64::from(lsn_range.end) + )); let file = VirtualFile::create(&path)?; let buf_writer = BufWriter::new(file); let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?; // Open the page-versions chapter for writing. The calls to - // `put_page_version` will use this to write the contents. - let page_version_writer = book.new_chapter(PAGE_VERSIONS_CHAPTER); + // `put_value` will use this to write the contents. + let values_writer = book.new_chapter(VALUES_CHAPTER); Ok(DeltaLayerWriter { conf, + path, timelineid, tenantid, - seg, - start_lsn, - end_lsn, - dropped, - page_version_writer, - page_version_metas: VecMap::default(), - pv_offset: 0, + key_start, + lsn_range, + index: HashMap::new(), + values_writer, + end_offset: 0, }) } /// - /// Append a page version to the file. + /// Append a key-value pair to the file. /// - /// 'buf' is a serialized PageVersion. - /// The page versions must be appended in blknum, lsn order. + /// The values must be appended in key, lsn order. 
/// - pub fn put_page_version(&mut self, blknum: SegmentBlk, lsn: Lsn, buf: &[u8]) -> Result<()> { + pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> { + //info!("DELTA: key {} at {} on {}", key, lsn, self.path.display()); + assert!(self.lsn_range.start <= lsn); // Remember the offset and size metadata. The metadata is written // to a separate chapter, in `finish`. - let blob_range = BlobRange { - offset: self.pv_offset, - size: buf.len(), - }; - self.page_version_metas - .append((blknum, lsn), blob_range) - .unwrap(); - - // write the page version - self.page_version_writer.write_all(buf)?; - self.pv_offset += buf.len() as u64; + let off = self.end_offset; + let buf = Value::ser(&val)?; + let len = buf.len(); + self.values_writer.write_all(&buf)?; + self.end_offset += len as u64; + let vec_map = self.index.entry(key).or_default(); + let blob_ref = BlobRef::new(off, len, val.will_init()); + let old = vec_map.append_or_update_last(lsn, blob_ref).unwrap().0; + if old.is_some() { + // We already had an entry for this LSN. That's odd.. + bail!( + "Value for {} at {} already exists in delta layer being built", + key, + lsn + ); + } Ok(()) } + pub fn size(&self) -> u64 { + self.end_offset + } + /// /// Finish writing the delta layer. /// - /// 'seg_sizes' is a list of size changes to store with the actual data. - /// - pub fn finish(self, seg_sizes: VecMap) -> anyhow::Result { - // Close the page-versions chapter - let book = self.page_version_writer.close()?; + pub fn finish(self, key_end: Key) -> anyhow::Result { + // Close the values chapter + let book = self.values_writer.close()?; - // Write out page versions metadata - let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER); - let buf = VecMap::ser(&self.page_version_metas)?; - chapter.write_all(&buf)?; - let book = chapter.close()?; - - if self.seg.rel.is_blocky() { - ensure!(!seg_sizes.is_empty()); - } - - // and seg_sizes to separate chapter - let mut chapter = book.new_chapter(SEG_SIZES_CHAPTER); - let buf = VecMap::ser(&seg_sizes)?; + // Write out the index + let mut chapter = book.new_chapter(INDEX_CHAPTER); + let buf = HashMap::ser(&self.index)?; chapter.write_all(&buf)?; let book = chapter.close()?; @@ -690,12 +596,8 @@ impl DeltaLayerWriter { let summary = Summary { tenantid: self.tenantid, timelineid: self.timelineid, - seg: self.seg, - - start_lsn: self.start_lsn, - end_lsn: self.end_lsn, - - dropped: self.dropped, + key_range: self.key_start..key_end, + lsn_range: self.lsn_range.clone(), }; Summary::ser_into(&summary, &mut chapter)?; let book = chapter.close()?; @@ -710,20 +612,111 @@ impl DeltaLayerWriter { path_or_conf: PathOrConf::Conf(self.conf), tenantid: self.tenantid, timelineid: self.timelineid, - seg: self.seg, - start_lsn: self.start_lsn, - end_lsn: self.end_lsn, - dropped: self.dropped, + key_range: self.key_start..key_end, + lsn_range: self.lsn_range.clone(), inner: RwLock::new(DeltaLayerInner { loaded: false, + index: HashMap::new(), book: None, - page_version_metas: VecMap::default(), - seg_sizes: VecMap::default(), }), }; - trace!("created delta layer {}", &layer.path().display()); + // Rename the file to its final name + // + // Note: This overwrites any existing file. There shouldn't be any. + // FIXME: throw an error instead? 
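+        // The file was created under a temporary ".temp" name because the end
+        // key is only known here, at finish() time. Renaming it as the last
+        // step means the file only appears under a valid layer file name once
+        // all of its chapters have been written and flushed.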
+ let final_path = DeltaLayer::path_for( + &PathOrConf::Conf(self.conf), + self.timelineid, + self.tenantid, + &DeltaFileName { + key_range: self.key_start..key_end, + lsn_range: self.lsn_range, + }, + ); + std::fs::rename(self.path, &final_path)?; + + trace!("created delta layer {}", final_path.display()); Ok(layer) } + + pub fn abort(self) { + match self.values_writer.close() { + Ok(book) => { + if let Err(err) = book.close() { + error!("error while closing delta layer file: {}", err); + } + } + Err(err) => { + error!("error while closing chapter writer: {}", err); + } + } + if let Err(err) = std::fs::remove_file(self.path) { + error!("error removing unfinished delta layer file: {}", err); + } + } +} + +/// +/// Iterator over all key-value pairse stored in a delta layer +/// +/// FIXME: This creates a Vector to hold the offsets of all key value pairs. +/// That takes up quite a lot of memory. Should do this in a more streaming +/// fashion. +/// +struct DeltaValueIter { + all_offsets: Vec<(Key, Lsn, BlobRef)>, + next_idx: usize, + data: Vec, +} + +impl Iterator for DeltaValueIter { + type Item = Result<(Key, Lsn, Value)>; + + fn next(&mut self) -> Option { + self.next_res().transpose() + } +} + +impl DeltaValueIter { + fn new(inner: RwLockReadGuard) -> Result { + let mut index: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); + index.sort_by_key(|x| x.0); + + let mut all_offsets: Vec<(Key, Lsn, BlobRef)> = Vec::new(); + for (key, vec_map) in index.iter() { + for (lsn, blob_ref) in vec_map.as_slice().iter() { + all_offsets.push((**key, *lsn, *blob_ref)); + } + } + + let values_reader = inner + .book + .as_ref() + .expect("should be loaded in load call above") + .chapter_reader(VALUES_CHAPTER)?; + let file_size = values_reader.len() as usize; + let mut layer = DeltaValueIter { + all_offsets, + next_idx: 0, + data: vec![0u8; file_size], + }; + values_reader.read_exact_at(&mut layer.data, 0)?; + + Ok(layer) + } + + fn next_res(&mut self) -> Result> { + if self.next_idx < self.all_offsets.len() { + let (key, lsn, blob_ref) = self.all_offsets[self.next_idx]; + let offs = blob_ref.pos() as usize; + let size = blob_ref.size(); + let val = Value::des(&self.data[offs..offs + size])?; + self.next_idx += 1; + Ok(Some((key, lsn, val))) + } else { + Ok(None) + } + } } diff --git a/pageserver/src/layered_repository/filename.rs b/pageserver/src/layered_repository/filename.rs index df23700dfd..cd63f014c4 100644 --- a/pageserver/src/layered_repository/filename.rs +++ b/pageserver/src/layered_repository/filename.rs @@ -2,29 +2,52 @@ //! Helper functions for dealing with filenames of the image and delta layer files. //! 
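To make the new naming scheme concrete before the parsing code below, here is a minimal, self-contained sketch of parsing a delta layer file name of the form `<key start>-<key end>__<start LSN>-<end LSN>`. It keeps the key parts as plain hex strings and is only an illustration; the real `DeltaFileName::parse_str` decodes them with `Key::from_hex` and `Lsn::from_hex`.

```rust
/// Illustrative parser for "<key start>-<key end>__<start LSN>-<end LSN>".
fn parse_delta_name(fname: &str) -> Option<(String, String, u64, u64)> {
    let (keys, lsns) = fname.split_once("__")?;
    let (key_start, key_end) = keys.split_once('-')?;
    let (lsn_start, lsn_end) = lsns.split_once('-')?;
    let lsn_start = u64::from_str_radix(lsn_start, 16).ok()?;
    let lsn_end = u64::from_str_radix(lsn_end, 16).ok()?;
    // Both ranges must be non-empty, just like in the real parser.
    if lsn_start >= lsn_end || key_start >= key_end {
        return None;
    }
    Some((key_start.to_owned(), key_end.to_owned(), lsn_start, lsn_end))
}

fn main() {
    let name = "000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051";
    // Prints the key range as hex strings and the LSN range as numbers.
    println!("{:?}", parse_delta_name(name).unwrap());
}
```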
use crate::config::PageServerConf; -use crate::layered_repository::storage_layer::SegmentTag; -use crate::relish::*; +use crate::repository::Key; +use std::cmp::Ordering; use std::fmt; +use std::ops::Range; use std::path::PathBuf; use zenith_utils::lsn::Lsn; // Note: LayeredTimeline::load_layer_map() relies on this sort order -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] +#[derive(Debug, PartialEq, Eq, Clone)] pub struct DeltaFileName { - pub seg: SegmentTag, - pub start_lsn: Lsn, - pub end_lsn: Lsn, - pub dropped: bool, + pub key_range: Range, + pub lsn_range: Range, +} + +impl PartialOrd for DeltaFileName { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for DeltaFileName { + fn cmp(&self, other: &Self) -> Ordering { + let mut cmp; + + cmp = self.key_range.start.cmp(&other.key_range.start); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.key_range.end.cmp(&other.key_range.end); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.lsn_range.start.cmp(&other.lsn_range.start); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.lsn_range.end.cmp(&other.lsn_range.end); + + cmp + } } /// Represents the filename of a DeltaLayer /// -/// ______ -/// -/// or if it was dropped: -/// -/// _______DROPPED +/// -__- /// impl DeltaFileName { /// @@ -32,234 +55,123 @@ impl DeltaFileName { /// match the expected pattern. /// pub fn parse_str(fname: &str) -> Option { - let rel; - let mut parts; - if let Some(rest) = fname.strip_prefix("rel_") { - parts = rest.split('_'); - rel = RelishTag::Relation(RelTag { - spcnode: parts.next()?.parse::().ok()?, - dbnode: parts.next()?.parse::().ok()?, - relnode: parts.next()?.parse::().ok()?, - forknum: parts.next()?.parse::().ok()?, - }); - } else if let Some(rest) = fname.strip_prefix("pg_xact_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::Clog, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") { - parts = rest.split('_'); - rel = RelishTag::FileNodeMap { - spcnode: parts.next()?.parse::().ok()?, - dbnode: parts.next()?.parse::().ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_twophase_") { - parts = rest.split('_'); - rel = RelishTag::TwoPhase { - xid: parts.next()?.parse::().ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") { - parts = rest.split('_'); - rel = RelishTag::Checkpoint; - } else if let Some(rest) = fname.strip_prefix("pg_control_") { - parts = rest.split('_'); - rel = RelishTag::ControlFile; - } else { + let mut parts = fname.split("__"); + let mut key_parts = parts.next()?.split('-'); + let mut lsn_parts = parts.next()?.split('-'); + + let key_start_str = key_parts.next()?; + let key_end_str = key_parts.next()?; + let lsn_start_str = lsn_parts.next()?; + let lsn_end_str = lsn_parts.next()?; + if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() { return None; } - let segno = parts.next()?.parse::().ok()?; + let key_start = 
Key::from_hex(key_start_str).ok()?; + let key_end = Key::from_hex(key_end_str).ok()?; - let seg = SegmentTag { rel, segno }; + let start_lsn = Lsn::from_hex(lsn_start_str).ok()?; + let end_lsn = Lsn::from_hex(lsn_end_str).ok()?; - let start_lsn = Lsn::from_hex(parts.next()?).ok()?; - let end_lsn = Lsn::from_hex(parts.next()?).ok()?; - - let mut dropped = false; - if let Some(suffix) = parts.next() { - if suffix == "DROPPED" { - dropped = true; - } else { - return None; - } - } - if parts.next().is_some() { + if start_lsn >= end_lsn { return None; + // or panic? + } + + if key_start >= key_end { + return None; + // or panic? } Some(DeltaFileName { - seg, - start_lsn, - end_lsn, - dropped, + key_range: key_start..key_end, + lsn_range: start_lsn..end_lsn, }) } } impl fmt::Display for DeltaFileName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let basename = match self.seg.rel { - RelishTag::Relation(reltag) => format!( - "rel_{}_{}_{}_{}", - reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum - ), - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - } => format!("pg_xact_{:04X}", segno), - RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno, - } => format!("pg_multixact_members_{:04X}", segno), - RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno, - } => format!("pg_multixact_offsets_{:04X}", segno), - RelishTag::FileNodeMap { spcnode, dbnode } => { - format!("pg_filenodemap_{}_{}", spcnode, dbnode) - } - RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid), - RelishTag::Checkpoint => "pg_control_checkpoint".to_string(), - RelishTag::ControlFile => "pg_control".to_string(), - }; - write!( f, - "{}_{}_{:016X}_{:016X}{}", - basename, - self.seg.segno, - u64::from(self.start_lsn), - u64::from(self.end_lsn), - if self.dropped { "_DROPPED" } else { "" } + "{}-{}__{:016X}-{:016X}", + self.key_range.start, + self.key_range.end, + u64::from(self.lsn_range.start), + u64::from(self.lsn_range.end), ) } } -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] +#[derive(Debug, PartialEq, Eq, Clone)] pub struct ImageFileName { - pub seg: SegmentTag, + pub key_range: Range, pub lsn: Lsn, } +impl PartialOrd for ImageFileName { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for ImageFileName { + fn cmp(&self, other: &Self) -> Ordering { + let mut cmp; + + cmp = self.key_range.start.cmp(&other.key_range.start); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.key_range.end.cmp(&other.key_range.end); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.lsn.cmp(&other.lsn); + + cmp + } +} + /// /// Represents the filename of an ImageLayer /// -/// _____ -/// +/// -__ impl ImageFileName { /// /// Parse a string as an image file name. Returns None if the filename does not /// match the expected pattern. 
/// pub fn parse_str(fname: &str) -> Option { - let rel; - let mut parts; - if let Some(rest) = fname.strip_prefix("rel_") { - parts = rest.split('_'); - rel = RelishTag::Relation(RelTag { - spcnode: parts.next()?.parse::().ok()?, - dbnode: parts.next()?.parse::().ok()?, - relnode: parts.next()?.parse::().ok()?, - forknum: parts.next()?.parse::().ok()?, - }); - } else if let Some(rest) = fname.strip_prefix("pg_xact_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::Clog, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") { - parts = rest.split('_'); - rel = RelishTag::FileNodeMap { - spcnode: parts.next()?.parse::().ok()?, - dbnode: parts.next()?.parse::().ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_twophase_") { - parts = rest.split('_'); - rel = RelishTag::TwoPhase { - xid: parts.next()?.parse::().ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") { - parts = rest.split('_'); - rel = RelishTag::Checkpoint; - } else if let Some(rest) = fname.strip_prefix("pg_control_") { - parts = rest.split('_'); - rel = RelishTag::ControlFile; - } else { + let mut parts = fname.split("__"); + let mut key_parts = parts.next()?.split('-'); + + let key_start_str = key_parts.next()?; + let key_end_str = key_parts.next()?; + let lsn_str = parts.next()?; + if parts.next().is_some() || key_parts.next().is_some() { return None; } - let segno = parts.next()?.parse::().ok()?; + let key_start = Key::from_hex(key_start_str).ok()?; + let key_end = Key::from_hex(key_end_str).ok()?; - let seg = SegmentTag { rel, segno }; + let lsn = Lsn::from_hex(lsn_str).ok()?; - let lsn = Lsn::from_hex(parts.next()?).ok()?; - - if parts.next().is_some() { - return None; - } - - Some(ImageFileName { seg, lsn }) + Some(ImageFileName { + key_range: key_start..key_end, + lsn, + }) } } impl fmt::Display for ImageFileName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let basename = match self.seg.rel { - RelishTag::Relation(reltag) => format!( - "rel_{}_{}_{}_{}", - reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum - ), - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - } => format!("pg_xact_{:04X}", segno), - RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno, - } => format!("pg_multixact_members_{:04X}", segno), - RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno, - } => format!("pg_multixact_offsets_{:04X}", segno), - RelishTag::FileNodeMap { spcnode, dbnode } => { - format!("pg_filenodemap_{}_{}", spcnode, dbnode) - } - RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid), - RelishTag::Checkpoint => "pg_control_checkpoint".to_string(), - RelishTag::ControlFile => "pg_control".to_string(), - }; - write!( f, - "{}_{}_{:016X}", - basename, - self.seg.segno, + "{}-{}__{:016X}", + self.key_range.start, + self.key_range.end, u64::from(self.lsn), ) } diff --git a/pageserver/src/layered_repository/global_layer_map.rs b/pageserver/src/layered_repository/global_layer_map.rs deleted file 
mode 100644 index 169a89650a..0000000000 --- a/pageserver/src/layered_repository/global_layer_map.rs +++ /dev/null @@ -1,142 +0,0 @@ -//! -//! Global registry of open layers. -//! -//! Whenever a new in-memory layer is created to hold incoming WAL, it is registered -//! in [`GLOBAL_LAYER_MAP`], so that we can keep track of the total number of -//! in-memory layers in the system, and know when we need to evict some to release -//! memory. -//! -//! Each layer is assigned a unique ID when it's registered in the global registry. -//! The ID can be used to relocate the layer later, without having to hold locks. -//! - -use std::sync::atomic::{AtomicU8, Ordering}; -use std::sync::{Arc, RwLock}; - -use super::inmemory_layer::InMemoryLayer; - -use lazy_static::lazy_static; - -const MAX_USAGE_COUNT: u8 = 5; - -lazy_static! { - pub static ref GLOBAL_LAYER_MAP: RwLock = - RwLock::new(InMemoryLayers::default()); -} - -// TODO these types can probably be smaller -#[derive(PartialEq, Eq, Clone, Copy)] -pub struct LayerId { - index: usize, - tag: u64, // to avoid ABA problem -} - -enum SlotData { - Occupied(Arc), - /// Vacant slots form a linked list, the value is the index - /// of the next vacant slot in the list. - Vacant(Option), -} - -struct Slot { - tag: u64, - data: SlotData, - usage_count: AtomicU8, // for clock algorithm -} - -#[derive(Default)] -pub struct InMemoryLayers { - slots: Vec, - num_occupied: usize, - - // Head of free-slot list. - next_empty_slot_idx: Option, -} - -impl InMemoryLayers { - pub fn insert(&mut self, layer: Arc) -> LayerId { - let slot_idx = match self.next_empty_slot_idx { - Some(slot_idx) => slot_idx, - None => { - let idx = self.slots.len(); - self.slots.push(Slot { - tag: 0, - data: SlotData::Vacant(None), - usage_count: AtomicU8::new(0), - }); - idx - } - }; - let slots_len = self.slots.len(); - - let slot = &mut self.slots[slot_idx]; - - match slot.data { - SlotData::Occupied(_) => { - panic!("an occupied slot was in the free list"); - } - SlotData::Vacant(next_empty_slot_idx) => { - self.next_empty_slot_idx = next_empty_slot_idx; - } - } - - slot.data = SlotData::Occupied(layer); - slot.usage_count.store(1, Ordering::Relaxed); - - self.num_occupied += 1; - assert!(self.num_occupied <= slots_len); - - LayerId { - index: slot_idx, - tag: slot.tag, - } - } - - pub fn get(&self, layer_id: &LayerId) -> Option> { - let slot = self.slots.get(layer_id.index)?; // TODO should out of bounds indexes just panic? 
- if slot.tag != layer_id.tag { - return None; - } - - if let SlotData::Occupied(layer) = &slot.data { - let _ = slot.usage_count.fetch_update( - Ordering::Relaxed, - Ordering::Relaxed, - |old_usage_count| { - if old_usage_count < MAX_USAGE_COUNT { - Some(old_usage_count + 1) - } else { - None - } - }, - ); - Some(Arc::clone(layer)) - } else { - None - } - } - - // TODO this won't be a public API in the future - pub fn remove(&mut self, layer_id: &LayerId) { - let slot = &mut self.slots[layer_id.index]; - - if slot.tag != layer_id.tag { - return; - } - - match &slot.data { - SlotData::Occupied(_layer) => { - // TODO evict the layer - } - SlotData::Vacant(_) => unimplemented!(), - } - - slot.data = SlotData::Vacant(self.next_empty_slot_idx); - self.next_empty_slot_idx = Some(layer_id.index); - - assert!(self.num_occupied > 0); - self.num_occupied -= 1; - - slot.tag = slot.tag.wrapping_add(1); - } -} diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 5b8ec46452..ab51c36cae 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -1,55 +1,54 @@ -//! An ImageLayer represents an image or a snapshot of a segment at one particular LSN. -//! It is stored in a file on disk. +//! An ImageLayer represents an image or a snapshot of a key-range at +//! one particular LSN. It contains an image of all key-value pairs +//! in its key-range. Any key that falls into the image layer's range +//! but does not exist in the layer, does not exist. //! -//! On disk, the image files are stored in timelines/ directory. -//! Currently, there are no subdirectories, and each image layer file is named like this: +//! An image layer is stored in a file on disk. The file is stored in +//! timelines/ directory. Currently, there are no +//! subdirectories, and each image layer file is named like this: //! -//! Note that segno is -//! _____ +//! -__ //! //! For example: //! -//! 1663_13990_2609_0_5_000000000169C348 +//! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568 //! //! An image file is constructed using the 'bookfile' crate. //! //! Only metadata is loaded into memory by the load function. //! When images are needed, they are read directly from disk. //! -//! For blocky relishes, the images are stored in BLOCKY_IMAGES_CHAPTER. -//! All the images are required to be BLOCK_SIZE, which allows for random access. -//! -//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER. -//! 
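The rewritten doc comment above captures the key property of image layers: within its key range an image layer is authoritative, so a lookup either returns the materialized image or establishes that the key does not exist at that LSN. As a rough, hypothetical sketch of how that property lets value reconstruction stop early (the real logic lives in the layers' `get_value_reconstruct_data` implementations and the timeline read path, and the names below are made up):

```rust
/// What one layer contributed to reconstructing a single value (illustrative).
enum LayerAnswer {
    /// An image, or a WAL record with will_init: nothing older is needed.
    Complete(Vec<u8>),
    /// Delta records to replay later; keep walking to older layers.
    Continue(Vec<Vec<u8>>),
    /// An image layer covered the key range but the key was absent, so the
    /// key does not exist at this LSN and there is no point looking further.
    Missing,
}

/// Walk layers from newest to oldest until one gives a definitive answer.
/// Returns the base image plus the WAL records (newest first) that still
/// have to be replayed on top of it by the WAL redo machinery.
fn reconstruct(answers: impl Iterator<Item = LayerAnswer>) -> Option<(Vec<u8>, Vec<Vec<u8>>)> {
    let mut records = Vec::new();
    for answer in answers {
        match answer {
            LayerAnswer::Complete(img) => return Some((img, records)),
            LayerAnswer::Continue(mut recs) => records.append(&mut recs),
            LayerAnswer::Missing => return None,
        }
    }
    None
}

fn main() {
    let answers = vec![
        LayerAnswer::Continue(vec![b"wal record at lsn 25".to_vec()]),
        LayerAnswer::Complete(b"page image at lsn 10".to_vec()),
    ];
    let (img, records) = reconstruct(answers.into_iter()).unwrap();
    println!("base image of {} bytes, {} record(s) to replay", img.len(), records.len());
}
```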
use crate::config::PageServerConf; use crate::layered_repository::filename::{ImageFileName, PathOrConf}; use crate::layered_repository::storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, SegmentBlk, SegmentTag, + BlobRef, Layer, ValueReconstructResult, ValueReconstructState, }; -use crate::layered_repository::RELISH_SEG_SIZE; +use crate::repository::{Key, Value}; use crate::virtual_file::VirtualFile; +use crate::IMAGE_FILE_MAGIC; use crate::{ZTenantId, ZTimelineId}; -use anyhow::{anyhow, bail, ensure, Context, Result}; +use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use log::*; use serde::{Deserialize, Serialize}; -use std::convert::TryInto; +use std::collections::HashMap; use std::fs; use std::io::{BufWriter, Write}; +use std::ops::Range; use std::path::{Path, PathBuf}; -use std::sync::{RwLock, RwLockReadGuard}; +use std::sync::{RwLock, RwLockReadGuard, TryLockError}; use bookfile::{Book, BookWriter, ChapterWriter}; use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::Lsn; -// Magic constant to identify a Zenith segment image file -pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E01 + 1; +/// Mapping from (key, lsn) -> page/WAL record +/// byte ranges in VALUES_CHAPTER +static INDEX_CHAPTER: u64 = 1; /// Contains each block in block # order -const BLOCKY_IMAGES_CHAPTER: u64 = 1; -const NONBLOCKY_IMAGE_CHAPTER: u64 = 2; +const VALUES_CHAPTER: u64 = 2; /// Contains the [`Summary`] struct const SUMMARY_CHAPTER: u64 = 3; @@ -58,7 +57,7 @@ const SUMMARY_CHAPTER: u64 = 3; struct Summary { tenantid: ZTenantId, timelineid: ZTimelineId, - seg: SegmentTag, + key_range: Range, lsn: Lsn, } @@ -68,19 +67,17 @@ impl From<&ImageLayer> for Summary { Self { tenantid: layer.tenantid, timelineid: layer.timelineid, - seg: layer.seg, + key_range: layer.key_range.clone(), lsn: layer.lsn, } } } -const BLOCK_SIZE: usize = 8192; - /// /// ImageLayer is the in-memory data structure associated with an on-disk image /// file. We keep an ImageLayer in memory for each file, in the LayerMap. If a -/// layer is in "loaded" state, we have a copy of the file in memory, in 'inner'. +/// layer is in "loaded" state, we have a copy of the index in memory, in 'inner'. /// Otherwise the struct is just a placeholder for a file that exists on disk, /// and it needs to be loaded before using it in queries. /// @@ -88,7 +85,7 @@ pub struct ImageLayer { path_or_conf: PathOrConf, pub tenantid: ZTenantId, pub timelineid: ZTimelineId, - pub seg: SegmentTag, + pub key_range: Range, // This entry contains an image of all pages as of this LSN pub lsn: Lsn, @@ -96,18 +93,16 @@ pub struct ImageLayer { inner: RwLock, } -#[derive(Clone)] -enum ImageType { - Blocky { num_blocks: SegmentBlk }, - NonBlocky, -} - pub struct ImageLayerInner { - /// If None, the 'image_type' has not been loaded into memory yet. + /// If false, the 'index' has not been loaded into memory yet. + loaded: bool, + + /// The underlying (virtual) file handle. None if the layer hasn't been loaded + /// yet. 
book: Option>, - /// Derived from filename and bookfile chapter metadata - image_type: ImageType, + /// offset of each value + index: HashMap, } impl Layer for ImageLayer { @@ -123,98 +118,82 @@ impl Layer for ImageLayer { self.timelineid } - fn get_seg_tag(&self) -> SegmentTag { - self.seg + fn get_key_range(&self) -> Range { + self.key_range.clone() } - fn is_dropped(&self) -> bool { - false - } - - fn get_start_lsn(&self) -> Lsn { - self.lsn - } - - fn get_end_lsn(&self) -> Lsn { + fn get_lsn_range(&self) -> Range { // End-bound is exclusive - self.lsn + 1 + self.lsn..(self.lsn + 1) } /// Look up given page in the file - fn get_page_reconstruct_data( + fn get_value_reconstruct_data( &self, - blknum: SegmentBlk, - lsn: Lsn, - reconstruct_data: &mut PageReconstructData, - ) -> anyhow::Result { - ensure!((0..RELISH_SEG_SIZE).contains(&blknum)); - ensure!(lsn >= self.lsn); - - match reconstruct_data.page_img { - Some((cached_lsn, _)) if self.lsn <= cached_lsn => { - return Ok(PageReconstructResult::Complete) - } - _ => {} - } + key: Key, + lsn_range: Range, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result { + assert!(self.key_range.contains(&key)); + assert!(lsn_range.end >= self.lsn); let inner = self.load()?; - let buf = match &inner.image_type { - ImageType::Blocky { num_blocks } => { - // Check if the request is beyond EOF - if blknum >= *num_blocks { - return Ok(PageReconstructResult::Missing(lsn)); - } + if let Some(blob_ref) = inner.index.get(&key) { + let chapter = inner + .book + .as_ref() + .unwrap() + .chapter_reader(VALUES_CHAPTER)?; - let mut buf = vec![0u8; BLOCK_SIZE]; - let offset = BLOCK_SIZE as u64 * blknum as u64; - - let chapter = inner - .book - .as_ref() - .unwrap() - .chapter_reader(BLOCKY_IMAGES_CHAPTER)?; - - chapter.read_exact_at(&mut buf, offset).with_context(|| { + let mut blob = vec![0; blob_ref.size()]; + chapter + .read_exact_at(&mut blob, blob_ref.pos()) + .with_context(|| { format!( - "failed to read page from data file {} at offset {}", + "failed to read {} bytes from data file {} at offset {}", + blob_ref.size(), self.filename().display(), - offset + blob_ref.pos() ) })?; + let value = Bytes::from(blob); - buf - } - ImageType::NonBlocky => { - ensure!(blknum == 0); - inner - .book - .as_ref() - .unwrap() - .read_chapter(NONBLOCKY_IMAGE_CHAPTER)? - .into_vec() - } - }; - - reconstruct_data.page_img = Some((self.lsn, Bytes::from(buf))); - Ok(PageReconstructResult::Complete) - } - - /// Get size of the segment - fn get_seg_size(&self, _lsn: Lsn) -> Result { - let inner = self.load()?; - match inner.image_type { - ImageType::Blocky { num_blocks } => Ok(num_blocks), - ImageType::NonBlocky => Err(anyhow!("get_seg_size called for non-blocky segment")), + reconstruct_state.img = Some((self.lsn, value)); + Ok(ValueReconstructResult::Complete) + } else { + Ok(ValueReconstructResult::Missing) } } - /// Does this segment exist at given LSN? - fn get_seg_exists(&self, _lsn: Lsn) -> Result { - Ok(true) + fn iter(&self) -> Box>> { + todo!(); } fn unload(&self) -> Result<()> { + // Unload the index. + // + // TODO: we should access the index directly from pages on the disk, + // using the buffer cache. This load/unload mechanism is really ad hoc. + + // FIXME: In debug mode, loading and unloading the index slows + // things down so much that you get timeout errors. At least + // with the test_parallel_copy test. So as an even more ad hoc + // stopgap fix for that, only unload every on average 10 + // checkpoint cycles. 
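+        // (next_u32() is above u32::MAX / 10 about 90% of the time, so the
+        // early return below skips the unload in roughly 9 out of 10 calls
+        // and the index is actually dropped only about once per 10 cycles.)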
+ use rand::RngCore; + if rand::thread_rng().next_u32() > (u32::MAX / 10) { + return Ok(()); + } + + let mut inner = match self.inner.try_write() { + Ok(inner) => inner, + Err(TryLockError::WouldBlock) => return Ok(()), + Err(TryLockError::Poisoned(_)) => panic!("ImageLayer lock was poisoned"), + }; + inner.index = HashMap::default(); + inner.loaded = false; + Ok(()) } @@ -235,22 +214,22 @@ impl Layer for ImageLayer { /// debugging function to print out the contents of the layer fn dump(&self) -> Result<()> { println!( - "----- image layer for ten {} tli {} seg {} at {} ----", - self.tenantid, self.timelineid, self.seg, self.lsn + "----- image layer for ten {} tli {} key {}-{} at {} ----", + self.tenantid, self.timelineid, self.key_range.start, self.key_range.end, self.lsn ); let inner = self.load()?; - match inner.image_type { - ImageType::Blocky { num_blocks } => println!("({}) blocks ", num_blocks), - ImageType::NonBlocky => { - let chapter = inner - .book - .as_ref() - .unwrap() - .read_chapter(NONBLOCKY_IMAGE_CHAPTER)?; - println!("non-blocky ({} bytes)", chapter.len()); - } + let mut index_vec: Vec<(&Key, &BlobRef)> = inner.index.iter().collect(); + index_vec.sort_by_key(|x| x.1.pos()); + + for (key, blob_ref) in index_vec { + println!( + "key: {} size {} offset {}", + key, + blob_ref.size(), + blob_ref.pos() + ); } Ok(()) @@ -280,7 +259,7 @@ impl ImageLayer { loop { // Quick exit if already loaded let inner = self.inner.read().unwrap(); - if inner.book.is_some() { + if inner.loaded { return Ok(inner); } @@ -306,14 +285,16 @@ impl ImageLayer { fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> { let path = self.path(); - let file = VirtualFile::open(&path) - .with_context(|| format!("Failed to open virtual file '{}'", path.display()))?; - let book = Book::new(file).with_context(|| { - format!( - "Failed to open virtual file '{}' as a bookfile", - path.display() - ) - })?; + + // Open the file if it's not open already. 
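+        // unload() drops only the in-memory index and keeps the Book handle,
+        // so a reload normally just deserializes the index chapter again.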
+ if inner.book.is_none() { + let file = VirtualFile::open(&path) + .with_context(|| format!("Failed to open file '{}'", path.display()))?; + inner.book = Some(Book::new(file).with_context(|| { + format!("Failed to open file '{}' as a bookfile", path.display()) + })?); + } + let book = inner.book.as_ref().unwrap(); match &self.path_or_conf { PathOrConf::Conf(_) => { @@ -340,23 +321,13 @@ impl ImageLayer { } } - let image_type = if self.seg.rel.is_blocky() { - let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?; - let images_len = chapter.len(); - ensure!(images_len % BLOCK_SIZE as u64 == 0); - let num_blocks: SegmentBlk = (images_len / BLOCK_SIZE as u64).try_into()?; - ImageType::Blocky { num_blocks } - } else { - let _chapter = book.chapter_reader(NONBLOCKY_IMAGE_CHAPTER)?; - ImageType::NonBlocky - }; + let chapter = book.read_chapter(INDEX_CHAPTER)?; + let index = HashMap::des(&chapter)?; - debug!("loaded from {}", &path.display()); + info!("loaded from {}", &path.display()); - *inner = ImageLayerInner { - book: Some(book), - image_type, - }; + inner.index = index; + inner.loaded = true; Ok(()) } @@ -372,11 +343,12 @@ impl ImageLayer { path_or_conf: PathOrConf::Conf(conf), timelineid, tenantid, - seg: filename.seg, + key_range: filename.key_range.clone(), lsn: filename.lsn, inner: RwLock::new(ImageLayerInner { book: None, - image_type: ImageType::Blocky { num_blocks: 0 }, + index: HashMap::new(), + loaded: false, }), } } @@ -395,18 +367,19 @@ impl ImageLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), timelineid: summary.timelineid, tenantid: summary.tenantid, - seg: summary.seg, + key_range: summary.key_range, lsn: summary.lsn, inner: RwLock::new(ImageLayerInner { book: None, - image_type: ImageType::Blocky { num_blocks: 0 }, + index: HashMap::new(), + loaded: false, }), }) } fn layer_name(&self) -> ImageFileName { ImageFileName { - seg: self.seg, + key_range: self.key_range.clone(), lsn: self.lsn, } } @@ -435,15 +408,18 @@ impl ImageLayer { /// pub struct ImageLayerWriter { conf: &'static PageServerConf, + path: PathBuf, timelineid: ZTimelineId, tenantid: ZTenantId, - seg: SegmentTag, + key_range: Range, lsn: Lsn, - num_blocks: SegmentBlk, + values_writer: Option>>, + end_offset: u64, - page_image_writer: ChapterWriter>, - num_blocks_written: SegmentBlk, + index: HashMap, + + finished: bool, } impl ImageLayerWriter { @@ -451,9 +427,8 @@ impl ImageLayerWriter { conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId, - seg: SegmentTag, + key_range: &Range, lsn: Lsn, - num_blocks: SegmentBlk, ) -> anyhow::Result { // Create the file // @@ -463,70 +438,75 @@ impl ImageLayerWriter { &PathOrConf::Conf(conf), timelineid, tenantid, - &ImageFileName { seg, lsn }, + &ImageFileName { + key_range: key_range.clone(), + lsn, + }, ); + info!("new image layer {}", path.display()); let file = VirtualFile::create(&path)?; let buf_writer = BufWriter::new(file); let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?; // Open the page-images chapter for writing. The calls to - // `put_page_image` will use this to write the contents. - let chapter = if seg.rel.is_blocky() { - book.new_chapter(BLOCKY_IMAGES_CHAPTER) - } else { - ensure!(num_blocks == 1); - book.new_chapter(NONBLOCKY_IMAGE_CHAPTER) - }; + // `put_image` will use this to write the contents. 
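+        // The offset of each key's image within that chapter is collected in
+        // `index` and written out as a separate index chapter in `finish`.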
+ let chapter = book.new_chapter(VALUES_CHAPTER); let writer = ImageLayerWriter { conf, + path, timelineid, tenantid, - seg, + key_range: key_range.clone(), lsn, - num_blocks, - page_image_writer: chapter, - num_blocks_written: 0, + values_writer: Some(chapter), + index: HashMap::new(), + end_offset: 0, + finished: false, }; Ok(writer) } /// - /// Write next page image to the file. + /// Write next value to the file. /// /// The page versions must be appended in blknum order. /// - pub fn put_page_image(&mut self, block_bytes: &[u8]) -> anyhow::Result<()> { - ensure!(self.num_blocks_written < self.num_blocks); - if self.seg.rel.is_blocky() { - ensure!(block_bytes.len() == BLOCK_SIZE); + pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> { + ensure!(self.key_range.contains(&key)); + let off = self.end_offset; + + if let Some(writer) = &mut self.values_writer { + let len = img.len(); + writer.write_all(img)?; + self.end_offset += len as u64; + + let old = self.index.insert(key, BlobRef::new(off, len, true)); + assert!(old.is_none()); + } else { + panic!() } - self.page_image_writer.write_all(block_bytes)?; - self.num_blocks_written += 1; + Ok(()) } - pub fn finish(self) -> anyhow::Result { - // Check that the `put_page_image' was called for every block. - ensure!(self.num_blocks_written == self.num_blocks); + pub fn finish(&mut self) -> anyhow::Result { + // Close the values chapter + let book = self.values_writer.take().unwrap().close()?; - // Close the page-images chapter - let book = self.page_image_writer.close()?; + // Write out the index + let mut chapter = book.new_chapter(INDEX_CHAPTER); + let buf = HashMap::ser(&self.index)?; + chapter.write_all(&buf)?; + let book = chapter.close()?; // Write out the summary chapter - let image_type = if self.seg.rel.is_blocky() { - ImageType::Blocky { - num_blocks: self.num_blocks, - } - } else { - ImageType::NonBlocky - }; let mut chapter = book.new_chapter(SUMMARY_CHAPTER); let summary = Summary { tenantid: self.tenantid, timelineid: self.timelineid, - seg: self.seg, + key_range: self.key_range.clone(), lsn: self.lsn, }; Summary::ser_into(&summary, &mut chapter)?; @@ -542,15 +522,31 @@ impl ImageLayerWriter { path_or_conf: PathOrConf::Conf(self.conf), timelineid: self.timelineid, tenantid: self.tenantid, - seg: self.seg, + key_range: self.key_range.clone(), lsn: self.lsn, inner: RwLock::new(ImageLayerInner { book: None, - image_type, + loaded: false, + index: HashMap::new(), }), }; trace!("created image layer {}", layer.path().display()); + self.finished = true; + Ok(layer) } } + +impl Drop for ImageLayerWriter { + fn drop(&mut self) { + if let Some(page_image_writer) = self.values_writer.take() { + if let Ok(book) = page_image_writer.close() { + let _ = book.close(); + } + } + if !self.finished { + let _ = fs::remove_file(&self.path); + } + } +} diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index fed1fb6469..b5d98a4ca3 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -1,30 +1,29 @@ -//! An in-memory layer stores recently received PageVersions. -//! The page versions are held in a BTreeMap. To avoid OOM errors, the map size is limited -//! and layers can be spilled to disk into ephemeral files. +//! An in-memory layer stores recently received key-value pairs. //! -//! And there's another BTreeMap to track the size of the relation. +//! 
The "in-memory" part of the name is a bit misleading: the actual page versions are +//! held in an ephemeral file, not in memory. The metadata for each page version, i.e. +//! its position in the file, is kept in memory, though. //! use crate::config::PageServerConf; use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter}; use crate::layered_repository::ephemeral_file::EphemeralFile; -use crate::layered_repository::filename::DeltaFileName; -use crate::layered_repository::image_layer::{ImageLayer, ImageLayerWriter}; use crate::layered_repository::storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag, - RELISH_SEG_SIZE, + BlobRef, Layer, ValueReconstructResult, ValueReconstructState, }; -use crate::layered_repository::LayeredTimeline; -use crate::layered_repository::ZERO_PAGE; -use crate::repository::ZenithWalRecord; +use crate::repository::{Key, Value}; +use crate::walrecord; use crate::{ZTenantId, ZTimelineId}; use anyhow::{bail, ensure, Result}; -use bytes::Bytes; use log::*; use std::collections::HashMap; -use std::io::Seek; +// avoid binding to Write (conflicts with std::io::Write) +// while being able to use std::fmt::Write's methods +use std::fmt::Write as _; +use std::io::Write; +use std::ops::Range; use std::os::unix::fs::FileExt; use std::path::PathBuf; -use std::sync::{Arc, RwLock}; +use std::sync::RwLock; use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::Lsn; use zenith_utils::vec_map::VecMap; @@ -33,7 +32,6 @@ pub struct InMemoryLayer { conf: &'static PageServerConf, tenantid: ZTenantId, timelineid: ZTimelineId, - seg: SegmentTag, /// /// This layer contains all the changes from 'start_lsn'. The @@ -41,27 +39,9 @@ pub struct InMemoryLayer { /// start_lsn: Lsn, - /// - /// LSN of the oldest page version stored in this layer. - /// - /// This is different from 'start_lsn' in that we enforce that the 'start_lsn' - /// of a layer always matches the 'end_lsn' of its predecessor, even if there - /// are no page versions until at a later LSN. That way you can detect any - /// missing layer files more easily. 'oldest_lsn' is the first page version - /// actually stored in this layer. In the range between 'start_lsn' and - /// 'oldest_lsn', there are no changes to the segment. - /// 'oldest_lsn' is used to adjust 'disk_consistent_lsn' and that is why it should - /// point to the beginning of WAL record. This is the other difference with 'start_lsn' - /// which points to end of WAL record. This is why 'oldest_lsn' can be smaller than 'start_lsn'. - /// - oldest_lsn: Lsn, - /// The above fields never change. The parts that do change are in 'inner', /// and protected by mutex. inner: RwLock, - - /// Predecessor layer might be needed? - incremental: bool, } pub struct InMemoryLayerInner { @@ -69,98 +49,25 @@ pub struct InMemoryLayerInner { /// Writes are only allowed when this is None end_lsn: Option, - /// If this relation was dropped, remember when that happened. - /// The drop LSN is recorded in [`end_lsn`]. - dropped: bool, + /// + /// All versions of all pages in the layer are kept here. Indexed + /// by block number and LSN. The value is an offset into the + /// ephemeral file where the page version is stored. + /// + index: HashMap>, - /// The PageVersion structs are stored in a serialized format in this file. - /// Each serialized PageVersion is preceded by a 'u32' length field. - /// 'page_versions' map stores offsets into this file. + /// The values are stored in a serialized format in this file. 
+ /// Each serialized Value is preceded by a 'u32' length field. + /// PerSeg::page_versions map stores offsets into this file. file: EphemeralFile, - /// Metadata about all versions of all pages in the layer is kept - /// here. Indexed by block number and LSN. The value is an offset - /// into the ephemeral file where the page version is stored. - page_versions: HashMap>, - - /// - /// `seg_sizes` tracks the size of the segment at different points in time. - /// - /// For a blocky rel, there is always one entry, at the layer's start_lsn, - /// so that determining the size never depends on the predecessor layer. For - /// a non-blocky rel, 'seg_sizes' is not used and is always empty. - /// - seg_sizes: VecMap, - - /// - /// LSN of the newest page version stored in this layer. - /// - /// The difference between 'end_lsn' and 'latest_lsn' is the same as between - /// 'start_lsn' and 'oldest_lsn'. See comments in 'oldest_lsn'. - /// - latest_lsn: Lsn, + end_offset: u64, } impl InMemoryLayerInner { fn assert_writeable(&self) { assert!(self.end_lsn.is_none()); } - - fn get_seg_size(&self, lsn: Lsn) -> SegmentBlk { - // Scan the BTreeMap backwards, starting from the given entry. - let slice = self.seg_sizes.slice_range(..=lsn); - - // We make sure there is always at least one entry - if let Some((_entry_lsn, entry)) = slice.last() { - *entry - } else { - panic!("could not find seg size in in-memory layer"); - } - } - - /// - /// Read a page version from the ephemeral file. - /// - fn read_pv(&self, off: u64) -> Result { - let mut buf = Vec::new(); - self.read_pv_bytes(off, &mut buf)?; - Ok(PageVersion::des(&buf)?) - } - - /// - /// Read a page version from the ephemeral file, as raw bytes, at - /// the given offset. The bytes are read into 'buf', which is - /// expanded if necessary. Returns the size of the page version. - /// - fn read_pv_bytes(&self, off: u64, buf: &mut Vec) -> Result { - // read length - let mut lenbuf = [0u8; 4]; - self.file.read_exact_at(&mut lenbuf, off)?; - let len = u32::from_ne_bytes(lenbuf) as usize; - - if buf.len() < len { - buf.resize(len, 0); - } - self.file.read_exact_at(&mut buf[0..len], off + 4)?; - Ok(len) - } - - fn write_pv(&mut self, pv: &PageVersion) -> Result { - // remember starting position - let pos = self.file.stream_position()?; - - // make room for the 'length' field by writing zeros as a placeholder. - self.file.seek(std::io::SeekFrom::Start(pos + 4))?; - - pv.ser_into(&mut self.file)?; - - // write the 'length' field. - let len = self.file.stream_position()? 
- pos - 4; - let lenbuf = u32::to_ne_bytes(len as u32); - self.file.write_all_at(&lenbuf, pos)?; - - Ok(pos) - } } impl Layer for InMemoryLayer { @@ -170,21 +77,12 @@ impl Layer for InMemoryLayer { fn filename(&self) -> PathBuf { let inner = self.inner.read().unwrap(); - let end_lsn = if let Some(drop_lsn) = inner.end_lsn { - drop_lsn - } else { - Lsn(u64::MAX) - }; + let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX)); - let delta_filename = DeltaFileName { - seg: self.seg, - start_lsn: self.start_lsn, - end_lsn, - dropped: inner.dropped, - } - .to_string(); - - PathBuf::from(format!("inmem-{}", delta_filename)) + PathBuf::from(format!( + "inmem-{:016X}-{:016X}", + self.start_lsn.0, end_lsn.0 + )) } fn get_tenant_id(&self) -> ZTenantId { @@ -195,132 +93,78 @@ impl Layer for InMemoryLayer { self.timelineid } - fn get_seg_tag(&self) -> SegmentTag { - self.seg + fn get_key_range(&self) -> Range { + Key::MIN..Key::MAX } - fn get_start_lsn(&self) -> Lsn { - self.start_lsn - } - - fn get_end_lsn(&self) -> Lsn { + fn get_lsn_range(&self) -> Range { let inner = self.inner.read().unwrap(); - if let Some(end_lsn) = inner.end_lsn { + let end_lsn = if let Some(end_lsn) = inner.end_lsn { end_lsn } else { Lsn(u64::MAX) - } + }; + self.start_lsn..end_lsn } - fn is_dropped(&self) -> bool { - let inner = self.inner.read().unwrap(); - inner.dropped - } - - /// Look up given page in the cache. - fn get_page_reconstruct_data( + /// Look up given value in the layer. + fn get_value_reconstruct_data( &self, - blknum: SegmentBlk, - lsn: Lsn, - reconstruct_data: &mut PageReconstructData, - ) -> anyhow::Result { + key: Key, + lsn_range: Range, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result { + ensure!(lsn_range.start <= self.start_lsn); let mut need_image = true; - ensure!((0..RELISH_SEG_SIZE).contains(&blknum)); + let inner = self.inner.read().unwrap(); - { - let inner = self.inner.read().unwrap(); - - // Scan the page versions backwards, starting from `lsn`. - if let Some(vec_map) = inner.page_versions.get(&blknum) { - let slice = vec_map.slice_range(..=lsn); - for (entry_lsn, pos) in slice.iter().rev() { - match &reconstruct_data.page_img { - Some((cached_lsn, _)) if entry_lsn <= cached_lsn => { - return Ok(PageReconstructResult::Complete) - } - _ => {} + // Scan the page versions backwards, starting from `lsn`. 
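        // A sketch of what the state collected below is ultimately for, assumed
        // rather than taken from the pageserver's WAL-redo code: once a base image
        // has been found, the caller replays the collected records on top of it.
        // The helper and its `apply_record` callback are illustrative only.
        #[allow(dead_code)]
        fn replay_sketch<R>(
            base_img: &[u8],
            records: &[(Lsn, R)],
            apply_record: impl Fn(&[u8], &R) -> anyhow::Result<Vec<u8>>,
        ) -> anyhow::Result<Vec<u8>> {
            let mut page = base_img.to_vec();
            // the scan below pushes records newest-first, so replay them in reverse
            for (_lsn, rec) in records.iter().rev() {
                page = apply_record(&page, rec)?;
            }
            Ok(page)
        }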
+ if let Some(vec_map) = inner.index.get(&key) { + let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, blob_ref) in slice.iter().rev() { + match &reconstruct_state.img { + Some((cached_lsn, _)) if entry_lsn <= cached_lsn => { + return Ok(ValueReconstructResult::Complete) } + _ => {} + } - let pv = inner.read_pv(*pos)?; - match pv { - PageVersion::Page(img) => { - reconstruct_data.page_img = Some((*entry_lsn, img)); + let mut buf = vec![0u8; blob_ref.size()]; + inner.file.read_exact_at(&mut buf, blob_ref.pos())?; + let value = Value::des(&buf)?; + match value { + Value::Image(img) => { + reconstruct_state.img = Some((*entry_lsn, img)); + return Ok(ValueReconstructResult::Complete); + } + Value::WalRecord(rec) => { + let will_init = rec.will_init(); + reconstruct_state.records.push((*entry_lsn, rec)); + if will_init { + // This WAL record initializes the page, so no need to go further back need_image = false; break; } - PageVersion::Wal(rec) => { - reconstruct_data.records.push((*entry_lsn, rec.clone())); - if rec.will_init() { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } } } } - - // If we didn't find any records for this, check if the request is beyond EOF - if need_image - && reconstruct_data.records.is_empty() - && self.seg.rel.is_blocky() - && blknum >= self.get_seg_size(lsn)? - { - return Ok(PageReconstructResult::Missing(self.start_lsn)); - } - - // release lock on 'inner' } + // release lock on 'inner' + // If an older page image is needed to reconstruct the page, let the - // caller know + // caller know. if need_image { - if self.incremental { - Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1))) - } else { - Ok(PageReconstructResult::Missing(self.start_lsn)) - } + Ok(ValueReconstructResult::Continue) } else { - Ok(PageReconstructResult::Complete) + Ok(ValueReconstructResult::Complete) } } - /// Get size of the relation at given LSN - fn get_seg_size(&self, lsn: Lsn) -> anyhow::Result { - ensure!(lsn >= self.start_lsn); - ensure!( - self.seg.rel.is_blocky(), - "get_seg_size() called on a non-blocky rel" - ); - - let inner = self.inner.read().unwrap(); - Ok(inner.get_seg_size(lsn)) - } - - /// Does this segment exist at given LSN? - fn get_seg_exists(&self, lsn: Lsn) -> anyhow::Result { - let inner = self.inner.read().unwrap(); - - // If the segment created after requested LSN, - // it doesn't exist in the layer. But we shouldn't - // have requested it in the first place. - ensure!(lsn >= self.start_lsn); - - // Is the requested LSN after the segment was dropped? - if inner.dropped { - if let Some(end_lsn) = inner.end_lsn { - if lsn >= end_lsn { - return Ok(false); - } - } else { - bail!("dropped in-memory layer with no end LSN"); - } - } - - // Otherwise, it exists - Ok(true) + fn iter(&self) -> Box>> { + todo!(); } /// Cannot unload anything in an in-memory layer, since there's no backing @@ -337,7 +181,8 @@ impl Layer for InMemoryLayer { } fn is_incremental(&self) -> bool { - self.incremental + // in-memory layer is always considered incremental. 
+ true } fn is_in_memory(&self) -> bool { @@ -355,29 +200,36 @@ impl Layer for InMemoryLayer { .unwrap_or_default(); println!( - "----- in-memory layer for tli {} seg {} {}-{} {} ----", - self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped, + "----- in-memory layer for tli {} LSNs {}-{} ----", + self.timelineid, self.start_lsn, end_str, ); - for (k, v) in inner.seg_sizes.as_slice() { - println!("seg_sizes {}: {}", k, v); - } - - // List the blocks in order - let mut page_versions: Vec<(&SegmentBlk, &VecMap)> = - inner.page_versions.iter().collect(); - page_versions.sort_by_key(|k| k.0); - - for (blknum, versions) in page_versions { - for (lsn, off) in versions.as_slice() { - let pv = inner.read_pv(*off); - let pv_description = match pv { - Ok(PageVersion::Page(_img)) => "page", - Ok(PageVersion::Wal(_rec)) => "wal", - Err(_err) => "INVALID", - }; - - println!("blk {} at {}: {}\n", blknum, lsn, pv_description); + let mut buf = Vec::new(); + for (key, vec_map) in inner.index.iter() { + for (lsn, blob_ref) in vec_map.as_slice() { + let mut desc = String::new(); + buf.resize(blob_ref.size(), 0); + inner.file.read_exact_at(&mut buf, blob_ref.pos())?; + let val = Value::des(&buf); + match val { + Ok(Value::Image(img)) => { + write!(&mut desc, " img {} bytes", img.len())?; + } + Ok(Value::WalRecord(rec)) => { + let wal_desc = walrecord::describe_wal_record(&rec); + write!( + &mut desc, + " rec {} bytes will_init: {} {}", + buf.len(), + rec.will_init(), + wal_desc + )?; + } + Err(err) => { + write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?; + } + } + println!(" key {} at {}: {}", key, lsn, desc); } } @@ -385,23 +237,7 @@ impl Layer for InMemoryLayer { } } -/// A result of an inmemory layer data being written to disk. -pub struct LayersOnDisk { - pub delta_layers: Vec, - pub image_layers: Vec, -} - impl InMemoryLayer { - /// Return the oldest page version that's stored in this layer - pub fn get_oldest_lsn(&self) -> Lsn { - self.oldest_lsn - } - - pub fn get_latest_lsn(&self) -> Lsn { - let inner = self.inner.read().unwrap(); - inner.latest_lsn - } - /// /// Create a new, empty, in-memory layer /// @@ -409,291 +245,83 @@ impl InMemoryLayer { conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId, - seg: SegmentTag, start_lsn: Lsn, - oldest_lsn: Lsn, ) -> Result { trace!( - "initializing new empty InMemoryLayer for writing {} on timeline {} at {}", - seg, + "initializing new empty InMemoryLayer for writing on timeline {} at {}", timelineid, start_lsn ); - // The segment is initially empty, so initialize 'seg_sizes' with 0. 
- let mut seg_sizes = VecMap::default(); - if seg.rel.is_blocky() { - seg_sizes.append(start_lsn, 0).unwrap(); - } - let file = EphemeralFile::create(conf, tenantid, timelineid)?; Ok(InMemoryLayer { conf, timelineid, tenantid, - seg, start_lsn, - oldest_lsn, - incremental: false, inner: RwLock::new(InMemoryLayerInner { end_lsn: None, - dropped: false, + index: HashMap::new(), file, - page_versions: HashMap::new(), - seg_sizes, - latest_lsn: oldest_lsn, + end_offset: 0, }), }) } // Write operations - /// Remember new page version, as a WAL record over previous version - pub fn put_wal_record( - &self, - lsn: Lsn, - blknum: SegmentBlk, - rec: ZenithWalRecord, - ) -> Result { - self.put_page_version(blknum, lsn, PageVersion::Wal(rec)) - } - - /// Remember new page version, as a full page image - pub fn put_page_image(&self, blknum: SegmentBlk, lsn: Lsn, img: Bytes) -> Result { - self.put_page_version(blknum, lsn, PageVersion::Page(img)) - } - /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - pub fn put_page_version( - &self, - blknum: SegmentBlk, - lsn: Lsn, - pv: PageVersion, - ) -> anyhow::Result { - ensure!((0..RELISH_SEG_SIZE).contains(&blknum)); - - trace!( - "put_page_version blk {} of {} at {}/{}", - blknum, - self.seg.rel, - self.timelineid, - lsn - ); + pub fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> { + trace!("put_value key {} at {}/{}", key, self.timelineid, lsn); let mut inner = self.inner.write().unwrap(); inner.assert_writeable(); - ensure!(lsn >= inner.latest_lsn); - inner.latest_lsn = lsn; - // Write the page version to the file, and remember its offset in 'page_versions' - { - let off = inner.write_pv(&pv)?; - let vec_map = inner.page_versions.entry(blknum).or_default(); - let old = vec_map.append_or_update_last(lsn, off).unwrap().0; - if old.is_some() { - // We already had an entry for this LSN. That's odd.. - warn!( - "Page version of rel {} blk {} at {} already exists", - self.seg.rel, blknum, lsn - ); - } - } - - // Also update the relation size, if this extended the relation. - if self.seg.rel.is_blocky() { - let newsize = blknum + 1; - - // use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock, - // which we've just acquired above - let oldsize = inner.get_seg_size(lsn); - if newsize > oldsize { - trace!( - "enlarging segment {} from {} to {} blocks at {}", - self.seg, - oldsize, - newsize, - lsn - ); - - // If we are extending the relation by more than one page, initialize the "gap" - // with zeros - // - // XXX: What if the caller initializes the gap with subsequent call with same LSN? - // I don't think that can happen currently, but that is highly dependent on how - // PostgreSQL writes its WAL records and there's no guarantee of it. If it does - // happen, we would hit the "page version already exists" warning above on the - // subsequent call to initialize the gap page. 
- for gapblknum in oldsize..blknum { - let zeropv = PageVersion::Page(ZERO_PAGE.clone()); - trace!( - "filling gap blk {} with zeros for write of {}", - gapblknum, - blknum - ); - - // Write the page version to the file, and remember its offset in - // 'page_versions' - { - let off = inner.write_pv(&zeropv)?; - let vec_map = inner.page_versions.entry(gapblknum).or_default(); - let old = vec_map.append_or_update_last(lsn, off).unwrap().0; - if old.is_some() { - warn!( - "Page version of seg {} blk {} at {} already exists", - self.seg, gapblknum, lsn - ); - } - } - } - - inner.seg_sizes.append_or_update_last(lsn, newsize).unwrap(); - return Ok(newsize - oldsize); - } - } - - Ok(0) - } - - /// Remember that the relation was truncated at given LSN - pub fn put_truncation(&self, lsn: Lsn, new_size: SegmentBlk) { - assert!( - self.seg.rel.is_blocky(), - "put_truncation() called on a non-blocky rel" - ); - - let mut inner = self.inner.write().unwrap(); - inner.assert_writeable(); - - // check that this we truncate to a smaller size than segment was before the truncation - let old_size = inner.get_seg_size(lsn); - assert!(new_size < old_size); - - let (old, _delta_size) = inner - .seg_sizes - .append_or_update_last(lsn, new_size) - .unwrap(); + let off = inner.end_offset; + let buf = Value::ser(&val)?; + let len = buf.len(); + inner.file.write_all(&buf)?; + inner.end_offset += len as u64; + let vec_map = inner.index.entry(key).or_default(); + let blob_ref = BlobRef::new(off, len, val.will_init()); + let old = vec_map.append_or_update_last(lsn, blob_ref).unwrap().0; if old.is_some() { // We already had an entry for this LSN. That's odd.. - warn!("Inserting truncation, but had an entry for the LSN already"); - } - } - - /// Remember that the segment was dropped at given LSN - pub fn drop_segment(&self, lsn: Lsn) { - let mut inner = self.inner.write().unwrap(); - - assert!(inner.end_lsn.is_none()); - assert!(!inner.dropped); - inner.dropped = true; - assert!(self.start_lsn < lsn); - inner.end_lsn = Some(lsn); - - trace!("dropped segment {} at {}", self.seg, lsn); - } - - /// - /// Initialize a new InMemoryLayer for, by copying the state at the given - /// point in time from given existing layer. - /// - pub fn create_successor_layer( - conf: &'static PageServerConf, - src: Arc, - timelineid: ZTimelineId, - tenantid: ZTenantId, - start_lsn: Lsn, - oldest_lsn: Lsn, - ) -> Result { - let seg = src.get_seg_tag(); - - assert!(oldest_lsn.is_aligned()); - - trace!( - "initializing new InMemoryLayer for writing {} on timeline {} at {}", - seg, - timelineid, - start_lsn, - ); - - // Copy the segment size at the start LSN from the predecessor layer. - let mut seg_sizes = VecMap::default(); - if seg.rel.is_blocky() { - let size = src.get_seg_size(start_lsn)?; - seg_sizes.append(start_lsn, size).unwrap(); + warn!("Key {} at {} already exists", key, lsn); } - let file = EphemeralFile::create(conf, tenantid, timelineid)?; - - Ok(InMemoryLayer { - conf, - timelineid, - tenantid, - seg, - start_lsn, - oldest_lsn, - incremental: true, - inner: RwLock::new(InMemoryLayerInner { - end_lsn: None, - dropped: false, - file, - page_versions: HashMap::new(), - seg_sizes, - latest_lsn: oldest_lsn, - }), - }) + Ok(()) } - pub fn is_writeable(&self) -> bool { - let inner = self.inner.read().unwrap(); - inner.end_lsn.is_none() + pub fn put_tombstone(&self, _key_range: Range, _lsn: Lsn) -> Result<()> { + // TODO: Currently, we just leak the storage for any deleted keys + + Ok(()) } /// Make the layer non-writeable. 
Only call once. /// Records the end_lsn for non-dropped layers. - /// `end_lsn` is inclusive + /// `end_lsn` is exclusive pub fn freeze(&self, end_lsn: Lsn) { let mut inner = self.inner.write().unwrap(); - if inner.end_lsn.is_some() { - assert!(inner.dropped); - } else { - assert!(!inner.dropped); - assert!(self.start_lsn < end_lsn + 1); - inner.end_lsn = Some(Lsn(end_lsn.0 + 1)); + assert!(self.start_lsn < end_lsn); + inner.end_lsn = Some(end_lsn); - if let Some((lsn, _)) = inner.seg_sizes.as_slice().last() { - assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn); - } - - for (_blk, vec_map) in inner.page_versions.iter() { - for (lsn, _pos) in vec_map.as_slice() { - assert!(*lsn <= end_lsn); - } + for vec_map in inner.index.values() { + for (lsn, _pos) in vec_map.as_slice() { + assert!(*lsn < end_lsn); } } } - /// Write the this frozen in-memory layer to disk. + /// Write this frozen in-memory layer to disk. /// - /// Returns new layers that replace this one. - /// If not dropped and reconstruct_pages is true, returns a new image layer containing the page versions - /// at the `end_lsn`. Can also return a DeltaLayer that includes all the - /// WAL records between start and end LSN. (The delta layer is not needed - /// when a new relish is created with a single LSN, so that the start and - /// end LSN are the same.) - pub fn write_to_disk( - &self, - timeline: &LayeredTimeline, - reconstruct_pages: bool, - ) -> Result { - trace!( - "write_to_disk {} get_end_lsn is {}", - self.filename().display(), - self.get_end_lsn() - ); - + /// Returns a new delta layer with all the same data as this in-memory layer + pub fn write_to_disk(&self) -> Result { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -705,105 +333,32 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().unwrap(); - // Since `end_lsn` is exclusive, subtract 1 to calculate the last LSN - // that is included. - let end_lsn_exclusive = inner.end_lsn.unwrap(); - let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1); + let mut delta_layer_writer = DeltaLayerWriter::new( + self.conf, + self.timelineid, + self.tenantid, + Key::MIN, + self.start_lsn..inner.end_lsn.unwrap(), + )?; - // Figure out if we should create a delta layer, image layer, or both. - let image_lsn: Option; - let delta_end_lsn: Option; - if self.is_dropped() || !reconstruct_pages { - // The segment was dropped. Create just a delta layer containing all the - // changes up to and including the drop. - delta_end_lsn = Some(end_lsn_exclusive); - image_lsn = None; - } else if self.start_lsn == end_lsn_inclusive { - // The layer contains exactly one LSN. It's enough to write an image - // layer at that LSN. - delta_end_lsn = None; - image_lsn = Some(end_lsn_inclusive); - } else { - // Create a delta layer with all the changes up to the end LSN, - // and an image layer at the end LSN. - // - // Note that we the delta layer does *not* include the page versions - // at the end LSN. They are included in the image layer, and there's - // no need to store them twice. 
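        // A sketch of the intended lifecycle of an in-memory layer, assumed from
        // the API in this file rather than copied from the timeline code: values
        // are appended with put_value() while the layer is open, freeze() closes
        // it at an end LSN, and write_to_disk() turns it into a delta layer. The
        // helper name and parameters are illustrative only.
        #[allow(dead_code)]
        fn flush_inmemory_layer_sketch(
            layer: &InMemoryLayer,
            updates: Vec<(Key, Lsn, Value)>,
            end_lsn: Lsn,
        ) -> Result<DeltaLayer> {
            for (key, lsn, val) in updates {
                layer.put_value(key, lsn, val)?;
            }
            layer.freeze(end_lsn);
            layer.write_to_disk()
        }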
- delta_end_lsn = Some(end_lsn_inclusive); - image_lsn = Some(end_lsn_inclusive); - } - - let mut delta_layers = Vec::new(); - let mut image_layers = Vec::new(); - - if let Some(delta_end_lsn) = delta_end_lsn { - let mut delta_layer_writer = DeltaLayerWriter::new( - self.conf, - self.timelineid, - self.tenantid, - self.seg, - self.start_lsn, - delta_end_lsn, - self.is_dropped(), - )?; - - // Write all page versions, in block + LSN order - let mut buf: Vec = Vec::new(); - - let pv_iter = inner.page_versions.iter(); - let mut pages: Vec<(&SegmentBlk, &VecMap)> = pv_iter.collect(); - pages.sort_by_key(|(blknum, _vec_map)| *blknum); - for (blknum, vec_map) in pages { - for (lsn, pos) in vec_map.as_slice() { - if *lsn < delta_end_lsn { - let len = inner.read_pv_bytes(*pos, &mut buf)?; - delta_layer_writer.put_page_version(*blknum, *lsn, &buf[..len])?; - } + let mut do_steps = || -> Result<()> { + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, blob_ref) in vec_map.as_slice() { + let mut buf = vec![0u8; blob_ref.size()]; + inner.file.read_exact_at(&mut buf, blob_ref.pos())?; + let val = Value::des(&buf)?; + delta_layer_writer.put_value(*key, *lsn, val)?; } } - - // Create seg_sizes - let seg_sizes = if delta_end_lsn == end_lsn_exclusive { - inner.seg_sizes.clone() - } else { - inner.seg_sizes.split_at(&end_lsn_exclusive).0 - }; - - let delta_layer = delta_layer_writer.finish(seg_sizes)?; - delta_layers.push(delta_layer); + Ok(()) + }; + if let Err(err) = do_steps() { + delta_layer_writer.abort(); + return Err(err); } - drop(inner); - - // Write a new base image layer at the cutoff point - if let Some(image_lsn) = image_lsn { - let size = if self.seg.rel.is_blocky() { - self.get_seg_size(image_lsn)? - } else { - 1 - }; - let mut image_layer_writer = ImageLayerWriter::new( - self.conf, - self.timelineid, - self.tenantid, - self.seg, - image_lsn, - size, - )?; - - for blknum in 0..size { - let img = timeline.materialize_page(self.seg, blknum, image_lsn, &*self)?; - - image_layer_writer.put_page_image(&img)?; - } - let image_layer = image_layer_writer.finish()?; - image_layers.push(image_layer); - } - - Ok(LayersOnDisk { - delta_layers, - image_layers, - }) + let delta_layer = delta_layer_writer.finish(Key::MAX)?; + Ok(delta_layer) } } diff --git a/pageserver/src/layered_repository/interval_tree.rs b/pageserver/src/layered_repository/interval_tree.rs deleted file mode 100644 index 978ecd837e..0000000000 --- a/pageserver/src/layered_repository/interval_tree.rs +++ /dev/null @@ -1,468 +0,0 @@ -/// -/// IntervalTree is data structure for holding intervals. It is generic -/// to make unit testing possible, but the only real user of it is the layer map, -/// -/// It's inspired by the "segment tree" or a "statistic tree" as described in -/// https://en.wikipedia.org/wiki/Segment_tree. However, we use a B-tree to hold -/// the points instead of a binary tree. This is called an "interval tree" instead -/// of "segment tree" because the term "segment" is already using Zenith to mean -/// something else. To add to the confusion, there is another data structure known -/// as "interval tree" out there (see https://en.wikipedia.org/wiki/Interval_tree), -/// for storing intervals, but this isn't that. -/// -/// The basic idea is to have a B-tree of "interesting Points". At each Point, -/// there is a list of intervals that contain the point. The Points are formed -/// from the start bounds of each interval; there is a Point for each distinct -/// start bound. 
-/// -/// Operations: -/// -/// To find intervals that contain a given point, you search the b-tree to find -/// the nearest Point <= search key. Then you just return the list of intervals. -/// -/// To insert an interval, find the Point with start key equal to the inserted item. -/// If the Point doesn't exist yet, create it, by copying all the items from the -/// previous Point that cover the new Point. Then walk right, inserting the new -/// interval to all the Points that are contained by the new interval (including the -/// newly created Point). -/// -/// To remove an interval, you scan the tree for all the Points that are contained by -/// the removed interval, and remove it from the list in each Point. -/// -/// Requirements and assumptions: -/// -/// - Can store overlapping items -/// - But there are not many overlapping items -/// - The interval bounds don't change after it is added to the tree -/// - Intervals are uniquely identified by pointer equality. You must not be insert the -/// same interval object twice, and `remove` uses pointer equality to remove the right -/// interval. It is OK to have two intervals with the same bounds, however. -/// -use std::collections::BTreeMap; -use std::fmt::Debug; -use std::ops::Range; -use std::sync::Arc; - -pub struct IntervalTree -where - I: IntervalItem, -{ - points: BTreeMap>, -} - -struct Point { - /// All intervals that contain this point, in no particular order. - /// - /// We assume that there aren't a lot of overlappingg intervals, so that this vector - /// never grows very large. If that assumption doesn't hold, we could keep this ordered - /// by the end bound, to speed up `search`. But as long as there are only a few elements, - /// a linear search is OK. - elements: Vec>, -} - -/// Abstraction for an interval that can be stored in the tree -/// -/// The start bound is inclusive and the end bound is exclusive. End must be greater -/// than start. -pub trait IntervalItem { - type Key: Ord + Copy + Debug + Sized; - - fn start_key(&self) -> Self::Key; - fn end_key(&self) -> Self::Key; - - fn bounds(&self) -> Range { - self.start_key()..self.end_key() - } -} - -impl IntervalTree -where - I: IntervalItem, -{ - /// Return an element that contains 'key', or precedes it. - /// - /// If there are multiple candidates, returns the one with the highest 'end' key. - pub fn search(&self, key: I::Key) -> Option> { - // Find the greatest point that precedes or is equal to the search key. If there is - // none, returns None. 
- let (_, p) = self.points.range(..=key).next_back()?; - - // Find the element with the highest end key at this point - let highest_item = p - .elements - .iter() - .reduce(|a, b| { - // starting with Rust 1.53, could use `std::cmp::min_by_key` here - if a.end_key() > b.end_key() { - a - } else { - b - } - }) - .unwrap(); - Some(Arc::clone(highest_item)) - } - - /// Iterate over all items with start bound >= 'key' - pub fn iter_newer(&self, key: I::Key) -> IntervalIter { - IntervalIter { - point_iter: self.points.range(key..), - elem_iter: None, - } - } - - /// Iterate over all items - pub fn iter(&self) -> IntervalIter { - IntervalIter { - point_iter: self.points.range(..), - elem_iter: None, - } - } - - pub fn insert(&mut self, item: Arc) { - let start_key = item.start_key(); - let end_key = item.end_key(); - assert!(start_key < end_key); - let bounds = start_key..end_key; - - // Find the starting point and walk forward from there - let mut found_start_point = false; - let iter = self.points.range_mut(bounds); - for (point_key, point) in iter { - if *point_key == start_key { - found_start_point = true; - // It is an error to insert the same item to the tree twice. - assert!( - !point.elements.iter().any(|x| Arc::ptr_eq(x, &item)), - "interval is already in the tree" - ); - } - point.elements.push(Arc::clone(&item)); - } - if !found_start_point { - // Create a new Point for the starting point - - // Look at the previous point, and copy over elements that overlap with this - // new point - let mut new_elements: Vec> = Vec::new(); - if let Some((_, prev_point)) = self.points.range(..start_key).next_back() { - let overlapping_prev_elements = prev_point - .elements - .iter() - .filter(|x| x.bounds().contains(&start_key)) - .cloned(); - - new_elements.extend(overlapping_prev_elements); - } - new_elements.push(item); - - let new_point = Point { - elements: new_elements, - }; - self.points.insert(start_key, new_point); - } - } - - pub fn remove(&mut self, item: &Arc) { - // range search points - let start_key = item.start_key(); - let end_key = item.end_key(); - let bounds = start_key..end_key; - - let mut points_to_remove: Vec = Vec::new(); - let mut found_start_point = false; - for (point_key, point) in self.points.range_mut(bounds) { - if *point_key == start_key { - found_start_point = true; - } - let len_before = point.elements.len(); - point.elements.retain(|other| !Arc::ptr_eq(other, item)); - let len_after = point.elements.len(); - assert_eq!(len_after + 1, len_before); - if len_after == 0 { - points_to_remove.push(*point_key); - } - } - assert!(found_start_point); - - for k in points_to_remove { - self.points.remove(&k).unwrap(); - } - } -} - -pub struct IntervalIter<'a, I: ?Sized> -where - I: IntervalItem, -{ - point_iter: std::collections::btree_map::Range<'a, I::Key, Point>, - elem_iter: Option<(I::Key, std::slice::Iter<'a, Arc>)>, -} - -impl<'a, I> Iterator for IntervalIter<'a, I> -where - I: IntervalItem + ?Sized, -{ - type Item = Arc; - - fn next(&mut self) -> Option { - // Iterate over all elements in all the points in 'point_iter'. To avoid - // returning the same element twice, we only return each element at its - // starting point. - loop { - // Return next remaining element from the current point - if let Some((point_key, elem_iter)) = &mut self.elem_iter { - for elem in elem_iter { - if elem.start_key() == *point_key { - return Some(Arc::clone(elem)); - } - } - } - // No more elements at this point. Move to next point. 
- if let Some((point_key, point)) = self.point_iter.next() { - self.elem_iter = Some((*point_key, point.elements.iter())); - continue; - } else { - // No more points, all done - return None; - } - } - } -} - -impl Default for IntervalTree -where - I: IntervalItem, -{ - fn default() -> Self { - IntervalTree { - points: BTreeMap::new(), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::fmt; - - #[derive(Debug)] - struct MockItem { - start_key: u32, - end_key: u32, - val: String, - } - impl IntervalItem for MockItem { - type Key = u32; - - fn start_key(&self) -> u32 { - self.start_key - } - fn end_key(&self) -> u32 { - self.end_key - } - } - impl MockItem { - fn new(start_key: u32, end_key: u32) -> Self { - MockItem { - start_key, - end_key, - val: format!("{}-{}", start_key, end_key), - } - } - fn new_str(start_key: u32, end_key: u32, val: &str) -> Self { - MockItem { - start_key, - end_key, - val: format!("{}-{}: {}", start_key, end_key, val), - } - } - } - impl fmt::Display for MockItem { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.val) - } - } - #[rustfmt::skip] - fn assert_search( - tree: &IntervalTree, - key: u32, - expected: &[&str], - ) -> Option> { - if let Some(v) = tree.search(key) { - let vstr = v.to_string(); - - assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v); - assert!( - expected.contains(&vstr.as_str()), - "search with {} returned {}, expected one of: {:?}", - key, v, expected, - ); - - Some(v) - } else { - assert!( - expected.is_empty(), - "search with {} returned None, expected one of {:?}", - key, expected - ); - None - } - } - - fn assert_contents(tree: &IntervalTree, expected: &[&str]) { - let mut contents: Vec = tree.iter().map(|e| e.to_string()).collect(); - contents.sort(); - assert_eq!(contents, expected); - } - - fn dump_tree(tree: &IntervalTree) { - for (point_key, point) in tree.points.iter() { - print!("{}:", point_key); - for e in point.elements.iter() { - print!(" {}", e); - } - println!(); - } - } - - #[test] - fn test_interval_tree_simple() { - let mut tree: IntervalTree = IntervalTree::default(); - - // Simple, non-overlapping ranges. 
- tree.insert(Arc::new(MockItem::new(10, 11))); - tree.insert(Arc::new(MockItem::new(11, 12))); - tree.insert(Arc::new(MockItem::new(12, 13))); - tree.insert(Arc::new(MockItem::new(18, 19))); - tree.insert(Arc::new(MockItem::new(17, 18))); - tree.insert(Arc::new(MockItem::new(15, 16))); - - assert_search(&tree, 9, &[]); - assert_search(&tree, 10, &["10-11"]); - assert_search(&tree, 11, &["11-12"]); - assert_search(&tree, 12, &["12-13"]); - assert_search(&tree, 13, &["12-13"]); - assert_search(&tree, 14, &["12-13"]); - assert_search(&tree, 15, &["15-16"]); - assert_search(&tree, 16, &["15-16"]); - assert_search(&tree, 17, &["17-18"]); - assert_search(&tree, 18, &["18-19"]); - assert_search(&tree, 19, &["18-19"]); - assert_search(&tree, 20, &["18-19"]); - - // remove a few entries and search around them again - tree.remove(&assert_search(&tree, 10, &["10-11"]).unwrap()); // first entry - tree.remove(&assert_search(&tree, 12, &["12-13"]).unwrap()); // entry in the middle - tree.remove(&assert_search(&tree, 18, &["18-19"]).unwrap()); // last entry - assert_search(&tree, 9, &[]); - assert_search(&tree, 10, &[]); - assert_search(&tree, 11, &["11-12"]); - assert_search(&tree, 12, &["11-12"]); - assert_search(&tree, 14, &["11-12"]); - assert_search(&tree, 15, &["15-16"]); - assert_search(&tree, 17, &["17-18"]); - assert_search(&tree, 18, &["17-18"]); - } - - #[test] - fn test_interval_tree_overlap() { - let mut tree: IntervalTree = IntervalTree::default(); - - // Overlapping items - tree.insert(Arc::new(MockItem::new(22, 24))); - tree.insert(Arc::new(MockItem::new(23, 25))); - let x24_26 = Arc::new(MockItem::new(24, 26)); - tree.insert(Arc::clone(&x24_26)); - let x26_28 = Arc::new(MockItem::new(26, 28)); - tree.insert(Arc::clone(&x26_28)); - tree.insert(Arc::new(MockItem::new(25, 27))); - - assert_search(&tree, 22, &["22-24"]); - assert_search(&tree, 23, &["22-24", "23-25"]); - assert_search(&tree, 24, &["23-25", "24-26"]); - assert_search(&tree, 25, &["24-26", "25-27"]); - assert_search(&tree, 26, &["25-27", "26-28"]); - assert_search(&tree, 27, &["26-28"]); - assert_search(&tree, 28, &["26-28"]); - assert_search(&tree, 29, &["26-28"]); - - tree.remove(&x24_26); - tree.remove(&x26_28); - assert_search(&tree, 23, &["22-24", "23-25"]); - assert_search(&tree, 24, &["23-25"]); - assert_search(&tree, 25, &["25-27"]); - assert_search(&tree, 26, &["25-27"]); - assert_search(&tree, 27, &["25-27"]); - assert_search(&tree, 28, &["25-27"]); - assert_search(&tree, 29, &["25-27"]); - } - - #[test] - fn test_interval_tree_nested() { - let mut tree: IntervalTree = IntervalTree::default(); - - // Items containing other items - tree.insert(Arc::new(MockItem::new(31, 39))); - tree.insert(Arc::new(MockItem::new(32, 34))); - tree.insert(Arc::new(MockItem::new(33, 35))); - tree.insert(Arc::new(MockItem::new(30, 40))); - - assert_search(&tree, 30, &["30-40"]); - assert_search(&tree, 31, &["30-40", "31-39"]); - assert_search(&tree, 32, &["30-40", "32-34", "31-39"]); - assert_search(&tree, 33, &["30-40", "32-34", "33-35", "31-39"]); - assert_search(&tree, 34, &["30-40", "33-35", "31-39"]); - assert_search(&tree, 35, &["30-40", "31-39"]); - assert_search(&tree, 36, &["30-40", "31-39"]); - assert_search(&tree, 37, &["30-40", "31-39"]); - assert_search(&tree, 38, &["30-40", "31-39"]); - assert_search(&tree, 39, &["30-40"]); - assert_search(&tree, 40, &["30-40"]); - assert_search(&tree, 41, &["30-40"]); - } - - #[test] - fn test_interval_tree_duplicates() { - let mut tree: IntervalTree = IntervalTree::default(); - - // 
Duplicate keys - let item_a = Arc::new(MockItem::new_str(55, 56, "a")); - tree.insert(Arc::clone(&item_a)); - let item_b = Arc::new(MockItem::new_str(55, 56, "b")); - tree.insert(Arc::clone(&item_b)); - let item_c = Arc::new(MockItem::new_str(55, 56, "c")); - tree.insert(Arc::clone(&item_c)); - let item_d = Arc::new(MockItem::new_str(54, 56, "d")); - tree.insert(Arc::clone(&item_d)); - let item_e = Arc::new(MockItem::new_str(55, 57, "e")); - tree.insert(Arc::clone(&item_e)); - - dump_tree(&tree); - - assert_search( - &tree, - 55, - &["55-56: a", "55-56: b", "55-56: c", "54-56: d", "55-57: e"], - ); - tree.remove(&item_b); - dump_tree(&tree); - - assert_contents(&tree, &["54-56: d", "55-56: a", "55-56: c", "55-57: e"]); - - tree.remove(&item_d); - dump_tree(&tree); - assert_contents(&tree, &["55-56: a", "55-56: c", "55-57: e"]); - } - - #[test] - #[should_panic] - fn test_interval_tree_insert_twice() { - let mut tree: IntervalTree = IntervalTree::default(); - - // Inserting the same item twice is not cool - let item = Arc::new(MockItem::new(1, 2)); - tree.insert(Arc::clone(&item)); - tree.insert(Arc::clone(&item)); // fails assertion - } -} diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index fe82fd491c..c4929a6173 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -1,32 +1,29 @@ //! -//! The layer map tracks what layers exist for all the relishes in a timeline. +//! The layer map tracks what layers exist in a timeline. //! //! When the timeline is first accessed, the server lists of all layer files //! in the timelines/ directory, and populates this map with -//! ImageLayer and DeltaLayer structs corresponding to each file. When new WAL -//! is received, we create InMemoryLayers to hold the incoming records. Now and -//! then, in the checkpoint() function, the in-memory layers are frozen, forming -//! new image and delta layers and corresponding files are written to disk. +//! ImageLayer and DeltaLayer structs corresponding to each file. When the first +//! new WAL record is received, we create an InMemoryLayer to hold the incoming +//! records. Now and then, in the checkpoint() function, the in-memory layer is +//! are frozen, and it is split up into new image and delta layers and the +//! corresponding files are written to disk. //! -use crate::layered_repository::interval_tree::{IntervalItem, IntervalIter, IntervalTree}; -use crate::layered_repository::storage_layer::{Layer, SegmentTag}; +use crate::layered_repository::storage_layer::Layer; +use crate::layered_repository::storage_layer::{range_eq, range_overlaps}; use crate::layered_repository::InMemoryLayer; -use crate::relish::*; +use crate::repository::Key; use anyhow::Result; use lazy_static::lazy_static; -use std::cmp::Ordering; -use std::collections::{BinaryHeap, HashMap}; +use std::collections::VecDeque; +use std::ops::Range; use std::sync::Arc; +use tracing::*; use zenith_metrics::{register_int_gauge, IntGauge}; use zenith_utils::lsn::Lsn; -use super::global_layer_map::{LayerId, GLOBAL_LAYER_MAP}; - lazy_static! { - static ref NUM_INMEMORY_LAYERS: IntGauge = - register_int_gauge!("pageserver_inmemory_layers", "Number of layers in memory") - .expect("failed to define a metric"); static ref NUM_ONDISK_LAYERS: IntGauge = register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk") .expect("failed to define a metric"); @@ -37,98 +34,147 @@ lazy_static! 
{ /// #[derive(Default)] pub struct LayerMap { - /// All the layers keyed by segment tag - segs: HashMap, + // + // 'open_layer' holds the current InMemoryLayer that is accepting new + // records. If it is None, 'next_open_layer_at' will be set instead, indicating + // where the start LSN of the next InMemoryLayer that is to be created. + // + pub open_layer: Option>, + pub next_open_layer_at: Option, - /// All in-memory layers, ordered by 'oldest_lsn' and generation - /// of each layer. This allows easy access to the in-memory layer that - /// contains the oldest WAL record. - open_layers: BinaryHeap, + /// + /// The frozen layer, if any, contains WAL older than the current 'open_layer' + /// or 'next_open_layer_at', but newer than any historic layer. The frozen + /// layer is during checkpointing, when an InMemoryLayer is being written out + /// to disk. + /// + pub frozen_layers: VecDeque>, - /// Generation number, used to distinguish newly inserted entries in the - /// binary heap from older entries during checkpoint. - current_generation: u64, + /// All the historic layers are kept here + + /// TODO: This is a placeholder implementation of a data structure + /// to hold information about all the layer files on disk and in + /// S3. Currently, it's just a vector and all operations perform a + /// linear scan over it. That obviously becomes slow as the + /// number of layers grows. I'm imagining that an R-tree or some + /// other 2D data structure would be the long-term solution here. + historic_layers: Vec>, +} + +/// Return value of LayerMap::search +pub struct SearchResult { + pub layer: Arc, + pub lsn_floor: Lsn, } impl LayerMap { /// - /// Look up a layer using the given segment tag and LSN. This differs from a - /// plain key-value lookup in that if there is any layer that covers the - /// given LSN, or precedes the given LSN, it is returned. In other words, - /// you don't need to know the exact start LSN of the layer. + /// Find the latest layer that covers the given 'key', with lsn < + /// 'end_lsn'. /// - pub fn get(&self, tag: &SegmentTag, lsn: Lsn) -> Option> { - let segentry = self.segs.get(tag)?; - - segentry.get(lsn) - } - + /// Returns the layer, if any, and an 'lsn_floor' value that + /// indicates which portion of the layer the caller should + /// check. 'lsn_floor' is normally the start-LSN of the layer, but + /// can be greater if there is an overlapping layer that might + /// contain the version, even if it's missing from the returned + /// layer. /// - /// Get the open layer for given segment for writing. Or None if no open - /// layer exists. 
- /// - pub fn get_open(&self, tag: &SegmentTag) -> Option> { - let segentry = self.segs.get(tag)?; + pub fn search(&self, key: Key, end_lsn: Lsn) -> Result> { + // linear search + // Find the latest image layer that covers the given key + let mut latest_img: Option> = None; + let mut latest_img_lsn: Option = None; + for l in self.historic_layers.iter() { + if l.is_incremental() { + continue; + } + if !l.get_key_range().contains(&key) { + continue; + } + let img_lsn = l.get_lsn_range().start; - segentry - .open_layer_id - .and_then(|layer_id| GLOBAL_LAYER_MAP.read().unwrap().get(&layer_id)) - } + if img_lsn >= end_lsn { + // too new + continue; + } + if Lsn(img_lsn.0 + 1) == end_lsn { + // found exact match + return Ok(Some(SearchResult { + layer: Arc::clone(l), + lsn_floor: img_lsn, + })); + } + if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) { + latest_img = Some(Arc::clone(l)); + latest_img_lsn = Some(img_lsn); + } + } - /// - /// Insert an open in-memory layer - /// - pub fn insert_open(&mut self, layer: Arc) { - let segentry = self.segs.entry(layer.get_seg_tag()).or_default(); - - let layer_id = segentry.update_open(Arc::clone(&layer)); - - let oldest_lsn = layer.get_oldest_lsn(); - - // After a crash and restart, 'oldest_lsn' of the oldest in-memory - // layer becomes the WAL streaming starting point, so it better not point - // in the middle of a WAL record. - assert!(oldest_lsn.is_aligned()); - - // Also add it to the binary heap - let open_layer_entry = OpenLayerEntry { - oldest_lsn: layer.get_oldest_lsn(), - layer_id, - generation: self.current_generation, - }; - self.open_layers.push(open_layer_entry); - - NUM_INMEMORY_LAYERS.inc(); - } - - /// Remove an open in-memory layer - pub fn remove_open(&mut self, layer_id: LayerId) { - // Note: we don't try to remove the entry from the binary heap. - // It will be removed lazily by peek_oldest_open() when it's made it to - // the top of the heap. - - let layer_opt = { - let mut global_map = GLOBAL_LAYER_MAP.write().unwrap(); - let layer_opt = global_map.get(&layer_id); - global_map.remove(&layer_id); - // TODO it's bad that a ref can still exist after being evicted from cache - layer_opt - }; - - if let Some(layer) = layer_opt { - let mut segentry = self.segs.get_mut(&layer.get_seg_tag()).unwrap(); - - if segentry.open_layer_id == Some(layer_id) { - // Also remove it from the SegEntry of this segment - segentry.open_layer_id = None; - } else { - // We could have already updated segentry.open for - // dropped (non-writeable) layer. This is fine. - assert!(!layer.is_writeable()); - assert!(layer.is_dropped()); + // Search the delta layers + let mut latest_delta: Option> = None; + for l in self.historic_layers.iter() { + if !l.is_incremental() { + continue; + } + if !l.get_key_range().contains(&key) { + continue; } - NUM_INMEMORY_LAYERS.dec(); + if l.get_lsn_range().start >= end_lsn { + // too new + continue; + } + + if l.get_lsn_range().end >= end_lsn { + // this layer contains the requested point in the key/lsn space. + // No need to search any further + trace!( + "found layer {} for request on {} at {}", + l.filename().display(), + key, + end_lsn + ); + latest_delta.replace(Arc::clone(l)); + break; + } + // this layer's end LSN is smaller than the requested point. If there's + // nothing newer, this is what we need to return. Remember this. 
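        // A sketch of how a caller is expected to drive this function, assumed
        // rather than taken from the timeline code: search for the newest layer
        // below 'end_lsn', ask it to fill in the reconstruct state over
        // lsn_floor..end_lsn, and if it reports Continue, repeat below lsn_floor.
        // The helper name is illustrative only.
        #[allow(dead_code)]
        fn get_reconstruct_data_sketch(
            map: &LayerMap,
            key: Key,
            mut end_lsn: Lsn,
            state: &mut crate::layered_repository::storage_layer::ValueReconstructState,
        ) -> Result<()> {
            use crate::layered_repository::storage_layer::ValueReconstructResult;
            while let Some(SearchResult { layer, lsn_floor }) = map.search(key, end_lsn)? {
                match layer.get_value_reconstruct_data(key, lsn_floor..end_lsn, state)? {
                    ValueReconstructResult::Complete => return Ok(()),
                    ValueReconstructResult::Continue => end_lsn = lsn_floor,
                    // treat any other outcome as an error in this sketch
                    _ => anyhow::bail!("could not reconstruct value for key {}", key),
                }
            }
            Ok(())
        }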
+ if let Some(ref old_candidate) = latest_delta { + if l.get_lsn_range().end > old_candidate.get_lsn_range().end { + latest_delta.replace(Arc::clone(l)); + } + } else { + latest_delta.replace(Arc::clone(l)); + } + } + if let Some(l) = latest_delta { + trace!( + "found (old) layer {} for request on {} at {}", + l.filename().display(), + key, + end_lsn + ); + let lsn_floor = std::cmp::max( + Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), + l.get_lsn_range().start, + ); + Ok(Some(SearchResult { + lsn_floor, + layer: l, + })) + } else if let Some(l) = latest_img { + trace!( + "found img layer and no deltas for request on {} at {}", + key, + end_lsn + ); + Ok(Some(SearchResult { + lsn_floor: latest_img_lsn.unwrap(), + layer: l, + })) + } else { + trace!("no layer found for request on {} at {}", key, end_lsn); + Ok(None) } } @@ -136,9 +182,7 @@ impl LayerMap { /// Insert an on-disk layer /// pub fn insert_historic(&mut self, layer: Arc) { - let segentry = self.segs.entry(layer.get_seg_tag()).or_default(); - segentry.insert_historic(layer); - + self.historic_layers.push(layer); NUM_ONDISK_LAYERS.inc(); } @@ -147,61 +191,62 @@ impl LayerMap { /// /// This should be called when the corresponding file on disk has been deleted. /// + #[allow(dead_code)] pub fn remove_historic(&mut self, layer: Arc) { - let tag = layer.get_seg_tag(); + let len_before = self.historic_layers.len(); - if let Some(segentry) = self.segs.get_mut(&tag) { - segentry.historic.remove(&layer); - } + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. + #[allow(clippy::vtable_address_comparisons)] + self.historic_layers + .retain(|other| !Arc::ptr_eq(other, &layer)); + + assert_eq!(self.historic_layers.len(), len_before - 1); NUM_ONDISK_LAYERS.dec(); } - // List relations along with a flag that marks if they exist at the given lsn. - // spcnode 0 and dbnode 0 have special meanings and mean all tabespaces/databases. - // Pass Tag if we're only interested in some relations. - pub fn list_relishes(&self, tag: Option, lsn: Lsn) -> Result> { - let mut rels: HashMap = HashMap::new(); - - for (seg, segentry) in self.segs.iter() { - match seg.rel { - RelishTag::Relation(reltag) => { - if let Some(request_rel) = tag { - if (request_rel.spcnode == 0 || reltag.spcnode == request_rel.spcnode) - && (request_rel.dbnode == 0 || reltag.dbnode == request_rel.dbnode) - { - if let Some(exists) = segentry.exists_at_lsn(lsn)? { - rels.insert(seg.rel, exists); - } - } - } - } - _ => { - if tag == None { - if let Some(exists) = segentry.exists_at_lsn(lsn)? { - rels.insert(seg.rel, exists); - } - } - } - } - } - Ok(rels) - } - /// Is there a newer image layer for given segment? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. 
/// We ignore segments newer than disk_consistent_lsn because they will be removed at restart + /// We also only look at historic layers + //#[allow(dead_code)] pub fn newer_image_layer_exists( &self, - seg: SegmentTag, + key_range: &Range, lsn: Lsn, disk_consistent_lsn: Lsn, - ) -> bool { - if let Some(segentry) = self.segs.get(&seg) { - segentry.newer_image_layer_exists(lsn, disk_consistent_lsn) - } else { - false + ) -> Result { + let mut range_remain = key_range.clone(); + + loop { + let mut made_progress = false; + for l in self.historic_layers.iter() { + if l.is_incremental() { + continue; + } + let img_lsn = l.get_lsn_range().start; + if !l.is_incremental() + && l.get_key_range().contains(&range_remain.start) + && img_lsn > lsn + && img_lsn < disk_consistent_lsn + { + made_progress = true; + let img_key_end = l.get_key_range().end; + + if img_key_end >= range_remain.end { + return Ok(true); + } + range_remain.start = img_key_end; + } + } + + if !made_progress { + return Ok(false); + } } } @@ -211,284 +256,148 @@ impl LayerMap { /// used for garbage collection, to determine if some alive layer /// exists at the lsn. If so, we shouldn't delete a newer dropped layer /// to avoid incorrectly making it visible. - pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result { - Ok(if let Some(segentry) = self.segs.get(&seg) { - segentry.exists_at_lsn(lsn)?.unwrap_or(false) - } else { - false - }) + /* + pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result { + Ok(if let Some(segentry) = self.historic_layers.get(&seg) { + segentry.exists_at_lsn(seg, lsn)?.unwrap_or(false) + } else { + false + }) + } + */ + + pub fn iter_historic_layers(&self) -> std::slice::Iter> { + self.historic_layers.iter() } - /// Return the oldest in-memory layer, along with its generation number. - pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc, u64)> { - let global_map = GLOBAL_LAYER_MAP.read().unwrap(); + /// Find the last image layer that covers 'key', ignoring any image layers + /// newer than 'lsn'. + fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { + let mut candidate_lsn = Lsn(0); + let mut candidate = None; + for l in self.historic_layers.iter() { + if l.is_incremental() { + continue; + } - while let Some(oldest_entry) = self.open_layers.peek() { - if let Some(layer) = global_map.get(&oldest_entry.layer_id) { - return Some((oldest_entry.layer_id, layer, oldest_entry.generation)); - } else { - self.open_layers.pop(); + if !l.get_key_range().contains(&key) { + continue; + } + + let this_lsn = l.get_lsn_range().start; + if this_lsn > lsn { + continue; + } + if this_lsn < candidate_lsn { + // our previous candidate was better + continue; + } + candidate_lsn = this_lsn; + candidate = Some(Arc::clone(l)); + } + + candidate + } + + /// + /// Divide the whole given range of keys into sub-ranges based on the latest + /// image layer that covers each range. (This is used when creating new + /// image layers) + /// + // FIXME: clippy complains that the result type is very complex. She's probably + // right... 
+ #[allow(clippy::type_complexity)] + pub fn image_coverage( + &self, + key_range: &Range, + lsn: Lsn, + ) -> Result, Option>)>> { + let mut points: Vec; + + points = vec![key_range.start]; + for l in self.historic_layers.iter() { + if l.get_lsn_range().start > lsn { + continue; + } + let range = l.get_key_range(); + if key_range.contains(&range.start) { + points.push(l.get_key_range().start); + } + if key_range.contains(&range.end) { + points.push(l.get_key_range().end); } } - None - } + points.push(key_range.end); - /// Increment the generation number used to stamp open in-memory layers. Layers - /// added with `insert_open` after this call will be associated with the new - /// generation. Returns the new generation number. - pub fn increment_generation(&mut self) -> u64 { - self.current_generation += 1; - self.current_generation - } + points.sort(); + points.dedup(); - pub fn iter_historic_layers(&self) -> HistoricLayerIter { - HistoricLayerIter { - seg_iter: self.segs.iter(), - iter: None, + // Ok, we now have a list of "interesting" points in the key space + + // For each range between the points, find the latest image + let mut start = *points.first().unwrap(); + let mut ranges = Vec::new(); + for end in points[1..].iter() { + let img = self.find_latest_image(start, lsn); + + ranges.push((start..*end, img)); + + start = *end; } + Ok(ranges) + } + + /// Count how many L1 delta layers there are that overlap with the + /// given key and LSN range. + pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { + let mut result = 0; + for l in self.historic_layers.iter() { + if !l.is_incremental() { + continue; + } + if !range_overlaps(&l.get_lsn_range(), lsn_range) { + continue; + } + if !range_overlaps(&l.get_key_range(), key_range) { + continue; + } + + // We ignore level0 delta layers. Unless the whole keyspace fits + // into one partition + if !range_eq(key_range, &(Key::MIN..Key::MAX)) + && range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX)) + { + continue; + } + + result += 1; + } + Ok(result) + } + + /// Return all L0 delta layers + pub fn get_level0_deltas(&self) -> Result>> { + let mut deltas = Vec::new(); + for l in self.historic_layers.iter() { + if !l.is_incremental() { + continue; + } + if l.get_key_range() != (Key::MIN..Key::MAX) { + continue; + } + deltas.push(Arc::clone(l)); + } + Ok(deltas) } /// debugging function to print out the contents of the layer map #[allow(unused)] pub fn dump(&self) -> Result<()> { println!("Begin dump LayerMap"); - for (seg, segentry) in self.segs.iter() { - if let Some(open) = &segentry.open_layer_id { - if let Some(layer) = GLOBAL_LAYER_MAP.read().unwrap().get(open) { - layer.dump()?; - } else { - println!("layer not found in global map"); - } - } - - for layer in segentry.historic.iter() { - layer.dump()?; - } + for layer in self.historic_layers.iter() { + layer.dump()?; } println!("End dump LayerMap"); Ok(()) } } - -impl IntervalItem for dyn Layer { - type Key = Lsn; - - fn start_key(&self) -> Lsn { - self.get_start_lsn() - } - fn end_key(&self) -> Lsn { - self.get_end_lsn() - } -} - -/// -/// Per-segment entry in the LayerMap::segs hash map. Holds all the layers -/// associated with the segment. -/// -/// The last layer that is open for writes is always an InMemoryLayer, -/// and is kept in a separate field, because there can be only one for -/// each segment. The older layers, stored on disk, are kept in an -/// IntervalTree. 
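Aside: the partitioning step in image_coverage above boils down to collecting candidate split points and pairing them up. A minimal, self-contained sketch of just that step, with integer keys standing in for Key and the layer set reduced to plain ranges (the names here are illustrative, not part of the patch):

use std::ops::Range;

/// Collect the boundaries of every layer range that falls inside `key_range`,
/// plus the boundaries of `key_range` itself, then sort and dedup. Each pair
/// of consecutive points becomes one sub-range that gets its own
/// "latest image" lookup.
fn split_points(key_range: &Range<u64>, layer_ranges: &[Range<u64>]) -> Vec<u64> {
    let mut points = vec![key_range.start];
    for r in layer_ranges {
        if key_range.contains(&r.start) {
            points.push(r.start);
        }
        if key_range.contains(&r.end) {
            points.push(r.end);
        }
    }
    points.push(key_range.end);
    points.sort_unstable();
    points.dedup();
    points
}

fn main() {
    let pts = split_points(&(0..100), &[(10..40), (30..70)]);
    // Sub-ranges to cover: 0..10, 10..30, 30..40, 40..70, 70..100.
    assert_eq!(pts, vec![0, 10, 30, 40, 70, 100]);
}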
-#[derive(Default)] -struct SegEntry { - open_layer_id: Option, - historic: IntervalTree, -} - -impl SegEntry { - /// Does the segment exist at given LSN? - /// Return None if object is not found in this SegEntry. - fn exists_at_lsn(&self, lsn: Lsn) -> Result> { - if let Some(layer) = self.get(lsn) { - Ok(Some(layer.get_seg_exists(lsn)?)) - } else { - Ok(None) - } - } - - pub fn get(&self, lsn: Lsn) -> Option> { - if let Some(open_layer_id) = &self.open_layer_id { - let open_layer = GLOBAL_LAYER_MAP.read().unwrap().get(open_layer_id)?; - if open_layer.get_start_lsn() <= lsn { - return Some(open_layer); - } - } - - self.historic.search(lsn) - } - - pub fn newer_image_layer_exists(&self, lsn: Lsn, disk_consistent_lsn: Lsn) -> bool { - // We only check on-disk layers, because - // in-memory layers are not durable - - // The end-LSN is exclusive, while disk_consistent_lsn is - // inclusive. For example, if disk_consistent_lsn is 100, it is - // OK for a delta layer to have end LSN 101, but if the end LSN - // is 102, then it might not have been fully flushed to disk - // before crash. - self.historic - .iter_newer(lsn) - .any(|layer| !layer.is_incremental() && layer.get_end_lsn() <= disk_consistent_lsn + 1) - } - - // Set new open layer for a SegEntry. - // It's ok to rewrite previous open layer, - // but only if it is not writeable anymore. - pub fn update_open(&mut self, layer: Arc) -> LayerId { - if let Some(prev_open_layer_id) = &self.open_layer_id { - if let Some(prev_open_layer) = GLOBAL_LAYER_MAP.read().unwrap().get(prev_open_layer_id) - { - assert!(!prev_open_layer.is_writeable()); - } - } - let open_layer_id = GLOBAL_LAYER_MAP.write().unwrap().insert(layer); - self.open_layer_id = Some(open_layer_id); - open_layer_id - } - - pub fn insert_historic(&mut self, layer: Arc) { - self.historic.insert(layer); - } -} - -/// Entry held in LayerMap::open_layers, with boilerplate comparison routines -/// to implement a min-heap ordered by 'oldest_lsn' and 'generation' -/// -/// The generation number associated with each entry can be used to distinguish -/// recently-added entries (i.e after last call to increment_generation()) from older -/// entries with the same 'oldest_lsn'. -struct OpenLayerEntry { - oldest_lsn: Lsn, // copy of layer.get_oldest_lsn() - generation: u64, - layer_id: LayerId, -} -impl Ord for OpenLayerEntry { - fn cmp(&self, other: &Self) -> Ordering { - // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here - // to get that. 
Entries with identical oldest_lsn are ordered by generation - other - .oldest_lsn - .cmp(&self.oldest_lsn) - .then_with(|| other.generation.cmp(&self.generation)) - } -} -impl PartialOrd for OpenLayerEntry { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} -impl PartialEq for OpenLayerEntry { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == Ordering::Equal - } -} -impl Eq for OpenLayerEntry {} - -/// Iterator returned by LayerMap::iter_historic_layers() -pub struct HistoricLayerIter<'a> { - seg_iter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>, - iter: Option>, -} - -impl<'a> Iterator for HistoricLayerIter<'a> { - type Item = Arc; - - fn next(&mut self) -> std::option::Option<::Item> { - loop { - if let Some(x) = &mut self.iter { - if let Some(x) = x.next() { - return Some(Arc::clone(&x)); - } - } - if let Some((_tag, segentry)) = self.seg_iter.next() { - self.iter = Some(segentry.historic.iter()); - continue; - } else { - return None; - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::config::PageServerConf; - use std::str::FromStr; - use zenith_utils::zid::{ZTenantId, ZTimelineId}; - - /// Arbitrary relation tag, for testing. - const TESTREL_A: RelishTag = RelishTag::Relation(RelTag { - spcnode: 0, - dbnode: 111, - relnode: 1000, - forknum: 0, - }); - - lazy_static! { - static ref DUMMY_TIMELINEID: ZTimelineId = - ZTimelineId::from_str("00000000000000000000000000000000").unwrap(); - static ref DUMMY_TENANTID: ZTenantId = - ZTenantId::from_str("00000000000000000000000000000000").unwrap(); - } - - /// Construct a dummy InMemoryLayer for testing - fn dummy_inmem_layer( - conf: &'static PageServerConf, - segno: u32, - start_lsn: Lsn, - oldest_lsn: Lsn, - ) -> Arc { - Arc::new( - InMemoryLayer::create( - conf, - *DUMMY_TIMELINEID, - *DUMMY_TENANTID, - SegmentTag { - rel: TESTREL_A, - segno, - }, - start_lsn, - oldest_lsn, - ) - .unwrap(), - ) - } - - #[test] - fn test_open_layers() -> Result<()> { - let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_inmem_layer")); - let conf = Box::leak(Box::new(conf)); - std::fs::create_dir_all(conf.timeline_path(&DUMMY_TIMELINEID, &DUMMY_TENANTID))?; - - let mut layers = LayerMap::default(); - - let gen1 = layers.increment_generation(); - layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(0x100), Lsn(0x100))); - layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(0x100), Lsn(0x200))); - layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(0x100), Lsn(0x120))); - layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(0x100), Lsn(0x110))); - - let gen2 = layers.increment_generation(); - layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(0x100), Lsn(0x110))); - layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(0x100), Lsn(0x100))); - - // A helper function (closure) to pop the next oldest open entry from the layer map, - // and assert that it is what we'd expect - let mut assert_pop_layer = |expected_segno: u32, expected_generation: u64| { - let (layer_id, l, generation) = layers.peek_oldest_open().unwrap(); - assert!(l.get_seg_tag().segno == expected_segno); - assert!(generation == expected_generation); - layers.remove_open(layer_id); - }; - - assert_pop_layer(0, gen1); // 0x100 - assert_pop_layer(5, gen2); // 0x100 - assert_pop_layer(3, gen1); // 0x110 - assert_pop_layer(4, gen2); // 0x110 - assert_pop_layer(2, gen1); // 0x120 - assert_pop_layer(1, gen1); // 0x200 - - Ok(()) - } -} diff --git a/pageserver/src/layered_repository/metadata.rs 
b/pageserver/src/layered_repository/metadata.rs index 17e0485093..7daf899ba2 100644 --- a/pageserver/src/layered_repository/metadata.rs +++ b/pageserver/src/layered_repository/metadata.rs @@ -6,9 +6,10 @@ //! //! The module contains all structs and related helper methods related to timeline metadata. -use std::{convert::TryInto, path::PathBuf}; +use std::path::PathBuf; use anyhow::ensure; +use serde::{Deserialize, Serialize}; use zenith_utils::{ bin_ser::BeSer, lsn::Lsn, @@ -16,11 +17,13 @@ use zenith_utils::{ }; use crate::config::PageServerConf; +use crate::STORAGE_FORMAT_VERSION; -// Taken from PG_CONTROL_MAX_SAFE_SIZE -const METADATA_MAX_SAFE_SIZE: usize = 512; -const METADATA_CHECKSUM_SIZE: usize = std::mem::size_of::(); -const METADATA_MAX_DATA_SIZE: usize = METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE; +/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic. +/// +/// This is the same assumption that PostgreSQL makes with the control file, +/// see PG_CONTROL_MAX_SAFE_SIZE +const METADATA_MAX_SIZE: usize = 512; /// The name of the metadata file pageserver creates per timeline. pub const METADATA_FILE_NAME: &str = "metadata"; @@ -30,6 +33,20 @@ pub const METADATA_FILE_NAME: &str = "metadata"; /// The fields correspond to the values we hold in memory, in LayeredTimeline. #[derive(Debug, Clone, PartialEq, Eq)] pub struct TimelineMetadata { + hdr: TimelineMetadataHeader, + body: TimelineMetadataBody, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +struct TimelineMetadataHeader { + checksum: u32, // CRC of serialized metadata body + size: u16, // size of serialized metadata + format_version: u16, // storage format version (used for compatibility checks) +} +const METADATA_HDR_SIZE: usize = std::mem::size_of::(); + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +struct TimelineMetadataBody { disk_consistent_lsn: Lsn, // This is only set if we know it. 
We track it in memory when the page // server is running, but we only track the value corresponding to @@ -69,130 +86,90 @@ impl TimelineMetadata { initdb_lsn: Lsn, ) -> Self { Self { - disk_consistent_lsn, - prev_record_lsn, - ancestor_timeline, - ancestor_lsn, - latest_gc_cutoff_lsn, - initdb_lsn, + hdr: TimelineMetadataHeader { + checksum: 0, + size: 0, + format_version: STORAGE_FORMAT_VERSION, + }, + body: TimelineMetadataBody { + disk_consistent_lsn, + prev_record_lsn, + ancestor_timeline, + ancestor_lsn, + latest_gc_cutoff_lsn, + initdb_lsn, + }, } } pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result { ensure!( - metadata_bytes.len() == METADATA_MAX_SAFE_SIZE, + metadata_bytes.len() == METADATA_MAX_SIZE, "metadata bytes size is wrong" ); - - let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE]; - let calculated_checksum = crc32c::crc32c(data); - - let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] = - metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?; - let expected_checksum = u32::from_le_bytes(*checksum_bytes); + let hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; ensure!( - calculated_checksum == expected_checksum, + hdr.format_version == STORAGE_FORMAT_VERSION, + "format version mismatch" + ); + let metadata_size = hdr.size as usize; + ensure!( + metadata_size <= METADATA_MAX_SIZE, + "corrupted metadata file" + ); + let calculated_checksum = crc32c::crc32c(&metadata_bytes[METADATA_HDR_SIZE..metadata_size]); + ensure!( + hdr.checksum == calculated_checksum, "metadata checksum mismatch" ); + let body = TimelineMetadataBody::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; + ensure!( + body.disk_consistent_lsn.is_aligned(), + "disk_consistent_lsn is not aligned" + ); - let data = TimelineMetadata::from(serialize::DeTimelineMetadata::des_prefix(data)?); - ensure!(data.disk_consistent_lsn.is_aligned()); - - Ok(data) + Ok(TimelineMetadata { hdr, body }) } pub fn to_bytes(&self) -> anyhow::Result> { - let serializeable_metadata = serialize::SeTimelineMetadata::from(self); - let mut metadata_bytes = serialize::SeTimelineMetadata::ser(&serializeable_metadata)?; - ensure!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE); - metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8); - - let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]); - metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum)); + let body_bytes = self.body.ser()?; + let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); + let hdr = TimelineMetadataHeader { + size: metadata_size as u16, + format_version: STORAGE_FORMAT_VERSION, + checksum: crc32c::crc32c(&body_bytes), + }; + let hdr_bytes = hdr.ser()?; + let mut metadata_bytes = vec![0u8; METADATA_MAX_SIZE]; + metadata_bytes[0..METADATA_HDR_SIZE].copy_from_slice(&hdr_bytes); + metadata_bytes[METADATA_HDR_SIZE..metadata_size].copy_from_slice(&body_bytes); Ok(metadata_bytes) } /// [`Lsn`] that corresponds to the corresponding timeline directory /// contents, stored locally in the pageserver workdir. 
pub fn disk_consistent_lsn(&self) -> Lsn { - self.disk_consistent_lsn + self.body.disk_consistent_lsn } pub fn prev_record_lsn(&self) -> Option { - self.prev_record_lsn + self.body.prev_record_lsn } pub fn ancestor_timeline(&self) -> Option { - self.ancestor_timeline + self.body.ancestor_timeline } pub fn ancestor_lsn(&self) -> Lsn { - self.ancestor_lsn + self.body.ancestor_lsn } pub fn latest_gc_cutoff_lsn(&self) -> Lsn { - self.latest_gc_cutoff_lsn + self.body.latest_gc_cutoff_lsn } pub fn initdb_lsn(&self) -> Lsn { - self.initdb_lsn - } -} - -/// This module is for direct conversion of metadata to bytes and back. -/// For a certain metadata, besides the conversion a few verification steps has to -/// be done, so all serde derives are hidden from the user, to avoid accidental -/// verification-less metadata creation. -mod serialize { - use serde::{Deserialize, Serialize}; - use zenith_utils::{lsn::Lsn, zid::ZTimelineId}; - - use super::TimelineMetadata; - - #[derive(Serialize)] - pub(super) struct SeTimelineMetadata<'a> { - disk_consistent_lsn: &'a Lsn, - prev_record_lsn: &'a Option, - ancestor_timeline: &'a Option, - ancestor_lsn: &'a Lsn, - latest_gc_cutoff_lsn: &'a Lsn, - initdb_lsn: &'a Lsn, - } - - impl<'a> From<&'a TimelineMetadata> for SeTimelineMetadata<'a> { - fn from(other: &'a TimelineMetadata) -> Self { - Self { - disk_consistent_lsn: &other.disk_consistent_lsn, - prev_record_lsn: &other.prev_record_lsn, - ancestor_timeline: &other.ancestor_timeline, - ancestor_lsn: &other.ancestor_lsn, - latest_gc_cutoff_lsn: &other.latest_gc_cutoff_lsn, - initdb_lsn: &other.initdb_lsn, - } - } - } - - #[derive(Deserialize)] - pub(super) struct DeTimelineMetadata { - disk_consistent_lsn: Lsn, - prev_record_lsn: Option, - ancestor_timeline: Option, - ancestor_lsn: Lsn, - latest_gc_cutoff_lsn: Lsn, - initdb_lsn: Lsn, - } - - impl From for TimelineMetadata { - fn from(other: DeTimelineMetadata) -> Self { - Self { - disk_consistent_lsn: other.disk_consistent_lsn, - prev_record_lsn: other.prev_record_lsn, - ancestor_timeline: other.ancestor_timeline, - ancestor_lsn: other.ancestor_lsn, - latest_gc_cutoff_lsn: other.latest_gc_cutoff_lsn, - initdb_lsn: other.initdb_lsn, - } - } + self.body.initdb_lsn } } @@ -204,14 +181,14 @@ mod tests { #[test] fn metadata_serializes_correctly() { - let original_metadata = TimelineMetadata { - disk_consistent_lsn: Lsn(0x200), - prev_record_lsn: Some(Lsn(0x100)), - ancestor_timeline: Some(TIMELINE_ID), - ancestor_lsn: Lsn(0), - latest_gc_cutoff_lsn: Lsn(0), - initdb_lsn: Lsn(0), - }; + let original_metadata = TimelineMetadata::new( + Lsn(0x200), + Some(Lsn(0x100)), + Some(TIMELINE_ID), + Lsn(0), + Lsn(0), + Lsn(0), + ); let metadata_bytes = original_metadata .to_bytes() @@ -221,7 +198,7 @@ mod tests { .expect("Should deserialize its own bytes"); assert_eq!( - deserialized_metadata, original_metadata, + deserialized_metadata.body, original_metadata.body, "Metadata that was serialized to bytes and deserialized back should not change" ); } diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index 8976491fc0..de34545980 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -2,139 +2,102 @@ //! Common traits and structs for layers //! 
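Aside: the TimelineMetadata::to_bytes/from_bytes changes above frame the metadata file as an 8-byte header (CRC32C of the body, body size, format version) followed by the serialized body, zero-padded to the 512-byte atomic-write limit. A self-contained sketch of that framing, with hand-rolled big-endian packing standing in for the BeSer helpers and the crc32c crate (which the patch already calls) for the checksum:

use std::convert::TryInto;

const METADATA_MAX_SIZE: usize = 512;
const METADATA_HDR_SIZE: usize = 4 + 2 + 2; // checksum (u32) + size (u16) + format_version (u16)
const STORAGE_FORMAT_VERSION: u16 = 1;

/// Pack a serialized body into a fixed 512-byte buffer with the header in front.
fn pack(body: &[u8]) -> Vec<u8> {
    assert!(METADATA_HDR_SIZE + body.len() <= METADATA_MAX_SIZE);
    let size = (METADATA_HDR_SIZE + body.len()) as u16;
    let checksum = crc32c::crc32c(body);
    let mut buf = vec![0u8; METADATA_MAX_SIZE];
    buf[0..4].copy_from_slice(&checksum.to_be_bytes());
    buf[4..6].copy_from_slice(&size.to_be_bytes());
    buf[6..8].copy_from_slice(&STORAGE_FORMAT_VERSION.to_be_bytes());
    buf[METADATA_HDR_SIZE..size as usize].copy_from_slice(body);
    buf
}

/// Validate the header and return the body, mirroring the checks in from_bytes.
fn unpack(buf: &[u8]) -> Result<Vec<u8>, String> {
    if buf.len() != METADATA_MAX_SIZE {
        return Err("metadata bytes size is wrong".into());
    }
    let checksum = u32::from_be_bytes(buf[0..4].try_into().unwrap());
    let size = u16::from_be_bytes(buf[4..6].try_into().unwrap()) as usize;
    let version = u16::from_be_bytes(buf[6..8].try_into().unwrap());
    if version != STORAGE_FORMAT_VERSION {
        return Err("format version mismatch".into());
    }
    if size > METADATA_MAX_SIZE || size < METADATA_HDR_SIZE {
        return Err("corrupted metadata file".into());
    }
    let body = &buf[METADATA_HDR_SIZE..size];
    if crc32c::crc32c(body) != checksum {
        return Err("metadata checksum mismatch".into());
    }
    Ok(body.to_vec())
}

fn main() {
    let packed = pack(b"dummy body");
    assert_eq!(unpack(&packed).unwrap(), b"dummy body".to_vec());
}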
-use crate::relish::RelishTag; -use crate::repository::{BlockNumber, ZenithWalRecord}; +use crate::repository::{Key, Value}; +use crate::walrecord::ZenithWalRecord; use crate::{ZTenantId, ZTimelineId}; use anyhow::Result; use bytes::Bytes; use serde::{Deserialize, Serialize}; -use std::fmt; +use std::ops::Range; use std::path::PathBuf; use zenith_utils::lsn::Lsn; -// Size of one segment in pages (10 MB) -pub const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192; - -/// -/// Each relish stored in the repository is divided into fixed-sized "segments", -/// with 10 MB of key-space, or 1280 8k pages each. -/// -#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)] -pub struct SegmentTag { - pub rel: RelishTag, - pub segno: u32, -} - -/// SegmentBlk represents a block number within a segment, or the size of segment. -/// -/// This is separate from BlockNumber, which is used for block number within the -/// whole relish. Since this is just a type alias, the compiler will let you mix -/// them freely, but we use the type alias as documentation to make it clear -/// which one we're dealing with. -/// -/// (We could turn this into "struct SegmentBlk(u32)" to forbid accidentally -/// assigning a BlockNumber to SegmentBlk or vice versa, but that makes -/// operations more verbose). -pub type SegmentBlk = u32; - -impl fmt::Display for SegmentTag { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}.{}", self.rel, self.segno) +pub fn range_overlaps(a: &Range, b: &Range) -> bool +where + T: PartialOrd, +{ + if a.start < b.start { + a.end > b.start + } else { + b.end > a.start } } -impl SegmentTag { - /// Given a relish and block number, calculate the corresponding segment and - /// block number within the segment. - pub const fn from_blknum(rel: RelishTag, blknum: BlockNumber) -> (SegmentTag, SegmentBlk) { - ( - SegmentTag { - rel, - segno: blknum / RELISH_SEG_SIZE, - }, - blknum % RELISH_SEG_SIZE, - ) - } +pub fn range_eq(a: &Range, b: &Range) -> bool +where + T: PartialEq, +{ + a.start == b.start && a.end == b.end } +/// Struct used to communicate across calls to 'get_value_reconstruct_data'. /// -/// Represents a version of a page at a specific LSN. The LSN is the key of the -/// entry in the 'page_versions' hash, it is not duplicated here. +/// Before first call, you can fill in 'page_img' if you have an older cached +/// version of the page available. That can save work in +/// 'get_value_reconstruct_data', as it can stop searching for page versions +/// when all the WAL records going back to the cached image have been collected. /// -/// A page version can be stored as a full page image, or as WAL record that needs -/// to be applied over the previous page version to reconstruct this version. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub enum PageVersion { - Page(Bytes), - Wal(ZenithWalRecord), -} - -/// -/// Struct used to communicate across calls to 'get_page_reconstruct_data'. -/// -/// Before first call to get_page_reconstruct_data, you can fill in 'page_img' -/// if you have an older cached version of the page available. That can save -/// work in 'get_page_reconstruct_data', as it can stop searching for page -/// versions when all the WAL records going back to the cached image have been -/// collected. 
-/// -/// When get_page_reconstruct_data returns Complete, 'page_img' is set to an -/// image of the page, or the oldest WAL record in 'records' is a will_init-type +/// When get_value_reconstruct_data returns Complete, 'img' is set to an image +/// of the page, or the oldest WAL record in 'records' is a will_init-type /// record that initializes the page without requiring a previous image. /// /// If 'get_page_reconstruct_data' returns Continue, some 'records' may have /// been collected, but there are more records outside the current layer. Pass -/// the same PageReconstructData struct in the next 'get_page_reconstruct_data' +/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data' /// call, to collect more records. /// -pub struct PageReconstructData { +#[derive(Debug)] +pub struct ValueReconstructState { pub records: Vec<(Lsn, ZenithWalRecord)>, - pub page_img: Option<(Lsn, Bytes)>, + pub img: Option<(Lsn, Bytes)>, } /// Return value from Layer::get_page_reconstruct_data -pub enum PageReconstructResult { +#[derive(Clone, Copy, Debug)] +pub enum ValueReconstructResult { /// Got all the data needed to reconstruct the requested page Complete, /// This layer didn't contain all the required data, the caller should look up /// the predecessor layer at the returned LSN and collect more data from there. - Continue(Lsn), + Continue, + /// This layer didn't contain data needed to reconstruct the page version at /// the returned LSN. This is usually considered an error, but might be OK /// in some circumstances. - Missing(Lsn), + Missing, } +/// A Layer contains all data in a "rectangle" consisting of a range of keys and +/// range of LSNs. /// -/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs. /// There are two kinds of layers, in-memory and on-disk layers. In-memory -/// layers are used to ingest incoming WAL, and provide fast access -/// to the recent page versions. On-disk layers are stored as files on disk, and -/// are immutable. This trait presents the common functionality of -/// in-memory and on-disk layers. +/// layers are used to ingest incoming WAL, and provide fast access to the +/// recent page versions. On-disk layers are stored as files on disk, and are +/// immutable. This trait presents the common functionality of in-memory and +/// on-disk layers. +/// +/// Furthermore, there are two kinds of on-disk layers: delta and image layers. +/// A delta layer contains all modifications within a range of LSNs and keys. +/// An image layer is a snapshot of all the data in a key-range, at a single +/// LSN /// pub trait Layer: Send + Sync { fn get_tenant_id(&self) -> ZTenantId; - /// Identify the timeline this relish belongs to + /// Identify the timeline this layer belongs to fn get_timeline_id(&self) -> ZTimelineId; - /// Identify the relish segment - fn get_seg_tag(&self) -> SegmentTag; + /// Range of segments that this layer covers + fn get_key_range(&self) -> Range; /// Inclusive start bound of the LSN range that this layer holds - fn get_start_lsn(&self) -> Lsn; - /// Exclusive end bound of the LSN range that this layer holds. /// /// - For an open in-memory layer, this is MAX_LSN. /// - For a frozen in-memory layer or a delta layer, this is a valid end bound. /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1 - fn get_end_lsn(&self) -> Lsn; - - /// Is the segment represented by this layer dropped by PostgreSQL? 
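Aside: the comments above describe a protocol rather than a single call. The caller keeps one ValueReconstructState and walks layers from newest to oldest, stopping as soon as some layer reports Complete. A toy model of that driver loop; the MockLayer type and the string records are purely illustrative stand-ins for the real layer objects:

#[allow(dead_code)]
#[derive(Debug)]
enum Outcome {
    Complete,
    Continue,
    Missing,
}

struct ReconstructState {
    records: Vec<String>, // stand-in for (Lsn, ZenithWalRecord)
    img: Option<String>,  // stand-in for (Lsn, Bytes)
}

/// A mock "layer": contributes some records and reports whether the search can stop here.
struct MockLayer {
    records: Vec<String>,
    has_image: bool,
}

impl MockLayer {
    fn get_value_reconstruct_data(&self, state: &mut ReconstructState) -> Outcome {
        state.records.extend(self.records.iter().cloned());
        if self.has_image {
            state.img = Some("base image".to_string());
            Outcome::Complete
        } else {
            Outcome::Continue
        }
    }
}

fn main() {
    // Newest layer first; the oldest one holds a full image.
    let layers = vec![
        MockLayer { records: vec!["delta@0x30".into()], has_image: false },
        MockLayer { records: vec!["delta@0x20".into()], has_image: false },
        MockLayer { records: vec![], has_image: true },
    ];
    let mut state = ReconstructState { records: Vec::new(), img: None };
    for layer in &layers {
        match layer.get_value_reconstruct_data(&mut state) {
            Outcome::Complete => break,
            Outcome::Continue => continue,
            Outcome::Missing => panic!("no base image found"),
        }
    }
    assert!(state.img.is_some());
    assert_eq!(state.records.len(), 2);
}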
- fn is_dropped(&self) -> bool; + fn get_lsn_range(&self) -> Range; /// Filename used to store this layer on disk. (Even in-memory layers /// implement this, to print a handy unique identifier for the layer for @@ -153,18 +116,12 @@ pub trait Layer: Send + Sync { /// is available. If this returns PageReconstructResult::Continue, look up /// the predecessor layer and call again with the same 'reconstruct_data' to /// collect more data. - fn get_page_reconstruct_data( + fn get_value_reconstruct_data( &self, - blknum: SegmentBlk, - lsn: Lsn, - reconstruct_data: &mut PageReconstructData, - ) -> Result; - - /// Return size of the segment at given LSN. (Only for blocky relations.) - fn get_seg_size(&self, lsn: Lsn) -> Result; - - /// Does the segment exist at given LSN? Or was it dropped before it. - fn get_seg_exists(&self, lsn: Lsn) -> Result; + key: Key, + lsn_range: Range, + reconstruct_data: &mut ValueReconstructState, + ) -> Result; /// Does this layer only contain some data for the segment (incremental), /// or does it contain a version of every page? This is important to know @@ -175,6 +132,9 @@ pub trait Layer: Send + Sync { /// Returns true for layers that are represented in memory. fn is_in_memory(&self) -> bool; + /// Iterate through all keys and values stored in the layer + fn iter(&self) -> Box> + '_>; + /// Release memory used by this layer. There is no corresponding 'load' /// function, that's done implicitly when you call one of the get-functions. fn unload(&self) -> Result<()>; @@ -185,3 +145,36 @@ pub trait Layer: Send + Sync { /// Dump summary of the contents of the layer to stdout fn dump(&self) -> Result<()>; } + +// Flag indicating that this version initialize the page +const WILL_INIT: u64 = 1; + +/// +/// Struct representing reference to BLOB in layers. Reference contains BLOB offset and size. +/// For WAL records (delta layer) it also contains `will_init` flag which helps to determine range of records +/// which needs to be applied without reading/deserializing records themselves. +/// +#[derive(Debug, Serialize, Deserialize, Copy, Clone)] +pub struct BlobRef(u64); + +impl BlobRef { + pub fn will_init(&self) -> bool { + (self.0 & WILL_INIT) != 0 + } + + pub fn pos(&self) -> u64 { + self.0 >> 32 + } + + pub fn size(&self) -> usize { + ((self.0 & 0xFFFFFFFF) >> 1) as usize + } + + pub fn new(pos: u64, size: usize, will_init: bool) -> BlobRef { + let mut blob_ref = (pos << 32) | ((size as u64) << 1); + if will_init { + blob_ref |= WILL_INIT; + } + BlobRef(blob_ref) + } +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 060fa54b23..4790ab6652 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -2,10 +2,12 @@ pub mod basebackup; pub mod config; pub mod http; pub mod import_datadir; +pub mod keyspace; pub mod layered_repository; pub mod page_cache; pub mod page_service; -pub mod relish; +pub mod pgdatadir_mapping; +pub mod reltag; pub mod remote_storage; pub mod repository; pub mod tenant_mgr; @@ -28,6 +30,20 @@ use zenith_utils::{ use crate::thread_mgr::ThreadKind; +use layered_repository::LayeredRepository; +use pgdatadir_mapping::DatadirTimeline; + +/// Current storage format version +/// +/// This is embedded in the metadata file, and also in the header of all the +/// layer files. If you make any backwards-incompatible changes to the storage +/// format, bump this! 
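Aside: the BlobRef encoding above packs three things into one u64: the byte offset in the upper 32 bits, the length in bits 1..31, and the will_init flag in bit 0. A quick round-trip check of that packing, as a standalone copy of the same bit arithmetic:

const WILL_INIT: u64 = 1;

#[derive(Debug, Copy, Clone)]
struct BlobRef(u64);

impl BlobRef {
    fn new(pos: u64, size: usize, will_init: bool) -> BlobRef {
        let mut blob_ref = (pos << 32) | ((size as u64) << 1);
        if will_init {
            blob_ref |= WILL_INIT;
        }
        BlobRef(blob_ref)
    }
    fn pos(&self) -> u64 {
        self.0 >> 32
    }
    fn size(&self) -> usize {
        ((self.0 & 0xFFFFFFFF) >> 1) as usize
    }
    fn will_init(&self) -> bool {
        (self.0 & WILL_INIT) != 0
    }
}

fn main() {
    // Offsets must fit in 32 bits and sizes in 31 bits for the packing to be lossless.
    let r = BlobRef::new(0x1234_5678, 8192, true);
    assert_eq!(r.pos(), 0x1234_5678);
    assert_eq!(r.size(), 8192);
    assert!(r.will_init());
}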
+pub const STORAGE_FORMAT_VERSION: u16 = 1; + +// Magic constants used to identify different kinds of files +pub const IMAGE_FILE_MAGIC: u32 = 0x5A60_0000 | STORAGE_FORMAT_VERSION as u32; +pub const DELTA_FILE_MAGIC: u32 = 0x5A61_0000 | STORAGE_FORMAT_VERSION as u32; + lazy_static! { static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!( "pageserver_live_connections_count", @@ -42,14 +58,16 @@ pub const LOG_FILE_NAME: &str = "pageserver.log"; /// Config for the Repository checkpointer #[derive(Debug, Clone, Copy)] pub enum CheckpointConfig { - // Flush in-memory data that is older than this - Distance(u64), // Flush all in-memory data Flush, // Flush all in-memory data and reconstruct all page images Forced, } +pub type RepositoryImpl = LayeredRepository; + +pub type DatadirTimelineImpl = DatadirTimeline; + pub fn shutdown_pageserver() { // Shut down the libpq endpoint thread. This prevents new connections from // being accepted. diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index ef802ba0e2..299575f792 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -53,7 +53,7 @@ use zenith_utils::{ }; use crate::layered_repository::writeback_ephemeral_file; -use crate::relish::RelTag; +use crate::repository::Key; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 10; @@ -105,8 +105,7 @@ enum CacheKey { struct MaterializedPageHashKey { tenant_id: ZTenantId, timeline_id: ZTimelineId, - rel_tag: RelTag, - blknum: u32, + key: Key, } #[derive(Clone)] @@ -291,16 +290,14 @@ impl PageCache { &self, tenant_id: ZTenantId, timeline_id: ZTimelineId, - rel_tag: RelTag, - blknum: u32, + key: &Key, lsn: Lsn, ) -> Option<(Lsn, PageReadGuard)> { let mut cache_key = CacheKey::MaterializedPage { hash_key: MaterializedPageHashKey { tenant_id, timeline_id, - rel_tag, - blknum, + key: *key, }, lsn, }; @@ -323,8 +320,7 @@ impl PageCache { &self, tenant_id: ZTenantId, timeline_id: ZTimelineId, - rel_tag: RelTag, - blknum: u32, + key: Key, lsn: Lsn, img: &[u8], ) { @@ -332,8 +328,7 @@ impl PageCache { hash_key: MaterializedPageHashKey { tenant_id, timeline_id, - rel_tag, - blknum, + key, }, lsn, }; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 4744f0fe52..43e1ec275d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -32,7 +32,9 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId}; use crate::basebackup; use crate::config::PageServerConf; -use crate::relish::*; +use crate::pgdatadir_mapping::DatadirTimeline; +use crate::reltag::RelTag; +use crate::repository::Repository; use crate::repository::Timeline; use crate::tenant_mgr; use crate::thread_mgr; @@ -398,8 +400,8 @@ impl PageServerHandler { /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. - fn wait_or_get_last_lsn( - timeline: &dyn Timeline, + fn wait_or_get_last_lsn( + timeline: &DatadirTimeline, mut lsn: Lsn, latest: bool, latest_gc_cutoff_lsn: &RwLockReadGuard, @@ -426,7 +428,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn)?; + timeline.tline.wait_lsn(lsn)?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. 
(Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -436,7 +438,7 @@ impl PageServerHandler { if lsn == Lsn(0) { bail!("invalid LSN(0) in request"); } - timeline.wait_lsn(lsn)?; + timeline.tline.wait_lsn(lsn)?; } ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -446,54 +448,47 @@ impl PageServerHandler { Ok(lsn) } - fn handle_get_rel_exists_request( + fn handle_get_rel_exists_request( &self, - timeline: &dyn Timeline, + timeline: &DatadirTimeline, req: &PagestreamExistsRequest, ) -> Result { let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); - let tag = RelishTag::Relation(req.rel); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let exists = timeline.get_rel_exists(tag, lsn)?; + let exists = timeline.get_rel_exists(req.rel, lsn)?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, })) } - fn handle_get_nblocks_request( + fn handle_get_nblocks_request( &self, - timeline: &dyn Timeline, + timeline: &DatadirTimeline, req: &PagestreamNblocksRequest, ) -> Result { let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered(); - let tag = RelishTag::Relation(req.rel); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let n_blocks = timeline.get_relish_size(tag, lsn)?; - - // Return 0 if relation is not found. - // This is what postgres smgr expects. - let n_blocks = n_blocks.unwrap_or(0); + let n_blocks = timeline.get_rel_size(req.rel, lsn)?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, })) } - fn handle_get_page_at_lsn_request( + fn handle_get_page_at_lsn_request( &self, - timeline: &dyn Timeline, + timeline: &DatadirTimeline, req: &PagestreamGetPageRequest, ) -> Result { let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) .entered(); - let tag = RelishTag::Relation(req.rel); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; /* // Add a 1s delay to some requests. 
The delayed causes the requests to @@ -503,7 +498,7 @@ impl PageServerHandler { std::thread::sleep(std::time::Duration::from_millis(1000)); } */ - let page = timeline.get_page_at_lsn(tag, req.blkno, lsn)?; + let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn)?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page, @@ -523,7 +518,7 @@ impl PageServerHandler { // check that the timeline exists let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid) .context("Cannot load local timeline")?; - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) @@ -701,67 +696,19 @@ impl postgres_backend::Handler for PageServerHandler { let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?; pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::int8_col(b"layer_relfiles_total"), - RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"), - RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"), - RowDescriptor::int8_col(b"layer_relfiles_not_updated"), - RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"), - RowDescriptor::int8_col(b"layer_relfiles_removed"), - RowDescriptor::int8_col(b"layer_relfiles_dropped"), - RowDescriptor::int8_col(b"layer_nonrelfiles_total"), - RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"), - RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"), - RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"), - RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"), - RowDescriptor::int8_col(b"layer_nonrelfiles_removed"), - RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"), + RowDescriptor::int8_col(b"layers_total"), + RowDescriptor::int8_col(b"layers_needed_by_cutoff"), + RowDescriptor::int8_col(b"layers_needed_by_branches"), + RowDescriptor::int8_col(b"layers_not_updated"), + RowDescriptor::int8_col(b"layers_removed"), RowDescriptor::int8_col(b"elapsed"), ]))? 
.write_message_noflush(&BeMessage::DataRow(&[ - Some(result.ondisk_relfiles_total.to_string().as_bytes()), - Some( - result - .ondisk_relfiles_needed_by_cutoff - .to_string() - .as_bytes(), - ), - Some( - result - .ondisk_relfiles_needed_by_branches - .to_string() - .as_bytes(), - ), - Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()), - Some( - result - .ondisk_relfiles_needed_as_tombstone - .to_string() - .as_bytes(), - ), - Some(result.ondisk_relfiles_removed.to_string().as_bytes()), - Some(result.ondisk_relfiles_dropped.to_string().as_bytes()), - Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()), - Some( - result - .ondisk_nonrelfiles_needed_by_cutoff - .to_string() - .as_bytes(), - ), - Some( - result - .ondisk_nonrelfiles_needed_by_branches - .to_string() - .as_bytes(), - ), - Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()), - Some( - result - .ondisk_nonrelfiles_needed_as_tombstone - .to_string() - .as_bytes(), - ), - Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()), - Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()), + Some(result.layers_total.to_string().as_bytes()), + Some(result.layers_needed_by_cutoff.to_string().as_bytes()), + Some(result.layers_needed_by_branches.to_string().as_bytes()), + Some(result.layers_not_updated.to_string().as_bytes()), + Some(result.layers_removed.to_string().as_bytes()), Some(result.elapsed.as_millis().to_string().as_bytes()), ]))? .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; @@ -781,7 +728,14 @@ impl postgres_backend::Handler for PageServerHandler { let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid) .context("Cannot load local timeline")?; - timeline.checkpoint(CheckpointConfig::Forced)?; + timeline.tline.checkpoint(CheckpointConfig::Forced)?; + + // Also compact it. + // + // FIXME: This probably shouldn't be part of a "checkpoint" command, but a + // separate operation. Update the tests if you change this. + timeline.tline.compact()?; + pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs new file mode 100644 index 0000000000..7b0fc606de --- /dev/null +++ b/pageserver/src/pgdatadir_mapping.rs @@ -0,0 +1,1350 @@ +//! +//! This provides an abstraction to store PostgreSQL relations and other files +//! in the key-value store that implements the Repository interface. +//! +//! (TODO: The line between PUT-functions here and walingest.rs is a bit blurry, as +//! walingest.rs handles a few things like implicit relation creation and extension. +//! Clarify that) +//! +use crate::keyspace::{KeySpace, KeySpaceAccum, TARGET_FILE_SIZE_BYTES}; +use crate::reltag::{RelTag, SlruKind}; +use crate::repository::*; +use crate::repository::{Repository, Timeline}; +use crate::walrecord::ZenithWalRecord; +use anyhow::{bail, ensure, Result}; +use bytes::{Buf, Bytes}; +use postgres_ffi::{pg_constants, Oid, TransactionId}; +use serde::{Deserialize, Serialize}; +use std::collections::{HashMap, HashSet}; +use std::ops::Range; +use std::sync::atomic::{AtomicIsize, Ordering}; +use std::sync::{Arc, RwLockReadGuard}; +use tracing::{debug, error, trace, warn}; +use zenith_utils::bin_ser::BeSer; +use zenith_utils::lsn::AtomicLsn; +use zenith_utils::lsn::Lsn; + +/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type. 
+pub type BlockNumber = u32; + +pub struct DatadirTimeline +where + R: Repository, +{ + /// The underlying key-value store. Callers should not read or modify the + /// data in the underlying store directly. However, it is exposed to have + /// access to information like last-LSN, ancestor, and operations like + /// compaction. + pub tline: Arc, + + /// When did we last calculate the partitioning? + last_partitioning: AtomicLsn, + + /// Configuration: how often should the partitioning be recalculated. + repartition_threshold: u64, + + /// Current logical size of the "datadir", at the last LSN. + current_logical_size: AtomicIsize, +} + +impl DatadirTimeline { + pub fn new(tline: Arc, repartition_threshold: u64) -> Self { + DatadirTimeline { + tline, + last_partitioning: AtomicLsn::new(0), + current_logical_size: AtomicIsize::new(0), + repartition_threshold, + } + } + + /// (Re-)calculate the logical size of the database at the latest LSN. + /// + /// This can be a slow operation. + pub fn init_logical_size(&self) -> Result<()> { + let last_lsn = self.tline.get_last_record_lsn(); + self.current_logical_size.store( + self.get_current_logical_size_non_incremental(last_lsn)? as isize, + Ordering::SeqCst, + ); + Ok(()) + } + + /// Start ingesting a WAL record, or other atomic modification of + /// the timeline. + /// + /// This provides a transaction-like interface to perform a bunch + /// of modifications atomically, all stamped with one LSN. + /// + /// To ingest a WAL record, call begin_modification(lsn) to get a + /// DatadirModification object. Use the functions in the object to + /// modify the repository state, updating all the pages and metadata + /// that the WAL record affects. When you're done, call commit() to + /// commit the changes. + /// + /// Note that any pending modifications you make through the + /// modification object won't be visible to calls to the 'get' and list + /// functions of the timeline until you finish! And if you update the + /// same page twice, the last update wins. + /// + pub fn begin_modification(&self, lsn: Lsn) -> DatadirModification { + DatadirModification { + tline: self, + lsn, + pending_updates: HashMap::new(), + pending_deletions: Vec::new(), + pending_nblocks: 0, + } + } + + //------------------------------------------------------------------------------ + // Public GET functions + //------------------------------------------------------------------------------ + + /// Look up given page version. + pub fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result { + ensure!(tag.relnode != 0, "invalid relnode"); + + let nblocks = self.get_rel_size(tag, lsn)?; + if blknum >= nblocks { + debug!( + "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", + tag, blknum, lsn, nblocks + ); + return Ok(ZERO_PAGE.clone()); + } + + let key = rel_block_to_key(tag, blknum); + self.tline.get(key, lsn) + } + + /// Get size of a relation file + pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { + ensure!(tag.relnode != 0, "invalid relnode"); + + if (tag.forknum == pg_constants::FSM_FORKNUM + || tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM) + && !self.get_rel_exists(tag, lsn)? + { + // FIXME: Postgres sometimes calls smgrcreate() to create + // FSM, and smgrnblocks() on it immediately afterwards, + // without extending it. Tolerate that by claiming that + // any non-existent FSM fork has size 0. 
+ return Ok(0); + } + + let key = rel_size_to_key(tag); + let mut buf = self.tline.get(key, lsn)?; + Ok(buf.get_u32_le()) + } + + /// Does relation exist? + pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { + ensure!(tag.relnode != 0, "invalid relnode"); + + // fetch directory listing + let key = rel_dir_to_key(tag.spcnode, tag.dbnode); + let buf = self.tline.get(key, lsn)?; + let dir = RelDirectory::des(&buf)?; + + let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + + Ok(exists) + } + + /// Get a list of all existing relations in given tablespace and database. + pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { + // fetch directory listing + let key = rel_dir_to_key(spcnode, dbnode); + let buf = self.tline.get(key, lsn)?; + let dir = RelDirectory::des(&buf)?; + + let rels: HashSet = + HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { + spcnode, + dbnode, + relnode: *relnode, + forknum: *forknum, + })); + + Ok(rels) + } + + /// Look up given SLRU page version. + pub fn get_slru_page_at_lsn( + &self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + lsn: Lsn, + ) -> Result { + let key = slru_block_to_key(kind, segno, blknum); + self.tline.get(key, lsn) + } + + /// Get size of an SLRU segment + pub fn get_slru_segment_size( + &self, + kind: SlruKind, + segno: u32, + lsn: Lsn, + ) -> Result { + let key = slru_segment_size_to_key(kind, segno); + let mut buf = self.tline.get(key, lsn)?; + Ok(buf.get_u32_le()) + } + + /// Get size of an SLRU segment + pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { + // fetch directory listing + let key = slru_dir_to_key(kind); + let buf = self.tline.get(key, lsn)?; + let dir = SlruSegmentDirectory::des(&buf)?; + + let exists = dir.segments.get(&segno).is_some(); + Ok(exists) + } + + /// Get a list of SLRU segments + pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { + // fetch directory entry + let key = slru_dir_to_key(kind); + + let buf = self.tline.get(key, lsn)?; + let dir = SlruSegmentDirectory::des(&buf)?; + + Ok(dir.segments) + } + + pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + let key = relmap_file_key(spcnode, dbnode); + + let buf = self.tline.get(key, lsn)?; + Ok(buf) + } + + pub fn list_dbdirs(&self, lsn: Lsn) -> Result> { + // fetch directory entry + let buf = self.tline.get(DBDIR_KEY, lsn)?; + let dir = DbDirectory::des(&buf)?; + + Ok(dir.dbdirs) + } + + pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { + let key = twophase_file_key(xid); + let buf = self.tline.get(key, lsn)?; + Ok(buf) + } + + pub fn list_twophase_files(&self, lsn: Lsn) -> Result> { + // fetch directory entry + let buf = self.tline.get(TWOPHASEDIR_KEY, lsn)?; + let dir = TwoPhaseDirectory::des(&buf)?; + + Ok(dir.xids) + } + + pub fn get_control_file(&self, lsn: Lsn) -> Result { + self.tline.get(CONTROLFILE_KEY, lsn) + } + + pub fn get_checkpoint(&self, lsn: Lsn) -> Result { + self.tline.get(CHECKPOINT_KEY, lsn) + } + + /// Get the LSN of the last ingested WAL record. + /// + /// This is just a convenience wrapper that calls through to the underlying + /// repository. + pub fn get_last_record_lsn(&self) -> Lsn { + self.tline.get_last_record_lsn() + } + + /// Check that it is valid to request operations with that lsn. + /// + /// This is just a convenience wrapper that calls through to the underlying + /// repository. 
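Aside: get_rel_size and get_slru_segment_size above both read a 4-byte little-endian counter stored under a dedicated size key, and the put-side later in this file writes it with to_le_bytes. A tiny round trip of that convention using the bytes crate, which this module already depends on:

use bytes::{Buf, Bytes};

/// Encode a relation/segment size the way the put-functions do: 4 bytes, little-endian.
fn size_to_image(nblocks: u32) -> Bytes {
    Bytes::from(nblocks.to_le_bytes().to_vec())
}

/// Decode it the way get_rel_size does.
fn size_from_image(img: Bytes) -> u32 {
    let mut buf = img;
    buf.get_u32_le()
}

fn main() {
    let img = size_to_image(8192);
    assert_eq!(size_from_image(img), 8192);
}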
+ pub fn check_lsn_is_in_scope( + &self, + lsn: Lsn, + latest_gc_cutoff_lsn: &RwLockReadGuard, + ) -> Result<()> { + self.tline.check_lsn_is_in_scope(lsn, latest_gc_cutoff_lsn) + } + + /// Retrieve current logical size of the timeline + /// + /// NOTE: counted incrementally, includes ancestors, + pub fn get_current_logical_size(&self) -> usize { + let current_logical_size = self.current_logical_size.load(Ordering::Acquire); + match usize::try_from(current_logical_size) { + Ok(sz) => sz, + Err(_) => { + error!( + "current_logical_size is out of range: {}", + current_logical_size + ); + 0 + } + } + } + + /// Does the same as get_current_logical_size but counted on demand. + /// Used to initialize the logical size tracking on startup. + /// + /// Only relation blocks are counted currently. That excludes metadata, + /// SLRUs, twophase files etc. + pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { + // Fetch list of database dirs and iterate them + let buf = self.tline.get(DBDIR_KEY, lsn)?; + let dbdir = DbDirectory::des(&buf)?; + + let mut total_size: usize = 0; + for (spcnode, dbnode) in dbdir.dbdirs.keys() { + for rel in self.list_rels(*spcnode, *dbnode, lsn)? { + let relsize_key = rel_size_to_key(rel); + let mut buf = self.tline.get(relsize_key, lsn)?; + let relsize = buf.get_u32_le(); + + total_size += relsize as usize; + } + } + Ok(total_size * pg_constants::BLCKSZ as usize) + } + + /// + /// Get a KeySpace that covers all the Keys that are in use at the given LSN. + /// Anything that's not listed maybe removed from the underlying storage (from + /// that LSN forwards). + fn collect_keyspace(&self, lsn: Lsn) -> Result { + // Iterate through key ranges, greedily packing them into partitions + let mut result = KeySpaceAccum::new(); + + // The dbdir metadata always exists + result.add_key(DBDIR_KEY); + + // Fetch list of database dirs and iterate them + let buf = self.tline.get(DBDIR_KEY, lsn)?; + let dbdir = DbDirectory::des(&buf)?; + + let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); + dbs.sort_unstable(); + for (spcnode, dbnode) in dbs { + result.add_key(relmap_file_key(spcnode, dbnode)); + result.add_key(rel_dir_to_key(spcnode, dbnode)); + + let mut rels: Vec = self + .list_rels(spcnode, dbnode, lsn)? 
+ .iter() + .cloned() + .collect(); + rels.sort_unstable(); + for rel in rels { + let relsize_key = rel_size_to_key(rel); + let mut buf = self.tline.get(relsize_key, lsn)?; + let relsize = buf.get_u32_le(); + + result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); + result.add_key(relsize_key); + } + } + + // Iterate SLRUs next + for kind in [ + SlruKind::Clog, + SlruKind::MultiXactMembers, + SlruKind::MultiXactOffsets, + ] { + let slrudir_key = slru_dir_to_key(kind); + result.add_key(slrudir_key); + let buf = self.tline.get(slrudir_key, lsn)?; + let dir = SlruSegmentDirectory::des(&buf)?; + let mut segments: Vec = dir.segments.iter().cloned().collect(); + segments.sort_unstable(); + for segno in segments { + let segsize_key = slru_segment_size_to_key(kind, segno); + let mut buf = self.tline.get(segsize_key, lsn)?; + let segsize = buf.get_u32_le(); + + result.add_range( + slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize), + ); + result.add_key(segsize_key); + } + } + + // Then pg_twophase + result.add_key(TWOPHASEDIR_KEY); + let buf = self.tline.get(TWOPHASEDIR_KEY, lsn)?; + let twophase_dir = TwoPhaseDirectory::des(&buf)?; + let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); + xids.sort_unstable(); + for xid in xids { + result.add_key(twophase_file_key(xid)); + } + + result.add_key(CONTROLFILE_KEY); + result.add_key(CHECKPOINT_KEY); + + Ok(result.to_keyspace()) + } +} + +/// DatadirModification represents an operation to ingest an atomic set of +/// updates to the repository. It is created by the 'begin_record' +/// function. It is called for each WAL record, so that all the modifications +/// by a one WAL record appear atomic. +pub struct DatadirModification<'a, R: Repository> { + /// The timeline this modification applies to. You can access this to + /// read the state, but note that any pending updates are *not* reflected + /// in the state in 'tline' yet. + pub tline: &'a DatadirTimeline, + + lsn: Lsn, + + // The modifications are not applied directly to the underyling key-value store. + // The put-functions add the modifications here, and they are flushed to the + // underlying key-value store by the 'finish' function. + pending_updates: HashMap, + pending_deletions: Vec>, + pending_nblocks: isize, +} + +impl<'a, R: Repository> DatadirModification<'a, R> { + /// Initialize a completely new repository. + /// + /// This inserts the directory metadata entries that are assumed to + /// always exist. + pub fn init_empty(&mut self) -> Result<()> { + let buf = DbDirectory::ser(&DbDirectory { + dbdirs: HashMap::new(), + })?; + self.put(DBDIR_KEY, Value::Image(buf.into())); + + let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { + xids: HashSet::new(), + })?; + self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); + + let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); + let empty_dir = Value::Image(buf); + self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); + self.put( + slru_dir_to_key(SlruKind::MultiXactMembers), + empty_dir.clone(), + ); + self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); + + Ok(()) + } + + /// Put a new page version that can be constructed from a WAL record + /// + /// NOTE: this will *not* implicitly extend the relation, if the page is beyond the + /// current end-of-file. It's up to the caller to check that the relation size + /// matches the blocks inserted! 
+ pub fn put_rel_wal_record( + &mut self, + rel: RelTag, + blknum: BlockNumber, + rec: ZenithWalRecord, + ) -> Result<()> { + ensure!(rel.relnode != 0, "invalid relnode"); + self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); + Ok(()) + } + + // Same, but for an SLRU. + pub fn put_slru_wal_record( + &mut self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + rec: ZenithWalRecord, + ) -> Result<()> { + self.put( + slru_block_to_key(kind, segno, blknum), + Value::WalRecord(rec), + ); + Ok(()) + } + + /// Like put_wal_record, but with ready-made image of the page. + pub fn put_rel_page_image( + &mut self, + rel: RelTag, + blknum: BlockNumber, + img: Bytes, + ) -> Result<()> { + ensure!(rel.relnode != 0, "invalid relnode"); + self.put(rel_block_to_key(rel, blknum), Value::Image(img)); + Ok(()) + } + + pub fn put_slru_page_image( + &mut self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + img: Bytes, + ) -> Result<()> { + self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img)); + Ok(()) + } + + /// Store a relmapper file (pg_filenode.map) in the repository + pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> Result<()> { + // Add it to the directory (if it doesn't exist already) + let buf = self.get(DBDIR_KEY)?; + let mut dbdir = DbDirectory::des(&buf)?; + + let r = dbdir.dbdirs.insert((spcnode, dbnode), true); + if r == None || r == Some(false) { + // The dbdir entry didn't exist, or it contained a + // 'false'. The 'insert' call already updated it with + // 'true', now write the updated 'dbdirs' map back. + let buf = DbDirectory::ser(&dbdir)?; + self.put(DBDIR_KEY, Value::Image(buf.into())); + } + if r == None { + // Create RelDirectory + let buf = RelDirectory::ser(&RelDirectory { + rels: HashSet::new(), + })?; + self.put( + rel_dir_to_key(spcnode, dbnode), + Value::Image(Bytes::from(buf)), + ); + } + + self.put(relmap_file_key(spcnode, dbnode), Value::Image(img)); + Ok(()) + } + + pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> Result<()> { + // Add it to the directory entry + let buf = self.get(TWOPHASEDIR_KEY)?; + let mut dir = TwoPhaseDirectory::des(&buf)?; + if !dir.xids.insert(xid) { + bail!("twophase file for xid {} already exists", xid); + } + self.put( + TWOPHASEDIR_KEY, + Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), + ); + + self.put(twophase_file_key(xid), Value::Image(img)); + Ok(()) + } + + pub fn put_control_file(&mut self, img: Bytes) -> Result<()> { + self.put(CONTROLFILE_KEY, Value::Image(img)); + Ok(()) + } + + pub fn put_checkpoint(&mut self, img: Bytes) -> Result<()> { + self.put(CHECKPOINT_KEY, Value::Image(img)); + Ok(()) + } + + pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> { + // Remove entry from dbdir + let buf = self.get(DBDIR_KEY)?; + let mut dir = DbDirectory::des(&buf)?; + if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { + let buf = DbDirectory::ser(&dir)?; + self.put(DBDIR_KEY, Value::Image(buf.into())); + } else { + warn!( + "dropped dbdir for spcnode {} dbnode {} did not exist in db directory", + spcnode, dbnode + ); + } + + // FIXME: update pending_nblocks + + // Delete all relations and metadata files for the spcnode/dnode + self.delete(dbdir_key_range(spcnode, dbnode)); + Ok(()) + } + + /// Create a relation fork. + /// + /// 'nblocks' is the initial size. 
+ pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { + ensure!(rel.relnode != 0, "invalid relnode"); + // It's possible that this is the first rel for this db in this + // tablespace. Create the reldir entry for it if so. + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?; + let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); + let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { + // Didn't exist. Update dbdir + dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false); + let buf = DbDirectory::ser(&dbdir)?; + self.put(DBDIR_KEY, Value::Image(buf.into())); + + // and create the RelDirectory + RelDirectory::default() + } else { + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key)?)? + }; + + // Add the new relation to the rel directory entry, and write it back + if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { + bail!("rel {} already exists", rel); + } + self.put( + rel_dir_key, + Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)), + ); + + // Put size + let size_key = rel_size_to_key(rel); + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + + self.pending_nblocks += nblocks as isize; + + // Even if nblocks > 0, we don't insert any actual blocks here. That's up to the + // caller. + + Ok(()) + } + + /// Truncate relation + pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { + ensure!(rel.relnode != 0, "invalid relnode"); + let size_key = rel_size_to_key(rel); + + // Fetch the old size first + let old_size = self.get(size_key)?.get_u32_le(); + + // Update the entry with the new size. + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + + // Update logical database size. + self.pending_nblocks -= old_size as isize - nblocks as isize; + Ok(()) + } + + /// Extend relation + pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { + ensure!(rel.relnode != 0, "invalid relnode"); + + // Put size + let size_key = rel_size_to_key(rel); + let old_size = self.get(size_key)?.get_u32_le(); + + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + + self.pending_nblocks += nblocks as isize - old_size as isize; + Ok(()) + } + + /// Drop a relation. 
+ pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> { + ensure!(rel.relnode != 0, "invalid relnode"); + + // Remove it from the directory entry + let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); + let buf = self.get(dir_key)?; + let mut dir = RelDirectory::des(&buf)?; + + if dir.rels.remove(&(rel.relnode, rel.forknum)) { + self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); + } else { + warn!("dropped rel {} did not exist in rel directory", rel); + } + + // update logical size + let size_key = rel_size_to_key(rel); + let old_size = self.get(size_key)?.get_u32_le(); + self.pending_nblocks -= old_size as isize; + + // Delete size entry, as well as all blocks + self.delete(rel_key_range(rel)); + + Ok(()) + } + + pub fn put_slru_segment_creation( + &mut self, + kind: SlruKind, + segno: u32, + nblocks: BlockNumber, + ) -> Result<()> { + // Add it to the directory entry + let dir_key = slru_dir_to_key(kind); + let buf = self.get(dir_key)?; + let mut dir = SlruSegmentDirectory::des(&buf)?; + + if !dir.segments.insert(segno) { + bail!("slru segment {:?}/{} already exists", kind, segno); + } + self.put( + dir_key, + Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), + ); + + // Put size + let size_key = slru_segment_size_to_key(kind, segno); + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + + // even if nblocks > 0, we don't insert any actual blocks here + + Ok(()) + } + + /// Extend SLRU segment + pub fn put_slru_extend( + &mut self, + kind: SlruKind, + segno: u32, + nblocks: BlockNumber, + ) -> Result<()> { + // Put size + let size_key = slru_segment_size_to_key(kind, segno); + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + Ok(()) + } + + /// This method is used for marking truncated SLRU files + pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> Result<()> { + // Remove it from the directory entry + let dir_key = slru_dir_to_key(kind); + let buf = self.get(dir_key)?; + let mut dir = SlruSegmentDirectory::des(&buf)?; + + if !dir.segments.remove(&segno) { + warn!("slru segment {:?}/{} does not exist", kind, segno); + } + self.put( + dir_key, + Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), + ); + + // Delete size entry, as well as all blocks + self.delete(slru_segment_key_range(kind, segno)); + + Ok(()) + } + + /// Drop a relmapper file (pg_filenode.map) + pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> { + // TODO + Ok(()) + } + + /// This method is used for marking truncated SLRU files + pub fn drop_twophase_file(&mut self, xid: TransactionId) -> Result<()> { + // Remove it from the directory entry + let buf = self.get(TWOPHASEDIR_KEY)?; + let mut dir = TwoPhaseDirectory::des(&buf)?; + + if !dir.xids.remove(&xid) { + warn!("twophase file for xid {} does not exist", xid); + } + self.put( + TWOPHASEDIR_KEY, + Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), + ); + + // Delete it + self.delete(twophase_key_range(xid)); + + Ok(()) + } + + /// + /// Finish this atomic update, writing all the updated keys to the + /// underlying timeline. 
+ /// + pub fn commit(self) -> Result<()> { + let writer = self.tline.tline.writer(); + + let last_partitioning = self.tline.last_partitioning.load(); + let pending_nblocks = self.pending_nblocks; + + for (key, value) in self.pending_updates { + writer.put(key, self.lsn, value)?; + } + for key_range in self.pending_deletions { + writer.delete(key_range.clone(), self.lsn)?; + } + + writer.finish_write(self.lsn); + + if last_partitioning == Lsn(0) + || self.lsn.0 - last_partitioning.0 > self.tline.repartition_threshold + { + let keyspace = self.tline.collect_keyspace(self.lsn)?; + let partitioning = keyspace.partition(TARGET_FILE_SIZE_BYTES); + self.tline.tline.hint_partitioning(partitioning, self.lsn)?; + self.tline.last_partitioning.store(self.lsn); + } + + if pending_nblocks != 0 { + self.tline.current_logical_size.fetch_add( + pending_nblocks * pg_constants::BLCKSZ as isize, + Ordering::SeqCst, + ); + } + + Ok(()) + } + + // Internal helper functions to batch the modifications + + fn get(&self, key: Key) -> Result { + // Have we already updated the same key? Read the pending updated + // version in that case. + // + // Note: we don't check pending_deletions. It is an error to request a + // value that has been removed, deletion only avoids leaking storage. + if let Some(value) = self.pending_updates.get(&key) { + if let Value::Image(img) = value { + Ok(img.clone()) + } else { + // Currently, we never need to read back a WAL record that we + // inserted in the same "transaction". All the metadata updates + // work directly with Images, and we never need to read actual + // data pages. We could handle this if we had to, by calling + // the walredo manager, but let's keep it simple for now. + bail!("unexpected pending WAL record"); + } + } else { + let last_lsn = self.tline.get_last_record_lsn(); + self.tline.tline.get(key, last_lsn) + } + } + + fn put(&mut self, key: Key, val: Value) { + self.pending_updates.insert(key, val); + } + + fn delete(&mut self, key_range: Range) { + trace!("DELETE {}-{}", key_range.start, key_range.end); + self.pending_deletions.push(key_range); + } +} + +//--- Metadata structs stored in key-value pairs in the repository. + +#[derive(Debug, Serialize, Deserialize)] +struct DbDirectory { + // (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist) + dbdirs: HashMap<(Oid, Oid), bool>, +} + +#[derive(Debug, Serialize, Deserialize)] +struct TwoPhaseDirectory { + xids: HashSet, +} + +#[derive(Debug, Serialize, Deserialize, Default)] +struct RelDirectory { + // Set of relations that exist. (relfilenode, forknum) + // + // TODO: Store it as a btree or radix tree or something else that spans multiple + // key-value pairs, if you have a lot of relations + rels: HashSet<(Oid, u8)>, +} + +#[derive(Debug, Serialize, Deserialize)] +struct RelSizeEntry { + nblocks: u32, +} + +#[derive(Debug, Serialize, Deserialize, Default)] +struct SlruSegmentDirectory { + // Set of SLRU segments that exist. + segments: HashSet, +} + +static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; pg_constants::BLCKSZ as usize]); + +// Layout of the Key address space +// +// The Key struct, used to address the underlying key-value store, consists of +// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map +// all the data and metadata keys into those 18 bytes. +// +// Principles for the mapping: +// +// - Things that are often accessed or modified together, should be close to +// each other in the key space. 
For example, if a relation is extended by one +// block, we create a new key-value pair for the block data, and update the +// relation size entry. Because of that, the RelSize key comes after all the +// RelBlocks of a relation: the RelSize and the last RelBlock are always next +// to each other. +// +// The key space is divided into four major sections, identified by the first +// byte, and the form a hierarchy: +// +// 00 Relation data and metadata +// +// DbDir () -> (dbnode, spcnode) +// Filenodemap +// RelDir -> relnode forknum +// RelBlocks +// RelSize +// +// 01 SLRUs +// +// SlruDir kind +// SlruSegBlocks segno +// SlruSegSize +// +// 02 pg_twophase +// +// 03 misc +// controlfile +// checkpoint +// +// Below is a full list of the keyspace allocation: +// +// DbDir: +// 00 00000000 00000000 00000000 00 00000000 +// +// Filenodemap: +// 00 SPCNODE DBNODE 00000000 00 00000000 +// +// RelDir: +// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0) +// +// RelBlock: +// 00 SPCNODE DBNODE RELNODE FORK BLKNUM +// +// RelSize: +// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF +// +// SlruDir: +// 01 kind 00000000 00000000 00 00000000 +// +// SlruSegBlock: +// 01 kind 00000001 SEGNO 00 BLKNUM +// +// SlruSegSize: +// 01 kind 00000001 SEGNO 00 FFFFFFFF +// +// TwoPhaseDir: +// 02 00000000 00000000 00000000 00 00000000 +// +// TwoPhaseFile: +// 02 00000000 00000000 00000000 00 XID +// +// ControlFile: +// 03 00000000 00000000 00000000 00 00000000 +// +// Checkpoint: +// 03 00000000 00000000 00000000 00 00000001 + +//-- Section 01: relation data and metadata + +const DBDIR_KEY: Key = Key { + field1: 0x00, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0xffffffff, + field5: 0xff, + field6: 0xffffffff, + } +} + +fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + } +} + +fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 1, + } +} + +fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: blknum, + } +} + +fn rel_size_to_key(rel: RelTag) -> Key { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: 0xffffffff, + } +} + +fn rel_key_range(rel: RelTag) -> Range { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: 0, + }..Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum + 1, + field6: 0, + } +} + +//-- Section 02: SLRUs + +fn slru_dir_to_key(kind: SlruKind) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } +} + +fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets 
=> 0x02, + }, + field3: 1, + field4: segno, + field5: 0, + field6: blknum, + } +} + +fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 1, + field4: segno, + field5: 0, + field6: 0xffffffff, + } +} + +fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range { + let field2 = match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }; + + Key { + field1: 0x01, + field2, + field3: segno, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: 0x01, + field2, + field3: segno, + field4: 0, + field5: 1, + field6: 0, + } +} + +//-- Section 03: pg_twophase + +const TWOPHASEDIR_KEY: Key = Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +fn twophase_file_key(xid: TransactionId) -> Key { + Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: xid, + } +} + +fn twophase_key_range(xid: TransactionId) -> Range { + let (next_xid, overflowed) = xid.overflowing_add(1); + + Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: xid, + }..Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: if overflowed { 1 } else { 0 }, + field6: next_xid, + } +} + +//-- Section 03: Control file +const CONTROLFILE_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +const CHECKPOINT_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 1, +}; + +// Reverse mappings for a few Keys. +// These are needed by WAL redo manager. + +pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> { + Ok(match key.field1 { + 0x00 => ( + RelTag { + spcnode: key.field2, + dbnode: key.field3, + relnode: key.field4, + forknum: key.field5, + }, + key.field6, + ), + _ => bail!("unexpected value kind 0x{:02x}", key.field1), + }) +} + +pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { + Ok(match key.field1 { + 0x01 => { + let kind = match key.field2 { + 0x00 => SlruKind::Clog, + 0x01 => SlruKind::MultiXactMembers, + 0x02 => SlruKind::MultiXactOffsets, + _ => bail!("unrecognized slru kind 0x{:02x}", key.field2), + }; + let segno = key.field4; + let blknum = key.field6; + + (kind, segno, blknum) + } + _ => bail!("unexpected value kind 0x{:02x}", key.field1), + }) +} + +// +//-- Tests that should work the same with any Repository/Timeline implementation. 
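The layout table above fixes how each object maps into the 18-byte Key, and the forward mappings (rel_block_to_key and friends) have to stay in agreement with the reverse mappings used by the WAL redo manager (key_to_rel_block, key_to_slru_block). Below is a small, self-contained sketch of that round trip. The Key and RelTag structs here are simplified stand-ins for the ones in this patch, the field widths follow the table above, and the hex rendering mirrors the Display/from_hex pair added in repository.rs.

// Minimal stand-ins for the Key/RelTag types in this patch; field names and
// widths follow the keyspace layout documented above.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct Key {
    field1: u8,
    field2: u32,
    field3: u32,
    field4: u32,
    field5: u8,
    field6: u32,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct RelTag {
    spcnode: u32,
    dbnode: u32,
    relnode: u32,
    forknum: u8,
}

// Forward mapping: RelBlock keys live under prefix byte 0x00.
fn rel_block_to_key(rel: RelTag, blknum: u32) -> Key {
    Key {
        field1: 0x00,
        field2: rel.spcnode,
        field3: rel.dbnode,
        field4: rel.relnode,
        field5: rel.forknum,
        field6: blknum,
    }
}

// Reverse mapping, as used by the WAL redo manager.
fn key_to_rel_block(key: Key) -> Option<(RelTag, u32)> {
    if key.field1 != 0x00 {
        return None;
    }
    Some((
        RelTag {
            spcnode: key.field2,
            dbnode: key.field3,
            relnode: key.field4,
            forknum: key.field5,
        },
        key.field6,
    ))
}

// 36-hex-character rendering, matching the Display/from_hex pair in repository.rs.
fn key_to_hex(key: Key) -> String {
    format!(
        "{:02X}{:08X}{:08X}{:08X}{:02X}{:08X}",
        key.field1, key.field2, key.field3, key.field4, key.field5, key.field6
    )
}

fn main() {
    // pg_class (relfilenode 1259) in database 111 of the default tablespace (1663),
    // main fork, block 7.
    let rel = RelTag { spcnode: 1663, dbnode: 111, relnode: 1259, forknum: 0 };
    let key = rel_block_to_key(rel, 7);

    // The round trip recovers the original (RelTag, block number) pair.
    assert_eq!(key_to_rel_block(key), Some((rel, 7)));

    // 18 bytes -> 36 hex characters: 00 0000067F 0000006F 000004EB 00 00000007
    assert_eq!(key_to_hex(key), "000000067F0000006F000004EB0000000007");
    println!("{}", key_to_hex(key));
}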
+// + +#[cfg(test)] +pub fn create_test_timeline( + repo: R, + timeline_id: zenith_utils::zid::ZTimelineId, +) -> Result>> { + let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; + let tline = DatadirTimeline::new(tline, crate::layered_repository::tests::TEST_FILE_SIZE / 10); + let mut m = tline.begin_modification(Lsn(8)); + m.init_empty()?; + m.commit()?; + Ok(Arc::new(tline)) +} + +#[allow(clippy::bool_assert_comparison)] +#[cfg(test)] +mod tests { + //use super::repo_harness::*; + //use super::*; + + /* + fn assert_current_logical_size(timeline: &DatadirTimeline, lsn: Lsn) { + let incremental = timeline.get_current_logical_size(); + let non_incremental = timeline + .get_current_logical_size_non_incremental(lsn) + .unwrap(); + assert_eq!(incremental, non_incremental); + } + */ + + /* + /// + /// Test list_rels() function, with branches and dropped relations + /// + #[test] + fn test_list_rels_drop() -> Result<()> { + let repo = RepoHarness::create("test_list_rels_drop")?.load(); + let tline = create_empty_timeline(repo, TIMELINE_ID)?; + const TESTDB: u32 = 111; + + // Import initial dummy checkpoint record, otherwise the get_timeline() call + // after branching fails below + let mut writer = tline.begin_record(Lsn(0x10)); + writer.put_checkpoint(ZERO_CHECKPOINT.clone())?; + writer.finish()?; + + // Create a relation on the timeline + let mut writer = tline.begin_record(Lsn(0x20)); + writer.put_rel_page_image(TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + writer.finish()?; + + let writer = tline.begin_record(Lsn(0x00)); + writer.finish()?; + + // Check that list_rels() lists it after LSN 2, but no before it + assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A)); + assert!(tline.list_rels(0, TESTDB, Lsn(0x20))?.contains(&TESTREL_A)); + assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A)); + + // Create a branch, check that the relation is visible there + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; + let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { + Some(timeline) => timeline, + None => panic!("Should have a local timeline"), + }; + let newtline = DatadirTimelineImpl::new(newtline); + assert!(newtline + .list_rels(0, TESTDB, Lsn(0x30))? + .contains(&TESTREL_A)); + + // Drop it on the branch + let mut new_writer = newtline.begin_record(Lsn(0x40)); + new_writer.drop_relation(TESTREL_A)?; + new_writer.finish()?; + + // Check that it's no longer listed on the branch after the point where it was dropped + assert!(newtline + .list_rels(0, TESTDB, Lsn(0x30))? + .contains(&TESTREL_A)); + assert!(!newtline + .list_rels(0, TESTDB, Lsn(0x40))? + .contains(&TESTREL_A)); + + // Run checkpoint and garbage collection and check that it's still not visible + newtline.tline.checkpoint(CheckpointConfig::Forced)?; + repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?; + + assert!(!newtline + .list_rels(0, TESTDB, Lsn(0x40))? + .contains(&TESTREL_A)); + + Ok(()) + } + */ + + /* + #[test] + fn test_read_beyond_eof() -> Result<()> { + let repo = RepoHarness::create("test_read_beyond_eof")?.load(); + let tline = create_test_timeline(repo, TIMELINE_ID)?; + + make_some_layers(&tline, Lsn(0x20))?; + let mut writer = tline.begin_record(Lsn(0x60)); + walingest.put_rel_page_image( + &mut writer, + TESTREL_A, + 0, + TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x60))), + )?; + writer.finish()?; + + // Test read before rel creation. Should error out. 
+ assert!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x10)).is_err()); + + // Read block beyond end of relation at different points in time. + // These reads should fall into different delta, image, and in-memory layers. + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x20))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x25))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x30))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x35))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x45))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x55))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, ZERO_PAGE); + + // Test on an in-memory layer with no preceding layer + let mut writer = tline.begin_record(Lsn(0x70)); + walingest.put_rel_page_image( + &mut writer, + TESTREL_B, + 0, + TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x70))), + )?; + writer.finish()?; + + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_B, 1, Lsn(0x70))?, ZERO_PAGE); + + Ok(()) + } + */ +} diff --git a/pageserver/src/relish.rs b/pageserver/src/relish.rs deleted file mode 100644 index 9228829aef..0000000000 --- a/pageserver/src/relish.rs +++ /dev/null @@ -1,226 +0,0 @@ -//! -//! Zenith stores PostgreSQL relations, and some other files, in the -//! repository. The relations (i.e. tables and indexes) take up most -//! of the space in a typical installation, while the other files are -//! small. We call each relation and other file that is stored in the -//! repository a "relish". It comes from "rel"-ish, as in "kind of a -//! rel", because it covers relations as well as other things that are -//! not relations, but are treated similarly for the purposes of the -//! storage layer. -//! -//! This source file contains the definition of the RelishTag struct, -//! which uniquely identifies a relish. -//! -//! Relishes come in two flavors: blocky and non-blocky. Relations and -//! SLRUs are blocky, that is, they are divided into 8k blocks, and -//! the repository tracks their size. Other relishes are non-blocky: -//! the content of the whole relish is stored as one blob. Block -//! number must be passed as 0 for all operations on a non-blocky -//! relish. The one "block" that you store in a non-blocky relish can -//! have arbitrary size, but they are expected to be small, or you -//! will have performance issues. -//! -//! All relishes are versioned by LSN in the repository. -//! - -use serde::{Deserialize, Serialize}; -use std::fmt; - -use postgres_ffi::relfile_utils::forknumber_to_name; -use postgres_ffi::{Oid, TransactionId}; - -/// -/// RelishTag identifies one relish. -/// -#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub enum RelishTag { - // Relations correspond to PostgreSQL relation forks. Each - // PostgreSQL relation fork is considered a separate relish. - Relation(RelTag), - - // SLRUs include pg_clog, pg_multixact/members, and - // pg_multixact/offsets. There are other SLRUs in PostgreSQL, but - // they don't need to be stored permanently (e.g. pg_subtrans), - // or we do not support them in zenith yet (pg_commit_ts). - // - // These are currently never requested directly by the compute - // nodes, although in principle that would be possible. 
However, - // when a new compute node is created, these are included in the - // tarball that we send to the compute node to initialize the - // PostgreSQL data directory. - // - // Each SLRU segment in PostgreSQL is considered a separate - // relish. For example, pg_clog/0000, pg_clog/0001, and so forth. - // - // SLRU segments are divided into blocks, like relations. - Slru { slru: SlruKind, segno: u32 }, - - // Miscellaneous other files that need to be included in the - // tarball at compute node creation. These are non-blocky, and are - // expected to be small. - - // - // FileNodeMap represents PostgreSQL's 'pg_filenode.map' - // files. They are needed to map catalog table OIDs to filenode - // numbers. Usually the mapping is done by looking up a relation's - // 'relfilenode' field in the 'pg_class' system table, but that - // doesn't work for 'pg_class' itself and a few other such system - // relations. See PostgreSQL relmapper.c for details. - // - // Each database has a map file for its local mapped catalogs, - // and there is a separate map file for shared catalogs. - // - // These files are always 512 bytes long (although we don't check - // or care about that in the page server). - // - FileNodeMap { spcnode: Oid, dbnode: Oid }, - - // - // State files for prepared transactions (e.g pg_twophase/1234) - // - TwoPhase { xid: TransactionId }, - - // The control file, stored in global/pg_control - ControlFile, - - // Special entry that represents PostgreSQL checkpoint. It doesn't - // correspond to to any physical file in PostgreSQL, but we use it - // to track fields needed to restore the checkpoint data in the - // control file, when a compute node is created. - Checkpoint, -} - -impl RelishTag { - pub const fn is_blocky(&self) -> bool { - match self { - // These relishes work with blocks - RelishTag::Relation(_) | RelishTag::Slru { slru: _, segno: _ } => true, - - // and these don't - RelishTag::FileNodeMap { - spcnode: _, - dbnode: _, - } - | RelishTag::TwoPhase { xid: _ } - | RelishTag::ControlFile - | RelishTag::Checkpoint => false, - } - } - - // Physical relishes represent files and use - // RelationSizeEntry to track existing and dropped files. - // They can be both blocky and non-blocky. - pub const fn is_physical(&self) -> bool { - match self { - // These relishes represent physical files - RelishTag::Relation(_) - | RelishTag::Slru { .. } - | RelishTag::FileNodeMap { .. } - | RelishTag::TwoPhase { .. } => true, - - // and these don't - RelishTag::ControlFile | RelishTag::Checkpoint => false, - } - } - - // convenience function to check if this relish is a normal relation. - pub const fn is_relation(&self) -> bool { - matches!(self, RelishTag::Relation(_)) - } -} - -/// -/// Relation data file segment id throughout the Postgres cluster. -/// -/// Every data file in Postgres is uniquely identified by 4 numbers: -/// - relation id / node (`relnode`) -/// - database id (`dbnode`) -/// - tablespace id (`spcnode`), in short this is a unique id of a separate -/// directory to store data files. -/// - forknumber (`forknum`) is used to split different kinds of data of the same relation -/// between some set of files (`relnode`, `relnode_fsm`, `relnode_vm`). -/// -/// In native Postgres code `RelFileNode` structure and individual `ForkNumber` value -/// are used for the same purpose. -/// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57). 
-/// -#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)] -pub struct RelTag { - pub forknum: u8, - pub spcnode: Oid, - pub dbnode: Oid, - pub relnode: Oid, -} - -/// Display RelTag in the same format that's used in most PostgreSQL debug messages: -/// -/// //[_fsm|_vm|_init] -/// -impl fmt::Display for RelTag { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if let Some(forkname) = forknumber_to_name(self.forknum) { - write!( - f, - "{}/{}/{}_{}", - self.spcnode, self.dbnode, self.relnode, forkname - ) - } else { - write!(f, "{}/{}/{}", self.spcnode, self.dbnode, self.relnode) - } - } -} - -/// Display RelTag in the same format that's used in most PostgreSQL debug messages: -/// -/// //[_fsm|_vm|_init] -/// -impl fmt::Display for RelishTag { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - RelishTag::Relation(rel) => rel.fmt(f), - RelishTag::Slru { slru, segno } => { - // e.g. pg_clog/0001 - write!(f, "{}/{:04X}", slru.to_str(), segno) - } - RelishTag::FileNodeMap { spcnode, dbnode } => { - write!(f, "relmapper file for spc {} db {}", spcnode, dbnode) - } - RelishTag::TwoPhase { xid } => { - write!(f, "pg_twophase/{:08X}", xid) - } - RelishTag::ControlFile => { - write!(f, "control file") - } - RelishTag::Checkpoint => { - write!(f, "checkpoint") - } - } - } -} - -/// -/// Non-relation transaction status files (clog (a.k.a. pg_xact) and -/// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer, -/// hence the name. -/// -/// These files are global for a postgres instance. -/// -/// These files are divided into segments, which are divided into -/// pages of the same BLCKSZ as used for relation files. -/// -#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub enum SlruKind { - Clog, - MultiXactMembers, - MultiXactOffsets, -} - -impl SlruKind { - pub fn to_str(&self) -> &'static str { - match self { - Self::Clog => "pg_xact", - Self::MultiXactMembers => "pg_multixact/members", - Self::MultiXactOffsets => "pg_multixact/offsets", - } - } -} diff --git a/pageserver/src/reltag.rs b/pageserver/src/reltag.rs new file mode 100644 index 0000000000..46ff468f2f --- /dev/null +++ b/pageserver/src/reltag.rs @@ -0,0 +1,105 @@ +use serde::{Deserialize, Serialize}; +use std::cmp::Ordering; +use std::fmt; + +use postgres_ffi::relfile_utils::forknumber_to_name; +use postgres_ffi::Oid; + +/// +/// Relation data file segment id throughout the Postgres cluster. +/// +/// Every data file in Postgres is uniquely identified by 4 numbers: +/// - relation id / node (`relnode`) +/// - database id (`dbnode`) +/// - tablespace id (`spcnode`), in short this is a unique id of a separate +/// directory to store data files. +/// - forknumber (`forknum`) is used to split different kinds of data of the same relation +/// between some set of files (`relnode`, `relnode_fsm`, `relnode_vm`). +/// +/// In native Postgres code `RelFileNode` structure and individual `ForkNumber` value +/// are used for the same purpose. +/// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57). +/// +// FIXME: should move 'forknum' as last field to keep this consistent with Postgres. +// Then we could replace the custo Ord and PartialOrd implementations below with +// deriving them. 
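As the FIXME above notes, the manual Ord implementation below is only needed because forknum is declared first; with the fields reordered, the derived ordering would already compare spcnode, dbnode, relnode and forknum in that sequence. A small sketch of that claim, using a hypothetical RelTagReordered type that is not part of this patch:

use std::cmp::Ordering;

// Hypothetical reordered struct, as the FIXME above suggests: with forknum
// last, the derived Ord compares fields top to bottom and matches the manual
// implementation below (spcnode, then dbnode, then relnode, then forknum).
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
struct RelTagReordered {
    spcnode: u32,
    dbnode: u32,
    relnode: u32,
    forknum: u8,
}

// The ordering the manual impl in this patch produces, written as a tuple compare.
fn manual_cmp(a: &RelTagReordered, b: &RelTagReordered) -> Ordering {
    (a.spcnode, a.dbnode, a.relnode, a.forknum).cmp(&(b.spcnode, b.dbnode, b.relnode, b.forknum))
}

fn main() {
    let a = RelTagReordered { spcnode: 1663, dbnode: 111, relnode: 1000, forknum: 1 };
    let b = RelTagReordered { spcnode: 1663, dbnode: 111, relnode: 1001, forknum: 0 };

    // relnode is compared before forknum, so a < b under both orderings.
    assert_eq!(a.cmp(&b), Ordering::Less);
    assert_eq!(manual_cmp(&a, &b), Ordering::Less);
    assert_eq!(a.cmp(&b), manual_cmp(&a, &b));
    println!("derived and manual orderings agree: {:?}", a.cmp(&b));
}

Either way the resulting order is the same, so callers that sort RelTags would not observe a behaviour change.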
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)] +pub struct RelTag { + pub forknum: u8, + pub spcnode: Oid, + pub dbnode: Oid, + pub relnode: Oid, +} + +impl PartialOrd for RelTag { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for RelTag { + fn cmp(&self, other: &Self) -> Ordering { + let mut cmp; + + cmp = self.spcnode.cmp(&other.spcnode); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.dbnode.cmp(&other.dbnode); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.relnode.cmp(&other.relnode); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.forknum.cmp(&other.forknum); + + cmp + } +} + +/// Display RelTag in the same format that's used in most PostgreSQL debug messages: +/// +/// //[_fsm|_vm|_init] +/// +impl fmt::Display for RelTag { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(forkname) = forknumber_to_name(self.forknum) { + write!( + f, + "{}/{}/{}_{}", + self.spcnode, self.dbnode, self.relnode, forkname + ) + } else { + write!(f, "{}/{}/{}", self.spcnode, self.dbnode, self.relnode) + } + } +} + +/// +/// Non-relation transaction status files (clog (a.k.a. pg_xact) and +/// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer, +/// hence the name. +/// +/// These files are global for a postgres instance. +/// +/// These files are divided into segments, which are divided into +/// pages of the same BLCKSZ as used for relation files. +/// +#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] +pub enum SlruKind { + Clog, + MultiXactMembers, + MultiXactOffsets, +} + +impl SlruKind { + pub fn to_str(&self) -> &'static str { + match self { + Self::Clog => "pg_xact", + Self::MultiXactMembers => "pg_multixact/members", + Self::MultiXactOffsets => "pg_multixact/offsets", + } + } +} diff --git a/pageserver/src/remote_storage/README.md b/pageserver/src/remote_storage/README.md index 3c77275da8..339ddce866 100644 --- a/pageserver/src/remote_storage/README.md +++ b/pageserver/src/remote_storage/README.md @@ -17,7 +17,7 @@ This way, the backups are managed in background, not affecting directly other pa Current implementation * provides remote storage wrappers for AWS S3 and local FS * synchronizes the differences with local timelines and remote states as fast as possible -* uploads new relishes, frozen by pageserver checkpoint thread +* uploads new layer files * downloads and registers timelines, found on the remote storage, but missing locally, if those are requested somehow via pageserver (e.g. http api, gc) * uses compression when deals with files, for better S3 usage * maintains an index of what's stored remotely diff --git a/pageserver/src/remote_storage/local_fs.rs b/pageserver/src/remote_storage/local_fs.rs index 6cce127a7c..bac693c8d0 100644 --- a/pageserver/src/remote_storage/local_fs.rs +++ b/pageserver/src/remote_storage/local_fs.rs @@ -662,7 +662,7 @@ mod fs_tests { } async fn upload_dummy_file( - harness: &RepoHarness, + harness: &RepoHarness<'_>, storage: &LocalFs, name: &str, ) -> anyhow::Result { diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index 9fe2ab2847..ddd47ea981 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -27,7 +27,7 @@ //! it may schedule the download on such occasions. //! 
Then, the index is shared across pageserver under [`RemoteIndex`] guard to ensure proper synchronization. //! -//! The synchronization unit is an archive: a set of timeline files (or relishes) and a special metadata file, all compressed into a blob. +//! The synchronization unit is an archive: a set of layer files and a special metadata file, all compressed into a blob. //! Currently, there's no way to process an archive partially, if the archive processing fails, it has to be started from zero next time again. //! An archive contains set of files of a certain timeline, added during checkpoint(s) and the timeline metadata at that moment. //! The archive contains that metadata's `disk_consistent_lsn` in its name, to be able to restore partial index information from just a remote storage file list. @@ -281,7 +281,7 @@ impl SyncKind { /// Current checkpoint design assumes new files are added only, no deletions or amendment happens. #[derive(Debug, Clone)] pub struct NewCheckpoint { - /// Relish file paths in the pageserver workdir, that were added for the corresponding checkpoint. + /// layer file paths in the pageserver workdir, that were added for the corresponding checkpoint. layers: Vec, metadata: TimelineMetadata, } @@ -854,7 +854,7 @@ mod test_utils { #[track_caller] pub async fn ensure_correct_timeline_upload( - harness: &RepoHarness, + harness: &RepoHarness<'_>, remote_assets: Arc<(LocalFs, RemoteIndex)>, timeline_id: ZTimelineId, new_upload: NewCheckpoint, diff --git a/pageserver/src/remote_storage/storage_sync/compression.rs b/pageserver/src/remote_storage/storage_sync/compression.rs index ca245359bf..c5b041349a 100644 --- a/pageserver/src/remote_storage/storage_sync/compression.rs +++ b/pageserver/src/remote_storage/storage_sync/compression.rs @@ -10,7 +10,7 @@ //! Archiving is almost agnostic to timeline file types, with an exception of the metadata file, that's currently distinguished in the [un]compression code. //! The metadata file is treated separately when [de]compression is involved, to reduce the risk of corrupting the metadata file. //! When compressed, the metadata file is always required and stored as the last file in the archive stream. -//! When uncompressed, the metadata file gets naturally uncompressed last, to ensure that all other relishes are decompressed successfully first. +//! When uncompressed, the metadata file gets naturally uncompressed last, to ensure that all other layer files are decompressed successfully first. //! //! Archive structure: //! +----------------------------------------+ diff --git a/pageserver/src/remote_storage/storage_sync/index.rs b/pageserver/src/remote_storage/storage_sync/index.rs index d7bd1f1657..861b78fa3b 100644 --- a/pageserver/src/remote_storage/storage_sync/index.rs +++ b/pageserver/src/remote_storage/storage_sync/index.rs @@ -277,7 +277,7 @@ impl RemoteTimeline { .map(CheckpointArchive::disk_consistent_lsn) } - /// Lists all relish files in the given remote timeline. Omits the metadata file. + /// Lists all layer files in the given remote timeline. Omits the metadata file. 
pub fn stored_files(&self, timeline_dir: &Path) -> BTreeSet { self.timeline_files .values() diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 36273e6d6c..b960e037be 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,22 +1,173 @@ +use crate::keyspace::KeyPartitioning; use crate::layered_repository::metadata::TimelineMetadata; -use crate::relish::*; use crate::remote_storage::RemoteIndex; -use crate::walrecord::MultiXactMember; +use crate::walrecord::ZenithWalRecord; use crate::CheckpointConfig; -use anyhow::Result; +use anyhow::{bail, Result}; use bytes::Bytes; -use postgres_ffi::{MultiXactId, MultiXactOffset, TransactionId}; use serde::{Deserialize, Serialize}; -use std::collections::HashSet; +use std::fmt; use std::fmt::Display; -use std::ops::{AddAssign, Deref}; +use std::ops::{AddAssign, Range}; use std::sync::{Arc, RwLockReadGuard}; use std::time::Duration; use zenith_utils::lsn::{Lsn, RecordLsn}; use zenith_utils::zid::ZTimelineId; -/// Block number within a relish. This matches PostgreSQL's BlockNumber type. -pub type BlockNumber = u32; +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] +/// Key used in the Repository kv-store. +/// +/// The Repository treates this as an opaque struct, but see the code in pgdatadir_mapping.rs +/// for what we actually store in these fields. +pub struct Key { + pub field1: u8, + pub field2: u32, + pub field3: u32, + pub field4: u32, + pub field5: u8, + pub field6: u32, +} + +impl Key { + pub fn next(&self) -> Key { + self.add(1) + } + + pub fn add(&self, x: u32) -> Key { + let mut key = *self; + + let r = key.field6.overflowing_add(x); + key.field6 = r.0; + if r.1 { + let r = key.field5.overflowing_add(1); + key.field5 = r.0; + if r.1 { + let r = key.field4.overflowing_add(1); + key.field4 = r.0; + if r.1 { + let r = key.field3.overflowing_add(1); + key.field3 = r.0; + if r.1 { + let r = key.field2.overflowing_add(1); + key.field2 = r.0; + if r.1 { + let r = key.field1.overflowing_add(1); + key.field1 = r.0; + assert!(!r.1); + } + } + } + } + } + key + } + + pub fn from_array(b: [u8; 18]) -> Self { + Key { + field1: b[0], + field2: u32::from_be_bytes(b[1..5].try_into().unwrap()), + field3: u32::from_be_bytes(b[5..9].try_into().unwrap()), + field4: u32::from_be_bytes(b[9..13].try_into().unwrap()), + field5: b[13], + field6: u32::from_be_bytes(b[14..18].try_into().unwrap()), + } + } +} + +pub fn key_range_size(key_range: &Range) -> u32 { + let start = key_range.start; + let end = key_range.end; + + if end.field1 != start.field1 + || end.field2 != start.field2 + || end.field3 != start.field3 + || end.field4 != start.field4 + { + return u32::MAX; + } + + let start = (start.field5 as u64) << 32 | start.field6 as u64; + let end = (end.field5 as u64) << 32 | end.field6 as u64; + + let diff = end - start; + if diff > u32::MAX as u64 { + u32::MAX + } else { + diff as u32 + } +} + +pub fn singleton_range(key: Key) -> Range { + key..key.next() +} + +impl fmt::Display for Key { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{:02X}{:08X}{:08X}{:08X}{:02X}{:08X}", + self.field1, self.field2, self.field3, self.field4, self.field5, self.field6 + ) + } +} + +impl Key { + pub const MIN: Key = Key { + field1: u8::MIN, + field2: u32::MIN, + field3: u32::MIN, + field4: u32::MIN, + field5: u8::MIN, + field6: u32::MIN, + }; + pub const MAX: Key = Key { + field1: u8::MAX, + field2: u32::MAX, + field3: u32::MAX, + field4: u32::MAX, + field5: 
u8::MAX, + field6: u32::MAX, + }; + + pub fn from_hex(s: &str) -> Result { + if s.len() != 36 { + bail!("parse error"); + } + Ok(Key { + field1: u8::from_str_radix(&s[0..2], 16)?, + field2: u32::from_str_radix(&s[2..10], 16)?, + field3: u32::from_str_radix(&s[10..18], 16)?, + field4: u32::from_str_radix(&s[18..26], 16)?, + field5: u8::from_str_radix(&s[26..28], 16)?, + field6: u32::from_str_radix(&s[28..36], 16)?, + }) + } +} + +/// A 'value' stored for a one Key. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum Value { + /// An Image value contains a full copy of the value + Image(Bytes), + /// A WalRecord value contains a WAL record that needs to be + /// replayed get the full value. Replaying the WAL record + /// might need a previous version of the value (if will_init() + /// returns false), or it may be replayed stand-alone (true). + WalRecord(ZenithWalRecord), +} + +impl Value { + pub fn is_image(&self) -> bool { + matches!(self, Value::Image(_)) + } + + pub fn will_init(&self) -> bool { + match self { + Value::Image(_) => true, + Value::WalRecord(rec) => rec.will_init(), + } + } +} #[derive(Clone, Copy, Debug)] pub enum TimelineSyncStatusUpdate { @@ -37,6 +188,8 @@ impl Display for TimelineSyncStatusUpdate { /// A repository corresponds to one .zenith directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. pub trait Repository: Send + Sync { + type Timeline: Timeline; + /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization. /// See [`crate::remote_storage`] for more details about the synchronization. fn apply_timeline_remote_sync_status_update( @@ -47,14 +200,14 @@ pub trait Repository: Send + Sync { /// Get Timeline handle for given zenith timeline ID. /// This function is idempotent. It doesnt change internal state in any way. - fn get_timeline(&self, timelineid: ZTimelineId) -> Option; + fn get_timeline(&self, timelineid: ZTimelineId) -> Option>; /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded. - fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result>; + fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result>; /// Lists timelines the repository contains. /// Up to repository's implementation to omit certain timelines that ar not considered ready for use. - fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)>; + fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)>; /// Create a new, empty timeline. The caller is responsible for loading data into it /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. @@ -62,11 +215,16 @@ pub trait Repository: Send + Sync { &self, timelineid: ZTimelineId, initdb_lsn: Lsn, - ) -> Result>; + ) -> Result>; /// Branch a timeline fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>; + /// Flush all data to disk. + /// + /// this is used at graceful shutdown. + fn checkpoint(&self) -> Result<()>; + /// perform one garbage collection iteration, removing old data files from disk. /// this function is periodically called by gc thread. /// also it can be explicitly requested through page server api 'do_gc' command. @@ -83,9 +241,9 @@ pub trait Repository: Send + Sync { checkpoint_before_gc: bool, ) -> Result; - /// perform one checkpoint iteration, flushing in-memory data on disk. - /// this function is periodically called by checkponter thread. 
- fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()>; + /// perform one compaction iteration. + /// this function is periodically called by compactor thread. + fn compaction_iteration(&self) -> Result<()>; /// detaches locally available timeline by stopping all threads and removing all the data. fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>; @@ -95,10 +253,10 @@ pub trait Repository: Send + Sync { } /// A timeline, that belongs to the current repository. -pub enum RepositoryTimeline { +pub enum RepositoryTimeline { /// Timeline, with its files present locally in pageserver's working directory. /// Loaded into pageserver's memory and ready to be used. - Loaded(Arc), + Loaded(Arc), /// All the data is available locally, but not loaded into memory, so loading have to be done before actually using the timeline Unloaded { @@ -118,8 +276,8 @@ pub enum LocalTimelineState { Unloaded, } -impl<'a> From<&'a RepositoryTimeline> for LocalTimelineState { - fn from(local_timeline_entry: &'a RepositoryTimeline) -> Self { +impl<'a, T> From<&'a RepositoryTimeline> for LocalTimelineState { + fn from(local_timeline_entry: &'a RepositoryTimeline) -> Self { match local_timeline_entry { RepositoryTimeline::Loaded(_) => LocalTimelineState::Loaded, RepositoryTimeline::Unloaded { .. } => LocalTimelineState::Unloaded, @@ -132,42 +290,22 @@ impl<'a> From<&'a RepositoryTimeline> for LocalTimelineState { /// #[derive(Default)] pub struct GcResult { - pub ondisk_relfiles_total: u64, - pub ondisk_relfiles_needed_by_cutoff: u64, - pub ondisk_relfiles_needed_by_branches: u64, - pub ondisk_relfiles_not_updated: u64, - pub ondisk_relfiles_needed_as_tombstone: u64, - pub ondisk_relfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. - pub ondisk_relfiles_dropped: u64, // # of layer files removed because the relation was dropped - - pub ondisk_nonrelfiles_total: u64, - pub ondisk_nonrelfiles_needed_by_cutoff: u64, - pub ondisk_nonrelfiles_needed_by_branches: u64, - pub ondisk_nonrelfiles_not_updated: u64, - pub ondisk_nonrelfiles_needed_as_tombstone: u64, - pub ondisk_nonrelfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. - pub ondisk_nonrelfiles_dropped: u64, // # of layer files removed because the relation was dropped + pub layers_total: u64, + pub layers_needed_by_cutoff: u64, + pub layers_needed_by_branches: u64, + pub layers_not_updated: u64, + pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. 
pub elapsed: Duration, } impl AddAssign for GcResult { fn add_assign(&mut self, other: Self) { - self.ondisk_relfiles_total += other.ondisk_relfiles_total; - self.ondisk_relfiles_needed_by_cutoff += other.ondisk_relfiles_needed_by_cutoff; - self.ondisk_relfiles_needed_by_branches += other.ondisk_relfiles_needed_by_branches; - self.ondisk_relfiles_not_updated += other.ondisk_relfiles_not_updated; - self.ondisk_relfiles_needed_as_tombstone += other.ondisk_relfiles_needed_as_tombstone; - self.ondisk_relfiles_removed += other.ondisk_relfiles_removed; - self.ondisk_relfiles_dropped += other.ondisk_relfiles_dropped; - - self.ondisk_nonrelfiles_total += other.ondisk_nonrelfiles_total; - self.ondisk_nonrelfiles_needed_by_cutoff += other.ondisk_nonrelfiles_needed_by_cutoff; - self.ondisk_nonrelfiles_needed_by_branches += other.ondisk_nonrelfiles_needed_by_branches; - self.ondisk_nonrelfiles_not_updated += other.ondisk_nonrelfiles_not_updated; - self.ondisk_nonrelfiles_needed_as_tombstone += other.ondisk_nonrelfiles_needed_as_tombstone; - self.ondisk_nonrelfiles_removed += other.ondisk_nonrelfiles_removed; - self.ondisk_nonrelfiles_dropped += other.ondisk_nonrelfiles_dropped; + self.layers_total += other.layers_total; + self.layers_needed_by_cutoff += other.layers_needed_by_cutoff; + self.layers_needed_by_branches += other.layers_needed_by_branches; + self.layers_not_updated += other.layers_not_updated; + self.layers_removed += other.layers_removed; self.elapsed += other.elapsed; } @@ -190,23 +328,14 @@ pub trait Timeline: Send + Sync { fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard; /// Look up given page version. - fn get_page_at_lsn(&self, tag: RelishTag, blknum: BlockNumber, lsn: Lsn) -> Result; - - /// Get size of a relish - fn get_relish_size(&self, tag: RelishTag, lsn: Lsn) -> Result>; - - /// Does relation exist? - fn get_rel_exists(&self, tag: RelishTag, lsn: Lsn) -> Result; - - /// Get a list of all existing relations - /// Pass RelTag to get relation objects or None to get nonrels. - fn list_relishes(&self, tag: Option, lsn: Lsn) -> Result>; - - /// Get a list of all existing relations in given tablespace and database. - fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result>; - - /// Get a list of all existing non-relational objects - fn list_nonrels(&self, lsn: Lsn) -> Result>; + /// + /// NOTE: It is considerd an error to 'get' a key that doesn't exist. The abstraction + /// above this needs to store suitable metadata to track what data exists with + /// what keys, in separate metadata entries. If a non-existent key is requested, + /// the Repository implementation may incorrectly return a value from an ancestore + /// branch, for exampel, or waste a lot of cycles chasing the non-existing key. + /// + fn get(&self, key: Key, lsn: Lsn) -> Result; /// Get the ancestor's timeline id fn get_ancestor_timeline_id(&self) -> Option; @@ -219,7 +348,6 @@ pub trait Timeline: Send + Sync { // // These are called by the WAL receiver to digest WAL records. //------------------------------------------------------------------------------ - /// Atomically get both last and prev. fn get_last_record_rlsn(&self) -> RecordLsn; @@ -231,6 +359,10 @@ pub trait Timeline: Send + Sync { fn get_disk_consistent_lsn(&self) -> Lsn; /// Mutate the timeline with a [`TimelineWriter`]. + /// + /// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter + /// is a generic type in this trait. 
But that doesn't currently work in + /// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html fn writer<'a>(&'a self) -> Box; /// @@ -240,6 +372,19 @@ pub trait Timeline: Send + Sync { /// know anything about them here in the repository. fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>; + /// + /// Tell the implementation how the keyspace should be partitioned. + /// + /// FIXME: This is quite a hack. The code in pgdatadir_mapping.rs knows + /// which keys exist and what is the logical grouping of them. That's why + /// the code there (and in keyspace.rs) decides the partitioning, not the + /// layered_repository.rs implementation. That's a layering violation: + /// the Repository implementation ought to be responsible for the physical + /// layout, but currently it's more convenient to do it in pgdatadir_mapping.rs + /// rather than in layered_repository.rs. + /// + fn hint_partitioning(&self, partitioning: KeyPartitioning, lsn: Lsn) -> Result<()>; + /// /// Check that it is valid to request operations with that lsn. fn check_lsn_is_in_scope( @@ -247,107 +392,39 @@ pub trait Timeline: Send + Sync { lsn: Lsn, latest_gc_cutoff_lsn: &RwLockReadGuard, ) -> Result<()>; - - /// Retrieve current logical size of the timeline - /// - /// NOTE: counted incrementally, includes ancestors, - /// doesnt support TwoPhase relishes yet - fn get_current_logical_size(&self) -> usize; - - /// Does the same as get_current_logical_size but counted on demand. - /// Used in tests to ensure that incremental and non incremental variants match. - fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result; - - /// An escape hatch to allow "casting" a generic Timeline to LayeredTimeline. - fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline; } /// Various functions to mutate the timeline. // TODO Currently, Deref is used to allow easy access to read methods from this trait. // This is probably considered a bad practice in Rust and should be fixed eventually, // but will cause large code changes. -pub trait TimelineWriter: Deref { +pub trait TimelineWriter<'a> { /// Put a new page version that can be constructed from a WAL record /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. - fn put_wal_record( - &self, - lsn: Lsn, - tag: RelishTag, - blknum: BlockNumber, - rec: ZenithWalRecord, - ) -> Result<()>; + fn put(&self, key: Key, lsn: Lsn, value: Value) -> Result<()>; - /// Like put_wal_record, but with ready-made image of the page. - fn put_page_image( - &self, - tag: RelishTag, - blknum: BlockNumber, - lsn: Lsn, - img: Bytes, - ) -> Result<()>; + fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()>; - /// Truncate relation - fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: BlockNumber) -> Result<()>; - - /// This method is used for marking dropped relations and truncated SLRU files and aborted two phase records - fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>; - - /// Track end of the latest digested WAL record. + /// Track the end of the latest digested WAL record. /// - /// Advance requires aligned LSN as an argument and would wake wait_lsn() callers. - /// Previous last record LSN is stored alongside the latest and can be read. - fn advance_last_record_lsn(&self, lsn: Lsn); -} - -/// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper -/// around a PostgreSQL WAL record, or a custom zenith-specific "record". 
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub enum ZenithWalRecord { - /// Native PostgreSQL WAL record - Postgres { will_init: bool, rec: Bytes }, - - /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear) - ClearVisibilityMapFlags { - new_heap_blkno: Option, - old_heap_blkno: Option, - flags: u8, - }, - /// Mark transaction IDs as committed on a CLOG page - ClogSetCommitted { xids: Vec }, - /// Mark transaction IDs as aborted on a CLOG page - ClogSetAborted { xids: Vec }, - /// Extend multixact offsets SLRU - MultixactOffsetCreate { - mid: MultiXactId, - moff: MultiXactOffset, - }, - /// Extend multixact members SLRU. - MultixactMembersCreate { - moff: MultiXactOffset, - members: Vec, - }, -} - -impl ZenithWalRecord { - /// Does replaying this WAL record initialize the page from scratch, or does - /// it need to be applied over the previous image of the page? - pub fn will_init(&self) -> bool { - match self { - ZenithWalRecord::Postgres { will_init, rec: _ } => *will_init, - - // None of the special zenith record types currently initialize the page - _ => false, - } - } + /// Call this after you have finished writing all the WAL up to 'lsn'. + /// + /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for + /// the 'lsn' or anything older. The previous last record LSN is stored alongside + /// the latest and can be read. + fn finish_write(&self, lsn: Lsn); } #[cfg(test)] pub mod repo_harness { use bytes::BytesMut; + use lazy_static::lazy_static; + use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::{fs, path::PathBuf}; + use crate::RepositoryImpl; use crate::{ config::PageServerConf, layered_repository::LayeredRepository, @@ -368,18 +445,39 @@ pub mod repo_harness { pub fn TEST_IMG(s: &str) -> Bytes { let mut buf = BytesMut::new(); buf.extend_from_slice(s.as_bytes()); - buf.resize(8192, 0); + buf.resize(64, 0); buf.freeze() } - pub struct RepoHarness { - pub conf: &'static PageServerConf, - pub tenant_id: ZTenantId, + lazy_static! 
{ + static ref LOCK: RwLock<()> = RwLock::new(()); } - impl RepoHarness { + pub struct RepoHarness<'a> { + pub conf: &'static PageServerConf, + pub tenant_id: ZTenantId, + + pub lock_guard: ( + Option>, + Option>, + ), + } + + impl<'a> RepoHarness<'a> { pub fn create(test_name: &'static str) -> Result { + Self::create_internal(test_name, false) + } + pub fn create_exclusive(test_name: &'static str) -> Result { + Self::create_internal(test_name, true) + } + fn create_internal(test_name: &'static str, exclusive: bool) -> Result { + let lock_guard = if exclusive { + (None, Some(LOCK.write().unwrap())) + } else { + (Some(LOCK.read().unwrap()), None) + }; + let repo_dir = PageServerConf::test_repo_dir(test_name); let _ = fs::remove_dir_all(&repo_dir); fs::create_dir_all(&repo_dir)?; @@ -393,23 +491,27 @@ pub mod repo_harness { fs::create_dir_all(conf.tenant_path(&tenant_id))?; fs::create_dir_all(conf.timelines_path(&tenant_id))?; - Ok(Self { conf, tenant_id }) + Ok(Self { + conf, + tenant_id, + lock_guard, + }) } - pub fn load(&self) -> Box { + pub fn load(&self) -> RepositoryImpl { self.try_load().expect("failed to load test repo") } - pub fn try_load(&self) -> Result> { + pub fn try_load(&self) -> Result { let walredo_mgr = Arc::new(TestRedoManager); - let repo = Box::new(LayeredRepository::new( + let repo = LayeredRepository::new( self.conf, walredo_mgr, self.tenant_id, RemoteIndex::empty(), false, - )); + ); // populate repo with locally available timelines for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) .expect("should be able to read timelines dir") @@ -438,21 +540,19 @@ pub mod repo_harness { } // Mock WAL redo manager that doesn't do much - struct TestRedoManager; + pub struct TestRedoManager; impl WalRedoManager for TestRedoManager { fn request_redo( &self, - rel: RelishTag, - blknum: BlockNumber, + key: Key, lsn: Lsn, base_img: Option, records: Vec<(Lsn, ZenithWalRecord)>, ) -> Result { let s = format!( - "redo for {} blk {} to get to {}, with {} and {} records", - rel, - blknum, + "redo for {} to get to {}, with {} and {} records", + key, lsn, if base_img.is_some() { "base image" @@ -462,6 +562,7 @@ pub mod repo_harness { records.len() ); println!("{}", s); + Ok(TEST_IMG(&s)) } } @@ -475,411 +576,43 @@ pub mod repo_harness { mod tests { use super::repo_harness::*; use super::*; - use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT}; - use std::fs; + //use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT}; + //use std::sync::Arc; + use bytes::BytesMut; + use hex_literal::hex; + use lazy_static::lazy_static; - /// Arbitrary relation tag, for testing. - const TESTREL_A_REL_TAG: RelTag = RelTag { - spcnode: 0, - dbnode: 111, - relnode: 1000, - forknum: 0, - }; - const TESTREL_A: RelishTag = RelishTag::Relation(TESTREL_A_REL_TAG); - const TESTREL_B: RelishTag = RelishTag::Relation(RelTag { - spcnode: 0, - dbnode: 111, - relnode: 1001, - forknum: 0, - }); - - fn assert_current_logical_size(timeline: &Arc, lsn: Lsn) { - let incremental = timeline.get_current_logical_size(); - let non_incremental = timeline - .get_current_logical_size_non_incremental(lsn) - .unwrap(); - assert_eq!(incremental, non_incremental); + lazy_static! 
{ + static ref TEST_KEY: Key = Key::from_array(hex!("112222222233333333444444445500000001")); } - static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); - static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - #[test] - fn test_relsize() -> Result<()> { - let repo = RepoHarness::create("test_relsize")?.load(); - // get_timeline() with non-existent timeline id should fail - //repo.get_timeline("11223344556677881122334455667788"); - - // Create timeline to work on + fn test_basic() -> Result<()> { + let repo = RepoHarness::create("test_basic")?.load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let writer = tline.writer(); + writer.put(*TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?; + writer.finish_write(Lsn(0x10)); + drop(writer); - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?; - writer.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?; - writer.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?; + let writer = tline.writer(); + writer.put(*TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?; + writer.finish_write(Lsn(0x20)); + drop(writer); - writer.advance_last_record_lsn(Lsn(0x50)); - - assert_current_logical_size(&tline, Lsn(0x50)); - - // The relation was created at LSN 2, not visible at LSN 1 yet. - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); - assert!(tline.get_relish_size(TESTREL_A, Lsn(0x10))?.is_none()); - - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x50))?.unwrap(), 3); - - // Check page contents at each LSN - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x20))?, - TEST_IMG("foo blk 0 at 2") - ); - - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x30))?, - TEST_IMG("foo blk 0 at 3") - ); - - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x40))?, - TEST_IMG("foo blk 0 at 3") - ); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, - TEST_IMG("foo blk 1 at 4") - ); - - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x50))?, - TEST_IMG("foo blk 0 at 3") - ); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, - TEST_IMG("foo blk 1 at 4") - ); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 2, Lsn(0x50))?, - TEST_IMG("foo blk 2 at 5") - ); - - // Truncate last block - writer.put_truncation(TESTREL_A, Lsn(0x60), 2)?; - writer.advance_last_record_lsn(Lsn(0x60)); - assert_current_logical_size(&tline, Lsn(0x60)); - - // Check reported size and contents after truncation - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 2); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x60))?, - TEST_IMG("foo blk 0 at 3") - ); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, - TEST_IMG("foo blk 1 at 4") - ); - - // should still see the truncated block with older LSN - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x50))?.unwrap(), 3); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 2, Lsn(0x50))?, - TEST_IMG("foo blk 2 at 5") - ); - - // Truncate to zero length - writer.put_truncation(TESTREL_A, Lsn(0x68), 0)?; - writer.advance_last_record_lsn(Lsn(0x68)); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x68))?.unwrap(), 0); - - // Extend from 
0 to 2 blocks, leaving a gap - writer.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?; - writer.advance_last_record_lsn(Lsn(0x70)); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x70))?.unwrap(), 2); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, ZERO_PAGE); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x70))?, - TEST_IMG("foo blk 1") - ); - - // Extend a lot more, leaving a big gap that spans across segments - // FIXME: This is currently broken, see https://github.com/zenithdb/zenith/issues/500 - /* - tline.put_page_image(TESTREL_A, 1500, Lsn(0x80), TEST_IMG("foo blk 1500"))?; - tline.advance_last_record_lsn(Lsn(0x80)); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x80))?.unwrap(), 1501); - for blk in 2..1500 { - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blk, Lsn(0x80))?, - ZERO_PAGE); - } - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1500, Lsn(0x80))?, - TEST_IMG("foo blk 1500")); - */ + assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); Ok(()) } - // Test what happens if we dropped a relation - // and then created it again within the same layer. - #[test] - fn test_drop_extend() -> Result<()> { - let repo = RepoHarness::create("test_drop_extend")?.load(); - - // Create timeline to work on - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - writer.advance_last_record_lsn(Lsn(0x20)); - - // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1); - - // Drop relish - writer.drop_relish(TESTREL_A, Lsn(0x30))?; - writer.advance_last_record_lsn(Lsn(0x30)); - - // Check that rel is not visible anymore - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false); - assert!(tline.get_relish_size(TESTREL_A, Lsn(0x30))?.is_none()); - - // Extend it again - writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?; - writer.advance_last_record_lsn(Lsn(0x40)); - - // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x40))?.unwrap(), 1); - - Ok(()) - } - - // Test what happens if we truncated a relation - // so that one of its segments was dropped - // and then extended it again within the same layer. - #[test] - fn test_truncate_extend() -> Result<()> { - let repo = RepoHarness::create("test_truncate_extend")?.load(); - - // Create timeline to work on - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - - //from storage_layer.rs - const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192; - let relsize = RELISH_SEG_SIZE * 2; - - // Create relation with relsize blocks - for blkno in 0..relsize { - let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); - writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?; - } - - writer.advance_last_record_lsn(Lsn(0x20)); - - // The relation was created at LSN 2, not visible at LSN 1 yet. 
- assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); - assert!(tline.get_relish_size(TESTREL_A, Lsn(0x10))?.is_none()); - - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), - relsize - ); - - // Check relation content - for blkno in 0..relsize { - let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blkno, lsn)?, - TEST_IMG(&data) - ); - } - - // Truncate relation so that second segment was dropped - // - only leave one page - writer.put_truncation(TESTREL_A, Lsn(0x60), 1)?; - writer.advance_last_record_lsn(Lsn(0x60)); - - // Check reported size and contents after truncation - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 1); - - for blkno in 0..1 { - let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x60))?, - TEST_IMG(&data) - ); - } - - // should still see all blocks with older LSN - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(0x50))?.unwrap(), - relsize - ); - for blkno in 0..relsize { - let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x50))?, - TEST_IMG(&data) - ); - } - - // Extend relation again. - // Add enough blocks to create second segment - for blkno in 0..relsize { - let lsn = Lsn(0x80); - let data = format!("foo blk {} at {}", blkno, lsn); - writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?; - } - writer.advance_last_record_lsn(Lsn(0x80)); - - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(0x80))?.unwrap(), - relsize - ); - // Check relation content - for blkno in 0..relsize { - let lsn = Lsn(0x80); - let data = format!("foo blk {} at {}", blkno, lsn); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x80))?, - TEST_IMG(&data) - ); - } - - Ok(()) - } - - /// Test get_relsize() and truncation with a file larger than 1 GB, so that it's - /// split into multiple 1 GB segments in Postgres. 
- #[test] - fn test_large_rel() -> Result<()> { - let repo = RepoHarness::create("test_large_rel")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - - let mut lsn = 0x10; - for blknum in 0..pg_constants::RELSEG_SIZE + 1 { - lsn += 0x10; - let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); - writer.put_page_image(TESTREL_A, blknum as BlockNumber, Lsn(lsn), img)?; - } - writer.advance_last_record_lsn(Lsn(lsn)); - - assert_current_logical_size(&tline, Lsn(lsn)); - - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), - pg_constants::RELSEG_SIZE + 1 - ); - - // Truncate one block - lsn += 0x10; - writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?; - writer.advance_last_record_lsn(Lsn(lsn)); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), - pg_constants::RELSEG_SIZE - ); - assert_current_logical_size(&tline, Lsn(lsn)); - - // Truncate another block - lsn += 0x10; - writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?; - writer.advance_last_record_lsn(Lsn(lsn)); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), - pg_constants::RELSEG_SIZE - 1 - ); - assert_current_logical_size(&tline, Lsn(lsn)); - - // Truncate to 1500, and then truncate all the way down to 0, one block at a time - // This tests the behavior at segment boundaries - let mut size: i32 = 3000; - while size >= 0 { - lsn += 0x10; - writer.put_truncation(TESTREL_A, Lsn(lsn), size as BlockNumber)?; - writer.advance_last_record_lsn(Lsn(lsn)); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), - size as BlockNumber - ); - - size -= 1; - } - assert_current_logical_size(&tline, Lsn(lsn)); - - Ok(()) - } - - /// - /// Test list_rels() function, with branches and dropped relations - /// - #[test] - fn test_list_rels_drop() -> Result<()> { - let repo = RepoHarness::create("test_list_rels_drop")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - const TESTDB: u32 = 111; - - // Import initial dummy checkpoint record, otherwise the get_timeline() call - // after branching fails below - writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?; - - // Create a relation on the timeline - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - - writer.advance_last_record_lsn(Lsn(0x30)); - - // Check that list_rels() lists it after LSN 2, but no before it - assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A)); - assert!(tline.list_rels(0, TESTDB, Lsn(0x20))?.contains(&TESTREL_A)); - assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A)); - - // Create a branch, check that the relation is visible there - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; - let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) - .expect("Should have a local timeline"); - let new_writer = newtline.writer(); - - assert!(newtline - .list_rels(0, TESTDB, Lsn(0x30))? - .contains(&TESTREL_A)); - - // Drop it on the branch - new_writer.drop_relish(TESTREL_A, Lsn(0x40))?; - new_writer.advance_last_record_lsn(Lsn(0x40)); - - drop(new_writer); - - // Check that it's no longer listed on the branch after the point where it was dropped - assert!(newtline - .list_rels(0, TESTDB, Lsn(0x30))? - .contains(&TESTREL_A)); - assert!(!newtline - .list_rels(0, TESTDB, Lsn(0x40))? 
- .contains(&TESTREL_A)); - - // Run checkpoint and garbage collection and check that it's still not visible - newtline.checkpoint(CheckpointConfig::Forced)?; - repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?; - - assert!(!newtline - .list_rels(0, TESTDB, Lsn(0x40))? - .contains(&TESTREL_A)); - - Ok(()) + /// Convenience function to create a page image with given string as the only content + pub fn test_value(s: &str) -> Value { + let mut buf = BytesMut::new(); + buf.extend_from_slice(s.as_bytes()); + Value::Image(buf.freeze()) } /// @@ -890,21 +623,24 @@ mod tests { let repo = RepoHarness::create("test_branch")?.load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let writer = tline.writer(); + use std::str::from_utf8; - // Import initial dummy checkpoint record, otherwise the get_timeline() call - // after branching fails below - writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?; + #[allow(non_snake_case)] + let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); + #[allow(non_snake_case)] + let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap(); - // Create a relation on the timeline - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?; - writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?; + // Insert a value on the timeline + writer.put(TEST_KEY_A, Lsn(0x20), test_value("foo at 0x20"))?; + writer.put(TEST_KEY_B, Lsn(0x20), test_value("foobar at 0x20"))?; + writer.finish_write(Lsn(0x20)); - // Create another relation - writer.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?; + writer.put(TEST_KEY_A, Lsn(0x30), test_value("foo at 0x30"))?; + writer.finish_write(Lsn(0x30)); + writer.put(TEST_KEY_A, Lsn(0x40), test_value("foo at 0x40"))?; + writer.finish_write(Lsn(0x40)); - writer.advance_last_record_lsn(Lsn(0x40)); - assert_current_logical_size(&tline, Lsn(0x40)); + //assert_current_logical_size(&tline, Lsn(0x40)); // Branch the history, modify relation differently on the new timeline repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; @@ -912,71 +648,65 @@ mod tests { .get_timeline_load(NEW_TIMELINE_ID) .expect("Should have a local timeline"); let new_writer = newtline.writer(); - - new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?; - new_writer.advance_last_record_lsn(Lsn(0x40)); + new_writer.put(TEST_KEY_A, Lsn(0x40), test_value("bar at 0x40"))?; + new_writer.finish_write(Lsn(0x40)); // Check page contents on both branches assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x40))?, - TEST_IMG("foo blk 0 at 4") + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?, + "foo at 0x40" ); - assert_eq!( - newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x40))?, - TEST_IMG("bar blk 0 at 4") + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?, + "bar at 0x40" ); - assert_eq!( - newtline.get_page_at_lsn(TESTREL_B, 0, Lsn(0x40))?, - TEST_IMG("foobar blk 0 at 2") + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?, + "foobar at 0x20" ); - assert_eq!(newtline.get_relish_size(TESTREL_B, Lsn(0x40))?.unwrap(), 1); - - assert_current_logical_size(&tline, Lsn(0x40)); + //assert_current_logical_size(&tline, Lsn(0x40)); Ok(()) } - fn make_some_layers(tline: &Arc, start_lsn: Lsn) -> Result<()> { + fn make_some_layers(tline: &T, start_lsn: Lsn) -> Result<()> { let mut lsn = start_lsn; + #[allow(non_snake_case)] 
{ let writer = tline.writer(); // Create a relation on the timeline - writer.put_page_image( - TESTREL_A, - 0, + writer.put( + *TEST_KEY, lsn, - TEST_IMG(&format!("foo blk 0 at {}", lsn)), + Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; + writer.finish_write(lsn); lsn += 0x10; - writer.put_page_image( - TESTREL_A, - 0, + writer.put( + *TEST_KEY, lsn, - TEST_IMG(&format!("foo blk 0 at {}", lsn)), + Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; - writer.advance_last_record_lsn(lsn); + writer.finish_write(lsn); + lsn += 0x10; } tline.checkpoint(CheckpointConfig::Forced)?; { let writer = tline.writer(); - lsn += 0x10; - writer.put_page_image( - TESTREL_A, - 0, + writer.put( + *TEST_KEY, lsn, - TEST_IMG(&format!("foo blk 0 at {}", lsn)), + Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; + writer.finish_write(lsn); lsn += 0x10; - writer.put_page_image( - TESTREL_A, - 0, + writer.put( + *TEST_KEY, lsn, - TEST_IMG(&format!("foo blk 0 at {}", lsn)), + Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; - writer.advance_last_record_lsn(lsn); + writer.finish_write(lsn); } tline.checkpoint(CheckpointConfig::Forced) } @@ -985,11 +715,13 @@ mod tests { fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> { let repo = RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(&tline, Lsn(0x20))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 + // FIXME: this doesn't actually remove any layer currently, given how the checkpointing + // and compaction works. But it does set the 'cutoff' point so that the cross check + // below should fail. repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; // try to branch at lsn 25, should fail because we already garbage collected the data @@ -1029,32 +761,35 @@ mod tests { Ok(()) } + /* + // FIXME: This currently fails to error out. Calling GC doesn't currently + // remove the old value, we'd need to work a little harder #[test] - fn test_prohibit_get_page_at_lsn_for_garbage_collected_pages() -> Result<()> { + fn test_prohibit_get_for_garbage_collected_data() -> Result<()> { let repo = - RepoHarness::create("test_prohibit_get_page_at_lsn_for_garbage_collected_pages")? - .load(); + RepoHarness::create("test_prohibit_get_for_garbage_collected_data")? 
+ .load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(&tline, Lsn(0x20))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); - match tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)) { + match tline.get(*TEST_KEY, Lsn(0x25)) { Ok(_) => panic!("request for page should have failed"), Err(err) => assert!(err.to_string().contains("not found at")), } Ok(()) } + */ #[test] fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { let repo = RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - make_some_layers(&tline, Lsn(0x20))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; let newtline = repo @@ -1062,92 +797,31 @@ mod tests { .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - assert!(newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)).is_ok()); + assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); Ok(()) } - #[test] fn test_parent_keeps_data_forever_after_branching() -> Result<()> { - let harness = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?; - let repo = harness.load(); + let repo = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - make_some_layers(&tline, Lsn(0x20))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; let newtline = repo .get_timeline_load(NEW_TIMELINE_ID) .expect("Should have a local timeline"); - make_some_layers(&newtline, Lsn(0x60))?; + make_some_layers(newtline.as_ref(), Lsn(0x60))?; // run gc on parent repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - // check that the layer in parent before the branching point is still there - let tline_dir = harness.conf.timeline_path(&TIMELINE_ID, &harness.tenant_id); - - let expected_image_layer_path = tline_dir.join(format!( - "rel_{}_{}_{}_{}_{}_{:016X}_{:016X}", - TESTREL_A_REL_TAG.spcnode, - TESTREL_A_REL_TAG.dbnode, - TESTREL_A_REL_TAG.relnode, - TESTREL_A_REL_TAG.forknum, - 0, // seg is 0 - 0x20, - 0x30, - )); - assert!(fs::metadata(&expected_image_layer_path).is_ok()); - - Ok(()) - } - - #[test] - fn test_read_beyond_eof() -> Result<()> { - let harness = RepoHarness::create("test_read_beyond_eof")?; - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - make_some_layers(&tline, Lsn(0x20))?; - { - let writer = tline.writer(); - writer.put_page_image( - TESTREL_A, - 0, - Lsn(0x60), - TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x50))), - )?; - writer.advance_last_record_lsn(Lsn(0x60)); - } - - // Test read before rel creation. Should error out. - assert!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x10)).is_err()); - - // Read block beyond end of relation at different points in time. - // These reads should fall into different delta, image, and in-memory layers. 
- assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x20))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x25))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x30))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x35))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x45))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x55))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, ZERO_PAGE); - - // Test on an in-memory layer with no preceding layer - { - let writer = tline.writer(); - writer.put_page_image( - TESTREL_B, - 0, - Lsn(0x70), - TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x70))), - )?; - writer.advance_last_record_lsn(Lsn(0x70)); - } - assert_eq!(tline.get_page_at_lsn(TESTREL_B, 1, Lsn(0x70))?, ZERO_PAGE); + // Check that the data is still accessible on the branch. + assert_eq!( + newtline.get(*TEST_KEY, Lsn(0x50))?, + TEST_IMG(&format!("foo at {}", Lsn(0x40))) + ); Ok(()) } @@ -1159,7 +833,7 @@ mod tests { { let repo = harness.load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; - make_some_layers(&tline, Lsn(0x8000))?; + make_some_layers(tline.as_ref(), Lsn(0x8000))?; tline.checkpoint(CheckpointConfig::Forced)?; } @@ -1188,7 +862,7 @@ mod tests { let repo = harness.load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(&tline, Lsn(0x20))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; tline.checkpoint(CheckpointConfig::Forced)?; repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; @@ -1197,7 +871,7 @@ mod tests { .get_timeline_load(NEW_TIMELINE_ID) .expect("Should have a local timeline"); - make_some_layers(&newtline, Lsn(0x60))?; + make_some_layers(newtline.as_ref(), Lsn(0x60))?; tline.checkpoint(CheckpointConfig::Forced)?; } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index e7cc4ecbaf..aeff718803 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -4,13 +4,13 @@ use crate::config::PageServerConf; use crate::layered_repository::LayeredRepository; use crate::remote_storage::RemoteIndex; -use crate::repository::{Repository, Timeline, TimelineSyncStatusUpdate}; +use crate::repository::{Repository, TimelineSyncStatusUpdate}; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; use crate::timelines; use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; -use crate::CheckpointConfig; +use crate::{DatadirTimelineImpl, RepositoryImpl}; use anyhow::{Context, Result}; use lazy_static::lazy_static; use log::*; @@ -28,7 +28,9 @@ lazy_static! { struct Tenant { state: TenantState, - repo: Arc, + repo: Arc, + + timelines: HashMap>, } #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -67,14 +69,14 @@ pub fn load_local_repo( conf: &'static PageServerConf, tenant_id: ZTenantId, remote_index: &RemoteIndex, -) -> Arc { +) -> Arc { let mut m = access_tenants(); let tenant = m.entry(tenant_id).or_insert_with(|| { // Set up a WAL redo manager, for applying WAL records. let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); // Set up an object repository, for actual data storage. 
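[Editor's note: the `Tenant` struct in this hunk gains a `timelines` map next to the repository, so the `DatadirTimelineImpl` wrapper for a timeline is built once and then reused by `get_timeline_for_tenant_load` further down. A minimal, self-contained sketch of that get-or-insert pattern follows; `TimelineId`, `CachedTimeline` and `get_or_load` are simplified stand-ins, not the real pageserver types.]

use std::collections::HashMap;
use std::sync::Arc;

// Placeholders for ZTimelineId and Arc<DatadirTimelineImpl>; illustration only.
type TimelineId = u64;
struct CachedTimeline;

struct TenantEntry {
    timelines: HashMap<TimelineId, Arc<CachedTimeline>>,
}

impl TenantEntry {
    /// Return the cached wrapper if it exists; otherwise build it once,
    /// remember it, and hand out clones of the same Arc afterwards.
    fn get_or_load(&mut self, id: TimelineId) -> Arc<CachedTimeline> {
        if let Some(tline) = self.timelines.get(&id) {
            return Arc::clone(tline);
        }
        // Stands in for loading the repository timeline and wrapping it in
        // DatadirTimelineImpl::new(..) followed by init_logical_size().
        let tline = Arc::new(CachedTimeline);
        self.timelines.insert(id, Arc::clone(&tline));
        tline
    }
}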
- let repo: Arc = Arc::new(LayeredRepository::new( + let repo: Arc = Arc::new(LayeredRepository::new( conf, Arc::new(walredo_mgr), tenant_id, @@ -84,6 +86,7 @@ pub fn load_local_repo( Tenant { state: TenantState::Idle, repo, + timelines: HashMap::new(), } }); Arc::clone(&tenant.repo) @@ -138,7 +141,7 @@ pub fn shutdown_all_tenants() { thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiver), None, None); thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None); - thread_mgr::shutdown_threads(Some(ThreadKind::Checkpointer), None, None); + thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), None, None); // Ok, no background threads running anymore. Flush any remaining data in // memory to disk. @@ -152,7 +155,7 @@ pub fn shutdown_all_tenants() { debug!("shutdown tenant {}", tenantid); match get_repository_for_tenant(tenantid) { Ok(repo) => { - if let Err(err) = repo.checkpoint_iteration(CheckpointConfig::Flush) { + if let Err(err) = repo.checkpoint() { error!( "Could not checkpoint tenant {} during shutdown: {:?}", tenantid, err @@ -192,6 +195,7 @@ pub fn create_tenant_repository( v.insert(Tenant { state: TenantState::Idle, repo, + timelines: HashMap::new(), }); Ok(Some(tenantid)) } @@ -203,7 +207,7 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option { } /// -/// Change the state of a tenant to Active and launch its checkpointer and GC +/// Change the state of a tenant to Active and launch its compactor and GC /// threads. If the tenant was already in Active state or Stopping, does nothing. /// pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Result<()> { @@ -218,15 +222,15 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> R // If the tenant is already active, nothing to do. TenantState::Active => {} - // If it's Idle, launch the checkpointer and GC threads + // If it's Idle, launch the compactor and GC threads TenantState::Idle => { thread_mgr::spawn( - ThreadKind::Checkpointer, + ThreadKind::Compactor, Some(tenant_id), None, - "Checkpointer thread", + "Compactor thread", true, - move || crate::tenant_threads::checkpoint_loop(tenant_id, conf), + move || crate::tenant_threads::compact_loop(tenant_id, conf), )?; let gc_spawn_result = thread_mgr::spawn( @@ -244,7 +248,7 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> R "Failed to start GC thread for tenant {}, stopping its checkpointer thread: {:?}", tenant_id, e ); - thread_mgr::shutdown_threads(Some(ThreadKind::Checkpointer), Some(tenant_id), None); + thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None); return gc_spawn_result; } @@ -258,7 +262,7 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> R Ok(()) } -pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result> { +pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result> { let m = access_tenants(); let tenant = m .get(&tenantid) @@ -271,10 +275,27 @@ pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result Result> { - get_repository_for_tenant(tenantid)? +) -> Result> { + let mut m = access_tenants(); + let tenant = m + .get_mut(&tenantid) + .with_context(|| format!("Tenant {} not found", tenantid))?; + + if let Some(page_tline) = tenant.timelines.get(&timelineid) { + return Ok(Arc::clone(page_tline)); + } + // First access to this timeline. 
Create a DatadirTimeline wrapper for it + let tline = tenant + .repo .get_timeline_load(timelineid) - .with_context(|| format!("Timeline {} not found for tenant {}", timelineid, tenantid)) + .with_context(|| format!("Timeline {} not found for tenant {}", timelineid, tenantid))?; + + let repartition_distance = tenant.repo.conf.checkpoint_distance / 10; + + let page_tline = Arc::new(DatadirTimelineImpl::new(tline, repartition_distance)); + page_tline.init_logical_size()?; + tenant.timelines.insert(timelineid, Arc::clone(&page_tline)); + Ok(page_tline) } #[serde_as] diff --git a/pageserver/src/tenant_threads.rs b/pageserver/src/tenant_threads.rs index c370eb61c8..0d9a94cc5b 100644 --- a/pageserver/src/tenant_threads.rs +++ b/pageserver/src/tenant_threads.rs @@ -1,34 +1,42 @@ //! This module contains functions to serve per-tenant background processes, -//! such as checkpointer and GC +//! such as compaction and GC use crate::config::PageServerConf; +use crate::repository::Repository; use crate::tenant_mgr; use crate::tenant_mgr::TenantState; -use crate::CheckpointConfig; use anyhow::Result; use std::time::Duration; use tracing::*; use zenith_utils::zid::ZTenantId; /// -/// Checkpointer thread's main loop +/// Compaction thread's main loop /// -pub fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> { +pub fn compact_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> { + if let Err(err) = compact_loop_ext(tenantid, conf) { + error!("compact loop terminated with error: {:?}", err); + Err(err) + } else { + Ok(()) + } +} + +fn compact_loop_ext(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> { loop { if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { break; } - std::thread::sleep(conf.checkpoint_period); - trace!("checkpointer thread for tenant {} waking up", tenantid); + std::thread::sleep(conf.compaction_period); + trace!("compaction thread for tenant {} waking up", tenantid); - // checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE - // bytes of WAL since last checkpoint. + // Compact timelines let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - repo.checkpoint_iteration(CheckpointConfig::Distance(conf.checkpoint_distance))?; + repo.compaction_iteration()?; } trace!( - "checkpointer thread stopped for tenant {} state is {:?}", + "compaction thread stopped for tenant {} state is {:?}", tenantid, tenant_mgr::get_tenant_state(tenantid) ); diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index cafdc5e700..4484bb1db1 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -94,13 +94,16 @@ pub enum ThreadKind { // Thread that connects to a safekeeper to fetch WAL for one timeline. WalReceiver, - // Thread that handles checkpointing of all timelines for a tenant. - Checkpointer, + // Thread that handles compaction of all timelines for a tenant. + Compactor, // Thread that handles GC of a tenant GarbageCollector, - // Thread for synchronizing pageserver relish data with the remote storage. + // Thread that flushes frozen in-memory layers to disk + LayerFlushThread, + + // Thread for synchronizing pageserver layer files with the remote storage. // Shared by all tenants. 
StorageSync, } diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 53c4124701..105c3c869f 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -23,6 +23,7 @@ use crate::{ layered_repository::metadata::TimelineMetadata, remote_storage::RemoteIndex, repository::{LocalTimelineState, Repository}, + DatadirTimeline, RepositoryImpl, }; use crate::{import_datadir, LOG_FILE_NAME}; use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager}; @@ -48,26 +49,26 @@ pub struct LocalTimelineInfo { } impl LocalTimelineInfo { - pub fn from_loaded_timeline( - timeline: &dyn Timeline, + pub fn from_loaded_timeline( + datadir_tline: &DatadirTimeline, include_non_incremental_logical_size: bool, ) -> anyhow::Result { - let last_record_lsn = timeline.get_last_record_lsn(); + let last_record_lsn = datadir_tline.tline.get_last_record_lsn(); let info = LocalTimelineInfo { - ancestor_timeline_id: timeline.get_ancestor_timeline_id(), + ancestor_timeline_id: datadir_tline.tline.get_ancestor_timeline_id(), ancestor_lsn: { - match timeline.get_ancestor_lsn() { + match datadir_tline.tline.get_ancestor_lsn() { Lsn(0) => None, lsn @ Lsn(_) => Some(lsn), } }, - disk_consistent_lsn: timeline.get_disk_consistent_lsn(), + disk_consistent_lsn: datadir_tline.tline.get_disk_consistent_lsn(), last_record_lsn, - prev_record_lsn: Some(timeline.get_prev_record_lsn()), + prev_record_lsn: Some(datadir_tline.tline.get_prev_record_lsn()), timeline_state: LocalTimelineState::Loaded, - current_logical_size: Some(timeline.get_current_logical_size()), + current_logical_size: Some(datadir_tline.get_current_logical_size()), current_logical_size_non_incremental: if include_non_incremental_logical_size { - Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?) + Some(datadir_tline.get_current_logical_size_non_incremental(last_record_lsn)?) } else { None }, @@ -93,17 +94,19 @@ impl LocalTimelineInfo { } } - pub fn from_repo_timeline( - repo_timeline: RepositoryTimeline, + pub fn from_repo_timeline( + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + repo_timeline: &RepositoryTimeline, include_non_incremental_logical_size: bool, ) -> anyhow::Result { match repo_timeline { - RepositoryTimeline::Loaded(timeline) => { - Self::from_loaded_timeline(timeline.as_ref(), include_non_incremental_logical_size) - } - RepositoryTimeline::Unloaded { metadata } => { - Ok(Self::from_unloaded_timeline(&metadata)) + RepositoryTimeline::Loaded(_) => { + let datadir_tline = + tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id)?; + Self::from_loaded_timeline(&datadir_tline, include_non_incremental_logical_size) } + RepositoryTimeline::Unloaded { metadata } => Ok(Self::from_unloaded_timeline(metadata)), } } } @@ -172,7 +175,7 @@ pub fn create_repo( conf: &'static PageServerConf, tenant_id: ZTenantId, create_repo: CreateRepo, -) -> Result> { +) -> Result> { let (wal_redo_manager, remote_index) = match create_repo { CreateRepo::Real { wal_redo_manager, @@ -260,12 +263,12 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { // - run initdb to init temporary instance and get bootstrap data // - after initialization complete, remove the temp dir. 
// -fn bootstrap_timeline( +fn bootstrap_timeline( conf: &'static PageServerConf, tenantid: ZTenantId, tli: ZTimelineId, - repo: &dyn Repository, -) -> Result> { + repo: &R, +) -> Result<()> { let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered(); let initdb_path = conf.tenant_path(&tenantid).join("tmp"); @@ -281,23 +284,20 @@ fn bootstrap_timeline( // Initdb lsn will be equal to last_record_lsn which will be set after import. // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. let timeline = repo.create_empty_timeline(tli, lsn)?; - import_datadir::import_timeline_from_postgres_datadir( - &pgdata_path, - timeline.writer().as_ref(), - lsn, - )?; - timeline.checkpoint(CheckpointConfig::Forced)?; + let mut page_tline: DatadirTimeline = DatadirTimeline::new(timeline, u64::MAX); + import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?; + page_tline.tline.checkpoint(CheckpointConfig::Forced)?; println!( "created initial timeline {} timeline.lsn {}", tli, - timeline.get_last_record_lsn() + page_tline.tline.get_last_record_lsn() ); // Remove temp dir. We don't need it anymore fs::remove_dir_all(pgdata_path)?; - Ok(timeline) + Ok(()) } pub(crate) fn get_local_timelines( @@ -313,7 +313,9 @@ pub(crate) fn get_local_timelines( local_timeline_info.push(( timeline_id, LocalTimelineInfo::from_repo_timeline( - repository_timeline, + tenant_id, + timeline_id, + &repository_timeline, include_non_incremental_logical_size, )?, )) @@ -372,13 +374,17 @@ pub(crate) fn create_timeline( } repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?; // load the timeline into memory - let loaded_timeline = repo.get_timeline_load(new_timeline_id)?; - LocalTimelineInfo::from_loaded_timeline(loaded_timeline.as_ref(), false) + let loaded_timeline = + tenant_mgr::get_timeline_for_tenant_load(tenant_id, new_timeline_id)?; + LocalTimelineInfo::from_loaded_timeline(&loaded_timeline, false) .context("cannot fill timeline info")? } None => { - let new_timeline = bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?; - LocalTimelineInfo::from_loaded_timeline(new_timeline.as_ref(), false) + bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?; + // load the timeline into memory + let new_timeline = + tenant_mgr::get_timeline_for_tenant_load(tenant_id, new_timeline_id)?; + LocalTimelineInfo::from_loaded_timeline(&new_timeline, false) .context("cannot fill timeline info")? 
} }; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 506890476f..c6c6e89854 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -23,14 +23,16 @@ use postgres_ffi::nonrelfile_utils::clogpage_precedes; use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment; -use std::cmp::min; use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; -use crate::relish::*; -use crate::repository::*; +use std::collections::HashMap; + +use crate::pgdatadir_mapping::*; +use crate::reltag::{RelTag, SlruKind}; +use crate::repository::Repository; use crate::walrecord::*; use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment; use postgres_ffi::xlog_utils::*; @@ -40,22 +42,28 @@ use zenith_utils::lsn::Lsn; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); -pub struct WalIngest { +pub struct WalIngest<'a, R: Repository> { + timeline: &'a DatadirTimeline, + checkpoint: CheckPoint, checkpoint_modified: bool, + + relsize_cache: HashMap, } -impl WalIngest { - pub fn new(timeline: &dyn Timeline, startpoint: Lsn) -> Result { +impl<'a, R: Repository> WalIngest<'a, R> { + pub fn new(timeline: &DatadirTimeline, startpoint: Lsn) -> Result> { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. - let checkpoint_bytes = timeline.get_page_at_lsn(RelishTag::Checkpoint, 0, startpoint)?; + let checkpoint_bytes = timeline.get_checkpoint(startpoint)?; let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); Ok(WalIngest { + timeline, checkpoint, checkpoint_modified: false, + relsize_cache: HashMap::new(), }) } @@ -68,10 +76,12 @@ impl WalIngest { /// pub fn ingest_record( &mut self, - timeline: &dyn TimelineWriter, + timeline: &DatadirTimeline, recdata: Bytes, lsn: Lsn, ) -> Result<()> { + let mut modification = timeline.begin_modification(lsn); + let mut decoded = decode_wal_record(recdata); let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -86,48 +96,34 @@ impl WalIngest { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, timeline, lsn, &mut decoded)?; + self.ingest_heapam_record(&mut buf, &mut modification, &mut decoded)?; } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID + && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == pg_constants::XLOG_SMGR_CREATE + { + let create = XlSmgrCreate::decode(&mut buf); + self.ingest_xlog_smgr_create(&mut modification, &create)?; + } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(timeline, lsn, &truncate)?; + self.ingest_xlog_smgr_truncate(&mut modification, &truncate)?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE { let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(timeline, lsn, &createdb)?; + self.ingest_xlog_dbase_create(&mut modification, &createdb)?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_DROP { let dropdb = XlDropDatabase::decode(&mut buf); - - // To drop the database, we need to drop all the relations in it. 
Like in - // ingest_xlog_dbase_create(), use the previous record's LSN in the list_rels() call - let req_lsn = min(timeline.get_last_record_lsn(), lsn); - for tablespace_id in dropdb.tablespace_ids { - let rels = timeline.list_rels(tablespace_id, dropdb.db_id, req_lsn)?; - for rel in rels { - timeline.drop_relish(rel, lsn)?; - } - trace!( - "Drop FileNodeMap {}, {} at lsn {}", - tablespace_id, - dropdb.db_id, - lsn - ); - timeline.drop_relish( - RelishTag::FileNodeMap { - spcnode: tablespace_id, - dbnode: dropdb.db_id, - }, - lsn, - )?; + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification.drop_dbdir(tablespace_id, dropdb.db_id)?; } } } else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID { @@ -138,19 +134,17 @@ impl WalIngest { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - timeline.put_page_image( - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - }, + self.put_slru_page_image( + &mut modification, + SlruKind::Clog, + segno, rpageno, - lsn, ZERO_PAGE.clone(), )?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(timeline, lsn, &xlrec)?; + self.ingest_clog_truncate_record(&mut modification, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; @@ -158,8 +152,7 @@ impl WalIngest { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); self.ingest_xact_record( - timeline, - lsn, + &mut modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, )?; @@ -169,8 +162,7 @@ impl WalIngest { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); self.ingest_xact_record( - timeline, - lsn, + &mut modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, )?; @@ -179,23 +171,11 @@ impl WalIngest { "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", decoded.xl_xid, parsed_xact.xid, - lsn + lsn, ); - timeline.drop_relish( - RelishTag::TwoPhase { - xid: parsed_xact.xid, - }, - lsn, - )?; + modification.drop_twophase_file(parsed_xact.xid)?; } else if info == pg_constants::XLOG_XACT_PREPARE { - timeline.put_page_image( - RelishTag::TwoPhase { - xid: decoded.xl_xid, - }, - 0, - lsn, - Bytes::copy_from_slice(&buf[..]), - )?; + modification.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?; } } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -204,38 +184,34 @@ impl WalIngest { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - timeline.put_page_image( - RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno, - }, + self.put_slru_page_image( + &mut modification, + SlruKind::MultiXactOffsets, + segno, rpageno, - lsn, ZERO_PAGE.clone(), )?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - timeline.put_page_image( - RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno, - }, + self.put_slru_page_image( + &mut modification, + SlruKind::MultiXactMembers, + segno, rpageno, - lsn, ZERO_PAGE.clone(), )?; } else if info == 
pg_constants::XLOG_MULTIXACT_CREATE_ID { let xlrec = XlMultiXactCreate::decode(&mut buf); - self.ingest_multixact_create_record(timeline, lsn, &xlrec)?; + self.ingest_multixact_create_record(&mut modification, &xlrec)?; } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(timeline, lsn, &xlrec)?; + self.ingest_multixact_truncate_record(&mut modification, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(timeline, lsn, &xlrec, &decoded)?; + self.ingest_relmap_page(&mut modification, &xlrec, &decoded)?; } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_NEXTOID { @@ -270,37 +246,37 @@ impl WalIngest { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. for blk in decoded.blocks.iter() { - self.ingest_decoded_block(timeline, lsn, &decoded, blk)?; + self.ingest_decoded_block(&mut modification, lsn, &decoded, blk)?; } // If checkpoint data was updated, store the new version in the repository if self.checkpoint_modified { let new_checkpoint_bytes = self.checkpoint.encode(); - timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, new_checkpoint_bytes)?; + modification.put_checkpoint(new_checkpoint_bytes)?; self.checkpoint_modified = false; } // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - timeline.advance_last_record_lsn(lsn); + modification.commit()?; Ok(()) } fn ingest_decoded_block( &mut self, - timeline: &dyn TimelineWriter, + modification: &mut DatadirModification, lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, ) -> Result<()> { - let tag = RelishTag::Relation(RelTag { + let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, relnode: blk.rnode_relnode, forknum: blk.forknum as u8, - }); + }; // // Instead of storing full-page-image WAL record, @@ -330,13 +306,13 @@ impl WalIngest { image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); assert_eq!(image.len(), pg_constants::BLCKSZ as usize); - timeline.put_page_image(tag, blk.blkno, lsn, image.freeze())?; + self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; } else { let rec = ZenithWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; - timeline.put_wal_record(lsn, tag, blk.blkno, rec)?; + self.put_rel_wal_record(modification, rel, blk.blkno, rec)?; } Ok(()) } @@ -344,8 +320,7 @@ impl WalIngest { fn ingest_heapam_record( &mut self, buf: &mut Bytes, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, ) -> Result<()> { // Handle VM bit updates that are implicitly part of heap records. @@ -409,54 +384,76 @@ impl WalIngest { // Clear the VM bits if required. 
if new_heap_blkno.is_some() || old_heap_blkno.is_some() { - let vm_relish = RelishTag::Relation(RelTag { + let vm_rel = RelTag { forknum: pg_constants::VISIBILITYMAP_FORKNUM, spcnode: decoded.blocks[0].rnode_spcnode, dbnode: decoded.blocks[0].rnode_dbnode, relnode: decoded.blocks[0].rnode_relnode, - }); + }; - let new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); - let old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); - if new_vm_blk == old_vm_blk { - // An UPDATE record that needs to clear the bits for both old and the - // new page, both of which reside on the same VM page. - timeline.put_wal_record( - lsn, - vm_relish, - new_vm_blk.unwrap(), - ZenithWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno, - flags: pg_constants::VISIBILITYMAP_VALID_BITS, - }, - )?; - } else { - // Clear VM bits for one heap page, or for two pages that reside on - // different VM pages. - if let Some(new_vm_blk) = new_vm_blk { - timeline.put_wal_record( - lsn, - vm_relish, - new_vm_blk, + let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); + let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); + + // Sometimes, Postgres seems to create heap WAL records with the + // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is + // not set. In fact, it's possible that the VM page does not exist at all. + // In that case, we don't want to store a record to clear the VM bit; + // replaying it would fail to find the previous image of the page, because + // it doesn't exist. So check if the VM page(s) exist, and skip the WAL + // record if it doesn't. + let vm_size = self.get_relsize(vm_rel)?; + if let Some(blknum) = new_vm_blk { + if blknum >= vm_size { + new_vm_blk = None; + } + } + if let Some(blknum) = old_vm_blk { + if blknum >= vm_size { + old_vm_blk = None; + } + } + + if new_vm_blk.is_some() || old_vm_blk.is_some() { + if new_vm_blk == old_vm_blk { + // An UPDATE record that needs to clear the bits for both old and the + // new page, both of which reside on the same VM page. + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk.unwrap(), ZenithWalRecord::ClearVisibilityMapFlags { new_heap_blkno, - old_heap_blkno: None, - flags: pg_constants::VISIBILITYMAP_VALID_BITS, - }, - )?; - } - if let Some(old_vm_blk) = old_vm_blk { - timeline.put_wal_record( - lsn, - vm_relish, - old_vm_blk, - ZenithWalRecord::ClearVisibilityMapFlags { - new_heap_blkno: None, old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, )?; + } else { + // Clear VM bits for one heap page, or for two pages that reside on + // different VM pages. + if let Some(new_vm_blk) = new_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk, + ZenithWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno: None, + flags: pg_constants::VISIBILITYMAP_VALID_BITS, + }, + )?; + } + if let Some(old_vm_blk) = old_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + old_vm_blk, + ZenithWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: None, + old_heap_blkno, + flags: pg_constants::VISIBILITYMAP_VALID_BITS, + }, + )?; + } } } } @@ -467,8 +464,7 @@ impl WalIngest { /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record. 
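[Editor's note: for context on the visibility-map guard added to ingest_heapam_record above: each heap page is tracked by a couple of bits in the visibility map, so many heap blocks share one VM block, and a heap block number must be mapped to its VM block before a clear-bits record is emitted (or skipped when that VM page does not exist yet). A rough, hedged sketch of the mapping; the constant below ignores the VM page header that the real pg_constants::HEAPBLK_TO_MAPBLOCK accounts for.]

// Illustration only: with 2 visibility-map bits per heap page, one 8 KB VM
// page covers on the order of BLCKSZ * 8 / 2 heap pages. The real constant
// also subtracts the page header, so treat this as an approximation.
const BLCKSZ: u32 = 8192;
const HEAPBLOCKS_PER_VM_PAGE: u32 = BLCKSZ * 8 / 2;

fn heapblk_to_vm_block(heap_blkno: u32) -> u32 {
    heap_blkno / HEAPBLOCKS_PER_VM_PAGE
}

fn main() {
    // Two heap blocks that land on the same VM block only need one
    // ClearVisibilityMapFlags record, mirroring the branch above.
    let (old_blk, new_blk) = (100u32, 101u32);
    assert_eq!(heapblk_to_vm_block(old_blk), heapblk_to_vm_block(new_blk));
}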
fn ingest_xlog_dbase_create( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, rec: &XlCreateDatabase, ) -> Result<()> { let db_id = rec.db_id; @@ -481,76 +477,79 @@ impl WalIngest { // cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for // the last valid LSN to advance up to it. So we use the previous record's LSN in the // get calls instead. - let req_lsn = min(timeline.get_last_record_lsn(), lsn); + let req_lsn = modification.tline.get_last_record_lsn(); - let rels = timeline.list_rels(src_tablespace_id, src_db_id, req_lsn)?; + let rels = modification + .tline + .list_rels(src_tablespace_id, src_db_id, req_lsn)?; - trace!("ingest_xlog_dbase_create: {} rels", rels.len()); + debug!("ingest_xlog_dbase_create: {} rels", rels.len()); + + // Copy relfilemap + let filemap = modification + .tline + .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)?; + modification.put_relmap_file(tablespace_id, db_id, filemap)?; let mut num_rels_copied = 0; let mut num_blocks_copied = 0; - for rel in rels { - if let RelishTag::Relation(src_rel) = rel { - assert_eq!(src_rel.spcnode, src_tablespace_id); - assert_eq!(src_rel.dbnode, src_db_id); + for src_rel in rels { + assert_eq!(src_rel.spcnode, src_tablespace_id); + assert_eq!(src_rel.dbnode, src_db_id); - let nblocks = timeline.get_relish_size(rel, req_lsn)?.unwrap_or(0); - let dst_rel = RelTag { - spcnode: tablespace_id, - dbnode: db_id, - relnode: src_rel.relnode, - forknum: src_rel.forknum, - }; + let nblocks = modification.tline.get_rel_size(src_rel, req_lsn)?; + let dst_rel = RelTag { + spcnode: tablespace_id, + dbnode: db_id, + relnode: src_rel.relnode, + forknum: src_rel.forknum, + }; - // Copy content - for blknum in 0..nblocks { - let content = timeline.get_page_at_lsn(rel, blknum, req_lsn)?; + modification.put_rel_creation(dst_rel, nblocks)?; - debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel); + // Copy content + debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks); + for blknum in 0..nblocks { + debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel); - timeline.put_page_image(RelishTag::Relation(dst_rel), blknum, lsn, content)?; - num_blocks_copied += 1; - } - - if nblocks == 0 { - // make sure we have some trace of the relation, even if it's empty - timeline.put_truncation(RelishTag::Relation(dst_rel), lsn, 0)?; - } - - num_rels_copied += 1; + let content = modification + .tline + .get_rel_page_at_lsn(src_rel, blknum, req_lsn)?; + modification.put_rel_page_image(dst_rel, blknum, content)?; + num_blocks_copied += 1; } + + num_rels_copied += 1; } - // Copy relfilemap - // TODO This implementation is very inefficient - - // it scans all non-rels only to find FileNodeMaps - for tag in timeline.list_nonrels(req_lsn)? 
{ - if let RelishTag::FileNodeMap { spcnode, dbnode } = tag { - if spcnode == src_tablespace_id && dbnode == src_db_id { - let img = timeline.get_page_at_lsn(tag, 0, req_lsn)?; - let new_tag = RelishTag::FileNodeMap { - spcnode: tablespace_id, - dbnode: db_id, - }; - timeline.put_page_image(new_tag, 0, lsn, img)?; - break; - } - } - } info!( - "Created database {}/{}, copied {} blocks in {} rels at {}", - tablespace_id, db_id, num_blocks_copied, num_rels_copied, lsn + "Created database {}/{}, copied {} blocks in {} rels", + tablespace_id, db_id, num_blocks_copied, num_rels_copied ); Ok(()) } + fn ingest_xlog_smgr_create( + &mut self, + modification: &mut DatadirModification, + rec: &XlSmgrCreate, + ) -> Result<()> { + let rel = RelTag { + spcnode: rec.rnode.spcnode, + dbnode: rec.rnode.dbnode, + relnode: rec.rnode.relnode, + forknum: rec.forknum, + }; + self.put_rel_creation(modification, rel)?; + Ok(()) + } + /// Subroutine of ingest_record(), to handle an XLOG_SMGR_TRUNCATE record. /// /// This is the same logic as in PostgreSQL's smgr_redo() function. fn ingest_xlog_smgr_truncate( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, rec: &XlSmgrTruncate, ) -> Result<()> { let spcnode = rec.rnode.spcnode; @@ -564,7 +563,7 @@ impl WalIngest { relnode, forknum: pg_constants::MAIN_FORKNUM, }; - timeline.put_truncation(RelishTag::Relation(rel), lsn, rec.blkno)?; + self.put_rel_truncation(modification, rel, rec.blkno)?; } if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 { let rel = RelTag { @@ -587,7 +586,7 @@ impl WalIngest { info!("Partial truncation of FSM is not supported"); } let num_fsm_blocks = 0; - timeline.put_truncation(RelishTag::Relation(rel), lsn, num_fsm_blocks)?; + self.put_rel_truncation(modification, rel, num_fsm_blocks)?; } if (rec.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 { let rel = RelTag { @@ -606,7 +605,7 @@ impl WalIngest { info!("Partial truncation of VM is not supported"); } let num_vm_blocks = 0; - timeline.put_truncation(RelishTag::Relation(rel), lsn, num_vm_blocks)?; + self.put_rel_truncation(modification, rel, num_vm_blocks)?; } Ok(()) } @@ -615,8 +614,7 @@ impl WalIngest { /// fn ingest_xact_record( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, parsed: &XlXactParsedRecord, is_commit: bool, ) -> Result<()> { @@ -632,12 +630,9 @@ impl WalIngest { // This subxact goes to different page. Write the record // for all the XIDs on the previous page, and continue // accumulating XIDs on this new page. - timeline.put_wal_record( - lsn, - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - }, + modification.put_slru_wal_record( + SlruKind::Clog, + segno, rpageno, if is_commit { ZenithWalRecord::ClogSetCommitted { xids: page_xids } @@ -652,12 +647,9 @@ impl WalIngest { rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; page_xids.push(*subxact); } - timeline.put_wal_record( - lsn, - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - }, + modification.put_slru_wal_record( + SlruKind::Clog, + segno, rpageno, if is_commit { ZenithWalRecord::ClogSetCommitted { xids: page_xids } @@ -674,7 +666,10 @@ impl WalIngest { dbnode: xnode.dbnode, relnode: xnode.relnode, }; - timeline.drop_relish(RelishTag::Relation(rel), lsn)?; + let last_lsn = self.timeline.get_last_record_lsn(); + if modification.tline.get_rel_exists(rel, last_lsn)? 
{ + self.put_rel_drop(modification, rel)?; + } } } Ok(()) @@ -682,13 +677,12 @@ impl WalIngest { fn ingest_clog_truncate_record( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, xlrec: &XlClogTruncate, ) -> Result<()> { info!( - "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {} lsn {}", - xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db, lsn + "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}", + xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db ); // Here we treat oldestXid and oldestXidDB @@ -719,23 +713,20 @@ impl WalIngest { } // Iterate via SLRU CLOG segments and drop segments that we're ready to truncate - // TODO This implementation is very inefficient - - // it scans all non-rels only to find Clog // // We cannot pass 'lsn' to the Timeline.list_nonrels(), or it // will block waiting for the last valid LSN to advance up to // it. So we use the previous record's LSN in the get calls // instead. - let req_lsn = min(timeline.get_last_record_lsn(), lsn); - for obj in timeline.list_nonrels(req_lsn)? { - if let RelishTag::Slru { slru, segno } = obj { - if slru == SlruKind::Clog { - let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; - if slru_may_delete_clogsegment(segpage, xlrec.pageno) { - timeline.drop_relish(RelishTag::Slru { slru, segno }, lsn)?; - trace!("Drop CLOG segment {:>04X} at lsn {}", segno, lsn); - } - } + let req_lsn = modification.tline.get_last_record_lsn(); + for segno in modification + .tline + .list_slru_segments(SlruKind::Clog, req_lsn)? + { + let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; + if slru_may_delete_clogsegment(segpage, xlrec.pageno) { + modification.drop_slru_segment(SlruKind::Clog, segno)?; + trace!("Drop CLOG segment {:>04X}", segno); } } @@ -744,8 +735,7 @@ impl WalIngest { fn ingest_multixact_create_record( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, xlrec: &XlMultiXactCreate, ) -> Result<()> { // Create WAL record for updating the multixact-offsets page @@ -753,12 +743,9 @@ impl WalIngest { let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - timeline.put_wal_record( - lsn, - RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno, - }, + modification.put_slru_wal_record( + SlruKind::MultiXactOffsets, + segno, rpageno, ZenithWalRecord::MultixactOffsetCreate { mid: xlrec.mid, @@ -790,12 +777,9 @@ impl WalIngest { } let n_this_page = this_page_members.len(); - timeline.put_wal_record( - lsn, - RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno: pageno / pg_constants::SLRU_PAGES_PER_SEGMENT, - }, + modification.put_slru_wal_record( + SlruKind::MultiXactMembers, + pageno / pg_constants::SLRU_PAGES_PER_SEGMENT, pageno % pg_constants::SLRU_PAGES_PER_SEGMENT, ZenithWalRecord::MultixactMembersCreate { moff: offset, @@ -830,8 +814,7 @@ impl WalIngest { fn ingest_multixact_truncate_record( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, xlrec: &XlMultiXactTruncate, ) -> Result<()> { self.checkpoint.oldestMulti = xlrec.end_trunc_off; @@ -847,13 +830,7 @@ impl WalIngest { // Delete all the segments except the last one. The last segment can still // contain, possibly partially, valid data. 
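[Editor's note: the loop that follows walks multixact-members segments from the old truncation point up to, but not including, the segment of the new oldest offset, wrapping around because the 32-bit offset space is circular. A small sketch of that iteration, using hypothetical segment numbers rather than real mx_offset_to_member_segment values.]

/// Advance to the next SLRU segment number, wrapping to zero after the
/// highest possible segment (mirrors the "move to next segment" step below).
fn next_segment(segment: u32, maxsegment: u32) -> u32 {
    if segment == maxsegment {
        0
    } else {
        segment + 1
    }
}

fn main() {
    // Hypothetical numbers: start at the last segment before wraparound and
    // stop two segments later; the walk visits maxsegment, 0 and 1.
    let maxsegment = 0x0000_FFFF;
    let (mut segment, endsegment) = (maxsegment, 2);
    let mut dropped = Vec::new();
    while segment != endsegment {
        dropped.push(segment); // stands in for drop_slru_segment(MultiXactMembers, segment)
        segment = next_segment(segment, maxsegment);
    }
    assert_eq!(dropped, vec![maxsegment, 0, 1]);
}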
while segment != endsegment { - timeline.drop_relish( - RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno: segment as u32, - }, - lsn, - )?; + modification.drop_slru_segment(SlruKind::MultiXactMembers, segment as u32)?; /* move to next segment, handling wraparound correctly */ if segment == maxsegment { @@ -871,22 +848,538 @@ impl WalIngest { fn ingest_relmap_page( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, xlrec: &XlRelmapUpdate, decoded: &DecodedWALRecord, ) -> Result<()> { - let tag = RelishTag::FileNodeMap { - spcnode: xlrec.tsid, - dbnode: xlrec.dbid, - }; - let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); // skip xl_relmap_update buf.advance(12); - timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buf[..]))?; + modification.put_relmap_file(xlrec.tsid, xlrec.dbid, Bytes::copy_from_slice(&buf[..]))?; + + Ok(()) + } + + fn put_rel_creation( + &mut self, + modification: &mut DatadirModification, + rel: RelTag, + ) -> Result<()> { + self.relsize_cache.insert(rel, 0); + modification.put_rel_creation(rel, 0)?; + Ok(()) + } + + fn put_rel_page_image( + &mut self, + modification: &mut DatadirModification, + rel: RelTag, + blknum: BlockNumber, + img: Bytes, + ) -> Result<()> { + self.handle_rel_extend(modification, rel, blknum)?; + modification.put_rel_page_image(rel, blknum, img)?; + Ok(()) + } + + fn put_rel_wal_record( + &mut self, + modification: &mut DatadirModification, + rel: RelTag, + blknum: BlockNumber, + rec: ZenithWalRecord, + ) -> Result<()> { + self.handle_rel_extend(modification, rel, blknum)?; + modification.put_rel_wal_record(rel, blknum, rec)?; + Ok(()) + } + + fn put_rel_truncation( + &mut self, + modification: &mut DatadirModification, + rel: RelTag, + nblocks: BlockNumber, + ) -> Result<()> { + modification.put_rel_truncation(rel, nblocks)?; + self.relsize_cache.insert(rel, nblocks); + Ok(()) + } + + fn put_rel_drop( + &mut self, + modification: &mut DatadirModification, + rel: RelTag, + ) -> Result<()> { + modification.put_rel_drop(rel)?; + self.relsize_cache.remove(&rel); + Ok(()) + } + + fn get_relsize(&mut self, rel: RelTag) -> Result { + if let Some(nblocks) = self.relsize_cache.get(&rel) { + Ok(*nblocks) + } else { + let last_lsn = self.timeline.get_last_record_lsn(); + let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? { + 0 + } else { + self.timeline.get_rel_size(rel, last_lsn)? + }; + self.relsize_cache.insert(rel, nblocks); + Ok(nblocks) + } + } + + fn handle_rel_extend( + &mut self, + modification: &mut DatadirModification, + rel: RelTag, + blknum: BlockNumber, + ) -> Result<()> { + let new_nblocks = blknum + 1; + let old_nblocks = if let Some(nblocks) = self.relsize_cache.get(&rel) { + *nblocks + } else { + // Check if the relation exists. We implicitly create relations on first + // record. + // TODO: would be nice if to be more explicit about it + let last_lsn = self.timeline.get_last_record_lsn(); + let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? { + // create it with 0 size initially, the logic below will extend it + modification.put_rel_creation(rel, 0)?; + 0 + } else { + self.timeline.get_rel_size(rel, last_lsn)? 
+ }; + self.relsize_cache.insert(rel, nblocks); + nblocks + }; + + if new_nblocks > old_nblocks { + //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); + modification.put_rel_extend(rel, new_nblocks)?; + + // fill the gap with zeros + for gap_blknum in old_nblocks..blknum { + modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?; + } + self.relsize_cache.insert(rel, new_nblocks); + } + Ok(()) + } + + fn put_slru_page_image( + &mut self, + modification: &mut DatadirModification, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + img: Bytes, + ) -> Result<()> { + self.handle_slru_extend(modification, kind, segno, blknum)?; + modification.put_slru_page_image(kind, segno, blknum, img)?; + Ok(()) + } + + fn handle_slru_extend( + &mut self, + modification: &mut DatadirModification, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + ) -> Result<()> { + // we don't use a cache for this like we do for relations. SLRUS are explcitly + // extended with ZEROPAGE records, not with commit records, so it happens + // a lot less frequently. + + let new_nblocks = blknum + 1; + // Check if the relation exists. We implicitly create relations on first + // record. + // TODO: would be nice if to be more explicit about it + let last_lsn = self.timeline.get_last_record_lsn(); + let old_nblocks = if !self + .timeline + .get_slru_segment_exists(kind, segno, last_lsn)? + { + // create it with 0 size initially, the logic below will extend it + modification.put_slru_segment_creation(kind, segno, 0)?; + 0 + } else { + self.timeline.get_slru_segment_size(kind, segno, last_lsn)? + }; + + if new_nblocks > old_nblocks { + trace!( + "extending SLRU {:?} seg {} from {} to {} blocks", + kind, + segno, + old_nblocks, + new_nblocks + ); + modification.put_slru_extend(kind, segno, new_nblocks)?; + + // fill the gap with zeros + for gap_blknum in old_nblocks..blknum { + modification.put_slru_page_image(kind, segno, gap_blknum, ZERO_PAGE.clone())?; + } + } + Ok(()) + } +} + +/// +/// Tests that should work the same with any Repository/Timeline implementation. +/// +#[allow(clippy::bool_assert_comparison)] +#[cfg(test)] +mod tests { + use super::*; + use crate::pgdatadir_mapping::create_test_timeline; + use crate::repository::repo_harness::*; + use postgres_ffi::pg_constants; + + /// Arbitrary relation tag, for testing. 
+ const TESTREL_A: RelTag = RelTag { + spcnode: 0, + dbnode: 111, + relnode: 1000, + forknum: 0, + }; + + fn assert_current_logical_size(_timeline: &DatadirTimeline, _lsn: Lsn) { + // TODO + } + + static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); + + fn init_walingest_test(tline: &DatadirTimeline) -> Result> { + let mut m = tline.begin_modification(Lsn(0x10)); + m.put_checkpoint(ZERO_CHECKPOINT.clone())?; + m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file + m.commit()?; + let walingest = WalIngest::new(tline, Lsn(0x10))?; + + Ok(walingest) + } + + #[test] + fn test_relsize() -> Result<()> { + let repo = RepoHarness::create("test_relsize")?.load(); + let tline = create_test_timeline(repo, TIMELINE_ID)?; + let mut walingest = init_walingest_test(&tline)?; + + let mut m = tline.begin_modification(Lsn(0x20)); + walingest.put_rel_creation(&mut m, TESTREL_A)?; + walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + m.commit()?; + let mut m = tline.begin_modification(Lsn(0x30)); + walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?; + m.commit()?; + let mut m = tline.begin_modification(Lsn(0x40)); + walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?; + m.commit()?; + let mut m = tline.begin_modification(Lsn(0x50)); + walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?; + m.commit()?; + + assert_current_logical_size(&tline, Lsn(0x50)); + + // The relation was created at LSN 2, not visible at LSN 1 yet. + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); + assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10)).is_err()); + + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50))?, 3); + + // Check page contents at each LSN + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20))?, + TEST_IMG("foo blk 0 at 2") + ); + + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30))?, + TEST_IMG("foo blk 0 at 3") + ); + + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40))?, + TEST_IMG("foo blk 0 at 3") + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, + TEST_IMG("foo blk 1 at 4") + ); + + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50))?, + TEST_IMG("foo blk 0 at 3") + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, + TEST_IMG("foo blk 1 at 4") + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50))?, + TEST_IMG("foo blk 2 at 5") + ); + + // Truncate last block + let mut m = tline.begin_modification(Lsn(0x60)); + walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?; + m.commit()?; + assert_current_logical_size(&tline, Lsn(0x60)); + + // Check reported size and contents after truncation + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 2); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60))?, + TEST_IMG("foo blk 0 at 3") + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, + TEST_IMG("foo blk 1 at 4") + ); + + // should still see the truncated block with older LSN + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50))?, 3); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50))?, + TEST_IMG("foo blk 2 at 5") + ); + + // Truncate to zero length + let mut m = tline.begin_modification(Lsn(0x68)); + walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?; + 
m.commit()?; + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68))?, 0); + + // Extend from 0 to 2 blocks, leaving a gap + let mut m = tline.begin_modification(Lsn(0x70)); + walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?; + m.commit()?; + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70))?, 2); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, + ZERO_PAGE + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70))?, + TEST_IMG("foo blk 1") + ); + + // Extend a lot more, leaving a big gap that spans across segments + let mut m = tline.begin_modification(Lsn(0x80)); + walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?; + m.commit()?; + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, 1501); + for blk in 2..1500 { + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80))?, + ZERO_PAGE + ); + } + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80))?, + TEST_IMG("foo blk 1500") + ); + + Ok(()) + } + + // Test what happens if we dropped a relation + // and then created it again within the same layer. + #[test] + fn test_drop_extend() -> Result<()> { + let repo = RepoHarness::create("test_drop_extend")?.load(); + let tline = create_test_timeline(repo, TIMELINE_ID)?; + let mut walingest = init_walingest_test(&tline)?; + + let mut m = tline.begin_modification(Lsn(0x20)); + walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + m.commit()?; + + // Check that rel exists and size is correct + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1); + + // Drop rel + let mut m = tline.begin_modification(Lsn(0x30)); + walingest.put_rel_drop(&mut m, TESTREL_A)?; + m.commit()?; + + // Check that rel is not visible anymore + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false); + + // FIXME: should fail + //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30))?.is_none()); + + // Re-create it + let mut m = tline.begin_modification(Lsn(0x40)); + walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?; + m.commit()?; + + // Check that rel exists and size is correct + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40))?, 1); + + Ok(()) + } + + // Test what happens if we truncated a relation + // so that one of its segments was dropped + // and then extended it again within the same layer. + #[test] + fn test_truncate_extend() -> Result<()> { + let repo = RepoHarness::create("test_truncate_extend")?.load(); + let tline = create_test_timeline(repo, TIMELINE_ID)?; + let mut walingest = init_walingest_test(&tline)?; + + // Create a 20 MB relation (the size is arbitrary) + let relsize = 20 * 1024 * 1024 / 8192; + let mut m = tline.begin_modification(Lsn(0x20)); + for blkno in 0..relsize { + let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); + walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; + } + m.commit()?; + + // The relation was created at LSN 20, not visible at LSN 1 yet. 
+ assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); + assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10)).is_err()); + + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, relsize); + + // Check relation content + for blkno in 0..relsize { + let lsn = Lsn(0x20); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, blkno, lsn)?, + TEST_IMG(&data) + ); + } + + // Truncate relation so that second segment was dropped + // - only leave one page + let mut m = tline.begin_modification(Lsn(0x60)); + walingest.put_rel_truncation(&mut m, TESTREL_A, 1)?; + m.commit()?; + + // Check reported size and contents after truncation + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 1); + + for blkno in 0..1 { + let lsn = Lsn(0x20); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60))?, + TEST_IMG(&data) + ); + } + + // should still see all blocks with older LSN + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50))?, relsize); + for blkno in 0..relsize { + let lsn = Lsn(0x20); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50))?, + TEST_IMG(&data) + ); + } + + // Extend relation again. + // Add enough blocks to create second segment + let lsn = Lsn(0x80); + let mut m = tline.begin_modification(lsn); + for blkno in 0..relsize { + let data = format!("foo blk {} at {}", blkno, lsn); + walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; + } + m.commit()?; + + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, relsize); + // Check relation content + for blkno in 0..relsize { + let lsn = Lsn(0x80); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80))?, + TEST_IMG(&data) + ); + } + + Ok(()) + } + + /// Test get_relsize() and truncation with a file larger than 1 GB, so that it's + /// split into multiple 1 GB segments in Postgres. 
+ #[test] + fn test_large_rel() -> Result<()> { + let repo = RepoHarness::create("test_large_rel")?.load(); + let tline = create_test_timeline(repo, TIMELINE_ID)?; + let mut walingest = init_walingest_test(&tline)?; + + let mut lsn = 0x10; + for blknum in 0..pg_constants::RELSEG_SIZE + 1 { + lsn += 0x10; + let mut m = tline.begin_modification(Lsn(lsn)); + let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); + walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?; + m.commit()?; + } + + assert_current_logical_size(&tline, Lsn(lsn)); + + assert_eq!( + tline.get_rel_size(TESTREL_A, Lsn(lsn))?, + pg_constants::RELSEG_SIZE + 1 + ); + + // Truncate one block + lsn += 0x10; + let mut m = tline.begin_modification(Lsn(lsn)); + walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE)?; + m.commit()?; + assert_eq!( + tline.get_rel_size(TESTREL_A, Lsn(lsn))?, + pg_constants::RELSEG_SIZE + ); + assert_current_logical_size(&tline, Lsn(lsn)); + + // Truncate another block + lsn += 0x10; + let mut m = tline.begin_modification(Lsn(lsn)); + walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE - 1)?; + m.commit()?; + assert_eq!( + tline.get_rel_size(TESTREL_A, Lsn(lsn))?, + pg_constants::RELSEG_SIZE - 1 + ); + assert_current_logical_size(&tline, Lsn(lsn)); + + // Truncate to 1500, and then truncate all the way down to 0, one block at a time + // This tests the behavior at segment boundaries + let mut size: i32 = 3000; + while size >= 0 { + lsn += 0x10; + let mut m = tline.begin_modification(Lsn(lsn)); + walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?; + m.commit()?; + assert_eq!( + tline.get_rel_size(TESTREL_A, Lsn(lsn))?, + size as BlockNumber + ); + + size -= 1; + } + assert_current_logical_size(&tline, Lsn(lsn)); Ok(()) } diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 2c10ad315b..e382475627 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -6,6 +6,7 @@ //! We keep one WAL receiver active per timeline. use crate::config::PageServerConf; +use crate::repository::{Repository, Timeline}; use crate::tenant_mgr; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; @@ -182,13 +183,13 @@ fn walreceiver_main( let repo = tenant_mgr::get_repository_for_tenant(tenant_id) .with_context(|| format!("no repository found for tenant {}", tenant_id))?; - let timeline = repo.get_timeline_load(timeline_id).with_context(|| { - format!( - "local timeline {} not found for tenant {}", - timeline_id, tenant_id - ) - })?; - + let timeline = + tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id).with_context(|| { + format!( + "local timeline {} not found for tenant {}", + timeline_id, tenant_id + ) + })?; let remote_index = repo.get_remote_index(); // @@ -251,11 +252,10 @@ fn walreceiver_main( // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are - // at risk of hittind a deadlock. + // at risk of hitting a deadlock. anyhow::ensure!(lsn.is_aligned()); - let writer = timeline.writer(); - walingest.ingest_record(writer.as_ref(), recdata, lsn)?; + walingest.ingest_record(&timeline, recdata, lsn)?; fail_point!("walreceiver-after-ingest"); @@ -267,6 +267,8 @@ fn walreceiver_main( caught_up = true; } + timeline.tline.check_checkpoint_distance()?; + Some(endlsn) } @@ -310,7 +312,7 @@ fn walreceiver_main( // The last LSN we processed. 
It is not guaranteed to survive pageserver crash. let write_lsn = u64::from(last_lsn); // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data - let flush_lsn = u64::from(timeline.get_disk_consistent_lsn()); + let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn()); // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. let apply_lsn = u64::from(timeline_remote_consistent_lsn); diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index ca9107cdbf..5947a0c147 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -10,7 +10,47 @@ use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, Transacti use serde::{Deserialize, Serialize}; use tracing::*; -use crate::repository::ZenithWalRecord; +/// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper +/// around a PostgreSQL WAL record, or a custom zenith-specific "record". +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub enum ZenithWalRecord { + /// Native PostgreSQL WAL record + Postgres { will_init: bool, rec: Bytes }, + + /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear) + ClearVisibilityMapFlags { + new_heap_blkno: Option, + old_heap_blkno: Option, + flags: u8, + }, + /// Mark transaction IDs as committed on a CLOG page + ClogSetCommitted { xids: Vec }, + /// Mark transaction IDs as aborted on a CLOG page + ClogSetAborted { xids: Vec }, + /// Extend multixact offsets SLRU + MultixactOffsetCreate { + mid: MultiXactId, + moff: MultiXactOffset, + }, + /// Extend multixact members SLRU. + MultixactMembersCreate { + moff: MultiXactOffset, + members: Vec, + }, +} + +impl ZenithWalRecord { + /// Does replaying this WAL record initialize the page from scratch, or does + /// it need to be applied over the previous image of the page? + pub fn will_init(&self) -> bool { + match self { + ZenithWalRecord::Postgres { will_init, rec: _ } => *will_init, + + // None of the special zenith record types currently initialize the page + _ => false, + } + } +} /// DecodedBkpBlock represents per-page data contained in a WAL record. #[derive(Default)] @@ -87,6 +127,28 @@ impl XlRelmapUpdate { } } +#[repr(C)] +#[derive(Debug)] +pub struct XlSmgrCreate { + pub rnode: RelFileNode, + // FIXME: This is ForkNumber in storage_xlog.h. That's an enum. Does it have + // well-defined size? 
+ pub forknum: u8, +} + +impl XlSmgrCreate { + pub fn decode(buf: &mut Bytes) -> XlSmgrCreate { + XlSmgrCreate { + rnode: RelFileNode { + spcnode: buf.get_u32_le(), /* tablespace */ + dbnode: buf.get_u32_le(), /* database */ + relnode: buf.get_u32_le(), /* relation */ + }, + forknum: buf.get_u32_le() as u8, + } + } +} + #[repr(C)] #[derive(Debug)] pub struct XlSmgrTruncate { diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 704b8f2583..ae22f1eead 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -42,8 +42,10 @@ use zenith_utils::nonblock::set_nonblock; use zenith_utils::zid::ZTenantId; use crate::config::PageServerConf; -use crate::relish::*; -use crate::repository::ZenithWalRecord; +use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; +use crate::reltag::{RelTag, SlruKind}; +use crate::repository::Key; +use crate::walrecord::ZenithWalRecord; use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift; use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset; use postgres_ffi::nonrelfile_utils::mx_offset_to_member_offset; @@ -75,8 +77,7 @@ pub trait WalRedoManager: Send + Sync { /// the reords. fn request_redo( &self, - rel: RelishTag, - blknum: u32, + key: Key, lsn: Lsn, base_img: Option, records: Vec<(Lsn, ZenithWalRecord)>, @@ -92,8 +93,7 @@ pub struct DummyRedoManager {} impl crate::walredo::WalRedoManager for DummyRedoManager { fn request_redo( &self, - _rel: RelishTag, - _blknum: u32, + _key: Key, _lsn: Lsn, _base_img: Option, _records: Vec<(Lsn, ZenithWalRecord)>, @@ -152,28 +152,6 @@ fn can_apply_in_zenith(rec: &ZenithWalRecord) -> bool { } } -fn check_forknum(rel: &RelishTag, expected_forknum: u8) -> bool { - if let RelishTag::Relation(RelTag { - forknum, - spcnode: _, - dbnode: _, - relnode: _, - }) = rel - { - *forknum == expected_forknum - } else { - false - } -} - -fn check_slru_segno(rel: &RelishTag, expected_slru: SlruKind, expected_segno: u32) -> bool { - if let RelishTag::Slru { slru, segno } = rel { - *slru == expected_slru && *segno == expected_segno - } else { - false - } -} - /// An error happened in WAL redo #[derive(Debug, thiserror::Error)] pub enum WalRedoError { @@ -184,6 +162,8 @@ pub enum WalRedoError { InvalidState, #[error("cannot perform WAL redo for this request")] InvalidRequest, + #[error("cannot perform WAL redo for this record")] + InvalidRecord, } /// @@ -198,8 +178,7 @@ impl WalRedoManager for PostgresRedoManager { /// fn request_redo( &self, - rel: RelishTag, - blknum: u32, + key: Key, lsn: Lsn, base_img: Option, records: Vec<(Lsn, ZenithWalRecord)>, @@ -217,11 +196,10 @@ impl WalRedoManager for PostgresRedoManager { if rec_zenith != batch_zenith { let result = if batch_zenith { - self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..i]) + self.apply_batch_zenith(key, lsn, img, &records[batch_start..i]) } else { self.apply_batch_postgres( - rel, - blknum, + key, lsn, img, &records[batch_start..i], @@ -236,11 +214,10 @@ impl WalRedoManager for PostgresRedoManager { } // last batch if batch_zenith { - self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..]) + self.apply_batch_zenith(key, lsn, img, &records[batch_start..]) } else { self.apply_batch_postgres( - rel, - blknum, + key, lsn, img, &records[batch_start..], @@ -268,16 +245,15 @@ impl PostgresRedoManager { /// fn apply_batch_postgres( &self, - rel: RelishTag, - blknum: u32, + key: Key, lsn: Lsn, base_img: Option, records: &[(Lsn, ZenithWalRecord)], wal_redo_timeout: Duration, ) -> Result { - let 
start_time = Instant::now(); + let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; - let apply_result: Result; + let start_time = Instant::now(); let mut process_guard = self.process.lock().unwrap(); let lock_time = Instant::now(); @@ -291,16 +267,11 @@ impl PostgresRedoManager { WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); - let result = if let RelishTag::Relation(rel) = rel { - // Relational WAL records are applied using wal-redo-postgres - let buf_tag = BufferTag { rel, blknum }; - apply_result = process.apply_wal_records(buf_tag, base_img, records, wal_redo_timeout); - - apply_result.map_err(WalRedoError::IoError) - } else { - error!("unexpected non-relation relish: {:?}", rel); - Err(WalRedoError::InvalidRequest) - }; + // Relational WAL records are applied using wal-redo-postgres + let buf_tag = BufferTag { rel, blknum }; + let result = process + .apply_wal_records(buf_tag, base_img, records, wal_redo_timeout) + .map_err(WalRedoError::IoError); let end_time = Instant::now(); let duration = end_time.duration_since(lock_time); @@ -326,8 +297,7 @@ impl PostgresRedoManager { /// fn apply_batch_zenith( &self, - rel: RelishTag, - blknum: u32, + key: Key, lsn: Lsn, base_img: Option, records: &[(Lsn, ZenithWalRecord)], @@ -346,7 +316,7 @@ impl PostgresRedoManager { // Apply all the WAL records in the batch for (record_lsn, record) in records.iter() { - self.apply_record_zenith(rel, blknum, &mut page, *record_lsn, record)?; + self.apply_record_zenith(key, &mut page, *record_lsn, record)?; } // Success! let end_time = Instant::now(); @@ -365,8 +335,7 @@ impl PostgresRedoManager { fn apply_record_zenith( &self, - rel: RelishTag, - blknum: u32, + key: Key, page: &mut BytesMut, _record_lsn: Lsn, record: &ZenithWalRecord, @@ -384,10 +353,11 @@ impl PostgresRedoManager { old_heap_blkno, flags, } => { - // sanity check that this is modifying the correct relish + // sanity check that this is modifying the correct relation + let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; assert!( - check_forknum(&rel, pg_constants::VISIBILITYMAP_FORKNUM), - "ClearVisibilityMapFlags record on unexpected rel {:?}", + rel.forknum == pg_constants::VISIBILITYMAP_FORKNUM, + "ClearVisibilityMapFlags record on unexpected rel {}", rel ); if let Some(heap_blkno) = *new_heap_blkno { @@ -421,6 +391,14 @@ impl PostgresRedoManager { // Non-relational WAL records are handled here, with custom code that has the // same effects as the corresponding Postgres WAL redo function. ZenithWalRecord::ClogSetCommitted { xids } => { + let (slru_kind, segno, blknum) = + key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + assert_eq!( + slru_kind, + SlruKind::Clog, + "ClogSetCommitted record with unexpected key {}", + key + ); for &xid in xids { let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE; let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -428,12 +406,17 @@ impl PostgresRedoManager { // Check that we're modifying the correct CLOG block. 
assert!( - check_slru_segno(&rel, SlruKind::Clog, expected_segno), - "ClogSetCommitted record for XID {} with unexpected rel {:?}", + segno == expected_segno, + "ClogSetCommitted record for XID {} with unexpected key {}", xid, - rel + key + ); + assert!( + blknum == expected_blknum, + "ClogSetCommitted record for XID {} with unexpected key {}", + xid, + key ); - assert!(blknum == expected_blknum); transaction_id_set_status( xid, @@ -443,6 +426,14 @@ impl PostgresRedoManager { } } ZenithWalRecord::ClogSetAborted { xids } => { + let (slru_kind, segno, blknum) = + key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + assert_eq!( + slru_kind, + SlruKind::Clog, + "ClogSetAborted record with unexpected key {}", + key + ); for &xid in xids { let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE; let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -450,17 +441,30 @@ impl PostgresRedoManager { // Check that we're modifying the correct CLOG block. assert!( - check_slru_segno(&rel, SlruKind::Clog, expected_segno), - "ClogSetCommitted record for XID {} with unexpected rel {:?}", + segno == expected_segno, + "ClogSetAborted record for XID {} with unexpected key {}", xid, - rel + key + ); + assert!( + blknum == expected_blknum, + "ClogSetAborted record for XID {} with unexpected key {}", + xid, + key ); - assert!(blknum == expected_blknum); transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); } } ZenithWalRecord::MultixactOffsetCreate { mid, moff } => { + let (slru_kind, segno, blknum) = + key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + assert_eq!( + slru_kind, + SlruKind::MultiXactOffsets, + "MultixactOffsetCreate record with unexpected key {}", + key + ); // Compute the block and offset to modify. // See RecordNewMultiXact in PostgreSQL sources. 
let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; @@ -471,16 +475,29 @@ impl PostgresRedoManager { let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; assert!( - check_slru_segno(&rel, SlruKind::MultiXactOffsets, expected_segno), - "MultiXactOffsetsCreate record for multi-xid {} with unexpected rel {:?}", + segno == expected_segno, + "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", mid, - rel + key + ); + assert!( + blknum == expected_blknum, + "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", + mid, + key ); - assert!(blknum == expected_blknum); LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); } ZenithWalRecord::MultixactMembersCreate { moff, members } => { + let (slru_kind, segno, blknum) = + key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + assert_eq!( + slru_kind, + SlruKind::MultiXactMembers, + "MultixactMembersCreate record with unexpected key {}", + key + ); for (i, member) in members.iter().enumerate() { let offset = moff + i as u32; @@ -495,12 +512,17 @@ impl PostgresRedoManager { let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; assert!( - check_slru_segno(&rel, SlruKind::MultiXactMembers, expected_segno), - "MultiXactMembersCreate record at offset {} with unexpected rel {:?}", + segno == expected_segno, + "MultiXactMembersCreate record for offset {} with unexpected key {}", moff, - rel + key + ); + assert!( + blknum == expected_blknum, + "MultiXactMembersCreate record for offset {} with unexpected key {}", + moff, + key ); - assert!(blknum == expected_blknum); let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]); flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); diff --git a/postgres_ffi/src/pg_constants.rs b/postgres_ffi/src/pg_constants.rs index 76f837cefc..7230b841f5 100644 --- a/postgres_ffi/src/pg_constants.rs +++ b/postgres_ffi/src/pg_constants.rs @@ -24,6 +24,9 @@ pub const VISIBILITYMAP_FORKNUM: u8 = 2; pub const INIT_FORKNUM: u8 = 3; // From storage_xlog.h +pub const XLOG_SMGR_CREATE: u8 = 0x10; +pub const XLOG_SMGR_TRUNCATE: u8 = 0x20; + pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001; pub const SMGR_TRUNCATE_VM: u32 = 0x0002; pub const SMGR_TRUNCATE_FSM: u32 = 0x0004; @@ -113,7 +116,6 @@ pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4; // From pg_control.h and rmgrlist.h pub const XLOG_NEXTOID: u8 = 0x30; pub const XLOG_SWITCH: u8 = 0x40; -pub const XLOG_SMGR_TRUNCATE: u8 = 0x20; pub const XLOG_FPI_FOR_HINT: u8 = 0xA0; pub const XLOG_FPI: u8 = 0xB0; pub const DB_SHUTDOWNED: u32 = 1; diff --git a/test_runner/batch_others/test_snapfiles_gc.py b/test_runner/batch_others/test_snapfiles_gc.py deleted file mode 100644 index d00af53864..0000000000 --- a/test_runner/batch_others/test_snapfiles_gc.py +++ /dev/null @@ -1,130 +0,0 @@ -from contextlib import closing -import psycopg2.extras -from fixtures.utils import print_gc_result -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.log_helper import log - - -# -# Test Garbage Collection of old layer files -# -# This test is pretty tightly coupled with the current implementation of layered -# storage, in layered_repository.rs. 
-# -def test_layerfiles_gc(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_layerfiles_gc", "empty") - pg = env.postgres.create_start('test_layerfiles_gc') - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - - # Get the timeline ID of our branch. We need it for the 'do_gc' command - cur.execute("SHOW zenith.zenith_timeline") - timeline = cur.fetchone()[0] - - # Create a test table - cur.execute("CREATE TABLE foo(x integer)") - cur.execute("INSERT INTO foo VALUES (1)") - - cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass") - row = cur.fetchone() - log.info(f"relfilenode is {row[0]}") - - # Run GC, to clear out any garbage left behind in the catalogs by - # the CREATE TABLE command. We want to have a clean slate with no garbage - # before running the actual tests below, otherwise the counts won't match - # what we expect. - # - # Also run vacuum first to make it less likely that autovacuum or pruning - # kicks in and confuses our numbers. - cur.execute("VACUUM") - - # delete the row, to update the Visibility Map. We don't want the VM - # update to confuse our numbers either. - cur.execute("DELETE FROM foo") - - log.info("Running GC before test") - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - # remember the number of files - layer_relfiles_remain = (row['layer_relfiles_total'] - - row['layer_relfiles_removed']) - assert layer_relfiles_remain > 0 - - # Insert a row and run GC. Checkpoint should freeze the layer - # so that there is only the most recent image layer left for the rel, - # removing the old image and delta layer. - log.info("Inserting one row and running GC") - cur.execute("INSERT INTO foo VALUES (1)") - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 - assert row['layer_relfiles_removed'] == 2 - assert row['layer_relfiles_dropped'] == 0 - - # Insert two more rows and run GC. - # This should create new image and delta layer file with the new contents, and - # then remove the old one image and the just-created delta layer. - log.info("Inserting two more rows and running GC") - cur.execute("INSERT INTO foo VALUES (2)") - cur.execute("INSERT INTO foo VALUES (3)") - - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 - assert row['layer_relfiles_removed'] == 2 - assert row['layer_relfiles_dropped'] == 0 - - # Do it again. Should again create two new layer files and remove old ones. - log.info("Inserting two more rows and running GC") - cur.execute("INSERT INTO foo VALUES (2)") - cur.execute("INSERT INTO foo VALUES (3)") - - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 - assert row['layer_relfiles_removed'] == 2 - assert row['layer_relfiles_dropped'] == 0 - - # Run GC again, with no changes in the database. Should not remove anything. 
- log.info("Run GC again, with nothing to do") - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - assert row['layer_relfiles_total'] == layer_relfiles_remain - assert row['layer_relfiles_removed'] == 0 - assert row['layer_relfiles_dropped'] == 0 - - # - # Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage - # - log.info("Drop table and run GC again") - cur.execute("DROP TABLE foo") - - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - - # We still cannot remove the latest layers - # because they serve as tombstones for earlier layers. - assert row['layer_relfiles_dropped'] == 0 - # Each relation fork is counted separately, hence 3. - assert row['layer_relfiles_needed_as_tombstone'] == 3 - - # The catalog updates also create new layer files of the catalogs, which - # are counted as 'removed' - assert row['layer_relfiles_removed'] > 0 - - # TODO Change the test to check actual CG of dropped layers. - # Each relation fork is counted separately, hence 3. - #assert row['layer_relfiles_dropped'] == 3 - - # TODO: perhaps we should count catalog and user relations separately, - # to make this kind of testing more robust diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 236c225bfb..58f7294eb5 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -74,8 +74,5 @@ def lsn_from_hex(lsn_hex: str) -> int: def print_gc_result(row): log.info("GC duration {elapsed} ms".format_map(row)) log.info( - " REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}" - .format_map(row)) - log.info( - " NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}" + " total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}" .format_map(row)) diff --git a/vendor/postgres b/vendor/postgres index 093aa160e5..756a01aade 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 093aa160e5df19814ff19b995d36dd5ee03c7f8b +Subproject commit 756a01aade765d1d2ac115e7e189865ff697222b From 75002adc14b93a0c80b124f3677c04ae072dd739 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 28 Mar 2022 18:27:28 +0400 Subject: [PATCH 0073/1022] Make shared_buffers large in test_pageserver_catchup. We intentionally write while pageserver is down, so we shouldn't query it. 
Noticed by @petuhovskiy at https://github.com/zenithdb/postgres/pull/141#issuecomment-1080261700 --- test_runner/batch_others/test_pageserver_catchup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/batch_others/test_pageserver_catchup.py b/test_runner/batch_others/test_pageserver_catchup.py index 3c4b7f9569..758b018046 100644 --- a/test_runner/batch_others/test_pageserver_catchup.py +++ b/test_runner/batch_others/test_pageserver_catchup.py @@ -10,7 +10,9 @@ def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuil env = zenith_env_builder.init_start() env.zenith_cli.create_branch('test_pageserver_catchup_while_compute_down') - pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down') + # Make shared_buffers large to ensure we won't query pageserver while it is down. + pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down', + config_lines=['shared_buffers=512MB']) pg_conn = pg.connect() cur = pg_conn.cursor() From 780b46ad270c66960f3f4de8468891b4b030507e Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 28 Mar 2022 18:11:48 +0400 Subject: [PATCH 0074/1022] Bump vendor/postgres to fix commit_lsn going backwards. --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 756a01aade..19164aeacf 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 756a01aade765d1d2ac115e7e189865ff697222b +Subproject commit 19164aeacfd877ef75d67e70a71647f5d4c0cd2f From a8832024953d3bb6da5da76f8dd2007433119b87 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 28 Mar 2022 18:56:36 +0300 Subject: [PATCH 0075/1022] Enable S3 for pageserver on staging Follow-up for #1417. Previously we had a problem uploading to S3 due to huge ammount of existing not yet uploaded data. Now we have a fresh pageserver with LSM storage on staging, so we can try enabling it once again. --- .circleci/ansible/deploy.yaml | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index 020a852a00..09aca8539e 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -63,20 +63,19 @@ tags: - pageserver - # Temporary disabled until LSM storage rewrite lands - # - name: update config - # when: current_version > remote_version or force_deploy - # lineinfile: - # path: /storage/pageserver/data/pageserver.toml - # line: "{{ item }}" - # loop: - # - "[remote_storage]" - # - "bucket_name = '{{ bucket_name }}'" - # - "bucket_region = '{{ bucket_region }}'" - # - "prefix_in_bucket = '{{ inventory_hostname }}'" - # become: true - # tags: - # - pageserver + - name: update remote storage (s3) config + when: current_version > remote_version or force_deploy + lineinfile: + path: /storage/pageserver/data/pageserver.toml + line: "{{ item }}" + loop: + - "[remote_storage]" + - "bucket_name = '{{ bucket_name }}'" + - "bucket_region = '{{ bucket_region }}'" + - "prefix_in_bucket = '{{ inventory_hostname }}'" + become: true + tags: + - pageserver - name: upload systemd service definition ansible.builtin.template: From 8a901de52a270b8bf8a97a256527037fb0031276 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Sat, 12 Mar 2022 20:28:44 +0000 Subject: [PATCH 0076/1022] Refactor control file update at safekeeper. Record global_commit_lsn, have common routine for control file update, add SafekeeperMemstate. 
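
(Editor's note: the sketch below is not part of the patch. It is a simplified illustration of the in-memory/persistent split this commit introduces: `SafekeeperMemState` mirrors the control-file fields and is advanced on every message, while a single `persist_control_file()` routine copies it into the durable `SafeKeeperState` and writes that out. `Lsn` is reduced to a plain `u64`, the control-file write is replaced by a `println!`, and `advance_peer_horizon` plus the 16 MiB segment size in `main` are illustrative assumptions, not names from the diff.)

    // Durable part: what is actually stored in the control file on disk.
    #[derive(Default)]
    struct SafeKeeperState {
        commit_lsn: u64,
        peer_horizon_lsn: u64,
    }

    // In-memory mirror of the same fields; cheap to update, may run ahead
    // of the durable copy until persist_control_file() is called.
    #[derive(Default)]
    struct SafekeeperMemState {
        commit_lsn: u64,
        peer_horizon_lsn: u64,
    }

    struct SafeKeeper {
        inmem: SafekeeperMemState,
        s: SafeKeeperState,
    }

    impl SafeKeeper {
        // Common routine for control file update: copy the in-memory values
        // into the durable struct, then write it out (stubbed here).
        fn persist_control_file(&mut self) {
            self.s.commit_lsn = self.inmem.commit_lsn;
            self.s.peer_horizon_lsn = self.inmem.peer_horizon_lsn;
            println!(
                "persisted commit_lsn={} peer_horizon_lsn={}",
                self.s.commit_lsn, self.s.peer_horizon_lsn
            );
        }

        // Advance the in-memory peer horizon; flush to disk only when the
        // durable copy lags by more than one WAL segment, so routine
        // advances do not each cost an extra fsync.
        fn advance_peer_horizon(&mut self, new_horizon: u64, wal_seg_size: u64) {
            self.inmem.peer_horizon_lsn = new_horizon;
            if self.s.peer_horizon_lsn + wal_seg_size < self.inmem.peer_horizon_lsn {
                self.persist_control_file();
            }
        }
    }

    fn main() {
        let mut sk = SafeKeeper {
            inmem: Default::default(),
            s: Default::default(),
        };
        // Only the second call crosses the one-segment threshold and persists.
        sk.advance_peer_horizon(1_000, 16 * 1024 * 1024);
        sk.advance_peer_horizon(20_000_000, 16 * 1024 * 1024);
    }

In the real patch the same pattern additionally forces a control-file sync when commit_lsn reaches epoch_start_lsn or when the very first commit_lsn arrives; the sketch omits those cases.
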
--- walkeeper/src/safekeeper.rs | 133 +++++++++++++++++++++++------------- walkeeper/src/timeline.rs | 4 +- 2 files changed, 87 insertions(+), 50 deletions(-) diff --git a/walkeeper/src/safekeeper.rs b/walkeeper/src/safekeeper.rs index 53fd6f5588..8300b32b42 100644 --- a/walkeeper/src/safekeeper.rs +++ b/walkeeper/src/safekeeper.rs @@ -202,6 +202,14 @@ pub struct SafeKeeperState { pub peers: Peers, } +#[derive(Debug, Clone)] +// In memory safekeeper state. Fields mirror ones in `SafeKeeperState`; they are +// not flushed yet. +pub struct SafekeeperMemState { + pub commit_lsn: Lsn, + pub peer_horizon_lsn: Lsn, +} + impl SafeKeeperState { pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { SafeKeeperState { @@ -470,14 +478,12 @@ struct SafeKeeperMetrics { } impl SafeKeeperMetrics { - fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId, commit_lsn: Lsn) -> Self { + fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Self { let tenant_id = tenant_id.to_string(); let timeline_id = timeline_id.to_string(); - let m = Self { + Self { commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&[&tenant_id, &timeline_id]), - }; - m.commit_lsn.set(u64::from(commit_lsn) as f64); - m + } } } @@ -487,9 +493,14 @@ pub struct SafeKeeper { // Cached metrics so we don't have to recompute labels on each update. metrics: SafeKeeperMetrics, - /// not-yet-flushed pairs of same named fields in s.* - pub commit_lsn: Lsn, - pub peer_horizon_lsn: Lsn, + /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn. + global_commit_lsn: Lsn, + /// LSN since the proposer safekeeper currently talking to appends WAL; + /// determines epoch switch point. + epoch_start_lsn: Lsn, + + pub inmem: SafekeeperMemState, // in memory part + pub s: SafeKeeperState, // persistent part pub control_store: CTRL, @@ -513,9 +524,13 @@ where } SafeKeeper { - metrics: SafeKeeperMetrics::new(state.tenant_id, ztli, state.commit_lsn), - commit_lsn: state.commit_lsn, - peer_horizon_lsn: state.peer_horizon_lsn, + metrics: SafeKeeperMetrics::new(state.tenant_id, ztli), + global_commit_lsn: state.commit_lsn, + epoch_start_lsn: Lsn(0), + inmem: SafekeeperMemState { + commit_lsn: state.commit_lsn, + peer_horizon_lsn: state.peer_horizon_lsn, + }, s: state, control_store, wal_store, @@ -602,9 +617,6 @@ where // pass wal_seg_size to read WAL and find flush_lsn self.wal_store.init_storage(&self.s)?; - // update tenant_id/timeline_id in metrics - self.metrics = SafeKeeperMetrics::new(msg.tenant_id, msg.ztli, self.commit_lsn); - info!( "processed greeting from proposer {:?}, sending term {:?}", msg.proposer_id, self.s.acceptor_state.term @@ -684,12 +696,49 @@ where Ok(None) } + /// Advance commit_lsn taking into account what we have locally + fn update_commit_lsn(&mut self) -> Result<()> { + let commit_lsn = min(self.global_commit_lsn, self.wal_store.flush_lsn()); + assert!(commit_lsn >= self.inmem.commit_lsn); + + self.inmem.commit_lsn = commit_lsn; + self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); + + // If new commit_lsn reached epoch switch, force sync of control + // file: walproposer in sync mode is very interested when this + // happens. Note: this is for sync-safekeepers mode only, as + // otherwise commit_lsn might jump over epoch_start_lsn. + // Also note that commit_lsn can reach epoch_start_lsn earlier + // that we receive new epoch_start_lsn, and we still need to sync + // control file in this case. 
+ if commit_lsn == self.epoch_start_lsn && self.s.commit_lsn != commit_lsn { + self.persist_control_file()?; + } + + // We got our first commit_lsn, which means we should sync + // everything to disk, to initialize the state. + if self.s.commit_lsn == Lsn(0) && commit_lsn > Lsn(0) { + self.wal_store.flush_wal()?; + self.persist_control_file()?; + } + + Ok(()) + } + + /// Persist in-memory state to the disk. + fn persist_control_file(&mut self) -> Result<()> { + self.s.commit_lsn = self.inmem.commit_lsn; + self.s.peer_horizon_lsn = self.inmem.peer_horizon_lsn; + + self.control_store.persist(&self.s) + } + /// Handle request to append WAL. #[allow(clippy::comparison_chain)] fn handle_append_request( &mut self, msg: &AppendRequest, - mut require_flush: bool, + require_flush: bool, ) -> Result> { if self.s.acceptor_state.term < msg.h.term { bail!("got AppendRequest before ProposerElected"); @@ -701,25 +750,22 @@ where return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); } - // After ProposerElected, which performs truncation, we should get only - // indeed append requests (but flush_lsn is advanced only on record - // boundary, so might be less). - assert!(self.wal_store.flush_lsn() <= msg.h.begin_lsn); + // Now we know that we are in the same term as the proposer, + // processing the message. + self.epoch_start_lsn = msg.h.epoch_start_lsn; + // TODO: don't update state without persisting to disk self.s.proposer_uuid = msg.h.proposer_uuid; - let mut sync_control_file = false; // do the job if !msg.wal_data.is_empty() { self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?; - // If this was the first record we ever receieved, initialize + // If this was the first record we ever received, initialize // commit_lsn to help find_end_of_wal skip the hole in the // beginning. - if self.s.commit_lsn == Lsn(0) { - self.s.commit_lsn = msg.h.begin_lsn; - sync_control_file = true; - require_flush = true; + if self.global_commit_lsn == Lsn(0) { + self.global_commit_lsn = msg.h.begin_lsn; } } @@ -728,35 +774,22 @@ where self.wal_store.flush_wal()?; } - // Advance commit_lsn taking into account what we have locally. - // commit_lsn can be 0, being unknown to new walproposer while he hasn't - // collected majority of its epoch acks yet, ignore it in this case. + // Update global_commit_lsn, verifying that it cannot decrease. if msg.h.commit_lsn != Lsn(0) { - let commit_lsn = min(msg.h.commit_lsn, self.wal_store.flush_lsn()); - // If new commit_lsn reached epoch switch, force sync of control - // file: walproposer in sync mode is very interested when this - // happens. Note: this is for sync-safekeepers mode only, as - // otherwise commit_lsn might jump over epoch_start_lsn. - sync_control_file |= commit_lsn == msg.h.epoch_start_lsn; - self.commit_lsn = commit_lsn; - self.metrics - .commit_lsn - .set(u64::from(self.commit_lsn) as f64); + assert!(msg.h.commit_lsn >= self.global_commit_lsn); + self.global_commit_lsn = msg.h.commit_lsn; } - self.peer_horizon_lsn = msg.h.truncate_lsn; + self.inmem.peer_horizon_lsn = msg.h.truncate_lsn; + self.update_commit_lsn()?; + // Update truncate and commit LSN in control file. // To avoid negative impact on performance of extra fsync, do it only // when truncate_lsn delta exceeds WAL segment size. 
- sync_control_file |= - self.s.peer_horizon_lsn + (self.s.server.wal_seg_size as u64) < self.peer_horizon_lsn; - if sync_control_file { - self.s.commit_lsn = self.commit_lsn; - self.s.peer_horizon_lsn = self.peer_horizon_lsn; - } - - if sync_control_file { - self.control_store.persist(&self.s)?; + if self.s.peer_horizon_lsn + (self.s.server.wal_seg_size as u64) + < self.inmem.peer_horizon_lsn + { + self.persist_control_file()?; } trace!( @@ -780,6 +813,10 @@ where /// Flush WAL to disk. Return AppendResponse with latest LSNs. fn handle_flush(&mut self) -> Result> { self.wal_store.flush_wal()?; + + // commit_lsn can be updated because we have new flushed data locally. + self.update_commit_lsn()?; + Ok(Some(AcceptorProposerMessage::AppendResponse( self.append_response(), ))) diff --git a/walkeeper/src/timeline.rs b/walkeeper/src/timeline.rs index ea8308b95e..b53f2e086b 100644 --- a/walkeeper/src/timeline.rs +++ b/walkeeper/src/timeline.rs @@ -340,7 +340,7 @@ impl Timeline { let replica_state = shared_state.replicas[replica_id].unwrap(); let deactivate = shared_state.notified_commit_lsn == Lsn(0) || // no data at all yet (replica_state.last_received_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. - replica_state.last_received_lsn >= shared_state.sk.commit_lsn); + replica_state.last_received_lsn >= shared_state.sk.inmem.commit_lsn); if deactivate { shared_state.deactivate(&self.zttid, callmemaybe_tx)?; return Ok(true); @@ -394,7 +394,7 @@ impl Timeline { rmsg = shared_state.sk.process_msg(msg)?; // locally available commit lsn. flush_lsn can be smaller than // commit_lsn if we are catching up safekeeper. - commit_lsn = shared_state.sk.commit_lsn; + commit_lsn = shared_state.sk.inmem.commit_lsn; // if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { From d88f8b4a7e0b8251db36b7ed1dad4888765e3b83 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 28 Mar 2022 20:47:55 +0300 Subject: [PATCH 0077/1022] Fix storage deploy condition in ansible playbook --- .circleci/ansible/deploy.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index 09aca8539e..3540f01fcb 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -64,7 +64,6 @@ - pageserver - name: update remote storage (s3) config - when: current_version > remote_version or force_deploy lineinfile: path: /storage/pageserver/data/pageserver.toml line: "{{ item }}" From 9a4f0930c02906bdce0806db6dceed44c48e0c66 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 28 Mar 2022 22:10:15 +0300 Subject: [PATCH 0078/1022] Turn off S3 for pageserver on staging --- .circleci/ansible/deploy.yaml | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index 3540f01fcb..b7ffd075a0 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -63,18 +63,21 @@ tags: - pageserver - - name: update remote storage (s3) config - lineinfile: - path: /storage/pageserver/data/pageserver.toml - line: "{{ item }}" - loop: - - "[remote_storage]" - - "bucket_name = '{{ bucket_name }}'" - - "bucket_region = '{{ bucket_region }}'" - - "prefix_in_bucket = '{{ inventory_hostname }}'" - become: true - tags: - - pageserver + # It seems that currently S3 integration does not play well + # even with fresh pageserver without 
a burden of old data. + # TODO: turn this back on once the issue is solved. + # - name: update remote storage (s3) config + # lineinfile: + # path: /storage/pageserver/data/pageserver.toml + # line: "{{ item }}" + # loop: + # - "[remote_storage]" + # - "bucket_name = '{{ bucket_name }}'" + # - "bucket_region = '{{ bucket_region }}'" + # - "prefix_in_bucket = '{{ inventory_hostname }}'" + # become: true + # tags: + # - pageserver - name: upload systemd service definition ansible.builtin.template: From 1aa57fc262bebb52b78dfa4054bdf9e8bd9cb48c Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Mon, 28 Mar 2022 12:07:23 -0700 Subject: [PATCH 0079/1022] Fix tone down compact log chatter Signed-off-by: Dhammika Pathirana --- pageserver/src/layered_repository.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 837298a10e..a0f1f2d830 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1628,6 +1628,9 @@ impl LayeredTimeline { }; let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; + if num_deltas == 0 { + continue; + } info!( "range {}-{}, has {} deltas on this timeline", From 0e44887929daa9851fb0c6239d1011c41cde04b8 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 28 Mar 2022 22:33:05 +0300 Subject: [PATCH 0080/1022] Show more S3 logs and less verbove WAL logs --- pageserver/src/config.rs | 2 +- pageserver/src/layered_repository.rs | 2 +- pageserver/src/remote_storage/storage_sync.rs | 47 ++++++++++++------- pageserver/src/walreceiver.rs | 2 +- 4 files changed, 33 insertions(+), 20 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 0fdfb4ceed..9f7cd34a7a 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -41,7 +41,7 @@ pub mod defaults { pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "zenith_admin"; - pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 100; + pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 10; pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index a0f1f2d830..56d14fd4e9 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1594,7 +1594,7 @@ impl LayeredTimeline { self.compact_level0(target_file_size)?; timer.stop_and_record(); } else { - info!("Could not compact because no partitioning specified yet"); + debug!("Could not compact because no partitioning specified yet"); } // Call unload() on all frozen layers, to release memory. diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index ddd47ea981..cd6c40b46f 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -443,30 +443,38 @@ fn storage_sync_loop< max_sync_errors: NonZeroU32, ) { let remote_assets = Arc::new((storage, index.clone())); + info!("Starting remote storage sync loop"); loop { let index = index.clone(); let loop_step = runtime.block_on(async { tokio::select! 
{ - new_timeline_states = loop_step( + step = loop_step( conf, &mut receiver, Arc::clone(&remote_assets), max_concurrent_sync, max_sync_errors, ) - .instrument(debug_span!("storage_sync_loop_step")) => LoopStep::SyncStatusUpdates(new_timeline_states), + .instrument(debug_span!("storage_sync_loop_step")) => step, _ = thread_mgr::shutdown_watcher() => LoopStep::Shutdown, } }); match loop_step { LoopStep::SyncStatusUpdates(new_timeline_states) => { - // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. - apply_timeline_sync_status_updates(conf, index, new_timeline_states); - debug!("Sync loop step completed"); + if new_timeline_states.is_empty() { + debug!("Sync loop step completed, no new timeline states"); + } else { + info!( + "Sync loop step completed, {} new timeline state update(s)", + new_timeline_states.len() + ); + // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. + apply_timeline_sync_status_updates(conf, index, new_timeline_states); + } } LoopStep::Shutdown => { - debug!("Shutdown requested, stopping"); + info!("Shutdown requested, stopping"); break; } } @@ -482,7 +490,7 @@ async fn loop_step< remote_assets: Arc<(S, RemoteIndex)>, max_concurrent_sync: NonZeroUsize, max_sync_errors: NonZeroU32, -) -> HashMap> { +) -> LoopStep { let max_concurrent_sync = max_concurrent_sync.get(); let mut next_tasks = Vec::new(); @@ -490,8 +498,7 @@ async fn loop_step< if let Some(first_task) = sync_queue::next_task(receiver).await { next_tasks.push(first_task); } else { - debug!("Shutdown requested, stopping"); - return HashMap::new(); + return LoopStep::Shutdown; }; next_tasks.extend( sync_queue::next_task_batch(receiver, max_concurrent_sync - 1) @@ -500,12 +507,17 @@ async fn loop_step< ); let remaining_queue_length = sync_queue::len(); - debug!( - "Processing {} tasks in batch, more tasks left to process: {}", - next_tasks.len(), - remaining_queue_length - ); REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); + if remaining_queue_length > 0 || !next_tasks.is_empty() { + info!( + "Processing {} tasks in batch, more tasks left to process: {}", + next_tasks.len(), + remaining_queue_length + ); + } else { + debug!("No tasks to process"); + return LoopStep::SyncStatusUpdates(HashMap::new()); + } let mut task_batch = next_tasks .into_iter() @@ -515,8 +527,9 @@ async fn loop_step< let sync_name = task.kind.sync_name(); let extra_step = match tokio::spawn( - process_task(conf, Arc::clone(&remote_assets), task, max_sync_errors) - .instrument(debug_span!("", sync_id = %sync_id, attempt, sync_name)), + process_task(conf, Arc::clone(&remote_assets), task, max_sync_errors).instrument( + debug_span!("process_sync_task", sync_id = %sync_id, attempt, sync_name), + ), ) .await { @@ -551,7 +564,7 @@ async fn loop_step< } } - new_timeline_states + LoopStep::SyncStatusUpdates(new_timeline_states) } async fn process_task< diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index e382475627..6de0b87478 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -70,7 +70,7 @@ pub fn launch_wal_receiver( match receivers.get_mut(&(tenantid, timelineid)) { Some(receiver) => { - info!("wal receiver already running, updating connection string"); + debug!("wal receiver already running, updating connection string"); receiver.wal_producer_connstr = wal_producer_connstr.into(); } None => { From be6a6958e26b2eae54fe00fd282772222d44b728 
Mon Sep 17 00:00:00 2001 From: Anton Shyrabokau <97127717+antons-antons@users.noreply.github.com> Date: Mon, 28 Mar 2022 18:19:20 -0700 Subject: [PATCH 0081/1022] CI: rebuild postgres when Makefile changes (#1429) --- .circleci/config.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8faa69d64e..4a03cbf3b5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -34,10 +34,13 @@ jobs: - checkout # Grab the postgres git revision to build a cache key. + # Append makefile as it could change the way postgres is built. # Note this works even though the submodule hasn't been checkout out yet. - run: name: Get postgres cache key - command: git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres + command: | + git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres + cat Makefile >> /tmp/cache-key-postgres - restore_cache: name: Restore postgres cache @@ -78,11 +81,14 @@ jobs: - checkout # Grab the postgres git revision to build a cache key. + # Append makefile as it could change the way postgres is built. # Note this works even though the submodule hasn't been checkout out yet. - run: name: Get postgres cache key command: | git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres + cat Makefile >> /tmp/cache-key-postgres + - restore_cache: name: Restore postgres cache From fd78110c2bd22fa2fdb4a3191df542b697858528 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 29 Mar 2022 09:57:00 +0300 Subject: [PATCH 0082/1022] Add default statement_timeout for tests (#1423) --- test_runner/fixtures/zenith_fixtures.py | 36 +++++++++++++++---------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 08ac09ee4c..2da021a49c 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -257,7 +257,8 @@ class PgProtocol: dbname: Optional[str] = None, schema: Optional[str] = None, username: Optional[str] = None, - password: Optional[str] = None) -> str: + password: Optional[str] = None, + statement_timeout_ms: Optional[int] = None) -> str: """ Build a libpq connection string for the Postgres instance. """ @@ -277,16 +278,23 @@ class PgProtocol: if schema: res = f"{res} options='-c search_path={schema}'" + if statement_timeout_ms: + res = f"{res} options='-c statement_timeout={statement_timeout_ms}'" + return res # autocommit=True here by default because that's what we need most of the time - def connect(self, - *, - autocommit=True, - dbname: Optional[str] = None, - schema: Optional[str] = None, - username: Optional[str] = None, - password: Optional[str] = None) -> PgConnection: + def connect( + self, + *, + autocommit=True, + dbname: Optional[str] = None, + schema: Optional[str] = None, + username: Optional[str] = None, + password: Optional[str] = None, + # individual statement timeout in seconds, 2 minutes should be enough for our tests + statement_timeout: Optional[int] = 120 + ) -> PgConnection: """ Connect to the node. Returns psycopg2's connection object. @@ -294,12 +302,12 @@ class PgProtocol: """ conn = psycopg2.connect( - self.connstr( - dbname=dbname, - schema=schema, - username=username, - password=password, - )) + self.connstr(dbname=dbname, + schema=schema, + username=username, + password=password, + statement_timeout_ms=statement_timeout * + 1000 if statement_timeout else None)) # WARNING: this setting affects *all* tests! 
conn.autocommit = autocommit return conn From eee0f51e0c3ea2d52269741124b68b8dac0e051c Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 28 Mar 2022 11:39:15 +0300 Subject: [PATCH 0083/1022] use cargo-hakari to manage workspace_hack crate workspace_hack is needed to avoid recompilation when different crates inside the workspace depend on the same packages but with different features being enabled. Problem occurs when you build crates separately one by one. So this is irrelevant to our CI setup because there we build all binaries at once, but it may be relevant for local development. this also changes cargo's resolver version to 2 --- .config/hakari.toml | 24 ++++++++++++++++ Cargo.lock | 15 ++++++++++ Cargo.toml | 1 + compute_tools/Cargo.toml | 1 + control_plane/Cargo.toml | 2 +- docs/sourcetree.md | 2 ++ pageserver/Cargo.toml | 2 +- postgres_ffi/Cargo.toml | 2 +- proxy/Cargo.toml | 1 + walkeeper/Cargo.toml | 2 +- workspace_hack/Cargo.toml | 60 ++++++++++++++++++++++++++++----------- workspace_hack/src/lib.rs | 24 +--------------- zenith/Cargo.toml | 2 +- zenith_metrics/Cargo.toml | 1 + zenith_utils/Cargo.toml | 2 +- 15 files changed, 96 insertions(+), 45 deletions(-) create mode 100644 .config/hakari.toml diff --git a/.config/hakari.toml b/.config/hakari.toml new file mode 100644 index 0000000000..7bccc6c4a3 --- /dev/null +++ b/.config/hakari.toml @@ -0,0 +1,24 @@ +# This file contains settings for `cargo hakari`. +# See https://docs.rs/cargo-hakari/latest/cargo_hakari/config for a full list of options. + +hakari-package = "workspace_hack" + +# Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above. +dep-format-version = "2" + +# Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended. +# Hakari works much better with the new feature resolver. +# For more about the new feature resolver, see: +# https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#cargos-new-feature-resolver +resolver = "2" + +# Add triples corresponding to platforms commonly used by developers here. +# https://doc.rust-lang.org/rustc/platform-support.html +platforms = [ + # "x86_64-unknown-linux-gnu", + # "x86_64-apple-darwin", + # "x86_64-pc-windows-msvc", +] + +# Write out exact versions rather than a semver range. (Defaults to false.) 
+# exact-versions = true diff --git a/Cargo.lock b/Cargo.lock index 290d715f2c..40f4358d98 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -407,6 +407,7 @@ dependencies = [ "serde_json", "tar", "tokio", + "workspace_hack", ] [[package]] @@ -1803,6 +1804,7 @@ dependencies = [ "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", "tokio-postgres-rustls", "tokio-rustls 0.22.0", + "workspace_hack", "zenith_metrics", "zenith_utils", ] @@ -3041,7 +3043,14 @@ dependencies = [ name = "workspace_hack" version = "0.1.0" dependencies = [ + "anyhow", + "bytes", + "cc", + "clap 2.34.0", + "either", + "hashbrown 0.11.2", "libc", + "log", "memchr", "num-integer", "num-traits", @@ -3049,8 +3058,13 @@ dependencies = [ "quote", "regex", "regex-syntax", + "reqwest", + "scopeguard", "serde", "syn", + "tokio", + "tracing", + "tracing-core", ] [[package]] @@ -3101,6 +3115,7 @@ dependencies = [ "libc", "once_cell", "prometheus", + "workspace_hack", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index b20e64a06f..f3ac36dcb2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ members = [ "zenith_metrics", "zenith_utils", ] +resolver = "2" [profile.release] # This is useful for profiling and, to some extent, debug. diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 3adf762dcb..4ecf7f6499 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -17,3 +17,4 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1" tar = "0.4" tokio = { version = "1", features = ["macros", "rt", "rt-multi-thread"] } +workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index b52c7ad5a9..e118ea4793 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -20,4 +20,4 @@ reqwest = { version = "0.11", default-features = false, features = ["blocking", pageserver = { path = "../pageserver" } walkeeper = { path = "../walkeeper" } zenith_utils = { path = "../zenith_utils" } -workspace_hack = { path = "../workspace_hack" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 8d35d35f2f..89b07de8d2 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -67,6 +67,8 @@ For more detailed info, see `/walkeeper/README` `/workspace_hack`: The workspace_hack crate exists only to pin down some dependencies. +We use [cargo-hakari](https://crates.io/crates/cargo-hakari) for automation. + `/zenith` Main entry point for the 'zenith' CLI utility. 
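A rough sketch of the regeneration workflow the sourcetree note above refers to. Only `cargo hakari generate` is named by the generated hakari.toml header; the other commands are assumptions about the cargo-hakari CLI and should be checked against https://crates.io/crates/cargo-hakari before relying on them.

    # one-time install of the generator (assumed to come from crates.io)
    cargo install cargo-hakari

    # regenerate workspace_hack/Cargo.toml after dependency or feature changes
    cargo hakari generate

    # refresh the `workspace_hack = { version = "0.1", path = "../workspace_hack" }`
    # lines in member crates (assumed subcommand; see the cargo-hakari docs)
    cargo hakari manage-deps
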
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index de22d0dd77..14eae31da8 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -51,7 +51,7 @@ async-compression = {version = "0.3", features = ["zstd", "tokio"]} postgres_ffi = { path = "../postgres_ffi" } zenith_metrics = { path = "../zenith_metrics" } zenith_utils = { path = "../zenith_utils" } -workspace_hack = { path = "../workspace_hack" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] hex-literal = "0.3" diff --git a/postgres_ffi/Cargo.toml b/postgres_ffi/Cargo.toml index 17f1ecd666..e8d471cb12 100644 --- a/postgres_ffi/Cargo.toml +++ b/postgres_ffi/Cargo.toml @@ -17,8 +17,8 @@ log = "0.4.14" memoffset = "0.6.2" thiserror = "1.0" serde = { version = "1.0", features = ["derive"] } -workspace_hack = { path = "../workspace_hack" } zenith_utils = { path = "../zenith_utils" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } [build-dependencies] bindgen = "0.59.1" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index dda018a1d8..72c394dad4 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -29,6 +29,7 @@ tokio-rustls = "0.22.0" zenith_utils = { path = "../zenith_utils" } zenith_metrics = { path = "../zenith_metrics" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] tokio-postgres-rustls = "0.8.0" diff --git a/walkeeper/Cargo.toml b/walkeeper/Cargo.toml index 193fc4acf6..f59c24816d 100644 --- a/walkeeper/Cargo.toml +++ b/walkeeper/Cargo.toml @@ -29,9 +29,9 @@ const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } postgres_ffi = { path = "../postgres_ffi" } -workspace_hack = { path = "../workspace_hack" } zenith_metrics = { path = "../zenith_metrics" } zenith_utils = { path = "../zenith_utils" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] tempfile = "3.2" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 48d81bbc07..6e6a0e09d7 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -1,22 +1,50 @@ +# This file is generated by `cargo hakari`. +# To regenerate, run: +# cargo hakari generate + [package] name = "workspace_hack" version = "0.1.0" -edition = "2021" +description = "workspace-hack package, managed by hakari" +# You can choose to publish this crate: see https://docs.rs/cargo-hakari/latest/cargo_hakari/publishing. +publish = false -[target.'cfg(all())'.dependencies] -libc = { version = "0.2", features = ["default", "extra_traits", "std"] } -memchr = { version = "2", features = ["default", "std", "use_std"] } +# The parts of the file between the BEGIN HAKARI SECTION and END HAKARI SECTION comments +# are managed by hakari. 
+ +### BEGIN HAKARI SECTION +[dependencies] +anyhow = { version = "1", features = ["backtrace", "std"] } +bytes = { version = "1", features = ["serde", "std"] } +clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } +either = { version = "1", features = ["use_std"] } +hashbrown = { version = "0.11", features = ["ahash", "inline-more", "raw"] } +libc = { version = "0.2", features = ["extra_traits", "std"] } +log = { version = "0.4", default-features = false, features = ["serde", "std"] } +memchr = { version = "2", features = ["std", "use_std"] } num-integer = { version = "0.1", default-features = false, features = ["std"] } -num-traits = { version = "0.2", default-features = false, features = ["std"] } -regex = { version = "1", features = ["aho-corasick", "default", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } -regex-syntax = { version = "0.6", features = ["default", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } -serde = { version = "1", features = ["default", "derive", "serde_derive", "std"] } +num-traits = { version = "0.2", features = ["std"] } +regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } +regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } +reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "hyper-rustls", "json", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "stream", "tokio-rustls", "tokio-util", "webpki-roots"] } +scopeguard = { version = "1", features = ["use_std"] } +serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } +tokio = { version = "1", features = ["bytes", "fs", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "sync", "time", "tokio-macros"] } +tracing = { version = "0.1", features = ["attributes", "std", "tracing-attributes"] } +tracing-core = { version = "0.1", features = ["lazy_static", "std"] } -[target.'cfg(all())'.build-dependencies] -libc = { version = "0.2", features = ["default", "extra_traits", "std"] } -memchr = { version = "2", features = ["default", "std", "use_std"] } -proc-macro2 = { version = "1", features = ["default", "proc-macro"] } -quote = { version = "1", features = ["default", "proc-macro"] } -regex = { version = "1", features = ["aho-corasick", "default", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } -regex-syntax = { version = "0.6", features = ["default", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } -syn = { version = "1", features = ["clone-impls", "default", "derive", "full", "parsing", "printing", "proc-macro", "quote", "visit", "visit-mut"] } 
+[build-dependencies] +cc = { version = "1", default-features = false, features = ["jobserver", "parallel"] } +clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } +either = { version = "1", features = ["use_std"] } +libc = { version = "0.2", features = ["extra_traits", "std"] } +log = { version = "0.4", default-features = false, features = ["serde", "std"] } +memchr = { version = "2", features = ["std", "use_std"] } +proc-macro2 = { version = "1", features = ["proc-macro"] } +quote = { version = "1", features = ["proc-macro"] } +regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } +regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } +serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } +syn = { version = "1", features = ["clone-impls", "derive", "extra-traits", "full", "parsing", "printing", "proc-macro", "quote", "visit", "visit-mut"] } + +### END HAKARI SECTION diff --git a/workspace_hack/src/lib.rs b/workspace_hack/src/lib.rs index ceba3d145d..22489f632b 100644 --- a/workspace_hack/src/lib.rs +++ b/workspace_hack/src/lib.rs @@ -1,23 +1 @@ -//! This crate contains no code. -//! -//! The workspace_hack crate exists only to pin down some dependencies, -//! so that those dependencies always build with the same features, -//! under a few different cases that can be problematic: -//! - Running `cargo check` or `cargo build` from a crate sub-directory -//! instead of the workspace root. -//! - Running `cargo install`, which can only be done per-crate -//! -//! The dependency lists in Cargo.toml were automatically generated by -//! a tool called -//! [Hakari](https://github.com/facebookincubator/cargo-guppy/tree/main/tools/hakari). -//! -//! Hakari doesn't have a CLI yet; in the meantime the example code in -//! their `README` file is enough to regenerate the dependencies. -//! Hakari's output was pasted into Cargo.toml, except for the -//! following manual edits: -//! - `winapi` dependency was removed. This is probably just due to the -//! fact that Hakari's target analysis is incomplete. -//! -//! There isn't any penalty to this data falling out of date; it just -//! means that under the conditions above Cargo will rebuild more -//! packages than strictly necessary. +// This is a stub lib.rs. 
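As the commit message above notes, the rebuild problem only appears when member crates are built one by one, which mostly matters for local development. A hypothetical session illustrating it (crate names are the workspace's own; no specific rebuild behaviour is claimed):

    # Per-crate builds resolve features per crate, so shared dependencies can be
    # compiled with different feature sets and get rebuilt between invocations:
    cargo build -p pageserver
    cargo build -p proxy

    # With workspace_hack depended on by every member crate, both builds see the
    # same unified feature set, so the second build can reuse the first one's artifacts.
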
diff --git a/zenith/Cargo.toml b/zenith/Cargo.toml index 8adbda0723..74aeffb51c 100644 --- a/zenith/Cargo.toml +++ b/zenith/Cargo.toml @@ -15,4 +15,4 @@ control_plane = { path = "../control_plane" } walkeeper = { path = "../walkeeper" } postgres_ffi = { path = "../postgres_ffi" } zenith_utils = { path = "../zenith_utils" } -workspace_hack = { path = "../workspace_hack" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/zenith_metrics/Cargo.toml b/zenith_metrics/Cargo.toml index 0c921ede0b..906c5a1d64 100644 --- a/zenith_metrics/Cargo.toml +++ b/zenith_metrics/Cargo.toml @@ -8,3 +8,4 @@ prometheus = {version = "0.13", default_features=false} # removes protobuf depen libc = "0.2" lazy_static = "1.4" once_cell = "1.8.0" +workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml index 8e7f5f233c..e8ad0e627f 100644 --- a/zenith_utils/Cargo.toml +++ b/zenith_utils/Cargo.toml @@ -30,7 +30,7 @@ git-version = "0.3.5" serde_with = "1.12.0" zenith_metrics = { path = "../zenith_metrics" } -workspace_hack = { path = "../workspace_hack" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] byteorder = "1.4.3" From 9594362f74c2ea66a495da8d50c3cb25de67d62c Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 28 Mar 2022 17:34:13 +0300 Subject: [PATCH 0084/1022] change python cache version to 2 (fixes python cache in circle CI) --- .circleci/config.yml | 8 ++++---- scripts/pysync | 8 +++++++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4a03cbf3b5..e96964558b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -228,12 +228,12 @@ jobs: - checkout - restore_cache: keys: - - v1-python-deps-{{ checksum "poetry.lock" }} + - v2-python-deps-{{ checksum "poetry.lock" }} - run: name: Install deps command: ./scripts/pysync - save_cache: - key: v1-python-deps-{{ checksum "poetry.lock" }} + key: v2-python-deps-{{ checksum "poetry.lock" }} paths: - /home/circleci/.cache/pypoetry/virtualenvs - run: @@ -287,12 +287,12 @@ jobs: - run: git submodule update --init --depth 1 - restore_cache: keys: - - v1-python-deps-{{ checksum "poetry.lock" }} + - v2-python-deps-{{ checksum "poetry.lock" }} - run: name: Install deps command: ./scripts/pysync - save_cache: - key: v1-python-deps-{{ checksum "poetry.lock" }} + key: v2-python-deps-{{ checksum "poetry.lock" }} paths: - /home/circleci/.cache/pypoetry/virtualenvs - run: diff --git a/scripts/pysync b/scripts/pysync index e548973dea..12fa08beca 100755 --- a/scripts/pysync +++ b/scripts/pysync @@ -4,4 +4,10 @@ # It is intended to be a primary endpoint for all the people who want to # just setup test environment without going into details of python package management -poetry install --no-root # this installs dev dependencies by default +poetry config --list + +if [ -z "${CI}" ]; then + poetry install --no-root --no-interaction --ansi +else + poetry install --no-root +fi From ec3bc741653d8c14f99a27c58ff74f4046ba7969 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 17 Mar 2022 15:14:16 +0300 Subject: [PATCH 0085/1022] Add safekeeper information exchange through etcd. Safekeers now publish to and pull from etcd per-timeline data. Immediate goal is WAL truncation, for which every safekeeper must know remote_consistent_lsn; the next would be callmemaybe replacement. Adds corresponding '--broker' argument to safekeeper and ability to run etcd in tests. 
Adds test checking remote_consistent_lsn is indeed communicated. --- Cargo.lock | 252 +++++++++++++++++- control_plane/src/local_env.rs | 4 + control_plane/src/safekeeper.rs | 6 + test_runner/README.md | 2 + test_runner/batch_others/test_wal_acceptor.py | 46 +++- test_runner/fixtures/utils.py | 6 + test_runner/fixtures/zenith_fixtures.py | 75 +++++- walkeeper/Cargo.toml | 3 + walkeeper/src/bin/safekeeper.rs | 27 +- walkeeper/src/broker.rs | 211 +++++++++++++++ walkeeper/src/handler.rs | 9 +- walkeeper/src/http/routes.rs | 17 +- walkeeper/src/json_ctrl.rs | 6 +- walkeeper/src/lib.rs | 4 + walkeeper/src/safekeeper.rs | 20 +- walkeeper/src/send_wal.rs | 2 +- walkeeper/src/timeline.rs | 76 +++++- 17 files changed, 726 insertions(+), 40 deletions(-) create mode 100644 walkeeper/src/broker.rs diff --git a/Cargo.lock b/Cargo.lock index 40f4358d98..c770f576c9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -75,6 +75,27 @@ dependencies = [ "zstd-safe", ] +[[package]] +name = "async-stream" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "171374e7e3b2504e0e5236e3b59260560f9fe94bfe9ac39ba5e4e929c5590625" +dependencies = [ + "async-stream-impl", + "futures-core", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "648ed8c8d2ce5409ccd57453d9d1b214b342a0d69376a6feda1fd6cae3299308" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "async-trait" version = "0.1.52" @@ -703,6 +724,21 @@ dependencies = [ "termcolor", ] +[[package]] +name = "etcd-client" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "585de5039d1ecce74773db49ba4e8107e42be7c2cd0b1a9e7fce27181db7b118" +dependencies = [ + "http", + "prost", + "tokio", + "tokio-stream", + "tonic", + "tonic-build", + "tower-service", +] + [[package]] name = "fail" version = "0.5.0" @@ -741,6 +777,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "fixedbitset" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "279fb028e20b3c4c320317955b77c5e0c9701f05a1d309905d6fc702cdc5053e" + [[package]] name = "fnv" version = "1.0.7" @@ -926,7 +968,7 @@ dependencies = [ "indexmap", "slab", "tokio", - "tokio-util", + "tokio-util 0.6.9", "tracing", ] @@ -954,6 +996,15 @@ dependencies = [ "ahash 0.7.6", ] +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "hermit-abi" version = "0.1.19" @@ -1075,6 +1126,18 @@ dependencies = [ "tokio-rustls 0.23.2", ] +[[package]] +name = "hyper-timeout" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" +dependencies = [ + "hyper", + "pin-project-lite", + "tokio", + "tokio-io-timeout", +] + [[package]] name = "ident_case" version = "1.0.1" @@ -1308,9 +1371,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.7.14" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8067b404fe97c70829f082dec8bcf4f71225d7eaea1d8645349cb76fa06205cc" +checksum = "ba272f85fa0b41fc91872be579b3bbe0f56b792aa361a380eb669469f68dafb2" dependencies = [ "libc", "log", @@ -1328,6 +1391,12 @@ dependencies = [ "winapi", ] +[[package]] +name = 
"multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" + [[package]] name = "nix" version = "0.23.1" @@ -1557,6 +1626,16 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" +[[package]] +name = "petgraph" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" +dependencies = [ + "fixedbitset", + "indexmap", +] + [[package]] name = "phf" version = "0.8.0" @@ -1776,6 +1855,59 @@ dependencies = [ "thiserror", ] +[[package]] +name = "prost" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "444879275cb4fd84958b1a1d5420d15e6fcf7c235fe47f053c9c2a80aceb6001" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62941722fb675d463659e49c4f3fe1fe792ff24fe5bbaa9c08cd3b98a1c354f5" +dependencies = [ + "bytes", + "heck", + "itertools", + "lazy_static", + "log", + "multimap", + "petgraph", + "prost", + "prost-types", + "regex", + "tempfile", + "which", +] + +[[package]] +name = "prost-derive" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9cc1a3263e07e0bf68e96268f37665207b49560d98739662cdfaae215c720fe" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534b7a0e836e3c482d2693070f982e39e7611da9695d4d1f5a4b186b51faef0a" +dependencies = [ + "bytes", + "prost", +] + [[package]] name = "proxy" version = "0.1.0" @@ -1979,7 +2111,7 @@ dependencies = [ "serde_urlencoded", "tokio", "tokio-rustls 0.23.2", - "tokio-util", + "tokio-util 0.6.9", "url", "wasm-bindgen", "wasm-bindgen-futures", @@ -2508,9 +2640,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.16.1" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c27a64b625de6d309e8c57716ba93021dccf1b3b5c97edd6d3dd2d2135afc0a" +checksum = "2af73ac49756f3f7c01172e34a23e5d0216f6c32333757c2c61feb2bbff5a5ee" dependencies = [ "bytes", "libc", @@ -2520,10 +2652,21 @@ dependencies = [ "once_cell", "pin-project-lite", "signal-hook-registry", + "socket2", "tokio-macros", "winapi", ] +[[package]] +name = "tokio-io-timeout" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" +dependencies = [ + "pin-project-lite", + "tokio", +] + [[package]] name = "tokio-macros" version = "1.7.0" @@ -2554,7 +2697,7 @@ dependencies = [ "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", "socket2", "tokio", - "tokio-util", + "tokio-util 0.6.9", ] [[package]] @@ -2576,7 +2719,7 @@ dependencies = [ "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", "socket2", "tokio", - "tokio-util", + "tokio-util 0.6.9", ] [[package]] @@ -2641,6 +2784,20 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-util" 
+version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64910e1b9c1901aaf5375561e35b9c057d95ff41a44ede043a03e09279eabaf1" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "log", + "pin-project-lite", + "tokio", +] + [[package]] name = "toml" version = "0.5.8" @@ -2663,6 +2820,75 @@ dependencies = [ "serde", ] +[[package]] +name = "tonic" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff08f4649d10a70ffa3522ca559031285d8e421d727ac85c60825761818f5d0a" +dependencies = [ + "async-stream", + "async-trait", + "base64 0.13.0", + "bytes", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost", + "prost-derive", + "tokio", + "tokio-stream", + "tokio-util 0.6.9", + "tower", + "tower-layer", + "tower-service", + "tracing", + "tracing-futures", +] + +[[package]] +name = "tonic-build" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9403f1bafde247186684b230dc6f38b5cd514584e8bec1dd32514be4745fa757" +dependencies = [ + "proc-macro2", + "prost-build", + "quote", + "syn", +] + +[[package]] +name = "tower" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a89fd63ad6adf737582df5db40d286574513c69a11dac5214dc3b5603d6713e" +dependencies = [ + "futures-core", + "futures-util", + "indexmap", + "pin-project", + "pin-project-lite", + "rand", + "slab", + "tokio", + "tokio-util 0.7.0", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "343bc9466d3fe6b0f960ef45960509f84480bf4fd96f92901afe7ff3df9d3a62" + [[package]] name = "tower-service" version = "0.3.1" @@ -2676,6 +2902,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d8d93354fe2a8e50d5953f5ae2e47a3fc2ef03292e7ea46e3cc38f549525fb9" dependencies = [ "cfg-if", + "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -2768,6 +2995,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" + [[package]] name = "unicode-width" version = "0.1.9" @@ -2838,6 +3071,7 @@ dependencies = [ "const_format", "crc32c", "daemonize", + "etcd-client", "fs2", "hex", "humantime", @@ -2850,11 +3084,13 @@ dependencies = [ "rust-s3", "serde", "serde_json", + "serde_with", "signal-hook", "tempfile", "tokio", "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", "tracing", + "url", "walkdir", "workspace_hack", "zenith_metrics", diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 00ace431e6..2bdc76e876 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -57,6 +57,10 @@ pub struct LocalEnv { #[serde(default)] pub private_key_path: PathBuf, + // A comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'. 
+ #[serde(default)] + pub broker_endpoints: Option, + pub pageserver: PageServerConf, #[serde(default)] diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 969e2cd531..89ab0a31ee 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -73,6 +73,8 @@ pub struct SafekeeperNode { pub http_base_url: String, pub pageserver: Arc, + + broker_endpoints: Option, } impl SafekeeperNode { @@ -89,6 +91,7 @@ impl SafekeeperNode { http_client: Client::new(), http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), pageserver, + broker_endpoints: env.broker_endpoints.clone(), } } @@ -135,6 +138,9 @@ impl SafekeeperNode { if !self.conf.sync { cmd.arg("--no-sync"); } + if let Some(ref ep) = self.broker_endpoints { + cmd.args(&["--broker-endpoints", ep]); + } if !cmd.status()?.success() { bail!( diff --git a/test_runner/README.md b/test_runner/README.md index a56c2df2c0..ee171ae6a0 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -10,6 +10,8 @@ Prerequisites: below to run from other directories. - The zenith git repo, including the postgres submodule (for some tests, e.g. `pg_regress`) +- Some tests (involving storage nodes coordination) require etcd installed. Follow + [`the guide`](https://etcd.io/docs/v3.5/install/) to obtain it. ### Test Organization diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 37ce1a8bca..bdc526a125 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -13,7 +13,7 @@ from dataclasses import dataclass, field from multiprocessing import Process, Value from pathlib import Path from fixtures.zenith_fixtures import PgBin, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol -from fixtures.utils import lsn_to_hex, mkdir_if_needed, lsn_from_hex +from fixtures.utils import etcd_path, lsn_to_hex, mkdir_if_needed, lsn_from_hex from fixtures.log_helper import log from typing import List, Optional, Any @@ -22,6 +22,7 @@ from typing import List, Optional, Any # succeed and data is written def test_normal_work(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 + zenith_env_builder.broker = True env = zenith_env_builder.init_start() env.zenith_cli.create_branch('test_wal_acceptors_normal_work') @@ -326,6 +327,49 @@ def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value): proc.join() +# Test that safekeepers push their info to the broker and learn peer status from it +@pytest.mark.skipif(etcd_path() is None, reason="requires etcd which is not present in PATH") +def test_broker(zenith_env_builder: ZenithEnvBuilder): + zenith_env_builder.num_safekeepers = 3 + zenith_env_builder.broker = True + zenith_env_builder.enable_local_fs_remote_storage() + env = zenith_env_builder.init_start() + + env.zenith_cli.create_branch("test_broker", "main") + pg = env.postgres.create_start('test_broker') + pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + + # learn zenith timeline from compute + tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] + timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + + # wait until remote_consistent_lsn gets advanced on all safekeepers + clients = [sk.http_client() for sk in env.safekeepers] + stat_before = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] + log.info(f"statuses is {stat_before}") + + pg.safe_psql("INSERT 
INTO t SELECT generate_series(1,100), 'payload'") + # force checkpoint to advance remote_consistent_lsn + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor() as pscur: + pscur.execute(f"checkpoint {tenant_id} {timeline_id}") + # and wait till remote_consistent_lsn propagates to all safekeepers + started_at = time.time() + while True: + stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] + if all( + lsn_from_hex(s_after.remote_consistent_lsn) > lsn_from_hex( + s_before.remote_consistent_lsn) for s_after, + s_before in zip(stat_after, stat_before)): + break + elapsed = time.time() - started_at + if elapsed > 20: + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s for remote_consistent_lsn propagation: status before {stat_before}, status current {stat_after}" + ) + time.sleep(0.5) + + class ProposerPostgres(PgProtocol): """Object for running postgres without ZenithEnv""" def __init__(self, diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 58f7294eb5..f16fe1d9cf 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,4 +1,5 @@ import os +import shutil import subprocess from typing import Any, List @@ -76,3 +77,8 @@ def print_gc_result(row): log.info( " total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}" .format_map(row)) + + +# path to etcd binary or None if not present. +def etcd_path(): + return shutil.which("etcd") diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 2da021a49c..a95809687a 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -33,7 +33,7 @@ from typing_extensions import Literal import requests import backoff # type: ignore -from .utils import (get_self_dir, lsn_from_hex, mkdir_if_needed, subprocess_capture) +from .utils import (etcd_path, get_self_dir, mkdir_if_needed, subprocess_capture, lsn_from_hex) from fixtures.log_helper import log """ This file contains pytest fixtures. 
A fixture is a test resource that can be @@ -433,7 +433,8 @@ class ZenithEnvBuilder: num_safekeepers: int = 0, pageserver_auth_enabled: bool = False, rust_log_override: Optional[str] = None, - default_branch_name=DEFAULT_BRANCH_NAME): + default_branch_name=DEFAULT_BRANCH_NAME, + broker: bool = False): self.repo_dir = repo_dir self.rust_log_override = rust_log_override self.port_distributor = port_distributor @@ -442,6 +443,7 @@ class ZenithEnvBuilder: self.num_safekeepers = num_safekeepers self.pageserver_auth_enabled = pageserver_auth_enabled self.default_branch_name = default_branch_name + self.broker = broker self.env: Optional[ZenithEnv] = None self.s3_mock_server: Optional[MockS3Server] = None @@ -517,6 +519,8 @@ class ZenithEnvBuilder: self.env.pageserver.stop(immediate=True) if self.s3_mock_server: self.s3_mock_server.kill() + if self.env.broker is not None: + self.env.broker.stop() class ZenithEnv: @@ -569,6 +573,16 @@ class ZenithEnv: default_tenant_id = '{self.initial_tenant.hex}' """) + self.broker = None + if config.broker: + # keep etcd datadir inside 'repo' + self.broker = Etcd(datadir=os.path.join(self.repo_dir, "etcd"), + port=self.port_distributor.get_port(), + peer_port=self.port_distributor.get_port()) + toml += textwrap.dedent(f""" + broker_endpoints = 'http://127.0.0.1:{self.broker.port}' + """) + # Create config for pageserver pageserver_port = PageserverPort( pg=self.port_distributor.get_port(), @@ -611,12 +625,15 @@ class ZenithEnv: self.zenith_cli.init(toml) def start(self): - # Start up the page server and all the safekeepers + # Start up the page server, all the safekeepers and the broker self.pageserver.start() for safekeeper in self.safekeepers: safekeeper.start() + if self.broker is not None: + self.broker.start() + def get_safekeeper_connstrs(self) -> str: """ Get list of safekeeper endpoints suitable for wal_acceptors GUC """ return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers]) @@ -1674,6 +1691,7 @@ class Safekeeper: class SafekeeperTimelineStatus: acceptor_epoch: int flush_lsn: str + remote_consistent_lsn: str @dataclass @@ -1697,7 +1715,8 @@ class SafekeeperHttpClient(requests.Session): res.raise_for_status() resj = res.json() return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'], - flush_lsn=resj['flush_lsn']) + flush_lsn=resj['flush_lsn'], + remote_consistent_lsn=resj['remote_consistent_lsn']) def get_metrics(self) -> SafekeeperMetrics: request_result = self.get(f"http://localhost:{self.port}/metrics") @@ -1718,6 +1737,54 @@ class SafekeeperHttpClient(requests.Session): return metrics +@dataclass +class Etcd: + """ An object managing etcd instance """ + datadir: str + port: int + peer_port: int + handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon + + def check_status(self): + s = requests.Session() + s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry + s.get(f"http://localhost:{self.port}/health").raise_for_status() + + def start(self): + pathlib.Path(self.datadir).mkdir(exist_ok=True) + etcd_full_path = etcd_path() + if etcd_full_path is None: + raise Exception('etcd not found') + + with open(os.path.join(self.datadir, "etcd.log"), "wb") as log_file: + args = [ + etcd_full_path, + f"--data-dir={self.datadir}", + f"--listen-client-urls=http://localhost:{self.port}", + f"--advertise-client-urls=http://localhost:{self.port}", + f"--listen-peer-urls=http://localhost:{self.peer_port}" + ] + self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file) + + 
# wait for start + started_at = time.time() + while True: + try: + self.check_status() + except Exception as e: + elapsed = time.time() - started_at + if elapsed > 5: + raise RuntimeError(f"timed out waiting {elapsed:.0f}s for etcd start: {e}") + time.sleep(0.5) + else: + break # success + + def stop(self): + if self.handle is not None: + self.handle.terminate() + self.handle.wait() + + def get_test_output_dir(request: Any) -> str: """ Compute the working directory for an individual test. """ test_name = request.node.name diff --git a/walkeeper/Cargo.toml b/walkeeper/Cargo.toml index f59c24816d..e8523d27d1 100644 --- a/walkeeper/Cargo.toml +++ b/walkeeper/Cargo.toml @@ -22,11 +22,14 @@ anyhow = "1.0" crc32c = "0.6.0" humantime = "2.1.0" walkdir = "2" +url = "2.2.2" signal-hook = "0.3.10" serde = { version = "1.0", features = ["derive"] } +serde_with = {version = "1.12.0"} hex = "0.4.3" const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } +etcd-client = "0.8.3" postgres_ffi = { path = "../postgres_ffi" } zenith_metrics = { path = "../zenith_metrics" } diff --git a/walkeeper/src/bin/safekeeper.rs b/walkeeper/src/bin/safekeeper.rs index 6c45115e5f..b3087a1004 100644 --- a/walkeeper/src/bin/safekeeper.rs +++ b/walkeeper/src/bin/safekeeper.rs @@ -11,18 +11,19 @@ use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; use std::thread; use tracing::*; +use url::{ParseError, Url}; use walkeeper::control_file::{self}; use zenith_utils::http::endpoint; use zenith_utils::zid::ZNodeId; use zenith_utils::{logging, tcp_listener, GIT_VERSION}; use tokio::sync::mpsc; -use walkeeper::callmemaybe; use walkeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR}; use walkeeper::http; use walkeeper::s3_offload; use walkeeper::wal_service; use walkeeper::SafeKeeperConf; +use walkeeper::{broker, callmemaybe}; use zenith_utils::shutdown::exit_now; use zenith_utils::signals; @@ -104,6 +105,11 @@ fn main() -> Result<()> { ) .arg( Arg::new("id").long("id").takes_value(true).help("safekeeper node id: integer") + ).arg( + Arg::new("broker-endpoints") + .long("broker-endpoints") + .takes_value(true) + .help("a comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 
'http://127.0.0.1:2379'"), ) .get_matches(); @@ -154,6 +160,11 @@ fn main() -> Result<()> { )); } + if let Some(addr) = arg_matches.value_of("broker-endpoints") { + let collected_ep: Result, ParseError> = addr.split(',').map(Url::parse).collect(); + conf.broker_endpoints = Some(collected_ep?); + } + start_safekeeper(conf, given_id, arg_matches.is_present("init")) } @@ -259,11 +270,12 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b threads.push(wal_acceptor_thread); + let conf_cloned = conf.clone(); let callmemaybe_thread = thread::Builder::new() .name("callmemaybe thread".into()) .spawn(|| { // thread code - let thread_result = callmemaybe::thread_main(conf, rx); + let thread_result = callmemaybe::thread_main(conf_cloned, rx); if let Err(e) = thread_result { error!("callmemaybe thread terminated: {}", e); } @@ -271,6 +283,17 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b .unwrap(); threads.push(callmemaybe_thread); + if conf.broker_endpoints.is_some() { + let conf_ = conf.clone(); + threads.push( + thread::Builder::new() + .name("broker thread".into()) + .spawn(|| { + broker::thread_main(conf_); + })?, + ); + } + // TODO: put more thoughts into handling of failed threads // We probably should restart them. diff --git a/walkeeper/src/broker.rs b/walkeeper/src/broker.rs new file mode 100644 index 0000000000..147497d673 --- /dev/null +++ b/walkeeper/src/broker.rs @@ -0,0 +1,211 @@ +//! Communication with etcd, providing safekeeper peers and pageserver coordination. + +use anyhow::bail; +use anyhow::Context; +use anyhow::Error; +use anyhow::Result; +use etcd_client::Client; +use etcd_client::EventType; +use etcd_client::PutOptions; +use etcd_client::WatchOptions; +use lazy_static::lazy_static; +use regex::Regex; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; +use std::str::FromStr; +use std::time::Duration; +use tokio::task::JoinHandle; +use tokio::{runtime, time::sleep}; +use tracing::*; +use zenith_utils::zid::ZTenantId; +use zenith_utils::zid::ZTimelineId; +use zenith_utils::{ + lsn::Lsn, + zid::{ZNodeId, ZTenantTimelineId}, +}; + +use crate::{safekeeper::Term, timeline::GlobalTimelines, SafeKeeperConf}; + +const RETRY_INTERVAL_MSEC: u64 = 1000; +const PUSH_INTERVAL_MSEC: u64 = 1000; +const LEASE_TTL_SEC: i64 = 5; +// TODO: add global zenith installation ID. +const ZENITH_PREFIX: &str = "zenith"; + +/// Published data about safekeeper. Fields made optional for easy migrations. +#[serde_as] +#[derive(Deserialize, Serialize)] +pub struct SafekeeperInfo { + /// Term of the last entry. + pub last_log_term: Option, + /// LSN of the last record. + #[serde_as(as = "Option")] + pub flush_lsn: Option, + /// Up to which LSN safekeeper regards its WAL as committed. + #[serde_as(as = "Option")] + pub commit_lsn: Option, + /// LSN up to which safekeeper offloaded WAL to s3. + #[serde_as(as = "Option")] + pub s3_wal_lsn: Option, + /// LSN of last checkpoint uploaded by pageserver. + #[serde_as(as = "Option")] + pub remote_consistent_lsn: Option, + #[serde_as(as = "Option")] + pub peer_horizon_lsn: Option, +} + +pub fn thread_main(conf: SafeKeeperConf) { + let runtime = runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + let _enter = info_span!("broker").entered(); + info!("started, broker endpoints {:?}", conf.broker_endpoints); + + runtime.block_on(async { + main_loop(conf).await; + }); +} + +/// Prefix to timeline related data. 
+fn timeline_path(zttid: &ZTenantTimelineId) -> String { + format!( + "{}/{}/{}", + ZENITH_PREFIX, zttid.tenant_id, zttid.timeline_id + ) +} + +/// Key to per timeline per safekeeper data. +fn timeline_safekeeper_path(zttid: &ZTenantTimelineId, sk_id: ZNodeId) -> String { + format!("{}/safekeeper/{}", timeline_path(zttid), sk_id) +} + +/// Push once in a while data about all active timelines to the broker. +async fn push_loop(conf: SafeKeeperConf) -> Result<()> { + let mut client = Client::connect(conf.broker_endpoints.as_ref().unwrap(), None).await?; + + // Get and maintain lease to automatically delete obsolete data + let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; + let (mut keeper, mut ka_stream) = client.lease_keep_alive(lease.id()).await?; + + let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); + loop { + // Note: we lock runtime here and in timeline methods as GlobalTimelines + // is under plain mutex. That's ok, all this code is not performance + // sensitive and there is no risk of deadlock as we don't await while + // lock is held. + let active_tlis = GlobalTimelines::get_active_timelines(); + for zttid in &active_tlis { + if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) { + let sk_info = tli.get_public_info(); + let put_opts = PutOptions::new().with_lease(lease.id()); + client + .put( + timeline_safekeeper_path(zttid, conf.my_id), + serde_json::to_string(&sk_info)?, + Some(put_opts), + ) + .await + .context("failed to push safekeeper info")?; + } + } + // revive the lease + keeper + .keep_alive() + .await + .context("failed to send LeaseKeepAliveRequest")?; + ka_stream + .message() + .await + .context("failed to receive LeaseKeepAliveResponse")?; + sleep(push_interval).await; + } +} + +/// Subscribe and fetch all the interesting data from the broker. +async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { + lazy_static! { + static ref TIMELINE_SAFEKEEPER_RE: Regex = + Regex::new(r"^zenith/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$") + .unwrap(); + } + let mut client = Client::connect(conf.broker_endpoints.as_ref().unwrap(), None).await?; + loop { + let wo = WatchOptions::new().with_prefix(); + // TODO: subscribe only to my timelines + let (_, mut stream) = client.watch(ZENITH_PREFIX, Some(wo)).await?; + while let Some(resp) = stream.message().await? { + if resp.canceled() { + bail!("watch canceled"); + } + + for event in resp.events() { + if EventType::Put == event.event_type() { + if let Some(kv) = event.kv() { + if let Some(caps) = TIMELINE_SAFEKEEPER_RE.captures(kv.key_str()?) { + let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let zttid = ZTenantTimelineId::new(tenant_id, timeline_id); + let safekeeper_id = ZNodeId(caps.get(3).unwrap().as_str().parse()?); + let value_str = kv.value_str()?; + match serde_json::from_str::(value_str) { + Ok(safekeeper_info) => { + if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { + tli.record_safekeeper_info(&safekeeper_info, safekeeper_id)? + } + } + Err(err) => warn!( + "failed to deserialize safekeeper info {}: {}", + value_str, err + ), + } + } + } + } + } + } + } +} + +async fn main_loop(conf: SafeKeeperConf) { + let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC)); + let mut push_handle: Option>> = None; + let mut pull_handle: Option>> = None; + // Selecting on JoinHandles requires some squats; is there a better way to + // reap tasks individually? 
+ + // Handling failures in task itself won't catch panic and in Tokio, task's + // panic doesn't kill the whole executor, so it is better to do reaping + // here. + loop { + tokio::select! { + res = async { push_handle.as_mut().unwrap().await }, if push_handle.is_some() => { + // was it panic or normal error? + let err = match res { + Ok(res_internal) => res_internal.unwrap_err(), + Err(err_outer) => err_outer.into(), + }; + warn!("push task failed: {:?}", err); + push_handle = None; + }, + res = async { pull_handle.as_mut().unwrap().await }, if pull_handle.is_some() => { + // was it panic or normal error? + let err = match res { + Ok(res_internal) => res_internal.unwrap_err(), + Err(err_outer) => err_outer.into(), + }; + warn!("pull task failed: {:?}", err); + pull_handle = None; + }, + _ = ticker.tick() => { + if push_handle.is_none() { + push_handle = Some(tokio::spawn(push_loop(conf.clone()))); + } + if pull_handle.is_none() { + pull_handle = Some(tokio::spawn(pull_loop(conf.clone()))); + } + } + } + } +} diff --git a/walkeeper/src/handler.rs b/walkeeper/src/handler.rs index ead6fab9fb..00d177da56 100644 --- a/walkeeper/src/handler.rs +++ b/walkeeper/src/handler.rs @@ -168,7 +168,14 @@ impl SafekeeperPostgresHandler { fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> { let start_pos = self.timeline.get().get_end_of_wal(); let lsn = start_pos.to_string(); - let sysid = self.timeline.get().get_info().server.system_id.to_string(); + let sysid = self + .timeline + .get() + .get_state() + .1 + .server + .system_id + .to_string(); let lsn_bytes = lsn.as_bytes(); let tli = PG_TLI.to_string(); let tli_bytes = tli.as_bytes(); diff --git a/walkeeper/src/http/routes.rs b/walkeeper/src/http/routes.rs index 74f7f4a735..06a0682c37 100644 --- a/walkeeper/src/http/routes.rs +++ b/walkeeper/src/http/routes.rs @@ -86,23 +86,24 @@ async fn timeline_status_handler(request: Request) -> Result Result<()> { fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: Lsn) -> Result<()> { // add new term to existing history - let history = spg.timeline.get().get_info().acceptor_state.term_history; + let history = spg.timeline.get().get_state().1.acceptor_state.term_history; let history = history.up_to(lsn.checked_sub(1u64).unwrap()); let mut history_entries = history.0; history_entries.push(TermSwitchEntry { term, lsn }); @@ -142,7 +142,7 @@ fn append_logical_message( msg: &AppendLogicalMessage, ) -> Result { let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); - let sk_state = spg.timeline.get().get_info(); + let sk_state = spg.timeline.get().get_state().1; let begin_lsn = msg.begin_lsn; let end_lsn = begin_lsn + wal_data.len() as u64; diff --git a/walkeeper/src/lib.rs b/walkeeper/src/lib.rs index dfd71e4de2..69423d42d8 100644 --- a/walkeeper/src/lib.rs +++ b/walkeeper/src/lib.rs @@ -1,9 +1,11 @@ // use std::path::PathBuf; use std::time::Duration; +use url::Url; use zenith_utils::zid::{ZNodeId, ZTenantTimelineId}; +pub mod broker; pub mod callmemaybe; pub mod control_file; pub mod control_file_upgrade; @@ -47,6 +49,7 @@ pub struct SafeKeeperConf { pub ttl: Option, pub recall_period: Duration, pub my_id: ZNodeId, + pub broker_endpoints: Option>, } impl SafeKeeperConf { @@ -71,6 +74,7 @@ impl Default for SafeKeeperConf { ttl: None, recall_period: defaults::DEFAULT_RECALL_PERIOD, my_id: ZNodeId(0), + broker_endpoints: None, } } } diff --git a/walkeeper/src/safekeeper.rs b/walkeeper/src/safekeeper.rs index 8300b32b42..307a67e5f3 100644 --- 
a/walkeeper/src/safekeeper.rs +++ b/walkeeper/src/safekeeper.rs @@ -193,7 +193,7 @@ pub struct SafeKeeperState { pub peer_horizon_lsn: Lsn, /// LSN of the oldest known checkpoint made by pageserver and successfully /// pushed to s3. We don't remove WAL beyond it. Persisted only for - /// informational purposes, we receive it from pageserver. + /// informational purposes, we receive it from pageserver (or broker). pub remote_consistent_lsn: Lsn, // Peers and their state as we remember it. Knowing peers themselves is // fundamental; but state is saved here only for informational purposes and @@ -203,11 +203,13 @@ pub struct SafeKeeperState { } #[derive(Debug, Clone)] -// In memory safekeeper state. Fields mirror ones in `SafeKeeperState`; they are -// not flushed yet. +// In memory safekeeper state. Fields mirror ones in `SafeKeeperState`; values +// are not flushed yet. pub struct SafekeeperMemState { pub commit_lsn: Lsn, + pub s3_wal_lsn: Lsn, // TODO: keep only persistent version pub peer_horizon_lsn: Lsn, + pub remote_consistent_lsn: Lsn, } impl SafeKeeperState { @@ -494,14 +496,13 @@ pub struct SafeKeeper { metrics: SafeKeeperMetrics, /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn. - global_commit_lsn: Lsn, + pub global_commit_lsn: Lsn, /// LSN since the proposer safekeeper currently talking to appends WAL; /// determines epoch switch point. epoch_start_lsn: Lsn, pub inmem: SafekeeperMemState, // in memory part - - pub s: SafeKeeperState, // persistent part + pub s: SafeKeeperState, // persistent part pub control_store: CTRL, pub wal_store: WAL, @@ -529,7 +530,9 @@ where epoch_start_lsn: Lsn(0), inmem: SafekeeperMemState { commit_lsn: state.commit_lsn, + s3_wal_lsn: state.s3_wal_lsn, peer_horizon_lsn: state.peer_horizon_lsn, + remote_consistent_lsn: state.remote_consistent_lsn, }, s: state, control_store, @@ -545,8 +548,7 @@ where .up_to(self.wal_store.flush_lsn()) } - #[cfg(test)] - fn get_epoch(&self) -> Term { + pub fn get_epoch(&self) -> Term { self.s.acceptor_state.get_epoch(self.wal_store.flush_lsn()) } @@ -697,7 +699,7 @@ where } /// Advance commit_lsn taking into account what we have locally - fn update_commit_lsn(&mut self) -> Result<()> { + pub fn update_commit_lsn(&mut self) -> Result<()> { let commit_lsn = min(self.global_commit_lsn, self.wal_store.flush_lsn()); assert!(commit_lsn >= self.inmem.commit_lsn); diff --git a/walkeeper/src/send_wal.rs b/walkeeper/src/send_wal.rs index 1febd71842..f12fb5cb4a 100644 --- a/walkeeper/src/send_wal.rs +++ b/walkeeper/src/send_wal.rs @@ -230,7 +230,7 @@ impl ReplicationConn { let mut wal_seg_size: usize; loop { - wal_seg_size = spg.timeline.get().get_info().server.wal_seg_size as usize; + wal_seg_size = spg.timeline.get().get_state().1.server.wal_seg_size as usize; if wal_seg_size == 0 { error!("Cannot start replication before connecting to wal_proposer"); sleep(Duration::from_secs(1)); diff --git a/walkeeper/src/timeline.rs b/walkeeper/src/timeline.rs index b53f2e086b..b10ab97cc1 100644 --- a/walkeeper/src/timeline.rs +++ b/walkeeper/src/timeline.rs @@ -17,12 +17,14 @@ use tracing::*; use zenith_utils::lsn::Lsn; use zenith_utils::zid::{ZNodeId, ZTenantTimelineId}; +use crate::broker::SafekeeperInfo; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; use crate::control_file; use crate::control_file::Storage as cf_storage; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, + SafekeeperMemState, }; use crate::send_wal::HotStandbyFeedback; use 
crate::wal_storage; @@ -349,6 +351,11 @@ impl Timeline { Ok(false) } + fn is_active(&self) -> bool { + let shared_state = self.mutex.lock().unwrap(); + shared_state.active + } + /// Timed wait for an LSN to be committed. /// /// Returns the last committed LSN, which will be at least @@ -410,8 +417,61 @@ impl Timeline { Ok(rmsg) } - pub fn get_info(&self) -> SafeKeeperState { - self.mutex.lock().unwrap().sk.s.clone() + pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { + let shared_state = self.mutex.lock().unwrap(); + (shared_state.sk.inmem.clone(), shared_state.sk.s.clone()) + } + + /// Prepare public safekeeper info for reporting. + pub fn get_public_info(&self) -> SafekeeperInfo { + let shared_state = self.mutex.lock().unwrap(); + SafekeeperInfo { + last_log_term: Some(shared_state.sk.get_epoch()), + flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()), + // note: this value is not flushed to control file yet and can be lost + commit_lsn: Some(shared_state.sk.inmem.commit_lsn), + s3_wal_lsn: Some(shared_state.sk.inmem.s3_wal_lsn), + // TODO: rework feedbacks to avoid max here + remote_consistent_lsn: Some(max( + shared_state.get_replicas_state().remote_consistent_lsn, + shared_state.sk.inmem.remote_consistent_lsn, + )), + peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn), + } + } + + /// Update timeline state with peer safekeeper data. + pub fn record_safekeeper_info(&self, sk_info: &SafekeeperInfo, _sk_id: ZNodeId) -> Result<()> { + let mut shared_state = self.mutex.lock().unwrap(); + // Note: the check is too restrictive, generally we can update local + // commit_lsn if our history matches (is part of) history of advanced + // commit_lsn provider. + if let (Some(commit_lsn), Some(last_log_term)) = (sk_info.commit_lsn, sk_info.last_log_term) + { + if last_log_term == shared_state.sk.get_epoch() { + shared_state.sk.global_commit_lsn = + max(commit_lsn, shared_state.sk.global_commit_lsn); + shared_state.sk.update_commit_lsn()?; + let local_commit_lsn = min(commit_lsn, shared_state.sk.wal_store.flush_lsn()); + shared_state.sk.inmem.commit_lsn = + max(local_commit_lsn, shared_state.sk.inmem.commit_lsn); + } + } + if let Some(s3_wal_lsn) = sk_info.s3_wal_lsn { + shared_state.sk.inmem.s3_wal_lsn = max(s3_wal_lsn, shared_state.sk.inmem.s3_wal_lsn); + } + if let Some(remote_consistent_lsn) = sk_info.remote_consistent_lsn { + shared_state.sk.inmem.remote_consistent_lsn = max( + remote_consistent_lsn, + shared_state.sk.inmem.remote_consistent_lsn, + ); + } + if let Some(peer_horizon_lsn) = sk_info.peer_horizon_lsn { + shared_state.sk.inmem.peer_horizon_lsn = + max(peer_horizon_lsn, shared_state.sk.inmem.peer_horizon_lsn); + } + // TODO: sync control file + Ok(()) } pub fn add_replica(&self, state: ReplicaState) -> usize { @@ -495,7 +555,7 @@ impl GlobalTimelines { } /// Get a timeline with control file loaded from the global TIMELINES map. - /// If control file doesn't exist, bails out. + /// If control file doesn't exist and create=false, bails out. pub fn get( conf: &SafeKeeperConf, zttid: ZTenantTimelineId, @@ -537,4 +597,14 @@ impl GlobalTimelines { } } } + + /// Get ZTenantTimelineIDs of all active timelines. 
+ pub fn get_active_timelines() -> Vec { + let timelines = TIMELINES.lock().unwrap(); + timelines + .iter() + .filter(|&(_, tli)| tli.is_active()) + .map(|(zttid, _)| *zttid) + .collect() + } } From ce0243bc12db72dba8b196dbee71af2434d28ead Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 29 Mar 2022 18:54:24 +0300 Subject: [PATCH 0086/1022] Add metric for last_record_lsn (#1430) --- pageserver/src/layered_repository.rs | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 56d14fd4e9..33f5694879 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -48,7 +48,9 @@ use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::{ZTenantId, ZTimelineId}; -use zenith_metrics::{register_histogram_vec, Histogram, HistogramVec}; +use zenith_metrics::{ + register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec, IntGauge, IntGaugeVec, +}; use zenith_utils::crashsafe_dir; use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; use zenith_utils::seqwait::SeqWait; @@ -95,6 +97,15 @@ lazy_static! { .expect("failed to define a metric"); } +lazy_static! { + static ref LAST_RECORD_LSN: IntGaugeVec = register_int_gauge_vec!( + "pageserver_last_record_lsn", + "Last record LSN grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + /// Parts of the `.zenith/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; @@ -745,11 +756,12 @@ pub struct LayeredTimeline { ancestor_timeline: Option, ancestor_lsn: Lsn, - // Metrics histograms + // Metrics reconstruct_time_histo: Histogram, flush_time_histo: Histogram, compact_time_histo: Histogram, create_images_time_histo: Histogram, + last_record_gauge: IntGauge, /// If `true`, will backup its files that appear after each checkpointing to the remote storage. 
upload_layers: AtomicBool, @@ -982,6 +994,9 @@ impl LayeredTimeline { &timelineid.to_string(), ]) .unwrap(); + let last_record_gauge = LAST_RECORD_LSN + .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .unwrap(); LayeredTimeline { conf, @@ -1007,6 +1022,7 @@ impl LayeredTimeline { flush_time_histo, compact_time_histo, create_images_time_histo, + last_record_gauge, upload_layers: AtomicBool::new(upload_layers), @@ -1325,6 +1341,7 @@ impl LayeredTimeline { fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); + self.last_record_gauge.set(new_lsn.0 as i64); self.last_record_lsn.advance(new_lsn); } From 277e41f4b73d91bfb96383eab1f42c4e5f7a0ad9 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 29 Mar 2022 13:48:26 +0300 Subject: [PATCH 0087/1022] Show s3 spans in logs and improve the log messages --- pageserver/src/remote_storage/storage_sync.rs | 8 ++++---- zenith_utils/src/http/endpoint.rs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index cd6c40b46f..50a260491b 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -321,8 +321,8 @@ pub fn schedule_timeline_checkpoint_upload( tenant_id, timeline_id ) } else { - warn!( - "Could not send an upload task for tenant {}, timeline {}: the sync queue is not initialized", + debug!( + "Upload task for tenant {}, timeline {} sent", tenant_id, timeline_id ) } @@ -455,7 +455,7 @@ fn storage_sync_loop< max_concurrent_sync, max_sync_errors, ) - .instrument(debug_span!("storage_sync_loop_step")) => step, + .instrument(info_span!("storage_sync_loop_step")) => step, _ = thread_mgr::shutdown_watcher() => LoopStep::Shutdown, } }); @@ -528,7 +528,7 @@ async fn loop_step< let extra_step = match tokio::spawn( process_task(conf, Arc::clone(&remote_assets), task, max_sync_errors).instrument( - debug_span!("process_sync_task", sync_id = %sync_id, attempt, sync_name), + info_span!("process_sync_task", sync_id = %sync_id, attempt, sync_name), ), ) .await diff --git a/zenith_utils/src/http/endpoint.rs b/zenith_utils/src/http/endpoint.rs index 0be08f45e1..7669f18cd2 100644 --- a/zenith_utils/src/http/endpoint.rs +++ b/zenith_utils/src/http/endpoint.rs @@ -160,7 +160,7 @@ pub fn serve_thread_main( where S: Future + Send + Sync, { - info!("Starting a http endpoint at {}", listener.local_addr()?); + info!("Starting an HTTP endpoint at {}", listener.local_addr()?); // Create a Service from the router above to handle incoming requests. let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap(); From 5c5629910f33bead0150821217c115db5ece5495 Mon Sep 17 00:00:00 2001 From: Anton Shyrabokau <97127717+antons-antons@users.noreply.github.com> Date: Tue, 29 Mar 2022 22:13:06 -0700 Subject: [PATCH 0088/1022] Add a test case for reading historic page versions (#1314) * Add a test case for reading historic page versions Test read_page_at_lsn returns correct results when compared to page inspect. Validate possiblity of reading pages from dropped relation. Ensure funcitons read latest version when null lsn supplied. Check that functions do not poison buffer cache with stale page versions. 
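A note on the `pageserver_last_record_lsn` gauge introduced in PATCH 0086 above: the pattern is to register a labelled gauge vector once per process, resolve the per-timeline child gauge when the timeline object is constructed, and update it on every `finish_write`. A minimal, self-contained sketch of that pattern, using the `lazy_static` and `prometheus` crates directly; the repository wraps them in `zenith_metrics`, so the struct and helper names below are illustrative only:

```rust
use lazy_static::lazy_static;
use prometheus::{register_int_gauge_vec, IntGauge, IntGaugeVec};

lazy_static! {
    // Registered once per process; one child gauge per (tenant_id, timeline_id).
    static ref LAST_RECORD_LSN: IntGaugeVec = register_int_gauge_vec!(
        "pageserver_last_record_lsn",
        "Last record LSN grouped by timeline",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric");
}

// Illustrative stand-in for the per-timeline state that owns the gauge.
struct TimelineMetrics {
    last_record_gauge: IntGauge,
}

impl TimelineMetrics {
    fn new(tenant_id: &str, timeline_id: &str) -> Self {
        // Resolve the labelled child once, so the write path avoids a label lookup.
        let last_record_gauge = LAST_RECORD_LSN.with_label_values(&[tenant_id, timeline_id]);
        TimelineMetrics { last_record_gauge }
    }

    fn finish_write(&self, new_lsn: u64) {
        // LSNs are u64 internally; Prometheus gauges hold i64, hence the cast.
        self.last_record_gauge.set(new_lsn as i64);
    }
}

fn main() {
    let metrics = TimelineMetrics::new("tenant-a", "timeline-1");
    metrics.finish_write(0x0169_B188);
}
```

Caching the child gauge at construction time keeps the per-WAL-record cost to a single atomic store.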
--- Makefile | 5 + .../batch_others/test_read_validation.py | 183 ++++++++++++++++++ vendor/postgres | 2 +- 3 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 test_runner/batch_others/test_read_validation.py diff --git a/Makefile b/Makefile index ef26ceee2d..d2a79661f2 100644 --- a/Makefile +++ b/Makefile @@ -78,6 +78,11 @@ postgres: postgres-configure \ $(MAKE) -C tmp_install/build/contrib/zenith install +@echo "Compiling contrib/zenith_test_utils" $(MAKE) -C tmp_install/build/contrib/zenith_test_utils install + +@echo "Compiling pg_buffercache" + $(MAKE) -C tmp_install/build/contrib/pg_buffercache install + +@echo "Compiling pageinspect" + $(MAKE) -C tmp_install/build/contrib/pageinspect install + .PHONY: postgres-clean postgres-clean: diff --git a/test_runner/batch_others/test_read_validation.py b/test_runner/batch_others/test_read_validation.py new file mode 100644 index 0000000000..ee41e6511c --- /dev/null +++ b/test_runner/batch_others/test_read_validation.py @@ -0,0 +1,183 @@ +from contextlib import closing + +from fixtures.zenith_fixtures import ZenithEnv +from fixtures.log_helper import log + +from psycopg2.errors import UndefinedTable +from psycopg2.errors import IoError + +pytest_plugins = ("fixtures.zenith_fixtures") + +extensions = ["pageinspect", "zenith_test_utils", "pg_buffercache"] + + +# +# Validation of reading different page versions +# +def test_read_validation(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + env.zenith_cli.create_branch("test_read_validation", "empty") + + pg = env.postgres.create_start("test_read_validation") + log.info("postgres is running on 'test_read_validation' branch") + + with closing(pg.connect()) as con: + with con.cursor() as c: + + for e in extensions: + c.execute("create extension if not exists {};".format(e)) + + c.execute("create table foo (c int) with (autovacuum_enabled = false)") + c.execute("insert into foo values (1)") + + c.execute("select lsn, lower, upper from page_header(get_raw_page('foo', 'main', 0));") + first = c.fetchone() + + c.execute("select relfilenode from pg_class where relname = 'foo'") + relfilenode = c.fetchone()[0] + + c.execute("insert into foo values (2);") + c.execute("select lsn, lower, upper from page_header(get_raw_page('foo', 'main', 0));") + second = c.fetchone() + + assert first != second, "Failed to update page" + + log.info("Test table is populated, validating buffer cache") + + c.execute( + "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) + assert c.fetchone()[0] > 0, "No buffers cached for the test relation" + + c.execute( + "select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}" + .format(relfilenode)) + reln = c.fetchone() + + log.info("Clear buffer cache to ensure no stale pages are brought into the cache") + + c.execute("select clear_buffer_cache()") + + c.execute( + "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) + assert c.fetchone()[0] == 0, "Failed to clear buffer cache" + + log.info("Cache is clear, reading stale page version") + + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{}'))" + .format(first[0])) + direct_first = c.fetchone() + assert first == direct_first, "Failed fetch page at historic lsn" + + c.execute( + "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) + assert c.fetchone()[0] == 0, "relation buffers detected after invalidation" + + log.info("Cache is clear, reading 
latest page version without cache") + + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL))" + ) + direct_latest = c.fetchone() + assert second == direct_latest, "Failed fetch page at latest lsn" + + c.execute( + "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) + assert c.fetchone()[0] == 0, "relation buffers detected after invalidation" + + log.info( + "Cache is clear, reading stale page version without cache using relation identifiers" + ) + + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))" + .format(reln[0], reln[1], reln[2], first[0])) + direct_first = c.fetchone() + assert first == direct_first, "Failed fetch page at historic lsn using oid" + + log.info( + "Cache is clear, reading latest page version without cache using relation identifiers" + ) + + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, NULL ))" + .format(reln[0], reln[1], reln[2])) + direct_latest = c.fetchone() + assert second == direct_latest, "Failed fetch page at latest lsn" + + c.execute('drop table foo;') + + log.info( + "Relation dropped, attempting reading stale page version without cache using relation identifiers" + ) + + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))" + .format(reln[0], reln[1], reln[2], first[0])) + direct_first = c.fetchone() + assert first == direct_first, "Failed fetch page at historic lsn using oid" + + log.info("Validation page inspect won't allow reading pages of dropped relations") + try: + c.execute("select * from page_header(get_raw_page('foo', 'main', 0));") + assert False, "query should have failed" + except UndefinedTable as e: + log.info("Caught an expected failure: {}".format(e)) + + +def test_read_validation_neg(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + env.zenith_cli.create_branch("test_read_validation_neg", "empty") + + pg = env.postgres.create_start("test_read_validation_neg") + log.info("postgres is running on 'test_read_validation_neg' branch") + + with closing(pg.connect()) as con: + with con.cursor() as c: + + for e in extensions: + c.execute("create extension if not exists {};".format(e)) + + log.info("read a page of a missing relation") + try: + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0'))" + ) + assert False, "query should have failed" + except UndefinedTable as e: + log.info("Caught an expected failure: {}".format(e)) + + c.execute("create table foo (c int) with (autovacuum_enabled = false)") + c.execute("insert into foo values (1)") + + log.info("read a page at lsn 0") + try: + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0'))" + ) + assert False, "query should have failed" + except IoError as e: + log.info("Caught an expected failure: {}".format(e)) + + log.info("Pass NULL as an input") + expected = (None, None, None) + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0'))" + ) + assert c.fetchone() == expected, "Expected null output" + + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0'))" + ) + assert c.fetchone() == expected, "Expected null output" + + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0'))" + ) + assert c.fetchone() == expected, "Expected 
null output" + + # This check is currently failing, reading beyond EOF is returning a 0-page + log.info("Read beyond EOF") + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL))" + ) diff --git a/vendor/postgres b/vendor/postgres index 19164aeacf..5c278ed0ac 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 19164aeacfd877ef75d67e70a71647f5d4c0cd2f +Subproject commit 5c278ed0aca5dea9340d9af4ad5f004d905ff1b7 From 860923420468a3882b71929f2dbe59673484ddca Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 29 Mar 2022 22:44:33 +0300 Subject: [PATCH 0089/1022] decrease the log level to debug because it is too noisy --- pageserver/src/layered_repository.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 33f5694879..202a2ea756 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1645,11 +1645,8 @@ impl LayeredTimeline { }; let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; - if num_deltas == 0 { - continue; - } - info!( + debug!( "range {}-{}, has {} deltas on this timeline", img_range.start, img_range.end, num_deltas ); From 649f324fe3b7dc5ff8b95cfaabf584753d53af16 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 30 Mar 2022 13:46:18 +0300 Subject: [PATCH 0090/1022] make logging in basebackup more consistent --- pageserver/src/basebackup.rs | 1 + pageserver/src/page_service.rs | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index e2a56f17d6..3caf27b9b3 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -65,6 +65,7 @@ impl<'a> Basebackup<'a> { // prev_lsn to Lsn(0) if we cannot provide the correct value. let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { // Backup was requested at a particular LSN. Wait for it to arrive. 
+ info!("waiting for {}", req_lsn); timeline.tline.wait_lsn(req_lsn)?; // If the requested point is the end of the timeline, we can diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 43e1ec275d..e7a4117b3e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -514,6 +514,7 @@ impl PageServerHandler { ) -> anyhow::Result<()> { let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty); let _enter = span.enter(); + info!("starting"); // check that the timeline exists let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid) @@ -536,7 +537,7 @@ impl PageServerHandler { basebackup.send_tarball()?; } pgb.write_message(&BeMessage::CopyDone)?; - debug!("CopyDone sent!"); + info!("done"); Ok(()) } From 1aa8fe43cf9b769ec728b126a6a5c20b6f9d388f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 31 Mar 2022 15:47:59 +0300 Subject: [PATCH 0091/1022] Fix race condition in image layer (#1440) * Fix race condition in image layer refer #1439 * Add explicit drop(inner) in layer load method * Add explicit drop(inner) in layer load method --- pageserver/src/layered_repository/image_layer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index ab51c36cae..ed9be913b9 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -267,7 +267,7 @@ impl ImageLayer { // a write lock. (Or rather, release and re-lock in write mode.) drop(inner); let mut inner = self.inner.write().unwrap(); - if inner.book.is_none() { + if !inner.loaded { self.load_inner(&mut inner)?; } else { // Another thread loaded it while we were not holding the lock. 
From a40b7cd516672a58d63de8015d848cd40ce33f08 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 31 Mar 2022 17:00:09 +0300 Subject: [PATCH 0092/1022] Fix timeouts in test_restarts_under_load (#1436) * Enable backpressure in test_restarts_under_load * Remove hacks because #644 is fixed now * Adjust config in test_restarts_under_load --- .../batch_others/test_wal_acceptor_async.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index 31ace7eab3..aadafc76cf 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -1,9 +1,10 @@ import asyncio +import uuid import asyncpg import random import time -from fixtures.zenith_fixtures import ZenithEnvBuilder, Postgres, Safekeeper +from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres, Safekeeper from fixtures.log_helper import getLogger from fixtures.utils import lsn_from_hex, lsn_to_hex from typing import List @@ -30,10 +31,6 @@ class BankClient(object): await self.conn.execute('DROP TABLE IF EXISTS bank_log') await self.conn.execute('CREATE TABLE bank_log(from_uid int, to_uid int, amount int)') - # TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed - await self.conn.execute('ALTER TABLE bank_accs SET (autovacuum_enabled = false)') - await self.conn.execute('ALTER TABLE bank_log SET (autovacuum_enabled = false)') - async def check_invariant(self): row = await self.conn.fetchrow('SELECT sum(amount) AS sum FROM bank_accs') assert row['sum'] == self.n_accounts * self.init_amount @@ -139,12 +136,15 @@ async def wait_for_lsn(safekeeper: Safekeeper, # On each iteration 1 acceptor is stopped, and 2 others should allow # background workers execute transactions. In the end, state should remain # consistent. -async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_workers=10): +async def run_restarts_under_load(env: ZenithEnv, + pg: Postgres, + acceptors: List[Safekeeper], + n_workers=10): n_accounts = 100 init_amount = 100000 max_transfer = 100 - period_time = 10 - iterations = 6 + period_time = 4 + iterations = 10 # Set timeout for this test at 5 minutes. 
It should be enough for test to complete # and less than CircleCI's no_output_timeout, taking into account that this timeout @@ -176,6 +176,11 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_w flush_lsn = lsn_to_hex(flush_lsn) log.info(f'Postgres flush_lsn {flush_lsn}') + pageserver_lsn = env.pageserver.http_client().timeline_detail( + uuid.UUID(tenant_id), uuid.UUID((timeline_id)))["local"]["last_record_lsn"] + sk_ps_lag = lsn_from_hex(flush_lsn) - lsn_from_hex(pageserver_lsn) + log.info(f'Pageserver last_record_lsn={pageserver_lsn} lag={sk_ps_lag / 1024}kb') + # Wait until alive safekeepers catch up with postgres for idx, safekeeper in enumerate(acceptors): if idx != victim_idx: @@ -203,9 +208,8 @@ def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder): env = zenith_env_builder.init_start() env.zenith_cli.create_branch('test_wal_acceptors_restarts_under_load') - pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load') + # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long + pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load', + config_lines=['max_replication_write_lag=1MB']) - asyncio.run(run_restarts_under_load(pg, env.safekeepers)) - - # TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed - pg.stop() + asyncio.run(run_restarts_under_load(env, pg, env.safekeepers)) From 8745b022a985f6b758f9bddb9aae8038608df677 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 31 Mar 2022 12:29:13 +0300 Subject: [PATCH 0093/1022] Extend LayerMap dump() function to print also open_layers and frozen_layers. Add verbose option to chose if we need to print all layer's keys or not. --- pageserver/src/bin/dump_layerfile.rs | 2 +- pageserver/src/layered_repository.rs | 8 ++++---- pageserver/src/layered_repository/delta_layer.rs | 6 +++++- pageserver/src/layered_repository/image_layer.rs | 6 +++++- .../src/layered_repository/inmemory_layer.rs | 6 +++++- pageserver/src/layered_repository/layer_map.rs | 16 ++++++++++++++-- .../src/layered_repository/storage_layer.rs | 2 +- 7 files changed, 35 insertions(+), 11 deletions(-) diff --git a/pageserver/src/bin/dump_layerfile.rs b/pageserver/src/bin/dump_layerfile.rs index b954ad5a15..27d41d50d9 100644 --- a/pageserver/src/bin/dump_layerfile.rs +++ b/pageserver/src/bin/dump_layerfile.rs @@ -25,7 +25,7 @@ fn main() -> Result<()> { // Basic initialization of things that don't change after startup virtual_file::init(10); - dump_layerfile_from_path(&path)?; + dump_layerfile_from_path(&path, true)?; Ok(()) } diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 202a2ea756..4a9d1c480d 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -2066,16 +2066,16 @@ impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { } /// Dump contents of a layer file to stdout. 
-pub fn dump_layerfile_from_path(path: &Path) -> Result<()> { +pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { let file = File::open(path)?; let book = Book::new(file)?; match book.magic() { crate::DELTA_FILE_MAGIC => { - DeltaLayer::new_for_path(path, &book)?.dump()?; + DeltaLayer::new_for_path(path, &book)?.dump(verbose)?; } crate::IMAGE_FILE_MAGIC => { - ImageLayer::new_for_path(path, &book)?.dump()?; + ImageLayer::new_for_path(path, &book)?.dump(verbose)?; } magic => bail!("unrecognized magic identifier: {:?}", magic), } @@ -2216,7 +2216,7 @@ pub mod tests { let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); let mut blknum = 0; for _ in 0..50 { - for _ in 0..1000 { + for _ in 0..10000 { test_key.field6 = blknum; let writer = tline.writer(); writer.put( diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index bb5fa02be1..0e59eb7a3c 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -267,7 +267,7 @@ impl Layer for DeltaLayer { } /// debugging function to print out the contents of the layer - fn dump(&self) -> Result<()> { + fn dump(&self, verbose: bool) -> Result<()> { println!( "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----", self.tenantid, @@ -278,6 +278,10 @@ impl Layer for DeltaLayer { self.lsn_range.end ); + if !verbose { + return Ok(()); + } + let inner = self.load()?; let path = self.path(); diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index ed9be913b9..2b9bf4a717 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -212,12 +212,16 @@ impl Layer for ImageLayer { } /// debugging function to print out the contents of the layer - fn dump(&self) -> Result<()> { + fn dump(&self, verbose: bool) -> Result<()> { println!( "----- image layer for ten {} tli {} key {}-{} at {} ----", self.tenantid, self.timelineid, self.key_range.start, self.key_range.end, self.lsn ); + if !verbose { + return Ok(()); + } + let inner = self.load()?; let mut index_vec: Vec<(&Key, &BlobRef)> = inner.index.iter().collect(); diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index b5d98a4ca3..8670442a2c 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -190,7 +190,7 @@ impl Layer for InMemoryLayer { } /// debugging function to print out the contents of the layer - fn dump(&self) -> Result<()> { + fn dump(&self, verbose: bool) -> Result<()> { let inner = self.inner.read().unwrap(); let end_str = inner @@ -204,6 +204,10 @@ impl Layer for InMemoryLayer { self.timelineid, self.start_lsn, end_str, ); + if !verbose { + return Ok(()); + } + let mut buf = Vec::new(); for (key, vec_map) in inner.index.iter() { for (lsn, blob_ref) in vec_map.as_slice() { diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index c4929a6173..b6a3bd82aa 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -392,10 +392,22 @@ impl LayerMap { /// debugging function to print out the contents of the layer map #[allow(unused)] - pub fn dump(&self) -> Result<()> { + pub fn dump(&self, verbose: bool) -> Result<()> { println!("Begin dump LayerMap"); + + 
println!("open_layer:"); + if let Some(open_layer) = &self.open_layer { + open_layer.dump(verbose)?; + } + + println!("frozen_layers:"); + for frozen_layer in self.frozen_layers.iter() { + frozen_layer.dump(verbose)?; + } + + println!("historic_layers:"); for layer in self.historic_layers.iter() { - layer.dump()?; + layer.dump(verbose)?; } println!("End dump LayerMap"); Ok(()) diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index de34545980..dcf5b63908 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -143,7 +143,7 @@ pub trait Layer: Send + Sync { fn delete(&self) -> Result<()>; /// Dump summary of the contents of the layer to stdout - fn dump(&self) -> Result<()>; + fn dump(&self, verbose: bool) -> Result<()>; } // Flag indicating that this version initialize the page From f5da6523882e2be24a5e4252be7c5f963fbc4c7c Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Thu, 31 Mar 2022 20:44:57 +0300 Subject: [PATCH 0094/1022] [proxy] Enable keepalives for all tcp connections (#1448) --- Cargo.lock | 16 ++++++++++++---- compute_tools/Cargo.toml | 2 +- pageserver/Cargo.toml | 2 +- proxy/Cargo.toml | 3 ++- proxy/src/compute.rs | 1 + proxy/src/proxy.rs | 24 ++++++++++++++++++++++++ walkeeper/Cargo.toml | 2 +- zenith_utils/Cargo.toml | 2 +- 8 files changed, 43 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c770f576c9..bb27df7012 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -916,7 +916,7 @@ checksum = "418d37c8b1d42553c93648be529cb70f920d3baf8ef469b74b9638df426e0b4c" dependencies = [ "cfg-if", "libc", - "wasi", + "wasi 0.10.0+wasi-snapshot-preview1", ] [[package]] @@ -1371,14 +1371,15 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba272f85fa0b41fc91872be579b3bbe0f56b792aa361a380eb669469f68dafb2" +checksum = "52da4364ffb0e4fe33a9841a98a3f3014fb964045ce4f7a45a398243c8d6b0c9" dependencies = [ "libc", "log", "miow", "ntapi", + "wasi 0.11.0+wasi-snapshot-preview1", "winapi", ] @@ -1931,6 +1932,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "socket2", "thiserror", "tokio", "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", @@ -2609,7 +2611,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" dependencies = [ "libc", - "wasi", + "wasi 0.10.0+wasi-snapshot-preview1", "winapi", ] @@ -3113,6 +3115,12 @@ version = "0.10.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + [[package]] name = "wasm-bindgen" version = "0.2.79" diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 4ecf7f6499..56047093f1 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -16,5 +16,5 @@ regex = "1" serde = { version = "1.0", features = ["derive"] } serde_json = "1" tar = "0.4" -tokio = { version = "1", features = ["macros", "rt", "rt-multi-thread"] } +tokio = { version = "1.17", features = ["macros", "rt", 
"rt-multi-thread"] } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 14eae31da8..6a77af1691 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -17,7 +17,7 @@ lazy_static = "1.4.0" log = "0.4.14" clap = "3.0" daemonize = "0.4.1" -tokio = { version = "1.11", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } +tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 72c394dad4..dc20695884 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -22,8 +22,9 @@ rustls = "0.19.1" scopeguard = "1.1.0" serde = "1" serde_json = "1" +socket2 = "0.4.4" thiserror = "1.0" -tokio = { version = "1.11", features = ["macros"] } +tokio = { version = "1.17", features = ["macros"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } tokio-rustls = "0.22.0" diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 64ce5d0a5a..7c0ab965a0 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -41,6 +41,7 @@ impl DatabaseInfo { let host_port = format!("{}:{}", self.host, self.port); let socket = TcpStream::connect(host_port).await?; let socket_addr = socket.peer_addr()?; + socket2::SockRef::from(&socket).set_keepalive(true)?; Ok((socket_addr, socket)) } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 3c7f59bc26..81581b5cf1 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -50,6 +50,10 @@ pub async fn thread_main( println!("proxy has shut down"); } + // When set for the server socket, the keepalive setting + // will be inherited by all accepted client sockets. 
+ socket2::SockRef::from(&listener).set_keepalive(true)?; + let cancel_map = Arc::new(CancelMap::default()); loop { let (socket, peer_addr) = listener.accept().await?; @@ -367,4 +371,24 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn keepalive_is_inherited() -> anyhow::Result<()> { + use tokio::net::{TcpListener, TcpStream}; + + let listener = TcpListener::bind("127.0.0.1:0").await?; + let port = listener.local_addr()?.port(); + socket2::SockRef::from(&listener).set_keepalive(true)?; + + let t = tokio::spawn(async move { + let (client, _) = listener.accept().await?; + let keepalive = socket2::SockRef::from(&client).keepalive()?; + anyhow::Ok(keepalive) + }); + + let _ = TcpStream::connect(("127.0.0.1", port)).await?; + assert!(t.await??, "keepalive should be inherited"); + + Ok(()) + } } diff --git a/walkeeper/Cargo.toml b/walkeeper/Cargo.toml index e8523d27d1..ddce78e737 100644 --- a/walkeeper/Cargo.toml +++ b/walkeeper/Cargo.toml @@ -15,7 +15,7 @@ tracing = "0.1.27" clap = "3.0" daemonize = "0.4.1" rust-s3 = { version = "0.28", default-features = false, features = ["no-verify-ssl", "tokio-rustls-tls"] } -tokio = { version = "1.11", features = ["macros"] } +tokio = { version = "1.17", features = ["macros"] } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } anyhow = "1.0" diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml index e8ad0e627f..cf864b3a54 100644 --- a/zenith_utils/Cargo.toml +++ b/zenith_utils/Cargo.toml @@ -16,7 +16,7 @@ routerify = "3" serde = { version = "1.0", features = ["derive"] } serde_json = "1" thiserror = "1.0" -tokio = { version = "1.11", features = ["macros"]} +tokio = { version = "1.17", features = ["macros"]} tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } nix = "0.23.0" From af712798e75589a5186fe3c78fa683b901fe2566 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Fri, 1 Apr 2022 15:47:23 -0400 Subject: [PATCH 0095/1022] Fix pageserver readme formatting I put the diagram in a fixed-width block, since it wasn't rendering correctly on github. --- pageserver/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/README.md b/pageserver/README.md index 69080a16cc..1fd627785c 100644 --- a/pageserver/README.md +++ b/pageserver/README.md @@ -13,7 +13,7 @@ keeps track of WAL records which are not synced to S3 yet. 
The Page Server consists of multiple threads that operate on a shared repository of page versions: - +``` | WAL V +--------------+ @@ -46,7 +46,7 @@ Legend: ---> Data flow <--- - +``` Page Service ------------ From 43c16c514556bb0ccbeb3b0458f46d39866005aa Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 1 Apr 2022 20:48:03 +0300 Subject: [PATCH 0096/1022] Don't log ZIds in the timeline load span --- pageserver/src/layered_repository.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 4a9d1c480d..a352f31169 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -468,18 +468,20 @@ impl LayeredRepository { match timelines.get(&timelineid) { Some(entry) => match entry { LayeredTimelineEntry::Loaded(local_timeline) => { - trace!("timeline {} found loaded", &timelineid); + debug!("timeline {} found loaded into memory", &timelineid); return Ok(Some(Arc::clone(local_timeline))); } - LayeredTimelineEntry::Unloaded { .. } => { - trace!("timeline {} found unloaded", &timelineid) - } + LayeredTimelineEntry::Unloaded { .. } => {} }, None => { - trace!("timeline {} not found", &timelineid); + debug!("timeline {} not found", &timelineid); return Ok(None); } }; + debug!( + "timeline {} found on a local disk, but not loaded into the memory, loading", + &timelineid + ); let timeline = self.load_local_timeline(timelineid, timelines)?; let was_loaded = timelines.insert( timelineid, @@ -516,9 +518,7 @@ impl LayeredRepository { .context("cannot load ancestor timeline")? .flatten() .map(LayeredTimelineEntry::Loaded); - let _enter = - info_span!("loading timeline", timeline = %timelineid, tenant = %self.tenantid) - .entered(); + let _enter = info_span!("loading local timeline").entered(); let timeline = LayeredTimeline::new( self.conf, metadata, From 9e5423c86724cdd90cefd81791214870138b6983 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 1 Apr 2022 21:46:54 +0300 Subject: [PATCH 0097/1022] Assert in a more informative way --- postgres_ffi/src/xlog_utils.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/postgres_ffi/src/xlog_utils.rs b/postgres_ffi/src/xlog_utils.rs index d2b2b5c122..89fdbbf7ac 100644 --- a/postgres_ffi/src/xlog_utils.rs +++ b/postgres_ffi/src/xlog_utils.rs @@ -495,7 +495,13 @@ mod tests { .env("DYLD_LIBRARY_PATH", &lib_path) .output() .unwrap(); - assert!(initdb_output.status.success()); + assert!( + initdb_output.status.success(), + "initdb failed. Status: '{}', stdout: '{}', stderr: '{}'", + initdb_output.status, + String::from_utf8_lossy(&initdb_output.stdout), + String::from_utf8_lossy(&initdb_output.stderr), + ); // 2. 
Pick WAL generated by initdb let wal_dir = data_dir.join("pg_wal"); From 4c9447589a837266fb943cc0f32124191891cd9a Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 1 Apr 2022 23:23:13 +0300 Subject: [PATCH 0098/1022] Place an info span into gc loop step --- pageserver/src/layered_repository.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index a352f31169..f07a2639d3 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -630,6 +630,8 @@ impl LayeredRepository { horizon: u64, checkpoint_before_gc: bool, ) -> Result { + let _span_guard = + info_span!("gc iteration", tenant = %self.tenantid, timeline = ?target_timelineid); let mut totals: GcResult = Default::default(); let now = Instant::now(); From 1f0b406b633aa624f89d1632affabd03ab622171 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 31 Mar 2022 16:28:07 +0300 Subject: [PATCH 0099/1022] Perform repartitioning in compaction thread refer #1441 --- pageserver/src/layered_repository.rs | 5 +++++ pageserver/src/pgdatadir_mapping.rs | 21 +++++++++++---------- pageserver/src/timelines.rs | 2 +- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index f07a2639d3..a63f157552 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -41,6 +41,7 @@ use crate::repository::{ GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, TimelineWriter, }; use crate::repository::{Key, Value}; +use crate::tenant_mgr; use crate::thread_mgr; use crate::virtual_file::VirtualFile; use crate::walreceiver::IS_WAL_RECEIVER; @@ -1588,6 +1589,10 @@ impl LayeredTimeline { let target_file_size = self.conf.checkpoint_distance; + // Define partitioning schema if needed + tenant_mgr::get_timeline_for_tenant_load(self.tenantid, self.timelineid)? + .repartition(self.get_last_record_lsn())?; + // 1. The partitioning was already done by the code in // pgdatadir_mapping.rs. We just use it here. 
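One detail of PATCH 0098 above is worth spelling out: a `tracing` span only wraps the events that follow it once it has been entered. Binding `info_span!(..)` to a variable creates the span but does not activate it, while the `info_span!(..).entered()` form used elsewhere in this series (for example in the timeline-loading code of PATCH 0096) keeps an entered guard alive until the end of the scope. A small sketch of the difference, assuming `tracing` 0.1 with a `tracing-subscriber` formatter installed:

```rust
use tracing::{info, info_span};

fn gc_iteration(tenant: &str) {
    // Entered guard: every event below carries the span and its fields
    // until `_span_guard` is dropped at the end of the scope.
    let _span_guard = info_span!("gc iteration", tenant = %tenant).entered();
    info!("starting gc"); // emitted inside the "gc iteration" span

    // A span that is created but never entered does not wrap anything:
    let _not_entered = info_span!("unused span");
    info!("still emitted inside the gc iteration span only");
}

fn main() {
    // Install a plain formatter so the spans show up on stdout.
    tracing_subscriber::fmt().init();
    gc_iteration("tenant-a");
}
```

For async code the series instead attaches spans with `.instrument(info_span!(..))`, as in the storage-sync loop touched by PATCH 0087.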
let partitioning_guard = self.partitioning.read().unwrap(); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7b0fc606de..75ace4ecee 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -388,6 +388,17 @@ impl DatadirTimeline { Ok(result.to_keyspace()) } + + pub fn repartition(&self, lsn: Lsn) -> Result<()> { + let last_partitioning = self.last_partitioning.load(); + if last_partitioning == Lsn(0) || lsn.0 - last_partitioning.0 > self.repartition_threshold { + let keyspace = self.collect_keyspace(lsn)?; + let partitioning = keyspace.partition(TARGET_FILE_SIZE_BYTES); + self.tline.hint_partitioning(partitioning, lsn)?; + self.last_partitioning.store(lsn); + } + Ok(()) + } } /// DatadirModification represents an operation to ingest an atomic set of @@ -767,7 +778,6 @@ impl<'a, R: Repository> DatadirModification<'a, R> { pub fn commit(self) -> Result<()> { let writer = self.tline.tline.writer(); - let last_partitioning = self.tline.last_partitioning.load(); let pending_nblocks = self.pending_nblocks; for (key, value) in self.pending_updates { @@ -779,15 +789,6 @@ impl<'a, R: Repository> DatadirModification<'a, R> { writer.finish_write(self.lsn); - if last_partitioning == Lsn(0) - || self.lsn.0 - last_partitioning.0 > self.tline.repartition_threshold - { - let keyspace = self.tline.collect_keyspace(self.lsn)?; - let partitioning = keyspace.partition(TARGET_FILE_SIZE_BYTES); - self.tline.tline.hint_partitioning(partitioning, self.lsn)?; - self.tline.last_partitioning.store(self.lsn); - } - if pending_nblocks != 0 { self.tline.current_logical_size.fetch_add( pending_nblocks * pg_constants::BLCKSZ as isize, diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 105c3c869f..ae713c260c 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -286,7 +286,7 @@ fn bootstrap_timeline( let timeline = repo.create_empty_timeline(tli, lsn)?; let mut page_tline: DatadirTimeline = DatadirTimeline::new(timeline, u64::MAX); import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?; - page_tline.tline.checkpoint(CheckpointConfig::Forced)?; + page_tline.tline.checkpoint(CheckpointConfig::Flush)?; println!( "created initial timeline {} timeline.lsn {}", From 92031d376af9c8d80e77ee33afdb9b7868281f9c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 31 Mar 2022 16:44:01 +0300 Subject: [PATCH 0100/1022] Fix unit tests --- pageserver/src/layered_repository.rs | 6 ++++-- pageserver/src/timelines.rs | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index a63f157552..eb4f49ddd1 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1590,8 +1590,10 @@ impl LayeredTimeline { let target_file_size = self.conf.checkpoint_distance; // Define partitioning schema if needed - tenant_mgr::get_timeline_for_tenant_load(self.tenantid, self.timelineid)? - .repartition(self.get_last_record_lsn())?; + if let Ok(pgdir) = tenant_mgr::get_timeline_for_tenant_load(self.tenantid, self.timelineid) + { + pgdir.repartition(self.get_last_record_lsn())?; + } // 1. The partitioning was already done by the code in // pgdatadir_mapping.rs. We just use it here. 
diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index ae713c260c..105c3c869f 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -286,7 +286,7 @@ fn bootstrap_timeline( let timeline = repo.create_empty_timeline(tli, lsn)?; let mut page_tline: DatadirTimeline = DatadirTimeline::new(timeline, u64::MAX); import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?; - page_tline.tline.checkpoint(CheckpointConfig::Flush)?; + page_tline.tline.checkpoint(CheckpointConfig::Forced)?; println!( "created initial timeline {} timeline.lsn {}", From 232fe14297c6f12b6ad83b723ab6dcba09febc5e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 31 Mar 2022 20:23:56 +0300 Subject: [PATCH 0101/1022] Refactor partitioning --- pageserver/src/layered_repository.rs | 29 +++------------------------- pageserver/src/pgdatadir_mapping.rs | 25 +++++++++++++----------- pageserver/src/repository.rs | 14 -------------- 3 files changed, 17 insertions(+), 51 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index eb4f49ddd1..5ab6097960 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -34,7 +34,7 @@ use std::time::Instant; use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; use crate::config::PageServerConf; -use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::keyspace::KeySpace; use crate::page_cache; use crate::remote_storage::{schedule_timeline_checkpoint_upload, RemoteIndex}; use crate::repository::{ @@ -792,8 +792,6 @@ pub struct LayeredTimeline { // garbage collecting data that is still needed by the child timelines. gc_info: RwLock, - partitioning: RwLock>, - // It may change across major versions so for simplicity // keep it after running initdb for a timeline. // It is needed in checks when we want to error on some operations @@ -943,14 +941,6 @@ impl Timeline for LayeredTimeline { self.disk_consistent_lsn.load() } - fn hint_partitioning(&self, partitioning: KeyPartitioning, lsn: Lsn) -> Result<()> { - self.partitioning - .write() - .unwrap() - .replace((partitioning, lsn)); - Ok(()) - } - fn writer<'a>(&'a self) -> Box { Box::new(LayeredTimelineWriter { tl: self, @@ -1037,7 +1027,6 @@ impl LayeredTimeline { retain_lsns: Vec::new(), cutoff: Lsn(0), }), - partitioning: RwLock::new(None), latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -1592,23 +1581,11 @@ impl LayeredTimeline { // Define partitioning schema if needed if let Ok(pgdir) = tenant_mgr::get_timeline_for_tenant_load(self.tenantid, self.timelineid) { - pgdir.repartition(self.get_last_record_lsn())?; - } - - // 1. The partitioning was already done by the code in - // pgdatadir_mapping.rs. We just use it here. - let partitioning_guard = self.partitioning.read().unwrap(); - if let Some((partitioning, lsn)) = partitioning_guard.as_ref() { + let (partitioning, lsn) = pgdir.repartition(self.get_last_record_lsn())?; let timer = self.create_images_time_histo.start_timer(); - // Make a copy of the partitioning, so that we can release - // the lock. Otherwise we could block the WAL receiver. - let lsn = *lsn; - let parts = partitioning.parts.clone(); - drop(partitioning_guard); - // 2. Create new image layers for partitions that have been modified // "enough". - for part in parts.iter() { + for part in partitioning.parts.iter() { if self.time_for_new_image_layer(part, lsn, 3)? 
{ self.create_image_layer(part, lsn)?; } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 75ace4ecee..fbd1b56180 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,7 +6,7 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! -use crate::keyspace::{KeySpace, KeySpaceAccum, TARGET_FILE_SIZE_BYTES}; +use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceAccum, TARGET_FILE_SIZE_BYTES}; use crate::reltag::{RelTag, SlruKind}; use crate::repository::*; use crate::repository::{Repository, Timeline}; @@ -18,10 +18,9 @@ use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::ops::Range; use std::sync::atomic::{AtomicIsize, Ordering}; -use std::sync::{Arc, RwLockReadGuard}; +use std::sync::{Arc, RwLock, RwLockReadGuard}; use tracing::{debug, error, trace, warn}; use zenith_utils::bin_ser::BeSer; -use zenith_utils::lsn::AtomicLsn; use zenith_utils::lsn::Lsn; /// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type. @@ -38,7 +37,7 @@ where pub tline: Arc, /// When did we last calculate the partitioning? - last_partitioning: AtomicLsn, + partitioning: RwLock<(KeyPartitioning, Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, @@ -51,7 +50,7 @@ impl DatadirTimeline { pub fn new(tline: Arc, repartition_threshold: u64) -> Self { DatadirTimeline { tline, - last_partitioning: AtomicLsn::new(0), + partitioning: RwLock::new((KeyPartitioning::new(), Lsn(0))), current_logical_size: AtomicIsize::new(0), repartition_threshold, } @@ -389,15 +388,19 @@ impl DatadirTimeline { Ok(result.to_keyspace()) } - pub fn repartition(&self, lsn: Lsn) -> Result<()> { - let last_partitioning = self.last_partitioning.load(); - if last_partitioning == Lsn(0) || lsn.0 - last_partitioning.0 > self.repartition_threshold { + pub fn repartition(&self, lsn: Lsn) -> Result<(KeyPartitioning, Lsn)> { + let partitioning_guard = self.partitioning.read().unwrap(); + if partitioning_guard.1 == Lsn(0) + || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold + { let keyspace = self.collect_keyspace(lsn)?; + drop(partitioning_guard); + let mut partitioning_guard = self.partitioning.write().unwrap(); let partitioning = keyspace.partition(TARGET_FILE_SIZE_BYTES); - self.tline.hint_partitioning(partitioning, lsn)?; - self.last_partitioning.store(lsn); + *partitioning_guard = (partitioning, lsn); + return Ok((partitioning_guard.0.clone(), lsn)); } - Ok(()) + Ok((partitioning_guard.0.clone(), partitioning_guard.1)) } } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index b960e037be..7e998b0ebe 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,4 +1,3 @@ -use crate::keyspace::KeyPartitioning; use crate::layered_repository::metadata::TimelineMetadata; use crate::remote_storage::RemoteIndex; use crate::walrecord::ZenithWalRecord; @@ -372,19 +371,6 @@ pub trait Timeline: Send + Sync { /// know anything about them here in the repository. fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>; - /// - /// Tell the implementation how the keyspace should be partitioned. - /// - /// FIXME: This is quite a hack. The code in pgdatadir_mapping.rs knows - /// which keys exist and what is the logical grouping of them. 
That's why - /// the code there (and in keyspace.rs) decides the partitioning, not the - /// layered_repository.rs implementation. That's a layering violation: - /// the Repository implementation ought to be responsible for the physical - /// layout, but currently it's more convenient to do it in pgdatadir_mapping.rs - /// rather than in layered_repository.rs. - /// - fn hint_partitioning(&self, partitioning: KeyPartitioning, lsn: Lsn) -> Result<()>; - /// /// Check that it is valid to request operations with that lsn. fn check_lsn_is_in_scope( From bef9b837f1171b9040dc959189796d835c1f8f9c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 1 Apr 2022 12:09:35 +0300 Subject: [PATCH 0102/1022] Replace rwlock with mutex in repartition --- pageserver/src/layered_repository.rs | 12 ------------ pageserver/src/pgdatadir_mapping.rs | 10 ++++------ 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 5ab6097960..60b0e921ce 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -2220,12 +2220,6 @@ pub mod tests { } let cutoff = tline.get_last_record_lsn(); - let parts = keyspace - .clone() - .to_keyspace() - .partition(TEST_FILE_SIZE as u64); - tline.hint_partitioning(parts.clone(), lsn)?; - tline.update_gc_info(Vec::new(), cutoff); tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; @@ -2268,9 +2262,6 @@ pub mod tests { keyspace.add_key(test_key); } - let parts = keyspace.to_keyspace().partition(TEST_FILE_SIZE as u64); - tline.hint_partitioning(parts, lsn)?; - for _ in 0..50 { for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); @@ -2342,9 +2333,6 @@ pub mod tests { keyspace.add_key(test_key); } - let parts = keyspace.to_keyspace().partition(TEST_FILE_SIZE as u64); - tline.hint_partitioning(parts, lsn)?; - let mut tline_id = TIMELINE_ID; for _ in 0..50 { let new_tline_id = ZTimelineId::generate(); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index fbd1b56180..2e0040f0c0 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -18,7 +18,7 @@ use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::ops::Range; use std::sync::atomic::{AtomicIsize, Ordering}; -use std::sync::{Arc, RwLock, RwLockReadGuard}; +use std::sync::{Arc, Mutex, RwLockReadGuard}; use tracing::{debug, error, trace, warn}; use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::Lsn; @@ -37,7 +37,7 @@ where pub tline: Arc, /// When did we last calculate the partitioning? - partitioning: RwLock<(KeyPartitioning, Lsn)>, + partitioning: Mutex<(KeyPartitioning, Lsn)>, /// Configuration: how often should the partitioning be recalculated. 
repartition_threshold: u64, @@ -50,7 +50,7 @@ impl DatadirTimeline { pub fn new(tline: Arc, repartition_threshold: u64) -> Self { DatadirTimeline { tline, - partitioning: RwLock::new((KeyPartitioning::new(), Lsn(0))), + partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), current_logical_size: AtomicIsize::new(0), repartition_threshold, } @@ -389,13 +389,11 @@ impl DatadirTimeline { } pub fn repartition(&self, lsn: Lsn) -> Result<(KeyPartitioning, Lsn)> { - let partitioning_guard = self.partitioning.read().unwrap(); + let mut partitioning_guard = self.partitioning.lock().unwrap(); if partitioning_guard.1 == Lsn(0) || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold { let keyspace = self.collect_keyspace(lsn)?; - drop(partitioning_guard); - let mut partitioning_guard = self.partitioning.write().unwrap(); let partitioning = keyspace.partition(TARGET_FILE_SIZE_BYTES); *partitioning_guard = (partitioning, lsn); return Ok((partitioning_guard.0.clone(), lsn)); From 572b3f48cf1fb1217efc8067fde2597f38dfa447 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 1 Apr 2022 19:40:39 +0300 Subject: [PATCH 0103/1022] Add compaction_target_size parameter --- pageserver/src/config.rs | 27 +++++++++++++++++++++++++++ pageserver/src/keyspace.rs | 3 --- pageserver/src/layered_repository.rs | 3 ++- pageserver/src/pgdatadir_mapping.rs | 8 ++++---- 4 files changed, 33 insertions(+), 8 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 9f7cd34a7a..0d5cac8b4f 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -30,8 +30,13 @@ pub mod defaults { // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB // would be more appropriate. But a low value forces the code to be exercised more, // which is good for now to trigger bugs. + // This parameter actually determines L0 layer file size. pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. + pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; + pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s"; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; @@ -58,6 +63,7 @@ pub mod defaults { #listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}' #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes +#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes #compaction_period = '{DEFAULT_COMPACTION_PERIOD}' #gc_period = '{DEFAULT_GC_PERIOD}' @@ -91,8 +97,13 @@ pub struct PageServerConf { // Flush out an inmemory layer, if it's holding WAL older than this // This puts a backstop on how much WAL needs to be re-digested if the // page server crashes. + // This parameter actually determines L0 layer file size. pub checkpoint_distance: u64, + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. + pub compaction_target_size: u64, + // How often to check if there's compaction work to be done. 
pub compaction_period: Duration, @@ -149,6 +160,7 @@ struct PageServerConfigBuilder { checkpoint_distance: BuilderValue, + compaction_target_size: BuilderValue, compaction_period: BuilderValue, gc_horizon: BuilderValue, @@ -183,6 +195,7 @@ impl Default for PageServerConfigBuilder { listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()), listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()), checkpoint_distance: Set(DEFAULT_CHECKPOINT_DISTANCE), + compaction_target_size: Set(DEFAULT_COMPACTION_TARGET_SIZE), compaction_period: Set(humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period")), gc_horizon: Set(DEFAULT_GC_HORIZON), @@ -220,6 +233,10 @@ impl PageServerConfigBuilder { self.checkpoint_distance = BuilderValue::Set(checkpoint_distance) } + pub fn compaction_target_size(&mut self, compaction_target_size: u64) { + self.compaction_target_size = BuilderValue::Set(compaction_target_size) + } + pub fn compaction_period(&mut self, compaction_period: Duration) { self.compaction_period = BuilderValue::Set(compaction_period) } @@ -290,6 +307,9 @@ impl PageServerConfigBuilder { checkpoint_distance: self .checkpoint_distance .ok_or(anyhow::anyhow!("missing checkpoint_distance"))?, + compaction_target_size: self + .compaction_target_size + .ok_or(anyhow::anyhow!("missing compaction_target_size"))?, compaction_period: self .compaction_period .ok_or(anyhow::anyhow!("missing compaction_period"))?, @@ -429,6 +449,9 @@ impl PageServerConf { "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?), "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?), "checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?), + "compaction_target_size" => { + builder.compaction_target_size(parse_toml_u64(key, item)?) 
+ } "compaction_period" => builder.compaction_period(parse_toml_duration(key, item)?), "gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?), "gc_period" => builder.gc_period(parse_toml_duration(key, item)?), @@ -565,6 +588,7 @@ impl PageServerConf { PageServerConf { id: ZNodeId(0), checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, + compaction_target_size: 4 * 1024 * 1024, compaction_period: Duration::from_secs(10), gc_horizon: defaults::DEFAULT_GC_HORIZON, gc_period: Duration::from_secs(10), @@ -636,6 +660,7 @@ listen_http_addr = '127.0.0.1:9898' checkpoint_distance = 111 # in bytes +compaction_target_size = 111 # in bytes compaction_period = '111 s' gc_period = '222 s' @@ -673,6 +698,7 @@ id = 10 listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, + compaction_target_size: defaults::DEFAULT_COMPACTION_TARGET_SIZE, compaction_period: humantime::parse_duration(defaults::DEFAULT_COMPACTION_PERIOD)?, gc_horizon: defaults::DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?, @@ -717,6 +743,7 @@ id = 10 listen_pg_addr: "127.0.0.1:64000".to_string(), listen_http_addr: "127.0.0.1:9898".to_string(), checkpoint_distance: 111, + compaction_target_size: 111, compaction_period: Duration::from_secs(111), gc_horizon: 222, gc_period: Duration::from_secs(222), diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs index 9973568b07..f6f0d7b7cf 100644 --- a/pageserver/src/keyspace.rs +++ b/pageserver/src/keyspace.rs @@ -2,9 +2,6 @@ use crate::repository::{key_range_size, singleton_range, Key}; use postgres_ffi::pg_constants; use std::ops::Range; -// Target file size, when creating image and delta layers -pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024; // 128 MB - /// /// Represents a set of Keys, in a compact form. /// diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 60b0e921ce..2d9b680624 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1581,7 +1581,8 @@ impl LayeredTimeline { // Define partitioning schema if needed if let Ok(pgdir) = tenant_mgr::get_timeline_for_tenant_load(self.tenantid, self.timelineid) { - let (partitioning, lsn) = pgdir.repartition(self.get_last_record_lsn())?; + let (partitioning, lsn) = + pgdir.repartition(self.get_last_record_lsn(), self.conf.compaction_target_size)?; let timer = self.create_images_time_histo.start_timer(); // 2. Create new image layers for partitions that have been modified // "enough". diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 2e0040f0c0..af12084766 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,7 +6,7 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! 
-use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceAccum, TARGET_FILE_SIZE_BYTES}; +use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceAccum}; use crate::reltag::{RelTag, SlruKind}; use crate::repository::*; use crate::repository::{Repository, Timeline}; @@ -388,13 +388,13 @@ impl DatadirTimeline { Ok(result.to_keyspace()) } - pub fn repartition(&self, lsn: Lsn) -> Result<(KeyPartitioning, Lsn)> { + pub fn repartition(&self, lsn: Lsn, partition_size: u64) -> Result<(KeyPartitioning, Lsn)> { let mut partitioning_guard = self.partitioning.lock().unwrap(); if partitioning_guard.1 == Lsn(0) || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold { let keyspace = self.collect_keyspace(lsn)?; - let partitioning = keyspace.partition(TARGET_FILE_SIZE_BYTES); + let partitioning = keyspace.partition(partition_size); *partitioning_guard = (partitioning, lsn); return Ok((partitioning_guard.0.clone(), lsn)); } @@ -1215,7 +1215,7 @@ pub fn create_test_timeline( timeline_id: zenith_utils::zid::ZTimelineId, ) -> Result>> { let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; - let tline = DatadirTimeline::new(tline, crate::layered_repository::tests::TEST_FILE_SIZE / 10); + let tline = DatadirTimeline::new(tline, tline.conf.compaction_target_size / 10); let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; From fcf613b6e3e5d4fefa1d53daeb677ccf7c64b5f8 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 1 Apr 2022 19:57:51 +0300 Subject: [PATCH 0104/1022] Fix unit tests build --- pageserver/src/pgdatadir_mapping.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index af12084766..0b9ea7c7a7 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1215,7 +1215,7 @@ pub fn create_test_timeline( timeline_id: zenith_utils::zid::ZTimelineId, ) -> Result>> { let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; - let tline = DatadirTimeline::new(tline, tline.conf.compaction_target_size / 10); + let tline = DatadirTimeline::new(tline, 256 * 1024); let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; From a5a478c32193fcf6e04b3e9b2fa981d2bc5e82e2 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 4 Apr 2022 16:32:30 +0300 Subject: [PATCH 0105/1022] Bump vendor/postgres to store WAL on disk only (#1342) Now WAL is no longer held in compute memory --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 5c278ed0ac..8481459996 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 5c278ed0aca5dea9340d9af4ad5f004d905ff1b7 +Subproject commit 848145999653be213141a330569b6f2d9f53dbf2 From 089ba6abfe6c6e291489970b1c82dc5d3d6c0516 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 4 Apr 2022 20:12:25 +0300 Subject: [PATCH 0106/1022] Clean up some comments that still referred to 'segments' --- .../src/layered_repository/delta_layer.rs | 13 +++++------- .../src/layered_repository/image_layer.rs | 4 ++-- .../src/layered_repository/layer_map.rs | 20 ++----------------- .../src/layered_repository/storage_layer.rs | 4 ++-- 4 files changed, 11 insertions(+), 30 deletions(-) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 0e59eb7a3c..955d4145f3 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ 
b/pageserver/src/layered_repository/delta_layer.rs @@ -1,14 +1,11 @@ //! A DeltaLayer represents a collection of WAL records or page images in a range of //! LSNs, and in a range of Keys. It is stored on a file on disk. //! -//! Usually a delta layer only contains differences - in the form of WAL records against -//! a base LSN. However, if a segment is newly created, by creating a new relation or -//! extending an old one, there might be no base image. In that case, all the entries in -//! the delta layer must be page images or WAL records with the 'will_init' flag set, so -//! that they can be replayed without referring to an older page version. Also in some -//! circumstances, the predecessor layer might actually be another delta layer. That -//! can happen when you create a new branch in the middle of a delta layer, and the WAL -//! records on the new branch are put in a new delta layer. +//! Usually a delta layer only contains differences, in the form of WAL records +//! against a base LSN. However, if a relation extended or a whole new relation +//! is created, there would be no base for the new pages. The entries for them +//! must be page images or WAL records with the 'will_init' flag set, so that +//! they can be replayed without referring to an older page version. //! //! When a delta file needs to be accessed, we slurp the 'index' metadata //! into memory, into the DeltaLayerInner struct. See load() and unload() functions. diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 2b9bf4a717..68d1cd4a8a 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -405,8 +405,8 @@ impl ImageLayer { /// /// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...) /// -/// 2. Write the contents by calling `put_page_image` for every page -/// in the segment. +/// 2. Write the contents by calling `put_page_image` for every key-value +/// pair in the key range. /// /// 3. Call `finish`. /// diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index b6a3bd82aa..8132ec9cc4 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -207,11 +207,11 @@ impl LayerMap { NUM_ONDISK_LAYERS.dec(); } - /// Is there a newer image layer for given segment? + /// Is there a newer image layer for given key-range? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. - /// We ignore segments newer than disk_consistent_lsn because they will be removed at restart + /// We ignore layers newer than disk_consistent_lsn because they will be removed at restart /// We also only look at historic layers //#[allow(dead_code)] pub fn newer_image_layer_exists( @@ -250,22 +250,6 @@ impl LayerMap { } } - /// Is there any layer for given segment that is alive at the lsn? - /// - /// This is a public wrapper for SegEntry fucntion, - /// used for garbage collection, to determine if some alive layer - /// exists at the lsn. If so, we shouldn't delete a newer dropped layer - /// to avoid incorrectly making it visible. 
- /* - pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result { - Ok(if let Some(segentry) = self.historic_layers.get(&seg) { - segentry.exists_at_lsn(seg, lsn)?.unwrap_or(false) - } else { - false - }) - } - */ - pub fn iter_historic_layers(&self) -> std::slice::Iter> { self.historic_layers.iter() } diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index dcf5b63908..2711640736 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -88,7 +88,7 @@ pub trait Layer: Send + Sync { /// Identify the timeline this layer belongs to fn get_timeline_id(&self) -> ZTimelineId; - /// Range of segments that this layer covers + /// Range of keys that this layer covers fn get_key_range(&self) -> Range; /// Inclusive start bound of the LSN range that this layer holds @@ -123,7 +123,7 @@ pub trait Layer: Send + Sync { reconstruct_data: &mut ValueReconstructState, ) -> Result; - /// Does this layer only contain some data for the segment (incremental), + /// Does this layer only contain some data for the key-range (incremental), /// or does it contain a version of every page? This is important to know /// for garbage collecting old layers: an incremental layer depends on /// the previous non-incremental layer. From 222b7233540d93327d26cb0566b1c30379451656 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 4 Apr 2022 20:12:28 +0300 Subject: [PATCH 0107/1022] Handle read errors when dumping a delta layer file. If a file is corrupt, let's not stop on first read error, but continue dumping. --- .../src/layered_repository/delta_layer.rs | 38 +++++++++++-------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 955d4145f3..7013c2417c 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -293,25 +293,31 @@ impl Layer for DeltaLayer { for (lsn, blob_ref) in versions.as_slice() { let mut desc = String::new(); let mut buf = vec![0u8; blob_ref.size()]; - chapter.read_exact_at(&mut buf, blob_ref.pos())?; - let val = Value::des(&buf); + match chapter.read_exact_at(&mut buf, blob_ref.pos()) { + Ok(()) => { + let val = Value::des(&buf); - match val { - Ok(Value::Image(img)) => { - write!(&mut desc, " img {} bytes", img.len())?; - } - Ok(Value::WalRecord(rec)) => { - let wal_desc = walrecord::describe_wal_record(&rec); - write!( - &mut desc, - " rec {} bytes will_init: {} {}", - buf.len(), - rec.will_init(), - wal_desc - )?; + match val { + Ok(Value::Image(img)) => { + write!(&mut desc, " img {} bytes", img.len())?; + } + Ok(Value::WalRecord(rec)) => { + let wal_desc = walrecord::describe_wal_record(&rec); + write!( + &mut desc, + " rec {} bytes will_init: {} {}", + buf.len(), + rec.will_init(), + wal_desc + )?; + } + Err(err) => { + write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?; + } + } } Err(err) => { - write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?; + write!(&mut desc, " READ ERROR: {}", err)?; } } println!(" key {} at {}: {}", key, lsn, desc); From 2f784144fe335e30811dca0f86c7ff20ec2978dc Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 4 Apr 2022 20:12:31 +0300 Subject: [PATCH 0108/1022] Avoid deadlock when locking two buffers. It happened in unit tests. 
If a thread tries to read a buffer while already holding a lock on one buffer, the code to find a victim buffer to evict could try to evict the buffer that's already locked. To fix, skip locked buffers. --- pageserver/src/page_cache.rs | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 299575f792..c485e46f47 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -41,7 +41,7 @@ use std::{ convert::TryInto, sync::{ atomic::{AtomicU8, AtomicUsize, Ordering}, - RwLock, RwLockReadGuard, RwLockWriteGuard, + RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError, }, }; @@ -683,16 +683,33 @@ impl PageCache { /// /// On return, the slot is empty and write-locked. fn find_victim(&self) -> (usize, RwLockWriteGuard) { - let iter_limit = self.slots.len() * 2; + let iter_limit = self.slots.len() * 10; let mut iters = 0; loop { + iters += 1; let slot_idx = self.next_evict_slot.fetch_add(1, Ordering::Relaxed) % self.slots.len(); let slot = &self.slots[slot_idx]; - if slot.dec_usage_count() == 0 || iters >= iter_limit { - let mut inner = slot.inner.write().unwrap(); - + if slot.dec_usage_count() == 0 { + let mut inner = match slot.inner.try_write() { + Ok(inner) => inner, + Err(TryLockError::Poisoned(err)) => { + panic!("buffer lock was poisoned: {:?}", err) + } + Err(TryLockError::WouldBlock) => { + // If we have looped through the whole buffer pool 10 times + // and still haven't found a victim buffer, something's wrong. + // Maybe all the buffers were in locked. That could happen in + // theory, if you have more threads holding buffers locked than + // there are buffers in the pool. In practice, with a reasonably + // large buffer pool it really shouldn't happen. + if iters > iter_limit { + panic!("could not find a victim buffer to evict"); + } + continue; + } + }; if let Some(old_key) = &inner.key { if inner.dirty { if let Err(err) = Self::writeback(old_key, inner.buf) { @@ -717,8 +734,6 @@ impl PageCache { } return (slot_idx, inner); } - - iters += 1; } } From d0c246ac3c0101fba6c8607dbb11444d8a0f589c Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 5 Apr 2022 20:01:57 +0300 Subject: [PATCH 0109/1022] Update pageserver OpenAPI spec with missing attach/detach methods (#1463) We have these methods for some time in the API, so mentioning them in the spec could be useful for console (see zenithdb/console#867), as we generate pageserver HTTP API golang client there. 
--- pageserver/src/http/openapi_spec.yml | 121 +++++++++++++++++++++++++-- pageserver/src/http/routes.rs | 5 +- zenith_utils/src/http/error.rs | 6 ++ 3 files changed, 125 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index a9101d4bd6..b2760efe85 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -18,7 +18,7 @@ paths: schema: type: object required: - - id + - id properties: id: type: integer @@ -122,6 +122,110 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + + + /v1/tenant/{tenant_id}/timeline/{timeline_id}/attach: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Attach remote timeline + responses: + "200": + description: Timeline attaching scheduled + "400": + description: Error when no tenant id found in path or no timeline id + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "404": + description: Timeline not found + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" + "409": + description: Timeline download is already in progress + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + + /v1/tenant/{tenant_id}/timeline/{timeline_id}/detach: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Detach local timeline + responses: + "200": + description: Timeline detached + "400": + description: Error when no tenant id found in path or no timeline id + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + /v1/tenant/{tenant_id}/timeline/: parameters: - name: tenant_id @@ -179,7 +283,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/AlreadyExistsError" + $ref: "#/components/schemas/ConflictError" "500": description: Generic operation error content: @@ -260,7 +364,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/AlreadyExistsError" + $ref: "#/components/schemas/ConflictError" "500": description: Generic operation error content: @@ -354,14 +458,21 @@ components: properties: msg: type: string - AlreadyExistsError: + ForbiddenError: type: object required: - msg properties: msg: type: string - ForbiddenError: + NotFoundError: + type: object + required: + - msg + properties: + msg: + type: string + ConflictError: type: object required: - msg diff --git a/pageserver/src/http/routes.rs 
b/pageserver/src/http/routes.rs index 82e818a47b..207d2420bd 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -220,6 +220,7 @@ async fn timeline_attach_handler(request: Request) -> Result) -> Result { HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::NOT_FOUND) } + ApiError::Conflict(_) => { + HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::CONFLICT) + } ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, From 6fe443e239531ca1fef4dbf5258c892b1baac6ef Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 6 Apr 2022 18:32:10 -0400 Subject: [PATCH 0110/1022] Improve random_writes test (#1469) If you want to test with a 3GB database by tweaking some constants you'll hit a query timeout. I fix that by batching the inserts. --- test_runner/performance/test_random_writes.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_random_writes.py b/test_runner/performance/test_random_writes.py index b41f2f72a8..ba9eabcd97 100644 --- a/test_runner/performance/test_random_writes.py +++ b/test_runner/performance/test_random_writes.py @@ -49,7 +49,15 @@ def test_random_writes(zenith_with_baseline: PgCompare): count integer default 0 ); """) - cur.execute(f"INSERT INTO Big (pk) values (generate_series(1,{n_rows}))") + + # Insert n_rows in batches to avoid query timeouts + rows_inserted = 0 + while rows_inserted < n_rows: + rows_to_insert = min(1000 * 1000, n_rows - rows_inserted) + low = rows_inserted + 1 + high = rows_inserted + rows_to_insert + cur.execute(f"INSERT INTO Big (pk) values (generate_series({low},{high}))") + rows_inserted += rows_to_insert # Get table size (can't be predicted because padding and alignment) cur.execute("SELECT pg_relation_size('Big');") From 6bc78a0e7729c206d8c4ebfdaed539017130d253 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 7 Apr 2022 01:44:26 +0300 Subject: [PATCH 0111/1022] Log more info in test_many_timelines asserts (#1473) It will help to debug #1470 as soon as it happens again --- test_runner/batch_others/test_wal_acceptor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index bdc526a125..8f87ff041f 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -108,14 +108,14 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): # Invariant. May be < when transaction is in progress. - assert commit_lsn <= flush_lsn + assert commit_lsn <= flush_lsn, f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" # We only call collect_metrics() after a transaction is confirmed by # the compute node, which only happens after a consensus of safekeepers # has confirmed the transaction. We assume majority consensus here. 
assert (2 * sum(m.last_record_lsn <= lsn - for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers) + for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" assert (2 * sum(m.last_record_lsn <= lsn - for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers) + for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" timeline_metrics.append(m) log.info(f"{message}: {timeline_metrics}") return timeline_metrics From d5258cdc4df4f5130bb9ceea5dc47128bac6ce48 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 6 Apr 2022 20:05:24 -0400 Subject: [PATCH 0112/1022] [proxy] Don't print passwords (#1298) --- proxy/src/compute.rs | 12 +++++++++++- proxy/src/mgmt.rs | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 7c0ab965a0..3c0eee29bc 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -24,7 +24,7 @@ pub enum ConnectionError { impl UserFacingError for ConnectionError {} /// Compute node connection params. -#[derive(Serialize, Deserialize, Debug, Default)] +#[derive(Serialize, Deserialize, Default)] pub struct DatabaseInfo { pub host: String, pub port: u16, @@ -33,6 +33,16 @@ pub struct DatabaseInfo { pub password: Option, } +// Manually implement debug to omit personal and sensitive info +impl std::fmt::Debug for DatabaseInfo { + fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + fmt.debug_struct("DatabaseInfo") + .field("host", &self.host) + .field("port", &self.port) + .finish() + } +} + /// PostgreSQL version as [`String`]. pub type Version = String; diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index e53542dfd2..ab6fdff040 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -107,7 +107,7 @@ impl postgres_backend::Handler for MgmtHandler { } fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::Result<()> { - println!("Got mgmt query: '{}'", query_string); + println!("Got mgmt query [redacted]"); // Content contains password, don't print it let resp: PsqlSessionResponse = serde_json::from_str(query_string)?; From 81ba23094e8578ed11cb1aae48cf10b79dc2f3cd Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 7 Apr 2022 20:38:26 +0300 Subject: [PATCH 0113/1022] Fix scripts to deploy sk4 on staging (#1476) Adjust ansible scripts and inventory for sk4 on staging --- .circleci/ansible/deploy.yaml | 24 ++++++++++++++++ .circleci/ansible/scripts/init_safekeeper.sh | 30 ++++++++++++++++++++ .circleci/ansible/staging.hosts | 1 + 3 files changed, 55 insertions(+) create mode 100644 .circleci/ansible/scripts/init_safekeeper.sh diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index b7ffd075a0..2112102aa7 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -116,6 +116,30 @@ tasks: + - name: upload init script + when: console_mgmt_base_url is defined + ansible.builtin.template: + src: scripts/init_safekeeper.sh + dest: /tmp/init_safekeeper.sh + owner: root + group: root + mode: '0755' + become: true + tags: + - safekeeper + + - name: init safekeeper + shell: + cmd: /tmp/init_safekeeper.sh + args: + creates: "/storage/safekeeper/data/safekeeper.id" + environment: + ZENITH_REPO_DIR: "/storage/safekeeper/data" + LD_LIBRARY_PATH: "/usr/local/lib" + become: true + tags: + - safekeeper + # in the future safekeepers 
should discover pageservers byself # but currently use first pageserver that was discovered - name: set first pageserver var for safekeepers diff --git a/.circleci/ansible/scripts/init_safekeeper.sh b/.circleci/ansible/scripts/init_safekeeper.sh new file mode 100644 index 0000000000..2297788f59 --- /dev/null +++ b/.circleci/ansible/scripts/init_safekeeper.sh @@ -0,0 +1,30 @@ +#!/bin/sh + +# get instance id from meta-data service +INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) + +# store fqdn hostname in var +HOST=$(hostname -f) + + +cat < Date: Thu, 7 Apr 2022 20:50:08 +0300 Subject: [PATCH 0114/1022] Refactor the I/O functions. This introduces two new abstraction layers for I/O: - Block I/O, and - Blob I/O. The BlockReader trait abstracts a file or something else that can be read in 8kB pages. It is implemented by EphemeralFiles, and by a new FileBlockReader struct that allows reading arbitrary VirtualFiles in that manner, utilizing the page cache. There is also a new BlockCursor struct that works as a cursor over a BlockReader. When you create a BlockCursor and read the first page using it, it keeps the reference to the page. If you access the same page again, it avoids going to page cache and quickly returns the same page again. That can save a lot of lookups in the page cache if you perform multiple reads. The Blob-oriented API allows reading and writing "blobs" of arbitrary length. It is a layer on top of the block-oriented API. When you write a blob with the write_blob() function, it writes a length field followed by the actual data to the underlying block storage, and returns the offset where the blob was stored. The blob can be retrieved later using the offset. Finally, this replaces the I/O code in image-, delta-, and in-memory layers to use the new abstractions. These replace the 'bookfile' crate. This is a backwards-incompatible change to the storage format. 
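As a rough sketch of the blob layout described above (illustrative only: ToyBlobWriter and ToyBlobReader are made-up names, not pageserver types, and the real code writes through the block/page-cache layer rather than a plain Vec):

// Illustrative sketch of the length-prefixed blob format: each blob is stored
// as a 4-byte native-endian length followed by its payload, and the writer
// returns the offset at which the blob starts so an index can find it later.
struct ToyBlobWriter {
    buf: Vec<u8>,
}

impl ToyBlobWriter {
    fn write_blob(&mut self, src: &[u8]) -> u64 {
        let offset = self.buf.len() as u64;
        // length field first, then the payload
        self.buf.extend_from_slice(&(src.len() as u32).to_ne_bytes());
        self.buf.extend_from_slice(src);
        offset
    }
}

struct ToyBlobReader<'a> {
    buf: &'a [u8],
}

impl<'a> ToyBlobReader<'a> {
    fn read_blob(&self, offset: u64) -> Vec<u8> {
        let off = offset as usize;
        // read the 4-byte length, then slice out the payload that follows it
        let mut len_buf = [0u8; 4];
        len_buf.copy_from_slice(&self.buf[off..off + 4]);
        let len = u32::from_ne_bytes(len_buf) as usize;
        self.buf[off + 4..off + 4 + len].to_vec()
    }
}

fn main() {
    let mut writer = ToyBlobWriter { buf: Vec::new() };
    let first = writer.write_blob(b"page image bytes");
    let second = writer.write_blob(b"wal record bytes");

    let reader = ToyBlobReader { buf: &writer.buf };
    assert_eq!(reader.read_blob(first), b"page image bytes");
    assert_eq!(reader.read_blob(second), b"wal record bytes");
}

Because the length is stored next to the payload, an index only needs to remember each blob's starting offset; this matches the change in the delta layer below, where BlobRef is constructed from just a position and a will_init flag, without an explicit size.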
--- Cargo.lock | 36 --- pageserver/Cargo.toml | 1 - pageserver/src/bin/dump_layerfile.rs | 2 + pageserver/src/layered_repository.rs | 23 +- pageserver/src/layered_repository/blob_io.rs | 122 ++++++++ pageserver/src/layered_repository/block_io.rs | 176 ++++++++++++ .../src/layered_repository/delta_layer.rs | 272 ++++++++---------- .../src/layered_repository/ephemeral_file.rs | 183 ++++++++---- .../src/layered_repository/image_layer.rs | 195 ++++++------- .../src/layered_repository/inmemory_layer.rs | 61 ++-- .../src/layered_repository/storage_layer.rs | 17 +- pageserver/src/lib.rs | 6 +- pageserver/src/page_cache.rs | 82 +++++- pageserver/src/virtual_file.rs | 3 +- 14 files changed, 774 insertions(+), 405 deletions(-) create mode 100644 pageserver/src/layered_repository/blob_io.rs create mode 100644 pageserver/src/layered_repository/block_io.rs diff --git a/Cargo.lock b/Cargo.lock index bb27df7012..e0b6288f63 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -141,30 +141,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" -[[package]] -name = "aversion" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41992ab8cfcc3026ef9abceffe0c2b0479c043183fc23825e30d22baab6df334" -dependencies = [ - "aversion-macros", - "byteorder", - "serde", - "serde_cbor", - "thiserror", -] - -[[package]] -name = "aversion-macros" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ba5785f953985aa0caca927ba4005880f3b4f53de87f134e810ae3549f744d2" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "aws-creds" version = "0.27.1" @@ -264,17 +240,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "bookfile" -version = "0.3.0" -source = "git+https://github.com/zenithdb/bookfile.git?rev=bf6e43825dfb6e749ae9b80e8372c8fea76cec2f#bf6e43825dfb6e749ae9b80e8372c8fea76cec2f" -dependencies = [ - "aversion", - "byteorder", - "serde", - "thiserror", -] - [[package]] name = "boxfnonce" version = "0.1.1" @@ -1524,7 +1489,6 @@ dependencies = [ "anyhow", "async-compression", "async-trait", - "bookfile", "byteorder", "bytes", "chrono", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 6a77af1691..a5283cb331 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" edition = "2021" [dependencies] -bookfile = { git = "https://github.com/zenithdb/bookfile.git", rev="bf6e43825dfb6e749ae9b80e8372c8fea76cec2f" } chrono = "0.4.19" rand = "0.8.3" regex = "1.4.5" diff --git a/pageserver/src/bin/dump_layerfile.rs b/pageserver/src/bin/dump_layerfile.rs index 27d41d50d9..7cf39566ac 100644 --- a/pageserver/src/bin/dump_layerfile.rs +++ b/pageserver/src/bin/dump_layerfile.rs @@ -4,6 +4,7 @@ use anyhow::Result; use clap::{App, Arg}; use pageserver::layered_repository::dump_layerfile_from_path; +use pageserver::page_cache; use pageserver::virtual_file; use std::path::PathBuf; use zenith_utils::GIT_VERSION; @@ -24,6 +25,7 @@ fn main() -> Result<()> { // Basic initialization of things that don't change after startup virtual_file::init(10); + page_cache::init(100); dump_layerfile_from_path(&path, true)?; diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 2d9b680624..5adf4a89ff 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -12,7 +12,6 @@ //! 
use anyhow::{anyhow, bail, ensure, Context, Result}; -use bookfile::Book; use bytes::Bytes; use fail::fail_point; use itertools::Itertools; @@ -56,6 +55,8 @@ use zenith_utils::crashsafe_dir; use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; use zenith_utils::seqwait::SeqWait; +mod blob_io; +pub mod block_io; mod delta_layer; pub(crate) mod ephemeral_file; mod filename; @@ -2054,16 +2055,17 @@ impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { /// Dump contents of a layer file to stdout. pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { - let file = File::open(path)?; - let book = Book::new(file)?; + use std::os::unix::fs::FileExt; - match book.magic() { - crate::DELTA_FILE_MAGIC => { - DeltaLayer::new_for_path(path, &book)?.dump(verbose)?; - } - crate::IMAGE_FILE_MAGIC => { - ImageLayer::new_for_path(path, &book)?.dump(verbose)?; - } + // All layer files start with a two-byte "magic" value, to identify the kind of + // file. + let file = File::open(path)?; + let mut header_buf = [0u8; 2]; + file.read_exact_at(&mut header_buf, 0)?; + + match u16::from_be_bytes(header_buf) { + crate::IMAGE_FILE_MAGIC => ImageLayer::new_for_path(path, file)?.dump(verbose)?, + crate::DELTA_FILE_MAGIC => DeltaLayer::new_for_path(path, file)?.dump(verbose)?, magic => bail!("unrecognized magic identifier: {:?}", magic), } @@ -2274,7 +2276,6 @@ pub mod tests { lsn, Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; - println!("updating {} at {}", blknum, lsn); writer.finish_write(lsn); drop(writer); updated[blknum] = lsn; diff --git a/pageserver/src/layered_repository/blob_io.rs b/pageserver/src/layered_repository/blob_io.rs new file mode 100644 index 0000000000..10bfea934d --- /dev/null +++ b/pageserver/src/layered_repository/blob_io.rs @@ -0,0 +1,122 @@ +//! +//! Functions for reading and writing variable-sized "blobs". +//! +//! Each blob begins with a 4-byte length, followed by the actual data. +//! 
+use crate::layered_repository::block_io::{BlockCursor, BlockReader}; +use crate::page_cache::PAGE_SZ; +use std::cmp::min; +use std::io::Error; + +/// For reading +pub trait BlobCursor { + fn read_blob(&mut self, offset: u64) -> Result, std::io::Error> { + let mut buf = Vec::new(); + self.read_blob_into_buf(offset, &mut buf)?; + Ok(buf) + } + + fn read_blob_into_buf( + &mut self, + offset: u64, + dstbuf: &mut Vec, + ) -> Result<(), std::io::Error>; +} + +impl<'a, R> BlobCursor for BlockCursor +where + R: BlockReader, +{ + fn read_blob_into_buf( + &mut self, + offset: u64, + dstbuf: &mut Vec, + ) -> Result<(), std::io::Error> { + let mut blknum = (offset / PAGE_SZ as u64) as u32; + let mut off = (offset % PAGE_SZ as u64) as usize; + + let mut buf = self.read_blk(blknum)?; + + // read length + let mut len_buf = [0u8; 4]; + let thislen = PAGE_SZ - off; + if thislen < 4 { + // it is split across two pages + len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]); + blknum += 1; + buf = self.read_blk(blknum)?; + len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]); + off = 4 - thislen; + } else { + len_buf.copy_from_slice(&buf[off..off + 4]); + off += 4; + } + let len = u32::from_ne_bytes(len_buf) as usize; + + dstbuf.clear(); + + // Read the payload + let mut remain = len; + while remain > 0 { + let mut page_remain = PAGE_SZ - off; + if page_remain == 0 { + // continue on next page + blknum += 1; + buf = self.read_blk(blknum)?; + off = 0; + page_remain = PAGE_SZ; + } + let this_blk_len = min(remain, page_remain); + dstbuf.extend_from_slice(&buf[off..off + this_blk_len]); + remain -= this_blk_len; + off += this_blk_len; + } + Ok(()) + } +} + +pub trait BlobWriter { + fn write_blob(&mut self, srcbuf: &[u8]) -> Result; +} + +pub struct WriteBlobWriter +where + W: std::io::Write, +{ + inner: W, + offset: u64, +} + +impl WriteBlobWriter +where + W: std::io::Write, +{ + pub fn new(inner: W, start_offset: u64) -> Self { + WriteBlobWriter { + inner, + offset: start_offset, + } + } + + pub fn size(&self) -> u64 { + self.offset + } + + pub fn into_inner(self) -> W { + self.inner + } +} + +impl BlobWriter for WriteBlobWriter +where + W: std::io::Write, +{ + fn write_blob(&mut self, srcbuf: &[u8]) -> Result { + let offset = self.offset; + self.inner + .write_all(&((srcbuf.len()) as u32).to_ne_bytes())?; + self.inner.write_all(srcbuf)?; + self.offset += 4 + srcbuf.len() as u64; + Ok(offset) + } +} diff --git a/pageserver/src/layered_repository/block_io.rs b/pageserver/src/layered_repository/block_io.rs new file mode 100644 index 0000000000..2b8e31e1ee --- /dev/null +++ b/pageserver/src/layered_repository/block_io.rs @@ -0,0 +1,176 @@ +//! +//! Low-level Block-oriented I/O functions +//! +//! +//! + +use crate::page_cache; +use crate::page_cache::{ReadBufResult, PAGE_SZ}; +use lazy_static::lazy_static; +use std::ops::{Deref, DerefMut}; +use std::os::unix::fs::FileExt; +use std::sync::atomic::AtomicU64; + +/// This is implemented by anything that can read 8 kB (PAGE_SZ) +/// blocks, using the page cache +/// +/// There are currently two implementations: EphemeralFile, and FileBlockReader +/// below. +pub trait BlockReader { + type BlockLease: Deref + 'static; + + /// + /// Read a block. Returns a "lease" object that can be used to + /// access to the contents of the page. (For the page cache, the + /// lease object represents a lock on the buffer.) + /// + fn read_blk(&self, blknum: u32) -> Result; + + /// + /// Create a new "cursor" for reading from this reader. 
+ /// + /// A cursor caches the last accessed page, allowing for faster + /// access if the same block is accessed repeatedly. + fn block_cursor(&self) -> BlockCursor<&Self> + where + Self: Sized, + { + BlockCursor::new(self) + } +} + +impl BlockReader for &B +where + B: BlockReader, +{ + type BlockLease = B::BlockLease; + + fn read_blk(&self, blknum: u32) -> Result { + (*self).read_blk(blknum) + } +} + +/// +/// A "cursor" for efficiently reading multiple pages from a BlockReader +/// +/// A cursor caches the last accessed page, allowing for faster access if the +/// same block is accessed repeatedly. +/// +/// You can access the last page with `*cursor`. 'read_blk' returns 'self', so +/// that in many cases you can use a BlockCursor as a drop-in replacement for +/// the underlying BlockReader. For example: +/// +/// ```no_run +/// # use pageserver::layered_repository::block_io::{BlockReader, FileBlockReader}; +/// # let reader: FileBlockReader = todo!(); +/// let cursor = reader.block_cursor(); +/// let buf = cursor.read_blk(1); +/// // do stuff with 'buf' +/// let buf = cursor.read_blk(2); +/// // do stuff with 'buf' +/// ``` +/// +pub struct BlockCursor +where + R: BlockReader, +{ + reader: R, + /// last accessed page + cache: Option<(u32, R::BlockLease)>, +} + +impl BlockCursor +where + R: BlockReader, +{ + pub fn new(reader: R) -> Self { + BlockCursor { + reader, + cache: None, + } + } + + pub fn read_blk(&mut self, blknum: u32) -> Result<&Self, std::io::Error> { + // Fast return if this is the same block as before + if let Some((cached_blk, _buf)) = &self.cache { + if *cached_blk == blknum { + return Ok(self); + } + } + + // Read the block from the underlying reader, and cache it + self.cache = None; + let buf = self.reader.read_blk(blknum)?; + self.cache = Some((blknum, buf)); + + Ok(self) + } +} + +impl Deref for BlockCursor +where + R: BlockReader, +{ + type Target = [u8; PAGE_SZ]; + + fn deref(&self) -> &::Target { + &self.cache.as_ref().unwrap().1 + } +} + +lazy_static! { + static ref NEXT_ID: AtomicU64 = AtomicU64::new(1); +} + +/// An adapter for reading a (virtual) file using the page cache. +/// +/// The file is assumed to be immutable. This doesn't provide any functions +/// for modifying the file, nor for invalidating the cache if it is modified. +pub struct FileBlockReader { + pub file: F, + + /// Unique ID of this file, used as key in the page cache. + file_id: u64, +} + +impl FileBlockReader +where + F: FileExt, +{ + pub fn new(file: F) -> Self { + let file_id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + + FileBlockReader { file_id, file } + } + + /// Read a page from the underlying file into given buffer. 
+ fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> { + assert!(buf.len() == PAGE_SZ); + self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64) + } +} + +impl BlockReader for FileBlockReader +where + F: FileExt, +{ + type BlockLease = page_cache::PageReadGuard<'static>; + + fn read_blk(&self, blknum: u32) -> Result { + // Look up the right page + let cache = page_cache::get(); + loop { + match cache.read_immutable_buf(self.file_id, blknum) { + ReadBufResult::Found(guard) => break Ok(guard), + ReadBufResult::NotFound(mut write_guard) => { + // Read the page from disk into the buffer + self.fill_buffer(write_guard.deref_mut(), blknum)?; + write_guard.mark_valid(); + + // Swap for read lock + continue; + } + }; + } + } +} diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 7013c2417c..f8828b541f 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -23,21 +23,27 @@ //! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 //! //! -//! A delta file is constructed using the 'bookfile' crate. Each file consists of three -//! parts: the 'index', the values, and a short summary header. They are stored as -//! separate chapters. +//! Every delta file consists of three parts: "summary", "index", and +//! "values". The summary is a fixed size header at the beginning of the file, +//! and it contains basic information about the layer, and offsets to the other +//! parts. The "index" is a serialized HashMap mapping from Key and LSN to an offset in the +//! "values" part. The actual page images and WAL records are stored in the +//! "values" part. //! 
use crate::config::PageServerConf; +use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; +use crate::layered_repository::block_io::{BlockCursor, BlockReader, FileBlockReader}; use crate::layered_repository::filename::{DeltaFileName, PathOrConf}; use crate::layered_repository::storage_layer::{ BlobRef, Layer, ValueReconstructResult, ValueReconstructState, }; +use crate::page_cache::{PageReadGuard, PAGE_SZ}; use crate::repository::{Key, Value}; use crate::virtual_file::VirtualFile; use crate::walrecord; -use crate::DELTA_FILE_MAGIC; use crate::{ZTenantId, ZTimelineId}; -use anyhow::{bail, ensure, Result}; +use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; +use anyhow::{bail, ensure, Context, Result}; use log::*; use serde::{Deserialize, Serialize}; use std::collections::HashMap; @@ -46,44 +52,43 @@ use zenith_utils::vec_map::VecMap; // while being able to use std::fmt::Write's methods use std::fmt::Write as _; use std::fs; -use std::io::BufWriter; -use std::io::Write; +use std::io::{BufWriter, Write}; +use std::io::{Seek, SeekFrom}; use std::ops::Range; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError}; -use bookfile::{Book, BookWriter, ChapterWriter}; - use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::Lsn; -/// Mapping from (key, lsn) -> page/WAL record -/// byte ranges in VALUES_CHAPTER -static INDEX_CHAPTER: u64 = 1; - -/// Page/WAL bytes - cannot be interpreted -/// without the page versions from the INDEX_CHAPTER -static VALUES_CHAPTER: u64 = 2; - -/// Contains the [`Summary`] struct -static SUMMARY_CHAPTER: u64 = 3; - #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] struct Summary { + /// Magic value to identify this as a zenith delta file. Always DELTA_FILE_MAGIC. + magic: u16, + format_version: u16, + tenantid: ZTenantId, timelineid: ZTimelineId, key_range: Range, lsn_range: Range, + + /// Block number where the 'index' part of the file begins. + index_start_blk: u32, } impl From<&DeltaLayer> for Summary { fn from(layer: &DeltaLayer) -> Self { Self { + magic: DELTA_FILE_MAGIC, + format_version: STORAGE_FORMAT_VERSION, + tenantid: layer.tenantid, timelineid: layer.timelineid, key_range: layer.key_range.clone(), lsn_range: layer.lsn_range.clone(), + + index_start_blk: 0, } } } @@ -118,7 +123,11 @@ pub struct DeltaLayerInner { /// index: HashMap>, - book: Option>, + // values copied from summary + index_start_blk: u32, + + /// Reader object for reading blocks from the file. (None if not loaded yet) + file: Option>, } impl Layer for DeltaLayer { @@ -155,45 +164,28 @@ impl Layer for DeltaLayer { { // Open the file and lock the metadata in memory let inner = self.load()?; - let values_reader = inner - .book - .as_ref() - .expect("should be loaded in load call above") - .chapter_reader(VALUES_CHAPTER)?; // Scan the page versions backwards, starting from `lsn`. 
if let Some(vec_map) = inner.index.get(&key) { + let mut reader = inner.file.as_ref().unwrap().block_cursor(); let slice = vec_map.slice_range(lsn_range); - let mut size = 0usize; - let mut first_pos = 0u64; - for (_entry_lsn, blob_ref) in slice.iter().rev() { - size += blob_ref.size(); - first_pos = blob_ref.pos(); - if blob_ref.will_init() { - break; - } - } - if size != 0 { - let mut buf = vec![0u8; size]; - values_reader.read_exact_at(&mut buf, first_pos)?; - for (entry_lsn, blob_ref) in slice.iter().rev() { - let offs = (blob_ref.pos() - first_pos) as usize; - let val = Value::des(&buf[offs..offs + blob_ref.size()])?; - match val { - Value::Image(img) => { - reconstruct_state.img = Some((*entry_lsn, img)); + for (entry_lsn, blob_ref) in slice.iter().rev() { + let buf = reader.read_blob(blob_ref.pos())?; + let val = Value::des(&buf)?; + match val { + Value::Image(img) => { + reconstruct_state.img = Some((*entry_lsn, img)); + need_image = false; + break; + } + Value::WalRecord(rec) => { + let will_init = rec.will_init(); + reconstruct_state.records.push((*entry_lsn, rec)); + if will_init { + // This WAL record initializes the page, so no need to go further back need_image = false; break; } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((*entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } } } } @@ -210,7 +202,7 @@ impl Layer for DeltaLayer { } } - fn iter(&self) -> Box> + '_> { + fn iter<'a>(&'a self) -> Box> + 'a> { let inner = self.load().unwrap(); match DeltaValueIter::new(inner) { @@ -281,20 +273,16 @@ impl Layer for DeltaLayer { let inner = self.load()?; - let path = self.path(); - let file = std::fs::File::open(&path)?; - let book = Book::new(file)?; - let chapter = book.chapter_reader(VALUES_CHAPTER)?; - let mut values: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); values.sort_by_key(|k| k.0); + let mut reader = inner.file.as_ref().unwrap().block_cursor(); + for (key, versions) in values { for (lsn, blob_ref) in versions.as_slice() { let mut desc = String::new(); - let mut buf = vec![0u8; blob_ref.size()]; - match chapter.read_exact_at(&mut buf, blob_ref.pos()) { - Ok(()) => { + match reader.read_blob(blob_ref.pos()) { + Ok(buf) => { let val = Value::des(&buf); match val { @@ -378,19 +366,19 @@ impl DeltaLayer { let path = self.path(); // Open the file if it's not open already. - if inner.book.is_none() { - let file = VirtualFile::open(&path)?; - inner.book = Some(Book::new(file)?); + if inner.file.is_none() { + let file = VirtualFile::open(&path) + .with_context(|| format!("Failed to open file '{}'", path.display()))?; + inner.file = Some(FileBlockReader::new(file)); } - let book = inner.book.as_ref().unwrap(); + let file = inner.file.as_mut().unwrap(); + let summary_blk = file.read_blk(0)?; + let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; match &self.path_or_conf { PathOrConf::Conf(_) => { - let chapter = book.read_chapter(SUMMARY_CHAPTER)?; - let actual_summary = Summary::des(&chapter)?; - - let expected_summary = Summary::from(self); - + let mut expected_summary = Summary::from(self); + expected_summary.index_start_blk = actual_summary.index_start_blk; if actual_summary != expected_summary { bail!("in-file summary does not match expected summary. 
actual = {:?} expected = {:?}", actual_summary, expected_summary); } @@ -409,8 +397,13 @@ impl DeltaLayer { } } - let chapter = book.read_chapter(INDEX_CHAPTER)?; - let index = HashMap::des(&chapter)?; + file.file.seek(SeekFrom::Start( + actual_summary.index_start_blk as u64 * PAGE_SZ as u64, + ))?; + let mut buf_reader = std::io::BufReader::new(&mut file.file); + let index = HashMap::des_from(&mut buf_reader)?; + + inner.index_start_blk = actual_summary.index_start_blk; debug!("loaded from {}", &path.display()); @@ -434,8 +427,9 @@ impl DeltaLayer { lsn_range: filename.lsn_range.clone(), inner: RwLock::new(DeltaLayerInner { loaded: false, - book: None, index: HashMap::default(), + file: None, + index_start_blk: 0, }), } } @@ -443,12 +437,14 @@ impl DeltaLayer { /// Create a DeltaLayer struct representing an existing file on disk. /// /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary. - pub fn new_for_path(path: &Path, book: &Book) -> Result + pub fn new_for_path(path: &Path, file: F) -> Result where F: FileExt, { - let chapter = book.read_chapter(SUMMARY_CHAPTER)?; - let summary = Summary::des(&chapter)?; + let mut summary_buf = Vec::new(); + summary_buf.resize(PAGE_SZ, 0); + file.read_exact_at(&mut summary_buf, 0)?; + let summary = Summary::des_prefix(&summary_buf)?; Ok(DeltaLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), @@ -458,8 +454,9 @@ impl DeltaLayer { lsn_range: summary.lsn_range, inner: RwLock::new(DeltaLayerInner { loaded: false, - book: None, + file: None, index: HashMap::default(), + index_start_blk: 0, }), }) } @@ -504,8 +501,7 @@ pub struct DeltaLayerWriter { index: HashMap>, - values_writer: ChapterWriter>, - end_offset: u64, + blob_writer: WriteBlobWriter>, } impl DeltaLayerWriter { @@ -531,13 +527,10 @@ impl DeltaLayerWriter { u64::from(lsn_range.start), u64::from(lsn_range.end) )); - let file = VirtualFile::create(&path)?; + let mut file = VirtualFile::create(&path)?; + file.seek(SeekFrom::Start(PAGE_SZ as u64))?; let buf_writer = BufWriter::new(file); - let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?; - - // Open the page-versions chapter for writing. The calls to - // `put_value` will use this to write the contents. - let values_writer = book.new_chapter(VALUES_CHAPTER); + let blob_writer = WriteBlobWriter::new(buf_writer, PAGE_SZ as u64); Ok(DeltaLayerWriter { conf, @@ -547,8 +540,7 @@ impl DeltaLayerWriter { key_start, lsn_range, index: HashMap::new(), - values_writer, - end_offset: 0, + blob_writer, }) } @@ -558,17 +550,12 @@ impl DeltaLayerWriter { /// The values must be appended in key, lsn order. /// pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> { - //info!("DELTA: key {} at {} on {}", key, lsn, self.path.display()); assert!(self.lsn_range.start <= lsn); - // Remember the offset and size metadata. The metadata is written - // to a separate chapter, in `finish`. - let off = self.end_offset; - let buf = Value::ser(&val)?; - let len = buf.len(); - self.values_writer.write_all(&buf)?; - self.end_offset += len as u64; + + let off = self.blob_writer.write_blob(&Value::ser(&val)?)?; + let vec_map = self.index.entry(key).or_default(); - let blob_ref = BlobRef::new(off, len, val.will_init()); + let blob_ref = BlobRef::new(off, val.will_init()); let old = vec_map.append_or_update_last(lsn, blob_ref).unwrap().0; if old.is_some() { // We already had an entry for this LSN. That's odd.. 
@@ -583,38 +570,40 @@ impl DeltaLayerWriter { } pub fn size(&self) -> u64 { - self.end_offset + self.blob_writer.size() } /// /// Finish writing the delta layer. /// pub fn finish(self, key_end: Key) -> anyhow::Result { - // Close the values chapter - let book = self.values_writer.close()?; + let index_start_blk = + ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + + let buf_writer = self.blob_writer.into_inner(); + let mut file = buf_writer.into_inner()?; // Write out the index - let mut chapter = book.new_chapter(INDEX_CHAPTER); let buf = HashMap::ser(&self.index)?; - chapter.write_all(&buf)?; - let book = chapter.close()?; + file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?; + file.write_all(&buf)?; - let mut chapter = book.new_chapter(SUMMARY_CHAPTER); + // Fill in the summary on blk 0 let summary = Summary { + magic: DELTA_FILE_MAGIC, + format_version: STORAGE_FORMAT_VERSION, tenantid: self.tenantid, timelineid: self.timelineid, key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), + index_start_blk, }; - Summary::ser_into(&summary, &mut chapter)?; - let book = chapter.close()?; - - // This flushes the underlying 'buf_writer'. - book.close()?; + file.seek(SeekFrom::Start(0))?; + Summary::ser_into(&summary, &mut file)?; // Note: Because we opened the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't - // set inner.book here. The first read will have to re-open it. + // set inner.file here. The first read will have to re-open it. let layer = DeltaLayer { path_or_conf: PathOrConf::Conf(self.conf), tenantid: self.tenantid, @@ -624,7 +613,8 @@ impl DeltaLayerWriter { inner: RwLock::new(DeltaLayerInner { loaded: false, index: HashMap::new(), - book: None, + file: None, + index_start_blk, }), }; @@ -647,22 +637,6 @@ impl DeltaLayerWriter { Ok(layer) } - - pub fn abort(self) { - match self.values_writer.close() { - Ok(book) => { - if let Err(err) = book.close() { - error!("error while closing delta layer file: {}", err); - } - } - Err(err) => { - error!("error while closing chapter writer: {}", err); - } - } - if let Err(err) = std::fs::remove_file(self.path) { - error!("error removing unfinished delta layer file: {}", err); - } - } } /// @@ -672,13 +646,23 @@ impl DeltaLayerWriter { /// That takes up quite a lot of memory. Should do this in a more streaming /// fashion. 
/// -struct DeltaValueIter { +struct DeltaValueIter<'a> { all_offsets: Vec<(Key, Lsn, BlobRef)>, next_idx: usize, - data: Vec, + reader: BlockCursor>, } -impl Iterator for DeltaValueIter { +struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>); + +impl<'a> BlockReader for Adapter<'a> { + type BlockLease = PageReadGuard<'static>; + + fn read_blk(&self, blknum: u32) -> Result { + self.0.file.as_ref().unwrap().read_blk(blknum) + } +} + +impl<'a> Iterator for DeltaValueIter<'a> { type Item = Result<(Key, Lsn, Value)>; fn next(&mut self) -> Option { @@ -686,8 +670,8 @@ impl Iterator for DeltaValueIter { } } -impl DeltaValueIter { - fn new(inner: RwLockReadGuard) -> Result { +impl<'a> DeltaValueIter<'a> { + fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result { let mut index: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); index.sort_by_key(|x| x.0); @@ -698,30 +682,24 @@ impl DeltaValueIter { } } - let values_reader = inner - .book - .as_ref() - .expect("should be loaded in load call above") - .chapter_reader(VALUES_CHAPTER)?; - let file_size = values_reader.len() as usize; - let mut layer = DeltaValueIter { + let iter = DeltaValueIter { all_offsets, next_idx: 0, - data: vec![0u8; file_size], + reader: BlockCursor::new(Adapter(inner)), }; - values_reader.read_exact_at(&mut layer.data, 0)?; - Ok(layer) + Ok(iter) } fn next_res(&mut self) -> Result> { if self.next_idx < self.all_offsets.len() { - let (key, lsn, blob_ref) = self.all_offsets[self.next_idx]; - let offs = blob_ref.pos() as usize; - let size = blob_ref.size(); - let val = Value::des(&self.data[offs..offs + size])?; + let (key, lsn, off) = &self.all_offsets[self.next_idx]; + + //let mut reader = BlobReader::new(self.inner.file.as_ref().unwrap()); + let buf = self.reader.read_blob(off.pos())?; + let val = Value::des(&buf)?; self.next_idx += 1; - Ok(Some((key, lsn, val))) + Ok(Some((*key, *lsn, val))) } else { Ok(None) } diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/layered_repository/ephemeral_file.rs index 79a72f4563..d509186e6f 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/layered_repository/ephemeral_file.rs @@ -2,6 +2,8 @@ //! used to keep in-memory layers spilled on disk. 
use crate::config::PageServerConf; +use crate::layered_repository::blob_io::BlobWriter; +use crate::layered_repository::block_io::BlockReader; use crate::page_cache; use crate::page_cache::PAGE_SZ; use crate::page_cache::{ReadBufResult, WriteBufResult}; @@ -10,7 +12,7 @@ use lazy_static::lazy_static; use std::cmp::min; use std::collections::HashMap; use std::fs::OpenOptions; -use std::io::{Error, ErrorKind, Seek, SeekFrom, Write}; +use std::io::{Error, ErrorKind}; use std::ops::DerefMut; use std::path::PathBuf; use std::sync::{Arc, RwLock}; @@ -41,7 +43,7 @@ pub struct EphemeralFile { _timelineid: ZTimelineId, file: Arc, - pos: u64, + size: u64, } impl EphemeralFile { @@ -70,11 +72,11 @@ impl EphemeralFile { _tenantid: tenantid, _timelineid: timelineid, file: file_rc, - pos: 0, + size: 0, }) } - pub fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> { + fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> { let mut off = 0; while off < PAGE_SZ { let n = self @@ -93,6 +95,26 @@ impl EphemeralFile { } Ok(()) } + + fn get_buf_for_write(&self, blkno: u32) -> Result { + // Look up the right page + let cache = page_cache::get(); + let mut write_guard = match cache.write_ephemeral_buf(self.file_id, blkno) { + WriteBufResult::Found(guard) => guard, + WriteBufResult::NotFound(mut guard) => { + // Read the page from disk into the buffer + // TODO: if we're overwriting the whole page, no need to read it in first + self.fill_buffer(guard.deref_mut(), blkno)?; + guard.mark_valid(); + + // And then fall through to modify it. + guard + } + }; + write_guard.mark_dirty(); + + Ok(write_guard) + } } /// Does the given filename look like an ephemeral file? @@ -167,48 +189,49 @@ impl FileExt for EphemeralFile { } } -impl Write for EphemeralFile { - fn write(&mut self, buf: &[u8]) -> Result { - let n = self.write_at(buf, self.pos)?; - self.pos += n as u64; - Ok(n) - } +impl BlobWriter for EphemeralFile { + fn write_blob(&mut self, srcbuf: &[u8]) -> Result { + let pos = self.size; - fn flush(&mut self) -> Result<(), std::io::Error> { - // we don't need to flush data: - // * we either write input bytes or not, not keeping any intermediate data buffered - // * rust unix file `flush` impl does not flush things either, returning `Ok(())` - Ok(()) - } -} + let mut blknum = (self.size / PAGE_SZ as u64) as u32; + let mut off = (pos % PAGE_SZ as u64) as usize; -impl Seek for EphemeralFile { - fn seek(&mut self, pos: SeekFrom) -> Result { - match pos { - SeekFrom::Start(offset) => { - self.pos = offset; - } - SeekFrom::End(_offset) => { - return Err(Error::new( - ErrorKind::Other, - "SeekFrom::End not supported by EphemeralFile", - )); - } - SeekFrom::Current(offset) => { - let pos = self.pos as i128 + offset as i128; - if pos < 0 { - return Err(Error::new( - ErrorKind::InvalidInput, - "offset would be negative", - )); - } - if pos > u64::MAX as i128 { - return Err(Error::new(ErrorKind::InvalidInput, "offset overflow")); - } - self.pos = pos as u64; - } + let mut buf = self.get_buf_for_write(blknum)?; + + // Write the length field + let len_buf = u32::to_ne_bytes(srcbuf.len() as u32); + let thislen = PAGE_SZ - off; + if thislen < 4 { + // it needs to be split across pages + buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]); + blknum += 1; + buf = self.get_buf_for_write(blknum)?; + buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]); + off = 4 - thislen; + } else { + buf[off..off + 4].copy_from_slice(&len_buf); + off += 4; } - Ok(self.pos) + + // Write the payload + 
let mut buf_remain = srcbuf; + while !buf_remain.is_empty() { + let mut page_remain = PAGE_SZ - off; + if page_remain == 0 { + blknum += 1; + buf = self.get_buf_for_write(blknum)?; + off = 0; + page_remain = PAGE_SZ; + } + let this_blk_len = min(page_remain, buf_remain.len()); + buf[off..(off + this_blk_len)].copy_from_slice(&buf_remain[..this_blk_len]); + off += this_blk_len; + buf_remain = &buf_remain[this_blk_len..]; + } + drop(buf); + self.size += 4 + srcbuf.len() as u64; + + Ok(pos) } } @@ -239,11 +262,34 @@ pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Er } } +impl BlockReader for EphemeralFile { + type BlockLease = page_cache::PageReadGuard<'static>; + + fn read_blk(&self, blknum: u32) -> Result { + // Look up the right page + let cache = page_cache::get(); + loop { + match cache.read_ephemeral_buf(self.file_id, blknum) { + ReadBufResult::Found(guard) => return Ok(guard), + ReadBufResult::NotFound(mut write_guard) => { + // Read the page from disk into the buffer + self.fill_buffer(write_guard.deref_mut(), blknum)?; + write_guard.mark_valid(); + + // Swap for read lock + continue; + } + }; + } + } +} + #[cfg(test)] mod tests { use super::*; - use rand::seq::SliceRandom; - use rand::thread_rng; + use crate::layered_repository::blob_io::{BlobCursor, BlobWriter}; + use crate::layered_repository::block_io::BlockCursor; + use rand::{seq::SliceRandom, thread_rng, RngCore}; use std::fs; use std::str::FromStr; @@ -281,19 +327,19 @@ mod tests { fn test_ephemeral_files() -> Result<(), Error> { let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?; - let mut file_a = EphemeralFile::create(conf, tenantid, timelineid)?; + let file_a = EphemeralFile::create(conf, tenantid, timelineid)?; - file_a.write_all(b"foo")?; + file_a.write_all_at(b"foo", 0)?; assert_eq!("foo", read_string(&file_a, 0, 20)?); - file_a.write_all(b"bar")?; + file_a.write_all_at(b"bar", 3)?; assert_eq!("foobar", read_string(&file_a, 0, 20)?); // Open a lot of files, enough to cause some page evictions. 
let mut efiles = Vec::new(); for fileno in 0..100 { - let mut efile = EphemeralFile::create(conf, tenantid, timelineid)?; - efile.write_all(format!("file {}", fileno).as_bytes())?; + let efile = EphemeralFile::create(conf, tenantid, timelineid)?; + efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?; assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?); efiles.push((fileno, efile)); } @@ -307,4 +353,41 @@ mod tests { Ok(()) } + + #[test] + fn test_ephemeral_blobs() -> Result<(), Error> { + let (conf, tenantid, timelineid) = repo_harness("ephemeral_blobs")?; + + let mut file = EphemeralFile::create(conf, tenantid, timelineid)?; + + let pos_foo = file.write_blob(b"foo")?; + assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); + let pos_bar = file.write_blob(b"bar")?; + assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); + assert_eq!(b"bar", file.block_cursor().read_blob(pos_bar)?.as_slice()); + + let mut blobs = Vec::new(); + for i in 0..10000 { + let data = Vec::from(format!("blob{}", i).as_bytes()); + let pos = file.write_blob(&data)?; + blobs.push((pos, data)); + } + + let mut cursor = BlockCursor::new(&file); + for (pos, expected) in blobs { + let actual = cursor.read_blob(pos)?; + assert_eq!(actual, expected); + } + drop(cursor); + + // Test a large blob that spans multiple pages + let mut large_data = Vec::new(); + large_data.resize(20000, 0); + thread_rng().fill_bytes(&mut large_data); + let pos_large = file.write_blob(&large_data)?; + let result = file.block_cursor().read_blob(pos_large)?; + assert_eq!(result, large_data); + + Ok(()) + } } diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 68d1cd4a8a..a8e5de09f5 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -13,63 +13,70 @@ //! //! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568 //! -//! An image file is constructed using the 'bookfile' crate. +//! Every image layer file consists of three parts: "summary", +//! "index", and "values". The summary is a fixed size header at the +//! beginning of the file, and it contains basic information about the +//! layer, and offsets to the other parts. The "index" is a serialized +//! HashMap, mapping from Key to an offset in the "values" part. The +//! actual page images are stored in the "values" part. //! -//! Only metadata is loaded into memory by the load function. +//! Only the "index" is loaded into memory by the load function. //! When images are needed, they are read directly from disk. //! 
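To make the layout concrete: the writer reserves block 0 for the summary, streams the value blobs starting at the next block, and finally places the serialized index at the next PAGE_SZ boundary, recording that block number as index_start_blk in the summary. A standalone sketch of the rounding used by the finish() methods below (PAGE_SZ assumed to be the 8192-byte page size from page_cache):

    const PAGE_SZ: u64 = 8192; // assumed; matches page_cache::PAGE_SZ

    /// Block where the index begins, given the byte position reached after
    /// writing the summary page and all value blobs.
    fn index_start_blk(write_pos: u64) -> u32 {
        // round up to the next whole page
        ((write_pos + PAGE_SZ - 1) / PAGE_SZ) as u32
    }

    fn main() {
        // e.g. one 8192-byte summary page plus ~20 kB of images ends at byte
        // 28_000, so the index starts at block 4 (byte offset 32_768)
        assert_eq!(index_start_blk(28_000), 4);
    }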
use crate::config::PageServerConf; +use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; +use crate::layered_repository::block_io::{BlockReader, FileBlockReader}; use crate::layered_repository::filename::{ImageFileName, PathOrConf}; use crate::layered_repository::storage_layer::{ BlobRef, Layer, ValueReconstructResult, ValueReconstructState, }; +use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; use crate::virtual_file::VirtualFile; -use crate::IMAGE_FILE_MAGIC; use crate::{ZTenantId, ZTimelineId}; +use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use log::*; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::fs; -use std::io::{BufWriter, Write}; +use std::io::Write; +use std::io::{Seek, SeekFrom}; use std::ops::Range; use std::path::{Path, PathBuf}; use std::sync::{RwLock, RwLockReadGuard, TryLockError}; -use bookfile::{Book, BookWriter, ChapterWriter}; - use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::Lsn; -/// Mapping from (key, lsn) -> page/WAL record -/// byte ranges in VALUES_CHAPTER -static INDEX_CHAPTER: u64 = 1; - -/// Contains each block in block # order -const VALUES_CHAPTER: u64 = 2; - -/// Contains the [`Summary`] struct -const SUMMARY_CHAPTER: u64 = 3; - #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] struct Summary { + /// Magic value to identify this as a zenith image file. Always IMAGE_FILE_MAGIC. + magic: u16, + format_version: u16, + tenantid: ZTenantId, timelineid: ZTimelineId, key_range: Range, - lsn: Lsn, + + /// Block number where the 'index' part of the file begins. + index_start_blk: u32, } impl From<&ImageLayer> for Summary { fn from(layer: &ImageLayer) -> Self { Self { + magic: IMAGE_FILE_MAGIC, + format_version: STORAGE_FORMAT_VERSION, tenantid: layer.tenantid, timelineid: layer.timelineid, key_range: layer.key_range.clone(), lsn: layer.lsn, + + index_start_blk: 0, } } } @@ -97,12 +104,14 @@ pub struct ImageLayerInner { /// If false, the 'index' has not been loaded into memory yet. loaded: bool, - /// The underlying (virtual) file handle. None if the layer hasn't been loaded - /// yet. - book: Option>, - /// offset of each value index: HashMap, + + // values copied from summary + index_start_blk: u32, + + /// Reader object for reading blocks from the file. 
(None if not loaded yet) + file: Option>, } impl Layer for ImageLayer { @@ -138,26 +147,21 @@ impl Layer for ImageLayer { assert!(lsn_range.end >= self.lsn); let inner = self.load()?; - if let Some(blob_ref) = inner.index.get(&key) { - let chapter = inner - .book + let buf = inner + .file .as_ref() .unwrap() - .chapter_reader(VALUES_CHAPTER)?; - - let mut blob = vec![0; blob_ref.size()]; - chapter - .read_exact_at(&mut blob, blob_ref.pos()) + .block_cursor() + .read_blob(blob_ref.pos()) .with_context(|| { format!( - "failed to read {} bytes from data file {} at offset {}", - blob_ref.size(), + "failed to read blob from data file {} at offset {}", self.filename().display(), blob_ref.pos() ) })?; - let value = Bytes::from(blob); + let value = Bytes::from(buf); reconstruct_state.img = Some((self.lsn, value)); Ok(ValueReconstructResult::Complete) @@ -228,12 +232,7 @@ impl Layer for ImageLayer { index_vec.sort_by_key(|x| x.1.pos()); for (key, blob_ref) in index_vec { - println!( - "key: {} size {} offset {}", - key, - blob_ref.size(), - blob_ref.pos() - ); + println!("key: {} offset {}", key, blob_ref.pos()); } Ok(()) @@ -291,21 +290,19 @@ impl ImageLayer { let path = self.path(); // Open the file if it's not open already. - if inner.book.is_none() { + if inner.file.is_none() { let file = VirtualFile::open(&path) .with_context(|| format!("Failed to open file '{}'", path.display()))?; - inner.book = Some(Book::new(file).with_context(|| { - format!("Failed to open file '{}' as a bookfile", path.display()) - })?); + inner.file = Some(FileBlockReader::new(file)); } - let book = inner.book.as_ref().unwrap(); + let file = inner.file.as_mut().unwrap(); + let summary_blk = file.read_blk(0)?; + let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; match &self.path_or_conf { PathOrConf::Conf(_) => { - let chapter = book.read_chapter(SUMMARY_CHAPTER)?; - let actual_summary = Summary::des(&chapter)?; - - let expected_summary = Summary::from(self); + let mut expected_summary = Summary::from(self); + expected_summary.index_start_blk = actual_summary.index_start_blk; if actual_summary != expected_summary { bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary); @@ -325,14 +322,18 @@ impl ImageLayer { } } - let chapter = book.read_chapter(INDEX_CHAPTER)?; - let index = HashMap::des(&chapter)?; + file.file.seek(SeekFrom::Start( + actual_summary.index_start_blk as u64 * PAGE_SZ as u64, + ))?; + let mut buf_reader = std::io::BufReader::new(&mut file.file); + let index = HashMap::des_from(&mut buf_reader)?; + + inner.index_start_blk = actual_summary.index_start_blk; info!("loaded from {}", &path.display()); inner.index = index; inner.loaded = true; - Ok(()) } @@ -350,9 +351,10 @@ impl ImageLayer { key_range: filename.key_range.clone(), lsn: filename.lsn, inner: RwLock::new(ImageLayerInner { - book: None, index: HashMap::new(), loaded: false, + file: None, + index_start_blk: 0, }), } } @@ -360,12 +362,14 @@ impl ImageLayer { /// Create an ImageLayer struct representing an existing file on disk. /// /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary. 
- pub fn new_for_path(path: &Path, book: &Book) -> Result + pub fn new_for_path(path: &Path, file: F) -> Result where F: std::os::unix::prelude::FileExt, { - let chapter = book.read_chapter(SUMMARY_CHAPTER)?; - let summary = Summary::des(&chapter)?; + let mut summary_buf = Vec::new(); + summary_buf.resize(PAGE_SZ, 0); + file.read_exact_at(&mut summary_buf, 0)?; + let summary = Summary::des_prefix(&summary_buf)?; Ok(ImageLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), @@ -374,9 +378,10 @@ impl ImageLayer { key_range: summary.key_range, lsn: summary.lsn, inner: RwLock::new(ImageLayerInner { - book: None, + file: None, index: HashMap::new(), loaded: false, + index_start_blk: 0, }), }) } @@ -412,18 +417,15 @@ impl ImageLayer { /// pub struct ImageLayerWriter { conf: &'static PageServerConf, - path: PathBuf, + _path: PathBuf, timelineid: ZTimelineId, tenantid: ZTenantId, key_range: Range, lsn: Lsn, - values_writer: Option>>, - end_offset: u64, - index: HashMap, - finished: bool, + blob_writer: WriteBlobWriter, } impl ImageLayerWriter { @@ -449,24 +451,17 @@ impl ImageLayerWriter { ); info!("new image layer {}", path.display()); let file = VirtualFile::create(&path)?; - let buf_writer = BufWriter::new(file); - let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?; - - // Open the page-images chapter for writing. The calls to - // `put_image` will use this to write the contents. - let chapter = book.new_chapter(VALUES_CHAPTER); + let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64); let writer = ImageLayerWriter { conf, - path, + _path: path, timelineid, tenantid, key_range: key_range.clone(), lsn, - values_writer: Some(chapter), index: HashMap::new(), - end_offset: 0, - finished: false, + blob_writer, }; Ok(writer) @@ -479,49 +474,41 @@ impl ImageLayerWriter { /// pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> { ensure!(self.key_range.contains(&key)); - let off = self.end_offset; + let off = self.blob_writer.write_blob(img)?; - if let Some(writer) = &mut self.values_writer { - let len = img.len(); - writer.write_all(img)?; - self.end_offset += len as u64; - - let old = self.index.insert(key, BlobRef::new(off, len, true)); - assert!(old.is_none()); - } else { - panic!() - } + let old = self.index.insert(key, BlobRef::new(off, true)); + assert!(old.is_none()); Ok(()) } - pub fn finish(&mut self) -> anyhow::Result { - // Close the values chapter - let book = self.values_writer.take().unwrap().close()?; + pub fn finish(self) -> anyhow::Result { + let index_start_blk = + ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + + let mut file = self.blob_writer.into_inner(); // Write out the index - let mut chapter = book.new_chapter(INDEX_CHAPTER); let buf = HashMap::ser(&self.index)?; - chapter.write_all(&buf)?; - let book = chapter.close()?; + file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?; + file.write_all(&buf)?; - // Write out the summary chapter - let mut chapter = book.new_chapter(SUMMARY_CHAPTER); + // Fill in the summary on blk 0 let summary = Summary { + magic: IMAGE_FILE_MAGIC, + format_version: STORAGE_FORMAT_VERSION, tenantid: self.tenantid, timelineid: self.timelineid, key_range: self.key_range.clone(), lsn: self.lsn, + index_start_blk, }; - Summary::ser_into(&summary, &mut chapter)?; - let book = chapter.close()?; - - // This flushes the underlying 'buf_writer'. 
- book.close()?; + file.seek(SeekFrom::Start(0))?; + Summary::ser_into(&summary, &mut file)?; // Note: Because we open the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't - // set inner.book here. The first read will have to re-open it. + // set inner.file here. The first read will have to re-open it. let layer = ImageLayer { path_or_conf: PathOrConf::Conf(self.conf), timelineid: self.timelineid, @@ -529,28 +516,14 @@ impl ImageLayerWriter { key_range: self.key_range.clone(), lsn: self.lsn, inner: RwLock::new(ImageLayerInner { - book: None, loaded: false, index: HashMap::new(), + file: None, + index_start_blk, }), }; trace!("created image layer {}", layer.path().display()); - self.finished = true; - Ok(layer) } } - -impl Drop for ImageLayerWriter { - fn drop(&mut self) { - if let Some(page_image_writer) = self.values_writer.take() { - if let Ok(book) = page_image_writer.close() { - let _ = book.close(); - } - } - if !self.finished { - let _ = fs::remove_file(&self.path); - } - } -} diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 8670442a2c..8a24528732 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -5,10 +5,12 @@ //! its position in the file, is kept in memory, though. //! use crate::config::PageServerConf; +use crate::layered_repository::blob_io::{BlobCursor, BlobWriter}; +use crate::layered_repository::block_io::BlockReader; use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter}; use crate::layered_repository::ephemeral_file::EphemeralFile; use crate::layered_repository::storage_layer::{ - BlobRef, Layer, ValueReconstructResult, ValueReconstructState, + Layer, ValueReconstructResult, ValueReconstructState, }; use crate::repository::{Key, Value}; use crate::walrecord; @@ -19,9 +21,7 @@ use std::collections::HashMap; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods use std::fmt::Write as _; -use std::io::Write; use std::ops::Range; -use std::os::unix::fs::FileExt; use std::path::PathBuf; use std::sync::RwLock; use zenith_utils::bin_ser::BeSer; @@ -54,14 +54,12 @@ pub struct InMemoryLayerInner { /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. /// - index: HashMap>, + index: HashMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. /// PerSeg::page_versions map stores offsets into this file. file: EphemeralFile, - - end_offset: u64, } impl InMemoryLayerInner { @@ -120,10 +118,12 @@ impl Layer for InMemoryLayer { let inner = self.inner.read().unwrap(); + let mut reader = inner.file.block_cursor(); + // Scan the page versions backwards, starting from `lsn`. 
if let Some(vec_map) = inner.index.get(&key) { let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, blob_ref) in slice.iter().rev() { + for (entry_lsn, pos) in slice.iter().rev() { match &reconstruct_state.img { Some((cached_lsn, _)) if entry_lsn <= cached_lsn => { return Ok(ValueReconstructResult::Complete) @@ -131,8 +131,7 @@ impl Layer for InMemoryLayer { _ => {} } - let mut buf = vec![0u8; blob_ref.size()]; - inner.file.read_exact_at(&mut buf, blob_ref.pos())?; + let buf = reader.read_blob(*pos)?; let value = Value::des(&buf)?; match value { Value::Image(img) => { @@ -208,12 +207,12 @@ impl Layer for InMemoryLayer { return Ok(()); } + let mut cursor = inner.file.block_cursor(); let mut buf = Vec::new(); for (key, vec_map) in inner.index.iter() { - for (lsn, blob_ref) in vec_map.as_slice() { + for (lsn, pos) in vec_map.as_slice() { let mut desc = String::new(); - buf.resize(blob_ref.size(), 0); - inner.file.read_exact_at(&mut buf, blob_ref.pos())?; + cursor.read_blob_into_buf(*pos, &mut buf)?; let val = Value::des(&buf); match val { Ok(Value::Image(img)) => { @@ -268,7 +267,6 @@ impl InMemoryLayer { end_lsn: None, index: HashMap::new(), file, - end_offset: 0, }), }) } @@ -283,15 +281,10 @@ impl InMemoryLayer { inner.assert_writeable(); - let off = inner.end_offset; - let buf = Value::ser(&val)?; - let len = buf.len(); - inner.file.write_all(&buf)?; - inner.end_offset += len as u64; + let off = inner.file.write_blob(&Value::ser(&val)?)?; let vec_map = inner.index.entry(key).or_default(); - let blob_ref = BlobRef::new(off, len, val.will_init()); - let old = vec_map.append_or_update_last(lsn, blob_ref).unwrap().0; + let old = vec_map.append_or_update_last(lsn, off).unwrap().0; if old.is_some() { // We already had an entry for this LSN. That's odd.. warn!("Key {} at {} already exists", key, lsn); @@ -345,21 +338,21 @@ impl InMemoryLayer { self.start_lsn..inner.end_lsn.unwrap(), )?; - let mut do_steps = || -> Result<()> { - for (key, vec_map) in inner.index.iter() { - // Write all page versions - for (lsn, blob_ref) in vec_map.as_slice() { - let mut buf = vec![0u8; blob_ref.size()]; - inner.file.read_exact_at(&mut buf, blob_ref.pos())?; - let val = Value::des(&buf)?; - delta_layer_writer.put_value(*key, *lsn, val)?; - } + let mut buf = Vec::new(); + + let mut cursor = inner.file.block_cursor(); + + let mut keys: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); + keys.sort_by_key(|k| k.0); + + for (key, vec_map) in keys.iter() { + let key = **key; + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + cursor.read_blob_into_buf(*pos, &mut buf)?; + let val = Value::des(&buf)?; + delta_layer_writer.put_value(key, *lsn, val)?; } - Ok(()) - }; - if let Err(err) = do_steps() { - delta_layer_writer.abort(); - return Err(err); } let delta_layer = delta_layer_writer.finish(Key::MAX)?; diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index 2711640736..b5366da223 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -150,9 +150,10 @@ pub trait Layer: Send + Sync { const WILL_INIT: u64 = 1; /// -/// Struct representing reference to BLOB in layers. Reference contains BLOB offset and size. -/// For WAL records (delta layer) it also contains `will_init` flag which helps to determine range of records -/// which needs to be applied without reading/deserializing records themselves. +/// Struct representing reference to BLOB in layers. 
Reference contains BLOB +/// offset, and for WAL records it also contains `will_init` flag. The flag +/// helps to determine the range of records that needs to be applied, without +/// reading/deserializing records themselves. /// #[derive(Debug, Serialize, Deserialize, Copy, Clone)] pub struct BlobRef(u64); @@ -163,15 +164,11 @@ impl BlobRef { } pub fn pos(&self) -> u64 { - self.0 >> 32 + self.0 >> 1 } - pub fn size(&self) -> usize { - ((self.0 & 0xFFFFFFFF) >> 1) as usize - } - - pub fn new(pos: u64, size: usize, will_init: bool) -> BlobRef { - let mut blob_ref = (pos << 32) | ((size as u64) << 1); + pub fn new(pos: u64, will_init: bool) -> BlobRef { + let mut blob_ref = pos << 1; if will_init { blob_ref |= WILL_INIT; } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 4790ab6652..6d2631b2b1 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -38,11 +38,11 @@ use pgdatadir_mapping::DatadirTimeline; /// This is embedded in the metadata file, and also in the header of all the /// layer files. If you make any backwards-incompatible changes to the storage /// format, bump this! -pub const STORAGE_FORMAT_VERSION: u16 = 1; +pub const STORAGE_FORMAT_VERSION: u16 = 2; // Magic constants used to identify different kinds of files -pub const IMAGE_FILE_MAGIC: u32 = 0x5A60_0000 | STORAGE_FORMAT_VERSION as u32; -pub const DELTA_FILE_MAGIC: u32 = 0x5A61_0000 | STORAGE_FORMAT_VERSION as u32; +pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; +pub const DELTA_FILE_MAGIC: u16 = 0x5A61; lazy_static! { static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!( diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index c485e46f47..bd44384a44 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -56,7 +56,7 @@ use crate::layered_repository::writeback_ephemeral_file; use crate::repository::Key; static PAGE_CACHE: OnceCell = OnceCell::new(); -const TEST_PAGE_CACHE_SIZE: usize = 10; +const TEST_PAGE_CACHE_SIZE: usize = 50; /// /// Initialize the page cache. This must be called once at page server startup. @@ -90,6 +90,7 @@ const MAX_USAGE_COUNT: u8 = 5; /// CacheKey uniquely identifies a "thing" to cache in the page cache. /// #[derive(Debug, PartialEq, Eq, Clone)] +#[allow(clippy::enum_variant_names)] enum CacheKey { MaterializedPage { hash_key: MaterializedPageHashKey, @@ -99,6 +100,10 @@ enum CacheKey { file_id: u64, blkno: u32, }, + ImmutableFilePage { + file_id: u64, + blkno: u32, + }, } #[derive(Debug, PartialEq, Eq, Hash, Clone)] @@ -173,6 +178,8 @@ pub struct PageCache { ephemeral_page_map: RwLock>, + immutable_page_map: RwLock>, + /// The actual buffers with their metadata. slots: Box<[Slot]>, @@ -195,6 +202,12 @@ impl std::ops::Deref for PageReadGuard<'_> { } } +impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> { + fn as_ref(&self) -> &[u8; PAGE_SZ] { + self.0.buf + } +} + /// /// PageWriteGuard is a lease on a buffer for modifying it. The page is kept locked /// until the guard is dropped. @@ -226,6 +239,12 @@ impl std::ops::Deref for PageWriteGuard<'_> { } } +impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> { + fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] { + self.inner.buf + } +} + impl PageWriteGuard<'_> { /// Mark that the buffer contents are now valid. pub fn mark_valid(&mut self) { @@ -381,6 +400,36 @@ impl PageCache { } } + // Section 1.3: Public interface functions for working with immutable file pages. 
+ + pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult { + let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno }; + + self.lock_for_read(&mut cache_key) + } + + /// Immediately drop all buffers belonging to given file, without writeback + pub fn drop_buffers_for_immutable(&self, drop_file_id: u64) { + for slot_idx in 0..self.slots.len() { + let slot = &self.slots[slot_idx]; + + let mut inner = slot.inner.write().unwrap(); + if let Some(key) = &inner.key { + match key { + CacheKey::ImmutableFilePage { file_id, blkno: _ } + if *file_id == drop_file_id => + { + // remove mapping for old buffer + self.remove_mapping(key); + inner.key = None; + inner.dirty = false; + } + _ => {} + } + } + } + } + // // Section 2: Internal interface functions for lookup/update. // @@ -578,6 +627,10 @@ impl PageCache { let map = self.ephemeral_page_map.read().unwrap(); Some(*map.get(&(*file_id, *blkno))?) } + CacheKey::ImmutableFilePage { file_id, blkno } => { + let map = self.immutable_page_map.read().unwrap(); + Some(*map.get(&(*file_id, *blkno))?) + } } } @@ -601,6 +654,10 @@ impl PageCache { let map = self.ephemeral_page_map.read().unwrap(); Some(*map.get(&(*file_id, *blkno))?) } + CacheKey::ImmutableFilePage { file_id, blkno } => { + let map = self.immutable_page_map.read().unwrap(); + Some(*map.get(&(*file_id, *blkno))?) + } } } @@ -632,6 +689,11 @@ impl PageCache { map.remove(&(*file_id, *blkno)) .expect("could not find old key in mapping"); } + CacheKey::ImmutableFilePage { file_id, blkno } => { + let mut map = self.immutable_page_map.write().unwrap(); + map.remove(&(*file_id, *blkno)) + .expect("could not find old key in mapping"); + } } } @@ -672,6 +734,16 @@ impl PageCache { } } } + CacheKey::ImmutableFilePage { file_id, blkno } => { + let mut map = self.immutable_page_map.write().unwrap(); + match map.entry((*file_id, *blkno)) { + Entry::Occupied(entry) => Some(*entry.get()), + Entry::Vacant(entry) => { + entry.insert(slot_idx); + None + } + } + } } } @@ -749,6 +821,13 @@ impl PageCache { CacheKey::EphemeralPage { file_id, blkno } => { writeback_ephemeral_file(*file_id, *blkno, buf) } + CacheKey::ImmutableFilePage { + file_id: _, + blkno: _, + } => Err(std::io::Error::new( + std::io::ErrorKind::Other, + "unexpected dirty immutable page", + )), } } @@ -779,6 +858,7 @@ impl PageCache { Self { materialized_page_map: Default::default(), ephemeral_page_map: Default::default(), + immutable_page_map: Default::default(), slots, next_evict_slot: AtomicUsize::new(0), } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 858cff29cb..64f9db2338 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -65,6 +65,7 @@ lazy_static! { /// currently open, the 'handle' can still point to the slot where it was last kept. The /// 'tag' field is used to detect whether the handle still is valid or not. /// +#[derive(Debug)] pub struct VirtualFile { /// Lazy handle to the global file descriptor cache. 
The slot that this points to /// might contain our File, or it may be empty, or it may contain a File that @@ -88,7 +89,7 @@ pub struct VirtualFile { timelineid: String, } -#[derive(PartialEq, Clone, Copy)] +#[derive(Debug, PartialEq, Clone, Copy)] struct SlotHandle { /// Index into OPEN_FILES.slots index: usize, From c4b57e4b8fb55360bdb77cc9165be8fc31b0b469 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 7 Apr 2022 20:50:12 +0300 Subject: [PATCH 0115/1022] Move BlobRef It's not needed in image layers anymore, so move it into delta_layer.rs --- pageserver/src/layered_repository/blob_io.rs | 17 ++++++++++ pageserver/src/layered_repository/block_io.rs | 2 -- .../src/layered_repository/delta_layer.rs | 32 ++++++++++++++++++- .../src/layered_repository/image_layer.rs | 21 ++++++------ .../src/layered_repository/storage_layer.rs | 31 ------------------ 5 files changed, 57 insertions(+), 46 deletions(-) diff --git a/pageserver/src/layered_repository/blob_io.rs b/pageserver/src/layered_repository/blob_io.rs index 10bfea934d..aa90bbd0cf 100644 --- a/pageserver/src/layered_repository/blob_io.rs +++ b/pageserver/src/layered_repository/blob_io.rs @@ -10,12 +10,15 @@ use std::io::Error; /// For reading pub trait BlobCursor { + /// Read a blob into a new buffer. fn read_blob(&mut self, offset: u64) -> Result, std::io::Error> { let mut buf = Vec::new(); self.read_blob_into_buf(offset, &mut buf)?; Ok(buf) } + /// Read blob into the given buffer. Any previous contents in the buffer + /// are overwritten. fn read_blob_into_buf( &mut self, offset: u64, @@ -75,10 +78,19 @@ where } } +/// +/// Abstract trait for a data sink that you can write blobs to. +/// pub trait BlobWriter { + /// Write a blob of data. Returns the offset that it was written to, + /// which can be used to retrieve the data later. fn write_blob(&mut self, srcbuf: &[u8]) -> Result; } +/// +/// An implementation of BlobWriter to write blobs to anything that +/// implements std::io::Write. +/// pub struct WriteBlobWriter where W: std::io::Write, @@ -102,6 +114,11 @@ where self.offset } + /// Access the underlying Write object. + /// + /// NOTE: WriteBlobWriter keeps track of the current write offset. If + /// you write something directly to the inner Write object, it makes the + /// internally tracked 'offset' to go out of sync. So don't do that. pub fn into_inner(self) -> W { self.inner } diff --git a/pageserver/src/layered_repository/block_io.rs b/pageserver/src/layered_repository/block_io.rs index 2b8e31e1ee..a8992a6cb5 100644 --- a/pageserver/src/layered_repository/block_io.rs +++ b/pageserver/src/layered_repository/block_io.rs @@ -1,8 +1,6 @@ //! //! Low-level Block-oriented I/O functions //! -//! -//! 
use crate::page_cache; use crate::page_cache::{ReadBufResult, PAGE_SZ}; diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index f8828b541f..43122fd99d 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -35,7 +35,7 @@ use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter use crate::layered_repository::block_io::{BlockCursor, BlockReader, FileBlockReader}; use crate::layered_repository::filename::{DeltaFileName, PathOrConf}; use crate::layered_repository::storage_layer::{ - BlobRef, Layer, ValueReconstructResult, ValueReconstructState, + Layer, ValueReconstructResult, ValueReconstructState, }; use crate::page_cache::{PageReadGuard, PAGE_SZ}; use crate::repository::{Key, Value}; @@ -93,6 +93,36 @@ impl From<&DeltaLayer> for Summary { } } +// Flag indicating that this version initialize the page +const WILL_INIT: u64 = 1; + +/// +/// Struct representing reference to BLOB in layers. Reference contains BLOB +/// offset, and for WAL records it also contains `will_init` flag. The flag +/// helps to determine the range of records that needs to be applied, without +/// reading/deserializing records themselves. +/// +#[derive(Debug, Serialize, Deserialize, Copy, Clone)] +struct BlobRef(u64); + +impl BlobRef { + pub fn will_init(&self) -> bool { + (self.0 & WILL_INIT) != 0 + } + + pub fn pos(&self) -> u64 { + self.0 >> 1 + } + + pub fn new(pos: u64, will_init: bool) -> BlobRef { + let mut blob_ref = pos << 1; + if will_init { + blob_ref |= WILL_INIT; + } + BlobRef(blob_ref) + } +} + /// /// DeltaLayer is the in-memory data structure associated with an /// on-disk delta file. We keep a DeltaLayer in memory for each diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index a8e5de09f5..d0afce1549 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -28,7 +28,7 @@ use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter use crate::layered_repository::block_io::{BlockReader, FileBlockReader}; use crate::layered_repository::filename::{ImageFileName, PathOrConf}; use crate::layered_repository::storage_layer::{ - BlobRef, Layer, ValueReconstructResult, ValueReconstructState, + Layer, ValueReconstructResult, ValueReconstructState, }; use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; @@ -105,7 +105,7 @@ pub struct ImageLayerInner { loaded: bool, /// offset of each value - index: HashMap, + index: HashMap, // values copied from summary index_start_blk: u32, @@ -147,18 +147,18 @@ impl Layer for ImageLayer { assert!(lsn_range.end >= self.lsn); let inner = self.load()?; - if let Some(blob_ref) = inner.index.get(&key) { + if let Some(&offset) = inner.index.get(&key) { let buf = inner .file .as_ref() .unwrap() .block_cursor() - .read_blob(blob_ref.pos()) + .read_blob(offset) .with_context(|| { format!( "failed to read blob from data file {} at offset {}", self.filename().display(), - blob_ref.pos() + offset ) })?; let value = Bytes::from(buf); @@ -228,11 +228,8 @@ impl Layer for ImageLayer { let inner = self.load()?; - let mut index_vec: Vec<(&Key, &BlobRef)> = inner.index.iter().collect(); - index_vec.sort_by_key(|x| x.1.pos()); - - for (key, blob_ref) in index_vec { - println!("key: {} offset {}", key, blob_ref.pos()); + for (key, offset) in inner.index.iter() { + println!("key: {} offset 
{}", key, offset); } Ok(()) @@ -423,7 +420,7 @@ pub struct ImageLayerWriter { key_range: Range, lsn: Lsn, - index: HashMap, + index: HashMap, blob_writer: WriteBlobWriter, } @@ -476,7 +473,7 @@ impl ImageLayerWriter { ensure!(self.key_range.contains(&key)); let off = self.blob_writer.write_blob(img)?; - let old = self.index.insert(key, BlobRef::new(off, true)); + let old = self.index.insert(key, off); assert!(old.is_none()); Ok(()) diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index b5366da223..5ad43182f6 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -7,7 +7,6 @@ use crate::walrecord::ZenithWalRecord; use crate::{ZTenantId, ZTimelineId}; use anyhow::Result; use bytes::Bytes; -use serde::{Deserialize, Serialize}; use std::ops::Range; use std::path::PathBuf; @@ -145,33 +144,3 @@ pub trait Layer: Send + Sync { /// Dump summary of the contents of the layer to stdout fn dump(&self, verbose: bool) -> Result<()>; } - -// Flag indicating that this version initialize the page -const WILL_INIT: u64 = 1; - -/// -/// Struct representing reference to BLOB in layers. Reference contains BLOB -/// offset, and for WAL records it also contains `will_init` flag. The flag -/// helps to determine the range of records that needs to be applied, without -/// reading/deserializing records themselves. -/// -#[derive(Debug, Serialize, Deserialize, Copy, Clone)] -pub struct BlobRef(u64); - -impl BlobRef { - pub fn will_init(&self) -> bool { - (self.0 & WILL_INIT) != 0 - } - - pub fn pos(&self) -> u64 { - self.0 >> 1 - } - - pub fn new(pos: u64, will_init: bool) -> BlobRef { - let mut blob_ref = pos << 1; - if will_init { - blob_ref |= WILL_INIT; - } - BlobRef(blob_ref) - } -} From 214567bf8fafed56cd867698d9e54fafc7001b45 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 7 Apr 2022 20:50:16 +0300 Subject: [PATCH 0116/1022] Use B-tree for the index in image and delta layers. We now use a page cache for those, instead of slurping the whole index into memory. Fixes https://github.com/zenithdb/zenith/issues/1356 This is a backwards-incompatible change to the storage format, so bump STORAGE_FORMAT_VERSION. 
--- Cargo.lock | 1 + pageserver/Cargo.toml | 1 + pageserver/src/layered_repository.rs | 10 +- pageserver/src/layered_repository/block_io.rs | 45 + .../src/layered_repository/delta_layer.rs | 290 ++- .../src/layered_repository/disk_btree.rs | 979 ++++++++ .../disk_btree_test_data.rs | 2013 +++++++++++++++++ .../src/layered_repository/image_layer.rs | 144 +- .../src/layered_repository/inmemory_layer.rs | 7 - .../src/layered_repository/storage_layer.rs | 4 - pageserver/src/lib.rs | 2 +- pageserver/src/repository.rs | 16 +- 12 files changed, 3287 insertions(+), 225 deletions(-) create mode 100644 pageserver/src/layered_repository/disk_btree.rs create mode 100644 pageserver/src/layered_repository/disk_btree_test_data.rs diff --git a/Cargo.lock b/Cargo.lock index e0b6288f63..19ccd18a10 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1499,6 +1499,7 @@ dependencies = [ "daemonize", "fail", "futures", + "hex", "hex-literal", "humantime", "hyper", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index a5283cb331..4d79811bfb 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -10,6 +10,7 @@ regex = "1.4.5" bytes = { version = "1.0.1", features = ['serde'] } byteorder = "1.4.3" futures = "0.3.13" +hex = "0.4.3" hyper = "0.14" itertools = "0.10.3" lazy_static = "1.4.0" diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 5adf4a89ff..d7a250f31e 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -58,6 +58,7 @@ use zenith_utils::seqwait::SeqWait; mod blob_io; pub mod block_io; mod delta_layer; +mod disk_btree; pub(crate) mod ephemeral_file; mod filename; mod image_layer; @@ -1602,15 +1603,6 @@ impl LayeredTimeline { debug!("Could not compact because no partitioning specified yet"); } - // Call unload() on all frozen layers, to release memory. - // This shouldn't be much memory, as only metadata is slurped - // into memory. - let layers = self.layers.lock().unwrap(); - for layer in layers.iter_historic_layers() { - layer.unload()?; - } - drop(layers); - Ok(()) } diff --git a/pageserver/src/layered_repository/block_io.rs b/pageserver/src/layered_repository/block_io.rs index a8992a6cb5..2eba0aa403 100644 --- a/pageserver/src/layered_repository/block_io.rs +++ b/pageserver/src/layered_repository/block_io.rs @@ -4,6 +4,7 @@ use crate::page_cache; use crate::page_cache::{ReadBufResult, PAGE_SZ}; +use bytes::Bytes; use lazy_static::lazy_static; use std::ops::{Deref, DerefMut}; use std::os::unix::fs::FileExt; @@ -172,3 +173,47 @@ where } } } + +/// +/// Trait for block-oriented output +/// +pub trait BlockWriter { + /// + /// Write a page to the underlying storage. + /// + /// 'buf' must be of size PAGE_SZ. Returns the block number the page was + /// written to. + /// + fn write_blk(&mut self, buf: Bytes) -> Result; +} + +/// +/// A simple in-memory buffer of blocks. 
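The BlockWriter contract that BlockBuf implements below is strict about sizing: each call takes exactly one PAGE_SZ page and returns the block number it was stored at. A short usage sketch (illustrative, not from the patch):

    let mut buf = BlockBuf::new();
    let blknum = buf.write_blk(Bytes::from(vec![0u8; PAGE_SZ])).unwrap(); // must be exactly PAGE_SZ bytes
    assert_eq!((blknum, buf.size()), (0, PAGE_SZ as u64));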
+/// +pub struct BlockBuf { + pub blocks: Vec, +} +impl BlockWriter for BlockBuf { + fn write_blk(&mut self, buf: Bytes) -> Result { + assert!(buf.len() == PAGE_SZ); + let blknum = self.blocks.len(); + self.blocks.push(buf); + tracing::info!("buffered block {}", blknum); + Ok(blknum as u32) + } +} + +impl BlockBuf { + pub fn new() -> Self { + BlockBuf { blocks: Vec::new() } + } + + pub fn size(&self) -> u64 { + (self.blocks.len() * PAGE_SZ) as u64 + } +} +impl Default for BlockBuf { + fn default() -> Self { + Self::new() + } +} diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 43122fd99d..dd6b5d3afa 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -7,14 +7,8 @@ //! must be page images or WAL records with the 'will_init' flag set, so that //! they can be replayed without referring to an older page version. //! -//! When a delta file needs to be accessed, we slurp the 'index' metadata -//! into memory, into the DeltaLayerInner struct. See load() and unload() functions. -//! To access a particular value, we search `index` for the given key. -//! The byte offset in the index can be used to find the value in -//! VALUES_CHAPTER. -//! -//! On disk, the delta files are stored in timelines/ directory. -//! Currently, there are no subdirectories, and each delta file is named like this: +//! The delta files are stored in timelines/ directory. Currently, +//! there are no subdirectories, and each delta file is named like this: //! //! -__- for Summary { @@ -89,6 +89,7 @@ impl From<&DeltaLayer> for Summary { lsn_range: layer.lsn_range.clone(), index_start_blk: 0, + index_root_blk: 0, } } } @@ -123,6 +124,46 @@ impl BlobRef { } } +const DELTA_KEY_SIZE: usize = KEY_SIZE + 8; +struct DeltaKey([u8; DELTA_KEY_SIZE]); + +/// +/// This is the key of the B-tree index stored in the delta layer. It consists +/// of the serialized representation of a Key and LSN. +/// +impl DeltaKey { + fn from_slice(buf: &[u8]) -> Self { + let mut bytes: [u8; DELTA_KEY_SIZE] = [0u8; DELTA_KEY_SIZE]; + bytes.copy_from_slice(buf); + DeltaKey(bytes) + } + + fn from_key_lsn(key: &Key, lsn: Lsn) -> Self { + let mut bytes: [u8; DELTA_KEY_SIZE] = [0u8; DELTA_KEY_SIZE]; + key.write_to_byte_slice(&mut bytes[0..KEY_SIZE]); + bytes[KEY_SIZE..].copy_from_slice(&u64::to_be_bytes(lsn.0)); + DeltaKey(bytes) + } + + fn key(&self) -> Key { + Key::from_slice(&self.0) + } + + fn lsn(&self) -> Lsn { + Lsn(u64::from_be_bytes(self.0[KEY_SIZE..].try_into().unwrap())) + } + + fn extract_key_from_buf(buf: &[u8]) -> Key { + Key::from_slice(&buf[..KEY_SIZE]) + } + + fn extract_lsn_from_buf(buf: &[u8]) -> Lsn { + let mut lsn_buf = [0u8; 8]; + lsn_buf.copy_from_slice(&buf[KEY_SIZE..]); + Lsn(u64::from_be_bytes(lsn_buf)) + } +} + /// /// DeltaLayer is the in-memory data structure associated with an /// on-disk delta file. We keep a DeltaLayer in memory for each @@ -143,18 +184,12 @@ pub struct DeltaLayer { } pub struct DeltaLayerInner { - /// If false, the 'index' has not been loaded into memory yet. + /// If false, the fields below have not been loaded into memory yet. loaded: bool, - /// - /// All versions of all pages in the layer are kept here. - /// Indexed by block number and LSN. The value is an offset into the - /// chapter where the page version is stored. - /// - index: HashMap>, - // values copied from summary index_start_blk: u32, + index_root_blk: u32, /// Reader object for reading blocks from the file. 
(None if not loaded yet) file: Option>, @@ -196,27 +231,46 @@ impl Layer for DeltaLayer { let inner = self.load()?; // Scan the page versions backwards, starting from `lsn`. - if let Some(vec_map) = inner.index.get(&key) { - let mut reader = inner.file.as_ref().unwrap().block_cursor(); - let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, blob_ref) in slice.iter().rev() { - let buf = reader.read_blob(blob_ref.pos())?; - let val = Value::des(&buf)?; - match val { - Value::Image(img) => { - reconstruct_state.img = Some((*entry_lsn, img)); + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + file, + ); + let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); + + let mut offsets: Vec<(Lsn, u64)> = Vec::new(); + + tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| { + let blob_ref = BlobRef(value); + if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] { + return false; + } + let entry_lsn = DeltaKey::extract_lsn_from_buf(key); + offsets.push((entry_lsn, blob_ref.pos())); + + !blob_ref.will_init() + })?; + + // Ok, 'offsets' now contains the offsets of all the entries we need to read + let mut cursor = file.block_cursor(); + for (entry_lsn, pos) in offsets { + let buf = cursor.read_blob(pos)?; + let val = Value::des(&buf)?; + match val { + Value::Image(img) => { + reconstruct_state.img = Some((entry_lsn, img)); + need_image = false; + break; + } + Value::WalRecord(rec) => { + let will_init = rec.will_init(); + reconstruct_state.records.push((entry_lsn, rec)); + if will_init { + // This WAL record initializes the page, so no need to go further back need_image = false; break; } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((*entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } } } } @@ -241,36 +295,6 @@ impl Layer for DeltaLayer { } } - /// - /// Release most of the memory used by this layer. If it's accessed again later, - /// it will need to be loaded back. - /// - fn unload(&self) -> Result<()> { - // FIXME: In debug mode, loading and unloading the index slows - // things down so much that you get timeout errors. At least - // with the test_parallel_copy test. So as an even more ad hoc - // stopgap fix for that, only unload every on average 10 - // checkpoint cycles. - use rand::RngCore; - if rand::thread_rng().next_u32() > (u32::MAX / 10) { - return Ok(()); - } - - let mut inner = match self.inner.try_write() { - Ok(inner) => inner, - Err(TryLockError::WouldBlock) => return Ok(()), - Err(TryLockError::Poisoned(_)) => panic!("DeltaLayer lock was poisoned"), - }; - inner.index = HashMap::default(); - inner.loaded = false; - - // Note: we keep the Book open. Is that a good idea? The virtual file - // machinery has its own rules for closing the file descriptor if it's not - // needed, but the Book struct uses up some memory, too. 
- - Ok(()) - } - fn delete(&self) -> Result<()> { // delete underlying file fs::remove_file(self.path())?; @@ -303,21 +327,36 @@ impl Layer for DeltaLayer { let inner = self.load()?; - let mut values: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); - values.sort_by_key(|k| k.0); + println!( + "index_start_blk: {}, root {}", + inner.index_start_blk, inner.index_root_blk + ); - let mut reader = inner.file.as_ref().unwrap().block_cursor(); + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + file, + ); + + tree_reader.dump()?; + + let mut cursor = file.block_cursor(); + tree_reader.visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |delta_key, val| { + let blob_ref = BlobRef(val); + let key = DeltaKey::extract_key_from_buf(delta_key); + let lsn = DeltaKey::extract_lsn_from_buf(delta_key); - for (key, versions) in values { - for (lsn, blob_ref) in versions.as_slice() { let mut desc = String::new(); - match reader.read_blob(blob_ref.pos()) { + match cursor.read_blob(blob_ref.pos()) { Ok(buf) => { let val = Value::des(&buf); - match val { Ok(Value::Image(img)) => { - write!(&mut desc, " img {} bytes", img.len())?; + write!(&mut desc, " img {} bytes", img.len()).unwrap(); } Ok(Value::WalRecord(rec)) => { let wal_desc = walrecord::describe_wal_record(&rec); @@ -327,20 +366,22 @@ impl Layer for DeltaLayer { buf.len(), rec.will_init(), wal_desc - )?; + ) + .unwrap(); } Err(err) => { - write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?; + write!(&mut desc, " DESERIALIZATION ERROR: {}", err).unwrap(); } } } Err(err) => { - write!(&mut desc, " READ ERROR: {}", err)?; + write!(&mut desc, " READ ERROR: {}", err).unwrap(); } } println!(" key {} at {}: {}", key, lsn, desc); - } - } + true + }, + )?; Ok(()) } @@ -409,6 +450,7 @@ impl DeltaLayer { PathOrConf::Conf(_) => { let mut expected_summary = Summary::from(self); expected_summary.index_start_blk = actual_summary.index_start_blk; + expected_summary.index_root_blk = actual_summary.index_root_blk; if actual_summary != expected_summary { bail!("in-file summary does not match expected summary. 
actual = {:?} expected = {:?}", actual_summary, expected_summary); } @@ -427,17 +469,11 @@ impl DeltaLayer { } } - file.file.seek(SeekFrom::Start( - actual_summary.index_start_blk as u64 * PAGE_SZ as u64, - ))?; - let mut buf_reader = std::io::BufReader::new(&mut file.file); - let index = HashMap::des_from(&mut buf_reader)?; - inner.index_start_blk = actual_summary.index_start_blk; + inner.index_root_blk = actual_summary.index_root_blk; debug!("loaded from {}", &path.display()); - inner.index = index; inner.loaded = true; Ok(()) } @@ -457,9 +493,9 @@ impl DeltaLayer { lsn_range: filename.lsn_range.clone(), inner: RwLock::new(DeltaLayerInner { loaded: false, - index: HashMap::default(), file: None, index_start_blk: 0, + index_root_blk: 0, }), } } @@ -485,8 +521,8 @@ impl DeltaLayer { inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, - index: HashMap::default(), index_start_blk: 0, + index_root_blk: 0, }), }) } @@ -529,7 +565,7 @@ pub struct DeltaLayerWriter { key_start: Key, lsn_range: Range, - index: HashMap>, + tree: DiskBtreeBuilder, blob_writer: WriteBlobWriter>, } @@ -558,10 +594,15 @@ impl DeltaLayerWriter { u64::from(lsn_range.end) )); let mut file = VirtualFile::create(&path)?; + // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64))?; let buf_writer = BufWriter::new(file); let blob_writer = WriteBlobWriter::new(buf_writer, PAGE_SZ as u64); + // Initialize the b-tree index builder + let block_buf = BlockBuf::new(); + let tree_builder = DiskBtreeBuilder::new(block_buf); + Ok(DeltaLayerWriter { conf, path, @@ -569,7 +610,7 @@ impl DeltaLayerWriter { tenantid, key_start, lsn_range, - index: HashMap::new(), + tree: tree_builder, blob_writer, }) } @@ -584,23 +625,16 @@ impl DeltaLayerWriter { let off = self.blob_writer.write_blob(&Value::ser(&val)?)?; - let vec_map = self.index.entry(key).or_default(); let blob_ref = BlobRef::new(off, val.will_init()); - let old = vec_map.append_or_update_last(lsn, blob_ref).unwrap().0; - if old.is_some() { - // We already had an entry for this LSN. That's odd.. - bail!( - "Value for {} at {} already exists in delta layer being built", - key, - lsn - ); - } + + let delta_key = DeltaKey::from_key_lsn(&key, lsn); + self.tree.append(&delta_key.0, blob_ref.0)?; Ok(()) } pub fn size(&self) -> u64 { - self.blob_writer.size() + self.blob_writer.size() + self.tree.borrow_writer().size() } /// @@ -614,9 +648,11 @@ impl DeltaLayerWriter { let mut file = buf_writer.into_inner()?; // Write out the index - let buf = HashMap::ser(&self.index)?; + let (index_root_blk, block_buf) = self.tree.finish()?; file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?; - file.write_all(&buf)?; + for buf in block_buf.blocks { + file.write_all(buf.as_ref())?; + } // Fill in the summary on blk 0 let summary = Summary { @@ -627,6 +663,7 @@ impl DeltaLayerWriter { key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), index_start_blk, + index_root_blk, }; file.seek(SeekFrom::Start(0))?; Summary::ser_into(&summary, &mut file)?; @@ -642,9 +679,9 @@ impl DeltaLayerWriter { lsn_range: self.lsn_range.clone(), inner: RwLock::new(DeltaLayerInner { loaded: false, - index: HashMap::new(), file: None, index_start_blk, + index_root_blk, }), }; @@ -677,7 +714,7 @@ impl DeltaLayerWriter { /// fashion. 
/// struct DeltaValueIter<'a> { - all_offsets: Vec<(Key, Lsn, BlobRef)>, + all_offsets: Vec<(DeltaKey, BlobRef)>, next_idx: usize, reader: BlockCursor>, } @@ -702,15 +739,22 @@ impl<'a> Iterator for DeltaValueIter<'a> { impl<'a> DeltaValueIter<'a> { fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result { - let mut index: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); - index.sort_by_key(|x| x.0); + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + file, + ); - let mut all_offsets: Vec<(Key, Lsn, BlobRef)> = Vec::new(); - for (key, vec_map) in index.iter() { - for (lsn, blob_ref) in vec_map.as_slice().iter() { - all_offsets.push((**key, *lsn, *blob_ref)); - } - } + let mut all_offsets: Vec<(DeltaKey, BlobRef)> = Vec::new(); + tree_reader.visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |key, value| { + all_offsets.push((DeltaKey::from_slice(key), BlobRef(value))); + true + }, + )?; let iter = DeltaValueIter { all_offsets, @@ -723,13 +767,15 @@ impl<'a> DeltaValueIter<'a> { fn next_res(&mut self) -> Result> { if self.next_idx < self.all_offsets.len() { - let (key, lsn, off) = &self.all_offsets[self.next_idx]; + let (delta_key, blob_ref) = &self.all_offsets[self.next_idx]; - //let mut reader = BlobReader::new(self.inner.file.as_ref().unwrap()); - let buf = self.reader.read_blob(off.pos())?; + let key = delta_key.key(); + let lsn = delta_key.lsn(); + + let buf = self.reader.read_blob(blob_ref.pos())?; let val = Value::des(&buf)?; self.next_idx += 1; - Ok(Some((*key, *lsn, val))) + Ok(Some((key, lsn, val))) } else { Ok(None) } diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/layered_repository/disk_btree.rs new file mode 100644 index 0000000000..7a9fe6f2b7 --- /dev/null +++ b/pageserver/src/layered_repository/disk_btree.rs @@ -0,0 +1,979 @@ +//! +//! Simple on-disk B-tree implementation +//! +//! This is used as the index structure within image and delta layers +//! +//! Features: +//! - Fixed-width keys +//! - Fixed-width values (VALUE_SZ) +//! - The tree is created in a bulk operation. Insert/deletion after creation +//! is not suppported +//! - page-oriented +//! +//! TODO: +//! - better errors (e.g. with thiserror?) +//! - maybe something like an Adaptive Radix Tree would be more efficient? +//! - the values stored by image and delta layers are offsets into the file, +//! and they are in monotonically increasing order. Prefix compression would +//! be very useful for them, too. +//! - An Iterator interface would be more convenient for the callers than the +//! 'visit' function +//! +use anyhow; +use byteorder::{ReadBytesExt, BE}; +use bytes::{BufMut, Bytes, BytesMut}; +use hex; +use std::cmp::Ordering; + +use crate::layered_repository::block_io::{BlockReader, BlockWriter}; + +// The maximum size of a value stored in the B-tree. 5 bytes is enough currently. 
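// A worked example (illustrative values, not from the patch) of the 5-byte
// value encoding defined below: the five bytes hold the value big-endian, and
// the top bit distinguishes leaf values, i.e. file offsets (bit clear), from
// child block numbers (bit set via 0x80):
//
//     Value::from_u64(0x01_2345_6789).0 == [0x01, 0x23, 0x45, 0x67, 0x89]
//     Value::from_blknum(7).0           == [0x80, 0x00, 0x00, 0x00, 0x07]
//
// to_u64() and to_blknum() undo the packing, so MAX_VALUE (0x007f_ffff_ffff)
// is the largest offset that can be stored.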
+pub const VALUE_SZ: usize = 5; +pub const MAX_VALUE: u64 = 0x007f_ffff_ffff; + +#[allow(dead_code)] +pub const PAGE_SZ: usize = 8192; + +#[derive(Clone, Copy, Debug)] +struct Value([u8; VALUE_SZ]); + +impl Value { + fn from_slice(slice: &[u8]) -> Value { + let mut b = [0u8; VALUE_SZ]; + b.copy_from_slice(slice); + Value(b) + } + + fn from_u64(x: u64) -> Value { + assert!(x <= 0x007f_ffff_ffff); + Value([ + (x >> 32) as u8, + (x >> 24) as u8, + (x >> 16) as u8, + (x >> 8) as u8, + x as u8, + ]) + } + + fn from_blknum(x: u32) -> Value { + Value([ + 0x80, + (x >> 24) as u8, + (x >> 16) as u8, + (x >> 8) as u8, + x as u8, + ]) + } + + #[allow(dead_code)] + fn is_offset(self) -> bool { + self.0[0] & 0x80 != 0 + } + + fn to_u64(self) -> u64 { + let b = &self.0; + (b[0] as u64) << 32 + | (b[1] as u64) << 24 + | (b[2] as u64) << 16 + | (b[3] as u64) << 8 + | b[4] as u64 + } + + fn to_blknum(self) -> u32 { + let b = &self.0; + assert!(b[0] == 0x80); + (b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32 + } +} + +/// This is the on-disk representation. +struct OnDiskNode<'a, const L: usize> { + // Fixed-width fields + num_children: u16, + level: u8, + prefix_len: u8, + suffix_len: u8, + + // Variable-length fields. These are stored on-disk after the fixed-width + // fields, in this order. In the in-memory representation, these point to + // the right parts in the page buffer. + prefix: &'a [u8], + keys: &'a [u8], + values: &'a [u8], +} + +impl<'a, const L: usize> OnDiskNode<'a, L> { + /// + /// Interpret a PAGE_SZ page as a node. + /// + fn deparse(buf: &[u8]) -> OnDiskNode { + let mut cursor = std::io::Cursor::new(buf); + let num_children = cursor.read_u16::().unwrap(); + let level = cursor.read_u8().unwrap(); + let prefix_len = cursor.read_u8().unwrap(); + let suffix_len = cursor.read_u8().unwrap(); + + let mut off = cursor.position(); + let prefix_off = off as usize; + off += prefix_len as u64; + + let keys_off = off as usize; + let keys_len = num_children as usize * suffix_len as usize; + off += keys_len as u64; + + let values_off = off as usize; + let values_len = num_children as usize * VALUE_SZ as usize; + //off += values_len as u64; + + let prefix = &buf[prefix_off..prefix_off + prefix_len as usize]; + let keys = &buf[keys_off..keys_off + keys_len]; + let values = &buf[values_off..values_off + values_len]; + + OnDiskNode { + num_children, + level, + prefix_len, + suffix_len, + prefix, + keys, + values, + } + } + + /// + /// Read a value at 'idx' + /// + fn value(&self, idx: usize) -> Value { + let value_off = idx * VALUE_SZ; + let value_slice = &self.values[value_off..value_off + VALUE_SZ]; + Value::from_slice(value_slice) + } + + fn binary_search(&self, search_key: &[u8; L], keybuf: &mut [u8]) -> Result { + let mut size = self.num_children as usize; + let mut low = 0; + let mut high = size; + while low < high { + let mid = low + size / 2; + + let key_off = mid as usize * self.suffix_len as usize; + let suffix = &self.keys[key_off..key_off + self.suffix_len as usize]; + // Does this match? + keybuf[self.prefix_len as usize..].copy_from_slice(suffix); + + let cmp = keybuf[..].cmp(search_key); + + if cmp == Ordering::Less { + low = mid + 1; + } else if cmp == Ordering::Greater { + high = mid; + } else { + return Ok(mid); + } + size = high - low; + } + Err(low) + } +} + +/// +/// Public reader object, to search the tree. 
+/// +pub struct DiskBtreeReader +where + R: BlockReader, +{ + start_blk: u32, + root_blk: u32, + reader: R, +} + +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum VisitDirection { + Forwards, + Backwards, +} + +impl DiskBtreeReader +where + R: BlockReader, +{ + pub fn new(start_blk: u32, root_blk: u32, reader: R) -> Self { + DiskBtreeReader { + start_blk, + root_blk, + reader, + } + } + + /// + /// Read the value for given key. Returns the value, or None if it doesn't exist. + /// + pub fn get(&self, search_key: &[u8; L]) -> anyhow::Result> { + let mut result: Option = None; + self.visit(search_key, VisitDirection::Forwards, |key, value| { + if key == search_key { + result = Some(value); + } + false + })?; + Ok(result) + } + + /// + /// Scan the tree, starting from 'search_key', in the given direction. 'visitor' + /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning + /// backwards) + /// + pub fn visit( + &self, + search_key: &[u8; L], + dir: VisitDirection, + mut visitor: V, + ) -> anyhow::Result + where + V: FnMut(&[u8], u64) -> bool, + { + self.search_recurse(self.root_blk, search_key, dir, &mut visitor) + } + + fn search_recurse( + &self, + node_blknum: u32, + search_key: &[u8; L], + dir: VisitDirection, + visitor: &mut V, + ) -> anyhow::Result + where + V: FnMut(&[u8], u64) -> bool, + { + // Locate the node. + let blk = self.reader.read_blk(self.start_blk + node_blknum)?; + + // Search all entries on this node + self.search_node(blk.as_ref(), search_key, dir, visitor) + } + + fn search_node( + &self, + node_buf: &[u8], + search_key: &[u8; L], + dir: VisitDirection, + visitor: &mut V, + ) -> anyhow::Result + where + V: FnMut(&[u8], u64) -> bool, + { + let node = OnDiskNode::deparse(node_buf); + let prefix_len = node.prefix_len as usize; + let suffix_len = node.suffix_len as usize; + + assert!(node.num_children > 0); + + let mut keybuf = Vec::new(); + keybuf.extend(node.prefix); + keybuf.resize(prefix_len + suffix_len, 0); + + if dir == VisitDirection::Forwards { + // Locate the first match + let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { + Ok(idx) => idx, + Err(idx) => { + if node.level == 0 { + // Imagine that the node contains the following keys: + // + // 1 + // 3 <-- idx + // 5 + // + // If the search key is '2' and there is exact match, + // the binary search would return the index of key + // '3'. That's cool, '3' is the first key to return. + idx + } else { + // This is an internal page, so each key represents a lower + // bound for what's in the child page. If there is no exact + // match, we have to return the *previous* entry. + // + // 1 <-- return this + // 3 <-- idx + // 5 + idx.saturating_sub(1) + } + } + }; + // idx points to the first match now. Keep going from there + let mut key_off = idx * suffix_len; + while idx < node.num_children as usize { + let suffix = &node.keys[key_off..key_off + suffix_len]; + keybuf[prefix_len..].copy_from_slice(suffix); + let value = node.value(idx as usize); + #[allow(clippy::collapsible_if)] + if node.level == 0 { + // leaf + if !visitor(&keybuf, value.to_u64()) { + return Ok(false); + } + } else { + #[allow(clippy::collapsible_if)] + if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? { + return Ok(false); + } + } + idx += 1; + key_off += suffix_len; + } + } else { + let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { + Ok(idx) => { + // Exact match. That's the first entry to return, and walk + // backwards from there. 
(The loop below starts from 'idx - + // 1', so add one here to compensate.) + idx + 1 + } + Err(idx) => { + // No exact match. The binary search returned the index of the + // first key that's > search_key. Back off by one, and walk + // backwards from there. (The loop below starts from idx - 1, + // so we don't need to subtract one here) + idx + } + }; + + // idx points to the first match + 1 now. Keep going from there. + let mut key_off = idx * suffix_len; + while idx > 0 { + idx -= 1; + key_off -= suffix_len; + let suffix = &node.keys[key_off..key_off + suffix_len]; + keybuf[prefix_len..].copy_from_slice(suffix); + let value = node.value(idx as usize); + #[allow(clippy::collapsible_if)] + if node.level == 0 { + // leaf + if !visitor(&keybuf, value.to_u64()) { + return Ok(false); + } + } else { + #[allow(clippy::collapsible_if)] + if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? { + return Ok(false); + } + } + if idx == 0 { + break; + } + } + } + Ok(true) + } + + #[allow(dead_code)] + pub fn dump(&self) -> anyhow::Result<()> { + self.dump_recurse(self.root_blk, &[], 0) + } + + fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> anyhow::Result<()> { + let blk = self.reader.read_blk(self.start_blk + blknum)?; + let buf: &[u8] = blk.as_ref(); + + let node = OnDiskNode::::deparse(buf); + + print!("{:indent$}", "", indent = depth * 2); + println!( + "blk #{}: path {}: prefix {}, suffix_len {}", + blknum, + hex::encode(path), + hex::encode(node.prefix), + node.suffix_len + ); + + let mut idx = 0; + let mut key_off = 0; + while idx < node.num_children { + let key = &node.keys[key_off..key_off + node.suffix_len as usize]; + let val = node.value(idx as usize); + print!("{:indent$}", "", indent = depth * 2 + 2); + println!("{}: {}", hex::encode(key), hex::encode(val.0)); + + if node.level > 0 { + let child_path = [path, node.prefix].concat(); + self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?; + } + idx += 1; + key_off += node.suffix_len as usize; + } + Ok(()) + } +} + +/// +/// Public builder object, for creating a new tree. +/// +/// Usage: Create a builder object by calling 'new', load all the data into the +/// tree by calling 'append' for each key-value pair, and then call 'finish' +/// +/// 'L' is the key length in bytes +pub struct DiskBtreeBuilder +where + W: BlockWriter, +{ + writer: W, + + /// + /// stack[0] is the current root page, stack.last() is the leaf. + /// + stack: Vec>, + + /// Last key that was appended to the tree. Used to sanity check that append + /// is called in increasing key order. + last_key: Option<[u8; L]>, +} + +impl DiskBtreeBuilder +where + W: BlockWriter, +{ + pub fn new(writer: W) -> Self { + DiskBtreeBuilder { + writer, + last_key: None, + stack: vec![BuildNode::new(0)], + } + } + + pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<(), anyhow::Error> { + assert!(value <= MAX_VALUE); + if let Some(last_key) = &self.last_key { + assert!(key > last_key, "unsorted input"); + } + self.last_key = Some(*key); + + Ok(self.append_internal(key, Value::from_u64(value))?) + } + + fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<(), std::io::Error> { + // Try to append to the current leaf buffer + let last = self.stack.last_mut().unwrap(); + let level = last.level; + if last.push(key, value) { + return Ok(()); + } + + // It did not fit. Try to compress, and it it succeeds to make some room + // on the node, try appending to it again. 
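// Illustrative sketch (not part of the patch): the build/read round trip described
// in the DiskBtreeBuilder doc comment above, assuming a block store `disk` that
// implements BlockWriter for &mut Self and BlockReader, like the TestDisk helper
// in the tests below:
//
//     let mut builder = DiskBtreeBuilder::<_, 6>::new(&mut disk);
//     builder.append(b"xaaaaa", 42)?;   // keys must be appended in ascending order
//     let (root_blk, _writer) = builder.finish()?;
//     let reader = DiskBtreeReader::new(0, root_blk, disk);
//     assert_eq!(reader.get(b"xaaaaa")?, Some(42));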
+ #[allow(clippy::collapsible_if)] + if last.compress() { + if last.push(key, value) { + return Ok(()); + } + } + + // Could not append to the current leaf. Flush it and create a new one. + self.flush_node()?; + + // Replace the node we flushed with an empty one and append the new + // key to it. + let mut last = BuildNode::new(level); + if !last.push(key, value) { + panic!("could not push to new leaf node"); + } + self.stack.push(last); + + Ok(()) + } + + fn flush_node(&mut self) -> Result<(), std::io::Error> { + let last = self.stack.pop().unwrap(); + let buf = last.pack(); + let downlink_key = last.first_key(); + let downlink_ptr = self.writer.write_blk(buf)?; + + // Append the downlink to the parent + if self.stack.is_empty() { + self.stack.push(BuildNode::new(last.level + 1)); + } + self.append_internal(&downlink_key, Value::from_blknum(downlink_ptr))?; + + Ok(()) + } + + /// + /// Flushes everything to disk, and returns the block number of the root page. + /// The caller must store the root block number "out-of-band", and pass it + /// to the DiskBtreeReader::new() when you want to read the tree again. + /// (In the image and delta layers, it is stored in the beginning of the file, + /// in the summary header) + /// + pub fn finish(mut self) -> Result<(u32, W), std::io::Error> { + // flush all levels, except the root. + while self.stack.len() > 1 { + self.flush_node()?; + } + + let root = self.stack.first().unwrap(); + let buf = root.pack(); + let root_blknum = self.writer.write_blk(buf)?; + + Ok((root_blknum, self.writer)) + } + + pub fn borrow_writer(&self) -> &W { + &self.writer + } +} + +/// +/// BuildNode represesnts an incomplete page that we are appending to. +/// +#[derive(Clone, Debug)] +struct BuildNode { + num_children: u16, + level: u8, + prefix: Vec, + suffix_len: usize, + + keys: Vec, + values: Vec, + + size: usize, // physical size of this node, if it was written to disk like this +} + +const NODE_SIZE: usize = PAGE_SZ; + +const NODE_HDR_SIZE: usize = 2 + 1 + 1 + 1; + +impl BuildNode { + fn new(level: u8) -> Self { + BuildNode { + num_children: 0, + level, + prefix: Vec::new(), + suffix_len: 0, + keys: Vec::new(), + values: Vec::new(), + size: NODE_HDR_SIZE, + } + } + + /// Try to append a key-value pair to this node. Returns 'true' on + /// success, 'false' if the page was full or the key was + /// incompatible with the prefix of the existing keys. + fn push(&mut self, key: &[u8; L], value: Value) -> bool { + // If we have already performed prefix-compression on the page, + // check that the incoming key has the same prefix. + if self.num_children > 0 { + // does the prefix allow it? + if !key.starts_with(&self.prefix) { + return false; + } + } else { + self.suffix_len = key.len(); + } + + // Is the node too full? + if self.size + self.suffix_len + VALUE_SZ >= NODE_SIZE { + return false; + } + + // All clear + self.num_children += 1; + self.keys.extend(&key[self.prefix.len()..]); + self.values.extend(value.0); + + assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.values.len() == self.num_children as usize * VALUE_SZ); + + self.size += self.suffix_len + VALUE_SZ; + + true + } + + /// + /// Perform prefix-compression. + /// + /// Returns 'true' on success, 'false' if no compression was possible. 
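// Illustrative sketch (not part of the patch): with the keys "xabaaa", "xababa",
// "xabaca" on a fresh page (empty prefix, suffix_len 6), compress() below finds the
// common prefix "xaba" of the first and last key (the keys are sorted, so it is
// shared by all of them), stores it once, and rewrites the keys as the suffixes
// "aa", "ba", "ca" with suffix_len 2, shrinking the page by
// prefix_len * num_children - prefix_len bytes.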
+ /// + fn compress(&mut self) -> bool { + let first_suffix = self.first_suffix(); + let last_suffix = self.last_suffix(); + + // Find the common prefix among all keys + let mut prefix_len = 0; + while prefix_len < self.suffix_len { + if first_suffix[prefix_len] != last_suffix[prefix_len] { + break; + } + prefix_len += 1; + } + if prefix_len == 0 { + return false; + } + + // Can compress. Rewrite the keys without the common prefix. + self.prefix.extend(&self.keys[..prefix_len]); + + let mut new_keys = Vec::new(); + let mut key_off = 0; + while key_off < self.keys.len() { + let next_key_off = key_off + self.suffix_len; + new_keys.extend(&self.keys[key_off + prefix_len..next_key_off]); + key_off = next_key_off; + } + self.keys = new_keys; + self.suffix_len -= prefix_len; + + self.size -= prefix_len * self.num_children as usize; + self.size += prefix_len; + + assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.values.len() == self.num_children as usize * VALUE_SZ); + + true + } + + /// + /// Serialize the node to on-disk format. + /// + fn pack(&self) -> Bytes { + assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.values.len() == self.num_children as usize * VALUE_SZ); + assert!(self.num_children > 0); + + let mut buf = BytesMut::new(); + + buf.put_u16(self.num_children); + buf.put_u8(self.level); + buf.put_u8(self.prefix.len() as u8); + buf.put_u8(self.suffix_len as u8); + buf.put(&self.prefix[..]); + buf.put(&self.keys[..]); + buf.put(&self.values[..]); + + assert!(buf.len() == self.size); + + assert!(buf.len() <= PAGE_SZ); + buf.resize(PAGE_SZ, 0); + buf.freeze() + } + + fn first_suffix(&self) -> &[u8] { + &self.keys[..self.suffix_len] + } + fn last_suffix(&self) -> &[u8] { + &self.keys[self.keys.len() - self.suffix_len..] + } + + /// Return the full first key of the page, including the prefix + fn first_key(&self) -> [u8; L] { + let mut key = [0u8; L]; + key[..self.prefix.len()].copy_from_slice(&self.prefix); + key[self.prefix.len()..].copy_from_slice(self.first_suffix()); + key + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::Rng; + use std::collections::BTreeMap; + use std::sync::atomic::{AtomicUsize, Ordering}; + + #[derive(Clone, Default)] + struct TestDisk { + blocks: Vec, + } + impl TestDisk { + fn new() -> Self { + Self::default() + } + } + impl BlockReader for TestDisk { + type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>; + + fn read_blk(&self, blknum: u32) -> Result { + let mut buf = [0u8; PAGE_SZ]; + buf.copy_from_slice(&self.blocks[blknum as usize]); + Ok(std::rc::Rc::new(buf)) + } + } + impl BlockWriter for &mut TestDisk { + fn write_blk(&mut self, buf: Bytes) -> Result { + let blknum = self.blocks.len(); + self.blocks.push(buf); + Ok(blknum as u32) + } + } + + #[test] + fn basic() -> anyhow::Result<()> { + let mut disk = TestDisk::new(); + let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk); + + let all_keys: Vec<&[u8; 6]> = vec![ + b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb", + ]; + let all_data: Vec<(&[u8; 6], u64)> = all_keys + .iter() + .enumerate() + .map(|(idx, key)| (*key, idx as u64)) + .collect(); + for (key, val) in all_data.iter() { + writer.append(key, *val)?; + } + + let (root_offset, _writer) = writer.finish()?; + + let reader = DiskBtreeReader::new(0, root_offset, disk); + + reader.dump()?; + + // Test the `get` function on all the keys. 
+ for (key, val) in all_data.iter() { + assert_eq!(reader.get(key)?, Some(*val)); + } + // And on some keys that don't exist + assert_eq!(reader.get(b"aaaaaa")?, None); + assert_eq!(reader.get(b"zzzzzz")?, None); + assert_eq!(reader.get(b"xaaabx")?, None); + + // Test search with `visit` function + let search_key = b"xabaaa"; + let expected: Vec<(Vec, u64)> = all_data + .iter() + .filter(|(key, _value)| key[..] >= search_key[..]) + .map(|(key, value)| (key.to_vec(), *value)) + .collect(); + + let mut data = Vec::new(); + reader.visit(search_key, VisitDirection::Forwards, |key, value| { + data.push((key.to_vec(), value)); + true + })?; + assert_eq!(data, expected); + + // Test a backwards scan + let mut expected: Vec<(Vec, u64)> = all_data + .iter() + .filter(|(key, _value)| key[..] <= search_key[..]) + .map(|(key, value)| (key.to_vec(), *value)) + .collect(); + expected.reverse(); + let mut data = Vec::new(); + reader.visit(search_key, VisitDirection::Backwards, |key, value| { + data.push((key.to_vec(), value)); + true + })?; + assert_eq!(data, expected); + + // Backward scan where nothing matches + reader.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| { + panic!("found unexpected key {}: {}", hex::encode(key), value); + })?; + + // Full scan + let expected: Vec<(Vec, u64)> = all_data + .iter() + .map(|(key, value)| (key.to_vec(), *value)) + .collect(); + let mut data = Vec::new(); + reader.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| { + data.push((key.to_vec(), value)); + true + })?; + assert_eq!(data, expected); + + Ok(()) + } + + #[test] + fn lots_of_keys() -> anyhow::Result<()> { + let mut disk = TestDisk::new(); + let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk); + + const NUM_KEYS: u64 = 1000; + + let mut all_data: BTreeMap = BTreeMap::new(); + + for idx in 0..NUM_KEYS { + let key_int: u64 = 1 + idx * 2; + let key = u64::to_be_bytes(key_int); + writer.append(&key, idx)?; + + all_data.insert(key_int, idx); + } + + let (root_offset, _writer) = writer.finish()?; + + let reader = DiskBtreeReader::new(0, root_offset, disk); + + reader.dump()?; + + use std::sync::Mutex; + + let result = Mutex::new(Vec::new()); + let limit: AtomicUsize = AtomicUsize::new(10); + let take_ten = |key: &[u8], value: u64| { + let mut keybuf = [0u8; 8]; + keybuf.copy_from_slice(key); + let key_int = u64::from_be_bytes(keybuf); + + let mut result = result.lock().unwrap(); + result.push((key_int, value)); + + // keep going until we have 10 matches + result.len() < limit.load(Ordering::Relaxed) + }; + + for search_key_int in 0..(NUM_KEYS * 2 + 10) { + let search_key = u64::to_be_bytes(search_key_int); + assert_eq!( + reader.get(&search_key)?, + all_data.get(&search_key_int).cloned() + ); + + // Test a forward scan starting with this key + result.lock().unwrap().clear(); + reader.visit(&search_key, VisitDirection::Forwards, take_ten)?; + let expected = all_data + .range(search_key_int..) 
+ .take(10) + .map(|(&key, &val)| (key, val)) + .collect::>(); + assert_eq!(*result.lock().unwrap(), expected); + + // And a backwards scan + result.lock().unwrap().clear(); + reader.visit(&search_key, VisitDirection::Backwards, take_ten)?; + let expected = all_data + .range(..=search_key_int) + .rev() + .take(10) + .map(|(&key, &val)| (key, val)) + .collect::>(); + assert_eq!(*result.lock().unwrap(), expected); + } + + // full scan + let search_key = u64::to_be_bytes(0); + limit.store(usize::MAX, Ordering::Relaxed); + result.lock().unwrap().clear(); + reader.visit(&search_key, VisitDirection::Forwards, take_ten)?; + let expected = all_data + .iter() + .map(|(&key, &val)| (key, val)) + .collect::>(); + assert_eq!(*result.lock().unwrap(), expected); + + // full scan + let search_key = u64::to_be_bytes(u64::MAX); + limit.store(usize::MAX, Ordering::Relaxed); + result.lock().unwrap().clear(); + reader.visit(&search_key, VisitDirection::Backwards, take_ten)?; + let expected = all_data + .iter() + .rev() + .map(|(&key, &val)| (key, val)) + .collect::>(); + assert_eq!(*result.lock().unwrap(), expected); + + Ok(()) + } + + #[test] + fn random_data() -> anyhow::Result<()> { + // Generate random keys with exponential distribution, to + // exercise the prefix compression + const NUM_KEYS: usize = 100000; + let mut all_data: BTreeMap = BTreeMap::new(); + for idx in 0..NUM_KEYS { + let u: f64 = rand::thread_rng().gen_range(0.0..1.0); + let t = -(f64::ln(u)); + let key_int = (t * 1000000.0) as u128; + + all_data.insert(key_int as u128, idx as u64); + } + + // Build a tree from it + let mut disk = TestDisk::new(); + let mut writer = DiskBtreeBuilder::<_, 16>::new(&mut disk); + + for (&key, &val) in all_data.iter() { + writer.append(&u128::to_be_bytes(key), val)?; + } + let (root_offset, _writer) = writer.finish()?; + + let reader = DiskBtreeReader::new(0, root_offset, disk); + + // Test get() operation on all the keys + for (&key, &val) in all_data.iter() { + let search_key = u128::to_be_bytes(key); + assert_eq!(reader.get(&search_key)?, Some(val)); + } + + // Test get() operations on random keys, most of which will not exist + for _ in 0..100000 { + let key_int = rand::thread_rng().gen::(); + let search_key = u128::to_be_bytes(key_int); + assert!(reader.get(&search_key)? == all_data.get(&key_int).cloned()); + } + + // Test boundary cases + assert!(reader.get(&u128::to_be_bytes(u128::MIN))? == all_data.get(&u128::MIN).cloned()); + assert!(reader.get(&u128::to_be_bytes(u128::MAX))? 
== all_data.get(&u128::MAX).cloned()); + + Ok(()) + } + + #[test] + #[should_panic(expected = "unsorted input")] + fn unsorted_input() { + let mut disk = TestDisk::new(); + let mut writer = DiskBtreeBuilder::<_, 2>::new(&mut disk); + + let _ = writer.append(b"ba", 1); + let _ = writer.append(b"bb", 2); + let _ = writer.append(b"aa", 3); + } + + /// + /// This test contains a particular data set, see disk_btree_test_data.rs + /// + #[test] + fn particular_data() -> anyhow::Result<()> { + // Build a tree from it + let mut disk = TestDisk::new(); + let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk); + + for (key, val) in disk_btree_test_data::TEST_DATA { + writer.append(&key, val)?; + } + let (root_offset, writer) = writer.finish()?; + + println!("SIZE: {} blocks", writer.blocks.len()); + + let reader = DiskBtreeReader::new(0, root_offset, disk); + + // Test get() operation on all the keys + for (key, val) in disk_btree_test_data::TEST_DATA { + assert_eq!(reader.get(&key)?, Some(val)); + } + + // Test full scan + let mut count = 0; + reader.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| { + count += 1; + true + })?; + assert_eq!(count, disk_btree_test_data::TEST_DATA.len()); + + reader.dump()?; + + Ok(()) + } +} + +#[cfg(test)] +#[path = "disk_btree_test_data.rs"] +mod disk_btree_test_data; diff --git a/pageserver/src/layered_repository/disk_btree_test_data.rs b/pageserver/src/layered_repository/disk_btree_test_data.rs new file mode 100644 index 0000000000..9462573f03 --- /dev/null +++ b/pageserver/src/layered_repository/disk_btree_test_data.rs @@ -0,0 +1,2013 @@ +use hex_literal::hex; + +/// Test data set for the 'particular_data' test in disk_btree.rs +/// +/// This test contains a particular data set, representing all the keys +/// generated by the 'test_random_updates' unit test. I extracted this while +/// trying to debug a failure in that test. The bug turned out to be +/// elsewhere, and I'm not sure if this is still useful, but keeping it for +/// now... Maybe it's a useful data set to show the typical key-values used +/// by a delta layer, for evaluating how well the prefix compression works. 
+#[rustfmt::skip] +pub static TEST_DATA: [([u8; 26], u64); 2000] = [ + (hex!("0122222222333333334444444455000000000000000000000010"), 0x004001), + (hex!("0122222222333333334444444455000000000000000000007cb0"), 0x0040a1), + (hex!("0122222222333333334444444455000000010000000000000020"), 0x004141), + (hex!("0122222222333333334444444455000000020000000000000030"), 0x0041e1), + (hex!("01222222223333333344444444550000000200000000000051a0"), 0x004281), + (hex!("0122222222333333334444444455000000030000000000000040"), 0x004321), + (hex!("0122222222333333334444444455000000030000000000006cf0"), 0x0043c1), + (hex!("0122222222333333334444444455000000030000000000007140"), 0x004461), + (hex!("0122222222333333334444444455000000040000000000000050"), 0x004501), + (hex!("01222222223333333344444444550000000400000000000047f0"), 0x0045a1), + (hex!("01222222223333333344444444550000000400000000000072b0"), 0x004641), + (hex!("0122222222333333334444444455000000050000000000000060"), 0x0046e1), + (hex!("0122222222333333334444444455000000050000000000005550"), 0x004781), + (hex!("0122222222333333334444444455000000060000000000000070"), 0x004821), + (hex!("01222222223333333344444444550000000600000000000044a0"), 0x0048c1), + (hex!("0122222222333333334444444455000000060000000000006870"), 0x004961), + (hex!("0122222222333333334444444455000000070000000000000080"), 0x004a01), + (hex!("0122222222333333334444444455000000080000000000000090"), 0x004aa1), + (hex!("0122222222333333334444444455000000080000000000004150"), 0x004b41), + (hex!("01222222223333333344444444550000000900000000000000a0"), 0x004be1), + (hex!("01222222223333333344444444550000000a00000000000000b0"), 0x004c81), + (hex!("01222222223333333344444444550000000a0000000000006680"), 0x004d21), + (hex!("01222222223333333344444444550000000b00000000000000c0"), 0x004dc1), + (hex!("01222222223333333344444444550000000b0000000000006230"), 0x004e61), + (hex!("01222222223333333344444444550000000c00000000000000d0"), 0x004f01), + (hex!("01222222223333333344444444550000000d00000000000000e0"), 0x004fa1), + (hex!("01222222223333333344444444550000000e00000000000000f0"), 0x005041), + (hex!("01222222223333333344444444550000000e0000000000006000"), 0x0050e1), + (hex!("01222222223333333344444444550000000f0000000000000100"), 0x005181), + (hex!("01222222223333333344444444550000000f00000000000053c0"), 0x005221), + (hex!("01222222223333333344444444550000000f0000000000006580"), 0x0052c1), + (hex!("0122222222333333334444444455000000100000000000000110"), 0x005361), + (hex!("01222222223333333344444444550000001000000000000046c0"), 0x005401), + (hex!("0122222222333333334444444455000000100000000000004e40"), 0x0054a1), + (hex!("0122222222333333334444444455000000110000000000000120"), 0x005541), + (hex!("0122222222333333334444444455000000120000000000000130"), 0x0055e1), + (hex!("01222222223333333344444444550000001200000000000066d0"), 0x005681), + (hex!("0122222222333333334444444455000000130000000000000140"), 0x005721), + (hex!("0122222222333333334444444455000000130000000000007710"), 0x0057c1), + (hex!("0122222222333333334444444455000000140000000000000150"), 0x005861), + (hex!("0122222222333333334444444455000000140000000000006c40"), 0x005901), + (hex!("0122222222333333334444444455000000150000000000000160"), 0x0059a1), + (hex!("0122222222333333334444444455000000150000000000005990"), 0x005a41), + (hex!("0122222222333333334444444455000000160000000000000170"), 0x005ae1), + (hex!("0122222222333333334444444455000000160000000000005530"), 0x005b81), + (hex!("0122222222333333334444444455000000170000000000000180"), 
0x005c21), + (hex!("0122222222333333334444444455000000170000000000004290"), 0x005cc1), + (hex!("0122222222333333334444444455000000180000000000000190"), 0x005d61), + (hex!("01222222223333333344444444550000001800000000000051c0"), 0x005e01), + (hex!("01222222223333333344444444550000001900000000000001a0"), 0x005ea1), + (hex!("0122222222333333334444444455000000190000000000005420"), 0x005f41), + (hex!("0122222222333333334444444455000000190000000000005770"), 0x005fe1), + (hex!("01222222223333333344444444550000001900000000000079d0"), 0x006081), + (hex!("01222222223333333344444444550000001a00000000000001b0"), 0x006121), + (hex!("01222222223333333344444444550000001a0000000000006f70"), 0x0061c1), + (hex!("01222222223333333344444444550000001a0000000000007150"), 0x006261), + (hex!("01222222223333333344444444550000001b00000000000001c0"), 0x006301), + (hex!("01222222223333333344444444550000001b0000000000005070"), 0x0063a1), + (hex!("01222222223333333344444444550000001c00000000000001d0"), 0x006441), + (hex!("01222222223333333344444444550000001d00000000000001e0"), 0x0064e1), + (hex!("01222222223333333344444444550000001e00000000000001f0"), 0x006581), + (hex!("01222222223333333344444444550000001e0000000000005650"), 0x006621), + (hex!("01222222223333333344444444550000001f0000000000000200"), 0x0066c1), + (hex!("01222222223333333344444444550000001f0000000000006ca0"), 0x006761), + (hex!("0122222222333333334444444455000000200000000000000210"), 0x006801), + (hex!("0122222222333333334444444455000000200000000000005fc0"), 0x0068a1), + (hex!("0122222222333333334444444455000000210000000000000220"), 0x006941), + (hex!("0122222222333333334444444455000000210000000000006430"), 0x0069e1), + (hex!("0122222222333333334444444455000000220000000000000230"), 0x006a81), + (hex!("01222222223333333344444444550000002200000000000040e0"), 0x006b21), + (hex!("0122222222333333334444444455000000230000000000000240"), 0x006bc1), + (hex!("01222222223333333344444444550000002300000000000042d0"), 0x006c61), + (hex!("0122222222333333334444444455000000240000000000000250"), 0x006d01), + (hex!("0122222222333333334444444455000000250000000000000260"), 0x006da1), + (hex!("01222222223333333344444444550000002500000000000058c0"), 0x006e41), + (hex!("0122222222333333334444444455000000260000000000000270"), 0x006ee1), + (hex!("0122222222333333334444444455000000260000000000004020"), 0x006f81), + (hex!("0122222222333333334444444455000000270000000000000280"), 0x007021), + (hex!("0122222222333333334444444455000000280000000000000290"), 0x0070c1), + (hex!("0122222222333333334444444455000000280000000000007c00"), 0x007161), + (hex!("01222222223333333344444444550000002900000000000002a0"), 0x007201), + (hex!("01222222223333333344444444550000002a00000000000002b0"), 0x0072a1), + (hex!("01222222223333333344444444550000002b00000000000002c0"), 0x007341), + (hex!("01222222223333333344444444550000002c00000000000002d0"), 0x0073e1), + (hex!("01222222223333333344444444550000002c00000000000041b0"), 0x007481), + (hex!("01222222223333333344444444550000002c0000000000004c30"), 0x007521), + (hex!("01222222223333333344444444550000002d00000000000002e0"), 0x0075c1), + (hex!("01222222223333333344444444550000002d0000000000005e40"), 0x007661), + (hex!("01222222223333333344444444550000002d0000000000006990"), 0x007701), + (hex!("01222222223333333344444444550000002e00000000000002f0"), 0x0077a1), + (hex!("01222222223333333344444444550000002f0000000000000300"), 0x007841), + (hex!("01222222223333333344444444550000002f0000000000004a70"), 0x0078e1), + 
(hex!("01222222223333333344444444550000002f0000000000006b40"), 0x007981), + (hex!("0122222222333333334444444455000000300000000000000310"), 0x007a21), + (hex!("0122222222333333334444444455000000310000000000000320"), 0x007ac1), + (hex!("0122222222333333334444444455000000320000000000000330"), 0x007b61), + (hex!("01222222223333333344444444550000003200000000000041a0"), 0x007c01), + (hex!("0122222222333333334444444455000000320000000000007340"), 0x007ca1), + (hex!("0122222222333333334444444455000000320000000000007730"), 0x007d41), + (hex!("0122222222333333334444444455000000330000000000000340"), 0x007de1), + (hex!("01222222223333333344444444550000003300000000000055a0"), 0x007e81), + (hex!("0122222222333333334444444455000000340000000000000350"), 0x007f21), + (hex!("0122222222333333334444444455000000350000000000000360"), 0x007fc1), + (hex!("01222222223333333344444444550000003500000000000077a0"), 0x008061), + (hex!("0122222222333333334444444455000000360000000000000370"), 0x008101), + (hex!("0122222222333333334444444455000000370000000000000380"), 0x0081a1), + (hex!("0122222222333333334444444455000000380000000000000390"), 0x008241), + (hex!("01222222223333333344444444550000003900000000000003a0"), 0x0082e1), + (hex!("01222222223333333344444444550000003a00000000000003b0"), 0x008381), + (hex!("01222222223333333344444444550000003a00000000000071c0"), 0x008421), + (hex!("01222222223333333344444444550000003b00000000000003c0"), 0x0084c1), + (hex!("01222222223333333344444444550000003c00000000000003d0"), 0x008561), + (hex!("01222222223333333344444444550000003d00000000000003e0"), 0x008601), + (hex!("01222222223333333344444444550000003e00000000000003f0"), 0x0086a1), + (hex!("01222222223333333344444444550000003e00000000000062e0"), 0x008741), + (hex!("01222222223333333344444444550000003f0000000000000400"), 0x0087e1), + (hex!("0122222222333333334444444455000000400000000000000410"), 0x008881), + (hex!("0122222222333333334444444455000000400000000000004460"), 0x008921), + (hex!("0122222222333333334444444455000000400000000000005b90"), 0x0089c1), + (hex!("01222222223333333344444444550000004000000000000079b0"), 0x008a61), + (hex!("0122222222333333334444444455000000410000000000000420"), 0x008b01), + (hex!("0122222222333333334444444455000000420000000000000430"), 0x008ba1), + (hex!("0122222222333333334444444455000000420000000000005640"), 0x008c41), + (hex!("0122222222333333334444444455000000430000000000000440"), 0x008ce1), + (hex!("01222222223333333344444444550000004300000000000072a0"), 0x008d81), + (hex!("0122222222333333334444444455000000440000000000000450"), 0x008e21), + (hex!("0122222222333333334444444455000000450000000000000460"), 0x008ec1), + (hex!("0122222222333333334444444455000000450000000000005750"), 0x008f61), + (hex!("01222222223333333344444444550000004500000000000077b0"), 0x009001), + (hex!("0122222222333333334444444455000000460000000000000470"), 0x0090a1), + (hex!("0122222222333333334444444455000000470000000000000480"), 0x009141), + (hex!("0122222222333333334444444455000000480000000000000490"), 0x0091e1), + (hex!("01222222223333333344444444550000004800000000000069e0"), 0x009281), + (hex!("01222222223333333344444444550000004900000000000004a0"), 0x009321), + (hex!("0122222222333333334444444455000000490000000000007370"), 0x0093c1), + (hex!("01222222223333333344444444550000004a00000000000004b0"), 0x009461), + (hex!("01222222223333333344444444550000004a0000000000005cb0"), 0x009501), + (hex!("01222222223333333344444444550000004b00000000000004c0"), 0x0095a1), + 
(hex!("01222222223333333344444444550000004c00000000000004d0"), 0x009641), + (hex!("01222222223333333344444444550000004c0000000000004880"), 0x0096e1), + (hex!("01222222223333333344444444550000004c0000000000007a40"), 0x009781), + (hex!("01222222223333333344444444550000004d00000000000004e0"), 0x009821), + (hex!("01222222223333333344444444550000004d0000000000006390"), 0x0098c1), + (hex!("01222222223333333344444444550000004e00000000000004f0"), 0x009961), + (hex!("01222222223333333344444444550000004e0000000000004db0"), 0x009a01), + (hex!("01222222223333333344444444550000004f0000000000000500"), 0x009aa1), + (hex!("0122222222333333334444444455000000500000000000000510"), 0x009b41), + (hex!("0122222222333333334444444455000000510000000000000520"), 0x009be1), + (hex!("01222222223333333344444444550000005100000000000069c0"), 0x009c81), + (hex!("0122222222333333334444444455000000520000000000000530"), 0x009d21), + (hex!("0122222222333333334444444455000000520000000000006e60"), 0x009dc1), + (hex!("01222222223333333344444444550000005200000000000070c0"), 0x009e61), + (hex!("0122222222333333334444444455000000530000000000000540"), 0x009f01), + (hex!("0122222222333333334444444455000000530000000000005840"), 0x009fa1), + (hex!("0122222222333333334444444455000000540000000000000550"), 0x00a041), + (hex!("01222222223333333344444444550000005400000000000043e0"), 0x00a0e1), + (hex!("01222222223333333344444444550000005400000000000074e0"), 0x00a181), + (hex!("0122222222333333334444444455000000550000000000000560"), 0x00a221), + (hex!("0122222222333333334444444455000000550000000000003ee0"), 0x00a2c1), + (hex!("0122222222333333334444444455000000560000000000000570"), 0x00a361), + (hex!("0122222222333333334444444455000000570000000000000580"), 0x00a401), + (hex!("0122222222333333334444444455000000570000000000007030"), 0x00a4a1), + (hex!("0122222222333333334444444455000000580000000000000590"), 0x00a541), + (hex!("0122222222333333334444444455000000580000000000005340"), 0x00a5e1), + (hex!("01222222223333333344444444550000005800000000000059f0"), 0x00a681), + (hex!("0122222222333333334444444455000000580000000000006930"), 0x00a721), + (hex!("01222222223333333344444444550000005900000000000005a0"), 0x00a7c1), + (hex!("0122222222333333334444444455000000590000000000003f90"), 0x00a861), + (hex!("01222222223333333344444444550000005a00000000000005b0"), 0x00a901), + (hex!("01222222223333333344444444550000005b00000000000005c0"), 0x00a9a1), + (hex!("01222222223333333344444444550000005b00000000000062c0"), 0x00aa41), + (hex!("01222222223333333344444444550000005c00000000000005d0"), 0x00aae1), + (hex!("01222222223333333344444444550000005c0000000000005a70"), 0x00ab81), + (hex!("01222222223333333344444444550000005c0000000000005dd0"), 0x00ac21), + (hex!("01222222223333333344444444550000005d00000000000005e0"), 0x00acc1), + (hex!("01222222223333333344444444550000005d0000000000005730"), 0x00ad61), + (hex!("01222222223333333344444444550000005e00000000000005f0"), 0x00ae01), + (hex!("01222222223333333344444444550000005e0000000000004f40"), 0x00aea1), + (hex!("01222222223333333344444444550000005f0000000000000600"), 0x00af41), + (hex!("0122222222333333334444444455000000600000000000000610"), 0x00afe1), + (hex!("0122222222333333334444444455000000600000000000007c40"), 0x00b081), + (hex!("0122222222333333334444444455000000610000000000000620"), 0x00b121), + (hex!("0122222222333333334444444455000000610000000000007860"), 0x00b1c1), + (hex!("0122222222333333334444444455000000620000000000000630"), 0x00b261), + 
(hex!("0122222222333333334444444455000000620000000000005050"), 0x00b301), + (hex!("0122222222333333334444444455000000630000000000000640"), 0x00b3a1), + (hex!("0122222222333333334444444455000000640000000000000650"), 0x00b441), + (hex!("0122222222333333334444444455000000650000000000000660"), 0x00b4e1), + (hex!("0122222222333333334444444455000000650000000000005330"), 0x00b581), + (hex!("0122222222333333334444444455000000660000000000000670"), 0x00b621), + (hex!("0122222222333333334444444455000000660000000000004e20"), 0x00b6c1), + (hex!("0122222222333333334444444455000000660000000000005ee0"), 0x00b761), + (hex!("0122222222333333334444444455000000660000000000006360"), 0x00b801), + (hex!("0122222222333333334444444455000000670000000000000680"), 0x00b8a1), + (hex!("0122222222333333334444444455000000670000000000004040"), 0x00b941), + (hex!("0122222222333333334444444455000000680000000000000690"), 0x00b9e1), + (hex!("0122222222333333334444444455000000680000000000003f80"), 0x00ba81), + (hex!("01222222223333333344444444550000006800000000000041e0"), 0x00bb21), + (hex!("01222222223333333344444444550000006900000000000006a0"), 0x00bbc1), + (hex!("0122222222333333334444444455000000690000000000006080"), 0x00bc61), + (hex!("01222222223333333344444444550000006a00000000000006b0"), 0x00bd01), + (hex!("01222222223333333344444444550000006a00000000000042f0"), 0x00bda1), + (hex!("01222222223333333344444444550000006b00000000000006c0"), 0x00be41), + (hex!("01222222223333333344444444550000006b00000000000052f0"), 0x00bee1), + (hex!("01222222223333333344444444550000006b0000000000005980"), 0x00bf81), + (hex!("01222222223333333344444444550000006b0000000000006170"), 0x00c021), + (hex!("01222222223333333344444444550000006c00000000000006d0"), 0x00c0c1), + (hex!("01222222223333333344444444550000006d00000000000006e0"), 0x00c161), + (hex!("01222222223333333344444444550000006d0000000000006fb0"), 0x00c201), + (hex!("01222222223333333344444444550000006e00000000000006f0"), 0x00c2a1), + (hex!("01222222223333333344444444550000006e00000000000065b0"), 0x00c341), + (hex!("01222222223333333344444444550000006e0000000000007970"), 0x00c3e1), + (hex!("01222222223333333344444444550000006f0000000000000700"), 0x00c481), + (hex!("01222222223333333344444444550000006f0000000000005900"), 0x00c521), + (hex!("01222222223333333344444444550000006f0000000000006d90"), 0x00c5c1), + (hex!("0122222222333333334444444455000000700000000000000710"), 0x00c661), + (hex!("01222222223333333344444444550000007000000000000045c0"), 0x00c701), + (hex!("0122222222333333334444444455000000700000000000004d40"), 0x00c7a1), + (hex!("0122222222333333334444444455000000710000000000000720"), 0x00c841), + (hex!("0122222222333333334444444455000000710000000000004dc0"), 0x00c8e1), + (hex!("0122222222333333334444444455000000710000000000007550"), 0x00c981), + (hex!("0122222222333333334444444455000000720000000000000730"), 0x00ca21), + (hex!("0122222222333333334444444455000000720000000000003ec0"), 0x00cac1), + (hex!("01222222223333333344444444550000007200000000000045a0"), 0x00cb61), + (hex!("0122222222333333334444444455000000720000000000006770"), 0x00cc01), + (hex!("0122222222333333334444444455000000720000000000006bc0"), 0x00cca1), + (hex!("0122222222333333334444444455000000730000000000000740"), 0x00cd41), + (hex!("0122222222333333334444444455000000730000000000005250"), 0x00cde1), + (hex!("01222222223333333344444444550000007300000000000075f0"), 0x00ce81), + (hex!("0122222222333333334444444455000000740000000000000750"), 0x00cf21), + 
(hex!("0122222222333333334444444455000000740000000000003ff0"), 0x00cfc1), + (hex!("01222222223333333344444444550000007400000000000079e0"), 0x00d061), + (hex!("0122222222333333334444444455000000750000000000000760"), 0x00d101), + (hex!("0122222222333333334444444455000000750000000000004310"), 0x00d1a1), + (hex!("0122222222333333334444444455000000760000000000000770"), 0x00d241), + (hex!("0122222222333333334444444455000000770000000000000780"), 0x00d2e1), + (hex!("01222222223333333344444444550000007700000000000062f0"), 0x00d381), + (hex!("0122222222333333334444444455000000770000000000006940"), 0x00d421), + (hex!("0122222222333333334444444455000000780000000000000790"), 0x00d4c1), + (hex!("01222222223333333344444444550000007900000000000007a0"), 0x00d561), + (hex!("0122222222333333334444444455000000790000000000007af0"), 0x00d601), + (hex!("01222222223333333344444444550000007a00000000000007b0"), 0x00d6a1), + (hex!("01222222223333333344444444550000007b00000000000007c0"), 0x00d741), + (hex!("01222222223333333344444444550000007b00000000000067e0"), 0x00d7e1), + (hex!("01222222223333333344444444550000007b0000000000007890"), 0x00d881), + (hex!("01222222223333333344444444550000007c00000000000007d0"), 0x00d921), + (hex!("01222222223333333344444444550000007d00000000000007e0"), 0x00d9c1), + (hex!("01222222223333333344444444550000007e00000000000007f0"), 0x00da61), + (hex!("01222222223333333344444444550000007f0000000000000800"), 0x00db01), + (hex!("01222222223333333344444444550000007f0000000000005be0"), 0x00dba1), + (hex!("0122222222333333334444444455000000800000000000000810"), 0x00dc41), + (hex!("0122222222333333334444444455000000810000000000000820"), 0x00dce1), + (hex!("0122222222333333334444444455000000810000000000007190"), 0x00dd81), + (hex!("0122222222333333334444444455000000820000000000000830"), 0x00de21), + (hex!("0122222222333333334444444455000000820000000000004ab0"), 0x00dec1), + (hex!("0122222222333333334444444455000000830000000000000840"), 0x00df61), + (hex!("0122222222333333334444444455000000830000000000006720"), 0x00e001), + (hex!("0122222222333333334444444455000000840000000000000850"), 0x00e0a1), + (hex!("0122222222333333334444444455000000850000000000000860"), 0x00e141), + (hex!("01222222223333333344444444550000008500000000000054f0"), 0x00e1e1), + (hex!("0122222222333333334444444455000000850000000000007920"), 0x00e281), + (hex!("0122222222333333334444444455000000860000000000000870"), 0x00e321), + (hex!("01222222223333333344444444550000008600000000000060e0"), 0x00e3c1), + (hex!("0122222222333333334444444455000000860000000000006be0"), 0x00e461), + (hex!("0122222222333333334444444455000000870000000000000880"), 0x00e501), + (hex!("0122222222333333334444444455000000870000000000006820"), 0x00e5a1), + (hex!("0122222222333333334444444455000000880000000000000890"), 0x00e641), + (hex!("01222222223333333344444444550000008900000000000008a0"), 0x00e6e1), + (hex!("0122222222333333334444444455000000890000000000007c30"), 0x00e781), + (hex!("01222222223333333344444444550000008a00000000000008b0"), 0x00e821), + (hex!("01222222223333333344444444550000008b00000000000008c0"), 0x00e8c1), + (hex!("01222222223333333344444444550000008b0000000000005910"), 0x00e961), + (hex!("01222222223333333344444444550000008b0000000000006fe0"), 0x00ea01), + (hex!("01222222223333333344444444550000008c00000000000008d0"), 0x00eaa1), + (hex!("01222222223333333344444444550000008c0000000000006800"), 0x00eb41), + (hex!("01222222223333333344444444550000008d00000000000008e0"), 0x00ebe1), + 
(hex!("01222222223333333344444444550000008d0000000000005810"), 0x00ec81), + (hex!("01222222223333333344444444550000008d0000000000007c90"), 0x00ed21), + (hex!("01222222223333333344444444550000008e00000000000008f0"), 0x00edc1), + (hex!("01222222223333333344444444550000008e00000000000058f0"), 0x00ee61), + (hex!("01222222223333333344444444550000008f0000000000000900"), 0x00ef01), + (hex!("01222222223333333344444444550000008f0000000000005a30"), 0x00efa1), + (hex!("0122222222333333334444444455000000900000000000000910"), 0x00f041), + (hex!("0122222222333333334444444455000000900000000000006130"), 0x00f0e1), + (hex!("0122222222333333334444444455000000900000000000006550"), 0x00f181), + (hex!("0122222222333333334444444455000000910000000000000920"), 0x00f221), + (hex!("01222222223333333344444444550000009100000000000079f0"), 0x00f2c1), + (hex!("0122222222333333334444444455000000920000000000000930"), 0x00f361), + (hex!("0122222222333333334444444455000000920000000000005620"), 0x00f401), + (hex!("0122222222333333334444444455000000920000000000005e90"), 0x00f4a1), + (hex!("01222222223333333344444444550000009200000000000063d0"), 0x00f541), + (hex!("01222222223333333344444444550000009200000000000076c0"), 0x00f5e1), + (hex!("0122222222333333334444444455000000930000000000000940"), 0x00f681), + (hex!("01222222223333333344444444550000009300000000000044e0"), 0x00f721), + (hex!("0122222222333333334444444455000000940000000000000950"), 0x00f7c1), + (hex!("0122222222333333334444444455000000940000000000007a30"), 0x00f861), + (hex!("0122222222333333334444444455000000950000000000000960"), 0x00f901), + (hex!("0122222222333333334444444455000000950000000000007a70"), 0x00f9a1), + (hex!("0122222222333333334444444455000000960000000000000970"), 0x00fa41), + (hex!("0122222222333333334444444455000000970000000000000980"), 0x00fae1), + (hex!("0122222222333333334444444455000000970000000000007330"), 0x00fb81), + (hex!("0122222222333333334444444455000000980000000000000990"), 0x00fc21), + (hex!("0122222222333333334444444455000000980000000000005af0"), 0x00fcc1), + (hex!("0122222222333333334444444455000000980000000000007ae0"), 0x00fd61), + (hex!("01222222223333333344444444550000009900000000000009a0"), 0x00fe01), + (hex!("0122222222333333334444444455000000990000000000005160"), 0x00fea1), + (hex!("0122222222333333334444444455000000990000000000006850"), 0x00ff41), + (hex!("01222222223333333344444444550000009a00000000000009b0"), 0x00ffe1), + (hex!("01222222223333333344444444550000009b00000000000009c0"), 0x010081), + (hex!("01222222223333333344444444550000009b0000000000005010"), 0x010121), + (hex!("01222222223333333344444444550000009c00000000000009d0"), 0x0101c1), + (hex!("01222222223333333344444444550000009c00000000000042e0"), 0x010261), + (hex!("01222222223333333344444444550000009d00000000000009e0"), 0x010301), + (hex!("01222222223333333344444444550000009d00000000000057f0"), 0x0103a1), + (hex!("01222222223333333344444444550000009e00000000000009f0"), 0x010441), + (hex!("01222222223333333344444444550000009e0000000000004ef0"), 0x0104e1), + (hex!("01222222223333333344444444550000009f0000000000000a00"), 0x010581), + (hex!("01222222223333333344444444550000009f0000000000006110"), 0x010621), + (hex!("0122222222333333334444444455000000a00000000000000a10"), 0x0106c1), + (hex!("0122222222333333334444444455000000a10000000000000a20"), 0x010761), + (hex!("0122222222333333334444444455000000a100000000000040d0"), 0x010801), + (hex!("0122222222333333334444444455000000a10000000000007670"), 0x0108a1), + 
(hex!("0122222222333333334444444455000000a20000000000000a30"), 0x010941), + (hex!("0122222222333333334444444455000000a200000000000074d0"), 0x0109e1), + (hex!("0122222222333333334444444455000000a30000000000000a40"), 0x010a81), + (hex!("0122222222333333334444444455000000a30000000000004c90"), 0x010b21), + (hex!("0122222222333333334444444455000000a40000000000000a50"), 0x010bc1), + (hex!("0122222222333333334444444455000000a50000000000000a60"), 0x010c61), + (hex!("0122222222333333334444444455000000a60000000000000a70"), 0x010d01), + (hex!("0122222222333333334444444455000000a60000000000006d80"), 0x010da1), + (hex!("0122222222333333334444444455000000a60000000000007830"), 0x010e41), + (hex!("0122222222333333334444444455000000a70000000000000a80"), 0x010ee1), + (hex!("0122222222333333334444444455000000a700000000000064f0"), 0x010f81), + (hex!("0122222222333333334444444455000000a80000000000000a90"), 0x011021), + (hex!("0122222222333333334444444455000000a90000000000000aa0"), 0x0110c1), + (hex!("0122222222333333334444444455000000a90000000000005e30"), 0x011161), + (hex!("0122222222333333334444444455000000aa0000000000000ab0"), 0x011201), + (hex!("0122222222333333334444444455000000ab0000000000000ac0"), 0x0112a1), + (hex!("0122222222333333334444444455000000ac0000000000000ad0"), 0x011341), + (hex!("0122222222333333334444444455000000ac0000000000006d20"), 0x0113e1), + (hex!("0122222222333333334444444455000000ac0000000000007000"), 0x011481), + (hex!("0122222222333333334444444455000000ad0000000000000ae0"), 0x011521), + (hex!("0122222222333333334444444455000000ae0000000000000af0"), 0x0115c1), + (hex!("0122222222333333334444444455000000ae0000000000004a10"), 0x011661), + (hex!("0122222222333333334444444455000000af0000000000000b00"), 0x011701), + (hex!("0122222222333333334444444455000000af0000000000004e10"), 0x0117a1), + (hex!("0122222222333333334444444455000000b00000000000000b10"), 0x011841), + (hex!("0122222222333333334444444455000000b00000000000004280"), 0x0118e1), + (hex!("0122222222333333334444444455000000b000000000000077e0"), 0x011981), + (hex!("0122222222333333334444444455000000b10000000000000b20"), 0x011a21), + (hex!("0122222222333333334444444455000000b20000000000000b30"), 0x011ac1), + (hex!("0122222222333333334444444455000000b30000000000000b40"), 0x011b61), + (hex!("0122222222333333334444444455000000b30000000000004bc0"), 0x011c01), + (hex!("0122222222333333334444444455000000b40000000000000b50"), 0x011ca1), + (hex!("0122222222333333334444444455000000b50000000000000b60"), 0x011d41), + (hex!("0122222222333333334444444455000000b50000000000004fa0"), 0x011de1), + (hex!("0122222222333333334444444455000000b50000000000006a60"), 0x011e81), + (hex!("0122222222333333334444444455000000b60000000000000b70"), 0x011f21), + (hex!("0122222222333333334444444455000000b60000000000005630"), 0x011fc1), + (hex!("0122222222333333334444444455000000b70000000000000b80"), 0x012061), + (hex!("0122222222333333334444444455000000b80000000000000b90"), 0x012101), + (hex!("0122222222333333334444444455000000b80000000000006f80"), 0x0121a1), + (hex!("0122222222333333334444444455000000b90000000000000ba0"), 0x012241), + (hex!("0122222222333333334444444455000000ba0000000000000bb0"), 0x0122e1), + (hex!("0122222222333333334444444455000000bb0000000000000bc0"), 0x012381), + (hex!("0122222222333333334444444455000000bb00000000000047c0"), 0x012421), + (hex!("0122222222333333334444444455000000bb0000000000006060"), 0x0124c1), + (hex!("0122222222333333334444444455000000bc0000000000000bd0"), 0x012561), + 
(hex!("0122222222333333334444444455000000bd0000000000000be0"), 0x012601), + (hex!("0122222222333333334444444455000000bd0000000000004e80"), 0x0126a1), + (hex!("0122222222333333334444444455000000be0000000000000bf0"), 0x012741), + (hex!("0122222222333333334444444455000000bf0000000000000c00"), 0x0127e1), + (hex!("0122222222333333334444444455000000bf00000000000047a0"), 0x012881), + (hex!("0122222222333333334444444455000000bf0000000000006da0"), 0x012921), + (hex!("0122222222333333334444444455000000c00000000000000c10"), 0x0129c1), + (hex!("0122222222333333334444444455000000c10000000000000c20"), 0x012a61), + (hex!("0122222222333333334444444455000000c20000000000000c30"), 0x012b01), + (hex!("0122222222333333334444444455000000c20000000000004bd0"), 0x012ba1), + (hex!("0122222222333333334444444455000000c20000000000006ac0"), 0x012c41), + (hex!("0122222222333333334444444455000000c30000000000000c40"), 0x012ce1), + (hex!("0122222222333333334444444455000000c30000000000004660"), 0x012d81), + (hex!("0122222222333333334444444455000000c40000000000000c50"), 0x012e21), + (hex!("0122222222333333334444444455000000c50000000000000c60"), 0x012ec1), + (hex!("0122222222333333334444444455000000c60000000000000c70"), 0x012f61), + (hex!("0122222222333333334444444455000000c60000000000005880"), 0x013001), + (hex!("0122222222333333334444444455000000c60000000000006b70"), 0x0130a1), + (hex!("0122222222333333334444444455000000c70000000000000c80"), 0x013141), + (hex!("0122222222333333334444444455000000c80000000000000c90"), 0x0131e1), + (hex!("0122222222333333334444444455000000c80000000000005310"), 0x013281), + (hex!("0122222222333333334444444455000000c80000000000005db0"), 0x013321), + (hex!("0122222222333333334444444455000000c80000000000007040"), 0x0133c1), + (hex!("0122222222333333334444444455000000c80000000000007290"), 0x013461), + (hex!("0122222222333333334444444455000000c90000000000000ca0"), 0x013501), + (hex!("0122222222333333334444444455000000c90000000000004fe0"), 0x0135a1), + (hex!("0122222222333333334444444455000000ca0000000000000cb0"), 0x013641), + (hex!("0122222222333333334444444455000000ca0000000000006140"), 0x0136e1), + (hex!("0122222222333333334444444455000000ca0000000000007700"), 0x013781), + (hex!("0122222222333333334444444455000000cb0000000000000cc0"), 0x013821), + (hex!("0122222222333333334444444455000000cc0000000000000cd0"), 0x0138c1), + (hex!("0122222222333333334444444455000000cd0000000000000ce0"), 0x013961), + (hex!("0122222222333333334444444455000000cd0000000000003f20"), 0x013a01), + (hex!("0122222222333333334444444455000000cd00000000000040f0"), 0x013aa1), + (hex!("0122222222333333334444444455000000cd0000000000004ec0"), 0x013b41), + (hex!("0122222222333333334444444455000000ce0000000000000cf0"), 0x013be1), + (hex!("0122222222333333334444444455000000ce0000000000007200"), 0x013c81), + (hex!("0122222222333333334444444455000000cf0000000000000d00"), 0x013d21), + (hex!("0122222222333333334444444455000000cf00000000000046a0"), 0x013dc1), + (hex!("0122222222333333334444444455000000cf0000000000005960"), 0x013e61), + (hex!("0122222222333333334444444455000000d00000000000000d10"), 0x013f01), + (hex!("0122222222333333334444444455000000d00000000000005f30"), 0x013fa1), + (hex!("0122222222333333334444444455000000d10000000000000d20"), 0x014041), + (hex!("0122222222333333334444444455000000d10000000000007a00"), 0x0140e1), + (hex!("0122222222333333334444444455000000d20000000000000d30"), 0x014181), + (hex!("0122222222333333334444444455000000d30000000000000d40"), 0x014221), + 
(hex!("0122222222333333334444444455000000d40000000000000d50"), 0x0142c1), + (hex!("0122222222333333334444444455000000d50000000000000d60"), 0x014361), + (hex!("0122222222333333334444444455000000d50000000000004960"), 0x014401), + (hex!("0122222222333333334444444455000000d500000000000055d0"), 0x0144a1), + (hex!("0122222222333333334444444455000000d500000000000067d0"), 0x014541), + (hex!("0122222222333333334444444455000000d60000000000000d70"), 0x0145e1), + (hex!("0122222222333333334444444455000000d70000000000000d80"), 0x014681), + (hex!("0122222222333333334444444455000000d80000000000000d90"), 0x014721), + (hex!("0122222222333333334444444455000000d800000000000065f0"), 0x0147c1), + (hex!("0122222222333333334444444455000000d90000000000000da0"), 0x014861), + (hex!("0122222222333333334444444455000000d90000000000004980"), 0x014901), + (hex!("0122222222333333334444444455000000da0000000000000db0"), 0x0149a1), + (hex!("0122222222333333334444444455000000da00000000000048c0"), 0x014a41), + (hex!("0122222222333333334444444455000000da00000000000072c0"), 0x014ae1), + (hex!("0122222222333333334444444455000000da00000000000076b0"), 0x014b81), + (hex!("0122222222333333334444444455000000db0000000000000dc0"), 0x014c21), + (hex!("0122222222333333334444444455000000dc0000000000000dd0"), 0x014cc1), + (hex!("0122222222333333334444444455000000dc00000000000040a0"), 0x014d61), + (hex!("0122222222333333334444444455000000dc00000000000074c0"), 0x014e01), + (hex!("0122222222333333334444444455000000dd0000000000000de0"), 0x014ea1), + (hex!("0122222222333333334444444455000000dd0000000000004e50"), 0x014f41), + (hex!("0122222222333333334444444455000000dd0000000000007270"), 0x014fe1), + (hex!("0122222222333333334444444455000000de0000000000000df0"), 0x015081), + (hex!("0122222222333333334444444455000000de00000000000078d0"), 0x015121), + (hex!("0122222222333333334444444455000000df0000000000000e00"), 0x0151c1), + (hex!("0122222222333333334444444455000000df0000000000004d30"), 0x015261), + (hex!("0122222222333333334444444455000000df0000000000006c30"), 0x015301), + (hex!("0122222222333333334444444455000000e00000000000000e10"), 0x0153a1), + (hex!("0122222222333333334444444455000000e00000000000005d30"), 0x015441), + (hex!("0122222222333333334444444455000000e10000000000000e20"), 0x0154e1), + (hex!("0122222222333333334444444455000000e10000000000004610"), 0x015581), + (hex!("0122222222333333334444444455000000e100000000000051d0"), 0x015621), + (hex!("0122222222333333334444444455000000e10000000000005f10"), 0x0156c1), + (hex!("0122222222333333334444444455000000e20000000000000e30"), 0x015761), + (hex!("0122222222333333334444444455000000e20000000000007a90"), 0x015801), + (hex!("0122222222333333334444444455000000e30000000000000e40"), 0x0158a1), + (hex!("0122222222333333334444444455000000e30000000000005ae0"), 0x015941), + (hex!("0122222222333333334444444455000000e40000000000000e50"), 0x0159e1), + (hex!("0122222222333333334444444455000000e50000000000000e60"), 0x015a81), + (hex!("0122222222333333334444444455000000e50000000000004700"), 0x015b21), + (hex!("0122222222333333334444444455000000e500000000000065d0"), 0x015bc1), + (hex!("0122222222333333334444444455000000e60000000000000e70"), 0x015c61), + (hex!("0122222222333333334444444455000000e60000000000004fd0"), 0x015d01), + (hex!("0122222222333333334444444455000000e70000000000000e80"), 0x015da1), + (hex!("0122222222333333334444444455000000e70000000000005150"), 0x015e41), + (hex!("0122222222333333334444444455000000e70000000000005920"), 0x015ee1), + 
(hex!("0122222222333333334444444455000000e80000000000000e90"), 0x015f81), + (hex!("0122222222333333334444444455000000e80000000000004320"), 0x016021), + (hex!("0122222222333333334444444455000000e80000000000005ec0"), 0x0160c1), + (hex!("0122222222333333334444444455000000e90000000000000ea0"), 0x016161), + (hex!("0122222222333333334444444455000000e900000000000043b0"), 0x016201), + (hex!("0122222222333333334444444455000000ea0000000000000eb0"), 0x0162a1), + (hex!("0122222222333333334444444455000000ea0000000000003ea0"), 0x016341), + (hex!("0122222222333333334444444455000000ea0000000000004f50"), 0x0163e1), + (hex!("0122222222333333334444444455000000ea0000000000007520"), 0x016481), + (hex!("0122222222333333334444444455000000eb0000000000000ec0"), 0x016521), + (hex!("0122222222333333334444444455000000ec0000000000000ed0"), 0x0165c1), + (hex!("0122222222333333334444444455000000ec0000000000006670"), 0x016661), + (hex!("0122222222333333334444444455000000ed0000000000000ee0"), 0x016701), + (hex!("0122222222333333334444444455000000ee0000000000000ef0"), 0x0167a1), + (hex!("0122222222333333334444444455000000ee0000000000004d10"), 0x016841), + (hex!("0122222222333333334444444455000000ef0000000000000f00"), 0x0168e1), + (hex!("0122222222333333334444444455000000f00000000000000f10"), 0x016981), + (hex!("0122222222333333334444444455000000f00000000000007220"), 0x016a21), + (hex!("0122222222333333334444444455000000f00000000000007540"), 0x016ac1), + (hex!("0122222222333333334444444455000000f10000000000000f20"), 0x016b61), + (hex!("0122222222333333334444444455000000f100000000000066f0"), 0x016c01), + (hex!("0122222222333333334444444455000000f20000000000000f30"), 0x016ca1), + (hex!("0122222222333333334444444455000000f20000000000007810"), 0x016d41), + (hex!("0122222222333333334444444455000000f30000000000000f40"), 0x016de1), + (hex!("0122222222333333334444444455000000f30000000000007b70"), 0x016e81), + (hex!("0122222222333333334444444455000000f40000000000000f50"), 0x016f21), + (hex!("0122222222333333334444444455000000f400000000000059c0"), 0x016fc1), + (hex!("0122222222333333334444444455000000f50000000000000f60"), 0x017061), + (hex!("0122222222333333334444444455000000f50000000000003fb0"), 0x017101), + (hex!("0122222222333333334444444455000000f50000000000005740"), 0x0171a1), + (hex!("0122222222333333334444444455000000f500000000000064d0"), 0x017241), + (hex!("0122222222333333334444444455000000f50000000000006960"), 0x0172e1), + (hex!("0122222222333333334444444455000000f60000000000000f70"), 0x017381), + (hex!("0122222222333333334444444455000000f60000000000006d00"), 0x017421), + (hex!("0122222222333333334444444455000000f70000000000000f80"), 0x0174c1), + (hex!("0122222222333333334444444455000000f80000000000000f90"), 0x017561), + (hex!("0122222222333333334444444455000000f90000000000000fa0"), 0x017601), + (hex!("0122222222333333334444444455000000fa0000000000000fb0"), 0x0176a1), + (hex!("0122222222333333334444444455000000fa00000000000067b0"), 0x017741), + (hex!("0122222222333333334444444455000000fb0000000000000fc0"), 0x0177e1), + (hex!("0122222222333333334444444455000000fb0000000000004eb0"), 0x017881), + (hex!("0122222222333333334444444455000000fb0000000000006ef0"), 0x017921), + (hex!("0122222222333333334444444455000000fc0000000000000fd0"), 0x0179c1), + (hex!("0122222222333333334444444455000000fc0000000000004470"), 0x017a61), + (hex!("0122222222333333334444444455000000fc0000000000005940"), 0x017b01), + (hex!("0122222222333333334444444455000000fd0000000000000fe0"), 0x017ba1), + 
(hex!("0122222222333333334444444455000000fe0000000000000ff0"), 0x017c41), + (hex!("0122222222333333334444444455000000ff0000000000001000"), 0x017ce1), + (hex!("0122222222333333334444444455000000ff0000000000005690"), 0x017d81), + (hex!("0122222222333333334444444455000001000000000000001010"), 0x017e21), + (hex!("0122222222333333334444444455000001000000000000005210"), 0x017ec1), + (hex!("01222222223333333344444444550000010000000000000070a0"), 0x017f61), + (hex!("0122222222333333334444444455000001010000000000001020"), 0x018001), + (hex!("0122222222333333334444444455000001010000000000006b80"), 0x0180a1), + (hex!("0122222222333333334444444455000001020000000000001030"), 0x018141), + (hex!("0122222222333333334444444455000001030000000000001040"), 0x0181e1), + (hex!("0122222222333333334444444455000001030000000000004c80"), 0x018281), + (hex!("0122222222333333334444444455000001040000000000001050"), 0x018321), + (hex!("0122222222333333334444444455000001040000000000004850"), 0x0183c1), + (hex!("01222222223333333344444444550000010400000000000057b0"), 0x018461), + (hex!("0122222222333333334444444455000001050000000000001060"), 0x018501), + (hex!("01222222223333333344444444550000010500000000000048d0"), 0x0185a1), + (hex!("0122222222333333334444444455000001050000000000007870"), 0x018641), + (hex!("0122222222333333334444444455000001060000000000001070"), 0x0186e1), + (hex!("0122222222333333334444444455000001060000000000004f90"), 0x018781), + (hex!("0122222222333333334444444455000001060000000000006270"), 0x018821), + (hex!("0122222222333333334444444455000001070000000000001080"), 0x0188c1), + (hex!("01222222223333333344444444550000010700000000000063b0"), 0x018961), + (hex!("0122222222333333334444444455000001080000000000001090"), 0x018a01), + (hex!("01222222223333333344444444550000010900000000000010a0"), 0x018aa1), + (hex!("0122222222333333334444444455000001090000000000006f40"), 0x018b41), + (hex!("01222222223333333344444444550000010a00000000000010b0"), 0x018be1), + (hex!("01222222223333333344444444550000010a0000000000006640"), 0x018c81), + (hex!("01222222223333333344444444550000010b00000000000010c0"), 0x018d21), + (hex!("01222222223333333344444444550000010c00000000000010d0"), 0x018dc1), + (hex!("01222222223333333344444444550000010d00000000000010e0"), 0x018e61), + (hex!("01222222223333333344444444550000010e00000000000010f0"), 0x018f01), + (hex!("01222222223333333344444444550000010e0000000000005c40"), 0x018fa1), + (hex!("01222222223333333344444444550000010e0000000000007ba0"), 0x019041), + (hex!("01222222223333333344444444550000010f0000000000001100"), 0x0190e1), + (hex!("01222222223333333344444444550000010f0000000000005c30"), 0x019181), + (hex!("0122222222333333334444444455000001100000000000001110"), 0x019221), + (hex!("0122222222333333334444444455000001100000000000007640"), 0x0192c1), + (hex!("0122222222333333334444444455000001110000000000001120"), 0x019361), + (hex!("01222222223333333344444444550000011100000000000052c0"), 0x019401), + (hex!("0122222222333333334444444455000001110000000000005710"), 0x0194a1), + (hex!("0122222222333333334444444455000001110000000000006a00"), 0x019541), + (hex!("0122222222333333334444444455000001120000000000001130"), 0x0195e1), + (hex!("0122222222333333334444444455000001130000000000001140"), 0x019681), + (hex!("0122222222333333334444444455000001140000000000001150"), 0x019721), + (hex!("0122222222333333334444444455000001140000000000003fa0"), 0x0197c1), + (hex!("01222222223333333344444444550000011400000000000054b0"), 0x019861), + 
(hex!("0122222222333333334444444455000001140000000000006070"), 0x019901), + (hex!("0122222222333333334444444455000001150000000000001160"), 0x0199a1), + (hex!("0122222222333333334444444455000001150000000000005320"), 0x019a41), + (hex!("0122222222333333334444444455000001150000000000006600"), 0x019ae1), + (hex!("0122222222333333334444444455000001150000000000006df0"), 0x019b81), + (hex!("01222222223333333344444444550000011500000000000079c0"), 0x019c21), + (hex!("0122222222333333334444444455000001160000000000001170"), 0x019cc1), + (hex!("0122222222333333334444444455000001170000000000001180"), 0x019d61), + (hex!("0122222222333333334444444455000001170000000000004a60"), 0x019e01), + (hex!("01222222223333333344444444550000011700000000000063c0"), 0x019ea1), + (hex!("0122222222333333334444444455000001180000000000001190"), 0x019f41), + (hex!("0122222222333333334444444455000001180000000000004530"), 0x019fe1), + (hex!("01222222223333333344444444550000011800000000000077c0"), 0x01a081), + (hex!("01222222223333333344444444550000011900000000000011a0"), 0x01a121), + (hex!("01222222223333333344444444550000011a00000000000011b0"), 0x01a1c1), + (hex!("01222222223333333344444444550000011a00000000000041c0"), 0x01a261), + (hex!("01222222223333333344444444550000011a00000000000061e0"), 0x01a301), + (hex!("01222222223333333344444444550000011b00000000000011c0"), 0x01a3a1), + (hex!("01222222223333333344444444550000011c00000000000011d0"), 0x01a441), + (hex!("01222222223333333344444444550000011c0000000000005f90"), 0x01a4e1), + (hex!("01222222223333333344444444550000011d00000000000011e0"), 0x01a581), + (hex!("01222222223333333344444444550000011d0000000000004160"), 0x01a621), + (hex!("01222222223333333344444444550000011e00000000000011f0"), 0x01a6c1), + (hex!("01222222223333333344444444550000011e00000000000056d0"), 0x01a761), + (hex!("01222222223333333344444444550000011f0000000000001200"), 0x01a801), + (hex!("01222222223333333344444444550000011f0000000000004510"), 0x01a8a1), + (hex!("0122222222333333334444444455000001200000000000001210"), 0x01a941), + (hex!("0122222222333333334444444455000001210000000000001220"), 0x01a9e1), + (hex!("0122222222333333334444444455000001210000000000005140"), 0x01aa81), + (hex!("0122222222333333334444444455000001210000000000006710"), 0x01ab21), + (hex!("0122222222333333334444444455000001210000000000006f50"), 0x01abc1), + (hex!("0122222222333333334444444455000001220000000000001230"), 0x01ac61), + (hex!("0122222222333333334444444455000001220000000000005570"), 0x01ad01), + (hex!("0122222222333333334444444455000001220000000000007ac0"), 0x01ada1), + (hex!("0122222222333333334444444455000001230000000000001240"), 0x01ae41), + (hex!("0122222222333333334444444455000001240000000000001250"), 0x01aee1), + (hex!("0122222222333333334444444455000001240000000000006cd0"), 0x01af81), + (hex!("0122222222333333334444444455000001250000000000001260"), 0x01b021), + (hex!("01222222223333333344444444550000012500000000000046b0"), 0x01b0c1), + (hex!("0122222222333333334444444455000001250000000000005eb0"), 0x01b161), + (hex!("0122222222333333334444444455000001260000000000001270"), 0x01b201), + (hex!("0122222222333333334444444455000001260000000000004630"), 0x01b2a1), + (hex!("0122222222333333334444444455000001270000000000001280"), 0x01b341), + (hex!("0122222222333333334444444455000001270000000000004ff0"), 0x01b3e1), + (hex!("0122222222333333334444444455000001270000000000006ec0"), 0x01b481), + (hex!("0122222222333333334444444455000001280000000000001290"), 0x01b521), + 
(hex!("01222222223333333344444444550000012900000000000012a0"), 0x01b5c1), + (hex!("0122222222333333334444444455000001290000000000005f60"), 0x01b661), + (hex!("01222222223333333344444444550000012a00000000000012b0"), 0x01b701), + (hex!("01222222223333333344444444550000012a0000000000005480"), 0x01b7a1), + (hex!("01222222223333333344444444550000012b00000000000012c0"), 0x01b841), + (hex!("01222222223333333344444444550000012b00000000000065a0"), 0x01b8e1), + (hex!("01222222223333333344444444550000012b00000000000066c0"), 0x01b981), + (hex!("01222222223333333344444444550000012c00000000000012d0"), 0x01ba21), + (hex!("01222222223333333344444444550000012c00000000000064b0"), 0x01bac1), + (hex!("01222222223333333344444444550000012d00000000000012e0"), 0x01bb61), + (hex!("01222222223333333344444444550000012d00000000000049c0"), 0x01bc01), + (hex!("01222222223333333344444444550000012d0000000000004bf0"), 0x01bca1), + (hex!("01222222223333333344444444550000012e00000000000012f0"), 0x01bd41), + (hex!("01222222223333333344444444550000012e0000000000005ed0"), 0x01bde1), + (hex!("01222222223333333344444444550000012f0000000000001300"), 0x01be81), + (hex!("01222222223333333344444444550000012f00000000000049a0"), 0x01bf21), + (hex!("0122222222333333334444444455000001300000000000001310"), 0x01bfc1), + (hex!("0122222222333333334444444455000001300000000000007840"), 0x01c061), + (hex!("0122222222333333334444444455000001310000000000001320"), 0x01c101), + (hex!("0122222222333333334444444455000001310000000000005f70"), 0x01c1a1), + (hex!("0122222222333333334444444455000001320000000000001330"), 0x01c241), + (hex!("0122222222333333334444444455000001320000000000005a00"), 0x01c2e1), + (hex!("0122222222333333334444444455000001330000000000001340"), 0x01c381), + (hex!("0122222222333333334444444455000001330000000000006c70"), 0x01c421), + (hex!("0122222222333333334444444455000001340000000000001350"), 0x01c4c1), + (hex!("0122222222333333334444444455000001340000000000005c60"), 0x01c561), + (hex!("0122222222333333334444444455000001350000000000001360"), 0x01c601), + (hex!("0122222222333333334444444455000001350000000000004f10"), 0x01c6a1), + (hex!("0122222222333333334444444455000001360000000000001370"), 0x01c741), + (hex!("0122222222333333334444444455000001360000000000004c60"), 0x01c7e1), + (hex!("0122222222333333334444444455000001370000000000001380"), 0x01c881), + (hex!("0122222222333333334444444455000001380000000000001390"), 0x01c921), + (hex!("01222222223333333344444444550000013900000000000013a0"), 0x01c9c1), + (hex!("0122222222333333334444444455000001390000000000004ea0"), 0x01ca61), + (hex!("01222222223333333344444444550000013a00000000000013b0"), 0x01cb01), + (hex!("01222222223333333344444444550000013a0000000000007350"), 0x01cba1), + (hex!("01222222223333333344444444550000013b00000000000013c0"), 0x01cc41), + (hex!("01222222223333333344444444550000013c00000000000013d0"), 0x01cce1), + (hex!("01222222223333333344444444550000013c0000000000007050"), 0x01cd81), + (hex!("01222222223333333344444444550000013d00000000000013e0"), 0x01ce21), + (hex!("01222222223333333344444444550000013d0000000000006bd0"), 0x01cec1), + (hex!("01222222223333333344444444550000013e00000000000013f0"), 0x01cf61), + (hex!("01222222223333333344444444550000013e00000000000058e0"), 0x01d001), + (hex!("01222222223333333344444444550000013f0000000000001400"), 0x01d0a1), + (hex!("01222222223333333344444444550000013f0000000000004740"), 0x01d141), + (hex!("0122222222333333334444444455000001400000000000001410"), 0x01d1e1), + 
(hex!("0122222222333333334444444455000001400000000000003f10"), 0x01d281), + (hex!("0122222222333333334444444455000001400000000000006d40"), 0x01d321), + (hex!("01222222223333333344444444550000014000000000000072d0"), 0x01d3c1), + (hex!("0122222222333333334444444455000001410000000000001420"), 0x01d461), + (hex!("0122222222333333334444444455000001420000000000001430"), 0x01d501), + (hex!("0122222222333333334444444455000001430000000000001440"), 0x01d5a1), + (hex!("0122222222333333334444444455000001440000000000001450"), 0x01d641), + (hex!("0122222222333333334444444455000001450000000000001460"), 0x01d6e1), + (hex!("0122222222333333334444444455000001460000000000001470"), 0x01d781), + (hex!("01222222223333333344444444550000014600000000000055c0"), 0x01d821), + (hex!("0122222222333333334444444455000001470000000000001480"), 0x01d8c1), + (hex!("0122222222333333334444444455000001470000000000004570"), 0x01d961), + (hex!("0122222222333333334444444455000001470000000000004be0"), 0x01da01), + (hex!("0122222222333333334444444455000001480000000000001490"), 0x01daa1), + (hex!("0122222222333333334444444455000001480000000000005360"), 0x01db41), + (hex!("01222222223333333344444444550000014900000000000014a0"), 0x01dbe1), + (hex!("01222222223333333344444444550000014a00000000000014b0"), 0x01dc81), + (hex!("01222222223333333344444444550000014a00000000000053d0"), 0x01dd21), + (hex!("01222222223333333344444444550000014b00000000000014c0"), 0x01ddc1), + (hex!("01222222223333333344444444550000014b0000000000005950"), 0x01de61), + (hex!("01222222223333333344444444550000014c00000000000014d0"), 0x01df01), + (hex!("01222222223333333344444444550000014c0000000000004f60"), 0x01dfa1), + (hex!("01222222223333333344444444550000014d00000000000014e0"), 0x01e041), + (hex!("01222222223333333344444444550000014d0000000000004520"), 0x01e0e1), + (hex!("01222222223333333344444444550000014d0000000000005200"), 0x01e181), + (hex!("01222222223333333344444444550000014e00000000000014f0"), 0x01e221), + (hex!("01222222223333333344444444550000014e0000000000005bd0"), 0x01e2c1), + (hex!("01222222223333333344444444550000014f0000000000001500"), 0x01e361), + (hex!("01222222223333333344444444550000014f00000000000060d0"), 0x01e401), + (hex!("0122222222333333334444444455000001500000000000001510"), 0x01e4a1), + (hex!("01222222223333333344444444550000015000000000000075e0"), 0x01e541), + (hex!("0122222222333333334444444455000001510000000000001520"), 0x01e5e1), + (hex!("0122222222333333334444444455000001510000000000005c00"), 0x01e681), + (hex!("0122222222333333334444444455000001510000000000006af0"), 0x01e721), + (hex!("0122222222333333334444444455000001510000000000007b80"), 0x01e7c1), + (hex!("0122222222333333334444444455000001520000000000001530"), 0x01e861), + (hex!("0122222222333333334444444455000001520000000000004c70"), 0x01e901), + (hex!("0122222222333333334444444455000001530000000000001540"), 0x01e9a1), + (hex!("0122222222333333334444444455000001540000000000001550"), 0x01ea41), + (hex!("0122222222333333334444444455000001540000000000007cd0"), 0x01eae1), + (hex!("0122222222333333334444444455000001550000000000001560"), 0x01eb81), + (hex!("0122222222333333334444444455000001550000000000004ae0"), 0x01ec21), + (hex!("01222222223333333344444444550000015500000000000068c0"), 0x01ecc1), + (hex!("0122222222333333334444444455000001560000000000001570"), 0x01ed61), + (hex!("01222222223333333344444444550000015600000000000064a0"), 0x01ee01), + (hex!("0122222222333333334444444455000001570000000000001580"), 0x01eea1), + 
(hex!("0122222222333333334444444455000001580000000000001590"), 0x01ef41), + (hex!("0122222222333333334444444455000001580000000000006d30"), 0x01efe1), + (hex!("01222222223333333344444444550000015800000000000074f0"), 0x01f081), + (hex!("01222222223333333344444444550000015900000000000015a0"), 0x01f121), + (hex!("01222222223333333344444444550000015900000000000053a0"), 0x01f1c1), + (hex!("01222222223333333344444444550000015900000000000055e0"), 0x01f261), + (hex!("0122222222333333334444444455000001590000000000006210"), 0x01f301), + (hex!("01222222223333333344444444550000015900000000000067c0"), 0x01f3a1), + (hex!("01222222223333333344444444550000015a00000000000015b0"), 0x01f441), + (hex!("01222222223333333344444444550000015b00000000000015c0"), 0x01f4e1), + (hex!("01222222223333333344444444550000015c00000000000015d0"), 0x01f581), + (hex!("01222222223333333344444444550000015c0000000000004d80"), 0x01f621), + (hex!("01222222223333333344444444550000015c00000000000073f0"), 0x01f6c1), + (hex!("01222222223333333344444444550000015d00000000000015e0"), 0x01f761), + (hex!("01222222223333333344444444550000015e00000000000015f0"), 0x01f801), + (hex!("01222222223333333344444444550000015e0000000000004120"), 0x01f8a1), + (hex!("01222222223333333344444444550000015e0000000000004350"), 0x01f941), + (hex!("01222222223333333344444444550000015e0000000000007c50"), 0x01f9e1), + (hex!("01222222223333333344444444550000015f0000000000001600"), 0x01fa81), + (hex!("0122222222333333334444444455000001600000000000001610"), 0x01fb21), + (hex!("0122222222333333334444444455000001600000000000004840"), 0x01fbc1), + (hex!("0122222222333333334444444455000001600000000000004b10"), 0x01fc61), + (hex!("0122222222333333334444444455000001600000000000007060"), 0x01fd01), + (hex!("0122222222333333334444444455000001610000000000001620"), 0x01fda1), + (hex!("0122222222333333334444444455000001610000000000005300"), 0x01fe41), + (hex!("0122222222333333334444444455000001620000000000001630"), 0x01fee1), + (hex!("0122222222333333334444444455000001620000000000006530"), 0x01ff81), + (hex!("0122222222333333334444444455000001630000000000001640"), 0x020021), + (hex!("0122222222333333334444444455000001640000000000001650"), 0x0200c1), + (hex!("0122222222333333334444444455000001650000000000001660"), 0x020161), + (hex!("0122222222333333334444444455000001660000000000001670"), 0x020201), + (hex!("0122222222333333334444444455000001670000000000001680"), 0x0202a1), + (hex!("0122222222333333334444444455000001670000000000007310"), 0x020341), + (hex!("0122222222333333334444444455000001680000000000001690"), 0x0203e1), + (hex!("0122222222333333334444444455000001680000000000007b50"), 0x020481), + (hex!("01222222223333333344444444550000016900000000000016a0"), 0x020521), + (hex!("01222222223333333344444444550000016900000000000049d0"), 0x0205c1), + (hex!("01222222223333333344444444550000016a00000000000016b0"), 0x020661), + (hex!("01222222223333333344444444550000016a00000000000078b0"), 0x020701), + (hex!("01222222223333333344444444550000016b00000000000016c0"), 0x0207a1), + (hex!("01222222223333333344444444550000016b0000000000004100"), 0x020841), + (hex!("01222222223333333344444444550000016c00000000000016d0"), 0x0208e1), + (hex!("01222222223333333344444444550000016c0000000000006e00"), 0x020981), + (hex!("01222222223333333344444444550000016d00000000000016e0"), 0x020a21), + (hex!("01222222223333333344444444550000016e00000000000016f0"), 0x020ac1), + (hex!("01222222223333333344444444550000016e0000000000004ac0"), 0x020b61), + 
(hex!("01222222223333333344444444550000016e0000000000007820"), 0x020c01), + (hex!("01222222223333333344444444550000016f0000000000001700"), 0x020ca1), + (hex!("0122222222333333334444444455000001700000000000001710"), 0x020d41), + (hex!("0122222222333333334444444455000001700000000000005830"), 0x020de1), + (hex!("0122222222333333334444444455000001710000000000001720"), 0x020e81), + (hex!("01222222223333333344444444550000017100000000000072f0"), 0x020f21), + (hex!("0122222222333333334444444455000001720000000000001730"), 0x020fc1), + (hex!("0122222222333333334444444455000001720000000000004870"), 0x021061), + (hex!("01222222223333333344444444550000017200000000000070b0"), 0x021101), + (hex!("0122222222333333334444444455000001730000000000001740"), 0x0211a1), + (hex!("0122222222333333334444444455000001740000000000001750"), 0x021241), + (hex!("0122222222333333334444444455000001750000000000001760"), 0x0212e1), + (hex!("0122222222333333334444444455000001750000000000005670"), 0x021381), + (hex!("0122222222333333334444444455000001750000000000005870"), 0x021421), + (hex!("0122222222333333334444444455000001760000000000001770"), 0x0214c1), + (hex!("0122222222333333334444444455000001770000000000001780"), 0x021561), + (hex!("0122222222333333334444444455000001770000000000005000"), 0x021601), + (hex!("0122222222333333334444444455000001770000000000007090"), 0x0216a1), + (hex!("0122222222333333334444444455000001780000000000001790"), 0x021741), + (hex!("01222222223333333344444444550000017800000000000048a0"), 0x0217e1), + (hex!("0122222222333333334444444455000001780000000000006bf0"), 0x021881), + (hex!("01222222223333333344444444550000017900000000000017a0"), 0x021921), + (hex!("01222222223333333344444444550000017900000000000057d0"), 0x0219c1), + (hex!("0122222222333333334444444455000001790000000000006660"), 0x021a61), + (hex!("01222222223333333344444444550000017a00000000000017b0"), 0x021b01), + (hex!("01222222223333333344444444550000017a0000000000004970"), 0x021ba1), + (hex!("01222222223333333344444444550000017a0000000000005dc0"), 0x021c41), + (hex!("01222222223333333344444444550000017b00000000000017c0"), 0x021ce1), + (hex!("01222222223333333344444444550000017b0000000000004ee0"), 0x021d81), + (hex!("01222222223333333344444444550000017b00000000000054c0"), 0x021e21), + (hex!("01222222223333333344444444550000017c00000000000017d0"), 0x021ec1), + (hex!("01222222223333333344444444550000017c0000000000003fc0"), 0x021f61), + (hex!("01222222223333333344444444550000017c00000000000063e0"), 0x022001), + (hex!("01222222223333333344444444550000017c0000000000006520"), 0x0220a1), + (hex!("01222222223333333344444444550000017d00000000000017e0"), 0x022141), + (hex!("01222222223333333344444444550000017d0000000000006220"), 0x0221e1), + (hex!("01222222223333333344444444550000017d0000000000007120"), 0x022281), + (hex!("01222222223333333344444444550000017e00000000000017f0"), 0x022321), + (hex!("01222222223333333344444444550000017f0000000000001800"), 0x0223c1), + (hex!("0122222222333333334444444455000001800000000000001810"), 0x022461), + (hex!("0122222222333333334444444455000001810000000000001820"), 0x022501), + (hex!("01222222223333333344444444550000018100000000000041f0"), 0x0225a1), + (hex!("0122222222333333334444444455000001810000000000007590"), 0x022641), + (hex!("0122222222333333334444444455000001820000000000001830"), 0x0226e1), + (hex!("0122222222333333334444444455000001820000000000004ce0"), 0x022781), + (hex!("0122222222333333334444444455000001830000000000001840"), 0x022821), + 
(hex!("01222222223333333344444444550000018300000000000042c0"), 0x0228c1), + (hex!("0122222222333333334444444455000001840000000000001850"), 0x022961), + (hex!("0122222222333333334444444455000001840000000000004f70"), 0x022a01), + (hex!("0122222222333333334444444455000001850000000000001860"), 0x022aa1), + (hex!("0122222222333333334444444455000001850000000000006470"), 0x022b41), + (hex!("0122222222333333334444444455000001850000000000007500"), 0x022be1), + (hex!("0122222222333333334444444455000001860000000000001870"), 0x022c81), + (hex!("0122222222333333334444444455000001860000000000004770"), 0x022d21), + (hex!("0122222222333333334444444455000001870000000000001880"), 0x022dc1), + (hex!("0122222222333333334444444455000001870000000000006a30"), 0x022e61), + (hex!("0122222222333333334444444455000001880000000000001890"), 0x022f01), + (hex!("0122222222333333334444444455000001880000000000007410"), 0x022fa1), + (hex!("01222222223333333344444444550000018900000000000018a0"), 0x023041), + (hex!("01222222223333333344444444550000018900000000000044d0"), 0x0230e1), + (hex!("0122222222333333334444444455000001890000000000005ac0"), 0x023181), + (hex!("01222222223333333344444444550000018a00000000000018b0"), 0x023221), + (hex!("01222222223333333344444444550000018a0000000000006260"), 0x0232c1), + (hex!("01222222223333333344444444550000018a0000000000006d70"), 0x023361), + (hex!("01222222223333333344444444550000018b00000000000018c0"), 0x023401), + (hex!("01222222223333333344444444550000018b0000000000004aa0"), 0x0234a1), + (hex!("01222222223333333344444444550000018b0000000000006fd0"), 0x023541), + (hex!("01222222223333333344444444550000018c00000000000018d0"), 0x0235e1), + (hex!("01222222223333333344444444550000018c00000000000051b0"), 0x023681), + (hex!("01222222223333333344444444550000018c0000000000006650"), 0x023721), + (hex!("01222222223333333344444444550000018d00000000000018e0"), 0x0237c1), + (hex!("01222222223333333344444444550000018e00000000000018f0"), 0x023861), + (hex!("01222222223333333344444444550000018e00000000000041d0"), 0x023901), + (hex!("01222222223333333344444444550000018f0000000000001900"), 0x0239a1), + (hex!("01222222223333333344444444550000018f0000000000007600"), 0x023a41), + (hex!("0122222222333333334444444455000001900000000000001910"), 0x023ae1), + (hex!("0122222222333333334444444455000001900000000000005410"), 0x023b81), + (hex!("0122222222333333334444444455000001900000000000006760"), 0x023c21), + (hex!("0122222222333333334444444455000001910000000000001920"), 0x023cc1), + (hex!("0122222222333333334444444455000001920000000000001930"), 0x023d61), + (hex!("0122222222333333334444444455000001920000000000004ca0"), 0x023e01), + (hex!("0122222222333333334444444455000001920000000000005d80"), 0x023ea1), + (hex!("0122222222333333334444444455000001920000000000005fd0"), 0x023f41), + (hex!("01222222223333333344444444550000019200000000000070d0"), 0x023fe1), + (hex!("0122222222333333334444444455000001930000000000001940"), 0x024081), + (hex!("0122222222333333334444444455000001930000000000004010"), 0x024121), + (hex!("0122222222333333334444444455000001930000000000007ca0"), 0x0241c1), + (hex!("0122222222333333334444444455000001940000000000001950"), 0x024261), + (hex!("0122222222333333334444444455000001950000000000001960"), 0x024301), + (hex!("0122222222333333334444444455000001950000000000005380"), 0x0243a1), + (hex!("0122222222333333334444444455000001960000000000001970"), 0x024441), + (hex!("0122222222333333334444444455000001960000000000006de0"), 0x0244e1), + 
(hex!("0122222222333333334444444455000001970000000000001980"), 0x024581), + (hex!("01222222223333333344444444550000019700000000000048f0"), 0x024621), + (hex!("0122222222333333334444444455000001980000000000001990"), 0x0246c1), + (hex!("0122222222333333334444444455000001980000000000006510"), 0x024761), + (hex!("01222222223333333344444444550000019900000000000019a0"), 0x024801), + (hex!("0122222222333333334444444455000001990000000000007570"), 0x0248a1), + (hex!("0122222222333333334444444455000001990000000000007580"), 0x024941), + (hex!("01222222223333333344444444550000019a00000000000019b0"), 0x0249e1), + (hex!("01222222223333333344444444550000019a0000000000004050"), 0x024a81), + (hex!("01222222223333333344444444550000019a0000000000004ba0"), 0x024b21), + (hex!("01222222223333333344444444550000019a0000000000005540"), 0x024bc1), + (hex!("01222222223333333344444444550000019a00000000000061c0"), 0x024c61), + (hex!("01222222223333333344444444550000019a0000000000007c60"), 0x024d01), + (hex!("01222222223333333344444444550000019b00000000000019c0"), 0x024da1), + (hex!("01222222223333333344444444550000019b0000000000006240"), 0x024e41), + (hex!("01222222223333333344444444550000019c00000000000019d0"), 0x024ee1), + (hex!("01222222223333333344444444550000019d00000000000019e0"), 0x024f81), + (hex!("01222222223333333344444444550000019d0000000000004640"), 0x025021), + (hex!("01222222223333333344444444550000019d00000000000052a0"), 0x0250c1), + (hex!("01222222223333333344444444550000019d00000000000052b0"), 0x025161), + (hex!("01222222223333333344444444550000019e00000000000019f0"), 0x025201), + (hex!("01222222223333333344444444550000019f0000000000001a00"), 0x0252a1), + (hex!("01222222223333333344444444550000019f0000000000006b20"), 0x025341), + (hex!("0122222222333333334444444455000001a00000000000001a10"), 0x0253e1), + (hex!("0122222222333333334444444455000001a10000000000001a20"), 0x025481), + (hex!("0122222222333333334444444455000001a10000000000005460"), 0x025521), + (hex!("0122222222333333334444444455000001a10000000000005d20"), 0x0255c1), + (hex!("0122222222333333334444444455000001a100000000000068f0"), 0x025661), + (hex!("0122222222333333334444444455000001a20000000000001a30"), 0x025701), + (hex!("0122222222333333334444444455000001a20000000000007170"), 0x0257a1), + (hex!("0122222222333333334444444455000001a30000000000001a40"), 0x025841), + (hex!("0122222222333333334444444455000001a40000000000001a50"), 0x0258e1), + (hex!("0122222222333333334444444455000001a50000000000001a60"), 0x025981), + (hex!("0122222222333333334444444455000001a60000000000001a70"), 0x025a21), + (hex!("0122222222333333334444444455000001a70000000000001a80"), 0x025ac1), + (hex!("0122222222333333334444444455000001a70000000000005a90"), 0x025b61), + (hex!("0122222222333333334444444455000001a70000000000006440"), 0x025c01), + (hex!("0122222222333333334444444455000001a80000000000001a90"), 0x025ca1), + (hex!("0122222222333333334444444455000001a80000000000004800"), 0x025d41), + (hex!("0122222222333333334444444455000001a90000000000001aa0"), 0x025de1), + (hex!("0122222222333333334444444455000001aa0000000000001ab0"), 0x025e81), + (hex!("0122222222333333334444444455000001aa0000000000005b60"), 0x025f21), + (hex!("0122222222333333334444444455000001ab0000000000001ac0"), 0x025fc1), + (hex!("0122222222333333334444444455000001ab0000000000006700"), 0x026061), + (hex!("0122222222333333334444444455000001ab00000000000071d0"), 0x026101), + (hex!("0122222222333333334444444455000001ac0000000000001ad0"), 0x0261a1), + 
(hex!("0122222222333333334444444455000001ac0000000000007380"), 0x026241), + (hex!("0122222222333333334444444455000001ad0000000000001ae0"), 0x0262e1), + (hex!("0122222222333333334444444455000001ad0000000000006350"), 0x026381), + (hex!("0122222222333333334444444455000001ae0000000000001af0"), 0x026421), + (hex!("0122222222333333334444444455000001af0000000000001b00"), 0x0264c1), + (hex!("0122222222333333334444444455000001af0000000000007390"), 0x026561), + (hex!("0122222222333333334444444455000001b00000000000001b10"), 0x026601), + (hex!("0122222222333333334444444455000001b10000000000001b20"), 0x0266a1), + (hex!("0122222222333333334444444455000001b10000000000005cc0"), 0x026741), + (hex!("0122222222333333334444444455000001b20000000000001b30"), 0x0267e1), + (hex!("0122222222333333334444444455000001b20000000000004fb0"), 0x026881), + (hex!("0122222222333333334444444455000001b30000000000001b40"), 0x026921), + (hex!("0122222222333333334444444455000001b40000000000001b50"), 0x0269c1), + (hex!("0122222222333333334444444455000001b50000000000001b60"), 0x026a61), + (hex!("0122222222333333334444444455000001b60000000000001b70"), 0x026b01), + (hex!("0122222222333333334444444455000001b600000000000048e0"), 0x026ba1), + (hex!("0122222222333333334444444455000001b70000000000001b80"), 0x026c41), + (hex!("0122222222333333334444444455000001b70000000000005ca0"), 0x026ce1), + (hex!("0122222222333333334444444455000001b70000000000007900"), 0x026d81), + (hex!("0122222222333333334444444455000001b80000000000001b90"), 0x026e21), + (hex!("0122222222333333334444444455000001b80000000000004d90"), 0x026ec1), + (hex!("0122222222333333334444444455000001b90000000000001ba0"), 0x026f61), + (hex!("0122222222333333334444444455000001b90000000000003f40"), 0x027001), + (hex!("0122222222333333334444444455000001ba0000000000001bb0"), 0x0270a1), + (hex!("0122222222333333334444444455000001ba00000000000042a0"), 0x027141), + (hex!("0122222222333333334444444455000001ba00000000000067f0"), 0x0271e1), + (hex!("0122222222333333334444444455000001ba00000000000073a0"), 0x027281), + (hex!("0122222222333333334444444455000001bb0000000000001bc0"), 0x027321), + (hex!("0122222222333333334444444455000001bb0000000000004a00"), 0x0273c1), + (hex!("0122222222333333334444444455000001bb0000000000005e00"), 0x027461), + (hex!("0122222222333333334444444455000001bc0000000000001bd0"), 0x027501), + (hex!("0122222222333333334444444455000001bc0000000000004230"), 0x0275a1), + (hex!("0122222222333333334444444455000001bc0000000000005860"), 0x027641), + (hex!("0122222222333333334444444455000001bd0000000000001be0"), 0x0276e1), + (hex!("0122222222333333334444444455000001bd0000000000007c70"), 0x027781), + (hex!("0122222222333333334444444455000001be0000000000001bf0"), 0x027821), + (hex!("0122222222333333334444444455000001be0000000000007770"), 0x0278c1), + (hex!("0122222222333333334444444455000001be0000000000007cf0"), 0x027961), + (hex!("0122222222333333334444444455000001bf0000000000001c00"), 0x027a01), + (hex!("0122222222333333334444444455000001bf0000000000006490"), 0x027aa1), + (hex!("0122222222333333334444444455000001c00000000000001c10"), 0x027b41), + (hex!("0122222222333333334444444455000001c10000000000001c20"), 0x027be1), + (hex!("0122222222333333334444444455000001c10000000000004600"), 0x027c81), + (hex!("0122222222333333334444444455000001c20000000000001c30"), 0x027d21), + (hex!("0122222222333333334444444455000001c20000000000006e30"), 0x027dc1), + (hex!("0122222222333333334444444455000001c30000000000001c40"), 0x027e61), + 
(hex!("0122222222333333334444444455000001c40000000000001c50"), 0x027f01), + (hex!("0122222222333333334444444455000001c50000000000001c60"), 0x027fa1), + (hex!("0122222222333333334444444455000001c60000000000001c70"), 0x028041), + (hex!("0122222222333333334444444455000001c60000000000004240"), 0x0280e1), + (hex!("0122222222333333334444444455000001c60000000000005bb0"), 0x028181), + (hex!("0122222222333333334444444455000001c70000000000001c80"), 0x028221), + (hex!("0122222222333333334444444455000001c80000000000001c90"), 0x0282c1), + (hex!("0122222222333333334444444455000001c90000000000001ca0"), 0x028361), + (hex!("0122222222333333334444444455000001c90000000000006730"), 0x028401), + (hex!("0122222222333333334444444455000001ca0000000000001cb0"), 0x0284a1), + (hex!("0122222222333333334444444455000001ca00000000000070f0"), 0x028541), + (hex!("0122222222333333334444444455000001cb0000000000001cc0"), 0x0285e1), + (hex!("0122222222333333334444444455000001cb00000000000071a0"), 0x028681), + (hex!("0122222222333333334444444455000001cc0000000000001cd0"), 0x028721), + (hex!("0122222222333333334444444455000001cc0000000000005280"), 0x0287c1), + (hex!("0122222222333333334444444455000001cc0000000000005d90"), 0x028861), + (hex!("0122222222333333334444444455000001cd0000000000001ce0"), 0x028901), + (hex!("0122222222333333334444444455000001cd00000000000069b0"), 0x0289a1), + (hex!("0122222222333333334444444455000001ce0000000000001cf0"), 0x028a41), + (hex!("0122222222333333334444444455000001ce0000000000004540"), 0x028ae1), + (hex!("0122222222333333334444444455000001cf0000000000001d00"), 0x028b81), + (hex!("0122222222333333334444444455000001cf00000000000076a0"), 0x028c21), + (hex!("0122222222333333334444444455000001d00000000000001d10"), 0x028cc1), + (hex!("0122222222333333334444444455000001d000000000000060a0"), 0x028d61), + (hex!("0122222222333333334444444455000001d10000000000001d20"), 0x028e01), + (hex!("0122222222333333334444444455000001d20000000000001d30"), 0x028ea1), + (hex!("0122222222333333334444444455000001d30000000000001d40"), 0x028f41), + (hex!("0122222222333333334444444455000001d30000000000004000"), 0x028fe1), + (hex!("0122222222333333334444444455000001d30000000000004140"), 0x029081), + (hex!("0122222222333333334444444455000001d30000000000006790"), 0x029121), + (hex!("0122222222333333334444444455000001d40000000000001d50"), 0x0291c1), + (hex!("0122222222333333334444444455000001d50000000000001d60"), 0x029261), + (hex!("0122222222333333334444444455000001d60000000000001d70"), 0x029301), + (hex!("0122222222333333334444444455000001d60000000000004b50"), 0x0293a1), + (hex!("0122222222333333334444444455000001d60000000000007430"), 0x029441), + (hex!("0122222222333333334444444455000001d70000000000001d80"), 0x0294e1), + (hex!("0122222222333333334444444455000001d70000000000006920"), 0x029581), + (hex!("0122222222333333334444444455000001d80000000000001d90"), 0x029621), + (hex!("0122222222333333334444444455000001d80000000000005b30"), 0x0296c1), + (hex!("0122222222333333334444444455000001d90000000000001da0"), 0x029761), + (hex!("0122222222333333334444444455000001da0000000000001db0"), 0x029801), + (hex!("0122222222333333334444444455000001da0000000000004af0"), 0x0298a1), + (hex!("0122222222333333334444444455000001da0000000000007240"), 0x029941), + (hex!("0122222222333333334444444455000001da0000000000007470"), 0x0299e1), + (hex!("0122222222333333334444444455000001db0000000000001dc0"), 0x029a81), + (hex!("0122222222333333334444444455000001db00000000000045d0"), 0x029b21), + 
(hex!("0122222222333333334444444455000001dc0000000000001dd0"), 0x029bc1), + (hex!("0122222222333333334444444455000001dd0000000000001de0"), 0x029c61), + (hex!("0122222222333333334444444455000001dd0000000000004bb0"), 0x029d01), + (hex!("0122222222333333334444444455000001dd0000000000004cd0"), 0x029da1), + (hex!("0122222222333333334444444455000001dd0000000000006100"), 0x029e41), + (hex!("0122222222333333334444444455000001dd0000000000007bb0"), 0x029ee1), + (hex!("0122222222333333334444444455000001de0000000000001df0"), 0x029f81), + (hex!("0122222222333333334444444455000001de0000000000004260"), 0x02a021), + (hex!("0122222222333333334444444455000001de0000000000006040"), 0x02a0c1), + (hex!("0122222222333333334444444455000001df0000000000001e00"), 0x02a161), + (hex!("0122222222333333334444444455000001df0000000000005fa0"), 0x02a201), + (hex!("0122222222333333334444444455000001df0000000000006a70"), 0x02a2a1), + (hex!("0122222222333333334444444455000001df0000000000006dc0"), 0x02a341), + (hex!("0122222222333333334444444455000001e00000000000001e10"), 0x02a3e1), + (hex!("0122222222333333334444444455000001e00000000000007010"), 0x02a481), + (hex!("0122222222333333334444444455000001e10000000000001e20"), 0x02a521), + (hex!("0122222222333333334444444455000001e10000000000005720"), 0x02a5c1), + (hex!("0122222222333333334444444455000001e10000000000006830"), 0x02a661), + (hex!("0122222222333333334444444455000001e20000000000001e30"), 0x02a701), + (hex!("0122222222333333334444444455000001e20000000000005100"), 0x02a7a1), + (hex!("0122222222333333334444444455000001e30000000000001e40"), 0x02a841), + (hex!("0122222222333333334444444455000001e40000000000001e50"), 0x02a8e1), + (hex!("0122222222333333334444444455000001e40000000000003f30"), 0x02a981), + (hex!("0122222222333333334444444455000001e40000000000005220"), 0x02aa21), + (hex!("0122222222333333334444444455000001e50000000000001e60"), 0x02aac1), + (hex!("0122222222333333334444444455000001e50000000000006f60"), 0x02ab61), + (hex!("0122222222333333334444444455000001e60000000000001e70"), 0x02ac01), + (hex!("0122222222333333334444444455000001e60000000000006c80"), 0x02aca1), + (hex!("0122222222333333334444444455000001e70000000000001e80"), 0x02ad41), + (hex!("0122222222333333334444444455000001e80000000000001e90"), 0x02ade1), + (hex!("0122222222333333334444444455000001e80000000000004e30"), 0x02ae81), + (hex!("0122222222333333334444444455000001e90000000000001ea0"), 0x02af21), + (hex!("0122222222333333334444444455000001e90000000000005470"), 0x02afc1), + (hex!("0122222222333333334444444455000001ea0000000000001eb0"), 0x02b061), + (hex!("0122222222333333334444444455000001ea0000000000007980"), 0x02b101), + (hex!("0122222222333333334444444455000001eb0000000000001ec0"), 0x02b1a1), + (hex!("0122222222333333334444444455000001eb0000000000004390"), 0x02b241), + (hex!("0122222222333333334444444455000001eb0000000000005970"), 0x02b2e1), + (hex!("0122222222333333334444444455000001ec0000000000001ed0"), 0x02b381), + (hex!("0122222222333333334444444455000001ec0000000000005d50"), 0x02b421), + (hex!("0122222222333333334444444455000001ec00000000000076e0"), 0x02b4c1), + (hex!("0122222222333333334444444455000001ed0000000000001ee0"), 0x02b561), + (hex!("0122222222333333334444444455000001ed0000000000006190"), 0x02b601), + (hex!("0122222222333333334444444455000001ee0000000000001ef0"), 0x02b6a1), + (hex!("0122222222333333334444444455000001ee0000000000004900"), 0x02b741), + (hex!("0122222222333333334444444455000001ef0000000000001f00"), 0x02b7e1), + 
(hex!("0122222222333333334444444455000001ef0000000000006c60"), 0x02b881), + (hex!("0122222222333333334444444455000001f00000000000001f10"), 0x02b921), + (hex!("0122222222333333334444444455000001f00000000000006950"), 0x02b9c1), + (hex!("0122222222333333334444444455000001f10000000000001f20"), 0x02ba61), + (hex!("0122222222333333334444444455000001f10000000000006400"), 0x02bb01), + (hex!("0122222222333333334444444455000001f20000000000001f30"), 0x02bba1), + (hex!("0122222222333333334444444455000001f20000000000006f00"), 0x02bc41), + (hex!("0122222222333333334444444455000001f20000000000007b10"), 0x02bce1), + (hex!("0122222222333333334444444455000001f30000000000001f40"), 0x02bd81), + (hex!("0122222222333333334444444455000001f40000000000001f50"), 0x02be21), + (hex!("0122222222333333334444444455000001f50000000000001f60"), 0x02bec1), + (hex!("0122222222333333334444444455000001f500000000000044f0"), 0x02bf61), + (hex!("0122222222333333334444444455000001f60000000000001f70"), 0x02c001), + (hex!("0122222222333333334444444455000001f70000000000001f80"), 0x02c0a1), + (hex!("0122222222333333334444444455000001f70000000000004ad0"), 0x02c141), + (hex!("0122222222333333334444444455000001f80000000000001f90"), 0x02c1e1), + (hex!("0122222222333333334444444455000001f90000000000001fa0"), 0x02c281), + (hex!("0122222222333333334444444455000001f90000000000003f60"), 0x02c321), + (hex!("0122222222333333334444444455000001f90000000000004a80"), 0x02c3c1), + (hex!("0122222222333333334444444455000001fa0000000000001fb0"), 0x02c461), + (hex!("0122222222333333334444444455000001fa0000000000006f90"), 0x02c501), + (hex!("0122222222333333334444444455000001fb0000000000001fc0"), 0x02c5a1), + (hex!("0122222222333333334444444455000001fc0000000000001fd0"), 0x02c641), + (hex!("0122222222333333334444444455000001fc0000000000004a90"), 0x02c6e1), + (hex!("0122222222333333334444444455000001fd0000000000001fe0"), 0x02c781), + (hex!("0122222222333333334444444455000001fd0000000000005f50"), 0x02c821), + (hex!("0122222222333333334444444455000001fe0000000000001ff0"), 0x02c8c1), + (hex!("0122222222333333334444444455000001ff0000000000002000"), 0x02c961), + (hex!("0122222222333333334444444455000002000000000000002010"), 0x02ca01), + (hex!("0122222222333333334444444455000002000000000000005f00"), 0x02caa1), + (hex!("0122222222333333334444444455000002000000000000006840"), 0x02cb41), + (hex!("0122222222333333334444444455000002010000000000002020"), 0x02cbe1), + (hex!("0122222222333333334444444455000002020000000000002030"), 0x02cc81), + (hex!("0122222222333333334444444455000002030000000000002040"), 0x02cd21), + (hex!("0122222222333333334444444455000002040000000000002050"), 0x02cdc1), + (hex!("01222222223333333344444444550000020400000000000051f0"), 0x02ce61), + (hex!("0122222222333333334444444455000002050000000000002060"), 0x02cf01), + (hex!("0122222222333333334444444455000002060000000000002070"), 0x02cfa1), + (hex!("0122222222333333334444444455000002060000000000005c80"), 0x02d041), + (hex!("01222222223333333344444444550000020600000000000061d0"), 0x02d0e1), + (hex!("01222222223333333344444444550000020600000000000078c0"), 0x02d181), + (hex!("0122222222333333334444444455000002070000000000002080"), 0x02d221), + (hex!("0122222222333333334444444455000002070000000000006ba0"), 0x02d2c1), + (hex!("0122222222333333334444444455000002080000000000002090"), 0x02d361), + (hex!("01222222223333333344444444550000020900000000000020a0"), 0x02d401), + (hex!("01222222223333333344444444550000020900000000000067a0"), 0x02d4a1), + 
(hex!("01222222223333333344444444550000020a00000000000020b0"), 0x02d541), + (hex!("01222222223333333344444444550000020a0000000000004950"), 0x02d5e1), + (hex!("01222222223333333344444444550000020a0000000000004de0"), 0x02d681), + (hex!("01222222223333333344444444550000020b00000000000020c0"), 0x02d721), + (hex!("01222222223333333344444444550000020b0000000000004b00"), 0x02d7c1), + (hex!("01222222223333333344444444550000020c00000000000020d0"), 0x02d861), + (hex!("01222222223333333344444444550000020d00000000000020e0"), 0x02d901), + (hex!("01222222223333333344444444550000020e00000000000020f0"), 0x02d9a1), + (hex!("01222222223333333344444444550000020f0000000000002100"), 0x02da41), + (hex!("0122222222333333334444444455000002100000000000002110"), 0x02dae1), + (hex!("0122222222333333334444444455000002110000000000002120"), 0x02db81), + (hex!("0122222222333333334444444455000002110000000000004490"), 0x02dc21), + (hex!("0122222222333333334444444455000002120000000000002130"), 0x02dcc1), + (hex!("0122222222333333334444444455000002130000000000002140"), 0x02dd61), + (hex!("01222222223333333344444444550000021300000000000046d0"), 0x02de01), + (hex!("01222222223333333344444444550000021300000000000046e0"), 0x02dea1), + (hex!("0122222222333333334444444455000002130000000000004b70"), 0x02df41), + (hex!("0122222222333333334444444455000002140000000000002150"), 0x02dfe1), + (hex!("0122222222333333334444444455000002140000000000006c50"), 0x02e081), + (hex!("0122222222333333334444444455000002150000000000002160"), 0x02e121), + (hex!("01222222223333333344444444550000021500000000000043c0"), 0x02e1c1), + (hex!("0122222222333333334444444455000002160000000000002170"), 0x02e261), + (hex!("01222222223333333344444444550000021600000000000055b0"), 0x02e301), + (hex!("0122222222333333334444444455000002160000000000006150"), 0x02e3a1), + (hex!("0122222222333333334444444455000002170000000000002180"), 0x02e441), + (hex!("01222222223333333344444444550000021700000000000053b0"), 0x02e4e1), + (hex!("0122222222333333334444444455000002170000000000007460"), 0x02e581), + (hex!("0122222222333333334444444455000002180000000000002190"), 0x02e621), + (hex!("01222222223333333344444444550000021900000000000021a0"), 0x02e6c1), + (hex!("01222222223333333344444444550000021a00000000000021b0"), 0x02e761), + (hex!("01222222223333333344444444550000021a0000000000007650"), 0x02e801), + (hex!("01222222223333333344444444550000021b00000000000021c0"), 0x02e8a1), + (hex!("01222222223333333344444444550000021b0000000000004b20"), 0x02e941), + (hex!("01222222223333333344444444550000021c00000000000021d0"), 0x02e9e1), + (hex!("01222222223333333344444444550000021c0000000000007610"), 0x02ea81), + (hex!("01222222223333333344444444550000021d00000000000021e0"), 0x02eb21), + (hex!("01222222223333333344444444550000021d0000000000005f40"), 0x02ebc1), + (hex!("01222222223333333344444444550000021e00000000000021f0"), 0x02ec61), + (hex!("01222222223333333344444444550000021e0000000000005a50"), 0x02ed01), + (hex!("01222222223333333344444444550000021e0000000000005ff0"), 0x02eda1), + (hex!("01222222223333333344444444550000021f0000000000002200"), 0x02ee41), + (hex!("01222222223333333344444444550000021f00000000000043a0"), 0x02eee1), + (hex!("01222222223333333344444444550000021f0000000000004cb0"), 0x02ef81), + (hex!("01222222223333333344444444550000021f0000000000004e00"), 0x02f021), + (hex!("0122222222333333334444444455000002200000000000002210"), 0x02f0c1), + (hex!("0122222222333333334444444455000002210000000000002220"), 0x02f161), + 
(hex!("0122222222333333334444444455000002210000000000006290"), 0x02f201), + (hex!("0122222222333333334444444455000002210000000000007230"), 0x02f2a1), + (hex!("0122222222333333334444444455000002220000000000002230"), 0x02f341), + (hex!("0122222222333333334444444455000002220000000000006ea0"), 0x02f3e1), + (hex!("0122222222333333334444444455000002230000000000002240"), 0x02f481), + (hex!("0122222222333333334444444455000002230000000000004710"), 0x02f521), + (hex!("0122222222333333334444444455000002240000000000002250"), 0x02f5c1), + (hex!("0122222222333333334444444455000002250000000000002260"), 0x02f661), + (hex!("0122222222333333334444444455000002260000000000002270"), 0x02f701), + (hex!("0122222222333333334444444455000002260000000000005b40"), 0x02f7a1), + (hex!("0122222222333333334444444455000002260000000000006300"), 0x02f841), + (hex!("0122222222333333334444444455000002270000000000002280"), 0x02f8e1), + (hex!("0122222222333333334444444455000002270000000000005b80"), 0x02f981), + (hex!("0122222222333333334444444455000002280000000000002290"), 0x02fa21), + (hex!("0122222222333333334444444455000002280000000000003ed0"), 0x02fac1), + (hex!("0122222222333333334444444455000002280000000000004550"), 0x02fb61), + (hex!("01222222223333333344444444550000022800000000000077d0"), 0x02fc01), + (hex!("01222222223333333344444444550000022900000000000022a0"), 0x02fca1), + (hex!("0122222222333333334444444455000002290000000000006480"), 0x02fd41), + (hex!("01222222223333333344444444550000022a00000000000022b0"), 0x02fde1), + (hex!("01222222223333333344444444550000022a0000000000005450"), 0x02fe81), + (hex!("01222222223333333344444444550000022b00000000000022c0"), 0x02ff21), + (hex!("01222222223333333344444444550000022b0000000000006dd0"), 0x02ffc1), + (hex!("01222222223333333344444444550000022c00000000000022d0"), 0x030061), + (hex!("01222222223333333344444444550000022c0000000000006890"), 0x030101), + (hex!("01222222223333333344444444550000022d00000000000022e0"), 0x0301a1), + (hex!("01222222223333333344444444550000022e00000000000022f0"), 0x030241), + (hex!("01222222223333333344444444550000022e0000000000004f20"), 0x0302e1), + (hex!("01222222223333333344444444550000022f0000000000002300"), 0x030381), + (hex!("01222222223333333344444444550000022f0000000000005260"), 0x030421), + (hex!("01222222223333333344444444550000022f00000000000053f0"), 0x0304c1), + (hex!("0122222222333333334444444455000002300000000000002310"), 0x030561), + (hex!("01222222223333333344444444550000023000000000000050e0"), 0x030601), + (hex!("0122222222333333334444444455000002310000000000002320"), 0x0306a1), + (hex!("0122222222333333334444444455000002310000000000007800"), 0x030741), + (hex!("0122222222333333334444444455000002320000000000002330"), 0x0307e1), + (hex!("0122222222333333334444444455000002330000000000002340"), 0x030881), + (hex!("0122222222333333334444444455000002330000000000004d70"), 0x030921), + (hex!("0122222222333333334444444455000002330000000000005cf0"), 0x0309c1), + (hex!("0122222222333333334444444455000002340000000000002350"), 0x030a61), + (hex!("0122222222333333334444444455000002350000000000002360"), 0x030b01), + (hex!("0122222222333333334444444455000002350000000000006970"), 0x030ba1), + (hex!("0122222222333333334444444455000002360000000000002370"), 0x030c41), + (hex!("0122222222333333334444444455000002360000000000005270"), 0x030ce1), + (hex!("0122222222333333334444444455000002370000000000002380"), 0x030d81), + (hex!("0122222222333333334444444455000002370000000000005d70"), 0x030e21), + 
(hex!("0122222222333333334444444455000002380000000000002390"), 0x030ec1), + (hex!("01222222223333333344444444550000023800000000000069a0"), 0x030f61), + (hex!("01222222223333333344444444550000023900000000000023a0"), 0x031001), + (hex!("01222222223333333344444444550000023900000000000052e0"), 0x0310a1), + (hex!("0122222222333333334444444455000002390000000000005a10"), 0x031141), + (hex!("0122222222333333334444444455000002390000000000007440"), 0x0311e1), + (hex!("01222222223333333344444444550000023a00000000000023b0"), 0x031281), + (hex!("01222222223333333344444444550000023a0000000000003f00"), 0x031321), + (hex!("01222222223333333344444444550000023a0000000000004430"), 0x0313c1), + (hex!("01222222223333333344444444550000023a0000000000007070"), 0x031461), + (hex!("01222222223333333344444444550000023a00000000000074a0"), 0x031501), + (hex!("01222222223333333344444444550000023b00000000000023c0"), 0x0315a1), + (hex!("01222222223333333344444444550000023b0000000000004730"), 0x031641), + (hex!("01222222223333333344444444550000023b00000000000068b0"), 0x0316e1), + (hex!("01222222223333333344444444550000023c00000000000023d0"), 0x031781), + (hex!("01222222223333333344444444550000023c0000000000004680"), 0x031821), + (hex!("01222222223333333344444444550000023d00000000000023e0"), 0x0318c1), + (hex!("01222222223333333344444444550000023d00000000000059a0"), 0x031961), + (hex!("01222222223333333344444444550000023e00000000000023f0"), 0x031a01), + (hex!("01222222223333333344444444550000023f0000000000002400"), 0x031aa1), + (hex!("0122222222333333334444444455000002400000000000002410"), 0x031b41), + (hex!("0122222222333333334444444455000002400000000000004920"), 0x031be1), + (hex!("01222222223333333344444444550000024000000000000066e0"), 0x031c81), + (hex!("01222222223333333344444444550000024000000000000076f0"), 0x031d21), + (hex!("01222222223333333344444444550000024000000000000078e0"), 0x031dc1), + (hex!("0122222222333333334444444455000002410000000000002420"), 0x031e61), + (hex!("0122222222333333334444444455000002420000000000002430"), 0x031f01), + (hex!("0122222222333333334444444455000002420000000000006590"), 0x031fa1), + (hex!("0122222222333333334444444455000002430000000000002440"), 0x032041), + (hex!("0122222222333333334444444455000002430000000000004d00"), 0x0320e1), + (hex!("0122222222333333334444444455000002440000000000002450"), 0x032181), + (hex!("0122222222333333334444444455000002440000000000005f80"), 0x032221), + (hex!("0122222222333333334444444455000002450000000000002460"), 0x0322c1), + (hex!("0122222222333333334444444455000002450000000000004940"), 0x032361), + (hex!("0122222222333333334444444455000002460000000000002470"), 0x032401), + (hex!("0122222222333333334444444455000002470000000000002480"), 0x0324a1), + (hex!("0122222222333333334444444455000002470000000000004dd0"), 0x032541), + (hex!("0122222222333333334444444455000002470000000000005930"), 0x0325e1), + (hex!("01222222223333333344444444550000024700000000000061b0"), 0x032681), + (hex!("0122222222333333334444444455000002470000000000007740"), 0x032721), + (hex!("0122222222333333334444444455000002480000000000002490"), 0x0327c1), + (hex!("0122222222333333334444444455000002480000000000004890"), 0x032861), + (hex!("01222222223333333344444444550000024900000000000024a0"), 0x032901), + (hex!("01222222223333333344444444550000024a00000000000024b0"), 0x0329a1), + (hex!("01222222223333333344444444550000024b00000000000024c0"), 0x032a41), + (hex!("01222222223333333344444444550000024c00000000000024d0"), 0x032ae1), + 
(hex!("01222222223333333344444444550000024d00000000000024e0"), 0x032b81), + (hex!("01222222223333333344444444550000024d0000000000004070"), 0x032c21), + (hex!("01222222223333333344444444550000024e00000000000024f0"), 0x032cc1), + (hex!("01222222223333333344444444550000024e00000000000066a0"), 0x032d61), + (hex!("01222222223333333344444444550000024e0000000000006ab0"), 0x032e01), + (hex!("01222222223333333344444444550000024f0000000000002500"), 0x032ea1), + (hex!("0122222222333333334444444455000002500000000000002510"), 0x032f41), + (hex!("0122222222333333334444444455000002510000000000002520"), 0x032fe1), + (hex!("0122222222333333334444444455000002510000000000007320"), 0x033081), + (hex!("0122222222333333334444444455000002520000000000002530"), 0x033121), + (hex!("0122222222333333334444444455000002520000000000006410"), 0x0331c1), + (hex!("0122222222333333334444444455000002530000000000002540"), 0x033261), + (hex!("0122222222333333334444444455000002530000000000005110"), 0x033301), + (hex!("0122222222333333334444444455000002540000000000002550"), 0x0333a1), + (hex!("01222222223333333344444444550000025400000000000040c0"), 0x033441), + (hex!("0122222222333333334444444455000002540000000000006a40"), 0x0334e1), + (hex!("0122222222333333334444444455000002550000000000002560"), 0x033581), + (hex!("0122222222333333334444444455000002550000000000005190"), 0x033621), + (hex!("0122222222333333334444444455000002560000000000002570"), 0x0336c1), + (hex!("01222222223333333344444444550000025600000000000061f0"), 0x033761), + (hex!("0122222222333333334444444455000002570000000000002580"), 0x033801), + (hex!("0122222222333333334444444455000002580000000000002590"), 0x0338a1), + (hex!("01222222223333333344444444550000025800000000000043d0"), 0x033941), + (hex!("01222222223333333344444444550000025900000000000025a0"), 0x0339e1), + (hex!("0122222222333333334444444455000002590000000000006bb0"), 0x033a81), + (hex!("01222222223333333344444444550000025a00000000000025b0"), 0x033b21), + (hex!("01222222223333333344444444550000025a0000000000005fb0"), 0x033bc1), + (hex!("01222222223333333344444444550000025a00000000000064c0"), 0x033c61), + (hex!("01222222223333333344444444550000025b00000000000025c0"), 0x033d01), + (hex!("01222222223333333344444444550000025b0000000000005c10"), 0x033da1), + (hex!("01222222223333333344444444550000025c00000000000025d0"), 0x033e41), + (hex!("01222222223333333344444444550000025c0000000000007d00"), 0x033ee1), + (hex!("01222222223333333344444444550000025d00000000000025e0"), 0x033f81), + (hex!("01222222223333333344444444550000025e00000000000025f0"), 0x034021), + (hex!("01222222223333333344444444550000025e00000000000045e0"), 0x0340c1), + (hex!("01222222223333333344444444550000025e0000000000006ee0"), 0x034161), + (hex!("01222222223333333344444444550000025f0000000000002600"), 0x034201), + (hex!("01222222223333333344444444550000025f00000000000050b0"), 0x0342a1), + (hex!("01222222223333333344444444550000025f0000000000007690"), 0x034341), + (hex!("0122222222333333334444444455000002600000000000002610"), 0x0343e1), + (hex!("0122222222333333334444444455000002600000000000007b60"), 0x034481), + (hex!("0122222222333333334444444455000002610000000000002620"), 0x034521), + (hex!("0122222222333333334444444455000002620000000000002630"), 0x0345c1), + (hex!("0122222222333333334444444455000002630000000000002640"), 0x034661), + (hex!("0122222222333333334444444455000002640000000000002650"), 0x034701), + (hex!("0122222222333333334444444455000002650000000000002660"), 0x0347a1), + 
(hex!("0122222222333333334444444455000002650000000000006180"), 0x034841), + (hex!("0122222222333333334444444455000002660000000000002670"), 0x0348e1), + (hex!("0122222222333333334444444455000002660000000000005430"), 0x034981), + (hex!("0122222222333333334444444455000002660000000000007a60"), 0x034a21), + (hex!("0122222222333333334444444455000002670000000000002680"), 0x034ac1), + (hex!("01222222223333333344444444550000026700000000000077f0"), 0x034b61), + (hex!("0122222222333333334444444455000002680000000000002690"), 0x034c01), + (hex!("01222222223333333344444444550000026900000000000026a0"), 0x034ca1), + (hex!("01222222223333333344444444550000026a00000000000026b0"), 0x034d41), + (hex!("01222222223333333344444444550000026a0000000000007530"), 0x034de1), + (hex!("01222222223333333344444444550000026b00000000000026c0"), 0x034e81), + (hex!("01222222223333333344444444550000026b00000000000058b0"), 0x034f21), + (hex!("01222222223333333344444444550000026b00000000000066b0"), 0x034fc1), + (hex!("01222222223333333344444444550000026b0000000000006b10"), 0x035061), + (hex!("01222222223333333344444444550000026c00000000000026d0"), 0x035101), + (hex!("01222222223333333344444444550000026d00000000000026e0"), 0x0351a1), + (hex!("01222222223333333344444444550000026d0000000000004210"), 0x035241), + (hex!("01222222223333333344444444550000026d0000000000005490"), 0x0352e1), + (hex!("01222222223333333344444444550000026d0000000000005e60"), 0x035381), + (hex!("01222222223333333344444444550000026d00000000000068e0"), 0x035421), + (hex!("01222222223333333344444444550000026d0000000000007020"), 0x0354c1), + (hex!("01222222223333333344444444550000026d0000000000007300"), 0x035561), + (hex!("01222222223333333344444444550000026e00000000000026f0"), 0x035601), + (hex!("01222222223333333344444444550000026f0000000000002700"), 0x0356a1), + (hex!("01222222223333333344444444550000026f0000000000004910"), 0x035741), + (hex!("0122222222333333334444444455000002700000000000002710"), 0x0357e1), + (hex!("0122222222333333334444444455000002710000000000002720"), 0x035881), + (hex!("01222222223333333344444444550000027100000000000050c0"), 0x035921), + (hex!("0122222222333333334444444455000002720000000000002730"), 0x0359c1), + (hex!("0122222222333333334444444455000002730000000000002740"), 0x035a61), + (hex!("0122222222333333334444444455000002740000000000002750"), 0x035b01), + (hex!("0122222222333333334444444455000002740000000000007490"), 0x035ba1), + (hex!("0122222222333333334444444455000002750000000000002760"), 0x035c41), + (hex!("0122222222333333334444444455000002760000000000002770"), 0x035ce1), + (hex!("0122222222333333334444444455000002760000000000004790"), 0x035d81), + (hex!("0122222222333333334444444455000002770000000000002780"), 0x035e21), + (hex!("01222222223333333344444444550000027700000000000050a0"), 0x035ec1), + (hex!("0122222222333333334444444455000002780000000000002790"), 0x035f61), + (hex!("0122222222333333334444444455000002780000000000004330"), 0x036001), + (hex!("0122222222333333334444444455000002780000000000006b00"), 0x0360a1), + (hex!("01222222223333333344444444550000027900000000000027a0"), 0x036141), + (hex!("01222222223333333344444444550000027a00000000000027b0"), 0x0361e1), + (hex!("01222222223333333344444444550000027b00000000000027c0"), 0x036281), + (hex!("01222222223333333344444444550000027b0000000000004930"), 0x036321), + (hex!("01222222223333333344444444550000027b0000000000006250"), 0x0363c1), + (hex!("01222222223333333344444444550000027c00000000000027d0"), 0x036461), + 
(hex!("01222222223333333344444444550000027d00000000000027e0"), 0x036501), + (hex!("01222222223333333344444444550000027d0000000000005ce0"), 0x0365a1), + (hex!("01222222223333333344444444550000027d0000000000005fe0"), 0x036641), + (hex!("01222222223333333344444444550000027e00000000000027f0"), 0x0366e1), + (hex!("01222222223333333344444444550000027f0000000000002800"), 0x036781), + (hex!("01222222223333333344444444550000027f0000000000003e90"), 0x036821), + (hex!("01222222223333333344444444550000027f0000000000007910"), 0x0368c1), + (hex!("0122222222333333334444444455000002800000000000002810"), 0x036961), + (hex!("0122222222333333334444444455000002800000000000004990"), 0x036a01), + (hex!("0122222222333333334444444455000002800000000000006160"), 0x036aa1), + (hex!("0122222222333333334444444455000002800000000000006740"), 0x036b41), + (hex!("0122222222333333334444444455000002810000000000002820"), 0x036be1), + (hex!("0122222222333333334444444455000002820000000000002830"), 0x036c81), + (hex!("0122222222333333334444444455000002820000000000005170"), 0x036d21), + (hex!("0122222222333333334444444455000002830000000000002840"), 0x036dc1), + (hex!("0122222222333333334444444455000002840000000000002850"), 0x036e61), + (hex!("0122222222333333334444444455000002840000000000004810"), 0x036f01), + (hex!("0122222222333333334444444455000002840000000000006aa0"), 0x036fa1), + (hex!("0122222222333333334444444455000002850000000000002860"), 0x037041), + (hex!("0122222222333333334444444455000002860000000000002870"), 0x0370e1), + (hex!("0122222222333333334444444455000002860000000000005080"), 0x037181), + (hex!("0122222222333333334444444455000002870000000000002880"), 0x037221), + (hex!("0122222222333333334444444455000002870000000000004e60"), 0x0372c1), + (hex!("0122222222333333334444444455000002880000000000002890"), 0x037361), + (hex!("0122222222333333334444444455000002880000000000005060"), 0x037401), + (hex!("0122222222333333334444444455000002880000000000006f20"), 0x0374a1), + (hex!("01222222223333333344444444550000028900000000000028a0"), 0x037541), + (hex!("01222222223333333344444444550000028900000000000047e0"), 0x0375e1), + (hex!("01222222223333333344444444550000028a00000000000028b0"), 0x037681), + (hex!("01222222223333333344444444550000028a0000000000005ab0"), 0x037721), + (hex!("01222222223333333344444444550000028a0000000000007130"), 0x0377c1), + (hex!("01222222223333333344444444550000028a0000000000007660"), 0x037861), + (hex!("01222222223333333344444444550000028b00000000000028c0"), 0x037901), + (hex!("01222222223333333344444444550000028b00000000000054e0"), 0x0379a1), + (hex!("01222222223333333344444444550000028c00000000000028d0"), 0x037a41), + (hex!("01222222223333333344444444550000028c00000000000046f0"), 0x037ae1), + (hex!("01222222223333333344444444550000028c00000000000061a0"), 0x037b81), + (hex!("01222222223333333344444444550000028d00000000000028e0"), 0x037c21), + (hex!("01222222223333333344444444550000028e00000000000028f0"), 0x037cc1), + (hex!("01222222223333333344444444550000028e0000000000004130"), 0x037d61), + (hex!("01222222223333333344444444550000028f0000000000002900"), 0x037e01), + (hex!("01222222223333333344444444550000028f0000000000007510"), 0x037ea1), + (hex!("0122222222333333334444444455000002900000000000002910"), 0x037f41), + (hex!("0122222222333333334444444455000002900000000000004a40"), 0x037fe1), + (hex!("0122222222333333334444444455000002910000000000002920"), 0x038081), + (hex!("0122222222333333334444444455000002920000000000002930"), 0x038121), + 
(hex!("0122222222333333334444444455000002920000000000004e90"), 0x0381c1), + (hex!("0122222222333333334444444455000002930000000000002940"), 0x038261), + (hex!("0122222222333333334444444455000002930000000000006880"), 0x038301), + (hex!("0122222222333333334444444455000002940000000000002950"), 0x0383a1), + (hex!("0122222222333333334444444455000002940000000000007bc0"), 0x038441), + (hex!("0122222222333333334444444455000002950000000000002960"), 0x0384e1), + (hex!("0122222222333333334444444455000002960000000000002970"), 0x038581), + (hex!("01222222223333333344444444550000029600000000000059d0"), 0x038621), + (hex!("0122222222333333334444444455000002970000000000002980"), 0x0386c1), + (hex!("0122222222333333334444444455000002970000000000004a50"), 0x038761), + (hex!("0122222222333333334444444455000002970000000000005f20"), 0x038801), + (hex!("01222222223333333344444444550000029700000000000068d0"), 0x0388a1), + (hex!("0122222222333333334444444455000002980000000000002990"), 0x038941), + (hex!("0122222222333333334444444455000002980000000000004370"), 0x0389e1), + (hex!("0122222222333333334444444455000002980000000000004420"), 0x038a81), + (hex!("01222222223333333344444444550000029900000000000029a0"), 0x038b21), + (hex!("01222222223333333344444444550000029a00000000000029b0"), 0x038bc1), + (hex!("01222222223333333344444444550000029a0000000000006010"), 0x038c61), + (hex!("01222222223333333344444444550000029a0000000000006980"), 0x038d01), + (hex!("01222222223333333344444444550000029b00000000000029c0"), 0x038da1), + (hex!("01222222223333333344444444550000029c00000000000029d0"), 0x038e41), + (hex!("01222222223333333344444444550000029c0000000000007480"), 0x038ee1), + (hex!("01222222223333333344444444550000029d00000000000029e0"), 0x038f81), + (hex!("01222222223333333344444444550000029d0000000000005030"), 0x039021), + (hex!("01222222223333333344444444550000029d0000000000007780"), 0x0390c1), + (hex!("01222222223333333344444444550000029d0000000000007a50"), 0x039161), + (hex!("01222222223333333344444444550000029e00000000000029f0"), 0x039201), + (hex!("01222222223333333344444444550000029e00000000000074b0"), 0x0392a1), + (hex!("01222222223333333344444444550000029f0000000000002a00"), 0x039341), + (hex!("0122222222333333334444444455000002a00000000000002a10"), 0x0393e1), + (hex!("0122222222333333334444444455000002a10000000000002a20"), 0x039481), + (hex!("0122222222333333334444444455000002a20000000000002a30"), 0x039521), + (hex!("0122222222333333334444444455000002a20000000000004c50"), 0x0395c1), + (hex!("0122222222333333334444444455000002a20000000000006f10"), 0x039661), + (hex!("0122222222333333334444444455000002a30000000000002a40"), 0x039701), + (hex!("0122222222333333334444444455000002a40000000000002a50"), 0x0397a1), + (hex!("0122222222333333334444444455000002a40000000000005d60"), 0x039841), + (hex!("0122222222333333334444444455000002a50000000000002a60"), 0x0398e1), + (hex!("0122222222333333334444444455000002a50000000000005440"), 0x039981), + (hex!("0122222222333333334444444455000002a50000000000005890"), 0x039a21), + (hex!("0122222222333333334444444455000002a60000000000002a70"), 0x039ac1), + (hex!("0122222222333333334444444455000002a70000000000002a80"), 0x039b61), + (hex!("0122222222333333334444444455000002a700000000000054a0"), 0x039c01), + (hex!("0122222222333333334444444455000002a70000000000007280"), 0x039ca1), + (hex!("0122222222333333334444444455000002a80000000000002a90"), 0x039d41), + (hex!("0122222222333333334444444455000002a90000000000002aa0"), 0x039de1), + 
(hex!("0122222222333333334444444455000002aa0000000000002ab0"), 0x039e81), + (hex!("0122222222333333334444444455000002ab0000000000002ac0"), 0x039f21), + (hex!("0122222222333333334444444455000002ab0000000000006c90"), 0x039fc1), + (hex!("0122222222333333334444444455000002ac0000000000002ad0"), 0x03a061), + (hex!("0122222222333333334444444455000002ac0000000000006db0"), 0x03a101), + (hex!("0122222222333333334444444455000002ad0000000000002ae0"), 0x03a1a1), + (hex!("0122222222333333334444444455000002ad00000000000065e0"), 0x03a241), + (hex!("0122222222333333334444444455000002ad0000000000007b40"), 0x03a2e1), + (hex!("0122222222333333334444444455000002ae0000000000002af0"), 0x03a381), + (hex!("0122222222333333334444444455000002ae0000000000004d20"), 0x03a421), + (hex!("0122222222333333334444444455000002ae0000000000006f30"), 0x03a4c1), + (hex!("0122222222333333334444444455000002af0000000000002b00"), 0x03a561), + (hex!("0122222222333333334444444455000002b00000000000002b10"), 0x03a601), + (hex!("0122222222333333334444444455000002b00000000000004560"), 0x03a6a1), + (hex!("0122222222333333334444444455000002b00000000000005800"), 0x03a741), + (hex!("0122222222333333334444444455000002b00000000000005a60"), 0x03a7e1), + (hex!("0122222222333333334444444455000002b10000000000002b20"), 0x03a881), + (hex!("0122222222333333334444444455000002b10000000000007b30"), 0x03a921), + (hex!("0122222222333333334444444455000002b20000000000002b30"), 0x03a9c1), + (hex!("0122222222333333334444444455000002b20000000000004440"), 0x03aa61), + (hex!("0122222222333333334444444455000002b20000000000004f80"), 0x03ab01), + (hex!("0122222222333333334444444455000002b20000000000005020"), 0x03aba1), + (hex!("0122222222333333334444444455000002b30000000000002b40"), 0x03ac41), + (hex!("0122222222333333334444444455000002b40000000000002b50"), 0x03ace1), + (hex!("0122222222333333334444444455000002b50000000000002b60"), 0x03ad81), + (hex!("0122222222333333334444444455000002b500000000000059e0"), 0x03ae21), + (hex!("0122222222333333334444444455000002b60000000000002b70"), 0x03aec1), + (hex!("0122222222333333334444444455000002b70000000000002b80"), 0x03af61), + (hex!("0122222222333333334444444455000002b80000000000002b90"), 0x03b001), + (hex!("0122222222333333334444444455000002b80000000000004590"), 0x03b0a1), + (hex!("0122222222333333334444444455000002b800000000000047d0"), 0x03b141), + (hex!("0122222222333333334444444455000002b80000000000006030"), 0x03b1e1), + (hex!("0122222222333333334444444455000002b80000000000006a20"), 0x03b281), + (hex!("0122222222333333334444444455000002b80000000000006a90"), 0x03b321), + (hex!("0122222222333333334444444455000002b90000000000002ba0"), 0x03b3c1), + (hex!("0122222222333333334444444455000002ba0000000000002bb0"), 0x03b461), + (hex!("0122222222333333334444444455000002ba0000000000006e80"), 0x03b501), + (hex!("0122222222333333334444444455000002bb0000000000002bc0"), 0x03b5a1), + (hex!("0122222222333333334444444455000002bc0000000000002bd0"), 0x03b641), + (hex!("0122222222333333334444444455000002bc0000000000004b30"), 0x03b6e1), + (hex!("0122222222333333334444444455000002bd0000000000002be0"), 0x03b781), + (hex!("0122222222333333334444444455000002bd0000000000005e10"), 0x03b821), + (hex!("0122222222333333334444444455000002be0000000000002bf0"), 0x03b8c1), + (hex!("0122222222333333334444444455000002bf0000000000002c00"), 0x03b961), + (hex!("0122222222333333334444444455000002c00000000000002c10"), 0x03ba01), + (hex!("0122222222333333334444444455000002c10000000000002c20"), 0x03baa1), + 
(hex!("0122222222333333334444444455000002c10000000000003ef0"), 0x03bb41), + (hex!("0122222222333333334444444455000002c20000000000002c30"), 0x03bbe1), + (hex!("0122222222333333334444444455000002c200000000000056e0"), 0x03bc81), + (hex!("0122222222333333334444444455000002c30000000000002c40"), 0x03bd21), + (hex!("0122222222333333334444444455000002c30000000000004b60"), 0x03bdc1), + (hex!("0122222222333333334444444455000002c40000000000002c50"), 0x03be61), + (hex!("0122222222333333334444444455000002c400000000000045f0"), 0x03bf01), + (hex!("0122222222333333334444444455000002c40000000000005290"), 0x03bfa1), + (hex!("0122222222333333334444444455000002c50000000000002c60"), 0x03c041), + (hex!("0122222222333333334444444455000002c60000000000002c70"), 0x03c0e1), + (hex!("0122222222333333334444444455000002c60000000000006ae0"), 0x03c181), + (hex!("0122222222333333334444444455000002c70000000000002c80"), 0x03c221), + (hex!("0122222222333333334444444455000002c70000000000005680"), 0x03c2c1), + (hex!("0122222222333333334444444455000002c70000000000006e10"), 0x03c361), + (hex!("0122222222333333334444444455000002c80000000000002c90"), 0x03c401), + (hex!("0122222222333333334444444455000002c90000000000002ca0"), 0x03c4a1), + (hex!("0122222222333333334444444455000002ca0000000000002cb0"), 0x03c541), + (hex!("0122222222333333334444444455000002cb0000000000002cc0"), 0x03c5e1), + (hex!("0122222222333333334444444455000002cc0000000000002cd0"), 0x03c681), + (hex!("0122222222333333334444444455000002cc0000000000005b50"), 0x03c721), + (hex!("0122222222333333334444444455000002cd0000000000002ce0"), 0x03c7c1), + (hex!("0122222222333333334444444455000002ce0000000000002cf0"), 0x03c861), + (hex!("0122222222333333334444444455000002ce00000000000043f0"), 0x03c901), + (hex!("0122222222333333334444444455000002ce0000000000006420"), 0x03c9a1), + (hex!("0122222222333333334444444455000002cf0000000000002d00"), 0x03ca41), + (hex!("0122222222333333334444444455000002d00000000000002d10"), 0x03cae1), + (hex!("0122222222333333334444444455000002d10000000000002d20"), 0x03cb81), + (hex!("0122222222333333334444444455000002d10000000000005370"), 0x03cc21), + (hex!("0122222222333333334444444455000002d20000000000002d30"), 0x03ccc1), + (hex!("0122222222333333334444444455000002d20000000000005ef0"), 0x03cd61), + (hex!("0122222222333333334444444455000002d20000000000006570"), 0x03ce01), + (hex!("0122222222333333334444444455000002d30000000000002d40"), 0x03cea1), + (hex!("0122222222333333334444444455000002d30000000000007360"), 0x03cf41), + (hex!("0122222222333333334444444455000002d40000000000002d50"), 0x03cfe1), + (hex!("0122222222333333334444444455000002d400000000000079a0"), 0x03d081), + (hex!("0122222222333333334444444455000002d50000000000002d60"), 0x03d121), + (hex!("0122222222333333334444444455000002d50000000000004250"), 0x03d1c1), + (hex!("0122222222333333334444444455000002d50000000000006050"), 0x03d261), + (hex!("0122222222333333334444444455000002d60000000000002d70"), 0x03d301), + (hex!("0122222222333333334444444455000002d60000000000007080"), 0x03d3a1), + (hex!("0122222222333333334444444455000002d70000000000002d80"), 0x03d441), + (hex!("0122222222333333334444444455000002d80000000000002d90"), 0x03d4e1), + (hex!("0122222222333333334444444455000002d80000000000007110"), 0x03d581), + (hex!("0122222222333333334444444455000002d800000000000073c0"), 0x03d621), + (hex!("0122222222333333334444444455000002d800000000000075a0"), 0x03d6c1), + (hex!("0122222222333333334444444455000002d90000000000002da0"), 0x03d761), + 
(hex!("0122222222333333334444444455000002d90000000000004860"), 0x03d801), + (hex!("0122222222333333334444444455000002d90000000000006b60"), 0x03d8a1), + (hex!("0122222222333333334444444455000002da0000000000002db0"), 0x03d941), + (hex!("0122222222333333334444444455000002da0000000000006630"), 0x03d9e1), + (hex!("0122222222333333334444444455000002db0000000000002dc0"), 0x03da81), + (hex!("0122222222333333334444444455000002dc0000000000002dd0"), 0x03db21), + (hex!("0122222222333333334444444455000002dc0000000000004830"), 0x03dbc1), + (hex!("0122222222333333334444444455000002dd0000000000002de0"), 0x03dc61), + (hex!("0122222222333333334444444455000002de0000000000002df0"), 0x03dd01), + (hex!("0122222222333333334444444455000002de0000000000004f00"), 0x03dda1), + (hex!("0122222222333333334444444455000002df0000000000002e00"), 0x03de41), + (hex!("0122222222333333334444444455000002e00000000000002e10"), 0x03dee1), + (hex!("0122222222333333334444444455000002e10000000000002e20"), 0x03df81), + (hex!("0122222222333333334444444455000002e10000000000006e90"), 0x03e021), + (hex!("0122222222333333334444444455000002e20000000000002e30"), 0x03e0c1), + (hex!("0122222222333333334444444455000002e200000000000053e0"), 0x03e161), + (hex!("0122222222333333334444444455000002e30000000000002e40"), 0x03e201), + (hex!("0122222222333333334444444455000002e30000000000006020"), 0x03e2a1), + (hex!("0122222222333333334444444455000002e30000000000006540"), 0x03e341), + (hex!("0122222222333333334444444455000002e40000000000002e50"), 0x03e3e1), + (hex!("0122222222333333334444444455000002e50000000000002e60"), 0x03e481), + (hex!("0122222222333333334444444455000002e50000000000005180"), 0x03e521), + (hex!("0122222222333333334444444455000002e50000000000007bf0"), 0x03e5c1), + (hex!("0122222222333333334444444455000002e60000000000002e70"), 0x03e661), + (hex!("0122222222333333334444444455000002e60000000000005350"), 0x03e701), + (hex!("0122222222333333334444444455000002e60000000000007960"), 0x03e7a1), + (hex!("0122222222333333334444444455000002e70000000000002e80"), 0x03e841), + (hex!("0122222222333333334444444455000002e80000000000002e90"), 0x03e8e1), + (hex!("0122222222333333334444444455000002e90000000000002ea0"), 0x03e981), + (hex!("0122222222333333334444444455000002ea0000000000002eb0"), 0x03ea21), + (hex!("0122222222333333334444444455000002eb0000000000002ec0"), 0x03eac1), + (hex!("0122222222333333334444444455000002ec0000000000002ed0"), 0x03eb61), + (hex!("0122222222333333334444444455000002ec0000000000006c10"), 0x03ec01), + (hex!("0122222222333333334444444455000002ed0000000000002ee0"), 0x03eca1), + (hex!("0122222222333333334444444455000002ed0000000000005590"), 0x03ed41), + (hex!("0122222222333333334444444455000002ed0000000000005cd0"), 0x03ede1), + (hex!("0122222222333333334444444455000002ed0000000000006910"), 0x03ee81), + (hex!("0122222222333333334444444455000002ee0000000000002ef0"), 0x03ef21), + (hex!("0122222222333333334444444455000002ef0000000000002f00"), 0x03efc1), + (hex!("0122222222333333334444444455000002ef0000000000004ed0"), 0x03f061), + (hex!("0122222222333333334444444455000002f00000000000002f10"), 0x03f101), + (hex!("0122222222333333334444444455000002f00000000000004cf0"), 0x03f1a1), + (hex!("0122222222333333334444444455000002f00000000000005d10"), 0x03f241), + (hex!("0122222222333333334444444455000002f00000000000006860"), 0x03f2e1), + (hex!("0122222222333333334444444455000002f00000000000006b50"), 0x03f381), + (hex!("0122222222333333334444444455000002f00000000000007100"), 0x03f421), + 
(hex!("0122222222333333334444444455000002f00000000000007aa0"), 0x03f4c1), + (hex!("0122222222333333334444444455000002f10000000000002f20"), 0x03f561), + (hex!("0122222222333333334444444455000002f20000000000002f30"), 0x03f601), + (hex!("0122222222333333334444444455000002f200000000000044b0"), 0x03f6a1), + (hex!("0122222222333333334444444455000002f30000000000002f40"), 0x03f741), + (hex!("0122222222333333334444444455000002f300000000000075b0"), 0x03f7e1), + (hex!("0122222222333333334444444455000002f40000000000002f50"), 0x03f881), + (hex!("0122222222333333334444444455000002f400000000000060f0"), 0x03f921), + (hex!("0122222222333333334444444455000002f50000000000002f60"), 0x03f9c1), + (hex!("0122222222333333334444444455000002f50000000000007210"), 0x03fa61), + (hex!("0122222222333333334444444455000002f60000000000002f70"), 0x03fb01), + (hex!("0122222222333333334444444455000002f60000000000006610"), 0x03fba1), + (hex!("0122222222333333334444444455000002f70000000000002f80"), 0x03fc41), + (hex!("0122222222333333334444444455000002f70000000000007560"), 0x03fce1), + (hex!("0122222222333333334444444455000002f80000000000002f90"), 0x03fd81), + (hex!("0122222222333333334444444455000002f80000000000006320"), 0x03fe21), + (hex!("0122222222333333334444444455000002f90000000000002fa0"), 0x03fec1), + (hex!("0122222222333333334444444455000002f90000000000006e50"), 0x03ff61), + (hex!("0122222222333333334444444455000002fa0000000000002fb0"), 0x040001), + (hex!("0122222222333333334444444455000002fb0000000000002fc0"), 0x0400a1), + (hex!("0122222222333333334444444455000002fb0000000000004780"), 0x040141), + (hex!("0122222222333333334444444455000002fc0000000000002fd0"), 0x0401e1), + (hex!("0122222222333333334444444455000002fd0000000000002fe0"), 0x040281), + (hex!("0122222222333333334444444455000002fd0000000000005600"), 0x040321), + (hex!("0122222222333333334444444455000002fd0000000000006c00"), 0x0403c1), + (hex!("0122222222333333334444444455000002fe0000000000002ff0"), 0x040461), + (hex!("0122222222333333334444444455000002ff0000000000003000"), 0x040501), + (hex!("0122222222333333334444444455000003000000000000003010"), 0x0405a1), + (hex!("0122222222333333334444444455000003000000000000004080"), 0x040641), + (hex!("0122222222333333334444444455000003010000000000003020"), 0x0406e1), + (hex!("0122222222333333334444444455000003010000000000006340"), 0x040781), + (hex!("0122222222333333334444444455000003020000000000003030"), 0x040821), + (hex!("0122222222333333334444444455000003020000000000005b00"), 0x0408c1), + (hex!("0122222222333333334444444455000003020000000000007b20"), 0x040961), + (hex!("0122222222333333334444444455000003030000000000003040"), 0x040a01), + (hex!("01222222223333333344444444550000030300000000000056b0"), 0x040aa1), + (hex!("0122222222333333334444444455000003030000000000006280"), 0x040b41), + (hex!("0122222222333333334444444455000003030000000000007ad0"), 0x040be1), + (hex!("0122222222333333334444444455000003040000000000003050"), 0x040c81), + (hex!("0122222222333333334444444455000003040000000000005c50"), 0x040d21), + (hex!("0122222222333333334444444455000003050000000000003060"), 0x040dc1), + (hex!("01222222223333333344444444550000030500000000000072e0"), 0x040e61), + (hex!("0122222222333333334444444455000003060000000000003070"), 0x040f01), + (hex!("0122222222333333334444444455000003060000000000004360"), 0x040fa1), + (hex!("0122222222333333334444444455000003060000000000004380"), 0x041041), + (hex!("0122222222333333334444444455000003060000000000004820"), 0x0410e1), + 
(hex!("0122222222333333334444444455000003060000000000006d10"), 0x041181), + (hex!("0122222222333333334444444455000003070000000000003080"), 0x041221), + (hex!("0122222222333333334444444455000003070000000000004450"), 0x0412c1), + (hex!("0122222222333333334444444455000003080000000000003090"), 0x041361), + (hex!("0122222222333333334444444455000003080000000000005ad0"), 0x041401), + (hex!("01222222223333333344444444550000030900000000000030a0"), 0x0414a1), + (hex!("01222222223333333344444444550000030a00000000000030b0"), 0x041541), + (hex!("01222222223333333344444444550000030a0000000000007760"), 0x0415e1), + (hex!("01222222223333333344444444550000030b00000000000030c0"), 0x041681), + (hex!("01222222223333333344444444550000030b0000000000007a80"), 0x041721), + (hex!("01222222223333333344444444550000030c00000000000030d0"), 0x0417c1), + (hex!("01222222223333333344444444550000030d00000000000030e0"), 0x041861), + (hex!("01222222223333333344444444550000030d0000000000003eb0"), 0x041901), + (hex!("01222222223333333344444444550000030e00000000000030f0"), 0x0419a1), + (hex!("01222222223333333344444444550000030f0000000000003100"), 0x041a41), + (hex!("01222222223333333344444444550000030f0000000000004690"), 0x041ae1), + (hex!("01222222223333333344444444550000030f0000000000006900"), 0x041b81), + (hex!("0122222222333333334444444455000003100000000000003110"), 0x041c21), + (hex!("01222222223333333344444444550000031000000000000058a0"), 0x041cc1), + (hex!("0122222222333333334444444455000003110000000000003120"), 0x041d61), + (hex!("0122222222333333334444444455000003110000000000004200"), 0x041e01), + (hex!("0122222222333333334444444455000003120000000000003130"), 0x041ea1), + (hex!("0122222222333333334444444455000003130000000000003140"), 0x041f41), + (hex!("0122222222333333334444444455000003130000000000004d50"), 0x041fe1), + (hex!("0122222222333333334444444455000003130000000000005400"), 0x042081), + (hex!("0122222222333333334444444455000003130000000000005520"), 0x042121), + (hex!("0122222222333333334444444455000003140000000000003150"), 0x0421c1), + (hex!("0122222222333333334444444455000003140000000000006450"), 0x042261), + (hex!("0122222222333333334444444455000003150000000000003160"), 0x042301), + (hex!("01222222223333333344444444550000031500000000000062d0"), 0x0423a1), + (hex!("0122222222333333334444444455000003160000000000003170"), 0x042441), + (hex!("0122222222333333334444444455000003160000000000004c40"), 0x0424e1), + (hex!("0122222222333333334444444455000003160000000000007c80"), 0x042581), + (hex!("0122222222333333334444444455000003170000000000003180"), 0x042621), + (hex!("0122222222333333334444444455000003170000000000004400"), 0x0426c1), + (hex!("0122222222333333334444444455000003170000000000005090"), 0x042761), + (hex!("0122222222333333334444444455000003170000000000006cb0"), 0x042801), + (hex!("0122222222333333334444444455000003180000000000003190"), 0x0428a1), + (hex!("0122222222333333334444444455000003180000000000006560"), 0x042941), + (hex!("01222222223333333344444444550000031900000000000031a0"), 0x0429e1), + (hex!("01222222223333333344444444550000031900000000000052d0"), 0x042a81), + (hex!("01222222223333333344444444550000031900000000000057e0"), 0x042b21), + (hex!("01222222223333333344444444550000031a00000000000031b0"), 0x042bc1), + (hex!("01222222223333333344444444550000031a00000000000071e0"), 0x042c61), + (hex!("01222222223333333344444444550000031b00000000000031c0"), 0x042d01), + (hex!("01222222223333333344444444550000031c00000000000031d0"), 0x042da1), + 
(hex!("01222222223333333344444444550000031c0000000000004480"), 0x042e41), + (hex!("01222222223333333344444444550000031c0000000000005790"), 0x042ee1), + (hex!("01222222223333333344444444550000031c0000000000007be0"), 0x042f81), + (hex!("01222222223333333344444444550000031d00000000000031e0"), 0x043021), + (hex!("01222222223333333344444444550000031d0000000000005560"), 0x0430c1), + (hex!("01222222223333333344444444550000031e00000000000031f0"), 0x043161), + (hex!("01222222223333333344444444550000031f0000000000003200"), 0x043201), + (hex!("01222222223333333344444444550000031f0000000000004190"), 0x0432a1), + (hex!("0122222222333333334444444455000003200000000000003210"), 0x043341), + (hex!("0122222222333333334444444455000003210000000000003220"), 0x0433e1), + (hex!("0122222222333333334444444455000003220000000000003230"), 0x043481), + (hex!("0122222222333333334444444455000003230000000000003240"), 0x043521), + (hex!("01222222223333333344444444550000032300000000000069d0"), 0x0435c1), + (hex!("0122222222333333334444444455000003240000000000003250"), 0x043661), + (hex!("0122222222333333334444444455000003250000000000003260"), 0x043701), + (hex!("01222222223333333344444444550000032500000000000042b0"), 0x0437a1), + (hex!("01222222223333333344444444550000032500000000000064e0"), 0x043841), + (hex!("0122222222333333334444444455000003260000000000003270"), 0x0438e1), + (hex!("0122222222333333334444444455000003270000000000003280"), 0x043981), + (hex!("0122222222333333334444444455000003270000000000005b20"), 0x043a21), + (hex!("0122222222333333334444444455000003270000000000006330"), 0x043ac1), + (hex!("0122222222333333334444444455000003270000000000006810"), 0x043b61), + (hex!("0122222222333333334444444455000003280000000000003290"), 0x043c01), + (hex!("01222222223333333344444444550000032900000000000032a0"), 0x043ca1), + (hex!("01222222223333333344444444550000032900000000000056f0"), 0x043d41), + (hex!("0122222222333333334444444455000003290000000000005e20"), 0x043de1), + (hex!("0122222222333333334444444455000003290000000000005e70"), 0x043e81), + (hex!("01222222223333333344444444550000032a00000000000032b0"), 0x043f21), + (hex!("01222222223333333344444444550000032b00000000000032c0"), 0x043fc1), + (hex!("01222222223333333344444444550000032b0000000000005500"), 0x044061), + (hex!("01222222223333333344444444550000032b0000000000005a20"), 0x044101), + (hex!("01222222223333333344444444550000032c00000000000032d0"), 0x0441a1), + (hex!("01222222223333333344444444550000032c0000000000004060"), 0x044241), + (hex!("01222222223333333344444444550000032c0000000000004760"), 0x0442e1), + (hex!("01222222223333333344444444550000032d00000000000032e0"), 0x044381), + (hex!("01222222223333333344444444550000032d00000000000068a0"), 0x044421), + (hex!("01222222223333333344444444550000032e00000000000032f0"), 0x0444c1), + (hex!("01222222223333333344444444550000032f0000000000003300"), 0x044561), + (hex!("0122222222333333334444444455000003300000000000003310"), 0x044601), + (hex!("0122222222333333334444444455000003300000000000006e40"), 0x0446a1), + (hex!("0122222222333333334444444455000003310000000000003320"), 0x044741), + (hex!("0122222222333333334444444455000003310000000000004620"), 0x0447e1), + (hex!("0122222222333333334444444455000003320000000000003330"), 0x044881), + (hex!("0122222222333333334444444455000003330000000000003340"), 0x044921), + (hex!("0122222222333333334444444455000003330000000000004b80"), 0x0449c1), + (hex!("0122222222333333334444444455000003340000000000003350"), 0x044a61), + 
(hex!("0122222222333333334444444455000003350000000000003360"), 0x044b01), + (hex!("0122222222333333334444444455000003360000000000003370"), 0x044ba1), + (hex!("0122222222333333334444444455000003370000000000003380"), 0x044c41), + (hex!("0122222222333333334444444455000003380000000000003390"), 0x044ce1), + (hex!("01222222223333333344444444550000033900000000000033a0"), 0x044d81), + (hex!("0122222222333333334444444455000003390000000000006b90"), 0x044e21), + (hex!("01222222223333333344444444550000033a00000000000033b0"), 0x044ec1), + (hex!("01222222223333333344444444550000033a0000000000007420"), 0x044f61), + (hex!("01222222223333333344444444550000033b00000000000033c0"), 0x045001), + (hex!("01222222223333333344444444550000033b0000000000007620"), 0x0450a1), + (hex!("01222222223333333344444444550000033c00000000000033d0"), 0x045141), + (hex!("01222222223333333344444444550000033c0000000000006b30"), 0x0451e1), + (hex!("01222222223333333344444444550000033d00000000000033e0"), 0x045281), + (hex!("01222222223333333344444444550000033e00000000000033f0"), 0x045321), + (hex!("01222222223333333344444444550000033e00000000000048b0"), 0x0453c1), + (hex!("01222222223333333344444444550000033e0000000000004e70"), 0x045461), + (hex!("01222222223333333344444444550000033f0000000000003400"), 0x045501), + (hex!("01222222223333333344444444550000033f0000000000006380"), 0x0455a1), + (hex!("0122222222333333334444444455000003400000000000003410"), 0x045641), + (hex!("0122222222333333334444444455000003410000000000003420"), 0x0456e1), + (hex!("0122222222333333334444444455000003410000000000006090"), 0x045781), + (hex!("0122222222333333334444444455000003420000000000003430"), 0x045821), + (hex!("01222222223333333344444444550000034200000000000073d0"), 0x0458c1), + (hex!("0122222222333333334444444455000003430000000000003440"), 0x045961), + (hex!("0122222222333333334444444455000003430000000000006370"), 0x045a01), + (hex!("01222222223333333344444444550000034300000000000075c0"), 0x045aa1), + (hex!("0122222222333333334444444455000003440000000000003450"), 0x045b41), + (hex!("0122222222333333334444444455000003450000000000003460"), 0x045be1), + (hex!("0122222222333333334444444455000003460000000000003470"), 0x045c81), + (hex!("01222222223333333344444444550000034600000000000055f0"), 0x045d21), + (hex!("0122222222333333334444444455000003470000000000003480"), 0x045dc1), + (hex!("0122222222333333334444444455000003470000000000003fe0"), 0x045e61), + (hex!("0122222222333333334444444455000003480000000000003490"), 0x045f01), + (hex!("0122222222333333334444444455000003480000000000007990"), 0x045fa1), + (hex!("01222222223333333344444444550000034900000000000034a0"), 0x046041), + (hex!("0122222222333333334444444455000003490000000000004410"), 0x0460e1), + (hex!("01222222223333333344444444550000034a00000000000034b0"), 0x046181), + (hex!("01222222223333333344444444550000034a00000000000062a0"), 0x046221), + (hex!("01222222223333333344444444550000034a0000000000007260"), 0x0462c1), + (hex!("01222222223333333344444444550000034b00000000000034c0"), 0x046361), + (hex!("01222222223333333344444444550000034b0000000000005760"), 0x046401), + (hex!("01222222223333333344444444550000034b0000000000006200"), 0x0464a1), + (hex!("01222222223333333344444444550000034c00000000000034d0"), 0x046541), + (hex!("01222222223333333344444444550000034d00000000000034e0"), 0x0465e1), + (hex!("01222222223333333344444444550000034e00000000000034f0"), 0x046681), + (hex!("01222222223333333344444444550000034e0000000000007790"), 0x046721), + 
(hex!("01222222223333333344444444550000034f0000000000003500"), 0x0467c1), + (hex!("0122222222333333334444444455000003500000000000003510"), 0x046861), + (hex!("0122222222333333334444444455000003510000000000003520"), 0x046901), + (hex!("0122222222333333334444444455000003520000000000003530"), 0x0469a1), + (hex!("01222222223333333344444444550000035200000000000056a0"), 0x046a41), + (hex!("0122222222333333334444444455000003530000000000003540"), 0x046ae1), + (hex!("0122222222333333334444444455000003540000000000003550"), 0x046b81), + (hex!("01222222223333333344444444550000035400000000000047b0"), 0x046c21), + (hex!("0122222222333333334444444455000003550000000000003560"), 0x046cc1), + (hex!("0122222222333333334444444455000003550000000000004500"), 0x046d61), + (hex!("0122222222333333334444444455000003560000000000003570"), 0x046e01), + (hex!("0122222222333333334444444455000003560000000000004fc0"), 0x046ea1), + (hex!("0122222222333333334444444455000003560000000000007160"), 0x046f41), + (hex!("0122222222333333334444444455000003560000000000007400"), 0x046fe1), + (hex!("0122222222333333334444444455000003570000000000003580"), 0x047081), + (hex!("0122222222333333334444444455000003580000000000003590"), 0x047121), + (hex!("0122222222333333334444444455000003580000000000005a80"), 0x0471c1), + (hex!("01222222223333333344444444550000035900000000000035a0"), 0x047261), + (hex!("01222222223333333344444444550000035900000000000073b0"), 0x047301), + (hex!("01222222223333333344444444550000035a00000000000035b0"), 0x0473a1), + (hex!("01222222223333333344444444550000035a0000000000004c20"), 0x047441), + (hex!("01222222223333333344444444550000035b00000000000035c0"), 0x0474e1), + (hex!("01222222223333333344444444550000035b0000000000005120"), 0x047581), + (hex!("01222222223333333344444444550000035c00000000000035d0"), 0x047621), + (hex!("01222222223333333344444444550000035c0000000000004300"), 0x0476c1), + (hex!("01222222223333333344444444550000035c0000000000005a40"), 0x047761), + (hex!("01222222223333333344444444550000035c0000000000006620"), 0x047801), + (hex!("01222222223333333344444444550000035c0000000000006ed0"), 0x0478a1), + (hex!("01222222223333333344444444550000035d00000000000035e0"), 0x047941), + (hex!("01222222223333333344444444550000035d0000000000005df0"), 0x0479e1), + (hex!("01222222223333333344444444550000035e00000000000035f0"), 0x047a81), + (hex!("01222222223333333344444444550000035f0000000000003600"), 0x047b21), + (hex!("01222222223333333344444444550000035f00000000000058d0"), 0x047bc1), + (hex!("0122222222333333334444444455000003600000000000003610"), 0x047c61), + (hex!("0122222222333333334444444455000003600000000000007b90"), 0x047d01), + (hex!("0122222222333333334444444455000003610000000000003620"), 0x047da1), + (hex!("0122222222333333334444444455000003610000000000006ad0"), 0x047e41), + (hex!("0122222222333333334444444455000003620000000000003630"), 0x047ee1), + (hex!("01222222223333333344444444550000036200000000000063a0"), 0x047f81), + (hex!("0122222222333333334444444455000003630000000000003640"), 0x048021), + (hex!("0122222222333333334444444455000003630000000000007250"), 0x0480c1), + (hex!("0122222222333333334444444455000003640000000000003650"), 0x048161), + (hex!("0122222222333333334444444455000003640000000000005510"), 0x048201), + (hex!("0122222222333333334444444455000003640000000000007850"), 0x0482a1), + (hex!("0122222222333333334444444455000003650000000000003660"), 0x048341), + (hex!("0122222222333333334444444455000003660000000000003670"), 0x0483e1), + 
(hex!("0122222222333333334444444455000003660000000000004650"), 0x048481), + (hex!("01222222223333333344444444550000036600000000000050d0"), 0x048521), + (hex!("0122222222333333334444444455000003660000000000006eb0"), 0x0485c1), + (hex!("0122222222333333334444444455000003670000000000003680"), 0x048661), + (hex!("01222222223333333344444444550000036700000000000071f0"), 0x048701), + (hex!("0122222222333333334444444455000003680000000000003690"), 0x0487a1), + (hex!("01222222223333333344444444550000036900000000000036a0"), 0x048841), + (hex!("0122222222333333334444444455000003690000000000005c70"), 0x0488e1), + (hex!("01222222223333333344444444550000036a00000000000036b0"), 0x048981), + (hex!("01222222223333333344444444550000036a00000000000071b0"), 0x048a21), + (hex!("01222222223333333344444444550000036b00000000000036c0"), 0x048ac1), + (hex!("01222222223333333344444444550000036b0000000000004670"), 0x048b61), + (hex!("01222222223333333344444444550000036c00000000000036d0"), 0x048c01), + (hex!("01222222223333333344444444550000036c0000000000004750"), 0x048ca1), + (hex!("01222222223333333344444444550000036c0000000000006fa0"), 0x048d41), + (hex!("01222222223333333344444444550000036d00000000000036e0"), 0x048de1), + (hex!("01222222223333333344444444550000036d0000000000003f70"), 0x048e81), + (hex!("01222222223333333344444444550000036d0000000000004b90"), 0x048f21), + (hex!("01222222223333333344444444550000036d00000000000057a0"), 0x048fc1), + (hex!("01222222223333333344444444550000036e00000000000036f0"), 0x049061), + (hex!("01222222223333333344444444550000036e00000000000075d0"), 0x049101), + (hex!("01222222223333333344444444550000036f0000000000003700"), 0x0491a1), + (hex!("0122222222333333334444444455000003700000000000003710"), 0x049241), + (hex!("0122222222333333334444444455000003700000000000005aa0"), 0x0492e1), + (hex!("0122222222333333334444444455000003710000000000003720"), 0x049381), + (hex!("0122222222333333334444444455000003710000000000005130"), 0x049421), + (hex!("0122222222333333334444444455000003710000000000006fc0"), 0x0494c1), + (hex!("0122222222333333334444444455000003710000000000007b00"), 0x049561), + (hex!("0122222222333333334444444455000003720000000000003730"), 0x049601), + (hex!("01222222223333333344444444550000037200000000000054d0"), 0x0496a1), + (hex!("0122222222333333334444444455000003730000000000003740"), 0x049741), + (hex!("0122222222333333334444444455000003730000000000004220"), 0x0497e1), + (hex!("0122222222333333334444444455000003740000000000003750"), 0x049881), + (hex!("0122222222333333334444444455000003740000000000004720"), 0x049921), + (hex!("0122222222333333334444444455000003750000000000003760"), 0x0499c1), + (hex!("0122222222333333334444444455000003750000000000004110"), 0x049a61), + (hex!("0122222222333333334444444455000003760000000000003770"), 0x049b01), + (hex!("0122222222333333334444444455000003770000000000003780"), 0x049ba1), + (hex!("0122222222333333334444444455000003780000000000003790"), 0x049c41), + (hex!("0122222222333333334444444455000003780000000000004b40"), 0x049ce1), + (hex!("0122222222333333334444444455000003780000000000005660"), 0x049d81), + (hex!("0122222222333333334444444455000003780000000000005ea0"), 0x049e21), + (hex!("01222222223333333344444444550000037900000000000037a0"), 0x049ec1), + (hex!("01222222223333333344444444550000037a00000000000037b0"), 0x049f61), + (hex!("01222222223333333344444444550000037b00000000000037c0"), 0x04a001), + (hex!("01222222223333333344444444550000037c00000000000037d0"), 0x04a0a1), + 
(hex!("01222222223333333344444444550000037c0000000000004340"), 0x04a141), + (hex!("01222222223333333344444444550000037c0000000000005230"), 0x04a1e1), + (hex!("01222222223333333344444444550000037d00000000000037e0"), 0x04a281), + (hex!("01222222223333333344444444550000037d00000000000051e0"), 0x04a321), + (hex!("01222222223333333344444444550000037e00000000000037f0"), 0x04a3c1), + (hex!("01222222223333333344444444550000037e0000000000004090"), 0x04a461), + (hex!("01222222223333333344444444550000037e0000000000005c20"), 0x04a501), + (hex!("01222222223333333344444444550000037f0000000000003800"), 0x04a5a1), + (hex!("0122222222333333334444444455000003800000000000003810"), 0x04a641), + (hex!("0122222222333333334444444455000003800000000000007630"), 0x04a6e1), + (hex!("0122222222333333334444444455000003810000000000003820"), 0x04a781), + (hex!("0122222222333333334444444455000003820000000000003830"), 0x04a821), + (hex!("0122222222333333334444444455000003820000000000004170"), 0x04a8c1), + (hex!("0122222222333333334444444455000003830000000000003840"), 0x04a961), + (hex!("0122222222333333334444444455000003840000000000003850"), 0x04aa01), + (hex!("0122222222333333334444444455000003850000000000003860"), 0x04aaa1), + (hex!("0122222222333333334444444455000003850000000000004180"), 0x04ab41), + (hex!("0122222222333333334444444455000003850000000000005c90"), 0x04abe1), + (hex!("0122222222333333334444444455000003850000000000005da0"), 0x04ac81), + (hex!("0122222222333333334444444455000003850000000000006ff0"), 0x04ad21), + (hex!("0122222222333333334444444455000003860000000000003870"), 0x04adc1), + (hex!("01222222223333333344444444550000038600000000000065c0"), 0x04ae61), + (hex!("0122222222333333334444444455000003870000000000003880"), 0x04af01), + (hex!("0122222222333333334444444455000003870000000000007cc0"), 0x04afa1), + (hex!("0122222222333333334444444455000003880000000000003890"), 0x04b041), + (hex!("01222222223333333344444444550000038900000000000038a0"), 0x04b0e1), + (hex!("01222222223333333344444444550000038a00000000000038b0"), 0x04b181), + (hex!("01222222223333333344444444550000038a00000000000073e0"), 0x04b221), + (hex!("01222222223333333344444444550000038b00000000000038c0"), 0x04b2c1), + (hex!("01222222223333333344444444550000038c00000000000038d0"), 0x04b361), + (hex!("01222222223333333344444444550000038d00000000000038e0"), 0x04b401), + (hex!("01222222223333333344444444550000038d00000000000069f0"), 0x04b4a1), + (hex!("01222222223333333344444444550000038d0000000000007680"), 0x04b541), + (hex!("01222222223333333344444444550000038e00000000000038f0"), 0x04b5e1), + (hex!("01222222223333333344444444550000038f0000000000003900"), 0x04b681), + (hex!("01222222223333333344444444550000038f00000000000045b0"), 0x04b721), + (hex!("01222222223333333344444444550000038f0000000000007180"), 0x04b7c1), + (hex!("0122222222333333334444444455000003900000000000003910"), 0x04b861), + (hex!("0122222222333333334444444455000003910000000000003920"), 0x04b901), + (hex!("0122222222333333334444444455000003910000000000004a20"), 0x04b9a1), + (hex!("0122222222333333334444444455000003920000000000003930"), 0x04ba41), + (hex!("01222222223333333344444444550000039200000000000059b0"), 0x04bae1), + (hex!("0122222222333333334444444455000003930000000000003940"), 0x04bb81), + (hex!("0122222222333333334444444455000003930000000000006cc0"), 0x04bc21), + (hex!("0122222222333333334444444455000003940000000000003950"), 0x04bcc1), + (hex!("01222222223333333344444444550000039400000000000056c0"), 0x04bd61), + 
(hex!("0122222222333333334444444455000003950000000000003960"), 0x04be01), + (hex!("0122222222333333334444444455000003950000000000004cc0"), 0x04bea1), + (hex!("0122222222333333334444444455000003950000000000007720"), 0x04bf41), + (hex!("0122222222333333334444444455000003960000000000003970"), 0x04bfe1), + (hex!("0122222222333333334444444455000003960000000000004da0"), 0x04c081), + (hex!("0122222222333333334444444455000003960000000000004df0"), 0x04c121), + (hex!("0122222222333333334444444455000003960000000000004f30"), 0x04c1c1), + (hex!("01222222223333333344444444550000039600000000000050f0"), 0x04c261), + (hex!("0122222222333333334444444455000003960000000000007940"), 0x04c301), + (hex!("0122222222333333334444444455000003970000000000003980"), 0x04c3a1), + (hex!("0122222222333333334444444455000003970000000000005850"), 0x04c441), + (hex!("0122222222333333334444444455000003970000000000007bd0"), 0x04c4e1), + (hex!("0122222222333333334444444455000003980000000000003990"), 0x04c581), + (hex!("0122222222333333334444444455000003980000000000004c00"), 0x04c621), + (hex!("0122222222333333334444444455000003980000000000005580"), 0x04c6c1), + (hex!("01222222223333333344444444550000039900000000000039a0"), 0x04c761), + (hex!("0122222222333333334444444455000003990000000000005820"), 0x04c801), + (hex!("01222222223333333344444444550000039a00000000000039b0"), 0x04c8a1), + (hex!("01222222223333333344444444550000039b00000000000039c0"), 0x04c941), + (hex!("01222222223333333344444444550000039b0000000000004c10"), 0x04c9e1), + (hex!("01222222223333333344444444550000039b0000000000006460"), 0x04ca81), + (hex!("01222222223333333344444444550000039c00000000000039d0"), 0x04cb21), + (hex!("01222222223333333344444444550000039d00000000000039e0"), 0x04cbc1), + (hex!("01222222223333333344444444550000039d00000000000044c0"), 0x04cc61), + (hex!("01222222223333333344444444550000039d00000000000049e0"), 0x04cd01), + (hex!("01222222223333333344444444550000039e00000000000039f0"), 0x04cda1), + (hex!("01222222223333333344444444550000039f0000000000003a00"), 0x04ce41), + (hex!("0122222222333333334444444455000003a00000000000003a10"), 0x04cee1), + (hex!("0122222222333333334444444455000003a10000000000003a20"), 0x04cf81), + (hex!("0122222222333333334444444455000003a10000000000006a80"), 0x04d021), + (hex!("0122222222333333334444444455000003a20000000000003a30"), 0x04d0c1), + (hex!("0122222222333333334444444455000003a200000000000062b0"), 0x04d161), + (hex!("0122222222333333334444444455000003a30000000000003a40"), 0x04d201), + (hex!("0122222222333333334444444455000003a30000000000006ce0"), 0x04d2a1), + (hex!("0122222222333333334444444455000003a40000000000003a50"), 0x04d341), + (hex!("0122222222333333334444444455000003a50000000000003a60"), 0x04d3e1), + (hex!("0122222222333333334444444455000003a60000000000003a70"), 0x04d481), + (hex!("0122222222333333334444444455000003a60000000000007750"), 0x04d521), + (hex!("0122222222333333334444444455000003a70000000000003a80"), 0x04d5c1), + (hex!("0122222222333333334444444455000003a70000000000005b10"), 0x04d661), + (hex!("0122222222333333334444444455000003a80000000000003a90"), 0x04d701), + (hex!("0122222222333333334444444455000003a80000000000006c20"), 0x04d7a1), + (hex!("0122222222333333334444444455000003a90000000000003aa0"), 0x04d841), + (hex!("0122222222333333334444444455000003a90000000000005b70"), 0x04d8e1), + (hex!("0122222222333333334444444455000003a900000000000070e0"), 0x04d981), + (hex!("0122222222333333334444444455000003aa0000000000003ab0"), 0x04da21), + 
(hex!("0122222222333333334444444455000003aa00000000000049f0"), 0x04dac1), + (hex!("0122222222333333334444444455000003aa0000000000004d60"), 0x04db61), + (hex!("0122222222333333334444444455000003ab0000000000003ac0"), 0x04dc01), + (hex!("0122222222333333334444444455000003ac0000000000003ad0"), 0x04dca1), + (hex!("0122222222333333334444444455000003ac0000000000004580"), 0x04dd41), + (hex!("0122222222333333334444444455000003ad0000000000003ae0"), 0x04dde1), + (hex!("0122222222333333334444444455000003ae0000000000003af0"), 0x04de81), + (hex!("0122222222333333334444444455000003af0000000000003b00"), 0x04df21), + (hex!("0122222222333333334444444455000003b00000000000003b10"), 0x04dfc1), + (hex!("0122222222333333334444444455000003b10000000000003b20"), 0x04e061), + (hex!("0122222222333333334444444455000003b10000000000003fd0"), 0x04e101), + (hex!("0122222222333333334444444455000003b20000000000003b30"), 0x04e1a1), + (hex!("0122222222333333334444444455000003b30000000000003b40"), 0x04e241), + (hex!("0122222222333333334444444455000003b40000000000003b50"), 0x04e2e1), + (hex!("0122222222333333334444444455000003b40000000000007450"), 0x04e381), + (hex!("0122222222333333334444444455000003b50000000000003b60"), 0x04e421), + (hex!("0122222222333333334444444455000003b60000000000003b70"), 0x04e4c1), + (hex!("0122222222333333334444444455000003b70000000000003b80"), 0x04e561), + (hex!("0122222222333333334444444455000003b70000000000006d50"), 0x04e601), + (hex!("0122222222333333334444444455000003b80000000000003b90"), 0x04e6a1), + (hex!("0122222222333333334444444455000003b800000000000057c0"), 0x04e741), + (hex!("0122222222333333334444444455000003b800000000000078a0"), 0x04e7e1), + (hex!("0122222222333333334444444455000003b90000000000003ba0"), 0x04e881), + (hex!("0122222222333333334444444455000003b90000000000006750"), 0x04e921), + (hex!("0122222222333333334444444455000003ba0000000000003bb0"), 0x04e9c1), + (hex!("0122222222333333334444444455000003ba0000000000007a10"), 0x04ea61), + (hex!("0122222222333333334444444455000003ba0000000000007a20"), 0x04eb01), + (hex!("0122222222333333334444444455000003bb0000000000003bc0"), 0x04eba1), + (hex!("0122222222333333334444444455000003bb0000000000005bc0"), 0x04ec41), + (hex!("0122222222333333334444444455000003bc0000000000003bd0"), 0x04ece1), + (hex!("0122222222333333334444444455000003bc0000000000005e80"), 0x04ed81), + (hex!("0122222222333333334444444455000003bc0000000000007ab0"), 0x04ee21), + (hex!("0122222222333333334444444455000003bd0000000000003be0"), 0x04eec1), + (hex!("0122222222333333334444444455000003bd00000000000049b0"), 0x04ef61), + (hex!("0122222222333333334444444455000003be0000000000003bf0"), 0x04f001), + (hex!("0122222222333333334444444455000003be0000000000005780"), 0x04f0a1), + (hex!("0122222222333333334444444455000003be0000000000007930"), 0x04f141), + (hex!("0122222222333333334444444455000003bf0000000000003c00"), 0x04f1e1), + (hex!("0122222222333333334444444455000003bf0000000000005de0"), 0x04f281), + (hex!("0122222222333333334444444455000003bf00000000000060b0"), 0x04f321), + (hex!("0122222222333333334444444455000003bf00000000000060c0"), 0x04f3c1), + (hex!("0122222222333333334444444455000003bf0000000000006a50"), 0x04f461), + (hex!("0122222222333333334444444455000003c00000000000003c10"), 0x04f501), + (hex!("0122222222333333334444444455000003c00000000000004030"), 0x04f5a1), + (hex!("0122222222333333334444444455000003c10000000000003c20"), 0x04f641), + (hex!("0122222222333333334444444455000003c20000000000003c30"), 0x04f6e1), + 
(hex!("0122222222333333334444444455000003c200000000000040b0"), 0x04f781), + (hex!("0122222222333333334444444455000003c30000000000003c40"), 0x04f821), + (hex!("0122222222333333334444444455000003c40000000000003c50"), 0x04f8c1), + (hex!("0122222222333333334444444455000003c40000000000005ba0"), 0x04f961), + (hex!("0122222222333333334444444455000003c50000000000003c60"), 0x04fa01), + (hex!("0122222222333333334444444455000003c60000000000003c70"), 0x04faa1), + (hex!("0122222222333333334444444455000003c70000000000003c80"), 0x04fb41), + (hex!("0122222222333333334444444455000003c70000000000004270"), 0x04fbe1), + (hex!("0122222222333333334444444455000003c80000000000003c90"), 0x04fc81), + (hex!("0122222222333333334444444455000003c80000000000006e70"), 0x04fd21), + (hex!("0122222222333333334444444455000003c90000000000003ca0"), 0x04fdc1), + (hex!("0122222222333333334444444455000003ca0000000000003cb0"), 0x04fe61), + (hex!("0122222222333333334444444455000003ca0000000000006e20"), 0x04ff01), + (hex!("0122222222333333334444444455000003ca0000000000007c20"), 0x04ffa1), + (hex!("0122222222333333334444444455000003cb0000000000003cc0"), 0x050041), + (hex!("0122222222333333334444444455000003cc0000000000003cd0"), 0x0500e1), + (hex!("0122222222333333334444444455000003cc0000000000006120"), 0x050181), + (hex!("0122222222333333334444444455000003cc0000000000007950"), 0x050221), + (hex!("0122222222333333334444444455000003cd0000000000003ce0"), 0x0502c1), + (hex!("0122222222333333334444444455000003ce0000000000003cf0"), 0x050361), + (hex!("0122222222333333334444444455000003cf0000000000003d00"), 0x050401), + (hex!("0122222222333333334444444455000003d00000000000003d10"), 0x0504a1), + (hex!("0122222222333333334444444455000003d10000000000003d20"), 0x050541), + (hex!("0122222222333333334444444455000003d10000000000005e50"), 0x0505e1), + (hex!("0122222222333333334444444455000003d10000000000007880"), 0x050681), + (hex!("0122222222333333334444444455000003d20000000000003d30"), 0x050721), + (hex!("0122222222333333334444444455000003d20000000000005d00"), 0x0507c1), + (hex!("0122222222333333334444444455000003d30000000000003d40"), 0x050861), + (hex!("0122222222333333334444444455000003d30000000000005d40"), 0x050901), + (hex!("0122222222333333334444444455000003d300000000000063f0"), 0x0509a1), + (hex!("0122222222333333334444444455000003d40000000000003d50"), 0x050a41), + (hex!("0122222222333333334444444455000003d40000000000005700"), 0x050ae1), + (hex!("0122222222333333334444444455000003d400000000000078f0"), 0x050b81), + (hex!("0122222222333333334444444455000003d50000000000003d60"), 0x050c21), + (hex!("0122222222333333334444444455000003d60000000000003d70"), 0x050cc1), + (hex!("0122222222333333334444444455000003d70000000000003d80"), 0x050d61), + (hex!("0122222222333333334444444455000003d80000000000003d90"), 0x050e01), + (hex!("0122222222333333334444444455000003d80000000000006690"), 0x050ea1), + (hex!("0122222222333333334444444455000003d90000000000003da0"), 0x050f41), + (hex!("0122222222333333334444444455000003d900000000000076d0"), 0x050fe1), + (hex!("0122222222333333334444444455000003da0000000000003db0"), 0x051081), + (hex!("0122222222333333334444444455000003db0000000000003dc0"), 0x051121), + (hex!("0122222222333333334444444455000003db0000000000004a30"), 0x0511c1), + (hex!("0122222222333333334444444455000003db0000000000005390"), 0x051261), + (hex!("0122222222333333334444444455000003dc0000000000003dd0"), 0x051301), + (hex!("0122222222333333334444444455000003dc0000000000006d60"), 0x0513a1), + 
(hex!("0122222222333333334444444455000003dd0000000000003de0"), 0x051441), + (hex!("0122222222333333334444444455000003de0000000000003df0"), 0x0514e1), + (hex!("0122222222333333334444444455000003df0000000000003e00"), 0x051581), + (hex!("0122222222333333334444444455000003df0000000000005240"), 0x051621), + (hex!("0122222222333333334444444455000003df0000000000005610"), 0x0516c1), + (hex!("0122222222333333334444444455000003e00000000000003e10"), 0x051761), + (hex!("0122222222333333334444444455000003e00000000000006500"), 0x051801), + (hex!("0122222222333333334444444455000003e10000000000003e20"), 0x0518a1), + (hex!("0122222222333333334444444455000003e10000000000006a10"), 0x051941), + (hex!("0122222222333333334444444455000003e10000000000007c10"), 0x0519e1), + (hex!("0122222222333333334444444455000003e20000000000003e30"), 0x051a81), + (hex!("0122222222333333334444444455000003e20000000000006310"), 0x051b21), + (hex!("0122222222333333334444444455000003e30000000000003e40"), 0x051bc1), + (hex!("0122222222333333334444444455000003e40000000000003e50"), 0x051c61), + (hex!("0122222222333333334444444455000003e40000000000006780"), 0x051d01), + (hex!("0122222222333333334444444455000003e40000000000007ce0"), 0x051da1), + (hex!("0122222222333333334444444455000003e50000000000003e60"), 0x051e41), + (hex!("0122222222333333334444444455000003e60000000000003e70"), 0x051ee1), + (hex!("0122222222333333334444444455000003e60000000000005040"), 0x051f81), + (hex!("0122222222333333334444444455000003e60000000000005bf0"), 0x052021), + (hex!("0122222222333333334444444455000003e70000000000003e80"), 0x0520c1), + (hex!("0122222222333333334444444455000003e70000000000003f50"), 0x052161), +]; diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index d0afce1549..08e635f073 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -16,40 +16,43 @@ //! Every image layer file consists of three parts: "summary", //! "index", and "values". The summary is a fixed size header at the //! beginning of the file, and it contains basic information about the -//! layer, and offsets to the other parts. The "index" is a serialized -//! HashMap, mapping from Key to an offset in the "values" part. The +//! layer, and offsets to the other parts. The "index" is a B-tree, +//! mapping from Key to an offset in the "values" part. The //! actual page images are stored in the "values" part. -//! -//! Only the "index" is loaded into memory by the load function. -//! When images are needed, they are read directly from disk. -//! 
use crate::config::PageServerConf; use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; -use crate::layered_repository::block_io::{BlockReader, FileBlockReader}; +use crate::layered_repository::block_io::{BlockBuf, BlockReader, FileBlockReader}; +use crate::layered_repository::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::layered_repository::filename::{ImageFileName, PathOrConf}; use crate::layered_repository::storage_layer::{ Layer, ValueReconstructResult, ValueReconstructState, }; use crate::page_cache::PAGE_SZ; -use crate::repository::{Key, Value}; +use crate::repository::{Key, Value, KEY_SIZE}; use crate::virtual_file::VirtualFile; use crate::{ZTenantId, ZTimelineId}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; +use hex; use log::*; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; use std::fs; use std::io::Write; use std::io::{Seek, SeekFrom}; use std::ops::Range; use std::path::{Path, PathBuf}; -use std::sync::{RwLock, RwLockReadGuard, TryLockError}; +use std::sync::{RwLock, RwLockReadGuard}; use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::Lsn; +/// +/// Header stored in the beginning of the file +/// +/// After this comes the 'values' part, starting on block 1. After that, +/// the 'index' starts at the block indicated by 'index_start_blk' +/// #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] struct Summary { /// Magic value to identify this as a zenith image file. Always IMAGE_FILE_MAGIC. @@ -63,6 +66,9 @@ struct Summary { /// Block number where the 'index' part of the file begins. index_start_blk: u32, + /// Block within the 'index', where the B-tree root page is stored + index_root_blk: u32, + // the 'values' part starts after the summary header, on block 1. } impl From<&ImageLayer> for Summary { @@ -73,10 +79,10 @@ impl From<&ImageLayer> for Summary { tenantid: layer.tenantid, timelineid: layer.timelineid, key_range: layer.key_range.clone(), - lsn: layer.lsn, index_start_blk: 0, + index_root_blk: 0, } } } @@ -104,11 +110,9 @@ pub struct ImageLayerInner { /// If false, the 'index' has not been loaded into memory yet. loaded: bool, - /// offset of each value - index: HashMap, - // values copied from summary index_start_blk: u32, + index_root_blk: u32, /// Reader object for reading blocks from the file. (None if not loaded yet) file: Option>, @@ -147,21 +151,21 @@ impl Layer for ImageLayer { assert!(lsn_range.end >= self.lsn); let inner = self.load()?; - if let Some(&offset) = inner.index.get(&key) { - let buf = inner - .file - .as_ref() - .unwrap() - .block_cursor() - .read_blob(offset) - .with_context(|| { - format!( - "failed to read blob from data file {} at offset {}", - self.filename().display(), - offset - ) - })?; - let value = Bytes::from(buf); + + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file); + + let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; + key.write_to_byte_slice(&mut keybuf); + if let Some(offset) = tree_reader.get(&keybuf)? 
{ + let blob = file.block_cursor().read_blob(offset).with_context(|| { + format!( + "failed to read value from data file {} at offset {}", + self.filename().display(), + offset + ) + })?; + let value = Bytes::from(blob); reconstruct_state.img = Some((self.lsn, value)); Ok(ValueReconstructResult::Complete) @@ -174,33 +178,6 @@ impl Layer for ImageLayer { todo!(); } - fn unload(&self) -> Result<()> { - // Unload the index. - // - // TODO: we should access the index directly from pages on the disk, - // using the buffer cache. This load/unload mechanism is really ad hoc. - - // FIXME: In debug mode, loading and unloading the index slows - // things down so much that you get timeout errors. At least - // with the test_parallel_copy test. So as an even more ad hoc - // stopgap fix for that, only unload every on average 10 - // checkpoint cycles. - use rand::RngCore; - if rand::thread_rng().next_u32() > (u32::MAX / 10) { - return Ok(()); - } - - let mut inner = match self.inner.try_write() { - Ok(inner) => inner, - Err(TryLockError::WouldBlock) => return Ok(()), - Err(TryLockError::Poisoned(_)) => panic!("ImageLayer lock was poisoned"), - }; - inner.index = HashMap::default(); - inner.loaded = false; - - Ok(()) - } - fn delete(&self) -> Result<()> { // delete underlying file fs::remove_file(self.path())?; @@ -227,10 +204,16 @@ impl Layer for ImageLayer { } let inner = self.load()?; + let file = inner.file.as_ref().unwrap(); + let tree_reader = + DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file); - for (key, offset) in inner.index.iter() { - println!("key: {} offset {}", key, offset); - } + tree_reader.dump()?; + + tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| { + println!("key: {} offset {}", hex::encode(key), value); + true + })?; Ok(()) } @@ -300,6 +283,7 @@ impl ImageLayer { PathOrConf::Conf(_) => { let mut expected_summary = Summary::from(self); expected_summary.index_start_blk = actual_summary.index_start_blk; + expected_summary.index_root_blk = actual_summary.index_root_blk; if actual_summary != expected_summary { bail!("in-file summary does not match expected summary. 
actual = {:?} expected = {:?}", actual_summary, expected_summary); @@ -319,17 +303,8 @@ impl ImageLayer { } } - file.file.seek(SeekFrom::Start( - actual_summary.index_start_blk as u64 * PAGE_SZ as u64, - ))?; - let mut buf_reader = std::io::BufReader::new(&mut file.file); - let index = HashMap::des_from(&mut buf_reader)?; - inner.index_start_blk = actual_summary.index_start_blk; - - info!("loaded from {}", &path.display()); - - inner.index = index; + inner.index_root_blk = actual_summary.index_root_blk; inner.loaded = true; Ok(()) } @@ -348,10 +323,10 @@ impl ImageLayer { key_range: filename.key_range.clone(), lsn: filename.lsn, inner: RwLock::new(ImageLayerInner { - index: HashMap::new(), loaded: false, file: None, index_start_blk: 0, + index_root_blk: 0, }), } } @@ -376,9 +351,9 @@ impl ImageLayer { lsn: summary.lsn, inner: RwLock::new(ImageLayerInner { file: None, - index: HashMap::new(), loaded: false, index_start_blk: 0, + index_root_blk: 0, }), }) } @@ -420,9 +395,8 @@ pub struct ImageLayerWriter { key_range: Range, lsn: Lsn, - index: HashMap, - blob_writer: WriteBlobWriter, + tree: DiskBtreeBuilder, } impl ImageLayerWriter { @@ -447,9 +421,15 @@ impl ImageLayerWriter { }, ); info!("new image layer {}", path.display()); - let file = VirtualFile::create(&path)?; + let mut file = VirtualFile::create(&path)?; + // make room for the header block + file.seek(SeekFrom::Start(PAGE_SZ as u64))?; let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64); + // Initialize the b-tree index builder + let block_buf = BlockBuf::new(); + let tree_builder = DiskBtreeBuilder::new(block_buf); + let writer = ImageLayerWriter { conf, _path: path, @@ -457,7 +437,7 @@ impl ImageLayerWriter { tenantid, key_range: key_range.clone(), lsn, - index: HashMap::new(), + tree: tree_builder, blob_writer, }; @@ -473,8 +453,9 @@ impl ImageLayerWriter { ensure!(self.key_range.contains(&key)); let off = self.blob_writer.write_blob(img)?; - let old = self.index.insert(key, off); - assert!(old.is_none()); + let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; + key.write_to_byte_slice(&mut keybuf); + self.tree.append(&keybuf, off)?; Ok(()) } @@ -486,9 +467,11 @@ impl ImageLayerWriter { let mut file = self.blob_writer.into_inner(); // Write out the index - let buf = HashMap::ser(&self.index)?; file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?; - file.write_all(&buf)?; + let (index_root_blk, block_buf) = self.tree.finish()?; + for buf in block_buf.blocks { + file.write_all(buf.as_ref())?; + } // Fill in the summary on blk 0 let summary = Summary { @@ -499,6 +482,7 @@ impl ImageLayerWriter { key_range: self.key_range.clone(), lsn: self.lsn, index_start_blk, + index_root_blk, }; file.seek(SeekFrom::Start(0))?; Summary::ser_into(&summary, &mut file)?; @@ -514,9 +498,9 @@ impl ImageLayerWriter { lsn: self.lsn, inner: RwLock::new(ImageLayerInner { loaded: false, - index: HashMap::new(), file: None, index_start_blk, + index_root_blk, }), }; trace!("created image layer {}", layer.path().display()); diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 8a24528732..a45af51487 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -166,13 +166,6 @@ impl Layer for InMemoryLayer { todo!(); } - /// Cannot unload anything in an in-memory layer, since there's no backing - /// store. To release memory used by an in-memory layer, use 'freeze' to turn - /// it into an on-disk layer. 
- fn unload(&self) -> Result<()> { - Ok(()) - } - /// Nothing to do here. When you drop the last reference to the layer, it will /// be deallocated. fn delete(&self) -> Result<()> { diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index 5ad43182f6..e413f311c3 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -134,10 +134,6 @@ pub trait Layer: Send + Sync { /// Iterate through all keys and values stored in the layer fn iter(&self) -> Box> + '_>; - /// Release memory used by this layer. There is no corresponding 'load' - /// function, that's done implicitly when you call one of the get-functions. - fn unload(&self) -> Result<()>; - /// Permanently remove this layer from disk. fn delete(&self) -> Result<()>; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 6d2631b2b1..6dddef5f27 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -38,7 +38,7 @@ use pgdatadir_mapping::DatadirTimeline; /// This is embedded in the metadata file, and also in the header of all the /// layer files. If you make any backwards-incompatible changes to the storage /// format, bump this! -pub const STORAGE_FORMAT_VERSION: u16 = 2; +pub const STORAGE_FORMAT_VERSION: u16 = 3; // Magic constants used to identify different kinds of files pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 7e998b0ebe..02334d3229 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -3,6 +3,7 @@ use crate::remote_storage::RemoteIndex; use crate::walrecord::ZenithWalRecord; use crate::CheckpointConfig; use anyhow::{bail, Result}; +use byteorder::{ByteOrder, BE}; use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::fmt; @@ -27,6 +28,8 @@ pub struct Key { pub field6: u32, } +pub const KEY_SIZE: usize = 18; + impl Key { pub fn next(&self) -> Key { self.add(1) @@ -61,7 +64,7 @@ impl Key { key } - pub fn from_array(b: [u8; 18]) -> Self { + pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], field2: u32::from_be_bytes(b[1..5].try_into().unwrap()), @@ -71,6 +74,15 @@ impl Key { field6: u32::from_be_bytes(b[14..18].try_into().unwrap()), } } + + pub fn write_to_byte_slice(&self, buf: &mut [u8]) { + buf[0] = self.field1; + BE::write_u32(&mut buf[1..5], self.field2); + BE::write_u32(&mut buf[5..9], self.field3); + BE::write_u32(&mut buf[9..13], self.field4); + buf[13] = self.field5; + BE::write_u32(&mut buf[14..18], self.field6); + } } pub fn key_range_size(key_range: &Range) -> u32 { @@ -569,7 +581,7 @@ mod tests { use lazy_static::lazy_static; lazy_static! 
{ - static ref TEST_KEY: Key = Key::from_array(hex!("112222222233333333444444445500000001")); + static ref TEST_KEY: Key = Key::from_slice(&hex!("112222222233333333444444445500000001")); } #[test] From 8e2a6661e901562ee72c70436a350b4af81968a2 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 11 Apr 2022 20:36:26 +0300 Subject: [PATCH 0117/1022] Make wal_storage initialization eager (#1489) --- walkeeper/src/safekeeper.rs | 18 ++++++++++-------- walkeeper/src/timeline.rs | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/walkeeper/src/safekeeper.rs b/walkeeper/src/safekeeper.rs index 307a67e5f3..1e23d87b34 100644 --- a/walkeeper/src/safekeeper.rs +++ b/walkeeper/src/safekeeper.rs @@ -517,14 +517,16 @@ where pub fn new( ztli: ZTimelineId, control_store: CTRL, - wal_store: WAL, + mut wal_store: WAL, state: SafeKeeperState, - ) -> SafeKeeper { + ) -> Result> { if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id { - panic!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id); + bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id); } - SafeKeeper { + wal_store.init_storage(&state)?; + + Ok(SafeKeeper { metrics: SafeKeeperMetrics::new(state.tenant_id, ztli), global_commit_lsn: state.commit_lsn, epoch_start_lsn: Lsn(0), @@ -537,7 +539,7 @@ where s: state, control_store, wal_store, - } + }) } /// Get history of term switches for the available WAL @@ -877,7 +879,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::empty()); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::empty()).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -892,7 +894,7 @@ mod tests { let storage = InMemoryState { persisted_state: state.clone(), }; - sk = SafeKeeper::new(ztli, storage, sk.wal_store, state); + sk = SafeKeeper::new(ztli, storage, sk.wal_store, state).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -909,7 +911,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::empty()); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::empty()).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/walkeeper/src/timeline.rs b/walkeeper/src/timeline.rs index b10ab97cc1..a76ef77615 100644 --- a/walkeeper/src/timeline.rs +++ b/walkeeper/src/timeline.rs @@ -100,7 +100,7 @@ impl SharedState { let state = SafeKeeperState::new(zttid, peer_ids); let control_store = control_file::FileStorage::new(zttid, conf); let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); - let mut sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, state); + let mut sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, state)?; sk.control_store.persist(&sk.s)?; Ok(Self { @@ -127,7 +127,7 @@ impl SharedState { Ok(Self { notified_commit_lsn: Lsn(0), - sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, state), + sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, state)?, replicas: Vec::new(), active: false, num_computes: 0, From 
db63fa64ae863187bb044f569ad8aa63c9f5e58b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 29 Oct 2021 23:21:40 +0300 Subject: [PATCH 0118/1022] Use rusoto lib for S3 relish_storage impl --- Cargo.lock | 3394 ----------------- pageserver/Cargo.toml | 6 +- pageserver/src/remote_storage.rs | 8 +- pageserver/src/remote_storage/README.md | 12 - .../{rust_s3.rs => s3_bucket.rs} | 247 +- 5 files changed, 135 insertions(+), 3532 deletions(-) delete mode 100644 Cargo.lock rename pageserver/src/remote_storage/{rust_s3.rs => s3_bucket.rs} (68%) diff --git a/Cargo.lock b/Cargo.lock deleted file mode 100644 index 19ccd18a10..0000000000 --- a/Cargo.lock +++ /dev/null @@ -1,3394 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -version = 3 - -[[package]] -name = "addr2line" -version = "0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9ecd88a8c8378ca913a680cd98f0f13ac67383d35993f86c90a70e3f137816b" -dependencies = [ - "gimli", -] - -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - -[[package]] -name = "ahash" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e" - -[[package]] -name = "ahash" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" -dependencies = [ - "getrandom", - "once_cell", - "version_check", -] - -[[package]] -name = "aho-corasick" -version = "0.7.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" -dependencies = [ - "memchr", -] - -[[package]] -name = "ansi_term" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" -dependencies = [ - "winapi", -] - -[[package]] -name = "anyhow" -version = "1.0.53" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94a45b455c14666b85fc40a019e8ab9eb75e3a124e05494f5397122bc9eb06e0" -dependencies = [ - "backtrace", -] - -[[package]] -name = "async-compression" -version = "0.3.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2bf394cfbbe876f0ac67b13b6ca819f9c9f2fb9ec67223cceb1555fbab1c31a" -dependencies = [ - "futures-core", - "memchr", - "pin-project-lite", - "tokio", - "zstd", - "zstd-safe", -] - -[[package]] -name = "async-stream" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "171374e7e3b2504e0e5236e3b59260560f9fe94bfe9ac39ba5e4e929c5590625" -dependencies = [ - "async-stream-impl", - "futures-core", -] - -[[package]] -name = "async-stream-impl" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "648ed8c8d2ce5409ccd57453d9d1b214b342a0d69376a6feda1fd6cae3299308" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "async-trait" -version = "0.1.52" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061a7acccaa286c011ddc30970520b98fa40e00c9d644633fb26b5fc63a265e3" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "attohttpc" -version = "0.18.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e69e13a99a7e6e070bb114f7ff381e58c7ccc188630121fc4c2fe4bcf24cd072" -dependencies = [ - "http", - "log", - "rustls 0.20.2", - "serde", - "serde_json", - "url", - "webpki 0.22.0", - "webpki-roots", - "wildmatch", -] - -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi", -] - -[[package]] -name = "autocfg" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" - -[[package]] -name = "aws-creds" -version = "0.27.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "460a75eac8f3cb7683e0a9a588a83c3ff039331ea7bfbfbfcecf1dacab276e11" -dependencies = [ - "anyhow", - "attohttpc", - "dirs", - "rust-ini", - "serde", - "serde-xml-rs", - "serde_derive", - "url", -] - -[[package]] -name = "aws-region" -version = "0.23.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e37c2dc2c9047311911ef175e0ffbb3853f17c32b72cf3d562f455e5ff77267" -dependencies = [ - "anyhow", -] - -[[package]] -name = "backtrace" -version = "0.3.64" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e121dee8023ce33ab248d9ce1493df03c3b38a659b240096fcbd7048ff9c31f" -dependencies = [ - "addr2line", - "cc", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", -] - -[[package]] -name = "base64" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff" - -[[package]] -name = "base64" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" - -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - -[[package]] -name = "bindgen" -version = "0.59.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8" -dependencies = [ - "bitflags", - "cexpr", - "clang-sys", - "clap 2.34.0", - "env_logger", - "lazy_static", - "lazycell", - "log", - "peeking_take_while", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "which", -] - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "block-buffer" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" -dependencies = [ - "generic-array", -] - -[[package]] -name = "boxfnonce" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426" - -[[package]] -name = "bstr" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" -dependencies = [ - "lazy_static", - "memchr", - 
"regex-automata", - "serde", -] - -[[package]] -name = "bumpalo" -version = "3.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a45a46ab1f2412e53d3a0ade76ffad2025804294569aae387231a0cd6e0899" - -[[package]] -name = "byteorder" -version = "1.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" - -[[package]] -name = "bytes" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" -dependencies = [ - "serde", -] - -[[package]] -name = "cast" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" -dependencies = [ - "rustc_version", -] - -[[package]] -name = "cc" -version = "1.0.72" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee" -dependencies = [ - "jobserver", -] - -[[package]] -name = "cexpr" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" -dependencies = [ - "nom", -] - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "chrono" -version = "0.4.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" -dependencies = [ - "libc", - "num-integer", - "num-traits", - "time", - "winapi", -] - -[[package]] -name = "clang-sys" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cc00842eed744b858222c4c9faf7243aafc6d33f92f96935263ef4d8a41ce21" -dependencies = [ - "glob", - "libc", - "libloading", -] - -[[package]] -name = "clap" -version = "2.34.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" -dependencies = [ - "ansi_term", - "atty", - "bitflags", - "strsim 0.8.0", - "textwrap 0.11.0", - "unicode-width", - "vec_map", -] - -[[package]] -name = "clap" -version = "3.0.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b63edc3f163b3c71ec8aa23f9bd6070f77edbf3d1d198b164afa90ff00e4ec62" -dependencies = [ - "atty", - "bitflags", - "indexmap", - "os_str_bytes", - "strsim 0.10.0", - "termcolor", - "textwrap 0.14.2", -] - -[[package]] -name = "combine" -version = "4.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50b727aacc797f9fc28e355d21f34709ac4fc9adecfe470ad07b8f4464f53062" -dependencies = [ - "bytes", - "memchr", -] - -[[package]] -name = "compute_tools" -version = "0.1.0" -dependencies = [ - "anyhow", - "chrono", - "clap 3.0.14", - "env_logger", - "hyper", - "libc", - "log", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", - "regex", - "serde", - "serde_json", - "tar", - "tokio", - "workspace_hack", -] - -[[package]] -name = "const_format" -version = "0.2.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22bc6cd49b0ec407b680c3e380182b6ac63b73991cb7602de350352fc309b614" -dependencies = [ 
- "const_format_proc_macros", -] - -[[package]] -name = "const_format_proc_macros" -version = "0.2.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef196d5d972878a48da7decb7686eded338b4858fbabeed513d63a7c98b2b82d" -dependencies = [ - "proc-macro2", - "quote", - "unicode-xid", -] - -[[package]] -name = "control_plane" -version = "0.1.0" -dependencies = [ - "anyhow", - "lazy_static", - "nix", - "pageserver", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "regex", - "reqwest", - "serde", - "serde_with", - "tar", - "thiserror", - "toml", - "url", - "walkeeper", - "workspace_hack", - "zenith_utils", -] - -[[package]] -name = "cpufeatures" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" -dependencies = [ - "libc", -] - -[[package]] -name = "crc32c" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee6b9c9389584bcba988bd0836086789b7f87ad91892d6a83d5291dbb24524b5" -dependencies = [ - "rustc_version", -] - -[[package]] -name = "criterion" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" -dependencies = [ - "atty", - "cast", - "clap 2.34.0", - "criterion-plot", - "csv", - "itertools", - "lazy_static", - "num-traits", - "oorandom", - "plotters", - "rayon", - "regex", - "serde", - "serde_cbor", - "serde_derive", - "serde_json", - "tinytemplate", - "walkdir", -] - -[[package]] -name = "criterion-plot" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" -dependencies = [ - "cast", - "itertools", -] - -[[package]] -name = "crossbeam-channel" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e54ea8bc3fb1ee042f5aace6e3c6e025d3874866da222930f70ce62aceba0bfa" -dependencies = [ - "cfg-if", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-deque" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" -dependencies = [ - "cfg-if", - "crossbeam-epoch", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c00d6d2ea26e8b151d99093005cb442fb9a37aeaca582a03ec70946f49ab5ed9" -dependencies = [ - "cfg-if", - "crossbeam-utils", - "lazy_static", - "memoffset", - "scopeguard", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e5bed1f1c269533fa816a0a5492b3545209a205ca1a54842be180eb63a16a6" -dependencies = [ - "cfg-if", - "lazy_static", -] - -[[package]] -name = "crypto-mac" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff07008ec701e8028e2ceb8f83f0e4274ee62bd2dbdc4fefff2e9a91824081a" -dependencies = [ - "generic-array", - "subtle", -] - -[[package]] -name = "crypto-mac" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1d1a86f49236c215f271d40892d5fc950490551400b02ef360692c29815c714" -dependencies = [ - "generic-array", - "subtle", -] - -[[package]] -name = "csv" -version = 
"1.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" -dependencies = [ - "bstr", - "csv-core", - "itoa 0.4.8", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" -dependencies = [ - "memchr", -] - -[[package]] -name = "daemonize" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70c24513e34f53b640819f0ac9f705b673fcf4006d7aab8778bee72ebfc89815" -dependencies = [ - "boxfnonce", - "libc", -] - -[[package]] -name = "darling" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0d720b8683f8dd83c65155f0530560cba68cd2bf395f6513a483caee57ff7f4" -dependencies = [ - "darling_core", - "darling_macro", -] - -[[package]] -name = "darling_core" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a340f241d2ceed1deb47ae36c4144b2707ec7dd0b649f894cb39bb595986324" -dependencies = [ - "fnv", - "ident_case", - "proc-macro2", - "quote", - "strsim 0.10.0", - "syn", -] - -[[package]] -name = "darling_macro" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72c41b3b7352feb3211a0d743dc5700a4e3b60f51bd2b368892d1e0f9a95f44b" -dependencies = [ - "darling_core", - "quote", - "syn", -] - -[[package]] -name = "digest" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" -dependencies = [ - "generic-array", -] - -[[package]] -name = "dirs" -version = "4.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" -dependencies = [ - "dirs-sys", -] - -[[package]] -name = "dirs-sys" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03d86534ed367a67548dc68113a0f5db55432fdfbb6e6f9d77704397d95d5780" -dependencies = [ - "libc", - "redox_users", - "winapi", -] - -[[package]] -name = "dlv-list" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68df3f2b690c1b86e65ef7830956aededf3cb0a16f898f79b9a6f421a7b6211b" -dependencies = [ - "rand", -] - -[[package]] -name = "either" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" - -[[package]] -name = "encoding_rs" -version = "0.8.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dc8abb250ffdda33912550faa54c88ec8b998dec0b2c55ab224921ce11df" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "env_logger" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b2cf0344971ee6c64c31be0d530793fba457d322dfec2810c453d0ef228f9c3" -dependencies = [ - "atty", - "humantime", - "log", - "regex", - "termcolor", -] - -[[package]] -name = "etcd-client" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585de5039d1ecce74773db49ba4e8107e42be7c2cd0b1a9e7fce27181db7b118" -dependencies = [ - "http", - "prost", - "tokio", - "tokio-stream", - "tonic", - "tonic-build", - "tower-service", -] - -[[package]] -name = "fail" 
-version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3245a0ca564e7f3c797d20d833a6870f57a728ac967d5225b3ffdef4465011" -dependencies = [ - "lazy_static", - "log", - "rand", -] - -[[package]] -name = "fallible-iterator" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" - -[[package]] -name = "fastrand" -version = "1.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3fcf0cee53519c866c09b5de1f6c56ff9d647101f81c1964fa632e148896cdf" -dependencies = [ - "instant", -] - -[[package]] -name = "filetime" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "975ccf83d8d9d0d84682850a38c8169027be83368805971cc4f238c2b245bc98" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "winapi", -] - -[[package]] -name = "fixedbitset" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "279fb028e20b3c4c320317955b77c5e0c9701f05a1d309905d6fc702cdc5053e" - -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "form_urlencoded" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" -dependencies = [ - "matches", - "percent-encoding", -] - -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi", -] - -[[package]] -name = "futures" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f73fe65f54d1e12b726f517d3e2135ca3125a437b6d998caf1962961f7172d9e" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3083ce4b914124575708913bca19bfe887522d6e2e6d0952943f5eac4a74010" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" - -[[package]] -name = "futures-executor" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9420b90cfa29e327d0429f19be13e7ddb68fa1cccb09d65e5706b8c7a749b8a6" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b" - -[[package]] -name = "futures-macro" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "futures-sink" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"21163e139fa306126e6eedaf49ecdb4588f939600f0b1e770f4205ee4b7fa868" - -[[package]] -name = "futures-task" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c66a976bf5909d801bbef33416c41372779507e7a6b3a5e25e4749c58f776a" - -[[package]] -name = "futures-util" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" -dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "slab", -] - -[[package]] -name = "generic-array" -version = "0.14.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd48d33ec7f05fbfa152300fdad764757cbded343c1aa1cff2fbaf4134851803" -dependencies = [ - "typenum", - "version_check", -] - -[[package]] -name = "getrandom" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418d37c8b1d42553c93648be529cb70f920d3baf8ef469b74b9638df426e0b4c" -dependencies = [ - "cfg-if", - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", -] - -[[package]] -name = "gimli" -version = "0.26.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78cc372d058dcf6d5ecd98510e7fbc9e5aec4d21de70f65fea8fecebcd881bd4" - -[[package]] -name = "git-version" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6b0decc02f4636b9ccad390dcbe77b722a77efedfa393caf8379a51d5c61899" -dependencies = [ - "git-version-macro", - "proc-macro-hack", -] - -[[package]] -name = "git-version-macro" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe69f1cbdb6e28af2bac214e943b99ce8a0a06b447d15d3e61161b0423139f3f" -dependencies = [ - "proc-macro-hack", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "glob" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" - -[[package]] -name = "h2" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9f1f717ddc7b2ba36df7e871fd88db79326551d3d6f1fc406fbfd28b582ff8e" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http", - "indexmap", - "slab", - "tokio", - "tokio-util 0.6.9", - "tracing", -] - -[[package]] -name = "half" -version = "1.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" - -[[package]] -name = "hashbrown" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" -dependencies = [ - "ahash 0.4.7", -] - -[[package]] -name = "hashbrown" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" -dependencies = [ - "ahash 0.7.6", -] - -[[package]] -name = "heck" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" -dependencies = [ - "unicode-segmentation", -] - -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - -[[package]] -name = "hex" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" -dependencies = [ - "serde", -] - -[[package]] -name = "hex-literal" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ebdb29d2ea9ed0083cd8cece49bbd968021bd99b0849edb4a9a7ee0fdf6a4e0" - -[[package]] -name = "hmac" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1441c6b1e930e2817404b5046f1f989899143a12bf92de603b69f4e0aee1e15" -dependencies = [ - "crypto-mac 0.10.1", - "digest", -] - -[[package]] -name = "hmac" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a2a2320eb7ec0ebe8da8f744d7812d9fc4cb4d09344ac01898dbcb6a20ae69b" -dependencies = [ - "crypto-mac 0.11.1", - "digest", -] - -[[package]] -name = "http" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f4c6746584866f0feabcc69893c5b51beef3831656a968ed7ae254cdc4fd03" -dependencies = [ - "bytes", - "fnv", - "itoa 1.0.1", -] - -[[package]] -name = "http-body" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ff4f84919677303da5f147645dbea6b1881f368d03ac84e1dc09031ebd7b2c6" -dependencies = [ - "bytes", - "http", - "pin-project-lite", -] - -[[package]] -name = "httparse" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9100414882e15fb7feccb4897e5f0ff0ff1ca7d1a86a23208ada4d7a18e6c6c4" - -[[package]] -name = "httpdate" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" - -[[package]] -name = "humantime" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" - -[[package]] -name = "hyper" -version = "0.14.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7ec3e62bdc98a2f0393a5048e4c30ef659440ea6e0e572965103e72bd836f55" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "httparse", - "httpdate", - "itoa 0.4.8", - "pin-project-lite", - "socket2", - "tokio", - "tower-service", - "tracing", - "want", -] - -[[package]] -name = "hyper-rustls" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d87c48c02e0dc5e3b849a2041db3029fd066650f8f717c07bf8ed78ccb895cac" -dependencies = [ - "http", - "hyper", - "rustls 0.20.2", - "tokio", - "tokio-rustls 0.23.2", -] - -[[package]] -name = "hyper-timeout" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" -dependencies = [ - "hyper", - "pin-project-lite", - "tokio", - "tokio-io-timeout", -] - -[[package]] -name = "ident_case" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" - -[[package]] -name = "idna" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" -dependencies = [ - "matches", - "unicode-bidi", - "unicode-normalization", -] - -[[package]] -name = "indexmap" -version = "1.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282a6247722caba404c065016bbfa522806e51714c34f5dfc3e4a3a46fcb4223" -dependencies = [ - "autocfg", - "hashbrown 0.11.2", -] - -[[package]] -name = "instant" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "ipnet" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9" - -[[package]] -name = "itertools" -version = "0.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" - -[[package]] -name = "itoa" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" - -[[package]] -name = "jobserver" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" -dependencies = [ - "libc", -] - -[[package]] -name = "js-sys" -version = "0.3.56" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a38fc24e30fd564ce974c02bf1d337caddff65be6cc4735a1f7eab22a7440f04" -dependencies = [ - "wasm-bindgen", -] - -[[package]] -name = "jsonwebtoken" -version = "7.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afabcc15e437a6484fc4f12d0fd63068fe457bf93f1c148d3d9649c60b103f32" -dependencies = [ - "base64 0.12.3", - "pem 0.8.3", - "ring", - "serde", - "serde_json", - "simple_asn1", -] - -[[package]] -name = "kstring" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b310ccceade8121d7d77fee406160e457c2f4e7c7982d589da3499bc7ea4526" -dependencies = [ - "serde", -] - -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - -[[package]] -name = "libc" -version = "0.2.117" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c" - -[[package]] -name = "libloading" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efbc0f03f9a775e9f6aed295c6a1ba2253c5757a9e03d55c6caa46a681abcddd" -dependencies = [ - "cfg-if", - "winapi", -] - -[[package]] -name = "lock_api" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88943dd7ef4a2e5a4bfa2753aaab3013e34ce2533d1996fb18ef591e315e2b3b" -dependencies = [ - "scopeguard", -] - -[[package]] -name = 
"log" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" -dependencies = [ - "cfg-if", - "serde", -] - -[[package]] -name = "matchers" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" -dependencies = [ - "regex-automata", -] - -[[package]] -name = "matches" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" - -[[package]] -name = "maybe-async" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6007f9dad048e0a224f27ca599d669fca8cfa0dac804725aab542b2eb032bce6" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "md-5" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b5a279bb9607f9f53c22d496eade00d138d1bdcccd07d74650387cf94942a15" -dependencies = [ - "block-buffer", - "digest", - "opaque-debug", -] - -[[package]] -name = "md5" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" - -[[package]] -name = "memchr" -version = "2.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" - -[[package]] -name = "memoffset" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" -dependencies = [ - "autocfg", -] - -[[package]] -name = "mime" -version = "0.3.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" - -[[package]] -name = "minimal-lexical" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" - -[[package]] -name = "miniz_oxide" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" -dependencies = [ - "adler", - "autocfg", -] - -[[package]] -name = "mio" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52da4364ffb0e4fe33a9841a98a3f3014fb964045ce4f7a45a398243c8d6b0c9" -dependencies = [ - "libc", - "log", - "miow", - "ntapi", - "wasi 0.11.0+wasi-snapshot-preview1", - "winapi", -] - -[[package]] -name = "miow" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" -dependencies = [ - "winapi", -] - -[[package]] -name = "multimap" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" - -[[package]] -name = "nix" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" -dependencies = [ - "bitflags", - "cc", - "cfg-if", - "libc", - "memoffset", -] - -[[package]] -name = "nom" -version = "7.1.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1d11e1ef389c76fe5b81bcaf2ea32cf88b62bc494e19f493d0b30e7a930109" -dependencies = [ - "memchr", - "minimal-lexical", - "version_check", -] - -[[package]] -name = "ntapi" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" -dependencies = [ - "winapi", -] - -[[package]] -name = "num-bigint" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-integer" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" -dependencies = [ - "autocfg", - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" -dependencies = [ - "autocfg", -] - -[[package]] -name = "num_cpus" -version = "1.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" -dependencies = [ - "hermit-abi", - "libc", -] - -[[package]] -name = "object" -version = "0.27.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67ac1d3f9a1d3616fd9a60c8d74296f22406a238b6a72f5cc1e6f314df4ffbf9" -dependencies = [ - "memchr", -] - -[[package]] -name = "once_cell" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" - -[[package]] -name = "oorandom" -version = "11.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" - -[[package]] -name = "opaque-debug" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" - -[[package]] -name = "ordered-multimap" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c672c7ad9ec066e428c00eb917124a06f08db19e2584de982cc34b1f4c12485" -dependencies = [ - "dlv-list", - "hashbrown 0.9.1", -] - -[[package]] -name = "os_str_bytes" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" -dependencies = [ - "memchr", -] - -[[package]] -name = "pageserver" -version = "0.1.0" -dependencies = [ - "anyhow", - "async-compression", - "async-trait", - "byteorder", - "bytes", - "chrono", - "clap 3.0.14", - "const_format", - "crc32c", - "crossbeam-utils", - "daemonize", - "fail", - "futures", - "hex", - "hex-literal", - "humantime", - "hyper", - "itertools", - "lazy_static", - "log", - "nix", - "once_cell", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres_ffi", - "rand", - "regex", 
- "rust-s3", - "scopeguard", - "serde", - "serde_json", - "serde_with", - "signal-hook", - "tar", - "tempfile", - "thiserror", - "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "tokio-stream", - "toml_edit", - "tracing", - "tracing-futures", - "url", - "workspace_hack", - "zenith_metrics", - "zenith_utils", -] - -[[package]] -name = "parking_lot" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" -dependencies = [ - "instant", - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" -dependencies = [ - "cfg-if", - "instant", - "libc", - "redox_syscall", - "smallvec", - "winapi", -] - -[[package]] -name = "peeking_take_while" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" - -[[package]] -name = "pem" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd56cbd21fea48d0c440b41cd69c589faacade08c992d9a54e471b79d0fd13eb" -dependencies = [ - "base64 0.13.0", - "once_cell", - "regex", -] - -[[package]] -name = "pem" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9a3b09a20e374558580a4914d3b7d89bd61b954a5a5e1dcbea98753addb1947" -dependencies = [ - "base64 0.13.0", -] - -[[package]] -name = "percent-encoding" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" - -[[package]] -name = "petgraph" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" -dependencies = [ - "fixedbitset", - "indexmap", -] - -[[package]] -name = "phf" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_shared" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" -dependencies = [ - "siphasher", -] - -[[package]] -name = "pin-project" -version = "1.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58ad3879ad3baf4e44784bc6a718a8698867bb991f8ce24d1bcbe2cfb4c3a75e" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744b6f092ba29c3650faf274db506afd39944f48420f6c86b17cfe0ee1cb36bb" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e280fbe77cc62c91527259e9442153f4688736748d24660126286329742b4c6c" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = 
"plotters" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" -dependencies = [ - "num-traits", - "plotters-backend", - "plotters-svg", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "plotters-backend" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" - -[[package]] -name = "plotters-svg" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" -dependencies = [ - "plotters-backend", -] - -[[package]] -name = "postgres" -version = "0.19.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" -dependencies = [ - "bytes", - "fallible-iterator", - "futures", - "log", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", -] - -[[package]] -name = "postgres" -version = "0.19.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" -dependencies = [ - "bytes", - "fallible-iterator", - "futures", - "log", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", - "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", -] - -[[package]] -name = "postgres-protocol" -version = "0.6.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" -dependencies = [ - "base64 0.13.0", - "byteorder", - "bytes", - "fallible-iterator", - "hmac 0.10.1", - "lazy_static", - "md-5", - "memchr", - "rand", - "sha2", - "stringprep", -] - -[[package]] -name = "postgres-protocol" -version = "0.6.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" -dependencies = [ - "base64 0.13.0", - "byteorder", - "bytes", - "fallible-iterator", - "hmac 0.10.1", - "lazy_static", - "md-5", - "memchr", - "rand", - "sha2", - "stringprep", -] - -[[package]] -name = "postgres-types" -version = "0.2.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" -dependencies = [ - "bytes", - "fallible-iterator", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", -] - -[[package]] -name = "postgres-types" -version = "0.2.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" -dependencies = [ - "bytes", - "fallible-iterator", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", -] - -[[package]] -name = "postgres_ffi" -version = "0.1.0" -dependencies = [ - "anyhow", - "bindgen", - "byteorder", - "bytes", - "chrono", - "crc32c", - "hex", - "lazy_static", - "log", 
- "memoffset", - "rand", - "regex", - "serde", - "thiserror", - "workspace_hack", - "zenith_utils", -] - -[[package]] -name = "ppv-lite86" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" - -[[package]] -name = "proc-macro-hack" -version = "0.5.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" - -[[package]] -name = "proc-macro2" -version = "1.0.36" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" -dependencies = [ - "unicode-xid", -] - -[[package]] -name = "prometheus" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7f64969ffd5dd8f39bd57a68ac53c163a095ed9d0fb707146da1b27025a3504" -dependencies = [ - "cfg-if", - "fnv", - "lazy_static", - "memchr", - "parking_lot", - "thiserror", -] - -[[package]] -name = "prost" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "444879275cb4fd84958b1a1d5420d15e6fcf7c235fe47f053c9c2a80aceb6001" -dependencies = [ - "bytes", - "prost-derive", -] - -[[package]] -name = "prost-build" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62941722fb675d463659e49c4f3fe1fe792ff24fe5bbaa9c08cd3b98a1c354f5" -dependencies = [ - "bytes", - "heck", - "itertools", - "lazy_static", - "log", - "multimap", - "petgraph", - "prost", - "prost-types", - "regex", - "tempfile", - "which", -] - -[[package]] -name = "prost-derive" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9cc1a3263e07e0bf68e96268f37665207b49560d98739662cdfaae215c720fe" -dependencies = [ - "anyhow", - "itertools", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "prost-types" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534b7a0e836e3c482d2693070f982e39e7611da9695d4d1f5a4b186b51faef0a" -dependencies = [ - "bytes", - "prost", -] - -[[package]] -name = "proxy" -version = "0.1.0" -dependencies = [ - "anyhow", - "bytes", - "clap 3.0.14", - "fail", - "futures", - "hashbrown 0.11.2", - "hex", - "hyper", - "lazy_static", - "md5", - "parking_lot", - "pin-project-lite", - "rand", - "rcgen", - "reqwest", - "rustls 0.19.1", - "scopeguard", - "serde", - "serde_json", - "socket2", - "thiserror", - "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "tokio-postgres-rustls", - "tokio-rustls 0.22.0", - "workspace_hack", - "zenith_metrics", - "zenith_utils", -] - -[[package]] -name = "quote" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "864d3e96a899863136fc6e99f3d7cae289dafe43bf2c5ac19b70df7210c0a145" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "rand" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", - "rand_hc", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", 
- "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" -dependencies = [ - "getrandom", -] - -[[package]] -name = "rand_hc" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" -dependencies = [ - "rand_core", -] - -[[package]] -name = "rayon" -version = "1.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" -dependencies = [ - "autocfg", - "crossbeam-deque", - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" -dependencies = [ - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-utils", - "lazy_static", - "num_cpus", -] - -[[package]] -name = "rcgen" -version = "0.8.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5911d1403f4143c9d56a702069d593e8d0f3fab880a85e103604d0893ea31ba7" -dependencies = [ - "chrono", - "pem 1.0.2", - "ring", - "yasna", -] - -[[package]] -name = "redox_syscall" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" -dependencies = [ - "bitflags", -] - -[[package]] -name = "redox_users" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" -dependencies = [ - "getrandom", - "redox_syscall", -] - -[[package]] -name = "regex" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" -dependencies = [ - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.6.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" - -[[package]] -name = "remove_dir_all" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" -dependencies = [ - "winapi", -] - -[[package]] -name = "reqwest" -version = "0.11.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f242f1488a539a79bac6dbe7c8609ae43b7914b7736210f239a37cccb32525" -dependencies = [ - "base64 0.13.0", - "bytes", - "encoding_rs", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-rustls", - "ipnet", - "js-sys", - "lazy_static", - "log", - "mime", - "percent-encoding", - "pin-project-lite", - "rustls 0.20.2", - "rustls-pemfile", - "serde", - "serde_json", - "serde_urlencoded", - "tokio", - "tokio-rustls 0.23.2", - "tokio-util 0.6.9", - "url", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", - "webpki-roots", - "winreg", -] - -[[package]] -name = "ring" -version = "0.16.20" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" -dependencies = [ - "cc", - "libc", - "once_cell", - "spin", - "untrusted", - "web-sys", - "winapi", -] - -[[package]] -name = "routerify" -version = "3.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" -dependencies = [ - "http", - "hyper", - "lazy_static", - "percent-encoding", - "regex", -] - -[[package]] -name = "rust-ini" -version = "0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63471c4aa97a1cf8332a5f97709a79a4234698de6a1f5087faf66f2dae810e22" -dependencies = [ - "cfg-if", - "ordered-multimap", -] - -[[package]] -name = "rust-s3" -version = "0.28.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dc0e521d1084d6950e050d4e2595f0fbdaa2b96bb795bab3d90a282288c5e49" -dependencies = [ - "anyhow", - "async-trait", - "aws-creds", - "aws-region", - "base64 0.13.0", - "cfg-if", - "chrono", - "hex", - "hmac 0.11.0", - "http", - "log", - "maybe-async", - "md5", - "percent-encoding", - "reqwest", - "serde", - "serde-xml-rs", - "serde_derive", - "sha2", - "tokio", - "tokio-stream", - "url", -] - -[[package]] -name = "rustc-demangle" -version = "0.1.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" - -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - -[[package]] -name = "rustc_version" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" -dependencies = [ - "semver", -] - -[[package]] -name = "rustls" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" -dependencies = [ - "base64 0.13.0", - "log", - "ring", - "sct 0.6.1", - "webpki 0.21.4", -] - -[[package]] -name = "rustls" -version = "0.20.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d37e5e2290f3e040b594b1a9e04377c2c671f1a1cfd9bfdef82106ac1c113f84" -dependencies = [ - "log", - "ring", - "sct 0.7.0", - "webpki 0.22.0", -] - -[[package]] -name = "rustls-pemfile" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eebeaeb360c87bfb72e84abdb3447159c0eaececf1bef2aecd65a8be949d1c9" -dependencies = [ - "base64 0.13.0", -] - -[[package]] -name = "rustls-split" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fb079b52cfdb005752b7c3c646048e702003576a8321058e4c8b38227c11aa6" -dependencies = [ - "rustls 0.19.1", -] - -[[package]] -name = "rustversion" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f" - -[[package]] -name = "ryu" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" - -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "scopeguard" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" - -[[package]] -name = "sct" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b362b83898e0e69f38515b82ee15aa80636befe47c3b6d3d89a911e78fc228ce" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "sct" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "semver" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0486718e92ec9a68fbed73bb5ef687d71103b142595b406835649bebd33f72c7" - -[[package]] -name = "serde" -version = "1.0.136" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce31e24b01e1e524df96f1c2fdd054405f8d7376249a5110886fb4b658484789" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde-xml-rs" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65162e9059be2f6a3421ebbb4fef3e74b7d9e7c60c50a0e292c6239f19f1edfa" -dependencies = [ - "log", - "serde", - "thiserror", - "xml-rs", -] - -[[package]] -name = "serde_cbor" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" -dependencies = [ - "half", - "serde", -] - -[[package]] -name = "serde_derive" -version = "1.0.136" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08597e7152fcd306f41838ed3e37be9eaeed2b61c42e2117266a554fab4662f9" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.78" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d23c1ba4cf0efd44be32017709280b32d1cea5c3f1275c3b6d9e8bc54f758085" -dependencies = [ - "itoa 1.0.1", - "ryu", - "serde", -] - -[[package]] -name = "serde_urlencoded" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" -dependencies = [ - "form_urlencoded", - "itoa 1.0.1", - "ryu", - "serde", -] - -[[package]] -name = "serde_with" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec1e6ec4d8950e5b1e894eac0d360742f3b1407a6078a604a731c4b3f49cefbc" -dependencies = [ - "rustversion", - "serde", - "serde_with_macros", -] - -[[package]] -name = "serde_with_macros" -version = "1.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12e47be9471c72889ebafb5e14d5ff930d89ae7a67bbdb5f8abb564f845a927e" -dependencies = [ - "darling", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "sha2" -version = "0.9.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" -dependencies = [ - "block-buffer", - "cfg-if", - "cpufeatures", - "digest", - "opaque-debug", -] - -[[package]] -name = "sharded-slab" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31" -dependencies = [ - "lazy_static", -] - -[[package]] -name = "shlex" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" - -[[package]] -name = "signal-hook" -version = "0.3.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "647c97df271007dcea485bb74ffdb57f2e683f1306c854f468a0c244badabf2d" -dependencies = [ - "libc", - "signal-hook-registry", -] - -[[package]] -name = "signal-hook-registry" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" -dependencies = [ - "libc", -] - -[[package]] -name = "simple_asn1" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "692ca13de57ce0613a363c8c2f1de925adebc81b04c923ac60c5488bb44abe4b" -dependencies = [ - "chrono", - "num-bigint", - "num-traits", -] - -[[package]] -name = "siphasher" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a86232ab60fa71287d7f2ddae4a7073f6b7aac33631c3015abb556f08c6d0a3e" - -[[package]] -name = "slab" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9def91fd1e018fe007022791f865d0ccc9b3a0d5001e01aabb8b40e46000afb5" - -[[package]] -name = "smallvec" -version = "1.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" - -[[package]] -name = "socket2" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66d72b759436ae32898a2af0a14218dbf55efde3feeb170eb623637db85ee1e0" -dependencies = [ - "libc", - "winapi", -] - -[[package]] -name = "spin" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" - -[[package]] -name = "stringprep" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ee348cb74b87454fff4b551cbf727025810a004f88aeacae7f85b87f4e9a1c1" -dependencies = [ - "unicode-bidi", - "unicode-normalization", -] - -[[package]] -name = "strsim" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" - -[[package]] -name = "strsim" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" - -[[package]] -name = "subtle" -version = "2.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" - -[[package]] -name = "syn" -version = "1.0.86" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b" -dependencies = [ - "proc-macro2", - "quote", - "unicode-xid", -] - -[[package]] -name = "tar" -version = "0.4.38" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6" -dependencies = [ - "filetime", - "libc", - "xattr", -] - -[[package]] -name = "tempfile" -version = "3.3.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" -dependencies = [ - "cfg-if", - "fastrand", - "libc", - "redox_syscall", - "remove_dir_all", - "winapi", -] - -[[package]] -name = "termcolor" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "textwrap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "unicode-width", -] - -[[package]] -name = "textwrap" -version = "0.14.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0066c8d12af8b5acd21e00547c3797fde4e8677254a7ee429176ccebbe93dd80" - -[[package]] -name = "thiserror" -version = "1.0.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "thread_local" -version = "1.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180" -dependencies = [ - "once_cell", -] - -[[package]] -name = "time" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", -] - -[[package]] -name = "tinytemplate" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" -dependencies = [ - "serde", - "serde_json", -] - -[[package]] -name = "tinyvec" -version = "1.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c1c1d5a42b6245520c249549ec267180beaffcc0615401ac8e31853d4b6d8d2" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" - -[[package]] -name = "tokio" -version = "1.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2af73ac49756f3f7c01172e34a23e5d0216f6c32333757c2c61feb2bbff5a5ee" -dependencies = [ - "bytes", - "libc", - "memchr", - "mio", - "num_cpus", - "once_cell", - "pin-project-lite", - "signal-hook-registry", - "socket2", - "tokio-macros", - "winapi", -] - -[[package]] -name = "tokio-io-timeout" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" -dependencies = [ - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-macros" -version = "1.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b557f72f448c511a979e2564e55d74e6c4432fc96ff4f6241bc6bded342643b7" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = 
"tokio-postgres" -version = "0.7.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" -dependencies = [ - "async-trait", - "byteorder", - "bytes", - "fallible-iterator", - "futures", - "log", - "parking_lot", - "percent-encoding", - "phf", - "pin-project-lite", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "socket2", - "tokio", - "tokio-util 0.6.9", -] - -[[package]] -name = "tokio-postgres" -version = "0.7.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" -dependencies = [ - "async-trait", - "byteorder", - "bytes", - "fallible-iterator", - "futures", - "log", - "parking_lot", - "percent-encoding", - "phf", - "pin-project-lite", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", - "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", - "socket2", - "tokio", - "tokio-util 0.6.9", -] - -[[package]] -name = "tokio-postgres-rustls" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bd8c37d8c23cb6ecdc32fc171bade4e9c7f1be65f693a17afbaad02091a0a19" -dependencies = [ - "futures", - "ring", - "rustls 0.19.1", - "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "tokio-rustls 0.22.0", - "webpki 0.21.4", -] - -[[package]] -name = "tokio-rustls" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" -dependencies = [ - "rustls 0.19.1", - "tokio", - "webpki 0.21.4", -] - -[[package]] -name = "tokio-rustls" -version = "0.23.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a27d5f2b839802bd8267fa19b0530f5a08b9c08cd417976be2a65d130fe1c11b" -dependencies = [ - "rustls 0.20.2", - "tokio", - "webpki 0.22.0", -] - -[[package]] -name = "tokio-stream" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50145484efff8818b5ccd256697f36863f587da82cf8b409c53adf1e840798e3" -dependencies = [ - "futures-core", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-util" -version = "0.6.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e99e1983e5d376cd8eb4b66604d2e99e79f5bd988c3055891dcd8c9e2604cc0" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "log", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-util" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64910e1b9c1901aaf5375561e35b9c057d95ff41a44ede043a03e09279eabaf1" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "log", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "toml" -version = "0.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" -dependencies = [ - "serde", -] - -[[package]] -name = "toml_edit" -version = "0.13.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "744e9ed5b352340aa47ce033716991b5589e23781acb97cad37d4ea70560f55b" -dependencies = [ - "combine", - "indexmap", - "itertools", - "kstring", - "serde", -] - -[[package]] -name = "tonic" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff08f4649d10a70ffa3522ca559031285d8e421d727ac85c60825761818f5d0a" -dependencies = [ - "async-stream", - "async-trait", - "base64 0.13.0", - "bytes", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-timeout", - "percent-encoding", - "pin-project", - "prost", - "prost-derive", - "tokio", - "tokio-stream", - "tokio-util 0.6.9", - "tower", - "tower-layer", - "tower-service", - "tracing", - "tracing-futures", -] - -[[package]] -name = "tonic-build" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9403f1bafde247186684b230dc6f38b5cd514584e8bec1dd32514be4745fa757" -dependencies = [ - "proc-macro2", - "prost-build", - "quote", - "syn", -] - -[[package]] -name = "tower" -version = "0.4.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a89fd63ad6adf737582df5db40d286574513c69a11dac5214dc3b5603d6713e" -dependencies = [ - "futures-core", - "futures-util", - "indexmap", - "pin-project", - "pin-project-lite", - "rand", - "slab", - "tokio", - "tokio-util 0.7.0", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "tower-layer" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343bc9466d3fe6b0f960ef45960509f84480bf4fd96f92901afe7ff3df9d3a62" - -[[package]] -name = "tower-service" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" - -[[package]] -name = "tracing" -version = "0.1.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d8d93354fe2a8e50d5953f5ae2e47a3fc2ef03292e7ea46e3cc38f549525fb9" -dependencies = [ - "cfg-if", - "log", - "pin-project-lite", - "tracing-attributes", - "tracing-core", -] - -[[package]] -name = "tracing-attributes" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8276d9a4a3a558d7b7ad5303ad50b53d58264641b82914b7ada36bd762e7a716" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "tracing-core" -version = "0.1.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03cfcb51380632a72d3111cb8d3447a8d908e577d31beeac006f836383d29a23" -dependencies = [ - "lazy_static", - "valuable", -] - -[[package]] -name = "tracing-futures" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2" -dependencies = [ - "pin-project", - "tracing", -] - -[[package]] -name = "tracing-log" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" -dependencies = [ - "lazy_static", - "log", - "tracing-core", -] - -[[package]] -name = "tracing-subscriber" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74786ce43333fcf51efe947aed9718fbe46d5c7328ec3f1029e818083966d9aa" -dependencies = [ - "ansi_term", - "lazy_static", - "matchers", - "regex", - "sharded-slab", - "smallvec", - 
"thread_local", - "tracing", - "tracing-core", - "tracing-log", -] - -[[package]] -name = "try-lock" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" - -[[package]] -name = "typenum" -version = "1.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" - -[[package]] -name = "unicode-bidi" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a01404663e3db436ed2746d9fefef640d868edae3cceb81c3b8d5732fda678f" - -[[package]] -name = "unicode-normalization" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" -dependencies = [ - "tinyvec", -] - -[[package]] -name = "unicode-segmentation" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" - -[[package]] -name = "unicode-width" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" - -[[package]] -name = "unicode-xid" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" - -[[package]] -name = "untrusted" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" - -[[package]] -name = "url" -version = "2.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" -dependencies = [ - "form_urlencoded", - "idna", - "matches", - "percent-encoding", -] - -[[package]] -name = "valuable" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" - -[[package]] -name = "vec_map" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" - -[[package]] -name = "version_check" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" - -[[package]] -name = "walkdir" -version = "2.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" -dependencies = [ - "same-file", - "winapi", - "winapi-util", -] - -[[package]] -name = "walkeeper" -version = "0.1.0" -dependencies = [ - "anyhow", - "byteorder", - "bytes", - "clap 3.0.14", - "const_format", - "crc32c", - "daemonize", - "etcd-client", - "fs2", - "hex", - "humantime", - "hyper", - "lazy_static", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres_ffi", - "regex", - "rust-s3", - "serde", - "serde_json", - "serde_with", - "signal-hook", - "tempfile", - "tokio", - "tokio-postgres 0.7.1 
(git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "tracing", - "url", - "walkdir", - "workspace_hack", - "zenith_metrics", - "zenith_utils", -] - -[[package]] -name = "want" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" -dependencies = [ - "log", - "try-lock", -] - -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - -[[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" - -[[package]] -name = "wasm-bindgen" -version = "0.2.79" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25f1af7423d8588a3d840681122e72e6a24ddbcb3f0ec385cac0d12d24256c06" -dependencies = [ - "cfg-if", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.79" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b21c0df030f5a177f3cba22e9bc4322695ec43e7257d865302900290bcdedca" -dependencies = [ - "bumpalo", - "lazy_static", - "log", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-futures" -version = "0.4.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2eb6ec270a31b1d3c7e266b999739109abce8b6c87e4b31fcfcd788b65267395" -dependencies = [ - "cfg-if", - "js-sys", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.79" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f4203d69e40a52ee523b2529a773d5ffc1dc0071801c87b3d270b471b80ed01" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.79" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa8a30d46208db204854cadbb5d4baf5fcf8071ba5bf48190c3e59937962ebc" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.79" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d958d035c4438e28c70e4321a2911302f10135ce78a9c7834c0cab4123d06a2" - -[[package]] -name = "web-sys" -version = "0.3.56" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c060b319f29dd25724f09a2ba1418f142f539b2be99fbf4d2d5a8f7330afb8eb" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - -[[package]] -name = "webpki" -version = "0.21.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "webpki" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f095d78192e208183081cc07bc5515ef55216397af48b873e5edcd72637fa1bd" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "webpki-roots" -version = "0.22.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "552ceb903e957524388c4d3475725ff2c8b7960922063af6ce53c9a43da07449" -dependencies = [ - "webpki 0.22.0", -] - -[[package]] 
-name = "which" -version = "4.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a5a7e487e921cf220206864a94a89b6c6905bfc19f1057fa26a4cb360e5c1d2" -dependencies = [ - "either", - "lazy_static", - "libc", -] - -[[package]] -name = "wildmatch" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6c48bd20df7e4ced539c12f570f937c6b4884928a87fee70a479d72f031d4e0" - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "winreg" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" -dependencies = [ - "winapi", -] - -[[package]] -name = "workspace_hack" -version = "0.1.0" -dependencies = [ - "anyhow", - "bytes", - "cc", - "clap 2.34.0", - "either", - "hashbrown 0.11.2", - "libc", - "log", - "memchr", - "num-integer", - "num-traits", - "proc-macro2", - "quote", - "regex", - "regex-syntax", - "reqwest", - "scopeguard", - "serde", - "syn", - "tokio", - "tracing", - "tracing-core", -] - -[[package]] -name = "xattr" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "244c3741f4240ef46274860397c7c74e50eb23624996930e484c16679633a54c" -dependencies = [ - "libc", -] - -[[package]] -name = "xml-rs" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3" - -[[package]] -name = "yasna" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e262a29d0e61ccf2b6190d7050d4b237535fc76ce4c1210d9caa316f71dffa75" -dependencies = [ - "chrono", -] - -[[package]] -name = "zenith" -version = "0.1.0" -dependencies = [ - "anyhow", - "clap 3.0.14", - "control_plane", - "pageserver", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres_ffi", - "serde_json", - "walkeeper", - "workspace_hack", - "zenith_utils", -] - -[[package]] -name = "zenith_metrics" -version = "0.1.0" -dependencies = [ - "lazy_static", - "libc", - "once_cell", - "prometheus", - "workspace_hack", -] - -[[package]] -name = "zenith_utils" -version = "0.1.0" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "bytes", - "criterion", - "git-version", - "hex", - "hex-literal", - "hyper", - "jsonwebtoken", - "lazy_static", - "nix", - "pin-project-lite", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-protocol 0.6.1 
(git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "rand", - "routerify", - "rustls 0.19.1", - "rustls-split", - "serde", - "serde_json", - "serde_with", - "signal-hook", - "tempfile", - "thiserror", - "tokio", - "tracing", - "tracing-subscriber", - "webpki 0.21.4", - "workspace_hack", - "zenith_metrics", -] - -[[package]] -name = "zstd" -version = "0.10.0+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b1365becbe415f3f0fcd024e2f7b45bacfb5bdd055f0dc113571394114e7bdd" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "4.1.4+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f7cd17c9af1a4d6c24beb1cc54b17e2ef7b593dc92f19e9d9acad8b182bbaee" -dependencies = [ - "libc", - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "1.6.3+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc49afa5c8d634e75761feda8c592051e7eeb4683ba827211eb0d731d3402ea8" -dependencies = [ - "cc", - "libc", -] diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 4d79811bfb..dccdca291c 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -18,6 +18,7 @@ log = "0.4.14" clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } +tokio-util = { version = "0.7", features = ["io"] } postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } @@ -34,7 +35,6 @@ serde_with = "1.12.0" toml_edit = { version = "0.13", features = ["easy"] } scopeguard = "1.1.0" -async-trait = "0.1" const_format = "0.2.21" tracing = "0.1.27" tracing-futures = "0.2" @@ -45,7 +45,9 @@ once_cell = "1.8.0" crossbeam-utils = "0.8.5" fail = "0.5.0" -rust-s3 = { version = "0.28", default-features = false, features = ["no-verify-ssl", "tokio-rustls-tls"] } +rusoto_core = "0.47" +rusoto_s3 = "0.47" +async-trait = "0.1" async-compression = {version = "0.3", features = ["zstd", "tokio"]} postgres_ffi = { path = "../postgres_ffi" } diff --git a/pageserver/src/remote_storage.rs b/pageserver/src/remote_storage.rs index bdd6086b94..02d37af5de 100644 --- a/pageserver/src/remote_storage.rs +++ b/pageserver/src/remote_storage.rs @@ -5,7 +5,7 @@ //! There are a few components the storage machinery consists of: //! * [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations: //! * [`local_fs`] allows to use local file system as an external storage -//! * [`rust_s3`] uses AWS S3 bucket as an external storage +//! * [`s3_bucket`] uses AWS S3 bucket as an external storage //! //! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync. //! Synchronization internals are split into submodules @@ -82,7 +82,7 @@ //! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time. 
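
The module comment above only names the pieces; as a quick orientation, the sketch below shows roughly what the `RemoteStorage` trait looks like, inferred from the `impl RemoteStorage for S3Bucket` block further down in this patch. The generic parameters, bounds, and doc comments here are reconstructed assumptions, not the authoritative definition in `remote_storage.rs`.

    use std::path::Path;
    use tokio::io;

    /// Sketch only: signatures inferred from the S3Bucket impl in this patch;
    /// the real trait may differ in bounds and documentation.
    #[async_trait::async_trait]
    trait RemoteStorage: Send + Sync {
        /// Backend-specific address of an object (an S3 key, a local path, ...).
        type StoragePath;

        /// Maps a file under the pageserver workdir to its remote counterpart.
        fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath>;

        /// Lists all objects currently stored under the configured prefix.
        async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>>;

        /// Streams `from` into a new (or overwritten) remote object at `to`.
        async fn upload(
            &self,
            from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
            to: &Self::StoragePath,
        ) -> anyhow::Result<()>;

        /// Streams the whole remote object into `to`.
        async fn download(
            &self,
            from: &Self::StoragePath,
            to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
        ) -> anyhow::Result<()>;

        /// Like `download`, but limited to the byte range
        /// [start_inclusive, end_exclusive).
        async fn download_range(
            &self,
            from: &Self::StoragePath,
            start_inclusive: u64,
            end_exclusive: Option<u64>,
            to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
        ) -> anyhow::Result<()>;

        /// Permanently removes the remote object.
        async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>;
    }

Keeping the sync machinery behind this trait is what lets the rest of the patch swap the `rust_s3` implementation for a `rusoto`-based `S3Bucket` without touching the callers: only the module wiring in `remote_storage.rs` and the backend file itself change.
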
mod local_fs; -mod rust_s3; +mod s3_bucket; mod storage_sync; use std::{ @@ -98,7 +98,7 @@ use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; pub use self::storage_sync::index::{RemoteIndex, TimelineIndexEntry}; pub use self::storage_sync::{schedule_timeline_checkpoint_upload, schedule_timeline_download}; -use self::{local_fs::LocalFs, rust_s3::S3}; +use self::{local_fs::LocalFs, s3_bucket::S3Bucket}; use crate::layered_repository::ephemeral_file::is_ephemeral_file; use crate::{ config::{PageServerConf, RemoteStorageKind}, @@ -151,7 +151,7 @@ pub fn start_local_timeline_sync( storage_sync::spawn_storage_sync_thread( config, local_timeline_files, - S3::new(s3_config, &config.workdir)?, + S3Bucket::new(s3_config, &config.workdir)?, storage_config.max_concurrent_sync, storage_config.max_sync_errors, ) diff --git a/pageserver/src/remote_storage/README.md b/pageserver/src/remote_storage/README.md index 339ddce866..43a47e09d8 100644 --- a/pageserver/src/remote_storage/README.md +++ b/pageserver/src/remote_storage/README.md @@ -46,18 +46,6 @@ This could be avoided by a background thread/future storing the serialized index No file checksum assertion is done currently, but should be (AWS S3 returns file checksums during the `list` operation) -* sad rust-s3 api - -rust-s3 is not very pleasant to use: -1. it returns `anyhow::Result` and it's hard to distinguish "missing file" cases from "no connection" one, for instance -2. at least one function it its API that we need (`get_object_stream`) has `async` keyword and blocks (!), see details [here](https://github.com/zenithdb/zenith/pull/752#discussion_r728373091) -3. it's a prerelease library with unclear maintenance status -4. noisy on debug level - -But it's already used in the project, so for now it's reused to avoid bloating the dependency tree. -Based on previous evaluation, even `rusoto-s3` could be a better choice over this library, but needs further benchmarking. - - * gc is ignored So far, we don't adjust the remote storage based on GC thread loop results, only checkpointer loop affects the remote storage. diff --git a/pageserver/src/remote_storage/rust_s3.rs b/pageserver/src/remote_storage/s3_bucket.rs similarity index 68% rename from pageserver/src/remote_storage/rust_s3.rs rename to pageserver/src/remote_storage/s3_bucket.rs index 527bdf48ff..92b3b0cce8 100644 --- a/pageserver/src/remote_storage/rust_s3.rs +++ b/pageserver/src/remote_storage/s3_bucket.rs @@ -1,4 +1,4 @@ -//! AWS S3 storage wrapper around `rust_s3` library. +//! AWS S3 storage wrapper around `rusoto` library. //! //! Respects `prefix_in_bucket` property from [`S3Config`], //! allowing multiple pageservers to independently work with the same S3 bucket, if @@ -7,9 +7,17 @@ use std::path::{Path, PathBuf}; use anyhow::Context; -use s3::{bucket::Bucket, creds::Credentials, region::Region}; -use tokio::io::{self, AsyncWriteExt}; -use tracing::debug; +use rusoto_core::{ + credential::{InstanceMetadataProvider, StaticProvider}, + HttpClient, Region, +}; +use rusoto_s3::{ + DeleteObjectRequest, GetObjectRequest, ListObjectsV2Request, PutObjectRequest, S3Client, + StreamingBody, S3, +}; +use tokio::io; +use tokio_util::io::ReaderStream; +use tracing::{debug, trace}; use crate::{ config::S3Config, @@ -50,38 +58,50 @@ impl S3ObjectKey { } /// AWS S3 storage. 
-pub struct S3 { +pub struct S3Bucket { pageserver_workdir: &'static Path, - bucket: Bucket, + client: S3Client, + bucket_name: String, prefix_in_bucket: Option, } -impl S3 { - /// Creates the storage, errors if incorrect AWS S3 configuration provided. +impl S3Bucket { + /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result { + // TODO kb check this + // Keeping a single client may cause issues due to timeouts. + // https://github.com/rusoto/rusoto/issues/1686 + debug!( - "Creating s3 remote storage around bucket {}", + "Creating s3 remote storage for S3 bucket {}", aws_config.bucket_name ); let region = match aws_config.endpoint.clone() { - Some(endpoint) => Region::Custom { - endpoint, - region: aws_config.bucket_region.clone(), + Some(custom_endpoint) => Region::Custom { + name: aws_config.bucket_region.clone(), + endpoint: custom_endpoint, }, None => aws_config .bucket_region .parse::() .context("Failed to parse the s3 region from config")?, }; - - let credentials = Credentials::new( - aws_config.access_key_id.as_deref(), - aws_config.secret_access_key.as_deref(), - None, - None, - None, - ) - .context("Failed to create the s3 credentials")?; + let request_dispatcher = HttpClient::new().context("Failed to create S3 http client")?; + let client = if aws_config.access_key_id.is_none() && aws_config.secret_access_key.is_none() + { + trace!("Using IAM-based AWS access"); + S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region) + } else { + trace!("Using credentials-based AWS access"); + S3Client::new_with( + request_dispatcher, + StaticProvider::new_minimal( + aws_config.access_key_id.clone().unwrap_or_default(), + aws_config.secret_access_key.clone().unwrap_or_default(), + ), + region, + ) + }; let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { let mut prefix = prefix; @@ -97,20 +117,16 @@ impl S3 { }); Ok(Self { - bucket: Bucket::new_with_path_style( - aws_config.bucket_name.as_str(), - region, - credentials, - ) - .context("Failed to create the s3 bucket")?, + client, pageserver_workdir, + bucket_name: aws_config.bucket_name.clone(), prefix_in_bucket, }) } } #[async_trait::async_trait] -impl RemoteStorage for S3 { +impl RemoteStorage for S3Bucket { type StoragePath = S3ObjectKey; fn storage_path(&self, local_path: &Path) -> anyhow::Result { @@ -129,48 +145,50 @@ impl RemoteStorage for S3 { } async fn list(&self) -> anyhow::Result> { - let list_response = self - .bucket - .list(self.prefix_in_bucket.clone().unwrap_or_default(), None) - .await - .context("Failed to list s3 objects")?; + let mut document_keys = Vec::new(); - Ok(list_response - .into_iter() - .flat_map(|response| response.contents) - .map(|s3_object| S3ObjectKey(s3_object.key)) - .collect()) + let mut continuation_token = None; + loop { + let fetch_response = self + .client + .list_objects_v2(ListObjectsV2Request { + bucket: self.bucket_name.clone(), + prefix: self.prefix_in_bucket.clone(), + continuation_token, + ..ListObjectsV2Request::default() + }) + .await?; + document_keys.extend( + fetch_response + .contents + .unwrap_or_default() + .into_iter() + .filter_map(|o| Some(S3ObjectKey(o.key?))), + ); + + match fetch_response.continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok(document_keys) } async fn upload( &self, - mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + from: impl 
io::AsyncRead + Unpin + Send + Sync + 'static, to: &Self::StoragePath, ) -> anyhow::Result<()> { - let mut upload_contents = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - io::copy(&mut from, &mut upload_contents) - .await - .context("Failed to read the upload contents")?; - upload_contents - .flush() - .await - .context("Failed to read the upload contents")?; - let upload_contents = upload_contents.into_inner().into_inner(); - - let (_, code) = self - .bucket - .put_object(to.key(), &upload_contents) - .await - .with_context(|| format!("Failed to create s3 object with key {}", to.key()))?; - if code != 200 { - Err(anyhow::format_err!( - "Received non-200 exit code during creating object with key '{}', code: {}", - to.key(), - code - )) - } else { - Ok(()) - } + self.client + .put_object(PutObjectRequest { + body: Some(StreamingBody::new(ReaderStream::new(from))), + bucket: self.bucket_name.clone(), + key: to.key().to_owned(), + ..PutObjectRequest::default() + }) + .await?; + Ok(()) } async fn download( @@ -178,25 +196,21 @@ impl RemoteStorage for S3 { from: &Self::StoragePath, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), ) -> anyhow::Result<()> { - let (data, code) = self - .bucket - .get_object(from.key()) - .await - .with_context(|| format!("Failed to download s3 object with key {}", from.key()))?; - if code != 200 { - Err(anyhow::format_err!( - "Received non-200 exit code during downloading object, code: {}", - code - )) - } else { - // we don't have to write vector into the destination this way, `to_write_all` would be enough. - // but we want to prepare for migration on `rusoto`, that has a streaming HTTP body instead here, with - // which it makes more sense to use `io::copy`. - io::copy(&mut data.as_slice(), to) - .await - .context("Failed to write downloaded data into the destination buffer")?; - Ok(()) + let object_output = self + .client + .get_object(GetObjectRequest { + bucket: self.bucket_name.clone(), + key: from.key().to_owned(), + ..GetObjectRequest::default() + }) + .await?; + + if let Some(body) = object_output.body { + let mut from = io::BufReader::new(body.into_async_read()); + io::copy(&mut from, to).await?; } + + Ok(()) } async fn download_range( @@ -209,40 +223,37 @@ impl RemoteStorage for S3 { // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 // and needs both ends to be exclusive let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1)); - let (data, code) = self - .bucket - .get_object_range(from.key(), start_inclusive, end_inclusive) - .await - .with_context(|| format!("Failed to download s3 object with key {}", from.key()))?; - if code != 206 { - Err(anyhow::format_err!( - "Received non-206 exit code during downloading object range, code: {}", - code - )) - } else { - // see `download` function above for the comment on why `Vec` buffer is copied this way - io::copy(&mut data.as_slice(), to) - .await - .context("Failed to write downloaded range into the destination buffer")?; - Ok(()) + let range = Some(match end_inclusive { + Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive), + None => format!("bytes={}-", start_inclusive), + }); + let object_output = self + .client + .get_object(GetObjectRequest { + bucket: self.bucket_name.clone(), + key: from.key().to_owned(), + range, + ..GetObjectRequest::default() + }) + .await?; + + if let Some(body) = object_output.body { + let mut from = io::BufReader::new(body.into_async_read()); + io::copy(&mut from, to).await?; } + + Ok(()) 
} async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> { - let (_, code) = self - .bucket - .delete_object(path.key()) - .await - .with_context(|| format!("Failed to delete s3 object with key {}", path.key()))?; - if code != 204 { - Err(anyhow::format_err!( - "Received non-204 exit code during deleting object with key '{}', code: {}", - path.key(), - code - )) - } else { - Ok(()) - } + self.client + .delete_object(DeleteObjectRequest { + bucket: self.bucket_name.clone(), + key: path.key().to_owned(), + ..DeleteObjectRequest::default() + }) + .await?; + Ok(()) } } @@ -314,7 +325,7 @@ mod tests { #[test] fn storage_path_negatives() -> anyhow::Result<()> { #[track_caller] - fn storage_path_error(storage: &S3, mismatching_path: &Path) -> String { + fn storage_path_error(storage: &S3Bucket, mismatching_path: &Path) -> String { match storage.storage_path(mismatching_path) { Ok(wrong_key) => panic!( "Expected path '{}' to error, but got S3 key: {:?}", @@ -412,15 +423,11 @@ mod tests { Ok(()) } - fn dummy_storage(pageserver_workdir: &'static Path) -> S3 { - S3 { + fn dummy_storage(pageserver_workdir: &'static Path) -> S3Bucket { + S3Bucket { pageserver_workdir, - bucket: Bucket::new( - "dummy-bucket", - "us-east-1".parse().unwrap(), - Credentials::anonymous().unwrap(), - ) - .unwrap(), + client: S3Client::new("us-east-1".parse().unwrap()), + bucket_name: "dummy-bucket".to_string(), prefix_in_bucket: Some("dummy_prefix/".to_string()), } } From 0e9ee772af7406e943565a1985ef5c9117ad470c Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 28 Mar 2022 15:18:01 +0300 Subject: [PATCH 0119/1022] Use rusoto in safekeeper --- Cargo.lock | 3503 +++++++++++++++++++++++++++++++++++ walkeeper/Cargo.toml | 6 +- walkeeper/src/s3_offload.rs | 102 +- 3 files changed, 3573 insertions(+), 38 deletions(-) create mode 100644 Cargo.lock diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000000..1a9e261281 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,3503 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "addr2line" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9ecd88a8c8378ca913a680cd98f0f13ac67383d35993f86c90a70e3f137816b" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "ahash" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +dependencies = [ + "getrandom", + "once_cell", + "version_check", +] + +[[package]] +name = "aho-corasick" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] + +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + +[[package]] +name = "anyhow" +version = "1.0.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94a45b455c14666b85fc40a019e8ab9eb75e3a124e05494f5397122bc9eb06e0" +dependencies = [ + "backtrace", +] + +[[package]] +name = "async-compression" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2bf394cfbbe876f0ac67b13b6ca819f9c9f2fb9ec67223cceb1555fbab1c31a" +dependencies = [ + "futures-core", + "memchr", + "pin-project-lite", + "tokio", + "zstd", + "zstd-safe", +] + +[[package]] +name = "async-stream" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dad5c83079eae9969be7fadefe640a1c566901f05ff91ab221de4b6f68d9507e" +dependencies = [ + "async-stream-impl", + "futures-core", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f203db73a71dfa2fb6dd22763990fa26f3d2625a6da2da900d23b87d26be27" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "async-trait" +version = "0.1.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "061a7acccaa286c011ddc30970520b98fa40e00c9d644633fb26b5fc63a265e3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "backtrace" +version = "0.3.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e121dee8023ce33ab248d9ce1493df03c3b38a659b240096fcbd7048ff9c31f" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff" + +[[package]] +name = "base64" +version = "0.13.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" + +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + +[[package]] +name = "bindgen" +version = "0.59.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "clap 2.34.0", + "env_logger", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "which", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "block-buffer" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" +dependencies = [ + "generic-array", +] + +[[package]] +name = "boxfnonce" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426" + +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "bumpalo" +version = "3.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a45a46ab1f2412e53d3a0ade76ffad2025804294569aae387231a0cd6e0899" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "bytes" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" +dependencies = [ + "serde", +] + +[[package]] +name = "cast" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "cc" +version = "1.0.72" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee" +dependencies = [ + "jobserver", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +dependencies = [ + "libc", + "num-integer", + "num-traits", + "serde", + "time", + "winapi", +] + +[[package]] +name = "clang-sys" 
+version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cc00842eed744b858222c4c9faf7243aafc6d33f92f96935263ef4d8a41ce21" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "clap" +version = "2.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" +dependencies = [ + "ansi_term", + "atty", + "bitflags", + "strsim 0.8.0", + "textwrap 0.11.0", + "unicode-width", + "vec_map", +] + +[[package]] +name = "clap" +version = "3.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b63edc3f163b3c71ec8aa23f9bd6070f77edbf3d1d198b164afa90ff00e4ec62" +dependencies = [ + "atty", + "bitflags", + "indexmap", + "os_str_bytes", + "strsim 0.10.0", + "termcolor", + "textwrap 0.14.2", +] + +[[package]] +name = "combine" +version = "4.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50b727aacc797f9fc28e355d21f34709ac4fc9adecfe470ad07b8f4464f53062" +dependencies = [ + "bytes", + "memchr", +] + +[[package]] +name = "compute_tools" +version = "0.1.0" +dependencies = [ + "anyhow", + "chrono", + "clap 3.0.14", + "env_logger", + "hyper", + "libc", + "log", + "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", + "regex", + "serde", + "serde_json", + "tar", + "tokio", + "workspace_hack", +] + +[[package]] +name = "const_format" +version = "0.2.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22bc6cd49b0ec407b680c3e380182b6ac63b73991cb7602de350352fc309b614" +dependencies = [ + "const_format_proc_macros", +] + +[[package]] +name = "const_format_proc_macros" +version = "0.2.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef196d5d972878a48da7decb7686eded338b4858fbabeed513d63a7c98b2b82d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "control_plane" +version = "0.1.0" +dependencies = [ + "anyhow", + "lazy_static", + "nix", + "pageserver", + "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "regex", + "reqwest", + "serde", + "serde_with", + "tar", + "thiserror", + "toml", + "url", + "walkeeper", + "workspace_hack", + "zenith_utils", +] + +[[package]] +name = "core-foundation" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" + +[[package]] +name = "cpufeatures" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32c" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee6b9c9389584bcba988bd0836086789b7f87ad91892d6a83d5291dbb24524b5" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" 
+dependencies = [ + "cfg-if", +] + +[[package]] +name = "criterion" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" +dependencies = [ + "atty", + "cast", + "clap 2.34.0", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aaa7bd5fb665c6864b5f963dd9097905c54125909c7aa94c9e18507cdbe6c53" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1145cf131a2c6ba0615079ab6a638f7e1973ac9c2634fcbeaaad6114246efe8c" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "lazy_static", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e5bed1f1c269533fa816a0a5492b3545209a205ca1a54842be180eb63a16a6" +dependencies = [ + "cfg-if", + "lazy_static", +] + +[[package]] +name = "crypto-mac" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bff07008ec701e8028e2ceb8f83f0e4274ee62bd2dbdc4fefff2e9a91824081a" +dependencies = [ + "generic-array", + "subtle", +] + +[[package]] +name = "crypto-mac" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1d1a86f49236c215f271d40892d5fc950490551400b02ef360692c29815c714" +dependencies = [ + "generic-array", + "subtle", +] + +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa 0.4.8", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "daemonize" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70c24513e34f53b640819f0ac9f705b673fcf4006d7aab8778bee72ebfc89815" +dependencies = [ + "boxfnonce", + "libc", +] + +[[package]] +name = "darling" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0d720b8683f8dd83c65155f0530560cba68cd2bf395f6513a483caee57ff7f4" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7a340f241d2ceed1deb47ae36c4144b2707ec7dd0b649f894cb39bb595986324" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.10.0", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72c41b3b7352feb3211a0d743dc5700a4e3b60f51bd2b368892d1e0f9a95f44b" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "digest" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" +dependencies = [ + "generic-array", +] + +[[package]] +name = "dirs-next" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" +dependencies = [ + "cfg-if", + "dirs-sys-next", +] + +[[package]] +name = "dirs-sys-next" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + +[[package]] +name = "encoding_rs" +version = "0.8.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dc8abb250ffdda33912550faa54c88ec8b998dec0b2c55ab224921ce11df" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "env_logger" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b2cf0344971ee6c64c31be0d530793fba457d322dfec2810c453d0ef228f9c3" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "etcd-client" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "585de5039d1ecce74773db49ba4e8107e42be7c2cd0b1a9e7fce27181db7b118" +dependencies = [ + "http", + "prost", + "tokio", + "tokio-stream", + "tonic", + "tonic-build", + "tower-service", +] + +[[package]] +name = "fail" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3245a0ca564e7f3c797d20d833a6870f57a728ac967d5225b3ffdef4465011" +dependencies = [ + "lazy_static", + "log", + "rand", +] + +[[package]] +name = "fallible-iterator" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" + +[[package]] +name = "fastrand" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3fcf0cee53519c866c09b5de1f6c56ff9d647101f81c1964fa632e148896cdf" +dependencies = [ + "instant", +] + +[[package]] +name = "filetime" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "975ccf83d8d9d0d84682850a38c8169027be83368805971cc4f238c2b245bc98" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "winapi", +] + +[[package]] +name = "fixedbitset" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "279fb028e20b3c4c320317955b77c5e0c9701f05a1d309905d6fc702cdc5053e" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" +dependencies = [ + "matches", + "percent-encoding", +] + +[[package]] +name = "fs2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "futures" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f73fe65f54d1e12b726f517d3e2135ca3125a437b6d998caf1962961f7172d9e" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3083ce4b914124575708913bca19bfe887522d6e2e6d0952943f5eac4a74010" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" + +[[package]] +name = "futures-executor" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9420b90cfa29e327d0429f19be13e7ddb68fa1cccb09d65e5706b8c7a749b8a6" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b" + +[[package]] +name = "futures-macro" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21163e139fa306126e6eedaf49ecdb4588f939600f0b1e770f4205ee4b7fa868" + +[[package]] +name = "futures-task" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c66a976bf5909d801bbef33416c41372779507e7a6b3a5e25e4749c58f776a" + +[[package]] +name = "futures-util" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "generic-array" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"fd48d33ec7f05fbfa152300fdad764757cbded343c1aa1cff2fbaf4134851803" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418d37c8b1d42553c93648be529cb70f920d3baf8ef469b74b9638df426e0b4c" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.10.0+wasi-snapshot-preview1", +] + +[[package]] +name = "gimli" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78cc372d058dcf6d5ecd98510e7fbc9e5aec4d21de70f65fea8fecebcd881bd4" + +[[package]] +name = "git-version" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6b0decc02f4636b9ccad390dcbe77b722a77efedfa393caf8379a51d5c61899" +dependencies = [ + "git-version-macro", + "proc-macro-hack", +] + +[[package]] +name = "git-version-macro" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe69f1cbdb6e28af2bac214e943b99ce8a0a06b447d15d3e61161b0423139f3f" +dependencies = [ + "proc-macro-hack", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + +[[package]] +name = "h2" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9f1f717ddc7b2ba36df7e871fd88db79326551d3d6f1fc406fbfd28b582ff8e" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util 0.6.9", + "tracing", +] + +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + +[[package]] +name = "hashbrown" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" +dependencies = [ + "ahash", +] + +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +dependencies = [ + "serde", +] + +[[package]] +name = "hex-literal" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ebdb29d2ea9ed0083cd8cece49bbd968021bd99b0849edb4a9a7ee0fdf6a4e0" + +[[package]] +name = "hmac" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1441c6b1e930e2817404b5046f1f989899143a12bf92de603b69f4e0aee1e15" +dependencies = [ + "crypto-mac 0.10.1", + "digest", +] + +[[package]] +name = "hmac" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a2a2320eb7ec0ebe8da8f744d7812d9fc4cb4d09344ac01898dbcb6a20ae69b" +dependencies = [ + "crypto-mac 0.11.1", + "digest", +] + +[[package]] 
+name = "http" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31f4c6746584866f0feabcc69893c5b51beef3831656a968ed7ae254cdc4fd03" +dependencies = [ + "bytes", + "fnv", + "itoa 1.0.1", +] + +[[package]] +name = "http-body" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ff4f84919677303da5f147645dbea6b1881f368d03ac84e1dc09031ebd7b2c6" +dependencies = [ + "bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9100414882e15fb7feccb4897e5f0ff0ff1ca7d1a86a23208ada4d7a18e6c6c4" + +[[package]] +name = "httpdate" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "hyper" +version = "0.14.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "043f0e083e9901b6cc658a77d1eb86f4fc650bbb977a4337dd63192826aa85dd" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa 1.0.1", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87c48c02e0dc5e3b849a2041db3029fd066650f8f717c07bf8ed78ccb895cac" +dependencies = [ + "http", + "hyper", + "rustls 0.20.2", + "tokio", + "tokio-rustls 0.23.2", +] + +[[package]] +name = "hyper-timeout" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" +dependencies = [ + "hyper", + "pin-project-lite", + "tokio", + "tokio-io-timeout", +] + +[[package]] +name = "hyper-tls" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +dependencies = [ + "bytes", + "hyper", + "native-tls", + "tokio", + "tokio-native-tls", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" +dependencies = [ + "matches", + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282a6247722caba404c065016bbfa522806e51714c34f5dfc3e4a3a46fcb4223" +dependencies = [ + "autocfg", + "hashbrown", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "ipnet" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9" + +[[package]] +name = "itertools" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + +[[package]] +name = "itoa" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" + +[[package]] +name = "jobserver" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a38fc24e30fd564ce974c02bf1d337caddff65be6cc4735a1f7eab22a7440f04" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "jsonwebtoken" +version = "7.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afabcc15e437a6484fc4f12d0fd63068fe457bf93f1c148d3d9649c60b103f32" +dependencies = [ + "base64 0.12.3", + "pem 0.8.3", + "ring", + "serde", + "serde_json", + "simple_asn1", +] + +[[package]] +name = "kstring" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b310ccceade8121d7d77fee406160e457c2f4e7c7982d589da3499bc7ea4526" +dependencies = [ + "serde", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "libc" +version = "0.2.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c" + +[[package]] +name = "libloading" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efbc0f03f9a775e9f6aed295c6a1ba2253c5757a9e03d55c6caa46a681abcddd" +dependencies = [ + "cfg-if", + "winapi", +] + +[[package]] +name = "lock_api" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88943dd7ef4a2e5a4bfa2753aaab3013e34ce2533d1996fb18ef591e315e2b3b" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +dependencies = [ + "cfg-if", + "serde", +] + +[[package]] +name = "matchers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "matches" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" + +[[package]] +name = "md-5" +version = "0.9.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5a279bb9607f9f53c22d496eade00d138d1bdcccd07d74650387cf94942a15" +dependencies = [ + "block-buffer", + "digest", + "opaque-debug", +] + +[[package]] +name = "md5" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" + +[[package]] +name = "memchr" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" + +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +dependencies = [ + "autocfg", +] + +[[package]] +name = "mime" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" +dependencies = [ + "adler", + "autocfg", +] + +[[package]] +name = "mio" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52da4364ffb0e4fe33a9841a98a3f3014fb964045ce4f7a45a398243c8d6b0c9" +dependencies = [ + "libc", + "log", + "miow", + "ntapi", + "wasi 0.11.0+wasi-snapshot-preview1", + "winapi", +] + +[[package]] +name = "miow" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" +dependencies = [ + "winapi", +] + +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" + +[[package]] +name = "native-tls" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48ba9f7719b5a0f42f338907614285fb5fd70e53858141f69898a1fb7203b24d" +dependencies = [ + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "nix" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" +dependencies = [ + "bitflags", + "cc", + "cfg-if", + "libc", + "memoffset", +] + +[[package]] +name = "nom" +version = "7.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b1d11e1ef389c76fe5b81bcaf2ea32cf88b62bc494e19f493d0b30e7a930109" +dependencies = [ + "memchr", + "minimal-lexical", + "version_check", +] + +[[package]] +name = "ntapi" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28774a7fd2fbb4f0babd8237ce554b73af68021b5f695a3cebd6c59bac0980f" +dependencies = [ + "winapi", +] + +[[package]] +name = "num-bigint" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "object" +version = "0.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67ac1d3f9a1d3616fd9a60c8d74296f22406a238b6a72f5cc1e6f314df4ffbf9" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + +[[package]] +name = "opaque-debug" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" + +[[package]] +name = "openssl" +version = "0.10.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c7ae222234c30df141154f159066c5093ff73b63204dcda7121eb082fc56a95" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-sys", +] + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-sys" +version = "0.9.72" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e46109c383602735fa0a2e48dd2b7c892b048e1bf69e5c3b1d804b7d9c203cb" +dependencies = [ + "autocfg", + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "os_str_bytes" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" +dependencies = [ + "memchr", +] + +[[package]] +name = "pageserver" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-compression", + "async-trait", + "byteorder", + "bytes", + "chrono", + "clap 3.0.14", + "const_format", + "crc32c", + "crossbeam-utils", + "daemonize", + "fail", + "futures", + "hex", + "hex-literal", + "humantime", + "hyper", + "itertools", + "lazy_static", + "log", + "nix", + "once_cell", + "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "postgres_ffi", + "rand", + "regex", + "rusoto_core", + "rusoto_s3", + 
"scopeguard", + "serde", + "serde_json", + "serde_with", + "signal-hook", + "tar", + "tempfile", + "thiserror", + "tokio", + "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "tokio-stream", + "tokio-util 0.7.0", + "toml_edit", + "tracing", + "tracing-futures", + "url", + "workspace_hack", + "zenith_metrics", + "zenith_utils", +] + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +dependencies = [ + "cfg-if", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi", +] + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + +[[package]] +name = "pem" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd56cbd21fea48d0c440b41cd69c589faacade08c992d9a54e471b79d0fd13eb" +dependencies = [ + "base64 0.13.0", + "once_cell", + "regex", +] + +[[package]] +name = "pem" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9a3b09a20e374558580a4914d3b7d89bd61b954a5a5e1dcbea98753addb1947" +dependencies = [ + "base64 0.13.0", +] + +[[package]] +name = "percent-encoding" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" + +[[package]] +name = "petgraph" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" +dependencies = [ + "fixedbitset", + "indexmap", +] + +[[package]] +name = "phf" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_shared" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58ad3879ad3baf4e44784bc6a718a8698867bb991f8ce24d1bcbe2cfb4c3a75e" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "744b6f092ba29c3650faf274db506afd39944f48420f6c86b17cfe0ee1cb36bb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e280fbe77cc62c91527259e9442153f4688736748d24660126286329742b4c6c" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = 
"pkg-config" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58893f751c9b0412871a09abd62ecd2a00298c6c83befa223ef98c52aef40cbe" + +[[package]] +name = "plotters" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" + +[[package]] +name = "plotters-svg" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "postgres" +version = "0.19.1" +source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" +dependencies = [ + "bytes", + "fallible-iterator", + "futures", + "log", + "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "tokio", + "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", +] + +[[package]] +name = "postgres" +version = "0.19.1" +source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" +dependencies = [ + "bytes", + "fallible-iterator", + "futures", + "log", + "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", + "tokio", + "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", +] + +[[package]] +name = "postgres-protocol" +version = "0.6.1" +source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" +dependencies = [ + "base64 0.13.0", + "byteorder", + "bytes", + "fallible-iterator", + "hmac 0.10.1", + "lazy_static", + "md-5", + "memchr", + "rand", + "sha2", + "stringprep", +] + +[[package]] +name = "postgres-protocol" +version = "0.6.1" +source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" +dependencies = [ + "base64 0.13.0", + "byteorder", + "bytes", + "fallible-iterator", + "hmac 0.10.1", + "lazy_static", + "md-5", + "memchr", + "rand", + "sha2", + "stringprep", +] + +[[package]] +name = "postgres-types" +version = "0.2.1" +source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" +dependencies = [ + "bytes", + "fallible-iterator", + "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", +] + +[[package]] +name = "postgres-types" +version = "0.2.1" +source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" +dependencies = [ + "bytes", + "fallible-iterator", + "postgres-protocol 0.6.1 
(git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", +] + +[[package]] +name = "postgres_ffi" +version = "0.1.0" +dependencies = [ + "anyhow", + "bindgen", + "byteorder", + "bytes", + "chrono", + "crc32c", + "hex", + "lazy_static", + "log", + "memoffset", + "rand", + "regex", + "serde", + "thiserror", + "workspace_hack", + "zenith_utils", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" + +[[package]] +name = "proc-macro-hack" +version = "0.5.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" + +[[package]] +name = "proc-macro2" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "prometheus" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7f64969ffd5dd8f39bd57a68ac53c163a095ed9d0fb707146da1b27025a3504" +dependencies = [ + "cfg-if", + "fnv", + "lazy_static", + "memchr", + "parking_lot", + "thiserror", +] + +[[package]] +name = "prost" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "444879275cb4fd84958b1a1d5420d15e6fcf7c235fe47f053c9c2a80aceb6001" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62941722fb675d463659e49c4f3fe1fe792ff24fe5bbaa9c08cd3b98a1c354f5" +dependencies = [ + "bytes", + "heck", + "itertools", + "lazy_static", + "log", + "multimap", + "petgraph", + "prost", + "prost-types", + "regex", + "tempfile", + "which", +] + +[[package]] +name = "prost-derive" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9cc1a3263e07e0bf68e96268f37665207b49560d98739662cdfaae215c720fe" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534b7a0e836e3c482d2693070f982e39e7611da9695d4d1f5a4b186b51faef0a" +dependencies = [ + "bytes", + "prost", +] + +[[package]] +name = "proxy" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "clap 3.0.14", + "fail", + "futures", + "hashbrown", + "hex", + "hyper", + "lazy_static", + "md5", + "parking_lot", + "pin-project-lite", + "rand", + "rcgen", + "reqwest", + "rustls 0.19.1", + "scopeguard", + "serde", + "serde_json", + "socket2", + "thiserror", + "tokio", + "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "tokio-postgres-rustls", + "tokio-rustls 0.22.0", + "workspace_hack", + "zenith_metrics", + "zenith_utils", +] + +[[package]] +name = "quote" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "864d3e96a899863136fc6e99f3d7cae289dafe43bf2c5ac19b70df7210c0a145" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" +dependencies = [ + "libc", + 
"rand_chacha", + "rand_core", + "rand_hc", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_hc" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + +[[package]] +name = "rcgen" +version = "0.8.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5911d1403f4143c9d56a702069d593e8d0f3fab880a85e103604d0893ea31ba7" +dependencies = [ + "chrono", + "pem 1.0.2", + "ring", + "yasna", +] + +[[package]] +name = "redox_syscall" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +dependencies = [ + "bitflags", +] + +[[package]] +name = "redox_users" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" +dependencies = [ + "getrandom", + "redox_syscall", +] + +[[package]] +name = "regex" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] + +[[package]] +name = "reqwest" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f242f1488a539a79bac6dbe7c8609ae43b7914b7736210f239a37cccb32525" +dependencies = [ + "base64 0.13.0", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-rustls", + "ipnet", + "js-sys", + "lazy_static", + "log", + "mime", + "percent-encoding", + "pin-project-lite", + "rustls 0.20.2", + 
"rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "tokio", + "tokio-rustls 0.23.2", + "tokio-util 0.6.9", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "webpki-roots", + "winreg", +] + +[[package]] +name = "ring" +version = "0.16.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" +dependencies = [ + "cc", + "libc", + "once_cell", + "spin", + "untrusted", + "web-sys", + "winapi", +] + +[[package]] +name = "routerify" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" +dependencies = [ + "http", + "hyper", + "lazy_static", + "percent-encoding", + "regex", +] + +[[package]] +name = "rusoto_core" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b4f000e8934c1b4f70adde180056812e7ea6b1a247952db8ee98c94cd3116cc" +dependencies = [ + "async-trait", + "base64 0.13.0", + "bytes", + "crc32fast", + "futures", + "http", + "hyper", + "hyper-tls", + "lazy_static", + "log", + "rusoto_credential", + "rusoto_signature", + "rustc_version", + "serde", + "serde_json", + "tokio", + "xml-rs", +] + +[[package]] +name = "rusoto_credential" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a46b67db7bb66f5541e44db22b0a02fed59c9603e146db3a9e633272d3bac2f" +dependencies = [ + "async-trait", + "chrono", + "dirs-next", + "futures", + "hyper", + "serde", + "serde_json", + "shlex", + "tokio", + "zeroize", +] + +[[package]] +name = "rusoto_s3" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "048c2fe811a823ad5a9acc976e8bf4f1d910df719dcf44b15c3e96c5b7a51027" +dependencies = [ + "async-trait", + "bytes", + "futures", + "rusoto_core", + "xml-rs", +] + +[[package]] +name = "rusoto_signature" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6264e93384b90a747758bcc82079711eacf2e755c3a8b5091687b5349d870bcc" +dependencies = [ + "base64 0.13.0", + "bytes", + "chrono", + "digest", + "futures", + "hex", + "hmac 0.11.0", + "http", + "hyper", + "log", + "md-5", + "percent-encoding", + "pin-project-lite", + "rusoto_credential", + "rustc_version", + "serde", + "sha2", + "tokio", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "rustls" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" +dependencies = [ + "base64 0.13.0", + "log", + "ring", + "sct 0.6.1", + "webpki 0.21.4", +] + +[[package]] +name = "rustls" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d37e5e2290f3e040b594b1a9e04377c2c671f1a1cfd9bfdef82106ac1c113f84" 
+dependencies = [ + "log", + "ring", + "sct 0.7.0", + "webpki 0.22.0", +] + +[[package]] +name = "rustls-pemfile" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eebeaeb360c87bfb72e84abdb3447159c0eaececf1bef2aecd65a8be949d1c9" +dependencies = [ + "base64 0.13.0", +] + +[[package]] +name = "rustls-split" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fb079b52cfdb005752b7c3c646048e702003576a8321058e4c8b38227c11aa6" +dependencies = [ + "rustls 0.19.1", +] + +[[package]] +name = "rustversion" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f" + +[[package]] +name = "ryu" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f05ba609c234e60bee0d547fe94a4c7e9da733d1c962cf6e59efa4cd9c8bc75" +dependencies = [ + "lazy_static", + "winapi", +] + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "sct" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b362b83898e0e69f38515b82ee15aa80636befe47c3b6d3d89a911e78fc228ce" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "sct" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "security-framework" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dc14f172faf8a0194a3aded622712b0de276821addc574fa54fc0a1167e10dc" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0160a13a177a45bfb43ce71c01580998474f556ad854dcbca936dd2841a5c556" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0486718e92ec9a68fbed73bb5ef687d71103b142595b406835649bebd33f72c7" + +[[package]] +name = "serde" +version = "1.0.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce31e24b01e1e524df96f1c2fdd054405f8d7376249a5110886fb4b658484789" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_cbor" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" +dependencies = [ + "half", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"08597e7152fcd306f41838ed3e37be9eaeed2b61c42e2117266a554fab4662f9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d23c1ba4cf0efd44be32017709280b32d1cea5c3f1275c3b6d9e8bc54f758085" +dependencies = [ + "itoa 1.0.1", + "ryu", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa 1.0.1", + "ryu", + "serde", +] + +[[package]] +name = "serde_with" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec1e6ec4d8950e5b1e894eac0d360742f3b1407a6078a604a731c4b3f49cefbc" +dependencies = [ + "rustversion", + "serde", + "serde_with_macros", +] + +[[package]] +name = "serde_with_macros" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12e47be9471c72889ebafb5e14d5ff930d89ae7a67bbdb5f8abb564f845a927e" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "sha2" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" +dependencies = [ + "block-buffer", + "cfg-if", + "cpufeatures", + "digest", + "opaque-debug", +] + +[[package]] +name = "sharded-slab" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" + +[[package]] +name = "signal-hook" +version = "0.3.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "647c97df271007dcea485bb74ffdb57f2e683f1306c854f468a0c244badabf2d" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +dependencies = [ + "libc", +] + +[[package]] +name = "simple_asn1" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "692ca13de57ce0613a363c8c2f1de925adebc81b04c923ac60c5488bb44abe4b" +dependencies = [ + "chrono", + "num-bigint", + "num-traits", +] + +[[package]] +name = "siphasher" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a86232ab60fa71287d7f2ddae4a7073f6b7aac33631c3015abb556f08c6d0a3e" + +[[package]] +name = "slab" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9def91fd1e018fe007022791f865d0ccc9b3a0d5001e01aabb8b40e46000afb5" + +[[package]] +name = "smallvec" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" + +[[package]] +name = "socket2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66d72b759436ae32898a2af0a14218dbf55efde3feeb170eb623637db85ee1e0" +dependencies = [ + "libc", + "winapi", +] + 
+[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + +[[package]] +name = "stringprep" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ee348cb74b87454fff4b551cbf727025810a004f88aeacae7f85b87f4e9a1c1" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "subtle" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" + +[[package]] +name = "syn" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "tar" +version = "0.4.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6" +dependencies = [ + "filetime", + "libc", + "xattr", +] + +[[package]] +name = "tempfile" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" +dependencies = [ + "cfg-if", + "fastrand", + "libc", + "redox_syscall", + "remove_dir_all", + "winapi", +] + +[[package]] +name = "termcolor" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "textwrap" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0066c8d12af8b5acd21e00547c3797fde4e8677254a7ee429176ccebbe93dd80" + +[[package]] +name = "thiserror" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180" +dependencies = [ + "once_cell", +] + +[[package]] +name = "time" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" +dependencies = [ + "libc", + "wasi 0.10.0+wasi-snapshot-preview1", + "winapi", 
+] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "tinyvec" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c1c1d5a42b6245520c249549ec267180beaffcc0615401ac8e31853d4b6d8d2" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" + +[[package]] +name = "tokio" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2af73ac49756f3f7c01172e34a23e5d0216f6c32333757c2c61feb2bbff5a5ee" +dependencies = [ + "bytes", + "libc", + "memchr", + "mio", + "num_cpus", + "once_cell", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "winapi", +] + +[[package]] +name = "tokio-io-timeout" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" +dependencies = [ + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-macros" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b557f72f448c511a979e2564e55d74e6c4432fc96ff4f6241bc6bded342643b7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-postgres" +version = "0.7.1" +source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" +dependencies = [ + "async-trait", + "byteorder", + "bytes", + "fallible-iterator", + "futures", + "log", + "parking_lot", + "percent-encoding", + "phf", + "pin-project-lite", + "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "socket2", + "tokio", + "tokio-util 0.6.9", +] + +[[package]] +name = "tokio-postgres" +version = "0.7.1" +source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" +dependencies = [ + "async-trait", + "byteorder", + "bytes", + "fallible-iterator", + "futures", + "log", + "parking_lot", + "percent-encoding", + "phf", + "pin-project-lite", + "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", + "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", + "socket2", + "tokio", + "tokio-util 0.6.9", +] + +[[package]] +name = "tokio-postgres-rustls" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bd8c37d8c23cb6ecdc32fc171bade4e9c7f1be65f693a17afbaad02091a0a19" +dependencies = [ + "futures", + "ring", + "rustls 0.19.1", + "tokio", + "tokio-postgres 0.7.1 
(git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "tokio-rustls 0.22.0", + "webpki 0.21.4", +] + +[[package]] +name = "tokio-rustls" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" +dependencies = [ + "rustls 0.19.1", + "tokio", + "webpki 0.21.4", +] + +[[package]] +name = "tokio-rustls" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a27d5f2b839802bd8267fa19b0530f5a08b9c08cd417976be2a65d130fe1c11b" +dependencies = [ + "rustls 0.20.2", + "tokio", + "webpki 0.22.0", +] + +[[package]] +name = "tokio-stream" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50145484efff8818b5ccd256697f36863f587da82cf8b409c53adf1e840798e3" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e99e1983e5d376cd8eb4b66604d2e99e79f5bd988c3055891dcd8c9e2604cc0" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "log", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64910e1b9c1901aaf5375561e35b9c057d95ff41a44ede043a03e09279eabaf1" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "log", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "toml" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "744e9ed5b352340aa47ce033716991b5589e23781acb97cad37d4ea70560f55b" +dependencies = [ + "combine", + "indexmap", + "itertools", + "kstring", + "serde", +] + +[[package]] +name = "tonic" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff08f4649d10a70ffa3522ca559031285d8e421d727ac85c60825761818f5d0a" +dependencies = [ + "async-stream", + "async-trait", + "base64 0.13.0", + "bytes", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost", + "prost-derive", + "tokio", + "tokio-stream", + "tokio-util 0.6.9", + "tower", + "tower-layer", + "tower-service", + "tracing", + "tracing-futures", +] + +[[package]] +name = "tonic-build" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9403f1bafde247186684b230dc6f38b5cd514584e8bec1dd32514be4745fa757" +dependencies = [ + "proc-macro2", + "prost-build", + "quote", + "syn", +] + +[[package]] +name = "tower" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a89fd63ad6adf737582df5db40d286574513c69a11dac5214dc3b5603d6713e" +dependencies = [ + "futures-core", + "futures-util", + "indexmap", + "pin-project", + "pin-project-lite", + "rand", + "slab", + "tokio", + "tokio-util 0.7.0", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"343bc9466d3fe6b0f960ef45960509f84480bf4fd96f92901afe7ff3df9d3a62" + +[[package]] +name = "tower-service" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" + +[[package]] +name = "tracing" +version = "0.1.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d8d93354fe2a8e50d5953f5ae2e47a3fc2ef03292e7ea46e3cc38f549525fb9" +dependencies = [ + "cfg-if", + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8276d9a4a3a558d7b7ad5303ad50b53d58264641b82914b7ada36bd762e7a716" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03cfcb51380632a72d3111cb8d3447a8d908e577d31beeac006f836383d29a23" +dependencies = [ + "lazy_static", + "valuable", +] + +[[package]] +name = "tracing-futures" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2" +dependencies = [ + "pin-project", + "tracing", +] + +[[package]] +name = "tracing-log" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" +dependencies = [ + "lazy_static", + "log", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74786ce43333fcf51efe947aed9718fbe46d5c7328ec3f1029e818083966d9aa" +dependencies = [ + "ansi_term", + "lazy_static", + "matchers", + "regex", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + +[[package]] +name = "try-lock" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" + +[[package]] +name = "typenum" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" + +[[package]] +name = "unicode-bidi" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a01404663e3db436ed2746d9fefef640d868edae3cceb81c3b8d5732fda678f" + +[[package]] +name = "unicode-normalization" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" + +[[package]] +name = "unicode-width" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "untrusted" +version = "0.7.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" + +[[package]] +name = "url" +version = "2.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" +dependencies = [ + "form_urlencoded", + "idna", + "matches", + "percent-encoding", +] + +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "walkdir" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +dependencies = [ + "same-file", + "winapi", + "winapi-util", +] + +[[package]] +name = "walkeeper" +version = "0.1.0" +dependencies = [ + "anyhow", + "byteorder", + "bytes", + "clap 3.0.14", + "const_format", + "crc32c", + "daemonize", + "etcd-client", + "fs2", + "hex", + "humantime", + "hyper", + "lazy_static", + "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "postgres_ffi", + "regex", + "rusoto_core", + "rusoto_s3", + "serde", + "serde_json", + "serde_with", + "signal-hook", + "tempfile", + "tokio", + "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "tokio-util 0.7.0", + "tracing", + "url", + "walkdir", + "workspace_hack", + "zenith_metrics", + "zenith_utils", +] + +[[package]] +name = "want" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" +dependencies = [ + "log", + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25f1af7423d8588a3d840681122e72e6a24ddbcb3f0ec385cac0d12d24256c06" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b21c0df030f5a177f3cba22e9bc4322695ec43e7257d865302900290bcdedca" +dependencies = [ + "bumpalo", + "lazy_static", + "log", 
+ "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eb6ec270a31b1d3c7e266b999739109abce8b6c87e4b31fcfcd788b65267395" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f4203d69e40a52ee523b2529a773d5ffc1dc0071801c87b3d270b471b80ed01" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa8a30d46208db204854cadbb5d4baf5fcf8071ba5bf48190c3e59937962ebc" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d958d035c4438e28c70e4321a2911302f10135ce78a9c7834c0cab4123d06a2" + +[[package]] +name = "web-sys" +version = "0.3.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c060b319f29dd25724f09a2ba1418f142f539b2be99fbf4d2d5a8f7330afb8eb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki" +version = "0.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "webpki" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f095d78192e208183081cc07bc5515ef55216397af48b873e5edcd72637fa1bd" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "webpki-roots" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "552ceb903e957524388c4d3475725ff2c8b7960922063af6ce53c9a43da07449" +dependencies = [ + "webpki 0.22.0", +] + +[[package]] +name = "which" +version = "4.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a5a7e487e921cf220206864a94a89b6c6905bfc19f1057fa26a4cb360e5c1d2" +dependencies = [ + "either", + "lazy_static", + "libc", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "winreg" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" +dependencies 
= [ + "winapi", +] + +[[package]] +name = "workspace_hack" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "cc", + "clap 2.34.0", + "either", + "hashbrown", + "libc", + "log", + "memchr", + "num-integer", + "num-traits", + "proc-macro2", + "quote", + "regex", + "regex-syntax", + "reqwest", + "scopeguard", + "serde", + "syn", + "tokio", + "tracing", + "tracing-core", +] + +[[package]] +name = "xattr" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "244c3741f4240ef46274860397c7c74e50eb23624996930e484c16679633a54c" +dependencies = [ + "libc", +] + +[[package]] +name = "xml-rs" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3" + +[[package]] +name = "yasna" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e262a29d0e61ccf2b6190d7050d4b237535fc76ce4c1210d9caa316f71dffa75" +dependencies = [ + "chrono", +] + +[[package]] +name = "zenith" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap 3.0.14", + "control_plane", + "pageserver", + "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "postgres_ffi", + "serde_json", + "walkeeper", + "workspace_hack", + "zenith_utils", +] + +[[package]] +name = "zenith_metrics" +version = "0.1.0" +dependencies = [ + "lazy_static", + "libc", + "once_cell", + "prometheus", + "workspace_hack", +] + +[[package]] +name = "zenith_utils" +version = "0.1.0" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "bytes", + "criterion", + "git-version", + "hex", + "hex-literal", + "hyper", + "jsonwebtoken", + "lazy_static", + "nix", + "pin-project-lite", + "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "rand", + "routerify", + "rustls 0.19.1", + "rustls-split", + "serde", + "serde_json", + "serde_with", + "signal-hook", + "tempfile", + "thiserror", + "tokio", + "tracing", + "tracing-subscriber", + "webpki 0.21.4", + "workspace_hack", + "zenith_metrics", +] + +[[package]] +name = "zeroize" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c88870063c39ee00ec285a2f8d6a966e5b6fb2becc4e8dac77ed0d370ed6006" + +[[package]] +name = "zstd" +version = "0.10.0+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b1365becbe415f3f0fcd024e2f7b45bacfb5bdd055f0dc113571394114e7bdd" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "4.1.4+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f7cd17c9af1a4d6c24beb1cc54b17e2ef7b593dc92f19e9d9acad8b182bbaee" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "1.6.3+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc49afa5c8d634e75761feda8c592051e7eeb4683ba827211eb0d731d3402ea8" +dependencies = [ + "cc", + "libc", +] diff --git a/walkeeper/Cargo.toml b/walkeeper/Cargo.toml index ddce78e737..86aa56c9ae 100644 --- a/walkeeper/Cargo.toml +++ b/walkeeper/Cargo.toml @@ -14,8 +14,7 @@ serde_json = "1" tracing = "0.1.27" clap = "3.0" daemonize = "0.4.1" -rust-s3 = { version = "0.28", default-features = false, features = 
["no-verify-ssl", "tokio-rustls-tls"] } -tokio = { version = "1.17", features = ["macros"] } +tokio = { version = "1.17", features = ["macros", "fs"] } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } anyhow = "1.0" @@ -30,6 +29,9 @@ hex = "0.4.3" const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } etcd-client = "0.8.3" +tokio-util = { version = "0.7", features = ["io"] } +rusoto_core = "0.47" +rusoto_s3 = "0.47" postgres_ffi = { path = "../postgres_ffi" } zenith_metrics = { path = "../zenith_metrics" } diff --git a/walkeeper/src/s3_offload.rs b/walkeeper/src/s3_offload.rs index 2b3285e6c6..c796f53615 100644 --- a/walkeeper/src/s3_offload.rs +++ b/walkeeper/src/s3_offload.rs @@ -2,19 +2,19 @@ // Offload old WAL segments to S3 and remove them locally // -use anyhow::Result; +use anyhow::Context; use postgres_ffi::xlog_utils::*; -use s3::bucket::Bucket; -use s3::creds::Credentials; -use s3::region::Region; +use rusoto_core::credential::StaticProvider; +use rusoto_core::{HttpClient, Region}; +use rusoto_s3::{ListObjectsV2Request, PutObjectRequest, S3Client, StreamingBody, S3}; use std::collections::HashSet; use std::env; -use std::fs::{self, File}; -use std::io::prelude::*; use std::path::Path; use std::time::SystemTime; +use tokio::fs::{self, File}; use tokio::runtime; use tokio::time::sleep; +use tokio_util::io::ReaderStream; use tracing::*; use walkdir::WalkDir; @@ -39,11 +39,12 @@ pub fn thread_main(conf: SafeKeeperConf) { } async fn offload_files( - bucket: &Bucket, + client: &S3Client, + bucket_name: &str, listing: &HashSet, dir_path: &Path, conf: &SafeKeeperConf, -) -> Result { +) -> anyhow::Result { let horizon = SystemTime::now() - conf.ttl.unwrap(); let mut n: u64 = 0; for entry in WalkDir::new(dir_path) { @@ -57,12 +58,17 @@ async fn offload_files( let relpath = path.strip_prefix(&conf.workdir).unwrap(); let s3path = String::from("walarchive/") + relpath.to_str().unwrap(); if !listing.contains(&s3path) { - let mut file = File::open(&path)?; - let mut content = Vec::new(); - file.read_to_end(&mut content)?; - bucket.put_object(s3path, &content).await?; + let file = File::open(&path).await?; + client + .put_object(PutObjectRequest { + body: Some(StreamingBody::new(ReaderStream::new(file))), + bucket: bucket_name.to_string(), + key: s3path, + ..PutObjectRequest::default() + }) + .await?; - fs::remove_file(&path)?; + fs::remove_file(&path).await?; n += 1; } } @@ -70,35 +76,59 @@ async fn offload_files( Ok(n) } -async fn main_loop(conf: &SafeKeeperConf) -> Result<()> { +async fn main_loop(conf: &SafeKeeperConf) -> anyhow::Result<()> { let region = Region::Custom { - region: env::var("S3_REGION").unwrap(), - endpoint: env::var("S3_ENDPOINT").unwrap(), + name: env::var("S3_REGION").context("S3_REGION env var is not set")?, + endpoint: env::var("S3_ENDPOINT").context("S3_ENDPOINT env var is not set")?, }; - let credentials = Credentials::new( - Some(&env::var("S3_ACCESSKEY").unwrap()), - Some(&env::var("S3_SECRET").unwrap()), - None, - None, - None, - ) - .unwrap(); - // Create Bucket in REGION for BUCKET - let bucket = Bucket::new_with_path_style("zenith-testbucket", region, credentials)?; + let client = S3Client::new_with( + HttpClient::new().context("Failed to create S3 http client")?, + 
StaticProvider::new_minimal( + env::var("S3_ACCESSKEY").context("S3_ACCESSKEY env var is not set")?, + env::var("S3_SECRET").context("S3_SECRET env var is not set")?, + ), + region, + ); + + let bucket_name = "zenith-testbucket"; loop { - // List out contents of directory - let results = bucket - .list("walarchive/".to_string(), Some("".to_string())) - .await?; - let listing = results - .iter() - .flat_map(|b| b.contents.iter().map(|o| o.key.clone())) - .collect(); - - let n = offload_files(&bucket, &listing, &conf.workdir, conf).await?; + let listing = gather_wal_entries(&client, bucket_name).await?; + let n = offload_files(&client, bucket_name, &listing, &conf.workdir, conf).await?; info!("Offload {} files to S3", n); sleep(conf.ttl.unwrap()).await; } } + +async fn gather_wal_entries( + client: &S3Client, + bucket_name: &str, +) -> anyhow::Result> { + let mut document_keys = HashSet::new(); + + let mut continuation_token = None::; + loop { + let response = client + .list_objects_v2(ListObjectsV2Request { + bucket: bucket_name.to_string(), + prefix: Some("walarchive/".to_string()), + continuation_token, + ..ListObjectsV2Request::default() + }) + .await?; + document_keys.extend( + response + .contents + .unwrap_or_default() + .into_iter() + .filter_map(|o| o.key), + ); + + continuation_token = response.continuation_token; + if continuation_token.is_none() { + break; + } + } + Ok(document_keys) +} From 4f172e7612870909613eb7c8f9c3d3a41a426618 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 9 Apr 2022 01:15:20 +0300 Subject: [PATCH 0120/1022] Replicate S3 blob metadata in the remote storage --- pageserver/src/remote_storage.rs | 12 +- pageserver/src/remote_storage/local_fs.rs | 188 +++++++++++++++--- pageserver/src/remote_storage/s3_bucket.rs | 12 +- .../src/remote_storage/storage_sync/upload.rs | 1 + 4 files changed, 179 insertions(+), 34 deletions(-) diff --git a/pageserver/src/remote_storage.rs b/pageserver/src/remote_storage.rs index 02d37af5de..aebd74af5a 100644 --- a/pageserver/src/remote_storage.rs +++ b/pageserver/src/remote_storage.rs @@ -325,27 +325,35 @@ trait RemoteStorage: Send + Sync { &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, to: &Self::StoragePath, + metadata: Option, ) -> anyhow::Result<()>; /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. + /// Returns the metadata, if any was stored with the file previously. async fn download( &self, from: &Self::StoragePath, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result<()>; + ) -> anyhow::Result>; /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. + /// Returns the metadata, if any was stored with the file previously. async fn download_range( &self, from: &Self::StoragePath, start_inclusive: u64, end_exclusive: Option, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result<()>; + ) -> anyhow::Result>; async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>; } +/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. +/// Immutable, cannot be changed once the file is created. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct StorageMetadata(HashMap); + fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> { if prefix == path { anyhow::bail!( diff --git a/pageserver/src/remote_storage/local_fs.rs b/pageserver/src/remote_storage/local_fs.rs index bac693c8d0..846adf8e9b 100644 --- a/pageserver/src/remote_storage/local_fs.rs +++ b/pageserver/src/remote_storage/local_fs.rs @@ -5,7 +5,6 @@ //! volume is mounted to the local FS. use std::{ - ffi::OsString, future::Future, path::{Path, PathBuf}, pin::Pin, @@ -18,7 +17,7 @@ use tokio::{ }; use tracing::*; -use super::{strip_path_prefix, RemoteStorage}; +use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; pub struct LocalFs { pageserver_workdir: &'static Path, @@ -54,6 +53,32 @@ impl LocalFs { ) } } + + async fn read_storage_metadata( + &self, + file_path: &Path, + ) -> anyhow::Result> { + let metadata_path = storage_metadata_path(&file_path); + if metadata_path.exists() && metadata_path.is_file() { + let metadata_string = fs::read_to_string(&metadata_path).await.with_context(|| { + format!( + "Failed to read metadata from the local storage at '{}'", + metadata_path.display() + ) + })?; + + serde_json::from_str(&metadata_string) + .with_context(|| { + format!( + "Failed to deserialize metadata from the local storage at '{}'", + metadata_path.display() + ) + }) + .map(|metadata| Some(StorageMetadata(metadata))) + } else { + Ok(None) + } + } } #[async_trait::async_trait] @@ -81,19 +106,14 @@ impl RemoteStorage for LocalFs { &self, mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static, to: &Self::StoragePath, + metadata: Option, ) -> anyhow::Result<()> { let target_file_path = self.resolve_in_storage(to)?; create_target_directory(&target_file_path).await?; // We need this dance with sort of durable rename (without fsyncs) // to prevent partial uploads. 
This was really hit when pageserver shutdown // cancelled the upload and partial file was left on the fs - let mut temp_extension = target_file_path - .extension() - .unwrap_or_default() - .to_os_string(); - - temp_extension.push(OsString::from(".temp")); - let temp_file_path = target_file_path.with_extension(temp_extension); + let temp_file_path = path_with_suffix_extension(&target_file_path, ".temp"); let mut destination = io::BufWriter::new( fs::OpenOptions::new() .write(true) @@ -132,6 +152,23 @@ impl RemoteStorage for LocalFs { target_file_path.display() ) })?; + + if let Some(storage_metadata) = metadata { + let storage_metadata_path = storage_metadata_path(&target_file_path); + fs::write( + &storage_metadata_path, + serde_json::to_string(&storage_metadata.0) + .context("Failed to serialize storage metadata as json")?, + ) + .await + .with_context(|| { + format!( + "Failed to write metadata to the local storage at '{}'", + storage_metadata_path.display() + ) + })?; + } + Ok(()) } @@ -139,7 +176,7 @@ impl RemoteStorage for LocalFs { &self, from: &Self::StoragePath, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result<()> { + ) -> anyhow::Result> { let file_path = self.resolve_in_storage(from)?; if file_path.exists() && file_path.is_file() { @@ -162,7 +199,8 @@ impl RemoteStorage for LocalFs { ) })?; source.flush().await?; - Ok(()) + + self.read_storage_metadata(&file_path).await } else { bail!( "File '{}' either does not exist or is not a file", @@ -177,7 +215,7 @@ impl RemoteStorage for LocalFs { start_inclusive: u64, end_exclusive: Option, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result<()> { + ) -> anyhow::Result> { if let Some(end_exclusive) = end_exclusive { ensure!( end_exclusive > start_inclusive, @@ -186,7 +224,7 @@ impl RemoteStorage for LocalFs { end_exclusive ); if start_inclusive == end_exclusive.saturating_sub(1) { - return Ok(()); + return Ok(None); } } let file_path = self.resolve_in_storage(from)?; @@ -220,7 +258,8 @@ impl RemoteStorage for LocalFs { file_path.display() ) })?; - Ok(()) + + self.read_storage_metadata(&file_path).await } else { bail!( "File '{}' either does not exist or is not a file", @@ -242,6 +281,17 @@ impl RemoteStorage for LocalFs { } } +fn path_with_suffix_extension(original_path: &Path, suffix: &str) -> PathBuf { + let mut extension_with_suffix = original_path.extension().unwrap_or_default().to_os_string(); + extension_with_suffix.push(suffix); + + original_path.with_extension(extension_with_suffix) +} + +fn storage_metadata_path(original_path: &Path) -> PathBuf { + path_with_suffix_extension(original_path, ".metadata") +} + fn get_all_files<'a, P>( directory_path: P, ) -> Pin>> + Send + Sync + 'a>> @@ -451,7 +501,7 @@ mod fs_tests { use super::*; use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID}; - use std::io::Write; + use std::{collections::HashMap, io::Write}; use tempfile::tempdir; #[tokio::test] @@ -465,7 +515,7 @@ mod fs_tests { ) .await?; let target_path = PathBuf::from("/").join("somewhere").join("else"); - match storage.upload(source, &target_path).await { + match storage.upload(source, &target_path, None).await { Ok(()) => panic!("Should not allow storing files with wrong target path"), Err(e) => { let message = format!("{:?}", e); @@ -475,14 +525,14 @@ mod fs_tests { } assert!(storage.list().await?.is_empty()); - let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1").await?; + let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1", 
None).await?; assert_eq!( storage.list().await?, vec![target_path_1.clone()], "Should list a single file after first upload" ); - let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2").await?; + let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2", None).await?; assert_eq!( list_files_sorted(&storage).await?, vec![target_path_1.clone(), target_path_2.clone()], @@ -503,12 +553,16 @@ mod fs_tests { let repo_harness = RepoHarness::create("download_file")?; let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?; + let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?; let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - storage.download(&upload_target, &mut content_bytes).await?; - content_bytes.flush().await?; + let metadata = storage.download(&upload_target, &mut content_bytes).await?; + assert!( + metadata.is_none(), + "No metadata should be returned for no metadata upload" + ); + content_bytes.flush().await?; let contents = String::from_utf8(content_bytes.into_inner().into_inner())?; assert_eq!( dummy_contents(upload_name), @@ -533,12 +587,16 @@ mod fs_tests { let repo_harness = RepoHarness::create("download_file_range_positive")?; let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?; + let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?; let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - storage + let metadata = storage .download_range(&upload_target, 0, None, &mut full_range_bytes) .await?; + assert!( + metadata.is_none(), + "No metadata should be returned for no metadata upload" + ); full_range_bytes.flush().await?; assert_eq!( dummy_contents(upload_name), @@ -548,7 +606,7 @@ mod fs_tests { let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let same_byte = 1_000_000_000; - storage + let metadata = storage .download_range( &upload_target, same_byte, @@ -556,6 +614,10 @@ mod fs_tests { &mut zero_range_bytes, ) .await?; + assert!( + metadata.is_none(), + "No metadata should be returned for no metadata upload" + ); zero_range_bytes.flush().await?; assert!( zero_range_bytes.into_inner().into_inner().is_empty(), @@ -566,7 +628,7 @@ mod fs_tests { let (first_part_local, second_part_local) = uploaded_bytes.split_at(3); let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - storage + let metadata = storage .download_range( &upload_target, 0, @@ -574,6 +636,11 @@ mod fs_tests { &mut first_part_remote, ) .await?; + assert!( + metadata.is_none(), + "No metadata should be returned for no metadata upload" + ); + first_part_remote.flush().await?; let first_part_remote = first_part_remote.into_inner().into_inner(); assert_eq!( @@ -583,7 +650,7 @@ mod fs_tests { ); let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - storage + let metadata = storage .download_range( &upload_target, first_part_local.len() as u64, @@ -591,6 +658,11 @@ mod fs_tests { &mut second_part_remote, ) .await?; + assert!( + metadata.is_none(), + "No metadata should be returned for no metadata upload" + ); + second_part_remote.flush().await?; let second_part_remote = second_part_remote.into_inner().into_inner(); assert_eq!( @@ -607,7 +679,7 @@ mod fs_tests { let repo_harness = 
RepoHarness::create("download_file_range_negative")?; let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?; + let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?; let start = 10000; let end = 234; @@ -645,7 +717,7 @@ mod fs_tests { let repo_harness = RepoHarness::create("delete_file")?; let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?; + let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?; storage.delete(&upload_target).await?; assert!(storage.list().await?.is_empty()); @@ -661,10 +733,69 @@ mod fs_tests { Ok(()) } + #[tokio::test] + async fn file_with_metadata() -> anyhow::Result<()> { + let repo_harness = RepoHarness::create("download_file")?; + let storage = create_storage()?; + let upload_name = "upload_1"; + let metadata = StorageMetadata(HashMap::from([ + ("one".to_string(), "1".to_string()), + ("two".to_string(), "2".to_string()), + ])); + let upload_target = + upload_dummy_file(&repo_harness, &storage, upload_name, Some(metadata.clone())).await?; + + let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); + let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?; + + content_bytes.flush().await?; + let contents = String::from_utf8(content_bytes.into_inner().into_inner())?; + assert_eq!( + dummy_contents(upload_name), + contents, + "We should upload and download the same contents" + ); + + assert_eq!( + full_download_metadata.as_ref(), + Some(&metadata), + "We should get the same metadata back for full download" + ); + + let uploaded_bytes = dummy_contents(upload_name).into_bytes(); + let (first_part_local, _) = uploaded_bytes.split_at(3); + + let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); + let partial_download_metadata = storage + .download_range( + &upload_target, + 0, + Some(first_part_local.len() as u64), + &mut first_part_remote, + ) + .await?; + first_part_remote.flush().await?; + let first_part_remote = first_part_remote.into_inner().into_inner(); + assert_eq!( + first_part_local, + first_part_remote.as_slice(), + "First part bytes should be returned when requested" + ); + + assert_eq!( + partial_download_metadata.as_ref(), + Some(&metadata), + "We should get the same metadata back for partial download" + ); + + Ok(()) + } + async fn upload_dummy_file( harness: &RepoHarness<'_>, storage: &LocalFs, name: &str, + metadata: Option, ) -> anyhow::Result { let timeline_path = harness.timeline_path(&TIMELINE_ID); let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?; @@ -677,6 +808,7 @@ mod fs_tests { ) .await?, &storage_path, + metadata, ) .await?; Ok(storage_path) diff --git a/pageserver/src/remote_storage/s3_bucket.rs b/pageserver/src/remote_storage/s3_bucket.rs index 92b3b0cce8..bfd28168f4 100644 --- a/pageserver/src/remote_storage/s3_bucket.rs +++ b/pageserver/src/remote_storage/s3_bucket.rs @@ -24,6 +24,8 @@ use crate::{ remote_storage::{strip_path_prefix, RemoteStorage}, }; +use super::StorageMetadata; + const S3_FILE_SEPARATOR: char = '/'; #[derive(Debug, Eq, PartialEq)] @@ -179,12 +181,14 @@ impl RemoteStorage for S3Bucket { &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, to: &Self::StoragePath, + metadata: Option, ) -> anyhow::Result<()> { self.client 
.put_object(PutObjectRequest { body: Some(StreamingBody::new(ReaderStream::new(from))), bucket: self.bucket_name.clone(), key: to.key().to_owned(), + metadata: metadata.map(|m| m.0), ..PutObjectRequest::default() }) .await?; @@ -195,7 +199,7 @@ impl RemoteStorage for S3Bucket { &self, from: &Self::StoragePath, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result<()> { + ) -> anyhow::Result> { let object_output = self .client .get_object(GetObjectRequest { @@ -210,7 +214,7 @@ impl RemoteStorage for S3Bucket { io::copy(&mut from, to).await?; } - Ok(()) + Ok(object_output.metadata.map(StorageMetadata)) } async fn download_range( @@ -219,7 +223,7 @@ impl RemoteStorage for S3Bucket { start_inclusive: u64, end_exclusive: Option, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result<()> { + ) -> anyhow::Result> { // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 // and needs both ends to be exclusive let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1)); @@ -242,7 +246,7 @@ impl RemoteStorage for S3Bucket { io::copy(&mut from, to).await?; } - Ok(()) + Ok(object_output.metadata.map(StorageMetadata)) } async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> { diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/remote_storage/storage_sync/upload.rs index 76e92c2781..f955e04474 100644 --- a/pageserver/src/remote_storage/storage_sync/upload.rs +++ b/pageserver/src/remote_storage/storage_sync/upload.rs @@ -201,6 +201,7 @@ async fn try_upload_checkpoint< .upload( archive_streamer, &remote_storage.storage_path(&timeline_dir.join(&archive_name))?, + None, ) .await }, From dc7e3ff05af8f0d669ffe9878d5c98b2d7c8e12c Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 9 Apr 2022 01:19:45 +0300 Subject: [PATCH 0121/1022] Fix rustc 1.60 clippy warnings --- pageserver/src/http/routes.rs | 15 ++++++--------- pageserver/src/layered_repository.rs | 3 +-- pageserver/src/layered_repository/filename.rs | 8 ++------ pageserver/src/layered_repository/layer_map.rs | 4 +--- pageserver/src/reltag.rs | 4 +--- pageserver/src/remote_storage/local_fs.rs | 2 +- .../remote_storage/storage_sync/compression.rs | 3 +-- .../src/remote_storage/storage_sync/download.rs | 4 ++-- walkeeper/src/http/routes.rs | 6 +++--- zenith_utils/src/http/json.rs | 4 ++-- 10 files changed, 20 insertions(+), 33 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 207d2420bd..a0d6e922a1 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -68,10 +68,7 @@ fn get_config(request: &Request) -> &'static PageServerConf { // healthcheck handler async fn status_handler(request: Request) -> Result, ApiError> { let config = get_config(&request); - Ok(json_response( - StatusCode::OK, - StatusResponse { id: config.id }, - )?) + json_response(StatusCode::OK, StatusResponse { id: config.id }) } async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { @@ -131,7 +128,7 @@ async fn timeline_list_handler(request: Request) -> Result, }) } - Ok(json_response(StatusCode::OK, response_data)?) 
+ json_response(StatusCode::OK, response_data) } // Gate non incremental logical size calculation behind a flag @@ -207,7 +204,7 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result, ApiError> { @@ -247,7 +244,7 @@ async fn timeline_attach_handler(request: Request) -> Result) -> Result, ApiError> { @@ -266,7 +263,7 @@ async fn timeline_detach_handler(request: Request) -> Result) -> Result, ApiError> { @@ -280,7 +277,7 @@ async fn tenant_list_handler(request: Request) -> Result, A .await .map_err(ApiError::from_err)??; - Ok(json_response(StatusCode::OK, response_data)?) + json_response(StatusCode::OK, response_data) } async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index d7a250f31e..5e93e3389b 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1474,8 +1474,7 @@ impl LayeredTimeline { // // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing // *all* the layers, to avoid fsyncing the file multiple times. - let disk_consistent_lsn; - disk_consistent_lsn = Lsn(frozen_layer.get_lsn_range().end.0 - 1); + let disk_consistent_lsn = Lsn(frozen_layer.get_lsn_range().end.0 - 1); // If we were able to advance 'disk_consistent_lsn', save it the metadata file. // After crash, we will restart WAL streaming and processing from that point. diff --git a/pageserver/src/layered_repository/filename.rs b/pageserver/src/layered_repository/filename.rs index cd63f014c4..497912b408 100644 --- a/pageserver/src/layered_repository/filename.rs +++ b/pageserver/src/layered_repository/filename.rs @@ -25,9 +25,7 @@ impl PartialOrd for DeltaFileName { impl Ord for DeltaFileName { fn cmp(&self, other: &Self) -> Ordering { - let mut cmp; - - cmp = self.key_range.start.cmp(&other.key_range.start); + let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { return cmp; } @@ -117,9 +115,7 @@ impl PartialOrd for ImageFileName { impl Ord for ImageFileName { fn cmp(&self, other: &Self) -> Ordering { - let mut cmp; - - cmp = self.key_range.start.cmp(&other.key_range.start); + let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { return cmp; } diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 8132ec9cc4..3984ee550f 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -296,9 +296,7 @@ impl LayerMap { key_range: &Range, lsn: Lsn, ) -> Result, Option>)>> { - let mut points: Vec; - - points = vec![key_range.start]; + let mut points = vec![key_range.start]; for l in self.historic_layers.iter() { if l.get_lsn_range().start > lsn { continue; diff --git a/pageserver/src/reltag.rs b/pageserver/src/reltag.rs index 46ff468f2f..18e26cc37a 100644 --- a/pageserver/src/reltag.rs +++ b/pageserver/src/reltag.rs @@ -39,9 +39,7 @@ impl PartialOrd for RelTag { impl Ord for RelTag { fn cmp(&self, other: &Self) -> Ordering { - let mut cmp; - - cmp = self.spcnode.cmp(&other.spcnode); + let mut cmp = self.spcnode.cmp(&other.spcnode); if cmp != Ordering::Equal { return cmp; } diff --git a/pageserver/src/remote_storage/local_fs.rs b/pageserver/src/remote_storage/local_fs.rs index 846adf8e9b..b40089d53c 100644 --- a/pageserver/src/remote_storage/local_fs.rs +++ b/pageserver/src/remote_storage/local_fs.rs @@ -58,7 +58,7 @@ impl LocalFs { &self, file_path: 
&Path, ) -> anyhow::Result> { - let metadata_path = storage_metadata_path(&file_path); + let metadata_path = storage_metadata_path(file_path); if metadata_path.exists() && metadata_path.is_file() { let metadata_string = fs::read_to_string(&metadata_path).await.with_context(|| { format!( diff --git a/pageserver/src/remote_storage/storage_sync/compression.rs b/pageserver/src/remote_storage/storage_sync/compression.rs index c5b041349a..511f79e0cf 100644 --- a/pageserver/src/remote_storage/storage_sync/compression.rs +++ b/pageserver/src/remote_storage/storage_sync/compression.rs @@ -201,8 +201,7 @@ pub async fn read_archive_header( .await .context("Failed to decompress a header from the archive")?; - Ok(ArchiveHeader::des(&header_bytes) - .context("Failed to deserialize a header from the archive")?) + ArchiveHeader::des(&header_bytes).context("Failed to deserialize a header from the archive") } /// Reads the archive metadata out of the archive name: diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/remote_storage/storage_sync/download.rs index 32549c8650..773b4a12e5 100644 --- a/pageserver/src/remote_storage/storage_sync/download.rs +++ b/pageserver/src/remote_storage/storage_sync/download.rs @@ -225,8 +225,8 @@ async fn read_local_metadata( let local_metadata_bytes = fs::read(&local_metadata_path) .await .context("Failed to read local metadata file bytes")?; - Ok(TimelineMetadata::from_bytes(&local_metadata_bytes) - .context("Failed to read local metadata files bytes")?) + TimelineMetadata::from_bytes(&local_metadata_bytes) + .context("Failed to read local metadata files bytes") } #[cfg(test)] diff --git a/walkeeper/src/http/routes.rs b/walkeeper/src/http/routes.rs index 06a0682c37..26b23cddcc 100644 --- a/walkeeper/src/http/routes.rs +++ b/walkeeper/src/http/routes.rs @@ -31,7 +31,7 @@ struct SafekeeperStatus { async fn status_handler(request: Request) -> Result, ApiError> { let conf = get_conf(&request); let status = SafekeeperStatus { id: conf.my_id }; - Ok(json_response(StatusCode::OK, status)?) + json_response(StatusCode::OK, status) } fn get_conf(request: &Request) -> &SafeKeeperConf { @@ -106,7 +106,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result, ApiError> { @@ -119,7 +119,7 @@ async fn timeline_create_handler(mut request: Request) -> Result Deserialize<'de>>( let whole_body = hyper::body::aggregate(request.body_mut()) .await .map_err(ApiError::from_err)?; - Ok(serde_json::from_reader(whole_body.reader()) - .map_err(|err| ApiError::BadRequest(format!("Failed to parse json request {}", err)))?) 
+ serde_json::from_reader(whole_body.reader()) + .map_err(|err| ApiError::BadRequest(format!("Failed to parse json request {}", err))) } pub fn json_response( From 07a9553700310d6d6c2ba5c7e2e4484aeb98d899 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 11 Apr 2022 22:30:08 +0300 Subject: [PATCH 0122/1022] Add test for restore from WAL (#1366) * Add test for restore from WAL * Fix python formatting * Choose unused port in wal restore test * Move recovery tests to zenith_utils/scripts * Set LD_LIBRARY_PATH in wal recovery scripts * Fix python test formatting * Fix mypy warning * Bump postgres version * Bump postgres version --- test_runner/batch_others/test_wal_restore.py | 38 +++++++++++++++++++ vendor/postgres | 2 +- zenith_utils/scripts/restore_from_wal.sh | 20 ++++++++++ .../scripts/restore_from_wal_archive.sh | 20 ++++++++++ 4 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 test_runner/batch_others/test_wal_restore.py create mode 100755 zenith_utils/scripts/restore_from_wal.sh create mode 100755 zenith_utils/scripts/restore_from_wal_archive.sh diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py new file mode 100644 index 0000000000..a5855f2258 --- /dev/null +++ b/test_runner/batch_others/test_wal_restore.py @@ -0,0 +1,38 @@ +import os +import subprocess + +from fixtures.utils import mkdir_if_needed +from fixtures.zenith_fixtures import (ZenithEnvBuilder, + VanillaPostgres, + PortDistributor, + PgBin, + base_dir, + vanilla_pg, + pg_distrib_dir) +from fixtures.log_helper import log + + +def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, + test_output_dir, + port_distributor: PortDistributor): + zenith_env_builder.num_safekeepers = 1 + env = zenith_env_builder.init_start() + env.zenith_cli.create_branch("test_wal_restore") + pg = env.postgres.create_start('test_wal_restore') + pg.safe_psql("create table t as select generate_series(1,1000000)") + tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] + env.zenith_cli.pageserver_stop() + port = port_distributor.get_port() + data_dir = os.path.join(test_output_dir, 'pgsql.restored') + restored = VanillaPostgres(data_dir, PgBin(test_output_dir), port) + subprocess.call([ + 'bash', + os.path.join(base_dir, 'zenith_utils/scripts/restore_from_wal.sh'), + os.path.join(pg_distrib_dir, 'bin'), + os.path.join(test_output_dir, 'repo/safekeepers/sk1/{}/*'.format(tenant_id)), + data_dir, + str(port) + ]) + restored.start() + assert restored.safe_psql('select count(*) from t') == [(1000000, )] + restored.stop() diff --git a/vendor/postgres b/vendor/postgres index 8481459996..61afbf978b 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 848145999653be213141a330569b6f2d9f53dbf2 +Subproject commit 61afbf978b17764134ab6f1650bbdcadac147e71 diff --git a/zenith_utils/scripts/restore_from_wal.sh b/zenith_utils/scripts/restore_from_wal.sh new file mode 100755 index 0000000000..ef2171312b --- /dev/null +++ b/zenith_utils/scripts/restore_from_wal.sh @@ -0,0 +1,20 @@ +PG_BIN=$1 +WAL_PATH=$2 +DATA_DIR=$3 +PORT=$4 +SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` +rm -fr $DATA_DIR +env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -D $DATA_DIR --sysid=$SYSID +echo port=$PORT >> $DATA_DIR/postgresql.conf +REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` +declare -i WAL_SIZE=$REDO_POS+114 +$PG_BIN/pg_ctl -D $DATA_DIR -l logfile start +$PG_BIN/pg_ctl -D $DATA_DIR -l 
logfile stop -m immediate +cp $DATA_DIR/pg_wal/000000010000000000000001 . +cp $WAL_PATH/* $DATA_DIR/pg_wal/ +if [ -f $DATA_DIR/pg_wal/*.partial ] +then + (cd $DATA_DIR/pg_wal ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done) +fi +dd if=000000010000000000000001 of=$DATA_DIR/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc +rm -f 000000010000000000000001 diff --git a/zenith_utils/scripts/restore_from_wal_archive.sh b/zenith_utils/scripts/restore_from_wal_archive.sh new file mode 100755 index 0000000000..07f4fe1e4f --- /dev/null +++ b/zenith_utils/scripts/restore_from_wal_archive.sh @@ -0,0 +1,20 @@ +PG_BIN=$1 +WAL_PATH=$2 +DATA_DIR=$3 +PORT=$4 +SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` +rm -fr $DATA_DIR /tmp/pg_wals +mkdir /tmp/pg_wals +env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U zenith_admin -D $DATA_DIR --sysid=$SYSID +echo port=$PORT >> $DATA_DIR/postgresql.conf +REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` +declare -i WAL_SIZE=$REDO_POS+114 +cp $WAL_PATH/* /tmp/pg_wals +if [ -f $DATA_DIR/pg_wal/*.partial ] +then + (cd /tmp/pg_wals ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done) +fi +dd if=$DATA_DIR/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc +echo > $DATA_DIR/recovery.signal +rm -f $DATA_DIR/pg_wal/* +echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> $DATA_DIR/postgresql.conf From 0fbe657b2f268351dc5daabee09754a578be3948 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 13 Apr 2022 00:02:06 +0300 Subject: [PATCH 0123/1022] Fix remote e2e tests after repository rename (#1434) Also start them after release build instead of debug. It saves 3-5 minutes and we anyway use release mode in Docker images. --- .circleci/config.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index e96964558b..9d26d5d558 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -672,7 +672,7 @@ jobs: --data \ "{ \"state\": \"pending\", - \"context\": \"zenith-remote-ci\", + \"context\": \"neon-cloud-e2e\", \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" }" - run: @@ -688,7 +688,7 @@ jobs: "{ \"ref\": \"main\", \"inputs\": { - \"ci_job_name\": \"zenith-remote-ci\", + \"ci_job_name\": \"neon-cloud-e2e\", \"commit_hash\": \"$CIRCLE_SHA1\", \"remote_repo\": \"$LOCAL_REPO\" } @@ -828,11 +828,11 @@ workflows: - remote-ci-trigger: # Context passes credentials for gh api context: CI_ACCESS_TOKEN - remote_repo: "zenithdb/console" + remote_repo: "neondatabase/cloud" requires: # XXX: Successful build doesn't mean everything is OK, but # the job to be triggered takes so much time to complete (~22 min) # that it's better not to wait for the commented-out steps - - build-zenith-debug + - build-zenith-release # - pg_regress-tests-release # - other-tests-release From 4af87f3d6097661c99cbf5b400c1af6c44819e43 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 13 Apr 2022 03:00:32 +0300 Subject: [PATCH 0124/1022] [proxy] Add SCRAM auth mechanism implementation (#1050) * [proxy] Add SCRAM auth * [proxy] Implement some tests for SCRAM * Refactoring + test fixes * Hide SCRAM mechanism behind `#[cfg(test)]` Currently we only use it in tests, so we hide all relevant module behind `#[cfg(test)]` to prevent "unused item" warnings. 
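For orientation before the diff: the new `sasl` module treats an authentication mechanism as a small state machine that is consumed one step at a time, with each step either producing another challenge or the final server reply. Below is a minimal, self-contained sketch of that driving loop; the names and the error type are illustrative stand-ins, not the proxy's actual `sasl::Mechanism`/`SaslStream` API, which additionally does libpq framing and uses its own error enum.

    // Toy model of the SASL exchange: the mechanism value is consumed per step
    // and either hands back a continuation state or finishes with a reply.
    trait Mechanism: Sized {
        fn exchange(self, input: &str) -> Result<(Option<Self>, String), String>;
    }

    // A mechanism that issues a fixed number of challenges before succeeding.
    struct Countdown(u32);

    impl Mechanism for Countdown {
        fn exchange(self, input: &str) -> Result<(Option<Self>, String), String> {
            if self.0 == 0 {
                Ok((None, format!("final reply to {input:?}")))
            } else {
                Ok((Some(Countdown(self.0 - 1)), format!("challenge after {input:?}")))
            }
        }
    }

    // Drive a mechanism to completion over a fixed list of client messages;
    // the real code reads these from the wire and answers with SASL
    // Continue/Final protocol messages instead of printing.
    fn authenticate<M: Mechanism>(mut mechanism: M, inputs: &[&str]) -> Result<String, String> {
        let mut inputs = inputs.iter();
        loop {
            let input = inputs.next().copied().ok_or("client disconnected mid-exchange")?;
            let (next, reply) = mechanism.exchange(input)?;
            match next {
                Some(next) => {
                    println!("server continues: {reply}");
                    mechanism = next;
                }
                None => return Ok(reply),
            }
        }
    }

    fn main() {
        let done = authenticate(Countdown(2), &["first", "second", "third"]).unwrap();
        println!("server final: {done}");
    }

Keeping the per-step state inside the mechanism value itself, rather than in the stream wrapper, is what lets SCRAM's two round trips be expressed as a plain enum of states in the code added below.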
--- Cargo.lock | 35 +++- proxy/Cargo.toml | 11 +- proxy/src/auth.rs | 88 +++------- proxy/src/auth/credentials.rs | 70 ++++++++ proxy/src/auth/flow.rs | 102 ++++++++++++ proxy/src/main.rs | 39 +++-- proxy/src/parse.rs | 18 +++ proxy/src/proxy.rs | 229 ++++++++++++++++++++------ proxy/src/sasl.rs | 47 ++++++ proxy/src/sasl/channel_binding.rs | 85 ++++++++++ proxy/src/sasl/messages.rs | 67 ++++++++ proxy/src/sasl/stream.rs | 70 ++++++++ proxy/src/scram.rs | 59 +++++++ proxy/src/scram/exchange.rs | 134 ++++++++++++++++ proxy/src/scram/key.rs | 33 ++++ proxy/src/scram/messages.rs | 232 +++++++++++++++++++++++++++ proxy/src/scram/password.rs | 48 ++++++ proxy/src/scram/secret.rs | 116 ++++++++++++++ proxy/src/scram/signature.rs | 66 ++++++++ zenith_utils/src/postgres_backend.rs | 3 +- zenith_utils/src/pq_proto.rs | 36 ++++- 21 files changed, 1446 insertions(+), 142 deletions(-) create mode 100644 proxy/src/auth/credentials.rs create mode 100644 proxy/src/auth/flow.rs create mode 100644 proxy/src/parse.rs create mode 100644 proxy/src/sasl.rs create mode 100644 proxy/src/sasl/channel_binding.rs create mode 100644 proxy/src/sasl/messages.rs create mode 100644 proxy/src/sasl/stream.rs create mode 100644 proxy/src/scram.rs create mode 100644 proxy/src/scram/exchange.rs create mode 100644 proxy/src/scram/key.rs create mode 100644 proxy/src/scram/messages.rs create mode 100644 proxy/src/scram/password.rs create mode 100644 proxy/src/scram/secret.rs create mode 100644 proxy/src/scram/signature.rs diff --git a/Cargo.lock b/Cargo.lock index 1a9e261281..7df1c4ab7a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1907,12 +1907,15 @@ name = "proxy" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", + "base64 0.13.0", "bytes", "clap 3.0.14", "fail", "futures", "hashbrown", "hex", + "hmac 0.10.1", "hyper", "lazy_static", "md5", @@ -1921,16 +1924,20 @@ dependencies = [ "rand", "rcgen", "reqwest", + "routerify 2.2.0", + "rstest", "rustls 0.19.1", "scopeguard", "serde", "serde_json", + "sha2", "socket2", "thiserror", "tokio", "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", "tokio-postgres-rustls", "tokio-rustls 0.22.0", + "tokio-stream", "workspace_hack", "zenith_metrics", "zenith_utils", @@ -2130,6 +2137,19 @@ dependencies = [ "winapi", ] +[[package]] +name = "routerify" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6bb49594c791cadb5ccfa5f36d41b498d40482595c199d10cd318800280bd9" +dependencies = [ + "http", + "hyper", + "lazy_static", + "percent-encoding", + "regex", +] + [[package]] name = "routerify" version = "3.0.0" @@ -2143,6 +2163,19 @@ dependencies = [ "regex", ] +[[package]] +name = "rstest" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d912f35156a3f99a66ee3e11ac2e0b3f34ac85a07e05263d05a7e2c8810d616f" +dependencies = [ + "cfg-if", + "proc-macro2", + "quote", + "rustc_version", + "syn", +] + [[package]] name = "rusoto_core" version = "0.47.0" @@ -3450,7 +3483,7 @@ dependencies = [ "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", "rand", - "routerify", + "routerify 3.0.0", "rustls 0.19.1", "rustls-split", "serde", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index dc20695884..56b6dd7e20 100644 --- a/proxy/Cargo.toml +++ 
b/proxy/Cargo.toml @@ -5,12 +5,14 @@ edition = "2021" [dependencies] anyhow = "1.0" +base64 = "0.13.0" bytes = { version = "1.0.1", features = ['serde'] } clap = "3.0" fail = "0.5.0" futures = "0.3.13" hashbrown = "0.11.2" hex = "0.4.3" +hmac = "0.10.1" hyper = "0.14" lazy_static = "1.4.0" md5 = "0.7.0" @@ -18,20 +20,25 @@ parking_lot = "0.11.2" pin-project-lite = "0.2.7" rand = "0.8.3" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } +routerify = "2" rustls = "0.19.1" scopeguard = "1.1.0" serde = "1" serde_json = "1" +sha2 = "0.9.8" socket2 = "0.4.4" -thiserror = "1.0" +thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } tokio-rustls = "0.22.0" +tokio-stream = "0.1.8" zenith_utils = { path = "../zenith_utils" } zenith_metrics = { path = "../zenith_metrics" } workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] -tokio-postgres-rustls = "0.8.0" +async-trait = "0.1" rcgen = "0.8.14" +rstest = "0.12" +tokio-postgres-rustls = "0.8.0" diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index e8fe65c081..bda14d67a1 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,14 +1,24 @@ +mod credentials; + +#[cfg(test)] +mod flow; + use crate::compute::DatabaseInfo; use crate::config::ProxyConfig; use crate::cplane_api::{self, CPlaneApi}; use crate::error::UserFacingError; use crate::stream::PqStream; use crate::waiters; -use std::collections::HashMap; +use std::io; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; +pub use credentials::ClientCredentials; + +#[cfg(test)] +pub use flow::*; + /// Common authentication error. #[derive(Debug, Error)] pub enum AuthErrorImpl { @@ -16,13 +26,17 @@ pub enum AuthErrorImpl { #[error(transparent)] Console(#[from] cplane_api::AuthError), + #[cfg(test)] + #[error(transparent)] + Sasl(#[from] crate::sasl::Error), + /// For passwords that couldn't be processed by [`parse_password`]. #[error("Malformed password message")] MalformedPassword, /// Errors produced by [`PqStream`]. #[error(transparent)] - Io(#[from] std::io::Error), + Io(#[from] io::Error), } impl AuthErrorImpl { @@ -67,70 +81,6 @@ impl UserFacingError for AuthError { } } -#[derive(Debug, Error)] -pub enum ClientCredsParseError { - #[error("Parameter `{0}` is missing in startup packet")] - MissingKey(&'static str), -} - -impl UserFacingError for ClientCredsParseError {} - -/// Various client credentials which we use for authentication. -#[derive(Debug, PartialEq, Eq)] -pub struct ClientCredentials { - pub user: String, - pub dbname: String, -} - -impl TryFrom> for ClientCredentials { - type Error = ClientCredsParseError; - - fn try_from(mut value: HashMap) -> Result { - let mut get_param = |key| { - value - .remove(key) - .ok_or(ClientCredsParseError::MissingKey(key)) - }; - - let user = get_param("user")?; - let db = get_param("database")?; - - Ok(Self { user, dbname: db }) - } -} - -impl ClientCredentials { - /// Use credentials to authenticate the user. 
- pub async fn authenticate( - self, - config: &ProxyConfig, - client: &mut PqStream, - ) -> Result { - fail::fail_point!("proxy-authenticate", |_| { - Err(AuthError::auth_failed("failpoint triggered")) - }); - - use crate::config::ClientAuthMethod::*; - use crate::config::RouterConfig::*; - match &config.router_config { - Static { host, port } => handle_static(host.clone(), *port, client, self).await, - Dynamic(Mixed) => { - if self.user.ends_with("@zenith") { - handle_existing_user(config, client, self).await - } else { - handle_new_user(config, client).await - } - } - Dynamic(Password) => handle_existing_user(config, client, self).await, - Dynamic(Link) => handle_new_user(config, client).await, - } - } -} - -fn new_psql_session_id() -> String { - hex::encode(rand::random::<[u8; 8]>()) -} - async fn handle_static( host: String, port: u16, @@ -169,7 +119,7 @@ async fn handle_existing_user( let md5_salt = rand::random(); client - .write_message(&Be::AuthenticationMD5Password(&md5_salt)) + .write_message(&Be::AuthenticationMD5Password(md5_salt)) .await?; // Read client's password hash @@ -213,6 +163,10 @@ async fn handle_new_user( Ok(db_info) } +fn new_psql_session_id() -> String { + hex::encode(rand::random::<[u8; 8]>()) +} + fn parse_password(bytes: &[u8]) -> Option<&str> { std::str::from_utf8(bytes).ok()?.strip_suffix('\0') } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs new file mode 100644 index 0000000000..7c8ba28622 --- /dev/null +++ b/proxy/src/auth/credentials.rs @@ -0,0 +1,70 @@ +//! User credentials used in authentication. + +use super::AuthError; +use crate::compute::DatabaseInfo; +use crate::config::ProxyConfig; +use crate::error::UserFacingError; +use crate::stream::PqStream; +use std::collections::HashMap; +use thiserror::Error; +use tokio::io::{AsyncRead, AsyncWrite}; + +#[derive(Debug, Error)] +pub enum ClientCredsParseError { + #[error("Parameter `{0}` is missing in startup packet")] + MissingKey(&'static str), +} + +impl UserFacingError for ClientCredsParseError {} + +/// Various client credentials which we use for authentication. +#[derive(Debug, PartialEq, Eq)] +pub struct ClientCredentials { + pub user: String, + pub dbname: String, +} + +impl TryFrom> for ClientCredentials { + type Error = ClientCredsParseError; + + fn try_from(mut value: HashMap) -> Result { + let mut get_param = |key| { + value + .remove(key) + .ok_or(ClientCredsParseError::MissingKey(key)) + }; + + let user = get_param("user")?; + let db = get_param("database")?; + + Ok(Self { user, dbname: db }) + } +} + +impl ClientCredentials { + /// Use credentials to authenticate the user. 
+ pub async fn authenticate( + self, + config: &ProxyConfig, + client: &mut PqStream, + ) -> Result { + fail::fail_point!("proxy-authenticate", |_| { + Err(AuthError::auth_failed("failpoint triggered")) + }); + + use crate::config::ClientAuthMethod::*; + use crate::config::RouterConfig::*; + match &config.router_config { + Static { host, port } => super::handle_static(host.clone(), *port, client, self).await, + Dynamic(Mixed) => { + if self.user.ends_with("@zenith") { + super::handle_existing_user(config, client, self).await + } else { + super::handle_new_user(config, client).await + } + } + Dynamic(Password) => super::handle_existing_user(config, client, self).await, + Dynamic(Link) => super::handle_new_user(config, client).await, + } + } +} diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs new file mode 100644 index 0000000000..0fafaa2f47 --- /dev/null +++ b/proxy/src/auth/flow.rs @@ -0,0 +1,102 @@ +//! Main authentication flow. + +use super::{AuthError, AuthErrorImpl}; +use crate::stream::PqStream; +use crate::{sasl, scram}; +use std::io; +use tokio::io::{AsyncRead, AsyncWrite}; +use zenith_utils::pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; + +/// Every authentication selector is supposed to implement this trait. +pub trait AuthMethod { + /// Any authentication selector should provide initial backend message + /// containing auth method name and parameters, e.g. md5 salt. + fn first_message(&self) -> BeMessage<'_>; +} + +/// Initial state of [`AuthFlow`]. +pub struct Begin; + +/// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. +pub struct Scram<'a>(pub &'a scram::ServerSecret); + +impl AuthMethod for Scram<'_> { + #[inline(always)] + fn first_message(&self) -> BeMessage<'_> { + Be::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(scram::METHODS)) + } +} + +/// Use password-based auth in [`AuthFlow`]. +pub struct Md5( + /// Salt for client. + pub [u8; 4], +); + +impl AuthMethod for Md5 { + #[inline(always)] + fn first_message(&self) -> BeMessage<'_> { + Be::AuthenticationMD5Password(self.0) + } +} + +/// This wrapper for [`PqStream`] performs client authentication. +#[must_use] +pub struct AuthFlow<'a, Stream, State> { + /// The underlying stream which implements libpq's protocol. + stream: &'a mut PqStream, + /// State might contain ancillary data (see [`AuthFlow::begin`]). + state: State, +} + +/// Initial state of the stream wrapper. +impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { + /// Create a new wrapper for client authentication. + pub fn new(stream: &'a mut PqStream) -> Self { + Self { + stream, + state: Begin, + } + } + + /// Move to the next step by sending auth method's name & params to client. + pub async fn begin(self, method: M) -> io::Result> { + self.stream.write_message(&method.first_message()).await?; + + Ok(AuthFlow { + stream: self.stream, + state: method, + }) + } +} + +/// Stream wrapper for handling simple MD5 password auth. +impl AuthFlow<'_, S, Md5> { + /// Perform user authentication. Raise an error in case authentication failed. + #[allow(unused)] + pub async fn authenticate(self) -> Result<(), AuthError> { + unimplemented!("MD5 auth flow is yet to be implemented"); + } +} + +/// Stream wrapper for handling [SCRAM](crate::scram) auth. +impl AuthFlow<'_, S, Scram<'_>> { + /// Perform user authentication. Raise an error in case authentication failed. + pub async fn authenticate(self) -> Result<(), AuthError> { + // Initial client message contains the chosen auth method's name. 
+ let msg = self.stream.read_password_message().await?; + let sasl = sasl::FirstMessage::parse(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; + + // Currently, the only supported SASL method is SCRAM. + if !scram::METHODS.contains(&sasl.method) { + return Err(AuthErrorImpl::auth_failed("method not supported").into()); + } + + let secret = self.state.0; + sasl::SaslStream::new(self.stream, sasl.message) + .authenticate(scram::Exchange::new(secret, rand::random, None)) + .await?; + + Ok(()) + } +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index bd99d0a639..862152bb7b 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -1,19 +1,8 @@ -/// -/// Postgres protocol proxy/router. -/// -/// This service listens psql port and can check auth via external service -/// (control plane API in our case) and can create new databases and accounts -/// in somewhat transparent manner (again via communication with control plane API). -/// -use anyhow::{bail, Context}; -use clap::{App, Arg}; -use config::ProxyConfig; -use futures::FutureExt; -use std::future::Future; -use tokio::{net::TcpListener, task::JoinError}; -use zenith_utils::GIT_VERSION; - -use crate::config::{ClientAuthMethod, RouterConfig}; +//! Postgres protocol proxy/router. +//! +//! This service listens psql port and can check auth via external service +//! (control plane API in our case) and can create new databases and accounts +//! in somewhat transparent manner (again via communication with control plane API). mod auth; mod cancellation; @@ -27,6 +16,24 @@ mod proxy; mod stream; mod waiters; +// Currently SCRAM is only used in tests +#[cfg(test)] +mod parse; +#[cfg(test)] +mod sasl; +#[cfg(test)] +mod scram; + +use anyhow::{bail, Context}; +use clap::{App, Arg}; +use config::ProxyConfig; +use futures::FutureExt; +use std::future::Future; +use tokio::{net::TcpListener, task::JoinError}; +use zenith_utils::GIT_VERSION; + +use crate::config::{ClientAuthMethod, RouterConfig}; + /// Flattens `Result>` into `Result`. async fn flatten_err( f: impl Future, JoinError>>, diff --git a/proxy/src/parse.rs b/proxy/src/parse.rs new file mode 100644 index 0000000000..8a05ff9c82 --- /dev/null +++ b/proxy/src/parse.rs @@ -0,0 +1,18 @@ +//! Small parsing helpers. + +use std::convert::TryInto; +use std::ffi::CStr; + +pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { + let pos = bytes.iter().position(|&x| x == 0)?; + let (cstr, other) = bytes.split_at(pos + 1); + // SAFETY: we've already checked that there's a terminator + Some((unsafe { CStr::from_bytes_with_nul_unchecked(cstr) }, other)) +} + +pub fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { + (bytes.len() >= N).then(|| { + let (head, tail) = bytes.split_at(N); + (head.try_into().unwrap(), tail) + }) +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 81581b5cf1..5b662f4c69 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -119,7 +119,6 @@ async fn handshake( // We can't perform TLS handshake without a config let enc = tls.is_some(); stream.write_message(&Be::EncryptionResponse(enc)).await?; - if let Some(tls) = tls.take() { // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. 
@@ -219,32 +218,14 @@ impl Client { #[cfg(test)] mod tests { use super::*; - - use tokio::io::DuplexStream; + use crate::{auth, scram}; + use async_trait::async_trait; + use rstest::rstest; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use tokio_postgres_rustls::MakeRustlsConnect; - async fn dummy_proxy( - client: impl AsyncRead + AsyncWrite + Unpin, - tls: Option, - ) -> anyhow::Result<()> { - let cancel_map = CancelMap::default(); - - // TODO: add some infra + tests for credentials - let (mut stream, _creds) = handshake(client, tls, &cancel_map) - .await? - .context("no stream")?; - - stream - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? - .write_message(&BeMessage::ReadyForQuery) - .await?; - - Ok(()) - } - + /// Generate a set of TLS certificates: CA + server. fn generate_certs( hostname: &str, ) -> anyhow::Result<(rustls::Certificate, rustls::Certificate, rustls::PrivateKey)> { @@ -262,19 +243,115 @@ mod tests { )) } + struct ClientConfig<'a> { + config: rustls::ClientConfig, + hostname: &'a str, + } + + impl ClientConfig<'_> { + fn make_tls_connect( + self, + ) -> anyhow::Result> { + let mut mk = MakeRustlsConnect::new(self.config); + let tls = MakeTlsConnect::::make_tls_connect(&mut mk, self.hostname)?; + Ok(tls) + } + } + + /// Generate TLS certificates and build rustls configs for client and server. + fn generate_tls_config( + hostname: &str, + ) -> anyhow::Result<(ClientConfig<'_>, Arc)> { + let (ca, cert, key) = generate_certs(hostname)?; + + let server_config = { + let mut config = rustls::ServerConfig::new(rustls::NoClientAuth::new()); + config.set_single_cert(vec![cert], key)?; + config.into() + }; + + let client_config = { + let mut config = rustls::ClientConfig::new(); + config.root_store.add(&ca)?; + ClientConfig { config, hostname } + }; + + Ok((client_config, server_config)) + } + + #[async_trait] + trait TestAuth: Sized { + async fn authenticate( + self, + _stream: &mut PqStream>, + ) -> anyhow::Result<()> { + Ok(()) + } + } + + struct NoAuth; + impl TestAuth for NoAuth {} + + struct Scram(scram::ServerSecret); + + impl Scram { + fn new(password: &str) -> anyhow::Result { + let salt = rand::random::<[u8; 16]>(); + let secret = scram::ServerSecret::build(password, &salt, 256) + .context("failed to generate scram secret")?; + Ok(Scram(secret)) + } + + fn mock(user: &str) -> Self { + let salt = rand::random::<[u8; 32]>(); + Scram(scram::ServerSecret::mock(user, &salt)) + } + } + + #[async_trait] + impl TestAuth for Scram { + async fn authenticate( + self, + stream: &mut PqStream>, + ) -> anyhow::Result<()> { + auth::AuthFlow::new(stream) + .begin(auth::Scram(&self.0)) + .await? + .authenticate() + .await?; + + Ok(()) + } + } + + /// A dummy proxy impl which performs a handshake and reports auth success. + async fn dummy_proxy( + client: impl AsyncRead + AsyncWrite + Unpin + Send, + tls: Option, + auth: impl TestAuth + Send, + ) -> anyhow::Result<()> { + let cancel_map = CancelMap::default(); + let (mut stream, _creds) = handshake(client, tls, &cancel_map) + .await? + .context("handshake failed")?; + + auth.authenticate(&mut stream).await?; + + stream + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())? 
+ .write_message(&BeMessage::ReadyForQuery) + .await?; + + Ok(()) + } + #[tokio::test] async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let server_config = { - let (_ca, cert, key) = generate_certs("localhost")?; - - let mut config = rustls::ServerConfig::new(rustls::NoClientAuth::new()); - config.set_single_cert(vec![cert], key)?; - config - }; - - let proxy = tokio::spawn(dummy_proxy(client, Some(server_config.into()))); + let (_, server_config) = generate_tls_config("localhost")?; + let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); let client_err = tokio_postgres::Config::new() .user("john_doe") @@ -301,30 +378,14 @@ mod tests { async fn handshake_tls() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let (ca, cert, key) = generate_certs("localhost")?; - - let server_config = { - let mut config = rustls::ServerConfig::new(rustls::NoClientAuth::new()); - config.set_single_cert(vec![cert], key)?; - config - }; - - let proxy = tokio::spawn(dummy_proxy(client, Some(server_config.into()))); - - let client_config = { - let mut config = rustls::ClientConfig::new(); - config.root_store.add(&ca)?; - config - }; - - let mut mk = MakeRustlsConnect::new(client_config); - let tls = MakeTlsConnect::::make_tls_connect(&mut mk, "localhost")?; + let (client_config, server_config) = generate_tls_config("localhost")?; + let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); let (_client, _conn) = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) - .connect_raw(server, tls) + .connect_raw(server, client_config.make_tls_connect()?) .await?; proxy.await? @@ -334,7 +395,7 @@ mod tests { async fn handshake_raw() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let proxy = tokio::spawn(dummy_proxy(client, None)); + let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); let (_client, _conn) = tokio_postgres::Config::new() .user("john_doe") @@ -350,7 +411,7 @@ mod tests { async fn give_user_an_error_for_bad_creds() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let proxy = tokio::spawn(dummy_proxy(client, None)); + let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); let client_err = tokio_postgres::Config::new() .ssl_mode(SslMode::Disable) @@ -391,4 +452,66 @@ mod tests { Ok(()) } + + #[rstest] + #[case("password_foo")] + #[case("pwd-bar")] + #[case("")] + #[tokio::test] + async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { + let (client, server) = tokio::io::duplex(1024); + + let (client_config, server_config) = generate_tls_config("localhost")?; + let proxy = tokio::spawn(dummy_proxy( + client, + Some(server_config), + Scram::new(password)?, + )); + + let (_client, _conn) = tokio_postgres::Config::new() + .user("user") + .dbname("db") + .password(password) + .ssl_mode(SslMode::Require) + .connect_raw(server, client_config.make_tls_connect()?) + .await?; + + proxy.await? 
+ } + + #[tokio::test] + async fn scram_auth_mock() -> anyhow::Result<()> { + let (client, server) = tokio::io::duplex(1024); + + let (client_config, server_config) = generate_tls_config("localhost")?; + let proxy = tokio::spawn(dummy_proxy( + client, + Some(server_config), + Scram::mock("user"), + )); + + use rand::{distributions::Alphanumeric, Rng}; + let password: String = rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(rand::random::() as usize) + .map(char::from) + .collect(); + + let _client_err = tokio_postgres::Config::new() + .user("user") + .dbname("db") + .password(&password) // no password will match the mocked secret + .ssl_mode(SslMode::Require) + .connect_raw(server, client_config.make_tls_connect()?) + .await + .err() // -> Option + .context("client shouldn't be able to connect")?; + + let _server_err = proxy + .await? + .err() // -> Option + .context("server shouldn't accept client")?; + + Ok(()) + } } diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs new file mode 100644 index 0000000000..70a4d9946a --- /dev/null +++ b/proxy/src/sasl.rs @@ -0,0 +1,47 @@ +//! Simple Authentication and Security Layer. +//! +//! RFC: . +//! +//! Reference implementation: +//! * +//! * + +mod channel_binding; +mod messages; +mod stream; + +use std::io; +use thiserror::Error; + +pub use channel_binding::ChannelBinding; +pub use messages::FirstMessage; +pub use stream::SaslStream; + +/// Fine-grained auth errors help in writing tests. +#[derive(Error, Debug)] +pub enum Error { + #[error("Failed to authenticate client: {0}")] + AuthenticationFailed(&'static str), + + #[error("Channel binding failed: {0}")] + ChannelBindingFailed(&'static str), + + #[error("Unsupported channel binding method: {0}")] + ChannelBindingBadMethod(Box), + + #[error("Bad client message")] + BadClientMessage, + + #[error(transparent)] + Io(#[from] io::Error), +} + +/// A convenient result type for SASL exchange. +pub type Result = std::result::Result; + +/// Every SASL mechanism (e.g. [SCRAM](crate::scram)) is expected to implement this trait. +pub trait Mechanism: Sized { + /// Produce a server challenge to be sent to the client. + /// This is how this method is called in PostgreSQL (`libpq/sasl.h`). + fn exchange(self, input: &str) -> Result<(Option, String)>; +} diff --git a/proxy/src/sasl/channel_binding.rs b/proxy/src/sasl/channel_binding.rs new file mode 100644 index 0000000000..776adabe55 --- /dev/null +++ b/proxy/src/sasl/channel_binding.rs @@ -0,0 +1,85 @@ +//! Definition and parser for channel binding flag (a part of the `GS2` header). + +/// Channel binding flag (possibly with params). +#[derive(Debug, PartialEq, Eq)] +pub enum ChannelBinding { + /// Client doesn't support channel binding. + NotSupportedClient, + /// Client thinks server doesn't support channel binding. + NotSupportedServer, + /// Client wants to use this type of channel binding. 
+ Required(T), +} + +impl ChannelBinding { + pub fn and_then(self, f: impl FnOnce(T) -> Result) -> Result, E> { + use ChannelBinding::*; + Ok(match self { + NotSupportedClient => NotSupportedClient, + NotSupportedServer => NotSupportedServer, + Required(x) => Required(f(x)?), + }) + } +} + +impl<'a> ChannelBinding<&'a str> { + // NB: FromStr doesn't work with lifetimes + pub fn parse(input: &'a str) -> Option { + use ChannelBinding::*; + Some(match input { + "n" => NotSupportedClient, + "y" => NotSupportedServer, + other => Required(other.strip_prefix("p=")?), + }) + } +} + +impl ChannelBinding { + /// Encode channel binding data as base64 for subsequent checks. + pub fn encode( + &self, + get_cbind_data: impl FnOnce(&T) -> Result, + ) -> Result, E> { + use ChannelBinding::*; + Ok(match self { + NotSupportedClient => { + // base64::encode("n,,") + "biws".into() + } + NotSupportedServer => { + // base64::encode("y,,") + "eSws".into() + } + Required(mode) => { + let msg = format!( + "p={mode},,{data}", + mode = mode, + data = get_cbind_data(mode)? + ); + base64::encode(msg).into() + } + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn channel_binding_encode() -> anyhow::Result<()> { + use ChannelBinding::*; + + let cases = [ + (NotSupportedClient, base64::encode("n,,")), + (NotSupportedServer, base64::encode("y,,")), + (Required("foo"), base64::encode("p=foo,,bar")), + ]; + + for (cb, input) in cases { + assert_eq!(cb.encode(|_| anyhow::Ok("bar".to_owned()))?, input); + } + + Ok(()) + } +} diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs new file mode 100644 index 0000000000..b1ae8cc426 --- /dev/null +++ b/proxy/src/sasl/messages.rs @@ -0,0 +1,67 @@ +//! Definitions for SASL messages. + +use crate::parse::{split_at_const, split_cstr}; +use zenith_utils::pq_proto::{BeAuthenticationSaslMessage, BeMessage}; + +/// SASL-specific payload of [`PasswordMessage`](zenith_utils::pq_proto::FeMessage::PasswordMessage). +#[derive(Debug)] +pub struct FirstMessage<'a> { + /// Authentication method, e.g. `"SCRAM-SHA-256"`. + pub method: &'a str, + /// Initial client message. + pub message: &'a str, +} + +impl<'a> FirstMessage<'a> { + // NB: FromStr doesn't work with lifetimes + pub fn parse(bytes: &'a [u8]) -> Option { + let (method_cstr, tail) = split_cstr(bytes)?; + let method = method_cstr.to_str().ok()?; + + let (len_bytes, bytes) = split_at_const(tail)?; + let len = u32::from_be_bytes(*len_bytes) as usize; + if len != bytes.len() { + return None; + } + + let message = std::str::from_utf8(bytes).ok()?; + Some(Self { method, message }) + } +} + +/// A single SASL message. +/// This struct is deliberately decoupled from lower-level +/// [`BeAuthenticationSaslMessage`](zenith_utils::pq_proto::BeAuthenticationSaslMessage). +#[derive(Debug)] +pub(super) enum ServerMessage { + /// We expect to see more steps. + Continue(T), + /// This is the final step. 
+ Final(T), +} + +impl<'a> ServerMessage<&'a str> { + pub(super) fn to_reply(&self) -> BeMessage<'a> { + use BeAuthenticationSaslMessage::*; + BeMessage::AuthenticationSasl(match self { + ServerMessage::Continue(s) => Continue(s.as_bytes()), + ServerMessage::Final(s) => Final(s.as_bytes()), + }) + } +} +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_sasl_first_message() { + let proto = "SCRAM-SHA-256"; + let sasl = "n,,n=,r=KHQ2Gjc7NptyB8aov5/TnUy4"; + let sasl_len = (sasl.len() as u32).to_be_bytes(); + let bytes = [proto.as_bytes(), &[0], sasl_len.as_ref(), sasl.as_bytes()].concat(); + + let password = FirstMessage::parse(&bytes).unwrap(); + assert_eq!(password.method, proto); + assert_eq!(password.message, sasl); + } +} diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs new file mode 100644 index 0000000000..03649b8d11 --- /dev/null +++ b/proxy/src/sasl/stream.rs @@ -0,0 +1,70 @@ +//! Abstraction for the string-oriented SASL protocols. + +use super::{messages::ServerMessage, Mechanism}; +use crate::stream::PqStream; +use std::io; +use tokio::io::{AsyncRead, AsyncWrite}; + +/// Abstracts away all peculiarities of the libpq's protocol. +pub struct SaslStream<'a, S> { + /// The underlying stream. + stream: &'a mut PqStream, + /// Current password message we received from client. + current: bytes::Bytes, + /// First SASL message produced by client. + first: Option<&'a str>, +} + +impl<'a, S> SaslStream<'a, S> { + pub fn new(stream: &'a mut PqStream, first: &'a str) -> Self { + Self { + stream, + current: bytes::Bytes::new(), + first: Some(first), + } + } +} + +impl SaslStream<'_, S> { + // Receive a new SASL message from the client. + async fn recv(&mut self) -> io::Result<&str> { + if let Some(first) = self.first.take() { + return Ok(first); + } + + self.current = self.stream.read_password_message().await?; + let s = std::str::from_utf8(&self.current) + .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "bad encoding"))?; + + Ok(s) + } +} + +impl SaslStream<'_, S> { + // Send a SASL message to the client. + async fn send(&mut self, msg: &ServerMessage<&str>) -> io::Result<()> { + self.stream.write_message(&msg.to_reply()).await?; + Ok(()) + } +} + +impl SaslStream<'_, S> { + /// Perform SASL message exchange according to the underlying algorithm + /// until user is either authenticated or denied access. + pub async fn authenticate(mut self, mut mechanism: impl Mechanism) -> super::Result<()> { + loop { + let input = self.recv().await?; + let (moved, reply) = mechanism.exchange(input)?; + match moved { + Some(moved) => { + self.send(&ServerMessage::Continue(&reply)).await?; + mechanism = moved; + } + None => { + self.send(&ServerMessage::Final(&reply)).await?; + return Ok(()); + } + } + } + } +} diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs new file mode 100644 index 0000000000..f007f3e0b6 --- /dev/null +++ b/proxy/src/scram.rs @@ -0,0 +1,59 @@ +//! Salted Challenge Response Authentication Mechanism. +//! +//! RFC: . +//! +//! Reference implementation: +//! * +//! * + +mod exchange; +mod key; +mod messages; +mod password; +mod secret; +mod signature; + +pub use secret::*; + +pub use exchange::Exchange; +pub use secret::ServerSecret; + +use hmac::{Hmac, Mac, NewMac}; +use sha2::{Digest, Sha256}; + +// TODO: add SCRAM-SHA-256-PLUS +/// A list of supported SCRAM methods. 
+pub const METHODS: &[&str] = &["SCRAM-SHA-256"]; + +/// Decode base64 into array without any heap allocations +fn base64_decode_array(input: impl AsRef<[u8]>) -> Option<[u8; N]> { + let mut bytes = [0u8; N]; + + let size = base64::decode_config_slice(input, base64::STANDARD, &mut bytes).ok()?; + if size != N { + return None; + } + + Some(bytes) +} + +/// This function essentially is `Hmac(sha256, key, input)`. +/// Further reading: . +fn hmac_sha256<'a>(key: &[u8], parts: impl IntoIterator) -> [u8; 32] { + let mut mac = Hmac::::new_varkey(key).expect("bad key size"); + parts.into_iter().for_each(|s| mac.update(s)); + + // TODO: maybe newer `hmac` et al already migrated to regular arrays? + let mut result = [0u8; 32]; + result.copy_from_slice(mac.finalize().into_bytes().as_slice()); + result +} + +fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { + let mut hasher = Sha256::new(); + parts.into_iter().for_each(|s| hasher.update(s)); + + let mut result = [0u8; 32]; + result.copy_from_slice(hasher.finalize().as_slice()); + result +} diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs new file mode 100644 index 0000000000..5a986b965a --- /dev/null +++ b/proxy/src/scram/exchange.rs @@ -0,0 +1,134 @@ +//! Implementation of the SCRAM authentication algorithm. + +use super::messages::{ + ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN, +}; +use super::secret::ServerSecret; +use super::signature::SignatureBuilder; +use crate::sasl::{self, ChannelBinding, Error as SaslError}; + +/// The only channel binding mode we currently support. +#[derive(Debug)] +struct TlsServerEndPoint; + +impl std::fmt::Display for TlsServerEndPoint { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "tls-server-end-point") + } +} + +impl std::str::FromStr for TlsServerEndPoint { + type Err = sasl::Error; + + fn from_str(s: &str) -> Result { + match s { + "tls-server-end-point" => Ok(TlsServerEndPoint), + _ => Err(sasl::Error::ChannelBindingBadMethod(s.into())), + } + } +} + +#[derive(Debug)] +enum ExchangeState { + /// Waiting for [`ClientFirstMessage`]. + Initial, + /// Waiting for [`ClientFinalMessage`]. + SaltSent { + cbind_flag: ChannelBinding, + client_first_message_bare: String, + server_first_message: OwnedServerFirstMessage, + }, +} + +/// Server's side of SCRAM auth algorithm. 
+#[derive(Debug)] +pub struct Exchange<'a> { + state: ExchangeState, + secret: &'a ServerSecret, + nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN], + cert_digest: Option<&'a [u8]>, +} + +impl<'a> Exchange<'a> { + pub fn new( + secret: &'a ServerSecret, + nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN], + cert_digest: Option<&'a [u8]>, + ) -> Self { + Self { + state: ExchangeState::Initial, + secret, + nonce, + cert_digest, + } + } +} + +impl sasl::Mechanism for Exchange<'_> { + fn exchange(mut self, input: &str) -> sasl::Result<(Option, String)> { + use ExchangeState::*; + match &self.state { + Initial => { + let client_first_message = + ClientFirstMessage::parse(input).ok_or(SaslError::BadClientMessage)?; + + let server_first_message = client_first_message.build_server_first_message( + &(self.nonce)(), + &self.secret.salt_base64, + self.secret.iterations, + ); + let msg = server_first_message.as_str().to_owned(); + + self.state = SaltSent { + cbind_flag: client_first_message.cbind_flag.and_then(str::parse)?, + client_first_message_bare: client_first_message.bare.to_owned(), + server_first_message, + }; + + Ok((Some(self), msg)) + } + SaltSent { + cbind_flag, + client_first_message_bare, + server_first_message, + } => { + let client_final_message = + ClientFinalMessage::parse(input).ok_or(SaslError::BadClientMessage)?; + + let channel_binding = cbind_flag.encode(|_| { + self.cert_digest + .map(base64::encode) + .ok_or(SaslError::ChannelBindingFailed("no cert digest provided")) + })?; + + // This might've been caused by a MITM attack + if client_final_message.channel_binding != channel_binding { + return Err(SaslError::ChannelBindingFailed("data mismatch")); + } + + if client_final_message.nonce != server_first_message.nonce() { + return Err(SaslError::AuthenticationFailed("bad nonce")); + } + + let signature_builder = SignatureBuilder { + client_first_message_bare, + server_first_message: server_first_message.as_str(), + client_final_message_without_proof: client_final_message.without_proof, + }; + + let client_key = signature_builder + .build(&self.secret.stored_key) + .derive_client_key(&client_final_message.proof); + + if client_key.sha256() != self.secret.stored_key { + return Err(SaslError::AuthenticationFailed("keys don't match")); + } + + let msg = client_final_message + .build_server_final_message(signature_builder, &self.secret.server_key); + + Ok((None, msg)) + } + } + } +} diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs new file mode 100644 index 0000000000..1c13471bc3 --- /dev/null +++ b/proxy/src/scram/key.rs @@ -0,0 +1,33 @@ +//! Tools for client/server/stored key management. + +/// Faithfully taken from PostgreSQL. +pub const SCRAM_KEY_LEN: usize = 32; + +/// One of the keys derived from the [password](super::password::SaltedPassword). +/// We use the same structure for all keys, i.e. +/// `ClientKey`, `StoredKey`, and `ServerKey`. 
+#[derive(Default, Debug, PartialEq, Eq)] +#[repr(transparent)] +pub struct ScramKey { + bytes: [u8; SCRAM_KEY_LEN], +} + +impl ScramKey { + pub fn sha256(&self) -> Self { + super::sha256([self.as_ref()]).into() + } +} + +impl From<[u8; SCRAM_KEY_LEN]> for ScramKey { + #[inline(always)] + fn from(bytes: [u8; SCRAM_KEY_LEN]) -> Self { + Self { bytes } + } +} + +impl AsRef<[u8]> for ScramKey { + #[inline(always)] + fn as_ref(&self) -> &[u8] { + &self.bytes + } +} diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs new file mode 100644 index 0000000000..f6e6133adf --- /dev/null +++ b/proxy/src/scram/messages.rs @@ -0,0 +1,232 @@ +//! Definitions for SCRAM messages. + +use super::base64_decode_array; +use super::key::{ScramKey, SCRAM_KEY_LEN}; +use super::signature::SignatureBuilder; +use crate::sasl::ChannelBinding; +use std::fmt; +use std::ops::Range; + +/// Faithfully taken from PostgreSQL. +pub const SCRAM_RAW_NONCE_LEN: usize = 18; + +/// Although we ignore all extensions, we still have to validate the message. +fn validate_sasl_extensions<'a>(parts: impl Iterator) -> Option<()> { + for mut chars in parts.map(|s| s.chars()) { + let attr = chars.next()?; + if !('a'..'z').contains(&attr) && !('A'..'Z').contains(&attr) { + return None; + } + let eq = chars.next()?; + if eq != '=' { + return None; + } + } + + Some(()) +} + +#[derive(Debug)] +pub struct ClientFirstMessage<'a> { + /// `client-first-message-bare`. + pub bare: &'a str, + /// Channel binding mode. + pub cbind_flag: ChannelBinding<&'a str>, + /// (Client username)[]. + pub username: &'a str, + /// Client nonce. + pub nonce: &'a str, +} + +impl<'a> ClientFirstMessage<'a> { + // NB: FromStr doesn't work with lifetimes + pub fn parse(input: &'a str) -> Option { + let mut parts = input.split(','); + + let cbind_flag = ChannelBinding::parse(parts.next()?)?; + + // PG doesn't support authorization identity, + // so we don't bother defining GS2 header type + let authzid = parts.next()?; + if !authzid.is_empty() { + return None; + } + + // Unfortunately, `parts.as_str()` is unstable + let pos = authzid.as_ptr() as usize - input.as_ptr() as usize + 1; + let (_, bare) = input.split_at(pos); + + // In theory, these might be preceded by "reserved-mext" (i.e. "m=") + let username = parts.next()?.strip_prefix("n=")?; + let nonce = parts.next()?.strip_prefix("r=")?; + + // Validate but ignore auth extensions + validate_sasl_extensions(parts)?; + + Some(Self { + bare, + cbind_flag, + username, + nonce, + }) + } + + /// Build a response to [`ClientFirstMessage`]. + pub fn build_server_first_message( + &self, + nonce: &[u8; SCRAM_RAW_NONCE_LEN], + salt_base64: &str, + iterations: u32, + ) -> OwnedServerFirstMessage { + use std::fmt::Write; + + let mut message = String::new(); + write!(&mut message, "r={}", self.nonce).unwrap(); + base64::encode_config_buf(nonce, base64::STANDARD, &mut message); + let combined_nonce = 2..message.len(); + write!(&mut message, ",s={},i={}", salt_base64, iterations).unwrap(); + + // This design guarantees that it's impossible to create a + // server-first-message without receiving a client-first-message + OwnedServerFirstMessage { + message, + nonce: combined_nonce, + } + } +} + +#[derive(Debug)] +pub struct ClientFinalMessage<'a> { + /// `client-final-message-without-proof`. + pub without_proof: &'a str, + /// Channel binding data (base64). + pub channel_binding: &'a str, + /// Combined client & server nonce. + pub nonce: &'a str, + /// Client auth proof. 
+ pub proof: [u8; SCRAM_KEY_LEN], +} + +impl<'a> ClientFinalMessage<'a> { + // NB: FromStr doesn't work with lifetimes + pub fn parse(input: &'a str) -> Option { + let (without_proof, proof) = input.rsplit_once(',')?; + + let mut parts = without_proof.split(','); + let channel_binding = parts.next()?.strip_prefix("c=")?; + let nonce = parts.next()?.strip_prefix("r=")?; + + // Validate but ignore auth extensions + validate_sasl_extensions(parts)?; + + let proof = base64_decode_array(proof.strip_prefix("p=")?)?; + + Some(Self { + without_proof, + channel_binding, + nonce, + proof, + }) + } + + /// Build a response to [`ClientFinalMessage`]. + pub fn build_server_final_message( + &self, + signature_builder: SignatureBuilder, + server_key: &ScramKey, + ) -> String { + let mut buf = String::from("v="); + base64::encode_config_buf( + signature_builder.build(server_key), + base64::STANDARD, + &mut buf, + ); + + buf + } +} + +/// We need to keep a convenient representation of this +/// message for the next authentication step. +pub struct OwnedServerFirstMessage { + /// Owned `server-first-message`. + message: String, + /// Slice into `message`. + nonce: Range, +} + +impl OwnedServerFirstMessage { + /// Extract combined nonce from the message. + #[inline(always)] + pub fn nonce(&self) -> &str { + &self.message[self.nonce.clone()] + } + + /// Get reference to a text representation of the message. + #[inline(always)] + pub fn as_str(&self) -> &str { + &self.message + } +} + +impl fmt::Debug for OwnedServerFirstMessage { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ServerFirstMessage") + .field("message", &self.as_str()) + .field("nonce", &self.nonce()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_client_first_message() { + use ChannelBinding::*; + + // (Almost) real strings captured during debug sessions + let cases = [ + (NotSupportedClient, "n,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"), + (NotSupportedServer, "y,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"), + ( + Required("tls-server-end-point"), + "p=tls-server-end-point,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju", + ), + ]; + + for (cb, input) in cases { + let msg = ClientFirstMessage::parse(input).unwrap(); + + assert_eq!(msg.bare, "n=pepe,r=t8JwklwKecDLwSsA72rHmVju"); + assert_eq!(msg.username, "pepe"); + assert_eq!(msg.nonce, "t8JwklwKecDLwSsA72rHmVju"); + assert_eq!(msg.cbind_flag, cb); + } + } + + #[test] + fn parse_client_final_message() { + let input = [ + "c=eSws", + "r=iiYEfS3rOgn8S3rtpSdrOsHtPLWvIkdgmHxA0hf3JNOAG4dU", + "p=SRpfsIVS4Gk11w1LqQ4QvCUBZYQmqXNSDEcHqbQ3CHI=", + ] + .join(","); + + let msg = ClientFinalMessage::parse(&input).unwrap(); + assert_eq!( + msg.without_proof, + "c=eSws,r=iiYEfS3rOgn8S3rtpSdrOsHtPLWvIkdgmHxA0hf3JNOAG4dU" + ); + assert_eq!( + msg.nonce, + "iiYEfS3rOgn8S3rtpSdrOsHtPLWvIkdgmHxA0hf3JNOAG4dU" + ); + assert_eq!( + base64::encode(msg.proof), + "SRpfsIVS4Gk11w1LqQ4QvCUBZYQmqXNSDEcHqbQ3CHI=" + ); + } +} diff --git a/proxy/src/scram/password.rs b/proxy/src/scram/password.rs new file mode 100644 index 0000000000..656780d853 --- /dev/null +++ b/proxy/src/scram/password.rs @@ -0,0 +1,48 @@ +//! Password hashing routines. + +use super::key::ScramKey; + +pub const SALTED_PASSWORD_LEN: usize = 32; + +/// Salted hashed password is essential for [key](super::key) derivation. +#[repr(transparent)] +pub struct SaltedPassword { + bytes: [u8; SALTED_PASSWORD_LEN], +} + +impl SaltedPassword { + /// See `scram-common.c : scram_SaltedPassword` for details. 
+ /// Further reading: (see `PBKDF2`). + pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword { + let one = 1_u32.to_be_bytes(); // magic + + let mut current = super::hmac_sha256(password, [salt, &one]); + let mut result = current; + for _ in 1..iterations { + current = super::hmac_sha256(password, [current.as_ref()]); + // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094 + for (i, x) in current.iter().enumerate() { + result[i] ^= x; + } + } + + result.into() + } + + /// Derive `ClientKey` from a salted hashed password. + pub fn client_key(&self) -> ScramKey { + super::hmac_sha256(&self.bytes, [b"Client Key".as_ref()]).into() + } + + /// Derive `ServerKey` from a salted hashed password. + pub fn server_key(&self) -> ScramKey { + super::hmac_sha256(&self.bytes, [b"Server Key".as_ref()]).into() + } +} + +impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword { + #[inline(always)] + fn from(bytes: [u8; SALTED_PASSWORD_LEN]) -> Self { + Self { bytes } + } +} diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs new file mode 100644 index 0000000000..e8d180bcdd --- /dev/null +++ b/proxy/src/scram/secret.rs @@ -0,0 +1,116 @@ +//! Tools for SCRAM server secret management. + +use super::base64_decode_array; +use super::key::ScramKey; + +/// Server secret is produced from [password](super::password::SaltedPassword) +/// and is used throughout the authentication process. +#[derive(Debug)] +pub struct ServerSecret { + /// Number of iterations for `PBKDF2` function. + pub iterations: u32, + /// Salt used to hash user's password. + pub salt_base64: String, + /// Hashed `ClientKey`. + pub stored_key: ScramKey, + /// Used by client to verify server's signature. + pub server_key: ScramKey, +} + +impl ServerSecret { + pub fn parse(input: &str) -> Option { + // SCRAM-SHA-256$:$: + let s = input.strip_prefix("SCRAM-SHA-256$")?; + let (params, keys) = s.split_once('$')?; + + let ((iterations, salt), (stored_key, server_key)) = + params.split_once(':').zip(keys.split_once(':'))?; + + let secret = ServerSecret { + iterations: iterations.parse().ok()?, + salt_base64: salt.to_owned(), + stored_key: base64_decode_array(stored_key)?.into(), + server_key: base64_decode_array(server_key)?.into(), + }; + + Some(secret) + } + + /// To avoid revealing information to an attacker, we use a + /// mocked server secret even if the user doesn't exist. + /// See `auth-scram.c : mock_scram_secret` for details. + pub fn mock(user: &str, nonce: &[u8; 32]) -> Self { + // Refer to `auth-scram.c : scram_mock_salt`. + let mocked_salt = super::sha256([user.as_bytes(), nonce]); + + Self { + iterations: 4096, + salt_base64: base64::encode(&mocked_salt), + stored_key: ScramKey::default(), + server_key: ScramKey::default(), + } + } + + /// Build a new server secret from the prerequisites. + /// XXX: We only use this function in tests. 
+ #[cfg(test)] + pub fn build(password: &str, salt: &[u8], iterations: u32) -> Option { + // TODO: implement proper password normalization required by the RFC + if !password.is_ascii() { + return None; + } + + let password = super::password::SaltedPassword::new(password.as_bytes(), salt, iterations); + + Some(Self { + iterations, + salt_base64: base64::encode(&salt), + stored_key: password.client_key().sha256(), + server_key: password.server_key(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_scram_secret() { + let iterations = 4096; + let salt = "+/tQQax7twvwTj64mjBsxQ=="; + let stored_key = "D5h6KTMBlUvDJk2Y8ELfC1Sjtc6k9YHjRyuRZyBNJns="; + let server_key = "Pi3QHbcluX//NDfVkKlFl88GGzlJ5LkyPwcdlN/QBvI="; + + let secret = format!( + "SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}", + iterations = iterations, + salt = salt, + stored_key = stored_key, + server_key = server_key, + ); + + let parsed = ServerSecret::parse(&secret).unwrap(); + assert_eq!(parsed.iterations, iterations); + assert_eq!(parsed.salt_base64, salt); + + assert_eq!(base64::encode(parsed.stored_key), stored_key); + assert_eq!(base64::encode(parsed.server_key), server_key); + } + + #[test] + fn build_scram_secret() { + let salt = b"salt"; + let secret = ServerSecret::build("password", salt, 4096).unwrap(); + assert_eq!(secret.iterations, 4096); + assert_eq!(secret.salt_base64, base64::encode(salt)); + assert_eq!( + base64::encode(secret.stored_key.as_ref()), + "lF4cRm/Jky763CN4HtxdHnjV4Q8AWTNlKvGmEFFU8IQ=" + ); + assert_eq!( + base64::encode(secret.server_key.as_ref()), + "ub8OgRsftnk2ccDMOt7ffHXNcikRkQkq1lh4xaAqrSw=" + ); + } +} diff --git a/proxy/src/scram/signature.rs b/proxy/src/scram/signature.rs new file mode 100644 index 0000000000..1c2811d757 --- /dev/null +++ b/proxy/src/scram/signature.rs @@ -0,0 +1,66 @@ +//! Tools for client/server signature management. + +use super::key::{ScramKey, SCRAM_KEY_LEN}; + +/// A collection of message parts needed to derive the client's signature. +#[derive(Debug)] +pub struct SignatureBuilder<'a> { + pub client_first_message_bare: &'a str, + pub server_first_message: &'a str, + pub client_final_message_without_proof: &'a str, +} + +impl SignatureBuilder<'_> { + pub fn build(&self, key: &ScramKey) -> Signature { + let parts = [ + self.client_first_message_bare.as_bytes(), + b",", + self.server_first_message.as_bytes(), + b",", + self.client_final_message_without_proof.as_bytes(), + ]; + + super::hmac_sha256(key.as_ref(), parts).into() + } +} + +/// A computed value which, when xored with `ClientProof`, +/// produces `ClientKey` that we need for authentication. +#[derive(Debug)] +#[repr(transparent)] +pub struct Signature { + bytes: [u8; SCRAM_KEY_LEN], +} + +impl Signature { + /// Derive `ClientKey` from client's signature and proof. + pub fn derive_client_key(&self, proof: &[u8; SCRAM_KEY_LEN]) -> ScramKey { + // This is how the proof is calculated: + // + // 1. sha256(ClientKey) -> StoredKey + // 2. hmac_sha256(StoredKey, [messages...]) -> ClientSignature + // 3. ClientKey ^ ClientSignature -> ClientProof + // + // Step 3 implies that we can restore ClientKey from the proof + // by xoring the latter with the ClientSignature. Afterwards we + // can check that the presumed ClientKey meets our expectations. 
+ let mut signature = self.bytes; + for (i, x) in proof.iter().enumerate() { + signature[i] ^= x; + } + + signature.into() + } +} + +impl From<[u8; SCRAM_KEY_LEN]> for Signature { + fn from(bytes: [u8; SCRAM_KEY_LEN]) -> Self { + Self { bytes } + } +} + +impl AsRef<[u8]> for Signature { + fn as_ref(&self) -> &[u8] { + &self.bytes + } +} diff --git a/zenith_utils/src/postgres_backend.rs b/zenith_utils/src/postgres_backend.rs index 83792f2aca..f984fb4417 100644 --- a/zenith_utils/src/postgres_backend.rs +++ b/zenith_utils/src/postgres_backend.rs @@ -375,9 +375,8 @@ impl PostgresBackend { } AuthType::MD5 => { rand::thread_rng().fill(&mut self.md5_salt); - let md5_salt = self.md5_salt; self.write_message(&BeMessage::AuthenticationMD5Password( - &md5_salt, + self.md5_salt, ))?; self.state = ProtoState::Authentication; } diff --git a/zenith_utils/src/pq_proto.rs b/zenith_utils/src/pq_proto.rs index cb69418c07..403e176b14 100644 --- a/zenith_utils/src/pq_proto.rs +++ b/zenith_utils/src/pq_proto.rs @@ -401,7 +401,8 @@ fn read_null_terminated(buf: &mut Bytes) -> anyhow::Result { #[derive(Debug)] pub enum BeMessage<'a> { AuthenticationOk, - AuthenticationMD5Password(&'a [u8; 4]), + AuthenticationMD5Password([u8; 4]), + AuthenticationSasl(BeAuthenticationSaslMessage<'a>), AuthenticationCleartextPassword, BackendKeyData(CancelKeyData), BindComplete, @@ -429,6 +430,13 @@ pub enum BeMessage<'a> { KeepAlive(WalSndKeepAlive), } +#[derive(Debug)] +pub enum BeAuthenticationSaslMessage<'a> { + Methods(&'a [&'a str]), + Continue(&'a [u8]), + Final(&'a [u8]), +} + #[derive(Debug)] pub enum BeParameterStatusMessage<'a> { Encoding(&'a str), @@ -611,6 +619,32 @@ impl<'a> BeMessage<'a> { .unwrap(); // write into BytesMut can't fail } + BeMessage::AuthenticationSasl(msg) => { + buf.put_u8(b'R'); + write_body(buf, |buf| { + use BeAuthenticationSaslMessage::*; + match msg { + Methods(methods) => { + buf.put_i32(10); // Specifies that SASL auth method is used. + for method in methods.iter() { + write_cstr(method.as_bytes(), buf)?; + } + buf.put_u8(0); // zero terminator for the list + } + Continue(extra) => { + buf.put_i32(11); // Continue SASL auth. + buf.put_slice(extra); + } + Final(extra) => { + buf.put_i32(12); // Send final SASL message. + buf.put_slice(extra); + } + } + Ok::<_, io::Error>(()) + }) + .unwrap() + } + BeMessage::BackendKeyData(key_data) => { buf.put_u8(b'K'); write_body(buf, |buf| { From 9b7a8e67a4ccd0957afd46d857d81374126fb255 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 12 Apr 2022 23:57:33 +0300 Subject: [PATCH 0125/1022] fix deadlock in upload_timeline_checkpoint It originated from the fact that we were calling to fetch_full_index without releasing the read guard, and fetch_full_index tries to acquire read again. For plain mutex it is already a deeadlock, for RW lock deadlock was achieved by an attempt to acquire write access later in the code while still having active read guard up in the stack This is sort of a bandaid because Kirill plans to change this code during removal of an archiving mechanism --- .../src/remote_storage/storage_sync/upload.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/remote_storage/storage_sync/upload.rs index f955e04474..7b6d58a661 100644 --- a/pageserver/src/remote_storage/storage_sync/upload.rs +++ b/pageserver/src/remote_storage/storage_sync/upload.rs @@ -1,6 +1,6 @@ //! 
Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints. -use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc}; +use std::{collections::BTreeSet, path::PathBuf, sync::Arc}; use tracing::{debug, error, warn}; @@ -46,13 +46,21 @@ pub(super) async fn upload_timeline_checkpoint< let index_read = index.read().await; let remote_timeline = match index_read.timeline_entry(&sync_id) { - None => None, + None => { + drop(index_read); + None + } Some(entry) => match entry.inner() { - TimelineIndexEntryInner::Full(remote_timeline) => Some(Cow::Borrowed(remote_timeline)), + TimelineIndexEntryInner::Full(remote_timeline) => { + let r = Some(remote_timeline.clone()); + drop(index_read); + r + } TimelineIndexEntryInner::Description(_) => { + drop(index_read); debug!("Found timeline description for the given ids, downloading the full index"); match fetch_full_index(remote_assets.as_ref(), &timeline_dir, sync_id).await { - Ok(remote_timeline) => Some(Cow::Owned(remote_timeline)), + Ok(remote_timeline) => Some(remote_timeline), Err(e) => { error!("Failed to download full timeline index: {:?}", e); sync_queue::push(SyncTask::new( @@ -82,7 +90,6 @@ pub(super) async fn upload_timeline_checkpoint< let already_uploaded_files = remote_timeline .map(|timeline| timeline.stored_files(&timeline_dir)) .unwrap_or_default(); - drop(index_read); match try_upload_checkpoint( config, From 20414c4b16143e1757816c1cd015c01c5343b28d Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 13 Apr 2022 00:20:55 +0300 Subject: [PATCH 0126/1022] defuse possible deadlock in download_timeline too --- .../src/remote_storage/storage_sync/download.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/remote_storage/storage_sync/download.rs index 773b4a12e5..e5aa74452b 100644 --- a/pageserver/src/remote_storage/storage_sync/download.rs +++ b/pageserver/src/remote_storage/storage_sync/download.rs @@ -1,6 +1,6 @@ //! Timeline synchrnonization logic to put files from archives on remote storage into pageserver's local directory. 
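Both deadlock fixes follow the same recipe: clone the index entry out of the read guard and drop the guard before calling anything that takes the lock again, or before acquiring write access further down the stack. A minimal sketch of that shape, assuming tokio's RwLock; RemoteTimeline, Index and fetch_full_index here are simplified stand-ins, not the real storage-sync types:

    use std::collections::HashMap;
    use std::sync::Arc;
    use tokio::sync::RwLock;

    // Simplified stand-in for the remote index entry.
    #[derive(Clone, Debug)]
    struct RemoteTimeline {
        files: Vec<String>,
    }

    type Index = HashMap<u64, RemoteTimeline>;

    // Stand-in for fetch_full_index: it locks the index internally, so it
    // must not be called while our own read guard is still alive.
    async fn fetch_full_index(index: &RwLock<Index>, id: u64) -> Option<RemoteTimeline> {
        index.read().await.get(&id).cloned()
    }

    async fn sync_timeline(index: Arc<RwLock<Index>>, id: u64) -> Option<RemoteTimeline> {
        // Clone what we need and let the read guard drop at the end of this
        // block, instead of keeping a borrow of the guard (the old Cow::Borrowed).
        let cached = {
            let read = index.read().await;
            read.get(&id).cloned()
        };

        match cached {
            Some(timeline) => Some(timeline),
            // The guard is gone, so taking the lock again cannot deadlock us.
            None => fetch_full_index(&index, id).await,
        }
    }

The explicit drop(index_read) calls added in the hunk below do the same job in places where the guard would otherwise stay alive until the end of the enclosing match arm.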
-use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc}; +use std::{collections::BTreeSet, path::PathBuf, sync::Arc}; use anyhow::{ensure, Context}; use tokio::fs; @@ -64,11 +64,16 @@ pub(super) async fn download_timeline< let remote_timeline = match index_read.timeline_entry(&sync_id) { None => { error!("Cannot download: no timeline is present in the index for given id"); + drop(index_read); return DownloadedTimeline::Abort; } Some(index_entry) => match index_entry.inner() { - TimelineIndexEntryInner::Full(remote_timeline) => Cow::Borrowed(remote_timeline), + TimelineIndexEntryInner::Full(remote_timeline) => { + let cloned = remote_timeline.clone(); + drop(index_read); + cloned + } TimelineIndexEntryInner::Description(_) => { // we do not check here for awaits_download because it is ok // to call this function while the download is in progress @@ -84,7 +89,7 @@ pub(super) async fn download_timeline< ) .await { - Ok(remote_timeline) => Cow::Owned(remote_timeline), + Ok(remote_timeline) => remote_timeline, Err(e) => { error!("Failed to download full timeline index: {:?}", e); From 87020f81265b14db527177b075e78752becb24cc Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 13 Apr 2022 10:59:29 +0300 Subject: [PATCH 0127/1022] Fix CI staging deploy (#1499) - Remove stopped safekeeper from inventory - Fix github pages address after neon rename --- .circleci/ansible/staging.hosts | 1 - .circleci/config.yml | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index f6b7bf009f..69f058c2b9 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -5,7 +5,6 @@ zenith-us-stage-ps-2 console_region_id=27 [safekeepers] zenith-us-stage-sk-1 console_region_id=27 zenith-us-stage-sk-2 console_region_id=27 -zenith-us-stage-sk-3 console_region_id=27 zenith-us-stage-sk-4 console_region_id=27 [storage:children] diff --git a/.circleci/config.yml b/.circleci/config.yml index 9d26d5d558..f05e64072a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -405,7 +405,7 @@ jobs: - run: name: Build coverage report command: | - COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1 + COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1 scripts/coverage \ --dir=/tmp/zenith/coverage report \ @@ -416,8 +416,8 @@ jobs: name: Upload coverage report command: | LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME - REPORT_URL=https://zenithdb.github.io/zenith-coverage-data/$CIRCLE_SHA1 - COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1 + REPORT_URL=https://neondatabase.github.io/zenith-coverage-data/$CIRCLE_SHA1 + COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1 scripts/git-upload \ --repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-coverage-data.git \ @@ -593,7 +593,7 @@ jobs: name: Setup helm v3 command: | curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add zenithdb https://zenithdb.github.io/helm-charts + helm repo add zenithdb https://neondatabase.github.io/helm-charts - run: name: Re-deploy proxy command: | @@ -643,7 +643,7 @@ jobs: name: Setup helm v3 command: | curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add zenithdb https://zenithdb.github.io/helm-charts + helm repo add zenithdb https://neondatabase.github.io/helm-charts - run: name: Re-deploy proxy command: | From 
58d5136a615f2c42e26ad78c16eb5fff965335df Mon Sep 17 00:00:00 2001 From: Daniil Date: Wed, 13 Apr 2022 17:16:25 +0300 Subject: [PATCH 0128/1022] compute_tools: check writability handler (#941) --- Cargo.lock | 1 + compute_tools/Cargo.toml | 1 + compute_tools/src/bin/zenith_ctl.rs | 2 ++ compute_tools/src/checker.rs | 46 +++++++++++++++++++++++++++++ compute_tools/src/http_api.rs | 13 ++++++-- compute_tools/src/lib.rs | 1 + 6 files changed, 62 insertions(+), 2 deletions(-) create mode 100644 compute_tools/src/checker.rs diff --git a/Cargo.lock b/Cargo.lock index 7df1c4ab7a..0584b9d6d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -346,6 +346,7 @@ dependencies = [ "serde_json", "tar", "tokio", + "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", "workspace_hack", ] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 56047093f1..fc52ce4e83 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -17,4 +17,5 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1" tar = "0.4" tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] } +tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/compute_tools/src/bin/zenith_ctl.rs b/compute_tools/src/bin/zenith_ctl.rs index 49ba653fa1..372afbc633 100644 --- a/compute_tools/src/bin/zenith_ctl.rs +++ b/compute_tools/src/bin/zenith_ctl.rs @@ -38,6 +38,7 @@ use clap::Arg; use log::info; use postgres::{Client, NoTls}; +use compute_tools::checker::create_writablity_check_data; use compute_tools::config; use compute_tools::http_api::launch_http_server; use compute_tools::logger::*; @@ -128,6 +129,7 @@ fn run_compute(state: &Arc>) -> Result { handle_roles(&read_state.spec, &mut client)?; handle_databases(&read_state.spec, &mut client)?; + create_writablity_check_data(&mut client)?; // 'Close' connection drop(client); diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs new file mode 100644 index 0000000000..63da6ea23e --- /dev/null +++ b/compute_tools/src/checker.rs @@ -0,0 +1,46 @@ +use std::sync::{Arc, RwLock}; + +use anyhow::{anyhow, Result}; +use log::error; +use postgres::Client; +use tokio_postgres::NoTls; + +use crate::zenith::ComputeState; + +pub fn create_writablity_check_data(client: &mut Client) -> Result<()> { + let query = " + CREATE TABLE IF NOT EXISTS health_check ( + id serial primary key, + updated_at timestamptz default now() + ); + INSERT INTO health_check VALUES (1, now()) + ON CONFLICT (id) DO UPDATE + SET updated_at = now();"; + let result = client.simple_query(query)?; + if result.len() < 2 { + return Err(anyhow::format_err!("executed {} queries", result.len())); + } + Ok(()) +} + +pub async fn check_writability(state: &Arc>) -> Result<()> { + let connstr = state.read().unwrap().connstr.clone(); + let (client, connection) = tokio_postgres::connect(&connstr, NoTls).await?; + if client.is_closed() { + return Err(anyhow!("connection to postgres closed")); + } + tokio::spawn(async move { + if let Err(e) = connection.await { + error!("connection error: {}", e); + } + }); + + let result = client + .simple_query("UPDATE health_check SET updated_at = now() WHERE id = 1;") + .await?; + + if result.len() != 1 { + return Err(anyhow!("statement can't be executed")); + } + Ok(()) +} diff --git a/compute_tools/src/http_api.rs 
b/compute_tools/src/http_api.rs index 02fab08a6e..7e1a876044 100644 --- a/compute_tools/src/http_api.rs +++ b/compute_tools/src/http_api.rs @@ -11,7 +11,7 @@ use log::{error, info}; use crate::zenith::*; // Service function to handle all available routes. -fn routes(req: Request, state: Arc>) -> Response { +async fn routes(req: Request, state: Arc>) -> Response { match (req.method(), req.uri().path()) { // Timestamp of the last Postgres activity in the plain text. (&Method::GET, "/last_activity") => { @@ -29,6 +29,15 @@ fn routes(req: Request, state: Arc>) -> Response { + info!("serving /check_writability GET request"); + let res = crate::checker::check_writability(&state).await; + match res { + Ok(_) => Response::new(Body::from("true")), + Err(e) => Response::new(Body::from(e.to_string())), + } + } + // Return the `404 Not Found` for any other routes. _ => { let mut not_found = Response::new(Body::from("404 Not Found")); @@ -48,7 +57,7 @@ async fn serve(state: Arc>) { async move { Ok::<_, Infallible>(service_fn(move |req: Request| { let state = state.clone(); - async move { Ok::<_, Infallible>(routes(req, state)) } + async move { Ok::<_, Infallible>(routes(req, state).await) } })) } }); diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 592011d95e..ffb9700a49 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -2,6 +2,7 @@ //! Various tools and helpers to handle cluster / compute node (Postgres) //! configuration. //! +pub mod checker; pub mod config; pub mod http_api; #[macro_use] From 1fd08107cab279c8fd0a0a042a5a04ec58a4fe0d Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Mon, 11 Apr 2022 13:59:26 -0700 Subject: [PATCH 0129/1022] Add ps compaction_threshold config Signed-off-by: Dhammika Pathirana Add ps compaction_threadhold knob for (#707) (#1484) --- pageserver/src/config.rs | 22 +++++++++++++++++++++- pageserver/src/layered_repository.rs | 8 +++----- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 0d5cac8b4f..067073cd9b 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -36,8 +36,8 @@ pub mod defaults { // Target file size, when creating image and delta layers. // This parameter determines L1 layer file size. pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; - pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s"; + pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; pub const DEFAULT_GC_PERIOD: &str = "100 s"; @@ -65,6 +65,7 @@ pub mod defaults { #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes #compaction_period = '{DEFAULT_COMPACTION_PERIOD}' +#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}' #gc_period = '{DEFAULT_GC_PERIOD}' #gc_horizon = {DEFAULT_GC_HORIZON} @@ -107,6 +108,9 @@ pub struct PageServerConf { // How often to check if there's compaction work to be done. pub compaction_period: Duration, + // Level0 delta layer threshold for compaction. 
+ pub compaction_threshold: usize, + pub gc_horizon: u64, pub gc_period: Duration, @@ -162,6 +166,7 @@ struct PageServerConfigBuilder { compaction_target_size: BuilderValue, compaction_period: BuilderValue, + compaction_threshold: BuilderValue, gc_horizon: BuilderValue, gc_period: BuilderValue, @@ -198,6 +203,7 @@ impl Default for PageServerConfigBuilder { compaction_target_size: Set(DEFAULT_COMPACTION_TARGET_SIZE), compaction_period: Set(humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period")), + compaction_threshold: Set(DEFAULT_COMPACTION_THRESHOLD), gc_horizon: Set(DEFAULT_GC_HORIZON), gc_period: Set(humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period")), @@ -241,6 +247,10 @@ impl PageServerConfigBuilder { self.compaction_period = BuilderValue::Set(compaction_period) } + pub fn compaction_threshold(&mut self, compaction_threshold: usize) { + self.compaction_threshold = BuilderValue::Set(compaction_threshold) + } + pub fn gc_horizon(&mut self, gc_horizon: u64) { self.gc_horizon = BuilderValue::Set(gc_horizon) } @@ -313,6 +323,9 @@ impl PageServerConfigBuilder { compaction_period: self .compaction_period .ok_or(anyhow::anyhow!("missing compaction_period"))?, + compaction_threshold: self + .compaction_threshold + .ok_or(anyhow::anyhow!("missing compaction_threshold"))?, gc_horizon: self .gc_horizon .ok_or(anyhow::anyhow!("missing gc_horizon"))?, @@ -453,6 +466,9 @@ impl PageServerConf { builder.compaction_target_size(parse_toml_u64(key, item)?) } "compaction_period" => builder.compaction_period(parse_toml_duration(key, item)?), + "compaction_threshold" => { + builder.compaction_threshold(parse_toml_u64(key, item)? as usize) + } "gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?), "gc_period" => builder.gc_period(parse_toml_duration(key, item)?), "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?), @@ -590,6 +606,7 @@ impl PageServerConf { checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, compaction_target_size: 4 * 1024 * 1024, compaction_period: Duration::from_secs(10), + compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD, gc_horizon: defaults::DEFAULT_GC_HORIZON, gc_period: Duration::from_secs(10), wait_lsn_timeout: Duration::from_secs(60), @@ -662,6 +679,7 @@ checkpoint_distance = 111 # in bytes compaction_target_size = 111 # in bytes compaction_period = '111 s' +compaction_threshold = 2 gc_period = '222 s' gc_horizon = 222 @@ -700,6 +718,7 @@ id = 10 checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, compaction_target_size: defaults::DEFAULT_COMPACTION_TARGET_SIZE, compaction_period: humantime::parse_duration(defaults::DEFAULT_COMPACTION_PERIOD)?, + compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD, gc_horizon: defaults::DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?, wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, @@ -745,6 +764,7 @@ id = 10 checkpoint_distance: 111, compaction_target_size: 111, compaction_period: Duration::from_secs(111), + compaction_threshold: 2, gc_horizon: 222, gc_period: Duration::from_secs(222), wait_lsn_timeout: Duration::from_secs(111), diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 5e93e3389b..e178ba5222 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1680,13 +1680,11 @@ impl LayeredTimeline { fn compact_level0(&self, 
target_file_size: u64) -> Result<()> { let layers = self.layers.lock().unwrap(); - // We compact or "shuffle" the level-0 delta layers when 10 have - // accumulated. - static COMPACT_THRESHOLD: usize = 10; - let level0_deltas = layers.get_level0_deltas()?; - if level0_deltas.len() < COMPACT_THRESHOLD { + // We compact or "shuffle" the level-0 delta layers when they've + // accumulated over the compaction threshold. + if level0_deltas.len() < self.conf.compaction_threshold { return Ok(()); } drop(layers); From 49da76237bd073f3f5857d6476e7a2827115cadb Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 13 Apr 2022 18:56:27 +0300 Subject: [PATCH 0130/1022] remove noisy debug log message --- pageserver/src/layered_repository/block_io.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/pageserver/src/layered_repository/block_io.rs b/pageserver/src/layered_repository/block_io.rs index 2eba0aa403..d027b2f0e7 100644 --- a/pageserver/src/layered_repository/block_io.rs +++ b/pageserver/src/layered_repository/block_io.rs @@ -198,7 +198,6 @@ impl BlockWriter for BlockBuf { assert!(buf.len() == PAGE_SZ); let blknum = self.blocks.len(); self.blocks.push(buf); - tracing::info!("buffered block {}", blknum); Ok(blknum as u32) } } From 1d36c5a39e97006daa63b3cb2af0dee3cf1ee3e4 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 13 Apr 2022 19:19:44 +0300 Subject: [PATCH 0131/1022] reenable s3 on staging pagservers by default After deadlockk fix in https://github.com/neondatabase/neon/pull/1496 s3 seems to work normally. There is one more discovered issue but it is not a blocker so can be fixed separately. --- .circleci/ansible/deploy.yaml | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index 2112102aa7..508843812a 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -63,21 +63,18 @@ tags: - pageserver - # It seems that currently S3 integration does not play well - # even with fresh pageserver without a burden of old data. - # TODO: turn this back on once the issue is solved. 
- # - name: update remote storage (s3) config - # lineinfile: - # path: /storage/pageserver/data/pageserver.toml - # line: "{{ item }}" - # loop: - # - "[remote_storage]" - # - "bucket_name = '{{ bucket_name }}'" - # - "bucket_region = '{{ bucket_region }}'" - # - "prefix_in_bucket = '{{ inventory_hostname }}'" - # become: true - # tags: - # - pageserver + - name: update remote storage (s3) config + lineinfile: + path: /storage/pageserver/data/pageserver.toml + line: "{{ item }}" + loop: + - "[remote_storage]" + - "bucket_name = '{{ bucket_name }}'" + - "bucket_region = '{{ bucket_region }}'" + - "prefix_in_bucket = '{{ inventory_hostname }}'" + become: true + tags: + - pageserver - name: upload systemd service definition ansible.builtin.template: From a0781f229c5574ab4fdae6b63175b7da8846921d Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Wed, 13 Apr 2022 14:08:42 -0700 Subject: [PATCH 0132/1022] Add ps compact command Signed-off-by: Dhammika Pathirana Add ps compact command to api (#707) (#1484) --- pageserver/src/page_service.rs | 20 ++++++++++++++++++++ pageserver/src/repository.rs | 6 ++++-- test_runner/fixtures/compare_fixtures.py | 3 +++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index e7a4117b3e..c09b032e48 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -713,6 +713,26 @@ impl postgres_backend::Handler for PageServerHandler { Some(result.elapsed.as_millis().to_string().as_bytes()), ]))? .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + } else if query_string.starts_with("compact ") { + // Run compaction immediately on given timeline. + // FIXME This is just for tests. Don't expect this to be exposed to + // the users or the api. + + // compact + let re = Regex::new(r"^compact ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap(); + + let caps = re + .captures(query_string) + .with_context(|| format!("Invalid compact: '{}'", query_string))?; + + let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; + let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid) + .context("Couldn't load timeline")?; + timeline.tline.compact()?; + + pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("checkpoint ") { // Run checkpoint immediately on given timeline. diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 02334d3229..eda9a3168d 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -252,8 +252,10 @@ pub trait Repository: Send + Sync { checkpoint_before_gc: bool, ) -> Result; - /// perform one compaction iteration. - /// this function is periodically called by compactor thread. + /// Perform one compaction iteration. + /// This function is periodically called by compactor thread. + /// Also it can be explicitly requested per timeline through page server + /// api's 'compact' command. fn compaction_iteration(&self) -> Result<()>; /// detaches locally available timeline by stopping all threads and removing all the data. 
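The handler recognizes the new command with a small regex; a self-contained sketch of the same parsing, assuming the regex crate and returning the ids as plain hex strings instead of ZTenantId/ZTimelineId:

    use regex::Regex;

    // Same pattern the page service handler uses for "compact <tenantid> <timelineid>".
    fn parse_compact_command(query: &str) -> Option<(String, String)> {
        let re = Regex::new(r"^compact ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap();
        let caps = re.captures(query)?;
        Some((caps[1].to_string(), caps[2].to_string()))
    }

    fn main() {
        let query = "compact 0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d";
        let (tenant, timeline) = parse_compact_command(query).expect("valid compact command");
        println!("compacting tenant {} timeline {}", tenant, timeline);
    }

Tests issue the command over an ordinary libpq connection, as the compare fixture below does with its compact query.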
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 750b02c894..598ee10f8e 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -87,6 +87,9 @@ class ZenithCompare(PgCompare): def flush(self): self.pscur.execute(f"do_gc {self.env.initial_tenant.hex} {self.timeline} 0") + def compact(self): + self.pscur.execute(f"compact {self.env.initial_tenant.hex} {self.timeline}") + def report_peak_memory_use(self) -> None: self.zenbenchmark.record("peak_mem", self.zenbenchmark.get_peak_mem(self.env.pageserver) / 1024, From cdf04b6a9fb2d5d225d12a2a74fae6c6eec26da6 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 14 Apr 2022 09:31:35 +0300 Subject: [PATCH 0133/1022] Fix control file updates in safekeeper (#1452) Now control_file::Storage implements Deref for read-only access to the state. All updates should clone the state before modifying and persisting. --- walkeeper/src/control_file.rs | 57 ++++++++++++--- walkeeper/src/safekeeper.rs | 126 ++++++++++++++++++++-------------- walkeeper/src/timeline.rs | 16 ++--- 3 files changed, 127 insertions(+), 72 deletions(-) diff --git a/walkeeper/src/control_file.rs b/walkeeper/src/control_file.rs index 8b4e618661..7cc53edeb0 100644 --- a/walkeeper/src/control_file.rs +++ b/walkeeper/src/control_file.rs @@ -6,6 +6,7 @@ use lazy_static::lazy_static; use std::fs::{self, File, OpenOptions}; use std::io::{Read, Write}; +use std::ops::Deref; use std::path::{Path, PathBuf}; use tracing::*; @@ -37,8 +38,10 @@ lazy_static! { .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec"); } -pub trait Storage { - /// Persist safekeeper state on disk. +/// Storage should keep actual state inside of it. It should implement Deref +/// trait to access state fields and have persist method for updating that state. +pub trait Storage: Deref { + /// Persist safekeeper state on disk and update internal state. fn persist(&mut self, s: &SafeKeeperState) -> Result<()>; } @@ -48,19 +51,47 @@ pub struct FileStorage { timeline_dir: PathBuf, conf: SafeKeeperConf, persist_control_file_seconds: Histogram, + + /// Last state persisted to disk. + state: SafeKeeperState, } impl FileStorage { - pub fn new(zttid: &ZTenantTimelineId, conf: &SafeKeeperConf) -> FileStorage { + pub fn restore_new(zttid: &ZTenantTimelineId, conf: &SafeKeeperConf) -> Result { let timeline_dir = conf.timeline_dir(zttid); let tenant_id = zttid.tenant_id.to_string(); let timeline_id = zttid.timeline_id.to_string(); - FileStorage { + + let state = Self::load_control_file_conf(conf, zttid)?; + + Ok(FileStorage { timeline_dir, conf: conf.clone(), persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS .with_label_values(&[&tenant_id, &timeline_id]), - } + state, + }) + } + + pub fn create_new( + zttid: &ZTenantTimelineId, + conf: &SafeKeeperConf, + state: SafeKeeperState, + ) -> Result { + let timeline_dir = conf.timeline_dir(zttid); + let tenant_id = zttid.tenant_id.to_string(); + let timeline_id = zttid.timeline_id.to_string(); + + let mut store = FileStorage { + timeline_dir, + conf: conf.clone(), + persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS + .with_label_values(&[&tenant_id, &timeline_id]), + state: state.clone(), + }; + + store.persist(&state)?; + Ok(store) } // Check the magic/version in the on-disk data and deserialize it, if possible. 
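Reduced to a minimal sketch, the pattern this commit introduces looks as follows: reads go through Deref, and every update clones the current state, modifies the copy, and hands it back to persist. State, InMemoryStorage and bump_term are simplified stand-ins (assuming the anyhow crate, as the surrounding code does), not the real FileStorage:

    use std::ops::Deref;

    #[derive(Clone, Debug)]
    struct State {
        term: u64,
    }

    trait Storage: Deref<Target = State> {
        // Durably persist `s`, then make it the state visible through Deref.
        fn persist(&mut self, s: &State) -> anyhow::Result<()>;
    }

    struct InMemoryStorage {
        state: State,
    }

    impl Deref for InMemoryStorage {
        type Target = State;
        fn deref(&self) -> &State {
            &self.state
        }
    }

    impl Storage for InMemoryStorage {
        fn persist(&mut self, s: &State) -> anyhow::Result<()> {
            // FileStorage would write and fsync the control file here before
            // updating the cached copy.
            self.state = s.clone();
            Ok(())
        }
    }

    fn bump_term(store: &mut impl Storage, term: u64) -> anyhow::Result<()> {
        if store.term < term {
            let mut new_state = (**store).clone();
            new_state.term = term;
            store.persist(&new_state)?;
        }
        Ok(())
    }

The safekeeper.rs hunks further down follow exactly this shape: each update clones self.state, mutates the copy, and calls self.state.persist(&state).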
@@ -141,6 +172,14 @@ impl FileStorage { } } +impl Deref for FileStorage { + type Target = SafeKeeperState; + + fn deref(&self) -> &Self::Target { + &self.state + } +} + impl Storage for FileStorage { // persists state durably to underlying storage // for description see https://lwn.net/Articles/457667/ @@ -201,6 +240,9 @@ impl Storage for FileStorage { .and_then(|f| f.sync_all()) .context("failed to sync control file directory")?; } + + // update internal state + self.state = s.clone(); Ok(()) } } @@ -228,7 +270,7 @@ mod test { ) -> Result<(FileStorage, SafeKeeperState)> { fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); Ok(( - FileStorage::new(zttid, conf), + FileStorage::restore_new(zttid, conf)?, FileStorage::load_control_file_conf(conf, zttid)?, )) } @@ -239,8 +281,7 @@ mod test { ) -> Result<(FileStorage, SafeKeeperState)> { fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); - let mut storage = FileStorage::new(zttid, conf); - storage.persist(&state)?; + let storage = FileStorage::create_new(zttid, conf, state.clone())?; Ok((storage, state)) } diff --git a/walkeeper/src/safekeeper.rs b/walkeeper/src/safekeeper.rs index 1e23d87b34..22a8481e45 100644 --- a/walkeeper/src/safekeeper.rs +++ b/walkeeper/src/safekeeper.rs @@ -210,6 +210,7 @@ pub struct SafekeeperMemState { pub s3_wal_lsn: Lsn, // TODO: keep only persistent version pub peer_horizon_lsn: Lsn, pub remote_consistent_lsn: Lsn, + pub proposer_uuid: PgUuid, } impl SafeKeeperState { @@ -502,9 +503,8 @@ pub struct SafeKeeper { epoch_start_lsn: Lsn, pub inmem: SafekeeperMemState, // in memory part - pub s: SafeKeeperState, // persistent part + pub state: CTRL, // persistent state storage - pub control_store: CTRL, pub wal_store: WAL, } @@ -516,14 +516,14 @@ where // constructor pub fn new( ztli: ZTimelineId, - control_store: CTRL, + state: CTRL, mut wal_store: WAL, - state: SafeKeeperState, ) -> Result> { if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id { bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id); } + // initialize wal_store, if state is already initialized wal_store.init_storage(&state)?; Ok(SafeKeeper { @@ -535,23 +535,25 @@ where s3_wal_lsn: state.s3_wal_lsn, peer_horizon_lsn: state.peer_horizon_lsn, remote_consistent_lsn: state.remote_consistent_lsn, + proposer_uuid: state.proposer_uuid, }, - s: state, - control_store, + state, wal_store, }) } /// Get history of term switches for the available WAL fn get_term_history(&self) -> TermHistory { - self.s + self.state .acceptor_state .term_history .up_to(self.wal_store.flush_lsn()) } pub fn get_epoch(&self) -> Term { - self.s.acceptor_state.get_epoch(self.wal_store.flush_lsn()) + self.state + .acceptor_state + .get_epoch(self.wal_store.flush_lsn()) } /// Process message from proposer and possibly form reply. 
Concurrent @@ -587,46 +589,47 @@ where ); } /* Postgres upgrade is not treated as fatal error */ - if msg.pg_version != self.s.server.pg_version - && self.s.server.pg_version != UNKNOWN_SERVER_VERSION + if msg.pg_version != self.state.server.pg_version + && self.state.server.pg_version != UNKNOWN_SERVER_VERSION { info!( "incompatible server version {}, expected {}", - msg.pg_version, self.s.server.pg_version + msg.pg_version, self.state.server.pg_version ); } - if msg.tenant_id != self.s.tenant_id { + if msg.tenant_id != self.state.tenant_id { bail!( "invalid tenant ID, got {}, expected {}", msg.tenant_id, - self.s.tenant_id + self.state.tenant_id ); } - if msg.ztli != self.s.timeline_id { + if msg.ztli != self.state.timeline_id { bail!( "invalid timeline ID, got {}, expected {}", msg.ztli, - self.s.timeline_id + self.state.timeline_id ); } // set basic info about server, if not yet // TODO: verify that is doesn't change after - self.s.server.system_id = msg.system_id; - self.s.server.wal_seg_size = msg.wal_seg_size; - self.control_store - .persist(&self.s) - .context("failed to persist shared state")?; + { + let mut state = self.state.clone(); + state.server.system_id = msg.system_id; + state.server.wal_seg_size = msg.wal_seg_size; + self.state.persist(&state)?; + } // pass wal_seg_size to read WAL and find flush_lsn - self.wal_store.init_storage(&self.s)?; + self.wal_store.init_storage(&self.state)?; info!( "processed greeting from proposer {:?}, sending term {:?}", - msg.proposer_id, self.s.acceptor_state.term + msg.proposer_id, self.state.acceptor_state.term ); Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting { - term: self.s.acceptor_state.term, + term: self.state.acceptor_state.term, }))) } @@ -637,17 +640,19 @@ where ) -> Result> { // initialize with refusal let mut resp = VoteResponse { - term: self.s.acceptor_state.term, + term: self.state.acceptor_state.term, vote_given: false as u64, flush_lsn: self.wal_store.flush_lsn(), - truncate_lsn: self.s.peer_horizon_lsn, + truncate_lsn: self.state.peer_horizon_lsn, term_history: self.get_term_history(), }; - if self.s.acceptor_state.term < msg.term { - self.s.acceptor_state.term = msg.term; + if self.state.acceptor_state.term < msg.term { + let mut state = self.state.clone(); + state.acceptor_state.term = msg.term; // persist vote before sending it out - self.control_store.persist(&self.s)?; - resp.term = self.s.acceptor_state.term; + self.state.persist(&state)?; + + resp.term = self.state.acceptor_state.term; resp.vote_given = true as u64; } info!("processed VoteRequest for term {}: {:?}", msg.term, &resp); @@ -656,9 +661,10 @@ where /// Bump our term if received a note from elected proposer with higher one fn bump_if_higher(&mut self, term: Term) -> Result<()> { - if self.s.acceptor_state.term < term { - self.s.acceptor_state.term = term; - self.control_store.persist(&self.s)?; + if self.state.acceptor_state.term < term { + let mut state = self.state.clone(); + state.acceptor_state.term = term; + self.state.persist(&state)?; } Ok(()) } @@ -666,9 +672,9 @@ where /// Form AppendResponse from current state. 
fn append_response(&self) -> AppendResponse { let ar = AppendResponse { - term: self.s.acceptor_state.term, + term: self.state.acceptor_state.term, flush_lsn: self.wal_store.flush_lsn(), - commit_lsn: self.s.commit_lsn, + commit_lsn: self.state.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), zenith_feedback: ZenithFeedback::empty(), @@ -681,7 +687,7 @@ where info!("received ProposerElected {:?}", msg); self.bump_if_higher(msg.term)?; // If our term is higher, ignore the message (next feedback will inform the compute) - if self.s.acceptor_state.term > msg.term { + if self.state.acceptor_state.term > msg.term { return Ok(None); } @@ -692,8 +698,11 @@ where self.wal_store.truncate_wal(msg.start_streaming_at)?; // and now adopt term history from proposer - self.s.acceptor_state.term_history = msg.term_history.clone(); - self.control_store.persist(&self.s)?; + { + let mut state = self.state.clone(); + state.acceptor_state.term_history = msg.term_history.clone(); + self.state.persist(&state)?; + } info!("start receiving WAL since {:?}", msg.start_streaming_at); @@ -715,13 +724,13 @@ where // Also note that commit_lsn can reach epoch_start_lsn earlier // that we receive new epoch_start_lsn, and we still need to sync // control file in this case. - if commit_lsn == self.epoch_start_lsn && self.s.commit_lsn != commit_lsn { + if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn { self.persist_control_file()?; } // We got our first commit_lsn, which means we should sync // everything to disk, to initialize the state. - if self.s.commit_lsn == Lsn(0) && commit_lsn > Lsn(0) { + if self.state.commit_lsn == Lsn(0) && commit_lsn > Lsn(0) { self.wal_store.flush_wal()?; self.persist_control_file()?; } @@ -731,10 +740,12 @@ where /// Persist in-memory state to the disk. fn persist_control_file(&mut self) -> Result<()> { - self.s.commit_lsn = self.inmem.commit_lsn; - self.s.peer_horizon_lsn = self.inmem.peer_horizon_lsn; + let mut state = self.state.clone(); - self.control_store.persist(&self.s) + state.commit_lsn = self.inmem.commit_lsn; + state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; + state.proposer_uuid = self.inmem.proposer_uuid; + self.state.persist(&state) } /// Handle request to append WAL. @@ -744,13 +755,13 @@ where msg: &AppendRequest, require_flush: bool, ) -> Result> { - if self.s.acceptor_state.term < msg.h.term { + if self.state.acceptor_state.term < msg.h.term { bail!("got AppendRequest before ProposerElected"); } // If our term is higher, immediately refuse the message. - if self.s.acceptor_state.term > msg.h.term { - let resp = AppendResponse::term_only(self.s.acceptor_state.term); + if self.state.acceptor_state.term > msg.h.term { + let resp = AppendResponse::term_only(self.state.acceptor_state.term); return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); } @@ -758,8 +769,7 @@ where // processing the message. self.epoch_start_lsn = msg.h.epoch_start_lsn; - // TODO: don't update state without persisting to disk - self.s.proposer_uuid = msg.h.proposer_uuid; + self.inmem.proposer_uuid = msg.h.proposer_uuid; // do the job if !msg.wal_data.is_empty() { @@ -790,7 +800,7 @@ where // Update truncate and commit LSN in control file. // To avoid negative impact on performance of extra fsync, do it only // when truncate_lsn delta exceeds WAL segment size. 
- if self.s.peer_horizon_lsn + (self.s.server.wal_seg_size as u64) + if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) < self.inmem.peer_horizon_lsn { self.persist_control_file()?; @@ -829,6 +839,8 @@ where #[cfg(test)] mod tests { + use std::ops::Deref; + use super::*; use crate::wal_storage::Storage; @@ -844,6 +856,14 @@ mod tests { } } + impl Deref for InMemoryState { + type Target = SafeKeeperState; + + fn deref(&self) -> &Self::Target { + &self.persisted_state + } + } + struct DummyWalStore { lsn: Lsn, } @@ -879,7 +899,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::empty()).unwrap(); + let mut sk = SafeKeeper::new(ztli, storage, wal_store).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -890,11 +910,11 @@ mod tests { } // reboot... - let state = sk.control_store.persisted_state.clone(); + let state = sk.state.persisted_state.clone(); let storage = InMemoryState { - persisted_state: state.clone(), + persisted_state: state, }; - sk = SafeKeeper::new(ztli, storage, sk.wal_store, state).unwrap(); + sk = SafeKeeper::new(ztli, storage, sk.wal_store).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -911,7 +931,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::empty()).unwrap(); + let mut sk = SafeKeeper::new(ztli, storage, wal_store).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/walkeeper/src/timeline.rs b/walkeeper/src/timeline.rs index a76ef77615..a2941a9a5c 100644 --- a/walkeeper/src/timeline.rs +++ b/walkeeper/src/timeline.rs @@ -21,7 +21,6 @@ use crate::broker::SafekeeperInfo; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; use crate::control_file; -use crate::control_file::Storage as cf_storage; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, SafekeeperMemState, @@ -98,10 +97,9 @@ impl SharedState { peer_ids: Vec, ) -> Result { let state = SafeKeeperState::new(zttid, peer_ids); - let control_store = control_file::FileStorage::new(zttid, conf); + let control_store = control_file::FileStorage::create_new(zttid, conf, state)?; let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); - let mut sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, state)?; - sk.control_store.persist(&sk.s)?; + let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store)?; Ok(Self { notified_commit_lsn: Lsn(0), @@ -116,18 +114,14 @@ impl SharedState { /// Restore SharedState from control file. /// If file doesn't exist, bails out. 
fn restore(conf: &SafeKeeperConf, zttid: &ZTenantTimelineId) -> Result { - let state = control_file::FileStorage::load_control_file_conf(conf, zttid) - .context("failed to load from control file")?; - - let control_store = control_file::FileStorage::new(zttid, conf); - + let control_store = control_file::FileStorage::restore_new(zttid, conf)?; let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); info!("timeline {} restored", zttid.timeline_id); Ok(Self { notified_commit_lsn: Lsn(0), - sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, state)?, + sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store)?, replicas: Vec::new(), active: false, num_computes: 0, @@ -419,7 +413,7 @@ impl Timeline { pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { let shared_state = self.mutex.lock().unwrap(); - (shared_state.sk.inmem.clone(), shared_state.sk.s.clone()) + (shared_state.sk.inmem.clone(), shared_state.sk.state.clone()) } /// Prepare public safekeeper info for reporting. From 570db6f1681b80e50dbc2d156d037b99ca742099 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 14 Apr 2022 11:28:38 +0300 Subject: [PATCH 0134/1022] Update README for Zenith -> Neon renaming. There's a lot of renaming left to do in the code and docs, but this is a start. Our binaries and many other things are still called "zenith", but I didn't change those in the README, because otherwise the examples won't work. I added a brief note at the top of the README to explain that we're in the process of renaming, until we've renamed everything. --- README.md | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index c8acf526b9..f99785e683 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,22 @@ -# Zenith +# Neon -Zenith is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes PostgreSQL storage layer by redistributing data across a cluster of nodes. +Neon is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes PostgreSQL storage layer by redistributing data across a cluster of nodes. + +The project used to be called "Zenith". Many of the commands and code comments +still refer to "zenith", but we are in the process of renaming things. ## Architecture overview -A Zenith installation consists of compute nodes and Zenith storage engine. +A Neon installation consists of compute nodes and Neon storage engine. -Compute nodes are stateless PostgreSQL nodes, backed by Zenith storage engine. +Compute nodes are stateless PostgreSQL nodes, backed by Neon storage engine. -Zenith storage engine consists of two major components: +Neon storage engine consists of two major components: - Pageserver. Scalable storage backend for compute nodes. - WAL service. The service that receives WAL from compute node and ensures that it is stored durably. Pageserver consists of: -- Repository - Zenith storage implementation. +- Repository - Neon storage implementation. - WAL receiver - service that receives WAL from WAL service and stores it in the repository. - Page service - service that communicates with compute nodes and responds with pages from the repository. - WAL redo - service that builds pages from base images and WAL records on Page service request. 
@@ -35,10 +38,10 @@ To run the `psql` client, install the `postgresql-client` package or modify `PAT To run the integration tests or Python scripts (not required to use the code), install Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory. -2. Build zenith and patched postgres +2. Build neon and patched postgres ```sh -git clone --recursive https://github.com/zenithdb/zenith.git -cd zenith +git clone --recursive https://github.com/neondatabase/neon.git +cd neon make -j5 ``` @@ -126,7 +129,7 @@ INSERT 0 1 ## Running tests ```sh -git clone --recursive https://github.com/zenithdb/zenith.git +git clone --recursive https://github.com/neondatabase/neon.git make # builds also postgres and installs it to ./tmp_install ./scripts/pytest ``` @@ -141,14 +144,14 @@ To view your `rustdoc` documentation in a browser, try running `cargo doc --no-d ### Postgres-specific terms -Due to Zenith's very close relation with PostgreSQL internals, there are numerous specific terms used. +Due to Neon's very close relation with PostgreSQL internals, there are numerous specific terms used. Same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use. To get more familiar with this aspect, refer to: -- [Zenith glossary](/docs/glossary.md) +- [Neon glossary](/docs/glossary.md) - [PostgreSQL glossary](https://www.postgresql.org/docs/13/glossary.html) -- Other PostgreSQL documentation and sources (Zenith fork sources can be found [here](https://github.com/zenithdb/postgres)) +- Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres)) ## Join the development From 19954dfd8abe154b0db17d7eb45a04acec35cbaf Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 14 Apr 2022 13:31:37 +0300 Subject: [PATCH 0135/1022] Refactor proxy options test to not rely on the 'schema' argument. It was the only test that used the 'schema' argument to the connect() function. I'm about to refactor the option handling and will remove the special 'schema' argument altogether, so rewrite the test to not use it. --- test_runner/batch_others/test_proxy.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py index d2039f9758..a6f828f829 100644 --- a/test_runner/batch_others/test_proxy.py +++ b/test_runner/batch_others/test_proxy.py @@ -5,11 +5,14 @@ def test_proxy_select_1(static_proxy): static_proxy.safe_psql("select 1;") -@pytest.mark.xfail # Proxy eats the extra connection options +# Pass extra options to the server. +# +# Currently, proxy eats the extra connection options, so this fails. 
+# See https://github.com/neondatabase/neon/issues/1287 +@pytest.mark.xfail def test_proxy_options(static_proxy): - schema_name = "tmp_schema_1" - with static_proxy.connect(schema=schema_name) as conn: + with static_proxy.connect(options="-cproxytest.option=value") as conn: with conn.cursor() as cur: - cur.execute("SHOW search_path;") - search_path = cur.fetchall()[0][0] - assert schema_name == search_path + cur.execute("SHOW proxytest.option;") + value = cur.fetchall()[0][0] + assert value == 'value' From a009fe912a292c0df4479c98c4bb5d62c91e7b68 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 14 Apr 2022 13:31:40 +0300 Subject: [PATCH 0136/1022] Refactor connection option handling in python tests The PgProtocol.connect() function took extra options for username, database, etc. Remove those options, and have a generic way for each subclass of PgProtocol to provide some default options, with the capability override them in the connect() call. --- test_runner/batch_others/test_createuser.py | 2 +- .../batch_others/test_parallel_copy.py | 5 + test_runner/batch_others/test_wal_acceptor.py | 2 +- .../batch_pg_regress/test_isolation.py | 6 +- .../batch_pg_regress/test_pg_regress.py | 6 +- .../batch_pg_regress/test_zenith_regress.py | 6 +- test_runner/fixtures/zenith_fixtures.py | 128 ++++++++---------- 7 files changed, 69 insertions(+), 86 deletions(-) diff --git a/test_runner/batch_others/test_createuser.py b/test_runner/batch_others/test_createuser.py index efb2af3f07..f4bbbc8a7a 100644 --- a/test_runner/batch_others/test_createuser.py +++ b/test_runner/batch_others/test_createuser.py @@ -28,4 +28,4 @@ def test_createuser(zenith_simple_env: ZenithEnv): pg2 = env.postgres.create_start('test_createuser2') # Test that you can connect to new branch as a new user - assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )] + assert pg2.safe_psql('select current_user', user='testuser') == [('testuser', )] diff --git a/test_runner/batch_others/test_parallel_copy.py b/test_runner/batch_others/test_parallel_copy.py index 4b7cc58d42..a44acecf21 100644 --- a/test_runner/batch_others/test_parallel_copy.py +++ b/test_runner/batch_others/test_parallel_copy.py @@ -19,6 +19,11 @@ async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str) copy_input = repeat_bytes(buf.read(), 5000) pg_conn = await pg.connect_async() + + # PgProtocol.connect_async sets statement_timeout to 2 minutes. + # That's not enough for this test, on a slow system in debug mode. 
+ await pg_conn.execute("SET statement_timeout='300s'") + await pg_conn.copy_to_table(table_name, source=copy_input) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 8f87ff041f..dffcd7cc61 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -379,7 +379,7 @@ class ProposerPostgres(PgProtocol): tenant_id: uuid.UUID, listen_addr: str, port: int): - super().__init__(host=listen_addr, port=port, username='zenith_admin') + super().__init__(host=listen_addr, port=port, user='zenith_admin', dbname='postgres') self.pgdata_dir: str = pgdata_dir self.pg_bin: PgBin = pg_bin diff --git a/test_runner/batch_pg_regress/test_isolation.py b/test_runner/batch_pg_regress/test_isolation.py index ddafc3815b..cde56d9b88 100644 --- a/test_runner/batch_pg_regress/test_isolation.py +++ b/test_runner/batch_pg_regress/test_isolation.py @@ -35,9 +35,9 @@ def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys ] env_vars = { - 'PGPORT': str(pg.port), - 'PGUSER': pg.username, - 'PGHOST': pg.host, + 'PGPORT': str(pg.default_options['port']), + 'PGUSER': pg.default_options['user'], + 'PGHOST': pg.default_options['host'], } # Run the command. diff --git a/test_runner/batch_pg_regress/test_pg_regress.py b/test_runner/batch_pg_regress/test_pg_regress.py index 5199f65216..07d2574f4a 100644 --- a/test_runner/batch_pg_regress/test_pg_regress.py +++ b/test_runner/batch_pg_regress/test_pg_regress.py @@ -35,9 +35,9 @@ def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin, ] env_vars = { - 'PGPORT': str(pg.port), - 'PGUSER': pg.username, - 'PGHOST': pg.host, + 'PGPORT': str(pg.default_options['port']), + 'PGUSER': pg.default_options['user'], + 'PGHOST': pg.default_options['host'], } # Run the command. diff --git a/test_runner/batch_pg_regress/test_zenith_regress.py b/test_runner/batch_pg_regress/test_zenith_regress.py index 31d5b07093..2b57137d16 100644 --- a/test_runner/batch_pg_regress/test_zenith_regress.py +++ b/test_runner/batch_pg_regress/test_zenith_regress.py @@ -40,9 +40,9 @@ def test_zenith_regress(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, c log.info(pg_regress_command) env_vars = { - 'PGPORT': str(pg.port), - 'PGUSER': pg.username, - 'PGHOST': pg.host, + 'PGPORT': str(pg.default_options['port']), + 'PGUSER': pg.default_options['user'], + 'PGHOST': pg.default_options['host'], } # Run the command. 
diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index a95809687a..41d1443880 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -27,6 +27,7 @@ from dataclasses import dataclass # Type-related stuff from psycopg2.extensions import connection as PgConnection +from psycopg2.extensions import make_dsn, parse_dsn from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, TypeVar, cast, Union, Tuple from typing_extensions import Literal @@ -238,98 +239,69 @@ def port_distributor(worker_base_port): class PgProtocol: """ Reusable connection logic """ - def __init__(self, - host: str, - port: int, - username: Optional[str] = None, - password: Optional[str] = None, - dbname: Optional[str] = None, - schema: Optional[str] = None): - self.host = host - self.port = port - self.username = username - self.password = password - self.dbname = dbname - self.schema = schema + def __init__(self, **kwargs): + self.default_options = kwargs - def connstr(self, - *, - dbname: Optional[str] = None, - schema: Optional[str] = None, - username: Optional[str] = None, - password: Optional[str] = None, - statement_timeout_ms: Optional[int] = None) -> str: + def connstr(self, **kwargs) -> str: """ Build a libpq connection string for the Postgres instance. """ + return str(make_dsn(**self.conn_options(**kwargs))) - username = username or self.username - password = password or self.password - dbname = dbname or self.dbname or "postgres" - schema = schema or self.schema - res = f'host={self.host} port={self.port} dbname={dbname}' + def conn_options(self, **kwargs): + conn_options = self.default_options.copy() + if 'dsn' in kwargs: + conn_options.update(parse_dsn(kwargs['dsn'])) + conn_options.update(kwargs) - if username: - res = f'{res} user={username}' - - if password: - res = f'{res} password={password}' - - if schema: - res = f"{res} options='-c search_path={schema}'" - - if statement_timeout_ms: - res = f"{res} options='-c statement_timeout={statement_timeout_ms}'" - - return res + # Individual statement timeout in seconds. 2 minutes should be + # enough for our tests, but if you need a longer, you can + # change it by calling "SET statement_timeout" after + # connecting. + if 'options' in conn_options: + conn_options['options'] = f"-cstatement_timeout=120s " + conn_options['options'] + else: + conn_options['options'] = "-cstatement_timeout=120s" + return conn_options # autocommit=True here by default because that's what we need most of the time - def connect( - self, - *, - autocommit=True, - dbname: Optional[str] = None, - schema: Optional[str] = None, - username: Optional[str] = None, - password: Optional[str] = None, - # individual statement timeout in seconds, 2 minutes should be enough for our tests - statement_timeout: Optional[int] = 120 - ) -> PgConnection: + def connect(self, autocommit=True, **kwargs) -> PgConnection: """ Connect to the node. Returns psycopg2's connection object. This method passes all extra params to connstr. """ + conn = psycopg2.connect(**self.conn_options(**kwargs)) - conn = psycopg2.connect( - self.connstr(dbname=dbname, - schema=schema, - username=username, - password=password, - statement_timeout_ms=statement_timeout * - 1000 if statement_timeout else None)) # WARNING: this setting affects *all* tests! 
conn.autocommit = autocommit return conn - async def connect_async(self, - *, - dbname: str = 'postgres', - username: Optional[str] = None, - password: Optional[str] = None) -> asyncpg.Connection: + async def connect_async(self, **kwargs) -> asyncpg.Connection: """ Connect to the node from async python. Returns asyncpg's connection object. """ - conn = await asyncpg.connect( - host=self.host, - port=self.port, - database=dbname, - user=username or self.username, - password=password, - ) - return conn + # asyncpg takes slightly different options than psycopg2. Try + # to convert the defaults from the psycopg2 format. + + # The psycopg2 option 'dbname' is called 'database' is asyncpg + conn_options = self.conn_options(**kwargs) + if 'dbname' in conn_options: + conn_options['database'] = conn_options.pop('dbname') + + # Convert options='-c=' to server_settings + if 'options' in conn_options: + options = conn_options.pop('options') + for match in re.finditer('-c(\w*)=(\w*)', options): + key = match.group(1) + val = match.group(2) + if 'server_options' in conn_options: + conn_options['server_settings'].update({key: val}) + else: + conn_options['server_settings'] = {key: val} + return await asyncpg.connect(**conn_options) def safe_psql(self, query: str, **kwargs: Any) -> List[Any]: """ @@ -1149,10 +1121,10 @@ class ZenithPageserver(PgProtocol): port: PageserverPort, remote_storage: Optional[RemoteStorage] = None, config_override: Optional[str] = None): - super().__init__(host='localhost', port=port.pg, username='zenith_admin') + super().__init__(host='localhost', port=port.pg, user='zenith_admin') self.env = env self.running = False - self.service_port = port # do not shadow PgProtocol.port which is just int + self.service_port = port self.remote_storage = remote_storage self.config_override = config_override @@ -1291,7 +1263,7 @@ def pg_bin(test_output_dir: str) -> PgBin: class VanillaPostgres(PgProtocol): def __init__(self, pgdatadir: str, pg_bin: PgBin, port: int): - super().__init__(host='localhost', port=port) + super().__init__(host='localhost', port=port, dbname='postgres') self.pgdatadir = pgdatadir self.pg_bin = pg_bin self.running = False @@ -1335,8 +1307,14 @@ def vanilla_pg(test_output_dir: str) -> Iterator[VanillaPostgres]: class ZenithProxy(PgProtocol): def __init__(self, port: int): - super().__init__(host="127.0.0.1", username="pytest", password="pytest", port=port) + super().__init__(host="127.0.0.1", + user="pytest", + password="pytest", + port=port, + dbname='postgres') self.http_port = 7001 + self.host = "127.0.0.1" + self.port = port self._popen: Optional[subprocess.Popen[bytes]] = None def start_static(self, addr="127.0.0.1:5432") -> None: @@ -1380,13 +1358,13 @@ def static_proxy(vanilla_pg) -> Iterator[ZenithProxy]: class Postgres(PgProtocol): """ An object representing a running postgres daemon. 
""" def __init__(self, env: ZenithEnv, tenant_id: uuid.UUID, port: int): - super().__init__(host='localhost', port=port, username='zenith_admin') - + super().__init__(host='localhost', port=port, user='zenith_admin', dbname='postgres') self.env = env self.running = False self.node_name: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA self.tenant_id = tenant_id + self.port = port # path to conf is /pgdatadirs/tenants///postgresql.conf def create( From 4a8c66345267bfb11882a10d0260e2aacec6d112 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 14 Apr 2022 13:31:42 +0300 Subject: [PATCH 0137/1022] Refactor pgbench tests. - Remove batch_others/test_pgbench.py. It was a quick check that pgbench works, without actually recording any performance numbers, but that doesn't seem very interesting anymore. Remove it to avoid confusing it with the actual pgbench benchmarks - Run pgbench with "-n" and "-S" options, for two different workloads: simple-updates, and SELECT-only. Previously, we would only run it with the "default" TPCB-like workload. That's more or less the same as the simple-update (-n) workload, but I think the simple-upload workload is more relevant for testing storage performance. The SELECT-only workload is a new thing to measure. - Merge test_perf_pgbench.py and test_perf_pgbench_remote.py. I added a new "remote" implementation of the PgCompare class, which allows running the same tests against an already-running Postgres instance. - Make the PgBenchRunResult.parse_from_output function more flexible. pgbench can print different lines depending on the command-line options, but the parsing function expected a particular set of lines. --- .github/workflows/benchmarking.yml | 13 +- test_runner/batch_others/test_pgbench.py | 14 -- test_runner/fixtures/benchmark_fixture.py | 145 ++++++++---------- test_runner/fixtures/compare_fixtures.py | 49 +++++- test_runner/fixtures/zenith_fixtures.py | 68 ++++++-- test_runner/performance/test_perf_pgbench.py | 116 ++++++++++++-- .../performance/test_perf_pgbench_remote.py | 124 --------------- 7 files changed, 279 insertions(+), 250 deletions(-) delete mode 100644 test_runner/batch_others/test_pgbench.py delete mode 100644 test_runner/performance/test_perf_pgbench_remote.py diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 36df35297d..72041c9d02 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -26,7 +26,7 @@ jobs: runs-on: [self-hosted, zenith-benchmarker] env: - PG_BIN: "/usr/pgsql-13/bin" + POSTGRES_DISTRIB_DIR: "/usr/pgsql-13" steps: - name: Checkout zenith repo @@ -51,7 +51,7 @@ jobs: echo Poetry poetry --version echo Pgbench - $PG_BIN/pgbench --version + $POSTGRES_DISTRIB_DIR/bin/pgbench --version # FIXME cluster setup is skipped due to various changes in console API # for now pre created cluster is used. When API gain some stability @@ -66,7 +66,7 @@ jobs: echo "Starting cluster" # wake up the cluster - $PG_BIN/psql $BENCHMARK_CONNSTR -c "SELECT 1" + $POSTGRES_DISTRIB_DIR/bin/psql $BENCHMARK_CONNSTR -c "SELECT 1" - name: Run benchmark # pgbench is installed system wide from official repo @@ -83,8 +83,11 @@ jobs: # sudo yum install postgresql13-contrib # actual binaries are located in /usr/pgsql-13/bin/ env: - TEST_PG_BENCH_TRANSACTIONS_MATRIX: "5000,10000,20000" - TEST_PG_BENCH_SCALES_MATRIX: "10,15" + # The pgbench test runs two tests of given duration against each scale. 
+ # So the total runtime with these parameters is 2 * 2 * 300 = 1200, or 20 minutes. + # Plus time needed to initialize the test databases. + TEST_PG_BENCH_DURATIONS_MATRIX: "300" + TEST_PG_BENCH_SCALES_MATRIX: "10,100" PLATFORM: "zenith-staging" BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally diff --git a/test_runner/batch_others/test_pgbench.py b/test_runner/batch_others/test_pgbench.py deleted file mode 100644 index 09713023bc..0000000000 --- a/test_runner/batch_others/test_pgbench.py +++ /dev/null @@ -1,14 +0,0 @@ -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.log_helper import log - - -def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin): - env = zenith_simple_env - env.zenith_cli.create_branch("test_pgbench", "empty") - pg = env.postgres.create_start('test_pgbench') - log.info("postgres is running on 'test_pgbench' branch") - - connstr = pg.connstr() - - pg_bin.run_capture(['pgbench', '-i', connstr]) - pg_bin.run_capture(['pgbench'] + '-c 10 -T 5 -P 1 -M prepared'.split() + [connstr]) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 480eb3f891..a904233e98 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -17,7 +17,7 @@ import warnings from contextlib import contextmanager # Type-related stuff -from typing import Iterator +from typing import Iterator, Optional """ This file contains fixtures for micro-benchmarks. @@ -51,17 +51,12 @@ in the test initialization, or measure disk usage after the test query. @dataclasses.dataclass class PgBenchRunResult: - scale: int number_of_clients: int number_of_threads: int number_of_transactions_actually_processed: int latency_average: float - latency_stddev: float - tps_including_connection_time: float - tps_excluding_connection_time: float - init_duration: float - init_start_timestamp: int - init_end_timestamp: int + latency_stddev: Optional[float] + tps: float run_duration: float run_start_timestamp: int run_end_timestamp: int @@ -69,56 +64,67 @@ class PgBenchRunResult: # TODO progress @classmethod - def parse_from_output( + def parse_from_stdout( cls, - out: 'subprocess.CompletedProcess[str]', - init_duration: float, - init_start_timestamp: int, - init_end_timestamp: int, + stdout: str, run_duration: float, run_start_timestamp: int, run_end_timestamp: int, ): - stdout_lines = out.stdout.splitlines() + stdout_lines = stdout.splitlines() + + latency_stddev = None + # we know significant parts of these values from test input # but to be precise take them from output - # scaling factor: 5 - assert "scaling factor" in stdout_lines[1] - scale = int(stdout_lines[1].split()[-1]) - # number of clients: 1 - assert "number of clients" in stdout_lines[3] - number_of_clients = int(stdout_lines[3].split()[-1]) - # number of threads: 1 - assert "number of threads" in stdout_lines[4] - number_of_threads = int(stdout_lines[4].split()[-1]) - # number of transactions actually processed: 1000/1000 - assert "number of transactions actually processed" in stdout_lines[6] - number_of_transactions_actually_processed = int(stdout_lines[6].split("/")[1]) - # latency average = 19.894 ms - assert "latency average" in stdout_lines[7] - latency_average = stdout_lines[7].split()[-2] - # latency stddev = 3.387 ms - assert "latency stddev" in stdout_lines[8] - latency_stddev = stdout_lines[8].split()[-2] - # tps = 50.219689 (including connections 
establishing) - assert "(including connections establishing)" in stdout_lines[9] - tps_including_connection_time = stdout_lines[9].split()[2] - # tps = 50.264435 (excluding connections establishing) - assert "(excluding connections establishing)" in stdout_lines[10] - tps_excluding_connection_time = stdout_lines[10].split()[2] + for line in stdout.splitlines(): + # scaling factor: 5 + if line.startswith("scaling factor:"): + scale = int(line.split()[-1]) + # number of clients: 1 + if line.startswith("number of clients: "): + number_of_clients = int(line.split()[-1]) + # number of threads: 1 + if line.startswith("number of threads: "): + number_of_threads = int(line.split()[-1]) + # number of transactions actually processed: 1000/1000 + # OR + # number of transactions actually processed: 1000 + if line.startswith("number of transactions actually processed"): + if "/" in line: + number_of_transactions_actually_processed = int(line.split("/")[1]) + else: + number_of_transactions_actually_processed = int(line.split()[-1]) + # latency average = 19.894 ms + if line.startswith("latency average"): + latency_average = float(line.split()[-2]) + # latency stddev = 3.387 ms + # (only printed with some options) + if line.startswith("latency stddev"): + latency_stddev = float(line.split()[-2]) + + # Get the TPS without initial connection time. The format + # of the tps lines changed in pgbench v14, but we accept + # either format: + # + # pgbench v13 and below: + # tps = 50.219689 (including connections establishing) + # tps = 50.264435 (excluding connections establishing) + # + # pgbench v14: + # initial connection time = 3.858 ms + # tps = 309.281539 (without initial connection time) + if (line.startswith("tps = ") and ("(excluding connections establishing)" in line + or "(without initial connection time)")): + tps = float(line.split()[2]) return cls( - scale=scale, number_of_clients=number_of_clients, number_of_threads=number_of_threads, number_of_transactions_actually_processed=number_of_transactions_actually_processed, - latency_average=float(latency_average), - latency_stddev=float(latency_stddev), - tps_including_connection_time=float(tps_including_connection_time), - tps_excluding_connection_time=float(tps_excluding_connection_time), - init_duration=init_duration, - init_start_timestamp=init_start_timestamp, - init_end_timestamp=init_end_timestamp, + latency_average=latency_average, + latency_stddev=latency_stddev, + tps=tps, run_duration=run_duration, run_start_timestamp=run_start_timestamp, run_end_timestamp=run_end_timestamp, @@ -187,60 +193,41 @@ class ZenithBenchmarker: report=MetricReport.LOWER_IS_BETTER, ) - def record_pg_bench_result(self, pg_bench_result: PgBenchRunResult): - self.record("scale", pg_bench_result.scale, '', MetricReport.TEST_PARAM) - self.record("number_of_clients", + def record_pg_bench_result(self, prefix: str, pg_bench_result: PgBenchRunResult): + self.record(f"{prefix}.number_of_clients", pg_bench_result.number_of_clients, '', MetricReport.TEST_PARAM) - self.record("number_of_threads", + self.record(f"{prefix}.number_of_threads", pg_bench_result.number_of_threads, '', MetricReport.TEST_PARAM) self.record( - "number_of_transactions_actually_processed", + f"{prefix}.number_of_transactions_actually_processed", pg_bench_result.number_of_transactions_actually_processed, '', # thats because this is predefined by test matrix and doesnt change across runs report=MetricReport.TEST_PARAM, ) - self.record("latency_average", + self.record(f"{prefix}.latency_average", 
pg_bench_result.latency_average, unit="ms", report=MetricReport.LOWER_IS_BETTER) - self.record("latency_stddev", - pg_bench_result.latency_stddev, - unit="ms", - report=MetricReport.LOWER_IS_BETTER) - self.record("tps_including_connection_time", - pg_bench_result.tps_including_connection_time, - '', - report=MetricReport.HIGHER_IS_BETTER) - self.record("tps_excluding_connection_time", - pg_bench_result.tps_excluding_connection_time, - '', - report=MetricReport.HIGHER_IS_BETTER) - self.record("init_duration", - pg_bench_result.init_duration, - unit="s", - report=MetricReport.LOWER_IS_BETTER) - self.record("init_start_timestamp", - pg_bench_result.init_start_timestamp, - '', - MetricReport.TEST_PARAM) - self.record("init_end_timestamp", - pg_bench_result.init_end_timestamp, - '', - MetricReport.TEST_PARAM) - self.record("run_duration", + if pg_bench_result.latency_stddev is not None: + self.record(f"{prefix}.latency_stddev", + pg_bench_result.latency_stddev, + unit="ms", + report=MetricReport.LOWER_IS_BETTER) + self.record(f"{prefix}.tps", pg_bench_result.tps, '', report=MetricReport.HIGHER_IS_BETTER) + self.record(f"{prefix}.run_duration", pg_bench_result.run_duration, unit="s", report=MetricReport.LOWER_IS_BETTER) - self.record("run_start_timestamp", + self.record(f"{prefix}.run_start_timestamp", pg_bench_result.run_start_timestamp, '', MetricReport.TEST_PARAM) - self.record("run_end_timestamp", + self.record(f"{prefix}.run_end_timestamp", pg_bench_result.run_end_timestamp, '', MetricReport.TEST_PARAM) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 598ee10f8e..3c6a923587 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -2,7 +2,7 @@ import pytest from contextlib import contextmanager from abc import ABC, abstractmethod -from fixtures.zenith_fixtures import PgBin, PgProtocol, VanillaPostgres, ZenithEnv +from fixtures.zenith_fixtures import PgBin, PgProtocol, VanillaPostgres, RemotePostgres, ZenithEnv from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker # Type-related stuff @@ -162,6 +162,48 @@ class VanillaCompare(PgCompare): return self.zenbenchmark.record_duration(out_name) +class RemoteCompare(PgCompare): + """PgCompare interface for a remote postgres instance.""" + def __init__(self, zenbenchmark, remote_pg: RemotePostgres): + self._pg = remote_pg + self._zenbenchmark = zenbenchmark + + # Long-lived cursor, useful for flushing + self.conn = self.pg.connect() + self.cur = self.conn.cursor() + + @property + def pg(self): + return self._pg + + @property + def zenbenchmark(self): + return self._zenbenchmark + + @property + def pg_bin(self): + return self._pg.pg_bin + + def flush(self): + # TODO: flush the remote pageserver + pass + + def report_peak_memory_use(self) -> None: + # TODO: get memory usage from remote pageserver + pass + + def report_size(self) -> None: + # TODO: get storage size from remote pageserver + pass + + @contextmanager + def record_pageserver_writes(self, out_name): + yield # Do nothing + + def record_duration(self, out_name): + return self.zenbenchmark.record_duration(out_name) + + @pytest.fixture(scope='function') def zenith_compare(request, zenbenchmark, pg_bin, zenith_simple_env) -> ZenithCompare: branch_name = request.node.name @@ -173,6 +215,11 @@ def vanilla_compare(zenbenchmark, vanilla_pg) -> VanillaCompare: return VanillaCompare(zenbenchmark, vanilla_pg) +@pytest.fixture(scope='function') +def remote_compare(zenbenchmark, remote_pg) 
-> RemoteCompare: + return RemoteCompare(zenbenchmark, remote_pg) + + @pytest.fixture(params=["vanilla_compare", "zenith_compare"], ids=["vanilla", "zenith"]) def zenith_with_baseline(request) -> PgCompare: """Parameterized fixture that helps compare zenith against vanilla postgres. diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 41d1443880..f8ee39a5a1 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -123,6 +123,22 @@ def pytest_configure(config): top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR) mkdir_if_needed(top_output_dir) + # Find the postgres installation. + global pg_distrib_dir + env_postgres_bin = os.environ.get('POSTGRES_DISTRIB_DIR') + if env_postgres_bin: + pg_distrib_dir = env_postgres_bin + else: + pg_distrib_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR)) + log.info(f'pg_distrib_dir is {pg_distrib_dir}') + if os.getenv("REMOTE_ENV"): + # When testing against a remote server, we only need the client binary. + if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/psql')): + raise Exception('psql not found at "{}"'.format(pg_distrib_dir)) + else: + if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/postgres')): + raise Exception('postgres not found at "{}"'.format(pg_distrib_dir)) + if os.getenv("REMOTE_ENV"): # we are in remote env and do not have zenith binaries locally # this is the case for benchmarks run on self-hosted runner @@ -138,17 +154,6 @@ def pytest_configure(config): if not os.path.exists(os.path.join(zenith_binpath, 'pageserver')): raise Exception('zenith binaries not found at "{}"'.format(zenith_binpath)) - # Find the postgres installation. - global pg_distrib_dir - env_postgres_bin = os.environ.get('POSTGRES_DISTRIB_DIR') - if env_postgres_bin: - pg_distrib_dir = env_postgres_bin - else: - pg_distrib_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR)) - log.info(f'pg_distrib_dir is {pg_distrib_dir}') - if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/postgres')): - raise Exception('postgres not found at "{}"'.format(pg_distrib_dir)) - def zenfixture(func: Fn) -> Fn: """ @@ -1305,6 +1310,47 @@ def vanilla_pg(test_output_dir: str) -> Iterator[VanillaPostgres]: yield vanilla_pg +class RemotePostgres(PgProtocol): + def __init__(self, pg_bin: PgBin, remote_connstr: str): + super().__init__(**parse_dsn(remote_connstr)) + self.pg_bin = pg_bin + # The remote server is assumed to be running already + self.running = True + + def configure(self, options: List[str]): + raise Exception('cannot change configuration of remote Posgres instance') + + def start(self): + raise Exception('cannot start a remote Postgres instance') + + def stop(self): + raise Exception('cannot stop a remote Postgres instance') + + def get_subdir_size(self, subdir) -> int: + # TODO: Could use the server's Generic File Acccess functions if superuser. 
+ # See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE + raise Exception('cannot get size of a Postgres instance') + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + # do nothing + pass + + +@pytest.fixture(scope='function') +def remote_pg(test_output_dir: str) -> Iterator[RemotePostgres]: + pg_bin = PgBin(test_output_dir) + + connstr = os.getenv("BENCHMARK_CONNSTR") + if connstr is None: + raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable") + + with RemotePostgres(pg_bin, connstr) as remote_pg: + yield remote_pg + + class ZenithProxy(PgProtocol): def __init__(self, port: int): super().__init__(host="127.0.0.1", diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 5ffce3c0be..d2de76913a 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -2,29 +2,113 @@ from contextlib import closing from fixtures.zenith_fixtures import PgBin, VanillaPostgres, ZenithEnv from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker +from fixtures.benchmark_fixture import PgBenchRunResult, MetricReport, ZenithBenchmarker from fixtures.log_helper import log +from pathlib import Path + +import pytest +from datetime import datetime +import calendar +import os +import timeit + + +def utc_now_timestamp() -> int: + return calendar.timegm(datetime.utcnow().utctimetuple()) + + +def init_pgbench(env: PgCompare, cmdline): + # calculate timestamps and durations separately + # timestamp is intended to be used for linking to grafana and logs + # duration is actually a metric and uses float instead of int for timestamp + init_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + with env.record_pageserver_writes('init.pageserver_writes'): + env.pg_bin.run_capture(cmdline) + env.flush() + init_duration = timeit.default_timer() - t0 + init_end_timestamp = utc_now_timestamp() + + env.zenbenchmark.record("init.duration", + init_duration, + unit="s", + report=MetricReport.LOWER_IS_BETTER) + env.zenbenchmark.record("init.start_timestamp", + init_start_timestamp, + '', + MetricReport.TEST_PARAM) + env.zenbenchmark.record("init.end_timestamp", init_end_timestamp, '', MetricReport.TEST_PARAM) + + +def run_pgbench(env: PgCompare, prefix: str, cmdline): + with env.record_pageserver_writes(f'{prefix}.pageserver_writes'): + run_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + out = env.pg_bin.run_capture(cmdline, ) + run_duration = timeit.default_timer() - t0 + run_end_timestamp = utc_now_timestamp() + env.flush() + + stdout = Path(f"{out}.stdout").read_text() + + res = PgBenchRunResult.parse_from_stdout( + stdout=stdout, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + run_end_timestamp=run_end_timestamp, + ) + env.zenbenchmark.record_pg_bench_result(prefix, res) + # -# Run a very short pgbench test. +# Initialize a pgbench database, and run pgbench against it. # -# Collects three metrics: +# This makes runs two different pgbench workloads against the same +# initialized database, and 'duration' is the time of each run. So +# the total runtime is 2 * duration, plus time needed to initialize +# the test database. # -# 1. Time to initialize the pgbench database (pgbench -s5 -i) -# 2. Time to run 5000 pgbench transactions -# 3. 
Disk space used -# -def test_pgbench(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +# Currently, the # of connections is hardcoded at 4 +def run_test_pgbench(env: PgCompare, scale: int, duration: int): - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('init'): - env.pg_bin.run_capture(['pgbench', '-s5', '-i', env.pg.connstr()]) - env.flush() + # Record the scale and initialize + env.zenbenchmark.record("scale", scale, '', MetricReport.TEST_PARAM) + init_pgbench(env, ['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) - with env.record_duration('5000_xacts'): - env.pg_bin.run_capture(['pgbench', '-c1', '-t5000', env.pg.connstr()]) - env.flush() + # Run simple-update workload + run_pgbench(env, + "simple-update", + ['pgbench', '-n', '-c4', f'-T{duration}', '-P2', '-Mprepared', env.pg.connstr()]) + + # Run SELECT workload + run_pgbench(env, + "select-only", + ['pgbench', '-S', '-c4', f'-T{duration}', '-P2', '-Mprepared', env.pg.connstr()]) env.report_size() + + +def get_durations_matrix(): + durations = os.getenv("TEST_PG_BENCH_DURATIONS_MATRIX", default="45") + return list(map(int, durations.split(","))) + + +def get_scales_matrix(): + scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX", default="10") + return list(map(int, scales.split(","))) + + +# Run the pgbench tests against vanilla Postgres and zenith +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +def test_pgbench(zenith_with_baseline: PgCompare, scale: int, duration: int): + run_test_pgbench(zenith_with_baseline, scale, duration) + + +# Run the pgbench tests against an existing Postgres cluster +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_pgbench_remote(remote_compare: PgCompare, scale: int, duration: int): + run_test_pgbench(remote_compare, scale, duration) diff --git a/test_runner/performance/test_perf_pgbench_remote.py b/test_runner/performance/test_perf_pgbench_remote.py deleted file mode 100644 index 28472a16c8..0000000000 --- a/test_runner/performance/test_perf_pgbench_remote.py +++ /dev/null @@ -1,124 +0,0 @@ -import dataclasses -import os -import subprocess -from typing import List -from fixtures.benchmark_fixture import PgBenchRunResult, ZenithBenchmarker -import pytest -from datetime import datetime -import calendar -import timeit -import os - - -def utc_now_timestamp() -> int: - return calendar.timegm(datetime.utcnow().utctimetuple()) - - -@dataclasses.dataclass -class PgBenchRunner: - connstr: str - scale: int - transactions: int - pgbench_bin_path: str = "pgbench" - - def invoke(self, args: List[str]) -> 'subprocess.CompletedProcess[str]': - res = subprocess.run([self.pgbench_bin_path, *args], text=True, capture_output=True) - - if res.returncode != 0: - raise RuntimeError(f"pgbench failed. 
stdout: {res.stdout} stderr: {res.stderr}") - return res - - def init(self, vacuum: bool = True) -> 'subprocess.CompletedProcess[str]': - args = [] - if not vacuum: - args.append("--no-vacuum") - args.extend([f"--scale={self.scale}", "--initialize", self.connstr]) - return self.invoke(args) - - def run(self, jobs: int = 1, clients: int = 1): - return self.invoke([ - f"--transactions={self.transactions}", - f"--jobs={jobs}", - f"--client={clients}", - "--progress=2", # print progress every two seconds - self.connstr, - ]) - - -@pytest.fixture -def connstr(): - res = os.getenv("BENCHMARK_CONNSTR") - if res is None: - raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable") - return res - - -def get_transactions_matrix(): - transactions = os.getenv("TEST_PG_BENCH_TRANSACTIONS_MATRIX") - if transactions is None: - return [10**4, 10**5] - return list(map(int, transactions.split(","))) - - -def get_scales_matrix(): - scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX") - if scales is None: - return [10, 20] - return list(map(int, scales.split(","))) - - -@pytest.mark.parametrize("scale", get_scales_matrix()) -@pytest.mark.parametrize("transactions", get_transactions_matrix()) -@pytest.mark.remote_cluster -def test_pg_bench_remote_cluster(zenbenchmark: ZenithBenchmarker, - connstr: str, - scale: int, - transactions: int): - """ - The best way is to run same pack of tests both, for local zenith - and against staging, but currently local tests heavily depend on - things available only locally e.g. zenith binaries, pageserver api, etc. - Also separate test allows to run pgbench workload against vanilla postgres - or other systems that support postgres protocol. - - Also now this is more of a liveness test because it stresses pageserver internals, - so we clearly see what goes wrong in more "real" environment. - """ - pg_bin = os.getenv("PG_BIN") - if pg_bin is not None: - pgbench_bin_path = os.path.join(pg_bin, "pgbench") - else: - pgbench_bin_path = "pgbench" - - runner = PgBenchRunner( - connstr=connstr, - scale=scale, - transactions=transactions, - pgbench_bin_path=pgbench_bin_path, - ) - # calculate timestamps and durations separately - # timestamp is intended to be used for linking to grafana and logs - # duration is actually a metric and uses float instead of int for timestamp - init_start_timestamp = utc_now_timestamp() - t0 = timeit.default_timer() - runner.init() - init_duration = timeit.default_timer() - t0 - init_end_timestamp = utc_now_timestamp() - - run_start_timestamp = utc_now_timestamp() - t0 = timeit.default_timer() - out = runner.run() # TODO handle failures - run_duration = timeit.default_timer() - t0 - run_end_timestamp = utc_now_timestamp() - - res = PgBenchRunResult.parse_from_output( - out=out, - init_duration=init_duration, - init_start_timestamp=init_start_timestamp, - init_end_timestamp=init_end_timestamp, - run_duration=run_duration, - run_start_timestamp=run_start_timestamp, - run_end_timestamp=run_end_timestamp, - ) - - zenbenchmark.record_pg_bench_result(res) From 9e4de6bed02e9dc48af5b9d74a7759b0c2702b26 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 12 Apr 2022 20:29:35 +0300 Subject: [PATCH 0138/1022] Use RwLock instad of Mutex for layer map lock. 
For more concurrency --- pageserver/src/layered_repository.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index e178ba5222..95df385cfe 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -193,7 +193,7 @@ impl Repository for LayeredRepository { Arc::clone(&self.walredo_mgr), self.upload_layers, ); - timeline.layers.lock().unwrap().next_open_layer_at = Some(initdb_lsn); + timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); let timeline = Arc::new(timeline); let r = timelines.insert( @@ -725,7 +725,7 @@ pub struct LayeredTimeline { tenantid: ZTenantId, timelineid: ZTimelineId, - layers: Mutex, + layers: RwLock, last_freeze_at: AtomicLsn, @@ -997,7 +997,7 @@ impl LayeredTimeline { conf, timelineid, tenantid, - layers: Mutex::new(LayerMap::default()), + layers: RwLock::new(LayerMap::default()), walredo_mgr, @@ -1040,7 +1040,7 @@ impl LayeredTimeline { /// Returns all timeline-related files that were found and loaded. /// fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { - let mut layers = self.layers.lock().unwrap(); + let mut layers = self.layers.write().unwrap(); let mut num_layers = 0; // Scan timeline directory and create ImageFileName and DeltaFilename @@ -1194,7 +1194,7 @@ impl LayeredTimeline { continue; } - let layers = timeline.layers.lock().unwrap(); + let layers = timeline.layers.read().unwrap(); // Check the open and frozen in-memory layers first if let Some(open_layer) = &layers.open_layer { @@ -1276,7 +1276,7 @@ impl LayeredTimeline { /// Get a handle to the latest layer for appending. /// fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { - let mut layers = self.layers.lock().unwrap(); + let mut layers = self.layers.write().unwrap(); ensure!(lsn.is_aligned()); @@ -1347,7 +1347,7 @@ impl LayeredTimeline { } else { Some(self.write_lock.lock().unwrap()) }; - let mut layers = self.layers.lock().unwrap(); + let mut layers = self.layers.write().unwrap(); if let Some(open_layer) = &layers.open_layer { let open_layer_rc = Arc::clone(open_layer); // Does this layer need freezing? @@ -1412,7 +1412,7 @@ impl LayeredTimeline { let timer = self.flush_time_histo.start_timer(); loop { - let layers = self.layers.lock().unwrap(); + let layers = self.layers.read().unwrap(); if let Some(frozen_layer) = layers.frozen_layers.front() { let frozen_layer = Arc::clone(frozen_layer); drop(layers); // to allow concurrent reads and writes @@ -1456,7 +1456,7 @@ impl LayeredTimeline { // Finally, replace the frozen in-memory layer with the new on-disk layers { - let mut layers = self.layers.lock().unwrap(); + let mut layers = self.layers.write().unwrap(); let l = layers.frozen_layers.pop_front(); // Only one thread may call this function at a time (for this @@ -1612,7 +1612,7 @@ impl LayeredTimeline { lsn: Lsn, threshold: usize, ) -> Result { - let layers = self.layers.lock().unwrap(); + let layers = self.layers.read().unwrap(); for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn)?; @@ -1670,7 +1670,7 @@ impl LayeredTimeline { // FIXME: Do we need to do something to upload it to remote storage here? 
- let mut layers = self.layers.lock().unwrap(); + let mut layers = self.layers.write().unwrap(); layers.insert_historic(Arc::new(image_layer)); drop(layers); @@ -1678,7 +1678,7 @@ impl LayeredTimeline { } fn compact_level0(&self, target_file_size: u64) -> Result<()> { - let layers = self.layers.lock().unwrap(); + let layers = self.layers.read().unwrap(); let level0_deltas = layers.get_level0_deltas()?; @@ -1768,7 +1768,7 @@ impl LayeredTimeline { layer_paths.pop().unwrap(); } - let mut layers = self.layers.lock().unwrap(); + let mut layers = self.layers.write().unwrap(); for l in new_layers { layers.insert_historic(Arc::new(l)); } @@ -1850,7 +1850,7 @@ impl LayeredTimeline { // 2. it doesn't need to be retained for 'retain_lsns'; // 3. newer on-disk image layers cover the layer's whole key range // - let mut layers = self.layers.lock().unwrap(); + let mut layers = self.layers.write().unwrap(); 'outer: for l in layers.iter_historic_layers() { // This layer is in the process of being flushed to disk. // It will be swapped out of the layer map, replaced with From d5ae9db997711d770b52511f8bbd2eef8067cedc Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 14 Apr 2022 10:09:03 -0400 Subject: [PATCH 0139/1022] Add s3 cost estimate to tests (#1478) --- pageserver/src/layered_repository.rs | 22 ++++++++++++++++- test_runner/fixtures/benchmark_fixture.py | 30 ++++++++++------------- test_runner/fixtures/compare_fixtures.py | 13 ++++++++++ 3 files changed, 47 insertions(+), 18 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 95df385cfe..36b081e400 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -49,7 +49,8 @@ use crate::CheckpointConfig; use crate::{ZTenantId, ZTimelineId}; use zenith_metrics::{ - register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec, IntGauge, IntGaugeVec, + register_histogram_vec, register_int_counter, register_int_gauge_vec, Histogram, HistogramVec, + IntCounter, IntGauge, IntGaugeVec, }; use zenith_utils::crashsafe_dir; use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; @@ -109,6 +110,21 @@ lazy_static! { .expect("failed to define a metric"); } +// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, +// or in testing they estimate how much we would upload if we did. +lazy_static! { + static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!( + "pageserver_num_persistent_files_created", + "Number of files created that are meant to be uploaded to cloud storage", + ) + .expect("failed to define a metric"); + static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!( + "pageserver_persistent_bytes_written", + "Total bytes written that are meant to be uploaded to cloud storage", + ) + .expect("failed to define a metric"); +} + /// Parts of the `.zenith/tenants//timelines/` directory prefix. 
pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; @@ -1524,6 +1540,10 @@ impl LayeredTimeline { &metadata, false, )?; + + NUM_PERSISTENT_FILES_CREATED.inc_by(1); + PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); + if self.upload_layers.load(atomic::Ordering::Relaxed) { schedule_timeline_checkpoint_upload( self.tenantid, diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index a904233e98..0735f16d73 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -236,10 +236,18 @@ class ZenithBenchmarker: """ Fetch the "cumulative # of bytes written" metric from the pageserver """ - # Fetch all the exposed prometheus metrics from page server - all_metrics = pageserver.http_client().get_metrics() - # Use a regular expression to extract the one we're interested in - # + metric_name = r'pageserver_disk_io_bytes{io_operation="write"}' + return self.get_int_counter_value(pageserver, metric_name) + + def get_peak_mem(self, pageserver) -> int: + """ + Fetch the "maxrss" metric from the pageserver + """ + metric_name = r'pageserver_maxrss_kb' + return self.get_int_counter_value(pageserver, metric_name) + + def get_int_counter_value(self, pageserver, metric_name) -> int: + """Fetch the value of given int counter from pageserver metrics.""" # TODO: If we start to collect more of the prometheus metrics in the # performance test suite like this, we should refactor this to load and # parse all the metrics into a more convenient structure in one go. @@ -247,20 +255,8 @@ class ZenithBenchmarker: # The metric should be an integer, as it's a number of bytes. But in general # all prometheus metrics are floats. So to be pedantic, read it as a float # and round to integer. 
- matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$', - all_metrics, - re.MULTILINE) - assert matches - return int(round(float(matches.group(1)))) - - def get_peak_mem(self, pageserver) -> int: - """ - Fetch the "maxrss" metric from the pageserver - """ - # Fetch all the exposed prometheus metrics from page server all_metrics = pageserver.http_client().get_metrics() - # See comment in get_io_writes() - matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics, re.MULTILINE) + matches = re.search(fr'^{metric_name} (\S+)$', all_metrics, re.MULTILINE) assert matches return int(round(float(matches.group(1)))) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 3c6a923587..93912d2da7 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -105,6 +105,19 @@ class ZenithCompare(PgCompare): 'MB', report=MetricReport.LOWER_IS_BETTER) + total_files = self.zenbenchmark.get_int_counter_value( + self.env.pageserver, "pageserver_num_persistent_files_created") + total_bytes = self.zenbenchmark.get_int_counter_value( + self.env.pageserver, "pageserver_persistent_bytes_written") + self.zenbenchmark.record("data_uploaded", + total_bytes / (1024 * 1024), + "MB", + report=MetricReport.LOWER_IS_BETTER) + self.zenbenchmark.record("num_files_uploaded", + total_files, + "", + report=MetricReport.LOWER_IS_BETTER) + def record_pageserver_writes(self, out_name): return self.zenbenchmark.record_pageserver_writes(self.env.pageserver, out_name) From 93e0ac2b7ae84747188d0da98061333b4a52a150 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 14 Apr 2022 16:17:47 +0300 Subject: [PATCH 0140/1022] Remove a couple of unused dependencies. Found by "cargo-udeps" --- Cargo.lock | 2 -- pageserver/Cargo.toml | 1 - proxy/Cargo.toml | 1 - 3 files changed, 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0584b9d6d2..5027c4bdc7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1551,7 +1551,6 @@ dependencies = [ "tokio-util 0.7.0", "toml_edit", "tracing", - "tracing-futures", "url", "workspace_hack", "zenith_metrics", @@ -1938,7 +1937,6 @@ dependencies = [ "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", "tokio-postgres-rustls", "tokio-rustls 0.22.0", - "tokio-stream", "workspace_hack", "zenith_metrics", "zenith_utils", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index dccdca291c..e92ac0421c 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -37,7 +37,6 @@ toml_edit = { version = "0.13", features = ["easy"] } scopeguard = "1.1.0" const_format = "0.2.21" tracing = "0.1.27" -tracing-futures = "0.2" signal-hook = "0.3.10" url = "2" nix = "0.23" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 56b6dd7e20..be03a2d4a9 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -31,7 +31,6 @@ thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } tokio-rustls = "0.22.0" -tokio-stream = "0.1.8" zenith_utils = { path = "../zenith_utils" } zenith_metrics = { path = "../zenith_metrics" } From 2cb39a162431716eeb835656c45ca1cff4eab544 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 14 Apr 2022 14:04:45 +0300 Subject: [PATCH 0141/1022] add missing files, update workspace hack --- Cargo.lock | 8 ++++---- workspace_hack/.gitattributes | 4 ++++ 
workspace_hack/Cargo.toml | 16 +++++++++++----- workspace_hack/build.rs | 2 ++ 4 files changed, 21 insertions(+), 9 deletions(-) create mode 100644 workspace_hack/.gitattributes create mode 100644 workspace_hack/build.rs diff --git a/Cargo.lock b/Cargo.lock index 5027c4bdc7..3a75687b36 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2112,7 +2112,6 @@ dependencies = [ "serde_urlencoded", "tokio", "tokio-rustls 0.23.2", - "tokio-util 0.6.9", "url", "wasm-bindgen", "wasm-bindgen-futures", @@ -3390,19 +3389,20 @@ dependencies = [ "anyhow", "bytes", "cc", + "chrono", "clap 2.34.0", "either", "hashbrown", + "indexmap", "libc", "log", "memchr", "num-integer", "num-traits", - "proc-macro2", - "quote", + "prost", + "rand", "regex", "regex-syntax", - "reqwest", "scopeguard", "serde", "syn", diff --git a/workspace_hack/.gitattributes b/workspace_hack/.gitattributes new file mode 100644 index 0000000000..3e9dba4b64 --- /dev/null +++ b/workspace_hack/.gitattributes @@ -0,0 +1,4 @@ +# Avoid putting conflict markers in the generated Cargo.toml file, since their presence breaks +# Cargo. +# Also do not check out the file as CRLF on Windows, as that's what hakari needs. +Cargo.toml merge=binary -crlf diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 6e6a0e09d7..84244b3363 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -16,32 +16,38 @@ publish = false [dependencies] anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } +chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } hashbrown = { version = "0.11", features = ["ahash", "inline-more", "raw"] } +indexmap = { version = "1", default-features = false, features = ["std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } memchr = { version = "2", features = ["std", "use_std"] } num-integer = { version = "0.1", default-features = false, features = ["std"] } num-traits = { version = "0.2", features = ["std"] } +prost = { version = "0.9", features = ["prost-derive", "std"] } +rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } -reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "hyper-rustls", "json", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "stream", "tokio-rustls", "tokio-util", "webpki-roots"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } -tokio = { version = "1", features = ["bytes", "fs", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "sync", "time", "tokio-macros"] } -tracing 
= { version = "0.1", features = ["attributes", "std", "tracing-attributes"] } +tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } +tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } tracing-core = { version = "0.1", features = ["lazy_static", "std"] } [build-dependencies] +anyhow = { version = "1", features = ["backtrace", "std"] } +bytes = { version = "1", features = ["serde", "std"] } cc = { version = "1", default-features = false, features = ["jobserver", "parallel"] } clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } +hashbrown = { version = "0.11", features = ["ahash", "inline-more", "raw"] } +indexmap = { version = "1", default-features = false, features = ["std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } memchr = { version = "2", features = ["std", "use_std"] } -proc-macro2 = { version = "1", features = ["proc-macro"] } -quote = { version = "1", features = ["proc-macro"] } +prost = { version = "0.9", features = ["prost-derive", "std"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } diff --git a/workspace_hack/build.rs b/workspace_hack/build.rs new file mode 100644 index 0000000000..92518ef04c --- /dev/null +++ b/workspace_hack/build.rs @@ -0,0 +1,2 @@ +// A build script is required for cargo to consider build dependencies. 
+fn main() {} From e97f94cc30b7f08f308ce4086eae2f9497b0e413 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 14 Apr 2022 19:49:01 +0300 Subject: [PATCH 0142/1022] Bump rustc version --- .circleci/config.yml | 4 ++-- Dockerfile | 8 ++++---- Dockerfile.build | 2 +- Dockerfile.compute-tools | 2 +- README.md | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index f05e64072a..5aae143e48 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,10 +5,10 @@ executors: resource_class: xlarge docker: # NB: when changed, do not forget to update rust image tag in all Dockerfiles - - image: zimg/rust:1.56 + - image: zimg/rust:1.58 zenith-executor: docker: - - image: zimg/rust:1.56 + - image: zimg/rust:1.58 jobs: check-codestyle-rust: diff --git a/Dockerfile b/Dockerfile index babc3b8e1d..955d26cd0b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # Build Postgres # -#FROM zimg/rust:1.56 AS pg-build -FROM zenithdb/build:buster-20220309 AS pg-build +#FROM zimg/rust:1.58 AS pg-build +FROM zenithdb/build:buster-20220414 AS pg-build WORKDIR /pg USER root @@ -17,8 +17,8 @@ RUN set -e \ # Build zenith binaries # -#FROM zimg/rust:1.56 AS build -FROM zenithdb/build:buster-20220309 AS build +#FROM zimg/rust:1.58 AS build +FROM zenithdb/build:buster-20220414 AS build ARG GIT_VERSION=local ARG CACHEPOT_BUCKET=zenith-rust-cachepot diff --git a/Dockerfile.build b/Dockerfile.build index 44a2aaafb9..c7d239647f 100644 --- a/Dockerfile.build +++ b/Dockerfile.build @@ -1,4 +1,4 @@ -FROM rust:1.56.1-slim-buster +FROM rust:1.58-slim-buster WORKDIR /home/circleci/project RUN set -e \ diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index f7672251e6..6a35a71bb3 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,6 +1,6 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .circle/config.yml -FROM zenithdb/build:buster-20220309 AS rust-build +FROM zenithdb/build:buster-20220414 AS rust-build WORKDIR /zenith diff --git a/README.md b/README.md index f99785e683..03f86887a7 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libsec libssl-dev clang pkg-config libpq-dev ``` -[Rust] 1.56.1 or later is also required. +[Rust] 1.58 or later is also required. To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively. 
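The workspace_hack files added in patch 0141 above (build.rs, .gitattributes and the regenerated Cargo.toml) look like cargo-hakari output; the new .gitattributes comment mentions hakari explicitly. Assuming that is indeed the tool keeping this crate in sync, the usual way to refresh and check it would be something along these lines (a sketch, not part of any patch here):

    cargo hakari generate   # rewrite workspace_hack/Cargo.toml from the current dependency graph
    cargo hakari verify     # exit non-zero if the checked-in file is stale
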
From c9d897f9b6fecce83549aea725fd79cd8bdcdad8 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 15 Apr 2022 12:06:25 +0300 Subject: [PATCH 0143/1022] [proxy] Update rustls (#1510) --- Cargo.lock | 33 +++++++++++---------------------- proxy/Cargo.toml | 7 ++++--- proxy/src/config.rs | 28 +++++++++++++++++----------- proxy/src/proxy.rs | 18 ++++++++++++++---- 4 files changed, 46 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3a75687b36..6409b33055 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1066,7 +1066,7 @@ dependencies = [ "hyper", "rustls 0.20.2", "tokio", - "tokio-rustls 0.23.2", + "tokio-rustls", ] [[package]] @@ -1926,7 +1926,8 @@ dependencies = [ "reqwest", "routerify 2.2.0", "rstest", - "rustls 0.19.1", + "rustls 0.20.2", + "rustls-pemfile", "scopeguard", "serde", "serde_json", @@ -1936,7 +1937,7 @@ dependencies = [ "tokio", "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", "tokio-postgres-rustls", - "tokio-rustls 0.22.0", + "tokio-rustls", "workspace_hack", "zenith_metrics", "zenith_utils", @@ -2111,7 +2112,7 @@ dependencies = [ "serde_json", "serde_urlencoded", "tokio", - "tokio-rustls 0.23.2", + "tokio-rustls", "url", "wasm-bindgen", "wasm-bindgen-futures", @@ -2823,35 +2824,23 @@ dependencies = [ [[package]] name = "tokio-postgres-rustls" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bd8c37d8c23cb6ecdc32fc171bade4e9c7f1be65f693a17afbaad02091a0a19" +checksum = "606f2b73660439474394432239c82249c0d45eb5f23d91f401be1e33590444a7" dependencies = [ "futures", "ring", - "rustls 0.19.1", + "rustls 0.20.2", "tokio", "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "tokio-rustls 0.22.0", - "webpki 0.21.4", + "tokio-rustls", ] [[package]] name = "tokio-rustls" -version = "0.22.0" +version = "0.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" -dependencies = [ - "rustls 0.19.1", - "tokio", - "webpki 0.21.4", -] - -[[package]] -name = "tokio-rustls" -version = "0.23.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a27d5f2b839802bd8267fa19b0530f5a08b9c08cd417976be2a65d130fe1c11b" +checksum = "4151fda0cf2798550ad0b34bcfc9b9dcc2a9d2471c895c68f3a8818e54f2389e" dependencies = [ "rustls 0.20.2", "tokio", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index be03a2d4a9..20b459988a 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -21,7 +21,8 @@ pin-project-lite = "0.2.7" rand = "0.8.3" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } routerify = "2" -rustls = "0.19.1" +rustls = "0.20.0" +rustls-pemfile = "0.2.1" scopeguard = "1.1.0" serde = "1" serde_json = "1" @@ -30,7 +31,7 @@ socket2 = "0.4.4" thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -tokio-rustls = "0.22.0" +tokio-rustls = "0.23.0" zenith_utils = { path = "../zenith_utils" } zenith_metrics = { path = "../zenith_metrics" } @@ -40,4 +41,4 @@ workspace_hack = { version = "0.1", path = "../workspace_hack" } async-trait = "0.1" rcgen = "0.8.14" rstest = "0.12" -tokio-postgres-rustls = "0.8.0" +tokio-postgres-rustls = "0.9.0" diff --git a/proxy/src/config.rs 
b/proxy/src/config.rs index 077ff02898..aef079d089 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,10 +1,9 @@ -use anyhow::{anyhow, bail, ensure, Context}; -use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig}; +use anyhow::{bail, ensure, Context}; use std::net::SocketAddr; use std::str::FromStr; use std::sync::Arc; -pub type TlsConfig = Arc; +pub type TlsConfig = Arc; #[non_exhaustive] pub enum ClientAuthMethod { @@ -61,21 +60,28 @@ pub struct ProxyConfig { pub fn configure_ssl(key_path: &str, cert_path: &str) -> anyhow::Result { let key = { let key_bytes = std::fs::read(key_path).context("SSL key file")?; - let mut keys = pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .map_err(|_| anyhow!("couldn't read TLS keys"))?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) + .context("couldn't read TLS keys")?; + ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().unwrap() + keys.pop().map(rustls::PrivateKey).unwrap() }; let cert_chain = { let cert_chain_bytes = std::fs::read(cert_path).context("SSL cert file")?; - pemfile::certs(&mut &cert_chain_bytes[..]) - .map_err(|_| anyhow!("couldn't read TLS certificates"))? + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .context("couldn't read TLS certificate chain")? + .into_iter() + .map(rustls::Certificate) + .collect() }; - let mut config = ServerConfig::new(NoClientAuth::new()); - config.set_single_cert(cert_chain, key)?; - config.versions = vec![ProtocolVersion::TLSv1_3]; + let config = rustls::ServerConfig::builder() + .with_safe_default_cipher_suites() + .with_safe_default_kx_groups() + .with_protocol_versions(&[&rustls::version::TLS13])? + .with_no_client_auth() + .with_single_cert(cert_chain, key)?; Ok(config.into()) } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 5b662f4c69..788179252b 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -265,14 +265,24 @@ mod tests { let (ca, cert, key) = generate_certs(hostname)?; let server_config = { - let mut config = rustls::ServerConfig::new(rustls::NoClientAuth::new()); - config.set_single_cert(vec![cert], key)?; + let config = rustls::ServerConfig::builder() + .with_safe_defaults() + .with_no_client_auth() + .with_single_cert(vec![cert], key)?; + config.into() }; let client_config = { - let mut config = rustls::ClientConfig::new(); - config.root_store.add(&ca)?; + let config = rustls::ClientConfig::builder() + .with_safe_defaults() + .with_root_certificates({ + let mut store = rustls::RootCertStore::empty(); + store.add(&ca)?; + store + }) + .with_no_client_auth(); + ClientConfig { config, hostname } }; From ab20f2c4918a0031545e2d3d49e0bfd25faa5181 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 15 Apr 2022 18:36:11 +0300 Subject: [PATCH 0144/1022] Use the same version of `rust-postgres` everywhere. (#1516) Turns out we still had a stale dep in `compute_tools`. 
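A stale pin like this shows up as the same repository appearing under two different revs in Cargo.lock, as the hunks below illustrate. A rough way to check for it ahead of time, sketched here rather than taken from the patch (the package spec for cargo tree may need disambiguating when two copies are resolved):

    grep 'zenithdb/rust-postgres' Cargo.lock | sort -u   # two distinct source lines means two revs are still in use
    cargo tree -i postgres                               # show which workspace crates pull in the resolved package
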
--- Cargo.lock | 104 ++++++++------------------------------- Cargo.toml | 4 +- compute_tools/Cargo.toml | 2 +- 3 files changed, 23 insertions(+), 87 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6409b33055..0cdeb106ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -340,13 +340,13 @@ dependencies = [ "hyper", "libc", "log", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", + "postgres", "regex", "serde", "serde_json", "tar", "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "tokio-postgres", "workspace_hack", ] @@ -378,7 +378,7 @@ dependencies = [ "lazy_static", "nix", "pageserver", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "postgres", "regex", "reqwest", "serde", @@ -1529,9 +1529,9 @@ dependencies = [ "log", "nix", "once_cell", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "postgres", + "postgres-protocol", + "postgres-types", "postgres_ffi", "rand", "regex", @@ -1546,7 +1546,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "tokio-postgres", "tokio-stream", "tokio-util 0.7.0", "toml_edit", @@ -1717,23 +1717,9 @@ dependencies = [ "fallible-iterator", "futures", "log", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "postgres-protocol", "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", -] - -[[package]] -name = "postgres" -version = "0.19.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" -dependencies = [ - "bytes", - "fallible-iterator", - "futures", - "log", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", - "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", + "tokio-postgres", ] [[package]] @@ -1754,24 +1740,6 @@ dependencies = [ "stringprep", ] -[[package]] -name = "postgres-protocol" -version = "0.6.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" -dependencies = [ - "base64 0.13.0", - "byteorder", - "bytes", - "fallible-iterator", - "hmac 0.10.1", - "lazy_static", - "md-5", - "memchr", - "rand", - "sha2", - "stringprep", -] - [[package]] name = "postgres-types" version = "0.2.1" @@ -1779,17 +1747,7 @@ source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d5 dependencies = [ "bytes", "fallible-iterator", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", -] - -[[package]] -name = "postgres-types" -version = "0.2.1" -source = 
"git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" -dependencies = [ - "bytes", - "fallible-iterator", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", + "postgres-protocol", ] [[package]] @@ -1935,7 +1893,7 @@ dependencies = [ "socket2", "thiserror", "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls", "workspace_hack", @@ -2793,30 +2751,8 @@ dependencies = [ "percent-encoding", "phf", "pin-project-lite", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "socket2", - "tokio", - "tokio-util 0.6.9", -] - -[[package]] -name = "tokio-postgres" -version = "0.7.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" -dependencies = [ - "async-trait", - "byteorder", - "bytes", - "fallible-iterator", - "futures", - "log", - "parking_lot", - "percent-encoding", - "phf", - "pin-project-lite", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", - "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", + "postgres-protocol", + "postgres-types", "socket2", "tokio", "tokio-util 0.6.9", @@ -2832,7 +2768,7 @@ dependencies = [ "ring", "rustls 0.20.2", "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "tokio-postgres", "tokio-rustls", ] @@ -3171,8 +3107,8 @@ dependencies = [ "humantime", "hyper", "lazy_static", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "postgres", + "postgres-protocol", "postgres_ffi", "regex", "rusoto_core", @@ -3183,7 +3119,7 @@ dependencies = [ "signal-hook", "tempfile", "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "tokio-postgres", "tokio-util 0.7.0", "tracing", "url", @@ -3432,7 +3368,7 @@ dependencies = [ "clap 3.0.14", "control_plane", "pageserver", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "postgres", "postgres_ffi", "serde_json", "walkeeper", @@ -3468,8 +3404,8 @@ dependencies = [ "lazy_static", "nix", "pin-project-lite", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "postgres", + "postgres-protocol", "rand", "routerify 3.0.0", "rustls 0.19.1", diff --git a/Cargo.toml b/Cargo.toml index f3ac36dcb2..b8283a6112 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ resolver = "2" # Besides, debug info should not affect the performance. 
debug = true -# This is only needed for proxy's tests -# TODO: we should probably fork tokio-postgres-rustls instead +# This is only needed for proxy's tests. +# TODO: we should probably fork `tokio-postgres-rustls` instead. [patch.crates-io] tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index fc52ce4e83..856ec45c73 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -11,7 +11,7 @@ clap = "3.0" env_logger = "0.9" hyper = { version = "0.14", features = ["full"] } log = { version = "0.4", features = ["std", "serde"] } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } +postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } regex = "1" serde = { version = "1.0", features = ["derive"] } serde_json = "1" From 9946cd11256fc48c1b765cf62a6510c9a851251b Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 15 Apr 2022 18:52:44 +0400 Subject: [PATCH 0145/1022] Bump vendor/postgres to add safekeeper connection timeout. --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 61afbf978b..d7c8426e49 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 61afbf978b17764134ab6f1650bbdcadac147e71 +Subproject commit d7c8426e49cff3c791c3f2c4cde95f1fce665573 From 71269799500205ccd574d7820406309b2b1665de Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 15 Apr 2022 19:09:41 +0300 Subject: [PATCH 0146/1022] Remove custom neon Docker build image --- Dockerfile | 11 +++-------- Dockerfile.build | 23 ----------------------- Dockerfile.compute-tools | 5 ++--- 3 files changed, 5 insertions(+), 34 deletions(-) delete mode 100644 Dockerfile.build diff --git a/Dockerfile b/Dockerfile index 955d26cd0b..5e579be4e7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,5 @@ # Build Postgres -# -#FROM zimg/rust:1.58 AS pg-build -FROM zenithdb/build:buster-20220414 AS pg-build +FROM zimg/rust:1.58 AS pg-build WORKDIR /pg USER root @@ -16,22 +14,19 @@ RUN set -e \ && tar -C tmp_install -czf /postgres_install.tar.gz . # Build zenith binaries -# -#FROM zimg/rust:1.58 AS build -FROM zenithdb/build:buster-20220414 AS build +FROM zimg/rust:1.58 AS build ARG GIT_VERSION=local ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY -ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, loosing the compilation stats. 
-RUN cargo build --release && /usr/local/cargo/bin/cachepot -s +RUN cargo build --release && cachepot -s # Build final image # diff --git a/Dockerfile.build b/Dockerfile.build deleted file mode 100644 index c7d239647f..0000000000 --- a/Dockerfile.build +++ /dev/null @@ -1,23 +0,0 @@ -FROM rust:1.58-slim-buster -WORKDIR /home/circleci/project - -RUN set -e \ - && apt-get update \ - && apt-get -yq install \ - automake \ - libtool \ - build-essential \ - bison \ - flex \ - libreadline-dev \ - zlib1g-dev \ - libxml2-dev \ - libseccomp-dev \ - pkg-config \ - libssl-dev \ - clang - -RUN set -e \ - && rustup component add clippy \ - && cargo install cargo-audit \ - && cargo install --git https://github.com/paritytech/cachepot diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 6a35a71bb3..a0cc21105b 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,17 +1,16 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .circle/config.yml -FROM zenithdb/build:buster-20220414 AS rust-build +FROM zimg/rust:1.58 AS rust-build WORKDIR /zenith ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY -ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot COPY . . -RUN cargo build -p compute_tools --release && /usr/local/cargo/bin/cachepot -s +RUN cargo build -p compute_tools --release && cachepot -s # Final image that only has one binary FROM debian:buster-slim From 3ab090b43ad71643f457108613b89c521346d612 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 15 Apr 2022 21:32:08 +0300 Subject: [PATCH 0147/1022] Fix compute tools build --- Dockerfile.compute-tools | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index a0cc21105b..27bfbb5d1b 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -2,8 +2,6 @@ # NB: keep in sync with rust image version in .circle/config.yml FROM zimg/rust:1.58 AS rust-build -WORKDIR /zenith - ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY @@ -15,4 +13,4 @@ RUN cargo build -p compute_tools --release && cachepot -s # Final image that only has one binary FROM debian:buster-slim -COPY --from=rust-build /zenith/target/release/zenith_ctl /usr/local/bin/zenith_ctl +COPY --from=rust-build /home/circleci/project/target/release/zenith_ctl /usr/local/bin/zenith_ctl From 4bc338babc22b835c377239651c97b5227053217 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 16 Apr 2022 10:01:42 +0300 Subject: [PATCH 0148/1022] Revert libc upgrade --- Dockerfile | 10 +++++++--- Dockerfile.build | 23 +++++++++++++++++++++++ Dockerfile.compute-tools | 13 ++++++++++--- 3 files changed, 40 insertions(+), 6 deletions(-) create mode 100644 Dockerfile.build diff --git a/Dockerfile b/Dockerfile index 5e579be4e7..a6ac923187 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,6 @@ # Build Postgres -FROM zimg/rust:1.58 AS pg-build +#FROM zimg/rust:1.58 AS pg-build +FROM zenithdb/build:buster-20220414 AS pg-build WORKDIR /pg USER root @@ -14,7 +15,8 @@ RUN set -e \ && tar -C tmp_install -czf /postgres_install.tar.gz . # Build zenith binaries -FROM zimg/rust:1.58 AS build +#FROM zimg/rust:1.58 AS build +FROM zenithdb/build:buster-20220414 AS build ARG GIT_VERSION=local ARG CACHEPOT_BUCKET=zenith-rust-cachepot @@ -26,7 +28,9 @@ COPY . . # Show build caching stats to check if it was used in the end. 
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, loosing the compilation stats. -RUN cargo build --release && cachepot -s +#RUN cargo build --release && cachepot -s +ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot +RUN cargo build --release && /usr/local/cargo/bin/cachepot -s # Build final image # diff --git a/Dockerfile.build b/Dockerfile.build new file mode 100644 index 0000000000..c7d239647f --- /dev/null +++ b/Dockerfile.build @@ -0,0 +1,23 @@ +FROM rust:1.58-slim-buster +WORKDIR /home/circleci/project + +RUN set -e \ + && apt-get update \ + && apt-get -yq install \ + automake \ + libtool \ + build-essential \ + bison \ + flex \ + libreadline-dev \ + zlib1g-dev \ + libxml2-dev \ + libseccomp-dev \ + pkg-config \ + libssl-dev \ + clang + +RUN set -e \ + && rustup component add clippy \ + && cargo install cargo-audit \ + && cargo install --git https://github.com/paritytech/cachepot diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 27bfbb5d1b..18ebe61384 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,6 +1,10 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .circle/config.yml -FROM zimg/rust:1.58 AS rust-build + +#FROM zimg/rust:1.58 AS rust-build +FROM zenithdb/build:buster-20220414 AS rust-build + +WORKDIR /zenith ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID @@ -8,9 +12,12 @@ ARG AWS_SECRET_ACCESS_KEY COPY . . -RUN cargo build -p compute_tools --release && cachepot -s +#RUN cargo build -p compute_tools --release && cachepot -s +ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot +RUN cargo build -p compute_tools --release && /usr/local/cargo/bin/cachepot -s # Final image that only has one binary FROM debian:buster-slim -COPY --from=rust-build /home/circleci/project/target/release/zenith_ctl /usr/local/bin/zenith_ctl +#COPY --from=rust-build /home/circleci/project/target/release/zenith_ctl /usr/local/bin/zenith_ctl +COPY --from=rust-build /zenith/target/release/zenith_ctl /usr/local/bin/zenith_ctl From ed5f9acca94532b114b841017fa0492e349b6ef6 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 16 Apr 2022 13:38:48 +0300 Subject: [PATCH 0149/1022] Revert "Revert libc upgrade" (#1527) This reverts commit 4bc338babc22b835c377239651c97b5227053217. --- Dockerfile | 10 +++------- Dockerfile.build | 23 ----------------------- Dockerfile.compute-tools | 13 +++---------- 3 files changed, 6 insertions(+), 40 deletions(-) delete mode 100644 Dockerfile.build diff --git a/Dockerfile b/Dockerfile index a6ac923187..5e579be4e7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,5 @@ # Build Postgres -#FROM zimg/rust:1.58 AS pg-build -FROM zenithdb/build:buster-20220414 AS pg-build +FROM zimg/rust:1.58 AS pg-build WORKDIR /pg USER root @@ -15,8 +14,7 @@ RUN set -e \ && tar -C tmp_install -czf /postgres_install.tar.gz . # Build zenith binaries -#FROM zimg/rust:1.58 AS build -FROM zenithdb/build:buster-20220414 AS build +FROM zimg/rust:1.58 AS build ARG GIT_VERSION=local ARG CACHEPOT_BUCKET=zenith-rust-cachepot @@ -28,9 +26,7 @@ COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, loosing the compilation stats. 
-#RUN cargo build --release && cachepot -s -ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot -RUN cargo build --release && /usr/local/cargo/bin/cachepot -s +RUN cargo build --release && cachepot -s # Build final image # diff --git a/Dockerfile.build b/Dockerfile.build deleted file mode 100644 index c7d239647f..0000000000 --- a/Dockerfile.build +++ /dev/null @@ -1,23 +0,0 @@ -FROM rust:1.58-slim-buster -WORKDIR /home/circleci/project - -RUN set -e \ - && apt-get update \ - && apt-get -yq install \ - automake \ - libtool \ - build-essential \ - bison \ - flex \ - libreadline-dev \ - zlib1g-dev \ - libxml2-dev \ - libseccomp-dev \ - pkg-config \ - libssl-dev \ - clang - -RUN set -e \ - && rustup component add clippy \ - && cargo install cargo-audit \ - && cargo install --git https://github.com/paritytech/cachepot diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 18ebe61384..27bfbb5d1b 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,10 +1,6 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .circle/config.yml - -#FROM zimg/rust:1.58 AS rust-build -FROM zenithdb/build:buster-20220414 AS rust-build - -WORKDIR /zenith +FROM zimg/rust:1.58 AS rust-build ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID @@ -12,12 +8,9 @@ ARG AWS_SECRET_ACCESS_KEY COPY . . -#RUN cargo build -p compute_tools --release && cachepot -s -ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot -RUN cargo build -p compute_tools --release && /usr/local/cargo/bin/cachepot -s +RUN cargo build -p compute_tools --release && cachepot -s # Final image that only has one binary FROM debian:buster-slim -#COPY --from=rust-build /home/circleci/project/target/release/zenith_ctl /usr/local/bin/zenith_ctl -COPY --from=rust-build /zenith/target/release/zenith_ctl /usr/local/bin/zenith_ctl +COPY --from=rust-build /home/circleci/project/target/release/zenith_ctl /usr/local/bin/zenith_ctl From 787f0d33f0f15209e1c7d803633280b6064ed11a Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 16 Apr 2022 23:36:42 +0300 Subject: [PATCH 0150/1022] Use another cachepot bucket for rust Docker build caches --- Dockerfile | 2 +- Dockerfile.compute-tools | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5e579be4e7..b2d4971345 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ RUN set -e \ FROM zimg/rust:1.58 AS build ARG GIT_VERSION=local -ARG CACHEPOT_BUCKET=zenith-rust-cachepot +ARG CACHEPOT_BUCKET=zenith-rust-cachepot-docker ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 27bfbb5d1b..dc67ae3032 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -2,7 +2,7 @@ # NB: keep in sync with rust image version in .circle/config.yml FROM zimg/rust:1.58 AS rust-build -ARG CACHEPOT_BUCKET=zenith-rust-cachepot +ARG CACHEPOT_BUCKET=zenith-rust-cachepot-docker ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY From 3136a0754a85b20a8f20d623d63add3399b51c13 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 16 Apr 2022 23:03:13 +0300 Subject: [PATCH 0151/1022] Use mold in Docker images --- Dockerfile | 4 ++-- Dockerfile.compute-tools | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index b2d4971345..3467359ac4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ COPY Makefile Makefile ENV BUILD_TYPE release RUN set -e \ - && make -j $(nproc) -s postgres \ + && mold 
-run make -j $(nproc) -s postgres \ && rm -rf tmp_install/build \ && tar -C tmp_install -czf /postgres_install.tar.gz . @@ -26,7 +26,7 @@ COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, loosing the compilation stats. -RUN cargo build --release && cachepot -s +RUN mold -run cargo build --release && cachepot -s # Build final image # diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index dc67ae3032..c2e33b9d98 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -8,7 +8,7 @@ ARG AWS_SECRET_ACCESS_KEY COPY . . -RUN cargo build -p compute_tools --release && cachepot -s +RUN mold -run cargo build -p compute_tools --release && cachepot -s # Final image that only has one binary FROM debian:buster-slim From 9b7dcc2bae88d3aedc97541066416c34993e9533 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sun, 17 Apr 2022 15:42:38 +0300 Subject: [PATCH 0152/1022] Use proper cachepot bucket --- .circleci/config.yml | 2 -- Dockerfile | 2 +- Dockerfile.compute-tools | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5aae143e48..8752da506d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -117,8 +117,6 @@ jobs: fi export CARGO_INCREMENTAL=0 - export CACHEPOT_BUCKET=zenith-rust-cachepot - export RUSTC_WRAPPER=cachepot export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests diff --git a/Dockerfile b/Dockerfile index 3467359ac4..ebc8731168 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ RUN set -e \ FROM zimg/rust:1.58 AS build ARG GIT_VERSION=local -ARG CACHEPOT_BUCKET=zenith-rust-cachepot-docker +ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index c2e33b9d98..3fc8702f3f 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -2,7 +2,7 @@ # NB: keep in sync with rust image version in .circle/config.yml FROM zimg/rust:1.58 AS rust-build -ARG CACHEPOT_BUCKET=zenith-rust-cachepot-docker +ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY From 0ca2bd929b8753e946fff83cdaa8f2b0062f6ae1 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 15 Apr 2022 22:53:31 +0300 Subject: [PATCH 0153/1022] Remove log crate from pageserver --- Cargo.lock | 1 - pageserver/Cargo.toml | 1 - pageserver/src/basebackup.rs | 2 +- pageserver/src/layered_repository/delta_layer.rs | 2 +- pageserver/src/layered_repository/image_layer.rs | 2 +- pageserver/src/layered_repository/inmemory_layer.rs | 2 +- pageserver/src/tenant_mgr.rs | 2 +- pageserver/src/walredo.rs | 2 +- 8 files changed, 6 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0cdeb106ec..e93e73f087 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1526,7 +1526,6 @@ dependencies = [ "hyper", "itertools", "lazy_static", - "log", "nix", "once_cell", "postgres", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index e92ac0421c..3825795059 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -14,7 +14,6 @@ hex = "0.4.3" hyper = "0.14" itertools = "0.10.3" lazy_static = "1.4.0" -log = "0.4.14" clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", 
"io-util", "time"] } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 3caf27b9b3..077e7c9f83 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -12,13 +12,13 @@ //! use anyhow::{ensure, Context, Result}; use bytes::{BufMut, BytesMut}; -use log::*; use std::fmt::Write as FmtWrite; use std::io; use std::io::Write; use std::sync::Arc; use std::time::SystemTime; use tar::{Builder, EntryType, Header}; +use tracing::*; use crate::reltag::SlruKind; use crate::repository::Timeline; diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index dd6b5d3afa..6e3d65a94d 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -38,8 +38,8 @@ use crate::walrecord; use crate::{ZTenantId, ZTimelineId}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; -use log::*; use serde::{Deserialize, Serialize}; +use tracing::*; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods use std::fmt::Write as _; diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 08e635f073..0f334658bf 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -35,7 +35,6 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use hex; -use log::*; use serde::{Deserialize, Serialize}; use std::fs; use std::io::Write; @@ -43,6 +42,7 @@ use std::io::{Seek, SeekFrom}; use std::ops::Range; use std::path::{Path, PathBuf}; use std::sync::{RwLock, RwLockReadGuard}; +use tracing::*; use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::Lsn; diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index a45af51487..ffb5be1dd4 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -16,8 +16,8 @@ use crate::repository::{Key, Value}; use crate::walrecord; use crate::{ZTenantId, ZTimelineId}; use anyhow::{bail, ensure, Result}; -use log::*; use std::collections::HashMap; +use tracing::*; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods use std::fmt::Write as _; diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index aeff718803..2765554cf9 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -13,13 +13,13 @@ use crate::walredo::PostgresRedoManager; use crate::{DatadirTimelineImpl, RepositoryImpl}; use anyhow::{Context, Result}; use lazy_static::lazy_static; -use log::*; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::fmt; use std::sync::{Arc, Mutex, MutexGuard}; +use tracing::*; use zenith_utils::zid::{ZTenantId, ZTimelineId}; lazy_static! 
{ diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index ae22f1eead..b7c6ecf726 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -21,7 +21,6 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use lazy_static::lazy_static; -use log::*; use nix::poll::*; use serde::Serialize; use std::fs; @@ -35,6 +34,7 @@ use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; use std::sync::Mutex; use std::time::Duration; use std::time::Instant; +use tracing::*; use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter}; use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::Lsn; From 5b297745324f759c4aa16037a165bef251fc8252 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 4 Apr 2022 15:42:13 +0400 Subject: [PATCH 0154/1022] Small refactoring after ec3bc741653d. Move record_safekeeper_info inside safekeeper.rs, fix commit_lsn update, sync control file. --- walkeeper/src/safekeeper.rs | 52 ++++++++++++++++++++++++++++++++++--- walkeeper/src/timeline.rs | 46 ++++++-------------------------- 2 files changed, 57 insertions(+), 41 deletions(-) diff --git a/walkeeper/src/safekeeper.rs b/walkeeper/src/safekeeper.rs index 22a8481e45..cf56261ee6 100644 --- a/walkeeper/src/safekeeper.rs +++ b/walkeeper/src/safekeeper.rs @@ -6,6 +6,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_ffi::xlog_utils::TimeLineID; use serde::{Deserialize, Serialize}; +use std::cmp::max; use std::cmp::min; use std::fmt; use std::io::Read; @@ -15,6 +16,7 @@ use zenith_utils::zid::ZTenantTimelineId; use lazy_static::lazy_static; +use crate::broker::SafekeeperInfo; use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; @@ -497,6 +499,8 @@ pub struct SafeKeeper { metrics: SafeKeeperMetrics, /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn. + /// Note: be careful to set only if we are sure our WAL (term history) matches + /// committed one. pub global_commit_lsn: Lsn, /// LSN since the proposer safekeeper currently talking to appends WAL; /// determines epoch switch point. @@ -743,7 +747,9 @@ where let mut state = self.state.clone(); state.commit_lsn = self.inmem.commit_lsn; + state.s3_wal_lsn = self.inmem.s3_wal_lsn; state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; + state.remote_consistent_lsn = self.inmem.remote_consistent_lsn; state.proposer_uuid = self.inmem.proposer_uuid; self.state.persist(&state) } @@ -788,10 +794,10 @@ where self.wal_store.flush_wal()?; } - // Update global_commit_lsn, verifying that it cannot decrease. + // Update global_commit_lsn if msg.h.commit_lsn != Lsn(0) { - assert!(msg.h.commit_lsn >= self.global_commit_lsn); - self.global_commit_lsn = msg.h.commit_lsn; + // We also obtain commit lsn from peers, so value arrived here might be stale (less) + self.global_commit_lsn = max(self.global_commit_lsn, msg.h.commit_lsn); } self.inmem.peer_horizon_lsn = msg.h.truncate_lsn; @@ -835,6 +841,46 @@ where self.append_response(), ))) } + + /// Update timeline state with peer safekeeper data. + pub fn record_safekeeper_info(&mut self, sk_info: &SafekeeperInfo) -> Result<()> { + let mut sync_control_file = false; + if let (Some(commit_lsn), Some(last_log_term)) = (sk_info.commit_lsn, sk_info.last_log_term) + { + // Note: the check is too restrictive, generally we can update local + // commit_lsn if our history matches (is part of) history of advanced + // commit_lsn provider. 
+ if last_log_term == self.get_epoch() { + self.global_commit_lsn = max(commit_lsn, self.global_commit_lsn); + self.update_commit_lsn()?; + } + } + if let Some(s3_wal_lsn) = sk_info.s3_wal_lsn { + let new_s3_wal_lsn = max(s3_wal_lsn, self.inmem.s3_wal_lsn); + sync_control_file |= + self.state.s3_wal_lsn + (self.state.server.wal_seg_size as u64) < new_s3_wal_lsn; + self.inmem.s3_wal_lsn = new_s3_wal_lsn; + } + if let Some(remote_consistent_lsn) = sk_info.remote_consistent_lsn { + let new_remote_consistent_lsn = + max(remote_consistent_lsn, self.inmem.remote_consistent_lsn); + sync_control_file |= self.state.remote_consistent_lsn + + (self.state.server.wal_seg_size as u64) + < new_remote_consistent_lsn; + self.inmem.remote_consistent_lsn = new_remote_consistent_lsn; + } + if let Some(peer_horizon_lsn) = sk_info.peer_horizon_lsn { + let new_peer_horizon_lsn = max(peer_horizon_lsn, self.inmem.peer_horizon_lsn); + sync_control_file |= self.state.peer_horizon_lsn + + (self.state.server.wal_seg_size as u64) + < new_peer_horizon_lsn; + self.inmem.peer_horizon_lsn = new_peer_horizon_lsn; + } + if sync_control_file { + self.persist_control_file()?; + } + Ok(()) + } } #[cfg(test)] diff --git a/walkeeper/src/timeline.rs b/walkeeper/src/timeline.rs index a2941a9a5c..777db7eb2b 100644 --- a/walkeeper/src/timeline.rs +++ b/walkeeper/src/timeline.rs @@ -375,10 +375,9 @@ impl Timeline { } // Notify caught-up WAL senders about new WAL data received - pub fn notify_wal_senders(&self, commit_lsn: Lsn) { - let mut shared_state = self.mutex.lock().unwrap(); - if shared_state.notified_commit_lsn < commit_lsn { - shared_state.notified_commit_lsn = commit_lsn; + fn notify_wal_senders(&self, shared_state: &mut MutexGuard) { + if shared_state.notified_commit_lsn < shared_state.sk.inmem.commit_lsn { + shared_state.notified_commit_lsn = shared_state.sk.inmem.commit_lsn; self.cond.notify_all(); } } @@ -389,13 +388,9 @@ impl Timeline { msg: &ProposerAcceptorMessage, ) -> Result> { let mut rmsg: Option; - let commit_lsn: Lsn; { let mut shared_state = self.mutex.lock().unwrap(); rmsg = shared_state.sk.process_msg(msg)?; - // locally available commit lsn. flush_lsn can be smaller than - // commit_lsn if we are catching up safekeeper. - commit_lsn = shared_state.sk.inmem.commit_lsn; // if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { @@ -405,9 +400,10 @@ impl Timeline { resp.zenith_feedback = zenith_feedback; } } + + // Ping wal sender that new data might be available. + self.notify_wal_senders(&mut shared_state); } - // Ping wal sender that new data might be available. - self.notify_wal_senders(commit_lsn); Ok(rmsg) } @@ -437,34 +433,8 @@ impl Timeline { /// Update timeline state with peer safekeeper data. pub fn record_safekeeper_info(&self, sk_info: &SafekeeperInfo, _sk_id: ZNodeId) -> Result<()> { let mut shared_state = self.mutex.lock().unwrap(); - // Note: the check is too restrictive, generally we can update local - // commit_lsn if our history matches (is part of) history of advanced - // commit_lsn provider. 
- if let (Some(commit_lsn), Some(last_log_term)) = (sk_info.commit_lsn, sk_info.last_log_term) - { - if last_log_term == shared_state.sk.get_epoch() { - shared_state.sk.global_commit_lsn = - max(commit_lsn, shared_state.sk.global_commit_lsn); - shared_state.sk.update_commit_lsn()?; - let local_commit_lsn = min(commit_lsn, shared_state.sk.wal_store.flush_lsn()); - shared_state.sk.inmem.commit_lsn = - max(local_commit_lsn, shared_state.sk.inmem.commit_lsn); - } - } - if let Some(s3_wal_lsn) = sk_info.s3_wal_lsn { - shared_state.sk.inmem.s3_wal_lsn = max(s3_wal_lsn, shared_state.sk.inmem.s3_wal_lsn); - } - if let Some(remote_consistent_lsn) = sk_info.remote_consistent_lsn { - shared_state.sk.inmem.remote_consistent_lsn = max( - remote_consistent_lsn, - shared_state.sk.inmem.remote_consistent_lsn, - ); - } - if let Some(peer_horizon_lsn) = sk_info.peer_horizon_lsn { - shared_state.sk.inmem.peer_horizon_lsn = - max(peer_horizon_lsn, shared_state.sk.inmem.peer_horizon_lsn); - } - // TODO: sync control file + shared_state.sk.record_safekeeper_info(sk_info)?; + self.notify_wal_senders(&mut shared_state); Ok(()) } From 81879f8137ca91315f57ff415170dc14f411d492 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 18 Apr 2022 12:15:54 +0300 Subject: [PATCH 0155/1022] Restore missing cachepot env vars --- .circleci/config.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8752da506d..5aae143e48 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -117,6 +117,8 @@ jobs: fi export CARGO_INCREMENTAL=0 + export CACHEPOT_BUCKET=zenith-rust-cachepot + export RUSTC_WRAPPER=cachepot export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests From 81417788c8e0ed55611065cbc34c1e5366fe4ba1 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 18 Apr 2022 11:41:05 +0300 Subject: [PATCH 0156/1022] walkeeper -> safekeeper --- Cargo.lock | 82 +++++++++---------- Cargo.toml | 2 +- control_plane/Cargo.toml | 2 +- control_plane/src/safekeeper.rs | 2 +- docs/README.md | 2 +- docs/glossary.md | 8 +- docs/rfcs/009-snapshot-first-storage-cli.md | 12 +-- docs/sourcetree.md | 4 +- postgres_ffi/src/waldecoder.rs | 2 +- {walkeeper => safekeeper}/Cargo.toml | 2 +- {walkeeper => safekeeper}/README | 0 {walkeeper => safekeeper}/README_PROTO.md | 0 .../spec/ProposerAcceptorConsensus.cfg | 0 .../spec/ProposerAcceptorConsensus.tla | 0 .../src/bin/safekeeper.rs | 14 ++-- {walkeeper => safekeeper}/src/broker.rs | 0 {walkeeper => safekeeper}/src/callmemaybe.rs | 0 {walkeeper => safekeeper}/src/control_file.rs | 0 .../src/control_file_upgrade.rs | 0 {walkeeper => safekeeper}/src/handler.rs | 2 +- {walkeeper => safekeeper}/src/http/mod.rs | 0 {walkeeper => safekeeper}/src/http/models.rs | 0 {walkeeper => safekeeper}/src/http/routes.rs | 0 {walkeeper => safekeeper}/src/json_ctrl.rs | 0 {walkeeper => safekeeper}/src/lib.rs | 0 {walkeeper => safekeeper}/src/receive_wal.rs | 0 {walkeeper => safekeeper}/src/s3_offload.rs | 0 {walkeeper => safekeeper}/src/safekeeper.rs | 0 {walkeeper => safekeeper}/src/send_wal.rs | 0 {walkeeper => safekeeper}/src/timeline.rs | 0 {walkeeper => safekeeper}/src/wal_service.rs | 0 {walkeeper => safekeeper}/src/wal_storage.rs | 0 zenith/Cargo.toml | 2 +- zenith/src/main.rs | 2 +- 34 files changed, 69 insertions(+), 69 deletions(-) rename {walkeeper => safekeeper}/Cargo.toml (98%) rename {walkeeper => 
safekeeper}/README (100%) rename {walkeeper => safekeeper}/README_PROTO.md (100%) rename {walkeeper => safekeeper}/spec/ProposerAcceptorConsensus.cfg (100%) rename {walkeeper => safekeeper}/spec/ProposerAcceptorConsensus.tla (100%) rename {walkeeper => safekeeper}/src/bin/safekeeper.rs (97%) rename {walkeeper => safekeeper}/src/broker.rs (100%) rename {walkeeper => safekeeper}/src/callmemaybe.rs (100%) rename {walkeeper => safekeeper}/src/control_file.rs (100%) rename {walkeeper => safekeeper}/src/control_file_upgrade.rs (100%) rename {walkeeper => safekeeper}/src/handler.rs (98%) rename {walkeeper => safekeeper}/src/http/mod.rs (100%) rename {walkeeper => safekeeper}/src/http/models.rs (100%) rename {walkeeper => safekeeper}/src/http/routes.rs (100%) rename {walkeeper => safekeeper}/src/json_ctrl.rs (100%) rename {walkeeper => safekeeper}/src/lib.rs (100%) rename {walkeeper => safekeeper}/src/receive_wal.rs (100%) rename {walkeeper => safekeeper}/src/s3_offload.rs (100%) rename {walkeeper => safekeeper}/src/safekeeper.rs (100%) rename {walkeeper => safekeeper}/src/send_wal.rs (100%) rename {walkeeper => safekeeper}/src/timeline.rs (100%) rename {walkeeper => safekeeper}/src/wal_service.rs (100%) rename {walkeeper => safekeeper}/src/wal_storage.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index e93e73f087..a933b44356 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -381,13 +381,13 @@ dependencies = [ "postgres", "regex", "reqwest", + "safekeeper", "serde", "serde_with", "tar", "thiserror", "toml", "url", - "walkeeper", "workspace_hack", "zenith_utils", ] @@ -2290,6 +2290,45 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" +[[package]] +name = "safekeeper" +version = "0.1.0" +dependencies = [ + "anyhow", + "byteorder", + "bytes", + "clap 3.0.14", + "const_format", + "crc32c", + "daemonize", + "etcd-client", + "fs2", + "hex", + "humantime", + "hyper", + "lazy_static", + "postgres", + "postgres-protocol", + "postgres_ffi", + "regex", + "rusoto_core", + "rusoto_s3", + "serde", + "serde_json", + "serde_with", + "signal-hook", + "tempfile", + "tokio", + "tokio-postgres", + "tokio-util 0.7.0", + "tracing", + "url", + "walkdir", + "workspace_hack", + "zenith_metrics", + "zenith_utils", +] + [[package]] name = "same-file" version = "1.0.6" @@ -3089,45 +3128,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "walkeeper" -version = "0.1.0" -dependencies = [ - "anyhow", - "byteorder", - "bytes", - "clap 3.0.14", - "const_format", - "crc32c", - "daemonize", - "etcd-client", - "fs2", - "hex", - "humantime", - "hyper", - "lazy_static", - "postgres", - "postgres-protocol", - "postgres_ffi", - "regex", - "rusoto_core", - "rusoto_s3", - "serde", - "serde_json", - "serde_with", - "signal-hook", - "tempfile", - "tokio", - "tokio-postgres", - "tokio-util 0.7.0", - "tracing", - "url", - "walkdir", - "workspace_hack", - "zenith_metrics", - "zenith_utils", -] - [[package]] name = "want" version = "0.3.0" @@ -3369,8 +3369,8 @@ dependencies = [ "pageserver", "postgres", "postgres_ffi", + "safekeeper", "serde_json", - "walkeeper", "workspace_hack", "zenith_utils", ] diff --git a/Cargo.toml b/Cargo.toml index b8283a6112..4b3b31e0b7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ members = [ "pageserver", "postgres_ffi", "proxy", - "walkeeper", + "safekeeper", "workspace_hack", "zenith", "zenith_metrics", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 
e118ea4793..80b6c00dd2 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -18,6 +18,6 @@ url = "2.2.2" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } pageserver = { path = "../pageserver" } -walkeeper = { path = "../walkeeper" } +safekeeper = { path = "../safekeeper" } zenith_utils = { path = "../zenith_utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 89ab0a31ee..e23138bd3f 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -14,7 +14,7 @@ use postgres::Config; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; -use walkeeper::http::models::TimelineCreateRequest; +use safekeeper::http::models::TimelineCreateRequest; use zenith_utils::http::error::HttpErrorBody; use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; diff --git a/docs/README.md b/docs/README.md index 0558fa24a8..a3fcd20bd2 100644 --- a/docs/README.md +++ b/docs/README.md @@ -10,5 +10,5 @@ - [pageserver/README](/pageserver/README) — pageserver overview. - [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview. - [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview. -- [walkeeper/README](/walkeeper/README) — WAL service overview. +- [safekeeper/README](/safekeeper/README) — WAL service overview. - [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core diff --git a/docs/glossary.md b/docs/glossary.md index 0f82f2d666..ecc57b9ed1 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -29,7 +29,7 @@ Each Branch lives in a corresponding timeline[] and has an ancestor[]. NOTE: This is an overloaded term. -A checkpoint record in the WAL marks a point in the WAL sequence at which it is guaranteed that all data files have been updated with all information from shared memory modified before that checkpoint; +A checkpoint record in the WAL marks a point in the WAL sequence at which it is guaranteed that all data files have been updated with all information from shared memory modified before that checkpoint; ### Checkpoint (Layered repository) @@ -108,10 +108,10 @@ PostgreSQL LSNs and functions to monitor them: * `pg_current_wal_lsn()` - Returns the current write-ahead log write location. * `pg_current_wal_flush_lsn()` - Returns the current write-ahead log flush location. * `pg_last_wal_receive_lsn()` - Returns the last write-ahead log location that has been received and synced to disk by streaming replication. While streaming replication is in progress this will increase monotonically. -* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically. +* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically. [source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html): -Zenith safekeeper LSNs. For more check [walkeeper/README_PROTO.md](/walkeeper/README_PROTO.md) +Zenith safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md) * `CommitLSN`: position in WAL confirmed by quorum safekeepers. * `RestartLSN`: position in WAL confirmed by all safekeepers. 
* `FlushLSN`: part of WAL persisted to the disk by safekeeper. @@ -190,7 +190,7 @@ or we do not support them in zenith yet (pg_commit_ts). Tenant represents a single customer, interacting with Zenith. Wal redo[] activity, timelines[], layers[] are managed for each tenant independently. One pageserver[] can serve multiple tenants at once. -One safekeeper +One safekeeper See `docs/multitenancy.md` for more. diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md index 3f5386c165..11ded3a724 100644 --- a/docs/rfcs/009-snapshot-first-storage-cli.md +++ b/docs/rfcs/009-snapshot-first-storage-cli.md @@ -12,7 +12,7 @@ Init empty pageserver using `initdb` in temporary directory. `--storage_dest=FILE_PREFIX | S3_PREFIX |...` option defines object storage type, all other parameters are passed via env variables. Inspired by WAL-G style naming : https://wal-g.readthedocs.io/STORAGES/. -Save`storage_dest` and other parameters in config. +Save`storage_dest` and other parameters in config. Push snapshots to `storage_dest` in background. ``` @@ -21,7 +21,7 @@ zenith start ``` #### 2. Restart pageserver (manually or crash-recovery). -Take `storage_dest` from pageserver config, start pageserver from latest snapshot in `storage_dest`. +Take `storage_dest` from pageserver config, start pageserver from latest snapshot in `storage_dest`. Push snapshots to `storage_dest` in background. ``` @@ -32,7 +32,7 @@ zenith start Start pageserver from existing snapshot. Path to snapshot provided via `--snapshot_path=FILE_PREFIX | S3_PREFIX | ...` Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time operation. -Save`storage_dest` parameters in config. +Save`storage_dest` parameters in config. Push snapshots to `storage_dest` in background. ``` //I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage. @@ -42,15 +42,15 @@ zenith start How to pass credentials needed for `snapshot_path`? #### 4. Export. -Manually push snapshot to `snapshot_path` which differs from `storage_dest` +Manually push snapshot to `snapshot_path` which differs from `storage_dest` Optionally set `snapshot_format`, which can be plain pgdata format or zenith format. ``` zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata ``` #### Notes and questions -- walkeeper s3_offload should use same (similar) syntax for storage. How to set it in UI? +- safekeeper s3_offload should use same (similar) syntax for storage. How to set it in UI? - Why do we need `zenith init` as a separate command? Can't we init everything at first start? - We can think of better names for all options. - Export to plain postgres format will be useless, if we are not 100% compatible on page level. -I can recall at least one such difference - PD_WAL_LOGGED flag in pages. \ No newline at end of file +I can recall at least one such difference - PD_WAL_LOGGED flag in pages. diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 89b07de8d2..b15294d67f 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -57,12 +57,12 @@ PostgreSQL extension that implements storage manager API and network communicati PostgreSQL extension that contains functions needed for testing and debugging. -`/walkeeper`: +`/safekeeper`: The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver. It acts as a holding area and redistribution center for recently generated WAL. 
-For more detailed info, see `/walkeeper/README` +For more detailed info, see `/safekeeper/README` `/workspace_hack`: The workspace_hack crate exists only to pin down some dependencies. diff --git a/postgres_ffi/src/waldecoder.rs b/postgres_ffi/src/waldecoder.rs index ac48b1b0f3..ce5aaf722d 100644 --- a/postgres_ffi/src/waldecoder.rs +++ b/postgres_ffi/src/waldecoder.rs @@ -4,7 +4,7 @@ //! This understands the WAL page and record format, enough to figure out where the WAL record //! boundaries are, and to reassemble WAL records that cross page boundaries. //! -//! This functionality is needed by both the pageserver and the walkeepers. The pageserver needs +//! This functionality is needed by both the pageserver and the safekeepers. The pageserver needs //! to look deeper into the WAL records to also understand which blocks they modify, the code //! for that is in pageserver/src/walrecord.rs //! diff --git a/walkeeper/Cargo.toml b/safekeeper/Cargo.toml similarity index 98% rename from walkeeper/Cargo.toml rename to safekeeper/Cargo.toml index 86aa56c9ae..ca5e2a6b55 100644 --- a/walkeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "walkeeper" +name = "safekeeper" version = "0.1.0" edition = "2021" diff --git a/walkeeper/README b/safekeeper/README similarity index 100% rename from walkeeper/README rename to safekeeper/README diff --git a/walkeeper/README_PROTO.md b/safekeeper/README_PROTO.md similarity index 100% rename from walkeeper/README_PROTO.md rename to safekeeper/README_PROTO.md diff --git a/walkeeper/spec/ProposerAcceptorConsensus.cfg b/safekeeper/spec/ProposerAcceptorConsensus.cfg similarity index 100% rename from walkeeper/spec/ProposerAcceptorConsensus.cfg rename to safekeeper/spec/ProposerAcceptorConsensus.cfg diff --git a/walkeeper/spec/ProposerAcceptorConsensus.tla b/safekeeper/spec/ProposerAcceptorConsensus.tla similarity index 100% rename from walkeeper/spec/ProposerAcceptorConsensus.tla rename to safekeeper/spec/ProposerAcceptorConsensus.tla diff --git a/walkeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs similarity index 97% rename from walkeeper/src/bin/safekeeper.rs rename to safekeeper/src/bin/safekeeper.rs index b3087a1004..490198231d 100644 --- a/walkeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -12,18 +12,18 @@ use std::path::{Path, PathBuf}; use std::thread; use tracing::*; use url::{ParseError, Url}; -use walkeeper::control_file::{self}; use zenith_utils::http::endpoint; use zenith_utils::zid::ZNodeId; use zenith_utils::{logging, tcp_listener, GIT_VERSION}; +use safekeeper::control_file::{self}; +use safekeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR}; +use safekeeper::http; +use safekeeper::s3_offload; +use safekeeper::wal_service; +use safekeeper::SafeKeeperConf; +use safekeeper::{broker, callmemaybe}; use tokio::sync::mpsc; -use walkeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR}; -use walkeeper::http; -use walkeeper::s3_offload; -use walkeeper::wal_service; -use walkeeper::SafeKeeperConf; -use walkeeper::{broker, callmemaybe}; use zenith_utils::shutdown::exit_now; use zenith_utils::signals; diff --git a/walkeeper/src/broker.rs b/safekeeper/src/broker.rs similarity index 100% rename from walkeeper/src/broker.rs rename to safekeeper/src/broker.rs diff --git a/walkeeper/src/callmemaybe.rs b/safekeeper/src/callmemaybe.rs similarity index 100% rename from walkeeper/src/callmemaybe.rs rename to safekeeper/src/callmemaybe.rs diff --git 
a/walkeeper/src/control_file.rs b/safekeeper/src/control_file.rs similarity index 100% rename from walkeeper/src/control_file.rs rename to safekeeper/src/control_file.rs diff --git a/walkeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs similarity index 100% rename from walkeeper/src/control_file_upgrade.rs rename to safekeeper/src/control_file_upgrade.rs diff --git a/walkeeper/src/handler.rs b/safekeeper/src/handler.rs similarity index 98% rename from walkeeper/src/handler.rs rename to safekeeper/src/handler.rs index 00d177da56..bb14049787 100644 --- a/walkeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -94,7 +94,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { Ok(()) } else { - bail!("Walkeeper received unexpected initial message: {:?}", sm); + bail!("Safekeeper received unexpected initial message: {:?}", sm); } } diff --git a/walkeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs similarity index 100% rename from walkeeper/src/http/mod.rs rename to safekeeper/src/http/mod.rs diff --git a/walkeeper/src/http/models.rs b/safekeeper/src/http/models.rs similarity index 100% rename from walkeeper/src/http/models.rs rename to safekeeper/src/http/models.rs diff --git a/walkeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs similarity index 100% rename from walkeeper/src/http/routes.rs rename to safekeeper/src/http/routes.rs diff --git a/walkeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs similarity index 100% rename from walkeeper/src/json_ctrl.rs rename to safekeeper/src/json_ctrl.rs diff --git a/walkeeper/src/lib.rs b/safekeeper/src/lib.rs similarity index 100% rename from walkeeper/src/lib.rs rename to safekeeper/src/lib.rs diff --git a/walkeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs similarity index 100% rename from walkeeper/src/receive_wal.rs rename to safekeeper/src/receive_wal.rs diff --git a/walkeeper/src/s3_offload.rs b/safekeeper/src/s3_offload.rs similarity index 100% rename from walkeeper/src/s3_offload.rs rename to safekeeper/src/s3_offload.rs diff --git a/walkeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs similarity index 100% rename from walkeeper/src/safekeeper.rs rename to safekeeper/src/safekeeper.rs diff --git a/walkeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs similarity index 100% rename from walkeeper/src/send_wal.rs rename to safekeeper/src/send_wal.rs diff --git a/walkeeper/src/timeline.rs b/safekeeper/src/timeline.rs similarity index 100% rename from walkeeper/src/timeline.rs rename to safekeeper/src/timeline.rs diff --git a/walkeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs similarity index 100% rename from walkeeper/src/wal_service.rs rename to safekeeper/src/wal_service.rs diff --git a/walkeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs similarity index 100% rename from walkeeper/src/wal_storage.rs rename to safekeeper/src/wal_storage.rs diff --git a/zenith/Cargo.toml b/zenith/Cargo.toml index 74aeffb51c..69283d3763 100644 --- a/zenith/Cargo.toml +++ b/zenith/Cargo.toml @@ -12,7 +12,7 @@ postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98 # FIXME: 'pageserver' is needed for BranchInfo. 
Refactor pageserver = { path = "../pageserver" } control_plane = { path = "../control_plane" } -walkeeper = { path = "../walkeeper" } +safekeeper = { path = "../safekeeper" } postgres_ffi = { path = "../postgres_ffi" } zenith_utils = { path = "../zenith_utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/zenith/src/main.rs b/zenith/src/main.rs index f5d4184e63..97b07b7b74 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -12,7 +12,7 @@ use pageserver::config::defaults::{ use std::collections::{BTreeSet, HashMap}; use std::process::exit; use std::str::FromStr; -use walkeeper::defaults::{ +use safekeeper::defaults::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; From 52e0816fa5a19bb741c7b053a4f6ae88bb4ff9c8 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 18 Apr 2022 11:49:46 +0300 Subject: [PATCH 0157/1022] wal_acceptor -> safekeeper --- control_plane/src/compute.rs | 4 +-- control_plane/src/safekeeper.rs | 2 +- safekeeper/src/bin/safekeeper.rs | 8 ++--- test_runner/batch_others/test_auth.py | 8 ++--- .../batch_others/test_restart_compute.py | 6 ++-- test_runner/batch_others/test_tenants.py | 18 +++++------ test_runner/batch_others/test_wal_acceptor.py | 32 +++++++++---------- .../batch_others/test_wal_acceptor_async.py | 6 ++-- test_runner/fixtures/log_helper.py | 2 +- test_runner/fixtures/zenith_fixtures.py | 8 ++--- .../performance/test_bulk_tenant_create.py | 12 +++---- zenith/src/main.rs | 6 ++-- 12 files changed, 56 insertions(+), 56 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 64cd46fef6..1c979acbdf 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -331,14 +331,14 @@ impl PostgresNode { // Configure the node to connect to the safekeepers conf.append("synchronous_standby_names", "walproposer"); - let wal_acceptors = self + let safekeepers = self .env .safekeepers .iter() .map(|sk| format!("localhost:{}", sk.pg_port)) .collect::>() .join(","); - conf.append("wal_acceptors", &wal_acceptors); + conf.append("wal_acceptors", &safekeepers); } else { // We only use setup without safekeepers for tests, // and don't care about data durability on pageserver, diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index e23138bd3f..6f11a4e03d 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -13,8 +13,8 @@ use nix::unistd::Pid; use postgres::Config; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; -use thiserror::Error; use safekeeper::http::models::TimelineCreateRequest; +use thiserror::Error; use zenith_utils::http::error::HttpErrorBody; use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 490198231d..e191cb52fd 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -257,18 +257,18 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b let (tx, rx) = mpsc::unbounded_channel(); let conf_cloned = conf.clone(); - let wal_acceptor_thread = thread::Builder::new() - .name("WAL acceptor thread".into()) + let safekeeper_thread = thread::Builder::new() + .name("Safekeeper thread".into()) .spawn(|| { // thread code let thread_result = wal_service::thread_main(conf_cloned, pg_listener, tx); if let Err(e) = thread_result { - info!("wal_service thread terminated: {}", 
e); + info!("safekeeper thread terminated: {}", e); } }) .unwrap(); - threads.push(wal_acceptor_thread); + threads.push(safekeeper_thread); let conf_cloned = conf.clone(); let callmemaybe_thread = thread::Builder::new() diff --git a/test_runner/batch_others/test_auth.py b/test_runner/batch_others/test_auth.py index bda6349ef9..a8ad384f27 100644 --- a/test_runner/batch_others/test_auth.py +++ b/test_runner/batch_others/test_auth.py @@ -52,14 +52,14 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): tenant_http_client.tenant_create() -@pytest.mark.parametrize('with_wal_acceptors', [False, True]) -def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool): +@pytest.mark.parametrize('with_safekeepers', [False, True]) +def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_safekeepers: bool): zenith_env_builder.pageserver_auth_enabled = True - if with_wal_acceptors: + if with_safekeepers: zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() - branch = f'test_compute_auth_to_pageserver{with_wal_acceptors}' + branch = f'test_compute_auth_to_pageserver{with_safekeepers}' env.zenith_cli.create_branch(branch) pg = env.postgres.create_start(branch) diff --git a/test_runner/batch_others/test_restart_compute.py b/test_runner/batch_others/test_restart_compute.py index fd06561c00..d6e7fd9e0d 100644 --- a/test_runner/batch_others/test_restart_compute.py +++ b/test_runner/batch_others/test_restart_compute.py @@ -8,10 +8,10 @@ from fixtures.log_helper import log # # Test restarting and recreating a postgres instance # -@pytest.mark.parametrize('with_wal_acceptors', [False, True]) -def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool): +@pytest.mark.parametrize('with_safekeepers', [False, True]) +def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_safekeepers: bool): zenith_env_builder.pageserver_auth_enabled = True - if with_wal_acceptors: + if with_safekeepers: zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py index e883018628..682af8de49 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/batch_others/test_tenants.py @@ -5,9 +5,9 @@ import pytest from fixtures.zenith_fixtures import ZenithEnvBuilder -@pytest.mark.parametrize('with_wal_acceptors', [False, True]) -def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool): - if with_wal_acceptors: +@pytest.mark.parametrize('with_safekeepers', [False, True]) +def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_safekeepers: bool): + if with_safekeepers: zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() @@ -15,17 +15,17 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acce tenant_1 = env.zenith_cli.create_tenant() tenant_2 = env.zenith_cli.create_tenant() - env.zenith_cli.create_timeline( - f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', tenant_id=tenant_1) - env.zenith_cli.create_timeline( - f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', tenant_id=tenant_2) + env.zenith_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', + tenant_id=tenant_1) + env.zenith_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', + tenant_id=tenant_2) pg_tenant1 = 
env.postgres.create_start( - f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', + f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', tenant_id=tenant_1, ) pg_tenant2 = env.postgres.create_start( - f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', + f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', tenant_id=tenant_2, ) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index dffcd7cc61..cc9ec9a275 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -25,8 +25,8 @@ def test_normal_work(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.broker = True env = zenith_env_builder.init_start() - env.zenith_cli.create_branch('test_wal_acceptors_normal_work') - pg = env.postgres.create_start('test_wal_acceptors_normal_work') + env.zenith_cli.create_branch('test_safekeepers_normal_work') + pg = env.postgres.create_start('test_safekeepers_normal_work') with closing(pg.connect()) as conn: with conn.cursor() as cur: @@ -56,7 +56,7 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): n_timelines = 3 branch_names = [ - "test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines) + "test_safekeepers_many_timelines_{}".format(tlin) for tlin in range(n_timelines) ] # pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418') # that's not really human readable, so the branch names are introduced in Zenith CLI. @@ -196,8 +196,8 @@ def test_restarts(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = n_acceptors env = zenith_env_builder.init_start() - env.zenith_cli.create_branch('test_wal_acceptors_restarts') - pg = env.postgres.create_start('test_wal_acceptors_restarts') + env.zenith_cli.create_branch('test_safekeepers_restarts') + pg = env.postgres.create_start('test_safekeepers_restarts') # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -223,7 +223,7 @@ def test_restarts(zenith_env_builder: ZenithEnvBuilder): start_delay_sec = 2 -def delayed_wal_acceptor_start(wa): +def delayed_safekeeper_start(wa): time.sleep(start_delay_sec) wa.start() @@ -233,8 +233,8 @@ def test_unavailability(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 2 env = zenith_env_builder.init_start() - env.zenith_cli.create_branch('test_wal_acceptors_unavailability') - pg = env.postgres.create_start('test_wal_acceptors_unavailability') + env.zenith_cli.create_branch('test_safekeepers_unavailability') + pg = env.postgres.create_start('test_safekeepers_unavailability') # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -248,7 +248,7 @@ def test_unavailability(zenith_env_builder: ZenithEnvBuilder): # shutdown one of two acceptors, that is, majority env.safekeepers[0].stop() - proc = Process(target=delayed_wal_acceptor_start, args=(env.safekeepers[0], )) + proc = Process(target=delayed_safekeeper_start, args=(env.safekeepers[0], )) proc.start() start = time.time() @@ -260,7 +260,7 @@ def test_unavailability(zenith_env_builder: ZenithEnvBuilder): # for the world's balance, do the same with second acceptor env.safekeepers[1].stop() - proc = Process(target=delayed_wal_acceptor_start, args=(env.safekeepers[1], )) + proc = Process(target=delayed_safekeeper_start, args=(env.safekeepers[1], )) proc.start() start = time.time() @@ -304,8 +304,8 
@@ def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value): zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() - env.zenith_cli.create_branch('test_wal_acceptors_race_conditions') - pg = env.postgres.create_start('test_wal_acceptors_race_conditions') + env.zenith_cli.create_branch('test_safekeepers_race_conditions') + pg = env.postgres.create_start('test_safekeepers_race_conditions') # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -396,7 +396,7 @@ class ProposerPostgres(PgProtocol): """ Path to postgresql.conf """ return os.path.join(self.pgdata_dir, 'postgresql.conf') - def create_dir_config(self, wal_acceptors: str): + def create_dir_config(self, safekeepers: str): """ Create dir and config for running --sync-safekeepers """ mkdir_if_needed(self.pg_data_dir_path()) @@ -407,7 +407,7 @@ class ProposerPostgres(PgProtocol): f"zenith.zenith_timeline = '{self.timeline_id.hex}'\n", f"zenith.zenith_tenant = '{self.tenant_id.hex}'\n", f"zenith.page_server_connstring = ''\n", - f"wal_acceptors = '{wal_acceptors}'\n", + f"wal_acceptors = '{safekeepers}'\n", f"listen_addresses = '{self.listen_addr}'\n", f"port = '{self.port}'\n", ] @@ -692,7 +692,7 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): env.safekeepers[3].stop() active_safekeepers = [1, 2, 3] pg = env.postgres.create('test_replace_safekeeper') - pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers)) + pg.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) pg.start() # learn zenith timeline from compute @@ -732,7 +732,7 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): pg.stop_and_destroy().create('test_replace_safekeeper') active_safekeepers = [2, 3, 4] env.safekeepers[3].start() - pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers)) + pg.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) pg.start() execute_payload(pg) diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index aadafc76cf..e3df8ea3eb 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -9,7 +9,7 @@ from fixtures.log_helper import getLogger from fixtures.utils import lsn_from_hex, lsn_to_hex from typing import List -log = getLogger('root.wal_acceptor_async') +log = getLogger('root.safekeeper_async') class BankClient(object): @@ -207,9 +207,9 @@ def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() - env.zenith_cli.create_branch('test_wal_acceptors_restarts_under_load') + env.zenith_cli.create_branch('test_safekeepers_restarts_under_load') # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long - pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load', + pg = env.postgres.create_start('test_safekeepers_restarts_under_load', config_lines=['max_replication_write_lag=1MB']) asyncio.run(run_restarts_under_load(env, pg, env.safekeepers)) diff --git a/test_runner/fixtures/log_helper.py b/test_runner/fixtures/log_helper.py index 9aa5f40bf3..7c2d83d4e3 100644 --- a/test_runner/fixtures/log_helper.py +++ b/test_runner/fixtures/log_helper.py @@ -25,7 +25,7 @@ LOGGING = { "root": { "level": "INFO" }, - "root.wal_acceptor_async": { + "root.safekeeper_async": { "level": "INFO" # a lot of logs on DEBUG level } } 
diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index f8ee39a5a1..e0f08a3bfb 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -612,7 +612,7 @@ class ZenithEnv: self.broker.start() def get_safekeeper_connstrs(self) -> str: - """ Get list of safekeeper endpoints suitable for wal_acceptors GUC """ + """ Get list of safekeeper endpoints suitable for safekeepers GUC """ return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers]) @cached_property @@ -1484,7 +1484,7 @@ class Postgres(PgProtocol): """ Path to postgresql.conf """ return os.path.join(self.pg_data_dir_path(), 'postgresql.conf') - def adjust_for_wal_acceptors(self, wal_acceptors: str) -> 'Postgres': + def adjust_for_safekeepers(self, safekeepers: str) -> 'Postgres': """ Adjust instance config for working with wal acceptors instead of pageserver (pre-configured by CLI) directly. @@ -1499,12 +1499,12 @@ class Postgres(PgProtocol): if ("synchronous_standby_names" in cfg_line or # don't ask pageserver to fetch WAL from compute "callmemaybe_connstring" in cfg_line or - # don't repeat wal_acceptors multiple times + # don't repeat safekeepers/wal_acceptors multiple times "wal_acceptors" in cfg_line): continue f.write(cfg_line) f.write("synchronous_standby_names = 'walproposer'\n") - f.write("wal_acceptors = '{}'\n".format(wal_acceptors)) + f.write("wal_acceptors = '{}'\n".format(safekeepers)) return self def config(self, lines: List[str]) -> 'Postgres': diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index fbef131ffd..f0729d3a07 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -13,15 +13,15 @@ from fixtures.zenith_fixtures import ZenithEnvBuilder @pytest.mark.parametrize('tenants_count', [1, 5, 10]) -@pytest.mark.parametrize('use_wal_acceptors', ['with_wa', 'without_wa']) +@pytest.mark.parametrize('use_safekeepers', ['with_wa', 'without_wa']) def test_bulk_tenant_create( zenith_env_builder: ZenithEnvBuilder, - use_wal_acceptors: str, + use_safekeepers: str, tenants_count: int, zenbenchmark, ): """Measure tenant creation time (with and without wal acceptors)""" - if use_wal_acceptors == 'with_wa': + if use_safekeepers == 'with_wa': zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() @@ -32,14 +32,14 @@ def test_bulk_tenant_create( tenant = env.zenith_cli.create_tenant() env.zenith_cli.create_timeline( - f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant_id=tenant) + f'test_bulk_tenant_create_{tenants_count}_{i}_{use_safekeepers}', tenant_id=tenant) # FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now? 
- #if use_wal_acceptors == 'with_wa': + #if use_safekeepers == 'with_sa': # wa_factory.start_n_new(3) pg_tenant = env.postgres.create_start( - f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant_id=tenant) + f'test_bulk_tenant_create_{tenants_count}_{i}_{use_safekeepers}', tenant_id=tenant) end = timeit.default_timer() time_slices.append(end - start) diff --git a/zenith/src/main.rs b/zenith/src/main.rs index 97b07b7b74..18368895a4 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -9,13 +9,13 @@ use pageserver::config::defaults::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, }; -use std::collections::{BTreeSet, HashMap}; -use std::process::exit; -use std::str::FromStr; use safekeeper::defaults::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; +use std::collections::{BTreeSet, HashMap}; +use std::process::exit; +use std::str::FromStr; use zenith_utils::auth::{Claims, Scope}; use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; From c15aa04714e82af1542b8ade1b6d8c1453474dee Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 14 Apr 2022 12:56:46 +0300 Subject: [PATCH 0158/1022] Move Cluster size limit RFC from rfcs repo --- docs/rfcs/cluster-size-limits.md | 79 ++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 docs/rfcs/cluster-size-limits.md diff --git a/docs/rfcs/cluster-size-limits.md b/docs/rfcs/cluster-size-limits.md new file mode 100644 index 0000000000..4696f2c7f0 --- /dev/null +++ b/docs/rfcs/cluster-size-limits.md @@ -0,0 +1,79 @@ +Cluster size limits +================== + +## Summary + +One of the resource consumption limits for free-tier users is a cluster size limit. + +To enforce it, we need to calculate the timeline size and check if the limit is reached before relation create/extend operations. +If the limit is reached, the query must fail with some meaningful error/warning. +We may want to exempt some operations from the quota to allow users free space to fit back into the limit. + +The stateless compute node that performs validation is separate from the storage that calculates the usage, so we need to exchange cluster size information between those components. + +## Motivation + +Limit the maximum size of a PostgreSQL instance to limit free tier users (and other tiers in the future). +First of all, this is needed to control our free tier production costs. +Another reason to limit resources is risk management — we haven't (fully) tested and optimized zenith for big clusters, +so we don't want to give users access to the functionality that we don't think is ready. + +## Components + +* pageserver - calculate the size consumed by a timeline and add it to the feedback message. +* safekeeper - pass feedback message from pageserver to compute. +* compute - receive feedback message, enforce size limit based on GUC `zenith.max_cluster_size`. +* console - set and update `zenith.max_cluster_size` setting + +## Proposed implementation + +First of all, it's necessary to define timeline size. + +The current approach is to count all data, including SLRUs. (not including WAL) +Here we think of it as a physical disk underneath the Postgres cluster. +This is how the `LOGICAL_TIMELINE_SIZE` metric is implemented in the pageserver. + +Alternatively, we could count only relation data. As in pg_database_size(). 
+This approach is somewhat more user-friendly because it is the data that is really affected by the user.
+On the other hand, it puts us in a weaker position than other services, e.g., RDS.
+We will need to refactor the timeline_size counter or add another counter to implement it.
+
+Timeline size is updated during WAL digestion. It is not versioned and is valid at the last_received_lsn moment.
+Then this size should be reported to the compute node.
+
+The `current_timeline_size` value is included in the walreceiver's custom feedback message: `ZenithFeedback`.
+
+(PR about protocol changes https://github.com/zenithdb/zenith/pull/1037).
+
+This message is received by the safekeeper and propagated to the compute node as a part of `AppendResponse`.
+
+Finally, when the compute node receives the `current_timeline_size` from a safekeeper (or from the pageserver directly), it updates the global variable.
+
+Then every zenith_extend() operation checks whether the limit is reached `(current_timeline_size > zenith.max_cluster_size)` and throws an `ERRCODE_DISK_FULL` error if so.
+(see Postgres error codes [https://www.postgresql.org/docs/devel/errcodes-appendix.html](https://www.postgresql.org/docs/devel/errcodes-appendix.html))
+
+TODO:
+We can allow autovacuum processes to bypass this check, simply by checking `IsAutoVacuumWorkerProcess()`.
+It would be nice to allow manual VACUUM and VACUUM FULL to bypass the check as well, but it is not easy to distinguish these operations at the low level.
+See issues https://github.com/neondatabase/neon/issues/1245
+https://github.com/zenithdb/zenith/issues/1445
+
+TODO:
+We should warn users when the limit is about to be reached.
+
+### **Reliability, failure modes and corner cases**
+
+1. `current_timeline_size` is valid at the last LSN received and digested by the pageserver.
+
+    If the pageserver lags behind the compute node, `current_timeline_size` will lag too. This lag can be tuned using backpressure, but it is not expected to be 0 all the time.
+
+    So transactions that happen in this LSN range may cause the limit to overflow, especially operations that generate (e.g., CREATE DATABASE) or free (e.g., TRUNCATE) a lot of data pages while generating a small amount of WAL. Are there other operations like this?
+
+    Currently, CREATE DATABASE operations are restricted in the console. So this is not an issue.
+
+
+### **Security implications**
+
+We treat compute as an untrusted component. That's why we try to isolate it with a secure container runtime or a VM.
+Malicious users may change `zenith.max_cluster_size`, so we need an extra size limit check.
+To cover this case, we also monitor the compute node size in the console.

From 389bd1faeb91904e1bcd23dce10217abbd45ae53 Mon Sep 17 00:00:00 2001
From: Stas Kelvich
Date: Sun, 17 Apr 2022 23:12:04 +0300
Subject: [PATCH 0159/1022] Support for SCRAM-SHA-256 in compute tools

---
 compute_tools/src/pg_helpers.rs | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index 6a22b865fa..1409a81b6b 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -132,7 +132,14 @@ impl Role {
         let mut params: String = "LOGIN".to_string();

         if let Some(pass) = &self.encrypted_password {
-            params.push_str(&format!(" PASSWORD 'md5{}'", pass));
+            // Some time ago we supported only md5 and treated all encrypted_password as md5.
+            // Now we also support SCRAM-SHA-256 and to preserve compatibility
+            // we treat all encrypted_password as md5 unless it starts with SCRAM-SHA-256.
+ if pass.starts_with("SCRAM-SHA-256") { + params.push_str(&format!(" PASSWORD '{}'", pass)); + } else { + params.push_str(&format!(" PASSWORD 'md5{}'", pass)); + } } else { params.push_str(" PASSWORD NULL"); } From a1e34772e56111403501f867e34693c863b95258 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 15 Apr 2022 18:13:26 +0300 Subject: [PATCH 0160/1022] Improve compute error logging --- control_plane/src/compute.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 1c979acbdf..c078c274cf 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -420,10 +420,15 @@ impl PostgresNode { if let Some(token) = auth_token { cmd.env("ZENITH_AUTH_TOKEN", token); } - let pg_ctl = cmd.status().context("pg_ctl failed")?; - if !pg_ctl.success() { - anyhow::bail!("pg_ctl failed"); + let pg_ctl = cmd.output().context("pg_ctl failed")?; + if !pg_ctl.status.success() { + anyhow::bail!( + "pg_ctl failed, exit code: {}, stdout: {}, stderr: {}", + pg_ctl.status, + String::from_utf8_lossy(&pg_ctl.stdout), + String::from_utf8_lossy(&pg_ctl.stderr), + ); } Ok(()) } From ef72eb84cf7eebb78d76993c8d1d32ecffd0c12d Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 19 Apr 2022 09:46:47 -0400 Subject: [PATCH 0161/1022] Remove zenfixture (#1534) --- test_runner/fixtures/zenith_fixtures.py | 35 ++++++++++--------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index e0f08a3bfb..8dfe219966 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -40,8 +40,8 @@ from fixtures.log_helper import log This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. -A fixture is created with the decorator @zenfixture, which is a wrapper around -the standard pytest.fixture with some extra behavior. +A fixture is created with the decorator @pytest.fixture decorator. +See docs: https://docs.pytest.org/en/6.2.x/fixture.html There are several environment variables that can control the running of tests: ZENITH_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information. @@ -155,25 +155,18 @@ def pytest_configure(config): raise Exception('zenith binaries not found at "{}"'.format(zenith_binpath)) -def zenfixture(func: Fn) -> Fn: +def shareable_scope(fixture_name, config) -> Literal["session", "function"]: + """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar. + + This function can be used as a scope like this: + @pytest.fixture(scope=shareable_scope) + def myfixture(...) + ... """ - This is a python decorator for fixtures with a flexible scope. - - By default every test function will set up and tear down a new - database. In pytest, this is called fixtures "function" scope. - - If the environment variable TEST_SHARED_FIXTURES is set, then all - tests will share the same database. State, logs, etc. will be - stored in a directory called "shared". 
- """ - - scope: Literal['session', 'function'] = \ - 'function' if os.environ.get('TEST_SHARED_FIXTURES') is None else 'session' - - return pytest.fixture(func, scope=scope) + return 'function' if os.environ.get('TEST_SHARED_FIXTURES') is None else 'session' -@zenfixture +@pytest.fixture(scope=shareable_scope) def worker_seq_no(worker_id: str): # worker_id is a pytest-xdist fixture # it can be master or gw @@ -184,7 +177,7 @@ def worker_seq_no(worker_id: str): return int(worker_id[2:]) -@zenfixture +@pytest.fixture(scope=shareable_scope) def worker_base_port(worker_seq_no: int): # so we divide ports in ranges of 100 ports # so workers have disjoint set of ports for services @@ -237,7 +230,7 @@ class PortDistributor: 'port range configured for test is exhausted, consider enlarging the range') -@zenfixture +@pytest.fixture(scope=shareable_scope) def port_distributor(worker_base_port): return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) @@ -622,7 +615,7 @@ class ZenithEnv: return AuthKeys(pub=pub, priv=priv) -@zenfixture +@pytest.fixture(scope=shareable_scope) def _shared_simple_env(request: Any, port_distributor) -> Iterator[ZenithEnv]: """ Internal fixture backing the `zenith_simple_env` fixture. If TEST_SHARED_FIXTURES From 44bfc529f668fcb4fe79c521e6970382803d1178 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 19 Apr 2022 22:06:02 +0300 Subject: [PATCH 0162/1022] Require specifying the upload size in remote storage --- pageserver/src/remote_storage.rs | 3 ++ pageserver/src/remote_storage/local_fs.rs | 32 ++++++++++------------ pageserver/src/remote_storage/s3_bucket.rs | 6 +++- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/pageserver/src/remote_storage.rs b/pageserver/src/remote_storage.rs index aebd74af5a..8167830347 100644 --- a/pageserver/src/remote_storage.rs +++ b/pageserver/src/remote_storage.rs @@ -324,6 +324,9 @@ trait RemoteStorage: Send + Sync { async fn upload( &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + /// S3 PUT request requires the content length to be specified, + /// otherwise it starts to fail with the concurrent connection count increasing. 
+ from_size_kb: usize, to: &Self::StoragePath, metadata: Option, ) -> anyhow::Result<()>; diff --git a/pageserver/src/remote_storage/local_fs.rs b/pageserver/src/remote_storage/local_fs.rs index b40089d53c..15c69beebb 100644 --- a/pageserver/src/remote_storage/local_fs.rs +++ b/pageserver/src/remote_storage/local_fs.rs @@ -104,7 +104,8 @@ impl RemoteStorage for LocalFs { async fn upload( &self, - mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + from_size_kb: usize, to: &Self::StoragePath, metadata: Option, ) -> anyhow::Result<()> { @@ -128,7 +129,7 @@ impl RemoteStorage for LocalFs { })?, ); - io::copy(&mut from, &mut destination) + io::copy(&mut from.take(from_size_kb as u64), &mut destination) .await .with_context(|| { format!( @@ -509,13 +510,13 @@ mod fs_tests { let repo_harness = RepoHarness::create("upload_file")?; let storage = create_storage()?; - let source = create_file_for_upload( + let (file, size) = create_file_for_upload( &storage.pageserver_workdir.join("whatever"), "whatever_contents", ) .await?; let target_path = PathBuf::from("/").join("somewhere").join("else"); - match storage.upload(source, &target_path, None).await { + match storage.upload(file, size, &target_path, None).await { Ok(()) => panic!("Should not allow storing files with wrong target path"), Err(e) => { let message = format!("{:?}", e); @@ -800,24 +801,17 @@ mod fs_tests { let timeline_path = harness.timeline_path(&TIMELINE_ID); let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?; let storage_path = storage.root.join(relative_timeline_path).join(name); - storage - .upload( - create_file_for_upload( - &storage.pageserver_workdir.join(name), - &dummy_contents(name), - ) - .await?, - &storage_path, - metadata, - ) - .await?; + + let from_path = storage.pageserver_workdir.join(name); + let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?; + storage.upload(file, size, &storage_path, metadata).await?; Ok(storage_path) } async fn create_file_for_upload( path: &Path, contents: &str, - ) -> anyhow::Result> { + ) -> anyhow::Result<(io::BufReader, usize)> { std::fs::create_dir_all(path.parent().unwrap())?; let mut file_for_writing = std::fs::OpenOptions::new() .write(true) @@ -825,8 +819,10 @@ mod fs_tests { .open(path)?; write!(file_for_writing, "{}", contents)?; drop(file_for_writing); - Ok(io::BufReader::new( - fs::OpenOptions::new().read(true).open(&path).await?, + let file_size = path.metadata()?.len() as usize; + Ok(( + io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?), + file_size, )) } diff --git a/pageserver/src/remote_storage/s3_bucket.rs b/pageserver/src/remote_storage/s3_bucket.rs index bfd28168f4..b99fa478c4 100644 --- a/pageserver/src/remote_storage/s3_bucket.rs +++ b/pageserver/src/remote_storage/s3_bucket.rs @@ -180,12 +180,16 @@ impl RemoteStorage for S3Bucket { async fn upload( &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + from_size_kb: usize, to: &Self::StoragePath, metadata: Option, ) -> anyhow::Result<()> { self.client .put_object(PutObjectRequest { - body: Some(StreamingBody::new(ReaderStream::new(from))), + body: Some(StreamingBody::new_with_size( + ReaderStream::new(from), + from_size_kb, + )), bucket: self.bucket_name.clone(), key: to.key().to_owned(), metadata: metadata.map(|m| m.0), From 3e6087a12f26ebefe6b91ea78be5d927c72b2a48 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 8 Apr 2022 20:17:37 +0300 Subject: 
[PATCH 0163/1022] Remove S3 archiving --- Cargo.lock | 44 - pageserver/Cargo.toml | 1 - pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/bin/pageserver_zst.rs | 334 ---- pageserver/src/http/openapi_spec.yml | 1 + pageserver/src/http/routes.rs | 154 +- pageserver/src/layered_repository.rs | 7 +- pageserver/src/remote_storage.rs | 75 +- pageserver/src/remote_storage/README.md | 52 - pageserver/src/remote_storage/local_fs.rs | 21 +- pageserver/src/remote_storage/s3_bucket.rs | 14 +- pageserver/src/remote_storage/storage_sync.rs | 1766 +++++++++++------ .../storage_sync/compression.rs | 612 ------ .../remote_storage/storage_sync/download.rs | 591 +++--- .../src/remote_storage/storage_sync/index.rs | 657 +++--- .../src/remote_storage/storage_sync/upload.rs | 810 ++++---- pageserver/src/repository.rs | 2 - pageserver/src/tenant_mgr.rs | 4 +- pageserver/src/timelines.rs | 4 +- pageserver/src/walreceiver.rs | 2 +- .../batch_others/test_remote_storage.py | 45 +- zenith/src/main.rs | 4 +- 22 files changed, 2360 insertions(+), 2842 deletions(-) delete mode 100644 pageserver/src/bin/pageserver_zst.rs delete mode 100644 pageserver/src/remote_storage/README.md delete mode 100644 pageserver/src/remote_storage/storage_sync/compression.rs diff --git a/Cargo.lock b/Cargo.lock index a933b44356..3480f120e0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -55,20 +55,6 @@ dependencies = [ "backtrace", ] -[[package]] -name = "async-compression" -version = "0.3.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2bf394cfbbe876f0ac67b13b6ca819f9c9f2fb9ec67223cceb1555fbab1c31a" -dependencies = [ - "futures-core", - "memchr", - "pin-project-lite", - "tokio", - "zstd", - "zstd-safe", -] - [[package]] name = "async-stream" version = "0.3.3" @@ -1508,7 +1494,6 @@ name = "pageserver" version = "0.1.0" dependencies = [ "anyhow", - "async-compression", "async-trait", "byteorder", "bytes", @@ -3428,32 +3413,3 @@ name = "zeroize" version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c88870063c39ee00ec285a2f8d6a966e5b6fb2becc4e8dac77ed0d370ed6006" - -[[package]] -name = "zstd" -version = "0.10.0+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b1365becbe415f3f0fcd024e2f7b45bacfb5bdd055f0dc113571394114e7bdd" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "4.1.4+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f7cd17c9af1a4d6c24beb1cc54b17e2ef7b593dc92f19e9d9acad8b182bbaee" -dependencies = [ - "libc", - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "1.6.3+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc49afa5c8d634e75761feda8c592051e7eeb4683ba827211eb0d731d3402ea8" -dependencies = [ - "cc", - "libc", -] diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 3825795059..1a533af95f 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -46,7 +46,6 @@ fail = "0.5.0" rusoto_core = "0.47" rusoto_s3 = "0.47" async-trait = "0.1" -async-compression = {version = "0.3", features = ["zstd", "tokio"]} postgres_ffi = { path = "../postgres_ffi" } zenith_metrics = { path = "../zenith_metrics" } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 0af96cff66..1610a26239 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -293,7 +293,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: 
bool) -> Result<() "http_endpoint_thread", false, move || { - let router = http::make_router(conf, auth_cloned, remote_index); + let router = http::make_router(conf, auth_cloned, remote_index)?; endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher()) }, )?; diff --git a/pageserver/src/bin/pageserver_zst.rs b/pageserver/src/bin/pageserver_zst.rs deleted file mode 100644 index 5b8f8cc3c6..0000000000 --- a/pageserver/src/bin/pageserver_zst.rs +++ /dev/null @@ -1,334 +0,0 @@ -//! A CLI helper to deal with remote storage (S3, usually) blobs as archives. -//! See [`compression`] for more details about the archives. - -use std::{collections::BTreeSet, path::Path}; - -use anyhow::{bail, ensure, Context}; -use clap::{App, Arg}; -use pageserver::{ - layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}, - remote_storage::compression, -}; -use tokio::{fs, io}; -use zenith_utils::GIT_VERSION; - -const LIST_SUBCOMMAND: &str = "list"; -const ARCHIVE_ARG_NAME: &str = "archive"; - -const EXTRACT_SUBCOMMAND: &str = "extract"; -const TARGET_DIRECTORY_ARG_NAME: &str = "target_directory"; - -const CREATE_SUBCOMMAND: &str = "create"; -const SOURCE_DIRECTORY_ARG_NAME: &str = "source_directory"; - -#[tokio::main(flavor = "current_thread")] -async fn main() -> anyhow::Result<()> { - let arg_matches = App::new("pageserver zst blob [un]compressor utility") - .version(GIT_VERSION) - .subcommands(vec![ - App::new(LIST_SUBCOMMAND) - .about("List the archive contents") - .arg( - Arg::new(ARCHIVE_ARG_NAME) - .required(true) - .takes_value(true) - .help("An archive to list the contents of"), - ), - App::new(EXTRACT_SUBCOMMAND) - .about("Extracts the archive into the directory") - .arg( - Arg::new(ARCHIVE_ARG_NAME) - .required(true) - .takes_value(true) - .help("An archive to extract"), - ) - .arg( - Arg::new(TARGET_DIRECTORY_ARG_NAME) - .required(false) - .takes_value(true) - .help("A directory to extract the archive into. Optional, will use the current directory if not specified"), - ), - App::new(CREATE_SUBCOMMAND) - .about("Creates an archive with the contents of a directory (only the first level files are taken, metadata file has to be present in the same directory)") - .arg( - Arg::new(SOURCE_DIRECTORY_ARG_NAME) - .required(true) - .takes_value(true) - .help("A directory to use for creating the archive"), - ) - .arg( - Arg::new(TARGET_DIRECTORY_ARG_NAME) - .required(false) - .takes_value(true) - .help("A directory to create the archive in. 
Optional, will use the current directory if not specified"), - ), - ]) - .get_matches(); - - let subcommand_name = match arg_matches.subcommand_name() { - Some(name) => name, - None => bail!("No subcommand specified"), - }; - - let subcommand_matches = match arg_matches.subcommand_matches(subcommand_name) { - Some(matches) => matches, - None => bail!( - "No subcommand arguments were recognized for subcommand '{}'", - subcommand_name - ), - }; - - let target_dir = Path::new( - subcommand_matches - .value_of(TARGET_DIRECTORY_ARG_NAME) - .unwrap_or("./"), - ); - - match subcommand_name { - LIST_SUBCOMMAND => { - let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) { - Some(archive) => Path::new(archive), - None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME), - }; - list_archive(archive).await - } - EXTRACT_SUBCOMMAND => { - let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) { - Some(archive) => Path::new(archive), - None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME), - }; - extract_archive(archive, target_dir).await - } - CREATE_SUBCOMMAND => { - let source_dir = match subcommand_matches.value_of(SOURCE_DIRECTORY_ARG_NAME) { - Some(source) => Path::new(source), - None => bail!("No '{}' argument is specified", SOURCE_DIRECTORY_ARG_NAME), - }; - create_archive(source_dir, target_dir).await - } - unknown => bail!("Unknown subcommand {}", unknown), - } -} - -async fn list_archive(archive: &Path) -> anyhow::Result<()> { - let archive = archive.canonicalize().with_context(|| { - format!( - "Failed to get the absolute path for the archive path '{}'", - archive.display() - ) - })?; - ensure!( - archive.is_file(), - "Path '{}' is not an archive file", - archive.display() - ); - println!("Listing an archive at path '{}'", archive.display()); - let archive_name = match archive.file_name().and_then(|name| name.to_str()) { - Some(name) => name, - None => bail!( - "Failed to get the archive name from the path '{}'", - archive.display() - ), - }; - - let archive_bytes = fs::read(&archive) - .await - .context("Failed to read the archive bytes")?; - - let header = compression::read_archive_header(archive_name, &mut archive_bytes.as_slice()) - .await - .context("Failed to read the archive header")?; - - let empty_path = Path::new(""); - println!("-------------------------------"); - - let longest_path_in_archive = header - .files - .iter() - .filter_map(|file| Some(file.subpath.as_path(empty_path).to_str()?.len())) - .max() - .unwrap_or_default() - .max(METADATA_FILE_NAME.len()); - - for regular_file in &header.files { - println!( - "File: {:width$} uncompressed size: {} bytes", - regular_file.subpath.as_path(empty_path).display(), - regular_file.size, - width = longest_path_in_archive, - ) - } - println!( - "File: {:width$} uncompressed size: {} bytes", - METADATA_FILE_NAME, - header.metadata_file_size, - width = longest_path_in_archive, - ); - println!("-------------------------------"); - - Ok(()) -} - -async fn extract_archive(archive: &Path, target_dir: &Path) -> anyhow::Result<()> { - let archive = archive.canonicalize().with_context(|| { - format!( - "Failed to get the absolute path for the archive path '{}'", - archive.display() - ) - })?; - ensure!( - archive.is_file(), - "Path '{}' is not an archive file", - archive.display() - ); - let archive_name = match archive.file_name().and_then(|name| name.to_str()) { - Some(name) => name, - None => bail!( - "Failed to get the archive name from the path '{}'", - archive.display() - ), - }; - - if 
!target_dir.exists() { - fs::create_dir_all(target_dir).await.with_context(|| { - format!( - "Failed to create the target dir at path '{}'", - target_dir.display() - ) - })?; - } - let target_dir = target_dir.canonicalize().with_context(|| { - format!( - "Failed to get the absolute path for the target dir path '{}'", - target_dir.display() - ) - })?; - ensure!( - target_dir.is_dir(), - "Path '{}' is not a directory", - target_dir.display() - ); - let mut dir_contents = fs::read_dir(&target_dir) - .await - .context("Failed to list the target directory contents")?; - let dir_entry = dir_contents - .next_entry() - .await - .context("Failed to list the target directory contents")?; - ensure!( - dir_entry.is_none(), - "Target directory '{}' is not empty", - target_dir.display() - ); - - println!( - "Extracting an archive at path '{}' into directory '{}'", - archive.display(), - target_dir.display() - ); - - let mut archive_file = fs::File::open(&archive).await.with_context(|| { - format!( - "Failed to get the archive name from the path '{}'", - archive.display() - ) - })?; - let header = compression::read_archive_header(archive_name, &mut archive_file) - .await - .context("Failed to read the archive header")?; - compression::uncompress_with_header(&BTreeSet::new(), &target_dir, header, &mut archive_file) - .await - .context("Failed to extract the archive") -} - -async fn create_archive(source_dir: &Path, target_dir: &Path) -> anyhow::Result<()> { - let source_dir = source_dir.canonicalize().with_context(|| { - format!( - "Failed to get the absolute path for the source dir path '{}'", - source_dir.display() - ) - })?; - ensure!( - source_dir.is_dir(), - "Path '{}' is not a directory", - source_dir.display() - ); - - if !target_dir.exists() { - fs::create_dir_all(target_dir).await.with_context(|| { - format!( - "Failed to create the target dir at path '{}'", - target_dir.display() - ) - })?; - } - let target_dir = target_dir.canonicalize().with_context(|| { - format!( - "Failed to get the absolute path for the target dir path '{}'", - target_dir.display() - ) - })?; - ensure!( - target_dir.is_dir(), - "Path '{}' is not a directory", - target_dir.display() - ); - - println!( - "Compressing directory '{}' and creating resulting archive in directory '{}'", - source_dir.display(), - target_dir.display() - ); - - let mut metadata_file_contents = None; - let mut files_co_archive = Vec::new(); - - let mut source_dir_contents = fs::read_dir(&source_dir) - .await - .context("Failed to read the source directory contents")?; - - while let Some(source_dir_entry) = source_dir_contents - .next_entry() - .await - .context("Failed to read a source dir entry")? 
- { - let entry_path = source_dir_entry.path(); - if entry_path.is_file() { - if entry_path.file_name().and_then(|name| name.to_str()) == Some(METADATA_FILE_NAME) { - let metadata_bytes = fs::read(entry_path) - .await - .context("Failed to read metata file bytes in the source dir")?; - metadata_file_contents = Some( - TimelineMetadata::from_bytes(&metadata_bytes) - .context("Failed to parse metata file contents in the source dir")?, - ); - } else { - files_co_archive.push(entry_path); - } - } - } - - let metadata = match metadata_file_contents { - Some(metadata) => metadata, - None => bail!( - "No metadata file found in the source dir '{}', cannot create the archive", - source_dir.display() - ), - }; - - let _ = compression::archive_files_as_stream( - &source_dir, - files_co_archive.iter(), - &metadata, - move |mut archive_streamer, archive_name| async move { - let archive_target = target_dir.join(&archive_name); - let mut archive_file = fs::File::create(&archive_target).await?; - io::copy(&mut archive_streamer, &mut archive_file).await?; - Ok(archive_target) - }, - ) - .await - .context("Failed to create an archive")?; - - Ok(()) -} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b2760efe85..c0b07418f3 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -409,6 +409,7 @@ components: type: object required: - awaits_download + - remote_consistent_lsn properties: awaits_download: type: boolean diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a0d6e922a1..f49b1d7ba3 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use anyhow::Result; +use anyhow::{Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use tracing::*; @@ -21,7 +21,10 @@ use zenith_utils::zid::{ZTenantTimelineId, ZTimelineId}; use super::models::{ StatusResponse, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest, }; -use crate::remote_storage::{schedule_timeline_download, RemoteIndex}; +use crate::config::RemoteStorageKind; +use crate::remote_storage::{ + download_index_part, schedule_timeline_download, LocalFs, RemoteIndex, RemoteTimeline, S3Bucket, +}; use crate::repository::Repository; use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId}; @@ -31,6 +34,12 @@ struct State { auth: Option>, remote_index: RemoteIndex, allowlist_routes: Vec, + remote_storage: Option, +} + +enum GenericRemoteStorage { + Local(LocalFs), + S3(S3Bucket), } impl State { @@ -38,17 +47,34 @@ impl State { conf: &'static PageServerConf, auth: Option>, remote_index: RemoteIndex, - ) -> Self { + ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() .map(|v| v.parse().unwrap()) .collect::>(); - Self { + // Note that this remote storage is created separately from the main one in the sync_loop. + // It's fine since it's stateless and some code duplication saves us from bloating the code around with generics. 
+ let remote_storage = conf + .remote_storage_config + .as_ref() + .map(|storage_config| match &storage_config.storage { + RemoteStorageKind::LocalFs(root) => { + LocalFs::new(root.clone(), &conf.workdir).map(GenericRemoteStorage::Local) + } + RemoteStorageKind::AwsS3(s3_config) => { + S3Bucket::new(s3_config, &conf.workdir).map(GenericRemoteStorage::S3) + } + }) + .transpose() + .context("Failed to init generic remote storage")?; + + Ok(Self { conf, auth, allowlist_routes, remote_index, - } + remote_storage, + }) } } @@ -122,8 +148,8 @@ async fn timeline_list_handler(request: Request) -> Result, timeline_id, }) .map(|remote_entry| RemoteTimelineInfo { - remote_consistent_lsn: remote_entry.disk_consistent_lsn(), - awaits_download: remote_entry.get_awaits_download(), + remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(), + awaits_download: remote_entry.awaits_download, }), }) } @@ -184,8 +210,8 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result { + tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id)) + .await + .context("Failed to create new timeline directory")?; + new_timeline.awaits_download = true; + new_timeline + } + Ok(None) => return Err(ApiError::NotFound("Unknown remote timeline".to_string())), + Err(e) => { + error!("Failed to retrieve remote timeline data: {:?}", e); + return Err(ApiError::NotFound( + "Failed to retrieve remote timeline".to_string(), + )); + } + }; + let mut index_accessor = remote_index.write().await; + match index_accessor.timeline_entry_mut(&sync_id) { + Some(remote_timeline) => { + if remote_timeline.awaits_download { + return Err(ApiError::Conflict( + "Timeline download is already in progress".to_string(), + )); + } + remote_timeline.awaits_download = true; + } + None => index_accessor.add_timeline_entry(sync_id, new_timeline), + } + schedule_timeline_download(tenant_id, timeline_id); json_response(StatusCode::ACCEPTED, ()) } +async fn try_download_shard_data( + state: &State, + sync_id: ZTenantTimelineId, +) -> anyhow::Result> { + let shard = match state.remote_storage.as_ref() { + Some(GenericRemoteStorage::Local(local_storage)) => { + download_index_part(state.conf, local_storage, sync_id).await + } + Some(GenericRemoteStorage::S3(s3_storage)) => { + download_index_part(state.conf, s3_storage, sync_id).await + } + None => return Ok(None), + } + .with_context(|| format!("Failed to download index shard for timeline {}", sync_id))?; + + let timeline_path = state + .conf + .timeline_path(&sync_id.timeline_id, &sync_id.tenant_id); + RemoteTimeline::from_index_part(&timeline_path, shard) + .map(Some) + .with_context(|| { + format!( + "Failed to convert index shard into remote timeline for timeline {}", + sync_id + ) + }) +} + async fn timeline_detach_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -317,7 +407,7 @@ pub fn make_router( conf: &'static PageServerConf, auth: Option>, remote_index: RemoteIndex, -) -> RouterBuilder { +) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); if auth.is_some() { @@ -331,8 +421,10 @@ pub fn make_router( })) } - router - .data(Arc::new(State::new(conf, auth, remote_index))) + Ok(router + .data(Arc::new( + State::new(conf, auth, remote_index).context("Failed to initialize router state")?, + )) .get("/v1/status", status_handler) 
.get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) @@ -350,5 +442,5 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/detach", timeline_detach_handler, ) - .any(handler_404) + .any(handler_404)) } diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 36b081e400..6769c9cfbc 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -387,8 +387,6 @@ impl Repository for LayeredRepository { timeline_id, timeline_sync_status_update ); match timeline_sync_status_update { - TimelineSyncStatusUpdate::Uploaded => { /* nothing to do, remote consistent lsn is managed by the remote storage */ - } TimelineSyncStatusUpdate::Downloaded => { match self.timelines.lock().unwrap().entry(timeline_id) { Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."), @@ -650,7 +648,8 @@ impl LayeredRepository { checkpoint_before_gc: bool, ) -> Result { let _span_guard = - info_span!("gc iteration", tenant = %self.tenantid, timeline = ?target_timelineid); + info_span!("gc iteration", tenant = %self.tenantid, timeline = ?target_timelineid) + .entered(); let mut totals: GcResult = Default::default(); let now = Instant::now(); @@ -1548,7 +1547,7 @@ impl LayeredTimeline { schedule_timeline_checkpoint_upload( self.tenantid, self.timelineid, - vec![new_delta_path], + new_delta_path, metadata, ); } diff --git a/pageserver/src/remote_storage.rs b/pageserver/src/remote_storage.rs index 8167830347..effc8dcdf4 100644 --- a/pageserver/src/remote_storage.rs +++ b/pageserver/src/remote_storage.rs @@ -9,7 +9,6 @@ //! //! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync. //! Synchronization internals are split into submodules -//! * [`storage_sync::compression`] for a custom remote storage format used to store timeline files in archives //! * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files //! * [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively //! @@ -54,25 +53,32 @@ //! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either). //! See [`crate::layered_repository`] for the upload calls and the adjacent logic. //! -//! Synchronization logic is able to communicate back with updated timeline sync states, [`TimelineSyncState`], -//! submitted via [`crate::tenant_mgr::set_timeline_states`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state. +//! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`], +//! submitted via [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state. //! Such submissions happen in two cases: //! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future //! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory //! -//! When the pageserver terminates, the upload loop finishes a current sync task (if any) and exits. +//! 
When the pageserver terminates, the sync loop finishes a current sync task (if any) and exits. //! -//! The storage logic considers `image` as a set of local files, fully representing a certain timeline at given moment (identified with `disk_consistent_lsn`). +//! The storage logic considers `image` as a set of local files (layers), fully representing a certain timeline at given moment (identified with `disk_consistent_lsn` from the corresponding `metadata` file). //! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed //! by the storage upload, if enabled. -//! Yet timeline cannot alter already existing files, and normally cannot remote those too: only a GC process is capable of removing unused files. +//! Yet timeline cannot alter already existing files, and cannot remove those too: only a GC process is capable of removing unused files. //! This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable": //! * when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state //! * no files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten //! when the newer image is downloaded //! -//! To optimize S3 storage (and access), the sync loop compresses the checkpoint files before placing them to S3, and uncompresses them back, keeping track of timeline files and metadata. -//! Also, the remote file list is queried once only, at startup, to avoid possible extra costs and latency issues. +//! Pageserver maintains similar to the local file structure remotely: all layer files are uploaded with the same names under the same directory structure. +//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexShard`], containing the list of remote files. +//! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download. +//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`], +//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrive its shard contents, if needed, same as any layer files. +//! +//! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed. +//! Bulk index data download happens only initially, on pageserer startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only, +//! when a new timeline is scheduled for the download. //! //! NOTES: //! * pageserver assumes it has exclusive write access to the remote storage. 
If supported, the way multiple pageservers can be separated in the same storage @@ -86,7 +92,7 @@ mod s3_bucket; mod storage_sync; use std::{ - collections::HashMap, + collections::{HashMap, HashSet}, ffi, fs, path::{Path, PathBuf}, }; @@ -94,22 +100,36 @@ use std::{ use anyhow::{bail, Context}; use tokio::io; use tracing::{debug, error, info}; -use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; -pub use self::storage_sync::index::{RemoteIndex, TimelineIndexEntry}; -pub use self::storage_sync::{schedule_timeline_checkpoint_upload, schedule_timeline_download}; -use self::{local_fs::LocalFs, s3_bucket::S3Bucket}; -use crate::layered_repository::ephemeral_file::is_ephemeral_file; +pub use self::{ + local_fs::LocalFs, + s3_bucket::S3Bucket, + storage_sync::{ + download_index_part, + index::{IndexPart, RemoteIndex, RemoteTimeline}, + schedule_timeline_checkpoint_upload, schedule_timeline_download, + }, +}; use crate::{ config::{PageServerConf, RemoteStorageKind}, - layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}, + layered_repository::{ + ephemeral_file::is_ephemeral_file, + metadata::{TimelineMetadata, METADATA_FILE_NAME}, + }, }; +use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; -pub use storage_sync::compression; - +/// A timeline status to share with pageserver's sync counterpart, +/// after comparing local and remote timeline state. #[derive(Clone, Copy, Debug)] pub enum LocalTimelineInitStatus { + /// The timeline has every remote layer present locally. + /// There could be some layers requiring uploading, + /// but this does not block the timeline from any user interaction. LocallyComplete, + /// A timeline has some files remotely, that are not present locally and need downloading. + /// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only, + /// so the data needs to be downloaded first before the timeline can be used. NeedsSync, } @@ -179,7 +199,7 @@ pub fn start_local_timeline_sync( fn local_tenant_timeline_files( config: &'static PageServerConf, -) -> anyhow::Result)>> { +) -> anyhow::Result)>> { let mut local_tenant_timeline_files = HashMap::new(); let tenants_dir = config.tenants_path(); for tenants_dir_entry in fs::read_dir(&tenants_dir) @@ -214,9 +234,8 @@ fn local_tenant_timeline_files( fn collect_timelines_for_tenant( config: &'static PageServerConf, tenant_path: &Path, -) -> anyhow::Result)>> { - let mut timelines: HashMap)> = - HashMap::new(); +) -> anyhow::Result)>> { + let mut timelines = HashMap::new(); let tenant_id = tenant_path .file_name() .and_then(ffi::OsStr::to_str) @@ -265,8 +284,8 @@ fn collect_timelines_for_tenant( // NOTE: ephemeral files are excluded from the list fn collect_timeline_files( timeline_dir: &Path, -) -> anyhow::Result<(ZTimelineId, TimelineMetadata, Vec)> { - let mut timeline_files = Vec::new(); +) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet)> { + let mut timeline_files = HashSet::new(); let mut timeline_metadata_path = None; let timeline_id = timeline_dir @@ -286,7 +305,7 @@ fn collect_timeline_files( debug!("skipping ephemeral file {}", entry_path.display()); continue; } else { - timeline_files.push(entry_path); + timeline_files.insert(entry_path); } } } @@ -307,7 +326,7 @@ fn collect_timeline_files( /// This storage tries to be unaware of any layered repository context, /// providing basic CRUD operations for storage files. 
#[async_trait::async_trait] -trait RemoteStorage: Send + Sync { +pub trait RemoteStorage: Send + Sync { /// A way to uniquely reference a file in the remote storage. type StoragePath; @@ -324,9 +343,9 @@ trait RemoteStorage: Send + Sync { async fn upload( &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, - /// S3 PUT request requires the content length to be specified, - /// otherwise it starts to fail with the concurrent connection count increasing. - from_size_kb: usize, + // S3 PUT request requires the content length to be specified, + // otherwise it starts to fail with the concurrent connection count increasing. + from_size_bytes: usize, to: &Self::StoragePath, metadata: Option, ) -> anyhow::Result<()>; diff --git a/pageserver/src/remote_storage/README.md b/pageserver/src/remote_storage/README.md deleted file mode 100644 index 43a47e09d8..0000000000 --- a/pageserver/src/remote_storage/README.md +++ /dev/null @@ -1,52 +0,0 @@ -# Non-implementation details - -This document describes the current state of the backup system in pageserver, existing limitations and concerns, why some things are done the way they are the future development plans. -Detailed description on how the synchronization works and how it fits into the rest of the pageserver can be found in the [storage module](./../remote_storage.rs) and its submodules. -Ideally, this document should disappear after current implementation concerns are mitigated, with the remaining useful knowledge bits moved into rustdocs. - -## Approach - -Backup functionality is a new component, appeared way after the core DB functionality was implemented. -Pageserver layer functionality is also quite volatile at the moment, there's a risk its local file management changes over time. - -To avoid adding more chaos into that, backup functionality is currently designed as a relatively standalone component, with the majority of its logic placed in a standalone async loop. -This way, the backups are managed in background, not affecting directly other pageserver parts: this way the backup and restoration process may lag behind, but eventually keep up with the reality. To track that, a set of prometheus metrics is exposed from pageserver. - -## What's done - -Current implementation -* provides remote storage wrappers for AWS S3 and local FS -* synchronizes the differences with local timelines and remote states as fast as possible -* uploads new layer files -* downloads and registers timelines, found on the remote storage, but missing locally, if those are requested somehow via pageserver (e.g. http api, gc) -* uses compression when deals with files, for better S3 usage -* maintains an index of what's stored remotely -* evicts failing tasks and stops the corresponding timelines - -The tasks are delayed with every retry and the retries are capped, to avoid poisonous tasks. -After any task eviction, or any error at startup checks (e.g. obviously different and wrong local and remote states fot the same timeline), -the timeline has to be stopped from submitting further checkpoint upload tasks, which is done along the corresponding timeline status change. - -No good optimisations or performance testing is done, the feature is disabled by default and gets polished over time. -It's planned to deal with all questions that are currently on and prepare the feature to be enabled by default in cloud environments. 
- -### Peculiarities - -As mentioned, the backup component is rather new and under development currently, so not all things are done properly from the start. -Here's the list of known compromises with comments: - -* Remote storage file model is currently a custom archive format, that's not possible to deserialize without a particular Rust code of ours (including `serde`). -We also don't optimize the archivation and pack every timeline checkpoint separately, so the resulting blob's size that gets on S3 could be arbitrary. -But, it's a single blob, which is way better than storing ~780 small files separately. - -* Archive index restoration requires reading every blob's head. -This could be avoided by a background thread/future storing the serialized index in the remote storage. - -* no proper file comparison - -No file checksum assertion is done currently, but should be (AWS S3 returns file checksums during the `list` operation) - -* gc is ignored - -So far, we don't adjust the remote storage based on GC thread loop results, only checkpointer loop affects the remote storage. -Index module could be used as a base to implement a deferred GC mechanism, a "defragmentation" that repacks archives into new ones after GC is done removing the files from the archives. diff --git a/pageserver/src/remote_storage/local_fs.rs b/pageserver/src/remote_storage/local_fs.rs index 15c69beebb..952b2e69fe 100644 --- a/pageserver/src/remote_storage/local_fs.rs +++ b/pageserver/src/remote_storage/local_fs.rs @@ -105,7 +105,7 @@ impl RemoteStorage for LocalFs { async fn upload( &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, - from_size_kb: usize, + from_size_bytes: usize, to: &Self::StoragePath, metadata: Option, ) -> anyhow::Result<()> { @@ -129,7 +129,11 @@ impl RemoteStorage for LocalFs { })?, ); - io::copy(&mut from.take(from_size_kb as u64), &mut destination) + let from_size_bytes = from_size_bytes as u64; + // Require to read 1 byte more than the expected to check later, that the stream and its size match. + let mut buffer_to_read = from.take(from_size_bytes + 1); + + let bytes_read = io::copy(&mut buffer_to_read, &mut destination) .await .with_context(|| { format!( @@ -138,6 +142,19 @@ impl RemoteStorage for LocalFs { ) })?; + ensure!( + bytes_read == from_size_bytes, + "Provided stream has actual size {} fthat is smaller than the given stream size {}", + bytes_read, + from_size_bytes + ); + + ensure!( + buffer_to_read.read(&mut [0]).await? == 0, + "Provided stream has bigger size than the given stream size {}", + from_size_bytes + ); + destination.flush().await.with_context(|| { format!( "Failed to upload (flush temp) file to the local storage at '{}'", diff --git a/pageserver/src/remote_storage/s3_bucket.rs b/pageserver/src/remote_storage/s3_bucket.rs index b99fa478c4..b69634a1b6 100644 --- a/pageserver/src/remote_storage/s3_bucket.rs +++ b/pageserver/src/remote_storage/s3_bucket.rs @@ -17,7 +17,7 @@ use rusoto_s3::{ }; use tokio::io; use tokio_util::io::ReaderStream; -use tracing::{debug, trace}; +use tracing::debug; use crate::{ config::S3Config, @@ -70,10 +70,6 @@ pub struct S3Bucket { impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result { - // TODO kb check this - // Keeping a single client may cause issues due to timeouts. 
- // https://github.com/rusoto/rusoto/issues/1686 - debug!( "Creating s3 remote storage for S3 bucket {}", aws_config.bucket_name @@ -91,10 +87,10 @@ impl S3Bucket { let request_dispatcher = HttpClient::new().context("Failed to create S3 http client")?; let client = if aws_config.access_key_id.is_none() && aws_config.secret_access_key.is_none() { - trace!("Using IAM-based AWS access"); + debug!("Using IAM-based AWS access"); S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region) } else { - trace!("Using credentials-based AWS access"); + debug!("Using credentials-based AWS access"); S3Client::new_with( request_dispatcher, StaticProvider::new_minimal( @@ -180,7 +176,7 @@ impl RemoteStorage for S3Bucket { async fn upload( &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, - from_size_kb: usize, + from_size_bytes: usize, to: &Self::StoragePath, metadata: Option, ) -> anyhow::Result<()> { @@ -188,7 +184,7 @@ impl RemoteStorage for S3Bucket { .put_object(PutObjectRequest { body: Some(StreamingBody::new_with_size( ReaderStream::new(from), - from_size_kb, + from_size_bytes, )), bucket: self.bucket_name.clone(), key: to.key().to_owned(), diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index 50a260491b..6ba55372c2 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -9,34 +9,32 @@ //! The pair's shared buffer of a fixed size serves as an implicit queue, holding [`SyncTask`] for local files upload/download operations. //! //! The queue gets emptied by a single thread with the loop, that polls the tasks in batches of deduplicated tasks (size configurable). -//! Every task in a batch processed concurrently, which is possible due to incremental nature of the timelines: +//! A task from the batch corresponds to a single timeline, with its files to sync merged together. +//! Every batch task and layer file in the task is processed concurrently, which is possible due to incremental nature of the timelines: //! it's not asserted, but assumed that timeline's checkpoints only add the files locally, not removing or amending the existing ones. //! Only GC removes local timeline files, the GC support is not added to sync currently, //! yet downloading extra files is not critically bad at this stage, GC can remove those again. //! -//! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via listing the remote storage contents. -//! It's enough to poll the remote state once on startup only, due to agreement that the pageserver has -//! an exclusive write access to the remote storage: new files appear in the storage only after the same -//! pageserver writes them. -//! It's important to do so, since storages like S3 can get slower and more expensive as the number of files grows. +//! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via downloading and merging the index data for all timelines, +//! present locally. +//! It's enough to poll such timelines' remote state once on startup only, due to an agreement that only one pageserver at a time has an exclusive +//! write access to remote portion of timelines that are attached to the pagegserver. //! The index state is used to issue initial sync tasks, if needed: //! * all timelines with local state behind the remote gets download tasks scheduled. -//! 
Such timelines are considered "remote" before the download succeeds, so a number of operations (gc, checkpoints) on that timeline are unavailable.
-//! * all never local state gets scheduled for upload, such timelines are "local" and fully operational
-//! * the rest of the remote timelines are reported to pageserver, but not downloaded before they are actually accessed in pageserver,
-//! it may schedule the download on such occasions.
+//! Such timelines are considered "remote" before the download succeeds, so a number of operations (gc, checkpoints) on that timeline are unavailable
+//! before up-to-date layers and the metadata file are downloaded locally.
+//! * all newer local state is scheduled for upload; such timelines are "local" and fully operational
+//! * remote timelines not present locally are unknown to the pageserver, but can be downloaded on a separate request
+//!
 //! Then, the index is shared across pageserver under [`RemoteIndex`] guard to ensure proper synchronization.
+//! The remote index gets updated after every remote storage change (after an upload), same as the index part files remotely.
 //!
-//! The synchronization unit is an archive: a set of layer files and a special metadata file, all compressed into a blob.
-//! Currently, there's no way to process an archive partially, if the archive processing fails, it has to be started from zero next time again.
-//! An archive contains set of files of a certain timeline, added during checkpoint(s) and the timeline metadata at that moment.
-//! The archive contains that metadata's `disk_consistent_lsn` in its name, to be able to restore partial index information from just a remote storage file list.
-//! The index is created at startup (possible due to exclusive ownership over the remote storage by the pageserver) and keeps track of which files were stored
-//! in what remote archives.
-//! Among other tasks, the index is used to prevent invalid uploads and non-existing downloads on demand.
-//! Refer to [`compression`] and [`index`] for more details on the archives and index respectively.
+//! A remote timeline contains a set of layer files created during checkpoint(s), and a serialized [`IndexPart`] file with the timeline metadata and all remote layer paths inside.
+//! Those paths are used instead of the `S3 list` command to avoid its slowness and expensiveness for a big number of files.
+//! If the index part does not contain some file path but it's present remotely, such a file is invisible to the pageserver and ignored.
+//! Among other tasks, the index is used to prevent invalid uploads and non-existing downloads on demand, refer to [`index`] for more details.
 //!
-//! The list construction is currently the only place where the storage sync can return an [`Err`] to the user.
+//! Index construction is currently the only place where the storage sync can return an [`Err`] to the user.
 //! New sync tasks are accepted via [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] functions,
 //! disregarding of the corresponding loop startup.
 //! It's up to the caller to avoid synchronizations if the loop is disabled: otherwise, the sync tasks will be ignored.
@@ -44,42 +42,39 @@
 //! reschedule the same task, with possibly less files to sync:
 //! * download tasks currently never replace existing local file with metadata file as an exception
-//! (but this is a subject to change when checksum checks are implemented: all files could get overwritten on a checksum mismatch)
-//! * download tasks carry the information of skipped acrhives, so resubmissions are not downloading successfully processed archives again
+//! * download tasks carry the information of skipped layers, so resubmissions are not downloading successfully processed layers again
+//! * downloads do not contain any actual files to download, so that "external" sync pageserver code is able to schedule the timeline download
+//! without accessing any extra information about its files.
 //!
-//! Not every upload of the same timeline gets processed: if the checkpoint with the same `disk_consistent_lsn` was already uploaded, no reuploads happen, as checkpoints
-//! are considered to be immutable. The order of `lsn` during upload submissions is allowed to be arbitrary and not required to be ascending.
+//! Uploads and downloads sync layer files in arbitrary order, but only after all layer files are synced are the local metadata (for download) and the remote index part (for upload) updated,
+//! to avoid having a corrupt state without the relevant layer files.
 //! Refer to [`upload`] and [`download`] for more details.
 //!
-//! Current uploads are per-checkpoint and don't accumulate any data with optimal size for storing on S3.
-//! The downloaded archives get processed sequentially, from smaller `disk_consistent_lsn` to larger, with metadata files being added as last.
-//! The archive unpacking is designed to unpack metadata as the last file, so the risk of leaving the corrupt timeline due to uncompression error is small (while not eliminated entirely and that should be improved).
-//! There's a reschedule threshold that evicts tasks that fail too much and stops the corresponding timeline so it does not diverge from the state on the remote storage.
-//! Among other pageserver-specific changes to such evicted timelines, no uploads are expected to come from them to ensure the remote storage state does not get corrupted.
-//!
-//! Synchronization never removes any local from pageserver workdir or remote files from the remote storage, yet there could be overwrites of the same files (metadata file updates; future checksum mismatch fixes).
+//! Synchronization never removes any local files from the pageserver workdir or remote files from the remote storage, yet there could be overwrites of the same files (index part and metadata file updates, future checksum mismatch fixes).
 //! NOTE: No real contents or checksum check happens right now and is a subject to improve later.
 //!
 //! After the whole timeline is downloaded, [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function is used to update pageserver memory stage for the timeline processed.
 //!
 //! When pageserver signals shutdown, current sync task gets finished and the loop exists.
 
-/// Expose the module for a binary CLI tool that deals with the corresponding blobs.
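To make the per-timeline index part model from the module docs above concrete, here is a minimal sketch of the idea. This is not the real `index::IndexPart` definition: the struct name, its fields, and the JSON encoding are illustrative assumptions only.

```rust
// A minimal, self-contained sketch of a per-timeline "index part": one small serialized
// object per timeline that lists its remote layer paths and metadata, so startup can do
// one GET + deserialize per locally present timeline instead of listing every layer object.
use std::collections::HashSet;
use std::path::PathBuf;

use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize)]
struct IndexPartSketch {
    /// Relative paths of every layer file the remote timeline is known to contain.
    timeline_layers: HashSet<PathBuf>,
    /// `disk_consistent_lsn` of the metadata uploaded together with those layers.
    disk_consistent_lsn: u64,
}

fn main() -> anyhow::Result<()> {
    let part = IndexPartSketch {
        timeline_layers: HashSet::from([PathBuf::from("layer_a"), PathBuf::from("layer_b")]),
        disk_consistent_lsn: 42,
    };

    // An upload PUTs these bytes next to the layer files; startup does the reverse and
    // rebuilds the remote index from the deserialized parts.
    let bytes = serde_json::to_vec(&part)?;
    let restored: IndexPartSketch = serde_json::from_slice(&bytes)?;
    assert_eq!(restored.disk_consistent_lsn, part.disk_consistent_lsn);
    assert_eq!(restored.timeline_layers, part.timeline_layers);
    Ok(())
}
```

At startup, `try_fetch_index_parts` further down in this patch performs this fetch concurrently for every locally present timeline.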
-pub mod compression; mod download; pub mod index; mod upload; use std::{ - collections::{BTreeSet, HashMap, VecDeque}, + collections::{hash_map, HashMap, HashSet, VecDeque}, + fmt::Debug, num::{NonZeroU32, NonZeroUsize}, + ops::ControlFlow, path::{Path, PathBuf}, sync::Arc, }; -use anyhow::{bail, Context}; +use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; use lazy_static::lazy_static; use tokio::{ + fs, runtime::Runtime, sync::mpsc::{self, UnboundedReceiver}, time::{Duration, Instant}, @@ -87,23 +82,21 @@ use tokio::{ use tracing::*; use self::{ - compression::ArchiveHeader, - download::{download_timeline, DownloadedTimeline}, - index::{ - ArchiveDescription, ArchiveId, RemoteIndex, RemoteTimeline, RemoteTimelineIndex, - TimelineIndexEntry, TimelineIndexEntryInner, - }, - upload::upload_timeline_checkpoint, + download::{download_timeline_layers, DownloadedTimeline}, + index::{IndexPart, RemoteIndex, RemoteTimeline, RemoteTimelineIndex}, + upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, }; use super::{ LocalTimelineInitStatus, LocalTimelineInitStatuses, RemoteStorage, SyncStartupData, ZTenantTimelineId, }; use crate::{ - config::PageServerConf, layered_repository::metadata::TimelineMetadata, - remote_storage::storage_sync::compression::read_archive_header, - repository::TimelineSyncStatusUpdate, tenant_mgr::apply_timeline_sync_status_updates, - thread_mgr, thread_mgr::ThreadKind, + config::PageServerConf, + layered_repository::metadata::{metadata_path, TimelineMetadata}, + repository::TimelineSyncStatusUpdate, + tenant_mgr::apply_timeline_sync_status_updates, + thread_mgr, + thread_mgr::ThreadKind, }; use zenith_metrics::{ @@ -112,6 +105,8 @@ use zenith_metrics::{ }; use zenith_utils::zid::{ZTenantId, ZTimelineId}; +pub use self::download::download_index_part; + lazy_static! { static ref REMAINING_SYNC_ITEMS: IntGauge = register_int_gauge!( "pageserver_remote_storage_remaining_sync_items", @@ -140,7 +135,7 @@ lazy_static! { /// mpsc approach was picked to allow blocking the sync loop if no tasks are present, to avoid meaningless spinning. mod sync_queue { use std::{ - collections::HashMap, + collections::{hash_map, HashMap}, sync::atomic::{AtomicUsize, Ordering}, }; @@ -150,13 +145,14 @@ mod sync_queue { use tracing::{debug, warn}; use super::SyncTask; + use zenith_utils::zid::ZTenantTimelineId; - static SENDER: OnceCell> = OnceCell::new(); + static SENDER: OnceCell> = OnceCell::new(); static LENGTH: AtomicUsize = AtomicUsize::new(0); /// Initializes the queue with the given sender channel that is used to put the tasks into later. /// Errors if called more than once. - pub fn init(sender: UnboundedSender) -> anyhow::Result<()> { + pub fn init(sender: UnboundedSender<(ZTenantTimelineId, SyncTask)>) -> anyhow::Result<()> { SENDER .set(sender) .map_err(|_sender| anyhow!("sync queue was already initialized"))?; @@ -165,9 +161,9 @@ mod sync_queue { /// Adds a new task to the queue, if the queue was initialized, returning `true` on success. /// On any error, or if the queue was not initialized, the task gets dropped (not scheduled) and `false` is returned. 
- pub fn push(new_task: SyncTask) -> bool { + pub fn push(sync_id: ZTenantTimelineId, new_task: SyncTask) -> bool { if let Some(sender) = SENDER.get() { - match sender.send(new_task) { + match sender.send((sync_id, new_task)) { Err(e) => { warn!( "Failed to enqueue a sync task: the receiver is dropped: {}", @@ -189,7 +185,9 @@ mod sync_queue { /// Polls a new task from the queue, using its receiver counterpart. /// Does not block if the queue is empty, returning [`None`] instead. /// Needed to correctly track the queue length. - pub async fn next_task(receiver: &mut UnboundedReceiver) -> Option { + pub async fn next_task( + receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, + ) -> Option<(ZTenantTimelineId, SyncTask)> { let task = receiver.recv().await; if task.is_some() { LENGTH.fetch_sub(1, Ordering::Relaxed); @@ -199,25 +197,35 @@ mod sync_queue { /// Fetches a task batch, not bigger than the given limit. /// Not blocking, can return fewer tasks if the queue does not contain enough. - /// Duplicate entries are eliminated and not considered in batch size calculations. + /// Batch tasks are split by timelines, with all related tasks merged into one (download/upload) + /// or two (download and upload, if both were found in the queue during batch construction). pub async fn next_task_batch( - receiver: &mut UnboundedReceiver, + receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, mut max_batch_size: usize, - ) -> Vec { + ) -> HashMap { if max_batch_size == 0 { - return Vec::new(); + return HashMap::new(); } - let mut tasks = HashMap::with_capacity(max_batch_size); + let mut tasks: HashMap = + HashMap::with_capacity(max_batch_size); loop { match receiver.try_recv() { - Ok(new_task) => { + Ok((sync_id, new_task)) => { LENGTH.fetch_sub(1, Ordering::Relaxed); - if tasks.insert(new_task.sync_id, new_task).is_none() { - max_batch_size -= 1; - if max_batch_size == 0 { - break; + match tasks.entry(sync_id) { + hash_map::Entry::Occupied(o) => { + let current = o.remove(); + tasks.insert(sync_id, current.merge(new_task)); } + hash_map::Entry::Vacant(v) => { + v.insert(new_task); + } + } + + max_batch_size -= 1; + if max_batch_size == 0 { + break; } } Err(TryRecvError::Disconnected) => { @@ -231,7 +239,7 @@ mod sync_queue { } } - tasks.into_values().collect() + tasks } /// Length of the queue, assuming that all receiver counterparts were only called using the queue api. @@ -242,55 +250,162 @@ mod sync_queue { /// A task to run in the async download/upload loop. /// Limited by the number of retries, after certain threshold the failing task gets evicted and the timeline disabled. -#[derive(Debug, Clone)] -pub struct SyncTask { - sync_id: ZTenantTimelineId, - retries: u32, - kind: SyncKind, +#[derive(Debug)] +pub enum SyncTask { + /// A checkpoint outcome with possible local file updates that need actualization in the remote storage. + /// Not necessary more fresh than the one already uploaded. + Download(SyncData), + /// A certain amount of image files to download. + Upload(SyncData), + /// Both upload and download layers need to be synced. + DownloadAndUpload(SyncData, SyncData), } -impl SyncTask { - fn new(sync_id: ZTenantTimelineId, retries: u32, kind: SyncKind) -> Self { - Self { - sync_id, - retries, - kind, - } +/// Stores the data to synd and its retries, to evict the tasks failing to frequently. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SyncData { + retries: u32, + data: T, +} + +impl SyncData { + fn new(retries: u32, data: T) -> Self { + Self { retries, data } } } -#[derive(Debug, Clone)] -enum SyncKind { - /// A certain amount of images (archive files) to download. - Download(TimelineDownload), - /// A checkpoint outcome with possible local file updates that need actualization in the remote storage. - /// Not necessary more fresh than the one already uploaded. - Upload(NewCheckpoint), -} +impl SyncTask { + fn download(download_task: TimelineDownload) -> Self { + Self::Download(SyncData::new(0, download_task)) + } -impl SyncKind { - fn sync_name(&self) -> &'static str { + fn upload(upload_task: TimelineUpload) -> Self { + Self::Upload(SyncData::new(0, upload_task)) + } + + /// Merges two tasks into one with the following rules: + /// + /// * Download + Download = Download with the retry counter reset and the layers to skip combined + /// * DownloadAndUpload + Download = DownloadAndUpload with Upload unchanged and the Download counterparts united by the same rules + /// * Upload + Upload = Upload with the retry counter reset and the layers to upload and the uploaded layers combined + /// * DownloadAndUpload + Upload = DownloadAndUpload with Download unchanged and the Upload counterparts united by the same rules + /// * Upload + Download = DownloadAndUpload with both tasks unchanged + /// * DownloadAndUpload + DownloadAndUpload = DownloadAndUpload with both parts united by the same rules + fn merge(mut self, other: Self) -> Self { + match (&mut self, other) { + ( + SyncTask::DownloadAndUpload(download_data, _) | SyncTask::Download(download_data), + SyncTask::Download(new_download_data), + ) + | ( + SyncTask::Download(download_data), + SyncTask::DownloadAndUpload(new_download_data, _), + ) => { + download_data + .data + .layers_to_skip + .extend(new_download_data.data.layers_to_skip.into_iter()); + download_data.retries = 0; + } + (SyncTask::Upload(upload), SyncTask::Download(new_download_data)) => { + self = SyncTask::DownloadAndUpload(new_download_data, upload.clone()); + } + + ( + SyncTask::DownloadAndUpload(_, upload_data) | SyncTask::Upload(upload_data), + SyncTask::Upload(new_upload_data), + ) + | (SyncTask::Upload(upload_data), SyncTask::DownloadAndUpload(_, new_upload_data)) => { + upload_data + .data + .layers_to_upload + .extend(new_upload_data.data.layers_to_upload.into_iter()); + upload_data + .data + .uploaded_layers + .extend(new_upload_data.data.uploaded_layers.into_iter()); + upload_data.retries = 0; + + if new_upload_data.data.metadata.disk_consistent_lsn() + > upload_data.data.metadata.disk_consistent_lsn() + { + upload_data.data.metadata = new_upload_data.data.metadata; + } + } + (SyncTask::Download(download), SyncTask::Upload(new_upload_data)) => { + self = SyncTask::DownloadAndUpload(download.clone(), new_upload_data) + } + + ( + SyncTask::DownloadAndUpload(download_data, upload_data), + SyncTask::DownloadAndUpload(new_download_data, new_upload_data), + ) => { + download_data + .data + .layers_to_skip + .extend(new_download_data.data.layers_to_skip.into_iter()); + download_data.retries = 0; + + upload_data + .data + .layers_to_upload + .extend(new_upload_data.data.layers_to_upload.into_iter()); + upload_data + .data + .uploaded_layers + .extend(new_upload_data.data.uploaded_layers.into_iter()); + upload_data.retries = 0; + + if new_upload_data.data.metadata.disk_consistent_lsn() + > upload_data.data.metadata.disk_consistent_lsn() + { + 
upload_data.data.metadata = new_upload_data.data.metadata; + } + } + } + + self + } + + fn name(&self) -> &'static str { match self { - Self::Download(_) => "download", - Self::Upload(_) => "upload", + SyncTask::Download(_) => "download", + SyncTask::Upload(_) => "upload", + SyncTask::DownloadAndUpload(_, _) => "download and upload", + } + } + + fn retries(&self) -> u32 { + match self { + SyncTask::Download(data) => data.retries, + SyncTask::Upload(data) => data.retries, + SyncTask::DownloadAndUpload(download_data, upload_data) => { + download_data.retries.max(upload_data.retries) + } } } } /// Local timeline files for upload, appeared after the new checkpoint. /// Current checkpoint design assumes new files are added only, no deletions or amendment happens. -#[derive(Debug, Clone)] -pub struct NewCheckpoint { - /// layer file paths in the pageserver workdir, that were added for the corresponding checkpoint. - layers: Vec, +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TimelineUpload { + /// Layer file path in the pageserver workdir, that were added for the corresponding checkpoint. + layers_to_upload: HashSet, + /// Already uploaded layers. Used to store the data about the uploads between task retries + /// and to record the data into the remote index after the task got completed or evicted. + uploaded_layers: HashSet, metadata: TimelineMetadata, } -/// Info about the remote image files. -#[derive(Debug, Clone)] -struct TimelineDownload { - files_to_skip: Arc>, - archives_to_skip: BTreeSet, +/// A timeline download task. +/// Does not contain the file list to download, to allow other +/// parts of the pageserer code to schedule the task +/// without using the remote index or any other ways to list the remote timleine files. +/// Skips the files that are already downloaded. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TimelineDownload { + layers_to_skip: HashSet, } /// Adds the new checkpoint files as an upload sync task to the queue. @@ -300,22 +415,20 @@ struct TimelineDownload { pub fn schedule_timeline_checkpoint_upload( tenant_id: ZTenantId, timeline_id: ZTimelineId, - layers: Vec, + new_layer: PathBuf, metadata: TimelineMetadata, ) { - if layers.is_empty() { - debug!("Skipping empty layers upload task"); - return; - } - - if !sync_queue::push(SyncTask::new( + if !sync_queue::push( ZTenantTimelineId { tenant_id, timeline_id, }, - 0, - SyncKind::Upload(NewCheckpoint { layers, metadata }), - )) { + SyncTask::upload(TimelineUpload { + layers_to_upload: HashSet::from([new_layer]), + uploaded_layers: HashSet::new(), + metadata, + }), + ) { warn!( "Could not send an upload task for tenant {}, timeline {}", tenant_id, timeline_id @@ -329,12 +442,10 @@ pub fn schedule_timeline_checkpoint_upload( } /// Requests the download of the entire timeline for a given tenant. -/// No existing local files are currently owerwritten, except the metadata file. -/// The timeline downloads checkpoint archives, from the earliest `disc_consistent_lsn` to the latest, -/// replacing the metadata file as the lasat file in every archive uncompression result. +/// No existing local files are currently overwritten, except the metadata file (if its disk_consistent_lsn is less than the downloaded one). +/// The metadata file is always updated last, to avoid inconsistencies. /// -/// On any failure, the task gets retried, omitting already downloaded archives and files -/// (yet requiring to download the entire archive even if it got partially extracted before the failure). 
+/// On any failure, the task gets retried, omitting already downloaded layers. /// /// Ensure that the loop is started otherwise the task is never processed. pub fn schedule_timeline_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { @@ -342,31 +453,30 @@ pub fn schedule_timeline_download(tenant_id: ZTenantId, timeline_id: ZTimelineId "Scheduling timeline download for tenant {}, timeline {}", tenant_id, timeline_id ); - sync_queue::push(SyncTask::new( + sync_queue::push( ZTenantTimelineId { tenant_id, timeline_id, }, - 0, - SyncKind::Download(TimelineDownload { - files_to_skip: Arc::new(BTreeSet::new()), - archives_to_skip: BTreeSet::new(), + SyncTask::download(TimelineDownload { + layers_to_skip: HashSet::new(), }), - )); + ); } /// Uses a remote storage given to start the storage sync loop. /// See module docs for loop step description. -pub(super) fn spawn_storage_sync_thread< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( +pub(super) fn spawn_storage_sync_thread( conf: &'static PageServerConf, - local_timeline_files: HashMap)>, + local_timeline_files: HashMap)>, storage: S, max_concurrent_sync: NonZeroUsize, max_sync_errors: NonZeroU32, -) -> anyhow::Result { +) -> anyhow::Result +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ let (sender, receiver) = mpsc::unbounded_channel(); sync_queue::init(sender)?; @@ -375,22 +485,13 @@ pub(super) fn spawn_storage_sync_thread< .build() .context("Failed to create storage sync runtime")?; - let download_paths = runtime - // TODO could take long time, consider [de]serializing [`RemoteTimelineIndex`] instead - .block_on(storage.list()) - .context("Failed to list remote storage files")? - .into_iter() - .filter_map(|remote_path| match storage.local_path(&remote_path) { - Ok(local_path) => Some(local_path), - Err(e) => { - error!( - "Failed to find local path for remote path {:?}: {:?}", - remote_path, e - ); - None - } - }); - let remote_index = RemoteIndex::try_parse_descriptions_from_paths(conf, download_paths); + let applicable_index_parts = runtime.block_on(try_fetch_index_parts( + conf, + &storage, + local_timeline_files.keys().copied().collect(), + )); + + let remote_index = RemoteIndex::from_parts(conf, applicable_index_parts)?; let local_timeline_init_statuses = schedule_first_sync_tasks( &mut runtime.block_on(remote_index.write()), @@ -409,8 +510,8 @@ pub(super) fn spawn_storage_sync_thread< runtime, conf, receiver, + Arc::new(storage), loop_index, - storage, max_concurrent_sync, max_sync_errors, ); @@ -424,44 +525,40 @@ pub(super) fn spawn_storage_sync_thread< }) } -enum LoopStep { - SyncStatusUpdates(HashMap>), - Shutdown, -} - #[allow(clippy::too_many_arguments)] -fn storage_sync_loop< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( +fn storage_sync_loop( runtime: Runtime, conf: &'static PageServerConf, - mut receiver: UnboundedReceiver, + mut receiver: UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, + storage: Arc, index: RemoteIndex, - storage: S, max_concurrent_sync: NonZeroUsize, max_sync_errors: NonZeroU32, -) { - let remote_assets = Arc::new((storage, index.clone())); +) where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ info!("Starting remote storage sync loop"); loop { - let index = index.clone(); + let loop_index = index.clone(); + let storage = Arc::clone(&storage); let loop_step = runtime.block_on(async { tokio::select! 
{ step = loop_step( conf, &mut receiver, - Arc::clone(&remote_assets), + storage, + loop_index, max_concurrent_sync, max_sync_errors, ) .instrument(info_span!("storage_sync_loop_step")) => step, - _ = thread_mgr::shutdown_watcher() => LoopStep::Shutdown, + _ = thread_mgr::shutdown_watcher() => ControlFlow::Break(()), } }); match loop_step { - LoopStep::SyncStatusUpdates(new_timeline_states) => { + ControlFlow::Continue(new_timeline_states) => { if new_timeline_states.is_empty() { debug!("Sync loop step completed, no new timeline states"); } else { @@ -470,10 +567,10 @@ fn storage_sync_loop< new_timeline_states.len() ); // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. - apply_timeline_sync_status_updates(conf, index, new_timeline_states); + apply_timeline_sync_status_updates(conf, &index, new_timeline_states); } } - LoopStep::Shutdown => { + ControlFlow::Break(()) => { info!("Shutdown requested, stopping"); break; } @@ -481,68 +578,64 @@ fn storage_sync_loop< } } -async fn loop_step< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( +async fn loop_step( conf: &'static PageServerConf, - receiver: &mut UnboundedReceiver, - remote_assets: Arc<(S, RemoteIndex)>, + receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, + storage: Arc, + index: RemoteIndex, max_concurrent_sync: NonZeroUsize, max_sync_errors: NonZeroU32, -) -> LoopStep { +) -> ControlFlow<(), HashMap>> +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ let max_concurrent_sync = max_concurrent_sync.get(); - let mut next_tasks = Vec::new(); // request the first task in blocking fashion to do less meaningless work - if let Some(first_task) = sync_queue::next_task(receiver).await { - next_tasks.push(first_task); - } else { - return LoopStep::Shutdown; - }; - next_tasks.extend( - sync_queue::next_task_batch(receiver, max_concurrent_sync - 1) - .await - .into_iter(), - ); + let (first_sync_id, first_task) = + if let Some(first_task) = sync_queue::next_task(receiver).await { + first_task + } else { + return ControlFlow::Break(()); + }; + + let mut batched_tasks = sync_queue::next_task_batch(receiver, max_concurrent_sync - 1).await; + match batched_tasks.entry(first_sync_id) { + hash_map::Entry::Occupied(o) => { + let current = o.remove(); + batched_tasks.insert(first_sync_id, current.merge(first_task)); + } + hash_map::Entry::Vacant(v) => { + v.insert(first_task); + } + } let remaining_queue_length = sync_queue::len(); REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); - if remaining_queue_length > 0 || !next_tasks.is_empty() { + if remaining_queue_length > 0 || !batched_tasks.is_empty() { info!( - "Processing {} tasks in batch, more tasks left to process: {}", - next_tasks.len(), + "Processing tasks for {} timelines in batch, more tasks left to process: {}", + batched_tasks.len(), remaining_queue_length ); } else { debug!("No tasks to process"); - return LoopStep::SyncStatusUpdates(HashMap::new()); + return ControlFlow::Continue(HashMap::new()); } - let mut task_batch = next_tasks + let mut sync_results = batched_tasks .into_iter() - .map(|task| async { - let sync_id = task.sync_id; - let attempt = task.retries; - let sync_name = task.kind.sync_name(); - - let extra_step = match tokio::spawn( - process_task(conf, Arc::clone(&remote_assets), task, max_sync_errors).instrument( - info_span!("process_sync_task", sync_id = %sync_id, attempt, sync_name), - ), - ) - .await 
- { - Ok(extra_step) => extra_step, - Err(e) => { - error!( - "Failed to process storage sync task for tenant {}, timeline {}: {:?}", - sync_id.tenant_id, sync_id.timeline_id, e - ); - None - } - }; - (sync_id, extra_step) + .map(|(sync_id, task)| { + let storage = Arc::clone(&storage); + let index = index.clone(); + async move { + let state_update = + process_sync_task(conf, storage, index, max_sync_errors, sync_id, task) + .instrument(info_span!("process_sync_tasks", sync_id = %sync_id)) + .await; + (sync_id, state_update) + } }) .collect::>(); @@ -550,45 +643,86 @@ async fn loop_step< ZTenantId, HashMap, > = HashMap::with_capacity(max_concurrent_sync); - while let Some((sync_id, state_update)) = task_batch.next().await { + while let Some((sync_id, state_update)) = sync_results.next().await { debug!("Finished storage sync task for sync id {}", sync_id); if let Some(state_update) = state_update { - let ZTenantTimelineId { - tenant_id, - timeline_id, - } = sync_id; new_timeline_states - .entry(tenant_id) + .entry(sync_id.tenant_id) .or_default() - .insert(timeline_id, state_update); + .insert(sync_id.timeline_id, state_update); } } - LoopStep::SyncStatusUpdates(new_timeline_states) + ControlFlow::Continue(new_timeline_states) } -async fn process_task< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( +async fn process_sync_task( conf: &'static PageServerConf, - remote_assets: Arc<(S, RemoteIndex)>, - task: SyncTask, + storage: Arc, + index: RemoteIndex, max_sync_errors: NonZeroU32, -) -> Option { - if task.retries > max_sync_errors.get() { - error!( - "Evicting task {:?} that failed {} times, exceeding the error threshold", - task.kind, task.retries - ); - FATAL_TASK_FAILURES.inc(); - // FIXME (rodionov) this can potentially leave holes in timeline uploads - // planneed to be fixed as part of https://github.com/zenithdb/zenith/issues/977 - return None; - } + sync_id: ZTenantTimelineId, + task: SyncTask, +) -> Option +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let sync_start = Instant::now(); + let current_remote_timeline = { index.read().await.timeline_entry(&sync_id).cloned() }; - if task.retries > 0 { - let seconds_to_wait = 2.0_f64.powf(task.retries as f64 - 1.0).min(30.0); + let task = match validate_task_retries(sync_id, task, max_sync_errors) { + ControlFlow::Continue(task) => task, + ControlFlow::Break(aborted_task) => { + match aborted_task { + SyncTask::Download(_) => { + index + .write() + .await + .set_awaits_download(&sync_id, false) + .ok(); + } + SyncTask::Upload(failed_upload_data) => { + if let Err(e) = update_remote_data( + conf, + storage.as_ref(), + &index, + sync_id, + &failed_upload_data.data, + true, + ) + .await + { + error!("Failed to update remote timeline {}: {:?}", sync_id, e); + } + } + SyncTask::DownloadAndUpload(_, failed_upload_data) => { + index + .write() + .await + .set_awaits_download(&sync_id, false) + .ok(); + if let Err(e) = update_remote_data( + conf, + storage.as_ref(), + &index, + sync_id, + &failed_upload_data.data, + true, + ) + .await + { + error!("Failed to update remote timeline {}: {:?}", sync_id, e); + } + } + } + return None; + } + }; + + let current_task_attempt = task.retries(); + if current_task_attempt > 0 { + let seconds_to_wait = 2.0_f64.powf(current_task_attempt as f64 - 1.0).min(30.0); debug!( "Waiting {} seconds before starting the task", seconds_to_wait @@ -596,64 +730,372 @@ async fn process_task< 
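        // With this formula the delay grows as 1 s, 2 s, 4 s, ... per failed attempt, capped at 30 s.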
tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; } - let remote_index = &remote_assets.1; - - let sync_start = Instant::now(); - let sync_name = task.kind.sync_name(); - match task.kind { - SyncKind::Download(download_data) => { - let download_result = download_timeline( + let task_name = task.name(); + match task { + SyncTask::Download(new_download_data) => { + download_timeline( conf, - remote_assets.clone(), - task.sync_id, - download_data, - task.retries + 1, + (storage.as_ref(), &index), + current_remote_timeline.as_ref(), + sync_id, + new_download_data, + sync_start, + task_name, ) - .await; - - match download_result { - DownloadedTimeline::Abort => { - register_sync_status(sync_start, sync_name, None); - remote_index - .write() - .await - .set_awaits_download(&task.sync_id, false) - .expect("timeline should be present in remote index"); - None - } - DownloadedTimeline::FailedAndRescheduled => { - register_sync_status(sync_start, sync_name, Some(false)); - None - } - DownloadedTimeline::Successful => { - register_sync_status(sync_start, sync_name, Some(true)); - remote_index - .write() - .await - .set_awaits_download(&task.sync_id, false) - .expect("timeline should be present in remote index"); - Some(TimelineSyncStatusUpdate::Downloaded) - } - } + .await } - SyncKind::Upload(layer_upload) => { - let sync_status = upload_timeline_checkpoint( + SyncTask::Upload(new_upload_data) => { + upload_timeline( conf, - remote_assets, - task.sync_id, - layer_upload, - task.retries + 1, + (storage.as_ref(), &index), + current_remote_timeline.as_ref(), + sync_id, + new_upload_data, + sync_start, + task_name, ) .await; - register_sync_status(sync_start, sync_name, sync_status); None } + SyncTask::DownloadAndUpload(new_download_data, new_upload_data) => { + let status_update = download_timeline( + conf, + (storage.as_ref(), &index), + current_remote_timeline.as_ref(), + sync_id, + new_download_data, + sync_start, + task_name, + ) + .await; + + upload_timeline( + conf, + (storage.as_ref(), &index), + current_remote_timeline.as_ref(), + sync_id, + new_upload_data, + sync_start, + task_name, + ) + .await; + + status_update + } } } +async fn download_timeline( + conf: &'static PageServerConf, + (storage, index): (&S, &RemoteIndex), + current_remote_timeline: Option<&RemoteTimeline>, + sync_id: ZTenantTimelineId, + new_download_data: SyncData, + sync_start: Instant, + task_name: &str, +) -> Option +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + match download_timeline_layers(storage, current_remote_timeline, sync_id, new_download_data) + .await + { + DownloadedTimeline::Abort => { + register_sync_status(sync_start, task_name, None); + if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) { + error!( + "Timeline {} was expected to be in the remote index after a download attempt, but it's absent: {:?}", + sync_id, e + ); + } + None + } + DownloadedTimeline::FailedAndRescheduled => { + register_sync_status(sync_start, task_name, Some(false)); + None + } + DownloadedTimeline::Successful(mut download_data) => { + match update_local_metadata(conf, sync_id, current_remote_timeline).await { + Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) { + Ok(()) => { + register_sync_status(sync_start, task_name, Some(true)); + Some(TimelineSyncStatusUpdate::Downloaded) + } + Err(e) => { + error!( + "Timeline {} was expected to be in the remote index after a sucessful download, but it's absent: {:?}", + sync_id, e + ); 
+ None + } + }, + Err(e) => { + error!("Failed to update local timeline metadata: {:?}", e); + download_data.retries += 1; + sync_queue::push(sync_id, SyncTask::Download(download_data)); + register_sync_status(sync_start, task_name, Some(false)); + None + } + } + } + } +} + +async fn update_local_metadata( + conf: &'static PageServerConf, + sync_id: ZTenantTimelineId, + remote_timeline: Option<&RemoteTimeline>, +) -> anyhow::Result<()> { + let remote_metadata = match remote_timeline { + Some(timeline) => &timeline.metadata, + None => { + info!("No remote timeline to update local metadata from, skipping the update"); + return Ok(()); + } + }; + let remote_lsn = remote_metadata.disk_consistent_lsn(); + + let local_metadata_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id); + let local_lsn = if local_metadata_path.exists() { + let local_metadata = read_metadata_file(&local_metadata_path) + .await + .with_context(|| { + format!( + "Failed to load local metadata from path '{}'", + local_metadata_path.display() + ) + })?; + + Some(local_metadata.disk_consistent_lsn()) + } else { + None + }; + + if local_lsn < Some(remote_lsn) { + info!( + "Updating local timeline metadata from remote timeline: local disk_consistent_lsn={:?}, remote disk_consistent_lsn={}", + local_lsn, remote_lsn + ); + + let remote_metadata_bytes = remote_metadata + .to_bytes() + .context("Failed to serialize remote metadata to bytes")?; + fs::write(&local_metadata_path, &remote_metadata_bytes) + .await + .with_context(|| { + format!( + "Failed to write remote metadata bytes locally to path '{}'", + local_metadata_path.display() + ) + })?; + } else { + info!("Local metadata at path '{}' has later disk consistent Lsn ({:?}) than the remote one ({}), skipping the update", local_metadata_path.display(), local_lsn, remote_lsn); + } + + Ok(()) +} + +async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result { + TimelineMetadata::from_bytes( + &fs::read(metadata_path) + .await + .context("Failed to read local metadata bytes from fs")?, + ) + .context("Failed to parse metadata bytes") +} + +async fn upload_timeline( + conf: &'static PageServerConf, + (storage, index): (&S, &RemoteIndex), + current_remote_timeline: Option<&RemoteTimeline>, + sync_id: ZTenantTimelineId, + new_upload_data: SyncData, + sync_start: Instant, + task_name: &str, +) where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let mut uploaded_data = + match upload_timeline_layers(storage, current_remote_timeline, sync_id, new_upload_data) + .await + { + UploadedTimeline::FailedAndRescheduled => { + register_sync_status(sync_start, task_name, Some(false)); + return; + } + UploadedTimeline::Successful(upload_data) => upload_data, + UploadedTimeline::SuccessfulAfterLocalFsUpdate(mut outdated_upload_data) => { + let local_metadata_path = + metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id); + let local_metadata = match read_metadata_file(&local_metadata_path).await { + Ok(metadata) => metadata, + Err(e) => { + error!( + "Failed to load local metadata from path '{}': {:?}", + local_metadata_path.display(), + e + ); + outdated_upload_data.retries += 1; + sync_queue::push(sync_id, SyncTask::Upload(outdated_upload_data)); + register_sync_status(sync_start, task_name, Some(false)); + return; + } + }; + + outdated_upload_data.data.metadata = local_metadata; + outdated_upload_data + } + }; + + match update_remote_data(conf, storage, index, sync_id, &uploaded_data.data, false).await { + Ok(()) => 
register_sync_status(sync_start, task_name, Some(true)), + Err(e) => { + error!("Failed to update remote timeline {}: {:?}", sync_id, e); + uploaded_data.retries += 1; + sync_queue::push(sync_id, SyncTask::Upload(uploaded_data)); + register_sync_status(sync_start, task_name, Some(false)); + } + } +} + +async fn update_remote_data( + conf: &'static PageServerConf, + storage: &S, + index: &RemoteIndex, + sync_id: ZTenantTimelineId, + uploaded_data: &TimelineUpload, + upload_failed: bool, +) -> anyhow::Result<()> +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let updated_remote_timeline = { + let mut index_accessor = index.write().await; + + match index_accessor.timeline_entry_mut(&sync_id) { + Some(existing_entry) => { + if existing_entry.metadata.disk_consistent_lsn() + < uploaded_data.metadata.disk_consistent_lsn() + { + existing_entry.metadata = uploaded_data.metadata.clone(); + } + if upload_failed { + existing_entry + .add_upload_failures(uploaded_data.layers_to_upload.iter().cloned()); + } else { + existing_entry + .add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned()); + } + existing_entry.clone() + } + None => { + let mut new_remote_timeline = RemoteTimeline::new(uploaded_data.metadata.clone()); + if upload_failed { + new_remote_timeline + .add_upload_failures(uploaded_data.layers_to_upload.iter().cloned()); + } else { + new_remote_timeline + .add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned()); + } + + index_accessor.add_timeline_entry(sync_id, new_remote_timeline.clone()); + new_remote_timeline + } + } + }; + + let timeline_path = conf.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id); + let new_index_part = + IndexPart::from_remote_timeline(&timeline_path, updated_remote_timeline) + .context("Failed to create an index part from the updated remote timeline")?; + + upload_index_part(conf, storage, sync_id, new_index_part) + .await + .context("Failed to upload new index part") +} + +fn validate_task_retries( + sync_id: ZTenantTimelineId, + task: SyncTask, + max_sync_errors: NonZeroU32, +) -> ControlFlow { + let max_sync_errors = max_sync_errors.get(); + let mut skip_upload = false; + let mut skip_download = false; + + match &task { + SyncTask::Download(download_data) | SyncTask::DownloadAndUpload(download_data, _) + if download_data.retries > max_sync_errors => + { + error!( + "Evicting download task for timeline {} that failed {} times, exceeding the error threshold {}", + sync_id, download_data.retries, max_sync_errors + ); + skip_download = true; + } + SyncTask::Upload(upload_data) | SyncTask::DownloadAndUpload(_, upload_data) + if upload_data.retries > max_sync_errors => + { + error!( + "Evicting upload task for timeline {} that failed {} times, exceeding the error threshold {}", + sync_id, upload_data.retries, max_sync_errors + ); + skip_upload = true; + } + _ => {} + } + + match task { + aborted_task @ SyncTask::Download(_) if skip_download => ControlFlow::Break(aborted_task), + aborted_task @ SyncTask::Upload(_) if skip_upload => ControlFlow::Break(aborted_task), + aborted_task @ SyncTask::DownloadAndUpload(_, _) if skip_upload && skip_download => { + ControlFlow::Break(aborted_task) + } + SyncTask::DownloadAndUpload(download_task, _) if skip_upload => { + ControlFlow::Continue(SyncTask::Download(download_task)) + } + SyncTask::DownloadAndUpload(_, upload_task) if skip_download => { + ControlFlow::Continue(SyncTask::Upload(upload_task)) + } + not_skipped => ControlFlow::Continue(not_skipped), + } +} 
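As a compact illustration of the retry handling above (the backoff sleep in `process_sync_task` plus the eviction check in `validate_task_retries`), the sketch below folds both into one standalone helper. The helper and its name are assumptions made for illustration; the formula mirrors the patch: exponential backoff capped at 30 seconds, and eviction once the retry counter exceeds `max_sync_errors`.

```rust
use std::num::NonZeroU32;
use std::time::Duration;

#[derive(Debug)]
enum RetryDecision {
    /// The task failed too many times and is dropped from the sync queue.
    Evict,
    /// The task is retried after waiting for the given backoff delay.
    RetryAfter(Duration),
}

fn retry_decision(retries: u32, max_sync_errors: NonZeroU32) -> RetryDecision {
    if retries > max_sync_errors.get() {
        return RetryDecision::Evict;
    }
    // First attempt (retries == 0) runs immediately; afterwards wait 2^(retries - 1) s, capped at 30 s.
    let seconds_to_wait = if retries > 0 {
        2.0_f64.powf(retries as f64 - 1.0).min(30.0)
    } else {
        0.0
    };
    RetryDecision::RetryAfter(Duration::from_secs_f64(seconds_to_wait))
}

fn main() {
    let max_sync_errors = NonZeroU32::new(10).unwrap();
    // Prints delays growing as 0s, 1s, 2s, 4s, 8s, 16s, 30s, ... until attempt 11 is evicted.
    for retries in 0..=11 {
        println!("{} -> {:?}", retries, retry_decision(retries, max_sync_errors));
    }
}
```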
+ +async fn try_fetch_index_parts( + conf: &'static PageServerConf, + storage: &S, + keys: HashSet, +) -> HashMap +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let mut index_parts = HashMap::with_capacity(keys.len()); + + let mut part_downloads = keys + .into_iter() + .map(|id| async move { (id, download_index_part(conf, storage, id).await) }) + .collect::>(); + + while let Some((id, part_upload_result)) = part_downloads.next().await { + match part_upload_result { + Ok(index_part) => { + debug!("Successfully fetched index part for {}", id); + index_parts.insert(id, index_part); + } + Err(e) => warn!("Failed to fetch index part for {}: {:?}", id, e), + } + } + + index_parts +} + fn schedule_first_sync_tasks( index: &mut RemoteTimelineIndex, - local_timeline_files: HashMap)>, + local_timeline_files: HashMap)>, ) -> LocalTimelineInitStatuses { let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); @@ -661,71 +1103,66 @@ fn schedule_first_sync_tasks( VecDeque::with_capacity(local_timeline_files.len().max(local_timeline_files.len())); for (sync_id, (local_metadata, local_files)) in local_timeline_files { - let ZTenantTimelineId { - tenant_id, - timeline_id, - } = sync_id; match index.timeline_entry_mut(&sync_id) { - Some(index_entry) => { + Some(remote_timeline) => { let (timeline_status, awaits_download) = compare_local_and_remote_timeline( &mut new_sync_tasks, sync_id, local_metadata, local_files, - index_entry, + remote_timeline, ); let was_there = local_timeline_init_statuses - .entry(tenant_id) + .entry(sync_id.tenant_id) .or_default() - .insert(timeline_id, timeline_status); + .insert(sync_id.timeline_id, timeline_status); if was_there.is_some() { // defensive check warn!( "Overwriting timeline init sync status. Status {:?} Timeline {}", - timeline_status, timeline_id + timeline_status, sync_id.timeline_id ); } - index_entry.set_awaits_download(awaits_download); + remote_timeline.awaits_download = awaits_download; } None => { // TODO (rodionov) does this mean that we've crashed during tenant creation? // is it safe to upload this checkpoint? could it be half broken? 
- new_sync_tasks.push_back(SyncTask::new( + new_sync_tasks.push_back(( sync_id, - 0, - SyncKind::Upload(NewCheckpoint { - layers: local_files, + SyncTask::upload(TimelineUpload { + layers_to_upload: local_files, + uploaded_layers: HashSet::new(), metadata: local_metadata, }), )); local_timeline_init_statuses - .entry(tenant_id) + .entry(sync_id.tenant_id) .or_default() - .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete); + .insert( + sync_id.timeline_id, + LocalTimelineInitStatus::LocallyComplete, + ); } } } - new_sync_tasks.into_iter().for_each(|task| { - sync_queue::push(task); + new_sync_tasks.into_iter().for_each(|(sync_id, task)| { + sync_queue::push(sync_id, task); }); local_timeline_init_statuses } fn compare_local_and_remote_timeline( - new_sync_tasks: &mut VecDeque, + new_sync_tasks: &mut VecDeque<(ZTenantTimelineId, SyncTask)>, sync_id: ZTenantTimelineId, local_metadata: TimelineMetadata, - local_files: Vec, - remote_entry: &TimelineIndexEntry, + local_files: HashSet, + remote_entry: &RemoteTimeline, ) -> (LocalTimelineInitStatus, bool) { - let local_lsn = local_metadata.disk_consistent_lsn(); - let uploads = remote_entry.uploaded_checkpoints(); + let remote_files = remote_entry.stored_files(); - let mut initial_timeline_status = LocalTimelineInitStatus::LocallyComplete; - - let mut awaits_download = false; // TODO probably here we need more sophisticated logic, // if more data is available remotely can we just download whats there? // without trying to upload something. It may be tricky, needs further investigation. @@ -734,38 +1171,37 @@ fn compare_local_and_remote_timeline( // (upload needs to be only for previously unsynced files, not whole timeline dir). // If one of the tasks fails they will be reordered in the queue which can lead // to timeline being stuck in evicted state - if !uploads.contains(&local_lsn) { - new_sync_tasks.push_back(SyncTask::new( + let number_of_layers_to_download = remote_files.difference(&local_files).count(); + let (initial_timeline_status, awaits_download) = if number_of_layers_to_download > 0 { + new_sync_tasks.push_back(( sync_id, - 0, - SyncKind::Upload(NewCheckpoint { - layers: local_files.clone(), + SyncTask::download(TimelineDownload { + layers_to_skip: local_files.clone(), + }), + )); + (LocalTimelineInitStatus::NeedsSync, true) + // we do not need to manupulate with remote consistent lsn here + // because it will be updated when sync will be completed + } else { + (LocalTimelineInitStatus::LocallyComplete, false) + }; + + let layers_to_upload = local_files + .difference(remote_files) + .cloned() + .collect::>(); + if !layers_to_upload.is_empty() { + new_sync_tasks.push_back(( + sync_id, + SyncTask::upload(TimelineUpload { + layers_to_upload, + uploaded_layers: HashSet::new(), metadata: local_metadata, }), )); - // Note that status here doesnt change. + // Note that status here doesn't change. 
} - let uploads_count = uploads.len(); - let archives_to_skip: BTreeSet = uploads - .into_iter() - .filter(|upload_lsn| upload_lsn <= &local_lsn) - .map(ArchiveId) - .collect(); - if archives_to_skip.len() != uploads_count { - new_sync_tasks.push_back(SyncTask::new( - sync_id, - 0, - SyncKind::Download(TimelineDownload { - files_to_skip: Arc::new(local_files.into_iter().collect()), - archives_to_skip, - }), - )); - initial_timeline_status = LocalTimelineInitStatus::NeedsSync; - awaits_download = true; - // we do not need to manupulate with remote consistent lsn here - // because it will be updated when sync will be completed - } (initial_timeline_status, awaits_download) } @@ -780,322 +1216,44 @@ fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Optio .observe(secs_elapsed) } -async fn fetch_full_index< - P: Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - (storage, index): &(S, RemoteIndex), - timeline_dir: &Path, - id: ZTenantTimelineId, -) -> anyhow::Result { - let index_read = index.read().await; - let full_index = match index_read.timeline_entry(&id).map(|e| e.inner()) { - None => bail!("Timeline not found for sync id {}", id), - Some(TimelineIndexEntryInner::Full(_)) => { - bail!("Index is already populated for sync id {}", id) - } - Some(TimelineIndexEntryInner::Description(description)) => { - let mut archive_header_downloads = FuturesUnordered::new(); - for (archive_id, description) in description { - archive_header_downloads.push(async move { - let header = download_archive_header(storage, timeline_dir, description) - .await - .map_err(|e| (e, archive_id))?; - Ok((archive_id, description.header_size, header)) - }); - } - - let mut full_index = RemoteTimeline::empty(); - while let Some(header_data) = archive_header_downloads.next().await { - match header_data { - Ok((archive_id, header_size, header)) => full_index.update_archive_contents(archive_id.0, header, header_size), - Err((e, archive_id)) => bail!( - "Failed to download archive header for tenant {}, timeline {}, archive for Lsn {}: {}", - id.tenant_id, id.timeline_id, archive_id.0, - e - ), - } - } - full_index - } - }; - drop(index_read); // tokio rw lock is not upgradeable - index - .write() - .await - .upgrade_timeline_entry(&id, full_index.clone()) - .context("cannot upgrade timeline entry in remote index")?; - Ok(full_index) -} - -async fn download_archive_header< - P: Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - storage: &S, - timeline_dir: &Path, - description: &ArchiveDescription, -) -> anyhow::Result { - let mut header_buf = std::io::Cursor::new(Vec::new()); - let remote_path = storage.storage_path(&timeline_dir.join(&description.archive_name))?; - storage - .download_range( - &remote_path, - 0, - Some(description.header_size), - &mut header_buf, - ) - .await?; - let header_buf = header_buf.into_inner(); - let header = read_archive_header(&description.archive_name, &mut header_buf.as_slice()).await?; - Ok(header) -} - #[cfg(test)] mod test_utils { - use std::{ - collections::{BTreeMap, BTreeSet}, - fs, - }; - - use super::*; - use crate::{ - layered_repository::metadata::metadata_path, remote_storage::local_fs::LocalFs, - repository::repo_harness::RepoHarness, - }; use zenith_utils::lsn::Lsn; - #[track_caller] - pub async fn ensure_correct_timeline_upload( + use crate::repository::repo_harness::RepoHarness; + + use super::*; + + pub async fn create_local_timeline( harness: &RepoHarness<'_>, - remote_assets: Arc<(LocalFs, RemoteIndex)>, - 
timeline_id: ZTimelineId, - new_upload: NewCheckpoint, - ) { - let sync_id = ZTenantTimelineId::new(harness.tenant_id, timeline_id); - upload_timeline_checkpoint( - harness.conf, - Arc::clone(&remote_assets), - sync_id, - new_upload.clone(), - 0, - ) - .await; - - let (storage, index) = remote_assets.as_ref(); - assert_index_descriptions( - index, - &RemoteIndex::try_parse_descriptions_from_paths( - harness.conf, - remote_assets - .0 - .list() - .await - .unwrap() - .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), - ), - ) - .await; - - let new_remote_timeline = expect_timeline(index, sync_id).await; - let new_remote_lsn = new_remote_timeline - .checkpoints() - .max() - .expect("Remote timeline should have an lsn after reupload"); - let upload_lsn = new_upload.metadata.disk_consistent_lsn(); - assert!( - new_remote_lsn >= upload_lsn, - "Remote timeline after upload should have the biggest Lsn out of all uploads" - ); - assert!( - new_remote_timeline.contains_checkpoint_at(upload_lsn), - "Should contain upload lsn among the remote ones" - ); - - let remote_files_after_upload = new_remote_timeline - .stored_files(&harness.conf.timeline_path(&timeline_id, &harness.tenant_id)); - for new_uploaded_layer in &new_upload.layers { - assert!( - remote_files_after_upload.contains(new_uploaded_layer), - "Remote files do not contain layer that should be uploaded: '{}'", - new_uploaded_layer.display() - ); - } - - assert_timeline_files_match(harness, timeline_id, new_remote_timeline); - } - - pub async fn expect_timeline( - index: &RemoteIndex, - sync_id: ZTenantTimelineId, - ) -> RemoteTimeline { - if let Some(TimelineIndexEntryInner::Full(remote_timeline)) = index - .read() - .await - .timeline_entry(&sync_id) - .map(|e| e.inner()) - { - remote_timeline.clone() - } else { - panic!( - "Expect to have a full remote timeline in the index for sync id {}", - sync_id - ) - } - } - - #[track_caller] - pub async fn assert_index_descriptions( - index: &RemoteIndex, - expected_index_with_descriptions: &RemoteIndex, - ) { - let expected_index_with_descriptions = expected_index_with_descriptions.read().await; - - let index_read = index.read().await; - let actual_sync_ids = index_read.all_sync_ids().collect::>(); - let expected_sync_ids = expected_index_with_descriptions - .all_sync_ids() - .collect::>(); - assert_eq!( - actual_sync_ids, expected_sync_ids, - "Index contains unexpected sync ids" - ); - - let mut actual_timeline_entries = BTreeMap::new(); - let mut expected_timeline_entries = BTreeMap::new(); - for sync_id in actual_sync_ids { - actual_timeline_entries.insert( - sync_id, - index_read.timeline_entry(&sync_id).unwrap().clone(), - ); - expected_timeline_entries.insert( - sync_id, - expected_index_with_descriptions - .timeline_entry(&sync_id) - .unwrap() - .clone(), - ); - } - drop(index_read); - - for (sync_id, actual_timeline_entry) in actual_timeline_entries { - let expected_timeline_description = expected_timeline_entries - .remove(&sync_id) - .unwrap_or_else(|| { - panic!( - "Failed to find an expected timeline with id {} in the index", - sync_id - ) - }); - let expected_timeline_description = match expected_timeline_description.inner() { - TimelineIndexEntryInner::Description(description) => description, - TimelineIndexEntryInner::Full(_) => panic!("Expected index entry for sync id {} is a full entry, while a description was expected", sync_id), - }; - - match actual_timeline_entry.inner() { - TimelineIndexEntryInner::Description(description) => { - assert_eq!( - 
description, expected_timeline_description, - "Index contains unexpected descriptions entry for sync id {}", - sync_id - ) - } - TimelineIndexEntryInner::Full(remote_timeline) => { - let expected_lsns = expected_timeline_description - .values() - .map(|description| description.disk_consistent_lsn) - .collect::>(); - assert_eq!( - remote_timeline.checkpoints().collect::>(), - expected_lsns, - "Timeline {} should have the same checkpoints uploaded", - sync_id, - ) - } - } - } - } - - pub fn assert_timeline_files_match( - harness: &RepoHarness, - remote_timeline_id: ZTimelineId, - remote_timeline: RemoteTimeline, - ) { - let local_timeline_dir = harness.timeline_path(&remote_timeline_id); - let local_paths = fs::read_dir(&local_timeline_dir) - .unwrap() - .map(|dir| dir.unwrap().path()) - .collect::>(); - let mut reported_remote_files = remote_timeline.stored_files(&local_timeline_dir); - let local_metadata_path = - metadata_path(harness.conf, remote_timeline_id, harness.tenant_id); - let local_metadata = TimelineMetadata::from_bytes( - &fs::read(&local_metadata_path) - .expect("Failed to read metadata file when comparing remote and local image files"), - ) - .expect( - "Failed to parse metadata file contents when comparing remote and local image files", - ); - assert!( - remote_timeline.contains_checkpoint_at(local_metadata.disk_consistent_lsn()), - "Should contain local lsn among the remote ones after the upload" - ); - reported_remote_files.insert(local_metadata_path); - - assert_eq!( - local_paths, reported_remote_files, - "Remote image files and local image files are different, missing locally: {:?}, missing remotely: {:?}", - reported_remote_files.difference(&local_paths).collect::>(), - local_paths.difference(&reported_remote_files).collect::>(), - ); - - if let Some(remote_file) = reported_remote_files.iter().next() { - let actual_remote_paths = fs::read_dir( - remote_file - .parent() - .expect("Remote files are expected to have their timeline dir as parent"), - ) - .unwrap() - .map(|dir| dir.unwrap().path()) - .collect::>(); - - let unreported_remote_files = actual_remote_paths - .difference(&reported_remote_files) - .collect::>(); - assert!( - unreported_remote_files.is_empty(), - "Unexpected extra remote files that were not listed: {:?}", - unreported_remote_files - ) - } - } - - pub fn create_local_timeline( - harness: &RepoHarness, timeline_id: ZTimelineId, filenames: &[&str], metadata: TimelineMetadata, - ) -> anyhow::Result { + ) -> anyhow::Result { let timeline_path = harness.timeline_path(&timeline_id); - fs::create_dir_all(&timeline_path)?; + fs::create_dir_all(&timeline_path).await?; - let mut layers = Vec::with_capacity(filenames.len()); + let mut layers_to_upload = HashSet::with_capacity(filenames.len()); for &file in filenames { let file_path = timeline_path.join(file); - fs::write(&file_path, dummy_contents(file).into_bytes())?; - layers.push(file_path); + fs::write(&file_path, dummy_contents(file).into_bytes()).await?; + layers_to_upload.insert(file_path); } fs::write( metadata_path(harness.conf, timeline_id, harness.tenant_id), metadata.to_bytes()?, - )?; + ) + .await?; - Ok(NewCheckpoint { layers, metadata }) + Ok(TimelineUpload { + layers_to_upload, + uploaded_layers: HashSet::new(), + metadata, + }) } - fn dummy_contents(name: &str) -> String { + pub fn dummy_contents(name: &str) -> String { format!("contents for {}", name) } @@ -1103,3 +1261,367 @@ mod test_utils { TimelineMetadata::new(disk_consistent_lsn, None, None, Lsn(0), Lsn(0), Lsn(0)) } } + +#[cfg(test)] 
+mod tests { + use std::collections::BTreeSet; + + use super::{test_utils::dummy_metadata, *}; + use zenith_utils::lsn::Lsn; + + #[test] + fn download_sync_tasks_merge() { + let download_1 = SyncTask::Download(SyncData::new( + 2, + TimelineDownload { + layers_to_skip: HashSet::from([PathBuf::from("one")]), + }, + )); + let download_2 = SyncTask::Download(SyncData::new( + 6, + TimelineDownload { + layers_to_skip: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), + }, + )); + + let merged_download = match download_1.merge(download_2) { + SyncTask::Download(merged_download) => merged_download, + wrong_merge_result => panic!("Unexpected merge result: {:?}", wrong_merge_result), + }; + + assert_eq!( + merged_download.retries, 0, + "Merged task should have its retries counter reset" + ); + + assert_eq!( + merged_download + .data + .layers_to_skip + .into_iter() + .collect::>(), + BTreeSet::from([ + PathBuf::from("one"), + PathBuf::from("two"), + PathBuf::from("three") + ]), + "Merged download tasks should a combined set of layers to skip" + ); + } + + #[test] + fn upload_sync_tasks_merge() { + let metadata_1 = dummy_metadata(Lsn(1)); + let metadata_2 = dummy_metadata(Lsn(2)); + assert!(metadata_2.disk_consistent_lsn() > metadata_1.disk_consistent_lsn()); + + let upload_1 = SyncTask::Upload(SyncData::new( + 2, + TimelineUpload { + layers_to_upload: HashSet::from([PathBuf::from("one")]), + uploaded_layers: HashSet::from([PathBuf::from("u_one")]), + metadata: metadata_1, + }, + )); + let upload_2 = SyncTask::Upload(SyncData::new( + 6, + TimelineUpload { + layers_to_upload: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), + uploaded_layers: HashSet::from([PathBuf::from("u_two")]), + metadata: metadata_2.clone(), + }, + )); + + let merged_upload = match upload_1.merge(upload_2) { + SyncTask::Upload(merged_upload) => merged_upload, + wrong_merge_result => panic!("Unexpected merge result: {:?}", wrong_merge_result), + }; + + assert_eq!( + merged_upload.retries, 0, + "Merged task should have its retries counter reset" + ); + + let upload = merged_upload.data; + assert_eq!( + upload.layers_to_upload.into_iter().collect::>(), + BTreeSet::from([ + PathBuf::from("one"), + PathBuf::from("two"), + PathBuf::from("three") + ]), + "Merged upload tasks should a combined set of layers to upload" + ); + + assert_eq!( + upload.uploaded_layers.into_iter().collect::>(), + BTreeSet::from([PathBuf::from("u_one"), PathBuf::from("u_two"),]), + "Merged upload tasks should a combined set of uploaded layers" + ); + + assert_eq!( + upload.metadata, metadata_2, + "Merged upload tasks should have a metadata with biggest disk_consistent_lsn" + ); + } + + #[test] + fn upload_and_download_sync_tasks_merge() { + let download_data = SyncData::new( + 3, + TimelineDownload { + layers_to_skip: HashSet::from([PathBuf::from("d_one")]), + }, + ); + + let upload_data = SyncData::new( + 2, + TimelineUpload { + layers_to_upload: HashSet::from([PathBuf::from("u_one")]), + uploaded_layers: HashSet::from([PathBuf::from("u_one_2")]), + metadata: dummy_metadata(Lsn(1)), + }, + ); + + let (merged_download, merged_upload) = match SyncTask::Download(download_data.clone()) + .merge(SyncTask::Upload(upload_data.clone())) + { + SyncTask::DownloadAndUpload(merged_download, merged_upload) => { + (merged_download, merged_upload) + } + wrong_merge_result => panic!("Unexpected merge result: {:?}", wrong_merge_result), + }; + + assert_eq!( + merged_download, download_data, + "When upload and dowload are merged, both should be 
unchanged" + ); + assert_eq!( + merged_upload, upload_data, + "When upload and dowload are merged, both should be unchanged" + ); + } + + #[test] + fn uploaddownload_and_upload_sync_tasks_merge() { + let download_data = SyncData::new( + 3, + TimelineDownload { + layers_to_skip: HashSet::from([PathBuf::from("d_one")]), + }, + ); + + let metadata_1 = dummy_metadata(Lsn(5)); + let metadata_2 = dummy_metadata(Lsn(2)); + assert!(metadata_1.disk_consistent_lsn() > metadata_2.disk_consistent_lsn()); + + let upload_download = SyncTask::DownloadAndUpload( + download_data.clone(), + SyncData::new( + 2, + TimelineUpload { + layers_to_upload: HashSet::from([PathBuf::from("one")]), + uploaded_layers: HashSet::from([PathBuf::from("u_one")]), + metadata: metadata_1.clone(), + }, + ), + ); + + let new_upload = SyncTask::Upload(SyncData::new( + 6, + TimelineUpload { + layers_to_upload: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), + uploaded_layers: HashSet::from([PathBuf::from("u_two")]), + metadata: metadata_2, + }, + )); + + let (merged_download, merged_upload) = match upload_download.merge(new_upload) { + SyncTask::DownloadAndUpload(merged_download, merged_upload) => { + (merged_download, merged_upload) + } + wrong_merge_result => panic!("Unexpected merge result: {:?}", wrong_merge_result), + }; + + assert_eq!( + merged_download, download_data, + "When uploaddowload and upload tasks are merged, download should be unchanged" + ); + + assert_eq!( + merged_upload.retries, 0, + "Merged task should have its retries counter reset" + ); + let upload = merged_upload.data; + assert_eq!( + upload.layers_to_upload.into_iter().collect::>(), + BTreeSet::from([ + PathBuf::from("one"), + PathBuf::from("two"), + PathBuf::from("three") + ]), + "Merged upload tasks should a combined set of layers to upload" + ); + + assert_eq!( + upload.uploaded_layers.into_iter().collect::>(), + BTreeSet::from([PathBuf::from("u_one"), PathBuf::from("u_two"),]), + "Merged upload tasks should a combined set of uploaded layers" + ); + + assert_eq!( + upload.metadata, metadata_1, + "Merged upload tasks should have a metadata with biggest disk_consistent_lsn" + ); + } + + #[test] + fn uploaddownload_and_download_sync_tasks_merge() { + let upload_data = SyncData::new( + 22, + TimelineUpload { + layers_to_upload: HashSet::from([PathBuf::from("one")]), + uploaded_layers: HashSet::from([PathBuf::from("u_one")]), + metadata: dummy_metadata(Lsn(22)), + }, + ); + + let upload_download = SyncTask::DownloadAndUpload( + SyncData::new( + 2, + TimelineDownload { + layers_to_skip: HashSet::from([PathBuf::from("one")]), + }, + ), + upload_data.clone(), + ); + + let new_download = SyncTask::Download(SyncData::new( + 6, + TimelineDownload { + layers_to_skip: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), + }, + )); + + let (merged_download, merged_upload) = match upload_download.merge(new_download) { + SyncTask::DownloadAndUpload(merged_download, merged_upload) => { + (merged_download, merged_upload) + } + wrong_merge_result => panic!("Unexpected merge result: {:?}", wrong_merge_result), + }; + + assert_eq!( + merged_upload, upload_data, + "When uploaddowload and download tasks are merged, upload should be unchanged" + ); + + assert_eq!( + merged_download.retries, 0, + "Merged task should have its retries counter reset" + ); + assert_eq!( + merged_download + .data + .layers_to_skip + .into_iter() + .collect::>(), + BTreeSet::from([ + PathBuf::from("one"), + PathBuf::from("two"), + PathBuf::from("three") + ]), + "Merged 
download tasks should a combined set of layers to skip" + ); + } + + #[test] + fn uploaddownload_sync_tasks_merge() { + let metadata_1 = dummy_metadata(Lsn(1)); + let metadata_2 = dummy_metadata(Lsn(2)); + assert!(metadata_2.disk_consistent_lsn() > metadata_1.disk_consistent_lsn()); + + let upload_download = SyncTask::DownloadAndUpload( + SyncData::new( + 2, + TimelineDownload { + layers_to_skip: HashSet::from([PathBuf::from("one")]), + }, + ), + SyncData::new( + 2, + TimelineUpload { + layers_to_upload: HashSet::from([PathBuf::from("one")]), + uploaded_layers: HashSet::from([PathBuf::from("u_one")]), + metadata: metadata_1, + }, + ), + ); + let new_upload_download = SyncTask::DownloadAndUpload( + SyncData::new( + 6, + TimelineDownload { + layers_to_skip: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), + }, + ), + SyncData::new( + 6, + TimelineUpload { + layers_to_upload: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), + uploaded_layers: HashSet::from([PathBuf::from("u_two")]), + metadata: metadata_2.clone(), + }, + ), + ); + + let (merged_download, merged_upload) = match upload_download.merge(new_upload_download) { + SyncTask::DownloadAndUpload(merged_download, merged_upload) => { + (merged_download, merged_upload) + } + wrong_merge_result => panic!("Unexpected merge result: {:?}", wrong_merge_result), + }; + + assert_eq!( + merged_download.retries, 0, + "Merged task should have its retries counter reset" + ); + assert_eq!( + merged_download + .data + .layers_to_skip + .into_iter() + .collect::>(), + BTreeSet::from([ + PathBuf::from("one"), + PathBuf::from("two"), + PathBuf::from("three") + ]), + "Merged download tasks should a combined set of layers to skip" + ); + + assert_eq!( + merged_upload.retries, 0, + "Merged task should have its retries counter reset" + ); + let upload = merged_upload.data; + assert_eq!( + upload.layers_to_upload.into_iter().collect::>(), + BTreeSet::from([ + PathBuf::from("one"), + PathBuf::from("two"), + PathBuf::from("three") + ]), + "Merged upload tasks should a combined set of layers to upload" + ); + + assert_eq!( + upload.uploaded_layers.into_iter().collect::>(), + BTreeSet::from([PathBuf::from("u_one"), PathBuf::from("u_two"),]), + "Merged upload tasks should a combined set of uploaded layers" + ); + + assert_eq!( + upload.metadata, metadata_2, + "Merged upload tasks should have a metadata with biggest disk_consistent_lsn" + ); + } +} diff --git a/pageserver/src/remote_storage/storage_sync/compression.rs b/pageserver/src/remote_storage/storage_sync/compression.rs deleted file mode 100644 index 511f79e0cf..0000000000 --- a/pageserver/src/remote_storage/storage_sync/compression.rs +++ /dev/null @@ -1,612 +0,0 @@ -//! A set of structs to represent a compressed part of the timeline, and methods to asynchronously compress and uncompress a stream of data, -//! without holding the entire data in memory. -//! For the latter, both compress and uncompress functions operate buffered streams (currently hardcoded size of [`ARCHIVE_STREAM_BUFFER_SIZE_BYTES`]), -//! not attempting to hold the entire archive in memory. -//! -//! The compression is done with zstd streaming algorithm via the `async-compression` crate. -//! The crate does not contain any knobs to tweak the compression, but otherwise is one of the only ones that's both async and has an API to manage the part of an archive. -//! Zstd was picked as the best algorithm among the ones available in the crate, after testing the initial timeline file compression. -//! -//! 
Archiving is almost agnostic to timeline file types, with an exception of the metadata file, that's currently distinguished in the [un]compression code. -//! The metadata file is treated separately when [de]compression is involved, to reduce the risk of corrupting the metadata file. -//! When compressed, the metadata file is always required and stored as the last file in the archive stream. -//! When uncompressed, the metadata file gets naturally uncompressed last, to ensure that all other layer files are decompressed successfully first. -//! -//! Archive structure: -//! +----------------------------------------+ -//! | header | file_1, ..., file_k, metadata | -//! +----------------------------------------+ -//! -//! The archive consists of two separate zstd archives: -//! * header archive, that contains all files names and their sizes and relative paths in the timeline directory -//! Header is a Rust structure, serialized into bytes and compressed with zstd. -//! * files archive, that has metadata file as the last one, all compressed with zstd into a single binary blob -//! -//! Header offset is stored in the file name, along with the `disk_consistent_lsn` from the metadata file. -//! See [`parse_archive_name`] and [`ARCHIVE_EXTENSION`] for the name details, example: `00000000016B9150-.zst_9732`. -//! This way, the header could be retrieved without reading an entire archive file. - -use std::{ - collections::BTreeSet, - future::Future, - io::Cursor, - path::{Path, PathBuf}, - sync::Arc, -}; - -use anyhow::{bail, ensure, Context}; -use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder}; -use serde::{Deserialize, Serialize}; -use tokio::{ - fs, - io::{self, AsyncReadExt, AsyncWriteExt}, -}; -use tracing::*; -use zenith_utils::{bin_ser::BeSer, lsn::Lsn}; - -use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}; - -use super::index::RelativePath; - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct ArchiveHeader { - /// All regular timeline files, excluding the metadata file. - pub files: Vec, - // Metadata file name is known to the system, as its location relative to the timeline dir, - // so no need to store anything but its size in bytes. - pub metadata_file_size: u64, -} - -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)] -pub struct FileEntry { - /// Uncompressed file size, bytes. - pub size: u64, - /// A path, relative to the directory root, used when compressing the directory contents. - pub subpath: RelativePath, -} - -const ARCHIVE_EXTENSION: &str = "-.zst_"; -const ARCHIVE_STREAM_BUFFER_SIZE_BYTES: usize = 4 * 1024 * 1024; - -/// Streams an archive of files given into a stream target, defined by the closure. -/// -/// The closure approach is picked for cases like S3, where we would need a name of the file before we can get a stream to write the bytes into. -/// Current idea is to place the header size in the name of the file, to enable the fast partial remote file index restoration without actually reading remote storage file contents. -/// -/// Performs the compression in multiple steps: -/// * prepares an archive header, stripping the `source_dir` prefix from the `files` -/// * generates the name of the archive -/// * prepares archive producer future, knowing the header and the file list -/// An `impl AsyncRead` and `impl AsyncWrite` pair of connected streams is created to implement the partial contents streaming. 
-/// The writer end gets into the archive producer future, to put the header and a stream of compressed files. -/// * prepares archive consumer future, by executing the provided closure -/// The closure gets the reader end stream and the name of the file to create a future that would stream the file contents elsewhere. -/// * runs and waits for both futures to complete -/// * on a successful completion of both futures, header, its size and the user-defined consumer future return data is returned -/// Due to the design above, the archive name and related data is visible inside the consumer future only, so it's possible to return the data, -/// needed for future processing. -pub async fn archive_files_as_stream( - source_dir: &Path, - files: impl Iterator, - metadata: &TimelineMetadata, - create_archive_consumer: Cons, -) -> anyhow::Result<(ArchiveHeader, u64, ConsRet)> -where - Cons: FnOnce(Box, String) -> Fut - + Send - + 'static, - Fut: Future> + Send + 'static, - ConsRet: Send + Sync + 'static, -{ - let metadata_bytes = metadata - .to_bytes() - .context("Failed to create metadata bytes")?; - let (archive_header, compressed_header_bytes) = - prepare_header(source_dir, files, &metadata_bytes) - .await - .context("Failed to prepare file for archivation")?; - - let header_size = compressed_header_bytes.len() as u64; - let (write, read) = io::duplex(ARCHIVE_STREAM_BUFFER_SIZE_BYTES); - let archive_filler = write_archive_contents( - source_dir.to_path_buf(), - archive_header.clone(), - metadata_bytes, - write, - ); - let archive_name = archive_name(metadata.disk_consistent_lsn(), header_size); - let archive_stream = - Cursor::new(compressed_header_bytes).chain(ZstdEncoder::new(io::BufReader::new(read))); - - let (archive_creation_result, archive_upload_result) = tokio::join!( - tokio::spawn(archive_filler), - tokio::spawn(async move { - create_archive_consumer(Box::new(archive_stream), archive_name).await - }) - ); - archive_creation_result - .context("Failed to spawn archive creation future")? - .context("Failed to create an archive")?; - let upload_return_value = archive_upload_result - .context("Failed to spawn archive upload future")? - .context("Failed to upload the archive")?; - - Ok((archive_header, header_size, upload_return_value)) -} - -/// Similar to [`archive_files_as_stream`], creates a pair of streams to uncompress the 2nd part of the archive, -/// that contains files and is located after the header. -/// S3 allows downloading partial file contents for a given file key (i.e. name), to accommodate this retrieval, -/// a closure is used. -/// Same concepts with two concurrent futures, user-defined closure, future and return value apply here, but the -/// consumer and the receiver ends are swapped, since the uncompression happens. 
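// Illustrative sketch (not from the patch): the deleted compression code above wires
// a producer and a consumer together through an in-memory pipe and drives both sides
// concurrently. The same pattern, reduced to its essentials with plain tokio:
use tokio::io::{self, AsyncReadExt, AsyncWriteExt};

async fn produce_and_consume() -> anyhow::Result<Vec<u8>> {
    // Bytes written to one half become readable from the other; 64 KiB buffer.
    let (mut write, mut read) = io::duplex(64 * 1024);

    let producer = tokio::spawn(async move {
        write.write_all(b"streamed archive bytes").await?;
        write.shutdown().await?; // signal EOF to the consumer
        Ok::<_, std::io::Error>(())
    });
    let consumer = tokio::spawn(async move {
        let mut received = Vec::new();
        read.read_to_end(&mut received).await?;
        Ok::<_, std::io::Error>(received)
    });

    let (produced, consumed) = tokio::join!(producer, consumer);
    produced??;
    Ok(consumed??)
}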
-pub async fn uncompress_file_stream_with_index( - destination_dir: PathBuf, - files_to_skip: Arc>, - disk_consistent_lsn: Lsn, - header: ArchiveHeader, - header_size: u64, - create_archive_file_part: Prod, -) -> anyhow::Result -where - Prod: FnOnce(Box, String) -> Fut - + Send - + 'static, - Fut: Future> + Send + 'static, - ProdRet: Send + Sync + 'static, -{ - let (write, mut read) = io::duplex(ARCHIVE_STREAM_BUFFER_SIZE_BYTES); - let archive_name = archive_name(disk_consistent_lsn, header_size); - - let (archive_download_result, archive_uncompress_result) = tokio::join!( - tokio::spawn(async move { create_archive_file_part(Box::new(write), archive_name).await }), - tokio::spawn(async move { - uncompress_with_header(&files_to_skip, &destination_dir, header, &mut read).await - }) - ); - - let download_value = archive_download_result - .context("Failed to spawn archive download future")? - .context("Failed to download an archive")?; - archive_uncompress_result - .context("Failed to spawn archive uncompress future")? - .context("Failed to uncompress the archive")?; - - Ok(download_value) -} - -/// Reads archive header from the stream given: -/// * parses the file name to get the header size -/// * reads the exact amount of bytes -/// * uncompresses and deserializes those -pub async fn read_archive_header( - archive_name: &str, - from: &mut A, -) -> anyhow::Result { - let (_, header_size) = parse_archive_name(Path::new(archive_name))?; - - let mut compressed_header_bytes = vec![0; header_size as usize]; - from.read_exact(&mut compressed_header_bytes) - .await - .with_context(|| { - format!( - "Failed to read header header from the archive {}", - archive_name - ) - })?; - - let mut header_bytes = Vec::new(); - ZstdDecoder::new(io::BufReader::new(compressed_header_bytes.as_slice())) - .read_to_end(&mut header_bytes) - .await - .context("Failed to decompress a header from the archive")?; - - ArchiveHeader::des(&header_bytes).context("Failed to deserialize a header from the archive") -} - -/// Reads the archive metadata out of the archive name: -/// * `disk_consistent_lsn` of the checkpoint that was archived -/// * size of the archive header -pub fn parse_archive_name(archive_path: &Path) -> anyhow::Result<(Lsn, u64)> { - let archive_name = archive_path - .file_name() - .with_context(|| format!("Archive '{}' has no file name", archive_path.display()))? 
- .to_string_lossy(); - let (lsn_str, header_size_str) = - archive_name - .rsplit_once(ARCHIVE_EXTENSION) - .with_context(|| { - format!( - "Archive '{}' has incorrect extension, expected to contain '{}'", - archive_path.display(), - ARCHIVE_EXTENSION - ) - })?; - let disk_consistent_lsn = Lsn::from_hex(lsn_str).with_context(|| { - format!( - "Archive '{}' has an invalid disk consistent lsn in its extension", - archive_path.display(), - ) - })?; - let header_size = header_size_str.parse::().with_context(|| { - format!( - "Archive '{}' has an invalid a header offset number in its extension", - archive_path.display(), - ) - })?; - Ok((disk_consistent_lsn, header_size)) -} - -fn archive_name(disk_consistent_lsn: Lsn, header_size: u64) -> String { - let archive_name = format!( - "{:016X}{ARCHIVE_EXTENSION}{}", - u64::from(disk_consistent_lsn), - header_size, - ARCHIVE_EXTENSION = ARCHIVE_EXTENSION, - ); - archive_name -} - -pub async fn uncompress_with_header( - files_to_skip: &BTreeSet, - destination_dir: &Path, - header: ArchiveHeader, - archive_after_header: impl io::AsyncRead + Send + Sync + Unpin, -) -> anyhow::Result<()> { - debug!("Uncompressing archive into {}", destination_dir.display()); - let mut archive = ZstdDecoder::new(io::BufReader::new(archive_after_header)); - - if !destination_dir.exists() { - fs::create_dir_all(&destination_dir) - .await - .with_context(|| { - format!( - "Failed to create target directory at {}", - destination_dir.display() - ) - })?; - } else if !destination_dir.is_dir() { - bail!( - "Destination path '{}' is not a valid directory", - destination_dir.display() - ); - } - debug!("Will extract {} files from the archive", header.files.len()); - for entry in header.files { - uncompress_entry( - &mut archive, - &entry.subpath.as_path(destination_dir), - entry.size, - files_to_skip, - ) - .await - .with_context(|| format!("Failed to uncompress archive entry {:?}", entry))?; - } - uncompress_entry( - &mut archive, - &destination_dir.join(METADATA_FILE_NAME), - header.metadata_file_size, - files_to_skip, - ) - .await - .context("Failed to uncompress the metadata entry")?; - Ok(()) -} - -async fn uncompress_entry( - archive: &mut ZstdDecoder>, - destination_path: &Path, - entry_size: u64, - files_to_skip: &BTreeSet, -) -> anyhow::Result<()> { - if let Some(parent) = destination_path.parent() { - fs::create_dir_all(parent).await.with_context(|| { - format!( - "Failed to create parent directory for {}", - destination_path.display() - ) - })?; - }; - - if files_to_skip.contains(destination_path) { - debug!("Skipping {}", destination_path.display()); - copy_n_bytes(entry_size, archive, &mut io::sink()) - .await - .context("Failed to skip bytes in the archive")?; - return Ok(()); - } - - let mut destination = - io::BufWriter::new(fs::File::create(&destination_path).await.with_context(|| { - format!( - "Failed to open file {} for extraction", - destination_path.display() - ) - })?); - copy_n_bytes(entry_size, archive, &mut destination) - .await - .with_context(|| { - format!( - "Failed to write extracted archive contents into file {}", - destination_path.display() - ) - })?; - destination - .flush() - .await - .context("Failed to flush the streaming archive bytes")?; - Ok(()) -} - -async fn write_archive_contents( - source_dir: PathBuf, - header: ArchiveHeader, - metadata_bytes: Vec, - mut archive_input: io::DuplexStream, -) -> anyhow::Result<()> { - debug!("Starting writing files into archive"); - for file_entry in header.files { - let path = 
file_entry.subpath.as_path(&source_dir); - let mut source_file = - io::BufReader::new(fs::File::open(&path).await.with_context(|| { - format!( - "Failed to open file for archiving to path {}", - path.display() - ) - })?); - let bytes_written = io::copy(&mut source_file, &mut archive_input) - .await - .with_context(|| { - format!( - "Failed to open add a file into archive, file path {}", - path.display() - ) - })?; - ensure!( - file_entry.size == bytes_written, - "File {} was written to the archive incompletely", - path.display() - ); - trace!( - "Added file '{}' ({} bytes) into the archive", - path.display(), - bytes_written - ); - } - let metadata_bytes_written = io::copy(&mut metadata_bytes.as_slice(), &mut archive_input) - .await - .context("Failed to add metadata into the archive")?; - ensure!( - header.metadata_file_size == metadata_bytes_written, - "Metadata file was written to the archive incompletely", - ); - - archive_input - .shutdown() - .await - .context("Failed to finalize the archive")?; - debug!("Successfully streamed all files into the archive"); - Ok(()) -} - -async fn prepare_header( - source_dir: &Path, - files: impl Iterator, - metadata_bytes: &[u8], -) -> anyhow::Result<(ArchiveHeader, Vec)> { - let mut archive_files = Vec::new(); - for file_path in files { - let file_metadata = fs::metadata(file_path).await.with_context(|| { - format!( - "Failed to read metadata during archive indexing for {}", - file_path.display() - ) - })?; - ensure!( - file_metadata.is_file(), - "Archive indexed path {} is not a file", - file_path.display() - ); - - if file_path.file_name().and_then(|name| name.to_str()) != Some(METADATA_FILE_NAME) { - let entry = FileEntry { - subpath: RelativePath::new(source_dir, file_path).with_context(|| { - format!( - "File '{}' does not belong to pageserver workspace", - file_path.display() - ) - })?, - size: file_metadata.len(), - }; - archive_files.push(entry); - } - } - - let header = ArchiveHeader { - files: archive_files, - metadata_file_size: metadata_bytes.len() as u64, - }; - - debug!("Appending a header for {} files", header.files.len()); - let header_bytes = header.ser().context("Failed to serialize a header")?; - debug!("Header bytes len {}", header_bytes.len()); - let mut compressed_header_bytes = Vec::new(); - ZstdEncoder::new(io::BufReader::new(header_bytes.as_slice())) - .read_to_end(&mut compressed_header_bytes) - .await - .context("Failed to compress header bytes")?; - debug!( - "Compressed header bytes len {}", - compressed_header_bytes.len() - ); - Ok((header, compressed_header_bytes)) -} - -async fn copy_n_bytes( - n: u64, - from: &mut (impl io::AsyncRead + Send + Sync + Unpin), - into: &mut (impl io::AsyncWrite + Send + Sync + Unpin), -) -> anyhow::Result<()> { - let bytes_written = io::copy(&mut from.take(n), into).await?; - ensure!( - bytes_written == n, - "Failed to read exactly {} bytes from the input, bytes written: {}", - n, - bytes_written, - ); - Ok(()) -} - -#[cfg(test)] -mod tests { - use tokio::{fs, io::AsyncSeekExt}; - - use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID}; - - use super::*; - - #[tokio::test] - async fn compress_and_uncompress() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("compress_and_uncompress")?; - let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID); - init_directory( - &timeline_dir, - vec![ - ("first", "first_contents"), - ("second", "second_contents"), - (METADATA_FILE_NAME, "wrong_metadata"), - ], - ) - .await?; - let timeline_files = 
list_file_paths_with_contents(&timeline_dir).await?; - assert_eq!( - timeline_files, - vec![ - ( - timeline_dir.join("first"), - FileContents::Text("first_contents".to_string()) - ), - ( - timeline_dir.join(METADATA_FILE_NAME), - FileContents::Text("wrong_metadata".to_string()) - ), - ( - timeline_dir.join("second"), - FileContents::Text("second_contents".to_string()) - ), - ], - "Initial timeline contents should contain two normal files and a wrong metadata file" - ); - - let metadata = TimelineMetadata::new(Lsn(0x30), None, None, Lsn(0), Lsn(0), Lsn(0)); - let paths_to_archive = timeline_files - .into_iter() - .map(|(path, _)| path) - .collect::>(); - - let tempdir = tempfile::tempdir()?; - let base_path = tempdir.path().to_path_buf(); - let (header, header_size, archive_target) = archive_files_as_stream( - &timeline_dir, - paths_to_archive.iter(), - &metadata, - move |mut archive_streamer, archive_name| async move { - let archive_target = base_path.join(&archive_name); - let mut archive_file = fs::File::create(&archive_target).await?; - io::copy(&mut archive_streamer, &mut archive_file).await?; - Ok(archive_target) - }, - ) - .await?; - - let mut file = fs::File::open(&archive_target).await?; - file.seek(io::SeekFrom::Start(header_size)).await?; - let target_dir = tempdir.path().join("extracted"); - uncompress_with_header(&BTreeSet::new(), &target_dir, header, file).await?; - - let extracted_files = list_file_paths_with_contents(&target_dir).await?; - - assert_eq!( - extracted_files, - vec![ - ( - target_dir.join("first"), - FileContents::Text("first_contents".to_string()) - ), - ( - target_dir.join(METADATA_FILE_NAME), - FileContents::Binary(metadata.to_bytes()?) - ), - ( - target_dir.join("second"), - FileContents::Text("second_contents".to_string()) - ), - ], - "Extracted files should contain all local timeline files besides its metadata, which should be taken from the arguments" - ); - - Ok(()) - } - - async fn init_directory( - root: &Path, - files_with_contents: Vec<(&str, &str)>, - ) -> anyhow::Result<()> { - fs::create_dir_all(root).await?; - for (file_name, contents) in files_with_contents { - fs::File::create(root.join(file_name)) - .await? - .write_all(contents.as_bytes()) - .await?; - } - Ok(()) - } - - #[derive(PartialEq, Eq, PartialOrd, Ord)] - enum FileContents { - Text(String), - Binary(Vec), - } - - impl std::fmt::Debug for FileContents { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Text(text) => f.debug_tuple("Text").field(text).finish(), - Self::Binary(bytes) => f - .debug_tuple("Binary") - .field(&format!("{} bytes", bytes.len())) - .finish(), - } - } - } - - async fn list_file_paths_with_contents( - root: &Path, - ) -> anyhow::Result> { - let mut file_paths = Vec::new(); - - let mut dir_listings = vec![fs::read_dir(root).await?]; - while let Some(mut dir_listing) = dir_listings.pop() { - while let Some(entry) = dir_listing.next_entry().await? { - let entry_path = entry.path(); - if entry_path.is_file() { - let contents = match String::from_utf8(fs::read(&entry_path).await?) 
{ - Ok(text) => FileContents::Text(text), - Err(e) => FileContents::Binary(e.into_bytes()), - }; - file_paths.push((entry_path, contents)); - } else if entry_path.is_dir() { - dir_listings.push(fs::read_dir(entry_path).await?); - } else { - info!( - "Skipping path '{}' as it's not a file or a directory", - entry_path.display() - ); - } - } - } - - file_paths.sort(); - Ok(file_paths) - } -} diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/remote_storage/storage_sync/download.rs index e5aa74452b..81ed649c8a 100644 --- a/pageserver/src/remote_storage/storage_sync/download.rs +++ b/pageserver/src/remote_storage/storage_sync/download.rs @@ -1,30 +1,76 @@ -//! Timeline synchrnonization logic to put files from archives on remote storage into pageserver's local directory. +//! Timeline synchrnonization logic to fetch the layer files from remote storage into pageserver's local directory. -use std::{collections::BTreeSet, path::PathBuf, sync::Arc}; +use std::fmt::Debug; -use anyhow::{ensure, Context}; +use anyhow::Context; +use futures::stream::{FuturesUnordered, StreamExt}; use tokio::fs; use tracing::{debug, error, trace, warn}; -use zenith_utils::zid::ZTenantId; use crate::{ config::PageServerConf, - layered_repository::metadata::{metadata_path, TimelineMetadata}, + layered_repository::metadata::metadata_path, remote_storage::{ - storage_sync::{ - compression, fetch_full_index, index::TimelineIndexEntryInner, sync_queue, SyncKind, - SyncTask, - }, + storage_sync::{sync_queue, SyncTask}, RemoteStorage, ZTenantTimelineId, }, }; use super::{ - index::{ArchiveId, RemoteTimeline}, - RemoteIndex, TimelineDownload, + index::{IndexPart, RemoteTimeline}, + SyncData, TimelineDownload, }; +/// Retrieves index data from the remote storage for a given timeline. +pub async fn download_index_part( + conf: &'static PageServerConf, + storage: &S, + sync_id: ZTenantTimelineId, +) -> anyhow::Result +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) + .with_file_name(IndexPart::FILE_NAME) + .with_extension(IndexPart::FILE_EXTENSION); + let part_storage_path = storage.storage_path(&index_part_path).with_context(|| { + format!( + "Failed to get the index part storage path for local path '{}'", + index_part_path.display() + ) + })?; + let mut index_part_bytes = Vec::new(); + storage + .download(&part_storage_path, &mut index_part_bytes) + .await + .with_context(|| { + format!( + "Failed to download an index part from storage path '{:?}'", + part_storage_path + ) + })?; + + let index_part: IndexPart = serde_json::from_slice(&index_part_bytes).with_context(|| { + format!( + "Failed to deserialize index part file from storage path '{:?}'", + part_storage_path + ) + })?; + + let missing_files = index_part.missing_files(); + if !missing_files.is_empty() { + warn!( + "Found missing layers in index part for timeline {}: {:?}", + sync_id, missing_files + ); + } + + Ok(index_part) +} + /// Timeline download result, with extra data, needed for downloading. +#[derive(Debug)] pub(super) enum DownloadedTimeline { /// Remote timeline data is either absent or corrupt, no download possible. Abort, @@ -33,222 +79,136 @@ pub(super) enum DownloadedTimeline { FailedAndRescheduled, /// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known. /// Initial download successful. 
- Successful, + Successful(SyncData), } -/// Attempts to download and uncompress files from all remote archives for the timeline given. +/// Attempts to download all given timeline's layers. /// Timeline files that already exist locally are skipped during the download, but the local metadata file is -/// updated in the end of every checkpoint archive extraction. +/// updated in the end, if the remote one contains a newer disk_consistent_lsn. /// -/// On an error, bumps the retries count and reschedules the download, with updated archive skip list -/// (for any new successful archive downloads and extractions). -pub(super) async fn download_timeline< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - conf: &'static PageServerConf, - remote_assets: Arc<(S, RemoteIndex)>, +/// On an error, bumps the retries count and updates the files to skip with successful downloads, rescheduling the task. +pub(super) async fn download_timeline_layers<'a, P, S>( + storage: &'a S, + remote_timeline: Option<&'a RemoteTimeline>, sync_id: ZTenantTimelineId, - mut download: TimelineDownload, - retries: u32, -) -> DownloadedTimeline { - debug!("Downloading layers for sync id {}", sync_id); - - let ZTenantTimelineId { - tenant_id, - timeline_id, - } = sync_id; - let index = &remote_assets.1; - - let index_read = index.read().await; - let remote_timeline = match index_read.timeline_entry(&sync_id) { + mut download_data: SyncData, +) -> DownloadedTimeline +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let remote_timeline = match remote_timeline { + Some(remote_timeline) => { + if !remote_timeline.awaits_download { + error!("Timeline with sync id {} is not awaiting download", sync_id); + return DownloadedTimeline::Abort; + } + remote_timeline + } None => { - error!("Cannot download: no timeline is present in the index for given id"); - drop(index_read); + error!( + "Timeline with sync id {} is not present in the remote index", + sync_id + ); return DownloadedTimeline::Abort; } - - Some(index_entry) => match index_entry.inner() { - TimelineIndexEntryInner::Full(remote_timeline) => { - let cloned = remote_timeline.clone(); - drop(index_read); - cloned - } - TimelineIndexEntryInner::Description(_) => { - // we do not check here for awaits_download because it is ok - // to call this function while the download is in progress - // so it is not a concurrent download, it is the same one - - let remote_disk_consistent_lsn = index_entry.disk_consistent_lsn(); - drop(index_read); - debug!("Found timeline description for the given ids, downloading the full index"); - match fetch_full_index( - remote_assets.as_ref(), - &conf.timeline_path(&timeline_id, &tenant_id), - sync_id, - ) - .await - { - Ok(remote_timeline) => remote_timeline, - Err(e) => { - error!("Failed to download full timeline index: {:?}", e); - - return match remote_disk_consistent_lsn { - Some(_) => { - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Download(download), - )); - DownloadedTimeline::FailedAndRescheduled - } - None => { - error!("Cannot download: no disk consistent Lsn is present for the index entry"); - DownloadedTimeline::Abort - } - }; - } - } - } - }, - }; - if remote_timeline.checkpoints().max().is_none() { - debug!("Cannot download: no disk consistent Lsn is present for the remote timeline"); - return DownloadedTimeline::Abort; }; - debug!("Downloading timeline archives"); - let archives_to_download = remote_timeline - .checkpoints() - 
.map(ArchiveId) - .filter(|remote_archive| !download.archives_to_skip.contains(remote_archive)) + debug!("Downloading timeline layers for sync id {}", sync_id); + let download = &mut download_data.data; + + let layers_to_download = remote_timeline + .stored_files() + .difference(&download.layers_to_skip) + .cloned() .collect::>(); - let archives_total = archives_to_download.len(); - debug!("Downloading {} archives of a timeline", archives_total); - trace!("Archives to download: {:?}", archives_to_download); + trace!("Layers to download: {:?}", layers_to_download); - for (archives_downloaded, archive_id) in archives_to_download.into_iter().enumerate() { - match try_download_archive( - conf, - sync_id, - Arc::clone(&remote_assets), - &remote_timeline, - archive_id, - Arc::clone(&download.files_to_skip), - ) - .await - { - Err(e) => { - let archives_left = archives_total - archives_downloaded; - error!( - "Failed to download archive {:?} (archives downloaded: {}; archives left: {}) for tenant {} timeline {}, requeueing the download: {:?}", - archive_id, archives_downloaded, archives_left, tenant_id, timeline_id, e + let mut download_tasks = layers_to_download + .into_iter() + .map(|layer_desination_path| async move { + if layer_desination_path.exists() { + debug!( + "Layer already exists locally, skipping download: {}", + layer_desination_path.display() ); - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Download(download), - )); - return DownloadedTimeline::FailedAndRescheduled; + } else { + let layer_storage_path = storage + .storage_path(&layer_desination_path) + .with_context(|| { + format!( + "Failed to get the layer storage path for local path '{}'", + layer_desination_path.display() + ) + })?; + + let mut destination_file = fs::File::create(&layer_desination_path) + .await + .with_context(|| { + format!( + "Failed to create a destination file for layer '{}'", + layer_desination_path.display() + ) + })?; + + storage + .download(&layer_storage_path, &mut destination_file) + .await + .with_context(|| { + format!( + "Failed to download a layer from storage path '{:?}'", + layer_storage_path + ) + })?; } - Ok(()) => { - debug!("Successfully downloaded archive {:?}", archive_id); - download.archives_to_skip.insert(archive_id); + Ok::<_, anyhow::Error>(layer_desination_path) + }) + .collect::>(); + + debug!("Downloading {} layers of a timeline", download_tasks.len()); + + let mut errors_happened = false; + while let Some(download_result) = download_tasks.next().await { + match download_result { + Ok(downloaded_path) => { + download.layers_to_skip.insert(downloaded_path); + } + Err(e) => { + errors_happened = true; + error!( + "Failed to download a layer for timeline {}: {:?}", + sync_id, e + ); } } } - debug!("Finished downloading all timeline's archives"); - DownloadedTimeline::Successful -} - -async fn try_download_archive< - P: Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - conf: &'static PageServerConf, - ZTenantTimelineId { - tenant_id, - timeline_id, - }: ZTenantTimelineId, - remote_assets: Arc<(S, RemoteIndex)>, - remote_timeline: &RemoteTimeline, - archive_id: ArchiveId, - files_to_skip: Arc>, -) -> anyhow::Result<()> { - debug!("Downloading archive {:?}", archive_id); - let archive_to_download = remote_timeline - .archive_data(archive_id) - .with_context(|| format!("Archive {:?} not found in remote storage", archive_id))?; - let (archive_header, header_size) = remote_timeline - .restore_header(archive_id) - .context("Failed to restore 
header when downloading an archive")?; - - match read_local_metadata(conf, timeline_id, tenant_id).await { - Ok(local_metadata) => ensure!( - // need to allow `<=` instead of `<` due to cases when a failed archive can be redownloaded - local_metadata.disk_consistent_lsn() <= archive_to_download.disk_consistent_lsn(), - "Cannot download archive with Lsn {} since it's earlier than local Lsn {}", - archive_to_download.disk_consistent_lsn(), - local_metadata.disk_consistent_lsn() - ), - Err(e) => warn!("Failed to read local metadata file, assuming it's safe to override its with the download. Read: {:#}", e), + if errors_happened { + debug!("Reenqueuing failed download task for timeline {}", sync_id); + download_data.retries += 1; + sync_queue::push(sync_id, SyncTask::Download(download_data)); + DownloadedTimeline::FailedAndRescheduled + } else { + debug!("Finished downloading all timeline's layers"); + DownloadedTimeline::Successful(download_data) } - compression::uncompress_file_stream_with_index( - conf.timeline_path(&timeline_id, &tenant_id), - files_to_skip, - archive_to_download.disk_consistent_lsn(), - archive_header, - header_size, - move |mut archive_target, archive_name| async move { - let archive_local_path = conf - .timeline_path(&timeline_id, &tenant_id) - .join(&archive_name); - let remote_storage = &remote_assets.0; - remote_storage - .download_range( - &remote_storage.storage_path(&archive_local_path)?, - header_size, - None, - &mut archive_target, - ) - .await - }, - ) - .await?; - - Ok(()) -} - -async fn read_local_metadata( - conf: &'static PageServerConf, - timeline_id: zenith_utils::zid::ZTimelineId, - tenant_id: ZTenantId, -) -> anyhow::Result { - let local_metadata_path = metadata_path(conf, timeline_id, tenant_id); - let local_metadata_bytes = fs::read(&local_metadata_path) - .await - .context("Failed to read local metadata file bytes")?; - TimelineMetadata::from_bytes(&local_metadata_bytes) - .context("Failed to read local metadata files bytes") } #[cfg(test)] mod tests { - use std::collections::BTreeSet; + use std::collections::{BTreeSet, HashSet}; use tempfile::tempdir; - use tokio::fs; use zenith_utils::lsn::Lsn; use crate::{ remote_storage::{ - local_fs::LocalFs, - storage_sync::test_utils::{ - assert_index_descriptions, assert_timeline_files_match, create_local_timeline, - dummy_metadata, ensure_correct_timeline_upload, expect_timeline, + storage_sync::{ + index::RelativePath, + test_utils::{create_local_timeline, dummy_metadata}, }, + LocalFs, }, repository::repo_harness::{RepoHarness, TIMELINE_ID}, }; @@ -256,80 +216,185 @@ mod tests { use super::*; #[tokio::test] - async fn test_download_timeline() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("test_download_timeline")?; - let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?; - let index = RemoteIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - storage - .list() - .await? 
- .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), + async fn download_timeline() -> anyhow::Result<()> { + let harness = RepoHarness::create("download_timeline")?; + let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"]; + let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; + let current_retries = 3; + let metadata = dummy_metadata(Lsn(0x30)); + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + let timeline_upload = + create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; + + for local_path in timeline_upload.layers_to_upload { + let remote_path = storage.storage_path(&local_path)?; + let remote_parent_dir = remote_path.parent().unwrap(); + if !remote_parent_dir.exists() { + fs::create_dir_all(&remote_parent_dir).await?; + } + fs::copy(&local_path, &remote_path).await?; + } + let mut read_dir = fs::read_dir(&local_timeline_path).await?; + while let Some(dir_entry) = read_dir.next_entry().await? { + if dir_entry.file_name().to_str() == Some("layer_to_keep_locally") { + continue; + } else { + fs::remove_file(dir_entry.path()).await?; + } + } + + let mut remote_timeline = RemoteTimeline::new(metadata.clone()); + remote_timeline.awaits_download = true; + remote_timeline.add_timeline_layers( + layer_files + .iter() + .map(|layer| local_timeline_path.join(layer)), ); - let remote_assets = Arc::new((storage, index)); - let storage = &remote_assets.0; - let index = &remote_assets.1; - let regular_timeline_path = repo_harness.timeline_path(&TIMELINE_ID); - let regular_timeline = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &["a", "b"], - dummy_metadata(Lsn(0x30)), - )?; - ensure_correct_timeline_upload( - &repo_harness, - Arc::clone(&remote_assets), - TIMELINE_ID, - regular_timeline, - ) - .await; - // upload multiple checkpoints for the same timeline - let regular_timeline = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &["c", "d"], - dummy_metadata(Lsn(0x40)), - )?; - ensure_correct_timeline_upload( - &repo_harness, - Arc::clone(&remote_assets), - TIMELINE_ID, - regular_timeline, - ) - .await; - - fs::remove_dir_all(®ular_timeline_path).await?; - let remote_regular_timeline = expect_timeline(index, sync_id).await; - - download_timeline( - repo_harness.conf, - Arc::clone(&remote_assets), + let download_data = match download_timeline_layers( + &storage, + Some(&remote_timeline), sync_id, - TimelineDownload { - files_to_skip: Arc::new(BTreeSet::new()), - archives_to_skip: BTreeSet::new(), - }, - 0, + SyncData::new( + current_retries, + TimelineDownload { + layers_to_skip: HashSet::from([local_timeline_path.join("layer_to_skip")]), + }, + ), ) - .await; - assert_index_descriptions( - index, - &RemoteIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - remote_assets - .0 - .list() - .await - .unwrap() - .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), + .await + { + DownloadedTimeline::Successful(data) => data, + wrong_result => panic!( + "Expected a successful download for timeline, but got: {:?}", + wrong_result + ), + }; + + assert_eq!( + current_retries, download_data.retries, + "On successful download, retries are not expected to change" + ); + assert_eq!( + download_data + .data + .layers_to_skip + .into_iter() + .collect::>(), + layer_files + .iter() + .map(|layer| local_timeline_path.join(layer)) + .collect(), + "On successful 
download, layers to skip should contain all downloaded files and present layers that were skipped" + ); + + let mut downloaded_files = BTreeSet::new(); + let mut read_dir = fs::read_dir(&local_timeline_path).await?; + while let Some(dir_entry) = read_dir.next_entry().await? { + downloaded_files.insert(dir_entry.path()); + } + + assert_eq!( + downloaded_files, + layer_files + .iter() + .filter(|layer| layer != &&"layer_to_skip") + .map(|layer| local_timeline_path.join(layer)) + .collect(), + "On successful download, all layers that were not skipped, should be downloaded" + ); + + Ok(()) + } + + #[tokio::test] + async fn download_timeline_negatives() -> anyhow::Result<()> { + let harness = RepoHarness::create("download_timeline_negatives")?; + let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?; + + let empty_remote_timeline_download = download_timeline_layers( + &storage, + None, + sync_id, + SyncData::new( + 0, + TimelineDownload { + layers_to_skip: HashSet::new(), + }, ), ) .await; - assert_timeline_files_match(&repo_harness, TIMELINE_ID, remote_regular_timeline); + assert!( + matches!(empty_remote_timeline_download, DownloadedTimeline::Abort), + "Should not allow downloading for empty remote timeline" + ); + + let not_expecting_download_remote_timeline = RemoteTimeline::new(dummy_metadata(Lsn(5))); + assert!( + !not_expecting_download_remote_timeline.awaits_download, + "Should not expect download for the timeline" + ); + let already_downloading_remote_timeline_download = download_timeline_layers( + &storage, + Some(¬_expecting_download_remote_timeline), + sync_id, + SyncData::new( + 0, + TimelineDownload { + layers_to_skip: HashSet::new(), + }, + ), + ) + .await; + assert!( + matches!( + dbg!(already_downloading_remote_timeline_download), + DownloadedTimeline::Abort, + ), + "Should not allow downloading for remote timeline that does not expect it" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_download_index_part() -> anyhow::Result<()> { + let harness = RepoHarness::create("test_download_index_part")?; + let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + + let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; + let metadata = dummy_metadata(Lsn(0x30)); + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + + let index_part = IndexPart::new( + HashSet::from([ + RelativePath::new(&local_timeline_path, local_timeline_path.join("one"))?, + RelativePath::new(&local_timeline_path, local_timeline_path.join("two"))?, + ]), + HashSet::from([RelativePath::new( + &local_timeline_path, + local_timeline_path.join("three"), + )?]), + metadata.disk_consistent_lsn(), + metadata.to_bytes()?, + ); + + let local_index_part_path = + metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) + .with_file_name(IndexPart::FILE_NAME) + .with_extension(IndexPart::FILE_EXTENSION); + let storage_path = storage.storage_path(&local_index_part_path)?; + fs::create_dir_all(storage_path.parent().unwrap()).await?; + fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?; + + let downloaded_index_part = download_index_part(harness.conf, &storage, sync_id).await?; + + assert_eq!( + downloaded_index_part, index_part, + "Downloaded index part should be the same as the one in storage" + ); Ok(()) } diff --git a/pageserver/src/remote_storage/storage_sync/index.rs b/pageserver/src/remote_storage/storage_sync/index.rs index 
861b78fa3b..918bda1039 100644 --- a/pageserver/src/remote_storage/storage_sync/index.rs +++ b/pageserver/src/remote_storage/storage_sync/index.rs @@ -1,63 +1,56 @@ -//! In-memory index to track the tenant files on the remote strorage, mitigating the storage format differences between the local and remote files. -//! Able to restore itself from the storage archive data and reconstruct archive indices on demand. -//! -//! The index is intended to be portable, so deliberately does not store any local paths inside. -//! This way in the future, the index could be restored fast from its serialized stored form. +//! In-memory index to track the tenant files on the remote storage. +//! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about +//! remote timeline layers and its metadata. use std::{ - collections::{BTreeMap, BTreeSet, HashMap}, + collections::{HashMap, HashSet}, path::{Path, PathBuf}, sync::Arc, }; -use anyhow::{bail, ensure, Context}; +use anyhow::{Context, Ok}; use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use tokio::sync::RwLock; -use tracing::*; -use zenith_utils::{ - lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, -}; use crate::{ - config::PageServerConf, - layered_repository::TIMELINES_SEGMENT_NAME, - remote_storage::{ - storage_sync::compression::{parse_archive_name, FileEntry}, - ZTenantTimelineId, - }, + config::PageServerConf, layered_repository::metadata::TimelineMetadata, + remote_storage::ZTenantTimelineId, }; - -use super::compression::ArchiveHeader; +use zenith_utils::lsn::Lsn; /// A part of the filesystem path, that needs a root to become a path again. #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +#[serde(transparent)] pub struct RelativePath(String); impl RelativePath { /// Attempts to strip off the base from path, producing a relative path or an error. pub fn new>(base: &Path, path: P) -> anyhow::Result { - let relative = path - .as_ref() - .strip_prefix(base) - .context("path is not relative to base")?; + let path = path.as_ref(); + let relative = path.strip_prefix(base).with_context(|| { + format!( + "path '{}' is not relative to base '{}'", + path.display(), + base.display() + ) + })?; Ok(RelativePath(relative.to_string_lossy().to_string())) } /// Joins the relative path with the base path. - pub fn as_path(&self, base: &Path) -> PathBuf { + fn as_path(&self, base: &Path) -> PathBuf { base.join(&self.0) } } /// An index to track tenant files that exist on the remote storage. -/// Currently, timeline archive files are tracked only. #[derive(Debug, Clone)] pub struct RemoteTimelineIndex { - timeline_entries: HashMap, + timeline_entries: HashMap, } -/// A wrapper to synchrnize access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`]. +/// A wrapper to synchronize the access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`]. pub struct RemoteIndex(Arc>); impl RemoteIndex { @@ -67,27 +60,22 @@ impl RemoteIndex { }))) } - /// Attempts to parse file paths (not checking the file contents) and find files - /// that can be tracked wiht the index. - /// On parse falures, logs the error and continues, so empty index can be created from not suitable paths. 
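// Illustrative sketch (not from the patch): RelativePath above stores only the path
// suffix below the timeline directory, which keeps the serialized index portable.
// The round trip is strip_prefix on the way in and join on the way out:
use std::path::{Path, PathBuf};

fn to_relative(base: &Path, path: &Path) -> anyhow::Result<String> {
    let relative = path.strip_prefix(base)?; // errors if `path` is not under `base`
    Ok(relative.to_string_lossy().into_owned())
}

fn to_absolute(base: &Path, relative: &str) -> PathBuf {
    base.join(relative)
}

// e.g. with base "/data/tl": "/data/tl/layer_1" -> "layer_1" -> "/data/tl/layer_1"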
- pub fn try_parse_descriptions_from_paths>( + pub fn from_parts( conf: &'static PageServerConf, - paths: impl Iterator, - ) -> Self { - let mut index = RemoteTimelineIndex { - timeline_entries: HashMap::new(), - }; - for path in paths { - if let Err(e) = try_parse_index_entry(&mut index, conf, path.as_ref()) { - debug!( - "Failed to parse path '{}' as index entry: {:#}", - path.as_ref().display(), - e - ); - } + index_parts: HashMap, + ) -> anyhow::Result { + let mut timeline_entries = HashMap::new(); + + for (sync_id, index_part) in index_parts { + let timeline_path = conf.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id); + let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part) + .context("Failed to restore remote timeline data from index part")?; + timeline_entries.insert(sync_id, remote_timeline); } - Self(Arc::new(RwLock::new(index))) + Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex { + timeline_entries, + })))) } pub async fn read(&self) -> tokio::sync::RwLockReadGuard<'_, RemoteTimelineIndex> { @@ -106,39 +94,18 @@ impl Clone for RemoteIndex { } impl RemoteTimelineIndex { - pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&TimelineIndexEntry> { + pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&RemoteTimeline> { self.timeline_entries.get(id) } - pub fn timeline_entry_mut( - &mut self, - id: &ZTenantTimelineId, - ) -> Option<&mut TimelineIndexEntry> { + pub fn timeline_entry_mut(&mut self, id: &ZTenantTimelineId) -> Option<&mut RemoteTimeline> { self.timeline_entries.get_mut(id) } - pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: TimelineIndexEntry) { + pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: RemoteTimeline) { self.timeline_entries.insert(id, entry); } - pub fn upgrade_timeline_entry( - &mut self, - id: &ZTenantTimelineId, - remote_timeline: RemoteTimeline, - ) -> anyhow::Result<()> { - let mut entry = self.timeline_entries.get_mut(id).ok_or(anyhow::anyhow!( - "timeline is unexpectedly missing from remote index" - ))?; - - if !matches!(entry.inner, TimelineIndexEntryInner::Description(_)) { - anyhow::bail!("timeline entry is not a description entry") - }; - - entry.inner = TimelineIndexEntryInner::Full(remote_timeline); - - Ok(()) - } - pub fn all_sync_ids(&self) -> impl Iterator + '_ { self.timeline_entries.keys().copied() } @@ -150,351 +117,295 @@ impl RemoteTimelineIndex { ) -> anyhow::Result<()> { self.timeline_entry_mut(id) .ok_or_else(|| anyhow::anyhow!("unknown timeline sync {}", id))? 
- .set_awaits_download(awaits_download); + .awaits_download = awaits_download; Ok(()) } } -#[derive(Debug, Clone, PartialEq, Eq, Default)] -pub struct DescriptionTimelineIndexEntry { - pub description: BTreeMap, - pub awaits_download: bool, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct FullTimelineIndexEntry { - pub remote_timeline: RemoteTimeline, - pub awaits_download: bool, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum TimelineIndexEntryInner { - Description(BTreeMap), - Full(RemoteTimeline), -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct TimelineIndexEntry { - inner: TimelineIndexEntryInner, - awaits_download: bool, -} - -impl TimelineIndexEntry { - pub fn new(inner: TimelineIndexEntryInner, awaits_download: bool) -> Self { - Self { - inner, - awaits_download, - } - } - - pub fn inner(&self) -> &TimelineIndexEntryInner { - &self.inner - } - - pub fn inner_mut(&mut self) -> &mut TimelineIndexEntryInner { - &mut self.inner - } - - pub fn uploaded_checkpoints(&self) -> BTreeSet { - match &self.inner { - TimelineIndexEntryInner::Description(description) => { - description.keys().map(|archive_id| archive_id.0).collect() - } - TimelineIndexEntryInner::Full(remote_timeline) => remote_timeline - .checkpoint_archives - .keys() - .map(|archive_id| archive_id.0) - .collect(), - } - } - - /// Gets latest uploaded checkpoint's disk consisten Lsn for the corresponding timeline. - pub fn disk_consistent_lsn(&self) -> Option { - match &self.inner { - TimelineIndexEntryInner::Description(description) => { - description.keys().map(|archive_id| archive_id.0).max() - } - TimelineIndexEntryInner::Full(remote_timeline) => remote_timeline - .checkpoint_archives - .keys() - .map(|archive_id| archive_id.0) - .max(), - } - } - - pub fn get_awaits_download(&self) -> bool { - self.awaits_download - } - - pub fn set_awaits_download(&mut self, awaits_download: bool) { - self.awaits_download = awaits_download; - } -} - -/// Checkpoint archive's id, corresponding to the `disk_consistent_lsn` from the timeline's metadata file during checkpointing. -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] -pub struct ArchiveId(pub(super) Lsn); - -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] -struct FileId(ArchiveId, ArchiveEntryNumber); - -type ArchiveEntryNumber = usize; - -/// All archives and files in them, representing a certain timeline. -/// Uses file and archive IDs to reference those without ownership issues. +/// Restored index part data about the timeline, stored in the remote index. #[derive(Debug, PartialEq, Eq, Clone)] pub struct RemoteTimeline { - timeline_files: BTreeMap, - checkpoint_archives: BTreeMap, -} + timeline_layers: HashSet, + missing_layers: HashSet, -/// Archive metadata, enough to restore a header with the timeline data. 
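// Illustrative sketch (not from the patch): the new index entry above tracks remote
// layers as plain sets of paths instead of per-archive file ids. A stripped-down
// model of that shape, with simplified, hypothetical types:
use std::collections::HashSet;
use std::path::PathBuf;

#[derive(Debug, Default)]
struct TimelineEntrySketch {
    layers: HashSet<PathBuf>,  // layer files known to exist remotely
    missing: HashSet<PathBuf>, // layers whose upload failed and was given up on
    awaits_download: bool,     // set while the timeline still has to be downloaded
}

impl TimelineEntrySketch {
    // Extending a HashSet de-duplicates re-registered layers for free, which is
    // what keeps repeated merges of upload results cheap.
    fn add_layers(&mut self, new_layers: impl IntoIterator<Item = PathBuf>) {
        self.layers.extend(new_layers);
    }
}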
-#[derive(Debug, PartialEq, Eq, Clone)]
-pub struct CheckpointArchive {
-    disk_consistent_lsn: Lsn,
-    metadata_file_size: u64,
-    files: BTreeSet<FileId>,
-    archive_header_size: u64,
-}
-
-impl CheckpointArchive {
-    pub fn disk_consistent_lsn(&self) -> Lsn {
-        self.disk_consistent_lsn
-    }
+    pub metadata: TimelineMetadata,
+    pub awaits_download: bool,
 }
 
 impl RemoteTimeline {
-    pub fn empty() -> Self {
+    pub fn new(metadata: TimelineMetadata) -> Self {
         Self {
-            timeline_files: BTreeMap::new(),
-            checkpoint_archives: BTreeMap::new(),
+            timeline_layers: HashSet::new(),
+            missing_layers: HashSet::new(),
+            metadata,
+            awaits_download: false,
         }
     }
 
-    pub fn checkpoints(&self) -> impl Iterator<Item = Lsn> + '_ {
-        self.checkpoint_archives
-            .values()
-            .map(CheckpointArchive::disk_consistent_lsn)
+    pub fn add_timeline_layers(&mut self, new_layers: impl IntoIterator<Item = PathBuf>) {
+        self.timeline_layers.extend(new_layers.into_iter());
+    }
+
+    pub fn add_upload_failures(&mut self, upload_failures: impl IntoIterator<Item = PathBuf>) {
+        self.missing_layers.extend(upload_failures.into_iter());
     }
 
     /// Lists all layer files in the given remote timeline. Omits the metadata file.
-    pub fn stored_files(&self, timeline_dir: &Path) -> BTreeSet<PathBuf> {
-        self.timeline_files
-            .values()
-            .map(|file_entry| file_entry.subpath.as_path(timeline_dir))
-            .collect()
+    pub fn stored_files(&self) -> &HashSet<PathBuf> {
+        &self.timeline_layers
     }
 
-    pub fn contains_checkpoint_at(&self, disk_consistent_lsn: Lsn) -> bool {
-        self.checkpoint_archives
-            .contains_key(&ArchiveId(disk_consistent_lsn))
+    pub fn from_index_part(timeline_path: &Path, index_part: IndexPart) -> anyhow::Result<Self> {
+        let metadata = TimelineMetadata::from_bytes(&index_part.metadata_bytes)?;
+        Ok(Self {
+            timeline_layers: to_local_paths(timeline_path, index_part.timeline_layers),
+            missing_layers: to_local_paths(timeline_path, index_part.missing_layers),
+            metadata,
+            awaits_download: false,
+        })
     }
+}
 
-    pub fn archive_data(&self, archive_id: ArchiveId) -> Option<&CheckpointArchive> {
-        self.checkpoint_archives.get(&archive_id)
-    }
+/// Part of the remote index, corresponding to a certain timeline.
+/// Contains the data about all files in the timeline, present remotely and its metadata.
+#[serde_as]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
+pub struct IndexPart {
+    timeline_layers: HashSet<RelativePath>,
+    /// Currently not really used in pageserver,
+    /// present to manually keep track of the layer files that pageserver might never retrieve.
+    ///
+    /// Such "holes" might appear if any upload task was evicted on an error threshold:
+    /// this layer will only be rescheduled for upload on pageserver restart.
+    missing_layers: HashSet<RelativePath>,
+    #[serde_as(as = "DisplayFromStr")]
+    disk_consistent_lsn: Lsn,
+    metadata_bytes: Vec<u8>,
+}
 
-    /// Restores a header of a certain remote archive from the memory data.
-    /// Returns the header and its compressed size in the archive, both can be used to uncompress that archive.
- pub fn restore_header(&self, archive_id: ArchiveId) -> anyhow::Result<(ArchiveHeader, u64)> { - let archive = self - .checkpoint_archives - .get(&archive_id) - .with_context(|| format!("Archive {:?} not found", archive_id))?; +impl IndexPart { + pub const FILE_NAME: &'static str = "index_part"; + pub const FILE_EXTENSION: &'static str = "json"; - let mut header_files = Vec::with_capacity(archive.files.len()); - for (expected_archive_position, archive_file) in archive.files.iter().enumerate() { - let &FileId(archive_id, archive_position) = archive_file; - ensure!( - expected_archive_position == archive_position, - "Archive header is corrupt, file # {} from archive {:?} header is missing", - expected_archive_position, - archive_id, - ); - - let timeline_file = self.timeline_files.get(archive_file).with_context(|| { - format!( - "File with id {:?} not found for archive {:?}", - archive_file, archive_id - ) - })?; - header_files.push(timeline_file.clone()); - } - - Ok(( - ArchiveHeader { - files: header_files, - metadata_file_size: archive.metadata_file_size, - }, - archive.archive_header_size, - )) - } - - /// Updates (creates, if necessary) the data about certain archive contents. - pub fn update_archive_contents( - &mut self, + #[cfg(test)] + pub fn new( + timeline_layers: HashSet, + missing_layers: HashSet, disk_consistent_lsn: Lsn, - header: ArchiveHeader, - header_size: u64, - ) { - let archive_id = ArchiveId(disk_consistent_lsn); - let mut common_archive_files = BTreeSet::new(); - for (file_index, file_entry) in header.files.into_iter().enumerate() { - let file_id = FileId(archive_id, file_index); - self.timeline_files.insert(file_id, file_entry); - common_archive_files.insert(file_id); + metadata_bytes: Vec, + ) -> Self { + Self { + timeline_layers, + missing_layers, + disk_consistent_lsn, + metadata_bytes, } + } - let metadata_file_size = header.metadata_file_size; - self.checkpoint_archives - .entry(archive_id) - .or_insert_with(|| CheckpointArchive { - metadata_file_size, - files: BTreeSet::new(), - archive_header_size: header_size, - disk_consistent_lsn, - }) - .files - .extend(common_archive_files.into_iter()); + pub fn missing_files(&self) -> &HashSet { + &self.missing_layers + } + + pub fn from_remote_timeline( + timeline_path: &Path, + remote_timeline: RemoteTimeline, + ) -> anyhow::Result { + let metadata_bytes = remote_timeline.metadata.to_bytes()?; + Ok(Self { + timeline_layers: to_relative_paths(timeline_path, remote_timeline.timeline_layers) + .context("Failed to convert timeline layers' paths to relative ones")?, + missing_layers: to_relative_paths(timeline_path, remote_timeline.missing_layers) + .context("Failed to convert missing layers' paths to relative ones")?, + disk_consistent_lsn: remote_timeline.metadata.disk_consistent_lsn(), + metadata_bytes, + }) } } -/// Metadata abput timeline checkpoint archive, parsed from its remote storage path. 
-#[derive(Debug, Clone, PartialEq, Eq)] -pub struct ArchiveDescription { - pub header_size: u64, - pub disk_consistent_lsn: Lsn, - pub archive_name: String, +fn to_local_paths( + timeline_path: &Path, + paths: impl IntoIterator, +) -> HashSet { + paths + .into_iter() + .map(|path| path.as_path(timeline_path)) + .collect() } -fn try_parse_index_entry( - index: &mut RemoteTimelineIndex, - conf: &'static PageServerConf, - path: &Path, -) -> anyhow::Result<()> { - let tenants_dir = conf.tenants_path(); - let tenant_id = path - .strip_prefix(&tenants_dir) - .with_context(|| { - format!( - "Path '{}' does not belong to tenants directory '{}'", - path.display(), - tenants_dir.display(), - ) - })? - .iter() - .next() - .with_context(|| format!("Found no tenant id in path '{}'", path.display()))? - .to_string_lossy() - .parse::() - .with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?; - - let timelines_path = conf.timelines_path(&tenant_id); - match path.strip_prefix(&timelines_path) { - Ok(timelines_subpath) => { - let mut segments = timelines_subpath.iter(); - let timeline_id = segments - .next() - .with_context(|| { - format!( - "{} directory of tenant {} (path '{}') is not an index entry", - TIMELINES_SEGMENT_NAME, - tenant_id, - path.display() - ) - })? - .to_string_lossy() - .parse::() - .with_context(|| { - format!("Failed to parse timeline id from path '{}'", path.display()) - })?; - - let (disk_consistent_lsn, header_size) = - parse_archive_name(path).with_context(|| { - format!( - "Failed to parse archive name out in path '{}'", - path.display() - ) - })?; - - let archive_name = path - .file_name() - .with_context(|| format!("Archive '{}' has no file name", path.display()))? - .to_string_lossy() - .to_string(); - - let sync_id = ZTenantTimelineId { - tenant_id, - timeline_id, - }; - let timeline_index_entry = index.timeline_entries.entry(sync_id).or_insert_with(|| { - TimelineIndexEntry::new( - TimelineIndexEntryInner::Description(BTreeMap::default()), - false, - ) - }); - match timeline_index_entry.inner_mut() { - TimelineIndexEntryInner::Description(description) => { - description.insert( - ArchiveId(disk_consistent_lsn), - ArchiveDescription { - header_size, - disk_consistent_lsn, - archive_name, - }, - ); - } - TimelineIndexEntryInner::Full(_) => { - bail!("Cannot add parsed archive description to its full context in index with sync id {}", sync_id) - } - } - } - Err(timelines_strip_error) => { - bail!( - "Path '{}' is not an archive entry '{}'", - path.display(), - timelines_strip_error, - ) - } - } - Ok(()) +fn to_relative_paths( + timeline_path: &Path, + paths: impl IntoIterator, +) -> anyhow::Result> { + paths + .into_iter() + .map(|path| RelativePath::new(timeline_path, path)) + .collect() } #[cfg(test)] mod tests { + use std::collections::BTreeSet; + use super::*; + use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID}; #[test] - fn header_restoration_preserves_file_order() { - let header = ArchiveHeader { - files: vec![ - FileEntry { - size: 5, - subpath: RelativePath("one".to_string()), - }, - FileEntry { - size: 1, - subpath: RelativePath("two".to_string()), - }, - FileEntry { - size: 222, - subpath: RelativePath("zero".to_string()), - }, - ], - metadata_file_size: 5, + fn index_part_conversion() { + let harness = RepoHarness::create("index_part_conversion").unwrap(); + let timeline_path = harness.timeline_path(&TIMELINE_ID); + let metadata = + TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); + let 
remote_timeline = RemoteTimeline {
+            timeline_layers: HashSet::from([
+                timeline_path.join("layer_1"),
+                timeline_path.join("layer_2"),
+            ]),
+            missing_layers: HashSet::from([
+                timeline_path.join("missing_1"),
+                timeline_path.join("missing_2"),
+            ]),
+            metadata: metadata.clone(),
+            awaits_download: false,
         };
-        let lsn = Lsn(1);
 
-        let mut remote_timeline = RemoteTimeline::empty();
-        remote_timeline.update_archive_contents(lsn, header.clone(), 15);
-
-        let (restored_header, _) = remote_timeline
-            .restore_header(ArchiveId(lsn))
-            .expect("Should be able to restore header from a valid remote timeline");
 
+        let index_part = IndexPart::from_remote_timeline(&timeline_path, remote_timeline.clone())
+            .expect("Correct remote timeline should be convertible to index part");
         assert_eq!(
-            header, restored_header,
-            "Header restoration should preserve file order"
+            index_part.timeline_layers.iter().collect::<BTreeSet<_>>(),
+            BTreeSet::from([
+                &RelativePath("layer_1".to_string()),
+                &RelativePath("layer_2".to_string())
+            ]),
+            "Index part should have all remote timeline layers after the conversion"
+        );
+        assert_eq!(
+            index_part.missing_layers.iter().collect::<BTreeSet<_>>(),
+            BTreeSet::from([
+                &RelativePath("missing_1".to_string()),
+                &RelativePath("missing_2".to_string())
+            ]),
+            "Index part should have all missing remote timeline layers after the conversion"
+        );
+        assert_eq!(
+            index_part.disk_consistent_lsn,
+            metadata.disk_consistent_lsn(),
+            "Index part should have disk consistent lsn from the timeline"
+        );
+        assert_eq!(
+            index_part.metadata_bytes,
+            metadata
+                .to_bytes()
+                .expect("Failed to serialize correct metadata into bytes"),
+            "Index part should have the same metadata bytes after the conversion"
+        );
+
+        let restored_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part)
+            .expect("Correct index part should be convertible to remote timeline");
+
+        let original_metadata = &remote_timeline.metadata;
+        let restored_metadata = &restored_timeline.metadata;
+        // we have to compare the metadata this way, since its header is different after creation and restoration,
+        // but that is now considered ok.
+        assert_eq!(
+            original_metadata.disk_consistent_lsn(),
+            restored_metadata.disk_consistent_lsn(),
+            "remote timeline -> index part -> remote timeline conversion should not alter metadata"
+        );
+        assert_eq!(
+            original_metadata.prev_record_lsn(),
+            restored_metadata.prev_record_lsn(),
+            "remote timeline -> index part -> remote timeline conversion should not alter metadata"
+        );
+        assert_eq!(
+            original_metadata.ancestor_timeline(),
+            restored_metadata.ancestor_timeline(),
+            "remote timeline -> index part -> remote timeline conversion should not alter metadata"
+        );
+        assert_eq!(
+            original_metadata.ancestor_lsn(),
+            restored_metadata.ancestor_lsn(),
+            "remote timeline -> index part -> remote timeline conversion should not alter metadata"
+        );
+        assert_eq!(
+            original_metadata.latest_gc_cutoff_lsn(),
+            restored_metadata.latest_gc_cutoff_lsn(),
+            "remote timeline -> index part -> remote timeline conversion should not alter metadata"
+        );
+        assert_eq!(
+            original_metadata.initdb_lsn(),
+            restored_metadata.initdb_lsn(),
+            "remote timeline -> index part -> remote timeline conversion should not alter metadata"
+        );
+
+        assert_eq!(
+            remote_timeline.awaits_download, restored_timeline.awaits_download,
+            "remote timeline -> index part -> remote timeline conversion should not lose download flag"
+        );
+
+        assert_eq!(
+            remote_timeline
+                .timeline_layers
+                .into_iter()
+                .collect::<BTreeSet<_>>(),
+            restored_timeline
+                .timeline_layers
+                .into_iter()
+                .collect::<BTreeSet<_>>(),
+            "remote timeline -> index part -> remote timeline conversion should not lose layer data"
+        );
+        assert_eq!(
+            remote_timeline
+                .missing_layers
+                .into_iter()
+                .collect::<BTreeSet<_>>(),
+            restored_timeline
+                .missing_layers
+                .into_iter()
+                .collect::<BTreeSet<_>>(),
+            "remote timeline -> index part -> remote timeline conversion should not lose missing file data"
         );
     }
+
+    #[test]
+    fn index_part_conversion_negatives() {
+        let harness = RepoHarness::create("index_part_conversion_negatives").unwrap();
+        let timeline_path = harness.timeline_path(&TIMELINE_ID);
+        let metadata =
+            TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1));
+
+        let conversion_result = IndexPart::from_remote_timeline(
+            &timeline_path,
+            RemoteTimeline {
+                timeline_layers: HashSet::from([
+                    PathBuf::from("bad_path"),
+                    timeline_path.join("layer_2"),
+                ]),
+                missing_layers: HashSet::from([
+                    timeline_path.join("missing_1"),
+                    timeline_path.join("missing_2"),
+                ]),
+                metadata: metadata.clone(),
+                awaits_download: false,
+            },
+        );
+        assert!(conversion_result.is_err(), "Should not be able to convert metadata with layer paths that are not in the timeline directory");
+
+        let conversion_result = IndexPart::from_remote_timeline(
+            &timeline_path,
+            RemoteTimeline {
+                timeline_layers: HashSet::from([
+                    timeline_path.join("layer_1"),
+                    timeline_path.join("layer_2"),
+                ]),
+                missing_layers: HashSet::from([
+                    PathBuf::from("bad_path"),
+                    timeline_path.join("missing_2"),
+                ]),
+                metadata,
+                awaits_download: false,
+            },
+        );
+        assert!(conversion_result.is_err(), "Should not be able to convert metadata with missing layer paths that are not in the timeline directory");
+    }
 }
diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/remote_storage/storage_sync/upload.rs
index 7b6d58a661..81758ce3ef 100644
--- a/pageserver/src/remote_storage/storage_sync/upload.rs
+++ b/pageserver/src/remote_storage/storage_sync/upload.rs
@@ -1,520 +1,456 @@
 //!
Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints. -use std::{collections::BTreeSet, path::PathBuf, sync::Arc}; +use std::{fmt::Debug, path::PathBuf}; -use tracing::{debug, error, warn}; +use anyhow::Context; +use futures::stream::{FuturesUnordered, StreamExt}; +use tokio::fs; +use tracing::{debug, error, trace, warn}; use crate::{ config::PageServerConf, + layered_repository::metadata::metadata_path, remote_storage::{ - storage_sync::{ - compression, fetch_full_index, - index::{RemoteTimeline, TimelineIndexEntry, TimelineIndexEntryInner}, - sync_queue, SyncKind, SyncTask, - }, + storage_sync::{index::RemoteTimeline, sync_queue, SyncTask}, RemoteStorage, ZTenantTimelineId, }, }; -use super::{compression::ArchiveHeader, NewCheckpoint, RemoteIndex}; +use super::{index::IndexPart, SyncData, TimelineUpload}; -/// Attempts to compress and upload given checkpoint files. -/// No extra checks for overlapping files is made: download takes care of that, ensuring no non-metadata local timeline files are overwritten. +/// Serializes and uploads the given index part data to the remote storage. +pub(super) async fn upload_index_part( + conf: &'static PageServerConf, + storage: &S, + sync_id: ZTenantTimelineId, + index_part: IndexPart, +) -> anyhow::Result<()> +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let index_part_bytes = serde_json::to_vec(&index_part) + .context("Failed to serialize index part file into bytes")?; + let index_part_size = index_part_bytes.len(); + let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes)); + + let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) + .with_file_name(IndexPart::FILE_NAME) + .with_extension(IndexPart::FILE_EXTENSION); + let index_part_storage_path = storage.storage_path(&index_part_path).with_context(|| { + format!( + "Failed to get the index part storage path for local path '{}'", + index_part_path.display() + ) + })?; + + storage + .upload( + index_part_bytes, + index_part_size, + &index_part_storage_path, + None, + ) + .await + .with_context(|| { + format!( + "Failed to upload index part to the storage path '{:?}'", + index_part_storage_path + ) + }) +} + +/// Timeline upload result, with extra data, needed for uploading. +#[derive(Debug)] +pub(super) enum UploadedTimeline { + /// Upload failed due to some error, the upload task is rescheduled for another retry. + FailedAndRescheduled, + /// No issues happened during the upload, all task files were put into the remote storage. + Successful(SyncData), + /// No failures happened during the upload, but some files were removed locally before the upload task completed + /// (could happen due to retries, for instance, if GC happens in the interim). + /// Such files are considered "not needed" and ignored, but the task's metadata should be discarded and the new one loaded from the local file. + SuccessfulAfterLocalFsUpdate(SyncData), +} + +/// Attempts to upload given layer files. +/// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload. /// /// On an error, bumps the retries count and reschedules the entire task. -/// On success, populates index data with new downloads. 
-pub(super) async fn upload_timeline_checkpoint< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - config: &'static PageServerConf, - remote_assets: Arc<(S, RemoteIndex)>, +pub(super) async fn upload_timeline_layers<'a, P, S>( + storage: &'a S, + remote_timeline: Option<&'a RemoteTimeline>, sync_id: ZTenantTimelineId, - new_checkpoint: NewCheckpoint, - retries: u32, -) -> Option { - debug!("Uploading checkpoint for sync id {}", sync_id); - let new_upload_lsn = new_checkpoint.metadata.disk_consistent_lsn(); + mut upload_data: SyncData, +) -> UploadedTimeline +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let upload = &mut upload_data.data; + let new_upload_lsn = upload.metadata.disk_consistent_lsn(); + debug!( + "Uploading timeline layers for sync id {}, new lsn: {}", + sync_id, new_upload_lsn + ); - let index = &remote_assets.1; - - let ZTenantTimelineId { - tenant_id, - timeline_id, - } = sync_id; - let timeline_dir = config.timeline_path(&timeline_id, &tenant_id); - - let index_read = index.read().await; - let remote_timeline = match index_read.timeline_entry(&sync_id) { - None => { - drop(index_read); - None - } - Some(entry) => match entry.inner() { - TimelineIndexEntryInner::Full(remote_timeline) => { - let r = Some(remote_timeline.clone()); - drop(index_read); - r - } - TimelineIndexEntryInner::Description(_) => { - drop(index_read); - debug!("Found timeline description for the given ids, downloading the full index"); - match fetch_full_index(remote_assets.as_ref(), &timeline_dir, sync_id).await { - Ok(remote_timeline) => Some(remote_timeline), - Err(e) => { - error!("Failed to download full timeline index: {:?}", e); - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Upload(new_checkpoint), - )); - return Some(false); - } - } - } - }, - }; - - let already_contains_upload_lsn = remote_timeline - .as_ref() - .map(|remote_timeline| remote_timeline.contains_checkpoint_at(new_upload_lsn)) - .unwrap_or(false); - if already_contains_upload_lsn { - warn!( - "Received a checkpoint with Lsn {} that's already been uploaded to remote storage, skipping the upload.", - new_upload_lsn - ); - return None; - } - - let already_uploaded_files = remote_timeline - .map(|timeline| timeline.stored_files(&timeline_dir)) + let already_uploaded_layers = remote_timeline + .map(|timeline| timeline.stored_files()) + .cloned() .unwrap_or_default(); - match try_upload_checkpoint( - config, - Arc::clone(&remote_assets), - sync_id, - &new_checkpoint, - already_uploaded_files, - ) - .await - { - Some(Ok((archive_header, header_size))) => { - let mut index_write = index.write().await; - match index_write - .timeline_entry_mut(&sync_id) - .map(|e| e.inner_mut()) - { - None => { - let mut new_timeline = RemoteTimeline::empty(); - new_timeline.update_archive_contents( - new_checkpoint.metadata.disk_consistent_lsn(), - archive_header, - header_size, - ); - index_write.add_timeline_entry( - sync_id, - TimelineIndexEntry::new(TimelineIndexEntryInner::Full(new_timeline), false), + let layers_to_upload = upload + .layers_to_upload + .difference(&already_uploaded_layers) + .cloned() + .collect::>(); + + trace!("Layers to upload: {:?}", layers_to_upload); + + let mut upload_tasks = layers_to_upload + .into_iter() + .map(|source_path| async move { + let storage_path = storage + .storage_path(&source_path) + .with_context(|| { + format!( + "Failed to get the layer storage path for local path '{}'", + source_path.display() 
) - } - Some(TimelineIndexEntryInner::Full(remote_timeline)) => { - remote_timeline.update_archive_contents( - new_checkpoint.metadata.disk_consistent_lsn(), - archive_header, - header_size, - ); - } - Some(TimelineIndexEntryInner::Description(_)) => { - let mut new_timeline = RemoteTimeline::empty(); - new_timeline.update_archive_contents( - new_checkpoint.metadata.disk_consistent_lsn(), - archive_header, - header_size, - ); - index_write.add_timeline_entry( - sync_id, - TimelineIndexEntry::new(TimelineIndexEntryInner::Full(new_timeline), false), + }) + .map_err(UploadError::Other)?; + + let source_file = match fs::File::open(&source_path).await.with_context(|| { + format!( + "Failed to upen a source file for layer '{}'", + source_path.display() + ) + }) { + Ok(file) => file, + Err(e) => return Err(UploadError::MissingLocalFile(source_path, e)), + }; + + let source_size = source_file + .metadata() + .await + .with_context(|| { + format!( + "Failed to get the source file metadata for layer '{}'", + source_path.display() ) - } + }) + .map_err(UploadError::Other)? + .len() as usize; + + match storage + .upload(source_file, source_size, &storage_path, None) + .await + .with_context(|| { + format!( + "Failed to upload a layer from local path '{}'", + source_path.display() + ) + }) { + Ok(()) => Ok(source_path), + Err(e) => Err(UploadError::MissingLocalFile(source_path, e)), } - debug!("Checkpoint uploaded successfully"); - Some(true) + }) + .collect::>(); + + debug!("uploading {} layers of a timeline", upload_tasks.len()); + + let mut errors_happened = false; + let mut local_fs_updated = false; + while let Some(upload_result) = upload_tasks.next().await { + match upload_result { + Ok(uploaded_path) => { + upload.layers_to_upload.remove(&uploaded_path); + upload.uploaded_layers.insert(uploaded_path); + } + Err(e) => match e { + UploadError::Other(e) => { + errors_happened = true; + error!("Failed to upload a layer for timeline {}: {:?}", sync_id, e); + } + UploadError::MissingLocalFile(source_path, e) => { + if source_path.exists() { + errors_happened = true; + error!("Failed to upload a layer for timeline {}: {:?}", sync_id, e); + } else { + local_fs_updated = true; + upload.layers_to_upload.remove(&source_path); + warn!("Missing locally a layer file scheduled for upload, skipping"); + } + } + }, } - Some(Err(e)) => { - error!( - "Failed to upload checkpoint: {:?}, requeueing the upload", - e - ); - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Upload(new_checkpoint), - )); - Some(false) + } + + if errors_happened { + debug!("Reenqueuing failed upload task for timeline {}", sync_id); + upload_data.retries += 1; + sync_queue::push(sync_id, SyncTask::Upload(upload_data)); + UploadedTimeline::FailedAndRescheduled + } else { + debug!("Finished uploading all timeline's layers"); + if local_fs_updated { + UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data) + } else { + UploadedTimeline::Successful(upload_data) } - None => Some(true), } } -async fn try_upload_checkpoint< - P: Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - config: &'static PageServerConf, - remote_assets: Arc<(S, RemoteIndex)>, - sync_id: ZTenantTimelineId, - new_checkpoint: &NewCheckpoint, - files_to_skip: BTreeSet, -) -> Option> { - let ZTenantTimelineId { - tenant_id, - timeline_id, - } = sync_id; - let timeline_dir = config.timeline_path(&timeline_id, &tenant_id); - - let files_to_upload = new_checkpoint - .layers - .iter() - .filter(|&path_to_upload| { - if 
files_to_skip.contains(path_to_upload) { - warn!( - "Skipping file upload '{}', since it was already uploaded", - path_to_upload.display() - ); - false - } else { - true - } - }) - .collect::>(); - - if files_to_upload.is_empty() { - warn!( - "No files to upload. Upload request was: {:?}, already uploaded files: {:?}", - new_checkpoint.layers, files_to_skip - ); - return None; - } - - let upload_result = compression::archive_files_as_stream( - &timeline_dir, - files_to_upload.into_iter(), - &new_checkpoint.metadata, - move |archive_streamer, archive_name| async move { - let timeline_dir = config.timeline_path(&timeline_id, &tenant_id); - let remote_storage = &remote_assets.0; - remote_storage - .upload( - archive_streamer, - &remote_storage.storage_path(&timeline_dir.join(&archive_name))?, - None, - ) - .await - }, - ) - .await - .map(|(header, header_size, _)| (header, header_size)); - - Some(upload_result) +enum UploadError { + MissingLocalFile(PathBuf, anyhow::Error), + Other(anyhow::Error), } #[cfg(test)] mod tests { + use std::collections::{BTreeSet, HashSet}; + use tempfile::tempdir; use zenith_utils::lsn::Lsn; use crate::{ remote_storage::{ - local_fs::LocalFs, storage_sync::{ - index::ArchiveId, - test_utils::{ - assert_index_descriptions, create_local_timeline, dummy_metadata, - ensure_correct_timeline_upload, expect_timeline, - }, + index::RelativePath, + test_utils::{create_local_timeline, dummy_metadata}, }, + LocalFs, }, repository::repo_harness::{RepoHarness, TIMELINE_ID}, }; - use super::*; + use super::{upload_index_part, *}; #[tokio::test] - async fn reupload_timeline() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("reupload_timeline")?; - let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?; - let index = RemoteIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - storage - .list() - .await? 
- .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), - ); - let remote_assets = Arc::new((storage, index)); - let index = &remote_assets.1; + async fn regular_layer_upload() -> anyhow::Result<()> { + let harness = RepoHarness::create("regular_layer_upload")?; + let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let first_upload_metadata = dummy_metadata(Lsn(0x10)); - let first_checkpoint = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &["a", "b"], - first_upload_metadata.clone(), - )?; - let local_timeline_path = repo_harness.timeline_path(&TIMELINE_ID); - ensure_correct_timeline_upload( - &repo_harness, - Arc::clone(&remote_assets), - TIMELINE_ID, - first_checkpoint, + let layer_files = ["a", "b"]; + let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; + let current_retries = 3; + let metadata = dummy_metadata(Lsn(0x30)); + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + let timeline_upload = + create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; + assert!( + storage.list().await?.is_empty(), + "Storage should be empty before any uploads are made" + ); + + let upload_result = upload_timeline_layers( + &storage, + None, + sync_id, + SyncData::new(current_retries, timeline_upload.clone()), ) .await; - let uploaded_timeline = expect_timeline(index, sync_id).await; - let uploaded_archives = uploaded_timeline - .checkpoints() - .map(ArchiveId) - .collect::>(); + let upload_data = match upload_result { + UploadedTimeline::Successful(upload_data) => upload_data, + wrong_result => panic!( + "Expected a successful upload for timeline, but got: {:?}", + wrong_result + ), + }; + assert_eq!( - uploaded_archives.len(), - 1, - "Only one archive is expected after a first upload" + current_retries, upload_data.retries, + "On successful upload, retries are not expected to change" ); - let first_uploaded_archive = uploaded_archives.first().copied().unwrap(); - assert_eq!( - uploaded_timeline.checkpoints().last(), - Some(first_upload_metadata.disk_consistent_lsn()), - "Metadata that was uploaded, should have its Lsn stored" + let upload = &upload_data.data; + assert!( + upload.layers_to_upload.is_empty(), + "Successful upload should have no layers left to upload" ); assert_eq!( - uploaded_timeline - .archive_data(uploaded_archives.first().copied().unwrap()) - .unwrap() - .disk_consistent_lsn(), - first_upload_metadata.disk_consistent_lsn(), - "Uploaded archive should have corresponding Lsn" - ); - assert_eq!( - uploaded_timeline.stored_files(&local_timeline_path), - vec![local_timeline_path.join("a"), local_timeline_path.join("b")] - .into_iter() + upload + .uploaded_layers + .iter() + .cloned() + .collect::>(), + layer_files + .iter() + .map(|layer_file| local_timeline_path.join(layer_file)) .collect(), - "Should have all files from the first checkpoint" + "Successful upload should have all layers uploaded" + ); + assert_eq!( + upload.metadata, metadata, + "Successful upload should not chage its metadata" ); - let second_upload_metadata = dummy_metadata(Lsn(0x40)); - let second_checkpoint = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &["b", "c"], - second_upload_metadata.clone(), - )?; - assert!( - first_upload_metadata.disk_consistent_lsn() - < second_upload_metadata.disk_consistent_lsn() + let storage_files = storage.list().await?; + assert_eq!( + storage_files.len(), + layer_files.len(), + "All layers should be uploaded" ); - 
ensure_correct_timeline_upload( - &repo_harness, - Arc::clone(&remote_assets), - TIMELINE_ID, - second_checkpoint, + assert_eq!( + storage_files + .into_iter() + .map(|storage_path| storage.local_path(&storage_path)) + .collect::>>()?, + layer_files + .into_iter() + .map(|file| local_timeline_path.join(file)) + .collect(), + "Uploaded files should match with the local ones" + ); + + Ok(()) + } + + // Currently, GC can run between upload retries, removing local layers scheduled for upload. Test this scenario. + #[tokio::test] + async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> { + let harness = RepoHarness::create("layer_upload_after_local_fs_update")?; + let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + + let layer_files = ["a1", "b1"]; + let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?; + let current_retries = 5; + let metadata = dummy_metadata(Lsn(0x40)); + + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + let layers_to_upload = { + let mut layers = layer_files.to_vec(); + layers.push("layer_to_remove"); + layers + }; + let timeline_upload = + create_local_timeline(&harness, TIMELINE_ID, &layers_to_upload, metadata.clone()) + .await?; + assert!( + storage.list().await?.is_empty(), + "Storage should be empty before any uploads are made" + ); + + fs::remove_file(local_timeline_path.join("layer_to_remove")).await?; + + let upload_result = upload_timeline_layers( + &storage, + None, + sync_id, + SyncData::new(current_retries, timeline_upload.clone()), ) .await; - let updated_timeline = expect_timeline(index, sync_id).await; - let mut updated_archives = updated_timeline - .checkpoints() - .map(ArchiveId) - .collect::>(); + let upload_data = match upload_result { + UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data) => upload_data, + wrong_result => panic!( + "Expected a successful after local fs upload for timeline, but got: {:?}", + wrong_result + ), + }; + assert_eq!( - updated_archives.len(), - 2, - "Two archives are expected after a successful update of the upload" + current_retries, upload_data.retries, + "On successful upload, retries are not expected to change" ); - updated_archives.retain(|archive_id| archive_id != &first_uploaded_archive); + let upload = &upload_data.data; + assert!( + upload.layers_to_upload.is_empty(), + "Successful upload should have no layers left to upload, even those that were removed from the local fs" + ); assert_eq!( - updated_archives.len(), - 1, - "Only one new archive is expected among the uploaded" - ); - let second_uploaded_archive = updated_archives.last().copied().unwrap(); - assert_eq!( - updated_timeline.checkpoints().max(), - Some(second_upload_metadata.disk_consistent_lsn()), - "Metadata that was uploaded, should have its Lsn stored" + upload + .uploaded_layers + .iter() + .cloned() + .collect::>(), + layer_files + .iter() + .map(|layer_file| local_timeline_path.join(layer_file)) + .collect(), + "Successful upload should have all layers uploaded" ); assert_eq!( - updated_timeline - .archive_data(second_uploaded_archive) - .unwrap() - .disk_consistent_lsn(), - second_upload_metadata.disk_consistent_lsn(), - "Uploaded archive should have corresponding Lsn" - ); - assert_eq!( - updated_timeline.stored_files(&local_timeline_path), - vec![ - local_timeline_path.join("a"), - local_timeline_path.join("b"), - local_timeline_path.join("c"), - ] - .into_iter() - .collect(), - "Should have all files from both checkpoints without duplicates" + upload.metadata, 
metadata, + "Successful upload should not chage its metadata" ); - let third_upload_metadata = dummy_metadata(Lsn(0x20)); - let third_checkpoint = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &["d"], - third_upload_metadata.clone(), - )?; - assert_ne!( - third_upload_metadata.disk_consistent_lsn(), - first_upload_metadata.disk_consistent_lsn() - ); - assert!( - third_upload_metadata.disk_consistent_lsn() - < second_upload_metadata.disk_consistent_lsn() - ); - ensure_correct_timeline_upload( - &repo_harness, - Arc::clone(&remote_assets), - TIMELINE_ID, - third_checkpoint, - ) - .await; - - let updated_timeline = expect_timeline(index, sync_id).await; - let mut updated_archives = updated_timeline - .checkpoints() - .map(ArchiveId) - .collect::>(); + let storage_files = storage.list().await?; assert_eq!( - updated_archives.len(), - 3, - "Three archives are expected after two successful updates of the upload" - ); - updated_archives.retain(|archive_id| { - archive_id != &first_uploaded_archive && archive_id != &second_uploaded_archive - }); - assert_eq!( - updated_archives.len(), - 1, - "Only one new archive is expected among the uploaded" - ); - let third_uploaded_archive = updated_archives.last().copied().unwrap(); - assert!( - updated_timeline.checkpoints().max().unwrap() - > third_upload_metadata.disk_consistent_lsn(), - "Should not influence the last lsn by uploading an older checkpoint" + storage_files.len(), + layer_files.len(), + "All layers should be uploaded" ); assert_eq!( - updated_timeline - .archive_data(third_uploaded_archive) - .unwrap() - .disk_consistent_lsn(), - third_upload_metadata.disk_consistent_lsn(), - "Uploaded archive should have corresponding Lsn" - ); - assert_eq!( - updated_timeline.stored_files(&local_timeline_path), - vec![ - local_timeline_path.join("a"), - local_timeline_path.join("b"), - local_timeline_path.join("c"), - local_timeline_path.join("d"), - ] - .into_iter() - .collect(), - "Should have all files from three checkpoints without duplicates" + storage_files + .into_iter() + .map(|storage_path| storage.local_path(&storage_path)) + .collect::>>()?, + layer_files + .into_iter() + .map(|file| local_timeline_path.join(file)) + .collect(), + "Uploaded files should match with the local ones" ); Ok(()) } #[tokio::test] - async fn reupload_timeline_rejected() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("reupload_timeline_rejected")?; - let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?; - let index = RemoteIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - storage - .list() - .await? 
- .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), - ); - let remote_assets = Arc::new((storage, index)); - let storage = &remote_assets.0; - let index = &remote_assets.1; + async fn test_upload_index_part() -> anyhow::Result<()> { + let harness = RepoHarness::create("test_upload_index_part")?; + let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let first_upload_metadata = dummy_metadata(Lsn(0x10)); - let first_checkpoint = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &["a", "b"], - first_upload_metadata.clone(), - )?; - ensure_correct_timeline_upload( - &repo_harness, - Arc::clone(&remote_assets), - TIMELINE_ID, - first_checkpoint, - ) - .await; - let after_first_uploads = RemoteIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - remote_assets - .0 - .list() - .await - .unwrap() - .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), + let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?; + let metadata = dummy_metadata(Lsn(0x40)); + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + + let index_part = IndexPart::new( + HashSet::from([ + RelativePath::new(&local_timeline_path, local_timeline_path.join("one"))?, + RelativePath::new(&local_timeline_path, local_timeline_path.join("two"))?, + ]), + HashSet::from([RelativePath::new( + &local_timeline_path, + local_timeline_path.join("three"), + )?]), + metadata.disk_consistent_lsn(), + metadata.to_bytes()?, ); - let normal_upload_metadata = dummy_metadata(Lsn(0x20)); - assert_ne!( - normal_upload_metadata.disk_consistent_lsn(), - first_upload_metadata.disk_consistent_lsn() + assert!( + storage.list().await?.is_empty(), + "Storage should be empty before any uploads are made" + ); + upload_index_part(harness.conf, &storage, sync_id, index_part.clone()).await?; + + let storage_files = storage.list().await?; + assert_eq!( + storage_files.len(), + 1, + "Should have only the index part file uploaded" ); - let checkpoint_with_no_files = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &[], - normal_upload_metadata.clone(), - )?; - upload_timeline_checkpoint( - repo_harness.conf, - Arc::clone(&remote_assets), - sync_id, - checkpoint_with_no_files, - 0, - ) - .await; - assert_index_descriptions(index, &after_first_uploads).await; + let index_part_path = storage_files.first().unwrap(); + assert_eq!( + index_part_path.file_stem().and_then(|name| name.to_str()), + Some(IndexPart::FILE_NAME), + "Remote index part should have the correct name" + ); + assert_eq!( + index_part_path + .extension() + .and_then(|extension| extension.to_str()), + Some(IndexPart::FILE_EXTENSION), + "Remote index part should have the correct extension" + ); - let checkpoint_with_uploaded_lsn = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &["something", "new"], - first_upload_metadata.clone(), - )?; - upload_timeline_checkpoint( - repo_harness.conf, - Arc::clone(&remote_assets), - sync_id, - checkpoint_with_uploaded_lsn, - 0, - ) - .await; - assert_index_descriptions(index, &after_first_uploads).await; + let remote_index_part: IndexPart = + serde_json::from_slice(&fs::read(&index_part_path).await?)?; + assert_eq!( + index_part, remote_index_part, + "Remote index part should match the local one" + ); Ok(()) } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index eda9a3168d..d75b4efe71 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -182,14 +182,12 @@ impl Value { 
#[derive(Clone, Copy, Debug)] pub enum TimelineSyncStatusUpdate { - Uploaded, Downloaded, } impl Display for TimelineSyncStatusUpdate { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let s = match self { - TimelineSyncStatusUpdate::Uploaded => "Uploaded", TimelineSyncStatusUpdate::Downloaded => "Downloaded", }; f.write_str(s) diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 2765554cf9..71e85c58e6 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -95,7 +95,7 @@ pub fn load_local_repo( /// Updates tenants' repositories, changing their timelines state in memory. pub fn apply_timeline_sync_status_updates( conf: &'static PageServerConf, - remote_index: RemoteIndex, + remote_index: &RemoteIndex, sync_status_updates: HashMap>, ) { if sync_status_updates.is_empty() { @@ -109,7 +109,7 @@ pub fn apply_timeline_sync_status_updates( trace!("Sync status updates: {:?}", sync_status_updates); for (tenant_id, tenant_timelines_sync_status_updates) in sync_status_updates { - let repo = load_local_repo(conf, tenant_id, &remote_index); + let repo = load_local_repo(conf, tenant_id, remote_index); for (timeline_id, timeline_sync_status_update) in tenant_timelines_sync_status_updates { match repo.apply_timeline_remote_sync_status_update(timeline_id, timeline_sync_status_update) diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 105c3c869f..586d27d5b1 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -114,8 +114,8 @@ impl LocalTimelineInfo { #[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] pub struct RemoteTimelineInfo { - #[serde_as(as = "Option")] - pub remote_consistent_lsn: Option, + #[serde_as(as = "DisplayFromStr")] + pub remote_consistent_lsn: Lsn, pub awaits_download: bool, } diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 6de0b87478..e09af09820 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -305,7 +305,7 @@ fn walreceiver_main( tenant_id, timeline_id, }) - .and_then(|e| e.disk_consistent_lsn()) + .map(|remote_timeline| remote_timeline.metadata.disk_consistent_lsn()) .unwrap_or(Lsn(0)) // no checkpoint was uploaded }); diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index e762f8589a..f2d654423a 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -18,6 +18,7 @@ import pytest # * starts a pageserver with remote storage, stores specific data in its tables # * triggers a checkpoint (which produces a local data scheduled for backup), gets the corresponding timeline id # * polls the timeline status to ensure it's copied remotely +# * inserts more data in the pageserver and repeats the process, to check multiple checkpoints case # * stops the pageserver, clears all local directories # # 2. 
Second pageserver @@ -50,27 +51,30 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute(f''' - CREATE TABLE t1(id int primary key, secret text); - INSERT INTO t1 VALUES ({data_id}, '{data_secret}'); - ''') - cur.execute("SELECT pg_current_wal_flush_lsn()") - current_lsn = lsn_from_hex(cur.fetchone()[0]) + checkpoint_numbers = range(1, 3) - # wait until pageserver receives that data - wait_for_last_record_lsn(client, UUID(tenant_id), UUID(timeline_id), current_lsn) + for checkpoint_number in checkpoint_numbers: + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute(f''' + CREATE TABLE t{checkpoint_number}(id int primary key, secret text); + INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); + ''') + cur.execute("SELECT pg_current_wal_flush_lsn()") + current_lsn = lsn_from_hex(cur.fetchone()[0]) - # run checkpoint manually to be sure that data landed in remote storage - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor() as pscur: - pscur.execute(f"checkpoint {tenant_id} {timeline_id}") + # wait until pageserver receives that data + wait_for_last_record_lsn(client, UUID(tenant_id), UUID(timeline_id), current_lsn) - log.info("waiting for upload") - # wait until pageserver successfully uploaded a checkpoint to remote storage - wait_for_upload(client, UUID(tenant_id), UUID(timeline_id), current_lsn) - log.info("upload is done") + # run checkpoint manually to be sure that data landed in remote storage + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor() as pscur: + pscur.execute(f"checkpoint {tenant_id} {timeline_id}") + + log.info(f'waiting for checkpoint {checkpoint_number} upload') + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(client, UUID(tenant_id), UUID(timeline_id), current_lsn) + log.info(f'upload of checkpoint {checkpoint_number} is done') ##### Stop the first pageserver instance, erase all its data env.postgres.stop_all() @@ -93,5 +97,6 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, pg = env.postgres.create_start('main') with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute(f'SELECT secret FROM t1 WHERE id = {data_id};') - assert cur.fetchone() == (data_secret, ) + for checkpoint_number in checkpoint_numbers: + cur.execute(f'SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};') + assert cur.fetchone() == (f'{data_secret}|{checkpoint_number}', ) diff --git a/zenith/src/main.rs b/zenith/src/main.rs index 18368895a4..f248a5db5b 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -550,7 +550,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let tenant_id = get_tenant_id(create_match, env)?; let new_branch_name = create_match .value_of("branch-name") - .ok_or(anyhow!("No branch name provided"))?; + .ok_or_else(|| anyhow!("No branch name provided"))?; let timeline = pageserver .timeline_create(tenant_id, None, None, None)? 
.ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; @@ -571,7 +571,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let tenant_id = get_tenant_id(branch_match, env)?; let new_branch_name = branch_match .value_of("branch-name") - .ok_or(anyhow!("No branch name provided"))?; + .ok_or_else(|| anyhow!("No branch name provided"))?; let ancestor_branch_name = branch_match .value_of("ancestor-branch-name") .unwrap_or(DEFAULT_BRANCH_NAME); From 91fb21225a7a6fda0eed6d916cc6ebc8c0920aab Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 20 Apr 2022 00:46:29 +0300 Subject: [PATCH 0164/1022] Show more logs during S3 sync --- pageserver/src/remote_storage/storage_sync.rs | 111 +++++++----------- .../remote_storage/storage_sync/download.rs | 49 +++----- .../src/remote_storage/storage_sync/upload.rs | 51 ++++---- 3 files changed, 83 insertions(+), 128 deletions(-) diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index 6ba55372c2..649e563dbc 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -165,10 +165,7 @@ mod sync_queue { if let Some(sender) = SENDER.get() { match sender.send((sync_id, new_task)) { Err(e) => { - warn!( - "Failed to enqueue a sync task: the receiver is dropped: {}", - e - ); + warn!("Failed to enqueue a sync task: the receiver is dropped: {e}"); false } Ok(()) => { @@ -429,15 +426,9 @@ pub fn schedule_timeline_checkpoint_upload( metadata, }), ) { - warn!( - "Could not send an upload task for tenant {}, timeline {}", - tenant_id, timeline_id - ) + warn!("Could not send an upload task for tenant {tenant_id}, timeline {timeline_id}",) } else { - debug!( - "Upload task for tenant {}, timeline {} sent", - tenant_id, timeline_id - ) + debug!("Upload task for tenant {tenant_id}, timeline {timeline_id} sent") } } @@ -449,10 +440,7 @@ pub fn schedule_timeline_checkpoint_upload( /// /// Ensure that the loop is started otherwise the task is never processed. 
pub fn schedule_timeline_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { - debug!( - "Scheduling timeline download for tenant {}, timeline {}", - tenant_id, timeline_id - ); + debug!("Scheduling timeline download for tenant {tenant_id}, timeline {timeline_id}"); sync_queue::push( ZTenantTimelineId { tenant_id, @@ -614,11 +602,7 @@ where let remaining_queue_length = sync_queue::len(); REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); if remaining_queue_length > 0 || !batched_tasks.is_empty() { - info!( - "Processing tasks for {} timelines in batch, more tasks left to process: {}", - batched_tasks.len(), - remaining_queue_length - ); + info!("Processing tasks for {} timelines in batch, more tasks left to process: {remaining_queue_length}", batched_tasks.len()); } else { debug!("No tasks to process"); return ControlFlow::Continue(HashMap::new()); @@ -644,7 +628,7 @@ where HashMap, > = HashMap::with_capacity(max_concurrent_sync); while let Some((sync_id, state_update)) = sync_results.next().await { - debug!("Finished storage sync task for sync id {}", sync_id); + debug!("Finished storage sync task for sync id {sync_id}"); if let Some(state_update) = state_update { new_timeline_states .entry(sync_id.tenant_id) @@ -693,7 +677,7 @@ where ) .await { - error!("Failed to update remote timeline {}: {:?}", sync_id, e); + error!("Failed to update remote timeline {sync_id}: {e:?}"); } } SyncTask::DownloadAndUpload(_, failed_upload_data) => { @@ -712,7 +696,7 @@ where ) .await { - error!("Failed to update remote timeline {}: {:?}", sync_id, e); + error!("Failed to update remote timeline {sync_id}: {e:?}"); } } } @@ -720,18 +704,17 @@ where } }; + let task_name = task.name(); let current_task_attempt = task.retries(); + info!("Sync task '{task_name}' processing started, attempt #{current_task_attempt}"); + if current_task_attempt > 0 { let seconds_to_wait = 2.0_f64.powf(current_task_attempt as f64 - 1.0).min(30.0); - debug!( - "Waiting {} seconds before starting the task", - seconds_to_wait - ); + info!("Waiting {seconds_to_wait} seconds before starting the '{task_name}' task"); tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; } - let task_name = task.name(); - match task { + let status_update = match task { SyncTask::Download(new_download_data) => { download_timeline( conf, @@ -782,7 +765,11 @@ where status_update } - } + }; + + info!("Finished processing the task"); + + status_update } async fn download_timeline( @@ -804,10 +791,7 @@ where DownloadedTimeline::Abort => { register_sync_status(sync_start, task_name, None); if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) { - error!( - "Timeline {} was expected to be in the remote index after a download attempt, but it's absent: {:?}", - sync_id, e - ); + error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}"); } None } @@ -823,15 +807,12 @@ where Some(TimelineSyncStatusUpdate::Downloaded) } Err(e) => { - error!( - "Timeline {} was expected to be in the remote index after a sucessful download, but it's absent: {:?}", - sync_id, e - ); + error!("Timeline {sync_id} was expected to be in the remote index after a sucessful download, but it's absent: {e:?}"); None } }, Err(e) => { - error!("Failed to update local timeline metadata: {:?}", e); + error!("Failed to update local timeline metadata: {e:?}"); download_data.retries += 1; sync_queue::push(sync_id, SyncTask::Download(download_data)); register_sync_status(sync_start, task_name, 
Some(false)); @@ -873,10 +854,7 @@ async fn update_local_metadata( }; if local_lsn < Some(remote_lsn) { - info!( - "Updating local timeline metadata from remote timeline: local disk_consistent_lsn={:?}, remote disk_consistent_lsn={}", - local_lsn, remote_lsn - ); + info!("Updating local timeline metadata from remote timeline: local disk_consistent_lsn={local_lsn:?}, remote disk_consistent_lsn={remote_lsn}"); let remote_metadata_bytes = remote_metadata .to_bytes() @@ -890,7 +868,7 @@ async fn update_local_metadata( ) })?; } else { - info!("Local metadata at path '{}' has later disk consistent Lsn ({:?}) than the remote one ({}), skipping the update", local_metadata_path.display(), local_lsn, remote_lsn); + info!("Local metadata at path '{}' has later disk consistent Lsn ({local_lsn:?}) than the remote one ({remote_lsn}), skipping the update", local_metadata_path.display()); } Ok(()) @@ -933,9 +911,8 @@ async fn upload_timeline( Ok(metadata) => metadata, Err(e) => { error!( - "Failed to load local metadata from path '{}': {:?}", - local_metadata_path.display(), - e + "Failed to load local metadata from path '{}': {e:?}", + local_metadata_path.display() ); outdated_upload_data.retries += 1; sync_queue::push(sync_id, SyncTask::Upload(outdated_upload_data)); @@ -952,7 +929,7 @@ async fn upload_timeline( match update_remote_data(conf, storage, index, sync_id, &uploaded_data.data, false).await { Ok(()) => register_sync_status(sync_start, task_name, Some(true)), Err(e) => { - error!("Failed to update remote timeline {}: {:?}", sync_id, e); + error!("Failed to update remote timeline {sync_id}: {e:?}"); uploaded_data.retries += 1; sync_queue::push(sync_id, SyncTask::Upload(uploaded_data)); register_sync_status(sync_start, task_name, Some(false)); @@ -972,6 +949,7 @@ where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { + info!("Updating remote index for the timeline"); let updated_remote_timeline = { let mut index_accessor = index.write().await; @@ -1012,6 +990,7 @@ where IndexPart::from_remote_timeline(&timeline_path, updated_remote_timeline) .context("Failed to create an index part from the updated remote timeline")?; + info!("Uploading remote data for the timeline"); upload_index_part(conf, storage, sync_id, new_index_part) .await .context("Failed to upload new index part") @@ -1031,8 +1010,8 @@ fn validate_task_retries( if download_data.retries > max_sync_errors => { error!( - "Evicting download task for timeline {} that failed {} times, exceeding the error threshold {}", - sync_id, download_data.retries, max_sync_errors + "Evicting download task for timeline {sync_id} that failed {} times, exceeding the error threshold {max_sync_errors}", + download_data.retries ); skip_download = true; } @@ -1040,9 +1019,9 @@ fn validate_task_retries( if upload_data.retries > max_sync_errors => { error!( - "Evicting upload task for timeline {} that failed {} times, exceeding the error threshold {}", - sync_id, upload_data.retries, max_sync_errors - ); + "Evicting upload task for timeline {sync_id} that failed {} times, exceeding the error threshold {max_sync_errors}", + upload_data.retries, + ); skip_upload = true; } _ => {} @@ -1083,10 +1062,10 @@ where while let Some((id, part_upload_result)) = part_downloads.next().await { match part_upload_result { Ok(index_part) => { - debug!("Successfully fetched index part for {}", id); + debug!("Successfully fetched index part for {id}"); index_parts.insert(id, index_part); } - Err(e) => warn!("Failed to fetch index part for {}: {:?}", id, 
e), + Err(e) => warn!("Failed to fetch index part for {id}: {e:?}"), } } @@ -1120,8 +1099,8 @@ fn schedule_first_sync_tasks( if was_there.is_some() { // defensive check warn!( - "Overwriting timeline init sync status. Status {:?} Timeline {}", - timeline_status, sync_id.timeline_id + "Overwriting timeline init sync status. Status {timeline_status:?}, timeline {}", + sync_id.timeline_id ); } remote_timeline.awaits_download = awaits_download; @@ -1207,7 +1186,7 @@ fn compare_local_and_remote_timeline( fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Option) { let secs_elapsed = sync_start.elapsed().as_secs_f64(); - debug!("Processed a sync task in {} seconds", secs_elapsed); + info!("Processed a sync task in {secs_elapsed:.2} seconds"); match sync_status { Some(true) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "success"]), Some(false) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "failure"]), @@ -1254,7 +1233,7 @@ mod test_utils { } pub fn dummy_contents(name: &str) -> String { - format!("contents for {}", name) + format!("contents for {name}") } pub fn dummy_metadata(disk_consistent_lsn: Lsn) -> TimelineMetadata { @@ -1286,7 +1265,7 @@ mod tests { let merged_download = match download_1.merge(download_2) { SyncTask::Download(merged_download) => merged_download, - wrong_merge_result => panic!("Unexpected merge result: {:?}", wrong_merge_result), + wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), }; assert_eq!( @@ -1334,7 +1313,7 @@ mod tests { let merged_upload = match upload_1.merge(upload_2) { SyncTask::Upload(merged_upload) => merged_upload, - wrong_merge_result => panic!("Unexpected merge result: {:?}", wrong_merge_result), + wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), }; assert_eq!( @@ -1389,7 +1368,7 @@ mod tests { SyncTask::DownloadAndUpload(merged_download, merged_upload) => { (merged_download, merged_upload) } - wrong_merge_result => panic!("Unexpected merge result: {:?}", wrong_merge_result), + wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), }; assert_eq!( @@ -1440,7 +1419,7 @@ mod tests { SyncTask::DownloadAndUpload(merged_download, merged_upload) => { (merged_download, merged_upload) } - wrong_merge_result => panic!("Unexpected merge result: {:?}", wrong_merge_result), + wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), }; assert_eq!( @@ -1507,7 +1486,7 @@ mod tests { SyncTask::DownloadAndUpload(merged_download, merged_upload) => { (merged_download, merged_upload) } - wrong_merge_result => panic!("Unexpected merge result: {:?}", wrong_merge_result), + wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), }; assert_eq!( @@ -1577,7 +1556,7 @@ mod tests { SyncTask::DownloadAndUpload(merged_download, merged_upload) => { (merged_download, merged_upload) } - wrong_merge_result => panic!("Unexpected merge result: {:?}", wrong_merge_result), + wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), }; assert_eq!( diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/remote_storage/storage_sync/download.rs index 81ed649c8a..eb805cd0cc 100644 --- a/pageserver/src/remote_storage/storage_sync/download.rs +++ b/pageserver/src/remote_storage/storage_sync/download.rs @@ -5,7 +5,7 @@ use std::fmt::Debug; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; use tokio::fs; -use tracing::{debug, error, trace, warn}; +use 
tracing::{debug, error, info, warn}; use crate::{ config::PageServerConf, @@ -45,25 +45,16 @@ where .download(&part_storage_path, &mut index_part_bytes) .await .with_context(|| { - format!( - "Failed to download an index part from storage path '{:?}'", - part_storage_path - ) + format!("Failed to download an index part from storage path '{part_storage_path:?}'") })?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes).with_context(|| { - format!( - "Failed to deserialize index part file from storage path '{:?}'", - part_storage_path - ) + format!("Failed to deserialize index part file from storage path '{part_storage_path:?}'") })?; let missing_files = index_part.missing_files(); if !missing_files.is_empty() { - warn!( - "Found missing layers in index part for timeline {}: {:?}", - sync_id, missing_files - ); + warn!("Found missing layers in index part for timeline {sync_id}: {missing_files:?}"); } Ok(index_part) @@ -100,21 +91,17 @@ where let remote_timeline = match remote_timeline { Some(remote_timeline) => { if !remote_timeline.awaits_download { - error!("Timeline with sync id {} is not awaiting download", sync_id); + error!("Timeline with sync id {sync_id} is not awaiting download"); return DownloadedTimeline::Abort; } remote_timeline } None => { - error!( - "Timeline with sync id {} is not present in the remote index", - sync_id - ); + error!("Timeline with sync id {sync_id} is not present in the remote index"); return DownloadedTimeline::Abort; } }; - debug!("Downloading timeline layers for sync id {}", sync_id); let download = &mut download_data.data; let layers_to_download = remote_timeline @@ -123,7 +110,8 @@ where .cloned() .collect::>(); - trace!("Layers to download: {:?}", layers_to_download); + debug!("Layers to download: {layers_to_download:?}"); + info!("Downloading {} timeline layers", layers_to_download.len()); let mut download_tasks = layers_to_download .into_iter() @@ -157,8 +145,7 @@ where .await .with_context(|| { format!( - "Failed to download a layer from storage path '{:?}'", - layer_storage_path + "Failed to download a layer from storage path '{layer_storage_path:?}'" ) })?; } @@ -166,8 +153,6 @@ where }) .collect::>(); - debug!("Downloading {} layers of a timeline", download_tasks.len()); - let mut errors_happened = false; while let Some(download_result) = download_tasks.next().await { match download_result { @@ -176,21 +161,18 @@ where } Err(e) => { errors_happened = true; - error!( - "Failed to download a layer for timeline {}: {:?}", - sync_id, e - ); + error!("Failed to download a layer for timeline {sync_id}: {e:?}"); } } } if errors_happened { - debug!("Reenqueuing failed download task for timeline {}", sync_id); + debug!("Reenqueuing failed download task for timeline {sync_id}"); download_data.retries += 1; sync_queue::push(sync_id, SyncTask::Download(download_data)); DownloadedTimeline::FailedAndRescheduled } else { - debug!("Finished downloading all timeline's layers"); + info!("Successfully downloaded all layers"); DownloadedTimeline::Successful(download_data) } } @@ -266,10 +248,9 @@ mod tests { .await { DownloadedTimeline::Successful(data) => data, - wrong_result => panic!( - "Expected a successful download for timeline, but got: {:?}", - wrong_result - ), + wrong_result => { + panic!("Expected a successful download for timeline, but got: {wrong_result:?}") + } }; assert_eq!( diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/remote_storage/storage_sync/upload.rs index 81758ce3ef..b4a2f6f989 100644 --- 
a/pageserver/src/remote_storage/storage_sync/upload.rs +++ b/pageserver/src/remote_storage/storage_sync/upload.rs @@ -5,7 +5,7 @@ use std::{fmt::Debug, path::PathBuf}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; use tokio::fs; -use tracing::{debug, error, trace, warn}; +use tracing::{debug, error, info, warn}; use crate::{ config::PageServerConf, @@ -53,10 +53,7 @@ where ) .await .with_context(|| { - format!( - "Failed to upload index part to the storage path '{:?}'", - index_part_storage_path - ) + format!("Failed to upload index part to the storage path '{index_part_storage_path:?}'") }) } @@ -89,10 +86,6 @@ where { let upload = &mut upload_data.data; let new_upload_lsn = upload.metadata.disk_consistent_lsn(); - debug!( - "Uploading timeline layers for sync id {}, new lsn: {}", - sync_id, new_upload_lsn - ); let already_uploaded_layers = remote_timeline .map(|timeline| timeline.stored_files()) @@ -105,7 +98,11 @@ where .cloned() .collect::>(); - trace!("Layers to upload: {:?}", layers_to_upload); + debug!("Layers to upload: {layers_to_upload:?}"); + info!( + "Uploading {} timeline layers, new lsn: {new_upload_lsn}", + layers_to_upload.len(), + ); let mut upload_tasks = layers_to_upload .into_iter() @@ -157,8 +154,6 @@ where }) .collect::>(); - debug!("uploading {} layers of a timeline", upload_tasks.len()); - let mut errors_happened = false; let mut local_fs_updated = false; while let Some(upload_result) = upload_tasks.next().await { @@ -170,16 +165,19 @@ where Err(e) => match e { UploadError::Other(e) => { errors_happened = true; - error!("Failed to upload a layer for timeline {}: {:?}", sync_id, e); + error!("Failed to upload a layer for timeline {sync_id}: {e:?}"); } UploadError::MissingLocalFile(source_path, e) => { if source_path.exists() { errors_happened = true; - error!("Failed to upload a layer for timeline {}: {:?}", sync_id, e); + error!("Failed to upload a layer for timeline {sync_id}: {e:?}"); } else { local_fs_updated = true; upload.layers_to_upload.remove(&source_path); - warn!("Missing locally a layer file scheduled for upload, skipping"); + warn!( + "Missing locally a layer file {} scheduled for upload, skipping", + source_path.display() + ); } } }, @@ -187,17 +185,16 @@ where } if errors_happened { - debug!("Reenqueuing failed upload task for timeline {}", sync_id); + debug!("Reenqueuing failed upload task for timeline {sync_id}"); upload_data.retries += 1; sync_queue::push(sync_id, SyncTask::Upload(upload_data)); UploadedTimeline::FailedAndRescheduled + } else if local_fs_updated { + info!("Successfully uploaded all layers, some local layers were removed during the upload"); + UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data) } else { - debug!("Finished uploading all timeline's layers"); - if local_fs_updated { - UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data) - } else { - UploadedTimeline::Successful(upload_data) - } + info!("Successfully uploaded all layers"); + UploadedTimeline::Successful(upload_data) } } @@ -253,10 +250,9 @@ mod tests { let upload_data = match upload_result { UploadedTimeline::Successful(upload_data) => upload_data, - wrong_result => panic!( - "Expected a successful upload for timeline, but got: {:?}", - wrong_result - ), + wrong_result => { + panic!("Expected a successful upload for timeline, but got: {wrong_result:?}") + } }; assert_eq!( @@ -344,8 +340,7 @@ mod tests { let upload_data = match upload_result { UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data) => upload_data, wrong_result 
=> panic!( - "Expected a successful after local fs upload for timeline, but got: {:?}", - wrong_result + "Expected a successful after local fs upload for timeline, but got: {wrong_result:?}" ), }; From 170badd62604c050f671b1cd65a572f630f17e09 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 20 Apr 2022 11:11:07 +0300 Subject: [PATCH 0165/1022] Capture the postgres log in all tests that start a vanilla Postgres. --- test_runner/fixtures/zenith_fixtures.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 8dfe219966..a9c4c0f395 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1273,10 +1273,14 @@ class VanillaPostgres(PgProtocol): with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file: conf_file.writelines(options) - def start(self): + def start(self, log_path: Optional[str] = None): assert not self.running self.running = True - self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'start']) + + if log_path is None: + log_path = os.path.join(self.pgdatadir, "pg.log") + + self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, '-l', log_path, 'start']) def stop(self): assert self.running From 5e95338ee9a898ab42e96050ee348720fbe50861 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 20 Apr 2022 11:16:13 +0300 Subject: [PATCH 0166/1022] Improve logging in test_wal_restore.py - Capture the output of the restore_from_wal.sh in a log file - Kill "restored" Postgres server on test failure --- test_runner/batch_others/test_wal_restore.py | 24 +++++++++----------- zenith_utils/scripts/restore_from_wal.sh | 1 + 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index a5855f2258..8cc27a455c 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -1,7 +1,6 @@ import os import subprocess -from fixtures.utils import mkdir_if_needed from fixtures.zenith_fixtures import (ZenithEnvBuilder, VanillaPostgres, PortDistributor, @@ -13,6 +12,7 @@ from fixtures.log_helper import log def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, + pg_bin: PgBin, test_output_dir, port_distributor: PortDistributor): zenith_env_builder.num_safekeepers = 1 @@ -24,15 +24,13 @@ def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, env.zenith_cli.pageserver_stop() port = port_distributor.get_port() data_dir = os.path.join(test_output_dir, 'pgsql.restored') - restored = VanillaPostgres(data_dir, PgBin(test_output_dir), port) - subprocess.call([ - 'bash', - os.path.join(base_dir, 'zenith_utils/scripts/restore_from_wal.sh'), - os.path.join(pg_distrib_dir, 'bin'), - os.path.join(test_output_dir, 'repo/safekeepers/sk1/{}/*'.format(tenant_id)), - data_dir, - str(port) - ]) - restored.start() - assert restored.safe_psql('select count(*) from t') == [(1000000, )] - restored.stop() + with VanillaPostgres(data_dir, PgBin(test_output_dir), port) as restored: + pg_bin.run_capture([ + os.path.join(base_dir, 'zenith_utils/scripts/restore_from_wal.sh'), + os.path.join(pg_distrib_dir, 'bin'), + os.path.join(test_output_dir, 'repo/safekeepers/sk1/{}/*'.format(tenant_id)), + data_dir, + str(port) + ]) + restored.start() + assert restored.safe_psql('select count(*) from t') == [(1000000, )] diff --git a/zenith_utils/scripts/restore_from_wal.sh 
b/zenith_utils/scripts/restore_from_wal.sh index ef2171312b..f05fbc609a 100755 --- a/zenith_utils/scripts/restore_from_wal.sh +++ b/zenith_utils/scripts/restore_from_wal.sh @@ -1,3 +1,4 @@ +#!/bin/bash PG_BIN=$1 WAL_PATH=$2 DATA_DIR=$3 From ac52f4f2d66885c25b99befd942825c16fd2759e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 20 Apr 2022 13:24:38 +0300 Subject: [PATCH 0167/1022] Set superuser when initializing database for wal recovery (#1544) --- test_runner/batch_others/test_wal_restore.py | 2 +- zenith_utils/scripts/restore_from_wal.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index 8cc27a455c..2dbde954fc 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -33,4 +33,4 @@ def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, str(port) ]) restored.start() - assert restored.safe_psql('select count(*) from t') == [(1000000, )] + assert restored.safe_psql('select count(*) from t', user='zenith_admin') == [(1000000, )] diff --git a/zenith_utils/scripts/restore_from_wal.sh b/zenith_utils/scripts/restore_from_wal.sh index f05fbc609a..4983449f24 100755 --- a/zenith_utils/scripts/restore_from_wal.sh +++ b/zenith_utils/scripts/restore_from_wal.sh @@ -5,7 +5,7 @@ DATA_DIR=$3 PORT=$4 SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` rm -fr $DATA_DIR -env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -D $DATA_DIR --sysid=$SYSID +env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U zenith_admin -D $DATA_DIR --sysid=$SYSID echo port=$PORT >> $DATA_DIR/postgresql.conf REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` declare -i WAL_SIZE=$REDO_POS+114 From e660e12f797beebc62f17ba230c42ba0afc44315 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 20 Apr 2022 12:18:24 +0300 Subject: [PATCH 0168/1022] Update rustls-split and rustls versions. All dependencies now use rustls 0.20.2, so we no longer need to build two versions of it. 
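For reference, the server-side setup after this change follows the rustls 0.20 builder API. A minimal sketch (the helper name is hypothetical, and `certs`/`key` are assumed to be loaded already, e.g. with rustls-pemfile):

    use std::sync::Arc;

    // Sketch only: rustls 0.20 replaces ServerSession with ServerConnection and
    // builds the config through a type-safe builder instead of field mutation.
    fn make_server_connection(
        certs: Vec<rustls::Certificate>,
        key: rustls::PrivateKey,
    ) -> anyhow::Result<rustls::ServerConnection> {
        let config = rustls::ServerConfig::builder()
            .with_safe_defaults()
            .with_no_client_auth()
            .with_single_cert(certs, key)?;
        // ServerConnection::new takes the Arc<ServerConfig> directly.
        Ok(rustls::ServerConnection::new(Arc::new(config))?)
    }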
--- Cargo.lock | 48 ++++++++-------------------- zenith_utils/Cargo.toml | 5 +-- zenith_utils/src/postgres_backend.rs | 4 +-- zenith_utils/src/sock_split.rs | 28 +++++++++------- zenith_utils/tests/ssl_test.rs | 37 +++++++++++++-------- 5 files changed, 57 insertions(+), 65 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3480f120e0..ef289776e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1050,7 +1050,7 @@ checksum = "d87c48c02e0dc5e3b849a2041db3029fd066650f8f717c07bf8ed78ccb895cac" dependencies = [ "http", "hyper", - "rustls 0.20.2", + "rustls", "tokio", "tokio-rustls", ] @@ -1868,7 +1868,7 @@ dependencies = [ "reqwest", "routerify 2.2.0", "rstest", - "rustls 0.20.2", + "rustls", "rustls-pemfile", "scopeguard", "serde", @@ -2048,7 +2048,7 @@ dependencies = [ "mime", "percent-encoding", "pin-project-lite", - "rustls 0.20.2", + "rustls", "rustls-pemfile", "serde", "serde_json", @@ -2222,26 +2222,13 @@ dependencies = [ [[package]] name = "rustls" -version = "0.19.1" +version = "0.20.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" -dependencies = [ - "base64 0.13.0", - "log", - "ring", - "sct 0.6.1", - "webpki 0.21.4", -] - -[[package]] -name = "rustls" -version = "0.20.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d37e5e2290f3e040b594b1a9e04377c2c671f1a1cfd9bfdef82106ac1c113f84" +checksum = "4fbfeb8d0ddb84706bc597a5574ab8912817c52a397f819e5b614e2265206921" dependencies = [ "log", "ring", - "sct 0.7.0", + "sct", "webpki 0.22.0", ] @@ -2256,11 +2243,11 @@ dependencies = [ [[package]] name = "rustls-split" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fb079b52cfdb005752b7c3c646048e702003576a8321058e4c8b38227c11aa6" +checksum = "78802c9612b4689d207acff746f38132ca1b12dadb55d471aa5f10fd580f47d3" dependencies = [ - "rustls 0.19.1", + "rustls", ] [[package]] @@ -2339,16 +2326,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" -[[package]] -name = "sct" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b362b83898e0e69f38515b82ee15aa80636befe47c3b6d3d89a911e78fc228ce" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "sct" version = "0.7.0" @@ -2789,7 +2766,7 @@ checksum = "606f2b73660439474394432239c82249c0d45eb5f23d91f401be1e33590444a7" dependencies = [ "futures", "ring", - "rustls 0.20.2", + "rustls", "tokio", "tokio-postgres", "tokio-rustls", @@ -2801,7 +2778,7 @@ version = "0.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4151fda0cf2798550ad0b34bcfc9b9dcc2a9d2471c895c68f3a8818e54f2389e" dependencies = [ - "rustls 0.20.2", + "rustls", "tokio", "webpki 0.22.0", ] @@ -3392,7 +3369,8 @@ dependencies = [ "postgres-protocol", "rand", "routerify 3.0.0", - "rustls 0.19.1", + "rustls", + "rustls-pemfile", "rustls-split", "serde", "serde_json", diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml index cf864b3a54..2b1caa9be2 100644 --- a/zenith_utils/Cargo.toml +++ b/zenith_utils/Cargo.toml @@ -24,8 +24,8 @@ signal-hook = "0.3.10" rand = "0.8.3" jsonwebtoken = "7" hex = { version = "0.4.3", features = ["serde"] } -rustls = "0.19.1" -rustls-split = "0.2.1" +rustls = "0.20.2" +rustls-split = "0.3.0" git-version = "0.3.5" serde_with = "1.12.0" @@ -39,6 +39,7 @@ hex-literal = 
"0.3" tempfile = "3.2" webpki = "0.21" criterion = "0.3" +rustls-pemfile = "0.2.1" [[bench]] name = "benchmarks" diff --git a/zenith_utils/src/postgres_backend.rs b/zenith_utils/src/postgres_backend.rs index f984fb4417..fab3c388b1 100644 --- a/zenith_utils/src/postgres_backend.rs +++ b/zenith_utils/src/postgres_backend.rs @@ -304,8 +304,8 @@ impl PostgresBackend { pub fn start_tls(&mut self) -> anyhow::Result<()> { match self.stream.take() { Some(Stream::Bidirectional(bidi_stream)) => { - let session = rustls::ServerSession::new(&self.tls_config.clone().unwrap()); - self.stream = Some(Stream::Bidirectional(bidi_stream.start_tls(session)?)); + let conn = rustls::ServerConnection::new(self.tls_config.clone().unwrap())?; + self.stream = Some(Stream::Bidirectional(bidi_stream.start_tls(conn)?)); Ok(()) } stream => { diff --git a/zenith_utils/src/sock_split.rs b/zenith_utils/src/sock_split.rs index c62963e113..5e4598daf1 100644 --- a/zenith_utils/src/sock_split.rs +++ b/zenith_utils/src/sock_split.rs @@ -4,7 +4,7 @@ use std::{ sync::Arc, }; -use rustls::Session; +use rustls::Connection; /// Wrapper supporting reads of a shared TcpStream. pub struct ArcTcpRead(Arc); @@ -56,7 +56,7 @@ impl BufStream { pub enum ReadStream { Tcp(BufReader), - Tls(rustls_split::ReadHalf), + Tls(rustls_split::ReadHalf), } impl io::Read for ReadStream { @@ -79,7 +79,7 @@ impl ReadStream { pub enum WriteStream { Tcp(Arc), - Tls(rustls_split::WriteHalf), + Tls(rustls_split::WriteHalf), } impl WriteStream { @@ -107,11 +107,11 @@ impl io::Write for WriteStream { } } -type TlsStream = rustls::StreamOwned; +type TlsStream = rustls::StreamOwned; pub enum BidiStream { Tcp(BufStream), - /// This variant is boxed, because [`rustls::ServerSession`] is quite larger than [`BufStream`]. + /// This variant is boxed, because [`rustls::ServerConnection`] is quite larger than [`BufStream`]. Tls(Box>), } @@ -127,7 +127,7 @@ impl BidiStream { if how == Shutdown::Read { tls_boxed.sock.get_ref().shutdown(how) } else { - tls_boxed.sess.send_close_notify(); + tls_boxed.conn.send_close_notify(); let res = tls_boxed.flush(); tls_boxed.sock.get_ref().shutdown(how)?; res @@ -154,19 +154,23 @@ impl BidiStream { // TODO would be nice to avoid the Arc here let socket = Arc::try_unwrap(reader.into_inner().0).unwrap(); - let (read_half, write_half) = - rustls_split::split(socket, tls_boxed.sess, read_buf_cfg, write_buf_cfg); + let (read_half, write_half) = rustls_split::split( + socket, + Connection::Server(tls_boxed.conn), + read_buf_cfg, + write_buf_cfg, + ); (ReadStream::Tls(read_half), WriteStream::Tls(write_half)) } } } - pub fn start_tls(self, mut session: rustls::ServerSession) -> io::Result { + pub fn start_tls(self, mut conn: rustls::ServerConnection) -> io::Result { match self { Self::Tcp(mut stream) => { - session.complete_io(&mut stream)?; - assert!(!session.is_handshaking()); - Ok(Self::Tls(Box::new(TlsStream::new(session, stream)))) + conn.complete_io(&mut stream)?; + assert!(!conn.is_handshaking()); + Ok(Self::Tls(Box::new(TlsStream::new(conn, stream)))) } Self::Tls { .. 
} => Err(io::Error::new( io::ErrorKind::InvalidInput, diff --git a/zenith_utils/tests/ssl_test.rs b/zenith_utils/tests/ssl_test.rs index ef2bf1ed4a..0e330c44f8 100644 --- a/zenith_utils/tests/ssl_test.rs +++ b/zenith_utils/tests/ssl_test.rs @@ -8,7 +8,6 @@ use std::{ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use lazy_static::lazy_static; -use rustls::Session; use zenith_utils::postgres_backend::{AuthType, Handler, PostgresBackend}; @@ -23,11 +22,11 @@ fn make_tcp_pair() -> (TcpStream, TcpStream) { lazy_static! { static ref KEY: rustls::PrivateKey = { let mut cursor = Cursor::new(include_bytes!("key.pem")); - rustls::internal::pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone() + rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone()) }; static ref CERT: rustls::Certificate = { let mut cursor = Cursor::new(include_bytes!("cert.pem")); - rustls::internal::pemfile::certs(&mut cursor).unwrap()[0].clone() + rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone()) }; } @@ -45,17 +44,23 @@ fn ssl() { let ssl_response = client_sock.read_u8().unwrap(); assert_eq!(b'S', ssl_response); - let mut cfg = rustls::ClientConfig::new(); - cfg.root_store.add(&CERT).unwrap(); + let cfg = rustls::ClientConfig::builder() + .with_safe_defaults() + .with_root_certificates({ + let mut store = rustls::RootCertStore::empty(); + store.add(&CERT).unwrap(); + store + }) + .with_no_client_auth(); let client_config = Arc::new(cfg); - let dns_name = webpki::DNSNameRef::try_from_ascii_str("localhost").unwrap(); - let mut session = rustls::ClientSession::new(&client_config, dns_name); + let dns_name = "localhost".try_into().unwrap(); + let mut conn = rustls::ClientConnection::new(client_config, dns_name).unwrap(); - session.complete_io(&mut client_sock).unwrap(); - assert!(!session.is_handshaking()); + conn.complete_io(&mut client_sock).unwrap(); + assert!(!conn.is_handshaking()); - let mut stream = rustls::Stream::new(&mut session, &mut client_sock); + let mut stream = rustls::Stream::new(&mut conn, &mut client_sock); // StartupMessage stream.write_u32::(9).unwrap(); @@ -105,8 +110,10 @@ fn ssl() { } let mut handler = TestHandler { got_query: false }; - let mut cfg = rustls::ServerConfig::new(rustls::NoClientAuth::new()); - cfg.set_single_cert(vec![CERT.clone()], KEY.clone()) + let cfg = rustls::ServerConfig::builder() + .with_safe_defaults() + .with_no_client_auth() + .with_single_cert(vec![CERT.clone()], KEY.clone()) .unwrap(); let tls_config = Some(Arc::new(cfg)); @@ -209,8 +216,10 @@ fn server_forces_ssl() { } let mut handler = TestHandler; - let mut cfg = rustls::ServerConfig::new(rustls::NoClientAuth::new()); - cfg.set_single_cert(vec![CERT.clone()], KEY.clone()) + let cfg = rustls::ServerConfig::builder() + .with_safe_defaults() + .with_no_client_auth() + .with_single_cert(vec![CERT.clone()], KEY.clone()) .unwrap(); let tls_config = Some(Arc::new(cfg)); From 9eaa21317c9f00f549e633f71bb44edc28ab821a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 20 Apr 2022 14:27:44 +0300 Subject: [PATCH 0169/1022] Update jsonwebtoken crate. With this, we no longer need to build two versions of 'pem' and 'base64' crates. Introduces a duplicate version of 'time' crate, though, but it's still progress. 
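The API differences picked up here: DecodingKey no longer carries a lifetime, and expiration checking is disabled by clearing the required claim set instead of a validate_exp flag. A minimal sketch (hypothetical helper, assuming an RS256 public key in PEM form, matching the RSA keys used so far):

    use jsonwebtoken::{Algorithm, DecodingKey, Validation};

    // Sketch only: build a jsonwebtoken 8 validation that does not require `exp`.
    fn build_jwt_validation(public_key_pem: &[u8]) -> anyhow::Result<(DecodingKey, Validation)> {
        let key = DecodingKey::from_rsa_pem(public_key_pem)?;
        let mut validation = Validation::new(Algorithm::RS256);
        // The default required_spec_claims is {"exp"}; clear it so tokens
        // without an expiration claim still validate.
        validation.required_spec_claims = [].into();
        Ok((key, validation))
    }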
--- Cargo.lock | 93 ++++++++++++++++++++++++---------------- zenith_utils/Cargo.toml | 2 +- zenith_utils/src/auth.rs | 22 +++++----- 3 files changed, 68 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ef289776e1..ac53fc3662 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -119,12 +119,6 @@ dependencies = [ "rustc-demangle", ] -[[package]] -name = "base64" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff" - [[package]] name = "base64" version = "0.13.0" @@ -260,7 +254,7 @@ dependencies = [ "num-integer", "num-traits", "serde", - "time", + "time 0.1.44", "winapi", ] @@ -1163,12 +1157,12 @@ dependencies = [ [[package]] name = "jsonwebtoken" -version = "7.2.0" +version = "8.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afabcc15e437a6484fc4f12d0fd63068fe457bf93f1c148d3d9649c60b103f32" +checksum = "cc9051c17f81bae79440afa041b3a278e1de71bfb96d32454b477fd4703ccb6f" dependencies = [ - "base64 0.12.3", - "pem 0.8.3", + "base64", + "pem", "ring", "serde", "serde_json", @@ -1382,9 +1376,9 @@ dependencies = [ [[package]] name = "num-bigint" -version = "0.2.6" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304" +checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f" dependencies = [ "autocfg", "num-integer", @@ -1420,6 +1414,15 @@ dependencies = [ "libc", ] +[[package]] +name = "num_threads" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aba1801fb138d8e85e11d0fc70baf4fe1cdfffda7c6cd34a854905df588e5ed0" +dependencies = [ + "libc", +] + [[package]] name = "object" version = "0.27.1" @@ -1572,24 +1575,13 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" -[[package]] -name = "pem" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd56cbd21fea48d0c440b41cd69c589faacade08c992d9a54e471b79d0fd13eb" -dependencies = [ - "base64 0.13.0", - "once_cell", - "regex", -] - [[package]] name = "pem" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9a3b09a20e374558580a4914d3b7d89bd61b954a5a5e1dcbea98753addb1947" dependencies = [ - "base64 0.13.0", + "base64", ] [[package]] @@ -1711,7 +1703,7 @@ name = "postgres-protocol" version = "0.6.1" source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" dependencies = [ - "base64 0.13.0", + "base64", "byteorder", "bytes", "fallible-iterator", @@ -1850,7 +1842,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", - "base64 0.13.0", + "base64", "bytes", "clap 3.0.14", "fail", @@ -1885,6 +1877,15 @@ dependencies = [ "zenith_utils", ] +[[package]] +name = "quickcheck" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" +dependencies = [ + "rand", +] + [[package]] name = "quote" version = "1.0.15" @@ -1966,7 +1967,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5911d1403f4143c9d56a702069d593e8d0f3fab880a85e103604d0893ea31ba7" dependencies = [ "chrono", - "pem 1.0.2", + "pem", "ring", 
"yasna", ] @@ -2031,7 +2032,7 @@ version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "87f242f1488a539a79bac6dbe7c8609ae43b7914b7736210f239a37cccb32525" dependencies = [ - "base64 0.13.0", + "base64", "bytes", "encoding_rs", "futures-core", @@ -2124,7 +2125,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b4f000e8934c1b4f70adde180056812e7ea6b1a247952db8ee98c94cd3116cc" dependencies = [ "async-trait", - "base64 0.13.0", + "base64", "bytes", "crc32fast", "futures", @@ -2179,7 +2180,7 @@ version = "0.47.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6264e93384b90a747758bcc82079711eacf2e755c3a8b5091687b5349d870bcc" dependencies = [ - "base64 0.13.0", + "base64", "bytes", "chrono", "digest", @@ -2238,7 +2239,7 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5eebeaeb360c87bfb72e84abdb3447159c0eaececf1bef2aecd65a8be949d1c9" dependencies = [ - "base64 0.13.0", + "base64", ] [[package]] @@ -2490,13 +2491,14 @@ dependencies = [ [[package]] name = "simple_asn1" -version = "0.4.1" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "692ca13de57ce0613a363c8c2f1de925adebc81b04c923ac60c5488bb44abe4b" +checksum = "4a762b1c38b9b990c694b9c2f8abe3372ce6a9ceaae6bca39cfc46e054f45745" dependencies = [ - "chrono", "num-bigint", "num-traits", + "thiserror", + "time 0.3.9", ] [[package]] @@ -2661,6 +2663,25 @@ dependencies = [ "winapi", ] +[[package]] +name = "time" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd" +dependencies = [ + "itoa 1.0.1", + "libc", + "num_threads", + "quickcheck", + "time-macros", +] + +[[package]] +name = "time-macros" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" + [[package]] name = "tinytemplate" version = "1.2.1" @@ -2852,7 +2873,7 @@ checksum = "ff08f4649d10a70ffa3522ca559031285d8e421d727ac85c60825761818f5d0a" dependencies = [ "async-stream", "async-trait", - "base64 0.13.0", + "base64", "bytes", "futures-core", "futures-util", diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml index 2b1caa9be2..ca98c8a2e2 100644 --- a/zenith_utils/Cargo.toml +++ b/zenith_utils/Cargo.toml @@ -22,7 +22,7 @@ tracing-subscriber = { version = "0.3", features = ["env-filter"] } nix = "0.23.0" signal-hook = "0.3.10" rand = "0.8.3" -jsonwebtoken = "7" +jsonwebtoken = "8" hex = { version = "0.4.3", features = ["serde"] } rustls = "0.20.2" rustls-split = "0.3.0" diff --git a/zenith_utils/src/auth.rs b/zenith_utils/src/auth.rs index 8271121c63..3bdabacad4 100644 --- a/zenith_utils/src/auth.rs +++ b/zenith_utils/src/auth.rs @@ -1,8 +1,6 @@ // For details about authentication see docs/authentication.md -// TODO there are two issues for our use case in jsonwebtoken library which will be resolved in next release -// The first one is that there is no way to disable expiration claim, but it can be excluded from validation, so use this as a workaround for now. -// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/190 -// The second one is that we wanted to use ed25519 keys, but they are also not supported until next version. So we go with RSA keys for now. 
+// +// TODO: use ed25519 keys // Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162 use serde; @@ -59,19 +57,19 @@ pub fn check_permission(claims: &Claims, tenantid: Option) -> Result< } pub struct JwtAuth { - decoding_key: DecodingKey<'static>, + decoding_key: DecodingKey, validation: Validation, } impl JwtAuth { - pub fn new(decoding_key: DecodingKey<'_>) -> Self { + pub fn new(decoding_key: DecodingKey) -> Self { + let mut validation = Validation::new(JWT_ALGORITHM); + // The default 'required_spec_claims' is 'exp'. But we don't want to require + // expiration. + validation.required_spec_claims = [].into(); Self { - decoding_key: decoding_key.into_static(), - validation: Validation { - algorithms: vec![JWT_ALGORITHM], - validate_exp: false, - ..Default::default() - }, + decoding_key, + validation, } } From 86bf4301b77332490662f39d52d9271c8c52ecd8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 20 Apr 2022 14:36:54 +0300 Subject: [PATCH 0170/1022] Remove unnecessary dependency on 'webpki' --- Cargo.lock | 17 +++-------------- zenith_utils/Cargo.toml | 1 - 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ac53fc3662..9775ebe6b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2230,7 +2230,7 @@ dependencies = [ "log", "ring", "sct", - "webpki 0.22.0", + "webpki", ] [[package]] @@ -2801,7 +2801,7 @@ checksum = "4151fda0cf2798550ad0b34bcfc9b9dcc2a9d2471c895c68f3a8818e54f2389e" dependencies = [ "rustls", "tokio", - "webpki 0.22.0", + "webpki", ] [[package]] @@ -3209,16 +3209,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki" -version = "0.21.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "webpki" version = "0.22.0" @@ -3235,7 +3225,7 @@ version = "0.22.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "552ceb903e957524388c4d3475725ff2c8b7960922063af6ce53c9a43da07449" dependencies = [ - "webpki 0.22.0", + "webpki", ] [[package]] @@ -3402,7 +3392,6 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber", - "webpki 0.21.4", "workspace_hack", "zenith_metrics", ] diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml index ca98c8a2e2..dd83fa4a92 100644 --- a/zenith_utils/Cargo.toml +++ b/zenith_utils/Cargo.toml @@ -37,7 +37,6 @@ byteorder = "1.4.3" bytes = "1.0.1" hex-literal = "0.3" tempfile = "3.2" -webpki = "0.21" criterion = "0.3" rustls-pemfile = "0.2.1" From cbdfd8c71989e478ba50a63fda8c0687be8ea458 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 20 Apr 2022 14:42:05 +0300 Subject: [PATCH 0171/1022] Update 'routerify' dependency in proxy. routerify version 3 is used in zenith_utils, use the same version in proxy to avoid having to build two versions. 
--- Cargo.lock | 17 ++--------------- proxy/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9775ebe6b6..1cf8562787 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1858,7 +1858,7 @@ dependencies = [ "rand", "rcgen", "reqwest", - "routerify 2.2.0", + "routerify", "rstest", "rustls", "rustls-pemfile", @@ -2079,19 +2079,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "routerify" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c6bb49594c791cadb5ccfa5f36d41b498d40482595c199d10cd318800280bd9" -dependencies = [ - "http", - "hyper", - "lazy_static", - "percent-encoding", - "regex", -] - [[package]] name = "routerify" version = "3.0.0" @@ -3379,7 +3366,7 @@ dependencies = [ "postgres", "postgres-protocol", "rand", - "routerify 3.0.0", + "routerify", "rustls", "rustls-pemfile", "rustls-split", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 20b459988a..a4bd99db38 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -20,7 +20,7 @@ parking_lot = "0.11.2" pin-project-lite = "0.2.7" rand = "0.8.3" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } -routerify = "2" +routerify = "3" rustls = "0.20.0" rustls-pemfile = "0.2.1" scopeguard = "1.1.0" From e113c6fa8d5d478bbf7e78297a4f63b20474719b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 20 Apr 2022 16:23:16 +0300 Subject: [PATCH 0172/1022] Print a warning if unlinking an ephemeral file fails. Unlink failure isn't serious on its own, we were about to remove the file anyway, but it shouldn't happen and could be a symptom of something more serious. We just saw "No such file or directory" errors happening from ephemeral file writeback in staging, and I suspect if we had this warning in place, we would have seen these warnings too, if the problem was that the ephemeral file was removed before dropping the EphemeralFile struct. Next time it happens, we'll have more information. 
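The same pattern, shown standalone (a sketch with a hypothetical guard type; the real change below is in EphemeralFile's Drop impl): the failure is downgraded to a warning rather than silently discarded.

    // Sketch only: log a failed unlink in Drop instead of swallowing the error.
    struct TempFileGuard {
        path: std::path::PathBuf,
    }

    impl Drop for TempFileGuard {
        fn drop(&mut self) {
            if let Err(e) = std::fs::remove_file(&self.path) {
                // Not fatal -- the file was going away anyway -- but worth a trace.
                tracing::warn!("could not remove file '{}': {}", self.path.display(), e);
            }
        }
    }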
--- pageserver/src/layered_repository/ephemeral_file.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/layered_repository/ephemeral_file.rs index d509186e6f..060d44f810 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/layered_repository/ephemeral_file.rs @@ -16,6 +16,7 @@ use std::io::{Error, ErrorKind}; use std::ops::DerefMut; use std::path::PathBuf; use std::sync::{Arc, RwLock}; +use tracing::*; use zenith_utils::zid::ZTenantId; use zenith_utils::zid::ZTimelineId; @@ -244,9 +245,15 @@ impl Drop for EphemeralFile { // remove entry from the hash map EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id); - // unlink file - // FIXME: print error - let _ = std::fs::remove_file(&self.file.path); + // unlink the file + let res = std::fs::remove_file(&self.file.path); + if let Err(e) = res { + warn!( + "could not remove ephemeral file '{}': {}", + self.file.path.display(), + e + ); + } } } From e41ad3be0fb72c0e83bca01def6bf68537c7dfac Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 20 Apr 2022 16:21:43 +0300 Subject: [PATCH 0173/1022] add more context to writeback error --- pageserver/src/layered_repository/ephemeral_file.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/layered_repository/ephemeral_file.rs index 060d44f810..a2f8cda461 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/layered_repository/ephemeral_file.rs @@ -259,8 +259,17 @@ impl Drop for EphemeralFile { pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> { if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) { - file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64)?; - Ok(()) + match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) { + Ok(_) => Ok(()), + Err(e) => Err(std::io::Error::new( + ErrorKind::Other, + format!( + "failed to write back to ephemeral file at {} error: {}", + file.path.display(), + e + ), + )), + } } else { Err(std::io::Error::new( ErrorKind::Other, From 334a1d6b5dd2c476bff082c297d1f5a725408875 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 20 Apr 2022 21:25:12 +0300 Subject: [PATCH 0174/1022] Fix materialized page caching with delta layers. We only checked the cache page version when collecting WAL records in an in-memory layer, not in a delta layer. Refactor the code so that we always stop collecting WAL records when we reach a cached materialized page. Fix the assertion on the LSN range in InMemoryLayer::get_value_reconstruct_data. It was supposed to check that the requested LSN range is within the layer's LSN range, but the inequality was backwards. That went unnoticed before, because the caller always passed the layer's start LSN as the requested LSN range's start LSN, but now we might stop the search earlier, if we have a cached page version. 
Co-authored-by: Konstantin Knizhnik --- pageserver/src/layered_repository.rs | 25 +++++++++++++++---- .../src/layered_repository/delta_layer.rs | 1 + .../src/layered_repository/image_layer.rs | 1 + .../src/layered_repository/inmemory_layer.rs | 9 +------ 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 6769c9cfbc..c66e4708ff 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1149,6 +1149,12 @@ impl LayeredTimeline { let mut path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); + let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { + *cached_lsn + } else { + Lsn(0) + }; + // 'prev_lsn' tracks the last LSN that we were at in our search. It's used // to check that each iteration make some progress, to break infinite // looping if something goes wrong. @@ -1159,10 +1165,14 @@ impl LayeredTimeline { 'outer: loop { // The function should have updated 'state' - //info!("CALLED for {} at {}: {:?} with {} records", reconstruct_state.key, reconstruct_state.lsn, result, reconstruct_state.records.len()); + //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); match result { ValueReconstructResult::Complete => return Ok(()), ValueReconstructResult::Continue => { + // If we reached an earlier cached page image, we're done. + if cont_lsn == cached_lsn + 1 { + return Ok(()); + } if prev_lsn <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. @@ -1216,12 +1226,15 @@ impl LayeredTimeline { let start_lsn = open_layer.get_lsn_range().start; if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); + // Get all the data needed to reconstruct the page version from this layer. + // But if we have an older cached page image, no need to go past that. + let lsn_floor = max(cached_lsn + 1, start_lsn); result = open_layer.get_value_reconstruct_data( key, - open_layer.get_lsn_range().start..cont_lsn, + lsn_floor..cont_lsn, reconstruct_state, )?; - cont_lsn = start_lsn; + cont_lsn = lsn_floor; path.push((result, cont_lsn, open_layer.clone())); continue; } @@ -1230,12 +1243,13 @@ impl LayeredTimeline { let start_lsn = frozen_layer.get_lsn_range().start; if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); + let lsn_floor = max(cached_lsn + 1, start_lsn); result = frozen_layer.get_value_reconstruct_data( key, - frozen_layer.get_lsn_range().start..cont_lsn, + lsn_floor..cont_lsn, reconstruct_state, )?; - cont_lsn = start_lsn; + cont_lsn = lsn_floor; path.push((result, cont_lsn, frozen_layer.clone())); continue 'outer; } @@ -1244,6 +1258,7 @@ impl LayeredTimeline { if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? 
{ //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); + let lsn_floor = max(cached_lsn + 1, lsn_floor); result = layer.get_value_reconstruct_data( key, lsn_floor..cont_lsn, diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 6e3d65a94d..03b7e453b3 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -222,6 +222,7 @@ impl Layer for DeltaLayer { lsn_range: Range, reconstruct_state: &mut ValueReconstructState, ) -> anyhow::Result { + ensure!(lsn_range.start >= self.lsn_range.start); let mut need_image = true; ensure!(self.key_range.contains(&key)); diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 0f334658bf..fa91198a79 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -148,6 +148,7 @@ impl Layer for ImageLayer { reconstruct_state: &mut ValueReconstructState, ) -> anyhow::Result { assert!(self.key_range.contains(&key)); + assert!(lsn_range.start >= self.lsn); assert!(lsn_range.end >= self.lsn); let inner = self.load()?; diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index ffb5be1dd4..33e1eabd8e 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -113,7 +113,7 @@ impl Layer for InMemoryLayer { lsn_range: Range, reconstruct_state: &mut ValueReconstructState, ) -> anyhow::Result { - ensure!(lsn_range.start <= self.start_lsn); + ensure!(lsn_range.start >= self.start_lsn); let mut need_image = true; let inner = self.inner.read().unwrap(); @@ -124,13 +124,6 @@ impl Layer for InMemoryLayer { if let Some(vec_map) = inner.index.get(&key) { let slice = vec_map.slice_range(lsn_range); for (entry_lsn, pos) in slice.iter().rev() { - match &reconstruct_state.img { - Some((cached_lsn, _)) if entry_lsn <= cached_lsn => { - return Ok(ValueReconstructResult::Complete) - } - _ => {} - } - let buf = reader.read_blob(*pos)?; let value = Value::des(&buf)?; match value { From 9d3779c1247eeda8e99f414bc0af7021a6550f4f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 20 Apr 2022 21:25:16 +0300 Subject: [PATCH 0175/1022] Add a counter for materialized page cache hits. --- pageserver/src/layered_repository.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index c66e4708ff..59a3def1fb 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -49,8 +49,8 @@ use crate::CheckpointConfig; use crate::{ZTenantId, ZTimelineId}; use zenith_metrics::{ - register_histogram_vec, register_int_counter, register_int_gauge_vec, Histogram, HistogramVec, - IntCounter, IntGauge, IntGaugeVec, + register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, + Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, }; use zenith_utils::crashsafe_dir; use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; @@ -101,6 +101,15 @@ lazy_static! { .expect("failed to define a metric"); } +lazy_static! 
{ + static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!( + "materialize_page_cache_hits", + "Number of cache hits from materialized page cache", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + lazy_static! { static ref LAST_RECORD_LSN: IntGaugeVec = register_int_gauge_vec!( "pageserver_last_record_lsn", @@ -778,6 +787,7 @@ pub struct LayeredTimeline { // Metrics reconstruct_time_histo: Histogram, + materialized_page_cache_hit_counter: IntCounter, flush_time_histo: Histogram, compact_time_histo: Histogram, create_images_time_histo: Histogram, @@ -983,6 +993,9 @@ impl LayeredTimeline { let reconstruct_time_histo = RECONSTRUCT_TIME .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) .unwrap(); + let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT + .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .unwrap(); let flush_time_histo = STORAGE_TIME .get_metric_with_label_values(&[ "layer flush", @@ -1029,6 +1042,7 @@ impl LayeredTimeline { ancestor_lsn: metadata.ancestor_lsn(), reconstruct_time_histo, + materialized_page_cache_hit_counter, flush_time_histo, compact_time_histo, create_images_time_histo, @@ -1171,6 +1185,7 @@ impl LayeredTimeline { ValueReconstructResult::Continue => { // If we reached an earlier cached page image, we're done. if cont_lsn == cached_lsn + 1 { + self.materialized_page_cache_hit_counter.inc_by(1); return Ok(()); } if prev_lsn <= cont_lsn { From 629688fd6c144482cf73be1c75334dfc376d88b8 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 20 Apr 2022 16:24:33 +0300 Subject: [PATCH 0176/1022] Drop redundant resolver setting for 2021 edition --- .config/hakari.toml | 2 ++ Cargo.toml | 1 - pre-commit.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.config/hakari.toml b/.config/hakari.toml index 7bccc6c4a3..42d184b857 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -10,6 +10,8 @@ dep-format-version = "2" # Hakari works much better with the new feature resolver. # For more about the new feature resolver, see: # https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#cargos-new-feature-resolver +# Have to keep the resolver still here since hakari requires this field, +# despite it's now the default for 2021 edition & cargo. resolver = "2" # Add triples corresponding to platforms commonly used by developers here. diff --git a/Cargo.toml b/Cargo.toml index 4b3b31e0b7..1405f26517 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,6 @@ members = [ "zenith_metrics", "zenith_utils", ] -resolver = "2" [profile.release] # This is useful for profiling and, to some extent, debug. 
diff --git a/pre-commit.py b/pre-commit.py index 1e886e403b..ea6a22a7fe 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -29,7 +29,7 @@ def colorify( def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str: - cmd = "rustfmt --edition=2018" + cmd = "rustfmt --edition=2021" if not fix_inplace: cmd += " --check" if no_color: From 81cad6277a2666ca47b97848628ffeafd6bf6aba Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 20 Apr 2022 16:38:33 +0300 Subject: [PATCH 0177/1022] Move library crates into a dedicated directory and rename them
pageserver/src/layered_repository/filename.rs | 2 +- .../src/layered_repository/image_layer.rs | 8 +- .../src/layered_repository/inmemory_layer.rs | 10 +- .../src/layered_repository/layer_map.rs | 4 +- pageserver/src/layered_repository/metadata.rs | 2 +- .../src/layered_repository/storage_layer.rs | 6 +- pageserver/src/lib.rs | 7 +- pageserver/src/page_cache.rs | 2 +- pageserver/src/page_service.rs | 17 ++- pageserver/src/pgdatadir_mapping.rs | 5 +- pageserver/src/remote_storage.rs | 2 +- pageserver/src/remote_storage/storage_sync.rs | 15 +-- .../remote_storage/storage_sync/download.rs | 5 +- .../src/remote_storage/storage_sync/index.rs | 7 +- .../src/remote_storage/storage_sync/upload.rs | 5 +- pageserver/src/repository.rs | 8 +- pageserver/src/tenant_mgr.rs | 2 +- pageserver/src/tenant_threads.rs | 2 +- pageserver/src/thread_mgr.rs | 2 +- pageserver/src/timelines.rs | 8 +- pageserver/src/virtual_file.rs | 4 +- pageserver/src/walingest.rs | 2 +- pageserver/src/walreceiver.rs | 10 +- pageserver/src/walredo.rs | 7 +- proxy/Cargo.toml | 4 +- proxy/src/auth.rs | 2 +- proxy/src/auth/flow.rs | 2 +- proxy/src/cancellation.rs | 2 +- proxy/src/http.rs | 5 +- proxy/src/main.rs | 4 +- proxy/src/mgmt.rs | 2 +- proxy/src/proxy.rs | 4 +- proxy/src/sasl/messages.rs | 6 +- proxy/src/stream.rs | 2 +- safekeeper/Cargo.toml | 6 +- safekeeper/src/bin/safekeeper.rs | 12 +- safekeeper/src/broker.rs | 10 +- safekeeper/src/callmemaybe.rs | 6 +- safekeeper/src/control_file.rs | 10 +- safekeeper/src/control_file_upgrade.rs | 2 +- safekeeper/src/handler.rs | 11 +- safekeeper/src/http/models.rs | 2 +- safekeeper/src/http/routes.rs | 21 +-- safekeeper/src/json_ctrl.rs | 10 +- safekeeper/src/lib.rs | 2 +- safekeeper/src/receive_wal.rs | 8 +- safekeeper/src/safekeeper.rs | 15 +-- safekeeper/src/send_wal.rs | 15 ++- safekeeper/src/timeline.rs | 9 +- safekeeper/src/wal_service.rs | 2 +- safekeeper/src/wal_storage.rs | 5 +- test_runner/batch_others/test_wal_restore.py | 2 +- workspace_hack/Cargo.toml | 5 +- zenith/Cargo.toml | 4 +- zenith/src/main.rs | 12 +- 127 files changed, 355 insertions(+), 360 deletions(-) rename {zenith_metrics => libs/metrics}/Cargo.toml (69%) rename {zenith_metrics => libs/metrics}/src/lib.rs (100%) rename {zenith_metrics => libs/metrics}/src/wrappers.rs (96%) rename {postgres_ffi => libs/postgres_ffi}/Cargo.toml (77%) rename {postgres_ffi => libs/postgres_ffi}/README (100%) rename {postgres_ffi => libs/postgres_ffi}/build.rs (96%) rename {postgres_ffi => libs/postgres_ffi}/pg_control_ffi.h (100%) rename {postgres_ffi => libs/postgres_ffi}/samples/pg_hba.conf (100%) rename {postgres_ffi => libs/postgres_ffi}/src/controlfile_utils.rs (97%) rename {postgres_ffi => libs/postgres_ffi}/src/lib.rs (100%) rename {postgres_ffi => libs/postgres_ffi}/src/nonrelfile_utils.rs (100%) rename {postgres_ffi => libs/postgres_ffi}/src/pg_constants.rs (100%) rename {postgres_ffi => libs/postgres_ffi}/src/relfile_utils.rs (100%) rename {postgres_ffi => libs/postgres_ffi}/src/waldecoder.rs (99%) rename {postgres_ffi => libs/postgres_ffi}/src/xlog_utils.rs (98%) rename {zenith_utils => libs/utils}/Cargo.toml (88%) rename {zenith_utils => libs/utils}/benches/benchmarks.rs (96%) rename {zenith_utils => libs/utils}/build.rs (100%) rename {zenith_utils => libs/utils}/scripts/restore_from_wal.sh (100%) rename {zenith_utils => libs/utils}/scripts/restore_from_wal_archive.sh (100%) rename {zenith_utils => libs/utils}/src/accum.rs (96%) rename {zenith_utils => libs/utils}/src/auth.rs (100%) rename {zenith_utils => 
libs/utils}/src/bin_ser.rs (100%) rename {zenith_utils => libs/utils}/src/connstring.rs (100%) rename {zenith_utils => libs/utils}/src/crashsafe_dir.rs (100%) rename {zenith_utils => libs/utils}/src/http/endpoint.rs (97%) rename {zenith_utils => libs/utils}/src/http/error.rs (100%) rename {zenith_utils => libs/utils}/src/http/json.rs (100%) rename {zenith_utils => libs/utils}/src/http/mod.rs (100%) rename {zenith_utils => libs/utils}/src/http/request.rs (100%) rename {zenith_utils => libs/utils}/src/lib.rs (95%) rename {zenith_utils => libs/utils}/src/logging.rs (100%) rename {zenith_utils => libs/utils}/src/lsn.rs (100%) rename {zenith_utils => libs/utils}/src/nonblock.rs (100%) rename {zenith_utils => libs/utils}/src/postgres_backend.rs (100%) rename {zenith_utils => libs/utils}/src/pq_proto.rs (99%) rename {zenith_utils => libs/utils}/src/seqwait.rs (100%) rename {zenith_utils => libs/utils}/src/seqwait_async.rs (100%) rename {zenith_utils => libs/utils}/src/shutdown.rs (100%) rename {zenith_utils => libs/utils}/src/signals.rs (100%) rename {zenith_utils => libs/utils}/src/sock_split.rs (100%) rename {zenith_utils => libs/utils}/src/sync.rs (99%) rename {zenith_utils => libs/utils}/src/tcp_listener.rs (100%) rename {zenith_utils => libs/utils}/src/vec_map.rs (100%) rename {zenith_utils => libs/utils}/src/zid.rs (100%) rename {zenith_utils => libs/utils}/tests/bin_ser_test.rs (96%) rename {zenith_utils => libs/utils}/tests/cert.pem (100%) rename {zenith_utils => libs/utils}/tests/key.pem (100%) rename {zenith_utils => libs/utils}/tests/ssl_test.rs (98%) diff --git a/Cargo.lock b/Cargo.lock index 1cf8562787..508b56125d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -225,9 +225,6 @@ name = "cc" version = "1.0.72" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee" -dependencies = [ - "jobserver", -] [[package]] name = "cexpr" @@ -368,8 +365,8 @@ dependencies = [ "thiserror", "toml", "url", + "utils", "workspace_hack", - "zenith_utils", ] [[package]] @@ -1137,15 +1134,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" -[[package]] -name = "jobserver" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" -dependencies = [ - "libc", -] - [[package]] name = "js-sys" version = "0.3.56" @@ -1272,6 +1260,17 @@ dependencies = [ "autocfg", ] +[[package]] +name = "metrics" +version = "0.1.0" +dependencies = [ + "lazy_static", + "libc", + "once_cell", + "prometheus", + "workspace_hack", +] + [[package]] name = "mime" version = "0.3.16" @@ -1514,6 +1513,7 @@ dependencies = [ "hyper", "itertools", "lazy_static", + "metrics", "nix", "once_cell", "postgres", @@ -1539,9 +1539,8 @@ dependencies = [ "toml_edit", "tracing", "url", + "utils", "workspace_hack", - "zenith_metrics", - "zenith_utils", ] [[package]] @@ -1744,8 +1743,8 @@ dependencies = [ "regex", "serde", "thiserror", + "utils", "workspace_hack", - "zenith_utils", ] [[package]] @@ -1853,6 +1852,7 @@ dependencies = [ "hyper", "lazy_static", "md5", + "metrics", "parking_lot", "pin-project-lite", "rand", @@ -1872,9 +1872,8 @@ dependencies = [ "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls", + "utils", "workspace_hack", - "zenith_metrics", - "zenith_utils", ] [[package]] @@ -2267,6 +2266,7 @@ dependencies = [ "humantime", "hyper", 
"lazy_static", + "metrics", "postgres", "postgres-protocol", "postgres_ffi", @@ -2283,10 +2283,9 @@ dependencies = [ "tokio-util 0.7.0", "tracing", "url", + "utils", "walkdir", "workspace_hack", - "zenith_metrics", - "zenith_utils", ] [[package]] @@ -3063,6 +3062,43 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "utils" +version = "0.1.0" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "bytes", + "criterion", + "git-version", + "hex", + "hex-literal", + "hyper", + "jsonwebtoken", + "lazy_static", + "metrics", + "nix", + "pin-project-lite", + "postgres", + "postgres-protocol", + "rand", + "routerify", + "rustls", + "rustls-pemfile", + "rustls-split", + "serde", + "serde_json", + "serde_with", + "signal-hook", + "tempfile", + "thiserror", + "tokio", + "tracing", + "tracing-subscriber", + "workspace_hack", +] + [[package]] name = "valuable" version = "0.1.0" @@ -3272,7 +3308,6 @@ version = "0.1.0" dependencies = [ "anyhow", "bytes", - "cc", "chrono", "clap 2.34.0", "either", @@ -3331,56 +3366,8 @@ dependencies = [ "postgres_ffi", "safekeeper", "serde_json", + "utils", "workspace_hack", - "zenith_utils", -] - -[[package]] -name = "zenith_metrics" -version = "0.1.0" -dependencies = [ - "lazy_static", - "libc", - "once_cell", - "prometheus", - "workspace_hack", -] - -[[package]] -name = "zenith_utils" -version = "0.1.0" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "bytes", - "criterion", - "git-version", - "hex", - "hex-literal", - "hyper", - "jsonwebtoken", - "lazy_static", - "nix", - "pin-project-lite", - "postgres", - "postgres-protocol", - "rand", - "routerify", - "rustls", - "rustls-pemfile", - "rustls-split", - "serde", - "serde_json", - "serde_with", - "signal-hook", - "tempfile", - "thiserror", - "tokio", - "tracing", - "tracing-subscriber", - "workspace_hack", - "zenith_metrics", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 1405f26517..35c18ba237 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,13 +3,11 @@ members = [ "compute_tools", "control_plane", "pageserver", - "postgres_ffi", "proxy", "safekeeper", "workspace_hack", "zenith", - "zenith_metrics", - "zenith_utils", + "libs/*", ] [profile.release] diff --git a/compute_tools/src/bin/zenith_ctl.rs b/compute_tools/src/bin/zenith_ctl.rs index 372afbc633..a5dfb1c875 100644 --- a/compute_tools/src/bin/zenith_ctl.rs +++ b/compute_tools/src/bin/zenith_ctl.rs @@ -157,7 +157,7 @@ fn run_compute(state: &Arc>) -> Result { } fn main() -> Result<()> { - // TODO: re-use `zenith_utils::logging` later + // TODO: re-use `utils::logging` later init_logger(DEFAULT_LOG_LEVEL)?; // Env variable is set by `cargo` diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 80b6c00dd2..33d01f7556 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -19,5 +19,5 @@ reqwest = { version = "0.11", default-features = false, features = ["blocking", pageserver = { path = "../pageserver" } safekeeper = { path = "../safekeeper" } -zenith_utils = { path = "../zenith_utils" } +utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index c078c274cf..2549baca5d 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -11,11 +11,12 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result}; -use zenith_utils::connstring::connection_host_port; -use zenith_utils::lsn::Lsn; -use zenith_utils::postgres_backend::AuthType; -use 
zenith_utils::zid::ZTenantId; -use zenith_utils::zid::ZTimelineId; +use utils::{ + connstring::connection_host_port, + lsn::Lsn, + postgres_backend::AuthType, + zid::{ZTenantId, ZTimelineId}, +}; use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2bdc76e876..12ee88cdc9 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -11,9 +11,11 @@ use std::env; use std::fs; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; -use zenith_utils::auth::{encode_from_key_file, Claims, Scope}; -use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}; +use utils::{ + auth::{encode_from_key_file, Claims, Scope}, + postgres_backend::AuthType, + zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, +}; use crate::safekeeper::SafekeeperNode; diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 6f11a4e03d..b094016131 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -15,13 +15,15 @@ use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use safekeeper::http::models::TimelineCreateRequest; use thiserror::Error; -use zenith_utils::http::error::HttpErrorBody; -use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; +use utils::{ + connstring::connection_address, + http::error::HttpErrorBody, + zid::{ZNodeId, ZTenantId, ZTimelineId}, +}; use crate::local_env::{LocalEnv, SafekeeperConf}; use crate::storage::PageServerNode; use crate::{fill_rust_env_vars, read_pidfile}; -use zenith_utils::connstring::connection_address; #[derive(Error, Debug)] pub enum SafekeeperHttpError { diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index c49d5743a9..a01ffd30f6 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -15,15 +15,17 @@ use postgres::{Config, NoTls}; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; -use zenith_utils::http::error::HttpErrorBody; -use zenith_utils::lsn::Lsn; -use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use utils::{ + connstring::connection_address, + http::error::HttpErrorBody, + lsn::Lsn, + postgres_backend::AuthType, + zid::{ZTenantId, ZTimelineId}, +}; use crate::local_env::LocalEnv; use crate::{fill_rust_env_vars, read_pidfile}; use pageserver::tenant_mgr::TenantInfo; -use zenith_utils::connstring::connection_address; #[derive(Error, Debug)] pub enum PageserverHttpError { diff --git a/docs/README.md b/docs/README.md index a3fcd20bd2..99d635bb33 100644 --- a/docs/README.md +++ b/docs/README.md @@ -8,7 +8,7 @@ - [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI. - [sourcetree.md](sourcetree.md) — Overview of the source tree layeout. - [pageserver/README](/pageserver/README) — pageserver overview. -- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview. +- [postgres_ffi/README](/libs/postgres_ffi/README) — Postgres FFI overview. - [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview. - [safekeeper/README](/safekeeper/README) — WAL service overview. 
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core diff --git a/docs/authentication.md b/docs/authentication.md index de408624ae..7200ffc62f 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -27,4 +27,4 @@ management_token = jwt.encode({"scope": "pageserverapi"}, auth_keys.priv, algori tenant_token = jwt.encode({"scope": "tenant", "tenant_id": ps.initial_tenant}, auth_keys.priv, algorithm="RS256") ``` -Utility functions to work with jwts in rust are located in zenith_utils/src/auth.rs +Utility functions to work with jwts in rust are located in libs/utils/src/auth.rs diff --git a/docs/sourcetree.md b/docs/sourcetree.md index b15294d67f..5fd5fe19e5 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -30,11 +30,6 @@ The pageserver has a few different duties: For more detailed info, see `/pageserver/README` -`/postgres_ffi`: - -Utility functions for interacting with PostgreSQL file formats. -Misc constants, copied from PostgreSQL headers. - `/proxy`: Postgres protocol proxy/router. @@ -74,14 +69,21 @@ We use [cargo-hakari](https://crates.io/crates/cargo-hakari) for automation. Main entry point for the 'zenith' CLI utility. TODO: Doesn't it belong to control_plane? -`/zenith_metrics`: +`/libs`: +Unites granular neon helper crates under the hood. +`/libs/postgres_ffi`: + +Utility functions for interacting with PostgreSQL file formats. +Misc constants, copied from PostgreSQL headers. + +`/libs/utils`: +Generic helpers that are shared between other crates in this repository. +A subject for future modularization. + +`/libs/metrics`: Helpers for exposing Prometheus metrics from the server. -`/zenith_utils`: - -Helpers that are shared between other crates in this repository. - ## Using Python Note that Debian/Ubuntu Python packages are stale, as it commonly happens, so manual installation of dependencies is not recommended. diff --git a/zenith_metrics/Cargo.toml b/libs/metrics/Cargo.toml similarity index 69% rename from zenith_metrics/Cargo.toml rename to libs/metrics/Cargo.toml index 906c5a1d64..3b6ff4691d 100644 --- a/zenith_metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "zenith_metrics" +name = "metrics" version = "0.1.0" edition = "2021" @@ -8,4 +8,4 @@ prometheus = {version = "0.13", default_features=false} # removes protobuf depen libc = "0.2" lazy_static = "1.4" once_cell = "1.8.0" -workspace_hack = { version = "0.1", path = "../workspace_hack" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/zenith_metrics/src/lib.rs b/libs/metrics/src/lib.rs similarity index 100% rename from zenith_metrics/src/lib.rs rename to libs/metrics/src/lib.rs diff --git a/zenith_metrics/src/wrappers.rs b/libs/metrics/src/wrappers.rs similarity index 96% rename from zenith_metrics/src/wrappers.rs rename to libs/metrics/src/wrappers.rs index 48202bc15e..de334add99 100644 --- a/zenith_metrics/src/wrappers.rs +++ b/libs/metrics/src/wrappers.rs @@ -8,8 +8,8 @@ use std::io::{Read, Result, Write}; /// /// ``` /// # use std::io::{Result, Read}; -/// # use zenith_metrics::{register_int_counter, IntCounter}; -/// # use zenith_metrics::CountedReader; +/// # use metrics::{register_int_counter, IntCounter}; +/// # use metrics::CountedReader; /// # /// # lazy_static::lazy_static! 
{ /// # static ref INT_COUNTER: IntCounter = register_int_counter!( @@ -83,8 +83,8 @@ impl Read for CountedReader<'_, T> { /// /// ``` /// # use std::io::{Result, Write}; -/// # use zenith_metrics::{register_int_counter, IntCounter}; -/// # use zenith_metrics::CountedWriter; +/// # use metrics::{register_int_counter, IntCounter}; +/// # use metrics::CountedWriter; /// # /// # lazy_static::lazy_static! { /// # static ref INT_COUNTER: IntCounter = register_int_counter!( diff --git a/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml similarity index 77% rename from postgres_ffi/Cargo.toml rename to libs/postgres_ffi/Cargo.toml index e8d471cb12..7be5ca1b93 100644 --- a/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -17,8 +17,8 @@ log = "0.4.14" memoffset = "0.6.2" thiserror = "1.0" serde = { version = "1.0", features = ["derive"] } -zenith_utils = { path = "../zenith_utils" } -workspace_hack = { version = "0.1", path = "../workspace_hack" } +utils = { path = "../utils" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } [build-dependencies] bindgen = "0.59.1" diff --git a/postgres_ffi/README b/libs/postgres_ffi/README similarity index 100% rename from postgres_ffi/README rename to libs/postgres_ffi/README diff --git a/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs similarity index 96% rename from postgres_ffi/build.rs rename to libs/postgres_ffi/build.rs index 3b4b37f9ee..0043b9ab58 100644 --- a/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -88,8 +88,8 @@ fn main() { // 'pg_config --includedir-server' would perhaps be the more proper way to find it, // but this will do for now. // - .clang_arg("-I../tmp_install/include/server") - .clang_arg("-I../tmp_install/include/postgresql/server") + .clang_arg("-I../../tmp_install/include/server") + .clang_arg("-I../../tmp_install/include/postgresql/server") // // Finish the builder and generate the bindings. // diff --git a/postgres_ffi/pg_control_ffi.h b/libs/postgres_ffi/pg_control_ffi.h similarity index 100% rename from postgres_ffi/pg_control_ffi.h rename to libs/postgres_ffi/pg_control_ffi.h diff --git a/postgres_ffi/samples/pg_hba.conf b/libs/postgres_ffi/samples/pg_hba.conf similarity index 100% rename from postgres_ffi/samples/pg_hba.conf rename to libs/postgres_ffi/samples/pg_hba.conf diff --git a/postgres_ffi/src/controlfile_utils.rs b/libs/postgres_ffi/src/controlfile_utils.rs similarity index 97% rename from postgres_ffi/src/controlfile_utils.rs rename to libs/postgres_ffi/src/controlfile_utils.rs index b72c86c71c..4df2342b90 100644 --- a/postgres_ffi/src/controlfile_utils.rs +++ b/libs/postgres_ffi/src/controlfile_utils.rs @@ -43,7 +43,7 @@ impl ControlFileData { /// Interpret a slice of bytes as a Postgres control file. /// pub fn decode(buf: &[u8]) -> Result { - use zenith_utils::bin_ser::LeSer; + use utils::bin_ser::LeSer; // Check that the slice has the expected size. The control file is // padded with zeros up to a 512 byte sector size, so accept a @@ -77,7 +77,7 @@ impl ControlFileData { /// /// The CRC is recomputed to match the contents of the fields. pub fn encode(&self) -> Bytes { - use zenith_utils::bin_ser::LeSer; + use utils::bin_ser::LeSer; // Serialize into a new buffer. 
let b = self.ser().unwrap(); diff --git a/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs similarity index 100% rename from postgres_ffi/src/lib.rs rename to libs/postgres_ffi/src/lib.rs diff --git a/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs similarity index 100% rename from postgres_ffi/src/nonrelfile_utils.rs rename to libs/postgres_ffi/src/nonrelfile_utils.rs diff --git a/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs similarity index 100% rename from postgres_ffi/src/pg_constants.rs rename to libs/postgres_ffi/src/pg_constants.rs diff --git a/postgres_ffi/src/relfile_utils.rs b/libs/postgres_ffi/src/relfile_utils.rs similarity index 100% rename from postgres_ffi/src/relfile_utils.rs rename to libs/postgres_ffi/src/relfile_utils.rs diff --git a/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs similarity index 99% rename from postgres_ffi/src/waldecoder.rs rename to libs/postgres_ffi/src/waldecoder.rs index ce5aaf722d..9d1089ed46 100644 --- a/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -18,7 +18,7 @@ use crc32c::*; use log::*; use std::cmp::min; use thiserror::Error; -use zenith_utils::lsn::Lsn; +use utils::lsn::Lsn; pub struct WalStreamDecoder { lsn: Lsn, diff --git a/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs similarity index 98% rename from postgres_ffi/src/xlog_utils.rs rename to libs/postgres_ffi/src/xlog_utils.rs index 89fdbbf7ac..1645c44de5 100644 --- a/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -28,7 +28,7 @@ use std::io::prelude::*; use std::io::SeekFrom; use std::path::{Path, PathBuf}; use std::time::SystemTime; -use zenith_utils::lsn::Lsn; +use utils::lsn::Lsn; pub const XLOG_FNAME_LEN: usize = 24; pub const XLOG_BLCKSZ: usize = 8192; @@ -351,17 +351,17 @@ pub fn main() { impl XLogRecord { pub fn from_slice(buf: &[u8]) -> XLogRecord { - use zenith_utils::bin_ser::LeSer; + use utils::bin_ser::LeSer; XLogRecord::des(buf).unwrap() } pub fn from_bytes(buf: &mut B) -> XLogRecord { - use zenith_utils::bin_ser::LeSer; + use utils::bin_ser::LeSer; XLogRecord::des_from(&mut buf.reader()).unwrap() } pub fn encode(&self) -> Bytes { - use zenith_utils::bin_ser::LeSer; + use utils::bin_ser::LeSer; self.ser().unwrap().into() } @@ -373,19 +373,19 @@ impl XLogRecord { impl XLogPageHeaderData { pub fn from_bytes(buf: &mut B) -> XLogPageHeaderData { - use zenith_utils::bin_ser::LeSer; + use utils::bin_ser::LeSer; XLogPageHeaderData::des_from(&mut buf.reader()).unwrap() } } impl XLogLongPageHeaderData { pub fn from_bytes(buf: &mut B) -> XLogLongPageHeaderData { - use zenith_utils::bin_ser::LeSer; + use utils::bin_ser::LeSer; XLogLongPageHeaderData::des_from(&mut buf.reader()).unwrap() } pub fn encode(&self) -> Bytes { - use zenith_utils::bin_ser::LeSer; + use utils::bin_ser::LeSer; self.ser().unwrap().into() } } @@ -394,12 +394,12 @@ pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::(); impl CheckPoint { pub fn encode(&self) -> Bytes { - use zenith_utils::bin_ser::LeSer; + use utils::bin_ser::LeSer; self.ser().unwrap().into() } pub fn decode(buf: &[u8]) -> Result { - use zenith_utils::bin_ser::LeSer; + use utils::bin_ser::LeSer; Ok(CheckPoint::des(buf)?) } @@ -477,7 +477,9 @@ mod tests { #[test] pub fn test_find_end_of_wal() { // 1. 
Run initdb to generate some WAL - let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(".."); + let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("..") + .join(".."); let data_dir = top_path.join("test_output/test_find_end_of_wal"); let initdb_path = top_path.join("tmp_install/bin/initdb"); let lib_path = top_path.join("tmp_install/lib"); diff --git a/zenith_utils/Cargo.toml b/libs/utils/Cargo.toml similarity index 88% rename from zenith_utils/Cargo.toml rename to libs/utils/Cargo.toml index dd83fa4a92..35eb443809 100644 --- a/zenith_utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "zenith_utils" +name = "utils" version = "0.1.0" edition = "2021" @@ -29,8 +29,8 @@ rustls-split = "0.3.0" git-version = "0.3.5" serde_with = "1.12.0" -zenith_metrics = { path = "../zenith_metrics" } -workspace_hack = { version = "0.1", path = "../workspace_hack" } +metrics = { path = "../metrics" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } [dev-dependencies] byteorder = "1.4.3" diff --git a/zenith_utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs similarity index 96% rename from zenith_utils/benches/benchmarks.rs rename to libs/utils/benches/benchmarks.rs index c945d5021c..0339939934 100644 --- a/zenith_utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,7 +1,7 @@ #![allow(unused)] use criterion::{criterion_group, criterion_main, Criterion}; -use zenith_utils::zid; +use utils::zid; pub fn bench_zid_stringify(c: &mut Criterion) { // Can only use public methods. diff --git a/zenith_utils/build.rs b/libs/utils/build.rs similarity index 100% rename from zenith_utils/build.rs rename to libs/utils/build.rs diff --git a/zenith_utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh similarity index 100% rename from zenith_utils/scripts/restore_from_wal.sh rename to libs/utils/scripts/restore_from_wal.sh diff --git a/zenith_utils/scripts/restore_from_wal_archive.sh b/libs/utils/scripts/restore_from_wal_archive.sh similarity index 100% rename from zenith_utils/scripts/restore_from_wal_archive.sh rename to libs/utils/scripts/restore_from_wal_archive.sh diff --git a/zenith_utils/src/accum.rs b/libs/utils/src/accum.rs similarity index 96% rename from zenith_utils/src/accum.rs rename to libs/utils/src/accum.rs index d3ad61e514..0fb0190a92 100644 --- a/zenith_utils/src/accum.rs +++ b/libs/utils/src/accum.rs @@ -5,7 +5,7 @@ /// For example, to calculate the smallest value among some integers: /// /// ``` -/// use zenith_utils::accum::Accum; +/// use utils::accum::Accum; /// /// let values = [1, 2, 3]; /// diff --git a/zenith_utils/src/auth.rs b/libs/utils/src/auth.rs similarity index 100% rename from zenith_utils/src/auth.rs rename to libs/utils/src/auth.rs diff --git a/zenith_utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs similarity index 100% rename from zenith_utils/src/bin_ser.rs rename to libs/utils/src/bin_ser.rs diff --git a/zenith_utils/src/connstring.rs b/libs/utils/src/connstring.rs similarity index 100% rename from zenith_utils/src/connstring.rs rename to libs/utils/src/connstring.rs diff --git a/zenith_utils/src/crashsafe_dir.rs b/libs/utils/src/crashsafe_dir.rs similarity index 100% rename from zenith_utils/src/crashsafe_dir.rs rename to libs/utils/src/crashsafe_dir.rs diff --git a/zenith_utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs similarity index 97% rename from zenith_utils/src/http/endpoint.rs rename to libs/utils/src/http/endpoint.rs index 
7669f18cd2..77acab496f 100644 --- a/zenith_utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -5,12 +5,11 @@ use anyhow::anyhow; use hyper::header::AUTHORIZATION; use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server}; use lazy_static::lazy_static; +use metrics::{new_common_metric_name, register_int_counter, Encoder, IntCounter, TextEncoder}; use routerify::ext::RequestExt; use routerify::RequestInfo; use routerify::{Middleware, Router, RouterBuilder, RouterService}; use tracing::info; -use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter}; -use zenith_metrics::{Encoder, TextEncoder}; use std::future::Future; use std::net::TcpListener; @@ -36,7 +35,7 @@ async fn prometheus_metrics_handler(_req: Request) -> Result anyhow::Result<()> { /// # Ok(()) diff --git a/zenith_utils/src/seqwait.rs b/libs/utils/src/seqwait.rs similarity index 100% rename from zenith_utils/src/seqwait.rs rename to libs/utils/src/seqwait.rs diff --git a/zenith_utils/src/seqwait_async.rs b/libs/utils/src/seqwait_async.rs similarity index 100% rename from zenith_utils/src/seqwait_async.rs rename to libs/utils/src/seqwait_async.rs diff --git a/zenith_utils/src/shutdown.rs b/libs/utils/src/shutdown.rs similarity index 100% rename from zenith_utils/src/shutdown.rs rename to libs/utils/src/shutdown.rs diff --git a/zenith_utils/src/signals.rs b/libs/utils/src/signals.rs similarity index 100% rename from zenith_utils/src/signals.rs rename to libs/utils/src/signals.rs diff --git a/zenith_utils/src/sock_split.rs b/libs/utils/src/sock_split.rs similarity index 100% rename from zenith_utils/src/sock_split.rs rename to libs/utils/src/sock_split.rs diff --git a/zenith_utils/src/sync.rs b/libs/utils/src/sync.rs similarity index 99% rename from zenith_utils/src/sync.rs rename to libs/utils/src/sync.rs index 5e61480bc3..48f0ff6384 100644 --- a/zenith_utils/src/sync.rs +++ b/libs/utils/src/sync.rs @@ -29,7 +29,7 @@ impl SyncFuture { /// Example: /// /// ``` - /// # use zenith_utils::sync::SyncFuture; + /// # use utils::sync::SyncFuture; /// # use std::future::Future; /// # use tokio::io::AsyncReadExt; /// # diff --git a/zenith_utils/src/tcp_listener.rs b/libs/utils/src/tcp_listener.rs similarity index 100% rename from zenith_utils/src/tcp_listener.rs rename to libs/utils/src/tcp_listener.rs diff --git a/zenith_utils/src/vec_map.rs b/libs/utils/src/vec_map.rs similarity index 100% rename from zenith_utils/src/vec_map.rs rename to libs/utils/src/vec_map.rs diff --git a/zenith_utils/src/zid.rs b/libs/utils/src/zid.rs similarity index 100% rename from zenith_utils/src/zid.rs rename to libs/utils/src/zid.rs diff --git a/zenith_utils/tests/bin_ser_test.rs b/libs/utils/tests/bin_ser_test.rs similarity index 96% rename from zenith_utils/tests/bin_ser_test.rs rename to libs/utils/tests/bin_ser_test.rs index ada43a1189..f357837a55 100644 --- a/zenith_utils/tests/bin_ser_test.rs +++ b/libs/utils/tests/bin_ser_test.rs @@ -2,7 +2,7 @@ use bytes::{Buf, BytesMut}; use hex_literal::hex; use serde::Deserialize; use std::io::Read; -use zenith_utils::bin_ser::LeSer; +use utils::bin_ser::LeSer; #[derive(Debug, PartialEq, Deserialize)] pub struct HeaderData { diff --git a/zenith_utils/tests/cert.pem b/libs/utils/tests/cert.pem similarity index 100% rename from zenith_utils/tests/cert.pem rename to libs/utils/tests/cert.pem diff --git a/zenith_utils/tests/key.pem b/libs/utils/tests/key.pem similarity index 100% rename from zenith_utils/tests/key.pem rename to libs/utils/tests/key.pem diff --git 
a/zenith_utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs similarity index 98% rename from zenith_utils/tests/ssl_test.rs rename to libs/utils/tests/ssl_test.rs index 0e330c44f8..002361667b 100644 --- a/zenith_utils/tests/ssl_test.rs +++ b/libs/utils/tests/ssl_test.rs @@ -9,7 +9,7 @@ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use lazy_static::lazy_static; -use zenith_utils::postgres_backend::{AuthType, Handler, PostgresBackend}; +use utils::postgres_backend::{AuthType, Handler, PostgresBackend}; fn make_tcp_pair() -> (TcpStream, TcpStream) { let listener = TcpListener::bind("127.0.0.1:0").unwrap(); diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 1a533af95f..7b44dafb09 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -47,9 +47,9 @@ rusoto_core = "0.47" rusoto_s3 = "0.47" async-trait = "0.1" -postgres_ffi = { path = "../postgres_ffi" } -zenith_metrics = { path = "../zenith_metrics" } -zenith_utils = { path = "../zenith_utils" } +postgres_ffi = { path = "../libs/postgres_ffi" } +metrics = { path = "../libs/metrics" } +utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 077e7c9f83..78a27e460f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -25,7 +25,7 @@ use crate::repository::Timeline; use crate::DatadirTimelineImpl; use postgres_ffi::xlog_utils::*; use postgres_ffi::*; -use zenith_utils::lsn::Lsn; +use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions diff --git a/pageserver/src/bin/dump_layerfile.rs b/pageserver/src/bin/dump_layerfile.rs index 7cf39566ac..af73ef6bdb 100644 --- a/pageserver/src/bin/dump_layerfile.rs +++ b/pageserver/src/bin/dump_layerfile.rs @@ -7,7 +7,7 @@ use pageserver::layered_repository::dump_layerfile_from_path; use pageserver::page_cache; use pageserver::virtual_file; use std::path::PathBuf; -use zenith_utils::GIT_VERSION; +use utils::GIT_VERSION; fn main() -> Result<()> { let arg_matches = App::new("Zenith dump_layerfile utility") diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 1610a26239..867bea1b06 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -2,14 +2,6 @@ use std::{env, path::Path, str::FromStr}; use tracing::*; -use zenith_utils::{ - auth::JwtAuth, - logging, - postgres_backend::AuthType, - tcp_listener, - zid::{ZTenantId, ZTimelineId}, - GIT_VERSION, -}; use anyhow::{bail, Context, Result}; @@ -25,12 +17,20 @@ use pageserver::{ thread_mgr::ThreadKind, timelines, virtual_file, LOG_FILE_NAME, }; -use zenith_utils::http::endpoint; -use zenith_utils::shutdown::exit_now; -use zenith_utils::signals::{self, Signal}; +use utils::{ + auth::JwtAuth, + http::endpoint, + logging, + postgres_backend::AuthType, + shutdown::exit_now, + signals::{self, Signal}, + tcp_listener, + zid::{ZTenantId, ZTimelineId}, + GIT_VERSION, +}; fn main() -> anyhow::Result<()> { - zenith_metrics::set_common_metrics_prefix("pageserver"); + metrics::set_common_metrics_prefix("pageserver"); let arg_matches = App::new("Zenith page server") .about("Materializes WAL stream to pages and serves them to the postgres") .version(GIT_VERSION) diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs index 
bfbb6179c5..fae5e5c2e3 100644 --- a/pageserver/src/bin/update_metadata.rs +++ b/pageserver/src/bin/update_metadata.rs @@ -6,8 +6,7 @@ use clap::{App, Arg}; use pageserver::layered_repository::metadata::TimelineMetadata; use std::path::PathBuf; use std::str::FromStr; -use zenith_utils::lsn::Lsn; -use zenith_utils::GIT_VERSION; +use utils::{lsn::Lsn, GIT_VERSION}; fn main() -> Result<()> { let arg_matches = App::new("Zenith update metadata utility") diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 067073cd9b..0cba3f48f8 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -7,8 +7,10 @@ use anyhow::{bail, ensure, Context, Result}; use toml_edit; use toml_edit::{Document, Item}; -use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; +use utils::{ + postgres_backend::AuthType, + zid::{ZNodeId, ZTenantId, ZTimelineId}, +}; use std::convert::TryInto; use std::env; diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index d1dfb911ba..9b51e48477 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -1,6 +1,6 @@ use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; -use zenith_utils::{ +use utils::{ lsn::Lsn, zid::{ZNodeId, ZTenantId, ZTimelineId}, }; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f49b1d7ba3..82ea5d1d09 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -4,19 +4,6 @@ use anyhow::{Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use tracing::*; -use zenith_utils::auth::JwtAuth; -use zenith_utils::http::endpoint::attach_openapi_ui; -use zenith_utils::http::endpoint::auth_middleware; -use zenith_utils::http::endpoint::check_permission; -use zenith_utils::http::error::ApiError; -use zenith_utils::http::{ - endpoint, - error::HttpErrorBody, - json::{json_request, json_response}, - request::parse_request_param, -}; -use zenith_utils::http::{RequestExt, RouterBuilder}; -use zenith_utils::zid::{ZTenantTimelineId, ZTimelineId}; use super::models::{ StatusResponse, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest, @@ -27,7 +14,18 @@ use crate::remote_storage::{ }; use crate::repository::Repository; use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; -use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId}; +use crate::{config::PageServerConf, tenant_mgr, timelines}; +use utils::{ + auth::JwtAuth, + http::{ + endpoint::{self, attach_openapi_ui, auth_middleware, check_permission}, + error::{ApiError, HttpErrorBody}, + json::{json_request, json_response}, + request::parse_request_param, + RequestExt, RouterBuilder, + }, + zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, +}; struct State { conf: &'static PageServerConf, diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 232892973e..8f49903e6c 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -20,7 +20,7 @@ use postgres_ffi::waldecoder::*; use postgres_ffi::xlog_utils::*; use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; use postgres_ffi::{Oid, TransactionId}; -use zenith_utils::lsn::Lsn; +use utils::lsn::Lsn; /// /// Import all relation data pages from local disk into the repository. 
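
The bulk of this patch is the mechanical rewrite visible in the hunks above and below: every `use zenith_utils::...` line becomes an import from the renamed `utils` crate, usually collapsed into a single nested `use utils::{...}` group. A minimal sketch of the resulting style, reusing the same `lsn` and `zid` modules the pageserver files import (the helper function itself is invented here purely for illustration and is not part of the patch):

```
use utils::{
    lsn::Lsn,
    zid::{ZTenantId, ZTimelineId},
};

// Hypothetical helper, not from the patch: shows the renamed crate in use.
// Lsn, ZTenantId and ZTimelineId all implement Display in this codebase.
fn describe(tenant: ZTenantId, timeline: ZTimelineId, lsn: Lsn) -> String {
    format!("tenant {} timeline {} at {}", tenant, timeline, lsn)
}
```

The matching Cargo.toml change is the same everywhere: `zenith_utils = { path = "../zenith_utils" }` becomes `utils = { path = "../libs/utils" }`, as shown in the pageserver, proxy, safekeeper and control_plane manifests.
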
diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 59a3def1fb..7525bdb94e 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -46,15 +46,17 @@ use crate::virtual_file::VirtualFile; use crate::walreceiver::IS_WAL_RECEIVER; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; -use crate::{ZTenantId, ZTimelineId}; -use zenith_metrics::{ +use metrics::{ register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, }; -use zenith_utils::crashsafe_dir; -use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; -use zenith_utils::seqwait::SeqWait; +use utils::{ + crashsafe_dir, + lsn::{AtomicLsn, Lsn, RecordLsn}, + seqwait::SeqWait, + zid::{ZTenantId, ZTimelineId}, +}; mod blob_io; pub mod block_io; diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 03b7e453b3..c5530a5789 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -35,7 +35,6 @@ use crate::page_cache::{PageReadGuard, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::virtual_file::VirtualFile; use crate::walrecord; -use crate::{ZTenantId, ZTimelineId}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use serde::{Deserialize, Serialize}; @@ -51,8 +50,11 @@ use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; -use zenith_utils::bin_ser::BeSer; -use zenith_utils::lsn::Lsn; +use utils::{ + bin_ser::BeSer, + lsn::Lsn, + zid::{ZTenantId, ZTimelineId}, +}; /// /// Header stored in the beginning of the file diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/layered_repository/ephemeral_file.rs index a2f8cda461..9537d3939c 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/layered_repository/ephemeral_file.rs @@ -17,8 +17,7 @@ use std::ops::DerefMut; use std::path::PathBuf; use std::sync::{Arc, RwLock}; use tracing::*; -use zenith_utils::zid::ZTenantId; -use zenith_utils::zid::ZTimelineId; +use utils::zid::{ZTenantId, ZTimelineId}; use std::os::unix::fs::FileExt; diff --git a/pageserver/src/layered_repository/filename.rs b/pageserver/src/layered_repository/filename.rs index 497912b408..f088088277 100644 --- a/pageserver/src/layered_repository/filename.rs +++ b/pageserver/src/layered_repository/filename.rs @@ -8,7 +8,7 @@ use std::fmt; use std::ops::Range; use std::path::PathBuf; -use zenith_utils::lsn::Lsn; +use utils::lsn::Lsn; // Note: LayeredTimeline::load_layer_map() relies on this sort order #[derive(Debug, PartialEq, Eq, Clone)] diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index fa91198a79..0e38d46e7a 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -30,7 +30,6 @@ use crate::layered_repository::storage_layer::{ use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value, KEY_SIZE}; use crate::virtual_file::VirtualFile; -use crate::{ZTenantId, ZTimelineId}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; @@ -44,8 +43,11 @@ use std::path::{Path, PathBuf}; use std::sync::{RwLock, RwLockReadGuard}; 
use tracing::*; -use zenith_utils::bin_ser::BeSer; -use zenith_utils::lsn::Lsn; +use utils::{ + bin_ser::BeSer, + lsn::Lsn, + zid::{ZTenantId, ZTimelineId}, +}; /// /// Header stored in the beginning of the file diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 33e1eabd8e..714a0bc579 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -14,19 +14,21 @@ use crate::layered_repository::storage_layer::{ }; use crate::repository::{Key, Value}; use crate::walrecord; -use crate::{ZTenantId, ZTimelineId}; use anyhow::{bail, ensure, Result}; use std::collections::HashMap; use tracing::*; +use utils::{ + bin_ser::BeSer, + lsn::Lsn, + vec_map::VecMap, + zid::{ZTenantId, ZTimelineId}, +}; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods use std::fmt::Write as _; use std::ops::Range; use std::path::PathBuf; use std::sync::RwLock; -use zenith_utils::bin_ser::BeSer; -use zenith_utils::lsn::Lsn; -use zenith_utils::vec_map::VecMap; pub struct InMemoryLayer { conf: &'static PageServerConf, diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 3984ee550f..03ee8b8ef1 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -16,12 +16,12 @@ use crate::layered_repository::InMemoryLayer; use crate::repository::Key; use anyhow::Result; use lazy_static::lazy_static; +use metrics::{register_int_gauge, IntGauge}; use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; use tracing::*; -use zenith_metrics::{register_int_gauge, IntGauge}; -use zenith_utils::lsn::Lsn; +use utils::lsn::Lsn; lazy_static! 
{ static ref NUM_ONDISK_LAYERS: IntGauge = diff --git a/pageserver/src/layered_repository/metadata.rs b/pageserver/src/layered_repository/metadata.rs index 7daf899ba2..0b47f8d697 100644 --- a/pageserver/src/layered_repository/metadata.rs +++ b/pageserver/src/layered_repository/metadata.rs @@ -10,7 +10,7 @@ use std::path::PathBuf; use anyhow::ensure; use serde::{Deserialize, Serialize}; -use zenith_utils::{ +use utils::{ bin_ser::BeSer, lsn::Lsn, zid::{ZTenantId, ZTimelineId}, diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index e413f311c3..aad631c5c4 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -4,13 +4,15 @@ use crate::repository::{Key, Value}; use crate::walrecord::ZenithWalRecord; -use crate::{ZTenantId, ZTimelineId}; use anyhow::Result; use bytes::Bytes; use std::ops::Range; use std::path::PathBuf; -use zenith_utils::lsn::Lsn; +use utils::{ + lsn::Lsn, + zid::{ZTenantId, ZTimelineId}, +}; pub fn range_overlaps(a: &Range, b: &Range) -> bool where diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 6dddef5f27..e6ac159ef2 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -22,13 +22,10 @@ pub mod walredo; use lazy_static::lazy_static; use tracing::info; -use zenith_metrics::{register_int_gauge_vec, IntGaugeVec}; -use zenith_utils::{ - postgres_backend, - zid::{ZTenantId, ZTimelineId}, -}; +use utils::postgres_backend; use crate::thread_mgr::ThreadKind; +use metrics::{register_int_gauge_vec, IntGaugeVec}; use layered_repository::LayeredRepository; use pgdatadir_mapping::DatadirTimeline; diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index bd44384a44..0c179b95c5 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -47,7 +47,7 @@ use std::{ use once_cell::sync::OnceCell; use tracing::error; -use zenith_utils::{ +use utils::{ lsn::Lsn, zid::{ZTenantId, ZTimelineId}, }; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index c09b032e48..8f5ea2e845 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -20,15 +20,13 @@ use std::str; use std::str::FromStr; use std::sync::{Arc, RwLockReadGuard}; use tracing::*; -use zenith_metrics::{register_histogram_vec, HistogramVec}; -use zenith_utils::auth::{self, JwtAuth}; -use zenith_utils::auth::{Claims, Scope}; -use zenith_utils::lsn::Lsn; -use zenith_utils::postgres_backend::is_socket_read_timed_out; -use zenith_utils::postgres_backend::PostgresBackend; -use zenith_utils::postgres_backend::{self, AuthType}; -use zenith_utils::pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use utils::{ + auth::{self, Claims, JwtAuth, Scope}, + lsn::Lsn, + postgres_backend::{self, is_socket_read_timed_out, AuthType, PostgresBackend}, + pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}, + zid::{ZTenantId, ZTimelineId}, +}; use crate::basebackup; use crate::config::PageServerConf; @@ -41,6 +39,7 @@ use crate::thread_mgr; use crate::thread_mgr::ThreadKind; use crate::walreceiver; use crate::CheckpointConfig; +use metrics::{register_histogram_vec, HistogramVec}; // Wrapped in libpq CopyData enum PagestreamFeMessage { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0b9ea7c7a7..071eccc05d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ 
b/pageserver/src/pgdatadir_mapping.rs @@ -20,8 +20,7 @@ use std::ops::Range; use std::sync::atomic::{AtomicIsize, Ordering}; use std::sync::{Arc, Mutex, RwLockReadGuard}; use tracing::{debug, error, trace, warn}; -use zenith_utils::bin_ser::BeSer; -use zenith_utils::lsn::Lsn; +use utils::{bin_ser::BeSer, lsn::Lsn}; /// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type. pub type BlockNumber = u32; @@ -1212,7 +1211,7 @@ pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { #[cfg(test)] pub fn create_test_timeline( repo: R, - timeline_id: zenith_utils::zid::ZTimelineId, + timeline_id: utils::zid::ZTimelineId, ) -> Result>> { let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; let tline = DatadirTimeline::new(tline, 256 * 1024); diff --git a/pageserver/src/remote_storage.rs b/pageserver/src/remote_storage.rs index effc8dcdf4..8a09f7b9ca 100644 --- a/pageserver/src/remote_storage.rs +++ b/pageserver/src/remote_storage.rs @@ -117,7 +117,7 @@ use crate::{ metadata::{TimelineMetadata, METADATA_FILE_NAME}, }, }; -use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; /// A timeline status to share with pageserver's sync counterpart, /// after comparing local and remote timeline state. diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index 649e563dbc..4d1ec2e225 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -86,10 +86,7 @@ use self::{ index::{IndexPart, RemoteIndex, RemoteTimeline, RemoteTimelineIndex}, upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, }; -use super::{ - LocalTimelineInitStatus, LocalTimelineInitStatuses, RemoteStorage, SyncStartupData, - ZTenantTimelineId, -}; +use super::{LocalTimelineInitStatus, LocalTimelineInitStatuses, RemoteStorage, SyncStartupData}; use crate::{ config::PageServerConf, layered_repository::metadata::{metadata_path, TimelineMetadata}, @@ -99,11 +96,11 @@ use crate::{ thread_mgr::ThreadKind, }; -use zenith_metrics::{ +use metrics::{ register_histogram_vec, register_int_counter, register_int_gauge, HistogramVec, IntCounter, IntGauge, }; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; pub use self::download::download_index_part; @@ -145,7 +142,7 @@ mod sync_queue { use tracing::{debug, warn}; use super::SyncTask; - use zenith_utils::zid::ZTenantTimelineId; + use utils::zid::ZTenantTimelineId; static SENDER: OnceCell> = OnceCell::new(); static LENGTH: AtomicUsize = AtomicUsize::new(0); @@ -1197,7 +1194,7 @@ fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Optio #[cfg(test)] mod test_utils { - use zenith_utils::lsn::Lsn; + use utils::lsn::Lsn; use crate::repository::repo_harness::RepoHarness; @@ -1246,7 +1243,7 @@ mod tests { use std::collections::BTreeSet; use super::{test_utils::dummy_metadata, *}; - use zenith_utils::lsn::Lsn; + use utils::lsn::Lsn; #[test] fn download_sync_tasks_merge() { diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/remote_storage/storage_sync/download.rs index eb805cd0cc..7fe25ab36e 100644 --- a/pageserver/src/remote_storage/storage_sync/download.rs +++ b/pageserver/src/remote_storage/storage_sync/download.rs @@ -12,9 +12,10 @@ use crate::{ layered_repository::metadata::metadata_path, remote_storage::{ storage_sync::{sync_queue, 
SyncTask}, - RemoteStorage, ZTenantTimelineId, + RemoteStorage, }, }; +use utils::zid::ZTenantTimelineId; use super::{ index::{IndexPart, RemoteTimeline}, @@ -182,7 +183,7 @@ mod tests { use std::collections::{BTreeSet, HashSet}; use tempfile::tempdir; - use zenith_utils::lsn::Lsn; + use utils::lsn::Lsn; use crate::{ remote_storage::{ diff --git a/pageserver/src/remote_storage/storage_sync/index.rs b/pageserver/src/remote_storage/storage_sync/index.rs index 918bda1039..d847e03a24 100644 --- a/pageserver/src/remote_storage/storage_sync/index.rs +++ b/pageserver/src/remote_storage/storage_sync/index.rs @@ -13,11 +13,8 @@ use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use tokio::sync::RwLock; -use crate::{ - config::PageServerConf, layered_repository::metadata::TimelineMetadata, - remote_storage::ZTenantTimelineId, -}; -use zenith_utils::lsn::Lsn; +use crate::{config::PageServerConf, layered_repository::metadata::TimelineMetadata}; +use utils::{lsn::Lsn, zid::ZTenantTimelineId}; /// A part of the filesystem path, that needs a root to become a path again. #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/remote_storage/storage_sync/upload.rs index b4a2f6f989..d2ff77e92e 100644 --- a/pageserver/src/remote_storage/storage_sync/upload.rs +++ b/pageserver/src/remote_storage/storage_sync/upload.rs @@ -12,9 +12,10 @@ use crate::{ layered_repository::metadata::metadata_path, remote_storage::{ storage_sync::{index::RemoteTimeline, sync_queue, SyncTask}, - RemoteStorage, ZTenantTimelineId, + RemoteStorage, }, }; +use utils::zid::ZTenantTimelineId; use super::{index::IndexPart, SyncData, TimelineUpload}; @@ -208,7 +209,7 @@ mod tests { use std::collections::{BTreeSet, HashSet}; use tempfile::tempdir; - use zenith_utils::lsn::Lsn; + use utils::lsn::Lsn; use crate::{ remote_storage::{ diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index d75b4efe71..fc438cce9c 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -11,8 +11,10 @@ use std::fmt::Display; use std::ops::{AddAssign, Range}; use std::sync::{Arc, RwLockReadGuard}; use std::time::Duration; -use zenith_utils::lsn::{Lsn, RecordLsn}; -use zenith_utils::zid::ZTimelineId; +use utils::{ + lsn::{Lsn, RecordLsn}, + zid::ZTimelineId, +}; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] /// Key used in the Repository kv-store. @@ -431,7 +433,7 @@ pub mod repo_harness { use super::*; use hex_literal::hex; - use zenith_utils::zid::ZTenantId; + use utils::zid::ZTenantId; pub const TIMELINE_ID: ZTimelineId = ZTimelineId::from_array(hex!("11223344556677881122334455667788")); diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 71e85c58e6..33bb4dc2e0 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -20,7 +20,7 @@ use std::collections::HashMap; use std::fmt; use std::sync::{Arc, Mutex, MutexGuard}; use tracing::*; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use utils::zid::{ZTenantId, ZTimelineId}; lazy_static! 
{ static ref TENANTS: Mutex> = Mutex::new(HashMap::new()); diff --git a/pageserver/src/tenant_threads.rs b/pageserver/src/tenant_threads.rs index 0d9a94cc5b..4dcc15f817 100644 --- a/pageserver/src/tenant_threads.rs +++ b/pageserver/src/tenant_threads.rs @@ -7,7 +7,7 @@ use crate::tenant_mgr::TenantState; use anyhow::Result; use std::time::Duration; use tracing::*; -use zenith_utils::zid::ZTenantId; +use utils::zid::ZTenantId; /// /// Compaction thread's main loop diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index 4484bb1db1..2866c6be44 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -47,7 +47,7 @@ use tracing::{debug, error, info, warn}; use lazy_static::lazy_static; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use utils::zid::{ZTenantId, ZTimelineId}; use crate::shutdown_pageserver; diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 586d27d5b1..abbabc8b31 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -14,9 +14,11 @@ use std::{ }; use tracing::*; -use zenith_utils::lsn::Lsn; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; -use zenith_utils::{crashsafe_dir, logging}; +use utils::{ + crashsafe_dir, logging, + lsn::Lsn, + zid::{ZTenantId, ZTimelineId}, +}; use crate::{ config::PageServerConf, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 64f9db2338..4ce245a74f 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -11,15 +11,15 @@ //! src/backend/storage/file/fd.c //! use lazy_static::lazy_static; +use once_cell::sync::OnceCell; use std::fs::{File, OpenOptions}; use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write}; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{RwLock, RwLockWriteGuard}; -use zenith_metrics::{register_histogram_vec, register_int_gauge_vec, HistogramVec, IntGaugeVec}; -use once_cell::sync::OnceCell; +use metrics::{register_histogram_vec, register_int_gauge_vec, HistogramVec, IntGaugeVec}; // Metrics collected on disk IO operations const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index c6c6e89854..583cdecb1d 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -38,7 +38,7 @@ use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment; use postgres_ffi::xlog_utils::*; use postgres_ffi::TransactionId; use postgres_ffi::{pg_constants, CheckPoint}; -use zenith_utils::lsn::Lsn; +use utils::lsn::Lsn; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index e09af09820..ce4e4d45fb 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -29,11 +29,11 @@ use tokio_postgres::replication::ReplicationStream; use tokio_postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow}; use tokio_stream::StreamExt; use tracing::*; -use zenith_utils::lsn::Lsn; -use zenith_utils::pq_proto::ZenithFeedback; -use zenith_utils::zid::ZTenantId; -use zenith_utils::zid::ZTenantTimelineId; -use zenith_utils::zid::ZTimelineId; +use utils::{ + lsn::Lsn, + pq_proto::ZenithFeedback, + zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, +}; // // We keep one WAL Receiver active per timeline. 
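
The `zenith_metrics` crate is renamed to `metrics` in the same way; call sites such as `layer_map.rs` and `virtual_file.rs` above only swap the crate name in their imports. A sketch of that pattern, assuming the prometheus-style re-exports the crate already provides (the gauge name below is made up for illustration):

```
use lazy_static::lazy_static;
use metrics::{register_int_gauge, IntGauge};

lazy_static! {
    // Illustrative metric only; the real gauge names are defined in
    // layer_map.rs and virtual_file.rs and are untouched by this rename.
    static ref EXAMPLE_GAUGE: IntGauge =
        register_int_gauge!("example_gauge", "sketch of the renamed metrics crate")
            .expect("failed to register metric");
}

fn bump_example_gauge() {
    EXAMPLE_GAUGE.inc();
}
```
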
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index b7c6ecf726..dcffcda6bb 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -35,17 +35,14 @@ use std::sync::Mutex; use std::time::Duration; use std::time::Instant; use tracing::*; -use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter}; -use zenith_utils::bin_ser::BeSer; -use zenith_utils::lsn::Lsn; -use zenith_utils::nonblock::set_nonblock; -use zenith_utils::zid::ZTenantId; +use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock, zid::ZTenantId}; use crate::config::PageServerConf; use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::ZenithWalRecord; +use metrics::{register_histogram, register_int_counter, Histogram, IntCounter}; use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift; use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset; use postgres_ffi::nonrelfile_utils::mx_offset_to_member_offset; diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index a4bd99db38..81086a0cad 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -33,8 +33,8 @@ tokio = { version = "1.17", features = ["macros"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } tokio-rustls = "0.23.0" -zenith_utils = { path = "../zenith_utils" } -zenith_metrics = { path = "../zenith_metrics" } +utils = { path = "../libs/utils" } +metrics = { path = "../libs/metrics" } workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index bda14d67a1..4c54e2f9eb 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -12,7 +12,7 @@ use crate::waiters; use std::io; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; +use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; pub use credentials::ClientCredentials; diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 0fafaa2f47..bcfd94a9ed 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -5,7 +5,7 @@ use crate::stream::PqStream; use crate::{sasl, scram}; use std::io; use tokio::io::{AsyncRead, AsyncWrite}; -use zenith_utils::pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; +use utils::pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; /// Every authentication selector is supposed to implement this trait. pub trait AuthMethod { diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 07d3bcc71a..a801313635 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -4,7 +4,7 @@ use parking_lot::Mutex; use std::net::SocketAddr; use tokio::net::TcpStream; use tokio_postgres::{CancelToken, NoTls}; -use zenith_utils::pq_proto::CancelKeyData; +use utils::pq_proto::CancelKeyData; /// Enables serving `CancelRequest`s. 
#[derive(Default)] diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 33d134678f..5a75718742 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -1,10 +1,7 @@ use anyhow::anyhow; use hyper::{Body, Request, Response, StatusCode}; use std::net::TcpListener; -use zenith_utils::http::endpoint; -use zenith_utils::http::error::ApiError; -use zenith_utils::http::json::json_response; -use zenith_utils::http::{RouterBuilder, RouterService}; +use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 862152bb7b..8df46619ec 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -30,7 +30,7 @@ use config::ProxyConfig; use futures::FutureExt; use std::future::Future; use tokio::{net::TcpListener, task::JoinError}; -use zenith_utils::GIT_VERSION; +use utils::GIT_VERSION; use crate::config::{ClientAuthMethod, RouterConfig}; @@ -43,7 +43,7 @@ async fn flatten_err( #[tokio::main] async fn main() -> anyhow::Result<()> { - zenith_metrics::set_common_metrics_prefix("zenith_proxy"); + metrics::set_common_metrics_prefix("zenith_proxy"); let arg_matches = App::new("Zenith proxy/router") .version(GIT_VERSION) .arg( diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index ab6fdff040..23ad8a2013 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -5,7 +5,7 @@ use std::{ net::{TcpListener, TcpStream}, thread, }; -use zenith_utils::{ +use utils::{ postgres_backend::{self, AuthType, PostgresBackend}, pq_proto::{BeMessage, SINGLE_COL_ROWDESC}, }; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 788179252b..f7de1618df 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -5,10 +5,10 @@ use crate::stream::{MetricsStream, PqStream, Stream}; use anyhow::{bail, Context}; use futures::TryFutureExt; use lazy_static::lazy_static; +use metrics::{new_common_metric_name, register_int_counter, IntCounter}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter}; -use zenith_utils::pq_proto::{BeMessage as Be, *}; +use utils::pq_proto::{BeMessage as Be, *}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index b1ae8cc426..58be6268fe 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -1,9 +1,9 @@ //! Definitions for SASL messages. use crate::parse::{split_at_const, split_cstr}; -use zenith_utils::pq_proto::{BeAuthenticationSaslMessage, BeMessage}; +use utils::pq_proto::{BeAuthenticationSaslMessage, BeMessage}; -/// SASL-specific payload of [`PasswordMessage`](zenith_utils::pq_proto::FeMessage::PasswordMessage). +/// SASL-specific payload of [`PasswordMessage`](utils::pq_proto::FeMessage::PasswordMessage). #[derive(Debug)] pub struct FirstMessage<'a> { /// Authentication method, e.g. `"SCRAM-SHA-256"`. @@ -31,7 +31,7 @@ impl<'a> FirstMessage<'a> { /// A single SASL message. /// This struct is deliberately decoupled from lower-level -/// [`BeAuthenticationSaslMessage`](zenith_utils::pq_proto::BeAuthenticationSaslMessage). +/// [`BeAuthenticationSaslMessage`](utils::pq_proto::BeAuthenticationSaslMessage). #[derive(Debug)] pub(super) enum ServerMessage { /// We expect to see more steps. 
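
Each binary keeps its startup call to `set_common_metrics_prefix`, now reached through the renamed crate: `metrics::set_common_metrics_prefix("pageserver")`, `("zenith_proxy")` and `("safekeeper")` in the hunks above. A minimal sketch of that startup step, with everything after the prefix call elided (the real proxy `main()` is async and parses CLI arguments first):

```
fn main() -> anyhow::Result<()> {
    // Called at the very top of main(), matching pageserver, proxy and
    // safekeeper in this patch, so the prefix applies to metrics registered later.
    metrics::set_common_metrics_prefix("zenith_proxy");

    // ... rest of startup as in proxy/src/main.rs ...
    Ok(())
}
```
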
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index fb0be84584..42b0185fde 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -9,7 +9,7 @@ use std::{io, task}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, ReadBuf}; use tokio_rustls::server::TlsStream; -use zenith_utils::pq_proto::{BeMessage, FeMessage, FeStartupPacket}; +use utils::pq_proto::{BeMessage, FeMessage, FeStartupPacket}; pin_project! { /// Stream wrapper which implements libpq's protocol. diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index ca5e2a6b55..76d40cdc2e 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -33,9 +33,9 @@ tokio-util = { version = "0.7", features = ["io"] } rusoto_core = "0.47" rusoto_s3 = "0.47" -postgres_ffi = { path = "../postgres_ffi" } -zenith_metrics = { path = "../zenith_metrics" } -zenith_utils = { path = "../zenith_utils" } +postgres_ffi = { path = "../libs/postgres_ffi" } +metrics = { path = "../libs/metrics" } +utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e191cb52fd..7434f921cb 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -10,11 +10,9 @@ use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; use std::thread; +use tokio::sync::mpsc; use tracing::*; use url::{ParseError, Url}; -use zenith_utils::http::endpoint; -use zenith_utils::zid::ZNodeId; -use zenith_utils::{logging, tcp_listener, GIT_VERSION}; use safekeeper::control_file::{self}; use safekeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR}; @@ -23,15 +21,15 @@ use safekeeper::s3_offload; use safekeeper::wal_service; use safekeeper::SafeKeeperConf; use safekeeper::{broker, callmemaybe}; -use tokio::sync::mpsc; -use zenith_utils::shutdown::exit_now; -use zenith_utils::signals; +use utils::{ + http::endpoint, logging, shutdown::exit_now, signals, tcp_listener, zid::ZNodeId, GIT_VERSION, +}; const LOCK_FILE_NAME: &str = "safekeeper.lock"; const ID_FILE_NAME: &str = "safekeeper.id"; fn main() -> Result<()> { - zenith_metrics::set_common_metrics_prefix("safekeeper"); + metrics::set_common_metrics_prefix("safekeeper"); let arg_matches = App::new("Zenith safekeeper") .about("Store WAL stream to local file system and push it to WAL receivers") .version(GIT_VERSION) diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 147497d673..b84b5cf789 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -17,14 +17,12 @@ use std::time::Duration; use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; -use zenith_utils::zid::ZTenantId; -use zenith_utils::zid::ZTimelineId; -use zenith_utils::{ - lsn::Lsn, - zid::{ZNodeId, ZTenantTimelineId}, -}; use crate::{safekeeper::Term, timeline::GlobalTimelines, SafeKeeperConf}; +use utils::{ + lsn::Lsn, + zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, +}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; diff --git a/safekeeper/src/callmemaybe.rs b/safekeeper/src/callmemaybe.rs index 1e52ec927b..8c3fbe26ba 100644 --- a/safekeeper/src/callmemaybe.rs +++ b/safekeeper/src/callmemaybe.rs @@ -16,8 +16,10 @@ use tokio::sync::mpsc::UnboundedReceiver; use tokio::task; use tokio_postgres::NoTls; use tracing::*; -use zenith_utils::connstring::connection_host_port; -use zenith_utils::zid::{ZTenantId, 
ZTimelineId}; +use utils::{ + connstring::connection_host_port, + zid::{ZTenantId, ZTimelineId}, +}; async fn request_callback( pageserver_connstr: String, diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 7cc53edeb0..c49b4c058a 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -10,13 +10,11 @@ use std::ops::Deref; use std::path::{Path, PathBuf}; use tracing::*; -use zenith_metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; -use zenith_utils::bin_ser::LeSer; - -use zenith_utils::zid::ZTenantTimelineId; use crate::control_file_upgrade::upgrade_control_file; use crate::safekeeper::{SafeKeeperState, SK_FORMAT_VERSION, SK_MAGIC}; +use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; +use utils::{bin_ser::LeSer, zid::ZTenantTimelineId}; use crate::SafeKeeperConf; @@ -251,10 +249,10 @@ impl Storage for FileStorage { mod test { use super::FileStorage; use super::*; - use crate::{safekeeper::SafeKeeperState, SafeKeeperConf, ZTenantTimelineId}; + use crate::{safekeeper::SafeKeeperState, SafeKeeperConf}; use anyhow::Result; use std::fs; - use zenith_utils::lsn::Lsn; + use utils::{lsn::Lsn, zid::ZTenantTimelineId}; fn stub_conf() -> SafeKeeperConf { let workdir = tempfile::tempdir().unwrap().into_path(); diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 9effe42f8d..0cb14298cb 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -5,7 +5,7 @@ use crate::safekeeper::{ use anyhow::{bail, Result}; use serde::{Deserialize, Serialize}; use tracing::*; -use zenith_utils::{ +use utils::{ bin_ser::LeSer, lsn::Lsn, pq_proto::SystemId, diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index bb14049787..7d86523b0e 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -14,11 +14,12 @@ use regex::Regex; use std::str::FromStr; use std::sync::Arc; use tracing::info; -use zenith_utils::lsn::Lsn; -use zenith_utils::postgres_backend; -use zenith_utils::postgres_backend::PostgresBackend; -use zenith_utils::pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; -use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +use utils::{ + lsn::Lsn, + postgres_backend::{self, PostgresBackend}, + pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}, + zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, +}; use crate::callmemaybe::CallmeEvent; use tokio::sync::mpsc::UnboundedSender; diff --git a/safekeeper/src/http/models.rs b/safekeeper/src/http/models.rs index 8a6ed7a812..ca18e64096 100644 --- a/safekeeper/src/http/models.rs +++ b/safekeeper/src/http/models.rs @@ -1,5 +1,5 @@ use serde::{Deserialize, Serialize}; -use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; +use utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 26b23cddcc..2d22332db9 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -4,21 +4,22 @@ use serde::Serialize; use serde::Serializer; use std::fmt::Display; use std::sync::Arc; -use zenith_utils::http::json::json_request; -use zenith_utils::http::{RequestExt, RouterBuilder}; -use zenith_utils::lsn::Lsn; -use zenith_utils::zid::ZNodeId; -use zenith_utils::zid::ZTenantTimelineId; use crate::safekeeper::Term; use 
crate::safekeeper::TermHistory; use crate::timeline::GlobalTimelines; use crate::SafeKeeperConf; -use zenith_utils::http::endpoint; -use zenith_utils::http::error::ApiError; -use zenith_utils::http::json::json_response; -use zenith_utils::http::request::parse_request_param; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use utils::{ + http::{ + endpoint, + error::ApiError, + json::{json_request, json_response}, + request::parse_request_param, + RequestExt, RouterBuilder, + }, + lsn::Lsn, + zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, +}; use super::models::TimelineCreateRequest; diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index ad5d790105..407fafd990 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -22,9 +22,11 @@ use crate::timeline::TimelineTools; use postgres_ffi::pg_constants; use postgres_ffi::xlog_utils; use postgres_ffi::{uint32, uint64, Oid, XLogRecord}; -use zenith_utils::lsn::Lsn; -use zenith_utils::postgres_backend::PostgresBackend; -use zenith_utils::pq_proto::{BeMessage, RowDescriptor, TEXT_OID}; +use utils::{ + lsn::Lsn, + postgres_backend::PostgresBackend, + pq_proto::{BeMessage, RowDescriptor, TEXT_OID}, +}; #[derive(Serialize, Deserialize, Debug)] pub struct AppendLogicalMessage { @@ -191,7 +193,7 @@ struct XlLogicalMessage { impl XlLogicalMessage { pub fn encode(&self) -> Bytes { - use zenith_utils::bin_ser::LeSer; + use utils::bin_ser::LeSer; self.ser().unwrap().into() } } diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 69423d42d8..8951e8f680 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -3,7 +3,7 @@ use std::path::PathBuf; use std::time::Duration; use url::Url; -use zenith_utils::zid::{ZNodeId, ZTenantTimelineId}; +use utils::zid::{ZNodeId, ZTenantTimelineId}; pub mod broker; pub mod callmemaybe; diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index e6b12a0d81..3ad99ab0df 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -7,7 +7,6 @@ use anyhow::{anyhow, bail, Result}; use bytes::BytesMut; use tokio::sync::mpsc::UnboundedSender; use tracing::*; -use zenith_utils::sock_split::ReadStream; use crate::timeline::Timeline; @@ -23,8 +22,11 @@ use crate::safekeeper::ProposerAcceptorMessage; use crate::handler::SafekeeperPostgresHandler; use crate::timeline::TimelineTools; -use zenith_utils::postgres_backend::PostgresBackend; -use zenith_utils::pq_proto::{BeMessage, FeMessage}; +use utils::{ + postgres_backend::PostgresBackend, + pq_proto::{BeMessage, FeMessage}, + sock_split::ReadStream, +}; use crate::callmemaybe::CallmeEvent; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index cf56261ee6..59174f34a2 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -11,8 +11,6 @@ use std::cmp::min; use std::fmt; use std::io::Read; use tracing::*; -use zenith_utils::zid::ZNodeId; -use zenith_utils::zid::ZTenantTimelineId; use lazy_static::lazy_static; @@ -20,13 +18,14 @@ use crate::broker::SafekeeperInfo; use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; +use metrics::{register_gauge_vec, Gauge, GaugeVec}; use postgres_ffi::xlog_utils::MAX_SEND_SIZE; -use zenith_metrics::{register_gauge_vec, Gauge, GaugeVec}; -use zenith_utils::bin_ser::LeSer; -use zenith_utils::lsn::Lsn; -use zenith_utils::pq_proto::SystemId; -use zenith_utils::pq_proto::ZenithFeedback; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use utils::{ + 
bin_ser::LeSer, + lsn::Lsn, + pq_proto::{SystemId, ZenithFeedback}, + zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, +}; pub const SK_MAGIC: u32 = 0xcafeceefu32; pub const SK_FORMAT_VERSION: u32 = 4; diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index f12fb5cb4a..960f70d154 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -19,13 +19,14 @@ use std::time::Duration; use std::{str, thread}; use tokio::sync::mpsc::UnboundedSender; use tracing::*; -use zenith_utils::bin_ser::BeSer; -use zenith_utils::lsn::Lsn; -use zenith_utils::postgres_backend::PostgresBackend; -use zenith_utils::pq_proto::{BeMessage, FeMessage, WalSndKeepAlive, XLogDataBody, ZenithFeedback}; -use zenith_utils::sock_split::ReadStream; - -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use utils::{ + bin_ser::BeSer, + lsn::Lsn, + postgres_backend::PostgresBackend, + pq_proto::{BeMessage, FeMessage, WalSndKeepAlive, XLogDataBody, ZenithFeedback}, + sock_split::ReadStream, + zid::{ZTenantId, ZTimelineId}, +}; // See: https://www.postgresql.org/docs/13/protocol-replication.html const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 777db7eb2b..fbae34251c 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -14,8 +14,11 @@ use std::time::Duration; use tokio::sync::mpsc::UnboundedSender; use tracing::*; -use zenith_utils::lsn::Lsn; -use zenith_utils::zid::{ZNodeId, ZTenantTimelineId}; +use utils::{ + lsn::Lsn, + pq_proto::ZenithFeedback, + zid::{ZNodeId, ZTenantTimelineId}, +}; use crate::broker::SafekeeperInfo; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; @@ -30,8 +33,6 @@ use crate::wal_storage; use crate::wal_storage::Storage as wal_storage_iface; use crate::SafeKeeperConf; -use zenith_utils::pq_proto::ZenithFeedback; - const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); /// Replica status update + hot standby feedback diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 305e59bcd3..468ac28526 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -12,7 +12,7 @@ use crate::callmemaybe::CallmeEvent; use crate::handler::SafekeeperPostgresHandler; use crate::SafeKeeperConf; use tokio::sync::mpsc::UnboundedSender; -use zenith_utils::postgres_backend::{AuthType, PostgresBackend}; +use utils::postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. 
pub fn thread_main( diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 7cef525bee..69a4fb11e1 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -20,8 +20,7 @@ use std::path::{Path, PathBuf}; use tracing::*; -use zenith_utils::lsn::Lsn; -use zenith_utils::zid::ZTenantTimelineId; +use utils::{lsn::Lsn, zid::ZTenantTimelineId}; use crate::safekeeper::SafeKeeperState; @@ -30,7 +29,7 @@ use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ}; use postgres_ffi::waldecoder::WalStreamDecoder; -use zenith_metrics::{ +use metrics::{ register_gauge_vec, register_histogram_vec, Gauge, GaugeVec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS, }; diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index 2dbde954fc..49421aa4e8 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -26,7 +26,7 @@ def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, data_dir = os.path.join(test_output_dir, 'pgsql.restored') with VanillaPostgres(data_dir, PgBin(test_output_dir), port) as restored: pg_bin.run_capture([ - os.path.join(base_dir, 'zenith_utils/scripts/restore_from_wal.sh'), + os.path.join(base_dir, 'libs/utils/scripts/restore_from_wal.sh'), os.path.join(pg_distrib_dir, 'bin'), os.path.join(test_output_dir, 'repo/safekeepers/sk1/{}/*'.format(tenant_id)), data_dir, diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 84244b3363..f178b5b766 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -24,8 +24,8 @@ indexmap = { version = "1", default-features = false, features = ["std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } memchr = { version = "2", features = ["std", "use_std"] } -num-integer = { version = "0.1", default-features = false, features = ["std"] } -num-traits = { version = "0.2", features = ["std"] } +num-integer = { version = "0.1", default-features = false, features = ["i128"] } +num-traits = { version = "0.2", features = ["i128", "std"] } prost = { version = "0.9", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } @@ -39,7 +39,6 @@ tracing-core = { version = "0.1", features = ["lazy_static", "std"] } [build-dependencies] anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } -cc = { version = "1", default-features = false, features = ["jobserver", "parallel"] } clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } hashbrown = { version = "0.11", features = ["ahash", "inline-more", "raw"] } diff --git a/zenith/Cargo.toml b/zenith/Cargo.toml index 69283d3763..9692e97331 100644 --- a/zenith/Cargo.toml +++ b/zenith/Cargo.toml @@ -13,6 +13,6 @@ postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98 pageserver = { path = "../pageserver" } control_plane = { path = "../control_plane" } safekeeper = { path = "../safekeeper" } 
-postgres_ffi = { path = "../postgres_ffi" } -zenith_utils = { path = "../zenith_utils" } +postgres_ffi = { path = "../libs/postgres_ffi" } +utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/zenith/src/main.rs b/zenith/src/main.rs index f248a5db5b..afbbbe395b 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -16,11 +16,13 @@ use safekeeper::defaults::{ use std::collections::{BTreeSet, HashMap}; use std::process::exit; use std::str::FromStr; -use zenith_utils::auth::{Claims, Scope}; -use zenith_utils::lsn::Lsn; -use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}; -use zenith_utils::GIT_VERSION; +use utils::{ + auth::{Claims, Scope}, + lsn::Lsn, + postgres_backend::AuthType, + zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + GIT_VERSION, +}; use pageserver::timelines::TimelineInfo; From abcd7a4b1fe62840160e48a8d10d96a571f8592e Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 21 Apr 2022 12:22:12 +0400 Subject: [PATCH 0178/1022] Insert less data in test_wal_restore. Otherwise it sometimes hits 2m statement timeout in CI. --- test_runner/batch_others/test_wal_restore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index 49421aa4e8..b0f34f4aae 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -19,7 +19,7 @@ def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, env = zenith_env_builder.init_start() env.zenith_cli.create_branch("test_wal_restore") pg = env.postgres.create_start('test_wal_restore') - pg.safe_psql("create table t as select generate_series(1,1000000)") + pg.safe_psql("create table t as select generate_series(1,300000)") tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] env.zenith_cli.pageserver_stop() port = port_distributor.get_port() @@ -33,4 +33,4 @@ def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, str(port) ]) restored.start() - assert restored.safe_psql('select count(*) from t', user='zenith_admin') == [(1000000, )] + assert restored.safe_psql('select count(*) from t', user='zenith_admin') == [(300000, )] From 263d60f12def9e4d2206e7587b0be073ac622755 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 21 Apr 2022 16:37:32 +0300 Subject: [PATCH 0179/1022] Add prometheus metric for time spent waiting for WAL to arrive --- pageserver/src/layered_repository.rs | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 7525bdb94e..ff6498a489 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -110,6 +110,12 @@ lazy_static! { &["tenant_id", "timeline_id"] ) .expect("failed to define a metric"); + static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!( + "wait_lsn_time", + "Time spent waiting for WAL to arrive", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); } lazy_static! { @@ -794,6 +800,7 @@ pub struct LayeredTimeline { compact_time_histo: Histogram, create_images_time_histo: Histogram, last_record_gauge: IntGauge, + wait_lsn_time_histo: Histogram, /// If `true`, will backup its files that appear after each checkpointing to the remote storage. 
upload_layers: AtomicBool, @@ -873,14 +880,15 @@ impl Timeline for LayeredTimeline { "wait_lsn called by WAL receiver thread" ); - self.last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .with_context(|| { - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - })?; + self.wait_lsn_time_histo.observe_closure_duration( + || self.last_record_lsn + .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) + .with_context(|| { + format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", + lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() + ) + }))?; Ok(()) } @@ -1022,6 +1030,9 @@ impl LayeredTimeline { let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) .unwrap(); + let wait_lsn_time_histo = WAIT_LSN_TIME + .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .unwrap(); LayeredTimeline { conf, @@ -1049,6 +1060,7 @@ impl LayeredTimeline { compact_time_histo, create_images_time_histo, last_record_gauge, + wait_lsn_time_histo, upload_layers: AtomicBool::new(upload_layers), From dafdf9b9524a034f25bb67d5d6f62a375a892862 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 21 Apr 2022 16:37:36 +0300 Subject: [PATCH 0180/1022] Handle EINTR --- libs/utils/src/pq_proto.rs | 23 +++++++++++++++++++---- pageserver/src/walredo.rs | 7 ++++++- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index 0e4c4907e7..e1677f4311 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -100,6 +100,21 @@ pub struct FeExecuteMessage { #[derive(Debug)] pub struct FeCloseMessage {} +/// Retry a read on EINTR +/// +/// This runs the enclosed expression, and if it returns +/// Err(io::ErrorKind::Interrupted), retries it. +macro_rules! retry_read { + ( $x:expr ) => { + loop { + match $x { + Err(e) if e.kind() == io::ErrorKind::Interrupted => continue, + res => break res, + } + } + }; +} + impl FeMessage { /// Read one message from the stream. /// This function returns `Ok(None)` in case of EOF. @@ -141,12 +156,12 @@ impl FeMessage { // Each libpq message begins with a message type byte, followed by message length // If the client closes the connection, return None. But if the client closes the // connection in the middle of a message, we will return an error. - let tag = match stream.read_u8().await { + let tag = match retry_read!(stream.read_u8().await) { Ok(b) => b, Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), Err(e) => return Err(e.into()), }; - let len = stream.read_u32().await?; + let len = retry_read!(stream.read_u32().await)?; // The message length includes itself, so it better be at least 4 let bodylen = len @@ -207,7 +222,7 @@ impl FeStartupPacket { // reading 4 bytes, to be precise), return None to indicate that the connection // was closed. This matches the PostgreSQL server's behavior, which avoids noise // in the log if the client opens connection but closes it immediately. 
- let len = match stream.read_u32().await { + let len = match retry_read!(stream.read_u32().await) { Ok(len) => len as usize, Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), Err(e) => return Err(e.into()), @@ -217,7 +232,7 @@ impl FeStartupPacket { bail!("invalid message length"); } - let request_code = stream.read_u32().await?; + let request_code = retry_read!(stream.read_u32().await)?; // the rest of startup packet are params let params_len = len - 8; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index dcffcda6bb..6338b839ae 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -700,7 +700,12 @@ impl PostgresRedoProcess { // If we have more data to write, wake up if 'stdin' becomes writeable or // we have data to read. Otherwise only wake up if there's data to read. let nfds = if nwrite < writebuf.len() { 3 } else { 2 }; - let n = nix::poll::poll(&mut pollfds[0..nfds], wal_redo_timeout.as_millis() as i32)?; + let n = loop { + match nix::poll::poll(&mut pollfds[0..nfds], wal_redo_timeout.as_millis() as i32) { + Err(e) if e == nix::errno::Errno::EINTR => continue, + res => break res, + } + }?; if n == 0 { return Err(Error::new(ErrorKind::Other, "WAL redo timed out")); From a4700c9bbeb5302d4452ef6f445b514fa3822b85 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 21 Apr 2022 20:32:48 +0300 Subject: [PATCH 0181/1022] Use pprof to get flamegraph of get_page and get_relsize requests. This depends on a hacked version of the 'pprof-rs' crate. Because of that, it's under an optional 'profiling' feature. It is disabled by default, but enabled for release builds in CircleCI config. It doesn't currently work on macOS. The flamegraph is written to 'flamegraph.svg' in the pageserver workdir when the 'pageserver' process exits. Add a performance test that runs the perf_pgbench test, with profiling enabled. --- .circleci/config.yml | 4 +- Cargo.lock | 167 +++++++++++++++++++ pageserver/Cargo.toml | 6 + pageserver/src/bin/pageserver.rs | 13 +- pageserver/src/config.rs | 42 ++++- pageserver/src/lib.rs | 1 + pageserver/src/page_service.rs | 9 +- pageserver/src/profiling.rs | 95 +++++++++++ test_runner/fixtures/zenith_fixtures.py | 12 ++ test_runner/performance/test_perf_pgbench.py | 24 ++- 10 files changed, 363 insertions(+), 10 deletions(-) create mode 100644 pageserver/src/profiling.rs diff --git a/.circleci/config.yml b/.circleci/config.yml index 5aae143e48..643c853854 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -113,7 +113,7 @@ jobs: CARGO_FLAGS= elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix=() - CARGO_FLAGS=--release + CARGO_FLAGS="--release --features profiling" fi export CARGO_INCREMENTAL=0 @@ -369,7 +369,7 @@ jobs: when: always command: | du -sh /tmp/test_output/* - find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete + find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! 
-name "flamegraph.svg" -delete du -sh /tmp/test_output/* - store_artifacts: path: /tmp/test_output diff --git a/Cargo.lock b/Cargo.lock index 508b56125d..3ca3671207 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -55,6 +55,15 @@ dependencies = [ "backtrace", ] +[[package]] +name = "arrayvec" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd9fd44efafa8690358b7408d253adf110036b88f55672a933f01d616ad9b1b9" +dependencies = [ + "nodrop", +] + [[package]] name = "async-stream" version = "0.3.3" @@ -196,6 +205,12 @@ version = "3.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4a45a46ab1f2412e53d3a0ade76ffad2025804294569aae387231a0cd6e0899" +[[package]] +name = "bytemuck" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdead85bdec19c194affaeeb670c0e41fe23de31459efd1c174d049269cf02cc" + [[package]] name = "byteorder" version = "1.4.3" @@ -385,6 +400,15 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +[[package]] +name = "cpp_demangle" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f" +dependencies = [ + "cfg-if", +] + [[package]] name = "cpufeatures" version = "0.2.1" @@ -580,6 +604,15 @@ dependencies = [ "syn", ] +[[package]] +name = "debugid" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6ee87af31d84ef885378aebca32be3d682b0e0dc119d5b4860a2c5bb5046730" +dependencies = [ + "uuid", +] + [[package]] name = "digest" version = "0.9.0" @@ -691,6 +724,18 @@ dependencies = [ "winapi", ] +[[package]] +name = "findshlibs" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64" +dependencies = [ + "cc", + "lazy_static", + "libc", + "winapi", +] + [[package]] name = "fixedbitset" version = "0.4.1" @@ -1098,6 +1143,24 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "inferno" +version = "0.10.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3886428c6400486522cf44b8626e7b94ad794c14390290f2a274dcf728a58f" +dependencies = [ + "ahash", + "atty", + "indexmap", + "itoa 1.0.1", + "lazy_static", + "log", + "num-format", + "quick-xml", + "rgb", + "str_stack", +] + [[package]] name = "instant" version = "0.1.12" @@ -1251,6 +1314,15 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" +[[package]] +name = "memmap2" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "057a3db23999c867821a7a59feb06a578fcb03685e983dff90daf9e7d24ac08f" +dependencies = [ + "libc", +] + [[package]] name = "memoffset" version = "0.6.5" @@ -1353,6 +1425,12 @@ dependencies = [ "memoffset", ] +[[package]] +name = "nodrop" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" + [[package]] name = "nom" version = "7.1.0" @@ -1384,6 +1462,16 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-format" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bafe4179722c2894288ee77a9f044f02811c86af699344c498b0840c698a2465" +dependencies = [ + "arrayvec", + "itoa 0.4.8", +] + [[package]] name = "num-integer" version = "0.1.44" @@ -1520,6 +1608,7 @@ dependencies = [ "postgres-protocol", "postgres-types", "postgres_ffi", + "pprof", "rand", "regex", "rusoto_core", @@ -1747,6 +1836,25 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pprof" +version = "0.6.1" +source = "git+https://github.com/neondatabase/pprof-rs.git?branch=wallclock-profiling#4e011a87d22fb4d21d15cc38bce81ff1c75e4bc9" +dependencies = [ + "backtrace", + "cfg-if", + "findshlibs", + "inferno", + "lazy_static", + "libc", + "log", + "nix", + "parking_lot", + "symbolic-demangle", + "tempfile", + "thiserror", +] + [[package]] name = "ppv-lite86" version = "0.2.16" @@ -1876,6 +1984,15 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "quick-xml" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b" +dependencies = [ + "memchr", +] + [[package]] name = "quickcheck" version = "1.0.3" @@ -2063,6 +2180,15 @@ dependencies = [ "winreg", ] +[[package]] +name = "rgb" +version = "0.8.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e74fdc210d8f24a7dbfedc13b04ba5764f5232754ccebfdf5fff1bad791ccbc6" +dependencies = [ + "bytemuck", +] + [[package]] name = "ring" version = "0.16.20" @@ -2521,6 +2647,18 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "str_stack" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb" + [[package]] name = "stringprep" version = "0.1.2" @@ -2549,6 +2687,29 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +[[package]] +name = "symbolic-common" +version = "8.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac6aac7b803adc9ee75344af7681969f76d4b38e4723c6eaacf3b28f5f1d87ff" +dependencies = [ + "debugid", + "memmap2", + "stable_deref_trait", + "uuid", +] + +[[package]] +name = "symbolic-demangle" +version = "8.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8143ea5aa546f86c64f9b9aafdd14223ffad4ecd2d58575c63c21335909c99a7" +dependencies = [ + "cpp_demangle", + "rustc-demangle", + "symbolic-common", +] + [[package]] name = "syn" version = "1.0.86" @@ -3099,6 +3260,12 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "uuid" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" + [[package]] name = "valuable" version = "0.1.0" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 7b44dafb09..eb58b90ad9 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -3,6 +3,10 @@ name = "pageserver" version = "0.1.0" edition = "2021" +[features] +default = [] +profiling = ["pprof"] + [dependencies] chrono = "0.4.19" rand = "0.8.3" @@ -32,6 +36,8 @@ serde = { 
version = "1.0", features = ["derive"] } serde_json = "1" serde_with = "1.12.0" +pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true } + toml_edit = { version = "0.13", features = ["easy"] } scopeguard = "1.1.0" const_format = "0.2.21" diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 867bea1b06..9b944cc2ec 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -10,7 +10,7 @@ use daemonize::Daemonize; use pageserver::{ config::{defaults::*, PageServerConf}, - http, page_cache, page_service, + http, page_cache, page_service, profiling, remote_storage::{self, SyncStartupData}, repository::{Repository, TimelineSyncStatusUpdate}, tenant_mgr, thread_mgr, @@ -29,11 +29,15 @@ use utils::{ GIT_VERSION, }; +fn version() -> String { + format!("{} profiling:{}", GIT_VERSION, cfg!(feature = "profiling")) +} + fn main() -> anyhow::Result<()> { metrics::set_common_metrics_prefix("pageserver"); let arg_matches = App::new("Zenith page server") .about("Materializes WAL stream to pages and serves them to the postgres") - .version(GIT_VERSION) + .version(&*version()) .arg( Arg::new("daemonize") .short('d') @@ -283,6 +287,9 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() }; info!("Using auth: {:#?}", conf.auth_type); + // start profiler (if enabled) + let profiler_guard = profiling::init_profiler(conf); + // Spawn a new thread for the http endpoint // bind before launching separate thread so the error reported before startup exits let auth_cloned = auth.clone(); @@ -315,6 +322,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() "Got {}. Terminating in immediate shutdown mode", signal.name() ); + profiling::exit_profiler(conf, &profiler_guard); std::process::exit(111); } @@ -323,6 +331,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() "Got {}. 
Terminating gracefully in fast shutdown mode", signal.name() ); + profiling::exit_profiler(conf, &profiler_guard); pageserver::shutdown_pageserver(); unreachable!() } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 0cba3f48f8..24ab45386d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -140,6 +140,27 @@ pub struct PageServerConf { pub auth_validation_public_key_path: Option, pub remote_storage_config: Option, + + pub profiling: ProfilingConfig, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ProfilingConfig { + Disabled, + PageRequests, +} + +impl FromStr for ProfilingConfig { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let result = match s { + "disabled" => ProfilingConfig::Disabled, + "page_requests" => ProfilingConfig::PageRequests, + _ => bail!("invalid value \"{}\" for profiling option, valid values are \"disabled\" and \"page_requests\"", s), + }; + Ok(result) + } } // use dedicated enum for builder to better indicate the intention @@ -192,6 +213,8 @@ struct PageServerConfigBuilder { remote_storage_config: BuilderValue>, id: BuilderValue, + + profiling: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -224,6 +247,7 @@ impl Default for PageServerConfigBuilder { auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), id: NotSet, + profiling: Set(ProfilingConfig::Disabled), } } } @@ -308,6 +332,10 @@ impl PageServerConfigBuilder { self.id = BuilderValue::Set(node_id) } + pub fn profiling(&mut self, profiling: ProfilingConfig) { + self.profiling = BuilderValue::Set(profiling) + } + pub fn build(self) -> Result { Ok(PageServerConf { listen_pg_addr: self @@ -357,6 +385,7 @@ impl PageServerConfigBuilder { .remote_storage_config .ok_or(anyhow::anyhow!("missing remote_storage_config"))?, id: self.id.ok_or(anyhow::anyhow!("missing id"))?, + profiling: self.profiling.ok_or(anyhow::anyhow!("missing profiling"))?, }) } } @@ -486,11 +515,12 @@ impl PageServerConf { "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some( PathBuf::from(parse_toml_string(key, item)?), )), - "auth_type" => builder.auth_type(parse_toml_auth_type(key, item)?), + "auth_type" => builder.auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?)) } "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)), + "profiling" => builder.profiling(parse_toml_from_str(key, item)?), _ => bail!("unrecognized pageserver option '{}'", key), } } @@ -623,6 +653,7 @@ impl PageServerConf { auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, + profiling: ProfilingConfig::Disabled, } } } @@ -656,11 +687,14 @@ fn parse_toml_duration(name: &str, item: &Item) -> Result { Ok(humantime::parse_duration(s)?) 
} -fn parse_toml_auth_type(name: &str, item: &Item) -> Result { +fn parse_toml_from_str(name: &str, item: &Item) -> Result +where + T: FromStr, +{ let v = item .as_str() .with_context(|| format!("configure option {} is not a string", name))?; - AuthType::from_str(v) + T::from_str(v) } #[cfg(test)] @@ -733,6 +767,7 @@ id = 10 auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, + profiling: ProfilingConfig::Disabled, }, "Correct defaults should be used when no config values are provided" ); @@ -779,6 +814,7 @@ id = 10 auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, + profiling: ProfilingConfig::Disabled, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index e6ac159ef2..a761f0dfe2 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -7,6 +7,7 @@ pub mod layered_repository; pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; +pub mod profiling; pub mod reltag; pub mod remote_storage; pub mod repository; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8f5ea2e845..8c90195131 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -29,8 +29,9 @@ use utils::{ }; use crate::basebackup; -use crate::config::PageServerConf; +use crate::config::{PageServerConf, ProfilingConfig}; use crate::pgdatadir_mapping::DatadirTimeline; +use crate::profiling::profpoint_start; use crate::reltag::RelTag; use crate::repository::Repository; use crate::repository::Timeline; @@ -331,7 +332,10 @@ impl PageServerHandler { pgb.write_message(&BeMessage::CopyBothResponse)?; while !thread_mgr::is_shutdown_requested() { - match pgb.read_message() { + let msg = pgb.read_message(); + + let profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); + match msg { Ok(message) => { if let Some(message) = message { trace!("query: {:?}", message); @@ -383,6 +387,7 @@ impl PageServerHandler { } } } + drop(profiling_guard); } Ok(()) } diff --git a/pageserver/src/profiling.rs b/pageserver/src/profiling.rs new file mode 100644 index 0000000000..e2c12c9e12 --- /dev/null +++ b/pageserver/src/profiling.rs @@ -0,0 +1,95 @@ +//! +//! Support for profiling +//! +//! This relies on a modified version of the 'pprof-rs' crate. That's not very +//! nice, so to avoid a hard dependency on that, this is an optional feature. +//! +use crate::config::{PageServerConf, ProfilingConfig}; + +/// The actual implementation is in the `profiling_impl` submodule. If the profiling +/// feature is not enabled, it's just a dummy implementation that panics if you +/// try to enabled profiling in the configuration. +pub use profiling_impl::*; + +#[cfg(feature = "profiling")] +mod profiling_impl { + use super::*; + use pprof; + use std::marker::PhantomData; + + /// Start profiling the current thread. Returns a guard object; + /// the profiling continues until the guard is dropped. + /// + /// Note: profiling is not re-entrant. If you call 'profpoint_start' while + /// profiling is already started, nothing happens, and the profiling will be + /// stopped when either guard object is dropped. + #[inline] + pub fn profpoint_start( + conf: &crate::config::PageServerConf, + point: ProfilingConfig, + ) -> Option { + if conf.profiling == point { + pprof::start_profiling(); + Some(ProfilingGuard(PhantomData)) + } else { + None + } + } + + /// A hack to remove Send and Sync from the ProfilingGuard. 
Because the + /// profiling is attached to current thread. + //// + /// See comments in https://github.com/rust-lang/rust/issues/68318 + type PhantomUnsend = std::marker::PhantomData<*mut u8>; + + pub struct ProfilingGuard(PhantomUnsend); + + impl Drop for ProfilingGuard { + fn drop(&mut self) { + pprof::stop_profiling(); + } + } + + /// Initialize the profiler. This must be called before any 'profpoint_start' calls. + pub fn init_profiler(conf: &PageServerConf) -> Option { + if conf.profiling != ProfilingConfig::Disabled { + Some(pprof::ProfilerGuardBuilder::default().build().unwrap()) + } else { + None + } + } + + /// Exit the profiler. Writes the flamegraph to current workdir. + pub fn exit_profiler(_conf: &PageServerConf, profiler_guard: &Option) { + // Write out the flamegraph + if let Some(profiler_guard) = profiler_guard { + if let Ok(report) = profiler_guard.report().build() { + // this gets written under the workdir + let file = std::fs::File::create("flamegraph.svg").unwrap(); + let mut options = pprof::flamegraph::Options::default(); + options.image_width = Some(2500); + report.flamegraph_with_options(file, &mut options).unwrap(); + } + } + } +} + +/// Dummy implementation when compiling without profiling feature +#[cfg(not(feature = "profiling"))] +mod profiling_impl { + use super::*; + + pub fn profpoint_start(_conf: &PageServerConf, _point: ProfilingConfig) -> () { + () + } + + pub fn init_profiler(conf: &PageServerConf) -> () { + if conf.profiling != ProfilingConfig::Disabled { + // shouldn't happen, we don't allow profiling in the config if the support + // for it is disabled. + panic!("profiling enabled but the binary was compiled without profiling support"); + } + } + + pub fn exit_profiler(_conf: &PageServerConf, _guard: &()) {} +} diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index a9c4c0f395..9a2d6cdc88 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -155,6 +155,18 @@ def pytest_configure(config): raise Exception('zenith binaries not found at "{}"'.format(zenith_binpath)) +def profiling_supported(): + """Return True if the pageserver was compiled with the 'profiling' feature + """ + bin_pageserver = os.path.join(str(zenith_binpath), 'pageserver') + res = subprocess.run([bin_pageserver, '--version'], + check=True, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + return "profiling:true" in res.stdout + + def shareable_scope(fixture_name, config) -> Literal["session", "function"]: """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar. 
diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index d2de76913a..fc10ca4d6c 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -1,5 +1,5 @@ from contextlib import closing -from fixtures.zenith_fixtures import PgBin, VanillaPostgres, ZenithEnv +from fixtures.zenith_fixtures import PgBin, VanillaPostgres, ZenithEnv, profiling_supported from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare from fixtures.benchmark_fixture import PgBenchRunResult, MetricReport, ZenithBenchmarker @@ -106,6 +106,28 @@ def test_pgbench(zenith_with_baseline: PgCompare, scale: int, duration: int): run_test_pgbench(zenith_with_baseline, scale, duration) +# Run the pgbench tests, and generate a flamegraph from it +# This requires that the pageserver was built with the 'profiling' feature. +# +# TODO: If the profiling is cheap enough, there's no need to run the same test +# twice, with and without profiling. But for now, run it separately, so that we +# can see how much overhead the profiling adds. +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +def test_pgbench_flamegraph(zenbenchmark, pg_bin, zenith_env_builder, scale: int, duration: int): + zenith_env_builder.num_safekeepers = 1 + zenith_env_builder.pageserver_config_override = ''' +profiling="page_requests" +''' + if not profiling_supported(): + pytest.skip("pageserver was built without 'profiling' feature") + + env = zenith_env_builder.init_start() + env.zenith_cli.create_branch("empty", "main") + + run_test_pgbench(ZenithCompare(zenbenchmark, env, pg_bin, "pgbench"), scale, duration) + + # Run the pgbench tests against an existing Postgres cluster @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix()) From 5f83c9290b482dc90006c400dfc68e85a17af785 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 25 Feb 2022 19:33:44 +0300 Subject: [PATCH 0182/1022] Make it possible to specify per-tenant configuration parameters Add tenant config API and 'zenith tenant config' CLI command. Add 'show' query to pageserver protocol for tenantspecific config parameters Refactoring: move tenant_config code to a separate module. Save tenant conf file to tenant's directory, when tenant is created to recover it on pageserver restart. Ignore error during tenant config loading, while it is not supported by console Define PiTR interval for GC. 
refer #1320 --- control_plane/src/storage.rs | 53 ++++- pageserver/src/bin/pageserver.rs | 5 +- pageserver/src/config.rs | 205 +++++----------- pageserver/src/http/models.rs | 48 +++- pageserver/src/http/openapi_spec.yml | 88 ++++++- pageserver/src/http/routes.rs | 67 +++++- pageserver/src/layered_repository.rs | 238 +++++++++++++++++-- pageserver/src/lib.rs | 1 + pageserver/src/page_service.rs | 41 +++- pageserver/src/repository.rs | 31 ++- pageserver/src/tenant_config.rs | 162 +++++++++++++ pageserver/src/tenant_mgr.rs | 43 +++- pageserver/src/tenant_threads.rs | 23 +- pageserver/src/timelines.rs | 10 +- pageserver/src/walreceiver.rs | 2 +- test_runner/batch_others/test_tenant_conf.py | 49 ++++ test_runner/fixtures/zenith_fixtures.py | 23 +- zenith/src/main.rs | 34 ++- 18 files changed, 915 insertions(+), 208 deletions(-) create mode 100644 pageserver/src/tenant_config.rs create mode 100644 test_runner/batch_others/test_tenant_conf.py diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index a01ffd30f6..7520ad9304 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::io::Write; use std::net::TcpStream; use std::path::PathBuf; @@ -9,7 +10,7 @@ use anyhow::{bail, Context}; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; -use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest}; +use pageserver::http::models::{TenantConfigRequest, TenantCreateRequest, TimelineCreateRequest}; use pageserver::timelines::TimelineInfo; use postgres::{Config, NoTls}; use reqwest::blocking::{Client, RequestBuilder, Response}; @@ -344,10 +345,32 @@ impl PageServerNode { pub fn tenant_create( &self, new_tenant_id: Option, + settings: HashMap<&str, &str>, ) -> anyhow::Result> { let tenant_id_string = self .http_request(Method::POST, format!("{}/tenant", self.http_base_url)) - .json(&TenantCreateRequest { new_tenant_id }) + .json(&TenantCreateRequest { + new_tenant_id, + checkpoint_distance: settings + .get("checkpoint_distance") + .map(|x| x.parse::()) + .transpose()?, + compaction_target_size: settings + .get("compaction_target_size") + .map(|x| x.parse::()) + .transpose()?, + compaction_period: settings.get("compaction_period").map(|x| x.to_string()), + compaction_threshold: settings + .get("compaction_threshold") + .map(|x| x.parse::()) + .transpose()?, + gc_horizon: settings + .get("gc_horizon") + .map(|x| x.parse::()) + .transpose()?, + gc_period: settings.get("gc_period").map(|x| x.to_string()), + pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), + }) .send()? .error_from_body()? 
.json::>()?; @@ -364,6 +387,32 @@ impl PageServerNode { .transpose() } + pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> { + self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url)) + .json(&TenantConfigRequest { + tenant_id, + checkpoint_distance: settings + .get("checkpoint_distance") + .map(|x| x.parse::().unwrap()), + compaction_target_size: settings + .get("compaction_target_size") + .map(|x| x.parse::().unwrap()), + compaction_period: settings.get("compaction_period").map(|x| x.to_string()), + compaction_threshold: settings + .get("compaction_threshold") + .map(|x| x.parse::().unwrap()), + gc_horizon: settings + .get("gc_horizon") + .map(|x| x.parse::().unwrap()), + gc_period: settings.get("gc_period").map(|x| x.to_string()), + pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), + }) + .send()? + .error_from_body()?; + + Ok(()) + } + pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result> { let timeline_infos: Vec = self .http_request( diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 9b944cc2ec..5c135e4eb4 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -246,11 +246,12 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses { // initialize local tenant - let repo = tenant_mgr::load_local_repo(conf, tenant_id, &remote_index); + let repo = tenant_mgr::load_local_repo(conf, tenant_id, &remote_index) + .with_context(|| format!("Failed to load repo for tenant {}", tenant_id))?; for (timeline_id, init_status) in local_timeline_init_statuses { match init_status { remote_storage::LocalTimelineInitStatus::LocallyComplete => { - debug!("timeline {} for tenant {} is locally complete, registering it in repository", tenant_id, timeline_id); + debug!("timeline {} for tenant {} is locally complete, registering it in repository", timeline_id, tenant_id); // Lets fail here loudly to be on the safe side. // XXX: It may be a better api to actually distinguish between repository startup // and processing of newly downloaded timelines. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 24ab45386d..b2c4a62796 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,6 +5,12 @@ //! See also `settings.md` for better description on every parameter. use anyhow::{bail, ensure, Context, Result}; +use std::convert::TryInto; +use std::env; +use std::num::{NonZeroU32, NonZeroUsize}; +use std::path::{Path, PathBuf}; +use std::str::FromStr; +use std::time::Duration; use toml_edit; use toml_edit::{Document, Item}; use utils::{ @@ -12,16 +18,11 @@ use utils::{ zid::{ZNodeId, ZTenantId, ZTimelineId}, }; -use std::convert::TryInto; -use std::env; -use std::num::{NonZeroU32, NonZeroUsize}; -use std::path::{Path, PathBuf}; -use std::str::FromStr; -use std::time::Duration; - use crate::layered_repository::TIMELINES_SEGMENT_NAME; +use crate::tenant_config::{TenantConf, TenantConfOpt}; pub mod defaults { + use crate::tenant_config::defaults::*; use const_format::formatcp; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; @@ -29,21 +30,6 @@ pub mod defaults { pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); - // FIXME: This current value is very low. 
I would imagine something like 1 GB or 10 GB - // would be more appropriate. But a low value forces the code to be exercised more, - // which is good for now to trigger bugs. - // This parameter actually determines L0 layer file size. - pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; - - // Target file size, when creating image and delta layers. - // This parameter determines L1 layer file size. - pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; - pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s"; - pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; - - pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; - pub const DEFAULT_GC_PERIOD: &str = "100 s"; - pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; @@ -64,14 +50,6 @@ pub mod defaults { #listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}' #listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}' -#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes -#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes -#compaction_period = '{DEFAULT_COMPACTION_PERIOD}' -#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}' - -#gc_period = '{DEFAULT_GC_PERIOD}' -#gc_horizon = {DEFAULT_GC_HORIZON} - #wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}' #wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}' @@ -80,6 +58,16 @@ pub mod defaults { # initial superuser role name to use when creating a new tenant #initial_superuser_name = '{DEFAULT_SUPERUSER}' +# [tenant_config] +#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes +#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes +#compaction_period = '{DEFAULT_COMPACTION_PERIOD}' +#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}' + +#gc_period = '{DEFAULT_GC_PERIOD}' +#gc_horizon = {DEFAULT_GC_HORIZON} +#pitr_interval = '{DEFAULT_PITR_INTERVAL}' + # [remote_storage] "### @@ -97,25 +85,6 @@ pub struct PageServerConf { /// Example (default): 127.0.0.1:9898 pub listen_http_addr: String, - // Flush out an inmemory layer, if it's holding WAL older than this - // This puts a backstop on how much WAL needs to be re-digested if the - // page server crashes. - // This parameter actually determines L0 layer file size. - pub checkpoint_distance: u64, - - // Target file size, when creating image and delta layers. - // This parameter determines L1 layer file size. - pub compaction_target_size: u64, - - // How often to check if there's compaction work to be done. - pub compaction_period: Duration, - - // Level0 delta layer threshold for compaction. - pub compaction_threshold: usize, - - pub gc_horizon: u64, - pub gc_period: Duration, - // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call. pub wait_lsn_timeout: Duration, // How long to wait for WAL redo to complete. 
@@ -142,6 +111,7 @@ pub struct PageServerConf { pub remote_storage_config: Option, pub profiling: ProfilingConfig, + pub default_tenant_conf: TenantConf, } #[derive(Debug, Clone, PartialEq, Eq)] @@ -185,15 +155,6 @@ struct PageServerConfigBuilder { listen_http_addr: BuilderValue, - checkpoint_distance: BuilderValue, - - compaction_target_size: BuilderValue, - compaction_period: BuilderValue, - compaction_threshold: BuilderValue, - - gc_horizon: BuilderValue, - gc_period: BuilderValue, - wait_lsn_timeout: BuilderValue, wal_redo_timeout: BuilderValue, @@ -224,14 +185,6 @@ impl Default for PageServerConfigBuilder { Self { listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()), listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()), - checkpoint_distance: Set(DEFAULT_CHECKPOINT_DISTANCE), - compaction_target_size: Set(DEFAULT_COMPACTION_TARGET_SIZE), - compaction_period: Set(humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) - .expect("cannot parse default compaction period")), - compaction_threshold: Set(DEFAULT_COMPACTION_THRESHOLD), - gc_horizon: Set(DEFAULT_GC_HORIZON), - gc_period: Set(humantime::parse_duration(DEFAULT_GC_PERIOD) - .expect("cannot parse default gc period")), wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) .expect("cannot parse default wait lsn timeout")), wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) @@ -261,30 +214,6 @@ impl PageServerConfigBuilder { self.listen_http_addr = BuilderValue::Set(listen_http_addr) } - pub fn checkpoint_distance(&mut self, checkpoint_distance: u64) { - self.checkpoint_distance = BuilderValue::Set(checkpoint_distance) - } - - pub fn compaction_target_size(&mut self, compaction_target_size: u64) { - self.compaction_target_size = BuilderValue::Set(compaction_target_size) - } - - pub fn compaction_period(&mut self, compaction_period: Duration) { - self.compaction_period = BuilderValue::Set(compaction_period) - } - - pub fn compaction_threshold(&mut self, compaction_threshold: usize) { - self.compaction_threshold = BuilderValue::Set(compaction_threshold) - } - - pub fn gc_horizon(&mut self, gc_horizon: u64) { - self.gc_horizon = BuilderValue::Set(gc_horizon) - } - - pub fn gc_period(&mut self, gc_period: Duration) { - self.gc_period = BuilderValue::Set(gc_period) - } - pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) { self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout) } @@ -344,22 +273,6 @@ impl PageServerConfigBuilder { listen_http_addr: self .listen_http_addr .ok_or(anyhow::anyhow!("missing listen_http_addr"))?, - checkpoint_distance: self - .checkpoint_distance - .ok_or(anyhow::anyhow!("missing checkpoint_distance"))?, - compaction_target_size: self - .compaction_target_size - .ok_or(anyhow::anyhow!("missing compaction_target_size"))?, - compaction_period: self - .compaction_period - .ok_or(anyhow::anyhow!("missing compaction_period"))?, - compaction_threshold: self - .compaction_threshold - .ok_or(anyhow::anyhow!("missing compaction_threshold"))?, - gc_horizon: self - .gc_horizon - .ok_or(anyhow::anyhow!("missing gc_horizon"))?, - gc_period: self.gc_period.ok_or(anyhow::anyhow!("missing gc_period"))?, wait_lsn_timeout: self .wait_lsn_timeout .ok_or(anyhow::anyhow!("missing wait_lsn_timeout"))?, @@ -386,6 +299,8 @@ impl PageServerConfigBuilder { .ok_or(anyhow::anyhow!("missing remote_storage_config"))?, id: self.id.ok_or(anyhow::anyhow!("missing id"))?, profiling: self.profiling.ok_or(anyhow::anyhow!("missing profiling"))?, + // TenantConf is handled separately 
+ default_tenant_conf: TenantConf::default(), }) } } @@ -488,20 +403,12 @@ impl PageServerConf { let mut builder = PageServerConfigBuilder::default(); builder.workdir(workdir.to_owned()); + let mut t_conf: TenantConfOpt = Default::default(); + for (key, item) in toml.iter() { match key { "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?), "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?), - "checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?), - "compaction_target_size" => { - builder.compaction_target_size(parse_toml_u64(key, item)?) - } - "compaction_period" => builder.compaction_period(parse_toml_duration(key, item)?), - "compaction_threshold" => { - builder.compaction_threshold(parse_toml_u64(key, item)? as usize) - } - "gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?), - "gc_period" => builder.gc_period(parse_toml_duration(key, item)?), "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?), "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?), "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?), @@ -519,6 +426,9 @@ impl PageServerConf { "remote_storage" => { builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?)) } + "tenant_conf" => { + t_conf = Self::parse_toml_tenant_conf(item)?; + } "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)), "profiling" => builder.profiling(parse_toml_from_str(key, item)?), _ => bail!("unrecognized pageserver option '{}'", key), @@ -547,9 +457,42 @@ impl PageServerConf { ); } + conf.default_tenant_conf = t_conf.merge(TenantConf::default()); + Ok(conf) } + // subroutine of parse_and_validate to parse `[tenant_conf]` section + + pub fn parse_toml_tenant_conf(item: &toml_edit::Item) -> Result { + let mut t_conf: TenantConfOpt = Default::default(); + for (key, item) in item + .as_table() + .ok_or(anyhow::anyhow!("invalid tenant config"))? + .iter() + { + match key { + "checkpoint_distance" => { + t_conf.checkpoint_distance = Some(parse_toml_u64(key, item)?) + } + "compaction_target_size" => { + t_conf.compaction_target_size = Some(parse_toml_u64(key, item)?) + } + "compaction_period" => { + t_conf.compaction_period = Some(parse_toml_duration(key, item)?) + } + "compaction_threshold" => { + t_conf.compaction_threshold = Some(parse_toml_u64(key, item)? as usize) + } + "gc_horizon" => t_conf.gc_horizon = Some(parse_toml_u64(key, item)?), + "gc_period" => t_conf.gc_period = Some(parse_toml_duration(key, item)?), + "pitr_interval" => t_conf.pitr_interval = Some(parse_toml_duration(key, item)?), + _ => bail!("unrecognized tenant config option '{}'", key), + } + } + Ok(t_conf) + } + /// subroutine of parse_config(), to parse the `[remote_storage]` table. 
fn parse_remote_storage_config(toml: &toml_edit::Item) -> anyhow::Result { let local_path = toml.get("local_path"); @@ -635,12 +578,6 @@ impl PageServerConf { pub fn dummy_conf(repo_dir: PathBuf) -> Self { PageServerConf { id: ZNodeId(0), - checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, - compaction_target_size: 4 * 1024 * 1024, - compaction_period: Duration::from_secs(10), - compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD, - gc_horizon: defaults::DEFAULT_GC_HORIZON, - gc_period: Duration::from_secs(10), wait_lsn_timeout: Duration::from_secs(60), wal_redo_timeout: Duration::from_secs(60), page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, @@ -654,6 +591,7 @@ impl PageServerConf { auth_validation_public_key_path: None, remote_storage_config: None, profiling: ProfilingConfig::Disabled, + default_tenant_conf: TenantConf::dummy_conf(), } } } @@ -711,15 +649,6 @@ mod tests { listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' -checkpoint_distance = 111 # in bytes - -compaction_target_size = 111 # in bytes -compaction_period = '111 s' -compaction_threshold = 2 - -gc_period = '222 s' -gc_horizon = 222 - wait_lsn_timeout = '111 s' wal_redo_timeout = '111 s' @@ -751,12 +680,6 @@ id = 10 id: ZNodeId(10), listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, - compaction_target_size: defaults::DEFAULT_COMPACTION_TARGET_SIZE, - compaction_period: humantime::parse_duration(defaults::DEFAULT_COMPACTION_PERIOD)?, - compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD, - gc_horizon: defaults::DEFAULT_GC_HORIZON, - gc_period: humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?, wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?, superuser: defaults::DEFAULT_SUPERUSER.to_string(), @@ -768,6 +691,7 @@ id = 10 auth_validation_public_key_path: None, remote_storage_config: None, profiling: ProfilingConfig::Disabled, + default_tenant_conf: TenantConf::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -798,12 +722,6 @@ id = 10 id: ZNodeId(10), listen_pg_addr: "127.0.0.1:64000".to_string(), listen_http_addr: "127.0.0.1:9898".to_string(), - checkpoint_distance: 111, - compaction_target_size: 111, - compaction_period: Duration::from_secs(111), - compaction_threshold: 2, - gc_horizon: 222, - gc_period: Duration::from_secs(222), wait_lsn_timeout: Duration::from_secs(111), wal_redo_timeout: Duration::from_secs(111), superuser: "zzzz".to_string(), @@ -815,6 +733,7 @@ id = 10 auth_validation_public_key_path: None, remote_storage_config: None, profiling: ProfilingConfig::Disabled, + default_tenant_conf: TenantConf::default(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 9b51e48477..b24b3dc316 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -20,11 +20,18 @@ pub struct TimelineCreateRequest { } #[serde_as] -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Default)] pub struct TenantCreateRequest { #[serde(default)] #[serde_as(as = "Option")] pub new_tenant_id: Option, + pub checkpoint_distance: Option, + pub compaction_target_size: Option, + pub compaction_period: Option, + pub compaction_threshold: Option, + pub gc_horizon: Option, 
+ pub gc_period: Option, + pub pitr_interval: Option, } #[serde_as] @@ -36,3 +43,42 @@ pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId pub struct StatusResponse { pub id: ZNodeId, } + +impl TenantCreateRequest { + pub fn new(new_tenant_id: Option) -> TenantCreateRequest { + TenantCreateRequest { + new_tenant_id, + ..Default::default() + } + } +} + +#[serde_as] +#[derive(Serialize, Deserialize)] +pub struct TenantConfigRequest { + pub tenant_id: ZTenantId, + #[serde(default)] + #[serde_as(as = "Option")] + pub checkpoint_distance: Option, + pub compaction_target_size: Option, + pub compaction_period: Option, + pub compaction_threshold: Option, + pub gc_horizon: Option, + pub gc_period: Option, + pub pitr_interval: Option, +} + +impl TenantConfigRequest { + pub fn new(tenant_id: ZTenantId) -> TenantConfigRequest { + TenantConfigRequest { + tenant_id, + checkpoint_distance: None, + compaction_target_size: None, + compaction_period: None, + compaction_threshold: None, + gc_horizon: None, + gc_period: None, + pitr_interval: None, + } + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index c0b07418f3..9932a2d08d 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -328,11 +328,7 @@ paths: content: application/json: schema: - type: object - properties: - new_tenant_id: - type: string - format: hex + $ref: "#/components/schemas/TenantCreateInfo" responses: "201": description: New tenant created successfully @@ -371,7 +367,48 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - + /v1/tenant/config: + put: + description: | + Update tenant's config. + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/TenantConfigInfo" + responses: + "200": + description: OK + content: + application/json: + schema: + type: array + items: + $ref: "#/components/schemas/TenantInfo" + "400": + description: Malformed tenant config request + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" components: securitySchemes: JWT: @@ -389,6 +426,45 @@ components: type: string state: type: string + TenantCreateInfo: + type: object + properties: + new_tenant_id: + type: string + format: hex + tenant_id: + type: string + format: hex + gc_period: + type: string + gc_horizon: + type: integer + pitr_interval: + type: string + checkpoint_distance: + type: integer + compaction_period: + type: string + compaction_threshold: + type: string + TenantConfigInfo: + type: object + properties: + tenant_id: + type: string + format: hex + gc_period: + type: string + gc_horizon: + type: integer + pitr_interval: + type: string + checkpoint_distance: + type: integer + compaction_period: + type: string + compaction_threshold: + type: string TimelineInfo: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 82ea5d1d09..2db56015ad 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -6,13 +6,15 @@ use hyper::{Body, Request, Response, Uri}; use tracing::*; use super::models::{ - StatusResponse, 
TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest, + StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, + TimelineCreateRequest, }; use crate::config::RemoteStorageKind; use crate::remote_storage::{ download_index_part, schedule_timeline_download, LocalFs, RemoteIndex, RemoteTimeline, S3Bucket, }; use crate::repository::Repository; +use crate::tenant_config::TenantConfOpt; use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; use crate::{config::PageServerConf, tenant_mgr, timelines}; use utils::{ @@ -375,6 +377,27 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result, ApiError> { + let request_data: TenantConfigRequest = json_request(&mut request).await?; + let tenant_id = request_data.tenant_id; + // check for management permission + check_permission(&request, Some(tenant_id))?; + + let mut tenant_conf: TenantConfOpt = Default::default(); + if let Some(gc_period) = request_data.gc_period { + tenant_conf.gc_period = + Some(humantime::parse_duration(&gc_period).map_err(ApiError::from_err)?); + } + tenant_conf.gc_horizon = request_data.gc_horizon; + + if let Some(pitr_interval) = request_data.pitr_interval { + tenant_conf.pitr_interval = + Some(humantime::parse_duration(&pitr_interval).map_err(ApiError::from_err)?); + } + + tenant_conf.checkpoint_distance = request_data.checkpoint_distance; + tenant_conf.compaction_target_size = request_data.compaction_target_size; + tenant_conf.compaction_threshold = request_data.compaction_threshold; + + if let Some(compaction_period) = request_data.compaction_period { + tenant_conf.compaction_period = + Some(humantime::parse_duration(&compaction_period).map_err(ApiError::from_err)?); + } + + tokio::task::spawn_blocking(move || { + let _enter = info_span!("tenant_config", tenant = ?tenant_id).entered(); + + tenant_mgr::update_tenant_config(tenant_conf, tenant_id) + }) + .await + .map_err(ApiError::from_err)??; + + Ok(json_response(StatusCode::OK, ())?) 
+} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -426,6 +488,7 @@ pub fn make_router( .get("/v1/status", status_handler) .get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) + .put("/v1/tenant/config", tenant_config_handler) .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) .get( diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index ff6498a489..3afef51a23 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -29,11 +29,13 @@ use std::ops::{Bound::Included, Deref, Range}; use std::path::{Path, PathBuf}; use std::sync::atomic::{self, AtomicBool}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError}; -use std::time::Instant; +use std::time::{Duration, Instant, SystemTime}; use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; use crate::config::PageServerConf; use crate::keyspace::KeySpace; +use crate::tenant_config::{TenantConf, TenantConfOpt}; + use crate::page_cache; use crate::remote_storage::{schedule_timeline_checkpoint_upload, RemoteIndex}; use crate::repository::{ @@ -51,6 +53,7 @@ use metrics::{ register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, }; +use toml_edit; use utils::{ crashsafe_dir, lsn::{AtomicLsn, Lsn, RecordLsn}, @@ -149,7 +152,15 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; /// Repository consists of multiple timelines. Keep them in a hash table. /// pub struct LayeredRepository { + // Global pageserver config parameters pub conf: &'static PageServerConf, + + // Overridden tenant-specific config parameters. + // We keep TenantConfOpt sturct here to preserve the information + // about parameters that are not set. + // This is necessary to allow global config updates. + tenant_conf: Arc>, + tenantid: ZTenantId, timelines: Mutex>, // This mutex prevents creation of new timelines during GC. 
@@ -219,6 +230,7 @@ impl Repository for LayeredRepository { let timeline = LayeredTimeline::new( self.conf, + Arc::clone(&self.tenant_conf), metadata, None, timelineid, @@ -302,6 +314,7 @@ impl Repository for LayeredRepository { &self, target_timelineid: Option, horizon: u64, + pitr: Duration, checkpoint_before_gc: bool, ) -> Result { let timeline_str = target_timelineid @@ -311,7 +324,7 @@ impl Repository for LayeredRepository { STORAGE_TIME .with_label_values(&["gc", &self.tenantid.to_string(), &timeline_str]) .observe_closure_duration(|| { - self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) + self.gc_iteration_internal(target_timelineid, horizon, pitr, checkpoint_before_gc) }) } @@ -480,6 +493,64 @@ impl From for RepositoryTimeline { /// Private functions impl LayeredRepository { + pub fn get_checkpoint_distance(&self) -> u64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .checkpoint_distance + .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) + } + + pub fn get_compaction_target_size(&self) -> u64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .compaction_target_size + .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) + } + + pub fn get_compaction_period(&self) -> Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .compaction_period + .unwrap_or(self.conf.default_tenant_conf.compaction_period) + } + + pub fn get_compaction_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .compaction_threshold + .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) + } + + pub fn get_gc_horizon(&self) -> u64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .gc_horizon + .unwrap_or(self.conf.default_tenant_conf.gc_horizon) + } + + pub fn get_gc_period(&self) -> Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .gc_period + .unwrap_or(self.conf.default_tenant_conf.gc_period) + } + + pub fn get_pitr_interval(&self) -> Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .pitr_interval + .unwrap_or(self.conf.default_tenant_conf.pitr_interval) + } + + pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) -> Result<()> { + let mut tenant_conf = self.tenant_conf.write().unwrap(); + + tenant_conf.update(&new_tenant_conf); + + LayeredRepository::persist_tenant_config(self.conf, self.tenantid, *tenant_conf)?; + Ok(()) + } + // Implementation of the public `get_timeline` function. // Differences from the public: // * interface in that the caller must already hold the mutex on the 'timelines' hashmap. 
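The getters above share one pattern: read the tenant's RwLock-protected TenantConfOpt and fall back to conf.default_tenant_conf for any field that is None, so a later update_tenant_config() call is visible to the very next read. A stripped-down sketch of that pattern, using made-up stand-in types rather than the real LayeredRepository:

use std::sync::{Arc, RwLock};
use std::time::Duration;

// Hypothetical, simplified stand-ins for TenantConf / TenantConfOpt.
#[derive(Clone, Copy)]
struct Defaults {
    gc_period: Duration,
}

#[derive(Default)]
struct Overrides {
    gc_period: Option<Duration>,
}

struct Repo {
    defaults: Defaults,
    overrides: Arc<RwLock<Overrides>>,
}

impl Repo {
    // Read-time fallback: the per-tenant value if set, otherwise the global default.
    fn get_gc_period(&self) -> Duration {
        self.overrides
            .read()
            .unwrap()
            .gc_period
            .unwrap_or(self.defaults.gc_period)
    }

    // Live update: subsequent getter calls observe the new value.
    fn set_gc_period(&self, period: Duration) {
        self.overrides.write().unwrap().gc_period = Some(period);
    }
}

fn main() {
    let repo = Repo {
        defaults: Defaults { gc_period: Duration::from_secs(100) },
        overrides: Arc::new(RwLock::new(Overrides::default())),
    };
    assert_eq!(repo.get_gc_period(), Duration::from_secs(100));
    repo.set_gc_period(Duration::from_secs(10));
    assert_eq!(repo.get_gc_period(), Duration::from_secs(10));
}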
@@ -553,8 +624,10 @@ impl LayeredRepository { .flatten() .map(LayeredTimelineEntry::Loaded); let _enter = info_span!("loading local timeline").entered(); + let timeline = LayeredTimeline::new( self.conf, + Arc::clone(&self.tenant_conf), metadata, ancestor, timelineid, @@ -571,6 +644,7 @@ impl LayeredRepository { pub fn new( conf: &'static PageServerConf, + tenant_conf: TenantConfOpt, walredo_mgr: Arc, tenantid: ZTenantId, remote_index: RemoteIndex, @@ -579,6 +653,7 @@ impl LayeredRepository { LayeredRepository { tenantid, conf, + tenant_conf: Arc::new(RwLock::new(tenant_conf)), timelines: Mutex::new(HashMap::new()), gc_cs: Mutex::new(()), walredo_mgr, @@ -587,6 +662,71 @@ impl LayeredRepository { } } + /// Locate and load config + pub fn load_tenant_config( + conf: &'static PageServerConf, + tenantid: ZTenantId, + ) -> anyhow::Result { + let target_config_path = TenantConf::path(conf, tenantid); + + info!("load tenantconf from {}", target_config_path.display()); + + // FIXME If the config file is not found, assume that we're attaching + // a detached tenant and config is passed via attach command. + // https://github.com/neondatabase/neon/issues/1555 + if !target_config_path.exists() { + info!( + "Zenith tenant config is not found in {}", + target_config_path.display() + ); + return Ok(Default::default()); + } + + // load and parse file + let config = fs::read_to_string(target_config_path)?; + + let toml = config.parse::()?; + + let mut tenant_conf: TenantConfOpt = Default::default(); + for (key, item) in toml.iter() { + match key { + "tenant_conf" => { + tenant_conf = PageServerConf::parse_toml_tenant_conf(item)?; + } + _ => bail!("unrecognized pageserver option '{}'", key), + } + } + + Ok(tenant_conf) + } + + pub fn persist_tenant_config( + conf: &'static PageServerConf, + tenantid: ZTenantId, + tenant_conf: TenantConfOpt, + ) -> anyhow::Result<()> { + let _enter = info_span!("saving tenantconf").entered(); + let target_config_path = TenantConf::path(conf, tenantid); + info!("save tenantconf to {}", target_config_path.display()); + + let mut conf_content = r#"# This file contains a specific per-tenant's config. +# It is read in case of pageserver restart. + +# [tenant_config] +"# + .to_string(); + + // Convert the config to a toml file. + conf_content += &toml_edit::easy::to_string(&tenant_conf)?; + + fs::write(&target_config_path, conf_content).with_context(|| { + format!( + "Failed to write config file into path '{}'", + target_config_path.display() + ) + }) + } + /// Save timeline metadata to file fn save_metadata( conf: &'static PageServerConf, @@ -662,6 +802,7 @@ impl LayeredRepository { &self, target_timelineid: Option, horizon: u64, + pitr: Duration, checkpoint_before_gc: bool, ) -> Result { let _span_guard = @@ -738,7 +879,7 @@ impl LayeredRepository { timeline.checkpoint(CheckpointConfig::Forced)?; info!("timeline {} checkpoint_before_gc done", timelineid); } - timeline.update_gc_info(branchpoints, cutoff); + timeline.update_gc_info(branchpoints, cutoff, pitr); let result = timeline.gc()?; totals += result; @@ -753,6 +894,7 @@ impl LayeredRepository { pub struct LayeredTimeline { conf: &'static PageServerConf, + tenant_conf: Arc>, tenantid: ZTenantId, timelineid: ZTimelineId, @@ -857,6 +999,11 @@ struct GcInfo { /// /// FIXME: is this inclusive or exclusive? 
cutoff: Lsn, + + /// In addition to 'retain_lsns', keep everything newer than 'SystemTime::now()' + /// minus 'pitr_interval' + /// + pitr: Duration, } /// Public interface functions @@ -987,12 +1134,34 @@ impl Timeline for LayeredTimeline { } impl LayeredTimeline { + fn get_checkpoint_distance(&self) -> u64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .checkpoint_distance + .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) + } + + fn get_compaction_target_size(&self) -> u64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .compaction_target_size + .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) + } + + fn get_compaction_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .compaction_threshold + .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) + } + /// Open a Timeline handle. /// /// Loads the metadata for the timeline into memory, but not the layer map. #[allow(clippy::too_many_arguments)] fn new( conf: &'static PageServerConf, + tenant_conf: Arc>, metadata: TimelineMetadata, ancestor: Option, timelineid: ZTimelineId, @@ -1036,6 +1205,7 @@ impl LayeredTimeline { LayeredTimeline { conf, + tenant_conf, timelineid, tenantid, layers: RwLock::new(LayerMap::default()), @@ -1071,6 +1241,7 @@ impl LayeredTimeline { gc_info: RwLock::new(GcInfo { retain_lsns: Vec::new(), cutoff: Lsn(0), + pitr: Duration::ZERO, }), latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), @@ -1431,7 +1602,7 @@ impl LayeredTimeline { let last_lsn = self.get_last_record_lsn(); let distance = last_lsn.widening_sub(self.last_freeze_at.load()); - if distance >= self.conf.checkpoint_distance.into() { + if distance >= self.get_checkpoint_distance().into() { self.freeze_inmem_layer(true); self.last_freeze_at.store(last_lsn); } @@ -1640,13 +1811,15 @@ impl LayeredTimeline { // above. Rewrite it. let _compaction_cs = self.compaction_cs.lock().unwrap(); - let target_file_size = self.conf.checkpoint_distance; + let target_file_size = self.get_checkpoint_distance(); // Define partitioning schema if needed if let Ok(pgdir) = tenant_mgr::get_timeline_for_tenant_load(self.tenantid, self.timelineid) { - let (partitioning, lsn) = - pgdir.repartition(self.get_last_record_lsn(), self.conf.compaction_target_size)?; + let (partitioning, lsn) = pgdir.repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + )?; let timer = self.create_images_time_histo.start_timer(); // 2. Create new image layers for partitions that have been modified // "enough". @@ -1747,7 +1920,7 @@ impl LayeredTimeline { // We compact or "shuffle" the level-0 delta layers when they've // accumulated over the compaction threshold. - if level0_deltas.len() < self.conf.compaction_threshold { + if level0_deltas.len() < self.get_compaction_threshold() { return Ok(()); } drop(layers); @@ -1870,10 +2043,11 @@ impl LayeredTimeline { /// the latest LSN subtracted by a constant, and doesn't do anything smart /// to figure out what read-only nodes might actually need.) /// - fn update_gc_info(&self, retain_lsns: Vec, cutoff: Lsn) { + fn update_gc_info(&self, retain_lsns: Vec, cutoff: Lsn, pitr: Duration) { let mut gc_info = self.gc_info.write().unwrap(); gc_info.retain_lsns = retain_lsns; gc_info.cutoff = cutoff; + gc_info.pitr = pitr; } /// @@ -1884,7 +2058,7 @@ impl LayeredTimeline { /// obsolete. 
/// fn gc(&self) -> Result { - let now = Instant::now(); + let now = SystemTime::now(); let mut result: GcResult = Default::default(); let disk_consistent_lsn = self.get_disk_consistent_lsn(); @@ -1893,6 +2067,7 @@ impl LayeredTimeline { let gc_info = self.gc_info.read().unwrap(); let retain_lsns = &gc_info.retain_lsns; let cutoff = gc_info.cutoff; + let pitr = gc_info.pitr; let _enter = info_span!("garbage collection", timeline = %self.timelineid, tenant = %self.tenantid, cutoff = %cutoff).entered(); @@ -1910,8 +2085,9 @@ impl LayeredTimeline { // // Garbage collect the layer if all conditions are satisfied: // 1. it is older than cutoff LSN; - // 2. it doesn't need to be retained for 'retain_lsns'; - // 3. newer on-disk image layers cover the layer's whole key range + // 2. it is older than PITR interval; + // 3. it doesn't need to be retained for 'retain_lsns'; + // 4. newer on-disk image layers cover the layer's whole key range // let mut layers = self.layers.write().unwrap(); 'outer: for l in layers.iter_historic_layers() { @@ -1937,8 +2113,31 @@ impl LayeredTimeline { result.layers_needed_by_cutoff += 1; continue 'outer; } - - // 2. Is it needed by a child branch? + // 2. It is newer than PiTR interval? + // We use modification time of layer file to estimate update time. + // This estimation is not quite precise but maintaining LSN->timestamp map seems to be overkill. + // It is not expected that users will need high precision here. And this estimation + // is conservative: modification time of file is always newer than actual time of version + // creation. So it is safe for users. + // TODO A possible "bloat" issue still persists here. + // If modification time changes because of layer upload/download, we will keep these files + // longer than necessary. + // https://github.com/neondatabase/neon/issues/1554 + // + if let Ok(metadata) = fs::metadata(&l.filename()) { + let last_modified = metadata.modified()?; + if now.duration_since(last_modified)? < pitr { + debug!( + "keeping {} because it's modification time {:?} is newer than PITR {:?}", + l.filename().display(), + last_modified, + pitr + ); + result.layers_needed_by_pitr += 1; + continue 'outer; + } + } + // 3. Is it needed by a child branch? // NOTE With that wee would keep data that // might be referenced by child branches forever. // We can track this in child timeline GC and delete parent layers when @@ -1957,7 +2156,7 @@ impl LayeredTimeline { } } - // 3. Is there a later on-disk layer for this relation? + // 4. Is there a later on-disk layer for this relation? // // The end-LSN is exclusive, while disk_consistent_lsn is // inclusive. 
For example, if disk_consistent_lsn is 100, it is @@ -1998,7 +2197,7 @@ impl LayeredTimeline { result.layers_removed += 1; } - result.elapsed = now.elapsed(); + result.elapsed = now.elapsed()?; Ok(result) } @@ -2275,7 +2474,8 @@ pub mod tests { } let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff); + + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO); tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; tline.gc()?; @@ -2345,7 +2545,7 @@ pub mod tests { // Perform a cycle of checkpoint, compaction, and GC println!("checkpointing {}", lsn); let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff); + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO); tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; tline.gc()?; @@ -2422,7 +2622,7 @@ pub mod tests { // Perform a cycle of checkpoint, compaction, and GC println!("checkpointing {}", lsn); let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff); + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO); tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; tline.gc()?; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index a761f0dfe2..94219c7840 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -11,6 +11,7 @@ pub mod profiling; pub mod reltag; pub mod remote_storage; pub mod repository; +pub mod tenant_config; pub mod tenant_mgr; pub mod tenant_threads; pub mod thread_mgr; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8c90195131..58d617448a 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -19,6 +19,7 @@ use std::net::TcpListener; use std::str; use std::str::FromStr; use std::sync::{Arc, RwLockReadGuard}; +use std::time::Duration; use tracing::*; use utils::{ auth::{self, Claims, JwtAuth, Scope}, @@ -676,6 +677,37 @@ impl postgres_backend::Handler for PageServerHandler { } } pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + } else if query_string.starts_with("show ") { + // show + let (_, params_raw) = query_string.split_at("show ".len()); + let params = params_raw.split(' ').collect::>(); + ensure!(params.len() == 1, "invalid param number for config command"); + let tenantid = ZTenantId::from_str(params[0])?; + let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + pgb.write_message_noflush(&BeMessage::RowDescription(&[ + RowDescriptor::int8_col(b"checkpoint_distance"), + RowDescriptor::int8_col(b"compaction_target_size"), + RowDescriptor::int8_col(b"compaction_period"), + RowDescriptor::int8_col(b"compaction_threshold"), + RowDescriptor::int8_col(b"gc_horizon"), + RowDescriptor::int8_col(b"gc_period"), + RowDescriptor::int8_col(b"pitr_interval"), + ]))? + .write_message_noflush(&BeMessage::DataRow(&[ + Some(repo.get_checkpoint_distance().to_string().as_bytes()), + Some(repo.get_compaction_target_size().to_string().as_bytes()), + Some( + repo.get_compaction_period() + .as_secs() + .to_string() + .as_bytes(), + ), + Some(repo.get_compaction_threshold().to_string().as_bytes()), + Some(repo.get_gc_horizon().to_string().as_bytes()), + Some(repo.get_gc_period().as_secs().to_string().as_bytes()), + Some(repo.get_pitr_interval().as_secs().to_string().as_bytes()), + ]))? + .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("do_gc ") { // Run GC immediately on given timeline. // FIXME: This is just for tests. See test_runner/batch_others/test_gc.py. 
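The PITR condition added to gc() above keeps a layer whenever its file modification time falls inside the pitr window, accepting the imprecision discussed in the comment. The same decision in isolation, as a small self-contained program (the temporary file name and the one-hour window are only for illustration):

use std::fs;
use std::path::Path;
use std::time::{Duration, SystemTime};

/// Returns true if the file at `path` was modified within the last `pitr`
/// and therefore should be kept, mirroring the mtime-based estimate in gc().
fn needed_by_pitr(path: &Path, pitr: Duration) -> std::io::Result<bool> {
    let modified = fs::metadata(path)?.modified()?;
    // If the system clock moved backwards, treat the age as zero,
    // i.e. keep the file while any non-zero PITR window is set.
    let age = SystemTime::now()
        .duration_since(modified)
        .unwrap_or(Duration::ZERO);
    Ok(age < pitr)
}

fn main() -> std::io::Result<()> {
    let path = std::env::temp_dir().join("pitr_demo_layer");
    fs::write(&path, b"fake layer file")?;

    // Just written, so it is still inside a one-hour PITR window...
    assert!(needed_by_pitr(&path, Duration::from_secs(3600))?);
    // ...but not inside a zero-length window, so gc could drop it.
    assert!(!needed_by_pitr(&path, Duration::ZERO)?);

    fs::remove_file(&path)?;
    Ok(())
}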
@@ -693,16 +725,20 @@ impl postgres_backend::Handler for PageServerHandler { let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + + let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + let gc_horizon: u64 = caps .get(4) .map(|h| h.as_str().parse()) - .unwrap_or(Ok(self.conf.gc_horizon))?; + .unwrap_or_else(|| Ok(repo.get_gc_horizon()))?; let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?; + let result = repo.gc_iteration(Some(timelineid), gc_horizon, Duration::ZERO, true)?; pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"layers_total"), RowDescriptor::int8_col(b"layers_needed_by_cutoff"), + RowDescriptor::int8_col(b"layers_needed_by_pitr"), RowDescriptor::int8_col(b"layers_needed_by_branches"), RowDescriptor::int8_col(b"layers_not_updated"), RowDescriptor::int8_col(b"layers_removed"), @@ -711,6 +747,7 @@ impl postgres_backend::Handler for PageServerHandler { .write_message_noflush(&BeMessage::DataRow(&[ Some(result.layers_total.to_string().as_bytes()), Some(result.layers_needed_by_cutoff.to_string().as_bytes()), + Some(result.layers_needed_by_pitr.to_string().as_bytes()), Some(result.layers_needed_by_branches.to_string().as_bytes()), Some(result.layers_not_updated.to_string().as_bytes()), Some(result.layers_removed.to_string().as_bytes()), diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index fc438cce9c..f7c2f036a6 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -249,6 +249,7 @@ pub trait Repository: Send + Sync { &self, timelineid: Option, horizon: u64, + pitr: Duration, checkpoint_before_gc: bool, ) -> Result; @@ -305,6 +306,7 @@ impl<'a, T> From<&'a RepositoryTimeline> for LocalTimelineState { pub struct GcResult { pub layers_total: u64, pub layers_needed_by_cutoff: u64, + pub layers_needed_by_pitr: u64, pub layers_needed_by_branches: u64, pub layers_not_updated: u64, pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. @@ -315,6 +317,7 @@ pub struct GcResult { impl AddAssign for GcResult { fn add_assign(&mut self, other: Self) { self.layers_total += other.layers_total; + self.layers_needed_by_pitr += other.layers_needed_by_pitr; self.layers_needed_by_cutoff += other.layers_needed_by_cutoff; self.layers_needed_by_branches += other.layers_needed_by_branches; self.layers_not_updated += other.layers_not_updated; @@ -432,6 +435,7 @@ pub mod repo_harness { }; use super::*; + use crate::tenant_config::{TenantConf, TenantConfOpt}; use hex_literal::hex; use utils::zid::ZTenantId; @@ -454,8 +458,23 @@ pub mod repo_harness { static ref LOCK: RwLock<()> = RwLock::new(()); } + impl From for TenantConfOpt { + fn from(tenant_conf: TenantConf) -> Self { + Self { + checkpoint_distance: Some(tenant_conf.checkpoint_distance), + compaction_target_size: Some(tenant_conf.compaction_target_size), + compaction_period: Some(tenant_conf.compaction_period), + compaction_threshold: Some(tenant_conf.compaction_threshold), + gc_horizon: Some(tenant_conf.gc_horizon), + gc_period: Some(tenant_conf.gc_period), + pitr_interval: Some(tenant_conf.pitr_interval), + } + } + } + pub struct RepoHarness<'a> { pub conf: &'static PageServerConf, + pub tenant_conf: TenantConf, pub tenant_id: ZTenantId, pub lock_guard: ( @@ -487,12 +506,15 @@ pub mod repo_harness { // OK in a test. 
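GcResult above gained a layers_needed_by_pitr counter, and its AddAssign impl is what lets gc_iteration_internal() fold one result per timeline into the totals it returns. A reduced sketch of that accumulation, with a hypothetical two-field struct standing in for the real GcResult:

use std::ops::AddAssign;

// Hypothetical, trimmed-down stand-in for GcResult: just two of its counters.
#[derive(Debug, Default, Clone, Copy)]
struct GcTotals {
    layers_needed_by_pitr: u64,
    layers_removed: u64,
}

impl AddAssign for GcTotals {
    fn add_assign(&mut self, other: Self) {
        self.layers_needed_by_pitr += other.layers_needed_by_pitr;
        self.layers_removed += other.layers_removed;
    }
}

fn main() {
    // One result per timeline, folded into a grand total.
    let per_timeline = [
        GcTotals { layers_needed_by_pitr: 3, layers_removed: 1 },
        GcTotals { layers_needed_by_pitr: 0, layers_removed: 4 },
    ];
    let mut totals = GcTotals::default();
    for r in per_timeline {
        totals += r;
    }
    assert_eq!(totals.layers_needed_by_pitr, 3);
    assert_eq!(totals.layers_removed, 5);
}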
let conf: &'static PageServerConf = Box::leak(Box::new(conf)); + let tenant_conf = TenantConf::dummy_conf(); + let tenant_id = ZTenantId::generate(); fs::create_dir_all(conf.tenant_path(&tenant_id))?; fs::create_dir_all(conf.timelines_path(&tenant_id))?; Ok(Self { conf, + tenant_conf, tenant_id, lock_guard, }) @@ -507,6 +529,7 @@ pub mod repo_harness { let repo = LayeredRepository::new( self.conf, + TenantConfOpt::from(self.tenant_conf), walredo_mgr, self.tenant_id, RemoteIndex::empty(), @@ -722,7 +745,7 @@ mod tests { // FIXME: this doesn't actually remove any layer currently, given how the checkpointing // and compaction works. But it does set the 'cutoff' point so that the cross check // below should fail. - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; + repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; // try to branch at lsn 25, should fail because we already garbage collected the data match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { @@ -773,7 +796,7 @@ mod tests { let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; + repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); match tline.get(*TEST_KEY, Lsn(0x25)) { @@ -796,7 +819,7 @@ mod tests { .get_timeline_load(NEW_TIMELINE_ID) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; + repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); Ok(()) @@ -815,7 +838,7 @@ mod tests { make_some_layers(newtline.as_ref(), Lsn(0x60))?; // run gc on parent - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; + repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; // Check that the data is still accessible on the branch. assert_eq!( diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs new file mode 100644 index 0000000000..818b6de1b1 --- /dev/null +++ b/pageserver/src/tenant_config.rs @@ -0,0 +1,162 @@ +//! Functions for handling per-tenant configuration options +//! +//! If tenant is created with --config option, +//! the tenant-specific config will be stored in tenant's directory. +//! Otherwise, global pageserver's config is used. +//! +//! If the tenant config file is corrupted, the tenant will be disabled. +//! We cannot use global or default config instead, because wrong settings +//! may lead to a data loss. +//! +use crate::config::PageServerConf; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; +use std::time::Duration; +use utils::zid::ZTenantId; + +pub const TENANT_CONFIG_NAME: &str = "config"; + +pub mod defaults { + // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB + // would be more appropriate. But a low value forces the code to be exercised more, + // which is good for now to trigger bugs. + // This parameter actually determines L0 layer file size. + pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; + + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. 
+ pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; + + pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s"; + pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + + pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; + pub const DEFAULT_GC_PERIOD: &str = "100 s"; + pub const DEFAULT_PITR_INTERVAL: &str = "30 days"; +} + +/// Per-tenant configuration options +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct TenantConf { + // Flush out an inmemory layer, if it's holding WAL older than this + // This puts a backstop on how much WAL needs to be re-digested if the + // page server crashes. + // This parameter actually determines L0 layer file size. + pub checkpoint_distance: u64, + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. + pub compaction_target_size: u64, + // How often to check if there's compaction work to be done. + pub compaction_period: Duration, + // Level0 delta layer threshold for compaction. + pub compaction_threshold: usize, + // Determines how much history is retained, to allow + // branching and read replicas at an older point in time. + // The unit is #of bytes of WAL. + // Page versions older than this are garbage collected away. + pub gc_horizon: u64, + // Interval at which garbage collection is triggered. + pub gc_period: Duration, + // Determines how much history is retained, to allow + // branching and read replicas at an older point in time. + // The unit is time. + // Page versions older than this are garbage collected away. + pub pitr_interval: Duration, +} + +/// Same as TenantConf, but this struct preserves the information about +/// which parameters are set and which are not. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +pub struct TenantConfOpt { + pub checkpoint_distance: Option, + pub compaction_target_size: Option, + pub compaction_period: Option, + pub compaction_threshold: Option, + pub gc_horizon: Option, + pub gc_period: Option, + pub pitr_interval: Option, +} + +impl TenantConfOpt { + pub fn merge(&self, global_conf: TenantConf) -> TenantConf { + TenantConf { + checkpoint_distance: self + .checkpoint_distance + .unwrap_or(global_conf.checkpoint_distance), + compaction_target_size: self + .compaction_target_size + .unwrap_or(global_conf.compaction_target_size), + compaction_period: self + .compaction_period + .unwrap_or(global_conf.compaction_period), + compaction_threshold: self + .compaction_threshold + .unwrap_or(global_conf.compaction_threshold), + gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), + gc_period: self.gc_period.unwrap_or(global_conf.gc_period), + pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval), + } + } + + pub fn update(&mut self, other: &TenantConfOpt) { + if let Some(checkpoint_distance) = other.checkpoint_distance { + self.checkpoint_distance = Some(checkpoint_distance); + } + if let Some(compaction_target_size) = other.compaction_target_size { + self.compaction_target_size = Some(compaction_target_size); + } + if let Some(compaction_period) = other.compaction_period { + self.compaction_period = Some(compaction_period); + } + if let Some(compaction_threshold) = other.compaction_threshold { + self.compaction_threshold = Some(compaction_threshold); + } + if let Some(gc_horizon) = other.gc_horizon { + self.gc_horizon = Some(gc_horizon); + } + if let Some(gc_period) = other.gc_period { + self.gc_period = Some(gc_period); + } + if let Some(pitr_interval) = 
other.pitr_interval { + self.pitr_interval = Some(pitr_interval); + } + } +} + +impl TenantConf { + pub fn default() -> TenantConf { + use defaults::*; + + TenantConf { + checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, + compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE, + compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) + .expect("cannot parse default compaction period"), + compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + gc_horizon: DEFAULT_GC_HORIZON, + gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) + .expect("cannot parse default gc period"), + pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) + .expect("cannot parse default PITR interval"), + } + } + + /// Points to a place in pageserver's local directory, + /// where certain tenant's tenantconf file should be located. + pub fn path(conf: &'static PageServerConf, tenantid: ZTenantId) -> PathBuf { + conf.tenant_path(&tenantid).join(TENANT_CONFIG_NAME) + } + + #[cfg(test)] + pub fn dummy_conf() -> Self { + TenantConf { + checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, + compaction_target_size: 4 * 1024 * 1024, + compaction_period: Duration::from_secs(10), + compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD, + gc_horizon: defaults::DEFAULT_GC_HORIZON, + gc_period: Duration::from_secs(10), + pitr_interval: Duration::from_secs(60 * 60), + } + } +} diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 33bb4dc2e0..8a69062dba 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -5,6 +5,7 @@ use crate::config::PageServerConf; use crate::layered_repository::LayeredRepository; use crate::remote_storage::RemoteIndex; use crate::repository::{Repository, TimelineSyncStatusUpdate}; +use crate::tenant_config::TenantConfOpt; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; use crate::timelines; @@ -63,13 +64,13 @@ fn access_tenants() -> MutexGuard<'static, HashMap> { TENANTS.lock().unwrap() } -// Sets up wal redo manager and repository for tenant. Reduces code duplocation. +// Sets up wal redo manager and repository for tenant. Reduces code duplication. // Used during pageserver startup, or when new tenant is attached to pageserver. pub fn load_local_repo( conf: &'static PageServerConf, tenant_id: ZTenantId, remote_index: &RemoteIndex, -) -> Arc { +) -> Result> { let mut m = access_tenants(); let tenant = m.entry(tenant_id).or_insert_with(|| { // Set up a WAL redo manager, for applying WAL records. @@ -78,6 +79,7 @@ pub fn load_local_repo( // Set up an object repository, for actual data storage. let repo: Arc = Arc::new(LayeredRepository::new( conf, + Default::default(), Arc::new(walredo_mgr), tenant_id, remote_index.clone(), @@ -89,7 +91,12 @@ pub fn load_local_repo( timelines: HashMap::new(), } }); - Arc::clone(&tenant.repo) + + // Restore tenant config + let tenant_conf = LayeredRepository::load_tenant_config(conf, tenant_id)?; + tenant.repo.update_tenant_config(tenant_conf)?; + + Ok(Arc::clone(&tenant.repo)) } /// Updates tenants' repositories, changing their timelines state in memory. 
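TenantConfOpt::update above only overwrites the fields that are Some in the incoming value; this is what allows a /v1/tenant/config request, or the config restored from disk in load_local_repo, to change a subset of settings without clearing the rest. A reduced sketch of that partial-overwrite behaviour, with two made-up fields instead of the real seven:

// Hypothetical, simplified stand-in for TenantConfOpt.
#[derive(Debug, Default, Clone, Copy)]
struct PartialConf {
    gc_horizon: Option<u64>,
    compaction_threshold: Option<usize>,
}

impl PartialConf {
    // Only fields present in `other` replace the stored ones.
    fn update(&mut self, other: &PartialConf) {
        if let Some(v) = other.gc_horizon {
            self.gc_horizon = Some(v);
        }
        if let Some(v) = other.compaction_threshold {
            self.compaction_threshold = Some(v);
        }
    }
}

fn main() {
    let mut stored = PartialConf {
        gc_horizon: Some(1024),
        compaction_threshold: Some(20),
    };
    // A request that only touches gc_horizon...
    stored.update(&PartialConf {
        gc_horizon: Some(256),
        ..Default::default()
    });
    // ...leaves the other override intact.
    assert_eq!(stored.gc_horizon, Some(256));
    assert_eq!(stored.compaction_threshold, Some(20));
}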
@@ -109,7 +116,16 @@ pub fn apply_timeline_sync_status_updates( trace!("Sync status updates: {:?}", sync_status_updates); for (tenant_id, tenant_timelines_sync_status_updates) in sync_status_updates { - let repo = load_local_repo(conf, tenant_id, remote_index); + let repo = match load_local_repo(conf, tenant_id, remote_index) { + Ok(repo) => repo, + Err(e) => { + error!( + "Failed to load repo for tenant {} Error: {:#}", + tenant_id, e + ); + continue; + } + }; for (timeline_id, timeline_sync_status_update) in tenant_timelines_sync_status_updates { match repo.apply_timeline_remote_sync_status_update(timeline_id, timeline_sync_status_update) @@ -174,6 +190,7 @@ pub fn shutdown_all_tenants() { pub fn create_tenant_repository( conf: &'static PageServerConf, + tenant_conf: TenantConfOpt, tenantid: ZTenantId, remote_index: RemoteIndex, ) -> Result> { @@ -186,6 +203,7 @@ pub fn create_tenant_repository( let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid)); let repo = timelines::create_repo( conf, + tenant_conf, tenantid, CreateRepo::Real { wal_redo_manager, @@ -202,6 +220,14 @@ pub fn create_tenant_repository( } } +pub fn update_tenant_config(tenant_conf: TenantConfOpt, tenantid: ZTenantId) -> Result<()> { + info!("configuring tenant {}", tenantid); + let repo = get_repository_for_tenant(tenantid)?; + + repo.update_tenant_config(tenant_conf)?; + Ok(()) +} + pub fn get_tenant_state(tenantid: ZTenantId) -> Option { Some(access_tenants().get(&tenantid)?.state) } @@ -210,7 +236,7 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option { /// Change the state of a tenant to Active and launch its compactor and GC /// threads. If the tenant was already in Active state or Stopping, does nothing. /// -pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Result<()> { +pub fn activate_tenant(tenant_id: ZTenantId) -> Result<()> { let mut m = access_tenants(); let tenant = m .get_mut(&tenant_id) @@ -230,7 +256,7 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> R None, "Compactor thread", true, - move || crate::tenant_threads::compact_loop(tenant_id, conf), + move || crate::tenant_threads::compact_loop(tenant_id), )?; let gc_spawn_result = thread_mgr::spawn( @@ -239,7 +265,7 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> R None, "GC thread", true, - move || crate::tenant_threads::gc_loop(tenant_id, conf), + move || crate::tenant_threads::gc_loop(tenant_id), ) .with_context(|| format!("Failed to launch GC thread for tenant {}", tenant_id)); @@ -251,7 +277,6 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> R thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None); return gc_spawn_result; } - tenant.state = TenantState::Active; } @@ -290,7 +315,7 @@ pub fn get_timeline_for_tenant_load( .get_timeline_load(timelineid) .with_context(|| format!("Timeline {} not found for tenant {}", timelineid, tenantid))?; - let repartition_distance = tenant.repo.conf.checkpoint_distance / 10; + let repartition_distance = tenant.repo.get_checkpoint_distance() / 10; let page_tline = Arc::new(DatadirTimelineImpl::new(tline, repartition_distance)); page_tline.init_logical_size()?; diff --git a/pageserver/src/tenant_threads.rs b/pageserver/src/tenant_threads.rs index 4dcc15f817..b904d9040d 100644 --- a/pageserver/src/tenant_threads.rs +++ b/pageserver/src/tenant_threads.rs @@ -1,6 +1,5 @@ //! 
This module contains functions to serve per-tenant background processes, //! such as compaction and GC -use crate::config::PageServerConf; use crate::repository::Repository; use crate::tenant_mgr; use crate::tenant_mgr::TenantState; @@ -12,8 +11,8 @@ use utils::zid::ZTenantId; /// /// Compaction thread's main loop /// -pub fn compact_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> { - if let Err(err) = compact_loop_ext(tenantid, conf) { +pub fn compact_loop(tenantid: ZTenantId) -> Result<()> { + if let Err(err) = compact_loop_ext(tenantid) { error!("compact loop terminated with error: {:?}", err); Err(err) } else { @@ -21,13 +20,15 @@ pub fn compact_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Resul } } -fn compact_loop_ext(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> { +fn compact_loop_ext(tenantid: ZTenantId) -> Result<()> { loop { if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { break; } + let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + let compaction_period = repo.get_compaction_period(); - std::thread::sleep(conf.compaction_period); + std::thread::sleep(compaction_period); trace!("compaction thread for tenant {} waking up", tenantid); // Compact timelines @@ -46,23 +47,23 @@ fn compact_loop_ext(tenantid: ZTenantId, conf: &'static PageServerConf) -> Resul /// /// GC thread's main loop /// -pub fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> { +pub fn gc_loop(tenantid: ZTenantId) -> Result<()> { loop { if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { break; } trace!("gc thread for tenant {} waking up", tenantid); - + let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + let gc_horizon = repo.get_gc_horizon(); // Garbage collect old files that are not needed for PITR anymore - if conf.gc_horizon > 0 { - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - repo.gc_iteration(None, conf.gc_horizon, false)?; + if gc_horizon > 0 { + repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?; } // TODO Write it in more adequate way using // condvar.wait_timeout() or something - let mut sleep_time = conf.gc_period.as_secs(); + let mut sleep_time = repo.get_gc_period().as_secs(); while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == Some(TenantState::Active) { sleep_time -= 1; diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index abbabc8b31..adc531e6bb 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -25,6 +25,7 @@ use crate::{ layered_repository::metadata::TimelineMetadata, remote_storage::RemoteIndex, repository::{LocalTimelineState, Repository}, + tenant_config::TenantConfOpt, DatadirTimeline, RepositoryImpl, }; use crate::{import_datadir, LOG_FILE_NAME}; @@ -151,8 +152,8 @@ pub fn init_pageserver( if let Some(tenant_id) = create_tenant { println!("initializing tenantid {}", tenant_id); - let repo = - create_repo(conf, tenant_id, CreateRepo::Dummy).context("failed to create repo")?; + let repo = create_repo(conf, Default::default(), tenant_id, CreateRepo::Dummy) + .context("failed to create repo")?; let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref()) .context("failed to create initial timeline")?; @@ -175,6 +176,7 @@ pub enum CreateRepo { pub fn create_repo( conf: &'static PageServerConf, + tenant_conf: TenantConfOpt, tenant_id: ZTenantId, create_repo: 
CreateRepo, ) -> Result> { @@ -211,8 +213,12 @@ pub fn create_repo( crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; info!("created directory structure in {}", repo_dir.display()); + // Save tenant's config + LayeredRepository::persist_tenant_config(conf, tenant_id, tenant_conf)?; + Ok(Arc::new(LayeredRepository::new( conf, + tenant_conf, wal_redo_manager, tenant_id, remote_index, diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index ce4e4d45fb..357aab7221 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -93,7 +93,7 @@ pub fn launch_wal_receiver( receivers.insert((tenantid, timelineid), receiver); // Update tenant state and start tenant threads, if they are not running yet. - tenant_mgr::activate_tenant(conf, tenantid)?; + tenant_mgr::activate_tenant(tenantid)?; } }; Ok(()) diff --git a/test_runner/batch_others/test_tenant_conf.py b/test_runner/batch_others/test_tenant_conf.py new file mode 100644 index 0000000000..f74e6aad1d --- /dev/null +++ b/test_runner/batch_others/test_tenant_conf.py @@ -0,0 +1,49 @@ +from contextlib import closing + +import pytest + +from fixtures.zenith_fixtures import ZenithEnvBuilder + + +def test_tenant_config(zenith_env_builder: ZenithEnvBuilder): + env = zenith_env_builder.init_start() + """Test per tenant configuration""" + tenant = env.zenith_cli.create_tenant( + conf={ + 'checkpoint_distance': '10000', + 'compaction_target_size': '1048576', + 'compaction_period': '60sec', + 'compaction_threshold': '20', + 'gc_horizon': '1024', + 'gc_period': '100sec', + 'pitr_interval': '3600sec', + }) + + env.zenith_cli.create_timeline(f'test_tenant_conf', tenant_id=tenant) + pg = env.postgres.create_start( + "test_tenant_conf", + "main", + tenant, + ) + + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor() as pscur: + pscur.execute(f"show {tenant.hex}") + assert pscur.fetchone() == (10000, 1048576, 60, 20, 1024, 100, 3600) + + # update the config and ensure that it has changed + env.zenith_cli.config_tenant(tenant_id=tenant, + conf={ + 'checkpoint_distance': '100000', + 'compaction_target_size': '1048576', + 'compaction_period': '30sec', + 'compaction_threshold': '15', + 'gc_horizon': '256', + 'gc_period': '10sec', + 'pitr_interval': '360sec', + }) + + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor() as pscur: + pscur.execute(f"show {tenant.hex}") + assert pscur.fetchone() == (100000, 1048576, 30, 15, 256, 10, 360) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 9a2d6cdc88..d295a79953 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -835,16 +835,35 @@ class ZenithCli: self.env = env pass - def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: + def create_tenant(self, + tenant_id: Optional[uuid.UUID] = None, + conf: Optional[Dict[str, str]] = None) -> uuid.UUID: """ Creates a new tenant, returns its id and its initial timeline's id. 
""" if tenant_id is None: tenant_id = uuid.uuid4() - res = self.raw_cli(['tenant', 'create', '--tenant-id', tenant_id.hex]) + if conf is None: + res = self.raw_cli(['tenant', 'create', '--tenant-id', tenant_id.hex]) + else: + res = self.raw_cli( + ['tenant', 'create', '--tenant-id', tenant_id.hex] + + sum(list(map(lambda kv: (['-c', kv[0] + ':' + kv[1]]), conf.items())), [])) res.check_returncode() return tenant_id + def config_tenant(self, tenant_id: uuid.UUID, conf: Dict[str, str]): + """ + Update tenant config. + """ + if conf is None: + res = self.raw_cli(['tenant', 'config', '--tenant-id', tenant_id.hex]) + else: + res = self.raw_cli( + ['tenant', 'config', '--tenant-id', tenant_id.hex] + + sum(list(map(lambda kv: (['-c', kv[0] + ':' + kv[1]]), conf.items())), [])) + res.check_returncode() + def list_tenants(self) -> 'subprocess.CompletedProcess[str]': res = self.raw_cli(['tenant', 'list']) res.check_returncode() diff --git a/zenith/src/main.rs b/zenith/src/main.rs index afbbbe395b..cd0cf470e8 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -166,7 +166,12 @@ fn main() -> Result<()> { .subcommand(App::new("create") .arg(tenant_id_arg.clone()) .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) - ) + .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) + ) + .subcommand(App::new("config") + .arg(tenant_id_arg.clone()) + .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) + ) ) .subcommand( App::new("pageserver") @@ -523,8 +528,12 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re } Some(("create", create_match)) => { let initial_tenant_id = parse_tenant_id(create_match)?; + let tenant_conf: HashMap<_, _> = create_match + .values_of("config") + .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) + .unwrap_or_default(); let new_tenant_id = pageserver - .tenant_create(initial_tenant_id)? + .tenant_create(initial_tenant_id, tenant_conf)? .ok_or_else(|| { anyhow!("Tenant with id {:?} was already created", initial_tenant_id) })?; @@ -533,6 +542,27 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re new_tenant_id ); } + Some(("config", create_match)) => { + let tenant_id = get_tenant_id(create_match, env)?; + let tenant_conf: HashMap<_, _> = create_match + .values_of("config") + .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) + .unwrap_or_default(); + + pageserver + .tenant_config(tenant_id, tenant_conf) + .unwrap_or_else(|e| { + anyhow!( + "Tenant config failed for tenant with id {} : {}", + tenant_id, + e + ); + }); + println!( + "tenant {} successfully configured on the pageserver", + tenant_id + ); + } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), None => bail!("no tenant subcommand provided"), } From d3f356e7a81464b3dcf5a5076d7bb8ef4ca30ff6 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 22 Apr 2022 17:31:58 +0300 Subject: [PATCH 0183/1022] Update `rust-postgres` project-wide (#1525) * Update `rust-postgres` project-wide This commit points to https://github.com/neondatabase/rust-postgres/commits/neon in order to test our patches on top of the latest version of this crate. 
* [proxy] Update `hmac` and `sha2` --- Cargo.lock | 196 ++++++++++++++++++++++++++++++--------- Cargo.toml | 2 +- compute_tools/Cargo.toml | 4 +- control_plane/Cargo.toml | 2 +- libs/utils/Cargo.toml | 4 +- pageserver/Cargo.toml | 8 +- proxy/Cargo.toml | 8 +- proxy/src/scram.rs | 4 +- safekeeper/Cargo.toml | 6 +- zenith/Cargo.toml | 2 +- 10 files changed, 170 insertions(+), 66 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3ca3671207..978cd20d12 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -181,6 +181,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block-buffer" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf7fe51849ea569fd452f37822f606a5cabb684dc918707a0193fd4664ff324" +dependencies = [ + "generic-array", +] + [[package]] name = "boxfnonce" version = "0.1.1" @@ -518,13 +527,13 @@ dependencies = [ ] [[package]] -name = "crypto-mac" -version = "0.10.1" +name = "crypto-common" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff07008ec701e8028e2ceb8f83f0e4274ee62bd2dbdc4fefff2e9a91824081a" +checksum = "57952ca27b5e3606ff4dd79b0020231aaf9d6aa76dc05fd30137538c50bd3ce8" dependencies = [ "generic-array", - "subtle", + "typenum", ] [[package]] @@ -622,6 +631,17 @@ dependencies = [ "generic-array", ] +[[package]] +name = "digest" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2fb860ca6fafa5552fb6d0e816a69c8e49f0908bf524e30a90d97c85892d506" +dependencies = [ + "block-buffer 0.10.2", + "crypto-common", + "subtle", +] + [[package]] name = "dirs-next" version = "2.0.0" @@ -994,24 +1014,23 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ebdb29d2ea9ed0083cd8cece49bbd968021bd99b0849edb4a9a7ee0fdf6a4e0" -[[package]] -name = "hmac" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1441c6b1e930e2817404b5046f1f989899143a12bf92de603b69f4e0aee1e15" -dependencies = [ - "crypto-mac 0.10.1", - "digest", -] - [[package]] name = "hmac" version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a2a2320eb7ec0ebe8da8f744d7812d9fc4cb4d09344ac01898dbcb6a20ae69b" dependencies = [ - "crypto-mac 0.11.1", - "digest", + "crypto-mac", + "digest 0.9.0", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest 0.10.3", ] [[package]] @@ -1297,11 +1316,20 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b5a279bb9607f9f53c22d496eade00d138d1bdcccd07d74650387cf94942a15" dependencies = [ - "block-buffer", - "digest", + "block-buffer 0.9.0", + "digest 0.9.0", "opaque-debug", ] +[[package]] +name = "md-5" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "658646b21e0b72f7866c7038ab086d3d5e1cd6271f060fd37defb241949d0582" +dependencies = [ + "digest 0.10.3", +] + [[package]] name = "md5" version = "0.7.0" @@ -1640,7 +1668,17 @@ checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" dependencies = [ "instant", "lock_api", - "parking_lot_core", + "parking_lot_core 0.8.5", +] + +[[package]] +name = "parking_lot" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"87f5ec2493a61ac0506c0f4199f99070cbe83857b0337006a30f3e6719b8ef58" +dependencies = [ + "lock_api", + "parking_lot_core 0.9.2", ] [[package]] @@ -1657,6 +1695,19 @@ dependencies = [ "winapi", ] +[[package]] +name = "parking_lot_core" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "995f667a6c822200b0433ac218e05582f0e2efa1b922a3fd2fbaadc5f87bab37" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-sys", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -1690,18 +1741,18 @@ dependencies = [ [[package]] name = "phf" -version = "0.8.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" dependencies = [ "phf_shared", ] [[package]] name = "phf_shared" -version = "0.8.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" dependencies = [ "siphasher", ] @@ -1774,40 +1825,39 @@ dependencies = [ [[package]] name = "postgres" -version = "0.19.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" +version = "0.19.2" +source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "bytes", "fallible-iterator", "futures", "log", - "postgres-protocol", "tokio", "tokio-postgres", ] [[package]] name = "postgres-protocol" -version = "0.6.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" +version = "0.6.4" +source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "base64", "byteorder", "bytes", "fallible-iterator", - "hmac 0.10.1", + "hmac 0.12.1", "lazy_static", - "md-5", + "md-5 0.10.1", "memchr", "rand", - "sha2", + "sha2 0.10.2", "stringprep", ] [[package]] name = "postgres-types" -version = "0.2.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" +version = "0.2.3" +source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "bytes", "fallible-iterator", @@ -1886,7 +1936,7 @@ dependencies = [ "fnv", "lazy_static", "memchr", - "parking_lot", + "parking_lot 0.11.2", "thiserror", ] @@ -1956,12 +2006,12 @@ dependencies = [ "futures", "hashbrown", "hex", - "hmac 0.10.1", + "hmac 0.12.1", "hyper", "lazy_static", "md5", "metrics", - "parking_lot", + "parking_lot 0.12.0", "pin-project-lite", "rand", "rcgen", @@ -1973,7 +2023,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", - "sha2", + "sha2 0.10.2", "socket2", "thiserror", "tokio", @@ -2295,20 +2345,20 @@ dependencies = [ "base64", "bytes", "chrono", - "digest", + "digest 0.9.0", "futures", "hex", "hmac 0.11.0", "http", "hyper", "log", - "md-5", + "md-5 0.9.1", "percent-encoding", "pin-project-lite", "rusoto_credential", "rustc_version", "serde", - "sha2", + "sha2 0.9.9", "tokio", ] 
@@ -2560,13 +2610,24 @@ version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" dependencies = [ - "block-buffer", + "block-buffer 0.9.0", "cfg-if", "cpufeatures", - "digest", + "digest 0.9.0", "opaque-debug", ] +[[package]] +name = "sha2" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55deaec60f81eefe3cce0dc50bda92d6d8e88f2a27df7c5033b42afeb1ed2676" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest 0.10.3", +] + [[package]] name = "sharded-slab" version = "0.1.4" @@ -2906,8 +2967,8 @@ dependencies = [ [[package]] name = "tokio-postgres" -version = "0.7.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" +version = "0.7.6" +source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "async-trait", "byteorder", @@ -2915,7 +2976,7 @@ dependencies = [ "fallible-iterator", "futures", "log", - "parking_lot", + "parking_lot 0.12.0", "percent-encoding", "phf", "pin-project-lite", @@ -2923,7 +2984,7 @@ dependencies = [ "postgres-types", "socket2", "tokio", - "tokio-util 0.6.9", + "tokio-util 0.7.0", ] [[package]] @@ -3460,6 +3521,49 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-sys" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5acdd78cb4ba54c0045ac14f62d8f94a03d10047904ae2a40afa1e99d8f70825" +dependencies = [ + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_msvc" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17cffbe740121affb56fad0fc0e421804adf0ae00891205213b5cecd30db881d" + +[[package]] +name = "windows_i686_gnu" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2564fde759adb79129d9b4f54be42b32c89970c18ebf93124ca8870a498688ed" + +[[package]] +name = "windows_i686_msvc" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cd9d32ba70453522332c14d38814bceeb747d80b3958676007acadd7e166956" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfce6deae227ee8d356d19effc141a509cc503dfd1f850622ec4b0f84428e1f4" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d19538ccc21819d01deaf88d6a17eae6596a12e9aafdbb97916fb49896d89de9" + [[package]] name = "winreg" version = "0.7.0" diff --git a/Cargo.toml b/Cargo.toml index 35c18ba237..3838637d37 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,4 +18,4 @@ debug = true # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. 
[patch.crates-io] -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } +tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 856ec45c73..42db763961 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -11,11 +11,11 @@ clap = "3.0" env_logger = "0.9" hyper = { version = "0.14", features = ["full"] } log = { version = "0.4", features = ["std", "serde"] } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } +postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } regex = "1" serde = { version = "1.0", features = ["derive"] } serde_json = "1" tar = "0.4" tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] } -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } +tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 33d01f7556..41417aab9a 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [dependencies] tar = "0.4.33" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } +postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } serde = { version = "1.0", features = ["derive"] } serde_with = "1.12.0" toml = "0.5" diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 35eb443809..d83b02d7ae 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -10,8 +10,8 @@ bytes = "1.0.1" hyper = { version = "0.14.7", features = ["full"] } lazy_static = "1.4.0" pin-project-lite = "0.2.7" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } +postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } routerify = "3" serde = { version = "1.0", features = ["derive"] } serde_json = "1" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index eb58b90ad9..6648d8417a 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -22,10 +22,10 @@ clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } tokio-util = { version = "0.7", features = ["io"] } -postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", 
rev="2949d98df52587d562986aad155dd4e889e408b7" } +postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-stream = "0.1.8" anyhow = { version = "1.0", features = ["backtrace"] } crc32c = "0.6.0" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 81086a0cad..25aebc03e8 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -12,11 +12,11 @@ fail = "0.5.0" futures = "0.3.13" hashbrown = "0.11.2" hex = "0.4.3" -hmac = "0.10.1" +hmac = "0.12.1" hyper = "0.14" lazy_static = "1.4.0" md5 = "0.7.0" -parking_lot = "0.11.2" +parking_lot = "0.12" pin-project-lite = "0.2.7" rand = "0.8.3" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } @@ -26,11 +26,11 @@ rustls-pemfile = "0.2.1" scopeguard = "1.1.0" serde = "1" serde_json = "1" -sha2 = "0.9.8" +sha2 = "0.10.2" socket2 = "0.4.4" thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } +tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-rustls = "0.23.0" utils = { path = "../libs/utils" } diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index f007f3e0b6..44671084ee 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -18,7 +18,7 @@ pub use secret::*; pub use exchange::Exchange; pub use secret::ServerSecret; -use hmac::{Hmac, Mac, NewMac}; +use hmac::{Hmac, Mac}; use sha2::{Digest, Sha256}; // TODO: add SCRAM-SHA-256-PLUS @@ -40,7 +40,7 @@ fn base64_decode_array(input: impl AsRef<[u8]>) -> Option<[u8; N /// This function essentially is `Hmac(sha256, key, input)`. /// Further reading: . fn hmac_sha256<'a>(key: &[u8], parts: impl IntoIterator) -> [u8; 32] { - let mut mac = Hmac::::new_varkey(key).expect("bad key size"); + let mut mac = Hmac::::new_from_slice(key).expect("bad key size"); parts.into_iter().for_each(|s| mac.update(s)); // TODO: maybe newer `hmac` et al already migrated to regular arrays? 
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 76d40cdc2e..8a31311b8f 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -15,8 +15,8 @@ tracing = "0.1.27" clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["macros", "fs"] } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } +postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } anyhow = "1.0" crc32c = "0.6.0" humantime = "2.1.0" @@ -27,7 +27,7 @@ serde = { version = "1.0", features = ["derive"] } serde_with = {version = "1.12.0"} hex = "0.4.3" const_format = "0.2.21" -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } +tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } etcd-client = "0.8.3" tokio-util = { version = "0.7", features = ["io"] } rusoto_core = "0.47" diff --git a/zenith/Cargo.toml b/zenith/Cargo.toml index 9692e97331..0f72051f74 100644 --- a/zenith/Cargo.toml +++ b/zenith/Cargo.toml @@ -7,7 +7,7 @@ edition = "2021" clap = "3.0" anyhow = "1.0" serde_json = "1" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } +postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } # FIXME: 'pageserver' is needed for BranchInfo. 
Refactor pageserver = { path = "../pageserver" } From 867aede71516756ff0ec1dba540fe7fc23bb7113 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Fri, 22 Apr 2022 10:45:47 -0400 Subject: [PATCH 0184/1022] Add idle compute restart time test (#1514) --- test_runner/performance/test_startup.py | 48 +++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 test_runner/performance/test_startup.py diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py new file mode 100644 index 0000000000..e30912ce32 --- /dev/null +++ b/test_runner/performance/test_startup.py @@ -0,0 +1,48 @@ +from contextlib import closing + +from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.benchmark_fixture import ZenithBenchmarker + + +def test_startup(zenith_env_builder: ZenithEnvBuilder, zenbenchmark: ZenithBenchmarker): + zenith_env_builder.num_safekeepers = 3 + env = zenith_env_builder.init_start() + + # Start + env.zenith_cli.create_branch('test_startup') + with zenbenchmark.record_duration("startup_time"): + pg = env.postgres.create_start('test_startup') + pg.safe_psql("select 1;") + + # Restart + pg.stop_and_destroy() + with zenbenchmark.record_duration("restart_time"): + pg.create_start('test_startup') + pg.safe_psql("select 1;") + + # Fill up + num_rows = 1000000 # 30 MB + num_tables = 100 + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + for i in range(num_tables): + cur.execute(f'create table t_{i} (i integer);') + cur.execute(f'insert into t_{i} values (generate_series(1,{num_rows}));') + + # Read + with zenbenchmark.record_duration("read_time"): + pg.safe_psql("select * from t_0;") + + # Read again + with zenbenchmark.record_duration("second_read_time"): + pg.safe_psql("select * from t_0;") + + # Restart + pg.stop_and_destroy() + with zenbenchmark.record_duration("restart_with_data"): + pg.create_start('test_startup') + pg.safe_psql("select 1;") + + # Read + with zenbenchmark.record_duration("read_after_restart"): + pg.safe_psql("select * from t_0;") From 1fb3d081854a31f9afd1f4e5161fa4cbf9738299 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 22 Apr 2022 21:31:27 +0300 Subject: [PATCH 0185/1022] Use a 1-byte length header for short blobs. Notably, this shaves 3 bytes from each small WAL record stored in ephemeral or delta layers. --- pageserver/src/layered_repository/blob_io.rs | 72 ++++++++++++++----- .../src/layered_repository/ephemeral_file.rs | 42 +++++++---- 2 files changed, 83 insertions(+), 31 deletions(-) diff --git a/pageserver/src/layered_repository/blob_io.rs b/pageserver/src/layered_repository/blob_io.rs index aa90bbd0cf..3aeeb2b2c8 100644 --- a/pageserver/src/layered_repository/blob_io.rs +++ b/pageserver/src/layered_repository/blob_io.rs @@ -1,12 +1,20 @@ //! //! Functions for reading and writing variable-sized "blobs". //! -//! Each blob begins with a 4-byte length, followed by the actual data. +//! Each blob begins with a 1- or 4-byte length field, followed by the +//! actual data. If the length is smaller than 128 bytes, the length +//! is written as a one byte. If it's larger than that, the length +//! is written as a four-byte integer, in big-endian, with the high +//! bit set. This way, we can detect whether it's 1- or 4-byte header +//! by peeking at the first byte. +//! +//! len < 128: 0XXXXXXX +//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! 
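// Editor's aside — not part of the patch: a minimal standalone sketch of the
// length-header scheme described in the doc comment above. Lengths below 0x80
// are written as a single byte; longer blobs use a big-endian u32 with the high
// bit set as a marker, which the reader strips off again. Names here are
// illustrative only, not code from this file.
fn encode_blob_len(len: usize) -> Vec<u8> {
    if len < 0x80 {
        // 1-byte header: 0XXXXXXX
        vec![len as u8]
    } else {
        // 4-byte header: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
        assert!(len <= 0x7fff_ffff, "blob too large for a 31-bit length");
        let mut header = (len as u32).to_be_bytes();
        header[0] |= 0x80;
        header.to_vec()
    }
}

fn decode_blob_len(header: &[u8]) -> (usize, usize) {
    // Returns (blob length, header bytes consumed), mirroring the read path
    // later in this file.
    if header[0] < 0x80 {
        (header[0] as usize, 1)
    } else {
        let mut len_buf = [0u8; 4];
        len_buf.copy_from_slice(&header[..4]);
        len_buf[0] &= 0x7f;
        (u32::from_be_bytes(len_buf) as usize, 4)
    }
}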
use crate::layered_repository::block_io::{BlockCursor, BlockReader}; use crate::page_cache::PAGE_SZ; use std::cmp::min; -use std::io::Error; +use std::io::{Error, ErrorKind}; /// For reading pub trait BlobCursor { @@ -40,21 +48,30 @@ where let mut buf = self.read_blk(blknum)?; - // read length - let mut len_buf = [0u8; 4]; - let thislen = PAGE_SZ - off; - if thislen < 4 { - // it is split across two pages - len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]); - blknum += 1; - buf = self.read_blk(blknum)?; - len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]); - off = 4 - thislen; + // peek at the first byte, to determine if it's a 1- or 4-byte length + let first_len_byte = buf[off]; + let len: usize = if first_len_byte < 0x80 { + // 1-byte length header + off += 1; + first_len_byte as usize } else { - len_buf.copy_from_slice(&buf[off..off + 4]); - off += 4; - } - let len = u32::from_ne_bytes(len_buf) as usize; + // 4-byte length header + let mut len_buf = [0u8; 4]; + let thislen = PAGE_SZ - off; + if thislen < 4 { + // it is split across two pages + len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]); + blknum += 1; + buf = self.read_blk(blknum)?; + len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]); + off = 4 - thislen; + } else { + len_buf.copy_from_slice(&buf[off..off + 4]); + off += 4; + } + len_buf[0] &= 0x7f; + u32::from_be_bytes(len_buf) as usize + }; dstbuf.clear(); @@ -130,10 +147,27 @@ where { fn write_blob(&mut self, srcbuf: &[u8]) -> Result { let offset = self.offset; - self.inner - .write_all(&((srcbuf.len()) as u32).to_ne_bytes())?; + + if srcbuf.len() < 128 { + // Short blob. Write a 1-byte length header + let len_buf = srcbuf.len() as u8; + self.inner.write_all(&[len_buf])?; + self.offset += 1; + } else { + // Write a 4-byte length header + if srcbuf.len() > 0x7fff_ffff { + return Err(Error::new( + ErrorKind::Other, + format!("blob too large ({} bytes)", srcbuf.len()), + )); + } + let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes(); + len_buf[0] |= 0x80; + self.inner.write_all(&len_buf)?; + self.offset += 4; + } self.inner.write_all(srcbuf)?; - self.offset += 4 + srcbuf.len() as u64; + self.offset += srcbuf.len() as u64; Ok(offset) } } diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/layered_repository/ephemeral_file.rs index 9537d3939c..cdde9d5d13 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/layered_repository/ephemeral_file.rs @@ -199,18 +199,24 @@ impl BlobWriter for EphemeralFile { let mut buf = self.get_buf_for_write(blknum)?; // Write the length field - let len_buf = u32::to_ne_bytes(srcbuf.len() as u32); - let thislen = PAGE_SZ - off; - if thislen < 4 { - // it needs to be split across pages - buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]); - blknum += 1; - buf = self.get_buf_for_write(blknum)?; - buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]); - off = 4 - thislen; + if srcbuf.len() < 0x80 { + buf[off] = srcbuf.len() as u8; + off += 1; } else { - buf[off..off + 4].copy_from_slice(&len_buf); - off += 4; + let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); + len_buf[0] |= 0x80; + let thislen = PAGE_SZ - off; + if thislen < 4 { + // it needs to be split across pages + buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]); + blknum += 1; + buf = self.get_buf_for_write(blknum)?; + buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]); + off = 4 - thislen; + } else { + buf[off..off + 4].copy_from_slice(&len_buf); + off += 4; + } } // 
Write the payload @@ -229,7 +235,13 @@ impl BlobWriter for EphemeralFile { buf_remain = &buf_remain[this_blk_len..]; } drop(buf); - self.size += 4 + srcbuf.len() as u64; + + if srcbuf.len() < 0x80 { + self.size += 1; + } else { + self.size += 4; + } + self.size += srcbuf.len() as u64; Ok(pos) } @@ -387,6 +399,12 @@ mod tests { let pos = file.write_blob(&data)?; blobs.push((pos, data)); } + // also test with a large blobs + for i in 0..100 { + let data = format!("blob{}", i).as_bytes().repeat(100); + let pos = file.write_blob(&data)?; + blobs.push((pos, data)); + } let mut cursor = BlockCursor::new(&file); for (pos, expected) in blobs { From 56f6269a8e86f110a4eb78d2019279e1a6cca2f2 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Mon, 25 Apr 2022 11:34:51 +0300 Subject: [PATCH 0186/1022] rename docker images to neondatabase docker account (#1570) * rename docker images to neondatabase docker account * docker images build fix (permisions for Cargo.lock) --- .circleci/ansible/deploy.yaml | 10 +-- .circleci/ansible/get_binaries.sh | 32 +++---- .circleci/config.yml | 94 ++++++++++----------- .circleci/helm-values/production.proxy.yaml | 3 + .circleci/helm-values/staging.proxy.yaml | 3 + Dockerfile | 4 +- Dockerfile.compute-tools | 4 +- 7 files changed, 80 insertions(+), 70 deletions(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index 508843812a..a8154ba3b0 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -1,14 +1,14 @@ -- name: Upload Zenith binaries +- name: Upload Neon binaries hosts: storage gather_facts: False remote_user: admin tasks: - - name: get latest version of Zenith binaries + - name: get latest version of Neon binaries register: current_version_file set_fact: - current_version: "{{ lookup('file', '.zenith_current_version') | trim }}" + current_version: "{{ lookup('file', '.neon_current_version') | trim }}" tags: - pageserver - safekeeper @@ -19,11 +19,11 @@ - pageserver - safekeeper - - name: upload and extract Zenith binaries to /usr/local + - name: upload and extract Neon binaries to /usr/local ansible.builtin.unarchive: owner: root group: root - src: zenith_install.tar.gz + src: neon_install.tar.gz dest: /usr/local become: true tags: diff --git a/.circleci/ansible/get_binaries.sh b/.circleci/ansible/get_binaries.sh index 242a9e87e2..a4b4372d9f 100755 --- a/.circleci/ansible/get_binaries.sh +++ b/.circleci/ansible/get_binaries.sh @@ -4,10 +4,10 @@ set -e RELEASE=${RELEASE:-false} -# look at docker hub for latest tag fo zenith docker image +# look at docker hub for latest tag for neon docker image if [ "${RELEASE}" = "true" ]; then echo "search latest relase tag" - VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1) + VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1) if [ -z "${VERSION}" ]; then echo "no any docker tags found, exiting..." exit 1 @@ -16,7 +16,7 @@ if [ "${RELEASE}" = "true" ]; then fi else echo "search latest dev tag" - VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep -v release | tail -1) + VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -v release | tail -1) if [ -z "${VERSION}" ]; then echo "no any docker tags found, exiting..." 
exit 1 @@ -28,25 +28,25 @@ fi echo "found ${VERSION}" # do initial cleanup -rm -rf zenith_install postgres_install.tar.gz zenith_install.tar.gz .zenith_current_version -mkdir zenith_install +rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version +mkdir neon_install # retrive binaries from docker image echo "getting binaries from docker image" -docker pull --quiet zenithdb/zenith:${TAG} -ID=$(docker create zenithdb/zenith:${TAG}) +docker pull --quiet neondatabase/neon:${TAG} +ID=$(docker create neondatabase/neon:${TAG}) docker cp ${ID}:/data/postgres_install.tar.gz . -tar -xzf postgres_install.tar.gz -C zenith_install -docker cp ${ID}:/usr/local/bin/pageserver zenith_install/bin/ -docker cp ${ID}:/usr/local/bin/safekeeper zenith_install/bin/ -docker cp ${ID}:/usr/local/bin/proxy zenith_install/bin/ -docker cp ${ID}:/usr/local/bin/postgres zenith_install/bin/ +tar -xzf postgres_install.tar.gz -C neon_install +docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ +docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ +docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ +docker cp ${ID}:/usr/local/bin/postgres neon_install/bin/ docker rm -vf ${ID} # store version to file (for ansible playbooks) and create binaries tarball -echo ${VERSION} > zenith_install/.zenith_current_version -echo ${VERSION} > .zenith_current_version -tar -czf zenith_install.tar.gz -C zenith_install . +echo ${VERSION} > neon_install/.neon_current_version +echo ${VERSION} > .neon_current_version +tar -czf neon_install.tar.gz -C neon_install . # do final cleaup -rm -rf zenith_install postgres_install.tar.gz +rm -rf neon_install postgres_install.tar.gz diff --git a/.circleci/config.yml b/.circleci/config.yml index 643c853854..471d64a82f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,18 +1,18 @@ version: 2.1 executors: - zenith-xlarge-executor: + neon-xlarge-executor: resource_class: xlarge docker: # NB: when changed, do not forget to update rust image tag in all Dockerfiles - image: zimg/rust:1.58 - zenith-executor: + neon-executor: docker: - image: zimg/rust:1.58 jobs: check-codestyle-rust: - executor: zenith-xlarge-executor + executor: neon-xlarge-executor steps: - checkout - run: @@ -22,7 +22,7 @@ jobs: # A job to build postgres build-postgres: - executor: zenith-xlarge-executor + executor: neon-xlarge-executor parameters: build_type: type: enum @@ -67,9 +67,9 @@ jobs: paths: - tmp_install - # A job to build zenith rust code - build-zenith: - executor: zenith-xlarge-executor + # A job to build Neon rust code + build-neon: + executor: neon-xlarge-executor parameters: build_type: type: enum @@ -223,7 +223,7 @@ jobs: - "*" check-codestyle-python: - executor: zenith-executor + executor: neon-executor steps: - checkout - restore_cache: @@ -246,7 +246,7 @@ jobs: command: poetry run mypy . run-pytest: - executor: zenith-executor + executor: neon-executor parameters: # pytest args to specify the tests to run. 
# @@ -390,7 +390,7 @@ jobs: - "*" coverage-report: - executor: zenith-xlarge-executor + executor: neon-xlarge-executor steps: - attach_workspace: at: /tmp/zenith @@ -420,7 +420,7 @@ jobs: COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1 scripts/git-upload \ - --repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-coverage-data.git \ + --repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/neondatabase/zenith-coverage-data.git \ --message="Add code coverage for $COMMIT_URL" \ copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE @@ -437,7 +437,7 @@ jobs: \"target_url\": \"$REPORT_URL\" }" - # Build zenithdb/zenith:latest image and push it to Docker hub + # Build neondatabase/neon:latest image and push it to Docker hub docker-image: docker: - image: cimg/base:2021.04 @@ -451,18 +451,18 @@ jobs: - run: name: Build and push Docker image command: | - echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin + echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin DOCKER_TAG=$(git log --oneline|wc -l) docker build \ --pull \ --build-arg GIT_VERSION=${CIRCLE_SHA1} \ --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:latest . - docker push zenithdb/zenith:${DOCKER_TAG} - docker push zenithdb/zenith:latest + --tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:latest . + docker push neondatabase/neon:${DOCKER_TAG} + docker push neondatabase/neon:latest - # Build zenithdb/compute-node:latest image and push it to Docker hub + # Build neondatabase/compute-node:latest image and push it to Docker hub docker-image-compute: docker: - image: cimg/base:2021.04 @@ -470,31 +470,31 @@ jobs: - checkout - setup_remote_docker: docker_layer_caching: true - # Build zenithdb/compute-tools:latest image and push it to Docker hub + # Build neondatabase/compute-tools:latest image and push it to Docker hub # TODO: this should probably also use versioned tag, not just :latest. # XXX: but should it? We build and use it only locally now. - run: name: Build and push compute-tools Docker image command: | - echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin + echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin docker build \ --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag zenithdb/compute-tools:latest -f Dockerfile.compute-tools . - docker push zenithdb/compute-tools:latest + --tag neondatabase/compute-tools:latest -f Dockerfile.compute-tools . 
+ docker push neondatabase/compute-tools:latest - run: name: Init postgres submodule command: git submodule update --init --depth 1 - run: name: Build and push compute-node Docker image command: | - echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin + echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin DOCKER_TAG=$(git log --oneline|wc -l) - docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:latest vendor/postgres - docker push zenithdb/compute-node:${DOCKER_TAG} - docker push zenithdb/compute-node:latest + docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:latest vendor/postgres + docker push neondatabase/compute-node:${DOCKER_TAG} + docker push neondatabase/compute-node:latest - # Build production zenithdb/zenith:release image and push it to Docker hub + # Build production neondatabase/neon:release image and push it to Docker hub docker-image-release: docker: - image: cimg/base:2021.04 @@ -508,18 +508,18 @@ jobs: - run: name: Build and push Docker image command: | - echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin + echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin DOCKER_TAG="release-$(git log --oneline|wc -l)" docker build \ --pull \ --build-arg GIT_VERSION=${CIRCLE_SHA1} \ --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:release . - docker push zenithdb/zenith:${DOCKER_TAG} - docker push zenithdb/zenith:release + --tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:release . + docker push neondatabase/neon:${DOCKER_TAG} + docker push neondatabase/neon:release - # Build production zenithdb/compute-node:release image and push it to Docker hub + # Build production neondatabase/compute-node:release image and push it to Docker hub docker-image-compute-release: docker: - image: cimg/base:2021.04 @@ -527,29 +527,29 @@ jobs: - checkout - setup_remote_docker: docker_layer_caching: true - # Build zenithdb/compute-tools:release image and push it to Docker hub + # Build neondatabase/compute-tools:release image and push it to Docker hub # TODO: this should probably also use versioned tag, not just :latest. # XXX: but should it? We build and use it only locally now. - run: name: Build and push compute-tools Docker image command: | - echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin + echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin docker build \ --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag zenithdb/compute-tools:release -f Dockerfile.compute-tools . - docker push zenithdb/compute-tools:release + --tag neondatabase/compute-tools:release -f Dockerfile.compute-tools . 
+ docker push neondatabase/compute-tools:release - run: name: Init postgres submodule command: git submodule update --init --depth 1 - run: name: Build and push compute-node Docker image command: | - echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin + echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:release vendor/postgres - docker push zenithdb/compute-node:${DOCKER_TAG} - docker push zenithdb/compute-node:release + docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:release vendor/postgres + docker push neondatabase/compute-node:${DOCKER_TAG} + docker push neondatabase/compute-node:release deploy-staging: docker: @@ -575,7 +575,7 @@ jobs: rm -f ssh-key ssh-key-cert.pub ansible-playbook deploy.yaml -i staging.hosts - rm -f zenith_install.tar.gz .zenith_current_version + rm -f neon_install.tar.gz .neon_current_version deploy-staging-proxy: docker: @@ -625,7 +625,7 @@ jobs: rm -f ssh-key ssh-key-cert.pub ansible-playbook deploy.yaml -i production.hosts - rm -f zenith_install.tar.gz .zenith_current_version + rm -f neon_install.tar.gz .neon_current_version deploy-release-proxy: docker: @@ -704,8 +704,8 @@ workflows: matrix: parameters: build_type: ["debug", "release"] - - build-zenith: - name: build-zenith-<< matrix.build_type >> + - build-neon: + name: build-neon-<< matrix.build_type >> matrix: parameters: build_type: ["debug", "release"] @@ -720,7 +720,7 @@ workflows: test_selection: batch_pg_regress needs_postgres_source: true requires: - - build-zenith-<< matrix.build_type >> + - build-neon-<< matrix.build_type >> - run-pytest: name: other-tests-<< matrix.build_type >> matrix: @@ -728,7 +728,7 @@ workflows: build_type: ["debug", "release"] test_selection: batch_others requires: - - build-zenith-<< matrix.build_type >> + - build-neon-<< matrix.build_type >> - run-pytest: name: benchmarks context: PERF_TEST_RESULT_CONNSTR @@ -737,7 +737,7 @@ workflows: run_in_parallel: false save_perf_report: true requires: - - build-zenith-release + - build-neon-release - coverage-report: # Context passes credentials for gh api context: CI_ACCESS_TOKEN @@ -833,6 +833,6 @@ workflows: # XXX: Successful build doesn't mean everything is OK, but # the job to be triggered takes so much time to complete (~22 min) # that it's better not to wait for the commented-out steps - - build-zenith-release + - build-neon-release # - pg_regress-tests-release # - other-tests-release diff --git a/.circleci/helm-values/production.proxy.yaml b/.circleci/helm-values/production.proxy.yaml index 27aa169c79..f2148c1d2c 100644 --- a/.circleci/helm-values/production.proxy.yaml +++ b/.circleci/helm-values/production.proxy.yaml @@ -1,6 +1,9 @@ # Helm chart values for zenith-proxy. # This is a YAML-formatted file. +image: + repository: neondatabase/neon + settings: authEndpoint: "https://console.zenith.tech/authenticate_proxy_request/" uri: "https://console.zenith.tech/psql_session/" diff --git a/.circleci/helm-values/staging.proxy.yaml b/.circleci/helm-values/staging.proxy.yaml index bdce4d80da..f4d9855476 100644 --- a/.circleci/helm-values/staging.proxy.yaml +++ b/.circleci/helm-values/staging.proxy.yaml @@ -1,6 +1,9 @@ # Helm chart values for zenith-proxy. # This is a YAML-formatted file. 
+image: + repository: neondatabase/neon + settings: authEndpoint: "https://console.stage.zenith.tech/authenticate_proxy_request/" uri: "https://console.stage.zenith.tech/psql_session/" diff --git a/Dockerfile b/Dockerfile index ebc8731168..a7afd1f335 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,7 +26,9 @@ COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, loosing the compilation stats. -RUN mold -run cargo build --release && cachepot -s +RUN set -e \ + && sudo -E "PATH=$PATH" mold -run cargo build --release \ + && cachepot -s # Build final image # diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 3fc8702f3f..bbe0f517ce 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -8,7 +8,9 @@ ARG AWS_SECRET_ACCESS_KEY COPY . . -RUN mold -run cargo build -p compute_tools --release && cachepot -s +RUN set -e \ + && sudo -E "PATH=$PATH" mold -run cargo build -p compute_tools --release \ + && cachepot -s # Final image that only has one binary FROM debian:buster-slim From 8f6a16127117d63b96c25cbf8b105ebc75a8e9c0 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 22 Apr 2022 17:07:09 +0300 Subject: [PATCH 0187/1022] Show better layer load errors --- pageserver/src/layered_repository/delta_layer.rs | 9 +++++++-- pageserver/src/layered_repository/image_layer.rs | 4 +++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index c5530a5789..ef4c3cccb0 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -290,7 +290,10 @@ impl Layer for DeltaLayer { } fn iter<'a>(&'a self) -> Box> + 'a> { - let inner = self.load().unwrap(); + let inner = match self.load() { + Ok(inner) => inner, + Err(e) => panic!("Failed to load a delta layer: {e:?}"), + }; match DeltaValueIter::new(inner) { Ok(iter) => Box::new(iter), @@ -422,7 +425,9 @@ impl DeltaLayer { drop(inner); let inner = self.inner.write().unwrap(); if !inner.loaded { - self.load_inner(inner)?; + self.load_inner(inner).with_context(|| { + format!("Failed to load delta layer {}", self.path().display()) + })?; } else { // Another thread loaded it while we were not holding the lock. } diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 0e38d46e7a..d7657ecac6 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -254,7 +254,9 @@ impl ImageLayer { drop(inner); let mut inner = self.inner.write().unwrap(); if !inner.loaded { - self.load_inner(&mut inner)?; + self.load_inner(&mut inner).with_context(|| { + format!("Failed to load image layer {}", self.path().display()) + })? } else { // Another thread loaded it while we were not holding the lock. 
} From 78a6cb247f1c37287bf88687c3309b5be99ee720 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 7 Apr 2022 20:37:42 +0300 Subject: [PATCH 0188/1022] allow the users to create extensions: GRANT CREATE ON DATABASE --- compute_tools/src/bin/zenith_ctl.rs | 1 + compute_tools/src/spec.rs | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/compute_tools/src/bin/zenith_ctl.rs b/compute_tools/src/bin/zenith_ctl.rs index a5dfb1c875..3685f8e8b4 100644 --- a/compute_tools/src/bin/zenith_ctl.rs +++ b/compute_tools/src/bin/zenith_ctl.rs @@ -129,6 +129,7 @@ fn run_compute(state: &Arc>) -> Result { handle_roles(&read_state.spec, &mut client)?; handle_databases(&read_state.spec, &mut client)?; + handle_grants(&read_state.spec, &mut client)?; create_writablity_check_data(&mut client)?; // 'Close' connection diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 1dd7c0044e..27114b8202 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -244,3 +244,24 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> { Ok(()) } + +// Grant CREATE ON DATABASE to the database owner +// to allow clients create trusted extensions. +pub fn handle_grants(spec: &ClusterSpec, client: &mut Client) -> Result<()> { + info!("cluster spec grants:"); + + for db in &spec.cluster.databases { + let dbname = &db.name; + + let query: String = format!( + "GRANT CREATE ON DATABASE {} TO {}", + dbname.quote(), + db.owner.quote() + ); + info!("grant query {}", &query); + + client.execute(query.as_str(), &[])?; + } + + Ok(()) +} From d060a97c548dc2a395be0772f67ee306b3df14a5 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 22 Apr 2022 21:32:54 +0300 Subject: [PATCH 0189/1022] Simplify clippy runs --- .circleci/config.yml | 14 -------------- .github/workflows/testing.yml | 17 ++++++----------- run_clippy.sh | 2 +- 3 files changed, 7 insertions(+), 26 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 471d64a82f..3397bcc7b7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -132,20 +132,6 @@ jobs: - ~/.cargo/git - target - # Run style checks - # has to run separately from cargo fmt section - # since needs to run with dependencies - - run: - name: cargo clippy - command: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - fi - - "${cov_prefix[@]}" ./run_clippy.sh - # Run rust unit tests - run: name: cargo test diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 83e46ce6be..6d109b9bb5 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -36,8 +36,7 @@ jobs: - name: Install macOs postgres dependencies if: matrix.os == 'macos-latest' - run: | - brew install flex bison + run: brew install flex bison - name: Set pg revision for caching id: pg_ver @@ -53,8 +52,7 @@ jobs: - name: Build postgres if: steps.cache_pg.outputs.cache-hit != 'true' - run: | - make postgres + run: make postgres - name: Cache cargo deps id: cache_cargo @@ -64,13 +62,10 @@ jobs: ~/.cargo/registry ~/.cargo/git target - key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }} - # Use `env CARGO_INCREMENTAL=0` to mitigate https://github.com/rust-lang/rust/issues/91696 for rustc 1.57.0 - - name: Run cargo build - run: | - env CARGO_INCREMENTAL=0 cargo build --workspace 
--bins --examples --tests + - name: Run cargo clippy + run: ./run_clippy.sh - name: Run cargo test - run: | - env CARGO_INCREMENTAL=0 cargo test -- --nocapture --test-threads=1 + run: cargo test --all --all-targets diff --git a/run_clippy.sh b/run_clippy.sh index 4ca944c1f1..f26dbaa0f3 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -12,4 +12,4 @@ # * `-A unknown_lints` – do not warn about unknown lint suppressions # that people with newer toolchains might use # * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) -cargo clippy "${@:2}" --all-targets --all-features --all --tests -- -A unknown_lints -D warnings +cargo clippy --all --all-targets --all-features -- -A unknown_lints -D warnings From fec050ce97239a8c63680c70572e043513880acb Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 22 Apr 2022 22:12:25 +0300 Subject: [PATCH 0190/1022] Fix macos clippy issues --- pageserver/src/http/routes.rs | 2 +- pageserver/src/profiling.rs | 16 +++++++++++----- run_clippy.sh | 15 +++++++++++---- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2db56015ad..05485ef3b6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -453,7 +453,7 @@ async fn tenant_config_handler(mut request: Request) -> Result) -> Result, ApiError> { diff --git a/pageserver/src/profiling.rs b/pageserver/src/profiling.rs index e2c12c9e12..84132659d6 100644 --- a/pageserver/src/profiling.rs +++ b/pageserver/src/profiling.rs @@ -74,22 +74,28 @@ mod profiling_impl { } } -/// Dummy implementation when compiling without profiling feature +/// Dummy implementation when compiling without profiling feature or for non-linux OSes. #[cfg(not(feature = "profiling"))] mod profiling_impl { use super::*; - pub fn profpoint_start(_conf: &PageServerConf, _point: ProfilingConfig) -> () { - () + pub struct DummyProfilerGuard; + + pub fn profpoint_start( + _conf: &PageServerConf, + _point: ProfilingConfig, + ) -> Option { + None } - pub fn init_profiler(conf: &PageServerConf) -> () { + pub fn init_profiler(conf: &PageServerConf) -> Option { if conf.profiling != ProfilingConfig::Disabled { // shouldn't happen, we don't allow profiling in the config if the support // for it is disabled. panic!("profiling enabled but the binary was compiled without profiling support"); } + None } - pub fn exit_profiler(_conf: &PageServerConf, _guard: &()) {} + pub fn exit_profiler(_conf: &PageServerConf, _guard: &Option) {} } diff --git a/run_clippy.sh b/run_clippy.sh index f26dbaa0f3..13af3fd2c5 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -9,7 +9,14 @@ # In vscode, this setting is Rust-analyzer>Check On Save:Command -# * `-A unknown_lints` – do not warn about unknown lint suppressions -# that people with newer toolchains might use -# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) -cargo clippy --all --all-targets --all-features -- -A unknown_lints -D warnings +# Not every feature is supported in macOS builds, e.g. `profiling`, +# avoid running regular linting script that checks every feature. 
+if [[ "$OSTYPE" == "darwin"* ]]; then + # no extra features to test currently, add more here when needed + cargo clippy --all --all-targets -- -A unknown_lints -D warnings +else + # * `-A unknown_lints` – do not warn about unknown lint suppressions + # that people with newer toolchains might use + # * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) + cargo clippy --all --all-targets --all-features -- -A unknown_lints -D warnings +fi From eabf6f89e46533a87cbf2fa7d5206fcff6458e63 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 25 Apr 2022 23:41:11 +0300 Subject: [PATCH 0191/1022] Use item.get for tenant config toml parsing Previously we've used table interface, but there was no easy way to pass it as an override to pageserver through cli. Use the same strategy as for remote storage config parsing --- pageserver/src/config.rs | 56 +++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b2c4a62796..df4d9910ee 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -466,30 +466,40 @@ impl PageServerConf { pub fn parse_toml_tenant_conf(item: &toml_edit::Item) -> Result { let mut t_conf: TenantConfOpt = Default::default(); - for (key, item) in item - .as_table() - .ok_or(anyhow::anyhow!("invalid tenant config"))? - .iter() - { - match key { - "checkpoint_distance" => { - t_conf.checkpoint_distance = Some(parse_toml_u64(key, item)?) - } - "compaction_target_size" => { - t_conf.compaction_target_size = Some(parse_toml_u64(key, item)?) - } - "compaction_period" => { - t_conf.compaction_period = Some(parse_toml_duration(key, item)?) - } - "compaction_threshold" => { - t_conf.compaction_threshold = Some(parse_toml_u64(key, item)? 
as usize) - } - "gc_horizon" => t_conf.gc_horizon = Some(parse_toml_u64(key, item)?), - "gc_period" => t_conf.gc_period = Some(parse_toml_duration(key, item)?), - "pitr_interval" => t_conf.pitr_interval = Some(parse_toml_duration(key, item)?), - _ => bail!("unrecognized tenant config option '{}'", key), - } + if let Some(checkpoint_distance) = item.get("checkpoint_distance") { + t_conf.checkpoint_distance = + Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?); } + + if let Some(compaction_target_size) = item.get("compaction_target_size") { + t_conf.compaction_target_size = Some(parse_toml_u64( + "compaction_target_size", + compaction_target_size, + )?); + } + + if let Some(compaction_period) = item.get("compaction_period") { + t_conf.compaction_period = + Some(parse_toml_duration("compaction_period", compaction_period)?); + } + + if let Some(compaction_threshold) = item.get("compaction_threshold") { + t_conf.compaction_threshold = + Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?); + } + + if let Some(gc_horizon) = item.get("gc_horizon") { + t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?); + } + + if let Some(gc_period) = item.get("gc_period") { + t_conf.gc_period = Some(parse_toml_duration("gc_period", gc_period)?); + } + + if let Some(pitr_interval) = item.get("pitr_interval") { + t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?); + } + Ok(t_conf) } From 778744d35ca4ff57237a6ef5b4323084797de9bd Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 25 Apr 2022 16:29:23 +0300 Subject: [PATCH 0192/1022] Limit concurrent S3 and IAM interactions --- Cargo.lock | 2 +- docs/settings.md | 7 +- pageserver/src/config.rs | 215 +++++++++--------- pageserver/src/remote_storage.rs | 4 +- pageserver/src/remote_storage/s3_bucket.rs | 33 ++- pageserver/src/remote_storage/storage_sync.rs | 84 +++---- 6 files changed, 195 insertions(+), 150 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 978cd20d12..3797e4e76b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1899,7 +1899,7 @@ dependencies = [ "libc", "log", "nix", - "parking_lot", + "parking_lot 0.11.2", "symbolic-demangle", "tempfile", "thiserror", diff --git a/docs/settings.md b/docs/settings.md index 69aadc602f..530876a42a 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -156,6 +156,9 @@ access_key_id = 'SOMEKEYAAAAASADSAH*#' # Secret access key to connect to the bucket ("password" part of the credentials) secret_access_key = 'SOMEsEcReTsd292v' + +# S3 API query limit to avoid getting errors/throttling from AWS. +concurrency_limit = 100 ``` ###### General remote storage configuration @@ -167,8 +170,8 @@ Besides, there are parameters common for all types of remote storage that can be ```toml [remote_storage] -# Max number of concurrent connections to open for uploading to or downloading from the remote storage. -max_concurrent_sync = 100 +# Max number of concurrent timeline synchronized (layers uploaded or downloaded) with the remote storage at the same time. +max_concurrent_timelines_sync = 50 # Max number of errors a single task can have before it's considered failed and not attempted to run anymore. max_sync_errors = 10 diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index df4d9910ee..8bfe8b57ec 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -4,8 +4,7 @@ //! file, or on the command line. //! See also `settings.md` for better description on every parameter. 
-use anyhow::{bail, ensure, Context, Result}; -use std::convert::TryInto; +use anyhow::{anyhow, bail, ensure, Context, Result}; use std::env; use std::num::{NonZeroU32, NonZeroUsize}; use std::path::{Path, PathBuf}; @@ -34,8 +33,18 @@ pub mod defaults { pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "zenith_admin"; - pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 10; + /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage. + /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency + /// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach. + /// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed. + pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC: usize = 50; pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; + /// Currently, sync happens with AWS S3, that has two limits on requests per second: + /// ~200 RPS for IAM services + /// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html + /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests + /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ + pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; @@ -127,7 +136,7 @@ impl FromStr for ProfilingConfig { let result = match s { "disabled" => ProfilingConfig::Disabled, "page_requests" => ProfilingConfig::PageRequests, - _ => bail!("invalid value \"{}\" for profiling option, valid values are \"disabled\" and \"page_requests\"", s), + _ => bail!("invalid value \"{s}\" for profiling option, valid values are \"disabled\" and \"page_requests\""), }; Ok(result) } @@ -269,36 +278,36 @@ impl PageServerConfigBuilder { Ok(PageServerConf { listen_pg_addr: self .listen_pg_addr - .ok_or(anyhow::anyhow!("missing listen_pg_addr"))?, + .ok_or(anyhow!("missing listen_pg_addr"))?, listen_http_addr: self .listen_http_addr - .ok_or(anyhow::anyhow!("missing listen_http_addr"))?, + .ok_or(anyhow!("missing listen_http_addr"))?, wait_lsn_timeout: self .wait_lsn_timeout - .ok_or(anyhow::anyhow!("missing wait_lsn_timeout"))?, + .ok_or(anyhow!("missing wait_lsn_timeout"))?, wal_redo_timeout: self .wal_redo_timeout - .ok_or(anyhow::anyhow!("missing wal_redo_timeout"))?, - superuser: self.superuser.ok_or(anyhow::anyhow!("missing superuser"))?, + .ok_or(anyhow!("missing wal_redo_timeout"))?, + superuser: self.superuser.ok_or(anyhow!("missing superuser"))?, page_cache_size: self .page_cache_size - .ok_or(anyhow::anyhow!("missing page_cache_size"))?, + .ok_or(anyhow!("missing page_cache_size"))?, max_file_descriptors: self .max_file_descriptors - .ok_or(anyhow::anyhow!("missing max_file_descriptors"))?, - workdir: self.workdir.ok_or(anyhow::anyhow!("missing workdir"))?, + .ok_or(anyhow!("missing max_file_descriptors"))?, + workdir: self.workdir.ok_or(anyhow!("missing workdir"))?, pg_distrib_dir: self .pg_distrib_dir - .ok_or(anyhow::anyhow!("missing pg_distrib_dir"))?, - auth_type: self.auth_type.ok_or(anyhow::anyhow!("missing auth_type"))?, + .ok_or(anyhow!("missing pg_distrib_dir"))?, + auth_type: self.auth_type.ok_or(anyhow!("missing auth_type"))?, auth_validation_public_key_path: 
self .auth_validation_public_key_path - .ok_or(anyhow::anyhow!("missing auth_validation_public_key_path"))?, + .ok_or(anyhow!("missing auth_validation_public_key_path"))?, remote_storage_config: self .remote_storage_config - .ok_or(anyhow::anyhow!("missing remote_storage_config"))?, - id: self.id.ok_or(anyhow::anyhow!("missing id"))?, - profiling: self.profiling.ok_or(anyhow::anyhow!("missing profiling"))?, + .ok_or(anyhow!("missing remote_storage_config"))?, + id: self.id.ok_or(anyhow!("missing id"))?, + profiling: self.profiling.ok_or(anyhow!("missing profiling"))?, // TenantConf is handled separately default_tenant_conf: TenantConf::default(), }) @@ -309,7 +318,7 @@ impl PageServerConfigBuilder { #[derive(Debug, Clone, PartialEq, Eq)] pub struct RemoteStorageConfig { /// Max allowed number of concurrent sync operations between pageserver and the remote storage. - pub max_concurrent_sync: NonZeroUsize, + pub max_concurrent_timelines_sync: NonZeroUsize, /// Max allowed errors before the sync task is considered failed and evicted. pub max_sync_errors: NonZeroU32, /// The storage connection configuration. @@ -350,6 +359,9 @@ pub struct S3Config { /// /// Example: `http://127.0.0.1:5000` pub endpoint: Option, + /// AWS S3 has various limits on its API calls, we need not to exceed those. + /// See [`defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. + pub concurrency_limit: NonZeroUsize, } impl std::fmt::Debug for S3Config { @@ -358,6 +370,7 @@ impl std::fmt::Debug for S3Config { .field("bucket_name", &self.bucket_name) .field("bucket_region", &self.bucket_region) .field("prefix_in_bucket", &self.prefix_in_bucket) + .field("concurrency_limit", &self.concurrency_limit) .finish() } } @@ -431,7 +444,7 @@ impl PageServerConf { } "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)), "profiling" => builder.profiling(parse_toml_from_str(key, item)?), - _ => bail!("unrecognized pageserver option '{}'", key), + _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -509,32 +522,23 @@ impl PageServerConf { let bucket_name = toml.get("bucket_name"); let bucket_region = toml.get("bucket_region"); - let max_concurrent_sync: NonZeroUsize = if let Some(s) = toml.get("max_concurrent_sync") { - parse_toml_u64("max_concurrent_sync", s) - .and_then(|toml_u64| { - toml_u64.try_into().with_context(|| { - format!("'max_concurrent_sync' value {} is too large", toml_u64) - }) - }) - .ok() - .and_then(NonZeroUsize::new) - .context("'max_concurrent_sync' must be a non-zero positive integer")? - } else { - NonZeroUsize::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC).unwrap() - }; - let max_sync_errors: NonZeroU32 = if let Some(s) = toml.get("max_sync_errors") { - parse_toml_u64("max_sync_errors", s) - .and_then(|toml_u64| { - toml_u64.try_into().with_context(|| { - format!("'max_sync_errors' value {} is too large", toml_u64) - }) - }) - .ok() - .and_then(NonZeroU32::new) - .context("'max_sync_errors' must be a non-zero positive integer")? - } else { - NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap() - }; + let max_concurrent_timelines_sync = NonZeroUsize::new( + parse_optional_integer("max_concurrent_timelines_sync", toml)? + .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC), + ) + .context("Failed to parse 'max_concurrent_timelines_sync' as a positive integer")?; + + let max_sync_errors = NonZeroU32::new( + parse_optional_integer("max_sync_errors", toml)? 
+ .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), + ) + .context("Failed to parse 'max_sync_errors' as a positive integer")?; + + let concurrency_limit = NonZeroUsize::new( + parse_optional_integer("concurrency_limit", toml)? + .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), + ) + .context("Failed to parse 'concurrency_limit' as a positive integer")?; let storage = match (local_path, bucket_name, bucket_region) { (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"), @@ -565,6 +569,7 @@ impl PageServerConf { .get("endpoint") .map(|endpoint| parse_toml_string("endpoint", endpoint)) .transpose()?, + concurrency_limit, }), (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( parse_toml_string("local_path", local_path)?, @@ -573,7 +578,7 @@ impl PageServerConf { }; Ok(RemoteStorageConfig { - max_concurrent_sync, + max_concurrent_timelines_sync, max_sync_errors, storage, }) @@ -581,7 +586,7 @@ impl PageServerConf { #[cfg(test)] pub fn test_repo_dir(test_name: &str) -> PathBuf { - PathBuf::from(format!("../tmp_check/test_{}", test_name)) + PathBuf::from(format!("../tmp_check/test_{test_name}")) } #[cfg(test)] @@ -611,7 +616,7 @@ impl PageServerConf { fn parse_toml_string(name: &str, item: &Item) -> Result { let s = item .as_str() - .with_context(|| format!("configure option {} is not a string", name))?; + .with_context(|| format!("configure option {name} is not a string"))?; Ok(s.to_string()) } @@ -620,17 +625,34 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result { // for our use, though. let i: i64 = item .as_integer() - .with_context(|| format!("configure option {} is not an integer", name))?; + .with_context(|| format!("configure option {name} is not an integer"))?; if i < 0 { - bail!("configure option {} cannot be negative", name); + bail!("configure option {name} cannot be negative"); } Ok(i as u64) } +fn parse_optional_integer(name: &str, item: &toml_edit::Item) -> anyhow::Result> +where + I: TryFrom, + E: std::error::Error + Send + Sync + 'static, +{ + let toml_integer = match item.get(name) { + Some(item) => item + .as_integer() + .with_context(|| format!("configure option {name} is not an integer"))?, + None => return Ok(None), + }; + + I::try_from(toml_integer) + .map(Some) + .with_context(|| format!("configure option {name} is too large")) +} + fn parse_toml_duration(name: &str, item: &Item) -> Result { let s = item .as_str() - .with_context(|| format!("configure option {} is not a string", name))?; + .with_context(|| format!("configure option {name} is not a string"))?; Ok(humantime::parse_duration(s)?) 
} @@ -641,7 +663,7 @@ where { let v = item .as_str() - .with_context(|| format!("configure option {} is not a string", name))?; + .with_context(|| format!("configure option {name} is not a string"))?; T::from_str(v) } @@ -679,10 +701,8 @@ id = 10 let config_string = format!("pg_distrib_dir='{}'\nid=10", pg_distrib_dir.display()); let toml = config_string.parse()?; - let parsed_config = - PageServerConf::parse_and_validate(&toml, &workdir).unwrap_or_else(|e| { - panic!("Failed to parse config '{}', reason: {}", config_string, e) - }); + let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) + .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}")); assert_eq!( parsed_config, @@ -715,16 +735,13 @@ id = 10 let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; let config_string = format!( - "{}pg_distrib_dir='{}'", - ALL_BASE_VALUES_TOML, + "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'", pg_distrib_dir.display() ); let toml = config_string.parse()?; - let parsed_config = - PageServerConf::parse_and_validate(&toml, &workdir).unwrap_or_else(|e| { - panic!("Failed to parse config '{}', reason: {}", config_string, e) - }); + let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) + .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}")); assert_eq!( parsed_config, @@ -772,37 +789,33 @@ local_path = '{}'"#, for remote_storage_config_str in identical_toml_declarations { let config_string = format!( - r#"{} + r#"{ALL_BASE_VALUES_TOML} pg_distrib_dir='{}' -{}"#, - ALL_BASE_VALUES_TOML, +{remote_storage_config_str}"#, pg_distrib_dir.display(), - remote_storage_config_str, ); let toml = config_string.parse()?; let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{}', reason: {}", config_string, e) - }) + .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}")) .remote_storage_config .expect("Should have remote storage config for the local FS"); assert_eq!( - parsed_remote_storage_config, - RemoteStorageConfig { - max_concurrent_sync: NonZeroUsize::new( - defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC - ) - .unwrap(), - max_sync_errors: NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS) + parsed_remote_storage_config, + RemoteStorageConfig { + max_concurrent_timelines_sync: NonZeroUsize::new( + defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC + ) .unwrap(), - storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), - }, - "Remote storage config should correctly parse the local FS config and fill other storage defaults" - ); + max_sync_errors: NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS) + .unwrap(), + storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), + }, + "Remote storage config should correctly parse the local FS config and fill other storage defaults" + ); } Ok(()) } @@ -818,52 +831,49 @@ pg_distrib_dir='{}' let access_key_id = "SOMEKEYAAAAASADSAH*#".to_string(); let secret_access_key = "SOMEsEcReTsd292v".to_string(); let endpoint = "http://localhost:5000".to_string(); - let max_concurrent_sync = NonZeroUsize::new(111).unwrap(); + let max_concurrent_timelines_sync = NonZeroUsize::new(111).unwrap(); let max_sync_errors = NonZeroU32::new(222).unwrap(); + let s3_concurrency_limit = NonZeroUsize::new(333).unwrap(); let identical_toml_declarations = &[ format!( r#"[remote_storage] -max_concurrent_sync = {} -max_sync_errors 
= {} -bucket_name = '{}' -bucket_region = '{}' -prefix_in_bucket = '{}' -access_key_id = '{}' -secret_access_key = '{}' -endpoint = '{}'"#, - max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint +max_concurrent_timelines_sync = {max_concurrent_timelines_sync} +max_sync_errors = {max_sync_errors} +bucket_name = '{bucket_name}' +bucket_region = '{bucket_region}' +prefix_in_bucket = '{prefix_in_bucket}' +access_key_id = '{access_key_id}' +secret_access_key = '{secret_access_key}' +endpoint = '{endpoint}' +concurrency_limit = {s3_concurrency_limit}"# ), format!( - "remote_storage={{max_concurrent_sync={}, max_sync_errors={}, bucket_name='{}', bucket_region='{}', prefix_in_bucket='{}', access_key_id='{}', secret_access_key='{}', endpoint='{}'}}", - max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint + "remote_storage={{max_concurrent_timelines_sync={max_concurrent_timelines_sync}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\ + bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', access_key_id='{access_key_id}', secret_access_key='{secret_access_key}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}", ), ]; for remote_storage_config_str in identical_toml_declarations { let config_string = format!( - r#"{} + r#"{ALL_BASE_VALUES_TOML} pg_distrib_dir='{}' -{}"#, - ALL_BASE_VALUES_TOML, +{remote_storage_config_str}"#, pg_distrib_dir.display(), - remote_storage_config_str, ); let toml = config_string.parse()?; let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{}', reason: {}", config_string, e) - }) + .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}")) .remote_storage_config .expect("Should have remote storage config for S3"); assert_eq!( parsed_remote_storage_config, RemoteStorageConfig { - max_concurrent_sync, + max_concurrent_timelines_sync, max_sync_errors, storage: RemoteStorageKind::AwsS3(S3Config { bucket_name: bucket_name.clone(), @@ -871,7 +881,8 @@ pg_distrib_dir='{}' access_key_id: Some(access_key_id.clone()), secret_access_key: Some(secret_access_key.clone()), prefix_in_bucket: Some(prefix_in_bucket.clone()), - endpoint: Some(endpoint.clone()) + endpoint: Some(endpoint.clone()), + concurrency_limit: s3_concurrency_limit, }), }, "Remote storage config should correctly parse the S3 config" diff --git a/pageserver/src/remote_storage.rs b/pageserver/src/remote_storage.rs index 8a09f7b9ca..39595b7167 100644 --- a/pageserver/src/remote_storage.rs +++ b/pageserver/src/remote_storage.rs @@ -161,7 +161,7 @@ pub fn start_local_timeline_sync( config, local_timeline_files, LocalFs::new(root.clone(), &config.workdir)?, - storage_config.max_concurrent_sync, + storage_config.max_concurrent_timelines_sync, storage_config.max_sync_errors, ) }, @@ -172,7 +172,7 @@ pub fn start_local_timeline_sync( config, local_timeline_files, S3Bucket::new(s3_config, &config.workdir)?, - storage_config.max_concurrent_sync, + storage_config.max_concurrent_timelines_sync, storage_config.max_sync_errors, ) }, diff --git a/pageserver/src/remote_storage/s3_bucket.rs b/pageserver/src/remote_storage/s3_bucket.rs index b69634a1b6..73d828d150 100644 --- a/pageserver/src/remote_storage/s3_bucket.rs +++ b/pageserver/src/remote_storage/s3_bucket.rs @@ -15,7 +15,7 @@ use rusoto_s3::{ DeleteObjectRequest, 
GetObjectRequest, ListObjectsV2Request, PutObjectRequest, S3Client, StreamingBody, S3, }; -use tokio::io; +use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; use tracing::debug; @@ -65,6 +65,10 @@ pub struct S3Bucket { client: S3Client, bucket_name: String, prefix_in_bucket: Option, + // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. + // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. + // The helps to ensure we don't exceed the thresholds. + concurrency_limiter: Semaphore, } impl S3Bucket { @@ -119,6 +123,7 @@ impl S3Bucket { pageserver_workdir, bucket_name: aws_config.bucket_name.clone(), prefix_in_bucket, + concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()), }) } } @@ -147,6 +152,11 @@ impl RemoteStorage for S3Bucket { let mut continuation_token = None; loop { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 list")?; let fetch_response = self .client .list_objects_v2(ListObjectsV2Request { @@ -180,6 +190,11 @@ impl RemoteStorage for S3Bucket { to: &Self::StoragePath, metadata: Option, ) -> anyhow::Result<()> { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 upload")?; self.client .put_object(PutObjectRequest { body: Some(StreamingBody::new_with_size( @@ -200,6 +215,11 @@ impl RemoteStorage for S3Bucket { from: &Self::StoragePath, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), ) -> anyhow::Result> { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 download")?; let object_output = self .client .get_object(GetObjectRequest { @@ -231,6 +251,11 @@ impl RemoteStorage for S3Bucket { Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive), None => format!("bytes={}-", start_inclusive), }); + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 range download")?; let object_output = self .client .get_object(GetObjectRequest { @@ -250,6 +275,11 @@ impl RemoteStorage for S3Bucket { } async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 delete")?; self.client .delete_object(DeleteObjectRequest { bucket: self.bucket_name.clone(), @@ -433,6 +463,7 @@ mod tests { client: S3Client::new("us-east-1".parse().unwrap()), bucket_name: "dummy-bucket".to_string(), prefix_in_bucket: Some("dummy_prefix/".to_string()), + concurrency_limiter: Semaphore::new(1), } } diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index 4d1ec2e225..20012f32d7 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -62,7 +62,7 @@ pub mod index; mod upload; use std::{ - collections::{hash_map, HashMap, HashSet, VecDeque}, + collections::{HashMap, HashSet, VecDeque}, fmt::Debug, num::{NonZeroU32, NonZeroUsize}, ops::ControlFlow, @@ -132,7 +132,9 @@ lazy_static! { /// mpsc approach was picked to allow blocking the sync loop if no tasks are present, to avoid meaningless spinning. 
mod sync_queue { use std::{ - collections::{hash_map, HashMap}, + collections::{hash_map, HashMap, HashSet}, + num::NonZeroUsize, + ops::ControlFlow, sync::atomic::{AtomicUsize, Ordering}, }; @@ -179,7 +181,7 @@ mod sync_queue { /// Polls a new task from the queue, using its receiver counterpart. /// Does not block if the queue is empty, returning [`None`] instead. /// Needed to correctly track the queue length. - pub async fn next_task( + async fn next_task( receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, ) -> Option<(ZTenantTimelineId, SyncTask)> { let task = receiver.recv().await; @@ -195,15 +197,29 @@ mod sync_queue { /// or two (download and upload, if both were found in the queue during batch construction). pub async fn next_task_batch( receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - mut max_batch_size: usize, - ) -> HashMap { - if max_batch_size == 0 { - return HashMap::new(); - } - let mut tasks: HashMap = - HashMap::with_capacity(max_batch_size); + max_timelines_to_sync: NonZeroUsize, + ) -> ControlFlow<(), HashMap> { + // request the first task in blocking fashion to do less meaningless work + let (first_sync_id, first_task) = if let Some(first_task) = next_task(receiver).await { + first_task + } else { + debug!("Queue sender part was dropped, aborting"); + return ControlFlow::Break(()); + }; + + let max_timelines_to_sync = max_timelines_to_sync.get(); + let mut batched_timelines = HashSet::with_capacity(max_timelines_to_sync); + batched_timelines.insert(first_sync_id.timeline_id); + + let mut tasks = HashMap::new(); + tasks.insert(first_sync_id, first_task); loop { + if batched_timelines.len() >= max_timelines_to_sync { + debug!("Filled a full task batch with {max_timelines_to_sync} timeline sync operations"); + break; + } + match receiver.try_recv() { Ok((sync_id, new_task)) => { LENGTH.fetch_sub(1, Ordering::Relaxed); @@ -216,24 +232,23 @@ mod sync_queue { v.insert(new_task); } } - - max_batch_size -= 1; - if max_batch_size == 0 { - break; - } + batched_timelines.insert(sync_id.timeline_id); } Err(TryRecvError::Disconnected) => { debug!("Sender disconnected, batch collection aborted"); break; } Err(TryRecvError::Empty) => { - debug!("No more data in the sync queue, task batch is not full"); + debug!( + "No more data in the sync queue, task batch is not full, length: {}, max allowed size: {max_timelines_to_sync}", + batched_timelines.len() + ); break; } } } - tasks + ControlFlow::Continue(tasks) } /// Length of the queue, assuming that all receiver counterparts were only called using the queue api. 
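For illustration only (not part of the diff above): the new batching contract is "block for the first task, then drain the channel without blocking until it is empty or tasks for max_concurrent_timelines_sync distinct timelines were collected; a dropped sender surfaces as ControlFlow::Break". A standalone sketch of that shape, using std::sync::mpsc and plain usize ids instead of tokio channels and ZTenantTimelineId:

    use std::collections::{HashMap, HashSet};
    use std::ops::ControlFlow;
    use std::sync::mpsc::{Receiver, TryRecvError};

    fn next_task_batch(
        rx: &Receiver<(usize, String)>,
        max_timelines: usize,
    ) -> ControlFlow<(), HashMap<usize, String>> {
        // Block for the first task; a dropped sender means the loop should stop.
        let (first_id, first_task) = match rx.recv() {
            Ok(task) => task,
            Err(_) => return ControlFlow::Break(()),
        };
        let mut timelines = HashSet::from([first_id]);
        let mut tasks = HashMap::from([(first_id, first_task)]);
        while timelines.len() < max_timelines {
            match rx.try_recv() {
                Ok((id, task)) => {
                    tasks.insert(id, task); // the real code merges tasks per sync id
                    timelines.insert(id);
                }
                // Empty or disconnected: stop collecting and process what we have.
                Err(TryRecvError::Empty) | Err(TryRecvError::Disconnected) => break,
            }
        }
        ControlFlow::Continue(tasks)
    }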
@@ -455,7 +470,7 @@ pub(super) fn spawn_storage_sync_thread( conf: &'static PageServerConf, local_timeline_files: HashMap)>, storage: S, - max_concurrent_sync: NonZeroUsize, + max_concurrent_timelines_sync: NonZeroUsize, max_sync_errors: NonZeroU32, ) -> anyhow::Result where @@ -497,7 +512,7 @@ where receiver, Arc::new(storage), loop_index, - max_concurrent_sync, + max_concurrent_timelines_sync, max_sync_errors, ); Ok(()) @@ -517,7 +532,7 @@ fn storage_sync_loop( mut receiver: UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, storage: Arc, index: RemoteIndex, - max_concurrent_sync: NonZeroUsize, + max_concurrent_timelines_sync: NonZeroUsize, max_sync_errors: NonZeroU32, ) where P: Debug + Send + Sync + 'static, @@ -534,7 +549,7 @@ fn storage_sync_loop( &mut receiver, storage, loop_index, - max_concurrent_sync, + max_concurrent_timelines_sync, max_sync_errors, ) .instrument(info_span!("storage_sync_loop_step")) => step, @@ -568,34 +583,19 @@ async fn loop_step( receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, storage: Arc, index: RemoteIndex, - max_concurrent_sync: NonZeroUsize, + max_concurrent_timelines_sync: NonZeroUsize, max_sync_errors: NonZeroU32, ) -> ControlFlow<(), HashMap>> where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { - let max_concurrent_sync = max_concurrent_sync.get(); - - // request the first task in blocking fashion to do less meaningless work - let (first_sync_id, first_task) = - if let Some(first_task) = sync_queue::next_task(receiver).await { - first_task - } else { - return ControlFlow::Break(()); + let batched_tasks = + match sync_queue::next_task_batch(receiver, max_concurrent_timelines_sync).await { + ControlFlow::Continue(batch) => batch, + ControlFlow::Break(()) => return ControlFlow::Break(()), }; - let mut batched_tasks = sync_queue::next_task_batch(receiver, max_concurrent_sync - 1).await; - match batched_tasks.entry(first_sync_id) { - hash_map::Entry::Occupied(o) => { - let current = o.remove(); - batched_tasks.insert(first_sync_id, current.merge(first_task)); - } - hash_map::Entry::Vacant(v) => { - v.insert(first_task); - } - } - let remaining_queue_length = sync_queue::len(); REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); if remaining_queue_length > 0 || !batched_tasks.is_empty() { @@ -623,7 +623,7 @@ where let mut new_timeline_states: HashMap< ZTenantId, HashMap, - > = HashMap::with_capacity(max_concurrent_sync); + > = HashMap::with_capacity(max_concurrent_timelines_sync.get()); while let Some((sync_id, state_update)) = sync_results.next().await { debug!("Finished storage sync task for sync id {sync_id}"); if let Some(state_update) = state_update { From 3fd234da07165401d339c59ce15577a2f0465951 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 26 Apr 2022 13:48:42 +0400 Subject: [PATCH 0193/1022] Enable etcd for safekeepers in deploy. 
--- .circleci/ansible/production.hosts | 1 + .circleci/ansible/staging.hosts | 1 + .circleci/ansible/systemd/safekeeper.service | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/ansible/production.hosts b/.circleci/ansible/production.hosts index 13224b7cf5..f32b57154c 100644 --- a/.circleci/ansible/production.hosts +++ b/.circleci/ansible/production.hosts @@ -14,3 +14,4 @@ safekeepers console_mgmt_base_url = http://console-release.local bucket_name = zenith-storage-oregon bucket_region = us-west-2 +etcd_endpoints = etcd-release.local:2379 diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index 69f058c2b9..71166c531e 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -15,3 +15,4 @@ safekeepers console_mgmt_base_url = http://console-staging.local bucket_name = zenith-staging-storage-us-east-1 bucket_region = us-east-1 +etcd_endpoints = etcd-staging.local:2379 diff --git a/.circleci/ansible/systemd/safekeeper.service b/.circleci/ansible/systemd/safekeeper.service index e75602b609..cac38d8756 100644 --- a/.circleci/ansible/systemd/safekeeper.service +++ b/.circleci/ansible/systemd/safekeeper.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=safekeeper Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib -ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data +ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT From 8b9d523f3cb1a140912bb5c0fdd67e176a10b45c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 26 Apr 2022 19:37:56 +0400 Subject: [PATCH 0194/1022] Remove old WAL on safekeepers. Remove when it is consumed by all of 1) pageserver (remote_consistent_lsn) 2) safekeeper peers 3) s3 WAL offloading. In test s3 offloading for now is mocked by directly bumping s3_wal_lsn. 
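Concretely, the removal horizon is the minimum of those three LSNs, rounded down
to a WAL segment number. A minimal sketch of that calculation (illustration only:
plain u64 stands in for the repo's Lsn/XLogSegNo types, and the 16 MiB segment
size used in the example is an assumption):

    // Mirrors get_horizon_segno() added further down in this patch.
    fn horizon_segno(
        remote_consistent_lsn: u64, // 1) consumed by the pageserver
        peer_horizon_lsn: u64,      // 2) consumed by safekeeper peers
        s3_wal_lsn: u64,            // 3) offloaded to s3
        wal_seg_size: u64,
    ) -> u64 {
        let horizon_lsn = remote_consistent_lsn
            .min(peer_horizon_lsn)
            .min(s3_wal_lsn);
        horizon_lsn / wal_seg_size
    }

    fn main() {
        // With 16 MiB segments and the slowest consumer at 48 MiB, the horizon is
        // segment 3; the patch then removes segments <= horizon - 1, i.e. 1 and 2.
        assert_eq!(horizon_segno(64 << 20, u64::MAX, 48 << 20, 16 << 20), 3);
    }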
ref #1403 --- safekeeper/src/bin/safekeeper.rs | 13 ++++- safekeeper/src/broker.rs | 7 ++- safekeeper/src/http/routes.rs | 20 ++++++++ safekeeper/src/lib.rs | 1 + safekeeper/src/remove_wal.rs | 25 ++++++++++ safekeeper/src/safekeeper.rs | 24 +++++++++ safekeeper/src/timeline.rs | 24 +++++++++ safekeeper/src/wal_storage.rs | 48 +++++++++++++++++- test_runner/batch_others/test_wal_acceptor.py | 49 +++++++++++++++++++ test_runner/fixtures/zenith_fixtures.py | 9 ++++ 10 files changed, 215 insertions(+), 5 deletions(-) create mode 100644 safekeeper/src/remove_wal.rs diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 7434f921cb..3fea3581a8 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -16,11 +16,11 @@ use url::{ParseError, Url}; use safekeeper::control_file::{self}; use safekeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR}; -use safekeeper::http; -use safekeeper::s3_offload; +use safekeeper::remove_wal; use safekeeper::wal_service; use safekeeper::SafeKeeperConf; use safekeeper::{broker, callmemaybe}; +use safekeeper::{http, s3_offload}; use utils::{ http::endpoint, logging, shutdown::exit_now, signals, tcp_listener, zid::ZNodeId, GIT_VERSION, }; @@ -292,6 +292,15 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b ); } + let conf_ = conf.clone(); + threads.push( + thread::Builder::new() + .name("WAL removal thread".into()) + .spawn(|| { + remove_wal::thread_main(conf_); + })?, + ); + // TODO: put more thoughts into handling of failed threads // We probably should restart them. diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index b84b5cf789..8ce7bdf0e5 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -32,23 +32,28 @@ const ZENITH_PREFIX: &str = "zenith"; /// Published data about safekeeper. Fields made optional for easy migrations. #[serde_as] -#[derive(Deserialize, Serialize)] +#[derive(Debug, Deserialize, Serialize)] pub struct SafekeeperInfo { /// Term of the last entry. pub last_log_term: Option, /// LSN of the last record. #[serde_as(as = "Option")] + #[serde(default)] pub flush_lsn: Option, /// Up to which LSN safekeeper regards its WAL as committed. #[serde_as(as = "Option")] + #[serde(default)] pub commit_lsn: Option, /// LSN up to which safekeeper offloaded WAL to s3. #[serde_as(as = "Option")] + #[serde(default)] pub s3_wal_lsn: Option, /// LSN of last checkpoint uploaded by pageserver. 
#[serde_as(as = "Option")] + #[serde(default)] pub remote_consistent_lsn: Option, #[serde_as(as = "Option")] + #[serde(default)] pub peer_horizon_lsn: Option, } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 2d22332db9..fab8724430 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -5,6 +5,7 @@ use serde::Serializer; use std::fmt::Display; use std::sync::Arc; +use crate::broker::SafekeeperInfo; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; use crate::timeline::GlobalTimelines; @@ -123,6 +124,20 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> { + let zttid = ZTenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + let safekeeper_info: SafekeeperInfo = json_request(&mut request).await?; + + let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?; + tli.record_safekeeper_info(&safekeeper_info, ZNodeId(1))?; + + json_response(StatusCode::OK, ()) +} + /// Safekeeper http router. pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder { let router = endpoint::make_router(); @@ -134,4 +149,9 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder timeline_status_handler, ) .post("/v1/timeline", timeline_create_handler) + // for tests + .post( + "/v1/record_safekeeper_info/:tenant_id/:timeline_id", + record_safekeeper_info, + ) } diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 8951e8f680..6509e8166a 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -13,6 +13,7 @@ pub mod handler; pub mod http; pub mod json_ctrl; pub mod receive_wal; +pub mod remove_wal; pub mod s3_offload; pub mod safekeeper; pub mod send_wal; diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs new file mode 100644 index 0000000000..9474f65e5f --- /dev/null +++ b/safekeeper/src/remove_wal.rs @@ -0,0 +1,25 @@ +//! Thread removing old WAL. + +use std::{thread, time::Duration}; + +use tracing::*; + +use crate::{timeline::GlobalTimelines, SafeKeeperConf}; + +pub fn thread_main(conf: SafeKeeperConf) { + let wal_removal_interval = Duration::from_millis(5000); + loop { + let active_tlis = GlobalTimelines::get_active_timelines(); + for zttid in &active_tlis { + if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) { + if let Err(e) = tli.remove_old_wal() { + warn!( + "failed to remove WAL for tenant {} timeline {}: {}", + tli.zttid.tenant_id, tli.zttid.timeline_id, e + ); + } + } + } + thread::sleep(wal_removal_interval) + } +} diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 59174f34a2..048753152b 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -5,6 +5,8 @@ use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_ffi::xlog_utils::TimeLineID; + +use postgres_ffi::xlog_utils::XLogSegNo; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; @@ -880,6 +882,24 @@ where } Ok(()) } + + /// Get oldest segno we still need to keep. We hold WAL till it is consumed + /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 + /// offloading. + /// While it is safe to use inmem values for determining horizon, + /// we use persistent to make possible normal states less surprising. 
+ pub fn get_horizon_segno(&self) -> XLogSegNo { + let horizon_lsn = min( + min( + self.state.remote_consistent_lsn, + self.state.peer_horizon_lsn, + ), + self.state.s3_wal_lsn, + ); + let res = horizon_lsn.segment_number(self.state.server.wal_seg_size as usize); + info!("horizon is {}, res {}", horizon_lsn, res); + res + } } #[cfg(test)] @@ -935,6 +955,10 @@ mod tests { fn flush_wal(&mut self) -> Result<()> { Ok(()) } + + fn remove_up_to(&self) -> Box Result<()>> { + Box::new(move |_segno_up_to: XLogSegNo| Ok(())) + } } #[test] diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index fbae34251c..4a507015d3 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,6 +4,7 @@ use anyhow::{bail, Context, Result}; use lazy_static::lazy_static; +use postgres_ffi::xlog_utils::XLogSegNo; use std::cmp::{max, min}; use std::collections::HashMap; @@ -88,6 +89,7 @@ struct SharedState { active: bool, num_computes: u32, pageserver_connstr: Option, + last_removed_segno: XLogSegNo, } impl SharedState { @@ -109,6 +111,7 @@ impl SharedState { active: false, num_computes: 0, pageserver_connstr: None, + last_removed_segno: 0, }) } @@ -127,6 +130,7 @@ impl SharedState { active: false, num_computes: 0, pageserver_connstr: None, + last_removed_segno: 0, }) } @@ -459,6 +463,26 @@ impl Timeline { let shared_state = self.mutex.lock().unwrap(); shared_state.sk.wal_store.flush_lsn() } + + pub fn remove_old_wal(&self) -> Result<()> { + let horizon_segno: XLogSegNo; + let remover: Box Result<(), anyhow::Error>>; + { + let shared_state = self.mutex.lock().unwrap(); + horizon_segno = shared_state.sk.get_horizon_segno(); + remover = shared_state.sk.wal_store.remove_up_to(); + if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { + return Ok(()); + } + // release the lock before removing + } + let _enter = + info_span!("", timeline = %self.zttid.tenant_id, tenant = %self.zttid.timeline_id) + .entered(); + remover(horizon_segno - 1)?; + self.mutex.lock().unwrap().last_removed_segno = horizon_segno; + Ok(()) + } } // Utilities needed by various Connection-like objects diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 69a4fb11e1..503bd7c543 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -11,10 +11,12 @@ use anyhow::{anyhow, bail, Context, Result}; use std::io::{Read, Seek, SeekFrom}; use lazy_static::lazy_static; -use postgres_ffi::xlog_utils::{find_end_of_wal, XLogSegNo, PG_TLI}; +use postgres_ffi::xlog_utils::{ + find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, XLogSegNo, PG_TLI, +}; use std::cmp::min; -use std::fs::{self, File, OpenOptions}; +use std::fs::{self, remove_file, File, OpenOptions}; use std::io::Write; use std::path::{Path, PathBuf}; @@ -101,6 +103,10 @@ pub trait Storage { /// Durably store WAL on disk, up to the last written WAL record. fn flush_wal(&mut self) -> Result<()>; + + /// Remove all segments <= given segno. Returns closure as we want to do + /// that without timeline lock. + fn remove_up_to(&self) -> Box Result<()>>; } /// PhysicalStorage is a storage that stores WAL on disk. 
Writes are separated from flushes @@ -466,6 +472,44 @@ impl Storage for PhysicalStorage { self.update_flush_lsn(); Ok(()) } + + fn remove_up_to(&self) -> Box Result<()>> { + let timeline_dir = self.timeline_dir.clone(); + let wal_seg_size = self.wal_seg_size.unwrap(); + Box::new(move |segno_up_to: XLogSegNo| { + remove_up_to(&timeline_dir, wal_seg_size, segno_up_to) + }) + } +} + +/// Remove all WAL segments in timeline_dir <= given segno. +fn remove_up_to(timeline_dir: &Path, wal_seg_size: usize, segno_up_to: XLogSegNo) -> Result<()> { + let mut n_removed = 0; + for entry in fs::read_dir(&timeline_dir)? { + let entry = entry?; + let entry_path = entry.path(); + let fname = entry_path.file_name().unwrap(); + + if let Some(fname_str) = fname.to_str() { + /* Ignore files that are not XLOG segments */ + if !IsXLogFileName(fname_str) && !IsPartialXLogFileName(fname_str) { + continue; + } + let (segno, _) = XLogFromFileName(fname_str, wal_seg_size); + if segno <= segno_up_to { + remove_file(entry_path)?; + n_removed += 1; + } + } + } + let segno_from = segno_up_to - n_removed + 1; + info!( + "removed {} WAL segments [{}; {}]", + n_removed, + XLogFileName(PG_TLI, segno_from, wal_seg_size), + XLogFileName(PG_TLI, segno_up_to, wal_seg_size) + ); + Ok(()) } pub struct WalReader { diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index cc9ec9a275..395084af0e 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -370,6 +370,55 @@ def test_broker(zenith_env_builder: ZenithEnvBuilder): time.sleep(0.5) +# Test that old WAL consumed by peers and pageserver is removed from safekeepers. +@pytest.mark.skipif(etcd_path() is None, reason="requires etcd which is not present in PATH") +def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): + zenith_env_builder.num_safekeepers = 2 + zenith_env_builder.broker = True + # to advance remote_consistent_llsn + zenith_env_builder.enable_local_fs_remote_storage() + env = zenith_env_builder.init_start() + + env.zenith_cli.create_branch('test_safekeepers_wal_removal') + pg = env.postgres.create_start('test_safekeepers_wal_removal') + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + # we rely upon autocommit after each statement + # as waiting for acceptors happens there + cur.execute('CREATE TABLE t(key int primary key, value text)') + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + + tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] + timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + + # force checkpoint to advance remote_consistent_lsn + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor() as pscur: + pscur.execute(f"checkpoint {tenant_id} {timeline_id}") + + # We will wait for first segment removal. Make sure they exist for starter. + first_segments = [ + os.path.join(sk.data_dir(), tenant_id, timeline_id, '000000010000000000000001') + for sk in env.safekeepers + ] + assert all(os.path.exists(p) for p in first_segments) + + http_cli = env.safekeepers[0].http_client() + # Pretend WAL is offloaded to s3. 
+ http_cli.record_safekeeper_info(tenant_id, timeline_id, {'s3_wal_lsn': 'FFFFFFFF/FEFFFFFF'}) + + # wait till first segment is removed on all safekeepers + started_at = time.time() + while True: + if all(not os.path.exists(p) for p in first_segments): + break + elapsed = time.time() - started_at + if elapsed > 20: + raise RuntimeError(f"timed out waiting {elapsed:.0f}s for first segment get removed") + time.sleep(0.5) + + class ProposerPostgres(PgProtocol): """Object for running postgres without ZenithEnv""" def __init__(self, diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index d295a79953..e16d1acf2f 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1738,6 +1738,9 @@ class Safekeeper: def http_client(self) -> SafekeeperHttpClient: return SafekeeperHttpClient(port=self.port.http) + def data_dir(self) -> str: + return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}") + @dataclass class SafekeeperTimelineStatus: @@ -1770,6 +1773,12 @@ class SafekeeperHttpClient(requests.Session): flush_lsn=resj['flush_lsn'], remote_consistent_lsn=resj['remote_consistent_lsn']) + def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body): + res = self.post( + f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", + json=body) + res.raise_for_status() + def get_metrics(self) -> SafekeeperMetrics: request_result = self.get(f"http://localhost:{self.port}/metrics") request_result.raise_for_status() From b2e35fffa6743aa6a768337a3cd9ffdfa4f255aa Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Wed, 20 Apr 2022 23:36:33 -0700 Subject: [PATCH 0195/1022] Fix ancestor layer traversal (#1484) Signed-off-by: Dhammika Pathirana --- pageserver/src/layered_repository.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 3afef51a23..0dc54385b2 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1466,10 +1466,10 @@ impl LayeredTimeline { )?; cont_lsn = lsn_floor; path.push((result, cont_lsn, layer)); - } else if self.ancestor_timeline.is_some() { + } else if timeline.ancestor_timeline.is_some() { // Nothing on this timeline. Traverse to parent result = ValueReconstructResult::Continue; - cont_lsn = Lsn(self.ancestor_lsn.0 + 1); + cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); } else { // Nothing found result = ValueReconstructResult::Missing; From 6391862d8a791ee6d9377c588c1a4de08b13ed5a Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Thu, 21 Apr 2022 11:50:38 -0700 Subject: [PATCH 0196/1022] Add branch traversal test Signed-off-by: Dhammika Pathirana --- .../batch_others/test_ancestor_branch.py | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 test_runner/batch_others/test_ancestor_branch.py diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py new file mode 100644 index 0000000000..fa12f25894 --- /dev/null +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -0,0 +1,111 @@ +import subprocess +import asyncio +from contextlib import closing + +import psycopg2.extras +import pytest +from fixtures.log_helper import log +from fixtures.zenith_fixtures import ZenithEnvBuilder + + +# +# Create ancestor branches off the main branch. 
+# +def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): + + # Use safekeeper in this test to avoid a subtle race condition. + # Without safekeeper, walreceiver reconnection can stuck + # because of IO deadlock. + # + # See https://github.com/zenithdb/zenith/issues/1068 + zenith_env_builder.num_safekeepers = 1 + env = zenith_env_builder.init() + + # Override defaults, 1M gc_horizon and 4M checkpoint_distance. + # Extend compaction_period and gc_period to disable background compaction and gc. + env.pageserver.start(overrides=[ + '--pageserver-config-override="gc_period"="10 m"', + '--pageserver-config-override="gc_horizon"=1048576', + '--pageserver-config-override="checkpoint_distance"=4194304', + '--pageserver-config-override="compaction_period"="10 m"', + '--pageserver-config-override="compaction_threshold"=2' + ]) + env.safekeepers[0].start() + + pg_branch0 = env.postgres.create_start('main') + branch0_cur = pg_branch0.connect().cursor() + branch0_cur.execute("SHOW zenith.zenith_timeline") + branch0_timeline = branch0_cur.fetchone()[0] + log.info(f"b0 timeline {branch0_timeline}") + + # Create table, and insert 100k rows. + branch0_cur.execute('SELECT pg_current_wal_insert_lsn()') + branch0_lsn = branch0_cur.fetchone()[0] + log.info(f"b0 at lsn {branch0_lsn}") + + branch0_cur.execute('CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)') + branch0_cur.execute(''' + INSERT INTO foo + SELECT '00112233445566778899AABBCCDDEEFF' || ':branch0:' || g + FROM generate_series(1, 100000) g + ''') + branch0_cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn_100 = branch0_cur.fetchone()[0] + log.info(f'LSN after 100 rows: {lsn_100}') + + # Create branch1. + env.zenith_cli.create_branch('branch1', 'main', ancestor_start_lsn=lsn_100) + pg_branch1 = env.postgres.create_start('branch1') + log.info("postgres is running on 'branch1' branch") + + branch1_cur = pg_branch1.connect().cursor() + branch1_cur.execute("SHOW zenith.zenith_timeline") + branch1_timeline = branch1_cur.fetchone()[0] + log.info(f"b1 timeline {branch1_timeline}") + + branch1_cur.execute('SELECT pg_current_wal_insert_lsn()') + branch1_lsn = branch1_cur.fetchone()[0] + log.info(f"b1 at lsn {branch1_lsn}") + + # Insert 100k rows. + branch1_cur.execute(''' + INSERT INTO foo + SELECT '00112233445566778899AABBCCDDEEFF' || ':branch1:' || g + FROM generate_series(1, 100000) g + ''') + branch1_cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn_200 = branch1_cur.fetchone()[0] + log.info(f'LSN after 100 rows: {lsn_200}') + + # Create branch2. + env.zenith_cli.create_branch('branch2', 'branch1', ancestor_start_lsn=lsn_200) + pg_branch2 = env.postgres.create_start('branch2') + log.info("postgres is running on 'branch1' branch") + + branch2_cur = pg_branch2.connect().cursor() + branch2_cur.execute("SHOW zenith.zenith_timeline") + branch2_lsn = branch2_cur.fetchone()[0] + log.info(f"b2 timeline {branch1_timeline}") + + branch2_cur.execute('SELECT pg_current_wal_insert_lsn()') + branch2_lsn = branch2_cur.fetchone()[0] + log.info(f"b2 at lsn {branch2_lsn}") + + # Insert 100k rows. 
+ branch2_cur.execute(''' + INSERT INTO foo + SELECT '00112233445566778899AABBCCDDEEFF' || ':branch2:' || g + FROM generate_series(1, 100000) g + ''') + branch2_cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn_300 = branch2_cur.fetchone()[0] + log.info(f'LSN after 300 rows: {lsn_300}') + + branch0_cur.execute('SELECT count(*) FROM foo') + assert branch0_cur.fetchone() == (100000, ) + + branch1_cur.execute('SELECT count(*) FROM foo') + assert branch1_cur.fetchone() == (200000, ) + + branch2_cur.execute('SELECT count(*) FROM foo') + assert branch2_cur.fetchone() == (300000, ) From aeb4f81c3bb74e4b0adc570f760e785bf8463533 Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Thu, 21 Apr 2022 21:04:00 -0700 Subject: [PATCH 0197/1022] Add branch traversal unit test Signed-off-by: Dhammika Pathirana --- pageserver/src/layered_repository.rs | 57 ++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 0dc54385b2..679daa8248 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -2630,4 +2630,61 @@ pub mod tests { Ok(()) } + + #[test] + fn test_traverse_ancestors() -> Result<()> { + let repo = RepoHarness::create("test_traverse_ancestors")?.load(); + let mut tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + const NUM_KEYS: usize = 100; + const NUM_TLINES: usize = 50; + + let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); + // Track page mutation lsns across different timelines. + let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES]; + + let mut lsn = Lsn(0); + let mut tline_id = TIMELINE_ID; + + #[allow(clippy::needless_range_loop)] + for idx in 0..NUM_TLINES { + let new_tline_id = ZTimelineId::generate(); + repo.branch_timeline(tline_id, new_tline_id, lsn)?; + tline = repo.get_timeline_load(new_tline_id)?; + tline_id = new_tline_id; + + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = blknum as u32; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))), + )?; + println!("updating [{}][{}] at {}", idx, blknum, lsn); + writer.finish_write(lsn); + drop(writer); + updated[idx][blknum] = lsn; + } + } + + // Read pages from leaf timeline across all ancestors. + for (idx, lsns) in updated.iter().enumerate() { + for (blknum, lsn) in lsns.iter().enumerate() { + // Skip empty mutations. 
+ if lsn.0 == 0 { + continue; + } + println!("chekcking [{}][{}] at {}", idx, blknum, lsn); + test_key.field6 = blknum as u32; + assert_eq!( + tline.get(test_key, *lsn)?, + TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn)) + ); + } + } + Ok(()) + } } From 091cefaa92afdecb8a260729ae39270b6a45193f Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Fri, 22 Apr 2022 17:17:44 -0700 Subject: [PATCH 0198/1022] Fix add compaction for key partitioning Signed-off-by: Dhammika Pathirana --- .../batch_others/test_ancestor_branch.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index fa12f25894..1e96369314 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -28,7 +28,8 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): '--pageserver-config-override="gc_horizon"=1048576', '--pageserver-config-override="checkpoint_distance"=4194304', '--pageserver-config-override="compaction_period"="10 m"', - '--pageserver-config-override="compaction_threshold"=2' + '--pageserver-config-override="compaction_threshold"=2', + '--pageserver-config-override="compaction_target_size"=4194304' ]) env.safekeepers[0].start() @@ -51,7 +52,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): ''') branch0_cur.execute('SELECT pg_current_wal_insert_lsn()') lsn_100 = branch0_cur.fetchone()[0] - log.info(f'LSN after 100 rows: {lsn_100}') + log.info(f'LSN after 100k rows: {lsn_100}') # Create branch1. env.zenith_cli.create_branch('branch1', 'main', ancestor_start_lsn=lsn_100) @@ -75,17 +76,17 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): ''') branch1_cur.execute('SELECT pg_current_wal_insert_lsn()') lsn_200 = branch1_cur.fetchone()[0] - log.info(f'LSN after 100 rows: {lsn_200}') + log.info(f'LSN after 200k rows: {lsn_200}') # Create branch2. env.zenith_cli.create_branch('branch2', 'branch1', ancestor_start_lsn=lsn_200) pg_branch2 = env.postgres.create_start('branch2') - log.info("postgres is running on 'branch1' branch") - + log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() + branch2_cur.execute("SHOW zenith.zenith_timeline") - branch2_lsn = branch2_cur.fetchone()[0] - log.info(f"b2 timeline {branch1_timeline}") + branch2_timeline = branch2_cur.fetchone()[0] + log.info(f"b2 timeline {branch2_timeline}") branch2_cur.execute('SELECT pg_current_wal_insert_lsn()') branch2_lsn = branch2_cur.fetchone()[0] @@ -99,7 +100,11 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): ''') branch2_cur.execute('SELECT pg_current_wal_insert_lsn()') lsn_300 = branch2_cur.fetchone()[0] - log.info(f'LSN after 300 rows: {lsn_300}') + log.info(f'LSN after 300k rows: {lsn_300}') + + # Run compaction on branch1. 
+ psconn = env.pageserver.connect() + psconn.cursor().execute(f'''compact {env.initial_tenant.hex} {branch1_timeline} {lsn_200}''') branch0_cur.execute('SELECT count(*) FROM foo') assert branch0_cur.fetchone() == (100000, ) From 66694e736a2e53bd611198507bc9efdb9770921c Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Mon, 25 Apr 2022 13:55:00 -0700 Subject: [PATCH 0199/1022] Fix add ps tenant config Signed-off-by: Dhammika Pathirana --- .../batch_others/test_ancestor_branch.py | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index 1e96369314..aeb45348ad 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -19,21 +19,22 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): # # See https://github.com/zenithdb/zenith/issues/1068 zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Override defaults, 1M gc_horizon and 4M checkpoint_distance. # Extend compaction_period and gc_period to disable background compaction and gc. - env.pageserver.start(overrides=[ - '--pageserver-config-override="gc_period"="10 m"', - '--pageserver-config-override="gc_horizon"=1048576', - '--pageserver-config-override="checkpoint_distance"=4194304', - '--pageserver-config-override="compaction_period"="10 m"', - '--pageserver-config-override="compaction_threshold"=2', - '--pageserver-config-override="compaction_target_size"=4194304' - ]) - env.safekeepers[0].start() + tenant = env.zenith_cli.create_tenant( + conf={ + 'gc_period': '10 m', + 'gc_horizon': '1048576', + 'checkpoint_distance': '4194304', + 'compaction_period': '10 m', + 'compaction_threshold': '2', + 'compaction_target_size': '4194304', + }) - pg_branch0 = env.postgres.create_start('main') + env.zenith_cli.create_timeline(f'main', tenant_id=tenant) + pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() branch0_cur.execute("SHOW zenith.zenith_timeline") branch0_timeline = branch0_cur.fetchone()[0] @@ -55,8 +56,8 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 100k rows: {lsn_100}') # Create branch1. - env.zenith_cli.create_branch('branch1', 'main', ancestor_start_lsn=lsn_100) - pg_branch1 = env.postgres.create_start('branch1') + env.zenith_cli.create_branch('branch1', 'main', tenant_id=tenant, ancestor_start_lsn=lsn_100) + pg_branch1 = env.postgres.create_start('branch1', tenant_id=tenant) log.info("postgres is running on 'branch1' branch") branch1_cur = pg_branch1.connect().cursor() @@ -79,8 +80,8 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 200k rows: {lsn_200}') # Create branch2. - env.zenith_cli.create_branch('branch2', 'branch1', ancestor_start_lsn=lsn_200) - pg_branch2 = env.postgres.create_start('branch2') + env.zenith_cli.create_branch('branch2', 'branch1', tenant_id=tenant, ancestor_start_lsn=lsn_200) + pg_branch2 = env.postgres.create_start('branch2', tenant_id=tenant) log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() @@ -104,7 +105,8 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): # Run compaction on branch1. 
psconn = env.pageserver.connect() - psconn.cursor().execute(f'''compact {env.initial_tenant.hex} {branch1_timeline} {lsn_200}''') + log.info(f'compact {tenant.hex} {branch1_timeline} {lsn_200}') + psconn.cursor().execute(f'''compact {tenant.hex} {branch1_timeline} {lsn_200}''') branch0_cur.execute('SELECT count(*) FROM foo') assert branch0_cur.fetchone() == (100000, ) From 695b5f9d88c33b4c141a9d701b9e43ecb9f49f81 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 27 Apr 2022 13:42:48 +0300 Subject: [PATCH 0200/1022] Remove obsolete failpoint in proxy When failpoint feature is disabled it throws away passed code so code inside is not guaranteed to compile when feature is disabled. In this particular case code is obsolete so removing it. --- Cargo.lock | 1 - proxy/Cargo.toml | 1 - proxy/src/auth/credentials.rs | 4 ---- 3 files changed, 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3797e4e76b..bac5dfb674 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2002,7 +2002,6 @@ dependencies = [ "base64", "bytes", "clap 3.0.14", - "fail", "futures", "hashbrown", "hex", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 25aebc03e8..f7e872ceb9 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -8,7 +8,6 @@ anyhow = "1.0" base64 = "0.13.0" bytes = { version = "1.0.1", features = ['serde'] } clap = "3.0" -fail = "0.5.0" futures = "0.3.13" hashbrown = "0.11.2" hex = "0.4.3" diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 7c8ba28622..c3bb6da4f8 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -48,10 +48,6 @@ impl ClientCredentials { config: &ProxyConfig, client: &mut PqStream, ) -> Result { - fail::fail_point!("proxy-authenticate", |_| { - Err(AuthError::auth_failed("failpoint triggered")) - }); - use crate::config::ClientAuthMethod::*; use crate::config::RouterConfig::*; match &config.router_config { From 29539b056100c7c0b3574ec13789ef91e9d748d9 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 27 Apr 2022 19:09:28 +0300 Subject: [PATCH 0201/1022] Set wal_keep_size to zero (#1507) wal_keep_size is already set to 0 in our cloud setup, but we don't use this value in tests. This commit fixes wal_keep_size in control_plane and adds tests for WAL recycling and lagging safekeepers. --- control_plane/src/compute.rs | 7 +-- test_runner/batch_others/test_wal_acceptor.py | 55 ++++++++++++++++++- .../batch_others/test_wal_acceptor_async.py | 37 ++++++++++--- test_runner/fixtures/utils.py | 11 ++++ 4 files changed, 95 insertions(+), 15 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 2549baca5d..92d0e080d8 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -273,12 +273,7 @@ impl PostgresNode { conf.append("wal_sender_timeout", "5s"); conf.append("listen_addresses", &self.address.ip().to_string()); conf.append("port", &self.address.port().to_string()); - - // Never clean up old WAL. TODO: We should use a replication - // slot or something proper, to prevent the compute node - // from removing WAL that hasn't been streamed to the safekeeper or - // page server yet. 
(gh issue #349) - conf.append("wal_keep_size", "10TB"); + conf.append("wal_keep_size", "0"); // Configure the node to fetch pages from pageserver let pageserver_connstr = { diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 395084af0e..94059e2a4c 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -13,7 +13,7 @@ from dataclasses import dataclass, field from multiprocessing import Process, Value from pathlib import Path from fixtures.zenith_fixtures import PgBin, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol -from fixtures.utils import etcd_path, lsn_to_hex, mkdir_if_needed, lsn_from_hex +from fixtures.utils import etcd_path, get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex from fixtures.log_helper import log from typing import List, Optional, Any @@ -791,3 +791,56 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): env.safekeepers[1].stop(immediate=True) execute_payload(pg) show_statuses(env.safekeepers, tenant_id, timeline_id) + + +# We have `wal_keep_size=0`, so postgres should trim WAL once it's broadcasted +# to all safekeepers. This test checks that compute WAL can fit into small number +# of WAL segments. +def test_wal_deleted_after_broadcast(zenith_env_builder: ZenithEnvBuilder): + # used to calculate delta in collect_stats + last_lsn = .0 + + # returns LSN and pg_wal size, all in MB + def collect_stats(pg: Postgres, cur, enable_logs=True): + nonlocal last_lsn + assert pg.pgdata_dir is not None + + log.info('executing INSERT to generate WAL') + cur.execute("select pg_current_wal_lsn()") + current_lsn = lsn_from_hex(cur.fetchone()[0]) / 1024 / 1024 + pg_wal_size = get_dir_size(os.path.join(pg.pgdata_dir, 'pg_wal')) / 1024 / 1024 + if enable_logs: + log.info(f"LSN delta: {current_lsn - last_lsn} MB, current WAL size: {pg_wal_size} MB") + last_lsn = current_lsn + return current_lsn, pg_wal_size + + # generates about ~20MB of WAL, to create at least one new segment + def generate_wal(cur): + cur.execute("INSERT INTO t SELECT generate_series(1,300000), 'payload'") + + zenith_env_builder.num_safekeepers = 3 + env = zenith_env_builder.init_start() + + env.zenith_cli.create_branch('test_wal_deleted_after_broadcast') + # Adjust checkpoint config to prevent keeping old WAL segments + pg = env.postgres.create_start( + 'test_wal_deleted_after_broadcast', + config_lines=['min_wal_size=32MB', 'max_wal_size=32MB', 'log_checkpoints=on']) + + pg_conn = pg.connect() + cur = pg_conn.cursor() + cur.execute('CREATE TABLE t(key int, value text)') + + collect_stats(pg, cur) + + # generate WAL to simulate normal workload + for i in range(5): + generate_wal(cur) + collect_stats(pg, cur) + + log.info('executing checkpoint') + cur.execute('CHECKPOINT') + wal_size_after_checkpoint = collect_stats(pg, cur)[1] + + # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) + assert wal_size_after_checkpoint < 16 * 2.5 diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index e3df8ea3eb..c484b6401c 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -139,13 +139,12 @@ async def wait_for_lsn(safekeeper: Safekeeper, async def run_restarts_under_load(env: ZenithEnv, pg: Postgres, acceptors: List[Safekeeper], - n_workers=10): - n_accounts = 
100 - init_amount = 100000 - max_transfer = 100 - period_time = 4 - iterations = 10 - + n_workers=10, + n_accounts=100, + init_amount=100000, + max_transfer=100, + period_time=4, + iterations=10): # Set timeout for this test at 5 minutes. It should be enough for test to complete # and less than CircleCI's no_output_timeout, taking into account that this timeout # is checked only at the beginning of every iteration. @@ -202,7 +201,7 @@ async def run_restarts_under_load(env: ZenithEnv, await pg_conn.close() -# restart acceptors one by one, while executing and validating bank transactions +# Restart acceptors one by one, while executing and validating bank transactions def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() @@ -213,3 +212,25 @@ def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder): config_lines=['max_replication_write_lag=1MB']) asyncio.run(run_restarts_under_load(env, pg, env.safekeepers)) + + +# Restart acceptors one by one and test that everything is working as expected +# when checkpoins are triggered frequently by max_wal_size=32MB. Because we have +# wal_keep_size=0, there will be aggressive WAL segments recycling. +def test_restarts_frequent_checkpoints(zenith_env_builder: ZenithEnvBuilder): + zenith_env_builder.num_safekeepers = 3 + env = zenith_env_builder.init_start() + + env.zenith_cli.create_branch('test_restarts_frequent_checkpoints') + # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long + pg = env.postgres.create_start('test_restarts_frequent_checkpoints', + config_lines=[ + 'max_replication_write_lag=1MB', + 'min_wal_size=32MB', + 'max_wal_size=32MB', + 'log_checkpoints=on' + ]) + + # we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments + # are not removed before broadcasted to all safekeepers, with the help of replication slot + asyncio.run(run_restarts_under_load(env, pg, env.safekeepers, period_time=15, iterations=5)) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index f16fe1d9cf..98af511036 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -82,3 +82,14 @@ def print_gc_result(row): # path to etcd binary or None if not present. def etcd_path(): return shutil.which("etcd") + + +# Traverse directory to get total size. +def get_dir_size(path: str) -> int: + """Return size in bytes.""" + totalbytes = 0 + for root, dirs, files in os.walk(path): + for name in files: + totalbytes += os.path.getsize(os.path.join(root, name)) + + return totalbytes From 5c5c3c64f3153b4b67c0ed4f51d4ab14c8aa1da2 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 26 Apr 2022 19:35:07 +0300 Subject: [PATCH 0202/1022] Fix tenant config parsing. 
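With this change the section key is spelled tenant_config (both in pageserver.toml
and in the generated per-tenant config file), and duration fields go through
humantime_serde, so a section roughly like the following should parse. The numeric
values are taken from the test below; the gc_period line is an assumed example of
the humantime string form:

    [tenant_config]
    checkpoint_distance = 10000
    compaction_target_size = 1048576
    gc_period = '30 s'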
Add a test --- Cargo.lock | 11 ++++ pageserver/Cargo.toml | 1 + pageserver/src/config.rs | 2 +- pageserver/src/layered_repository.rs | 4 +- pageserver/src/tenant_config.rs | 6 ++ test_runner/batch_others/test_tenant_conf.py | 59 +++++++++++++------- 6 files changed, 61 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bac5dfb674..58125ca41c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1073,6 +1073,16 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +[[package]] +name = "humantime-serde" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57a3db5ea5923d99402c94e9feb261dc5ee9b4efa158b0315f788cf549cc200c" +dependencies = [ + "humantime", + "serde", +] + [[package]] name = "hyper" version = "0.14.17" @@ -1626,6 +1636,7 @@ dependencies = [ "hex", "hex-literal", "humantime", + "humantime-serde", "hyper", "itertools", "lazy_static", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 6648d8417a..5607baf698 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -35,6 +35,7 @@ humantime = "2.1.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_with = "1.12.0" +humantime-serde = "1.1.1" pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8bfe8b57ec..aed7eabb76 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -439,7 +439,7 @@ impl PageServerConf { "remote_storage" => { builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?)) } - "tenant_conf" => { + "tenant_config" => { t_conf = Self::parse_toml_tenant_conf(item)?; } "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)), diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 679daa8248..d9e1244f2e 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -690,7 +690,7 @@ impl LayeredRepository { let mut tenant_conf: TenantConfOpt = Default::default(); for (key, item) in toml.iter() { match key { - "tenant_conf" => { + "tenant_config" => { tenant_conf = PageServerConf::parse_toml_tenant_conf(item)?; } _ => bail!("unrecognized pageserver option '{}'", key), @@ -712,7 +712,7 @@ impl LayeredRepository { let mut conf_content = r#"# This file contains a specific per-tenant's config. # It is read in case of pageserver restart. -# [tenant_config] +[tenant_config] "# .to_string(); diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 818b6de1b1..a175f6abbe 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -47,6 +47,7 @@ pub struct TenantConf { // This parameter determines L1 layer file size. pub compaction_target_size: u64, // How often to check if there's compaction work to be done. + #[serde(with = "humantime_serde")] pub compaction_period: Duration, // Level0 delta layer threshold for compaction. pub compaction_threshold: usize, @@ -56,11 +57,13 @@ pub struct TenantConf { // Page versions older than this are garbage collected away. pub gc_horizon: u64, // Interval at which garbage collection is triggered. + #[serde(with = "humantime_serde")] pub gc_period: Duration, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. 
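An aside on the `#[serde(with = "humantime_serde")]` attributes being added in this tenant_config.rs hunk: they let the duration fields be written as human-readable strings (the tests below use values such as '30sec' and '111 s') rather than raw integer structs. A minimal standalone sketch of the mechanism, assuming the `serde` (with derive), `toml`, and `humantime-serde` crates as dependencies; the struct and field names here are illustrative stand-ins, not the pageserver's own types:

```rust
use std::time::Duration;
use serde::Deserialize;

// Illustrative stand-in for a config struct; not TenantConf/TenantConfOpt.
#[derive(Deserialize, Debug)]
struct DurationsDemo {
    // humantime_serde also handles Option<Duration>, which is what the
    // patch relies on for the optional TenantConfOpt fields.
    #[serde(with = "humantime_serde")]
    gc_period: Option<Duration>,
}

fn main() {
    let parsed: DurationsDemo =
        toml::from_str("gc_period = '100s'").expect("valid humantime duration");
    assert_eq!(parsed.gc_period, Some(Duration::from_secs(100)));
    println!("{parsed:?}");
}
```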
// The unit is time. // Page versions older than this are garbage collected away. + #[serde(with = "humantime_serde")] pub pitr_interval: Duration, } @@ -70,10 +73,13 @@ pub struct TenantConf { pub struct TenantConfOpt { pub checkpoint_distance: Option, pub compaction_target_size: Option, + #[serde(with = "humantime_serde")] pub compaction_period: Option, pub compaction_threshold: Option, pub gc_horizon: Option, + #[serde(with = "humantime_serde")] pub gc_period: Option, + #[serde(with = "humantime_serde")] pub pitr_interval: Option, } diff --git a/test_runner/batch_others/test_tenant_conf.py b/test_runner/batch_others/test_tenant_conf.py index f74e6aad1d..64359a1dc3 100644 --- a/test_runner/batch_others/test_tenant_conf.py +++ b/test_runner/batch_others/test_tenant_conf.py @@ -3,21 +3,22 @@ from contextlib import closing import pytest from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.log_helper import log def test_tenant_config(zenith_env_builder: ZenithEnvBuilder): + # set some non-default global config + zenith_env_builder.pageserver_config_override = ''' +page_cache_size=444; +wait_lsn_timeout='111 s'; +tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' + env = zenith_env_builder.init_start() """Test per tenant configuration""" - tenant = env.zenith_cli.create_tenant( - conf={ - 'checkpoint_distance': '10000', - 'compaction_target_size': '1048576', - 'compaction_period': '60sec', - 'compaction_threshold': '20', - 'gc_horizon': '1024', - 'gc_period': '100sec', - 'pitr_interval': '3600sec', - }) + tenant = env.zenith_cli.create_tenant(conf={ + 'checkpoint_distance': '20000', + 'gc_period': '30sec', + }) env.zenith_cli.create_timeline(f'test_tenant_conf', tenant_id=tenant) pg = env.postgres.create_start( @@ -26,24 +27,44 @@ def test_tenant_config(zenith_env_builder: ZenithEnvBuilder): tenant, ) + # check the configuration of the default tenant + # it should match global configuration + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor() as pscur: + pscur.execute(f"show {env.initial_tenant.hex}") + res = pscur.fetchone() + log.info(f"initial_tenant res: {res}") + assert res == (10000, 1048576, 1, 10, 67108864, 100, 2592000) + + # check the configuration of the new tenant with closing(env.pageserver.connect()) as psconn: with psconn.cursor() as pscur: pscur.execute(f"show {tenant.hex}") - assert pscur.fetchone() == (10000, 1048576, 60, 20, 1024, 100, 3600) + res = pscur.fetchone() + log.info(f"res: {res}") + assert res == (20000, 1048576, 1, 10, 67108864, 30, 2592000) # update the config and ensure that it has changed env.zenith_cli.config_tenant(tenant_id=tenant, conf={ - 'checkpoint_distance': '100000', - 'compaction_target_size': '1048576', - 'compaction_period': '30sec', - 'compaction_threshold': '15', - 'gc_horizon': '256', - 'gc_period': '10sec', - 'pitr_interval': '360sec', + 'checkpoint_distance': '15000', + 'gc_period': '80sec', }) with closing(env.pageserver.connect()) as psconn: with psconn.cursor() as pscur: pscur.execute(f"show {tenant.hex}") - assert pscur.fetchone() == (100000, 1048576, 30, 15, 256, 10, 360) + res = pscur.fetchone() + log.info(f"after config res: {res}") + assert res == (15000, 1048576, 1, 10, 67108864, 80, 2592000) + + # restart the pageserver and ensure that the config is still correct + env.pageserver.stop() + env.pageserver.start() + + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor() as pscur: + pscur.execute(f"show {tenant.hex}") + res = pscur.fetchone() + 
log.info(f"after restart res: {res}") + assert res == (15000, 1048576, 1, 10, 67108864, 80, 2592000) From 4a46b01caf1ad039c3a0f06f68dae54fe95b7b2c Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 27 Apr 2022 11:16:44 +0300 Subject: [PATCH 0203/1022] Properly populate local timeline map --- pageserver/src/bin/pageserver.rs | 51 +---- pageserver/src/http/routes.rs | 6 +- pageserver/src/layered_repository.rs | 2 +- pageserver/src/page_service.rs | 10 +- pageserver/src/tenant_mgr.rs | 299 ++++++++++++++++----------- pageserver/src/timelines.rs | 18 +- pageserver/src/walreceiver.rs | 2 +- 7 files changed, 207 insertions(+), 181 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5c135e4eb4..728dcb53de 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -10,10 +10,7 @@ use daemonize::Daemonize; use pageserver::{ config::{defaults::*, PageServerConf}, - http, page_cache, page_service, profiling, - remote_storage::{self, SyncStartupData}, - repository::{Repository, TimelineSyncStatusUpdate}, - tenant_mgr, thread_mgr, + http, page_cache, page_service, profiling, tenant_mgr, thread_mgr, thread_mgr::ThreadKind, timelines, virtual_file, LOG_FILE_NAME, }; @@ -235,47 +232,8 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() let signals = signals::install_shutdown_handlers()?; - // Initialize repositories with locally available timelines. - // Timelines that are only partially available locally (remote storage has more data than this pageserver) - // are scheduled for download and added to the repository once download is completed. - let SyncStartupData { - remote_index, - local_timeline_init_statuses, - } = remote_storage::start_local_timeline_sync(conf) - .context("Failed to set up local files sync with external storage")?; - - for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses { - // initialize local tenant - let repo = tenant_mgr::load_local_repo(conf, tenant_id, &remote_index) - .with_context(|| format!("Failed to load repo for tenant {}", tenant_id))?; - for (timeline_id, init_status) in local_timeline_init_statuses { - match init_status { - remote_storage::LocalTimelineInitStatus::LocallyComplete => { - debug!("timeline {} for tenant {} is locally complete, registering it in repository", timeline_id, tenant_id); - // Lets fail here loudly to be on the safe side. - // XXX: It may be a better api to actually distinguish between repository startup - // and processing of newly downloaded timelines. - repo.apply_timeline_remote_sync_status_update( - timeline_id, - TimelineSyncStatusUpdate::Downloaded, - ) - .with_context(|| { - format!( - "Failed to bootstrap timeline {} for tenant {}", - timeline_id, tenant_id - ) - })? 
- } - remote_storage::LocalTimelineInitStatus::NeedsSync => { - debug!( - "timeline {} for tenant {} needs sync, \ - so skipped for adding into repository until sync is finished", - tenant_id, timeline_id - ); - } - } - } - } + // start profiler (if enabled) + let profiler_guard = profiling::init_profiler(conf); // initialize authentication for incoming connections let auth = match &conf.auth_type { @@ -288,8 +246,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() }; info!("Using auth: {:#?}", conf.auth_type); - // start profiler (if enabled) - let profiler_guard = profiling::init_profiler(conf); + let remote_index = tenant_mgr::init_tenant_mgr(conf)?; // Spawn a new thread for the http endpoint // bind before launching separate thread so the error reported before startup exits diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 05485ef3b6..f1b482cf50 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -244,7 +244,7 @@ async fn timeline_attach_handler(request: Request) -> Result) -> Result, A crate::tenant_mgr::list_tenants() }) .await - .map_err(ApiError::from_err)??; + .map_err(ApiError::from_err)?; json_response(StatusCode::OK, response_data) } @@ -377,7 +377,7 @@ async fn tenant_create_handler(mut request: Request) -> Result> = Mutex::new(HashMap::new()); +mod tenants_state { + use std::{ + collections::HashMap, + sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}, + }; + + use utils::zid::ZTenantId; + + use crate::tenant_mgr::Tenant; + + lazy_static::lazy_static! { + static ref TENANTS: RwLock> = RwLock::new(HashMap::new()); + } + + pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap> { + TENANTS + .read() + .expect("Failed to read() tenants lock, it got poisoned") + } + + pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap> { + TENANTS + .write() + .expect("Failed to write() tenants lock, it got poisoned") + } } struct Tenant { state: TenantState, + /// Contains in-memory state, including the timeline that might not yet flushed on disk or loaded form disk. repo: Arc, - - timelines: HashMap>, + /// Timelines, located locally in the pageserver's datadir. + /// Whatever manipulations happen, local timelines are not removed, only incremented with files. + /// + /// Local timelines have more metadata that's loaded into memory, + /// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`]. + local_timelines: HashMap>, } #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -60,43 +88,17 @@ impl fmt::Display for TenantState { } } -fn access_tenants() -> MutexGuard<'static, HashMap> { - TENANTS.lock().unwrap() -} - -// Sets up wal redo manager and repository for tenant. Reduces code duplication. -// Used during pageserver startup, or when new tenant is attached to pageserver. -pub fn load_local_repo( - conf: &'static PageServerConf, - tenant_id: ZTenantId, - remote_index: &RemoteIndex, -) -> Result> { - let mut m = access_tenants(); - let tenant = m.entry(tenant_id).or_insert_with(|| { - // Set up a WAL redo manager, for applying WAL records. - let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); - - // Set up an object repository, for actual data storage. 
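A short aside on the `tenants_state` module introduced above: the global map moves from a `Mutex` to an `RwLock`, and callers now go through `read_tenants()`/`write_tenants()` helpers, so concurrent readers no longer serialize on each other and lock poisoning fails loudly in one place instead of being unwrapped at every call site. A standalone sketch of the same pattern over a toy map; the names `ITEMS`, `read_items`, and `write_items` are made up for illustration, and only the `lazy_static` crate is assumed:

```rust
use std::collections::HashMap;
use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};

// A module-private RwLock exposed only through accessor helpers that turn
// lock poisoning into an immediate panic with a clear message.
lazy_static::lazy_static! {
    static ref ITEMS: RwLock<HashMap<u32, String>> = RwLock::new(HashMap::new());
}

fn read_items() -> RwLockReadGuard<'static, HashMap<u32, String>> {
    ITEMS.read().expect("items lock poisoned")
}

fn write_items() -> RwLockWriteGuard<'static, HashMap<u32, String>> {
    ITEMS.write().expect("items lock poisoned")
}

fn main() {
    write_items().insert(1, "one".to_string());
    // Many readers can hold the read guard at the same time; a writer
    // waits until all readers have released it.
    println!("{:?}", read_items().get(&1));
}
```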
- let repo: Arc = Arc::new(LayeredRepository::new( - conf, - Default::default(), - Arc::new(walredo_mgr), - tenant_id, - remote_index.clone(), - conf.remote_storage_config.is_some(), - )); - Tenant { - state: TenantState::Idle, - repo, - timelines: HashMap::new(), - } - }); - - // Restore tenant config - let tenant_conf = LayeredRepository::load_tenant_config(conf, tenant_id)?; - tenant.repo.update_tenant_config(tenant_conf)?; - - Ok(Arc::clone(&tenant.repo)) +/// Initialize repositories with locally available timelines. +/// Timelines that are only partially available locally (remote storage has more data than this pageserver) +/// are scheduled for download and added to the repository once download is completed. +pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result { + let SyncStartupData { + remote_index, + local_timeline_init_statuses, + } = remote_storage::start_local_timeline_sync(conf) + .context("Failed to set up local files sync with external storage")?; + init_local_repositories(conf, local_timeline_init_statuses, &remote_index)?; + Ok(remote_index) } /// Updates tenants' repositories, changing their timelines state in memory. @@ -113,32 +115,28 @@ pub fn apply_timeline_sync_status_updates( "Applying sync status updates for {} timelines", sync_status_updates.len() ); - trace!("Sync status updates: {:?}", sync_status_updates); + debug!("Sync status updates: {sync_status_updates:?}"); - for (tenant_id, tenant_timelines_sync_status_updates) in sync_status_updates { + for (tenant_id, status_updates) in sync_status_updates { let repo = match load_local_repo(conf, tenant_id, remote_index) { Ok(repo) => repo, Err(e) => { - error!( - "Failed to load repo for tenant {} Error: {:#}", - tenant_id, e - ); + error!("Failed to load repo for tenant {tenant_id} Error: {e:?}",); continue; } }; - for (timeline_id, timeline_sync_status_update) in tenant_timelines_sync_status_updates { - match repo.apply_timeline_remote_sync_status_update(timeline_id, timeline_sync_status_update) + for (timeline_id, status_update) in status_updates { + match repo.apply_timeline_remote_sync_status_update(timeline_id, status_update) { - Ok(_) => debug!( - "successfully applied timeline sync status update: {} -> {}", - timeline_id, timeline_sync_status_update - ), + Ok(()) => debug!("successfully applied timeline sync status update: {timeline_id} -> {status_update}"), Err(e) => error!( - "Failed to apply timeline sync status update for tenant {}. timeline {} update {} Error: {:#}", - tenant_id, timeline_id, timeline_sync_status_update, e + "Failed to apply timeline sync status update for tenant {tenant_id}. timeline {timeline_id} update {status_update} Error: {e:?}" ), } + match status_update { + TimelineSyncStatusUpdate::Downloaded => todo!("TODO kb "), + } } } } @@ -147,7 +145,7 @@ pub fn apply_timeline_sync_status_updates( /// Shut down all tenants. This runs as part of pageserver shutdown. /// pub fn shutdown_all_tenants() { - let mut m = access_tenants(); + let mut m = tenants_state::write_tenants(); let mut tenantids = Vec::new(); for (tenantid, tenant) in m.iter_mut() { tenant.state = TenantState::Stopping; @@ -167,22 +165,16 @@ pub fn shutdown_all_tenants() { // should be no more activity in any of the repositories. // // On error, log it but continue with the shutdown for other tenants. 
- for tenantid in tenantids { - debug!("shutdown tenant {}", tenantid); - match get_repository_for_tenant(tenantid) { + for tenant_id in tenantids { + debug!("shutdown tenant {tenant_id}"); + match get_repository_for_tenant(tenant_id) { Ok(repo) => { if let Err(err) = repo.checkpoint() { - error!( - "Could not checkpoint tenant {} during shutdown: {:?}", - tenantid, err - ); + error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); } } Err(err) => { - error!( - "Could not get repository for tenant {} during shutdown: {:?}", - tenantid, err - ); + error!("Could not get repository for tenant {tenant_id} during shutdown: {err:?}"); } } } @@ -191,20 +183,20 @@ pub fn shutdown_all_tenants() { pub fn create_tenant_repository( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, - tenantid: ZTenantId, + tenant_id: ZTenantId, remote_index: RemoteIndex, -) -> Result> { - match access_tenants().entry(tenantid) { +) -> anyhow::Result> { + match tenants_state::write_tenants().entry(tenant_id) { Entry::Occupied(_) => { - debug!("tenant {} already exists", tenantid); + debug!("tenant {tenant_id} already exists"); Ok(None) } Entry::Vacant(v) => { - let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid)); + let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); let repo = timelines::create_repo( conf, tenant_conf, - tenantid, + tenant_id, CreateRepo::Real { wal_redo_manager, remote_index, @@ -213,36 +205,39 @@ pub fn create_tenant_repository( v.insert(Tenant { state: TenantState::Idle, repo, - timelines: HashMap::new(), + local_timelines: HashMap::new(), }); - Ok(Some(tenantid)) + Ok(Some(tenant_id)) } } } -pub fn update_tenant_config(tenant_conf: TenantConfOpt, tenantid: ZTenantId) -> Result<()> { - info!("configuring tenant {}", tenantid); - let repo = get_repository_for_tenant(tenantid)?; +pub fn update_tenant_config( + tenant_conf: TenantConfOpt, + tenant_id: ZTenantId, +) -> anyhow::Result<()> { + info!("configuring tenant {tenant_id}"); + let repo = get_repository_for_tenant(tenant_id)?; repo.update_tenant_config(tenant_conf)?; Ok(()) } pub fn get_tenant_state(tenantid: ZTenantId) -> Option { - Some(access_tenants().get(&tenantid)?.state) + Some(tenants_state::read_tenants().get(&tenantid)?.state) } /// /// Change the state of a tenant to Active and launch its compactor and GC /// threads. If the tenant was already in Active state or Stopping, does nothing. /// -pub fn activate_tenant(tenant_id: ZTenantId) -> Result<()> { - let mut m = access_tenants(); +pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> { + let mut m = tenants_state::write_tenants(); let tenant = m .get_mut(&tenant_id) - .with_context(|| format!("Tenant not found for id {}", tenant_id))?; + .with_context(|| format!("Tenant not found for id {tenant_id}"))?; - info!("activating tenant {}", tenant_id); + info!("activating tenant {tenant_id}"); match tenant.state { // If the tenant is already active, nothing to do. 
@@ -267,13 +262,10 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> Result<()> { true, move || crate::tenant_threads::gc_loop(tenant_id), ) - .with_context(|| format!("Failed to launch GC thread for tenant {}", tenant_id)); + .with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}")); if let Err(e) = &gc_spawn_result { - error!( - "Failed to start GC thread for tenant {}, stopping its checkpointer thread: {:?}", - tenant_id, e - ); + error!("Failed to start GC thread for tenant {tenant_id}, stopping its checkpointer thread: {e:?}"); thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None); return gc_spawn_result; } @@ -287,39 +279,42 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> Result<()> { Ok(()) } -pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result> { - let m = access_tenants(); +pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result> { + let m = tenants_state::read_tenants(); let tenant = m - .get(&tenantid) - .with_context(|| format!("Tenant {} not found", tenantid))?; + .get(&tenant_id) + .with_context(|| format!("Tenant {tenant_id} not found"))?; Ok(Arc::clone(&tenant.repo)) } -// Retrieve timeline for tenant. Load it into memory if it is not already loaded -pub fn get_timeline_for_tenant_load( - tenantid: ZTenantId, - timelineid: ZTimelineId, -) -> Result> { - let mut m = access_tenants(); +/// Retrieves local timeline for tenant. +/// Loads it into memory if it is not already loaded. +pub fn get_local_timeline_with_load( + tenant_id: ZTenantId, + timeline_id: ZTimelineId, +) -> anyhow::Result> { + let mut m = tenants_state::write_tenants(); let tenant = m - .get_mut(&tenantid) - .with_context(|| format!("Tenant {} not found", tenantid))?; + .get_mut(&tenant_id) + .with_context(|| format!("Tenant {tenant_id} not found"))?; - if let Some(page_tline) = tenant.timelines.get(&timelineid) { + if let Some(page_tline) = tenant.local_timelines.get(&timeline_id) { return Ok(Arc::clone(page_tline)); } // First access to this timeline. 
Create a DatadirTimeline wrapper for it let tline = tenant .repo - .get_timeline_load(timelineid) - .with_context(|| format!("Timeline {} not found for tenant {}", timelineid, tenantid))?; + .get_timeline_load(timeline_id) + .with_context(|| format!("Timeline {timeline_id} not found for tenant {tenant_id}"))?; let repartition_distance = tenant.repo.get_checkpoint_distance() / 10; let page_tline = Arc::new(DatadirTimelineImpl::new(tline, repartition_distance)); page_tline.init_logical_size()?; - tenant.timelines.insert(timelineid, Arc::clone(&page_tline)); + tenant + .local_timelines + .insert(timeline_id, Arc::clone(&page_tline)); Ok(page_tline) } @@ -331,15 +326,87 @@ pub struct TenantInfo { pub state: TenantState, } -pub fn list_tenants() -> Result> { - access_tenants() +pub fn list_tenants() -> Vec { + tenants_state::read_tenants() .iter() - .map(|v| { - let (id, tenant) = v; - Ok(TenantInfo { - id: *id, - state: tenant.state, - }) + .map(|(id, tenant)| TenantInfo { + id: *id, + state: tenant.state, }) .collect() } + +fn init_local_repositories( + conf: &'static PageServerConf, + local_timeline_init_statuses: HashMap>, + remote_index: &RemoteIndex, +) -> anyhow::Result<(), anyhow::Error> { + for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses { + // initialize local tenant + let repo = load_local_repo(conf, tenant_id, remote_index) + .with_context(|| format!("Failed to load repo for tenant {}", tenant_id))?; + for (timeline_id, init_status) in local_timeline_init_statuses { + match init_status { + LocalTimelineInitStatus::LocallyComplete => { + debug!("timeline {} for tenant {} is locally complete, registering it in repository", timeline_id, tenant_id); + // Lets fail here loudly to be on the safe side. + // XXX: It may be a better api to actually distinguish between repository startup + // and processing of newly downloaded timelines. + repo.apply_timeline_remote_sync_status_update( + timeline_id, + TimelineSyncStatusUpdate::Downloaded, + ) + .with_context(|| { + format!( + "Failed to bootstrap timeline {} for tenant {}", + timeline_id, tenant_id + ) + })? + } + LocalTimelineInitStatus::NeedsSync => { + debug!( + "timeline {} for tenant {} needs sync, \ + so skipped for adding into repository until sync is finished", + tenant_id, timeline_id + ); + } + } + } + } + Ok(()) +} + +// Sets up wal redo manager and repository for tenant. Reduces code duplication. +// Used during pageserver startup, or when new tenant is attached to pageserver. +fn load_local_repo( + conf: &'static PageServerConf, + tenant_id: ZTenantId, + remote_index: &RemoteIndex, +) -> anyhow::Result> { + let mut m = tenants_state::write_tenants(); + let tenant = m.entry(tenant_id).or_insert_with(|| { + // Set up a WAL redo manager, for applying WAL records. + let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); + + // Set up an object repository, for actual data storage. 
+ let repo: Arc = Arc::new(LayeredRepository::new( + conf, + TenantConfOpt::default(), + Arc::new(walredo_mgr), + tenant_id, + remote_index.clone(), + conf.remote_storage_config.is_some(), + )); + Tenant { + state: TenantState::Idle, + repo, + local_timelines: HashMap::new(), + } + }); + + // Restore tenant config + let tenant_conf = LayeredRepository::load_tenant_config(conf, tenant_id)?; + tenant.repo.update_tenant_config(tenant_conf)?; + + Ok(Arc::clone(&tenant.repo)) +} diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index adc531e6bb..acc92bb4a2 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -2,7 +2,7 @@ //! Timeline management code // -use anyhow::{bail, Context, Result}; +use anyhow::{bail, ensure, Context, Result}; use postgres_ffi::ControlFileData; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; @@ -106,7 +106,7 @@ impl LocalTimelineInfo { match repo_timeline { RepositoryTimeline::Loaded(_) => { let datadir_tline = - tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id)?; + tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id)?; Self::from_loaded_timeline(&datadir_tline, include_non_incremental_logical_size) } RepositoryTimeline::Unloaded { metadata } => Ok(Self::from_unloaded_timeline(metadata)), @@ -152,7 +152,7 @@ pub fn init_pageserver( if let Some(tenant_id) = create_tenant { println!("initializing tenantid {}", tenant_id); - let repo = create_repo(conf, Default::default(), tenant_id, CreateRepo::Dummy) + let repo = create_repo(conf, TenantConfOpt::default(), tenant_id, CreateRepo::Dummy) .context("failed to create repo")?; let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref()) @@ -203,9 +203,11 @@ pub fn create_repo( }; let repo_dir = conf.tenant_path(&tenant_id); - if repo_dir.exists() { - bail!("tenant {} directory already exists", tenant_id); - } + ensure!( + repo_dir.exists(), + "cannot create new tenant repo: '{}' directory already exists", + tenant_id + ); // top-level dir may exist if we are creating it through CLI crashsafe_dir::create_dir_all(&repo_dir) @@ -383,7 +385,7 @@ pub(crate) fn create_timeline( repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?; // load the timeline into memory let loaded_timeline = - tenant_mgr::get_timeline_for_tenant_load(tenant_id, new_timeline_id)?; + tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; LocalTimelineInfo::from_loaded_timeline(&loaded_timeline, false) .context("cannot fill timeline info")? } @@ -391,7 +393,7 @@ pub(crate) fn create_timeline( bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?; // load the timeline into memory let new_timeline = - tenant_mgr::get_timeline_for_tenant_load(tenant_id, new_timeline_id)?; + tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; LocalTimelineInfo::from_loaded_timeline(&new_timeline, false) .context("cannot fill timeline info")? 
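A note on the `ensure!` conversion earlier in this timelines.rs diff: `anyhow::ensure!(cond, ...)` returns the error when the condition is false, so translating `if repo_dir.exists() { bail!(..) }` requires negating the check. As committed here the guard only passes when the directory already exists, and a later patch in this series flips it to `!repo_dir.exists()`. A minimal standalone sketch of the two equivalent guards, assuming only the `anyhow` crate; `dir_exists` is an illustrative stand-in for the real filesystem check:

```rust
use anyhow::{bail, ensure};

// Both guards error out when the directory already exists.
fn guard_with_bail(dir_exists: bool) -> anyhow::Result<()> {
    if dir_exists {
        bail!("directory already exists");
    }
    Ok(())
}

fn guard_with_ensure(dir_exists: bool) -> anyhow::Result<()> {
    // ensure! asserts the condition we want to HOLD, erroring when it is false,
    // hence the negation compared to the bail! form above.
    ensure!(!dir_exists, "directory already exists");
    Ok(())
}

fn main() {
    assert!(guard_with_bail(false).is_ok());
    assert!(guard_with_ensure(false).is_ok());
    assert!(guard_with_bail(true).is_err());
    assert!(guard_with_ensure(true).is_err());
}
```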
} diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 357aab7221..b7a33364c9 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -184,7 +184,7 @@ fn walreceiver_main( let repo = tenant_mgr::get_repository_for_tenant(tenant_id) .with_context(|| format!("no repository found for tenant {}", tenant_id))?; let timeline = - tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id).with_context(|| { + tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).with_context(|| { format!( "local timeline {} not found for tenant {}", timeline_id, tenant_id From 6cca57f95a6aced70c1c932a580edaf621177b8b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 27 Apr 2022 15:55:59 +0300 Subject: [PATCH 0204/1022] Properly remove from the local timeline map --- pageserver/src/http/routes.rs | 3 +- pageserver/src/layered_repository.rs | 55 +++++------ pageserver/src/repository.rs | 2 +- pageserver/src/tenant_mgr.rs | 136 +++++++++++++++++++-------- pageserver/src/timelines.rs | 2 +- 5 files changed, 123 insertions(+), 75 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f1b482cf50..295a1e9f02 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -347,8 +347,7 @@ async fn timeline_detach_handler(request: Request) -> Result>, - tenantid: ZTenantId, + tenant_id: ZTenantId, timelines: Mutex>, // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding @@ -223,10 +223,10 @@ impl Repository for LayeredRepository { let mut timelines = self.timelines.lock().unwrap(); // Create the timeline directory, and write initial metadata to file. - crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?; + crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenant_id))?; let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?; + Self::save_metadata(self.conf, timelineid, self.tenant_id, &metadata, true)?; let timeline = LayeredTimeline::new( self.conf, @@ -234,7 +234,7 @@ impl Repository for LayeredRepository { metadata, None, timelineid, - self.tenantid, + self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, ); @@ -283,7 +283,7 @@ impl Repository for LayeredRepository { }; // create a new timeline directory - let timelinedir = self.conf.timeline_path(&dst, &self.tenantid); + let timelinedir = self.conf.timeline_path(&dst, &self.tenant_id); crashsafe_dir::create_dir(&timelinedir)?; @@ -298,8 +298,8 @@ impl Repository for LayeredRepository { *src_timeline.latest_gc_cutoff_lsn.read().unwrap(), src_timeline.initdb_lsn, ); - crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?; - Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?; + crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; + Self::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; timelines.insert(dst, LayeredTimelineEntry::Unloaded { id: dst, metadata }); info!("branched timeline {} from {} at {}", dst, src, start_lsn); @@ -322,7 +322,7 @@ impl Repository for LayeredRepository { .unwrap_or_else(|| "-".to_string()); STORAGE_TIME - .with_label_values(&["gc", &self.tenantid.to_string(), &timeline_str]) + .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) 
.observe_closure_duration(|| { self.gc_iteration_internal(target_timelineid, horizon, pitr, checkpoint_before_gc) }) @@ -342,7 +342,7 @@ impl Repository for LayeredRepository { for (timelineid, timeline) in &timelines_to_compact { let _entered = - info_span!("compact", timeline = %timelineid, tenant = %self.tenantid).entered(); + info_span!("compact", timeline = %timelineid, tenant = %self.tenant_id).entered(); match timeline { LayeredTimelineEntry::Loaded(timeline) => { timeline.compact()?; @@ -383,27 +383,16 @@ impl Repository for LayeredRepository { for (timelineid, timeline) in &timelines_to_compact { let _entered = - info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid).entered(); + info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenant_id) + .entered(); timeline.checkpoint(CheckpointConfig::Flush)?; } Ok(()) } - // Detaches the timeline from the repository. - fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> { - let mut timelines = self.timelines.lock().unwrap(); - if timelines.remove(&timeline_id).is_none() { - bail!("cannot detach timeline that is not available locally"); - } - - // Release the lock to shutdown and remove the files without holding it - drop(timelines); - // shutdown the timeline (this shuts down the walreceiver) - thread_mgr::shutdown_threads(None, Some(self.tenantid), Some(timeline_id)); - - // remove timeline files (maybe avoid this for ease of debugging if something goes wrong) - fs::remove_dir_all(self.conf.timeline_path(&timeline_id, &self.tenantid))?; + fn detach_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> { + self.timelines.lock().unwrap().remove(&timeline_id); Ok(()) } @@ -422,7 +411,7 @@ impl Repository for LayeredRepository { Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. 
This is a bug."), Entry::Vacant(entry) => { // we need to get metadata of a timeline, another option is to pass it along with Downloaded status - let metadata = Self::load_metadata(self.conf, timeline_id, self.tenantid).context("failed to load local metadata")?; + let metadata = Self::load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; // finally we make newly downloaded timeline visible to repository entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, }) }, @@ -547,7 +536,7 @@ impl LayeredRepository { tenant_conf.update(&new_tenant_conf); - LayeredRepository::persist_tenant_config(self.conf, self.tenantid, *tenant_conf)?; + LayeredRepository::persist_tenant_config(self.conf, self.tenant_id, *tenant_conf)?; Ok(()) } @@ -605,7 +594,7 @@ impl LayeredRepository { timelineid: ZTimelineId, timelines: &mut HashMap, ) -> anyhow::Result> { - let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid) + let metadata = Self::load_metadata(self.conf, timelineid, self.tenant_id) .context("failed to load metadata")?; let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -631,7 +620,7 @@ impl LayeredRepository { metadata, ancestor, timelineid, - self.tenantid, + self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, ); @@ -646,12 +635,12 @@ impl LayeredRepository { conf: &'static PageServerConf, tenant_conf: TenantConfOpt, walredo_mgr: Arc, - tenantid: ZTenantId, + tenant_id: ZTenantId, remote_index: RemoteIndex, upload_layers: bool, ) -> LayeredRepository { LayeredRepository { - tenantid, + tenant_id, conf, tenant_conf: Arc::new(RwLock::new(tenant_conf)), timelines: Mutex::new(HashMap::new()), @@ -806,7 +795,7 @@ impl LayeredRepository { checkpoint_before_gc: bool, ) -> Result { let _span_guard = - info_span!("gc iteration", tenant = %self.tenantid, timeline = ?target_timelineid) + info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timelineid) .entered(); let mut totals: GcResult = Default::default(); let now = Instant::now(); @@ -890,6 +879,10 @@ impl LayeredRepository { totals.elapsed = now.elapsed(); Ok(totals) } + + pub fn tenant_id(&self) -> ZTenantId { + self.tenant_id + } } pub struct LayeredTimeline { diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index f7c2f036a6..6c75f035ca 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -259,7 +259,7 @@ pub trait Repository: Send + Sync { /// api's 'compact' command. fn compaction_iteration(&self) -> Result<()>; - /// detaches locally available timeline by stopping all threads and removing all the data. + /// detaches timeline-related in-memory data. fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>; // Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn. 
diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 36a4b989b7..ace6938e6d 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,6 +3,7 @@ use crate::config::PageServerConf; use crate::layered_repository::LayeredRepository; +use crate::pgdatadir_mapping::DatadirTimeline; use crate::remote_storage::{self, LocalTimelineInitStatus, RemoteIndex, SyncStartupData}; use crate::repository::{Repository, TimelineSyncStatusUpdate}; use crate::tenant_config::TenantConfOpt; @@ -12,7 +13,7 @@ use crate::timelines; use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; use crate::{DatadirTimelineImpl, RepositoryImpl}; -use anyhow::Context; +use anyhow::{bail, Context}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use std::collections::hash_map::Entry; @@ -125,18 +126,11 @@ pub fn apply_timeline_sync_status_updates( continue; } }; - - for (timeline_id, status_update) in status_updates { - match repo.apply_timeline_remote_sync_status_update(timeline_id, status_update) - { - Ok(()) => debug!("successfully applied timeline sync status update: {timeline_id} -> {status_update}"), - Err(e) => error!( - "Failed to apply timeline sync status update for tenant {tenant_id}. timeline {timeline_id} update {status_update} Error: {e:?}" - ), - } - match status_update { - TimelineSyncStatusUpdate::Downloaded => todo!("TODO kb "), - } + match register_new_timelines(&repo, status_updates) { + Ok(()) => info!("successfully applied tenant {tenant_id} sync status updates"), + Err(e) => error!( + "Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}" + ), } } } @@ -302,22 +296,49 @@ pub fn get_local_timeline_with_load( if let Some(page_tline) = tenant.local_timelines.get(&timeline_id) { return Ok(Arc::clone(page_tline)); } - // First access to this timeline. 
Create a DatadirTimeline wrapper for it - let tline = tenant - .repo - .get_timeline_load(timeline_id) - .with_context(|| format!("Timeline {timeline_id} not found for tenant {tenant_id}"))?; - let repartition_distance = tenant.repo.get_checkpoint_distance() / 10; - - let page_tline = Arc::new(DatadirTimelineImpl::new(tline, repartition_distance)); - page_tline.init_logical_size()?; + let page_tline = new_local_timeline(&tenant.repo, timeline_id) + .with_context(|| format!("Failed to create new local timeline for tenant {tenant_id}"))?; tenant .local_timelines .insert(timeline_id, Arc::clone(&page_tline)); Ok(page_tline) } +pub fn detach_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { + // shutdown the timeline threads (this shuts down the walreceiver) + thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id)); + + match tenants_state::write_tenants().get_mut(&tenant_id) { + Some(tenant) => { + tenant + .repo + .detach_timeline(timeline_id) + .context("Failed to detach inmem tenant timeline")?; + tenant.local_timelines.remove(&timeline_id); + } + None => bail!("Tenant {tenant_id} not found in local tenant state"), + } + + Ok(()) +} + +fn new_local_timeline( + repo: &RepositoryImpl, + timeline_id: ZTimelineId, +) -> anyhow::Result>> { + let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| { + format!("Inmem timeline {timeline_id} not found in tenant's repository") + })?; + let repartition_distance = repo.get_checkpoint_distance() / 10; + let page_tline = Arc::new(DatadirTimelineImpl::new( + inmem_timeline, + repartition_distance, + )); + page_tline.init_logical_size()?; + Ok(page_tline) +} + #[serde_as] #[derive(Serialize, Deserialize, Clone)] pub struct TenantInfo { @@ -344,38 +365,73 @@ fn init_local_repositories( for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses { // initialize local tenant let repo = load_local_repo(conf, tenant_id, remote_index) - .with_context(|| format!("Failed to load repo for tenant {}", tenant_id))?; + .with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?; + + let mut status_updates = HashMap::with_capacity(local_timeline_init_statuses.len()); for (timeline_id, init_status) in local_timeline_init_statuses { match init_status { LocalTimelineInitStatus::LocallyComplete => { - debug!("timeline {} for tenant {} is locally complete, registering it in repository", timeline_id, tenant_id); - // Lets fail here loudly to be on the safe side. - // XXX: It may be a better api to actually distinguish between repository startup - // and processing of newly downloaded timelines. - repo.apply_timeline_remote_sync_status_update( - timeline_id, - TimelineSyncStatusUpdate::Downloaded, - ) - .with_context(|| { - format!( - "Failed to bootstrap timeline {} for tenant {}", - timeline_id, tenant_id - ) - })? + debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository"); + status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded); } LocalTimelineInitStatus::NeedsSync => { debug!( - "timeline {} for tenant {} needs sync, \ - so skipped for adding into repository until sync is finished", - tenant_id, timeline_id + "timeline {tenant_id} for tenant {timeline_id} needs sync, \ + so skipped for adding into repository until sync is finished" ); } } } + + // Lets fail here loudly to be on the safe side. 
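A small aside on the log-message rewrites in this patch: moving to Rust's captured identifiers in format strings removes the positional arguments that are easy to swap, and it also makes existing swaps visible; the NeedsSync message just above still reads "timeline {tenant_id} for tenant {timeline_id}", inherited from the positional version it replaces. A tiny illustration of the feature (the id values below are made up for the example):

```rust
fn main() {
    // Hypothetical ids, for illustration only.
    let tenant_id = "a1b2c3";
    let timeline_id = "d4e5f6";
    // The variable is named inside the braces, so there is nothing
    // positional left to accidentally reorder.
    println!("timeline {timeline_id} for tenant {tenant_id} needs sync");
}
```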
+ // XXX: It may be a better api to actually distinguish between repository startup + // and processing of newly downloaded timelines. + register_new_timelines(&repo, status_updates) + .with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))? } Ok(()) } +fn register_new_timelines( + repo: &LayeredRepository, + status_updates: HashMap, +) -> anyhow::Result<()> { + let mut registration_queue = Vec::with_capacity(status_updates.len()); + + // first need to register the in-mem representations, to avoid missing ancestors during the local disk data registration + for (timeline_id, status_update) in status_updates { + repo.apply_timeline_remote_sync_status_update(timeline_id, status_update) + .with_context(|| { + format!("Failed to load timeline {timeline_id} into in-memory repository") + })?; + match status_update { + TimelineSyncStatusUpdate::Downloaded => registration_queue.push(timeline_id), + } + } + + for timeline_id in registration_queue { + let tenant_id = repo.tenant_id(); + match tenants_state::write_tenants().get_mut(&tenant_id) { + Some(tenant) => match tenant.local_timelines.entry(timeline_id) { + Entry::Occupied(_) => { + bail!("Local timeline {timeline_id} already registered") + } + Entry::Vacant(v) => { + v.insert(new_local_timeline(repo, timeline_id).with_context(|| { + format!("Failed to register new local timeline for tenant {tenant_id}") + })?); + } + }, + None => bail!( + "Tenant {} not found in local tenant state", + repo.tenant_id() + ), + } + } + + Ok(()) +} + // Sets up wal redo manager and repository for tenant. Reduces code duplication. // Used during pageserver startup, or when new tenant is attached to pageserver. fn load_local_repo( diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index acc92bb4a2..85ad294da9 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -204,7 +204,7 @@ pub fn create_repo( let repo_dir = conf.tenant_path(&tenant_id); ensure!( - repo_dir.exists(), + !repo_dir.exists(), "cannot create new tenant repo: '{}' directory already exists", tenant_id ); From 2911eb084aefc82791e28b668d2b06383b38c0de Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 28 Apr 2022 00:49:03 +0300 Subject: [PATCH 0205/1022] Remove timeline files on detach --- pageserver/src/http/routes.rs | 3 ++- pageserver/src/layered_repository.rs | 6 ++++- .../remote_storage/storage_sync/download.rs | 2 +- pageserver/src/tenant_mgr.rs | 24 ++++++++++++++----- .../batch_others/test_tenant_relocation.py | 9 +++++++ 5 files changed, 35 insertions(+), 9 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 295a1e9f02..311ae5adf4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -347,7 +347,8 @@ async fn timeline_detach_handler(request: Request) -> Result anyhow::Result<()> { - self.timelines.lock().unwrap().remove(&timeline_id); + let mut timelines = self.timelines.lock().unwrap(); + ensure!( + timelines.remove(&timeline_id).is_some(), + "cannot detach timeline {timeline_id} that is not available locally" + ); Ok(()) } diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/remote_storage/storage_sync/download.rs index 7fe25ab36e..c7a2b1fd22 100644 --- a/pageserver/src/remote_storage/storage_sync/download.rs +++ b/pageserver/src/remote_storage/storage_sync/download.rs @@ -332,7 +332,7 @@ mod tests { .await; assert!( matches!( - dbg!(already_downloading_remote_timeline_download), + 
already_downloading_remote_timeline_download, DownloadedTimeline::Abort, ), "Should not allow downloading for remote timeline that does not expect it" diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index ace6938e6d..3e0a907d00 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -56,7 +56,7 @@ struct Tenant { /// Contains in-memory state, including the timeline that might not yet flushed on disk or loaded form disk. repo: Arc, /// Timelines, located locally in the pageserver's datadir. - /// Whatever manipulations happen, local timelines are not removed, only incremented with files. + /// Timelines can entirely be removed entirely by the `detach` operation only. /// /// Local timelines have more metadata that's loaded into memory, /// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`]. @@ -126,8 +126,8 @@ pub fn apply_timeline_sync_status_updates( continue; } }; - match register_new_timelines(&repo, status_updates) { - Ok(()) => info!("successfully applied tenant {tenant_id} sync status updates"), + match apply_timeline_remote_sync_status_updates(&repo, status_updates) { + Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"), Err(e) => error!( "Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}" ), @@ -305,7 +305,11 @@ pub fn get_local_timeline_with_load( Ok(page_tline) } -pub fn detach_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { +pub fn detach_timeline( + conf: &'static PageServerConf, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, +) -> anyhow::Result<()> { // shutdown the timeline threads (this shuts down the walreceiver) thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id)); @@ -320,6 +324,14 @@ pub fn detach_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow None => bail!("Tenant {tenant_id} not found in local tenant state"), } + let local_timeline_directory = conf.timeline_path(&timeline_id, &tenant_id); + std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { + format!( + "Failed to remove local timeline directory '{}'", + local_timeline_directory.display() + ) + })?; + Ok(()) } @@ -386,13 +398,13 @@ fn init_local_repositories( // Lets fail here loudly to be on the safe side. // XXX: It may be a better api to actually distinguish between repository startup // and processing of newly downloaded timelines. - register_new_timelines(&repo, status_updates) + apply_timeline_remote_sync_status_updates(&repo, status_updates) .with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))? 
} Ok(()) } -fn register_new_timelines( +fn apply_timeline_remote_sync_status_updates( repo: &LayeredRepository, status_updates: HashMap, ) -> anyhow::Result<()> { diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 8213d2526b..41907adf1a 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -217,6 +217,13 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, tenant_pg.start() + timeline_to_detach_local_path = env.repo_dir / 'tenants' / tenant.hex / 'timelines' / timeline.hex + files_before_detach = os.listdir(timeline_to_detach_local_path) + assert 'metadata' in files_before_detach, f'Regular timeline {timeline_to_detach_local_path} should have the metadata file,\ + but got: {files_before_detach}' + assert len(files_before_detach) > 2, f'Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ + but got {files_before_detach}' + # detach tenant from old pageserver before we check # that all the data is there to be sure that old pageserver # is no longer involved, and if it is, we will see the errors @@ -238,6 +245,8 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, load_thread.join(timeout=10) log.info('load thread stopped') + assert not os.path.exists(timeline_to_detach_local_path), f'After detach, local timeline dir {timeline_to_detach_local_path} should be removed' + # bring old pageserver back for clean shutdown via zenith cli # new pageserver will be shut down by the context manager cli_config_lines = (env.repo_dir / 'config').read_text().splitlines() From 76388abeb6ecda513f50b9b89199e3f575cbe630 Mon Sep 17 00:00:00 2001 From: chaitanya sharma <86035+phoenix24@users.noreply.github.com> Date: Fri, 29 Apr 2022 14:22:46 +0300 Subject: [PATCH 0206/1022] Rename READMEs with .md extension, and fix links to them. Commit edba2e97 renamed pageserver/README to pageserver/README.md, but forgot to update links to it. Fix. Rename libs/postgres_ffi/README and safekeeper/README files to also have the the .md extension, so that github can render them nicely. Quote ascii-diagram in safekeeper/README.md so that it renders correctly. --- docs/README.md | 6 +++--- docs/sourcetree.md | 4 ++-- libs/postgres_ffi/{README => README.md} | 0 safekeeper/{README => README.md} | 6 ++++-- 4 files changed, 9 insertions(+), 7 deletions(-) rename libs/postgres_ffi/{README => README.md} (100%) rename safekeeper/{README => README.md} (99%) diff --git a/docs/README.md b/docs/README.md index 99d635bb33..886363dccc 100644 --- a/docs/README.md +++ b/docs/README.md @@ -7,8 +7,8 @@ - [glossary.md](glossary.md) — Glossary of all the terms used in codebase. - [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI. - [sourcetree.md](sourcetree.md) — Overview of the source tree layeout. -- [pageserver/README](/pageserver/README) — pageserver overview. -- [postgres_ffi/README](/libs/postgres_ffi/README) — Postgres FFI overview. +- [pageserver/README.md](/pageserver/README.md) — pageserver overview. +- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview. - [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview. -- [safekeeper/README](/safekeeper/README) — WAL service overview. +- [safekeeper/README.md](/safekeeper/README.md) — WAL service overview. 
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 5fd5fe19e5..5ddc6208d2 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -28,7 +28,7 @@ The pageserver has a few different duties: - Receive WAL from the WAL service and decode it. - Replay WAL that's applicable to the chunks that the Page Server maintains -For more detailed info, see `/pageserver/README` +For more detailed info, see [/pageserver/README](/pageserver/README.md) `/proxy`: @@ -57,7 +57,7 @@ PostgreSQL extension that contains functions needed for testing and debugging. The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver. It acts as a holding area and redistribution center for recently generated WAL. -For more detailed info, see `/safekeeper/README` +For more detailed info, see [/safekeeper/README](/safekeeper/README.md) `/workspace_hack`: The workspace_hack crate exists only to pin down some dependencies. diff --git a/libs/postgres_ffi/README b/libs/postgres_ffi/README.md similarity index 100% rename from libs/postgres_ffi/README rename to libs/postgres_ffi/README.md diff --git a/safekeeper/README b/safekeeper/README.md similarity index 99% rename from safekeeper/README rename to safekeeper/README.md index 4407837463..3f097d0c24 100644 --- a/safekeeper/README +++ b/safekeeper/README.md @@ -7,6 +7,7 @@ replica. A replication slot is used in the primary to prevent the primary from discarding WAL that hasn't been streamed to the WAL service yet. +``` +--------------+ +------------------+ | | WAL | | | Compute node | ----------> | WAL Service | @@ -23,7 +24,7 @@ service yet. | Pageservers | | | +--------------+ - +``` The WAL service consists of multiple WAL safekeepers that all store a @@ -31,6 +32,7 @@ copy of the WAL. A WAL record is considered durable when the majority of safekeepers have received and stored the WAL to local disk. A consensus algorithm based on Paxos is used to manage the quorum. +``` +-------------------------------------------+ | WAL Service | | | @@ -48,7 +50,7 @@ consensus algorithm based on Paxos is used to manage the quorum. | +------------+ | | | +-------------------------------------------+ - +``` The primary connects to the WAL safekeepers, so it works in a "push" fashion. 
That's different from how streaming replication usually From 05f8e6a050fb7af35950e69b30a23be2cc40e78a Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 25 Apr 2022 16:56:19 +0300 Subject: [PATCH 0207/1022] Use fsync+rename for atomic downloads from remote storage Use failpoint in test_remote_storage to check the behavior --- pageserver/Cargo.toml | 6 +- pageserver/src/bin/pageserver.rs | 7 +- pageserver/src/http/routes.rs | 72 +++++++------- pageserver/src/layered_repository.rs | 2 +- pageserver/src/page_service.rs | 3 + pageserver/src/remote_storage.rs | 18 ++++ pageserver/src/remote_storage/local_fs.rs | 13 +-- pageserver/src/remote_storage/storage_sync.rs | 92 ++++++++++++++--- .../remote_storage/storage_sync/download.rs | 99 +++++++++++++++++-- .../batch_others/test_remote_storage.py | 38 +++++-- 10 files changed, 274 insertions(+), 76 deletions(-) diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 5607baf698..23c16dd5be 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -4,8 +4,12 @@ version = "0.1.0" edition = "2021" [features] -default = [] +# It is simpler infra-wise to have failpoints enabled by default +# It shouldnt affect perf in any way because failpoints +# are not placed in hot code paths +default = ["failpoints"] profiling = ["pprof"] +failpoints = ["fail/failpoints"] [dependencies] chrono = "0.4.19" diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 728dcb53de..01fcc1224f 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -27,7 +27,12 @@ use utils::{ }; fn version() -> String { - format!("{} profiling:{}", GIT_VERSION, cfg!(feature = "profiling")) + format!( + "{} profiling:{} failpoints:{}", + GIT_VERSION, + cfg!(feature = "profiling"), + fail::has_failpoints() + ) } fn main() -> anyhow::Result<()> { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 311ae5adf4..c589813d69 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -179,43 +179,47 @@ async fn timeline_detail_handler(request: Request) -> Result(local_timeline) + }) + .await + .ok() + .and_then(|r| r.ok()) + .flatten(); - let (local_timeline_info, span) = tokio::task::spawn_blocking(move || { - let entered = span.entered(); - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - let local_timeline = { - repo.get_timeline(timeline_id) - .as_ref() - .map(|timeline| { - LocalTimelineInfo::from_repo_timeline( - tenant_id, - timeline_id, - timeline, - include_non_incremental_logical_size, - ) + let remote_timeline_info = { + let remote_index_read = get_state(&request).remote_index.read().await; + remote_index_read + .timeline_entry(&ZTenantTimelineId { + tenant_id, + timeline_id, + }) + .map(|remote_entry| RemoteTimelineInfo { + remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(), + awaits_download: remote_entry.awaits_download, }) - .transpose()? 
}; - Ok::<_, anyhow::Error>((local_timeline, entered.exit())) - }) - .await - .map_err(ApiError::from_err)??; - - let remote_timeline_info = { - let remote_index_read = get_state(&request).remote_index.read().await; - remote_index_read - .timeline_entry(&ZTenantTimelineId { - tenant_id, - timeline_id, - }) - .map(|remote_entry| RemoteTimelineInfo { - remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(), - awaits_download: remote_entry.awaits_download, - }) - }; - - let _enter = span.entered(); + (local_timeline_info, remote_timeline_info) + } + .instrument(info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id)) + .await; if local_timeline_info.is_none() && remote_timeline_info.is_none() { return Err(ApiError::NotFound( diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 116fbf03a2..bbeb245f0a 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -721,7 +721,7 @@ impl LayeredRepository { } /// Save timeline metadata to file - fn save_metadata( + pub fn save_metadata( conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8adbdc5d9d..ec08a840b0 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -667,7 +667,10 @@ impl postgres_backend::Handler for PageServerHandler { // on connect pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("failpoints ") { + ensure!(fail::has_failpoints(), "Cannot manage failpoints because pageserver was compiled without failpoints support"); + let (_, failpoints) = query_string.split_at("failpoints ".len()); + for failpoint in failpoints.split(';') { if let Some((name, actions)) = failpoint.split_once('=') { info!("cfg failpoint: {} {}", name, actions); diff --git a/pageserver/src/remote_storage.rs b/pageserver/src/remote_storage.rs index 39595b7167..cfa09dce14 100644 --- a/pageserver/src/remote_storage.rs +++ b/pageserver/src/remote_storage.rs @@ -101,6 +101,7 @@ use anyhow::{bail, Context}; use tokio::io; use tracing::{debug, error, info}; +use self::storage_sync::TEMP_DOWNLOAD_EXTENSION; pub use self::{ local_fs::LocalFs, s3_bucket::S3Bucket, @@ -304,12 +305,29 @@ fn collect_timeline_files( } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) { debug!("skipping ephemeral file {}", entry_path.display()); continue; + } else if entry_path.extension().and_then(ffi::OsStr::to_str) + == Some(TEMP_DOWNLOAD_EXTENSION) + { + info!("removing temp download file at {}", entry_path.display()); + fs::remove_file(&entry_path).with_context(|| { + format!( + "failed to remove temp download file at {}", + entry_path.display() + ) + })?; } else { timeline_files.insert(entry_path); } } } + // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed + // then attach is lost. There would be no retries for that, + // initial collect will fail because there is no metadata. + // We either need to start download if we see empty dir after restart or attach caller should + // be aware of that and retry attach if awaits_download for timeline switched from true to false + // but timelinne didnt appear locally. + // Check what happens with remote index in that case. 
let timeline_metadata_path = match timeline_metadata_path { Some(path) => path, None => bail!("No metadata file found in the timeline directory"), diff --git a/pageserver/src/remote_storage/local_fs.rs b/pageserver/src/remote_storage/local_fs.rs index 952b2e69fe..6772a4fbd6 100644 --- a/pageserver/src/remote_storage/local_fs.rs +++ b/pageserver/src/remote_storage/local_fs.rs @@ -17,6 +17,8 @@ use tokio::{ }; use tracing::*; +use crate::remote_storage::storage_sync::path_with_suffix_extension; + use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; pub struct LocalFs { @@ -114,7 +116,7 @@ impl RemoteStorage for LocalFs { // We need this dance with sort of durable rename (without fsyncs) // to prevent partial uploads. This was really hit when pageserver shutdown // cancelled the upload and partial file was left on the fs - let temp_file_path = path_with_suffix_extension(&target_file_path, ".temp"); + let temp_file_path = path_with_suffix_extension(&target_file_path, "temp"); let mut destination = io::BufWriter::new( fs::OpenOptions::new() .write(true) @@ -299,15 +301,8 @@ impl RemoteStorage for LocalFs { } } -fn path_with_suffix_extension(original_path: &Path, suffix: &str) -> PathBuf { - let mut extension_with_suffix = original_path.extension().unwrap_or_default().to_os_string(); - extension_with_suffix.push(suffix); - - original_path.with_extension(extension_with_suffix) -} - fn storage_metadata_path(original_path: &Path) -> PathBuf { - path_with_suffix_extension(original_path, ".metadata") + path_with_suffix_extension(original_path, "metadata") } fn get_all_files<'a, P>( diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index 20012f32d7..2d3416cd32 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -62,7 +62,9 @@ pub mod index; mod upload; use std::{ + borrow::Cow, collections::{HashMap, HashSet, VecDeque}, + ffi::OsStr, fmt::Debug, num::{NonZeroU32, NonZeroUsize}, ops::ControlFlow, @@ -89,7 +91,10 @@ use self::{ use super::{LocalTimelineInitStatus, LocalTimelineInitStatuses, RemoteStorage, SyncStartupData}; use crate::{ config::PageServerConf, - layered_repository::metadata::{metadata_path, TimelineMetadata}, + layered_repository::{ + metadata::{metadata_path, TimelineMetadata}, + LayeredRepository, + }, repository::TimelineSyncStatusUpdate, tenant_mgr::apply_timeline_sync_status_updates, thread_mgr, @@ -103,6 +108,7 @@ use metrics::{ use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; pub use self::download::download_index_part; +pub use self::download::TEMP_DOWNLOAD_EXTENSION; lazy_static! 
{ static ref REMAINING_SYNC_ITEMS: IntGauge = register_int_gauge!( @@ -782,8 +788,14 @@ where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { - match download_timeline_layers(storage, current_remote_timeline, sync_id, new_download_data) - .await + match download_timeline_layers( + conf, + storage, + current_remote_timeline, + sync_id, + new_download_data, + ) + .await { DownloadedTimeline::Abort => { register_sync_status(sync_start, task_name, None); @@ -852,18 +864,28 @@ async fn update_local_metadata( if local_lsn < Some(remote_lsn) { info!("Updating local timeline metadata from remote timeline: local disk_consistent_lsn={local_lsn:?}, remote disk_consistent_lsn={remote_lsn}"); - - let remote_metadata_bytes = remote_metadata - .to_bytes() - .context("Failed to serialize remote metadata to bytes")?; - fs::write(&local_metadata_path, &remote_metadata_bytes) - .await - .with_context(|| { - format!( - "Failed to write remote metadata bytes locally to path '{}'", - local_metadata_path.display() - ) - })?; + // clone because spawn_blocking requires static lifetime + let cloned_metadata = remote_metadata.to_owned(); + let ZTenantTimelineId { + tenant_id, + timeline_id, + } = sync_id; + tokio::task::spawn_blocking(move || { + LayeredRepository::save_metadata(conf, timeline_id, tenant_id, &cloned_metadata, true) + }) + .await + .with_context(|| { + format!( + "failed to join save_metadata task for {}", + local_metadata_path.display() + ) + })? + .with_context(|| { + format!( + "Failed to write remote metadata bytes locally to path '{}'", + local_metadata_path.display() + ) + })?; } else { info!("Local metadata at path '{}' has later disk consistent Lsn ({local_lsn:?}) than the remote one ({remote_lsn}), skipping the update", local_metadata_path.display()); } @@ -1062,7 +1084,7 @@ where debug!("Successfully fetched index part for {id}"); index_parts.insert(id, index_part); } - Err(e) => warn!("Failed to fetch index part for {id}: {e:?}"), + Err(e) => warn!("Failed to fetch index part for {id}: {e}"), } } @@ -1192,6 +1214,20 @@ fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Optio .observe(secs_elapsed) } +pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { + let new_extension = match original_path + .as_ref() + .extension() + .map(OsStr::to_string_lossy) + { + Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), + None => Cow::Borrowed(suffix), + }; + original_path + .as_ref() + .with_extension(new_extension.as_ref()) +} + #[cfg(test)] mod test_utils { use utils::lsn::Lsn; @@ -1600,4 +1636,28 @@ mod tests { "Merged upload tasks should have a metadata with biggest disk_consistent_lsn" ); } + + #[test] + fn test_path_with_suffix_extension() { + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp").to_string_lossy(), + "/foo/bar.temp" + ); + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.baz.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar.baz..temp" + ); + } } diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/remote_storage/storage_sync/download.rs index c7a2b1fd22..7e2496b796 100644 --- 
a/pageserver/src/remote_storage/storage_sync/download.rs +++ b/pageserver/src/remote_storage/storage_sync/download.rs @@ -1,17 +1,20 @@ //! Timeline synchrnonization logic to fetch the layer files from remote storage into pageserver's local directory. -use std::fmt::Debug; +use std::{collections::HashSet, fmt::Debug, path::Path}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use tokio::fs; +use tokio::{ + fs, + io::{self, AsyncWriteExt}, +}; use tracing::{debug, error, info, warn}; use crate::{ config::PageServerConf, layered_repository::metadata::metadata_path, remote_storage::{ - storage_sync::{sync_queue, SyncTask}, + storage_sync::{path_with_suffix_extension, sync_queue, SyncTask}, RemoteStorage, }, }; @@ -22,6 +25,8 @@ use super::{ SyncData, TimelineDownload, }; +pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; + /// Retrieves index data from the remote storage for a given timeline. pub async fn download_index_part( conf: &'static PageServerConf, @@ -46,7 +51,7 @@ where .download(&part_storage_path, &mut index_part_bytes) .await .with_context(|| { - format!("Failed to download an index part from storage path '{part_storage_path:?}'") + format!("Failed to download an index part from storage path {part_storage_path:?}") })?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes).with_context(|| { @@ -80,6 +85,7 @@ pub(super) enum DownloadedTimeline { /// /// On an error, bumps the retries count and updates the files to skip with successful downloads, rescheduling the task. pub(super) async fn download_timeline_layers<'a, P, S>( + conf: &'static PageServerConf, storage: &'a S, remote_timeline: Option<&'a RemoteTimeline>, sync_id: ZTenantTimelineId, @@ -132,12 +138,24 @@ where ) })?; - let mut destination_file = fs::File::create(&layer_desination_path) - .await - .with_context(|| { + // Perform a rename inspired by durable_rename from file_utils.c. + // The sequence: + // write(tmp) + // fsync(tmp) + // rename(tmp, new) + // fsync(new) + // fsync(parent) + // For more context about durable_rename check this email from postgres mailing list: + // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com + // If pageserver crashes the temp file will be deleted on startup and re-downloaded. + let temp_file_path = + path_with_suffix_extension(&layer_desination_path, TEMP_DOWNLOAD_EXTENSION); + + let mut destination_file = + fs::File::create(&temp_file_path).await.with_context(|| { format!( "Failed to create a destination file for layer '{}'", - layer_desination_path.display() + temp_file_path.display() ) })?; @@ -149,15 +167,55 @@ where "Failed to download a layer from storage path '{layer_storage_path:?}'" ) })?; + + // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: + // A file will not be closed immediately when it goes out of scope if there are any IO operations + // that have not yet completed. To ensure that a file is closed immediately when it is dropped, + // you should call flush before dropping it. + // + // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because + // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. + // But for additional safety lets check/wait for any pending operations. 
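// The statements below inline the durable_rename sequence spelled out in the
// comment above. Packaged as a standalone helper, the same sequence looks roughly
// like this (a sketch only, not part of this patch; it assumes the caller has
// already written and flushed the temp file, and it reuses the `std::path::Path`
// and `tokio::{fs, io}` imports of this module):

async fn durable_rename(tmp: &Path, dst: &Path) -> Result<(), io::Error> {
    // fsync the fully written temp file, then move it into place
    fs::File::open(tmp).await?.sync_all().await?;
    fs::rename(tmp, dst).await?;
    // fsync the new name and its parent directory so the rename itself
    // survives a crash
    fs::File::open(dst).await?.sync_all().await?;
    if let Some(parent) = dst.parent() {
        fs::File::open(parent).await?.sync_all().await?;
    }
    Ok(())
}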
+ destination_file.flush().await.with_context(|| { + format!( + "failed to flush source file at {}", + temp_file_path.display() + ) + })?; + + // not using sync_data because it can lose file size update + destination_file.sync_all().await.with_context(|| { + format!( + "failed to fsync source file at {}", + temp_file_path.display() + ) + })?; + drop(destination_file); + + fail::fail_point!("remote-storage-download-pre-rename", |_| { + anyhow::bail!("remote-storage-download-pre-rename failpoint triggered") + }); + + fs::rename(&temp_file_path, &layer_desination_path).await?; + + fsync_path(&layer_desination_path).await.with_context(|| { + format!( + "Cannot fsync layer destination path {}", + layer_desination_path.display(), + ) + })?; } Ok::<_, anyhow::Error>(layer_desination_path) }) .collect::>(); let mut errors_happened = false; + // keep files we've downloaded to remove them from layers_to_skip if directory fsync fails + let mut undo = HashSet::new(); while let Some(download_result) = download_tasks.next().await { match download_result { Ok(downloaded_path) => { + undo.insert(downloaded_path.clone()); download.layers_to_skip.insert(downloaded_path); } Err(e) => { @@ -167,6 +225,24 @@ where } } + // fsync timeline directory which is a parent directory for downloaded files + let ZTenantTimelineId { + tenant_id, + timeline_id, + } = &sync_id; + let timeline_dir = conf.timeline_path(timeline_id, tenant_id); + if let Err(e) = fsync_path(&timeline_dir).await { + error!( + "Cannot fsync parent directory {} error {}", + timeline_dir.display(), + e + ); + for item in undo { + download.layers_to_skip.remove(&item); + } + errors_happened = true; + } + if errors_happened { debug!("Reenqueuing failed download task for timeline {sync_id}"); download_data.retries += 1; @@ -178,6 +254,10 @@ where } } +async fn fsync_path(path: impl AsRef) -> Result<(), io::Error> { + fs::File::open(path).await?.sync_all().await +} + #[cfg(test)] mod tests { use std::collections::{BTreeSet, HashSet}; @@ -236,6 +316,7 @@ mod tests { ); let download_data = match download_timeline_layers( + harness.conf, &storage, Some(&remote_timeline), sync_id, @@ -297,6 +378,7 @@ mod tests { let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?; let empty_remote_timeline_download = download_timeline_layers( + harness.conf, &storage, None, sync_id, @@ -319,6 +401,7 @@ mod tests { "Should not expect download for the timeline" ); let already_downloading_remote_timeline_download = download_timeline_layers( + harness.conf, &storage, Some(¬_expecting_download_remote_timeline), sync_id, diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index f2d654423a..59a9cfa378 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -4,10 +4,11 @@ import shutil, os from contextlib import closing from pathlib import Path +import time from uuid import UUID from fixtures.zenith_fixtures import ZenithEnvBuilder, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload from fixtures.log_helper import log -from fixtures.utils import lsn_from_hex +from fixtures.utils import lsn_from_hex, lsn_to_hex import pytest @@ -23,14 +24,14 @@ import pytest # # 2. 
Second pageserver # * starts another pageserver, connected to the same remote storage -# * same timeline id is queried for status, triggering timeline's download +# * timeline_attach is called for the same timeline id # * timeline status is polled until it's downloaded # * queries the specific data, ensuring that it matches the one stored before # # The tests are done for all types of remote storage pageserver supports. @pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3']) def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, storage_type: str): - zenith_env_builder.rust_log_override = 'debug' + # zenith_env_builder.rust_log_override = 'debug' zenith_env_builder.num_safekeepers = 1 if storage_type == 'local_fs': zenith_env_builder.enable_local_fs_remote_storage() @@ -67,9 +68,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, wait_for_last_record_lsn(client, UUID(tenant_id), UUID(timeline_id), current_lsn) # run checkpoint manually to be sure that data landed in remote storage - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor() as pscur: - pscur.execute(f"checkpoint {tenant_id} {timeline_id}") + env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") log.info(f'waiting for checkpoint {checkpoint_number} upload') # wait until pageserver successfully uploaded a checkpoint to remote storage @@ -87,6 +86,27 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, ##### Second start, restore the data and ensure it's the same env.pageserver.start() + # Introduce failpoint in download + env.pageserver.safe_psql(f"failpoints remote-storage-download-pre-rename=return") + + client.timeline_attach(UUID(tenant_id), UUID(timeline_id)) + + # is there a better way to assert that fafilpoint triggered? 
+ time.sleep(10) + + # assert cannot attach timeline that is scheduled for download + with pytest.raises(Exception, match="Timeline download is already in progress"): + client.timeline_attach(UUID(tenant_id), UUID(timeline_id)) + + detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) + log.info("Timeline detail with active failpoint: %s", detail) + assert detail['local'] is None + assert detail['remote']['awaits_download'] + + # trigger temporary download files removal + env.pageserver.stop() + env.pageserver.start() + client.timeline_attach(UUID(tenant_id), UUID(timeline_id)) log.info("waiting for timeline redownload") @@ -94,6 +114,12 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, interval=1, func=lambda: assert_local(client, UUID(tenant_id), UUID(timeline_id))) + detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) + assert detail['local'] is not None + log.info("Timeline detail after attach completed: %s", detail) + assert lsn_from_hex(detail['local']['last_record_lsn']) == current_lsn + assert not detail['remote']['awaits_download'] + pg = env.postgres.create_start('main') with closing(pg.connect()) as conn: with conn.cursor() as cur: From 67b4e38092066c7633c37ee05e4d64fa9b9a2b01 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 28 Apr 2022 00:41:06 +0300 Subject: [PATCH 0208/1022] remporarily disable test_backpressure_received_lsn_lag --- test_runner/batch_others/test_backpressure.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/batch_others/test_backpressure.py b/test_runner/batch_others/test_backpressure.py index ff34121327..6658b337ec 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/batch_others/test_backpressure.py @@ -1,6 +1,7 @@ from contextlib import closing, contextmanager import psycopg2.extras -from fixtures.zenith_fixtures import ZenithEnvBuilder +import pytest +from fixtures.zenith_fixtures import PgProtocol, ZenithEnvBuilder from fixtures.log_helper import log import os import time @@ -91,6 +92,7 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv # If backpressure is enabled and tuned properly, insertion will be throttled, but the query will not timeout. 
+@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/1587") def test_backpressure_received_lsn_lag(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() From aa933d3961f25ff3ebb00f0a04c89dfa4ee5ceb4 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Fri, 29 Apr 2022 20:05:14 +0300 Subject: [PATCH 0209/1022] proxy settings update for new domain (#1597) --- .circleci/helm-values/production.proxy.yaml | 6 +++--- .circleci/helm-values/staging.proxy.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.circleci/helm-values/production.proxy.yaml b/.circleci/helm-values/production.proxy.yaml index f2148c1d2c..e13968a6a8 100644 --- a/.circleci/helm-values/production.proxy.yaml +++ b/.circleci/helm-values/production.proxy.yaml @@ -5,8 +5,8 @@ image: repository: neondatabase/neon settings: - authEndpoint: "https://console.zenith.tech/authenticate_proxy_request/" - uri: "https://console.zenith.tech/psql_session/" + authEndpoint: "https://console.neon.tech/authenticate_proxy_request/" + uri: "https://console.neon.tech/psql_session/" # -- Additional labels for zenith-proxy pods podLabels: @@ -28,7 +28,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: start.zenith.tech + external-dns.alpha.kubernetes.io/hostname: start.zenith.tech,connect.neon.tech,pg.neon.tech metrics: enabled: true diff --git a/.circleci/helm-values/staging.proxy.yaml b/.circleci/helm-values/staging.proxy.yaml index f4d9855476..34ba972b64 100644 --- a/.circleci/helm-values/staging.proxy.yaml +++ b/.circleci/helm-values/staging.proxy.yaml @@ -5,8 +5,8 @@ image: repository: neondatabase/neon settings: - authEndpoint: "https://console.stage.zenith.tech/authenticate_proxy_request/" - uri: "https://console.stage.zenith.tech/psql_session/" + authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" + uri: "https://console.stage.neon.tech/psql_session/" # -- Additional labels for zenith-proxy pods podLabels: @@ -20,7 +20,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: start.stage.zenith.tech + external-dns.alpha.kubernetes.io/hostname: connect.stage.neon.tech metrics: enabled: true From 7e1db8c8a1de346d3a6350e1079fd7e6eb30033c Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 29 Apr 2022 17:08:51 +0300 Subject: [PATCH 0210/1022] Show which virtual file got the deserialization errors --- pageserver/src/layered_repository/delta_layer.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index ef4c3cccb0..4952f64ccd 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -258,8 +258,18 @@ impl Layer for DeltaLayer { // Ok, 'offsets' now contains the offsets of all the entries we need to read let mut cursor = file.block_cursor(); for (entry_lsn, pos) in offsets { - let buf = cursor.read_blob(pos)?; - let val = Value::des(&buf)?; + let buf = cursor.read_blob(pos).with_context(|| { + format!( + "Failed to read blob 
from virtual file {}", + file.file.path.display() + ) + })?; + let val = Value::des(&buf).with_context(|| { + format!( + "Failed to deserialize file blob from virtual file {}", + file.file.path.display() + ) + })?; match val { Value::Image(img) => { reconstruct_state.img = Some((entry_lsn, img)); From 038ea4c1280416dbcee8b1c3e24d84871602c75c Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Sat, 30 Apr 2022 22:04:08 +0300 Subject: [PATCH 0211/1022] proxy notice message update (#1600) --- proxy/src/auth.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 4c54e2f9eb..c6d32040dc 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -174,7 +174,7 @@ fn parse_password(bytes: &[u8]) -> Option<&str> { fn hello_message(redirect_uri: &str, session_id: &str) -> String { format!( concat![ - "☀️ Welcome to Zenith!\n", + "☀️ Welcome to Neon!\n", "To proceed with database creation, open the following link:\n\n", " {redirect_uri}{session_id}\n\n", "It needs to be done once and we will send you '.pgpass' file,\n", From f3f12db2cbdcfa994d3a798d609ba16f9ac38baa Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Fri, 29 Apr 2022 11:48:56 -0700 Subject: [PATCH 0212/1022] Add gc churn threshold knob (#1594) Signed-off-by: Dhammika Pathirana --- control_plane/src/storage.rs | 7 +++++++ pageserver/src/config.rs | 1 + pageserver/src/http/models.rs | 3 +++ pageserver/src/http/routes.rs | 2 ++ pageserver/src/layered_repository.rs | 25 +++++++++++++++++-------- pageserver/src/page_service.rs | 2 ++ pageserver/src/repository.rs | 1 + pageserver/src/tenant_config.rs | 12 ++++++++++++ 8 files changed, 45 insertions(+), 8 deletions(-) diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 7520ad9304..3a63bf6960 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -369,6 +369,10 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose()?, gc_period: settings.get("gc_period").map(|x| x.to_string()), + image_creation_threshold: settings + .get("image_creation_threshold") + .map(|x| x.parse::()) + .transpose()?, pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), }) .send()? @@ -405,6 +409,9 @@ impl PageServerNode { .get("gc_horizon") .map(|x| x.parse::().unwrap()), gc_period: settings.get("gc_period").map(|x| x.to_string()), + image_creation_threshold: settings + .get("image_creation_threshold") + .map(|x| x.parse::().unwrap()), pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), }) .send()? 
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index aed7eabb76..14ca976448 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -75,6 +75,7 @@ pub mod defaults { #gc_period = '{DEFAULT_GC_PERIOD}' #gc_horizon = {DEFAULT_GC_HORIZON} +#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD} #pitr_interval = '{DEFAULT_PITR_INTERVAL}' # [remote_storage] diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index b24b3dc316..e9aaa72416 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -31,6 +31,7 @@ pub struct TenantCreateRequest { pub compaction_threshold: Option, pub gc_horizon: Option, pub gc_period: Option, + pub image_creation_threshold: Option, pub pitr_interval: Option, } @@ -65,6 +66,7 @@ pub struct TenantConfigRequest { pub compaction_threshold: Option, pub gc_horizon: Option, pub gc_period: Option, + pub image_creation_threshold: Option, pub pitr_interval: Option, } @@ -78,6 +80,7 @@ impl TenantConfigRequest { compaction_threshold: None, gc_horizon: None, gc_period: None, + image_creation_threshold: None, pitr_interval: None, } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c589813d69..5903dea372 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -387,6 +387,7 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result usize { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .image_creation_threshold + .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) + } + pub fn get_pitr_interval(&self) -> Duration { let tenant_conf = self.tenant_conf.read().unwrap(); tenant_conf @@ -1152,6 +1159,13 @@ impl LayeredTimeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + fn get_image_creation_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .image_creation_threshold + .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) + } + /// Open a Timeline handle. /// /// Loads the metadata for the timeline into memory, but not the layer map. @@ -1821,7 +1835,7 @@ impl LayeredTimeline { // 2. Create new image layers for partitions that have been modified // "enough". for part in partitioning.parts.iter() { - if self.time_for_new_image_layer(part, lsn, 3)? { + if self.time_for_new_image_layer(part, lsn)? { self.create_image_layer(part, lsn)?; } } @@ -1839,12 +1853,7 @@ impl LayeredTimeline { } // Is it time to create a new image layer for the given partition? 
- fn time_for_new_image_layer( - &self, - partition: &KeySpace, - lsn: Lsn, - threshold: usize, - ) -> Result { + fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result { let layers = self.layers.read().unwrap(); for part_range in &partition.ranges { @@ -1862,7 +1871,7 @@ impl LayeredTimeline { "range {}-{}, has {} deltas on this timeline", img_range.start, img_range.end, num_deltas ); - if num_deltas >= threshold { + if num_deltas >= self.get_image_creation_threshold() { return Ok(true); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index ec08a840b0..0adafab8ba 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -694,6 +694,7 @@ impl postgres_backend::Handler for PageServerHandler { RowDescriptor::int8_col(b"compaction_threshold"), RowDescriptor::int8_col(b"gc_horizon"), RowDescriptor::int8_col(b"gc_period"), + RowDescriptor::int8_col(b"image_creation_threshold"), RowDescriptor::int8_col(b"pitr_interval"), ]))? .write_message_noflush(&BeMessage::DataRow(&[ @@ -708,6 +709,7 @@ impl postgres_backend::Handler for PageServerHandler { Some(repo.get_compaction_threshold().to_string().as_bytes()), Some(repo.get_gc_horizon().to_string().as_bytes()), Some(repo.get_gc_period().as_secs().to_string().as_bytes()), + Some(repo.get_image_creation_threshold().to_string().as_bytes()), Some(repo.get_pitr_interval().as_secs().to_string().as_bytes()), ]))? .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 6c75f035ca..5044f2bfc5 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -467,6 +467,7 @@ pub mod repo_harness { compaction_threshold: Some(tenant_conf.compaction_threshold), gc_horizon: Some(tenant_conf.gc_horizon), gc_period: Some(tenant_conf.gc_period), + image_creation_threshold: Some(tenant_conf.image_creation_threshold), pitr_interval: Some(tenant_conf.pitr_interval), } } diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index a175f6abbe..9bf223e59e 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -32,6 +32,7 @@ pub mod defaults { pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; pub const DEFAULT_GC_PERIOD: &str = "100 s"; + pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "30 days"; } @@ -59,6 +60,8 @@ pub struct TenantConf { // Interval at which garbage collection is triggered. #[serde(with = "humantime_serde")] pub gc_period: Duration, + // Delta layer churn threshold to create L1 image layers. + pub image_creation_threshold: usize, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is time. 
@@ -79,6 +82,7 @@ pub struct TenantConfOpt { pub gc_horizon: Option, #[serde(with = "humantime_serde")] pub gc_period: Option, + pub image_creation_threshold: Option, #[serde(with = "humantime_serde")] pub pitr_interval: Option, } @@ -100,6 +104,9 @@ impl TenantConfOpt { .unwrap_or(global_conf.compaction_threshold), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: self.gc_period.unwrap_or(global_conf.gc_period), + image_creation_threshold: self + .image_creation_threshold + .unwrap_or(global_conf.image_creation_threshold), pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval), } } @@ -123,6 +130,9 @@ impl TenantConfOpt { if let Some(gc_period) = other.gc_period { self.gc_period = Some(gc_period); } + if let Some(image_creation_threshold) = other.image_creation_threshold { + self.image_creation_threshold = Some(image_creation_threshold); + } if let Some(pitr_interval) = other.pitr_interval { self.pitr_interval = Some(pitr_interval); } @@ -142,6 +152,7 @@ impl TenantConf { gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), + image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) .expect("cannot parse default PITR interval"), } @@ -162,6 +173,7 @@ impl TenantConf { compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD, gc_horizon: defaults::DEFAULT_GC_HORIZON, gc_period: Duration::from_secs(10), + image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD, pitr_interval: Duration::from_secs(60 * 60), } } From 3128e8c75ce7eacd4a33113ed78448d6c05b1dce Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Fri, 29 Apr 2022 12:47:57 -0700 Subject: [PATCH 0213/1022] Fix tenant conf test Signed-off-by: Dhammika Pathirana --- test_runner/batch_others/test_tenant_conf.py | 59 +++++++++++++++++--- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/test_runner/batch_others/test_tenant_conf.py b/test_runner/batch_others/test_tenant_conf.py index 64359a1dc3..b85a541f10 100644 --- a/test_runner/batch_others/test_tenant_conf.py +++ b/test_runner/batch_others/test_tenant_conf.py @@ -1,6 +1,7 @@ from contextlib import closing import pytest +import psycopg2.extras from fixtures.zenith_fixtures import ZenithEnvBuilder from fixtures.log_helper import log @@ -30,19 +31,39 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' # check the configuration of the default tenant # it should match global configuration with closing(env.pageserver.connect()) as psconn: - with psconn.cursor() as pscur: + with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: + log.info(f"show {env.initial_tenant.hex}") pscur.execute(f"show {env.initial_tenant.hex}") res = pscur.fetchone() - log.info(f"initial_tenant res: {res}") - assert res == (10000, 1048576, 1, 10, 67108864, 100, 2592000) + assert all( + i in res.items() for i in { + "checkpoint_distance": 10000, + "compaction_target_size": 1048576, + "compaction_period": 1, + "compaction_threshold": 10, + "gc_horizon": 67108864, + "gc_period": 100, + "image_creation_threshold": 3, + "pitr_interval": 2592000 + }.items()) # check the configuration of the new tenant with closing(env.pageserver.connect()) as psconn: - with psconn.cursor() as pscur: + with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: pscur.execute(f"show {tenant.hex}") res = pscur.fetchone() log.info(f"res: {res}") - assert res == 
(20000, 1048576, 1, 10, 67108864, 30, 2592000) + assert all( + i in res.items() for i in { + "checkpoint_distance": 20000, + "compaction_target_size": 1048576, + "compaction_period": 1, + "compaction_threshold": 10, + "gc_horizon": 67108864, + "gc_period": 30, + "image_creation_threshold": 3, + "pitr_interval": 2592000 + }.items()) # update the config and ensure that it has changed env.zenith_cli.config_tenant(tenant_id=tenant, @@ -52,19 +73,39 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' }) with closing(env.pageserver.connect()) as psconn: - with psconn.cursor() as pscur: + with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: pscur.execute(f"show {tenant.hex}") res = pscur.fetchone() log.info(f"after config res: {res}") - assert res == (15000, 1048576, 1, 10, 67108864, 80, 2592000) + assert all( + i in res.items() for i in { + "checkpoint_distance": 15000, + "compaction_target_size": 1048576, + "compaction_period": 1, + "compaction_threshold": 10, + "gc_horizon": 67108864, + "gc_period": 80, + "image_creation_threshold": 3, + "pitr_interval": 2592000 + }.items()) # restart the pageserver and ensure that the config is still correct env.pageserver.stop() env.pageserver.start() with closing(env.pageserver.connect()) as psconn: - with psconn.cursor() as pscur: + with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: pscur.execute(f"show {tenant.hex}") res = pscur.fetchone() log.info(f"after restart res: {res}") - assert res == (15000, 1048576, 1, 10, 67108864, 80, 2592000) + assert all( + i in res.items() for i in { + "checkpoint_distance": 15000, + "compaction_target_size": 1048576, + "compaction_period": 1, + "compaction_threshold": 10, + "gc_horizon": 67108864, + "gc_period": 80, + "image_creation_threshold": 3, + "pitr_interval": 2592000 + }.items()) From 992874c916ad77e04d526eb7882706c2495a1426 Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Sun, 1 May 2022 13:52:08 -0700 Subject: [PATCH 0214/1022] Fix update ps settings doc Signed-off-by: Dhammika Pathirana --- docs/settings.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/settings.md b/docs/settings.md index 530876a42a..b3925528cd 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -74,6 +74,10 @@ Every `compaction_period` seconds, the page server checks if maintenance operations, like compaction, are needed on the layer files. Default is 1 s, which should be fine. +#### compaction_target_size + +File sizes for L0 delta and L1 image layers. Default is 128MB. + #### gc_horizon `gz_horizon` determines how much history is retained, to allow @@ -85,6 +89,14 @@ away. Interval at which garbage collection is triggered. Default is 100 s. +#### image_creation_threshold + +L0 delta layer threshold for L1 iamge layer creation. Default is 3. + +#### pitr_interval + +WAL retention duration for PITR branching. Default is 30 days. 
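Taken together, the knobs documented in this section can be set as pageserver-wide defaults and overridden per tenant. As a sketch (illustrative only, not an exhaustive list of settings), the corresponding `pageserver.toml` entries with the default values quoted in this patch would look like:

    gc_period = '100 s'
    gc_horizon = 67108864          # 64 MB
    image_creation_threshold = 3
    pitr_interval = '30 days'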
+ #### initial_superuser_name Name of the initial superuser role, passed to initdb when a new tenant From 2477d2f9e233b2b9b8a010f0dd6a847d029be23a Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 2 May 2022 04:37:16 +0300 Subject: [PATCH 0215/1022] Deploy standalone SRAM proxy on staging --- .circleci/config.yml | 1 + .../helm-values/staging.proxy-scram.yaml | 30 +++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 .circleci/helm-values/staging.proxy-scram.yaml diff --git a/.circleci/config.yml b/.circleci/config.yml index 3397bcc7b7..f8787edcfb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -585,6 +585,7 @@ jobs: command: | DOCKER_TAG=$(git log --oneline|wc -l) helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait + helm upgrade zenith-proxy-scram zenithdb/zenith-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait deploy-release: diff --git a/.circleci/helm-values/staging.proxy-scram.yaml b/.circleci/helm-values/staging.proxy-scram.yaml new file mode 100644 index 0000000000..1a9ab239b4 --- /dev/null +++ b/.circleci/helm-values/staging.proxy-scram.yaml @@ -0,0 +1,30 @@ +# Helm chart values for zenith-proxy. +# This is a YAML-formatted file. + +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "https://console.stage.neon.tech:9095/management/api/v2" + +# -- Additional labels for zenith-proxy pods +podLabels: + zenith_service: proxy-scram + zenith_env: staging + zenith_region: us-east-1 + zenith_region_slug: virginia + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: *.cloud.stage.neon.tech + +metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack From 8f479a712f49eb5baed065fcec00c987493105dd Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Mon, 2 May 2022 11:38:25 +0300 Subject: [PATCH 0216/1022] minor fixes in proxy deployment --- .circleci/ansible/.gitignore | 2 ++ .circleci/config.yml | 7 +++---- .circleci/helm-values/staging.proxy-scram.yaml | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.circleci/ansible/.gitignore b/.circleci/ansible/.gitignore index 14a1c155ae..441d9a8b82 100644 --- a/.circleci/ansible/.gitignore +++ b/.circleci/ansible/.gitignore @@ -1,2 +1,4 @@ zenith_install.tar.gz .zenith_current_version +neon_install.tar.gz +.neon_current_version diff --git a/.circleci/config.yml b/.circleci/config.yml index f8787edcfb..2ed079f031 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -579,14 +579,13 @@ jobs: name: Setup helm v3 command: | curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add zenithdb https://neondatabase.github.io/helm-charts + helm repo add neondatabase https://neondatabase.github.io/helm-charts - run: name: Re-deploy proxy command: | DOCKER_TAG=$(git log --oneline|wc -l) - helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait - helm upgrade zenith-proxy-scram zenithdb/zenith-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait - + helm upgrade zenith-proxy 
neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait + helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait deploy-release: docker: diff --git a/.circleci/helm-values/staging.proxy-scram.yaml b/.circleci/helm-values/staging.proxy-scram.yaml index 1a9ab239b4..8c7bf835bc 100644 --- a/.circleci/helm-values/staging.proxy-scram.yaml +++ b/.circleci/helm-values/staging.proxy-scram.yaml @@ -6,7 +6,7 @@ image: settings: authBackend: "console" - authEndpoint: "https://console.stage.neon.tech:9095/management/api/v2" + authEndpoint: "http://console-staging.local/management/api/v2/healthz" # -- Additional labels for zenith-proxy pods podLabels: From 68ba6a58a0b7a1eeb4102a79e6896b4508e8018e Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Mon, 2 May 2022 11:43:55 +0300 Subject: [PATCH 0217/1022] authEndpoint fix --- .circleci/helm-values/staging.proxy-scram.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/helm-values/staging.proxy-scram.yaml b/.circleci/helm-values/staging.proxy-scram.yaml index 8c7bf835bc..0391697641 100644 --- a/.circleci/helm-values/staging.proxy-scram.yaml +++ b/.circleci/helm-values/staging.proxy-scram.yaml @@ -6,7 +6,7 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-staging.local/management/api/v2/healthz" + authEndpoint: "http://console-staging.local/management/api/v2" # -- Additional labels for zenith-proxy pods podLabels: From 4b1bd32e4a17fe6ecda43f3d8c67ce0726d37690 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Tue, 12 Apr 2022 01:04:02 +0300 Subject: [PATCH 0218/1022] Drop `Debug` impl for `ScramKey` and `ServerSecret` There's a notion that accidental misuse of those implementations might reveal authentication secrets. --- proxy/src/scram/exchange.rs | 3 --- proxy/src/scram/key.rs | 2 +- proxy/src/scram/secret.rs | 1 - 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 5a986b965a..802fe61db5 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -8,7 +8,6 @@ use super::signature::SignatureBuilder; use crate::sasl::{self, ChannelBinding, Error as SaslError}; /// The only channel binding mode we currently support. -#[derive(Debug)] struct TlsServerEndPoint; impl std::fmt::Display for TlsServerEndPoint { @@ -28,7 +27,6 @@ impl std::str::FromStr for TlsServerEndPoint { } } -#[derive(Debug)] enum ExchangeState { /// Waiting for [`ClientFirstMessage`]. Initial, @@ -41,7 +39,6 @@ enum ExchangeState { } /// Server's side of SCRAM auth algorithm. -#[derive(Debug)] pub struct Exchange<'a> { state: ExchangeState, secret: &'a ServerSecret, diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index 1c13471bc3..73dd5e1d5c 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -6,7 +6,7 @@ pub const SCRAM_KEY_LEN: usize = 32; /// One of the keys derived from the [password](super::password::SaltedPassword). /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. 
-#[derive(Default, Debug, PartialEq, Eq)] +#[derive(Default, PartialEq, Eq)] #[repr(transparent)] pub struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index e8d180bcdd..bf935d3510 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -5,7 +5,6 @@ use super::key::ScramKey; /// Server secret is produced from [password](super::password::SaltedPassword) /// and is used throughout the authentication process. -#[derive(Debug)] pub struct ServerSecret { /// Number of iterations for `PBKDF2` function. pub iterations: u32, From 9df8915b03e03135bf3f8f78fa00435c94aa3ccd Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Tue, 12 Apr 2022 01:12:07 +0300 Subject: [PATCH 0219/1022] [proxy] `sasl::Mechanism` may return `Output` during exchange This is needed to forward the `ClientKey` that's required to connect the proxy to a compute. Co-authored-by: bojanserafimov --- proxy/src/sasl.rs | 13 ++++++++++++- proxy/src/sasl/messages.rs | 1 + proxy/src/sasl/stream.rs | 13 +++++++++---- proxy/src/scram.rs | 4 ++-- proxy/src/scram/exchange.rs | 10 ++++++---- 5 files changed, 30 insertions(+), 11 deletions(-) diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index 70a4d9946a..cd9032bfb9 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -39,9 +39,20 @@ pub enum Error { /// A convenient result type for SASL exchange. pub type Result = std::result::Result; +/// A result of one SASL exchange. +pub enum Step { + /// We should continue exchanging messages. + Continue(T), + /// The client has been authenticated successfully. + Authenticated(R), +} + /// Every SASL mechanism (e.g. [SCRAM](crate::scram)) is expected to implement this trait. pub trait Mechanism: Sized { + /// What's produced as a result of successful authentication. + type Output; + /// Produce a server challenge to be sent to the client. /// This is how this method is called in PostgreSQL (`libpq/sasl.h`). - fn exchange(self, input: &str) -> Result<(Option, String)>; + fn exchange(self, input: &str) -> Result<(Step, String)>; } diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 58be6268fe..f48aee4f26 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -49,6 +49,7 @@ impl<'a> ServerMessage<&'a str> { }) } } + #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index 03649b8d11..0e782c5f29 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -51,18 +51,23 @@ impl SaslStream<'_, S> { impl SaslStream<'_, S> { /// Perform SASL message exchange according to the underlying algorithm /// until user is either authenticated or denied access. 
- pub async fn authenticate(mut self, mut mechanism: impl Mechanism) -> super::Result<()> { + pub async fn authenticate( + mut self, + mut mechanism: M, + ) -> super::Result { loop { let input = self.recv().await?; let (moved, reply) = mechanism.exchange(input)?; + + use super::Step::*; match moved { - Some(moved) => { + Continue(moved) => { self.send(&ServerMessage::Continue(&reply)).await?; mechanism = moved; } - None => { + Authenticated(result) => { self.send(&ServerMessage::Final(&reply)).await?; - return Ok(()); + return Ok(result); } } } diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 44671084ee..22fce7ac7e 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -13,10 +13,10 @@ mod password; mod secret; mod signature; -pub use secret::*; - pub use exchange::Exchange; +pub use key::ScramKey; pub use secret::ServerSecret; +pub use secret::*; use hmac::{Hmac, Mac}; use sha2::{Digest, Sha256}; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 802fe61db5..cad77e15f5 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -62,8 +62,10 @@ impl<'a> Exchange<'a> { } impl sasl::Mechanism for Exchange<'_> { - fn exchange(mut self, input: &str) -> sasl::Result<(Option, String)> { - use ExchangeState::*; + type Output = super::ScramKey; + + fn exchange(mut self, input: &str) -> sasl::Result<(sasl::Step, String)> { + use {sasl::Step::*, ExchangeState::*}; match &self.state { Initial => { let client_first_message = @@ -82,7 +84,7 @@ impl sasl::Mechanism for Exchange<'_> { server_first_message, }; - Ok((Some(self), msg)) + Ok((Continue(self), msg)) } SaltSent { cbind_flag, @@ -124,7 +126,7 @@ impl sasl::Mechanism for Exchange<'_> { let msg = client_final_message .build_server_final_message(signature_builder, &self.secret.server_key); - Ok((None, msg)) + Ok((Authenticated(client_key), msg)) } } } From af0195b60478bc82cbb7c95c1421b5ab4c3e752e Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 27 Apr 2022 13:34:59 +0300 Subject: [PATCH 0220/1022] [proxy] Introduce `cloud::Api` for communication with Neon Cloud * `cloud::legacy` talks to Cloud API V1. * `cloud::api` defines Cloud API v2. * `cloud::local` mocks the Cloud API V2 using a local postgres instance. * It's possible to choose between API versions using the `--api-version` flag. 
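As a rough illustration of the new interface (not taken from the patch; the generic parameters of the `Result` types are spelled out here where the quoted diff abbreviates them), a minimal implementation of the `cloud::Api` trait could look like the sketch below. The real `cloud::local` mock differs in that it talks to an actual local postgres instance.

    use async_trait::async_trait;

    // Hypothetical backend that serves a fixed compute node and stores no secrets.
    struct StaticBackend {
        host: String,
        port: u16,
    }

    #[async_trait]
    impl Api for StaticBackend {
        async fn get_auth_info(
            &self,
            creds: &auth::ClientCredentials,
        ) -> Result<AuthInfo, GetAuthInfoError> {
            // No secret store here, so report the credentials as unknown.
            Err(GetAuthInfoError::BadCredentials(creds.clone()))
        }

        async fn wake_compute(
            &self,
            creds: &auth::ClientCredentials,
        ) -> Result<DatabaseInfo, WakeComputeError> {
            // Nothing to wake up: the compute node is assumed to be running already.
            Ok(DatabaseInfo {
                host: self.host.clone(),
                port: self.port,
                dbname: creds.dbname.clone(),
                user: creds.user.clone(),
                password: None,
            })
        }
    }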
--- proxy/Cargo.toml | 2 +- proxy/src/auth.rs | 129 +++++++++++-------- proxy/src/auth/credentials.rs | 30 ++--- proxy/src/auth/flow.rs | 28 +--- proxy/src/cloud.rs | 46 +++++++ proxy/src/cloud/api.rs | 120 +++++++++++++++++ proxy/src/{cplane_api.rs => cloud/legacy.rs} | 65 +++------- proxy/src/cloud/local.rs | 76 +++++++++++ proxy/src/compute.rs | 63 +++------ proxy/src/config.rs | 84 +++++------- proxy/src/main.rs | 108 ++++++++-------- proxy/src/mgmt.rs | 8 +- proxy/src/proxy.rs | 4 +- proxy/src/scram.rs | 4 +- proxy/src/scram/key.rs | 4 + 15 files changed, 471 insertions(+), 300 deletions(-) create mode 100644 proxy/src/cloud.rs create mode 100644 proxy/src/cloud/api.rs rename proxy/src/{cplane_api.rs => cloud/legacy.rs} (81%) create mode 100644 proxy/src/cloud/local.rs diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index f7e872ceb9..73412609f3 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -5,6 +5,7 @@ edition = "2021" [dependencies] anyhow = "1.0" +async-trait = "0.1" base64 = "0.13.0" bytes = { version = "1.0.1", features = ['serde'] } clap = "3.0" @@ -37,7 +38,6 @@ metrics = { path = "../libs/metrics" } workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] -async-trait = "0.1" rcgen = "0.8.14" rstest = "0.12" tokio-postgres-rustls = "0.9.0" diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index c6d32040dc..5234dfc2c6 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,22 +1,16 @@ mod credentials; - -#[cfg(test)] mod flow; -use crate::compute::DatabaseInfo; -use crate::config::ProxyConfig; -use crate::cplane_api::{self, CPlaneApi}; +use crate::config::{CloudApi, ProxyConfig}; use crate::error::UserFacingError; use crate::stream::PqStream; -use crate::waiters; +use crate::{cloud, compute, waiters}; use std::io; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; pub use credentials::ClientCredentials; - -#[cfg(test)] pub use flow::*; /// Common authentication error. @@ -24,9 +18,14 @@ pub use flow::*; pub enum AuthErrorImpl { /// Authentication error reported by the console. 
#[error(transparent)] - Console(#[from] cplane_api::AuthError), + Console(#[from] cloud::AuthError), + + #[error(transparent)] + GetAuthInfo(#[from] cloud::api::GetAuthInfoError), + + #[error(transparent)] + WakeCompute(#[from] cloud::api::WakeComputeError), - #[cfg(test)] #[error(transparent)] Sasl(#[from] crate::sasl::Error), @@ -41,19 +40,19 @@ pub enum AuthErrorImpl { impl AuthErrorImpl { pub fn auth_failed(msg: impl Into) -> Self { - AuthErrorImpl::Console(cplane_api::AuthError::auth_failed(msg)) + AuthErrorImpl::Console(cloud::AuthError::auth_failed(msg)) } } impl From for AuthErrorImpl { fn from(e: waiters::RegisterError) -> Self { - AuthErrorImpl::Console(cplane_api::AuthError::from(e)) + AuthErrorImpl::Console(cloud::AuthError::from(e)) } } impl From for AuthErrorImpl { fn from(e: waiters::WaitError) -> Self { - AuthErrorImpl::Console(cplane_api::AuthError::from(e)) + AuthErrorImpl::Console(cloud::AuthError::from(e)) } } @@ -81,40 +80,28 @@ impl UserFacingError for AuthError { } } -async fn handle_static( - host: String, - port: u16, - client: &mut PqStream, - creds: ClientCredentials, -) -> Result { - client - .write_message(&Be::AuthenticationCleartextPassword) - .await?; - - // Read client's password bytes - let msg = client.read_password_message().await?; - let cleartext_password = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; - - let db_info = DatabaseInfo { - host, - port, - dbname: creds.dbname.clone(), - user: creds.user.clone(), - password: Some(cleartext_password.into()), - }; - - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; - - Ok(db_info) -} - -async fn handle_existing_user( +async fn handle_user( config: &ProxyConfig, client: &mut PqStream, creds: ClientCredentials, -) -> Result { +) -> Result { + if creds.is_existing_user() { + match &config.cloud_endpoint { + CloudApi::V1(api) => handle_existing_user_v1(api, client, creds).await, + CloudApi::V2(api) => handle_existing_user_v2(api.as_ref(), client, creds).await, + } + } else { + let redirect_uri = config.redirect_uri.as_ref(); + handle_new_user(redirect_uri, client).await + } +} + +/// Authenticate user via a legacy cloud API endpoint. +async fn handle_existing_user_v1( + cloud: &cloud::Legacy, + client: &mut PqStream, + creds: ClientCredentials, +) -> Result { let psql_session_id = new_psql_session_id(); let md5_salt = rand::random(); @@ -126,8 +113,7 @@ async fn handle_existing_user( let msg = client.read_password_message().await?; let md5_response = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; - let cplane = CPlaneApi::new(config.auth_endpoint.clone()); - let db_info = cplane + let db_info = cloud .authenticate_proxy_client(creds, md5_response, &md5_salt, &psql_session_id) .await?; @@ -135,17 +121,53 @@ async fn handle_existing_user( .write_message_noflush(&Be::AuthenticationOk)? .write_message_noflush(&BeParameterStatusMessage::encoding())?; - Ok(db_info) + Ok(compute::NodeInfo { + db_info, + scram_keys: None, + }) +} + +/// Authenticate user via a new cloud API endpoint which supports SCRAM. 
+async fn handle_existing_user_v2( + cloud: &(impl cloud::Api + ?Sized), + client: &mut PqStream, + creds: ClientCredentials, +) -> Result { + let auth_info = cloud.get_auth_info(&creds).await?; + + let flow = AuthFlow::new(client); + let scram_keys = match auth_info { + cloud::api::AuthInfo::Md5(_) => { + // TODO: decide if we should support MD5 in api v2 + return Err(AuthErrorImpl::auth_failed("MD5 is not supported").into()); + } + cloud::api::AuthInfo::Scram(secret) => { + let scram = Scram(&secret); + Some(compute::ScramKeys { + client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), + server_key: secret.server_key.as_bytes(), + }) + } + }; + + client + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())?; + + Ok(compute::NodeInfo { + db_info: cloud.wake_compute(&creds).await?, + scram_keys, + }) } async fn handle_new_user( - config: &ProxyConfig, + redirect_uri: &str, client: &mut PqStream, -) -> Result { +) -> Result { let psql_session_id = new_psql_session_id(); - let greeting = hello_message(&config.redirect_uri, &psql_session_id); + let greeting = hello_message(redirect_uri, &psql_session_id); - let db_info = cplane_api::with_waiter(psql_session_id, |waiter| async { + let db_info = cloud::with_waiter(psql_session_id, |waiter| async { // Give user a URL to spawn a new database client .write_message_noflush(&Be::AuthenticationOk)? @@ -160,7 +182,10 @@ async fn handle_new_user( client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; - Ok(db_info) + Ok(compute::NodeInfo { + db_info, + scram_keys: None, + }) } fn new_psql_session_id() -> String { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index c3bb6da4f8..a3d06b49a2 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,7 +1,7 @@ //! User credentials used in authentication. use super::AuthError; -use crate::compute::DatabaseInfo; +use crate::compute; use crate::config::ProxyConfig; use crate::error::UserFacingError; use crate::stream::PqStream; @@ -18,12 +18,20 @@ pub enum ClientCredsParseError { impl UserFacingError for ClientCredsParseError {} /// Various client credentials which we use for authentication. -#[derive(Debug, PartialEq, Eq)] +/// Note that we don't store any kind of client key or password here. +#[derive(Debug, Clone, PartialEq, Eq)] pub struct ClientCredentials { pub user: String, pub dbname: String, } +impl ClientCredentials { + pub fn is_existing_user(&self) -> bool { + // This logic will likely change in the future. 
+ self.user.ends_with("@zenith") + } +} + impl TryFrom> for ClientCredentials { type Error = ClientCredsParseError; @@ -47,20 +55,8 @@ impl ClientCredentials { self, config: &ProxyConfig, client: &mut PqStream, - ) -> Result { - use crate::config::ClientAuthMethod::*; - use crate::config::RouterConfig::*; - match &config.router_config { - Static { host, port } => super::handle_static(host.clone(), *port, client, self).await, - Dynamic(Mixed) => { - if self.user.ends_with("@zenith") { - super::handle_existing_user(config, client, self).await - } else { - super::handle_new_user(config, client).await - } - } - Dynamic(Password) => super::handle_existing_user(config, client, self).await, - Dynamic(Link) => super::handle_new_user(config, client).await, - } + ) -> Result { + // This method is just a convenient facade for `handle_user` + super::handle_user(config, client, self).await } } diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index bcfd94a9ed..3eed0f0a23 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -27,19 +27,6 @@ impl AuthMethod for Scram<'_> { } } -/// Use password-based auth in [`AuthFlow`]. -pub struct Md5( - /// Salt for client. - pub [u8; 4], -); - -impl AuthMethod for Md5 { - #[inline(always)] - fn first_message(&self) -> BeMessage<'_> { - Be::AuthenticationMD5Password(self.0) - } -} - /// This wrapper for [`PqStream`] performs client authentication. #[must_use] pub struct AuthFlow<'a, Stream, State> { @@ -70,19 +57,10 @@ impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { } } -/// Stream wrapper for handling simple MD5 password auth. -impl AuthFlow<'_, S, Md5> { - /// Perform user authentication. Raise an error in case authentication failed. - #[allow(unused)] - pub async fn authenticate(self) -> Result<(), AuthError> { - unimplemented!("MD5 auth flow is yet to be implemented"); - } -} - /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn authenticate(self) -> Result<(), AuthError> { + pub async fn authenticate(self) -> Result { // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; @@ -93,10 +71,10 @@ impl AuthFlow<'_, S, Scram<'_>> { } let secret = self.state.0; - sasl::SaslStream::new(self.stream, sasl.message) + let key = sasl::SaslStream::new(self.stream, sasl.message) .authenticate(scram::Exchange::new(secret, rand::random, None)) .await?; - Ok(()) + Ok(key) } } diff --git a/proxy/src/cloud.rs b/proxy/src/cloud.rs new file mode 100644 index 0000000000..679cfb97e1 --- /dev/null +++ b/proxy/src/cloud.rs @@ -0,0 +1,46 @@ +mod local; + +mod legacy; +pub use legacy::{AuthError, AuthErrorImpl, Legacy}; + +pub mod api; +pub use api::{Api, BoxedApi}; + +use crate::mgmt; +use crate::waiters::{self, Waiter, Waiters}; +use lazy_static::lazy_static; + +lazy_static! { + static ref CPLANE_WAITERS: Waiters = Default::default(); +} + +/// Give caller an opportunity to wait for the cloud's reply. 
+pub async fn with_waiter( + psql_session_id: impl Into, + action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, +) -> Result +where + R: std::future::Future>, + E: From, +{ + let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; + action(waiter).await +} + +pub fn notify(psql_session_id: &str, msg: mgmt::ComputeReady) -> Result<(), waiters::NotifyError> { + CPLANE_WAITERS.notify(psql_session_id, msg) +} + +/// Construct a new opaque cloud API provider. +pub fn new(url: reqwest::Url) -> anyhow::Result { + Ok(match url.scheme() { + "https" | "http" => { + todo!("build a real cloud wrapper") + } + "postgresql" | "postgres" | "pg" => { + // Just point to a local running postgres instance. + Box::new(local::Local { url }) + } + other => anyhow::bail!("unsupported url scheme: {other}"), + }) +} diff --git a/proxy/src/cloud/api.rs b/proxy/src/cloud/api.rs new file mode 100644 index 0000000000..713140c1e6 --- /dev/null +++ b/proxy/src/cloud/api.rs @@ -0,0 +1,120 @@ +//! Declaration of Cloud API V2. + +use crate::{auth, scram}; +use async_trait::async_trait; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum GetAuthInfoError { + // We shouldn't include the actual secret here. + #[error("Bad authentication secret")] + BadSecret, + + #[error("Bad client credentials: {0:?}")] + BadCredentials(crate::auth::ClientCredentials), + + #[error(transparent)] + Io(#[from] std::io::Error), +} + +// TODO: convert to an enum and describe possible sub-errors (see above) +#[derive(Debug, Error)] +#[error("Failed to wake up the compute node")] +pub struct WakeComputeError; + +/// Opaque implementation of Cloud API. +pub type BoxedApi = Box; + +/// Cloud API methods required by the proxy. +#[async_trait] +pub trait Api { + /// Get authentication information for the given user. + async fn get_auth_info( + &self, + creds: &auth::ClientCredentials, + ) -> Result; + + /// Wake up the compute node and return the corresponding connection info. + async fn wake_compute( + &self, + creds: &auth::ClientCredentials, + ) -> Result; +} + +/// Auth secret which is managed by the cloud. +pub enum AuthInfo { + /// Md5 hash of user's password. + Md5([u8; 16]), + /// [SCRAM](crate::scram) authentication info. + Scram(scram::ServerSecret), +} + +/// Compute node connection params provided by the cloud. +/// Note how it implements serde traits, since we receive it over the wire. +#[derive(Serialize, Deserialize, Default)] +pub struct DatabaseInfo { + pub host: String, + pub port: u16, + pub dbname: String, + pub user: String, + + /// [Cloud API V1](super::legacy) returns cleartext password, + /// but [Cloud API V2](super::api) implements [SCRAM](crate::scram) + /// authentication, so we can leverage this method and cope without password. + pub password: Option, +} + +// Manually implement debug to omit personal and sensitive info. 
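// For example, `format!("{:?}", db_info)` yields `DatabaseInfo { host: "localhost", port: 5432 }`
// (illustrative values); dbname, user and password are intentionally not printed.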
+impl std::fmt::Debug for DatabaseInfo { + fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + fmt.debug_struct("DatabaseInfo") + .field("host", &self.host) + .field("port", &self.port) + .finish() + } +} + +impl From for tokio_postgres::Config { + fn from(db_info: DatabaseInfo) -> Self { + let mut config = tokio_postgres::Config::new(); + + config + .host(&db_info.host) + .port(db_info.port) + .dbname(&db_info.dbname) + .user(&db_info.user); + + if let Some(password) = db_info.password { + config.password(password); + } + + config + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn parse_db_info() -> anyhow::Result<()> { + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + }))?; + + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + }))?; + + Ok(()) + } +} diff --git a/proxy/src/cplane_api.rs b/proxy/src/cloud/legacy.rs similarity index 81% rename from proxy/src/cplane_api.rs rename to proxy/src/cloud/legacy.rs index 21fce79df3..7d99995f1a 100644 --- a/proxy/src/cplane_api.rs +++ b/proxy/src/cloud/legacy.rs @@ -1,42 +1,19 @@ +//! Cloud API V1. + +use super::api::DatabaseInfo; use crate::auth::ClientCredentials; -use crate::compute::DatabaseInfo; use crate::error::UserFacingError; -use crate::mgmt; -use crate::waiters::{self, Waiter, Waiters}; -use lazy_static::lazy_static; +use crate::waiters; use serde::{Deserialize, Serialize}; use thiserror::Error; -lazy_static! { - static ref CPLANE_WAITERS: Waiters = Default::default(); -} - -/// Give caller an opportunity to wait for cplane's reply. -pub async fn with_waiter( - psql_session_id: impl Into, - action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, -) -> Result -where - R: std::future::Future>, - E: From, -{ - let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; - action(waiter).await -} - -pub fn notify( - psql_session_id: &str, - msg: Result, -) -> Result<(), waiters::NotifyError> { - CPLANE_WAITERS.notify(psql_session_id, msg) -} - -/// Zenith console API wrapper. -pub struct CPlaneApi { +/// Neon cloud API provider. +pub struct Legacy { auth_endpoint: reqwest::Url, } -impl CPlaneApi { +impl Legacy { + /// Construct a new legacy cloud API provider. pub fn new(auth_endpoint: reqwest::Url) -> Self { Self { auth_endpoint } } @@ -95,7 +72,17 @@ impl UserFacingError for AuthError { } } -impl CPlaneApi { +// NOTE: the order of constructors is important. +// https://serde.rs/enum-representations.html#untagged +#[derive(Serialize, Deserialize, Debug)] +#[serde(untagged)] +enum ProxyAuthResponse { + Ready { conn_info: DatabaseInfo }, + Error { error: String }, + NotReady { ready: bool }, // TODO: get rid of `ready` +} + +impl Legacy { pub async fn authenticate_proxy_client( &self, creds: ClientCredentials, @@ -111,8 +98,8 @@ impl CPlaneApi { .append_pair("salt", &hex::encode(salt)) .append_pair("psql_session_id", psql_session_id); - with_waiter(psql_session_id, |waiter| async { - println!("cplane request: {}", url); + super::with_waiter(psql_session_id, |waiter| async { + println!("cloud request: {}", url); // TODO: leverage `reqwest::Client` to reuse connections let resp = reqwest::get(url).await?; if !resp.status().is_success() { @@ -135,16 +122,6 @@ impl CPlaneApi { } } -// NOTE: the order of constructors is important. 
-// https://serde.rs/enum-representations.html#untagged -#[derive(Serialize, Deserialize, Debug)] -#[serde(untagged)] -enum ProxyAuthResponse { - Ready { conn_info: DatabaseInfo }, - Error { error: String }, - NotReady { ready: bool }, // TODO: get rid of `ready` -} - #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/cloud/local.rs b/proxy/src/cloud/local.rs new file mode 100644 index 0000000000..88eda6630c --- /dev/null +++ b/proxy/src/cloud/local.rs @@ -0,0 +1,76 @@ +//! Local mock of Cloud API V2. + +use super::api::{self, Api, AuthInfo, DatabaseInfo}; +use crate::auth::ClientCredentials; +use crate::scram; +use async_trait::async_trait; + +/// Mocked cloud for testing purposes. +pub struct Local { + /// Database url, e.g. `postgres://user:password@localhost:5432/database`. + pub url: reqwest::Url, +} + +#[async_trait] +impl Api for Local { + async fn get_auth_info( + &self, + creds: &ClientCredentials, + ) -> Result { + // We wrap `tokio_postgres::Error` because we don't want to infect the + // method's error type with a detail that's specific to debug mode only. + let io_error = |e| std::io::Error::new(std::io::ErrorKind::Other, e); + + // Perhaps we could persist this connection, but then we'd have to + // write more code for reopening it if it got closed, which doesn't + // seem worth it. + let (client, connection) = + tokio_postgres::connect(self.url.as_str(), tokio_postgres::NoTls) + .await + .map_err(io_error)?; + + tokio::spawn(connection); + let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; + let rows = client + .query(query, &[&creds.user]) + .await + .map_err(io_error)?; + + match &rows[..] { + // We can't get a secret if there's no such user. + [] => Err(api::GetAuthInfoError::BadCredentials(creds.to_owned())), + // We shouldn't get more than one row anyway. + [row, ..] => { + let entry = row.try_get(0).map_err(io_error)?; + scram::ServerSecret::parse(entry) + .map(AuthInfo::Scram) + .or_else(|| { + // It could be an md5 hash if it's not a SCRAM secret. + let text = entry.strip_prefix("md5")?; + Some(AuthInfo::Md5({ + let mut bytes = [0u8; 16]; + hex::decode_to_slice(text, &mut bytes).ok()?; + bytes + })) + }) + // Putting the secret into this message is a security hazard! + .ok_or(api::GetAuthInfoError::BadSecret) + } + } + } + + async fn wake_compute( + &self, + creds: &ClientCredentials, + ) -> Result { + // Local setup doesn't have a dedicated compute node, + // so we just return the local database we're pointed at. + Ok(DatabaseInfo { + host: self.url.host_str().unwrap_or("localhost").to_owned(), + port: self.url.port().unwrap_or(5432), + dbname: creds.dbname.to_owned(), + user: creds.user.to_owned(), + password: None, + }) + } +} diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 3c0eee29bc..9949e91ea2 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,6 +1,6 @@ use crate::cancellation::CancelClosure; +use crate::cloud::api::DatabaseInfo; use crate::error::UserFacingError; -use serde::{Deserialize, Serialize}; use std::io; use std::net::SocketAddr; use thiserror::Error; @@ -23,32 +23,21 @@ pub enum ConnectionError { impl UserFacingError for ConnectionError {} -/// Compute node connection params. 
-#[derive(Serialize, Deserialize, Default)] -pub struct DatabaseInfo { - pub host: String, - pub port: u16, - pub dbname: String, - pub user: String, - pub password: Option, -} - -// Manually implement debug to omit personal and sensitive info -impl std::fmt::Debug for DatabaseInfo { - fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { - fmt.debug_struct("DatabaseInfo") - .field("host", &self.host) - .field("port", &self.port) - .finish() - } -} - /// PostgreSQL version as [`String`]. pub type Version = String; -impl DatabaseInfo { +/// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. +pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; + +/// Compute node connection params. +pub struct NodeInfo { + pub db_info: DatabaseInfo, + pub scram_keys: Option, +} + +impl NodeInfo { async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { - let host_port = format!("{}:{}", self.host, self.port); + let host_port = format!("{}:{}", self.db_info.host, self.db_info.port); let socket = TcpStream::connect(host_port).await?; let socket_addr = socket.peer_addr()?; socket2::SockRef::from(&socket).set_keepalive(true)?; @@ -63,11 +52,13 @@ impl DatabaseInfo { .await .map_err(|_| ConnectionError::FailedToConnectToCompute)?; - // TODO: establish a secure connection to the DB - let (client, conn) = tokio_postgres::Config::from(self) - .connect_raw(&mut socket, NoTls) - .await?; + let mut config = tokio_postgres::Config::from(self.db_info); + if let Some(scram_keys) = self.scram_keys { + config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(scram_keys)); + } + // TODO: establish a secure connection to the DB + let (client, conn) = config.connect_raw(&mut socket, NoTls).await?; let version = conn .parameter("server_version") .ok_or(ConnectionError::FailedToFetchPgVersion)? @@ -78,21 +69,3 @@ impl DatabaseInfo { Ok((socket, version, cancel_closure)) } } - -impl From for tokio_postgres::Config { - fn from(db_info: DatabaseInfo) -> Self { - let mut config = tokio_postgres::Config::new(); - - config - .host(&db_info.host) - .port(db_info.port) - .dbname(&db_info.dbname) - .user(&db_info.user); - - if let Some(password) = db_info.password { - config.password(password); - } - - config - } -} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index aef079d089..6b30df604d 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,65 +1,43 @@ +use crate::cloud; use anyhow::{bail, ensure, Context}; -use std::net::SocketAddr; -use std::str::FromStr; use std::sync::Arc; -pub type TlsConfig = Arc; - -#[non_exhaustive] -pub enum ClientAuthMethod { - Password, - Link, - - /// Use password auth only if username ends with "@zenith" - Mixed, -} - -pub enum RouterConfig { - Static { host: String, port: u16 }, - Dynamic(ClientAuthMethod), -} - -impl FromStr for ClientAuthMethod { - type Err = anyhow::Error; - - fn from_str(s: &str) -> anyhow::Result { - use ClientAuthMethod::*; - match s { - "password" => Ok(Password), - "link" => Ok(Link), - "mixed" => Ok(Mixed), - _ => bail!("Invalid option for router: `{}`", s), - } - } -} - pub struct ProxyConfig { - /// main entrypoint for users to connect to - pub proxy_address: SocketAddr, + /// Unauthenticated users will be redirected to this URL. + pub redirect_uri: reqwest::Url, - /// method of assigning compute nodes - pub router_config: RouterConfig, - - /// internally used for status and prometheus metrics - pub http_address: SocketAddr, - - /// management endpoint. 
Upon user account creation control plane - /// will notify us here, so that we can 'unfreeze' user session. - /// TODO It uses postgres protocol over TCP but should be migrated to http. - pub mgmt_address: SocketAddr, - - /// send unauthenticated users to this URI - pub redirect_uri: String, - - /// control plane address where we would check auth. - pub auth_endpoint: reqwest::Url, + /// Cloud API endpoint for user authentication. + pub cloud_endpoint: CloudApi, + /// TLS configuration for the proxy. pub tls_config: Option, } -pub fn configure_ssl(key_path: &str, cert_path: &str) -> anyhow::Result { +/// Cloud API configuration. +pub enum CloudApi { + /// We'll drop this one when [`CloudApi::V2`] is stable. + V1(crate::cloud::Legacy), + /// The new version of the cloud API. + V2(crate::cloud::BoxedApi), +} + +impl CloudApi { + /// Configure Cloud API provider. + pub fn new(version: &str, url: reqwest::Url) -> anyhow::Result { + Ok(match version { + "v1" => Self::V1(cloud::Legacy::new(url)), + "v2" => Self::V2(cloud::new(url)?), + _ => bail!("unknown cloud API version: {}", version), + }) + } +} + +pub type TlsConfig = Arc; + +/// Configure TLS for the main endpoint. +pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result { let key = { - let key_bytes = std::fs::read(key_path).context("SSL key file")?; + let key_bytes = std::fs::read(key_path).context("TLS key file")?; let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) .context("couldn't read TLS keys")?; @@ -68,7 +46,7 @@ pub fn configure_ssl(key_path: &str, cert_path: &str) -> anyhow::Result>` into `Result`. async fn flatten_err( f: impl Future, JoinError>>, @@ -44,7 +37,7 @@ async fn flatten_err( #[tokio::main] async fn main() -> anyhow::Result<()> { metrics::set_common_metrics_prefix("zenith_proxy"); - let arg_matches = App::new("Zenith proxy/router") + let arg_matches = App::new("Neon proxy/router") .version(GIT_VERSION) .arg( Arg::new("proxy") @@ -97,77 +90,80 @@ async fn main() -> anyhow::Result<()> { .short('a') .long("auth-endpoint") .takes_value(true) - .help("API endpoint for authenticating users") + .help("cloud API endpoint for authenticating users") .default_value("http://localhost:3000/authenticate_proxy_request/"), ) .arg( - Arg::new("ssl-key") - .short('k') - .long("ssl-key") + Arg::new("api-version") + .long("api-version") .takes_value(true) - .help("path to SSL key for client postgres connections"), + .default_value("v1") + .possible_values(["v1", "v2"]) + .help("cloud API version to be used for authentication"), ) .arg( - Arg::new("ssl-cert") - .short('c') - .long("ssl-cert") + Arg::new("tls-key") + .short('k') + .long("tls-key") + .alias("ssl-key") // backwards compatibility .takes_value(true) - .help("path to SSL cert for client postgres connections"), + .help("path to TLS key for client postgres connections"), + ) + .arg( + Arg::new("tls-cert") + .short('c') + .long("tls-cert") + .alias("ssl-cert") // backwards compatibility + .takes_value(true) + .help("path to TLS cert for client postgres connections"), ) .get_matches(); let tls_config = match ( - arg_matches.value_of("ssl-key"), - arg_matches.value_of("ssl-cert"), + arg_matches.value_of("tls-key"), + arg_matches.value_of("tls-cert"), ) { - (Some(key_path), Some(cert_path)) => Some(config::configure_ssl(key_path, cert_path)?), + (Some(key_path), Some(cert_path)) => Some(config::configure_tls(key_path, cert_path)?), (None, None) => None, - _ => bail!("either both or neither ssl-key and ssl-cert must be specified"), + _ => 
bail!("either both or neither tls-key and tls-cert must be specified"), }; - let auth_method = arg_matches.value_of("auth-method").unwrap().parse()?; - let router_config = match arg_matches.value_of("static-router") { - None => RouterConfig::Dynamic(auth_method), - Some(addr) => { - if let ClientAuthMethod::Password = auth_method { - let (host, port) = addr.split_once(':').unwrap(); - RouterConfig::Static { - host: host.to_string(), - port: port.parse().unwrap(), - } - } else { - bail!("static-router requires --auth-method password") - } - } - }; + let proxy_address: SocketAddr = arg_matches.value_of("proxy").unwrap().parse()?; + let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?; + let http_address: SocketAddr = arg_matches.value_of("http").unwrap().parse()?; + + let cloud_endpoint = config::CloudApi::new( + arg_matches.value_of("api-version").unwrap(), + arg_matches.value_of("auth-endpoint").unwrap().parse()?, + )?; let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { - router_config, - proxy_address: arg_matches.value_of("proxy").unwrap().parse()?, - mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?, - http_address: arg_matches.value_of("http").unwrap().parse()?, redirect_uri: arg_matches.value_of("uri").unwrap().parse()?, - auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, + cloud_endpoint, tls_config, })); println!("Version: {}", GIT_VERSION); // Check that we can bind to address before further initialization - println!("Starting http on {}", config.http_address); - let http_listener = TcpListener::bind(config.http_address).await?.into_std()?; + println!("Starting http on {}", http_address); + let http_listener = TcpListener::bind(http_address).await?.into_std()?; - println!("Starting mgmt on {}", config.mgmt_address); - let mgmt_listener = TcpListener::bind(config.mgmt_address).await?.into_std()?; + println!("Starting mgmt on {}", mgmt_address); + let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?; - println!("Starting proxy on {}", config.proxy_address); - let proxy_listener = TcpListener::bind(config.proxy_address).await?; + println!("Starting proxy on {}", proxy_address); + let proxy_listener = TcpListener::bind(proxy_address).await?; - let http = tokio::spawn(http::thread_main(http_listener)); - let proxy = tokio::spawn(proxy::thread_main(config, proxy_listener)); - let mgmt = tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)); + let tasks = [ + tokio::spawn(http::thread_main(http_listener)), + tokio::spawn(proxy::thread_main(config, proxy_listener)), + tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)), + ] + .map(flatten_err); - let tasks = [flatten_err(http), flatten_err(proxy), flatten_err(mgmt)]; + // This will block until all tasks have completed. + // Furthermore, the first one to fail will cancel the rest. let _: Vec<()> = futures::future::try_join_all(tasks).await?; Ok(()) diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 23ad8a2013..c48df653d3 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -1,4 +1,4 @@ -use crate::{compute::DatabaseInfo, cplane_api}; +use crate::cloud; use anyhow::Context; use serde::Deserialize; use std::{ @@ -75,12 +75,12 @@ struct PsqlSessionResponse { #[derive(Deserialize)] enum PsqlSessionResult { - Success(DatabaseInfo), + Success(cloud::api::DatabaseInfo), Failure(String), } /// A message received by `mgmt` when a compute node is ready. 
-pub type ComputeReady = Result; +pub type ComputeReady = Result; impl PsqlSessionResult { fn into_compute_ready(self) -> ComputeReady { @@ -111,7 +111,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::R let resp: PsqlSessionResponse = serde_json::from_str(query_string)?; - match cplane_api::notify(&resp.session_id, resp.result.into_compute_ready()) { + match cloud::notify(&resp.session_id, resp.result.into_compute_ready()) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index f7de1618df..4bce2bf40d 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -185,10 +185,10 @@ impl Client { // Authenticate and connect to a compute node. let auth = creds.authenticate(config, &mut stream).await; - let db_info = async { auth }.or_else(|e| stream.throw_error(e)).await?; + let node = async { auth }.or_else(|e| stream.throw_error(e)).await?; let (db, version, cancel_closure) = - db_info.connect().or_else(|e| stream.throw_error(e)).await?; + node.connect().or_else(|e| stream.throw_error(e)).await?; let cancel_key_data = session.enable_cancellation(cancel_closure); stream diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 22fce7ac7e..7cc4191435 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -9,10 +9,12 @@ mod exchange; mod key; mod messages; -mod password; mod secret; mod signature; +#[cfg(test)] +mod password; + pub use exchange::Exchange; pub use key::ScramKey; pub use secret::ServerSecret; diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index 73dd5e1d5c..e9c65fcef3 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -16,6 +16,10 @@ impl ScramKey { pub fn sha256(&self) -> Self { super::sha256([self.as_ref()]).into() } + + pub fn as_bytes(&self) -> [u8; SCRAM_KEY_LEN] { + self.bytes + } } impl From<[u8; SCRAM_KEY_LEN]> for ScramKey { From 0323bb58701767b8ce5c816637ba316166f6fb41 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Sat, 30 Apr 2022 00:58:57 +0300 Subject: [PATCH 0221/1022] [proxy] Refactor cplane API and add new console SCRAM auth API Now proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. Following backends are currently implemented: * legacy old method, when username ends with `@zenith` it uses md5 auth dbname as the cluster name; otherwise, it sends a login link and waits for the console to call back * console new SCRAM-based console API; uses SNI info to select the destination cluster * postgres uses postgres to select auth secrets of existing roles. 
Useful for local testing * link sends login link for all usernames --- .gitignore | 3 + Cargo.lock | 1 + proxy/Cargo.toml | 1 + proxy/README.md | 33 ++++ proxy/src/auth.rs | 159 +++------------ proxy/src/auth/credentials.rs | 12 +- proxy/src/{cloud.rs => auth_backend.rs} | 25 +-- proxy/src/auth_backend/console.rs | 236 +++++++++++++++++++++++ proxy/src/auth_backend/legacy_console.rs | 206 ++++++++++++++++++++ proxy/src/auth_backend/link.rs | 52 +++++ proxy/src/auth_backend/postgres.rs | 93 +++++++++ proxy/src/cloud/api.rs | 120 ------------ proxy/src/cloud/legacy.rs | 160 --------------- proxy/src/cloud/local.rs | 76 -------- proxy/src/compute.rs | 2 +- proxy/src/config.rs | 56 +++--- proxy/src/main.rs | 37 +--- proxy/src/mgmt.rs | 10 +- proxy/src/proxy.rs | 6 +- proxy/src/scram/secret.rs | 1 + test_runner/fixtures/zenith_fixtures.py | 11 +- 21 files changed, 722 insertions(+), 578 deletions(-) create mode 100644 proxy/README.md rename proxy/src/{cloud.rs => auth_backend.rs} (56%) create mode 100644 proxy/src/auth_backend/console.rs create mode 100644 proxy/src/auth_backend/legacy_console.rs create mode 100644 proxy/src/auth_backend/link.rs create mode 100644 proxy/src/auth_backend/postgres.rs delete mode 100644 proxy/src/cloud/api.rs delete mode 100644 proxy/src/cloud/legacy.rs delete mode 100644 proxy/src/cloud/local.rs diff --git a/.gitignore b/.gitignore index 2ecdaa2053..adb1b41503 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,6 @@ test_output/ # Coverage *.profraw *.profdata + +*.key +*.crt diff --git a/Cargo.lock b/Cargo.lock index 58125ca41c..2c081e8beb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2040,6 +2040,7 @@ dependencies = [ "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls", + "url", "utils", "workspace_hack", ] diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 73412609f3..43880d645a 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -32,6 +32,7 @@ thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-rustls = "0.23.0" +url = "2.2.2" utils = { path = "../libs/utils" } metrics = { path = "../libs/metrics" } diff --git a/proxy/README.md b/proxy/README.md new file mode 100644 index 0000000000..458a7d9bbf --- /dev/null +++ b/proxy/README.md @@ -0,0 +1,33 @@ +# Proxy + +Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. Following backends are currently implemented: + +* legacy + old method, when username ends with `@zenith` it uses md5 auth dbname as the cluster name; otherwise, it sends a login link and waits for the console to call back +* console + new SCRAM-based console API; uses SNI info to select the destination cluster +* postgres + uses postgres to select auth secrets of existing roles. Useful for local testing +* link + sends login link for all usernames + +## Using SNI-based routing on localhost + +Now proxy determines cluster name from the subdomain, request to the `my-cluster-42.somedomain.tld` will be routed to the cluster named `my-cluster-42`. Unfortunately `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. 
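To make the routing rule concrete, here is a minimal sketch of deriving the cluster name from the left-most DNS label of the SNI host name (the helper below is an illustration under that assumption, not the actual proxy code):

```
/// Hypothetical helper: "my-cluster-42.localtest.me" -> "my-cluster-42".
fn cluster_name_from_sni(sni_host: &str) -> Option<&str> {
    // Take the left-most DNS label; reject names without a dot or with empty parts.
    let (label, rest) = sni_host.split_once('.')?;
    if label.is_empty() || rest.is_empty() {
        return None;
    }
    Some(label)
}

fn main() {
    assert_eq!(
        cluster_name_from_sni("my-cluster-42.localtest.me"),
        Some("my-cluster-42")
    );
    assert_eq!(cluster_name_from_sni("localhost"), None);
}
```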
Now we can create self-signed certificate and play with proxy: + +``` +openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" + +``` + +now you can start proxy: + +``` +./target/debug/proxy -c server.crt -k server.key +``` + +and connect to it: + +``` +PGSSLROOTCERT=./server.crt psql 'postgres://my-cluster-42.localtest.me:1234?sslmode=verify-full' +``` diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 5234dfc2c6..d4e21d78a0 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,14 +1,14 @@ mod credentials; mod flow; -use crate::config::{CloudApi, ProxyConfig}; +use crate::auth_backend::{console, legacy_console, link, postgres}; +use crate::config::{AuthBackendType, ProxyConfig}; use crate::error::UserFacingError; use crate::stream::PqStream; -use crate::{cloud, compute, waiters}; +use crate::{auth_backend, compute, waiters}; use std::io; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; pub use credentials::ClientCredentials; pub use flow::*; @@ -18,13 +18,10 @@ pub use flow::*; pub enum AuthErrorImpl { /// Authentication error reported by the console. #[error(transparent)] - Console(#[from] cloud::AuthError), + Console(#[from] auth_backend::AuthError), #[error(transparent)] - GetAuthInfo(#[from] cloud::api::GetAuthInfoError), - - #[error(transparent)] - WakeCompute(#[from] cloud::api::WakeComputeError), + GetAuthInfo(#[from] auth_backend::console::ConsoleAuthError), #[error(transparent)] Sasl(#[from] crate::sasl::Error), @@ -40,19 +37,19 @@ pub enum AuthErrorImpl { impl AuthErrorImpl { pub fn auth_failed(msg: impl Into) -> Self { - AuthErrorImpl::Console(cloud::AuthError::auth_failed(msg)) + AuthErrorImpl::Console(auth_backend::AuthError::auth_failed(msg)) } } impl From for AuthErrorImpl { fn from(e: waiters::RegisterError) -> Self { - AuthErrorImpl::Console(cloud::AuthError::from(e)) + AuthErrorImpl::Console(auth_backend::AuthError::from(e)) } } impl From for AuthErrorImpl { fn from(e: waiters::WaitError) -> Self { - AuthErrorImpl::Console(cloud::AuthError::from(e)) + AuthErrorImpl::Console(auth_backend::AuthError::from(e)) } } @@ -82,131 +79,25 @@ impl UserFacingError for AuthError { async fn handle_user( config: &ProxyConfig, - client: &mut PqStream, + client: &mut PqStream, creds: ClientCredentials, ) -> Result { - if creds.is_existing_user() { - match &config.cloud_endpoint { - CloudApi::V1(api) => handle_existing_user_v1(api, client, creds).await, - CloudApi::V2(api) => handle_existing_user_v2(api.as_ref(), client, creds).await, + match config.auth_backend { + AuthBackendType::LegacyConsole => { + legacy_console::handle_user( + &config.auth_endpoint, + &config.auth_link_uri, + client, + &creds, + ) + .await } - } else { - let redirect_uri = config.redirect_uri.as_ref(); - handle_new_user(redirect_uri, client).await + AuthBackendType::Console => { + console::handle_user(config.auth_endpoint.as_ref(), client, &creds).await + } + AuthBackendType::Postgres => { + postgres::handle_user(&config.auth_endpoint, client, &creds).await + } + AuthBackendType::Link => link::handle_user(config.auth_link_uri.as_ref(), client).await, } } - -/// Authenticate user via a legacy cloud API endpoint. 
-async fn handle_existing_user_v1( - cloud: &cloud::Legacy, - client: &mut PqStream, - creds: ClientCredentials, -) -> Result { - let psql_session_id = new_psql_session_id(); - let md5_salt = rand::random(); - - client - .write_message(&Be::AuthenticationMD5Password(md5_salt)) - .await?; - - // Read client's password hash - let msg = client.read_password_message().await?; - let md5_response = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; - - let db_info = cloud - .authenticate_proxy_client(creds, md5_response, &md5_salt, &psql_session_id) - .await?; - - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; - - Ok(compute::NodeInfo { - db_info, - scram_keys: None, - }) -} - -/// Authenticate user via a new cloud API endpoint which supports SCRAM. -async fn handle_existing_user_v2( - cloud: &(impl cloud::Api + ?Sized), - client: &mut PqStream, - creds: ClientCredentials, -) -> Result { - let auth_info = cloud.get_auth_info(&creds).await?; - - let flow = AuthFlow::new(client); - let scram_keys = match auth_info { - cloud::api::AuthInfo::Md5(_) => { - // TODO: decide if we should support MD5 in api v2 - return Err(AuthErrorImpl::auth_failed("MD5 is not supported").into()); - } - cloud::api::AuthInfo::Scram(secret) => { - let scram = Scram(&secret); - Some(compute::ScramKeys { - client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), - server_key: secret.server_key.as_bytes(), - }) - } - }; - - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; - - Ok(compute::NodeInfo { - db_info: cloud.wake_compute(&creds).await?, - scram_keys, - }) -} - -async fn handle_new_user( - redirect_uri: &str, - client: &mut PqStream, -) -> Result { - let psql_session_id = new_psql_session_id(); - let greeting = hello_message(redirect_uri, &psql_session_id); - - let db_info = cloud::with_waiter(psql_session_id, |waiter| async { - // Give user a URL to spawn a new database - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? - .write_message(&Be::NoticeResponse(&greeting)) - .await?; - - // Wait for web console response (see `mgmt`) - waiter.await?.map_err(AuthErrorImpl::auth_failed) - }) - .await?; - - client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; - - Ok(compute::NodeInfo { - db_info, - scram_keys: None, - }) -} - -fn new_psql_session_id() -> String { - hex::encode(rand::random::<[u8; 8]>()) -} - -fn parse_password(bytes: &[u8]) -> Option<&str> { - std::str::from_utf8(bytes).ok()?.strip_suffix('\0') -} - -fn hello_message(redirect_uri: &str, session_id: &str) -> String { - format!( - concat![ - "☀️ Welcome to Neon!\n", - "To proceed with database creation, open the following link:\n\n", - " {redirect_uri}{session_id}\n\n", - "It needs to be done once and we will send you '.pgpass' file,\n", - "which will allow you to access or create ", - "databases without opening your web browser." - ], - redirect_uri = redirect_uri, - session_id = session_id, - ) -} diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index a3d06b49a2..88677de511 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -23,6 +23,10 @@ impl UserFacingError for ClientCredsParseError {} pub struct ClientCredentials { pub user: String, pub dbname: String, + + // New console API requires SNI info to determine cluster name. 
+ // Other Auth backends don't need it. + pub sni_cluster: Option, } impl ClientCredentials { @@ -45,7 +49,11 @@ impl TryFrom> for ClientCredentials { let user = get_param("user")?; let db = get_param("database")?; - Ok(Self { user, dbname: db }) + Ok(Self { + user, + dbname: db, + sni_cluster: None, + }) } } @@ -54,7 +62,7 @@ impl ClientCredentials { pub async fn authenticate( self, config: &ProxyConfig, - client: &mut PqStream, + client: &mut PqStream, ) -> Result { // This method is just a convenient facade for `handle_user` super::handle_user(config, client, self).await diff --git a/proxy/src/cloud.rs b/proxy/src/auth_backend.rs similarity index 56% rename from proxy/src/cloud.rs rename to proxy/src/auth_backend.rs index 679cfb97e1..54362bf719 100644 --- a/proxy/src/cloud.rs +++ b/proxy/src/auth_backend.rs @@ -1,10 +1,9 @@ -mod local; +pub mod console; +pub mod legacy_console; +pub mod link; +pub mod postgres; -mod legacy; -pub use legacy::{AuthError, AuthErrorImpl, Legacy}; - -pub mod api; -pub use api::{Api, BoxedApi}; +pub use legacy_console::{AuthError, AuthErrorImpl}; use crate::mgmt; use crate::waiters::{self, Waiter, Waiters}; @@ -30,17 +29,3 @@ where pub fn notify(psql_session_id: &str, msg: mgmt::ComputeReady) -> Result<(), waiters::NotifyError> { CPLANE_WAITERS.notify(psql_session_id, msg) } - -/// Construct a new opaque cloud API provider. -pub fn new(url: reqwest::Url) -> anyhow::Result { - Ok(match url.scheme() { - "https" | "http" => { - todo!("build a real cloud wrapper") - } - "postgresql" | "postgres" | "pg" => { - // Just point to a local running postgres instance. - Box::new(local::Local { url }) - } - other => anyhow::bail!("unsupported url scheme: {other}"), - }) -} diff --git a/proxy/src/auth_backend/console.rs b/proxy/src/auth_backend/console.rs new file mode 100644 index 0000000000..863e929489 --- /dev/null +++ b/proxy/src/auth_backend/console.rs @@ -0,0 +1,236 @@ +//! Declaration of Cloud API V2. + +use crate::{ + auth::{self, AuthFlow}, + compute, scram, +}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +use crate::auth::ClientCredentials; +use crate::stream::PqStream; + +use tokio::io::{AsyncRead, AsyncWrite}; +use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; + +#[derive(Debug, Error)] +pub enum ConsoleAuthError { + // We shouldn't include the actual secret here. + #[error("Bad authentication secret")] + BadSecret, + + #[error("Bad client credentials: {0:?}")] + BadCredentials(crate::auth::ClientCredentials), + + /// For passwords that couldn't be processed by [`parse_password`]. + #[error("Absend SNI information")] + SniMissing, + + #[error(transparent)] + BadUrl(#[from] url::ParseError), + + #[error(transparent)] + Io(#[from] std::io::Error), + + /// HTTP status (other than 200) returned by the console. + #[error("Console responded with an HTTP status: {0}")] + HttpStatus(reqwest::StatusCode), + + #[error(transparent)] + Transport(#[from] reqwest::Error), + + #[error("Console responded with a malformed JSON: '{0}'")] + MalformedResponse(#[from] serde_json::Error), + + #[error("Console responded with a malformed compute address: '{0}'")] + MalformedComputeAddress(String), +} + +#[derive(Serialize, Deserialize, Debug)] +struct GetRoleSecretResponse { + role_secret: String, +} + +#[derive(Serialize, Deserialize, Debug)] +struct GetWakeComputeResponse { + address: String, +} + +/// Auth secret which is managed by the cloud. +pub enum AuthInfo { + /// Md5 hash of user's password. 
+ Md5([u8; 16]), + /// [SCRAM](crate::scram) authentication info. + Scram(scram::ServerSecret), +} + +/// Compute node connection params provided by the cloud. +/// Note how it implements serde traits, since we receive it over the wire. +#[derive(Serialize, Deserialize, Default)] +pub struct DatabaseInfo { + pub host: String, + pub port: u16, + pub dbname: String, + pub user: String, + + /// [Cloud API V1](super::legacy) returns cleartext password, + /// but [Cloud API V2](super::api) implements [SCRAM](crate::scram) + /// authentication, so we can leverage this method and cope without password. + pub password: Option, +} + +// Manually implement debug to omit personal and sensitive info. +impl std::fmt::Debug for DatabaseInfo { + fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + fmt.debug_struct("DatabaseInfo") + .field("host", &self.host) + .field("port", &self.port) + .finish() + } +} + +impl From for tokio_postgres::Config { + fn from(db_info: DatabaseInfo) -> Self { + let mut config = tokio_postgres::Config::new(); + + config + .host(&db_info.host) + .port(db_info.port) + .dbname(&db_info.dbname) + .user(&db_info.user); + + if let Some(password) = db_info.password { + config.password(password); + } + + config + } +} + +async fn get_auth_info( + auth_endpoint: &str, + user: &str, + cluster: &str, +) -> Result { + let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_get_role_secret"))?; + + url.query_pairs_mut() + .append_pair("cluster", cluster) + .append_pair("role", user); + + // TODO: use a proper logger + println!("cplane request: {}", url); + + let resp = reqwest::get(url).await?; + if !resp.status().is_success() { + return Err(ConsoleAuthError::HttpStatus(resp.status())); + } + + let response: GetRoleSecretResponse = serde_json::from_str(resp.text().await?.as_str())?; + + scram::ServerSecret::parse(response.role_secret.as_str()) + .map(AuthInfo::Scram) + .ok_or(ConsoleAuthError::BadSecret) +} + +/// Wake up the compute node and return the corresponding connection info. 
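///
/// The console is expected to answer `proxy_wake_compute` with a JSON body
/// like `{"address": "host:port"}`; the address parsing shown in isolation
/// (host and port values are illustrative):
///
/// ```
/// let address = "compute-node.example.net:5432";
/// let (host, port) = address.split_once(':').unwrap();
/// assert_eq!(host, "compute-node.example.net");
/// assert_eq!(port.parse::<u16>().unwrap(), 5432);
/// ```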
+async fn wake_compute( + auth_endpoint: &str, + cluster: &str, +) -> Result<(String, u16), ConsoleAuthError> { + let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_wake_compute"))?; + url.query_pairs_mut().append_pair("cluster", cluster); + + // TODO: use a proper logger + println!("cplane request: {}", url); + + let resp = reqwest::get(url).await?; + if !resp.status().is_success() { + return Err(ConsoleAuthError::HttpStatus(resp.status())); + } + + let response: GetWakeComputeResponse = serde_json::from_str(resp.text().await?.as_str())?; + let (host, port) = response + .address + .split_once(':') + .ok_or_else(|| ConsoleAuthError::MalformedComputeAddress(response.address.clone()))?; + let port: u16 = port + .parse() + .map_err(|_| ConsoleAuthError::MalformedComputeAddress(response.address.clone()))?; + + Ok((host.to_string(), port)) +} + +pub async fn handle_user( + auth_endpoint: &str, + client: &mut PqStream, + creds: &ClientCredentials, +) -> Result { + let cluster = creds + .sni_cluster + .as_ref() + .ok_or(ConsoleAuthError::SniMissing)?; + let user = creds.user.as_str(); + + // Step 1: get the auth secret + let auth_info = get_auth_info(auth_endpoint, user, cluster).await?; + + let flow = AuthFlow::new(client); + let scram_keys = match auth_info { + AuthInfo::Md5(_) => { + // TODO: decide if we should support MD5 in api v2 + return Err(crate::auth::AuthErrorImpl::auth_failed("MD5 is not supported").into()); + } + AuthInfo::Scram(secret) => { + let scram = auth::Scram(&secret); + Some(compute::ScramKeys { + client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), + server_key: secret.server_key.as_bytes(), + }) + } + }; + + client + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())?; + + // Step 2: wake compute + let (host, port) = wake_compute(auth_endpoint, cluster).await?; + + Ok(compute::NodeInfo { + db_info: DatabaseInfo { + host, + port, + dbname: creds.dbname.clone(), + user: creds.user.clone(), + password: None, + }, + scram_keys, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn parse_db_info() -> anyhow::Result<()> { + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + }))?; + + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + }))?; + + Ok(()) + } +} diff --git a/proxy/src/auth_backend/legacy_console.rs b/proxy/src/auth_backend/legacy_console.rs new file mode 100644 index 0000000000..29997d2389 --- /dev/null +++ b/proxy/src/auth_backend/legacy_console.rs @@ -0,0 +1,206 @@ +//! Cloud API V1. + +use super::console::DatabaseInfo; + +use crate::auth::ClientCredentials; +use crate::stream::PqStream; + +use crate::{compute, waiters}; +use serde::{Deserialize, Serialize}; + +use tokio::io::{AsyncRead, AsyncWrite}; +use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; + +use thiserror::Error; + +use crate::error::UserFacingError; + +#[derive(Debug, Error)] +pub enum AuthErrorImpl { + /// Authentication error reported by the console. + #[error("Authentication failed: {0}")] + AuthFailed(String), + + /// HTTP status (other than 200) returned by the console. 
+ #[error("Console responded with an HTTP status: {0}")] + HttpStatus(reqwest::StatusCode), + + #[error("Console responded with a malformed JSON: {0}")] + MalformedResponse(#[from] serde_json::Error), + + #[error(transparent)] + Transport(#[from] reqwest::Error), + + #[error(transparent)] + WaiterRegister(#[from] waiters::RegisterError), + + #[error(transparent)] + WaiterWait(#[from] waiters::WaitError), +} + +#[derive(Debug, Error)] +#[error(transparent)] +pub struct AuthError(Box); + +impl AuthError { + /// Smart constructor for authentication error reported by `mgmt`. + pub fn auth_failed(msg: impl Into) -> Self { + AuthError(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) + } +} + +impl From for AuthError +where + AuthErrorImpl: From, +{ + fn from(e: T) -> Self { + AuthError(Box::new(e.into())) + } +} + +impl UserFacingError for AuthError { + fn to_string_client(&self) -> String { + use AuthErrorImpl::*; + match self.0.as_ref() { + AuthFailed(_) | HttpStatus(_) => self.to_string(), + _ => "Internal error".to_string(), + } + } +} + +// NOTE: the order of constructors is important. +// https://serde.rs/enum-representations.html#untagged +#[derive(Serialize, Deserialize, Debug)] +#[serde(untagged)] +enum ProxyAuthResponse { + Ready { conn_info: DatabaseInfo }, + Error { error: String }, + NotReady { ready: bool }, // TODO: get rid of `ready` +} + +async fn authenticate_proxy_client( + auth_endpoint: &reqwest::Url, + creds: &ClientCredentials, + md5_response: &str, + salt: &[u8; 4], + psql_session_id: &str, +) -> Result { + let mut url = auth_endpoint.clone(); + url.query_pairs_mut() + .append_pair("login", &creds.user) + .append_pair("database", &creds.dbname) + .append_pair("md5response", md5_response) + .append_pair("salt", &hex::encode(salt)) + .append_pair("psql_session_id", psql_session_id); + + super::with_waiter(psql_session_id, |waiter| async { + println!("cloud request: {}", url); + // TODO: leverage `reqwest::Client` to reuse connections + let resp = reqwest::get(url).await?; + if !resp.status().is_success() { + return Err(AuthErrorImpl::HttpStatus(resp.status()).into()); + } + + let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?; + println!("got auth info: #{:?}", auth_info); + + use ProxyAuthResponse::*; + let db_info = match auth_info { + Ready { conn_info } => conn_info, + Error { error } => return Err(AuthErrorImpl::AuthFailed(error).into()), + NotReady { .. } => waiter.await?.map_err(AuthErrorImpl::AuthFailed)?, + }; + + Ok(db_info) + }) + .await +} + +async fn handle_existing_user( + auth_endpoint: &reqwest::Url, + client: &mut PqStream, + creds: &ClientCredentials, +) -> Result { + let psql_session_id = super::link::new_psql_session_id(); + let md5_salt = rand::random(); + + client + .write_message(&Be::AuthenticationMD5Password(md5_salt)) + .await?; + + // Read client's password hash + let msg = client.read_password_message().await?; + let md5_response = parse_password(&msg).ok_or(crate::auth::AuthErrorImpl::MalformedPassword)?; + + let db_info = authenticate_proxy_client( + auth_endpoint, + creds, + md5_response, + &md5_salt, + &psql_session_id, + ) + .await?; + + client + .write_message_noflush(&Be::AuthenticationOk)? 
+ .write_message_noflush(&BeParameterStatusMessage::encoding())?; + + Ok(compute::NodeInfo { + db_info, + scram_keys: None, + }) +} + +pub async fn handle_user( + auth_endpoint: &reqwest::Url, + auth_link_uri: &reqwest::Url, + client: &mut PqStream, + creds: &ClientCredentials, +) -> Result { + if creds.is_existing_user() { + handle_existing_user(auth_endpoint, client, creds).await + } else { + super::link::handle_user(auth_link_uri.as_ref(), client).await + } +} + +fn parse_password(bytes: &[u8]) -> Option<&str> { + std::str::from_utf8(bytes).ok()?.strip_suffix('\0') +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn test_proxy_auth_response() { + // Ready + let auth: ProxyAuthResponse = serde_json::from_value(json!({ + "ready": true, + "conn_info": DatabaseInfo::default(), + })) + .unwrap(); + assert!(matches!( + auth, + ProxyAuthResponse::Ready { + conn_info: DatabaseInfo { .. } + } + )); + + // Error + let auth: ProxyAuthResponse = serde_json::from_value(json!({ + "ready": false, + "error": "too bad, so sad", + })) + .unwrap(); + assert!(matches!(auth, ProxyAuthResponse::Error { .. })); + + // NotReady + let auth: ProxyAuthResponse = serde_json::from_value(json!({ + "ready": false, + })) + .unwrap(); + assert!(matches!(auth, ProxyAuthResponse::NotReady { .. })); + } +} diff --git a/proxy/src/auth_backend/link.rs b/proxy/src/auth_backend/link.rs new file mode 100644 index 0000000000..9bdb9e21c4 --- /dev/null +++ b/proxy/src/auth_backend/link.rs @@ -0,0 +1,52 @@ +use crate::{compute, stream::PqStream}; +use tokio::io::{AsyncRead, AsyncWrite}; +use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; + +fn hello_message(redirect_uri: &str, session_id: &str) -> String { + format!( + concat![ + "☀️ Welcome to Neon!\n", + "To proceed with database creation, open the following link:\n\n", + " {redirect_uri}{session_id}\n\n", + "It needs to be done once and we will send you '.pgpass' file,\n", + "which will allow you to access or create ", + "databases without opening your web browser." + ], + redirect_uri = redirect_uri, + session_id = session_id, + ) +} + +pub fn new_psql_session_id() -> String { + hex::encode(rand::random::<[u8; 8]>()) +} + +pub async fn handle_user( + redirect_uri: &str, + client: &mut PqStream, +) -> Result { + let psql_session_id = new_psql_session_id(); + let greeting = hello_message(redirect_uri, &psql_session_id); + + let db_info = crate::auth_backend::with_waiter(psql_session_id, |waiter| async { + // Give user a URL to spawn a new database + client + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())? + .write_message(&Be::NoticeResponse(&greeting)) + .await?; + + // Wait for web console response (see `mgmt`) + waiter + .await? + .map_err(crate::auth::AuthErrorImpl::auth_failed) + }) + .await?; + + client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; + + Ok(compute::NodeInfo { + db_info, + scram_keys: None, + }) +} diff --git a/proxy/src/auth_backend/postgres.rs b/proxy/src/auth_backend/postgres.rs new file mode 100644 index 0000000000..148c2a2518 --- /dev/null +++ b/proxy/src/auth_backend/postgres.rs @@ -0,0 +1,93 @@ +//! Local mock of Cloud API V2. 
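//!
//! This backend looks up `pg_catalog.pg_authid.rolpassword` directly. A stored
//! secret is either a SCRAM verifier (starting with "SCRAM-SHA-256$") or an
//! md5 hash ("md5" followed by 32 hex characters). A standalone sketch of
//! telling the two apart (assumed helper, not part of this module):
//!
//! ```
//! fn classify(rolpassword: &str) -> &'static str {
//!     if rolpassword.starts_with("SCRAM-SHA-256$") {
//!         "scram"
//!     } else if rolpassword.strip_prefix("md5").map_or(false, |h| h.len() == 32) {
//!         "md5"
//!     } else {
//!         "unknown"
//!     }
//! }
//! assert_eq!(classify("md50123456789abcdef0123456789abcdef"), "md5");
//! ```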
+ +use super::console::{self, AuthInfo, DatabaseInfo}; +use crate::scram; +use crate::{auth::ClientCredentials, compute}; + +use crate::stream::PqStream; +use tokio::io::{AsyncRead, AsyncWrite}; +use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; + +async fn get_auth_info( + auth_endpoint: &str, + creds: &ClientCredentials, +) -> Result { + // We wrap `tokio_postgres::Error` because we don't want to infect the + // method's error type with a detail that's specific to debug mode only. + let io_error = |e| std::io::Error::new(std::io::ErrorKind::Other, e); + + // Perhaps we could persist this connection, but then we'd have to + // write more code for reopening it if it got closed, which doesn't + // seem worth it. + let (client, connection) = tokio_postgres::connect(auth_endpoint, tokio_postgres::NoTls) + .await + .map_err(io_error)?; + + tokio::spawn(connection); + let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; + let rows = client + .query(query, &[&creds.user]) + .await + .map_err(io_error)?; + + match &rows[..] { + // We can't get a secret if there's no such user. + [] => Err(console::ConsoleAuthError::BadCredentials(creds.to_owned())), + // We shouldn't get more than one row anyway. + [row, ..] => { + let entry = row.try_get(0).map_err(io_error)?; + scram::ServerSecret::parse(entry) + .map(AuthInfo::Scram) + .or_else(|| { + // It could be an md5 hash if it's not a SCRAM secret. + let text = entry.strip_prefix("md5")?; + Some(AuthInfo::Md5({ + let mut bytes = [0u8; 16]; + hex::decode_to_slice(text, &mut bytes).ok()?; + bytes + })) + }) + // Putting the secret into this message is a security hazard! + .ok_or(console::ConsoleAuthError::BadSecret) + } + } +} + +pub async fn handle_user( + auth_endpoint: &reqwest::Url, + client: &mut PqStream, + creds: &ClientCredentials, +) -> Result { + let auth_info = get_auth_info(auth_endpoint.as_ref(), creds).await?; + + let flow = crate::auth::AuthFlow::new(client); + let scram_keys = match auth_info { + AuthInfo::Md5(_) => { + // TODO: decide if we should support MD5 in api v2 + return Err(crate::auth::AuthErrorImpl::auth_failed("MD5 is not supported").into()); + } + AuthInfo::Scram(secret) => { + let scram = crate::auth::Scram(&secret); + Some(compute::ScramKeys { + client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), + server_key: secret.server_key.as_bytes(), + }) + } + }; + + client + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())?; + + Ok(compute::NodeInfo { + db_info: DatabaseInfo { + // TODO: handle that near CLI params parsing + host: auth_endpoint.host_str().unwrap_or("localhost").to_owned(), + port: auth_endpoint.port().unwrap_or(5432), + dbname: creds.dbname.to_owned(), + user: creds.user.to_owned(), + password: None, + }, + scram_keys, + }) +} diff --git a/proxy/src/cloud/api.rs b/proxy/src/cloud/api.rs deleted file mode 100644 index 713140c1e6..0000000000 --- a/proxy/src/cloud/api.rs +++ /dev/null @@ -1,120 +0,0 @@ -//! Declaration of Cloud API V2. - -use crate::{auth, scram}; -use async_trait::async_trait; -use serde::{Deserialize, Serialize}; -use thiserror::Error; - -#[derive(Debug, Error)] -pub enum GetAuthInfoError { - // We shouldn't include the actual secret here. 
- #[error("Bad authentication secret")] - BadSecret, - - #[error("Bad client credentials: {0:?}")] - BadCredentials(crate::auth::ClientCredentials), - - #[error(transparent)] - Io(#[from] std::io::Error), -} - -// TODO: convert to an enum and describe possible sub-errors (see above) -#[derive(Debug, Error)] -#[error("Failed to wake up the compute node")] -pub struct WakeComputeError; - -/// Opaque implementation of Cloud API. -pub type BoxedApi = Box; - -/// Cloud API methods required by the proxy. -#[async_trait] -pub trait Api { - /// Get authentication information for the given user. - async fn get_auth_info( - &self, - creds: &auth::ClientCredentials, - ) -> Result; - - /// Wake up the compute node and return the corresponding connection info. - async fn wake_compute( - &self, - creds: &auth::ClientCredentials, - ) -> Result; -} - -/// Auth secret which is managed by the cloud. -pub enum AuthInfo { - /// Md5 hash of user's password. - Md5([u8; 16]), - /// [SCRAM](crate::scram) authentication info. - Scram(scram::ServerSecret), -} - -/// Compute node connection params provided by the cloud. -/// Note how it implements serde traits, since we receive it over the wire. -#[derive(Serialize, Deserialize, Default)] -pub struct DatabaseInfo { - pub host: String, - pub port: u16, - pub dbname: String, - pub user: String, - - /// [Cloud API V1](super::legacy) returns cleartext password, - /// but [Cloud API V2](super::api) implements [SCRAM](crate::scram) - /// authentication, so we can leverage this method and cope without password. - pub password: Option, -} - -// Manually implement debug to omit personal and sensitive info. -impl std::fmt::Debug for DatabaseInfo { - fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { - fmt.debug_struct("DatabaseInfo") - .field("host", &self.host) - .field("port", &self.port) - .finish() - } -} - -impl From for tokio_postgres::Config { - fn from(db_info: DatabaseInfo) -> Self { - let mut config = tokio_postgres::Config::new(); - - config - .host(&db_info.host) - .port(db_info.port) - .dbname(&db_info.dbname) - .user(&db_info.user); - - if let Some(password) = db_info.password { - config.password(password); - } - - config - } -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn parse_db_info() -> anyhow::Result<()> { - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "password": "password", - }))?; - - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - }))?; - - Ok(()) - } -} diff --git a/proxy/src/cloud/legacy.rs b/proxy/src/cloud/legacy.rs deleted file mode 100644 index 7d99995f1a..0000000000 --- a/proxy/src/cloud/legacy.rs +++ /dev/null @@ -1,160 +0,0 @@ -//! Cloud API V1. - -use super::api::DatabaseInfo; -use crate::auth::ClientCredentials; -use crate::error::UserFacingError; -use crate::waiters; -use serde::{Deserialize, Serialize}; -use thiserror::Error; - -/// Neon cloud API provider. -pub struct Legacy { - auth_endpoint: reqwest::Url, -} - -impl Legacy { - /// Construct a new legacy cloud API provider. - pub fn new(auth_endpoint: reqwest::Url) -> Self { - Self { auth_endpoint } - } -} - -#[derive(Debug, Error)] -pub enum AuthErrorImpl { - /// Authentication error reported by the console. - #[error("Authentication failed: {0}")] - AuthFailed(String), - - /// HTTP status (other than 200) returned by the console. 
- #[error("Console responded with an HTTP status: {0}")] - HttpStatus(reqwest::StatusCode), - - #[error("Console responded with a malformed JSON: {0}")] - MalformedResponse(#[from] serde_json::Error), - - #[error(transparent)] - Transport(#[from] reqwest::Error), - - #[error(transparent)] - WaiterRegister(#[from] waiters::RegisterError), - - #[error(transparent)] - WaiterWait(#[from] waiters::WaitError), -} - -#[derive(Debug, Error)] -#[error(transparent)] -pub struct AuthError(Box); - -impl AuthError { - /// Smart constructor for authentication error reported by `mgmt`. - pub fn auth_failed(msg: impl Into) -> Self { - AuthError(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) - } -} - -impl From for AuthError -where - AuthErrorImpl: From, -{ - fn from(e: T) -> Self { - AuthError(Box::new(e.into())) - } -} - -impl UserFacingError for AuthError { - fn to_string_client(&self) -> String { - use AuthErrorImpl::*; - match self.0.as_ref() { - AuthFailed(_) | HttpStatus(_) => self.to_string(), - _ => "Internal error".to_string(), - } - } -} - -// NOTE: the order of constructors is important. -// https://serde.rs/enum-representations.html#untagged -#[derive(Serialize, Deserialize, Debug)] -#[serde(untagged)] -enum ProxyAuthResponse { - Ready { conn_info: DatabaseInfo }, - Error { error: String }, - NotReady { ready: bool }, // TODO: get rid of `ready` -} - -impl Legacy { - pub async fn authenticate_proxy_client( - &self, - creds: ClientCredentials, - md5_response: &str, - salt: &[u8; 4], - psql_session_id: &str, - ) -> Result { - let mut url = self.auth_endpoint.clone(); - url.query_pairs_mut() - .append_pair("login", &creds.user) - .append_pair("database", &creds.dbname) - .append_pair("md5response", md5_response) - .append_pair("salt", &hex::encode(salt)) - .append_pair("psql_session_id", psql_session_id); - - super::with_waiter(psql_session_id, |waiter| async { - println!("cloud request: {}", url); - // TODO: leverage `reqwest::Client` to reuse connections - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - return Err(AuthErrorImpl::HttpStatus(resp.status()).into()); - } - - let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?; - println!("got auth info: #{:?}", auth_info); - - use ProxyAuthResponse::*; - let db_info = match auth_info { - Ready { conn_info } => conn_info, - Error { error } => return Err(AuthErrorImpl::AuthFailed(error).into()), - NotReady { .. } => waiter.await?.map_err(AuthErrorImpl::AuthFailed)?, - }; - - Ok(db_info) - }) - .await - } -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn test_proxy_auth_response() { - // Ready - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": true, - "conn_info": DatabaseInfo::default(), - })) - .unwrap(); - assert!(matches!( - auth, - ProxyAuthResponse::Ready { - conn_info: DatabaseInfo { .. } - } - )); - - // Error - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": false, - "error": "too bad, so sad", - })) - .unwrap(); - assert!(matches!(auth, ProxyAuthResponse::Error { .. })); - - // NotReady - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": false, - })) - .unwrap(); - assert!(matches!(auth, ProxyAuthResponse::NotReady { .. })); - } -} diff --git a/proxy/src/cloud/local.rs b/proxy/src/cloud/local.rs deleted file mode 100644 index 88eda6630c..0000000000 --- a/proxy/src/cloud/local.rs +++ /dev/null @@ -1,76 +0,0 @@ -//! Local mock of Cloud API V2. 
- -use super::api::{self, Api, AuthInfo, DatabaseInfo}; -use crate::auth::ClientCredentials; -use crate::scram; -use async_trait::async_trait; - -/// Mocked cloud for testing purposes. -pub struct Local { - /// Database url, e.g. `postgres://user:password@localhost:5432/database`. - pub url: reqwest::Url, -} - -#[async_trait] -impl Api for Local { - async fn get_auth_info( - &self, - creds: &ClientCredentials, - ) -> Result { - // We wrap `tokio_postgres::Error` because we don't want to infect the - // method's error type with a detail that's specific to debug mode only. - let io_error = |e| std::io::Error::new(std::io::ErrorKind::Other, e); - - // Perhaps we could persist this connection, but then we'd have to - // write more code for reopening it if it got closed, which doesn't - // seem worth it. - let (client, connection) = - tokio_postgres::connect(self.url.as_str(), tokio_postgres::NoTls) - .await - .map_err(io_error)?; - - tokio::spawn(connection); - let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; - let rows = client - .query(query, &[&creds.user]) - .await - .map_err(io_error)?; - - match &rows[..] { - // We can't get a secret if there's no such user. - [] => Err(api::GetAuthInfoError::BadCredentials(creds.to_owned())), - // We shouldn't get more than one row anyway. - [row, ..] => { - let entry = row.try_get(0).map_err(io_error)?; - scram::ServerSecret::parse(entry) - .map(AuthInfo::Scram) - .or_else(|| { - // It could be an md5 hash if it's not a SCRAM secret. - let text = entry.strip_prefix("md5")?; - Some(AuthInfo::Md5({ - let mut bytes = [0u8; 16]; - hex::decode_to_slice(text, &mut bytes).ok()?; - bytes - })) - }) - // Putting the secret into this message is a security hazard! - .ok_or(api::GetAuthInfoError::BadSecret) - } - } - } - - async fn wake_compute( - &self, - creds: &ClientCredentials, - ) -> Result { - // Local setup doesn't have a dedicated compute node, - // so we just return the local database we're pointed at. - Ok(DatabaseInfo { - host: self.url.host_str().unwrap_or("localhost").to_owned(), - port: self.url.port().unwrap_or(5432), - dbname: creds.dbname.to_owned(), - user: creds.user.to_owned(), - password: None, - }) - } -} diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 9949e91ea2..c3c5ba47fb 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,5 +1,5 @@ +use crate::auth_backend::console::DatabaseInfo; use crate::cancellation::CancelClosure; -use crate::cloud::api::DatabaseInfo; use crate::error::UserFacingError; use std::io; use std::net::SocketAddr; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 6b30df604d..077a07beb9 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,35 +1,39 @@ -use crate::cloud; -use anyhow::{bail, ensure, Context}; -use std::sync::Arc; +use anyhow::{ensure, Context}; +use std::{str::FromStr, sync::Arc}; + +#[non_exhaustive] +pub enum AuthBackendType { + LegacyConsole, + Console, + Postgres, + Link, +} + +impl FromStr for AuthBackendType { + type Err = anyhow::Error; + + fn from_str(s: &str) -> anyhow::Result { + println!("ClientAuthMethod::from_str: '{}'", s); + use AuthBackendType::*; + match s { + "legacy" => Ok(LegacyConsole), + "console" => Ok(Console), + "postgres" => Ok(Postgres), + "link" => Ok(Link), + _ => Err(anyhow::anyhow!("Invlid option for auth method")), + } + } +} pub struct ProxyConfig { - /// Unauthenticated users will be redirected to this URL. - pub redirect_uri: reqwest::Url, - - /// Cloud API endpoint for user authentication. 
- pub cloud_endpoint: CloudApi, - /// TLS configuration for the proxy. pub tls_config: Option, -} -/// Cloud API configuration. -pub enum CloudApi { - /// We'll drop this one when [`CloudApi::V2`] is stable. - V1(crate::cloud::Legacy), - /// The new version of the cloud API. - V2(crate::cloud::BoxedApi), -} + pub auth_backend: AuthBackendType, -impl CloudApi { - /// Configure Cloud API provider. - pub fn new(version: &str, url: reqwest::Url) -> anyhow::Result { - Ok(match version { - "v1" => Self::V1(cloud::Legacy::new(url)), - "v2" => Self::V2(cloud::new(url)?), - _ => bail!("unknown cloud API version: {}", version), - }) - } + pub auth_endpoint: reqwest::Url, + + pub auth_link_uri: reqwest::Url, } pub type TlsConfig = Arc; diff --git a/proxy/src/main.rs b/proxy/src/main.rs index ce9889ce30..fc2a368b85 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -5,8 +5,8 @@ //! in somewhat transparent manner (again via communication with control plane API). mod auth; +mod auth_backend; mod cancellation; -mod cloud; mod compute; mod config; mod error; @@ -48,18 +48,11 @@ async fn main() -> anyhow::Result<()> { .default_value("127.0.0.1:4432"), ) .arg( - Arg::new("auth-method") - .long("auth-method") + Arg::new("auth-backend") + .long("auth-backend") .takes_value(true) - .help("Possible values: password | link | mixed") - .default_value("mixed"), - ) - .arg( - Arg::new("static-router") - .short('s') - .long("static-router") - .takes_value(true) - .help("Route all clients to host:port"), + .help("Possible values: legacy | console | postgres | link") + .default_value("legacy"), ) .arg( Arg::new("mgmt") @@ -82,7 +75,7 @@ async fn main() -> anyhow::Result<()> { .short('u') .long("uri") .takes_value(true) - .help("redirect unauthenticated users to given uri") + .help("redirect unauthenticated users to the given uri in case of link auth") .default_value("http://localhost:3000/psql_session/"), ) .arg( @@ -93,14 +86,6 @@ async fn main() -> anyhow::Result<()> { .help("cloud API endpoint for authenticating users") .default_value("http://localhost:3000/authenticate_proxy_request/"), ) - .arg( - Arg::new("api-version") - .long("api-version") - .takes_value(true) - .default_value("v1") - .possible_values(["v1", "v2"]) - .help("cloud API version to be used for authentication"), - ) .arg( Arg::new("tls-key") .short('k') @@ -132,15 +117,11 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?; let http_address: SocketAddr = arg_matches.value_of("http").unwrap().parse()?; - let cloud_endpoint = config::CloudApi::new( - arg_matches.value_of("api-version").unwrap(), - arg_matches.value_of("auth-endpoint").unwrap().parse()?, - )?; - let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { - redirect_uri: arg_matches.value_of("uri").unwrap().parse()?, - cloud_endpoint, tls_config, + auth_backend: arg_matches.value_of("auth-backend").unwrap().parse()?, + auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, + auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, })); println!("Version: {}", GIT_VERSION); diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index c48df653d3..93618fff68 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -1,4 +1,4 @@ -use crate::cloud; +use crate::auth_backend; use anyhow::Context; use serde::Deserialize; use std::{ @@ -10,6 +10,8 @@ use utils::{ pq_proto::{BeMessage, SINGLE_COL_ROWDESC}, }; +/// TODO: move all of that to auth-backend/link.rs when we ditch legacy-console backend + /// 
/// Main proxy listener loop. /// @@ -75,12 +77,12 @@ struct PsqlSessionResponse { #[derive(Deserialize)] enum PsqlSessionResult { - Success(cloud::api::DatabaseInfo), + Success(auth_backend::console::DatabaseInfo), Failure(String), } /// A message received by `mgmt` when a compute node is ready. -pub type ComputeReady = Result; +pub type ComputeReady = Result; impl PsqlSessionResult { fn into_compute_ready(self) -> ComputeReady { @@ -111,7 +113,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::R let resp: PsqlSessionResponse = serde_json::from_str(query_string)?; - match cloud::notify(&resp.session_id, resp.result.into_compute_ready()) { + match auth_backend::notify(&resp.session_id, resp.result.into_compute_ready()) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 4bce2bf40d..4bdbac8510 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -73,7 +73,7 @@ pub async fn thread_main( async fn handle_client( config: &ProxyConfig, cancel_map: &CancelMap, - stream: impl AsyncRead + AsyncWrite + Unpin, + stream: impl AsyncRead + AsyncWrite + Unpin + Send, ) -> anyhow::Result<()> { // The `closed` counter will increase when this future is destroyed. NUM_CONNECTIONS_ACCEPTED_COUNTER.inc(); @@ -148,6 +148,8 @@ async fn handshake( .or_else(|e| stream.throw_error(e)) .await?; + // TODO: set creds.cluster here when SNI info is available + break Ok(Some((stream, creds))); } CancelRequest(cancel_key_data) => { @@ -174,7 +176,7 @@ impl Client { } } -impl Client { +impl Client { /// Let the client authenticate and connect to the designated compute node. async fn connect_to_db( self, diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index bf935d3510..765aef4443 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -38,6 +38,7 @@ impl ServerSecret { /// To avoid revealing information to an attacker, we use a /// mocked server secret even if the user doesn't exist. /// See `auth-scram.c : mock_scram_secret` for details. + #[allow(dead_code)] pub fn mock(user: &str, nonce: &[u8; 32]) -> Self { // Refer to `auth-scram.c : scram_mock_salt`. 
let mocked_salt = super::sha256([user.as_bytes(), nonce]); diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index e16d1acf2f..5614cea68b 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1382,8 +1382,8 @@ def remote_pg(test_output_dir: str) -> Iterator[RemotePostgres]: class ZenithProxy(PgProtocol): def __init__(self, port: int): super().__init__(host="127.0.0.1", - user="pytest", - password="pytest", + user="proxy_user", + password="pytest2", port=port, dbname='postgres') self.http_port = 7001 @@ -1399,8 +1399,8 @@ class ZenithProxy(PgProtocol): args = [bin_proxy] args.extend(["--http", f"{self.host}:{self.http_port}"]) args.extend(["--proxy", f"{self.host}:{self.port}"]) - args.extend(["--auth-method", "password"]) - args.extend(["--static-router", addr]) + args.extend(["--auth-backend", "postgres"]) + args.extend(["--auth-endpoint", "postgres://proxy_auth:pytest1@localhost:5432/postgres"]) self._popen = subprocess.Popen(args) self._wait_until_ready() @@ -1422,7 +1422,8 @@ class ZenithProxy(PgProtocol): def static_proxy(vanilla_pg) -> Iterator[ZenithProxy]: """Zenith proxy that routes directly to vanilla postgres.""" vanilla_pg.start() - vanilla_pg.safe_psql("create user pytest with password 'pytest';") + vanilla_pg.safe_psql("create user proxy_auth with password 'pytest1' superuser") + vanilla_pg.safe_psql("create user proxy_user with password 'pytest2'") with ZenithProxy(4432) as proxy: proxy.start_static() From 9a396e1feb9f35e4f2d57d38a2ac07070ecc1b4b Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 2 May 2022 00:35:15 +0300 Subject: [PATCH 0222/1022] Support SNI-based routing in proxy --- proxy/src/auth.rs | 2 ++ proxy/src/auth/credentials.rs | 6 +++--- proxy/src/auth_backend/console.rs | 15 +++++++++++---- proxy/src/proxy.rs | 7 +++++-- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index d4e21d78a0..2463f31645 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -6,6 +6,7 @@ use crate::config::{AuthBackendType, ProxyConfig}; use crate::error::UserFacingError; use crate::stream::PqStream; use crate::{auth_backend, compute, waiters}; +use console::ConsoleAuthError::SniMissing; use std::io; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; @@ -72,6 +73,7 @@ impl UserFacingError for AuthError { match self.0.as_ref() { Console(e) => e.to_string_client(), MalformedPassword => self.to_string(), + GetAuthInfo(e) if matches!(e, SniMissing) => e.to_string(), _ => "Internal error".to_string(), } } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 88677de511..9d2272b5ad 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -24,9 +24,9 @@ pub struct ClientCredentials { pub user: String, pub dbname: String, - // New console API requires SNI info to determine cluster name. + // New console API requires SNI info to determine the cluster name. // Other Auth backends don't need it. 
- pub sni_cluster: Option, + pub sni_data: Option, } impl ClientCredentials { @@ -52,7 +52,7 @@ impl TryFrom> for ClientCredentials { Ok(Self { user, dbname: db, - sni_cluster: None, + sni_data: None, }) } } diff --git a/proxy/src/auth_backend/console.rs b/proxy/src/auth_backend/console.rs index 863e929489..55a0889af4 100644 --- a/proxy/src/auth_backend/console.rs +++ b/proxy/src/auth_backend/console.rs @@ -22,10 +22,12 @@ pub enum ConsoleAuthError { #[error("Bad client credentials: {0:?}")] BadCredentials(crate::auth::ClientCredentials), - /// For passwords that couldn't be processed by [`parse_password`]. - #[error("Absend SNI information")] + #[error("SNI info is missing, please upgrade the postgres client library")] SniMissing, + #[error("Unexpected SNI content")] + SniWrong, + #[error(transparent)] BadUrl(#[from] url::ParseError), @@ -166,10 +168,15 @@ pub async fn handle_user( client: &mut PqStream, creds: &ClientCredentials, ) -> Result { + // Determine cluster name from SNI. let cluster = creds - .sni_cluster + .sni_data .as_ref() - .ok_or(ConsoleAuthError::SniMissing)?; + .ok_or(ConsoleAuthError::SniMissing)? + .split_once('.') + .ok_or(ConsoleAuthError::SniWrong)? + .0; + let user = creds.user.as_str(); // Step 1: get the auth secret diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 4bdbac8510..821ce377f5 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -144,11 +144,14 @@ async fn handshake( } // Here and forth: `or_else` demands that we use a future here - let creds = async { params.try_into() } + let mut creds: auth::ClientCredentials = async { params.try_into() } .or_else(|e| stream.throw_error(e)) .await?; - // TODO: set creds.cluster here when SNI info is available + // Set SNI info when available + if let Stream::Tls { tls } = stream.get_ref() { + creds.sni_data = tls.get_ref().1.sni_hostname().map(|s| s.to_owned()); + } break Ok(Some((stream, creds))); } From ad25736f3a38540965cd86a5feee593a7c1fbdb5 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 2 May 2022 18:14:36 +0300 Subject: [PATCH 0223/1022] Exit pageserver process with correct error code When we shutdown pageserver due to an error (e g one of th important thrads panicked) use 1 exit code so systemd can properly restart it --- pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/lib.rs | 4 ++-- pageserver/src/thread_mgr.rs | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 01fcc1224f..2139bea37e 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -295,7 +295,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() signal.name() ); profiling::exit_profiler(conf, &profiler_guard); - pageserver::shutdown_pageserver(); + pageserver::shutdown_pageserver(0); unreachable!() } }) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 94219c7840..0b1c53172c 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -67,7 +67,7 @@ pub type RepositoryImpl = LayeredRepository; pub type DatadirTimelineImpl = DatadirTimeline; -pub fn shutdown_pageserver() { +pub fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint thread. This prevents new connections from // being accepted. 
thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None); @@ -94,5 +94,5 @@ pub fn shutdown_pageserver() { thread_mgr::shutdown_threads(None, None, None); info!("Shut down successfully completed"); - std::process::exit(0); + std::process::exit(exit_code); } diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index 2866c6be44..f7f8467ae0 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -231,7 +231,7 @@ fn thread_wrapper( "Shutting down: thread '{}' exited with error: {:?}", thread_name, err ); - shutdown_pageserver(); + shutdown_pageserver(1); } else { error!("Thread '{}' exited with error: {:?}", thread_name, err); } @@ -241,7 +241,7 @@ fn thread_wrapper( "Shutting down: thread '{}' panicked: {:?}", thread_name, err ); - shutdown_pageserver(); + shutdown_pageserver(1); } } } From 5cb501c2b32697afaf24fea6359f7c90fe14dcd1 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sun, 1 May 2022 21:57:33 +0300 Subject: [PATCH 0224/1022] Make remote storage test less flacky --- test_runner/batch_others/test_remote_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 59a9cfa378..e205f79957 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -117,7 +117,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) assert detail['local'] is not None log.info("Timeline detail after attach completed: %s", detail) - assert lsn_from_hex(detail['local']['last_record_lsn']) == current_lsn + assert lsn_from_hex(detail['local']['last_record_lsn']) >= current_lsn, 'current db Lsn should shoud not be less than the one stored on remote storage' assert not detail['remote']['awaits_download'] pg = env.postgres.create_start('main') From 801b749e1dd0de501b7fd4dbe4d494f40fc64515 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 2 May 2022 18:08:30 +0300 Subject: [PATCH 0225/1022] Set correct authEndpoint for the new proxy --- .circleci/helm-values/staging.proxy-scram.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/helm-values/staging.proxy-scram.yaml b/.circleci/helm-values/staging.proxy-scram.yaml index 0391697641..d95ae3bfc2 100644 --- a/.circleci/helm-values/staging.proxy-scram.yaml +++ b/.circleci/helm-values/staging.proxy-scram.yaml @@ -6,7 +6,7 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-staging.local/management/api/v2" + authEndpoint: "http://console-staging.local:9095/management/api/v2" # -- Additional labels for zenith-proxy pods podLabels: From 87a6c4d0511c1eac5229c7257256d384e6cb347c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 2 May 2022 21:47:54 +0300 Subject: [PATCH 0226/1022] RFC on connection routing and authentication. This documents how we want this to work. We're not quite there yet. 
--- docs/rfcs/016-connection-routing.md | 151 ++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 docs/rfcs/016-connection-routing.md diff --git a/docs/rfcs/016-connection-routing.md b/docs/rfcs/016-connection-routing.md new file mode 100644 index 0000000000..603a0725d6 --- /dev/null +++ b/docs/rfcs/016-connection-routing.md @@ -0,0 +1,151 @@ +# Dispatching a connection + +For each client connection, Neon service needs to authenticate the +connection, and route it to the right PostgreSQL instance. + +## Authentication + +There are three different ways to authenticate: + +- anonymous; no authentication needed +- PostgreSQL authentication +- github single sign-on using browser + +In anonymous access, the user doesn't need to perform any +authentication at all. This can be used e.g. in interactive PostgreSQL +documentation, allowing you to run the examples very quickly. Similar +to sqlfiddle.com. + +PostgreSQL authentication works the same as always. All the different +PostgreSQL authentication options like SCRAM, kerberos, etc. are +available. [1] + +The third option is to authenticate with github single sign-on. When +you open the connection in psql, you get a link that you open with +your browser. Opening the link redirects you to github authentication, +and lets the connection to proceed. This is also known as "Link auth" [2]. + + +## Routing the connection + +When a client starts a connection, it needs to be routed to the +correct PostgreSQL instance. Routing can be done by the proxy, acting +as a man-in-the-middle, or the connection can be routed at the network +level based on the hostname or IP address. + +Either way, Neon needs to identify which PostgreSQL instance the +connection should be routed to. If the instance is not already +running, it needs to be started. Some connections always require a new +PostgreSQL instance to be created, e.g. if you want to run a one-off +query against a particular point-in-time. + +The PostgreSQL instance is identified by: +- Neon account (possibly anonymous) +- cluster (known as tenant in the storage?) +- branch or snapshot name +- timestamp (PITR) +- primary or read-replica +- one-off read replica +- one-off writeable branch + +When you are using regular PostgreSQL authentication or anonymous +access, the connection URL needs to contain all the information needed +for the routing. With github single sign-on, the browser is involved +and some details - the Neon account in particular - can be deduced +from the authentication exchange. + +There are three methods for identifying the PostgreSQL instance: + +- Browser interaction (link auth) +- Options in the connection URL and the domain name +- A pre-defined endpoint, identified by domain name or IP address + +### Link Auth + + postgres://@start.neon.tech/ + +This gives you a link that you open in browser. Clicking the link +performs github authentication, and the Neon account name is +provided to the proxy behind the scenes. The proxy routes the +connection to the primary PostgreSQL instance in cluster called +"main", branch "main". + +Further ideas: +- You could pre-define a different target for link auth + connections in the UI. +- You could have a drop-down in the browser, allowing you to connect + to any cluster you want. Link Auth can be like Teleport. + +### Connection URL + +The connection URL looks like this: + + postgres://@.db.neon.tech/ + +By default, this connects you to the primary PostgreSQL instance +running on the "main" branch in the named cluster [3]. 
However, you can +change that by specifying options in the connection URL. The following +options are supported: + +| option name | Description | Examples | +| --- | --- | --- | +| cluster | Cluster name | cluster:myproject | +| branch | Branch name | branch:main | +| timestamp | Connect to an instance at given point-in-time. | timestamp:2022-04-08 timestamp:2022-04-08T11:42:16Z | +| lsn | Connect to an instance at given LSN | lsn:0/12FF0420 | +| read-replica | Connect to a read-replica. If the parameter is 'new', a new instance is created for this session. | read-replica read-replica:new | + +For example, to read branch 'testing' as it was on Mar 31, 2022, you could +specify a timestamp in the connection URL [4]: + + postgres://alice@cluster-1234.db.neon.tech/postgres?options=branch:testing,timestamp:2022-03-31 + +Connecting with cluster name and options can be disabled in the UI. If +disabled, you can only connect using a pre-defined endpoint. + +### Pre-defined Endpoint + +Instead of providing the cluster name, branch, and all those options +in the connection URL, you can define a named endpoint with the same +options. + +In the UI, click "create endpoint". Fill in the details: + +- Cluster name +- Branch +- timestamp or LSN +- is this for the primary or for a read replica +- etc. + +When you click Finish, a named endpoint is created. You can now use the endpoint ID to connect: + + postgres://@.endpoint.neon.tech/ + + +An endpoint can be assigned a static or dynamic IP address, so that +you can connect to it with clients that don't support TLS SNI. Maybe +bypass the proxy altogether, but that ought to be invisible to the +user. + +You can limit the range of source IP addresses that are allowed to +connect to an endpoint. An endpoint can also be exposed in an Amazon +VPC, allowing direct connections from applications. + + +# Footnotes + +[1] I'm not sure how feasible it is to set up configure like Kerberos +or LDAP in a cloud environment. But in principle I think we should +allow customers to have the full power of PostgreSQL, including all +authentication options. However, it's up to the customer to configure +it correctly. + +[2] Link is a way to both authenticate and to route the connection + +[3] This assumes that cluster-ids are globally unique, across all +Neon accounts. + +[4] The syntax accepted in the connection URL is limited by libpq. The +only way to pass arbitrary options to the server (or our proxy) is +with the "options" keyword, and the options must be percent-encoded. I +think the above would work but i haven't tested it From baa59512b8e0f5ca535025d9fc879f31fc18b39f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 3 May 2022 08:07:14 +0300 Subject: [PATCH 0227/1022] Traverse frozen layer in get_reconstruct_data in reverse order (#1601) * Traverse frozen layer in get_reconstruct_data in reverse order * Fix comments on frozen layers. Note explicitly the order that the layers are in the queue. 
* Add fail point to reproduce failpoint iteration error Co-authored-by: Heikki Linnakangas --- pageserver/src/layered_repository.rs | 9 ++++++--- pageserver/src/layered_repository/layer_map.rs | 11 +++++++---- test_runner/batch_others/test_ancestor_branch.py | 4 ++++ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 080ac2852d..59e73d961d 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1432,7 +1432,8 @@ impl LayeredTimeline { let layers = timeline.layers.read().unwrap(); - // Check the open and frozen in-memory layers first + // Check the open and frozen in-memory layers first, in order from newest + // to oldest. if let Some(open_layer) = &layers.open_layer { let start_lsn = open_layer.get_lsn_range().start; if cont_lsn > start_lsn { @@ -1450,7 +1451,7 @@ impl LayeredTimeline { continue; } } - for frozen_layer in layers.frozen_layers.iter() { + for frozen_layer in layers.frozen_layers.iter().rev() { let start_lsn = frozen_layer.get_lsn_range().start; if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); @@ -1695,7 +1696,9 @@ impl LayeredTimeline { self.conf.timeline_path(&self.timelineid, &self.tenantid), ])?; - // Finally, replace the frozen in-memory layer with the new on-disk layers + fail_point!("flush-frozen"); + + // Finally, replace the frozen in-memory layer with the new on-disk layer { let mut layers = self.layers.write().unwrap(); let l = layers.frozen_layers.pop_front(); diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 03ee8b8ef1..91a900dde0 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -43,10 +43,13 @@ pub struct LayerMap { pub next_open_layer_at: Option, /// - /// The frozen layer, if any, contains WAL older than the current 'open_layer' - /// or 'next_open_layer_at', but newer than any historic layer. The frozen - /// layer is during checkpointing, when an InMemoryLayer is being written out - /// to disk. + /// Frozen layers, if any. Frozen layers are in-memory layers that + /// are no longer added to, but haven't been written out to disk + /// yet. They contain WAL older than the current 'open_layer' or + /// 'next_open_layer_at', but newer than any historic layer. + /// The frozen layers are in order from oldest to newest, so that + /// the newest one is in the 'back' of the VecDeque, and the oldest + /// in the 'front'. 
/// pub frozen_layers: VecDeque>, diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index aeb45348ad..75fe3cde0f 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -33,6 +33,10 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): 'compaction_target_size': '4194304', }) + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: + pscur.execute("failpoints flush-frozen=sleep(10000)") + env.zenith_cli.create_timeline(f'main', tenant_id=tenant) pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() From 62449d60683e93f8f54b5c79fdcb89b74853d695 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 3 May 2022 09:25:12 +0300 Subject: [PATCH 0228/1022] Bump vendor/postgres (#1573) This brings us the performance improvements to WAL redo from https://github.com/neondatabase/postgres/pull/144 --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index d7c8426e49..a13fe64a3e 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit d7c8426e49cff3c791c3f2c4cde95f1fce665573 +Subproject commit a13fe64a3eff1743ff17141a2e6057f5103829f0 From 9ede38b6c4aec5a1d49f0e83278f112f1eb4069e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 3 May 2022 09:28:57 +0300 Subject: [PATCH 0229/1022] Support finding LSN from a commit timestamp. A new `get_lsn_by_timestamp` command is added to the libpq page service API. An extra timestamp field is now stored in an extra field after each Clog page. It is the timestamp of the latest commit, among all the transactions on the Clog page. To find the overall latest commit, we need to scan all Clog pages, but this isn't a very frequent operation so that's not too bad. To find the LSN that corresponds to a timestamp, we perform a binary search. The binary search starts with min = last LSN when GC ran, and max = latest LSN on the timeline. On each iteration of the search we check if there are any commits with a higher-than-requested timestamp at that LSN. Implements github issue 1361. 
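
As an illustration (not part of this commit), the new command can be exercised over the
pageserver's libpq interface roughly as sketched below, mirroring the Python test added
in this patch. The host/port, tenant id and timeline id are placeholders, and the use of
psycopg2 is only an assumption borrowed from the test fixtures:

    # Sketch only: call the new get_lsn_by_timestamp command via psycopg2.
    # Host/port, tenant id and timeline id are placeholder assumptions.
    import psycopg2

    conn = psycopg2.connect("host=127.0.0.1 port=64000 dbname=postgres user=zenith_admin")
    # The pageserver speaks a simple query protocol; avoid an implicit BEGIN.
    conn.autocommit = True
    cur = conn.cursor()

    tenant_id = "0" * 32                  # 32-char hex tenant id (placeholder)
    timeline_id = "0" * 32                # 32-char hex timeline id (placeholder)
    timestamp = "2022-05-03T09:00:00Z"    # RFC 3339, parsed by humantime on the server

    # Returns a single text column 'lsn': either an LSN such as '0/169C3D8', or the
    # strings 'future' / 'past' when the timestamp is outside the known range.
    cur.execute(f"get_lsn_by_timestamp {tenant_id} {timeline_id} '{timestamp}'")
    print(cur.fetchone()[0])

The returned LSN can then be used to start a read-only node at that point in time, which
is what the new test_lsn_mapping.py test does to verify the mapping.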
--- libs/postgres_ffi/src/xlog_utils.rs | 6 +- libs/utils/src/pq_proto.rs | 12 +++ pageserver/src/basebackup.rs | 12 ++- pageserver/src/page_service.rs | 30 +++++- pageserver/src/pgdatadir_mapping.rs | 108 +++++++++++++++++++ pageserver/src/walingest.rs | 10 +- pageserver/src/walrecord.rs | 5 +- pageserver/src/walredo.rs | 22 +++- test_runner/batch_others/test_lsn_mapping.py | 84 +++++++++++++++ test_runner/fixtures/zenith_fixtures.py | 1 + 10 files changed, 282 insertions(+), 8 deletions(-) create mode 100644 test_runner/batch_others/test_lsn_mapping.py diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 1645c44de5..bd4b7df690 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -118,11 +118,15 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { } pub fn get_current_timestamp() -> TimestampTz { + to_pg_timestamp(SystemTime::now()) +} + +pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz { const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */ const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */ const SECS_PER_DAY: u64 = 86400; const USECS_PER_SEC: u64 = 1000000; - match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) { + match time.duration_since(SystemTime::UNIX_EPOCH) { Ok(n) => { ((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY)) * USECS_PER_SEC diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index e1677f4311..ce86cf8c91 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -503,6 +503,18 @@ impl RowDescriptor<'_> { formatcode: 0, } } + + pub const fn text_col(name: &[u8]) -> RowDescriptor { + RowDescriptor { + name, + tableoid: 0, + attnum: 0, + typoid: TEXT_OID, + typlen: -1, + typmod: 0, + formatcode: 0, + } + } } #[derive(Debug)] diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 78a27e460f..14e6d40759 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -154,9 +154,17 @@ impl<'a> Basebackup<'a> { let img = self .timeline .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?; - ensure!(img.len() == pg_constants::BLCKSZ as usize); - slru_buf.extend_from_slice(&img); + if slru == SlruKind::Clog { + ensure!( + img.len() == pg_constants::BLCKSZ as usize + || img.len() == pg_constants::BLCKSZ as usize + 8 + ); + } else { + ensure!(img.len() == pg_constants::BLCKSZ as usize); + } + + slru_buf.extend_from_slice(&img[..pg_constants::BLCKSZ as usize]); } let segname = format!("{}/{:>04X}", slru.to_str(), segno); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 0adafab8ba..e584a101cd 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -31,7 +31,7 @@ use utils::{ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; -use crate::pgdatadir_mapping::DatadirTimeline; +use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp}; use crate::profiling::profpoint_start; use crate::reltag::RelTag; use crate::repository::Repository; @@ -42,6 +42,7 @@ use crate::thread_mgr::ThreadKind; use crate::walreceiver; use crate::CheckpointConfig; use metrics::{register_histogram_vec, HistogramVec}; +use postgres_ffi::xlog_utils::to_pg_timestamp; // Wrapped in libpq CopyData enum PagestreamFeMessage { @@ -805,6 +806,33 @@ impl postgres_backend::Handler for PageServerHandler { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? 
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + } else if query_string.starts_with("get_lsn_by_timestamp ") { + // Locate LSN of last transaction with timestamp less or equal than sppecified + // TODO lazy static + let re = Regex::new(r"^get_lsn_by_timestamp ([[:xdigit:]]+) ([[:xdigit:]]+) '(.*)'$") + .unwrap(); + let caps = re + .captures(query_string) + .with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?; + + let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; + let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) + .context("Cannot load local timeline")?; + + let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; + let timestamp_pg = to_pg_timestamp(timestamp); + + pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( + b"lsn", + )]))?; + let result = match timeline.find_lsn_for_timestamp(timestamp_pg)? { + LsnForTimestamp::Present(lsn) => format!("{}", lsn), + LsnForTimestamp::Future(_lsn) => "future".into(), + LsnForTimestamp::Past(_lsn) => "past".into(), + }; + pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { bail!("unknown command"); } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 071eccc05d..c052aa3d69 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -13,6 +13,7 @@ use crate::repository::{Repository, Timeline}; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; +use postgres_ffi::xlog_utils::TimestampTz; use postgres_ffi::{pg_constants, Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; @@ -45,6 +46,13 @@ where current_logical_size: AtomicIsize, } +#[derive(Debug)] +pub enum LsnForTimestamp { + Present(Lsn), + Future(Lsn), + Past(Lsn), +} + impl DatadirTimeline { pub fn new(tline: Arc, repartition_threshold: u64) -> Self { DatadirTimeline { @@ -202,6 +210,106 @@ impl DatadirTimeline { Ok(exists) } + /// Locate LSN, such that all transactions that committed before + /// 'search_timestamp' are visible, but nothing newer is. + /// + /// This is not exact. Commit timestamps are not guaranteed to be ordered, + /// so it's not well defined which LSN you get if there were multiple commits + /// "in flight" at that point in time. + /// + pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { + let gc_cutoff_lsn_guard = self.tline.get_latest_gc_cutoff_lsn(); + let min_lsn = *gc_cutoff_lsn_guard; + let max_lsn = self.tline.get_last_record_lsn(); + + // LSNs are always 8-byte aligned. low/mid/high represent the + // LSN divided by 8. + let mut low = min_lsn.0 / 8; + let mut high = max_lsn.0 / 8 + 1; + + let mut found_smaller = false; + let mut found_larger = false; + while low < high { + // cannot overflow, high and low are both smaller than u64::MAX / 2 + let mid = (high + low) / 2; + + let cmp = self.is_latest_commit_timestamp_ge_than( + search_timestamp, + Lsn(mid * 8), + &mut found_smaller, + &mut found_larger, + )?; + + if cmp { + high = mid; + } else { + low = mid + 1; + } + } + match (found_smaller, found_larger) { + (false, false) => { + // This can happen if no commit records have been processed yet, e.g. + // just after importing a cluster. 
+ bail!("no commit timestamps found"); + } + (true, false) => { + // Didn't find any commit timestamps larger than the request + Ok(LsnForTimestamp::Future(max_lsn)) + } + (false, true) => { + // Didn't find any commit timestamps smaller than the request + Ok(LsnForTimestamp::Past(max_lsn)) + } + (true, true) => { + // low is the LSN of the first commit record *after* the search_timestamp, + // Back off by one to get to the point just before the commit. + // + // FIXME: it would be better to get the LSN of the previous commit. + // Otherwise, if you restore to the returned LSN, the database will + // include physical changes from later commits that will be marked + // as aborted, and will need to be vacuumed away. + Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8))) + } + } + } + + /// + /// Subroutine of find_lsn_for_timestamp(). Returns true, if there are any + /// commits that committed after 'search_timestamp', at LSN 'probe_lsn'. + /// + /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits + /// with a smaller/larger timestamp. + /// + fn is_latest_commit_timestamp_ge_than( + &self, + search_timestamp: TimestampTz, + probe_lsn: Lsn, + found_smaller: &mut bool, + found_larger: &mut bool, + ) -> Result { + for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? { + let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?; + for blknum in (0..nblocks).rev() { + let clog_page = + self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?; + + if clog_page.len() == pg_constants::BLCKSZ as usize + 8 { + let mut timestamp_bytes = [0u8; 8]; + timestamp_bytes.copy_from_slice(&clog_page[pg_constants::BLCKSZ as usize..]); + let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); + + if timestamp >= search_timestamp { + *found_larger = true; + return Ok(true); + } else { + *found_smaller = true; + } + } + } + } + Ok(false) + } + /// Get a list of SLRU segments pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { // fetch directory entry diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 583cdecb1d..a929e290ad 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -635,7 +635,10 @@ impl<'a, R: Repository> WalIngest<'a, R> { segno, rpageno, if is_commit { - ZenithWalRecord::ClogSetCommitted { xids: page_xids } + ZenithWalRecord::ClogSetCommitted { + xids: page_xids, + timestamp: parsed.xact_time, + } } else { ZenithWalRecord::ClogSetAborted { xids: page_xids } }, @@ -652,7 +655,10 @@ impl<'a, R: Repository> WalIngest<'a, R> { segno, rpageno, if is_commit { - ZenithWalRecord::ClogSetCommitted { xids: page_xids } + ZenithWalRecord::ClogSetCommitted { + xids: page_xids, + timestamp: parsed.xact_time, + } } else { ZenithWalRecord::ClogSetAborted { xids: page_xids } }, diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 5947a0c147..e8699cfa22 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -24,7 +24,10 @@ pub enum ZenithWalRecord { flags: u8, }, /// Mark transaction IDs as committed on a CLOG page - ClogSetCommitted { xids: Vec }, + ClogSetCommitted { + xids: Vec, + timestamp: TimestampTz, + }, /// Mark transaction IDs as aborted on a CLOG page ClogSetAborted { xids: Vec }, /// Extend multixact offsets SLRU diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 6338b839ae..777718b311 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -283,6 +283,11 @@ impl PostgresRedoManager 
{ // If something went wrong, don't try to reuse the process. Kill it, and // next request will launch a new one. if result.is_err() { + error!( + "error applying {} WAL records to reconstruct page image at LSN {}", + records.len(), + lsn + ); let process = process_guard.take().unwrap(); process.kill(); } @@ -387,7 +392,7 @@ impl PostgresRedoManager { } // Non-relational WAL records are handled here, with custom code that has the // same effects as the corresponding Postgres WAL redo function. - ZenithWalRecord::ClogSetCommitted { xids } => { + ZenithWalRecord::ClogSetCommitted { xids, timestamp } => { let (slru_kind, segno, blknum) = key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; assert_eq!( @@ -421,6 +426,21 @@ impl PostgresRedoManager { page, ); } + + // Append the timestamp + if page.len() == pg_constants::BLCKSZ as usize + 8 { + page.truncate(pg_constants::BLCKSZ as usize); + } + if page.len() == pg_constants::BLCKSZ as usize { + page.extend_from_slice(×tamp.to_be_bytes()); + } else { + warn!( + "CLOG blk {} in seg {} has invalid size {}", + blknum, + segno, + page.len() + ); + } } ZenithWalRecord::ClogSetAborted { xids } => { let (slru_kind, segno, blknum) = diff --git a/test_runner/batch_others/test_lsn_mapping.py b/test_runner/batch_others/test_lsn_mapping.py new file mode 100644 index 0000000000..37113b46f2 --- /dev/null +++ b/test_runner/batch_others/test_lsn_mapping.py @@ -0,0 +1,84 @@ +from contextlib import closing +from datetime import timedelta, timezone, tzinfo +import math +from uuid import UUID +import psycopg2.extras +import psycopg2.errors +from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres +from fixtures.log_helper import log +import time + + +# +# Test pageserver get_lsn_by_timestamp API +# +def test_lsn_mapping(zenith_env_builder: ZenithEnvBuilder): + zenith_env_builder.num_safekeepers = 1 + env = zenith_env_builder.init_start() + + new_timeline_id = env.zenith_cli.create_branch('test_lsn_mapping') + pgmain = env.postgres.create_start("test_lsn_mapping") + log.info("postgres is running on 'test_lsn_mapping' branch") + + ps_conn = env.pageserver.connect() + ps_cur = ps_conn.cursor() + conn = pgmain.connect() + cur = conn.cursor() + + # Create table, and insert rows, each in a separate transaction + # Disable synchronous_commit to make this initialization go faster. + # + # Each row contains current insert LSN and the current timestamp, when + # the row was inserted. 
+ cur.execute("SET synchronous_commit=off") + cur.execute("CREATE TABLE foo (x integer)") + tbl = [] + for i in range(1000): + cur.execute(f"INSERT INTO foo VALUES({i})") + cur.execute(f'SELECT clock_timestamp()') + # Get the timestamp at UTC + after_timestamp = cur.fetchone()[0].replace(tzinfo=None) + tbl.append([i, after_timestamp]) + + # Execute one more transaction with synchronous_commit enabled, to flush + # all the previous transactions + cur.execute("SET synchronous_commit=on") + cur.execute("INSERT INTO foo VALUES (-1)") + + # Check edge cases: timestamp in the future + probe_timestamp = tbl[-1][1] + timedelta(hours=1) + ps_cur.execute( + f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" + ) + result = ps_cur.fetchone()[0] + assert result == 'future' + + # timestamp too the far history + probe_timestamp = tbl[0][1] - timedelta(hours=10) + ps_cur.execute( + f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" + ) + result = ps_cur.fetchone()[0] + assert result == 'past' + + # Probe a bunch of timestamps in the valid range + for i in range(1, len(tbl), 100): + probe_timestamp = tbl[i][1] + + # Call get_lsn_by_timestamp to get the LSN + ps_cur.execute( + f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" + ) + lsn = ps_cur.fetchone()[0] + + # Launch a new read-only node at that LSN, and check that only the rows + # that were supposed to be committed at that point in time are visible. + pg_here = env.postgres.create_start(branch_name='test_lsn_mapping', + node_name='test_lsn_mapping_read', + lsn=lsn) + with closing(pg_here.connect()) as conn_here: + with conn_here.cursor() as cur_here: + cur_here.execute("SELECT max(x) FROM foo") + assert cur_here.fetchone()[0] == i + + pg_here.stop_and_destroy() diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 5614cea68b..5b25b1c457 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1572,6 +1572,7 @@ class Postgres(PgProtocol): assert self.node_name is not None self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, True) self.node_name = None + self.running = False return self From ff7e9a86c6f61a9c23f538904f7d378126a6597e Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 3 May 2022 12:00:42 +0300 Subject: [PATCH 0230/1022] turn panic into an error with more details --- pageserver/src/layered_repository.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 59e73d961d..1205f8d867 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1504,12 +1504,20 @@ impl LayeredTimeline { let ancestor = self .ancestor_timeline .as_ref() - .expect("there should be an ancestor") + .with_context(|| { + format!( + "Ancestor is missing. Timeline id: {} Ancestor id {:?}", + self.timelineid, + self.get_ancestor_timeline_id(), + ) + })? .ensure_loaded() .with_context(|| { format!( - "Cannot get the whole layer for read locked: timeline {} is not present locally", - self.get_ancestor_timeline_id().unwrap()) + "Ancestor timeline is not is not loaded. 
Timeline id: {} Ancestor id {:?}", + self.timelineid, + self.get_ancestor_timeline_id(), + ) })?; Ok(Arc::clone(ancestor)) } From e7cba0b60722af46742094fa43c4def394cc010a Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 2 May 2022 23:36:15 +0300 Subject: [PATCH 0231/1022] use thiserror instead of anyhow in disk_btree --- .../src/layered_repository/disk_btree.rs | 105 ++++++++++++------ 1 file changed, 70 insertions(+), 35 deletions(-) diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/layered_repository/disk_btree.rs index 7a9fe6f2b7..e747192d96 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/layered_repository/disk_btree.rs @@ -11,7 +11,6 @@ //! - page-oriented //! //! TODO: -//! - better errors (e.g. with thiserror?) //! - maybe something like an Adaptive Radix Tree would be more efficient? //! - the values stored by image and delta layers are offsets into the file, //! and they are in monotonically increasing order. Prefix compression would @@ -19,11 +18,12 @@ //! - An Iterator interface would be more convenient for the callers than the //! 'visit' function //! -use anyhow; use byteorder::{ReadBytesExt, BE}; use bytes::{BufMut, Bytes, BytesMut}; use hex; -use std::cmp::Ordering; +use std::{cmp::Ordering, io, result}; +use thiserror::Error; +use tracing::error; use crate::layered_repository::block_io::{BlockReader, BlockWriter}; @@ -86,6 +86,23 @@ impl Value { } } +#[derive(Error, Debug)] +pub enum DiskBtreeError { + #[error("Attempt to append a value that is too large {0} > {}", MAX_VALUE)] + AppendOverflow(u64), + + #[error("Unsorted input: key {key:?} is <= last_key {last_key:?}")] + UnsortedInput { key: Box<[u8]>, last_key: Box<[u8]> }, + + #[error("Could not push to new leaf node")] + FailedToPushToNewLeafNode, + + #[error("IoError: {0}")] + Io(#[from] io::Error), +} + +pub type Result = result::Result; + /// This is the on-disk representation. struct OnDiskNode<'a, const L: usize> { // Fixed-width fields @@ -106,12 +123,12 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { /// /// Interpret a PAGE_SZ page as a node. /// - fn deparse(buf: &[u8]) -> OnDiskNode { + fn deparse(buf: &[u8]) -> Result> { let mut cursor = std::io::Cursor::new(buf); - let num_children = cursor.read_u16::().unwrap(); - let level = cursor.read_u8().unwrap(); - let prefix_len = cursor.read_u8().unwrap(); - let suffix_len = cursor.read_u8().unwrap(); + let num_children = cursor.read_u16::()?; + let level = cursor.read_u8()?; + let prefix_len = cursor.read_u8()?; + let suffix_len = cursor.read_u8()?; let mut off = cursor.position(); let prefix_off = off as usize; @@ -129,7 +146,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { let keys = &buf[keys_off..keys_off + keys_len]; let values = &buf[values_off..values_off + values_len]; - OnDiskNode { + Ok(OnDiskNode { num_children, level, prefix_len, @@ -137,7 +154,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { prefix, keys, values, - } + }) } /// @@ -149,7 +166,11 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { Value::from_slice(value_slice) } - fn binary_search(&self, search_key: &[u8; L], keybuf: &mut [u8]) -> Result { + fn binary_search( + &self, + search_key: &[u8; L], + keybuf: &mut [u8], + ) -> result::Result { let mut size = self.num_children as usize; let mut low = 0; let mut high = size; @@ -209,7 +230,7 @@ where /// /// Read the value for given key. Returns the value, or None if it doesn't exist. 
/// - pub fn get(&self, search_key: &[u8; L]) -> anyhow::Result> { + pub fn get(&self, search_key: &[u8; L]) -> Result> { let mut result: Option = None; self.visit(search_key, VisitDirection::Forwards, |key, value| { if key == search_key { @@ -230,7 +251,7 @@ where search_key: &[u8; L], dir: VisitDirection, mut visitor: V, - ) -> anyhow::Result + ) -> Result where V: FnMut(&[u8], u64) -> bool, { @@ -243,7 +264,7 @@ where search_key: &[u8; L], dir: VisitDirection, visitor: &mut V, - ) -> anyhow::Result + ) -> Result where V: FnMut(&[u8], u64) -> bool, { @@ -260,11 +281,11 @@ where search_key: &[u8; L], dir: VisitDirection, visitor: &mut V, - ) -> anyhow::Result + ) -> Result where V: FnMut(&[u8], u64) -> bool, { - let node = OnDiskNode::deparse(node_buf); + let node = OnDiskNode::deparse(node_buf)?; let prefix_len = node.prefix_len as usize; let suffix_len = node.suffix_len as usize; @@ -369,15 +390,15 @@ where } #[allow(dead_code)] - pub fn dump(&self) -> anyhow::Result<()> { + pub fn dump(&self) -> Result<()> { self.dump_recurse(self.root_blk, &[], 0) } - fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> anyhow::Result<()> { + fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> { let blk = self.reader.read_blk(self.start_blk + blknum)?; let buf: &[u8] = blk.as_ref(); - let node = OnDiskNode::::deparse(buf); + let node = OnDiskNode::::deparse(buf)?; print!("{:indent$}", "", indent = depth * 2); println!( @@ -442,17 +463,24 @@ where } } - pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<(), anyhow::Error> { - assert!(value <= MAX_VALUE); + pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<()> { + if value > MAX_VALUE { + return Err(DiskBtreeError::AppendOverflow(value)); + } if let Some(last_key) = &self.last_key { - assert!(key > last_key, "unsorted input"); + if key <= last_key { + return Err(DiskBtreeError::UnsortedInput { + key: key.as_slice().into(), + last_key: last_key.as_slice().into(), + }); + } } self.last_key = Some(*key); - Ok(self.append_internal(key, Value::from_u64(value))?) + self.append_internal(key, Value::from_u64(value)) } - fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<(), std::io::Error> { + fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<()> { // Try to append to the current leaf buffer let last = self.stack.last_mut().unwrap(); let level = last.level; @@ -476,14 +504,15 @@ where // key to it. let mut last = BuildNode::new(level); if !last.push(key, value) { - panic!("could not push to new leaf node"); + return Err(DiskBtreeError::FailedToPushToNewLeafNode); } + self.stack.push(last); Ok(()) } - fn flush_node(&mut self) -> Result<(), std::io::Error> { + fn flush_node(&mut self) -> Result<()> { let last = self.stack.pop().unwrap(); let buf = last.pack(); let downlink_key = last.first_key(); @@ -505,7 +534,7 @@ where /// (In the image and delta layers, it is stored in the beginning of the file, /// in the summary header) /// - pub fn finish(mut self) -> Result<(u32, W), std::io::Error> { + pub fn finish(mut self) -> Result<(u32, W)> { // flush all levels, except the root. 
while self.stack.len() > 1 { self.flush_node()?; @@ -692,14 +721,14 @@ mod tests { impl BlockReader for TestDisk { type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>; - fn read_blk(&self, blknum: u32) -> Result { + fn read_blk(&self, blknum: u32) -> io::Result { let mut buf = [0u8; PAGE_SZ]; buf.copy_from_slice(&self.blocks[blknum as usize]); Ok(std::rc::Rc::new(buf)) } } impl BlockWriter for &mut TestDisk { - fn write_blk(&mut self, buf: Bytes) -> Result { + fn write_blk(&mut self, buf: Bytes) -> io::Result { let blknum = self.blocks.len(); self.blocks.push(buf); Ok(blknum as u32) @@ -707,7 +736,7 @@ mod tests { } #[test] - fn basic() -> anyhow::Result<()> { + fn basic() -> Result<()> { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk); @@ -788,7 +817,7 @@ mod tests { } #[test] - fn lots_of_keys() -> anyhow::Result<()> { + fn lots_of_keys() -> Result<()> { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk); @@ -882,7 +911,7 @@ mod tests { } #[test] - fn random_data() -> anyhow::Result<()> { + fn random_data() -> Result<()> { // Generate random keys with exponential distribution, to // exercise the prefix compression const NUM_KEYS: usize = 100000; @@ -927,21 +956,27 @@ mod tests { } #[test] - #[should_panic(expected = "unsorted input")] fn unsorted_input() { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 2>::new(&mut disk); let _ = writer.append(b"ba", 1); let _ = writer.append(b"bb", 2); - let _ = writer.append(b"aa", 3); + let err = writer.append(b"aa", 3).expect_err("should've failed"); + match err { + DiskBtreeError::UnsortedInput { key, last_key } => { + assert_eq!(key.as_ref(), b"aa".as_slice()); + assert_eq!(last_key.as_ref(), b"bb".as_slice()); + } + _ => panic!("unexpected error variant, expected DiskBtreeError::UnsortedInput"), + } } /// /// This test contains a particular data set, see disk_btree_test_data.rs /// #[test] - fn particular_data() -> anyhow::Result<()> { + fn particular_data() -> Result<()> { // Build a tree from it let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk); From 2f9b17b9e5b68ae2b469618ecfdbf64d4188f041 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 28 Apr 2022 11:19:41 +0300 Subject: [PATCH 0232/1022] Add simple test of pageserver recovery after crash. 
To cause a crash, use failpoints in checkpointer --- .circleci/config.yml | 2 +- pageserver/src/bin/pageserver.rs | 30 ++++++++++- pageserver/src/layered_repository.rs | 2 + test_runner/batch_others/test_recovery.py | 64 +++++++++++++++++++++++ test_runner/fixtures/zenith_fixtures.py | 13 +++++ 5 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 test_runner/batch_others/test_recovery.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 2ed079f031..864246ad2e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -121,7 +121,7 @@ jobs: export RUSTC_WRAPPER=cachepot export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" - "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests + "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests cachepot -s - save_cache: diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2139bea37e..6a5d4533d0 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -8,6 +8,7 @@ use anyhow::{bail, Context, Result}; use clap::{App, Arg}; use daemonize::Daemonize; +use fail::FailScenario; use pageserver::{ config::{defaults::*, PageServerConf}, http, page_cache, page_service, profiling, tenant_mgr, thread_mgr, @@ -84,8 +85,23 @@ fn main() -> anyhow::Result<()> { .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), ) + .arg( + Arg::new("enabled-features") + .long("enabled-features") + .takes_value(false) + .help("Show enabled compile time features"), + ) .get_matches(); + if arg_matches.is_present("enabled-features") { + let features: &[&str] = &[ + #[cfg(feature = "failpoints")] + "failpoints", + ]; + println!("{{\"features\": {features:?} }}"); + return Ok(()); + } + let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith")); let workdir = workdir .canonicalize() @@ -166,6 +182,14 @@ fn main() -> anyhow::Result<()> { // as a ref. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); + // If failpoints are used, terminate the whole pageserver process if they are hit. 
+ let scenario = FailScenario::setup(); + if fail::has_failpoints() { + std::panic::set_hook(Box::new(|_| { + std::process::exit(1); + })); + } + // Basic initialization of things that don't change after startup virtual_file::init(conf.max_file_descriptors); page_cache::init(conf.page_cache_size); @@ -181,10 +205,12 @@ fn main() -> anyhow::Result<()> { cfg_file_path.display() ) })?; - Ok(()) } else { - start_pageserver(conf, daemonize).context("Failed to start pageserver") + start_pageserver(conf, daemonize).context("Failed to start pageserver")?; } + + scenario.teardown(); + Ok(()) } fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> { diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 1205f8d867..e678c8f4cb 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1703,6 +1703,7 @@ impl LayeredTimeline { new_delta_path.clone(), self.conf.timeline_path(&self.timelineid, &self.tenantid), ])?; + fail_point!("checkpoint-before-sync"); fail_point!("flush-frozen"); @@ -1727,6 +1728,7 @@ impl LayeredTimeline { // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing // *all* the layers, to avoid fsyncing the file multiple times. let disk_consistent_lsn = Lsn(frozen_layer.get_lsn_range().end.0 - 1); + fail_point!("checkpoint-after-sync"); // If we were able to advance 'disk_consistent_lsn', save it the metadata file. // After crash, we will restart WAL streaming and processing from that point. diff --git a/test_runner/batch_others/test_recovery.py b/test_runner/batch_others/test_recovery.py new file mode 100644 index 0000000000..dbfa943a7a --- /dev/null +++ b/test_runner/batch_others/test_recovery.py @@ -0,0 +1,64 @@ +import os +import time +import psycopg2.extras +import json +from ast import Assert +from contextlib import closing +from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.log_helper import log + + +# +# Test pageserver recovery after crash +# +def test_pageserver_recovery(zenith_env_builder: ZenithEnvBuilder): + zenith_env_builder.num_safekeepers = 1 + # Override default checkpointer settings to run it more often + zenith_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" + + env = zenith_env_builder.init() + + # Check if failpoints enables. 
Otherwise the test doesn't make sense + f = env.zenith_cli.pageserver_enabled_features() + + assert "failpoints" in f["features"], "Build pageserver with --features=failpoints option to run this test" + zenith_env_builder.start() + + # Create a branch for us + env.zenith_cli.create_branch("test_pageserver_recovery", "main") + + pg = env.postgres.create_start('test_pageserver_recovery') + log.info("postgres is running on 'test_pageserver_recovery' branch") + + connstr = pg.connstr() + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: + # Create and initialize test table + cur.execute("CREATE TABLE foo(x bigint)") + cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))") + + # Sleep for some time to let checkpoint create image layers + time.sleep(2) + + # Configure failpoints + pscur.execute( + "failpoints checkpoint-before-sync=sleep(2000);checkpoint-after-sync=panic") + + # Do some updates until pageserver is crashed + try: + while True: + cur.execute("update foo set x=x+1") + except Exception as err: + log.info(f"Excepted server crash {err}") + + log.info("Wait before server restart") + env.pageserver.stop() + env.pageserver.start() + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("select count(*) from foo") + assert cur.fetchone() == (100000, ) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 5b25b1c457..9319a53778 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -980,6 +980,19 @@ class ZenithCli: res.check_returncode() return res + def pageserver_enabled_features(self) -> Any: + bin_pageserver = os.path.join(str(zenith_binpath), 'pageserver') + args = [bin_pageserver, '--enabled-features'] + log.info('Running command "{}"'.format(' '.join(args))) + + res = subprocess.run(args, + check=True, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + log.info(f"pageserver_enabled_features success: {res.stdout}") + return json.loads(res.stdout) + def pageserver_start(self, overrides=()) -> 'subprocess.CompletedProcess[str]': start_args = ['pageserver', 'start', *overrides] append_pageserver_param_overrides(start_args, From 2f83f793bc3f5cf4008904a91f34383bd0350439 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 3 May 2022 17:14:58 +0300 Subject: [PATCH 0233/1022] print more details when thread fails --- pageserver/src/thread_mgr.rs | 42 +++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index f7f8467ae0..b908f220ee 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -130,12 +130,14 @@ struct PageServerThread { } /// Launch a new thread +/// Note: if shutdown_process_on_error is set to true failure +/// of the thread will lead to shutdown of entire process pub fn spawn( kind: ThreadKind, tenant_id: Option, timeline_id: Option, name: &str, - fail_on_error: bool, + shutdown_process_on_error: bool, f: F, ) -> std::io::Result<()> where @@ -175,7 +177,7 @@ where thread_id, thread_rc2, shutdown_rx, - fail_on_error, + shutdown_process_on_error, f, ) }) { @@ -201,7 +203,7 @@ fn thread_wrapper( thread_id: u64, thread: Arc, shutdown_rx: watch::Receiver<()>, - fail_on_error: bool, + shutdown_process_on_error: bool, f: F, ) where F: FnOnce() -> 
anyhow::Result<()> + Send + 'static, @@ -221,27 +223,41 @@ fn thread_wrapper( let result = panic::catch_unwind(AssertUnwindSafe(f)); // Remove our entry from the global hashmap. - THREADS.lock().unwrap().remove(&thread_id); + let thread = THREADS + .lock() + .unwrap() + .remove(&thread_id) + .expect("no thread in registry"); match result { Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name), Ok(Err(err)) => { - if fail_on_error { + if shutdown_process_on_error { error!( - "Shutting down: thread '{}' exited with error: {:?}", - thread_name, err + "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", + thread_name, thread.tenant_id, thread.timeline_id, err ); shutdown_pageserver(1); } else { - error!("Thread '{}' exited with error: {:?}", thread_name, err); + error!( + "Thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", + thread_name, thread.tenant_id, thread.timeline_id, err + ); } } Err(err) => { - error!( - "Shutting down: thread '{}' panicked: {:?}", - thread_name, err - ); - shutdown_pageserver(1); + if shutdown_process_on_error { + error!( + "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", + thread_name, thread.tenant_id, thread.timeline_id, err + ); + shutdown_pageserver(1); + } else { + error!( + "Thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", + thread_name, thread.tenant_id, thread.timeline_id, err + ); + } } } } From 5642d0b2b86e967eb2b8f71dcb7540f815c22ed6 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 3 May 2022 23:57:24 +0300 Subject: [PATCH 0234/1022] Change shutdown_process_on_error thread spawn settings. The principle is now as follows: an error in an acceptor thread (libpq or http) will bring the whole pageserver down, while per-tenant thread failures will only be logged as errors.
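
For illustration only, a minimal self-contained sketch of that policy (hypothetical names, not the pageserver's actual thread_mgr code): a wrapper catches both error returns and panics from the worker closure and terminates the process only when shutdown_process_on_error is set.

    use std::panic::{self, AssertUnwindSafe};
    use std::thread;
    use std::time::Duration;

    // Hypothetical, simplified policy: catch errors and panics from a worker
    // thread and decide whether the whole process must exit.
    fn spawn_with_policy<F>(name: &'static str, shutdown_process_on_error: bool, f: F)
    where
        F: FnOnce() -> Result<(), String> + Send + 'static,
    {
        thread::spawn(move || {
            match panic::catch_unwind(AssertUnwindSafe(f)) {
                Ok(Ok(())) => eprintln!("thread '{name}' exited normally"),
                Ok(Err(e)) => {
                    eprintln!("thread '{name}' exited with error: {e}");
                    if shutdown_process_on_error {
                        std::process::exit(1); // acceptor-style thread: fatal
                    }
                }
                Err(_) => {
                    eprintln!("thread '{name}' panicked");
                    if shutdown_process_on_error {
                        std::process::exit(1);
                    }
                }
            }
        });
    }

    fn main() {
        // Acceptor-style thread (libpq/http): failure takes the process down.
        spawn_with_policy("acceptor", true, || Ok(()));
        // Per-tenant thread (compactor, GC): failure is only logged.
        spawn_with_policy("tenant-worker", false, || Err("tenant gone".into()));
        thread::sleep(Duration::from_millis(50));
    }
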
--- pageserver/src/bin/pageserver.rs | 4 ++-- pageserver/src/tenant_mgr.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 6a5d4533d0..9cb7e6f13d 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -287,7 +287,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() None, None, "http_endpoint_thread", - false, + true, move || { let router = http::make_router(conf, auth_cloned, remote_index)?; endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher()) @@ -301,7 +301,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() None, None, "libpq endpoint thread", - false, + true, move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type), )?; diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 3e0a907d00..507e749e8c 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -244,7 +244,7 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> { Some(tenant_id), None, "Compactor thread", - true, + false, move || crate::tenant_threads::compact_loop(tenant_id), )?; @@ -253,7 +253,7 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> { Some(tenant_id), None, "GC thread", - true, + false, move || crate::tenant_threads::gc_loop(tenant_id), ) .with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}")); From 9dfa145c7c7fa826eef12ef36d710db1b40152a3 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 3 May 2022 23:51:47 +0300 Subject: [PATCH 0235/1022] tone down tenant not found error --- libs/utils/src/postgres_backend.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index fab3c388b1..857df0ec84 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -433,7 +433,12 @@ impl PostgresBackend { // full cause of the error, not just the top-level context + its trace. // We don't want to send that in the ErrorResponse though, // because it's not relevant to the compute node logs. 
- error!("query handler for '{}' failed: {:?}", query_string, e); + if query_string.starts_with("callmemaybe") { + // FIXME avoid printing a backtrace for tenant x not found errors until this is properly fixed + error!("query handler for '{}' failed: {}", query_string, e); + } else { + error!("query handler for '{}' failed: {:?}", query_string, e); + } self.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?; // TODO: untangle convoluted control flow if e.to_string().contains("failed to run") { From 51a0f2683bd299dfe30be511dafdf10dcfcf422d Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Wed, 4 May 2022 01:18:08 +0300 Subject: [PATCH 0236/1022] fix scram-proxy addresses --- .circleci/config.yml | 2 +- .circleci/helm-values/staging.proxy-scram.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 864246ad2e..85654b5d45 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -584,7 +584,7 @@ jobs: name: Re-deploy proxy command: | DOCKER_TAG=$(git log --oneline|wc -l) - helm upgrade zenith-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait + helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait deploy-release: diff --git a/.circleci/helm-values/staging.proxy-scram.yaml b/.circleci/helm-values/staging.proxy-scram.yaml index d95ae3bfc2..f72a9d4557 100644 --- a/.circleci/helm-values/staging.proxy-scram.yaml +++ b/.circleci/helm-values/staging.proxy-scram.yaml @@ -20,7 +20,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: *.cloud.stage.neon.tech + external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech metrics: enabled: true From 748c5a577b5b4ed9f3dee1a0cc85724893883c2e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 4 May 2022 10:54:44 +0300 Subject: [PATCH 0237/1022] Bump vendor/postgres. (#1616) Includes fix for https://github.com/neondatabase/neon/issues/1615 --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index a13fe64a3e..868e7be7ff 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit a13fe64a3eff1743ff17141a2e6057f5103829f0 +Subproject commit 868e7be7ff7dd1d026917892b3951f812e9d4a08 From b9fd8a36ad3b7cccc98d71930ef18338c34aa2d7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sun, 1 May 2022 16:58:34 +0400 Subject: [PATCH 0238/1022] Remember timeline_start_lsn and local_start_lsn on safekeeper. Make it remember when timeline starts in general and on this safekeeper in particular (the point might be later on new safekeeper replacing failed one). Bumps control file and walproposer protocol versions. While protocol is bumped, also add safekeeper node id to AcceptorProposerGreeting. 
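
As a hedged illustration of the intended semantics (simplified, hypothetical types; the real handling is in the ProposerElected code in the diff below): timeline_start_lsn is recorded once for the whole timeline and never moves, while local_start_lsn records where this particular safekeeper's WAL begins, which can be later when the node joined after the timeline started.

    // Hypothetical simplified sketch, not the actual safekeeper code.
    #[derive(Clone, Copy, Default, Debug, PartialEq)]
    struct Lsn(u64);

    #[derive(Default, Debug)]
    struct StartLsns {
        timeline_start_lsn: Lsn, // where WAL begins for the whole timeline
        local_start_lsn: Lsn,    // where WAL begins on this safekeeper
    }

    impl StartLsns {
        // Called when an elected proposer tells us where it will stream from.
        fn on_elected(&mut self, timeline_start_lsn: Lsn, start_streaming_at: Lsn) {
            if self.timeline_start_lsn == Lsn(0) {
                self.timeline_start_lsn = timeline_start_lsn; // set once
            }
            if self.local_start_lsn == Lsn(0) {
                // a safekeeper added later starts at whatever point streaming begins
                self.local_start_lsn = start_streaming_at;
            }
        }
    }

    fn main() {
        let mut s = StartLsns::default();
        // A replacement safekeeper joining at 0x300 of a timeline that began at 0x100:
        s.on_elected(Lsn(0x100), Lsn(0x300));
        assert_eq!(s.timeline_start_lsn, Lsn(0x100));
        assert_eq!(s.local_start_lsn, Lsn(0x300));
    }
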
ref #1561 --- safekeeper/src/control_file_upgrade.rs | 43 ++++++++++++++ safekeeper/src/http/routes.rs | 6 ++ safekeeper/src/json_ctrl.rs | 3 +- safekeeper/src/safekeeper.rs | 57 ++++++++++++++++--- safekeeper/src/timeline.rs | 4 +- test_runner/batch_others/test_wal_acceptor.py | 10 +++- test_runner/fixtures/zenith_fixtures.py | 4 +- 7 files changed, 114 insertions(+), 13 deletions(-) diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 0cb14298cb..d11206eff6 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -103,6 +103,43 @@ pub struct SafeKeeperStateV3 { pub wal_start_lsn: Lsn, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SafeKeeperStateV4 { + #[serde(with = "hex")] + pub tenant_id: ZTenantId, + /// Zenith timelineid + #[serde(with = "hex")] + pub timeline_id: ZTimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealed with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Part of WAL acknowledged by quorum and available locally. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// First LSN not yet offloaded to s3. Useful to persist to avoid finding + /// out offloading progress on boot. + pub s3_wal_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + // Peers and their state as we remember it. Knowing peers themselves is + // fundamental; but state is saved here only for informational purposes and + // obviously can be stale. (Currently not saved at all, but let's provision + // place to have less file version upgrades). 
+ pub peers: Peers, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -125,6 +162,8 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result wal_seg_size: oldstate.server.wal_seg_size, }, proposer_uuid: oldstate.proposer_uuid, + timeline_start_lsn: Lsn(0), + local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, s3_wal_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, @@ -146,6 +185,8 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result acceptor_state: oldstate.acceptor_state, server, proposer_uuid: oldstate.proposer_uuid, + timeline_start_lsn: Lsn(0), + local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, s3_wal_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, @@ -167,6 +208,8 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result acceptor_state: oldstate.acceptor_state, server, proposer_uuid: oldstate.proposer_uuid, + timeline_start_lsn: Lsn(0), + local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, s3_wal_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index fab8724430..d7cbcb094e 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -69,6 +69,10 @@ struct TimelineStatus { timeline_id: ZTimelineId, acceptor_state: AcceptorStateStatus, #[serde(serialize_with = "display_serialize")] + timeline_start_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + local_start_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] commit_lsn: Lsn, #[serde(serialize_with = "display_serialize")] s3_wal_lsn: Lsn, @@ -102,6 +106,8 @@ async fn timeline_status_handler(request: Request) -> Result Result<()> { let greeting_request = ProposerAcceptorMessage::Greeting(ProposerGreeting { - protocol_version: 1, // current protocol + protocol_version: 2, // current protocol pg_version: 0, // unknown proposer_id: [0u8; 16], system_id: 0, @@ -124,6 +124,7 @@ fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: L term, start_streaming_at: lsn, term_history: history, + timeline_start_lsn: Lsn(0), }); spg.timeline.get().process_msg(&proposer_elected_request)?; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 048753152b..67d41d0b58 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -30,8 +30,8 @@ use utils::{ }; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 4; -const SK_PROTOCOL_VERSION: u32 = 1; +pub const SK_FORMAT_VERSION: u32 = 5; +const SK_PROTOCOL_VERSION: u32 = 2; const UNKNOWN_SERVER_VERSION: u32 = 0; /// Consensus logical timestamp. @@ -52,7 +52,7 @@ impl TermHistory { } // Parse TermHistory as n_entries followed by TermSwitchEntry pairs - pub fn from_bytes(mut bytes: Bytes) -> Result { + pub fn from_bytes(bytes: &mut Bytes) -> Result { if bytes.remaining() < 4 { bail!("TermHistory misses len"); } @@ -183,6 +183,13 @@ pub struct SafeKeeperState { /// for correctness, exists for monitoring purposes. #[serde(with = "hex")] pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, /// Part of WAL acknowledged by quorum and available locally. 
Always points /// to record boundary. pub commit_lsn: Lsn, @@ -231,6 +238,8 @@ impl SafeKeeperState { wal_seg_size: 0, }, proposer_uuid: [0; 16], + timeline_start_lsn: Lsn(0), + local_start_lsn: Lsn(0), commit_lsn: Lsn(0), s3_wal_lsn: Lsn(0), peer_horizon_lsn: Lsn(0), @@ -268,6 +277,7 @@ pub struct ProposerGreeting { #[derive(Debug, Serialize)] pub struct AcceptorGreeting { term: u64, + node_id: ZNodeId, } /// Vote request sent from proposer to safekeepers @@ -286,6 +296,7 @@ pub struct VoteResponse { flush_lsn: Lsn, truncate_lsn: Lsn, term_history: TermHistory, + timeline_start_lsn: Lsn, } /* @@ -297,6 +308,7 @@ pub struct ProposerElected { pub term: Term, pub start_streaming_at: Lsn, pub term_history: TermHistory, + pub timeline_start_lsn: Lsn, } /// Request with WAL message sent from proposer to safekeeper. Along the way it @@ -387,10 +399,15 @@ impl ProposerAcceptorMessage { } let term = msg_bytes.get_u64_le(); let start_streaming_at = msg_bytes.get_u64_le().into(); - let term_history = TermHistory::from_bytes(msg_bytes)?; + let term_history = TermHistory::from_bytes(&mut msg_bytes)?; + if msg_bytes.remaining() < 8 { + bail!("ProposerElected message is not complete"); + } + let timeline_start_lsn = msg_bytes.get_u64_le().into(); let msg = ProposerElected { term, start_streaming_at, + timeline_start_lsn, term_history, }; Ok(ProposerAcceptorMessage::Elected(msg)) @@ -437,6 +454,7 @@ impl AcceptorProposerMessage { AcceptorProposerMessage::Greeting(msg) => { buf.put_u64_le('g' as u64); buf.put_u64_le(msg.term); + buf.put_u64_le(msg.node_id.0); } AcceptorProposerMessage::VoteResponse(msg) => { buf.put_u64_le('v' as u64); @@ -449,6 +467,7 @@ impl AcceptorProposerMessage { buf.put_u64_le(e.term); buf.put_u64_le(e.lsn.into()); } + buf.put_u64_le(msg.timeline_start_lsn.into()); } AcceptorProposerMessage::AppendResponse(msg) => { buf.put_u64_le('a' as u64); @@ -511,6 +530,8 @@ pub struct SafeKeeper { pub state: CTRL, // persistent state storage pub wal_store: WAL, + + node_id: ZNodeId, // safekeeper's node id } impl SafeKeeper @@ -523,6 +544,7 @@ where ztli: ZTimelineId, state: CTRL, mut wal_store: WAL, + node_id: ZNodeId, ) -> Result> { if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id { bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id); @@ -544,6 +566,7 @@ where }, state, wal_store, + node_id, }) } @@ -635,6 +658,7 @@ where ); Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting { term: self.state.acceptor_state.term, + node_id: self.node_id, }))) } @@ -650,6 +674,7 @@ where flush_lsn: self.wal_store.flush_lsn(), truncate_lsn: self.state.peer_horizon_lsn, term_history: self.get_term_history(), + timeline_start_lsn: self.state.timeline_start_lsn, }; if self.state.acceptor_state.term < msg.term { let mut state = self.state.clone(); @@ -705,6 +730,23 @@ where // and now adopt term history from proposer { let mut state = self.state.clone(); + + // Remeber point where WAL begins globally, if not yet. + if state.timeline_start_lsn == Lsn(0) { + state.timeline_start_lsn = msg.timeline_start_lsn; + info!( + "setting timeline_start_lsn to {:?}", + state.timeline_start_lsn + ); + } + + // Remember point where WAL begins locally, if not yet. 
(I doubt the + // second condition is ever possible) + if state.local_start_lsn == Lsn(0) || state.local_start_lsn >= msg.start_streaming_at { + state.local_start_lsn = msg.start_streaming_at; + info!("setting local_start_lsn to {:?}", state.local_start_lsn); + } + state.acceptor_state.term_history = msg.term_history.clone(); self.state.persist(&state)?; } @@ -968,7 +1010,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store).unwrap(); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -983,7 +1025,7 @@ mod tests { let storage = InMemoryState { persisted_state: state, }; - sk = SafeKeeper::new(ztli, storage, sk.wal_store).unwrap(); + sk = SafeKeeper::new(ztli, storage, sk.wal_store, ZNodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -1000,7 +1042,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store).unwrap(); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, @@ -1023,6 +1065,7 @@ mod tests { term: 1, lsn: Lsn(3), }]), + timeline_start_lsn: Lsn(0), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .unwrap(); diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 4a507015d3..745d8e0893 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -102,7 +102,7 @@ impl SharedState { let state = SafeKeeperState::new(zttid, peer_ids); let control_store = control_file::FileStorage::create_new(zttid, conf, state)?; let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); - let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store)?; + let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?; Ok(Self { notified_commit_lsn: Lsn(0), @@ -125,7 +125,7 @@ impl SharedState { Ok(Self { notified_commit_lsn: Lsn(0), - sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store)?, + sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?, replicas: Vec::new(), active: false, num_computes: 0, diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 94059e2a4c..702c27a79b 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -573,7 +573,9 @@ def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] # fetch something sensible from status - epoch = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch + tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) + epoch = tli_status.acceptor_epoch + timeline_start_lsn = tli_status.timeline_start_lsn pg.safe_psql("create table t(i int)") @@ -581,9 +583,13 @@ def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): pg.stop().start() pg.safe_psql("insert into t values(10)") - epoch_after_reboot = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch + tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) + epoch_after_reboot = tli_status.acceptor_epoch assert epoch_after_reboot > epoch + # and timeline_start_lsn stays the same + 
assert tli_status.timeline_start_lsn == timeline_start_lsn + class SafekeeperEnv: def __init__(self, diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 9319a53778..d6d07d78d3 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1762,6 +1762,7 @@ class SafekeeperTimelineStatus: acceptor_epoch: int flush_lsn: str remote_consistent_lsn: str + timeline_start_lsn: str @dataclass @@ -1786,7 +1787,8 @@ class SafekeeperHttpClient(requests.Session): resj = res.json() return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'], flush_lsn=resj['flush_lsn'], - remote_consistent_lsn=resj['remote_consistent_lsn']) + remote_consistent_lsn=resj['remote_consistent_lsn'], + timeline_start_lsn=resj['timeline_start_lsn']) def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body): res = self.post( From e58c83870fdb318d866953183192dfe97dcb6db8 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 4 May 2022 13:36:31 +0400 Subject: [PATCH 0239/1022] Bump vendor/postgres to to send timeline_start_lsn. --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 868e7be7ff..ce3057955a 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 868e7be7ff7dd1d026917892b3951f812e9d4a08 +Subproject commit ce3057955ac962662c6fe0d00d793bfccedf7ca8 From b68e3b03ed851ed582841822a9d603f02d698b42 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 4 May 2022 16:19:21 +0400 Subject: [PATCH 0240/1022] Fix control file update for b9fd8a36ad3b --- safekeeper/src/control_file_upgrade.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index d11206eff6..22716de1a0 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -216,6 +216,29 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), }); + // migrate to having timeline_start_lsn + } else if version == 4 { + info!("reading safekeeper control file version {}", version); + let oldstate = SafeKeeperStateV4::des(&buf[..buf.len()])?; + let server = ServerInfo { + pg_version: oldstate.server.pg_version, + system_id: oldstate.server.system_id, + wal_seg_size: oldstate.server.wal_seg_size, + }; + return Ok(SafeKeeperState { + tenant_id: oldstate.tenant_id, + timeline_id: oldstate.timeline_id, + acceptor_state: oldstate.acceptor_state, + server, + proposer_uuid: oldstate.proposer_uuid, + timeline_start_lsn: Lsn(0), + local_start_lsn: Lsn(0), + commit_lsn: oldstate.commit_lsn, + s3_wal_lsn: Lsn(0), + peer_horizon_lsn: oldstate.peer_horizon_lsn, + remote_consistent_lsn: Lsn(0), + peers: Peers(vec![]), + }); } bail!("unsupported safekeeper control file version {}", version) } From e2cf77441df67a9b9a49cbdb2120096decb0e0da Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 3 May 2022 13:23:18 +0300 Subject: [PATCH 0241/1022] Implement pg_database_size(). In this implementation dbsize equals sum of all relation sizes, excluding shared ones. 
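
A small hedged sketch of that definition (constant and field names assumed for illustration; the actual implementation is handle_db_size_request in the diff below): the reported database size is the number of main-fork blocks of every non-shared relation multiplied by the block size.

    // Hypothetical, simplified restatement of the dbsize definition above.
    const BLCKSZ: i64 = 8192; // PostgreSQL block size

    struct RelSize {
        forknum: u8,   // 0 = main fork; fsm/vm forks are not counted
        n_blocks: i64, // relation fork size in blocks
    }

    // db size = sum over the database's non-shared relations of main-fork blocks * BLCKSZ
    fn db_size(rels: &[RelSize]) -> i64 {
        rels.iter()
            .filter(|r| r.forknum == 0)
            .map(|r| r.n_blocks * BLCKSZ)
            .sum()
    }

    fn main() {
        let rels = vec![
            RelSize { forknum: 0, n_blocks: 10 }, // heap main fork
            RelSize { forknum: 1, n_blocks: 1 },  // fsm, ignored
        ];
        assert_eq!(db_size(&rels), 10 * BLCKSZ);
        println!("db size = {} bytes", db_size(&rels));
    }
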
--- pageserver/src/page_service.rs | 56 +++++++++++++++++++ test_runner/batch_others/test_createdropdb.py | 11 +++- vendor/postgres | 2 +- 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index e584a101cd..da3dedfc84 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -44,11 +44,14 @@ use crate::CheckpointConfig; use metrics::{register_histogram_vec, HistogramVec}; use postgres_ffi::xlog_utils::to_pg_timestamp; +use postgres_ffi::pg_constants; + // Wrapped in libpq CopyData enum PagestreamFeMessage { Exists(PagestreamExistsRequest), Nblocks(PagestreamNblocksRequest), GetPage(PagestreamGetPageRequest), + DbSize(PagestreamDbSizeRequest), } // Wrapped in libpq CopyData @@ -57,6 +60,7 @@ enum PagestreamBeMessage { Nblocks(PagestreamNblocksResponse), GetPage(PagestreamGetPageResponse), Error(PagestreamErrorResponse), + DbSize(PagestreamDbSizeResponse), } #[derive(Debug)] @@ -81,6 +85,13 @@ struct PagestreamGetPageRequest { blkno: u32, } +#[derive(Debug)] +struct PagestreamDbSizeRequest { + latest: bool, + lsn: Lsn, + dbnode: u32, +} + #[derive(Debug)] struct PagestreamExistsResponse { exists: bool, @@ -101,6 +112,11 @@ struct PagestreamErrorResponse { message: String, } +#[derive(Debug)] +struct PagestreamDbSizeResponse { + db_size: i64, +} + impl PagestreamFeMessage { fn parse(mut body: Bytes) -> anyhow::Result { // TODO these gets can fail @@ -142,6 +158,11 @@ impl PagestreamFeMessage { }, blkno: body.get_u32(), })), + 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + dbnode: body.get_u32(), + })), _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body), } } @@ -172,6 +193,10 @@ impl PagestreamBeMessage { bytes.put(resp.message.as_bytes()); bytes.put_u8(0); // null terminator } + Self::DbSize(resp) => { + bytes.put_u8(104); /* tag from pagestore_client.h */ + bytes.put_i64(resp.db_size); + } } bytes.into() @@ -367,6 +392,11 @@ impl PageServerHandler { .observe_closure_duration(|| { self.handle_get_page_at_lsn_request(timeline.as_ref(), &req) }), + PagestreamFeMessage::DbSize(req) => SMGR_QUERY_TIME + .with_label_values(&["get_db_size", &tenant_id, &timeline_id]) + .observe_closure_duration(|| { + self.handle_db_size_request(timeline.as_ref(), &req) + }), }; let response = response.unwrap_or_else(|e| { @@ -487,6 +517,32 @@ impl PageServerHandler { })) } + fn handle_db_size_request( + &self, + timeline: &DatadirTimeline, + req: &PagestreamDbSizeRequest, + ) -> Result { + let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered(); + let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + + let all_rels = timeline.list_rels(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?; + let mut total_blocks: i64 = 0; + + for rel in all_rels { + if rel.forknum == 0 { + let n_blocks = timeline.get_rel_size(rel, lsn).unwrap_or(0); + total_blocks += n_blocks as i64; + } + } + + let db_size = total_blocks * pg_constants::BLCKSZ as i64; + + Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { + db_size, + })) + } + fn handle_get_page_at_lsn_request( &self, timeline: &DatadirTimeline, diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/batch_others/test_createdropdb.py index 88937fa0dc..24898be70a 100644 --- 
a/test_runner/batch_others/test_createdropdb.py +++ b/test_runner/batch_others/test_createdropdb.py @@ -32,7 +32,16 @@ def test_createdb(zenith_simple_env: ZenithEnv): # Test that you can connect to the new database on both branches for db in (pg, pg2): - db.connect(dbname='foodb').close() + with closing(db.connect(dbname='foodb')) as conn: + with conn.cursor() as cur: + # Check database size in both branches + cur.execute( + 'select pg_size_pretty(pg_database_size(%s)), pg_size_pretty(sum(pg_relation_size(oid))) from pg_class where relisshared is false;', + ('foodb', )) + res = cur.fetchone() + # check that dbsize equals sum of all relation sizes, excluding shared ones + # This is how we define dbsize in zenith for now + assert res[0] == res[1] # diff --git a/vendor/postgres b/vendor/postgres index ce3057955a..f8c12bb06c 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit ce3057955ac962662c6fe0d00d793bfccedf7ca8 +Subproject commit f8c12bb06c314e823dbc890229c28016c1f9a0fe From b8880bfaab048576034515ff2b8174b4dc21e260 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 4 May 2022 17:27:16 +0300 Subject: [PATCH 0242/1022] Bump vendor/postgres --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index f8c12bb06c..d35bd7132f 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit f8c12bb06c314e823dbc890229c28016c1f9a0fe +Subproject commit d35bd7132ff6ed600577934e5389c7657087fbe1 From c4bc604e5f7d08e785cfd48d6a11c60b3555c598 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Wed, 4 May 2022 11:23:04 -0400 Subject: [PATCH 0243/1022] Fix pg list table alignment #1633 Fixes #1628 - add [`comfy_table`](https://github.com/Nukesor/comfy-table/tree/main) and use it to construct table for `pg list` CLI command Comparison - Old: ``` NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS main 127.0.0.1:55432 3823dd05e35d71f6ccf33049de366d70 main 0/16FB140 running migration_check 127.0.0.1:55433 3823dd05e35d71f6ccf33049de366d70 main 0/16FB140 running ``` - New: ``` NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS main 127.0.0.1:55432 3823dd05e35d71f6ccf33049de366d70 main 0/16FB140 running migration_check 127.0.0.1:55433 3823dd05e35d71f6ccf33049de366d70 main 0/16FB140 running ``` --- Cargo.lock | 68 ++++++++++++++++++++++++++++++++++++++++++++++ zenith/Cargo.toml | 1 + zenith/src/main.rs | 29 ++++++++++++++------ 3 files changed, 90 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2c081e8beb..e9b24b2f84 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -330,6 +330,18 @@ dependencies = [ "memchr", ] +[[package]] +name = "comfy-table" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b103d85ca6e209388771bfb7aa6b68a7aeec4afbf6f0a0264bfbf50360e5212e" +dependencies = [ + "crossterm", + "strum", + "strum_macros", + "unicode-width", +] + [[package]] name = "compute_tools" version = "0.1.0" @@ -526,6 +538,31 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "crossterm" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2102ea4f781910f8a5b98dd061f4c2023f479ce7bb1236330099ceb5a93cf17" +dependencies = [ + "bitflags", + "crossterm_winapi", + "libc", + "mio", + "parking_lot 0.12.0", + "signal-hook", + "signal-hook-mio", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c" +dependencies = [ + "winapi", +] + [[package]] name = "crypto-common" version = "0.1.3" @@ -2664,6 +2701,17 @@ dependencies = [ "signal-hook-registry", ] +[[package]] +name = "signal-hook-mio" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + [[package]] name = "signal-hook-registry" version = "1.4.0" @@ -2753,6 +2801,25 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strum" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb" + +[[package]] +name = "strum_macros" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "subtle" version = "2.4.1" @@ -3642,6 +3709,7 @@ version = "0.1.0" dependencies = [ "anyhow", "clap 3.0.14", + "comfy-table", "control_plane", "pageserver", "postgres", diff --git a/zenith/Cargo.toml b/zenith/Cargo.toml index 0f72051f74..58f1f5751d 100644 --- a/zenith/Cargo.toml +++ b/zenith/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" clap = "3.0" anyhow = "1.0" serde_json = "1" +comfy-table = "5.0.1" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } # FIXME: 'pageserver' is needed for BranchInfo. 
Refactor diff --git a/zenith/src/main.rs b/zenith/src/main.rs index cd0cf470e8..ff2beec463 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -665,7 +665,19 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let timeline_name_mappings = env.timeline_name_mappings(); - println!("NODE\tADDRESS\tTIMELINE\tBRANCH NAME\tLSN\t\tSTATUS"); + let mut table = comfy_table::Table::new(); + + table.load_preset(comfy_table::presets::NOTHING); + + table.set_header(&[ + "NODE", + "ADDRESS", + "TIMELINE", + "BRANCH NAME", + "LSN", + "STATUS", + ]); + for ((_, node_name), node) in cplane .nodes .iter() @@ -684,16 +696,17 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { .map(|name| name.as_str()) .unwrap_or("?"); - println!( - "{}\t{}\t{}\t{}\t{}\t{}", - node_name, - node.address, - node.timeline_id, + table.add_row(&[ + node_name.as_str(), + &node.address.to_string(), + &node.timeline_id.to_string(), branch_name, - lsn_str, + lsn_str.as_str(), node.status(), - ); + ]); } + + println!("{table}"); } "create" => { let branch_name = sub_args From 02e5083695d0ed17f7dbb1ca852f504fca42fdcc Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 4 May 2022 12:45:01 -0400 Subject: [PATCH 0244/1022] Add hot page test (#1479) --- poetry.lock | 30 +++++++++++++++---- pyproject.toml | 1 + test_runner/fixtures/compare_fixtures.py | 5 +++- test_runner/fixtures/zenith_fixtures.py | 2 +- test_runner/performance/test_hot_page.py | 36 +++++++++++++++++++++++ test_runner/performance/test_hot_table.py | 35 ++++++++++++++++++++++ 6 files changed, 101 insertions(+), 8 deletions(-) create mode 100644 test_runner/performance/test_hot_page.py create mode 100644 test_runner/performance/test_hot_table.py diff --git a/poetry.lock b/poetry.lock index fe18ad226c..a7cbe0aa3c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -822,7 +822,7 @@ python-versions = "*" [[package]] name = "moto" -version = "3.0.4" +version = "3.1.7" description = "A library that allows your python tests to easily mock out the boto library" category = "main" optional = false @@ -844,6 +844,7 @@ importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} Jinja2 = ">=2.10.1" jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} MarkupSafe = "!=2.0.0a1" +pyparsing = {version = ">=3.0.0", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""} pytz = "*" @@ -855,7 +856,7 @@ werkzeug = "*" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "setuptools"] +all = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.0)", "setuptools"] apigateway = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)"] apigatewayv2 = ["PyYAML (>=5.1)"] appsync = ["graphql-core"] @@ -864,14 +865,16 @@ batch = ["docker (>=2.5.1)"] cloudformation = ["docker (>=2.5.1)", "PyYAML (>=5.1)", "cfn-lint (>=0.4.0)"] cognitoidp = ["python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)"] ds 
= ["sshpubkeys (>=3.1.0)"] +dynamodb = ["docker (>=2.5.1)"] dynamodb2 = ["docker (>=2.5.1)"] dynamodbstreams = ["docker (>=2.5.1)"] ec2 = ["sshpubkeys (>=3.1.0)"] efs = ["sshpubkeys (>=3.1.0)"] +glue = ["pyparsing (>=3.0.0)"] iotdata = ["jsondiff (>=1.1.2)"] route53resolver = ["sshpubkeys (>=3.1.0)"] s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "setuptools", "flask", "flask-cors"] +server = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.0)", "setuptools", "flask", "flask-cors"] ssm = ["PyYAML (>=5.1)", "dataclasses"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] @@ -1068,6 +1071,17 @@ python-versions = ">=3.6" py = "*" pytest = ">=3.10" +[[package]] +name = "pytest-lazy-fixture" +version = "0.6.3" +description = "It helps to use fixtures in pytest.mark.parametrize" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +pytest = ">=3.2.5" + [[package]] name = "pytest-xdist" version = "2.5.0" @@ -1361,7 +1375,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" python-versions = "^3.7" -content-hash = "58762accad4122026c650fa43421a900546e89f9908e2268410e7b11cc8c6c4e" +content-hash = "dc63b6e02d0ceccdc4b5616e9362c149a27fdcc6c54fda63a3b115a5b980c42e" [metadata.files] aiopg = [ @@ -1679,8 +1693,8 @@ mccabe = [ {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, ] moto = [ - {file = "moto-3.0.4-py2.py3-none-any.whl", hash = "sha256:79646213d8438385182f4eea79e28725f94b3d0d3dc9a3eda81db47e0ebef6cc"}, - {file = "moto-3.0.4.tar.gz", hash = "sha256:168b8a3cb4dd8a6df8e51d582761cefa9657b9f45ac7e1eb24dae394ebc9e000"}, + {file = "moto-3.1.7-py3-none-any.whl", hash = "sha256:4ab6fb8dd150343e115d75e3dbdb5a8f850fc7236790819d7cef438c11ee6e89"}, + {file = "moto-3.1.7.tar.gz", hash = "sha256:20607a0fd0cf6530e05ffb623ca84d3f45d50bddbcec2a33705a0cf471e71289"}, ] mypy = [ {file = "mypy-0.910-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457"}, @@ -1855,6 +1869,10 @@ pytest-forked = [ {file = "pytest-forked-1.4.0.tar.gz", hash = "sha256:8b67587c8f98cbbadfdd804539ed5455b6ed03802203485dd2f53c1422d7440e"}, {file = "pytest_forked-1.4.0-py3-none-any.whl", hash = "sha256:bbbb6717efc886b9d64537b41fb1497cfaf3c9601276be8da2cccfea5a3c8ad8"}, ] +pytest-lazy-fixture = [ + {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, + {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, +] pytest-xdist = [ {file = "pytest-xdist-2.5.0.tar.gz", hash = "sha256:4580deca3ff04ddb2ac53eba39d76cb5dd5edeac050cb6fbc768b0dd712b4edf"}, {file = "pytest_xdist-2.5.0-py3-none-any.whl", hash = "sha256:6fe5c74fec98906deb8f2d2b616b5c782022744978e7bd4695d39c8f42d0ce65"}, diff --git a/pyproject.toml b/pyproject.toml index 7dbdcc0304..335c6d61d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ boto3 = "^1.20.40" boto3-stubs = 
"^1.20.40" moto = {version = "^3.0.0", extras = ["server"]} backoff = "^1.11.1" +pytest-lazy-fixture = "^0.6.3" [tool.poetry.dev-dependencies] yapf = "==0.31.0" diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 93912d2da7..d70f57aa52 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -130,7 +130,10 @@ class VanillaCompare(PgCompare): def __init__(self, zenbenchmark, vanilla_pg: VanillaPostgres): self._pg = vanilla_pg self._zenbenchmark = zenbenchmark - vanilla_pg.configure(['shared_buffers=1MB']) + vanilla_pg.configure([ + 'shared_buffers=1MB', + 'synchronous_commit=off', + ]) vanilla_pg.start() # Long-lived cursor, useful for flushing diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index d6d07d78d3..784d2d4b26 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1315,7 +1315,7 @@ class VanillaPostgres(PgProtocol): """Append lines into postgresql.conf file.""" assert not self.running with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file: - conf_file.writelines(options) + conf_file.write("\n".join(options)) def start(self, log_path: Optional[str] = None): assert not self.running diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py new file mode 100644 index 0000000000..2042b0d548 --- /dev/null +++ b/test_runner/performance/test_hot_page.py @@ -0,0 +1,36 @@ +import pytest +from contextlib import closing +from fixtures.compare_fixtures import PgCompare +from pytest_lazyfixture import lazy_fixture # type: ignore + + +@pytest.mark.parametrize( + "env", + [ + # The test is too slow to run in CI, but fast enough to run with remote tests + pytest.param(lazy_fixture("zenith_compare"), id="zenith", marks=pytest.mark.slow), + pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), + pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), + ]) +def test_hot_page(env: PgCompare): + # Update the same page many times, then measure read performance + num_writes = 1000000 + + with closing(env.pg.connect()) as conn: + with conn.cursor() as cur: + + # Write many updates to the same row + with env.record_duration('write'): + cur.execute('create table t (i integer);') + cur.execute('insert into t values (0);') + for i in range(num_writes): + cur.execute(f'update t set i = {i};') + + # Write 3-4 MB to evict t from compute cache + cur.execute('create table f (i integer);') + cur.execute(f'insert into f values (generate_series(1,100000));') + + # Read + with env.record_duration('read'): + cur.execute('select * from t;') + cur.fetchall() diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py new file mode 100644 index 0000000000..11e047b8c3 --- /dev/null +++ b/test_runner/performance/test_hot_table.py @@ -0,0 +1,35 @@ +import pytest +from contextlib import closing +from fixtures.compare_fixtures import PgCompare +from pytest_lazyfixture import lazy_fixture # type: ignore + + +@pytest.mark.parametrize( + "env", + [ + # The test is too slow to run in CI, but fast enough to run with remote tests + pytest.param(lazy_fixture("zenith_compare"), id="zenith", marks=pytest.mark.slow), + pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), + pytest.param(lazy_fixture("remote_compare"), id="remote", 
marks=pytest.mark.remote_cluster), + ]) +def test_hot_table(env: PgCompare): + # Update a small table many times, then measure read performance + num_rows = 100000 # Slightly larger than shared buffers size TODO validate + num_writes = 1000000 + num_reads = 10 + + with closing(env.pg.connect()) as conn: + with conn.cursor() as cur: + + # Write many updates to a small table + with env.record_duration('write'): + cur.execute('create table t (i integer primary key);') + cur.execute(f'insert into t values (generate_series(1,{num_rows}));') + for i in range(num_writes): + cur.execute(f'update t set i = {i + num_rows} WHERE i = {i};') + + # Read the table + with env.record_duration('read'): + for i in range(num_reads): + cur.execute('select * from t;') + cur.fetchall() From bc569dde51639073cf241369f3fc872121d0c811 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 4 May 2022 17:41:05 -0400 Subject: [PATCH 0245/1022] Remove some unwraps from waldecoder (#1539) --- libs/postgres_ffi/src/waldecoder.rs | 22 +++++++-- libs/postgres_ffi/src/xlog_utils.rs | 46 ++++++++++--------- pageserver/src/basebackup.rs | 5 +- pageserver/src/import_datadir.rs | 2 +- .../src/layered_repository/delta_layer.rs | 2 +- .../src/layered_repository/inmemory_layer.rs | 2 +- pageserver/src/walingest.rs | 5 +- pageserver/src/walrecord.rs | 32 ++++++------- safekeeper/src/json_ctrl.rs | 4 +- 9 files changed, 70 insertions(+), 50 deletions(-) diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index 9d1089ed46..95ea9660e8 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -89,7 +89,12 @@ impl WalStreamDecoder { return Ok(None); } - let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf); + let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| { + WalDecodeError { + msg: format!("long header deserialization failed {}", e), + lsn: self.lsn, + } + })?; if hdr.std.xlp_pageaddr != self.lsn.0 { return Err(WalDecodeError { @@ -106,7 +111,12 @@ impl WalStreamDecoder { return Ok(None); } - let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf); + let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| { + WalDecodeError { + msg: format!("header deserialization failed {}", e), + lsn: self.lsn, + } + })?; if hdr.xlp_pageaddr != self.lsn.0 { return Err(WalDecodeError { @@ -188,7 +198,13 @@ impl WalStreamDecoder { } // We now have a record in the 'recordbuf' local variable. 
- let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]); + let xlogrec = + XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| { + WalDecodeError { + msg: format!("xlog record deserialization failed {}", e), + lsn: self.lsn, + } + })?; let mut crc = 0; crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]); diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index bd4b7df690..7882058868 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -15,7 +15,7 @@ use crate::XLogPageHeaderData; use crate::XLogRecord; use crate::XLOG_PAGE_MAGIC; -use anyhow::{bail, Result}; +use anyhow::bail; use byteorder::{ByteOrder, LittleEndian}; use bytes::BytesMut; use bytes::{Buf, Bytes}; @@ -28,6 +28,8 @@ use std::io::prelude::*; use std::io::SeekFrom; use std::path::{Path, PathBuf}; use std::time::SystemTime; +use utils::bin_ser::DeserializeError; +use utils::bin_ser::SerializeError; use utils::lsn::Lsn; pub const XLOG_FNAME_LEN: usize = 24; @@ -144,7 +146,7 @@ fn find_end_of_wal_segment( tli: TimeLineID, wal_seg_size: usize, start_offset: usize, // start reading at this point -) -> Result { +) -> anyhow::Result { // step back to the beginning of the page to read it in... let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ; let mut contlen: usize = 0; @@ -272,7 +274,7 @@ pub fn find_end_of_wal( wal_seg_size: usize, precise: bool, start_lsn: Lsn, // start reading WAL at this point or later -) -> Result<(XLogRecPtr, TimeLineID)> { +) -> anyhow::Result<(XLogRecPtr, TimeLineID)> { let mut high_segno: XLogSegNo = 0; let mut high_tli: TimeLineID = 0; let mut high_ispartial = false; @@ -354,19 +356,19 @@ pub fn main() { } impl XLogRecord { - pub fn from_slice(buf: &[u8]) -> XLogRecord { + pub fn from_slice(buf: &[u8]) -> Result { use utils::bin_ser::LeSer; - XLogRecord::des(buf).unwrap() + XLogRecord::des(buf) } - pub fn from_bytes(buf: &mut B) -> XLogRecord { + pub fn from_bytes(buf: &mut B) -> Result { use utils::bin_ser::LeSer; - XLogRecord::des_from(&mut buf.reader()).unwrap() + XLogRecord::des_from(&mut buf.reader()) } - pub fn encode(&self) -> Bytes { + pub fn encode(&self) -> Result { use utils::bin_ser::LeSer; - self.ser().unwrap().into() + Ok(self.ser()?.into()) } // Is this record an XLOG_SWITCH record? They need some special processing, @@ -376,35 +378,35 @@ impl XLogRecord { } impl XLogPageHeaderData { - pub fn from_bytes(buf: &mut B) -> XLogPageHeaderData { + pub fn from_bytes(buf: &mut B) -> Result { use utils::bin_ser::LeSer; - XLogPageHeaderData::des_from(&mut buf.reader()).unwrap() + XLogPageHeaderData::des_from(&mut buf.reader()) } } impl XLogLongPageHeaderData { - pub fn from_bytes(buf: &mut B) -> XLogLongPageHeaderData { + pub fn from_bytes(buf: &mut B) -> Result { use utils::bin_ser::LeSer; - XLogLongPageHeaderData::des_from(&mut buf.reader()).unwrap() + XLogLongPageHeaderData::des_from(&mut buf.reader()) } - pub fn encode(&self) -> Bytes { + pub fn encode(&self) -> Result { use utils::bin_ser::LeSer; - self.ser().unwrap().into() + self.ser().map(|b| b.into()) } } pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::(); impl CheckPoint { - pub fn encode(&self) -> Bytes { + pub fn encode(&self) -> Result { use utils::bin_ser::LeSer; - self.ser().unwrap().into() + Ok(self.ser()?.into()) } - pub fn decode(buf: &[u8]) -> Result { + pub fn decode(buf: &[u8]) -> Result { use utils::bin_ser::LeSer; - Ok(CheckPoint::des(buf)?) 
+ CheckPoint::des(buf) } /// Update next XID based on provided new_xid and stored epoch. @@ -442,7 +444,7 @@ impl CheckPoint { // Generate new, empty WAL segment. // We need this segment to start compute node. // -pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes { +pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result { let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize); let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE); @@ -462,12 +464,12 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes { xlp_xlog_blcksz: XLOG_BLCKSZ as u32, }; - let hdr_bytes = hdr.encode(); + let hdr_bytes = hdr.encode()?; seg_buf.extend_from_slice(&hdr_bytes); //zero out the rest of the file seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0); - seg_buf.freeze() + Ok(seg_buf.freeze()) } #[cfg(test)] diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 14e6d40759..92d35130d8 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,7 +10,7 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{ensure, Context, Result}; +use anyhow::{anyhow, ensure, Context, Result}; use bytes::{BufMut, BytesMut}; use std::fmt::Write as FmtWrite; use std::io; @@ -323,7 +323,8 @@ impl<'a> Basebackup<'a> { let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE); let wal_file_path = format!("pg_wal/{}", wal_file_name); let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?; - let wal_seg = generate_wal_segment(segno, pg_control.system_identifier); + let wal_seg = generate_wal_segment(segno, pg_control.system_identifier) + .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; ensure!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE); self.ar.append(&header, &wal_seg[..])?; Ok(()) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 8f49903e6c..703ee8f1b1 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -274,7 +274,7 @@ fn import_control_file( // Extract the checkpoint record and import it separately. 
let pg_control = ControlFileData::decode(&buffer)?; - let checkpoint_bytes = pg_control.checkPointCopy.encode(); + let checkpoint_bytes = pg_control.checkPointCopy.encode()?; modification.put_checkpoint(checkpoint_bytes)?; Ok(pg_control) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 4952f64ccd..1e1ec716a6 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -375,7 +375,7 @@ impl Layer for DeltaLayer { write!(&mut desc, " img {} bytes", img.len()).unwrap(); } Ok(Value::WalRecord(rec)) => { - let wal_desc = walrecord::describe_wal_record(&rec); + let wal_desc = walrecord::describe_wal_record(&rec).unwrap(); write!( &mut desc, " rec {} bytes will_init: {} {}", diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 714a0bc579..856baa2e8a 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -207,7 +207,7 @@ impl Layer for InMemoryLayer { write!(&mut desc, " img {} bytes", img.len())?; } Ok(Value::WalRecord(rec)) => { - let wal_desc = walrecord::describe_wal_record(&rec); + let wal_desc = walrecord::describe_wal_record(&rec).unwrap(); write!( &mut desc, " rec {} bytes will_init: {} {}", diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index a929e290ad..fbdb328d2c 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,6 +21,7 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. +use anyhow::Context; use postgres_ffi::nonrelfile_utils::clogpage_precedes; use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment; @@ -82,7 +83,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { ) -> Result<()> { let mut modification = timeline.begin_modification(lsn); - let mut decoded = decode_wal_record(recdata); + let mut decoded = decode_wal_record(recdata).context("failed decoding wal record")?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -251,7 +252,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { // If checkpoint data was updated, store the new version in the repository if self.checkpoint_modified { - let new_checkpoint_bytes = self.checkpoint.encode(); + let new_checkpoint_bytes = self.checkpoint.encode()?; modification.put_checkpoint(new_checkpoint_bytes)?; self.checkpoint_modified = false; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index e8699cfa22..5a384360e2 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -1,6 +1,7 @@ //! //! Functions for parsing WAL records. //! +use anyhow::Result; use bytes::{Buf, Bytes}; use postgres_ffi::pg_constants; use postgres_ffi::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD}; @@ -9,6 +10,7 @@ use postgres_ffi::{BlockNumber, OffsetNumber}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; use serde::{Deserialize, Serialize}; use tracing::*; +use utils::bin_ser::DeserializeError; /// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper /// around a PostgreSQL WAL record, or a custom zenith-specific "record". @@ -503,7 +505,7 @@ impl XlMultiXactTruncate { // block data // ... 
// main data -pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { +pub fn decode_wal_record(record: Bytes) -> Result { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; @@ -514,7 +516,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { // 1. Parse XLogRecord struct // FIXME: assume little-endian here - let xlogrec = XLogRecord::from_bytes(&mut buf); + let xlogrec = XLogRecord::from_bytes(&mut buf)?; trace!( "decode_wal_record xl_rmid = {} xl_info = {}", @@ -742,34 +744,32 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { assert_eq!(buf.remaining(), main_data_len as usize); } - DecodedWALRecord { + Ok(DecodedWALRecord { xl_xid: xlogrec.xl_xid, xl_info: xlogrec.xl_info, xl_rmid: xlogrec.xl_rmid, record, blocks, main_data_offset, - } + }) } /// /// Build a human-readable string to describe a WAL record /// /// For debugging purposes -pub fn describe_wal_record(rec: &ZenithWalRecord) -> String { +pub fn describe_wal_record(rec: &ZenithWalRecord) -> Result { match rec { - ZenithWalRecord::Postgres { will_init, rec } => { - format!( - "will_init: {}, {}", - will_init, - describe_postgres_wal_record(rec) - ) - } - _ => format!("{:?}", rec), + ZenithWalRecord::Postgres { will_init, rec } => Ok(format!( + "will_init: {}, {}", + will_init, + describe_postgres_wal_record(rec)? + )), + _ => Ok(format!("{:?}", rec)), } } -fn describe_postgres_wal_record(record: &Bytes) -> String { +fn describe_postgres_wal_record(record: &Bytes) -> Result { // TODO: It would be nice to use the PostgreSQL rmgrdesc infrastructure for this. // Maybe use the postgres wal redo process, the same used for replaying WAL records? // Or could we compile the rmgrdesc routines into the dump_layer_file() binary directly, @@ -782,7 +782,7 @@ fn describe_postgres_wal_record(record: &Bytes) -> String { // 1. Parse XLogRecord struct // FIXME: assume little-endian here - let xlogrec = XLogRecord::from_bytes(&mut buf); + let xlogrec = XLogRecord::from_bytes(&mut buf)?; let unknown_str: String; @@ -830,5 +830,5 @@ fn describe_postgres_wal_record(record: &Bytes) -> String { } }; - String::from(result) + Ok(String::from(result)) } diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index d21d5ad73b..43514997d4 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -239,13 +239,13 @@ fn encode_logical_message(prefix: &str, message: &str) -> Vec { xl_crc: 0, // crc will be calculated later }; - let header_bytes = header.encode(); + let header_bytes = header.encode().expect("failed to encode header"); let crc = crc32c_append(0, &data); let crc = crc32c_append(crc, &header_bytes[0..xlog_utils::XLOG_RECORD_CRC_OFFS]); header.xl_crc = crc; let mut wal: Vec = Vec::new(); - wal.extend_from_slice(&header.encode()); + wal.extend_from_slice(&header.encode().expect("failed to encode header")); wal.extend_from_slice(&data); // WAL start position must be aligned at 8 bytes, From c46fe90010adaee8a241a2241b417ecff2f037d9 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 5 May 2022 07:43:55 +0400 Subject: [PATCH 0246/1022] Fix division by zero in WAL removal. 
--- safekeeper/src/safekeeper.rs | 4 +--- safekeeper/src/timeline.rs | 4 ++++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 67d41d0b58..68361fd672 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -938,9 +938,7 @@ where ), self.state.s3_wal_lsn, ); - let res = horizon_lsn.segment_number(self.state.server.wal_seg_size as usize); - info!("horizon is {}, res {}", horizon_lsn, res); - res + horizon_lsn.segment_number(self.state.server.wal_seg_size as usize) } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 745d8e0893..47137091da 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -469,6 +469,10 @@ impl Timeline { let remover: Box Result<(), anyhow::Error>>; { let shared_state = self.mutex.lock().unwrap(); + // WAL seg size not initialized yet, no WAL exists. + if shared_state.sk.state.server.wal_seg_size == 0 { + return Ok(()); + } horizon_segno = shared_state.sk.get_horizon_segno(); remover = shared_state.sk.wal_store.remove_up_to(); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { From 0f3ec83172b38684c8ec74a33f8db4fb9a79df2f Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 3 May 2022 17:16:46 +0300 Subject: [PATCH 0247/1022] avoid detach with alive branches --- pageserver/src/layered_repository.rs | 15 ++++++++++++++- .../batch_others/test_ancestor_branch.py | 18 +++++++++++++++--- .../batch_others/test_tenant_relocation.py | 7 ++++--- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index e678c8f4cb..69271467a6 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -393,9 +393,22 @@ impl Repository for LayeredRepository { fn detach_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> { let mut timelines = self.timelines.lock().unwrap(); + // check no child timelines, because detach will remove files, which will brake child branches + // FIXME this can still be violated because we do not guarantee + // that all ancestors are downloaded/attached to the same pageserver + let num_children = timelines + .iter() + .filter(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id)) + .count(); + + ensure!( + num_children == 0, + "Cannot detach timeline which has child timelines" + ); + ensure!( timelines.remove(&timeline_id).is_some(), - "cannot detach timeline {timeline_id} that is not available locally" + "Cannot detach timeline {timeline_id} that is not available locally" ); Ok(()) } diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index 75fe3cde0f..d6b073492d 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -1,11 +1,9 @@ -import subprocess -import asyncio from contextlib import closing import psycopg2.extras import pytest from fixtures.log_helper import log -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverApiException # @@ -120,3 +118,17 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): branch2_cur.execute('SELECT count(*) FROM foo') assert branch2_cur.fetchone() == (300000, ) + + +def test_ancestor_branch_detach(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + + parent_timeline_id = 
env.zenith_cli.create_branch("test_ancestor_branch_detach_parent", "empty") + + env.zenith_cli.create_branch("test_ancestor_branch_detach_branch1", + "test_ancestor_branch_detach_parent") + + ps_http = env.pageserver.http_client() + with pytest.raises(ZenithPageserverApiException, + match="Failed to detach inmem tenant timeline"): + ps_http.timeline_detach(env.initial_tenant, parent_timeline_id) diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 41907adf1a..7e71c0a157 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -109,10 +109,11 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, tenant = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) log.info("tenant to relocate %s", tenant) - env.zenith_cli.create_root_branch('main', tenant_id=tenant) - env.zenith_cli.create_branch('test_tenant_relocation', tenant_id=tenant) - tenant_pg = env.postgres.create_start(branch_name='main', + # attach does not download ancestor branches (should it?), just use root branch for now + env.zenith_cli.create_root_branch('test_tenant_relocation', tenant_id=tenant) + + tenant_pg = env.postgres.create_start(branch_name='test_tenant_relocation', node_name='test_tenant_relocation', tenant_id=tenant) From ad5eaa6027166b41e6485c49c7ea496e7c6515f0 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Thu, 5 May 2022 10:53:10 -0400 Subject: [PATCH 0248/1022] Use node's LSN for read-only nodes (#1642) Fixes #1410. --- zenith/src/main.rs | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/zenith/src/main.rs b/zenith/src/main.rs index ff2beec463..87bb5f3f60 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -683,13 +683,21 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { .iter() .filter(|((node_tenant_id, _), _)| node_tenant_id == &tenant_id) { - // FIXME: This shows the LSN at the end of the timeline. It's not the - // right thing to do for read-only nodes that might be anchored at an - // older point in time, or following but lagging behind the primary. - let lsn_str = timeline_infos - .get(&node.timeline_id) - .and_then(|bi| bi.local.as_ref().map(|l| l.last_record_lsn.to_string())) - .unwrap_or_else(|| "?".to_string()); + let lsn_str = match node.lsn { + None => { + // -> primary node + // Use the LSN at the end of the timeline. + timeline_infos + .get(&node.timeline_id) + .and_then(|bi| bi.local.as_ref().map(|l| l.last_record_lsn.to_string())) + .unwrap_or_else(|| "?".to_string()) + } + Some(lsn) => { + // -> read-only node + // Use the node's LSN. 
+ lsn.to_string() + } + }; let branch_name = timeline_name_mappings .get(&ZTenantTimelineId::new(tenant_id, node.timeline_id)) From 52a7e3155e3fd6132794f53623698e12e403f711 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 4 May 2022 14:53:18 +0300 Subject: [PATCH 0249/1022] Add local path to the Layer trait and historic layers --- pageserver/src/http/routes.rs | 6 +- pageserver/src/layered_repository.rs | 122 ++++++++++++++---- .../src/layered_repository/delta_layer.rs | 4 + .../src/layered_repository/image_layer.rs | 4 + .../src/layered_repository/inmemory_layer.rs | 4 + .../src/layered_repository/layer_map.rs | 2 +- .../src/layered_repository/storage_layer.rs | 3 + pageserver/src/remote_storage.rs | 8 +- pageserver/src/remote_storage/storage_sync.rs | 18 ++- 9 files changed, 131 insertions(+), 40 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5903dea372..f12e4c4051 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -11,7 +11,7 @@ use super::models::{ }; use crate::config::RemoteStorageKind; use crate::remote_storage::{ - download_index_part, schedule_timeline_download, LocalFs, RemoteIndex, RemoteTimeline, S3Bucket, + download_index_part, schedule_layer_download, LocalFs, RemoteIndex, RemoteTimeline, S3Bucket, }; use crate::repository::Repository; use crate::tenant_config::TenantConfOpt; @@ -273,7 +273,7 @@ async fn timeline_attach_handler(request: Request) -> Result) -> Result index_accessor.add_timeline_entry(sync_id, new_timeline), } - schedule_timeline_download(tenant_id, timeline_id); + schedule_layer_download(tenant_id, timeline_id); json_response(StatusCode::ACCEPTED, ()) } diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 69271467a6..6719c22738 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -20,8 +20,8 @@ use tracing::*; use std::cmp::{max, min, Ordering}; use std::collections::hash_map::Entry; -use std::collections::BTreeSet; use std::collections::HashMap; +use std::collections::{BTreeSet, HashSet}; use std::fs; use std::fs::{File, OpenOptions}; use std::io::Write; @@ -37,7 +37,7 @@ use crate::keyspace::KeySpace; use crate::tenant_config::{TenantConf, TenantConfOpt}; use crate::page_cache; -use crate::remote_storage::{schedule_timeline_checkpoint_upload, RemoteIndex}; +use crate::remote_storage::{self, RemoteIndex}; use crate::repository::{ GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, TimelineWriter, }; @@ -428,7 +428,7 @@ impl Repository for LayeredRepository { Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. 
This is a bug."), Entry::Vacant(entry) => { // we need to get metadata of a timeline, another option is to pass it along with Downloaded status - let metadata = Self::load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; + let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; // finally we make newly downloaded timeline visible to repository entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, }) }, @@ -618,7 +618,7 @@ impl LayeredRepository { timelineid: ZTimelineId, timelines: &mut HashMap, ) -> anyhow::Result> { - let metadata = Self::load_metadata(self.conf, timelineid, self.tenant_id) + let metadata = load_metadata(self.conf, timelineid, self.tenant_id) .context("failed to load metadata")?; let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -776,17 +776,6 @@ impl LayeredRepository { Ok(()) } - fn load_metadata( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - ) -> Result { - let path = metadata_path(conf, timelineid, tenantid); - info!("loading metadata from {}", path.display()); - let metadata_bytes = std::fs::read(&path)?; - TimelineMetadata::from_bytes(&metadata_bytes) - } - // // How garbage collection works: // @@ -1796,10 +1785,10 @@ impl LayeredTimeline { PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); if self.upload_layers.load(atomic::Ordering::Relaxed) { - schedule_timeline_checkpoint_upload( + remote_storage::schedule_layer_upload( self.tenantid, self.timelineid, - new_delta_path, + HashSet::from([new_delta_path]), metadata, ); } @@ -1860,11 +1849,23 @@ impl LayeredTimeline { let timer = self.create_images_time_histo.start_timer(); // 2. Create new image layers for partitions that have been modified // "enough". + let mut layer_paths_to_upload = HashSet::with_capacity(partitioning.parts.len()); for part in partitioning.parts.iter() { if self.time_for_new_image_layer(part, lsn)? { - self.create_image_layer(part, lsn)?; + let new_path = self.create_image_layer(part, lsn)?; + layer_paths_to_upload.insert(new_path); } } + if self.upload_layers.load(atomic::Ordering::Relaxed) { + let metadata = load_metadata(self.conf, self.timelineid, self.tenantid) + .context("failed to load local metadata")?; + remote_storage::schedule_layer_upload( + self.tenantid, + self.timelineid, + layer_paths_to_upload, + metadata, + ); + } timer.stop_and_record(); // 3. Compact @@ -1906,7 +1907,7 @@ impl LayeredTimeline { Ok(false) } - fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result<()> { + fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result { let img_range = partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; let mut image_layer_writer = @@ -1939,10 +1940,11 @@ impl LayeredTimeline { // FIXME: Do we need to do something to upload it to remote storage here? 
let mut layers = self.layers.write().unwrap(); + let new_path = image_layer.path(); layers.insert_historic(Arc::new(image_layer)); drop(layers); - Ok(()) + Ok(new_path) } fn compact_level0(&self, target_file_size: u64) -> Result<()> { @@ -2037,18 +2039,43 @@ impl LayeredTimeline { } let mut layers = self.layers.write().unwrap(); + let mut new_layer_paths = HashSet::with_capacity(new_layers.len()); for l in new_layers { + new_layer_paths.insert(l.path()); layers.insert_historic(Arc::new(l)); } + if self.upload_layers.load(atomic::Ordering::Relaxed) { + let metadata = load_metadata(self.conf, self.timelineid, self.tenantid) + .context("failed to load local metadata")?; + remote_storage::schedule_layer_upload( + self.tenantid, + self.timelineid, + new_layer_paths, + metadata, + ); + } + // Now that we have reshuffled the data to set of new delta layers, we can // delete the old ones + let mut layer_paths_do_delete = HashSet::with_capacity(level0_deltas.len()); for l in level0_deltas { l.delete()?; - layers.remove_historic(l.clone()); + if let Some(path) = l.local_path() { + layer_paths_do_delete.insert(path); + } + layers.remove_historic(l); } drop(layers); + if self.upload_layers.load(atomic::Ordering::Relaxed) { + remote_storage::schedule_layer_delete( + self.tenantid, + self.timelineid, + layer_paths_do_delete, + ); + } + Ok(()) } @@ -2111,7 +2138,7 @@ impl LayeredTimeline { debug!("retain_lsns: {:?}", retain_lsns); - let mut layers_to_remove: Vec> = Vec::new(); + let mut layers_to_remove = Vec::new(); // Scan all on-disk layers in the timeline. // @@ -2222,13 +2249,24 @@ impl LayeredTimeline { // Actually delete the layers from disk and remove them from the map. // (couldn't do this in the loop above, because you cannot modify a collection // while iterating it. BTreeMap::retain() would be another option) + let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len()); for doomed_layer in layers_to_remove { doomed_layer.delete()?; - layers.remove_historic(doomed_layer.clone()); - + if let Some(path) = doomed_layer.local_path() { + layer_paths_to_delete.insert(path); + } + layers.remove_historic(doomed_layer); result.layers_removed += 1; } + if self.upload_layers.load(atomic::Ordering::Relaxed) { + remote_storage::schedule_layer_delete( + self.tenantid, + self.timelineid, + layer_paths_to_delete, + ); + } + result.elapsed = now.elapsed()?; Ok(result) } @@ -2375,6 +2413,26 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { bail!("couldn't find an unused backup number for {:?}", path) } +fn load_metadata( + conf: &'static PageServerConf, + timeline_id: ZTimelineId, + tenant_id: ZTenantId, +) -> anyhow::Result { + let metadata_path = metadata_path(conf, timeline_id, tenant_id); + let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { + format!( + "Failed to read metadata bytes from path {}", + metadata_path.display() + ) + })?; + TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| { + format!( + "Failed to parse metadata bytes from path {}", + metadata_path.display() + ) + }) +} + /// /// Tests that are specific to the layered storage format. 
/// @@ -2409,9 +2467,19 @@ pub mod tests { let err = harness.try_load().err().expect("should fail"); assert_eq!(err.to_string(), "failed to load local metadata"); - assert_eq!( - err.source().unwrap().to_string(), - "metadata checksum mismatch" + + let mut found_error_message = false; + let mut err_source = err.source(); + while let Some(source) = err_source { + if source.to_string() == "metadata checksum mismatch" { + found_error_message = true; + break; + } + err_source = source.source(); + } + assert!( + found_error_message, + "didn't find the corrupted metadata error" ); Ok(()) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 1e1ec716a6..e78b05695c 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -218,6 +218,10 @@ impl Layer for DeltaLayer { PathBuf::from(self.layer_name().to_string()) } + fn local_path(&self) -> Option { + Some(self.path()) + } + fn get_value_reconstruct_data( &self, key: Key, diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index d7657ecac6..c0c8e7789a 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -125,6 +125,10 @@ impl Layer for ImageLayer { PathBuf::from(self.layer_name().to_string()) } + fn local_path(&self) -> Option { + Some(self.path()) + } + fn get_tenant_id(&self) -> ZTenantId { self.tenantid } diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 856baa2e8a..bffb946f7e 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -85,6 +85,10 @@ impl Layer for InMemoryLayer { )) } + fn local_path(&self) -> Option { + None + } + fn get_tenant_id(&self) -> ZTenantId { self.tenantid } diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 91a900dde0..7a2d0d5bcd 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -253,7 +253,7 @@ impl LayerMap { } } - pub fn iter_historic_layers(&self) -> std::slice::Iter> { + pub fn iter_historic_layers(&self) -> impl Iterator> { self.historic_layers.iter() } diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index aad631c5c4..9fcc8907d3 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -105,6 +105,9 @@ pub trait Layer: Send + Sync { /// log messages, even though they're never not on disk.) fn filename(&self) -> PathBuf; + /// If a layer has a corresponding file on a local filesystem, return its path. + fn local_path(&self) -> Option; + /// /// Return data needed to reconstruct given page at LSN. /// diff --git a/pageserver/src/remote_storage.rs b/pageserver/src/remote_storage.rs index cfa09dce14..4db0f6667d 100644 --- a/pageserver/src/remote_storage.rs +++ b/pageserver/src/remote_storage.rs @@ -14,7 +14,7 @@ //! //! * public API via to interact with the external world: //! * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization -//! * [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] to enqueue a new upload and download tasks, +//! 
* [`schedule_layer_upload`], [`schedule_layer_download`] and [`schedule_layer_delete`] to enqueue a new upload and download tasks, //! to be processed by the async loop //! //! Here's a schematic overview of all interactions backup and the rest of the pageserver perform: @@ -71,10 +71,10 @@ //! when the newer image is downloaded //! //! Pageserver maintains similar to the local file structure remotely: all layer files are uploaded with the same names under the same directory structure. -//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexShard`], containing the list of remote files. +//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexPart`], containing the list of remote files. //! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download. //! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`], -//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrive its shard contents, if needed, same as any layer files. +//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrive its part contents, if needed, same as any layer files. //! //! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed. //! Bulk index data download happens only initially, on pageserer startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only, @@ -108,7 +108,7 @@ pub use self::{ storage_sync::{ download_index_part, index::{IndexPart, RemoteIndex, RemoteTimeline}, - schedule_timeline_checkpoint_upload, schedule_timeline_download, + schedule_layer_delete, schedule_layer_download, schedule_layer_upload, }, }; use crate::{ diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index 2d3416cd32..127655ce87 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -427,10 +427,10 @@ pub struct TimelineDownload { /// On task failure, it gets retried again from the start a number of times. /// /// Ensure that the loop is started otherwise the task is never processed. -pub fn schedule_timeline_checkpoint_upload( +pub fn schedule_layer_upload( tenant_id: ZTenantId, timeline_id: ZTimelineId, - new_layer: PathBuf, + layers_to_upload: HashSet, metadata: TimelineMetadata, ) { if !sync_queue::push( @@ -439,7 +439,7 @@ pub fn schedule_timeline_checkpoint_upload( timeline_id, }, SyncTask::upload(TimelineUpload { - layers_to_upload: HashSet::from([new_layer]), + layers_to_upload, uploaded_layers: HashSet::new(), metadata, }), @@ -450,6 +450,14 @@ pub fn schedule_timeline_checkpoint_upload( } } +pub fn schedule_layer_delete( + _tenant_id: ZTenantId, + _timeline_id: ZTimelineId, + _layers_to_delete: HashSet, +) { + // TODO kb implement later +} + /// Requests the download of the entire timeline for a given tenant. /// No existing local files are currently overwritten, except the metadata file (if its disk_consistent_lsn is less than the downloaded one). /// The metadata file is always updated last, to avoid inconsistencies. 
@@ -457,8 +465,8 @@ pub fn schedule_timeline_checkpoint_upload( /// On any failure, the task gets retried, omitting already downloaded layers. /// /// Ensure that the loop is started otherwise the task is never processed. -pub fn schedule_timeline_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { - debug!("Scheduling timeline download for tenant {tenant_id}, timeline {timeline_id}"); +pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { + debug!("Scheduling layer download for tenant {tenant_id}, timeline {timeline_id}"); sync_queue::push( ZTenantTimelineId { tenant_id, From 2ef0e5c6edbec21b52cf27e5ebf3fb6241918319 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 4 May 2022 22:33:53 +0300 Subject: [PATCH 0250/1022] Do not require metadata in every upload sync task --- pageserver/src/layered_repository.rs | 23 ++-- .../src/layered_repository/storage_layer.rs | 2 +- pageserver/src/remote_storage/storage_sync.rs | 113 +++++++++++------- .../src/remote_storage/storage_sync/upload.rs | 18 ++- 4 files changed, 91 insertions(+), 65 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 6719c22738..77c01a7c66 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1789,7 +1789,7 @@ impl LayeredTimeline { self.tenantid, self.timelineid, HashSet::from([new_delta_path]), - metadata, + Some(metadata), ); } @@ -1857,13 +1857,11 @@ impl LayeredTimeline { } } if self.upload_layers.load(atomic::Ordering::Relaxed) { - let metadata = load_metadata(self.conf, self.timelineid, self.tenantid) - .context("failed to load local metadata")?; remote_storage::schedule_layer_upload( self.tenantid, self.timelineid, layer_paths_to_upload, - metadata, + None, ); } timer.stop_and_record(); @@ -2045,17 +2043,6 @@ impl LayeredTimeline { layers.insert_historic(Arc::new(l)); } - if self.upload_layers.load(atomic::Ordering::Relaxed) { - let metadata = load_metadata(self.conf, self.timelineid, self.tenantid) - .context("failed to load local metadata")?; - remote_storage::schedule_layer_upload( - self.tenantid, - self.timelineid, - new_layer_paths, - metadata, - ); - } - // Now that we have reshuffled the data to set of new delta layers, we can // delete the old ones let mut layer_paths_do_delete = HashSet::with_capacity(level0_deltas.len()); @@ -2069,6 +2056,12 @@ impl LayeredTimeline { drop(layers); if self.upload_layers.load(atomic::Ordering::Relaxed) { + remote_storage::schedule_layer_upload( + self.tenantid, + self.timelineid, + new_layer_paths, + None, + ); remote_storage::schedule_layer_delete( self.tenantid, self.timelineid, diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index 9fcc8907d3..aaf765b83d 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -105,7 +105,7 @@ pub trait Layer: Send + Sync { /// log messages, even though they're never not on disk.) fn filename(&self) -> PathBuf; - /// If a layer has a corresponding file on a local filesystem, return its path. + /// If a layer has a corresponding file on a local filesystem, return its absolute path. 
fn local_path(&self) -> Option; /// diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index 127655ce87..8a26685a7d 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -72,7 +72,7 @@ use std::{ sync::Arc, }; -use anyhow::Context; +use anyhow::{bail, Context}; use futures::stream::{FuturesUnordered, StreamExt}; use lazy_static::lazy_static; use tokio::{ @@ -341,8 +341,16 @@ impl SyncTask { .extend(new_upload_data.data.uploaded_layers.into_iter()); upload_data.retries = 0; - if new_upload_data.data.metadata.disk_consistent_lsn() - > upload_data.data.metadata.disk_consistent_lsn() + if new_upload_data + .data + .metadata + .as_ref() + .map(|meta| meta.disk_consistent_lsn()) + > upload_data + .data + .metadata + .as_ref() + .map(|meta| meta.disk_consistent_lsn()) { upload_data.data.metadata = new_upload_data.data.metadata; } @@ -371,8 +379,16 @@ impl SyncTask { .extend(new_upload_data.data.uploaded_layers.into_iter()); upload_data.retries = 0; - if new_upload_data.data.metadata.disk_consistent_lsn() - > upload_data.data.metadata.disk_consistent_lsn() + if new_upload_data + .data + .metadata + .as_ref() + .map(|meta| meta.disk_consistent_lsn()) + > upload_data + .data + .metadata + .as_ref() + .map(|meta| meta.disk_consistent_lsn()) { upload_data.data.metadata = new_upload_data.data.metadata; } @@ -410,7 +426,7 @@ pub struct TimelineUpload { /// Already uploaded layers. Used to store the data about the uploads between task retries /// and to record the data into the remote index after the task got completed or evicted. uploaded_layers: HashSet, - metadata: TimelineMetadata, + metadata: Option, } /// A timeline download task. @@ -431,7 +447,7 @@ pub fn schedule_layer_upload( tenant_id: ZTenantId, timeline_id: ZTimelineId, layers_to_upload: HashSet, - metadata: TimelineMetadata, + metadata: Option, ) { if !sync_queue::push( ZTenantTimelineId { @@ -932,23 +948,24 @@ async fn upload_timeline( } UploadedTimeline::Successful(upload_data) => upload_data, UploadedTimeline::SuccessfulAfterLocalFsUpdate(mut outdated_upload_data) => { - let local_metadata_path = - metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id); - let local_metadata = match read_metadata_file(&local_metadata_path).await { - Ok(metadata) => metadata, - Err(e) => { - error!( - "Failed to load local metadata from path '{}': {e:?}", - local_metadata_path.display() - ); - outdated_upload_data.retries += 1; - sync_queue::push(sync_id, SyncTask::Upload(outdated_upload_data)); - register_sync_status(sync_start, task_name, Some(false)); - return; - } - }; - - outdated_upload_data.data.metadata = local_metadata; + if outdated_upload_data.data.metadata.is_some() { + let local_metadata_path = + metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id); + let local_metadata = match read_metadata_file(&local_metadata_path).await { + Ok(metadata) => metadata, + Err(e) => { + error!( + "Failed to load local metadata from path '{}': {e:?}", + local_metadata_path.display() + ); + outdated_upload_data.retries += 1; + sync_queue::push(sync_id, SyncTask::Upload(outdated_upload_data)); + register_sync_status(sync_start, task_name, Some(false)); + return; + } + }; + outdated_upload_data.data.metadata = Some(local_metadata); + } outdated_upload_data } }; @@ -982,11 +999,14 @@ where match index_accessor.timeline_entry_mut(&sync_id) { Some(existing_entry) => { - if existing_entry.metadata.disk_consistent_lsn() - < 
uploaded_data.metadata.disk_consistent_lsn() - { - existing_entry.metadata = uploaded_data.metadata.clone(); + if let Some(new_metadata) = uploaded_data.metadata.as_ref() { + if existing_entry.metadata.disk_consistent_lsn() + < new_metadata.disk_consistent_lsn() + { + existing_entry.metadata = new_metadata.clone(); + } } + if upload_failed { existing_entry .add_upload_failures(uploaded_data.layers_to_upload.iter().cloned()); @@ -997,7 +1017,11 @@ where existing_entry.clone() } None => { - let mut new_remote_timeline = RemoteTimeline::new(uploaded_data.metadata.clone()); + let new_metadata = match uploaded_data.metadata.as_ref() { + Some(new_metadata) => new_metadata, + None => bail!("For timeline {sync_id} upload, there's no upload metadata and no remote index entry, cannot create a new one"), + }; + let mut new_remote_timeline = RemoteTimeline::new(new_metadata.clone()); if upload_failed { new_remote_timeline .add_upload_failures(uploaded_data.layers_to_upload.iter().cloned()); @@ -1140,7 +1164,7 @@ fn schedule_first_sync_tasks( SyncTask::upload(TimelineUpload { layers_to_upload: local_files, uploaded_layers: HashSet::new(), - metadata: local_metadata, + metadata: Some(local_metadata), }), )); local_timeline_init_statuses @@ -1202,7 +1226,7 @@ fn compare_local_and_remote_timeline( SyncTask::upload(TimelineUpload { layers_to_upload, uploaded_layers: HashSet::new(), - metadata: local_metadata, + metadata: Some(local_metadata), }), )); // Note that status here doesn't change. @@ -1269,7 +1293,7 @@ mod test_utils { Ok(TimelineUpload { layers_to_upload, uploaded_layers: HashSet::new(), - metadata, + metadata: Some(metadata), }) } @@ -1340,7 +1364,7 @@ mod tests { TimelineUpload { layers_to_upload: HashSet::from([PathBuf::from("one")]), uploaded_layers: HashSet::from([PathBuf::from("u_one")]), - metadata: metadata_1, + metadata: Some(metadata_1), }, )); let upload_2 = SyncTask::Upload(SyncData::new( @@ -1348,7 +1372,7 @@ mod tests { TimelineUpload { layers_to_upload: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), uploaded_layers: HashSet::from([PathBuf::from("u_two")]), - metadata: metadata_2.clone(), + metadata: Some(metadata_2.clone()), }, )); @@ -1380,7 +1404,8 @@ mod tests { ); assert_eq!( - upload.metadata, metadata_2, + upload.metadata, + Some(metadata_2), "Merged upload tasks should have a metadata with biggest disk_consistent_lsn" ); } @@ -1399,7 +1424,7 @@ mod tests { TimelineUpload { layers_to_upload: HashSet::from([PathBuf::from("u_one")]), uploaded_layers: HashSet::from([PathBuf::from("u_one_2")]), - metadata: dummy_metadata(Lsn(1)), + metadata: Some(dummy_metadata(Lsn(1))), }, ); @@ -1442,7 +1467,7 @@ mod tests { TimelineUpload { layers_to_upload: HashSet::from([PathBuf::from("one")]), uploaded_layers: HashSet::from([PathBuf::from("u_one")]), - metadata: metadata_1.clone(), + metadata: Some(metadata_1.clone()), }, ), ); @@ -1452,7 +1477,7 @@ mod tests { TimelineUpload { layers_to_upload: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), uploaded_layers: HashSet::from([PathBuf::from("u_two")]), - metadata: metadata_2, + metadata: Some(metadata_2), }, )); @@ -1490,7 +1515,8 @@ mod tests { ); assert_eq!( - upload.metadata, metadata_1, + upload.metadata, + Some(metadata_1), "Merged upload tasks should have a metadata with biggest disk_consistent_lsn" ); } @@ -1502,7 +1528,7 @@ mod tests { TimelineUpload { layers_to_upload: HashSet::from([PathBuf::from("one")]), uploaded_layers: HashSet::from([PathBuf::from("u_one")]), - metadata: dummy_metadata(Lsn(22)), + 
metadata: Some(dummy_metadata(Lsn(22))), }, ); @@ -1572,7 +1598,7 @@ mod tests { TimelineUpload { layers_to_upload: HashSet::from([PathBuf::from("one")]), uploaded_layers: HashSet::from([PathBuf::from("u_one")]), - metadata: metadata_1, + metadata: Some(metadata_1), }, ), ); @@ -1588,7 +1614,7 @@ mod tests { TimelineUpload { layers_to_upload: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), uploaded_layers: HashSet::from([PathBuf::from("u_two")]), - metadata: metadata_2.clone(), + metadata: Some(metadata_2.clone()), }, ), ); @@ -1640,7 +1666,8 @@ mod tests { ); assert_eq!( - upload.metadata, metadata_2, + upload.metadata, + Some(metadata_2), "Merged upload tasks should have a metadata with biggest disk_consistent_lsn" ); } diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/remote_storage/storage_sync/upload.rs index d2ff77e92e..91a0a0d6ce 100644 --- a/pageserver/src/remote_storage/storage_sync/upload.rs +++ b/pageserver/src/remote_storage/storage_sync/upload.rs @@ -86,7 +86,10 @@ where S: RemoteStorage + Send + Sync + 'static, { let upload = &mut upload_data.data; - let new_upload_lsn = upload.metadata.disk_consistent_lsn(); + let new_upload_lsn = upload + .metadata + .as_ref() + .map(|meta| meta.disk_consistent_lsn()); let already_uploaded_layers = remote_timeline .map(|timeline| timeline.stored_files()) @@ -101,7 +104,7 @@ where debug!("Layers to upload: {layers_to_upload:?}"); info!( - "Uploading {} timeline layers, new lsn: {new_upload_lsn}", + "Uploading {} timeline layers, new lsn: {new_upload_lsn:?}", layers_to_upload.len(), ); @@ -234,8 +237,10 @@ mod tests { let current_retries = 3; let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); - let timeline_upload = + let mut timeline_upload = create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; + timeline_upload.metadata = None; + assert!( storage.list().await?.is_empty(), "Storage should be empty before any uploads are made" @@ -278,8 +283,8 @@ mod tests { "Successful upload should have all layers uploaded" ); assert_eq!( - upload.metadata, metadata, - "Successful upload should not chage its metadata" + upload.metadata, None, + "Successful upload without metadata should not have it returned either" ); let storage_files = storage.list().await?; @@ -367,7 +372,8 @@ mod tests { "Successful upload should have all layers uploaded" ); assert_eq!( - upload.metadata, metadata, + upload.metadata, + Some(metadata), "Successful upload should not chage its metadata" ); From 4024bfe73605ce5c0ff13f7c337a2543d3ec7158 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Thu, 5 May 2022 22:21:07 +0300 Subject: [PATCH 0251/1022] get_binaries script fix (#1638) * get_binaries script fix * minor improvment for get_binaries --- .circleci/ansible/get_binaries.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/ansible/get_binaries.sh b/.circleci/ansible/get_binaries.sh index a4b4372d9f..c613213a75 100755 --- a/.circleci/ansible/get_binaries.sh +++ b/.circleci/ansible/get_binaries.sh @@ -7,7 +7,7 @@ RELEASE=${RELEASE:-false} # look at docker hub for latest tag for neon docker image if [ "${RELEASE}" = "true" ]; then echo "search latest relase tag" - VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1) + VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r 
-S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1) if [ -z "${VERSION}" ]; then echo "no any docker tags found, exiting..." exit 1 @@ -16,7 +16,7 @@ if [ "${RELEASE}" = "true" ]; then fi else echo "search latest dev tag" - VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -v release | tail -1) + VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1) if [ -z "${VERSION}" ]; then echo "no any docker tags found, exiting..." exit 1 From 954859f6c5648aa351b5c4a0b05b3db0f369a0ab Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 5 May 2022 13:15:53 +0300 Subject: [PATCH 0252/1022] add readme for performance tests with the current state of things --- test_runner/performance/README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 test_runner/performance/README.md diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md new file mode 100644 index 0000000000..7812c73f0c --- /dev/null +++ b/test_runner/performance/README.md @@ -0,0 +1,23 @@ +# What performance tests do we have and how we run them + +Performanse tests are build using infrastructure of our usual python integration tests. + +## Tests that are run against local installation + +Most off the performance tests run against local installation. This causes some problems because safekeeper(s) and a pageserver share resources of one single host and one underlyinng disk. + +These tests are run in CI in the same environment as the usual integration tests. So environment may not yield comarable results because this is the machine that CI provider gives us. + +## Remote tests + +There are a few tests that marked with `pytest.mark.remote_cluster`. These tests do not use local installation and onnly need a connection string to run. So they can be used for every postgresql comatible database. Currenntly these tests are run against our staging daily. Staging is not an isolated environment, so it adds to possible noise due to activity of other clusters. + +## Noise + +All tests run only once. Usually to obtain more consistent performance numbers test is performed multiple times and then some statistics is applied to results, like min/max/avg/median etc. + +## Results collection + +Local tests results for main branch and results of daily performance tests are stored in neon cluster deployed in production environment and there is a grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commits though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks. + +There is also an inconsistency in test naming. Test name should be the same across platforms and results can be differentiated by the platform field. But now platform is sometimes included in test name because of the way how parametrization works in pytest. Ie there is a platform switch in the dashboard with zenith-local-ci and zenith-staging variants. I e some tests under zenith-local-ci value for a platform switch are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[zenith]` which is highly confusing. 
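
A minimal sketch of the pytest parametrization behaviour the README above describes; the `platform` fixture and the parameter values here are hypothetical stand-ins rather than the suite's real fixtures, shown only to illustrate why the platform ends up inside the test name:

```python
import pytest

# Hypothetical stand-in for the real platform fixtures: parametrizing a fixture
# bakes each parameter value into the generated test ID.
@pytest.fixture(params=["vanilla", "zenith"])
def platform(request):
    return request.param

def test_bulk_insert(platform):
    # pytest reports this as test_bulk_insert[vanilla] and test_bulk_insert[zenith],
    # so the platform lands in the test name instead of a separate result field,
    # which is the naming inconsistency the README points out.
    assert platform in ("vanilla", "zenith")
```
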
From 1ad5658d9cd044b15059bdfb3417b19d5c6c8008 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 5 May 2022 19:55:08 +0300 Subject: [PATCH 0253/1022] Fix typos --- test_runner/performance/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 7812c73f0c..c2354a7e5b 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -1,20 +1,20 @@ # What performance tests do we have and how we run them -Performanse tests are build using infrastructure of our usual python integration tests. +Performance tests are built using infrastructure of our usual python integration tests. ## Tests that are run against local installation -Most off the performance tests run against local installation. This causes some problems because safekeeper(s) and a pageserver share resources of one single host and one underlyinng disk. +Most of the performance tests run against a local installation. This causes some problems because safekeeper(s) and the pageserver share resources of one single host and one underlying disk. These tests are run in CI in the same environment as the usual integration tests. So environment may not yield comarable results because this is the machine that CI provider gives us. ## Remote tests -There are a few tests that marked with `pytest.mark.remote_cluster`. These tests do not use local installation and onnly need a connection string to run. So they can be used for every postgresql comatible database. Currenntly these tests are run against our staging daily. Staging is not an isolated environment, so it adds to possible noise due to activity of other clusters. +There are a few tests that marked with `pytest.mark.remote_cluster`. These tests do not use local installation and only need a connection string to run. So they can be used for every postgresql compatible database. Currently these tests are run against our staging daily. Staging is not an isolated environment, so it adds to possible noise due to activity of other clusters. ## Noise -All tests run only once. Usually to obtain more consistent performance numbers test is performed multiple times and then some statistics is applied to results, like min/max/avg/median etc. +All tests run only once. Usually to obtain more consistent performance numbers test is performed multiple times and then some statistics is applied to the results, like min/max/avg/median etc. ## Results collection From 30a7598172e085cbe0687746ccc5d0cdbd460554 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 5 May 2022 20:04:54 +0300 Subject: [PATCH 0254/1022] Some copy-editing. --- test_runner/performance/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index c2354a7e5b..776565b679 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -1,23 +1,23 @@ # What performance tests do we have and how we run them -Performance tests are built using infrastructure of our usual python integration tests. +Performance tests are built using the same infrastructure as our usual python integration tests. There are some extra fixtures that help to collect performance metrics, and to run tests against both vanilla PostgreSQL and Neon for comparison. ## Tests that are run against local installation -Most of the performance tests run against a local installation. 
This causes some problems because safekeeper(s) and the pageserver share resources of one single host and one underlying disk. +Most of the performance tests run against a local installation. This is not very representative of a production environment. Firstly, Postgres, safekeeper(s) and the pageserver have to share CPU and I/O resources, which can add noise to the results. Secondly, network overhead is eliminated. -These tests are run in CI in the same environment as the usual integration tests. So environment may not yield comarable results because this is the machine that CI provider gives us. +In the CI, the performance tests are run in the same environment as the other integration tests. We don't have control over the host that the CI runs on, so the environment may vary widely from one run to another, which makes the results across different runs noisy to compare. ## Remote tests -There are a few tests that marked with `pytest.mark.remote_cluster`. These tests do not use local installation and only need a connection string to run. So they can be used for every postgresql compatible database. Currently these tests are run against our staging daily. Staging is not an isolated environment, so it adds to possible noise due to activity of other clusters. +There are a few tests that marked with `pytest.mark.remote_cluster`. These tests do not set up a local environment, and instead require a libpq connection string to connect to. So they can be run on any Postgres compatible database. Currently, the CI runs these tests our staging environment daily. Staging is not an isolated environment, so there can be noise in the results due to activity of other clusters. ## Noise -All tests run only once. Usually to obtain more consistent performance numbers test is performed multiple times and then some statistics is applied to the results, like min/max/avg/median etc. +All tests run only once. Usually to obtain more consistent performance numbers, a test should be repeated multiple times and the results be aggregated, for example by taking min, max, avg, or median. ## Results collection -Local tests results for main branch and results of daily performance tests are stored in neon cluster deployed in production environment and there is a grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commits though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks. +Local test results for main branch, and results of daily performance tests, are stored in a neon project deployed in production environment. There is a Grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commit, though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks. -There is also an inconsistency in test naming. Test name should be the same across platforms and results can be differentiated by the platform field. But now platform is sometimes included in test name because of the way how parametrization works in pytest. Ie there is a platform switch in the dashboard with zenith-local-ci and zenith-staging variants. 
I e some tests under zenith-local-ci value for a platform switch are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[zenith]` which is highly confusing. +There is also an inconsistency in test naming. Test name should be the same across platforms, and results can be differentiated by the platform field. But currently, platform is sometimes included in test name because of the way how parametrization works in pytest. I.e. there is a platform switch in the dashboard with zenith-local-ci and zenith-staging variants. I.e. some tests under zenith-local-ci value for a platform switch are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[zenith]` which is highly confusing. From 11a44eda0ecd4d41757e88df1d5fe3e3ecc73114 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Thu, 5 May 2022 23:48:16 +0300 Subject: [PATCH 0255/1022] Add TLS support in scram-proxy (#1643) * Add TLS support in scram-proxy * Fix authEndpoint --- .circleci/helm-values/staging.proxy-scram.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/helm-values/staging.proxy-scram.yaml b/.circleci/helm-values/staging.proxy-scram.yaml index f72a9d4557..91422e754a 100644 --- a/.circleci/helm-values/staging.proxy-scram.yaml +++ b/.circleci/helm-values/staging.proxy-scram.yaml @@ -6,7 +6,8 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-staging.local:9095/management/api/v2" + authEndpoint: "http://console-staging.local/management/api/v2" + domain: "*.cloud.stage.neon.tech" # -- Additional labels for zenith-proxy pods podLabels: From ef40e404cf15cc335fbf6a226879e5358aa628eb Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 5 May 2022 19:06:53 -0400 Subject: [PATCH 0256/1022] Rename zenith crate to neon_local (#1625) --- Cargo.lock | 34 ++++++++++++------------- Cargo.toml | 2 +- README.md | 16 ++++++------ {zenith => neon_local}/Cargo.toml | 2 +- {zenith => neon_local}/src/main.rs | 17 ++++++------- test_runner/fixtures/zenith_fixtures.py | 2 +- 6 files changed, 36 insertions(+), 37 deletions(-) rename {zenith => neon_local}/Cargo.toml (96%) rename {zenith => neon_local}/src/main.rs (98%) diff --git a/Cargo.lock b/Cargo.lock index e9b24b2f84..3c38dc8150 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1487,6 +1487,23 @@ dependencies = [ "tempfile", ] +[[package]] +name = "neon_local" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap 3.0.14", + "comfy-table", + "control_plane", + "pageserver", + "postgres", + "postgres_ffi", + "safekeeper", + "serde_json", + "utils", + "workspace_hack", +] + [[package]] name = "nix" version = "0.23.1" @@ -3703,23 +3720,6 @@ dependencies = [ "chrono", ] -[[package]] -name = "zenith" -version = "0.1.0" -dependencies = [ - "anyhow", - "clap 3.0.14", - "comfy-table", - "control_plane", - "pageserver", - "postgres", - "postgres_ffi", - "safekeeper", - "serde_json", - "utils", - "workspace_hack", -] - [[package]] name = "zeroize" version = "1.5.2" diff --git a/Cargo.toml b/Cargo.toml index 3838637d37..f0934853f0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,7 @@ members = [ "proxy", "safekeeper", "workspace_hack", - "zenith", + "neon_local", "libs/*", ] diff --git a/README.md b/README.md index 03f86887a7..8876831265 100644 --- a/README.md +++ b/README.md @@ -49,14 +49,14 @@ make -j5 ```sh # Create repository in 
.zenith with proper paths to binaries and data # Later that would be responsibility of a package install script -> ./target/debug/zenith init +> ./target/debug/neon_local init initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229 created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8 created main branch pageserver init succeeded # start pageserver and safekeeper -> ./target/debug/zenith start +> ./target/debug/neon_local start Starting pageserver at 'localhost:64000' in '.zenith' Pageserver started initializing for single for 7676 @@ -64,7 +64,7 @@ Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single' Safekeeper started # start postgres compute node -> ./target/debug/zenith pg start main +> ./target/debug/neon_local pg start main Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ... Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432 Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres' @@ -72,7 +72,7 @@ waiting for server to start.... done server started # check list of running postgres instances -> ./target/debug/zenith pg list +> ./target/debug/neon_local pg list NODE ADDRESS TIMELINES BRANCH NAME LSN STATUS main 127.0.0.1:55432 5b014a9e41b4b63ce1a1febc04503636 main 0/1609610 running ``` @@ -94,16 +94,16 @@ postgres=# select * from t; 5. And create branches and run postgres on them: ```sh # create branch named migration_check -> ./target/debug/zenith timeline branch --branch-name migration_check +> ./target/debug/neon_local timeline branch --branch-name migration_check Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main' # check branches tree -> ./target/debug/zenith timeline list +> ./target/debug/neon_local timeline list main [5b014a9e41b4b63ce1a1febc04503636] ┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9] # start postgres on that branch -> ./target/debug/zenith pg start migration_check +> ./target/debug/neon_local pg start migration_check Starting postgres node at 'host=127.0.0.1 port=55433 user=stas' waiting for server to start.... done @@ -123,7 +123,7 @@ INSERT 0 1 6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances you have just started. You can stop them all with one command: ```sh -> ./target/debug/zenith stop +> ./target/debug/neon_local stop ``` ## Running tests diff --git a/zenith/Cargo.toml b/neon_local/Cargo.toml similarity index 96% rename from zenith/Cargo.toml rename to neon_local/Cargo.toml index 58f1f5751d..78d339789f 100644 --- a/zenith/Cargo.toml +++ b/neon_local/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "zenith" +name = "neon_local" version = "0.1.0" edition = "2021" diff --git a/zenith/src/main.rs b/neon_local/src/main.rs similarity index 98% rename from zenith/src/main.rs rename to neon_local/src/main.rs index 87bb5f3f60..158e43f68f 100644 --- a/zenith/src/main.rs +++ b/neon_local/src/main.rs @@ -62,15 +62,15 @@ http_port = {safekeeper_http_port} struct TimelineTreeEl { /// `TimelineInfo` received from the `pageserver` via the `timeline_list` http API call. pub info: TimelineInfo, - /// Name, recovered from zenith config mappings + /// Name, recovered from neon config mappings pub name: Option, /// Holds all direct children of this timeline referenced using `timeline_id`. 
pub children: BTreeSet, } -// Main entry point for the 'zenith' CLI utility +// Main entry point for the 'neon_local' CLI utility // -// This utility helps to manage zenith installation. That includes following: +// This utility helps to manage neon installation. That includes following: // * Management of local postgres installations running on top of the // pageserver. // * Providing CLI api to the pageserver @@ -125,12 +125,12 @@ fn main() -> Result<()> { .takes_value(true) .required(false); - let matches = App::new("Zenith CLI") + let matches = App::new("Neon CLI") .setting(AppSettings::ArgRequiredElseHelp) .version(GIT_VERSION) .subcommand( App::new("init") - .about("Initialize a new Zenith repository") + .about("Initialize a new Neon repository") .arg(pageserver_config_args.clone()) .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) .arg( @@ -258,7 +258,7 @@ fn main() -> Result<()> { None => bail!("no subcommand provided"), }; - // Check for 'zenith init' command first. + // Check for 'neon init' command first. let subcommand_result = if sub_name == "init" { handle_init(sub_args).map(Some) } else { @@ -481,9 +481,8 @@ fn handle_init(init_match: &ArgMatches) -> Result { }; let mut env = - LocalEnv::create_config(&toml_file).context("Failed to create zenith configuration")?; - env.init() - .context("Failed to initialize zenith repository")?; + LocalEnv::create_config(&toml_file).context("Failed to create neon configuration")?; + env.init().context("Failed to initialize neon repository")?; // default_tenantid was generated by the `env.init()` call above let initial_tenant_id = env.default_tenant_id.unwrap(); diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 784d2d4b26..7acf0552df 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1108,7 +1108,7 @@ class ZenithCli: assert type(arguments) == list - bin_zenith = os.path.join(str(zenith_binpath), 'zenith') + bin_zenith = os.path.join(str(zenith_binpath), 'neon_local') args = [bin_zenith] + arguments log.info('Running command "{}"'.format(' '.join(args))) From dd6dca90726c66da7398d80fc13ebeddf945b5ee Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 6 May 2022 13:03:07 +0400 Subject: [PATCH 0257/1022] Bump vendor/postgres to shut down on wrong basebackup. 
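The change only advances the vendor/postgres submodule pointer from d35bd7132f to 9a9459a7f9 (see the diff below). As a rough sketch, a bump like this is normally produced with the standard git submodule workflow; the remote name `origin` below is an assumption, while the target commit hash is the one recorded in the diff:

```sh
# Sketch of the usual submodule-bump workflow (remote name "origin" is an assumption).
cd vendor/postgres
git fetch origin
# Check out the vendor/postgres commit that shuts down compute on a wrong basebackup.
git checkout 9a9459a7f9cbcaa0e35ff1f2f34c419238fdec7e
cd ../..
git add vendor/postgres
git commit -m "Bump vendor/postgres to shut down on wrong basebackup."
```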
--- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index d35bd7132f..9a9459a7f9 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit d35bd7132ff6ed600577934e5389c7657087fbe1 +Subproject commit 9a9459a7f9cbcaa0e35ff1f2f34c419238fdec7e From d4e155aaa3b818981717e5b1a1ac6fb7af5cc9cd Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 4 May 2022 18:28:46 +0300 Subject: [PATCH 0258/1022] Librarify common etcd timeline logic --- Cargo.lock | 182 ++++++++++++++--- control_plane/src/local_env.rs | 4 + control_plane/src/safekeeper.rs | 5 + libs/etcd_broker/Cargo.toml | 17 ++ libs/etcd_broker/src/lib.rs | 335 +++++++++++++++++++++++++++++++ libs/utils/src/zid.rs | 2 +- neon_local/src/main.rs | 15 +- safekeeper/Cargo.toml | 4 +- safekeeper/src/bin/safekeeper.rs | 11 +- safekeeper/src/broker.rs | 137 ++++--------- safekeeper/src/http/routes.rs | 4 +- safekeeper/src/lib.rs | 3 + safekeeper/src/safekeeper.rs | 4 +- safekeeper/src/timeline.rs | 48 ++++- workspace_hack/Cargo.toml | 9 +- 15 files changed, 633 insertions(+), 147 deletions(-) create mode 100644 libs/etcd_broker/Cargo.toml create mode 100644 libs/etcd_broker/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 3c38dc8150..ac40a2931f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -48,9 +48,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.53" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94a45b455c14666b85fc40a019e8ab9eb75e3a124e05494f5397122bc9eb06e0" +checksum = "08f9b8508dccb7687a1d6c4ce66b2b0ecef467c94667de27d8d7fe1f8d2a9cdc" dependencies = [ "backtrace", ] @@ -113,6 +113,49 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "axum" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4af7447fc1214c1f3a1ace861d0216a6c8bb13965b64bbad9650f375b67689a" +dependencies = [ + "async-trait", + "axum-core", + "bitflags", + "bytes", + "futures-util", + "http", + "http-body", + "hyper", + "itoa 1.0.1", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "serde", + "sync_wrapper", + "tokio", + "tower", + "tower-http", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bdc19781b16e32f8a7200368a336fa4509d4b72ef15dd4e41df5290855ee1e6" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "mime", +] + [[package]] name = "backtrace" version = "0.3.64" @@ -320,6 +363,15 @@ dependencies = [ "textwrap 0.14.2", ] +[[package]] +name = "cmake" +version = "0.1.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8ad8cef104ac57b68b89df3208164d228503abbdce70f6880ffa3d970e7443a" +dependencies = [ + "cc", +] + [[package]] name = "combine" version = "4.6.3" @@ -730,9 +782,9 @@ dependencies = [ [[package]] name = "etcd-client" -version = "0.8.4" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585de5039d1ecce74773db49ba4e8107e42be7c2cd0b1a9e7fce27181db7b118" +checksum = "c434d2800b273a506b82397aad2f20971636f65e47b27c027f77d498530c5954" dependencies = [ "http", "prost", @@ -740,9 +792,26 @@ dependencies = [ "tokio-stream", "tonic", "tonic-build", + "tower", 
"tower-service", ] +[[package]] +name = "etcd_broker" +version = "0.1.0" +dependencies = [ + "etcd-client", + "regex", + "serde", + "serde_json", + "serde_with", + "thiserror", + "tokio", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "fail" version = "0.5.0" @@ -1027,6 +1096,12 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -1092,6 +1167,12 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-range-header" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" + [[package]] name = "httparse" version = "1.6.0" @@ -1357,6 +1438,12 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" +[[package]] +name = "matchit" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb" + [[package]] name = "md-5" version = "0.9.1" @@ -1613,9 +1700,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" +checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" [[package]] name = "oorandom" @@ -1976,6 +2063,16 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" +[[package]] +name = "prettyplease" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9e07e3a46d0771a8a06b5f4441527802830b43e679ba12f44960f48dd4c6803" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro-hack" version = "0.5.19" @@ -2007,9 +2104,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.9.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "444879275cb4fd84958b1a1d5420d15e6fcf7c235fe47f053c9c2a80aceb6001" +checksum = "a07b0857a71a8cb765763950499cae2413c3f9cede1133478c43600d9e146890" dependencies = [ "bytes", "prost-derive", @@ -2017,12 +2114,14 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.9.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62941722fb675d463659e49c4f3fe1fe792ff24fe5bbaa9c08cd3b98a1c354f5" +checksum = "120fbe7988713f39d780a58cf1a7ef0d7ef66c6d87e5aa3438940c05357929f4" dependencies = [ "bytes", - "heck", + "cfg-if", + "cmake", + "heck 0.4.0", "itertools", "lazy_static", "log", @@ -2037,9 +2136,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.9.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9cc1a3263e07e0bf68e96268f37665207b49560d98739662cdfaae215c720fe" +checksum = "7b670f45da57fb8542ebdbb6105a925fe571b67f9e7ed9f47a06a84e72b4e7cc" dependencies = [ "anyhow", "itertools", @@ -2050,9 +2149,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.9.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "534b7a0e836e3c482d2693070f982e39e7611da9695d4d1f5a4b186b51faef0a" +checksum = "2d0a014229361011dc8e69c8a1ec6c2e8d0f2af7c91e3ea3f5b2170298461e68" dependencies = [ "bytes", "prost", @@ -2224,9 +2323,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.5.4" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286" dependencies = [ "aho-corasick", "memchr", @@ -2501,7 +2600,7 @@ dependencies = [ "const_format", "crc32c", "daemonize", - "etcd-client", + "etcd_broker", "fs2", "hex", "humantime", @@ -2830,7 +2929,7 @@ version = "0.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38" dependencies = [ - "heck", + "heck 0.3.3", "proc-macro2", "quote", "rustversion", @@ -2868,15 +2967,21 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.86" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b" +checksum = "7ff7c592601f11445996a06f8ad0c27f094a58857c2f89e97974ab9235b92c52" dependencies = [ "proc-macro2", "quote", "unicode-xid", ] +[[package]] +name = "sync_wrapper" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8" + [[package]] name = "tar" version = "0.4.38" @@ -3170,12 +3275,13 @@ dependencies = [ [[package]] name = "tonic" -version = "0.6.2" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff08f4649d10a70ffa3522ca559031285d8e421d727ac85c60825761818f5d0a" +checksum = "30fb54bf1e446f44d870d260d99957e7d11fb9d0a0f5bd1a662ad1411cc103f9" dependencies = [ "async-stream", "async-trait", + "axum", "base64", "bytes", "futures-core", @@ -3191,7 +3297,7 @@ dependencies = [ "prost-derive", "tokio", "tokio-stream", - "tokio-util 0.6.9", + "tokio-util 0.7.0", "tower", "tower-layer", "tower-service", @@ -3201,10 +3307,11 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.6.2" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9403f1bafde247186684b230dc6f38b5cd514584e8bec1dd32514be4745fa757" +checksum = "c03447cdc9eaf8feffb6412dcb27baf2db11669a6c4789f29da799aabfb99547" dependencies = [ + "prettyplease", "proc-macro2", "prost-build", "quote", @@ -3231,6 +3338,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "tower-http" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e980386f06883cf4d0578d6c9178c81f68b45d77d00f2c2c1bc034b3439c2c56" +dependencies = [ + "bitflags", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-range-header", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + [[package]] name = "tower-layer" version = "0.3.1" @@ -3672,13 +3798,16 @@ dependencies = [ name = "workspace_hack" version = "0.1.0" dependencies = [ + "ahash", "anyhow", "bytes", "chrono", "clap 2.34.0", "either", + "fail", "hashbrown", "indexmap", + "itoa 0.4.8", "libc", "log", "memchr", @@ -3692,6 +3821,7 @@ dependencies = [ "serde", "syn", "tokio", + "tokio-util 0.7.0", "tracing", "tracing-core", ] diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 
12ee88cdc9..5aeff505b6 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -63,6 +63,10 @@ pub struct LocalEnv { #[serde(default)] pub broker_endpoints: Option, + /// A prefix to all to any key when pushing/polling etcd from a node. + #[serde(default)] + pub broker_etcd_prefix: Option, + pub pageserver: PageServerConf, #[serde(default)] diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index b094016131..074ee72f69 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -77,6 +77,7 @@ pub struct SafekeeperNode { pub pageserver: Arc, broker_endpoints: Option, + broker_etcd_prefix: Option, } impl SafekeeperNode { @@ -94,6 +95,7 @@ impl SafekeeperNode { http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), pageserver, broker_endpoints: env.broker_endpoints.clone(), + broker_etcd_prefix: env.broker_etcd_prefix.clone(), } } @@ -143,6 +145,9 @@ impl SafekeeperNode { if let Some(ref ep) = self.broker_endpoints { cmd.args(&["--broker-endpoints", ep]); } + if let Some(prefix) = self.broker_etcd_prefix.as_deref() { + cmd.args(&["--broker-etcd-prefix", prefix]); + } if !cmd.status()?.success() { bail!( diff --git a/libs/etcd_broker/Cargo.toml b/libs/etcd_broker/Cargo.toml new file mode 100644 index 0000000000..65bd406131 --- /dev/null +++ b/libs/etcd_broker/Cargo.toml @@ -0,0 +1,17 @@ +[package] + name = "etcd_broker" + version = "0.1.0" + edition = "2021" + + [dependencies] + etcd-client = "0.9.0" + regex = "1.4.5" + serde = { version = "1.0", features = ["derive"] } + serde_json = "1" + serde_with = "1.12.0" + + utils = { path = "../utils" } + workspace_hack = { version = "0.1", path = "../../workspace_hack" } + tokio = "1" + tracing = "0.1" + thiserror = "1" diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs new file mode 100644 index 0000000000..01cc0cf162 --- /dev/null +++ b/libs/etcd_broker/src/lib.rs @@ -0,0 +1,335 @@ +//! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent). +//! Intended to connect services to each other, not to store their data. +use std::{ + collections::{hash_map, HashMap}, + fmt::Display, + str::FromStr, +}; + +use regex::{Captures, Regex}; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; + +pub use etcd_client::*; + +use tokio::{sync::mpsc, task::JoinHandle}; +use tracing::*; +use utils::{ + lsn::Lsn, + zid::{ZNodeId, ZTenantId, ZTenantTimelineId}, +}; + +#[derive(Debug, Deserialize, Serialize)] +struct SafekeeperTimeline { + safekeeper_id: ZNodeId, + info: SkTimelineInfo, +} + +/// Published data about safekeeper's timeline. Fields made optional for easy migrations. +#[serde_as] +#[derive(Debug, Deserialize, Serialize)] +pub struct SkTimelineInfo { + /// Term of the last entry. + pub last_log_term: Option, + /// LSN of the last record. + #[serde_as(as = "Option")] + #[serde(default)] + pub flush_lsn: Option, + /// Up to which LSN safekeeper regards its WAL as committed. + #[serde_as(as = "Option")] + #[serde(default)] + pub commit_lsn: Option, + /// LSN up to which safekeeper offloaded WAL to s3. + #[serde_as(as = "Option")] + #[serde(default)] + pub s3_wal_lsn: Option, + /// LSN of last checkpoint uploaded by pageserver. 
+ #[serde_as(as = "Option")] + #[serde(default)] + pub remote_consistent_lsn: Option, + #[serde_as(as = "Option")] + #[serde(default)] + pub peer_horizon_lsn: Option, + #[serde(default)] + pub wal_stream_connection_string: Option, +} + +#[derive(Debug, thiserror::Error)] +pub enum BrokerError { + #[error("Etcd client error: {0}. Context: {1}")] + EtcdClient(etcd_client::Error, String), + #[error("Error during parsing etcd data: {0}")] + ParsingError(String), + #[error("Internal error: {0}")] + InternalError(String), +} + +/// A way to control the data retrieval from a certain subscription. +pub struct SkTimelineSubscription { + safekeeper_timeline_updates: + mpsc::UnboundedReceiver>>, + kind: SkTimelineSubscriptionKind, + watcher_handle: JoinHandle>, + watcher: Watcher, +} + +impl SkTimelineSubscription { + /// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet. + pub async fn fetch_data( + &mut self, + ) -> Option>> { + self.safekeeper_timeline_updates.recv().await + } + + /// Cancels the subscription, stopping the data poller and waiting for it to shut down. + pub async fn cancel(mut self) -> Result<(), BrokerError> { + self.watcher.cancel().await.map_err(|e| { + BrokerError::EtcdClient( + e, + format!( + "Failed to cancel timeline subscription, kind: {:?}", + self.kind + ), + ) + })?; + self.watcher_handle.await.map_err(|e| { + BrokerError::InternalError(format!( + "Failed to join the timeline updates task, kind: {:?}, error: {e}", + self.kind + )) + })? + } +} + +/// The subscription kind to the timeline updates from safekeeper. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SkTimelineSubscriptionKind { + broker_prefix: String, + kind: SubscriptionKind, +} + +impl SkTimelineSubscriptionKind { + pub fn all(broker_prefix: String) -> Self { + Self { + broker_prefix, + kind: SubscriptionKind::All, + } + } + + pub fn tenant(broker_prefix: String, tenant: ZTenantId) -> Self { + Self { + broker_prefix, + kind: SubscriptionKind::Tenant(tenant), + } + } + + pub fn timeline(broker_prefix: String, timeline: ZTenantTimelineId) -> Self { + Self { + broker_prefix, + kind: SubscriptionKind::Timeline(timeline), + } + } + + fn watch_regex(&self) -> Regex { + match self.kind { + SubscriptionKind::All => Regex::new(&format!( + r"^{}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$", + self.broker_prefix + )) + .expect("wrong regex for 'everything' subscription"), + SubscriptionKind::Tenant(tenant_id) => Regex::new(&format!( + r"^{}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]])$", + self.broker_prefix + )) + .expect("wrong regex for 'tenant' subscription"), + SubscriptionKind::Timeline(ZTenantTimelineId { + tenant_id, + timeline_id, + }) => Regex::new(&format!( + r"^{}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]])$", + self.broker_prefix + )) + .expect("wrong regex for 'timeline' subscription"), + } + } + + /// Etcd key to use for watching a certain timeline updates from safekeepers. + pub fn watch_key(&self) -> String { + match self.kind { + SubscriptionKind::All => self.broker_prefix.to_string(), + SubscriptionKind::Tenant(tenant_id) => { + format!("{}/{tenant_id}/safekeeper", self.broker_prefix) + } + SubscriptionKind::Timeline(ZTenantTimelineId { + tenant_id, + timeline_id, + }) => format!( + "{}/{tenant_id}/{timeline_id}/safekeeper", + self.broker_prefix + ), + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +enum SubscriptionKind { + /// Get every timeline update. 
+ All, + /// Get certain tenant timelines' updates. + Tenant(ZTenantId), + /// Get certain timeline updates. + Timeline(ZTenantTimelineId), +} + +/// Creates a background task to poll etcd for timeline updates from safekeepers. +/// Stops and returns `Err` on any error during etcd communication. +/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle, +/// exiting normally in such cases. +pub async fn subscribe_to_safekeeper_timeline_updates( + client: &mut Client, + subscription: SkTimelineSubscriptionKind, +) -> Result { + info!("Subscribing to timeline updates, subscription kind: {subscription:?}"); + + let (watcher, mut stream) = client + .watch( + subscription.watch_key(), + Some(WatchOptions::new().with_prefix()), + ) + .await + .map_err(|e| { + BrokerError::EtcdClient( + e, + format!("Failed to init the watch for subscription {subscription:?}"), + ) + })?; + + let (timeline_updates_sender, safekeeper_timeline_updates) = mpsc::unbounded_channel(); + + let subscription_kind = subscription.kind; + let regex = subscription.watch_regex(); + let watcher_handle = tokio::spawn(async move { + while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!( + "Failed to get messages from the subscription stream, kind: {subscription_kind:?}, error: {e}" + )))? { + if resp.canceled() { + info!("Watch for timeline updates subscription was canceled, exiting"); + break; + } + + let mut timeline_updates: HashMap> = + HashMap::new(); + + let events = resp.events(); + debug!("Processing {} events", events.len()); + + for event in events { + if EventType::Put == event.event_type() { + if let Some(kv) = event.kv() { + match parse_etcd_key_value(subscription_kind, ®ex, kv) { + Ok(Some((zttid, timeline))) => { + match timeline_updates + .entry(zttid) + .or_default() + .entry(timeline.safekeeper_id) + { + hash_map::Entry::Occupied(mut o) => { + if o.get().flush_lsn < timeline.info.flush_lsn { + o.insert(timeline.info); + } + } + hash_map::Entry::Vacant(v) => { + v.insert(timeline.info); + } + } + } + Ok(None) => {} + Err(e) => error!("Failed to parse timeline update: {e}"), + }; + } + } + } + + if let Err(e) = timeline_updates_sender.send(timeline_updates) { + info!("Timeline updates sender got dropped, exiting: {e}"); + break; + } + } + + Ok(()) + }); + + Ok(SkTimelineSubscription { + kind: subscription, + safekeeper_timeline_updates, + watcher_handle, + watcher, + }) +} + +fn parse_etcd_key_value( + subscription_kind: SubscriptionKind, + regex: &Regex, + kv: &KeyValue, +) -> Result, BrokerError> { + let caps = if let Some(caps) = regex.captures(kv.key_str().map_err(|e| { + BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as key str")) + })?) 
{ + caps + } else { + return Ok(None); + }; + + let (zttid, safekeeper_id) = match subscription_kind { + SubscriptionKind::All => ( + ZTenantTimelineId::new( + parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?, + parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?, + ), + ZNodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?), + ), + SubscriptionKind::Tenant(tenant_id) => ( + ZTenantTimelineId::new( + tenant_id, + parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?, + ), + ZNodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?), + ), + SubscriptionKind::Timeline(zttid) => ( + zttid, + ZNodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?), + ), + }; + + let info_str = kv.value_str().map_err(|e| { + BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as value str")) + })?; + Ok(Some(( + zttid, + SafekeeperTimeline { + safekeeper_id, + info: serde_json::from_str(info_str).map_err(|e| { + BrokerError::ParsingError(format!( + "Failed to parse '{info_str}' as safekeeper timeline info: {e}" + )) + })?, + }, + ))) +} + +fn parse_capture(caps: &Captures, index: usize) -> Result +where + T: FromStr, + ::Err: Display, +{ + let capture_match = caps + .get(index) + .ok_or_else(|| format!("Failed to get capture match at index {index}"))? + .as_str(); + capture_match.parse().map_err(|e| { + format!( + "Failed to parse {} from {capture_match}: {e}", + std::any::type_name::() + ) + }) +} diff --git a/libs/utils/src/zid.rs b/libs/utils/src/zid.rs index fce5ed97c1..44d81cda50 100644 --- a/libs/utils/src/zid.rs +++ b/libs/utils/src/zid.rs @@ -224,7 +224,7 @@ impl fmt::Display for ZTenantTimelineId { // Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued // by the console. 
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Debug, Serialize, Deserialize)] +#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)] #[serde(transparent)] pub struct ZNodeId(pub u64); diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index 158e43f68f..8b54054080 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -517,7 +517,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { .collect() } -fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { +fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> { let pageserver = PageServerNode::from_env(env); match tenant_match.subcommand() { Some(("list", _)) => { @@ -550,17 +550,8 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re pageserver .tenant_config(tenant_id, tenant_conf) - .unwrap_or_else(|e| { - anyhow!( - "Tenant config failed for tenant with id {} : {}", - tenant_id, - e - ); - }); - println!( - "tenant {} successfully configured on the pageserver", - tenant_id - ); + .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; + println!("tenant {tenant_id} successfully configured on the pageserver"); } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), None => bail!("no tenant subcommand provided"), diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 8a31311b8f..44587dd384 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -24,11 +24,10 @@ walkdir = "2" url = "2.2.2" signal-hook = "0.3.10" serde = { version = "1.0", features = ["derive"] } -serde_with = {version = "1.12.0"} +serde_with = "1.12.0" hex = "0.4.3" const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -etcd-client = "0.8.3" tokio-util = { version = "0.7", features = ["io"] } rusoto_core = "0.47" rusoto_s3 = "0.47" @@ -36,6 +35,7 @@ rusoto_s3 = "0.47" postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } utils = { path = "../libs/utils" } +etcd_broker = { path = "../libs/etcd_broker" } workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 3fea3581a8..7e979840c2 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -109,6 +109,12 @@ fn main() -> Result<()> { .takes_value(true) .help("a comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'"), ) + .arg( + Arg::new("broker-etcd-prefix") + .long("broker-etcd-prefix") + .takes_value(true) + .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"), + ) .get_matches(); if let Some(addr) = arg_matches.value_of("dump-control-file") { @@ -118,7 +124,7 @@ fn main() -> Result<()> { return Ok(()); } - let mut conf: SafeKeeperConf = Default::default(); + let mut conf = SafeKeeperConf::default(); if let Some(dir) = arg_matches.value_of("datadir") { // change into the data directory. 
@@ -162,6 +168,9 @@ fn main() -> Result<()> { let collected_ep: Result, ParseError> = addr.split(',').map(Url::parse).collect(); conf.broker_endpoints = Some(collected_ep?); } + if let Some(prefix) = arg_matches.value_of("broker-etcd-prefix") { + conf.broker_etcd_prefix = prefix.to_string(); + } start_safekeeper(conf, given_id, arg_matches.is_present("init")) } diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 8ce7bdf0e5..c9ae1a8d98 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -1,61 +1,22 @@ //! Communication with etcd, providing safekeeper peers and pageserver coordination. -use anyhow::bail; use anyhow::Context; use anyhow::Error; use anyhow::Result; -use etcd_client::Client; -use etcd_client::EventType; -use etcd_client::PutOptions; -use etcd_client::WatchOptions; -use lazy_static::lazy_static; -use regex::Regex; -use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr}; -use std::str::FromStr; +use etcd_broker::Client; +use etcd_broker::PutOptions; +use etcd_broker::SkTimelineSubscriptionKind; use std::time::Duration; use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; -use crate::{safekeeper::Term, timeline::GlobalTimelines, SafeKeeperConf}; -use utils::{ - lsn::Lsn, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, -}; +use crate::{timeline::GlobalTimelines, SafeKeeperConf}; +use utils::zid::{ZNodeId, ZTenantTimelineId}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; const LEASE_TTL_SEC: i64 = 5; -// TODO: add global zenith installation ID. -const ZENITH_PREFIX: &str = "zenith"; - -/// Published data about safekeeper. Fields made optional for easy migrations. -#[serde_as] -#[derive(Debug, Deserialize, Serialize)] -pub struct SafekeeperInfo { - /// Term of the last entry. - pub last_log_term: Option, - /// LSN of the last record. - #[serde_as(as = "Option")] - #[serde(default)] - pub flush_lsn: Option, - /// Up to which LSN safekeeper regards its WAL as committed. - #[serde_as(as = "Option")] - #[serde(default)] - pub commit_lsn: Option, - /// LSN up to which safekeeper offloaded WAL to s3. - #[serde_as(as = "Option")] - #[serde(default)] - pub s3_wal_lsn: Option, - /// LSN of last checkpoint uploaded by pageserver. - #[serde_as(as = "Option")] - #[serde(default)] - pub remote_consistent_lsn: Option, - #[serde_as(as = "Option")] - #[serde(default)] - pub peer_horizon_lsn: Option, -} pub fn thread_main(conf: SafeKeeperConf) { let runtime = runtime::Builder::new_current_thread() @@ -71,22 +32,21 @@ pub fn thread_main(conf: SafeKeeperConf) { }); } -/// Prefix to timeline related data. -fn timeline_path(zttid: &ZTenantTimelineId) -> String { +/// Key to per timeline per safekeeper data. +fn timeline_safekeeper_path( + broker_prefix: String, + zttid: ZTenantTimelineId, + sk_id: ZNodeId, +) -> String { format!( - "{}/{}/{}", - ZENITH_PREFIX, zttid.tenant_id, zttid.timeline_id + "{}/{sk_id}", + SkTimelineSubscriptionKind::timeline(broker_prefix, zttid).watch_key() ) } -/// Key to per timeline per safekeeper data. -fn timeline_safekeeper_path(zttid: &ZTenantTimelineId, sk_id: ZNodeId) -> String { - format!("{}/safekeeper/{}", timeline_path(zttid), sk_id) -} - /// Push once in a while data about all active timelines to the broker. 
-async fn push_loop(conf: SafeKeeperConf) -> Result<()> { - let mut client = Client::connect(conf.broker_endpoints.as_ref().unwrap(), None).await?; +async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { + let mut client = Client::connect(&conf.broker_endpoints.as_ref().unwrap(), None).await?; // Get and maintain lease to automatically delete obsolete data let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; @@ -98,14 +58,17 @@ async fn push_loop(conf: SafeKeeperConf) -> Result<()> { // is under plain mutex. That's ok, all this code is not performance // sensitive and there is no risk of deadlock as we don't await while // lock is held. - let active_tlis = GlobalTimelines::get_active_timelines(); - for zttid in &active_tlis { - if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) { - let sk_info = tli.get_public_info(); + for zttid in GlobalTimelines::get_active_timelines() { + if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { + let sk_info = tli.get_public_info()?; let put_opts = PutOptions::new().with_lease(lease.id()); client .put( - timeline_safekeeper_path(zttid, conf.my_id), + timeline_safekeeper_path( + conf.broker_etcd_prefix.clone(), + zttid, + conf.my_id, + ), serde_json::to_string(&sk_info)?, Some(put_opts), ) @@ -128,45 +91,31 @@ async fn push_loop(conf: SafeKeeperConf) -> Result<()> { /// Subscribe and fetch all the interesting data from the broker. async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { - lazy_static! { - static ref TIMELINE_SAFEKEEPER_RE: Regex = - Regex::new(r"^zenith/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$") - .unwrap(); - } - let mut client = Client::connect(conf.broker_endpoints.as_ref().unwrap(), None).await?; - loop { - let wo = WatchOptions::new().with_prefix(); - // TODO: subscribe only to my timelines - let (_, mut stream) = client.watch(ZENITH_PREFIX, Some(wo)).await?; - while let Some(resp) = stream.message().await? { - if resp.canceled() { - bail!("watch canceled"); - } + let mut client = Client::connect(&conf.broker_endpoints.as_ref().unwrap(), None).await?; - for event in resp.events() { - if EventType::Put == event.event_type() { - if let Some(kv) = event.kv() { - if let Some(caps) = TIMELINE_SAFEKEEPER_RE.captures(kv.key_str()?) { - let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let zttid = ZTenantTimelineId::new(tenant_id, timeline_id); - let safekeeper_id = ZNodeId(caps.get(3).unwrap().as_str().parse()?); - let value_str = kv.value_str()?; - match serde_json::from_str::(value_str) { - Ok(safekeeper_info) => { - if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { - tli.record_safekeeper_info(&safekeeper_info, safekeeper_id)? - } - } - Err(err) => warn!( - "failed to deserialize safekeeper info {}: {}", - value_str, err - ), - } + let mut subscription = etcd_broker::subscribe_to_safekeeper_timeline_updates( + &mut client, + SkTimelineSubscriptionKind::all(conf.broker_etcd_prefix.clone()), + ) + .await + .context("failed to subscribe for safekeeper info")?; + + loop { + match subscription.fetch_data().await { + Some(new_info) => { + for (zttid, sk_info) in new_info { + // note: there are blocking operations below, but it's considered fine for now + if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { + for (safekeeper_id, info) in sk_info { + tli.record_safekeeper_info(&info, safekeeper_id)? 
} } } } + None => { + debug!("timeline updates sender closed, aborting the pull loop"); + return Ok(()); + } } } } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index d7cbcb094e..e731db5617 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,3 +1,4 @@ +use etcd_broker::SkTimelineInfo; use hyper::{Body, Request, Response, StatusCode}; use serde::Serialize; @@ -5,7 +6,6 @@ use serde::Serializer; use std::fmt::Display; use std::sync::Arc; -use crate::broker::SafekeeperInfo; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; use crate::timeline::GlobalTimelines; @@ -136,7 +136,7 @@ async fn record_safekeeper_info(mut request: Request) -> Result>, + pub broker_etcd_prefix: String, } impl SafeKeeperConf { @@ -76,6 +78,7 @@ impl Default for SafeKeeperConf { recall_period: defaults::DEFAULT_RECALL_PERIOD, my_id: ZNodeId(0), broker_endpoints: None, + broker_etcd_prefix: defaults::DEFAULT_NEON_BROKER_PREFIX.to_string(), } } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 68361fd672..b9264565dc 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -4,6 +4,7 @@ use anyhow::{bail, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use etcd_broker::SkTimelineInfo; use postgres_ffi::xlog_utils::TimeLineID; use postgres_ffi::xlog_utils::XLogSegNo; @@ -16,7 +17,6 @@ use tracing::*; use lazy_static::lazy_static; -use crate::broker::SafekeeperInfo; use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; @@ -886,7 +886,7 @@ where } /// Update timeline state with peer safekeeper data. - pub fn record_safekeeper_info(&mut self, sk_info: &SafekeeperInfo) -> Result<()> { + pub fn record_safekeeper_info(&mut self, sk_info: &SkTimelineInfo) -> Result<()> { let mut sync_control_file = false; if let (Some(commit_lsn), Some(last_log_term)) = (sk_info.commit_lsn, sk_info.last_log_term) { diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 47137091da..140d6660ac 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -3,6 +3,7 @@ use anyhow::{bail, Context, Result}; +use etcd_broker::SkTimelineInfo; use lazy_static::lazy_static; use postgres_ffi::xlog_utils::XLogSegNo; @@ -21,7 +22,6 @@ use utils::{ zid::{ZNodeId, ZTenantTimelineId}, }; -use crate::broker::SafekeeperInfo; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; use crate::control_file; @@ -89,6 +89,7 @@ struct SharedState { active: bool, num_computes: u32, pageserver_connstr: Option, + listen_pg_addr: String, last_removed_segno: XLogSegNo, } @@ -111,6 +112,7 @@ impl SharedState { active: false, num_computes: 0, pageserver_connstr: None, + listen_pg_addr: conf.listen_pg_addr.clone(), last_removed_segno: 0, }) } @@ -130,6 +132,7 @@ impl SharedState { active: false, num_computes: 0, pageserver_connstr: None, + listen_pg_addr: conf.listen_pg_addr.clone(), last_removed_segno: 0, }) } @@ -418,9 +421,9 @@ impl Timeline { } /// Prepare public safekeeper info for reporting. 
- pub fn get_public_info(&self) -> SafekeeperInfo { + pub fn get_public_info(&self) -> anyhow::Result { let shared_state = self.mutex.lock().unwrap(); - SafekeeperInfo { + Ok(SkTimelineInfo { last_log_term: Some(shared_state.sk.get_epoch()), flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()), // note: this value is not flushed to control file yet and can be lost @@ -432,11 +435,23 @@ impl Timeline { shared_state.sk.inmem.remote_consistent_lsn, )), peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn), - } + wal_stream_connection_string: shared_state + .pageserver_connstr + .as_deref() + .map(|pageserver_connstr| { + wal_stream_connection_string( + self.zttid, + &shared_state.listen_pg_addr, + pageserver_connstr, + ) + }) + .transpose() + .context("Failed to get the pageserver callmemaybe connstr")?, + }) } /// Update timeline state with peer safekeeper data. - pub fn record_safekeeper_info(&self, sk_info: &SafekeeperInfo, _sk_id: ZNodeId) -> Result<()> { + pub fn record_safekeeper_info(&self, sk_info: &SkTimelineInfo, _sk_id: ZNodeId) -> Result<()> { let mut shared_state = self.mutex.lock().unwrap(); shared_state.sk.record_safekeeper_info(sk_info)?; self.notify_wal_senders(&mut shared_state); @@ -489,6 +504,29 @@ impl Timeline { } } +// pageserver connstr is needed to be able to distinguish between different pageservers +// it is required to correctly manage callmemaybe subscriptions when more than one pageserver is involved +// TODO it is better to use some sort of a unique id instead of connection string, see https://github.com/zenithdb/zenith/issues/1105 +fn wal_stream_connection_string( + ZTenantTimelineId { + tenant_id, + timeline_id, + }: ZTenantTimelineId, + listen_pg_addr_str: &str, + pageserver_connstr: &str, +) -> anyhow::Result { + let me_connstr = format!("postgresql://no_user@{}/no_db", listen_pg_addr_str); + let me_conf = me_connstr + .parse::() + .with_context(|| { + format!("Failed to parse pageserver connection string '{me_connstr}' as a postgres one") + })?; + let (host, port) = utils::connstring::connection_host_port(&me_conf); + Ok(format!( + "host={host} port={port} options='-c ztimelineid={timeline_id} ztenantid={tenant_id} pageserver_connstr={pageserver_connstr}'", + )) +} + // Utilities needed by various Connection-like objects pub trait TimelineTools { fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()>; diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f178b5b766..2bb22f2d3b 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -14,29 +14,34 @@ publish = false ### BEGIN HAKARI SECTION [dependencies] +ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } +fail = { version = "0.5", default-features = false, features = ["failpoints"] } hashbrown = { version = "0.11", features = ["ahash", "inline-more", "raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } +itoa = { version = "0.4", features = ["i128", "std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } memchr = { version = "2", 
features = ["std", "use_std"] } num-integer = { version = "0.1", default-features = false, features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "std"] } -prost = { version = "0.9", features = ["prost-derive", "std"] } +prost = { version = "0.10", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } +tokio-util = { version = "0.7", features = ["codec", "io"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } tracing-core = { version = "0.1", features = ["lazy_static", "std"] } [build-dependencies] +ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } @@ -46,7 +51,7 @@ indexmap = { version = "1", default-features = false, features = ["std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } memchr = { version = "2", features = ["std", "use_std"] } -prost = { version = "0.9", features = ["prost-derive", "std"] } +prost = { version = "0.10", features = ["prost-derive", "std"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } From de37f982dba67eae85b64c48259a0a36dbcc0e09 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 4 May 2022 17:06:44 +0300 Subject: [PATCH 0259/1022] Share the remote storage as a crate --- Cargo.lock | 71 +-- control_plane/src/storage.rs | 17 +- docs/settings.md | 20 +- libs/remote_storage/Cargo.toml | 20 + libs/remote_storage/src/lib.rs | 232 ++++++++++ .../remote_storage/src}/local_fs.rs | 186 ++++---- .../remote_storage/src}/s3_bucket.rs | 147 +++---- pageserver/Cargo.toml | 8 +- pageserver/README.md | 6 +- pageserver/src/config.rs | 120 +---- pageserver/src/http/routes.rs | 29 +- pageserver/src/layered_repository.rs | 14 +- pageserver/src/lib.rs | 2 +- pageserver/src/remote_storage.rs | 412 ------------------ pageserver/src/repository.rs | 2 +- .../src/{remote_storage => }/storage_sync.rs | 373 +++++++++++++--- 
.../storage_sync/download.rs | 54 +-- .../storage_sync/index.rs | 0 .../storage_sync/upload.rs | 53 ++- pageserver/src/tenant_mgr.rs | 5 +- pageserver/src/timelines.rs | 2 +- safekeeper/Cargo.toml | 3 +- safekeeper/src/s3_offload.rs | 107 ++--- test_runner/fixtures/zenith_fixtures.py | 29 +- workspace_hack/Cargo.toml | 6 + 25 files changed, 961 insertions(+), 957 deletions(-) create mode 100644 libs/remote_storage/Cargo.toml create mode 100644 libs/remote_storage/src/lib.rs rename {pageserver/src/remote_storage => libs/remote_storage/src}/local_fs.rs (81%) rename {pageserver/src/remote_storage => libs/remote_storage/src}/s3_bucket.rs (74%) delete mode 100644 pageserver/src/remote_storage.rs rename pageserver/src/{remote_storage => }/storage_sync.rs (77%) rename pageserver/src/{remote_storage => }/storage_sync/download.rs (93%) rename pageserver/src/{remote_storage => }/storage_sync/index.rs (100%) rename pageserver/src/{remote_storage => }/storage_sync/upload.rs (93%) diff --git a/Cargo.lock b/Cargo.lock index ac40a2931f..148517a777 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -48,9 +48,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.57" +version = "1.0.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f9b8508dccb7687a1d6c4ce66b2b0ecef467c94667de27d8d7fe1f8d2a9cdc" +checksum = "94a45b455c14666b85fc40a019e8ab9eb75e3a124e05494f5397122bc9eb06e0" dependencies = [ "backtrace", ] @@ -1700,9 +1700,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.10.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" +checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" [[package]] name = "oorandom" @@ -1763,7 +1763,6 @@ name = "pageserver" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "byteorder", "bytes", "chrono", @@ -1791,8 +1790,7 @@ dependencies = [ "pprof", "rand", "regex", - "rusoto_core", - "rusoto_s3", + "remote_storage", "scopeguard", "serde", "serde_json", @@ -1804,7 +1802,6 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-stream", - "tokio-util 0.7.0", "toml_edit", "tracing", "url", @@ -2104,9 +2101,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.10.1" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a07b0857a71a8cb765763950499cae2413c3f9cede1133478c43600d9e146890" +checksum = "bc03e116981ff7d8da8e5c220e374587b98d294af7ba7dd7fda761158f00086f" dependencies = [ "bytes", "prost-derive", @@ -2114,9 +2111,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.10.1" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "120fbe7988713f39d780a58cf1a7ef0d7ef66c6d87e5aa3438940c05357929f4" +checksum = "65a1118354442de7feb8a2a76f3d80ef01426bd45542c8c1fdffca41a758f846" dependencies = [ "bytes", "cfg-if", @@ -2347,6 +2344,23 @@ version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" +[[package]] +name = "remote_storage" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "rusoto_core", + "rusoto_s3", + "serde", + "serde_json", + "tempfile", + "tokio", + "tokio-util 0.7.0", + "tracing", + "workspace_hack", +] + [[package]] name = "remove_dir_all" version = "0.5.3" @@ -2446,9 +2460,9 @@ dependencies = [ [[package]] name = "rusoto_core" -version = 
"0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b4f000e8934c1b4f70adde180056812e7ea6b1a247952db8ee98c94cd3116cc" +checksum = "1db30db44ea73551326269adcf7a2169428a054f14faf9e1768f2163494f2fa2" dependencies = [ "async-trait", "base64", @@ -2471,9 +2485,9 @@ dependencies = [ [[package]] name = "rusoto_credential" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a46b67db7bb66f5541e44db22b0a02fed59c9603e146db3a9e633272d3bac2f" +checksum = "ee0a6c13db5aad6047b6a44ef023dbbc21a056b6dab5be3b79ce4283d5c02d05" dependencies = [ "async-trait", "chrono", @@ -2489,9 +2503,9 @@ dependencies = [ [[package]] name = "rusoto_s3" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "048c2fe811a823ad5a9acc976e8bf4f1d910df719dcf44b15c3e96c5b7a51027" +checksum = "7aae4677183411f6b0b412d66194ef5403293917d66e70ab118f07cc24c5b14d" dependencies = [ "async-trait", "bytes", @@ -2502,9 +2516,9 @@ dependencies = [ [[package]] name = "rusoto_signature" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6264e93384b90a747758bcc82079711eacf2e755c3a8b5091687b5349d870bcc" +checksum = "a5ae95491c8b4847931e291b151127eccd6ff8ca13f33603eb3d0035ecb05272" dependencies = [ "base64", "bytes", @@ -2611,8 +2625,7 @@ dependencies = [ "postgres-protocol", "postgres_ffi", "regex", - "rusoto_core", - "rusoto_s3", + "remote_storage", "serde", "serde_json", "serde_with", @@ -3275,9 +3288,9 @@ dependencies = [ [[package]] name = "tonic" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30fb54bf1e446f44d870d260d99957e7d11fb9d0a0f5bd1a662ad1411cc103f9" +checksum = "5be9d60db39854b30b835107500cf0aca0b0d14d6e1c3de124217c23a29c2ddb" dependencies = [ "async-stream", "async-trait", @@ -3307,9 +3320,9 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c03447cdc9eaf8feffb6412dcb27baf2db11669a6c4789f29da799aabfb99547" +checksum = "d9263bf4c9bfaae7317c1c2faf7f18491d2fe476f70c414b73bf5d445b00ffa1" dependencies = [ "prettyplease", "proc-macro2", @@ -3805,7 +3818,13 @@ dependencies = [ "clap 2.34.0", "either", "fail", + "futures-channel", + "futures-task", + "futures-util", + "generic-array", "hashbrown", + "hex", + "hyper", "indexmap", "itoa 0.4.8", "libc", diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 3a63bf6960..adb924d430 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -186,8 +186,6 @@ impl PageServerNode { ); io::stdout().flush().unwrap(); - let mut cmd = Command::new(self.env.pageserver_bin()?); - let repo_path = self.repo_path(); let mut args = vec!["-D", repo_path.to_str().unwrap()]; @@ -195,9 +193,11 @@ impl PageServerNode { args.extend(["-c", config_override]); } - fill_rust_env_vars(cmd.args(&args).arg("--daemonize")); + let mut cmd = Command::new(self.env.pageserver_bin()?); + let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize")); + filled_cmd = fill_aws_secrets_vars(filled_cmd); - if !cmd.status()?.success() { + if !filled_cmd.status()?.success() { bail!( "Pageserver failed to start. 
See '{}' for details.", self.repo_path().join("pageserver.log").display() @@ -457,3 +457,12 @@ impl PageServerNode { Ok(timeline_info_response) } } + +fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { + for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] { + if let Ok(value) = std::env::var(env_key) { + cmd = cmd.env(env_key, value); + } + } + cmd +} diff --git a/docs/settings.md b/docs/settings.md index b3925528cd..017d349bb6 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -6,7 +6,6 @@ If there's no such file during `init` phase of the server, it creates the file i There's a possibility to pass an arbitrary config value to the pageserver binary as an argument: such values override the values in the config file, if any are specified for the same key and get into the final config during init phase. - ### Config example ```toml @@ -35,9 +34,9 @@ Yet, it validates the config values it can (e.g. postgres install dir) and error Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#table) in TOML specification and -* either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'` +- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'` -* or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}` +- or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}` ### Config values @@ -57,7 +56,7 @@ but it will trigger a checkpoint operation to get it back below the limit. `checkpoint_distance` also determines how much WAL needs to be kept -durable in the safekeeper. The safekeeper must have capacity to hold +durable in the safekeeper. The safekeeper must have capacity to hold this much WAL, with some headroom, otherwise you can get stuck in a situation where the safekeeper is full and stops accepting new WAL, but the pageserver is not flushing out and releasing the space in the @@ -72,7 +71,7 @@ The unit is # of bytes. Every `compaction_period` seconds, the page server checks if maintenance operations, like compaction, are needed on the layer -files. Default is 1 s, which should be fine. +files. Default is 1 s, which should be fine. #### compaction_target_size @@ -163,16 +162,12 @@ bucket_region = 'eu-north-1' # Optional, pageserver uses entire bucket if the prefix is not specified. prefix_in_bucket = '/some/prefix/' -# Access key to connect to the bucket ("login" part of the credentials) -access_key_id = 'SOMEKEYAAAAASADSAH*#' - -# Secret access key to connect to the bucket ("password" part of the credentials) -secret_access_key = 'SOMEsEcReTsd292v' - # S3 API query limit to avoid getting errors/throttling from AWS. concurrency_limit = 100 ``` +If no IAM bucket access is used during the remote storage usage, use the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to set the access credentials. + ###### General remote storage configuration Pagesever allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used. @@ -183,13 +178,12 @@ Besides, there are parameters common for all types of remote storage that can be ```toml [remote_storage] # Max number of concurrent timeline synchronized (layers uploaded or downloaded) with the remote storage at the same time. 
-max_concurrent_timelines_sync = 50 +max_concurrent_syncs = 50 # Max number of errors a single task can have before it's considered failed and not attempted to run anymore. max_sync_errors = 10 ``` - ## safekeeper TODO diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml new file mode 100644 index 0000000000..291f6e50ac --- /dev/null +++ b/libs/remote_storage/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "remote_storage" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = { version = "1.0", features = ["backtrace"] } +tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] } +tokio-util = { version = "0.7", features = ["io"] } +tracing = "0.1.27" +rusoto_core = "0.48" +rusoto_s3 = "0.48" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1" +async-trait = "0.1" + +workspace_hack = { version = "0.1", path = "../../workspace_hack" } + +[dev-dependencies] +tempfile = "3.2" diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs new file mode 100644 index 0000000000..9bbb855dd5 --- /dev/null +++ b/libs/remote_storage/src/lib.rs @@ -0,0 +1,232 @@ +//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage. +//! No other modules from this tree are supposed to be used directly by the external code. +//! +//! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations: +//! * [`local_fs`] allows to use local file system as an external storage +//! * [`s3_bucket`] uses AWS S3 bucket as an external storage +//! +mod local_fs; +mod s3_bucket; + +use std::{ + borrow::Cow, + collections::HashMap, + ffi::OsStr, + num::{NonZeroU32, NonZeroUsize}, + path::{Path, PathBuf}, +}; + +use anyhow::Context; +use tokio::io; +use tracing::info; + +pub use self::{ + local_fs::LocalFs, + s3_bucket::{S3Bucket, S3ObjectKey}, +}; + +/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage. +/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency +/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach. +/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed. +pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50; +pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; +/// Currently, sync happens with AWS S3, that has two limits on requests per second: +/// ~200 RPS for IAM services +/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html +/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests +/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ +pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; + +/// Storage (potentially remote) API to manage its state. +/// This storage tries to be unaware of any layered repository context, +/// providing basic CRUD operations for storage files. +#[async_trait::async_trait] +pub trait RemoteStorage: Send + Sync { + /// A way to uniquely reference a file in the remote storage. + type RemoteObjectId; + + /// Attempts to derive the storage path out of the local path, if the latter is correct. 
+ fn remote_object_id(&self, local_path: &Path) -> anyhow::Result; + + /// Gets the download path of the given storage file. + fn local_path(&self, remote_object_id: &Self::RemoteObjectId) -> anyhow::Result; + + /// Lists all items the storage has right now. + async fn list(&self) -> anyhow::Result>; + + /// Streams the local file contents into remote into the remote storage entry. + async fn upload( + &self, + from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + // S3 PUT request requires the content length to be specified, + // otherwise it starts to fail with the concurrent connection count increasing. + from_size_bytes: usize, + to: &Self::RemoteObjectId, + metadata: Option, + ) -> anyhow::Result<()>; + + /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. + /// Returns the metadata, if any was stored with the file previously. + async fn download( + &self, + from: &Self::RemoteObjectId, + to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), + ) -> anyhow::Result>; + + /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. + /// Returns the metadata, if any was stored with the file previously. + async fn download_byte_range( + &self, + from: &Self::RemoteObjectId, + start_inclusive: u64, + end_exclusive: Option, + to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), + ) -> anyhow::Result>; + + async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>; +} + +/// TODO kb +pub enum GenericRemoteStorage { + Local(LocalFs), + S3(S3Bucket), +} + +impl GenericRemoteStorage { + pub fn new( + working_directory: PathBuf, + storage_config: &RemoteStorageConfig, + ) -> anyhow::Result { + match &storage_config.storage { + RemoteStorageKind::LocalFs(root) => { + info!("Using fs root '{}' as a remote storage", root.display()); + LocalFs::new(root.clone(), working_directory).map(GenericRemoteStorage::Local) + } + RemoteStorageKind::AwsS3(s3_config) => { + info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", + s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); + S3Bucket::new(s3_config, working_directory).map(GenericRemoteStorage::S3) + } + } + } +} + +/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. +/// Immutable, cannot be changed once the file is created. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct StorageMetadata(HashMap); + +fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> { + if prefix == path { + anyhow::bail!( + "Prefix and the path are equal, cannot strip: '{}'", + prefix.display() + ) + } else { + path.strip_prefix(prefix).with_context(|| { + format!( + "Path '{}' is not prefixed with '{}'", + path.display(), + prefix.display(), + ) + }) + } +} + +/// External backup storage configuration, enough for creating a client for that storage. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct RemoteStorageConfig { + /// Max allowed number of concurrent sync operations between the API user and the remote storage. + pub max_concurrent_syncs: NonZeroUsize, + /// Max allowed errors before the sync task is considered failed and evicted. + pub max_sync_errors: NonZeroU32, + /// The storage connection configuration. + pub storage: RemoteStorageKind, +} + +/// A kind of a remote storage to connect to, with its connection configuration. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum RemoteStorageKind { + /// Storage based on local file system. + /// Specify a root folder to place all stored files into. + LocalFs(PathBuf), + /// AWS S3 based storage, storing all files in the S3 bucket + /// specified by the config + AwsS3(S3Config), +} + +/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). +#[derive(Clone, PartialEq, Eq)] +pub struct S3Config { + /// Name of the bucket to connect to. + pub bucket_name: String, + /// The region where the bucket is located at. + pub bucket_region: String, + /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once. + pub prefix_in_bucket: Option, + /// A base URL to send S3 requests to. + /// By default, the endpoint is derived from a region name, assuming it's + /// an AWS S3 region name, erroring on wrong region name. + /// Endpoint provides a way to support other S3 flavors and their regions. + /// + /// Example: `http://127.0.0.1:5000` + pub endpoint: Option, + /// AWS S3 has various limits on its API calls, we need not to exceed those. + /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. + pub concurrency_limit: NonZeroUsize, +} + +impl std::fmt::Debug for S3Config { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("S3Config") + .field("bucket_name", &self.bucket_name) + .field("bucket_region", &self.bucket_region) + .field("prefix_in_bucket", &self.prefix_in_bucket) + .field("concurrency_limit", &self.concurrency_limit) + .finish() + } +} + +pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { + let new_extension = match original_path + .as_ref() + .extension() + .map(OsStr::to_string_lossy) + { + Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), + None => Cow::Borrowed(suffix), + }; + original_path + .as_ref() + .with_extension(new_extension.as_ref()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_path_with_suffix_extension() { + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp").to_string_lossy(), + "/foo/bar.temp" + ); + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.baz.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar.baz..temp" + ); + } +} diff --git a/pageserver/src/remote_storage/local_fs.rs b/libs/remote_storage/src/local_fs.rs similarity index 81% rename from pageserver/src/remote_storage/local_fs.rs rename to libs/remote_storage/src/local_fs.rs index 6772a4fbd6..50243352ee 100644 --- a/pageserver/src/remote_storage/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -1,7 +1,7 @@ //! Local filesystem acting as a remote storage. -//! Multiple pageservers can use the same "storage" of this kind by using different storage roots. +//! Multiple API users can use the same "storage" of this kind by using different storage roots. //! -//! This storage used in pageserver tests, but can also be used in cases when a certain persistent +//! This storage used in tests, but can also be used in cases when a certain persistent //! volume is mounted to the local FS. 
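
To make the new `remote_storage` API concrete, here is a minimal sketch (not part of the patch) of how a caller such as the pageserver might build the now credential-less `S3Config` and hand it to `GenericRemoteStorage::new`. The `init_s3_storage` helper is hypothetical, and the bucket, region, and prefix values are placeholders lifted from the docs above; credentials are expected to come from `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY` or IAM, as described earlier.

```rust
use std::num::{NonZeroU32, NonZeroUsize};
use std::path::PathBuf;

use remote_storage::{
    GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config,
    DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
    DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
};

/// Hypothetical helper: builds an S3-backed remote storage client for the given workdir.
fn init_s3_storage(workdir: PathBuf) -> anyhow::Result<GenericRemoteStorage> {
    // Credentials are no longer part of S3Config: S3Bucket::new reads
    // AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY from the environment and
    // falls back to IAM instance metadata when both are absent.
    let config = RemoteStorageConfig {
        max_concurrent_syncs: NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS)
            .unwrap(),
        max_sync_errors: NonZeroU32::new(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: "some-sample-bucket".to_string(),
            bucket_region: "eu-north-1".to_string(),
            prefix_in_bucket: Some("/some/prefix/".to_string()),
            // None derives the endpoint from the region; set it for non-AWS S3 flavors.
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT)
                .unwrap(),
        }),
    };
    GenericRemoteStorage::new(workdir, &config)
}
```
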
use std::{ @@ -17,18 +17,18 @@ use tokio::{ }; use tracing::*; -use crate::remote_storage::storage_sync::path_with_suffix_extension; +use crate::path_with_suffix_extension; use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; pub struct LocalFs { - pageserver_workdir: &'static Path, - root: PathBuf, + working_directory: PathBuf, + storage_root: PathBuf, } impl LocalFs { /// Attempts to create local FS storage, along with its root directory. - pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result { + pub fn new(root: PathBuf, working_directory: PathBuf) -> anyhow::Result { if !root.exists() { std::fs::create_dir_all(&root).with_context(|| { format!( @@ -38,15 +38,15 @@ impl LocalFs { })?; } Ok(Self { - pageserver_workdir, - root, + working_directory, + storage_root: root, }) } fn resolve_in_storage(&self, path: &Path) -> anyhow::Result { if path.is_relative() { - Ok(self.root.join(path)) - } else if path.starts_with(&self.root) { + Ok(self.storage_root.join(path)) + } else if path.starts_with(&self.storage_root) { Ok(path.to_path_buf()) } else { bail!( @@ -85,30 +85,30 @@ impl LocalFs { #[async_trait::async_trait] impl RemoteStorage for LocalFs { - type StoragePath = PathBuf; + type RemoteObjectId = PathBuf; - fn storage_path(&self, local_path: &Path) -> anyhow::Result { - Ok(self.root.join( - strip_path_prefix(self.pageserver_workdir, local_path) + fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { + Ok(self.storage_root.join( + strip_path_prefix(&self.working_directory, local_path) .context("local path does not belong to this storage")?, )) } - fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result { - let relative_path = strip_path_prefix(&self.root, storage_path) + fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result { + let relative_path = strip_path_prefix(&self.storage_root, storage_path) .context("local path does not belong to this storage")?; - Ok(self.pageserver_workdir.join(relative_path)) + Ok(self.working_directory.join(relative_path)) } - async fn list(&self) -> anyhow::Result> { - get_all_files(&self.root).await + async fn list(&self) -> anyhow::Result> { + get_all_files(&self.storage_root).await } async fn upload( &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, from_size_bytes: usize, - to: &Self::StoragePath, + to: &Self::RemoteObjectId, metadata: Option, ) -> anyhow::Result<()> { let target_file_path = self.resolve_in_storage(to)?; @@ -194,7 +194,7 @@ impl RemoteStorage for LocalFs { async fn download( &self, - from: &Self::StoragePath, + from: &Self::RemoteObjectId, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), ) -> anyhow::Result> { let file_path = self.resolve_in_storage(from)?; @@ -229,9 +229,9 @@ impl RemoteStorage for LocalFs { } } - async fn download_range( + async fn download_byte_range( &self, - from: &Self::StoragePath, + from: &Self::RemoteObjectId, start_inclusive: u64, end_exclusive: Option, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), @@ -288,7 +288,7 @@ impl RemoteStorage for LocalFs { } } - async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> { + async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> { let file_path = self.resolve_in_storage(path)?; if file_path.exists() && file_path.is_file() { Ok(fs::remove_file(file_path).await?) 
@@ -354,29 +354,30 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> #[cfg(test)] mod pure_tests { - use crate::{ - layered_repository::metadata::METADATA_FILE_NAME, - repository::repo_harness::{RepoHarness, TIMELINE_ID}, - }; + use tempfile::tempdir; use super::*; #[test] fn storage_path_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("storage_path_positive")?; + let workdir = tempdir()?.path().to_owned(); + let storage_root = PathBuf::from("somewhere").join("else"); let storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root.clone(), + working_directory: workdir.clone(), + storage_root: storage_root.clone(), }; - let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("file_name"); - let expected_path = storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?); + let local_path = workdir + .join("timelines") + .join("some_timeline") + .join("file_name"); + let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?); assert_eq!( expected_path, - storage.storage_path(&local_path).expect("Matching path should map to storage path normally"), - "File paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir" + storage.remote_object_id(&local_path).expect("Matching path should map to storage path normally"), + "File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir" ); Ok(()) @@ -386,7 +387,7 @@ mod pure_tests { fn storage_path_negatives() -> anyhow::Result<()> { #[track_caller] fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String { - match storage.storage_path(mismatching_path) { + match storage.remote_object_id(mismatching_path) { Ok(wrong_path) => panic!( "Expected path '{}' to error, but got storage path: {:?}", mismatching_path.display(), @@ -396,16 +397,16 @@ mod pure_tests { } } - let repo_harness = RepoHarness::create("storage_path_negatives")?; + let workdir = tempdir()?.path().to_owned(); let storage_root = PathBuf::from("somewhere").join("else"); let storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root, + working_directory: workdir.clone(), + storage_root, }; - let error_string = storage_path_error(&storage, &repo_harness.conf.workdir); + let error_string = storage_path_error(&storage, &workdir); assert!(error_string.contains("does not belong to this storage")); - assert!(error_string.contains(repo_harness.conf.workdir.to_str().unwrap())); + assert!(error_string.contains(workdir.to_str().unwrap())); let mismatching_path_str = "/something/else"; let error_message = storage_path_error(&storage, Path::new(mismatching_path_str)); @@ -414,7 +415,7 @@ mod pure_tests { "Error should mention wrong path" ); assert!( - error_message.contains(repo_harness.conf.workdir.to_str().unwrap()), + error_message.contains(workdir.to_str().unwrap()), "Error should mention server workdir" ); assert!(error_message.contains("does not belong to this storage")); @@ -424,29 +425,28 @@ mod pure_tests { #[test] fn local_path_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("local_path_positive")?; + let workdir = tempdir()?.path().to_owned(); let storage_root = PathBuf::from("somewhere").join("else"); let storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root.clone(), + working_directory: workdir.clone(), + storage_root: storage_root.clone(), 
}; let name = "not a metadata"; - let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name); + let local_path = workdir.join("timelines").join("some_timeline").join(name); assert_eq!( local_path, storage - .local_path( - &storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?) - ) + .local_path(&storage_root.join(local_path.strip_prefix(&workdir)?)) .expect("For a valid input, valid local path should be parsed"), "Should be able to parse metadata out of the correctly named remote delta file" ); - let local_metadata_path = repo_harness - .timeline_path(&TIMELINE_ID) - .join(METADATA_FILE_NAME); - let remote_metadata_path = storage.storage_path(&local_metadata_path)?; + let local_metadata_path = workdir + .join("timelines") + .join("some_timeline") + .join("metadata"); + let remote_metadata_path = storage.remote_object_id(&local_metadata_path)?; assert_eq!( local_metadata_path, storage @@ -472,11 +472,10 @@ mod pure_tests { } } - let repo_harness = RepoHarness::create("local_path_negatives")?; let storage_root = PathBuf::from("somewhere").join("else"); let storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root, + working_directory: tempdir()?.path().to_owned(), + storage_root, }; let totally_wrong_path = "wrong_wrong_wrong"; @@ -488,16 +487,19 @@ mod pure_tests { #[test] fn download_destination_matches_original_path() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_destination_matches_original_path")?; - let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name"); + let workdir = tempdir()?.path().to_owned(); + let original_path = workdir + .join("timelines") + .join("some_timeline") + .join("some name"); let storage_root = PathBuf::from("somewhere").join("else"); let dummy_storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root, + working_directory: workdir, + storage_root, }; - let storage_path = dummy_storage.storage_path(&original_path)?; + let storage_path = dummy_storage.remote_object_id(&original_path)?; let download_destination = dummy_storage.local_path(&storage_path)?; assert_eq!( @@ -512,18 +514,17 @@ mod pure_tests { #[cfg(test)] mod fs_tests { use super::*; - use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID}; use std::{collections::HashMap, io::Write}; use tempfile::tempdir; #[tokio::test] async fn upload_file() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("upload_file")?; + let workdir = tempdir()?.path().to_owned(); let storage = create_storage()?; let (file, size) = create_file_for_upload( - &storage.pageserver_workdir.join("whatever"), + &storage.working_directory.join("whatever"), "whatever_contents", ) .await?; @@ -538,14 +539,14 @@ mod fs_tests { } assert!(storage.list().await?.is_empty()); - let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1", None).await?; + let target_path_1 = upload_dummy_file(&workdir, &storage, "upload_1", None).await?; assert_eq!( storage.list().await?, vec![target_path_1.clone()], "Should list a single file after first upload" ); - let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2", None).await?; + let target_path_2 = upload_dummy_file(&workdir, &storage, "upload_2", None).await?; assert_eq!( list_files_sorted(&storage).await?, vec![target_path_1.clone(), target_path_2.clone()], @@ -556,17 +557,16 @@ mod fs_tests { } fn create_storage() -> anyhow::Result { - let pageserver_workdir = 
Box::leak(Box::new(tempdir()?.path().to_owned())); - let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?; - Ok(storage) + LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned()) } #[tokio::test] async fn download_file() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_file")?; + let workdir = tempdir()?.path().to_owned(); + let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let metadata = storage.download(&upload_target, &mut content_bytes).await?; @@ -597,14 +597,15 @@ mod fs_tests { #[tokio::test] async fn download_file_range_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_file_range_positive")?; + let workdir = tempdir()?.path().to_owned(); + let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let metadata = storage - .download_range(&upload_target, 0, None, &mut full_range_bytes) + .download_byte_range(&upload_target, 0, None, &mut full_range_bytes) .await?; assert!( metadata.is_none(), @@ -620,7 +621,7 @@ mod fs_tests { let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let same_byte = 1_000_000_000; let metadata = storage - .download_range( + .download_byte_range( &upload_target, same_byte, Some(same_byte + 1), // exclusive end @@ -642,7 +643,7 @@ mod fs_tests { let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let metadata = storage - .download_range( + .download_byte_range( &upload_target, 0, Some(first_part_local.len() as u64), @@ -664,7 +665,7 @@ mod fs_tests { let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let metadata = storage - .download_range( + .download_byte_range( &upload_target, first_part_local.len() as u64, Some((first_part_local.len() + second_part_local.len()) as u64), @@ -689,16 +690,17 @@ mod fs_tests { #[tokio::test] async fn download_file_range_negative() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_file_range_negative")?; + let workdir = tempdir()?.path().to_owned(); + let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; let start = 10000; let end = 234; assert!(start > end, "Should test an incorrect range"); match storage - .download_range(&upload_target, start, Some(end), &mut io::sink()) + .download_byte_range(&upload_target, start, Some(end), &mut io::sink()) .await { Ok(_) => panic!("Should not allow downloading wrong ranges"), @@ -712,7 +714,7 @@ mod fs_tests { let non_existing_path = PathBuf::from("somewhere").join("else"); match storage - .download_range(&non_existing_path, 1, Some(3), &mut io::sink()) + .download_byte_range(&non_existing_path, 1, Some(3), &mut io::sink()) .await { Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"), @@ -727,10 +729,11 @@ mod fs_tests { 
#[tokio::test] async fn delete_file() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("delete_file")?; + let workdir = tempdir()?.path().to_owned(); + let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; storage.delete(&upload_target).await?; assert!(storage.list().await?.is_empty()); @@ -748,7 +751,8 @@ mod fs_tests { #[tokio::test] async fn file_with_metadata() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_file")?; + let workdir = tempdir()?.path().to_owned(); + let storage = create_storage()?; let upload_name = "upload_1"; let metadata = StorageMetadata(HashMap::from([ @@ -756,7 +760,7 @@ mod fs_tests { ("two".to_string(), "2".to_string()), ])); let upload_target = - upload_dummy_file(&repo_harness, &storage, upload_name, Some(metadata.clone())).await?; + upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?; let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?; @@ -780,7 +784,7 @@ mod fs_tests { let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); let partial_download_metadata = storage - .download_range( + .download_byte_range( &upload_target, 0, Some(first_part_local.len() as u64), @@ -805,16 +809,16 @@ mod fs_tests { } async fn upload_dummy_file( - harness: &RepoHarness<'_>, + workdir: &Path, storage: &LocalFs, name: &str, metadata: Option, ) -> anyhow::Result { - let timeline_path = harness.timeline_path(&TIMELINE_ID); - let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?; - let storage_path = storage.root.join(relative_timeline_path).join(name); + let timeline_path = workdir.join("timelines").join("some_timeline"); + let relative_timeline_path = timeline_path.strip_prefix(&workdir)?; + let storage_path = storage.storage_root.join(relative_timeline_path).join(name); - let from_path = storage.pageserver_workdir.join(name); + let from_path = storage.working_directory.join(name); let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?; storage.upload(file, size, &storage_path, metadata).await?; Ok(storage_path) diff --git a/pageserver/src/remote_storage/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs similarity index 74% rename from pageserver/src/remote_storage/s3_bucket.rs rename to libs/remote_storage/src/s3_bucket.rs index 73d828d150..01aaf7ca7e 100644 --- a/pageserver/src/remote_storage/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -1,7 +1,7 @@ //! AWS S3 storage wrapper around `rusoto` library. //! //! Respects `prefix_in_bucket` property from [`S3Config`], -//! allowing multiple pageservers to independently work with the same S3 bucket, if +//! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. 
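
As an orientation aid (not part of the patch), the sketch below exercises the `LocalFs` backend through the `RemoteStorage` trait much like the tests above: derive a remote object id from a path under the working directory, upload with an explicit byte count, and download back into an in-memory writer. The `local_fs_round_trip` helper, its paths and file contents, and the `anyhow` error type are assumptions made for illustration only.

```rust
use std::path::PathBuf;

use remote_storage::{LocalFs, RemoteStorage};
use tokio::io::{self, AsyncWriteExt};

/// Hypothetical round trip: upload a file from the working directory and read it back.
async fn local_fs_round_trip(workdir: PathBuf, storage_root: PathBuf) -> anyhow::Result<()> {
    let storage = LocalFs::new(storage_root, workdir.clone())?;

    // The remote id is derived from the path's position relative to the working directory.
    let local_path = workdir.join("sample_layer");
    tokio::fs::write(&local_path, b"layer bytes").await?;
    let remote_id = storage.remote_object_id(&local_path)?;

    // Upload takes any AsyncRead plus an explicit size (S3 PUT needs the content length).
    let source = tokio::fs::File::open(&local_path).await?;
    let size = source.metadata().await?.len() as usize;
    storage.upload(source, size, &remote_id, None).await?;

    // Download into an in-memory writer; the returned metadata is whatever upload attached.
    let mut target = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
    let metadata = storage.download(&remote_id, &mut target).await?;
    target.flush().await?;
    assert!(metadata.is_none(), "no metadata was attached on upload");
    assert_eq!(target.into_inner().into_inner(), b"layer bytes".to_vec());
    Ok(())
}
```
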
use std::path::{Path, PathBuf}; @@ -19,16 +19,13 @@ use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; use tracing::debug; -use crate::{ - config::S3Config, - remote_storage::{strip_path_prefix, RemoteStorage}, -}; +use crate::{strip_path_prefix, RemoteStorage, S3Config}; use super::StorageMetadata; -const S3_FILE_SEPARATOR: char = '/'; +const S3_PREFIX_SEPARATOR: char = '/'; -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)] pub struct S3ObjectKey(String); impl S3ObjectKey { @@ -36,11 +33,7 @@ impl S3ObjectKey { &self.0 } - fn download_destination( - &self, - pageserver_workdir: &Path, - prefix_to_strip: Option<&str>, - ) -> PathBuf { + fn download_destination(&self, workdir: &Path, prefix_to_strip: Option<&str>) -> PathBuf { let path_without_prefix = match prefix_to_strip { Some(prefix) => self.0.strip_prefix(prefix).unwrap_or_else(|| { panic!( @@ -51,9 +44,9 @@ impl S3ObjectKey { None => &self.0, }; - pageserver_workdir.join( + workdir.join( path_without_prefix - .split(S3_FILE_SEPARATOR) + .split(S3_PREFIX_SEPARATOR) .collect::(), ) } @@ -61,7 +54,7 @@ impl S3ObjectKey { /// AWS S3 storage. pub struct S3Bucket { - pageserver_workdir: &'static Path, + workdir: PathBuf, client: S3Client, bucket_name: String, prefix_in_bucket: Option, @@ -73,7 +66,7 @@ pub struct S3Bucket { impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. - pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result { + pub fn new(aws_config: &S3Config, workdir: PathBuf) -> anyhow::Result { debug!( "Creating s3 remote storage for S3 bucket {}", aws_config.bucket_name @@ -89,8 +82,11 @@ impl S3Bucket { .context("Failed to parse the s3 region from config")?, }; let request_dispatcher = HttpClient::new().context("Failed to create S3 http client")?; - let client = if aws_config.access_key_id.is_none() && aws_config.secret_access_key.is_none() - { + + let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok(); + let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok(); + + let client = if access_key_id.is_none() && secret_access_key.is_none() { debug!("Using IAM-based AWS access"); S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region) } else { @@ -98,8 +94,8 @@ impl S3Bucket { S3Client::new_with( request_dispatcher, StaticProvider::new_minimal( - aws_config.access_key_id.clone().unwrap_or_default(), - aws_config.secret_access_key.clone().unwrap_or_default(), + access_key_id.unwrap_or_default(), + secret_access_key.unwrap_or_default(), ), region, ) @@ -107,12 +103,12 @@ impl S3Bucket { let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { let mut prefix = prefix; - while prefix.starts_with(S3_FILE_SEPARATOR) { + while prefix.starts_with(S3_PREFIX_SEPARATOR) { prefix = &prefix[1..] 
} let mut prefix = prefix.to_string(); - while prefix.ends_with(S3_FILE_SEPARATOR) { + while prefix.ends_with(S3_PREFIX_SEPARATOR) { prefix.pop(); } prefix @@ -120,7 +116,7 @@ impl S3Bucket { Ok(Self { client, - pageserver_workdir, + workdir, bucket_name: aws_config.bucket_name.clone(), prefix_in_bucket, concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()), @@ -130,24 +126,23 @@ impl S3Bucket { #[async_trait::async_trait] impl RemoteStorage for S3Bucket { - type StoragePath = S3ObjectKey; + type RemoteObjectId = S3ObjectKey; - fn storage_path(&self, local_path: &Path) -> anyhow::Result { - let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?; + fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { + let relative_path = strip_path_prefix(&self.workdir, local_path)?; let mut key = self.prefix_in_bucket.clone().unwrap_or_default(); for segment in relative_path { - key.push(S3_FILE_SEPARATOR); + key.push(S3_PREFIX_SEPARATOR); key.push_str(&segment.to_string_lossy()); } Ok(S3ObjectKey(key)) } - fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result { - Ok(storage_path - .download_destination(self.pageserver_workdir, self.prefix_in_bucket.as_deref())) + fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result { + Ok(storage_path.download_destination(&self.workdir, self.prefix_in_bucket.as_deref())) } - async fn list(&self) -> anyhow::Result> { + async fn list(&self) -> anyhow::Result> { let mut document_keys = Vec::new(); let mut continuation_token = None; @@ -187,7 +182,7 @@ impl RemoteStorage for S3Bucket { &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, from_size_bytes: usize, - to: &Self::StoragePath, + to: &Self::RemoteObjectId, metadata: Option, ) -> anyhow::Result<()> { let _guard = self @@ -212,7 +207,7 @@ impl RemoteStorage for S3Bucket { async fn download( &self, - from: &Self::StoragePath, + from: &Self::RemoteObjectId, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), ) -> anyhow::Result> { let _guard = self @@ -237,9 +232,9 @@ impl RemoteStorage for S3Bucket { Ok(object_output.metadata.map(StorageMetadata)) } - async fn download_range( + async fn download_byte_range( &self, - from: &Self::StoragePath, + from: &Self::RemoteObjectId, start_inclusive: u64, end_exclusive: Option, to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), @@ -274,7 +269,7 @@ impl RemoteStorage for S3Bucket { Ok(object_output.metadata.map(StorageMetadata)) } - async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> { + async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> { let _guard = self .concurrency_limiter .acquire() @@ -293,34 +288,30 @@ impl RemoteStorage for S3Bucket { #[cfg(test)] mod tests { - use crate::{ - layered_repository::metadata::METADATA_FILE_NAME, - repository::repo_harness::{RepoHarness, TIMELINE_ID}, - }; + use tempfile::tempdir; use super::*; #[test] fn download_destination() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_destination")?; - - let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("test_name"); - let relative_path = local_path.strip_prefix(&repo_harness.conf.workdir)?; + let workdir = tempdir()?.path().to_owned(); + let local_path = workdir.join("one").join("two").join("test_name"); + let relative_path = local_path.strip_prefix(&workdir)?; let key = S3ObjectKey(format!( "{}{}", - S3_FILE_SEPARATOR, + S3_PREFIX_SEPARATOR, relative_path .iter() .map(|segment| segment.to_str().unwrap()) 
.collect::>() - .join(&S3_FILE_SEPARATOR.to_string()), + .join(&S3_PREFIX_SEPARATOR.to_string()), )); assert_eq!( local_path, - key.download_destination(&repo_harness.conf.workdir, None), - "Download destination should consist of s3 path joined with the pageserver workdir prefix" + key.download_destination(&workdir, None), + "Download destination should consist of s3 path joined with the workdir prefix" ); Ok(()) @@ -328,24 +319,21 @@ mod tests { #[test] fn storage_path_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("storage_path_positive")?; + let workdir = tempdir()?.path().to_owned(); let segment_1 = "matching"; let segment_2 = "file"; - let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2); + let local_path = &workdir.join(segment_1).join(segment_2); - let storage = dummy_storage(&repo_harness.conf.workdir); + let storage = dummy_storage(workdir); let expected_key = S3ObjectKey(format!( - "{}{SEPARATOR}{}{SEPARATOR}{}", + "{}{S3_PREFIX_SEPARATOR}{segment_1}{S3_PREFIX_SEPARATOR}{segment_2}", storage.prefix_in_bucket.as_deref().unwrap_or_default(), - segment_1, - segment_2, - SEPARATOR = S3_FILE_SEPARATOR, )); let actual_key = storage - .storage_path(local_path) + .remote_object_id(local_path) .expect("Matching path should map to S3 path normally"); assert_eq!( expected_key, @@ -360,7 +348,7 @@ mod tests { fn storage_path_negatives() -> anyhow::Result<()> { #[track_caller] fn storage_path_error(storage: &S3Bucket, mismatching_path: &Path) -> String { - match storage.storage_path(mismatching_path) { + match storage.remote_object_id(mismatching_path) { Ok(wrong_key) => panic!( "Expected path '{}' to error, but got S3 key: {:?}", mismatching_path.display(), @@ -370,10 +358,10 @@ mod tests { } } - let repo_harness = RepoHarness::create("storage_path_negatives")?; - let storage = dummy_storage(&repo_harness.conf.workdir); + let workdir = tempdir()?.path().to_owned(); + let storage = dummy_storage(workdir.clone()); - let error_message = storage_path_error(&storage, &repo_harness.conf.workdir); + let error_message = storage_path_error(&storage, &workdir); assert!( error_message.contains("Prefix and the path are equal"), "Message '{}' does not contain the required string", @@ -387,7 +375,7 @@ mod tests { "Error should mention wrong path" ); assert!( - error_message.contains(repo_harness.conf.workdir.to_str().unwrap()), + error_message.contains(workdir.to_str().unwrap()), "Error should mention server workdir" ); assert!( @@ -401,20 +389,17 @@ mod tests { #[test] fn local_path_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("local_path_positive")?; - let storage = dummy_storage(&repo_harness.conf.workdir); - let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID); - let relative_timeline_path = timeline_dir.strip_prefix(&repo_harness.conf.workdir)?; + let workdir = tempdir()?.path().to_owned(); + let storage = dummy_storage(workdir.clone()); + let timeline_dir = workdir.join("timelines").join("test_timeline"); + let relative_timeline_path = timeline_dir.strip_prefix(&workdir)?; let s3_key = create_s3_key( &relative_timeline_path.join("not a metadata"), storage.prefix_in_bucket.as_deref(), ); assert_eq!( - s3_key.download_destination( - &repo_harness.conf.workdir, - storage.prefix_in_bucket.as_deref() - ), + s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()), storage .local_path(&s3_key) .expect("For a valid input, valid S3 info should be parsed"), @@ -422,14 +407,11 @@ mod tests { ); let 
s3_key = create_s3_key( - &relative_timeline_path.join(METADATA_FILE_NAME), + &relative_timeline_path.join("metadata"), storage.prefix_in_bucket.as_deref(), ); assert_eq!( - s3_key.download_destination( - &repo_harness.conf.workdir, - storage.prefix_in_bucket.as_deref() - ), + s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()), storage .local_path(&s3_key) .expect("For a valid input, valid S3 info should be parsed"), @@ -441,12 +423,15 @@ mod tests { #[test] fn download_destination_matches_original_path() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_destination_matches_original_path")?; - let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name"); + let workdir = tempdir()?.path().to_owned(); + let original_path = workdir + .join("timelines") + .join("some_timeline") + .join("some name"); - let dummy_storage = dummy_storage(&repo_harness.conf.workdir); + let dummy_storage = dummy_storage(workdir); - let key = dummy_storage.storage_path(&original_path)?; + let key = dummy_storage.remote_object_id(&original_path)?; let download_destination = dummy_storage.local_path(&key)?; assert_eq!( @@ -457,9 +442,9 @@ mod tests { Ok(()) } - fn dummy_storage(pageserver_workdir: &'static Path) -> S3Bucket { + fn dummy_storage(workdir: PathBuf) -> S3Bucket { S3Bucket { - pageserver_workdir, + workdir, client: S3Client::new("us-east-1".parse().unwrap()), bucket_name: "dummy-bucket".to_string(), prefix_in_bucket: Some("dummy_prefix/".to_string()), @@ -471,7 +456,7 @@ mod tests { S3ObjectKey(relative_file_path.iter().fold( prefix.unwrap_or_default().to_string(), |mut path_string, segment| { - path_string.push(S3_FILE_SEPARATOR); + path_string.push(S3_PREFIX_SEPARATOR); path_string.push_str(segment.to_str().unwrap()); path_string }, diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 23c16dd5be..d4cceafc61 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [features] # It is simpler infra-wise to have failpoints enabled by default -# It shouldnt affect perf in any way because failpoints +# It shouldn't affect perf in any way because failpoints # are not placed in hot code paths default = ["failpoints"] profiling = ["pprof"] @@ -25,7 +25,6 @@ lazy_static = "1.4.0" clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } -tokio-util = { version = "0.7", features = ["io"] } postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } @@ -54,13 +53,10 @@ once_cell = "1.8.0" crossbeam-utils = "0.8.5" fail = "0.5.0" -rusoto_core = "0.47" -rusoto_s3 = "0.47" -async-trait = "0.1" - postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } utils = { path = "../libs/utils" } +remote_storage = { path = "../libs/remote_storage" } workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] diff --git a/pageserver/README.md b/pageserver/README.md index 1fd627785c..cf841d1e46 100644 --- a/pageserver/README.md +++ b/pageserver/README.md @@ -135,7 +135,7 @@ The backup service is disabled by default and can be enabled to interact with a CLI examples: * Local FS: 
`${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"` -* AWS S3 : `${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/',access_key_id='SOMEKEYAAAAASADSAH*#',secret_access_key='SOMEsEcReTsd292v'}"` +* AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"` For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS. For local S3 installations, refer to the their documentation for name format and credentials. @@ -155,11 +155,9 @@ or bucket_name = 'some-sample-bucket' bucket_region = 'eu-north-1' prefix_in_bucket = '/test_prefix/' -access_key_id = 'SOMEKEYAAAAASADSAH*#' -secret_access_key = 'SOMEsEcReTsd292v' ``` -Also, `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` variables can be used to specify the credentials instead of any of the ways above. +`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed. TODO: Sharding -------------------- diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 14ca976448..5257732c5c 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,6 +5,7 @@ //! See also `settings.md` for better description on every parameter. use anyhow::{anyhow, bail, ensure, Context, Result}; +use remote_storage::{RemoteStorageConfig, RemoteStorageKind, S3Config}; use std::env; use std::num::{NonZeroU32, NonZeroUsize}; use std::path::{Path, PathBuf}; @@ -33,18 +34,6 @@ pub mod defaults { pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "zenith_admin"; - /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage. - /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency - /// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach. - /// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed. - pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC: usize = 50; - pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; - /// Currently, sync happens with AWS S3, that has two limits on requests per second: - /// ~200 RPS for IAM services - /// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html - /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests - /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ - pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; @@ -315,67 +304,6 @@ impl PageServerConfigBuilder { } } -/// External backup storage configuration, enough for creating a client for that storage. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct RemoteStorageConfig { - /// Max allowed number of concurrent sync operations between pageserver and the remote storage. 
- pub max_concurrent_timelines_sync: NonZeroUsize, - /// Max allowed errors before the sync task is considered failed and evicted. - pub max_sync_errors: NonZeroU32, - /// The storage connection configuration. - pub storage: RemoteStorageKind, -} - -/// A kind of a remote storage to connect to, with its connection configuration. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum RemoteStorageKind { - /// Storage based on local file system. - /// Specify a root folder to place all stored files into. - LocalFs(PathBuf), - /// AWS S3 based storage, storing all files in the S3 bucket - /// specified by the config - AwsS3(S3Config), -} - -/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). -#[derive(Clone, PartialEq, Eq)] -pub struct S3Config { - /// Name of the bucket to connect to. - pub bucket_name: String, - /// The region where the bucket is located at. - pub bucket_region: String, - /// A "subfolder" in the bucket, to use the same bucket separately by multiple pageservers at once. - pub prefix_in_bucket: Option, - /// "Login" to use when connecting to bucket. - /// Can be empty for cases like AWS k8s IAM - /// where we can allow certain pods to connect - /// to the bucket directly without any credentials. - pub access_key_id: Option, - /// "Password" to use when connecting to bucket. - pub secret_access_key: Option, - /// A base URL to send S3 requests to. - /// By default, the endpoint is derived from a region name, assuming it's - /// an AWS S3 region name, erroring on wrong region name. - /// Endpoint provides a way to support other S3 flavors and their regions. - /// - /// Example: `http://127.0.0.1:5000` - pub endpoint: Option, - /// AWS S3 has various limits on its API calls, we need not to exceed those. - /// See [`defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. - pub concurrency_limit: NonZeroUsize, -} - -impl std::fmt::Debug for S3Config { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("S3Config") - .field("bucket_name", &self.bucket_name) - .field("bucket_region", &self.bucket_region) - .field("prefix_in_bucket", &self.prefix_in_bucket) - .field("concurrency_limit", &self.concurrency_limit) - .finish() - } -} - impl PageServerConf { // // Repository paths, relative to workdir. @@ -523,21 +451,21 @@ impl PageServerConf { let bucket_name = toml.get("bucket_name"); let bucket_region = toml.get("bucket_region"); - let max_concurrent_timelines_sync = NonZeroUsize::new( - parse_optional_integer("max_concurrent_timelines_sync", toml)? - .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC), + let max_concurrent_syncs = NonZeroUsize::new( + parse_optional_integer("max_concurrent_syncs", toml)? + .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS), ) - .context("Failed to parse 'max_concurrent_timelines_sync' as a positive integer")?; + .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?; let max_sync_errors = NonZeroU32::new( parse_optional_integer("max_sync_errors", toml)? - .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), + .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), ) .context("Failed to parse 'max_sync_errors' as a positive integer")?; let concurrency_limit = NonZeroUsize::new( parse_optional_integer("concurrency_limit", toml)? 
- .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), + .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), ) .context("Failed to parse 'concurrency_limit' as a positive integer")?; @@ -552,16 +480,6 @@ impl PageServerConf { (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config { bucket_name: parse_toml_string("bucket_name", bucket_name)?, bucket_region: parse_toml_string("bucket_region", bucket_region)?, - access_key_id: toml - .get("access_key_id") - .map(|access_key_id| parse_toml_string("access_key_id", access_key_id)) - .transpose()?, - secret_access_key: toml - .get("secret_access_key") - .map(|secret_access_key| { - parse_toml_string("secret_access_key", secret_access_key) - }) - .transpose()?, prefix_in_bucket: toml .get("prefix_in_bucket") .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket)) @@ -579,7 +497,7 @@ impl PageServerConf { }; Ok(RemoteStorageConfig { - max_concurrent_timelines_sync, + max_concurrent_syncs, max_sync_errors, storage, }) @@ -807,11 +725,11 @@ pg_distrib_dir='{}' assert_eq!( parsed_remote_storage_config, RemoteStorageConfig { - max_concurrent_timelines_sync: NonZeroUsize::new( - defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC + max_concurrent_syncs: NonZeroUsize::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS ) .unwrap(), - max_sync_errors: NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS) + max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS) .unwrap(), storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), }, @@ -829,29 +747,25 @@ pg_distrib_dir='{}' let bucket_name = "some-sample-bucket".to_string(); let bucket_region = "eu-north-1".to_string(); let prefix_in_bucket = "test_prefix".to_string(); - let access_key_id = "SOMEKEYAAAAASADSAH*#".to_string(); - let secret_access_key = "SOMEsEcReTsd292v".to_string(); let endpoint = "http://localhost:5000".to_string(); - let max_concurrent_timelines_sync = NonZeroUsize::new(111).unwrap(); + let max_concurrent_syncs = NonZeroUsize::new(111).unwrap(); let max_sync_errors = NonZeroU32::new(222).unwrap(); let s3_concurrency_limit = NonZeroUsize::new(333).unwrap(); let identical_toml_declarations = &[ format!( r#"[remote_storage] -max_concurrent_timelines_sync = {max_concurrent_timelines_sync} +max_concurrent_syncs = {max_concurrent_syncs} max_sync_errors = {max_sync_errors} bucket_name = '{bucket_name}' bucket_region = '{bucket_region}' prefix_in_bucket = '{prefix_in_bucket}' -access_key_id = '{access_key_id}' -secret_access_key = '{secret_access_key}' endpoint = '{endpoint}' concurrency_limit = {s3_concurrency_limit}"# ), format!( - "remote_storage={{max_concurrent_timelines_sync={max_concurrent_timelines_sync}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\ - bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', access_key_id='{access_key_id}', secret_access_key='{secret_access_key}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}", + "remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\ + bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}", ), ]; @@ -874,13 +788,11 @@ pg_distrib_dir='{}' assert_eq!( parsed_remote_storage_config, RemoteStorageConfig { - max_concurrent_timelines_sync, + max_concurrent_syncs, 
max_sync_errors, storage: RemoteStorageKind::AwsS3(S3Config { bucket_name: bucket_name.clone(), bucket_region: bucket_region.clone(), - access_key_id: Some(access_key_id.clone()), - secret_access_key: Some(secret_access_key.clone()), prefix_in_bucket: Some(prefix_in_bucket.clone()), endpoint: Some(endpoint.clone()), concurrency_limit: s3_concurrency_limit, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f12e4c4051..8940efbda0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3,17 +3,16 @@ use std::sync::Arc; use anyhow::{Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; +use remote_storage::GenericRemoteStorage; use tracing::*; use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest, }; -use crate::config::RemoteStorageKind; -use crate::remote_storage::{ - download_index_part, schedule_layer_download, LocalFs, RemoteIndex, RemoteTimeline, S3Bucket, -}; use crate::repository::Repository; +use crate::storage_sync; +use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant_config::TenantConfOpt; use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; use crate::{config::PageServerConf, tenant_mgr, timelines}; @@ -37,11 +36,6 @@ struct State { remote_storage: Option, } -enum GenericRemoteStorage { - Local(LocalFs), - S3(S3Bucket), -} - impl State { fn new( conf: &'static PageServerConf, @@ -57,14 +51,7 @@ impl State { let remote_storage = conf .remote_storage_config .as_ref() - .map(|storage_config| match &storage_config.storage { - RemoteStorageKind::LocalFs(root) => { - LocalFs::new(root.clone(), &conf.workdir).map(GenericRemoteStorage::Local) - } - RemoteStorageKind::AwsS3(s3_config) => { - S3Bucket::new(s3_config, &conf.workdir).map(GenericRemoteStorage::S3) - } - }) + .map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config)) .transpose() .context("Failed to init generic remote storage")?; @@ -273,7 +260,7 @@ async fn timeline_attach_handler(request: Request) -> Result) -> Result index_accessor.add_timeline_entry(sync_id, new_timeline), } - schedule_layer_download(tenant_id, timeline_id); + storage_sync::schedule_layer_download(tenant_id, timeline_id); json_response(StatusCode::ACCEPTED, ()) } @@ -319,10 +306,10 @@ async fn try_download_shard_data( ) -> anyhow::Result> { let shard = match state.remote_storage.as_ref() { Some(GenericRemoteStorage::Local(local_storage)) => { - download_index_part(state.conf, local_storage, sync_id).await + storage_sync::download_index_part(state.conf, local_storage, sync_id).await } Some(GenericRemoteStorage::S3(s3_storage)) => { - download_index_part(state.conf, s3_storage, sync_id).await + storage_sync::download_index_part(state.conf, s3_storage, sync_id).await } None => return Ok(None), } diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 77c01a7c66..da2699b15d 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -34,10 +34,9 @@ use std::time::{Duration, Instant, SystemTime}; use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; use crate::config::PageServerConf; use crate::keyspace::KeySpace; +use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::page_cache; -use crate::remote_storage::{self, RemoteIndex}; use crate::repository::{ GcResult, Repository, 
RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, TimelineWriter, }; @@ -48,6 +47,7 @@ use crate::virtual_file::VirtualFile; use crate::walreceiver::IS_WAL_RECEIVER; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; +use crate::{page_cache, storage_sync}; use metrics::{ register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, @@ -1785,7 +1785,7 @@ impl LayeredTimeline { PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); if self.upload_layers.load(atomic::Ordering::Relaxed) { - remote_storage::schedule_layer_upload( + storage_sync::schedule_layer_upload( self.tenantid, self.timelineid, HashSet::from([new_delta_path]), @@ -1857,7 +1857,7 @@ impl LayeredTimeline { } } if self.upload_layers.load(atomic::Ordering::Relaxed) { - remote_storage::schedule_layer_upload( + storage_sync::schedule_layer_upload( self.tenantid, self.timelineid, layer_paths_to_upload, @@ -2056,13 +2056,13 @@ impl LayeredTimeline { drop(layers); if self.upload_layers.load(atomic::Ordering::Relaxed) { - remote_storage::schedule_layer_upload( + storage_sync::schedule_layer_upload( self.tenantid, self.timelineid, new_layer_paths, None, ); - remote_storage::schedule_layer_delete( + storage_sync::schedule_layer_delete( self.tenantid, self.timelineid, layer_paths_do_delete, @@ -2253,7 +2253,7 @@ impl LayeredTimeline { } if self.upload_layers.load(atomic::Ordering::Relaxed) { - remote_storage::schedule_layer_delete( + storage_sync::schedule_layer_delete( self.tenantid, self.timelineid, layer_paths_to_delete, diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 0b1c53172c..83985069ec 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -9,8 +9,8 @@ pub mod page_service; pub mod pgdatadir_mapping; pub mod profiling; pub mod reltag; -pub mod remote_storage; pub mod repository; +pub mod storage_sync; pub mod tenant_config; pub mod tenant_mgr; pub mod tenant_threads; diff --git a/pageserver/src/remote_storage.rs b/pageserver/src/remote_storage.rs deleted file mode 100644 index 4db0f6667d..0000000000 --- a/pageserver/src/remote_storage.rs +++ /dev/null @@ -1,412 +0,0 @@ -//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage. -//! This particular module serves as a public API border between pageserver and the internal storage machinery. -//! No other modules from this tree are supposed to be used directly by the external code. -//! -//! There are a few components the storage machinery consists of: -//! * [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations: -//! * [`local_fs`] allows to use local file system as an external storage -//! * [`s3_bucket`] uses AWS S3 bucket as an external storage -//! -//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync. -//! Synchronization internals are split into submodules -//! * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files -//! * [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively -//! -//! * public API via to interact with the external world: -//! * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization -//! 
* [`schedule_layer_upload`], [`schedule_layer_download`] and [`schedule_layer_delete`] to enqueue a new upload and download tasks, -//! to be processed by the async loop -//! -//! Here's a schematic overview of all interactions backup and the rest of the pageserver perform: -//! -//! +------------------------+ +--------->-------+ -//! | | - - - (init async loop) - - - -> | | -//! | | | | -//! | | -------------------------------> | async | -//! | pageserver | (enqueue timeline sync task) | upload/download | -//! | | | loop | -//! | | <------------------------------- | | -//! | | (apply new timeline sync states) | | -//! +------------------------+ +---------<-------+ -//! | -//! | -//! CRUD layer file operations | -//! (upload/download/delete/list, etc.) | -//! V -//! +------------------------+ -//! | | -//! | [`RemoteStorage`] impl | -//! | | -//! | pageserver assumes it | -//! | owns exclusive write | -//! | access to this storage | -//! +------------------------+ -//! -//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so. -//! The loop inits the storage connection and checks the remote files stored. -//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server). -//! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can -//! query their downloads later if they are accessed. -//! -//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata. -//! If the storage sync loop was successfully started before, pageserver schedules the new checkpoint file uploads after every checkpoint. -//! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either). -//! See [`crate::layered_repository`] for the upload calls and the adjacent logic. -//! -//! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`], -//! submitted via [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state. -//! Such submissions happen in two cases: -//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future -//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory -//! -//! When the pageserver terminates, the sync loop finishes a current sync task (if any) and exits. -//! -//! The storage logic considers `image` as a set of local files (layers), fully representing a certain timeline at given moment (identified with `disk_consistent_lsn` from the corresponding `metadata` file). -//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed -//! by the storage upload, if enabled. -//! Yet timeline cannot alter already existing files, and cannot remove those too: only a GC process is capable of removing unused files. -//! This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable": -//! 
* when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state -//! * no files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten -//! when the newer image is downloaded -//! -//! Pageserver maintains similar to the local file structure remotely: all layer files are uploaded with the same names under the same directory structure. -//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexPart`], containing the list of remote files. -//! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download. -//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`], -//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrive its part contents, if needed, same as any layer files. -//! -//! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed. -//! Bulk index data download happens only initially, on pageserer startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only, -//! when a new timeline is scheduled for the download. -//! -//! NOTES: -//! * pageserver assumes it has exclusive write access to the remote storage. If supported, the way multiple pageservers can be separated in the same storage -//! (i.e. using different directories in the local filesystem external storage), but totally up to the storage implementation and not covered with the trait API. -//! -//! * the sync tasks may not processed immediately after the submission: if they error and get re-enqueued, their execution might be backed off to ensure error cap is not exceeded too fast. -//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time. - -mod local_fs; -mod s3_bucket; -mod storage_sync; - -use std::{ - collections::{HashMap, HashSet}, - ffi, fs, - path::{Path, PathBuf}, -}; - -use anyhow::{bail, Context}; -use tokio::io; -use tracing::{debug, error, info}; - -use self::storage_sync::TEMP_DOWNLOAD_EXTENSION; -pub use self::{ - local_fs::LocalFs, - s3_bucket::S3Bucket, - storage_sync::{ - download_index_part, - index::{IndexPart, RemoteIndex, RemoteTimeline}, - schedule_layer_delete, schedule_layer_download, schedule_layer_upload, - }, -}; -use crate::{ - config::{PageServerConf, RemoteStorageKind}, - layered_repository::{ - ephemeral_file::is_ephemeral_file, - metadata::{TimelineMetadata, METADATA_FILE_NAME}, - }, -}; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; - -/// A timeline status to share with pageserver's sync counterpart, -/// after comparing local and remote timeline state. -#[derive(Clone, Copy, Debug)] -pub enum LocalTimelineInitStatus { - /// The timeline has every remote layer present locally. - /// There could be some layers requiring uploading, - /// but this does not block the timeline from any user interaction. - LocallyComplete, - /// A timeline has some files remotely, that are not present locally and need downloading. 
- /// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only, - /// so the data needs to be downloaded first before the timeline can be used. - NeedsSync, -} - -type LocalTimelineInitStatuses = HashMap>; - -/// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization. -/// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still, -/// to simplify the received code. -pub struct SyncStartupData { - pub remote_index: RemoteIndex, - pub local_timeline_init_statuses: LocalTimelineInitStatuses, -} - -/// Based on the config, initiates the remote storage connection and starts a separate thread -/// that ensures that pageserver and the remote storage are in sync with each other. -/// If no external configuration connection given, no thread or storage initialization is done. -/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states. -pub fn start_local_timeline_sync( - config: &'static PageServerConf, -) -> anyhow::Result { - let local_timeline_files = local_tenant_timeline_files(config) - .context("Failed to collect local tenant timeline files")?; - - match &config.remote_storage_config { - Some(storage_config) => match &storage_config.storage { - RemoteStorageKind::LocalFs(root) => { - info!("Using fs root '{}' as a remote storage", root.display()); - storage_sync::spawn_storage_sync_thread( - config, - local_timeline_files, - LocalFs::new(root.clone(), &config.workdir)?, - storage_config.max_concurrent_timelines_sync, - storage_config.max_sync_errors, - ) - }, - RemoteStorageKind::AwsS3(s3_config) => { - info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", - s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - storage_sync::spawn_storage_sync_thread( - config, - local_timeline_files, - S3Bucket::new(s3_config, &config.workdir)?, - storage_config.max_concurrent_timelines_sync, - storage_config.max_sync_errors, - ) - }, - } - .context("Failed to spawn the storage sync thread"), - None => { - info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); - let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); - for (ZTenantTimelineId { tenant_id, timeline_id }, _) in - local_timeline_files - { - local_timeline_init_statuses - .entry(tenant_id) - .or_default() - .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete); - } - Ok(SyncStartupData { - local_timeline_init_statuses, - remote_index: RemoteIndex::empty(), - }) - } - } -} - -fn local_tenant_timeline_files( - config: &'static PageServerConf, -) -> anyhow::Result)>> { - let mut local_tenant_timeline_files = HashMap::new(); - let tenants_dir = config.tenants_path(); - for tenants_dir_entry in fs::read_dir(&tenants_dir) - .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? 
- { - match &tenants_dir_entry { - Ok(tenants_dir_entry) => { - match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) { - Ok(collected_files) => { - local_tenant_timeline_files.extend(collected_files.into_iter()) - } - Err(e) => error!( - "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}", - tenants_dir.display(), - tenants_dir_entry, - e - ), - } - } - Err(e) => error!( - "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}", - tenants_dir_entry, - tenants_dir.display(), - e - ), - } - } - - Ok(local_tenant_timeline_files) -} - -fn collect_timelines_for_tenant( - config: &'static PageServerConf, - tenant_path: &Path, -) -> anyhow::Result)>> { - let mut timelines = HashMap::new(); - let tenant_id = tenant_path - .file_name() - .and_then(ffi::OsStr::to_str) - .unwrap_or_default() - .parse::() - .context("Could not parse tenant id out of the tenant dir name")?; - let timelines_dir = config.timelines_path(&tenant_id); - - for timelines_dir_entry in fs::read_dir(&timelines_dir).with_context(|| { - format!( - "Failed to list timelines dir entry for tenant {}", - tenant_id - ) - })? { - match timelines_dir_entry { - Ok(timelines_dir_entry) => { - let timeline_path = timelines_dir_entry.path(); - match collect_timeline_files(&timeline_path) { - Ok((timeline_id, metadata, timeline_files)) => { - timelines.insert( - ZTenantTimelineId { - tenant_id, - timeline_id, - }, - (metadata, timeline_files), - ); - } - Err(e) => error!( - "Failed to process timeline dir contents at '{}', reason: {:?}", - timeline_path.display(), - e - ), - } - } - Err(e) => error!( - "Failed to list timelines for entry tenant {}, reason: {:?}", - tenant_id, e - ), - } - } - - Ok(timelines) -} - -// discover timeline files and extract timeline metadata -// NOTE: ephemeral files are excluded from the list -fn collect_timeline_files( - timeline_dir: &Path, -) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet)> { - let mut timeline_files = HashSet::new(); - let mut timeline_metadata_path = None; - - let timeline_id = timeline_dir - .file_name() - .and_then(ffi::OsStr::to_str) - .unwrap_or_default() - .parse::() - .context("Could not parse timeline id out of the timeline dir name")?; - let timeline_dir_entries = - fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; - for entry in timeline_dir_entries { - let entry_path = entry.context("Failed to list timeline dir entry")?.path(); - if entry_path.is_file() { - if entry_path.file_name().and_then(ffi::OsStr::to_str) == Some(METADATA_FILE_NAME) { - timeline_metadata_path = Some(entry_path); - } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) { - debug!("skipping ephemeral file {}", entry_path.display()); - continue; - } else if entry_path.extension().and_then(ffi::OsStr::to_str) - == Some(TEMP_DOWNLOAD_EXTENSION) - { - info!("removing temp download file at {}", entry_path.display()); - fs::remove_file(&entry_path).with_context(|| { - format!( - "failed to remove temp download file at {}", - entry_path.display() - ) - })?; - } else { - timeline_files.insert(entry_path); - } - } - } - - // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed - // then attach is lost. There would be no retries for that, - // initial collect will fail because there is no metadata. 
- // We either need to start download if we see empty dir after restart or attach caller should - // be aware of that and retry attach if awaits_download for timeline switched from true to false - // but timelinne didnt appear locally. - // Check what happens with remote index in that case. - let timeline_metadata_path = match timeline_metadata_path { - Some(path) => path, - None => bail!("No metadata file found in the timeline directory"), - }; - let metadata = TimelineMetadata::from_bytes( - &fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, - ) - .context("Failed to parse timeline metadata file bytes")?; - - Ok((timeline_id, metadata, timeline_files)) -} - -/// Storage (potentially remote) API to manage its state. -/// This storage tries to be unaware of any layered repository context, -/// providing basic CRUD operations for storage files. -#[async_trait::async_trait] -pub trait RemoteStorage: Send + Sync { - /// A way to uniquely reference a file in the remote storage. - type StoragePath; - - /// Attempts to derive the storage path out of the local path, if the latter is correct. - fn storage_path(&self, local_path: &Path) -> anyhow::Result; - - /// Gets the download path of the given storage file. - fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result; - - /// Lists all items the storage has right now. - async fn list(&self) -> anyhow::Result>; - - /// Streams the local file contents into remote into the remote storage entry. - async fn upload( - &self, - from: impl io::AsyncRead + Unpin + Send + Sync + 'static, - // S3 PUT request requires the content length to be specified, - // otherwise it starts to fail with the concurrent connection count increasing. - from_size_bytes: usize, - to: &Self::StoragePath, - metadata: Option, - ) -> anyhow::Result<()>; - - /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. - /// Returns the metadata, if any was stored with the file previously. - async fn download( - &self, - from: &Self::StoragePath, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result>; - - /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. - /// Returns the metadata, if any was stored with the file previously. - async fn download_range( - &self, - from: &Self::StoragePath, - start_inclusive: u64, - end_exclusive: Option, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result>; - - async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>; -} - -/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. -/// Immutable, cannot be changed once the file is created. 
-#[derive(Debug, Clone, PartialEq, Eq)] -pub struct StorageMetadata(HashMap); - -fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> { - if prefix == path { - anyhow::bail!( - "Prefix and the path are equal, cannot strip: '{}'", - prefix.display() - ) - } else { - path.strip_prefix(prefix).with_context(|| { - format!( - "Path '{}' is not prefixed with '{}'", - path.display(), - prefix.display(), - ) - }) - } -} diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 5044f2bfc5..d25dc8914d 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,5 +1,5 @@ use crate::layered_repository::metadata::TimelineMetadata; -use crate::remote_storage::RemoteIndex; +use crate::storage_sync::index::RemoteIndex; use crate::walrecord::ZenithWalRecord; use crate::CheckpointConfig; use anyhow::{bail, Result}; diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/storage_sync.rs similarity index 77% rename from pageserver/src/remote_storage/storage_sync.rs rename to pageserver/src/storage_sync.rs index 8a26685a7d..bcc18e8ce4 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -1,3 +1,87 @@ +//! There are a few components the storage machinery consists of: +//! +//! * [`RemoteStorage`] that is used to interact with an arbitrary external storage +//! +//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync. +//! Synchronization internals are split into submodules +//! * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files +//! * [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively +//! +//! * public API via to interact with the external world: +//! * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization +//! * [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] to enqueue a new upload and download tasks, +//! to be processed by the async loop +//! +//! Here's a schematic overview of all interactions backup and the rest of the pageserver perform: +//! +//! +------------------------+ +--------->-------+ +//! | | - - - (init async loop) - - - -> | | +//! | | | | +//! | | -------------------------------> | async | +//! | pageserver | (enqueue timeline sync task) | upload/download | +//! | | | loop | +//! | | <------------------------------- | | +//! | | (apply new timeline sync states) | | +//! +------------------------+ +---------<-------+ +//! | +//! | +//! CRUD layer file operations | +//! (upload/download/delete/list, etc.) | +//! V +//! +------------------------+ +//! | | +//! | [`RemoteStorage`] impl | +//! | | +//! | pageserver assumes it | +//! | owns exclusive write | +//! | access to this storage | +//! +------------------------+ +//! +//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so. +//! The loop inits the storage connection and checks the remote files stored. +//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server). +//! 
Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote-only timelines to pageserver, so it can
+//! query their downloads later if they are accessed.
+//!
+//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
+//! If the storage sync loop was successfully started before, pageserver schedules the new checkpoint file uploads after every checkpoint.
+//! The checkpoint uploads are disabled if no remote storage configuration is provided (no sync loop is started this way either).
+//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
+//!
+//! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`],
+//! submitted via the [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state.
+//! Such submissions happen in two cases:
+//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future
+//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory
+//!
+//! When the pageserver terminates, the sync loop finishes a current sync task (if any) and exits.
+//!
+//! The storage logic considers `image` as a set of local files (layers), fully representing a certain timeline at a given moment (identified with `disk_consistent_lsn` from the corresponding `metadata` file).
+//! A timeline can change its state by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
+//! by the storage upload, if enabled.
+//! Yet a timeline cannot alter already existing files, and cannot remove them either: only a GC process is capable of removing unused files.
+//! This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable":
+//! * when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state
+//! * no files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten
+//! when the newer image is downloaded
+//!
+//! Pageserver maintains a file structure remotely that mirrors the local one: all layer files are uploaded with the same names under the same directory structure.
+//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexShard`], containing the list of remote files.
+//! This file gets read to populate the cache, if the remote timeline data is missing from it, and gets updated after every successful download.
+//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expensive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`],
+//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrieve its shard contents, if needed, same as any layer files.
+//!
+//! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed.
+//! Bulk index data download happens only initially, on pageserver startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only,
+//!
when a new timeline is scheduled for the download. +//! +//! NOTES: +//! * pageserver assumes it has exclusive write access to the remote storage. If supported, the way multiple pageservers can be separated in the same storage +//! (i.e. using different directories in the local filesystem external storage), but totally up to the storage implementation and not covered with the trait API. +//! +//! * the sync tasks may not processed immediately after the submission: if they error and get re-enqueued, their execution might be backed off to ensure error cap is not exceeded too fast. +//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time. +//! //! A synchronization logic for the [`RemoteStorage`] and pageserver in-memory state to ensure correct synchronizations //! between local tenant files and their counterparts from the remote storage. //! @@ -62,7 +146,6 @@ pub mod index; mod upload; use std::{ - borrow::Cow, collections::{HashMap, HashSet, VecDeque}, ffi::OsStr, fmt::Debug, @@ -75,6 +158,7 @@ use std::{ use anyhow::{bail, Context}; use futures::stream::{FuturesUnordered, StreamExt}; use lazy_static::lazy_static; +use remote_storage::{GenericRemoteStorage, RemoteStorage}; use tokio::{ fs, runtime::Runtime, @@ -85,17 +169,18 @@ use tracing::*; use self::{ download::{download_timeline_layers, DownloadedTimeline}, - index::{IndexPart, RemoteIndex, RemoteTimeline, RemoteTimelineIndex}, + index::{IndexPart, RemoteTimeline, RemoteTimelineIndex}, upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, }; -use super::{LocalTimelineInitStatus, LocalTimelineInitStatuses, RemoteStorage, SyncStartupData}; use crate::{ config::PageServerConf, layered_repository::{ - metadata::{metadata_path, TimelineMetadata}, + ephemeral_file::is_ephemeral_file, + metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, LayeredRepository, }, repository::TimelineSyncStatusUpdate, + storage_sync::{self, index::RemoteIndex}, tenant_mgr::apply_timeline_sync_status_updates, thread_mgr, thread_mgr::ThreadKind, @@ -134,6 +219,232 @@ lazy_static! { .expect("failed to register pageserver image sync time histogram vec"); } +/// A timeline status to share with pageserver's sync counterpart, +/// after comparing local and remote timeline state. +#[derive(Clone, Copy, Debug)] +pub enum LocalTimelineInitStatus { + /// The timeline has every remote layer present locally. + /// There could be some layers requiring uploading, + /// but this does not block the timeline from any user interaction. + LocallyComplete, + /// A timeline has some files remotely, that are not present locally and need downloading. + /// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only, + /// so the data needs to be downloaded first before the timeline can be used. + NeedsSync, +} + +type LocalTimelineInitStatuses = HashMap>; + +/// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization. +/// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still, +/// to simplify the received code. +pub struct SyncStartupData { + pub remote_index: RemoteIndex, + pub local_timeline_init_statuses: LocalTimelineInitStatuses, +} + +/// Based on the config, initiates the remote storage connection and starts a separate thread +/// that ensures that pageserver and the remote storage are in sync with each other. 
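For illustration only (not part of this patch): a minimal sketch of how a caller might act on the startup statuses introduced above, assuming the `LocalTimelineInitStatuses` alias maps tenant ids to per-timeline statuses; the helper name is hypothetical.

    fn handle_init_statuses(startup: SyncStartupData) {
        for (tenant_id, timelines) in startup.local_timeline_init_statuses {
            for (timeline_id, status) in timelines {
                match status {
                    // Every remote layer is already present locally; the timeline
                    // can be registered and used right away.
                    LocalTimelineInitStatus::LocallyComplete => {
                        println!("tenant {tenant_id} timeline {timeline_id}: ready")
                    }
                    // Some layers exist only remotely; a download has to finish
                    // before the timeline can serve reads.
                    LocalTimelineInitStatus::NeedsSync => {
                        println!("tenant {tenant_id} timeline {timeline_id}: needs sync")
                    }
                }
            }
        }
        // `startup.remote_index` remains available for querying remote-only timelines later.
    }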
+/// If no external configuration connection given, no thread or storage initialization is done. +/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states. +pub fn start_local_timeline_sync( + config: &'static PageServerConf, +) -> anyhow::Result { + let local_timeline_files = local_tenant_timeline_files(config) + .context("Failed to collect local tenant timeline files")?; + + match config.remote_storage_config.as_ref() { + Some(storage_config) => { + match GenericRemoteStorage::new(config.workdir.clone(), storage_config) + .context("Failed to init the generic remote storage")? + { + GenericRemoteStorage::Local(local_fs_storage) => { + storage_sync::spawn_storage_sync_thread( + config, + local_timeline_files, + local_fs_storage, + storage_config.max_concurrent_syncs, + storage_config.max_sync_errors, + ) + } + GenericRemoteStorage::S3(s3_bucket_storage) => { + storage_sync::spawn_storage_sync_thread( + config, + local_timeline_files, + s3_bucket_storage, + storage_config.max_concurrent_syncs, + storage_config.max_sync_errors, + ) + } + } + .context("Failed to spawn the storage sync thread") + } + None => { + info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); + let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); + for ( + ZTenantTimelineId { + tenant_id, + timeline_id, + }, + _, + ) in local_timeline_files + { + local_timeline_init_statuses + .entry(tenant_id) + .or_default() + .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete); + } + Ok(SyncStartupData { + local_timeline_init_statuses, + remote_index: RemoteIndex::empty(), + }) + } + } +} + +fn local_tenant_timeline_files( + config: &'static PageServerConf, +) -> anyhow::Result)>> { + let mut local_tenant_timeline_files = HashMap::new(); + let tenants_dir = config.tenants_path(); + for tenants_dir_entry in std::fs::read_dir(&tenants_dir) + .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? + { + match &tenants_dir_entry { + Ok(tenants_dir_entry) => { + match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) { + Ok(collected_files) => { + local_tenant_timeline_files.extend(collected_files.into_iter()) + } + Err(e) => error!( + "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}", + tenants_dir.display(), + tenants_dir_entry, + e + ), + } + } + Err(e) => error!( + "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}", + tenants_dir_entry, + tenants_dir.display(), + e + ), + } + } + + Ok(local_tenant_timeline_files) +} + +fn collect_timelines_for_tenant( + config: &'static PageServerConf, + tenant_path: &Path, +) -> anyhow::Result)>> { + let mut timelines = HashMap::new(); + let tenant_id = tenant_path + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .context("Could not parse tenant id out of the tenant dir name")?; + let timelines_dir = config.timelines_path(&tenant_id); + + for timelines_dir_entry in std::fs::read_dir(&timelines_dir).with_context(|| { + format!( + "Failed to list timelines dir entry for tenant {}", + tenant_id + ) + })? 
{ + match timelines_dir_entry { + Ok(timelines_dir_entry) => { + let timeline_path = timelines_dir_entry.path(); + match collect_timeline_files(&timeline_path) { + Ok((timeline_id, metadata, timeline_files)) => { + timelines.insert( + ZTenantTimelineId { + tenant_id, + timeline_id, + }, + (metadata, timeline_files), + ); + } + Err(e) => error!( + "Failed to process timeline dir contents at '{}', reason: {:?}", + timeline_path.display(), + e + ), + } + } + Err(e) => error!( + "Failed to list timelines for entry tenant {}, reason: {:?}", + tenant_id, e + ), + } + } + + Ok(timelines) +} + +// discover timeline files and extract timeline metadata +// NOTE: ephemeral files are excluded from the list +fn collect_timeline_files( + timeline_dir: &Path, +) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet)> { + let mut timeline_files = HashSet::new(); + let mut timeline_metadata_path = None; + + let timeline_id = timeline_dir + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .context("Could not parse timeline id out of the timeline dir name")?; + let timeline_dir_entries = + std::fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; + for entry in timeline_dir_entries { + let entry_path = entry.context("Failed to list timeline dir entry")?.path(); + if entry_path.is_file() { + if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) { + timeline_metadata_path = Some(entry_path); + } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) { + debug!("skipping ephemeral file {}", entry_path.display()); + continue; + } else if entry_path.extension().and_then(OsStr::to_str) + == Some(TEMP_DOWNLOAD_EXTENSION) + { + info!("removing temp download file at {}", entry_path.display()); + std::fs::remove_file(&entry_path).with_context(|| { + format!( + "failed to remove temp download file at {}", + entry_path.display() + ) + })?; + } else { + timeline_files.insert(entry_path); + } + } + } + + // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed + // then attach is lost. There would be no retries for that, + // initial collect will fail because there is no metadata. + // We either need to start download if we see empty dir after restart or attach caller should + // be aware of that and retry attach if awaits_download for timeline switched from true to false + // but timelinne didnt appear locally. + // Check what happens with remote index in that case. + let timeline_metadata_path = match timeline_metadata_path { + Some(path) => path, + None => bail!("No metadata file found in the timeline directory"), + }; + let metadata = TimelineMetadata::from_bytes( + &std::fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, + ) + .context("Failed to parse timeline metadata file bytes")?; + + Ok((timeline_id, metadata, timeline_files)) +} + /// Wraps mpsc channel bits around into a queue interface. /// mpsc approach was picked to allow blocking the sync loop if no tasks are present, to avoid meaningless spinning. 
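A rough sketch (not the actual module that follows) of the mpsc-as-queue idea described above: the single sync thread blocks on the receiver until at least one task arrives, then drains whatever is already buffered into one batch. The function name and batch limit are illustrative assumptions.

    async fn example_next_batch<T>(
        receiver: &mut tokio::sync::mpsc::UnboundedReceiver<T>,
        max_batch_size: usize,
    ) -> Vec<T> {
        let mut batch = Vec::new();
        // Block until a task shows up, so an empty queue does not cause busy-spinning.
        if let Some(first_task) = receiver.recv().await {
            batch.push(first_task);
            // Then opportunistically pick up tasks that are already waiting.
            while batch.len() < max_batch_size {
                match receiver.try_recv() {
                    Ok(task) => batch.push(task),
                    Err(_) => break,
                }
            }
        }
        batch
    }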
mod sync_queue { @@ -505,7 +816,7 @@ pub(super) fn spawn_storage_sync_thread( ) -> anyhow::Result where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { let (sender, receiver) = mpsc::unbounded_channel(); sync_queue::init(sender)?; @@ -566,7 +877,7 @@ fn storage_sync_loop( max_sync_errors: NonZeroU32, ) where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { info!("Starting remote storage sync loop"); loop { @@ -618,7 +929,7 @@ async fn loop_step( ) -> ControlFlow<(), HashMap>> where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { let batched_tasks = match sync_queue::next_task_batch(receiver, max_concurrent_timelines_sync).await { @@ -677,7 +988,7 @@ async fn process_sync_task( ) -> Option where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { let sync_start = Instant::now(); let current_remote_timeline = { index.read().await.timeline_entry(&sync_id).cloned() }; @@ -810,7 +1121,7 @@ async fn download_timeline( ) -> Option where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { match download_timeline_layers( conf, @@ -936,7 +1247,7 @@ async fn upload_timeline( task_name: &str, ) where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { let mut uploaded_data = match upload_timeline_layers(storage, current_remote_timeline, sync_id, new_upload_data) @@ -991,7 +1302,7 @@ async fn update_remote_data( ) -> anyhow::Result<()> where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { info!("Updating remote index for the timeline"); let updated_remote_timeline = { @@ -1101,7 +1412,7 @@ async fn try_fetch_index_parts( ) -> HashMap where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { let mut index_parts = HashMap::with_capacity(keys.len()); @@ -1246,20 +1557,6 @@ fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Optio .observe(secs_elapsed) } -pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { - let new_extension = match original_path - .as_ref() - .extension() - .map(OsStr::to_string_lossy) - { - Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), - None => Cow::Borrowed(suffix), - }; - original_path - .as_ref() - .with_extension(new_extension.as_ref()) -} - #[cfg(test)] mod test_utils { use utils::lsn::Lsn; @@ -1671,28 +1968,4 @@ mod tests { "Merged upload tasks should have a metadata with biggest disk_consistent_lsn" ); } - - #[test] - fn test_path_with_suffix_extension() { - let p = PathBuf::from("/foo/bar"); - assert_eq!( - &path_with_suffix_extension(&p, "temp").to_string_lossy(), - "/foo/bar.temp" - ); - let p = PathBuf::from("/foo/bar"); - assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), - "/foo/bar.temp.temp" - ); - let p = PathBuf::from("/foo/bar.baz"); - assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), - "/foo/bar.baz.temp.temp" - ); - let p = PathBuf::from("/foo/bar.baz"); - assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), - "/foo/bar.baz..temp" - ); 
- } } diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs similarity index 93% rename from pageserver/src/remote_storage/storage_sync/download.rs rename to pageserver/src/storage_sync/download.rs index 7e2496b796..dca08bca5d 100644 --- a/pageserver/src/remote_storage/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -4,6 +4,7 @@ use std::{collections::HashSet, fmt::Debug, path::Path}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; +use remote_storage::{path_with_suffix_extension, RemoteStorage}; use tokio::{ fs, io::{self, AsyncWriteExt}, @@ -13,10 +14,7 @@ use tracing::{debug, error, info, warn}; use crate::{ config::PageServerConf, layered_repository::metadata::metadata_path, - remote_storage::{ - storage_sync::{path_with_suffix_extension, sync_queue, SyncTask}, - RemoteStorage, - }, + storage_sync::{sync_queue, SyncTask}, }; use utils::zid::ZTenantTimelineId; @@ -35,17 +33,19 @@ pub async fn download_index_part( ) -> anyhow::Result where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - let part_storage_path = storage.storage_path(&index_part_path).with_context(|| { - format!( - "Failed to get the index part storage path for local path '{}'", - index_part_path.display() - ) - })?; + let part_storage_path = storage + .remote_object_id(&index_part_path) + .with_context(|| { + format!( + "Failed to get the index part storage path for local path '{}'", + index_part_path.display() + ) + })?; let mut index_part_bytes = Vec::new(); storage .download(&part_storage_path, &mut index_part_bytes) @@ -93,7 +93,7 @@ pub(super) async fn download_timeline_layers<'a, P, S>( ) -> DownloadedTimeline where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { let remote_timeline = match remote_timeline { Some(remote_timeline) => { @@ -130,7 +130,7 @@ where ); } else { let layer_storage_path = storage - .storage_path(&layer_desination_path) + .remote_object_id(&layer_desination_path) .with_context(|| { format!( "Failed to get the layer storage path for local path '{}'", @@ -262,18 +262,16 @@ async fn fsync_path(path: impl AsRef) -> Result<(), io::Error> { mod tests { use std::collections::{BTreeSet, HashSet}; + use remote_storage::{LocalFs, RemoteStorage}; use tempfile::tempdir; use utils::lsn::Lsn; use crate::{ - remote_storage::{ - storage_sync::{ - index::RelativePath, - test_utils::{create_local_timeline, dummy_metadata}, - }, - LocalFs, - }, repository::repo_harness::{RepoHarness, TIMELINE_ID}, + storage_sync::{ + index::RelativePath, + test_utils::{create_local_timeline, dummy_metadata}, + }, }; use super::*; @@ -283,7 +281,10 @@ mod tests { let harness = RepoHarness::create("download_timeline")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"]; - let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; + let storage = LocalFs::new( + tempdir()?.path().to_path_buf(), + harness.conf.workdir.clone(), + )?; let current_retries = 3; let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); @@ -291,7 +292,7 @@ mod tests { 
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; for local_path in timeline_upload.layers_to_upload { - let remote_path = storage.storage_path(&local_path)?; + let remote_path = storage.remote_object_id(&local_path)?; let remote_parent_dir = remote_path.parent().unwrap(); if !remote_parent_dir.exists() { fs::create_dir_all(&remote_parent_dir).await?; @@ -375,7 +376,7 @@ mod tests { async fn download_timeline_negatives() -> anyhow::Result<()> { let harness = RepoHarness::create("download_timeline_negatives")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?; + let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; let empty_remote_timeline_download = download_timeline_layers( harness.conf, @@ -429,7 +430,10 @@ mod tests { let harness = RepoHarness::create("test_download_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; + let storage = LocalFs::new( + tempdir()?.path().to_path_buf(), + harness.conf.workdir.clone(), + )?; let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); @@ -450,7 +454,7 @@ mod tests { metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - let storage_path = storage.storage_path(&local_index_part_path)?; + let storage_path = storage.remote_object_id(&local_index_part_path)?; fs::create_dir_all(storage_path.parent().unwrap()).await?; fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?; diff --git a/pageserver/src/remote_storage/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs similarity index 100% rename from pageserver/src/remote_storage/storage_sync/index.rs rename to pageserver/src/storage_sync/index.rs diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs similarity index 93% rename from pageserver/src/remote_storage/storage_sync/upload.rs rename to pageserver/src/storage_sync/upload.rs index 91a0a0d6ce..55089df7bc 100644 --- a/pageserver/src/remote_storage/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -4,20 +4,21 @@ use std::{fmt::Debug, path::PathBuf}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; +use remote_storage::RemoteStorage; use tokio::fs; use tracing::{debug, error, info, warn}; use crate::{ config::PageServerConf, layered_repository::metadata::metadata_path, - remote_storage::{ - storage_sync::{index::RemoteTimeline, sync_queue, SyncTask}, - RemoteStorage, - }, + storage_sync::{sync_queue, SyncTask}, }; use utils::zid::ZTenantTimelineId; -use super::{index::IndexPart, SyncData, TimelineUpload}; +use super::{ + index::{IndexPart, RemoteTimeline}, + SyncData, TimelineUpload, +}; /// Serializes and uploads the given index part data to the remote storage. 
pub(super) async fn upload_index_part( @@ -28,7 +29,7 @@ pub(super) async fn upload_index_part( ) -> anyhow::Result<()> where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { let index_part_bytes = serde_json::to_vec(&index_part) .context("Failed to serialize index part file into bytes")?; @@ -38,12 +39,15 @@ where let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - let index_part_storage_path = storage.storage_path(&index_part_path).with_context(|| { - format!( - "Failed to get the index part storage path for local path '{}'", - index_part_path.display() - ) - })?; + let index_part_storage_path = + storage + .remote_object_id(&index_part_path) + .with_context(|| { + format!( + "Failed to get the index part storage path for local path '{}'", + index_part_path.display() + ) + })?; storage .upload( @@ -83,7 +87,7 @@ pub(super) async fn upload_timeline_layers<'a, P, S>( ) -> UploadedTimeline where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { let upload = &mut upload_data.data; let new_upload_lsn = upload @@ -112,7 +116,7 @@ where .into_iter() .map(|source_path| async move { let storage_path = storage - .storage_path(&source_path) + .remote_object_id(&source_path) .with_context(|| { format!( "Failed to get the layer storage path for local path '{}'", @@ -211,18 +215,16 @@ enum UploadError { mod tests { use std::collections::{BTreeSet, HashSet}; + use remote_storage::LocalFs; use tempfile::tempdir; use utils::lsn::Lsn; use crate::{ - remote_storage::{ - storage_sync::{ - index::RelativePath, - test_utils::{create_local_timeline, dummy_metadata}, - }, - LocalFs, - }, repository::repo_harness::{RepoHarness, TIMELINE_ID}, + storage_sync::{ + index::RelativePath, + test_utils::{create_local_timeline, dummy_metadata}, + }, }; use super::{upload_index_part, *}; @@ -233,7 +235,10 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b"]; - let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; + let storage = LocalFs::new( + tempdir()?.path().to_path_buf(), + harness.conf.workdir.clone(), + )?; let current_retries = 3; let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); @@ -315,7 +320,7 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a1", "b1"]; - let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?; + let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; let current_retries = 5; let metadata = dummy_metadata(Lsn(0x40)); @@ -403,7 +408,7 @@ mod tests { let harness = RepoHarness::create("test_upload_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?; + let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; let metadata = dummy_metadata(Lsn(0x40)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 507e749e8c..20a723b5b5 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -4,8 +4,9 @@ use crate::config::PageServerConf; 
use crate::layered_repository::LayeredRepository; use crate::pgdatadir_mapping::DatadirTimeline; -use crate::remote_storage::{self, LocalTimelineInitStatus, RemoteIndex, SyncStartupData}; use crate::repository::{Repository, TimelineSyncStatusUpdate}; +use crate::storage_sync::index::RemoteIndex; +use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; @@ -96,7 +97,7 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result, + remote_storage: &S3Bucket, + listing: &HashSet, dir_path: &Path, conf: &SafeKeeperConf, ) -> anyhow::Result { @@ -55,17 +57,12 @@ async fn offload_files( && IsXLogFileName(entry.file_name().to_str().unwrap()) && entry.metadata().unwrap().created().unwrap() <= horizon { - let relpath = path.strip_prefix(&conf.workdir).unwrap(); - let s3path = String::from("walarchive/") + relpath.to_str().unwrap(); - if !listing.contains(&s3path) { + let remote_path = remote_storage.remote_object_id(path)?; + if !listing.contains(&remote_path) { let file = File::open(&path).await?; - client - .put_object(PutObjectRequest { - body: Some(StreamingBody::new(ReaderStream::new(file))), - bucket: bucket_name.to_string(), - key: s3path, - ..PutObjectRequest::default() - }) + let file_length = file.metadata().await?.len() as usize; + remote_storage + .upload(BufReader::new(file), file_length, &remote_path, None) .await?; fs::remove_file(&path).await?; @@ -77,58 +74,34 @@ async fn offload_files( } async fn main_loop(conf: &SafeKeeperConf) -> anyhow::Result<()> { - let region = Region::Custom { - name: env::var("S3_REGION").context("S3_REGION env var is not set")?, - endpoint: env::var("S3_ENDPOINT").context("S3_ENDPOINT env var is not set")?, + let remote_storage = match GenericRemoteStorage::new( + conf.workdir.clone(), + &RemoteStorageConfig { + max_concurrent_syncs: NonZeroUsize::new(10).unwrap(), + max_sync_errors: NonZeroU32::new(1).unwrap(), + storage: remote_storage::RemoteStorageKind::AwsS3(S3Config { + bucket_name: "zenith-testbucket".to_string(), + bucket_region: env::var("S3_REGION").context("S3_REGION env var is not set")?, + prefix_in_bucket: Some("walarchive/".to_string()), + endpoint: Some(env::var("S3_ENDPOINT").context("S3_ENDPOINT env var is not set")?), + concurrency_limit: NonZeroUsize::new(20).unwrap(), + }), + }, + )? { + GenericRemoteStorage::Local(_) => { + bail!("Unexpected: got local storage for the remote config") + } + GenericRemoteStorage::S3(remote_storage) => remote_storage, }; - let client = S3Client::new_with( - HttpClient::new().context("Failed to create S3 http client")?, - StaticProvider::new_minimal( - env::var("S3_ACCESSKEY").context("S3_ACCESSKEY env var is not set")?, - env::var("S3_SECRET").context("S3_SECRET env var is not set")?, - ), - region, - ); - - let bucket_name = "zenith-testbucket"; - loop { - let listing = gather_wal_entries(&client, bucket_name).await?; - let n = offload_files(&client, bucket_name, &listing, &conf.workdir, conf).await?; - info!("Offload {} files to S3", n); + let listing = remote_storage + .list() + .await? 
+ .into_iter() + .collect::>(); + let n = offload_files(&remote_storage, &listing, &conf.workdir, conf).await?; + info!("Offload {n} files to S3"); sleep(conf.ttl.unwrap()).await; } } - -async fn gather_wal_entries( - client: &S3Client, - bucket_name: &str, -) -> anyhow::Result> { - let mut document_keys = HashSet::new(); - - let mut continuation_token = None::; - loop { - let response = client - .list_objects_v2(ListObjectsV2Request { - bucket: bucket_name.to_string(), - prefix: Some("walarchive/".to_string()), - continuation_token, - ..ListObjectsV2Request::default() - }) - .await?; - document_keys.extend( - response - .contents - .unwrap_or_default() - .into_iter() - .filter_map(|o| o.key), - ); - - continuation_token = response.continuation_token; - if continuation_token.is_none() { - break; - } - } - Ok(document_keys) -} diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 7acf0552df..3bb7c606d3 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -472,20 +472,16 @@ class ZenithEnvBuilder: mock_endpoint = self.s3_mock_server.endpoint() mock_region = self.s3_mock_server.region() - mock_access_key = self.s3_mock_server.access_key() - mock_secret_key = self.s3_mock_server.secret_key() boto3.client( 's3', endpoint_url=mock_endpoint, region_name=mock_region, - aws_access_key_id=mock_access_key, - aws_secret_access_key=mock_secret_key, + aws_access_key_id=self.s3_mock_server.access_key(), + aws_secret_access_key=self.s3_mock_server.secret_key(), ).create_bucket(Bucket=bucket_name) self.pageserver_remote_storage = S3Storage(bucket=bucket_name, endpoint=mock_endpoint, - region=mock_region, - access_key=mock_access_key, - secret_key=mock_secret_key) + region=mock_region) def __enter__(self): return self @@ -811,8 +807,6 @@ class LocalFsStorage: class S3Storage: bucket: str region: str - access_key: Optional[str] - secret_key: Optional[str] endpoint: Optional[str] @@ -998,7 +992,14 @@ class ZenithCli: append_pageserver_param_overrides(start_args, self.env.pageserver.remote_storage, self.env.pageserver.config_override) - return self.raw_cli(start_args) + + s3_env_vars = None + if self.env.s3_mock_server: + s3_env_vars = { + 'AWS_ACCESS_KEY_ID': self.env.s3_mock_server.access_key(), + 'AWS_SECRET_ACCESS_KEY': self.env.s3_mock_server.secret_key(), + } + return self.raw_cli(start_args, extra_env_vars=s3_env_vars) def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]': cmd = ['pageserver', 'stop'] @@ -1093,6 +1094,7 @@ class ZenithCli: def raw_cli(self, arguments: List[str], + extra_env_vars: Optional[Dict[str, str]] = None, check_return_code=True) -> 'subprocess.CompletedProcess[str]': """ Run "zenith" with the specified arguments. 
@@ -1117,9 +1119,10 @@ class ZenithCli: env_vars = os.environ.copy() env_vars['ZENITH_REPO_DIR'] = str(self.env.repo_dir) env_vars['POSTGRES_DISTRIB_DIR'] = str(pg_distrib_dir) - if self.env.rust_log_override is not None: env_vars['RUST_LOG'] = self.env.rust_log_override + for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): + env_vars[extra_env_key] = extra_env_value # Pass coverage settings var = 'LLVM_PROFILE_FILE' @@ -1217,10 +1220,6 @@ def append_pageserver_param_overrides( pageserver_storage_override = f"bucket_name='{pageserver_remote_storage.bucket}',\ bucket_region='{pageserver_remote_storage.region}'" - if pageserver_remote_storage.access_key is not None: - pageserver_storage_override += f",access_key_id='{pageserver_remote_storage.access_key}'" - if pageserver_remote_storage.secret_key is not None: - pageserver_storage_override += f",secret_access_key='{pageserver_remote_storage.secret_key}'" if pageserver_remote_storage.endpoint is not None: pageserver_storage_override += f",endpoint='{pageserver_remote_storage.endpoint}'" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 2bb22f2d3b..92877faef7 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -21,7 +21,13 @@ chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "st clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } fail = { version = "0.5", default-features = false, features = ["failpoints"] } +futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } +futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] } +futures-util = { version = "0.3", default-features = false, features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", "futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] } +generic-array = { version = "0.14", default-features = false, features = ["more_lengths"] } hashbrown = { version = "0.11", features = ["ahash", "inline-more", "raw"] } +hex = { version = "0.4", features = ["alloc", "serde", "std"] } +hyper = { version = "0.14", features = ["client", "full", "h2", "http1", "http2", "runtime", "server", "socket2", "stream", "tcp"] } indexmap = { version = "1", default-features = false, features = ["std"] } itoa = { version = "0.4", features = ["i128", "std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } From 10e4da399737f26a3584ab8822e701e382e2dd43 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 2 May 2022 10:46:13 +0300 Subject: [PATCH 0260/1022] Rework timeline batching --- pageserver/src/http/routes.rs | 15 +- pageserver/src/layered_repository.rs | 115 ++-- pageserver/src/storage_sync.rs | 884 +++++++-------------------- pageserver/src/storage_sync/index.rs | 4 +- 4 files changed, 292 insertions(+), 726 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8940efbda0..0104df826e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -267,7 +267,7 @@ async fn timeline_attach_handler(request: Request) -> Result { tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id)) .await @@ -300,11 +300,11 @@ async fn timeline_attach_handler(request: Request) -> Result anyhow::Result> { - let shard = match state.remote_storage.as_ref() { + let index_part = match state.remote_storage.as_ref() { 
Some(GenericRemoteStorage::Local(local_storage)) => { storage_sync::download_index_part(state.conf, local_storage, sync_id).await } @@ -313,18 +313,15 @@ async fn try_download_shard_data( } None => return Ok(None), } - .with_context(|| format!("Failed to download index shard for timeline {}", sync_id))?; + .with_context(|| format!("Failed to download index part for timeline {sync_id}"))?; let timeline_path = state .conf .timeline_path(&sync_id.timeline_id, &sync_id.tenant_id); - RemoteTimeline::from_index_part(&timeline_path, shard) + RemoteTimeline::from_index_part(&timeline_path, index_part) .map(Some) .with_context(|| { - format!( - "Failed to convert index shard into remote timeline for timeline {}", - sync_id - ) + format!("Failed to convert index part into remote timeline for timeline {sync_id}") }) } diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index da2699b15d..039bf8d1ed 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -455,7 +455,7 @@ enum LayeredTimelineEntry { impl LayeredTimelineEntry { fn timeline_id(&self) -> ZTimelineId { match self { - LayeredTimelineEntry::Loaded(timeline) => timeline.timelineid, + LayeredTimelineEntry::Loaded(timeline) => timeline.timeline_id, LayeredTimelineEntry::Unloaded { id, .. } => *id, } } @@ -615,21 +615,17 @@ impl LayeredRepository { fn load_local_timeline( &self, - timelineid: ZTimelineId, + timeline_id: ZTimelineId, timelines: &mut HashMap, ) -> anyhow::Result> { - let metadata = load_metadata(self.conf, timelineid, self.tenant_id) + let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) .context("failed to load metadata")?; let disk_consistent_lsn = metadata.disk_consistent_lsn(); let ancestor = metadata .ancestor_timeline() .map(|ancestor_timeline_id| { - trace!( - "loading {}'s ancestor {}", - timelineid, - &ancestor_timeline_id - ); + trace!("loading {timeline_id}'s ancestor {}", &ancestor_timeline_id); self.get_timeline_load_internal(ancestor_timeline_id, timelines) }) .transpose() @@ -643,7 +639,7 @@ impl LayeredRepository { Arc::clone(&self.tenant_conf), metadata, ancestor, - timelineid, + timeline_id, self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, @@ -902,8 +898,8 @@ pub struct LayeredTimeline { conf: &'static PageServerConf, tenant_conf: Arc>, - tenantid: ZTenantId, - timelineid: ZTimelineId, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, layers: RwLock, @@ -1177,50 +1173,50 @@ impl LayeredTimeline { tenant_conf: Arc>, metadata: TimelineMetadata, ancestor: Option, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: ZTimelineId, + tenant_id: ZTenantId, walredo_mgr: Arc, upload_layers: bool, ) -> LayeredTimeline { let reconstruct_time_histo = RECONSTRUCT_TIME - .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) .unwrap(); let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT - .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) .unwrap(); let flush_time_histo = STORAGE_TIME .get_metric_with_label_values(&[ "layer flush", - &tenantid.to_string(), - &timelineid.to_string(), + &tenant_id.to_string(), + &timeline_id.to_string(), ]) .unwrap(); let compact_time_histo = STORAGE_TIME .get_metric_with_label_values(&[ "compact", - &tenantid.to_string(), - 
&timelineid.to_string(), + &tenant_id.to_string(), + &timeline_id.to_string(), ]) .unwrap(); let create_images_time_histo = STORAGE_TIME .get_metric_with_label_values(&[ "create images", - &tenantid.to_string(), - &timelineid.to_string(), + &tenant_id.to_string(), + &timeline_id.to_string(), ]) .unwrap(); let last_record_gauge = LAST_RECORD_LSN - .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) .unwrap(); let wait_lsn_time_histo = WAIT_LSN_TIME - .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) .unwrap(); LayeredTimeline { conf, tenant_conf, - timelineid, - tenantid, + timeline_id, + tenant_id, layers: RwLock::new(LayerMap::default()), walredo_mgr, @@ -1272,7 +1268,7 @@ impl LayeredTimeline { // Scan timeline directory and create ImageFileName and DeltaFilename // structs representing all files on disk - let timeline_path = self.conf.timeline_path(&self.timelineid, &self.tenantid); + let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); for direntry in fs::read_dir(timeline_path)? { let direntry = direntry?; @@ -1284,7 +1280,7 @@ impl LayeredTimeline { if imgfilename.lsn > disk_consistent_lsn { warn!( "found future image layer {} on timeline {} disk_consistent_lsn is {}", - imgfilename, self.timelineid, disk_consistent_lsn + imgfilename, self.timeline_id, disk_consistent_lsn ); rename_to_backup(direntry.path())?; @@ -1292,7 +1288,7 @@ impl LayeredTimeline { } let layer = - ImageLayer::new(self.conf, self.timelineid, self.tenantid, &imgfilename); + ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); trace!("found layer {}", layer.filename().display()); layers.insert_historic(Arc::new(layer)); @@ -1307,7 +1303,7 @@ impl LayeredTimeline { if deltafilename.lsn_range.end > disk_consistent_lsn + 1 { warn!( "found future delta layer {} on timeline {} disk_consistent_lsn is {}", - deltafilename, self.timelineid, disk_consistent_lsn + deltafilename, self.timeline_id, disk_consistent_lsn ); rename_to_backup(direntry.path())?; @@ -1315,7 +1311,7 @@ impl LayeredTimeline { } let layer = - DeltaLayer::new(self.conf, self.timelineid, self.tenantid, &deltafilename); + DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); trace!("found layer {}", layer.filename().display()); layers.insert_historic(Arc::new(layer)); @@ -1497,7 +1493,7 @@ impl LayeredTimeline { // FIXME: It's pointless to check the cache for things that are not 8kB pages. // We should look at the key to determine if it's a cacheable object let (lsn, read_guard) = - cache.lookup_materialized_page(self.tenantid, self.timelineid, key, lsn)?; + cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?; let img = Bytes::from(read_guard.to_vec()); Some((lsn, img)) } @@ -1509,7 +1505,7 @@ impl LayeredTimeline { .with_context(|| { format!( "Ancestor is missing. Timeline id: {} Ancestor id {:?}", - self.timelineid, + self.timeline_id, self.get_ancestor_timeline_id(), ) })? @@ -1517,7 +1513,7 @@ impl LayeredTimeline { .with_context(|| { format!( "Ancestor timeline is not is not loaded. 
Timeline id: {} Ancestor id {:?}", - self.timelineid, + self.timeline_id, self.get_ancestor_timeline_id(), ) })?; @@ -1554,12 +1550,12 @@ impl LayeredTimeline { trace!( "creating layer for write at {}/{} for record at {}", - self.timelineid, + self.timeline_id, start_lsn, lsn ); let new_layer = - InMemoryLayer::create(self.conf, self.timelineid, self.tenantid, start_lsn)?; + InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?; let layer_rc = Arc::new(new_layer); layers.open_layer = Some(Arc::clone(&layer_rc)); @@ -1633,8 +1629,8 @@ impl LayeredTimeline { let self_clone = Arc::clone(self); thread_mgr::spawn( thread_mgr::ThreadKind::LayerFlushThread, - Some(self.tenantid), - Some(self.timelineid), + Some(self.tenant_id), + Some(self.timeline_id), "layer flush thread", false, move || self_clone.flush_frozen_layers(false), @@ -1703,7 +1699,7 @@ impl LayeredTimeline { // them all in parallel. par_fsync::par_fsync(&[ new_delta_path.clone(), - self.conf.timeline_path(&self.timelineid, &self.tenantid), + self.conf.timeline_path(&self.timeline_id, &self.tenant_id), ])?; fail_point!("checkpoint-before-sync"); @@ -1775,8 +1771,8 @@ impl LayeredTimeline { LayeredRepository::save_metadata( self.conf, - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, &metadata, false, )?; @@ -1786,8 +1782,8 @@ impl LayeredTimeline { if self.upload_layers.load(atomic::Ordering::Relaxed) { storage_sync::schedule_layer_upload( - self.tenantid, - self.timelineid, + self.tenant_id, + self.timeline_id, HashSet::from([new_delta_path]), Some(metadata), ); @@ -1840,7 +1836,8 @@ impl LayeredTimeline { let target_file_size = self.get_checkpoint_distance(); // Define partitioning schema if needed - if let Ok(pgdir) = tenant_mgr::get_local_timeline_with_load(self.tenantid, self.timelineid) + if let Ok(pgdir) = + tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) { let (partitioning, lsn) = pgdir.repartition( self.get_last_record_lsn(), @@ -1858,8 +1855,8 @@ impl LayeredTimeline { } if self.upload_layers.load(atomic::Ordering::Relaxed) { storage_sync::schedule_layer_upload( - self.tenantid, - self.timelineid, + self.tenant_id, + self.timeline_id, layer_paths_to_upload, None, ); @@ -1909,7 +1906,7 @@ impl LayeredTimeline { let img_range = partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; let mut image_layer_writer = - ImageLayerWriter::new(self.conf, self.timelineid, self.tenantid, &img_range, lsn)?; + ImageLayerWriter::new(self.conf, self.timeline_id, self.tenant_id, &img_range, lsn)?; for range in &partition.ranges { let mut key = range.start; @@ -1932,7 +1929,7 @@ impl LayeredTimeline { // and fsync them all in parallel. par_fsync::par_fsync(&[ image_layer.path(), - self.conf.timeline_path(&self.timelineid, &self.tenantid), + self.conf.timeline_path(&self.timeline_id, &self.tenant_id), ])?; // FIXME: Do we need to do something to upload it to remote storage here? 
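The hunks above route every freshly written image or delta layer through par_fsync::par_fsync together with the timeline directory path. A minimal, sequential sketch of the same durability idea (sync_layer_and_dir is a hypothetical helper, Unix-only; the real code fans the two syncs out across threads so they run in parallel):

use std::fs::File;
use std::io;
use std::path::Path;

// Hypothetical helper: both syncs are needed because the file's contents must
// reach disk, and so must the directory entry that names the new file.
// Otherwise a crash could leave a durable file that is unreachable by name.
fn sync_layer_and_dir(layer_path: &Path, timeline_dir: &Path) -> io::Result<()> {
    File::open(layer_path)?.sync_all()?;   // flush the file's data and metadata
    File::open(timeline_dir)?.sync_all()?; // flush the parent directory entry
    Ok(())
}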
@@ -2008,8 +2005,8 @@ impl LayeredTimeline { if writer.is_none() { writer = Some(DeltaLayerWriter::new( self.conf, - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, key, lsn_range.clone(), )?); @@ -2027,7 +2024,7 @@ impl LayeredTimeline { let mut layer_paths: Vec = new_layers.iter().map(|l| l.path()).collect(); // also sync the directory - layer_paths.push(self.conf.timeline_path(&self.timelineid, &self.tenantid)); + layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); // Fsync all the layer files and directory using multiple threads to // minimize latency. @@ -2057,14 +2054,14 @@ impl LayeredTimeline { if self.upload_layers.load(atomic::Ordering::Relaxed) { storage_sync::schedule_layer_upload( - self.tenantid, - self.timelineid, + self.tenant_id, + self.timeline_id, new_layer_paths, None, ); storage_sync::schedule_layer_delete( - self.tenantid, - self.timelineid, + self.tenant_id, + self.timeline_id, layer_paths_do_delete, ); } @@ -2121,7 +2118,7 @@ impl LayeredTimeline { let cutoff = gc_info.cutoff; let pitr = gc_info.pitr; - let _enter = info_span!("garbage collection", timeline = %self.timelineid, tenant = %self.tenantid, cutoff = %cutoff).entered(); + let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %cutoff).entered(); // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. // See branch_timeline() for details. @@ -2254,8 +2251,8 @@ impl LayeredTimeline { if self.upload_layers.load(atomic::Ordering::Relaxed) { storage_sync::schedule_layer_delete( - self.tenantid, - self.timelineid, + self.tenant_id, + self.timeline_id, layer_paths_to_delete, ); } @@ -2323,8 +2320,8 @@ impl LayeredTimeline { if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); cache.memorize_materialized_page( - self.tenantid, - self.timelineid, + self.tenant_id, + self.timeline_id, key, last_rec_lsn, &img, diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index bcc18e8ce4..b6091015b9 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -92,12 +92,12 @@ //! A queue is implemented in the [`sync_queue`] module as a pair of sender and receiver channels, to block on zero tasks instead of checking the queue. //! The pair's shared buffer of a fixed size serves as an implicit queue, holding [`SyncTask`] for local files upload/download operations. //! -//! The queue gets emptied by a single thread with the loop, that polls the tasks in batches of deduplicated tasks (size configurable). -//! A task from the batch corresponds to a single timeline, with its files to sync merged together. -//! Every batch task and layer file in the task is processed concurrently, which is possible due to incremental nature of the timelines: -//! it's not asserted, but assumed that timeline's checkpoints only add the files locally, not removing or amending the existing ones. -//! Only GC removes local timeline files, the GC support is not added to sync currently, -//! yet downloading extra files is not critically bad at this stage, GC can remove those again. +//! The queue gets emptied by a single thread with the loop, that polls the tasks in batches of deduplicated tasks. +//! A task from the batch corresponds to a single timeline, with its files to sync merged together: given that only one task sync loop step is active at a time, +//! 
timeline uploads and downloads can happen concurrently, in no particular order due to incremental nature of the timeline layers. +//! Deletion happens only after a successful upload only, otherwise the compation output might make the timeline inconsistent until both tasks are fully processed without errors. +//! Upload and download update the remote data (inmemory index and S3 json index part file) only after every layer is successfully synchronized, while the deletion task +//! does otherwise: it requires to have the remote data updated first succesfully: blob files will be invisible to pageserver this way. //! //! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via downloading and merging the index data for all timelines, //! present locally. @@ -119,7 +119,7 @@ //! Among other tasks, the index is used to prevent invalid uploads and non-existing downloads on demand, refer to [`index`] for more details. //! //! Index construction is currently the only place where the storage sync can return an [`Err`] to the user. -//! New sync tasks are accepted via [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] functions, +//! New sync tasks are accepted via [`schedule_layer_upload`], [`schedule_layer_download`] and [`schedule_layer_delete`] functions, //! disregarding of the corresponding loop startup. //! It's up to the caller to avoid synchronizations if the loop is disabled: otherwise, the sync tasks will be ignored. //! After the initial state is loaded into memory and the loop starts, any further [`Err`] results do not stop the loop, but rather @@ -449,7 +449,7 @@ fn collect_timeline_files( /// mpsc approach was picked to allow blocking the sync loop if no tasks are present, to avoid meaningless spinning. mod sync_queue { use std::{ - collections::{hash_map, HashMap, HashSet}, + collections::{HashMap, HashSet}, num::NonZeroUsize, ops::ControlFlow, sync::atomic::{AtomicUsize, Ordering}, @@ -460,7 +460,7 @@ mod sync_queue { use tokio::sync::mpsc::{error::TryRecvError, UnboundedReceiver, UnboundedSender}; use tracing::{debug, warn}; - use super::SyncTask; + use super::{SyncTask, SyncTaskBatch}; use utils::zid::ZTenantTimelineId; static SENDER: OnceCell> = OnceCell::new(); @@ -512,10 +512,10 @@ mod sync_queue { /// Not blocking, can return fewer tasks if the queue does not contain enough. /// Batch tasks are split by timelines, with all related tasks merged into one (download/upload) /// or two (download and upload, if both were found in the queue during batch construction). 
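A small, self-contained sketch of the ordering this comment block describes: per-timeline uploads and downloads may overlap, while deletion is sequenced strictly after them. The async fns below are placeholders, not the real helpers introduced later in this patch (upload_timeline_data, download_timeline_data, delete_timeline_data):

// Placeholder steps; only the sequencing is the point of this sketch.
async fn upload_layers() { /* push new immutable layer files */ }
async fn download_layers() { /* fetch missing immutable layer files */ }
async fn delete_layers() { /* remove layers already dropped from the remote index */ }

async fn process_one_timeline_batch() {
    // Layer files are immutable, so upload and download for one timeline can overlap.
    tokio::join!(upload_layers(), download_layers());
    // Deletion waits for both, mirroring the rule that compaction's uploads must
    // land before its deletes to keep the remote timeline consistent.
    delete_layers().await;
}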
- pub async fn next_task_batch( + pub(super) async fn next_task_batch( receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, max_timelines_to_sync: NonZeroUsize, - ) -> ControlFlow<(), HashMap> { + ) -> ControlFlow<(), HashMap> { // request the first task in blocking fashion to do less meaningless work let (first_sync_id, first_task) = if let Some(first_task) = next_task(receiver).await { first_task @@ -529,26 +529,21 @@ mod sync_queue { batched_timelines.insert(first_sync_id.timeline_id); let mut tasks = HashMap::new(); - tasks.insert(first_sync_id, first_task); + tasks.insert(first_sync_id, SyncTaskBatch::new(first_task)); loop { if batched_timelines.len() >= max_timelines_to_sync { - debug!("Filled a full task batch with {max_timelines_to_sync} timeline sync operations"); + debug!( + "Filled a full task batch with {} timeline sync operations", + batched_timelines.len() + ); break; } match receiver.try_recv() { Ok((sync_id, new_task)) => { LENGTH.fetch_sub(1, Ordering::Relaxed); - match tasks.entry(sync_id) { - hash_map::Entry::Occupied(o) => { - let current = o.remove(); - tasks.insert(sync_id, current.merge(new_task)); - } - hash_map::Entry::Vacant(v) => { - v.insert(new_task); - } - } + tasks.entry(sync_id).or_default().add(new_task); batched_timelines.insert(sync_id.timeline_id); } Err(TryRecvError::Disconnected) => { @@ -583,8 +578,8 @@ pub enum SyncTask { Download(SyncData), /// A certain amount of image files to download. Upload(SyncData), - /// Both upload and download layers need to be synced. - DownloadAndUpload(SyncData, SyncData), + /// Delete remote files. + Delete(SyncData>), } /// Stores the data to synd and its retries, to evict the tasks failing to frequently. @@ -609,121 +604,70 @@ impl SyncTask { Self::Upload(SyncData::new(0, upload_task)) } - /// Merges two tasks into one with the following rules: - /// - /// * Download + Download = Download with the retry counter reset and the layers to skip combined - /// * DownloadAndUpload + Download = DownloadAndUpload with Upload unchanged and the Download counterparts united by the same rules - /// * Upload + Upload = Upload with the retry counter reset and the layers to upload and the uploaded layers combined - /// * DownloadAndUpload + Upload = DownloadAndUpload with Download unchanged and the Upload counterparts united by the same rules - /// * Upload + Download = DownloadAndUpload with both tasks unchanged - /// * DownloadAndUpload + DownloadAndUpload = DownloadAndUpload with both parts united by the same rules - fn merge(mut self, other: Self) -> Self { - match (&mut self, other) { - ( - SyncTask::DownloadAndUpload(download_data, _) | SyncTask::Download(download_data), - SyncTask::Download(new_download_data), - ) - | ( - SyncTask::Download(download_data), - SyncTask::DownloadAndUpload(new_download_data, _), - ) => { - download_data - .data - .layers_to_skip - .extend(new_download_data.data.layers_to_skip.into_iter()); - download_data.retries = 0; - } - (SyncTask::Upload(upload), SyncTask::Download(new_download_data)) => { - self = SyncTask::DownloadAndUpload(new_download_data, upload.clone()); - } + fn delete(layers_to_delete: HashSet) -> Self { + Self::Delete(SyncData::new(0, layers_to_delete)) + } +} - ( - SyncTask::DownloadAndUpload(_, upload_data) | SyncTask::Upload(upload_data), - SyncTask::Upload(new_upload_data), - ) - | (SyncTask::Upload(upload_data), SyncTask::DownloadAndUpload(_, new_upload_data)) => { - upload_data - .data - .layers_to_upload - 
.extend(new_upload_data.data.layers_to_upload.into_iter()); - upload_data - .data - .uploaded_layers - .extend(new_upload_data.data.uploaded_layers.into_iter()); - upload_data.retries = 0; +#[derive(Debug, Default)] +struct SyncTaskBatch { + upload: Option>, + download: Option>, + delete: Option>>, +} - if new_upload_data - .data - .metadata - .as_ref() - .map(|meta| meta.disk_consistent_lsn()) - > upload_data +impl SyncTaskBatch { + fn new(task: SyncTask) -> Self { + let mut new_self = Self::default(); + new_self.add(task); + new_self + } + + fn add(&mut self, task: SyncTask) { + match task { + SyncTask::Download(new_download) => match &mut self.download { + Some(batch_download) => { + batch_download.retries = batch_download.retries.min(new_download.retries); + batch_download .data + .layers_to_skip + .extend(new_download.data.layers_to_skip.into_iter()); + } + None => self.download = Some(new_download), + }, + SyncTask::Upload(new_upload) => match &mut self.upload { + Some(batch_upload) => { + batch_upload.retries = batch_upload.retries.min(new_upload.retries); + + let batch_data = &mut batch_upload.data; + let new_data = new_upload.data; + batch_data + .layers_to_upload + .extend(new_data.layers_to_upload.into_iter()); + batch_data + .uploaded_layers + .extend(new_data.uploaded_layers.into_iter()); + if batch_data .metadata .as_ref() .map(|meta| meta.disk_consistent_lsn()) - { - upload_data.data.metadata = new_upload_data.data.metadata; + <= new_data + .metadata + .as_ref() + .map(|meta| meta.disk_consistent_lsn()) + { + batch_data.metadata = new_data.metadata; + } } - } - (SyncTask::Download(download), SyncTask::Upload(new_upload_data)) => { - self = SyncTask::DownloadAndUpload(download.clone(), new_upload_data) - } - - ( - SyncTask::DownloadAndUpload(download_data, upload_data), - SyncTask::DownloadAndUpload(new_download_data, new_upload_data), - ) => { - download_data - .data - .layers_to_skip - .extend(new_download_data.data.layers_to_skip.into_iter()); - download_data.retries = 0; - - upload_data - .data - .layers_to_upload - .extend(new_upload_data.data.layers_to_upload.into_iter()); - upload_data - .data - .uploaded_layers - .extend(new_upload_data.data.uploaded_layers.into_iter()); - upload_data.retries = 0; - - if new_upload_data - .data - .metadata - .as_ref() - .map(|meta| meta.disk_consistent_lsn()) - > upload_data - .data - .metadata - .as_ref() - .map(|meta| meta.disk_consistent_lsn()) - { - upload_data.data.metadata = new_upload_data.data.metadata; + None => self.upload = Some(new_upload), + }, + SyncTask::Delete(new_delete) => match &mut self.delete { + Some(batch_delete) => { + batch_delete.retries = batch_delete.retries.min(new_delete.retries); + batch_delete.data.extend(new_delete.data.into_iter()); } - } - } - - self - } - - fn name(&self) -> &'static str { - match self { - SyncTask::Download(_) => "download", - SyncTask::Upload(_) => "upload", - SyncTask::DownloadAndUpload(_, _) => "download and upload", - } - } - - fn retries(&self) -> u32 { - match self { - SyncTask::Download(data) => data.retries, - SyncTask::Upload(data) => data.retries, - SyncTask::DownloadAndUpload(download_data, upload_data) => { - download_data.retries.max(upload_data.retries) - } + None => self.delete = Some(new_delete), + }, } } } @@ -760,6 +704,7 @@ pub fn schedule_layer_upload( layers_to_upload: HashSet, metadata: Option, ) { + debug!("Scheduling layer upload for tenant {tenant_id}, timeline {timeline_id}, to upload: {layers_to_upload:?}"); if !sync_queue::push( ZTenantTimelineId { 
tenant_id, @@ -771,18 +716,29 @@ pub fn schedule_layer_upload( metadata, }), ) { - warn!("Could not send an upload task for tenant {tenant_id}, timeline {timeline_id}",) + warn!("Could not send an upload task for tenant {tenant_id}, timeline {timeline_id}") } else { debug!("Upload task for tenant {tenant_id}, timeline {timeline_id} sent") } } pub fn schedule_layer_delete( - _tenant_id: ZTenantId, - _timeline_id: ZTimelineId, - _layers_to_delete: HashSet, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + layers_to_delete: HashSet, ) { - // TODO kb implement later + debug!("Scheduling layer deletion for tenant {tenant_id}, timeline {timeline_id}, to delete: {layers_to_delete:?}"); + if !sync_queue::push( + ZTenantTimelineId { + tenant_id, + timeline_id, + }, + SyncTask::delete(layers_to_delete), + ) { + warn!("Could not send deletion task for tenant {tenant_id}, timeline {timeline_id}") + } else { + debug!("Deletion task for tenant {tenant_id}, timeline {timeline_id} sent") + } } /// Requests the download of the entire timeline for a given tenant. @@ -948,13 +904,13 @@ where let mut sync_results = batched_tasks .into_iter() - .map(|(sync_id, task)| { + .map(|(sync_id, batch)| { let storage = Arc::clone(&storage); let index = index.clone(); async move { let state_update = - process_sync_task(conf, storage, index, max_sync_errors, sync_id, task) - .instrument(info_span!("process_sync_tasks", sync_id = %sync_id)) + process_sync_task_batch(conf, storage, index, max_sync_errors, sync_id, batch) + .instrument(info_span!("process_sync_task_batch", sync_id = %sync_id)) .await; (sync_id, state_update) } @@ -978,13 +934,13 @@ where ControlFlow::Continue(new_timeline_states) } -async fn process_sync_task( +async fn process_sync_task_batch( conf: &'static PageServerConf, storage: Arc, index: RemoteIndex, max_sync_errors: NonZeroU32, sync_id: ZTenantTimelineId, - task: SyncTask, + batch: SyncTaskBatch, ) -> Option where P: Debug + Send + Sync + 'static, @@ -993,124 +949,103 @@ where let sync_start = Instant::now(); let current_remote_timeline = { index.read().await.timeline_entry(&sync_id).cloned() }; - let task = match validate_task_retries(sync_id, task, max_sync_errors) { - ControlFlow::Continue(task) => task, - ControlFlow::Break(aborted_task) => { - match aborted_task { - SyncTask::Download(_) => { - index - .write() - .await - .set_awaits_download(&sync_id, false) - .ok(); - } - SyncTask::Upload(failed_upload_data) => { - if let Err(e) = update_remote_data( - conf, - storage.as_ref(), - &index, - sync_id, - &failed_upload_data.data, - true, - ) + let upload_data = batch.upload.clone(); + let download_data = batch.download.clone(); + let ((), status_update) = tokio::join!( + async { + if let Some(upload_data) = upload_data { + match validate_task_retries(upload_data, max_sync_errors) + .instrument(info_span!("retries_validation")) .await - { - error!("Failed to update remote timeline {sync_id}: {e:?}"); + { + ControlFlow::Continue(new_upload_data) => { + upload_timeline_data( + conf, + (storage.as_ref(), &index), + current_remote_timeline.as_ref(), + sync_id, + new_upload_data, + sync_start, + "upload", + ) + .await; } - } - SyncTask::DownloadAndUpload(_, failed_upload_data) => { - index - .write() + ControlFlow::Break(failed_upload_data) => { + if let Err(e) = update_remote_data( + conf, + storage.as_ref(), + &index, + sync_id, + &failed_upload_data.data, + true, + ) .await - .set_awaits_download(&sync_id, false) - .ok(); - if let Err(e) = update_remote_data( - conf, - storage.as_ref(), - 
&index, - sync_id, - &failed_upload_data.data, - true, - ) - .await - { - error!("Failed to update remote timeline {sync_id}: {e:?}"); + { + error!("Failed to update remote timeline {sync_id}: {e:?}"); + } } } } - return None; } - }; - - let task_name = task.name(); - let current_task_attempt = task.retries(); - info!("Sync task '{task_name}' processing started, attempt #{current_task_attempt}"); - - if current_task_attempt > 0 { - let seconds_to_wait = 2.0_f64.powf(current_task_attempt as f64 - 1.0).min(30.0); - info!("Waiting {seconds_to_wait} seconds before starting the '{task_name}' task"); - tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; - } - - let status_update = match task { - SyncTask::Download(new_download_data) => { - download_timeline( - conf, - (storage.as_ref(), &index), - current_remote_timeline.as_ref(), - sync_id, - new_download_data, - sync_start, - task_name, - ) - .await - } - SyncTask::Upload(new_upload_data) => { - upload_timeline( - conf, - (storage.as_ref(), &index), - current_remote_timeline.as_ref(), - sync_id, - new_upload_data, - sync_start, - task_name, - ) - .await; + .instrument(info_span!("upload_timeline_data")), + async { + if let Some(download_data) = download_data { + match validate_task_retries(download_data, max_sync_errors) + .instrument(info_span!("retries_validation")) + .await + { + ControlFlow::Continue(new_download_data) => { + return download_timeline_data( + conf, + (storage.as_ref(), &index), + current_remote_timeline.as_ref(), + sync_id, + new_download_data, + sync_start, + "download", + ) + .await + } + ControlFlow::Break(_) => { + index + .write() + .await + .set_awaits_download(&sync_id, false) + .ok(); + } + } + } None } - SyncTask::DownloadAndUpload(new_download_data, new_upload_data) => { - let status_update = download_timeline( - conf, - (storage.as_ref(), &index), - current_remote_timeline.as_ref(), - sync_id, - new_download_data, - sync_start, - task_name, - ) - .await; + .instrument(info_span!("download_timeline_data")), + ); - upload_timeline( - conf, - (storage.as_ref(), &index), - current_remote_timeline.as_ref(), - sync_id, - new_upload_data, - sync_start, - task_name, - ) - .await; - - status_update + if let Some(delete_data) = batch.delete { + match validate_task_retries(delete_data, max_sync_errors) + .instrument(info_span!("retries_validation")) + .await + { + ControlFlow::Continue(new_delete_data) => { + delete_timeline_data( + conf, + (storage.as_ref(), &index), + current_remote_timeline.as_ref(), + sync_id, + new_delete_data, + sync_start, + "delete", + ) + .instrument(info_span!("delete_timeline_data")) + .await; + } + ControlFlow::Break(_) => {} } - }; - - info!("Finished processing the task"); + } status_update } -async fn download_timeline( +async fn download_timeline_data( conf: &'static PageServerConf, (storage, index): (&S, &RemoteIndex), current_remote_timeline: Option<&RemoteTimeline>, @@ -1228,6 +1163,31 @@ async fn update_local_metadata( Ok(()) } +async fn delete_timeline_data( + conf: &PageServerConf, + index: (&S, &RemoteIndex), + as_ref: Option<&RemoteTimeline>, + sync_id: ZTenantTimelineId, + new_delete_data: SyncData>, + sync_start: Instant, + task_name: &str, +) -> Option<()> +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + // match update_remote_data(conf, storage, index, sync_id, &uploaded_data.data, false).await { + // Ok(()) => register_sync_status(sync_start, task_name, Some(true)), + // Err(e) => { + // error!("Failed to update remote 
timeline {sync_id}: {e:?}"); + // uploaded_data.retries += 1; + // sync_queue::push(sync_id, SyncTask::Upload(uploaded_data)); + // register_sync_status(sync_start, task_name, Some(false)); + // } + // } + todo!("TODO kb") +} + async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result { TimelineMetadata::from_bytes( &fs::read(metadata_path) @@ -1237,7 +1197,7 @@ async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result( +async fn upload_timeline_data( conf: &'static PageServerConf, (storage, index): (&S, &RemoteIndex), current_remote_timeline: Option<&RemoteTimeline>, @@ -1245,7 +1205,8 @@ async fn upload_timeline( new_upload_data: SyncData, sync_start: Instant, task_name: &str, -) where +) -> Option<()> +where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { @@ -1255,7 +1216,7 @@ async fn upload_timeline( { UploadedTimeline::FailedAndRescheduled => { register_sync_status(sync_start, task_name, Some(false)); - return; + return None; } UploadedTimeline::Successful(upload_data) => upload_data, UploadedTimeline::SuccessfulAfterLocalFsUpdate(mut outdated_upload_data) => { @@ -1272,7 +1233,7 @@ async fn upload_timeline( outdated_upload_data.retries += 1; sync_queue::push(sync_id, SyncTask::Upload(outdated_upload_data)); register_sync_status(sync_start, task_name, Some(false)); - return; + return None; } }; outdated_upload_data.data.metadata = Some(local_metadata); @@ -1282,12 +1243,16 @@ async fn upload_timeline( }; match update_remote_data(conf, storage, index, sync_id, &uploaded_data.data, false).await { - Ok(()) => register_sync_status(sync_start, task_name, Some(true)), + Ok(()) => { + register_sync_status(sync_start, task_name, Some(true)); + Some(()) + } Err(e) => { error!("Failed to update remote timeline {sync_id}: {e:?}"); uploaded_data.retries += 1; sync_queue::push(sync_id, SyncTask::Upload(uploaded_data)); register_sync_status(sync_start, task_name, Some(false)); + None } } } @@ -1358,51 +1323,25 @@ where .context("Failed to upload new index part") } -fn validate_task_retries( - sync_id: ZTenantTimelineId, - task: SyncTask, +async fn validate_task_retries( + sync_data: SyncData, max_sync_errors: NonZeroU32, -) -> ControlFlow { +) -> ControlFlow, SyncData> { + let current_attempt = sync_data.retries; let max_sync_errors = max_sync_errors.get(); - let mut skip_upload = false; - let mut skip_download = false; - - match &task { - SyncTask::Download(download_data) | SyncTask::DownloadAndUpload(download_data, _) - if download_data.retries > max_sync_errors => - { - error!( - "Evicting download task for timeline {sync_id} that failed {} times, exceeding the error threshold {max_sync_errors}", - download_data.retries - ); - skip_download = true; - } - SyncTask::Upload(upload_data) | SyncTask::DownloadAndUpload(_, upload_data) - if upload_data.retries > max_sync_errors => - { - error!( - "Evicting upload task for timeline {sync_id} that failed {} times, exceeding the error threshold {max_sync_errors}", - upload_data.retries, - ); - skip_upload = true; - } - _ => {} + if current_attempt >= max_sync_errors { + error!( + "Aborting task that failed {current_attempt} times, exceeding retries threshold of {max_sync_errors}", + ); + return ControlFlow::Break(sync_data); } - match task { - aborted_task @ SyncTask::Download(_) if skip_download => ControlFlow::Break(aborted_task), - aborted_task @ SyncTask::Upload(_) if skip_upload => ControlFlow::Break(aborted_task), - aborted_task @ SyncTask::DownloadAndUpload(_, _) if skip_upload && skip_download => 
{ - ControlFlow::Break(aborted_task) - } - SyncTask::DownloadAndUpload(download_task, _) if skip_upload => { - ControlFlow::Continue(SyncTask::Download(download_task)) - } - SyncTask::DownloadAndUpload(_, upload_task) if skip_download => { - ControlFlow::Continue(SyncTask::Upload(upload_task)) - } - not_skipped => ControlFlow::Continue(not_skipped), + if current_attempt > 0 { + let seconds_to_wait = 2.0_f64.powf(current_attempt as f64 - 1.0).min(30.0); + info!("Waiting {seconds_to_wait} seconds before starting the task"); + tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; } + ControlFlow::Continue(sync_data) } async fn try_fetch_index_parts( @@ -1602,370 +1541,3 @@ mod test_utils { TimelineMetadata::new(disk_consistent_lsn, None, None, Lsn(0), Lsn(0), Lsn(0)) } } - -#[cfg(test)] -mod tests { - use std::collections::BTreeSet; - - use super::{test_utils::dummy_metadata, *}; - use utils::lsn::Lsn; - - #[test] - fn download_sync_tasks_merge() { - let download_1 = SyncTask::Download(SyncData::new( - 2, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("one")]), - }, - )); - let download_2 = SyncTask::Download(SyncData::new( - 6, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), - }, - )); - - let merged_download = match download_1.merge(download_2) { - SyncTask::Download(merged_download) => merged_download, - wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), - }; - - assert_eq!( - merged_download.retries, 0, - "Merged task should have its retries counter reset" - ); - - assert_eq!( - merged_download - .data - .layers_to_skip - .into_iter() - .collect::>(), - BTreeSet::from([ - PathBuf::from("one"), - PathBuf::from("two"), - PathBuf::from("three") - ]), - "Merged download tasks should a combined set of layers to skip" - ); - } - - #[test] - fn upload_sync_tasks_merge() { - let metadata_1 = dummy_metadata(Lsn(1)); - let metadata_2 = dummy_metadata(Lsn(2)); - assert!(metadata_2.disk_consistent_lsn() > metadata_1.disk_consistent_lsn()); - - let upload_1 = SyncTask::Upload(SyncData::new( - 2, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("one")]), - uploaded_layers: HashSet::from([PathBuf::from("u_one")]), - metadata: Some(metadata_1), - }, - )); - let upload_2 = SyncTask::Upload(SyncData::new( - 6, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), - uploaded_layers: HashSet::from([PathBuf::from("u_two")]), - metadata: Some(metadata_2.clone()), - }, - )); - - let merged_upload = match upload_1.merge(upload_2) { - SyncTask::Upload(merged_upload) => merged_upload, - wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), - }; - - assert_eq!( - merged_upload.retries, 0, - "Merged task should have its retries counter reset" - ); - - let upload = merged_upload.data; - assert_eq!( - upload.layers_to_upload.into_iter().collect::>(), - BTreeSet::from([ - PathBuf::from("one"), - PathBuf::from("two"), - PathBuf::from("three") - ]), - "Merged upload tasks should a combined set of layers to upload" - ); - - assert_eq!( - upload.uploaded_layers.into_iter().collect::>(), - BTreeSet::from([PathBuf::from("u_one"), PathBuf::from("u_two"),]), - "Merged upload tasks should a combined set of uploaded layers" - ); - - assert_eq!( - upload.metadata, - Some(metadata_2), - "Merged upload tasks should have a metadata with biggest disk_consistent_lsn" - ); - } - - #[test] - fn 
upload_and_download_sync_tasks_merge() { - let download_data = SyncData::new( - 3, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("d_one")]), - }, - ); - - let upload_data = SyncData::new( - 2, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("u_one")]), - uploaded_layers: HashSet::from([PathBuf::from("u_one_2")]), - metadata: Some(dummy_metadata(Lsn(1))), - }, - ); - - let (merged_download, merged_upload) = match SyncTask::Download(download_data.clone()) - .merge(SyncTask::Upload(upload_data.clone())) - { - SyncTask::DownloadAndUpload(merged_download, merged_upload) => { - (merged_download, merged_upload) - } - wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), - }; - - assert_eq!( - merged_download, download_data, - "When upload and dowload are merged, both should be unchanged" - ); - assert_eq!( - merged_upload, upload_data, - "When upload and dowload are merged, both should be unchanged" - ); - } - - #[test] - fn uploaddownload_and_upload_sync_tasks_merge() { - let download_data = SyncData::new( - 3, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("d_one")]), - }, - ); - - let metadata_1 = dummy_metadata(Lsn(5)); - let metadata_2 = dummy_metadata(Lsn(2)); - assert!(metadata_1.disk_consistent_lsn() > metadata_2.disk_consistent_lsn()); - - let upload_download = SyncTask::DownloadAndUpload( - download_data.clone(), - SyncData::new( - 2, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("one")]), - uploaded_layers: HashSet::from([PathBuf::from("u_one")]), - metadata: Some(metadata_1.clone()), - }, - ), - ); - - let new_upload = SyncTask::Upload(SyncData::new( - 6, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), - uploaded_layers: HashSet::from([PathBuf::from("u_two")]), - metadata: Some(metadata_2), - }, - )); - - let (merged_download, merged_upload) = match upload_download.merge(new_upload) { - SyncTask::DownloadAndUpload(merged_download, merged_upload) => { - (merged_download, merged_upload) - } - wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), - }; - - assert_eq!( - merged_download, download_data, - "When uploaddowload and upload tasks are merged, download should be unchanged" - ); - - assert_eq!( - merged_upload.retries, 0, - "Merged task should have its retries counter reset" - ); - let upload = merged_upload.data; - assert_eq!( - upload.layers_to_upload.into_iter().collect::>(), - BTreeSet::from([ - PathBuf::from("one"), - PathBuf::from("two"), - PathBuf::from("three") - ]), - "Merged upload tasks should a combined set of layers to upload" - ); - - assert_eq!( - upload.uploaded_layers.into_iter().collect::>(), - BTreeSet::from([PathBuf::from("u_one"), PathBuf::from("u_two"),]), - "Merged upload tasks should a combined set of uploaded layers" - ); - - assert_eq!( - upload.metadata, - Some(metadata_1), - "Merged upload tasks should have a metadata with biggest disk_consistent_lsn" - ); - } - - #[test] - fn uploaddownload_and_download_sync_tasks_merge() { - let upload_data = SyncData::new( - 22, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("one")]), - uploaded_layers: HashSet::from([PathBuf::from("u_one")]), - metadata: Some(dummy_metadata(Lsn(22))), - }, - ); - - let upload_download = SyncTask::DownloadAndUpload( - SyncData::new( - 2, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("one")]), - }, - ), - upload_data.clone(), - ); - - let new_download = 
SyncTask::Download(SyncData::new( - 6, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), - }, - )); - - let (merged_download, merged_upload) = match upload_download.merge(new_download) { - SyncTask::DownloadAndUpload(merged_download, merged_upload) => { - (merged_download, merged_upload) - } - wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), - }; - - assert_eq!( - merged_upload, upload_data, - "When uploaddowload and download tasks are merged, upload should be unchanged" - ); - - assert_eq!( - merged_download.retries, 0, - "Merged task should have its retries counter reset" - ); - assert_eq!( - merged_download - .data - .layers_to_skip - .into_iter() - .collect::>(), - BTreeSet::from([ - PathBuf::from("one"), - PathBuf::from("two"), - PathBuf::from("three") - ]), - "Merged download tasks should a combined set of layers to skip" - ); - } - - #[test] - fn uploaddownload_sync_tasks_merge() { - let metadata_1 = dummy_metadata(Lsn(1)); - let metadata_2 = dummy_metadata(Lsn(2)); - assert!(metadata_2.disk_consistent_lsn() > metadata_1.disk_consistent_lsn()); - - let upload_download = SyncTask::DownloadAndUpload( - SyncData::new( - 2, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("one")]), - }, - ), - SyncData::new( - 2, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("one")]), - uploaded_layers: HashSet::from([PathBuf::from("u_one")]), - metadata: Some(metadata_1), - }, - ), - ); - let new_upload_download = SyncTask::DownloadAndUpload( - SyncData::new( - 6, - TimelineDownload { - layers_to_skip: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), - }, - ), - SyncData::new( - 6, - TimelineUpload { - layers_to_upload: HashSet::from([PathBuf::from("two"), PathBuf::from("three")]), - uploaded_layers: HashSet::from([PathBuf::from("u_two")]), - metadata: Some(metadata_2.clone()), - }, - ), - ); - - let (merged_download, merged_upload) = match upload_download.merge(new_upload_download) { - SyncTask::DownloadAndUpload(merged_download, merged_upload) => { - (merged_download, merged_upload) - } - wrong_merge_result => panic!("Unexpected merge result: {wrong_merge_result:?}"), - }; - - assert_eq!( - merged_download.retries, 0, - "Merged task should have its retries counter reset" - ); - assert_eq!( - merged_download - .data - .layers_to_skip - .into_iter() - .collect::>(), - BTreeSet::from([ - PathBuf::from("one"), - PathBuf::from("two"), - PathBuf::from("three") - ]), - "Merged download tasks should a combined set of layers to skip" - ); - - assert_eq!( - merged_upload.retries, 0, - "Merged task should have its retries counter reset" - ); - let upload = merged_upload.data; - assert_eq!( - upload.layers_to_upload.into_iter().collect::>(), - BTreeSet::from([ - PathBuf::from("one"), - PathBuf::from("two"), - PathBuf::from("three") - ]), - "Merged upload tasks should a combined set of layers to upload" - ); - - assert_eq!( - upload.uploaded_layers.into_iter().collect::>(), - BTreeSet::from([PathBuf::from("u_one"), PathBuf::from("u_two"),]), - "Merged upload tasks should a combined set of uploaded layers" - ); - - assert_eq!( - upload.metadata, - Some(metadata_2), - "Merged upload tasks should have a metadata with biggest disk_consistent_lsn" - ); - } -} diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index d847e03a24..b52ce8c95f 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -8,7 +8,7 @@ 
use std::{ sync::Arc, }; -use anyhow::{Context, Ok}; +use anyhow::{anyhow, Context, Ok}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use tokio::sync::RwLock; @@ -113,7 +113,7 @@ impl RemoteTimelineIndex { awaits_download: bool, ) -> anyhow::Result<()> { self.timeline_entry_mut(id) - .ok_or_else(|| anyhow::anyhow!("unknown timeline sync {}", id))? + .ok_or_else(|| anyhow!("unknown timeline sync {id}"))? .awaits_download = awaits_download; Ok(()) } From 64a602b8f3b743b543f5b36cad7aa39e82491b0c Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sun, 1 May 2022 12:10:24 +0300 Subject: [PATCH 0261/1022] Delete timeline layers --- pageserver/src/layered_repository.rs | 2 +- .../src/layered_repository/layer_map.rs | 19 +- pageserver/src/storage_sync.rs | 230 ++++++++++++------ pageserver/src/storage_sync/delete.rs | 1 + pageserver/src/storage_sync/download.rs | 5 + pageserver/src/storage_sync/index.rs | 7 + pageserver/src/storage_sync/upload.rs | 5 + 7 files changed, 184 insertions(+), 85 deletions(-) create mode 100644 pageserver/src/storage_sync/delete.rs diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 039bf8d1ed..01c2b961eb 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1881,7 +1881,7 @@ impl LayeredTimeline { for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn)?; for (img_range, last_img) in image_coverage { - let img_lsn = if let Some(ref last_img) = last_img { + let img_lsn = if let Some(last_img) = last_img { last_img.get_lsn_range().end } else { Lsn(0) diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 7a2d0d5bcd..7491294c03 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -132,17 +132,15 @@ impl LayerMap { // this layer contains the requested point in the key/lsn space. // No need to search any further trace!( - "found layer {} for request on {} at {}", + "found layer {} for request on {key} at {end_lsn}", l.filename().display(), - key, - end_lsn ); latest_delta.replace(Arc::clone(l)); break; } // this layer's end LSN is smaller than the requested point. If there's // nothing newer, this is what we need to return. Remember this. - if let Some(ref old_candidate) = latest_delta { + if let Some(old_candidate) = &latest_delta { if l.get_lsn_range().end > old_candidate.get_lsn_range().end { latest_delta.replace(Arc::clone(l)); } @@ -152,10 +150,8 @@ impl LayerMap { } if let Some(l) = latest_delta { trace!( - "found (old) layer {} for request on {} at {}", + "found (old) layer {} for request on {key} at {end_lsn}", l.filename().display(), - key, - end_lsn ); let lsn_floor = std::cmp::max( Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), @@ -166,17 +162,13 @@ impl LayerMap { layer: l, })) } else if let Some(l) = latest_img { - trace!( - "found img layer and no deltas for request on {} at {}", - key, - end_lsn - ); + trace!("found img layer and no deltas for request on {key} at {end_lsn}"); Ok(Some(SearchResult { lsn_floor: latest_img_lsn.unwrap(), layer: l, })) } else { - trace!("no layer found for request on {} at {}", key, end_lsn); + trace!("no layer found for request on {key} at {end_lsn}"); Ok(None) } } @@ -194,7 +186,6 @@ impl LayerMap { /// /// This should be called when the corresponding file on disk has been deleted. 
/// - #[allow(dead_code)] pub fn remove_historic(&mut self, layer: Arc) { let len_before = self.historic_layers.len(); diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index b6091015b9..52e0df3784 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -141,6 +141,7 @@ //! //! When pageserver signals shutdown, current sync task gets finished and the loop exists. +mod delete; mod download; pub mod index; mod upload; @@ -168,6 +169,7 @@ use tokio::{ use tracing::*; use self::{ + delete::delete_timeline_layers, download::{download_timeline_layers, DownloadedTimeline}, index::{IndexPart, RemoteTimeline, RemoteTimelineIndex}, upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, @@ -579,7 +581,7 @@ pub enum SyncTask { /// A certain amount of image files to download. Upload(SyncData), /// Delete remote files. - Delete(SyncData>), + Delete(SyncData), } /// Stores the data to synd and its retries, to evict the tasks failing to frequently. @@ -604,8 +606,8 @@ impl SyncTask { Self::Upload(SyncData::new(0, upload_task)) } - fn delete(layers_to_delete: HashSet) -> Self { - Self::Delete(SyncData::new(0, layers_to_delete)) + fn delete(delete_task: TimelineDelete) -> Self { + Self::Delete(SyncData::new(0, delete_task)) } } @@ -613,7 +615,7 @@ impl SyncTask { struct SyncTaskBatch { upload: Option>, download: Option>, - delete: Option>>, + delete: Option>, } impl SyncTaskBatch { @@ -664,7 +666,15 @@ impl SyncTaskBatch { SyncTask::Delete(new_delete) => match &mut self.delete { Some(batch_delete) => { batch_delete.retries = batch_delete.retries.min(new_delete.retries); - batch_delete.data.extend(new_delete.data.into_iter()); + + batch_delete + .data + .layers_to_delete + .extend(new_delete.data.layers_to_delete.into_iter()); + batch_delete + .data + .deleted_layers + .extend(new_delete.data.deleted_layers.into_iter()); } None => self.delete = Some(new_delete), }, @@ -694,6 +704,13 @@ pub struct TimelineDownload { layers_to_skip: HashSet, } +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TimelineDelete { + layers_to_delete: HashSet, + deleted_layers: HashSet, + deletion_registered: bool, +} + /// Adds the new checkpoint files as an upload sync task to the queue. /// On task failure, it gets retried again from the start a number of times. 
/// @@ -733,7 +750,11 @@ pub fn schedule_layer_delete( tenant_id, timeline_id, }, - SyncTask::delete(layers_to_delete), + SyncTask::delete(TimelineDelete { + layers_to_delete, + deleted_layers: HashSet::new(), + deletion_registered: false, + }), ) { warn!("Could not send deletion task for tenant {tenant_id}, timeline {timeline_id}") } else { @@ -951,7 +972,7 @@ where let upload_data = batch.upload.clone(); let download_data = batch.download.clone(); - let ((), status_update) = tokio::join!( + let (upload_result, status_update) = tokio::join!( async { if let Some(upload_data) = upload_data { match validate_task_retries(upload_data, max_sync_errors) @@ -969,6 +990,7 @@ where "upload", ) .await; + return Some(()); } ControlFlow::Break(failed_upload_data) => { if let Err(e) = update_remote_data( @@ -976,8 +998,10 @@ where storage.as_ref(), &index, sync_id, - &failed_upload_data.data, - true, + RemoteDataUpdate::Upload { + uploaded_data: failed_upload_data.data, + upload_failed: true, + }, ) .await { @@ -986,6 +1010,7 @@ where } } } + None } .instrument(info_span!("upload_timeline_data")), async { @@ -1029,7 +1054,6 @@ where delete_timeline_data( conf, (storage.as_ref(), &index), - current_remote_timeline.as_ref(), sync_id, new_delete_data, sync_start, @@ -1038,7 +1062,19 @@ where .instrument(info_span!("delete_timeline_data")) .await; } - ControlFlow::Break(_) => {} + ControlFlow::Break(failed_delete_data) => { + if let Err(e) = update_remote_data( + conf, + storage.as_ref(), + &index, + sync_id, + RemoteDataUpdate::Delete(&failed_delete_data.data.deleted_layers), + ) + .await + { + error!("Failed to update remote timeline {sync_id}: {e:?}"); + } + } } } @@ -1072,22 +1108,19 @@ where if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) { error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}"); } - None } DownloadedTimeline::FailedAndRescheduled => { register_sync_status(sync_start, task_name, Some(false)); - None } DownloadedTimeline::Successful(mut download_data) => { match update_local_metadata(conf, sync_id, current_remote_timeline).await { Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) { Ok(()) => { register_sync_status(sync_start, task_name, Some(true)); - Some(TimelineSyncStatusUpdate::Downloaded) + return Some(TimelineSyncStatusUpdate::Downloaded); } Err(e) => { error!("Timeline {sync_id} was expected to be in the remote index after a sucessful download, but it's absent: {e:?}"); - None } }, Err(e) => { @@ -1095,11 +1128,12 @@ where download_data.retries += 1; sync_queue::push(sync_id, SyncTask::Download(download_data)); register_sync_status(sync_start, task_name, Some(false)); - None } } } } + + None } async fn update_local_metadata( @@ -1164,28 +1198,39 @@ async fn update_local_metadata( } async fn delete_timeline_data( - conf: &PageServerConf, - index: (&S, &RemoteIndex), - as_ref: Option<&RemoteTimeline>, + conf: &'static PageServerConf, + (storage, index): (&S, &RemoteIndex), sync_id: ZTenantTimelineId, - new_delete_data: SyncData>, + mut new_delete_data: SyncData, sync_start: Instant, task_name: &str, -) -> Option<()> -where +) where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { - // match update_remote_data(conf, storage, index, sync_id, &uploaded_data.data, false).await { - // Ok(()) => register_sync_status(sync_start, task_name, Some(true)), - // Err(e) => { - // error!("Failed to update remote timeline {sync_id}: {e:?}"); - // 
uploaded_data.retries += 1; - // sync_queue::push(sync_id, SyncTask::Upload(uploaded_data)); - // register_sync_status(sync_start, task_name, Some(false)); - // } - // } - todo!("TODO kb") + let timeline_delete = &mut new_delete_data.data; + + if !timeline_delete.deletion_registered { + if let Err(e) = update_remote_data( + conf, + storage, + index, + sync_id, + RemoteDataUpdate::Delete(&timeline_delete.layers_to_delete), + ) + .await + { + error!("Failed to update remote timeline {sync_id}: {e:?}"); + new_delete_data.retries += 1; + sync_queue::push(sync_id, SyncTask::Delete(new_delete_data)); + register_sync_status(sync_start, task_name, Some(false)); + return; + } + } + timeline_delete.deletion_registered = true; + + let sync_status = delete_timeline_layers(storage, sync_id, new_delete_data).await; + register_sync_status(sync_start, task_name, Some(sync_status)); } async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result { @@ -1205,8 +1250,7 @@ async fn upload_timeline_data( new_upload_data: SyncData, sync_start: Instant, task_name: &str, -) -> Option<()> -where +) where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { @@ -1216,7 +1260,7 @@ where { UploadedTimeline::FailedAndRescheduled => { register_sync_status(sync_start, task_name, Some(false)); - return None; + return; } UploadedTimeline::Successful(upload_data) => upload_data, UploadedTimeline::SuccessfulAfterLocalFsUpdate(mut outdated_upload_data) => { @@ -1233,37 +1277,54 @@ where outdated_upload_data.retries += 1; sync_queue::push(sync_id, SyncTask::Upload(outdated_upload_data)); register_sync_status(sync_start, task_name, Some(false)); - return None; + return; } }; + outdated_upload_data.data.metadata = Some(local_metadata); } outdated_upload_data } }; - match update_remote_data(conf, storage, index, sync_id, &uploaded_data.data, false).await { + match update_remote_data( + conf, + storage, + index, + sync_id, + RemoteDataUpdate::Upload { + uploaded_data: uploaded_data.data.clone(), + upload_failed: false, + }, + ) + .await + { Ok(()) => { register_sync_status(sync_start, task_name, Some(true)); - Some(()) } Err(e) => { error!("Failed to update remote timeline {sync_id}: {e:?}"); uploaded_data.retries += 1; sync_queue::push(sync_id, SyncTask::Upload(uploaded_data)); register_sync_status(sync_start, task_name, Some(false)); - None } } } +enum RemoteDataUpdate<'a> { + Upload { + uploaded_data: TimelineUpload, + upload_failed: bool, + }, + Delete(&'a HashSet), +} + async fn update_remote_data( conf: &'static PageServerConf, storage: &S, index: &RemoteIndex, sync_id: ZTenantTimelineId, - uploaded_data: &TimelineUpload, - upload_failed: bool, + update: RemoteDataUpdate<'_>, ) -> anyhow::Result<()> where P: Debug + Send + Sync + 'static, @@ -1275,40 +1336,59 @@ where match index_accessor.timeline_entry_mut(&sync_id) { Some(existing_entry) => { - if let Some(new_metadata) = uploaded_data.metadata.as_ref() { - if existing_entry.metadata.disk_consistent_lsn() - < new_metadata.disk_consistent_lsn() - { - existing_entry.metadata = new_metadata.clone(); + match update { + RemoteDataUpdate::Upload { + uploaded_data, + upload_failed, + } => { + if let Some(new_metadata) = uploaded_data.metadata.as_ref() { + if existing_entry.metadata.disk_consistent_lsn() + < new_metadata.disk_consistent_lsn() + { + existing_entry.metadata = new_metadata.clone(); + } + } + if upload_failed { + existing_entry.add_upload_failures( + uploaded_data.layers_to_upload.iter().cloned(), + ); + } else { + existing_entry + 
.add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned()); + } + } + RemoteDataUpdate::Delete(layers_to_remove) => { + existing_entry.remove_layers(layers_to_remove) } - } - - if upload_failed { - existing_entry - .add_upload_failures(uploaded_data.layers_to_upload.iter().cloned()); - } else { - existing_entry - .add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned()); } existing_entry.clone() } - None => { - let new_metadata = match uploaded_data.metadata.as_ref() { - Some(new_metadata) => new_metadata, - None => bail!("For timeline {sync_id} upload, there's no upload metadata and no remote index entry, cannot create a new one"), - }; - let mut new_remote_timeline = RemoteTimeline::new(new_metadata.clone()); - if upload_failed { - new_remote_timeline - .add_upload_failures(uploaded_data.layers_to_upload.iter().cloned()); - } else { - new_remote_timeline - .add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned()); - } + None => match update { + RemoteDataUpdate::Upload { + uploaded_data, + upload_failed, + } => { + let new_metadata = match uploaded_data.metadata.as_ref() { + Some(new_metadata) => new_metadata, + None => bail!("For timeline {sync_id} upload, there's no upload metadata and no remote index entry, cannot create a new one"), + }; + let mut new_remote_timeline = RemoteTimeline::new(new_metadata.clone()); + if upload_failed { + new_remote_timeline + .add_upload_failures(uploaded_data.layers_to_upload.iter().cloned()); + } else { + new_remote_timeline + .add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned()); + } - index_accessor.add_timeline_entry(sync_id, new_remote_timeline.clone()); - new_remote_timeline - } + index_accessor.add_timeline_entry(sync_id, new_remote_timeline.clone()); + new_remote_timeline + } + RemoteDataUpdate::Delete(_) => { + warn!("No remote index entry for timeline {sync_id}, skipping deletion"); + return Ok(()); + } + }, } }; @@ -1541,3 +1621,13 @@ mod test_utils { TimelineMetadata::new(disk_consistent_lsn, None, None, Lsn(0), Lsn(0), Lsn(0)) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn batching_tests() { + todo!("TODO kb") + } +} diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/pageserver/src/storage_sync/delete.rs @@ -0,0 +1 @@ + diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index dca08bca5d..3cd6de57c7 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -120,6 +120,11 @@ where debug!("Layers to download: {layers_to_download:?}"); info!("Downloading {} timeline layers", layers_to_download.len()); + if layers_to_download.is_empty() { + info!("No layers to download after filtering, skipping"); + return DownloadedTimeline::Successful(download_data); + } + let mut download_tasks = layers_to_download .into_iter() .map(|layer_desination_path| async move { diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index b52ce8c95f..7764a810bc 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -147,6 +147,13 @@ impl RemoteTimeline { self.missing_layers.extend(upload_failures.into_iter()); } + pub fn remove_layers(&mut self, layers_to_remove: &HashSet) { + self.timeline_layers + .retain(|layer| !layers_to_remove.contains(layer)); + self.missing_layers + .retain(|layer| !layers_to_remove.contains(layer)); + } + /// Lists 
all layer files in the given remote timeline. Omits the metadata file. pub fn stored_files(&self) -> &HashSet { &self.timeline_layers diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 55089df7bc..1e2594ac70 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -106,6 +106,11 @@ where .cloned() .collect::>(); + if layers_to_upload.is_empty() { + info!("No layers to upload after filtering, aborting"); + return UploadedTimeline::Successful(upload_data); + } + debug!("Layers to upload: {layers_to_upload:?}"); info!( "Uploading {} timeline layers, new lsn: {new_upload_lsn:?}", From 0a7735a65676737bb97440511ccd742bfdce68dd Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sun, 1 May 2022 19:07:17 +0300 Subject: [PATCH 0262/1022] Rework remote storage sync queue, general refactoring --- .../src/remote_storage/storage_sync/delete.rs | 223 ++++++ pageserver/src/storage_sync.rs | 725 ++++++++++++------ pageserver/src/storage_sync/delete.rs | 227 ++++++ pageserver/src/storage_sync/download.rs | 30 +- pageserver/src/storage_sync/upload.rs | 47 +- 5 files changed, 974 insertions(+), 278 deletions(-) create mode 100644 pageserver/src/remote_storage/storage_sync/delete.rs diff --git a/pageserver/src/remote_storage/storage_sync/delete.rs b/pageserver/src/remote_storage/storage_sync/delete.rs new file mode 100644 index 0000000000..00e7c85e35 --- /dev/null +++ b/pageserver/src/remote_storage/storage_sync/delete.rs @@ -0,0 +1,223 @@ +//! Timeline synchrnonization logic to delete a bulk of timeline's remote files from the remote storage. + +use anyhow::Context; +use futures::stream::{FuturesUnordered, StreamExt}; +use tracing::{debug, error, info}; +use utils::zid::ZTenantTimelineId; + +use crate::remote_storage::{ + storage_sync::{SyncQueue, SyncTask}, + RemoteStorage, +}; + +use super::{LayersDeletion, SyncData}; + +/// Attempts to remove the timleline layers from the remote storage. +/// If the task had not adjusted the metadata before, the deletion will fail. 
+pub(super) async fn delete_timeline_layers<'a, P, S>( + storage: &'a S, + sync_queue: &SyncQueue, + sync_id: ZTenantTimelineId, + mut delete_data: SyncData, +) -> bool +where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + if !delete_data.data.deletion_registered { + error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing"); + delete_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + return false; + } + + if delete_data.data.layers_to_delete.is_empty() { + info!("No layers to delete, skipping"); + return true; + } + + let layers_to_delete = delete_data + .data + .layers_to_delete + .drain() + .collect::>(); + debug!("Layers to delete: {layers_to_delete:?}"); + info!("Deleting {} timeline layers", layers_to_delete.len()); + + let mut delete_tasks = layers_to_delete + .into_iter() + .map(|local_layer_path| async { + let storage_path = match storage.storage_path(&local_layer_path).with_context(|| { + format!( + "Failed to get the layer storage path for local path '{}'", + local_layer_path.display() + ) + }) { + Ok(path) => path, + Err(e) => return Err((e, local_layer_path)), + }; + + match storage.delete(&storage_path).await.with_context(|| { + format!( + "Failed to delete remote layer from storage at '{:?}'", + storage_path + ) + }) { + Ok(()) => Ok(local_layer_path), + Err(e) => Err((e, local_layer_path)), + } + }) + .collect::>(); + + let mut errored = false; + while let Some(deletion_result) = delete_tasks.next().await { + match deletion_result { + Ok(local_layer_path) => { + debug!( + "Successfully deleted layer {} for timeline {sync_id}", + local_layer_path.display() + ); + delete_data.data.deleted_layers.insert(local_layer_path); + } + Err((e, local_layer_path)) => { + errored = true; + error!( + "Failed to delete layer {} for timeline {sync_id}: {e:?}", + local_layer_path.display() + ); + delete_data.data.layers_to_delete.insert(local_layer_path); + } + } + } + + if errored { + debug!("Reenqueuing failed delete task for timeline {sync_id}"); + delete_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + } + errored +} + +#[cfg(test)] +mod tests { + use std::{collections::HashSet, num::NonZeroUsize}; + + use itertools::Itertools; + use tempfile::tempdir; + use tokio::fs; + use utils::lsn::Lsn; + + use crate::{ + remote_storage::{ + storage_sync::test_utils::{create_local_timeline, dummy_metadata}, + LocalFs, + }, + repository::repo_harness::{RepoHarness, TIMELINE_ID}, + }; + + use super::*; + + #[tokio::test] + async fn delete_timeline_negative() -> anyhow::Result<()> { + let harness = RepoHarness::create("delete_timeline_negative")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; + + let deleted = delete_timeline_layers( + &storage, + &sync_queue, + sync_id, + SyncData { + retries: 1, + data: LayersDeletion { + deleted_layers: HashSet::new(), + layers_to_delete: HashSet::new(), + deletion_registered: false, + }, + }, + ) + .await; + + assert!( + !deleted, + "Should not start the deletion for task with delete metadata unregistered" + ); + + Ok(()) + } + + #[tokio::test] + async fn delete_timeline() -> anyhow::Result<()> { + let harness = RepoHarness::create("delete_timeline")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + + let 
sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let layer_files = ["a", "b", "c", "d"]; + let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; + let current_retries = 3; + let metadata = dummy_metadata(Lsn(0x30)); + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + let timeline_upload = + create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; + for local_path in timeline_upload.layers_to_upload { + let remote_path = storage.storage_path(&local_path)?; + let remote_parent_dir = remote_path.parent().unwrap(); + if !remote_parent_dir.exists() { + fs::create_dir_all(&remote_parent_dir).await?; + } + fs::copy(&local_path, &remote_path).await?; + } + assert_eq!( + storage + .list() + .await? + .into_iter() + .map(|remote_path| storage.local_path(&remote_path).unwrap()) + .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) + .sorted() + .collect::>(), + layer_files + .iter() + .map(|layer_str| layer_str.to_string()) + .sorted() + .collect::>(), + "Expect to have all layer files remotely before deletion" + ); + + let deleted = delete_timeline_layers( + &storage, + &sync_queue, + sync_id, + SyncData { + retries: current_retries, + data: LayersDeletion { + deleted_layers: HashSet::new(), + layers_to_delete: HashSet::from([ + local_timeline_path.join("a"), + local_timeline_path.join("c"), + local_timeline_path.join("something_different"), + ]), + deletion_registered: true, + }, + }, + ) + .await; + assert!(deleted, "Should be able to delete timeline files"); + + assert_eq!( + storage + .list() + .await? + .into_iter() + .map(|remote_path| storage.local_path(&remote_path).unwrap()) + .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) + .sorted() + .collect::>(), + vec!["b".to_string(), "d".to_string()], + "Expect to have only non-deleted files remotely" + ); + + Ok(()) + } +} diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 52e0df3784..b8c6f7fdab 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -147,23 +147,27 @@ pub mod index; mod upload; use std::{ - collections::{HashMap, HashSet, VecDeque}, + collections::{hash_map, HashMap, HashSet, VecDeque}, ffi::OsStr, fmt::Debug, num::{NonZeroU32, NonZeroUsize}, ops::ControlFlow, path::{Path, PathBuf}, - sync::Arc, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, }; -use anyhow::{bail, Context}; +use anyhow::{anyhow, bail, Context}; use futures::stream::{FuturesUnordered, StreamExt}; use lazy_static::lazy_static; +use once_cell::sync::OnceCell; use remote_storage::{GenericRemoteStorage, RemoteStorage}; use tokio::{ fs, runtime::Runtime, - sync::mpsc::{self, UnboundedReceiver}, + sync::mpsc::{self, error::TryRecvError, UnboundedReceiver, UnboundedSender}, time::{Duration, Instant}, }; use tracing::*; @@ -221,6 +225,8 @@ lazy_static! { .expect("failed to register pageserver image sync time histogram vec"); } +static SYNC_QUEUE: OnceCell = OnceCell::new(); + /// A timeline status to share with pageserver's sync counterpart, /// after comparing local and remote timeline state. #[derive(Clone, Copy, Debug)] @@ -449,144 +455,131 @@ fn collect_timeline_files( /// Wraps mpsc channel bits around into a queue interface. /// mpsc approach was picked to allow blocking the sync loop if no tasks are present, to avoid meaningless spinning. 
-mod sync_queue { - use std::{ - collections::{HashMap, HashSet}, - num::NonZeroUsize, - ops::ControlFlow, - sync::atomic::{AtomicUsize, Ordering}, - }; +struct SyncQueue { + len: AtomicUsize, + max_timelines_per_batch: NonZeroUsize, + sender: UnboundedSender<(ZTenantTimelineId, SyncTask)>, +} - use anyhow::anyhow; - use once_cell::sync::OnceCell; - use tokio::sync::mpsc::{error::TryRecvError, UnboundedReceiver, UnboundedSender}; - use tracing::{debug, warn}; - - use super::{SyncTask, SyncTaskBatch}; - use utils::zid::ZTenantTimelineId; - - static SENDER: OnceCell> = OnceCell::new(); - static LENGTH: AtomicUsize = AtomicUsize::new(0); - - /// Initializes the queue with the given sender channel that is used to put the tasks into later. - /// Errors if called more than once. - pub fn init(sender: UnboundedSender<(ZTenantTimelineId, SyncTask)>) -> anyhow::Result<()> { - SENDER - .set(sender) - .map_err(|_sender| anyhow!("sync queue was already initialized"))?; - Ok(()) +impl SyncQueue { + fn new( + max_timelines_per_batch: NonZeroUsize, + ) -> (Self, UnboundedReceiver<(ZTenantTimelineId, SyncTask)>) { + let (sender, receiver) = mpsc::unbounded_channel(); + ( + Self { + len: AtomicUsize::new(0), + max_timelines_per_batch, + sender, + }, + receiver, + ) } - /// Adds a new task to the queue, if the queue was initialized, returning `true` on success. - /// On any error, or if the queue was not initialized, the task gets dropped (not scheduled) and `false` is returned. - pub fn push(sync_id: ZTenantTimelineId, new_task: SyncTask) -> bool { - if let Some(sender) = SENDER.get() { - match sender.send((sync_id, new_task)) { - Err(e) => { - warn!("Failed to enqueue a sync task: the receiver is dropped: {e}"); - false - } - Ok(()) => { - LENGTH.fetch_add(1, Ordering::Relaxed); - true - } + fn push(&self, sync_id: ZTenantTimelineId, new_task: SyncTask) { + match self.sender.send((sync_id, new_task)) { + Ok(()) => { + self.len.fetch_add(1, Ordering::Relaxed); + } + Err(e) => { + error!("failed to push sync task to queue: {e}"); } - } else { - warn!("Failed to enqueue a sync task: the sender is not initialized"); - false } } - /// Polls a new task from the queue, using its receiver counterpart. - /// Does not block if the queue is empty, returning [`None`] instead. - /// Needed to correctly track the queue length. - async fn next_task( - receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - ) -> Option<(ZTenantTimelineId, SyncTask)> { - let task = receiver.recv().await; - if task.is_some() { - LENGTH.fetch_sub(1, Ordering::Relaxed); - } - task - } - - /// Fetches a task batch, not bigger than the given limit. - /// Not blocking, can return fewer tasks if the queue does not contain enough. - /// Batch tasks are split by timelines, with all related tasks merged into one (download/upload) - /// or two (download and upload, if both were found in the queue during batch construction). - pub(super) async fn next_task_batch( - receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - max_timelines_to_sync: NonZeroUsize, - ) -> ControlFlow<(), HashMap> { + /// Fetches a task batch, getting every existing entry from the queue, grouping by timelines and merging the tasks for every timeline. + /// A timeline has to care to not to delete cetain layers from the remote storage before the corresponding uploads happen. + /// Otherwise, due to "immutable" nature of the layers, the order of their deletion/uploading/downloading does not matter. 
+ /// Hence, we merge the layers together into single task per timeline and run those concurrently (with the deletion happening only after successful uploading). + async fn next_task_batch( + &self, + // The queue is based on two ends of a channel and has to be accessible statically without blocking for submissions from the sync code. + // Its receiver needs &mut, so we cannot place it in the same container with the other end and get both static and non-blocking access. + // Hence toss this around to use it from the sync loop directly as &mut. + sync_queue_receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, + ) -> HashMap { // request the first task in blocking fashion to do less meaningless work - let (first_sync_id, first_task) = if let Some(first_task) = next_task(receiver).await { + let (first_sync_id, first_task) = if let Some(first_task) = sync_queue_receiver.recv().await + { + self.len.fetch_sub(1, Ordering::Relaxed); first_task } else { - debug!("Queue sender part was dropped, aborting"); - return ControlFlow::Break(()); + info!("Queue sender part was dropped, aborting"); + return HashMap::new(); }; + let mut timelines_left_to_batch = self.max_timelines_per_batch.get() - 1; + let mut tasks_to_process = self.len(); - let max_timelines_to_sync = max_timelines_to_sync.get(); - let mut batched_timelines = HashSet::with_capacity(max_timelines_to_sync); - batched_timelines.insert(first_sync_id.timeline_id); + let mut batches = HashMap::with_capacity(tasks_to_process); + batches.insert(first_sync_id, SyncTaskBatch::new(first_task)); - let mut tasks = HashMap::new(); - tasks.insert(first_sync_id, SyncTaskBatch::new(first_task)); + let mut tasks_to_reenqueue = Vec::with_capacity(tasks_to_process); - loop { - if batched_timelines.len() >= max_timelines_to_sync { - debug!( - "Filled a full task batch with {} timeline sync operations", - batched_timelines.len() - ); - break; - } - - match receiver.try_recv() { + // Pull the queue channel until we get all tasks that were there at the beginning of the batch construction. + // Yet do not put all timelines in the batch, but only the first ones that fit the timeline limit. + // Still merge the rest of the pulled tasks and reenqueue those for later. 
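+ // For example, with a large enough `max_timelines_per_batch`, a queue holding
+ // (timeline_A, Upload), (timeline_B, Download), (timeline_A, Delete) is drained into
+ // two batch entries: timeline_A -> {upload + delete} and timeline_B -> {download}.
+ // Tasks for timelines that do not fit into the current batch are pushed back onto
+ // the queue and get picked up by one of the following batches.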
+ while tasks_to_process > 0 { + match sync_queue_receiver.try_recv() { Ok((sync_id, new_task)) => { - LENGTH.fetch_sub(1, Ordering::Relaxed); - tasks.entry(sync_id).or_default().add(new_task); - batched_timelines.insert(sync_id.timeline_id); + self.len.fetch_sub(1, Ordering::Relaxed); + tasks_to_process -= 1; + + match batches.entry(sync_id) { + hash_map::Entry::Occupied(mut v) => v.get_mut().add(new_task), + hash_map::Entry::Vacant(v) => { + timelines_left_to_batch = timelines_left_to_batch.saturating_sub(1); + if timelines_left_to_batch == 0 { + tasks_to_reenqueue.push((sync_id, new_task)); + } else { + v.insert(SyncTaskBatch::new(new_task)); + } + } + } } Err(TryRecvError::Disconnected) => { debug!("Sender disconnected, batch collection aborted"); break; } Err(TryRecvError::Empty) => { - debug!( - "No more data in the sync queue, task batch is not full, length: {}, max allowed size: {max_timelines_to_sync}", - batched_timelines.len() - ); + debug!("No more data in the sync queue, task batch is not full"); break; } } } - ControlFlow::Continue(tasks) + debug!( + "Batched {} timelines, reenqueuing {}", + batches.len(), + tasks_to_reenqueue.len() + ); + for (id, task) in tasks_to_reenqueue { + self.push(id, task); + } + + batches } - /// Length of the queue, assuming that all receiver counterparts were only called using the queue api. - pub fn len() -> usize { - LENGTH.load(Ordering::Relaxed) + fn len(&self) -> usize { + self.len.load(Ordering::Relaxed) } } /// A task to run in the async download/upload loop. /// Limited by the number of retries, after certain threshold the failing task gets evicted and the timeline disabled. -#[derive(Debug)] -pub enum SyncTask { +#[derive(Debug, Clone)] +enum SyncTask { /// A checkpoint outcome with possible local file updates that need actualization in the remote storage. /// Not necessary more fresh than the one already uploaded. - Download(SyncData), + Download(SyncData), /// A certain amount of image files to download. - Upload(SyncData), + Upload(SyncData), /// Delete remote files. - Delete(SyncData), + Delete(SyncData), } /// Stores the data to synd and its retries, to evict the tasks failing to frequently. #[derive(Debug, Clone, PartialEq, Eq)] -pub struct SyncData { +struct SyncData { retries: u32, data: T, } @@ -598,24 +591,24 @@ impl SyncData { } impl SyncTask { - fn download(download_task: TimelineDownload) -> Self { + fn download(download_task: LayersDownload) -> Self { Self::Download(SyncData::new(0, download_task)) } - fn upload(upload_task: TimelineUpload) -> Self { + fn upload(upload_task: LayersUpload) -> Self { Self::Upload(SyncData::new(0, upload_task)) } - fn delete(delete_task: TimelineDelete) -> Self { + fn delete(delete_task: LayersDeletion) -> Self { Self::Delete(SyncData::new(0, delete_task)) } } -#[derive(Debug, Default)] +#[derive(Debug, Default, PartialEq, Eq)] struct SyncTaskBatch { - upload: Option>, - download: Option>, - delete: Option>, + upload: Option>, + download: Option>, + delete: Option>, } impl SyncTaskBatch { @@ -666,6 +659,31 @@ impl SyncTaskBatch { SyncTask::Delete(new_delete) => match &mut self.delete { Some(batch_delete) => { batch_delete.retries = batch_delete.retries.min(new_delete.retries); + // Need to reregister deletions, but it's ok to register already deleted files once again, they will be skipped. 
+ batch_delete.data.deletion_registered = batch_delete + .data + .deletion_registered + .min(new_delete.data.deletion_registered); + + // Do not download and upload the layers getting removed in the same batch + if let Some(batch_download) = &mut self.download { + batch_download + .data + .layers_to_skip + .extend(new_delete.data.layers_to_delete.iter().cloned()); + batch_download + .data + .layers_to_skip + .extend(new_delete.data.deleted_layers.iter().cloned()); + } + if let Some(batch_upload) = &mut self.upload { + let not_deleted = |layer: &PathBuf| { + !new_delete.data.layers_to_delete.contains(layer) + && !new_delete.data.deleted_layers.contains(layer) + }; + batch_upload.data.layers_to_upload.retain(not_deleted); + batch_upload.data.uploaded_layers.retain(not_deleted); + } batch_delete .data @@ -685,7 +703,7 @@ impl SyncTaskBatch { /// Local timeline files for upload, appeared after the new checkpoint. /// Current checkpoint design assumes new files are added only, no deletions or amendment happens. #[derive(Debug, Clone, PartialEq, Eq)] -pub struct TimelineUpload { +struct LayersUpload { /// Layer file path in the pageserver workdir, that were added for the corresponding checkpoint. layers_to_upload: HashSet, /// Already uploaded layers. Used to store the data about the uploads between task retries @@ -700,14 +718,19 @@ pub struct TimelineUpload { /// without using the remote index or any other ways to list the remote timleine files. /// Skips the files that are already downloaded. #[derive(Debug, Clone, PartialEq, Eq)] -pub struct TimelineDownload { +struct LayersDownload { layers_to_skip: HashSet, } #[derive(Debug, Clone, PartialEq, Eq)] -pub struct TimelineDelete { +struct LayersDeletion { layers_to_delete: HashSet, deleted_layers: HashSet, + /// Pageserver uses [`IndexPart`] as a source of truth for listing the files per timeline. + /// This object gets serialized and placed into the remote storage. + /// So if we manage to update pageserver's [`RemoteIndex`] and update the index part on the remote storage, + /// the corresponding files on S3 won't exist for pageserver albeit being physically present on that remote storage still. + /// Then all that's left is to remove the files from the remote storage, without concerns about consistency. deletion_registered: bool, } @@ -721,45 +744,55 @@ pub fn schedule_layer_upload( layers_to_upload: HashSet, metadata: Option, ) { - debug!("Scheduling layer upload for tenant {tenant_id}, timeline {timeline_id}, to upload: {layers_to_upload:?}"); - if !sync_queue::push( + let sync_queue = match SYNC_QUEUE.get() { + Some(queue) => queue, + None => { + warn!("Could not send an upload task for tenant {tenant_id}, timeline {timeline_id}"); + return; + } + }; + sync_queue.push( ZTenantTimelineId { tenant_id, timeline_id, }, - SyncTask::upload(TimelineUpload { + SyncTask::upload(LayersUpload { layers_to_upload, uploaded_layers: HashSet::new(), metadata, }), - ) { - warn!("Could not send an upload task for tenant {tenant_id}, timeline {timeline_id}") - } else { - debug!("Upload task for tenant {tenant_id}, timeline {timeline_id} sent") - } + ); + debug!("Upload task for tenant {tenant_id}, timeline {timeline_id} sent") } +/// Adds the new files to delete as a deletion task to the queue. +/// On task failure, it gets retried again from the start a number of times. +/// +/// Ensure that the loop is started otherwise the task is never processed. 
pub fn schedule_layer_delete( tenant_id: ZTenantId, timeline_id: ZTimelineId, layers_to_delete: HashSet, ) { - debug!("Scheduling layer deletion for tenant {tenant_id}, timeline {timeline_id}, to delete: {layers_to_delete:?}"); - if !sync_queue::push( + let sync_queue = match SYNC_QUEUE.get() { + Some(queue) => queue, + None => { + warn!("Could not send deletion task for tenant {tenant_id}, timeline {timeline_id}"); + return; + } + }; + sync_queue.push( ZTenantTimelineId { tenant_id, timeline_id, }, - SyncTask::delete(TimelineDelete { + SyncTask::delete(LayersDeletion { layers_to_delete, deleted_layers: HashSet::new(), deletion_registered: false, }), - ) { - warn!("Could not send deletion task for tenant {tenant_id}, timeline {timeline_id}") - } else { - debug!("Deletion task for tenant {tenant_id}, timeline {timeline_id} sent") - } + ); + debug!("Deletion task for tenant {tenant_id}, timeline {timeline_id} sent") } /// Requests the download of the entire timeline for a given tenant. @@ -771,15 +804,23 @@ pub fn schedule_layer_delete( /// Ensure that the loop is started otherwise the task is never processed. pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { debug!("Scheduling layer download for tenant {tenant_id}, timeline {timeline_id}"); - sync_queue::push( + let sync_queue = match SYNC_QUEUE.get() { + Some(queue) => queue, + None => { + warn!("Could not send download task for tenant {tenant_id}, timeline {timeline_id}"); + return; + } + }; + sync_queue.push( ZTenantTimelineId { tenant_id, timeline_id, }, - SyncTask::download(TimelineDownload { + SyncTask::download(LayersDownload { layers_to_skip: HashSet::new(), }), ); + debug!("Download task for tenant {tenant_id}, timeline {timeline_id} sent") } /// Uses a remote storage given to start the storage sync loop. 
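As a side note, here is a hypothetical caller-side sketch of how these scheduling entry points are meant to be combined after a checkpoint. The `schedule_*` functions are the ones shown in the hunks above; the caller itself, its argument names, and the `PathBuf`/`TimelineMetadata` type parameters (elided in the signatures as rendered here) are assumptions for illustration only:

```
use std::{collections::HashSet, path::PathBuf};

// Hypothetical glue code, not part of this patch: the checkpointer hands the
// freshly written layers to the upload queue, GC hands the obsolete ones to
// the delete queue.
fn sync_after_checkpoint(
    tenant_id: ZTenantId,
    timeline_id: ZTimelineId,
    new_layers: HashSet<PathBuf>,
    new_metadata: TimelineMetadata,
    obsolete_layers: HashSet<PathBuf>,
) {
    // Upload the new layers together with the updated metadata file.
    schedule_layer_upload(tenant_id, timeline_id, new_layers, Some(new_metadata));
    // Queue the stale layers for deletion; if both tasks land in the same
    // batch, the sync loop skips and re-enqueues the delete whenever the
    // upload part of that batch fails.
    schedule_layer_delete(tenant_id, timeline_id, obsolete_layers);
}
```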
@@ -795,8 +836,14 @@ where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { - let (sender, receiver) = mpsc::unbounded_channel(); - sync_queue::init(sender)?; + let (sync_queue, sync_queue_receiver) = SyncQueue::new(max_concurrent_timelines_sync); + SYNC_QUEUE + .set(sync_queue) + .map_err(|_queue| anyhow!("Could not initialize sync queue"))?; + let sync_queue = match SYNC_QUEUE.get() { + Some(queue) => queue, + None => bail!("Could not get sync queue during the sync loop step, aborting"), + }; let runtime = tokio::runtime::Builder::new_current_thread() .enable_all() @@ -813,6 +860,7 @@ where let local_timeline_init_statuses = schedule_first_sync_tasks( &mut runtime.block_on(remote_index.write()), + sync_queue, local_timeline_files, ); @@ -827,10 +875,12 @@ where storage_sync_loop( runtime, conf, - receiver, - Arc::new(storage), - loop_index, - max_concurrent_timelines_sync, + ( + Arc::new(storage), + loop_index, + sync_queue, + sync_queue_receiver, + ), max_sync_errors, ); Ok(()) @@ -843,14 +893,15 @@ where }) } -#[allow(clippy::too_many_arguments)] fn storage_sync_loop( runtime: Runtime, conf: &'static PageServerConf, - mut receiver: UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - storage: Arc, - index: RemoteIndex, - max_concurrent_timelines_sync: NonZeroUsize, + (storage, index, sync_queue, mut sync_queue_receiver): ( + Arc, + RemoteIndex, + &SyncQueue, + UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, + ), max_sync_errors: NonZeroU32, ) where P: Debug + Send + Sync + 'static, @@ -859,15 +910,12 @@ fn storage_sync_loop( info!("Starting remote storage sync loop"); loop { let loop_index = index.clone(); - let storage = Arc::clone(&storage); + let loop_storage = Arc::clone(&storage); let loop_step = runtime.block_on(async { tokio::select! 
{ step = loop_step( conf, - &mut receiver, - storage, - loop_index, - max_concurrent_timelines_sync, + (loop_storage, loop_index, sync_queue, &mut sync_queue_receiver), max_sync_errors, ) .instrument(info_span!("storage_sync_loop_step")) => step, @@ -898,23 +946,21 @@ fn storage_sync_loop( async fn loop_step( conf: &'static PageServerConf, - receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - storage: Arc, - index: RemoteIndex, - max_concurrent_timelines_sync: NonZeroUsize, + (storage, index, sync_queue, sync_queue_receiver): ( + Arc, + RemoteIndex, + &SyncQueue, + &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, + ), max_sync_errors: NonZeroU32, ) -> ControlFlow<(), HashMap>> where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { - let batched_tasks = - match sync_queue::next_task_batch(receiver, max_concurrent_timelines_sync).await { - ControlFlow::Continue(batch) => batch, - ControlFlow::Break(()) => return ControlFlow::Break(()), - }; + let batched_tasks = sync_queue.next_task_batch(sync_queue_receiver).await; - let remaining_queue_length = sync_queue::len(); + let remaining_queue_length = sync_queue.len(); REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); if remaining_queue_length > 0 || !batched_tasks.is_empty() { info!("Processing tasks for {} timelines in batch, more tasks left to process: {remaining_queue_length}", batched_tasks.len()); @@ -929,10 +975,15 @@ where let storage = Arc::clone(&storage); let index = index.clone(); async move { - let state_update = - process_sync_task_batch(conf, storage, index, max_sync_errors, sync_id, batch) - .instrument(info_span!("process_sync_task_batch", sync_id = %sync_id)) - .await; + let state_update = process_sync_task_batch( + conf, + (storage, index, sync_queue), + max_sync_errors, + sync_id, + batch, + ) + .instrument(info_span!("process_sync_task_batch", sync_id = %sync_id)) + .await; (sync_id, state_update) } }) @@ -941,7 +992,7 @@ where let mut new_timeline_states: HashMap< ZTenantId, HashMap, - > = HashMap::with_capacity(max_concurrent_timelines_sync.get()); + > = HashMap::new(); while let Some((sync_id, state_update)) = sync_results.next().await { debug!("Finished storage sync task for sync id {sync_id}"); if let Some(state_update) = state_update { @@ -957,8 +1008,7 @@ where async fn process_sync_task_batch( conf: &'static PageServerConf, - storage: Arc, - index: RemoteIndex, + (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, sync_id: ZTenantTimelineId, batch: SyncTaskBatch, @@ -972,6 +1022,13 @@ where let upload_data = batch.upload.clone(); let download_data = batch.download.clone(); + // Run both upload and download tasks concurrently (not in parallel): + // download and upload tasks do not conflict and spoil the pageserver state even if they are executed in parallel. + // Under "spoiling" here means potentially inconsistent layer set that misses some of the layers, declared present + // in local (implicitly, via Lsn values and related memory state) or remote (explicitly via remote layer file paths) metadata. + // When operating in a system without tasks failing over the error threshold, + // current batching and task processing systems aim to update the layer set and metadata files (remote and local), + // without "loosing" such layer files. 
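+ // ("Concurrently, not in parallel": `tokio::join!` polls both branches on the current
+ // task, so they interleave at `.await` points rather than running on separate threads,
+ // and the macro returns only once both the upload and the download branch are done.)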
let (upload_result, status_update) = tokio::join!( async { if let Some(upload_data) = upload_data { @@ -982,7 +1039,7 @@ where ControlFlow::Continue(new_upload_data) => { upload_timeline_data( conf, - (storage.as_ref(), &index), + (storage.as_ref(), &index, sync_queue), current_remote_timeline.as_ref(), sync_id, new_upload_data, @@ -1022,14 +1079,14 @@ where ControlFlow::Continue(new_download_data) => { return download_timeline_data( conf, - (storage.as_ref(), &index), + (storage.as_ref(), &index, sync_queue), current_remote_timeline.as_ref(), sync_id, new_download_data, sync_start, "download", ) - .await + .await; } ControlFlow::Break(_) => { index @@ -1046,35 +1103,40 @@ where ); if let Some(delete_data) = batch.delete { - match validate_task_retries(delete_data, max_sync_errors) - .instrument(info_span!("retries_validation")) - .await - { - ControlFlow::Continue(new_delete_data) => { - delete_timeline_data( - conf, - (storage.as_ref(), &index), - sync_id, - new_delete_data, - sync_start, - "delete", - ) - .instrument(info_span!("delete_timeline_data")) - .await; - } - ControlFlow::Break(failed_delete_data) => { - if let Err(e) = update_remote_data( - conf, - storage.as_ref(), - &index, - sync_id, - RemoteDataUpdate::Delete(&failed_delete_data.data.deleted_layers), - ) + if upload_result.is_some() { + match validate_task_retries(delete_data, max_sync_errors) + .instrument(info_span!("retries_validation")) .await - { - error!("Failed to update remote timeline {sync_id}: {e:?}"); + { + ControlFlow::Continue(new_delete_data) => { + delete_timeline_data( + conf, + (storage.as_ref(), &index, sync_queue), + sync_id, + new_delete_data, + sync_start, + "delete", + ) + .instrument(info_span!("delete_timeline_data")) + .await; + } + ControlFlow::Break(failed_delete_data) => { + if let Err(e) = update_remote_data( + conf, + storage.as_ref(), + &index, + sync_id, + RemoteDataUpdate::Delete(&failed_delete_data.data.deleted_layers), + ) + .await + { + error!("Failed to update remote timeline {sync_id}: {e:?}"); + } } } + } else { + sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + warn!("Skipping delete task due to failed upload tasks, reenqueuing"); } } @@ -1083,10 +1145,10 @@ where async fn download_timeline_data( conf: &'static PageServerConf, - (storage, index): (&S, &RemoteIndex), + (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue), current_remote_timeline: Option<&RemoteTimeline>, sync_id: ZTenantTimelineId, - new_download_data: SyncData, + new_download_data: SyncData, sync_start: Instant, task_name: &str, ) -> Option @@ -1097,6 +1159,7 @@ where match download_timeline_layers( conf, storage, + sync_queue, current_remote_timeline, sync_id, new_download_data, @@ -1126,7 +1189,7 @@ where Err(e) => { error!("Failed to update local timeline metadata: {e:?}"); download_data.retries += 1; - sync_queue::push(sync_id, SyncTask::Download(download_data)); + sync_queue.push(sync_id, SyncTask::Download(download_data)); register_sync_status(sync_start, task_name, Some(false)); } } @@ -1199,14 +1262,14 @@ async fn update_local_metadata( async fn delete_timeline_data( conf: &'static PageServerConf, - (storage, index): (&S, &RemoteIndex), + (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue), sync_id: ZTenantTimelineId, - mut new_delete_data: SyncData, + mut new_delete_data: SyncData, sync_start: Instant, task_name: &str, ) where P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, { let timeline_delete = &mut 
new_delete_data.data; @@ -1222,14 +1285,14 @@ async fn delete_timeline_data( { error!("Failed to update remote timeline {sync_id}: {e:?}"); new_delete_data.retries += 1; - sync_queue::push(sync_id, SyncTask::Delete(new_delete_data)); + sync_queue.push(sync_id, SyncTask::Delete(new_delete_data)); register_sync_status(sync_start, task_name, Some(false)); return; } } timeline_delete.deletion_registered = true; - let sync_status = delete_timeline_layers(storage, sync_id, new_delete_data).await; + let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await; register_sync_status(sync_start, task_name, Some(sync_status)); } @@ -1244,48 +1307,31 @@ async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result( conf: &'static PageServerConf, - (storage, index): (&S, &RemoteIndex), + (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue), current_remote_timeline: Option<&RemoteTimeline>, sync_id: ZTenantTimelineId, - new_upload_data: SyncData, + new_upload_data: SyncData, sync_start: Instant, task_name: &str, ) where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { - let mut uploaded_data = - match upload_timeline_layers(storage, current_remote_timeline, sync_id, new_upload_data) - .await - { - UploadedTimeline::FailedAndRescheduled => { - register_sync_status(sync_start, task_name, Some(false)); - return; - } - UploadedTimeline::Successful(upload_data) => upload_data, - UploadedTimeline::SuccessfulAfterLocalFsUpdate(mut outdated_upload_data) => { - if outdated_upload_data.data.metadata.is_some() { - let local_metadata_path = - metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id); - let local_metadata = match read_metadata_file(&local_metadata_path).await { - Ok(metadata) => metadata, - Err(e) => { - error!( - "Failed to load local metadata from path '{}': {e:?}", - local_metadata_path.display() - ); - outdated_upload_data.retries += 1; - sync_queue::push(sync_id, SyncTask::Upload(outdated_upload_data)); - register_sync_status(sync_start, task_name, Some(false)); - return; - } - }; - - outdated_upload_data.data.metadata = Some(local_metadata); - } - outdated_upload_data - } - }; + let mut uploaded_data = match upload_timeline_layers( + storage, + sync_queue, + current_remote_timeline, + sync_id, + new_upload_data, + ) + .await + { + UploadedTimeline::FailedAndRescheduled => { + register_sync_status(sync_start, task_name, Some(false)); + return; + } + UploadedTimeline::Successful(upload_data) => upload_data, + }; match update_remote_data( conf, @@ -1305,7 +1351,7 @@ async fn upload_timeline_data( Err(e) => { error!("Failed to update remote timeline {sync_id}: {e:?}"); uploaded_data.retries += 1; - sync_queue::push(sync_id, SyncTask::Upload(uploaded_data)); + sync_queue.push(sync_id, SyncTask::Upload(uploaded_data)); register_sync_status(sync_start, task_name, Some(false)); } } @@ -1313,7 +1359,7 @@ async fn upload_timeline_data( enum RemoteDataUpdate<'a> { Upload { - uploaded_data: TimelineUpload, + uploaded_data: LayersUpload, upload_failed: bool, }, Delete(&'a HashSet), @@ -1455,6 +1501,7 @@ where fn schedule_first_sync_tasks( index: &mut RemoteTimelineIndex, + sync_queue: &SyncQueue, local_timeline_files: HashMap)>, ) -> LocalTimelineInitStatuses { let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); @@ -1491,7 +1538,7 @@ fn schedule_first_sync_tasks( // is it safe to upload this checkpoint? could it be half broken? 
new_sync_tasks.push_back(( sync_id, - SyncTask::upload(TimelineUpload { + SyncTask::upload(LayersUpload { layers_to_upload: local_files, uploaded_layers: HashSet::new(), metadata: Some(local_metadata), @@ -1509,7 +1556,7 @@ fn schedule_first_sync_tasks( } new_sync_tasks.into_iter().for_each(|(sync_id, task)| { - sync_queue::push(sync_id, task); + sync_queue.push(sync_id, task); }); local_timeline_init_statuses } @@ -1535,7 +1582,7 @@ fn compare_local_and_remote_timeline( let (initial_timeline_status, awaits_download) = if number_of_layers_to_download > 0 { new_sync_tasks.push_back(( sync_id, - SyncTask::download(TimelineDownload { + SyncTask::download(LayersDownload { layers_to_skip: local_files.clone(), }), )); @@ -1553,7 +1600,7 @@ fn compare_local_and_remote_timeline( if !layers_to_upload.is_empty() { new_sync_tasks.push_back(( sync_id, - SyncTask::upload(TimelineUpload { + SyncTask::upload(LayersUpload { layers_to_upload, uploaded_layers: HashSet::new(), metadata: Some(local_metadata), @@ -1584,12 +1631,12 @@ mod test_utils { use super::*; - pub async fn create_local_timeline( + pub(super) async fn create_local_timeline( harness: &RepoHarness<'_>, timeline_id: ZTimelineId, filenames: &[&str], metadata: TimelineMetadata, - ) -> anyhow::Result { + ) -> anyhow::Result { let timeline_path = harness.timeline_path(&timeline_id); fs::create_dir_all(&timeline_path).await?; @@ -1606,28 +1653,212 @@ mod test_utils { ) .await?; - Ok(TimelineUpload { + Ok(LayersUpload { layers_to_upload, uploaded_layers: HashSet::new(), metadata: Some(metadata), }) } - pub fn dummy_contents(name: &str) -> String { + pub(super) fn dummy_contents(name: &str) -> String { format!("contents for {name}") } - pub fn dummy_metadata(disk_consistent_lsn: Lsn) -> TimelineMetadata { + pub(super) fn dummy_metadata(disk_consistent_lsn: Lsn) -> TimelineMetadata { TimelineMetadata::new(disk_consistent_lsn, None, None, Lsn(0), Lsn(0), Lsn(0)) } } #[cfg(test)] mod tests { + use super::test_utils::dummy_metadata; + use crate::repository::repo_harness::TIMELINE_ID; + use hex_literal::hex; + use utils::lsn::Lsn; + use super::*; - #[test] - fn batching_tests() { - todo!("TODO kb") + const TEST_SYNC_ID: ZTenantTimelineId = ZTenantTimelineId { + tenant_id: ZTenantId::from_array(hex!("11223344556677881122334455667788")), + timeline_id: TIMELINE_ID, + }; + + #[tokio::test] + async fn separate_task_ids_batch() { + let (sync_queue, mut sync_queue_receiver) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + assert_eq!(sync_queue.len(), 0); + + let sync_id_2 = ZTenantTimelineId { + tenant_id: ZTenantId::from_array(hex!("22223344556677881122334455667788")), + timeline_id: TIMELINE_ID, + }; + let sync_id_3 = ZTenantTimelineId { + tenant_id: ZTenantId::from_array(hex!("33223344556677881122334455667788")), + timeline_id: TIMELINE_ID, + }; + assert!(sync_id_2 != TEST_SYNC_ID); + assert!(sync_id_2 != sync_id_3); + assert!(sync_id_3 != TEST_SYNC_ID); + + let download_task = SyncTask::download(LayersDownload { + layers_to_skip: HashSet::from([PathBuf::from("sk")]), + }); + let upload_task = SyncTask::upload(LayersUpload { + layers_to_upload: HashSet::from([PathBuf::from("up")]), + uploaded_layers: HashSet::from([PathBuf::from("upl")]), + metadata: Some(dummy_metadata(Lsn(2))), + }); + let delete_task = SyncTask::delete(LayersDeletion { + layers_to_delete: HashSet::from([PathBuf::from("de")]), + deleted_layers: HashSet::from([PathBuf::from("del")]), + deletion_registered: false, + }); + + sync_queue.push(TEST_SYNC_ID, download_task.clone()); + 
sync_queue.push(sync_id_2, upload_task.clone()); + sync_queue.push(sync_id_3, delete_task.clone()); + + let submitted_tasks_count = sync_queue.len(); + assert_eq!(submitted_tasks_count, 3); + let mut batch = sync_queue.next_task_batch(&mut sync_queue_receiver).await; + assert_eq!( + batch.len(), + submitted_tasks_count, + "Batch should consist of all tasks submitted" + ); + + assert_eq!( + Some(SyncTaskBatch::new(download_task)), + batch.remove(&TEST_SYNC_ID) + ); + assert_eq!( + Some(SyncTaskBatch::new(upload_task)), + batch.remove(&sync_id_2) + ); + assert_eq!( + Some(SyncTaskBatch::new(delete_task)), + batch.remove(&sync_id_3) + ); + + assert!(batch.is_empty(), "Should check all batch tasks"); + assert_eq!(sync_queue.len(), 0); + } + + #[tokio::test] + async fn same_task_id_separate_tasks_batch() { + let (sync_queue, mut sync_queue_receiver) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + assert_eq!(sync_queue.len(), 0); + + let download = LayersDownload { + layers_to_skip: HashSet::from([PathBuf::from("sk")]), + }; + let upload = LayersUpload { + layers_to_upload: HashSet::from([PathBuf::from("up")]), + uploaded_layers: HashSet::from([PathBuf::from("upl")]), + metadata: Some(dummy_metadata(Lsn(2))), + }; + let delete = LayersDeletion { + layers_to_delete: HashSet::from([PathBuf::from("de")]), + deleted_layers: HashSet::from([PathBuf::from("del")]), + deletion_registered: false, + }; + + sync_queue.push(TEST_SYNC_ID, SyncTask::download(download.clone())); + sync_queue.push(TEST_SYNC_ID, SyncTask::upload(upload.clone())); + sync_queue.push(TEST_SYNC_ID, SyncTask::delete(delete.clone())); + + let submitted_tasks_count = sync_queue.len(); + assert_eq!(submitted_tasks_count, 3); + let mut batch = sync_queue.next_task_batch(&mut sync_queue_receiver).await; + assert_eq!( + batch.len(), + 1, + "Queue should have one batch merged from 3 sync tasks of the same user" + ); + + assert_eq!( + Some(SyncTaskBatch { + upload: Some(SyncData { + retries: 0, + data: upload + }), + download: Some(SyncData { + retries: 0, + data: download + }), + delete: Some(SyncData { + retries: 0, + data: delete + }), + }), + batch.remove(&TEST_SYNC_ID), + "Should have one batch containing all tasks unchanged" + ); + + assert!(batch.is_empty(), "Should check all batch tasks"); + assert_eq!(sync_queue.len(), 0); + } + + #[tokio::test] + async fn same_task_id_same_tasks_batch() { + let (sync_queue, mut sync_queue_receiver) = SyncQueue::new(NonZeroUsize::new(1).unwrap()); + let download_1 = LayersDownload { + layers_to_skip: HashSet::from([PathBuf::from("sk1")]), + }; + let download_2 = LayersDownload { + layers_to_skip: HashSet::from([PathBuf::from("sk2")]), + }; + let download_3 = LayersDownload { + layers_to_skip: HashSet::from([PathBuf::from("sk3")]), + }; + let download_4 = LayersDownload { + layers_to_skip: HashSet::from([PathBuf::from("sk4")]), + }; + + let sync_id_2 = ZTenantTimelineId { + tenant_id: ZTenantId::from_array(hex!("22223344556677881122334455667788")), + timeline_id: TIMELINE_ID, + }; + assert!(sync_id_2 != TEST_SYNC_ID); + + sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_1.clone())); + sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_2.clone())); + sync_queue.push(sync_id_2, SyncTask::download(download_3.clone())); + sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_4.clone())); + assert_eq!(sync_queue.len(), 4); + + let mut smallest_batch = sync_queue.next_task_batch(&mut sync_queue_receiver).await; + assert_eq!( + smallest_batch.len(), + 1, + "Queue should have one 
batch merged from the all sync tasks, but not the other user's task" + ); + assert_eq!( + Some(SyncTaskBatch { + download: Some(SyncData { + retries: 0, + data: LayersDownload { + layers_to_skip: { + let mut set = HashSet::new(); + set.extend(download_1.layers_to_skip.into_iter()); + set.extend(download_2.layers_to_skip.into_iter()); + set.extend(download_4.layers_to_skip.into_iter()); + set + }, + } + }), + upload: None, + delete: None, + }), + smallest_batch.remove(&TEST_SYNC_ID), + "Should have one batch containing all tasks merged for the tenant first appeared in the batch" + ); + + assert!(smallest_batch.is_empty(), "Should check all batch tasks"); + assert_eq!( + sync_queue.len(), + 1, + "Should have one task left out of the batch" + ); } } diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 8b13789179..047ad6c2be 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -1 +1,228 @@ +//! Timeline synchrnonization logic to delete a bulk of timeline's remote files from the remote storage. +use anyhow::Context; +use futures::stream::{FuturesUnordered, StreamExt}; +use tracing::{debug, error, info}; + +use crate::storage_sync::{SyncQueue, SyncTask}; +use remote_storage::RemoteStorage; +use utils::zid::ZTenantTimelineId; + +use super::{LayersDeletion, SyncData}; + +/// Attempts to remove the timleline layers from the remote storage. +/// If the task had not adjusted the metadata before, the deletion will fail. +pub(super) async fn delete_timeline_layers<'a, P, S>( + storage: &'a S, + sync_queue: &SyncQueue, + sync_id: ZTenantTimelineId, + mut delete_data: SyncData, +) -> bool +where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + if !delete_data.data.deletion_registered { + error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing"); + delete_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + return false; + } + + if delete_data.data.layers_to_delete.is_empty() { + info!("No layers to delete, skipping"); + return true; + } + + let layers_to_delete = delete_data + .data + .layers_to_delete + .drain() + .collect::>(); + debug!("Layers to delete: {layers_to_delete:?}"); + info!("Deleting {} timeline layers", layers_to_delete.len()); + + let mut delete_tasks = layers_to_delete + .into_iter() + .map(|local_layer_path| async { + let storage_path = + match storage + .remote_object_id(&local_layer_path) + .with_context(|| { + format!( + "Failed to get the layer storage path for local path '{}'", + local_layer_path.display() + ) + }) { + Ok(path) => path, + Err(e) => return Err((e, local_layer_path)), + }; + + match storage.delete(&storage_path).await.with_context(|| { + format!( + "Failed to delete remote layer from storage at '{:?}'", + storage_path + ) + }) { + Ok(()) => Ok(local_layer_path), + Err(e) => Err((e, local_layer_path)), + } + }) + .collect::>(); + + let mut errored = false; + while let Some(deletion_result) = delete_tasks.next().await { + match deletion_result { + Ok(local_layer_path) => { + debug!( + "Successfully deleted layer {} for timeline {sync_id}", + local_layer_path.display() + ); + delete_data.data.deleted_layers.insert(local_layer_path); + } + Err((e, local_layer_path)) => { + errored = true; + error!( + "Failed to delete layer {} for timeline {sync_id}: {e:?}", + local_layer_path.display() + ); + delete_data.data.layers_to_delete.insert(local_layer_path); + } + } + } + + 
if errored { + debug!("Reenqueuing failed delete task for timeline {sync_id}"); + delete_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + } + errored +} + +#[cfg(test)] +mod tests { + use std::{collections::HashSet, num::NonZeroUsize}; + + use itertools::Itertools; + use tempfile::tempdir; + use tokio::fs; + use utils::lsn::Lsn; + + use crate::{ + repository::repo_harness::{RepoHarness, TIMELINE_ID}, + storage_sync::test_utils::{create_local_timeline, dummy_metadata}, + }; + use remote_storage::LocalFs; + + use super::*; + + #[tokio::test] + async fn delete_timeline_negative() -> anyhow::Result<()> { + let harness = RepoHarness::create("delete_timeline_negative")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let storage = LocalFs::new( + tempdir()?.path().to_path_buf(), + harness.conf.workdir.clone(), + )?; + + let deleted = delete_timeline_layers( + &storage, + &sync_queue, + sync_id, + SyncData { + retries: 1, + data: LayersDeletion { + deleted_layers: HashSet::new(), + layers_to_delete: HashSet::new(), + deletion_registered: false, + }, + }, + ) + .await; + + assert!( + !deleted, + "Should not start the deletion for task with delete metadata unregistered" + ); + + Ok(()) + } + + #[tokio::test] + async fn delete_timeline() -> anyhow::Result<()> { + let harness = RepoHarness::create("delete_timeline")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + + let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let layer_files = ["a", "b", "c", "d"]; + let storage = LocalFs::new( + tempdir()?.path().to_path_buf(), + harness.conf.workdir.clone(), + )?; + let current_retries = 3; + let metadata = dummy_metadata(Lsn(0x30)); + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + let timeline_upload = + create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; + for local_path in timeline_upload.layers_to_upload { + let remote_path = storage.remote_object_id(&local_path)?; + let remote_parent_dir = remote_path.parent().unwrap(); + if !remote_parent_dir.exists() { + fs::create_dir_all(&remote_parent_dir).await?; + } + fs::copy(&local_path, &remote_path).await?; + } + assert_eq!( + storage + .list() + .await? + .into_iter() + .map(|remote_path| storage.local_path(&remote_path).unwrap()) + .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) + .sorted() + .collect::>(), + layer_files + .iter() + .map(|layer_str| layer_str.to_string()) + .sorted() + .collect::>(), + "Expect to have all layer files remotely before deletion" + ); + + let deleted = delete_timeline_layers( + &storage, + &sync_queue, + sync_id, + SyncData { + retries: current_retries, + data: LayersDeletion { + deleted_layers: HashSet::new(), + layers_to_delete: HashSet::from([ + local_timeline_path.join("a"), + local_timeline_path.join("c"), + local_timeline_path.join("something_different"), + ]), + deletion_registered: true, + }, + }, + ) + .await; + assert!(deleted, "Should be able to delete timeline files"); + + assert_eq!( + storage + .list() + .await? 
+ .into_iter() + .map(|remote_path| storage.local_path(&remote_path).unwrap()) + .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) + .sorted() + .collect::>(), + vec!["b".to_string(), "d".to_string()], + "Expect to have only non-deleted files remotely" + ); + + Ok(()) + } +} diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 3cd6de57c7..98a0a0e2fc 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -12,15 +12,13 @@ use tokio::{ use tracing::{debug, error, info, warn}; use crate::{ - config::PageServerConf, - layered_repository::metadata::metadata_path, - storage_sync::{sync_queue, SyncTask}, + config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, }; use utils::zid::ZTenantTimelineId; use super::{ index::{IndexPart, RemoteTimeline}, - SyncData, TimelineDownload, + LayersDownload, SyncData, SyncQueue, }; pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; @@ -76,7 +74,7 @@ pub(super) enum DownloadedTimeline { FailedAndRescheduled, /// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known. /// Initial download successful. - Successful(SyncData), + Successful(SyncData), } /// Attempts to download all given timeline's layers. @@ -87,9 +85,10 @@ pub(super) enum DownloadedTimeline { pub(super) async fn download_timeline_layers<'a, P, S>( conf: &'static PageServerConf, storage: &'a S, + sync_queue: &'a SyncQueue, remote_timeline: Option<&'a RemoteTimeline>, sync_id: ZTenantTimelineId, - mut download_data: SyncData, + mut download_data: SyncData, ) -> DownloadedTimeline where P: Debug + Send + Sync + 'static, @@ -251,7 +250,7 @@ where if errors_happened { debug!("Reenqueuing failed download task for timeline {sync_id}"); download_data.retries += 1; - sync_queue::push(sync_id, SyncTask::Download(download_data)); + sync_queue.push(sync_id, SyncTask::Download(download_data)); DownloadedTimeline::FailedAndRescheduled } else { info!("Successfully downloaded all layers"); @@ -265,7 +264,10 @@ async fn fsync_path(path: impl AsRef) -> Result<(), io::Error> { #[cfg(test)] mod tests { - use std::collections::{BTreeSet, HashSet}; + use std::{ + collections::{BTreeSet, HashSet}, + num::NonZeroUsize, + }; use remote_storage::{LocalFs, RemoteStorage}; use tempfile::tempdir; @@ -284,6 +286,8 @@ mod tests { #[tokio::test] async fn download_timeline() -> anyhow::Result<()> { let harness = RepoHarness::create("download_timeline")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"]; let storage = LocalFs::new( @@ -324,11 +328,12 @@ mod tests { let download_data = match download_timeline_layers( harness.conf, &storage, + &sync_queue, Some(&remote_timeline), sync_id, SyncData::new( current_retries, - TimelineDownload { + LayersDownload { layers_to_skip: HashSet::from([local_timeline_path.join("layer_to_skip")]), }, ), @@ -380,17 +385,19 @@ mod tests { #[tokio::test] async fn download_timeline_negatives() -> anyhow::Result<()> { let harness = RepoHarness::create("download_timeline_negatives")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; let 
empty_remote_timeline_download = download_timeline_layers( harness.conf, &storage, + &sync_queue, None, sync_id, SyncData::new( 0, - TimelineDownload { + LayersDownload { layers_to_skip: HashSet::new(), }, ), @@ -409,11 +416,12 @@ mod tests { let already_downloading_remote_timeline_download = download_timeline_layers( harness.conf, &storage, + &sync_queue, Some(¬_expecting_download_remote_timeline), sync_id, SyncData::new( 0, - TimelineDownload { + LayersDownload { layers_to_skip: HashSet::new(), }, ), diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 1e2594ac70..f9d606f2b8 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -8,16 +8,14 @@ use remote_storage::RemoteStorage; use tokio::fs; use tracing::{debug, error, info, warn}; -use crate::{ - config::PageServerConf, - layered_repository::metadata::metadata_path, - storage_sync::{sync_queue, SyncTask}, -}; use utils::zid::ZTenantTimelineId; use super::{ index::{IndexPart, RemoteTimeline}, - SyncData, TimelineUpload, + LayersUpload, SyncData, SyncQueue, +}; +use crate::{ + config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, }; /// Serializes and uploads the given index part data to the remote storage. @@ -68,11 +66,7 @@ pub(super) enum UploadedTimeline { /// Upload failed due to some error, the upload task is rescheduled for another retry. FailedAndRescheduled, /// No issues happened during the upload, all task files were put into the remote storage. - Successful(SyncData), - /// No failures happened during the upload, but some files were removed locally before the upload task completed - /// (could happen due to retries, for instance, if GC happens in the interim). - /// Such files are considered "not needed" and ignored, but the task's metadata should be discarded and the new one loaded from the local file. - SuccessfulAfterLocalFsUpdate(SyncData), + Successful(SyncData), } /// Attempts to upload given layer files. @@ -81,9 +75,10 @@ pub(super) enum UploadedTimeline { /// On an error, bumps the retries count and reschedules the entire task. pub(super) async fn upload_timeline_layers<'a, P, S>( storage: &'a S, + sync_queue: &SyncQueue, remote_timeline: Option<&'a RemoteTimeline>, sync_id: ZTenantTimelineId, - mut upload_data: SyncData, + mut upload_data: SyncData, ) -> UploadedTimeline where P: Debug + Send + Sync + 'static, @@ -168,7 +163,6 @@ where .collect::>(); let mut errors_happened = false; - let mut local_fs_updated = false; while let Some(upload_result) = upload_tasks.next().await { match upload_result { Ok(uploaded_path) => { @@ -185,7 +179,16 @@ where errors_happened = true; error!("Failed to upload a layer for timeline {sync_id}: {e:?}"); } else { - local_fs_updated = true; + // We have run the upload sync task, but the file we wanted to upload is gone. + // This is "fine" due the asynchronous nature of the sync loop: it only reacts to events and might need to + // retry the upload tasks, if S3 or network is down: but during this time, pageserver might still operate and + // run compaction/gc threads, removing redundant files from disk. + // It's not good to pause GC/compaction because of those and we would rather skip such uploads. + // + // Yet absence of such files might also mean that the timeline metadata file was updated (GC moves the Lsn forward, for instance). 
+ // We don't try to read a more recent version, since it could contain `disk_consistent_lsn` that does not have its upload finished yet. + // This will create "missing" layers and make data inconsistent. + // Instead, we only update the metadata when it was submitted in an upload task as a checkpoint result. upload.layers_to_upload.remove(&source_path); warn!( "Missing locally a layer file {} scheduled for upload, skipping", @@ -200,11 +203,8 @@ where if errors_happened { debug!("Reenqueuing failed upload task for timeline {sync_id}"); upload_data.retries += 1; - sync_queue::push(sync_id, SyncTask::Upload(upload_data)); + sync_queue.push(sync_id, SyncTask::Upload(upload_data)); UploadedTimeline::FailedAndRescheduled - } else if local_fs_updated { - info!("Successfully uploaded all layers, some local layers were removed during the upload"); - UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data) } else { info!("Successfully uploaded all layers"); UploadedTimeline::Successful(upload_data) @@ -218,7 +218,10 @@ enum UploadError { #[cfg(test)] mod tests { - use std::collections::{BTreeSet, HashSet}; + use std::{ + collections::{BTreeSet, HashSet}, + num::NonZeroUsize, + }; use remote_storage::LocalFs; use tempfile::tempdir; @@ -237,6 +240,7 @@ mod tests { #[tokio::test] async fn regular_layer_upload() -> anyhow::Result<()> { let harness = RepoHarness::create("regular_layer_upload")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b"]; @@ -258,6 +262,7 @@ mod tests { let upload_result = upload_timeline_layers( &storage, + &sync_queue, None, sync_id, SyncData::new(current_retries, timeline_upload.clone()), @@ -322,6 +327,7 @@ mod tests { #[tokio::test] async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> { let harness = RepoHarness::create("layer_upload_after_local_fs_update")?; + let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a1", "b1"]; @@ -347,6 +353,7 @@ mod tests { let upload_result = upload_timeline_layers( &storage, + &sync_queue, None, sync_id, SyncData::new(current_retries, timeline_upload.clone()), @@ -354,7 +361,7 @@ mod tests { .await; let upload_data = match upload_result { - UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data) => upload_data, + UploadedTimeline::Successful(upload_data) => upload_data, wrong_result => panic!( "Expected a successful after local fs upload for timeline, but got: {wrong_result:?}" ), From cf59b515195fbd56e02e5bee11991a1c375d0a69 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Mon, 9 May 2022 11:11:46 -0400 Subject: [PATCH 0263/1022] Update README (Running local installation section) (#1649) --- README.md | 49 +++++++++++++++++++++++------------- control_plane/src/storage.rs | 3 +++ 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 8876831265..af384d2672 100644 --- a/README.md +++ b/README.md @@ -50,31 +50,29 @@ make -j5 # Create repository in .zenith with proper paths to binaries and data # Later that would be responsibility of a package install script > ./target/debug/neon_local init -initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229 -created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8 -created main branch +initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c +created initial timeline de200bd42b49cc1814412c7e592dd6e9 
timeline.lsn 0/16B5A50 +initial timeline de200bd42b49cc1814412c7e592dd6e9 created pageserver init succeeded # start pageserver and safekeeper > ./target/debug/neon_local start -Starting pageserver at 'localhost:64000' in '.zenith' +Starting pageserver at '127.0.0.1:64000' in '.zenith' Pageserver started -initializing for single for 7676 -Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single' +initializing for sk 1 for 7676 +Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/sk1' Safekeeper started # start postgres compute node > ./target/debug/neon_local pg start main -Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ... -Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432 +Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ... +Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres' -waiting for server to start.... done -server started # check list of running postgres instances > ./target/debug/neon_local pg list -NODE ADDRESS TIMELINES BRANCH NAME LSN STATUS -main 127.0.0.1:55432 5b014a9e41b4b63ce1a1febc04503636 main 0/1609610 running + NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS + main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running ``` 4. Now it is possible to connect to postgres and run some queries: @@ -95,17 +93,24 @@ postgres=# select * from t; ```sh # create branch named migration_check > ./target/debug/neon_local timeline branch --branch-name migration_check -Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main' +Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main' # check branches tree > ./target/debug/neon_local timeline list - main [5b014a9e41b4b63ce1a1febc04503636] - ┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9] +(L) main [de200bd42b49cc1814412c7e592dd6e9] +(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601] # start postgres on that branch -> ./target/debug/neon_local pg start migration_check -Starting postgres node at 'host=127.0.0.1 port=55433 user=stas' -waiting for server to start.... done +> ./target/debug/neon_local pg start migration_check --branch-name migration_check +Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ... 
+Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433 +Starting postgres node at 'host=127.0.0.1 port=55433 user=zenith_admin dbname=postgres' + +# check the new list of running postgres instances +> ./target/debug/neon_local pg list + NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS + main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running + migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running # this new postgres instance will have all the data from 'main' postgres, # but all modifications would not affect data in original postgres @@ -118,6 +123,14 @@ postgres=# select * from t; postgres=# insert into t values(2,2); INSERT 0 1 + +# check that the new change doesn't affect the 'main' postgres +> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres +postgres=# select * from t; + key | value +-----+------- + 1 | 1 +(1 row) ``` 6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index adb924d430..d2e63a22de 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -167,6 +167,9 @@ impl PageServerNode { ); } + // echo the captured output of the init command + println!("{}", String::from_utf8_lossy(&init_output.stdout)); + Ok(initial_timeline_id) } From 87dfa997345cc5a825aba4acc581edbf4806b4f7 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Tue, 10 May 2022 09:55:14 -0400 Subject: [PATCH 0264/1022] Update layered_repository REAMDE (#1659) --- pageserver/src/layered_repository/README.md | 43 +++++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/pageserver/src/layered_repository/README.md b/pageserver/src/layered_repository/README.md index 519478e417..70c571a507 100644 --- a/pageserver/src/layered_repository/README.md +++ b/pageserver/src/layered_repository/README.md @@ -23,6 +23,7 @@ distribution depends on the workload: the updates could be totally random, or there could be a long stream of updates to a single relation when data is bulk loaded, for example, or something in between. +``` Cloud Storage Page Server Safekeeper L1 L0 Memory WAL @@ -37,6 +38,7 @@ Cloud Storage Page Server Safekeeper +----+----+ +----+----+ | | | |EEEE| |EEEE|EEEE| +---+-----+ +----+ +----+----+ +``` In this illustration, WAL is received as a stream from the Safekeeper, from the right. It is immediately captured by the page server and stored quickly in @@ -47,7 +49,7 @@ the same page and relation close to each other. From the page server memory, whenever enough WAL has been accumulated, it is flushed to disk into a new L0 layer file, and the memory is released. -When enough L0 files have been accumulated, they are merged together rand sliced +When enough L0 files have been accumulated, they are merged together and sliced per key-space, producing a new set of files where each file contains a more narrow key range, but larger LSN range. @@ -121,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and a range of LSNs (or a single LSN, in case of image layers). You can think of it as a rectangle in the two-dimensional key-LSN space. The layer files for each timeline are stored in the timeline's subdirectory under -.zenith/tenants//timelines. +`.zenith/tenants//timelines`. There are two kind of layer files: images, and delta layers. 
An image file contains a snapshot of all keys at a particular LSN, whereas a delta file @@ -130,8 +132,11 @@ range of LSN. image file: +``` 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568 start key end key LSN +``` + The first parts define the key range that the layer covers. See pgdatadir_mapping.rs for how the key space is used. The last part is the LSN. @@ -140,8 +145,10 @@ delta file: Delta files are named similarly, but they cover a range of LSNs: +``` 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 start key end key start LSN end LSN +``` A delta file contains all the key-values in the key-range that were updated in the LSN range. If a key has not been modified, there is no trace of it in the @@ -151,7 +158,9 @@ delta layer. A delta layer file can cover a part of the overall key space, as in the previous example, or the whole key range like this: +``` 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000578C6B29-0000000057A50051 +``` A file that covers the whole key range is called a L0 file (Level 0), while a file that covers only part of the key range is called a L1 file. The "level" of @@ -168,7 +177,9 @@ version, and how branching and GC works is still valid. The full path of a delta file looks like this: +``` .zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000 +``` For simplicity, the examples below use a simplified notation for the paths. The tenant ID is left out, the timeline ID is replaced with @@ -177,8 +188,10 @@ with a human-readable table name. The LSNs are also shorter. For example, a base image file at LSN 100 and a delta file between 100-200 for 'orders' table on 'main' branch is represented like this: +``` main/orders_100 main/orders_100_200 +``` # Creating layer files @@ -188,12 +201,14 @@ branch called 'main' and two tables, 'orders' and 'customers'. The end of WAL is currently at LSN 250. In this starting situation, you would have these files on disk: +``` main/orders_100 main/orders_100_200 main/orders_200 main/customers_100 main/customers_100_200 main/customers_200 +``` In addition to those files, the recent changes between LSN 200 and the end of WAL at 250 are kept in memory. If the page server crashes, the @@ -224,6 +239,7 @@ If the customers table is modified later, a new file is created for it at the next checkpoint. The new file will cover the "gap" from the last layer file, so the LSN ranges are always contiguous: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -236,6 +252,7 @@ last layer file, so the LSN ranges are always contiguous: main/customers_200 main/customers_200_500 main/customers_500 +``` ## Reading page versions @@ -259,15 +276,18 @@ involves replaying any WAL records applicable to the page between LSNs Imagine that a child branch is created at LSN 250: +``` @250 ----main--+--------------------------> \ +---child--------------> +``` Then, the 'orders' table is updated differently on the 'main' and 'child' branches. You now have this situation on disk: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -282,6 +302,7 @@ Then, the 'orders' table is updated differently on the 'main' and child/orders_300 child/orders_300_400 child/orders_400 +``` Because the 'customers' table hasn't been modified on the child branch, there is no file for it there. 
If you request a page for it on @@ -294,6 +315,7 @@ is linear, and the request's LSN identifies unambiguously which file you need to look at. For example, the history for the 'orders' table on the 'main' branch consists of these files: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -301,10 +323,12 @@ on the 'main' branch consists of these files: main/orders_300 main/orders_300_400 main/orders_400 +``` And from the 'child' branch's point of view, it consists of these files: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -313,6 +337,7 @@ files: child/orders_300 child/orders_300_400 child/orders_400 +``` The branch metadata includes the point where the child branch was created, LSN 250. If a page request comes with LSN 275, we read the @@ -345,6 +370,7 @@ Let's look at the single branch scenario again. Imagine that the end of the branch is LSN 525, so that the GC horizon is currently at 525-150 = 375 +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -357,11 +383,13 @@ of the branch is LSN 525, so that the GC horizon is currently at main/customers_100 main/customers_100_200 main/customers_200 +``` We can remove the following files because the end LSNs of those files are older than GC horizon 375, and there are more recent layer files for the table: +``` main/orders_100 DELETE main/orders_100_200 DELETE main/orders_200 DELETE @@ -374,8 +402,9 @@ table: main/customers_100 DELETE main/customers_100_200 DELETE main/customers_200 KEEP, NO NEWER VERSION +``` -'main/customers_100_200' is old enough, but it cannot be +'main/customers_200' is old enough, but it cannot be removed because there is no newer layer file for the table. Things get slightly more complicated with multiple branches. All of @@ -384,6 +413,7 @@ retain older shapshot files that are still needed by child branches. For example, if child branch is created at LSN 150, and the 'customers' table is updated on the branch, you would have these files: +``` main/orders_100 KEEP, NEEDED BY child BRANCH main/orders_100_200 KEEP, NEEDED BY child BRANCH main/orders_200 DELETE @@ -398,6 +428,7 @@ table is updated on the branch, you would have these files: main/customers_200 KEEP, NO NEWER VERSION child/customers_150_300 DELETE child/customers_300 KEEP, NO NEWER VERSION +``` In this situation, 'main/orders_100' and 'main/orders_100_200' cannot be removed, even though they are older than the GC horizon, because @@ -407,6 +438,7 @@ and 'main/orders_200_300' can still be removed. If 'orders' is modified later on the 'child' branch, we will create a new base image and delta file for it on the child: +``` main/orders_100 main/orders_100_200 @@ -419,6 +451,7 @@ new base image and delta file for it on the child: child/customers_300 child/orders_150_400 child/orders_400 +``` After this, the 'main/orders_100' and 'main/orders_100_200' file could be removed. It is no longer needed by the child branch, because there @@ -434,6 +467,7 @@ Describe GC and checkpoint interval settings. In principle, each relation can be checkpointed separately, i.e. the LSN ranges of the files don't need to line up. So this would be legal: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -446,6 +480,7 @@ LSN ranges of the files don't need to line up. So this would be legal: main/customers_250 main/customers_250_500 main/customers_500 +``` However, the code currently always checkpoints all relations together. So that situation doesn't arise in practice. @@ -468,11 +503,13 @@ does that. 
It could be useful, however, as a transient state when garbage collecting around branch points, or explicit recovery points. For example, if we start with this: +``` main/orders_100 main/orders_100_200 main/orders_200 main/orders_200_300 main/orders_300 +``` And there is a branch or explicit recovery point at LSN 150, we could replace 'main/orders_100_200' with 'main/orders_150' to keep a From 6cb14b4200429bc2eb50b5f9879918188965b156 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 10 May 2022 20:44:56 +0400 Subject: [PATCH 0265/1022] Optionally remove WAL on safekeepers without s3 offloading. And do that on staging, until offloading is merged. --- .circleci/ansible/production.hosts | 1 + .circleci/ansible/staging.hosts | 1 + .circleci/ansible/systemd/safekeeper.service | 2 +- safekeeper/src/bin/safekeeper.rs | 15 +++++++++++++++ safekeeper/src/lib.rs | 2 ++ safekeeper/src/remove_wal.rs | 2 +- safekeeper/src/safekeeper.rs | 9 +++++++-- safekeeper/src/timeline.rs | 4 ++-- 8 files changed, 30 insertions(+), 6 deletions(-) diff --git a/.circleci/ansible/production.hosts b/.circleci/ansible/production.hosts index f32b57154c..2ed8f517f7 100644 --- a/.circleci/ansible/production.hosts +++ b/.circleci/ansible/production.hosts @@ -15,3 +15,4 @@ console_mgmt_base_url = http://console-release.local bucket_name = zenith-storage-oregon bucket_region = us-west-2 etcd_endpoints = etcd-release.local:2379 +safekeeper_enable_s3_offload = true diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index 71166c531e..3ea815b907 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -16,3 +16,4 @@ console_mgmt_base_url = http://console-staging.local bucket_name = zenith-staging-storage-us-east-1 bucket_region = us-east-1 etcd_endpoints = etcd-staging.local:2379 +safekeeper_enable_s3_offload = false diff --git a/.circleci/ansible/systemd/safekeeper.service b/.circleci/ansible/systemd/safekeeper.service index cac38d8756..55088db859 100644 --- a/.circleci/ansible/systemd/safekeeper.service +++ b/.circleci/ansible/systemd/safekeeper.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=safekeeper Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib -ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} +ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --enable-s3-offload={{ safekeeper_enable_s3_offload }} ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 7e979840c2..d0df7093ff 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -115,6 +115,14 @@ fn main() -> Result<()> { .takes_value(true) .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"), ) + .arg( + Arg::new("enable-s3-offload") + .long("enable-s3-offload") + .takes_value(true) + .default_value("true") + .default_missing_value("true") + .help("Enable/disable s3 offloading. 
When disabled, safekeeper removes WAL ignoring s3 WAL horizon."), + ) .get_matches(); if let Some(addr) = arg_matches.value_of("dump-control-file") { @@ -172,6 +180,13 @@ fn main() -> Result<()> { conf.broker_etcd_prefix = prefix.to_string(); } + // Seems like there is no better way to accept bool values explicitly in clap. + conf.s3_offload_enabled = arg_matches + .value_of("enable-s3-offload") + .unwrap() + .parse() + .context("failed to parse bool enable-s3-offload bool")?; + start_safekeeper(conf, given_id, arg_matches.is_present("init")) } diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index f74e5be992..c848de9e71 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -53,6 +53,7 @@ pub struct SafeKeeperConf { pub my_id: ZNodeId, pub broker_endpoints: Option>, pub broker_etcd_prefix: String, + pub s3_offload_enabled: bool, } impl SafeKeeperConf { @@ -79,6 +80,7 @@ impl Default for SafeKeeperConf { my_id: ZNodeId(0), broker_endpoints: None, broker_etcd_prefix: defaults::DEFAULT_NEON_BROKER_PREFIX.to_string(), + s3_offload_enabled: true, } } } diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 9474f65e5f..3278d51bd3 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -12,7 +12,7 @@ pub fn thread_main(conf: SafeKeeperConf) { let active_tlis = GlobalTimelines::get_active_timelines(); for zttid in &active_tlis { if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) { - if let Err(e) = tli.remove_old_wal() { + if let Err(e) = tli.remove_old_wal(conf.s3_offload_enabled) { warn!( "failed to remove WAL for tenant {} timeline {}: {}", tli.zttid.tenant_id, tli.zttid.timeline_id, e diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index b9264565dc..fff1c269b6 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -930,13 +930,18 @@ where /// offloading. /// While it is safe to use inmem values for determining horizon, /// we use persistent to make possible normal states less surprising. - pub fn get_horizon_segno(&self) -> XLogSegNo { + pub fn get_horizon_segno(&self, s3_offload_enabled: bool) -> XLogSegNo { + let s3_offload_horizon = if s3_offload_enabled { + self.state.s3_wal_lsn + } else { + Lsn(u64::MAX) + }; let horizon_lsn = min( min( self.state.remote_consistent_lsn, self.state.peer_horizon_lsn, ), - self.state.s3_wal_lsn, + s3_offload_horizon, ); horizon_lsn.segment_number(self.state.server.wal_seg_size as usize) } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 140d6660ac..8b1072a54b 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -479,7 +479,7 @@ impl Timeline { shared_state.sk.wal_store.flush_lsn() } - pub fn remove_old_wal(&self) -> Result<()> { + pub fn remove_old_wal(&self, s3_offload_enabled: bool) -> Result<()> { let horizon_segno: XLogSegNo; let remover: Box Result<(), anyhow::Error>>; { @@ -488,7 +488,7 @@ impl Timeline { if shared_state.sk.state.server.wal_seg_size == 0 { return Ok(()); } - horizon_segno = shared_state.sk.get_horizon_segno(); + horizon_segno = shared_state.sk.get_horizon_segno(s3_offload_enabled); remover = shared_state.sk.wal_store.remove_up_to(); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { return Ok(()); From d710dff9756ca006ffb2bc7362f8137f5ca06f48 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 10 May 2022 16:28:00 +0300 Subject: [PATCH 0266/1022] Remove unnecessary Serialize/Deserialize traits from VecMap. 
It's never stored on disk. Let's be tidy. --- libs/utils/src/vec_map.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 558721c724..9953b447c8 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -1,11 +1,9 @@ use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds}; -use serde::{Deserialize, Serialize}; - /// Ordered map datastructure implemented in a Vec. /// Append only - can only add keys that are larger than the /// current max key. -#[derive(Clone, Debug, Serialize, Deserialize)] +#[derive(Clone, Debug)] pub struct VecMap(Vec<(K, V)>); impl Default for VecMap { From e6e883eb12503a3a013074c03f06d8a047f44c6c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 11 May 2022 15:23:17 +0300 Subject: [PATCH 0267/1022] Do not set LSN for new FPI page (#1657) * Do not set LSN for new FPI page refer #1656 * Add page_is_new, page_get_lsn, page_set_lsn functions * Fix page_is_new implementation * Add comment from XLogReadBufferForRedoExtended --- libs/postgres_ffi/src/lib.rs | 19 +++++++++++++++++++ pageserver/src/walingest.rs | 11 +++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 923fbe4d5a..28d9a13dbf 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -8,6 +8,7 @@ #![allow(deref_nullptr)] use serde::{Deserialize, Serialize}; +use utils::lsn::Lsn; include!(concat!(env!("OUT_DIR"), "/bindings.rs")); @@ -37,3 +38,21 @@ pub const fn transaction_id_precedes(id1: TransactionId, id2: TransactionId) -> let diff = id1.wrapping_sub(id2) as i32; diff < 0 } + +// Check if page is not yet initialized (port of Postgres PageIsInit() macro) +pub fn page_is_new(pg: &[u8]) -> bool { + pg[14] == 0 && pg[15] == 0 // pg_upper == 0 +} + +// ExtractLSN from page header +pub fn page_get_lsn(pg: &[u8]) -> Lsn { + Lsn( + ((u32::from_le_bytes(pg[0..4].try_into().unwrap()) as u64) << 32) + | u32::from_le_bytes(pg[4..8].try_into().unwrap()) as u64, + ) +} + +pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) { + pg[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); + pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); +} diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index fbdb328d2c..5223125ce6 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -24,6 +24,7 @@ use anyhow::Context; use postgres_ffi::nonrelfile_utils::clogpage_precedes; use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment; +use postgres_ffi::{page_is_new, page_set_lsn}; use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; @@ -304,8 +305,14 @@ impl<'a, R: Repository> WalIngest<'a, R> { image.resize(image.len() + blk.hole_length as usize, 0u8); image.unsplit(tail); } - image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); - image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); + // + // Match the logic of XLogReadBufferForRedoExtended: + // The page may be uninitialized. If so, we can't set the LSN because + // that would corrupt the page. 
+ // + if !page_is_new(&image) { + page_set_lsn(&mut image, lsn) + } assert_eq!(image.len(), pg_constants::BLCKSZ as usize); self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; } else { From 5bd879f6418903a62b47758441a90153f9979237 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Wed, 11 May 2022 15:20:48 +0300 Subject: [PATCH 0268/1022] Proxy: update protocol after cluster->project rename --- proxy/src/auth_backend/console.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/proxy/src/auth_backend/console.rs b/proxy/src/auth_backend/console.rs index 55a0889af4..41a822701f 100644 --- a/proxy/src/auth_backend/console.rs +++ b/proxy/src/auth_backend/console.rs @@ -117,7 +117,7 @@ async fn get_auth_info( let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_get_role_secret"))?; url.query_pairs_mut() - .append_pair("cluster", cluster) + .append_pair("project", cluster) .append_pair("role", user); // TODO: use a proper logger @@ -141,7 +141,7 @@ async fn wake_compute( cluster: &str, ) -> Result<(String, u16), ConsoleAuthError> { let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_wake_compute"))?; - url.query_pairs_mut().append_pair("cluster", cluster); + url.query_pairs_mut().append_pair("project", cluster); // TODO: use a proper logger println!("cplane request: {}", url); From b338b5dffef46264e3d35887d9698432d2a7cc40 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 11 May 2022 19:39:12 +0400 Subject: [PATCH 0269/1022] Make callmemaybe less agressive until we fix it/migrate to bigger machines. --- safekeeper/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index c848de9e71..03236d4e65 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -31,7 +31,7 @@ pub mod defaults { pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); - pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(1); + pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(10); } #[derive(Debug, Clone)] From 20361395bb038659e476fb1566eb8ddff92612c6 Mon Sep 17 00:00:00 2001 From: Anton Shyrabokau <97127717+antons-antons@users.noreply.github.com> Date: Wed, 11 May 2022 11:36:53 -0700 Subject: [PATCH 0270/1022] Add zenith-us-stage-sk-5 to circleci inventory (#1665) Co-authored-by: Debian --- .circleci/ansible/staging.hosts | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index 3ea815b907..b2bacb89ca 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -6,6 +6,7 @@ zenith-us-stage-ps-2 console_region_id=27 zenith-us-stage-sk-1 console_region_id=27 zenith-us-stage-sk-2 console_region_id=27 zenith-us-stage-sk-4 console_region_id=27 +zenith-us-stage-sk-5 console_region_id=27 [storage:children] pageservers From c8640910353a8c226f516d70e337d2eb137dfc88 Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Wed, 11 May 2022 16:13:26 -0700 Subject: [PATCH 0271/1022] Fix err msg typo Signed-off-by: Dhammika Pathirana --- pageserver/src/layered_repository.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 01c2b961eb..6a614e184f 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1512,7 +1512,7 @@ impl LayeredTimeline { .ensure_loaded() 
.with_context(|| { format!( - "Ancestor timeline is not is not loaded. Timeline id: {} Ancestor id {:?}", + "Ancestor timeline is not loaded. Timeline id: {} Ancestor id {:?}", self.timeline_id, self.get_ancestor_timeline_id(), ) From 2bde77fced256600295a0a1c09c6335aed679dac Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 12 May 2022 07:56:02 +0300 Subject: [PATCH 0272/1022] =?UTF-8?q?Do=20not=20apply=20records=20with=20L?= =?UTF-8?q?SN=20smaller=20than=20LSN=20of=20cached=20image=20in=20del?= =?UTF-8?q?=E2=80=A6=20(#1672)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Do not apply records with LSN smaller than LSN of cached image in delta layer * Do not apply records with LSN smaller than LSN of cached image in delta layer --- pageserver/src/layered_repository/delta_layer.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index e78b05695c..638df6f42a 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -254,6 +254,9 @@ impl Layer for DeltaLayer { return false; } let entry_lsn = DeltaKey::extract_lsn_from_buf(key); + if entry_lsn < lsn_range.start { + return false; + } offsets.push((entry_lsn, blob_ref.pos())); !blob_ref.will_init() From 5da4f3a4df88ac2b28565eea1604bbc8272a845e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 12 May 2022 10:31:04 +0300 Subject: [PATCH 0273/1022] Refactor DeltaLayer::dump() function Put most of the code in a closure that returns Result, so that we can use the ?-operator for error handling. That's simpler. --- .../src/layered_repository/delta_layer.rs | 59 +++++++++---------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 638df6f42a..1c48f3def5 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -38,10 +38,6 @@ use crate::walrecord; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use serde::{Deserialize, Serialize}; -use tracing::*; -// avoid binding to Write (conflicts with std::io::Write) -// while being able to use std::fmt::Write's methods -use std::fmt::Write as _; use std::fs; use std::io::{BufWriter, Write}; use std::io::{Seek, SeekFrom}; @@ -49,6 +45,7 @@ use std::ops::Range; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; +use tracing::*; use utils::{ bin_ser::BeSer, @@ -365,6 +362,28 @@ impl Layer for DeltaLayer { tree_reader.dump()?; let mut cursor = file.block_cursor(); + + // A subroutine to dump a single blob + let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result { + let buf = cursor.read_blob(blob_ref.pos())?; + let val = Value::des(&buf)?; + let desc = match val { + Value::Image(img) => { + format!(" img {} bytes", img.len()) + } + Value::WalRecord(rec) => { + let wal_desc = walrecord::describe_wal_record(&rec)?; + format!( + " rec {} bytes will_init: {} {}", + buf.len(), + rec.will_init(), + wal_desc + ) + } + }; + Ok(desc) + }; + tree_reader.visit( &[0u8; DELTA_KEY_SIZE], VisitDirection::Forwards, @@ -373,34 +392,10 @@ impl Layer for DeltaLayer { let key = DeltaKey::extract_key_from_buf(delta_key); let lsn = DeltaKey::extract_lsn_from_buf(delta_key); - let mut desc = String::new(); - 
match cursor.read_blob(blob_ref.pos()) { - Ok(buf) => { - let val = Value::des(&buf); - match val { - Ok(Value::Image(img)) => { - write!(&mut desc, " img {} bytes", img.len()).unwrap(); - } - Ok(Value::WalRecord(rec)) => { - let wal_desc = walrecord::describe_wal_record(&rec).unwrap(); - write!( - &mut desc, - " rec {} bytes will_init: {} {}", - buf.len(), - rec.will_init(), - wal_desc - ) - .unwrap(); - } - Err(err) => { - write!(&mut desc, " DESERIALIZATION ERROR: {}", err).unwrap(); - } - } - } - Err(err) => { - write!(&mut desc, " READ ERROR: {}", err).unwrap(); - } - } + let desc = match dump_blob(blob_ref) { + Ok(desc) => desc, + Err(err) => format!("ERROR: {}", err), + }; println!(" key {} at {}: {}", key, lsn, desc); true }, From b426775aa0dc3caa5287a91593c976f45fed0314 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 12 May 2022 12:07:09 +0300 Subject: [PATCH 0274/1022] Use compute-tools from the new neondatabase Docker Hub repo --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 9a9459a7f9..0ea7598329 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 9a9459a7f9cbcaa0e35ff1f2f34c419238fdec7e +Subproject commit 0ea7598329a83b818293137cc18bf7d42bf2fe68 From b10ae195b78835ba895d90ccc1573a0a018d8a28 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Thu, 12 May 2022 12:40:55 +0300 Subject: [PATCH 0275/1022] Set vendor/postgres back to the main branch I accidentally merged postgres PR that was referencing non-main branch. --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 0ea7598329..d62ec22eff 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 0ea7598329a83b818293137cc18bf7d42bf2fe68 +Subproject commit d62ec22effeca7b5794ab2c15a3fd9ee5a4a5b99 From 4538f1e1b839556aab12e5aa7d1c38646253ec97 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 12 May 2022 14:18:35 +0300 Subject: [PATCH 0276/1022] Correctly operate etcd safekeeper timeline data --- libs/etcd_broker/src/lib.rs | 21 +++++++++++++------ safekeeper/src/broker.rs | 2 +- safekeeper/src/timeline.rs | 41 ++----------------------------------- 3 files changed, 18 insertions(+), 46 deletions(-) diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 01cc0cf162..1b27f99ccf 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -51,7 +51,7 @@ pub struct SkTimelineInfo { #[serde(default)] pub peer_horizon_lsn: Option, #[serde(default)] - pub wal_stream_connection_string: Option, + pub safekeeper_connection_string: Option, } #[derive(Debug, thiserror::Error)] @@ -217,16 +217,22 @@ pub async fn subscribe_to_safekeeper_timeline_updates( break; } - let mut timeline_updates: HashMap> = - HashMap::new(); + let mut timeline_updates: HashMap> = HashMap::new(); + // Keep track that the timeline data updates from etcd arrive in the right order. + // https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas + // > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering. 
+ let mut timeline_etcd_versions: HashMap = HashMap::new(); + let events = resp.events(); debug!("Processing {} events", events.len()); for event in events { if EventType::Put == event.event_type() { - if let Some(kv) = event.kv() { - match parse_etcd_key_value(subscription_kind, ®ex, kv) { + if let Some(new_etcd_kv) = event.kv() { + let new_kv_version = new_etcd_kv.version(); + + match parse_etcd_key_value(subscription_kind, ®ex, new_etcd_kv) { Ok(Some((zttid, timeline))) => { match timeline_updates .entry(zttid) @@ -234,12 +240,15 @@ pub async fn subscribe_to_safekeeper_timeline_updates( .entry(timeline.safekeeper_id) { hash_map::Entry::Occupied(mut o) => { - if o.get().flush_lsn < timeline.info.flush_lsn { + let old_etcd_kv_version = timeline_etcd_versions.get(&zttid).copied().unwrap_or(i64::MIN); + if old_etcd_kv_version < new_kv_version { o.insert(timeline.info); + timeline_etcd_versions.insert(zttid,new_kv_version); } } hash_map::Entry::Vacant(v) => { v.insert(timeline.info); + timeline_etcd_versions.insert(zttid,new_kv_version); } } } diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index c9ae1a8d98..d9c60c9db0 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -60,7 +60,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // lock is held. for zttid in GlobalTimelines::get_active_timelines() { if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { - let sk_info = tli.get_public_info()?; + let sk_info = tli.get_public_info(&conf)?; let put_opts = PutOptions::new().with_lease(lease.id()); client .put( diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 8b1072a54b..a12f628e06 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -89,7 +89,6 @@ struct SharedState { active: bool, num_computes: u32, pageserver_connstr: Option, - listen_pg_addr: String, last_removed_segno: XLogSegNo, } @@ -112,7 +111,6 @@ impl SharedState { active: false, num_computes: 0, pageserver_connstr: None, - listen_pg_addr: conf.listen_pg_addr.clone(), last_removed_segno: 0, }) } @@ -132,7 +130,6 @@ impl SharedState { active: false, num_computes: 0, pageserver_connstr: None, - listen_pg_addr: conf.listen_pg_addr.clone(), last_removed_segno: 0, }) } @@ -421,7 +418,7 @@ impl Timeline { } /// Prepare public safekeeper info for reporting. 
- pub fn get_public_info(&self) -> anyhow::Result { + pub fn get_public_info(&self, conf: &SafeKeeperConf) -> anyhow::Result { let shared_state = self.mutex.lock().unwrap(); Ok(SkTimelineInfo { last_log_term: Some(shared_state.sk.get_epoch()), @@ -435,18 +432,7 @@ impl Timeline { shared_state.sk.inmem.remote_consistent_lsn, )), peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn), - wal_stream_connection_string: shared_state - .pageserver_connstr - .as_deref() - .map(|pageserver_connstr| { - wal_stream_connection_string( - self.zttid, - &shared_state.listen_pg_addr, - pageserver_connstr, - ) - }) - .transpose() - .context("Failed to get the pageserver callmemaybe connstr")?, + safekeeper_connection_string: Some(conf.listen_pg_addr.clone()), }) } @@ -504,29 +490,6 @@ impl Timeline { } } -// pageserver connstr is needed to be able to distinguish between different pageservers -// it is required to correctly manage callmemaybe subscriptions when more than one pageserver is involved -// TODO it is better to use some sort of a unique id instead of connection string, see https://github.com/zenithdb/zenith/issues/1105 -fn wal_stream_connection_string( - ZTenantTimelineId { - tenant_id, - timeline_id, - }: ZTenantTimelineId, - listen_pg_addr_str: &str, - pageserver_connstr: &str, -) -> anyhow::Result { - let me_connstr = format!("postgresql://no_user@{}/no_db", listen_pg_addr_str); - let me_conf = me_connstr - .parse::() - .with_context(|| { - format!("Failed to parse pageserver connection string '{me_connstr}' as a postgres one") - })?; - let (host, port) = utils::connstring::connection_host_port(&me_conf); - Ok(format!( - "host={host} port={port} options='-c ztimelineid={timeline_id} ztenantid={tenant_id} pageserver_connstr={pageserver_connstr}'", - )) -} - // Utilities needed by various Connection-like objects pub trait TimelineTools { fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()>; From ec8861b8cc54f61d509925b67babc1af765c37ef Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 12 May 2022 19:53:07 +0300 Subject: [PATCH 0277/1022] Fix pageserver metrics names (#1682) Try to follow Prometheus style-guide https://prometheus.io/docs/practices/naming/ for metrics names. More specifically: - Use `pageserver_` prefix for all pagserver metrics - Specify `_seconds` unit in time metrics - Use unit as a suffix in other cases, such as `_hits`, `_bytes`, `_records` - Use `_total` suffix for accumulating counters (note that Histograms append that suffix internally) --- pageserver/src/layered_repository.rs | 14 +++++++------- pageserver/src/lib.rs | 2 +- pageserver/src/page_service.rs | 2 +- pageserver/src/storage_sync.rs | 4 ++-- pageserver/src/virtual_file.rs | 6 +++--- pageserver/src/walredo.rs | 8 ++++---- test_runner/fixtures/compare_fixtures.py | 4 ++-- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 6a614e184f..b02ab00a21 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -89,7 +89,7 @@ pub use crate::layered_repository::ephemeral_file::writeback as writeback_epheme // Metrics collected on operations on the storage repository. lazy_static! { static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( - "pageserver_storage_time", + "pageserver_storage_operations_seconds", "Time spent on storage operations", &["operation", "tenant_id", "timeline_id"] ) @@ -99,8 +99,8 @@ lazy_static! 
{ // Metrics collected on operations on the storage repository. lazy_static! { static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( - "pageserver_getpage_reconstruct_time", - "Time spent on storage operations", + "pageserver_getpage_reconstruct_seconds", + "Time spent in reconstruct_value", &["tenant_id", "timeline_id"] ) .expect("failed to define a metric"); @@ -108,13 +108,13 @@ lazy_static! { lazy_static! { static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!( - "materialize_page_cache_hits", + "pageserver_materialized_cache_hits_total", "Number of cache hits from materialized page cache", &["tenant_id", "timeline_id"] ) .expect("failed to define a metric"); static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!( - "wait_lsn_time", + "pageserver_wait_lsn_seconds", "Time spent waiting for WAL to arrive", &["tenant_id", "timeline_id"] ) @@ -134,12 +134,12 @@ lazy_static! { // or in testing they estimate how much we would upload if we did. lazy_static! { static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!( - "pageserver_num_persistent_files_created", + "pageserver_created_persistent_files_total", "Number of files created that are meant to be uploaded to cloud storage", ) .expect("failed to define a metric"); static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!( - "pageserver_persistent_bytes_written", + "pageserver_written_persistent_bytes_total", "Total bytes written that are meant to be uploaded to cloud storage", ) .expect("failed to define a metric"); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 83985069ec..fdce0e5c5f 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -45,7 +45,7 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61; lazy_static! { static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!( - "pageserver_live_connections_count", + "pageserver_live_connections", "Number of live network connections", &["pageserver_connection_kind"] ) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index da3dedfc84..88273cfa57 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -326,7 +326,7 @@ const TIME_BUCKETS: &[f64] = &[ lazy_static! { static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!( - "pageserver_smgr_query_time", + "pageserver_smgr_query_seconds", "Time spent on smgr query handling", &["smgr_query_type", "tenant_id", "timeline_id"], TIME_BUCKETS.into() diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index b8c6f7fdab..7755e67c8d 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -208,12 +208,12 @@ lazy_static! { ) .expect("failed to register pageserver remote storage remaining sync items int gauge"); static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!( - "pageserver_remote_storage_fatal_task_failures", + "pageserver_remote_storage_fatal_task_failures_total", "Number of critically failed tasks" ) .expect("failed to register pageserver remote storage remaining sync items int gauge"); static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!( - "pageserver_remote_storage_image_sync_time", + "pageserver_remote_storage_image_sync_seconds", "Time took to synchronize (download or upload) a whole pageserver image. 
\ Grouped by `operation_kind` (upload|download) and `status` (success|failure)", &["operation_kind", "status"], diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 4ce245a74f..37d70372b5 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -34,7 +34,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ lazy_static! { static ref STORAGE_IO_TIME: HistogramVec = register_histogram_vec!( - "pageserver_io_time", + "pageserver_io_operations_seconds", "Time spent in IO operations", &["operation", "tenant_id", "timeline_id"], STORAGE_IO_TIME_BUCKETS.into() @@ -43,8 +43,8 @@ lazy_static! { } lazy_static! { static ref STORAGE_IO_SIZE: IntGaugeVec = register_int_gauge_vec!( - "pageserver_io_size", - "Amount of bytes", + "pageserver_io_operations_bytes_total", + "Total amount of bytes read/written in IO operations", &["operation", "tenant_id", "timeline_id"] ) .expect("failed to define a metric"); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 777718b311..e556c24548 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -106,16 +106,16 @@ impl crate::walredo::WalRedoManager for DummyRedoManager { // each tenant. lazy_static! { static ref WAL_REDO_TIME: Histogram = - register_histogram!("pageserver_wal_redo_time", "Time spent on WAL redo") + register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo") .expect("failed to define a metric"); static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!( - "pageserver_wal_redo_wait_time", + "pageserver_wal_redo_wait_seconds", "Time spent waiting for access to the WAL redo process" ) .expect("failed to define a metric"); static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!( - "pageserver_wal_records_replayed", - "Number of WAL records replayed" + "pageserver_replayed_wal_records_total", + "Number of WAL records replayed in WAL redo process" ) .unwrap(); } diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index d70f57aa52..d572901ed1 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -106,9 +106,9 @@ class ZenithCompare(PgCompare): report=MetricReport.LOWER_IS_BETTER) total_files = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_num_persistent_files_created") + self.env.pageserver, "pageserver_created_persistent_files_total") total_bytes = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_persistent_bytes_written") + self.env.pageserver, "pageserver_written_persistent_bytes_total") self.zenbenchmark.record("data_uploaded", total_bytes / (1024 * 1024), "MB", From 5812e26b906d8007aed1f3d407e52d0e126c6d18 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Thu, 12 May 2022 16:33:09 -0400 Subject: [PATCH 0278/1022] Create an initial timeline on CLI tenant creation (#1689) Resolves #1655 --- neon_local/src/main.rs | 23 +++++++++++++++++++ .../batch_others/test_ancestor_branch.py | 1 - test_runner/batch_others/test_zenith_cli.py | 12 +++++++++- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index 8b54054080..75944fe107 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -540,6 +540,29 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an "tenant {} successfully created on the pageserver", new_tenant_id ); + + // Create an initial timeline for the new tenant + let 
new_timeline_id = parse_timeline_id(create_match)?; + let timeline = pageserver + .timeline_create(new_tenant_id, new_timeline_id, None, None)? + .context(format!( + "Failed to create initial timeline for tenant {new_tenant_id}" + ))?; + let new_timeline_id = timeline.timeline_id; + let last_record_lsn = timeline + .local + .context(format!("Failed to get last record LSN: no local timeline info for timeline {new_timeline_id}"))? + .last_record_lsn; + + env.register_branch_mapping( + DEFAULT_BRANCH_NAME.to_string(), + new_tenant_id, + new_timeline_id, + )?; + + println!( + "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}", + ); } Some(("config", create_match)) => { let tenant_id = get_tenant_id(create_match, env)?; diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index d6b073492d..982921084f 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -35,7 +35,6 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: pscur.execute("failpoints flush-frozen=sleep(10000)") - env.zenith_cli.create_timeline(f'main', tenant_id=tenant) pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() branch0_cur.execute("SHOW zenith.zenith_timeline") diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_zenith_cli.py index 091d9ac8ba..81567dba12 100644 --- a/test_runner/batch_others/test_zenith_cli.py +++ b/test_runner/batch_others/test_zenith_cli.py @@ -1,7 +1,7 @@ import uuid import requests -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient +from fixtures.zenith_fixtures import DEFAULT_BRANCH_NAME, ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient from typing import cast @@ -83,6 +83,16 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv): assert tenant2.hex in tenants +def test_cli_tenant_create(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + tenant_id = env.zenith_cli.create_tenant() + timelines = env.zenith_cli.list_timelines(tenant_id) + + # an initial timeline should be created upon tenant creation + assert len(timelines) == 1 + assert timelines[0][0] == DEFAULT_BRANCH_NAME + + def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): # Start with single sk zenith_env_builder.num_safekeepers = 1 From ae20751724779986632a6cbc316b50c7568ff2d5 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Thu, 12 May 2022 17:27:08 -0400 Subject: [PATCH 0279/1022] update `ZenithCli::create_tenant` return signature (#1692) to include the initial timeline's ID in addition to the new tenant's ID. 
Context: follow-up of https://github.com/neondatabase/neon/pull/1689 --- .../batch_others/test_ancestor_branch.py | 2 +- test_runner/batch_others/test_tenant_conf.py | 2 +- .../batch_others/test_tenant_relocation.py | 2 +- test_runner/batch_others/test_tenants.py | 4 ++-- test_runner/batch_others/test_zenith_cli.py | 6 +++--- test_runner/fixtures/zenith_fixtures.py | 17 +++++++++++------ .../performance/test_bulk_tenant_create.py | 2 +- 7 files changed, 20 insertions(+), 15 deletions(-) diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index 982921084f..c07b9d6dd1 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -21,7 +21,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): # Override defaults, 1M gc_horizon and 4M checkpoint_distance. # Extend compaction_period and gc_period to disable background compaction and gc. - tenant = env.zenith_cli.create_tenant( + tenant, _ = env.zenith_cli.create_tenant( conf={ 'gc_period': '10 m', 'gc_horizon': '1048576', diff --git a/test_runner/batch_others/test_tenant_conf.py b/test_runner/batch_others/test_tenant_conf.py index b85a541f10..d627d8a6ee 100644 --- a/test_runner/batch_others/test_tenant_conf.py +++ b/test_runner/batch_others/test_tenant_conf.py @@ -16,7 +16,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' env = zenith_env_builder.init_start() """Test per tenant configuration""" - tenant = env.zenith_cli.create_tenant(conf={ + tenant, _ = env.zenith_cli.create_tenant(conf={ 'checkpoint_distance': '20000', 'gc_period': '30sec', }) diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 7e71c0a157..20694a240c 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -107,7 +107,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, # create folder for remote storage mock remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage' - tenant = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) + tenant, _ = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) log.info("tenant to relocate %s", tenant) # attach does not download ancestor branches (should it?), just use root branch for now diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py index 682af8de49..1b593cfee3 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/batch_others/test_tenants.py @@ -12,8 +12,8 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_safekeep env = zenith_env_builder.init_start() """Tests tenants with and without wal acceptors""" - tenant_1 = env.zenith_cli.create_tenant() - tenant_2 = env.zenith_cli.create_tenant() + tenant_1, _ = env.zenith_cli.create_tenant() + tenant_2, _ = env.zenith_cli.create_tenant() env.zenith_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', tenant_id=tenant_1) diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_zenith_cli.py index 81567dba12..bff17fa679 100644 --- a/test_runner/batch_others/test_zenith_cli.py +++ b/test_runner/batch_others/test_zenith_cli.py @@ -64,13 +64,13 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv): helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant - 
tenant1 = env.zenith_cli.create_tenant() + tenant1, _ = env.zenith_cli.create_tenant() # check tenant1 appeared helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant - tenant2 = env.zenith_cli.create_tenant() + tenant2, _ = env.zenith_cli.create_tenant() # check tenant2 appeared helper_compare_tenant_list(pageserver_http_client, env) @@ -85,7 +85,7 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv): def test_cli_tenant_create(zenith_simple_env: ZenithEnv): env = zenith_simple_env - tenant_id = env.zenith_cli.create_tenant() + tenant_id, _ = env.zenith_cli.create_tenant() timelines = env.zenith_cli.list_timelines(tenant_id) # an initial timeline should be created upon tenant creation diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 3bb7c606d3..fe20f1abbf 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -831,20 +831,25 @@ class ZenithCli: def create_tenant(self, tenant_id: Optional[uuid.UUID] = None, - conf: Optional[Dict[str, str]] = None) -> uuid.UUID: + timeline_id: Optional[uuid.UUID] = None, + conf: Optional[Dict[str, str]] = None) -> Tuple[uuid.UUID, uuid.UUID]: """ Creates a new tenant, returns its id and its initial timeline's id. """ if tenant_id is None: tenant_id = uuid.uuid4() + if timeline_id is None: + timeline_id = uuid.uuid4() if conf is None: - res = self.raw_cli(['tenant', 'create', '--tenant-id', tenant_id.hex]) + res = self.raw_cli([ + 'tenant', 'create', '--tenant-id', tenant_id.hex, '--timeline-id', timeline_id.hex + ]) else: - res = self.raw_cli( - ['tenant', 'create', '--tenant-id', tenant_id.hex] + - sum(list(map(lambda kv: (['-c', kv[0] + ':' + kv[1]]), conf.items())), [])) + res = self.raw_cli([ + 'tenant', 'create', '--tenant-id', tenant_id.hex, '--timeline-id', timeline_id.hex + ] + sum(list(map(lambda kv: (['-c', kv[0] + ':' + kv[1]]), conf.items())), [])) res.check_returncode() - return tenant_id + return tenant_id, timeline_id def config_tenant(self, tenant_id: uuid.UUID, conf: Dict[str, str]): """ diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index f0729d3a07..0e16d3e749 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -30,7 +30,7 @@ def test_bulk_tenant_create( for i in range(tenants_count): start = timeit.default_timer() - tenant = env.zenith_cli.create_tenant() + tenant, _ = env.zenith_cli.create_tenant() env.zenith_cli.create_timeline( f'test_bulk_tenant_create_{tenants_count}_{i}_{use_safekeepers}', tenant_id=tenant) From 85884a1599895a9875c7f0139854aa7dae21148e Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 13 May 2022 00:42:13 +0300 Subject: [PATCH 0280/1022] Disable tenant relocation python test --- test_runner/batch_others/test_tenant_relocation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 20694a240c..279b3a0a25 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -95,6 +95,10 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve log.info('load thread stopped') +@pytest.mark.skip( + reason= + "needs to replace callmemaybe call with better idea how to migrate timelines between pageservers" +) @pytest.mark.parametrize('with_load', ['with_load', 
'without_load']) def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, port_distributor: PortDistributor, From 0030da57a8c6deb9795d8d9789b9996a976ad9c9 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Fri, 13 May 2022 02:24:08 +0300 Subject: [PATCH 0281/1022] compute-tools: grant rw priveleges to the all created users --- compute_tools/src/spec.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 27114b8202..334e0a9e05 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -136,13 +136,20 @@ pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> { xact.execute(query.as_str(), &[])?; } } else { - info!("role name {}", &name); + info!("role name: '{}'", &name); let mut query: String = format!("CREATE ROLE {} ", name.quote()); - info!("role create query {}", &query); + info!("role create query: '{}'", &query); info_print!(" -> create"); query.push_str(&role.to_pg_options()); xact.execute(query.as_str(), &[])?; + + let grant_query = format!( + "grant pg_read_all_data, pg_write_all_data to {}", + name.quote() + ); + xact.execute(grant_query.as_str(), &[])?; + info!("role grant query: '{}'", &grant_query); } info_print!("\n"); From 51c0f9ab2b394a31358cfd187c7fdeb34372553e Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 13 May 2022 00:56:15 +0300 Subject: [PATCH 0282/1022] Force git version to be up to date via decl macro --- Cargo.lock | 4 ++++ libs/utils/build.rs | 3 --- libs/utils/src/lib.rs | 20 ++++++++++++++------ neon_local/Cargo.toml | 1 + neon_local/src/main.rs | 3 ++- pageserver/Cargo.toml | 1 + pageserver/src/bin/dump_layerfile.rs | 4 +++- pageserver/src/bin/pageserver.rs | 9 +++++---- pageserver/src/bin/update_metadata.rs | 4 +++- proxy/Cargo.toml | 1 + proxy/src/main.rs | 6 ++++-- safekeeper/Cargo.toml | 1 + safekeeper/src/bin/safekeeper.rs | 6 ++++-- 13 files changed, 43 insertions(+), 20 deletions(-) delete mode 100644 libs/utils/build.rs diff --git a/Cargo.lock b/Cargo.lock index 148517a777..e1e1a0f067 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1582,6 +1582,7 @@ dependencies = [ "clap 3.0.14", "comfy-table", "control_plane", + "git-version", "pageserver", "postgres", "postgres_ffi", @@ -1773,6 +1774,7 @@ dependencies = [ "daemonize", "fail", "futures", + "git-version", "hex", "hex-literal", "humantime", @@ -2164,6 +2166,7 @@ dependencies = [ "bytes", "clap 3.0.14", "futures", + "git-version", "hashbrown", "hex", "hmac 0.12.1", @@ -2616,6 +2619,7 @@ dependencies = [ "daemonize", "etcd_broker", "fs2", + "git-version", "hex", "humantime", "hyper", diff --git a/libs/utils/build.rs b/libs/utils/build.rs deleted file mode 100644 index ee3346ae66..0000000000 --- a/libs/utils/build.rs +++ /dev/null @@ -1,3 +0,0 @@ -fn main() { - println!("cargo:rerun-if-env-changed=GIT_VERSION"); -} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index de266efe64..0398ce5e15 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -76,9 +76,17 @@ pub mod signals; // so if we changed the index state git_version will pick that up and rerun the macro. // // Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`. 
-use git_version::git_version; -pub const GIT_VERSION: &str = git_version!( - prefix = "git:", - fallback = concat!("git-env:", env!("GIT_VERSION")), - args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha -); +#[macro_export] +// TODO kb add identifier into the capture +macro_rules! project_git_version { + () => { + const GIT_VERSION: &str = git_version::git_version!( + prefix = "git:", + fallback = concat!( + "git-env:", + env!("GIT_VERSION", "Missing GIT_VERSION envvar") + ), + args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha + ); + }; +} diff --git a/neon_local/Cargo.toml b/neon_local/Cargo.toml index 78d339789f..8ebd7d5c17 100644 --- a/neon_local/Cargo.toml +++ b/neon_local/Cargo.toml @@ -9,6 +9,7 @@ anyhow = "1.0" serde_json = "1" comfy-table = "5.0.1" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +git-version = "0.3.5" # FIXME: 'pageserver' is needed for BranchInfo. Refactor pageserver = { path = "../pageserver" } diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index 75944fe107..2f470309ff 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -21,7 +21,7 @@ use utils::{ lsn::Lsn, postgres_backend::AuthType, zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, - GIT_VERSION, + project_git_version, }; use pageserver::timelines::TimelineInfo; @@ -30,6 +30,7 @@ use pageserver::timelines::TimelineInfo; const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1); const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; +project_git_version!(); fn default_conf() -> String { format!( diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index d4cceafc61..9cc8444531 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -52,6 +52,7 @@ nix = "0.23" once_cell = "1.8.0" crossbeam-utils = "0.8.5" fail = "0.5.0" +git-version = "0.3.5" postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } diff --git a/pageserver/src/bin/dump_layerfile.rs b/pageserver/src/bin/dump_layerfile.rs index af73ef6bdb..cb08acadff 100644 --- a/pageserver/src/bin/dump_layerfile.rs +++ b/pageserver/src/bin/dump_layerfile.rs @@ -7,7 +7,9 @@ use pageserver::layered_repository::dump_layerfile_from_path; use pageserver::page_cache; use pageserver::virtual_file; use std::path::PathBuf; -use utils::GIT_VERSION; +use utils::project_git_version; + +project_git_version!(); fn main() -> Result<()> { let arg_matches = App::new("Zenith dump_layerfile utility") diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 9cb7e6f13d..73ef5c5f4d 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -20,17 +20,18 @@ use utils::{ http::endpoint, logging, postgres_backend::AuthType, + project_git_version, shutdown::exit_now, signals::{self, Signal}, tcp_listener, zid::{ZTenantId, ZTimelineId}, - GIT_VERSION, }; +project_git_version!(); + fn version() -> String { format!( - "{} profiling:{} failpoints:{}", - GIT_VERSION, + "{GIT_VERSION} profiling:{} failpoints:{}", cfg!(feature = "profiling"), fail::has_failpoints() ) @@ -217,7 +218,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // Initialize logger let log_file = logging::init(LOG_FILE_NAME, daemonize)?; - info!("version: {}", GIT_VERSION); + info!("version: {GIT_VERSION}"); // TODO: Check that it looks like a valid repository before going further diff --git 
a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs index fae5e5c2e3..3e69ad5c66 100644 --- a/pageserver/src/bin/update_metadata.rs +++ b/pageserver/src/bin/update_metadata.rs @@ -6,7 +6,9 @@ use clap::{App, Arg}; use pageserver::layered_repository::metadata::TimelineMetadata; use std::path::PathBuf; use std::str::FromStr; -use utils::{lsn::Lsn, GIT_VERSION}; +use utils::{lsn::Lsn, project_git_version}; + +project_git_version!(); fn main() -> Result<()> { let arg_matches = App::new("Zenith update metadata utility") diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 43880d645a..4e45698e3e 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -33,6 +33,7 @@ tokio = { version = "1.17", features = ["macros"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-rustls = "0.23.0" url = "2.2.2" +git-version = "0.3.5" utils = { path = "../libs/utils" } metrics = { path = "../libs/metrics" } diff --git a/proxy/src/main.rs b/proxy/src/main.rs index fc2a368b85..7d5105c88f 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -25,7 +25,9 @@ use config::ProxyConfig; use futures::FutureExt; use std::{future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; -use utils::GIT_VERSION; +use utils::project_git_version; + +project_git_version!(); /// Flattens `Result>` into `Result`. async fn flatten_err( @@ -124,7 +126,7 @@ async fn main() -> anyhow::Result<()> { auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, })); - println!("Version: {}", GIT_VERSION); + println!("Version: {GIT_VERSION}"); // Check that we can bind to address before further initialization println!("Starting http on {}", http_address); diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 5e1ceee02e..417cf58cd5 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -29,6 +29,7 @@ hex = "0.4.3" const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-util = { version = "0.7", features = ["io"] } +git-version = "0.3.5" postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index d0df7093ff..06a15a90b0 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -22,11 +22,13 @@ use safekeeper::SafeKeeperConf; use safekeeper::{broker, callmemaybe}; use safekeeper::{http, s3_offload}; use utils::{ - http::endpoint, logging, shutdown::exit_now, signals, tcp_listener, zid::ZNodeId, GIT_VERSION, + http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener, + zid::ZNodeId, }; const LOCK_FILE_NAME: &str = "safekeeper.lock"; const ID_FILE_NAME: &str = "safekeeper.id"; +project_git_version!(); fn main() -> Result<()> { metrics::set_common_metrics_prefix("safekeeper"); @@ -193,7 +195,7 @@ fn main() -> Result<()> { fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { let log_file = logging::init("safekeeper.log", conf.daemonize)?; - info!("version: {}", GIT_VERSION); + info!("version: {GIT_VERSION}"); // Prevent running multiple safekeepers on the same directory let lock_file_path = conf.workdir.join(LOCK_FILE_NAME); From b683308791d81f005089aed35981c73d78fbb93c Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 13 May 2022 01:05:55 +0300 Subject: [PATCH 0283/1022] Return 
GIT_VERSION back to storage binaries --- libs/utils/src/lib.rs | 55 +++++++++++++++------------ neon_local/src/main.rs | 4 +- pageserver/src/bin/dump_layerfile.rs | 2 +- pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/bin/update_metadata.rs | 2 +- proxy/src/main.rs | 2 +- safekeeper/src/bin/safekeeper.rs | 2 +- 7 files changed, 37 insertions(+), 32 deletions(-) diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 0398ce5e15..4810909712 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -54,33 +54,38 @@ pub mod nonblock; // Default signal handling pub mod signals; -// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages -// -// we have several cases: -// * building locally from git repo -// * building in CI from git repo -// * building in docker (either in CI or locally) -// -// One thing to note is that .git is not available in docker (and it is bad to include it there). -// So everything becides docker build is covered by git_version crate. -// For docker use environment variable to pass git version, which is then retrieved by buildscript (build.rs). -// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro. -// Git version received from environment variable used as a fallback in git_version invokation. -// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option. -// So the build script will be run only when GIT_VERSION envvar has changed. -// -// Why not to use buildscript to get git commit sha directly without procmacro from different crate? -// Caching and workspaces complicates that. In case `utils` is not -// recompiled due to caching then version may become outdated. -// git_version crate handles that case by introducing a dependency on .git internals via include_bytes! macro, -// so if we changed the index state git_version will pick that up and rerun the macro. -// -// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`. +/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages +/// +/// we have several cases: +/// * building locally from git repo +/// * building in CI from git repo +/// * building in docker (either in CI or locally) +/// +/// One thing to note is that .git is not available in docker (and it is bad to include it there). +/// So everything becides docker build is covered by git_version crate, and docker uses a `GIT_VERSION` argument to get the value required. +/// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro. +/// Git version received from environment variable used as a fallback in git_version invokation. +/// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option. +/// So the build script will be run only when GIT_VERSION envvar has changed. +/// +/// Why not to use buildscript to get git commit sha directly without procmacro from different crate? +/// Caching and workspaces complicates that. In case `utils` is not +/// recompiled due to caching then version may become outdated. +/// git_version crate handles that case by introducing a dependency on .git internals via include_bytes! macro, +/// so if we changed the index state git_version will pick that up and rerun the macro. 
+/// +/// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`. +/// +/// ############################################################################################# +/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details. +/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036 +/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains +/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation. +/// The problem needs further investigation and regular `const` declaration instead of a macro. #[macro_export] -// TODO kb add identifier into the capture macro_rules! project_git_version { - () => { - const GIT_VERSION: &str = git_version::git_version!( + ($const_identifier:ident) => { + const $const_identifier: &str = git_version::git_version!( prefix = "git:", fallback = concat!( "git-env:", diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index 2f470309ff..6538cdefc4 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -20,8 +20,8 @@ use utils::{ auth::{Claims, Scope}, lsn::Lsn, postgres_backend::AuthType, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, project_git_version, + zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use pageserver::timelines::TimelineInfo; @@ -30,7 +30,7 @@ use pageserver::timelines::TimelineInfo; const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1); const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; -project_git_version!(); +project_git_version!(GIT_VERSION); fn default_conf() -> String { format!( diff --git a/pageserver/src/bin/dump_layerfile.rs b/pageserver/src/bin/dump_layerfile.rs index cb08acadff..87390a1b06 100644 --- a/pageserver/src/bin/dump_layerfile.rs +++ b/pageserver/src/bin/dump_layerfile.rs @@ -9,7 +9,7 @@ use pageserver::virtual_file; use std::path::PathBuf; use utils::project_git_version; -project_git_version!(); +project_git_version!(GIT_VERSION); fn main() -> Result<()> { let arg_matches = App::new("Zenith dump_layerfile utility") diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 73ef5c5f4d..190e38e341 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -27,7 +27,7 @@ use utils::{ zid::{ZTenantId, ZTimelineId}, }; -project_git_version!(); +project_git_version!(GIT_VERSION); fn version() -> String { format!( diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs index 3e69ad5c66..983fdb8647 100644 --- a/pageserver/src/bin/update_metadata.rs +++ b/pageserver/src/bin/update_metadata.rs @@ -8,7 +8,7 @@ use std::path::PathBuf; use std::str::FromStr; use utils::{lsn::Lsn, project_git_version}; -project_git_version!(); +project_git_version!(GIT_VERSION); fn main() -> Result<()> { let arg_matches = App::new("Zenith update metadata utility") diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 7d5105c88f..f46e19e5d6 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -27,7 +27,7 @@ use std::{future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; use utils::project_git_version; -project_git_version!(); +project_git_version!(GIT_VERSION); /// Flattens `Result>` into `Result`. 
async fn flatten_err( diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 06a15a90b0..65e71fcc74 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -28,7 +28,7 @@ use utils::{ const LOCK_FILE_NAME: &str = "safekeeper.lock"; const ID_FILE_NAME: &str = "safekeeper.id"; -project_git_version!(); +project_git_version!(GIT_VERSION); fn main() -> Result<()> { metrics::set_common_metrics_prefix("safekeeper"); From 22d997049c4cf5415b208a6fb397e1c3174980b8 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Fri, 6 May 2022 20:03:28 +0300 Subject: [PATCH 0284/1022] libs/utils/http/request: add ensure_no_body --- libs/utils/src/http/request.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 3bc8993c26..8e3d357397 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -1,7 +1,7 @@ use std::str::FromStr; use super::error::ApiError; -use hyper::{Body, Request}; +use hyper::{body::HttpBody, Body, Request}; use routerify::ext::RequestExt; pub fn get_request_param<'a>( @@ -31,3 +31,10 @@ pub fn parse_request_param( ))), } } + +pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> { + match request.body_mut().data().await { + Some(_) => Err(ApiError::BadRequest("Unexpected request body".into())), + None => Ok(()), + } +} From 07b85e7cfcf7d69c12e528ddde42d51444bbed27 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 12 May 2022 19:55:01 +0300 Subject: [PATCH 0285/1022] Safekeeper refactor: move callmemaybe_tx from SafekeeperPostgresBackend to Timeline --- safekeeper/src/bin/safekeeper.rs | 8 +-- safekeeper/src/handler.rs | 8 +-- safekeeper/src/receive_wal.rs | 11 +--- safekeeper/src/send_wal.rs | 6 +-- safekeeper/src/timeline.rs | 90 ++++++++++++++++++-------------- safekeeper/src/wal_service.rs | 19 ++----- 6 files changed, 66 insertions(+), 76 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 65e71fcc74..6955d2aa5c 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -17,6 +17,7 @@ use url::{ParseError, Url}; use safekeeper::control_file::{self}; use safekeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR}; use safekeeper::remove_wal; +use safekeeper::timeline::GlobalTimelines; use safekeeper::wal_service; use safekeeper::SafeKeeperConf; use safekeeper::{broker, callmemaybe}; @@ -251,6 +252,8 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; + let (callmemaybe_tx, callmemaybe_rx) = mpsc::unbounded_channel(); + GlobalTimelines::set_callmemaybe_tx(callmemaybe_tx); let conf_ = conf.clone(); threads.push( @@ -279,13 +282,12 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b ); } - let (tx, rx) = mpsc::unbounded_channel(); let conf_cloned = conf.clone(); let safekeeper_thread = thread::Builder::new() .name("Safekeeper thread".into()) .spawn(|| { // thread code - let thread_result = wal_service::thread_main(conf_cloned, pg_listener, tx); + let thread_result = wal_service::thread_main(conf_cloned, pg_listener); if let Err(e) = thread_result { info!("safekeeper thread terminated: {}", e); } @@ -299,7 +301,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b .name("callmemaybe thread".into()) .spawn(|| { // thread code - let thread_result = 
callmemaybe::thread_main(conf_cloned, rx); + let thread_result = callmemaybe::thread_main(conf_cloned, callmemaybe_rx); if let Err(e) = thread_result { error!("callmemaybe thread terminated: {}", e); } diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 7d86523b0e..9af78661f9 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -21,9 +21,6 @@ use utils::{ zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; -use crate::callmemaybe::CallmeEvent; -use tokio::sync::mpsc::UnboundedSender; - /// Safekeeper handler of postgres commands pub struct SafekeeperPostgresHandler { pub conf: SafeKeeperConf, @@ -33,8 +30,6 @@ pub struct SafekeeperPostgresHandler { pub ztimelineid: Option, pub timeline: Option>, pageserver_connstr: Option, - //sender to communicate with callmemaybe thread - pub tx: UnboundedSender, } /// Parsed Postgres command. @@ -140,7 +135,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { } impl SafekeeperPostgresHandler { - pub fn new(conf: SafeKeeperConf, tx: UnboundedSender) -> Self { + pub fn new(conf: SafeKeeperConf) -> Self { SafekeeperPostgresHandler { conf, appname: None, @@ -148,7 +143,6 @@ impl SafekeeperPostgresHandler { ztimelineid: None, timeline: None, pageserver_connstr: None, - tx, } } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 3ad99ab0df..0ef335c9ed 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -5,7 +5,6 @@ use anyhow::{anyhow, bail, Result}; use bytes::BytesMut; -use tokio::sync::mpsc::UnboundedSender; use tracing::*; use crate::timeline::Timeline; @@ -28,8 +27,6 @@ use utils::{ sock_split::ReadStream, }; -use crate::callmemaybe::CallmeEvent; - pub struct ReceiveWalConn<'pg> { /// Postgres connection pg_backend: &'pg mut PostgresBackend, @@ -91,10 +88,9 @@ impl<'pg> ReceiveWalConn<'pg> { // Register the connection and defer unregister. spg.timeline .get() - .on_compute_connect(self.pageserver_connstr.as_ref(), &spg.tx)?; + .on_compute_connect(self.pageserver_connstr.as_ref())?; let _guard = ComputeConnectionGuard { timeline: Arc::clone(spg.timeline.get()), - callmemaybe_tx: spg.tx.clone(), }; let mut next_msg = Some(next_msg); @@ -194,13 +190,10 @@ impl ProposerPollStream { struct ComputeConnectionGuard { timeline: Arc, - callmemaybe_tx: UnboundedSender, } impl Drop for ComputeConnectionGuard { fn drop(&mut self) { - self.timeline - .on_compute_disconnect(&self.callmemaybe_tx) - .unwrap(); + self.timeline.on_compute_disconnect().unwrap(); } } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 960f70d154..d52dd6ea57 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -264,13 +264,13 @@ impl ReplicationConn { } else { let pageserver_connstr = pageserver_connstr.expect("there should be a pageserver connection string since this is not a wal_proposer_recovery"); let zttid = spg.timeline.get().zttid; - let tx_clone = spg.tx.clone(); + let tx_clone = spg.timeline.get().callmemaybe_tx.clone(); let subscription_key = SubscriptionStateKey::new( zttid.tenant_id, zttid.timeline_id, pageserver_connstr.clone(), ); - spg.tx + tx_clone .send(CallmeEvent::Pause(subscription_key)) .unwrap_or_else(|e| { error!("failed to send Pause request to callmemaybe thread {}", e); @@ -315,7 +315,7 @@ impl ReplicationConn { } else { // TODO: also check once in a while whether we are walsender // to right pageserver. - if spg.timeline.get().check_deactivate(replica_id, &spg.tx)? 
{ + if spg.timeline.get().check_deactivate(replica_id)? { // Shut down, timeline is suspended. // TODO create proper error type for this bail!("end streaming to {:?}", spg.appname); diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index a12f628e06..c73d6af4ac 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -275,15 +275,21 @@ impl SharedState { /// Database instance (tenant) pub struct Timeline { pub zttid: ZTenantTimelineId, + pub callmemaybe_tx: UnboundedSender, mutex: Mutex, /// conditional variable used to notify wal senders cond: Condvar, } impl Timeline { - fn new(zttid: ZTenantTimelineId, shared_state: SharedState) -> Timeline { + fn new( + zttid: ZTenantTimelineId, + callmemaybe_tx: UnboundedSender, + shared_state: SharedState, + ) -> Timeline { Timeline { zttid, + callmemaybe_tx, mutex: Mutex::new(shared_state), cond: Condvar::new(), } @@ -292,34 +298,27 @@ impl Timeline { /// Register compute connection, starting timeline-related activity if it is /// not running yet. /// Can fail only if channel to a static thread got closed, which is not normal at all. - pub fn on_compute_connect( - &self, - pageserver_connstr: Option<&String>, - callmemaybe_tx: &UnboundedSender, - ) -> Result<()> { + pub fn on_compute_connect(&self, pageserver_connstr: Option<&String>) -> Result<()> { let mut shared_state = self.mutex.lock().unwrap(); shared_state.num_computes += 1; // FIXME: currently we always adopt latest pageserver connstr, but we // should have kind of generations assigned by compute to distinguish // the latest one or even pass it through consensus to reliably deliver // to all safekeepers. - shared_state.activate(&self.zttid, pageserver_connstr, callmemaybe_tx)?; + shared_state.activate(&self.zttid, pageserver_connstr, &self.callmemaybe_tx)?; Ok(()) } /// De-register compute connection, shutting down timeline activity if /// pageserver doesn't need catchup. /// Can fail only if channel to a static thread got closed, which is not normal at all. - pub fn on_compute_disconnect( - &self, - callmemaybe_tx: &UnboundedSender, - ) -> Result<()> { + pub fn on_compute_disconnect(&self) -> Result<()> { let mut shared_state = self.mutex.lock().unwrap(); shared_state.num_computes -= 1; // If there is no pageserver, can suspend right away; otherwise let // walsender do that. if shared_state.num_computes == 0 && shared_state.pageserver_connstr.is_none() { - shared_state.deactivate(&self.zttid, callmemaybe_tx)?; + shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; } Ok(()) } @@ -327,11 +326,7 @@ impl Timeline { /// Deactivate tenant if there is no computes and pageserver is caughtup, /// assuming the pageserver status is in replica_id. /// Returns true if deactivated. - pub fn check_deactivate( - &self, - replica_id: usize, - callmemaybe_tx: &UnboundedSender, - ) -> Result { + pub fn check_deactivate(&self, replica_id: usize) -> Result { let mut shared_state = self.mutex.lock().unwrap(); if !shared_state.active { // already suspended @@ -343,7 +338,7 @@ impl Timeline { (replica_state.last_received_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. 
replica_state.last_received_lsn >= shared_state.sk.inmem.commit_lsn); if deactivate { - shared_state.deactivate(&self.zttid, callmemaybe_tx)?; + shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; return Ok(true); } } @@ -508,22 +503,35 @@ impl TimelineTools for Option> { } } +struct GlobalTimelinesState { + timelines: HashMap>, + callmemaybe_tx: Option>, +} + lazy_static! { - pub static ref TIMELINES: Mutex>> = - Mutex::new(HashMap::new()); + static ref TIMELINES_STATE: Mutex = Mutex::new(GlobalTimelinesState { + timelines: HashMap::new(), + callmemaybe_tx: None + }); } /// A zero-sized struct used to manage access to the global timelines map. pub struct GlobalTimelines; impl GlobalTimelines { + pub fn set_callmemaybe_tx(callmemaybe_tx: UnboundedSender) { + let mut state = TIMELINES_STATE.lock().unwrap(); + assert!(state.callmemaybe_tx.is_none()); + state.callmemaybe_tx = Some(callmemaybe_tx); + } + fn create_internal( - mut timelines: MutexGuard>>, + mut state: MutexGuard, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, peer_ids: Vec, ) -> Result> { - match timelines.get(&zttid) { + match state.timelines.get(&zttid) { Some(_) => bail!("timeline {} already exists", zttid), None => { // TODO: check directory existence @@ -532,8 +540,12 @@ impl GlobalTimelines { let shared_state = SharedState::create(conf, &zttid, peer_ids) .context("failed to create shared state")?; - let new_tli = Arc::new(Timeline::new(zttid, shared_state)); - timelines.insert(zttid, Arc::clone(&new_tli)); + let new_tli = Arc::new(Timeline::new( + zttid, + state.callmemaybe_tx.as_ref().unwrap().clone(), + shared_state, + )); + state.timelines.insert(zttid, Arc::clone(&new_tli)); Ok(new_tli) } } @@ -544,20 +556,20 @@ impl GlobalTimelines { zttid: ZTenantTimelineId, peer_ids: Vec, ) -> Result> { - let timelines = TIMELINES.lock().unwrap(); - GlobalTimelines::create_internal(timelines, conf, zttid, peer_ids) + let state = TIMELINES_STATE.lock().unwrap(); + GlobalTimelines::create_internal(state, conf, zttid, peer_ids) } - /// Get a timeline with control file loaded from the global TIMELINES map. + /// Get a timeline with control file loaded from the global TIMELINES_STATE.timelines map. /// If control file doesn't exist and create=false, bails out. pub fn get( conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool, ) -> Result> { - let mut timelines = TIMELINES.lock().unwrap(); + let mut state = TIMELINES_STATE.lock().unwrap(); - match timelines.get(&zttid) { + match state.timelines.get(&zttid) { Some(result) => Ok(Arc::clone(result)), None => { let shared_state = @@ -573,20 +585,19 @@ impl GlobalTimelines { .contains("No such file or directory") && create { - return GlobalTimelines::create_internal( - timelines, - conf, - zttid, - vec![], - ); + return GlobalTimelines::create_internal(state, conf, zttid, vec![]); } else { return Err(error); } } }; - let new_tli = Arc::new(Timeline::new(zttid, shared_state)); - timelines.insert(zttid, Arc::clone(&new_tli)); + let new_tli = Arc::new(Timeline::new( + zttid, + state.callmemaybe_tx.as_ref().unwrap().clone(), + shared_state, + )); + state.timelines.insert(zttid, Arc::clone(&new_tli)); Ok(new_tli) } } @@ -594,8 +605,9 @@ impl GlobalTimelines { /// Get ZTenantTimelineIDs of all active timelines. 
pub fn get_active_timelines() -> Vec { - let timelines = TIMELINES.lock().unwrap(); - timelines + let state = TIMELINES_STATE.lock().unwrap(); + state + .timelines .iter() .filter(|&(_, tli)| tli.is_active()) .map(|(zttid, _)| *zttid) diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 468ac28526..5980160788 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -8,29 +8,22 @@ use std::net::{TcpListener, TcpStream}; use std::thread; use tracing::*; -use crate::callmemaybe::CallmeEvent; use crate::handler::SafekeeperPostgresHandler; use crate::SafeKeeperConf; -use tokio::sync::mpsc::UnboundedSender; use utils::postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. -pub fn thread_main( - conf: SafeKeeperConf, - listener: TcpListener, - tx: UnboundedSender, -) -> Result<()> { +pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> Result<()> { loop { match listener.accept() { Ok((socket, peer_addr)) => { debug!("accepted connection from {}", peer_addr); let conf = conf.clone(); - let tx_clone = tx.clone(); let _ = thread::Builder::new() .name("WAL service thread".into()) .spawn(move || { - if let Err(err) = handle_socket(socket, conf, tx_clone) { + if let Err(err) = handle_socket(socket, conf) { error!("connection handler exited: {}", err); } }) @@ -51,16 +44,12 @@ fn get_tid() -> u64 { /// This is run by `thread_main` above, inside a background thread. /// -fn handle_socket( - socket: TcpStream, - conf: SafeKeeperConf, - tx: UnboundedSender, -) -> Result<()> { +fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<()> { let _enter = info_span!("", tid = ?get_tid()).entered(); socket.set_nodelay(true)?; - let mut conn_handler = SafekeeperPostgresHandler::new(conf, tx); + let mut conn_handler = SafekeeperPostgresHandler::new(conf); let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, false)?; // libpq replication protocol between safekeeper and replicas/pagers pgbackend.run(&mut conn_handler)?; From bf899a57d9a2b20ba812a4002c0ac3234f064d26 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 12 May 2022 23:40:29 +0300 Subject: [PATCH 0286/1022] Safekeeper: add timeline/tenant force delete HTTP endpoings (closes #895) * There is no auth in Safekeeper HTTP at all currently, so simply calling `check_permission` is not enough. * There are no checks of Safekeeper still working with the data, as "still working" is burry now: a timeline may be "active" while there are no compute nodes and all data is propagated. * Still, callmemaybe is deactivated, and timeline is removed from the internal map. It can easily sneak back in case of race conditions and implicit creations, though. 
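For illustration only, a minimal sketch of driving the new force-delete endpoints over plain HTTP with Python `requests` (the safekeeper address, tenant id and timeline id below are placeholders; the `SafekeeperHttpClient.timeline_delete_force` / `tenant_delete_force` test helpers added later in this patch wrap the same calls):

    import requests

    # Placeholder safekeeper HTTP address and ids -- substitute real values.
    SAFEKEEPER_HTTP = "http://localhost:7676"
    TENANT_ID = "0" * 32
    TIMELINE_ID = "1" * 32

    # Force-delete one timeline; the JSON reply reports whether the data
    # directory existed and whether the timeline was active.
    resp = requests.delete(f"{SAFEKEEPER_HTTP}/v1/tenant/{TENANT_ID}/timeline/{TIMELINE_ID}")
    resp.raise_for_status()
    print(resp.json())  # e.g. {'dir_existed': True, 'was_active': False}

    # Force-delete all timelines of a tenant; the reply maps timeline id -> result.
    resp = requests.delete(f"{SAFEKEEPER_HTTP}/v1/tenant/{TENANT_ID}")
    resp.raise_for_status()
    print(resp.json())
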
--- safekeeper/src/http/routes.rs | 48 +++++++- safekeeper/src/lib.rs | 9 +- safekeeper/src/timeline.rs | 98 ++++++++++++++- test_runner/batch_others/test_wal_acceptor.py | 113 ++++++++++++++++++ test_runner/fixtures/zenith_fixtures.py | 15 +++ 5 files changed, 277 insertions(+), 6 deletions(-) diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index e731db5617..62fbd2ff2f 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -3,19 +3,20 @@ use hyper::{Body, Request, Response, StatusCode}; use serde::Serialize; use serde::Serializer; +use std::collections::HashMap; use std::fmt::Display; use std::sync::Arc; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; -use crate::timeline::GlobalTimelines; +use crate::timeline::{GlobalTimelines, TimelineDeleteForceResult}; use crate::SafeKeeperConf; use utils::{ http::{ endpoint, error::ApiError, json::{json_request, json_response}, - request::parse_request_param, + request::{ensure_no_body, parse_request_param}, RequestExt, RouterBuilder, }, lsn::Lsn, @@ -130,6 +131,44 @@ async fn timeline_create_handler(mut request: Request) -> Result, +) -> Result, ApiError> { + let zttid = ZTenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + ensure_no_body(&mut request).await?; + json_response( + StatusCode::OK, + GlobalTimelines::delete_force(get_conf(&request), &zttid).map_err(ApiError::from_err)?, + ) +} + +/// Deactivates all timelines for the tenant and removes its data directory. +/// See `timeline_delete_force_handler`. +async fn tenant_delete_force_handler( + mut request: Request, +) -> Result, ApiError> { + let tenant_id = parse_request_param(&request, "tenant_id")?; + ensure_no_body(&mut request).await?; + json_response( + StatusCode::OK, + GlobalTimelines::delete_force_all_for_tenant(get_conf(&request), &tenant_id) + .map_err(ApiError::from_err)? + .iter() + .map(|(zttid, resp)| (format!("{}", zttid.timeline_id), *resp)) + .collect::>(), + ) +} + /// Used only in tests to hand craft required data. 
async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { let zttid = ZTenantTimelineId::new( @@ -155,6 +194,11 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder timeline_status_handler, ) .post("/v1/timeline", timeline_create_handler) + .delete( + "/v1/tenant/:tenant_id/timeline/:timeline_id", + timeline_delete_force_handler, + ) + .delete("/v1/tenant/:tenant_id", tenant_delete_force_handler) // for tests .post( "/v1/record_safekeeper_info/:tenant_id/:timeline_id", diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 03236d4e65..09b2e68a49 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -3,7 +3,7 @@ use std::path::PathBuf; use std::time::Duration; use url::Url; -use utils::zid::{ZNodeId, ZTenantTimelineId}; +use utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId}; pub mod broker; pub mod callmemaybe; @@ -57,9 +57,12 @@ pub struct SafeKeeperConf { } impl SafeKeeperConf { + pub fn tenant_dir(&self, tenant_id: &ZTenantId) -> PathBuf { + self.workdir.join(tenant_id.to_string()) + } + pub fn timeline_dir(&self, zttid: &ZTenantTimelineId) -> PathBuf { - self.workdir - .join(zttid.tenant_id.to_string()) + self.tenant_dir(&zttid.tenant_id) .join(zttid.timeline_id.to_string()) } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index c73d6af4ac..84ad53d72d 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -7,6 +7,8 @@ use etcd_broker::SkTimelineInfo; use lazy_static::lazy_static; use postgres_ffi::xlog_utils::XLogSegNo; +use serde::Serialize; + use std::cmp::{max, min}; use std::collections::HashMap; use std::fs::{self}; @@ -19,7 +21,7 @@ use tracing::*; use utils::{ lsn::Lsn, pq_proto::ZenithFeedback, - zid::{ZNodeId, ZTenantTimelineId}, + zid::{ZNodeId, ZTenantId, ZTenantTimelineId}, }; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; @@ -345,6 +347,20 @@ impl Timeline { Ok(false) } + /// Deactivates the timeline, assuming it is being deleted. + /// Returns whether the timeline was already active. + /// + /// The callmemaybe thread is stopped by the deactivation message. We assume all other threads + /// will stop by themselves eventually (possibly with errors, but no panics). There should be no + /// compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but + /// we're deleting the timeline anyway. + pub fn deactivate_for_delete(&self) -> Result { + let mut shared_state = self.mutex.lock().unwrap(); + let was_active = shared_state.active; + shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; + Ok(was_active) + } + fn is_active(&self) -> bool { let shared_state = self.mutex.lock().unwrap(); shared_state.active @@ -515,6 +531,12 @@ lazy_static! { }); } +#[derive(Clone, Copy, Serialize)] +pub struct TimelineDeleteForceResult { + pub dir_existed: bool, + pub was_active: bool, +} + /// A zero-sized struct used to manage access to the global timelines map. 
pub struct GlobalTimelines; @@ -613,4 +635,78 @@ impl GlobalTimelines { .map(|(zttid, _)| *zttid) .collect() } + + fn delete_force_internal( + conf: &SafeKeeperConf, + zttid: &ZTenantTimelineId, + was_active: bool, + ) -> Result { + match std::fs::remove_dir_all(conf.timeline_dir(zttid)) { + Ok(_) => Ok(TimelineDeleteForceResult { + dir_existed: true, + was_active, + }), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(TimelineDeleteForceResult { + dir_existed: false, + was_active, + }), + Err(e) => Err(e.into()), + } + } + + /// Deactivates and deletes the timeline, see `Timeline::deactivate_for_delete()`, the deletes + /// the corresponding data directory. + /// We assume all timeline threads do not care about `GlobalTimelines` not containing the timeline + /// anymore, and they will eventually terminate without panics. + /// + /// There are multiple ways the timeline may be accidentally "re-created" (so we end up with two + /// `Timeline` objects in memory): + /// a) a compute node connects after this method is called, or + /// b) an HTTP GET request about the timeline is made and it's able to restore the current state, or + /// c) an HTTP POST request for timeline creation is made after the timeline is already deleted. + /// TODO: ensure all of the above never happens. + pub fn delete_force( + conf: &SafeKeeperConf, + zttid: &ZTenantTimelineId, + ) -> Result { + info!("deleting timeline {}", zttid); + let was_active = match TIMELINES_STATE.lock().unwrap().timelines.remove(zttid) { + None => false, + Some(tli) => tli.deactivate_for_delete()?, + }; + GlobalTimelines::delete_force_internal(conf, zttid, was_active) + } + + /// Deactivates and deletes all timelines for the tenant, see `delete()`. + /// Returns map of all timelines which the tenant had, `true` if a timeline was active. + pub fn delete_force_all_for_tenant( + conf: &SafeKeeperConf, + tenant_id: &ZTenantId, + ) -> Result> { + info!("deleting all timelines for tenant {}", tenant_id); + let mut state = TIMELINES_STATE.lock().unwrap(); + let mut deleted = HashMap::new(); + for (zttid, tli) in &state.timelines { + if zttid.tenant_id == *tenant_id { + deleted.insert( + *zttid, + GlobalTimelines::delete_force_internal( + conf, + zttid, + tli.deactivate_for_delete()?, + )?, + ); + } + } + // TODO: test that the exact subset of timelines is removed. + state + .timelines + .retain(|zttid, _| !deleted.contains_key(zttid)); + match std::fs::remove_dir_all(conf.tenant_dir(tenant_id)) { + Ok(_) => (), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), + e => e?, + }; + Ok(deleted) + } } diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 702c27a79b..e297f91f2c 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -850,3 +850,116 @@ def test_wal_deleted_after_broadcast(zenith_env_builder: ZenithEnvBuilder): # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) assert wal_size_after_checkpoint < 16 * 2.5 + + +def test_delete_force(zenith_env_builder: ZenithEnvBuilder): + zenith_env_builder.num_safekeepers = 1 + env = zenith_env_builder.init_start() + + # Create two tenants: one will be deleted, other should be preserved. 
+ tenant_id = env.initial_tenant.hex + timeline_id_1 = env.zenith_cli.create_branch('br1').hex # Acive, delete explicitly + timeline_id_2 = env.zenith_cli.create_branch('br2').hex # Inactive, delete explictly + timeline_id_3 = env.zenith_cli.create_branch('br3').hex # Active, delete with the tenant + timeline_id_4 = env.zenith_cli.create_branch('br4').hex # Inactive, delete with the tenant + + tenant_id_other = env.zenith_cli.create_tenant().hex + timeline_id_other = env.zenith_cli.create_root_branch( + 'br-other', tenant_id=uuid.UUID(hex=tenant_id_other)).hex + + # Populate branches + pg_1 = env.postgres.create_start('br1') + pg_2 = env.postgres.create_start('br2') + pg_3 = env.postgres.create_start('br3') + pg_4 = env.postgres.create_start('br4') + pg_other = env.postgres.create_start('br-other', tenant_id=uuid.UUID(hex=tenant_id_other)) + for pg in [pg_1, pg_2, pg_3, pg_4, pg_other]: + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute('CREATE TABLE t(key int primary key)') + sk = env.safekeepers[0] + sk_data_dir = Path(sk.data_dir()) + sk_http = sk.http_client() + assert (sk_data_dir / tenant_id / timeline_id_1).is_dir() + assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() + assert (sk_data_dir / tenant_id / timeline_id_3).is_dir() + assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() + assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + + # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. + pg_2.stop_and_destroy() + pg_4.stop_and_destroy() + sk.stop() + sk.start() + + # Ensure connections to Safekeeper are established + for pg in [pg_1, pg_3, pg_other]: + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute('INSERT INTO t (key) VALUES (1)') + + # Remove initial tenant's br1 (active) + assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == { + "dir_existed": True, + "was_active": True, + } + assert not (sk_data_dir / tenant_id / timeline_id_1).exists() + assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() + assert (sk_data_dir / tenant_id / timeline_id_3).is_dir() + assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() + assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + + # Ensure repeated deletion succeeds + assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == { + "dir_existed": False, "was_active": False + } + assert not (sk_data_dir / tenant_id / timeline_id_1).exists() + assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() + assert (sk_data_dir / tenant_id / timeline_id_3).is_dir() + assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() + assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + + # Remove initial tenant's br2 (inactive) + assert sk_http.timeline_delete_force(tenant_id, timeline_id_2) == { + "dir_existed": True, + "was_active": False, + } + assert not (sk_data_dir / tenant_id / timeline_id_1).exists() + assert not (sk_data_dir / tenant_id / timeline_id_2).exists() + assert (sk_data_dir / tenant_id / timeline_id_3).is_dir() + assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() + assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + + # Remove non-existing branch, should succeed + assert sk_http.timeline_delete_force(tenant_id, '00' * 16) == { + "dir_existed": False, + "was_active": False, + } + assert not (sk_data_dir / tenant_id / timeline_id_1).exists() + assert not (sk_data_dir / tenant_id / timeline_id_2).exists() + assert (sk_data_dir / 
tenant_id / timeline_id_3).exists() + assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() + assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + + # Remove initial tenant fully (two branches are active) + response = sk_http.tenant_delete_force(tenant_id) + assert response == { + timeline_id_3: { + "dir_existed": True, + "was_active": True, + } + } + assert not (sk_data_dir / tenant_id).exists() + assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + + # Remove initial tenant again. + response = sk_http.tenant_delete_force(tenant_id) + assert response == {} + assert not (sk_data_dir / tenant_id).exists() + assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + + # Ensure the other tenant still works + sk_http.timeline_status(tenant_id_other, timeline_id_other) + with closing(pg_other.connect()) as conn: + with conn.cursor() as cur: + cur.execute('INSERT INTO t (key) VALUES (123)') diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index fe20f1abbf..357db4c16d 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1800,6 +1800,21 @@ class SafekeeperHttpClient(requests.Session): json=body) res.raise_for_status() + def timeline_delete_force(self, tenant_id: str, timeline_id: str) -> Dict[Any, Any]: + res = self.delete( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def tenant_delete_force(self, tenant_id: str) -> Dict[Any, Any]: + res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def get_metrics(self) -> SafekeeperMetrics: request_result = self.get(f"http://localhost:{self.port}/metrics") request_result.raise_for_status() From aa7c601eca425d82e616e0fc0468dac8a2a35db2 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 12 May 2022 20:53:40 +0300 Subject: [PATCH 0287/1022] Fix pitr_interval check in GC: Use timestamp->LSN mapping instead of file modification time. Fix 'latest_gc_cutoff_lsn' - set it to the minimum of pitr_cutoff and gc_cutoff. Add new test: test_pitr_gc --- pageserver/src/layered_repository.rs | 76 +++++++++++++++-------- test_runner/batch_others/test_pitr_gc.py | 77 ++++++++++++++++++++++++ test_runner/fixtures/utils.py | 3 +- 3 files changed, 131 insertions(+), 25 deletions(-) create mode 100644 test_runner/batch_others/test_pitr_gc.py diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index b02ab00a21..24f9bcff37 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -74,6 +74,7 @@ pub mod metadata; mod par_fsync; mod storage_layer; +use crate::pgdatadir_mapping::LsnForTimestamp; use delta_layer::{DeltaLayer, DeltaLayerWriter}; use ephemeral_file::is_ephemeral_file; use filename::{DeltaFileName, ImageFileName}; @@ -81,6 +82,7 @@ use image_layer::{ImageLayer, ImageLayerWriter}; use inmemory_layer::InMemoryLayer; use layer_map::LayerMap; use layer_map::SearchResult; +use postgres_ffi::xlog_utils::to_pg_timestamp; use storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; // re-export this function so that page_cache.rs can use it. @@ -2118,11 +2120,49 @@ impl LayeredTimeline { let cutoff = gc_info.cutoff; let pitr = gc_info.pitr; + // Calculate pitr cutoff point. 
+ // By default, we don't want to GC anything. + let mut pitr_cutoff_lsn: Lsn = *self.get_latest_gc_cutoff_lsn(); + + if let Ok(timeline) = + tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) + { + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. + // If we don't have enough data to convert to LSN, + // play safe and don't remove any layers. + if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { + let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); + + match timeline.find_lsn_for_timestamp(pitr_timestamp)? { + LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, + LsnForTimestamp::Future(lsn) => { + debug!("future({})", lsn); + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + } + } + debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) + } + } else { + // We don't have local timeline in mocked cargo tests. + // So, just ignore pitr_interval setting in this case. + pitr_cutoff_lsn = cutoff; + } + + let new_gc_cutoff = Lsn::min(cutoff, pitr_cutoff_lsn); + + // Nothing to GC. Return early. + if *self.get_latest_gc_cutoff_lsn() == new_gc_cutoff { + result.elapsed = now.elapsed()?; + return Ok(result); + } + let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %cutoff).entered(); // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. // See branch_timeline() for details. - *self.latest_gc_cutoff_lsn.write().unwrap() = cutoff; + *self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff; info!("GC starting"); @@ -2162,30 +2202,18 @@ impl LayeredTimeline { result.layers_needed_by_cutoff += 1; continue 'outer; } - // 2. It is newer than PiTR interval? - // We use modification time of layer file to estimate update time. - // This estimation is not quite precise but maintaining LSN->timestamp map seems to be overkill. - // It is not expected that users will need high precision here. And this estimation - // is conservative: modification time of file is always newer than actual time of version - // creation. So it is safe for users. - // TODO A possible "bloat" issue still persists here. - // If modification time changes because of layer upload/download, we will keep these files - // longer than necessary. - // https://github.com/neondatabase/neon/issues/1554 - // - if let Ok(metadata) = fs::metadata(&l.filename()) { - let last_modified = metadata.modified()?; - if now.duration_since(last_modified)? < pitr { - debug!( - "keeping {} because it's modification time {:?} is newer than PITR {:?}", - l.filename().display(), - last_modified, - pitr - ); - result.layers_needed_by_pitr += 1; - continue 'outer; - } + + // 2. It is newer than PiTR cutoff point? + if l.get_lsn_range().end > pitr_cutoff_lsn { + debug!( + "keeping {} because it's newer than pitr_cutoff_lsn {}", + l.filename().display(), + pitr_cutoff_lsn + ); + result.layers_needed_by_pitr += 1; + continue 'outer; } + // 3. Is it needed by a child branch? // NOTE With that wee would keep data that // might be referenced by child branches forever. 
diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/batch_others/test_pitr_gc.py new file mode 100644 index 0000000000..fe9159b4bb --- /dev/null +++ b/test_runner/batch_others/test_pitr_gc.py @@ -0,0 +1,77 @@ +import subprocess +from contextlib import closing + +import psycopg2.extras +import pytest +from fixtures.log_helper import log +from fixtures.utils import print_gc_result +from fixtures.zenith_fixtures import ZenithEnvBuilder + + +# +# Check pitr_interval GC behavior. +# Insert some data, run GC and create a branch in the past. +# +def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder): + + zenith_env_builder.num_safekeepers = 1 + # Set pitr interval such that we need to keep the data + zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '1day', gc_horizon = 0}" + + env = zenith_env_builder.init_start() + pgmain = env.postgres.create_start('main') + log.info("postgres is running on 'main' branch") + + main_pg_conn = pgmain.connect() + main_cur = main_pg_conn.cursor() + + main_cur.execute("SHOW zenith.zenith_timeline") + timeline = main_cur.fetchone()[0] + + # Create table + main_cur.execute('CREATE TABLE foo (t text)') + + for i in range(10000): + main_cur.execute(''' + INSERT INTO foo + SELECT 'long string to consume some space'; + ''') + + if i == 99: + # keep some early lsn to test branch creation after GC + main_cur.execute('SELECT pg_current_wal_insert_lsn(), txid_current()') + res = main_cur.fetchone() + lsn_a = res[0] + xid_a = res[1] + log.info(f'LSN after 100 rows: {lsn_a} xid {xid_a}') + + main_cur.execute('SELECT pg_current_wal_insert_lsn(), txid_current()') + res = main_cur.fetchone() + debug_lsn = res[0] + debug_xid = res[1] + log.info(f'LSN after 10000 rows: {debug_lsn} xid {debug_xid}') + + # run GC + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: + pscur.execute(f"compact {env.initial_tenant.hex} {timeline}") + # perform agressive GC. Data still should be kept because of the PITR setting. 
+ pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") + row = pscur.fetchone() + print_gc_result(row) + + # Branch at the point where only 100 rows were inserted + # It must have been preserved by PITR setting + env.zenith_cli.create_branch('test_pitr_gc_hundred', 'main', ancestor_start_lsn=lsn_a) + + pg_hundred = env.postgres.create_start('test_pitr_gc_hundred') + + # On the 'hundred' branch, we should see only 100 rows + hundred_pg_conn = pg_hundred.connect() + hundred_cur = hundred_pg_conn.cursor() + hundred_cur.execute('SELECT count(*) FROM foo') + assert hundred_cur.fetchone() == (100, ) + + # All the rows are visible on the main branch + main_cur.execute('SELECT count(*) FROM foo') + assert main_cur.fetchone() == (10000, ) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 98af511036..7b95e729d9 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -75,7 +75,8 @@ def lsn_from_hex(lsn_hex: str) -> int: def print_gc_result(row): log.info("GC duration {elapsed} ms".format_map(row)) log.info( - " total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}" + " total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_pitr {layers_needed_by_pitr}" + " needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}" .format_map(row)) From a2561f0a78116fc775732cb36c7df992d4d3a07a Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 13 May 2022 16:01:41 +0300 Subject: [PATCH 0288/1022] Use tenant's pitr_interval instead of hardroded 0 in the command. Adjust python tests that use the --- pageserver/src/layered_repository.rs | 11 ++++++++--- pageserver/src/page_service.rs | 5 +++-- test_runner/batch_others/test_branch_behind.py | 2 ++ test_runner/batch_others/test_gc_aggressive.py | 11 +++++++---- .../batch_others/test_old_request_lsn.py | 17 ++++++++++++----- test_runner/batch_others/test_pitr_gc.py | 2 +- test_runner/performance/test_bulk_insert.py | 1 - test_runner/performance/test_random_writes.py | 1 - 8 files changed, 33 insertions(+), 17 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 24f9bcff37..c7536cc959 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -2121,7 +2121,7 @@ impl LayeredTimeline { let pitr = gc_info.pitr; // Calculate pitr cutoff point. - // By default, we don't want to GC anything. + // If we cannot determine a cutoff LSN, be conservative and don't GC anything. let mut pitr_cutoff_lsn: Lsn = *self.get_latest_gc_cutoff_lsn(); if let Ok(timeline) = @@ -2137,6 +2137,7 @@ impl LayeredTimeline { LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, LsnForTimestamp::Future(lsn) => { debug!("future({})", lsn); + pitr_cutoff_lsn = cutoff; } LsnForTimestamp::Past(lsn) => { debug!("past({})", lsn); @@ -2144,7 +2145,7 @@ impl LayeredTimeline { } debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) } - } else { + } else if cfg!(test) { // We don't have local timeline in mocked cargo tests. // So, just ignore pitr_interval setting in this case. pitr_cutoff_lsn = cutoff; @@ -2153,7 +2154,11 @@ impl LayeredTimeline { let new_gc_cutoff = Lsn::min(cutoff, pitr_cutoff_lsn); // Nothing to GC. Return early. 
- if *self.get_latest_gc_cutoff_lsn() == new_gc_cutoff { + if *self.get_latest_gc_cutoff_lsn() >= new_gc_cutoff { + info!( + "Nothing to GC for timeline {}. cutoff_lsn {}", + self.timeline_id, new_gc_cutoff + ); result.elapsed = now.elapsed()?; return Ok(result); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 88273cfa57..28d6bf2621 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -19,7 +19,6 @@ use std::net::TcpListener; use std::str; use std::str::FromStr; use std::sync::{Arc, RwLockReadGuard}; -use std::time::Duration; use tracing::*; use utils::{ auth::{self, Claims, JwtAuth, Scope}, @@ -796,7 +795,9 @@ impl postgres_backend::Handler for PageServerHandler { .unwrap_or_else(|| Ok(repo.get_gc_horizon()))?; let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - let result = repo.gc_iteration(Some(timelineid), gc_horizon, Duration::ZERO, true)?; + // Use tenant's pitr setting + let pitr = repo.get_pitr_interval(); + let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?; pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"layers_total"), RowDescriptor::int8_col(b"layers_needed_by_cutoff"), diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 4e2be352f4..fc84af5283 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -19,6 +19,8 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): # # See https://github.com/zenithdb/zenith/issues/1068 zenith_env_builder.num_safekeepers = 1 + # Disable pitr, because here we want to test branch creation after GC + zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = zenith_env_builder.init_start() # Branch at the point where only 100 rows were inserted diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py index e4e4aa9f4a..519a6dda1c 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/batch_others/test_gc_aggressive.py @@ -1,7 +1,7 @@ import asyncio import random -from fixtures.zenith_fixtures import ZenithEnv, Postgres +from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres from fixtures.log_helper import log # Test configuration @@ -50,9 +50,12 @@ async def update_and_gc(env: ZenithEnv, pg: Postgres, timeline: str): # # (repro for https://github.com/zenithdb/zenith/issues/1047) # -def test_gc_aggressive(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_gc_aggressive", "empty") +def test_gc_aggressive(zenith_env_builder: ZenithEnvBuilder): + + # Disable pitr, because here we want to test branch creation after GC + zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + env = zenith_env_builder.init_start() + env.zenith_cli.create_branch("test_gc_aggressive", "main") pg = env.postgres.create_start('test_gc_aggressive') log.info('postgres is running on test_gc_aggressive branch') diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py index e7400cff96..cf7fe09b1e 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -1,5 +1,7 @@ -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.zenith_fixtures import ZenithEnvBuilder from fixtures.log_helper 
import log +from fixtures.utils import print_gc_result +import psycopg2.extras # @@ -12,9 +14,11 @@ from fixtures.log_helper import log # just a hint that the page hasn't been modified since that LSN, and the page # server should return the latest page version regardless of the LSN. # -def test_old_request_lsn(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_old_request_lsn", "empty") +def test_old_request_lsn(zenith_env_builder: ZenithEnvBuilder): + # Disable pitr, because here we want to test branch creation after GC + zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + env = zenith_env_builder.init_start() + env.zenith_cli.create_branch("test_old_request_lsn", "main") pg = env.postgres.create_start('test_old_request_lsn') log.info('postgres is running on test_old_request_lsn branch') @@ -26,7 +30,7 @@ def test_old_request_lsn(zenith_simple_env: ZenithEnv): timeline = cur.fetchone()[0] psconn = env.pageserver.connect() - pscur = psconn.cursor() + pscur = psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers. @@ -53,6 +57,9 @@ def test_old_request_lsn(zenith_simple_env: ZenithEnv): # garbage collections so that the page server will remove old page versions. for i in range(10): pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") + row = pscur.fetchone() + print_gc_result(row) + for j in range(100): cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;') diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/batch_others/test_pitr_gc.py index fe9159b4bb..ee19bddfe8 100644 --- a/test_runner/batch_others/test_pitr_gc.py +++ b/test_runner/batch_others/test_pitr_gc.py @@ -16,7 +16,7 @@ def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 # Set pitr interval such that we need to keep the data - zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '1day', gc_horizon = 0}" + zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" env = zenith_env_builder.init_start() pgmain = env.postgres.create_start('main') diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 4e73bedcc0..3b57ac73cc 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -18,7 +18,6 @@ from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare def test_bulk_insert(zenith_with_baseline: PgCompare): env = zenith_with_baseline - # Get the timeline ID of our branch. 
We need it for the 'do_gc' command with closing(env.pg.connect()) as conn: with conn.cursor() as cur: cur.execute("create table huge (i int, j int);") diff --git a/test_runner/performance/test_random_writes.py b/test_runner/performance/test_random_writes.py index ba9eabcd97..205388bd90 100644 --- a/test_runner/performance/test_random_writes.py +++ b/test_runner/performance/test_random_writes.py @@ -8,7 +8,6 @@ from fixtures.log_helper import log import psycopg2.extras import random import time -from fixtures.utils import print_gc_result # This is a clear-box test that demonstrates the worst case scenario for the From 768c846eeb9f90450e06185ce477ed1a566a0f22 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Fri, 13 May 2022 17:06:25 +0300 Subject: [PATCH 0289/1022] Fix test_delete_force from #1653 conflicting with #1692 --- test_runner/batch_others/test_wal_acceptor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index e297f91f2c..67c9d6070e 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -863,16 +863,16 @@ def test_delete_force(zenith_env_builder: ZenithEnvBuilder): timeline_id_3 = env.zenith_cli.create_branch('br3').hex # Active, delete with the tenant timeline_id_4 = env.zenith_cli.create_branch('br4').hex # Inactive, delete with the tenant - tenant_id_other = env.zenith_cli.create_tenant().hex - timeline_id_other = env.zenith_cli.create_root_branch( - 'br-other', tenant_id=uuid.UUID(hex=tenant_id_other)).hex + tenant_id_other_uuid, timeline_id_other_uuid = env.zenith_cli.create_tenant() + tenant_id_other = tenant_id_other_uuid.hex + timeline_id_other = timeline_id_other_uuid.hex # Populate branches pg_1 = env.postgres.create_start('br1') pg_2 = env.postgres.create_start('br2') pg_3 = env.postgres.create_start('br3') pg_4 = env.postgres.create_start('br4') - pg_other = env.postgres.create_start('br-other', tenant_id=uuid.UUID(hex=tenant_id_other)) + pg_other = env.postgres.create_start('main', tenant_id=uuid.UUID(hex=tenant_id_other)) for pg in [pg_1, pg_2, pg_3, pg_4, pg_other]: with closing(pg.connect()) as conn: with conn.cursor() as cur: From cded72a580266d978fee5260be9e0e56abbb42b9 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Fri, 13 May 2022 20:41:54 +0300 Subject: [PATCH 0290/1022] remove sk-2 from staging inventory list (#1699) --- .circleci/ansible/staging.hosts | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index b2bacb89ca..8e89e843d9 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -4,7 +4,6 @@ zenith-us-stage-ps-2 console_region_id=27 [safekeepers] zenith-us-stage-sk-1 console_region_id=27 -zenith-us-stage-sk-2 console_region_id=27 zenith-us-stage-sk-4 console_region_id=27 zenith-us-stage-sk-5 console_region_id=27 From 081d5dac5eba534bac74624e0f935d4c0b28af6b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 13 May 2022 21:41:00 +0300 Subject: [PATCH 0291/1022] Bump vendor/postgres. Includes change to reduce log noise from inmem_smgr. 
--- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index d62ec22eff..1db115cecb 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit d62ec22effeca7b5794ab2c15a3fd9ee5a4a5b99 +Subproject commit 1db115cecb3dbc2a74c5efa964fdf3a8a341c4d2 From a10cac980f703bf5ec50e37a14aac5e6d6261525 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 15 May 2022 00:25:38 +0300 Subject: [PATCH 0292/1022] Continue with pageserver startup, if loading some tenants fail. Fixes https://github.com/neondatabase/neon/issues/1664 --- pageserver/src/tenant_mgr.rs | 83 ++++++++++++------- .../batch_others/test_broken_timeline.py | 80 ++++++++++++++++++ 2 files changed, 135 insertions(+), 28 deletions(-) create mode 100644 test_runner/batch_others/test_broken_timeline.py diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 20a723b5b5..9bde9a5c4a 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -78,6 +78,9 @@ pub enum TenantState { // The local disk might have some newer files that don't exist in cloud storage yet. // The tenant cannot be accessed anymore for any reason, but graceful shutdown. Stopping, + + // Something went wrong loading the tenant state + Broken, } impl fmt::Display for TenantState { @@ -86,6 +89,7 @@ impl fmt::Display for TenantState { TenantState::Active => f.write_str("Active"), TenantState::Idle => f.write_str("Idle"), TenantState::Stopping => f.write_str("Stopping"), + TenantState::Broken => f.write_str("Broken"), } } } @@ -99,7 +103,22 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result { + tenant.state = TenantState::Stopping; + tenantids.push(*tenantid) + } + TenantState::Broken => {} + } } drop(m); @@ -270,6 +294,10 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> { TenantState::Stopping => { // don't re-activate it if it's being stopped } + + TenantState::Broken => { + // cannot activate + } } Ok(()) } @@ -370,38 +398,37 @@ pub fn list_tenants() -> Vec { .collect() } -fn init_local_repositories( +fn init_local_repository( conf: &'static PageServerConf, - local_timeline_init_statuses: HashMap>, + tenant_id: ZTenantId, + local_timeline_init_statuses: HashMap, remote_index: &RemoteIndex, ) -> anyhow::Result<(), anyhow::Error> { - for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses { - // initialize local tenant - let repo = load_local_repo(conf, tenant_id, remote_index) - .with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?; + // initialize local tenant + let repo = load_local_repo(conf, tenant_id, remote_index) + .with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?; - let mut status_updates = HashMap::with_capacity(local_timeline_init_statuses.len()); - for (timeline_id, init_status) in local_timeline_init_statuses { - match init_status { - LocalTimelineInitStatus::LocallyComplete => { - debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository"); - status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded); - } - LocalTimelineInitStatus::NeedsSync => { - debug!( - "timeline {tenant_id} for tenant {timeline_id} needs sync, \ - so skipped for adding into repository until sync is finished" - ); - } + let mut status_updates = HashMap::with_capacity(local_timeline_init_statuses.len()); + for (timeline_id, init_status) in local_timeline_init_statuses { + match init_status { 
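The core of this change is error isolation at startup: each tenant is loaded independently, and a failure marks only that tenant `Broken` instead of aborting the whole pageserver. A minimal sketch of that pattern, using hypothetical `load_tenant`/`Tenant` types rather than the real `tenant_mgr` structures:

```rust
use std::collections::HashMap;

#[derive(Debug, PartialEq)]
enum TenantState {
    Active,
    Broken,
}

struct Tenant {
    state: TenantState,
}

// Hypothetical loader standing in for the per-tenant repository
// initialization, which can fail e.g. on corrupt local metadata.
fn load_tenant(tenant_id: u32) -> Result<Tenant, String> {
    if tenant_id == 2 {
        Err("corrupt metadata".to_string())
    } else {
        Ok(Tenant { state: TenantState::Active })
    }
}

fn init_tenants(tenant_ids: &[u32]) -> HashMap<u32, Tenant> {
    let mut tenants = HashMap::new();
    for &id in tenant_ids {
        match load_tenant(id) {
            Ok(tenant) => {
                tenants.insert(id, tenant);
            }
            Err(err) => {
                // Log and keep going: one broken tenant must not take the
                // whole pageserver down at startup.
                eprintln!("failed to load tenant {id}: {err}");
                tenants.insert(id, Tenant { state: TenantState::Broken });
            }
        }
    }
    tenants
}

fn main() {
    let tenants = init_tenants(&[1, 2, 3]);
    assert_eq!(tenants[&2].state, TenantState::Broken);
    assert_eq!(tenants[&1].state, TenantState::Active);
}
```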
+ LocalTimelineInitStatus::LocallyComplete => { + debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository"); + status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded); + } + LocalTimelineInitStatus::NeedsSync => { + debug!( + "timeline {tenant_id} for tenant {timeline_id} needs sync, \ + so skipped for adding into repository until sync is finished" + ); } } - - // Lets fail here loudly to be on the safe side. - // XXX: It may be a better api to actually distinguish between repository startup - // and processing of newly downloaded timelines. - apply_timeline_remote_sync_status_updates(&repo, status_updates) - .with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))? } + + // Lets fail here loudly to be on the safe side. + // XXX: It may be a better api to actually distinguish between repository startup + // and processing of newly downloaded timelines. + apply_timeline_remote_sync_status_updates(&repo, status_updates) + .with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))?; Ok(()) } diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/batch_others/test_broken_timeline.py new file mode 100644 index 0000000000..17eadb33b4 --- /dev/null +++ b/test_runner/batch_others/test_broken_timeline.py @@ -0,0 +1,80 @@ +import pytest +from contextlib import closing +from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.log_helper import log +import os + + +# Test restarting page server, while safekeeper and compute node keep +# running. +def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder): + # One safekeeper is enough for this test. + zenith_env_builder.num_safekeepers = 3 + env = zenith_env_builder.init_start() + + tenant_timelines = [] + + for n in range(4): + tenant_id_uuid, timeline_id_uuid = env.zenith_cli.create_tenant() + tenant_id = tenant_id_uuid.hex + timeline_id = timeline_id_uuid.hex + + pg = env.postgres.create_start(f'main', tenant_id=tenant_id_uuid) + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'") + + cur.execute("SHOW zenith.zenith_timeline") + timeline_id = cur.fetchone()[0] + pg.stop() + tenant_timelines.append((tenant_id, timeline_id, pg)) + + # Stop the pageserver + env.pageserver.stop() + + # Leave the first timeline alone, but corrupt the others in different ways + (tenant0, timeline0, pg0) = tenant_timelines[0] + + # Corrupt metadata file on timeline 1 + (tenant1, timeline1, pg1) = tenant_timelines[1] + metadata_path = "{}/tenants/{}/timelines/{}/metadata".format(env.repo_dir, tenant1, timeline1) + print(f'overwriting metadata file at {metadata_path}') + f = open(metadata_path, "w") + f.write("overwritten with garbage!") + f.close() + + # Missing layer files file on timeline 2. (This would actually work + # if we had Cloud Storage enabled in this test.) + (tenant2, timeline2, pg2) = tenant_timelines[2] + timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant2, timeline2) + for filename in os.listdir(timeline_path): + if filename.startswith('00000'): + # Looks like a layer file. 
Remove it + os.remove(f'{timeline_path}/{filename}') + + # Corrupt layer files file on timeline 3 + (tenant3, timeline3, pg3) = tenant_timelines[3] + timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant3, timeline3) + for filename in os.listdir(timeline_path): + if filename.startswith('00000'): + # Looks like a layer file. Corrupt it + f = open(f'{timeline_path}/{filename}', "w") + f.write("overwritten with garbage!") + f.close() + + env.pageserver.start() + + # Tenant 0 should still work + pg0.start() + with closing(pg0.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SELECT COUNT(*) FROM t") + assert cur.fetchone()[0] == 100 + + # But all others are broken + for n in range(1, 4): + (tenant, timeline, pg) = tenant_timelines[n] + with pytest.raises(Exception, match="Cannot load local timeline") as err: + pg.start() + log.info(f'compute startup failed as expected: {err}') From 51ea9c3053c9ab5d2be837c2eeb0dd149b038229 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 16 May 2022 09:58:58 +0300 Subject: [PATCH 0293/1022] Don't swallow panics when the pageserver is build with failpoints. It's very confusing, and because you don't get a stack trace and error message in the logs, makes debugging very hard. However, the 'test_pageserver_recovery' test relied on that behavior. To support that, add a new "exit" action to the pageserver 'failpoints' command, so that you can explicitly request to exit the process when a failpoint is hit. --- pageserver/src/bin/pageserver.rs | 7 +------ pageserver/src/page_service.rs | 13 ++++++++++++- test_runner/batch_others/test_recovery.py | 4 ++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 190e38e341..c6cb460f8f 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -183,13 +183,8 @@ fn main() -> anyhow::Result<()> { // as a ref. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - // If failpoints are used, terminate the whole pageserver process if they are hit. 
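The `failpoints` command takes a semicolon-separated list of `name=action` pairs; this commit teaches the handler one extra action, `exit`, implemented with a callback rather than passed through to the failpoints crate. Below is a compilable sketch of that dispatch, using the same `fail::cfg` / `fail::cfg_callback` calls as the handler change further down (an illustration for a build with the `failpoints` feature, not the exact pageserver handler):

```rust
use anyhow::bail;

/// Configure failpoints from a command string such as
/// "checkpoint-before-sync=sleep(2000);checkpoint-after-sync=exit".
fn configure_failpoints(spec: &str) -> anyhow::Result<()> {
    for failpoint in spec.split(';') {
        if let Some((name, action)) = failpoint.split_once('=') {
            if action == "exit" {
                // "exit" is not a native failpoints action: install a callback
                // that terminates the process when the failpoint is hit, so
                // crash-recovery tests still get their abrupt shutdown.
                fail::cfg_callback(name, || {
                    eprintln!("exit requested by failpoint");
                    std::process::exit(1);
                })
                .map_err(|e| anyhow::anyhow!(e))?;
            } else {
                // Everything else (off, return, sleep(ms), panic, ...) is
                // understood by the fail crate directly.
                fail::cfg(name, action).map_err(|e| anyhow::anyhow!(e))?;
            }
        } else {
            bail!("invalid failpoints format: {failpoint}");
        }
    }
    Ok(())
}
```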
+ // Initialize up failpoints support let scenario = FailScenario::setup(); - if fail::has_failpoints() { - std::panic::set_hook(Box::new(|_| { - std::process::exit(1); - })); - } // Basic initialization of things that don't change after startup virtual_file::init(conf.max_file_descriptors); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 28d6bf2621..03264c9782 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -730,7 +730,18 @@ impl postgres_backend::Handler for PageServerHandler { for failpoint in failpoints.split(';') { if let Some((name, actions)) = failpoint.split_once('=') { info!("cfg failpoint: {} {}", name, actions); - fail::cfg(name, actions).unwrap(); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + if actions == "exit" { + fail::cfg_callback(name, || { + info!("Exit requested by failpoint"); + std::process::exit(1); + }) + .unwrap(); + } else { + fail::cfg(name, actions).unwrap(); + } } else { bail!("Invalid failpoints format"); } diff --git a/test_runner/batch_others/test_recovery.py b/test_runner/batch_others/test_recovery.py index dbfa943a7a..eb1747efa5 100644 --- a/test_runner/batch_others/test_recovery.py +++ b/test_runner/batch_others/test_recovery.py @@ -45,14 +45,14 @@ def test_pageserver_recovery(zenith_env_builder: ZenithEnvBuilder): # Configure failpoints pscur.execute( - "failpoints checkpoint-before-sync=sleep(2000);checkpoint-after-sync=panic") + "failpoints checkpoint-before-sync=sleep(2000);checkpoint-after-sync=exit") # Do some updates until pageserver is crashed try: while True: cur.execute("update foo set x=x+1") except Exception as err: - log.info(f"Excepted server crash {err}") + log.info(f"Expected server crash {err}") log.info("Wait before server restart") env.pageserver.stop() From 33cac863d74acb2bafc2f51cf364bf26b2d4d8c4 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 13 May 2022 17:04:51 +0300 Subject: [PATCH 0294/1022] Test simple.conf and handle broker_endpoints better --- control_plane/src/local_env.rs | 102 +++++++++++++++++------- control_plane/src/safekeeper.rs | 23 ++++-- libs/remote_storage/src/lib.rs | 3 +- neon_local/src/main.rs | 6 +- test_runner/fixtures/zenith_fixtures.py | 2 +- 5 files changed, 99 insertions(+), 37 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 5aeff505b6..35167ebabf 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -4,6 +4,7 @@ //! script which will use local paths. use anyhow::{bail, ensure, Context}; +use reqwest::Url; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use std::collections::HashMap; @@ -59,9 +60,10 @@ pub struct LocalEnv { #[serde(default)] pub private_key_path: PathBuf, - // A comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'. + // Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'. #[serde(default)] - pub broker_endpoints: Option, + #[serde_as(as = "Vec")] + pub broker_endpoints: Vec, /// A prefix to all to any key when pushing/polling etcd from a node. 
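Turning `broker_endpoints` from a comma-separated string into a typed `Vec<Url>` leans on `serde_with`'s `DisplayFromStr`, which parses each TOML array element with `Url::from_str` and serializes it back via `Display`, so malformed endpoints are rejected at config-parse time. A standalone illustration of the pattern with a hypothetical config struct (assumes the `serde`, `serde_with`, `toml`, `url` and `anyhow` crates already used in the workspace):

```rust
use serde::Deserialize;
use serde_with::{serde_as, DisplayFromStr};
use url::Url;

#[serde_as]
#[derive(Debug, Deserialize)]
struct BrokerConfig {
    // Each array element is parsed with Url::from_str, so an invalid URL
    // fails the whole config parse instead of being passed along as a string.
    #[serde_as(as = "Vec<DisplayFromStr>")]
    #[serde(default)]
    broker_endpoints: Vec<Url>,
}

fn main() -> anyhow::Result<()> {
    let cfg: BrokerConfig =
        toml::from_str("broker_endpoints = ['http://127.0.0.1:2379', 'http://127.0.0.1:2380']")?;
    assert_eq!(cfg.broker_endpoints.len(), 2);
    // Url normalizes the endpoint (note the trailing slash).
    assert_eq!(cfg.broker_endpoints[0].as_str(), "http://127.0.0.1:2379/");

    // A malformed endpoint is a parse error, not a latent runtime failure.
    assert!(toml::from_str::<BrokerConfig>("broker_endpoints = ['!@$XOXO%^&']").is_err());
    Ok(())
}
```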
#[serde(default)] @@ -184,12 +186,7 @@ impl LocalEnv { if old_timeline_id == &timeline_id { Ok(()) } else { - bail!( - "branch '{}' is already mapped to timeline {}, cannot map to another timeline {}", - branch_name, - old_timeline_id, - timeline_id - ); + bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}"); } } else { existing_values.push((tenant_id, timeline_id)); @@ -225,7 +222,7 @@ impl LocalEnv { /// /// Unlike 'load_config', this function fills in any defaults that are missing /// from the config file. - pub fn create_config(toml: &str) -> anyhow::Result { + pub fn parse_config(toml: &str) -> anyhow::Result { let mut env: LocalEnv = toml::from_str(toml)?; // Find postgres binaries. @@ -238,25 +235,20 @@ impl LocalEnv { env.pg_distrib_dir = cwd.join("tmp_install") } } - if !env.pg_distrib_dir.join("bin/postgres").exists() { - bail!( - "Can't find postgres binary at {}", - env.pg_distrib_dir.display() - ); - } // Find zenith binaries. if env.zenith_distrib_dir == Path::new("") { - env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); - } - for binary in ["pageserver", "safekeeper"] { - if !env.zenith_distrib_dir.join(binary).exists() { - bail!( - "Can't find binary '{}' in zenith distrib dir '{}'", - binary, - env.zenith_distrib_dir.display() - ); - } + let current_exec_path = + env::current_exe().context("Failed to find current excecutable's path")?; + env.zenith_distrib_dir = current_exec_path + .parent() + .with_context(|| { + format!( + "Failed to find a parent directory for executable {}", + current_exec_path.display(), + ) + })? + .to_owned(); } // If no initial tenant ID was given, generate it. @@ -351,6 +343,20 @@ impl LocalEnv { "directory '{}' already exists. 
Perhaps already initialized?", base_path.display() ); + for binary in ["pageserver", "safekeeper"] { + if !self.zenith_distrib_dir.join(binary).exists() { + bail!( + "Can't find binary '{binary}' in zenith distrib dir '{}'", + self.zenith_distrib_dir.display() + ); + } + } + if !self.pg_distrib_dir.join("bin/postgres").exists() { + bail!( + "Can't find postgres binary at {}", + self.pg_distrib_dir.display() + ); + } fs::create_dir(&base_path)?; @@ -408,7 +414,49 @@ impl LocalEnv { fn base_path() -> PathBuf { match std::env::var_os("ZENITH_REPO_DIR") { - Some(val) => PathBuf::from(val.to_str().unwrap()), - None => ".zenith".into(), + Some(val) => PathBuf::from(val), + None => PathBuf::from(".zenith"), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn simple_conf_parsing() { + let simple_conf_toml = include_str!("../simple.conf"); + let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml); + assert!( + simple_conf_parse_result.is_ok(), + "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}" + ); + + let regular_url_string = "broker_endpoints = ['localhost:1111']"; + let regular_url_toml = simple_conf_toml.replace( + "[pageserver]", + &format!("\n{regular_url_string}\n[pageserver]"), + ); + match LocalEnv::parse_config(®ular_url_toml) { + Ok(regular_url_parsed) => { + assert_eq!( + regular_url_parsed.broker_endpoints, + vec!["localhost:1111".parse().unwrap()], + "Unexpectedly parsed broker endpoint url" + ); + } + Err(e) => panic!("failed to parse simple config {regular_url_toml}, reason: {e}"), + } + + let spoiled_url_string = "broker_endpoints = ['!@$XOXO%^&']"; + let spoiled_url_toml = simple_conf_toml.replace( + "[pageserver]", + &format!("\n{spoiled_url_string}\n[pageserver]"), + ); + let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml); + assert!( + spoiled_url_parse_result.is_err(), + "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}" + ); } } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 074ee72f69..aeeb4a50ec 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -12,7 +12,7 @@ use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; use postgres::Config; use reqwest::blocking::{Client, RequestBuilder, Response}; -use reqwest::{IntoUrl, Method}; +use reqwest::{IntoUrl, Method, Url}; use safekeeper::http::models::TimelineCreateRequest; use thiserror::Error; use utils::{ @@ -52,7 +52,7 @@ impl ResponseErrorMessageExt for Response { Err(SafekeeperHttpError::Response( match self.json::() { Ok(err_body) => format!("Error: {}", err_body.msg), - Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), + Err(_) => format!("Http error ({}) at {url}.", status.as_u16()), }, )) } @@ -76,7 +76,7 @@ pub struct SafekeeperNode { pub pageserver: Arc, - broker_endpoints: Option, + broker_endpoints: Vec, broker_etcd_prefix: Option, } @@ -142,8 +142,21 @@ impl SafekeeperNode { if !self.conf.sync { cmd.arg("--no-sync"); } - if let Some(ref ep) = self.broker_endpoints { - cmd.args(&["--broker-endpoints", ep]); + + if !self.broker_endpoints.is_empty() { + cmd.args(&[ + "--broker-endpoints", + &self.broker_endpoints.iter().map(Url::as_str).fold( + String::new(), + |mut comma_separated_urls, url| { + if !comma_separated_urls.is_empty() { + comma_separated_urls.push(','); + } + comma_separated_urls.push_str(url); + comma_separated_urls + }, + ), + ]); } if let Some(prefix) = 
self.broker_etcd_prefix.as_deref() { cmd.args(&["--broker-etcd-prefix", prefix]); diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 9bbb855dd5..8092e4fc49 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -87,7 +87,8 @@ pub trait RemoteStorage: Send + Sync { async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>; } -/// TODO kb +/// Every storage, currently supported. +/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. pub enum GenericRemoteStorage { Local(LocalFs), S3(S3Bucket), diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index 6538cdefc4..e5ac46d3b1 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -275,7 +275,7 @@ fn main() -> Result<()> { "pageserver" => handle_pageserver(sub_args, &env), "pg" => handle_pg(sub_args, &env), "safekeeper" => handle_safekeeper(sub_args, &env), - _ => bail!("unexpected subcommand {}", sub_name), + _ => bail!("unexpected subcommand {sub_name}"), }; if original_env != env { @@ -289,7 +289,7 @@ fn main() -> Result<()> { Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?, Ok(None) => (), Err(e) => { - eprintln!("command failed: {:?}", e); + eprintln!("command failed: {e:?}"); exit(1); } } @@ -482,7 +482,7 @@ fn handle_init(init_match: &ArgMatches) -> Result { }; let mut env = - LocalEnv::create_config(&toml_file).context("Failed to create neon configuration")?; + LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; env.init().context("Failed to initialize neon repository")?; // default_tenantid was generated by the `env.init()` call above diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 357db4c16d..50b7ef6dbb 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -558,7 +558,7 @@ class ZenithEnv: port=self.port_distributor.get_port(), peer_port=self.port_distributor.get_port()) toml += textwrap.dedent(f""" - broker_endpoints = 'http://127.0.0.1:{self.broker.port}' + broker_endpoints = ['http://127.0.0.1:{self.broker.port}'] """) # Create config for pageserver From c700032dd2735bfb7c8053be40fc8ffa34a575df Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 16 May 2022 14:40:49 +0300 Subject: [PATCH 0295/1022] Run the regression tests in CI also for PRs opened from forked repos. --- .github/workflows/testing.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 6d109b9bb5..79b2ba05d0 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -1,6 +1,8 @@ name: Build and Test -on: push +on: + pull_request: + push: jobs: regression-check: From c41549f630fa7adbe360f78be9c8f94952cfe4eb Mon Sep 17 00:00:00 2001 From: chaitanya sharma <86035+phoenix24@users.noreply.github.com> Date: Mon, 16 May 2022 20:12:08 +0530 Subject: [PATCH 0296/1022] Update readme build for osx (#1709) --- README.md | 61 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index af384d2672..39cbd2a222 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,8 @@ Pageserver consists of: ## Running local installation + +#### building on Ubuntu/ Debian (Linux) 1. 
Install build dependencies and other useful packages On Ubuntu or Debian this set of packages should be sufficient to build the code: @@ -31,21 +33,60 @@ apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libsec libssl-dev clang pkg-config libpq-dev ``` -[Rust] 1.58 or later is also required. +2. [Install Rust](https://www.rust-lang.org/tools/install) +``` +# recommended approach from https://www.rust-lang.org/tools/install +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +``` -To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively. +3. Install PostgreSQL Client +``` +apt install postgresql-client +``` -To run the integration tests or Python scripts (not required to use the code), install -Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory. - -2. Build neon and patched postgres +4. Build neon and patched postgres ```sh git clone --recursive https://github.com/neondatabase/neon.git cd neon make -j5 ``` -3. Start pageserver and postgres on top of it (should be called from repo root): + +#### building on OSX (12.3.1) +1. Install XCode +``` +xcode-select --install +``` + +2. [Install Rust](https://www.rust-lang.org/tools/install) +``` +# recommended approach from https://www.rust-lang.org/tools/install +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +``` + +3. Install PostgreSQL Client +``` +# from https://stackoverflow.com/questions/44654216/correct-way-to-install-psql-without-full-postgres-on-macos +brew install libpq +brew link --force libpq +``` + +4. Build neon and patched postgres +```sh +git clone --recursive https://github.com/neondatabase/neon.git +cd neon +make -j5 +``` + +#### dependency installation notes +To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively. + +To run the integration tests or Python scripts (not required to use the code), install +Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory. + + +#### running neon database +1. Start pageserver and postgres on top of it (should be called from repo root): ```sh # Create repository in .zenith with proper paths to binaries and data # Later that would be responsibility of a package install script @@ -75,7 +116,7 @@ Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=po main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running ``` -4. Now it is possible to connect to postgres and run some queries: +2. Now it is possible to connect to postgres and run some queries: ```text > psql -p55432 -h 127.0.0.1 -U zenith_admin postgres postgres=# CREATE TABLE t(key int primary key, value text); @@ -89,7 +130,7 @@ postgres=# select * from t; (1 row) ``` -5. And create branches and run postgres on them: +3. And create branches and run postgres on them: ```sh # create branch named migration_check > ./target/debug/neon_local timeline branch --branch-name migration_check @@ -133,7 +174,7 @@ postgres=# select * from t; (1 row) ``` -6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances +4. 
If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances you have just started. You can stop them all with one command: ```sh > ./target/debug/neon_local stop From e4a70faa08a480caa648a533c9ca579db8709fad Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Mon, 16 May 2022 11:05:43 -0400 Subject: [PATCH 0297/1022] Add more information to timeline-related APIs (#1673) Resolves #1488. - implemented `GET tenant/:tenant_id/timeline/:timeline_id/wal_receiver` endpoint - returned `thread_id` in `thread_mgr::spawn` - added `latest_gc_cutoff_lsn` field to `LocalTimelineInfo` struct --- pageserver/src/http/openapi_spec.yml | 62 ++++++++++++++++ pageserver/src/http/routes.rs | 28 ++++++++ pageserver/src/tenant_mgr.rs | 1 + pageserver/src/thread_mgr.rs | 4 +- pageserver/src/timelines.rs | 4 ++ pageserver/src/walreceiver.rs | 72 +++++++++++++++---- .../batch_others/test_pageserver_api.py | 41 ++++++++++- test_runner/fixtures/zenith_fixtures.py | 9 +++ 8 files changed, 204 insertions(+), 17 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 9932a2d08d..55f7b3c5a7 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -123,6 +123,53 @@ paths: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + get: + description: Get wal receiver's data attached to the timeline + responses: + "200": + description: WalReceiverEntry + content: + application/json: + schema: + $ref: "#/components/schemas/WalReceiverEntry" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "404": + description: Error when no wal receiver is running or found + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline/{timeline_id}/attach: parameters: @@ -520,6 +567,21 @@ components: type: integer current_logical_size_non_incremental: type: integer + WalReceiverEntry: + type: object + required: + - thread_id + - wal_producer_connstr + properties: + thread_id: + type: integer + wal_producer_connstr: + type: string + last_received_msg_lsn: + type: string + format: hex + last_received_msg_ts: + type: integer Error: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0104df826e..bb650a34ed 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -224,6 +224,30 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result, ApiError> { + let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + + let wal_receiver = tokio::task::spawn_blocking(move || { + let _enter = + info_span!("wal_receiver_get", tenant = %tenant_id, timeline = %timeline_id).entered(); + + crate::walreceiver::get_wal_receiver_entry(tenant_id, timeline_id) + }) + 
.await + .map_err(ApiError::from_err)? + .ok_or_else(|| { + ApiError::NotFound(format!( + "WAL receiver not found for tenant {} and timeline {}", + tenant_id, timeline_id + )) + })?; + + json_response(StatusCode::OK, wal_receiver) +} + async fn timeline_attach_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -485,6 +509,10 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler, ) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver", + wal_receiver_get_handler, + ) .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/attach", timeline_attach_handler, diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 9bde9a5c4a..bbe66d7f80 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -281,6 +281,7 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> { false, move || crate::tenant_threads::gc_loop(tenant_id), ) + .map(|_thread_id| ()) // update the `Result::Ok` type to match the outer function's return signature .with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}")); if let Err(e) = &gc_spawn_result { diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index b908f220ee..473cddda58 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -139,7 +139,7 @@ pub fn spawn( name: &str, shutdown_process_on_error: bool, f: F, -) -> std::io::Result<()> +) -> std::io::Result where F: FnOnce() -> anyhow::Result<()> + Send + 'static, { @@ -193,7 +193,7 @@ where drop(jh_guard); // The thread is now running. Nothing more to do here - Ok(()) + Ok(thread_id) } /// This wrapper function runs in a newly-spawned thread. 
It initializes the diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 7cfd33c40b..eadf5bf4e0 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -45,6 +45,8 @@ pub struct LocalTimelineInfo { #[serde_as(as = "Option")] pub prev_record_lsn: Option, #[serde_as(as = "DisplayFromStr")] + pub latest_gc_cutoff_lsn: Lsn, + #[serde_as(as = "DisplayFromStr")] pub disk_consistent_lsn: Lsn, pub current_logical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, @@ -68,6 +70,7 @@ impl LocalTimelineInfo { disk_consistent_lsn: datadir_tline.tline.get_disk_consistent_lsn(), last_record_lsn, prev_record_lsn: Some(datadir_tline.tline.get_prev_record_lsn()), + latest_gc_cutoff_lsn: *datadir_tline.tline.get_latest_gc_cutoff_lsn(), timeline_state: LocalTimelineState::Loaded, current_logical_size: Some(datadir_tline.get_current_logical_size()), current_logical_size_non_incremental: if include_non_incremental_logical_size { @@ -91,6 +94,7 @@ impl LocalTimelineInfo { disk_consistent_lsn: metadata.disk_consistent_lsn(), last_record_lsn: metadata.disk_consistent_lsn(), prev_record_lsn: metadata.prev_record_lsn(), + latest_gc_cutoff_lsn: metadata.latest_gc_cutoff_lsn(), timeline_state: LocalTimelineState::Unloaded, current_logical_size: None, current_logical_size_non_incremental: None, diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index b7a33364c9..b8f349af8f 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -18,6 +18,8 @@ use lazy_static::lazy_static; use postgres_ffi::waldecoder::*; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use std::cell::Cell; use std::collections::HashMap; use std::str::FromStr; @@ -35,11 +37,19 @@ use utils::{ zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; -// -// We keep one WAL Receiver active per timeline. -// -struct WalReceiverEntry { +/// +/// A WAL receiver's data stored inside the global `WAL_RECEIVERS`. +/// We keep one WAL receiver active per timeline. +/// +#[serde_as] +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct WalReceiverEntry { + thread_id: u64, wal_producer_connstr: String, + #[serde_as(as = "Option")] + last_received_msg_lsn: Option, + /// the timestamp (in microseconds) of the last received message + last_received_msg_ts: Option, } lazy_static! 
{ @@ -74,7 +84,7 @@ pub fn launch_wal_receiver( receiver.wal_producer_connstr = wal_producer_connstr.into(); } None => { - thread_mgr::spawn( + let thread_id = thread_mgr::spawn( ThreadKind::WalReceiver, Some(tenantid), Some(timelineid), @@ -88,7 +98,10 @@ pub fn launch_wal_receiver( )?; let receiver = WalReceiverEntry { + thread_id, wal_producer_connstr: wal_producer_connstr.into(), + last_received_msg_lsn: None, + last_received_msg_ts: None, }; receivers.insert((tenantid, timelineid), receiver); @@ -99,15 +112,13 @@ pub fn launch_wal_receiver( Ok(()) } -// Look up current WAL producer connection string in the hash table -fn get_wal_producer_connstr(tenantid: ZTenantId, timelineid: ZTimelineId) -> String { +/// Look up a WAL receiver's data in the global `WAL_RECEIVERS` +pub fn get_wal_receiver_entry( + tenant_id: ZTenantId, + timeline_id: ZTimelineId, +) -> Option { let receivers = WAL_RECEIVERS.lock().unwrap(); - - receivers - .get(&(tenantid, timelineid)) - .unwrap() - .wal_producer_connstr - .clone() + receivers.get(&(tenant_id, timeline_id)).cloned() } // @@ -118,7 +129,18 @@ fn thread_main(conf: &'static PageServerConf, tenant_id: ZTenantId, timeline_id: info!("WAL receiver thread started"); // Look up the current WAL producer address - let wal_producer_connstr = get_wal_producer_connstr(tenant_id, timeline_id); + let wal_producer_connstr = { + match get_wal_receiver_entry(tenant_id, timeline_id) { + Some(e) => e.wal_producer_connstr, + None => { + info!( + "Unable to create the WAL receiver thread: no WAL receiver entry found for tenant {} and timeline {}", + tenant_id, timeline_id + ); + return; + } + } + }; // Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server, // and start streaming WAL from it. @@ -318,6 +340,28 @@ fn walreceiver_main( let apply_lsn = u64::from(timeline_remote_consistent_lsn); let ts = SystemTime::now(); + // Update the current WAL receiver's data stored inside the global hash table `WAL_RECEIVERS` + { + let mut receivers = WAL_RECEIVERS.lock().unwrap(); + let entry = match receivers.get_mut(&(tenant_id, timeline_id)) { + Some(e) => e, + None => { + anyhow::bail!( + "no WAL receiver entry found for tenant {} and timeline {}", + tenant_id, + timeline_id + ); + } + }; + + entry.last_received_msg_lsn = Some(last_lsn); + entry.last_received_msg_ts = Some( + ts.duration_since(SystemTime::UNIX_EPOCH) + .expect("Received message time should be before UNIX EPOCH!") + .as_micros(), + ); + } + // Send zenith feedback message. // Regular standby_status_update fields are put into this message. 
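The fields recorded here are what the new `GET /v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver` endpoint reports. A rough sketch of how a client might poll it, assuming the pageserver HTTP API listens on port 9898 and using placeholder tenant/timeline IDs (reqwest with the blocking and json features):

```rust
use anyhow::Context;

fn main() -> anyhow::Result<()> {
    // Placeholders: adjust host/port and IDs for the pageserver under test.
    let base = "http://127.0.0.1:9898";
    let tenant_id = "0123456789abcdef0123456789abcdef";
    let timeline_id = "fedcba9876543210fedcba9876543210";

    let url = format!("{base}/v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver");
    let resp = reqwest::blocking::get(url).context("wal_receiver request failed")?;

    // 404 means no WAL receiver is currently registered for this timeline,
    // e.g. because no compute node is connected.
    if resp.status() == reqwest::StatusCode::NOT_FOUND {
        println!("no active WAL receiver");
        return Ok(());
    }

    let entry: serde_json::Value = resp.json().context("invalid wal_receiver response")?;
    println!(
        "wal producer: {}, last received LSN: {}",
        entry["wal_producer_connstr"], entry["last_received_msg_lsn"]
    );
    Ok(())
}
```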
let zenith_status_update = ZenithFeedback { diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 13f6ef358e..7fe3b4dff5 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,6 +1,12 @@ from uuid import uuid4, UUID import pytest -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient +from fixtures.zenith_fixtures import ( + DEFAULT_BRANCH_NAME, + ZenithEnv, + ZenithEnvBuilder, + ZenithPageserverHttpClient, + ZenithPageserverApiException, +) # test that we cannot override node id @@ -48,6 +54,39 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): assert local_timeline_details['timeline_state'] == 'Loaded' +def test_pageserver_http_get_wal_receiver_not_found(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + client = env.pageserver.http_client() + + tenant_id, timeline_id = env.zenith_cli.create_tenant() + + # no PG compute node is running, so no WAL receiver is running + with pytest.raises(ZenithPageserverApiException) as e: + _ = client.wal_receiver_get(tenant_id, timeline_id) + assert "Not Found" in str(e.value) + + +def test_pageserver_http_get_wal_receiver_success(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + client = env.pageserver.http_client() + + tenant_id, timeline_id = env.zenith_cli.create_tenant() + pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) + + res = client.wal_receiver_get(tenant_id, timeline_id) + assert list(res.keys()) == [ + "thread_id", + "wal_producer_connstr", + "last_received_msg_lsn", + "last_received_msg_ts", + ] + + # make a DB modification then expect getting a new WAL receiver's data + pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + res2 = client.wal_receiver_get(tenant_id, timeline_id) + assert res2["last_received_msg_lsn"] > res["last_received_msg_lsn"] + + def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): env = zenith_simple_env client = env.pageserver.http_client() diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 50b7ef6dbb..14eae60248 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -786,6 +786,15 @@ class ZenithPageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json + def wal_receiver_get(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/wal_receiver" + ) + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) From 85b5c0e98921a0a254021a55c5186aa1ca18813b Mon Sep 17 00:00:00 2001 From: chaitanya sharma <86035+phoenix24@users.noreply.github.com> Date: Fri, 13 May 2022 20:14:20 +0000 Subject: [PATCH 0298/1022] List profiling as a feature with 'pageserver --enabled-features' Fixes https://github.com/neondatabase/neon/issues/1627 --- pageserver/src/bin/pageserver.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index c6cb460f8f..4cc1dcbc5a 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -98,6 +98,8 @@ fn main() -> anyhow::Result<()> { let features: &[&str] = 
&[ #[cfg(feature = "failpoints")] "failpoints", + #[cfg(feature = "profiling")] + "profiling", ]; println!("{{\"features\": {features:?} }}"); return Ok(()); From bea84150b2be74db6c2cfc4107de3b582c86c352 Mon Sep 17 00:00:00 2001 From: chaitanya sharma <86035+phoenix24@users.noreply.github.com> Date: Sun, 15 May 2022 04:17:28 +0530 Subject: [PATCH 0299/1022] Fix the markdown rendering on 004-durability.md RFC --- docs/rfcs/004-durability.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/rfcs/004-durability.md b/docs/rfcs/004-durability.md index 4543be3dae..d4716156d1 100644 --- a/docs/rfcs/004-durability.md +++ b/docs/rfcs/004-durability.md @@ -22,7 +22,7 @@ In addition to the WAL safekeeper nodes, the WAL is archived in S3. WAL that has been archived to S3 can be removed from the safekeepers, so the safekeepers don't need a lot of disk space. - +``` +----------------+ +-----> | WAL safekeeper | | +----------------+ @@ -42,23 +42,23 @@ safekeepers, so the safekeepers don't need a lot of disk space. \ \ \ - \ +--------+ - \ | | - +--> | S3 | - | | - +--------+ - + \ +--------+ + \ | | + +------> | S3 | + | | + +--------+ +``` Every WAL safekeeper holds a section of WAL, and a VCL value. The WAL can be divided into three portions: - +``` VCL LSN | | V V .................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX Archived WAL Completed WAL In-flight WAL - +``` Note that all this WAL kept in a safekeeper is a contiguous section. This is different from Aurora: In Aurora, there can be holes in the From 9a0fed0880dd1d1f482763b8de7c3a2c219fcf43 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 3 May 2022 14:11:29 +0300 Subject: [PATCH 0300/1022] Enable at least 1 safekeeper in every test --- .circleci/ansible/systemd/pageserver.service | 2 +- control_plane/src/local_env.rs | 3 + control_plane/src/safekeeper.rs | 1 + control_plane/src/storage.rs | 11 +++ docker-entrypoint.sh | 6 +- pageserver/src/config.rs | 95 ++++++++++++++++--- safekeeper/src/bin/safekeeper.rs | 28 +++--- safekeeper/src/broker.rs | 5 +- safekeeper/src/lib.rs | 4 +- .../batch_others/test_ancestor_branch.py | 7 -- test_runner/batch_others/test_backpressure.py | 1 - test_runner/batch_others/test_next_xid.py | 2 - .../batch_others/test_pageserver_restart.py | 2 - .../batch_others/test_remote_storage.py | 1 - .../batch_others/test_tenant_relocation.py | 10 +- .../batch_others/test_timeline_size.py | 1 - test_runner/batch_others/test_wal_acceptor.py | 13 ++- test_runner/batch_others/test_wal_restore.py | 1 - test_runner/batch_others/test_zenith_cli.py | 4 - test_runner/fixtures/zenith_fixtures.py | 44 +++++---- 20 files changed, 161 insertions(+), 80 deletions(-) diff --git a/.circleci/ansible/systemd/pageserver.service b/.circleci/ansible/systemd/pageserver.service index d346643e58..54a7b1ba0a 100644 --- a/.circleci/ansible/systemd/pageserver.service +++ b/.circleci/ansible/systemd/pageserver.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=pageserver Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib -ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /storage/pageserver/data +ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data ExecReload=/bin/kill -HUP 
$MAINPID KillMode=mixed KillSignal=SIGINT diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 35167ebabf..a8636f9073 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -97,6 +97,7 @@ pub struct PageServerConf { // jwt auth token used for communication with pageserver pub auth_token: String, + pub broker_endpoints: Vec, } impl Default for PageServerConf { @@ -107,6 +108,7 @@ impl Default for PageServerConf { listen_http_addr: String::new(), auth_type: AuthType::Trust, auth_token: String::new(), + broker_endpoints: Vec::new(), } } } @@ -401,6 +403,7 @@ impl LocalEnv { self.pageserver.auth_token = self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; + self.pageserver.broker_endpoints = self.broker_endpoints.clone(); fs::create_dir_all(self.pg_data_dirs_path())?; diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index aeeb4a50ec..c5b7f830bf 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -137,6 +137,7 @@ impl SafekeeperNode { .args(&["--listen-pg", &listen_pg]) .args(&["--listen-http", &listen_http]) .args(&["--recall", "1 second"]) + .args(&["--broker-endpoints", &self.broker_endpoints.join(",")]) .arg("--daemonize"), ); if !self.conf.sync { diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index d2e63a22de..0b9fddd64a 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -121,6 +121,16 @@ impl PageServerNode { ); let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr); + let broker_endpoints_param = format!( + "broker_endpoints=[{}]", + self.env + .pageserver + .broker_endpoints + .iter() + .map(|url| format!("'{url}'")) + .collect::>() + .join(",") + ); let mut args = Vec::with_capacity(20); args.push("--init"); @@ -129,6 +139,7 @@ impl PageServerNode { args.extend(["-c", &authg_type_param]); args.extend(["-c", &listen_http_addr_param]); args.extend(["-c", &listen_pg_addr_param]); + args.extend(["-c", &broker_endpoints_param]); args.extend(["-c", &id]); for config_override in config_overrides { diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 93bb5f9cd7..0e4cf45f29 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -7,7 +7,11 @@ if [ "$1" = 'pageserver' ]; then pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" fi echo "Staring pageserver at 0.0.0.0:6400" - pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data + if [ -z '${BROKER_ENDPOINTS}' ]; then + pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data + else + pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['${BROKER_ENDPOINTS}']" -D /data + fi else "$@" fi diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 5257732c5c..8748683f32 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -13,6 +13,7 @@ use std::str::FromStr; use std::time::Duration; use toml_edit; use toml_edit::{Document, Item}; +use url::Url; use utils::{ postgres_backend::AuthType, zid::{ZNodeId, ZTenantId, ZTimelineId}, @@ -111,6 +112,9 @@ pub struct PageServerConf { pub profiling: ProfilingConfig, pub default_tenant_conf: TenantConf, + + /// Etcd broker endpoints to connect to. 
+ pub broker_endpoints: Vec, } #[derive(Debug, Clone, PartialEq, Eq)] @@ -175,6 +179,7 @@ struct PageServerConfigBuilder { id: BuilderValue, profiling: BuilderValue, + broker_endpoints: BuilderValue>, } impl Default for PageServerConfigBuilder { @@ -200,6 +205,7 @@ impl Default for PageServerConfigBuilder { remote_storage_config: Set(None), id: NotSet, profiling: Set(ProfilingConfig::Disabled), + broker_endpoints: NotSet, } } } @@ -256,6 +262,10 @@ impl PageServerConfigBuilder { self.remote_storage_config = BuilderValue::Set(remote_storage_config) } + pub fn broker_endpoints(&mut self, broker_endpoints: Vec) { + self.broker_endpoints = BuilderValue::Set(broker_endpoints) + } + pub fn id(&mut self, node_id: ZNodeId) { self.id = BuilderValue::Set(node_id) } @@ -264,7 +274,15 @@ impl PageServerConfigBuilder { self.profiling = BuilderValue::Set(profiling) } - pub fn build(self) -> Result { + pub fn build(self) -> anyhow::Result { + let broker_endpoints = self + .broker_endpoints + .ok_or(anyhow!("No broker endpoints provided"))?; + ensure!( + !broker_endpoints.is_empty(), + "Empty broker endpoints collection provided" + ); + Ok(PageServerConf { listen_pg_addr: self .listen_pg_addr @@ -300,6 +318,7 @@ impl PageServerConfigBuilder { profiling: self.profiling.ok_or(anyhow!("missing profiling"))?, // TenantConf is handled separately default_tenant_conf: TenantConf::default(), + broker_endpoints, }) } } @@ -341,7 +360,7 @@ impl PageServerConf { /// validating the input and failing on errors. /// /// This leaves any options not present in the file in the built-in defaults. - pub fn parse_and_validate(toml: &Document, workdir: &Path) -> Result { + pub fn parse_and_validate(toml: &Document, workdir: &Path) -> anyhow::Result { let mut builder = PageServerConfigBuilder::default(); builder.workdir(workdir.to_owned()); @@ -373,6 +392,16 @@ impl PageServerConf { } "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)), "profiling" => builder.profiling(parse_toml_from_str(key, item)?), + "broker_endpoints" => builder.broker_endpoints( + parse_toml_array(key, item)? + .into_iter() + .map(|endpoint_str| { + endpoint_str.parse::().with_context(|| { + format!("Array item {endpoint_str} for key {key} is not a valid url endpoint") + }) + }) + .collect::>()?, + ), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -526,6 +555,7 @@ impl PageServerConf { remote_storage_config: None, profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::dummy_conf(), + broker_endpoints: Vec::new(), } } } @@ -576,14 +606,36 @@ fn parse_toml_duration(name: &str, item: &Item) -> Result { Ok(humantime::parse_duration(s)?) 
} -fn parse_toml_from_str(name: &str, item: &Item) -> Result +fn parse_toml_from_str(name: &str, item: &Item) -> anyhow::Result where - T: FromStr, + T: FromStr, + ::Err: std::fmt::Display, { let v = item .as_str() .with_context(|| format!("configure option {name} is not a string"))?; - T::from_str(v) + T::from_str(v).map_err(|e| { + anyhow!( + "Failed to parse string as {parse_type} for configure option {name}: {e}", + parse_type = stringify!(T) + ) + }) +} + +fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result> { + let array = item + .as_array() + .with_context(|| format!("configure option {name} is not an array"))?; + + array + .iter() + .map(|value| { + value + .as_str() + .map(str::to_string) + .with_context(|| format!("Array item {value:?} for key {name} is not a string")) + }) + .collect() } #[cfg(test)] @@ -616,12 +668,16 @@ id = 10 fn parse_defaults() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - // we have to create dummy pathes to overcome the validation errors - let config_string = format!("pg_distrib_dir='{}'\nid=10", pg_distrib_dir.display()); + let broker_endpoint = "http://127.0.0.1:7777"; + // we have to create dummy values to overcome the validation errors + let config_string = format!( + "pg_distrib_dir='{}'\nid=10\nbroker_endpoints = ['{broker_endpoint}']", + pg_distrib_dir.display() + ); let toml = config_string.parse()?; let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}")); + .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( parsed_config, @@ -641,6 +697,9 @@ id = 10 remote_storage_config: None, profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), + broker_endpoints: vec![broker_endpoint + .parse() + .expect("Failed to parse a valid broker endpoint URL")], }, "Correct defaults should be used when no config values are provided" ); @@ -652,15 +711,16 @@ id = 10 fn parse_basic_config() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; + let broker_endpoint = "http://127.0.0.1:7777"; let config_string = format!( - "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'", + "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoints = ['{broker_endpoint}']", pg_distrib_dir.display() ); let toml = config_string.parse()?; let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}")); + .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( parsed_config, @@ -680,6 +740,9 @@ id = 10 remote_storage_config: None, profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), + broker_endpoints: vec![broker_endpoint + .parse() + .expect("Failed to parse a valid broker endpoint URL")], }, "Should be able to parse all basic config values correctly" ); @@ -691,6 +754,7 @@ id = 10 fn parse_remote_fs_storage_config() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; + let broker_endpoint = "http://127.0.0.1:7777"; let local_storage_path = tempdir.path().join("local_remote_storage"); @@ -710,6 +774,7 @@ local_path = '{}'"#, let config_string = format!( r#"{ALL_BASE_VALUES_TOML} pg_distrib_dir='{}' +broker_endpoints = ['{broker_endpoint}'] {remote_storage_config_str}"#, 
pg_distrib_dir.display(), @@ -718,7 +783,9 @@ pg_distrib_dir='{}' let toml = config_string.parse()?; let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}")) + .unwrap_or_else(|e| { + panic!("Failed to parse config '{config_string}', reason: {e:?}") + }) .remote_storage_config .expect("Should have remote storage config for the local FS"); @@ -751,6 +818,7 @@ pg_distrib_dir='{}' let max_concurrent_syncs = NonZeroUsize::new(111).unwrap(); let max_sync_errors = NonZeroU32::new(222).unwrap(); let s3_concurrency_limit = NonZeroUsize::new(333).unwrap(); + let broker_endpoint = "http://127.0.0.1:7777"; let identical_toml_declarations = &[ format!( @@ -773,6 +841,7 @@ concurrency_limit = {s3_concurrency_limit}"# let config_string = format!( r#"{ALL_BASE_VALUES_TOML} pg_distrib_dir='{}' +broker_endpoints = ['{broker_endpoint}'] {remote_storage_config_str}"#, pg_distrib_dir.display(), @@ -781,7 +850,9 @@ pg_distrib_dir='{}' let toml = config_string.parse()?; let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}")) + .unwrap_or_else(|e| { + panic!("Failed to parse config '{config_string}', reason: {e:?}") + }) .remote_storage_config .expect("Should have remote storage config for S3"); diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 6955d2aa5c..d7875a9069 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -1,7 +1,7 @@ // // Main entry point for the safekeeper executable // -use anyhow::{bail, Context, Result}; +use anyhow::{bail, ensure, Context, Result}; use clap::{App, Arg}; use const_format::formatcp; use daemonize::Daemonize; @@ -31,7 +31,7 @@ const LOCK_FILE_NAME: &str = "safekeeper.lock"; const ID_FILE_NAME: &str = "safekeeper.id"; project_git_version!(GIT_VERSION); -fn main() -> Result<()> { +fn main() -> anyhow::Result<()> { metrics::set_common_metrics_prefix("safekeeper"); let arg_matches = App::new("Zenith safekeeper") .about("Store WAL stream to local file system and push it to WAL receivers") @@ -177,8 +177,12 @@ fn main() -> Result<()> { if let Some(addr) = arg_matches.value_of("broker-endpoints") { let collected_ep: Result, ParseError> = addr.split(',').map(Url::parse).collect(); - conf.broker_endpoints = Some(collected_ep?); + conf.broker_endpoints = collected_ep.context("Failed to parse broker endpoint urls")?; } + ensure!( + !conf.broker_endpoints.is_empty(), + "No broker endpoints provided" + ); if let Some(prefix) = arg_matches.value_of("broker-etcd-prefix") { conf.broker_etcd_prefix = prefix.to_string(); } @@ -309,16 +313,14 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b .unwrap(); threads.push(callmemaybe_thread); - if conf.broker_endpoints.is_some() { - let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("broker thread".into()) - .spawn(|| { - broker::thread_main(conf_); - })?, - ); - } + let conf_ = conf.clone(); + threads.push( + thread::Builder::new() + .name("broker thread".into()) + .spawn(|| { + broker::thread_main(conf_); + })?, + ); let conf_ = conf.clone(); threads.push( diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index d9c60c9db0..c906bc1e74 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -46,7 +46,7 @@ fn timeline_safekeeper_path( /// Push once in a while data about all 
active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { - let mut client = Client::connect(&conf.broker_endpoints.as_ref().unwrap(), None).await?; + let mut client = Client::connect(&conf.broker_endpoints, None).await?; // Get and maintain lease to automatically delete obsolete data let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; @@ -91,7 +91,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { /// Subscribe and fetch all the interesting data from the broker. async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { - let mut client = Client::connect(&conf.broker_endpoints.as_ref().unwrap(), None).await?; + let mut client = Client::connect(&conf.broker_endpoints, None).await?; let mut subscription = etcd_broker::subscribe_to_safekeeper_timeline_updates( &mut client, @@ -99,7 +99,6 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { ) .await .context("failed to subscribe for safekeeper info")?; - loop { match subscription.fetch_data().await { Some(new_info) => { diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 09b2e68a49..131076fab6 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -51,7 +51,7 @@ pub struct SafeKeeperConf { pub ttl: Option, pub recall_period: Duration, pub my_id: ZNodeId, - pub broker_endpoints: Option>, + pub broker_endpoints: Vec, pub broker_etcd_prefix: String, pub s3_offload_enabled: bool, } @@ -81,7 +81,7 @@ impl Default for SafeKeeperConf { ttl: None, recall_period: defaults::DEFAULT_RECALL_PERIOD, my_id: ZNodeId(0), - broker_endpoints: None, + broker_endpoints: Vec::new(), broker_etcd_prefix: defaults::DEFAULT_NEON_BROKER_PREFIX.to_string(), s3_offload_enabled: true, } diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index c07b9d6dd1..5dbd6d2e26 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -10,13 +10,6 @@ from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserv # Create ancestor branches off the main branch. # def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): - - # Use safekeeper in this test to avoid a subtle race condition. - # Without safekeeper, walreceiver reconnection can stuck - # because of IO deadlock. - # - # See https://github.com/zenithdb/zenith/issues/1068 - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() # Override defaults, 1M gc_horizon and 4M checkpoint_distance. 
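The `push_loop`/`pull_loop` changes above reduce to a single connect pattern once `broker_endpoints` is a plain `Vec<Url>`. A minimal standalone sketch, assuming the `etcd-client` style `Client::connect` API that these loops call (`connect_to_broker` itself is not part of the patch):

```rust
// Sketch only: dial the configured etcd endpoints and attach context on failure.
use anyhow::{Context, Result};
use etcd_client::Client;
use url::Url;

async fn connect_to_broker(broker_endpoints: &[Url]) -> Result<Client> {
    let endpoints: Vec<&str> = broker_endpoints.iter().map(Url::as_str).collect();
    Client::connect(endpoints, None)
        .await
        .context("failed to connect to the etcd broker")
}
```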
diff --git a/test_runner/batch_others/test_backpressure.py b/test_runner/batch_others/test_backpressure.py index 6658b337ec..81f45b749b 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/batch_others/test_backpressure.py @@ -94,7 +94,6 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv @pytest.mark.skip("See https://github.com/neondatabase/neon/issues/1587") def test_backpressure_received_lsn_lag(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() # Create a branch for us env.zenith_cli.create_branch('test_backpressure') diff --git a/test_runner/batch_others/test_next_xid.py b/test_runner/batch_others/test_next_xid.py index 03c27bcd70..1ab1addad3 100644 --- a/test_runner/batch_others/test_next_xid.py +++ b/test_runner/batch_others/test_next_xid.py @@ -6,8 +6,6 @@ from fixtures.zenith_fixtures import ZenithEnvBuilder # Test restarting page server, while safekeeper and compute node keep # running. def test_next_xid(zenith_env_builder: ZenithEnvBuilder): - # One safekeeper is enough for this test. - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() pg = env.postgres.create_start('main') diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/batch_others/test_pageserver_restart.py index 20e6f4467e..69f5ea85ce 100644 --- a/test_runner/batch_others/test_pageserver_restart.py +++ b/test_runner/batch_others/test_pageserver_restart.py @@ -5,8 +5,6 @@ from fixtures.log_helper import log # Test restarting page server, while safekeeper and compute node keep # running. def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder): - # One safekeeper is enough for this test. - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() env.zenith_cli.create_branch('test_pageserver_restart') diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index e205f79957..3c7bd08996 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -32,7 +32,6 @@ import pytest @pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3']) def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, storage_type: str): # zenith_env_builder.rust_log_override = 'debug' - zenith_env_builder.num_safekeepers = 1 if storage_type == 'local_fs': zenith_env_builder.enable_local_fs_remote_storage() elif storage_type == 'mock_s3': diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 279b3a0a25..85a91b9ce1 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -8,7 +8,7 @@ from fixtures.log_helper import log import signal import pytest -from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, ZenithPageserverHttpClient, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload, zenith_binpath, pg_distrib_dir +from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, Etcd, ZenithPageserverHttpClient, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload, zenith_binpath, pg_distrib_dir from fixtures.utils import lsn_from_hex @@ -21,7 +21,8 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, pageserver_bin: pathlib.Path, remote_storage_mock_path: pathlib.Path, pg_port: 
int, - http_port: int): + http_port: int, + broker: Etcd): """ cannot use ZenithPageserver yet because it depends on zenith cli which currently lacks support for multiple pageservers @@ -36,6 +37,7 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, f"-c pg_distrib_dir='{pg_distrib_dir}'", f"-c id=2", f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", + f"-c broker_endpoints=['{broker.client_url()}']", ] subprocess.check_output(cmd, text=True) @@ -103,7 +105,6 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, port_distributor: PortDistributor, with_load: str): - zenith_env_builder.num_safekeepers = 1 zenith_env_builder.enable_local_fs_remote_storage() env = zenith_env_builder.init_start() @@ -180,7 +181,8 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, pageserver_bin, remote_storage_mock_path, new_pageserver_pg_port, - new_pageserver_http_port): + new_pageserver_http_port, + zenith_env_builder.broker): # call to attach timeline to new pageserver new_pageserver_http.timeline_attach(tenant, timeline) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index db33493d61..0b33b56df3 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -70,7 +70,6 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() new_timeline_id = env.zenith_cli.create_branch('test_timeline_size_quota') diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 67c9d6070e..85798156a7 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -12,7 +12,7 @@ from contextlib import closing from dataclasses import dataclass, field from multiprocessing import Process, Value from pathlib import Path -from fixtures.zenith_fixtures import PgBin, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol +from fixtures.zenith_fixtures import PgBin, Etcd, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol from fixtures.utils import etcd_path, get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex from fixtures.log_helper import log from typing import List, Optional, Any @@ -22,7 +22,6 @@ from typing import List, Optional, Any # succeed and data is written def test_normal_work(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 - zenith_env_builder.broker = True env = zenith_env_builder.init_start() env.zenith_cli.create_branch('test_safekeepers_normal_work') @@ -331,7 +330,6 @@ def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value): @pytest.mark.skipif(etcd_path() is None, reason="requires etcd which is not present in PATH") def test_broker(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 - zenith_env_builder.broker = True zenith_env_builder.enable_local_fs_remote_storage() env = zenith_env_builder.init_start() @@ -374,7 +372,6 @@ def test_broker(zenith_env_builder: ZenithEnvBuilder): @pytest.mark.skipif(etcd_path() is None, reason="requires etcd which is not present in PATH") def 
test_wal_removal(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 2 - zenith_env_builder.broker = True # to advance remote_consistent_llsn zenith_env_builder.enable_local_fs_remote_storage() env = zenith_env_builder.init_start() @@ -557,8 +554,6 @@ def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): - - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() env.zenith_cli.create_branch('test_timeline_status') @@ -599,6 +594,9 @@ class SafekeeperEnv: num_safekeepers: int = 1): self.repo_dir = repo_dir self.port_distributor = port_distributor + self.broker = Etcd(datadir=os.path.join(self.repo_dir, "etcd"), + port=self.port_distributor.get_port(), + peer_port=self.port_distributor.get_port()) self.pg_bin = pg_bin self.num_safekeepers = num_safekeepers self.bin_safekeeper = os.path.join(str(zenith_binpath), 'safekeeper') @@ -645,6 +643,8 @@ class SafekeeperEnv: safekeeper_dir, "--id", str(i), + "--broker-endpoints", + self.broker.client_url(), "--daemonize" ] @@ -698,7 +698,6 @@ def test_safekeeper_without_pageserver(test_output_dir: str, repo_dir, port_distributor, pg_bin, - num_safekeepers=1, ) with env: diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index b0f34f4aae..f4aceac5e8 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -15,7 +15,6 @@ def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, pg_bin: PgBin, test_output_dir, port_distributor: PortDistributor): - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() env.zenith_cli.create_branch("test_wal_restore") pg = env.postgres.create_start('test_wal_restore') diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_zenith_cli.py index bff17fa679..103d51aae5 100644 --- a/test_runner/batch_others/test_zenith_cli.py +++ b/test_runner/batch_others/test_zenith_cli.py @@ -94,8 +94,6 @@ def test_cli_tenant_create(zenith_simple_env: ZenithEnv): def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): - # Start with single sk - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() # Connect to sk port on v4 loopback @@ -111,8 +109,6 @@ def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): def test_cli_start_stop(zenith_env_builder: ZenithEnvBuilder): - # Start with single sk - zenith_env_builder.num_safekeepers = 1 env = zenith_env_builder.init_start() # Stop default ps/sk diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 14eae60248..09f7f26588 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -412,11 +412,10 @@ class ZenithEnvBuilder: port_distributor: PortDistributor, pageserver_remote_storage: Optional[RemoteStorage] = None, pageserver_config_override: Optional[str] = None, - num_safekeepers: int = 0, + num_safekeepers: int = 1, pageserver_auth_enabled: bool = False, rust_log_override: Optional[str] = None, - default_branch_name=DEFAULT_BRANCH_NAME, - broker: bool = False): + default_branch_name=DEFAULT_BRANCH_NAME): self.repo_dir = repo_dir self.rust_log_override = rust_log_override self.port_distributor = port_distributor @@ -425,7 +424,10 @@ class ZenithEnvBuilder: self.num_safekeepers = num_safekeepers self.pageserver_auth_enabled = pageserver_auth_enabled self.default_branch_name = 
default_branch_name - self.broker = broker + # keep etcd datadir inside 'repo' + self.broker = Etcd(datadir=os.path.join(self.repo_dir, "etcd"), + port=self.port_distributor.get_port(), + peer_port=self.port_distributor.get_port()) self.env: Optional[ZenithEnv] = None self.s3_mock_server: Optional[MockS3Server] = None @@ -551,14 +553,9 @@ class ZenithEnv: default_tenant_id = '{self.initial_tenant.hex}' """) - self.broker = None - if config.broker: - # keep etcd datadir inside 'repo' - self.broker = Etcd(datadir=os.path.join(self.repo_dir, "etcd"), - port=self.port_distributor.get_port(), - peer_port=self.port_distributor.get_port()) - toml += textwrap.dedent(f""" - broker_endpoints = ['http://127.0.0.1:{self.broker.port}'] + self.broker = config.broker + toml += textwrap.dedent(f""" + broker_endpoints = ['{self.broker.client_url()}'] """) # Create config for pageserver @@ -1851,24 +1848,29 @@ class Etcd: peer_port: int handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon + def client_url(self): + return f'http://127.0.0.1:{self.port}' + def check_status(self): s = requests.Session() s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry - s.get(f"http://localhost:{self.port}/health").raise_for_status() + s.get(f"{self.client_url()}/health").raise_for_status() def start(self): pathlib.Path(self.datadir).mkdir(exist_ok=True) etcd_full_path = etcd_path() if etcd_full_path is None: - raise Exception('etcd not found') + raise Exception('etcd binary not found locally') + client_url = self.client_url() + log.info(f'Starting etcd to listen incoming connections at "{client_url}"') with open(os.path.join(self.datadir, "etcd.log"), "wb") as log_file: args = [ etcd_full_path, f"--data-dir={self.datadir}", - f"--listen-client-urls=http://localhost:{self.port}", - f"--advertise-client-urls=http://localhost:{self.port}", - f"--listen-peer-urls=http://localhost:{self.peer_port}" + f"--listen-client-urls={client_url}", + f"--advertise-client-urls={client_url}", + f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}" ] self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file) @@ -1920,7 +1922,13 @@ def test_output_dir(request: Any) -> str: return test_dir -SKIP_DIRS = frozenset(('pg_wal', 'pg_stat', 'pg_stat_tmp', 'pg_subtrans', 'pg_logical')) +SKIP_DIRS = frozenset(('pg_wal', + 'pg_stat', + 'pg_stat_tmp', + 'pg_subtrans', + 'pg_logical', + 'pg_replslot/wal_proposer_slot', + 'pg_xact')) SKIP_FILES = frozenset(('pg_internal.init', 'pg.log', From a884f4cf6bcfae751166ad0f0b5dd6b99a67cba8 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sun, 8 May 2022 00:32:57 +0300 Subject: [PATCH 0301/1022] Add etcd to neon_local --- Cargo.lock | 1 + control_plane/simple.conf | 3 + control_plane/src/etcd.rs | 93 +++++++++++++ control_plane/src/lib.rs | 1 + control_plane/src/local_env.rs | 122 ++++++++++++------ control_plane/src/safekeeper.rs | 31 ++--- control_plane/src/storage.rs | 12 +- docker-entrypoint.sh | 15 ++- docs/settings.md | 17 ++- libs/etcd_broker/src/lib.rs | 30 +++-- neon_local/src/main.rs | 64 +++++---- pageserver/Cargo.toml | 1 + pageserver/src/config.rs | 25 +++- safekeeper/src/bin/safekeeper.rs | 26 ++-- safekeeper/src/broker.rs | 4 +- safekeeper/src/lib.rs | 3 +- test_runner/batch_others/test_wal_acceptor.py | 4 +- test_runner/fixtures/utils.py | 12 +- test_runner/fixtures/zenith_fixtures.py | 14 +- 19 files changed, 331 insertions(+), 147 deletions(-) create mode 100644 control_plane/src/etcd.rs diff --git a/Cargo.lock b/Cargo.lock index 
e1e1a0f067..a3974f6776 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1772,6 +1772,7 @@ dependencies = [ "crc32c", "crossbeam-utils", "daemonize", + "etcd_broker", "fail", "futures", "git-version", diff --git a/control_plane/simple.conf b/control_plane/simple.conf index 2243a0a5f8..925e2f14ee 100644 --- a/control_plane/simple.conf +++ b/control_plane/simple.conf @@ -9,3 +9,6 @@ auth_type = 'Trust' id = 1 pg_port = 5454 http_port = 7676 + +[etcd_broker] +broker_endpoints = ['http://127.0.0.1:2379'] diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs new file mode 100644 index 0000000000..df657dd1be --- /dev/null +++ b/control_plane/src/etcd.rs @@ -0,0 +1,93 @@ +use std::{ + fs, + path::PathBuf, + process::{Command, Stdio}, +}; + +use anyhow::Context; +use nix::{ + sys::signal::{kill, Signal}, + unistd::Pid, +}; + +use crate::{local_env, read_pidfile}; + +pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { + let etcd_broker = &env.etcd_broker; + println!( + "Starting etcd broker using {}", + etcd_broker.etcd_binary_path.display() + ); + + let etcd_data_dir = env.base_data_dir.join("etcd"); + fs::create_dir_all(&etcd_data_dir).with_context(|| { + format!( + "Failed to create etcd data dir: {}", + etcd_data_dir.display() + ) + })?; + + let etcd_stdout_file = + fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| { + format!( + "Failed to create ectd stout file in directory {}", + etcd_data_dir.display() + ) + })?; + let etcd_stderr_file = + fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| { + format!( + "Failed to create ectd stderr file in directory {}", + etcd_data_dir.display() + ) + })?; + let client_urls = etcd_broker.comma_separated_endpoints(); + + let etcd_process = Command::new(&etcd_broker.etcd_binary_path) + .args(&[ + format!("--data-dir={}", etcd_data_dir.display()), + format!("--listen-client-urls={client_urls}"), + format!("--advertise-client-urls={client_urls}"), + ]) + .stdout(Stdio::from(etcd_stdout_file)) + .stderr(Stdio::from(etcd_stderr_file)) + .spawn() + .context("Failed to spawn etcd subprocess")?; + let pid = etcd_process.id(); + + let etcd_pid_file_path = etcd_pid_file_path(env); + fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| { + format!( + "Failed to create etcd pid file at {}", + etcd_pid_file_path.display() + ) + })?; + + Ok(()) +} + +pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { + let etcd_path = &env.etcd_broker.etcd_binary_path; + println!("Stopping etcd broker at {}", etcd_path.display()); + + let etcd_pid_file_path = etcd_pid_file_path(env); + let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| { + format!( + "Failed to read etcd pid filea at {}", + etcd_pid_file_path.display() + ) + })?); + + kill(pid, Signal::SIGTERM).with_context(|| { + format!( + "Failed to stop etcd with pid {pid} at {}", + etcd_pid_file_path.display() + ) + })?; + + Ok(()) +} + +fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf { + env.base_data_dir.join("etcd.pid") +} diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index a2ecdd3d64..c3469c3350 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -12,6 +12,7 @@ use std::path::Path; use std::process::Command; pub mod compute; +pub mod etcd; pub mod local_env; pub mod postgresql_conf; pub mod safekeeper; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index a8636f9073..c73af7d338 100644 --- 
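Stripped of the etcd-specific flags, `start_etcd_process`/`stop_etcd_process` above follow a small spawn-and-pidfile pattern. A simplified standalone sketch (binary and paths are placeholders, not the actual control_plane API):

```rust
// Sketch: spawn a long-running child with its output redirected to log files
// and record its pid so a later stop step can send it a signal.
use std::{
    fs,
    path::Path,
    process::{Command, Stdio},
};

use anyhow::{Context, Result};

fn spawn_with_logs(binary: &Path, data_dir: &Path) -> Result<u32> {
    fs::create_dir_all(data_dir).context("Failed to create the data dir")?;
    let stdout_file =
        fs::File::create(data_dir.join("stdout.log")).context("Failed to create stdout log")?;
    let stderr_file =
        fs::File::create(data_dir.join("stderr.log")).context("Failed to create stderr log")?;

    let child = Command::new(binary)
        .arg(format!("--data-dir={}", data_dir.display()))
        .stdout(Stdio::from(stdout_file))
        .stderr(Stdio::from(stderr_file))
        .spawn()
        .context("Failed to spawn the subprocess")?;

    let pid = child.id();
    fs::write(data_dir.join("pid"), pid.to_string()).context("Failed to write the pid file")?;
    Ok(pid)
}
```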
a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -60,14 +60,7 @@ pub struct LocalEnv { #[serde(default)] pub private_key_path: PathBuf, - // Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'. - #[serde(default)] - #[serde_as(as = "Vec")] - pub broker_endpoints: Vec, - - /// A prefix to all to any key when pushing/polling etcd from a node. - #[serde(default)] - pub broker_etcd_prefix: Option, + pub etcd_broker: EtcdBroker, pub pageserver: PageServerConf, @@ -83,6 +76,62 @@ pub struct LocalEnv { branch_name_mappings: HashMap>, } +/// Etcd broker config for cluster internal communication. +#[serde_as] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +pub struct EtcdBroker { + /// A prefix to all to any key when pushing/polling etcd from a node. + #[serde(default)] + pub broker_etcd_prefix: Option, + + /// Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'. + #[serde(default)] + #[serde_as(as = "Vec")] + pub broker_endpoints: Vec, + + /// Etcd binary path to use. + #[serde(default)] + pub etcd_binary_path: PathBuf, +} + +impl EtcdBroker { + pub fn locate_etcd() -> anyhow::Result { + let which_output = Command::new("which") + .arg("etcd") + .output() + .context("Failed to run 'which etcd' command")?; + let stdout = String::from_utf8_lossy(&which_output.stdout); + ensure!( + which_output.status.success(), + "'which etcd' invocation failed. Status: {}, stdout: {stdout}, stderr: {}", + which_output.status, + String::from_utf8_lossy(&which_output.stderr) + ); + + let etcd_path = PathBuf::from(stdout.trim()); + ensure!( + etcd_path.is_file(), + "'which etcd' invocation was successful, but the path it returned is not a file or does not exist: {}", + etcd_path.display() + ); + + Ok(etcd_path) + } + + pub fn comma_separated_endpoints(&self) -> String { + self.broker_endpoints.iter().map(Url::as_str).fold( + String::new(), + |mut comma_separated_urls, url| { + if !comma_separated_urls.is_empty() { + comma_separated_urls.push(','); + } + comma_separated_urls.push_str(url); + comma_separated_urls + }, + ) + } +} + #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct PageServerConf { @@ -97,7 +146,6 @@ pub struct PageServerConf { // jwt auth token used for communication with pageserver pub auth_token: String, - pub broker_endpoints: Vec, } impl Default for PageServerConf { @@ -108,7 +156,6 @@ impl Default for PageServerConf { listen_http_addr: String::new(), auth_type: AuthType::Trust, auth_token: String::new(), - broker_endpoints: Vec::new(), } } } @@ -240,17 +287,7 @@ impl LocalEnv { // Find zenith binaries. if env.zenith_distrib_dir == Path::new("") { - let current_exec_path = - env::current_exe().context("Failed to find current excecutable's path")?; - env.zenith_distrib_dir = current_exec_path - .parent() - .with_context(|| { - format!( - "Failed to find a parent directory for executable {}", - current_exec_path.display(), - ) - })? - .to_owned(); + env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); } // If no initial tenant ID was given, generate it. @@ -345,6 +382,22 @@ impl LocalEnv { "directory '{}' already exists. 
Perhaps already initialized?", base_path.display() ); + if !self.pg_distrib_dir.join("bin/postgres").exists() { + bail!( + "Can't find postgres binary at {}", + self.pg_distrib_dir.display() + ); + } + for binary in ["pageserver", "safekeeper"] { + if !self.zenith_distrib_dir.join(binary).exists() { + bail!( + "Can't find binary '{}' in zenith distrib dir '{}'", + binary, + self.zenith_distrib_dir.display() + ); + } + } + for binary in ["pageserver", "safekeeper"] { if !self.zenith_distrib_dir.join(binary).exists() { bail!( @@ -403,7 +456,6 @@ impl LocalEnv { self.pageserver.auth_token = self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; - self.pageserver.broker_endpoints = self.broker_endpoints.clone(); fs::create_dir_all(self.pg_data_dirs_path())?; @@ -435,26 +487,12 @@ mod tests { "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}" ); - let regular_url_string = "broker_endpoints = ['localhost:1111']"; - let regular_url_toml = simple_conf_toml.replace( - "[pageserver]", - &format!("\n{regular_url_string}\n[pageserver]"), - ); - match LocalEnv::parse_config(®ular_url_toml) { - Ok(regular_url_parsed) => { - assert_eq!( - regular_url_parsed.broker_endpoints, - vec!["localhost:1111".parse().unwrap()], - "Unexpectedly parsed broker endpoint url" - ); - } - Err(e) => panic!("failed to parse simple config {regular_url_toml}, reason: {e}"), - } - - let spoiled_url_string = "broker_endpoints = ['!@$XOXO%^&']"; - let spoiled_url_toml = simple_conf_toml.replace( - "[pageserver]", - &format!("\n{spoiled_url_string}\n[pageserver]"), + let string_to_replace = "broker_endpoints = ['http://127.0.0.1:2379']"; + let spoiled_url_str = "broker_endpoints = ['!@$XOXO%^&']"; + let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str); + assert!( + spoiled_url_toml.contains(spoiled_url_str), + "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}" ); let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml); assert!( diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index c5b7f830bf..407cd05c73 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -12,7 +12,7 @@ use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; use postgres::Config; use reqwest::blocking::{Client, RequestBuilder, Response}; -use reqwest::{IntoUrl, Method, Url}; +use reqwest::{IntoUrl, Method}; use safekeeper::http::models::TimelineCreateRequest; use thiserror::Error; use utils::{ @@ -75,9 +75,6 @@ pub struct SafekeeperNode { pub http_base_url: String, pub pageserver: Arc, - - broker_endpoints: Vec, - broker_etcd_prefix: Option, } impl SafekeeperNode { @@ -94,8 +91,6 @@ impl SafekeeperNode { http_client: Client::new(), http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), pageserver, - broker_endpoints: env.broker_endpoints.clone(), - broker_etcd_prefix: env.broker_etcd_prefix.clone(), } } @@ -137,29 +132,21 @@ impl SafekeeperNode { .args(&["--listen-pg", &listen_pg]) .args(&["--listen-http", &listen_http]) .args(&["--recall", "1 second"]) - .args(&["--broker-endpoints", &self.broker_endpoints.join(",")]) + .args(&[ + "--broker-endpoints", + &self.env.etcd_broker.comma_separated_endpoints(), + ]) .arg("--daemonize"), ); if !self.conf.sync { cmd.arg("--no-sync"); } - if !self.broker_endpoints.is_empty() { - cmd.args(&[ - "--broker-endpoints", - &self.broker_endpoints.iter().map(Url::as_str).fold( - String::new(), - |mut comma_separated_urls, 
url| { - if !comma_separated_urls.is_empty() { - comma_separated_urls.push(','); - } - comma_separated_urls.push_str(url); - comma_separated_urls - }, - ), - ]); + let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints(); + if !comma_separated_endpoints.is_empty() { + cmd.args(&["--broker-endpoints", &comma_separated_endpoints]); } - if let Some(prefix) = self.broker_etcd_prefix.as_deref() { + if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() { cmd.args(&["--broker-etcd-prefix", prefix]); } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 0b9fddd64a..7dbc19e145 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -124,7 +124,7 @@ impl PageServerNode { let broker_endpoints_param = format!( "broker_endpoints=[{}]", self.env - .pageserver + .etcd_broker .broker_endpoints .iter() .map(|url| format!("'{url}'")) @@ -142,6 +142,16 @@ impl PageServerNode { args.extend(["-c", &broker_endpoints_param]); args.extend(["-c", &id]); + let broker_etcd_prefix_param = self + .env + .etcd_broker + .broker_etcd_prefix + .as_ref() + .map(|prefix| format!("broker_etcd_prefix='{prefix}'")); + if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() { + args.extend(["-c", broker_etcd_prefix_param]); + } + for config_override in config_overrides { args.extend(["-c", config_override]); } diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 0e4cf45f29..6bcbc76551 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -1,17 +1,20 @@ #!/bin/sh set -eux +broker_endpoints_param="${BROKER_ENDPOINT:-absent}" +if [ "$broker_endpoints_param" != "absent" ]; then + broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']" +else + broker_endpoints_param='' +fi + if [ "$1" = 'pageserver' ]; then if [ ! -d "/data/tenants" ]; then echo "Initializing pageserver data directory" - pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" + pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" $broker_endpoints_param fi echo "Staring pageserver at 0.0.0.0:6400" - if [ -z '${BROKER_ENDPOINTS}' ]; then - pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data - else - pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['${BROKER_ENDPOINTS}']" -D /data - fi + pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data else "$@" fi diff --git a/docs/settings.md b/docs/settings.md index 017d349bb6..9564ef626f 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -25,10 +25,14 @@ max_file_descriptors = '100' # initial superuser role name to use when creating a new tenant initial_superuser_name = 'zenith_admin' +broker_etcd_prefix = 'neon' +broker_endpoints = ['some://etcd'] + # [remote_storage] ``` -The config above shows default values for all basic pageserver settings. +The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user, +see the corresponding section below. Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank. Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start. 
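For example, both settings can also be supplied through the `-c` override mechanism documented in this section (the endpoint value below is a placeholder): `${PAGESERVER_BIN} -c "broker_endpoints=['http://127.0.0.1:2379']" -c "broker_etcd_prefix='neon'"`.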
@@ -46,6 +50,17 @@ Example: `${PAGESERVER_BIN} -c "checkpoint_period = '100 s'" -c "remote_storage= Note that TOML distinguishes between strings and integers, the former require single or double quotes around them. +#### broker_endpoints + +A list of endpoints (etcd currently) to connect and pull the information from. +Mandatory, does not have a default, since requires etcd to be started as a separate process, +and its connection url should be specified separately. + +#### broker_etcd_prefix + +A prefix to add for every etcd key used, to separate one group of related instances from another, in the same cluster. +Default is `neon`. + #### checkpoint_distance `checkpoint_distance` is the amount of incoming WAL that is held in diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 1b27f99ccf..76181f9ba1 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -19,6 +19,10 @@ use utils::{ zid::{ZNodeId, ZTenantId, ZTenantTimelineId}, }; +/// Default value to use for prefixing to all etcd keys with. +/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster. +pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon"; + #[derive(Debug, Deserialize, Serialize)] struct SafekeeperTimeline { safekeeper_id: ZNodeId, @@ -104,28 +108,28 @@ impl SkTimelineSubscription { /// The subscription kind to the timeline updates from safekeeper. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct SkTimelineSubscriptionKind { - broker_prefix: String, + broker_etcd_prefix: String, kind: SubscriptionKind, } impl SkTimelineSubscriptionKind { - pub fn all(broker_prefix: String) -> Self { + pub fn all(broker_etcd_prefix: String) -> Self { Self { - broker_prefix, + broker_etcd_prefix, kind: SubscriptionKind::All, } } - pub fn tenant(broker_prefix: String, tenant: ZTenantId) -> Self { + pub fn tenant(broker_etcd_prefix: String, tenant: ZTenantId) -> Self { Self { - broker_prefix, + broker_etcd_prefix, kind: SubscriptionKind::Tenant(tenant), } } - pub fn timeline(broker_prefix: String, timeline: ZTenantTimelineId) -> Self { + pub fn timeline(broker_etcd_prefix: String, timeline: ZTenantTimelineId) -> Self { Self { - broker_prefix, + broker_etcd_prefix, kind: SubscriptionKind::Timeline(timeline), } } @@ -134,12 +138,12 @@ impl SkTimelineSubscriptionKind { match self.kind { SubscriptionKind::All => Regex::new(&format!( r"^{}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$", - self.broker_prefix + self.broker_etcd_prefix )) .expect("wrong regex for 'everything' subscription"), SubscriptionKind::Tenant(tenant_id) => Regex::new(&format!( r"^{}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]])$", - self.broker_prefix + self.broker_etcd_prefix )) .expect("wrong regex for 'tenant' subscription"), SubscriptionKind::Timeline(ZTenantTimelineId { @@ -147,7 +151,7 @@ impl SkTimelineSubscriptionKind { timeline_id, }) => Regex::new(&format!( r"^{}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]])$", - self.broker_prefix + self.broker_etcd_prefix )) .expect("wrong regex for 'timeline' subscription"), } @@ -156,16 +160,16 @@ impl SkTimelineSubscriptionKind { /// Etcd key to use for watching a certain timeline updates from safekeepers. 
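To make the renamed field concrete: with the default `neon` prefix, the watch keys produced by the subscription kinds above, and the per-safekeeper key written under them, look roughly like this (the IDs are placeholders):

```
All:       neon
Tenant:    neon/<tenant_id>/safekeeper
Timeline:  neon/<tenant_id>/<timeline_id>/safekeeper
Data key:  neon/<tenant_id>/<timeline_id>/safekeeper/<safekeeper_id>
```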
pub fn watch_key(&self) -> String { match self.kind { - SubscriptionKind::All => self.broker_prefix.to_string(), + SubscriptionKind::All => self.broker_etcd_prefix.to_string(), SubscriptionKind::Tenant(tenant_id) => { - format!("{}/{tenant_id}/safekeeper", self.broker_prefix) + format!("{}/{tenant_id}/safekeeper", self.broker_etcd_prefix) } SubscriptionKind::Timeline(ZTenantTimelineId { tenant_id, timeline_id, }) => format!( "{}/{tenant_id}/{timeline_id}/safekeeper", - self.broker_prefix + self.broker_etcd_prefix ), } } diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index e5ac46d3b1..f04af9cfdd 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -1,10 +1,10 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{App, AppSettings, Arg, ArgMatches}; use control_plane::compute::ComputeControlPlane; -use control_plane::local_env; -use control_plane::local_env::LocalEnv; +use control_plane::local_env::{EtcdBroker, LocalEnv}; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage::PageServerNode; +use control_plane::{etcd, local_env}; use pageserver::config::defaults::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, @@ -14,6 +14,7 @@ use safekeeper::defaults::{ DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; use std::collections::{BTreeSet, HashMap}; +use std::path::Path; use std::process::exit; use std::str::FromStr; use utils::{ @@ -32,28 +33,27 @@ const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); -fn default_conf() -> String { +fn default_conf(etcd_binary_path: &Path) -> String { format!( r#" # Default built-in configuration, defined in main.rs +[etcd_broker] +broker_endpoints = ['http://localhost:2379'] +etcd_binary_path = '{etcd_binary_path}' + [pageserver] -id = {pageserver_id} -listen_pg_addr = '{pageserver_pg_addr}' -listen_http_addr = '{pageserver_http_addr}' +id = {DEFAULT_PAGESERVER_ID} +listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}' +listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}' auth_type = '{pageserver_auth_type}' [[safekeepers]] -id = {safekeeper_id} -pg_port = {safekeeper_pg_port} -http_port = {safekeeper_http_port} +id = {DEFAULT_SAFEKEEPER_ID} +pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} +http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} "#, - pageserver_id = DEFAULT_PAGESERVER_ID, - pageserver_pg_addr = DEFAULT_PAGESERVER_PG_ADDR, - pageserver_http_addr = DEFAULT_PAGESERVER_HTTP_ADDR, + etcd_binary_path = etcd_binary_path.display(), pageserver_auth_type = AuthType::Trust, - safekeeper_id = DEFAULT_SAFEKEEPER_ID, - safekeeper_pg_port = DEFAULT_SAFEKEEPER_PG_PORT, - safekeeper_http_port = DEFAULT_SAFEKEEPER_HTTP_PORT, ) } @@ -167,12 +167,12 @@ fn main() -> Result<()> { .subcommand(App::new("create") .arg(tenant_id_arg.clone()) .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) - .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) - ) + .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) + ) .subcommand(App::new("config") .arg(tenant_id_arg.clone()) - .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) - ) + .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) + ) ) .subcommand( App::new("pageserver") @@ -468,17 +468,17 @@ fn 
parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result Result { +fn handle_init(init_match: &ArgMatches) -> anyhow::Result { let initial_timeline_id_arg = parse_timeline_id(init_match)?; // Create config file let toml_file: String = if let Some(config_path) = init_match.value_of("config") { // load and parse the file std::fs::read_to_string(std::path::Path::new(config_path)) - .with_context(|| format!("Could not read configuration file \"{}\"", config_path))? + .with_context(|| format!("Could not read configuration file '{config_path}'"))? } else { // Built-in default config - default_conf() + default_conf(&EtcdBroker::locate_etcd()?) }; let mut env = @@ -497,7 +497,7 @@ fn handle_init(init_match: &ArgMatches) -> Result { &pageserver_config_overrides(init_match), ) .unwrap_or_else(|e| { - eprintln!("pageserver init failed: {}", e); + eprintln!("pageserver init failed: {e}"); exit(1); }); @@ -920,20 +920,23 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { + etcd::start_etcd_process(env)?; let pageserver = PageServerNode::from_env(env); // Postgres nodes are not started automatically if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { - eprintln!("pageserver start failed: {}", e); + eprintln!("pageserver start failed: {e}"); + try_stop_etcd_process(env); exit(1); } for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.start() { - eprintln!("safekeeper '{}' start failed: {}", safekeeper.id, e); + eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id); + try_stop_etcd_process(env); exit(1); } } @@ -963,5 +966,14 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result< eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e); } } + + try_stop_etcd_process(env); + Ok(()) } + +fn try_stop_etcd_process(env: &local_env::LocalEnv) { + if let Err(e) = etcd::stop_etcd_process(env) { + eprintln!("etcd stop failed: {e}"); + } +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9cc8444531..290f52e0b2 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -55,6 +55,7 @@ fail = "0.5.0" git-version = "0.3.5" postgres_ffi = { path = "../libs/postgres_ffi" } +etcd_broker = { path = "../libs/etcd_broker" } metrics = { path = "../libs/metrics" } utils = { path = "../libs/utils" } remote_storage = { path = "../libs/remote_storage" } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8748683f32..a9215c0701 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -113,6 +113,10 @@ pub struct PageServerConf { pub profiling: ProfilingConfig, pub default_tenant_conf: TenantConf, + /// A prefix to add in etcd brokers before every key. + /// Can be used for isolating different pageserver groups withing the same etcd cluster. + pub broker_etcd_prefix: String, + /// Etcd broker endpoints to connect to. 
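All of these builder fields rely on the same small `Set`/`NotSet` wrapper. A condensed sketch of the idea (simplified, not the actual pageserver definitions):

```rust
// Defaults are stored as Set(value), mandatory options start as NotSet,
// and build() turns a still-NotSet field into an error via ok_or().
use anyhow::anyhow;

enum BuilderValue<T> {
    Set(T),
    NotSet,
}

impl<T> BuilderValue<T> {
    fn ok_or<E>(self, err: E) -> Result<T, E> {
        match self {
            BuilderValue::Set(value) => Ok(value),
            BuilderValue::NotSet => Err(err),
        }
    }
}

fn resolve_prefix(prefix: BuilderValue<String>) -> anyhow::Result<String> {
    // Mirrors `self.broker_etcd_prefix.ok_or(anyhow!("missing broker_etcd_prefix"))?`
    prefix.ok_or(anyhow!("missing broker_etcd_prefix"))
}
```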
pub broker_endpoints: Vec, } @@ -179,6 +183,7 @@ struct PageServerConfigBuilder { id: BuilderValue, profiling: BuilderValue, + broker_etcd_prefix: BuilderValue, broker_endpoints: BuilderValue>, } @@ -205,7 +210,8 @@ impl Default for PageServerConfigBuilder { remote_storage_config: Set(None), id: NotSet, profiling: Set(ProfilingConfig::Disabled), - broker_endpoints: NotSet, + broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()), + broker_endpoints: Set(Vec::new()), } } } @@ -266,6 +272,10 @@ impl PageServerConfigBuilder { self.broker_endpoints = BuilderValue::Set(broker_endpoints) } + pub fn broker_etcd_prefix(&mut self, broker_etcd_prefix: String) { + self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix) + } + pub fn id(&mut self, node_id: ZNodeId) { self.id = BuilderValue::Set(node_id) } @@ -278,10 +288,6 @@ impl PageServerConfigBuilder { let broker_endpoints = self .broker_endpoints .ok_or(anyhow!("No broker endpoints provided"))?; - ensure!( - !broker_endpoints.is_empty(), - "Empty broker endpoints collection provided" - ); Ok(PageServerConf { listen_pg_addr: self @@ -319,6 +325,9 @@ impl PageServerConfigBuilder { // TenantConf is handled separately default_tenant_conf: TenantConf::default(), broker_endpoints, + broker_etcd_prefix: self + .broker_etcd_prefix + .ok_or(anyhow!("missing broker_etcd_prefix"))?, }) } } @@ -392,6 +401,7 @@ impl PageServerConf { } "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)), "profiling" => builder.profiling(parse_toml_from_str(key, item)?), + "broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?), "broker_endpoints" => builder.broker_endpoints( parse_toml_array(key, item)? .into_iter() @@ -556,6 +566,7 @@ impl PageServerConf { profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::dummy_conf(), broker_endpoints: Vec::new(), + broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), } } } @@ -700,6 +711,7 @@ id = 10 broker_endpoints: vec![broker_endpoint .parse() .expect("Failed to parse a valid broker endpoint URL")], + broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), }, "Correct defaults should be used when no config values are provided" ); @@ -743,6 +755,7 @@ id = 10 broker_endpoints: vec![broker_endpoint .parse() .expect("Failed to parse a valid broker endpoint URL")], + broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), }, "Should be able to parse all basic config values correctly" ); @@ -795,7 +808,7 @@ broker_endpoints = ['{broker_endpoint}'] max_concurrent_syncs: NonZeroUsize::new( remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS ) - .unwrap(), + .unwrap(), max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS) .unwrap(), storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index d7875a9069..2d47710a88 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -1,7 +1,7 @@ // // Main entry point for the safekeeper executable // -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{bail, Context, Result}; use clap::{App, Arg}; use const_format::formatcp; use daemonize::Daemonize; @@ -179,10 +179,6 @@ fn main() -> anyhow::Result<()> { let collected_ep: Result, ParseError> = addr.split(',').map(Url::parse).collect(); conf.broker_endpoints = collected_ep.context("Failed to parse broker endpoint urls")?; } - ensure!( - 
!conf.broker_endpoints.is_empty(),
-        "No broker endpoints provided"
-    );
     if let Some(prefix) = arg_matches.value_of("broker-etcd-prefix") {
         conf.broker_etcd_prefix = prefix.to_string();
     }
@@ -313,14 +309,18 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b
         .unwrap();
     threads.push(callmemaybe_thread);
 
-    let conf_ = conf.clone();
-    threads.push(
-        thread::Builder::new()
-            .name("broker thread".into())
-            .spawn(|| {
-                broker::thread_main(conf_);
-            })?,
-    );
+    if !conf.broker_endpoints.is_empty() {
+        let conf_ = conf.clone();
+        threads.push(
+            thread::Builder::new()
+                .name("broker thread".into())
+                .spawn(|| {
+                    broker::thread_main(conf_);
+                })?,
+        );
+    } else {
+        warn!("No broker endpoints provided, starting without node sync")
+    }
 
     let conf_ = conf.clone();
     threads.push(
diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs
index c906bc1e74..d7217be20a 100644
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -34,13 +34,13 @@ pub fn thread_main(conf: SafeKeeperConf) {
 
 /// Key to per timeline per safekeeper data.
 fn timeline_safekeeper_path(
-    broker_prefix: String,
+    broker_etcd_prefix: String,
     zttid: ZTenantTimelineId,
     sk_id: ZNodeId,
 ) -> String {
     format!(
         "{}/{sk_id}",
-        SkTimelineSubscriptionKind::timeline(broker_prefix, zttid).watch_key()
+        SkTimelineSubscriptionKind::timeline(broker_etcd_prefix, zttid).watch_key()
     )
 }
 
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 131076fab6..a87e5da686 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -27,7 +27,6 @@ pub mod defaults {
     pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454;
     pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
 
-    pub const DEFAULT_NEON_BROKER_PREFIX: &str = "neon";
     pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676;
     pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
 
@@ -82,7 +81,7 @@ impl Default for SafeKeeperConf {
             recall_period: defaults::DEFAULT_RECALL_PERIOD,
             my_id: ZNodeId(0),
             broker_endpoints: Vec::new(),
-            broker_etcd_prefix: defaults::DEFAULT_NEON_BROKER_PREFIX.to_string(),
+            broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
             s3_offload_enabled: true,
         }
     }
diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py
index 85798156a7..e1b7bd91ee 100644
--- a/test_runner/batch_others/test_wal_acceptor.py
+++ b/test_runner/batch_others/test_wal_acceptor.py
@@ -13,7 +13,7 @@ from dataclasses import dataclass, field
 from multiprocessing import Process, Value
 from pathlib import Path
 from fixtures.zenith_fixtures import PgBin, Etcd, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol
-from fixtures.utils import etcd_path, get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex
+from fixtures.utils import get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex
 from fixtures.log_helper import log
 from typing import List, Optional, Any
 
@@ -327,7 +327,6 @@ def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value):
 
 # Test that safekeepers push their info to the broker and learn peer status from it
-@pytest.mark.skipif(etcd_path() is None, reason="requires etcd which is not present in PATH")
 def test_broker(zenith_env_builder: ZenithEnvBuilder):
     zenith_env_builder.num_safekeepers = 3
     zenith_env_builder.enable_local_fs_remote_storage()
     env = zenith_env_builder.init_start()
@@ -369,7 +368,6 @@ def test_broker(zenith_env_builder: ZenithEnvBuilder):
 
 # Test that old WAL
consumed by peers and pageserver is removed from safekeepers. -@pytest.mark.skipif(etcd_path() is None, reason="requires etcd which is not present in PATH") def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 2 # to advance remote_consistent_llsn diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 7b95e729d9..ba9bc6e113 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,8 +1,9 @@ import os import shutil import subprocess +from pathlib import Path -from typing import Any, List +from typing import Any, List, Optional from fixtures.log_helper import log @@ -80,9 +81,12 @@ def print_gc_result(row): .format_map(row)) -# path to etcd binary or None if not present. -def etcd_path(): - return shutil.which("etcd") +def etcd_path() -> Path: + path_output = shutil.which("etcd") + if path_output is None: + raise RuntimeError('etcd not found in PATH') + else: + return Path(path_output) # Traverse directory to get total size. diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 09f7f26588..78de78144c 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -555,7 +555,9 @@ class ZenithEnv: self.broker = config.broker toml += textwrap.dedent(f""" + [etcd_broker] broker_endpoints = ['{self.broker.client_url()}'] + etcd_binary_path = '{self.broker.binary_path}' """) # Create config for pageserver @@ -1846,6 +1848,7 @@ class Etcd: datadir: str port: int peer_port: int + binary_path: Path = etcd_path() handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon def client_url(self): @@ -1858,15 +1861,15 @@ class Etcd: def start(self): pathlib.Path(self.datadir).mkdir(exist_ok=True) - etcd_full_path = etcd_path() - if etcd_full_path is None: - raise Exception('etcd binary not found locally') + + if not self.binary_path.is_file(): + raise RuntimeError(f"etcd broker binary '{self.binary_path}' is not a file") client_url = self.client_url() log.info(f'Starting etcd to listen incoming connections at "{client_url}"') with open(os.path.join(self.datadir, "etcd.log"), "wb") as log_file: args = [ - etcd_full_path, + self.binary_path, f"--data-dir={self.datadir}", f"--listen-client-urls={client_url}", f"--advertise-client-urls={client_url}", @@ -1927,8 +1930,7 @@ SKIP_DIRS = frozenset(('pg_wal', 'pg_stat_tmp', 'pg_subtrans', 'pg_logical', - 'pg_replslot/wal_proposer_slot', - 'pg_xact')) + 'pg_replslot/wal_proposer_slot')) SKIP_FILES = frozenset(('pg_internal.init', 'pg.log', From f2881bbd8a90bc4b04fb1693ad3a684b260a0f98 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 14 May 2022 15:03:12 +0300 Subject: [PATCH 0302/1022] Start and stop single etcd and mock s3 servers globally in python tests --- .circleci/config.yml | 2 +- control_plane/src/safekeeper.rs | 4 - test_runner/README.md | 1 - .../batch_others/test_tenant_relocation.py | 8 +- test_runner/fixtures/zenith_fixtures.py | 151 ++++++++++-------- 5 files changed, 87 insertions(+), 79 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 85654b5d45..62ae60eb18 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -355,7 +355,7 @@ jobs: when: always command: | du -sh /tmp/test_output/* - find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! 
-name "flamegraph.svg" -delete + find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "etcd.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" -delete du -sh /tmp/test_output/* - store_artifacts: path: /tmp/test_output diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 407cd05c73..1ac06cb2d2 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -132,10 +132,6 @@ impl SafekeeperNode { .args(&["--listen-pg", &listen_pg]) .args(&["--listen-http", &listen_http]) .args(&["--recall", "1 second"]) - .args(&[ - "--broker-endpoints", - &self.env.etcd_broker.comma_separated_endpoints(), - ]) .arg("--daemonize"), ); if !self.conf.sync { diff --git a/test_runner/README.md b/test_runner/README.md index ee171ae6a0..059bbb83cc 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -51,7 +51,6 @@ Useful environment variables: should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. `ZENITH_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as -`FORCE_MOCK_S3`: inits every test's pageserver with a mock S3 used as a remote storage. `--pageserver-config-override=${value}` parameter values when zenith cli is invoked `RUST_LOG`: logging configuration to pass into Zenith CLI diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 85a91b9ce1..0e5dd6eadf 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -3,8 +3,10 @@ import os import pathlib import subprocess import threading +import typing from uuid import UUID from fixtures.log_helper import log +from typing import Optional import signal import pytest @@ -22,7 +24,7 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, remote_storage_mock_path: pathlib.Path, pg_port: int, http_port: int, - broker: Etcd): + broker: Optional[Etcd]): """ cannot use ZenithPageserver yet because it depends on zenith cli which currently lacks support for multiple pageservers @@ -37,9 +39,11 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, f"-c pg_distrib_dir='{pg_distrib_dir}'", f"-c id=2", f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", - f"-c broker_endpoints=['{broker.client_url()}']", ] + if broker is not None: + cmd.append(f"-c broker_endpoints=['{broker.client_url()}']", ) + subprocess.check_output(cmd, text=True) # actually run new pageserver diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 78de78144c..8fca56143e 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -61,7 +61,7 @@ DEFAULT_POSTGRES_DIR = 'tmp_install' DEFAULT_BRANCH_NAME = 'main' BASE_PORT = 15000 -WORKER_PORT_NUM = 100 +WORKER_PORT_NUM = 1000 def pytest_addoption(parser): @@ -178,7 +178,7 @@ def shareable_scope(fixture_name, config) -> Literal["session", "function"]: return 'function' if os.environ.get('TEST_SHARED_FIXTURES') is None else 'session' -@pytest.fixture(scope=shareable_scope) +@pytest.fixture(scope='session') def worker_seq_no(worker_id: str): # worker_id is a pytest-xdist fixture # it can be master or gw @@ -189,7 +189,7 @@ def worker_seq_no(worker_id: str): return int(worker_id[2:]) -@pytest.fixture(scope=shareable_scope) 
+@pytest.fixture(scope='session') def worker_base_port(worker_seq_no: int): # so we divide ports in ranges of 100 ports # so workers have disjoint set of ports for services @@ -242,11 +242,30 @@ class PortDistributor: 'port range configured for test is exhausted, consider enlarging the range') -@pytest.fixture(scope=shareable_scope) +@pytest.fixture(scope='session') def port_distributor(worker_base_port): return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) +@pytest.fixture(scope='session') +def default_broker(request: Any, port_distributor: PortDistributor): + client_port = port_distributor.get_port() + # multiple pytest sessions could get launched in parallel, get them different datadirs + etcd_datadir = os.path.join(get_test_output_dir(request), f"etcd_datadir_{client_port}") + pathlib.Path(etcd_datadir).mkdir(exist_ok=True, parents=True) + + broker = Etcd(datadir=etcd_datadir, port=client_port, peer_port=port_distributor.get_port()) + yield broker + broker.stop() + + +@pytest.fixture(scope='session') +def mock_s3_server(port_distributor: PortDistributor): + mock_s3_server = MockS3Server(port_distributor.get_port()) + yield mock_s3_server + mock_s3_server.kill() + + class PgProtocol: """ Reusable connection logic """ def __init__(self, **kwargs): @@ -410,7 +429,9 @@ class ZenithEnvBuilder: def __init__(self, repo_dir: Path, port_distributor: PortDistributor, - pageserver_remote_storage: Optional[RemoteStorage] = None, + broker: Etcd, + mock_s3_server: MockS3Server, + remote_storage: Optional[RemoteStorage] = None, pageserver_config_override: Optional[str] = None, num_safekeepers: int = 1, pageserver_auth_enabled: bool = False, @@ -419,24 +440,15 @@ class ZenithEnvBuilder: self.repo_dir = repo_dir self.rust_log_override = rust_log_override self.port_distributor = port_distributor - self.pageserver_remote_storage = pageserver_remote_storage + self.remote_storage = remote_storage + self.broker = broker + self.mock_s3_server = mock_s3_server self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers self.pageserver_auth_enabled = pageserver_auth_enabled self.default_branch_name = default_branch_name - # keep etcd datadir inside 'repo' - self.broker = Etcd(datadir=os.path.join(self.repo_dir, "etcd"), - port=self.port_distributor.get_port(), - peer_port=self.port_distributor.get_port()) self.env: Optional[ZenithEnv] = None - self.s3_mock_server: Optional[MockS3Server] = None - - if os.getenv('FORCE_MOCK_S3') is not None: - bucket_name = f'{repo_dir.name}_bucket' - log.warning(f'Unconditionally initializing mock S3 server for bucket {bucket_name}') - self.enable_s3_mock_remote_storage(bucket_name) - def init(self) -> ZenithEnv: # Cannot create more than one environment from one builder assert self.env is None, "environment already initialized" @@ -457,9 +469,8 @@ class ZenithEnvBuilder: """ def enable_local_fs_remote_storage(self, force_enable=True): - assert force_enable or self.pageserver_remote_storage is None, "remote storage is enabled already" - self.pageserver_remote_storage = LocalFsStorage( - Path(self.repo_dir / 'local_fs_remote_storage')) + assert force_enable or self.remote_storage is None, "remote storage is enabled already" + self.remote_storage = LocalFsStorage(Path(self.repo_dir / 'local_fs_remote_storage')) """ Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already. 
@@ -468,22 +479,19 @@ class ZenithEnvBuilder: """ def enable_s3_mock_remote_storage(self, bucket_name: str, force_enable=True): - assert force_enable or self.pageserver_remote_storage is None, "remote storage is enabled already" - if not self.s3_mock_server: - self.s3_mock_server = MockS3Server(self.port_distributor.get_port()) - - mock_endpoint = self.s3_mock_server.endpoint() - mock_region = self.s3_mock_server.region() + assert force_enable or self.remote_storage is None, "remote storage is enabled already" + mock_endpoint = self.mock_s3_server.endpoint() + mock_region = self.mock_s3_server.region() boto3.client( 's3', endpoint_url=mock_endpoint, region_name=mock_region, - aws_access_key_id=self.s3_mock_server.access_key(), - aws_secret_access_key=self.s3_mock_server.secret_key(), + aws_access_key_id=self.mock_s3_server.access_key(), + aws_secret_access_key=self.mock_s3_server.secret_key(), ).create_bucket(Bucket=bucket_name) - self.pageserver_remote_storage = S3Storage(bucket=bucket_name, - endpoint=mock_endpoint, - region=mock_region) + self.remote_storage = S3Storage(bucket=bucket_name, + endpoint=mock_endpoint, + region=mock_region) def __enter__(self): return self @@ -497,10 +505,6 @@ class ZenithEnvBuilder: for sk in self.env.safekeepers: sk.stop(immediate=True) self.env.pageserver.stop(immediate=True) - if self.s3_mock_server: - self.s3_mock_server.kill() - if self.env.broker is not None: - self.env.broker.stop() class ZenithEnv: @@ -539,10 +543,12 @@ class ZenithEnv: self.repo_dir = config.repo_dir self.rust_log_override = config.rust_log_override self.port_distributor = config.port_distributor - self.s3_mock_server = config.s3_mock_server + self.s3_mock_server = config.mock_s3_server self.zenith_cli = ZenithCli(env=self) self.postgres = PostgresFactory(self) self.safekeepers: List[Safekeeper] = [] + self.broker = config.broker + self.remote_storage = config.remote_storage # generate initial tenant ID here instead of letting 'zenith init' generate it, # so that we don't need to dig it out of the config file afterwards. @@ -553,7 +559,6 @@ class ZenithEnv: default_tenant_id = '{self.initial_tenant.hex}' """) - self.broker = config.broker toml += textwrap.dedent(f""" [etcd_broker] broker_endpoints = ['{self.broker.client_url()}'] @@ -578,7 +583,6 @@ class ZenithEnv: # Create a corresponding ZenithPageserver object self.pageserver = ZenithPageserver(self, port=pageserver_port, - remote_storage=config.pageserver_remote_storage, config_override=config.pageserver_config_override) # Create config and a Safekeeper object for each safekeeper @@ -602,15 +606,13 @@ class ZenithEnv: self.zenith_cli.init(toml) def start(self): - # Start up the page server, all the safekeepers and the broker + # Start up broker, pageserver and all safekeepers + self.broker.try_start() self.pageserver.start() for safekeeper in self.safekeepers: safekeeper.start() - if self.broker is not None: - self.broker.start() - def get_safekeeper_connstrs(self) -> str: """ Get list of safekeeper endpoints suitable for safekeepers GUC """ return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers]) @@ -623,7 +625,10 @@ class ZenithEnv: @pytest.fixture(scope=shareable_scope) -def _shared_simple_env(request: Any, port_distributor) -> Iterator[ZenithEnv]: +def _shared_simple_env(request: Any, + port_distributor: PortDistributor, + mock_s3_server: MockS3Server, + default_broker: Etcd) -> Iterator[ZenithEnv]: """ Internal fixture backing the `zenith_simple_env` fixture. 
If TEST_SHARED_FIXTURES is set, this is shared by all tests using `zenith_simple_env`. @@ -637,7 +642,8 @@ def _shared_simple_env(request: Any, port_distributor) -> Iterator[ZenithEnv]: repo_dir = os.path.join(str(top_output_dir), "shared_repo") shutil.rmtree(repo_dir, ignore_errors=True) - with ZenithEnvBuilder(Path(repo_dir), port_distributor) as builder: + with ZenithEnvBuilder(Path(repo_dir), port_distributor, default_broker, + mock_s3_server) as builder: env = builder.init_start() # For convenience in tests, create a branch from the freshly-initialized cluster. @@ -659,12 +665,13 @@ def zenith_simple_env(_shared_simple_env: ZenithEnv) -> Iterator[ZenithEnv]: yield _shared_simple_env _shared_simple_env.postgres.stop_all() - if _shared_simple_env.s3_mock_server: - _shared_simple_env.s3_mock_server.kill() @pytest.fixture(scope='function') -def zenith_env_builder(test_output_dir, port_distributor) -> Iterator[ZenithEnvBuilder]: +def zenith_env_builder(test_output_dir, + port_distributor: PortDistributor, + mock_s3_server: MockS3Server, + default_broker: Etcd) -> Iterator[ZenithEnvBuilder]: """ Fixture to create a Zenith environment for test. @@ -682,7 +689,8 @@ def zenith_env_builder(test_output_dir, port_distributor) -> Iterator[ZenithEnvB repo_dir = os.path.join(test_output_dir, "repo") # Return the builder to the caller - with ZenithEnvBuilder(Path(repo_dir), port_distributor) as builder: + with ZenithEnvBuilder(Path(repo_dir), port_distributor, default_broker, + mock_s3_server) as builder: yield builder @@ -979,9 +987,10 @@ class ZenithCli: cmd = ['init', f'--config={tmp.name}'] if initial_timeline_id: cmd.extend(['--timeline-id', initial_timeline_id.hex]) - append_pageserver_param_overrides(cmd, - self.env.pageserver.remote_storage, - self.env.pageserver.config_override) + append_pageserver_param_overrides( + params_to_update=cmd, + remote_storage=self.env.remote_storage, + pageserver_config_override=self.env.pageserver.config_override) res = self.raw_cli(cmd) res.check_returncode() @@ -1002,9 +1011,10 @@ class ZenithCli: def pageserver_start(self, overrides=()) -> 'subprocess.CompletedProcess[str]': start_args = ['pageserver', 'start', *overrides] - append_pageserver_param_overrides(start_args, - self.env.pageserver.remote_storage, - self.env.pageserver.config_override) + append_pageserver_param_overrides( + params_to_update=start_args, + remote_storage=self.env.remote_storage, + pageserver_config_override=self.env.pageserver.config_override) s3_env_vars = None if self.env.s3_mock_server: @@ -1174,16 +1184,11 @@ class ZenithPageserver(PgProtocol): Initializes the repository via `zenith init`. 
""" - def __init__(self, - env: ZenithEnv, - port: PageserverPort, - remote_storage: Optional[RemoteStorage] = None, - config_override: Optional[str] = None): + def __init__(self, env: ZenithEnv, port: PageserverPort, config_override: Optional[str] = None): super().__init__(host='localhost', port=port.pg, user='zenith_admin') self.env = env self.running = False self.service_port = port - self.remote_storage = remote_storage self.config_override = config_override def start(self, overrides=()) -> 'ZenithPageserver': @@ -1223,21 +1228,21 @@ class ZenithPageserver(PgProtocol): def append_pageserver_param_overrides( params_to_update: List[str], - pageserver_remote_storage: Optional[RemoteStorage], + remote_storage: Optional[RemoteStorage], pageserver_config_override: Optional[str] = None, ): - if pageserver_remote_storage is not None: - if isinstance(pageserver_remote_storage, LocalFsStorage): - pageserver_storage_override = f"local_path='{pageserver_remote_storage.root}'" - elif isinstance(pageserver_remote_storage, S3Storage): - pageserver_storage_override = f"bucket_name='{pageserver_remote_storage.bucket}',\ - bucket_region='{pageserver_remote_storage.region}'" + if remote_storage is not None: + if isinstance(remote_storage, LocalFsStorage): + pageserver_storage_override = f"local_path='{remote_storage.root}'" + elif isinstance(remote_storage, S3Storage): + pageserver_storage_override = f"bucket_name='{remote_storage.bucket}',\ + bucket_region='{remote_storage.region}'" - if pageserver_remote_storage.endpoint is not None: - pageserver_storage_override += f",endpoint='{pageserver_remote_storage.endpoint}'" + if remote_storage.endpoint is not None: + pageserver_storage_override += f",endpoint='{remote_storage.endpoint}'" else: - raise Exception(f'Unknown storage configuration {pageserver_remote_storage}') + raise Exception(f'Unknown storage configuration {remote_storage}') params_to_update.append( f'--pageserver-config-override=remote_storage={{{pageserver_storage_override}}}') @@ -1859,7 +1864,11 @@ class Etcd: s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry s.get(f"{self.client_url()}/health").raise_for_status() - def start(self): + def try_start(self): + if self.handle is not None: + log.debug(f'etcd is already running on port {self.port}') + return + pathlib.Path(self.datadir).mkdir(exist_ok=True) if not self.binary_path.is_file(): From 9ccbb8d331c3eef25f01815e7d058d6260c02bf3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 17 May 2022 10:31:13 +0300 Subject: [PATCH 0303/1022] Make "neon_local stop" less verbose. I got annoyed by all the noise in CI test output. Before: $ ./target/release/neon_local stop Stop pageserver gracefully Pageserver still receives connections Pageserver stopped receiving connections Pageserver status is: Reqwest error: error sending request for url (http://127.0.0.1:9898/v1/status): error trying to connect: tcp connect error: Connection refused (os error 111) initializing for sk 1 for 7676 Stop safekeeper gracefully Safekeeper still receives connections Safekeeper stopped receiving connections Safekeeper status is: Reqwest error: error sending request for url (http://127.0.0.1:7676/v1/status): error trying to connect: tcp connect error: Connection refused (os error 111) After: $ ./target/release/neon_local stop Stopping pageserver gracefully...done! Stopping safekeeper 1 gracefully...done! 
Also removes the spurious "initializing for sk 1 for 7676" message from "neon_local start" --- control_plane/src/safekeeper.rs | 47 ++++++++++++++++++++------------- control_plane/src/storage.rs | 46 ++++++++++++++++++++------------ 2 files changed, 57 insertions(+), 36 deletions(-) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 1ac06cb2d2..d5b6251209 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -81,8 +81,6 @@ impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { let pageserver = Arc::new(PageServerNode::from_env(env)); - println!("initializing for sk {} for {}", conf.id, conf.http_port); - SafekeeperNode { id: conf.id, conf: conf.clone(), @@ -207,12 +205,13 @@ impl SafekeeperNode { let pid = Pid::from_raw(pid); let sig = if immediate { - println!("Stop safekeeper immediately"); + print!("Stopping safekeeper {} immediately..", self.id); Signal::SIGQUIT } else { - println!("Stop safekeeper gracefully"); + print!("Stopping safekeeper {} gracefully..", self.id); Signal::SIGTERM }; + io::stdout().flush().unwrap(); match kill(pid, sig) { Ok(_) => (), Err(Errno::ESRCH) => { @@ -234,25 +233,35 @@ impl SafekeeperNode { // TODO Remove this "timeout" and handle it on caller side instead. // Shutting down may take a long time, // if safekeeper flushes a lot of data + let mut tcp_stopped = false; for _ in 0..100 { - if let Err(_e) = TcpStream::connect(&address) { - println!("Safekeeper stopped receiving connections"); - - //Now check status - match self.check_status() { - Ok(_) => { - println!("Safekeeper status is OK. Wait a bit."); - thread::sleep(Duration::from_secs(1)); - } - Err(err) => { - println!("Safekeeper status is: {}", err); - return Ok(()); + if !tcp_stopped { + if let Err(err) = TcpStream::connect(&address) { + tcp_stopped = true; + if err.kind() != io::ErrorKind::ConnectionRefused { + eprintln!("\nSafekeeper connection failed with error: {err}"); } } - } else { - println!("Safekeeper still receives connections"); - thread::sleep(Duration::from_secs(1)); } + if tcp_stopped { + // Also check status on the HTTP port + match self.check_status() { + Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => { + println!("done!"); + return Ok(()); + } + Err(err) => { + eprintln!("\nSafekeeper status check failed with error: {err}"); + return Ok(()); + } + Ok(()) => { + // keep waiting + } + } + } + print!("."); + io::stdout().flush().unwrap(); + thread::sleep(Duration::from_secs(1)); } bail!("Failed to stop safekeeper with pid {}", pid); diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 7dbc19e145..355c7c250d 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -281,12 +281,13 @@ impl PageServerNode { let pid = Pid::from_raw(read_pidfile(&pid_file)?); let sig = if immediate { - println!("Stop pageserver immediately"); + print!("Stopping pageserver immediately.."); Signal::SIGQUIT } else { - println!("Stop pageserver gracefully"); + print!("Stopping pageserver gracefully.."); Signal::SIGTERM }; + io::stdout().flush().unwrap(); match kill(pid, sig) { Ok(_) => (), Err(Errno::ESRCH) => { @@ -308,25 +309,36 @@ impl PageServerNode { // TODO Remove this "timeout" and handle it on caller side instead. 
// Shutting down may take a long time, // if pageserver checkpoints a lot of data + let mut tcp_stopped = false; for _ in 0..100 { - if let Err(_e) = TcpStream::connect(&address) { - println!("Pageserver stopped receiving connections"); - - //Now check status - match self.check_status() { - Ok(_) => { - println!("Pageserver status is OK. Wait a bit."); - thread::sleep(Duration::from_secs(1)); - } - Err(err) => { - println!("Pageserver status is: {}", err); - return Ok(()); + if !tcp_stopped { + if let Err(err) = TcpStream::connect(&address) { + tcp_stopped = true; + if err.kind() != io::ErrorKind::ConnectionRefused { + eprintln!("\nPageserver connection failed with error: {err}"); } } - } else { - println!("Pageserver still receives connections"); - thread::sleep(Duration::from_secs(1)); } + if tcp_stopped { + // Also check status on the HTTP port + + match self.check_status() { + Err(PageserverHttpError::Transport(err)) if err.is_connect() => { + println!("done!"); + return Ok(()); + } + Err(err) => { + eprintln!("\nPageserver status check failed with error: {err}"); + return Ok(()); + } + Ok(()) => { + // keep waiting + } + } + } + print!("."); + io::stdout().flush().unwrap(); + thread::sleep(Duration::from_secs(1)); } bail!("Failed to stop pageserver with pid {}", pid); From 070c255522f1f1d002db127b5a52c957f9016800 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Tue, 17 May 2022 18:03:01 +0300 Subject: [PATCH 0304/1022] Neon stress deploy (#1720) * storage and proxy deployment for neon stress environment * neon stress inventory fix --- .circleci/ansible/neon-stress.hosts | 19 ++++++++ .circleci/config.yml | 49 ++++++++++++++++++++ .circleci/helm-values/neon-stress.proxy.yaml | 34 ++++++++++++++ 3 files changed, 102 insertions(+) create mode 100644 .circleci/ansible/neon-stress.hosts create mode 100644 .circleci/helm-values/neon-stress.proxy.yaml diff --git a/.circleci/ansible/neon-stress.hosts b/.circleci/ansible/neon-stress.hosts new file mode 100644 index 0000000000..283ec0e8b3 --- /dev/null +++ b/.circleci/ansible/neon-stress.hosts @@ -0,0 +1,19 @@ +[pageservers] +neon-stress-ps-1 console_region_id=1 +neon-stress-ps-2 console_region_id=1 + +[safekeepers] +neon-stress-sk-1 console_region_id=1 +neon-stress-sk-2 console_region_id=1 +neon-stress-sk-3 console_region_id=1 + +[storage:children] +pageservers +safekeepers + +[storage:vars] +console_mgmt_base_url = http://neon-stress-console.local +bucket_name = neon-storage-ireland +bucket_region = eu-west-1 +etcd_endpoints = etcd-stress.local:2379 +safekeeper_enable_s3_offload = false diff --git a/.circleci/config.yml b/.circleci/config.yml index 62ae60eb18..fdd3e0cce7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -587,6 +587,55 @@ jobs: helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait + deploy-neon-stress: + docker: + - image: cimg/python:3.10 + steps: + - checkout + - setup_remote_docker + - run: + name: Setup ansible + command: | + pip install --progress-bar off --user ansible boto3 + - run: + name: Redeploy + command: | + cd "$(pwd)/.circleci/ansible" + + ./get_binaries.sh + + echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key + echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub + chmod 0600 ssh-key + ssh-add ssh-key + rm -f ssh-key 
ssh-key-cert.pub + + ansible-playbook deploy.yaml -i neon-stress.hosts + rm -f neon_install.tar.gz .neon_current_version + + deploy-neon-stress-proxy: + docker: + - image: cimg/base:2021.04 + environment: + KUBECONFIG: .kubeconfig + steps: + - checkout + - run: + name: Store kubeconfig file + command: | + echo "${NEON_STRESS_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + - run: + name: Setup helm v3 + command: | + curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + helm repo add neondatabase https://neondatabase.github.io/helm-charts + - run: + name: Re-deploy proxy + command: | + DOCKER_TAG=$(git log --oneline|wc -l) + helm upgrade neon-stress-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait + deploy-release: docker: - image: cimg/python:3.10 diff --git a/.circleci/helm-values/neon-stress.proxy.yaml b/.circleci/helm-values/neon-stress.proxy.yaml new file mode 100644 index 0000000000..8236f9873a --- /dev/null +++ b/.circleci/helm-values/neon-stress.proxy.yaml @@ -0,0 +1,34 @@ +fullnameOverride: "neon-stress-proxy" + +settings: + authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/" + uri: "https://console.dev.neon.tech/psql_session/" + +# -- Additional labels for zenith-proxy pods +podLabels: + zenith_service: proxy + zenith_env: staging + zenith_region: eu-west-1 + zenith_region_slug: ireland + +service: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal + external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local + type: LoadBalancer + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech + +metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack From f03779bf1a555f921f63406f7accdf28e427c8f0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 17 May 2022 16:21:13 +0300 Subject: [PATCH 0305/1022] Fix wait_for_last_record_lsn() and wait_for_upload() python functions. The contract for wait_for() was not very clear. It waits until the given function returns successfully, without an exception, but the wait_for_last_record_lsn() and wait_for_upload() functions used "a < b" as the condition, i.e. they thought that wait_for() would poll until the function returns true. Inline the logic from wait_for() into those two functions, it's not that complicated, and you get a more specific error message too, if it fails. Also add a comment to wait_for() to make it more clear how it works. Also change remote_consistent_lsn() to return 0 instead of raising an exception, if remote is None. That can happen if nothing has been uploaded to remote storage for the timeline yet. It happened once in the CI, and I was able to reproduce that locally too by adding a sleep to the storage sync thread, to delay the first upload. 
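(Illustrative aside, not part of the patch: the contract difference is easy to reproduce in a standalone sketch. The wait_until() shape below mirrors the helper described above; the names current_lsn, target_lsn and assert_caught_up are invented for the example.)

    import time


    def wait_until(iterations: int, interval: float, func):
        """Poll until 'func' returns without raising an exception; return its value."""
        last_exc = None
        for _ in range(iterations):
            try:
                return func()  # any non-raising return value counts as success
            except Exception as e:
                last_exc = e
                time.sleep(interval)
        raise TimeoutError(f"condition never became true: {last_exc}")


    current_lsn, target_lsn = 5, 10

    # Pitfall: a bare comparison never raises, so this "wait" returns immediately
    # with the value False even though the condition does not hold yet.
    print(wait_until(3, 0.01, lambda: current_lsn >= target_lsn))  # False

    # Intended usage: make the callable raise until the condition holds,
    # e.g. by asserting inside it, so the helper actually keeps polling.
    def assert_caught_up():
        assert current_lsn >= target_lsn

    try:
        wait_until(3, 0.01, assert_caught_up)
    except TimeoutError as e:
        print(e)

The second call keeps polling because the assert raises until the condition holds, which is exactly why the LSN waiters below inline an explicit comparison loop instead of passing a bare "a < b" lambda.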
--- .../batch_others/test_remote_storage.py | 8 ++-- .../batch_others/test_tenant_relocation.py | 4 +- test_runner/fixtures/zenith_fixtures.py | 47 +++++++++++++++---- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 3c7bd08996..afbe3c55c7 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -6,7 +6,7 @@ from contextlib import closing from pathlib import Path import time from uuid import UUID -from fixtures.zenith_fixtures import ZenithEnvBuilder, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload +from fixtures.zenith_fixtures import ZenithEnvBuilder, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload from fixtures.log_helper import log from fixtures.utils import lsn_from_hex, lsn_to_hex import pytest @@ -109,9 +109,9 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, client.timeline_attach(UUID(tenant_id), UUID(timeline_id)) log.info("waiting for timeline redownload") - wait_for(number_of_iterations=10, - interval=1, - func=lambda: assert_local(client, UUID(tenant_id), UUID(timeline_id))) + wait_until(number_of_iterations=10, + interval=1, + func=lambda: assert_local(client, UUID(tenant_id), UUID(timeline_id))) detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) assert detail['local'] is not None diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 0e5dd6eadf..91506e120d 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -10,7 +10,7 @@ from typing import Optional import signal import pytest -from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, Etcd, ZenithPageserverHttpClient, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload, zenith_binpath, pg_distrib_dir +from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, Etcd, ZenithPageserverHttpClient, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload, zenith_binpath, pg_distrib_dir from fixtures.utils import lsn_from_hex @@ -191,7 +191,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, # call to attach timeline to new pageserver new_pageserver_http.timeline_attach(tenant, timeline) # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint - new_timeline_detail = wait_for( + new_timeline_detail = wait_until( number_of_iterations=5, interval=1, func=lambda: assert_local(new_pageserver_http, tenant, timeline)) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 8fca56143e..203e73037f 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -34,7 +34,12 @@ from typing_extensions import Literal import requests import backoff # type: ignore -from .utils import (etcd_path, get_self_dir, mkdir_if_needed, subprocess_capture, lsn_from_hex) +from .utils import (etcd_path, + get_self_dir, + mkdir_if_needed, + subprocess_capture, + lsn_from_hex, + lsn_to_hex) from fixtures.log_helper import log """ This file contains pytest fixtures. 
A fixture is a test resource that can be @@ -2065,7 +2070,11 @@ def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Pos assert (mismatch, error) == ([], []) -def wait_for(number_of_iterations: int, interval: int, func): +def wait_until(number_of_iterations: int, interval: int, func): + """ + Wait until 'func' returns successfully, without exception. Returns the last return value + from the the function. + """ last_exception = None for i in range(number_of_iterations): try: @@ -2092,9 +2101,15 @@ def remote_consistent_lsn(pageserver_http_client: ZenithPageserverHttpClient, timeline: uuid.UUID) -> int: detail = pageserver_http_client.timeline_detail(tenant, timeline) - lsn_str = detail['remote']['remote_consistent_lsn'] - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) + if detail['remote'] is None: + # No remote information at all. This happens right after creating + # a timeline, before any part of it it has been uploaded to remote + # storage yet. + return 0 + else: + lsn_str = detail['remote']['remote_consistent_lsn'] + assert isinstance(lsn_str, str) + return lsn_from_hex(lsn_str) def wait_for_upload(pageserver_http_client: ZenithPageserverHttpClient, @@ -2102,8 +2117,15 @@ def wait_for_upload(pageserver_http_client: ZenithPageserverHttpClient, timeline: uuid.UUID, lsn: int): """waits for local timeline upload up to specified lsn""" - - wait_for(10, 1, lambda: remote_consistent_lsn(pageserver_http_client, tenant, timeline) >= lsn) + for i in range(10): + current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) + if current_lsn >= lsn: + return + log.info("waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1)) + time.sleep(1) + raise Exception("timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn))) def last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, @@ -2121,5 +2143,12 @@ def wait_for_last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, timeline: uuid.UUID, lsn: int): """waits for pageserver to catch up to a certain lsn""" - - wait_for(10, 1, lambda: last_record_lsn(pageserver_http_client, tenant, timeline) >= lsn) + for i in range(10): + current_lsn = last_record_lsn(pageserver_http_client, tenant, timeline) + if current_lsn >= lsn: + return + log.info("waiting for last_record_lsn to reach {}, now {}, iteration {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1)) + time.sleep(1) + raise Exception("timed out while waiting for last_record_lsn to reach {}, was {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn))) From 55ea3f262edc8d992e01c67c4ed7ef96203ebbbb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 17 May 2022 18:14:37 +0300 Subject: [PATCH 0306/1022] Fix race condition leading to panic in remote storage sync thread. The SyncQueue consisted of a tokio mpsc channel, and an atomic counter to keep track of how many items there are in the channel. Updating the atomic counter was racy, and sometimes the consumer would decrement the counter before the producer had incremented it, leading to integer wraparound to usize::MAX. Calling Vec::with_capacity(usize::MAX) leads to a panic. To fix, replace the channel with a VecDeque protected by a Mutex, and a condition variable for signaling. Now that the queue is now protected by standard blocking Mutex and Condvar, refactor the functions touching it to be sync, not async. 
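(Illustrative aside, not part of the patch: the actual fix is in Rust, using a std::sync Mutex<VecDeque> plus Condvar as shown in the diff below. The following is only a minimal Python sketch of the same lock-plus-condition-variable pattern, with invented names such as TaskQueue, to show why observing the queue length under the same lock that guards the contents removes the counter-wraparound race.)

    import threading
    from collections import deque


    class TaskQueue:
        """Toy sketch: a deque guarded by a lock, with a condition variable for
        signalling. The length is only ever observed under the same lock that
        protects the contents, so it can never be decremented past zero."""

        def __init__(self):
            self._items = deque()
            self._cond = threading.Condition()  # lock + condition variable in one

        def push(self, item):
            with self._cond:
                self._items.append(item)
                self._cond.notify()  # wake one waiting consumer

        def pop_batch(self):
            with self._cond:
                # Re-check after every wakeup; spurious wakeups are possible.
                while not self._items:
                    self._cond.wait()
                batch = list(self._items)
                self._items.clear()
                return batch


    q = TaskQueue()
    threading.Thread(target=lambda: [q.push(i) for i in range(3)]).start()
    print(q.pop_batch())  # e.g. [0, 1, 2] -- the consumer blocked until items arrived

A consumer that finds the queue empty simply blocks on the condition variable instead of decrementing a separate counter, so the observed count can never run ahead of the producer.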
A theoretical downside of this is that the calls to push items to the queue and the storage sync thread that drains the queue might now need to wait, if another thread is busy manipulating the queue. I believe that's OK; the lock isn't held for very long, and these operations are made in background threads, not in the hot GetPage@LSN path, so they're not very latency-sensitive. Fixes #1719. Also add a test case. --- pageserver/src/storage_sync.rs | 240 ++++++++---------- pageserver/src/storage_sync/delete.rs | 4 +- pageserver/src/storage_sync/download.rs | 4 +- pageserver/src/storage_sync/upload.rs | 4 +- .../test_tenants_with_remote_storage.py | 97 +++++++ 5 files changed, 208 insertions(+), 141 deletions(-) create mode 100644 test_runner/batch_others/test_tenants_with_remote_storage.py diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 7755e67c8d..39459fafc6 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -9,7 +9,7 @@ //! //! * public API via to interact with the external world: //! * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization -//! * [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] to enqueue a new upload and download tasks, +//! * [`schedule_layer_upload`], [`schedule_layer_download`], and[`schedule_layer_delete`] to enqueue a new task //! to be processed by the async loop //! //! Here's a schematic overview of all interactions backup and the rest of the pageserver perform: @@ -44,8 +44,8 @@ //! query their downloads later if they are accessed. //! //! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata. -//! If the storage sync loop was successfully started before, pageserver schedules the new checkpoint file uploads after every checkpoint. -//! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either). +//! If the storage sync loop was successfully started before, pageserver schedules the layer files and the updated metadata file for upload, every time a layer is flushed to disk. +//! The uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either). //! See [`crate::layered_repository`] for the upload calls and the adjacent logic. //! //! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`], @@ -54,7 +54,7 @@ //! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future //! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory //! -//! When the pageserver terminates, the sync loop finishes a current sync task (if any) and exits. +//! When the pageserver terminates, the sync loop finishes current sync task (if any) and exits. //! //! The storage logic considers `image` as a set of local files (layers), fully representing a certain timeline at given moment (identified with `disk_consistent_lsn` from the corresponding `metadata` file). //! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed @@ -66,13 +66,13 @@ //! when the newer image is downloaded //! //! 
Pageserver maintains similar to the local file structure remotely: all layer files are uploaded with the same names under the same directory structure. -//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexShard`], containing the list of remote files. +//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexPart`], containing the list of remote files. //! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download. //! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`], //! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrive its shard contents, if needed, same as any layer files. //! //! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed. -//! Bulk index data download happens only initially, on pageserer startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only, +//! Bulk index data download happens only initially, on pageserver startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only, //! when a new timeline is scheduled for the download. //! //! NOTES: @@ -89,13 +89,12 @@ //! Synchronization is done with the queue being emptied via separate thread asynchronously, //! attempting to fully store pageserver's local data on the remote storage in a custom format, beneficial for storing. //! -//! A queue is implemented in the [`sync_queue`] module as a pair of sender and receiver channels, to block on zero tasks instead of checking the queue. -//! The pair's shared buffer of a fixed size serves as an implicit queue, holding [`SyncTask`] for local files upload/download operations. +//! A queue is implemented in the [`sync_queue`] module as a VecDeque to hold the tasks, and a condition variable for blocking when the queue is empty. //! //! The queue gets emptied by a single thread with the loop, that polls the tasks in batches of deduplicated tasks. //! A task from the batch corresponds to a single timeline, with its files to sync merged together: given that only one task sync loop step is active at a time, //! timeline uploads and downloads can happen concurrently, in no particular order due to incremental nature of the timeline layers. -//! Deletion happens only after a successful upload only, otherwise the compation output might make the timeline inconsistent until both tasks are fully processed without errors. +//! Deletion happens only after a successful upload only, otherwise the compaction output might make the timeline inconsistent until both tasks are fully processed without errors. //! Upload and download update the remote data (inmemory index and S3 json index part file) only after every layer is successfully synchronized, while the deletion task //! does otherwise: it requires to have the remote data updated first succesfully: blob files will be invisible to pageserver this way. //! @@ -138,8 +137,6 @@ //! NOTE: No real contents or checksum check happens right now and is a subject to improve later. //! //! After the whole timeline is downloaded, [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function is used to update pageserver memory stage for the timeline processed. -//! -//! 
When pageserver signals shutdown, current sync task gets finished and the loop exists. mod delete; mod download; @@ -153,10 +150,7 @@ use std::{ num::{NonZeroU32, NonZeroUsize}, ops::ControlFlow, path::{Path, PathBuf}, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, - }, + sync::{Arc, Condvar, Mutex}, }; use anyhow::{anyhow, bail, Context}; @@ -167,7 +161,6 @@ use remote_storage::{GenericRemoteStorage, RemoteStorage}; use tokio::{ fs, runtime::Runtime, - sync::mpsc::{self, error::TryRecvError, UnboundedReceiver, UnboundedSender}, time::{Duration, Instant}, }; use tracing::*; @@ -453,97 +446,77 @@ fn collect_timeline_files( Ok((timeline_id, metadata, timeline_files)) } -/// Wraps mpsc channel bits around into a queue interface. -/// mpsc approach was picked to allow blocking the sync loop if no tasks are present, to avoid meaningless spinning. +/// Global queue of sync tasks. +/// +/// 'queue' is protected by a mutex, and 'condvar' is used to wait for tasks to arrive. struct SyncQueue { - len: AtomicUsize, max_timelines_per_batch: NonZeroUsize, - sender: UnboundedSender<(ZTenantTimelineId, SyncTask)>, + + queue: Mutex>, + condvar: Condvar, } impl SyncQueue { - fn new( - max_timelines_per_batch: NonZeroUsize, - ) -> (Self, UnboundedReceiver<(ZTenantTimelineId, SyncTask)>) { - let (sender, receiver) = mpsc::unbounded_channel(); - ( - Self { - len: AtomicUsize::new(0), - max_timelines_per_batch, - sender, - }, - receiver, - ) + fn new(max_timelines_per_batch: NonZeroUsize) -> Self { + Self { + max_timelines_per_batch, + queue: Mutex::new(VecDeque::new()), + condvar: Condvar::new(), + } } + /// Queue a new task fn push(&self, sync_id: ZTenantTimelineId, new_task: SyncTask) { - match self.sender.send((sync_id, new_task)) { - Ok(()) => { - self.len.fetch_add(1, Ordering::Relaxed); - } - Err(e) => { - error!("failed to push sync task to queue: {e}"); - } + let mut q = self.queue.lock().unwrap(); + + q.push_back((sync_id, new_task)); + if q.len() <= 1 { + self.condvar.notify_one(); } } /// Fetches a task batch, getting every existing entry from the queue, grouping by timelines and merging the tasks for every timeline. - /// A timeline has to care to not to delete cetain layers from the remote storage before the corresponding uploads happen. - /// Otherwise, due to "immutable" nature of the layers, the order of their deletion/uploading/downloading does not matter. + /// A timeline has to care to not to delete certain layers from the remote storage before the corresponding uploads happen. + /// Other than that, due to "immutable" nature of the layers, the order of their deletion/uploading/downloading does not matter. /// Hence, we merge the layers together into single task per timeline and run those concurrently (with the deletion happening only after successful uploading). - async fn next_task_batch( - &self, - // The queue is based on two ends of a channel and has to be accessible statically without blocking for submissions from the sync code. - // Its receiver needs &mut, so we cannot place it in the same container with the other end and get both static and non-blocking access. - // Hence toss this around to use it from the sync loop directly as &mut. 
- sync_queue_receiver: &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - ) -> HashMap { - // request the first task in blocking fashion to do less meaningless work - let (first_sync_id, first_task) = if let Some(first_task) = sync_queue_receiver.recv().await - { - self.len.fetch_sub(1, Ordering::Relaxed); - first_task - } else { - info!("Queue sender part was dropped, aborting"); - return HashMap::new(); - }; + fn next_task_batch(&self) -> (HashMap, usize) { + // Wait for the first task in blocking fashion + let mut q = self.queue.lock().unwrap(); + while q.is_empty() { + q = self + .condvar + .wait_timeout(q, Duration::from_millis(1000)) + .unwrap() + .0; + + if thread_mgr::is_shutdown_requested() { + return (HashMap::new(), q.len()); + } + } + let (first_sync_id, first_task) = q.pop_front().unwrap(); + let mut timelines_left_to_batch = self.max_timelines_per_batch.get() - 1; - let mut tasks_to_process = self.len(); + let tasks_to_process = q.len(); let mut batches = HashMap::with_capacity(tasks_to_process); batches.insert(first_sync_id, SyncTaskBatch::new(first_task)); let mut tasks_to_reenqueue = Vec::with_capacity(tasks_to_process); - // Pull the queue channel until we get all tasks that were there at the beginning of the batch construction. + // Greedily grab as many other tasks that we can. // Yet do not put all timelines in the batch, but only the first ones that fit the timeline limit. - // Still merge the rest of the pulled tasks and reenqueue those for later. - while tasks_to_process > 0 { - match sync_queue_receiver.try_recv() { - Ok((sync_id, new_task)) => { - self.len.fetch_sub(1, Ordering::Relaxed); - tasks_to_process -= 1; - - match batches.entry(sync_id) { - hash_map::Entry::Occupied(mut v) => v.get_mut().add(new_task), - hash_map::Entry::Vacant(v) => { - timelines_left_to_batch = timelines_left_to_batch.saturating_sub(1); - if timelines_left_to_batch == 0 { - tasks_to_reenqueue.push((sync_id, new_task)); - } else { - v.insert(SyncTaskBatch::new(new_task)); - } - } + // Re-enqueue the tasks that don't fit in this batch. + while let Some((sync_id, new_task)) = q.pop_front() { + match batches.entry(sync_id) { + hash_map::Entry::Occupied(mut v) => v.get_mut().add(new_task), + hash_map::Entry::Vacant(v) => { + timelines_left_to_batch = timelines_left_to_batch.saturating_sub(1); + if timelines_left_to_batch == 0 { + tasks_to_reenqueue.push((sync_id, new_task)); + } else { + v.insert(SyncTaskBatch::new(new_task)); } } - Err(TryRecvError::Disconnected) => { - debug!("Sender disconnected, batch collection aborted"); - break; - } - Err(TryRecvError::Empty) => { - debug!("No more data in the sync queue, task batch is not full"); - break; - } } } @@ -553,14 +526,15 @@ impl SyncQueue { tasks_to_reenqueue.len() ); for (id, task) in tasks_to_reenqueue { - self.push(id, task); + q.push_back((id, task)); } - batches + (batches, q.len()) } + #[cfg(test)] fn len(&self) -> usize { - self.len.load(Ordering::Relaxed) + self.queue.lock().unwrap().len() } } @@ -823,7 +797,7 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { debug!("Download task for tenant {tenant_id}, timeline {timeline_id} sent") } -/// Uses a remote storage given to start the storage sync loop. +/// Launch a thread to perform remote storage sync tasks. /// See module docs for loop step description. 
pub(super) fn spawn_storage_sync_thread( conf: &'static PageServerConf, @@ -836,7 +810,7 @@ where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { - let (sync_queue, sync_queue_receiver) = SyncQueue::new(max_concurrent_timelines_sync); + let sync_queue = SyncQueue::new(max_concurrent_timelines_sync); SYNC_QUEUE .set(sync_queue) .map_err(|_queue| anyhow!("Could not initialize sync queue"))?; @@ -864,7 +838,7 @@ where local_timeline_files, ); - let loop_index = remote_index.clone(); + let remote_index_clone = remote_index.clone(); thread_mgr::spawn( ThreadKind::StorageSync, None, @@ -875,12 +849,7 @@ where storage_sync_loop( runtime, conf, - ( - Arc::new(storage), - loop_index, - sync_queue, - sync_queue_receiver, - ), + (Arc::new(storage), remote_index_clone, sync_queue), max_sync_errors, ); Ok(()) @@ -896,12 +865,7 @@ where fn storage_sync_loop( runtime: Runtime, conf: &'static PageServerConf, - (storage, index, sync_queue, mut sync_queue_receiver): ( - Arc, - RemoteIndex, - &SyncQueue, - UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - ), + (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, ) where P: Debug + Send + Sync + 'static, @@ -909,16 +873,35 @@ fn storage_sync_loop( { info!("Starting remote storage sync loop"); loop { - let loop_index = index.clone(); let loop_storage = Arc::clone(&storage); + + let (batched_tasks, remaining_queue_length) = sync_queue.next_task_batch(); + + if thread_mgr::is_shutdown_requested() { + info!("Shutdown requested, stopping"); + break; + } + + REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); + if remaining_queue_length > 0 || !batched_tasks.is_empty() { + info!("Processing tasks for {} timelines in batch, more tasks left to process: {remaining_queue_length}", batched_tasks.len()); + } else { + debug!("No tasks to process"); + continue; + } + + // Concurrently perform all the tasks in the batch let loop_step = runtime.block_on(async { tokio::select! 
{ - step = loop_step( + step = process_batches( conf, - (loop_storage, loop_index, sync_queue, &mut sync_queue_receiver), max_sync_errors, + loop_storage, + &index, + batched_tasks, + sync_queue, ) - .instrument(info_span!("storage_sync_loop_step")) => step, + .instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step), _ = thread_mgr::shutdown_watcher() => ControlFlow::Break(()), } }); @@ -944,31 +927,18 @@ fn storage_sync_loop( } } -async fn loop_step( +async fn process_batches( conf: &'static PageServerConf, - (storage, index, sync_queue, sync_queue_receiver): ( - Arc, - RemoteIndex, - &SyncQueue, - &mut UnboundedReceiver<(ZTenantTimelineId, SyncTask)>, - ), max_sync_errors: NonZeroU32, -) -> ControlFlow<(), HashMap>> + storage: Arc, + index: &RemoteIndex, + batched_tasks: HashMap, + sync_queue: &SyncQueue, +) -> HashMap> where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { - let batched_tasks = sync_queue.next_task_batch(sync_queue_receiver).await; - - let remaining_queue_length = sync_queue.len(); - REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); - if remaining_queue_length > 0 || !batched_tasks.is_empty() { - info!("Processing tasks for {} timelines in batch, more tasks left to process: {remaining_queue_length}", batched_tasks.len()); - } else { - debug!("No tasks to process"); - return ControlFlow::Continue(HashMap::new()); - } - let mut sync_results = batched_tasks .into_iter() .map(|(sync_id, batch)| { @@ -993,6 +963,7 @@ where ZTenantId, HashMap, > = HashMap::new(); + while let Some((sync_id, state_update)) = sync_results.next().await { debug!("Finished storage sync task for sync id {sync_id}"); if let Some(state_update) = state_update { @@ -1003,7 +974,7 @@ where } } - ControlFlow::Continue(new_timeline_states) + new_timeline_states } async fn process_sync_task_batch( @@ -1376,7 +1347,6 @@ where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { - info!("Updating remote index for the timeline"); let updated_remote_timeline = { let mut index_accessor = index.write().await; @@ -1443,7 +1413,7 @@ where IndexPart::from_remote_timeline(&timeline_path, updated_remote_timeline) .context("Failed to create an index part from the updated remote timeline")?; - info!("Uploading remote data for the timeline"); + info!("Uploading remote index for the timeline"); upload_index_part(conf, storage, sync_id, new_index_part) .await .context("Failed to upload new index part") @@ -1685,7 +1655,7 @@ mod tests { #[tokio::test] async fn separate_task_ids_batch() { - let (sync_queue, mut sync_queue_receiver) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); assert_eq!(sync_queue.len(), 0); let sync_id_2 = ZTenantTimelineId { @@ -1720,7 +1690,7 @@ mod tests { let submitted_tasks_count = sync_queue.len(); assert_eq!(submitted_tasks_count, 3); - let mut batch = sync_queue.next_task_batch(&mut sync_queue_receiver).await; + let (mut batch, _) = sync_queue.next_task_batch(); assert_eq!( batch.len(), submitted_tasks_count, @@ -1746,7 +1716,7 @@ mod tests { #[tokio::test] async fn same_task_id_separate_tasks_batch() { - let (sync_queue, mut sync_queue_receiver) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); assert_eq!(sync_queue.len(), 0); let download = LayersDownload { @@ -1769,7 +1739,7 @@ mod tests { let submitted_tasks_count = sync_queue.len(); assert_eq!(submitted_tasks_count, 3); 
- let mut batch = sync_queue.next_task_batch(&mut sync_queue_receiver).await; + let (mut batch, _) = sync_queue.next_task_batch(); assert_eq!( batch.len(), 1, @@ -1801,7 +1771,7 @@ mod tests { #[tokio::test] async fn same_task_id_same_tasks_batch() { - let (sync_queue, mut sync_queue_receiver) = SyncQueue::new(NonZeroUsize::new(1).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(1).unwrap()); let download_1 = LayersDownload { layers_to_skip: HashSet::from([PathBuf::from("sk1")]), }; @@ -1823,11 +1793,11 @@ mod tests { sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_1.clone())); sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_2.clone())); - sync_queue.push(sync_id_2, SyncTask::download(download_3.clone())); + sync_queue.push(sync_id_2, SyncTask::download(download_3)); sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_4.clone())); assert_eq!(sync_queue.len(), 4); - let mut smallest_batch = sync_queue.next_task_batch(&mut sync_queue_receiver).await; + let (mut smallest_batch, _) = sync_queue.next_task_batch(); assert_eq!( smallest_batch.len(), 1, diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 047ad6c2be..91c618d201 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -119,7 +119,7 @@ mod tests { #[tokio::test] async fn delete_timeline_negative() -> anyhow::Result<()> { let harness = RepoHarness::create("delete_timeline_negative")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = LocalFs::new( tempdir()?.path().to_path_buf(), @@ -152,7 +152,7 @@ mod tests { #[tokio::test] async fn delete_timeline() -> anyhow::Result<()> { let harness = RepoHarness::create("delete_timeline")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "c", "d"]; diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 98a0a0e2fc..a28867f27e 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -286,7 +286,7 @@ mod tests { #[tokio::test] async fn download_timeline() -> anyhow::Result<()> { let harness = RepoHarness::create("download_timeline")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"]; @@ -385,7 +385,7 @@ mod tests { #[tokio::test] async fn download_timeline_negatives() -> anyhow::Result<()> { let harness = RepoHarness::create("download_timeline_negatives")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index f9d606f2b8..625ec7aed6 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -240,7 +240,7 @@ mod tests { 
#[tokio::test] async fn regular_layer_upload() -> anyhow::Result<()> { let harness = RepoHarness::create("regular_layer_upload")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b"]; @@ -327,7 +327,7 @@ mod tests { #[tokio::test] async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> { let harness = RepoHarness::create("layer_upload_after_local_fs_update")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a1", "b1"]; diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/batch_others/test_tenants_with_remote_storage.py new file mode 100644 index 0000000000..c00f077fcd --- /dev/null +++ b/test_runner/batch_others/test_tenants_with_remote_storage.py @@ -0,0 +1,97 @@ +# +# Little stress test for the checkpointing and remote storage code. +# +# The test creates several tenants, and runs a simple workload on +# each tenant, in parallel. The test uses remote storage, and a tiny +# checkpoint_distance setting so that a lot of layer files are created. +# + +import asyncio +from contextlib import closing +from uuid import UUID + +import pytest + +from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithEnv, Postgres, wait_for_last_record_lsn, wait_for_upload +from fixtures.utils import lsn_from_hex + + +async def tenant_workload(env: ZenithEnv, pg: Postgres): + pageserver_conn = await env.pageserver.connect_async() + + pg_conn = await pg.connect_async() + + tenant_id = await pg_conn.fetchval("show zenith.zenith_tenant") + timeline_id = await pg_conn.fetchval("show zenith.zenith_timeline") + + await pg_conn.execute("CREATE TABLE t(key int primary key, value text)") + for i in range(1, 100): + await pg_conn.execute( + f"INSERT INTO t SELECT {i}*1000 + g, 'payload' from generate_series(1,1000) g") + + # we rely upon autocommit after each statement + # as waiting for acceptors happens there + res = await pg_conn.fetchval("SELECT count(*) FROM t") + assert res == i * 1000 + + +async def all_tenants_workload(env: ZenithEnv, tenants_pgs): + workers = [] + for tenant, pg in tenants_pgs: + worker = tenant_workload(env, pg) + workers.append(asyncio.create_task(worker)) + + # await all workers + await asyncio.gather(*workers) + + +@pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3']) +def test_tenants_many(zenith_env_builder: ZenithEnvBuilder, storage_type: str): + + if storage_type == 'local_fs': + zenith_env_builder.enable_local_fs_remote_storage() + elif storage_type == 'mock_s3': + zenith_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore') + else: + raise RuntimeError(f'Unknown storage type: {storage_type}') + + zenith_env_builder.enable_local_fs_remote_storage() + + env = zenith_env_builder.init_start() + + tenants_pgs = [] + + for i in range(1, 5): + # Use a tiny checkpoint distance, to create a lot of layers quickly + tenant, _ = env.zenith_cli.create_tenant( + conf={ + 'checkpoint_distance': '5000000', + }) + env.zenith_cli.create_timeline(f'test_tenants_many', tenant_id=tenant) + + pg = env.postgres.create_start( + f'test_tenants_many', + tenant_id=tenant, + ) + tenants_pgs.append((tenant, pg)) + + asyncio.run(all_tenants_workload(env, 
tenants_pgs)) + + # Wait for the remote storage uploads to finish + pageserver_http = env.pageserver.http_client() + for tenant, pg in tenants_pgs: + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("show zenith.zenith_tenant") + tenant_id = cur.fetchone()[0] + cur.execute("show zenith.zenith_timeline") + timeline_id = cur.fetchone()[0] + cur.execute("SELECT pg_current_wal_flush_lsn()") + current_lsn = lsn_from_hex(cur.fetchone()[0]) + + # wait until pageserver receives all the data + wait_for_last_record_lsn(pageserver_http, UUID(tenant_id), UUID(timeline_id), current_lsn) + + # run final checkpoint manually to flush all the data to remote storage + env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + wait_for_upload(pageserver_http, UUID(tenant_id), UUID(timeline_id), current_lsn) From 134eeeb096de28c44c8fc7de1d771ed5350598c2 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 17 May 2022 19:29:01 +0300 Subject: [PATCH 0307/1022] Add more common storage metrics (#1722) - Enabled process exporter for storage services - Changed zenith_proxy prefix to just proxy - Removed old `monitoring` directory - Removed common prefix for metrics, now our common metrics have `libmetrics_` prefix, for example `libmetrics_serve_metrics_count` - Added `test_metrics_normal_work` --- .circleci/config.yml | 2 +- Cargo.lock | 39 ++++++++++- libs/metrics/Cargo.toml | 2 +- libs/metrics/src/lib.rs | 38 +---------- libs/utils/src/http/endpoint.rs | 4 +- monitoring/docker-compose.yml | 25 ------- monitoring/grafana.yaml | 12 ---- monitoring/prometheus.yaml | 5 -- pageserver/src/bin/pageserver.rs | 1 - poetry.lock | 30 +++++++-- proxy/src/main.rs | 1 - proxy/src/proxy.rs | 8 +-- pyproject.toml | 1 + safekeeper/src/bin/safekeeper.rs | 1 - test_runner/batch_others/test_tenants.py | 82 ++++++++++++++++++++++- test_runner/fixtures/benchmark_fixture.py | 4 +- test_runner/fixtures/metrics.py | 38 +++++++++++ test_runner/fixtures/zenith_fixtures.py | 7 +- 18 files changed, 198 insertions(+), 102 deletions(-) delete mode 100644 monitoring/docker-compose.yml delete mode 100644 monitoring/grafana.yaml delete mode 100644 monitoring/prometheus.yaml create mode 100644 test_runner/fixtures/metrics.py diff --git a/.circleci/config.yml b/.circleci/config.yml index fdd3e0cce7..1eddb9f220 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -355,7 +355,7 @@ jobs: when: always command: | du -sh /tmp/test_output/* - find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "etcd.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" -delete + find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "etcd.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! 
-name "*.metrics" -delete du -sh /tmp/test_output/* - store_artifacts: path: /tmp/test_output diff --git a/Cargo.lock b/Cargo.lock index a3974f6776..6a320ee274 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -166,7 +166,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.4.4", "object", "rustc-demangle", ] @@ -868,6 +868,18 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "279fb028e20b3c4c320317955b77c5e0c9701f05a1d309905d6fc702cdc5053e" +[[package]] +name = "flate2" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39522e96686d38f4bc984b9198e3a0613264abaebaff2c5c918bfa6b6da09af" +dependencies = [ + "cfg-if", + "crc32fast", + "libc", + "miniz_oxide 0.5.1", +] + [[package]] name = "fnv" version = "1.0.7" @@ -1527,6 +1539,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "miniz_oxide" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2b29bd4bc3f33391105ebee3589c19197c4271e3e5a9ec9bfe8127eeff8f082" +dependencies = [ + "adler", +] + [[package]] name = "mio" version = "0.8.2" @@ -2088,6 +2109,20 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "procfs" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95e344cafeaeefe487300c361654bcfc85db3ac53619eeccced29f5ea18c4c70" +dependencies = [ + "bitflags", + "byteorder", + "flate2", + "hex", + "lazy_static", + "libc", +] + [[package]] name = "prometheus" version = "0.13.0" @@ -2097,8 +2132,10 @@ dependencies = [ "cfg-if", "fnv", "lazy_static", + "libc", "memchr", "parking_lot 0.11.2", + "procfs", "thiserror", ] diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index 3b6ff4691d..8ff5d1d421 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -prometheus = {version = "0.13", default_features=false} # removes protobuf dependency +prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency libc = "0.2" lazy_static = "1.4" once_cell = "1.8.0" diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 8756a078c3..b3c1a6bd55 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -3,7 +3,6 @@ //! Otherwise, we might not see all metrics registered via //! a default registry. use lazy_static::lazy_static; -use once_cell::race::OnceBox; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_gauge, Gauge}; pub use prometheus::{register_gauge_vec, GaugeVec}; @@ -27,48 +26,15 @@ pub fn gather() -> Vec { prometheus::gather() } -static COMMON_METRICS_PREFIX: OnceBox<&str> = OnceBox::new(); - -/// Sets a prefix which will be used for all common metrics, typically a service -/// name like 'pageserver'. Should be executed exactly once in the beginning of -/// any executable which uses common metrics. -pub fn set_common_metrics_prefix(prefix: &'static str) { - // Not unwrap() because metrics may be initialized after multiple threads have been started. - COMMON_METRICS_PREFIX - .set(prefix.into()) - .unwrap_or_else(|_| { - eprintln!( - "set_common_metrics_prefix() was called second time with '{}', exiting", - prefix - ); - std::process::exit(1); - }); -} - -/// Prepends a prefix to a common metric name so they are distinguished between -/// different services, see -/// A call to set_common_metrics_prefix() is necessary prior to calling this. 
-pub fn new_common_metric_name(unprefixed_metric_name: &str) -> String { - // Not unwrap() because metrics may be initialized after multiple threads have been started. - format!( - "{}_{}", - COMMON_METRICS_PREFIX.get().unwrap_or_else(|| { - eprintln!("set_common_metrics_prefix() was not called, but metrics are used, exiting"); - std::process::exit(1); - }), - unprefixed_metric_name - ) -} - lazy_static! { static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!( - new_common_metric_name("disk_io_bytes"), + "libmetrics_disk_io_bytes", "Bytes written and read from disk, grouped by the operation (read|write)", &["io_operation"] ) .expect("Failed to register disk i/o bytes int gauge vec"); static ref MAXRSS_KB: IntGauge = register_int_gauge!( - new_common_metric_name("maxrss_kb"), + "libmetrics_maxrss_kb", "Memory usage (Maximum Resident Set Size)" ) .expect("Failed to register maxrss_kb int gauge"); diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 77acab496f..912404bd7d 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -5,7 +5,7 @@ use anyhow::anyhow; use hyper::header::AUTHORIZATION; use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server}; use lazy_static::lazy_static; -use metrics::{new_common_metric_name, register_int_counter, Encoder, IntCounter, TextEncoder}; +use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use routerify::ext::RequestExt; use routerify::RequestInfo; use routerify::{Middleware, Router, RouterBuilder, RouterService}; @@ -18,7 +18,7 @@ use super::error::ApiError; lazy_static! { static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!( - new_common_metric_name("serve_metrics_count"), + "libmetrics_serve_metrics_count", "Number of metric requests made" ) .expect("failed to define a metric"); diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml deleted file mode 100644 index a3fda0b246..0000000000 --- a/monitoring/docker-compose.yml +++ /dev/null @@ -1,25 +0,0 @@ -version: "3" -services: - - prometheus: - container_name: prometheus - image: prom/prometheus:latest - volumes: - - ./prometheus.yaml:/etc/prometheus/prometheus.yml - # ports: - # - "9090:9090" - # TODO: find a proper portable solution - network_mode: "host" - - grafana: - image: grafana/grafana:latest - volumes: - - ./grafana.yaml:/etc/grafana/provisioning/datasources/datasources.yaml - environment: - - GF_AUTH_ANONYMOUS_ENABLED=true - - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - - GF_AUTH_DISABLE_LOGIN_FORM=true - # ports: - # - "3000:3000" - # TODO: find a proper portable solution - network_mode: "host" diff --git a/monitoring/grafana.yaml b/monitoring/grafana.yaml deleted file mode 100644 index eac8879e6c..0000000000 --- a/monitoring/grafana.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: 1 - -datasources: -- name: Prometheus - type: prometheus - access: proxy - orgId: 1 - url: http://localhost:9090 - basicAuth: false - isDefault: false - version: 1 - editable: false diff --git a/monitoring/prometheus.yaml b/monitoring/prometheus.yaml deleted file mode 100644 index ba55d53737..0000000000 --- a/monitoring/prometheus.yaml +++ /dev/null @@ -1,5 +0,0 @@ -scrape_configs: - - job_name: 'default' - scrape_interval: 10s - static_configs: - - targets: ['localhost:9898'] diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 4cc1dcbc5a..00864056cb 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -38,7 +38,6 @@ fn 
version() -> String { } fn main() -> anyhow::Result<()> { - metrics::set_common_metrics_prefix("pageserver"); let arg_matches = App::new("Zenith page server") .about("Materializes WAL stream to pages and serves them to the postgres") .version(&*version()) diff --git a/poetry.lock b/poetry.lock index a7cbe0aa3c..aa1e91c606 100644 --- a/poetry.lock +++ b/poetry.lock @@ -822,7 +822,7 @@ python-versions = "*" [[package]] name = "moto" -version = "3.1.7" +version = "3.1.9" description = "A library that allows your python tests to easily mock out the boto library" category = "main" optional = false @@ -868,6 +868,7 @@ ds = ["sshpubkeys (>=3.1.0)"] dynamodb = ["docker (>=2.5.1)"] dynamodb2 = ["docker (>=2.5.1)"] dynamodbstreams = ["docker (>=2.5.1)"] +ebs = ["sshpubkeys (>=3.1.0)"] ec2 = ["sshpubkeys (>=3.1.0)"] efs = ["sshpubkeys (>=3.1.0)"] glue = ["pyparsing (>=3.0.0)"] @@ -953,6 +954,17 @@ importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "prometheus-client" +version = "0.14.1" +description = "Python client for the Prometheus monitoring system." +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +twisted = ["twisted"] + [[package]] name = "psycopg2-binary" version = "2.9.3" @@ -1003,7 +1015,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [[package]] name = "pyjwt" -version = "2.3.0" +version = "2.4.0" description = "JSON Web Token implementation in Python" category = "main" optional = false @@ -1375,7 +1387,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" python-versions = "^3.7" -content-hash = "dc63b6e02d0ceccdc4b5616e9362c149a27fdcc6c54fda63a3b115a5b980c42e" +content-hash = "d2fcba2af0a32cde3a1d0c8cfdfe5fb26531599b0c8c376bf16e200a74b55553" [metadata.files] aiopg = [ @@ -1693,8 +1705,8 @@ mccabe = [ {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, ] moto = [ - {file = "moto-3.1.7-py3-none-any.whl", hash = "sha256:4ab6fb8dd150343e115d75e3dbdb5a8f850fc7236790819d7cef438c11ee6e89"}, - {file = "moto-3.1.7.tar.gz", hash = "sha256:20607a0fd0cf6530e05ffb623ca84d3f45d50bddbcec2a33705a0cf471e71289"}, + {file = "moto-3.1.9-py3-none-any.whl", hash = "sha256:8928ec168e5fd88b1127413b2fa570a80d45f25182cdad793edd208d07825269"}, + {file = "moto-3.1.9.tar.gz", hash = "sha256:ba683e70950b6579189bc12d74c1477aa036c090c6ad8b151a22f5896c005113"}, ] mypy = [ {file = "mypy-0.910-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457"}, @@ -1741,6 +1753,10 @@ pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, ] +prometheus-client = [ + {file = "prometheus_client-0.14.1-py3-none-any.whl", hash = "sha256:522fded625282822a89e2773452f42df14b5a8e84a86433e3f8a189c1d54dc01"}, + {file = "prometheus_client-0.14.1.tar.gz", hash = "sha256:5459c427624961076277fdc6dc50540e2bacb98eebde99886e59ec55ed92093a"}, +] psycopg2-binary = [ {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, {file = 
"psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, @@ -1831,8 +1847,8 @@ pyflakes = [ {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, ] pyjwt = [ - {file = "PyJWT-2.3.0-py3-none-any.whl", hash = "sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"}, - {file = "PyJWT-2.3.0.tar.gz", hash = "sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41"}, + {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, + {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, ] pyparsing = [ {file = "pyparsing-3.0.6-py3-none-any.whl", hash = "sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4"}, diff --git a/proxy/src/main.rs b/proxy/src/main.rs index f46e19e5d6..b457d46824 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -38,7 +38,6 @@ async fn flatten_err( #[tokio::main] async fn main() -> anyhow::Result<()> { - metrics::set_common_metrics_prefix("zenith_proxy"); let arg_matches = App::new("Neon proxy/router") .version(GIT_VERSION) .arg( diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 821ce377f5..f10b273bfd 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -5,7 +5,7 @@ use crate::stream::{MetricsStream, PqStream, Stream}; use anyhow::{bail, Context}; use futures::TryFutureExt; use lazy_static::lazy_static; -use metrics::{new_common_metric_name, register_int_counter, IntCounter}; +use metrics::{register_int_counter, IntCounter}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::{BeMessage as Be, *}; @@ -15,17 +15,17 @@ const ERR_PROTO_VIOLATION: &str = "protocol violation"; lazy_static! { static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!( - new_common_metric_name("num_connections_accepted"), + "proxy_accepted_connections", "Number of TCP client connections accepted." ) .unwrap(); static ref NUM_CONNECTIONS_CLOSED_COUNTER: IntCounter = register_int_counter!( - new_common_metric_name("num_connections_closed"), + "proxy_closed_connections", "Number of TCP client connections closed." ) .unwrap(); static ref NUM_BYTES_PROXIED_COUNTER: IntCounter = register_int_counter!( - new_common_metric_name("num_bytes_proxied"), + "proxy_io_bytes", "Number of bytes sent/received between any client and backend." 
) .unwrap(); diff --git a/pyproject.toml b/pyproject.toml index 335c6d61d8..b70eb19009 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ boto3-stubs = "^1.20.40" moto = {version = "^3.0.0", extras = ["server"]} backoff = "^1.11.1" pytest-lazy-fixture = "^0.6.3" +prometheus-client = "^0.14.1" [tool.poetry.dev-dependencies] yapf = "==0.31.0" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 2d47710a88..61d2f558f2 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -32,7 +32,6 @@ const ID_FILE_NAME: &str = "safekeeper.id"; project_git_version!(GIT_VERSION); fn main() -> anyhow::Result<()> { - metrics::set_common_metrics_prefix("safekeeper"); let arg_matches = App::new("Zenith safekeeper") .about("Store WAL stream to local file system and push it to WAL receivers") .version(GIT_VERSION) diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py index 1b593cfee3..9ccb8cf196 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/batch_others/test_tenants.py @@ -1,8 +1,12 @@ from contextlib import closing - +from datetime import datetime +import os import pytest from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.log_helper import log +from fixtures.metrics import parse_metrics +from fixtures.utils import lsn_to_hex @pytest.mark.parametrize('with_safekeepers', [False, True]) @@ -38,3 +42,79 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_safekeep cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") cur.execute("SELECT sum(key) FROM t") assert cur.fetchone() == (5000050000, ) + + +def test_metrics_normal_work(zenith_env_builder: ZenithEnvBuilder): + zenith_env_builder.num_safekeepers = 3 + + env = zenith_env_builder.init_start() + tenant_1, _ = env.zenith_cli.create_tenant() + tenant_2, _ = env.zenith_cli.create_tenant() + + timeline_1 = env.zenith_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_1) + timeline_2 = env.zenith_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_2) + + pg_tenant1 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_1) + pg_tenant2 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_2) + + for pg in [pg_tenant1, pg_tenant2]: + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (5000050000, ) + + collected_metrics = { + "pageserver": env.pageserver.http_client().get_metrics(), + } + for sk in env.safekeepers: + collected_metrics[f'safekeeper{sk.id}'] = sk.http_client().get_metrics_str() + + for name in collected_metrics: + basepath = os.path.join(zenith_env_builder.repo_dir, f'{name}.metrics') + + with open(basepath, 'w') as stdout_f: + print(collected_metrics[name], file=stdout_f, flush=True) + + all_metrics = [parse_metrics(m, name) for name, m in collected_metrics.items()] + ps_metrics = all_metrics[0] + sk_metrics = all_metrics[1:] + + ttids = [{ + 'tenant_id': tenant_1.hex, 'timeline_id': timeline_1.hex + }, { + 'tenant_id': tenant_2.hex, 'timeline_id': timeline_2.hex + }] + + # Test metrics per timeline + for tt in ttids: + log.info(f"Checking metrics for {tt}") + + ps_lsn = int(ps_metrics.query_one("pageserver_last_record_lsn", filter=tt).value) + sk_lsns = 
[int(sk.query_one("safekeeper_commit_lsn", filter=tt).value) for sk in sk_metrics] + + log.info(f"ps_lsn: {lsn_to_hex(ps_lsn)}") + log.info(f"sk_lsns: {list(map(lsn_to_hex, sk_lsns))}") + + assert ps_lsn <= max(sk_lsns) + assert ps_lsn > 0 + + # Test common metrics + for metrics in all_metrics: + log.info(f"Checking common metrics for {metrics.name}") + + log.info( + f"process_cpu_seconds_total: {metrics.query_one('process_cpu_seconds_total').value}") + log.info(f"process_threads: {int(metrics.query_one('process_threads').value)}") + log.info( + f"process_resident_memory_bytes (MB): {metrics.query_one('process_resident_memory_bytes').value / 1024 / 1024}" + ) + log.info( + f"process_virtual_memory_bytes (MB): {metrics.query_one('process_virtual_memory_bytes').value / 1024 / 1024}" + ) + log.info(f"process_open_fds: {int(metrics.query_one('process_open_fds').value)}") + log.info(f"process_max_fds: {int(metrics.query_one('process_max_fds').value)}") + log.info( + f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}" + ) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 0735f16d73..e296e85cc7 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -236,14 +236,14 @@ class ZenithBenchmarker: """ Fetch the "cumulative # of bytes written" metric from the pageserver """ - metric_name = r'pageserver_disk_io_bytes{io_operation="write"}' + metric_name = r'libmetrics_disk_io_bytes{io_operation="write"}' return self.get_int_counter_value(pageserver, metric_name) def get_peak_mem(self, pageserver) -> int: """ Fetch the "maxrss" metric from the pageserver """ - metric_name = r'pageserver_maxrss_kb' + metric_name = r'libmetrics_maxrss_kb' return self.get_int_counter_value(pageserver, metric_name) def get_int_counter_value(self, pageserver, metric_name) -> int: diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py new file mode 100644 index 0000000000..6fc62c6ea9 --- /dev/null +++ b/test_runner/fixtures/metrics.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass +from prometheus_client.parser import text_string_to_metric_families +from prometheus_client.samples import Sample +from typing import Dict, List +from collections import defaultdict + +from fixtures.log_helper import log + + +class Metrics: + metrics: Dict[str, List[Sample]] + name: str + + def __init__(self, name: str = ""): + self.metrics = defaultdict(list) + self.name = name + + def query_all(self, name: str, filter: Dict[str, str]) -> List[Sample]: + res = [] + for sample in self.metrics[name]: + if all(sample.labels[k] == v for k, v in filter.items()): + res.append(sample) + return res + + def query_one(self, name: str, filter: Dict[str, str] = {}) -> Sample: + res = self.query_all(name, filter) + assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}" + return res[0] + + +def parse_metrics(text: str, name: str = ""): + metrics = Metrics(name) + gen = text_string_to_metric_families(text) + for family in gen: + for sample in family.samples: + metrics.metrics[sample.name].append(sample) + + return metrics diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 203e73037f..17d932c968 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1833,10 +1833,13 @@ class SafekeeperHttpClient(requests.Session): assert isinstance(res_json, dict) 
         return res_json
 
-    def get_metrics(self) -> SafekeeperMetrics:
+    def get_metrics_str(self) -> str:
         request_result = self.get(f"http://localhost:{self.port}/metrics")
         request_result.raise_for_status()
-        all_metrics_text = request_result.text
+        return request_result.text
+
+    def get_metrics(self) -> SafekeeperMetrics:
+        all_metrics_text = self.get_metrics_str()
 
         metrics = SafekeeperMetrics()
         for match in re.finditer(

From b9f84f4a83ed916919884b4f9f038356e76f113f Mon Sep 17 00:00:00 2001
From: Andrey Taranik
Date: Tue, 17 May 2022 23:04:04 +0300
Subject: [PATCH 0308/1022] Turn on storage deployment to the neon-stress
 environment (#1729)

---
 .circleci/config.yml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 1eddb9f220..85ac905f0b 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -820,6 +820,25 @@ workflows:
           requires:
             - docker-image
 
+      - deploy-neon-stress:
+          # Context gives an ability to login
+          context: Docker Hub
+          # deploy only for commits to main
+          filters:
+            branches:
+              only:
+                - main
+          requires:
+            - docker-image
+      - deploy-neon-stress-proxy:
+          # deploy only for commits to main
+          filters:
+            branches:
+              only:
+                - main
+          requires:
+            - docker-image
+
       - docker-image-release:
           # Context gives an ability to login
           context: Docker Hub

From 772c2fb4ff3e58d328f22a955190dc08545efbdf Mon Sep 17 00:00:00 2001
From: Alexey Kondratov
Date: Mon, 9 May 2022 19:45:28 +0300
Subject: [PATCH 0309/1022] Report startup metrics and failure reason from
 compute_ctl (#1581)

+ neondatabase/cloud#1103

This adds a couple of control endpoints to simplify compute state discovery
for the control plane. For example, the control plane can now find out within
seconds that Postgres was unable to start or that basebackup failed, instead
of blindly polling compute readiness for a minute or two.

We also now expose startup metrics (the time of each step: basebackup, sync
safekeepers, config, total). The console grabs them after each successful
start and reports them as histograms to Prometheus and Grafana.

An OpenAPI spec is added and is up to date, but it is not currently used in
the console yet.
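As an illustration (not part of this patch), a control plane could consume the
new endpoints roughly as sketched below. The `/status` and `/metrics.json`
paths, the `running`/`failed` status values, the `error` field, and the default
port 3080 come from the patch itself; the polling loop, the timeout values, and
the `requests` dependency are assumptions made only for this sketch.

```python
import time

import requests

COMPUTE_URL = "http://localhost:3080"  # compute_ctl's HTTP server binds 0.0.0.0:3080


def wait_for_compute(timeout_s: float = 60.0, poll_s: float = 0.5) -> dict:
    """Poll /status until the compute is 'running', then fetch its startup metrics."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        state = requests.get(f"{COMPUTE_URL}/status").json()
        if state["status"] == "running":
            # Startup timings exposed by the patch: sync_safekeepers_ms,
            # basebackup_ms, config_ms, total_startup_ms.
            return requests.get(f"{COMPUTE_URL}/metrics.json").json()
        if state["status"] == "failed":
            # The 'error' field carries the reason the compute could not start.
            raise RuntimeError(f"compute failed to start: {state.get('error')}")
        time.sleep(poll_s)
    raise TimeoutError("compute did not become ready in time")
```

Unlike polling the deprecated `/ready` endpoint, this surfaces the failure
reason immediately instead of waiting out a compute that will never become
ready.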
--- Dockerfile.compute-tools | 2 +- compute_tools/README.md | 18 +- compute_tools/src/bin/compute_ctl.rs | 174 ++++++++++ compute_tools/src/bin/zenith_ctl.rs | 252 -------------- compute_tools/src/checker.rs | 10 +- compute_tools/src/compute.rs | 315 ++++++++++++++++++ compute_tools/src/config.rs | 12 +- .../src/{http_api.rs => http/api.rs} | 47 ++- compute_tools/src/http/mod.rs | 1 + compute_tools/src/http/openapi_spec.yaml | 158 +++++++++ compute_tools/src/lib.rs | 4 +- compute_tools/src/monitor.rs | 16 +- compute_tools/src/pg_helpers.rs | 27 +- compute_tools/src/spec.rs | 47 ++- compute_tools/src/zenith.rs | 109 ------ compute_tools/tests/pg_helpers_tests.rs | 6 +- docs/docker.md | 16 +- vendor/postgres | 2 +- 18 files changed, 787 insertions(+), 429 deletions(-) create mode 100644 compute_tools/src/bin/compute_ctl.rs delete mode 100644 compute_tools/src/bin/zenith_ctl.rs create mode 100644 compute_tools/src/compute.rs rename compute_tools/src/{http_api.rs => http/api.rs} (56%) create mode 100644 compute_tools/src/http/mod.rs create mode 100644 compute_tools/src/http/openapi_spec.yaml delete mode 100644 compute_tools/src/zenith.rs diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index bbe0f517ce..f0c9b9d56a 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -15,4 +15,4 @@ RUN set -e \ # Final image that only has one binary FROM debian:buster-slim -COPY --from=rust-build /home/circleci/project/target/release/zenith_ctl /usr/local/bin/zenith_ctl +COPY --from=rust-build /home/circleci/project/target/release/compute_ctl /usr/local/bin/compute_ctl diff --git a/compute_tools/README.md b/compute_tools/README.md index ccae3d2842..15876ed246 100644 --- a/compute_tools/README.md +++ b/compute_tools/README.md @@ -1,9 +1,9 @@ # Compute node tools -Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd` -`ExecStart` option. It will handle all the `zenith` specifics during compute node +Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd` +`ExecStart` option. It will handle all the `Neon` specifics during compute node initialization: -- `zenith_ctl` accepts cluster (compute node) specification as a JSON file. +- `compute_ctl` accepts cluster (compute node) specification as a JSON file. - Every start is a fresh start, so the data directory is removed and initialized again on each run. - Next it will put configuration files into the `PGDATA` directory. @@ -13,18 +13,18 @@ initialization: - Check and alter/drop/create roles and databases. - Hang waiting on the `postmaster` process to exit. -Also `zenith_ctl` spawns two separate service threads: +Also `compute_ctl` spawns two separate service threads: - `compute-monitor` checks the last Postgres activity timestamp and saves it - into the shared `ComputeState`; + into the shared `ComputeNode`; - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the last activity requests. 
Usage example: ```sh -zenith_ctl -D /var/db/postgres/compute \ - -C 'postgresql://zenith_admin@localhost/postgres' \ - -S /var/db/postgres/specs/current.json \ - -b /usr/local/bin/postgres +compute_ctl -D /var/db/postgres/compute \ + -C 'postgresql://zenith_admin@localhost/postgres' \ + -S /var/db/postgres/specs/current.json \ + -b /usr/local/bin/postgres ``` ## Tests diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs new file mode 100644 index 0000000000..5c951b7779 --- /dev/null +++ b/compute_tools/src/bin/compute_ctl.rs @@ -0,0 +1,174 @@ +//! +//! Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd` +//! `ExecStart` option. It will handle all the `Neon` specifics during compute node +//! initialization: +//! - `compute_ctl` accepts cluster (compute node) specification as a JSON file. +//! - Every start is a fresh start, so the data directory is removed and +//! initialized again on each run. +//! - Next it will put configuration files into the `PGDATA` directory. +//! - Sync safekeepers and get commit LSN. +//! - Get `basebackup` from pageserver using the returned on the previous step LSN. +//! - Try to start `postgres` and wait until it is ready to accept connections. +//! - Check and alter/drop/create roles and databases. +//! - Hang waiting on the `postmaster` process to exit. +//! +//! Also `compute_ctl` spawns two separate service threads: +//! - `compute-monitor` checks the last Postgres activity timestamp and saves it +//! into the shared `ComputeNode`; +//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the +//! last activity requests. +//! +//! Usage example: +//! ```sh +//! compute_ctl -D /var/db/postgres/compute \ +//! -C 'postgresql://zenith_admin@localhost/postgres' \ +//! -S /var/db/postgres/specs/current.json \ +//! -b /usr/local/bin/postgres +//! ``` +//! 
+use std::fs::File; +use std::panic; +use std::path::Path; +use std::process::exit; +use std::sync::{Arc, RwLock}; +use std::{thread, time::Duration}; + +use anyhow::Result; +use chrono::Utc; +use clap::Arg; +use log::{error, info}; + +use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; +use compute_tools::http::api::launch_http_server; +use compute_tools::logger::*; +use compute_tools::monitor::launch_monitor; +use compute_tools::params::*; +use compute_tools::pg_helpers::*; +use compute_tools::spec::*; + +fn main() -> Result<()> { + // TODO: re-use `utils::logging` later + init_logger(DEFAULT_LOG_LEVEL)?; + + // Env variable is set by `cargo` + let version: Option<&str> = option_env!("CARGO_PKG_VERSION"); + let matches = clap::App::new("compute_ctl") + .version(version.unwrap_or("unknown")) + .arg( + Arg::new("connstr") + .short('C') + .long("connstr") + .value_name("DATABASE_URL") + .required(true), + ) + .arg( + Arg::new("pgdata") + .short('D') + .long("pgdata") + .value_name("DATADIR") + .required(true), + ) + .arg( + Arg::new("pgbin") + .short('b') + .long("pgbin") + .value_name("POSTGRES_PATH"), + ) + .arg( + Arg::new("spec") + .short('s') + .long("spec") + .value_name("SPEC_JSON"), + ) + .arg( + Arg::new("spec-path") + .short('S') + .long("spec-path") + .value_name("SPEC_PATH"), + ) + .get_matches(); + + let pgdata = matches.value_of("pgdata").expect("PGDATA path is required"); + let connstr = matches + .value_of("connstr") + .expect("Postgres connection string is required"); + let spec = matches.value_of("spec"); + let spec_path = matches.value_of("spec-path"); + + // Try to use just 'postgres' if no path is provided + let pgbin = matches.value_of("pgbin").unwrap_or("postgres"); + + let spec: ComputeSpec = match spec { + // First, try to get cluster spec from the cli argument + Some(json) => serde_json::from_str(json)?, + None => { + // Second, try to read it from the file if path is provided + if let Some(sp) = spec_path { + let path = Path::new(sp); + let file = File::open(path)?; + serde_json::from_reader(file)? + } else { + panic!("cluster spec should be provided via --spec or --spec-path argument"); + } + } + }; + + let pageserver_connstr = spec + .cluster + .settings + .find("zenith.page_server_connstring") + .expect("pageserver connstr should be provided"); + let tenant = spec + .cluster + .settings + .find("zenith.zenith_tenant") + .expect("tenant id should be provided"); + let timeline = spec + .cluster + .settings + .find("zenith.zenith_timeline") + .expect("tenant id should be provided"); + + let compute_state = ComputeNode { + start_time: Utc::now(), + connstr: connstr.to_string(), + pgdata: pgdata.to_string(), + pgbin: pgbin.to_string(), + spec, + tenant, + timeline, + pageserver_connstr, + metrics: ComputeMetrics::new(), + state: RwLock::new(ComputeState::new()), + }; + let compute = Arc::new(compute_state); + + // Launch service threads first, so we were able to serve availability + // requests, while configuration is still in progress. + let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); + let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread"); + + // Run compute (Postgres) and hang waiting on it. 
+ match compute.prepare_and_run() { + Ok(ec) => { + let code = ec.code().unwrap_or(1); + info!("Postgres exited with code {}, shutting down", code); + exit(code) + } + Err(error) => { + error!("could not start the compute node: {}", error); + + let mut state = compute.state.write().unwrap(); + state.error = Some(format!("{:?}", error)); + state.status = ComputeStatus::Failed; + drop(state); + + // Keep serving HTTP requests, so the cloud control plane was able to + // get the actual error. + info!("giving control plane 30s to collect the error before shutdown"); + thread::sleep(Duration::from_secs(30)); + info!("shutting down"); + Err(error) + } + } +} diff --git a/compute_tools/src/bin/zenith_ctl.rs b/compute_tools/src/bin/zenith_ctl.rs deleted file mode 100644 index 3685f8e8b4..0000000000 --- a/compute_tools/src/bin/zenith_ctl.rs +++ /dev/null @@ -1,252 +0,0 @@ -//! -//! Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd` -//! `ExecStart` option. It will handle all the `zenith` specifics during compute node -//! initialization: -//! - `zenith_ctl` accepts cluster (compute node) specification as a JSON file. -//! - Every start is a fresh start, so the data directory is removed and -//! initialized again on each run. -//! - Next it will put configuration files into the `PGDATA` directory. -//! - Sync safekeepers and get commit LSN. -//! - Get `basebackup` from pageserver using the returned on the previous step LSN. -//! - Try to start `postgres` and wait until it is ready to accept connections. -//! - Check and alter/drop/create roles and databases. -//! - Hang waiting on the `postmaster` process to exit. -//! -//! Also `zenith_ctl` spawns two separate service threads: -//! - `compute-monitor` checks the last Postgres activity timestamp and saves it -//! into the shared `ComputeState`; -//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the -//! last activity requests. -//! -//! Usage example: -//! ```sh -//! zenith_ctl -D /var/db/postgres/compute \ -//! -C 'postgresql://zenith_admin@localhost/postgres' \ -//! -S /var/db/postgres/specs/current.json \ -//! -b /usr/local/bin/postgres -//! ``` -//! -use std::fs::File; -use std::panic; -use std::path::Path; -use std::process::{exit, Command, ExitStatus}; -use std::sync::{Arc, RwLock}; - -use anyhow::{Context, Result}; -use chrono::Utc; -use clap::Arg; -use log::info; -use postgres::{Client, NoTls}; - -use compute_tools::checker::create_writablity_check_data; -use compute_tools::config; -use compute_tools::http_api::launch_http_server; -use compute_tools::logger::*; -use compute_tools::monitor::launch_monitor; -use compute_tools::params::*; -use compute_tools::pg_helpers::*; -use compute_tools::spec::*; -use compute_tools::zenith::*; - -/// Do all the preparations like PGDATA directory creation, configuration, -/// safekeepers sync, basebackup, etc. 
-fn prepare_pgdata(state: &Arc>) -> Result<()> { - let state = state.read().unwrap(); - let spec = &state.spec; - let pgdata_path = Path::new(&state.pgdata); - let pageserver_connstr = spec - .cluster - .settings - .find("zenith.page_server_connstring") - .expect("pageserver connstr should be provided"); - let tenant = spec - .cluster - .settings - .find("zenith.zenith_tenant") - .expect("tenant id should be provided"); - let timeline = spec - .cluster - .settings - .find("zenith.zenith_timeline") - .expect("tenant id should be provided"); - - info!( - "starting cluster #{}, operation #{}", - spec.cluster.cluster_id, - spec.operation_uuid.as_ref().unwrap() - ); - - // Remove/create an empty pgdata directory and put configuration there. - create_pgdata(&state.pgdata)?; - config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?; - - info!("starting safekeepers syncing"); - let lsn = sync_safekeepers(&state.pgdata, &state.pgbin) - .with_context(|| "failed to sync safekeepers")?; - info!("safekeepers synced at LSN {}", lsn); - - info!( - "getting basebackup@{} from pageserver {}", - lsn, pageserver_connstr - ); - get_basebackup(&state.pgdata, &pageserver_connstr, &tenant, &timeline, &lsn).with_context( - || { - format!( - "failed to get basebackup@{} from pageserver {}", - lsn, pageserver_connstr - ) - }, - )?; - - // Update pg_hba.conf received with basebackup. - update_pg_hba(pgdata_path)?; - - Ok(()) -} - -/// Start Postgres as a child process and manage DBs/roles. -/// After that this will hang waiting on the postmaster process to exit. -fn run_compute(state: &Arc>) -> Result { - let read_state = state.read().unwrap(); - let pgdata_path = Path::new(&read_state.pgdata); - - // Run postgres as a child process. - let mut pg = Command::new(&read_state.pgbin) - .args(&["-D", &read_state.pgdata]) - .spawn() - .expect("cannot start postgres process"); - - // Try default Postgres port if it is not provided - let port = read_state - .spec - .cluster - .settings - .find("port") - .unwrap_or_else(|| "5432".to_string()); - wait_for_postgres(&port, pgdata_path)?; - - let mut client = Client::connect(&read_state.connstr, NoTls)?; - - handle_roles(&read_state.spec, &mut client)?; - handle_databases(&read_state.spec, &mut client)?; - handle_grants(&read_state.spec, &mut client)?; - create_writablity_check_data(&mut client)?; - - // 'Close' connection - drop(client); - - info!( - "finished configuration of cluster #{}", - read_state.spec.cluster.cluster_id - ); - - // Release the read lock. - drop(read_state); - - // Get the write lock, update state and release the lock, so HTTP API - // was able to serve requests, while we are blocked waiting on - // Postgres. - let mut state = state.write().unwrap(); - state.ready = true; - drop(state); - - // Wait for child postgres process basically forever. In this state Ctrl+C - // will be propagated to postgres and it will be shut down as well. 
- let ecode = pg.wait().expect("failed to wait on postgres"); - - Ok(ecode) -} - -fn main() -> Result<()> { - // TODO: re-use `utils::logging` later - init_logger(DEFAULT_LOG_LEVEL)?; - - // Env variable is set by `cargo` - let version: Option<&str> = option_env!("CARGO_PKG_VERSION"); - let matches = clap::App::new("zenith_ctl") - .version(version.unwrap_or("unknown")) - .arg( - Arg::new("connstr") - .short('C') - .long("connstr") - .value_name("DATABASE_URL") - .required(true), - ) - .arg( - Arg::new("pgdata") - .short('D') - .long("pgdata") - .value_name("DATADIR") - .required(true), - ) - .arg( - Arg::new("pgbin") - .short('b') - .long("pgbin") - .value_name("POSTGRES_PATH"), - ) - .arg( - Arg::new("spec") - .short('s') - .long("spec") - .value_name("SPEC_JSON"), - ) - .arg( - Arg::new("spec-path") - .short('S') - .long("spec-path") - .value_name("SPEC_PATH"), - ) - .get_matches(); - - let pgdata = matches.value_of("pgdata").expect("PGDATA path is required"); - let connstr = matches - .value_of("connstr") - .expect("Postgres connection string is required"); - let spec = matches.value_of("spec"); - let spec_path = matches.value_of("spec-path"); - - // Try to use just 'postgres' if no path is provided - let pgbin = matches.value_of("pgbin").unwrap_or("postgres"); - - let spec: ClusterSpec = match spec { - // First, try to get cluster spec from the cli argument - Some(json) => serde_json::from_str(json)?, - None => { - // Second, try to read it from the file if path is provided - if let Some(sp) = spec_path { - let path = Path::new(sp); - let file = File::open(path)?; - serde_json::from_reader(file)? - } else { - panic!("cluster spec should be provided via --spec or --spec-path argument"); - } - } - }; - - let compute_state = ComputeState { - connstr: connstr.to_string(), - pgdata: pgdata.to_string(), - pgbin: pgbin.to_string(), - spec, - ready: false, - last_active: Utc::now(), - }; - let compute_state = Arc::new(RwLock::new(compute_state)); - - // Launch service threads first, so we were able to serve availability - // requests, while configuration is still in progress. - let mut _threads = vec![ - launch_http_server(&compute_state).expect("cannot launch compute monitor thread"), - launch_monitor(&compute_state).expect("cannot launch http endpoint thread"), - ]; - - prepare_pgdata(&compute_state)?; - - // Run compute (Postgres) and hang waiting on it. Panic if any error happens, - // it will help us to trigger unwind and kill postmaster as well. 
- match run_compute(&compute_state) { - Ok(ec) => exit(ec.success() as i32), - Err(error) => panic!("cannot start compute node, error: {}", error), - } -} diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index 63da6ea23e..dbb70a74cf 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,11 +1,11 @@ -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use anyhow::{anyhow, Result}; use log::error; use postgres::Client; use tokio_postgres::NoTls; -use crate::zenith::ComputeState; +use crate::compute::ComputeNode; pub fn create_writablity_check_data(client: &mut Client) -> Result<()> { let query = " @@ -23,9 +23,9 @@ pub fn create_writablity_check_data(client: &mut Client) -> Result<()> { Ok(()) } -pub async fn check_writability(state: &Arc>) -> Result<()> { - let connstr = state.read().unwrap().connstr.clone(); - let (client, connection) = tokio_postgres::connect(&connstr, NoTls).await?; +pub async fn check_writability(compute: &Arc) -> Result<()> { + let connstr = &compute.connstr; + let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?; if client.is_closed() { return Err(anyhow!("connection to postgres closed")); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs new file mode 100644 index 0000000000..a8422fb2b2 --- /dev/null +++ b/compute_tools/src/compute.rs @@ -0,0 +1,315 @@ +// +// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`, +// but there are several things that makes `PostgresNode` usage inconvenient in the +// cloud: +// - it inherits from `LocalEnv`, which contains **all-all** the information about +// a complete service running +// - it uses `PageServerNode` with information about http endpoint, which we do not +// need in the cloud again +// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud +// +// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required +// attributes (not required for the cloud). Yet, it is still tempting to unify these +// `PostgresNode` and `ComputeNode` and use one in both places. +// +// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`. +// +use std::fs; +use std::os::unix::fs::PermissionsExt; +use std::path::Path; +use std::process::{Command, ExitStatus, Stdio}; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::RwLock; + +use anyhow::{Context, Result}; +use chrono::{DateTime, Utc}; +use log::info; +use postgres::{Client, NoTls}; +use serde::{Serialize, Serializer}; + +use crate::checker::create_writablity_check_data; +use crate::config; +use crate::pg_helpers::*; +use crate::spec::*; + +/// Compute node info shared across several `compute_ctl` threads. +pub struct ComputeNode { + pub start_time: DateTime, + pub connstr: String, + pub pgdata: String, + pub pgbin: String, + pub spec: ComputeSpec, + pub tenant: String, + pub timeline: String, + pub pageserver_connstr: String, + pub metrics: ComputeMetrics, + /// Volatile part of the `ComputeNode` so should be used under `RwLock` + /// to allow HTTP API server to serve status requests, while configuration + /// is in progress. 
+ pub state: RwLock, +} + +fn rfc3339_serialize(x: &DateTime, s: S) -> Result +where + S: Serializer, +{ + x.to_rfc3339().serialize(s) +} + +#[derive(Serialize)] +#[serde(rename_all = "snake_case")] +pub struct ComputeState { + pub status: ComputeStatus, + /// Timestamp of the last Postgres activity + #[serde(serialize_with = "rfc3339_serialize")] + pub last_active: DateTime, + pub error: Option, +} + +impl ComputeState { + pub fn new() -> Self { + Self { + status: ComputeStatus::Init, + last_active: Utc::now(), + error: None, + } + } +} + +impl Default for ComputeState { + fn default() -> Self { + Self::new() + } +} + +#[derive(Serialize, Clone, Copy, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputeStatus { + Init, + Running, + Failed, +} + +#[derive(Serialize)] +pub struct ComputeMetrics { + pub sync_safekeepers_ms: AtomicU64, + pub basebackup_ms: AtomicU64, + pub config_ms: AtomicU64, + pub total_startup_ms: AtomicU64, +} + +impl ComputeMetrics { + pub fn new() -> Self { + Self { + sync_safekeepers_ms: AtomicU64::new(0), + basebackup_ms: AtomicU64::new(0), + config_ms: AtomicU64::new(0), + total_startup_ms: AtomicU64::new(0), + } + } +} + +impl Default for ComputeMetrics { + fn default() -> Self { + Self::new() + } +} + +impl ComputeNode { + pub fn set_status(&self, status: ComputeStatus) { + self.state.write().unwrap().status = status; + } + + pub fn get_status(&self) -> ComputeStatus { + self.state.read().unwrap().status + } + + // Remove `pgdata` directory and create it again with right permissions. + fn create_pgdata(&self) -> Result<()> { + // Ignore removal error, likely it is a 'No such file or directory (os error 2)'. + // If it is something different then create_dir() will error out anyway. + let _ok = fs::remove_dir_all(&self.pgdata); + fs::create_dir(&self.pgdata)?; + fs::set_permissions(&self.pgdata, fs::Permissions::from_mode(0o700))?; + + Ok(()) + } + + // Get basebackup from the libpq connection to pageserver using `connstr` and + // unarchive it to `pgdata` directory overriding all its previous content. + fn get_basebackup(&self, lsn: &str) -> Result<()> { + let start_time = Utc::now(); + + let mut client = Client::connect(&self.pageserver_connstr, NoTls)?; + let basebackup_cmd = match lsn { + "0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute + _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn), + }; + let copyreader = client.copy_out(basebackup_cmd.as_str())?; + let mut ar = tar::Archive::new(copyreader); + + ar.unpack(&self.pgdata)?; + + self.metrics.basebackup_ms.store( + Utc::now() + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64, + Ordering::Relaxed, + ); + + Ok(()) + } + + // Run `postgres` in a special mode with `--sync-safekeepers` argument + // and return the reported LSN back to the caller. + fn sync_safekeepers(&self) -> Result { + let start_time = Utc::now(); + + let sync_handle = Command::new(&self.pgbin) + .args(&["--sync-safekeepers"]) + .env("PGDATA", &self.pgdata) // we cannot use -D in this mode + .stdout(Stdio::piped()) + .spawn() + .expect("postgres --sync-safekeepers failed to start"); + + // `postgres --sync-safekeepers` will print all log output to stderr and + // final LSN to stdout. So we pipe only stdout, while stderr will be automatically + // redirected to the caller output. 
+ let sync_output = sync_handle + .wait_with_output() + .expect("postgres --sync-safekeepers failed"); + if !sync_output.status.success() { + anyhow::bail!( + "postgres --sync-safekeepers exited with non-zero status: {}", + sync_output.status, + ); + } + + self.metrics.sync_safekeepers_ms.store( + Utc::now() + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64, + Ordering::Relaxed, + ); + + let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim()); + + Ok(lsn) + } + + /// Do all the preparations like PGDATA directory creation, configuration, + /// safekeepers sync, basebackup, etc. + pub fn prepare_pgdata(&self) -> Result<()> { + let spec = &self.spec; + let pgdata_path = Path::new(&self.pgdata); + + // Remove/create an empty pgdata directory and put configuration there. + self.create_pgdata()?; + config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?; + + info!("starting safekeepers syncing"); + let lsn = self + .sync_safekeepers() + .with_context(|| "failed to sync safekeepers")?; + info!("safekeepers synced at LSN {}", lsn); + + info!( + "getting basebackup@{} from pageserver {}", + lsn, &self.pageserver_connstr + ); + self.get_basebackup(&lsn).with_context(|| { + format!( + "failed to get basebackup@{} from pageserver {}", + lsn, &self.pageserver_connstr + ) + })?; + + // Update pg_hba.conf received with basebackup. + update_pg_hba(pgdata_path)?; + + Ok(()) + } + + /// Start Postgres as a child process and manage DBs/roles. + /// After that this will hang waiting on the postmaster process to exit. + pub fn run(&self) -> Result { + let start_time = Utc::now(); + + let pgdata_path = Path::new(&self.pgdata); + + // Run postgres as a child process. + let mut pg = Command::new(&self.pgbin) + .args(&["-D", &self.pgdata]) + .spawn() + .expect("cannot start postgres process"); + + // Try default Postgres port if it is not provided + let port = self + .spec + .cluster + .settings + .find("port") + .unwrap_or_else(|| "5432".to_string()); + wait_for_postgres(&mut pg, &port, pgdata_path)?; + + let mut client = Client::connect(&self.connstr, NoTls)?; + + handle_roles(&self.spec, &mut client)?; + handle_databases(&self.spec, &mut client)?; + handle_grants(&self.spec, &mut client)?; + create_writablity_check_data(&mut client)?; + + // 'Close' connection + drop(client); + let startup_end_time = Utc::now(); + + self.metrics.config_ms.store( + startup_end_time + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64, + Ordering::Relaxed, + ); + self.metrics.total_startup_ms.store( + startup_end_time + .signed_duration_since(self.start_time) + .to_std() + .unwrap() + .as_millis() as u64, + Ordering::Relaxed, + ); + + self.set_status(ComputeStatus::Running); + + info!( + "finished configuration of compute for project {}", + self.spec.cluster.cluster_id + ); + + // Wait for child Postgres process basically forever. In this state Ctrl+C + // will propagate to Postgres and it will be shut down as well. 
+ let ecode = pg + .wait() + .expect("failed to start waiting on Postgres process"); + + Ok(ecode) + } + + pub fn prepare_and_run(&self) -> Result { + info!( + "starting compute for project {}, operation {}, tenant {}, timeline {}", + self.spec.cluster.cluster_id, + self.spec.operation_uuid.as_ref().unwrap(), + self.tenant, + self.timeline, + ); + + self.prepare_pgdata()?; + self.run() + } +} diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 22134db0f8..6cbd0e3d4c 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,7 +6,7 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::PgOptionsSerialize; -use crate::zenith::ClusterSpec; +use crate::spec::ComputeSpec; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -32,20 +32,20 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { } /// Create or completely rewrite configuration file specified by `path` -pub fn write_postgres_conf(path: &Path, spec: &ClusterSpec) -> Result<()> { +pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { // File::create() destroys the file content if it exists. let mut postgres_conf = File::create(path)?; - write_zenith_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?; + write_auto_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?; Ok(()) } // Write Postgres config block wrapped with generated comment section -fn write_zenith_managed_block(file: &mut File, buf: &str) -> Result<()> { - writeln!(file, "# Managed by Zenith: begin")?; +fn write_auto_managed_block(file: &mut File, buf: &str) -> Result<()> { + writeln!(file, "# Managed by compute_ctl: begin")?; writeln!(file, "{}", buf)?; - writeln!(file, "# Managed by Zenith: end")?; + writeln!(file, "# Managed by compute_ctl: end")?; Ok(()) } diff --git a/compute_tools/src/http_api.rs b/compute_tools/src/http/api.rs similarity index 56% rename from compute_tools/src/http_api.rs rename to compute_tools/src/http/api.rs index 7e1a876044..4c8bbc608b 100644 --- a/compute_tools/src/http_api.rs +++ b/compute_tools/src/http/api.rs @@ -1,37 +1,64 @@ use std::convert::Infallible; use std::net::SocketAddr; -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use std::thread; use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; use log::{error, info}; +use serde_json; -use crate::zenith::*; +use crate::compute::{ComputeNode, ComputeStatus}; // Service function to handle all available routes. -async fn routes(req: Request, state: Arc>) -> Response { +async fn routes(req: Request, compute: Arc) -> Response { match (req.method(), req.uri().path()) { // Timestamp of the last Postgres activity in the plain text. + // DEPRECATED in favour of /status (&Method::GET, "/last_activity") => { info!("serving /last_active GET request"); - let state = state.read().unwrap(); + let state = compute.state.read().unwrap(); // Use RFC3339 format for consistency. Response::new(Body::from(state.last_active.to_rfc3339())) } - // Has compute setup process finished? -> true/false + // Has compute setup process finished? -> true/false. 
+ // DEPRECATED in favour of /status (&Method::GET, "/ready") => { info!("serving /ready GET request"); - let state = state.read().unwrap(); - Response::new(Body::from(format!("{}", state.ready))) + let status = compute.get_status(); + Response::new(Body::from(format!("{}", status == ComputeStatus::Running))) } + // Serialized compute state. + (&Method::GET, "/status") => { + info!("serving /status GET request"); + let state = compute.state.read().unwrap(); + Response::new(Body::from(serde_json::to_string(&*state).unwrap())) + } + + // Startup metrics in JSON format. Keep /metrics reserved for a possible + // future use for Prometheus metrics format. + (&Method::GET, "/metrics.json") => { + info!("serving /metrics.json GET request"); + Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap())) + } + + // DEPRECATED, use POST instead (&Method::GET, "/check_writability") => { info!("serving /check_writability GET request"); - let res = crate::checker::check_writability(&state).await; + let res = crate::checker::check_writability(&compute).await; + match res { + Ok(_) => Response::new(Body::from("true")), + Err(e) => Response::new(Body::from(e.to_string())), + } + } + + (&Method::POST, "/check_writability") => { + info!("serving /check_writability POST request"); + let res = crate::checker::check_writability(&compute).await; match res { Ok(_) => Response::new(Body::from("true")), Err(e) => Response::new(Body::from(e.to_string())), @@ -49,7 +76,7 @@ async fn routes(req: Request, state: Arc>) -> Respons // Main Hyper HTTP server function that runs it and blocks waiting on it forever. #[tokio::main] -async fn serve(state: Arc>) { +async fn serve(state: Arc) { let addr = SocketAddr::from(([0, 0, 0, 0], 3080)); let make_service = make_service_fn(move |_conn| { @@ -73,7 +100,7 @@ async fn serve(state: Arc>) { } /// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`. 
-pub fn launch_http_server(state: &Arc>) -> Result> { +pub fn launch_http_server(state: &Arc) -> Result> { let state = Arc::clone(state); Ok(thread::Builder::new() diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs new file mode 100644 index 0000000000..e5fdf85eed --- /dev/null +++ b/compute_tools/src/http/mod.rs @@ -0,0 +1 @@ +pub mod api; diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml new file mode 100644 index 0000000000..9c0f8e3ccd --- /dev/null +++ b/compute_tools/src/http/openapi_spec.yaml @@ -0,0 +1,158 @@ +openapi: "3.0.2" +info: + title: Compute node control API + version: "1.0" + +servers: + - url: "http://localhost:3080" + +paths: + /status: + get: + tags: + - "info" + summary: Get compute node internal status + description: "" + operationId: getComputeStatus + responses: + "200": + description: ComputeState + content: + application/json: + schema: + $ref: "#/components/schemas/ComputeState" + + /metrics.json: + get: + tags: + - "info" + summary: Get compute node startup metrics in JSON format + description: "" + operationId: getComputeMetricsJSON + responses: + "200": + description: ComputeMetrics + content: + application/json: + schema: + $ref: "#/components/schemas/ComputeMetrics" + + /ready: + get: + deprecated: true + tags: + - "info" + summary: Check whether compute startup process finished successfully + description: "" + operationId: computeIsReady + responses: + "200": + description: Compute is ready ('true') or not ('false') + content: + text/plain: + schema: + type: string + example: "true" + + /last_activity: + get: + deprecated: true + tags: + - "info" + summary: Get timestamp of the last compute activity + description: "" + operationId: getLastComputeActivityTS + responses: + "200": + description: Timestamp of the last compute activity + content: + text/plain: + schema: + type: string + example: "2022-10-12T07:20:50.52Z" + + /check_writability: + get: + deprecated: true + tags: + - "check" + summary: Check that we can write new data on this compute + description: "" + operationId: checkComputeWritabilityDeprecated + responses: + "200": + description: Check result + content: + text/plain: + schema: + type: string + description: Error text or 'true' if check passed + example: "true" + + post: + tags: + - "check" + summary: Check that we can write new data on this compute + description: "" + operationId: checkComputeWritability + responses: + "200": + description: Check result + content: + text/plain: + schema: + type: string + description: Error text or 'true' if check passed + example: "true" + +components: + securitySchemes: + JWT: + type: http + scheme: bearer + bearerFormat: JWT + + schemas: + ComputeMetrics: + type: object + description: Compute startup metrics + required: + - sync_safekeepers_ms + - basebackup_ms + - config_ms + - total_startup_ms + properties: + sync_safekeepers_ms: + type: integer + basebackup_ms: + type: integer + config_ms: + type: integer + total_startup_ms: + type: integer + + ComputeState: + type: object + required: + - status + - last_active + properties: + status: + $ref: '#/components/schemas/ComputeStatus' + last_active: + type: string + description: The last detected compute activity timestamp in UTC and RFC3339 format + example: "2022-10-12T07:20:50.52Z" + error: + type: string + description: Text of the error during compute startup, if any + + ComputeStatus: + type: string + enum: + - init + - failed + - running + +security: + - JWT: [] diff --git 
a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index ffb9700a49..aee6b53e6a 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -4,11 +4,11 @@ //! pub mod checker; pub mod config; -pub mod http_api; +pub mod http; #[macro_use] pub mod logger; +pub mod compute; pub mod monitor; pub mod params; pub mod pg_helpers; pub mod spec; -pub mod zenith; diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 596981b2d2..496a5aae3b 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -1,4 +1,4 @@ -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use std::{thread, time}; use anyhow::Result; @@ -6,16 +6,16 @@ use chrono::{DateTime, Utc}; use log::{debug, info}; use postgres::{Client, NoTls}; -use crate::zenith::ComputeState; +use crate::compute::ComputeNode; const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds // Spin in a loop and figure out the last activity time in the Postgres. // Then update it in the shared state. This function never errors out. // XXX: the only expected panic is at `RwLock` unwrap(). -fn watch_compute_activity(state: &Arc>) { +fn watch_compute_activity(compute: &Arc) { // Suppose that `connstr` doesn't change - let connstr = state.read().unwrap().connstr.clone(); + let connstr = compute.connstr.clone(); // Define `client` outside of the loop to reuse existing connection if it's active. let mut client = Client::connect(&connstr, NoTls); let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL); @@ -46,7 +46,7 @@ fn watch_compute_activity(state: &Arc>) { AND usename != 'zenith_admin';", // XXX: find a better way to filter other monitors? &[], ); - let mut last_active = state.read().unwrap().last_active; + let mut last_active = compute.state.read().unwrap().last_active; if let Ok(backs) = backends { let mut idle_backs: Vec> = vec![]; @@ -83,14 +83,14 @@ fn watch_compute_activity(state: &Arc>) { } // Update the last activity in the shared state if we got a more recent one. - let mut state = state.write().unwrap(); + let mut state = compute.state.write().unwrap(); if last_active > state.last_active { state.last_active = last_active; debug!("set the last compute activity time to: {}", last_active); } } Err(e) => { - info!("cannot connect to postgres: {}, retrying", e); + debug!("cannot connect to postgres: {}, retrying", e); // Establish a new connection and try again. client = Client::connect(&connstr, NoTls); @@ -100,7 +100,7 @@ fn watch_compute_activity(state: &Arc>) { } /// Launch a separate compute monitor thread and return its `JoinHandle`. 
-pub fn launch_monitor(state: &Arc>) -> Result> { +pub fn launch_monitor(state: &Arc) -> Result> { let state = Arc::clone(state); Ok(thread::Builder::new() diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 1409a81b6b..74856eac63 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -1,7 +1,9 @@ +use std::fs::File; +use std::io::{BufRead, BufReader}; use std::net::{SocketAddr, TcpStream}; use std::os::unix::fs::PermissionsExt; use std::path::Path; -use std::process::Command; +use std::process::Child; use std::str::FromStr; use std::{fs, thread, time}; @@ -220,12 +222,12 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { /// Wait for Postgres to become ready to accept connections: /// - state should be `ready` in the `pgdata/postmaster.pid` /// - and we should be able to connect to 127.0.0.1:5432 -pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> { +pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> { let pid_path = pgdata.join("postmaster.pid"); let mut slept: u64 = 0; // ms let pause = time::Duration::from_millis(100); - let timeout = time::Duration::from_millis(200); + let timeout = time::Duration::from_millis(10); let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap(); loop { @@ -236,14 +238,19 @@ pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> { bail!("timed out while waiting for Postgres to start"); } + if let Ok(Some(status)) = pg.try_wait() { + // Postgres exited, that is not what we expected, bail out earlier. + let code = status.code().unwrap_or(-1); + bail!("Postgres exited unexpectedly with code {}", code); + } + if pid_path.exists() { - // XXX: dumb and the simplest way to get the last line in a text file - // TODO: better use `.lines().last()` later - let stdout = Command::new("tail") - .args(&["-n1", pid_path.to_str().unwrap()]) - .output()? - .stdout; - let status = String::from_utf8(stdout)?; + let file = BufReader::new(File::open(&pid_path)?); + let status = file + .lines() + .last() + .unwrap() + .unwrap_or_else(|_| "unknown".to_string()); let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok(); // Now Postgres is ready to accept connections diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 334e0a9e05..e88df56a65 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -3,16 +3,53 @@ use std::path::Path; use anyhow::Result; use log::{info, log_enabled, warn, Level}; use postgres::Client; +use serde::Deserialize; use crate::config; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; -use crate::zenith::ClusterSpec; + +/// Cluster spec or configuration represented as an optional number of +/// delta operations + final cluster state description. +#[derive(Clone, Deserialize)] +pub struct ComputeSpec { + pub format_version: f32, + pub timestamp: String, + pub operation_uuid: Option, + /// Expected cluster state at the end of transition process. + pub cluster: Cluster, + pub delta_operations: Option>, +} + +/// Cluster state seen from the perspective of the external tools +/// like Rails web console. +#[derive(Clone, Deserialize)] +pub struct Cluster { + pub cluster_id: String, + pub name: String, + pub state: Option, + pub roles: Vec, + pub databases: Vec, + pub settings: GenericOptions, +} + +/// Single cluster state changing operation that could not be represented as +/// a static `Cluster` structure. 
For example: +/// - DROP DATABASE +/// - DROP ROLE +/// - ALTER ROLE name RENAME TO new_name +/// - ALTER DATABASE name RENAME TO new_name +#[derive(Clone, Deserialize)] +pub struct DeltaOp { + pub action: String, + pub name: PgIdent, + pub new_name: Option, +} /// It takes cluster specification and does the following: /// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file. /// - Update `pg_hba.conf` to allow external connections. -pub fn handle_configuration(spec: &ClusterSpec, pgdata_path: &Path) -> Result<()> { +pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> { // File `postgresql.conf` is no longer included into `basebackup`, so just // always write all config into it creating new file. config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?; @@ -39,7 +76,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { /// Given a cluster spec json and open transaction it handles roles creation, /// deletion and update. -pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> { +pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let mut xact = client.transaction()?; let existing_roles: Vec = get_existing_roles(&mut xact)?; @@ -165,7 +202,7 @@ pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> { /// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level /// atomicity should be enough here due to the order of operations and various checks, /// which together provide us idempotency. -pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> { +pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let existing_dbs: Vec = get_existing_dbs(client)?; // Print a list of existing Postgres databases (only in debug mode) @@ -254,7 +291,7 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> { // Grant CREATE ON DATABASE to the database owner // to allow clients create trusted extensions. -pub fn handle_grants(spec: &ClusterSpec, client: &mut Client) -> Result<()> { +pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> { info!("cluster spec grants:"); for db in &spec.cluster.databases { diff --git a/compute_tools/src/zenith.rs b/compute_tools/src/zenith.rs deleted file mode 100644 index ba7dc20787..0000000000 --- a/compute_tools/src/zenith.rs +++ /dev/null @@ -1,109 +0,0 @@ -use std::process::{Command, Stdio}; - -use anyhow::Result; -use chrono::{DateTime, Utc}; -use postgres::{Client, NoTls}; -use serde::Deserialize; - -use crate::pg_helpers::*; - -/// Compute node state shared across several `zenith_ctl` threads. -/// Should be used under `RwLock` to allow HTTP API server to serve -/// status requests, while configuration is in progress. -pub struct ComputeState { - pub connstr: String, - pub pgdata: String, - pub pgbin: String, - pub spec: ClusterSpec, - /// Compute setup process has finished - pub ready: bool, - /// Timestamp of the last Postgres activity - pub last_active: DateTime, -} - -/// Cluster spec or configuration represented as an optional number of -/// delta operations + final cluster state description. -#[derive(Clone, Deserialize)] -pub struct ClusterSpec { - pub format_version: f32, - pub timestamp: String, - pub operation_uuid: Option, - /// Expected cluster state at the end of transition process. 
- pub cluster: Cluster, - pub delta_operations: Option>, -} - -/// Cluster state seen from the perspective of the external tools -/// like Rails web console. -#[derive(Clone, Deserialize)] -pub struct Cluster { - pub cluster_id: String, - pub name: String, - pub state: Option, - pub roles: Vec, - pub databases: Vec, - pub settings: GenericOptions, -} - -/// Single cluster state changing operation that could not be represented as -/// a static `Cluster` structure. For example: -/// - DROP DATABASE -/// - DROP ROLE -/// - ALTER ROLE name RENAME TO new_name -/// - ALTER DATABASE name RENAME TO new_name -#[derive(Clone, Deserialize)] -pub struct DeltaOp { - pub action: String, - pub name: PgIdent, - pub new_name: Option, -} - -/// Get basebackup from the libpq connection to pageserver using `connstr` and -/// unarchive it to `pgdata` directory overriding all its previous content. -pub fn get_basebackup( - pgdata: &str, - connstr: &str, - tenant: &str, - timeline: &str, - lsn: &str, -) -> Result<()> { - let mut client = Client::connect(connstr, NoTls)?; - let basebackup_cmd = match lsn { - "0/0" => format!("basebackup {} {}", tenant, timeline), // First start of the compute - _ => format!("basebackup {} {} {}", tenant, timeline, lsn), - }; - let copyreader = client.copy_out(basebackup_cmd.as_str())?; - let mut ar = tar::Archive::new(copyreader); - - ar.unpack(&pgdata)?; - - Ok(()) -} - -/// Run `postgres` in a special mode with `--sync-safekeepers` argument -/// and return the reported LSN back to the caller. -pub fn sync_safekeepers(pgdata: &str, pgbin: &str) -> Result { - let sync_handle = Command::new(&pgbin) - .args(&["--sync-safekeepers"]) - .env("PGDATA", &pgdata) // we cannot use -D in this mode - .stdout(Stdio::piped()) - .spawn() - .expect("postgres --sync-safekeepers failed to start"); - - // `postgres --sync-safekeepers` will print all log output to stderr and - // final LSN to stdout. So we pipe only stdout, while stderr will be automatically - // redirected to the caller output. 
- let sync_output = sync_handle - .wait_with_output() - .expect("postgres --sync-safekeepers failed"); - if !sync_output.status.success() { - anyhow::bail!( - "postgres --sync-safekeepers exited with non-zero status: {}", - sync_output.status, - ); - } - - let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim()); - - Ok(lsn) -} diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 472a49af4b..33f903f0e1 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -4,12 +4,12 @@ mod pg_helpers_tests { use std::fs::File; use compute_tools::pg_helpers::*; - use compute_tools::zenith::ClusterSpec; + use compute_tools::spec::ComputeSpec; #[test] fn params_serialize() { let file = File::open("tests/cluster_spec.json").unwrap(); - let spec: ClusterSpec = serde_json::from_reader(file).unwrap(); + let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); assert_eq!( spec.cluster.databases.first().unwrap().to_pg_options(), @@ -24,7 +24,7 @@ mod pg_helpers_tests { #[test] fn settings_serialize() { let file = File::open("tests/cluster_spec.json").unwrap(); - let spec: ClusterSpec = serde_json::from_reader(file).unwrap(); + let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); assert_eq!( spec.cluster.settings.as_pg_settings(), diff --git a/docs/docker.md b/docs/docker.md index cc54d012dd..100cdd248b 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -1,20 +1,20 @@ -# Docker images of Zenith +# Docker images of Neon ## Images Currently we build two main images: -- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). -- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres). +- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). +- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). -And additional intermediate images: +And additional intermediate image: -- [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools. +- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools. ## Building pipeline -1. Image `zenithdb/compute-tools` is re-built automatically. +We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs -2. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo. +1. `neondatabase/compute-tools` and `neondatabase/compute-node` -3. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically. +2. 
`neondatabase/neon` diff --git a/vendor/postgres b/vendor/postgres index 1db115cecb..79af2faf08 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 1db115cecb3dbc2a74c5efa964fdf3a8a341c4d2 +Subproject commit 79af2faf08d9bec1b1664a72936727dcca36d253 From 98da0aa159f028c1ffc0679ee788f44e9f083dfc Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 18 May 2022 15:17:04 +0300 Subject: [PATCH 0310/1022] Add _total suffix to metrics name (#1741) --- libs/metrics/src/lib.rs | 2 +- libs/utils/src/http/endpoint.rs | 2 +- proxy/src/proxy.rs | 6 +++--- test_runner/fixtures/benchmark_fixture.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index b3c1a6bd55..9929fc6d45 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -28,7 +28,7 @@ pub fn gather() -> Vec { lazy_static! { static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!( - "libmetrics_disk_io_bytes", + "libmetrics_disk_io_bytes_total", "Bytes written and read from disk, grouped by the operation (read|write)", &["io_operation"] ) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 912404bd7d..51bff5f6eb 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -18,7 +18,7 @@ use super::error::ApiError; lazy_static! { static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!( - "libmetrics_serve_metrics_count", + "libmetrics_metric_handler_requests_total", "Number of metric requests made" ) .expect("failed to define a metric"); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index f10b273bfd..642e50c2c1 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -15,17 +15,17 @@ const ERR_PROTO_VIOLATION: &str = "protocol violation"; lazy_static! { static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!( - "proxy_accepted_connections", + "proxy_accepted_connections_total", "Number of TCP client connections accepted." ) .unwrap(); static ref NUM_CONNECTIONS_CLOSED_COUNTER: IntCounter = register_int_counter!( - "proxy_closed_connections", + "proxy_closed_connections_total", "Number of TCP client connections closed." ) .unwrap(); static ref NUM_BYTES_PROXIED_COUNTER: IntCounter = register_int_counter!( - "proxy_io_bytes", + "proxy_io_bytes_total", "Number of bytes sent/received between any client and backend." 
) .unwrap(); diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index e296e85cc7..5fc6076f51 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -236,7 +236,7 @@ class ZenithBenchmarker: """ Fetch the "cumulative # of bytes written" metric from the pageserver """ - metric_name = r'libmetrics_disk_io_bytes{io_operation="write"}' + metric_name = r'libmetrics_disk_io_bytes_total{io_operation="write"}' return self.get_int_counter_value(pageserver, metric_name) def get_peak_mem(self, pageserver) -> int: From 432907ff5f130f4fada8dd605e428d1bea822ea0 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Wed, 18 May 2022 22:02:17 +0200 Subject: [PATCH 0311/1022] Safekeeper: avoid holding mutex when deleting a tenant (#1746) Following discussion with @arssher after #1653 --- safekeeper/src/timeline.rs | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 84ad53d72d..2bb7771aac 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -679,29 +679,32 @@ impl GlobalTimelines { /// Deactivates and deletes all timelines for the tenant, see `delete()`. /// Returns map of all timelines which the tenant had, `true` if a timeline was active. + /// There may be a race if new timelines are created simultaneously. pub fn delete_force_all_for_tenant( conf: &SafeKeeperConf, tenant_id: &ZTenantId, ) -> Result> { info!("deleting all timelines for tenant {}", tenant_id); - let mut state = TIMELINES_STATE.lock().unwrap(); - let mut deleted = HashMap::new(); - for (zttid, tli) in &state.timelines { - if zttid.tenant_id == *tenant_id { - deleted.insert( - *zttid, - GlobalTimelines::delete_force_internal( - conf, - zttid, - tli.deactivate_for_delete()?, - )?, - ); + let mut to_delete = HashMap::new(); + { + // Keep mutex in this scope. + let timelines = &mut TIMELINES_STATE.lock().unwrap().timelines; + for (&zttid, tli) in timelines.iter() { + if zttid.tenant_id == *tenant_id { + to_delete.insert(zttid, tli.deactivate_for_delete()?); + } } + // TODO: test that the correct subset of timelines is removed. It's complicated because they are implicitly created currently. + timelines.retain(|zttid, _| !to_delete.contains_key(zttid)); } - // TODO: test that the exact subset of timelines is removed. - state - .timelines - .retain(|zttid, _| !deleted.contains_key(zttid)); + let mut deleted = HashMap::new(); + for (zttid, was_active) in to_delete { + deleted.insert( + zttid, + GlobalTimelines::delete_force_internal(conf, &zttid, was_active)?, + ); + } + // There may be inactive timelines, so delete the whole tenant dir as well. match std::fs::remove_dir_all(conf.tenant_dir(tenant_id)) { Ok(_) => (), Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), From 4a36d89247723a42b45f8b46da5e6b930a6aaa38 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 18 May 2022 22:26:17 +0300 Subject: [PATCH 0312/1022] Avoid spawning a layer-flush thread when there's no work to do. The check_checkpoint_distance() always spawned a new thread, even if there is no frozen layer to flush. That was a thinko, as @knizhnik pointed out. 
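For readers skimming the diff below, here is a minimal, self-contained sketch of the pattern this commit moves to: check whether any work has accumulated first, and only then try to start a background worker, skipping the spawn entirely when `try_lock` shows one is already running. The `Timeline`, `pending`, and `check_work` names are illustrative stand-ins, not the pageserver's types; the real change lives in `check_checkpoint_distance()` in the diff that follows.

```rust
use std::sync::{Arc, Mutex};
use std::thread::{self, JoinHandle};

/// Toy stand-in for a timeline: some accumulated work plus a lock that a
/// running flush worker holds for its whole lifetime.
struct Timeline {
    pending: Mutex<u32>,
    flush_lock: Mutex<()>,
}

/// Spawn a flush worker only when there is work *and* no worker is running.
fn check_work(tl: &Arc<Timeline>) -> Option<JoinHandle<()>> {
    if *tl.pending.lock().unwrap() == 0 {
        return None; // nothing accumulated: don't spawn a thread at all
    }
    // try_lock() fails while another flusher holds the lock; in that case we
    // skip, trusting the running worker to pick up what was just added.
    if tl.flush_lock.try_lock().is_err() {
        return None;
    }
    let tl = Arc::clone(tl);
    Some(thread::spawn(move || {
        let _guard = tl.flush_lock.lock().unwrap();
        *tl.pending.lock().unwrap() = 0; // "flush" the accumulated work
    }))
}

fn main() {
    let tl = Arc::new(Timeline {
        pending: Mutex::new(1),
        flush_lock: Mutex::new(()),
    });
    if let Some(worker) = check_work(&tl) {
        worker.join().unwrap();
    }
    assert_eq!(*tl.pending.lock().unwrap(), 0);
}
```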
--- pageserver/src/layered_repository.rs | 32 +++++++++++++++++----------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index c7536cc959..bad2e32cc2 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1621,22 +1621,30 @@ impl LayeredTimeline { pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { let last_lsn = self.get_last_record_lsn(); + // Has more than 'checkpoint_distance' of WAL been accumulated? let distance = last_lsn.widening_sub(self.last_freeze_at.load()); if distance >= self.get_checkpoint_distance().into() { + // Yes. Freeze the current in-memory layer. self.freeze_inmem_layer(true); self.last_freeze_at.store(last_lsn); - } - if let Ok(guard) = self.layer_flush_lock.try_lock() { - drop(guard); - let self_clone = Arc::clone(self); - thread_mgr::spawn( - thread_mgr::ThreadKind::LayerFlushThread, - Some(self.tenant_id), - Some(self.timeline_id), - "layer flush thread", - false, - move || self_clone.flush_frozen_layers(false), - )?; + + // Launch a thread to flush the frozen layer to disk, unless + // a thread was already running. (If the thread was running + // at the time that we froze the layer, it must've seen the + // the layer we just froze before it exited; see comments + // in flush_frozen_layers()) + if let Ok(guard) = self.layer_flush_lock.try_lock() { + drop(guard); + let self_clone = Arc::clone(self); + thread_mgr::spawn( + thread_mgr::ThreadKind::LayerFlushThread, + Some(self.tenant_id), + Some(self.timeline_id), + "layer flush thread", + false, + move || self_clone.flush_frozen_layers(false), + )?; + } } Ok(()) } From 5914aab78aa54daa889abab9ae41db358158bd71 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 18 May 2022 21:16:14 +0300 Subject: [PATCH 0313/1022] add comments, use expect instead of unwrap --- .../src/layered_repository/disk_btree.rs | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/layered_repository/disk_btree.rs index e747192d96..0c9ad75048 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/layered_repository/disk_btree.rs @@ -444,6 +444,13 @@ where /// /// stack[0] is the current root page, stack.last() is the leaf. /// + /// We maintain the length of the stack to be always greater than zero. + /// Two exceptions are: + /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one. + /// So because other methods cannot see the intermediate state invariant still holds. + /// 2. `Self::finish`. It consumes self and does not return it back, + /// which means that this is where the structure is destroyed. + /// Thus stack of zero length cannot be observed by other methods. stack: Vec>, /// Last key that was appended to the tree. Used to sanity check that append @@ -482,7 +489,10 @@ where fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<()> { // Try to append to the current leaf buffer - let last = self.stack.last_mut().unwrap(); + let last = self + .stack + .last_mut() + .expect("should always have at least one item"); let level = last.level; if last.push(key, value) { return Ok(()); @@ -512,19 +522,25 @@ where Ok(()) } + /// Flush the bottommost node in the stack to disk. Appends a downlink to its parent, + /// and recursively flushes the parent too, if it becomes full. 
If the root page becomes full, + /// creates a new root page, increasing the height of the tree. fn flush_node(&mut self) -> Result<()> { - let last = self.stack.pop().unwrap(); + // Get the current bottommost node in the stack and flush it to disk. + let last = self + .stack + .pop() + .expect("should always have at least one item"); let buf = last.pack(); let downlink_key = last.first_key(); let downlink_ptr = self.writer.write_blk(buf)?; - // Append the downlink to the parent + // Append the downlink to the parent. If there is no parent, ie. this was the root page, + // create a new root page, increasing the height of the tree. if self.stack.is_empty() { self.stack.push(BuildNode::new(last.level + 1)); } - self.append_internal(&downlink_key, Value::from_blknum(downlink_ptr))?; - - Ok(()) + self.append_internal(&downlink_key, Value::from_blknum(downlink_ptr)) } /// @@ -540,7 +556,10 @@ where self.flush_node()?; } - let root = self.stack.first().unwrap(); + let root = self + .stack + .first() + .expect("by the check above we left one item there"); let buf = root.pack(); let root_blknum = self.writer.write_blk(buf)?; From bd2979d02cfafa84180290f1c3986ad5d3eb33de Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Tue, 10 May 2022 17:06:03 +0300 Subject: [PATCH 0314/1022] CirleCI/check-codestyle-python: print versions --- .circleci/config.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 85ac905f0b..60a1cfea14 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -222,6 +222,12 @@ jobs: key: v2-python-deps-{{ checksum "poetry.lock" }} paths: - /home/circleci/.cache/pypoetry/virtualenvs + - run: + name: Print versions + when: always + command: | + poetry run python --version + poetry show - run: name: Run yapf to ensure code format when: always From 7dd27ecd20c179d176880998db8ce9a1f1f56c61 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Tue, 10 May 2022 17:08:33 +0300 Subject: [PATCH 0315/1022] Bump minimal supported Python version to 3.9 Most of the CI already run with Python 3.9 since https://github.com/neondatabase/docker-images/pull/1 --- README.md | 3 +-- docs/sourcetree.md | 8 ++++---- pyproject.toml | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 39cbd2a222..d5dccb7724 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,6 @@ cd neon make -j5 ``` - #### building on OSX (12.3.1) 1. Install XCode ``` @@ -82,7 +81,7 @@ make -j5 To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively. To run the integration tests or Python scripts (not required to use the code), install -Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory. +Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory. #### running neon database diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 5ddc6208d2..81e0f2fe88 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -91,18 +91,18 @@ so manual installation of dependencies is not recommended. A single virtual environment with all dependencies is described in the single `Pipfile`. ### Prerequisites -- Install Python 3.7 (the minimal supported version) or greater. +- Install Python 3.9 (the minimal supported version) or greater. - Our setup with poetry should work with newer python versions too. 
So feel free to open an issue with a `c/test-runner` label if something doesnt work as expected. - - If you have some trouble with other version you can resolve it by installing Python 3.7 separately, via pyenv or via system package manager e.g.: + - If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via pyenv or via system package manager e.g.: ```bash # In Ubuntu sudo add-apt-repository ppa:deadsnakes/ppa sudo apt update - sudo apt install python3.7 + sudo apt install python3.9 ``` - Install `poetry` - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation)`. -- Install dependencies via `./scripts/pysync`. Note that CI uses Python 3.7 so if you have different version some linting tools can yield different result locally vs in the CI. +- Install dependencies via `./scripts/pysync`. Note that CI uses Python 3.9 so if you have different version some linting tools can yield different result locally vs in the CI. Run `poetry shell` to activate the virtual environment. Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`. diff --git a/pyproject.toml b/pyproject.toml index b70eb19009..def55f6671 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "" authors = [] [tool.poetry.dependencies] -python = "^3.7" +python = "^3.9" pytest = "^6.2.5" psycopg2-binary = "^2.9.1" typing-extensions = "^3.10.0" From fab104d5f32f3373c29d7764c37830b712f954c3 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Tue, 10 May 2022 17:11:31 +0300 Subject: [PATCH 0316/1022] docs/sourcetree: add note about exact Python version used and how to choose it --- docs/sourcetree.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 81e0f2fe88..c8d4baff62 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -93,7 +93,7 @@ A single virtual environment with all dependencies is described in the single `P ### Prerequisites - Install Python 3.9 (the minimal supported version) or greater. - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesnt work as expected. - - If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via pyenv or via system package manager e.g.: + - If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.: ```bash # In Ubuntu sudo add-apt-repository ppa:deadsnakes/ppa @@ -102,7 +102,11 @@ A single virtual environment with all dependencies is described in the single `P ``` - Install `poetry` - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation)`. -- Install dependencies via `./scripts/pysync`. Note that CI uses Python 3.9 so if you have different version some linting tools can yield different result locally vs in the CI. +- Install dependencies via `./scripts/pysync`. + - Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile)) + so if you have different version some linting tools can yield different result locally vs in the CI. 
+ - You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.9`. + This may also disable the `The currently activated Python version X.Y.Z is not supported by the project` warning. Run `poetry shell` to activate the virtual environment. Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`. From c1b365fdf7f56cf05d84c7b095bebc12101a1c12 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 18 May 2022 14:29:01 +0300 Subject: [PATCH 0317/1022] Use temp filename while writing ImageLayer file --- .../src/layered_repository/delta_layer.rs | 24 ++++++++--- .../src/layered_repository/image_layer.rs | 42 +++++++++++++++---- 2 files changed, 53 insertions(+), 13 deletions(-) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 1c48f3def5..855e2a9172 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -420,6 +420,21 @@ impl DeltaLayer { } } + fn temp_path_for( + conf: &PageServerConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, + key_start: Key, + lsn_range: Range, + ) -> PathBuf { + conf.timeline_path(&timelineid, &tenantid).join(format!( + "{}-XXX__{:016X}-{:016X}.temp", + key_start, + u64::from(lsn_range.start), + u64::from(lsn_range.end) + )) + } + /// /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. @@ -607,12 +622,9 @@ impl DeltaLayerWriter { // // Note: This overwrites any existing file. There shouldn't be any. // FIXME: throw an error instead? - let path = conf.timeline_path(&timelineid, &tenantid).join(format!( - "{}-XXX__{:016X}-{:016X}.temp", - key_start, - u64::from(lsn_range.start), - u64::from(lsn_range.end) - )); + let path = + DeltaLayer::temp_path_for(conf, timelineid, tenantid, key_start, lsn_range.clone()); + let mut file = VirtualFile::create(&path)?; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64))?; diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index c0c8e7789a..0a7cd2cdba 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -241,6 +241,20 @@ impl ImageLayer { } } + fn temp_path_for( + path_or_conf: &PathOrConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, + fname: &ImageFileName, + ) -> PathBuf { + match path_or_conf { + PathOrConf::Path(path) => path.to_path_buf(), + PathOrConf::Conf(conf) => conf + .timeline_path(&timelineid, &tenantid) + .join(format!("{}.temp", fname)), + } + } + /// /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. @@ -398,7 +412,7 @@ impl ImageLayer { /// pub struct ImageLayerWriter { conf: &'static PageServerConf, - _path: PathBuf, + path: PathBuf, timelineid: ZTimelineId, tenantid: ZTenantId, key_range: Range, @@ -416,11 +430,9 @@ impl ImageLayerWriter { key_range: &Range, lsn: Lsn, ) -> anyhow::Result { - // Create the file - // - // Note: This overwrites any existing file. There shouldn't be any. - // FIXME: throw an error instead? - let path = ImageLayer::path_for( + // Create the file initially with a temporary filename. + // We'll atomically rename it to the final name when we're done. 
+ let path = ImageLayer::temp_path_for( &PathOrConf::Conf(conf), timelineid, tenantid, @@ -441,7 +453,7 @@ impl ImageLayerWriter { let writer = ImageLayerWriter { conf, - _path: path, + path, timelineid, tenantid, key_range: key_range.clone(), @@ -512,6 +524,22 @@ impl ImageLayerWriter { index_root_blk, }), }; + + // Rename the file to its final name + // + // Note: This overwrites any existing file. There shouldn't be any. + // FIXME: throw an error instead? + let final_path = ImageLayer::path_for( + &PathOrConf::Conf(self.conf), + self.timelineid, + self.tenantid, + &ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn, + }, + ); + std::fs::rename(self.path, &final_path)?; + trace!("created image layer {}", layer.path().display()); Ok(layer) From 3da4b3165ef4056f72e0fb84bd4fd24669526c15 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 18 May 2022 18:06:33 +0300 Subject: [PATCH 0318/1022] Fsync layer files before rename --- pageserver/src/layered_repository/delta_layer.rs | 7 ++++--- pageserver/src/layered_repository/image_layer.rs | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 855e2a9172..3484e6bd0f 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -425,7 +425,7 @@ impl DeltaLayer { timelineid: ZTimelineId, tenantid: ZTenantId, key_start: Key, - lsn_range: Range, + lsn_range: &Range, ) -> PathBuf { conf.timeline_path(&timelineid, &tenantid).join(format!( "{}-XXX__{:016X}-{:016X}.temp", @@ -622,8 +622,7 @@ impl DeltaLayerWriter { // // Note: This overwrites any existing file. There shouldn't be any. // FIXME: throw an error instead? - let path = - DeltaLayer::temp_path_for(conf, timelineid, tenantid, key_start, lsn_range.clone()); + let path = DeltaLayer::temp_path_for(conf, timelineid, tenantid, key_start, &lsn_range); let mut file = VirtualFile::create(&path)?; // make room for the header block @@ -717,6 +716,8 @@ impl DeltaLayerWriter { }), }; + // fsync the file + file.sync_all()?; // Rename the file to its final name // // Note: This overwrites any existing file. There shouldn't be any. diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 0a7cd2cdba..5e97366da9 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -525,6 +525,9 @@ impl ImageLayerWriter { }), }; + // fsync the file + file.sync_all()?; + // Rename the file to its final name // // Note: This overwrites any existing file. There shouldn't be any. 
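Taken together, the last two patches land the standard crash-safe file creation recipe for layer files: write everything under a temporary name, fsync the file, and only then rename it into place, so a reader can never observe a half-written layer under its final name. Below is a stripped-down sketch of that recipe using only std; the function and file names are illustrative, the real code is in the `DeltaLayerWriter`/`ImageLayerWriter` finish paths shown above, and the directory fsync in step 4 goes one step beyond what the patch itself does.

```rust
use std::fs::{self, File, OpenOptions};
use std::io::Write;
use std::path::Path;

/// Write `contents` so that a crash at any point leaves either the old file
/// or the complete new one under `final_path`, never a torn write.
fn write_file_atomically(final_path: &Path, contents: &[u8]) -> std::io::Result<()> {
    // 1. Write under a temporary name in the same directory.
    let tmp_path = final_path.with_extension("temp");
    let mut file = OpenOptions::new()
        .write(true)
        .create(true)
        .truncate(true)
        .open(&tmp_path)?;
    file.write_all(contents)?;

    // 2. Make the data durable before the rename makes it visible.
    file.sync_all()?;

    // 3. Atomically move the file into place under its final name.
    fs::rename(&tmp_path, final_path)?;

    // 4. To make the rename itself durable, fsync the containing directory
    //    too (works on Linux; not part of the patch above).
    if let Some(dir) = final_path.parent() {
        File::open(dir)?.sync_all()?;
    }
    Ok(())
}

fn main() -> std::io::Result<()> {
    write_file_atomically(Path::new("/tmp/example-layer"), b"layer bytes")
}
```

The fixed `.temp` suffix here is the simplest possible choice; the very next patch in the series adds a random component to the temporary name so that two concurrent writers cannot collide on the same path.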
From 4c30ae8ba32f45d90d870dbf926965237ddd3c7f Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 18 May 2022 22:29:13 +0300 Subject: [PATCH 0319/1022] Add random string as a part of tempfile name --- .../src/layered_repository/delta_layer.rs | 12 ++++++++++-- .../src/layered_repository/image_layer.rs | 19 +++++++++++-------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 3484e6bd0f..ed342c0cca 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -37,6 +37,7 @@ use crate::virtual_file::VirtualFile; use crate::walrecord; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; +use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::fs; use std::io::{BufWriter, Write}; @@ -427,11 +428,18 @@ impl DeltaLayer { key_start: Key, lsn_range: &Range, ) -> PathBuf { + let rand_string: String = rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(8) + .map(char::from) + .collect(); + conf.timeline_path(&timelineid, &tenantid).join(format!( - "{}-XXX__{:016X}-{:016X}.temp", + "{}-XXX__{:016X}-{:016X}.{}.temp", key_start, u64::from(lsn_range.start), - u64::from(lsn_range.end) + u64::from(lsn_range.end), + rand_string )) } diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 5e97366da9..905023ecf9 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -34,6 +34,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use hex; +use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::fs; use std::io::Write; @@ -242,17 +243,19 @@ impl ImageLayer { } fn temp_path_for( - path_or_conf: &PathOrConf, + conf: &PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId, fname: &ImageFileName, ) -> PathBuf { - match path_or_conf { - PathOrConf::Path(path) => path.to_path_buf(), - PathOrConf::Conf(conf) => conf - .timeline_path(&timelineid, &tenantid) - .join(format!("{}.temp", fname)), - } + let rand_string: String = rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(8) + .map(char::from) + .collect(); + + conf.timeline_path(&timelineid, &tenantid) + .join(format!("{}.{}.temp", fname, rand_string)) } /// @@ -433,7 +436,7 @@ impl ImageLayerWriter { // Create the file initially with a temporary filename. // We'll atomically rename it to the final name when we're done. 
let path = ImageLayer::temp_path_for( - &PathOrConf::Conf(conf), + conf, timelineid, tenantid, &ImageFileName { From cbd00d7ed91e4b4cd95d3e2e40b16a06e73613ff Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 18 May 2022 23:46:38 +0300 Subject: [PATCH 0320/1022] Remove temp layer files during timeline initialization on pageserver start --- pageserver/src/storage_sync.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 39459fafc6..bbebcd1f36 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -421,6 +421,14 @@ fn collect_timeline_files( entry_path.display() ) })?; + } else if entry_path.extension().and_then(OsStr::to_str) == Some("temp") { + info!("removing temp layer file at {}", entry_path.display()); + std::fs::remove_file(&entry_path).with_context(|| { + format!( + "failed to remove temp layer file at {}", + entry_path.display() + ) + })?; } else { timeline_files.insert(entry_path); } From 0da4046704d5c5f100a81915e68098f7c8e486f7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 19 May 2022 00:53:28 +0300 Subject: [PATCH 0321/1022] Include traversal path in error message. Previously, the path was printed to the log with separate error!() calls. It's better to include the whole path in the error object and have it printed to the log as one message. Also print the path in the ValueReconstructResult::Missing case. This is what it looks like now: 2022-05-17T21:53:53.611801Z ERROR pagestream{timeline=5adcb4af3e95f00a31550d266aab7a37 tenant=74d9f9ad3293c030c6a6e196dd91c60f}: error reading relation or page version: could not find data for key 000000067F000032BE000000000000000001 at LSN 0/1698C48, for request at LSN 0/1698CF8 Caused by: 0: layer traversal: result Complete, cont_lsn 0/1698C48, layer: 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001698C48-0000000001698CC1 1: layer traversal: result Continue, cont_lsn 0/1698CC1, layer: inmem-0000000001698CC1-FFFFFFFFFFFFFFFF Stack backtrace: --- pageserver/src/layered_repository.rs | 72 ++++++++++++++++++---------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index bad2e32cc2..79e66e5f17 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1357,7 +1357,9 @@ impl LayeredTimeline { let mut timeline_owned; let mut timeline = self; - let mut path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); + // For debugging purposes, collect the path of layers that we traversed + // through. It's included in the error message if we fail to find the key. + let mut traversal_path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { *cached_lsn @@ -1387,32 +1389,24 @@ impl LayeredTimeline { if prev_lsn <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. - - // For debugging purposes, print the path of layers that we traversed - // through. 
- for (r, c, l) in path { - error!( - "PATH: result {:?}, cont_lsn {}, layer: {}", - r, - c, - l.filename().display() - ); - } - bail!("could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", - key, - Lsn(cont_lsn.0 - 1), - request_lsn, - timeline.ancestor_lsn) + return layer_traversal_error(format!( + "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", + key, + Lsn(cont_lsn.0 - 1), + request_lsn, + timeline.ancestor_lsn + ), traversal_path); } prev_lsn = cont_lsn; } ValueReconstructResult::Missing => { - bail!( - "could not find data for key {} at LSN {}, for request at LSN {}", - key, - cont_lsn, - request_lsn - ) + return layer_traversal_error( + format!( + "could not find data for key {} at LSN {}, for request at LSN {}", + key, cont_lsn, request_lsn + ), + traversal_path, + ); } } @@ -1447,7 +1441,7 @@ impl LayeredTimeline { reconstruct_state, )?; cont_lsn = lsn_floor; - path.push((result, cont_lsn, open_layer.clone())); + traversal_path.push((result, cont_lsn, open_layer.clone())); continue; } } @@ -1462,7 +1456,7 @@ impl LayeredTimeline { reconstruct_state, )?; cont_lsn = lsn_floor; - path.push((result, cont_lsn, frozen_layer.clone())); + traversal_path.push((result, cont_lsn, frozen_layer.clone())); continue 'outer; } } @@ -1477,7 +1471,7 @@ impl LayeredTimeline { reconstruct_state, )?; cont_lsn = lsn_floor; - path.push((result, cont_lsn, layer)); + traversal_path.push((result, cont_lsn, layer)); } else if timeline.ancestor_timeline.is_some() { // Nothing on this timeline. Traverse to parent result = ValueReconstructResult::Continue; @@ -2375,6 +2369,32 @@ impl LayeredTimeline { } } +/// Helper function for get_reconstruct_data() to add the path of layers traversed +/// to an error, as anyhow context information. +fn layer_traversal_error( + msg: String, + path: Vec<(ValueReconstructResult, Lsn, Arc)>, +) -> anyhow::Result<()> { + // We want the original 'msg' to be the outermost context. The outermost context + // is the most high-level information, which also gets propagated to the client. + let mut msg_iter = path + .iter() + .map(|(r, c, l)| { + format!( + "layer traversal: result {:?}, cont_lsn {}, layer: {}", + r, + c, + l.filename().display() + ) + }) + .chain(std::iter::once(msg)); + // Construct initial message from the first traversed layer + let err = anyhow!(msg_iter.next().unwrap()); + + // Append all subsequent traversals, and the error message 'msg', as contexts. 
+ Err(msg_iter.fold(err, |err, msg| err.context(msg))) +} + struct LayeredTimelineWriter<'a> { tl: &'a LayeredTimeline, _write_guard: MutexGuard<'a, ()>, From ee3bcf108d0ed1c1442c22182dcaaa1a6c518df4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 19 May 2022 00:53:33 +0300 Subject: [PATCH 0322/1022] Fix compact_level0 for delta layers with overlap or gaps We saw a case in staging, where there was a gap in the LSN ranges of level 0 files, like this: 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016960E9-00000000016E4DB9 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016E4DB9-000000000BFCE3E1 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000BFCE3E1-000000000BFD0FE9 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000060045901-000000007005EAC1 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000007005EAC1-0000000080062E99 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000080062E99-000000009007F481 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000009007F481-00000000A009F7C9 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000A009F7C9-00000000AA284EB9 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000AA286471-00000000AA2886B9 Note that gap between 000000000BFD0FE9 and 0000000060045901. I don't know how that happened, but in general the pageserver should be robust if there are gaps like that, or overlapping files etc. In theory they could happen as result of crashes, partial downloads from S3 etc., although it is mystery what caused it in this case. Looking at the compaction code, it was not safe in the face of gaps like that. The compaction routine collected all the level 0 files, and took their min(start)..max(end) as the range of the new files it builds. That's wrong, if the level 0 files don't cover the whole LSN range; the newly created files will miss any records in the gap. Fix that, by only collecting contiguous sequences of level 0 files, so that the end LSN of previous delta file is equal to the start of the next one. Fixes issue #1730 --- pageserver/src/layered_repository.rs | 106 +++++++++++++++++++-------- 1 file changed, 76 insertions(+), 30 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 79e66e5f17..fc4ab942f6 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -18,7 +18,7 @@ use itertools::Itertools; use lazy_static::lazy_static; use tracing::*; -use std::cmp::{max, min, Ordering}; +use std::cmp::{max, Ordering}; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::collections::{BTreeSet, HashSet}; @@ -1946,41 +1946,87 @@ impl LayeredTimeline { Ok(new_path) } + /// + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as + /// as Level 1 files. + /// fn compact_level0(&self, target_file_size: u64) -> Result<()> { let layers = self.layers.read().unwrap(); - - let level0_deltas = layers.get_level0_deltas()?; - - // We compact or "shuffle" the level-0 delta layers when they've - // accumulated over the compaction threshold. 
- if level0_deltas.len() < self.get_compaction_threshold() { - return Ok(()); - } + let mut level0_deltas = layers.get_level0_deltas()?; drop(layers); - // FIXME: this function probably won't work correctly if there's overlap - // in the deltas. - let lsn_range = level0_deltas - .iter() - .map(|l| l.get_lsn_range()) - .reduce(|a, b| min(a.start, b.start)..max(a.end, b.end)) - .unwrap(); + // Only compact if enough layers have accumulated. + if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() { + return Ok(()); + } - let all_values_iter = level0_deltas.iter().map(|l| l.iter()).kmerge_by(|a, b| { - if let Ok((a_key, a_lsn, _)) = a { - if let Ok((b_key, b_lsn, _)) = b { - match a_key.cmp(b_key) { - Ordering::Less => true, - Ordering::Equal => a_lsn <= b_lsn, - Ordering::Greater => false, + // Gather the files to compact in this iteration. + // + // Start with the oldest Level 0 delta file, and collect any other + // level 0 files that form a contiguous sequence, such that the end + // LSN of previous file matches the start LSN of the next file. + // + // Note that if the files don't form such a sequence, we might + // "compact" just a single file. That's a bit pointless, but it allows + // us to get rid of the level 0 file, and compact the other files on + // the next iteration. This could probably made smarter, but such + // "gaps" in the sequence of level 0 files should only happen in case + // of a crash, partial download from cloud storage, or something like + // that, so it's not a big deal in practice. + level0_deltas.sort_by_key(|l| l.get_lsn_range().start); + let mut level0_deltas_iter = level0_deltas.iter(); + + let first_level0_delta = level0_deltas_iter.next().unwrap(); + let mut prev_lsn_end = first_level0_delta.get_lsn_range().end; + let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)]; + for l in level0_deltas_iter { + let lsn_range = l.get_lsn_range(); + + if lsn_range.start != prev_lsn_end { + break; + } + deltas_to_compact.push(Arc::clone(l)); + prev_lsn_end = lsn_range.end; + } + let lsn_range = Range { + start: deltas_to_compact.first().unwrap().get_lsn_range().start, + end: deltas_to_compact.last().unwrap().get_lsn_range().end, + }; + + info!( + "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", + lsn_range.start, + lsn_range.end, + deltas_to_compact.len(), + level0_deltas.len() + ); + for l in deltas_to_compact.iter() { + info!("compact includes {}", l.filename().display()); + } + // We don't need the original list of layers anymore. Drop it so that + // we don't accidentally use it later in the function. + drop(level0_deltas); + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. + let all_values_iter = deltas_to_compact + .iter() + .map(|l| l.iter()) + .kmerge_by(|a, b| { + if let Ok((a_key, a_lsn, _)) = a { + if let Ok((b_key, b_lsn, _)) = b { + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + } else { + false } } else { - false + true } - } else { - true - } - }); + }); // Merge the contents of all the input delta layers into a new set // of delta layers, based on the current partitioning. 
@@ -2046,8 +2092,8 @@ impl LayeredTimeline { // Now that we have reshuffled the data to set of new delta layers, we can // delete the old ones - let mut layer_paths_do_delete = HashSet::with_capacity(level0_deltas.len()); - for l in level0_deltas { + let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len()); + for l in deltas_to_compact { l.delete()?; if let Some(path) = l.local_path() { layer_paths_do_delete.insert(path); From baf7a81dceaa68d634a96b4833bec2fc6999b5ce Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 19 May 2022 13:01:03 +0200 Subject: [PATCH 0323/1022] git-upload: pass committer to 'git rebase' (fix #1749) (#1750) No committer was specified, which resulted in failing `git rebase` if the branch is not up-to-date. --- scripts/git-upload | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/git-upload b/scripts/git-upload index 4649f6998d..a53987894a 100755 --- a/scripts/git-upload +++ b/scripts/git-upload @@ -80,12 +80,14 @@ class GitRepo: print('No changes detected, quitting') return - run([ + git_with_user = [ 'git', '-c', 'user.name=vipvap', '-c', 'user.email=vipvap@zenith.tech', + ] + run(git_with_user + [ 'commit', '--author="vipvap "', f'--message={message}', @@ -94,7 +96,7 @@ class GitRepo: for _ in range(5): try: run(['git', 'fetch', 'origin', branch]) - run(['git', 'rebase', f'origin/{branch}']) + run(git_with_user + ['rebase', f'origin/{branch}']) run(['git', 'push', 'origin', branch]) return From ffbb9dd1553288641a59622693eb68bf99205cee Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 19 May 2022 10:24:50 +0300 Subject: [PATCH 0324/1022] Add a 5 minute timeout to python tests. The CI times out after 10 minutes of no output. It's annoying if a test hangs and is killed by the CI timeout, because you don't get information about which test was running. Try to avoid that, by adding a slightly smaller timeout in pytest itself. You can override it on a per-test basis if needed, but let's try to keep our tests shorter than that. For the Postgres regression tests, use a longer 30 minute timeout. They're not really a single test, but many tests wrapped in a single pytest test. It's OK for them to run longer in aggregate, each Postgres test is still fairly short. 
--- poetry.lock | 17 ++++++++++++++++- pyproject.toml | 1 + pytest.ini | 1 + test_runner/batch_pg_regress/test_isolation.py | 5 ++++- test_runner/batch_pg_regress/test_pg_regress.py | 5 ++++- test_runner/performance/test_startup.py | 4 +++- 6 files changed, 29 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index aa1e91c606..a69f482776 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1094,6 +1094,17 @@ python-versions = "*" [package.dependencies] pytest = ">=3.2.5" +[[package]] +name = "pytest-timeout" +version = "2.1.0" +description = "pytest plugin to abort hanging tests" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pytest = ">=5.0.0" + [[package]] name = "pytest-xdist" version = "2.5.0" @@ -1387,7 +1398,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" python-versions = "^3.7" -content-hash = "d2fcba2af0a32cde3a1d0c8cfdfe5fb26531599b0c8c376bf16e200a74b55553" +content-hash = "4ee85b435461dec70b406bf7170302fe54e9e247bdf628a9cb6b5fb9eb9afd82" [metadata.files] aiopg = [ @@ -1889,6 +1900,10 @@ pytest-lazy-fixture = [ {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, ] +pytest-timeout = [ + {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, + {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, +] pytest-xdist = [ {file = "pytest-xdist-2.5.0.tar.gz", hash = "sha256:4580deca3ff04ddb2ac53eba39d76cb5dd5edeac050cb6fbc768b0dd712b4edf"}, {file = "pytest_xdist-2.5.0-py3-none-any.whl", hash = "sha256:6fe5c74fec98906deb8f2d2b616b5c782022744978e7bd4695d39c8f42d0ce65"}, diff --git a/pyproject.toml b/pyproject.toml index def55f6671..c965535049 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ moto = {version = "^3.0.0", extras = ["server"]} backoff = "^1.11.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" +pytest-timeout = "^2.1.0" [tool.poetry.dev-dependencies] yapf = "==0.31.0" diff --git a/pytest.ini b/pytest.ini index abc69b765b..da9ab8c12f 100644 --- a/pytest.ini +++ b/pytest.ini @@ -9,3 +9,4 @@ minversion = 6.0 log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s log_date_format = %Y-%m-%d %H:%M:%S log_cli = true +timeout = 300 diff --git a/test_runner/batch_pg_regress/test_isolation.py b/test_runner/batch_pg_regress/test_isolation.py index cde56d9b88..7c99c04fe3 100644 --- a/test_runner/batch_pg_regress/test_isolation.py +++ b/test_runner/batch_pg_regress/test_isolation.py @@ -1,9 +1,12 @@ import os - +import pytest from fixtures.utils import mkdir_if_needed from fixtures.zenith_fixtures import ZenithEnv, base_dir, pg_distrib_dir +# The isolation tests run for a long time, especially in debug mode, +# so use a larger-than-default timeout. 
+@pytest.mark.timeout(1800) def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys): env = zenith_simple_env diff --git a/test_runner/batch_pg_regress/test_pg_regress.py b/test_runner/batch_pg_regress/test_pg_regress.py index 07d2574f4a..be7776113a 100644 --- a/test_runner/batch_pg_regress/test_pg_regress.py +++ b/test_runner/batch_pg_regress/test_pg_regress.py @@ -1,9 +1,12 @@ import os - +import pytest from fixtures.utils import mkdir_if_needed from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content, base_dir, pg_distrib_dir +# The pg_regress tests run for a long time, especially in debug mode, +# so use a larger-than-default timeout. +@pytest.mark.timeout(1800) def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin, capsys): env = zenith_simple_env diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index e30912ce32..53b6a3a4fc 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -1,9 +1,11 @@ +import pytest from contextlib import closing - from fixtures.zenith_fixtures import ZenithEnvBuilder from fixtures.benchmark_fixture import ZenithBenchmarker +# This test sometimes runs for longer than the global 5 minute timeout. +@pytest.mark.timeout(600) def test_startup(zenith_env_builder: ZenithEnvBuilder, zenbenchmark: ZenithBenchmarker): zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() From a4aef5d8dc9666183e3968031952cb511cf918ec Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 19 May 2022 12:25:31 -0400 Subject: [PATCH 0325/1022] Compile psql with openssl (#1725) --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index d2a79661f2..329742bf78 100644 --- a/Makefile +++ b/Makefile @@ -12,12 +12,12 @@ endif # BUILD_TYPE ?= debug ifeq ($(BUILD_TYPE),release) - PG_CONFIGURE_OPTS = --enable-debug + PG_CONFIGURE_OPTS = --enable-debug --with-openssl PG_CFLAGS = -O2 -g3 $(CFLAGS) # Unfortunately, `--profile=...` is a nightly feature CARGO_BUILD_FLAGS += --release else ifeq ($(BUILD_TYPE),debug) - PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend + PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend PG_CFLAGS = -O0 -g3 $(CFLAGS) else $(error Bad build type `$(BUILD_TYPE)', see Makefile for options) From 65cf1a3221a7535e2aece1b99d985f9a4fbfb3cf Mon Sep 17 00:00:00 2001 From: KlimentSerafimov Date: Fri, 20 May 2022 12:02:51 -0400 Subject: [PATCH 0326/1022] Added paths to openssl includes and libraries for OSX because make complained that it couldn't find them. 
(#1761) --- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index 329742bf78..5eca7fb094 100644 --- a/Makefile +++ b/Makefile @@ -23,6 +23,12 @@ else $(error Bad build type `$(BUILD_TYPE)', see Makefile for options) endif +# macOS with brew-installed openssl requires explicit paths +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Darwin) + PG_CONFIGURE_OPTS += --with-includes=/usr/local/opt/openssl/include --with-libraries=/usr/local/opt/openssl/lib +endif + # Choose whether we should be silent or verbose CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose) # Fix for a corner case when make doesn't pass a jobserver From d97617ed3a59e78733752c410025b4e9a1ed614a Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Fri, 20 May 2022 23:12:30 +0300 Subject: [PATCH 0327/1022] updated proxy and proxy scram deployment for prod and stress environments (#1758) --- .circleci/config.yml | 6 +++-- .../helm-values/neon-stress.proxy-scram.yaml | 26 +++++++++++++++++++ .../helm-values/production.proxy-scram.yaml | 24 +++++++++++++++++ .circleci/helm-values/production.proxy.yaml | 8 +----- 4 files changed, 55 insertions(+), 9 deletions(-) create mode 100644 .circleci/helm-values/neon-stress.proxy-scram.yaml create mode 100644 .circleci/helm-values/production.proxy-scram.yaml diff --git a/.circleci/config.yml b/.circleci/config.yml index 60a1cfea14..eb2bf0172b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -640,7 +640,8 @@ jobs: name: Re-deploy proxy command: | DOCKER_TAG=$(git log --oneline|wc -l) - helm upgrade neon-stress-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait + helm upgrade neon-stress-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait + helm upgrade neon-stress-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait deploy-release: docker: @@ -689,7 +690,8 @@ jobs: name: Re-deploy proxy command: | DOCKER_TAG="release-$(git log --oneline|wc -l)" - helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait + helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait + helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait # Trigger a new remote CI job remote-ci-trigger: diff --git a/.circleci/helm-values/neon-stress.proxy-scram.yaml b/.circleci/helm-values/neon-stress.proxy-scram.yaml new file mode 100644 index 0000000000..8f55d31c87 --- /dev/null +++ b/.circleci/helm-values/neon-stress.proxy-scram.yaml @@ -0,0 +1,26 @@ +fullnameOverride: "neon-stress-proxy-scram" + +settings: + authBackend: "console" + authEndpoint: "http://neon-stress-console.local/management/api/v2" + domain: "*.stress.neon.tech" + +podLabels: + zenith_service: proxy-scram + zenith_env: staging + zenith_region: eu-west-1 + zenith_region_slug: ireland + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech' + +metrics: + enabled: true 
+ serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack diff --git a/.circleci/helm-values/production.proxy-scram.yaml b/.circleci/helm-values/production.proxy-scram.yaml new file mode 100644 index 0000000000..54b0fbcd98 --- /dev/null +++ b/.circleci/helm-values/production.proxy-scram.yaml @@ -0,0 +1,24 @@ +settings: + authBackend: "console" + authEndpoint: "http://console-release.local/management/api/v2" + domain: "*.cloud.neon.tech" + +podLabels: + zenith_service: proxy-scram + zenith_env: production + zenith_region: us-west-2 + zenith_region_slug: oregon + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech' + +metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack diff --git a/.circleci/helm-values/production.proxy.yaml b/.circleci/helm-values/production.proxy.yaml index e13968a6a8..87c61c90cf 100644 --- a/.circleci/helm-values/production.proxy.yaml +++ b/.circleci/helm-values/production.proxy.yaml @@ -1,9 +1,3 @@ -# Helm chart values for zenith-proxy. -# This is a YAML-formatted file. - -image: - repository: neondatabase/neon - settings: authEndpoint: "https://console.neon.tech/authenticate_proxy_request/" uri: "https://console.neon.tech/psql_session/" @@ -28,7 +22,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: start.zenith.tech,connect.neon.tech,pg.neon.tech + external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech metrics: enabled: true From 3c6890bf1dd72722c646d918b984d2392a010ce2 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 21 Apr 2022 14:54:22 +0300 Subject: [PATCH 0328/1022] postgres_ffi: add complex WAL tests for find_end_of_wal * Actual generation logic is in a separate crate `postgres_ffi/wal_generate` * The create also provides a binary for debug purposes akin to `initdb` * Two tests currently fail and are ignored * There is no easy way to test this directly in Safekeeper as it starts restoring from commit_lsn. So testing would require disconnecting Safekeeper just after it has received the WAL, but before it is committed. 
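For readers unfamiliar with the trick the new crate relies on: a single logical message with a payload of one full 16 MiB segment produces a single WAL record that must continue into the next segment, which is exactly the shape of WAL the ignored tests exercise. The sketch below is illustrative only — the function name is invented and the checks are simplified; the careful version lives in `wal_generate/src/lib.rs` in this patch, which additionally validates the segment size and the flush position. It assumes the `postgres` client crate and `anyhow`, and a server started right after initdb (as the tests do).

```
// Sketch, not part of the patch: force one WAL record to cross a segment boundary.
use anyhow::{ensure, Result};
use postgres::types::PgLsn;
use postgres::Client;

fn emit_record_crossing_segment(client: &mut Client) -> Result<PgLsn> {
    const SEG_SIZE: u64 = 16 * 1024 * 1024;

    let start: PgLsn = client
        .query_one("select pg_current_wal_insert_lsn()", &[])?
        .get(0);

    // A non-transactional logical message with a 16 MiB payload becomes one
    // WAL record that cannot fit in the segment it starts in.
    let lsn: PgLsn = client
        .query_one(
            "select pg_logical_emit_message(false, 'big-16mb-msg', \
             repeat('abcd', 16 * 256 * 1024)) as message_lsn",
            &[],
        )?
        .get("message_lsn");

    // The returned LSN points past the message, i.e. into a later segment
    // than the one the record started in.
    ensure!(
        u64::from(lsn) / SEG_SIZE > u64::from(start) / SEG_SIZE,
        "record did not cross a segment boundary"
    );
    Ok(lsn)
}
```

Whether such a record is followed by a small COMMIT record (the transactional case) or ends the WAL outright is what distinguishes the two ignored test cases.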
--- Cargo.lock | 15 + libs/postgres_ffi/Cargo.toml | 5 + libs/postgres_ffi/src/xlog_utils.rs | 143 ++++++--- libs/postgres_ffi/wal_generate/Cargo.toml | 14 + .../wal_generate/src/bin/wal_generate.rs | 58 ++++ libs/postgres_ffi/wal_generate/src/lib.rs | 278 ++++++++++++++++++ 6 files changed, 466 insertions(+), 47 deletions(-) create mode 100644 libs/postgres_ffi/wal_generate/Cargo.toml create mode 100644 libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs create mode 100644 libs/postgres_ffi/wal_generate/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 6a320ee274..6acad6dac8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2047,15 +2047,18 @@ dependencies = [ "bytes", "chrono", "crc32c", + "env_logger", "hex", "lazy_static", "log", "memoffset", + "postgres", "rand", "regex", "serde", "thiserror", "utils", + "wal_generate", "workspace_hack", ] @@ -3627,6 +3630,18 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "wal_generate" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap 3.0.14", + "env_logger", + "log", + "postgres", + "tempfile", +] + [[package]] name = "walkdir" version = "2.3.2" diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 7be5ca1b93..129c93cf6d 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -20,5 +20,10 @@ serde = { version = "1.0", features = ["derive"] } utils = { path = "../utils" } workspace_hack = { version = "0.1", path = "../../workspace_hack" } +[dev-dependencies] +env_logger = "0.9" +postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +wal_generate = { path = "wal_generate" } + [build-dependencies] bindgen = "0.59.1" diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 7882058868..3e30f9905e 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -476,78 +476,127 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result anyhow::Result, + expected_end_of_wal_non_partial: Lsn, + last_segment: &str, + ) { + use wal_generate::*; + // 1. Generate some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("..") .join(".."); - let data_dir = top_path.join("test_output/test_find_end_of_wal"); - let initdb_path = top_path.join("tmp_install/bin/initdb"); - let lib_path = top_path.join("tmp_install/lib"); - if data_dir.exists() { - fs::remove_dir_all(&data_dir).unwrap(); + let cfg = Conf { + pg_distrib_dir: top_path.join("tmp_install"), + datadir: top_path.join(format!("test_output/{}", test_name)), + }; + if cfg.datadir.exists() { + fs::remove_dir_all(&cfg.datadir).unwrap(); } - println!("Using initdb from '{}'", initdb_path.display()); - println!("Data directory '{}'", data_dir.display()); - let initdb_output = Command::new(initdb_path) - .args(&["-D", data_dir.to_str().unwrap()]) - .arg("--no-instructions") - .arg("--no-sync") - .env_clear() - .env("LD_LIBRARY_PATH", &lib_path) - .env("DYLD_LIBRARY_PATH", &lib_path) - .output() - .unwrap(); - assert!( - initdb_output.status.success(), - "initdb failed. 
Status: '{}', stdout: '{}', stderr: '{}'", - initdb_output.status, - String::from_utf8_lossy(&initdb_output.stdout), - String::from_utf8_lossy(&initdb_output.stderr), - ); + cfg.initdb().unwrap(); + let mut srv = cfg.start_server().unwrap(); + let expected_wal_end: Lsn = + u64::from(generate_wal(&mut srv.connect_with_timeout().unwrap()).unwrap()).into(); + srv.kill(); // 2. Pick WAL generated by initdb - let wal_dir = data_dir.join("pg_wal"); + let wal_dir = cfg.datadir.join("pg_wal"); let wal_seg_size = 16 * 1024 * 1024; // 3. Check end_of_wal on non-partial WAL segment (we treat it as fully populated) let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap(); let wal_end = Lsn(wal_end); - println!("wal_end={}, tli={}", wal_end, tli); - assert_eq!(wal_end, "0/2000000".parse::().unwrap()); + info!( + "find_end_of_wal returned (wal_end={}, tli={})", + wal_end, tli + ); + assert_eq!(wal_end, expected_end_of_wal_non_partial); // 4. Get the actual end of WAL by pg_waldump - let waldump_path = top_path.join("tmp_install/bin/pg_waldump"); - let waldump_output = Command::new(waldump_path) - .arg(wal_dir.join("000000010000000000000001")) - .env_clear() - .env("LD_LIBRARY_PATH", &lib_path) - .env("DYLD_LIBRARY_PATH", &lib_path) - .output() - .unwrap(); - let waldump_output = std::str::from_utf8(&waldump_output.stderr).unwrap(); - println!("waldump_output = '{}'", &waldump_output); - let re = Regex::new(r"invalid record length at (.+):").unwrap(); - let caps = re.captures(waldump_output).unwrap(); + let waldump_output = cfg + .pg_waldump("000000010000000000000001", last_segment) + .unwrap() + .stderr; + let waldump_output = std::str::from_utf8(&waldump_output).unwrap(); + let caps = match Regex::new(r"invalid record length at (.+):") + .unwrap() + .captures(waldump_output) + { + Some(caps) => caps, + None => { + error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output); + panic!(); + } + }; let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap(); + info!( + "waldump erred on {}, expected wal end at {}", + waldump_wal_end, expected_wal_end + ); + assert_eq!(waldump_wal_end, expected_wal_end); // 5. 
Rename file to partial to actually find last valid lsn fs::rename( - wal_dir.join("000000010000000000000001"), - wal_dir.join("000000010000000000000001.partial"), + wal_dir.join(last_segment), + wal_dir.join(format!("{}.partial", last_segment)), ) .unwrap(); let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap(); let wal_end = Lsn(wal_end); - println!("wal_end={}, tli={}", wal_end, tli); + info!( + "find_end_of_wal returned (wal_end={}, tli={})", + wal_end, tli + ); assert_eq!(wal_end, waldump_wal_end); } + #[test] + pub fn test_find_end_of_wal_simple() { + init_logging(); + test_end_of_wal( + "test_find_end_of_wal_simple", + wal_generate::generate_simple, + "0/2000000".parse::().unwrap(), + "000000010000000000000001", + ); + } + + #[test] + #[ignore = "not yet fixed, needs correct skipping of contrecord"] // TODO + pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() { + init_logging(); + test_end_of_wal( + "test_find_end_of_wal_crossing_segment_followed_by_small_one", + wal_generate::generate_wal_record_crossing_segment_followed_by_small_one, + "0/3000000".parse::().unwrap(), + "000000010000000000000002", + ); + } + + #[test] + #[ignore = "not yet fixed, needs correct parsing of pre-last segments"] // TODO + pub fn test_find_end_of_wal_last_crossing_segment() { + init_logging(); + test_end_of_wal( + "test_find_end_of_wal_last_crossing_segment", + wal_generate::generate_last_wal_record_crossing_segment, + "0/3000000".parse::().unwrap(), + "000000010000000000000002", + ); + } + /// Check the math in update_next_xid /// /// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL, diff --git a/libs/postgres_ffi/wal_generate/Cargo.toml b/libs/postgres_ffi/wal_generate/Cargo.toml new file mode 100644 index 0000000000..a10671dddd --- /dev/null +++ b/libs/postgres_ffi/wal_generate/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "wal_generate" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = "1.0" +clap = "3.0" +env_logger = "0.9" +log = "0.4" +postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tempfile = "3.2" diff --git a/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs b/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs new file mode 100644 index 0000000000..07ceb31c7f --- /dev/null +++ b/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs @@ -0,0 +1,58 @@ +use anyhow::*; +use clap::{App, Arg}; +use wal_generate::*; + +fn main() -> Result<()> { + env_logger::Builder::from_env( + env_logger::Env::default().default_filter_or("wal_generate=info"), + ) + .init(); + let arg_matches = App::new("Postgres WAL generator") + .about("Generates Postgres databases with specific WAL properties") + .arg( + Arg::new("datadir") + .short('D') + .long("datadir") + .takes_value(true) + .help("Data directory for the Postgres server") + .required(true) + ) + .arg( + Arg::new("pg-distrib-dir") + .long("pg-distrib-dir") + .takes_value(true) + .help("Directory with Postgres distribution (bin and lib directories, e.g. 
tmp_install)") + .default_value("/usr/local") + ) + .arg( + Arg::new("type") + .long("type") + .takes_value(true) + .help("Type of WAL to generate") + .possible_values(["simple", "last_wal_record_crossing_segment", "wal_record_crossing_segment_followed_by_small_one"]) + .required(true) + ) + .get_matches(); + + let cfg = Conf { + pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(), + datadir: arg_matches.value_of("datadir").unwrap().into(), + }; + cfg.initdb()?; + let mut srv = cfg.start_server()?; + let lsn = match arg_matches.value_of("type").unwrap() { + "simple" => generate_simple(&mut srv.connect_with_timeout()?)?, + "last_wal_record_crossing_segment" => { + generate_last_wal_record_crossing_segment(&mut srv.connect_with_timeout()?)? + } + "wal_record_crossing_segment_followed_by_small_one" => { + generate_wal_record_crossing_segment_followed_by_small_one( + &mut srv.connect_with_timeout()?, + )? + } + a => panic!("Unknown --type argument: {}", a), + }; + println!("end_of_wal = {}", lsn); + srv.kill(); + Ok(()) +} diff --git a/libs/postgres_ffi/wal_generate/src/lib.rs b/libs/postgres_ffi/wal_generate/src/lib.rs new file mode 100644 index 0000000000..a5cd81d68a --- /dev/null +++ b/libs/postgres_ffi/wal_generate/src/lib.rs @@ -0,0 +1,278 @@ +use anyhow::*; +use core::time::Duration; +use log::*; +use postgres::types::PgLsn; +use postgres::Client; +use std::cmp::Ordering; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; +use std::time::Instant; +use tempfile::{tempdir, TempDir}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Conf { + pub pg_distrib_dir: PathBuf, + pub datadir: PathBuf, +} + +pub struct PostgresServer { + process: std::process::Child, + _unix_socket_dir: TempDir, + client_config: postgres::Config, +} + +impl Conf { + fn pg_bin_dir(&self) -> PathBuf { + self.pg_distrib_dir.join("bin") + } + + fn pg_lib_dir(&self) -> PathBuf { + self.pg_distrib_dir.join("lib") + } + + fn new_pg_command(&self, command: impl AsRef) -> Result { + let path = self.pg_bin_dir().join(command); + ensure!(path.exists(), "Command {:?} does not exist", path); + let mut cmd = Command::new(path); + cmd.env_clear() + .env("LD_LIBRARY_PATH", self.pg_lib_dir()) + .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()); + Ok(cmd) + } + + pub fn initdb(&self) -> Result<()> { + if let Some(parent) = self.datadir.parent() { + info!("Pre-creating parent directory {:?}", parent); + // Tests may be run concurrently and there may be a race to create `test_output/`. + // std::fs::create_dir_all is guaranteed to have no races with another thread creating directories. + std::fs::create_dir_all(parent)?; + } + info!( + "Running initdb in {:?} with user \"postgres\"", + self.datadir + ); + let output = self + .new_pg_command("initdb")? + .arg("-D") + .arg(self.datadir.as_os_str()) + .args(&["-U", "postgres", "--no-instructions", "--no-sync"]) + .output()?; + debug!("initdb output: {:?}", output); + ensure!( + output.status.success(), + "initdb failed, stdout and stderr follow:\n{}{}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr), + ); + Ok(()) + } + + pub fn start_server(&self) -> Result { + info!("Starting Postgres server in {:?}", self.datadir); + let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols) + let unix_socket_dir_path = unix_socket_dir.path().to_owned(); + let server_process = self + .new_pg_command("postgres")? 
+ .args(&["-c", "listen_addresses="]) + .arg("-k") + .arg(unix_socket_dir_path.as_os_str()) + .arg("-D") + .arg(self.datadir.as_os_str()) + .args(&["-c", "wal_keep_size=50MB"]) // Ensure old WAL is not removed + .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output + .args(&["-c", "shared_preload_libraries=zenith"]) // can only be loaded at startup + // Disable background processes as much as possible + .args(&["-c", "wal_writer_delay=10s"]) + .args(&["-c", "autovacuum=off"]) + .stderr(Stdio::null()) + .spawn()?; + let server = PostgresServer { + process: server_process, + _unix_socket_dir: unix_socket_dir, + client_config: { + let mut c = postgres::Config::new(); + c.host_path(&unix_socket_dir_path); + c.user("postgres"); + c.connect_timeout(Duration::from_millis(1000)); + c + }, + }; + Ok(server) + } + + pub fn pg_waldump( + &self, + first_segment_name: &str, + last_segment_name: &str, + ) -> Result { + let first_segment_file = self.datadir.join(first_segment_name); + let last_segment_file = self.datadir.join(last_segment_name); + info!( + "Running pg_waldump for {} .. {}", + first_segment_file.display(), + last_segment_file.display() + ); + let output = self + .new_pg_command("pg_waldump")? + .args(&[ + &first_segment_file.as_os_str(), + &last_segment_file.as_os_str(), + ]) + .output()?; + debug!("waldump output: {:?}", output); + Ok(output) + } +} + +impl PostgresServer { + pub fn connect_with_timeout(&self) -> Result { + let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap(); + while Instant::now() < retry_until { + use std::result::Result::Ok; + if let Ok(client) = self.client_config.connect(postgres::NoTls) { + return Ok(client); + } + std::thread::sleep(Duration::from_millis(100)); + } + bail!("Connection timed out"); + } + + pub fn kill(&mut self) { + self.process.kill().unwrap(); + self.process.wait().unwrap(); + } +} + +impl Drop for PostgresServer { + fn drop(&mut self) { + use std::result::Result::Ok; + match self.process.try_wait() { + Ok(Some(_)) => return, + Ok(None) => { + warn!("Server was not terminated, will be killed"); + } + Err(e) => { + error!("Unable to get status of the server: {}, will be killed", e); + } + } + let _ = self.process.kill(); + } +} + +pub trait PostgresClientExt: postgres::GenericClient { + fn pg_current_wal_insert_lsn(&mut self) -> Result { + Ok(self + .query_one("SELECT pg_current_wal_insert_lsn()", &[])? + .get(0)) + } + fn pg_current_wal_flush_lsn(&mut self) -> Result { + Ok(self + .query_one("SELECT pg_current_wal_flush_lsn()", &[])? + .get(0)) + } +} + +impl PostgresClientExt for C {} + +fn generate_internal( + client: &mut C, + f: impl Fn(&mut C, PgLsn) -> Result>, +) -> Result { + client.execute("create extension if not exists zenith_test_utils", &[])?; + + let wal_segment_size = client.query_one( + "select cast(setting as bigint) as setting, unit \ + from pg_settings where name = 'wal_segment_size'", + &[], + )?; + ensure!( + wal_segment_size.get::<_, String>("unit") == "B", + "Unexpected wal_segment_size unit" + ); + ensure!( + wal_segment_size.get::<_, i64>("setting") == 16 * 1024 * 1024, + "Unexpected wal_segment_size in bytes" + ); + + let initial_lsn = client.pg_current_wal_insert_lsn()?; + info!("LSN initial = {}", initial_lsn); + + let last_lsn = match f(client, initial_lsn)? { + None => client.pg_current_wal_insert_lsn()?, + Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) 
{ + Ordering::Less => bail!("Some records were inserted after the generated WAL"), + Ordering::Equal => last_lsn, + Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"), + }, + }; + + // Some records may be not flushed, e.g. non-transactional logical messages. + client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?; + match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) { + Ordering::Less => bail!("Some records were flushed after the generated WAL"), + Ordering::Equal => {} + Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"), + } + Ok(last_lsn) +} + +pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result { + generate_internal(client, |client, _| { + client.execute("CREATE table t(x int)", &[])?; + Ok(None) + }) +} + +fn generate_single_logical_message( + client: &mut impl postgres::GenericClient, + transactional: bool, +) -> Result { + generate_internal(client, |client, initial_lsn| { + ensure!( + initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), + "Initial LSN is too far in the future" + ); + + let message_lsn: PgLsn = client + .query_one( + "select pg_logical_emit_message($1, 'big-16mb-msg', \ + concat(repeat('abcd', 16 * 256 * 1024), 'end')) as message_lsn", + &[&transactional], + )? + .get("message_lsn"); + ensure!( + message_lsn > PgLsn::from(0x0200_0000 + 4 * 8192), + "Logical message did not cross the segment boundary" + ); + ensure!( + message_lsn < PgLsn::from(0x0400_0000), + "Logical message crossed two segments" + ); + + if transactional { + // Transactional logical messages are part of a transaction, so the one above is + // followed by a small COMMIT record. + + let after_message_lsn = client.pg_current_wal_insert_lsn()?; + ensure!( + message_lsn < after_message_lsn, + "No record found after the emitted message" + ); + Ok(Some(after_message_lsn)) + } else { + Ok(Some(message_lsn)) + } + }) +} + +pub fn generate_wal_record_crossing_segment_followed_by_small_one( + client: &mut impl postgres::GenericClient, +) -> Result { + generate_single_logical_message(client, true) +} + +pub fn generate_last_wal_record_crossing_segment( + client: &mut C, +) -> Result { + generate_single_logical_message(client, false) +} From 12b7c793b3f9885d3132d66da149431b4fd7f5b7 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 21 Apr 2022 22:52:55 +0300 Subject: [PATCH 0329/1022] postgres_ffi: find_end_of_wal_segment: remove redundant CRC operations Previous invariant: `crc` contains an "unfinalized" CRC32 value, its one complement, like in postgres before FIN_CRC32C. New invariant: `crc` always contains a "finalized" CRC32 value, this is the semantics of crc32c_append, so we don't need to invert CRC manually. 
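To make the new invariant concrete, here is a tiny self-contained sketch (not part of the patch) of the behaviour this relies on; it assumes the same `crc32c` crate that `postgres_ffi` already pulls in.

```
// Sketch: crc32c_append() takes and returns a *finalized* CRC-32C value, so a
// running CRC is extended by feeding the previous result straight back in.
// The pre-/post-inversion that the removed lines emulated (Postgres'
// COMP_CRC32C/FIN_CRC32C dance) happens inside the crate.
fn main() {
    let header: &[u8] = b"xlog record header";
    let body: &[u8] = b"xlog record body";

    // CRC of the whole record in one go...
    let whole = crc32c::crc32c(&[header, body].concat());

    // ...equals chaining crc32c_append over the pieces, with no manual inversion.
    let mut crc = crc32c::crc32c_append(0, header);
    crc = crc32c::crc32c_append(crc, body);
    assert_eq!(crc, whole);

    println!("crc32c = {:#010x}", crc);
}
```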
--- libs/postgres_ffi/src/xlog_utils.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 3e30f9905e..ce036bc49a 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -234,16 +234,13 @@ fn find_end_of_wal_segment( wal_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]); crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]); } else { - crc ^= 0xFFFFFFFFu32; crc = crc32c_append(crc, &buf[page_offs..page_offs + n]); } - crc = !crc; rec_offs += n; offs += n; contlen -= n; if contlen == 0 { - crc = !crc; crc = crc32c_append(crc, &rec_hdr); offs = (offs + 7) & !7; // pad on 8 bytes boundary */ if crc == wal_crc { From c9efdec8db8115a56bb6044e0d0547aac7583872 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 21 Apr 2022 23:08:13 +0300 Subject: [PATCH 0330/1022] postgres_ffi: find_end_of_wal_segment: improve name of wal_crc variable Now it reflects the field it's mirroring. --- libs/postgres_ffi/src/xlog_utils.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index ce036bc49a..9fcf78acb1 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -150,7 +150,7 @@ fn find_end_of_wal_segment( // step back to the beginning of the page to read it in... let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ; let mut contlen: usize = 0; - let mut wal_crc: u32 = 0; + let mut xl_crc: u32 = 0; let mut crc: u32 = 0; let mut rec_offs: usize = 0; let mut buf = [0u8; XLOG_BLCKSZ]; @@ -231,7 +231,7 @@ fn find_end_of_wal_segment( } if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD { let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS; - wal_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]); + xl_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]); crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]); } else { crc = crc32c_append(crc, &buf[page_offs..page_offs + n]); @@ -243,14 +243,14 @@ fn find_end_of_wal_segment( if contlen == 0 { crc = crc32c_append(crc, &rec_hdr); offs = (offs + 7) & !7; // pad on 8 bytes boundary */ - if crc == wal_crc { + if crc == xl_crc { // record is valid, advance the result to its end (with // alignment to the next record taken into account) last_valid_rec_pos = offs; } else { info!( "CRC mismatch {} vs {} at {}", - crc, wal_crc, last_valid_rec_pos + crc, xl_crc, last_valid_rec_pos ); break; } From c4b77084afd70098ed3ecf56b6778a6cc0dbcfe4 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 19 May 2022 01:58:51 +0300 Subject: [PATCH 0331/1022] utils: add const_assert! macro --- libs/utils/src/lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 4810909712..15d4c7a81e 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -95,3 +95,11 @@ macro_rules! project_git_version { ); }; } + +/// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime. +#[macro_export] +macro_rules! 
const_assert { + ($($args:tt)*) => { + const _: () = assert!($($args)*); + }; +} From a124e44866c0b6cd1295d83d445dc7fab9e6e1d5 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 19 May 2022 03:02:54 +0300 Subject: [PATCH 0332/1022] postgres_ffi: find_end_of_wal_segment: add lots of trace --- libs/postgres_ffi/src/xlog_utils.rs | 75 ++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 9fcf78acb1..93b4924110 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -160,9 +160,11 @@ fn find_end_of_wal_segment( file.seek(SeekFrom::Start(offs as u64))?; let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS]; + trace!("find_end_of_wal_segment(data_dir={}, segno={}, tli={}, wal_seg_size={}, start_offset=0x{:x})", data_dir.display(), segno, tli, wal_seg_size, start_offset); while offs < wal_seg_size { // we are at the beginning of the page; read it in if offs % XLOG_BLCKSZ == 0 { + trace!("offs=0x{:x}: new page", offs); let bytes_read = file.read(&mut buf)?; if bytes_read != buf.len() { bail!( @@ -176,10 +178,16 @@ fn find_end_of_wal_segment( let xlp_magic = LittleEndian::read_u16(&buf[0..2]); let xlp_info = LittleEndian::read_u16(&buf[2..4]); let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]); + trace!( + " xlp_magic=0x{:x}, xlp_info=0x{:x}, xlp_rem_len={}", + xlp_magic, + xlp_info, + xlp_rem_len + ); // this is expected in current usage when valid WAL starts after page header if xlp_magic != XLOG_PAGE_MAGIC as u16 { trace!( - "invalid WAL file {}.partial magic {} at {:?}", + " invalid WAL file {}.partial magic {} at {:?}", file_name, xlp_magic, Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)), @@ -194,12 +202,13 @@ fn find_end_of_wal_segment( offs += XLOG_SIZE_OF_XLOG_SHORT_PHD; } // ... and step forward again if asked + trace!(" skipped header to 0x{:x}", offs); offs = max(offs, start_offset); - // beginning of the next record } else if contlen == 0 { let page_offs = offs % XLOG_BLCKSZ; let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize; + trace!("offs=0x{:x}: new record, xl_tot_len={}", offs, xl_tot_len); if xl_tot_len == 0 { info!( "find_end_of_wal_segment reached zeros at {:?}, last records ends at {:?}", @@ -212,10 +221,20 @@ fn find_end_of_wal_segment( ); break; // zeros, reached the end } + trace!( + " updating last_valid_rec_pos: 0x{:x} --> 0x{:x}", + last_valid_rec_pos, + offs + ); last_valid_rec_pos = offs; offs += 4; rec_offs = 4; contlen = xl_tot_len - 4; + trace!( + " reading rec_hdr[0..4] <-- [0x{:x}; 0x{:x})", + page_offs, + page_offs + 4 + ); rec_hdr[0..4].copy_from_slice(&buf[page_offs..page_offs + 4]); } else { // we're continuing a record, possibly from previous page. @@ -224,28 +243,79 @@ fn find_end_of_wal_segment( // read the rest of the record, or as much as fits on this page. 
let n = min(contlen, pageleft); + trace!( + "offs=0x{:x}, record continuation, pageleft={}, contlen={}", + offs, + pageleft, + contlen + ); // fill rec_hdr (header up to (but not including) xl_crc field) + trace!( + " rec_offs={}, XLOG_RECORD_CRC_OFFS={}, XLOG_SIZE_OF_XLOG_RECORD={}", + rec_offs, + XLOG_RECORD_CRC_OFFS, + XLOG_SIZE_OF_XLOG_RECORD + ); if rec_offs < XLOG_RECORD_CRC_OFFS { let len = min(XLOG_RECORD_CRC_OFFS - rec_offs, n); + trace!( + " reading rec_hdr[{}..{}] <-- [0x{:x}; 0x{:x})", + rec_offs, + rec_offs + len, + page_offs, + page_offs + len + ); rec_hdr[rec_offs..rec_offs + len].copy_from_slice(&buf[page_offs..page_offs + len]); } if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD { let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS; xl_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]); + trace!( + " reading xl_crc: [0x{:x}; 0x{:x}) = 0x{:x}", + crc_offs, + crc_offs + 4, + xl_crc + ); crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]); + trace!( + " initializing crc: [0x{:x}; 0x{:x}); crc = 0x{:x}", + crc_offs + 4, + page_offs + n, + crc + ); } else { + let old_crc = crc; crc = crc32c_append(crc, &buf[page_offs..page_offs + n]); + trace!( + " appending to crc: [0x{:x}; 0x{:x}); 0x{:x} --> 0x{:x}", + page_offs, + page_offs + n, + old_crc, + crc + ); } rec_offs += n; offs += n; contlen -= n; if contlen == 0 { + trace!(" record completed at 0x{:x}", offs); crc = crc32c_append(crc, &rec_hdr); offs = (offs + 7) & !7; // pad on 8 bytes boundary */ + trace!( + " padded offs to 0x{:x}, crc is {:x}, expected crc is {:x}", + offs, + crc, + xl_crc + ); if crc == xl_crc { // record is valid, advance the result to its end (with // alignment to the next record taken into account) + trace!( + " updating last_valid_rec_pos: 0x{:x} --> 0x{:x}", + last_valid_rec_pos, + offs + ); last_valid_rec_pos = offs; } else { info!( @@ -257,6 +327,7 @@ fn find_end_of_wal_segment( } } } + trace!("last_valid_rec_pos=0x{:x}", last_valid_rec_pos); Ok(last_valid_rec_pos as u32) } From 967eb38e815a102751bd1658caf91a05f9cecb22 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 19 May 2022 03:20:06 +0300 Subject: [PATCH 0333/1022] postgres_ffi: find_end_of_wal_segment: fix contrecord skipping Also enable corresponding test. --- libs/postgres_ffi/src/xlog_utils.rs | 42 +++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 93b4924110..ac52e3fb4f 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -15,7 +15,7 @@ use crate::XLogPageHeaderData; use crate::XLogRecord; use crate::XLOG_PAGE_MAGIC; -use anyhow::bail; +use anyhow::{bail, ensure}; use byteorder::{ByteOrder, LittleEndian}; use bytes::BytesMut; use bytes::{Buf, Bytes}; @@ -149,6 +149,7 @@ fn find_end_of_wal_segment( ) -> anyhow::Result { // step back to the beginning of the page to read it in... 
let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ; + let mut skipping_first_contrecord: bool = false; let mut contlen: usize = 0; let mut xl_crc: u32 = 0; let mut crc: u32 = 0; @@ -194,9 +195,21 @@ fn find_end_of_wal_segment( ); } if offs == 0 { - offs = XLOG_SIZE_OF_XLOG_LONG_PHD; + offs += XLOG_SIZE_OF_XLOG_LONG_PHD; if (xlp_info & XLP_FIRST_IS_CONTRECORD) != 0 { - offs += ((xlp_rem_len + 7) & !7) as usize; + trace!(" first record is contrecord"); + skipping_first_contrecord = true; + contlen = xlp_rem_len as usize; + if offs < start_offset { + // Pre-condition failed: the beginning of the segment is unexpectedly corrupted. + ensure!(start_offset - offs >= contlen, + "start_offset is in the middle of the first record (which happens to be a contrecord), \ + expected to be on a record boundary. Is beginning of the segment corrupted?"); + contlen = 0; + // keep skipping_first_contrecord to avoid counting the contrecord as valid, we did not check it. + } + } else { + trace!(" first record is not contrecord"); } } else { offs += XLOG_SIZE_OF_XLOG_SHORT_PHD; @@ -221,12 +234,17 @@ fn find_end_of_wal_segment( ); break; // zeros, reached the end } - trace!( - " updating last_valid_rec_pos: 0x{:x} --> 0x{:x}", - last_valid_rec_pos, - offs - ); - last_valid_rec_pos = offs; + if skipping_first_contrecord { + skipping_first_contrecord = false; + trace!(" first contrecord has been just completed"); + } else { + trace!( + " updating last_valid_rec_pos: 0x{:x} --> 0x{:x}", + last_valid_rec_pos, + offs + ); + last_valid_rec_pos = offs; + } offs += 4; rec_offs = 4; contlen = xl_tot_len - 4; @@ -308,7 +326,10 @@ fn find_end_of_wal_segment( crc, xl_crc ); - if crc == xl_crc { + if skipping_first_contrecord { + // do nothing, the flag will go down on next iteration when we're reading new record + trace!(" first conrecord has been just completed"); + } else if crc == xl_crc { // record is valid, advance the result to its end (with // alignment to the next record taken into account) trace!( @@ -642,7 +663,6 @@ mod tests { } #[test] - #[ignore = "not yet fixed, needs correct skipping of contrecord"] // TODO pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() { init_logging(); test_end_of_wal( From 73187bfef12852b38b39724df42323e5ab0c60a5 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Sat, 21 May 2022 02:48:43 +0300 Subject: [PATCH 0334/1022] postgres_ffi: find_end_of_wal_segment: clarify code around xl_crc retrieval It would be better to not update xl_crc/rec_hdr at all when skipping contrecord, but I would prefer to keep PR #1574 small. Better audit of `find_end_of_wal_segment` is coming anyway in #544. 
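As an aside, this is where the `const_assert!` macro added a few commits earlier starts paying off: the layout assumptions about `XLogRecord` become compile-time checks instead of silent comments. A standalone sketch follows (not part of the patch; the constants are copied here with the values the Postgres headers give them — a 24-byte record header with `xl_crc` as its last 4-byte field), restating the alignment argument the new comments in this patch spell out.

```
// Sketch of the const_assert! pattern: `assert!` evaluated in a const context,
// so a violated invariant is a compile error and costs nothing at runtime.
macro_rules! const_assert {
    ($($args:tt)*) => {
        const _: () = assert!($($args)*);
    };
}

// Illustrative values matching the Postgres XLogRecord layout.
const XLOG_SIZE_OF_XLOG_RECORD: usize = 24;
const XLOG_RECORD_CRC_OFFS: usize = 20;

// xl_crc is the last field of the record header...
const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD);
// ...and since records are 8-byte aligned and the crc offset is 4 mod 8, the
// 4-byte crc always sits inside one 8-byte frame and can never straddle a page.
const_assert!(XLOG_RECORD_CRC_OFFS % 8 == 4);

fn main() {
    // Flip either constant so an assert fails and the build breaks with an
    // "evaluation of constant value failed" error instead of a runtime panic.
    println!(
        "record header: {} bytes, crc at offset {}",
        XLOG_SIZE_OF_XLOG_RECORD, XLOG_RECORD_CRC_OFFS
    );
}
```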
--- libs/postgres_ffi/src/xlog_utils.rs | 31 +++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index ac52e3fb4f..32a3022c5a 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -30,6 +30,7 @@ use std::path::{Path, PathBuf}; use std::time::SystemTime; use utils::bin_ser::DeserializeError; use utils::bin_ser::SerializeError; +use utils::const_assert; use utils::lsn::Lsn; pub const XLOG_FNAME_LEN: usize = 24; @@ -159,6 +160,8 @@ fn find_end_of_wal_segment( let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap(); file.seek(SeekFrom::Start(offs as u64))?; + // xl_crc is the last field in XLogRecord, will not be read into rec_hdr + const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD); let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS]; trace!("find_end_of_wal_segment(data_dir={}, segno={}, tli={}, wal_seg_size={}, start_offset=0x{:x})", data_dir.display(), segno, tli, wal_seg_size, start_offset); @@ -267,7 +270,7 @@ fn find_end_of_wal_segment( pageleft, contlen ); - // fill rec_hdr (header up to (but not including) xl_crc field) + // fill rec_hdr header up to (but not including) xl_crc field trace!( " rec_offs={}, XLOG_RECORD_CRC_OFFS={}, XLOG_SIZE_OF_XLOG_RECORD={}", rec_offs, @@ -287,6 +290,14 @@ fn find_end_of_wal_segment( } if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD { let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS; + // All records are aligned on 8-byte boundary, so their 8-byte frames + // cannot be split between pages. As xl_crc is the last field, + // its content is always on the same page. + const_assert!(XLOG_RECORD_CRC_OFFS % 8 == 4); + // We should always start reading aligned records even in incorrect WALs so if + // the condition is false it is likely a bug. However, it is localized somewhere + // in this function, hence we do not crash and just report failure instead. + ensure!(crc_offs % 8 == 4, "Record is not aligned properly (bug?)"); xl_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]); trace!( " reading xl_crc: [0x{:x}; 0x{:x}) = 0x{:x}", @@ -301,7 +312,9 @@ fn find_end_of_wal_segment( page_offs + n, crc ); - } else { + } else if rec_offs > XLOG_RECORD_CRC_OFFS { + // As all records are 8-byte aligned, the header is already fully read and `crc` is initialized in the branch above. + ensure!(rec_offs >= XLOG_SIZE_OF_XLOG_RECORD); let old_crc = crc; crc = crc32c_append(crc, &buf[page_offs..page_offs + n]); trace!( @@ -311,6 +324,20 @@ fn find_end_of_wal_segment( old_crc, crc ); + } else { + // Correct because of the way conditions are written above. + assert!(rec_offs + n < XLOG_SIZE_OF_XLOG_RECORD); + // If `skipping_first_contrecord == true`, we may be reading from a middle of a record + // which started in the previous segment. Hence there is no point in validating the header. + if !skipping_first_contrecord && rec_offs + n > XLOG_RECORD_CRC_OFFS { + info!( + "Curiously corrupted WAL: a record stops inside the header; \ + offs=0x{:x}, record continuation, pageleft={}, contlen={}", + offs, pageleft, contlen + ); + break; + } + // Do nothing: we are still reading the header. It's accounted in CRC in the end of the record. 
} rec_offs += n; offs += n; From ef7cdb13e28abcbd1a36eea87dda70481ab28191 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Sat, 21 May 2022 03:47:41 +0300 Subject: [PATCH 0335/1022] Remove unused dependencies from poetry.lock via `poetry lock --no-update` There were a bunch of dependencies for Python <3.9. They are not needed after #1254. This commit makes it easier to add/remove dependencies because lock file will be updated like this on any such operation. Do not update dependencies yet to not break anything. --- poetry.lock | 103 +--------------------------------------------------- 1 file changed, 2 insertions(+), 101 deletions(-) diff --git a/poetry.lock b/poetry.lock index a69f482776..6e552d2cd3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -21,9 +21,6 @@ category = "main" optional = false python-versions = ">=3.6" -[package.dependencies] -typing-extensions = {version = ">=3.6.5", markers = "python_version < \"3.8\""} - [[package]] name = "asyncpg" version = "0.24.0" @@ -32,9 +29,6 @@ category = "main" optional = false python-versions = ">=3.6.0" -[package.dependencies] -typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""} - [package.extras] dev = ["Cython (>=0.29.24,<0.30.0)", "pytest (>=6.0)", "Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "pycodestyle (>=2.7.0,<2.8.0)", "flake8 (>=3.9.2,<3.10.0)", "uvloop (>=0.15.3)"] docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)"] @@ -125,7 +119,6 @@ python-versions = ">=3.6" [package.dependencies] botocore-stubs = "*" -typing-extensions = {version = "*", markers = "python_version < \"3.9\""} [package.extras] accessanalyzer = ["mypy-boto3-accessanalyzer (>=1.20.0)"] @@ -454,9 +447,6 @@ category = "main" optional = false python-versions = ">=3.6" -[package.dependencies] -typing-extensions = {version = "*", markers = "python_version < \"3.9\""} - [[package]] name = "cached-property" version = "1.5.2" @@ -524,7 +514,6 @@ python-versions = ">=3.6" [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} [[package]] name = "colorama" @@ -605,7 +594,6 @@ optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" [package.dependencies] -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} mccabe = ">=0.6.0,<0.7.0" pycodestyle = ">=2.7.0,<2.8.0" pyflakes = ">=2.3.0,<2.4.0" @@ -664,23 +652,6 @@ category = "main" optional = false python-versions = ">=3.5" -[[package]] -name = "importlib-metadata" -version = "4.10.1" -description = "Read metadata from Python packages" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} -zipp = ">=0.5" - -[package.extras] -docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] -perf = ["ipython"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy", "importlib-resources (>=1.3)"] - [[package]] name = "iniconfig" version = "1.1.1" @@ -759,9 +730,6 @@ category = "main" optional = false python-versions = ">=2.7" -[package.dependencies] -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} - 
[package.extras] docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-black-multipy", "pytest-cov", "ecdsa", "feedparser", "numpy", "pandas", "pymongo", "scikit-learn", "sqlalchemy", "enum34", "jsonlib"] @@ -785,7 +753,6 @@ python-versions = "*" [package.dependencies] attrs = ">=17.4.0" -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} pyrsistent = ">=0.14.0" six = ">=1.11.0" @@ -840,7 +807,6 @@ flask = {version = "*", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""} -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} Jinja2 = ">=2.10.1" jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} MarkupSafe = "!=2.0.0a1" @@ -890,7 +856,6 @@ python-versions = ">=3.5" [package.dependencies] mypy-extensions = ">=0.4.3,<0.5.0" toml = "*" -typed-ast = {version = ">=1.4.0,<1.5.0", markers = "python_version < \"3.8\""} typing-extensions = ">=3.7.4" [package.extras] @@ -947,9 +912,6 @@ category = "main" optional = false python-versions = ">=3.6" -[package.dependencies] -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} - [package.extras] dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] @@ -1061,7 +1023,6 @@ python-versions = ">=3.6" atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" @@ -1279,14 +1240,6 @@ category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" -[[package]] -name = "typed-ast" -version = "1.4.3" -description = "a fork of Python 2 and 3 ast modules with type comment support" -category = "dev" -optional = false -python-versions = "*" - [[package]] name = "types-psycopg2" version = "2.9.6" @@ -1383,22 +1336,10 @@ category = "dev" optional = false python-versions = "*" -[[package]] -name = "zipp" -version = "3.7.0" -description = "Backport of pathlib-compatible object wrapper for zip files" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.extras] -docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"] - [metadata] lock-version = "1.1" -python-versions = "^3.7" -content-hash = "4ee85b435461dec70b406bf7170302fe54e9e247bdf628a9cb6b5fb9eb9afd82" +python-versions = "^3.9" +content-hash = "be9c00bb5081535805824242fea2a03b2f82fa9466856d618e24b3140c7da6a0" [metadata.files] aiopg = [ @@ -1594,10 +1535,6 @@ idna = [ {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, ] -importlib-metadata = [ - {file = "importlib_metadata-4.10.1-py3-none-any.whl", hash = "sha256:899e2a40a8c4a1aec681feef45733de8a6c58f3f6a0dbed2eb6574b4387a77b6"}, - {file = 
"importlib_metadata-4.10.1.tar.gz", hash = "sha256:951f0d8a5b7260e9db5e41d429285b5f451e928479f19d80818878527d36e95e"}, -] iniconfig = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, @@ -2001,38 +1938,6 @@ toml = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] -typed-ast = [ - {file = "typed_ast-1.4.3-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6"}, - {file = "typed_ast-1.4.3-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075"}, - {file = "typed_ast-1.4.3-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528"}, - {file = "typed_ast-1.4.3-cp35-cp35m-win32.whl", hash = "sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428"}, - {file = "typed_ast-1.4.3-cp35-cp35m-win_amd64.whl", hash = "sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3"}, - {file = "typed_ast-1.4.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f"}, - {file = "typed_ast-1.4.3-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341"}, - {file = "typed_ast-1.4.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace"}, - {file = "typed_ast-1.4.3-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f"}, - {file = "typed_ast-1.4.3-cp36-cp36m-win32.whl", hash = "sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363"}, - {file = "typed_ast-1.4.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7"}, - {file = "typed_ast-1.4.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266"}, - {file = "typed_ast-1.4.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e"}, - {file = "typed_ast-1.4.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04"}, - {file = "typed_ast-1.4.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899"}, - {file = "typed_ast-1.4.3-cp37-cp37m-win32.whl", hash = "sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c"}, - {file = "typed_ast-1.4.3-cp37-cp37m-win_amd64.whl", hash = "sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805"}, - {file = "typed_ast-1.4.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a"}, - {file = "typed_ast-1.4.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff"}, - {file = "typed_ast-1.4.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41"}, - {file = 
"typed_ast-1.4.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39"}, - {file = "typed_ast-1.4.3-cp38-cp38-win32.whl", hash = "sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927"}, - {file = "typed_ast-1.4.3-cp38-cp38-win_amd64.whl", hash = "sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40"}, - {file = "typed_ast-1.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3"}, - {file = "typed_ast-1.4.3-cp39-cp39-manylinux1_i686.whl", hash = "sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4"}, - {file = "typed_ast-1.4.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0"}, - {file = "typed_ast-1.4.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3"}, - {file = "typed_ast-1.4.3-cp39-cp39-win32.whl", hash = "sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808"}, - {file = "typed_ast-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c"}, - {file = "typed_ast-1.4.3.tar.gz", hash = "sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"}, -] types-psycopg2 = [ {file = "types-psycopg2-2.9.6.tar.gz", hash = "sha256:753b50b38da0e61bc8f89d149f2c4420c7e18535a87963d17b72343eb98f7c32"}, {file = "types_psycopg2-2.9.6-py3-none-any.whl", hash = "sha256:2cfd855e1562ebb5da595ee9401da93a308d69121ccd359cb8341f94ba4b6d1c"}, @@ -2123,7 +2028,3 @@ yapf = [ {file = "yapf-0.31.0-py2.py3-none-any.whl", hash = "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"}, {file = "yapf-0.31.0.tar.gz", hash = "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d"}, ] -zipp = [ - {file = "zipp-3.7.0-py3-none-any.whl", hash = "sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"}, - {file = "zipp-3.7.0.tar.gz", hash = "sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d"}, -] From 89e5659f3f4e163533ddf08bfb71495a8dabe2b7 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Sat, 21 May 2022 03:11:39 +0300 Subject: [PATCH 0336/1022] Replace COPYRIGHT file from the root with NOTICE file The primary reason: make GitHub detect that we use Apache License 2.0 They do it via https://github.com/licensee/licensee Ruby library (gem). Our COPYRIGHT file contains a part of the Apache License, which should be added to a source file, not the license or copyright information itself, which confuses the library. Instead, the recommended way is to create a NOTICE file which references license of the code and its bundled dependencies. --- COPYRIGHT | 20 -------------------- NOTICE | 5 +++++ 2 files changed, 5 insertions(+), 20 deletions(-) delete mode 100644 COPYRIGHT create mode 100644 NOTICE diff --git a/COPYRIGHT b/COPYRIGHT deleted file mode 100644 index 448363b12f..0000000000 --- a/COPYRIGHT +++ /dev/null @@ -1,20 +0,0 @@ -This software is licensed under the Apache 2.0 License: - ----------------------------------------------------------------------------- -Copyright 2021 Zenith Labs, Inc - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. ----------------------------------------------------------------------------- - -The PostgreSQL submodule in vendor/postgres is licensed under the -PostgreSQL license. See vendor/postgres/COPYRIGHT. diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000000..47cc4e798f --- /dev/null +++ b/NOTICE @@ -0,0 +1,5 @@ +Neon +Copyright 2022 Neon Inc. + +The PostgreSQL submodule in vendor/postgres is licensed under the +PostgreSQL license. See vendor/postgres/COPYRIGHT. From fbedd535c0c79e06c41b1a8d78e0bb74de74a848 Mon Sep 17 00:00:00 2001 From: chaitanya sharma <86035+phoenix24@users.noreply.github.com> Date: Mon, 23 May 2022 15:46:00 +0530 Subject: [PATCH 0337/1022] Replace a bunch of zenith references with neon. --- docs/glossary.md | 16 ++++++++-------- safekeeper/README.md | 4 ++-- safekeeper/README_PROTO.md | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/glossary.md b/docs/glossary.md index ecc57b9ed1..a014446010 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -21,7 +21,7 @@ NOTE:It has nothing to do with PostgreSQL pg_basebackup. ### Branch -We can create branch at certain LSN using `zenith timeline branch` command. +We can create branch at certain LSN using `neon_local timeline branch` command. Each Branch lives in a corresponding timeline[] and has an ancestor[]. @@ -91,7 +91,7 @@ The layer map tracks what layers exist in a timeline. ### Layered repository -Zenith repository implementation that keeps data in layers. +Neon repository implementation that keeps data in layers. ### LSN The Log Sequence Number (LSN) is a unique identifier of the WAL record[] in the WAL log. @@ -101,7 +101,7 @@ It is printed as two hexadecimal numbers of up to 8 digits each, separated by a Check also [PostgreSQL doc about pg_lsn type](https://www.postgresql.org/docs/devel/datatype-pg-lsn.html) Values can be compared to calculate the volume of WAL data that separates them, so they are used to measure the progress of replication and recovery. -In postgres and Zenith lsns are used to describe certain points in WAL handling. +In Postgres and Neon LSNs are used to describe certain points in WAL handling. PostgreSQL LSNs and functions to monitor them: * `pg_current_wal_insert_lsn()` - Returns the current write-ahead log insert location. @@ -111,13 +111,13 @@ PostgreSQL LSNs and functions to monitor them: * `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically. [source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html): -Zenith safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md) +Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md) * `CommitLSN`: position in WAL confirmed by quorum safekeepers. * `RestartLSN`: position in WAL confirmed by all safekeepers. * `FlushLSN`: part of WAL persisted to the disk by safekeeper. * `VCL`: the largerst LSN for which we can guarantee availablity of all prior records. 
-Zenith pageserver LSNs: +Neon pageserver LSNs: * `last_record_lsn` - the end of last processed WAL record. * `disk_consistent_lsn` - data is known to be fully flushed and fsync'd to local disk on pageserver up to this LSN. * `remote_consistent_lsn` - The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash. @@ -132,7 +132,7 @@ This is the unit of data exchange between compute node and pageserver. ### Pageserver -Zenith storage engine: repositories + wal receiver + page service + wal redo. +Neon storage engine: repositories + wal receiver + page service + wal redo. ### Page service @@ -184,10 +184,10 @@ relation exceeds that size, it is split into multiple segments. SLRUs include pg_clog, pg_multixact/members, and pg_multixact/offsets. There are other SLRUs in PostgreSQL, but they don't need to be stored permanently (e.g. pg_subtrans), -or we do not support them in zenith yet (pg_commit_ts). +or we do not support them in neon yet (pg_commit_ts). ### Tenant (Multitenancy) -Tenant represents a single customer, interacting with Zenith. +Tenant represents a single customer, interacting with Neon. Wal redo[] activity, timelines[], layers[] are managed for each tenant independently. One pageserver[] can serve multiple tenants at once. One safekeeper diff --git a/safekeeper/README.md b/safekeeper/README.md index 3f097d0c24..a4bb260932 100644 --- a/safekeeper/README.md +++ b/safekeeper/README.md @@ -1,6 +1,6 @@ # WAL service -The zenith WAL service acts as a holding area and redistribution +The neon WAL service acts as a holding area and redistribution center for recently generated WAL. The primary Postgres server streams the WAL to the WAL safekeeper, and treats it like a (synchronous) replica. A replication slot is used in the primary to prevent the @@ -94,7 +94,7 @@ Q: What if the compute node evicts a page, needs it back, but the page is yet A: If the compute node has evicted a page, changes to it have been WAL-logged (that's why it is called Write Ahead logging; there are some exceptions like index builds, but these are exceptions). These WAL records will eventually - reach the Page Server. The Page Server notes that the compute note requests + reach the Page Server. The Page Server notes that the compute node requests pages with a very recent LSN and will not respond to the compute node until a corresponding WAL is received from WAL safekeepers. diff --git a/safekeeper/README_PROTO.md b/safekeeper/README_PROTO.md index 5d79f8c2d3..6b2ae50254 100644 --- a/safekeeper/README_PROTO.md +++ b/safekeeper/README_PROTO.md @@ -151,7 +151,7 @@ It is assumed that in case of loosing local data by some safekeepers, it should * `RestartLSN`: position in WAL confirmed by all safekeepers. * `FlushLSN`: part of WAL persisted to the disk by safekeeper. * `NodeID`: pair (term,UUID) -* `Pager`: Zenith component restoring pages from WAL stream +* `Pager`: Neon component restoring pages from WAL stream * `Replica`: read-only computatio node * `VCL`: the largerst LSN for which we can guarantee availablity of all prior records. From 3ff5caf786e666c988c3d74d65e399d95d1b7ae6 Mon Sep 17 00:00:00 2001 From: KlimentSerafimov Date: Mon, 23 May 2022 13:11:59 -0400 Subject: [PATCH 0338/1022] Add to readme install protobuf etcd (#1777) * Update installation instructions * Added libprotobuf-dev etcd to apt install Added "brew install protobuf etcd" to OSX installation instructions. Added "sudo apt install libprotobuf-dev etcd" to Linux installation instructions. 
Without these, cargo build complains. Figured out in collaboration with Bojan. --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d5dccb7724..8e8bf1a9b2 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Pageserver consists of: On Ubuntu or Debian this set of packages should be sufficient to build the code: ```text apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ -libssl-dev clang pkg-config libpq-dev +libssl-dev clang pkg-config libpq-dev libprotobuf-dev etcd ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) @@ -52,9 +52,10 @@ make -j5 ``` #### building on OSX (12.3.1) -1. Install XCode +1. Install XCode and dependencies ``` xcode-select --install +brew install protobuf etcd ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) From 2aceb6a3095bf0ee6cf7ef3ecc1bb182864abccb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 23 May 2022 20:58:27 +0300 Subject: [PATCH 0339/1022] Fix garbage collection to not remove image layers that are still needed. The logic would incorrectly remove an image layer, if a new image layer existed, even though the older image layer was still needed by some delta layers after it. See example given in the comment this adds. Without this fix, I was getting a lot of "could not find data for key 010000000000000000000000000000000000" errors from GC, with the new test case being added in PR #1735. Fixes #707 --- pageserver/src/layered_repository.rs | 24 ++++++++++++------- .../src/layered_repository/layer_map.rs | 13 ++++------ 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index fc4ab942f6..a83907430e 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -18,7 +18,7 @@ use itertools::Itertools; use lazy_static::lazy_static; use tracing::*; -use std::cmp::{max, Ordering}; +use std::cmp::{max, min, Ordering}; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::collections::{BTreeSet, HashSet}; @@ -2165,7 +2165,7 @@ impl LayeredTimeline { let gc_info = self.gc_info.read().unwrap(); let retain_lsns = &gc_info.retain_lsns; - let cutoff = gc_info.cutoff; + let cutoff = min(gc_info.cutoff, disk_consistent_lsn); let pitr = gc_info.pitr; // Calculate pitr cutoff point. @@ -2294,12 +2294,20 @@ impl LayeredTimeline { // is 102, then it might not have been fully flushed to disk // before crash. // - // FIXME: This logic is wrong. See https://github.com/zenithdb/zenith/issues/707 - if !layers.newer_image_layer_exists( - &l.get_key_range(), - l.get_lsn_range().end, - disk_consistent_lsn + 1, - )? { + // For example, imagine that the following layers exist: + // + // 1000 - image (A) + // 1000-2000 - delta (B) + // 2000 - image (C) + // 2000-3000 - delta (D) + // 3000 - image (E) + // + // If GC horizon is at 2500, we can remove layers A and B, but + // we cannot remove C, even though it's older than 2500, because + // the delta layer 2000-3000 depends on it. + if !layers + .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))? 
+ { debug!( "keeping {} because it is the latest layer", l.filename().display() diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 7491294c03..f7f51bf21f 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -201,18 +201,14 @@ impl LayerMap { NUM_ONDISK_LAYERS.dec(); } - /// Is there a newer image layer for given key-range? + /// Is there a newer image layer for given key- and LSN-range? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. - /// We ignore layers newer than disk_consistent_lsn because they will be removed at restart - /// We also only look at historic layers - //#[allow(dead_code)] - pub fn newer_image_layer_exists( + pub fn image_layer_exists( &self, key_range: &Range, - lsn: Lsn, - disk_consistent_lsn: Lsn, + lsn_range: &Range, ) -> Result { let mut range_remain = key_range.clone(); @@ -225,8 +221,7 @@ impl LayerMap { let img_lsn = l.get_lsn_range().start; if !l.is_incremental() && l.get_key_range().contains(&range_remain.start) - && img_lsn > lsn - && img_lsn < disk_consistent_lsn + && lsn_range.contains(&img_lsn) { made_progress = true; let img_key_end = l.get_key_range().end; From 8346aa3a29daf6088689076d35a9c99df3c9e4ce Mon Sep 17 00:00:00 2001 From: KlimentSerafimov Date: Tue, 24 May 2022 04:55:38 -0400 Subject: [PATCH 0340/1022] Potential fix to #1626. Fixed typo is Makefile. (#1781) * Potential fix to #1626. Fixed typo is Makefile. * Completed fix to #1626. Summary: changed 'error' to 'bail' in start_pageserver and start_safekeeper. --- Makefile | 2 +- pageserver/src/bin/pageserver.rs | 2 +- safekeeper/src/bin/safekeeper.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 5eca7fb094..fdfc64f6fa 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend PG_CFLAGS = -O0 -g3 $(CFLAGS) else -$(error Bad build type `$(BUILD_TYPE)', see Makefile for options) + $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif # macOS with brew-installed openssl requires explicit paths diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 00864056cb..ac90500b97 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -254,7 +254,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // Otherwise, the coverage data will be damaged. match daemonize.exit_action(|| exit_now(0)).start() { Ok(_) => info!("Success, daemonized"), - Err(err) => error!(%err, "could not daemonize"), + Err(err) => bail!("{err}. could not daemonize. bailing."), } } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 61d2f558f2..a5ffc013e2 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -245,7 +245,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b // Otherwise, the coverage data will be damaged. match daemonize.exit_action(|| exit_now(0)).start() { Ok(_) => info!("Success, daemonized"), - Err(e) => error!("Error, {}", e), + Err(err) => bail!("Error: {err}. could not daemonize. 
bailing."), } } From 541ec258758309b1ef98c24b5afe79169406d3b9 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 24 May 2022 17:56:37 +0300 Subject: [PATCH 0341/1022] Properly shutdown test mock S3 server --- .circleci/config.yml | 2 +- test_runner/fixtures/zenith_fixtures.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index eb2bf0172b..41f7693726 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -361,7 +361,7 @@ jobs: when: always command: | du -sh /tmp/test_output/* - find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "etcd.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete + find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete du -sh /tmp/test_output/* - store_artifacts: path: /tmp/test_output diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 17d932c968..8f9bf1c11b 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -393,7 +393,10 @@ class MockS3Server: ): self.port = port - self.subprocess = subprocess.Popen([f'poetry run moto_server s3 -p{port}'], shell=True) + # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. + # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux + # if a process is started from the shell process. + self.subprocess = subprocess.Popen(['poetry', 'run', 'moto_server', 's3', f'-p{port}']) error = None try: return_code = self.subprocess.poll() @@ -403,7 +406,7 @@ class MockS3Server: error = f"expected mock s3 server to start but it failed with exception: {e}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'" if error is not None: log.error(error) - self.subprocess.kill() + self.kill() raise RuntimeError("failed to start s3 mock server") def endpoint(self) -> str: From d32b491a5300d99c9e2d7811944160185e23730c Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 25 May 2022 11:31:10 +0400 Subject: [PATCH 0342/1022] Add zenith-us-stage-sk-6 to deploy (#1728) --- .circleci/ansible/staging.hosts | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index 8e89e843d9..d99ffa6dac 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -6,6 +6,7 @@ zenith-us-stage-ps-2 console_region_id=27 zenith-us-stage-sk-1 console_region_id=27 zenith-us-stage-sk-4 console_region_id=27 zenith-us-stage-sk-5 console_region_id=27 +zenith-us-stage-sk-6 console_region_id=27 [storage:children] pageservers From 2b265fd6dc38b58a684ee6d584714a87705936b1 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 25 May 2022 14:16:44 +0400 Subject: [PATCH 0343/1022] Disable restart_after_crash in neon_local. It is pointless when basebackup is invalid. 
--- control_plane/src/compute.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 92d0e080d8..350cf74b7c 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -274,6 +274,8 @@ impl PostgresNode { conf.append("listen_addresses", &self.address.ip().to_string()); conf.append("port", &self.address.port().to_string()); conf.append("wal_keep_size", "0"); + // walproposer panics when basebackup is invalid, it is pointless to restart in this case. + conf.append("restart_after_crash", "off"); // Configure the node to fetch pages from pageserver let pageserver_connstr = { From 703f691df8fb82fdfd3d2febc892748eb7317126 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Wed, 25 May 2022 14:30:50 +0300 Subject: [PATCH 0344/1022] production inventory update (#1779) --- .circleci/ansible/production.hosts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.circleci/ansible/production.hosts b/.circleci/ansible/production.hosts index 2ed8f517f7..6cefd724d8 100644 --- a/.circleci/ansible/production.hosts +++ b/.circleci/ansible/production.hosts @@ -1,5 +1,6 @@ [pageservers] -zenith-1-ps-1 console_region_id=1 +#zenith-1-ps-1 console_region_id=1 +zenith-1-ps-2 console_region_id=1 [safekeepers] zenith-1-sk-1 console_region_id=1 @@ -15,4 +16,4 @@ console_mgmt_base_url = http://console-release.local bucket_name = zenith-storage-oregon bucket_region = us-west-2 etcd_endpoints = etcd-release.local:2379 -safekeeper_enable_s3_offload = true +safekeeper_enable_s3_offload = false From 6f1f33ef42a63c0047442e8057b9223793424edb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 25 May 2022 14:33:06 +0300 Subject: [PATCH 0345/1022] Improve error messages on seccomp loading errors. Bump vendor/postgres for https://github.com/neondatabase/postgres/pull/166 --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 79af2faf08..038b2b98e5 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 79af2faf08d9bec1b1664a72936727dcca36d253 +Subproject commit 038b2b98e5c3d6274cbd43e9b822cdd946cb8b91 From 9ab52e2186e9330d4098b27372d8a0a2d5f0ac1e Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Wed, 25 May 2022 15:41:18 +0300 Subject: [PATCH 0346/1022] helm repository name fix for production proxy deploy (#1790) --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 41f7693726..5346e35c01 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -685,7 +685,7 @@ jobs: name: Setup helm v3 command: | curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add zenithdb https://neondatabase.github.io/helm-charts + helm repo add neondatabase https://neondatabase.github.io/helm-charts - run: name: Re-deploy proxy command: | From 24d2313d0b8d1b6279f8a01376f55111427c9b19 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 25 May 2022 16:57:45 +0300 Subject: [PATCH 0347/1022] Set --quota-backend-bytes when launching etcd in tests. By default, etcd makes a huge 10 GB mmap() allocation when it starts up. It doesn't actually use that much memory, it's just address space, but it caused me grief when I tried to use 'rr' to debug a python test run. Apparently, when you replay the 'rr' trace, it does allocate memory for all that address space. 
The size of the initial mmap depends on the --quota-backend-bytes setting. Our etcd clusters are very small, so let's set --quota-backend-bytes to keep the virtual memory size small, to make debugging with 'rr' easier. See https://github.com/etcd-io/etcd/issues/7910 and https://github.com/etcd-io/etcd/commit/5e4b0081065925ab9d04009cd4fb559c4cceb304 --- control_plane/src/etcd.rs | 4 ++++ test_runner/fixtures/zenith_fixtures.py | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs index df657dd1be..bc39b7dea3 100644 --- a/control_plane/src/etcd.rs +++ b/control_plane/src/etcd.rs @@ -48,6 +48,10 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { format!("--data-dir={}", etcd_data_dir.display()), format!("--listen-client-urls={client_urls}"), format!("--advertise-client-urls={client_urls}"), + // Set --quota-backend-bytes to keep the etcd virtual memory + // size smaller. Our test etcd clusters are very small. + // See https://github.com/etcd-io/etcd/issues/7910 + "--quota-backend-bytes=100000000".to_string(), ]) .stdout(Stdio::from(etcd_stdout_file)) .stderr(Stdio::from(etcd_stderr_file)) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 8f9bf1c11b..7f5b2ad2aa 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1893,7 +1893,11 @@ class Etcd: f"--data-dir={self.datadir}", f"--listen-client-urls={client_url}", f"--advertise-client-urls={client_url}", - f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}" + f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}", + # Set --quota-backend-bytes to keep the etcd virtual memory + # size smaller. Our test etcd clusters are very small. + # See https://github.com/etcd-io/etcd/issues/7910 + f"--quota-backend-bytes=100000000" ] self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file) From 7997fc2932465b1c8854a64c2c053041eacdf80a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 25 May 2022 18:14:44 +0300 Subject: [PATCH 0348/1022] Fix error handling with 'basebackup' command. If the 'basebackup' command failed in the middle of building the tar archive, the client would not report the error, but would attempt to to start up postgres with the partial contents of the data directory. That fails because the control file is missing (it's added to the archive last, precisly to make sure that you cannot start postgres from a partial archive). But the client doesn't see the proper error message that caused the basebackup to fail in the server, which is confusing. Two issues conspired to cause that: 1. The tar::Builder object that we use in the pageserver to construct the tar stream has a Drop handler that automatically writes a valid end-of-archive marker on drop. Because of that, the resulting tarball looks complete, even if an error happens while we're building it. The pageserver does send an ErrorResponse after the seemingly-valid tarball, but: 2. The client stops reading the Copy stream, as soon as it sees the tar end-of-archive marker. Therefore, it doesn't read the ErrorResponse that comes after it. We have two clients that call 'basebackup', one in `control_plane` used by the `neon_local` binary, and another one in `compute_tools`. Both had the same issue. This PR fixes both issues, even though fixing either one would be enough to fix the problem at hand. 
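The client-side half of the fix is the same in both places: tell the tar reader not to stop at the first end-of-archive marker (two 512-byte blocks of zeros), so that an ErrorResponse sent after the tarball is still read. A minimal sketch of that pattern (the function name and arguments are illustrative; the real changes are in the diffs below):

```
use std::io::Read;
use std::path::Path;

// Illustrative helper: both control_plane and compute_tools now do the
// equivalent of this when unpacking the 'basebackup' CopyOut stream.
fn unpack_basebackup(reader: impl Read, pgdata: &Path) -> anyhow::Result<()> {
    let mut ar = tar::Archive::new(reader);
    // Keep reading past the end-of-archive marker so that an error sent by
    // the pageserver after the tarball is still noticed.
    ar.set_ignore_zeros(true);
    ar.unpack(pgdata)?;
    Ok(())
}
```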
The pageserver now doesn't send the end-of-archive marker on error, and the client now reads the copy stream to the end, even if it sees an end-of-archive marker. Fixes github issue #1715 In the passing, change Basebackup to use generic Write rather than 'dyn'. --- compute_tools/src/compute.rs | 8 +- control_plane/Cargo.toml | 2 +- control_plane/src/compute.rs | 9 +- pageserver/src/basebackup.rs | 90 +++++++++++++++++-- pageserver/src/page_service.rs | 3 +- .../batch_others/test_basebackup_error.py | 20 +++++ 6 files changed, 119 insertions(+), 13 deletions(-) create mode 100644 test_runner/batch_others/test_basebackup_error.py diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a8422fb2b2..fd60b80305 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -146,8 +146,14 @@ impl ComputeNode { _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn), }; let copyreader = client.copy_out(basebackup_cmd.as_str())?; - let mut ar = tar::Archive::new(copyreader); + // Read the archive directly from the `CopyOutReader` + // + // Set `ignore_zeros` so that unpack() reads all the Copy data and + // doesn't stop at the end-of-archive marker. Otherwise, if the server + // sends an Error after finishing the tarball, we will not notice it. + let mut ar = tar::Archive::new(copyreader); + ar.set_ignore_zeros(true); ar.unpack(&self.pgdata)?; self.metrics.basebackup_ms.store( diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 41417aab9a..21311eea9a 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -tar = "0.4.33" +tar = "0.4.38" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } serde = { version = "1.0", features = ["derive"] } serde_with = "1.12.0" diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 350cf74b7c..045acd7519 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -231,8 +231,13 @@ impl PostgresNode { .context("page server 'basebackup' command failed")?; // Read the archive directly from the `CopyOutReader` - tar::Archive::new(copyreader) - .unpack(&self.pgdata()) + // + // Set `ignore_zeros` so that unpack() reads all the Copy data and + // doesn't stop at the end-of-archive marker. Otherwise, if the server + // sends an Error after finishing the tarball, we will not notice it. + let mut ar = tar::Archive::new(copyreader); + ar.set_ignore_zeros(true); + ar.unpack(&self.pgdata()) .context("extracting base backup failed")?; Ok(()) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 92d35130d8..46d824b2e2 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,8 +10,9 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{anyhow, ensure, Context, Result}; +use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{BufMut, BytesMut}; +use fail::fail_point; use std::fmt::Write as FmtWrite; use std::io; use std::io::Write; @@ -30,11 +31,16 @@ use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. 
-pub struct Basebackup<'a> { - ar: Builder<&'a mut dyn Write>, +pub struct Basebackup<'a, W> +where + W: Write, +{ + ar: Builder>, timeline: &'a Arc, pub lsn: Lsn, prev_record_lsn: Lsn, + + finished: bool, } // Create basebackup with non-rel data in it. Omit relational data. @@ -44,12 +50,15 @@ pub struct Basebackup<'a> { // * When working without safekeepers. In this situation it is important to match the lsn // we are taking basebackup on with the lsn that is used in pageserver's walreceiver // to start the replication. -impl<'a> Basebackup<'a> { +impl<'a, W> Basebackup<'a, W> +where + W: Write, +{ pub fn new( - write: &'a mut dyn Write, + write: W, timeline: &'a Arc, req_lsn: Option, - ) -> Result> { + ) -> Result> { // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev). We include prev_record_lsn in the @@ -90,14 +99,15 @@ impl<'a> Basebackup<'a> { ); Ok(Basebackup { - ar: Builder::new(write), + ar: Builder::new(AbortableWrite::new(write)), timeline, lsn: backup_lsn, prev_record_lsn: backup_prev, + finished: false, }) } - pub fn send_tarball(&mut self) -> anyhow::Result<()> { + pub fn send_tarball(mut self) -> anyhow::Result<()> { // Create pgdata subdirs structure for dir in pg_constants::PGDATA_SUBDIRS.iter() { let header = new_tar_header_dir(*dir)?; @@ -135,9 +145,14 @@ impl<'a> Basebackup<'a> { self.add_twophase_file(xid)?; } + fail_point!("basebackup-before-control-file", |_| { + bail!("failpoint basebackup-before-control-file") + }); + // Generate pg_control and bootstrap WAL segment. self.add_pgcontrol_file()?; self.ar.finish()?; + self.finished = true; debug!("all tarred up!"); Ok(()) } @@ -331,6 +346,19 @@ impl<'a> Basebackup<'a> { } } +impl<'a, W> Drop for Basebackup<'a, W> +where + W: Write, +{ + /// If the basebackup was not finished, prevent the Archive::drop() from + /// writing the end-of-archive marker. + fn drop(&mut self) { + if !self.finished { + self.ar.get_mut().abort(); + } + } +} + // // Create new tarball entry header // @@ -366,3 +394,49 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result
{ header.set_cksum(); Ok(header) } + +/// A wrapper that passes through all data to the underlying Write, +/// until abort() is called. +/// +/// tar::Builder has an annoying habit of finishing the archive with +/// a valid tar end-of-archive marker (two 512-byte sectors of zeros), +/// even if an error occurs and we don't finish building the archive. +/// We'd rather abort writing the tarball immediately than construct +/// a seemingly valid but incomplete archive. This wrapper allows us +/// to swallow the end-of-archive marker that Builder::drop() emits, +/// without writing it to the underlying sink. +/// +struct AbortableWrite { + w: W, + aborted: bool, +} + +impl AbortableWrite { + pub fn new(w: W) -> Self { + AbortableWrite { w, aborted: false } + } + + pub fn abort(&mut self) { + self.aborted = true; + } +} + +impl Write for AbortableWrite +where + W: Write, +{ + fn write(&mut self, data: &[u8]) -> io::Result { + if self.aborted { + Ok(data.len()) + } else { + self.w.write(data) + } + } + fn flush(&mut self) -> io::Result<()> { + if self.aborted { + Ok(()) + } else { + self.w.flush() + } + } +} diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 03264c9782..f54cd550b3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -593,7 +593,8 @@ impl PageServerHandler { /* Send a tarball of the latest layer on the timeline */ { let mut writer = CopyDataSink { pgb }; - let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?; + + let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?; span.record("lsn", &basebackup.lsn.to_string().as_str()); basebackup.send_tarball()?; } diff --git a/test_runner/batch_others/test_basebackup_error.py b/test_runner/batch_others/test_basebackup_error.py new file mode 100644 index 0000000000..4b8b8a746c --- /dev/null +++ b/test_runner/batch_others/test_basebackup_error.py @@ -0,0 +1,20 @@ +import pytest +from contextlib import closing + +from fixtures.zenith_fixtures import ZenithEnv +from fixtures.log_helper import log + + +# +# Test error handling, if the 'basebackup' command fails in the middle +# of building the tar archive. +# +def test_basebackup_error(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + env.zenith_cli.create_branch("test_basebackup_error", "empty") + + # Introduce failpoint + env.pageserver.safe_psql(f"failpoints basebackup-before-control-file=return") + + with pytest.raises(Exception, match="basebackup-before-control-file"): + pg = env.postgres.create_start('test_basebackup_error') From c584d90bb96bb7bd390bc5345ec8f667e765c299 Mon Sep 17 00:00:00 2001 From: chaitanya sharma <86035+phoenix24@users.noreply.github.com> Date: Mon, 23 May 2022 15:52:21 +0000 Subject: [PATCH 0349/1022] initial commit, renamed znodeid to nodeid. 
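The renamed type itself is unchanged: a transparent wrapper around a u64 that identifies a pageserver or safekeeper node. Its shape, as visible in the libs/utils/src/zid.rs hunk below:

```
use serde::{Deserialize, Serialize};
use std::fmt;

#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)]
#[serde(transparent)]
pub struct NodeId(pub u64);

impl fmt::Display for NodeId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.0)
    }
}
```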
--- control_plane/src/local_env.rs | 10 +++++----- control_plane/src/safekeeper.rs | 8 ++++---- libs/etcd_broker/src/lib.rs | 16 ++++++++-------- libs/utils/src/zid.rs | 4 ++-- neon_local/src/main.rs | 10 +++++----- pageserver/src/config.rs | 16 ++++++++-------- pageserver/src/http/models.rs | 4 ++-- safekeeper/src/bin/safekeeper.rs | 12 ++++++------ safekeeper/src/broker.rs | 4 ++-- safekeeper/src/http/models.rs | 4 ++-- safekeeper/src/http/routes.rs | 6 +++--- safekeeper/src/lib.rs | 6 +++--- safekeeper/src/safekeeper.rs | 18 +++++++++--------- safekeeper/src/timeline.rs | 10 +++++----- 14 files changed, 64 insertions(+), 64 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index c73af7d338..015b33f591 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -15,7 +15,7 @@ use std::process::{Command, Stdio}; use utils::{ auth::{encode_from_key_file, Claims, Scope}, postgres_backend::AuthType, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use crate::safekeeper::SafekeeperNode; @@ -136,7 +136,7 @@ impl EtcdBroker { #[serde(default)] pub struct PageServerConf { // node id - pub id: ZNodeId, + pub id: NodeId, // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, @@ -151,7 +151,7 @@ pub struct PageServerConf { impl Default for PageServerConf { fn default() -> Self { Self { - id: ZNodeId(0), + id: NodeId(0), listen_pg_addr: String::new(), listen_http_addr: String::new(), auth_type: AuthType::Trust, @@ -163,7 +163,7 @@ impl Default for PageServerConf { #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct SafekeeperConf { - pub id: ZNodeId, + pub id: NodeId, pub pg_port: u16, pub http_port: u16, pub sync: bool, @@ -172,7 +172,7 @@ pub struct SafekeeperConf { impl Default for SafekeeperConf { fn default() -> Self { Self { - id: ZNodeId(0), + id: NodeId(0), pg_port: 0, http_port: 0, sync: true, diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index d5b6251209..303d6850df 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -18,7 +18,7 @@ use thiserror::Error; use utils::{ connstring::connection_address, http::error::HttpErrorBody, - zid::{ZNodeId, ZTenantId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTimelineId}, }; use crate::local_env::{LocalEnv, SafekeeperConf}; @@ -65,7 +65,7 @@ impl ResponseErrorMessageExt for Response { // #[derive(Debug)] pub struct SafekeeperNode { - pub id: ZNodeId, + pub id: NodeId, pub conf: SafekeeperConf, @@ -100,7 +100,7 @@ impl SafekeeperNode { .unwrap() } - pub fn datadir_path_by_id(env: &LocalEnv, sk_id: ZNodeId) -> PathBuf { + pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref()) } @@ -286,7 +286,7 @@ impl SafekeeperNode { &self, tenant_id: ZTenantId, timeline_id: ZTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result<()> { Ok(self .http_request( diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 76181f9ba1..271f657f43 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -16,7 +16,7 @@ use tokio::{sync::mpsc, task::JoinHandle}; use tracing::*; use utils::{ lsn::Lsn, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId}, }; /// Default value to use for prefixing to all etcd keys with. 
@@ -25,7 +25,7 @@ pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon"; #[derive(Debug, Deserialize, Serialize)] struct SafekeeperTimeline { - safekeeper_id: ZNodeId, + safekeeper_id: NodeId, info: SkTimelineInfo, } @@ -71,7 +71,7 @@ pub enum BrokerError { /// A way to control the data retrieval from a certain subscription. pub struct SkTimelineSubscription { safekeeper_timeline_updates: - mpsc::UnboundedReceiver>>, + mpsc::UnboundedReceiver>>, kind: SkTimelineSubscriptionKind, watcher_handle: JoinHandle>, watcher: Watcher, @@ -81,7 +81,7 @@ impl SkTimelineSubscription { /// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet. pub async fn fetch_data( &mut self, - ) -> Option>> { + ) -> Option>> { self.safekeeper_timeline_updates.recv().await } @@ -221,7 +221,7 @@ pub async fn subscribe_to_safekeeper_timeline_updates( break; } - let mut timeline_updates: HashMap> = HashMap::new(); + let mut timeline_updates: HashMap> = HashMap::new(); // Keep track that the timeline data updates from etcd arrive in the right order. // https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas // > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering. @@ -299,18 +299,18 @@ fn parse_etcd_key_value( parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?, parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?, ), - ZNodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?), + NodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?), ), SubscriptionKind::Tenant(tenant_id) => ( ZTenantTimelineId::new( tenant_id, parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?, ), - ZNodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?), + NodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?), ), SubscriptionKind::Timeline(zttid) => ( zttid, - ZNodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?), + NodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?), ), }; diff --git a/libs/utils/src/zid.rs b/libs/utils/src/zid.rs index 44d81cda50..02f781c49a 100644 --- a/libs/utils/src/zid.rs +++ b/libs/utils/src/zid.rs @@ -226,9 +226,9 @@ impl fmt::Display for ZTenantTimelineId { // by the console. #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)] #[serde(transparent)] -pub struct ZNodeId(pub u64); +pub struct NodeId(pub u64); -impl fmt::Display for ZNodeId { +impl fmt::Display for NodeId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.0) } diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index f04af9cfdd..8d39fe5d0d 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -22,14 +22,14 @@ use utils::{ lsn::Lsn, postgres_backend::AuthType, project_git_version, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use pageserver::timelines::TimelineInfo; // Default id of a safekeeper node, if not specified on the command line. 
-const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1); -const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1); +const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1); +const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); @@ -860,7 +860,7 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn get_safekeeper(env: &local_env::LocalEnv, id: ZNodeId) -> Result { +fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result { if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) { Ok(SafekeeperNode::from_env(env, node)) } else { @@ -876,7 +876,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul // All the commands take an optional safekeeper name argument let sk_id = if let Some(id_str) = sub_args.value_of("id") { - ZNodeId(id_str.parse().context("while parsing safekeeper id")?) + NodeId(id_str.parse().context("while parsing safekeeper id")?) } else { DEFAULT_SAFEKEEPER_ID }; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index a9215c0701..6c045d77ae 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -16,7 +16,7 @@ use toml_edit::{Document, Item}; use url::Url; use utils::{ postgres_backend::AuthType, - zid::{ZNodeId, ZTenantId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTimelineId}, }; use crate::layered_repository::TIMELINES_SEGMENT_NAME; @@ -78,7 +78,7 @@ pub mod defaults { pub struct PageServerConf { // Identifier of that particular pageserver so e g safekeepers // can safely distinguish different pageservers - pub id: ZNodeId, + pub id: NodeId, /// Example (default): 127.0.0.1:64000 pub listen_pg_addr: String, @@ -180,7 +180,7 @@ struct PageServerConfigBuilder { auth_validation_public_key_path: BuilderValue>, remote_storage_config: BuilderValue>, - id: BuilderValue, + id: BuilderValue, profiling: BuilderValue, broker_etcd_prefix: BuilderValue, @@ -276,7 +276,7 @@ impl PageServerConfigBuilder { self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix) } - pub fn id(&mut self, node_id: ZNodeId) { + pub fn id(&mut self, node_id: NodeId) { self.id = BuilderValue::Set(node_id) } @@ -399,7 +399,7 @@ impl PageServerConf { "tenant_config" => { t_conf = Self::parse_toml_tenant_conf(item)?; } - "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)), + "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), "profiling" => builder.profiling(parse_toml_from_str(key, item)?), "broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?), "broker_endpoints" => builder.broker_endpoints( @@ -550,7 +550,7 @@ impl PageServerConf { #[cfg(test)] pub fn dummy_conf(repo_dir: PathBuf) -> Self { PageServerConf { - id: ZNodeId(0), + id: NodeId(0), wait_lsn_timeout: Duration::from_secs(60), wal_redo_timeout: Duration::from_secs(60), page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, @@ -693,7 +693,7 @@ id = 10 assert_eq!( parsed_config, PageServerConf { - id: ZNodeId(10), + id: NodeId(10), listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, @@ -737,7 +737,7 @@ id = 10 assert_eq!( parsed_config, PageServerConf { - id: ZNodeId(10), + id: NodeId(10), listen_pg_addr: "127.0.0.1:64000".to_string(), listen_http_addr: "127.0.0.1:9898".to_string(), wait_lsn_timeout: Duration::from_secs(111), diff --git a/pageserver/src/http/models.rs 
b/pageserver/src/http/models.rs index e9aaa72416..e00ccda2a1 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use utils::{ lsn::Lsn, - zid::{ZNodeId, ZTenantId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTimelineId}, }; #[serde_as] @@ -42,7 +42,7 @@ pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId #[derive(Serialize)] pub struct StatusResponse { - pub id: ZNodeId, + pub id: NodeId, } impl TenantCreateRequest { diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index a5ffc013e2..290b7c738a 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -24,7 +24,7 @@ use safekeeper::{broker, callmemaybe}; use safekeeper::{http, s3_offload}; use utils::{ http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener, - zid::ZNodeId, + zid::NodeId, }; const LOCK_FILE_NAME: &str = "safekeeper.lock"; @@ -167,7 +167,7 @@ fn main() -> anyhow::Result<()> { let mut given_id = None; if let Some(given_id_str) = arg_matches.value_of("id") { - given_id = Some(ZNodeId( + given_id = Some(NodeId( given_id_str .parse() .context("failed to parse safekeeper id")?, @@ -192,7 +192,7 @@ fn main() -> anyhow::Result<()> { start_safekeeper(conf, given_id, arg_matches.is_present("init")) } -fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { +fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { let log_file = logging::init("safekeeper.log", conf.daemonize)?; info!("version: {GIT_VERSION}"); @@ -345,14 +345,14 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b } /// Determine safekeeper id and set it in config. -fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { +fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { let id_file_path = conf.workdir.join(ID_FILE_NAME); - let my_id: ZNodeId; + let my_id: NodeId; // If ID exists, read it in; otherwise set one passed match fs::read(&id_file_path) { Ok(id_serialized) => { - my_id = ZNodeId( + my_id = NodeId( std::str::from_utf8(&id_serialized) .context("failed to parse safekeeper id")? 
.parse() diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index d7217be20a..59d282d378 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -12,7 +12,7 @@ use tokio::{runtime, time::sleep}; use tracing::*; use crate::{timeline::GlobalTimelines, SafeKeeperConf}; -use utils::zid::{ZNodeId, ZTenantTimelineId}; +use utils::zid::{NodeId, ZTenantTimelineId}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; @@ -36,7 +36,7 @@ pub fn thread_main(conf: SafeKeeperConf) { fn timeline_safekeeper_path( broker_etcd_prefix: String, zttid: ZTenantTimelineId, - sk_id: ZNodeId, + sk_id: NodeId, ) -> String { format!( "{}/{sk_id}", diff --git a/safekeeper/src/http/models.rs b/safekeeper/src/http/models.rs index ca18e64096..77efc0cc21 100644 --- a/safekeeper/src/http/models.rs +++ b/safekeeper/src/http/models.rs @@ -1,9 +1,9 @@ use serde::{Deserialize, Serialize}; -use utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; +use utils::zid::{NodeId, ZTenantId, ZTimelineId}; #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { pub tenant_id: ZTenantId, pub timeline_id: ZTimelineId, - pub peer_ids: Vec, + pub peer_ids: Vec, } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 62fbd2ff2f..3f6ade970d 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -20,14 +20,14 @@ use utils::{ RequestExt, RouterBuilder, }, lsn::Lsn, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use super::models::TimelineCreateRequest; #[derive(Debug, Serialize)] struct SafekeeperStatus { - id: ZNodeId, + id: NodeId, } /// Healthcheck handler. @@ -178,7 +178,7 @@ async fn record_safekeeper_info(mut request: Request) -> Result, pub recall_period: Duration, - pub my_id: ZNodeId, + pub my_id: NodeId, pub broker_endpoints: Vec, pub broker_etcd_prefix: String, pub s3_offload_enabled: bool, @@ -79,7 +79,7 @@ impl Default for SafeKeeperConf { listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), ttl: None, recall_period: defaults::DEFAULT_RECALL_PERIOD, - my_id: ZNodeId(0), + my_id: NodeId(0), broker_endpoints: Vec::new(), broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), s3_offload_enabled: true, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index fff1c269b6..b8b969929d 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -26,7 +26,7 @@ use utils::{ bin_ser::LeSer, lsn::Lsn, pq_proto::{SystemId, ZenithFeedback}, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; pub const SK_MAGIC: u32 = 0xcafeceefu32; @@ -164,7 +164,7 @@ impl PeerInfo { // vector-based node id -> peer state map with very limited functionality we // need/ #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Peers(pub Vec<(ZNodeId, PeerInfo)>); +pub struct Peers(pub Vec<(NodeId, PeerInfo)>); /// Persistent information stored on safekeeper node /// On disk data is prefixed by magic and format version and followed by checksum. 
@@ -224,7 +224,7 @@ pub struct SafekeeperMemState { } impl SafeKeeperState { - pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { + pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { SafeKeeperState { tenant_id: zttid.tenant_id, timeline_id: zttid.timeline_id, @@ -277,7 +277,7 @@ pub struct ProposerGreeting { #[derive(Debug, Serialize)] pub struct AcceptorGreeting { term: u64, - node_id: ZNodeId, + node_id: NodeId, } /// Vote request sent from proposer to safekeepers @@ -531,7 +531,7 @@ pub struct SafeKeeper { pub wal_store: WAL, - node_id: ZNodeId, // safekeeper's node id + node_id: NodeId, // safekeeper's node id } impl SafeKeeper @@ -544,7 +544,7 @@ where ztli: ZTimelineId, state: CTRL, mut wal_store: WAL, - node_id: ZNodeId, + node_id: NodeId, ) -> Result> { if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id { bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id); @@ -1013,7 +1013,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -1028,7 +1028,7 @@ mod tests { let storage = InMemoryState { persisted_state: state, }; - sk = SafeKeeper::new(ztli, storage, sk.wal_store, ZNodeId(0)).unwrap(); + sk = SafeKeeper::new(ztli, storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -1045,7 +1045,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 2bb7771aac..0953439bd8 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -21,7 +21,7 @@ use tracing::*; use utils::{ lsn::Lsn, pq_proto::ZenithFeedback, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId}, }; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; @@ -99,7 +99,7 @@ impl SharedState { fn create( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result { let state = SafeKeeperState::new(zttid, peer_ids); let control_store = control_file::FileStorage::create_new(zttid, conf, state)?; @@ -448,7 +448,7 @@ impl Timeline { } /// Update timeline state with peer safekeeper data. 
- pub fn record_safekeeper_info(&self, sk_info: &SkTimelineInfo, _sk_id: ZNodeId) -> Result<()> { + pub fn record_safekeeper_info(&self, sk_info: &SkTimelineInfo, _sk_id: NodeId) -> Result<()> { let mut shared_state = self.mutex.lock().unwrap(); shared_state.sk.record_safekeeper_info(sk_info)?; self.notify_wal_senders(&mut shared_state); @@ -551,7 +551,7 @@ impl GlobalTimelines { mut state: MutexGuard, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result> { match state.timelines.get(&zttid) { Some(_) => bail!("timeline {} already exists", zttid), @@ -576,7 +576,7 @@ impl GlobalTimelines { pub fn create( conf: &SafeKeeperConf, zttid: ZTenantTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result> { let state = TIMELINES_STATE.lock().unwrap(); GlobalTimelines::create_internal(state, conf, zttid, peer_ids) From 887b0e14d9285bdf64eab3e44eb7000cdb55b44b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 25 May 2022 21:07:49 +0300 Subject: [PATCH 0350/1022] Run basic checks on PRs and pushes to main only --- .github/workflows/testing.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 79b2ba05d0..281c893403 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -1,8 +1,10 @@ name: Build and Test on: - pull_request: push: + branches: + - main + pull_request: jobs: regression-check: From 06f5e017a1b0d380e0e082e906cd52b7a885b100 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 25 May 2022 21:12:17 +0300 Subject: [PATCH 0351/1022] Move rustfmt check to GH Action --- .circleci/config.yml | 10 ---------- .github/workflows/testing.yml | 6 +++++- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5346e35c01..624d367053 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -11,15 +11,6 @@ executors: - image: zimg/rust:1.58 jobs: - check-codestyle-rust: - executor: neon-xlarge-executor - steps: - - checkout - - run: - name: rustfmt - when: always - command: cargo fmt --all -- --check - # A job to build postgres build-postgres: executor: neon-xlarge-executor @@ -740,7 +731,6 @@ jobs: workflows: build_and_test: jobs: - - check-codestyle-rust - check-codestyle-python - build-postgres: name: build-postgres-<< matrix.build_type >> diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 281c893403..1ce1b64a49 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -25,13 +25,17 @@ jobs: submodules: true fetch-depth: 2 - - name: install rust toolchain ${{ matrix.rust_toolchain }} + - name: Install rust toolchain ${{ matrix.rust_toolchain }} uses: actions-rs/toolchain@v1 with: profile: minimal toolchain: ${{ matrix.rust_toolchain }} + components: rustfmt, clippy override: true + - name: Check formatting + run: cargo fmt --all -- --check + - name: Install Ubuntu postgres dependencies if: matrix.os == 'ubuntu-latest' run: | From 5a5737278e637245d0b7b89a20b47040d2572a0e Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 25 May 2022 23:10:44 +0300 Subject: [PATCH 0352/1022] add simple metrics for remote storage operations track number of operations and number of their failures --- Cargo.lock | 2 + libs/remote_storage/Cargo.toml | 11 ++- libs/remote_storage/src/s3_bucket.rs | 109 +++++++++++++++++++++++++-- 3 files changed, 113 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 
6acad6dac8..840953f645 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2394,6 +2394,8 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "metrics", + "once_cell", "rusoto_core", "rusoto_s3", "serde", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 291f6e50ac..5c62e28fda 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -5,14 +5,17 @@ edition = "2021" [dependencies] anyhow = { version = "1.0", features = ["backtrace"] } -tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] } -tokio-util = { version = "0.7", features = ["io"] } -tracing = "0.1.27" +async-trait = "0.1" + +metrics = { version = "0.1", path = "../metrics" } +once_cell = "1.8.0" rusoto_core = "0.48" rusoto_s3 = "0.48" serde = { version = "1.0", features = ["derive"] } serde_json = "1" -async-trait = "0.1" +tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] } +tokio-util = { version = "0.7", features = ["io"] } +tracing = "0.1.27" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 01aaf7ca7e..80d6966494 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -23,6 +23,71 @@ use crate::{strip_path_prefix, RemoteStorage, S3Config}; use super::StorageMetadata; +pub(super) mod metrics { + use metrics::{register_int_counter_vec, IntCounterVec}; + use once_cell::sync::Lazy; + + static S3_REQUESTS_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "remote_storage_s3_requests_count", + "Number of s3 requests of particular type", + &["request_type"], + ) + .expect("failed to define a metric") + }); + + static S3_REQUESTS_FAIL_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "remote_storage_s3_failures_count", + "Number of failed s3 requests of particular type", + &["request_type"], + ) + .expect("failed to define a metric") + }); + + pub fn inc_get_object() { + S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc(); + } + + pub fn inc_get_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["get_object"]) + .inc(); + } + + pub fn inc_put_object() { + S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc(); + } + + pub fn inc_put_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["put_object"]) + .inc(); + } + + pub fn inc_delete_object() { + S3_REQUESTS_COUNT + .with_label_values(&["delete_object"]) + .inc(); + } + + pub fn inc_delete_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["delete_object"]) + .inc(); + } + + pub fn inc_list_objects() { + S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc(); + } + + pub fn inc_list_objects_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["list_objects"]) + .inc(); + } +} + const S3_PREFIX_SEPARATOR: char = '/'; #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)] @@ -152,6 +217,9 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 list")?; + + metrics::inc_list_objects(); + let fetch_response = self .client .list_objects_v2(ListObjectsV2Request { @@ -160,7 +228,11 @@ impl RemoteStorage for S3Bucket { continuation_token, ..ListObjectsV2Request::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_list_objects_fail(); + e + })?; document_keys.extend( fetch_response .contents @@ -190,6 +262,8 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency 
limiter semaphore got closed during S3 upload")?; + + metrics::inc_put_object(); self.client .put_object(PutObjectRequest { body: Some(StreamingBody::new_with_size( @@ -201,7 +275,11 @@ impl RemoteStorage for S3Bucket { metadata: metadata.map(|m| m.0), ..PutObjectRequest::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_put_object_fail(); + e + })?; Ok(()) } @@ -215,6 +293,9 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 download")?; + + metrics::inc_get_object(); + let object_output = self .client .get_object(GetObjectRequest { @@ -222,7 +303,11 @@ impl RemoteStorage for S3Bucket { key: from.key().to_owned(), ..GetObjectRequest::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_get_object_fail(); + e + })?; if let Some(body) = object_output.body { let mut from = io::BufReader::new(body.into_async_read()); @@ -251,6 +336,9 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 range download")?; + + metrics::inc_get_object(); + let object_output = self .client .get_object(GetObjectRequest { @@ -259,7 +347,11 @@ impl RemoteStorage for S3Bucket { range, ..GetObjectRequest::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_get_object_fail(); + e + })?; if let Some(body) = object_output.body { let mut from = io::BufReader::new(body.into_async_read()); @@ -275,13 +367,20 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 delete")?; + + metrics::inc_delete_object(); + self.client .delete_object(DeleteObjectRequest { bucket: self.bucket_name.clone(), key: path.key().to_owned(), ..DeleteObjectRequest::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_delete_object_fail(); + e + })?; Ok(()) } } From 38f2d165b778834d927ed6c549c3285ecfbbe576 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 26 May 2022 12:06:05 +0300 Subject: [PATCH 0353/1022] allow TLS 1.2 in proxy to be compatible with older client libraries --- proxy/src/config.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 077a07beb9..6f1b56bfe4 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -61,7 +61,8 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result Date: Thu, 19 May 2022 14:27:28 +0300 Subject: [PATCH 0354/1022] Initialize last_freeze_at with disk consistent LSN to avoid creation of small L0 delta layer on startup refer #1736 --- pageserver/src/layered_repository.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index a83907430e..d10c795214 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1230,7 +1230,7 @@ impl LayeredTimeline { }), disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), - last_freeze_at: AtomicLsn::new(0), + last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0), ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), From 72a7220dc8c7a247ea411f3e381c8710f99617b7 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 26 May 2022 16:48:32 +0300 Subject: [PATCH 0355/1022] Tidy up some log messages * turn println into an info with proper message * rename new_local_timeline to load_local_timeline because it does not create new timeline, it registers timeline that exists on disk in 
pageserver in-memory structures --- pageserver/src/tenant_mgr.rs | 10 +++++----- pageserver/src/timelines.rs | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index bbe66d7f80..bba67394c3 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -327,8 +327,8 @@ pub fn get_local_timeline_with_load( return Ok(Arc::clone(page_tline)); } - let page_tline = new_local_timeline(&tenant.repo, timeline_id) - .with_context(|| format!("Failed to create new local timeline for tenant {tenant_id}"))?; + let page_tline = load_local_timeline(&tenant.repo, timeline_id) + .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?; tenant .local_timelines .insert(timeline_id, Arc::clone(&page_tline)); @@ -365,7 +365,7 @@ pub fn detach_timeline( Ok(()) } -fn new_local_timeline( +fn load_local_timeline( repo: &RepositoryImpl, timeline_id: ZTimelineId, ) -> anyhow::Result>> { @@ -458,8 +458,8 @@ fn apply_timeline_remote_sync_status_updates( bail!("Local timeline {timeline_id} already registered") } Entry::Vacant(v) => { - v.insert(new_local_timeline(repo, timeline_id).with_context(|| { - format!("Failed to register new local timeline for tenant {tenant_id}") + v.insert(load_local_timeline(repo, timeline_id).with_context(|| { + format!("Failed to register add local timeline for tenant {tenant_id}") })?); } }, diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index eadf5bf4e0..408eca6501 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -302,8 +302,8 @@ fn bootstrap_timeline( import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?; page_tline.tline.checkpoint(CheckpointConfig::Forced)?; - println!( - "created initial timeline {} timeline.lsn {}", + info!( + "created root timeline {} timeline.lsn {}", tli, page_tline.tline.get_last_record_lsn() ); From 7d565aa4b93836127de209eca5ceb1a98167b4f7 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Thu, 26 May 2022 12:21:15 -0400 Subject: [PATCH 0356/1022] Reduce the logging level when PG client disconnected to `INFO` (#1713) Fixes #1683. --- pageserver/src/page_service.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f54cd550b3..1c07b63072 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -305,7 +305,29 @@ fn page_service_conn_main( let mut conn_handler = PageServerHandler::new(conf, auth); let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?; - pgbackend.run(&mut conn_handler) + match pgbackend.run(&mut conn_handler) { + Ok(()) => { + // we've been requested to shut down + Ok(()) + } + Err(err) => { + let root_cause_io_err_kind = err + .root_cause() + .downcast_ref::() + .map(|e| e.kind()); + + // `ConnectionReset` error happens when the Postgres client closes the connection. + // As this disconnection happens quite often and is expected, + // we decided to downgrade the logging level to `INFO`. + // See: https://github.com/neondatabase/neon/issues/1683. 
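// As a standalone sketch of the check performed in this hunk (helper name is
// illustrative, not part of the patch): peel the anyhow::Error down to its root
// cause and compare the io::ErrorKind.
fn is_client_disconnect(err: &anyhow::Error) -> bool {
    err.root_cause()
        .downcast_ref::<std::io::Error>()
        .map(|io_err| io_err.kind() == std::io::ErrorKind::ConnectionReset)
        .unwrap_or(false)
}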
+ if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) { + info!("Postgres client disconnected"); + Ok(()) + } else { + Err(err) + } + } + } } #[derive(Debug)] From 1d71949c51f06cd0eaf313f0ac595af3209ef57a Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 26 May 2022 14:59:03 -0400 Subject: [PATCH 0357/1022] Change proxy welcome message (#1808) Remove zenith sun and outdated instructions around .pgpass --- proxy/src/auth_backend/link.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/proxy/src/auth_backend/link.rs b/proxy/src/auth_backend/link.rs index 9bdb9e21c4..8e5fcb32a9 100644 --- a/proxy/src/auth_backend/link.rs +++ b/proxy/src/auth_backend/link.rs @@ -5,12 +5,9 @@ use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; fn hello_message(redirect_uri: &str, session_id: &str) -> String { format!( concat![ - "☀️ Welcome to Neon!\n", - "To proceed with database creation, open the following link:\n\n", + "Welcome to Neon!\n", + "Authenticate by visiting:\n", " {redirect_uri}{session_id}\n\n", - "It needs to be done once and we will send you '.pgpass' file,\n", - "which will allow you to access or create ", - "databases without opening your web browser." ], redirect_uri = redirect_uri, session_id = session_id, From 0e1bd57c533165dbe4bead8fa23baefa09c97b82 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 27 Apr 2022 00:24:59 -0700 Subject: [PATCH 0358/1022] Add WAL offloading to s3 on safekeepers. Separate task is launched for each timeline and stopped when timeline doesn't need offloading. Decision who offloads is done through etcd leader election; currently there is no pre condition for participating, that's a TODO. neon_local and tests infrastructure for remote storage in safekeepers added, along with the test itself. 
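As a rough sketch of the election-based decision described above: the Election/ElectionLeader helpers this patch adds to broker.rs are intended to be used by the offloading task roughly as follows (the actual caller lives in the new wal_backup.rs and may differ in detail; the function name here is illustrative):

use crate::broker;

async fn backup_if_elected(election: broker::Election) -> anyhow::Result<()> {
    // Campaign for leadership of this timeline's backup election.
    let mut leader = broker::get_leader(&election).await?;
    if leader
        .check_am_i(election.election_name.clone(), election.candidate_name.clone())
        .await?
    {
        // ... this safekeeper uploads completed WAL segments to remote storage ...
    }
    // Resignation is left to lease expiration; just stop keeping the lease alive.
    leader.give_up().await;
    Ok(())
}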
ref #1009 Co-authored-by: Anton Shyrabokau --- .circleci/ansible/production.hosts | 1 - .circleci/ansible/staging.hosts | 1 - .circleci/ansible/systemd/safekeeper.service | 2 +- Cargo.lock | 9 +- control_plane/src/lib.rs | 9 + control_plane/src/local_env.rs | 5 + control_plane/src/safekeeper.rs | 10 +- control_plane/src/storage.rs | 11 +- libs/etcd_broker/src/lib.rs | 4 +- libs/remote_storage/Cargo.toml | 2 +- libs/remote_storage/src/lib.rs | 88 +++- libs/utils/src/lsn.rs | 9 + pageserver/src/config.rs | 87 +--- safekeeper/Cargo.toml | 4 + safekeeper/src/bin/safekeeper.rs | 72 +-- safekeeper/src/broker.rs | 129 +++++- safekeeper/src/control_file_upgrade.rs | 8 +- safekeeper/src/http/routes.rs | 18 +- safekeeper/src/lib.rs | 15 +- safekeeper/src/receive_wal.rs | 22 +- safekeeper/src/remove_wal.rs | 2 +- safekeeper/src/s3_offload.rs | 107 ----- safekeeper/src/safekeeper.rs | 69 ++- safekeeper/src/send_wal.rs | 2 +- safekeeper/src/timeline.rs | 307 +++++++++---- safekeeper/src/wal_backup.rs | 418 ++++++++++++++++++ test_runner/batch_others/test_wal_acceptor.py | 54 ++- test_runner/fixtures/zenith_fixtures.py | 110 +++-- 28 files changed, 1146 insertions(+), 429 deletions(-) delete mode 100644 safekeeper/src/s3_offload.rs create mode 100644 safekeeper/src/wal_backup.rs diff --git a/.circleci/ansible/production.hosts b/.circleci/ansible/production.hosts index 6cefd724d8..03c6cf57e0 100644 --- a/.circleci/ansible/production.hosts +++ b/.circleci/ansible/production.hosts @@ -16,4 +16,3 @@ console_mgmt_base_url = http://console-release.local bucket_name = zenith-storage-oregon bucket_region = us-west-2 etcd_endpoints = etcd-release.local:2379 -safekeeper_enable_s3_offload = false diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index d99ffa6dac..cf5b98eaa1 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -17,4 +17,3 @@ console_mgmt_base_url = http://console-staging.local bucket_name = zenith-staging-storage-us-east-1 bucket_region = us-east-1 etcd_endpoints = etcd-staging.local:2379 -safekeeper_enable_s3_offload = false diff --git a/.circleci/ansible/systemd/safekeeper.service b/.circleci/ansible/systemd/safekeeper.service index 55088db859..a6b443c3e7 100644 --- a/.circleci/ansible/systemd/safekeeper.service +++ b/.circleci/ansible/systemd/safekeeper.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=safekeeper Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib -ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --enable-s3-offload={{ safekeeper_enable_s3_offload }} +ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote_storage='{bucket_name={{bucket_name}}, bucket_region={{bucket_region}}, prefix_in_bucket=wal}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT diff --git a/Cargo.lock b/Cargo.lock index 840953f645..e39375c221 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1722,9 +1722,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" +checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" [[package]] name = "oorandom" @@ -2403,6 +2403,7 @@ dependencies = [ "tempfile", "tokio", "tokio-util 0.7.0", + "toml_edit", "tracing", "workspace_hack", ] @@ -2654,6 +2655,7 @@ name = "safekeeper" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", "byteorder", "bytes", "clap 3.0.14", @@ -2662,12 +2664,14 @@ dependencies = [ "daemonize", "etcd_broker", "fs2", + "futures", "git-version", "hex", "humantime", "hyper", "lazy_static", "metrics", + "once_cell", "postgres", "postgres-protocol", "postgres_ffi", @@ -2681,6 +2685,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-util 0.7.0", + "toml_edit", "tracing", "url", "utils", diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index c3469c3350..4dfca588ad 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -49,3 +49,12 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { cmd } } + +fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { + for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] { + if let Ok(value) = std::env::var(env_key) { + cmd = cmd.env(env_key, value); + } + } + cmd +} diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 015b33f591..2623f65242 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -167,6 +167,8 @@ pub struct SafekeeperConf { pub pg_port: u16, pub http_port: u16, pub sync: bool, + pub remote_storage: Option, + pub backup_threads: Option, } impl Default for SafekeeperConf { @@ -176,6 +178,8 @@ impl Default for SafekeeperConf { pg_port: 0, http_port: 0, sync: true, + remote_storage: None, + backup_threads: None, } } } @@ -377,6 +381,7 @@ impl LocalEnv { base_path != Path::new(""), "repository base path is missing" ); + ensure!( !base_path.exists(), "directory '{}' already exists. 
Perhaps already initialized?", diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 303d6850df..972b6d48ae 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -23,7 +23,7 @@ use utils::{ use crate::local_env::{LocalEnv, SafekeeperConf}; use crate::storage::PageServerNode; -use crate::{fill_rust_env_vars, read_pidfile}; +use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile}; #[derive(Error, Debug)] pub enum SafekeeperHttpError { @@ -143,6 +143,14 @@ impl SafekeeperNode { if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() { cmd.args(&["--broker-etcd-prefix", prefix]); } + if let Some(threads) = self.conf.backup_threads { + cmd.args(&["--backup-threads", threads.to_string().as_ref()]); + } + if let Some(ref remote_storage) = self.conf.remote_storage { + cmd.args(&["--remote-storage", remote_storage]); + } + + fill_aws_secrets_vars(&mut cmd); if !cmd.status()?.success() { bail!( diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 355c7c250d..24cdbce8f3 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -25,7 +25,7 @@ use utils::{ }; use crate::local_env::LocalEnv; -use crate::{fill_rust_env_vars, read_pidfile}; +use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile}; use pageserver::tenant_mgr::TenantInfo; #[derive(Error, Debug)] @@ -493,12 +493,3 @@ impl PageServerNode { Ok(timeline_info_response) } } - -fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { - for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] { - if let Ok(value) = std::env::var(env_key) { - cmd = cmd.env(env_key, value); - } - } - cmd -} diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 271f657f43..7fe142502b 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -43,10 +43,10 @@ pub struct SkTimelineInfo { #[serde_as(as = "Option")] #[serde(default)] pub commit_lsn: Option, - /// LSN up to which safekeeper offloaded WAL to s3. + /// LSN up to which safekeeper has backed WAL. #[serde_as(as = "Option")] #[serde(default)] - pub s3_wal_lsn: Option, + pub backup_lsn: Option, /// LSN of last checkpoint uploaded by pageserver. 
#[serde_as(as = "Option")] #[serde(default)] diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 5c62e28fda..b11b3cf371 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -6,7 +6,6 @@ edition = "2021" [dependencies] anyhow = { version = "1.0", features = ["backtrace"] } async-trait = "0.1" - metrics = { version = "0.1", path = "../metrics" } once_cell = "1.8.0" rusoto_core = "0.48" @@ -15,6 +14,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1" tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] } tokio-util = { version = "0.7", features = ["io"] } +toml_edit = { version = "0.13", features = ["easy"] } tracing = "0.1.27" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 8092e4fc49..0889cb720c 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -16,8 +16,10 @@ use std::{ path::{Path, PathBuf}, }; -use anyhow::Context; +use anyhow::{bail, Context}; + use tokio::io; +use toml_edit::Item; use tracing::info; pub use self::{ @@ -203,6 +205,90 @@ pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) .with_extension(new_extension.as_ref()) } +impl RemoteStorageConfig { + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { + let local_path = toml.get("local_path"); + let bucket_name = toml.get("bucket_name"); + let bucket_region = toml.get("bucket_region"); + + let max_concurrent_syncs = NonZeroUsize::new( + parse_optional_integer("max_concurrent_syncs", toml)? + .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS), + ) + .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?; + + let max_sync_errors = NonZeroU32::new( + parse_optional_integer("max_sync_errors", toml)? + .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), + ) + .context("Failed to parse 'max_sync_errors' as a positive integer")?; + + let concurrency_limit = NonZeroUsize::new( + parse_optional_integer("concurrency_limit", toml)? 
+ .unwrap_or(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), + ) + .context("Failed to parse 'concurrency_limit' as a positive integer")?; + + let storage = match (local_path, bucket_name, bucket_region) { + (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"), + (_, Some(_), None) => { + bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") + } + (_, None, Some(_)) => { + bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") + } + (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config { + bucket_name: parse_toml_string("bucket_name", bucket_name)?, + bucket_region: parse_toml_string("bucket_region", bucket_region)?, + prefix_in_bucket: toml + .get("prefix_in_bucket") + .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket)) + .transpose()?, + endpoint: toml + .get("endpoint") + .map(|endpoint| parse_toml_string("endpoint", endpoint)) + .transpose()?, + concurrency_limit, + }), + (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( + parse_toml_string("local_path", local_path)?, + )), + (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"), + }; + + Ok(RemoteStorageConfig { + max_concurrent_syncs, + max_sync_errors, + storage, + }) + } +} + +// Helper functions to parse a toml Item +fn parse_optional_integer(name: &str, item: &toml_edit::Item) -> anyhow::Result> +where + I: TryFrom, + E: std::error::Error + Send + Sync + 'static, +{ + let toml_integer = match item.get(name) { + Some(item) => item + .as_integer() + .with_context(|| format!("configure option {name} is not an integer"))?, + None => return Ok(None), + }; + + I::try_from(toml_integer) + .map(Some) + .with_context(|| format!("configure option {name} is too large")) +} + +fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result { + let s = item + .as_str() + .with_context(|| format!("configure option {name} is not a string"))?; + Ok(s.to_string()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index c09d8c67ce..3dab2a625c 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -26,6 +26,9 @@ impl Lsn { /// Maximum possible value for an LSN pub const MAX: Lsn = Lsn(u64::MAX); + /// Invalid value for InvalidXLogRecPtr, as defined in xlogdefs.h + pub const INVALID: Lsn = Lsn(0); + /// Subtract a number, returning None on overflow. pub fn checked_sub>(self, other: T) -> Option { let other: u64 = other.into(); @@ -103,6 +106,12 @@ impl Lsn { pub fn is_aligned(&self) -> bool { *self == self.align() } + + /// Return if the LSN is valid + /// mimics postgres XLogRecPtrIsInvalid macro + pub fn is_valid(self) -> bool { + self != Lsn::INVALID + } } impl From for Lsn { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 6c045d77ae..dc9d7161a2 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,9 +5,9 @@ //! See also `settings.md` for better description on every parameter. 
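// A tiny illustration of the Lsn helpers introduced above in libs/utils/src/lsn.rs:
// Lsn::INVALID mirrors InvalidXLogRecPtr, and is_valid() checks against it.
#[test]
fn lsn_invalid_is_not_valid() {
    use utils::lsn::Lsn;
    assert!(!Lsn::INVALID.is_valid());
    assert!(Lsn(0x16B9188).is_valid());
}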
use anyhow::{anyhow, bail, ensure, Context, Result}; -use remote_storage::{RemoteStorageConfig, RemoteStorageKind, S3Config}; +use remote_storage::RemoteStorageConfig; use std::env; -use std::num::{NonZeroU32, NonZeroUsize}; + use std::path::{Path, PathBuf}; use std::str::FromStr; use std::time::Duration; @@ -394,7 +394,7 @@ impl PageServerConf { )), "auth_type" => builder.auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { - builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?)) + builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item)?)) } "tenant_config" => { t_conf = Self::parse_toml_tenant_conf(item)?; @@ -484,64 +484,6 @@ impl PageServerConf { Ok(t_conf) } - /// subroutine of parse_config(), to parse the `[remote_storage]` table. - fn parse_remote_storage_config(toml: &toml_edit::Item) -> anyhow::Result { - let local_path = toml.get("local_path"); - let bucket_name = toml.get("bucket_name"); - let bucket_region = toml.get("bucket_region"); - - let max_concurrent_syncs = NonZeroUsize::new( - parse_optional_integer("max_concurrent_syncs", toml)? - .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS), - ) - .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?; - - let max_sync_errors = NonZeroU32::new( - parse_optional_integer("max_sync_errors", toml)? - .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), - ) - .context("Failed to parse 'max_sync_errors' as a positive integer")?; - - let concurrency_limit = NonZeroUsize::new( - parse_optional_integer("concurrency_limit", toml)? - .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), - ) - .context("Failed to parse 'concurrency_limit' as a positive integer")?; - - let storage = match (local_path, bucket_name, bucket_region) { - (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"), - (_, Some(_), None) => { - bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") - } - (_, None, Some(_)) => { - bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") - } - (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config { - bucket_name: parse_toml_string("bucket_name", bucket_name)?, - bucket_region: parse_toml_string("bucket_region", bucket_region)?, - prefix_in_bucket: toml - .get("prefix_in_bucket") - .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket)) - .transpose()?, - endpoint: toml - .get("endpoint") - .map(|endpoint| parse_toml_string("endpoint", endpoint)) - .transpose()?, - concurrency_limit, - }), - (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( - parse_toml_string("local_path", local_path)?, - )), - (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"), - }; - - Ok(RemoteStorageConfig { - max_concurrent_syncs, - max_sync_errors, - storage, - }) - } - #[cfg(test)] pub fn test_repo_dir(test_name: &str) -> PathBuf { PathBuf::from(format!("../tmp_check/test_{test_name}")) @@ -592,23 +534,6 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result { Ok(i as u64) } -fn parse_optional_integer(name: &str, item: &toml_edit::Item) -> anyhow::Result> -where - I: TryFrom, - E: std::error::Error + Send + Sync + 'static, -{ - let toml_integer = match item.get(name) { - Some(item) => item - .as_integer() - .with_context(|| format!("configure option {name} is not an integer"))?, - None => return Ok(None), - }; - - I::try_from(toml_integer) - .map(Some) - 
.with_context(|| format!("configure option {name} is too large")) -} - fn parse_toml_duration(name: &str, item: &Item) -> Result { let s = item .as_str() @@ -651,8 +576,12 @@ fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result> { #[cfg(test)] mod tests { - use std::fs; + use std::{ + fs, + num::{NonZeroU32, NonZeroUsize}, + }; + use remote_storage::{RemoteStorageKind, S3Config}; use tempfile::{tempdir, TempDir}; use super::*; diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 417cf58cd5..373108c61b 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -30,6 +30,10 @@ const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-util = { version = "0.7", features = ["io"] } git-version = "0.3.5" +async-trait = "0.1" +once_cell = "1.10.0" +futures = "0.3.13" +toml_edit = { version = "0.13", features = ["easy"] } postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 290b7c738a..a7628482d9 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -6,22 +6,27 @@ use clap::{App, Arg}; use const_format::formatcp; use daemonize::Daemonize; use fs2::FileExt; +use remote_storage::RemoteStorageConfig; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; use std::thread; use tokio::sync::mpsc; +use toml_edit::Document; use tracing::*; use url::{ParseError, Url}; use safekeeper::control_file::{self}; -use safekeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR}; +use safekeeper::defaults::{ + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, +}; +use safekeeper::http; use safekeeper::remove_wal; use safekeeper::timeline::GlobalTimelines; +use safekeeper::wal_backup; use safekeeper::wal_service; use safekeeper::SafeKeeperConf; use safekeeper::{broker, callmemaybe}; -use safekeeper::{http, s3_offload}; use utils::{ http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener, zid::NodeId, @@ -71,12 +76,6 @@ fn main() -> anyhow::Result<()> { .long("pageserver") .takes_value(true), ) - .arg( - Arg::new("ttl") - .long("ttl") - .takes_value(true) - .help("interval for keeping WAL at safekeeper node, after which them will be uploaded to S3 and removed locally"), - ) .arg( Arg::new("recall") .long("recall") @@ -118,12 +117,20 @@ fn main() -> anyhow::Result<()> { .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"), ) .arg( - Arg::new("enable-s3-offload") - .long("enable-s3-offload") + Arg::new("wal-backup-threads").long("backup-threads").takes_value(true).help(formatcp!("number of threads for wal backup (default {DEFAULT_WAL_BACKUP_RUNTIME_THREADS}")), + ).arg( + Arg::new("remote-storage") + .long("remote-storage") + .takes_value(true) + .help("Remote storage configuration for WAL backup (offloading to s3) as TOML inline table, e.g. {\"max_concurrent_syncs\" = 17, \"max_sync_errors\": 13, \"bucket_name\": \"\", \"bucket_region\":\"\", \"concurrency_limit\": 119}.\nSafekeeper offloads WAL to [prefix_in_bucket/]//, mirroring structure on the file system.") + ) + .arg( + Arg::new("enable-wal-backup") + .long("enable-wal-backup") .takes_value(true) .default_value("true") .default_missing_value("true") - .help("Enable/disable s3 offloading. 
When disabled, safekeeper removes WAL ignoring s3 WAL horizon."), + .help("Enable/disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring WAL backup horizon."), ) .get_matches(); @@ -157,10 +164,6 @@ fn main() -> anyhow::Result<()> { conf.listen_http_addr = addr.to_owned(); } - if let Some(ttl) = arg_matches.value_of("ttl") { - conf.ttl = Some(humantime::parse_duration(ttl)?); - } - if let Some(recall) = arg_matches.value_of("recall") { conf.recall_period = humantime::parse_duration(recall)?; } @@ -182,9 +185,21 @@ fn main() -> anyhow::Result<()> { conf.broker_etcd_prefix = prefix.to_string(); } + if let Some(backup_threads) = arg_matches.value_of("wal-backup-threads") { + conf.backup_runtime_threads = backup_threads + .parse() + .with_context(|| format!("Failed to parse backup threads {}", backup_threads))?; + } + if let Some(storage_conf) = arg_matches.value_of("remote-storage") { + // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse + let storage_conf_toml = format!("remote_storage = {}", storage_conf); + let parsed_toml = storage_conf_toml.parse::()?; // parse + let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again + conf.remote_storage = Some(RemoteStorageConfig::from_toml(storage_conf_parsed_toml)?); + } // Seems like there is no better way to accept bool values explicitly in clap. - conf.s3_offload_enabled = arg_matches - .value_of("enable-s3-offload") + conf.wal_backup_enabled = arg_matches + .value_of("enable-wal-backup") .unwrap() .parse() .context("failed to parse bool enable-s3-offload bool")?; @@ -252,7 +267,8 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; let (callmemaybe_tx, callmemaybe_rx) = mpsc::unbounded_channel(); - GlobalTimelines::set_callmemaybe_tx(callmemaybe_tx); + let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); + GlobalTimelines::init(callmemaybe_tx, wal_backup_launcher_tx); let conf_ = conf.clone(); threads.push( @@ -270,17 +286,6 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo })?, ); - if conf.ttl.is_some() { - let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("S3 offload thread".into()) - .spawn(|| { - s3_offload::thread_main(conf_); - })?, - ); - } - let conf_cloned = conf.clone(); let safekeeper_thread = thread::Builder::new() .name("Safekeeper thread".into()) @@ -330,6 +335,15 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo })?, ); + let conf_ = conf.clone(); + threads.push( + thread::Builder::new() + .name("wal backup launcher thread".into()) + .spawn(move || { + wal_backup::wal_backup_launcher_thread_main(conf_, wal_backup_launcher_rx); + })?, + ); + // TODO: put more thoughts into handling of failed threads // We probably should restart them. diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 59d282d378..676719b60d 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -1,5 +1,6 @@ //! Communication with etcd, providing safekeeper peers and pageserver coordination. 
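// As a sketch of the --remote-storage plumbing above: toml_edit will not parse a
// bare inline table as a document, hence the "wrap it in a key" trick before
// handing the item to RemoteStorageConfig::from_toml. Helper name and values
// below are illustrative only.
use anyhow::Context;
use remote_storage::RemoteStorageConfig;
use toml_edit::Document;

fn parse_remote_storage_arg(arg: &str) -> anyhow::Result<RemoteStorageConfig> {
    // e.g. arg = "{bucket_name='wal-bucket', bucket_region='us-east-1', prefix_in_bucket='wal'}"
    let wrapped = format!("remote_storage = {}", arg);
    let doc = wrapped.parse::<Document>()?;
    let (_, item) = doc.iter().next().context("empty remote storage config")?;
    RemoteStorageConfig::from_toml(item)
}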
+use anyhow::anyhow; use anyhow::Context; use anyhow::Error; use anyhow::Result; @@ -7,9 +8,11 @@ use etcd_broker::Client; use etcd_broker::PutOptions; use etcd_broker::SkTimelineSubscriptionKind; use std::time::Duration; +use tokio::spawn; use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; +use url::Url; use crate::{timeline::GlobalTimelines, SafeKeeperConf}; use utils::zid::{NodeId, ZTenantTimelineId}; @@ -44,6 +47,118 @@ fn timeline_safekeeper_path( ) } +pub struct Election { + pub election_name: String, + pub candidate_name: String, + pub broker_endpoints: Vec, +} + +impl Election { + pub fn new(election_name: String, candidate_name: String, broker_endpoints: Vec) -> Self { + Self { + election_name, + candidate_name, + broker_endpoints, + } + } +} + +pub struct ElectionLeader { + client: Client, + keep_alive: JoinHandle>, +} + +impl ElectionLeader { + pub async fn check_am_i( + &mut self, + election_name: String, + candidate_name: String, + ) -> Result { + let resp = self.client.leader(election_name).await?; + + let kv = resp.kv().ok_or(anyhow!("failed to get leader response"))?; + let leader = kv.value_str()?; + + Ok(leader == candidate_name) + } + + pub async fn give_up(self) { + // self.keep_alive.abort(); + // TODO: it'll be wise to resign here but it'll happen after lease expiration anyway + // should we await for keep alive termination? + let _ = self.keep_alive.await; + } +} + +pub async fn get_leader(req: &Election) -> Result { + let mut client = Client::connect(req.broker_endpoints.clone(), None) + .await + .context("Could not connect to etcd")?; + + let lease = client + .lease_grant(LEASE_TTL_SEC, None) + .await + .context("Could not acquire a lease"); + + let lease_id = lease.map(|l| l.id()).unwrap(); + + let keep_alive = spawn::<_>(lease_keep_alive(client.clone(), lease_id)); + + if let Err(e) = client + .campaign( + req.election_name.clone(), + req.candidate_name.clone(), + lease_id, + ) + .await + { + keep_alive.abort(); + let _ = keep_alive.await; + return Err(e.into()); + } + + Ok(ElectionLeader { client, keep_alive }) +} + +async fn lease_keep_alive(mut client: Client, lease_id: i64) -> Result<()> { + let (mut keeper, mut ka_stream) = client + .lease_keep_alive(lease_id) + .await + .context("failed to create keepalive stream")?; + + loop { + let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); + + keeper + .keep_alive() + .await + .context("failed to send LeaseKeepAliveRequest")?; + + ka_stream + .message() + .await + .context("failed to receive LeaseKeepAliveResponse")?; + + sleep(push_interval).await; + } +} + +pub fn get_campaign_name( + election_name: String, + broker_prefix: String, + timeline_id: &ZTenantTimelineId, +) -> String { + return format!( + "{}/{}", + SkTimelineSubscriptionKind::timeline(broker_prefix, *timeline_id).watch_key(), + election_name + ); +} + +pub fn get_candiate_name(system_id: NodeId) -> String { + format!("id_{}", system_id) +} + /// Push once in a while data about all active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { let mut client = Client::connect(&conf.broker_endpoints, None).await?; @@ -59,7 +174,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // sensitive and there is no risk of deadlock as we don't await while // lock is held. 
for zttid in GlobalTimelines::get_active_timelines() { - if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { + if let Some(tli) = GlobalTimelines::get_loaded(zttid) { let sk_info = tli.get_public_info(&conf)?; let put_opts = PutOptions::new().with_lease(lease.id()); client @@ -106,12 +221,13 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { // note: there are blocking operations below, but it's considered fine for now if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { for (safekeeper_id, info) in sk_info { - tli.record_safekeeper_info(&info, safekeeper_id)? + tli.record_safekeeper_info(&info, safekeeper_id).await? } } } } None => { + // XXX it means we lost connection with etcd, error is consumed inside sub object debug!("timeline updates sender closed, aborting the pull loop"); return Ok(()); } @@ -142,11 +258,12 @@ async fn main_loop(conf: SafeKeeperConf) { }, res = async { pull_handle.as_mut().unwrap().await }, if pull_handle.is_some() => { // was it panic or normal error? - let err = match res { - Ok(res_internal) => res_internal.unwrap_err(), - Err(err_outer) => err_outer.into(), + match res { + Ok(res_internal) => if let Err(err_inner) = res_internal { + warn!("pull task failed: {:?}", err_inner); + } + Err(err_outer) => { warn!("pull task panicked: {:?}", err_outer) } }; - warn!("pull task failed: {:?}", err); pull_handle = None; }, _ = ticker.tick() => { diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 22716de1a0..8d36472540 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -165,7 +165,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), @@ -188,7 +188,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), @@ -211,7 +211,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), @@ -234,7 +234,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn::INVALID, peer_horizon_lsn: oldstate.peer_horizon_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 3f6ade970d..b0197a9a2a 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -70,19 +70,19 @@ struct TimelineStatus { timeline_id: ZTimelineId, acceptor_state: AcceptorStateStatus, #[serde(serialize_with = "display_serialize")] + flush_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] timeline_start_lsn: Lsn, #[serde(serialize_with = "display_serialize")] local_start_lsn: Lsn, #[serde(serialize_with = "display_serialize")] commit_lsn: Lsn, #[serde(serialize_with = "display_serialize")] - s3_wal_lsn: Lsn, + backup_lsn: Lsn, #[serde(serialize_with = 
"display_serialize")] peer_horizon_lsn: Lsn, #[serde(serialize_with = "display_serialize")] remote_consistent_lsn: Lsn, - #[serde(serialize_with = "display_serialize")] - flush_lsn: Lsn, } /// Report info about timeline. @@ -107,13 +107,13 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result, pub recall_period: Duration, + pub remote_storage: Option, + pub backup_runtime_threads: usize, + pub wal_backup_enabled: bool, pub my_id: NodeId, pub broker_endpoints: Vec, pub broker_etcd_prefix: String, - pub s3_offload_enabled: bool, } impl SafeKeeperConf { @@ -77,12 +81,13 @@ impl Default for SafeKeeperConf { no_sync: false, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - ttl: None, + remote_storage: None, recall_period: defaults::DEFAULT_RECALL_PERIOD, my_id: NodeId(0), broker_endpoints: Vec::new(), broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), - s3_offload_enabled: true, + backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS, + wal_backup_enabled: true, } } } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 0ef335c9ed..88b7816912 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -85,16 +85,10 @@ impl<'pg> ReceiveWalConn<'pg> { _ => bail!("unexpected message {:?} instead of greeting", next_msg), } - // Register the connection and defer unregister. - spg.timeline - .get() - .on_compute_connect(self.pageserver_connstr.as_ref())?; - let _guard = ComputeConnectionGuard { - timeline: Arc::clone(spg.timeline.get()), - }; - let mut next_msg = Some(next_msg); + let mut first_time_through = true; + let mut _guard: Option = None; loop { if matches!(next_msg, Some(ProposerAcceptorMessage::AppendRequest(_))) { // poll AppendRequest's without blocking and write WAL to disk without flushing, @@ -122,6 +116,18 @@ impl<'pg> ReceiveWalConn<'pg> { self.write_msg(&reply)?; } } + if first_time_through { + // Register the connection and defer unregister. Do that only + // after processing first message, as it sets wal_seg_size, + // wanted by many. + spg.timeline + .get() + .on_compute_connect(self.pageserver_connstr.as_ref())?; + _guard = Some(ComputeConnectionGuard { + timeline: Arc::clone(spg.timeline.get()), + }); + first_time_through = false; + } // blocking wait for the next message if next_msg.is_none() { diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 3278d51bd3..004c0243f9 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -12,7 +12,7 @@ pub fn thread_main(conf: SafeKeeperConf) { let active_tlis = GlobalTimelines::get_active_timelines(); for zttid in &active_tlis { if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) { - if let Err(e) = tli.remove_old_wal(conf.s3_offload_enabled) { + if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { warn!( "failed to remove WAL for tenant {} timeline {}: {}", tli.zttid.tenant_id, tli.zttid.timeline_id, e diff --git a/safekeeper/src/s3_offload.rs b/safekeeper/src/s3_offload.rs deleted file mode 100644 index 2851c0b8a0..0000000000 --- a/safekeeper/src/s3_offload.rs +++ /dev/null @@ -1,107 +0,0 @@ -// -// Offload old WAL segments to S3 and remove them locally -// Needs `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to be set -// if no IAM bucket access is used. 
-// - -use anyhow::{bail, Context}; -use postgres_ffi::xlog_utils::*; -use remote_storage::{ - GenericRemoteStorage, RemoteStorage, RemoteStorageConfig, S3Bucket, S3Config, S3ObjectKey, -}; -use std::collections::HashSet; -use std::env; -use std::num::{NonZeroU32, NonZeroUsize}; -use std::path::Path; -use std::time::SystemTime; -use tokio::fs::{self, File}; -use tokio::io::BufReader; -use tokio::runtime; -use tokio::time::sleep; -use tracing::*; -use walkdir::WalkDir; - -use crate::SafeKeeperConf; - -pub fn thread_main(conf: SafeKeeperConf) { - // Create a new thread pool - // - // FIXME: keep it single-threaded for now, make it easier to debug with gdb, - // and we're not concerned with performance yet. - //let runtime = runtime::Runtime::new().unwrap(); - let runtime = runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - - info!("Starting S3 offload task"); - - runtime.block_on(async { - main_loop(&conf).await.unwrap(); - }); -} - -async fn offload_files( - remote_storage: &S3Bucket, - listing: &HashSet, - dir_path: &Path, - conf: &SafeKeeperConf, -) -> anyhow::Result { - let horizon = SystemTime::now() - conf.ttl.unwrap(); - let mut n: u64 = 0; - for entry in WalkDir::new(dir_path) { - let entry = entry?; - let path = entry.path(); - - if path.is_file() - && IsXLogFileName(entry.file_name().to_str().unwrap()) - && entry.metadata().unwrap().created().unwrap() <= horizon - { - let remote_path = remote_storage.remote_object_id(path)?; - if !listing.contains(&remote_path) { - let file = File::open(&path).await?; - let file_length = file.metadata().await?.len() as usize; - remote_storage - .upload(BufReader::new(file), file_length, &remote_path, None) - .await?; - - fs::remove_file(&path).await?; - n += 1; - } - } - } - Ok(n) -} - -async fn main_loop(conf: &SafeKeeperConf) -> anyhow::Result<()> { - let remote_storage = match GenericRemoteStorage::new( - conf.workdir.clone(), - &RemoteStorageConfig { - max_concurrent_syncs: NonZeroUsize::new(10).unwrap(), - max_sync_errors: NonZeroU32::new(1).unwrap(), - storage: remote_storage::RemoteStorageKind::AwsS3(S3Config { - bucket_name: "zenith-testbucket".to_string(), - bucket_region: env::var("S3_REGION").context("S3_REGION env var is not set")?, - prefix_in_bucket: Some("walarchive/".to_string()), - endpoint: Some(env::var("S3_ENDPOINT").context("S3_ENDPOINT env var is not set")?), - concurrency_limit: NonZeroUsize::new(20).unwrap(), - }), - }, - )? { - GenericRemoteStorage::Local(_) => { - bail!("Unexpected: got local storage for the remote config") - } - GenericRemoteStorage::S3(remote_storage) => remote_storage, - }; - - loop { - let listing = remote_storage - .list() - .await? - .into_iter() - .collect::>(); - let n = offload_files(&remote_storage, &listing, &conf.workdir, conf).await?; - info!("Offload {n} files to S3"); - sleep(conf.ttl.unwrap()).await; - } -} diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index b8b969929d..9a07127771 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -19,6 +19,7 @@ use lazy_static::lazy_static; use crate::control_file; use crate::send_wal::HotStandbyFeedback; + use crate::wal_storage; use metrics::{register_gauge_vec, Gauge, GaugeVec}; use postgres_ffi::xlog_utils::MAX_SEND_SIZE; @@ -141,7 +142,7 @@ pub struct ServerInfo { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PeerInfo { /// LSN up to which safekeeper offloaded WAL to s3. - s3_wal_lsn: Lsn, + backup_lsn: Lsn, /// Term of the last entry. 
term: Term, /// LSN of the last record. @@ -153,7 +154,7 @@ pub struct PeerInfo { impl PeerInfo { fn new() -> Self { Self { - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn::INVALID, term: INVALID_TERM, flush_lsn: Lsn(0), commit_lsn: Lsn(0), @@ -193,9 +194,9 @@ pub struct SafeKeeperState { /// Part of WAL acknowledged by quorum and available locally. Always points /// to record boundary. pub commit_lsn: Lsn, - /// First LSN not yet offloaded to s3. Useful to persist to avoid finding - /// out offloading progress on boot. - pub s3_wal_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn /// of last record streamed to everyone). Persisting it helps skipping /// recovery in walproposer, generally we compute it from peers. In @@ -217,7 +218,7 @@ pub struct SafeKeeperState { // are not flushed yet. pub struct SafekeeperMemState { pub commit_lsn: Lsn, - pub s3_wal_lsn: Lsn, // TODO: keep only persistent version + pub backup_lsn: Lsn, pub peer_horizon_lsn: Lsn, pub remote_consistent_lsn: Lsn, pub proposer_uuid: PgUuid, @@ -241,7 +242,7 @@ impl SafeKeeperState { timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: Lsn(0), - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn::INVALID, peer_horizon_lsn: Lsn(0), remote_consistent_lsn: Lsn(0), peers: Peers(peers.iter().map(|p| (*p, PeerInfo::new())).collect()), @@ -559,7 +560,7 @@ where epoch_start_lsn: Lsn(0), inmem: SafekeeperMemState { commit_lsn: state.commit_lsn, - s3_wal_lsn: state.s3_wal_lsn, + backup_lsn: state.backup_lsn, peer_horizon_lsn: state.peer_horizon_lsn, remote_consistent_lsn: state.remote_consistent_lsn, proposer_uuid: state.proposer_uuid, @@ -649,7 +650,6 @@ where self.state.persist(&state)?; } - // pass wal_seg_size to read WAL and find flush_lsn self.wal_store.init_storage(&self.state)?; info!( @@ -764,6 +764,14 @@ where self.inmem.commit_lsn = commit_lsn; self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); + // We got our first commit_lsn, which means we should sync + // everything to disk, to initialize the state. + if self.state.commit_lsn == Lsn::INVALID && commit_lsn != Lsn::INVALID { + self.inmem.backup_lsn = self.inmem.commit_lsn; // initialize backup_lsn + self.wal_store.flush_wal()?; + self.persist_control_file()?; + } + // If new commit_lsn reached epoch switch, force sync of control // file: walproposer in sync mode is very interested when this // happens. Note: this is for sync-safekeepers mode only, as @@ -775,22 +783,14 @@ where self.persist_control_file()?; } - // We got our first commit_lsn, which means we should sync - // everything to disk, to initialize the state. - if self.state.commit_lsn == Lsn(0) && commit_lsn > Lsn(0) { - self.wal_store.flush_wal()?; - self.persist_control_file()?; - } - Ok(()) } /// Persist in-memory state to the disk. 
fn persist_control_file(&mut self) -> Result<()> { let mut state = self.state.clone(); - state.commit_lsn = self.inmem.commit_lsn; - state.s3_wal_lsn = self.inmem.s3_wal_lsn; + state.backup_lsn = self.inmem.backup_lsn; state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; state.remote_consistent_lsn = self.inmem.remote_consistent_lsn; state.proposer_uuid = self.inmem.proposer_uuid; @@ -898,11 +898,11 @@ where self.update_commit_lsn()?; } } - if let Some(s3_wal_lsn) = sk_info.s3_wal_lsn { - let new_s3_wal_lsn = max(s3_wal_lsn, self.inmem.s3_wal_lsn); + if let Some(backup_lsn) = sk_info.backup_lsn { + let new_backup_lsn = max(backup_lsn, self.inmem.backup_lsn); sync_control_file |= - self.state.s3_wal_lsn + (self.state.server.wal_seg_size as u64) < new_s3_wal_lsn; - self.inmem.s3_wal_lsn = new_s3_wal_lsn; + self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn; + self.inmem.backup_lsn = new_backup_lsn; } if let Some(remote_consistent_lsn) = sk_info.remote_consistent_lsn { let new_remote_consistent_lsn = @@ -930,29 +930,23 @@ where /// offloading. /// While it is safe to use inmem values for determining horizon, /// we use persistent to make possible normal states less surprising. - pub fn get_horizon_segno(&self, s3_offload_enabled: bool) -> XLogSegNo { - let s3_offload_horizon = if s3_offload_enabled { - self.state.s3_wal_lsn - } else { - Lsn(u64::MAX) - }; - let horizon_lsn = min( - min( - self.state.remote_consistent_lsn, - self.state.peer_horizon_lsn, - ), - s3_offload_horizon, + pub fn get_horizon_segno(&self, wal_backup_enabled: bool) -> XLogSegNo { + let mut horizon_lsn = min( + self.state.remote_consistent_lsn, + self.state.peer_horizon_lsn, ); + if wal_backup_enabled { + horizon_lsn = min(horizon_lsn, self.state.backup_lsn); + } horizon_lsn.segment_number(self.state.server.wal_seg_size as usize) } } #[cfg(test)] mod tests { - use std::ops::Deref; - use super::*; use crate::wal_storage::Storage; + use std::ops::Deref; // fake storage for tests struct InMemoryState { @@ -1013,6 +1007,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok @@ -1028,6 +1023,7 @@ mod tests { let storage = InMemoryState { persisted_state: state, }; + sk = SafeKeeper::new(ztli, storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok @@ -1045,6 +1041,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index d52dd6ea57..a89ed18071 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -315,7 +315,7 @@ impl ReplicationConn { } else { // TODO: also check once in a while whether we are walsender // to right pageserver. - if spg.timeline.get().check_deactivate(replica_id)? { + if spg.timeline.get().stop_walsender(replica_id)? { // Shut down, timeline is suspended. 
// TODO create proper error type for this bail!("end streaming to {:?}", spg.appname); diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 0953439bd8..74a61410fd 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -8,6 +8,7 @@ use lazy_static::lazy_static; use postgres_ffi::xlog_utils::XLogSegNo; use serde::Serialize; +use tokio::sync::watch; use std::cmp::{max, min}; use std::collections::HashMap; @@ -15,7 +16,7 @@ use std::fs::{self}; use std::sync::{Arc, Condvar, Mutex, MutexGuard}; use std::time::Duration; -use tokio::sync::mpsc::UnboundedSender; +use tokio::sync::mpsc::{Sender, UnboundedSender}; use tracing::*; use utils::{ @@ -25,13 +26,13 @@ use utils::{ }; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; - use crate::control_file; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, SafekeeperMemState, }; use crate::send_wal::HotStandbyFeedback; + use crate::wal_storage; use crate::wal_storage::Storage as wal_storage_iface; use crate::SafeKeeperConf; @@ -81,10 +82,14 @@ struct SharedState { notified_commit_lsn: Lsn, /// State of replicas replicas: Vec>, - /// Inactive clusters shouldn't occupy any resources, so timeline is - /// activated whenever there is a compute connection or pageserver is not - /// caughtup (it must have latest WAL for new compute start) and suspended - /// otherwise. + /// True when WAL backup launcher oversees the timeline, making sure WAL is + /// offloaded, allows to bother launcher less. + wal_backup_active: bool, + /// True whenever there is at least some pending activity on timeline: live + /// compute connection, pageserver is not caughtup (it must have latest WAL + /// for new compute start) or WAL backuping is not finished. Practically it + /// means safekeepers broadcast info to peers about the timeline, old WAL is + /// trimmed. /// /// TODO: it might be better to remove tli completely from GlobalTimelines /// when tli is inactive instead of having this flag. @@ -103,6 +108,7 @@ impl SharedState { ) -> Result { let state = SafeKeeperState::new(zttid, peer_ids); let control_store = control_file::FileStorage::create_new(zttid, conf, state)?; + let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?; @@ -110,6 +116,7 @@ impl SharedState { notified_commit_lsn: Lsn(0), sk, replicas: Vec::new(), + wal_backup_active: false, active: false, num_computes: 0, pageserver_connstr: None, @@ -129,15 +136,62 @@ impl SharedState { notified_commit_lsn: Lsn(0), sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?, replicas: Vec::new(), + wal_backup_active: false, active: false, num_computes: 0, pageserver_connstr: None, last_removed_segno: 0, }) } + fn is_active(&self) -> bool { + self.is_wal_backup_required() + // FIXME: add tracking of relevant pageservers and check them here individually, + // otherwise migration won't work (we suspend too early). + || self.sk.inmem.remote_consistent_lsn <= self.sk.inmem.commit_lsn + } - /// Activate the timeline: start/change walsender (via callmemaybe). - fn activate( + /// Mark timeline active/inactive and return whether s3 offloading requires + /// start/stop action. + fn update_status(&mut self) -> bool { + self.active = self.is_active(); + self.is_wal_backup_action_pending() + } + + /// Should we run s3 offloading in current state? 
+ fn is_wal_backup_required(&self) -> bool { + let seg_size = self.get_wal_seg_size(); + self.num_computes > 0 || + // Currently only the whole segment is offloaded, so compare segment numbers. + (self.sk.inmem.commit_lsn.segment_number(seg_size) > + self.sk.inmem.backup_lsn.segment_number(seg_size)) + } + + /// Is current state of s3 offloading is not what it ought to be? + fn is_wal_backup_action_pending(&self) -> bool { + let res = self.wal_backup_active != self.is_wal_backup_required(); + if res { + let action_pending = if self.is_wal_backup_required() { + "start" + } else { + "stop" + }; + trace!( + "timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}", + self.sk.state.timeline_id, action_pending, self.num_computes, self.sk.inmem.commit_lsn, self.sk.inmem.backup_lsn + ); + } + res + } + + /// Returns whether s3 offloading is required and sets current status as + /// matching. + fn wal_backup_attend(&mut self) -> bool { + self.wal_backup_active = self.is_wal_backup_required(); + self.wal_backup_active + } + + /// start/change walsender (via callmemaybe). + fn callmemaybe_sub( &mut self, zttid: &ZTenantTimelineId, pageserver_connstr: Option<&String>, @@ -179,42 +233,42 @@ impl SharedState { ); } self.pageserver_connstr = pageserver_connstr.map(|c| c.to_owned()); - self.active = true; Ok(()) } /// Deactivate the timeline: stop callmemaybe. - fn deactivate( + fn callmemaybe_unsub( &mut self, zttid: &ZTenantTimelineId, callmemaybe_tx: &UnboundedSender, ) -> Result<()> { - if self.active { - if let Some(ref pageserver_connstr) = self.pageserver_connstr { - let subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.to_owned(), - ); - callmemaybe_tx - .send(CallmeEvent::Unsubscribe(subscription_key)) - .unwrap_or_else(|e| { - error!( - "failed to send Unsubscribe request to callmemaybe thread {}", - e - ); - }); - info!( - "timeline {} is unsubscribed from callmemaybe to {}", - zttid.timeline_id, - self.pageserver_connstr.as_ref().unwrap() - ); - } - self.active = false; + if let Some(ref pageserver_connstr) = self.pageserver_connstr { + let subscription_key = SubscriptionStateKey::new( + zttid.tenant_id, + zttid.timeline_id, + pageserver_connstr.to_owned(), + ); + callmemaybe_tx + .send(CallmeEvent::Unsubscribe(subscription_key)) + .unwrap_or_else(|e| { + error!( + "failed to send Unsubscribe request to callmemaybe thread {}", + e + ); + }); + info!( + "timeline {} is unsubscribed from callmemaybe to {}", + zttid.timeline_id, + self.pageserver_connstr.as_ref().unwrap() + ); } Ok(()) } + fn get_wal_seg_size(&self) -> usize { + self.sk.state.server.wal_seg_size as usize + } + /// Get combined state of all alive replicas pub fn get_replicas_state(&self) -> ReplicaState { let mut acc = ReplicaState::new(); @@ -278,6 +332,13 @@ impl SharedState { pub struct Timeline { pub zttid: ZTenantTimelineId, pub callmemaybe_tx: UnboundedSender, + /// Sending here asks for wal backup launcher attention (start/stop + /// offloading). Sending zttid instead of concrete command allows to do + /// sending without timeline lock. + wal_backup_launcher_tx: Sender, + commit_lsn_watch_tx: watch::Sender, + /// For breeding receivers. 
+ commit_lsn_watch_rx: watch::Receiver, mutex: Mutex, /// conditional variable used to notify wal senders cond: Condvar, @@ -287,11 +348,17 @@ impl Timeline { fn new( zttid: ZTenantTimelineId, callmemaybe_tx: UnboundedSender, + wal_backup_launcher_tx: Sender, shared_state: SharedState, ) -> Timeline { + let (commit_lsn_watch_tx, commit_lsn_watch_rx) = + watch::channel(shared_state.sk.inmem.commit_lsn); Timeline { zttid, callmemaybe_tx, + wal_backup_launcher_tx, + commit_lsn_watch_tx, + commit_lsn_watch_rx, mutex: Mutex::new(shared_state), cond: Condvar::new(), } @@ -301,13 +368,21 @@ impl Timeline { /// not running yet. /// Can fail only if channel to a static thread got closed, which is not normal at all. pub fn on_compute_connect(&self, pageserver_connstr: Option<&String>) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.num_computes += 1; - // FIXME: currently we always adopt latest pageserver connstr, but we - // should have kind of generations assigned by compute to distinguish - // the latest one or even pass it through consensus to reliably deliver - // to all safekeepers. - shared_state.activate(&self.zttid, pageserver_connstr, &self.callmemaybe_tx)?; + let is_wal_backup_action_pending: bool; + { + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.num_computes += 1; + is_wal_backup_action_pending = shared_state.update_status(); + // FIXME: currently we always adopt latest pageserver connstr, but we + // should have kind of generations assigned by compute to distinguish + // the latest one or even pass it through consensus to reliably deliver + // to all safekeepers. + shared_state.callmemaybe_sub(&self.zttid, pageserver_connstr, &self.callmemaybe_tx)?; + } + // Wake up wal backup launcher, if offloading not started yet. + if is_wal_backup_action_pending { + self.wal_backup_launcher_tx.blocking_send(self.zttid)?; + } Ok(()) } @@ -315,38 +390,43 @@ impl Timeline { /// pageserver doesn't need catchup. /// Can fail only if channel to a static thread got closed, which is not normal at all. pub fn on_compute_disconnect(&self) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.num_computes -= 1; - // If there is no pageserver, can suspend right away; otherwise let - // walsender do that. - if shared_state.num_computes == 0 && shared_state.pageserver_connstr.is_none() { - shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; + let is_wal_backup_action_pending: bool; + { + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.num_computes -= 1; + is_wal_backup_action_pending = shared_state.update_status(); + } + // Wake up wal backup launcher, if it is time to stop the offloading. + if is_wal_backup_action_pending { + self.wal_backup_launcher_tx.blocking_send(self.zttid)?; } Ok(()) } - /// Deactivate tenant if there is no computes and pageserver is caughtup, - /// assuming the pageserver status is in replica_id. - /// Returns true if deactivated. - pub fn check_deactivate(&self, replica_id: usize) -> Result { + /// Whether we still need this walsender running? + /// TODO: check this pageserver is actually interested in this timeline. 
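// The pattern used by on_compute_connect/on_compute_disconnect above, shown in
// isolation: decide under the mutex, drop the guard, and only then poke the wal
// backup launcher, so the channel send never happens while the lock is held.
// (Illustrative helper; the real methods also carry the callmemaybe logic.)
fn notify_backup_launcher_if_needed(tli: &Timeline) -> anyhow::Result<()> {
    let is_wal_backup_action_pending = {
        let mut shared_state = tli.mutex.lock().unwrap();
        shared_state.update_status()
    };
    if is_wal_backup_action_pending {
        tli.wal_backup_launcher_tx.blocking_send(tli.zttid)?;
    }
    Ok(())
}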
+ pub fn stop_walsender(&self, replica_id: usize) -> Result { let mut shared_state = self.mutex.lock().unwrap(); - if !shared_state.active { - // already suspended - return Ok(true); - } if shared_state.num_computes == 0 { let replica_state = shared_state.replicas[replica_id].unwrap(); - let deactivate = shared_state.notified_commit_lsn == Lsn(0) || // no data at all yet - (replica_state.last_received_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. - replica_state.last_received_lsn >= shared_state.sk.inmem.commit_lsn); - if deactivate { - shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; + let stop = shared_state.notified_commit_lsn == Lsn(0) || // no data at all yet + (replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. + replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); + if stop { + shared_state.callmemaybe_unsub(&self.zttid, &self.callmemaybe_tx)?; return Ok(true); } } Ok(false) } + /// Returns whether s3 offloading is required and sets current status as + /// matching it. + pub fn wal_backup_attend(&self) -> bool { + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.wal_backup_attend() + } + /// Deactivates the timeline, assuming it is being deleted. /// Returns whether the timeline was already active. /// @@ -354,10 +434,14 @@ impl Timeline { /// will stop by themselves eventually (possibly with errors, but no panics). There should be no /// compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but /// we're deleting the timeline anyway. - pub fn deactivate_for_delete(&self) -> Result { - let mut shared_state = self.mutex.lock().unwrap(); - let was_active = shared_state.active; - shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; + pub async fn deactivate_for_delete(&self) -> Result { + let was_active: bool; + { + let mut shared_state = self.mutex.lock().unwrap(); + was_active = shared_state.active; + shared_state.callmemaybe_unsub(&self.zttid, &self.callmemaybe_tx)?; + } + self.wal_backup_launcher_tx.send(self.zttid).await?; Ok(was_active) } @@ -391,6 +475,7 @@ impl Timeline { } // Notify caught-up WAL senders about new WAL data received + // TODO: replace-unify it with commit_lsn_watch. fn notify_wal_senders(&self, shared_state: &mut MutexGuard) { if shared_state.notified_commit_lsn < shared_state.sk.inmem.commit_lsn { shared_state.notified_commit_lsn = shared_state.sk.inmem.commit_lsn; @@ -398,12 +483,17 @@ impl Timeline { } } + pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { + self.commit_lsn_watch_rx.clone() + } + /// Pass arrived message to the safekeeper. pub fn process_msg( &self, msg: &ProposerAcceptorMessage, ) -> Result> { let mut rmsg: Option; + let commit_lsn: Lsn; { let mut shared_state = self.mutex.lock().unwrap(); rmsg = shared_state.sk.process_msg(msg)?; @@ -419,15 +509,31 @@ impl Timeline { // Ping wal sender that new data might be available. 
self.notify_wal_senders(&mut shared_state); + commit_lsn = shared_state.sk.inmem.commit_lsn; } + self.commit_lsn_watch_tx.send(commit_lsn)?; Ok(rmsg) } + pub fn get_wal_seg_size(&self) -> usize { + self.mutex.lock().unwrap().get_wal_seg_size() + } + pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { let shared_state = self.mutex.lock().unwrap(); (shared_state.sk.inmem.clone(), shared_state.sk.state.clone()) } + pub fn get_wal_backup_lsn(&self) -> Lsn { + self.mutex.lock().unwrap().sk.inmem.backup_lsn + } + + pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) { + self.mutex.lock().unwrap().sk.inmem.backup_lsn = backup_lsn; + // we should check whether to shut down offloader, but this will be done + // soon by peer communication anyway. + } + /// Prepare public safekeeper info for reporting. pub fn get_public_info(&self, conf: &SafeKeeperConf) -> anyhow::Result { let shared_state = self.mutex.lock().unwrap(); @@ -436,7 +542,6 @@ impl Timeline { flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()), // note: this value is not flushed to control file yet and can be lost commit_lsn: Some(shared_state.sk.inmem.commit_lsn), - s3_wal_lsn: Some(shared_state.sk.inmem.s3_wal_lsn), // TODO: rework feedbacks to avoid max here remote_consistent_lsn: Some(max( shared_state.get_replicas_state().remote_consistent_lsn, @@ -444,14 +549,35 @@ impl Timeline { )), peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn), safekeeper_connection_string: Some(conf.listen_pg_addr.clone()), + backup_lsn: Some(shared_state.sk.inmem.backup_lsn), }) } /// Update timeline state with peer safekeeper data. - pub fn record_safekeeper_info(&self, sk_info: &SkTimelineInfo, _sk_id: NodeId) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.sk.record_safekeeper_info(sk_info)?; - self.notify_wal_senders(&mut shared_state); + pub async fn record_safekeeper_info( + &self, + sk_info: &SkTimelineInfo, + _sk_id: NodeId, + ) -> Result<()> { + let is_wal_backup_action_pending: bool; + let commit_lsn: Lsn; + { + let mut shared_state = self.mutex.lock().unwrap(); + // WAL seg size not initialized yet (no message from compute ever + // received), can't do much without it. + if shared_state.get_wal_seg_size() == 0 { + return Ok(()); + } + shared_state.sk.record_safekeeper_info(sk_info)?; + self.notify_wal_senders(&mut shared_state); + is_wal_backup_action_pending = shared_state.update_status(); + commit_lsn = shared_state.sk.inmem.commit_lsn; + } + self.commit_lsn_watch_tx.send(commit_lsn)?; + // Wake up wal backup launcher, if it is time to stop the offloading. + if is_wal_backup_action_pending { + self.wal_backup_launcher_tx.send(self.zttid).await?; + } Ok(()) } @@ -476,16 +602,16 @@ impl Timeline { shared_state.sk.wal_store.flush_lsn() } - pub fn remove_old_wal(&self, s3_offload_enabled: bool) -> Result<()> { + pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { let horizon_segno: XLogSegNo; let remover: Box Result<(), anyhow::Error>>; { let shared_state = self.mutex.lock().unwrap(); // WAL seg size not initialized yet, no WAL exists. 
- if shared_state.sk.state.server.wal_seg_size == 0 { + if shared_state.get_wal_seg_size() == 0 { return Ok(()); } - horizon_segno = shared_state.sk.get_horizon_segno(s3_offload_enabled); + horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); remover = shared_state.sk.wal_store.remove_up_to(); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { return Ok(()); @@ -522,12 +648,14 @@ impl TimelineTools for Option> { struct GlobalTimelinesState { timelines: HashMap>, callmemaybe_tx: Option>, + wal_backup_launcher_tx: Option>, } lazy_static! { static ref TIMELINES_STATE: Mutex = Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), - callmemaybe_tx: None + callmemaybe_tx: None, + wal_backup_launcher_tx: None, }); } @@ -541,10 +669,15 @@ pub struct TimelineDeleteForceResult { pub struct GlobalTimelines; impl GlobalTimelines { - pub fn set_callmemaybe_tx(callmemaybe_tx: UnboundedSender) { + pub fn init( + callmemaybe_tx: UnboundedSender, + wal_backup_launcher_tx: Sender, + ) { let mut state = TIMELINES_STATE.lock().unwrap(); assert!(state.callmemaybe_tx.is_none()); state.callmemaybe_tx = Some(callmemaybe_tx); + assert!(state.wal_backup_launcher_tx.is_none()); + state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); } fn create_internal( @@ -559,12 +692,14 @@ impl GlobalTimelines { // TODO: check directory existence let dir = conf.timeline_dir(&zttid); fs::create_dir_all(dir)?; + let shared_state = SharedState::create(conf, &zttid, peer_ids) .context("failed to create shared state")?; let new_tli = Arc::new(Timeline::new( zttid, state.callmemaybe_tx.as_ref().unwrap().clone(), + state.wal_backup_launcher_tx.as_ref().unwrap().clone(), shared_state, )); state.timelines.insert(zttid, Arc::clone(&new_tli)); @@ -594,8 +729,7 @@ impl GlobalTimelines { match state.timelines.get(&zttid) { Some(result) => Ok(Arc::clone(result)), None => { - let shared_state = - SharedState::restore(conf, &zttid).context("failed to restore shared state"); + let shared_state = SharedState::restore(conf, &zttid); let shared_state = match shared_state { Ok(shared_state) => shared_state, @@ -617,6 +751,7 @@ impl GlobalTimelines { let new_tli = Arc::new(Timeline::new( zttid, state.callmemaybe_tx.as_ref().unwrap().clone(), + state.wal_backup_launcher_tx.as_ref().unwrap().clone(), shared_state, )); state.timelines.insert(zttid, Arc::clone(&new_tli)); @@ -625,6 +760,12 @@ impl GlobalTimelines { } } + /// Get loaded timeline, if it exists. + pub fn get_loaded(zttid: ZTenantTimelineId) -> Option> { + let state = TIMELINES_STATE.lock().unwrap(); + state.timelines.get(&zttid).map(Arc::clone) + } + /// Get ZTenantTimelineIDs of all active timelines. pub fn get_active_timelines() -> Vec { let state = TIMELINES_STATE.lock().unwrap(); @@ -665,22 +806,23 @@ impl GlobalTimelines { /// b) an HTTP GET request about the timeline is made and it's able to restore the current state, or /// c) an HTTP POST request for timeline creation is made after the timeline is already deleted. /// TODO: ensure all of the above never happens. 
- pub fn delete_force( + pub async fn delete_force( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, ) -> Result { info!("deleting timeline {}", zttid); - let was_active = match TIMELINES_STATE.lock().unwrap().timelines.remove(zttid) { - None => false, - Some(tli) => tli.deactivate_for_delete()?, - }; + let timeline = TIMELINES_STATE.lock().unwrap().timelines.remove(zttid); + let mut was_active = false; + if let Some(tli) = timeline { + was_active = tli.deactivate_for_delete().await?; + } GlobalTimelines::delete_force_internal(conf, zttid, was_active) } /// Deactivates and deletes all timelines for the tenant, see `delete()`. /// Returns map of all timelines which the tenant had, `true` if a timeline was active. /// There may be a race if new timelines are created simultaneously. - pub fn delete_force_all_for_tenant( + pub async fn delete_force_all_for_tenant( conf: &SafeKeeperConf, tenant_id: &ZTenantId, ) -> Result> { @@ -691,14 +833,15 @@ impl GlobalTimelines { let timelines = &mut TIMELINES_STATE.lock().unwrap().timelines; for (&zttid, tli) in timelines.iter() { if zttid.tenant_id == *tenant_id { - to_delete.insert(zttid, tli.deactivate_for_delete()?); + to_delete.insert(zttid, tli.clone()); } } // TODO: test that the correct subset of timelines is removed. It's complicated because they are implicitly created currently. timelines.retain(|zttid, _| !to_delete.contains_key(zttid)); } let mut deleted = HashMap::new(); - for (zttid, was_active) in to_delete { + for (zttid, timeline) in to_delete { + let was_active = timeline.deactivate_for_delete().await?; deleted.insert( zttid, GlobalTimelines::delete_force_internal(conf, &zttid, was_active)?, diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs new file mode 100644 index 0000000000..ef8ebe14e1 --- /dev/null +++ b/safekeeper/src/wal_backup.rs @@ -0,0 +1,418 @@ +use anyhow::{Context, Result}; +use tokio::task::JoinHandle; + +use std::cmp::min; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::Duration; + +use postgres_ffi::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, PG_TLI}; +use remote_storage::{GenericRemoteStorage, RemoteStorage}; +use tokio::fs::File; +use tokio::runtime::Builder; + +use tokio::select; +use tokio::sync::mpsc::{self, Receiver, Sender}; +use tokio::sync::watch; +use tokio::time::sleep; +use tracing::*; + +use utils::{lsn::Lsn, zid::ZTenantTimelineId}; + +use crate::broker::{Election, ElectionLeader}; +use crate::timeline::{GlobalTimelines, Timeline}; +use crate::{broker, SafeKeeperConf}; + +use once_cell::sync::OnceCell; + +const BACKUP_ELECTION_NAME: &str = "WAL_BACKUP"; + +const BROKER_CONNECTION_RETRY_DELAY_MS: u64 = 1000; + +const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; +const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; + +pub fn wal_backup_launcher_thread_main( + conf: SafeKeeperConf, + wal_backup_launcher_rx: Receiver, +) { + let rt = Builder::new_multi_thread() + .worker_threads(conf.backup_runtime_threads) + .enable_all() + .build() + .expect("failed to create wal backup runtime"); + + rt.block_on(async { + wal_backup_launcher_main_loop(conf, wal_backup_launcher_rx).await; + }); +} + +/// Check whether wal backup is required for timeline and mark that launcher is +/// aware of current status (if timeline exists). 
+fn is_wal_backup_required(zttid: ZTenantTimelineId) -> bool { + if let Some(tli) = GlobalTimelines::get_loaded(zttid) { + tli.wal_backup_attend() + } else { + false + } +} + +struct WalBackupTaskHandle { + shutdown_tx: Sender<()>, + handle: JoinHandle<()>, +} + +/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup +/// tasks. Having this in separate task simplifies locking, allows to reap +/// panics and separate elections from offloading itself. +async fn wal_backup_launcher_main_loop( + conf: SafeKeeperConf, + mut wal_backup_launcher_rx: Receiver, +) { + info!( + "wal backup launcher started, remote config {:?}", + conf.remote_storage + ); + + let conf_ = conf.clone(); + REMOTE_STORAGE.get_or_init(|| { + conf_.remote_storage.as_ref().map(|c| { + GenericRemoteStorage::new(conf_.workdir, c).expect("failed to create remote storage") + }) + }); + + let mut tasks: HashMap = HashMap::new(); + + loop { + // channel is never expected to get closed + let zttid = wal_backup_launcher_rx.recv().await.unwrap(); + let is_wal_backup_required = is_wal_backup_required(zttid); + if conf.remote_storage.is_none() || !conf.wal_backup_enabled { + continue; /* just drain the channel and do nothing */ + } + // do we need to do anything at all? + if is_wal_backup_required != tasks.contains_key(&zttid) { + if is_wal_backup_required { + // need to start the task + info!("starting wal backup task for {}", zttid); + + // TODO: decide who should offload in launcher itself by simply checking current state + let election_name = broker::get_campaign_name( + BACKUP_ELECTION_NAME.to_string(), + conf.broker_etcd_prefix.clone(), + &zttid, + ); + let my_candidate_name = broker::get_candiate_name(conf.my_id); + let election = broker::Election::new( + election_name, + my_candidate_name, + conf.broker_endpoints.clone(), + ); + + let (shutdown_tx, shutdown_rx) = mpsc::channel(1); + let timeline_dir = conf.timeline_dir(&zttid); + + let handle = tokio::spawn( + backup_task_main(zttid, timeline_dir, shutdown_rx, election) + .instrument(info_span!("WAL backup", zttid = %zttid)), + ); + + tasks.insert( + zttid, + WalBackupTaskHandle { + shutdown_tx, + handle, + }, + ); + } else { + // need to stop the task + info!("stopping wal backup task for {}", zttid); + + let wb_handle = tasks.remove(&zttid).unwrap(); + // Tell the task to shutdown. Error means task exited earlier, that's ok. + let _ = wb_handle.shutdown_tx.send(()).await; + // Await the task itself. TODO: restart panicked tasks earlier. + // Hm, why I can't await on reference to handle? + if let Err(e) = wb_handle.handle.await { + warn!("WAL backup task for {} panicked: {}", zttid, e); + } + } + } + } +} + +struct WalBackupTask { + timeline: Arc, + timeline_dir: PathBuf, + wal_seg_size: usize, + commit_lsn_watch_rx: watch::Receiver, + leader: Option, + election: Election, +} + +/// Offload single timeline. +async fn backup_task_main( + zttid: ZTenantTimelineId, + timeline_dir: PathBuf, + mut shutdown_rx: Receiver<()>, + election: Election, +) { + info!("started"); + let timeline: Arc = if let Some(tli) = GlobalTimelines::get_loaded(zttid) { + tli + } else { + /* Timeline could get deleted while task was starting, just exit then. 
*/ + info!("no timeline, exiting"); + return; + }; + + let mut wb = WalBackupTask { + wal_seg_size: timeline.get_wal_seg_size(), + commit_lsn_watch_rx: timeline.get_commit_lsn_watch_rx(), + timeline, + timeline_dir, + leader: None, + election, + }; + + // task is spinned up only when wal_seg_size already initialized + assert!(wb.wal_seg_size > 0); + + let mut canceled = false; + select! { + _ = wb.run() => {} + _ = shutdown_rx.recv() => { + canceled = true; + } + } + if let Some(l) = wb.leader { + l.give_up().await; + } + info!("task {}", if canceled { "canceled" } else { "terminated" }); +} + +impl WalBackupTask { + async fn run(&mut self) { + let mut backup_lsn = Lsn(0); + + // election loop + loop { + let mut retry_attempt = 0u32; + + if let Some(l) = self.leader.take() { + l.give_up().await; + } + + match broker::get_leader(&self.election).await { + Ok(l) => { + self.leader = Some(l); + } + Err(e) => { + error!("error during leader election {:?}", e); + sleep(Duration::from_millis(BROKER_CONNECTION_RETRY_DELAY_MS)).await; + continue; + } + } + + // offload loop + loop { + if retry_attempt == 0 { + // wait for new WAL to arrive + if let Err(e) = self.commit_lsn_watch_rx.changed().await { + // should never happen, as we hold Arc to timeline. + error!("commit_lsn watch shut down: {:?}", e); + return; + } + } else { + // or just sleep if we errored previously + let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS; + if let Some(backoff_delay) = + UPLOAD_FAILURE_RETRY_MIN_MS.checked_shl(retry_attempt) + { + retry_delay = min(retry_delay, backoff_delay); + } + sleep(Duration::from_millis(retry_delay)).await; + } + + let commit_lsn = *self.commit_lsn_watch_rx.borrow(); + assert!( + commit_lsn >= backup_lsn, + "backup lsn should never pass commit lsn" + ); + + if backup_lsn.segment_number(self.wal_seg_size) + == commit_lsn.segment_number(self.wal_seg_size) + { + continue; /* nothing to do, common case as we wake up on every commit_lsn bump */ + } + // Perhaps peers advanced the position, check shmem value. 
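+                // If they did, skip segments somebody else already uploaded
+                // instead of re-uploading them.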
+ backup_lsn = self.timeline.get_wal_backup_lsn(); + if backup_lsn.segment_number(self.wal_seg_size) + == commit_lsn.segment_number(self.wal_seg_size) + { + continue; + } + + if let Some(l) = self.leader.as_mut() { + // Optimization idea for later: + // Avoid checking election leader every time by returning current lease grant expiration time + // Re-check leadership only after expiration time, + // such approach woud reduce overhead on write-intensive workloads + + match l + .check_am_i( + self.election.election_name.clone(), + self.election.candidate_name.clone(), + ) + .await + { + Ok(leader) => { + if !leader { + info!("leader has changed"); + break; + } + } + Err(e) => { + warn!("error validating leader, {:?}", e); + break; + } + } + } + + match backup_lsn_range( + backup_lsn, + commit_lsn, + self.wal_seg_size, + &self.timeline_dir, + ) + .await + { + Ok(backup_lsn_result) => { + backup_lsn = backup_lsn_result; + self.timeline.set_wal_backup_lsn(backup_lsn_result); + retry_attempt = 0; + } + Err(e) => { + error!( + "failed while offloading range {}-{}: {:?}", + backup_lsn, commit_lsn, e + ); + + retry_attempt = min(retry_attempt + 1, u32::MAX); + } + } + } + } + } +} + +pub async fn backup_lsn_range( + start_lsn: Lsn, + end_lsn: Lsn, + wal_seg_size: usize, + timeline_dir: &Path, +) -> Result { + let mut res = start_lsn; + let segments = get_segments(start_lsn, end_lsn, wal_seg_size); + for s in &segments { + backup_single_segment(s, timeline_dir) + .await + .with_context(|| format!("offloading segno {}", s.seg_no))?; + + res = s.end_lsn; + } + info!( + "offloaded segnos {:?} up to {}, previous backup_lsn {}", + segments.iter().map(|&s| s.seg_no).collect::>(), + end_lsn, + start_lsn, + ); + Ok(res) +} + +async fn backup_single_segment(seg: &Segment, timeline_dir: &Path) -> Result<()> { + let segment_file_name = seg.file_path(timeline_dir)?; + + backup_object(&segment_file_name, seg.size()).await?; + debug!("Backup of {} done", segment_file_name.display()); + + Ok(()) +} + +#[derive(Debug, Copy, Clone)] +pub struct Segment { + seg_no: XLogSegNo, + start_lsn: Lsn, + end_lsn: Lsn, +} + +impl Segment { + pub fn new(seg_no: u64, start_lsn: Lsn, end_lsn: Lsn) -> Self { + Self { + seg_no, + start_lsn, + end_lsn, + } + } + + pub fn object_name(self) -> String { + XLogFileName(PG_TLI, self.seg_no, self.size()) + } + + pub fn file_path(self, timeline_dir: &Path) -> Result { + Ok(timeline_dir.join(self.object_name())) + } + + pub fn size(self) -> usize { + (u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize + } +} + +fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec { + let first_seg = start.segment_number(seg_size); + let last_seg = end.segment_number(seg_size); + + let res: Vec = (first_seg..last_seg) + .map(|s| { + let start_lsn = XLogSegNoOffsetToRecPtr(s, 0, seg_size); + let end_lsn = XLogSegNoOffsetToRecPtr(s + 1, 0, seg_size); + Segment::new(s, Lsn::from(start_lsn), Lsn::from(end_lsn)) + }) + .collect(); + res +} + +static REMOTE_STORAGE: OnceCell> = OnceCell::new(); + +async fn backup_object(source_file: &Path, size: usize) -> Result<()> { + let storage = REMOTE_STORAGE.get().expect("failed to get remote storage"); + + let file = File::open(&source_file).await?; + + // Storage is initialized by launcher at ths point. 
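+    // The launcher only spawns backup tasks when remote storage is configured,
+    // so the unwrap below cannot fail.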
+ match storage.as_ref().unwrap() { + GenericRemoteStorage::Local(local_storage) => { + let destination = local_storage.remote_object_id(source_file)?; + + debug!( + "local upload about to start from {} to {}", + source_file.display(), + destination.display() + ); + local_storage.upload(file, size, &destination, None).await + } + GenericRemoteStorage::S3(s3_storage) => { + let s3key = s3_storage.remote_object_id(source_file)?; + + debug!( + "S3 upload about to start from {} to {:?}", + source_file.display(), + s3key + ); + s3_storage.upload(file, size, &s3key, None).await + } + }?; + + Ok(()) +} diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index e1b7bd91ee..fc192c28e8 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -12,7 +12,7 @@ from contextlib import closing from dataclasses import dataclass, field from multiprocessing import Process, Value from pathlib import Path -from fixtures.zenith_fixtures import PgBin, Etcd, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol +from fixtures.zenith_fixtures import PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol from fixtures.utils import get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex from fixtures.log_helper import log from typing import List, Optional, Any @@ -401,7 +401,7 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): http_cli = env.safekeepers[0].http_client() # Pretend WAL is offloaded to s3. - http_cli.record_safekeeper_info(tenant_id, timeline_id, {'s3_wal_lsn': 'FFFFFFFF/FEFFFFFF'}) + http_cli.record_safekeeper_info(tenant_id, timeline_id, {'backup_lsn': 'FFFFFFFF/FEFFFFFF'}) # wait till first segment is removed on all safekeepers started_at = time.time() @@ -414,6 +414,56 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): time.sleep(0.5) +@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs']) +def test_wal_backup(zenith_env_builder: ZenithEnvBuilder, storage_type: str): + zenith_env_builder.num_safekeepers = 3 + if storage_type == 'local_fs': + zenith_env_builder.enable_local_fs_remote_storage() + elif storage_type == 'mock_s3': + zenith_env_builder.enable_s3_mock_remote_storage('test_safekeepers_wal_backup') + else: + raise RuntimeError(f'Unknown storage type: {storage_type}') + zenith_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER + + env = zenith_env_builder.init_start() + + env.zenith_cli.create_branch('test_safekeepers_wal_backup') + pg = env.postgres.create_start('test_safekeepers_wal_backup') + + # learn zenith timeline from compute + tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] + timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + + pg_conn = pg.connect() + cur = pg_conn.cursor() + cur.execute('create table t(key int, value text)') + + # Shut down subsequently each of safekeepers and fill a segment while sk is + # down; ensure segment gets offloaded by others. 
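+    # With one safekeeper stopped, one of the two remaining ones must win the
+    # backup election and upload the segment.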
+ offloaded_seg_end = ['0/2000000', '0/3000000', '0/4000000'] + for victim, seg_end in zip(env.safekeepers, offloaded_seg_end): + victim.stop() + # roughly fills one segment + cur.execute("insert into t select generate_series(1,250000), 'payload'") + live_sk = [sk for sk in env.safekeepers if sk != victim][0] + http_cli = live_sk.http_client() + + started_at = time.time() + while True: + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"live sk status is {tli_status}") + + if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end): + break + elapsed = time.time() - started_at + if elapsed > 20: + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s segment ending at {seg_end} get offloaded") + time.sleep(0.5) + + victim.start() + + class ProposerPostgres(PgProtocol): """Object for running postgres without ZenithEnv""" def __init__(self, diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 7f5b2ad2aa..a2e8c82d30 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1,6 +1,7 @@ from __future__ import annotations from dataclasses import field +from enum import Flag, auto import textwrap from cached_property import cached_property import asyncpg @@ -421,10 +422,51 @@ class MockS3Server: def secret_key(self) -> str: return 'test' + def access_env_vars(self) -> Dict[Any, Any]: + return { + 'AWS_ACCESS_KEY_ID': self.access_key(), + 'AWS_SECRET_ACCESS_KEY': self.secret_key(), + } + def kill(self): self.subprocess.kill() +@dataclass +class LocalFsStorage: + local_path: Path + + +@dataclass +class S3Storage: + bucket_name: str + bucket_region: str + endpoint: Optional[str] + + +RemoteStorage = Union[LocalFsStorage, S3Storage] + + +# serialize as toml inline table +def remote_storage_to_toml_inline_table(remote_storage): + if isinstance(remote_storage, LocalFsStorage): + res = f"local_path='{remote_storage.local_path}'" + elif isinstance(remote_storage, S3Storage): + res = f"bucket_name='{remote_storage.bucket_name}', bucket_region='{remote_storage.bucket_region}'" + if remote_storage.endpoint is not None: + res += f", endpoint='{remote_storage.endpoint}'" + else: + raise Exception(f'Unknown storage configuration {remote_storage}') + else: + raise Exception("invalid remote storage type") + return f"{{{res}}}" + + +class RemoteStorageUsers(Flag): + PAGESERVER = auto() + SAFEKEEPER = auto() + + class ZenithEnvBuilder: """ Builder object to create a Zenith runtime environment @@ -440,6 +482,7 @@ class ZenithEnvBuilder: broker: Etcd, mock_s3_server: MockS3Server, remote_storage: Optional[RemoteStorage] = None, + remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, pageserver_config_override: Optional[str] = None, num_safekeepers: int = 1, pageserver_auth_enabled: bool = False, @@ -449,6 +492,7 @@ class ZenithEnvBuilder: self.rust_log_override = rust_log_override self.port_distributor = port_distributor self.remote_storage = remote_storage + self.remote_storage_users = remote_storage_users self.broker = broker self.mock_s3_server = mock_s3_server self.pageserver_config_override = pageserver_config_override @@ -497,9 +541,9 @@ class ZenithEnvBuilder: aws_access_key_id=self.mock_s3_server.access_key(), aws_secret_access_key=self.mock_s3_server.secret_key(), ).create_bucket(Bucket=bucket_name) - self.remote_storage = S3Storage(bucket=bucket_name, + self.remote_storage = S3Storage(bucket_name=bucket_name, endpoint=mock_endpoint, - region=mock_region) + 
bucket_region=mock_region) def __enter__(self): return self @@ -557,6 +601,7 @@ class ZenithEnv: self.safekeepers: List[Safekeeper] = [] self.broker = config.broker self.remote_storage = config.remote_storage + self.remote_storage_users = config.remote_storage_users # generate initial tenant ID here instead of letting 'zenith init' generate it, # so that we don't need to dig it out of the config file afterwards. @@ -605,8 +650,12 @@ class ZenithEnv: id = {id} pg_port = {port.pg} http_port = {port.http} - sync = false # Disable fsyncs to make the tests go faster - """) + sync = false # Disable fsyncs to make the tests go faster""") + if bool(self.remote_storage_users + & RemoteStorageUsers.SAFEKEEPER) and self.remote_storage is not None: + toml += textwrap.dedent(f""" + remote_storage = "{remote_storage_to_toml_inline_table(self.remote_storage)}" + """) safekeeper = Safekeeper(env=self, id=id, port=port) self.safekeepers.append(safekeeper) @@ -638,7 +687,7 @@ def _shared_simple_env(request: Any, mock_s3_server: MockS3Server, default_broker: Etcd) -> Iterator[ZenithEnv]: """ - Internal fixture backing the `zenith_simple_env` fixture. If TEST_SHARED_FIXTURES + # Internal fixture backing the `zenith_simple_env` fixture. If TEST_SHARED_FIXTURES is set, this is shared by all tests using `zenith_simple_env`. """ @@ -822,20 +871,6 @@ class PageserverPort: http: int -@dataclass -class LocalFsStorage: - root: Path - - -@dataclass -class S3Storage: - bucket: str - region: str - endpoint: Optional[str] - - -RemoteStorage = Union[LocalFsStorage, S3Storage] - CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P[^']+)'", re.MULTILINE) CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P[^']+)'", @@ -998,6 +1033,7 @@ class ZenithCli: append_pageserver_param_overrides( params_to_update=cmd, remote_storage=self.env.remote_storage, + remote_storage_users=self.env.remote_storage_users, pageserver_config_override=self.env.pageserver.config_override) res = self.raw_cli(cmd) @@ -1022,14 +1058,10 @@ class ZenithCli: append_pageserver_param_overrides( params_to_update=start_args, remote_storage=self.env.remote_storage, + remote_storage_users=self.env.remote_storage_users, pageserver_config_override=self.env.pageserver.config_override) - s3_env_vars = None - if self.env.s3_mock_server: - s3_env_vars = { - 'AWS_ACCESS_KEY_ID': self.env.s3_mock_server.access_key(), - 'AWS_SECRET_ACCESS_KEY': self.env.s3_mock_server.secret_key(), - } + s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None return self.raw_cli(start_args, extra_env_vars=s3_env_vars) def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]': @@ -1041,7 +1073,8 @@ class ZenithCli: return self.raw_cli(cmd) def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]': - return self.raw_cli(['safekeeper', 'start', str(id)]) + s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None + return self.raw_cli(['safekeeper', 'start', str(id)], extra_env_vars=s3_env_vars) def safekeeper_stop(self, id: Optional[int] = None, @@ -1237,22 +1270,13 @@ class ZenithPageserver(PgProtocol): def append_pageserver_param_overrides( params_to_update: List[str], remote_storage: Optional[RemoteStorage], + remote_storage_users: RemoteStorageUsers, pageserver_config_override: Optional[str] = None, ): - if remote_storage is not None: - if isinstance(remote_storage, LocalFsStorage): - pageserver_storage_override = 
f"local_path='{remote_storage.root}'" - elif isinstance(remote_storage, S3Storage): - pageserver_storage_override = f"bucket_name='{remote_storage.bucket}',\ - bucket_region='{remote_storage.region}'" - - if remote_storage.endpoint is not None: - pageserver_storage_override += f",endpoint='{remote_storage.endpoint}'" - - else: - raise Exception(f'Unknown storage configuration {remote_storage}') + if bool(remote_storage_users & RemoteStorageUsers.PAGESERVER) and remote_storage is not None: + remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) params_to_update.append( - f'--pageserver-config-override=remote_storage={{{pageserver_storage_override}}}') + f'--pageserver-config-override=remote_storage={remote_storage_toml_table}') env_overrides = os.getenv('ZENITH_PAGESERVER_OVERRIDES') if env_overrides is not None: @@ -1786,8 +1810,9 @@ class Safekeeper: class SafekeeperTimelineStatus: acceptor_epoch: int flush_lsn: str - remote_consistent_lsn: str timeline_start_lsn: str + backup_lsn: str + remote_consistent_lsn: str @dataclass @@ -1812,8 +1837,9 @@ class SafekeeperHttpClient(requests.Session): resj = res.json() return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'], flush_lsn=resj['flush_lsn'], - remote_consistent_lsn=resj['remote_consistent_lsn'], - timeline_start_lsn=resj['timeline_start_lsn']) + timeline_start_lsn=resj['timeline_start_lsn'], + backup_lsn=resj['backup_lsn'], + remote_consistent_lsn=resj['remote_consistent_lsn']) def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body): res = self.post( From 54b75248ff53cd3530916200d9156a491c16b8dd Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 27 May 2022 13:09:17 +0400 Subject: [PATCH 0359/1022] s3 WAL offloading staging review. - Uncomment accidently `self.keep_alive.abort()` commented line, due to this task never finished, which blocked launcher. - Mess up with initialization one more time, to fix offloader trying to back up segment 0. Now we initialize all required LSNs in handle_elected, where we learn start LSN for the first time. - Fix blind attempt to provide safekeeper service file with remote storage params. 
--- .circleci/ansible/systemd/safekeeper.service | 2 +- libs/utils/src/zid.rs | 2 +- safekeeper/src/broker.rs | 2 +- safekeeper/src/safekeeper.rs | 50 +++++++++----------- safekeeper/src/wal_backup.rs | 19 ++++---- 5 files changed, 35 insertions(+), 40 deletions(-) diff --git a/.circleci/ansible/systemd/safekeeper.service b/.circleci/ansible/systemd/safekeeper.service index a6b443c3e7..e4a395a60e 100644 --- a/.circleci/ansible/systemd/safekeeper.service +++ b/.circleci/ansible/systemd/safekeeper.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=safekeeper Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib -ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote_storage='{bucket_name={{bucket_name}}, bucket_region={{bucket_region}}, prefix_in_bucket=wal}' +ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="wal"}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT diff --git a/libs/utils/src/zid.rs b/libs/utils/src/zid.rs index 02f781c49a..0ef174da4d 100644 --- a/libs/utils/src/zid.rs +++ b/libs/utils/src/zid.rs @@ -218,7 +218,7 @@ impl ZTenantTimelineId { impl fmt::Display for ZTenantTimelineId { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}-{}", self.tenant_id, self.timeline_id) + write!(f, "{}/{}", self.tenant_id, self.timeline_id) } } diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 676719b60d..5bcb197205 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -83,7 +83,7 @@ impl ElectionLeader { } pub async fn give_up(self) { - // self.keep_alive.abort(); + self.keep_alive.abort(); // TODO: it'll be wise to resign here but it'll happen after lease expiration anyway // should we await for keep alive termination? let _ = self.keep_alive.await; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 9a07127771..0a7adb96b6 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -731,24 +731,36 @@ where { let mut state = self.state.clone(); - // Remeber point where WAL begins globally, if not yet. + // Here we learn initial LSN for the first time, set fields + // interested in that. + if state.timeline_start_lsn == Lsn(0) { + // Remember point where WAL begins globally. state.timeline_start_lsn = msg.timeline_start_lsn; info!( "setting timeline_start_lsn to {:?}", state.timeline_start_lsn ); - } - // Remember point where WAL begins locally, if not yet. (I doubt the - // second condition is ever possible) - if state.local_start_lsn == Lsn(0) || state.local_start_lsn >= msg.start_streaming_at { state.local_start_lsn = msg.start_streaming_at; info!("setting local_start_lsn to {:?}", state.local_start_lsn); } + // Initializing commit_lsn before acking first flushed record is + // important to let find_end_of_wal skip the whole in the beginning + // of the first segment. + // + // NB: on new clusters, this happens at the same time as + // timeline_start_lsn initialization, it is taken outside to provide + // upgrade. 
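+        // (i.e. the clamping below also runs for pre-existing timelines whose
+        // control file already has timeline_start_lsn set)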
+ self.global_commit_lsn = max(self.global_commit_lsn, state.timeline_start_lsn); + self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn); + self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); + + // Initalizing backup_lsn is useful to avoid making backup think it should upload 0 segment. + self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); state.acceptor_state.term_history = msg.term_history.clone(); - self.state.persist(&state)?; + self.persist_control_file(state)?; } info!("start receiving WAL since {:?}", msg.start_streaming_at); @@ -764,14 +776,6 @@ where self.inmem.commit_lsn = commit_lsn; self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); - // We got our first commit_lsn, which means we should sync - // everything to disk, to initialize the state. - if self.state.commit_lsn == Lsn::INVALID && commit_lsn != Lsn::INVALID { - self.inmem.backup_lsn = self.inmem.commit_lsn; // initialize backup_lsn - self.wal_store.flush_wal()?; - self.persist_control_file()?; - } - // If new commit_lsn reached epoch switch, force sync of control // file: walproposer in sync mode is very interested when this // happens. Note: this is for sync-safekeepers mode only, as @@ -780,15 +784,14 @@ where // that we receive new epoch_start_lsn, and we still need to sync // control file in this case. if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn { - self.persist_control_file()?; + self.persist_control_file(self.state.clone())?; } Ok(()) } - /// Persist in-memory state to the disk. - fn persist_control_file(&mut self) -> Result<()> { - let mut state = self.state.clone(); + /// Persist in-memory state to the disk, taking other data from state. + fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> { state.commit_lsn = self.inmem.commit_lsn; state.backup_lsn = self.inmem.backup_lsn; state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; @@ -823,13 +826,6 @@ where // do the job if !msg.wal_data.is_empty() { self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?; - - // If this was the first record we ever received, initialize - // commit_lsn to help find_end_of_wal skip the hole in the - // beginning. 
- if self.global_commit_lsn == Lsn(0) { - self.global_commit_lsn = msg.h.begin_lsn; - } } // flush wal to the disk, if required @@ -852,7 +848,7 @@ where if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) < self.inmem.peer_horizon_lsn { - self.persist_control_file()?; + self.persist_control_file(self.state.clone())?; } trace!( @@ -920,7 +916,7 @@ where self.inmem.peer_horizon_lsn = new_peer_horizon_lsn; } if sync_control_file { - self.persist_control_file()?; + self.persist_control_file(self.state.clone())?; } Ok(()) } diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index ef8ebe14e1..83dc312d28 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -71,7 +71,7 @@ async fn wal_backup_launcher_main_loop( mut wal_backup_launcher_rx: Receiver, ) { info!( - "wal backup launcher started, remote config {:?}", + "WAL backup launcher: started, remote config {:?}", conf.remote_storage ); @@ -95,7 +95,7 @@ async fn wal_backup_launcher_main_loop( if is_wal_backup_required != tasks.contains_key(&zttid) { if is_wal_backup_required { // need to start the task - info!("starting wal backup task for {}", zttid); + info!("starting WAL backup task for {}", zttid); // TODO: decide who should offload in launcher itself by simply checking current state let election_name = broker::get_campaign_name( @@ -115,7 +115,7 @@ async fn wal_backup_launcher_main_loop( let handle = tokio::spawn( backup_task_main(zttid, timeline_dir, shutdown_rx, election) - .instrument(info_span!("WAL backup", zttid = %zttid)), + .instrument(info_span!("WAL backup task", zttid = %zttid)), ); tasks.insert( @@ -127,7 +127,7 @@ async fn wal_backup_launcher_main_loop( ); } else { // need to stop the task - info!("stopping wal backup task for {}", zttid); + info!("stopping WAL backup task for {}", zttid); let wb_handle = tasks.remove(&zttid).unwrap(); // Tell the task to shutdown. Error means task exited earlier, that's ok. @@ -236,20 +236,19 @@ impl WalBackupTask { } let commit_lsn = *self.commit_lsn_watch_rx.borrow(); - assert!( - commit_lsn >= backup_lsn, - "backup lsn should never pass commit lsn" - ); + // Note that backup_lsn can be higher than commit_lsn if we + // don't have much local WAL and others already uploaded + // segments we don't even have. if backup_lsn.segment_number(self.wal_seg_size) - == commit_lsn.segment_number(self.wal_seg_size) + >= commit_lsn.segment_number(self.wal_seg_size) { continue; /* nothing to do, common case as we wake up on every commit_lsn bump */ } // Perhaps peers advanced the position, check shmem value. backup_lsn = self.timeline.get_wal_backup_lsn(); if backup_lsn.segment_number(self.wal_seg_size) - == commit_lsn.segment_number(self.wal_seg_size) + >= commit_lsn.segment_number(self.wal_seg_size) { continue; } From 75f71a63801c687a8bebe6aea28d751da52ac677 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Fri, 27 May 2022 11:43:06 -0400 Subject: [PATCH 0360/1022] Handle broken timelines on startup (#1809) Resolve #1663. 
## Changes - ignore a "broken" [1] timeline on page server startup - fix the race condition when creating multiple timelines in parallel for a tenant - added tests for the above changes [1]: a timeline is marked as "broken" if either - failed to load the timeline's metadata or - the timeline's disk consistent LSN is zero --- pageserver/src/layered_repository.rs | 2 +- pageserver/src/tenant_mgr.rs | 31 +++++++++++++++- pageserver/src/timelines.rs | 9 ++++- .../batch_others/test_broken_timeline.py | 37 ++++++++++++++++++- 4 files changed, 74 insertions(+), 5 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index d10c795214..0d7c6f54c8 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -2518,7 +2518,7 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { bail!("couldn't find an unused backup number for {:?}", path) } -fn load_metadata( +pub fn load_metadata( conf: &'static PageServerConf, timeline_id: ZTimelineId, tenant_id: ZTenantId, diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index bba67394c3..cc35d79d16 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -2,7 +2,7 @@ //! page server. use crate::config::PageServerConf; -use crate::layered_repository::LayeredRepository; +use crate::layered_repository::{load_metadata, LayeredRepository}; use crate::pgdatadir_mapping::DatadirTimeline; use crate::repository::{Repository, TimelineSyncStatusUpdate}; use crate::storage_sync::index::RemoteIndex; @@ -22,6 +22,7 @@ use std::collections::HashMap; use std::fmt; use std::sync::Arc; use tracing::*; +use utils::lsn::Lsn; use utils::zid::{ZTenantId, ZTimelineId}; @@ -399,6 +400,26 @@ pub fn list_tenants() -> Vec { .collect() } +/// Check if a given timeline is "broken" \[1\]. +/// The function returns an error if the timeline is "broken". +/// +/// \[1\]: it's not clear now how should we classify a timeline as broken. +/// A timeline is categorized as broken when any of following conditions is true: +/// - failed to load the timeline's metadata +/// - the timeline's disk consistent LSN is zero +fn check_broken_timeline(repo: &LayeredRepository, timeline_id: ZTimelineId) -> anyhow::Result<()> { + let metadata = load_metadata(repo.conf, timeline_id, repo.tenant_id()) + .context("failed to load metadata")?; + + // A timeline with zero disk consistent LSN can happen when the page server + // failed to checkpoint the timeline import data when creating that timeline. 
+ if metadata.disk_consistent_lsn() == Lsn::INVALID { + bail!("Timeline {timeline_id} has a zero disk consistent LSN."); + } + + Ok(()) +} + fn init_local_repository( conf: &'static PageServerConf, tenant_id: ZTenantId, @@ -414,7 +435,13 @@ fn init_local_repository( match init_status { LocalTimelineInitStatus::LocallyComplete => { debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository"); - status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded); + if let Err(err) = check_broken_timeline(&repo, timeline_id) { + info!( + "Found a broken timeline {timeline_id} (err={err:?}), skip registering it in repository" + ); + } else { + status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded); + } } LocalTimelineInitStatus::NeedsSync => { debug!( diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 408eca6501..9ab063107c 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -285,7 +285,9 @@ fn bootstrap_timeline( ) -> Result<()> { let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered(); - let initdb_path = conf.tenant_path(&tenantid).join("tmp"); + let initdb_path = conf + .tenant_path(&tenantid) + .join(format!("tmp-timeline-{}", tli)); // Init temporarily repo to get bootstrap data run_initdb(conf, &initdb_path)?; @@ -300,6 +302,11 @@ fn bootstrap_timeline( let timeline = repo.create_empty_timeline(tli, lsn)?; let mut page_tline: DatadirTimeline = DatadirTimeline::new(timeline, u64::MAX); import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?; + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + bail!("failpoint before-checkpoint-new-timeline"); + }); + page_tline.tline.checkpoint(CheckpointConfig::Forced)?; info!( diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/batch_others/test_broken_timeline.py index 17eadb33b4..f0aa44e0a4 100644 --- a/test_runner/batch_others/test_broken_timeline.py +++ b/test_runner/batch_others/test_broken_timeline.py @@ -1,6 +1,7 @@ import pytest +import concurrent.futures from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithEnv from fixtures.log_helper import log import os @@ -78,3 +79,37 @@ def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder): with pytest.raises(Exception, match="Cannot load local timeline") as err: pg.start() log.info(f'compute startup failed as expected: {err}') + + +def test_create_multiple_timelines_parallel(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + + tenant_id, _ = env.zenith_cli.create_tenant() + + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + futures = [ + executor.submit(env.zenith_cli.create_timeline, + f"test-create-multiple-timelines-{i}", + tenant_id) for i in range(4) + ] + for future in futures: + future.result() + + +def test_fix_broken_timelines_on_startup(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + + tenant_id, _ = env.zenith_cli.create_tenant() + + # Introduce failpoint when creating a new timeline + env.pageserver.safe_psql(f"failpoints before-checkpoint-new-timeline=return") + with pytest.raises(Exception, match="before-checkpoint-new-timeline"): + _ = env.zenith_cli.create_timeline("test_fix_broken_timelines", tenant_id) + + # Restart the page server + env.zenith_cli.pageserver_stop(immediate=True) + env.zenith_cli.pageserver_start() + + # 
Check that the "broken" timeline is not loaded + timelines = env.zenith_cli.list_timelines(tenant_id) + assert len(timelines) == 1 From cb8bf1beb606fa97eeee0f038d28af4c7327af34 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 27 May 2022 14:10:10 +0400 Subject: [PATCH 0361/1022] Prevent commit_lsn <= flush_lsn violation after a42eba3cd7. Nothing complained about that yet, but we definitely don't hold at least one assert, so let's keep it this way until better version. --- safekeeper/src/safekeeper.rs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 0a7adb96b6..c254f2c57c 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -576,13 +576,16 @@ where self.state .acceptor_state .term_history - .up_to(self.wal_store.flush_lsn()) + .up_to(self.flush_lsn()) } pub fn get_epoch(&self) -> Term { - self.state - .acceptor_state - .get_epoch(self.wal_store.flush_lsn()) + self.state.acceptor_state.get_epoch(self.flush_lsn()) + } + + /// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet. + fn flush_lsn(&self) -> Lsn { + max(self.wal_store.flush_lsn(), self.state.timeline_start_lsn) } /// Process message from proposer and possibly form reply. Concurrent @@ -671,7 +674,7 @@ where let mut resp = VoteResponse { term: self.state.acceptor_state.term, vote_given: false as u64, - flush_lsn: self.wal_store.flush_lsn(), + flush_lsn: self.flush_lsn(), truncate_lsn: self.state.peer_horizon_lsn, term_history: self.get_term_history(), timeline_start_lsn: self.state.timeline_start_lsn, @@ -703,7 +706,7 @@ where fn append_response(&self) -> AppendResponse { let ar = AppendResponse { term: self.state.acceptor_state.term, - flush_lsn: self.wal_store.flush_lsn(), + flush_lsn: self.flush_lsn(), commit_lsn: self.state.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), @@ -770,7 +773,7 @@ where /// Advance commit_lsn taking into account what we have locally pub fn update_commit_lsn(&mut self) -> Result<()> { - let commit_lsn = min(self.global_commit_lsn, self.wal_store.flush_lsn()); + let commit_lsn = min(self.global_commit_lsn, self.flush_lsn()); assert!(commit_lsn >= self.inmem.commit_lsn); self.inmem.commit_lsn = commit_lsn; From 757746b5717eec6e0c338e41f19844ec077852e7 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Fri, 27 May 2022 13:33:53 -0400 Subject: [PATCH 0362/1022] Fix `test_pageserver_http_get_wal_receiver_success` flaky test. (#1786) Fixes #1768. ## Context Previously, to test `get_wal_receiver` API, we make run some DB transactions then call the API to check the latest message's LSN from the WAL receiver. However, this test won't work because it's not guaranteed that the WAL receiver will get the latest WAL from the postgres/safekeeper at the time of making the API call. This PR resolves the above issue by adding a "poll and wait" code that waits to retrieve the latest data from the WAL receiver. This PR also fixes a bug that tries to compare two hex LSNs, should convert to number before the comparison. See: https://github.com/neondatabase/neon/issues/1768#issuecomment-1133752122. 
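The hex-comparison pitfall is worth spelling out, since it is easy to
reintroduce. Below is a small self-contained Rust sketch; the helper mirrors the
test fixtures' lsn_from_hex only in spirit, and its name and signature here are
illustrative:

    /// Parse an "X/Y" hex LSN into a number. Comparing the raw strings gives
    /// wrong answers as soon as the hex halves differ in length.
    fn lsn_from_hex(s: &str) -> u64 {
        let (hi, lo) = s.split_once('/').expect("LSN must look like 'X/Y'");
        (u64::from_str_radix(hi, 16).unwrap() << 32) | u64::from_str_radix(lo, 16).unwrap()
    }

    fn main() {
        // Lexicographically "0/9000000" sorts after "0/10000000"...
        assert!("0/9000000" > "0/10000000");
        // ...but numerically it is the smaller LSN.
        assert!(lsn_from_hex("0/9000000") < lsn_from_hex("0/10000000"));
    }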
--- .../batch_others/test_pageserver_api.py | 40 ++++++++++++++----- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 7fe3b4dff5..2b0e5ae8bd 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,11 +1,14 @@ +from typing import Optional from uuid import uuid4, UUID import pytest +from fixtures.utils import lsn_from_hex from fixtures.zenith_fixtures import ( DEFAULT_BRANCH_NAME, ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient, ZenithPageserverApiException, + wait_until, ) @@ -73,18 +76,35 @@ def test_pageserver_http_get_wal_receiver_success(zenith_simple_env: ZenithEnv): tenant_id, timeline_id = env.zenith_cli.create_tenant() pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) - res = client.wal_receiver_get(tenant_id, timeline_id) - assert list(res.keys()) == [ - "thread_id", - "wal_producer_connstr", - "last_received_msg_lsn", - "last_received_msg_ts", - ] + def expect_updated_msg_lsn(prev_msg_lsn: Optional[int]) -> int: + res = client.wal_receiver_get(tenant_id, timeline_id) - # make a DB modification then expect getting a new WAL receiver's data + # a successful `wal_receiver_get` response must contain the below fields + assert list(res.keys()) == [ + "thread_id", + "wal_producer_connstr", + "last_received_msg_lsn", + "last_received_msg_ts", + ] + + assert res["last_received_msg_lsn"] is not None, "the last received message's LSN is empty" + + last_msg_lsn = lsn_from_hex(res["last_received_msg_lsn"]) + assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, \ + f"the last received message's LSN {last_msg_lsn} hasn't been updated \ + compared to the previous message's LSN {prev_msg_lsn}" + + return last_msg_lsn + + # Wait to make sure that we get a latest WAL receiver data. + # We need to wait here because it's possible that we don't have access to + # the latest WAL during the time the `wal_receiver_get` API is called. + # See: https://github.com/neondatabase/neon/issues/1768. + lsn = wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(None)) + + # Make a DB modification then expect getting a new WAL receiver's data. pg.safe_psql("CREATE TABLE t(key int primary key, value text)") - res2 = client.wal_receiver_get(tenant_id, timeline_id) - assert res2["last_received_msg_lsn"] > res["last_received_msg_lsn"] + wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(lsn)) def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): From 5d813f97386b34f020c8051bf2c5a1b06dc4e408 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 18 May 2022 16:01:56 +0300 Subject: [PATCH 0363/1022] [proxy] Refactoring This patch attempts to fix some of the technical debt we had to introduce in previous patches. 
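One pattern in this refactoring is worth a quick sketch before the diff: client
authentication errors are consolidated behind a boxed newtype with a blanket
From impl, so `?` works at every call site no matter which backend produced the
error. The snippet below is a stripped-down illustration of that shape (two
variants instead of the real set, and read_password is an invented example), not
the proxy's actual API:

    use std::io;

    #[derive(Debug)]
    enum AuthErrorImpl {
        MalformedPassword,
        Io(io::Error),
    }

    impl From<io::Error> for AuthErrorImpl {
        fn from(e: io::Error) -> Self {
            Self::Io(e)
        }
    }

    /// One-word-wide public error type; the details live behind the Box.
    #[derive(Debug)]
    struct AuthError(Box<AuthErrorImpl>);

    impl<T> From<T> for AuthError
    where
        AuthErrorImpl: From<T>,
    {
        fn from(e: T) -> Self {
            Self(Box::new(e.into()))
        }
    }

    fn read_password() -> Result<Vec<u8>, AuthError> {
        let mut buf = Vec::new();
        // io::Error converts into AuthErrorImpl and from there into AuthError,
        // so a plain `?` is enough here.
        io::Read::read_to_end(&mut io::empty(), &mut buf)?;
        Ok(buf)
    }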
--- proxy/src/auth.rs | 67 ++--- proxy/src/auth/backend.rs | 109 ++++++++ proxy/src/auth/backend/console.rs | 225 ++++++++++++++++ .../backend}/legacy_console.rs | 32 ++- .../{auth_backend => auth/backend}/link.rs | 14 +- proxy/src/auth/backend/postgres.rs | 88 +++++++ proxy/src/auth/credentials.rs | 30 ++- proxy/src/auth/flow.rs | 6 +- proxy/src/auth_backend.rs | 31 --- proxy/src/auth_backend/console.rs | 243 ------------------ proxy/src/auth_backend/postgres.rs | 93 ------- proxy/src/compute.rs | 4 +- proxy/src/config.rs | 35 ++- proxy/src/main.rs | 2 +- proxy/src/mgmt.rs | 8 +- proxy/src/url.rs | 82 ++++++ 16 files changed, 599 insertions(+), 470 deletions(-) create mode 100644 proxy/src/auth/backend.rs create mode 100644 proxy/src/auth/backend/console.rs rename proxy/src/{auth_backend => auth/backend}/legacy_console.rs (90%) rename proxy/src/{auth_backend => auth/backend}/link.rs (75%) create mode 100644 proxy/src/auth/backend/postgres.rs delete mode 100644 proxy/src/auth_backend.rs delete mode 100644 proxy/src/auth_backend/console.rs delete mode 100644 proxy/src/auth_backend/postgres.rs create mode 100644 proxy/src/url.rs diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 2463f31645..082a7bcf20 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,56 +1,58 @@ -mod credentials; -mod flow; +//! Client authentication mechanisms. -use crate::auth_backend::{console, legacy_console, link, postgres}; -use crate::config::{AuthBackendType, ProxyConfig}; -use crate::error::UserFacingError; -use crate::stream::PqStream; -use crate::{auth_backend, compute, waiters}; -use console::ConsoleAuthError::SniMissing; +pub mod backend; +pub use backend::DatabaseInfo; + +mod credentials; +pub use credentials::ClientCredentials; + +mod flow; +pub use flow::*; + +use crate::{error::UserFacingError, waiters}; use std::io; use thiserror::Error; -use tokio::io::{AsyncRead, AsyncWrite}; -pub use credentials::ClientCredentials; -pub use flow::*; +/// Convenience wrapper for the authentication error. +pub type Result = std::result::Result; /// Common authentication error. #[derive(Debug, Error)] pub enum AuthErrorImpl { /// Authentication error reported by the console. #[error(transparent)] - Console(#[from] auth_backend::AuthError), + Console(#[from] backend::AuthError), #[error(transparent)] - GetAuthInfo(#[from] auth_backend::console::ConsoleAuthError), + GetAuthInfo(#[from] backend::console::ConsoleAuthError), #[error(transparent)] Sasl(#[from] crate::sasl::Error), - /// For passwords that couldn't be processed by [`parse_password`]. + /// For passwords that couldn't be processed by [`backend::legacy_console::parse_password`]. #[error("Malformed password message")] MalformedPassword, - /// Errors produced by [`PqStream`]. + /// Errors produced by [`crate::stream::PqStream`]. 
#[error(transparent)] Io(#[from] io::Error), } impl AuthErrorImpl { pub fn auth_failed(msg: impl Into) -> Self { - AuthErrorImpl::Console(auth_backend::AuthError::auth_failed(msg)) + Self::Console(backend::AuthError::auth_failed(msg)) } } impl From for AuthErrorImpl { fn from(e: waiters::RegisterError) -> Self { - AuthErrorImpl::Console(auth_backend::AuthError::from(e)) + Self::Console(backend::AuthError::from(e)) } } impl From for AuthErrorImpl { fn from(e: waiters::WaitError) -> Self { - AuthErrorImpl::Console(auth_backend::AuthError::from(e)) + Self::Console(backend::AuthError::from(e)) } } @@ -63,7 +65,7 @@ where AuthErrorImpl: From, { fn from(e: T) -> Self { - AuthError(Box::new(e.into())) + Self(Box::new(e.into())) } } @@ -72,34 +74,9 @@ impl UserFacingError for AuthError { use AuthErrorImpl::*; match self.0.as_ref() { Console(e) => e.to_string_client(), + GetAuthInfo(e) => e.to_string_client(), MalformedPassword => self.to_string(), - GetAuthInfo(e) if matches!(e, SniMissing) => e.to_string(), _ => "Internal error".to_string(), } } } - -async fn handle_user( - config: &ProxyConfig, - client: &mut PqStream, - creds: ClientCredentials, -) -> Result { - match config.auth_backend { - AuthBackendType::LegacyConsole => { - legacy_console::handle_user( - &config.auth_endpoint, - &config.auth_link_uri, - client, - &creds, - ) - .await - } - AuthBackendType::Console => { - console::handle_user(config.auth_endpoint.as_ref(), client, &creds).await - } - AuthBackendType::Postgres => { - postgres::handle_user(&config.auth_endpoint, client, &creds).await - } - AuthBackendType::Link => link::handle_user(config.auth_link_uri.as_ref(), client).await, - } -} diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs new file mode 100644 index 0000000000..1d41f7f932 --- /dev/null +++ b/proxy/src/auth/backend.rs @@ -0,0 +1,109 @@ +mod legacy_console; +mod link; +mod postgres; + +pub mod console; + +pub use legacy_console::{AuthError, AuthErrorImpl}; + +use super::ClientCredentials; +use crate::{ + compute, + config::{AuthBackendType, ProxyConfig}, + mgmt, + stream::PqStream, + waiters::{self, Waiter, Waiters}, +}; +use lazy_static::lazy_static; +use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncRead, AsyncWrite}; + +lazy_static! { + static ref CPLANE_WAITERS: Waiters = Default::default(); +} + +/// Give caller an opportunity to wait for the cloud's reply. +pub async fn with_waiter( + psql_session_id: impl Into, + action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, +) -> Result +where + R: std::future::Future>, + E: From, +{ + let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; + action(waiter).await +} + +pub fn notify(psql_session_id: &str, msg: mgmt::ComputeReady) -> Result<(), waiters::NotifyError> { + CPLANE_WAITERS.notify(psql_session_id, msg) +} + +/// Compute node connection params provided by the cloud. +/// Note how it implements serde traits, since we receive it over the wire. +#[derive(Serialize, Deserialize, Default)] +pub struct DatabaseInfo { + pub host: String, + pub port: u16, + pub dbname: String, + pub user: String, + pub password: Option, +} + +// Manually implement debug to omit personal and sensitive info. 
+impl std::fmt::Debug for DatabaseInfo { + fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + fmt.debug_struct("DatabaseInfo") + .field("host", &self.host) + .field("port", &self.port) + .finish() + } +} + +impl From for tokio_postgres::Config { + fn from(db_info: DatabaseInfo) -> Self { + let mut config = tokio_postgres::Config::new(); + + config + .host(&db_info.host) + .port(db_info.port) + .dbname(&db_info.dbname) + .user(&db_info.user); + + if let Some(password) = db_info.password { + config.password(password); + } + + config + } +} + +pub(super) async fn handle_user( + config: &ProxyConfig, + client: &mut PqStream, + creds: ClientCredentials, +) -> super::Result { + use AuthBackendType::*; + match config.auth_backend { + LegacyConsole => { + legacy_console::handle_user( + &config.auth_endpoint, + &config.auth_link_uri, + client, + &creds, + ) + .await + } + Console => { + console::Api::new(&config.auth_endpoint, &creds)? + .handle_user(client) + .await + } + Postgres => { + postgres::Api::new(&config.auth_endpoint, &creds)? + .handle_user(client) + .await + } + Link => link::handle_user(&config.auth_link_uri, client).await, + } +} diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs new file mode 100644 index 0000000000..252522affb --- /dev/null +++ b/proxy/src/auth/backend/console.rs @@ -0,0 +1,225 @@ +//! Cloud API V2. + +use crate::{ + auth::{self, AuthFlow, ClientCredentials, DatabaseInfo}, + compute, + error::UserFacingError, + scram, + stream::PqStream, + url::ApiUrl, +}; +use serde::{Deserialize, Serialize}; +use std::{future::Future, io}; +use thiserror::Error; +use tokio::io::{AsyncRead, AsyncWrite}; +use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; + +pub type Result = std::result::Result; + +#[derive(Debug, Error)] +pub enum ConsoleAuthError { + #[error(transparent)] + BadProjectName(#[from] auth::credentials::ProjectNameError), + + // We shouldn't include the actual secret here. + #[error("Bad authentication secret")] + BadSecret, + + #[error("Console responded with a malformed compute address: '{0}'")] + BadComputeAddress(String), + + #[error("Console responded with a malformed JSON: '{0}'")] + BadResponse(#[from] serde_json::Error), + + /// HTTP status (other than 200) returned by the console. + #[error("Console responded with an HTTP status: {0}")] + HttpStatus(reqwest::StatusCode), + + #[error(transparent)] + Io(#[from] std::io::Error), +} + +impl UserFacingError for ConsoleAuthError { + fn to_string_client(&self) -> String { + use ConsoleAuthError::*; + match self { + BadProjectName(e) => e.to_string_client(), + _ => "Internal error".to_string(), + } + } +} + +// TODO: convert into an enum with "error" +#[derive(Serialize, Deserialize, Debug)] +struct GetRoleSecretResponse { + role_secret: String, +} + +// TODO: convert into an enum with "error" +#[derive(Serialize, Deserialize, Debug)] +struct GetWakeComputeResponse { + address: String, +} + +/// Auth secret which is managed by the cloud. +pub enum AuthInfo { + /// Md5 hash of user's password. + Md5([u8; 16]), + + /// [SCRAM](crate::scram) authentication info. + Scram(scram::ServerSecret), +} + +#[must_use] +pub(super) struct Api<'a> { + endpoint: &'a ApiUrl, + creds: &'a ClientCredentials, + /// Cache project name, since we'll need it several times. + project: &'a str, +} + +impl<'a> Api<'a> { + /// Construct an API object containing the auth parameters. 
+ pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result { + Ok(Self { + endpoint, + creds, + project: creds.project_name()?, + }) + } + + /// Authenticate the existing user or throw an error. + pub(super) async fn handle_user( + self, + client: &mut PqStream, + ) -> auth::Result { + handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await + } + + async fn get_auth_info(&self) -> Result { + let mut url = self.endpoint.clone(); + url.path_segments_mut().push("proxy_get_role_secret"); + url.query_pairs_mut() + .append_pair("project", self.project) + .append_pair("role", &self.creds.user); + + // TODO: use a proper logger + println!("cplane request: {url}"); + + let resp = reqwest::get(url.into_inner()).await.map_err(io_error)?; + if !resp.status().is_success() { + return Err(ConsoleAuthError::HttpStatus(resp.status())); + } + + let response: GetRoleSecretResponse = + serde_json::from_str(&resp.text().await.map_err(io_error)?)?; + + scram::ServerSecret::parse(response.role_secret.as_str()) + .map(AuthInfo::Scram) + .ok_or(ConsoleAuthError::BadSecret) + } + + /// Wake up the compute node and return the corresponding connection info. + async fn wake_compute(&self) -> Result { + let mut url = self.endpoint.clone(); + url.path_segments_mut().push("proxy_wake_compute"); + url.query_pairs_mut().append_pair("project", self.project); + + // TODO: use a proper logger + println!("cplane request: {url}"); + + let resp = reqwest::get(url.into_inner()).await.map_err(io_error)?; + if !resp.status().is_success() { + return Err(ConsoleAuthError::HttpStatus(resp.status())); + } + + let response: GetWakeComputeResponse = + serde_json::from_str(&resp.text().await.map_err(io_error)?)?; + + let (host, port) = parse_host_port(&response.address) + .ok_or(ConsoleAuthError::BadComputeAddress(response.address))?; + + Ok(DatabaseInfo { + host, + port, + dbname: self.creds.dbname.to_owned(), + user: self.creds.user.to_owned(), + password: None, + }) + } +} + +/// Common logic for user handling in API V2. +/// We reuse this for a mock API implementation in [`super::postgres`]. +pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>( + client: &mut PqStream, + endpoint: &'a Endpoint, + get_auth_info: impl FnOnce(&'a Endpoint) -> GetAuthInfo, + wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute, +) -> auth::Result +where + GetAuthInfo: Future>, + WakeCompute: Future>, +{ + let auth_info = get_auth_info(endpoint).await?; + + let flow = AuthFlow::new(client); + let scram_keys = match auth_info { + AuthInfo::Md5(_) => { + // TODO: decide if we should support MD5 in api v2 + return Err(auth::AuthErrorImpl::auth_failed("MD5 is not supported").into()); + } + AuthInfo::Scram(secret) => { + let scram = auth::Scram(&secret); + Some(compute::ScramKeys { + client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), + server_key: secret.server_key.as_bytes(), + }) + } + }; + + client + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())?; + + Ok(compute::NodeInfo { + db_info: wake_compute(endpoint).await?, + scram_keys, + }) +} + +/// Upcast (almost) any error into an opaque [`io::Error`]. 
+pub(super) fn io_error(e: impl Into>) -> io::Error { + io::Error::new(io::ErrorKind::Other, e) +} + +fn parse_host_port(input: &str) -> Option<(String, u16)> { + let (host, port) = input.split_once(':')?; + Some((host.to_owned(), port.parse().ok()?)) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn parse_db_info() -> anyhow::Result<()> { + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + }))?; + + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + }))?; + + Ok(()) + } +} diff --git a/proxy/src/auth_backend/legacy_console.rs b/proxy/src/auth/backend/legacy_console.rs similarity index 90% rename from proxy/src/auth_backend/legacy_console.rs rename to proxy/src/auth/backend/legacy_console.rs index 29997d2389..467da63a98 100644 --- a/proxy/src/auth_backend/legacy_console.rs +++ b/proxy/src/auth/backend/legacy_console.rs @@ -1,20 +1,18 @@ //! Cloud API V1. -use super::console::DatabaseInfo; - -use crate::auth::ClientCredentials; -use crate::stream::PqStream; - -use crate::{compute, waiters}; +use super::DatabaseInfo; +use crate::{ + auth::{self, ClientCredentials}, + compute, + error::UserFacingError, + stream::PqStream, + waiters, +}; use serde::{Deserialize, Serialize}; - +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; -use thiserror::Error; - -use crate::error::UserFacingError; - #[derive(Debug, Error)] pub enum AuthErrorImpl { /// Authentication error reported by the console. @@ -45,7 +43,7 @@ pub struct AuthError(Box); impl AuthError { /// Smart constructor for authentication error reported by `mgmt`. 
pub fn auth_failed(msg: impl Into) -> Self { - AuthError(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) + Self(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) } } @@ -54,7 +52,7 @@ where AuthErrorImpl: From, { fn from(e: T) -> Self { - AuthError(Box::new(e.into())) + Self(Box::new(e.into())) } } @@ -120,7 +118,7 @@ async fn handle_existing_user( auth_endpoint: &reqwest::Url, client: &mut PqStream, creds: &ClientCredentials, -) -> Result { +) -> Result { let psql_session_id = super::link::new_psql_session_id(); let md5_salt = rand::random(); @@ -130,7 +128,7 @@ async fn handle_existing_user( // Read client's password hash let msg = client.read_password_message().await?; - let md5_response = parse_password(&msg).ok_or(crate::auth::AuthErrorImpl::MalformedPassword)?; + let md5_response = parse_password(&msg).ok_or(auth::AuthErrorImpl::MalformedPassword)?; let db_info = authenticate_proxy_client( auth_endpoint, @@ -156,11 +154,11 @@ pub async fn handle_user( auth_link_uri: &reqwest::Url, client: &mut PqStream, creds: &ClientCredentials, -) -> Result { +) -> auth::Result { if creds.is_existing_user() { handle_existing_user(auth_endpoint, client, creds).await } else { - super::link::handle_user(auth_link_uri.as_ref(), client).await + super::link::handle_user(auth_link_uri, client).await } } diff --git a/proxy/src/auth_backend/link.rs b/proxy/src/auth/backend/link.rs similarity index 75% rename from proxy/src/auth_backend/link.rs rename to proxy/src/auth/backend/link.rs index 8e5fcb32a9..669c9e00e9 100644 --- a/proxy/src/auth_backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -1,4 +1,4 @@ -use crate::{compute, stream::PqStream}; +use crate::{auth, compute, stream::PqStream}; use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; @@ -19,13 +19,13 @@ pub fn new_psql_session_id() -> String { } pub async fn handle_user( - redirect_uri: &str, + redirect_uri: &reqwest::Url, client: &mut PqStream, -) -> Result { +) -> auth::Result { let psql_session_id = new_psql_session_id(); - let greeting = hello_message(redirect_uri, &psql_session_id); + let greeting = hello_message(redirect_uri.as_str(), &psql_session_id); - let db_info = crate::auth_backend::with_waiter(psql_session_id, |waiter| async { + let db_info = super::with_waiter(psql_session_id, |waiter| async { // Give user a URL to spawn a new database client .write_message_noflush(&Be::AuthenticationOk)? @@ -34,9 +34,7 @@ pub async fn handle_user( .await?; // Wait for web console response (see `mgmt`) - waiter - .await? - .map_err(crate::auth::AuthErrorImpl::auth_failed) + waiter.await?.map_err(auth::AuthErrorImpl::auth_failed) }) .await?; diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs new file mode 100644 index 0000000000..721b9db095 --- /dev/null +++ b/proxy/src/auth/backend/postgres.rs @@ -0,0 +1,88 @@ +//! Local mock of Cloud API V2. + +use crate::{ + auth::{ + self, + backend::console::{self, io_error, AuthInfo, Result}, + ClientCredentials, DatabaseInfo, + }, + compute, scram, + stream::PqStream, + url::ApiUrl, +}; +use tokio::io::{AsyncRead, AsyncWrite}; + +#[must_use] +pub(super) struct Api<'a> { + endpoint: &'a ApiUrl, + creds: &'a ClientCredentials, +} + +impl<'a> Api<'a> { + /// Construct an API object containing the auth parameters. + pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result { + Ok(Self { endpoint, creds }) + } + + /// Authenticate the existing user or throw an error. 
+ pub(super) async fn handle_user( + self, + client: &mut PqStream, + ) -> auth::Result { + // We reuse user handling logic from a production module. + console::handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await + } + + /// This implementation fetches the auth info from a local postgres instance. + async fn get_auth_info(&self) -> Result { + // Perhaps we could persist this connection, but then we'd have to + // write more code for reopening it if it got closed, which doesn't + // seem worth it. + let (client, connection) = + tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls) + .await + .map_err(io_error)?; + + tokio::spawn(connection); + let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; + let rows = client + .query(query, &[&self.creds.user]) + .await + .map_err(io_error)?; + + match &rows[..] { + // We can't get a secret if there's no such user. + [] => Err(io_error(format!("unknown user '{}'", self.creds.user)).into()), + + // We shouldn't get more than one row anyway. + [row, ..] => { + let entry = row.try_get(0).map_err(io_error)?; + scram::ServerSecret::parse(entry) + .map(AuthInfo::Scram) + .or_else(|| { + // It could be an md5 hash if it's not a SCRAM secret. + let text = entry.strip_prefix("md5")?; + Some(AuthInfo::Md5({ + let mut bytes = [0u8; 16]; + hex::decode_to_slice(text, &mut bytes).ok()?; + bytes + })) + }) + // Putting the secret into this message is a security hazard! + .ok_or(console::ConsoleAuthError::BadSecret) + } + } + } + + /// We don't need to wake anything locally, so we just return the connection info. + async fn wake_compute(&self) -> Result { + Ok(DatabaseInfo { + // TODO: handle that near CLI params parsing + host: self.endpoint.host_str().unwrap_or("localhost").to_owned(), + port: self.endpoint.port().unwrap_or(5432), + dbname: self.creds.dbname.to_owned(), + user: self.creds.user.to_owned(), + password: None, + }) + } +} diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 9d2272b5ad..467e7db282 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,6 +1,5 @@ //! User credentials used in authentication. -use super::AuthError; use crate::compute; use crate::config::ProxyConfig; use crate::error::UserFacingError; @@ -36,6 +35,27 @@ impl ClientCredentials { } } +#[derive(Debug, Error)] +pub enum ProjectNameError { + #[error("SNI is missing, please upgrade the postgres client library")] + Missing, + + #[error("SNI is malformed")] + Bad, +} + +impl UserFacingError for ProjectNameError {} + +impl ClientCredentials { + /// Determine project name from SNI. 
+ pub fn project_name(&self) -> Result<&str, ProjectNameError> { + // Currently project name is passed as a top level domain + let sni = self.sni_data.as_ref().ok_or(ProjectNameError::Missing)?; + let (first, _) = sni.split_once('.').ok_or(ProjectNameError::Bad)?; + Ok(first) + } +} + impl TryFrom> for ClientCredentials { type Error = ClientCredsParseError; @@ -47,11 +67,11 @@ impl TryFrom> for ClientCredentials { }; let user = get_param("user")?; - let db = get_param("database")?; + let dbname = get_param("database")?; Ok(Self { user, - dbname: db, + dbname, sni_data: None, }) } @@ -63,8 +83,8 @@ impl ClientCredentials { self, config: &ProxyConfig, client: &mut PqStream, - ) -> Result { + ) -> super::Result { // This method is just a convenient facade for `handle_user` - super::handle_user(config, client, self).await + super::backend::handle_user(config, client, self).await } } diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 3eed0f0a23..7efff13bfc 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -1,6 +1,6 @@ //! Main authentication flow. -use super::{AuthError, AuthErrorImpl}; +use super::AuthErrorImpl; use crate::stream::PqStream; use crate::{sasl, scram}; use std::io; @@ -32,7 +32,7 @@ impl AuthMethod for Scram<'_> { pub struct AuthFlow<'a, Stream, State> { /// The underlying stream which implements libpq's protocol. stream: &'a mut PqStream, - /// State might contain ancillary data (see [`AuthFlow::begin`]). + /// State might contain ancillary data (see [`Self::begin`]). state: State, } @@ -60,7 +60,7 @@ impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn authenticate(self) -> Result { + pub async fn authenticate(self) -> super::Result { // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; diff --git a/proxy/src/auth_backend.rs b/proxy/src/auth_backend.rs deleted file mode 100644 index 54362bf719..0000000000 --- a/proxy/src/auth_backend.rs +++ /dev/null @@ -1,31 +0,0 @@ -pub mod console; -pub mod legacy_console; -pub mod link; -pub mod postgres; - -pub use legacy_console::{AuthError, AuthErrorImpl}; - -use crate::mgmt; -use crate::waiters::{self, Waiter, Waiters}; -use lazy_static::lazy_static; - -lazy_static! { - static ref CPLANE_WAITERS: Waiters = Default::default(); -} - -/// Give caller an opportunity to wait for the cloud's reply. -pub async fn with_waiter( - psql_session_id: impl Into, - action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, -) -> Result -where - R: std::future::Future>, - E: From, -{ - let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; - action(waiter).await -} - -pub fn notify(psql_session_id: &str, msg: mgmt::ComputeReady) -> Result<(), waiters::NotifyError> { - CPLANE_WAITERS.notify(psql_session_id, msg) -} diff --git a/proxy/src/auth_backend/console.rs b/proxy/src/auth_backend/console.rs deleted file mode 100644 index 41a822701f..0000000000 --- a/proxy/src/auth_backend/console.rs +++ /dev/null @@ -1,243 +0,0 @@ -//! Declaration of Cloud API V2. 
- -use crate::{ - auth::{self, AuthFlow}, - compute, scram, -}; -use serde::{Deserialize, Serialize}; -use thiserror::Error; - -use crate::auth::ClientCredentials; -use crate::stream::PqStream; - -use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; - -#[derive(Debug, Error)] -pub enum ConsoleAuthError { - // We shouldn't include the actual secret here. - #[error("Bad authentication secret")] - BadSecret, - - #[error("Bad client credentials: {0:?}")] - BadCredentials(crate::auth::ClientCredentials), - - #[error("SNI info is missing, please upgrade the postgres client library")] - SniMissing, - - #[error("Unexpected SNI content")] - SniWrong, - - #[error(transparent)] - BadUrl(#[from] url::ParseError), - - #[error(transparent)] - Io(#[from] std::io::Error), - - /// HTTP status (other than 200) returned by the console. - #[error("Console responded with an HTTP status: {0}")] - HttpStatus(reqwest::StatusCode), - - #[error(transparent)] - Transport(#[from] reqwest::Error), - - #[error("Console responded with a malformed JSON: '{0}'")] - MalformedResponse(#[from] serde_json::Error), - - #[error("Console responded with a malformed compute address: '{0}'")] - MalformedComputeAddress(String), -} - -#[derive(Serialize, Deserialize, Debug)] -struct GetRoleSecretResponse { - role_secret: String, -} - -#[derive(Serialize, Deserialize, Debug)] -struct GetWakeComputeResponse { - address: String, -} - -/// Auth secret which is managed by the cloud. -pub enum AuthInfo { - /// Md5 hash of user's password. - Md5([u8; 16]), - /// [SCRAM](crate::scram) authentication info. - Scram(scram::ServerSecret), -} - -/// Compute node connection params provided by the cloud. -/// Note how it implements serde traits, since we receive it over the wire. -#[derive(Serialize, Deserialize, Default)] -pub struct DatabaseInfo { - pub host: String, - pub port: u16, - pub dbname: String, - pub user: String, - - /// [Cloud API V1](super::legacy) returns cleartext password, - /// but [Cloud API V2](super::api) implements [SCRAM](crate::scram) - /// authentication, so we can leverage this method and cope without password. - pub password: Option, -} - -// Manually implement debug to omit personal and sensitive info. 
-impl std::fmt::Debug for DatabaseInfo { - fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { - fmt.debug_struct("DatabaseInfo") - .field("host", &self.host) - .field("port", &self.port) - .finish() - } -} - -impl From for tokio_postgres::Config { - fn from(db_info: DatabaseInfo) -> Self { - let mut config = tokio_postgres::Config::new(); - - config - .host(&db_info.host) - .port(db_info.port) - .dbname(&db_info.dbname) - .user(&db_info.user); - - if let Some(password) = db_info.password { - config.password(password); - } - - config - } -} - -async fn get_auth_info( - auth_endpoint: &str, - user: &str, - cluster: &str, -) -> Result { - let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_get_role_secret"))?; - - url.query_pairs_mut() - .append_pair("project", cluster) - .append_pair("role", user); - - // TODO: use a proper logger - println!("cplane request: {}", url); - - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - return Err(ConsoleAuthError::HttpStatus(resp.status())); - } - - let response: GetRoleSecretResponse = serde_json::from_str(resp.text().await?.as_str())?; - - scram::ServerSecret::parse(response.role_secret.as_str()) - .map(AuthInfo::Scram) - .ok_or(ConsoleAuthError::BadSecret) -} - -/// Wake up the compute node and return the corresponding connection info. -async fn wake_compute( - auth_endpoint: &str, - cluster: &str, -) -> Result<(String, u16), ConsoleAuthError> { - let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_wake_compute"))?; - url.query_pairs_mut().append_pair("project", cluster); - - // TODO: use a proper logger - println!("cplane request: {}", url); - - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - return Err(ConsoleAuthError::HttpStatus(resp.status())); - } - - let response: GetWakeComputeResponse = serde_json::from_str(resp.text().await?.as_str())?; - let (host, port) = response - .address - .split_once(':') - .ok_or_else(|| ConsoleAuthError::MalformedComputeAddress(response.address.clone()))?; - let port: u16 = port - .parse() - .map_err(|_| ConsoleAuthError::MalformedComputeAddress(response.address.clone()))?; - - Ok((host.to_string(), port)) -} - -pub async fn handle_user( - auth_endpoint: &str, - client: &mut PqStream, - creds: &ClientCredentials, -) -> Result { - // Determine cluster name from SNI. - let cluster = creds - .sni_data - .as_ref() - .ok_or(ConsoleAuthError::SniMissing)? - .split_once('.') - .ok_or(ConsoleAuthError::SniWrong)? - .0; - - let user = creds.user.as_str(); - - // Step 1: get the auth secret - let auth_info = get_auth_info(auth_endpoint, user, cluster).await?; - - let flow = AuthFlow::new(client); - let scram_keys = match auth_info { - AuthInfo::Md5(_) => { - // TODO: decide if we should support MD5 in api v2 - return Err(crate::auth::AuthErrorImpl::auth_failed("MD5 is not supported").into()); - } - AuthInfo::Scram(secret) => { - let scram = auth::Scram(&secret); - Some(compute::ScramKeys { - client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), - server_key: secret.server_key.as_bytes(), - }) - } - }; - - client - .write_message_noflush(&Be::AuthenticationOk)? 
- .write_message_noflush(&BeParameterStatusMessage::encoding())?; - - // Step 2: wake compute - let (host, port) = wake_compute(auth_endpoint, cluster).await?; - - Ok(compute::NodeInfo { - db_info: DatabaseInfo { - host, - port, - dbname: creds.dbname.clone(), - user: creds.user.clone(), - password: None, - }, - scram_keys, - }) -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn parse_db_info() -> anyhow::Result<()> { - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "password": "password", - }))?; - - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - }))?; - - Ok(()) - } -} diff --git a/proxy/src/auth_backend/postgres.rs b/proxy/src/auth_backend/postgres.rs deleted file mode 100644 index 148c2a2518..0000000000 --- a/proxy/src/auth_backend/postgres.rs +++ /dev/null @@ -1,93 +0,0 @@ -//! Local mock of Cloud API V2. - -use super::console::{self, AuthInfo, DatabaseInfo}; -use crate::scram; -use crate::{auth::ClientCredentials, compute}; - -use crate::stream::PqStream; -use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; - -async fn get_auth_info( - auth_endpoint: &str, - creds: &ClientCredentials, -) -> Result { - // We wrap `tokio_postgres::Error` because we don't want to infect the - // method's error type with a detail that's specific to debug mode only. - let io_error = |e| std::io::Error::new(std::io::ErrorKind::Other, e); - - // Perhaps we could persist this connection, but then we'd have to - // write more code for reopening it if it got closed, which doesn't - // seem worth it. - let (client, connection) = tokio_postgres::connect(auth_endpoint, tokio_postgres::NoTls) - .await - .map_err(io_error)?; - - tokio::spawn(connection); - let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; - let rows = client - .query(query, &[&creds.user]) - .await - .map_err(io_error)?; - - match &rows[..] { - // We can't get a secret if there's no such user. - [] => Err(console::ConsoleAuthError::BadCredentials(creds.to_owned())), - // We shouldn't get more than one row anyway. - [row, ..] => { - let entry = row.try_get(0).map_err(io_error)?; - scram::ServerSecret::parse(entry) - .map(AuthInfo::Scram) - .or_else(|| { - // It could be an md5 hash if it's not a SCRAM secret. - let text = entry.strip_prefix("md5")?; - Some(AuthInfo::Md5({ - let mut bytes = [0u8; 16]; - hex::decode_to_slice(text, &mut bytes).ok()?; - bytes - })) - }) - // Putting the secret into this message is a security hazard! - .ok_or(console::ConsoleAuthError::BadSecret) - } - } -} - -pub async fn handle_user( - auth_endpoint: &reqwest::Url, - client: &mut PqStream, - creds: &ClientCredentials, -) -> Result { - let auth_info = get_auth_info(auth_endpoint.as_ref(), creds).await?; - - let flow = crate::auth::AuthFlow::new(client); - let scram_keys = match auth_info { - AuthInfo::Md5(_) => { - // TODO: decide if we should support MD5 in api v2 - return Err(crate::auth::AuthErrorImpl::auth_failed("MD5 is not supported").into()); - } - AuthInfo::Scram(secret) => { - let scram = crate::auth::Scram(&secret); - Some(compute::ScramKeys { - client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), - server_key: secret.server_key.as_bytes(), - }) - } - }; - - client - .write_message_noflush(&Be::AuthenticationOk)? 
- .write_message_noflush(&BeParameterStatusMessage::encoding())?; - - Ok(compute::NodeInfo { - db_info: DatabaseInfo { - // TODO: handle that near CLI params parsing - host: auth_endpoint.host_str().unwrap_or("localhost").to_owned(), - port: auth_endpoint.port().unwrap_or(5432), - dbname: creds.dbname.to_owned(), - user: creds.user.to_owned(), - password: None, - }, - scram_keys, - }) -} diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index c3c5ba47fb..cccd6e60d4 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,4 +1,4 @@ -use crate::auth_backend::console::DatabaseInfo; +use crate::auth::DatabaseInfo; use crate::cancellation::CancelClosure; use crate::error::UserFacingError; use std::io; @@ -37,7 +37,7 @@ pub struct NodeInfo { impl NodeInfo { async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { - let host_port = format!("{}:{}", self.db_info.host, self.db_info.port); + let host_port = (self.db_info.host.as_str(), self.db_info.port); let socket = TcpStream::connect(host_port).await?; let socket_addr = socket.peer_addr()?; socket2::SockRef::from(&socket).set_keepalive(true)?; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 6f1b56bfe4..a5cd17eb55 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,39 +1,38 @@ -use anyhow::{ensure, Context}; +use crate::url::ApiUrl; +use anyhow::{bail, ensure, Context}; use std::{str::FromStr, sync::Arc}; -#[non_exhaustive] pub enum AuthBackendType { + /// Legacy Cloud API (V1). LegacyConsole, - Console, - Postgres, + /// Authentication via a web browser. Link, + /// Current Cloud API (V2). + Console, + /// Local mock of Cloud API (V2). + Postgres, } impl FromStr for AuthBackendType { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { - println!("ClientAuthMethod::from_str: '{}'", s); use AuthBackendType::*; - match s { - "legacy" => Ok(LegacyConsole), - "console" => Ok(Console), - "postgres" => Ok(Postgres), - "link" => Ok(Link), - _ => Err(anyhow::anyhow!("Invlid option for auth method")), - } + Ok(match s { + "legacy" => LegacyConsole, + "console" => Console, + "postgres" => Postgres, + "link" => Link, + _ => bail!("Invalid option `{s}` for auth method"), + }) } } pub struct ProxyConfig { - /// TLS configuration for the proxy. pub tls_config: Option, - pub auth_backend: AuthBackendType, - - pub auth_endpoint: reqwest::Url, - - pub auth_link_uri: reqwest::Url, + pub auth_endpoint: ApiUrl, + pub auth_link_uri: ApiUrl, } pub type TlsConfig = Arc; diff --git a/proxy/src/main.rs b/proxy/src/main.rs index b457d46824..672f24b6fb 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -5,7 +5,6 @@ //! in somewhat transparent manner (again via communication with control plane API). mod auth; -mod auth_backend; mod cancellation; mod compute; mod config; @@ -17,6 +16,7 @@ mod proxy; mod sasl; mod scram; mod stream; +mod url; mod waiters; use anyhow::{bail, Context}; diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 93618fff68..8737d170b1 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -1,4 +1,4 @@ -use crate::auth_backend; +use crate::auth; use anyhow::Context; use serde::Deserialize; use std::{ @@ -77,12 +77,12 @@ struct PsqlSessionResponse { #[derive(Deserialize)] enum PsqlSessionResult { - Success(auth_backend::console::DatabaseInfo), + Success(auth::DatabaseInfo), Failure(String), } /// A message received by `mgmt` when a compute node is ready. 
-pub type ComputeReady = Result; +pub type ComputeReady = Result; impl PsqlSessionResult { fn into_compute_ready(self) -> ComputeReady { @@ -113,7 +113,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::R let resp: PsqlSessionResponse = serde_json::from_str(query_string)?; - match auth_backend::notify(&resp.session_id, resp.result.into_compute_ready()) { + match auth::backend::notify(&resp.session_id, resp.result.into_compute_ready()) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? diff --git a/proxy/src/url.rs b/proxy/src/url.rs new file mode 100644 index 0000000000..76d6ad0e66 --- /dev/null +++ b/proxy/src/url.rs @@ -0,0 +1,82 @@ +use anyhow::bail; +use url::form_urlencoded::Serializer; + +/// A [url](url::Url) type with additional guarantees. +#[derive(Debug, Clone)] +pub struct ApiUrl(url::Url); + +impl ApiUrl { + /// Consume the wrapper and return inner [url](url::Url). + pub fn into_inner(self) -> url::Url { + self.0 + } + + /// See [`url::Url::query_pairs_mut`]. + pub fn query_pairs_mut(&mut self) -> Serializer<'_, url::UrlQuery<'_>> { + self.0.query_pairs_mut() + } + + /// See [`url::Url::path_segments_mut`]. + pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut { + // We've already verified that it works during construction. + self.0.path_segments_mut().expect("bad API url") + } +} + +/// This instance imposes additional requirements on the url. +impl std::str::FromStr for ApiUrl { + type Err = anyhow::Error; + + fn from_str(s: &str) -> anyhow::Result { + let mut url: url::Url = s.parse()?; + + // Make sure that we can build upon this URL. + if url.path_segments_mut().is_err() { + bail!("bad API url provided"); + } + + Ok(Self(url)) + } +} + +/// This instance is safe because it doesn't allow us to modify the object. +impl std::ops::Deref for ApiUrl { + type Target = url::Url; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::fmt::Display for ApiUrl { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn bad_url() { + let url = "test:foobar"; + url.parse::().expect("unexpected parsing failure"); + let _ = url.parse::().expect_err("should not parse"); + } + + #[test] + fn good_url() { + let url = "test://foobar"; + let mut a = url.parse::().expect("unexpected parsing failure"); + let mut b = url.parse::().expect("unexpected parsing failure"); + + a.path_segments_mut().unwrap().push("method"); + a.query_pairs_mut().append_pair("key", "value"); + + b.path_segments_mut().push("method"); + b.query_pairs_mut().append_pair("key", "value"); + + assert_eq!(a, b.into_inner()); + } +} From b3ec6e0661e1f08beb1cd08b265cc64af0cd4035 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Thu, 26 May 2022 20:39:33 +0300 Subject: [PATCH 0364/1022] [proxy] Propagate SASL/SCRAM auth errors to the user This will replace the vague (and incorrect) "Internal error" with a nice and helpful authentication error, e.g. "password doesn't match". 
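For illustration, a minimal (simplified) sketch of how a SCRAM failure now
reaches the client; the real plumbing is in proxy/src/sasl.rs and
proxy/src/auth.rs below:

    // scram::exchange reports a descriptive reason...
    let err = sasl::Error::AuthenticationFailed("password doesn't match");
    // ...and UserFacingError::to_string_client now forwards that reason,
    // instead of collapsing every SASL error into "Internal error".
    assert_eq!(err.to_string_client(), "password doesn't match");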
--- proxy/src/auth.rs | 1 + proxy/src/config.rs | 1 + proxy/src/main.rs | 1 + proxy/src/sasl.rs | 15 +++++++++++++++ proxy/src/scram/exchange.rs | 6 ++++-- 5 files changed, 22 insertions(+), 2 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 082a7bcf20..9bddd58fce 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -75,6 +75,7 @@ impl UserFacingError for AuthError { match self.0.as_ref() { Console(e) => e.to_string_client(), GetAuthInfo(e) => e.to_string_client(), + Sasl(e) => e.to_string_client(), MalformedPassword => self.to_string(), _ => "Internal error".to_string(), } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index a5cd17eb55..4def11aefc 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -2,6 +2,7 @@ use crate::url::ApiUrl; use anyhow::{bail, ensure, Context}; use std::{str::FromStr, sync::Arc}; +#[derive(Debug)] pub enum AuthBackendType { /// Legacy Cloud API (V1). LegacyConsole, diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 672f24b6fb..b68b2440dd 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -126,6 +126,7 @@ async fn main() -> anyhow::Result<()> { })); println!("Version: {GIT_VERSION}"); + println!("Authentication backend: {:?}", config.auth_backend); // Check that we can bind to address before further initialization println!("Starting http on {}", http_address); diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index cd9032bfb9..689fca6049 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -10,6 +10,7 @@ mod channel_binding; mod messages; mod stream; +use crate::error::UserFacingError; use std::io; use thiserror::Error; @@ -36,6 +37,20 @@ pub enum Error { Io(#[from] io::Error), } +impl UserFacingError for Error { + fn to_string_client(&self) -> String { + use Error::*; + match self { + // This constructor contains the reason why auth has failed. + AuthenticationFailed(s) => s.to_string(), + // TODO: add support for channel binding + ChannelBindingFailed(_) => "channel binding is not supported yet".to_string(), + ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"), + _ => "authentication protocol violation".to_string(), + } + } +} + /// A convenient result type for SASL exchange. pub type Result = std::result::Result; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index cad77e15f5..fca5585b25 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -106,7 +106,9 @@ impl sasl::Mechanism for Exchange<'_> { } if client_final_message.nonce != server_first_message.nonce() { - return Err(SaslError::AuthenticationFailed("bad nonce")); + return Err(SaslError::AuthenticationFailed( + "combined nonce doesn't match", + )); } let signature_builder = SignatureBuilder { @@ -120,7 +122,7 @@ impl sasl::Mechanism for Exchange<'_> { .derive_client_key(&client_final_message.proof); if client_key.sha256() != self.secret.stored_key { - return Err(SaslError::AuthenticationFailed("keys don't match")); + return Err(SaslError::AuthenticationFailed("password doesn't match")); } let msg = client_final_message From 500e8772f058ccb1a7cccbbfc83c80d14aa26a1e Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Fri, 27 May 2022 17:48:11 -0400 Subject: [PATCH 0365/1022] Add quick-start guide in readme (#1816) --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 8e8bf1a9b2..97927317d8 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,11 @@ Neon is a serverless open source alternative to AWS Aurora Postgres. 
It separate The project used to be called "Zenith". Many of the commands and code comments still refer to "zenith", but we are in the process of renaming things. +## Quick start +[Join the waitlist](https://neon.tech/) for our free tier to receive your serverless postgres instance. Then connect to it with your preferred postgres client (psql, dbeaver, etc) or use the online SQL editor. + +Alternatively, compile and run the project [locally](#running-local-installation). + ## Architecture overview A Neon installation consists of compute nodes and Neon storage engine. From f1c51a12675587fc1c28412e7ee3c31212e01cd7 Mon Sep 17 00:00:00 2001 From: Kian-Meng Ang Date: Sat, 28 May 2022 13:27:30 +0800 Subject: [PATCH 0366/1022] Fix typos --- .circleci/ansible/get_binaries.sh | 4 ++-- .github/workflows/benchmarking.yml | 2 +- Dockerfile | 2 +- Dockerfile.alpine | 2 +- docs/glossary.md | 2 +- docs/multitenancy.md | 2 +- docs/rfcs/002-storage.md | 6 +++--- docs/rfcs/003-laptop-cli.md | 2 +- docs/rfcs/005-zenith_local.md | 2 +- docs/rfcs/006-laptop-cli-v2-CLI.md | 4 ++-- docs/rfcs/009-snapshot-first-storage-cli.md | 2 +- docs/rfcs/009-snapshot-first-storage-pitr.md | 2 +- docs/rfcs/010-storage_details.md | 2 +- docs/rfcs/013-term-history.md | 2 +- docs/rfcs/015-storage-messaging.md | 6 +++--- docs/rfcs/README.md | 2 +- docs/settings.md | 2 +- docs/sourcetree.md | 4 ++-- libs/postgres_ffi/src/waldecoder.rs | 2 +- libs/postgres_ffi/src/xlog_utils.rs | 2 +- libs/utils/src/bin_ser.rs | 4 ++-- libs/utils/src/lib.rs | 2 +- libs/utils/src/postgres_backend.rs | 2 +- libs/utils/src/pq_proto.rs | 8 ++++---- pageserver/src/config.rs | 2 +- pageserver/src/keyspace.rs | 2 +- pageserver/src/layered_repository.rs | 6 +++--- pageserver/src/layered_repository/disk_btree.rs | 2 +- pageserver/src/page_service.rs | 2 +- pageserver/src/pgdatadir_mapping.rs | 2 +- .../src/remote_storage/storage_sync/delete.rs | 2 +- pageserver/src/repository.rs | 10 +++++----- pageserver/src/storage_sync.rs | 16 ++++++++-------- pageserver/src/storage_sync/delete.rs | 2 +- pageserver/src/storage_sync/download.rs | 2 +- pageserver/src/storage_sync/index.rs | 4 ++-- pageserver/src/storage_sync/upload.rs | 2 +- pageserver/src/virtual_file.rs | 2 +- pageserver/src/walingest.rs | 2 +- pageserver/src/walredo.rs | 4 ++-- proxy/src/proxy.rs | 2 +- safekeeper/README.md | 2 +- safekeeper/README_PROTO.md | 4 ++-- safekeeper/spec/ProposerAcceptorConsensus.tla | 2 +- safekeeper/src/bin/safekeeper.rs | 2 +- safekeeper/src/callmemaybe.rs | 2 +- safekeeper/src/control_file_upgrade.rs | 8 ++++---- safekeeper/src/safekeeper.rs | 4 ++-- safekeeper/src/wal_backup.rs | 4 ++-- safekeeper/src/wal_storage.rs | 4 ++-- test_runner/batch_others/test_clog_truncate.py | 2 +- test_runner/batch_others/test_pitr_gc.py | 2 +- test_runner/batch_others/test_remote_storage.py | 2 +- .../batch_others/test_tenant_relocation.py | 2 +- test_runner/batch_others/test_vm_bits.py | 2 +- test_runner/batch_others/test_wal_acceptor.py | 4 ++-- test_runner/fixtures/benchmark_fixture.py | 4 ++-- test_runner/fixtures/zenith_fixtures.py | 8 ++++---- 58 files changed, 96 insertions(+), 96 deletions(-) diff --git a/.circleci/ansible/get_binaries.sh b/.circleci/ansible/get_binaries.sh index c613213a75..c9cbe91f34 100755 --- a/.circleci/ansible/get_binaries.sh +++ b/.circleci/ansible/get_binaries.sh @@ -6,7 +6,7 @@ RELEASE=${RELEASE:-false} # look at docker hub for latest tag for neon docker image if [ "${RELEASE}" = "true" ]; then - echo "search latest relase tag" + echo "search latest release 
tag" VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1) if [ -z "${VERSION}" ]; then echo "no any docker tags found, exiting..." @@ -31,7 +31,7 @@ echo "found ${VERSION}" rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version mkdir neon_install -# retrive binaries from docker image +# retrieve binaries from docker image echo "getting binaries from docker image" docker pull --quiet neondatabase/neon:${TAG} ID=$(docker create neondatabase/neon:${TAG}) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 72041c9d02..adb53c0009 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -19,7 +19,7 @@ jobs: bench: # this workflow runs on self hosteed runner # it's environment is quite different from usual guthub runner - # probably the most important difference is that it doesnt start from clean workspace each time + # probably the most important difference is that it doesn't start from clean workspace each time # e g if you install system packages they are not cleaned up since you install them directly in host machine # not a container or something # See documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners diff --git a/Dockerfile b/Dockerfile index a7afd1f335..62e0de7e15 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,7 +25,7 @@ COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/inclu COPY . . # Show build caching stats to check if it was used in the end. -# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, loosing the compilation stats. +# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ && sudo -E "PATH=$PATH" mold -run cargo build --release \ && cachepot -s diff --git a/Dockerfile.alpine b/Dockerfile.alpine index dafb7eaf6b..0f244e4443 100644 --- a/Dockerfile.alpine +++ b/Dockerfile.alpine @@ -4,7 +4,7 @@ # We may also reuse it in CI to unify installation process and as a general binaries building # tool for production servers. # -# Dynamic linking is used for librocksdb and libstdc++ bacause librocksdb-sys calls +# Dynamic linking is used for librocksdb and libstdc++ because librocksdb-sys calls # bindgen with "dynamic" feature flag. This also prevents usage of dockerhub alpine-rust # images which are statically linked and have guards against any dlopen. I would rather # prefer all static binaries so we may change the way librocksdb-sys builds or wait until diff --git a/docs/glossary.md b/docs/glossary.md index a014446010..0de0eea1cb 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -115,7 +115,7 @@ Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/RE * `CommitLSN`: position in WAL confirmed by quorum safekeepers. * `RestartLSN`: position in WAL confirmed by all safekeepers. * `FlushLSN`: part of WAL persisted to the disk by safekeeper. -* `VCL`: the largerst LSN for which we can guarantee availablity of all prior records. +* `VCL`: the largerst LSN for which we can guarantee availability of all prior records. Neon pageserver LSNs: * `last_record_lsn` - the end of last processed WAL record. 
diff --git a/docs/multitenancy.md b/docs/multitenancy.md index 4f1d45e970..c697ae93cd 100644 --- a/docs/multitenancy.md +++ b/docs/multitenancy.md @@ -6,7 +6,7 @@ Zenith supports multitenancy. One pageserver can serve multiple tenants at once. ### Tenants in other commands -By default during `zenith init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct arugment `--tenantid=` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants. +By default during `zenith init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenantid=` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants. Examples for cli: diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md index 5cac377272..6e756df4bf 100644 --- a/docs/rfcs/002-storage.md +++ b/docs/rfcs/002-storage.md @@ -111,13 +111,13 @@ Since we are storing page diffs of variable sizes there is no structural depende ### **Chunk metadata** -Chunk metadata is a file lies in chunk directory that stores info about current snapshots and PITR regions. Chunck should always consult this data when merging SSTables and applying delete markers. +Chunk metadata is a file lies in chunk directory that stores info about current snapshots and PITR regions. Chunk should always consult this data when merging SSTables and applying delete markers. ### **Chunk splitting** *(NB: following paragraph is about how to avoid page splitting)* -When chunks hits some soft storage limit (let's say 100Gb) it should be split in half and global matadata about chunk boundaries should be updated. Here i assume that chunk split is a local operation happening on single node. Process of chink splitting should look like following: +When chunks hits some soft storage limit (let's say 100Gb) it should be split in half and global metadata about chunk boundaries should be updated. Here i assume that chunk split is a local operation happening on single node. Process of chink splitting should look like following: 1. Find separation key and spawn two new chunks with [lo, mid) [mid, hi) boundaries. @@ -166,7 +166,7 @@ Multi-tenant storage makes sense even on a laptop, when you work with different Few databases are stored in one chunk, replicated three times -- When database can't fit into one storage node it can occupy lots of chunks that were split while database was growing. Chunk placement on nodes is controlled by us with some automatization, but we alway may manually move chunks around the cluster. +- When database can't fit into one storage node it can occupy lots of chunks that were split while database was growing. Chunk placement on nodes is controlled by us with some automatization, but we always may manually move chunks around the cluster. Screenshot_2021-02-22_at_16 49 10 diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md index 4d1f0a68f0..8520249bf1 100644 --- a/docs/rfcs/003-laptop-cli.md +++ b/docs/rfcs/003-laptop-cli.md @@ -123,7 +123,7 @@ Show currently attached storages. 
For example: > zenith storage list NAME USED TYPE OPTIONS PATH local 5.1G zenith-local /opt/zenith/store/local -local.compr 20.4G zenith-local comression=on /opt/zenith/store/local.compr +local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr zcloud 60G zenith-remote zenith.tech/stas/mystore s3tank 80G S3 ``` diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md index 7b078e9ec0..e36d0a9ae3 100644 --- a/docs/rfcs/005-zenith_local.md +++ b/docs/rfcs/005-zenith_local.md @@ -31,7 +31,7 @@ Ideally, just one binary that incorporates all elements we need. #### Components: -- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responces to show them in a user-friendly way. +- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md index a04536922a..84dc932211 100644 --- a/docs/rfcs/006-laptop-cli-v2-CLI.md +++ b/docs/rfcs/006-laptop-cli-v2-CLI.md @@ -25,9 +25,9 @@ To make changes in the catalog you need to run compute nodes zenith start /home/pipedpiper/northwind:main -- starts a compute instance zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start anothe compute instance (on different port) +zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind: --port 8009 -- start anothe compute instance (on different port) +zenith start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) -- After running some DML you can run -- zenith status and see how there are two WAL streams one on top of diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md index 11ded3a724..0139569721 100644 --- a/docs/rfcs/009-snapshot-first-storage-cli.md +++ b/docs/rfcs/009-snapshot-first-storage-cli.md @@ -4,7 +4,7 @@ We may think about backups as snapshots in a different format (i.e plain pgdata Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postges to zenith. -So here is an attemt to design consistent CLI for diferent usage scenarios: +So here is an attempt to design consistent CLI for different usage scenarios: #### 1. Start empty pageserver. That is what we have now. diff --git a/docs/rfcs/009-snapshot-first-storage-pitr.md b/docs/rfcs/009-snapshot-first-storage-pitr.md index 801613e2c9..a4d978324b 100644 --- a/docs/rfcs/009-snapshot-first-storage-pitr.md +++ b/docs/rfcs/009-snapshot-first-storage-pitr.md @@ -3,7 +3,7 @@ GetPage@LSN can be called with older LSNs, and the page server needs to be able to reconstruct older page versions. That's needed for having read-only replicas that lag behind the primary, or that are -"anchored" at an older LSN, and internally in the page server whne you +"anchored" at an older LSN, and internally in the page server when you branch at an older point in time. How do you do that? 
For now, I'm not considering incremental snapshots at all. I don't diff --git a/docs/rfcs/010-storage_details.md b/docs/rfcs/010-storage_details.md index 8429a2d9e3..5c279b7dc8 100644 --- a/docs/rfcs/010-storage_details.md +++ b/docs/rfcs/010-storage_details.md @@ -123,7 +123,7 @@ As far as I understand Bookfile/Aversion addresses versioning and serialization As for exact data that should go to snapshots I think it is the following for each snapshot: * format version number -* set of key/values to interpret content (e.g. is page compression enabled, is that a full or incremental snapshot, previous snapshot id, is there WAL at the end on file, etc) -- it is up to a reader to decide what to do if some keys are missing or some unknow key are present. If we add something backward compatible to the file we can keep the version number. +* set of key/values to interpret content (e.g. is page compression enabled, is that a full or incremental snapshot, previous snapshot id, is there WAL at the end on file, etc) -- it is up to a reader to decide what to do if some keys are missing or some unknown key are present. If we add something backward compatible to the file we can keep the version number. * array of [BuffTag, corresponding offset in file] for pages -- IIUC that is analogous to ToC in Bookfile * array of [(BuffTag, LSN), corresponding offset in file] for the WAL records * pages, one by one diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md index 0c359028ed..7fe505456d 100644 --- a/docs/rfcs/013-term-history.md +++ b/docs/rfcs/013-term-history.md @@ -13,7 +13,7 @@ https://github.com/zenithdb/rfcs/pull/3/files This makes our biggest our difference from Raft. In Raft, every log record is -stamped with term in which it was generated; while we essentialy store in +stamped with term in which it was generated; while we essentially store in `epoch` only the term of the highest record on this safekeeper -- when we know it -- because during recovery generally we don't, and `epoch` is bumped directly to the term of the proposer who performs the recovery when it is finished. It is diff --git a/docs/rfcs/015-storage-messaging.md b/docs/rfcs/015-storage-messaging.md index 47bc9eb89c..a415b90459 100644 --- a/docs/rfcs/015-storage-messaging.md +++ b/docs/rfcs/015-storage-messaging.md @@ -124,7 +124,7 @@ Each storage node can subscribe to the relevant sets of keys and maintain a loca ### Safekeeper address discovery -During the startup safekeeper should publish the address he is listening on as the part of `{"sk_#{sk_id}" => ip_address}`. Then the pageserver can resolve `sk_#{sk_id}` to the actual address. This way it would work both locally and in the cloud setup. Safekeeper should have `--advertised-address` CLI option so that we can listen on e.g. 0.0.0.0 but advertize something more useful. +During the startup safekeeper should publish the address he is listening on as the part of `{"sk_#{sk_id}" => ip_address}`. Then the pageserver can resolve `sk_#{sk_id}` to the actual address. This way it would work both locally and in the cloud setup. Safekeeper should have `--advertised-address` CLI option so that we can listen on e.g. 0.0.0.0 but advertise something more useful. 
### Safekeeper behavior @@ -195,7 +195,7 @@ sequenceDiagram PS1->>SK1: start replication ``` -#### Behavour of services during typical operations +#### Behaviour of services during typical operations ```mermaid sequenceDiagram @@ -250,7 +250,7 @@ sequenceDiagram PS2->>M: Register downloaded timeline PS2->>M: Get safekeepers for timeline, subscribe to changes PS2->>SK1: Start replication to catch up - note over O: PS2 catched up, time to switch compute + note over O: PS2 caught up, time to switch compute O->>C: Restart compute with new pageserver url in config note over C: Wal push is restarted loop request pages diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md index fdf6885929..f7b0b3a587 100644 --- a/docs/rfcs/README.md +++ b/docs/rfcs/README.md @@ -49,7 +49,7 @@ topics. RFC lifecycle: -- Should be submitted in a pull request with and full RFC text in a commited markdown file and copy of the Summary and Motivation sections also included in the PR body. +- Should be submitted in a pull request with and full RFC text in a committed markdown file and copy of the Summary and Motivation sections also included in the PR body. - RFC should be published for review before most of the actual code is written. This isn’t a strict rule, don’t hesitate to experiment and build a POC in parallel with writing an RFC. - Add labels to the PR in the same manner as you do Issues. Example TBD - Request the review from your peers. Reviewing the RFCs from your peers is a priority, same as reviewing the actual code. diff --git a/docs/settings.md b/docs/settings.md index 9564ef626f..7773dbf17f 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -105,7 +105,7 @@ Interval at which garbage collection is triggered. Default is 100 s. #### image_creation_threshold -L0 delta layer threshold for L1 iamge layer creation. Default is 3. +L0 delta layer threshold for L1 image layer creation. Default is 3. #### pitr_interval diff --git a/docs/sourcetree.md b/docs/sourcetree.md index c8d4baff62..5384d334df 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -10,7 +10,7 @@ Intended to be used in integration tests and in CLI tools for local installation `/docs`: -Documentaion of the Zenith features and concepts. +Documentation of the Zenith features and concepts. Now it is mostly dev documentation. `/monitoring`: @@ -92,7 +92,7 @@ A single virtual environment with all dependencies is described in the single `P ### Prerequisites - Install Python 3.9 (the minimal supported version) or greater. - - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesnt work as expected. + - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesn't work as expected. - If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.: ```bash # In Ubuntu diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index 95ea9660e8..91542d268f 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -73,7 +73,7 @@ impl WalStreamDecoder { /// Returns one of the following: /// Ok((Lsn, Bytes)): a tuple containing the LSN of next record, and the record itself /// Ok(None): there is not enough data in the input buffer. 
Feed more by calling the `feed_bytes` function - /// Err(WalDecodeError): an error occured while decoding, meaning the input was invalid. + /// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid. /// pub fn poll_decode(&mut self) -> Result, WalDecodeError> { let recordbuf; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 32a3022c5a..67541d844e 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -531,7 +531,7 @@ impl CheckPoint { /// /// Returns 'true' if the XID was updated. pub fn update_next_xid(&mut self, xid: u32) -> bool { - // nextXid should nw greate than any XID in WAL, so increment provided XID and check for wraparround. + // nextXid should nw greater than any XID in WAL, so increment provided XID and check for wraparround. let mut new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID); // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL. // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 063d69557d..70f54ea02f 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -71,7 +71,7 @@ impl From for SerializeError { /// - Fixed integer encoding (i.e. 1u32 is 00000001 not 01) /// /// Does not allow trailing bytes in deserialization. If this is desired, you -/// may set [`Options::allow_trailing_bytes`] to explicitly accomodate this. +/// may set [`Options::allow_trailing_bytes`] to explicitly accommodate this. pub fn be_coder() -> impl Options { bincode::DefaultOptions::new() .with_big_endian() @@ -85,7 +85,7 @@ pub fn be_coder() -> impl Options { /// - Fixed integer encoding (i.e. 1u32 is 00000001 not 01) /// /// Does not allow trailing bytes in deserialization. If this is desired, you -/// may set [`Options::allow_trailing_bytes`] to explicitly accomodate this. +/// may set [`Options::allow_trailing_bytes`] to explicitly accommodate this. pub fn le_coder() -> impl Options { bincode::DefaultOptions::new() .with_little_endian() diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 15d4c7a81e..1b011bb73a 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -64,7 +64,7 @@ pub mod signals; /// One thing to note is that .git is not available in docker (and it is bad to include it there). /// So everything becides docker build is covered by git_version crate, and docker uses a `GIT_VERSION` argument to get the value required. /// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro. -/// Git version received from environment variable used as a fallback in git_version invokation. +/// Git version received from environment variable used as a fallback in git_version invocation. /// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option. /// So the build script will be run only when GIT_VERSION envvar has changed. /// diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 857df0ec84..5fdb1ff9d2 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -475,7 +475,7 @@ impl PostgresBackend { self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; } // NOTE there is no ReadyForQuery message. 
This handler is used - // for basebackup and it uses CopyOut which doesnt require + // for basebackup and it uses CopyOut which doesn't require // ReadyForQuery message and backend just switches back to // processing mode after sending CopyDone or ErrorResponse. } diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index ce86cf8c91..a36e8342b0 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -464,7 +464,7 @@ impl BeParameterStatusMessage<'static> { } } -// One row desciption in RowDescription packet. +// One row description in RowDescription packet. #[derive(Debug)] pub struct RowDescriptor<'a> { pub name: &'a [u8], @@ -613,7 +613,7 @@ fn cstr_to_str(b: &Bytes) -> Result<&str> { impl<'a> BeMessage<'a> { /// Write message to the given buf. // Unlike the reading side, we use BytesMut - // here as msg len preceeds its body and it is handy to write it down first + // here as msg len precedes its body and it is handy to write it down first // and then fill the length. With Write we would have to either calc it // manually or have one more buffer. pub fn write(buf: &mut BytesMut, message: &BeMessage) -> io::Result<()> { @@ -1047,7 +1047,7 @@ mod tests { #[test] fn test_zenithfeedback_serialization() { let mut zf = ZenithFeedback::empty(); - // Fill zf wih some values + // Fill zf with some values zf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. @@ -1062,7 +1062,7 @@ mod tests { #[test] fn test_zenithfeedback_unknown_key() { let mut zf = ZenithFeedback::empty(); - // Fill zf wih some values + // Fill zf with some values zf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index dc9d7161a2..8add7b8b8f 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -114,7 +114,7 @@ pub struct PageServerConf { pub default_tenant_conf: TenantConf, /// A prefix to add in etcd brokers before every key. - /// Can be used for isolating different pageserver groups withing the same etcd cluster. + /// Can be used for isolating different pageserver groups within the same etcd cluster. pub broker_etcd_prefix: String, /// Etcd broker endpoints to connect to. diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs index f6f0d7b7cf..da213704f3 100644 --- a/pageserver/src/keyspace.rs +++ b/pageserver/src/keyspace.rs @@ -15,7 +15,7 @@ pub struct KeySpace { impl KeySpace { /// /// Partition a key space into roughly chunks of roughly 'target_size' bytes - /// in each patition. + /// in each partition. /// pub fn partition(&self, target_size: u64) -> KeyPartitioning { // Assume that each value is 8k in size. diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 0d7c6f54c8..c13407a14b 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -823,7 +823,7 @@ impl LayeredRepository { for (timeline_id, timeline_entry) in timelines.iter() { timeline_ids.push(*timeline_id); - // This is unresolved question for now, how to do gc in presense of remote timelines + // This is unresolved question for now, how to do gc in presence of remote timelines // especially when this is combined with branching. 
// Somewhat related: https://github.com/zenithdb/zenith/issues/999 if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() { @@ -1831,7 +1831,7 @@ impl LayeredTimeline { // collect any page versions that are no longer needed because // of the new image layers we created in step 2. // - // TODO: This hight level strategy hasn't been implemented yet. + // TODO: This high level strategy hasn't been implemented yet. // Below are functions compact_level0() and create_image_layers() // but they are a bit ad hoc and don't quite work like it's explained // above. Rewrite it. @@ -2268,7 +2268,7 @@ impl LayeredTimeline { } // 3. Is it needed by a child branch? - // NOTE With that wee would keep data that + // NOTE With that we would keep data that // might be referenced by child branches forever. // We can track this in child timeline GC and delete parent layers when // they are no longer needed. This might be complicated with long inheritance chains. diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/layered_repository/disk_btree.rs index 0c9ad75048..5f9ed8bbea 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/layered_repository/disk_btree.rs @@ -7,7 +7,7 @@ //! - Fixed-width keys //! - Fixed-width values (VALUE_SZ) //! - The tree is created in a bulk operation. Insert/deletion after creation -//! is not suppported +//! is not supported //! - page-oriented //! //! TODO: diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1c07b63072..4f0fca4797 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -634,7 +634,7 @@ impl PageServerHandler { return Ok(()); } // auth is some, just checked above, when auth is some - // then claims are always present because of checks during connetion init + // then claims are always present because of checks during connection init // so this expect won't trigger let claims = self .claims diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c052aa3d69..626ed1b0f1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -521,7 +521,7 @@ pub struct DatadirModification<'a, R: Repository> { lsn: Lsn, - // The modifications are not applied directly to the underyling key-value store. + // The modifications are not applied directly to the underlying key-value store. // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. pending_updates: HashMap, diff --git a/pageserver/src/remote_storage/storage_sync/delete.rs b/pageserver/src/remote_storage/storage_sync/delete.rs index 00e7c85e35..6fb1d254c4 100644 --- a/pageserver/src/remote_storage/storage_sync/delete.rs +++ b/pageserver/src/remote_storage/storage_sync/delete.rs @@ -1,4 +1,4 @@ -//! Timeline synchrnonization logic to delete a bulk of timeline's remote files from the remote storage. +//! Timeline synchronization logic to delete a bulk of timeline's remote files from the remote storage. use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index d25dc8914d..5bf128e66b 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -19,7 +19,7 @@ use utils::{ #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] /// Key used in the Repository kv-store. 
/// -/// The Repository treates this as an opaque struct, but see the code in pgdatadir_mapping.rs +/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs /// for what we actually store in these fields. pub struct Key { pub field1: u8, @@ -210,7 +210,7 @@ pub trait Repository: Send + Sync { ) -> Result<()>; /// Get Timeline handle for given zenith timeline ID. - /// This function is idempotent. It doesnt change internal state in any way. + /// This function is idempotent. It doesn't change internal state in any way. fn get_timeline(&self, timelineid: ZTimelineId) -> Option>; /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded. @@ -345,11 +345,11 @@ pub trait Timeline: Send + Sync { /// Look up given page version. /// - /// NOTE: It is considerd an error to 'get' a key that doesn't exist. The abstraction + /// NOTE: It is considered an error to 'get' a key that doesn't exist. The abstraction /// above this needs to store suitable metadata to track what data exists with /// what keys, in separate metadata entries. If a non-existent key is requested, - /// the Repository implementation may incorrectly return a value from an ancestore - /// branch, for exampel, or waste a lot of cycles chasing the non-existing key. + /// the Repository implementation may incorrectly return a value from an ancestor + /// branch, for example, or waste a lot of cycles chasing the non-existing key. /// fn get(&self, key: Key, lsn: Lsn) -> Result; diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index bbebcd1f36..1c33d8315c 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -69,7 +69,7 @@ //! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexPart`], containing the list of remote files. //! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download. //! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`], -//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrive its shard contents, if needed, same as any layer files. +//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrieve its shard contents, if needed, same as any layer files. //! //! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed. //! Bulk index data download happens only initially, on pageserver startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only, @@ -96,7 +96,7 @@ //! timeline uploads and downloads can happen concurrently, in no particular order due to incremental nature of the timeline layers. //! Deletion happens only after a successful upload only, otherwise the compaction output might make the timeline inconsistent until both tasks are fully processed without errors. //! Upload and download update the remote data (inmemory index and S3 json index part file) only after every layer is successfully synchronized, while the deletion task -//! does otherwise: it requires to have the remote data updated first succesfully: blob files will be invisible to pageserver this way. +//! 
does otherwise: it requires to have the remote data updated first successfully: blob files will be invisible to pageserver this way. //! //! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via downloading and merging the index data for all timelines, //! present locally. @@ -440,7 +440,7 @@ fn collect_timeline_files( // initial collect will fail because there is no metadata. // We either need to start download if we see empty dir after restart or attach caller should // be aware of that and retry attach if awaits_download for timeline switched from true to false - // but timelinne didnt appear locally. + // but timelinne didn't appear locally. // Check what happens with remote index in that case. let timeline_metadata_path = match timeline_metadata_path { Some(path) => path, @@ -1007,7 +1007,7 @@ where // in local (implicitly, via Lsn values and related memory state) or remote (explicitly via remote layer file paths) metadata. // When operating in a system without tasks failing over the error threshold, // current batching and task processing systems aim to update the layer set and metadata files (remote and local), - // without "loosing" such layer files. + // without "losing" such layer files. let (upload_result, status_update) = tokio::join!( async { if let Some(upload_data) = upload_data { @@ -1162,7 +1162,7 @@ where return Some(TimelineSyncStatusUpdate::Downloaded); } Err(e) => { - error!("Timeline {sync_id} was expected to be in the remote index after a sucessful download, but it's absent: {e:?}"); + error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}"); } }, Err(e) => { @@ -1549,10 +1549,10 @@ fn compare_local_and_remote_timeline( let remote_files = remote_entry.stored_files(); // TODO probably here we need more sophisticated logic, - // if more data is available remotely can we just download whats there? + // if more data is available remotely can we just download what's there? // without trying to upload something. It may be tricky, needs further investigation. // For now looks strange that we can request upload - // and dowload for the same timeline simultaneously. + // and download for the same timeline simultaneously. // (upload needs to be only for previously unsynced files, not whole timeline dir). // If one of the tasks fails they will be reordered in the queue which can lead // to timeline being stuck in evicted state @@ -1565,7 +1565,7 @@ fn compare_local_and_remote_timeline( }), )); (LocalTimelineInitStatus::NeedsSync, true) - // we do not need to manupulate with remote consistent lsn here + // we do not need to manipulate with remote consistent lsn here // because it will be updated when sync will be completed } else { (LocalTimelineInitStatus::LocallyComplete, false) diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 91c618d201..0dcd9c97fc 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -1,4 +1,4 @@ -//! Timeline synchrnonization logic to delete a bulk of timeline's remote files from the remote storage. +//! Timeline synchronization logic to delete a bulk of timeline's remote files from the remote storage. 
use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index a28867f27e..99ccf27e1c 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -1,4 +1,4 @@ -//! Timeline synchrnonization logic to fetch the layer files from remote storage into pageserver's local directory. +//! Timeline synchronization logic to fetch the layer files from remote storage into pageserver's local directory. use std::{collections::HashSet, fmt::Debug, path::Path}; diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 7764a810bc..2ba48ddf53 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -273,7 +273,7 @@ mod tests { }; let index_part = IndexPart::from_remote_timeline(&timeline_path, remote_timeline.clone()) - .expect("Correct remote timeline should be convertable to index part"); + .expect("Correct remote timeline should be convertible to index part"); assert_eq!( index_part.timeline_layers.iter().collect::>(), @@ -305,7 +305,7 @@ mod tests { ); let restored_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part) - .expect("Correct index part should be convertable to remote timeline"); + .expect("Correct index part should be convertible to remote timeline"); let original_metadata = &remote_timeline.metadata; let restored_metadata = &restored_timeline.metadata; diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 625ec7aed6..2f88fa95ba 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -391,7 +391,7 @@ mod tests { assert_eq!( upload.metadata, Some(metadata), - "Successful upload should not chage its metadata" + "Successful upload should not change its metadata" ); let storage_files = storage.list().await?; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 37d70372b5..a16e772238 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -336,7 +336,7 @@ impl VirtualFile { // library RwLock doesn't allow downgrading without releasing the lock, // and that doesn't seem worth the trouble. // - // XXX: `parking_lot::RwLock` can enable such downgrades, yet its implemenation is fair and + // XXX: `parking_lot::RwLock` can enable such downgrades, yet its implementation is fair and // may deadlock on subsequent read calls. // Simply replacing all `RwLock` in project causes deadlocks, so use it sparingly. let result = STORAGE_IO_TIME diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 5223125ce6..2f39007e9f 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -12,7 +12,7 @@ //! The zenith Repository can store page versions in two formats: as //! page images, or a WAL records. WalIngest::ingest_record() extracts //! page images out of some WAL records, but most it stores as WAL -//! records. If a WAL record modifies multple pages, WalIngest +//! records. If a WAL record modifies multiple pages, WalIngest //! will call Repository::put_wal_record or put_page_image functions //! separately for each modified page. //! diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index e556c24548..edfd36f51a 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -122,7 +122,7 @@ lazy_static! 
{ /// /// This is the real implementation that uses a Postgres process to -/// perform WAL replay. Only one thread can use the processs at a time, +/// perform WAL replay. Only one thread can use the process at a time, /// that is controlled by the Mutex. In the future, we might want to /// launch a pool of processes to allow concurrent replay of multiple /// records. @@ -134,7 +134,7 @@ pub struct PostgresRedoManager { process: Mutex>, } -/// Can this request be served by zenith redo funcitons +/// Can this request be served by zenith redo functions /// or we need to pass it to wal-redo postgres process? fn can_apply_in_zenith(rec: &ZenithWalRecord) -> bool { // Currently, we don't have bespoken Rust code to replay any diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 642e50c2c1..0e3e17359e 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -95,7 +95,7 @@ async fn handle_client( /// Establish a (most probably, secure) connection with the client. /// For better testing experience, `stream` can be any object satisfying the traits. -/// It's easier to work with owned `stream` here as we need to updgrade it to TLS; +/// It's easier to work with owned `stream` here as we need to upgrade it to TLS; /// we also take an extra care of propagating only the select handshake errors to client. async fn handshake( stream: S, diff --git a/safekeeper/README.md b/safekeeper/README.md index a4bb260932..7b217ddbec 100644 --- a/safekeeper/README.md +++ b/safekeeper/README.md @@ -75,7 +75,7 @@ safekeepers. The Paxos and crash recovery algorithm ensures that only one primary node can be actively streaming WAL to the quorum of safekeepers. -See README_PROTO.md for a more detailed desription of the consensus +See README_PROTO.md for a more detailed description of the consensus protocol. spec/ contains TLA+ specification of it. # Q&A diff --git a/safekeeper/README_PROTO.md b/safekeeper/README_PROTO.md index 6b2ae50254..7f3da3563a 100644 --- a/safekeeper/README_PROTO.md +++ b/safekeeper/README_PROTO.md @@ -143,7 +143,7 @@ Restart of PostgreSQL initiates new round of voting and switching new epoch. ## Limitations Right now message queue is maintained in main memory and is not spilled to the disk. It can cause memory overflow in case of presence of lagging safekeepers. -It is assumed that in case of loosing local data by some safekeepers, it should be recovered using some external mechanism. +It is assumed that in case of losing local data by some safekeepers, it should be recovered using some external mechanism. ## Glossary @@ -153,7 +153,7 @@ It is assumed that in case of loosing local data by some safekeepers, it should * `NodeID`: pair (term,UUID) * `Pager`: Neon component restoring pages from WAL stream * `Replica`: read-only computatio node -* `VCL`: the largerst LSN for which we can guarantee availablity of all prior records. +* `VCL`: the largerst LSN for which we can guarantee availability of all prior records. 
## Algorithm diff --git a/safekeeper/spec/ProposerAcceptorConsensus.tla b/safekeeper/spec/ProposerAcceptorConsensus.tla index 993edfcf23..e5f0bb270f 100644 --- a/safekeeper/spec/ProposerAcceptorConsensus.tla +++ b/safekeeper/spec/ProposerAcceptorConsensus.tla @@ -88,7 +88,7 @@ TypeOk == \* in campaign proposer sends RequestVote and waits for acks; \* in leader he is elected /\ prop_state[p].state \in {"campaign", "leader"} - \* 0..max_term should be actually Nat in the unbouned model, but TLC won't + \* 0..max_term should be actually Nat in the unbounded model, but TLC won't \* swallow it /\ prop_state[p].term \in 0..max_term \* votes received diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index a7628482d9..e792a854d5 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -100,7 +100,7 @@ fn main() -> anyhow::Result<()> { Arg::new("dump-control-file") .long("dump-control-file") .takes_value(true) - .help("Dump control file at path specifed by this argument and exit"), + .help("Dump control file at path specified by this argument and exit"), ) .arg( Arg::new("id").long("id").takes_value(true).help("safekeeper node id: integer") diff --git a/safekeeper/src/callmemaybe.rs b/safekeeper/src/callmemaybe.rs index 8c3fbe26ba..53d38c5e25 100644 --- a/safekeeper/src/callmemaybe.rs +++ b/safekeeper/src/callmemaybe.rs @@ -39,7 +39,7 @@ async fn request_callback( } }); - // use Config parsing because SockAddr parsing doesnt allow to use host names instead of ip addresses + // use Config parsing because SockAddr parsing doesn't allow to use host names instead of ip addresses let me_connstr = format!("postgresql://no_user@{}/no_db", listen_pg_addr_str); let me_conf: postgres::config::Config = me_connstr.parse().unwrap(); let (host, port) = connection_host_port(&me_conf); diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 8d36472540..e1740cdcbf 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -27,7 +27,7 @@ struct SafeKeeperStateV1 { acceptor_state: AcceptorStateV1, /// information about server server: ServerInfoV2, - /// Unique id of the last *elected* proposer we dealed with. Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. proposer_uuid: PgUuid, /// part of WAL acknowledged by quorum and available locally @@ -57,7 +57,7 @@ pub struct SafeKeeperStateV2 { pub acceptor_state: AcceptorState, /// information about server pub server: ServerInfoV2, - /// Unique id of the last *elected* proposer we dealed with. Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. pub proposer_uuid: PgUuid, /// part of WAL acknowledged by quorum and available locally @@ -89,7 +89,7 @@ pub struct SafeKeeperStateV3 { pub acceptor_state: AcceptorState, /// information about server pub server: ServerInfoV3, - /// Unique id of the last *elected* proposer we dealed with. Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. #[serde(with = "hex")] pub proposer_uuid: PgUuid, @@ -114,7 +114,7 @@ pub struct SafeKeeperStateV4 { pub acceptor_state: AcceptorState, /// information about server pub server: ServerInfo, - /// Unique id of the last *elected* proposer we dealed with. 
Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. #[serde(with = "hex")] pub proposer_uuid: PgUuid, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index c254f2c57c..df4b202063 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -180,7 +180,7 @@ pub struct SafeKeeperState { pub acceptor_state: AcceptorState, /// information about server pub server: ServerInfo, - /// Unique id of the last *elected* proposer we dealed with. Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. #[serde(with = "hex")] pub proposer_uuid: PgUuid, @@ -759,7 +759,7 @@ where self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn); self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); - // Initalizing backup_lsn is useful to avoid making backup think it should upload 0 segment. + // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); state.acceptor_state.term_history = msg.term_history.clone(); diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 83dc312d28..a4b779649d 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -257,7 +257,7 @@ impl WalBackupTask { // Optimization idea for later: // Avoid checking election leader every time by returning current lease grant expiration time // Re-check leadership only after expiration time, - // such approach woud reduce overhead on write-intensive workloads + // such approach would reduce overhead on write-intensive workloads match l .check_am_i( @@ -389,7 +389,7 @@ async fn backup_object(source_file: &Path, size: usize) -> Result<()> { let file = File::open(&source_file).await?; - // Storage is initialized by launcher at ths point. + // Storage is initialized by launcher at this point. match storage.as_ref().unwrap() { GenericRemoteStorage::Local(local_storage) => { let destination = local_storage.remote_object_id(source_file)?; diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 503bd7c543..7285cedc03 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -126,7 +126,7 @@ pub struct PhysicalStorage { conf: SafeKeeperConf, // fields below are filled upon initialization - /// None if unitialized, Some(usize) if storage is initialized. + /// None if uninitialized, Some(usize) if storage is initialized. wal_seg_size: Option, /// Written to disk, but possibly still in the cache and not fully persisted. 
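The comment above distinguishes data that has been written to the file from data that has been fully persisted; that distinction is the usual write/fsync split. A minimal illustration with plain `std::fs` (a sketch, not the PhysicalStorage code itself):

```rust
use std::fs::OpenOptions;
use std::io::{self, Write};

/// Append a WAL chunk; only sync it to disk when `fsync` is requested.
fn append_wal(path: &str, buf: &[u8], fsync: bool) -> io::Result<()> {
    let mut file = OpenOptions::new().create(true).append(true).open(path)?;
    file.write_all(buf)?;    // data may still sit in the OS page cache
    if fsync {
        file.sync_data()?;   // now it is actually durable on disk
    }
    Ok(())
}
```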
@@ -456,7 +456,7 @@ impl Storage for PhysicalStorage { segno += 1; let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; - // TODO: better use fs::try_exists which is currenty avaialble only in nightly build + // TODO: better use fs::try_exists which is currently available only in nightly build if wal_file_path.exists() { fs::remove_file(&wal_file_path)?; } else if wal_file_partial_path.exists() { diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py index b7eeedb23e..1a49a4582e 100644 --- a/test_runner/batch_others/test_clog_truncate.py +++ b/test_runner/batch_others/test_clog_truncate.py @@ -14,7 +14,7 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv): env = zenith_simple_env env.zenith_cli.create_branch('test_clog_truncate', 'empty') - # set agressive autovacuum to make sure that truncation will happen + # set aggressive autovacuum to make sure that truncation will happen config = [ 'autovacuum_max_workers=10', 'autovacuum_vacuum_threshold=0', diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/batch_others/test_pitr_gc.py index ee19bddfe8..a5149f7ad9 100644 --- a/test_runner/batch_others/test_pitr_gc.py +++ b/test_runner/batch_others/test_pitr_gc.py @@ -55,7 +55,7 @@ def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder): with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: pscur.execute(f"compact {env.initial_tenant.hex} {timeline}") - # perform agressive GC. Data still should be kept because of the PITR setting. + # perform aggressive GC. Data still should be kept because of the PITR setting. pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") row = pscur.fetchone() print_gc_result(row) diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index afbe3c55c7..864cccf736 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -116,7 +116,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) assert detail['local'] is not None log.info("Timeline detail after attach completed: %s", detail) - assert lsn_from_hex(detail['local']['last_record_lsn']) >= current_lsn, 'current db Lsn should shoud not be less than the one stored on remote storage' + assert lsn_from_hex(detail['local']['last_record_lsn']) >= current_lsn, 'current db Lsn should should not be less than the one stored on remote storage' assert not detail['remote']['awaits_download'] pg = env.postgres.create_start('main') diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 91506e120d..8ecc731ae9 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -92,7 +92,7 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve # if we recovered after failure verify that we have correct number of rows log.info("recovering at %s", inserted_ctr) cur.execute("SELECT count(*) FROM load") - # it seems that sometimes transaction gets commited before we can acknowledge + # it seems that sometimes transaction gets committed before we can acknowledge # the result, so sometimes selected value is larger by one than we expect assert cur.fetchone()[0] - inserted_ctr <= 1 
log.info("successfully recovered %s", inserted_ctr) diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/batch_others/test_vm_bits.py index 49e48dd450..98854111f6 100644 --- a/test_runner/batch_others/test_vm_bits.py +++ b/test_runner/batch_others/test_vm_bits.py @@ -28,7 +28,7 @@ def test_vm_bit_clear(zenith_simple_env: ZenithEnv): cur.execute('INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g') cur.execute('VACUUM FREEZE vmtest_update') - # DELETE and UDPATE the rows. + # DELETE and UPDATE the rows. cur.execute('DELETE FROM vmtest_delete WHERE id = 1') cur.execute('UPDATE vmtest_update SET id = 5000 WHERE id = 1') diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index fc192c28e8..8837725b84 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -905,8 +905,8 @@ def test_delete_force(zenith_env_builder: ZenithEnvBuilder): # Create two tenants: one will be deleted, other should be preserved. tenant_id = env.initial_tenant.hex - timeline_id_1 = env.zenith_cli.create_branch('br1').hex # Acive, delete explicitly - timeline_id_2 = env.zenith_cli.create_branch('br2').hex # Inactive, delete explictly + timeline_id_1 = env.zenith_cli.create_branch('br1').hex # Active, delete explicitly + timeline_id_2 = env.zenith_cli.create_branch('br2').hex # Inactive, delete explicitly timeline_id_3 = env.zenith_cli.create_branch('br3').hex # Active, delete with the tenant timeline_id_4 = env.zenith_cli.create_branch('br4').hex # Inactive, delete with the tenant diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 5fc6076f51..75fece6818 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -206,7 +206,7 @@ class ZenithBenchmarker: f"{prefix}.number_of_transactions_actually_processed", pg_bench_result.number_of_transactions_actually_processed, '', - # thats because this is predefined by test matrix and doesnt change across runs + # that's because this is predefined by test matrix and doesn't change across runs report=MetricReport.TEST_PARAM, ) self.record(f"{prefix}.latency_average", @@ -302,7 +302,7 @@ def pytest_addoption(parser): parser.addoption( "--out-dir", dest="out_dir", - help="Directory to ouput performance tests results to.", + help="Directory to output performance tests results to.", ) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index a2e8c82d30..8d9a4ccd85 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -75,7 +75,7 @@ def pytest_addoption(parser): "--skip-interfering-proc-check", dest="skip_interfering_proc_check", action="store_true", - help="skip check for interferring processes", + help="skip check for interfering processes", ) @@ -88,7 +88,7 @@ top_output_dir = "" def check_interferring_processes(config): if config.getoption("skip_interfering_proc_check"): - warnings.warn("interferring process check is skipped") + warnings.warn("interfering process check is skipped") return # does not use -c as it is not supported on macOS @@ -107,7 +107,7 @@ def check_interferring_processes(config): def pytest_configure(config): """ Ensure that no unwanted daemons are running before we start testing. - Check that we do not owerflow available ports range. + Check that we do not overflow available ports range. 
""" check_interferring_processes(config) @@ -1417,7 +1417,7 @@ class RemotePostgres(PgProtocol): raise Exception('cannot stop a remote Postgres instance') def get_subdir_size(self, subdir) -> int: - # TODO: Could use the server's Generic File Acccess functions if superuser. + # TODO: Could use the server's Generic File Access functions if superuser. # See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE raise Exception('cannot get size of a Postgres instance') From 4b4d3073b8c479b4bcb1bec4681120c2f49065da Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 28 May 2022 14:30:59 +0300 Subject: [PATCH 0367/1022] Fix misc typos --- control_plane/src/etcd.rs | 2 +- docs/glossary.md | 2 +- pageserver/src/layered_repository/disk_btree.rs | 4 ++-- safekeeper/README_PROTO.md | 2 +- test_runner/fixtures/zenith_fixtures.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs index bc39b7dea3..0123d9c491 100644 --- a/control_plane/src/etcd.rs +++ b/control_plane/src/etcd.rs @@ -77,7 +77,7 @@ pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { let etcd_pid_file_path = etcd_pid_file_path(env); let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| { format!( - "Failed to read etcd pid filea at {}", + "Failed to read etcd pid file at {}", etcd_pid_file_path.display() ) })?); diff --git a/docs/glossary.md b/docs/glossary.md index 0de0eea1cb..a5bb154793 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -115,7 +115,7 @@ Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/RE * `CommitLSN`: position in WAL confirmed by quorum safekeepers. * `RestartLSN`: position in WAL confirmed by all safekeepers. * `FlushLSN`: part of WAL persisted to the disk by safekeeper. -* `VCL`: the largerst LSN for which we can guarantee availability of all prior records. +* `VCL`: the largest LSN for which we can guarantee availability of all prior records. Neon pageserver LSNs: * `last_record_lsn` - the end of last processed WAL record. diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/layered_repository/disk_btree.rs index 5f9ed8bbea..dc8d7a2ad3 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/layered_repository/disk_btree.rs @@ -498,8 +498,8 @@ where return Ok(()); } - // It did not fit. Try to compress, and it it succeeds to make some room - // on the node, try appending to it again. + // It did not fit. Try to compress, and if it succeeds to make + // some room on the node, try appending to it again. #[allow(clippy::collapsible_if)] if last.compress() { if last.push(key, value) { diff --git a/safekeeper/README_PROTO.md b/safekeeper/README_PROTO.md index 7f3da3563a..0cd1f510e6 100644 --- a/safekeeper/README_PROTO.md +++ b/safekeeper/README_PROTO.md @@ -153,7 +153,7 @@ It is assumed that in case of losing local data by some safekeepers, it should b * `NodeID`: pair (term,UUID) * `Pager`: Neon component restoring pages from WAL stream * `Replica`: read-only computatio node -* `VCL`: the largerst LSN for which we can guarantee availability of all prior records. +* `VCL`: the largest LSN for which we can guarantee availability of all prior records. 
## Algorithm diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 8d9a4ccd85..336f1f1348 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -2139,7 +2139,7 @@ def remote_consistent_lsn(pageserver_http_client: ZenithPageserverHttpClient, if detail['remote'] is None: # No remote information at all. This happens right after creating - # a timeline, before any part of it it has been uploaded to remote + # a timeline, before any part of it has been uploaded to remote # storage yet. return 0 else: From e3b320daabe4d140f7963c1ffed996128567264c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 28 May 2022 21:22:19 +0300 Subject: [PATCH 0368/1022] Remove obsolete Dockerfile.alpine It hasn't been used for anything for a long time. The comments still talked about librocksdb, which we also haven't used for a long time. --- Dockerfile.alpine | 95 ----------------------------------------------- 1 file changed, 95 deletions(-) delete mode 100644 Dockerfile.alpine diff --git a/Dockerfile.alpine b/Dockerfile.alpine deleted file mode 100644 index 0f244e4443..0000000000 --- a/Dockerfile.alpine +++ /dev/null @@ -1,95 +0,0 @@ -# -# Docker image for console integration testing. -# -# We may also reuse it in CI to unify installation process and as a general binaries building -# tool for production servers. -# -# Dynamic linking is used for librocksdb and libstdc++ because librocksdb-sys calls -# bindgen with "dynamic" feature flag. This also prevents usage of dockerhub alpine-rust -# images which are statically linked and have guards against any dlopen. I would rather -# prefer all static binaries so we may change the way librocksdb-sys builds or wait until -# we will have our own storage and drop rockdb dependency. -# -# Cargo-chef is used to separate dependencies building from main binaries building. This -# way `docker build` will download and install dependencies only of there are changes to -# out Cargo.toml files. -# - - -# -# build postgres separately -- this layer will be rebuilt only if one of -# mentioned paths will get any changes -# -FROM alpine:3.13 as pg-build -RUN apk add --update clang llvm compiler-rt compiler-rt-static lld musl-dev binutils \ - make bison flex readline-dev zlib-dev perl linux-headers libseccomp-dev -WORKDIR zenith -COPY ./vendor/postgres vendor/postgres -COPY ./Makefile Makefile -# Build using clang and lld -RUN CC='clang' LD='lld' CFLAGS='-fuse-ld=lld --rtlib=compiler-rt' make postgres -j4 - -# -# Calculate cargo dependencies. -# This will always run, but only generate recipe.json with list of dependencies without -# installing them. -# -FROM alpine:20210212 as cargo-deps-inspect -RUN apk add --update rust cargo -RUN cargo install cargo-chef -WORKDIR zenith -COPY . . -RUN cargo chef prepare --recipe-path recipe.json - -# -# Build cargo dependencies. -# This temp cantainner would be build only if recipe.json was changed. -# -FROM alpine:20210212 as deps-build -RUN apk add --update rust cargo openssl-dev clang build-base -# rust-rocksdb can be built against system-wide rocksdb -- that saves about -# 10 minutes during build. Rocksdb apk package is in testing now, but use it -# anyway. In case of any troubles we can download and build rocksdb here manually -# (to cache it as a docker layer). 
-RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev -WORKDIR zenith -COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server -COPY --from=cargo-deps-inspect /root/.cargo/bin/cargo-chef /root/.cargo/bin/ -COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json -RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json - -# -# Build zenith binaries -# -FROM alpine:20210212 as build -RUN apk add --update rust cargo openssl-dev clang build-base -RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev -WORKDIR zenith -COPY . . -# Copy cached dependencies -COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server -COPY --from=deps-build /zenith/target target -COPY --from=deps-build /root/.cargo /root/.cargo -RUN cargo build --release - -# -# Copy binaries to resulting image. -# build-base hare to provide libstdc++ (it will also bring gcc, but leave it this way until we figure -# out how to statically link rocksdb or avoid it at all). -# -FROM alpine:3.13 -RUN apk add --update openssl build-base libseccomp-dev -RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb -COPY --from=build /zenith/target/release/pageserver /usr/local/bin -COPY --from=build /zenith/target/release/safekeeper /usr/local/bin -COPY --from=build /zenith/target/release/proxy /usr/local/bin -COPY --from=pg-build /zenith/tmp_install /usr/local -COPY docker-entrypoint.sh /docker-entrypoint.sh - -RUN addgroup zenith && adduser -h /data -D -G zenith zenith -VOLUME ["/data"] -WORKDIR /data -USER zenith -EXPOSE 6400 -ENTRYPOINT ["/docker-entrypoint.sh"] -CMD ["pageserver"] From 3accde613d8a57fea149471dab22ee0d6843035e Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 18 Apr 2022 19:43:57 +0300 Subject: [PATCH 0369/1022] Rename contrib/zenith to contrib/neon. 
Rename custom GUCs: - zenith.page_server_connstring -> neon.pageserver_connstring - zenith.zenith_tenant -> neon.tenantid - zenith.zenith_timeline -> neon.timelineid - zenith.max_cluster_size -> neon.max_cluster_size --- Makefile | 10 ++++---- compute_tools/src/bin/compute_ctl.rs | 6 ++--- compute_tools/tests/cluster_spec.json | 8 +++---- compute_tools/tests/pg_helpers_tests.rs | 2 +- control_plane/src/compute.rs | 14 +++++------ docs/rfcs/cluster-size-limits.md | 8 +++---- docs/sourcetree.md | 6 ++--- libs/postgres_ffi/wal_generate/src/lib.rs | 4 ++-- pageserver/src/walredo.rs | 4 ++-- .../batch_others/test_ancestor_branch.py | 6 ++--- test_runner/batch_others/test_backpressure.py | 2 +- .../batch_others/test_branch_behind.py | 2 +- .../batch_others/test_broken_timeline.py | 2 +- .../batch_others/test_clog_truncate.py | 2 +- .../batch_others/test_gc_aggressive.py | 2 +- .../batch_others/test_old_request_lsn.py | 2 +- test_runner/batch_others/test_pitr_gc.py | 2 +- .../batch_others/test_read_validation.py | 2 +- .../batch_others/test_remote_storage.py | 4 ++-- .../batch_others/test_tenant_relocation.py | 4 ++-- .../test_tenants_with_remote_storage.py | 8 +++---- .../batch_others/test_timeline_size.py | 6 ++--- test_runner/batch_others/test_vm_bits.py | 2 +- test_runner/batch_others/test_wal_acceptor.py | 24 +++++++++---------- .../batch_others/test_wal_acceptor_async.py | 4 ++-- test_runner/batch_others/test_wal_restore.py | 2 +- test_runner/fixtures/compare_fixtures.py | 2 +- test_runner/fixtures/zenith_fixtures.py | 2 +- vendor/postgres | 2 +- 29 files changed, 72 insertions(+), 72 deletions(-) diff --git a/Makefile b/Makefile index fdfc64f6fa..e3d183eaee 100644 --- a/Makefile +++ b/Makefile @@ -74,16 +74,16 @@ postgres-headers: postgres-configure +@echo "Installing PostgreSQL headers" $(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install -# Compile and install PostgreSQL and contrib/zenith +# Compile and install PostgreSQL and contrib/neon .PHONY: postgres postgres: postgres-configure \ postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers` +@echo "Compiling PostgreSQL" $(MAKE) -C tmp_install/build MAKELEVEL=0 install - +@echo "Compiling contrib/zenith" - $(MAKE) -C tmp_install/build/contrib/zenith install - +@echo "Compiling contrib/zenith_test_utils" - $(MAKE) -C tmp_install/build/contrib/zenith_test_utils install + +@echo "Compiling contrib/neon" + $(MAKE) -C tmp_install/build/contrib/neon install + +@echo "Compiling contrib/neon_test_utils" + $(MAKE) -C tmp_install/build/contrib/neon_test_utils install +@echo "Compiling pg_buffercache" $(MAKE) -C tmp_install/build/contrib/pg_buffercache install +@echo "Compiling pageinspect" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 5c951b7779..b97429c223 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -116,17 +116,17 @@ fn main() -> Result<()> { let pageserver_connstr = spec .cluster .settings - .find("zenith.page_server_connstring") + .find("neon.pageserver_connstring") .expect("pageserver connstr should be provided"); let tenant = spec .cluster .settings - .find("zenith.zenith_tenant") + .find("neon.tenantid") .expect("tenant id should be provided"); let timeline = spec .cluster .settings - .find("zenith.zenith_timeline") + .find("neon.timelineid") .expect("tenant id should be provided"); let compute_state = ComputeNode { diff --git a/compute_tools/tests/cluster_spec.json 
b/compute_tools/tests/cluster_spec.json index 4a1672919c..4821848678 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/compute_tools/tests/cluster_spec.json @@ -150,7 +150,7 @@ "vartype": "integer" }, { - "name": "zenith.zenith_tenant", + "name": "neon.tenantid", "value": "b0554b632bd4d547a63b86c3630317e8", "vartype": "string" }, @@ -160,13 +160,13 @@ "vartype": "integer" }, { - "name": "zenith.zenith_timeline", + "name": "neon.timelineid", "value": "2414a61ffc94e428f14b5758fe308e13", "vartype": "string" }, { "name": "shared_preload_libraries", - "value": "zenith", + "value": "neon", "vartype": "string" }, { @@ -175,7 +175,7 @@ "vartype": "string" }, { - "name": "zenith.page_server_connstring", + "name": "neon.pageserver_connstring", "value": "host=127.0.0.1 port=6400", "vartype": "string" } diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 33f903f0e1..a81c6512bc 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -28,7 +28,7 @@ mod pg_helpers_tests { assert_eq!( spec.cluster.settings.as_pg_settings(), - "fsync = off\nwal_level = replica\nhot_standby = on\nwal_acceptors = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nzenith.zenith_tenant = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nzenith.zenith_timeline = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'zenith'\nsynchronous_standby_names = 'walproposer'\nzenith.page_server_connstring = 'host=127.0.0.1 port=6400'" + "fsync = off\nwal_level = replica\nhot_standby = on\nwal_acceptors = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenantid = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timelineid = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" ); } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 045acd7519..3fefd32389 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -148,8 +148,8 @@ impl PostgresNode { // Read a few options from the config file let context = format!("in config file {}", cfg_path_str); let port: u16 = conf.parse_field("port", &context)?; - let timeline_id: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?; - let tenant_id: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?; + let timeline_id: ZTimelineId = conf.parse_field("neon.timelineid", &context)?; + let tenant_id: ZTenantId = conf.parse_field("neon.tenantid", &context)?; let uses_wal_proposer = conf.get("wal_acceptors").is_some(); // parse recovery_target_lsn, if any @@ -303,11 +303,11 @@ impl PostgresNode { // uses only needed variables namely host, port, user, password. 
format!("postgresql://no_user:{}@{}:{}", password, host, port) }; - conf.append("shared_preload_libraries", "zenith"); + conf.append("shared_preload_libraries", "neon"); conf.append_line(""); - conf.append("zenith.page_server_connstring", &pageserver_connstr); - conf.append("zenith.zenith_tenant", &self.tenant_id.to_string()); - conf.append("zenith.zenith_timeline", &self.timeline_id.to_string()); + conf.append("neon.pageserver_connstring", &pageserver_connstr); + conf.append("neon.tenantid", &self.tenant_id.to_string()); + conf.append("neon.timelineid", &self.timeline_id.to_string()); if let Some(lsn) = self.lsn { conf.append("recovery_target_lsn", &lsn.to_string()); } @@ -352,7 +352,7 @@ impl PostgresNode { // This isn't really a supported configuration, but can be useful for // testing. conf.append("synchronous_standby_names", "pageserver"); - conf.append("zenith.callmemaybe_connstring", &self.connstr()); + conf.append("neon.callmemaybe_connstring", &self.connstr()); } let mut file = File::create(self.pgdata().join("postgresql.conf"))?; diff --git a/docs/rfcs/cluster-size-limits.md b/docs/rfcs/cluster-size-limits.md index 4696f2c7f0..bd12fb6eee 100644 --- a/docs/rfcs/cluster-size-limits.md +++ b/docs/rfcs/cluster-size-limits.md @@ -22,8 +22,8 @@ so we don't want to give users access to the functionality that we don't think i * pageserver - calculate the size consumed by a timeline and add it to the feedback message. * safekeeper - pass feedback message from pageserver to compute. -* compute - receive feedback message, enforce size limit based on GUC `zenith.max_cluster_size`. -* console - set and update `zenith.max_cluster_size` setting +* compute - receive feedback message, enforce size limit based on GUC `neon.max_cluster_size`. +* console - set and update `neon.max_cluster_size` setting ## Proposed implementation @@ -49,7 +49,7 @@ This message is received by the safekeeper and propagated to compute node as a p Finally, when compute node receives the `current_timeline_size` from safekeeper (or from pageserver directly), it updates the global variable. -And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > zenith.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so. +And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > neon.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so. (see Postgres error codes [https://www.postgresql.org/docs/devel/errcodes-appendix.html](https://www.postgresql.org/docs/devel/errcodes-appendix.html)) TODO: @@ -75,5 +75,5 @@ We should warn users if the limit is soon to be reached. ### **Security implications** We treat compute as an untrusted component. That's why we try to isolate it with secure container runtime or a VM. -Malicious users may change the `zenith.max_cluster_size`, so we need an extra size limit check. +Malicious users may change the `neon.max_cluster_size`, so we need an extra size limit check. To cover this case, we also monitor the compute node size in the console. diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 5384d334df..05eaa96938 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -42,13 +42,13 @@ Integration tests, written in Python using the `pytest` framework. `/vendor/postgres`: -PostgreSQL source tree, with the modifications needed for Zenith. +PostgreSQL source tree, with the modifications needed for Neon. 
-`/vendor/postgres/contrib/zenith`: +`/vendor/postgres/contrib/neon`: PostgreSQL extension that implements storage manager API and network communications with remote page server. -`/vendor/postgres/contrib/zenith_test_utils`: +`/vendor/postgres/contrib/neon_test_utils`: PostgreSQL extension that contains functions needed for testing and debugging. diff --git a/libs/postgres_ffi/wal_generate/src/lib.rs b/libs/postgres_ffi/wal_generate/src/lib.rs index a5cd81d68a..3b19afb826 100644 --- a/libs/postgres_ffi/wal_generate/src/lib.rs +++ b/libs/postgres_ffi/wal_generate/src/lib.rs @@ -80,7 +80,7 @@ impl Conf { .arg(self.datadir.as_os_str()) .args(&["-c", "wal_keep_size=50MB"]) // Ensure old WAL is not removed .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output - .args(&["-c", "shared_preload_libraries=zenith"]) // can only be loaded at startup + .args(&["-c", "shared_preload_libraries=neon"]) // can only be loaded at startup // Disable background processes as much as possible .args(&["-c", "wal_writer_delay=10s"]) .args(&["-c", "autovacuum=off"]) @@ -178,7 +178,7 @@ fn generate_internal( client: &mut C, f: impl Fn(&mut C, PgLsn) -> Result>, ) -> Result { - client.execute("create extension if not exists zenith_test_utils", &[])?; + client.execute("create extension if not exists neon_test_utils", &[])?; let wal_segment_size = client.query_one( "select cast(setting as bigint) as setting, unit \ diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index edfd36f51a..d263bf0e9a 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -607,8 +607,8 @@ impl PostgresRedoProcess { .open(PathBuf::from(&datadir).join("postgresql.conf"))?; config.write_all(b"shared_buffers=128kB\n")?; config.write_all(b"fsync=off\n")?; - config.write_all(b"shared_preload_libraries=zenith\n")?; - config.write_all(b"zenith.wal_redo=on\n")?; + config.write_all(b"shared_preload_libraries=neon\n")?; + config.write_all(b"neon.wal_redo=on\n")?; } // Start postgres itself let mut child = Command::new(conf.pg_bin_dir().join("postgres")) diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index 5dbd6d2e26..e05a550fdf 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -30,7 +30,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() - branch0_cur.execute("SHOW zenith.zenith_timeline") + branch0_cur.execute("SHOW neon.timelineid") branch0_timeline = branch0_cur.fetchone()[0] log.info(f"b0 timeline {branch0_timeline}") @@ -55,7 +55,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): log.info("postgres is running on 'branch1' branch") branch1_cur = pg_branch1.connect().cursor() - branch1_cur.execute("SHOW zenith.zenith_timeline") + branch1_cur.execute("SHOW neon.timelineid") branch1_timeline = branch1_cur.fetchone()[0] log.info(f"b1 timeline {branch1_timeline}") @@ -79,7 +79,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() - branch2_cur.execute("SHOW zenith.zenith_timeline") + branch2_cur.execute("SHOW neon.timelineid") branch2_timeline = branch2_cur.fetchone()[0] log.info(f"b2 timeline {branch2_timeline}") diff --git a/test_runner/batch_others/test_backpressure.py 
b/test_runner/batch_others/test_backpressure.py index 81f45b749b..5debb2ee61 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/batch_others/test_backpressure.py @@ -26,7 +26,7 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv log.info("checks started") with pg_cur(pg) as cur: - cur.execute("CREATE EXTENSION zenith") # TODO move it to zenith_fixtures? + cur.execute("CREATE EXTENSION neon") # TODO move it to zenith_fixtures? cur.execute("select pg_size_bytes(current_setting('max_replication_write_lag'))") res = cur.fetchone() diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index fc84af5283..9bb04f574b 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -31,7 +31,7 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - main_cur.execute("SHOW zenith.zenith_timeline") + main_cur.execute("SHOW neon.timelineid") timeline = main_cur.fetchone()[0] # Create table, and insert the first 100 rows diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/batch_others/test_broken_timeline.py index f0aa44e0a4..45fe69748d 100644 --- a/test_runner/batch_others/test_broken_timeline.py +++ b/test_runner/batch_others/test_broken_timeline.py @@ -26,7 +26,7 @@ def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder): cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'") - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timelineid") timeline_id = cur.fetchone()[0] pg.stop() tenant_timelines.append((tenant_id, timeline_id, pg)) diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py index 1a49a4582e..2382cd93b3 100644 --- a/test_runner/batch_others/test_clog_truncate.py +++ b/test_runner/batch_others/test_clog_truncate.py @@ -29,7 +29,7 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv): log.info('postgres is running on test_clog_truncate branch') # Install extension containing function needed for test - pg.safe_psql('CREATE EXTENSION zenith_test_utils') + pg.safe_psql('CREATE EXTENSION neon_test_utils') # Consume many xids to advance clog with closing(pg.connect()) as conn: diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py index 519a6dda1c..6beee49d2f 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/batch_others/test_gc_aggressive.py @@ -62,7 +62,7 @@ def test_gc_aggressive(zenith_env_builder: ZenithEnvBuilder): conn = pg.connect() cur = conn.cursor() - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timelineid") timeline = cur.fetchone()[0] # Create table, and insert the first 100 rows diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py index cf7fe09b1e..1ec429ea34 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -26,7 +26,7 @@ def test_old_request_lsn(zenith_env_builder: ZenithEnvBuilder): cur = pg_conn.cursor() # Get the timeline ID of our branch. 
We need it for the 'do_gc' command - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timelineid") timeline = cur.fetchone()[0] psconn = env.pageserver.connect() diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/batch_others/test_pitr_gc.py index a5149f7ad9..6456acd214 100644 --- a/test_runner/batch_others/test_pitr_gc.py +++ b/test_runner/batch_others/test_pitr_gc.py @@ -25,7 +25,7 @@ def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder): main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - main_cur.execute("SHOW zenith.zenith_timeline") + main_cur.execute("SHOW neon.timelineid") timeline = main_cur.fetchone()[0] # Create table diff --git a/test_runner/batch_others/test_read_validation.py b/test_runner/batch_others/test_read_validation.py index ee41e6511c..9d2248ac89 100644 --- a/test_runner/batch_others/test_read_validation.py +++ b/test_runner/batch_others/test_read_validation.py @@ -8,7 +8,7 @@ from psycopg2.errors import IoError pytest_plugins = ("fixtures.zenith_fixtures") -extensions = ["pageinspect", "zenith_test_utils", "pg_buffercache"] +extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"] # diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 864cccf736..e5c94980f0 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -48,8 +48,8 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, client = env.pageserver.http_client() - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + tenant_id = pg.safe_psql("show neon.tenantid")[0][0] + timeline_id = pg.safe_psql("show neon.timelineid")[0][0] checkpoint_numbers = range(1, 3) diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 8ecc731ae9..6ad9c6305f 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -130,7 +130,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, with closing(tenant_pg.connect()) as conn: with conn.cursor() as cur: # save timeline for later gc call - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timelineid") timeline = UUID(cur.fetchone()[0]) log.info("timeline to relocate %s", timeline.hex) @@ -223,7 +223,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, tenant_pg_config_file_path = pathlib.Path(tenant_pg.config_file_path()) tenant_pg_config_file_path.open('a').write( - f"\nzenith.page_server_connstring = 'postgresql://no_user:@localhost:{new_pageserver_pg_port}'" + f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_pg_port}'" ) tenant_pg.start() diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/batch_others/test_tenants_with_remote_storage.py index c00f077fcd..8eb72437fd 100644 --- a/test_runner/batch_others/test_tenants_with_remote_storage.py +++ b/test_runner/batch_others/test_tenants_with_remote_storage.py @@ -21,8 +21,8 @@ async def tenant_workload(env: ZenithEnv, pg: Postgres): pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show zenith.zenith_tenant") - timeline_id = await pg_conn.fetchval("show zenith.zenith_timeline") + tenant_id = await pg_conn.fetchval("show neon.tenantid") + timeline_id = await pg_conn.fetchval("show 
neon.timelineid") await pg_conn.execute("CREATE TABLE t(key int primary key, value text)") for i in range(1, 100): @@ -82,9 +82,9 @@ def test_tenants_many(zenith_env_builder: ZenithEnvBuilder, storage_type: str): for tenant, pg in tenants_pgs: with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute("show zenith.zenith_tenant") + cur.execute("show neon.tenantid") tenant_id = cur.fetchone()[0] - cur.execute("show zenith.zenith_timeline") + cur.execute("show neon.timelineid") timeline_id = cur.fetchone()[0] cur.execute("SELECT pg_current_wal_flush_lsn()") current_lsn = lsn_from_hex(cur.fetchone()[0]) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 0b33b56df3..86f9ed247b 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -21,7 +21,7 @@ def test_timeline_size(zenith_simple_env: ZenithEnv): with closing(pgmain.connect()) as conn: with conn.cursor() as cur: - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timelineid") # Create table, and insert the first 100 rows cur.execute("CREATE TABLE foo (t text)") @@ -81,12 +81,12 @@ def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): pgmain = env.postgres.create_start( "test_timeline_size_quota", # Set small limit for the test - config_lines=['zenith.max_cluster_size=30MB']) + config_lines=['neon.max_cluster_size=30MB']) log.info("postgres is running on 'test_timeline_size_quota' branch") with closing(pgmain.connect()) as conn: with conn.cursor() as cur: - cur.execute("CREATE EXTENSION zenith") # TODO move it to zenith_fixtures? + cur.execute("CREATE EXTENSION neon") # TODO move it to zenith_fixtures? cur.execute("CREATE TABLE foo (t text)") diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/batch_others/test_vm_bits.py index 98854111f6..8a14959eff 100644 --- a/test_runner/batch_others/test_vm_bits.py +++ b/test_runner/batch_others/test_vm_bits.py @@ -17,7 +17,7 @@ def test_vm_bit_clear(zenith_simple_env: ZenithEnv): cur = pg_conn.cursor() # Install extension containing function needed for test - cur.execute('CREATE EXTENSION zenith_test_utils') + cur.execute('CREATE EXTENSION neon_test_utils') # Create a test table and freeze it to set the VM bit. 
cur.execute('CREATE TABLE vmtest_delete (id integer PRIMARY KEY)') diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 8837725b84..46fb6601b1 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -337,8 +337,8 @@ def test_broker(zenith_env_builder: ZenithEnvBuilder): pg.safe_psql("CREATE TABLE t(key int primary key, value text)") # learn zenith timeline from compute - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + tenant_id = pg.safe_psql("show neon.tenantid")[0][0] + timeline_id = pg.safe_psql("show neon.timelineid")[0][0] # wait until remote_consistent_lsn gets advanced on all safekeepers clients = [sk.http_client() for sk in env.safekeepers] @@ -384,8 +384,8 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): cur.execute('CREATE TABLE t(key int primary key, value text)') cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + tenant_id = pg.safe_psql("show neon.tenantid")[0][0] + timeline_id = pg.safe_psql("show neon.timelineid")[0][0] # force checkpoint to advance remote_consistent_lsn with closing(env.pageserver.connect()) as psconn: @@ -497,10 +497,10 @@ class ProposerPostgres(PgProtocol): with open(self.config_file_path(), "w") as f: cfg = [ "synchronous_standby_names = 'walproposer'\n", - "shared_preload_libraries = 'zenith'\n", - f"zenith.zenith_timeline = '{self.timeline_id.hex}'\n", - f"zenith.zenith_tenant = '{self.tenant_id.hex}'\n", - f"zenith.page_server_connstring = ''\n", + "shared_preload_libraries = 'neon'\n", + f"neon.timelineid = '{self.timeline_id.hex}'\n", + f"neon.tenantid = '{self.tenant_id.hex}'\n", + f"neon.pageserver_connstring = ''\n", f"wal_acceptors = '{safekeepers}'\n", f"listen_addresses = '{self.listen_addr}'\n", f"port = '{self.port}'\n", @@ -612,8 +612,8 @@ def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): wa_http_cli.check_status() # learn zenith timeline from compute - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + tenant_id = pg.safe_psql("show neon.tenantid")[0][0] + timeline_id = pg.safe_psql("show neon.timelineid")[0][0] # fetch something sensible from status tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) @@ -798,8 +798,8 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): pg.start() # learn zenith timeline from compute - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + tenant_id = pg.safe_psql("show neon.tenantid")[0][0] + timeline_id = pg.safe_psql("show neon.timelineid")[0][0] execute_payload(pg) show_statuses(env.safekeepers, tenant_id, timeline_id) diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index c484b6401c..1e7edcc8df 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -151,8 +151,8 @@ async def run_restarts_under_load(env: ZenithEnv, test_timeout_at = time.monotonic() + 5 * 60 pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show zenith.zenith_tenant") - timeline_id = await pg_conn.fetchval("show 
zenith.zenith_timeline") + tenant_id = await pg_conn.fetchval("show neon.tenantid") + timeline_id = await pg_conn.fetchval("show neon.timelineid") bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount) # create tables and initial balances diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index f4aceac5e8..eacc742880 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -19,7 +19,7 @@ def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, env.zenith_cli.create_branch("test_wal_restore") pg = env.postgres.create_start('test_wal_restore') pg.safe_psql("create table t as select generate_series(1,300000)") - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] + tenant_id = pg.safe_psql("show neon.tenantid")[0][0] env.zenith_cli.pageserver_stop() port = port_distributor.get_port() data_dir = os.path.join(test_output_dir, 'pgsql.restored') diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index d572901ed1..f5a97b5a84 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -66,7 +66,7 @@ class ZenithCompare(PgCompare): # We only use one branch and one timeline self.env.zenith_cli.create_branch(branch_name, 'empty') self._pg = self.env.postgres.create_start(branch_name) - self.timeline = self.pg.safe_psql("SHOW zenith.zenith_timeline")[0][0] + self.timeline = self.pg.safe_psql("SHOW neon.timelineid")[0][0] # Long-lived cursor, useful for flushing self.psconn = self.env.pageserver.connect() diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 336f1f1348..6d859b17d2 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -2039,7 +2039,7 @@ def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Pos # Get the timeline ID. 
We need it for the 'basebackup' command with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timelineid") timeline = cur.fetchone()[0] # stop postgres to ensure that files won't change diff --git a/vendor/postgres b/vendor/postgres index 038b2b98e5..165e61b5e0 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 038b2b98e5c3d6274cbd43e9b822cdd946cb8b91 +Subproject commit 165e61b5e0a7e003b28d8dca7a6825b3a03f065d From 751f1191b42a5c65b601bd5ab3e15f7301f8cf5f Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 19 Apr 2022 15:36:43 +0300 Subject: [PATCH 0370/1022] Rename 'wal_acceptors' GUC to 'safekeepers' --- compute_tools/tests/cluster_spec.json | 2 +- compute_tools/tests/pg_helpers_tests.rs | 2 +- control_plane/src/compute.rs | 4 ++-- test_runner/batch_others/test_wal_acceptor.py | 2 +- test_runner/fixtures/zenith_fixtures.py | 6 +++--- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/compute_tools/tests/cluster_spec.json b/compute_tools/tests/cluster_spec.json index 4821848678..5d8104ab4c 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/compute_tools/tests/cluster_spec.json @@ -85,7 +85,7 @@ "vartype": "bool" }, { - "name": "wal_acceptors", + "name": "safekeepers", "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501", "vartype": "string" }, diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index a81c6512bc..9e606ec7c2 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -28,7 +28,7 @@ mod pg_helpers_tests { assert_eq!( spec.cluster.settings.as_pg_settings(), - "fsync = off\nwal_level = replica\nhot_standby = on\nwal_acceptors = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenantid = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timelineid = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" + "fsync = off\nwal_level = replica\nhot_standby = on\nsafekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenantid = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timelineid = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" ); } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 3fefd32389..e81dddc287 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -150,7 +150,7 @@ impl PostgresNode { let port: u16 = conf.parse_field("port", &context)?; let timeline_id: ZTimelineId = conf.parse_field("neon.timelineid", &context)?; let tenant_id: ZTenantId = conf.parse_field("neon.tenantid", &context)?; - let uses_wal_proposer = 
conf.get("wal_acceptors").is_some(); + let uses_wal_proposer = conf.get("safekeepers").is_some(); // parse recovery_target_lsn, if any let recovery_target_lsn: Option = @@ -341,7 +341,7 @@ impl PostgresNode { .map(|sk| format!("localhost:{}", sk.pg_port)) .collect::>() .join(","); - conf.append("wal_acceptors", &safekeepers); + conf.append("safekeepers", &safekeepers); } else { // We only use setup without safekeepers for tests, // and don't care about data durability on pageserver, diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 46fb6601b1..b176faa46a 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -501,7 +501,7 @@ class ProposerPostgres(PgProtocol): f"neon.timelineid = '{self.timeline_id.hex}'\n", f"neon.tenantid = '{self.tenant_id.hex}'\n", f"neon.pageserver_connstring = ''\n", - f"wal_acceptors = '{safekeepers}'\n", + f"safekeepers = '{safekeepers}'\n", f"listen_addresses = '{self.listen_addr}'\n", f"port = '{self.port}'\n", ] diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 6d859b17d2..533a6cfa8c 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1590,12 +1590,12 @@ class Postgres(PgProtocol): if ("synchronous_standby_names" in cfg_line or # don't ask pageserver to fetch WAL from compute "callmemaybe_connstring" in cfg_line or - # don't repeat safekeepers/wal_acceptors multiple times - "wal_acceptors" in cfg_line): + # don't repeat safekeepers multiple times + "safekeepers" in cfg_line): continue f.write(cfg_line) f.write("synchronous_standby_names = 'walproposer'\n") - f.write("wal_acceptors = '{}'\n".format(safekeepers)) + f.write("safekeepers = '{}'\n".format(safekeepers)) return self def config(self, lines: List[str]) -> 'Postgres': From 6a867bce6db5c4a0b6bd0d56c6d6a6df92ef2279 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 26 May 2022 12:48:07 +0300 Subject: [PATCH 0371/1022] Rename 'zenith_admin' role to 'cloud_admin' --- README.md | 10 +++++----- compute_tools/README.md | 2 +- compute_tools/src/bin/compute_ctl.rs | 2 +- compute_tools/src/monitor.rs | 2 +- control_plane/src/compute.rs | 2 +- docs/settings.md | 6 +++--- libs/utils/scripts/restore_from_wal.sh | 2 +- libs/utils/scripts/restore_from_wal_archive.sh | 2 +- pageserver/src/config.rs | 4 ++-- test_runner/batch_others/test_wal_acceptor.py | 4 ++-- test_runner/batch_others/test_wal_restore.py | 2 +- test_runner/fixtures/zenith_fixtures.py | 4 ++-- vendor/postgres | 2 +- 13 files changed, 22 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 97927317d8..131d5da110 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,7 @@ Safekeeper started > ./target/debug/neon_local pg start main Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ... Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 -Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres' +Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres' # check list of running postgres instances > ./target/debug/neon_local pg list @@ -123,7 +123,7 @@ Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=po 2. 
Now it is possible to connect to postgres and run some queries: ```text -> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres +> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# CREATE TABLE t(key int primary key, value text); CREATE TABLE postgres=# insert into t values(1,1); @@ -150,7 +150,7 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: > ./target/debug/neon_local pg start migration_check --branch-name migration_check Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ... Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433 -Starting postgres node at 'host=127.0.0.1 port=55433 user=zenith_admin dbname=postgres' +Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres' # check the new list of running postgres instances > ./target/debug/neon_local pg list @@ -160,7 +160,7 @@ Starting postgres node at 'host=127.0.0.1 port=55433 user=zenith_admin dbname=po # this new postgres instance will have all the data from 'main' postgres, # but all modifications would not affect data in original postgres -> psql -p55433 -h 127.0.0.1 -U zenith_admin postgres +> psql -p55433 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- @@ -171,7 +171,7 @@ postgres=# insert into t values(2,2); INSERT 0 1 # check that the new change doesn't affect the 'main' postgres -> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres +> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- diff --git a/compute_tools/README.md b/compute_tools/README.md index 15876ed246..97a7513344 100644 --- a/compute_tools/README.md +++ b/compute_tools/README.md @@ -22,7 +22,7 @@ Also `compute_ctl` spawns two separate service threads: Usage example: ```sh compute_ctl -D /var/db/postgres/compute \ - -C 'postgresql://zenith_admin@localhost/postgres' \ + -C 'postgresql://cloud_admin@localhost/postgres' \ -S /var/db/postgres/specs/current.json \ -b /usr/local/bin/postgres ``` diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index b97429c223..2e8d864830 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -21,7 +21,7 @@ //! Usage example: //! ```sh //! compute_ctl -D /var/db/postgres/compute \ -//! -C 'postgresql://zenith_admin@localhost/postgres' \ +//! -C 'postgresql://cloud_admin@localhost/postgres' \ //! -S /var/db/postgres/specs/current.json \ //! -b /usr/local/bin/postgres //! ``` diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 496a5aae3b..041b4875bd 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -43,7 +43,7 @@ fn watch_compute_activity(compute: &Arc) { FROM pg_stat_activity WHERE backend_type = 'client backend' AND pid != pg_backend_pid() - AND usename != 'zenith_admin';", // XXX: find a better way to filter other monitors? + AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors? 
&[], ); let mut last_active = compute.state.read().unwrap().last_active; diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index e81dddc287..d2d1d840c9 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -499,7 +499,7 @@ impl PostgresNode { "host={} port={} user={} dbname={}", self.address.ip(), self.address.port(), - "zenith_admin", + "cloud_admin", "postgres" ) } diff --git a/docs/settings.md b/docs/settings.md index 7773dbf17f..98439a094c 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -23,7 +23,7 @@ gc_horizon = '67108864' max_file_descriptors = '100' # initial superuser role name to use when creating a new tenant -initial_superuser_name = 'zenith_admin' +initial_superuser_name = 'cloud_admin' broker_etcd_prefix = 'neon' broker_endpoints = ['some://etcd'] @@ -38,7 +38,7 @@ Yet, it validates the config values it can (e.g. postgres install dir) and error Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#table) in TOML specification and -- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'` +- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'cloud_admin'` - or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}` @@ -115,7 +115,7 @@ WAL retention duration for PITR branching. Default is 30 days. Name of the initial superuser role, passed to initdb when a new tenant is initialized. It doesn't affect anything after initialization. The -default is Note: The default is 'zenith_admin', and the console +default is Note: The default is 'cloud_admin', and the console depends on that, so if you change it, bad things will happen. 
#### page_cache_size diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index 4983449f24..9bd860affb 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -5,7 +5,7 @@ DATA_DIR=$3 PORT=$4 SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` rm -fr $DATA_DIR -env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U zenith_admin -D $DATA_DIR --sysid=$SYSID +env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID echo port=$PORT >> $DATA_DIR/postgresql.conf REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` declare -i WAL_SIZE=$REDO_POS+114 diff --git a/libs/utils/scripts/restore_from_wal_archive.sh b/libs/utils/scripts/restore_from_wal_archive.sh index 07f4fe1e4f..ce58b349fc 100755 --- a/libs/utils/scripts/restore_from_wal_archive.sh +++ b/libs/utils/scripts/restore_from_wal_archive.sh @@ -5,7 +5,7 @@ PORT=$4 SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` rm -fr $DATA_DIR /tmp/pg_wals mkdir /tmp/pg_wals -env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U zenith_admin -D $DATA_DIR --sysid=$SYSID +env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID echo port=$PORT >> $DATA_DIR/postgresql.conf REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` declare -i WAL_SIZE=$REDO_POS+114 diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8add7b8b8f..f44b0846a8 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -34,7 +34,7 @@ pub mod defaults { pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; - pub const DEFAULT_SUPERUSER: &str = "zenith_admin"; + pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; @@ -499,7 +499,7 @@ impl PageServerConf { max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - superuser: "zenith_admin".to_string(), + superuser: "cloud_admin".to_string(), workdir: repo_dir, pg_distrib_dir: PathBuf::new(), auth_type: AuthType::Trust, diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index b176faa46a..97bac5fed4 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -473,7 +473,7 @@ class ProposerPostgres(PgProtocol): tenant_id: uuid.UUID, listen_addr: str, port: int): - super().__init__(host=listen_addr, port=port, user='zenith_admin', dbname='postgres') + super().__init__(host=listen_addr, port=port, user='cloud_admin', dbname='postgres') self.pgdata_dir: str = pgdata_dir self.pg_bin: PgBin = pg_bin @@ -529,7 +529,7 @@ class ProposerPostgres(PgProtocol): def initdb(self): """ Run initdb """ - args = ["initdb", "-U", "zenith_admin", "-D", self.pg_data_dir_path()] + args = ["initdb", "-U", "cloud_admin", "-D", self.pg_data_dir_path()] self.pg_bin.run(args) def start(self): diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index eacc742880..69249e75ff 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -32,4 
+32,4 @@ def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, str(port) ]) restored.start() - assert restored.safe_psql('select count(*) from t', user='zenith_admin') == [(300000, )] + assert restored.safe_psql('select count(*) from t', user='cloud_admin') == [(300000, )] diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 533a6cfa8c..4459e0ac55 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1226,7 +1226,7 @@ class ZenithPageserver(PgProtocol): Initializes the repository via `zenith init`. """ def __init__(self, env: ZenithEnv, port: PageserverPort, config_override: Optional[str] = None): - super().__init__(host='localhost', port=port.pg, user='zenith_admin') + super().__init__(host='localhost', port=port.pg, user='cloud_admin') self.env = env self.running = False self.service_port = port @@ -1495,7 +1495,7 @@ def static_proxy(vanilla_pg) -> Iterator[ZenithProxy]: class Postgres(PgProtocol): """ An object representing a running postgres daemon. """ def __init__(self, env: ZenithEnv, tenant_id: uuid.UUID, port: int): - super().__init__(host='localhost', port=port, user='zenith_admin', dbname='postgres') + super().__init__(host='localhost', port=port, user='cloud_admin', dbname='postgres') self.env = env self.running = False self.node_name: Optional[str] = None # dubious, see asserts below diff --git a/vendor/postgres b/vendor/postgres index 165e61b5e0..7a2aa6035b 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 165e61b5e0a7e003b28d8dca7a6825b3a03f065d +Subproject commit 7a2aa6035bf0b4f676597b7b90de7fee20824fff From 67d6ff41009a38a8c96e7058737220518f2267c5 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 26 May 2022 21:18:52 +0300 Subject: [PATCH 0372/1022] Rename custom GUCs: - zenith.zenith_tenant -> neon.tenant_id - zenith.zenith_timeline -> neon.timeline_id --- compute_tools/src/bin/compute_ctl.rs | 4 ++-- compute_tools/tests/cluster_spec.json | 4 ++-- compute_tools/tests/pg_helpers_tests.rs | 2 +- control_plane/src/compute.rs | 8 ++++---- .../batch_others/test_ancestor_branch.py | 6 +++--- .../batch_others/test_branch_behind.py | 2 +- .../batch_others/test_broken_timeline.py | 2 +- .../batch_others/test_gc_aggressive.py | 2 +- .../batch_others/test_old_request_lsn.py | 2 +- test_runner/batch_others/test_pitr_gc.py | 2 +- .../batch_others/test_remote_storage.py | 4 ++-- .../batch_others/test_tenant_relocation.py | 2 +- .../test_tenants_with_remote_storage.py | 8 ++++---- .../batch_others/test_timeline_size.py | 2 +- test_runner/batch_others/test_wal_acceptor.py | 20 +++++++++---------- .../batch_others/test_wal_acceptor_async.py | 4 ++-- test_runner/batch_others/test_wal_restore.py | 2 +- test_runner/fixtures/compare_fixtures.py | 2 +- test_runner/fixtures/zenith_fixtures.py | 2 +- vendor/postgres | 2 +- 20 files changed, 41 insertions(+), 41 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 2e8d864830..ba116af11b 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -121,12 +121,12 @@ fn main() -> Result<()> { let tenant = spec .cluster .settings - .find("neon.tenantid") + .find("neon.tenant_id") .expect("tenant id should be provided"); let timeline = spec .cluster .settings - .find("neon.timelineid") + .find("neon.timeline_id") .expect("tenant id should be provided"); let compute_state = ComputeNode { diff --git 
a/compute_tools/tests/cluster_spec.json b/compute_tools/tests/cluster_spec.json index 5d8104ab4c..bdd6e60a69 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/compute_tools/tests/cluster_spec.json @@ -150,7 +150,7 @@ "vartype": "integer" }, { - "name": "neon.tenantid", + "name": "neon.tenant_id", "value": "b0554b632bd4d547a63b86c3630317e8", "vartype": "string" }, @@ -160,7 +160,7 @@ "vartype": "integer" }, { - "name": "neon.timelineid", + "name": "neon.timeline_id", "value": "2414a61ffc94e428f14b5758fe308e13", "vartype": "string" }, diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 9e606ec7c2..1f2e188398 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -28,7 +28,7 @@ mod pg_helpers_tests { assert_eq!( spec.cluster.settings.as_pg_settings(), - "fsync = off\nwal_level = replica\nhot_standby = on\nsafekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenantid = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timelineid = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" + "fsync = off\nwal_level = replica\nhot_standby = on\nsafekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" ); } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index d2d1d840c9..06a14d8a41 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -148,8 +148,8 @@ impl PostgresNode { // Read a few options from the config file let context = format!("in config file {}", cfg_path_str); let port: u16 = conf.parse_field("port", &context)?; - let timeline_id: ZTimelineId = conf.parse_field("neon.timelineid", &context)?; - let tenant_id: ZTenantId = conf.parse_field("neon.tenantid", &context)?; + let timeline_id: ZTimelineId = conf.parse_field("neon.timeline_id", &context)?; + let tenant_id: ZTenantId = conf.parse_field("neon.tenant_id", &context)?; let uses_wal_proposer = conf.get("safekeepers").is_some(); // parse recovery_target_lsn, if any @@ -306,8 +306,8 @@ impl PostgresNode { conf.append("shared_preload_libraries", "neon"); conf.append_line(""); conf.append("neon.pageserver_connstring", &pageserver_connstr); - conf.append("neon.tenantid", &self.tenant_id.to_string()); - conf.append("neon.timelineid", &self.timeline_id.to_string()); + conf.append("neon.tenant_id", &self.tenant_id.to_string()); + conf.append("neon.timeline_id", &self.timeline_id.to_string()); if let Some(lsn) = self.lsn { conf.append("recovery_target_lsn", &lsn.to_string()); } diff --git 
a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index e05a550fdf..d87bebcc11 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -30,7 +30,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() - branch0_cur.execute("SHOW neon.timelineid") + branch0_cur.execute("SHOW neon.timeline_id") branch0_timeline = branch0_cur.fetchone()[0] log.info(f"b0 timeline {branch0_timeline}") @@ -55,7 +55,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): log.info("postgres is running on 'branch1' branch") branch1_cur = pg_branch1.connect().cursor() - branch1_cur.execute("SHOW neon.timelineid") + branch1_cur.execute("SHOW neon.timeline_id") branch1_timeline = branch1_cur.fetchone()[0] log.info(f"b1 timeline {branch1_timeline}") @@ -79,7 +79,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() - branch2_cur.execute("SHOW neon.timelineid") + branch2_cur.execute("SHOW neon.timeline_id") branch2_timeline = branch2_cur.fetchone()[0] log.info(f"b2 timeline {branch2_timeline}") diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 9bb04f574b..7a00ecfca2 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -31,7 +31,7 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - main_cur.execute("SHOW neon.timelineid") + main_cur.execute("SHOW neon.timeline_id") timeline = main_cur.fetchone()[0] # Create table, and insert the first 100 rows diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/batch_others/test_broken_timeline.py index 45fe69748d..05391f7e4d 100644 --- a/test_runner/batch_others/test_broken_timeline.py +++ b/test_runner/batch_others/test_broken_timeline.py @@ -26,7 +26,7 @@ def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder): cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'") - cur.execute("SHOW neon.timelineid") + cur.execute("SHOW neon.timeline_id") timeline_id = cur.fetchone()[0] pg.stop() tenant_timelines.append((tenant_id, timeline_id, pg)) diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py index 6beee49d2f..79af54c1de 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/batch_others/test_gc_aggressive.py @@ -62,7 +62,7 @@ def test_gc_aggressive(zenith_env_builder: ZenithEnvBuilder): conn = pg.connect() cur = conn.cursor() - cur.execute("SHOW neon.timelineid") + cur.execute("SHOW neon.timeline_id") timeline = cur.fetchone()[0] # Create table, and insert the first 100 rows diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py index 1ec429ea34..fd0cbe26cc 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -26,7 +26,7 @@ def test_old_request_lsn(zenith_env_builder: ZenithEnvBuilder): cur = pg_conn.cursor() # Get the timeline ID of our branch. 
We need it for the 'do_gc' command - cur.execute("SHOW neon.timelineid") + cur.execute("SHOW neon.timeline_id") timeline = cur.fetchone()[0] psconn = env.pageserver.connect() diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/batch_others/test_pitr_gc.py index 6456acd214..1a1562ca5f 100644 --- a/test_runner/batch_others/test_pitr_gc.py +++ b/test_runner/batch_others/test_pitr_gc.py @@ -25,7 +25,7 @@ def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder): main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - main_cur.execute("SHOW neon.timelineid") + main_cur.execute("SHOW neon.timeline_id") timeline = main_cur.fetchone()[0] # Create table diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index e5c94980f0..e7097e2ef5 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -48,8 +48,8 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, client = env.pageserver.http_client() - tenant_id = pg.safe_psql("show neon.tenantid")[0][0] - timeline_id = pg.safe_psql("show neon.timelineid")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] checkpoint_numbers = range(1, 3) diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 6ad9c6305f..af96cc8524 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -130,7 +130,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, with closing(tenant_pg.connect()) as conn: with conn.cursor() as cur: # save timeline for later gc call - cur.execute("SHOW neon.timelineid") + cur.execute("SHOW neon.timeline_id") timeline = UUID(cur.fetchone()[0]) log.info("timeline to relocate %s", timeline.hex) diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/batch_others/test_tenants_with_remote_storage.py index 8eb72437fd..dbe07c4aba 100644 --- a/test_runner/batch_others/test_tenants_with_remote_storage.py +++ b/test_runner/batch_others/test_tenants_with_remote_storage.py @@ -21,8 +21,8 @@ async def tenant_workload(env: ZenithEnv, pg: Postgres): pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show neon.tenantid") - timeline_id = await pg_conn.fetchval("show neon.timelineid") + tenant_id = await pg_conn.fetchval("show neon.tenant_id") + timeline_id = await pg_conn.fetchval("show neon.timeline_id") await pg_conn.execute("CREATE TABLE t(key int primary key, value text)") for i in range(1, 100): @@ -82,9 +82,9 @@ def test_tenants_many(zenith_env_builder: ZenithEnvBuilder, storage_type: str): for tenant, pg in tenants_pgs: with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute("show neon.tenantid") + cur.execute("show neon.tenant_id") tenant_id = cur.fetchone()[0] - cur.execute("show neon.timelineid") + cur.execute("show neon.timeline_id") timeline_id = cur.fetchone()[0] cur.execute("SELECT pg_current_wal_flush_lsn()") current_lsn = lsn_from_hex(cur.fetchone()[0]) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 86f9ed247b..d43e793df8 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -21,7 +21,7 @@ def test_timeline_size(zenith_simple_env: ZenithEnv): with 
closing(pgmain.connect()) as conn: with conn.cursor() as cur: - cur.execute("SHOW neon.timelineid") + cur.execute("SHOW neon.timeline_id") # Create table, and insert the first 100 rows cur.execute("CREATE TABLE foo (t text)") diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 97bac5fed4..fd80313f94 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -337,8 +337,8 @@ def test_broker(zenith_env_builder: ZenithEnvBuilder): pg.safe_psql("CREATE TABLE t(key int primary key, value text)") # learn zenith timeline from compute - tenant_id = pg.safe_psql("show neon.tenantid")[0][0] - timeline_id = pg.safe_psql("show neon.timelineid")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] # wait until remote_consistent_lsn gets advanced on all safekeepers clients = [sk.http_client() for sk in env.safekeepers] @@ -384,8 +384,8 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): cur.execute('CREATE TABLE t(key int primary key, value text)') cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - tenant_id = pg.safe_psql("show neon.tenantid")[0][0] - timeline_id = pg.safe_psql("show neon.timelineid")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] # force checkpoint to advance remote_consistent_lsn with closing(env.pageserver.connect()) as psconn: @@ -498,8 +498,8 @@ class ProposerPostgres(PgProtocol): cfg = [ "synchronous_standby_names = 'walproposer'\n", "shared_preload_libraries = 'neon'\n", - f"neon.timelineid = '{self.timeline_id.hex}'\n", - f"neon.tenantid = '{self.tenant_id.hex}'\n", + f"neon.timeline_id = '{self.timeline_id.hex}'\n", + f"neon.tenant_id = '{self.tenant_id.hex}'\n", f"neon.pageserver_connstring = ''\n", f"safekeepers = '{safekeepers}'\n", f"listen_addresses = '{self.listen_addr}'\n", @@ -612,8 +612,8 @@ def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): wa_http_cli.check_status() # learn zenith timeline from compute - tenant_id = pg.safe_psql("show neon.tenantid")[0][0] - timeline_id = pg.safe_psql("show neon.timelineid")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] # fetch something sensible from status tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) @@ -798,8 +798,8 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): pg.start() # learn zenith timeline from compute - tenant_id = pg.safe_psql("show neon.tenantid")[0][0] - timeline_id = pg.safe_psql("show neon.timelineid")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] execute_payload(pg) show_statuses(env.safekeepers, tenant_id, timeline_id) diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index 1e7edcc8df..bd3b3027c5 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -151,8 +151,8 @@ async def run_restarts_under_load(env: ZenithEnv, test_timeout_at = time.monotonic() + 5 * 60 pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show neon.tenantid") - timeline_id = await pg_conn.fetchval("show neon.timelineid") + tenant_id = await pg_conn.fetchval("show neon.tenant_id") + 
timeline_id = await pg_conn.fetchval("show neon.timeline_id") bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount) # create tables and initial balances diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index 69249e75ff..85c6e776c5 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -19,7 +19,7 @@ def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, env.zenith_cli.create_branch("test_wal_restore") pg = env.postgres.create_start('test_wal_restore') pg.safe_psql("create table t as select generate_series(1,300000)") - tenant_id = pg.safe_psql("show neon.tenantid")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] env.zenith_cli.pageserver_stop() port = port_distributor.get_port() data_dir = os.path.join(test_output_dir, 'pgsql.restored') diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index f5a97b5a84..c61bc6d81f 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -66,7 +66,7 @@ class ZenithCompare(PgCompare): # We only use one branch and one timeline self.env.zenith_cli.create_branch(branch_name, 'empty') self._pg = self.env.postgres.create_start(branch_name) - self.timeline = self.pg.safe_psql("SHOW neon.timelineid")[0][0] + self.timeline = self.pg.safe_psql("SHOW neon.timeline_id")[0][0] # Long-lived cursor, useful for flushing self.psconn = self.env.pageserver.connect() diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 4459e0ac55..5f3c16c4e6 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -2039,7 +2039,7 @@ def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Pos # Get the timeline ID. We need it for the 'basebackup' command with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute("SHOW neon.timelineid") + cur.execute("SHOW neon.timeline_id") timeline = cur.fetchone()[0] # stop postgres to ensure that files won't change diff --git a/vendor/postgres b/vendor/postgres index 7a2aa6035b..a424e3ccff 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 7a2aa6035bf0b4f676597b7b90de7fee20824fff +Subproject commit a424e3ccff7d6af97d9ee5d4b727fb8324c78e11 From 915e5c911483ca10716615bf2e14574710e6844e Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 26 May 2022 19:18:32 +0300 Subject: [PATCH 0373/1022] Rename 'zenith_admin' to 'cloud_admin' on compute node start --- compute_tools/src/compute.rs | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index fd60b80305..a2e6874a28 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -262,7 +262,30 @@ impl ComputeNode { .unwrap_or_else(|| "5432".to_string()); wait_for_postgres(&mut pg, &port, pgdata_path)?; - let mut client = Client::connect(&self.connstr, NoTls)?; + // If connection fails, + // it may be the old node with `zenith_admin` superuser. + // + // In this case we need to connect with old `zenith_admin`name + // and create new user. We cannot simply rename connected user, + // but we can create a new one and grant it all privileges. 
+ let mut client = match Client::connect(&self.connstr, NoTls) { + Err(e) => { + info!( + "cannot connect to postgres: {}, retrying with `zenith_admin` username", + e + ); + let zenith_admin_connstr = self.connstr.replacen("cloud_admin", "zenith_admin", 1); + + let mut client = Client::connect(&zenith_admin_connstr, NoTls)?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + drop(client); + + // reconnect with connsting with expected name + Client::connect(&self.connstr, NoTls)? + } + Ok(client) => client, + }; handle_roles(&self.spec, &mut client)?; handle_databases(&self.spec, &mut client)?; From e014cb6026f5b5f0105a7db5f81ac16affd9a1a7 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 30 May 2022 12:03:04 +0300 Subject: [PATCH 0374/1022] rename zenith.zenith_tenant to neon.tenant_id in test --- test_runner/batch_others/test_wal_acceptor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index fd80313f94..35b7d9585a 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -431,8 +431,8 @@ def test_wal_backup(zenith_env_builder: ZenithEnvBuilder, storage_type: str): pg = env.postgres.create_start('test_safekeepers_wal_backup') # learn zenith timeline from compute - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] pg_conn = pg.connect() cur = pg_conn.cursor() From 36281e3b475ac46570dd4f89a61fc525ff3f0a1c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sat, 28 May 2022 07:13:15 +0400 Subject: [PATCH 0375/1022] Extend test_wal_backup with compute restart. 
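For context on the hard-coded '0/5000000' offload target used in the extended test below: assuming the default 16 MiB WAL segment size (the patch does not change it), that LSN is simply the five-segment boundary, so the check asserts that backup has caught up past roughly five segments of WAL. A quick sanity check of the arithmetic, as plain Python and not part of the patch itself:

```python
# LSN '0/5000000' corresponds to byte offset 0x5000000 in the WAL stream.
# With 16 MiB (0x1000000-byte) segments, that is exactly the 5-segment boundary.
WAL_SEGMENT_SIZE = 16 * 1024 * 1024  # PostgreSQL default segment size
assert 5 * WAL_SEGMENT_SIZE == 0x5000000
```
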
--- safekeeper/src/wal_backup.rs | 4 +- test_runner/batch_others/test_wal_acceptor.py | 40 +++++++++++++------ 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index a4b779649d..1723d03ee3 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -204,6 +204,7 @@ impl WalBackupTask { l.give_up().await; } + info!("acquiring leadership"); match broker::get_leader(&self.election).await { Ok(l) => { self.leader = Some(l); @@ -214,6 +215,7 @@ impl WalBackupTask { continue; } } + info!("acquired leadership"); // offload loop loop { @@ -268,7 +270,7 @@ impl WalBackupTask { { Ok(leader) => { if !leader { - info!("leader has changed"); + info!("lost leadership"); break; } } diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 35b7d9585a..40a9b48a18 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -414,6 +414,22 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): time.sleep(0.5) +def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end): + started_at = time.time() + http_cli = live_sk.http_client() + while True: + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"live sk status is {tli_status}") + + if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end): + break + elapsed = time.time() - started_at + if elapsed > 20: + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s for segment ending at {seg_end} get offloaded") + time.sleep(0.5) + + @pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs']) def test_wal_backup(zenith_env_builder: ZenithEnvBuilder, storage_type: str): zenith_env_builder.num_safekeepers = 3 @@ -446,23 +462,21 @@ def test_wal_backup(zenith_env_builder: ZenithEnvBuilder, storage_type: str): # roughly fills one segment cur.execute("insert into t select generate_series(1,250000), 'payload'") live_sk = [sk for sk in env.safekeepers if sk != victim][0] - http_cli = live_sk.http_client() - started_at = time.time() - while True: - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"live sk status is {tli_status}") - - if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end): - break - elapsed = time.time() - started_at - if elapsed > 20: - raise RuntimeError( - f"timed out waiting {elapsed:.0f}s segment ending at {seg_end} get offloaded") - time.sleep(0.5) + wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end) victim.start() + # put one of safekeepers down again + env.safekeepers[0].stop() + # restart postgres + pg.stop_and_destroy().create_start('test_safekeepers_wal_backup') + # and ensure offloading still works + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("insert into t select generate_series(1,250000), 'payload'") + wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], '0/5000000') + class ProposerPostgres(PgProtocol): """Object for running postgres without ZenithEnv""" From c3e0b6c839fa37bc9734a09dc8288d577557cb27 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 31 May 2022 11:10:50 +0300 Subject: [PATCH 0376/1022] Implement timeline-based metrics in safekeeper (#1823) Now there's timelines metrics collector, which goes through all timelines and reports metrics only for active ones --- libs/metrics/src/lib.rs | 1 + safekeeper/src/bin/safekeeper.rs | 6 + safekeeper/src/lib.rs | 1 + 
safekeeper/src/metrics.rs | 336 +++++++++++++++++++++++++++++++ safekeeper/src/safekeeper.rs | 37 +--- safekeeper/src/timeline.rs | 38 ++++ safekeeper/src/wal_storage.rs | 14 +- 7 files changed, 384 insertions(+), 49 deletions(-) create mode 100644 safekeeper/src/metrics.rs diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 9929fc6d45..3b5da9f7ff 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -3,6 +3,7 @@ //! Otherwise, we might not see all metrics registered via //! a default registry. use lazy_static::lazy_static; +pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_gauge, Gauge}; pub use prometheus::{register_gauge_vec, GaugeVec}; diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e792a854d5..9feb984c4f 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -264,6 +264,12 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo } } + // Register metrics collector for active timelines. It's important to do this + // after daemonizing, otherwise process collector will be upset. + let registry = metrics::default_registry(); + let timeline_collector = safekeeper::metrics::TimelineCollector::new(); + registry.register(Box::new(timeline_collector))?; + let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; let (callmemaybe_tx, callmemaybe_rx) = mpsc::unbounded_channel(); diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index c092f5185b..1fae9b00f8 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -14,6 +14,7 @@ pub mod control_file_upgrade; pub mod handler; pub mod http; pub mod json_ctrl; +pub mod metrics; pub mod receive_wal; pub mod remove_wal; pub mod safekeeper; diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs new file mode 100644 index 0000000000..5a2e5f125f --- /dev/null +++ b/safekeeper/src/metrics.rs @@ -0,0 +1,336 @@ +//! This module exports metrics for all active timelines. 
+ +use std::time::{Instant, SystemTime}; + +use metrics::{ + core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts}, + proto::MetricFamily, + Gauge, IntGaugeVec, +}; +use postgres_ffi::xlog_utils::XLogSegNo; +use utils::{lsn::Lsn, zid::ZTenantTimelineId}; + +use crate::{ + safekeeper::{SafeKeeperState, SafekeeperMemState}, + timeline::{GlobalTimelines, ReplicaState}, +}; + +pub struct FullTimelineInfo { + pub zttid: ZTenantTimelineId, + pub replicas: Vec, + pub wal_backup_active: bool, + pub timeline_is_active: bool, + pub num_computes: u32, + pub last_removed_segno: XLogSegNo, + + pub epoch_start_lsn: Lsn, + pub mem_state: SafekeeperMemState, + pub persisted_state: SafeKeeperState, + + pub flush_lsn: Lsn, +} + +pub struct TimelineCollector { + descs: Vec, + commit_lsn: GenericGaugeVec, + backup_lsn: GenericGaugeVec, + flush_lsn: GenericGaugeVec, + epoch_start_lsn: GenericGaugeVec, + peer_horizon_lsn: GenericGaugeVec, + remote_consistent_lsn: GenericGaugeVec, + feedback_ps_write_lsn: GenericGaugeVec, + feedback_last_time_seconds: GenericGaugeVec, + timeline_active: GenericGaugeVec, + wal_backup_active: GenericGaugeVec, + connected_computes: IntGaugeVec, + disk_usage: GenericGaugeVec, + acceptor_term: GenericGaugeVec, + collect_timeline_metrics: Gauge, +} + +impl Default for TimelineCollector { + fn default() -> Self { + Self::new() + } +} + +impl TimelineCollector { + pub fn new() -> TimelineCollector { + let mut descs = Vec::new(); + + let commit_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_commit_lsn", + "Current commit_lsn (not necessarily persisted to disk), grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(commit_lsn.desc().into_iter().cloned()); + + let backup_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_backup_lsn", + "Current backup_lsn, up to which WAL is backed up, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(backup_lsn.desc().into_iter().cloned()); + + let flush_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_flush_lsn", + "Current flush_lsn, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(flush_lsn.desc().into_iter().cloned()); + + let epoch_start_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_epoch_start_lsn", + "Point since which compute generates new WAL in the current consensus term", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(epoch_start_lsn.desc().into_iter().cloned()); + + let peer_horizon_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_peer_horizon_lsn", + "LSN of the most lagging safekeeper", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(peer_horizon_lsn.desc().into_iter().cloned()); + + let remote_consistent_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_remote_consistent_lsn", + "LSN which is persisted to the remote storage in pageserver", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(remote_consistent_lsn.desc().into_iter().cloned()); + + let feedback_ps_write_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_feedback_ps_write_lsn", + "Last LSN received by the pageserver, acknowledged in the feedback", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(feedback_ps_write_lsn.desc().into_iter().cloned()); + + let feedback_last_time_seconds = GenericGaugeVec::new( + Opts::new( + "safekeeper_feedback_last_time_seconds", + "Timestamp of the last feedback from the pageserver", + ), + 
&["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(feedback_last_time_seconds.desc().into_iter().cloned()); + + let timeline_active = GenericGaugeVec::new( + Opts::new( + "safekeeper_timeline_active", + "Reports 1 for active timelines, 0 for inactive", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(timeline_active.desc().into_iter().cloned()); + + let wal_backup_active = GenericGaugeVec::new( + Opts::new( + "safekeeper_wal_backup_active", + "Reports 1 for timelines with active WAL backup, 0 otherwise", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(wal_backup_active.desc().into_iter().cloned()); + + let connected_computes = IntGaugeVec::new( + Opts::new( + "safekeeper_connected_computes", + "Number of active compute connections", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(connected_computes.desc().into_iter().cloned()); + + let disk_usage = GenericGaugeVec::new( + Opts::new( + "safekeeper_disk_usage_bytes", + "Estimated disk space used to store WAL segments", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(disk_usage.desc().into_iter().cloned()); + + let acceptor_term = GenericGaugeVec::new( + Opts::new("safekeeper_acceptor_term", "Current consensus term"), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(acceptor_term.desc().into_iter().cloned()); + + let collect_timeline_metrics = Gauge::new( + "safekeeper_collect_timeline_metrics_seconds", + "Time spent collecting timeline metrics, including obtaining mutex lock for all timelines", + ) + .unwrap(); + descs.extend(collect_timeline_metrics.desc().into_iter().cloned()); + + TimelineCollector { + descs, + commit_lsn, + backup_lsn, + flush_lsn, + epoch_start_lsn, + peer_horizon_lsn, + remote_consistent_lsn, + feedback_ps_write_lsn, + feedback_last_time_seconds, + timeline_active, + wal_backup_active, + connected_computes, + disk_usage, + acceptor_term, + collect_timeline_metrics, + } + } +} + +impl Collector for TimelineCollector { + fn desc(&self) -> Vec<&Desc> { + self.descs.iter().collect() + } + + fn collect(&self) -> Vec { + let start_collecting = Instant::now(); + + // reset all metrics to clean up inactive timelines + self.commit_lsn.reset(); + self.backup_lsn.reset(); + self.flush_lsn.reset(); + self.epoch_start_lsn.reset(); + self.peer_horizon_lsn.reset(); + self.remote_consistent_lsn.reset(); + self.feedback_ps_write_lsn.reset(); + self.feedback_last_time_seconds.reset(); + self.timeline_active.reset(); + self.wal_backup_active.reset(); + self.connected_computes.reset(); + self.disk_usage.reset(); + self.acceptor_term.reset(); + + let timelines = GlobalTimelines::active_timelines_metrics(); + + for tli in timelines { + let tenant_id = tli.zttid.tenant_id.to_string(); + let timeline_id = tli.zttid.timeline_id.to_string(); + let labels = &[tenant_id.as_str(), timeline_id.as_str()]; + + let mut most_advanced: Option = None; + for replica in tli.replicas.iter() { + if let Some(replica_feedback) = replica.zenith_feedback { + if let Some(current) = most_advanced { + if current.ps_writelsn < replica_feedback.ps_writelsn { + most_advanced = Some(replica_feedback); + } + } else { + most_advanced = Some(replica_feedback); + } + } + } + + self.commit_lsn + .with_label_values(labels) + .set(tli.mem_state.commit_lsn.into()); + self.backup_lsn + .with_label_values(labels) + .set(tli.mem_state.backup_lsn.into()); + self.flush_lsn + .with_label_values(labels) + .set(tli.flush_lsn.into()); + self.epoch_start_lsn + 
.with_label_values(labels) + .set(tli.epoch_start_lsn.into()); + self.peer_horizon_lsn + .with_label_values(labels) + .set(tli.mem_state.peer_horizon_lsn.into()); + self.remote_consistent_lsn + .with_label_values(labels) + .set(tli.mem_state.remote_consistent_lsn.into()); + self.timeline_active + .with_label_values(labels) + .set(tli.timeline_is_active as u64); + self.wal_backup_active + .with_label_values(labels) + .set(tli.wal_backup_active as u64); + self.connected_computes + .with_label_values(labels) + .set(tli.num_computes as i64); + self.acceptor_term + .with_label_values(labels) + .set(tli.persisted_state.acceptor_state.term as u64); + + if let Some(feedback) = most_advanced { + self.feedback_ps_write_lsn + .with_label_values(labels) + .set(feedback.ps_writelsn); + if let Ok(unix_time) = feedback.ps_replytime.duration_since(SystemTime::UNIX_EPOCH) + { + self.feedback_last_time_seconds + .with_label_values(labels) + .set(unix_time.as_secs()); + } + } + + if tli.last_removed_segno != 0 { + let segno_count = tli + .flush_lsn + .segment_number(tli.persisted_state.server.wal_seg_size as usize) + - tli.last_removed_segno; + let disk_usage_bytes = segno_count * tli.persisted_state.server.wal_seg_size as u64; + self.disk_usage + .with_label_values(labels) + .set(disk_usage_bytes); + } + } + + // collect MetricFamilys. + let mut mfs = Vec::new(); + mfs.extend(self.commit_lsn.collect()); + mfs.extend(self.backup_lsn.collect()); + mfs.extend(self.flush_lsn.collect()); + mfs.extend(self.epoch_start_lsn.collect()); + mfs.extend(self.peer_horizon_lsn.collect()); + mfs.extend(self.remote_consistent_lsn.collect()); + mfs.extend(self.feedback_ps_write_lsn.collect()); + mfs.extend(self.feedback_last_time_seconds.collect()); + mfs.extend(self.timeline_active.collect()); + mfs.extend(self.wal_backup_active.collect()); + mfs.extend(self.connected_computes.collect()); + mfs.extend(self.disk_usage.collect()); + mfs.extend(self.acceptor_term.collect()); + + // report time it took to collect all info + let elapsed = start_collecting.elapsed().as_secs_f64(); + self.collect_timeline_metrics.set(elapsed); + mfs.extend(self.collect_timeline_metrics.collect()); + + mfs + } +} diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index df4b202063..1c00af7043 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -15,13 +15,10 @@ use std::fmt; use std::io::Read; use tracing::*; -use lazy_static::lazy_static; - use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; -use metrics::{register_gauge_vec, Gauge, GaugeVec}; use postgres_ffi::xlog_utils::MAX_SEND_SIZE; use utils::{ bin_ser::LeSer, @@ -487,45 +484,16 @@ impl AcceptorProposerMessage { } } -lazy_static! { - // The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). - // i64 is faster than f64, so update to u64 when available. 
- static ref COMMIT_LSN_GAUGE: GaugeVec = register_gauge_vec!( - "safekeeper_commit_lsn", - "Current commit_lsn (not necessarily persisted to disk), grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("Failed to register safekeeper_commit_lsn gauge vec"); -} - -struct SafeKeeperMetrics { - commit_lsn: Gauge, - // WAL-related metrics are in WalStorageMetrics -} - -impl SafeKeeperMetrics { - fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Self { - let tenant_id = tenant_id.to_string(); - let timeline_id = timeline_id.to_string(); - Self { - commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&[&tenant_id, &timeline_id]), - } - } -} - /// SafeKeeper which consumes events (messages from compute) and provides /// replies. pub struct SafeKeeper { - // Cached metrics so we don't have to recompute labels on each update. - metrics: SafeKeeperMetrics, - /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn. /// Note: be careful to set only if we are sure our WAL (term history) matches /// committed one. pub global_commit_lsn: Lsn, /// LSN since the proposer safekeeper currently talking to appends WAL; /// determines epoch switch point. - epoch_start_lsn: Lsn, + pub epoch_start_lsn: Lsn, pub inmem: SafekeeperMemState, // in memory part pub state: CTRL, // persistent state storage @@ -555,7 +523,6 @@ where wal_store.init_storage(&state)?; Ok(SafeKeeper { - metrics: SafeKeeperMetrics::new(state.tenant_id, ztli), global_commit_lsn: state.commit_lsn, epoch_start_lsn: Lsn(0), inmem: SafekeeperMemState { @@ -757,7 +724,6 @@ where // upgrade. self.global_commit_lsn = max(self.global_commit_lsn, state.timeline_start_lsn); self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn); - self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); @@ -777,7 +743,6 @@ where assert!(commit_lsn >= self.inmem.commit_lsn); self.inmem.commit_lsn = commit_lsn; - self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); // If new commit_lsn reached epoch switch, force sync of control // file: walproposer in sync mode is very interested when this diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 74a61410fd..2fc5bcc1f6 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -33,6 +33,7 @@ use crate::safekeeper::{ }; use crate::send_wal::HotStandbyFeedback; +use crate::metrics::FullTimelineInfo; use crate::wal_storage; use crate::wal_storage::Storage as wal_storage_iface; use crate::SafeKeeperConf; @@ -450,6 +451,33 @@ impl Timeline { shared_state.active } + /// Returns full timeline info, required for the metrics. + /// If the timeline is not active, returns None instead. 
+ pub fn info_for_metrics(&self) -> Option { + let shared_state = self.mutex.lock().unwrap(); + if !shared_state.active { + return None; + } + + Some(FullTimelineInfo { + zttid: self.zttid, + replicas: shared_state + .replicas + .iter() + .filter_map(|r| r.as_ref()) + .copied() + .collect(), + wal_backup_active: shared_state.wal_backup_active, + timeline_is_active: shared_state.active, + num_computes: shared_state.num_computes, + last_removed_segno: shared_state.last_removed_segno, + epoch_start_lsn: shared_state.sk.epoch_start_lsn, + mem_state: shared_state.sk.inmem.clone(), + persisted_state: shared_state.sk.state.clone(), + flush_lsn: shared_state.sk.wal_store.flush_lsn(), + }) + } + /// Timed wait for an LSN to be committed. /// /// Returns the last committed LSN, which will be at least @@ -777,6 +805,16 @@ impl GlobalTimelines { .collect() } + /// Return FullTimelineInfo for all active timelines. + pub fn active_timelines_metrics() -> Vec { + let state = TIMELINES_STATE.lock().unwrap(); + state + .timelines + .iter() + .filter_map(|(_, tli)| tli.info_for_metrics()) + .collect() + } + fn delete_force_internal( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 7285cedc03..e3f1ce7333 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -31,20 +31,11 @@ use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ}; use postgres_ffi::waldecoder::WalStreamDecoder; -use metrics::{ - register_gauge_vec, register_histogram_vec, Gauge, GaugeVec, Histogram, HistogramVec, - DISK_WRITE_SECONDS_BUCKETS, -}; +use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; lazy_static! { // The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). // i64 is faster than f64, so update to u64 when available. - static ref FLUSH_LSN_GAUGE: GaugeVec = register_gauge_vec!( - "safekeeper_flush_lsn", - "Current flush_lsn, grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("Failed to register safekeeper_flush_lsn gauge vec"); static ref WRITE_WAL_BYTES: HistogramVec = register_histogram_vec!( "safekeeper_write_wal_bytes", "Bytes written to WAL in a single request, grouped by timeline", @@ -69,7 +60,6 @@ lazy_static! { } struct WalStorageMetrics { - flush_lsn: Gauge, write_wal_bytes: Histogram, write_wal_seconds: Histogram, flush_wal_seconds: Histogram, @@ -80,7 +70,6 @@ impl WalStorageMetrics { let tenant_id = zttid.tenant_id.to_string(); let timeline_id = zttid.timeline_id.to_string(); Self { - flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&[&tenant_id, &timeline_id]), write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&tenant_id, &timeline_id]), write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), flush_wal_seconds: FLUSH_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), @@ -171,7 +160,6 @@ impl PhysicalStorage { /// Wrapper for flush_lsn updates that also updates metrics. fn update_flush_lsn(&mut self) { self.flush_record_lsn = self.write_record_lsn; - self.metrics.flush_lsn.set(self.flush_record_lsn.0 as f64); } /// Call fdatasync if config requires so. From 595a6bc1e15390782b38d2cdf48a5bb24b7a061b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 31 May 2022 14:47:06 +0300 Subject: [PATCH 0377/1022] Bump vendor/postgres to fix basebackup LSN comparison. 
(#1835) Co-authored-by: Arseny Sher --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index a424e3ccff..8a6cc09624 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit a424e3ccff7d6af97d9ee5d4b727fb8324c78e11 +Subproject commit 8a6cc09624fe921b6191f1f524a8051dc476404e From 54e163ac03c9ba556d8055cc81d3a70825bf5aaa Mon Sep 17 00:00:00 2001 From: Ryan Russell Date: Mon, 30 May 2022 07:00:23 -0500 Subject: [PATCH 0378/1022] Improve Readability in Docs Signed-off-by: Ryan Russell --- docs/README.md | 2 +- docs/glossary.md | 2 +- docs/rfcs/003-laptop-cli.md | 4 ++-- docs/rfcs/006-laptop-cli-v2-repository-structure.md | 2 +- docs/rfcs/009-snapshot-first-storage-cli.md | 2 +- docs/rfcs/009-snapshot-first-storage-pitr.md | 2 +- docs/rfcs/009-snapshot-first-storage.md | 2 +- docs/rfcs/010-storage_details.md | 4 ++-- docs/rfcs/013-term-history.md | 2 +- docs/settings.md | 2 +- pageserver/src/layered_repository/README.md | 2 +- safekeeper/README_PROTO.md | 2 +- 12 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/README.md b/docs/README.md index 886363dccc..60114c5fd5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -6,7 +6,7 @@ - [docker.md](docker.md) — Docker images and building pipeline. - [glossary.md](glossary.md) — Glossary of all the terms used in codebase. - [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI. -- [sourcetree.md](sourcetree.md) — Overview of the source tree layeout. +- [sourcetree.md](sourcetree.md) — Overview of the source tree layout. - [pageserver/README.md](/pageserver/README.md) — pageserver overview. - [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview. - [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview. diff --git a/docs/glossary.md b/docs/glossary.md index a5bb154793..7aeae27a39 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -2,7 +2,7 @@ ### Authentication -### Backpresssure +### Backpressure Backpressure is used to limit the lag between pageserver and compute node or WAL service. diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md index 8520249bf1..1a549c2df5 100644 --- a/docs/rfcs/003-laptop-cli.md +++ b/docs/rfcs/003-laptop-cli.md @@ -136,9 +136,9 @@ s3tank 80G S3 ## pg -Manages postgres data directories and can start postgreses with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themself. +Manages postgres data directories and can start postgres instances with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themselves. -Pg is a term for a single postgres running on some data. I'm trying to avoid here separation of datadir management and postgres instance management -- both that concepts bundled here together. +Pg is a term for a single postgres running on some data. I'm trying to avoid separation of datadir management and postgres instance management -- both that concepts bundled here together. 
**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md b/docs/rfcs/006-laptop-cli-v2-repository-structure.md index ee4e432182..e6e6e172ad 100644 --- a/docs/rfcs/006-laptop-cli-v2-repository-structure.md +++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md @@ -121,7 +121,7 @@ repository, launch an instance on the same branch in both clones, and later try to push/pull between them? Perhaps create a new timeline every time you start up an instance? Then you would detect that the timelines have diverged. That would match with the "epoch" concept -that we have in the WAL safekeepr +that we have in the WAL safekeeper ### zenith checkout/commit diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md index 0139569721..0acbd68f86 100644 --- a/docs/rfcs/009-snapshot-first-storage-cli.md +++ b/docs/rfcs/009-snapshot-first-storage-cli.md @@ -2,7 +2,7 @@ While working on export/import commands, I understood that they fit really well We may think about backups as snapshots in a different format (i.e plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files. -Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postges to zenith. +Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to zenith. So here is an attempt to design consistent CLI for different usage scenarios: diff --git a/docs/rfcs/009-snapshot-first-storage-pitr.md b/docs/rfcs/009-snapshot-first-storage-pitr.md index a4d978324b..29d3614d34 100644 --- a/docs/rfcs/009-snapshot-first-storage-pitr.md +++ b/docs/rfcs/009-snapshot-first-storage-pitr.md @@ -192,7 +192,7 @@ for a particular relation readily available alongside the snapshot files, and you don't need to track what snapshot LSNs exist separately. -(If we wanted to minize the number of files, you could include the +(If we wanted to minimize the number of files, you could include the snapshot @300 and the WAL between 200 and 300 in the same file, but I feel it's probably better to keep them separate) diff --git a/docs/rfcs/009-snapshot-first-storage.md b/docs/rfcs/009-snapshot-first-storage.md index aeef54898a..75ed490f21 100644 --- a/docs/rfcs/009-snapshot-first-storage.md +++ b/docs/rfcs/009-snapshot-first-storage.md @@ -121,7 +121,7 @@ The properties of s3 that we depend on are: list objects streaming read of entire object read byte range from object -streaming write new object (may use multipart upload for better relialibity) +streaming write new object (may use multipart upload for better reliability) delete object (that should not disrupt an already-started read). Uploaded files, restored backups, or s3 buckets controlled by users could contain malicious content. We should always validate that objects contain the content they’re supposed to. Incorrect, Corrupt or malicious-looking contents should cause software (cloud tools, pageserver) to fail gracefully. 
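Read as an interface, the dependency list above amounts to roughly the following sketch (a hypothetical illustration; the trait name and signatures are not the pageserver's actual remote-storage API):

```rust
use std::ops::Range;

use anyhow::Result;
use bytes::Bytes;

/// Hypothetical sketch of the S3-like operations the RFC text above relies on.
trait SnapshotStore {
    /// List object keys under a prefix.
    fn list_objects(&self, prefix: &str) -> Result<Vec<String>>;
    /// Streaming read of an entire object (collected into memory here for brevity).
    fn read_object(&self, key: &str) -> Result<Bytes>;
    /// Read a byte range from an object.
    fn read_byte_range(&self, key: &str, range: Range<u64>) -> Result<Bytes>;
    /// Streaming write of a new object; may use multipart upload for better reliability.
    fn write_object(&self, key: &str, data: Bytes) -> Result<()>;
    /// Delete an object; must not disrupt an already-started read.
    fn delete_object(&self, key: &str) -> Result<()>;
}
```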
diff --git a/docs/rfcs/010-storage_details.md b/docs/rfcs/010-storage_details.md index 5c279b7dc8..bc79924e7b 100644 --- a/docs/rfcs/010-storage_details.md +++ b/docs/rfcs/010-storage_details.md @@ -40,7 +40,7 @@ b) overwrite older pages with the newer pages -- if there is no replica we proba I imagine that newly created pages would just be added to the back of PageStore (again in queue-like fashion) and this way there wouldn't be any meaningful ordering inside of that queue. When we are forming a new incremental snapshot we may prohibit any updates to the current set of pages in PageStore (giving up on single page version rule) and cut off that whole set when snapshot creation is complete. -With option b) we can also treat PageStor as an uncompleted increamental snapshot. +With option b) we can also treat PageStor as an uncompleted incremental snapshot. ### LocalStore @@ -131,7 +131,7 @@ As for exact data that should go to snapshots I think it is the following for ea It is also important to be able to load metadata quickly since it would be one of the main factors impacting the time of page server start. E.g. if would store/cache about 10TB of data per page server, the size of uncompressed page references would be about 30GB (10TB / ( 8192 bytes page size / ( ~18 bytes per ObjectTag + 8 bytes offset in the file))). -1) Since our ToC/array of entries can be sorted by ObjectTag we can store the whole BufferTag only when realtion_id is changed and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset delatas would be small). +1) Since our ToC/array of entries can be sorted by ObjectTag we can store the whole BufferTag only when relation_id is changed and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset deltas would be small). 2) It makes sense to keep ToC at the beginning of the file to avoid extra seeks to locate it. Doesn't matter too much with the local files but matters on S3 -- if we are accessing a lot of ~1Gb files with the size of metadata ~ 1Mb then the time to transfer this metadata would be comparable with access latency itself (which is about a half of a second). So by slurping metadata with one read of file header instead of N reads we can improve the speed of page server start by this N factor. I think both of that optimizations can be done later, but that is something to keep in mind when we are designing our storage serialization routines. diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md index 7fe505456d..59833526c5 100644 --- a/docs/rfcs/013-term-history.md +++ b/docs/rfcs/013-term-history.md @@ -7,7 +7,7 @@ and e.g. prevents electing two proposers with the same term -- it is actually called `term` in the code. The second, called `epoch`, reflects progress of log receival and this might lag behind `term`; safekeeper switches to epoch `n` when it has received all committed log records from all `< n` terms. 
This roughly -correspones to proposed in +corresponds to proposed in https://github.com/zenithdb/rfcs/pull/3/files diff --git a/docs/settings.md b/docs/settings.md index 98439a094c..0ca7223faa 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -185,7 +185,7 @@ If no IAM bucket access is used during the remote storage usage, use the `AWS_AC ###### General remote storage configuration -Pagesever allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used. +Pageserver allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used. No default values are used for the remote storage configuration parameters. Besides, there are parameters common for all types of remote storage that can be configured, those have defaults: diff --git a/pageserver/src/layered_repository/README.md b/pageserver/src/layered_repository/README.md index 70c571a507..81f585d2e2 100644 --- a/pageserver/src/layered_repository/README.md +++ b/pageserver/src/layered_repository/README.md @@ -260,7 +260,7 @@ Whenever a GetPage@LSN request comes in from the compute node, the page server needs to reconstruct the requested page, as it was at the requested LSN. To do that, the page server first checks the recent in-memory layer; if the requested page version is found there, it can -be returned immediatedly without looking at the files on +be returned immediately without looking at the files on disk. Otherwise the page server needs to locate the layer file that contains the requested page version. diff --git a/safekeeper/README_PROTO.md b/safekeeper/README_PROTO.md index 0cd1f510e6..a2d4fa455d 100644 --- a/safekeeper/README_PROTO.md +++ b/safekeeper/README_PROTO.md @@ -152,7 +152,7 @@ It is assumed that in case of losing local data by some safekeepers, it should b * `FlushLSN`: part of WAL persisted to the disk by safekeeper. * `NodeID`: pair (term,UUID) * `Pager`: Neon component restoring pages from WAL stream -* `Replica`: read-only computatio node +* `Replica`: read-only computation node * `VCL`: the largest LSN for which we can guarantee availability of all prior records. 
## Algorithm From c97cd684e0d925cc21d9e484c6d65ba69629b458 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Tue, 31 May 2022 11:20:51 -0400 Subject: [PATCH 0379/1022] Use `HOMEBREW_PREFIX` instead of hard-coded path (#1833) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e3d183eaee..50e2c8ab7f 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ endif # macOS with brew-installed openssl requires explicit paths UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - PG_CONFIGURE_OPTS += --with-includes=/usr/local/opt/openssl/include --with-libraries=/usr/local/opt/openssl/lib + PG_CONFIGURE_OPTS += --with-includes=$(HOMEBREW_PREFIX)/opt/openssl/include --with-libraries=$(HOMEBREW_PREFIX)/opt/openssl/lib endif # Choose whether we should be silent or verbose From ca10cc12c1fe40c3ca5c020a219d96aa1f06de92 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 31 May 2022 14:14:09 -0400 Subject: [PATCH 0380/1022] Close file descriptors for redo process (#1834) --- Cargo.lock | 11 +++++++++ pageserver/Cargo.toml | 1 + pageserver/src/walredo.rs | 49 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index e39375c221..6f8382de27 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -363,6 +363,16 @@ dependencies = [ "textwrap 0.14.2", ] +[[package]] +name = "close_fds" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed" +dependencies = [ + "cfg-if", + "libc", +] + [[package]] name = "cmake" version = "0.1.48" @@ -1789,6 +1799,7 @@ dependencies = [ "bytes", "chrono", "clap 3.0.14", + "close_fds", "const_format", "crc32c", "crossbeam-utils", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 290f52e0b2..d78d3622c4 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -60,6 +60,7 @@ metrics = { path = "../libs/metrics" } utils = { path = "../libs/utils" } remote_storage = { path = "../libs/remote_storage" } workspace_hack = { version = "0.1", path = "../workspace_hack" } +close_fds = "0.3.2" [dev-dependencies] hex-literal = "0.3" diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index d263bf0e9a..cad211b1bd 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -28,6 +28,7 @@ use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; use std::os::unix::io::AsRawFd; +use std::os::unix::prelude::CommandExt; use std::path::PathBuf; use std::process::Stdio; use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; @@ -554,6 +555,40 @@ impl PostgresRedoManager { } } +/// +/// Command with ability not to give all file descriptors to child process +/// +trait CloseFileDescriptors: CommandExt { + /// + /// Close file descriptors (other than stdin, stdout, stderr) in child process + /// + fn close_fds(&mut self) -> &mut Command; +} + +impl CloseFileDescriptors for C { + fn close_fds(&mut self) -> &mut Command { + unsafe { + self.pre_exec(move || { + // SAFETY: Code executed inside pre_exec should have async-signal-safety, + // which means it should be safe to execute inside a signal handler. + // The precise meaning depends on platform. See `man signal-safety` + // for the linux definition. + // + // The set_fds_cloexec_threadsafe function is documented to be + // async-signal-safe. + // + // Aside from this function, the rest of the code is re-entrant and + // doesn't make any syscalls. 
We're just passing constants. + // + // NOTE: It's easy to indirectly cause a malloc or lock a mutex, + // which is not async-signal-safe. Be careful. + close_fds::set_fds_cloexec_threadsafe(3, &[]); + Ok(()) + }) + } + } +} + /// /// Handle to the Postgres WAL redo process /// @@ -610,6 +645,7 @@ impl PostgresRedoProcess { config.write_all(b"shared_preload_libraries=neon\n")?; config.write_all(b"neon.wal_redo=on\n")?; } + // Start postgres itself let mut child = Command::new(conf.pg_bin_dir().join("postgres")) .arg("--wal-redo") @@ -620,6 +656,19 @@ impl PostgresRedoProcess { .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) .env("PGDATA", &datadir) + // The redo process is not trusted, so it runs in seccomp mode + // (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't + // inherit any file descriptors from the pageserver that would allow + // an attacker to do bad things. + // + // The Rust standard library makes sure to mark any file descriptors with + // as close-on-exec by default, but that's not enough, since we use + // libraries that directly call libc open without setting that flag. + // + // One example is the pidfile of the daemonize library, which doesn't + // currently mark file descriptors as close-on-exec. Either way, we + // want to be on the safe side and prevent accidental regression. + .close_fds() .spawn() .map_err(|e| { Error::new( From b1b67cc5a055561a3d60c4e0194b0a3103cb8624 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 31 May 2022 19:13:12 +0300 Subject: [PATCH 0381/1022] improve test normal work to start several computes --- .../batch_others/test_ancestor_branch.py | 4 +- test_runner/batch_others/test_normal_work.py | 47 +++++++++++++++++++ test_runner/batch_others/test_wal_acceptor.py | 19 -------- test_runner/fixtures/zenith_fixtures.py | 22 +++++++-- 4 files changed, 65 insertions(+), 27 deletions(-) create mode 100644 test_runner/batch_others/test_normal_work.py diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index d87bebcc11..78724c434e 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -24,9 +24,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): 'compaction_target_size': '4194304', }) - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - pscur.execute("failpoints flush-frozen=sleep(10000)") + env.pageserver.safe_psql("failpoints flush-frozen=sleep(10000)") pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() diff --git a/test_runner/batch_others/test_normal_work.py b/test_runner/batch_others/test_normal_work.py new file mode 100644 index 0000000000..87dd2d5e18 --- /dev/null +++ b/test_runner/batch_others/test_normal_work.py @@ -0,0 +1,47 @@ +from fixtures.log_helper import log +from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient + + +def check_tenant(env: ZenithEnv, pageserver_http: ZenithPageserverHttpClient): + tenant_id, timeline_id = env.zenith_cli.create_tenant() + pg = env.postgres.create_start('main', tenant_id=tenant_id) + # we rely upon autocommit after each statement + res_1 = pg.safe_psql_many(queries=[ + 'CREATE TABLE t(key int primary key, value text)', + 'INSERT INTO t SELECT generate_series(1,100000), \'payload\'', + 'SELECT sum(key) FROM t', + ]) + + assert 
res_1[-1][0] == (5000050000, ) + # TODO check detach on live instance + log.info("stopping compute") + pg.stop() + log.info("compute stopped") + + pg.start() + res_2 = pg.safe_psql('SELECT sum(key) FROM t') + assert res_2[0] == (5000050000, ) + + pg.stop() + pageserver_http.timeline_detach(tenant_id, timeline_id) + + +def test_normal_work(zenith_env_builder: ZenithEnvBuilder): + """ + Basic test: + * create new tenant with a timeline + * write some data + * ensure that it was successfully written + * restart compute + * check that the data is there + * stop compute + * detach timeline + + Repeat check for several tenants/timelines. + """ + + env = zenith_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + for _ in range(3): + check_tenant(env, pageserver_http) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 40a9b48a18..007641417e 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -18,25 +18,6 @@ from fixtures.log_helper import log from typing import List, Optional, Any -# basic test, write something in setup with wal acceptors, ensure that commits -# succeed and data is written -def test_normal_work(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() - - env.zenith_cli.create_branch('test_safekeepers_normal_work') - pg = env.postgres.create_start('test_safekeepers_normal_work') - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - cur.execute('CREATE TABLE t(key int primary key, value text)') - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (5000050000, ) - - @dataclass class TimelineMetrics: timeline_id: str diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 5f3c16c4e6..ff905efa53 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -338,18 +338,30 @@ class PgProtocol: conn_options['server_settings'] = {key: val} return await asyncpg.connect(**conn_options) - def safe_psql(self, query: str, **kwargs: Any) -> List[Any]: + def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: """ Execute query against the node and return all rows. This method passes all extra params to connstr. """ + return self.safe_psql_many([query], **kwargs)[0] + def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: + """ + Execute queries against the node and return all rows. + This method passes all extra params to connstr. 
+ """ + result: List[List[Any]] = [] with closing(self.connect(**kwargs)) as conn: with conn.cursor() as cur: - cur.execute(query) - if cur.description is None: - return [] # query didn't return data - return cast(List[Any], cur.fetchall()) + for query in queries: + log.info(f"Executing query: {query}") + cur.execute(query) + + if cur.description is None: + result.append([]) # query didn't return data + else: + result.append(cast(List[Any], cur.fetchall())) + return result @dataclass From ff233cf4c28a29086de28627aee2d8753855d77f Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 31 May 2022 17:36:35 +0200 Subject: [PATCH 0382/1022] Use :local compute-tools tag to build compute-node image --- .circleci/config.yml | 24 ++++++++++++++---------- vendor/postgres | 2 +- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 624d367053..fde6cbd35f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -453,9 +453,6 @@ jobs: - checkout - setup_remote_docker: docker_layer_caching: true - # Build neondatabase/compute-tools:latest image and push it to Docker hub - # TODO: this should probably also use versioned tag, not just :latest. - # XXX: but should it? We build and use it only locally now. - run: name: Build and push compute-tools Docker image command: | @@ -463,7 +460,10 @@ jobs: docker build \ --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/compute-tools:latest -f Dockerfile.compute-tools . + --tag neondatabase/compute-tools:local \ + --tag neondatabase/compute-tools:latest \ + -f Dockerfile.compute-tools . + # Only push :latest image docker push neondatabase/compute-tools:latest - run: name: Init postgres submodule @@ -473,7 +473,9 @@ jobs: command: | echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin DOCKER_TAG=$(git log --oneline|wc -l) - docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:latest vendor/postgres + docker build --tag neondatabase/compute-node:${DOCKER_TAG} \ + --tag neondatabase/compute-node:latest vendor/postgres \ + --build-arg COMPUTE_TOOLS_TAG=local docker push neondatabase/compute-node:${DOCKER_TAG} docker push neondatabase/compute-node:latest @@ -510,9 +512,6 @@ jobs: - checkout - setup_remote_docker: docker_layer_caching: true - # Build neondatabase/compute-tools:release image and push it to Docker hub - # TODO: this should probably also use versioned tag, not just :latest. - # XXX: but should it? We build and use it only locally now. - run: name: Build and push compute-tools Docker image command: | @@ -520,7 +519,10 @@ jobs: docker build \ --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/compute-tools:release -f Dockerfile.compute-tools . + --tag neondatabase/compute-tools:release \ + --tag neondatabase/compute-tools:local \ + -f Dockerfile.compute-tools . 
+ # Only push :release image docker push neondatabase/compute-tools:release - run: name: Init postgres submodule @@ -530,7 +532,9 @@ jobs: command: | echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:release vendor/postgres + docker build --tag neondatabase/compute-node:${DOCKER_TAG} \ + --tag neondatabase/compute-node:release vendor/postgres \ + --build-arg COMPUTE_TOOLS_TAG=local docker push neondatabase/compute-node:${DOCKER_TAG} docker push neondatabase/compute-node:release diff --git a/vendor/postgres b/vendor/postgres index 8a6cc09624..50b6edfbe0 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 8a6cc09624fe921b6191f1f524a8051dc476404e +Subproject commit 50b6edfbe0c3b171bd6d407652e1e31a4c97aa8b From af6143ea1ffb3987279745af6c70071b16e5fcee Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 31 May 2022 18:35:06 +0200 Subject: [PATCH 0383/1022] Install missing openssl packages in the Github Actions workflow --- .github/workflows/testing.yml | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 1ce1b64a49..41f9f51e86 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -40,11 +40,11 @@ jobs: if: matrix.os == 'ubuntu-latest' run: | sudo apt update - sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev + sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev - - name: Install macOs postgres dependencies + - name: Install macOS postgres dependencies if: matrix.os == 'macos-latest' - run: brew install flex bison + run: brew install flex bison openssl - name: Set pg revision for caching id: pg_ver @@ -58,10 +58,27 @@ jobs: tmp_install/ key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }} + - name: Set extra env for macOS + if: matrix.os == 'macos-latest' + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + - name: Build postgres if: steps.cache_pg.outputs.cache-hit != 'true' run: make postgres + # Plain configure output can contain weird errors like 'error: C compiler cannot create executables' + # and the real cause will be inside config.log + - name: Print configure logs in case of failure + if: failure() + continue-on-error: true + run: | + echo '' && echo '=== config.log ===' && echo '' + cat tmp_install/build/config.log + echo '' && echo '=== configure.log ===' && echo '' + cat tmp_install/build/configure.log + - name: Cache cargo deps id: cache_cargo uses: actions/cache@v2 From e5a2b0372d73854121c159c0ea7092bd72d0d8dd Mon Sep 17 00:00:00 2001 From: Anton Chaporgin Date: Wed, 1 Jun 2022 15:40:45 +0300 Subject: [PATCH 0384/1022] remove sk1 from inventory (#1845) https://github.com/neondatabase/cloud/issues/1454 --- .circleci/ansible/staging.hosts | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index cf5b98eaa1..4273b885e1 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -3,7 +3,6 @@ zenith-us-stage-ps-2 console_region_id=27 [safekeepers] -zenith-us-stage-sk-1 console_region_id=27 zenith-us-stage-sk-4 console_region_id=27 zenith-us-stage-sk-5 console_region_id=27 zenith-us-stage-sk-6 
console_region_id=27 From 6623c5b9d5322da766674319beb03a56cb68e462 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 30 May 2022 16:02:57 +0300 Subject: [PATCH 0385/1022] add installation instructions for Fedora Linux --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 131d5da110..be5032e87d 100644 --- a/README.md +++ b/README.md @@ -29,13 +29,18 @@ Pageserver consists of: ## Running local installation -#### building on Ubuntu/ Debian (Linux) +#### building on Linux 1. Install build dependencies and other useful packages -On Ubuntu or Debian this set of packages should be sufficient to build the code: -```text +* On Ubuntu or Debian this set of packages should be sufficient to build the code: +```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ -libssl-dev clang pkg-config libpq-dev libprotobuf-dev etcd +libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client +``` +* On Fedora these packages are needed: +```bash +dnf install flex bison readline-devel zlib-devel openssl-devel \ + libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) @@ -44,16 +49,11 @@ libssl-dev clang pkg-config libpq-dev libprotobuf-dev etcd curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh ``` -3. Install PostgreSQL Client -``` -apt install postgresql-client -``` - -4. Build neon and patched postgres +3. Build neon and patched postgres ```sh git clone --recursive https://github.com/neondatabase/neon.git cd neon -make -j5 +make -j`nproc` ``` #### building on OSX (12.3.1) From e5cb72757250457a61eeb4bdd7c613527ce7ec98 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 22 Apr 2022 13:56:48 +0300 Subject: [PATCH 0386/1022] Replace callmemaybe with etcd subscriptions on safekeeper timeline info --- control_plane/src/compute.rs | 1 - control_plane/src/storage.rs | 47 +- docs/settings.md | 18 +- libs/etcd_broker/src/lib.rs | 6 +- libs/utils/src/postgres_backend.rs | 13 +- libs/utils/src/zid.rs | 2 +- pageserver/Cargo.toml | 2 +- pageserver/src/config.rs | 15 + pageserver/src/http/models.rs | 23 + pageserver/src/http/routes.rs | 50 +- pageserver/src/layered_repository.rs | 22 + pageserver/src/page_service.rs | 26 - pageserver/src/repository.rs | 3 + pageserver/src/tenant_config.rs | 54 + pageserver/src/tenant_mgr.rs | 178 +- pageserver/src/thread_mgr.rs | 4 +- pageserver/src/walreceiver.rs | 1554 ++++++++++++----- .../src/walreceiver/connection_handler.rs | 405 +++++ safekeeper/src/bin/safekeeper.rs | 26 +- safekeeper/src/lib.rs | 1 - safekeeper/src/send_wal.rs | 67 - safekeeper/src/timeline.rs | 116 +- .../batch_others/test_pageserver_api.py | 10 +- test_runner/fixtures/zenith_fixtures.py | 4 +- .../performance/test_bulk_tenant_create.py | 14 +- 25 files changed, 1968 insertions(+), 693 deletions(-) create mode 100644 pageserver/src/walreceiver/connection_handler.rs diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 06a14d8a41..e78f96074e 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -352,7 +352,6 @@ impl PostgresNode { // This isn't really a supported configuration, but can be useful for // testing. 
conf.append("synchronous_standby_names", "pageserver"); - conf.append("neon.callmemaybe_connstring", &self.connstr()); } let mut file = File::create(self.pgdata().join("postgresql.conf"))?; diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 24cdbce8f3..a8f21406fb 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::io::Write; use std::net::TcpStream; +use std::num::NonZeroU64; use std::path::PathBuf; use std::process::Command; use std::time::Duration; @@ -11,6 +12,7 @@ use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; use pageserver::http::models::{TenantConfigRequest, TenantCreateRequest, TimelineCreateRequest}; +use pageserver::tenant_mgr::TenantInfo; use pageserver::timelines::TimelineInfo; use postgres::{Config, NoTls}; use reqwest::blocking::{Client, RequestBuilder, Response}; @@ -26,7 +28,6 @@ use utils::{ use crate::local_env::LocalEnv; use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile}; -use pageserver::tenant_mgr::TenantInfo; #[derive(Error, Debug)] pub enum PageserverHttpError { @@ -37,6 +38,12 @@ pub enum PageserverHttpError { Response(String), } +impl From for PageserverHttpError { + fn from(e: anyhow::Error) -> Self { + Self::Response(e.to_string()) + } +} + type Result = result::Result; pub trait ResponseErrorMessageExt: Sized { @@ -410,6 +417,15 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose()?, pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), + walreceiver_connect_timeout: settings + .get("walreceiver_connect_timeout") + .map(|x| x.to_string()), + lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()), + max_lsn_wal_lag: settings + .get("max_lsn_wal_lag") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, }) .send()? .error_from_body()? 
@@ -433,22 +449,41 @@ impl PageServerNode { tenant_id, checkpoint_distance: settings .get("checkpoint_distance") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'checkpoint_distance' as an integer")?, compaction_target_size: settings .get("compaction_target_size") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_target_size' as an integer")?, compaction_period: settings.get("compaction_period").map(|x| x.to_string()), compaction_threshold: settings .get("compaction_threshold") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_threshold' as an integer")?, gc_horizon: settings .get("gc_horizon") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_horizon' as an integer")?, gc_period: settings.get("gc_period").map(|x| x.to_string()), image_creation_threshold: settings .get("image_creation_threshold") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'image_creation_threshold' as non zero integer")?, pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), + walreceiver_connect_timeout: settings + .get("walreceiver_connect_timeout") + .map(|x| x.to_string()), + lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()), + max_lsn_wal_lag: settings + .get("max_lsn_wal_lag") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, }) .send()? .error_from_body()?; diff --git a/docs/settings.md b/docs/settings.md index 0ca7223faa..4d828f22bc 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -31,7 +31,7 @@ broker_endpoints = ['some://etcd'] # [remote_storage] ``` -The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user, +The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user, see the corresponding section below. Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank. Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start. @@ -54,7 +54,7 @@ Note that TOML distinguishes between strings and integers, the former require si A list of endpoints (etcd currently) to connect and pull the information from. Mandatory, does not have a default, since requires etcd to be started as a separate process, -and its connection url should be specified separately. +and its connection url should be specified separately. #### broker_etcd_prefix @@ -111,6 +111,20 @@ L0 delta layer threshold for L1 image layer creation. Default is 3. WAL retention duration for PITR branching. Default is 30 days. +#### walreceiver_connect_timeout + +Time to wait to establish the wal receiver connection before failing + +#### lagging_wal_timeout + +Time the pageserver did not get any WAL updates from safekeeper (if any). +Avoids lagging pageserver preemptively by forcing to switch it from stalled connections. + +#### max_lsn_wal_lag + +Difference between Lsn values of the latest available WAL on safekeepers: if currently connected safekeeper starts to lag too long and too much, +it gets swapped to the different one. 
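For illustration only (hypothetical values, using the same key names as above and the humantime-style duration syntax of the other settings):

```
walreceiver_connect_timeout = '10 seconds'
lagging_wal_timeout = '10 seconds'
max_lsn_wal_lag = '10485760'  # bytes of WAL lag tolerated before switching safekeeper
```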
+ #### initial_superuser_name Name of the initial superuser role, passed to initdb when a new tenant diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 7fe142502b..0bfce66a5d 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -31,7 +31,7 @@ struct SafekeeperTimeline { /// Published data about safekeeper's timeline. Fields made optional for easy migrations. #[serde_as] -#[derive(Debug, Deserialize, Serialize)] +#[derive(Debug, Clone, Deserialize, Serialize)] pub struct SkTimelineInfo { /// Term of the last entry. pub last_log_term: Option, @@ -55,7 +55,9 @@ pub struct SkTimelineInfo { #[serde(default)] pub peer_horizon_lsn: Option, #[serde(default)] - pub safekeeper_connection_string: Option, + pub safekeeper_connstr: Option, + #[serde(default)] + pub pageserver_connstr: Option, } #[derive(Debug, thiserror::Error)] diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 5fdb1ff9d2..ff71423122 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -336,11 +336,11 @@ impl PostgresBackend { let have_tls = self.tls_config.is_some(); match msg { FeMessage::StartupPacket(m) => { - trace!("got startup message {:?}", m); + trace!("got startup message {m:?}"); match m { FeStartupPacket::SslRequest => { - info!("SSL requested"); + debug!("SSL requested"); self.write_message(&BeMessage::EncryptionResponse(have_tls))?; if have_tls { @@ -349,7 +349,7 @@ impl PostgresBackend { } } FeStartupPacket::GssEncRequest => { - info!("GSS requested"); + debug!("GSS requested"); self.write_message(&BeMessage::EncryptionResponse(false))?; } FeStartupPacket::StartupMessage { .. } => { @@ -433,12 +433,7 @@ impl PostgresBackend { // full cause of the error, not just the top-level context + its trace. // We don't want to send that in the ErrorResponse though, // because it's not relevant to the compute node logs. - if query_string.starts_with("callmemaybe") { - // FIXME avoid printing a backtrace for tenant x not found errors until this is properly fixed - error!("query handler for '{}' failed: {}", query_string, e); - } else { - error!("query handler for '{}' failed: {:?}", query_string, e); - } + error!("query handler for '{}' failed: {:?}", query_string, e); self.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?; // TODO: untangle convoluted control flow if e.to_string().contains("failed to run") { diff --git a/libs/utils/src/zid.rs b/libs/utils/src/zid.rs index 0ef174da4d..6da5355f61 100644 --- a/libs/utils/src/zid.rs +++ b/libs/utils/src/zid.rs @@ -193,7 +193,7 @@ pub struct ZTenantId(ZId); zid_newtype!(ZTenantId); // A pair uniquely identifying Zenith instance. 
-#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct ZTenantTimelineId { pub tenant_id: ZTenantId, pub timeline_id: ZTimelineId, diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index d78d3622c4..298addb838 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [features] # It is simpler infra-wise to have failpoints enabled by default -# It shouldn't affect perf in any way because failpoints +# It shouldn't affect performance in any way because failpoints # are not placed in hot code paths default = ["failpoints"] profiling = ["pprof"] diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f44b0846a8..01b626e046 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -480,6 +480,21 @@ impl PageServerConf { if let Some(pitr_interval) = item.get("pitr_interval") { t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?); } + if let Some(walreceiver_connect_timeout) = item.get("walreceiver_connect_timeout") { + t_conf.walreceiver_connect_timeout = Some(parse_toml_duration( + "walreceiver_connect_timeout", + walreceiver_connect_timeout, + )?); + } + if let Some(lagging_wal_timeout) = item.get("lagging_wal_timeout") { + t_conf.lagging_wal_timeout = Some(parse_toml_duration( + "lagging_wal_timeout", + lagging_wal_timeout, + )?); + } + if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") { + t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?); + } Ok(t_conf) } diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index e00ccda2a1..c947cebcb6 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -1,3 +1,5 @@ +use std::num::NonZeroU64; + use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use utils::{ @@ -33,6 +35,9 @@ pub struct TenantCreateRequest { pub gc_period: Option, pub image_creation_threshold: Option, pub pitr_interval: Option, + pub walreceiver_connect_timeout: Option, + pub lagging_wal_timeout: Option, + pub max_lsn_wal_lag: Option, } #[serde_as] @@ -68,6 +73,9 @@ pub struct TenantConfigRequest { pub gc_period: Option, pub image_creation_threshold: Option, pub pitr_interval: Option, + pub walreceiver_connect_timeout: Option, + pub lagging_wal_timeout: Option, + pub max_lsn_wal_lag: Option, } impl TenantConfigRequest { @@ -82,6 +90,21 @@ impl TenantConfigRequest { gc_period: None, image_creation_threshold: None, pitr_interval: None, + walreceiver_connect_timeout: None, + lagging_wal_timeout: None, + max_lsn_wal_lag: None, } } } + +/// A WAL receiver's data stored inside the global `WAL_RECEIVERS`. +/// We keep one WAL receiver active per timeline. 
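+/// Entries are kept in the walreceiver module's `WAL_RECEIVER_ENTRIES` map, keyed by `ZTenantTimelineId`,
+/// and are served to clients by `wal_receiver_get_handler` in the HTTP API.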
+#[serde_as] +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct WalReceiverEntry { + pub wal_producer_connstr: Option, + #[serde_as(as = "Option")] + pub last_received_msg_lsn: Option, + /// the timestamp (in microseconds) of the last received message + pub last_received_msg_ts: Option, +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bb650a34ed..a1198051a8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -229,23 +229,16 @@ async fn wal_receiver_get_handler(request: Request) -> Result) -> Result, ApiError> { @@ -402,6 +395,19 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .walreceiver_connect_timeout + .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout) + } + + pub fn get_lagging_wal_timeout(&self) -> Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .lagging_wal_timeout + .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout) + } + + pub fn get_max_lsn_wal_lag(&self) -> NonZeroU64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .max_lsn_wal_lag + .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag) + } + pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) -> Result<()> { let mut tenant_conf = self.tenant_conf.write().unwrap(); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 4f0fca4797..df43b8c0df 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -7,7 +7,6 @@ // *status* -- show actual info about this pageserver, // *pagestream* -- enter mode where smgr and pageserver talk with their // custom protocol. 
-// *callmemaybe $url* -- ask pageserver to start walreceiver on $url // use anyhow::{bail, ensure, Context, Result}; @@ -38,7 +37,6 @@ use crate::repository::Timeline; use crate::tenant_mgr; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; -use crate::walreceiver; use crate::CheckpointConfig; use metrics::{register_histogram_vec, HistogramVec}; use postgres_ffi::xlog_utils::to_pg_timestamp; @@ -716,30 +714,6 @@ impl postgres_backend::Handler for PageServerHandler { // Check that the timeline exists self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("callmemaybe ") { - // callmemaybe - // TODO lazy static - let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) ([[:xdigit:]]+) (.*)$").unwrap(); - let caps = re - .captures(query_string) - .with_context(|| format!("invalid callmemaybe: '{}'", query_string))?; - - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let connstr = caps.get(3).unwrap().as_str().to_owned(); - - self.check_permission(Some(tenantid))?; - - let _enter = - info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered(); - - // Check that the timeline exists - tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Cannot load local timeline")?; - - walreceiver::launch_wal_receiver(self.conf, tenantid, timelineid, &connstr)?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 5bf128e66b..9d5056cd16 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -469,6 +469,9 @@ pub mod repo_harness { gc_period: Some(tenant_conf.gc_period), image_creation_threshold: Some(tenant_conf.image_creation_threshold), pitr_interval: Some(tenant_conf.pitr_interval), + walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), + lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), + max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), } } } diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 9bf223e59e..f68a820e95 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -10,6 +10,7 @@ //! use crate::config::PageServerConf; use serde::{Deserialize, Serialize}; +use std::num::NonZeroU64; use std::path::PathBuf; use std::time::Duration; use utils::zid::ZTenantId; @@ -34,6 +35,9 @@ pub mod defaults { pub const DEFAULT_GC_PERIOD: &str = "100 s"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "30 days"; + pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; + pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; + pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1_000_000; } /// Per-tenant configuration options @@ -68,6 +72,17 @@ pub struct TenantConf { // Page versions older than this are garbage collected away. #[serde(with = "humantime_serde")] pub pitr_interval: Duration, + /// Maximum amount of time to wait while opening a connection to receive wal, before erroring. 
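+    /// The default is `DEFAULT_WALRECEIVER_CONNECT_TIMEOUT` ("2 seconds"), parsed with humantime.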
+ #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Duration, + /// Considers safekeepers stalled after no WAL updates were received longer than this threshold. + /// A stalled safekeeper will be changed to a newer one when it appears. + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Duration, + /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold. + /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, + /// to avoid eager reconnects. + pub max_lsn_wal_lag: NonZeroU64, } /// Same as TenantConf, but this struct preserves the information about @@ -85,6 +100,11 @@ pub struct TenantConfOpt { pub image_creation_threshold: Option, #[serde(with = "humantime_serde")] pub pitr_interval: Option, + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Option, + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Option, + pub max_lsn_wal_lag: Option, } impl TenantConfOpt { @@ -108,6 +128,13 @@ impl TenantConfOpt { .image_creation_threshold .unwrap_or(global_conf.image_creation_threshold), pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval), + walreceiver_connect_timeout: self + .walreceiver_connect_timeout + .unwrap_or(global_conf.walreceiver_connect_timeout), + lagging_wal_timeout: self + .lagging_wal_timeout + .unwrap_or(global_conf.lagging_wal_timeout), + max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag), } } @@ -136,6 +163,15 @@ impl TenantConfOpt { if let Some(pitr_interval) = other.pitr_interval { self.pitr_interval = Some(pitr_interval); } + if let Some(walreceiver_connect_timeout) = other.walreceiver_connect_timeout { + self.walreceiver_connect_timeout = Some(walreceiver_connect_timeout); + } + if let Some(lagging_wal_timeout) = other.lagging_wal_timeout { + self.lagging_wal_timeout = Some(lagging_wal_timeout); + } + if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag { + self.max_lsn_wal_lag = Some(max_lsn_wal_lag); + } } } @@ -155,6 +191,14 @@ impl TenantConf { image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) .expect("cannot parse default PITR interval"), + walreceiver_connect_timeout: humantime::parse_duration( + DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, + ) + .expect("cannot parse default walreceiver connect timeout"), + lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) + .expect("cannot parse default walreceiver lagging wal timeout"), + max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) + .expect("cannot parse default max walreceiver Lsn wal lag"), } } @@ -175,6 +219,16 @@ impl TenantConf { gc_period: Duration::from_secs(10), image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD, pitr_interval: Duration::from_secs(60 * 60), + walreceiver_connect_timeout: humantime::parse_duration( + defaults::DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, + ) + .unwrap(), + lagging_wal_timeout: humantime::parse_duration( + defaults::DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT, + ) + .unwrap(), + max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) + .unwrap(), } } } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index cc35d79d16..c48b021d1f 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -8,11 +8,10 @@ use crate::repository::{Repository, TimelineSyncStatusUpdate}; use 
crate::storage_sync::index::RemoteIndex; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; -use crate::thread_mgr; use crate::thread_mgr::ThreadKind; -use crate::timelines; use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; +use crate::{thread_mgr, timelines, walreceiver}; use crate::{DatadirTimelineImpl, RepositoryImpl}; use anyhow::{bail, Context}; use serde::{Deserialize, Serialize}; @@ -21,23 +20,30 @@ use std::collections::hash_map::Entry; use std::collections::HashMap; use std::fmt; use std::sync::Arc; +use tokio::sync::mpsc; use tracing::*; use utils::lsn::Lsn; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; mod tenants_state { + use anyhow::ensure; use std::{ collections::HashMap, sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}, }; + use tokio::sync::mpsc; + use tracing::{debug, error}; use utils::zid::ZTenantId; - use crate::tenant_mgr::Tenant; + use crate::tenant_mgr::{LocalTimelineUpdate, Tenant}; lazy_static::lazy_static! { static ref TENANTS: RwLock> = RwLock::new(HashMap::new()); + /// Sends updates to the local timelines (creation and deletion) to the WAL receiver, + /// so that it can enable/disable corresponding processes. + static ref TIMELINE_UPDATE_SENDER: RwLock>> = RwLock::new(None); } pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap> { @@ -51,6 +57,39 @@ mod tenants_state { .write() .expect("Failed to write() tenants lock, it got poisoned") } + + pub(super) fn set_timeline_update_sender( + timeline_updates_sender: mpsc::UnboundedSender, + ) -> anyhow::Result<()> { + let mut sender_guard = TIMELINE_UPDATE_SENDER + .write() + .expect("Failed to write() timeline_update_sender lock, it got poisoned"); + ensure!(sender_guard.is_none(), "Timeline update sender already set"); + *sender_guard = Some(timeline_updates_sender); + Ok(()) + } + + pub(super) fn try_send_timeline_update(update: LocalTimelineUpdate) { + match TIMELINE_UPDATE_SENDER + .read() + .expect("Failed to read() timeline_update_sender lock, it got poisoned") + .as_ref() + { + Some(sender) => { + if let Err(e) = sender.send(update) { + error!("Failed to send timeline update: {}", e); + } + } + None => debug!("Timeline update sender is not enabled, cannot send update {update:?}"), + } + } + + pub(super) fn stop_timeline_update_sender() { + TIMELINE_UPDATE_SENDER + .write() + .expect("Failed to write() timeline_update_sender lock, it got poisoned") + .take(); + } } struct Tenant { @@ -87,10 +126,10 @@ pub enum TenantState { impl fmt::Display for TenantState { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - TenantState::Active => f.write_str("Active"), - TenantState::Idle => f.write_str("Idle"), - TenantState::Stopping => f.write_str("Stopping"), - TenantState::Broken => f.write_str("Broken"), + Self::Active => f.write_str("Active"), + Self::Idle => f.write_str("Idle"), + Self::Stopping => f.write_str("Stopping"), + Self::Broken => f.write_str("Broken"), } } } @@ -99,6 +138,11 @@ impl fmt::Display for TenantState { /// Timelines that are only partially available locally (remote storage has more data than this pageserver) /// are scheduled for download and added to the repository once download is completed. 
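+/// Also wires up the channel that forwards local timeline attach/detach events to the WAL receiver
+/// (see `walreceiver::init_wal_receiver_main_thread`), which is started from here as well.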
pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result { + let (timeline_updates_sender, timeline_updates_receiver) = + mpsc::unbounded_channel::(); + tenants_state::set_timeline_update_sender(timeline_updates_sender)?; + walreceiver::init_wal_receiver_main_thread(conf, timeline_updates_receiver)?; + let SyncStartupData { remote_index, local_timeline_init_statuses, @@ -113,16 +157,27 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result), +} + +impl std::fmt::Debug for LocalTimelineUpdate { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Detach(ttid) => f.debug_tuple("Remove").field(ttid).finish(), + Self::Attach(ttid, _) => f.debug_tuple("Add").field(ttid).finish(), + } + } +} + /// Updates tenants' repositories, changing their timelines state in memory. pub fn apply_timeline_sync_status_updates( conf: &'static PageServerConf, @@ -160,6 +215,7 @@ pub fn apply_timeline_sync_status_updates( /// Shut down all tenants. This runs as part of pageserver shutdown. /// pub fn shutdown_all_tenants() { + tenants_state::stop_timeline_update_sender(); let mut m = tenants_state::write_tenants(); let mut tenantids = Vec::new(); for (tenantid, tenant) in m.iter_mut() { @@ -173,7 +229,7 @@ pub fn shutdown_all_tenants() { } drop(m); - thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiver), None, None); + thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None); thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None); thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), None, None); @@ -247,32 +303,49 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option { Some(tenants_state::read_tenants().get(&tenantid)?.state) } -/// -/// Change the state of a tenant to Active and launch its compactor and GC -/// threads. If the tenant was already in Active state or Stopping, does nothing. -/// -pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> { +pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { let mut m = tenants_state::write_tenants(); let tenant = m .get_mut(&tenant_id) .with_context(|| format!("Tenant not found for id {tenant_id}"))?; + let old_state = tenant.state; + tenant.state = new_state; + drop(m); - info!("activating tenant {tenant_id}"); - - match tenant.state { - // If the tenant is already active, nothing to do. 
- TenantState::Active => {} - - // If it's Idle, launch the compactor and GC threads - TenantState::Idle => { - thread_mgr::spawn( + match (old_state, new_state) { + (TenantState::Broken, TenantState::Broken) + | (TenantState::Active, TenantState::Active) + | (TenantState::Idle, TenantState::Idle) + | (TenantState::Stopping, TenantState::Stopping) => { + debug!("tenant {tenant_id} already in state {new_state}"); + } + (TenantState::Broken, ignored) => { + debug!("Ignoring {ignored} since tenant {tenant_id} is in broken state"); + } + (_, TenantState::Broken) => { + debug!("Setting tenant {tenant_id} status to broken"); + } + (TenantState::Stopping, ignored) => { + debug!("Ignoring {ignored} since tenant {tenant_id} is in stopping state"); + } + (TenantState::Idle, TenantState::Active) => { + info!("activating tenant {tenant_id}"); + let compactor_spawn_result = thread_mgr::spawn( ThreadKind::Compactor, Some(tenant_id), None, "Compactor thread", false, move || crate::tenant_threads::compact_loop(tenant_id), - )?; + ); + if compactor_spawn_result.is_err() { + let mut m = tenants_state::write_tenants(); + m.get_mut(&tenant_id) + .with_context(|| format!("Tenant not found for id {tenant_id}"))? + .state = old_state; + drop(m); + } + compactor_spawn_result?; let gc_spawn_result = thread_mgr::spawn( ThreadKind::GarbageCollector, @@ -286,21 +359,31 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> { .with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}")); if let Err(e) = &gc_spawn_result { + let mut m = tenants_state::write_tenants(); + m.get_mut(&tenant_id) + .with_context(|| format!("Tenant not found for id {tenant_id}"))? + .state = old_state; + drop(m); error!("Failed to start GC thread for tenant {tenant_id}, stopping its checkpointer thread: {e:?}"); thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None); return gc_spawn_result; } - tenant.state = TenantState::Active; } - - TenantState::Stopping => { - // don't re-activate it if it's being stopped + (TenantState::Idle, TenantState::Stopping) => { + info!("stopping idle tenant {tenant_id}"); } - - TenantState::Broken => { - // cannot activate + (TenantState::Active, TenantState::Stopping | TenantState::Idle) => { + info!("stopping tenant {tenant_id} threads due to new state {new_state}"); + thread_mgr::shutdown_threads( + Some(ThreadKind::WalReceiverManager), + Some(tenant_id), + None, + ); + thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), Some(tenant_id), None); + thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None); } } + Ok(()) } @@ -325,15 +408,15 @@ pub fn get_local_timeline_with_load( .with_context(|| format!("Tenant {tenant_id} not found"))?; if let Some(page_tline) = tenant.local_timelines.get(&timeline_id) { - return Ok(Arc::clone(page_tline)); + Ok(Arc::clone(page_tline)) + } else { + let page_tline = load_local_timeline(&tenant.repo, timeline_id) + .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?; + tenant + .local_timelines + .insert(timeline_id, Arc::clone(&page_tline)); + Ok(page_tline) } - - let page_tline = load_local_timeline(&tenant.repo, timeline_id) - .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?; - tenant - .local_timelines - .insert(timeline_id, Arc::clone(&page_tline)); - Ok(page_tline) } pub fn detach_timeline( @@ -351,6 +434,9 @@ pub fn detach_timeline( .detach_timeline(timeline_id) .context("Failed to detach inmem tenant timeline")?; 
tenant.local_timelines.remove(&timeline_id); + tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach( + ZTenantTimelineId::new(tenant_id, timeline_id), + )); } None => bail!("Tenant {tenant_id} not found in local tenant state"), } @@ -379,6 +465,12 @@ fn load_local_timeline( repartition_distance, )); page_tline.init_logical_size()?; + + tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach( + ZTenantTimelineId::new(repo.tenant_id(), timeline_id), + Arc::clone(&page_tline), + )); + Ok(page_tline) } diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index 473cddda58..8264bdd97c 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -91,8 +91,8 @@ pub enum ThreadKind { // associated with one later, after receiving a command from the client. PageRequestHandler, - // Thread that connects to a safekeeper to fetch WAL for one timeline. - WalReceiver, + // Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL. + WalReceiverManager, // Thread that handles compaction of all timelines for a tenant. Compactor, diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index b8f349af8f..df8dd2fc29 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -1,61 +1,77 @@ +//! WAL receiver manages an open connection to safekeeper, to get the WAL it streams into. +//! To do so, a current implementation needs to do the following: //! -//! WAL receiver connects to the WAL safekeeper service, streams WAL, -//! decodes records and saves them in the repository for the correct -//! timeline. +//! * acknowledge the timelines that it needs to stream WAL into. +//! Pageserver is able to dynamically (un)load tenants on attach and detach, +//! hence WAL receiver needs to react on such events. //! -//! We keep one WAL receiver active per timeline. +//! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming. +//! For that, it watches specific keys in etcd broker and pulls the relevant data periodically. +//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. +//! Without this data, no WAL streaming is possible currently. +//! +//! Only one active WAL streaming connection is allowed at a time. +//! The connection is supposed to be updated periodically, based on safekeeper timeline data. +//! +//! * handle the actual connection and WAL streaming +//! +//! Handle happens dynamically, by portions of WAL being processed and registered in the server. +//! Along with the registration, certain metadata is written to show WAL streaming progress and rely on that when considering safekeepers for connection. +//! +//! ## Implementation details +//! +//! WAL receiver's implementation consists of 3 kinds of nested loops, separately handling the logic from the bullets above: +//! +//! * [`init_wal_receiver_main_thread`], a wal receiver main thread, containing the control async loop: timeline addition/removal and interruption of a whole thread handling. +//! The loop is infallible, always trying to continue with the new tasks, the only place where it can fail is its initialization. +//! All of the code inside the loop is either async or a spawn_blocking wrapper around the sync code. +//! +//! * [`timeline_wal_broker_loop_step`], a broker task, handling the etcd broker subscription and polling, safekeeper selection logic and [re]connects. +//! 
On every concequent broker/wal streamer connection attempt, the loop steps are forced to wait for some time before running, +//! increasing with the number of attempts (capped with some fixed value). +//! This is done endlessly, to ensure we don't miss the WAL streaming when it gets available on one of the safekeepers. +//! +//! Apart from the broker management, it keeps the wal streaming connection open, with the safekeeper having the most advanced timeline state. +//! The connection could be closed from safekeeper side (with error or not), could be cancelled from pageserver side from time to time. +//! +//! * [`connection_handler::handle_walreceiver_connection`], a wal streaming task, opening the libpq connection and reading the data out of it to the end. +//! Does periodic reporting of the progress, to share some of the data via external HTTP API and to ensure we're able to switch connections when needed. +//! +//! Every task is cancellable via its separate cancellation channel, +//! also every such task's dependency (broker subscription or the data source channel) cancellation/drop triggers the corresponding task cancellation either. + +mod connection_handler; use crate::config::PageServerConf; -use crate::repository::{Repository, Timeline}; -use crate::tenant_mgr; -use crate::thread_mgr; +use crate::http::models::WalReceiverEntry; +use crate::repository::Timeline; +use crate::tenant_mgr::{self, LocalTimelineUpdate, TenantState}; use crate::thread_mgr::ThreadKind; -use crate::walingest::WalIngest; -use anyhow::{bail, Context, Error, Result}; -use bytes::BytesMut; -use fail::fail_point; -use lazy_static::lazy_static; -use postgres_ffi::waldecoder::*; -use postgres_protocol::message::backend::ReplicationMessage; -use postgres_types::PgLsn; -use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr}; +use crate::{thread_mgr, DatadirTimelineImpl}; +use anyhow::{ensure, Context}; +use chrono::{NaiveDateTime, Utc}; +use etcd_broker::{Client, SkTimelineInfo, SkTimelineSubscription, SkTimelineSubscriptionKind}; +use itertools::Itertools; +use once_cell::sync::Lazy; use std::cell::Cell; -use std::collections::HashMap; -use std::str::FromStr; -use std::sync::Mutex; +use std::collections::{hash_map, HashMap, HashSet}; +use std::num::NonZeroU64; +use std::ops::ControlFlow; +use std::sync::Arc; use std::thread_local; -use std::time::SystemTime; -use tokio::pin; -use tokio_postgres::replication::ReplicationStream; -use tokio_postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow}; -use tokio_stream::StreamExt; -use tracing::*; -use utils::{ - lsn::Lsn, - pq_proto::ZenithFeedback, - zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, +use std::time::Duration; +use tokio::select; +use tokio::{ + sync::{mpsc, watch, RwLock}, + task::JoinHandle, }; +use tracing::*; +use url::Url; +use utils::lsn::Lsn; +use utils::pq_proto::ZenithFeedback; +use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}; -/// -/// A WAL receiver's data stored inside the global `WAL_RECEIVERS`. -/// We keep one WAL receiver active per timeline. -/// -#[serde_as] -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct WalReceiverEntry { - thread_id: u64, - wal_producer_connstr: String, - #[serde_as(as = "Option")] - last_received_msg_lsn: Option, - /// the timestamp (in microseconds) of the last received message - last_received_msg_ts: Option, -} - -lazy_static! 
{ - static ref WAL_RECEIVERS: Mutex> = - Mutex::new(HashMap::new()); -} +use self::connection_handler::{WalConnectionEvent, WalReceiverConnection}; thread_local! { // Boolean that is true only for WAL receiver threads @@ -64,375 +80,1133 @@ thread_local! { pub(crate) static IS_WAL_RECEIVER: Cell = Cell::new(false); } -fn drop_wal_receiver(tenantid: ZTenantId, timelineid: ZTimelineId) { - let mut receivers = WAL_RECEIVERS.lock().unwrap(); - receivers.remove(&(tenantid, timelineid)); -} +/// WAL receiver state for sharing with the outside world. +/// Only entries for timelines currently available in pageserver are stored. +static WAL_RECEIVER_ENTRIES: Lazy>> = + Lazy::new(|| RwLock::new(HashMap::new())); -// Launch a new WAL receiver, or tell one that's running about change in connection string -pub fn launch_wal_receiver( - conf: &'static PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, - wal_producer_connstr: &str, -) -> Result<()> { - let mut receivers = WAL_RECEIVERS.lock().unwrap(); - - match receivers.get_mut(&(tenantid, timelineid)) { - Some(receiver) => { - debug!("wal receiver already running, updating connection string"); - receiver.wal_producer_connstr = wal_producer_connstr.into(); - } - None => { - let thread_id = thread_mgr::spawn( - ThreadKind::WalReceiver, - Some(tenantid), - Some(timelineid), - "WAL receiver thread", - false, - move || { - IS_WAL_RECEIVER.with(|c| c.set(true)); - thread_main(conf, tenantid, timelineid); - Ok(()) - }, - )?; - - let receiver = WalReceiverEntry { - thread_id, - wal_producer_connstr: wal_producer_connstr.into(), - last_received_msg_lsn: None, - last_received_msg_ts: None, - }; - receivers.insert((tenantid, timelineid), receiver); - - // Update tenant state and start tenant threads, if they are not running yet. - tenant_mgr::activate_tenant(tenantid)?; - } - }; - Ok(()) -} - -/// Look up a WAL receiver's data in the global `WAL_RECEIVERS` -pub fn get_wal_receiver_entry( +/// Gets the public WAL streaming entry for a certain timeline. +pub async fn get_wal_receiver_entry( tenant_id: ZTenantId, timeline_id: ZTimelineId, ) -> Option { - let receivers = WAL_RECEIVERS.lock().unwrap(); - receivers.get(&(tenant_id, timeline_id)).cloned() + WAL_RECEIVER_ENTRIES + .read() + .await + .get(&ZTenantTimelineId::new(tenant_id, timeline_id)) + .cloned() } -// -// This is the entry point for the WAL receiver thread. -// -fn thread_main(conf: &'static PageServerConf, tenant_id: ZTenantId, timeline_id: ZTimelineId) { - let _enter = info_span!("WAL receiver", timeline = %timeline_id, tenant = %tenant_id).entered(); - info!("WAL receiver thread started"); - - // Look up the current WAL producer address - let wal_producer_connstr = { - match get_wal_receiver_entry(tenant_id, timeline_id) { - Some(e) => e.wal_producer_connstr, - None => { - info!( - "Unable to create the WAL receiver thread: no WAL receiver entry found for tenant {} and timeline {}", - tenant_id, timeline_id - ); - return; - } - } - }; - - // Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server, - // and start streaming WAL from it. 
- let res = walreceiver_main(conf, tenant_id, timeline_id, &wal_producer_connstr); - - // TODO cleanup info messages - if let Err(e) = res { - info!("WAL streaming connection failed ({})", e); - } else { - info!( - "walreceiver disconnected tenant {}, timelineid {}", - tenant_id, timeline_id - ); - } - - // Drop it from list of active WAL_RECEIVERS - // so that next callmemaybe request launched a new thread - drop_wal_receiver(tenant_id, timeline_id); -} - -fn walreceiver_main( - _conf: &PageServerConf, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - wal_producer_connstr: &str, -) -> anyhow::Result<(), Error> { - // Connect to the database in replication mode. - info!("connecting to {:?}", wal_producer_connstr); - let connect_cfg = format!( - "{} application_name=pageserver replication=true", - wal_producer_connstr +/// Sets up the main WAL receiver thread that manages the rest of the subtasks inside of it, per timeline. +/// See comments in [`wal_receiver_main_thread_loop_step`] for more details on per timeline activities. +pub fn init_wal_receiver_main_thread( + conf: &'static PageServerConf, + mut timeline_updates_receiver: mpsc::UnboundedReceiver, +) -> anyhow::Result<()> { + let etcd_endpoints = conf.broker_endpoints.clone(); + ensure!( + !etcd_endpoints.is_empty(), + "Cannot start wal receiver: etcd endpoints are empty" ); - - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?; - - let (mut replication_client, connection) = - runtime.block_on(tokio_postgres::connect(&connect_cfg, NoTls))?; - // This is from tokio-postgres docs, but it is a bit weird in our case because we extensively use block_on - runtime.spawn(async move { - if let Err(e) = connection.await { - error!("connection error: {}", e); - } - }); - - info!("connected!"); - - // Immediately increment the gauge, then create a job to decrement it on thread exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } - - let identify = runtime.block_on(identify_system(&mut replication_client))?; - info!("{:?}", identify); - let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); - let mut caught_up = false; - - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("no repository found for tenant {}", tenant_id))?; - let timeline = - tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).with_context(|| { - format!( - "local timeline {} not found for tenant {}", - timeline_id, tenant_id - ) - })?; - let remote_index = repo.get_remote_index(); - - // - // Start streaming the WAL, from where we left off previously. - // - // If we had previously received WAL up to some point in the middle of a WAL record, we - // better start from the end of last full WAL record, not in the middle of one. - let mut last_rec_lsn = timeline.get_last_record_lsn(); - let mut startpoint = last_rec_lsn; - - if startpoint == Lsn(0) { - bail!("No previous WAL position"); - } - - // There might be some padding after the last full record, skip it. 
- startpoint += startpoint.calc_padding(8u32); - + let broker_prefix = &conf.broker_etcd_prefix; info!( - "last_record_lsn {} starting replication from {}, server is at {}...", - last_rec_lsn, startpoint, end_of_wal + "Starting wal receiver main thread, etdc endpoints: {}", + etcd_endpoints.iter().map(Url::to_string).join(", ") ); - let query = format!("START_REPLICATION PHYSICAL {}", startpoint); + let runtime = tokio::runtime::Builder::new_multi_thread() + .thread_name("wal-receiver-runtime-thread") + .worker_threads(40) + .enable_all() + .on_thread_start(|| IS_WAL_RECEIVER.with(|c| c.set(true))) + .build() + .context("Failed to create storage sync runtime")?; + let etcd_client = runtime + .block_on(etcd_broker::Client::connect(etcd_endpoints, None)) + .context("Failed to connect to etcd")?; - let copy_stream = runtime.block_on(replication_client.copy_both_simple(&query))?; - let physical_stream = ReplicationStream::new(copy_stream); - pin!(physical_stream); - - let mut waldecoder = WalStreamDecoder::new(startpoint); - - let mut walingest = WalIngest::new(&*timeline, startpoint)?; - - while let Some(replication_message) = runtime.block_on(async { - let shutdown_watcher = thread_mgr::shutdown_watcher(); - tokio::select! { - // check for shutdown first - biased; - _ = shutdown_watcher => { - info!("walreceiver interrupted"); - None - } - replication_message = physical_stream.next() => replication_message, - } - }) { - let replication_message = replication_message?; - let status_update = match replication_message { - ReplicationMessage::XLogData(xlog_data) => { - // Pass the WAL data to the decoder, and see if we can decode - // more records as a result. - let data = xlog_data.data(); - let startlsn = Lsn::from(xlog_data.wal_start()); - let endlsn = startlsn + data.len() as u64; - - trace!("received XLogData between {} and {}", startlsn, endlsn); - - waldecoder.feed_bytes(data); - - while let Some((lsn, recdata)) = waldecoder.poll_decode()? { - let _enter = info_span!("processing record", lsn = %lsn).entered(); - - // It is important to deal with the aligned records as lsn in getPage@LSN is - // aligned and can be several bytes bigger. Without this alignment we are - // at risk of hitting a deadlock. - anyhow::ensure!(lsn.is_aligned()); - - walingest.ingest_record(&timeline, recdata, lsn)?; - - fail_point!("walreceiver-after-ingest"); - - last_rec_lsn = lsn; + thread_mgr::spawn( + ThreadKind::WalReceiverManager, + None, + None, + "WAL receiver manager main thread", + true, + move || { + runtime.block_on(async move { + let mut local_timeline_wal_receivers = HashMap::new(); + loop { + select! { + _ = thread_mgr::shutdown_watcher() => { + info!("Shutdown signal received"); + shutdown_all_wal_connections(&mut local_timeline_wal_receivers).await; + break; + }, + _ = wal_receiver_main_thread_loop_step( + broker_prefix, + &etcd_client, + &mut timeline_updates_receiver, + &mut local_timeline_wal_receivers, + ) => {}, + } } + }.instrument(info_span!("wal_receiver_main"))); - if !caught_up && endlsn >= end_of_wal { - info!("caught up at LSN {}", endlsn); - caught_up = true; + info!("Wal receiver main thread stopped"); + Ok(()) + }, + ) + .map(|_thread_id| ()) + .context("Failed to spawn wal receiver main thread") +} + +/// A step to process timeline attach/detach events to enable/disable the corresponding WAL receiver machinery. +/// In addition to WAL streaming management, the step ensures that corresponding tenant has its service threads enabled or disabled. 
+/// This is done here, since only walreceiver knows when a certain tenant has no streaming enabled. +/// +/// Cannot fail, should always try to process the next timeline event even if the other one was not processed properly. +async fn wal_receiver_main_thread_loop_step<'a>( + broker_prefix: &'a str, + etcd_client: &'a Client, + timeline_updates_receiver: &'a mut mpsc::UnboundedReceiver, + local_timeline_wal_receivers: &'a mut HashMap< + ZTenantId, + HashMap, + >, +) { + // Only react on updates from [`tenant_mgr`] on local timeline attach/detach. + match timeline_updates_receiver.recv().await { + Some(update) => { + info!("Processing timeline update: {update:?}"); + match update { + // Timeline got detached, stop all related tasks and remove public timeline data. + LocalTimelineUpdate::Detach(id) => { + match local_timeline_wal_receivers.get_mut(&id.tenant_id) { + Some(wal_receivers) => { + if let hash_map::Entry::Occupied(mut o) = wal_receivers.entry(id.timeline_id) { + if let Err(e) = o.get_mut().shutdown(id).await { + error!("Failed to shut down timeline {id} wal receiver handle: {e:#}"); + return; + } else { + o.remove(); + } + } + if wal_receivers.is_empty() { + if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Idle).await { + error!("Failed to make tenant idle for id {id}: {e:#}"); + } + } + } + None => warn!("Timeline {id} does not have a tenant entry in wal receiver main thread"), + }; + { + WAL_RECEIVER_ENTRIES.write().await.remove(&id); + } } + // Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly. + LocalTimelineUpdate::Attach(new_id, new_timeline) => { + let timelines = local_timeline_wal_receivers + .entry(new_id.tenant_id) + .or_default(); - timeline.tline.check_checkpoint_distance()?; + if timelines.is_empty() { + if let Err(e) = + change_tenant_state(new_id.tenant_id, TenantState::Active).await + { + error!("Failed to make tenant active for id {new_id}: {e:#}"); + return; + } + } - Some(endlsn) - } + let vacant_timeline_entry = match timelines.entry(new_id.timeline_id) { + hash_map::Entry::Occupied(_) => { + debug!("Attepted to readd an existing timeline {new_id}, ignoring"); + return; + } + hash_map::Entry::Vacant(v) => v, + }; - ReplicationMessage::PrimaryKeepAlive(keepalive) => { - let wal_end = keepalive.wal_end(); - let timestamp = keepalive.timestamp(); - let reply_requested = keepalive.reply() != 0; + let (wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag) = + match fetch_tenant_settings(new_id.tenant_id).await { + Ok(settings) => settings, + Err(e) => { + error!("Failed to fetch tenant settings for id {new_id}: {e:#}"); + return; + } + }; - trace!( - "received PrimaryKeepAlive(wal_end: {}, timestamp: {:?} reply: {})", - wal_end, - timestamp, - reply_requested, - ); - - if reply_requested { - Some(last_rec_lsn) - } else { - None - } - } - - _ => None, - }; - - if let Some(last_lsn) = status_update { - let timeline_remote_consistent_lsn = runtime.block_on(async { - remote_index - .read() - .await - // here we either do not have this timeline in remote index - // or there were no checkpoints for it yet - .timeline_entry(&ZTenantTimelineId { - tenant_id, - timeline_id, - }) - .map(|remote_timeline| remote_timeline.metadata.disk_consistent_lsn()) - .unwrap_or(Lsn(0)) // no checkpoint was uploaded - }); - - // The last LSN we processed. It is not guaranteed to survive pageserver crash. 
- let write_lsn = u64::from(last_lsn); - // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data - let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn()); - // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash - // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. - let apply_lsn = u64::from(timeline_remote_consistent_lsn); - let ts = SystemTime::now(); - - // Update the current WAL receiver's data stored inside the global hash table `WAL_RECEIVERS` - { - let mut receivers = WAL_RECEIVERS.lock().unwrap(); - let entry = match receivers.get_mut(&(tenant_id, timeline_id)) { - Some(e) => e, - None => { - anyhow::bail!( - "no WAL receiver entry found for tenant {} and timeline {}", - tenant_id, - timeline_id + { + WAL_RECEIVER_ENTRIES.write().await.insert( + new_id, + WalReceiverEntry { + wal_producer_connstr: None, + last_received_msg_lsn: None, + last_received_msg_ts: None, + }, ); } - }; - entry.last_received_msg_lsn = Some(last_lsn); - entry.last_received_msg_ts = Some( - ts.duration_since(SystemTime::UNIX_EPOCH) - .expect("Received message time should be before UNIX EPOCH!") - .as_micros(), - ); + let (cancellation_sender, mut cancellation_receiver) = watch::channel(()); + let mut wal_connection_manager = WalConnectionManager { + id: new_id, + timeline: Arc::clone(&new_timeline), + wal_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + wal_connection_data: None, + wal_connection_attempt: 0, + }; + + let broker_prefix = broker_prefix.to_string(); + let mut loop_client = etcd_client.clone(); + let broker_join_handle = tokio::spawn(async move { + info!("WAL receiver broker started, connecting to etcd"); + let mut cancellation = cancellation_receiver.clone(); + loop { + select! { + _ = cancellation.changed() => { + info!("Wal broker loop cancelled, shutting down"); + break; + }, + step_result = timeline_wal_broker_loop_step( + &broker_prefix, + &mut loop_client, + &mut wal_connection_manager, + &mut cancellation_receiver, + ) => match step_result { + Ok(ControlFlow::Break(())) => { + break; + } + Ok(ControlFlow::Continue(())) => {} + Err(e) => warn!("Error during wal receiver main thread step for timeline {new_id}: {e:#}"), + } + } + } + }.instrument(info_span!("timeline", id = %new_id))); + + vacant_timeline_entry.insert(TimelineWalBrokerLoopHandle { + broker_join_handle, + cancellation_sender, + }); + } } + } + None => { + info!("Local timeline update channel closed"); + shutdown_all_wal_connections(local_timeline_wal_receivers).await; + } + } +} - // Send zenith feedback message. - // Regular standby_status_update fields are put into this message. - let zenith_status_update = ZenithFeedback { - current_timeline_size: timeline.get_current_logical_size() as u64, - ps_writelsn: write_lsn, - ps_flushlsn: flush_lsn, - ps_applylsn: apply_lsn, - ps_replytime: ts, - }; +async fn fetch_tenant_settings( + tenant_id: ZTenantId, +) -> anyhow::Result<(Duration, Duration, NonZeroU64)> { + tokio::task::spawn_blocking(move || { + let repo = tenant_mgr::get_repository_for_tenant(tenant_id) + .with_context(|| format!("no repository found for tenant {tenant_id}"))?; + Ok::<_, anyhow::Error>(( + repo.get_wal_receiver_connect_timeout(), + repo.get_lagging_wal_timeout(), + repo.get_max_lsn_wal_lag(), + )) + }) + .await + .with_context(|| format!("Failed to join on tenant {tenant_id} settings fetch task"))? 
+} - debug!("zenith_status_update {:?}", zenith_status_update); +async fn change_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { + tokio::task::spawn_blocking(move || { + tenant_mgr::set_tenant_state(tenant_id, new_state) + .with_context(|| format!("Failed to activate tenant {tenant_id}")) + }) + .await + .with_context(|| format!("Failed to spawn activation task for tenant {tenant_id}"))? +} - let mut data = BytesMut::new(); - zenith_status_update.serialize(&mut data)?; - runtime.block_on( - physical_stream - .as_mut() - .zenith_status_update(data.len() as u64, &data), - )?; +async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) { + if n == 0 { + return; + } + let seconds_to_wait = base.powf(f64::from(n) - 1.0).min(max_seconds); + info!("Backoff: waiting {seconds_to_wait} seconds before proceeding with the task"); + tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; +} + +async fn shutdown_all_wal_connections( + local_timeline_wal_receivers: &mut HashMap< + ZTenantId, + HashMap, + >, +) { + info!("Shutting down all WAL connections"); + let mut broker_join_handles = Vec::new(); + for (tenant_id, timelines) in local_timeline_wal_receivers.drain() { + for (timeline_id, handles) in timelines { + handles.cancellation_sender.send(()).ok(); + broker_join_handles.push(( + ZTenantTimelineId::new(tenant_id, timeline_id), + handles.broker_join_handle, + )); } } - Ok(()) -} - -/// Data returned from the postgres `IDENTIFY_SYSTEM` command -/// -/// See the [postgres docs] for more details. -/// -/// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html -#[derive(Debug)] -// As of nightly 2021-09-11, fields that are only read by the type's `Debug` impl still count as -// unused. Relevant issue: https://github.com/rust-lang/rust/issues/88900 -#[allow(dead_code)] -pub struct IdentifySystem { - systemid: u64, - timeline: u32, - xlogpos: PgLsn, - dbname: Option, -} - -/// There was a problem parsing the response to -/// a postgres IDENTIFY_SYSTEM command. -#[derive(Debug, thiserror::Error)] -#[error("IDENTIFY_SYSTEM parse error")] -pub struct IdentifyError; - -/// Run the postgres `IDENTIFY_SYSTEM` command -pub async fn identify_system(client: &mut Client) -> Result { - let query_str = "IDENTIFY_SYSTEM"; - let response = client.simple_query(query_str).await?; - - // get(N) from row, then parse it as some destination type. - fn get_parse(row: &SimpleQueryRow, idx: usize) -> Result - where - T: FromStr, + let mut tenants = HashSet::with_capacity(broker_join_handles.len()); + for (id, broker_join_handle) in broker_join_handles { + tenants.insert(id.tenant_id); + debug!("Waiting for wal broker for timeline {id} to finish"); + if let Err(e) = broker_join_handle.await { + error!("Failed to join on wal broker for timeline {id}: {e}"); + } + } + if let Err(e) = tokio::task::spawn_blocking(move || { + for tenant_id in tenants { + if let Err(e) = tenant_mgr::set_tenant_state(tenant_id, TenantState::Idle) { + error!("Failed to make tenant {tenant_id} idle: {e:?}"); + } + } + }) + .await { - let val = row.get(idx).ok_or(IdentifyError)?; - val.parse::().or(Err(IdentifyError)) - } - - // extract the row contents into an IdentifySystem struct. - // written as a closure so I can use ? for Option here. 
- if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) { - Ok(IdentifySystem { - systemid: get_parse(first_row, 0)?, - timeline: get_parse(first_row, 1)?, - xlogpos: get_parse(first_row, 2)?, - dbname: get_parse(first_row, 3).ok(), - }) - } else { - Err(IdentifyError.into()) + error!("Failed to spawn a task to make all tenants idle: {e:?}"); + } +} + +/// Broker WAL loop handle to cancel the loop safely when needed. +struct TimelineWalBrokerLoopHandle { + broker_join_handle: JoinHandle<()>, + cancellation_sender: watch::Sender<()>, +} + +impl TimelineWalBrokerLoopHandle { + /// Stops the broker loop, waiting for its current task to finish. + async fn shutdown(&mut self, id: ZTenantTimelineId) -> anyhow::Result<()> { + self.cancellation_sender.send(()).context( + "Unexpected: cancellation sender is dropped before the receiver in the loop is", + )?; + debug!("Waiting for wal receiver for timeline {id} to finish"); + let handle = &mut self.broker_join_handle; + handle + .await + .with_context(|| format!("Failed to join the wal reveiver broker for timeline {id}")) + } +} + +/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. +/// Based on the updates, desides whether to start, keep or stop a WAL receiver task. +async fn timeline_wal_broker_loop_step( + broker_prefix: &str, + etcd_client: &mut Client, + wal_connection_manager: &mut WalConnectionManager, + cancellation: &mut watch::Receiver<()>, +) -> anyhow::Result> { + let id = wal_connection_manager.id; + + // Endlessly try to subscribe for broker updates for a given timeline. + // If there are no safekeepers to maintain the lease, the timeline subscription will be inavailable in the broker and the operation will fail constantly. + // This is ok, pageservers should anyway try subscribing (with some backoff) since it's the only way they can get the timeline WAL anyway. + let mut broker_subscription: SkTimelineSubscription; + let mut attempt = 0; + loop { + select! { + _ = cancellation.changed() => { + info!("Subscription backoff cancelled, shutting down"); + return Ok(ControlFlow::Break(())); + }, + _ = exponential_backoff(attempt, 2.0, 60.0) => {}, + } + attempt += 1; + + select! { + _ = cancellation.changed() => { + info!("Broker subscription loop cancelled, shutting down"); + return Ok(ControlFlow::Break(())); + }, + new_subscription = etcd_broker::subscribe_to_safekeeper_timeline_updates( + etcd_client, + SkTimelineSubscriptionKind::timeline(broker_prefix.to_owned(), id), + ) + .instrument(info_span!("etcd_subscription")) => match new_subscription { + Ok(new_subscription) => { + broker_subscription = new_subscription; + break; + } + Err(e) => { + warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in etcd: {e:#}"); + continue; + } + }, + + } + } + + info!("Subscribed for etcd timeline changes, considering walreceiver connections"); + + loop { + select! { + // the order of the polls is especially important here, since the first task to complete gets selected and the others get dropped (cancelled). + // place more frequetly updated tasks below to ensure the "slow" tasks are also reacted to. + biased; + // first, the cancellations are checked, to ensure we exit eagerly + _ = cancellation.changed() => { + info!("Broker loop cancelled, shutting down"); + break; + } + // then, we check for new events from the WAL connection: the existing connection should either return some progress data, + // or block, allowing other tasks in this `select!` to run first. 
+ // + // We set a "timebomb" in the polling method, that waits long enough and cancels the entire loop if nothing happens during the wait. + // The wait is only initiated when no data (or a "channel closed" data) is received from the loop, ending with the break flow return. + // While waiting, more broker events are expected to be retrieved from etcd (currently, every safekeeper posts ~1 message/second). + // The timebomb ensures that we don't get stuck for too long on any of the WAL/etcd event polling, rather restarting the subscription entirely. + // + // We cannot return here eagerly on no WAL task data, since the result will get selected to early, not allowing etcd tasks to be polled properly. + // We cannot move etcd tasks above this select, since they are very frequent to finish and WAL events might get ignored. + // We need WAL events to periodically update the external data, so we cannot simply await the task result on the handler here. + wal_receiver_poll_result = wal_connection_manager.poll_connection_event_or_cancel() => match wal_receiver_poll_result { + ControlFlow::Break(()) => break, + ControlFlow::Continue(()) => {}, + }, + // finally, if no other tasks are completed, get another broker update and possibly reconnect + updates = broker_subscription.fetch_data() => match updates { + Some(mut all_timeline_updates) => { + if let Some(subscribed_timeline_updates) = all_timeline_updates.remove(&id) { + match wal_connection_manager.select_connection_candidate(subscribed_timeline_updates) { + Some(candidate) => { + info!("Switching to different safekeeper {} for timeline {id}, reason: {:?}", candidate.safekeeper_id, candidate.reason); + wal_connection_manager.change_connection(candidate.safekeeper_id, candidate.wal_producer_connstr).await; + }, + None => {} + } + } + }, + None => { + info!("Subscription source end was dropped, no more updates are possible, shutting down"); + break; + }, + }, + } + } + + info!("Waiting for the current connection to close"); + wal_connection_manager.close_connection().await; + broker_subscription + .cancel() + .await + .with_context(|| format!("Failed to cancel timeline {id} subscription in etcd"))?; + Ok(ControlFlow::Continue(())) +} + +/// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. +struct WalConnectionManager { + id: ZTenantTimelineId, + timeline: Arc, + wal_connect_timeout: Duration, + lagging_wal_timeout: Duration, + max_lsn_wal_lag: NonZeroU64, + wal_connection_attempt: u32, + wal_connection_data: Option, +} + +#[derive(Debug)] +struct WalConnectionData { + safekeeper_id: NodeId, + connection: WalReceiverConnection, + connection_init_time: NaiveDateTime, + last_wal_receiver_data: Option<(ZenithFeedback, NaiveDateTime)>, +} + +#[derive(Debug, PartialEq, Eq)] +struct NewWalConnectionCandidate { + safekeeper_id: NodeId, + wal_producer_connstr: String, + reason: ReconnectReason, +} + +/// Stores the reason why WAL connection was switched, for furter debugging purposes. +#[derive(Debug, PartialEq, Eq)] +enum ReconnectReason { + NoExistingConnection, + LaggingWal { + current_lsn: Lsn, + new_lsn: Lsn, + threshold: NonZeroU64, + }, + NoWalTimeout { + last_wal_interaction: NaiveDateTime, + check_time: NaiveDateTime, + threshold: Duration, + }, +} + +impl WalConnectionManager { + /// Tries to get more data from the WAL connection. + /// If the WAL connection channel is dropped or no data is retrieved, a "timebomb" future is started to break the existing broker subscription. 
+ /// This future is intended to be used in the `select!` loop, so lengthy future normally gets dropped due to other futures completing. + /// If not, it's better to cancel the entire "stuck" loop and start over. + async fn poll_connection_event_or_cancel(&mut self) -> ControlFlow<(), ()> { + let (connection_data, wal_receiver_event) = match self.wal_connection_data.as_mut() { + Some(connection_data) => match connection_data.connection.next_event().await { + Some(event) => (connection_data, event), + None => { + warn!("WAL receiver event source stopped sending messages, waiting for other events to arrive"); + tokio::time::sleep(Duration::from_secs(30)).await; + warn!("WAL receiver without a connection spent sleeping 30s without being interrupted, aborting the loop"); + return ControlFlow::Break(()); + } + }, + None => { + tokio::time::sleep(Duration::from_secs(30)).await; + warn!("WAL receiver without a connection spent sleeping 30s without being interrupted, aborting the loop"); + return ControlFlow::Break(()); + } + }; + + match wal_receiver_event { + WalConnectionEvent::Started => { + self.wal_connection_attempt = 0; + } + WalConnectionEvent::NewWal(new_wal_data) => { + self.wal_connection_attempt = 0; + connection_data.last_wal_receiver_data = + Some((new_wal_data, Utc::now().naive_utc())); + } + WalConnectionEvent::End(wal_receiver_result) => { + match wal_receiver_result { + Ok(()) => { + info!("WAL receiver task finished, reconnecting"); + self.wal_connection_attempt = 0; + } + Err(e) => { + error!("WAL receiver task failed: {e:#}, reconnecting"); + self.wal_connection_attempt += 1; + } + } + self.close_connection().await; + } + } + + ControlFlow::Continue(()) + } + + /// Shuts down current connection (if any), waiting for it to finish. + async fn close_connection(&mut self) { + if let Some(data) = self.wal_connection_data.as_mut() { + match data.connection.shutdown().await { + Err(e) => { + error!("Failed to shutdown wal receiver connection: {e:#}"); + } + Ok(()) => self.wal_connection_data = None, + } + } + } + + /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. + async fn change_connection( + &mut self, + new_safekeeper_id: NodeId, + new_wal_producer_connstr: String, + ) { + self.close_connection().await; + self.wal_connection_data = Some(WalConnectionData { + safekeeper_id: new_safekeeper_id, + connection: WalReceiverConnection::open( + self.id, + new_safekeeper_id, + new_wal_producer_connstr, + self.wal_connect_timeout, + ), + connection_init_time: Utc::now().naive_utc(), + last_wal_receiver_data: None, + }); + } + + /// Checks current state against every fetched safekeeper state of a given timeline. + /// Returns a new candidate, if the current state is somewhat lagging, or `None` otherwise. + /// The current rules for approving new candidates: + /// * pick the safekeeper with biggest `commit_lsn` that's after than pageserver's latest Lsn for the timeline + /// * if the leader is a different SK and either + /// * no WAL updates happened after certain time (either none since the connection time or none since the last event after the connection) — reconnect + /// * same time amount had passed since the connection, WAL updates happened recently, but the new leader SK has timeline Lsn way ahead of the old one — reconnect + /// + /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently. + /// Both thresholds are configured per tenant. 
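+    /// For example (hypothetical numbers): with `max_lsn_wal_lag = 1_000_000` and `lagging_wal_timeout = 10s`,
+    /// a candidate whose `commit_lsn` is 2_000_000 bytes ahead of the WAL last received over the current connection
+    /// is switched to immediately, while a candidate that is only slightly ahead is switched to only after
+    /// 10 seconds pass without any WAL arriving over the current connection.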
+ fn select_connection_candidate( + &self, + safekeeper_timelines: HashMap, + ) -> Option { + let (&new_sk_id, new_sk_timeline, new_wal_producer_connstr) = safekeeper_timelines + .iter() + .filter(|(_, info)| { + info.commit_lsn > Some(self.timeline.tline.get_last_record_lsn()) + }) + .filter_map(|(sk_id, info)| { + match wal_stream_connection_string( + self.id, + info.safekeeper_connstr.as_deref()?, + info.pageserver_connstr.as_deref()?, + ) { + Ok(connstr) => Some((sk_id, info, connstr)), + Err(e) => { + error!("Failed to create wal receiver connection string from broker data of safekeeper node {sk_id}: {e:#}"); + None + } + } + }) + .max_by_key(|(_, info, _)| info.commit_lsn)?; + + match self.wal_connection_data.as_ref() { + None => Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_producer_connstr: new_wal_producer_connstr, + reason: ReconnectReason::NoExistingConnection, + }), + Some(current_connection) => { + if current_connection.safekeeper_id == new_sk_id { + None + } else { + self.reason_to_reconnect(current_connection, new_sk_timeline) + .map(|reason| NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_producer_connstr: new_wal_producer_connstr, + reason, + }) + } + } + } + } + + fn reason_to_reconnect( + &self, + current_connection: &WalConnectionData, + new_sk_timeline: &SkTimelineInfo, + ) -> Option { + let last_sk_interaction_time = match current_connection.last_wal_receiver_data.as_ref() { + Some((last_wal_receiver_data, data_submission_time)) => { + let new_lsn = new_sk_timeline.commit_lsn?; + match new_lsn.0.checked_sub(last_wal_receiver_data.ps_writelsn) + { + Some(sk_lsn_advantage) => { + if sk_lsn_advantage >= self.max_lsn_wal_lag.get() { + return Some(ReconnectReason::LaggingWal { current_lsn: Lsn(last_wal_receiver_data.ps_writelsn), new_lsn, threshold: self.max_lsn_wal_lag }); + } + } + None => debug!("Best SK candidate has its commit Lsn behind the current timeline's latest consistent Lsn"), + } + *data_submission_time + } + None => current_connection.connection_init_time, + }; + + let now = Utc::now().naive_utc(); + match (now - last_sk_interaction_time).to_std() { + Ok(last_interaction) => { + if last_interaction > self.lagging_wal_timeout { + return Some(ReconnectReason::NoWalTimeout { + last_wal_interaction: last_sk_interaction_time, + check_time: now, + threshold: self.lagging_wal_timeout, + }); + } + } + Err(_e) => { + warn!("Last interaction with safekeeper {} happened in the future, ignoring the candidate. 
Interaction time: {last_sk_interaction_time}, now: {now}", + current_connection.safekeeper_id); + } + } + None + } +} + +fn wal_stream_connection_string( + ZTenantTimelineId { + tenant_id, + timeline_id, + }: ZTenantTimelineId, + listen_pg_addr_str: &str, + pageserver_connstr: &str, +) -> anyhow::Result { + let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db"); + let me_conf = sk_connstr + .parse::() + .with_context(|| { + format!("Failed to parse pageserver connection string '{sk_connstr}' as a postgres one") + })?; + let (host, port) = utils::connstring::connection_host_port(&me_conf); + Ok(format!( + "host={host} port={port} options='-c ztimelineid={timeline_id} ztenantid={tenant_id} pageserver_connstr={pageserver_connstr}'", + )) +} + +#[cfg(test)] +mod tests { + use std::time::SystemTime; + + use crate::repository::{ + repo_harness::{RepoHarness, TIMELINE_ID}, + Repository, + }; + + use super::*; + + #[test] + fn no_connection_no_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("no_connection_no_candidate")?; + let mut data_manager_with_no_connection = dummy_wal_connection_manager(&harness); + data_manager_with_no_connection.wal_connection_data = None; + + let no_candidate = + data_manager_with_no_connection.select_connection_candidate(HashMap::from([ + ( + NodeId(0), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(1)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: None, + pageserver_connstr: Some("no safekeeper_connstr".to_string()), + }, + ), + ( + NodeId(1), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(1)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("no pageserver_connstr".to_string()), + pageserver_connstr: None, + }, + ), + ( + NodeId(2), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: None, + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("no commit_lsn".to_string()), + pageserver_connstr: Some("no commit_lsn (p)".to_string()), + }, + ), + ( + NodeId(3), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: None, + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("no commit_lsn".to_string()), + pageserver_connstr: Some("no commit_lsn (p)".to_string()), + }, + ), + ])); + + assert!( + no_candidate.is_none(), + "Expected no candidate selected out of non full data options, but got {no_candidate:?}" + ); + + Ok(()) + } + + #[tokio::test] + async fn connection_no_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("connection_no_candidate")?; + + let current_lsn = 100_000; + let connected_sk_id = NodeId(0); + let mut data_manager_with_connection = dummy_wal_connection_manager(&harness); + let mut dummy_connection_data = dummy_connection_data( + ZTenantTimelineId { + tenant_id: harness.tenant_id, + timeline_id: TIMELINE_ID, + }, + connected_sk_id, + ) + .await; + let now = Utc::now().naive_utc(); + dummy_connection_data.last_wal_receiver_data = Some(( + ZenithFeedback { + current_timeline_size: 1, + ps_writelsn: 1, + ps_applylsn: current_lsn, + ps_flushlsn: 1, + ps_replytime: SystemTime::now(), + }, + now, + )); + dummy_connection_data.connection_init_time = now; + data_manager_with_connection.wal_connection_data = Some(dummy_connection_data); + + let no_candidate = + 
data_manager_with_connection.select_connection_candidate(HashMap::from([ + ( + connected_sk_id, + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn( + current_lsn + data_manager_with_connection.max_lsn_wal_lag.get() * 2 + )), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), + }, + ), + ( + NodeId(1), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(current_lsn)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("not advanced Lsn".to_string()), + pageserver_connstr: Some("not advanced Lsn (p)".to_string()), + }, + ), + ( + NodeId(2), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn( + current_lsn + data_manager_with_connection.max_lsn_wal_lag.get() / 2 + )), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("not enough advanced Lsn".to_string()), + pageserver_connstr: Some("not enough advanced Lsn (p)".to_string()), + }, + ), + ])); + + assert!( + no_candidate.is_none(), + "Expected no candidate selected out of valid options since candidate Lsn data is ignored and others' was not advanced enough, but got {no_candidate:?}" + ); + + Ok(()) + } + + #[test] + fn no_connection_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("no_connection_candidate")?; + let mut data_manager_with_no_connection = dummy_wal_connection_manager(&harness); + data_manager_with_no_connection.wal_connection_data = None; + + let only_candidate = data_manager_with_no_connection + .select_connection_candidate(HashMap::from([( + NodeId(0), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(1 + data_manager_with_no_connection + .max_lsn_wal_lag + .get())), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), + }, + )])) + .expect("Expected one candidate selected out of the only data option, but got none"); + assert_eq!(only_candidate.safekeeper_id, NodeId(0)); + assert_eq!( + only_candidate.reason, + ReconnectReason::NoExistingConnection, + "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold" + ); + assert!(only_candidate + .wal_producer_connstr + .contains(DUMMY_SAFEKEEPER_CONNSTR)); + assert!(only_candidate + .wal_producer_connstr + .contains(DUMMY_PAGESERVER_CONNSTR)); + + let selected_lsn = 100_000; + let biggest_wal_candidate = data_manager_with_no_connection + .select_connection_candidate(HashMap::from([ + ( + NodeId(0), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(selected_lsn - 100)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("smaller commit_lsn".to_string()), + pageserver_connstr: Some("smaller commit_lsn (p)".to_string()), + }, + ), + ( + NodeId(1), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(selected_lsn)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), + }, + ), + ( + NodeId(2), + 
SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(selected_lsn + 100)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: None, + pageserver_connstr: Some( + "no safekeeper_connstr despite bigger commit_lsn".to_string(), + ), + }, + ), + ])) + .expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(biggest_wal_candidate.safekeeper_id, NodeId(1)); + assert_eq!( + biggest_wal_candidate.reason, + ReconnectReason::NoExistingConnection, + "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold" + ); + assert!(biggest_wal_candidate + .wal_producer_connstr + .contains(DUMMY_SAFEKEEPER_CONNSTR)); + assert!(biggest_wal_candidate + .wal_producer_connstr + .contains(DUMMY_PAGESERVER_CONNSTR)); + + Ok(()) + } + + #[tokio::test] + async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let current_lsn = Lsn(100_000).align(); + + let id = ZTenantTimelineId { + tenant_id: harness.tenant_id, + timeline_id: TIMELINE_ID, + }; + + let mut data_manager_with_connection = dummy_wal_connection_manager(&harness); + let connected_sk_id = NodeId(0); + let mut dummy_connection_data = dummy_connection_data(id, NodeId(0)).await; + let lagging_wal_timeout = + chrono::Duration::from_std(data_manager_with_connection.lagging_wal_timeout)?; + let time_over_threshold = + Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout; + dummy_connection_data.last_wal_receiver_data = Some(( + ZenithFeedback { + current_timeline_size: 1, + ps_writelsn: current_lsn.0, + ps_applylsn: 1, + ps_flushlsn: 1, + ps_replytime: SystemTime::now(), + }, + time_over_threshold, + )); + dummy_connection_data.connection_init_time = time_over_threshold; + data_manager_with_connection.wal_connection_data = Some(dummy_connection_data); + + let new_lsn = Lsn(current_lsn.0 + data_manager_with_connection.max_lsn_wal_lag.get() + 1); + let candidates = HashMap::from([ + ( + connected_sk_id, + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(current_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), + }, + ), + ( + NodeId(1), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(new_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()), + pageserver_connstr: Some("advanced by Lsn safekeeper (p)".to_string()), + }, + ), + ]); + + let over_threshcurrent_candidate = data_manager_with_connection + .select_connection_candidate(candidates) + .expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(1)); + assert_eq!( + over_threshcurrent_candidate.reason, + ReconnectReason::LaggingWal { + current_lsn, + new_lsn, + threshold: data_manager_with_connection.max_lsn_wal_lag + }, + "Should select bigger WAL safekeeper if it starts to lag enough" + ); + assert!(over_threshcurrent_candidate + .wal_producer_connstr + .contains("advanced by Lsn safekeeper")); + assert!(over_threshcurrent_candidate + .wal_producer_connstr + .contains("advanced by Lsn 
safekeeper (p)")); + + Ok(()) + } + + #[tokio::test] + async fn timeout_wal_over_threshcurrent_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("timeout_wal_over_threshcurrent_candidate")?; + let current_lsn = Lsn(100_000).align(); + + let id = ZTenantTimelineId { + tenant_id: harness.tenant_id, + timeline_id: TIMELINE_ID, + }; + + let mut data_manager_with_connection = dummy_wal_connection_manager(&harness); + let mut dummy_connection_data = dummy_connection_data(id, NodeId(1)).await; + let lagging_wal_timeout = + chrono::Duration::from_std(data_manager_with_connection.lagging_wal_timeout)?; + let time_over_threshold = + Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout; + dummy_connection_data.last_wal_receiver_data = None; + dummy_connection_data.connection_init_time = time_over_threshold; + data_manager_with_connection.wal_connection_data = Some(dummy_connection_data); + + let new_lsn = Lsn(current_lsn.0 + data_manager_with_connection.max_lsn_wal_lag.get() + 1); + let over_threshcurrent_candidate = data_manager_with_connection + .select_connection_candidate(HashMap::from([ + ( + NodeId(0), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(new_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), + }, + ), + ( + NodeId(1), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(current_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("not advanced by Lsn safekeeper".to_string()), + pageserver_connstr: Some("not advanced by Lsn safekeeper".to_string()), + }, + ), + ])) + .expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0)); + match over_threshcurrent_candidate.reason { + ReconnectReason::NoWalTimeout { + last_wal_interaction, + threshold, + .. 
+ } => { + assert_eq!(last_wal_interaction, time_over_threshold); + assert_eq!(threshold, data_manager_with_connection.lagging_wal_timeout); + } + unexpected => panic!("Unexpected reason: {unexpected:?}"), + } + assert!(over_threshcurrent_candidate + .wal_producer_connstr + .contains(DUMMY_SAFEKEEPER_CONNSTR)); + assert!(over_threshcurrent_candidate + .wal_producer_connstr + .contains(DUMMY_PAGESERVER_CONNSTR)); + + Ok(()) + } + + fn dummy_wal_connection_manager(harness: &RepoHarness) -> WalConnectionManager { + WalConnectionManager { + id: ZTenantTimelineId { + tenant_id: harness.tenant_id, + timeline_id: TIMELINE_ID, + }, + timeline: Arc::new(DatadirTimelineImpl::new( + harness + .load() + .create_empty_timeline(TIMELINE_ID, Lsn(0)) + .expect("Failed to create an empty timeline for dummy wal connection manager"), + 10_000, + )), + wal_connect_timeout: Duration::from_secs(1), + lagging_wal_timeout: Duration::from_secs(10), + max_lsn_wal_lag: NonZeroU64::new(300_000).unwrap(), + wal_connection_attempt: 0, + wal_connection_data: None, + } + } + + const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr"; + const DUMMY_PAGESERVER_CONNSTR: &str = "pageserver_connstr"; + + // the function itself does not need async, but it spawns a tokio::task underneath hence neeed + // a runtime to not to panic + async fn dummy_connection_data( + id: ZTenantTimelineId, + safekeeper_id: NodeId, + ) -> WalConnectionData { + let dummy_connstr = + wal_stream_connection_string(id, DUMMY_SAFEKEEPER_CONNSTR, DUMMY_PAGESERVER_CONNSTR) + .expect("Failed to construct dummy wal producer connstr"); + WalConnectionData { + safekeeper_id, + connection: WalReceiverConnection::open( + id, + safekeeper_id, + dummy_connstr, + Duration::from_secs(1), + ), + connection_init_time: Utc::now().naive_utc(), + last_wal_receiver_data: None, + } } } diff --git a/pageserver/src/walreceiver/connection_handler.rs b/pageserver/src/walreceiver/connection_handler.rs new file mode 100644 index 0000000000..aaccee9730 --- /dev/null +++ b/pageserver/src/walreceiver/connection_handler.rs @@ -0,0 +1,405 @@ +//! Actual Postgres connection handler to stream WAL to the server. +//! Runs as a separate, cancellable Tokio task. +use std::{ + str::FromStr, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use anyhow::{bail, ensure, Context}; +use bytes::BytesMut; +use fail::fail_point; +use postgres::{SimpleQueryMessage, SimpleQueryRow}; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_protocol::message::backend::ReplicationMessage; +use postgres_types::PgLsn; +use tokio::{pin, select, sync::watch, time}; +use tokio_postgres::{replication::ReplicationStream, Client}; +use tokio_stream::StreamExt; +use tracing::{debug, error, info, info_span, trace, warn, Instrument}; +use utils::{ + lsn::Lsn, + pq_proto::ZenithFeedback, + zid::{NodeId, ZTenantTimelineId}, +}; + +use crate::{ + http::models::WalReceiverEntry, + repository::{Repository, Timeline}, + tenant_mgr, + walingest::WalIngest, +}; + +#[derive(Debug, Clone)] +pub enum WalConnectionEvent { + Started, + NewWal(ZenithFeedback), + End(Result<(), String>), +} + +/// A wrapper around standalone Tokio task, to poll its updates or cancel the task. +#[derive(Debug)] +pub struct WalReceiverConnection { + handle: tokio::task::JoinHandle<()>, + cancellation: watch::Sender<()>, + events_receiver: watch::Receiver, +} + +impl WalReceiverConnection { + /// Initializes the connection task, returning a set of handles on top of it. 
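(Editorial sketch, not part of the patch.) The event plumbing declared above hands out a tokio `watch` receiver, which retains only the most recent value — hence the note further down that events arriving between observations are lost. A standalone illustration of that behaviour:

```rust
use tokio::sync::watch;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = watch::channel(0u32);
    tx.send(1).unwrap();
    tx.send(2).unwrap(); // overwrites 1 before the receiver ever looks at it
    rx.changed().await.unwrap();
    assert_eq!(*rx.borrow(), 2); // only the latest value is observable
}
```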
+ /// The task is started immediately after the creation, fails if no connection is established during the timeout given. + pub fn open( + id: ZTenantTimelineId, + safekeeper_id: NodeId, + wal_producer_connstr: String, + connect_timeout: Duration, + ) -> Self { + let (cancellation, mut cancellation_receiver) = watch::channel(()); + let (events_sender, events_receiver) = watch::channel(WalConnectionEvent::Started); + + let handle = tokio::spawn( + async move { + let connection_result = handle_walreceiver_connection( + id, + &wal_producer_connstr, + &events_sender, + &mut cancellation_receiver, + connect_timeout, + ) + .await + .map_err(|e| { + format!("Walreceiver connection for id {id} failed with error: {e:#}") + }); + + match &connection_result { + Ok(()) => { + debug!("Walreceiver connection for id {id} ended successfully") + } + Err(e) => warn!("{e}"), + } + events_sender + .send(WalConnectionEvent::End(connection_result)) + .ok(); + } + .instrument(info_span!("safekeeper_handle", sk = %safekeeper_id)), + ); + + Self { + handle, + cancellation, + events_receiver, + } + } + + /// Polls for the next WAL receiver event, if there's any available since the last check. + /// Blocks if there's no new event available, returns `None` if no new events will ever occur. + /// Only the last event is returned, all events received between observatins are lost. + pub async fn next_event(&mut self) -> Option { + match self.events_receiver.changed().await { + Ok(()) => Some(self.events_receiver.borrow().clone()), + Err(_cancellation_error) => None, + } + } + + /// Gracefully aborts current WAL streaming task, waiting for the current WAL streamed. + pub async fn shutdown(&mut self) -> anyhow::Result<()> { + self.cancellation.send(()).ok(); + let handle = &mut self.handle; + handle + .await + .context("Failed to join on a walreceiver connection task")?; + Ok(()) + } +} + +async fn handle_walreceiver_connection( + id: ZTenantTimelineId, + wal_producer_connstr: &str, + events_sender: &watch::Sender, + cancellation: &mut watch::Receiver<()>, + connect_timeout: Duration, +) -> anyhow::Result<()> { + // Connect to the database in replication mode. + info!("connecting to {wal_producer_connstr}"); + let connect_cfg = + format!("{wal_producer_connstr} application_name=pageserver replication=true"); + + let (mut replication_client, connection) = time::timeout( + connect_timeout, + tokio_postgres::connect(&connect_cfg, postgres::NoTls), + ) + .await + .context("Timed out while waiting for walreceiver connection to open")? + .context("Failed to open walreceiver conection")?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. + let mut connection_cancellation = cancellation.clone(); + tokio::spawn( + async move { + info!("connected!"); + select! { + connection_result = connection => match connection_result{ + Ok(()) => info!("Walreceiver db connection closed"), + Err(connection_error) => { + if connection_error.is_closed() { + info!("Connection closed regularly: {connection_error}") + } else { + warn!("Connection aborted: {connection_error}") + } + } + }, + + _ = connection_cancellation.changed() => info!("Connection cancelled"), + } + } + .instrument(info_span!("safekeeper_handle_db")), + ); + + // Immediately increment the gauge, then create a job to decrement it on task exit. + // One of the pros of `defer!` is that this will *most probably* + // get called, even in presence of panics. 
+ let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); + gauge.inc(); + scopeguard::defer! { + gauge.dec(); + } + + let identify = identify_system(&mut replication_client).await?; + info!("{identify:?}"); + let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); + let mut caught_up = false; + let ZTenantTimelineId { + tenant_id, + timeline_id, + } = id; + + let (repo, timeline) = tokio::task::spawn_blocking(move || { + let repo = tenant_mgr::get_repository_for_tenant(tenant_id) + .with_context(|| format!("no repository found for tenant {tenant_id}"))?; + let timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id) + .with_context(|| { + format!("local timeline {timeline_id} not found for tenant {tenant_id}") + })?; + Ok::<_, anyhow::Error>((repo, timeline)) + }) + .await + .with_context(|| format!("Failed to spawn blocking task to get repository and timeline for tenant {tenant_id} timeline {timeline_id}"))??; + + // + // Start streaming the WAL, from where we left off previously. + // + // If we had previously received WAL up to some point in the middle of a WAL record, we + // better start from the end of last full WAL record, not in the middle of one. + let mut last_rec_lsn = timeline.get_last_record_lsn(); + let mut startpoint = last_rec_lsn; + + if startpoint == Lsn(0) { + bail!("No previous WAL position"); + } + + // There might be some padding after the last full record, skip it. + startpoint += startpoint.calc_padding(8u32); + + info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, server is at {end_of_wal}..."); + + let query = format!("START_REPLICATION PHYSICAL {startpoint}"); + + let copy_stream = replication_client.copy_both_simple(&query).await?; + let physical_stream = ReplicationStream::new(copy_stream); + pin!(physical_stream); + + let mut waldecoder = WalStreamDecoder::new(startpoint); + + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?; + + while let Some(replication_message) = { + select! { + // check for shutdown first + biased; + _ = cancellation.changed() => { + info!("walreceiver interrupted"); + None + } + replication_message = physical_stream.next() => replication_message, + } + } { + let replication_message = replication_message?; + let status_update = match replication_message { + ReplicationMessage::XLogData(xlog_data) => { + // Pass the WAL data to the decoder, and see if we can decode + // more records as a result. + let data = xlog_data.data(); + let startlsn = Lsn::from(xlog_data.wal_start()); + let endlsn = startlsn + data.len() as u64; + + trace!("received XLogData between {startlsn} and {endlsn}"); + + waldecoder.feed_bytes(data); + + while let Some((lsn, recdata)) = waldecoder.poll_decode()? { + let _enter = info_span!("processing record", lsn = %lsn).entered(); + + // It is important to deal with the aligned records as lsn in getPage@LSN is + // aligned and can be several bytes bigger. Without this alignment we are + // at risk of hitting a deadlock. + ensure!(lsn.is_aligned()); + + walingest.ingest_record(&timeline, recdata, lsn)?; + + fail_point!("walreceiver-after-ingest"); + + last_rec_lsn = lsn; + } + + if !caught_up && endlsn >= end_of_wal { + info!("caught up at LSN {endlsn}"); + caught_up = true; + } + + let timeline_to_check = Arc::clone(&timeline.tline); + tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) + .await + .with_context(|| { + format!("Spawned checkpoint check task panicked for timeline {id}") + })? 
+ .with_context(|| { + format!("Failed to check checkpoint distance for timeline {id}") + })?; + + Some(endlsn) + } + + ReplicationMessage::PrimaryKeepAlive(keepalive) => { + let wal_end = keepalive.wal_end(); + let timestamp = keepalive.timestamp(); + let reply_requested = keepalive.reply() != 0; + + trace!("received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})"); + + if reply_requested { + Some(last_rec_lsn) + } else { + None + } + } + + _ => None, + }; + + if let Some(last_lsn) = status_update { + let remote_index = repo.get_remote_index(); + let timeline_remote_consistent_lsn = remote_index + .read() + .await + // here we either do not have this timeline in remote index + // or there were no checkpoints for it yet + .timeline_entry(&ZTenantTimelineId { + tenant_id, + timeline_id, + }) + .map(|remote_timeline| remote_timeline.metadata.disk_consistent_lsn()) + // no checkpoint was uploaded + .unwrap_or(Lsn(0)); + + // The last LSN we processed. It is not guaranteed to survive pageserver crash. + let write_lsn = u64::from(last_lsn); + // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data + let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn()); + // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash + // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. + let apply_lsn = u64::from(timeline_remote_consistent_lsn); + let ts = SystemTime::now(); + + // Update the current WAL receiver's data stored inside the global hash table `WAL_RECEIVERS` + { + super::WAL_RECEIVER_ENTRIES.write().await.insert( + id, + WalReceiverEntry { + wal_producer_connstr: Some(wal_producer_connstr.to_owned()), + last_received_msg_lsn: Some(last_lsn), + last_received_msg_ts: Some( + ts.duration_since(SystemTime::UNIX_EPOCH) + .expect("Received message time should be before UNIX EPOCH!") + .as_micros(), + ), + }, + ); + } + + // Send zenith feedback message. + // Regular standby_status_update fields are put into this message. + let zenith_status_update = ZenithFeedback { + current_timeline_size: timeline.get_current_logical_size() as u64, + ps_writelsn: write_lsn, + ps_flushlsn: flush_lsn, + ps_applylsn: apply_lsn, + ps_replytime: ts, + }; + + debug!("zenith_status_update {zenith_status_update:?}"); + + let mut data = BytesMut::new(); + zenith_status_update.serialize(&mut data)?; + physical_stream + .as_mut() + .zenith_status_update(data.len() as u64, &data) + .await?; + if let Err(e) = events_sender.send(WalConnectionEvent::NewWal(zenith_status_update)) { + warn!("Wal connection event listener dropped, aborting the connection: {e}"); + return Ok(()); + } + } + } + + Ok(()) +} + +/// Data returned from the postgres `IDENTIFY_SYSTEM` command +/// +/// See the [postgres docs] for more details. +/// +/// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html +#[derive(Debug)] +// As of nightly 2021-09-11, fields that are only read by the type's `Debug` impl still count as +// unused. Relevant issue: https://github.com/rust-lang/rust/issues/88900 +#[allow(dead_code)] +struct IdentifySystem { + systemid: u64, + timeline: u32, + xlogpos: PgLsn, + dbname: Option, +} + +/// There was a problem parsing the response to +/// a postgres IDENTIFY_SYSTEM command. 
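(Editorial note, not part of the patch.) For context on the parsing below: `IDENTIFY_SYSTEM` returns a single row of four columns — system identifier, timeline ID, current WAL flush position, and database name, the last usually NULL on a physical replication connection. A small standalone illustration of parsing the LSN column the same way the helper does (the value is invented):

```rust
use postgres_types::PgLsn;

fn main() {
    let xlogpos_column = "0/16960E8"; // example text from the xlogpos column
    if let Ok(xlogpos) = xlogpos_column.parse::<PgLsn>() {
        println!("current WAL position: {xlogpos:?}");
    }
}
```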
+#[derive(Debug, thiserror::Error)] +#[error("IDENTIFY_SYSTEM parse error")] +struct IdentifyError; + +/// Run the postgres `IDENTIFY_SYSTEM` command +async fn identify_system(client: &mut Client) -> anyhow::Result { + let query_str = "IDENTIFY_SYSTEM"; + let response = client.simple_query(query_str).await?; + + // get(N) from row, then parse it as some destination type. + fn get_parse(row: &SimpleQueryRow, idx: usize) -> Result + where + T: FromStr, + { + let val = row.get(idx).ok_or(IdentifyError)?; + val.parse::().or(Err(IdentifyError)) + } + + // extract the row contents into an IdentifySystem struct. + // written as a closure so I can use ? for Option here. + if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) { + Ok(IdentifySystem { + systemid: get_parse(first_row, 0)?, + timeline: get_parse(first_row, 1)?, + xlogpos: get_parse(first_row, 2)?, + dbname: get_parse(first_row, 3).ok(), + }) + } else { + Err(IdentifyError.into()) + } +} diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 9feb984c4f..5ce2591ff3 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -16,7 +16,8 @@ use toml_edit::Document; use tracing::*; use url::{ParseError, Url}; -use safekeeper::control_file::{self}; +use safekeeper::broker; +use safekeeper::control_file; use safekeeper::defaults::{ DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, }; @@ -26,7 +27,6 @@ use safekeeper::timeline::GlobalTimelines; use safekeeper::wal_backup; use safekeeper::wal_service; use safekeeper::SafeKeeperConf; -use safekeeper::{broker, callmemaybe}; use utils::{ http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener, zid::NodeId, @@ -272,9 +272,8 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; - let (callmemaybe_tx, callmemaybe_rx) = mpsc::unbounded_channel(); let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); - GlobalTimelines::init(callmemaybe_tx, wal_backup_launcher_tx); + GlobalTimelines::init(wal_backup_launcher_tx); let conf_ = conf.clone(); threads.push( @@ -296,29 +295,14 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let safekeeper_thread = thread::Builder::new() .name("Safekeeper thread".into()) .spawn(|| { - // thread code - let thread_result = wal_service::thread_main(conf_cloned, pg_listener); - if let Err(e) = thread_result { - info!("safekeeper thread terminated: {}", e); + if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener) { + info!("safekeeper thread terminated: {e}"); } }) .unwrap(); threads.push(safekeeper_thread); - let conf_cloned = conf.clone(); - let callmemaybe_thread = thread::Builder::new() - .name("callmemaybe thread".into()) - .spawn(|| { - // thread code - let thread_result = callmemaybe::thread_main(conf_cloned, callmemaybe_rx); - if let Err(e) = thread_result { - error!("callmemaybe thread terminated: {}", e); - } - }) - .unwrap(); - threads.push(callmemaybe_thread); - if !conf.broker_endpoints.is_empty() { let conf_ = conf.clone(); threads.push( diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 1fae9b00f8..f328d2e85a 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -8,7 +8,6 @@ use url::Url; use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId}; pub mod broker; -pub mod callmemaybe; pub mod control_file; pub mod control_file_upgrade; pub mod 
handler; diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index a89ed18071..7a6a8ca9b9 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -8,7 +8,6 @@ use anyhow::{bail, Context, Result}; use postgres_ffi::xlog_utils::{get_current_timestamp, TimestampTz, MAX_SEND_SIZE}; -use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::cmp::min; @@ -17,7 +16,6 @@ use std::sync::Arc; use std::thread::sleep; use std::time::Duration; use std::{str, thread}; -use tokio::sync::mpsc::UnboundedSender; use tracing::*; use utils::{ bin_ser::BeSer, @@ -25,7 +23,6 @@ use utils::{ postgres_backend::PostgresBackend, pq_proto::{BeMessage, FeMessage, WalSndKeepAlive, XLogDataBody, ZenithFeedback}, sock_split::ReadStream, - zid::{ZTenantId, ZTimelineId}, }; // See: https://www.postgresql.org/docs/13/protocol-replication.html @@ -83,40 +80,6 @@ impl Drop for ReplicationConnGuard { } } -// XXX: Naming is a bit messy here. -// This ReplicationStreamGuard lives as long as ReplicationConn -// and current ReplicationConnGuard is tied to the background thread -// that receives feedback. -struct ReplicationStreamGuard { - tx: UnboundedSender, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - pageserver_connstr: String, -} - -impl Drop for ReplicationStreamGuard { - fn drop(&mut self) { - // the connection with pageserver is lost, - // resume callback subscription - debug!( - "Connection to pageserver is gone. Resume callmemaybe subsciption if necessary. tenantid {} timelineid {}", - self.tenant_id, self.timeline_id, - ); - - let subscription_key = SubscriptionStateKey::new( - self.tenant_id, - self.timeline_id, - self.pageserver_connstr.to_owned(), - ); - - self.tx - .send(CallmeEvent::Resume(subscription_key)) - .unwrap_or_else(|e| { - error!("failed to send Resume request to callmemaybe thread {}", e); - }); - } -} - impl ReplicationConn { /// Create a new `ReplicationConn` pub fn new(pgb: &mut PostgresBackend) -> Self { @@ -256,36 +219,6 @@ impl ReplicationConn { }; info!("Start replication from {:?} till {:?}", start_pos, stop_pos); - // Don't spam pageserver with callmemaybe queries - // when replication connection with pageserver is already established. 
- let _guard = { - if spg.appname == Some("wal_proposer_recovery".to_string()) { - None - } else { - let pageserver_connstr = pageserver_connstr.expect("there should be a pageserver connection string since this is not a wal_proposer_recovery"); - let zttid = spg.timeline.get().zttid; - let tx_clone = spg.timeline.get().callmemaybe_tx.clone(); - let subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.clone(), - ); - tx_clone - .send(CallmeEvent::Pause(subscription_key)) - .unwrap_or_else(|e| { - error!("failed to send Pause request to callmemaybe thread {}", e); - }); - - // create a guard to subscribe callback again, when this connection will exit - Some(ReplicationStreamGuard { - tx: tx_clone, - tenant_id: zttid.tenant_id, - timeline_id: zttid.timeline_id, - pageserver_connstr, - }) - } - }; - // switch to copy pgb.write_message(&BeMessage::CopyBothResponse)?; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 2fc5bcc1f6..b7a549fef8 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -16,7 +16,7 @@ use std::fs::{self}; use std::sync::{Arc, Condvar, Mutex, MutexGuard}; use std::time::Duration; -use tokio::sync::mpsc::{Sender, UnboundedSender}; +use tokio::sync::mpsc::Sender; use tracing::*; use utils::{ @@ -25,7 +25,6 @@ use utils::{ zid::{NodeId, ZTenantId, ZTenantTimelineId}, }; -use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; use crate::control_file; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, @@ -191,79 +190,33 @@ impl SharedState { self.wal_backup_active } - /// start/change walsender (via callmemaybe). - fn callmemaybe_sub( + /// Activate timeline's walsender: start/change timeline information propagated into etcd for further pageserver connections. + fn activate_walsender( &mut self, zttid: &ZTenantTimelineId, - pageserver_connstr: Option<&String>, - callmemaybe_tx: &UnboundedSender, - ) -> Result<()> { - if let Some(ref pageserver_connstr) = self.pageserver_connstr { - // unsub old sub. xxx: callmemaybe is going out - let old_subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.to_owned(), - ); - callmemaybe_tx - .send(CallmeEvent::Unsubscribe(old_subscription_key)) - .unwrap_or_else(|e| { - error!("failed to send Pause request to callmemaybe thread {}", e); - }); + new_pageserver_connstr: Option, + ) { + if self.pageserver_connstr != new_pageserver_connstr { + self.deactivate_walsender(zttid); + + if new_pageserver_connstr.is_some() { + info!( + "timeline {} has activated its walsender with connstr {new_pageserver_connstr:?}", + zttid.timeline_id, + ); + } + self.pageserver_connstr = new_pageserver_connstr; } - if let Some(pageserver_connstr) = pageserver_connstr { - let subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.to_owned(), - ); - // xx: sending to channel under lock is not very cool, but - // shouldn't be a problem here. If it is, we can grab a counter - // here and later augment channel messages with it. 
- callmemaybe_tx - .send(CallmeEvent::Subscribe(subscription_key)) - .unwrap_or_else(|e| { - error!( - "failed to send Subscribe request to callmemaybe thread {}", - e - ); - }); - info!( - "timeline {} is subscribed to callmemaybe to {}", - zttid.timeline_id, pageserver_connstr - ); - } - self.pageserver_connstr = pageserver_connstr.map(|c| c.to_owned()); - Ok(()) } - /// Deactivate the timeline: stop callmemaybe. - fn callmemaybe_unsub( - &mut self, - zttid: &ZTenantTimelineId, - callmemaybe_tx: &UnboundedSender, - ) -> Result<()> { - if let Some(ref pageserver_connstr) = self.pageserver_connstr { - let subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.to_owned(), - ); - callmemaybe_tx - .send(CallmeEvent::Unsubscribe(subscription_key)) - .unwrap_or_else(|e| { - error!( - "failed to send Unsubscribe request to callmemaybe thread {}", - e - ); - }); + /// Deactivate the timeline: stop sending the timeline data into etcd, so no pageserver can connect for WAL streaming. + fn deactivate_walsender(&mut self, zttid: &ZTenantTimelineId) { + if let Some(pageserver_connstr) = self.pageserver_connstr.take() { info!( - "timeline {} is unsubscribed from callmemaybe to {}", + "timeline {} had deactivated its wallsender with connstr {pageserver_connstr:?}", zttid.timeline_id, - self.pageserver_connstr.as_ref().unwrap() - ); + ) } - Ok(()) } fn get_wal_seg_size(&self) -> usize { @@ -332,7 +285,6 @@ impl SharedState { /// Database instance (tenant) pub struct Timeline { pub zttid: ZTenantTimelineId, - pub callmemaybe_tx: UnboundedSender, /// Sending here asks for wal backup launcher attention (start/stop /// offloading). Sending zttid instead of concrete command allows to do /// sending without timeline lock. @@ -348,7 +300,6 @@ pub struct Timeline { impl Timeline { fn new( zttid: ZTenantTimelineId, - callmemaybe_tx: UnboundedSender, wal_backup_launcher_tx: Sender, shared_state: SharedState, ) -> Timeline { @@ -356,7 +307,6 @@ impl Timeline { watch::channel(shared_state.sk.inmem.commit_lsn); Timeline { zttid, - callmemaybe_tx, wal_backup_launcher_tx, commit_lsn_watch_tx, commit_lsn_watch_rx, @@ -378,7 +328,7 @@ impl Timeline { // should have kind of generations assigned by compute to distinguish // the latest one or even pass it through consensus to reliably deliver // to all safekeepers. - shared_state.callmemaybe_sub(&self.zttid, pageserver_connstr, &self.callmemaybe_tx)?; + shared_state.activate_walsender(&self.zttid, pageserver_connstr.cloned()); } // Wake up wal backup launcher, if offloading not started yet. if is_wal_backup_action_pending { @@ -414,7 +364,7 @@ impl Timeline { (replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); if stop { - shared_state.callmemaybe_unsub(&self.zttid, &self.callmemaybe_tx)?; + shared_state.deactivate_walsender(&self.zttid); return Ok(true); } } @@ -431,16 +381,14 @@ impl Timeline { /// Deactivates the timeline, assuming it is being deleted. /// Returns whether the timeline was already active. /// - /// The callmemaybe thread is stopped by the deactivation message. We assume all other threads - /// will stop by themselves eventually (possibly with errors, but no panics). There should be no - /// compute threads (as we're deleting the timeline), actually. 
Some WAL may be left unsent, but + /// We assume all threads will stop by themselves eventually (possibly with errors, but no panics). + /// There should be no compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but /// we're deleting the timeline anyway. pub async fn deactivate_for_delete(&self) -> Result { let was_active: bool; { - let mut shared_state = self.mutex.lock().unwrap(); + let shared_state = self.mutex.lock().unwrap(); was_active = shared_state.active; - shared_state.callmemaybe_unsub(&self.zttid, &self.callmemaybe_tx)?; } self.wal_backup_launcher_tx.send(self.zttid).await?; Ok(was_active) @@ -576,7 +524,8 @@ impl Timeline { shared_state.sk.inmem.remote_consistent_lsn, )), peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn), - safekeeper_connection_string: Some(conf.listen_pg_addr.clone()), + safekeeper_connstr: Some(conf.listen_pg_addr.clone()), + pageserver_connstr: shared_state.pageserver_connstr.clone(), backup_lsn: Some(shared_state.sk.inmem.backup_lsn), }) } @@ -675,14 +624,12 @@ impl TimelineTools for Option> { struct GlobalTimelinesState { timelines: HashMap>, - callmemaybe_tx: Option>, wal_backup_launcher_tx: Option>, } lazy_static! { static ref TIMELINES_STATE: Mutex = Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), - callmemaybe_tx: None, wal_backup_launcher_tx: None, }); } @@ -697,13 +644,8 @@ pub struct TimelineDeleteForceResult { pub struct GlobalTimelines; impl GlobalTimelines { - pub fn init( - callmemaybe_tx: UnboundedSender, - wal_backup_launcher_tx: Sender, - ) { + pub fn init(wal_backup_launcher_tx: Sender) { let mut state = TIMELINES_STATE.lock().unwrap(); - assert!(state.callmemaybe_tx.is_none()); - state.callmemaybe_tx = Some(callmemaybe_tx); assert!(state.wal_backup_launcher_tx.is_none()); state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); } @@ -726,7 +668,6 @@ impl GlobalTimelines { let new_tli = Arc::new(Timeline::new( zttid, - state.callmemaybe_tx.as_ref().unwrap().clone(), state.wal_backup_launcher_tx.as_ref().unwrap().clone(), shared_state, )); @@ -778,7 +719,6 @@ impl GlobalTimelines { let new_tli = Arc::new(Timeline::new( zttid, - state.callmemaybe_tx.as_ref().unwrap().clone(), state.wal_backup_launcher_tx.as_ref().unwrap().clone(), shared_state, )); diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 2b0e5ae8bd..d22654ad3e 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -63,10 +63,11 @@ def test_pageserver_http_get_wal_receiver_not_found(zenith_simple_env: ZenithEnv tenant_id, timeline_id = env.zenith_cli.create_tenant() - # no PG compute node is running, so no WAL receiver is running - with pytest.raises(ZenithPageserverApiException) as e: - _ = client.wal_receiver_get(tenant_id, timeline_id) - assert "Not Found" in str(e.value) + empty_response = client.wal_receiver_get(tenant_id, timeline_id) + + assert empty_response.get('wal_producer_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running' + assert empty_response.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running' + assert empty_response.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running' def test_pageserver_http_get_wal_receiver_success(zenith_simple_env: ZenithEnv): @@ -81,7 +82,6 @@ def 
test_pageserver_http_get_wal_receiver_success(zenith_simple_env: ZenithEnv): # a successful `wal_receiver_get` response must contain the below fields assert list(res.keys()) == [ - "thread_id", "wal_producer_connstr", "last_received_msg_lsn", "last_received_msg_ts", diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index ff905efa53..37bc5fe541 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1600,9 +1600,7 @@ class Postgres(PgProtocol): for cfg_line in cfg_lines: # walproposer uses different application_name if ("synchronous_standby_names" in cfg_line or - # don't ask pageserver to fetch WAL from compute - "callmemaybe_connstring" in cfg_line or - # don't repeat safekeepers multiple times + # don't repeat safekeepers/wal_acceptors multiple times "safekeepers" in cfg_line): continue f.write(cfg_line) diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index 0e16d3e749..a8a1ff7687 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -13,16 +13,12 @@ from fixtures.zenith_fixtures import ZenithEnvBuilder @pytest.mark.parametrize('tenants_count', [1, 5, 10]) -@pytest.mark.parametrize('use_safekeepers', ['with_wa', 'without_wa']) def test_bulk_tenant_create( zenith_env_builder: ZenithEnvBuilder, - use_safekeepers: str, tenants_count: int, zenbenchmark, ): - """Measure tenant creation time (with and without wal acceptors)""" - if use_safekeepers == 'with_wa': - zenith_env_builder.num_safekeepers = 3 + zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() time_slices = [] @@ -31,15 +27,15 @@ def test_bulk_tenant_create( start = timeit.default_timer() tenant, _ = env.zenith_cli.create_tenant() - env.zenith_cli.create_timeline( - f'test_bulk_tenant_create_{tenants_count}_{i}_{use_safekeepers}', tenant_id=tenant) + env.zenith_cli.create_timeline(f'test_bulk_tenant_create_{tenants_count}_{i}', + tenant_id=tenant) # FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now? 
#if use_safekeepers == 'with_sa': # wa_factory.start_n_new(3) - pg_tenant = env.postgres.create_start( - f'test_bulk_tenant_create_{tenants_count}_{i}_{use_safekeepers}', tenant_id=tenant) + pg_tenant = env.postgres.create_start(f'test_bulk_tenant_create_{tenants_count}_{i}', + tenant_id=tenant) end = timeit.default_timer() time_slices.append(end - start) From 1188c9a95c6fe55a8b37e8f52402ef2e954f934e Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 30 May 2022 20:38:28 +0300 Subject: [PATCH 0387/1022] remove extra span as this code is already covered by create timeline span E g this log line contains duplicated data: INFO /timeline_create{tenant=8d367870988250a755101b5189bbbc17 new_timeline=Some(27e2580f51f5660642d8ce124e9ee4ac) lsn=None}: bootstrapping{timeline=27e2580f51f5660642d8ce124e9ee4ac tenant=8d367870988250a755101b5189bbbc17}: created root timeline 27e2580f51f5660642d8ce124e9ee4ac timeline.lsn 0/16960E8 this avoids variable duplication in `bootstrapping` subspan --- pageserver/src/timelines.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 9ab063107c..a3939661c1 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -283,8 +283,6 @@ fn bootstrap_timeline( tli: ZTimelineId, repo: &R, ) -> Result<()> { - let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered(); - let initdb_path = conf .tenant_path(&tenantid) .join(format!("tmp-timeline-{}", tli)); From de7eda2dc6a6dbad3c3ec96e71673c5a8a48bb79 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 1 Jun 2022 23:23:35 +0300 Subject: [PATCH 0388/1022] Fix url path printing --- control_plane/src/local_env.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2623f65242..f7bb890893 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -119,16 +119,24 @@ impl EtcdBroker { } pub fn comma_separated_endpoints(&self) -> String { - self.broker_endpoints.iter().map(Url::as_str).fold( - String::new(), - |mut comma_separated_urls, url| { + self.broker_endpoints + .iter() + .map(|url| { + // URL by default adds a '/' path at the end, which is not what etcd CLI wants. + let url_string = url.as_str(); + if url_string.ends_with('/') { + &url_string[0..url_string.len() - 1] + } else { + url_string + } + }) + .fold(String::new(), |mut comma_separated_urls, url| { if !comma_separated_urls.is_empty() { comma_separated_urls.push(','); } comma_separated_urls.push_str(url); comma_separated_urls - }, - ) + }) } } From c71faae2c61b1c2578f33afcbe877b206e4867c9 Mon Sep 17 00:00:00 2001 From: Ryan Russell Date: Wed, 1 Jun 2022 14:59:16 -0500 Subject: [PATCH 0389/1022] Docs readability cont Signed-off-by: Ryan Russell --- docs/core_changes.md | 2 +- docs/rfcs/002-storage.md | 2 +- pageserver/src/layered_repository/README.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/core_changes.md b/docs/core_changes.md index db311e3667..82c5addd16 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -188,7 +188,7 @@ Not currently committed but proposed: 3. Prefetching - Why? As far as pages in Zenith are loaded on demand, to reduce node startup time - and also sppedup some massive queries we need some mechanism for bulk loading to + and also speedup some massive queries we need some mechanism for bulk loading to reduce page request round-trip overhead. 
Currently Postgres is supporting prefetching only for bitmap scan. diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md index 6e756df4bf..f99683cf09 100644 --- a/docs/rfcs/002-storage.md +++ b/docs/rfcs/002-storage.md @@ -77,7 +77,7 @@ Upon storage node restart recent WAL files are applied to appropriate pages and ### **Checkpointing** -No such mechanism is needed. Or we may look at the storage node as at kind of continuous chekpointer. +No such mechanism is needed. Or we may look at the storage node as at kind of continuous checkpointer. ### **Full page writes (torn page protection)** diff --git a/pageserver/src/layered_repository/README.md b/pageserver/src/layered_repository/README.md index 81f585d2e2..15040d21b2 100644 --- a/pageserver/src/layered_repository/README.md +++ b/pageserver/src/layered_repository/README.md @@ -409,7 +409,7 @@ removed because there is no newer layer file for the table. Things get slightly more complicated with multiple branches. All of the above still holds, but in addition to recent files we must also -retain older shapshot files that are still needed by child branches. +retain older snapshot files that are still needed by child branches. For example, if child branch is created at LSN 150, and the 'customers' table is updated on the branch, you would have these files: From b155fe0e2fce278e50a7e1c05612ff1f376f35bb Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 2 Jun 2022 16:54:50 +0300 Subject: [PATCH 0390/1022] avoid perf test result context for pg regress --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index fde6cbd35f..9aca415dc8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -750,7 +750,6 @@ workflows: - build-postgres-<< matrix.build_type >> - run-pytest: name: pg_regress-tests-<< matrix.build_type >> - context: PERF_TEST_RESULT_CONNSTR matrix: parameters: build_type: ["debug", "release"] From aba5e5f8b5c611de06836538d1058ce7a5a7e671 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Wed, 1 Jun 2022 20:50:15 +0300 Subject: [PATCH 0391/1022] GitHub Actions: pin Rust version to 1.58 like on CircleCI * Fix failing `cargo clippy` while we're here. The behavior has been changed in Rust 1.60: https://github.com/rust-lang/rust-clippy/issues/8928 * Add Rust version to the Cargo deps cache key --- .github/workflows/testing.yml | 4 ++-- pageserver/src/walreceiver.rs | 9 +++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 41f9f51e86..aa1e152fb2 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -12,7 +12,7 @@ jobs: matrix: # If we want to duplicate this job for different # Rust toolchains (e.g. nightly or 1.37.0), add them here. 
- rust_toolchain: [stable] + rust_toolchain: [1.58] os: [ubuntu-latest, macos-latest] timeout-minutes: 30 name: run regression test suite @@ -87,7 +87,7 @@ jobs: ~/.cargo/registry ~/.cargo/git target - key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }} + key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} - name: Run cargo clippy run: ./run_clippy.sh diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index df8dd2fc29..e54406a450 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -469,12 +469,9 @@ async fn timeline_wal_broker_loop_step( updates = broker_subscription.fetch_data() => match updates { Some(mut all_timeline_updates) => { if let Some(subscribed_timeline_updates) = all_timeline_updates.remove(&id) { - match wal_connection_manager.select_connection_candidate(subscribed_timeline_updates) { - Some(candidate) => { - info!("Switching to different safekeeper {} for timeline {id}, reason: {:?}", candidate.safekeeper_id, candidate.reason); - wal_connection_manager.change_connection(candidate.safekeeper_id, candidate.wal_producer_connstr).await; - }, - None => {} + if let Some(candidate) = wal_connection_manager.select_connection_candidate(subscribed_timeline_updates) { + info!("Switching to different safekeeper {} for timeline {id}, reason: {:?}", candidate.safekeeper_id, candidate.reason); + wal_connection_manager.change_connection(candidate.safekeeper_id, candidate.wal_producer_connstr).await; } } }, From 90e2c9ee1f62a2676cfe56053704b9377c48e2e0 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 2 Jun 2022 16:21:28 -0400 Subject: [PATCH 0392/1022] Rename zenith to neon in python tests (#1871) --- test_runner/README.md | 24 +- .../batch_others/test_ancestor_branch.py | 25 +-- test_runner/batch_others/test_auth.py | 26 +-- test_runner/batch_others/test_backpressure.py | 14 +- .../batch_others/test_basebackup_error.py | 8 +- .../batch_others/test_branch_behind.py | 46 ++-- .../batch_others/test_broken_timeline.py | 32 +-- .../batch_others/test_clog_truncate.py | 14 +- test_runner/batch_others/test_config.py | 8 +- test_runner/batch_others/test_createdropdb.py | 30 +-- test_runner/batch_others/test_createuser.py | 10 +- .../batch_others/test_gc_aggressive.py | 16 +- test_runner/batch_others/test_lsn_mapping.py | 10 +- test_runner/batch_others/test_multixact.py | 10 +- .../{test_zenith_cli.py => test_neon_cli.py} | 60 ++--- test_runner/batch_others/test_next_xid.py | 6 +- test_runner/batch_others/test_normal_work.py | 10 +- .../batch_others/test_old_request_lsn.py | 10 +- .../batch_others/test_pageserver_api.py | 38 ++-- .../batch_others/test_pageserver_catchup.py | 10 +- .../batch_others/test_pageserver_restart.py | 8 +- .../batch_others/test_parallel_copy.py | 8 +- test_runner/batch_others/test_pitr_gc.py | 12 +- .../batch_others/test_read_validation.py | 16 +- .../batch_others/test_readonly_node.py | 8 +- test_runner/batch_others/test_recovery.py | 16 +- .../batch_others/test_remote_storage.py | 12 +- .../batch_others/test_restart_compute.py | 12 +- test_runner/batch_others/test_subxacts.py | 10 +- test_runner/batch_others/test_tenant_conf.py | 22 +- .../batch_others/test_tenant_relocation.py | 24 +- test_runner/batch_others/test_tenants.py | 36 +-- .../test_tenants_with_remote_storage.py | 20 +- .../batch_others/test_timeline_size.py | 16 +- test_runner/batch_others/test_twophase.py | 10 +- test_runner/batch_others/test_vm_bits.py | 10 +- 
test_runner/batch_others/test_wal_acceptor.py | 136 ++++++------ .../batch_others/test_wal_acceptor_async.py | 20 +- test_runner/batch_others/test_wal_restore.py | 22 +- .../batch_pg_regress/test_isolation.py | 8 +- ...zenith_regress.py => test_neon_regress.py} | 20 +- .../batch_pg_regress/test_pg_regress.py | 8 +- test_runner/conftest.py | 2 +- test_runner/fixtures/benchmark_fixture.py | 12 +- test_runner/fixtures/compare_fixtures.py | 34 +-- .../{zenith_fixtures.py => neon_fixtures.py} | 206 +++++++++--------- .../.gitignore | 0 .../README.md | 4 +- .../expected/.gitignore | 0 .../expected/neon-cid.out} | 0 .../expected/neon-clog.out} | 0 .../expected/neon-rel-truncate.out} | 0 .../expected/neon-vacuum-full.out} | 0 .../parallel_schedule | 8 +- .../sql/.gitignore | 0 .../sql/neon-cid.sql} | 0 .../sql/neon-clog.sql} | 0 .../sql/neon-rel-truncate.sql} | 0 .../sql/neon-vacuum-full.sql} | 0 test_runner/performance/test_bulk_insert.py | 10 +- .../performance/test_bulk_tenant_create.py | 14 +- test_runner/performance/test_copy.py | 10 +- test_runner/performance/test_gist_build.py | 8 +- test_runner/performance/test_hot_page.py | 2 +- test_runner/performance/test_hot_table.py | 2 +- .../performance/test_parallel_copy_to.py | 14 +- test_runner/performance/test_perf_pgbench.py | 24 +- test_runner/performance/test_random_writes.py | 12 +- test_runner/performance/test_seqscans.py | 8 +- test_runner/performance/test_startup.py | 12 +- .../performance/test_write_amplification.py | 8 +- test_runner/test_broken.py | 8 +- 72 files changed, 629 insertions(+), 630 deletions(-) rename test_runner/batch_others/{test_zenith_cli.py => test_neon_cli.py} (60%) rename test_runner/batch_pg_regress/{test_zenith_regress.py => test_neon_regress.py} (75%) rename test_runner/fixtures/{zenith_fixtures.py => neon_fixtures.py} (92%) rename test_runner/{zenith_regress => neon_regress}/.gitignore (100%) rename test_runner/{zenith_regress => neon_regress}/README.md (56%) rename test_runner/{zenith_regress => neon_regress}/expected/.gitignore (100%) rename test_runner/{zenith_regress/expected/zenith-cid.out => neon_regress/expected/neon-cid.out} (100%) rename test_runner/{zenith_regress/expected/zenith-clog.out => neon_regress/expected/neon-clog.out} (100%) rename test_runner/{zenith_regress/expected/zenith-rel-truncate.out => neon_regress/expected/neon-rel-truncate.out} (100%) rename test_runner/{zenith_regress/expected/zenith-vacuum-full.out => neon_regress/expected/neon-vacuum-full.out} (100%) rename test_runner/{zenith_regress => neon_regress}/parallel_schedule (71%) rename test_runner/{zenith_regress => neon_regress}/sql/.gitignore (100%) rename test_runner/{zenith_regress/sql/zenith-cid.sql => neon_regress/sql/neon-cid.sql} (100%) rename test_runner/{zenith_regress/sql/zenith-clog.sql => neon_regress/sql/neon-clog.sql} (100%) rename test_runner/{zenith_regress/sql/zenith-rel-truncate.sql => neon_regress/sql/neon-rel-truncate.sql} (100%) rename test_runner/{zenith_regress/sql/zenith-vacuum-full.sql => neon_regress/sql/neon-vacuum-full.sql} (100%) diff --git a/test_runner/README.md b/test_runner/README.md index 059bbb83cc..f95588462b 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -1,14 +1,14 @@ -## Zenith test runner +## Neon test runner This directory contains integration tests. 
Prerequisites: - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python) -- Zenith and Postgres binaries +- Neon and Postgres binaries - See the root [README.md](/README.md) for build directions - Tests can be run from the git tree; or see the environment variables below to run from other directories. -- The zenith git repo, including the postgres submodule +- The neon git repo, including the postgres submodule (for some tests, e.g. `pg_regress`) - Some tests (involving storage nodes coordination) require etcd installed. Follow [`the guide`](https://etcd.io/docs/v3.5/install/) to obtain it. @@ -51,8 +51,8 @@ Useful environment variables: should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. `ZENITH_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as -`--pageserver-config-override=${value}` parameter values when zenith cli is invoked -`RUST_LOG`: logging configuration to pass into Zenith CLI +`--pageserver-config-override=${value}` parameter values when neon_local cli is invoked +`RUST_LOG`: logging configuration to pass into Neon CLI Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them: `./scripts/pytest -s --log-cli-level=INFO ...` @@ -65,32 +65,32 @@ Exit after the first test failure: ### Writing a test -Every test needs a Zenith Environment, or ZenithEnv to operate in. A Zenith Environment +Every test needs a Neon Environment, or NeonEnv to operate in. A Neon Environment is like a little cloud-in-a-box, and consists of a Pageserver, 0-N Safekeepers, and compute Postgres nodes. The connections between them can be configured to use JWT authentication tokens, and some other configuration options can be tweaked too. -The easiest way to get access to a Zenith Environment is by using the `zenith_simple_env` +The easiest way to get access to a Neon Environment is by using the `neon_simple_env` fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes or make other destructive changes in that environment. Also don't assume that there are no tenants or branches or data in the cluster. For convenience, there is a branch called `empty`, though. The convention is to create a test-specific branch of that and load any test data there, instead of the 'main' branch. -For more complicated cases, you can build a custom Zenith Environment, with the `zenith_env` +For more complicated cases, you can build a custom Neon Environment, with the `neon_env` fixture: ```python -def test_foobar(zenith_env_builder: ZenithEnvBuilder): +def test_foobar(neon_env_builder: NeonEnvBuilder): # Prescribe the environment. # We want to have 3 safekeeper nodes, and use JWT authentication in the # connections to the page server - zenith_env_builder.num_safekeepers = 3 - zenith_env_builder.set_pageserver_auth(True) + neon_env_builder.num_safekeepers = 3 + neon_env_builder.set_pageserver_auth(True) # Now create the environment. This initializes the repository, and starts # up the page server and the safekeepers - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() # Run the test ... 
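A minimal sketch of a complete test following the conventions described in the README hunk above, assuming the shared `neon_simple_env` fixture and the helpers that appear throughout this patch (`env.neon_cli.create_branch`, `env.postgres.create_start`); the test and branch names here are illustrative only:

```python
from fixtures.neon_fixtures import NeonEnv


def test_example_sketch(neon_simple_env: NeonEnv):
    env = neon_simple_env

    # The simple env may be shared between tests, so branch off 'empty'
    # instead of touching 'main'.
    env.neon_cli.create_branch('test_example_sketch', 'empty')
    pg = env.postgres.create_start('test_example_sketch')

    # Run a trivial query against the compute node to check the branch works.
    cur = pg.connect().cursor()
    cur.execute('SELECT 1')
    assert cur.fetchone() == (1, )
```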
diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index 78724c434e..3a16157093 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -3,18 +3,18 @@ from contextlib import closing import psycopg2.extras import pytest from fixtures.log_helper import log -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverApiException +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException # # Create ancestor branches off the main branch. # -def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() +def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() # Override defaults, 1M gc_horizon and 4M checkpoint_distance. # Extend compaction_period and gc_period to disable background compaction and gc. - tenant, _ = env.zenith_cli.create_tenant( + tenant, _ = env.neon_cli.create_tenant( conf={ 'gc_period': '10 m', 'gc_horizon': '1048576', @@ -48,7 +48,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 100k rows: {lsn_100}') # Create branch1. - env.zenith_cli.create_branch('branch1', 'main', tenant_id=tenant, ancestor_start_lsn=lsn_100) + env.neon_cli.create_branch('branch1', 'main', tenant_id=tenant, ancestor_start_lsn=lsn_100) pg_branch1 = env.postgres.create_start('branch1', tenant_id=tenant) log.info("postgres is running on 'branch1' branch") @@ -72,7 +72,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 200k rows: {lsn_200}') # Create branch2. - env.zenith_cli.create_branch('branch2', 'branch1', tenant_id=tenant, ancestor_start_lsn=lsn_200) + env.neon_cli.create_branch('branch2', 'branch1', tenant_id=tenant, ancestor_start_lsn=lsn_200) pg_branch2 = env.postgres.create_start('branch2', tenant_id=tenant) log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() @@ -110,15 +110,14 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): assert branch2_cur.fetchone() == (300000, ) -def test_ancestor_branch_detach(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_ancestor_branch_detach(neon_simple_env: NeonEnv): + env = neon_simple_env - parent_timeline_id = env.zenith_cli.create_branch("test_ancestor_branch_detach_parent", "empty") + parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_detach_parent", "empty") - env.zenith_cli.create_branch("test_ancestor_branch_detach_branch1", - "test_ancestor_branch_detach_parent") + env.neon_cli.create_branch("test_ancestor_branch_detach_branch1", + "test_ancestor_branch_detach_parent") ps_http = env.pageserver.http_client() - with pytest.raises(ZenithPageserverApiException, - match="Failed to detach inmem tenant timeline"): + with pytest.raises(NeonPageserverApiException, match="Failed to detach inmem tenant timeline"): ps_http.timeline_detach(env.initial_tenant, parent_timeline_id) diff --git a/test_runner/batch_others/test_auth.py b/test_runner/batch_others/test_auth.py index a8ad384f27..73120880d3 100644 --- a/test_runner/batch_others/test_auth.py +++ b/test_runner/batch_others/test_auth.py @@ -1,14 +1,14 @@ from contextlib import closing from typing import Iterator from uuid import UUID, uuid4 -from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithPageserverApiException +from fixtures.neon_fixtures import NeonEnvBuilder, 
NeonPageserverApiException from requests.exceptions import HTTPError import pytest -def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.pageserver_auth_enabled = True - env = zenith_env_builder.init_start() +def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): + neon_env_builder.pageserver_auth_enabled = True + env = neon_env_builder.init_start() ps = env.pageserver @@ -25,8 +25,8 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): ps.safe_psql("set FOO", password=tenant_token) ps.safe_psql("set FOO", password=management_token) - new_timeline_id = env.zenith_cli.create_branch('test_pageserver_auth', - tenant_id=env.initial_tenant) + new_timeline_id = env.neon_cli.create_branch('test_pageserver_auth', + tenant_id=env.initial_tenant) # tenant can create branches tenant_http_client.timeline_create(tenant_id=env.initial_tenant, @@ -36,7 +36,7 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): ancestor_timeline_id=new_timeline_id) # fail to create branch using token with different tenant_id - with pytest.raises(ZenithPageserverApiException, + with pytest.raises(NeonPageserverApiException, match='Forbidden: Tenant id mismatch. Permission denied'): invalid_tenant_http_client.timeline_create(tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id) @@ -46,21 +46,21 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): # fail to create tenant using tenant token with pytest.raises( - ZenithPageserverApiException, + NeonPageserverApiException, match='Forbidden: Attempt to access management api with tenant scope. Permission denied' ): tenant_http_client.tenant_create() @pytest.mark.parametrize('with_safekeepers', [False, True]) -def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_safekeepers: bool): - zenith_env_builder.pageserver_auth_enabled = True +def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): + neon_env_builder.pageserver_auth_enabled = True if with_safekeepers: - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() branch = f'test_compute_auth_to_pageserver{with_safekeepers}' - env.zenith_cli.create_branch(branch) + env.neon_cli.create_branch(branch) pg = env.postgres.create_start(branch) with closing(pg.connect()) as conn: diff --git a/test_runner/batch_others/test_backpressure.py b/test_runner/batch_others/test_backpressure.py index 5debb2ee61..f89ee14691 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/batch_others/test_backpressure.py @@ -1,15 +1,15 @@ from contextlib import closing, contextmanager import psycopg2.extras import pytest -from fixtures.zenith_fixtures import PgProtocol, ZenithEnvBuilder +from fixtures.neon_fixtures import PgProtocol, NeonEnvBuilder from fixtures.log_helper import log import os import time import asyncpg -from fixtures.zenith_fixtures import Postgres +from fixtures.neon_fixtures import Postgres import threading -pytest_plugins = ("fixtures.zenith_fixtures") +pytest_plugins = ("fixtures.neon_fixtures") @contextmanager @@ -26,7 +26,7 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv log.info("checks started") with pg_cur(pg) as cur: - cur.execute("CREATE EXTENSION neon") # TODO move it to zenith_fixtures? + cur.execute("CREATE EXTENSION neon") # TODO move it to neon_fixtures? 
cur.execute("select pg_size_bytes(current_setting('max_replication_write_lag'))") res = cur.fetchone() @@ -93,10 +93,10 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv @pytest.mark.skip("See https://github.com/neondatabase/neon/issues/1587") -def test_backpressure_received_lsn_lag(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() +def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() # Create a branch for us - env.zenith_cli.create_branch('test_backpressure') + env.neon_cli.create_branch('test_backpressure') pg = env.postgres.create_start('test_backpressure', config_lines=['max_replication_write_lag=30MB']) diff --git a/test_runner/batch_others/test_basebackup_error.py b/test_runner/batch_others/test_basebackup_error.py index 4b8b8a746c..29cbe59d2e 100644 --- a/test_runner/batch_others/test_basebackup_error.py +++ b/test_runner/batch_others/test_basebackup_error.py @@ -1,7 +1,7 @@ import pytest from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log @@ -9,9 +9,9 @@ from fixtures.log_helper import log # Test error handling, if the 'basebackup' command fails in the middle # of building the tar archive. # -def test_basebackup_error(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_basebackup_error", "empty") +def test_basebackup_error(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_basebackup_error", "empty") # Introduce failpoint env.pageserver.safe_psql(f"failpoints basebackup-before-control-file=return") diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 7a00ecfca2..4f4c058b61 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -5,26 +5,26 @@ import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.utils import print_gc_result -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder # # Create a couple of branches off the main branch, at a historical point in time. # -def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): +def test_branch_behind(neon_env_builder: NeonEnvBuilder): # Use safekeeper in this test to avoid a subtle race condition. # Without safekeeper, walreceiver reconnection can stuck # because of IO deadlock. 
# - # See https://github.com/zenithdb/zenith/issues/1068 - zenith_env_builder.num_safekeepers = 1 + # See https://github.com/neondatabase/neon/issues/1068 + neon_env_builder.num_safekeepers = 1 # Disable pitr, because here we want to test branch creation after GC - zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = zenith_env_builder.init_start() + neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + env = neon_env_builder.init_start() # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch('test_branch_behind') + env.neon_cli.create_branch('test_branch_behind') pgmain = env.postgres.create_start('test_branch_behind') log.info("postgres is running on 'test_branch_behind' branch") @@ -61,9 +61,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 200100 rows: {lsn_b}') # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch('test_branch_behind_hundred', - 'test_branch_behind', - ancestor_start_lsn=lsn_a) + env.neon_cli.create_branch('test_branch_behind_hundred', + 'test_branch_behind', + ancestor_start_lsn=lsn_a) # Insert many more rows. This generates enough WAL to fill a few segments. main_cur.execute(''' @@ -78,9 +78,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 400100 rows: {lsn_c}') # Branch at the point where only 200100 rows were inserted - env.zenith_cli.create_branch('test_branch_behind_more', - 'test_branch_behind', - ancestor_start_lsn=lsn_b) + env.neon_cli.create_branch('test_branch_behind_more', + 'test_branch_behind', + ancestor_start_lsn=lsn_b) pg_hundred = env.postgres.create_start('test_branch_behind_hundred') pg_more = env.postgres.create_start('test_branch_behind_more') @@ -104,9 +104,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): # Check bad lsn's for branching # branch at segment boundary - env.zenith_cli.create_branch('test_branch_segment_boundary', - 'test_branch_behind', - ancestor_start_lsn="0/3000000") + env.neon_cli.create_branch('test_branch_segment_boundary', + 'test_branch_behind', + ancestor_start_lsn="0/3000000") pg = env.postgres.create_start('test_branch_segment_boundary') cur = pg.connect().cursor() cur.execute('SELECT 1') @@ -114,13 +114,13 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): # branch at pre-initdb lsn with pytest.raises(Exception, match="invalid branch start lsn"): - env.zenith_cli.create_branch('test_branch_preinitdb', ancestor_start_lsn="0/42") + env.neon_cli.create_branch('test_branch_preinitdb', ancestor_start_lsn="0/42") # branch at pre-ancestor lsn with pytest.raises(Exception, match="less than timeline ancestor lsn"): - env.zenith_cli.create_branch('test_branch_preinitdb', - 'test_branch_behind', - ancestor_start_lsn="0/42") + env.neon_cli.create_branch('test_branch_preinitdb', + 'test_branch_behind', + ancestor_start_lsn="0/42") # check that we cannot create branch based on garbage collected data with closing(env.pageserver.connect()) as psconn: @@ -132,9 +132,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): with pytest.raises(Exception, match="invalid branch start lsn"): # this gced_lsn is pretty random, so if gc is disabled this woudln't fail - env.zenith_cli.create_branch('test_branch_create_fail', - 'test_branch_behind', - ancestor_start_lsn=gced_lsn) + env.neon_cli.create_branch('test_branch_create_fail', + 'test_branch_behind', + 
ancestor_start_lsn=gced_lsn) # check that after gc everything is still there hundred_cur.execute('SELECT count(*) FROM foo') diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/batch_others/test_broken_timeline.py index 05391f7e4d..b72f337e06 100644 --- a/test_runner/batch_others/test_broken_timeline.py +++ b/test_runner/batch_others/test_broken_timeline.py @@ -1,22 +1,22 @@ import pytest import concurrent.futures from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithEnv +from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv from fixtures.log_helper import log import os # Test restarting page server, while safekeeper and compute node keep # running. -def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder): +def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # One safekeeper is enough for this test. - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() tenant_timelines = [] for n in range(4): - tenant_id_uuid, timeline_id_uuid = env.zenith_cli.create_tenant() + tenant_id_uuid, timeline_id_uuid = env.neon_cli.create_tenant() tenant_id = tenant_id_uuid.hex timeline_id = timeline_id_uuid.hex @@ -81,14 +81,14 @@ def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder): log.info(f'compute startup failed as expected: {err}') -def test_create_multiple_timelines_parallel(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): + env = neon_simple_env - tenant_id, _ = env.zenith_cli.create_tenant() + tenant_id, _ = env.neon_cli.create_tenant() with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: futures = [ - executor.submit(env.zenith_cli.create_timeline, + executor.submit(env.neon_cli.create_timeline, f"test-create-multiple-timelines-{i}", tenant_id) for i in range(4) ] @@ -96,20 +96,20 @@ def test_create_multiple_timelines_parallel(zenith_simple_env: ZenithEnv): future.result() -def test_fix_broken_timelines_on_startup(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): + env = neon_simple_env - tenant_id, _ = env.zenith_cli.create_tenant() + tenant_id, _ = env.neon_cli.create_tenant() # Introduce failpoint when creating a new timeline env.pageserver.safe_psql(f"failpoints before-checkpoint-new-timeline=return") with pytest.raises(Exception, match="before-checkpoint-new-timeline"): - _ = env.zenith_cli.create_timeline("test_fix_broken_timelines", tenant_id) + _ = env.neon_cli.create_timeline("test_fix_broken_timelines", tenant_id) # Restart the page server - env.zenith_cli.pageserver_stop(immediate=True) - env.zenith_cli.pageserver_start() + env.neon_cli.pageserver_stop(immediate=True) + env.neon_cli.pageserver_start() # Check that the "broken" timeline is not loaded - timelines = env.zenith_cli.list_timelines(tenant_id) + timelines = env.neon_cli.list_timelines(tenant_id) assert len(timelines) == 1 diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py index 2382cd93b3..cbf55e9fc1 100644 --- a/test_runner/batch_others/test_clog_truncate.py +++ b/test_runner/batch_others/test_clog_truncate.py @@ -3,16 +3,16 @@ import os from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log # # Test compute 
node start after clog truncation # -def test_clog_truncate(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch('test_clog_truncate', 'empty') +def test_clog_truncate(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch('test_clog_truncate', 'empty') # set aggressive autovacuum to make sure that truncation will happen config = [ @@ -62,9 +62,9 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv): # create new branch after clog truncation and start a compute node on it log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}') - env.zenith_cli.create_branch('test_clog_truncate_new', - 'test_clog_truncate', - ancestor_start_lsn=lsn_after_truncation) + env.neon_cli.create_branch('test_clog_truncate_new', + 'test_clog_truncate', + ancestor_start_lsn=lsn_after_truncation) pg2 = env.postgres.create_start('test_clog_truncate_new') log.info('postgres is running on test_clog_truncate_new branch') diff --git a/test_runner/batch_others/test_config.py b/test_runner/batch_others/test_config.py index fd2b3b4e99..51deeebeed 100644 --- a/test_runner/batch_others/test_config.py +++ b/test_runner/batch_others/test_config.py @@ -1,15 +1,15 @@ from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log # # Test starting Postgres with custom options # -def test_config(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_config", "empty") +def test_config(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_config", "empty") # change config pg = env.postgres.create_start('test_config', config_lines=['log_min_messages=debug1']) diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/batch_others/test_createdropdb.py index 24898be70a..392e5a6fd4 100644 --- a/test_runner/batch_others/test_createdropdb.py +++ b/test_runner/batch_others/test_createdropdb.py @@ -2,16 +2,16 @@ import os import pathlib from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.log_helper import log # # Test CREATE DATABASE when there have been relmapper changes # -def test_createdb(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch('test_createdb', 'empty') +def test_createdb(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch('test_createdb', 'empty') pg = env.postgres.create_start('test_createdb') log.info("postgres is running on 'test_createdb' branch") @@ -27,7 +27,7 @@ def test_createdb(zenith_simple_env: ZenithEnv): lsn = cur.fetchone()[0] # Create a branch - env.zenith_cli.create_branch('test_createdb2', 'test_createdb', ancestor_start_lsn=lsn) + env.neon_cli.create_branch('test_createdb2', 'test_createdb', ancestor_start_lsn=lsn) pg2 = env.postgres.create_start('test_createdb2') # Test that you can connect to the new database on both branches @@ -40,16 +40,16 @@ def test_createdb(zenith_simple_env: ZenithEnv): ('foodb', )) res = cur.fetchone() # check that dbsize equals sum of all relation sizes, excluding shared ones - # This is how we define dbsize in zenith for now + # This is how we define dbsize in neon for now assert res[0] == res[1] # # Test DROP DATABASE # -def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir): - env = zenith_simple_env - 
env.zenith_cli.create_branch('test_dropdb', 'empty') +def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): + env = neon_simple_env + env.neon_cli.create_branch('test_dropdb', 'empty') pg = env.postgres.create_start('test_dropdb') log.info("postgres is running on 'test_dropdb' branch") @@ -73,14 +73,14 @@ def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir): lsn_after_drop = cur.fetchone()[0] # Create two branches before and after database drop. - env.zenith_cli.create_branch('test_before_dropdb', - 'test_dropdb', - ancestor_start_lsn=lsn_before_drop) + env.neon_cli.create_branch('test_before_dropdb', + 'test_dropdb', + ancestor_start_lsn=lsn_before_drop) pg_before = env.postgres.create_start('test_before_dropdb') - env.zenith_cli.create_branch('test_after_dropdb', - 'test_dropdb', - ancestor_start_lsn=lsn_after_drop) + env.neon_cli.create_branch('test_after_dropdb', + 'test_dropdb', + ancestor_start_lsn=lsn_after_drop) pg_after = env.postgres.create_start('test_after_dropdb') # Test that database exists on the branch before drop diff --git a/test_runner/batch_others/test_createuser.py b/test_runner/batch_others/test_createuser.py index f4bbbc8a7a..cbfe496e19 100644 --- a/test_runner/batch_others/test_createuser.py +++ b/test_runner/batch_others/test_createuser.py @@ -1,15 +1,15 @@ from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log # # Test CREATE USER to check shared catalog restore # -def test_createuser(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch('test_createuser', 'empty') +def test_createuser(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch('test_createuser', 'empty') pg = env.postgres.create_start('test_createuser') log.info("postgres is running on 'test_createuser' branch") @@ -24,7 +24,7 @@ def test_createuser(zenith_simple_env: ZenithEnv): lsn = cur.fetchone()[0] # Create a branch - env.zenith_cli.create_branch('test_createuser2', 'test_createuser', ancestor_start_lsn=lsn) + env.neon_cli.create_branch('test_createuser2', 'test_createuser', ancestor_start_lsn=lsn) pg2 = env.postgres.create_start('test_createuser2') # Test that you can connect to new branch as a new user diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py index 79af54c1de..bffeedfdc0 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/batch_others/test_gc_aggressive.py @@ -1,7 +1,7 @@ import asyncio import random -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres from fixtures.log_helper import log # Test configuration @@ -27,7 +27,7 @@ async def update_table(pg: Postgres): # Perform aggressive GC with 0 horizon -async def gc(env: ZenithEnv, timeline: str): +async def gc(env: NeonEnv, timeline: str): psconn = await env.pageserver.connect_async() while updates_performed < updates_to_perform: @@ -35,7 +35,7 @@ async def gc(env: ZenithEnv, timeline: str): # At the same time, run UPDATEs and GC -async def update_and_gc(env: ZenithEnv, pg: Postgres, timeline: str): +async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: str): workers = [] for worker_id in range(num_connections): workers.append(asyncio.create_task(update_table(pg))) @@ -48,14 +48,14 @@ async def update_and_gc(env: ZenithEnv, pg: Postgres, timeline: str): # # Aggressively force 
GC, while running queries. # -# (repro for https://github.com/zenithdb/zenith/issues/1047) +# (repro for https://github.com/neondatabase/neon/issues/1047) # -def test_gc_aggressive(zenith_env_builder: ZenithEnvBuilder): +def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_gc_aggressive", "main") + neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_gc_aggressive", "main") pg = env.postgres.create_start('test_gc_aggressive') log.info('postgres is running on test_gc_aggressive branch') diff --git a/test_runner/batch_others/test_lsn_mapping.py b/test_runner/batch_others/test_lsn_mapping.py index 37113b46f2..1eca92ed58 100644 --- a/test_runner/batch_others/test_lsn_mapping.py +++ b/test_runner/batch_others/test_lsn_mapping.py @@ -4,7 +4,7 @@ import math from uuid import UUID import psycopg2.extras import psycopg2.errors -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres from fixtures.log_helper import log import time @@ -12,11 +12,11 @@ import time # # Test pageserver get_lsn_by_timestamp API # -def test_lsn_mapping(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init_start() +def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() - new_timeline_id = env.zenith_cli.create_branch('test_lsn_mapping') + new_timeline_id = env.neon_cli.create_branch('test_lsn_mapping') pgmain = env.postgres.create_start("test_lsn_mapping") log.info("postgres is running on 'test_lsn_mapping' branch") diff --git a/test_runner/batch_others/test_multixact.py b/test_runner/batch_others/test_multixact.py index 7a508a67fb..b17676658b 100644 --- a/test_runner/batch_others/test_multixact.py +++ b/test_runner/batch_others/test_multixact.py @@ -1,4 +1,4 @@ -from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.log_helper import log @@ -8,9 +8,9 @@ from fixtures.log_helper import log # it only checks next_multixact_id field in restored pg_control, # since we don't have functions to check multixact internals. 
# -def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): - env = zenith_simple_env - env.zenith_cli.create_branch('test_multixact', 'empty') +def test_multixact(neon_simple_env: NeonEnv, test_output_dir): + env = neon_simple_env + env.neon_cli.create_branch('test_multixact', 'empty') pg = env.postgres.create_start('test_multixact') log.info("postgres is running on 'test_multixact' branch") @@ -60,7 +60,7 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): assert int(next_multixact_id) > int(next_multixact_id_old) # Branch at this point - env.zenith_cli.create_branch('test_multixact_new', 'test_multixact', ancestor_start_lsn=lsn) + env.neon_cli.create_branch('test_multixact_new', 'test_multixact', ancestor_start_lsn=lsn) pg_new = env.postgres.create_start('test_multixact_new') log.info("postgres is running on 'test_multixact_new' branch") diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_neon_cli.py similarity index 60% rename from test_runner/batch_others/test_zenith_cli.py rename to test_runner/batch_others/test_neon_cli.py index 103d51aae5..728bc7b894 100644 --- a/test_runner/batch_others/test_zenith_cli.py +++ b/test_runner/batch_others/test_neon_cli.py @@ -1,12 +1,12 @@ import uuid import requests -from fixtures.zenith_fixtures import DEFAULT_BRANCH_NAME, ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient +from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient from typing import cast -def helper_compare_timeline_list(pageserver_http_client: ZenithPageserverHttpClient, - env: ZenithEnv, +def helper_compare_timeline_list(pageserver_http_client: NeonPageserverHttpClient, + env: NeonEnv, initial_tenant: uuid.UUID): """ Compare timelines list returned by CLI and directly via API. 
@@ -17,65 +17,65 @@ def helper_compare_timeline_list(pageserver_http_client: ZenithPageserverHttpCli map(lambda t: cast(str, t['timeline_id']), pageserver_http_client.timeline_list(initial_tenant))) - timelines_cli = env.zenith_cli.list_timelines() - assert timelines_cli == env.zenith_cli.list_timelines(initial_tenant) + timelines_cli = env.neon_cli.list_timelines() + assert timelines_cli == env.neon_cli.list_timelines(initial_tenant) cli_timeline_ids = sorted([timeline_id for (_, timeline_id) in timelines_cli]) assert timelines_api == cli_timeline_ids -def test_cli_timeline_list(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_cli_timeline_list(neon_simple_env: NeonEnv): + env = neon_simple_env pageserver_http_client = env.pageserver.http_client() # Initial sanity check helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a branch for us - main_timeline_id = env.zenith_cli.create_branch('test_cli_branch_list_main') + main_timeline_id = env.neon_cli.create_branch('test_cli_branch_list_main') helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a nested branch - nested_timeline_id = env.zenith_cli.create_branch('test_cli_branch_list_nested', - 'test_cli_branch_list_main') + nested_timeline_id = env.neon_cli.create_branch('test_cli_branch_list_nested', + 'test_cli_branch_list_main') helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Check that all new branches are visible via CLI - timelines_cli = [timeline_id for (_, timeline_id) in env.zenith_cli.list_timelines()] + timelines_cli = [timeline_id for (_, timeline_id) in env.neon_cli.list_timelines()] assert main_timeline_id.hex in timelines_cli assert nested_timeline_id.hex in timelines_cli -def helper_compare_tenant_list(pageserver_http_client: ZenithPageserverHttpClient, env: ZenithEnv): +def helper_compare_tenant_list(pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv): tenants = pageserver_http_client.tenant_list() tenants_api = sorted(map(lambda t: cast(str, t['id']), tenants)) - res = env.zenith_cli.list_tenants() + res = env.neon_cli.list_tenants() tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) assert tenants_api == tenants_cli -def test_cli_tenant_list(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_cli_tenant_list(neon_simple_env: NeonEnv): + env = neon_simple_env pageserver_http_client = env.pageserver.http_client() # Initial sanity check helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant - tenant1, _ = env.zenith_cli.create_tenant() + tenant1, _ = env.neon_cli.create_tenant() # check tenant1 appeared helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant - tenant2, _ = env.zenith_cli.create_tenant() + tenant2, _ = env.neon_cli.create_tenant() # check tenant2 appeared helper_compare_tenant_list(pageserver_http_client, env) - res = env.zenith_cli.list_tenants() + res = env.neon_cli.list_tenants() tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) assert env.initial_tenant.hex in tenants @@ -83,18 +83,18 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv): assert tenant2.hex in tenants -def test_cli_tenant_create(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - tenant_id, _ = env.zenith_cli.create_tenant() - timelines = env.zenith_cli.list_timelines(tenant_id) +def test_cli_tenant_create(neon_simple_env: NeonEnv): + env = neon_simple_env + tenant_id, _ = env.neon_cli.create_tenant() 
+ timelines = env.neon_cli.list_timelines(tenant_id) # an initial timeline should be created upon tenant creation assert len(timelines) == 1 assert timelines[0][0] == DEFAULT_BRANCH_NAME -def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() +def test_cli_ipv4_listeners(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() # Connect to sk port on v4 loopback res = requests.get(f'http://127.0.0.1:{env.safekeepers[0].port.http}/v1/status') @@ -108,17 +108,17 @@ def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): # assert res.ok -def test_cli_start_stop(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() +def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() # Stop default ps/sk - env.zenith_cli.pageserver_stop() - env.zenith_cli.safekeeper_stop() + env.neon_cli.pageserver_stop() + env.neon_cli.safekeeper_stop() # Default start - res = env.zenith_cli.raw_cli(["start"]) + res = env.neon_cli.raw_cli(["start"]) res.check_returncode() # Default stop - res = env.zenith_cli.raw_cli(["stop"]) + res = env.neon_cli.raw_cli(["stop"]) res.check_returncode() diff --git a/test_runner/batch_others/test_next_xid.py b/test_runner/batch_others/test_next_xid.py index 1ab1addad3..f8d11a9381 100644 --- a/test_runner/batch_others/test_next_xid.py +++ b/test_runner/batch_others/test_next_xid.py @@ -1,12 +1,12 @@ import time -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder # Test restarting page server, while safekeeper and compute node keep # running. -def test_next_xid(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() +def test_next_xid(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() pg = env.postgres.create_start('main') diff --git a/test_runner/batch_others/test_normal_work.py b/test_runner/batch_others/test_normal_work.py index 87dd2d5e18..aac9685681 100644 --- a/test_runner/batch_others/test_normal_work.py +++ b/test_runner/batch_others/test_normal_work.py @@ -1,9 +1,9 @@ from fixtures.log_helper import log -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient -def check_tenant(env: ZenithEnv, pageserver_http: ZenithPageserverHttpClient): - tenant_id, timeline_id = env.zenith_cli.create_tenant() +def check_tenant(env: NeonEnv, pageserver_http: NeonPageserverHttpClient): + tenant_id, timeline_id = env.neon_cli.create_tenant() pg = env.postgres.create_start('main', tenant_id=tenant_id) # we rely upon autocommit after each statement res_1 = pg.safe_psql_many(queries=[ @@ -26,7 +26,7 @@ def check_tenant(env: ZenithEnv, pageserver_http: ZenithPageserverHttpClient): pageserver_http.timeline_detach(tenant_id, timeline_id) -def test_normal_work(zenith_env_builder: ZenithEnvBuilder): +def test_normal_work(neon_env_builder: NeonEnvBuilder): """ Basic test: * create new tenant with a timeline @@ -40,7 +40,7 @@ def test_normal_work(zenith_env_builder: ZenithEnvBuilder): Repeat check for several tenants/timelines. 
""" - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() for _ in range(3): diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py index fd0cbe26cc..1e96c0a1fa 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -1,4 +1,4 @@ -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log from fixtures.utils import print_gc_result import psycopg2.extras @@ -14,11 +14,11 @@ import psycopg2.extras # just a hint that the page hasn't been modified since that LSN, and the page # server should return the latest page version regardless of the LSN. # -def test_old_request_lsn(zenith_env_builder: ZenithEnvBuilder): +def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_old_request_lsn", "main") + neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_old_request_lsn", "main") pg = env.postgres.create_start('test_old_request_lsn') log.info('postgres is running on test_old_request_lsn branch') diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index d22654ad3e..289eec74c5 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -2,26 +2,26 @@ from typing import Optional from uuid import uuid4, UUID import pytest from fixtures.utils import lsn_from_hex -from fixtures.zenith_fixtures import ( +from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, - ZenithEnv, - ZenithEnvBuilder, - ZenithPageserverHttpClient, - ZenithPageserverApiException, + NeonEnv, + NeonEnvBuilder, + NeonPageserverHttpClient, + NeonPageserverApiException, wait_until, ) # test that we cannot override node id -def test_pageserver_init_node_id(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init() +def test_pageserver_init_node_id(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init() with pytest.raises( Exception, match="node id can only be set during pageserver init and cannot be overridden"): env.pageserver.start(overrides=['--pageserver-config-override=id=10']) -def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): +def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID): client.check_status() # check initial tenant is there @@ -57,11 +57,11 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): assert local_timeline_details['timeline_state'] == 'Loaded' -def test_pageserver_http_get_wal_receiver_not_found(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): + env = neon_simple_env client = env.pageserver.http_client() - tenant_id, timeline_id = env.zenith_cli.create_tenant() + tenant_id, timeline_id = env.neon_cli.create_tenant() empty_response = client.wal_receiver_get(tenant_id, timeline_id) @@ -70,11 +70,11 @@ def test_pageserver_http_get_wal_receiver_not_found(zenith_simple_env: ZenithEnv assert 
empty_response.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running' -def test_pageserver_http_get_wal_receiver_success(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): + env = neon_simple_env client = env.pageserver.http_client() - tenant_id, timeline_id = env.zenith_cli.create_tenant() + tenant_id, timeline_id = env.neon_cli.create_tenant() pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) def expect_updated_msg_lsn(prev_msg_lsn: Optional[int]) -> int: @@ -107,15 +107,15 @@ def test_pageserver_http_get_wal_receiver_success(zenith_simple_env: ZenithEnv): wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(lsn)) -def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_pageserver_http_api_client(neon_simple_env: NeonEnv): + env = neon_simple_env client = env.pageserver.http_client() check_client(client, env.initial_tenant) -def test_pageserver_http_api_client_auth_enabled(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.pageserver_auth_enabled = True - env = zenith_env_builder.init_start() +def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder): + neon_env_builder.pageserver_auth_enabled = True + env = neon_env_builder.init_start() management_token = env.auth_keys.generate_management_token() diff --git a/test_runner/batch_others/test_pageserver_catchup.py b/test_runner/batch_others/test_pageserver_catchup.py index 758b018046..dd24351e17 100644 --- a/test_runner/batch_others/test_pageserver_catchup.py +++ b/test_runner/batch_others/test_pageserver_catchup.py @@ -1,15 +1,15 @@ -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder # Test safekeeper sync and pageserver catch up # while initial compute node is down and pageserver is lagging behind safekeepers. # Ensure that basebackup after restart of all components is correct # and new compute node contains all data. -def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() +def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_pageserver_catchup_while_compute_down') + env.neon_cli.create_branch('test_pageserver_catchup_while_compute_down') # Make shared_buffers large to ensure we won't query pageserver while it is down. pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down', config_lines=['shared_buffers=512MB']) diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/batch_others/test_pageserver_restart.py index 69f5ea85ce..403ff7b305 100644 --- a/test_runner/batch_others/test_pageserver_restart.py +++ b/test_runner/batch_others/test_pageserver_restart.py @@ -1,13 +1,13 @@ -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log # Test restarting page server, while safekeeper and compute node keep # running. 
-def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() +def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_pageserver_restart') + env.neon_cli.create_branch('test_pageserver_restart') pg = env.postgres.create_start('test_pageserver_restart') pg_conn = pg.connect() diff --git a/test_runner/batch_others/test_parallel_copy.py b/test_runner/batch_others/test_parallel_copy.py index a44acecf21..55947fe427 100644 --- a/test_runner/batch_others/test_parallel_copy.py +++ b/test_runner/batch_others/test_parallel_copy.py @@ -1,6 +1,6 @@ from io import BytesIO import asyncio -from fixtures.zenith_fixtures import ZenithEnv, Postgres +from fixtures.neon_fixtures import NeonEnv, Postgres from fixtures.log_helper import log @@ -38,9 +38,9 @@ async def parallel_load_same_table(pg: Postgres, n_parallel: int): # Load data into one table with COPY TO from 5 parallel connections -def test_parallel_copy(zenith_simple_env: ZenithEnv, n_parallel=5): - env = zenith_simple_env - env.zenith_cli.create_branch("test_parallel_copy", "empty") +def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): + env = neon_simple_env + env.neon_cli.create_branch("test_parallel_copy", "empty") pg = env.postgres.create_start('test_parallel_copy') log.info("postgres is running on 'test_parallel_copy' branch") diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/batch_others/test_pitr_gc.py index 1a1562ca5f..161f628429 100644 --- a/test_runner/batch_others/test_pitr_gc.py +++ b/test_runner/batch_others/test_pitr_gc.py @@ -5,20 +5,20 @@ import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.utils import print_gc_result -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder # # Check pitr_interval GC behavior. # Insert some data, run GC and create a branch in the past. 
# -def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder): +def test_pitr_gc(neon_env_builder: NeonEnvBuilder): - zenith_env_builder.num_safekeepers = 1 + neon_env_builder.num_safekeepers = 1 # Set pitr interval such that we need to keep the data - zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" + neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() pgmain = env.postgres.create_start('main') log.info("postgres is running on 'main' branch") @@ -62,7 +62,7 @@ def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder): # Branch at the point where only 100 rows were inserted # It must have been preserved by PITR setting - env.zenith_cli.create_branch('test_pitr_gc_hundred', 'main', ancestor_start_lsn=lsn_a) + env.neon_cli.create_branch('test_pitr_gc_hundred', 'main', ancestor_start_lsn=lsn_a) pg_hundred = env.postgres.create_start('test_pitr_gc_hundred') diff --git a/test_runner/batch_others/test_read_validation.py b/test_runner/batch_others/test_read_validation.py index 9d2248ac89..6b8a154865 100644 --- a/test_runner/batch_others/test_read_validation.py +++ b/test_runner/batch_others/test_read_validation.py @@ -1,12 +1,12 @@ from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log from psycopg2.errors import UndefinedTable from psycopg2.errors import IoError -pytest_plugins = ("fixtures.zenith_fixtures") +pytest_plugins = ("fixtures.neon_fixtures") extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"] @@ -14,9 +14,9 @@ extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"] # # Validation of reading different page versions # -def test_read_validation(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_read_validation", "empty") +def test_read_validation(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_read_validation", "empty") pg = env.postgres.create_start("test_read_validation") log.info("postgres is running on 'test_read_validation' branch") @@ -125,9 +125,9 @@ def test_read_validation(zenith_simple_env: ZenithEnv): log.info("Caught an expected failure: {}".format(e)) -def test_read_validation_neg(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_read_validation_neg", "empty") +def test_read_validation_neg(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_read_validation_neg", "empty") pg = env.postgres.create_start("test_read_validation_neg") log.info("postgres is running on 'test_read_validation_neg' branch") diff --git a/test_runner/batch_others/test_readonly_node.py b/test_runner/batch_others/test_readonly_node.py index 808ee62def..286c756a5e 100644 --- a/test_runner/batch_others/test_readonly_node.py +++ b/test_runner/batch_others/test_readonly_node.py @@ -1,6 +1,6 @@ import pytest from fixtures.log_helper import log -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv # @@ -9,9 +9,9 @@ from fixtures.zenith_fixtures import ZenithEnv # This is very similar to the 'test_branch_behind' test, but instead of # creating branches, creates read-only nodes. 
# -def test_readonly_node(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch('test_readonly_node', 'empty') +def test_readonly_node(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch('test_readonly_node', 'empty') pgmain = env.postgres.create_start('test_readonly_node') log.info("postgres is running on 'test_readonly_node' branch") diff --git a/test_runner/batch_others/test_recovery.py b/test_runner/batch_others/test_recovery.py index eb1747efa5..14d1adf25d 100644 --- a/test_runner/batch_others/test_recovery.py +++ b/test_runner/batch_others/test_recovery.py @@ -4,28 +4,28 @@ import psycopg2.extras import json from ast import Assert from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log # # Test pageserver recovery after crash # -def test_pageserver_recovery(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 1 +def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 # Override default checkpointer settings to run it more often - zenith_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" + neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" - env = zenith_env_builder.init() + env = neon_env_builder.init() # Check if failpoints enables. Otherwise the test doesn't make sense - f = env.zenith_cli.pageserver_enabled_features() + f = env.neon_cli.pageserver_enabled_features() assert "failpoints" in f["features"], "Build pageserver with --features=failpoints option to run this test" - zenith_env_builder.start() + neon_env_builder.start() # Create a branch for us - env.zenith_cli.create_branch("test_pageserver_recovery", "main") + env.neon_cli.create_branch("test_pageserver_recovery", "main") pg = env.postgres.create_start('test_pageserver_recovery') log.info("postgres is running on 'test_pageserver_recovery' branch") diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index e7097e2ef5..bf9717a74a 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -6,7 +6,7 @@ from contextlib import closing from pathlib import Path import time from uuid import UUID -from fixtures.zenith_fixtures import ZenithEnvBuilder, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload +from fixtures.neon_fixtures import NeonEnvBuilder, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload from fixtures.log_helper import log from fixtures.utils import lsn_from_hex, lsn_to_hex import pytest @@ -30,12 +30,12 @@ import pytest # # The tests are done for all types of remote storage pageserver supports. 
@pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3']) -def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, storage_type: str): - # zenith_env_builder.rust_log_override = 'debug' +def test_remote_storage_backup_and_restore(neon_env_builder: NeonEnvBuilder, storage_type: str): + # neon_env_builder.rust_log_override = 'debug' if storage_type == 'local_fs': - zenith_env_builder.enable_local_fs_remote_storage() + neon_env_builder.enable_local_fs_remote_storage() elif storage_type == 'mock_s3': - zenith_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore') + neon_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore') else: raise RuntimeError(f'Unknown storage type: {storage_type}') @@ -43,7 +43,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, data_secret = 'very secret secret' ##### First start, insert secret data and upload it to the remote storage - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() pg = env.postgres.create_start('main') client = env.pageserver.http_client() diff --git a/test_runner/batch_others/test_restart_compute.py b/test_runner/batch_others/test_restart_compute.py index d6e7fd9e0d..d55c0f2bcc 100644 --- a/test_runner/batch_others/test_restart_compute.py +++ b/test_runner/batch_others/test_restart_compute.py @@ -1,7 +1,7 @@ import pytest from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log @@ -9,13 +9,13 @@ from fixtures.log_helper import log # Test restarting and recreating a postgres instance # @pytest.mark.parametrize('with_safekeepers', [False, True]) -def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_safekeepers: bool): - zenith_env_builder.pageserver_auth_enabled = True +def test_restart_compute(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): + neon_env_builder.pageserver_auth_enabled = True if with_safekeepers: - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_restart_compute') + env.neon_cli.create_branch('test_restart_compute') pg = env.postgres.create_start('test_restart_compute') log.info("postgres is running on 'test_restart_compute' branch") diff --git a/test_runner/batch_others/test_subxacts.py b/test_runner/batch_others/test_subxacts.py index bed1c4be63..d06877825e 100644 --- a/test_runner/batch_others/test_subxacts.py +++ b/test_runner/batch_others/test_subxacts.py @@ -1,4 +1,4 @@ -from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.log_helper import log @@ -6,11 +6,11 @@ from fixtures.log_helper import log # # The pg_subxact SLRU is not preserved on restarts, and doesn't need to be # maintained in the pageserver, so subtransactions are not very exciting for -# Zenith. They are included in the commit record though and updated in the +# Neon. They are included in the commit record though and updated in the # CLOG. 
-def test_subxacts(zenith_simple_env: ZenithEnv, test_output_dir): - env = zenith_simple_env - env.zenith_cli.create_branch("test_subxacts", "empty") +def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): + env = neon_simple_env + env.neon_cli.create_branch("test_subxacts", "empty") pg = env.postgres.create_start('test_subxacts') log.info("postgres is running on 'test_subxacts' branch") diff --git a/test_runner/batch_others/test_tenant_conf.py b/test_runner/batch_others/test_tenant_conf.py index d627d8a6ee..d25aad742e 100644 --- a/test_runner/batch_others/test_tenant_conf.py +++ b/test_runner/batch_others/test_tenant_conf.py @@ -3,25 +3,25 @@ from contextlib import closing import pytest import psycopg2.extras -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log -def test_tenant_config(zenith_env_builder: ZenithEnvBuilder): +def test_tenant_config(neon_env_builder: NeonEnvBuilder): # set some non-default global config - zenith_env_builder.pageserver_config_override = ''' + neon_env_builder.pageserver_config_override = ''' page_cache_size=444; wait_lsn_timeout='111 s'; tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() """Test per tenant configuration""" - tenant, _ = env.zenith_cli.create_tenant(conf={ + tenant, _ = env.neon_cli.create_tenant(conf={ 'checkpoint_distance': '20000', 'gc_period': '30sec', }) - env.zenith_cli.create_timeline(f'test_tenant_conf', tenant_id=tenant) + env.neon_cli.create_timeline(f'test_tenant_conf', tenant_id=tenant) pg = env.postgres.create_start( "test_tenant_conf", "main", @@ -66,11 +66,11 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' }.items()) # update the config and ensure that it has changed - env.zenith_cli.config_tenant(tenant_id=tenant, - conf={ - 'checkpoint_distance': '15000', - 'gc_period': '80sec', - }) + env.neon_cli.config_tenant(tenant_id=tenant, + conf={ + 'checkpoint_distance': '15000', + 'gc_period': '80sec', + }) with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index af96cc8524..18ec34b02e 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -10,7 +10,7 @@ from typing import Optional import signal import pytest -from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, Etcd, ZenithPageserverHttpClient, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload, zenith_binpath, pg_distrib_dir +from fixtures.neon_fixtures import PgProtocol, PortDistributor, Postgres, NeonEnvBuilder, Etcd, NeonPageserverHttpClient, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload, neon_binpath, pg_distrib_dir from fixtures.utils import lsn_from_hex @@ -26,7 +26,7 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, http_port: int, broker: Optional[Etcd]): """ - cannot use ZenithPageserver yet because it depends on zenith cli + cannot use NeonPageserver yet because it depends on neon cli which currently lacks support for multiple pageservers """ cmd = [ @@ -106,21 +106,21 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve "needs to replace callmemaybe call 
with better idea how to migrate timelines between pageservers" ) @pytest.mark.parametrize('with_load', ['with_load', 'without_load']) -def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, +def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, port_distributor: PortDistributor, with_load: str): - zenith_env_builder.enable_local_fs_remote_storage() + neon_env_builder.enable_local_fs_remote_storage() - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() # create folder for remote storage mock remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage' - tenant, _ = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) + tenant, _ = env.neon_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) log.info("tenant to relocate %s", tenant) # attach does not download ancestor branches (should it?), just use root branch for now - env.zenith_cli.create_root_branch('test_tenant_relocation', tenant_id=tenant) + env.neon_cli.create_root_branch('test_tenant_relocation', tenant_id=tenant) tenant_pg = env.postgres.create_start(branch_name='test_tenant_relocation', node_name='test_tenant_relocation', @@ -177,16 +177,16 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, new_pageserver_pg_port = port_distributor.get_port() new_pageserver_http_port = port_distributor.get_port() log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port) - pageserver_bin = pathlib.Path(zenith_binpath) / 'pageserver' + pageserver_bin = pathlib.Path(neon_binpath) / 'pageserver' - new_pageserver_http = ZenithPageserverHttpClient(port=new_pageserver_http_port, auth_token=None) + new_pageserver_http = NeonPageserverHttpClient(port=new_pageserver_http_port, auth_token=None) with new_pageserver_helper(new_pageserver_dir, pageserver_bin, remote_storage_mock_path, new_pageserver_pg_port, new_pageserver_http_port, - zenith_env_builder.broker): + neon_env_builder.broker): # call to attach timeline to new pageserver new_pageserver_http.timeline_attach(tenant, timeline) @@ -215,7 +215,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, tenant_pg.stop() - # rewrite zenith cli config to use new pageserver for basebackup to start new compute + # rewrite neon cli config to use new pageserver for basebackup to start new compute cli_config_lines = (env.repo_dir / 'config').read_text().splitlines() cli_config_lines[-2] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'" cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'" @@ -258,7 +258,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, assert not os.path.exists(timeline_to_detach_local_path), f'After detach, local timeline dir {timeline_to_detach_local_path} should be removed' - # bring old pageserver back for clean shutdown via zenith cli + # bring old pageserver back for clean shutdown via neon cli # new pageserver will be shut down by the context manager cli_config_lines = (env.repo_dir / 'config').read_text().splitlines() cli_config_lines[-2] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'" diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py index 9ccb8cf196..8d73d8185c 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/batch_others/test_tenants.py @@ -3,26 +3,26 @@ from datetime import datetime import os import pytest -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures 
import NeonEnvBuilder from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.utils import lsn_to_hex @pytest.mark.parametrize('with_safekeepers', [False, True]) -def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_safekeepers: bool): +def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): if with_safekeepers: - zenith_env_builder.num_safekeepers = 3 + neon_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() """Tests tenants with and without wal acceptors""" - tenant_1, _ = env.zenith_cli.create_tenant() - tenant_2, _ = env.zenith_cli.create_tenant() + tenant_1, _ = env.neon_cli.create_tenant() + tenant_2, _ = env.neon_cli.create_tenant() - env.zenith_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', - tenant_id=tenant_1) - env.zenith_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', - tenant_id=tenant_2) + env.neon_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', + tenant_id=tenant_1) + env.neon_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', + tenant_id=tenant_2) pg_tenant1 = env.postgres.create_start( f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', @@ -44,15 +44,15 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_safekeep assert cur.fetchone() == (5000050000, ) -def test_metrics_normal_work(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 +def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() - tenant_1, _ = env.zenith_cli.create_tenant() - tenant_2, _ = env.zenith_cli.create_tenant() + env = neon_env_builder.init_start() + tenant_1, _ = env.neon_cli.create_tenant() + tenant_2, _ = env.neon_cli.create_tenant() - timeline_1 = env.zenith_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_1) - timeline_2 = env.zenith_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_2) + timeline_1 = env.neon_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_1) + timeline_2 = env.neon_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_2) pg_tenant1 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_1) pg_tenant2 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_2) @@ -72,7 +72,7 @@ def test_metrics_normal_work(zenith_env_builder: ZenithEnvBuilder): collected_metrics[f'safekeeper{sk.id}'] = sk.http_client().get_metrics_str() for name in collected_metrics: - basepath = os.path.join(zenith_env_builder.repo_dir, f'{name}.metrics') + basepath = os.path.join(neon_env_builder.repo_dir, f'{name}.metrics') with open(basepath, 'w') as stdout_f: print(collected_metrics[name], file=stdout_f, flush=True) diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/batch_others/test_tenants_with_remote_storage.py index dbe07c4aba..41506ad920 100644 --- a/test_runner/batch_others/test_tenants_with_remote_storage.py +++ b/test_runner/batch_others/test_tenants_with_remote_storage.py @@ -12,11 +12,11 @@ from uuid import UUID import pytest -from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithEnv, Postgres, wait_for_last_record_lsn, wait_for_upload +from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres, wait_for_last_record_lsn, 
wait_for_upload from fixtures.utils import lsn_from_hex -async def tenant_workload(env: ZenithEnv, pg: Postgres): +async def tenant_workload(env: NeonEnv, pg: Postgres): pageserver_conn = await env.pageserver.connect_async() pg_conn = await pg.connect_async() @@ -35,7 +35,7 @@ async def tenant_workload(env: ZenithEnv, pg: Postgres): assert res == i * 1000 -async def all_tenants_workload(env: ZenithEnv, tenants_pgs): +async def all_tenants_workload(env: NeonEnv, tenants_pgs): workers = [] for tenant, pg in tenants_pgs: worker = tenant_workload(env, pg) @@ -46,28 +46,28 @@ async def all_tenants_workload(env: ZenithEnv, tenants_pgs): @pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3']) -def test_tenants_many(zenith_env_builder: ZenithEnvBuilder, storage_type: str): +def test_tenants_many(neon_env_builder: NeonEnvBuilder, storage_type: str): if storage_type == 'local_fs': - zenith_env_builder.enable_local_fs_remote_storage() + neon_env_builder.enable_local_fs_remote_storage() elif storage_type == 'mock_s3': - zenith_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore') + neon_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore') else: raise RuntimeError(f'Unknown storage type: {storage_type}') - zenith_env_builder.enable_local_fs_remote_storage() + neon_env_builder.enable_local_fs_remote_storage() - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() tenants_pgs = [] for i in range(1, 5): # Use a tiny checkpoint distance, to create a lot of layers quickly - tenant, _ = env.zenith_cli.create_tenant( + tenant, _ = env.neon_cli.create_tenant( conf={ 'checkpoint_distance': '5000000', }) - env.zenith_cli.create_timeline(f'test_tenants_many', tenant_id=tenant) + env.neon_cli.create_timeline(f'test_tenants_many', tenant_id=tenant) pg = env.postgres.create_start( f'test_tenants_many', diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index d43e793df8..70dbe8ac4a 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -1,15 +1,15 @@ from contextlib import closing import psycopg2.extras import psycopg2.errors -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres, assert_local +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_local from fixtures.log_helper import log import time -def test_timeline_size(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_timeline_size(neon_simple_env: NeonEnv): + env = neon_simple_env # Branch at the point where only 100 rows were inserted - new_timeline_id = env.zenith_cli.create_branch('test_timeline_size', 'empty') + new_timeline_id = env.neon_cli.create_branch('test_timeline_size', 'empty') client = env.pageserver.http_client() timeline_details = assert_local(client, env.initial_tenant, new_timeline_id) @@ -69,9 +69,9 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 time.sleep(polling_interval) -def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() - new_timeline_id = env.zenith_cli.create_branch('test_timeline_size_quota') +def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + new_timeline_id = env.neon_cli.create_branch('test_timeline_size_quota') client = env.pageserver.http_client() res = assert_local(client, env.initial_tenant, new_timeline_id) @@ 
-86,7 +86,7 @@ def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): with closing(pgmain.connect()) as conn: with conn.cursor() as cur: - cur.execute("CREATE EXTENSION neon") # TODO move it to zenith_fixtures? + cur.execute("CREATE EXTENSION neon") # TODO move it to neon_fixtures? cur.execute("CREATE TABLE foo (t text)") diff --git a/test_runner/batch_others/test_twophase.py b/test_runner/batch_others/test_twophase.py index 4afdc7e0be..04e3d0b7bc 100644 --- a/test_runner/batch_others/test_twophase.py +++ b/test_runner/batch_others/test_twophase.py @@ -1,15 +1,15 @@ import os -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log # # Test branching, when a transaction is in prepared state # -def test_twophase(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_twophase", "empty") +def test_twophase(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_twophase", "empty") pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5']) log.info("postgres is running on 'test_twophase' branch") @@ -55,7 +55,7 @@ def test_twophase(zenith_simple_env: ZenithEnv): assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - env.zenith_cli.create_branch("test_twophase_prepared", "test_twophase") + env.neon_cli.create_branch("test_twophase_prepared", "test_twophase") # Start compute on the new branch pg2 = env.postgres.create_start( diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/batch_others/test_vm_bits.py index 8a14959eff..29b55f5b8c 100644 --- a/test_runner/batch_others/test_vm_bits.py +++ b/test_runner/batch_others/test_vm_bits.py @@ -1,4 +1,4 @@ -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log @@ -6,10 +6,10 @@ from fixtures.log_helper import log # Test that the VM bit is cleared correctly at a HEAP_DELETE and # HEAP_UPDATE record. 
# -def test_vm_bit_clear(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_vm_bit_clear(neon_simple_env: NeonEnv): + env = neon_simple_env - env.zenith_cli.create_branch("test_vm_bit_clear", "empty") + env.neon_cli.create_branch("test_vm_bit_clear", "empty") pg = env.postgres.create_start('test_vm_bit_clear') log.info("postgres is running on 'test_vm_bit_clear' branch") @@ -33,7 +33,7 @@ def test_vm_bit_clear(zenith_simple_env: ZenithEnv): cur.execute('UPDATE vmtest_update SET id = 5000 WHERE id = 1') # Branch at this point, to test that later - env.zenith_cli.create_branch("test_vm_bit_clear_new", "test_vm_bit_clear") + env.neon_cli.create_branch("test_vm_bit_clear_new", "test_vm_bit_clear") # Clear the buffer cache, to force the VM page to be re-fetched from # the page server diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 007641417e..1932c3e450 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -12,7 +12,7 @@ from contextlib import closing from dataclasses import dataclass, field from multiprocessing import Process, Value from pathlib import Path -from fixtures.zenith_fixtures import PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol +from fixtures.neon_fixtures import PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, neon_binpath, PgProtocol from fixtures.utils import get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex from fixtures.log_helper import log from typing import List, Optional, Any @@ -29,9 +29,9 @@ class TimelineMetrics: # Run page server and multiple acceptors, and multiple compute nodes running # against different timelines. -def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() +def test_many_timelines(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() n_timelines = 3 @@ -39,15 +39,15 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): "test_safekeepers_many_timelines_{}".format(tlin) for tlin in range(n_timelines) ] # pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418') - # that's not really human readable, so the branch names are introduced in Zenith CLI. - # Zenith CLI stores its branch <-> timeline mapping in its internals, + # that's not really human readable, so the branch names are introduced in Neon CLI. + # Neon CLI stores its branch <-> timeline mapping in its internals, # but we need this to collect metrics from other servers, related to the timeline. branch_names_to_timeline_ids = {} # start postgres on each timeline pgs = [] for branch_name in branch_names: - new_timeline_id = env.zenith_cli.create_branch(branch_name) + new_timeline_id = env.neon_cli.create_branch(branch_name) pgs.append(env.postgres.create_start(branch_name)) branch_names_to_timeline_ids[branch_name] = new_timeline_id @@ -93,14 +93,14 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): # the compute node, which only happens after a consensus of safekeepers # has confirmed the transaction. We assume majority consensus here. 
assert (2 * sum(m.last_record_lsn <= lsn - for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + for lsn in m.flush_lsns) > neon_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" assert (2 * sum(m.last_record_lsn <= lsn - for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + for lsn in m.commit_lsns) > neon_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" timeline_metrics.append(m) log.info(f"{message}: {timeline_metrics}") return timeline_metrics - # TODO: https://github.com/zenithdb/zenith/issues/809 + # TODO: https://github.com/neondatabase/neon/issues/809 # collect_metrics("before CREATE TABLE") # Do everything in different loops to have actions on different timelines @@ -168,15 +168,15 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): # Check that dead minority doesn't prevent the commits: execute insert n_inserts # times, with fault_probability chance of getting a wal acceptor down or up # along the way. 2 of 3 are always alive, so the work keeps going. -def test_restarts(zenith_env_builder: ZenithEnvBuilder): +def test_restarts(neon_env_builder: NeonEnvBuilder): fault_probability = 0.01 n_inserts = 1000 n_acceptors = 3 - zenith_env_builder.num_safekeepers = n_acceptors - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = n_acceptors + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_safekeepers_restarts') + env.neon_cli.create_branch('test_safekeepers_restarts') pg = env.postgres.create_start('test_safekeepers_restarts') # we rely upon autocommit after each statement @@ -209,11 +209,11 @@ def delayed_safekeeper_start(wa): # When majority of acceptors is offline, commits are expected to be frozen -def test_unavailability(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 2 - env = zenith_env_builder.init_start() +def test_unavailability(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 2 + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_safekeepers_unavailability') + env.neon_cli.create_branch('test_safekeepers_unavailability') pg = env.postgres.create_start('test_safekeepers_unavailability') # we rely upon autocommit after each statement @@ -279,12 +279,12 @@ def stop_value(): # do inserts while concurrently getting up/down subsets of acceptors -def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value): +def test_race_conditions(neon_env_builder: NeonEnvBuilder, stop_value): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_safekeepers_race_conditions') + env.neon_cli.create_branch('test_safekeepers_race_conditions') pg = env.postgres.create_start('test_safekeepers_race_conditions') # we rely upon autocommit after each statement @@ -308,16 +308,16 @@ def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value): # Test that safekeepers push their info to the broker and learn peer status from it -def test_broker(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - 
zenith_env_builder.enable_local_fs_remote_storage() - env = zenith_env_builder.init_start() +def test_broker(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_local_fs_remote_storage() + env = neon_env_builder.init_start() - env.zenith_cli.create_branch("test_broker", "main") + env.neon_cli.create_branch("test_broker", "main") pg = env.postgres.create_start('test_broker') pg.safe_psql("CREATE TABLE t(key int primary key, value text)") - # learn zenith timeline from compute + # learn neon timeline from compute tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] @@ -349,13 +349,13 @@ def test_broker(zenith_env_builder: ZenithEnvBuilder): # Test that old WAL consumed by peers and pageserver is removed from safekeepers. -def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 2 +def test_wal_removal(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 2 # to advance remote_consistent_llsn - zenith_env_builder.enable_local_fs_remote_storage() - env = zenith_env_builder.init_start() + neon_env_builder.enable_local_fs_remote_storage() + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_safekeepers_wal_removal') + env.neon_cli.create_branch('test_safekeepers_wal_removal') pg = env.postgres.create_start('test_safekeepers_wal_removal') with closing(pg.connect()) as conn: @@ -412,22 +412,22 @@ def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end): @pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs']) -def test_wal_backup(zenith_env_builder: ZenithEnvBuilder, storage_type: str): - zenith_env_builder.num_safekeepers = 3 +def test_wal_backup(neon_env_builder: NeonEnvBuilder, storage_type: str): + neon_env_builder.num_safekeepers = 3 if storage_type == 'local_fs': - zenith_env_builder.enable_local_fs_remote_storage() + neon_env_builder.enable_local_fs_remote_storage() elif storage_type == 'mock_s3': - zenith_env_builder.enable_s3_mock_remote_storage('test_safekeepers_wal_backup') + neon_env_builder.enable_s3_mock_remote_storage('test_safekeepers_wal_backup') else: raise RuntimeError(f'Unknown storage type: {storage_type}') - zenith_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER + neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_safekeepers_wal_backup') + env.neon_cli.create_branch('test_safekeepers_wal_backup') pg = env.postgres.create_start('test_safekeepers_wal_backup') - # learn zenith timeline from compute + # learn neon timeline from compute tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] @@ -460,7 +460,7 @@ def test_wal_backup(zenith_env_builder: ZenithEnvBuilder, storage_type: str): class ProposerPostgres(PgProtocol): - """Object for running postgres without ZenithEnv""" + """Object for running postgres without NeonEnv""" def __init__(self, pgdata_dir: str, pg_bin, @@ -542,14 +542,14 @@ class ProposerPostgres(PgProtocol): # insert wal in all safekeepers and run sync on proposer -def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, +def test_sync_safekeepers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor): # We don't really need the full environment for this test, just the # safekeepers would be enough. 
- zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() timeline_id = uuid.uuid4() tenant_id = uuid.uuid4() @@ -596,17 +596,17 @@ def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, assert all(lsn_after_sync == lsn for lsn in lsn_after_append) -def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() +def test_timeline_status(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_timeline_status') + env.neon_cli.create_branch('test_timeline_status') pg = env.postgres.create_start('test_timeline_status') wa = env.safekeepers[0] wa_http_cli = wa.http_client() wa_http_cli.check_status() - # learn zenith timeline from compute + # learn neon timeline from compute tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] @@ -642,7 +642,7 @@ class SafekeeperEnv: peer_port=self.port_distributor.get_port()) self.pg_bin = pg_bin self.num_safekeepers = num_safekeepers - self.bin_safekeeper = os.path.join(str(zenith_binpath), 'safekeeper') + self.bin_safekeeper = os.path.join(str(neon_binpath), 'safekeeper') self.safekeepers: Optional[List[subprocess.CompletedProcess[Any]]] = None self.postgres: Optional[ProposerPostgres] = None self.tenant_id: Optional[uuid.UUID] = None @@ -753,8 +753,8 @@ def test_safekeeper_without_pageserver(test_output_dir: str, assert res == 5050 -def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): - def safekeepers_guc(env: ZenithEnv, sk_names: List[int]) -> str: +def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): + def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str: return ','.join([f'localhost:{sk.port.pg}' for sk in env.safekeepers if sk.id in sk_names]) def execute_payload(pg: Postgres): @@ -781,9 +781,9 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): except Exception as e: log.info(f"Safekeeper {sk.id} status error: {e}") - zenith_env_builder.num_safekeepers = 4 - env = zenith_env_builder.init_start() - env.zenith_cli.create_branch('test_replace_safekeeper') + neon_env_builder.num_safekeepers = 4 + env = neon_env_builder.init_start() + env.neon_cli.create_branch('test_replace_safekeeper') log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() @@ -792,7 +792,7 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): pg.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) pg.start() - # learn zenith timeline from compute + # learn neon timeline from compute tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] @@ -844,7 +844,7 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): # We have `wal_keep_size=0`, so postgres should trim WAL once it's broadcasted # to all safekeepers. This test checks that compute WAL can fit into small number # of WAL segments. 
-def test_wal_deleted_after_broadcast(zenith_env_builder: ZenithEnvBuilder): +def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): # used to calculate delta in collect_stats last_lsn = .0 @@ -866,10 +866,10 @@ def test_wal_deleted_after_broadcast(zenith_env_builder: ZenithEnvBuilder): def generate_wal(cur): cur.execute("INSERT INTO t SELECT generate_series(1,300000), 'payload'") - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_wal_deleted_after_broadcast') + env.neon_cli.create_branch('test_wal_deleted_after_broadcast') # Adjust checkpoint config to prevent keeping old WAL segments pg = env.postgres.create_start( 'test_wal_deleted_after_broadcast', @@ -894,18 +894,18 @@ def test_wal_deleted_after_broadcast(zenith_env_builder: ZenithEnvBuilder): assert wal_size_after_checkpoint < 16 * 2.5 -def test_delete_force(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init_start() +def test_delete_force(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() # Create two tenants: one will be deleted, other should be preserved. tenant_id = env.initial_tenant.hex - timeline_id_1 = env.zenith_cli.create_branch('br1').hex # Active, delete explicitly - timeline_id_2 = env.zenith_cli.create_branch('br2').hex # Inactive, delete explicitly - timeline_id_3 = env.zenith_cli.create_branch('br3').hex # Active, delete with the tenant - timeline_id_4 = env.zenith_cli.create_branch('br4').hex # Inactive, delete with the tenant + timeline_id_1 = env.neon_cli.create_branch('br1').hex # Active, delete explicitly + timeline_id_2 = env.neon_cli.create_branch('br2').hex # Inactive, delete explicitly + timeline_id_3 = env.neon_cli.create_branch('br3').hex # Active, delete with the tenant + timeline_id_4 = env.neon_cli.create_branch('br4').hex # Inactive, delete with the tenant - tenant_id_other_uuid, timeline_id_other_uuid = env.zenith_cli.create_tenant() + tenant_id_other_uuid, timeline_id_other_uuid = env.neon_cli.create_tenant() tenant_id_other = tenant_id_other_uuid.hex timeline_id_other = timeline_id_other_uuid.hex diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index bd3b3027c5..c0967ef6c0 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -4,7 +4,7 @@ import asyncpg import random import time -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres, Safekeeper +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper from fixtures.log_helper import getLogger from fixtures.utils import lsn_from_hex, lsn_to_hex from typing import List @@ -136,7 +136,7 @@ async def wait_for_lsn(safekeeper: Safekeeper, # On each iteration 1 acceptor is stopped, and 2 others should allow # background workers execute transactions. In the end, state should remain # consistent. 
-async def run_restarts_under_load(env: ZenithEnv, +async def run_restarts_under_load(env: NeonEnv, pg: Postgres, acceptors: List[Safekeeper], n_workers=10, @@ -202,11 +202,11 @@ async def run_restarts_under_load(env: ZenithEnv, # Restart acceptors one by one, while executing and validating bank transactions -def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() +def test_restarts_under_load(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_safekeepers_restarts_under_load') + env.neon_cli.create_branch('test_safekeepers_restarts_under_load') # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long pg = env.postgres.create_start('test_safekeepers_restarts_under_load', config_lines=['max_replication_write_lag=1MB']) @@ -217,11 +217,11 @@ def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder): # Restart acceptors one by one and test that everything is working as expected # when checkpoins are triggered frequently by max_wal_size=32MB. Because we have # wal_keep_size=0, there will be aggressive WAL segments recycling. -def test_restarts_frequent_checkpoints(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() +def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_restarts_frequent_checkpoints') + env.neon_cli.create_branch('test_restarts_frequent_checkpoints') # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long pg = env.postgres.create_start('test_restarts_frequent_checkpoints', config_lines=[ diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index 85c6e776c5..8ea64d4fce 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -1,26 +1,26 @@ import os import subprocess -from fixtures.zenith_fixtures import (ZenithEnvBuilder, - VanillaPostgres, - PortDistributor, - PgBin, - base_dir, - vanilla_pg, - pg_distrib_dir) +from fixtures.neon_fixtures import (NeonEnvBuilder, + VanillaPostgres, + PortDistributor, + PgBin, + base_dir, + vanilla_pg, + pg_distrib_dir) from fixtures.log_helper import log -def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, +def test_wal_restore(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir, port_distributor: PortDistributor): - env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_restore") + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_wal_restore") pg = env.postgres.create_start('test_wal_restore') pg.safe_psql("create table t as select generate_series(1,300000)") tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - env.zenith_cli.pageserver_stop() + env.neon_cli.pageserver_stop() port = port_distributor.get_port() data_dir = os.path.join(test_output_dir, 'pgsql.restored') with VanillaPostgres(data_dir, PgBin(test_output_dir), port) as restored: diff --git a/test_runner/batch_pg_regress/test_isolation.py b/test_runner/batch_pg_regress/test_isolation.py index 7c99c04fe3..936b31298e 100644 --- a/test_runner/batch_pg_regress/test_isolation.py +++ 
b/test_runner/batch_pg_regress/test_isolation.py @@ -1,16 +1,16 @@ import os import pytest from fixtures.utils import mkdir_if_needed -from fixtures.zenith_fixtures import ZenithEnv, base_dir, pg_distrib_dir +from fixtures.neon_fixtures import NeonEnv, base_dir, pg_distrib_dir # The isolation tests run for a long time, especially in debug mode, # so use a larger-than-default timeout. @pytest.mark.timeout(1800) -def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys): - env = zenith_simple_env +def test_isolation(neon_simple_env: NeonEnv, test_output_dir, pg_bin, capsys): + env = neon_simple_env - env.zenith_cli.create_branch("test_isolation", "empty") + env.neon_cli.create_branch("test_isolation", "empty") # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them pg = env.postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100']) diff --git a/test_runner/batch_pg_regress/test_zenith_regress.py b/test_runner/batch_pg_regress/test_neon_regress.py similarity index 75% rename from test_runner/batch_pg_regress/test_zenith_regress.py rename to test_runner/batch_pg_regress/test_neon_regress.py index 2b57137d16..de3f9705a0 100644 --- a/test_runner/batch_pg_regress/test_zenith_regress.py +++ b/test_runner/batch_pg_regress/test_neon_regress.py @@ -1,19 +1,19 @@ import os from fixtures.utils import mkdir_if_needed -from fixtures.zenith_fixtures import (ZenithEnv, - check_restored_datadir_content, - base_dir, - pg_distrib_dir) +from fixtures.neon_fixtures import (NeonEnv, + check_restored_datadir_content, + base_dir, + pg_distrib_dir) from fixtures.log_helper import log -def test_zenith_regress(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys): - env = zenith_simple_env +def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir, pg_bin, capsys): + env = neon_simple_env - env.zenith_cli.create_branch("test_zenith_regress", "empty") + env.neon_cli.create_branch("test_neon_regress", "empty") # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start('test_zenith_regress') + pg = env.postgres.create_start('test_neon_regress') pg.safe_psql('CREATE DATABASE regression') # Create some local directories for pg_regress to run in. @@ -22,9 +22,9 @@ def test_zenith_regress(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, c mkdir_if_needed(os.path.join(runpath, 'testtablespace')) # Compute all the file locations that pg_regress will need. 
- # This test runs zenith specific tests + # This test runs neon specific tests build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress') - src_path = os.path.join(base_dir, 'test_runner/zenith_regress') + src_path = os.path.join(base_dir, 'test_runner/neon_regress') bindir = os.path.join(pg_distrib_dir, 'bin') schedule = os.path.join(src_path, 'parallel_schedule') pg_regress = os.path.join(build_path, 'pg_regress') diff --git a/test_runner/batch_pg_regress/test_pg_regress.py b/test_runner/batch_pg_regress/test_pg_regress.py index be7776113a..fb71d31170 100644 --- a/test_runner/batch_pg_regress/test_pg_regress.py +++ b/test_runner/batch_pg_regress/test_pg_regress.py @@ -1,16 +1,16 @@ import os import pytest from fixtures.utils import mkdir_if_needed -from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content, base_dir, pg_distrib_dir +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content, base_dir, pg_distrib_dir # The pg_regress tests run for a long time, especially in debug mode, # so use a larger-than-default timeout. @pytest.mark.timeout(1800) -def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin, capsys): - env = zenith_simple_env +def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: str, pg_bin, capsys): + env = neon_simple_env - env.zenith_cli.create_branch("test_pg_regress", "empty") + env.neon_cli.create_branch("test_pg_regress", "empty") # Connect to postgres and create a database called "regression". pg = env.postgres.create_start('test_pg_regress') pg.safe_psql('CREATE DATABASE regression') diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 59e415e3a8..9569ff5674 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,5 +1,5 @@ pytest_plugins = ( - "fixtures.zenith_fixtures", + "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.compare_fixtures", "fixtures.slow", diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 75fece6818..3a679cc705 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -25,9 +25,9 @@ To use, declare the 'zenbenchmark' fixture in the test function. Run the bencmark, and then record the result by calling zenbenchmark.record. For example: import timeit -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv -def test_mybench(zenith_simple_env: env, zenbenchmark): +def test_mybench(neon_simple_env: env, zenbenchmark): # Initialize the test ... @@ -142,7 +142,7 @@ class MetricReport(str, enum.Enum): # str is a hack to make it json serializabl LOWER_IS_BETTER = 'lower_is_better' -class ZenithBenchmarker: +class NeonBenchmarker: """ An object for recording benchmark results. This is created for each test function by the zenbenchmark fixture @@ -163,7 +163,7 @@ class ZenithBenchmarker: Record a benchmark result. """ # just to namespace the value - name = f"zenith_benchmarker_{metric_name}" + name = f"neon_benchmarker_{metric_name}" self.property_recorder( name, { @@ -289,12 +289,12 @@ class ZenithBenchmarker: @pytest.fixture(scope="function") -def zenbenchmark(record_property) -> Iterator[ZenithBenchmarker]: +def zenbenchmark(record_property) -> Iterator[NeonBenchmarker]: """ This is a python decorator for benchmark fixtures. It contains functions for recording measurements, and prints them out at the end. 
""" - benchmarker = ZenithBenchmarker(record_property) + benchmarker = NeonBenchmarker(record_property) yield benchmarker diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index c61bc6d81f..b04a038a50 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -2,8 +2,8 @@ import pytest from contextlib import contextmanager from abc import ABC, abstractmethod -from fixtures.zenith_fixtures import PgBin, PgProtocol, VanillaPostgres, RemotePostgres, ZenithEnv -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker +from fixtures.neon_fixtures import PgBin, PgProtocol, VanillaPostgres, RemotePostgres, NeonEnv +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker # Type-related stuff from typing import Iterator @@ -12,7 +12,7 @@ from typing import Iterator class PgCompare(ABC): """Common interface of all postgres implementations, useful for benchmarks. - This class is a helper class for the zenith_with_baseline fixture. See its documentation + This class is a helper class for the neon_with_baseline fixture. See its documentation for more details. """ @property @@ -26,7 +26,7 @@ class PgCompare(ABC): pass @property - def zenbenchmark(self) -> ZenithBenchmarker: + def zenbenchmark(self) -> NeonBenchmarker: pass @abstractmethod @@ -52,19 +52,19 @@ class PgCompare(ABC): pass -class ZenithCompare(PgCompare): - """PgCompare interface for the zenith stack.""" +class NeonCompare(PgCompare): + """PgCompare interface for the neon stack.""" def __init__(self, - zenbenchmark: ZenithBenchmarker, - zenith_simple_env: ZenithEnv, + zenbenchmark: NeonBenchmarker, + neon_simple_env: NeonEnv, pg_bin: PgBin, branch_name): - self.env = zenith_simple_env + self.env = neon_simple_env self._zenbenchmark = zenbenchmark self._pg_bin = pg_bin # We only use one branch and one timeline - self.env.zenith_cli.create_branch(branch_name, 'empty') + self.env.neon_cli.create_branch(branch_name, 'empty') self._pg = self.env.postgres.create_start(branch_name) self.timeline = self.pg.safe_psql("SHOW neon.timeline_id")[0][0] @@ -221,9 +221,9 @@ class RemoteCompare(PgCompare): @pytest.fixture(scope='function') -def zenith_compare(request, zenbenchmark, pg_bin, zenith_simple_env) -> ZenithCompare: +def neon_compare(request, zenbenchmark, pg_bin, neon_simple_env) -> NeonCompare: branch_name = request.node.name - return ZenithCompare(zenbenchmark, zenith_simple_env, pg_bin, branch_name) + return NeonCompare(zenbenchmark, neon_simple_env, pg_bin, branch_name) @pytest.fixture(scope='function') @@ -236,13 +236,13 @@ def remote_compare(zenbenchmark, remote_pg) -> RemoteCompare: return RemoteCompare(zenbenchmark, remote_pg) -@pytest.fixture(params=["vanilla_compare", "zenith_compare"], ids=["vanilla", "zenith"]) -def zenith_with_baseline(request) -> PgCompare: - """Parameterized fixture that helps compare zenith against vanilla postgres. +@pytest.fixture(params=["vanilla_compare", "neon_compare"], ids=["vanilla", "neon"]) +def neon_with_baseline(request) -> PgCompare: + """Parameterized fixture that helps compare neon against vanilla postgres. A test that uses this fixture turns into a parameterized test that runs against: 1. A vanilla postgres instance - 2. A simple zenith env (see zenith_simple_env) + 2. A simple neon env (see neon_simple_env) 3. Possibly other postgres protocol implementations. 
The main goal of this fixture is to make it easier for people to read and write @@ -254,7 +254,7 @@ def zenith_with_baseline(request) -> PgCompare: of that. If a test requires some one-off special implementation-specific logic, use of - isinstance(zenith_with_baseline, ZenithCompare) is encouraged. Though if that + isinstance(neon_with_baseline, NeonCompare) is encouraged. Though if that implementation-specific logic is widely useful across multiple tests, it might make sense to add methods to the PgCompare class. """ diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/neon_fixtures.py similarity index 92% rename from test_runner/fixtures/zenith_fixtures.py rename to test_runner/fixtures/neon_fixtures.py index 37bc5fe541..2e58ad6ea5 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -81,7 +81,7 @@ def pytest_addoption(parser): # These are set in pytest_configure() base_dir = "" -zenith_binpath = "" +neon_binpath = "" pg_distrib_dir = "" top_output_dir = "" @@ -100,7 +100,7 @@ def check_interferring_processes(config): # result of the test. # NOTE this shows as an internal pytest error, there might be a better way raise Exception( - 'Found interfering processes running. Stop all Zenith pageservers, nodes, safekeepers, as well as stand-alone Postgres.' + 'Found interfering processes running. Stop all Neon pageservers, nodes, safekeepers, as well as stand-alone Postgres.' ) @@ -146,25 +146,25 @@ def pytest_configure(config): raise Exception('postgres not found at "{}"'.format(pg_distrib_dir)) if os.getenv("REMOTE_ENV"): - # we are in remote env and do not have zenith binaries locally + # we are in remote env and do not have neon binaries locally # this is the case for benchmarks run on self-hosted runner return - # Find the zenith binaries. - global zenith_binpath - env_zenith_bin = os.environ.get('ZENITH_BIN') - if env_zenith_bin: - zenith_binpath = env_zenith_bin + # Find the neon binaries. + global neon_binpath + env_neon_bin = os.environ.get('ZENITH_BIN') + if env_neon_bin: + neon_binpath = env_neon_bin else: - zenith_binpath = os.path.join(base_dir, 'target/debug') - log.info(f'zenith_binpath is {zenith_binpath}') - if not os.path.exists(os.path.join(zenith_binpath, 'pageserver')): - raise Exception('zenith binaries not found at "{}"'.format(zenith_binpath)) + neon_binpath = os.path.join(base_dir, 'target/debug') + log.info(f'neon_binpath is {neon_binpath}') + if not os.path.exists(os.path.join(neon_binpath, 'pageserver')): + raise Exception('neon binaries not found at "{}"'.format(neon_binpath)) def profiling_supported(): """Return True if the pageserver was compiled with the 'profiling' feature """ - bin_pageserver = os.path.join(str(zenith_binpath), 'pageserver') + bin_pageserver = os.path.join(str(neon_binpath), 'pageserver') res = subprocess.run([bin_pageserver, '--version'], check=True, universal_newlines=True, @@ -223,7 +223,7 @@ def can_bind(host: str, port: int) -> bool: # TODO: The pageserver and safekeepers don't use SO_REUSEADDR at the # moment. If that changes, we should use start using SO_REUSEADDR here # too, to allow reusing ports more quickly. 
- # See https://github.com/zenithdb/zenith/issues/801 + # See https://github.com/neondatabase/neon/issues/801 #sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) try: @@ -479,12 +479,12 @@ class RemoteStorageUsers(Flag): SAFEKEEPER = auto() -class ZenithEnvBuilder: +class NeonEnvBuilder: """ - Builder object to create a Zenith runtime environment + Builder object to create a Neon runtime environment - You should use the `zenith_env_builder` or `zenith_simple_env` pytest - fixture to create the ZenithEnv object. That way, the repository is + You should use the `neon_env_builder` or `neon_simple_env` pytest + fixture to create the NeonEnv object. That way, the repository is created in the right directory, based on the test name, and it's properly cleaned up after the test has finished. """ @@ -511,18 +511,18 @@ class ZenithEnvBuilder: self.num_safekeepers = num_safekeepers self.pageserver_auth_enabled = pageserver_auth_enabled self.default_branch_name = default_branch_name - self.env: Optional[ZenithEnv] = None + self.env: Optional[NeonEnv] = None - def init(self) -> ZenithEnv: + def init(self) -> NeonEnv: # Cannot create more than one environment from one builder assert self.env is None, "environment already initialized" - self.env = ZenithEnv(self) + self.env = NeonEnv(self) return self.env def start(self): self.env.start() - def init_start(self) -> ZenithEnv: + def init_start(self) -> NeonEnv: env = self.init() self.start() return env @@ -571,12 +571,12 @@ class ZenithEnvBuilder: self.env.pageserver.stop(immediate=True) -class ZenithEnv: +class NeonEnv: """ - An object representing the Zenith runtime environment. It consists of + An object representing the Neon runtime environment. It consists of the page server, 0-N safekeepers, and the compute nodes. - ZenithEnv contains functions for stopping/starting nodes in the + NeonEnv contains functions for stopping/starting nodes in the environment, checking their status, creating tenants, connecting to the nodes, creating and destroying compute nodes, etc. The page server and the safekeepers are considered fixed in the environment, you cannot @@ -584,7 +584,7 @@ class ZenithEnv: likely change in the future, as we start supporting multiple page servers and adding/removing safekeepers on the fly). - Some notable functions and fields in ZenithEnv: + Some notable functions and fields in NeonEnv: postgres - A factory object for creating postgres compute nodes. @@ -598,24 +598,24 @@ class ZenithEnv: initial_tenant - tenant ID of the initial tenant created in the repository - zenith_cli - can be used to run the 'zenith' CLI tool + neon_cli - can be used to run the 'neon' CLI tool create_tenant() - initializes a new tenant in the page server, returns the tenant id """ - def __init__(self, config: ZenithEnvBuilder): + def __init__(self, config: NeonEnvBuilder): self.repo_dir = config.repo_dir self.rust_log_override = config.rust_log_override self.port_distributor = config.port_distributor self.s3_mock_server = config.mock_s3_server - self.zenith_cli = ZenithCli(env=self) + self.neon_cli = NeonCli(env=self) self.postgres = PostgresFactory(self) self.safekeepers: List[Safekeeper] = [] self.broker = config.broker self.remote_storage = config.remote_storage self.remote_storage_users = config.remote_storage_users - # generate initial tenant ID here instead of letting 'zenith init' generate it, + # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. 
self.initial_tenant = uuid.uuid4() @@ -645,10 +645,10 @@ class ZenithEnv: auth_type = '{pageserver_auth_type}' """) - # Create a corresponding ZenithPageserver object - self.pageserver = ZenithPageserver(self, - port=pageserver_port, - config_override=config.pageserver_config_override) + # Create a corresponding NeonPageserver object + self.pageserver = NeonPageserver(self, + port=pageserver_port, + config_override=config.pageserver_config_override) # Create config and a Safekeeper object for each safekeeper for i in range(1, config.num_safekeepers + 1): @@ -672,7 +672,7 @@ class ZenithEnv: self.safekeepers.append(safekeeper) log.info(f"Config: {toml}") - self.zenith_cli.init(toml) + self.neon_cli.init(toml) def start(self): # Start up broker, pageserver and all safekeepers @@ -697,10 +697,10 @@ class ZenithEnv: def _shared_simple_env(request: Any, port_distributor: PortDistributor, mock_s3_server: MockS3Server, - default_broker: Etcd) -> Iterator[ZenithEnv]: + default_broker: Etcd) -> Iterator[NeonEnv]: """ - # Internal fixture backing the `zenith_simple_env` fixture. If TEST_SHARED_FIXTURES - is set, this is shared by all tests using `zenith_simple_env`. + # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES + is set, this is shared by all tests using `neon_simple_env`. """ if os.environ.get('TEST_SHARED_FIXTURES') is None: @@ -711,23 +711,23 @@ def _shared_simple_env(request: Any, repo_dir = os.path.join(str(top_output_dir), "shared_repo") shutil.rmtree(repo_dir, ignore_errors=True) - with ZenithEnvBuilder(Path(repo_dir), port_distributor, default_broker, - mock_s3_server) as builder: + with NeonEnvBuilder(Path(repo_dir), port_distributor, default_broker, + mock_s3_server) as builder: env = builder.init_start() # For convenience in tests, create a branch from the freshly-initialized cluster. - env.zenith_cli.create_branch('empty', ancestor_branch_name=DEFAULT_BRANCH_NAME) + env.neon_cli.create_branch('empty', ancestor_branch_name=DEFAULT_BRANCH_NAME) yield env @pytest.fixture(scope='function') -def zenith_simple_env(_shared_simple_env: ZenithEnv) -> Iterator[ZenithEnv]: +def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: """ - Simple Zenith environment, with no authentication and no safekeepers. + Simple Neon environment, with no authentication and no safekeepers. If TEST_SHARED_FIXTURES environment variable is set, we reuse the same - environment for all tests that use 'zenith_simple_env', keeping the + environment for all tests that use 'neon_simple_env', keeping the page server and safekeepers running. Any compute nodes are stopped after each the test, however. """ @@ -737,17 +737,17 @@ def zenith_simple_env(_shared_simple_env: ZenithEnv) -> Iterator[ZenithEnv]: @pytest.fixture(scope='function') -def zenith_env_builder(test_output_dir, - port_distributor: PortDistributor, - mock_s3_server: MockS3Server, - default_broker: Etcd) -> Iterator[ZenithEnvBuilder]: +def neon_env_builder(test_output_dir, + port_distributor: PortDistributor, + mock_s3_server: MockS3Server, + default_broker: Etcd) -> Iterator[NeonEnvBuilder]: """ - Fixture to create a Zenith environment for test. + Fixture to create a Neon environment for test. - To use, define 'zenith_env_builder' fixture in your test to get access to the + To use, define 'neon_env_builder' fixture in your test to get access to the builder object. Set properties on it to describe the environment. Finally, initialize and start up the environment by calling - zenith_env_builder.init_start(). 
+ neon_env_builder.init_start(). After the initialization, you can launch compute nodes by calling the functions in the 'env.postgres' factory object, stop/start the @@ -758,16 +758,16 @@ def zenith_env_builder(test_output_dir, repo_dir = os.path.join(test_output_dir, "repo") # Return the builder to the caller - with ZenithEnvBuilder(Path(repo_dir), port_distributor, default_broker, - mock_s3_server) as builder: + with NeonEnvBuilder(Path(repo_dir), port_distributor, default_broker, + mock_s3_server) as builder: yield builder -class ZenithPageserverApiException(Exception): +class NeonPageserverApiException(Exception): pass -class ZenithPageserverHttpClient(requests.Session): +class NeonPageserverHttpClient(requests.Session): def __init__(self, port: int, auth_token: Optional[str] = None): super().__init__() self.port = port @@ -784,7 +784,7 @@ class ZenithPageserverHttpClient(requests.Session): msg = res.json()['msg'] except: msg = '' - raise ZenithPageserverApiException(msg) from e + raise NeonPageserverApiException(msg) from e def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() @@ -891,12 +891,12 @@ TIMELINE_DATA_EXTRACTOR = re.compile(r"\s(?P[^\s]+)\s\[(?P List[Tuple[str, str]]: """ - Returns a list of (branch_name, timeline_id) tuples out of parsed `zenith timeline list` CLI output. + Returns a list of (branch_name, timeline_id) tuples out of parsed `neon timeline list` CLI output. """ # (L) main [b49f7954224a0ad25cc0013ea107b54b] @@ -1053,7 +1053,7 @@ class ZenithCli: return res def pageserver_enabled_features(self) -> Any: - bin_pageserver = os.path.join(str(zenith_binpath), 'pageserver') + bin_pageserver = os.path.join(str(neon_binpath), 'pageserver') args = [bin_pageserver, '--enabled-features'] log.info('Running command "{}"'.format(' '.join(args))) @@ -1173,22 +1173,22 @@ class ZenithCli: extra_env_vars: Optional[Dict[str, str]] = None, check_return_code=True) -> 'subprocess.CompletedProcess[str]': """ - Run "zenith" with the specified arguments. + Run "neon" with the specified arguments. Arguments must be in list form, e.g. ['pg', 'create'] Return both stdout and stderr, which can be accessed as - >>> result = env.zenith_cli.raw_cli(...) + >>> result = env.neon_cli.raw_cli(...) >>> assert result.stderr == "" >>> log.info(result.stdout) """ assert type(arguments) == list - bin_zenith = os.path.join(str(zenith_binpath), 'neon_local') + bin_neon = os.path.join(str(neon_binpath), 'neon_local') - args = [bin_zenith] + arguments + args = [bin_neon] + arguments log.info('Running command "{}"'.format(' '.join(args))) log.info(f'Running in "{self.env.repo_dir}"') @@ -1231,20 +1231,20 @@ class ZenithCli: return res -class ZenithPageserver(PgProtocol): +class NeonPageserver(PgProtocol): """ An object representing a running pageserver. - Initializes the repository via `zenith init`. + Initializes the repository via `neon init`. """ - def __init__(self, env: ZenithEnv, port: PageserverPort, config_override: Optional[str] = None): + def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional[str] = None): super().__init__(host='localhost', port=port.pg, user='cloud_admin') self.env = env self.running = False self.service_port = port self.config_override = config_override - def start(self, overrides=()) -> 'ZenithPageserver': + def start(self, overrides=()) -> 'NeonPageserver': """ Start the page server. `overrides` allows to add some config to this pageserver start. 
@@ -1252,17 +1252,17 @@ class ZenithPageserver(PgProtocol): """ assert self.running == False - self.env.zenith_cli.pageserver_start(overrides=overrides) + self.env.neon_cli.pageserver_start(overrides=overrides) self.running = True return self - def stop(self, immediate=False) -> 'ZenithPageserver': + def stop(self, immediate=False) -> 'NeonPageserver': """ Stop the page server. Returns self. """ if self.running: - self.env.zenith_cli.pageserver_stop(immediate) + self.env.neon_cli.pageserver_stop(immediate) self.running = False return self @@ -1272,8 +1272,8 @@ class ZenithPageserver(PgProtocol): def __exit__(self, exc_type, exc, tb): self.stop(True) - def http_client(self, auth_token: Optional[str] = None) -> ZenithPageserverHttpClient: - return ZenithPageserverHttpClient( + def http_client(self, auth_token: Optional[str] = None) -> NeonPageserverHttpClient: + return NeonPageserverHttpClient( port=self.service_port.http, auth_token=auth_token, ) @@ -1453,7 +1453,7 @@ def remote_pg(test_output_dir: str) -> Iterator[RemotePostgres]: yield remote_pg -class ZenithProxy(PgProtocol): +class NeonProxy(PgProtocol): def __init__(self, port: int): super().__init__(host="127.0.0.1", user="proxy_user", @@ -1469,7 +1469,7 @@ class ZenithProxy(PgProtocol): assert self._popen is None # Start proxy - bin_proxy = os.path.join(str(zenith_binpath), 'proxy') + bin_proxy = os.path.join(str(neon_binpath), 'proxy') args = [bin_proxy] args.extend(["--http", f"{self.host}:{self.http_port}"]) args.extend(["--proxy", f"{self.host}:{self.port}"]) @@ -1493,20 +1493,20 @@ class ZenithProxy(PgProtocol): @pytest.fixture(scope='function') -def static_proxy(vanilla_pg) -> Iterator[ZenithProxy]: - """Zenith proxy that routes directly to vanilla postgres.""" +def static_proxy(vanilla_pg) -> Iterator[NeonProxy]: + """Neon proxy that routes directly to vanilla postgres.""" vanilla_pg.start() vanilla_pg.safe_psql("create user proxy_auth with password 'pytest1' superuser") vanilla_pg.safe_psql("create user proxy_user with password 'pytest2'") - with ZenithProxy(4432) as proxy: + with NeonProxy(4432) as proxy: proxy.start_static() yield proxy class Postgres(PgProtocol): """ An object representing a running postgres daemon. 
""" - def __init__(self, env: ZenithEnv, tenant_id: uuid.UUID, port: int): + def __init__(self, env: NeonEnv, tenant_id: uuid.UUID, port: int): super().__init__(host='localhost', port=port, user='cloud_admin', dbname='postgres') self.env = env self.running = False @@ -1532,11 +1532,11 @@ class Postgres(PgProtocol): config_lines = [] self.node_name = node_name or f'{branch_name}_pg_node' - self.env.zenith_cli.pg_create(branch_name, - node_name=self.node_name, - tenant_id=self.tenant_id, - lsn=lsn, - port=self.port) + self.env.neon_cli.pg_create(branch_name, + node_name=self.node_name, + tenant_id=self.tenant_id, + lsn=lsn, + port=self.port) path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name self.pgdata_dir = os.path.join(self.env.repo_dir, path) @@ -1560,9 +1560,9 @@ class Postgres(PgProtocol): log.info(f"Starting postgres node {self.node_name}") - run_result = self.env.zenith_cli.pg_start(self.node_name, - tenant_id=self.tenant_id, - port=self.port) + run_result = self.env.neon_cli.pg_start(self.node_name, + tenant_id=self.tenant_id, + port=self.port) self.running = True log.info(f"stdout: {run_result.stdout}") @@ -1630,7 +1630,7 @@ class Postgres(PgProtocol): if self.running: assert self.node_name is not None - self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id) + self.env.neon_cli.pg_stop(self.node_name, self.tenant_id) self.running = False return self @@ -1642,7 +1642,7 @@ class Postgres(PgProtocol): """ assert self.node_name is not None - self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, True) + self.env.neon_cli.pg_stop(self.node_name, self.tenant_id, True) self.node_name = None self.running = False @@ -1679,7 +1679,7 @@ class Postgres(PgProtocol): class PostgresFactory: """ An object representing multiple running postgres daemons. """ - def __init__(self, env: ZenithEnv): + def __init__(self, env: NeonEnv): self.env = env self.num_instances = 0 self.instances: List[Postgres] = [] @@ -1750,7 +1750,7 @@ class SafekeeperPort: @dataclass class Safekeeper: """ An object representing a running safekeeper daemon. """ - env: ZenithEnv + env: NeonEnv port: SafekeeperPort id: int auth_token: Optional[str] = None @@ -1758,7 +1758,7 @@ class Safekeeper: def start(self) -> 'Safekeeper': assert self.running == False - self.env.zenith_cli.safekeeper_start(self.id) + self.env.neon_cli.safekeeper_start(self.id) self.running = True # wait for wal acceptor start by checking its status started_at = time.time() @@ -1778,7 +1778,7 @@ class Safekeeper: def stop(self, immediate=False) -> 'Safekeeper': log.info('Stopping safekeeper {}'.format(self.id)) - self.env.zenith_cli.safekeeper_stop(self.id, immediate) + self.env.neon_cli.safekeeper_stop(self.id, immediate) self.running = False return self @@ -1966,7 +1966,7 @@ def get_test_output_dir(request: Any) -> str: # This is autouse, so the test output directory always gets created, even # if a test doesn't put anything there. It also solves a problem with the -# zenith_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it +# neon_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it # creates the repo in the test output directory. 
But it cannot depend on # 'test_output_dir' fixture, because when TEST_SHARED_FIXTURES is not set, # it has 'session' scope and cannot access fixtures with 'function' @@ -2044,7 +2044,7 @@ def list_files_to_compare(pgdata_dir: str): # pg is the existing and running compute node, that we want to compare with a basebackup -def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Postgres): +def check_restored_datadir_content(test_output_dir: str, env: NeonEnv, pg: Postgres): # Get the timeline ID. We need it for the 'basebackup' command with closing(pg.connect()) as conn: @@ -2134,7 +2134,7 @@ def wait_until(number_of_iterations: int, interval: int, func): raise Exception("timed out while waiting for %s" % func) from last_exception -def assert_local(pageserver_http_client: ZenithPageserverHttpClient, +def assert_local(pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID): timeline_detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -2142,7 +2142,7 @@ def assert_local(pageserver_http_client: ZenithPageserverHttpClient, return timeline_detail -def remote_consistent_lsn(pageserver_http_client: ZenithPageserverHttpClient, +def remote_consistent_lsn(pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID) -> int: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -2158,7 +2158,7 @@ def remote_consistent_lsn(pageserver_http_client: ZenithPageserverHttpClient, return lsn_from_hex(lsn_str) -def wait_for_upload(pageserver_http_client: ZenithPageserverHttpClient, +def wait_for_upload(pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID, lsn: int): @@ -2174,7 +2174,7 @@ def wait_for_upload(pageserver_http_client: ZenithPageserverHttpClient, lsn_to_hex(lsn), lsn_to_hex(current_lsn))) -def last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, +def last_record_lsn(pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID) -> int: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -2184,7 +2184,7 @@ def last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, return lsn_from_hex(lsn_str) -def wait_for_last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, +def wait_for_last_record_lsn(pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID, lsn: int): diff --git a/test_runner/zenith_regress/.gitignore b/test_runner/neon_regress/.gitignore similarity index 100% rename from test_runner/zenith_regress/.gitignore rename to test_runner/neon_regress/.gitignore diff --git a/test_runner/zenith_regress/README.md b/test_runner/neon_regress/README.md similarity index 56% rename from test_runner/zenith_regress/README.md rename to test_runner/neon_regress/README.md index 61e3aad04e..b23a55462e 100644 --- a/test_runner/zenith_regress/README.md +++ b/test_runner/neon_regress/README.md @@ -1,7 +1,7 @@ To add a new SQL test -- add sql script to run to zenith_regress/sql/testname.sql -- add expected output to zenith_regress/expected/testname.out +- add sql script to run to neon_regress/sql/testname.sql +- add expected output to neon_regress/expected/testname.out - add testname to parallel_schedule That's it. 
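For illustration, a new test following the layout described in this README could look like the sketch below; the test name example_select and its contents are hypothetical, while the file locations and the psql-style expected output follow the usual pg_regress conventions this suite relies on.

    test_runner/neon_regress/sql/example_select.sql:
        -- example_select is a hypothetical test, used here only as an illustration
        SELECT 1 AS one;

    test_runner/neon_regress/expected/example_select.out:
        -- example_select is a hypothetical test, used here only as an illustration
        SELECT 1 AS one;
         one
        -----
           1
        (1 row)

To wire it up, a "test: example_select" line would be appended to test_runner/neon_regress/parallel_schedule, in the same style as the existing neon-cid and neon-clog entries.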
diff --git a/test_runner/zenith_regress/expected/.gitignore b/test_runner/neon_regress/expected/.gitignore similarity index 100% rename from test_runner/zenith_regress/expected/.gitignore rename to test_runner/neon_regress/expected/.gitignore diff --git a/test_runner/zenith_regress/expected/zenith-cid.out b/test_runner/neon_regress/expected/neon-cid.out similarity index 100% rename from test_runner/zenith_regress/expected/zenith-cid.out rename to test_runner/neon_regress/expected/neon-cid.out diff --git a/test_runner/zenith_regress/expected/zenith-clog.out b/test_runner/neon_regress/expected/neon-clog.out similarity index 100% rename from test_runner/zenith_regress/expected/zenith-clog.out rename to test_runner/neon_regress/expected/neon-clog.out diff --git a/test_runner/zenith_regress/expected/zenith-rel-truncate.out b/test_runner/neon_regress/expected/neon-rel-truncate.out similarity index 100% rename from test_runner/zenith_regress/expected/zenith-rel-truncate.out rename to test_runner/neon_regress/expected/neon-rel-truncate.out diff --git a/test_runner/zenith_regress/expected/zenith-vacuum-full.out b/test_runner/neon_regress/expected/neon-vacuum-full.out similarity index 100% rename from test_runner/zenith_regress/expected/zenith-vacuum-full.out rename to test_runner/neon_regress/expected/neon-vacuum-full.out diff --git a/test_runner/zenith_regress/parallel_schedule b/test_runner/neon_regress/parallel_schedule similarity index 71% rename from test_runner/zenith_regress/parallel_schedule rename to test_runner/neon_regress/parallel_schedule index f64bf8a034..569c7b5066 100644 --- a/test_runner/zenith_regress/parallel_schedule +++ b/test_runner/neon_regress/parallel_schedule @@ -4,7 +4,7 @@ # number of connections needed to run the tests. # ---------- -test: zenith-cid -test: zenith-rel-truncate -test: zenith-clog -test: zenith-vacuum-full +test: neon-cid +test: neon-rel-truncate +test: neon-clog +test: neon-vacuum-full diff --git a/test_runner/zenith_regress/sql/.gitignore b/test_runner/neon_regress/sql/.gitignore similarity index 100% rename from test_runner/zenith_regress/sql/.gitignore rename to test_runner/neon_regress/sql/.gitignore diff --git a/test_runner/zenith_regress/sql/zenith-cid.sql b/test_runner/neon_regress/sql/neon-cid.sql similarity index 100% rename from test_runner/zenith_regress/sql/zenith-cid.sql rename to test_runner/neon_regress/sql/neon-cid.sql diff --git a/test_runner/zenith_regress/sql/zenith-clog.sql b/test_runner/neon_regress/sql/neon-clog.sql similarity index 100% rename from test_runner/zenith_regress/sql/zenith-clog.sql rename to test_runner/neon_regress/sql/neon-clog.sql diff --git a/test_runner/zenith_regress/sql/zenith-rel-truncate.sql b/test_runner/neon_regress/sql/neon-rel-truncate.sql similarity index 100% rename from test_runner/zenith_regress/sql/zenith-rel-truncate.sql rename to test_runner/neon_regress/sql/neon-rel-truncate.sql diff --git a/test_runner/zenith_regress/sql/zenith-vacuum-full.sql b/test_runner/neon_regress/sql/neon-vacuum-full.sql similarity index 100% rename from test_runner/zenith_regress/sql/zenith-vacuum-full.sql rename to test_runner/neon_regress/sql/neon-vacuum-full.sql diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 3b57ac73cc..6a5bad8757 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,8 +1,8 @@ from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures 
import NeonEnv from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare # @@ -15,8 +15,8 @@ from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare # 3. Disk space used # 4. Peak memory usage # -def test_bulk_insert(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_bulk_insert(neon_with_baseline: PgCompare): + env = neon_with_baseline with closing(env.pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index a8a1ff7687..fe3c3afe37 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -2,7 +2,7 @@ import timeit from fixtures.benchmark_fixture import MetricReport import pytest -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder # Run bulk tenant creation test. # @@ -14,21 +14,21 @@ from fixtures.zenith_fixtures import ZenithEnvBuilder @pytest.mark.parametrize('tenants_count', [1, 5, 10]) def test_bulk_tenant_create( - zenith_env_builder: ZenithEnvBuilder, + neon_env_builder: NeonEnvBuilder, tenants_count: int, zenbenchmark, ): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() time_slices = [] for i in range(tenants_count): start = timeit.default_timer() - tenant, _ = env.zenith_cli.create_tenant() - env.zenith_cli.create_timeline(f'test_bulk_tenant_create_{tenants_count}_{i}', - tenant_id=tenant) + tenant, _ = env.neon_cli.create_tenant() + env.neon_cli.create_timeline(f'test_bulk_tenant_create_{tenants_count}_{i}', + tenant_id=tenant) # FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now? #if use_safekeepers == 'with_sa': diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index e04a0361cb..ad088684d5 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -1,8 +1,8 @@ from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare from io import BufferedReader, RawIOBase from itertools import repeat @@ -41,8 +41,8 @@ def copy_test_data(rows: int): # # COPY performance tests. # -def test_copy(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_copy(neon_with_baseline: PgCompare): + env = neon_with_baseline # Get the timeline ID of our branch. 
We need it for the pageserver 'checkpoint' command with closing(env.pg.connect()) as conn: diff --git a/test_runner/performance/test_gist_build.py b/test_runner/performance/test_gist_build.py index 92396f6cb7..839eb3f57d 100644 --- a/test_runner/performance/test_gist_build.py +++ b/test_runner/performance/test_gist_build.py @@ -1,8 +1,8 @@ import os from contextlib import closing from fixtures.benchmark_fixture import MetricReport -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from fixtures.neon_fixtures import NeonEnv +from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare from fixtures.log_helper import log @@ -11,8 +11,8 @@ from fixtures.log_helper import log # As of this writing, we're duplicate those giant WAL records for each page, # which makes the delta layer about 32x larger than it needs to be. # -def test_gist_buffering_build(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_gist_buffering_build(neon_with_baseline: PgCompare): + env = neon_with_baseline with closing(env.pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py index 2042b0d548..a9124b55cf 100644 --- a/test_runner/performance/test_hot_page.py +++ b/test_runner/performance/test_hot_page.py @@ -8,7 +8,7 @@ from pytest_lazyfixture import lazy_fixture # type: ignore "env", [ # The test is too slow to run in CI, but fast enough to run with remote tests - pytest.param(lazy_fixture("zenith_compare"), id="zenith", marks=pytest.mark.slow), + pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), ]) diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py index 11e047b8c3..229c56122f 100644 --- a/test_runner/performance/test_hot_table.py +++ b/test_runner/performance/test_hot_table.py @@ -8,7 +8,7 @@ from pytest_lazyfixture import lazy_fixture # type: ignore "env", [ # The test is too slow to run in CI, but fast enough to run with remote tests - pytest.param(lazy_fixture("zenith_compare"), id="zenith", marks=pytest.mark.slow), + pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), ]) diff --git a/test_runner/performance/test_parallel_copy_to.py b/test_runner/performance/test_parallel_copy_to.py index e4388ce8e2..d4e74ce195 100644 --- a/test_runner/performance/test_parallel_copy_to.py +++ b/test_runner/performance/test_parallel_copy_to.py @@ -1,10 +1,10 @@ from io import BytesIO import asyncio import asyncpg -from fixtures.zenith_fixtures import ZenithEnv, Postgres, PgProtocol +from fixtures.neon_fixtures import NeonEnv, Postgres, PgProtocol from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare async def repeat_bytes(buf, repetitions: int): @@ -36,9 +36,9 @@ async def 
parallel_load_different_tables(pg: PgProtocol, n_parallel: int): # Load 5 different tables in parallel with COPY TO -def test_parallel_copy_different_tables(zenith_with_baseline: PgCompare, n_parallel=5): +def test_parallel_copy_different_tables(neon_with_baseline: PgCompare, n_parallel=5): - env = zenith_with_baseline + env = neon_with_baseline conn = env.pg.connect() cur = conn.cursor() @@ -65,8 +65,8 @@ async def parallel_load_same_table(pg: PgProtocol, n_parallel: int): # Load data into one table with COPY TO from 5 parallel connections -def test_parallel_copy_same_table(zenith_with_baseline: PgCompare, n_parallel=5): - env = zenith_with_baseline +def test_parallel_copy_same_table(neon_with_baseline: PgCompare, n_parallel=5): + env = neon_with_baseline conn = env.pg.connect() cur = conn.cursor() diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index fc10ca4d6c..97aeae2b8e 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -1,8 +1,8 @@ from contextlib import closing -from fixtures.zenith_fixtures import PgBin, VanillaPostgres, ZenithEnv, profiling_supported -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from fixtures.neon_fixtures import PgBin, VanillaPostgres, NeonEnv, profiling_supported +from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare -from fixtures.benchmark_fixture import PgBenchRunResult, MetricReport, ZenithBenchmarker +from fixtures.benchmark_fixture import PgBenchRunResult, MetricReport, NeonBenchmarker from fixtures.log_helper import log from pathlib import Path @@ -99,11 +99,11 @@ def get_scales_matrix(): return list(map(int, scales.split(","))) -# Run the pgbench tests against vanilla Postgres and zenith +# Run the pgbench tests against vanilla Postgres and neon @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix()) -def test_pgbench(zenith_with_baseline: PgCompare, scale: int, duration: int): - run_test_pgbench(zenith_with_baseline, scale, duration) +def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int): + run_test_pgbench(neon_with_baseline, scale, duration) # Run the pgbench tests, and generate a flamegraph from it @@ -114,18 +114,18 @@ def test_pgbench(zenith_with_baseline: PgCompare, scale: int, duration: int): # can see how much overhead the profiling adds. 
@pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix()) -def test_pgbench_flamegraph(zenbenchmark, pg_bin, zenith_env_builder, scale: int, duration: int): - zenith_env_builder.num_safekeepers = 1 - zenith_env_builder.pageserver_config_override = ''' +def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, duration: int): + neon_env_builder.num_safekeepers = 1 + neon_env_builder.pageserver_config_override = ''' profiling="page_requests" ''' if not profiling_supported(): pytest.skip("pageserver was built without 'profiling' feature") - env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("empty", "main") + env = neon_env_builder.init_start() + env.neon_cli.create_branch("empty", "main") - run_test_pgbench(ZenithCompare(zenbenchmark, env, pg_bin, "pgbench"), scale, duration) + run_test_pgbench(NeonCompare(zenbenchmark, env, pg_bin, "pgbench"), scale, duration) # Run the pgbench tests against an existing Postgres cluster diff --git a/test_runner/performance/test_random_writes.py b/test_runner/performance/test_random_writes.py index 205388bd90..4350386dd0 100644 --- a/test_runner/performance/test_random_writes.py +++ b/test_runner/performance/test_random_writes.py @@ -1,8 +1,8 @@ import os from contextlib import closing from fixtures.benchmark_fixture import MetricReport -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from fixtures.neon_fixtures import NeonEnv +from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare from fixtures.log_helper import log import psycopg2.extras @@ -16,14 +16,14 @@ import time # A naive pageserver implementation would create a full image layer for each # dirty segment, leading to write_amplification = segment_size / page_size, # when compared to vanilla postgres. With segment_size = 10MB, that's 1250. -def test_random_writes(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_random_writes(neon_with_baseline: PgCompare): + env = neon_with_baseline # Number of rows in the test database. 1M rows runs quickly, but implies # a small effective_checkpoint_distance, which makes the test less realistic. # Using a 300 TB database would imply a 250 MB effective_checkpoint_distance, # but it will take a very long time to run. From what I've seen so far, - # increasing n_rows doesn't have impact on the (zenith_runtime / vanilla_runtime) + # increasing n_rows doesn't have impact on the (neon_runtime / vanilla_runtime) # performance ratio. n_rows = 1 * 1000 * 1000 # around 36 MB table @@ -65,7 +65,7 @@ def test_random_writes(zenith_with_baseline: PgCompare): env.zenbenchmark.record("table_size", table_size, 'bytes', MetricReport.TEST_PARAM) # Decide how much to write, based on knowledge of pageserver implementation. - # Avoiding segment collisions maximizes (zenith_runtime / vanilla_runtime). + # Avoiding segment collisions maximizes (neon_runtime / vanilla_runtime). 
segment_size = 10 * 1024 * 1024 n_segments = table_size // segment_size n_writes = load_factor * n_segments // 3 diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py index 85d0a24510..8ed31cb480 100644 --- a/test_runner/performance/test_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -2,9 +2,9 @@ # from contextlib import closing from dataclasses import dataclass -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.compare_fixtures import PgCompare import pytest @@ -20,8 +20,8 @@ import pytest pytest.param(10000000, 1, 0), pytest.param(10000000, 1, 4) ]) -def test_seqscans(zenith_with_baseline: PgCompare, rows: int, iters: int, workers: int): - env = zenith_with_baseline +def test_seqscans(neon_with_baseline: PgCompare, rows: int, iters: int, workers: int): + env = neon_with_baseline with closing(env.pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index 53b6a3a4fc..1cfd128e9b 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -1,17 +1,17 @@ import pytest from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnvBuilder -from fixtures.benchmark_fixture import ZenithBenchmarker +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.benchmark_fixture import NeonBenchmarker # This test sometimes runs for longer than the global 5 minute timeout. @pytest.mark.timeout(600) -def test_startup(zenith_env_builder: ZenithEnvBuilder, zenbenchmark: ZenithBenchmarker): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() +def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() # Start - env.zenith_cli.create_branch('test_startup') + env.neon_cli.create_branch('test_startup') with zenbenchmark.record_duration("startup_time"): pg = env.postgres.create_start('test_startup') pg.safe_psql("select 1;") diff --git a/test_runner/performance/test_write_amplification.py b/test_runner/performance/test_write_amplification.py index 49232bf6d3..1d729fd78f 100644 --- a/test_runner/performance/test_write_amplification.py +++ b/test_runner/performance/test_write_amplification.py @@ -13,13 +13,13 @@ import os from contextlib import closing from fixtures.benchmark_fixture import MetricReport -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from fixtures.neon_fixtures import NeonEnv +from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare from fixtures.log_helper import log -def test_write_amplification(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_write_amplification(neon_with_baseline: PgCompare): + env = neon_with_baseline with closing(env.pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py index 56c735e87c..3960546689 100644 --- a/test_runner/test_broken.py +++ b/test_runner/test_broken.py @@ -1,7 +1,7 @@ import pytest import os -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from 
fixtures.log_helper import log """ Use this test to see what happens when tests fail. @@ -18,10 +18,10 @@ run_broken = pytest.mark.skipif(os.environ.get('RUN_BROKEN') is None, @run_broken -def test_broken(zenith_simple_env: ZenithEnv, pg_bin): - env = zenith_simple_env +def test_broken(neon_simple_env: NeonEnv, pg_bin): + env = neon_simple_env - env.zenith_cli.create_branch("test_broken", "empty") + env.neon_cli.create_branch("test_broken", "empty") env.postgres.create_start("test_broken") log.info('postgres is running') From b0c4ec05942ccb66ef487a6bac14e963d2b8c35d Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 2 Jun 2022 12:31:04 +0300 Subject: [PATCH 0393/1022] Log storage sync and etcd events a bit better --- libs/etcd_broker/src/lib.rs | 4 ++-- pageserver/src/storage_sync.rs | 4 ++-- pageserver/src/walreceiver.rs | 14 ++++++++++---- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 0bfce66a5d..9184412eb1 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -258,7 +258,7 @@ pub async fn subscribe_to_safekeeper_timeline_updates( } } } - Ok(None) => {} + Ok(None) => warn!("Ignoring etcd KV with unexpected key {:?} that does not match required regex {}", new_etcd_kv.key_str(), regex), Err(e) => error!("Failed to parse timeline update: {e}"), }; } @@ -272,7 +272,7 @@ pub async fn subscribe_to_safekeeper_timeline_updates( } Ok(()) - }); + }.instrument(info_span!("etcd_broker"))); Ok(SkTimelineSubscription { kind: subscription, diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 1c33d8315c..a140149c23 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -892,7 +892,7 @@ fn storage_sync_loop( REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); if remaining_queue_length > 0 || !batched_tasks.is_empty() { - info!("Processing tasks for {} timelines in batch, more tasks left to process: {remaining_queue_length}", batched_tasks.len()); + debug!("Processing tasks for {} timelines in batch, more tasks left to process: {remaining_queue_length}", batched_tasks.len()); } else { debug!("No tasks to process"); continue; @@ -1186,7 +1186,7 @@ async fn update_local_metadata( let remote_metadata = match remote_timeline { Some(timeline) => &timeline.metadata, None => { - info!("No remote timeline to update local metadata from, skipping the update"); + debug!("No remote timeline to update local metadata from, skipping the update"); return Ok(()); } }; diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index e54406a450..527fb137cd 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -468,11 +468,17 @@ async fn timeline_wal_broker_loop_step( // finally, if no other tasks are completed, get another broker update and possibly reconnect updates = broker_subscription.fetch_data() => match updates { Some(mut all_timeline_updates) => { - if let Some(subscribed_timeline_updates) = all_timeline_updates.remove(&id) { - if let Some(candidate) = wal_connection_manager.select_connection_candidate(subscribed_timeline_updates) { - info!("Switching to different safekeeper {} for timeline {id}, reason: {:?}", candidate.safekeeper_id, candidate.reason); - wal_connection_manager.change_connection(candidate.safekeeper_id, candidate.wal_producer_connstr).await; + match all_timeline_updates.remove(&id) { + Some(subscribed_timeline_updates) => { + match 
wal_connection_manager.select_connection_candidate(subscribed_timeline_updates) { + Some(candidate) => { + info!("Switching to different safekeeper {} for timeline {id}, reason: {:?}", candidate.safekeeper_id, candidate.reason); + wal_connection_manager.change_connection(candidate.safekeeper_id, candidate.wal_producer_connstr).await; + }, + None => debug!("No connection candidate was selected for timeline"), + } } + None => warn!("Timeline has an active broker subscription, but got no updates. Other data length: {}", all_timeline_updates.len()), } }, None => { From a91e0c299d3604086be9186d31209b08b733766e Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 2 Jun 2022 13:04:59 +0300 Subject: [PATCH 0394/1022] Reproduce etcd parsing bug in Python tests --- .../batch_others/test_remote_storage.py | 5 ++- test_runner/fixtures/neon_fixtures.py | 32 +++++++++++-------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index bf9717a74a..8a2748b880 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -31,7 +31,10 @@ import pytest # The tests are done for all types of remote storage pageserver supports. @pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3']) def test_remote_storage_backup_and_restore(neon_env_builder: NeonEnvBuilder, storage_type: str): - # neon_env_builder.rust_log_override = 'debug' + # Use this test to check more realistic SK ids: some etcd key parsing bugs were related, + # and this test needs SK to write data to pageserver, so it will be visible + neon_env_builder.safekeepers_id_start = 12 + if storage_type == 'local_fs': neon_env_builder.enable_local_fs_remote_storage() elif storage_type == 'mock_s3': diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2e58ad6ea5..63ee6ec57d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -488,18 +488,21 @@ class NeonEnvBuilder: created in the right directory, based on the test name, and it's properly cleaned up after the test has finished. 
""" - def __init__(self, - repo_dir: Path, - port_distributor: PortDistributor, - broker: Etcd, - mock_s3_server: MockS3Server, - remote_storage: Optional[RemoteStorage] = None, - remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, - pageserver_config_override: Optional[str] = None, - num_safekeepers: int = 1, - pageserver_auth_enabled: bool = False, - rust_log_override: Optional[str] = None, - default_branch_name=DEFAULT_BRANCH_NAME): + def __init__( + self, + repo_dir: Path, + port_distributor: PortDistributor, + broker: Etcd, + mock_s3_server: MockS3Server, + remote_storage: Optional[RemoteStorage] = None, + remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, + pageserver_config_override: Optional[str] = None, + num_safekeepers: int = 1, + # Use non-standard SK ids to check for various parsing bugs + safekeepers_id_start: int = 0, + pageserver_auth_enabled: bool = False, + rust_log_override: Optional[str] = None, + default_branch_name=DEFAULT_BRANCH_NAME): self.repo_dir = repo_dir self.rust_log_override = rust_log_override self.port_distributor = port_distributor @@ -509,6 +512,7 @@ class NeonEnvBuilder: self.mock_s3_server = mock_s3_server self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers + self.safekeepers_id_start = safekeepers_id_start self.pageserver_auth_enabled = pageserver_auth_enabled self.default_branch_name = default_branch_name self.env: Optional[NeonEnv] = None @@ -656,7 +660,7 @@ class NeonEnv: pg=self.port_distributor.get_port(), http=self.port_distributor.get_port(), ) - id = i # assign ids sequentially + id = config.safekeepers_id_start + i # assign ids sequentially toml += textwrap.dedent(f""" [[safekeepers]] id = {id} @@ -1093,7 +1097,7 @@ class NeonCli: immediate=False) -> 'subprocess.CompletedProcess[str]': args = ['safekeeper', 'stop'] if id is not None: - args.extend(str(id)) + args.append(str(id)) if immediate: args.extend(['-m', 'immediate']) return self.raw_cli(args) From 7933804284f42204819c65543e181d99444c4df4 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 2 Jun 2022 13:28:34 +0300 Subject: [PATCH 0395/1022] Fix and test regex parsing --- libs/etcd_broker/src/lib.rs | 129 +++++++++++++++++++++++++++--------- 1 file changed, 99 insertions(+), 30 deletions(-) diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 9184412eb1..c7777c207f 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -139,12 +139,12 @@ impl SkTimelineSubscriptionKind { fn watch_regex(&self) -> Regex { match self.kind { SubscriptionKind::All => Regex::new(&format!( - r"^{}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$", + r"^{}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]]+)$", self.broker_etcd_prefix )) .expect("wrong regex for 'everything' subscription"), SubscriptionKind::Tenant(tenant_id) => Regex::new(&format!( - r"^{}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]])$", + r"^{}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]]+)$", self.broker_etcd_prefix )) .expect("wrong regex for 'tenant' subscription"), @@ -152,7 +152,7 @@ impl SkTimelineSubscriptionKind { tenant_id, timeline_id, }) => Regex::new(&format!( - r"^{}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]])$", + r"^{}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]]+)$", self.broker_etcd_prefix )) .expect("wrong regex for 'timeline' subscription"), @@ -237,9 +237,16 @@ pub async fn subscribe_to_safekeeper_timeline_updates( if EventType::Put == 
event.event_type() { if let Some(new_etcd_kv) = event.kv() { let new_kv_version = new_etcd_kv.version(); + let (key_str, value_str) = match extract_key_value_str(new_etcd_kv) { + Ok(strs) => strs, + Err(e) => { + error!("Failed to represent etcd KV {new_etcd_kv:?} as pair of str: {e}"); + continue; + }, + }; - match parse_etcd_key_value(subscription_kind, ®ex, new_etcd_kv) { - Ok(Some((zttid, timeline))) => { + match parse_etcd_key_value(subscription_kind, ®ex, key_str, value_str) { + Ok((zttid, timeline)) => { match timeline_updates .entry(zttid) .or_default() @@ -250,6 +257,8 @@ pub async fn subscribe_to_safekeeper_timeline_updates( if old_etcd_kv_version < new_kv_version { o.insert(timeline.info); timeline_etcd_versions.insert(zttid,new_kv_version); + } else { + debug!("Skipping etcd timeline update due to older version compared to one that's already stored"); } } hash_map::Entry::Vacant(v) => { @@ -258,7 +267,6 @@ pub async fn subscribe_to_safekeeper_timeline_updates( } } } - Ok(None) => warn!("Ignoring etcd KV with unexpected key {:?} that does not match required regex {}", new_etcd_kv.key_str(), regex), Err(e) => error!("Failed to parse timeline update: {e}"), }; } @@ -282,54 +290,64 @@ pub async fn subscribe_to_safekeeper_timeline_updates( }) } +fn extract_key_value_str(kv: &KeyValue) -> Result<(&str, &str), BrokerError> { + let key = kv.key_str().map_err(|e| { + BrokerError::EtcdClient(e, "Failed to extract key str out of etcd KV".to_string()) + })?; + let value = kv.value_str().map_err(|e| { + BrokerError::EtcdClient(e, "Failed to extract value str out of etcd KV".to_string()) + })?; + Ok((key, value)) +} + fn parse_etcd_key_value( subscription_kind: SubscriptionKind, regex: &Regex, - kv: &KeyValue, -) -> Result, BrokerError> { - let caps = if let Some(caps) = regex.captures(kv.key_str().map_err(|e| { - BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as key str")) - })?) 
{ - caps - } else { - return Ok(None); + key_str: &str, + value_str: &str, +) -> Result<(ZTenantTimelineId, SafekeeperTimeline), BrokerError> { + let key_captures = match regex.captures(key_str) { + Some(captures) => captures, + None => { + return Err(BrokerError::ParsingError(format!( + "KV has unexpected key '{key_str}' that does not match required regex {regex}" + ))); + } }; + let info = serde_json::from_str(value_str).map_err(|e| { + BrokerError::ParsingError(format!( + "Failed to parse '{value_str}' as safekeeper timeline info: {e}" + )) + })?; let (zttid, safekeeper_id) = match subscription_kind { SubscriptionKind::All => ( ZTenantTimelineId::new( - parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?, - parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?, + parse_capture(&key_captures, 1).map_err(BrokerError::ParsingError)?, + parse_capture(&key_captures, 2).map_err(BrokerError::ParsingError)?, ), - NodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?), + NodeId(parse_capture(&key_captures, 3).map_err(BrokerError::ParsingError)?), ), SubscriptionKind::Tenant(tenant_id) => ( ZTenantTimelineId::new( tenant_id, - parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?, + parse_capture(&key_captures, 1).map_err(BrokerError::ParsingError)?, ), - NodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?), + NodeId(parse_capture(&key_captures, 2).map_err(BrokerError::ParsingError)?), ), SubscriptionKind::Timeline(zttid) => ( zttid, - NodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?), + NodeId(parse_capture(&key_captures, 1).map_err(BrokerError::ParsingError)?), ), }; - let info_str = kv.value_str().map_err(|e| { - BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as value str")) - })?; - Ok(Some(( + Ok(( zttid, SafekeeperTimeline { safekeeper_id, - info: serde_json::from_str(info_str).map_err(|e| { - BrokerError::ParsingError(format!( - "Failed to parse '{info_str}' as safekeeper timeline info: {e}" - )) - })?, + info, }, - ))) + )) } fn parse_capture(caps: &Captures, index: usize) -> Result @@ -348,3 +366,54 @@ where ) }) } + +#[cfg(test)] +mod tests { + use utils::zid::ZTimelineId; + + use super::*; + + #[test] + fn typical_etcd_prefix_should_be_parsed() { + let prefix = "neon"; + let tenant_id = ZTenantId::generate(); + let timeline_id = ZTimelineId::generate(); + let all_subscription = SkTimelineSubscriptionKind { + broker_etcd_prefix: prefix.to_string(), + kind: SubscriptionKind::All, + }; + let tenant_subscription = SkTimelineSubscriptionKind { + broker_etcd_prefix: prefix.to_string(), + kind: SubscriptionKind::Tenant(tenant_id), + }; + let timeline_subscription = SkTimelineSubscriptionKind { + broker_etcd_prefix: prefix.to_string(), + kind: SubscriptionKind::Timeline(ZTenantTimelineId::new(tenant_id, timeline_id)), + }; + + let typical_etcd_kv_strs = [ + ( + format!("{prefix}/{tenant_id}/{timeline_id}/safekeeper/1"), + r#"{"last_log_term":231,"flush_lsn":"0/241BB70","commit_lsn":"0/241BB70","backup_lsn":"0/2000000","remote_consistent_lsn":"0/0","peer_horizon_lsn":"0/16960E8","safekeeper_connstr":"something.local:1234","pageserver_connstr":"postgresql://(null):@somethine.else.local:3456"}"#, + ), + ( + format!("{prefix}/{tenant_id}/{timeline_id}/safekeeper/13"), + 
r#"{"last_log_term":231,"flush_lsn":"0/241BB70","commit_lsn":"0/241BB70","backup_lsn":"0/2000000","remote_consistent_lsn":"0/0","peer_horizon_lsn":"0/16960E8","safekeeper_connstr":"something.local:1234","pageserver_connstr":"postgresql://(null):@somethine.else.local:3456"}"#, + ), + ]; + + for (key_string, value_str) in typical_etcd_kv_strs { + for subscription in [ + &all_subscription, + &tenant_subscription, + &timeline_subscription, + ] { + let watch_regex = subscription.watch_regex(); + let (id, _timeline) = + parse_etcd_key_value(subscription.kind, &watch_regex, &key_string, value_str) + .unwrap_or_else(|e| panic!("Should be able to parse etcd key string '{key_string}' and etcd value string '{value_str}' for subscription {subscription:?}, but got: {e}")); + assert_eq!(id, ZTenantTimelineId::new(tenant_id, timeline_id)); + } + } + } +} From 1d16ee92d4a41882047902a0cf9eaab7f9332ce6 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 2 Jun 2022 21:21:01 +0300 Subject: [PATCH 0396/1022] Fix the Lsn difference reconnection --- pageserver/src/tenant_config.rs | 2 +- pageserver/src/walreceiver.rs | 188 +++++++++++++++++--------------- 2 files changed, 99 insertions(+), 91 deletions(-) diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index f68a820e95..1722c1a13a 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -37,7 +37,7 @@ pub mod defaults { pub const DEFAULT_PITR_INTERVAL: &str = "30 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; - pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1_000_000; + pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10_000; } /// Per-tenant configuration options diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 527fb137cd..11c8617a57 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -478,7 +478,11 @@ async fn timeline_wal_broker_loop_step( None => debug!("No connection candidate was selected for timeline"), } } - None => warn!("Timeline has an active broker subscription, but got no updates. Other data length: {}", all_timeline_updates.len()), + // XXX: If we subscribe for a certain timeline, we expect only its data to come. + // But somebody could propagate a new etcd key, that has the same prefix as the subscribed one, then we'll get odd data. + // This is an error, we don't want to have overlapping prefixes for timelines, but we can complain and thow those away instead of panicking, + // since the next poll might bring the correct data. + None => error!("Timeline has an active broker subscription, but got no updates. Other data length: {}", all_timeline_updates.len()), } }, None => { @@ -625,18 +629,28 @@ impl WalConnectionManager { /// Checks current state against every fetched safekeeper state of a given timeline. /// Returns a new candidate, if the current state is somewhat lagging, or `None` otherwise. 
/// The current rules for approving new candidates: - /// * pick the safekeeper with biggest `commit_lsn` that's after than pageserver's latest Lsn for the timeline - /// * if the leader is a different SK and either - /// * no WAL updates happened after certain time (either none since the connection time or none since the last event after the connection) — reconnect - /// * same time amount had passed since the connection, WAL updates happened recently, but the new leader SK has timeline Lsn way ahead of the old one — reconnect + /// * pick from the input data from etcd for currently connected safekeeper (if any) + /// * out of the rest input entries, pick one with biggest `commit_lsn` that's after than pageserver's latest Lsn for the timeline + /// * if there's no such entry, no new candidate found, abort + /// * otherwise, check if etcd updates contain currently connected safekeeper + /// * if not, that means no WAL updates happened after certain time (either none since the connection time or none since the last event after the connection) + /// Reconnect if the time exceeds the threshold. + /// * if there's one, compare its Lsn with the other candidate's, reconnect if candidate's over threshold /// /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently. /// Both thresholds are configured per tenant. fn select_connection_candidate( &self, - safekeeper_timelines: HashMap, + mut safekeeper_timelines: HashMap, ) -> Option { - let (&new_sk_id, new_sk_timeline, new_wal_producer_connstr) = safekeeper_timelines + let current_sk_data_updated = + self.wal_connection_data + .as_ref() + .and_then(|connection_data| { + safekeeper_timelines.remove(&connection_data.safekeeper_id) + }); + + let candidate_sk_data = safekeeper_timelines .iter() .filter(|(_, info)| { info.commit_lsn > Some(self.timeline.tline.get_last_record_lsn()) @@ -654,68 +668,78 @@ impl WalConnectionManager { } } }) - .max_by_key(|(_, info, _)| info.commit_lsn)?; + .max_by_key(|(_, info, _)| info.commit_lsn); - match self.wal_connection_data.as_ref() { - None => Some(NewWalConnectionCandidate { - safekeeper_id: new_sk_id, - wal_producer_connstr: new_wal_producer_connstr, - reason: ReconnectReason::NoExistingConnection, - }), - Some(current_connection) => { - if current_connection.safekeeper_id == new_sk_id { - None - } else { - self.reason_to_reconnect(current_connection, new_sk_timeline) - .map(|reason| NewWalConnectionCandidate { - safekeeper_id: new_sk_id, - wal_producer_connstr: new_wal_producer_connstr, - reason, - }) + match (current_sk_data_updated, candidate_sk_data) { + // No better candidate than one we're already connected to: + // whatever data update comes for the connected one, we don't have a better candidate + (_, None) => None, + + // No updates from the old SK in this batch, but some candidate is available: + // check how long time ago did we receive updates from the current SK, switch connections in case it's over the threshold + (None, Some((&new_sk_id, _, new_wal_producer_connstr))) => { + match self.wal_connection_data.as_ref() { + Some(current_connection) => { + let last_sk_interaction_time = + match current_connection.last_wal_receiver_data.as_ref() { + Some((_, data_submission_time)) => *data_submission_time, + None => current_connection.connection_init_time, + }; + + let now = Utc::now().naive_utc(); + match (now - last_sk_interaction_time).to_std() { + Ok(last_interaction) => { + if last_interaction > self.lagging_wal_timeout 
{ + return Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_producer_connstr: new_wal_producer_connstr, + reason: ReconnectReason::NoWalTimeout { + last_wal_interaction: last_sk_interaction_time, + check_time: now, + threshold: self.lagging_wal_timeout, + }, + }); + } + } + Err(_e) => { + warn!("Last interaction with safekeeper {} happened in the future, ignoring the candidate. Interaction time: {last_sk_interaction_time}, now: {now}", current_connection.safekeeper_id); + } + } + None + } + None => Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_producer_connstr: new_wal_producer_connstr, + reason: ReconnectReason::NoExistingConnection, + }), } } - } - } - - fn reason_to_reconnect( - &self, - current_connection: &WalConnectionData, - new_sk_timeline: &SkTimelineInfo, - ) -> Option { - let last_sk_interaction_time = match current_connection.last_wal_receiver_data.as_ref() { - Some((last_wal_receiver_data, data_submission_time)) => { - let new_lsn = new_sk_timeline.commit_lsn?; - match new_lsn.0.checked_sub(last_wal_receiver_data.ps_writelsn) + // Both current SK got updated via etcd and there's another candidate with suitable Lsn: + // check how bigger the new SK Lsn is in the future compared to the current SK, switch connections in case it's over the threshold + ( + Some(current_sk_timeline), + Some((&new_sk_id, new_sk_timeline, new_wal_producer_connstr)), + ) => { + let new_lsn = new_sk_timeline.commit_lsn.unwrap_or(Lsn(0)); + let current_lsn = current_sk_timeline.commit_lsn.unwrap_or(Lsn(0)); + match new_lsn.0.checked_sub(current_lsn.0) { - Some(sk_lsn_advantage) => { - if sk_lsn_advantage >= self.max_lsn_wal_lag.get() { - return Some(ReconnectReason::LaggingWal { current_lsn: Lsn(last_wal_receiver_data.ps_writelsn), new_lsn, threshold: self.max_lsn_wal_lag }); + Some(new_sk_lsn_advantage) => { + if new_sk_lsn_advantage >= self.max_lsn_wal_lag.get() { + return Some( + NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_producer_connstr: new_wal_producer_connstr, + reason: ReconnectReason::LaggingWal { current_lsn, new_lsn, threshold: self.max_lsn_wal_lag }, + }); } } None => debug!("Best SK candidate has its commit Lsn behind the current timeline's latest consistent Lsn"), } - *data_submission_time - } - None => current_connection.connection_init_time, - }; - let now = Utc::now().naive_utc(); - match (now - last_sk_interaction_time).to_std() { - Ok(last_interaction) => { - if last_interaction > self.lagging_wal_timeout { - return Some(ReconnectReason::NoWalTimeout { - last_wal_interaction: last_sk_interaction_time, - check_time: now, - threshold: self.lagging_wal_timeout, - }); - } - } - Err(_e) => { - warn!("Last interaction with safekeeper {} happened in the future, ignoring the candidate. 
Interaction time: {last_sk_interaction_time}, now: {now}", - current_connection.safekeeper_id); + None } } - None } } @@ -1017,7 +1041,7 @@ mod tests { let mut data_manager_with_connection = dummy_wal_connection_manager(&harness); let connected_sk_id = NodeId(0); - let mut dummy_connection_data = dummy_connection_data(id, NodeId(0)).await; + let mut dummy_connection_data = dummy_connection_data(id, connected_sk_id).await; let lagging_wal_timeout = chrono::Duration::from_std(data_manager_with_connection.lagging_wal_timeout)?; let time_over_threshold = @@ -1092,8 +1116,8 @@ mod tests { } #[tokio::test] - async fn timeout_wal_over_threshcurrent_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("timeout_wal_over_threshcurrent_candidate")?; + async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("timeout_wal_over_threshhold_current_candidate")?; let current_lsn = Lsn(100_000).align(); let id = ZTenantTimelineId { @@ -1111,36 +1135,20 @@ mod tests { dummy_connection_data.connection_init_time = time_over_threshold; data_manager_with_connection.wal_connection_data = Some(dummy_connection_data); - let new_lsn = Lsn(current_lsn.0 + data_manager_with_connection.max_lsn_wal_lag.get() + 1); let over_threshcurrent_candidate = data_manager_with_connection - .select_connection_candidate(HashMap::from([ - ( - NodeId(0), - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(new_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), - pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), - }, - ), - ( - NodeId(1), - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(current_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some("not advanced by Lsn safekeeper".to_string()), - pageserver_connstr: Some("not advanced by Lsn safekeeper".to_string()), - }, - ), - ])) + .select_connection_candidate(HashMap::from([( + NodeId(0), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(current_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), + }, + )])) .expect( "Expected one candidate selected out of multiple valid data options, but got none", ); From 5b06599770624ecd3184a9670c645cfbebfcfdfb Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 2 Jun 2022 21:39:13 +0300 Subject: [PATCH 0397/1022] Simplify etcd key regex parsing --- Cargo.lock | 1 + libs/etcd_broker/Cargo.toml | 1 + libs/etcd_broker/src/lib.rs | 83 ++++++++++++------------------------ safekeeper/src/broker.rs | 14 +++--- safekeeper/src/wal_backup.rs | 6 +-- 5 files changed, 38 insertions(+), 67 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6f8382de27..c615766eb8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -811,6 +811,7 @@ name = "etcd_broker" version = "0.1.0" dependencies = [ "etcd-client", + "once_cell", "regex", "serde", "serde_json", diff --git a/libs/etcd_broker/Cargo.toml b/libs/etcd_broker/Cargo.toml index 65bd406131..49be7ad207 100644 --- a/libs/etcd_broker/Cargo.toml +++ b/libs/etcd_broker/Cargo.toml @@ -9,6 +9,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_with = "1.12.0" + once_cell = "1.8.0" utils = { path = 
"../utils" } workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index c7777c207f..daa9c513c2 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -6,6 +6,7 @@ use std::{ str::FromStr, }; +use once_cell::sync::Lazy; use regex::{Captures, Regex}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; @@ -136,29 +137,6 @@ impl SkTimelineSubscriptionKind { } } - fn watch_regex(&self) -> Regex { - match self.kind { - SubscriptionKind::All => Regex::new(&format!( - r"^{}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]]+)$", - self.broker_etcd_prefix - )) - .expect("wrong regex for 'everything' subscription"), - SubscriptionKind::Tenant(tenant_id) => Regex::new(&format!( - r"^{}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]]+)$", - self.broker_etcd_prefix - )) - .expect("wrong regex for 'tenant' subscription"), - SubscriptionKind::Timeline(ZTenantTimelineId { - tenant_id, - timeline_id, - }) => Regex::new(&format!( - r"^{}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]]+)$", - self.broker_etcd_prefix - )) - .expect("wrong regex for 'timeline' subscription"), - } - } - /// Etcd key to use for watching a certain timeline updates from safekeepers. pub fn watch_key(&self) -> String { match self.kind { @@ -196,6 +174,7 @@ pub async fn subscribe_to_safekeeper_timeline_updates( subscription: SkTimelineSubscriptionKind, ) -> Result { info!("Subscribing to timeline updates, subscription kind: {subscription:?}"); + let kind = subscription.clone(); let (watcher, mut stream) = client .watch( @@ -211,12 +190,9 @@ pub async fn subscribe_to_safekeeper_timeline_updates( })?; let (timeline_updates_sender, safekeeper_timeline_updates) = mpsc::unbounded_channel(); - - let subscription_kind = subscription.kind; - let regex = subscription.watch_regex(); let watcher_handle = tokio::spawn(async move { while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!( - "Failed to get messages from the subscription stream, kind: {subscription_kind:?}, error: {e}" + "Failed to get messages from the subscription stream, kind: {:?}, error: {e}", subscription.kind )))? 
{ if resp.canceled() { info!("Watch for timeline updates subscription was canceled, exiting"); @@ -245,7 +221,7 @@ pub async fn subscribe_to_safekeeper_timeline_updates( }, }; - match parse_etcd_key_value(subscription_kind, ®ex, key_str, value_str) { + match parse_etcd_key_value(&subscription, key_str, value_str) { Ok((zttid, timeline)) => { match timeline_updates .entry(zttid) @@ -283,7 +259,7 @@ pub async fn subscribe_to_safekeeper_timeline_updates( }.instrument(info_span!("etcd_broker"))); Ok(SkTimelineSubscription { - kind: subscription, + kind, safekeeper_timeline_updates, watcher_handle, watcher, @@ -300,17 +276,30 @@ fn extract_key_value_str(kv: &KeyValue) -> Result<(&str, &str), BrokerError> { Ok((key, value)) } +static SK_TIMELINE_KEY_REGEX: Lazy = Lazy::new(|| { + Regex::new("/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]]+)$") + .expect("wrong regex for safekeeper timeline etcd key") +}); + fn parse_etcd_key_value( - subscription_kind: SubscriptionKind, - regex: &Regex, + subscription: &SkTimelineSubscriptionKind, key_str: &str, value_str: &str, ) -> Result<(ZTenantTimelineId, SafekeeperTimeline), BrokerError> { - let key_captures = match regex.captures(key_str) { + let broker_prefix = subscription.broker_etcd_prefix.as_str(); + if !key_str.starts_with(broker_prefix) { + return Err(BrokerError::ParsingError(format!( + "KV has unexpected key '{key_str}' that does not start with broker prefix {broker_prefix}" + ))); + } + + let key_part = &key_str[broker_prefix.len()..]; + let key_captures = match SK_TIMELINE_KEY_REGEX.captures(key_part) { Some(captures) => captures, None => { return Err(BrokerError::ParsingError(format!( - "KV has unexpected key '{key_str}' that does not match required regex {regex}" + "KV has unexpected key part '{key_part}' that does not match required regex {}", + SK_TIMELINE_KEY_REGEX.as_str() ))); } }; @@ -320,26 +309,11 @@ fn parse_etcd_key_value( )) })?; - let (zttid, safekeeper_id) = match subscription_kind { - SubscriptionKind::All => ( - ZTenantTimelineId::new( - parse_capture(&key_captures, 1).map_err(BrokerError::ParsingError)?, - parse_capture(&key_captures, 2).map_err(BrokerError::ParsingError)?, - ), - NodeId(parse_capture(&key_captures, 3).map_err(BrokerError::ParsingError)?), - ), - SubscriptionKind::Tenant(tenant_id) => ( - ZTenantTimelineId::new( - tenant_id, - parse_capture(&key_captures, 1).map_err(BrokerError::ParsingError)?, - ), - NodeId(parse_capture(&key_captures, 2).map_err(BrokerError::ParsingError)?), - ), - SubscriptionKind::Timeline(zttid) => ( - zttid, - NodeId(parse_capture(&key_captures, 1).map_err(BrokerError::ParsingError)?), - ), - }; + let zttid = ZTenantTimelineId::new( + parse_capture(&key_captures, 1).map_err(BrokerError::ParsingError)?, + parse_capture(&key_captures, 2).map_err(BrokerError::ParsingError)?, + ); + let safekeeper_id = NodeId(parse_capture(&key_captures, 3).map_err(BrokerError::ParsingError)?); Ok(( zttid, @@ -408,9 +382,8 @@ mod tests { &tenant_subscription, &timeline_subscription, ] { - let watch_regex = subscription.watch_regex(); let (id, _timeline) = - parse_etcd_key_value(subscription.kind, &watch_regex, &key_string, value_str) + parse_etcd_key_value(subscription, &key_string, value_str) .unwrap_or_else(|e| panic!("Should be able to parse etcd key string '{key_string}' and etcd value string '{value_str}' for subscription {subscription:?}, but got: {e}")); assert_eq!(id, ZTenantTimelineId::new(tenant_id, timeline_id)); } diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 
5bcb197205..5be8091a7e 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -144,19 +144,15 @@ async fn lease_keep_alive(mut client: Client, lease_id: i64) -> Result<()> { } pub fn get_campaign_name( - election_name: String, - broker_prefix: String, - timeline_id: &ZTenantTimelineId, + election_name: &str, + broker_prefix: &str, + id: ZTenantTimelineId, ) -> String { - return format!( - "{}/{}", - SkTimelineSubscriptionKind::timeline(broker_prefix, *timeline_id).watch_key(), - election_name - ); + format!("{broker_prefix}/{id}/{election_name}") } pub fn get_candiate_name(system_id: NodeId) -> String { - format!("id_{}", system_id) + format!("id_{system_id}") } /// Push once in a while data about all active timelines to the broker. diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 1723d03ee3..30364ce434 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -99,9 +99,9 @@ async fn wal_backup_launcher_main_loop( // TODO: decide who should offload in launcher itself by simply checking current state let election_name = broker::get_campaign_name( - BACKUP_ELECTION_NAME.to_string(), - conf.broker_etcd_prefix.clone(), - &zttid, + BACKUP_ELECTION_NAME, + &conf.broker_etcd_prefix, + zttid, ); let my_candidate_name = broker::get_candiate_name(conf.my_id); let election = broker::Election::new( From c5007d3916f0e5f8ce6555af1750d34e5935bd63 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 2 Jun 2022 22:58:23 +0300 Subject: [PATCH 0398/1022] Remove unused module --- safekeeper/src/callmemaybe.rs | 305 ---------------------------------- 1 file changed, 305 deletions(-) delete mode 100644 safekeeper/src/callmemaybe.rs diff --git a/safekeeper/src/callmemaybe.rs b/safekeeper/src/callmemaybe.rs deleted file mode 100644 index 53d38c5e25..0000000000 --- a/safekeeper/src/callmemaybe.rs +++ /dev/null @@ -1,305 +0,0 @@ -//! -//! Callmemaybe module is responsible for periodically requesting -//! pageserver to initiate wal streaming. -//! -//! Other threads can use CallmeEvent messages to subscribe or unsubscribe -//! from the call list. -//! 
-use crate::SafeKeeperConf; -use anyhow::{Context, Result}; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::sync::Mutex; -use std::time::{Duration, Instant}; -use tokio::runtime; -use tokio::sync::mpsc::UnboundedReceiver; -use tokio::task; -use tokio_postgres::NoTls; -use tracing::*; -use utils::{ - connstring::connection_host_port, - zid::{ZTenantId, ZTimelineId}, -}; - -async fn request_callback( - pageserver_connstr: String, - listen_pg_addr_str: String, - timelineid: ZTimelineId, - tenantid: ZTenantId, -) -> Result<()> { - info!( - "callmemaybe request_callback Connecting to pageserver {}", - &pageserver_connstr - ); - let (client, connection) = tokio_postgres::connect(&pageserver_connstr, NoTls).await?; - - tokio::spawn(async move { - if let Err(e) = connection.await { - error!("connection error: {}", e); - } - }); - - // use Config parsing because SockAddr parsing doesn't allow to use host names instead of ip addresses - let me_connstr = format!("postgresql://no_user@{}/no_db", listen_pg_addr_str); - let me_conf: postgres::config::Config = me_connstr.parse().unwrap(); - let (host, port) = connection_host_port(&me_conf); - - // pageserver connstr is needed to be able to distinguish between different pageservers - // it is required to correctly manage callmemaybe subscriptions when more than one pageserver is involved - // TODO it is better to use some sort of a unique id instead of connection string, see https://github.com/zenithdb/zenith/issues/1105 - let callme = format!( - "callmemaybe {} {} host={} port={} options='-c ztimelineid={} ztenantid={} pageserver_connstr={}'", - tenantid, timelineid, host, port, timelineid, tenantid, pageserver_connstr, - ); - - let _ = client.simple_query(&callme).await?; - - Ok(()) -} - -pub fn thread_main(conf: SafeKeeperConf, rx: UnboundedReceiver) -> Result<()> { - let runtime = runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - - runtime.block_on(main_loop(conf, rx)) -} - -#[derive(Debug, PartialEq, Eq, Hash, Clone)] -pub struct SubscriptionStateKey { - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - pageserver_connstr: String, -} - -impl SubscriptionStateKey { - pub fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId, pageserver_connstr: String) -> Self { - Self { - tenant_id, - timeline_id, - pageserver_connstr, - } - } -} - -/// Messages to the callmemaybe thread -#[derive(Debug)] -pub enum CallmeEvent { - // add new subscription to the list - Subscribe(SubscriptionStateKey), - // remove the subscription from the list - Unsubscribe(SubscriptionStateKey), - // don't serve this subscription, but keep it in the list - Pause(SubscriptionStateKey), - // resume this subscription, if it exists, - // but don't create a new one if it is gone - Resume(SubscriptionStateKey), - // TODO how do we delete from subscriptions? 
-} - -#[derive(Debug)] -struct SubscriptionState { - tenantid: ZTenantId, - timelineid: ZTimelineId, - pageserver_connstr: String, - handle: Option>, - last_call_time: Instant, - paused: bool, -} - -impl SubscriptionState { - fn new( - tenantid: ZTenantId, - timelineid: ZTimelineId, - pageserver_connstr: String, - ) -> SubscriptionState { - SubscriptionState { - tenantid, - timelineid, - pageserver_connstr, - handle: None, - last_call_time: Instant::now(), - paused: false, - } - } - - fn pause(&mut self) { - self.paused = true; - self.abort_handle(); - } - - fn resume(&mut self) { - self.paused = false; - } - - // Most likely, the task have already successfully completed - // and abort() won't have any effect. - fn abort_handle(&mut self) { - if let Some(handle) = self.handle.take() { - handle.abort(); - - let timelineid = self.timelineid; - let tenantid = self.tenantid; - let pageserver_connstr = self.pageserver_connstr.clone(); - tokio::spawn(async move { - if let Err(err) = handle.await { - if err.is_cancelled() { - warn!("callback task for timelineid={} tenantid={} was cancelled before spawning a new one", - timelineid, tenantid); - } else { - error!( - "callback task for timelineid={} tenantid={} pageserver_connstr={} failed: {}", - timelineid, tenantid, pageserver_connstr, err - ); - } - } - }); - } - } - - fn call(&mut self, recall_period: Duration, listen_pg_addr: String) { - // Ignore call request if this subscription is paused - if self.paused { - debug!( - "ignore call request for paused subscription \ - tenantid: {}, timelineid: {}", - self.tenantid, self.timelineid - ); - return; - } - - // Check if it too early to recall - if self.handle.is_some() && self.last_call_time.elapsed() < recall_period { - debug!( - "too early to recall. self.last_call_time.elapsed: {:?}, recall_period: {:?} \ - tenantid: {}, timelineid: {}", - self.last_call_time, recall_period, self.tenantid, self.timelineid - ); - return; - } - - // If previous task didn't complete in recall_period, it must be hanging, - // so don't wait for it forever, just abort it and try again. - self.abort_handle(); - - let timelineid = self.timelineid; - let tenantid = self.tenantid; - let pageserver_connstr = self.pageserver_connstr.clone(); - self.handle = Some(tokio::spawn(async move { - request_callback(pageserver_connstr, listen_pg_addr, timelineid, tenantid) - .await - .unwrap_or_else(|e| { - error!( - "callback task for timelineid={} tenantid={} failed: {}", - timelineid, tenantid, e - ) - }); - })); - - // Update last_call_time - self.last_call_time = Instant::now(); - info!( - "new call spawned. last call time {:?} tenantid: {}, timelineid: {}", - self.last_call_time, self.tenantid, self.timelineid - ); - } -} - -impl Drop for SubscriptionState { - fn drop(&mut self) { - self.abort_handle(); - } -} - -pub async fn main_loop(conf: SafeKeeperConf, mut rx: UnboundedReceiver) -> Result<()> { - let subscriptions: Mutex> = - Mutex::new(HashMap::new()); - - let mut ticker = tokio::time::interval(conf.recall_period); - loop { - tokio::select! { - request = rx.recv() => - { - match request.context("done")? - { - CallmeEvent::Subscribe(key) => - { - let _enter = info_span!("callmemaybe: subscribe", timelineid = %key.timeline_id, tenantid = %key.tenant_id, pageserver_connstr=%key.pageserver_connstr.clone()).entered(); - let mut subscriptions = subscriptions.lock().unwrap(); - // XXX this clone is ugly, is there a way to use the trick with Borrow trait with entry API? 
- // when we switch to node id instead of the connection string key will be Copy and there will be no need to clone - match subscriptions.entry(key.clone()) { - Entry::Occupied(_) => { - // Do nothing if subscription already exists - // If it is paused it means that there is already established replication connection. - // If it is not paused it will be polled with other subscriptions when timeout expires. - // This can occur when replication channel is established before subscription is added. - info!( - "subscription already exists", - ); - } - Entry::Vacant(entry) => { - let subscription = entry.insert(SubscriptionState::new( - key.tenant_id, - key.timeline_id, - key.pageserver_connstr, - )); - subscription.call(conf.recall_period, conf.listen_pg_addr.clone()); - } - } - }, - CallmeEvent::Unsubscribe(key) => { - let _enter = debug_span!("callmemaybe: unsubscribe", timelineid = %key.timeline_id, tenantid = %key.tenant_id, pageserver_connstr=%key.pageserver_connstr.clone()).entered(); - debug!("unsubscribe"); - let mut subscriptions = subscriptions.lock().unwrap(); - subscriptions.remove(&key); - - }, - CallmeEvent::Pause(key) => { - let _enter = debug_span!("callmemaybe: pause", timelineid = %key.timeline_id, tenantid = %key.tenant_id, pageserver_connstr=%key.pageserver_connstr.clone()).entered(); - let mut subscriptions = subscriptions.lock().unwrap(); - // If pause received when no corresponding subscription exists it means that someone started replication - // without using callmemaybe. So we create subscription and pause it. - // In tenant relocation scenario subscribe call will be executed after pause when compute is restarted. - // In that case there is no need to create new/unpause existing subscription. - match subscriptions.entry(key.clone()) { - Entry::Occupied(mut sub) => { - debug!("pause existing"); - sub.get_mut().pause(); - } - Entry::Vacant(entry) => { - debug!("create paused"); - let subscription = entry.insert(SubscriptionState::new( - key.tenant_id, - key.timeline_id, - key.pageserver_connstr, - )); - subscription.pause(); - } - } - }, - CallmeEvent::Resume(key) => { - debug!( - "callmemaybe. thread_main. resume callback request for timelineid={} tenantid={} pageserver_connstr={}", - key.timeline_id, key.tenant_id, key.pageserver_connstr, - ); - let mut subscriptions = subscriptions.lock().unwrap(); - if let Some(sub) = subscriptions.get_mut(&key) - { - sub.resume(); - }; - }, - } - }, - _ = ticker.tick() => { - let _enter = debug_span!("callmemaybe: tick").entered(); - let mut subscriptions = subscriptions.lock().unwrap(); - - for (_, state) in subscriptions.iter_mut() { - state.call(conf.recall_period, conf.listen_pg_addr.clone()); - } - }, - }; - } -} From 9c846a93e84882984c0d074ebb25d62ae550be86 Mon Sep 17 00:00:00 2001 From: huming Date: Thu, 2 Jun 2022 15:03:35 +0800 Subject: [PATCH 0399/1022] chore(doc) --- .gitignore | 1 + pageserver/src/page_cache.rs | 2 +- pageserver/src/repository.rs | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index adb1b41503..291504ea81 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ __pycache__/ test_output/ .vscode +.idea /.zenith /integration_tests/.zenith diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 0c179b95c5..716df0f749 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -20,7 +20,7 @@ //! assign a buffer for a page, you must hold the mapping lock and the lock on //! the slot at the same time. //! -//! 
Whenever you need to hold both locks simultenously, the slot lock must be +//! Whenever you need to hold both locks simultaneously, the slot lock must be //! acquired first. This consistent ordering avoids deadlocks. To look up a page //! in the cache, you would first look up the mapping, while holding the mapping //! lock, and then lock the slot. You must release the mapping lock in between, diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 9d5056cd16..f687f24c6e 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -195,6 +195,7 @@ impl Display for TimelineSyncStatusUpdate { f.write_str(s) } } + /// /// A repository corresponds to one .zenith directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. @@ -242,7 +243,7 @@ pub trait Repository: Send + Sync { /// /// 'timelineid' specifies the timeline to GC, or None for all. /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). - /// `checkpoint_before_gc` parameter is used to force compaction of storage before CG + /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC /// to make tests more deterministic. /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? fn gc_iteration( From 9e108102b3e4a4c4e7d881e75bdad62d2c1833bf Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 3 Jun 2022 14:08:56 +0400 Subject: [PATCH 0400/1022] Silence etcd safekeeper info key parse errors. When we subscribe to everything, it is ok to receive not only safekeeper timeline updates. --- libs/etcd_broker/src/lib.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index daa9c513c2..81353450e0 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -65,7 +65,9 @@ pub struct SkTimelineInfo { pub enum BrokerError { #[error("Etcd client error: {0}. 
Context: {1}")] EtcdClient(etcd_client::Error, String), - #[error("Error during parsing etcd data: {0}")] + #[error("Error during parsing etcd key: {0}")] + InvalidKey(String), + #[error("Error during parsing etcd value: {0}")] ParsingError(String), #[error("Internal error: {0}")] InternalError(String), @@ -221,7 +223,7 @@ pub async fn subscribe_to_safekeeper_timeline_updates( }, }; - match parse_etcd_key_value(&subscription, key_str, value_str) { + match parse_safekeeper_timeline(&subscription, key_str, value_str) { Ok((zttid, timeline)) => { match timeline_updates .entry(zttid) @@ -243,6 +245,8 @@ pub async fn subscribe_to_safekeeper_timeline_updates( } } } + // it is normal to get other keys when we subscribe to everything + Err(BrokerError::InvalidKey(e)) => debug!("Unexpected key for timeline update: {e}"), Err(e) => error!("Failed to parse timeline update: {e}"), }; } @@ -281,14 +285,14 @@ static SK_TIMELINE_KEY_REGEX: Lazy = Lazy::new(|| { .expect("wrong regex for safekeeper timeline etcd key") }); -fn parse_etcd_key_value( +fn parse_safekeeper_timeline( subscription: &SkTimelineSubscriptionKind, key_str: &str, value_str: &str, ) -> Result<(ZTenantTimelineId, SafekeeperTimeline), BrokerError> { let broker_prefix = subscription.broker_etcd_prefix.as_str(); if !key_str.starts_with(broker_prefix) { - return Err(BrokerError::ParsingError(format!( + return Err(BrokerError::InvalidKey(format!( "KV has unexpected key '{key_str}' that does not start with broker prefix {broker_prefix}" ))); } @@ -297,7 +301,7 @@ fn parse_etcd_key_value( let key_captures = match SK_TIMELINE_KEY_REGEX.captures(key_part) { Some(captures) => captures, None => { - return Err(BrokerError::ParsingError(format!( + return Err(BrokerError::InvalidKey(format!( "KV has unexpected key part '{key_part}' that does not match required regex {}", SK_TIMELINE_KEY_REGEX.as_str() ))); @@ -383,7 +387,7 @@ mod tests { &timeline_subscription, ] { let (id, _timeline) = - parse_etcd_key_value(subscription, &key_string, value_str) + parse_safekeeper_timeline(subscription, &key_string, value_str) .unwrap_or_else(|e| panic!("Should be able to parse etcd key string '{key_string}' and etcd value string '{value_str}' for subscription {subscription:?}, but got: {e}")); assert_eq!(id, ZTenantTimelineId::new(tenant_id, timeline_id)); } From 70a53c4b0316e3f35cf9f558590984db7895beba Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 3 Jun 2022 14:10:34 +0400 Subject: [PATCH 0401/1022] Get backup test_safekeeper_normal_work, but skip by default. It is handy for development. 
--- test_runner/batch_others/test_wal_acceptor.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 1932c3e450..3c74c729a1 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -18,6 +18,26 @@ from fixtures.log_helper import log from typing import List, Optional, Any +# basic test, write something in setup with 3 wal acceptors, ensure that commits +# succeed and data is written +@pytest.mark.skip(reason="simple test for development") +def test_safekeeper_normal_work(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch('test_safekeepers_normal_work') + pg = env.postgres.create_start('test_safekeepers_normal_work') + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + # we rely upon autocommit after each statement + # as waiting for acceptors happens there + cur.execute('CREATE TABLE t(key int primary key, value text)') + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + cur.execute('SELECT sum(key) FROM t') + assert cur.fetchone() == (5000050000, ) + + @dataclass class TimelineMetrics: timeline_id: str From 262319387619c18f700b2b03c7e3ad4fc25daa9e Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 2 Jun 2022 23:26:28 +0300 Subject: [PATCH 0402/1022] Remove pageserver_connstr from WAL stream logic --- libs/etcd_broker/src/lib.rs | 2 -- pageserver/src/walreceiver.rs | 50 +++-------------------------------- safekeeper/src/handler.rs | 18 ++++--------- safekeeper/src/receive_wal.rs | 14 ++-------- safekeeper/src/send_wal.rs | 3 +-- safekeeper/src/timeline.rs | 42 ++--------------------------- 6 files changed, 13 insertions(+), 116 deletions(-) diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 81353450e0..6b3293ec40 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -57,8 +57,6 @@ pub struct SkTimelineInfo { pub peer_horizon_lsn: Option, #[serde(default)] pub safekeeper_connstr: Option, - #[serde(default)] - pub pageserver_connstr: Option, } #[derive(Debug, thiserror::Error)] diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 11c8617a57..202a13545d 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -659,7 +659,6 @@ impl WalConnectionManager { match wal_stream_connection_string( self.id, info.safekeeper_connstr.as_deref()?, - info.pageserver_connstr.as_deref()?, ) { Ok(connstr) => Some((sk_id, info, connstr)), Err(e) => { @@ -749,7 +748,6 @@ fn wal_stream_connection_string( timeline_id, }: ZTenantTimelineId, listen_pg_addr_str: &str, - pageserver_connstr: &str, ) -> anyhow::Result { let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db"); let me_conf = sk_connstr @@ -759,7 +757,7 @@ fn wal_stream_connection_string( })?; let (host, port) = utils::connstring::connection_host_port(&me_conf); Ok(format!( - "host={host} port={port} options='-c ztimelineid={timeline_id} ztenantid={tenant_id} pageserver_connstr={pageserver_connstr}'", + "host={host} port={port} options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'" )) } @@ -792,20 +790,6 @@ mod tests { remote_consistent_lsn: None, peer_horizon_lsn: None, safekeeper_connstr: None, - pageserver_connstr: Some("no safekeeper_connstr".to_string()), - }, - ), - ( - NodeId(1), - SkTimelineInfo { - 
last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(1)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some("no pageserver_connstr".to_string()), - pageserver_connstr: None, }, ), ( @@ -818,7 +802,6 @@ mod tests { remote_consistent_lsn: None, peer_horizon_lsn: None, safekeeper_connstr: Some("no commit_lsn".to_string()), - pageserver_connstr: Some("no commit_lsn (p)".to_string()), }, ), ( @@ -831,7 +814,6 @@ mod tests { remote_consistent_lsn: None, peer_horizon_lsn: None, safekeeper_connstr: Some("no commit_lsn".to_string()), - pageserver_connstr: Some("no commit_lsn (p)".to_string()), }, ), ])); @@ -887,7 +869,6 @@ mod tests { remote_consistent_lsn: None, peer_horizon_lsn: None, safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), - pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), }, ), ( @@ -900,7 +881,6 @@ mod tests { remote_consistent_lsn: None, peer_horizon_lsn: None, safekeeper_connstr: Some("not advanced Lsn".to_string()), - pageserver_connstr: Some("not advanced Lsn (p)".to_string()), }, ), ( @@ -915,7 +895,6 @@ mod tests { remote_consistent_lsn: None, peer_horizon_lsn: None, safekeeper_connstr: Some("not enough advanced Lsn".to_string()), - pageserver_connstr: Some("not enough advanced Lsn (p)".to_string()), }, ), ])); @@ -947,7 +926,6 @@ mod tests { remote_consistent_lsn: None, peer_horizon_lsn: None, safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), - pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), }, )])) .expect("Expected one candidate selected out of the only data option, but got none"); @@ -960,9 +938,6 @@ mod tests { assert!(only_candidate .wal_producer_connstr .contains(DUMMY_SAFEKEEPER_CONNSTR)); - assert!(only_candidate - .wal_producer_connstr - .contains(DUMMY_PAGESERVER_CONNSTR)); let selected_lsn = 100_000; let biggest_wal_candidate = data_manager_with_no_connection @@ -977,7 +952,6 @@ mod tests { remote_consistent_lsn: None, peer_horizon_lsn: None, safekeeper_connstr: Some("smaller commit_lsn".to_string()), - pageserver_connstr: Some("smaller commit_lsn (p)".to_string()), }, ), ( @@ -990,7 +964,6 @@ mod tests { remote_consistent_lsn: None, peer_horizon_lsn: None, safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), - pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), }, ), ( @@ -1003,9 +976,6 @@ mod tests { remote_consistent_lsn: None, peer_horizon_lsn: None, safekeeper_connstr: None, - pageserver_connstr: Some( - "no safekeeper_connstr despite bigger commit_lsn".to_string(), - ), }, ), ])) @@ -1022,9 +992,6 @@ mod tests { assert!(biggest_wal_candidate .wal_producer_connstr .contains(DUMMY_SAFEKEEPER_CONNSTR)); - assert!(biggest_wal_candidate - .wal_producer_connstr - .contains(DUMMY_PAGESERVER_CONNSTR)); Ok(()) } @@ -1071,7 +1038,6 @@ mod tests { remote_consistent_lsn: None, peer_horizon_lsn: None, safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), - pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), }, ), ( @@ -1084,7 +1050,6 @@ mod tests { remote_consistent_lsn: None, peer_horizon_lsn: None, safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()), - pageserver_connstr: Some("advanced by Lsn safekeeper (p)".to_string()), }, ), ]); @@ -1108,9 +1073,6 @@ mod tests { assert!(over_threshcurrent_candidate .wal_producer_connstr .contains("advanced by Lsn safekeeper")); - assert!(over_threshcurrent_candidate - .wal_producer_connstr - .contains("advanced by Lsn safekeeper (p)")); Ok(()) } @@ 
-1146,7 +1108,6 @@ mod tests { remote_consistent_lsn: None, peer_horizon_lsn: None, safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), - pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), }, )])) .expect( @@ -1168,9 +1129,6 @@ mod tests { assert!(over_threshcurrent_candidate .wal_producer_connstr .contains(DUMMY_SAFEKEEPER_CONNSTR)); - assert!(over_threshcurrent_candidate - .wal_producer_connstr - .contains(DUMMY_PAGESERVER_CONNSTR)); Ok(()) } @@ -1197,7 +1155,6 @@ mod tests { } const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr"; - const DUMMY_PAGESERVER_CONNSTR: &str = "pageserver_connstr"; // the function itself does not need async, but it spawns a tokio::task underneath hence neeed // a runtime to not to panic @@ -1205,9 +1162,8 @@ mod tests { id: ZTenantTimelineId, safekeeper_id: NodeId, ) -> WalConnectionData { - let dummy_connstr = - wal_stream_connection_string(id, DUMMY_SAFEKEEPER_CONNSTR, DUMMY_PAGESERVER_CONNSTR) - .expect("Failed to construct dummy wal producer connstr"); + let dummy_connstr = wal_stream_connection_string(id, DUMMY_SAFEKEEPER_CONNSTR) + .expect("Failed to construct dummy wal producer connstr"); WalConnectionData { safekeeper_id, connection: WalReceiverConnection::open( diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 9af78661f9..a8121e829e 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -29,12 +29,11 @@ pub struct SafekeeperPostgresHandler { pub ztenantid: Option, pub ztimelineid: Option, pub timeline: Option>, - pageserver_connstr: Option, } /// Parsed Postgres command. enum SafekeeperPostgresCommand { - StartWalPush { pageserver_connstr: Option }, + StartWalPush, StartReplication { start_lsn: Lsn }, IdentifySystem, JSONCtrl { cmd: AppendLogicalMessage }, @@ -42,11 +41,7 @@ enum SafekeeperPostgresCommand { fn parse_cmd(cmd: &str) -> Result { if cmd.starts_with("START_WAL_PUSH") { - let re = Regex::new(r"START_WAL_PUSH(?: (.+))?").unwrap(); - - let caps = re.captures(cmd).unwrap(); - let pageserver_connstr = caps.get(1).map(|m| m.as_str().to_owned()); - Ok(SafekeeperPostgresCommand::StartWalPush { pageserver_connstr }) + Ok(SafekeeperPostgresCommand::StartWalPush) } else if cmd.starts_with("START_REPLICATION") { let re = Regex::new(r"START_REPLICATION(?: PHYSICAL)? 
([[:xdigit:]]+/[[:xdigit:]]+)").unwrap(); @@ -86,8 +81,6 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { self.appname = Some(app_name.clone()); } - self.pageserver_connstr = params.get("pageserver_connstr").cloned(); - Ok(()) } else { bail!("Safekeeper received unexpected initial message: {:?}", sm); @@ -113,14 +106,14 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { } match cmd { - SafekeeperPostgresCommand::StartWalPush { pageserver_connstr } => { - ReceiveWalConn::new(pgb, pageserver_connstr) + SafekeeperPostgresCommand::StartWalPush => { + ReceiveWalConn::new(pgb) .run(self) .context("failed to run ReceiveWalConn")?; } SafekeeperPostgresCommand::StartReplication { start_lsn } => { ReplicationConn::new(pgb) - .run(self, pgb, start_lsn, self.pageserver_connstr.clone()) + .run(self, pgb, start_lsn) .context("failed to run ReplicationConn")?; } SafekeeperPostgresCommand::IdentifySystem => { @@ -142,7 +135,6 @@ impl SafekeeperPostgresHandler { ztenantid: None, ztimelineid: None, timeline: None, - pageserver_connstr: None, } } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 88b7816912..af4cfb6ba4 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -32,22 +32,14 @@ pub struct ReceiveWalConn<'pg> { pg_backend: &'pg mut PostgresBackend, /// The cached result of `pg_backend.socket().peer_addr()` (roughly) peer_addr: SocketAddr, - /// Pageserver connection string forwarded from compute - /// NOTE that it is allowed to operate without a pageserver. - /// So if compute has no pageserver configured do not use it. - pageserver_connstr: Option, } impl<'pg> ReceiveWalConn<'pg> { - pub fn new( - pg: &'pg mut PostgresBackend, - pageserver_connstr: Option, - ) -> ReceiveWalConn<'pg> { + pub fn new(pg: &'pg mut PostgresBackend) -> ReceiveWalConn<'pg> { let peer_addr = *pg.get_peer_addr(); ReceiveWalConn { pg_backend: pg, peer_addr, - pageserver_connstr, } } @@ -120,9 +112,7 @@ impl<'pg> ReceiveWalConn<'pg> { // Register the connection and defer unregister. Do that only // after processing first message, as it sets wal_seg_size, // wanted by many. - spg.timeline - .get() - .on_compute_connect(self.pageserver_connstr.as_ref())?; + spg.timeline.get().on_compute_connect()?; _guard = Some(ComputeConnectionGuard { timeline: Arc::clone(spg.timeline.get()), }); diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 7a6a8ca9b9..fd82a55efa 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -162,9 +162,8 @@ impl ReplicationConn { spg: &mut SafekeeperPostgresHandler, pgb: &mut PostgresBackend, mut start_pos: Lsn, - pageserver_connstr: Option, ) -> Result<()> { - let _enter = info_span!("WAL sender", timeline = %spg.ztimelineid.unwrap(), pageserver_connstr = %pageserver_connstr.as_deref().unwrap_or_default()).entered(); + let _enter = info_span!("WAL sender", timeline = %spg.ztimelineid.unwrap()).entered(); // spawn the background thread which receives HotStandbyFeedback messages. let bg_timeline = Arc::clone(spg.timeline.get()); diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index b7a549fef8..30c94f2543 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -95,7 +95,6 @@ struct SharedState { /// when tli is inactive instead of having this flag. 
active: bool, num_computes: u32, - pageserver_connstr: Option, last_removed_segno: XLogSegNo, } @@ -119,7 +118,6 @@ impl SharedState { wal_backup_active: false, active: false, num_computes: 0, - pageserver_connstr: None, last_removed_segno: 0, }) } @@ -139,7 +137,6 @@ impl SharedState { wal_backup_active: false, active: false, num_computes: 0, - pageserver_connstr: None, last_removed_segno: 0, }) } @@ -190,35 +187,6 @@ impl SharedState { self.wal_backup_active } - /// Activate timeline's walsender: start/change timeline information propagated into etcd for further pageserver connections. - fn activate_walsender( - &mut self, - zttid: &ZTenantTimelineId, - new_pageserver_connstr: Option, - ) { - if self.pageserver_connstr != new_pageserver_connstr { - self.deactivate_walsender(zttid); - - if new_pageserver_connstr.is_some() { - info!( - "timeline {} has activated its walsender with connstr {new_pageserver_connstr:?}", - zttid.timeline_id, - ); - } - self.pageserver_connstr = new_pageserver_connstr; - } - } - - /// Deactivate the timeline: stop sending the timeline data into etcd, so no pageserver can connect for WAL streaming. - fn deactivate_walsender(&mut self, zttid: &ZTenantTimelineId) { - if let Some(pageserver_connstr) = self.pageserver_connstr.take() { - info!( - "timeline {} had deactivated its wallsender with connstr {pageserver_connstr:?}", - zttid.timeline_id, - ) - } - } - fn get_wal_seg_size(&self) -> usize { self.sk.state.server.wal_seg_size as usize } @@ -318,17 +286,12 @@ impl Timeline { /// Register compute connection, starting timeline-related activity if it is /// not running yet. /// Can fail only if channel to a static thread got closed, which is not normal at all. - pub fn on_compute_connect(&self, pageserver_connstr: Option<&String>) -> Result<()> { + pub fn on_compute_connect(&self) -> Result<()> { let is_wal_backup_action_pending: bool; { let mut shared_state = self.mutex.lock().unwrap(); shared_state.num_computes += 1; is_wal_backup_action_pending = shared_state.update_status(); - // FIXME: currently we always adopt latest pageserver connstr, but we - // should have kind of generations assigned by compute to distinguish - // the latest one or even pass it through consensus to reliably deliver - // to all safekeepers. - shared_state.activate_walsender(&self.zttid, pageserver_connstr.cloned()); } // Wake up wal backup launcher, if offloading not started yet. if is_wal_backup_action_pending { @@ -364,7 +327,7 @@ impl Timeline { (replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); if stop { - shared_state.deactivate_walsender(&self.zttid); + shared_state.update_status(); return Ok(true); } } @@ -525,7 +488,6 @@ impl Timeline { )), peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn), safekeeper_connstr: Some(conf.listen_pg_addr.clone()), - pageserver_connstr: shared_state.pageserver_connstr.clone(), backup_lsn: Some(shared_state.sk.inmem.backup_lsn), }) } From 5a723d44cd55673db8c65aa100438af6cd7c3f02 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 3 Jun 2022 17:33:42 +0400 Subject: [PATCH 0403/1022] Parametrize test_normal_work. I like to run small test locally, but let's avoid duplication. 
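Since the parametrization added below encodes the configuration in the generated test ID (the single (num_timelines, num_safekeepers) = (3, 1) combination becomes test_normal_work[3-1]), a small run can be selected directly by that ID. As an illustration based on standard pytest ID generation, not an invocation the patch itself prescribes:

    pytest "test_runner/batch_others/test_normal_work.py::test_normal_work[3-1]"
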
--- test_runner/batch_others/test_normal_work.py | 7 +++++-- test_runner/batch_others/test_wal_acceptor.py | 20 ------------------- 2 files changed, 5 insertions(+), 22 deletions(-) diff --git a/test_runner/batch_others/test_normal_work.py b/test_runner/batch_others/test_normal_work.py index aac9685681..c0f44ce7a9 100644 --- a/test_runner/batch_others/test_normal_work.py +++ b/test_runner/batch_others/test_normal_work.py @@ -1,5 +1,6 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient +import pytest def check_tenant(env: NeonEnv, pageserver_http: NeonPageserverHttpClient): @@ -26,7 +27,8 @@ def check_tenant(env: NeonEnv, pageserver_http: NeonPageserverHttpClient): pageserver_http.timeline_detach(tenant_id, timeline_id) -def test_normal_work(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize('num_timelines,num_safekeepers', [(3, 1)]) +def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_safekeepers: int): """ Basic test: * create new tenant with a timeline @@ -41,7 +43,8 @@ def test_normal_work(neon_env_builder: NeonEnvBuilder): """ env = neon_env_builder.init_start() + neon_env_builder.num_safekeepers = num_safekeepers pageserver_http = env.pageserver.http_client() - for _ in range(3): + for _ in range(num_timelines): check_tenant(env, pageserver_http) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 3c74c729a1..1932c3e450 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -18,26 +18,6 @@ from fixtures.log_helper import log from typing import List, Optional, Any -# basic test, write something in setup with 3 wal acceptors, ensure that commits -# succeed and data is written -@pytest.mark.skip(reason="simple test for development") -def test_safekeeper_normal_work(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - env.neon_cli.create_branch('test_safekeepers_normal_work') - pg = env.postgres.create_start('test_safekeepers_normal_work') - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - cur.execute('CREATE TABLE t(key int primary key, value text)') - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (5000050000, ) - - @dataclass class TimelineMetrics: timeline_id: str From e442f5357bd049511e72aaa7d6352c4c08159733 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 3 Jun 2022 15:21:01 +0300 Subject: [PATCH 0404/1022] unify two identical failpoints in flush_frozen_layer probably is a merge artfact --- pageserver/src/layered_repository.rs | 4 +--- test_runner/batch_others/test_ancestor_branch.py | 2 +- test_runner/batch_others/test_recovery.py | 3 ++- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 84ef2aa380..7696f0d021 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1727,9 +1727,7 @@ impl LayeredTimeline { new_delta_path.clone(), self.conf.timeline_path(&self.timeline_id, &self.tenant_id), ])?; - fail_point!("checkpoint-before-sync"); - - fail_point!("flush-frozen"); + fail_point!("flush-frozen-before-sync"); // Finally, replace the frozen in-memory 
layer with the new on-disk layer { diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index 3a16157093..656428e5df 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -24,7 +24,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): 'compaction_target_size': '4194304', }) - env.pageserver.safe_psql("failpoints flush-frozen=sleep(10000)") + env.pageserver.safe_psql("failpoints flush-frozen-before-sync=sleep(10000)") pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() diff --git a/test_runner/batch_others/test_recovery.py b/test_runner/batch_others/test_recovery.py index 14d1adf25d..5ba783b802 100644 --- a/test_runner/batch_others/test_recovery.py +++ b/test_runner/batch_others/test_recovery.py @@ -45,7 +45,8 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): # Configure failpoints pscur.execute( - "failpoints checkpoint-before-sync=sleep(2000);checkpoint-after-sync=exit") + "failpoints flush-frozen-before-sync=sleep(2000);checkpoint-after-sync=exit" + ) # Do some updates until pageserver is crashed try: From 92de8423afdab2191da8bfce89e7b046532e37cf Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Sun, 5 Jun 2022 09:18:11 -0400 Subject: [PATCH 0405/1022] Remove dead code (#1886) --- .../src/remote_storage/storage_sync/delete.rs | 223 ------------------ 1 file changed, 223 deletions(-) delete mode 100644 pageserver/src/remote_storage/storage_sync/delete.rs diff --git a/pageserver/src/remote_storage/storage_sync/delete.rs b/pageserver/src/remote_storage/storage_sync/delete.rs deleted file mode 100644 index 6fb1d254c4..0000000000 --- a/pageserver/src/remote_storage/storage_sync/delete.rs +++ /dev/null @@ -1,223 +0,0 @@ -//! Timeline synchronization logic to delete a bulk of timeline's remote files from the remote storage. - -use anyhow::Context; -use futures::stream::{FuturesUnordered, StreamExt}; -use tracing::{debug, error, info}; -use utils::zid::ZTenantTimelineId; - -use crate::remote_storage::{ - storage_sync::{SyncQueue, SyncTask}, - RemoteStorage, -}; - -use super::{LayersDeletion, SyncData}; - -/// Attempts to remove the timleline layers from the remote storage. -/// If the task had not adjusted the metadata before, the deletion will fail. 
-pub(super) async fn delete_timeline_layers<'a, P, S>( - storage: &'a S, - sync_queue: &SyncQueue, - sync_id: ZTenantTimelineId, - mut delete_data: SyncData, -) -> bool -where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ - if !delete_data.data.deletion_registered { - error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing"); - delete_data.retries += 1; - sync_queue.push(sync_id, SyncTask::Delete(delete_data)); - return false; - } - - if delete_data.data.layers_to_delete.is_empty() { - info!("No layers to delete, skipping"); - return true; - } - - let layers_to_delete = delete_data - .data - .layers_to_delete - .drain() - .collect::>(); - debug!("Layers to delete: {layers_to_delete:?}"); - info!("Deleting {} timeline layers", layers_to_delete.len()); - - let mut delete_tasks = layers_to_delete - .into_iter() - .map(|local_layer_path| async { - let storage_path = match storage.storage_path(&local_layer_path).with_context(|| { - format!( - "Failed to get the layer storage path for local path '{}'", - local_layer_path.display() - ) - }) { - Ok(path) => path, - Err(e) => return Err((e, local_layer_path)), - }; - - match storage.delete(&storage_path).await.with_context(|| { - format!( - "Failed to delete remote layer from storage at '{:?}'", - storage_path - ) - }) { - Ok(()) => Ok(local_layer_path), - Err(e) => Err((e, local_layer_path)), - } - }) - .collect::>(); - - let mut errored = false; - while let Some(deletion_result) = delete_tasks.next().await { - match deletion_result { - Ok(local_layer_path) => { - debug!( - "Successfully deleted layer {} for timeline {sync_id}", - local_layer_path.display() - ); - delete_data.data.deleted_layers.insert(local_layer_path); - } - Err((e, local_layer_path)) => { - errored = true; - error!( - "Failed to delete layer {} for timeline {sync_id}: {e:?}", - local_layer_path.display() - ); - delete_data.data.layers_to_delete.insert(local_layer_path); - } - } - } - - if errored { - debug!("Reenqueuing failed delete task for timeline {sync_id}"); - delete_data.retries += 1; - sync_queue.push(sync_id, SyncTask::Delete(delete_data)); - } - errored -} - -#[cfg(test)] -mod tests { - use std::{collections::HashSet, num::NonZeroUsize}; - - use itertools::Itertools; - use tempfile::tempdir; - use tokio::fs; - use utils::lsn::Lsn; - - use crate::{ - remote_storage::{ - storage_sync::test_utils::{create_local_timeline, dummy_metadata}, - LocalFs, - }, - repository::repo_harness::{RepoHarness, TIMELINE_ID}, - }; - - use super::*; - - #[tokio::test] - async fn delete_timeline_negative() -> anyhow::Result<()> { - let harness = RepoHarness::create("delete_timeline_negative")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; - - let deleted = delete_timeline_layers( - &storage, - &sync_queue, - sync_id, - SyncData { - retries: 1, - data: LayersDeletion { - deleted_layers: HashSet::new(), - layers_to_delete: HashSet::new(), - deletion_registered: false, - }, - }, - ) - .await; - - assert!( - !deleted, - "Should not start the deletion for task with delete metadata unregistered" - ); - - Ok(()) - } - - #[tokio::test] - async fn delete_timeline() -> anyhow::Result<()> { - let harness = RepoHarness::create("delete_timeline")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - - let 
sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let layer_files = ["a", "b", "c", "d"]; - let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; - let current_retries = 3; - let metadata = dummy_metadata(Lsn(0x30)); - let local_timeline_path = harness.timeline_path(&TIMELINE_ID); - let timeline_upload = - create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; - for local_path in timeline_upload.layers_to_upload { - let remote_path = storage.storage_path(&local_path)?; - let remote_parent_dir = remote_path.parent().unwrap(); - if !remote_parent_dir.exists() { - fs::create_dir_all(&remote_parent_dir).await?; - } - fs::copy(&local_path, &remote_path).await?; - } - assert_eq!( - storage - .list() - .await? - .into_iter() - .map(|remote_path| storage.local_path(&remote_path).unwrap()) - .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) - .sorted() - .collect::>(), - layer_files - .iter() - .map(|layer_str| layer_str.to_string()) - .sorted() - .collect::>(), - "Expect to have all layer files remotely before deletion" - ); - - let deleted = delete_timeline_layers( - &storage, - &sync_queue, - sync_id, - SyncData { - retries: current_retries, - data: LayersDeletion { - deleted_layers: HashSet::new(), - layers_to_delete: HashSet::from([ - local_timeline_path.join("a"), - local_timeline_path.join("c"), - local_timeline_path.join("something_different"), - ]), - deletion_registered: true, - }, - }, - ) - .await; - assert!(deleted, "Should be able to delete timeline files"); - - assert_eq!( - storage - .list() - .await? - .into_iter() - .map(|remote_path| storage.local_path(&remote_path).unwrap()) - .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) - .sorted() - .collect::>(), - vec!["b".to_string(), "d".to_string()], - "Expect to have only non-deleted files remotely" - ); - - Ok(()) - } -} From fecad1ca340f30fcffcc0fb306c63b1140ce18ba Mon Sep 17 00:00:00 2001 From: KlimentSerafimov Date: Mon, 6 Jun 2022 14:14:41 +0200 Subject: [PATCH 0406/1022] Resolving issue #1745. Added cluster option for SNI data (#1813) * Added project option in case SNI data is missing. Resolving issue #1745. * Added invariant checking for project name: if both sni_data and project_name are available then they should match. --- libs/utils/src/pq_proto.rs | 9 ++++++- proxy/src/auth/credentials.rs | 46 +++++++++++++++++++++++++++++------ 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index a36e8342b0..599af3fc68 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -269,7 +269,14 @@ impl FeStartupPacket { .next() .context("expected even number of params in StartupMessage")?; if name == "options" { - // deprecated way of passing params as cmd line args + // parsing options arguments "...&options=%3D+=..." + // '%3D' is '=' and '+' is ' ' + + // Note: we allow users that don't have SNI capabilities, + // to pass a special keyword argument 'project' + // to be used to determine the cluster name by the proxy. + + //TODO: write unit test for this and refactor in its own function. 
for cmdopt in value.split(' ') { let nameval: Vec<&str> = cmdopt.split('=').collect(); if nameval.len() == 2 { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 467e7db282..6521162b50 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -26,6 +26,11 @@ pub struct ClientCredentials { // New console API requires SNI info to determine the cluster name. // Other Auth backends don't need it. pub sni_data: Option, + + // project_name is passed as argument from options from url. + // In case sni_data is missing: project_name is used to determine cluster name. + // In case sni_data is available: project_name and sni_data should match (otherwise throws an error). + pub project_name: Option, } impl ClientCredentials { @@ -37,22 +42,47 @@ impl ClientCredentials { #[derive(Debug, Error)] pub enum ProjectNameError { - #[error("SNI is missing, please upgrade the postgres client library")] + #[error("SNI is missing. EITHER please upgrade the postgres client library OR pass the project name as a parameter: '...&options=project%3D...'.")] Missing, - #[error("SNI is malformed")] + #[error("SNI is malformed.")] Bad, + + #[error("Inconsistent project name inferred from SNI and project option. String from SNI: '{0}', String from project option: '{1}'")] + Inconsistent(String, String), } impl UserFacingError for ProjectNameError {} impl ClientCredentials { - /// Determine project name from SNI. + /// Determine project name from SNI or from project_name parameter from options argument. pub fn project_name(&self) -> Result<&str, ProjectNameError> { - // Currently project name is passed as a top level domain - let sni = self.sni_data.as_ref().ok_or(ProjectNameError::Missing)?; - let (first, _) = sni.split_once('.').ok_or(ProjectNameError::Bad)?; - Ok(first) + // Checking that if both sni_data and project_name are set, then they should match + // otherwise, throws a ProjectNameError::Inconsistent error. + if let Some(sni_data) = &self.sni_data { + let project_name_from_sni_data = + sni_data.split_once('.').ok_or(ProjectNameError::Bad)?.0; + if let Some(project_name_from_options) = &self.project_name { + if !project_name_from_options.eq(project_name_from_sni_data) { + return Err(ProjectNameError::Inconsistent( + project_name_from_sni_data.to_string(), + project_name_from_options.to_string(), + )); + } + } + } + // determine the project name from self.sni_data if it exists, otherwise from self.project_name. + let ret = match &self.sni_data { + // if sni_data exists, use it to determine project name + Some(sni_data) => sni_data.split_once('.').ok_or(ProjectNameError::Bad)?.0, + // otherwise use project_option if it was manually set thought options parameter. + None => self + .project_name + .as_ref() + .ok_or(ProjectNameError::Missing)? + .as_str(), + }; + Ok(ret) } } @@ -68,11 +98,13 @@ impl TryFrom> for ClientCredentials { let user = get_param("user")?; let dbname = get_param("database")?; + let project_name = get_param("project").ok(); Ok(Self { user, dbname, sni_data: None, + project_name, }) } } From 6cfebc096f8e10bb454992875da118a93dd23bce Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Mon, 6 Jun 2022 12:32:10 -0400 Subject: [PATCH 0407/1022] Add read/write throughput performance tests (#1883) Part of #1467 This PR adds several performance tests that compare the [PG statistics](https://www.postgresql.org/docs/current/monitoring-stats.html) obtained when running PG benchmarks against Neon and vanilla PG to measure the read/write throughput of the DB. 
--- test_runner/conftest.py | 11 +- test_runner/fixtures/compare_fixtures.py | 28 ++++- test_runner/fixtures/pg_stats.py | 52 +++++++++ .../performance/test_compare_pg_stats.py | 101 ++++++++++++++++++ test_runner/performance/test_perf_pgbench.py | 10 +- 5 files changed, 190 insertions(+), 12 deletions(-) create mode 100644 test_runner/fixtures/pg_stats.py create mode 100644 test_runner/performance/test_compare_pg_stats.py diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 9569ff5674..c6e6289a5c 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,6 +1,5 @@ -pytest_plugins = ( - "fixtures.neon_fixtures", - "fixtures.benchmark_fixture", - "fixtures.compare_fixtures", - "fixtures.slow", -) +pytest_plugins = ("fixtures.neon_fixtures", + "fixtures.benchmark_fixture", + "fixtures.compare_fixtures", + "fixtures.slow", + "fixtures.pg_stats") diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index b04a038a50..9808d83492 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -1,12 +1,13 @@ import pytest from contextlib import contextmanager from abc import ABC, abstractmethod +from fixtures.pg_stats import PgStatTable from fixtures.neon_fixtures import PgBin, PgProtocol, VanillaPostgres, RemotePostgres, NeonEnv from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker # Type-related stuff -from typing import Iterator +from typing import Dict, List class PgCompare(ABC): @@ -51,6 +52,31 @@ class PgCompare(ABC): def record_duration(self, out_name): pass + @contextmanager + def record_pg_stats(self, pg_stats: List[PgStatTable]): + init_data = self._retrieve_pg_stats(pg_stats) + + yield + + data = self._retrieve_pg_stats(pg_stats) + + for k in set(init_data) & set(data): + self.zenbenchmark.record(k, data[k] - init_data[k], '', MetricReport.HIGHER_IS_BETTER) + + def _retrieve_pg_stats(self, pg_stats: List[PgStatTable]) -> Dict[str, int]: + results: Dict[str, int] = {} + + with self.pg.connect().cursor() as cur: + for pg_stat in pg_stats: + cur.execute(pg_stat.query) + row = cur.fetchone() + assert len(row) == len(pg_stat.columns) + + for col, val in zip(pg_stat.columns, row): + results[f"{pg_stat.table}.{col}"] = int(val) + + return results + class NeonCompare(PgCompare): """PgCompare interface for the neon stack.""" diff --git a/test_runner/fixtures/pg_stats.py b/test_runner/fixtures/pg_stats.py new file mode 100644 index 0000000000..e113d37248 --- /dev/null +++ b/test_runner/fixtures/pg_stats.py @@ -0,0 +1,52 @@ +from typing import List + +import pytest + + +class PgStatTable: + table: str + columns: List[str] + additional_query: str + + def __init__(self, table: str, columns: List[str], filter_query: str = ""): + self.table = table + self.columns = columns + self.additional_query = filter_query + + @property + def query(self) -> str: + return f"SELECT {','.join(self.columns)} FROM {self.table} {self.additional_query}" + + +@pytest.fixture(scope='function') +def pg_stats_rw() -> List[PgStatTable]: + return [ + PgStatTable("pg_stat_database", + ["tup_returned", "tup_fetched", "tup_inserted", "tup_updated", "tup_deleted"], + "WHERE datname='postgres'"), + ] + + +@pytest.fixture(scope='function') +def pg_stats_ro() -> List[PgStatTable]: + return [ + PgStatTable("pg_stat_database", ["tup_returned", "tup_fetched"], + "WHERE datname='postgres'"), + ] + + +@pytest.fixture(scope='function') +def pg_stats_wo() -> List[PgStatTable]: + return [ + 
PgStatTable("pg_stat_database", ["tup_inserted", "tup_updated", "tup_deleted"], + "WHERE datname='postgres'"), + ] + + +@pytest.fixture(scope='function') +def pg_stats_wal() -> List[PgStatTable]: + return [ + PgStatTable("pg_stat_wal", + ["wal_records", "wal_fpi", "wal_bytes", "wal_buffers_full", "wal_write"], + "") + ] diff --git a/test_runner/performance/test_compare_pg_stats.py b/test_runner/performance/test_compare_pg_stats.py new file mode 100644 index 0000000000..798974eac2 --- /dev/null +++ b/test_runner/performance/test_compare_pg_stats.py @@ -0,0 +1,101 @@ +import os +from typing import List + +import pytest +from fixtures.compare_fixtures import PgCompare +from fixtures.pg_stats import PgStatTable + +from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix + + +def get_seeds_matrix(default: int = 100): + seeds = os.getenv("TEST_PG_BENCH_SEEDS_MATRIX", default=str(default)) + return list(map(int, seeds.split(","))) + + +@pytest.mark.parametrize("seed", get_seeds_matrix()) +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix(5)) +def test_compare_pg_stats_rw_with_pgbench_default(neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_rw: List[PgStatTable]): + env = neon_with_baseline + # initialize pgbench + env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.flush() + + with env.record_pg_stats(pg_stats_rw): + env.pg_bin.run_capture( + ['pgbench', f'-T{duration}', f'--random-seed={seed}', '-Mprepared', env.pg.connstr()]) + env.flush() + + +@pytest.mark.parametrize("seed", get_seeds_matrix()) +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix(5)) +def test_compare_pg_stats_wo_with_pgbench_simple_update(neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_wo: List[PgStatTable]): + env = neon_with_baseline + # initialize pgbench + env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.flush() + + with env.record_pg_stats(pg_stats_wo): + env.pg_bin.run_capture([ + 'pgbench', + '-N', + f'-T{duration}', + f'--random-seed={seed}', + '-Mprepared', + env.pg.connstr() + ]) + env.flush() + + +@pytest.mark.parametrize("seed", get_seeds_matrix()) +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix(5)) +def test_compare_pg_stats_ro_with_pgbench_select_only(neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_ro: List[PgStatTable]): + env = neon_with_baseline + # initialize pgbench + env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.flush() + + with env.record_pg_stats(pg_stats_ro): + env.pg_bin.run_capture([ + 'pgbench', + '-S', + f'-T{duration}', + f'--random-seed={seed}', + '-Mprepared', + env.pg.connstr() + ]) + env.flush() + + +@pytest.mark.parametrize("seed", get_seeds_matrix()) +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix(5)) +def test_compare_pg_stats_wal_with_pgbench_default(neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_wal: List[PgStatTable]): + env = neon_with_baseline + # initialize pgbench + env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.flush() + + with env.record_pg_stats(pg_stats_wal): + env.pg_bin.run_capture( + ['pgbench', f'-T{duration}', 
f'--random-seed={seed}', '-Mprepared', env.pg.connstr()]) + env.flush() diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 97aeae2b8e..6ebb6d6ecf 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -79,7 +79,7 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int): # Run simple-update workload run_pgbench(env, "simple-update", - ['pgbench', '-n', '-c4', f'-T{duration}', '-P2', '-Mprepared', env.pg.connstr()]) + ['pgbench', '-N', '-c4', f'-T{duration}', '-P2', '-Mprepared', env.pg.connstr()]) # Run SELECT workload run_pgbench(env, @@ -89,13 +89,13 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int): env.report_size() -def get_durations_matrix(): - durations = os.getenv("TEST_PG_BENCH_DURATIONS_MATRIX", default="45") +def get_durations_matrix(default: int = 45): + durations = os.getenv("TEST_PG_BENCH_DURATIONS_MATRIX", default=str(default)) return list(map(int, durations.split(","))) -def get_scales_matrix(): - scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX", default="10") +def get_scales_matrix(default: int = 10): + scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX", default=str(default)) return list(map(int, scales.split(","))) From 7dc6beacbd34b4631337d4935a91a65586c818f8 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 6 Jun 2022 23:57:06 +0300 Subject: [PATCH 0408/1022] make it possible to associate thread with a tenant after thread start --- pageserver/src/page_service.rs | 5 ++- pageserver/src/thread_mgr.rs | 76 ++++++++++++++++++++-------------- 2 files changed, 50 insertions(+), 31 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index df43b8c0df..f6a088d4b5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -370,6 +370,10 @@ impl PageServerHandler { ) -> anyhow::Result<()> { let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered(); + // NOTE: pagerequests handler exits when connection is closed, + // so there is no need to reset the association + thread_mgr::associate_with(Some(tenantid), Some(timelineid)); + // Check that the timeline exists let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) .context("Cannot load local timeline")?; @@ -802,7 +806,6 @@ impl postgres_backend::Handler for PageServerHandler { .map(|h| h.as_str().parse()) .unwrap_or_else(|| Ok(repo.get_gc_horizon()))?; - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; // Use tenant's pitr setting let pitr = repo.get_pitr_interval(); let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?; diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index 8264bdd97c..6e4bc1a787 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -108,15 +108,21 @@ pub enum ThreadKind { StorageSync, } +struct MutableThreadState { + /// Tenant and timeline that this thread is associated with. + tenant_id: Option, + timeline_id: Option, + + /// Handle for waiting for the thread to exit. It can be None, if the + /// the thread has already exited. + join_handle: Option>, +} + struct PageServerThread { _thread_id: u64, kind: ThreadKind, - /// Tenant and timeline that this thread is associated with. 
- tenant_id: Option, - timeline_id: Option, - name: String, // To request thread shutdown, set the flag, and send a dummy message to the @@ -124,9 +130,7 @@ struct PageServerThread { shutdown_requested: AtomicBool, shutdown_tx: watch::Sender<()>, - /// Handle for waiting for the thread to exit. It can be None, if the - /// the thread has already exited. - join_handle: Mutex>>, + mutable: Mutex, } /// Launch a new thread @@ -145,29 +149,27 @@ where { let (shutdown_tx, shutdown_rx) = watch::channel(()); let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); - let thread = PageServerThread { + let thread = Arc::new(PageServerThread { _thread_id: thread_id, kind, - tenant_id, - timeline_id, name: name.to_string(), - shutdown_requested: AtomicBool::new(false), shutdown_tx, - - join_handle: Mutex::new(None), - }; - - let thread_rc = Arc::new(thread); - - let mut jh_guard = thread_rc.join_handle.lock().unwrap(); + mutable: Mutex::new(MutableThreadState { + tenant_id, + timeline_id, + join_handle: None, + }), + }); THREADS .lock() .unwrap() - .insert(thread_id, Arc::clone(&thread_rc)); + .insert(thread_id, Arc::clone(&thread)); - let thread_rc2 = Arc::clone(&thread_rc); + let mut thread_mut = thread.mutable.lock().unwrap(); + + let thread_cloned = Arc::clone(&thread); let thread_name = name.to_string(); let join_handle = match thread::Builder::new() .name(name.to_string()) @@ -175,7 +177,7 @@ where thread_wrapper( thread_name, thread_id, - thread_rc2, + thread_cloned, shutdown_rx, shutdown_process_on_error, f, @@ -189,8 +191,8 @@ where return Err(err); } }; - *jh_guard = Some(join_handle); - drop(jh_guard); + thread_mut.join_handle = Some(join_handle); + drop(thread_mut); // The thread is now running. Nothing more to do here Ok(thread_id) @@ -229,19 +231,20 @@ fn thread_wrapper( .remove(&thread_id) .expect("no thread in registry"); + let thread_mut = thread.mutable.lock().unwrap(); match result { Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name), Ok(Err(err)) => { if shutdown_process_on_error { error!( "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - thread_name, thread.tenant_id, thread.timeline_id, err + thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err ); shutdown_pageserver(1); } else { error!( "Thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - thread_name, thread.tenant_id, thread.timeline_id, err + thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err ); } } @@ -249,19 +252,29 @@ fn thread_wrapper( if shutdown_process_on_error { error!( "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - thread_name, thread.tenant_id, thread.timeline_id, err + thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err ); shutdown_pageserver(1); } else { error!( "Thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - thread_name, thread.tenant_id, thread.timeline_id, err + thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err ); } } } } +// expected to be called from the thread of the given id. +pub fn associate_with(tenant_id: Option, timeline_id: Option) { + CURRENT_THREAD.with(|ct| { + let borrowed = ct.borrow(); + let mut thread_mut = borrowed.as_ref().unwrap().mutable.lock().unwrap(); + thread_mut.tenant_id = tenant_id; + thread_mut.timeline_id = timeline_id; + }); +} + /// Is there a thread running that matches the criteria /// Signal and wait for threads to shut down. 
@@ -285,9 +298,10 @@ pub fn shutdown_threads( let threads = THREADS.lock().unwrap(); for thread in threads.values() { + let thread_mut = thread.mutable.lock().unwrap(); if (kind.is_none() || Some(thread.kind) == kind) - && (tenant_id.is_none() || thread.tenant_id == tenant_id) - && (timeline_id.is_none() || thread.timeline_id == timeline_id) + && (tenant_id.is_none() || thread_mut.tenant_id == tenant_id) + && (timeline_id.is_none() || thread_mut.timeline_id == timeline_id) { thread.shutdown_requested.store(true, Ordering::Relaxed); // FIXME: handle error? @@ -298,8 +312,10 @@ pub fn shutdown_threads( drop(threads); for thread in victim_threads { + let mut thread_mut = thread.mutable.lock().unwrap(); info!("waiting for {} to shut down", thread.name); - if let Some(join_handle) = thread.join_handle.lock().unwrap().take() { + if let Some(join_handle) = thread_mut.join_handle.take() { + drop(thread_mut); let _ = join_handle.join(); } else { // The thread had not even fully started yet. Or it was shut down From 0b93253b3c270a07a6d1e536711122a2b47903b3 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 7 Jun 2022 11:58:35 +0400 Subject: [PATCH 0409/1022] Fix leaked keepalive task in s3 offloading leader election. I still don't like the surroundings and feel we'd better get away without using election API at all, but this is a quick fix to keep CI green. ref #1815 --- safekeeper/src/broker.rs | 25 +++++++++++++++---------- safekeeper/src/wal_backup.rs | 17 ++++------------- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 5be8091a7e..3d75fec587 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -90,7 +90,7 @@ impl ElectionLeader { } } -pub async fn get_leader(req: &Election) -> Result { +pub async fn get_leader(req: &Election, leader: &mut Option) -> Result<()> { let mut client = Client::connect(req.broker_endpoints.clone(), None) .await .context("Could not connect to etcd")?; @@ -102,22 +102,27 @@ pub async fn get_leader(req: &Election) -> Result { let lease_id = lease.map(|l| l.id()).unwrap(); - let keep_alive = spawn::<_>(lease_keep_alive(client.clone(), lease_id)); + // kill previous keepalive, if any + if let Some(l) = leader.take() { + l.give_up().await; + } - if let Err(e) = client + let keep_alive = spawn::<_>(lease_keep_alive(client.clone(), lease_id)); + // immediately save handle to kill task if we get canceled below + *leader = Some(ElectionLeader { + client: client.clone(), + keep_alive, + }); + + client .campaign( req.election_name.clone(), req.candidate_name.clone(), lease_id, ) - .await - { - keep_alive.abort(); - let _ = keep_alive.await; - return Err(e.into()); - } + .await?; - Ok(ElectionLeader { client, keep_alive }) + Ok(()) } async fn lease_keep_alive(mut client: Client, lease_id: i64) -> Result<()> { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 30364ce434..1f2e9c303a 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -200,20 +200,11 @@ impl WalBackupTask { loop { let mut retry_attempt = 0u32; - if let Some(l) = self.leader.take() { - l.give_up().await; - } - info!("acquiring leadership"); - match broker::get_leader(&self.election).await { - Ok(l) => { - self.leader = Some(l); - } - Err(e) => { - error!("error during leader election {:?}", e); - sleep(Duration::from_millis(BROKER_CONNECTION_RETRY_DELAY_MS)).await; - continue; - } + if let Err(e) = broker::get_leader(&self.election, &mut self.leader).await { + 
error!("error during leader election {:?}", e); + sleep(Duration::from_millis(BROKER_CONNECTION_RETRY_DELAY_MS)).await; + continue; } info!("acquired leadership"); From 6e26588d17d212e703915507a4913a592b3082d5 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 7 Jun 2022 17:10:39 +0300 Subject: [PATCH 0410/1022] Allow to customize shutdown condition in PostgresBackend Use it in PageServerHandler to check per thread shutdown condition from thread_mgr which takes into account tenants and timelines --- libs/utils/src/postgres_backend.rs | 14 +++++--------- pageserver/src/lib.rs | 2 -- pageserver/src/page_service.rs | 4 ++++ 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index ff71423122..79dca96fcf 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -13,13 +13,10 @@ use std::fmt; use std::io::{self, Write}; use std::net::{Shutdown, SocketAddr, TcpStream}; use std::str::FromStr; -use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::Duration; use tracing::*; -static PGBACKEND_SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false); - pub trait Handler { /// Handle single query. /// postgres_backend will issue ReadyForQuery after calling this (this @@ -45,6 +42,10 @@ pub trait Handler { fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> { bail!("JWT auth failed") } + + fn is_shutdown_requested(&self) -> bool { + false + } } /// PostgresBackend protocol state. @@ -274,7 +275,7 @@ impl PostgresBackend { let mut unnamed_query_string = Bytes::new(); - while !PGBACKEND_SHUTDOWN_REQUESTED.load(Ordering::Relaxed) { + while !handler.is_shutdown_requested() { match self.read_message() { Ok(message) => { if let Some(msg) = message { @@ -493,8 +494,3 @@ impl PostgresBackend { Ok(ProcessMsgResult::Continue) } } - -// Set the flag to inform connections to cancel -pub fn set_pgbackend_shutdown_requested() { - PGBACKEND_SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed); -} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index fdce0e5c5f..a68c277114 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -24,7 +24,6 @@ pub mod walredo; use lazy_static::lazy_static; use tracing::info; -use utils::postgres_backend; use crate::thread_mgr::ThreadKind; use metrics::{register_int_gauge_vec, IntGaugeVec}; @@ -73,7 +72,6 @@ pub fn shutdown_pageserver(exit_code: i32) { thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None); // Shut down any page service threads. - postgres_backend::set_pgbackend_shutdown_requested(); thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None); // Shut down all the tenants. 
This flushes everything to disk and kills diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f6a088d4b5..30f0d241d6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -676,6 +676,10 @@ impl postgres_backend::Handler for PageServerHandler { Ok(()) } + fn is_shutdown_requested(&self) -> bool { + thread_mgr::is_shutdown_requested() + } + fn process_query( &mut self, pgb: &mut PostgresBackend, From 8a53472e4f22cb3462694b5e3919c8d482fe5f58 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 2 May 2022 23:28:54 +0300 Subject: [PATCH 0411/1022] Force etcd broker keys to not to intersect --- libs/etcd_broker/src/lib.rs | 405 ++++++--------------- libs/etcd_broker/src/subscription_key.rs | 310 ++++++++++++++++ libs/etcd_broker/src/subscription_value.rs | 35 ++ pageserver/src/walreceiver.rs | 11 +- safekeeper/src/broker.rs | 33 +- safekeeper/src/http/routes.rs | 2 +- safekeeper/src/safekeeper.rs | 2 +- safekeeper/src/timeline.rs | 2 +- safekeeper/src/wal_backup.rs | 19 +- 9 files changed, 499 insertions(+), 320 deletions(-) create mode 100644 libs/etcd_broker/src/subscription_key.rs create mode 100644 libs/etcd_broker/src/subscription_value.rs diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 6b3293ec40..38d4a403c2 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -1,91 +1,43 @@ //! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent). //! Intended to connect services to each other, not to store their data. + +/// All broker keys, that are used when dealing with etcd. +pub mod subscription_key; +/// All broker values, possible to use when dealing with etcd. +pub mod subscription_value; + use std::{ collections::{hash_map, HashMap}, - fmt::Display, str::FromStr, }; -use once_cell::sync::Lazy; -use regex::{Captures, Regex}; -use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr}; - -pub use etcd_client::*; +use serde::de::DeserializeOwned; +use subscription_key::SubscriptionKey; use tokio::{sync::mpsc, task::JoinHandle}; use tracing::*; -use utils::{ - lsn::Lsn, - zid::{NodeId, ZTenantId, ZTenantTimelineId}, -}; +use utils::zid::{NodeId, ZTenantTimelineId}; + +use crate::subscription_key::SubscriptionFullKey; + +pub use etcd_client::*; /// Default value to use for prefixing to all etcd keys with. /// This way allows isolating safekeeper/pageserver groups in the same etcd cluster. pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon"; -#[derive(Debug, Deserialize, Serialize)] -struct SafekeeperTimeline { - safekeeper_id: NodeId, - info: SkTimelineInfo, -} - -/// Published data about safekeeper's timeline. Fields made optional for easy migrations. -#[serde_as] -#[derive(Debug, Clone, Deserialize, Serialize)] -pub struct SkTimelineInfo { - /// Term of the last entry. - pub last_log_term: Option, - /// LSN of the last record. - #[serde_as(as = "Option")] - #[serde(default)] - pub flush_lsn: Option, - /// Up to which LSN safekeeper regards its WAL as committed. - #[serde_as(as = "Option")] - #[serde(default)] - pub commit_lsn: Option, - /// LSN up to which safekeeper has backed WAL. - #[serde_as(as = "Option")] - #[serde(default)] - pub backup_lsn: Option, - /// LSN of last checkpoint uploaded by pageserver. 
- #[serde_as(as = "Option")] - #[serde(default)] - pub remote_consistent_lsn: Option, - #[serde_as(as = "Option")] - #[serde(default)] - pub peer_horizon_lsn: Option, - #[serde(default)] - pub safekeeper_connstr: Option, -} - -#[derive(Debug, thiserror::Error)] -pub enum BrokerError { - #[error("Etcd client error: {0}. Context: {1}")] - EtcdClient(etcd_client::Error, String), - #[error("Error during parsing etcd key: {0}")] - InvalidKey(String), - #[error("Error during parsing etcd value: {0}")] - ParsingError(String), - #[error("Internal error: {0}")] - InternalError(String), -} - /// A way to control the data retrieval from a certain subscription. -pub struct SkTimelineSubscription { - safekeeper_timeline_updates: - mpsc::UnboundedReceiver>>, - kind: SkTimelineSubscriptionKind, +pub struct BrokerSubscription { + value_updates: mpsc::UnboundedReceiver>>, + key: SubscriptionKey, watcher_handle: JoinHandle>, watcher: Watcher, } -impl SkTimelineSubscription { +impl BrokerSubscription { /// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet. - pub async fn fetch_data( - &mut self, - ) -> Option>> { - self.safekeeper_timeline_updates.recv().await + pub async fn fetch_data(&mut self) -> Option>> { + self.value_updates.recv().await } /// Cancels the subscription, stopping the data poller and waiting for it to shut down. @@ -93,117 +45,90 @@ impl SkTimelineSubscription { self.watcher.cancel().await.map_err(|e| { BrokerError::EtcdClient( e, - format!( - "Failed to cancel timeline subscription, kind: {:?}", - self.kind - ), + format!("Failed to cancel broker subscription, kind: {:?}", self.key), ) })?; self.watcher_handle.await.map_err(|e| { BrokerError::InternalError(format!( - "Failed to join the timeline updates task, kind: {:?}, error: {e}", - self.kind + "Failed to join the broker value updates task, kind: {:?}, error: {e}", + self.key )) })? } } -/// The subscription kind to the timeline updates from safekeeper. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct SkTimelineSubscriptionKind { - broker_etcd_prefix: String, - kind: SubscriptionKind, -} - -impl SkTimelineSubscriptionKind { - pub fn all(broker_etcd_prefix: String) -> Self { - Self { - broker_etcd_prefix, - kind: SubscriptionKind::All, - } - } - - pub fn tenant(broker_etcd_prefix: String, tenant: ZTenantId) -> Self { - Self { - broker_etcd_prefix, - kind: SubscriptionKind::Tenant(tenant), - } - } - - pub fn timeline(broker_etcd_prefix: String, timeline: ZTenantTimelineId) -> Self { - Self { - broker_etcd_prefix, - kind: SubscriptionKind::Timeline(timeline), - } - } - - /// Etcd key to use for watching a certain timeline updates from safekeepers. - pub fn watch_key(&self) -> String { - match self.kind { - SubscriptionKind::All => self.broker_etcd_prefix.to_string(), - SubscriptionKind::Tenant(tenant_id) => { - format!("{}/{tenant_id}/safekeeper", self.broker_etcd_prefix) - } - SubscriptionKind::Timeline(ZTenantTimelineId { - tenant_id, - timeline_id, - }) => format!( - "{}/{tenant_id}/{timeline_id}/safekeeper", - self.broker_etcd_prefix - ), - } - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -enum SubscriptionKind { - /// Get every timeline update. - All, - /// Get certain tenant timelines' updates. - Tenant(ZTenantId), - /// Get certain timeline updates. - Timeline(ZTenantTimelineId), +#[derive(Debug, thiserror::Error)] +pub enum BrokerError { + #[error("Etcd client error: {0}. 
Context: {1}")] + EtcdClient(etcd_client::Error, String), + #[error("Error during parsing etcd key: {0}")] + KeyNotParsed(String), + #[error("Internal error: {0}")] + InternalError(String), } /// Creates a background task to poll etcd for timeline updates from safekeepers. /// Stops and returns `Err` on any error during etcd communication. /// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle, /// exiting normally in such cases. -pub async fn subscribe_to_safekeeper_timeline_updates( +/// Etcd values are parsed as json fukes into a type, specified in the generic patameter. +pub async fn subscribe_for_json_values( client: &mut Client, - subscription: SkTimelineSubscriptionKind, -) -> Result { - info!("Subscribing to timeline updates, subscription kind: {subscription:?}"); - let kind = subscription.clone(); + key: SubscriptionKey, +) -> Result, BrokerError> +where + V: DeserializeOwned + Send + 'static, +{ + subscribe_for_values(client, key, |_, value_str| { + match serde_json::from_str::(value_str) { + Ok(value) => Some(value), + Err(e) => { + error!("Failed to parse value str '{value_str}': {e}"); + None + } + } + }) + .await +} + +/// Same as [`subscribe_for_json_values`], but allows to specify a custom parser of a etcd value string. +pub async fn subscribe_for_values( + client: &mut Client, + key: SubscriptionKey, + value_parser: P, +) -> Result, BrokerError> +where + V: Send + 'static, + P: Fn(SubscriptionFullKey, &str) -> Option + Send + 'static, +{ + info!("Subscribing to broker value updates, key: {key:?}"); + let subscription_key = key.clone(); let (watcher, mut stream) = client - .watch( - subscription.watch_key(), - Some(WatchOptions::new().with_prefix()), - ) + .watch(key.watch_key(), Some(WatchOptions::new().with_prefix())) .await .map_err(|e| { BrokerError::EtcdClient( e, - format!("Failed to init the watch for subscription {subscription:?}"), + format!("Failed to init the watch for subscription {key:?}"), ) })?; - let (timeline_updates_sender, safekeeper_timeline_updates) = mpsc::unbounded_channel(); + let (value_updates_sender, value_updates_receiver) = mpsc::unbounded_channel(); let watcher_handle = tokio::spawn(async move { while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!( - "Failed to get messages from the subscription stream, kind: {:?}, error: {e}", subscription.kind + "Failed to get messages from the subscription stream, kind: {:?}, error: {e}", key.kind )))? { if resp.canceled() { info!("Watch for timeline updates subscription was canceled, exiting"); break; } - let mut timeline_updates: HashMap> = HashMap::new(); + let mut value_updates: HashMap> = HashMap::new(); // Keep track that the timeline data updates from etcd arrive in the right order. // https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas // > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering. 
- let mut timeline_etcd_versions: HashMap = HashMap::new(); + let mut value_etcd_versions: HashMap = HashMap::new(); let events = resp.events(); @@ -213,182 +138,78 @@ pub async fn subscribe_to_safekeeper_timeline_updates( if EventType::Put == event.event_type() { if let Some(new_etcd_kv) = event.kv() { let new_kv_version = new_etcd_kv.version(); - let (key_str, value_str) = match extract_key_value_str(new_etcd_kv) { - Ok(strs) => strs, - Err(e) => { - error!("Failed to represent etcd KV {new_etcd_kv:?} as pair of str: {e}"); - continue; - }, - }; - match parse_safekeeper_timeline(&subscription, key_str, value_str) { - Ok((zttid, timeline)) => { - match timeline_updates - .entry(zttid) - .or_default() - .entry(timeline.safekeeper_id) - { - hash_map::Entry::Occupied(mut o) => { - let old_etcd_kv_version = timeline_etcd_versions.get(&zttid).copied().unwrap_or(i64::MIN); - if old_etcd_kv_version < new_kv_version { - o.insert(timeline.info); - timeline_etcd_versions.insert(zttid,new_kv_version); - } else { - debug!("Skipping etcd timeline update due to older version compared to one that's already stored"); + match parse_etcd_kv(new_etcd_kv, &value_parser, &key.cluster_prefix) { + Ok(Some((key, value))) => match value_updates + .entry(key.id) + .or_default() + .entry(key.node_id) + { + hash_map::Entry::Occupied(mut o) => { + let old_etcd_kv_version = value_etcd_versions.get(&key.id).copied().unwrap_or(i64::MIN); + if old_etcd_kv_version < new_kv_version { + o.insert(value); + value_etcd_versions.insert(key.id,new_kv_version); + } else { + debug!("Skipping etcd timeline update due to older version compared to one that's already stored"); + } } - } - hash_map::Entry::Vacant(v) => { - v.insert(timeline.info); - timeline_etcd_versions.insert(zttid,new_kv_version); - } - } - } - // it is normal to get other keys when we subscribe to everything - Err(BrokerError::InvalidKey(e)) => debug!("Unexpected key for timeline update: {e}"), - Err(e) => error!("Failed to parse timeline update: {e}"), + hash_map::Entry::Vacant(v) => { + v.insert(value); + value_etcd_versions.insert(key.id,new_kv_version); + } + }, + Ok(None) => debug!("Ignoring key {key:?} : no value was returned by the parser"), + Err(BrokerError::KeyNotParsed(e)) => debug!("Unexpected key {key:?} for timeline update: {e}"), + Err(e) => error!("Failed to represent etcd KV {new_etcd_kv:?}: {e}"), }; } } } - if let Err(e) = timeline_updates_sender.send(timeline_updates) { - info!("Timeline updates sender got dropped, exiting: {e}"); - break; + if !value_updates.is_empty() { + if let Err(e) = value_updates_sender.send(value_updates) { + info!("Broker value updates for key {key:?} sender got dropped, exiting: {e}"); + break; + } } } Ok(()) }.instrument(info_span!("etcd_broker"))); - Ok(SkTimelineSubscription { - kind, - safekeeper_timeline_updates, + Ok(BrokerSubscription { + key: subscription_key, + value_updates: value_updates_receiver, watcher_handle, watcher, }) } -fn extract_key_value_str(kv: &KeyValue) -> Result<(&str, &str), BrokerError> { - let key = kv.key_str().map_err(|e| { +fn parse_etcd_kv( + kv: &KeyValue, + value_parser: &P, + cluster_prefix: &str, +) -> Result, BrokerError> +where + P: Fn(SubscriptionFullKey, &str) -> Option, +{ + let key_str = kv.key_str().map_err(|e| { BrokerError::EtcdClient(e, "Failed to extract key str out of etcd KV".to_string()) })?; - let value = kv.value_str().map_err(|e| { + let value_str = kv.value_str().map_err(|e| { BrokerError::EtcdClient(e, "Failed to extract value str out of etcd KV".to_string()) 
})?; - Ok((key, value)) -} -static SK_TIMELINE_KEY_REGEX: Lazy = Lazy::new(|| { - Regex::new("/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]]+)$") - .expect("wrong regex for safekeeper timeline etcd key") -}); - -fn parse_safekeeper_timeline( - subscription: &SkTimelineSubscriptionKind, - key_str: &str, - value_str: &str, -) -> Result<(ZTenantTimelineId, SafekeeperTimeline), BrokerError> { - let broker_prefix = subscription.broker_etcd_prefix.as_str(); - if !key_str.starts_with(broker_prefix) { - return Err(BrokerError::InvalidKey(format!( - "KV has unexpected key '{key_str}' that does not start with broker prefix {broker_prefix}" + if !key_str.starts_with(cluster_prefix) { + return Err(BrokerError::KeyNotParsed(format!( + "KV has unexpected key '{key_str}' that does not start with cluster prefix {cluster_prefix}" ))); } - let key_part = &key_str[broker_prefix.len()..]; - let key_captures = match SK_TIMELINE_KEY_REGEX.captures(key_part) { - Some(captures) => captures, - None => { - return Err(BrokerError::InvalidKey(format!( - "KV has unexpected key part '{key_part}' that does not match required regex {}", - SK_TIMELINE_KEY_REGEX.as_str() - ))); - } - }; - let info = serde_json::from_str(value_str).map_err(|e| { - BrokerError::ParsingError(format!( - "Failed to parse '{value_str}' as safekeeper timeline info: {e}" - )) + let key = SubscriptionFullKey::from_str(&key_str[cluster_prefix.len()..]).map_err(|e| { + BrokerError::KeyNotParsed(format!("Failed to parse KV key '{key_str}': {e}")) })?; - let zttid = ZTenantTimelineId::new( - parse_capture(&key_captures, 1).map_err(BrokerError::ParsingError)?, - parse_capture(&key_captures, 2).map_err(BrokerError::ParsingError)?, - ); - let safekeeper_id = NodeId(parse_capture(&key_captures, 3).map_err(BrokerError::ParsingError)?); - - Ok(( - zttid, - SafekeeperTimeline { - safekeeper_id, - info, - }, - )) -} - -fn parse_capture(caps: &Captures, index: usize) -> Result -where - T: FromStr, - ::Err: Display, -{ - let capture_match = caps - .get(index) - .ok_or_else(|| format!("Failed to get capture match at index {index}"))? 
- .as_str(); - capture_match.parse().map_err(|e| { - format!( - "Failed to parse {} from {capture_match}: {e}", - std::any::type_name::() - ) - }) -} - -#[cfg(test)] -mod tests { - use utils::zid::ZTimelineId; - - use super::*; - - #[test] - fn typical_etcd_prefix_should_be_parsed() { - let prefix = "neon"; - let tenant_id = ZTenantId::generate(); - let timeline_id = ZTimelineId::generate(); - let all_subscription = SkTimelineSubscriptionKind { - broker_etcd_prefix: prefix.to_string(), - kind: SubscriptionKind::All, - }; - let tenant_subscription = SkTimelineSubscriptionKind { - broker_etcd_prefix: prefix.to_string(), - kind: SubscriptionKind::Tenant(tenant_id), - }; - let timeline_subscription = SkTimelineSubscriptionKind { - broker_etcd_prefix: prefix.to_string(), - kind: SubscriptionKind::Timeline(ZTenantTimelineId::new(tenant_id, timeline_id)), - }; - - let typical_etcd_kv_strs = [ - ( - format!("{prefix}/{tenant_id}/{timeline_id}/safekeeper/1"), - r#"{"last_log_term":231,"flush_lsn":"0/241BB70","commit_lsn":"0/241BB70","backup_lsn":"0/2000000","remote_consistent_lsn":"0/0","peer_horizon_lsn":"0/16960E8","safekeeper_connstr":"something.local:1234","pageserver_connstr":"postgresql://(null):@somethine.else.local:3456"}"#, - ), - ( - format!("{prefix}/{tenant_id}/{timeline_id}/safekeeper/13"), - r#"{"last_log_term":231,"flush_lsn":"0/241BB70","commit_lsn":"0/241BB70","backup_lsn":"0/2000000","remote_consistent_lsn":"0/0","peer_horizon_lsn":"0/16960E8","safekeeper_connstr":"something.local:1234","pageserver_connstr":"postgresql://(null):@somethine.else.local:3456"}"#, - ), - ]; - - for (key_string, value_str) in typical_etcd_kv_strs { - for subscription in [ - &all_subscription, - &tenant_subscription, - &timeline_subscription, - ] { - let (id, _timeline) = - parse_safekeeper_timeline(subscription, &key_string, value_str) - .unwrap_or_else(|e| panic!("Should be able to parse etcd key string '{key_string}' and etcd value string '{value_str}' for subscription {subscription:?}, but got: {e}")); - assert_eq!(id, ZTenantTimelineId::new(tenant_id, timeline_id)); - } - } - } + Ok(value_parser(key, value_str).map(|value| (key, value))) } diff --git a/libs/etcd_broker/src/subscription_key.rs b/libs/etcd_broker/src/subscription_key.rs new file mode 100644 index 0000000000..8f8579f4e5 --- /dev/null +++ b/libs/etcd_broker/src/subscription_key.rs @@ -0,0 +1,310 @@ +//! Etcd broker keys, used in the project and shared between instances. +//! The keys are split into two categories: +//! +//! * [`SubscriptionFullKey`] full key format: `/////` +//! Always returned from etcd in this form, always start with the user key provided. +//! +//! * [`SubscriptionKey`] user input key format: always partial, since it's unknown which `node_id`'s are available. +//! Full key always starts with the user input one, due to etcd subscription properties. + +use std::{fmt::Display, str::FromStr}; + +use once_cell::sync::Lazy; +use regex::{Captures, Regex}; +use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId}; + +/// The subscription kind to the timeline updates from safekeeper. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SubscriptionKey { + /// Generic cluster prefix, allowing to use the same etcd instance by multiple logic groups. + pub cluster_prefix: String, + /// The subscription kind. + pub kind: SubscriptionKind, +} + +/// All currently possible key kinds of a etcd broker subscription. 
+/// Etcd works so, that every key that starts with the subbscription key given is considered matching and +/// returned as part of the subscrption. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum SubscriptionKind { + /// Get every update in etcd. + All, + /// Get etcd updates for any timeiline of a certain tenant, affected by any operation from any node kind. + TenantTimelines(ZTenantId), + /// Get etcd updates for a certain timeline of a tenant, affected by any operation from any node kind. + Timeline(ZTenantTimelineId), + /// Get etcd timeline updates, specific to a certain node kind. + Node(ZTenantTimelineId, NodeKind), + /// Get etcd timeline updates for a certain operation on specific nodes. + Operation(ZTenantTimelineId, NodeKind, OperationKind), +} + +/// All kinds of nodes, able to write into etcd. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum NodeKind { + Safekeeper, + Pageserver, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum OperationKind { + Safekeeper(SkOperationKind), +} + +/// Current operations, running inside the safekeeper node. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum SkOperationKind { + TimelineInfo, + WalBackup, +} + +static SUBSCRIPTION_FULL_KEY_REGEX: Lazy = Lazy::new(|| { + Regex::new("/([[:xdigit:]]+)/([[:xdigit:]]+)/([^/]+)/([^/]+)/([[:digit:]]+)$") + .expect("wrong subscription full etcd key regex") +}); + +/// Full key, received from etcd during any of the component's work. +/// No other etcd keys are considered during system's work. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct SubscriptionFullKey { + pub id: ZTenantTimelineId, + pub node_kind: NodeKind, + pub operation: OperationKind, + pub node_id: NodeId, +} + +impl SubscriptionKey { + /// Subscribes for all etcd updates. + pub fn all(cluster_prefix: String) -> Self { + SubscriptionKey { + cluster_prefix, + kind: SubscriptionKind::All, + } + } + + /// Subscribes to a given timeline info updates from safekeepers. + pub fn sk_timeline_info(cluster_prefix: String, timeline: ZTenantTimelineId) -> Self { + Self { + cluster_prefix, + kind: SubscriptionKind::Operation( + timeline, + NodeKind::Safekeeper, + OperationKind::Safekeeper(SkOperationKind::TimelineInfo), + ), + } + } + + /// Subscribes to all timeine updates during specific operations, running on the corresponding nodes. + pub fn operation( + cluster_prefix: String, + timeline: ZTenantTimelineId, + node_kind: NodeKind, + operation: OperationKind, + ) -> Self { + Self { + cluster_prefix, + kind: SubscriptionKind::Operation(timeline, node_kind, operation), + } + } + + /// Etcd key to use for watching a certain timeline updates from safekeepers. 
+ pub fn watch_key(&self) -> String { + let cluster_prefix = &self.cluster_prefix; + match self.kind { + SubscriptionKind::All => cluster_prefix.to_string(), + SubscriptionKind::TenantTimelines(tenant_id) => { + format!("{cluster_prefix}/{tenant_id}") + } + SubscriptionKind::Timeline(id) => { + format!("{cluster_prefix}/{id}") + } + SubscriptionKind::Node(id, node_kind) => { + format!("{cluster_prefix}/{id}/{node_kind}") + } + SubscriptionKind::Operation(id, node_kind, operation_kind) => { + format!("{cluster_prefix}/{id}/{node_kind}/{operation_kind}") + } + } + } +} + +impl Display for OperationKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + OperationKind::Safekeeper(o) => o.fmt(f), + } + } +} + +impl FromStr for OperationKind { + type Err = String; + + fn from_str(operation_kind_str: &str) -> Result { + match operation_kind_str { + "timeline_info" => Ok(OperationKind::Safekeeper(SkOperationKind::TimelineInfo)), + "wal_backup" => Ok(OperationKind::Safekeeper(SkOperationKind::WalBackup)), + _ => Err(format!("Unknown operation kind: {operation_kind_str}")), + } + } +} + +impl Display for SubscriptionFullKey { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { + id, + node_kind, + operation, + node_id, + } = self; + write!(f, "{id}/{node_kind}/{operation}/{node_id}") + } +} + +impl FromStr for SubscriptionFullKey { + type Err = String; + + fn from_str(subscription_kind_str: &str) -> Result { + let key_captures = match SUBSCRIPTION_FULL_KEY_REGEX.captures(subscription_kind_str) { + Some(captures) => captures, + None => { + return Err(format!( + "Subscription kind str does not match a subscription full key regex {}", + SUBSCRIPTION_FULL_KEY_REGEX.as_str() + )); + } + }; + + Ok(Self { + id: ZTenantTimelineId::new( + parse_capture(&key_captures, 1)?, + parse_capture(&key_captures, 2)?, + ), + node_kind: parse_capture(&key_captures, 3)?, + operation: parse_capture(&key_captures, 4)?, + node_id: NodeId(parse_capture(&key_captures, 5)?), + }) + } +} + +fn parse_capture(caps: &Captures, index: usize) -> Result +where + T: FromStr, + ::Err: Display, +{ + let capture_match = caps + .get(index) + .ok_or_else(|| format!("Failed to get capture match at index {index}"))? 
+ .as_str(); + capture_match.parse().map_err(|e| { + format!( + "Failed to parse {} from {capture_match}: {e}", + std::any::type_name::() + ) + }) +} + +impl Display for NodeKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Safekeeper => write!(f, "safekeeper"), + Self::Pageserver => write!(f, "pageserver"), + } + } +} + +impl FromStr for NodeKind { + type Err = String; + + fn from_str(node_kind_str: &str) -> Result { + match node_kind_str { + "safekeeper" => Ok(Self::Safekeeper), + "pageserver" => Ok(Self::Pageserver), + _ => Err(format!("Invalid node kind: {node_kind_str}")), + } + } +} + +impl Display for SkOperationKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::TimelineInfo => write!(f, "timeline_info"), + Self::WalBackup => write!(f, "wal_backup"), + } + } +} + +impl FromStr for SkOperationKind { + type Err = String; + + fn from_str(operation_str: &str) -> Result { + match operation_str { + "timeline_info" => Ok(Self::TimelineInfo), + "wal_backup" => Ok(Self::WalBackup), + _ => Err(format!("Invalid operation: {operation_str}")), + } + } +} + +#[cfg(test)] +mod tests { + use utils::zid::ZTimelineId; + + use super::*; + + #[test] + fn full_cluster_key_parsing() { + let prefix = "neon"; + let node_kind = NodeKind::Safekeeper; + let operation_kind = OperationKind::Safekeeper(SkOperationKind::WalBackup); + let tenant_id = ZTenantId::generate(); + let timeline_id = ZTimelineId::generate(); + let id = ZTenantTimelineId::new(tenant_id, timeline_id); + let node_id = NodeId(1); + + let timeline_subscription_keys = [ + SubscriptionKey { + cluster_prefix: prefix.to_string(), + kind: SubscriptionKind::All, + }, + SubscriptionKey { + cluster_prefix: prefix.to_string(), + kind: SubscriptionKind::TenantTimelines(tenant_id), + }, + SubscriptionKey { + cluster_prefix: prefix.to_string(), + kind: SubscriptionKind::Timeline(id), + }, + SubscriptionKey { + cluster_prefix: prefix.to_string(), + kind: SubscriptionKind::Node(id, node_kind), + }, + SubscriptionKey { + cluster_prefix: prefix.to_string(), + kind: SubscriptionKind::Operation(id, node_kind, operation_kind), + }, + ]; + + let full_key_string = format!( + "{}/{node_id}", + timeline_subscription_keys.last().unwrap().watch_key() + ); + + for key in timeline_subscription_keys { + assert!(full_key_string.starts_with(&key.watch_key()), "Full key '{full_key_string}' should start with any of the keys, keys, but {key:?} did not match"); + } + + let full_key = SubscriptionFullKey::from_str(&full_key_string).unwrap_or_else(|e| { + panic!("Failed to parse {full_key_string} as a subscription full key: {e}") + }); + + assert_eq!( + full_key, + SubscriptionFullKey { + id, + node_kind, + operation: operation_kind, + node_id + } + ) + } +} diff --git a/libs/etcd_broker/src/subscription_value.rs b/libs/etcd_broker/src/subscription_value.rs new file mode 100644 index 0000000000..d3e2011761 --- /dev/null +++ b/libs/etcd_broker/src/subscription_value.rs @@ -0,0 +1,35 @@ +//! Module for the values to put into etcd. + +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; +use utils::lsn::Lsn; + +/// Data about safekeeper's timeline. Fields made optional for easy migrations. +#[serde_as] +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct SkTimelineInfo { + /// Term of the last entry. + pub last_log_term: Option, + /// LSN of the last record. 
+ #[serde_as(as = "Option")] + #[serde(default)] + pub flush_lsn: Option, + /// Up to which LSN safekeeper regards its WAL as committed. + #[serde_as(as = "Option")] + #[serde(default)] + pub commit_lsn: Option, + /// LSN up to which safekeeper has backed WAL. + #[serde_as(as = "Option")] + #[serde(default)] + pub backup_lsn: Option, + /// LSN of last checkpoint uploaded by pageserver. + #[serde_as(as = "Option")] + #[serde(default)] + pub remote_consistent_lsn: Option, + #[serde_as(as = "Option")] + #[serde(default)] + pub peer_horizon_lsn: Option, + /// A connection string to use for WAL receiving. + #[serde(default)] + pub safekeeper_connstr: Option, +} diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 202a13545d..32bd88cf7c 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -50,7 +50,10 @@ use crate::thread_mgr::ThreadKind; use crate::{thread_mgr, DatadirTimelineImpl}; use anyhow::{ensure, Context}; use chrono::{NaiveDateTime, Utc}; -use etcd_broker::{Client, SkTimelineInfo, SkTimelineSubscription, SkTimelineSubscriptionKind}; +use etcd_broker::{ + subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription, + Client, +}; use itertools::Itertools; use once_cell::sync::Lazy; use std::cell::Cell; @@ -403,7 +406,7 @@ async fn timeline_wal_broker_loop_step( // Endlessly try to subscribe for broker updates for a given timeline. // If there are no safekeepers to maintain the lease, the timeline subscription will be inavailable in the broker and the operation will fail constantly. // This is ok, pageservers should anyway try subscribing (with some backoff) since it's the only way they can get the timeline WAL anyway. - let mut broker_subscription: SkTimelineSubscription; + let mut broker_subscription: BrokerSubscription; let mut attempt = 0; loop { select! 
{ @@ -420,9 +423,9 @@ async fn timeline_wal_broker_loop_step( info!("Broker subscription loop cancelled, shutting down"); return Ok(ControlFlow::Break(())); }, - new_subscription = etcd_broker::subscribe_to_safekeeper_timeline_updates( + new_subscription = etcd_broker::subscribe_for_json_values( etcd_client, - SkTimelineSubscriptionKind::timeline(broker_prefix.to_owned(), id), + SubscriptionKey::sk_timeline_info(broker_prefix.to_owned(), id), ) .instrument(info_span!("etcd_subscription")) => match new_subscription { Ok(new_subscription) => { diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 3d75fec587..169b106aa9 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -4,9 +4,7 @@ use anyhow::anyhow; use anyhow::Context; use anyhow::Error; use anyhow::Result; -use etcd_broker::Client; -use etcd_broker::PutOptions; -use etcd_broker::SkTimelineSubscriptionKind; +use etcd_broker::subscription_value::SkTimelineInfo; use std::time::Duration; use tokio::spawn; use tokio::task::JoinHandle; @@ -15,6 +13,10 @@ use tracing::*; use url::Url; use crate::{timeline::GlobalTimelines, SafeKeeperConf}; +use etcd_broker::{ + subscription_key::{OperationKind, SkOperationKind, SubscriptionKey}, + Client, PutOptions, +}; use utils::zid::{NodeId, ZTenantTimelineId}; const RETRY_INTERVAL_MSEC: u64 = 1000; @@ -43,7 +45,7 @@ fn timeline_safekeeper_path( ) -> String { format!( "{}/{sk_id}", - SkTimelineSubscriptionKind::timeline(broker_etcd_prefix, zttid).watch_key() + SubscriptionKey::sk_timeline_info(broker_etcd_prefix, zttid).watch_key() ) } @@ -148,14 +150,6 @@ async fn lease_keep_alive(mut client: Client, lease_id: i64) -> Result<()> { } } -pub fn get_campaign_name( - election_name: &str, - broker_prefix: &str, - id: ZTenantTimelineId, -) -> String { - format!("{broker_prefix}/{id}/{election_name}") -} - pub fn get_candiate_name(system_id: NodeId) -> String { format!("id_{system_id}") } @@ -209,9 +203,20 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { let mut client = Client::connect(&conf.broker_endpoints, None).await?; - let mut subscription = etcd_broker::subscribe_to_safekeeper_timeline_updates( + let mut subscription = etcd_broker::subscribe_for_values( &mut client, - SkTimelineSubscriptionKind::all(conf.broker_etcd_prefix.clone()), + SubscriptionKey::all(conf.broker_etcd_prefix.clone()), + |full_key, value_str| { + if full_key.operation == OperationKind::Safekeeper(SkOperationKind::TimelineInfo) { + match serde_json::from_str::(value_str) { + Ok(new_info) => return Some(new_info), + Err(e) => { + error!("Failed to parse timeline info from value str '{value_str}': {e}") + } + } + } + None + }, ) .await .context("failed to subscribe for safekeeper info")?; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index b0197a9a2a..73b9024c7d 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,4 +1,3 @@ -use etcd_broker::SkTimelineInfo; use hyper::{Body, Request, Response, StatusCode}; use serde::Serialize; @@ -11,6 +10,7 @@ use crate::safekeeper::Term; use crate::safekeeper::TermHistory; use crate::timeline::{GlobalTimelines, TimelineDeleteForceResult}; use crate::SafeKeeperConf; +use etcd_broker::subscription_value::SkTimelineInfo; use utils::{ http::{ endpoint, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 1c00af7043..eb6316dec2 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ 
-4,7 +4,7 @@ use anyhow::{bail, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use etcd_broker::SkTimelineInfo; +use etcd_broker::subscription_value::SkTimelineInfo; use postgres_ffi::xlog_utils::TimeLineID; use postgres_ffi::xlog_utils::XLogSegNo; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 30c94f2543..a69dadb7bb 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -3,7 +3,7 @@ use anyhow::{bail, Context, Result}; -use etcd_broker::SkTimelineInfo; +use etcd_broker::subscription_value::SkTimelineInfo; use lazy_static::lazy_static; use postgres_ffi::xlog_utils::XLogSegNo; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 1f2e9c303a..08e19f3f2f 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -1,4 +1,7 @@ use anyhow::{Context, Result}; +use etcd_broker::subscription_key::{ + NodeKind, OperationKind, SkOperationKind, SubscriptionKey, SubscriptionKind, +}; use tokio::task::JoinHandle; use std::cmp::min; @@ -26,8 +29,6 @@ use crate::{broker, SafeKeeperConf}; use once_cell::sync::OnceCell; -const BACKUP_ELECTION_NAME: &str = "WAL_BACKUP"; - const BROKER_CONNECTION_RETRY_DELAY_MS: u64 = 1000; const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; @@ -98,11 +99,15 @@ async fn wal_backup_launcher_main_loop( info!("starting WAL backup task for {}", zttid); // TODO: decide who should offload in launcher itself by simply checking current state - let election_name = broker::get_campaign_name( - BACKUP_ELECTION_NAME, - &conf.broker_etcd_prefix, - zttid, - ); + let election_name = SubscriptionKey { + cluster_prefix: conf.broker_etcd_prefix.clone(), + kind: SubscriptionKind::Operation( + zttid, + NodeKind::Safekeeper, + OperationKind::Safekeeper(SkOperationKind::WalBackup), + ), + } + .watch_key(); let my_candidate_name = broker::get_candiate_name(conf.my_id); let election = broker::Election::new( election_name, From 32e64afd544e632fa9afa0b4d396488de4d544f5 Mon Sep 17 00:00:00 2001 From: chaitanya sharma <86035+phoenix24@users.noreply.github.com> Date: Wed, 8 Jun 2022 13:55:37 +0530 Subject: [PATCH 0412/1022] Use better parallel build instructions in readme.md (#1908) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index be5032e87d..0866678490 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ brew link --force libpq ```sh git clone --recursive https://github.com/neondatabase/neon.git cd neon -make -j5 +make -j`nproc` ``` #### dependency installation notes From a01999bc4a09d9fb31153ef6c2cad084f94f0286 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 8 Jun 2022 13:36:49 +0300 Subject: [PATCH 0413/1022] Replace most common remote logs with metrics (#1909) --- pageserver/src/storage_sync.rs | 66 ++++++++++++++++++--------- pageserver/src/storage_sync/upload.rs | 19 +++++++- 2 files changed, 62 insertions(+), 23 deletions(-) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index a140149c23..5fe2cde3b7 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -186,8 +186,8 @@ use crate::{ }; use metrics::{ - register_histogram_vec, register_int_counter, register_int_gauge, HistogramVec, IntCounter, - IntGauge, + register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge, + HistogramVec, IntCounter, IntCounterVec, IntGauge, }; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; @@ 
-208,14 +208,17 @@ lazy_static! { static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!( "pageserver_remote_storage_image_sync_seconds", "Time took to synchronize (download or upload) a whole pageserver image. \ - Grouped by `operation_kind` (upload|download) and `status` (success|failure)", - &["operation_kind", "status"], - vec![ - 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 5.0, 6.0, 7.0, - 8.0, 9.0, 10.0, 12.5, 15.0, 17.5, 20.0 - ] + Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)", + &["tenant_id", "timeline_id", "operation_kind", "status"], + vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0] ) .expect("failed to register pageserver image sync time histogram vec"); + static ref REMOTE_INDEX_UPLOAD: IntCounterVec = register_int_counter_vec!( + "pageserver_remote_storage_remote_index_uploads_total", + "Number of remote index uploads", + &["tenant_id", "timeline_id"], + ) + .expect("failed to register pageserver remote index upload vec"); } static SYNC_QUEUE: OnceCell = OnceCell::new(); @@ -1146,19 +1149,19 @@ where .await { DownloadedTimeline::Abort => { - register_sync_status(sync_start, task_name, None); + register_sync_status(sync_id, sync_start, task_name, None); if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) { error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}"); } } DownloadedTimeline::FailedAndRescheduled => { - register_sync_status(sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, task_name, Some(false)); } DownloadedTimeline::Successful(mut download_data) => { match update_local_metadata(conf, sync_id, current_remote_timeline).await { Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) { Ok(()) => { - register_sync_status(sync_start, task_name, Some(true)); + register_sync_status(sync_id, sync_start, task_name, Some(true)); return Some(TimelineSyncStatusUpdate::Downloaded); } Err(e) => { @@ -1169,7 +1172,7 @@ where error!("Failed to update local timeline metadata: {e:?}"); download_data.retries += 1; sync_queue.push(sync_id, SyncTask::Download(download_data)); - register_sync_status(sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, task_name, Some(false)); } } } @@ -1265,14 +1268,14 @@ async fn delete_timeline_data( error!("Failed to update remote timeline {sync_id}: {e:?}"); new_delete_data.retries += 1; sync_queue.push(sync_id, SyncTask::Delete(new_delete_data)); - register_sync_status(sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, task_name, Some(false)); return; } } timeline_delete.deletion_registered = true; let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await; - register_sync_status(sync_start, task_name, Some(sync_status)); + register_sync_status(sync_id, sync_start, task_name, Some(sync_status)); } async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result { @@ -1306,7 +1309,7 @@ async fn upload_timeline_data( .await { UploadedTimeline::FailedAndRescheduled => { - register_sync_status(sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, task_name, Some(false)); return; } UploadedTimeline::Successful(upload_data) => upload_data, @@ -1325,13 +1328,13 @@ async fn upload_timeline_data( .await { Ok(()) => { - register_sync_status(sync_start, task_name, Some(true)); + 
register_sync_status(sync_id, sync_start, task_name, Some(true)); } Err(e) => { error!("Failed to update remote timeline {sync_id}: {e:?}"); uploaded_data.retries += 1; sync_queue.push(sync_id, SyncTask::Upload(uploaded_data)); - register_sync_status(sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, task_name, Some(false)); } } } @@ -1421,7 +1424,14 @@ where IndexPart::from_remote_timeline(&timeline_path, updated_remote_timeline) .context("Failed to create an index part from the updated remote timeline")?; - info!("Uploading remote index for the timeline"); + debug!("Uploading remote index for the timeline"); + REMOTE_INDEX_UPLOAD + .with_label_values(&[ + &sync_id.tenant_id.to_string(), + &sync_id.timeline_id.to_string(), + ]) + .inc(); + upload_index_part(conf, storage, sync_id, new_index_part) .await .context("Failed to upload new index part") @@ -1590,12 +1600,24 @@ fn compare_local_and_remote_timeline( (initial_timeline_status, awaits_download) } -fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Option) { +fn register_sync_status( + sync_id: ZTenantTimelineId, + sync_start: Instant, + sync_name: &str, + sync_status: Option, +) { let secs_elapsed = sync_start.elapsed().as_secs_f64(); - info!("Processed a sync task in {secs_elapsed:.2} seconds"); + debug!("Processed a sync task in {secs_elapsed:.2} seconds"); + + let tenant_id = sync_id.tenant_id.to_string(); + let timeline_id = sync_id.timeline_id.to_string(); match sync_status { - Some(true) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "success"]), - Some(false) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "failure"]), + Some(true) => { + IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "success"]) + } + Some(false) => { + IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "failure"]) + } None => return, } .observe(secs_elapsed) diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 2f88fa95ba..f9ab3b7471 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -4,6 +4,7 @@ use std::{fmt::Debug, path::PathBuf}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; +use lazy_static::lazy_static; use remote_storage::RemoteStorage; use tokio::fs; use tracing::{debug, error, info, warn}; @@ -17,6 +18,16 @@ use super::{ use crate::{ config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, }; +use metrics::{register_int_counter_vec, IntCounterVec}; + +lazy_static! { + static ref NO_LAYERS_UPLOAD: IntCounterVec = register_int_counter_vec!( + "pageserver_remote_storage_no_layers_uploads_total", + "Number of skipped uploads due to no layers", + &["tenant_id", "timeline_id"], + ) + .expect("failed to register pageserver no layers upload vec"); +} /// Serializes and uploads the given index part data to the remote storage. 
pub(super) async fn upload_index_part( @@ -102,7 +113,13 @@ where .collect::>(); if layers_to_upload.is_empty() { - info!("No layers to upload after filtering, aborting"); + debug!("No layers to upload after filtering, aborting"); + NO_LAYERS_UPLOAD + .with_label_values(&[ + &sync_id.tenant_id.to_string(), + &sync_id.timeline_id.to_string(), + ]) + .inc(); return UploadedTimeline::Successful(upload_data); } From e22d9cee3a6fe420ec9962281a7d71c2b1eae0a2 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Wed, 8 Jun 2022 09:15:12 -0400 Subject: [PATCH 0414/1022] fix `ZeroDivisionError` in `scripts/generate_perf_report_page` (#1906) Fixes the `ZeroDivisionError` error by adding `EPS=1e-6` when doing the calculation. --- scripts/generate_perf_report_page.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/generate_perf_report_page.py b/scripts/generate_perf_report_page.py index a15d04e056..23fa4b76a3 100755 --- a/scripts/generate_perf_report_page.py +++ b/scripts/generate_perf_report_page.py @@ -26,6 +26,7 @@ KEY_EXCLUDE_FIELDS = frozenset({ }) NEGATIVE_COLOR = 'negative' POSITIVE_COLOR = 'positive' +EPS = 1e-6 @dataclass @@ -120,7 +121,8 @@ def get_row_values(columns: List[str], run_result: SuitRun, # this might happen when new metric is added and there is no value for it in previous run # let this be here, TODO add proper handling when this actually happens raise ValueError(f'{column} not found in previous result') - ratio = float(value) / float(prev_value['value']) - 1 + # adding `EPS` to each term to avoid ZeroDivisionError when the denominator is zero + ratio = (float(value) + EPS) / (float(prev_value['value']) + EPS) - 1 ratio_display, color = format_ratio(ratio, current_value['report']) row_values.append(RowValue(value, color, ratio_display)) return row_values From a51b2dac9a59587377246f85c6964f2902ce4026 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 9 Jun 2022 14:15:41 +0400 Subject: [PATCH 0415/1022] Don't s3 offload from newly joined safekeeper not having required WAL. I made the check at launcher level with the perspective of generally moving election (decision who offloads) there. Also log timeline 'active' changes. --- safekeeper/src/timeline.rs | 28 +++- safekeeper/src/wal_backup.rs | 158 +++++++++++-------- test_runner/batch_others/test_normal_work.py | 2 +- 3 files changed, 118 insertions(+), 70 deletions(-) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index a69dadb7bb..42bb02c1ea 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -149,8 +149,12 @@ impl SharedState { /// Mark timeline active/inactive and return whether s3 offloading requires /// start/stop action. - fn update_status(&mut self) -> bool { - self.active = self.is_active(); + fn update_status(&mut self, ttid: ZTenantTimelineId) -> bool { + let is_active = self.is_active(); + if self.active != is_active { + info!("timeline {} active={} now", ttid, is_active); + } + self.active = is_active; self.is_wal_backup_action_pending() } @@ -187,6 +191,12 @@ impl SharedState { self.wal_backup_active } + // Can this safekeeper offload to s3? Recently joined safekeepers might not + // have necessary WAL. 
+ fn can_wal_backup(&self) -> bool { + self.sk.state.local_start_lsn <= self.sk.inmem.backup_lsn + } + fn get_wal_seg_size(&self) -> usize { self.sk.state.server.wal_seg_size as usize } @@ -291,7 +301,7 @@ impl Timeline { { let mut shared_state = self.mutex.lock().unwrap(); shared_state.num_computes += 1; - is_wal_backup_action_pending = shared_state.update_status(); + is_wal_backup_action_pending = shared_state.update_status(self.zttid); } // Wake up wal backup launcher, if offloading not started yet. if is_wal_backup_action_pending { @@ -308,7 +318,7 @@ impl Timeline { { let mut shared_state = self.mutex.lock().unwrap(); shared_state.num_computes -= 1; - is_wal_backup_action_pending = shared_state.update_status(); + is_wal_backup_action_pending = shared_state.update_status(self.zttid); } // Wake up wal backup launcher, if it is time to stop the offloading. if is_wal_backup_action_pending { @@ -327,7 +337,7 @@ impl Timeline { (replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); if stop { - shared_state.update_status(); + shared_state.update_status(self.zttid); return Ok(true); } } @@ -341,6 +351,12 @@ impl Timeline { shared_state.wal_backup_attend() } + // Can this safekeeper offload to s3? Recently joined safekeepers might not + // have necessary WAL. + pub fn can_wal_backup(&self) -> bool { + self.mutex.lock().unwrap().can_wal_backup() + } + /// Deactivates the timeline, assuming it is being deleted. /// Returns whether the timeline was already active. /// @@ -509,7 +525,7 @@ impl Timeline { } shared_state.sk.record_safekeeper_info(sk_info)?; self.notify_wal_senders(&mut shared_state); - is_wal_backup_action_pending = shared_state.update_status(); + is_wal_backup_action_pending = shared_state.update_status(self.zttid); commit_lsn = shared_state.sk.inmem.commit_lsn; } self.commit_lsn_watch_tx.send(commit_lsn)?; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 08e19f3f2f..1d7c8de3b8 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -49,14 +49,10 @@ pub fn wal_backup_launcher_thread_main( }); } -/// Check whether wal backup is required for timeline and mark that launcher is -/// aware of current status (if timeline exists). -fn is_wal_backup_required(zttid: ZTenantTimelineId) -> bool { - if let Some(tli) = GlobalTimelines::get_loaded(zttid) { - tli.wal_backup_attend() - } else { - false - } +/// Check whether wal backup is required for timeline. If yes, mark that launcher is +/// aware of current status and return the timeline. +fn is_wal_backup_required(zttid: ZTenantTimelineId) -> Option> { + GlobalTimelines::get_loaded(zttid).filter(|t| t.wal_backup_attend()) } struct WalBackupTaskHandle { @@ -64,6 +60,56 @@ struct WalBackupTaskHandle { handle: JoinHandle<()>, } +struct WalBackupTimelineEntry { + timeline: Arc, + handle: Option, +} + +/// Start per timeline task, if it makes sense for this safekeeper to offload. +fn consider_start_task( + conf: &SafeKeeperConf, + zttid: ZTenantTimelineId, + task: &mut WalBackupTimelineEntry, +) { + if !task.timeline.can_wal_backup() { + return; + } + info!("starting WAL backup task for {}", zttid); + + // TODO: decide who should offload right here by simply checking current + // state instead of running elections in offloading task. 
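// (As of this patch the can_wal_backup() filter above only rules out
// safekeepers that lack the required WAL; the etcd election constructed
// below still decides which of the remaining candidates actually offloads.)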
+ let election_name = SubscriptionKey { + cluster_prefix: conf.broker_etcd_prefix.clone(), + kind: SubscriptionKind::Operation( + zttid, + NodeKind::Safekeeper, + OperationKind::Safekeeper(SkOperationKind::WalBackup), + ), + } + .watch_key(); + let my_candidate_name = broker::get_candiate_name(conf.my_id); + let election = broker::Election::new( + election_name, + my_candidate_name, + conf.broker_endpoints.clone(), + ); + + let (shutdown_tx, shutdown_rx) = mpsc::channel(1); + let timeline_dir = conf.timeline_dir(&zttid); + + let handle = tokio::spawn( + backup_task_main(zttid, timeline_dir, shutdown_rx, election) + .instrument(info_span!("WAL backup task", zttid = %zttid)), + ); + + task.handle = Some(WalBackupTaskHandle { + shutdown_tx, + handle, + }); +} + +const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; + /// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup /// tasks. Having this in separate task simplifies locking, allows to reap /// panics and separate elections from offloading itself. @@ -72,7 +118,7 @@ async fn wal_backup_launcher_main_loop( mut wal_backup_launcher_rx: Receiver, ) { info!( - "WAL backup launcher: started, remote config {:?}", + "WAL backup launcher started, remote config {:?}", conf.remote_storage ); @@ -83,64 +129,50 @@ async fn wal_backup_launcher_main_loop( }) }); - let mut tasks: HashMap = HashMap::new(); + // Presense in this map means launcher is aware s3 offloading is needed for + // the timeline, but task is started only if it makes sense for to offload + // from this safekeeper. + let mut tasks: HashMap = HashMap::new(); + let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); loop { - // channel is never expected to get closed - let zttid = wal_backup_launcher_rx.recv().await.unwrap(); - let is_wal_backup_required = is_wal_backup_required(zttid); - if conf.remote_storage.is_none() || !conf.wal_backup_enabled { - continue; /* just drain the channel and do nothing */ - } - // do we need to do anything at all? - if is_wal_backup_required != tasks.contains_key(&zttid) { - if is_wal_backup_required { - // need to start the task - info!("starting WAL backup task for {}", zttid); - - // TODO: decide who should offload in launcher itself by simply checking current state - let election_name = SubscriptionKey { - cluster_prefix: conf.broker_etcd_prefix.clone(), - kind: SubscriptionKind::Operation( - zttid, - NodeKind::Safekeeper, - OperationKind::Safekeeper(SkOperationKind::WalBackup), - ), + tokio::select! { + zttid = wal_backup_launcher_rx.recv() => { + // channel is never expected to get closed + let zttid = zttid.unwrap(); + if conf.remote_storage.is_none() || !conf.wal_backup_enabled { + continue; /* just drain the channel and do nothing */ } - .watch_key(); - let my_candidate_name = broker::get_candiate_name(conf.my_id); - let election = broker::Election::new( - election_name, - my_candidate_name, - conf.broker_endpoints.clone(), - ); + let timeline = is_wal_backup_required(zttid); + // do we need to do anything at all? 
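// (i.e. act only when the desired state and the task map disagree: either
// backup just became required and there is no entry yet, or it is no longer
// required and the existing entry has to be torn down.)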
+ if timeline.is_some() != tasks.contains_key(&zttid) { + if let Some(timeline) = timeline { + // need to start the task + let entry = tasks.entry(zttid).or_insert(WalBackupTimelineEntry { + timeline, + handle: None, + }); + consider_start_task(&conf, zttid, entry); + } else { + // need to stop the task + info!("stopping WAL backup task for {}", zttid); - let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let timeline_dir = conf.timeline_dir(&zttid); - - let handle = tokio::spawn( - backup_task_main(zttid, timeline_dir, shutdown_rx, election) - .instrument(info_span!("WAL backup task", zttid = %zttid)), - ); - - tasks.insert( - zttid, - WalBackupTaskHandle { - shutdown_tx, - handle, - }, - ); - } else { - // need to stop the task - info!("stopping WAL backup task for {}", zttid); - - let wb_handle = tasks.remove(&zttid).unwrap(); - // Tell the task to shutdown. Error means task exited earlier, that's ok. - let _ = wb_handle.shutdown_tx.send(()).await; - // Await the task itself. TODO: restart panicked tasks earlier. - // Hm, why I can't await on reference to handle? - if let Err(e) = wb_handle.handle.await { - warn!("WAL backup task for {} panicked: {}", zttid, e); + let entry = tasks.remove(&zttid).unwrap(); + if let Some(wb_handle) = entry.handle { + // Tell the task to shutdown. Error means task exited earlier, that's ok. + let _ = wb_handle.shutdown_tx.send(()).await; + // Await the task itself. TODO: restart panicked tasks earlier. + if let Err(e) = wb_handle.handle.await { + warn!("WAL backup task for {} panicked: {}", zttid, e); + } + } + } + } + } + // Start known tasks, if needed and possible. + _ = ticker.tick() => { + for (zttid, entry) in tasks.iter_mut().filter(|(_, entry)| entry.handle.is_none()) { + consider_start_task(&conf, *zttid, entry); } } } diff --git a/test_runner/batch_others/test_normal_work.py b/test_runner/batch_others/test_normal_work.py index c0f44ce7a9..4635a70de6 100644 --- a/test_runner/batch_others/test_normal_work.py +++ b/test_runner/batch_others/test_normal_work.py @@ -42,8 +42,8 @@ def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_s Repeat check for several tenants/timelines. 
""" - env = neon_env_builder.init_start() neon_env_builder.num_safekeepers = num_safekeepers + env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() for _ in range(num_timelines): From f7b878611a8709954b0a61bc1656f34f2054c4ea Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Wed, 1 Jun 2022 23:55:22 +0300 Subject: [PATCH 0416/1022] Implement JWT authentication in Safekeeper HTTP API (#1753) * `control_plane` crate (used by `neon_local`) now parses an `auth_enabled` bool for each Safekeeper * If auth is enabled, a Safekeeper is passed a path to a public key via a new command line argument * Added TODO comments to other places needing auth --- control_plane/src/local_env.rs | 2 ++ control_plane/src/safekeeper.rs | 5 +++++ safekeeper/src/bin/safekeeper.rs | 32 +++++++++++++++++++++++++++-- safekeeper/src/http/routes.rs | 35 +++++++++++++++++++++++++++----- safekeeper/src/lib.rs | 2 ++ 5 files changed, 69 insertions(+), 7 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index f7bb890893..28541c2ece 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -177,6 +177,7 @@ pub struct SafekeeperConf { pub sync: bool, pub remote_storage: Option, pub backup_threads: Option, + pub auth_enabled: bool, } impl Default for SafekeeperConf { @@ -188,6 +189,7 @@ impl Default for SafekeeperConf { sync: true, remote_storage: None, backup_threads: None, + auth_enabled: false, } } } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 972b6d48ae..c90f36d104 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -149,6 +149,11 @@ impl SafekeeperNode { if let Some(ref remote_storage) = self.conf.remote_storage { cmd.args(&["--remote-storage", remote_storage]); } + if self.conf.auth_enabled { + cmd.arg("--auth-validation-public-key-path"); + // PathBuf is better be passed as is, not via `String`. + cmd.arg(self.env.base_data_dir.join("auth_public_key.pem")); + } fill_aws_secrets_vars(&mut cmd); diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 5ce2591ff3..6c9c59c76b 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -10,6 +10,7 @@ use remote_storage::RemoteStorageConfig; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; +use std::sync::Arc; use std::thread; use tokio::sync::mpsc; use toml_edit::Document; @@ -27,6 +28,7 @@ use safekeeper::timeline::GlobalTimelines; use safekeeper::wal_backup; use safekeeper::wal_service; use safekeeper::SafeKeeperConf; +use utils::auth::JwtAuth; use utils::{ http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener, zid::NodeId, @@ -132,6 +134,12 @@ fn main() -> anyhow::Result<()> { .default_missing_value("true") .help("Enable/disable WAL backup to s3. 
When disabled, safekeeper removes WAL ignoring WAL backup horizon."), ) + .arg( + Arg::new("auth-validation-public-key-path") + .long("auth-validation-public-key-path") + .takes_value(true) + .help("Path to an RSA .pem public key which is used to check JWT tokens") + ) .get_matches(); if let Some(addr) = arg_matches.value_of("dump-control-file") { @@ -204,6 +212,10 @@ fn main() -> anyhow::Result<()> { .parse() .context("failed to parse bool enable-s3-offload bool")?; + conf.auth_validation_public_key_path = arg_matches + .value_of("auth-validation-public-key-path") + .map(PathBuf::from); + start_safekeeper(conf, given_id, arg_matches.is_present("init")) } @@ -239,6 +251,19 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo e })?; + let auth = match conf.auth_validation_public_key_path.as_ref() { + None => { + info!("Auth is disabled"); + None + } + Some(path) => { + info!("Loading JWT auth key from {}", path.display()); + Some(Arc::new( + JwtAuth::from_key_path(path).context("failed to load the auth key")?, + )) + } + }; + // XXX: Don't spawn any threads before daemonizing! if conf.daemonize { info!("daemonizing..."); @@ -280,8 +305,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo thread::Builder::new() .name("http_endpoint_thread".into()) .spawn(|| { - // TODO authentication - let router = http::make_router(conf_); + let router = http::make_router(conf_, auth); endpoint::serve_thread_main( router, http_listener, @@ -295,6 +319,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let safekeeper_thread = thread::Builder::new() .name("Safekeeper thread".into()) .spawn(|| { + // TODO: add auth if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener) { info!("safekeeper thread terminated: {e}"); } @@ -309,6 +334,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo thread::Builder::new() .name("broker thread".into()) .spawn(|| { + // TODO: add auth? broker::thread_main(conf_); })?, ); @@ -321,6 +347,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo thread::Builder::new() .name("WAL removal thread".into()) .spawn(|| { + // TODO: add auth? remove_wal::thread_main(conf_); })?, ); @@ -330,6 +357,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo thread::Builder::new() .name("wal backup launcher thread".into()) .spawn(move || { + // TODO: add auth? wal_backup::wal_backup_launcher_thread_main(conf_, wal_backup_launcher_rx); })?, ); diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 73b9024c7d..ca43039d3b 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,8 +1,9 @@ -use hyper::{Body, Request, Response, StatusCode}; +use hyper::{Body, Request, Response, StatusCode, Uri}; +use once_cell::sync::Lazy; use serde::Serialize; use serde::Serializer; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fmt::Display; use std::sync::Arc; @@ -12,8 +13,9 @@ use crate::timeline::{GlobalTimelines, TimelineDeleteForceResult}; use crate::SafeKeeperConf; use etcd_broker::subscription_value::SkTimelineInfo; use utils::{ + auth::JwtAuth, http::{ - endpoint, + endpoint::{self, auth_middleware, check_permission}, error::ApiError, json::{json_request, json_response}, request::{ensure_no_body, parse_request_param}, @@ -32,6 +34,7 @@ struct SafekeeperStatus { /// Healthcheck handler. 
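/// Stays reachable without a JWT even when auth is enabled: the route appears
/// in ALLOWLIST_ROUTES below, so the auth middleware does not demand a token
/// for it (the unauthenticated client in test_timeline_status relies on this).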
async fn status_handler(request: Request) -> Result, ApiError> { + check_permission(&request, None)?; let conf = get_conf(&request); let status = SafekeeperStatus { id: conf.my_id }; json_response(StatusCode::OK, status) @@ -91,6 +94,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result, ) -> Result, ApiError> { let tenant_id = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; ensure_no_body(&mut request).await?; json_response( StatusCode::OK, @@ -178,6 +185,7 @@ async fn record_safekeeper_info(mut request: Request) -> Result) -> Result RouterBuilder { - let router = endpoint::make_router(); +pub fn make_router( + conf: SafeKeeperConf, + auth: Option>, +) -> RouterBuilder { + let mut router = endpoint::make_router(); + if auth.is_some() { + router = router.middleware(auth_middleware(|request| { + #[allow(clippy::mutable_key_type)] + static ALLOWLIST_ROUTES: Lazy> = + Lazy::new(|| ["/v1/status"].iter().map(|v| v.parse().unwrap()).collect()); + if ALLOWLIST_ROUTES.contains(request.uri()) { + None + } else { + // Option> is always provided as data below, hence unwrap(). + request.data::>>().unwrap().as_deref() + } + })) + } router .data(Arc::new(conf)) + .data(auth) .get("/v1/status", status_handler) .get( "/v1/timeline/:tenant_id/:timeline_id", diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index f328d2e85a..0335d61d3f 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -57,6 +57,7 @@ pub struct SafeKeeperConf { pub my_id: NodeId, pub broker_endpoints: Vec, pub broker_etcd_prefix: String, + pub auth_validation_public_key_path: Option, } impl SafeKeeperConf { @@ -88,6 +89,7 @@ impl Default for SafeKeeperConf { broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS, wal_backup_enabled: true, + auth_validation_public_key_path: None, } } } From 1f1d852204b7e1fc66abd4b28080dfbef26a6195 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Wed, 1 Jun 2022 23:56:06 +0300 Subject: [PATCH 0417/1022] ZenithEnvBuilder: rename pageserver_auth_enabled --> auth_enabled --- test_runner/batch_others/test_auth.py | 4 ++-- test_runner/batch_others/test_pageserver_api.py | 2 +- test_runner/batch_others/test_restart_compute.py | 2 +- test_runner/fixtures/neon_fixtures.py | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test_runner/batch_others/test_auth.py b/test_runner/batch_others/test_auth.py index 73120880d3..b9eb9d7cee 100644 --- a/test_runner/batch_others/test_auth.py +++ b/test_runner/batch_others/test_auth.py @@ -7,7 +7,7 @@ import pytest def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): - neon_env_builder.pageserver_auth_enabled = True + neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() ps = env.pageserver @@ -54,7 +54,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize('with_safekeepers', [False, True]) def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): - neon_env_builder.pageserver_auth_enabled = True + neon_env_builder.auth_enabled = True if with_safekeepers: neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 289eec74c5..7f9cb9493d 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -114,7 
+114,7 @@ def test_pageserver_http_api_client(neon_simple_env: NeonEnv): def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder): - neon_env_builder.pageserver_auth_enabled = True + neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() management_token = env.auth_keys.generate_management_token() diff --git a/test_runner/batch_others/test_restart_compute.py b/test_runner/batch_others/test_restart_compute.py index d55c0f2bcc..af1956e196 100644 --- a/test_runner/batch_others/test_restart_compute.py +++ b/test_runner/batch_others/test_restart_compute.py @@ -10,7 +10,7 @@ from fixtures.log_helper import log # @pytest.mark.parametrize('with_safekeepers', [False, True]) def test_restart_compute(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): - neon_env_builder.pageserver_auth_enabled = True + neon_env_builder.auth_enabled = True if with_safekeepers: neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 63ee6ec57d..2c3a3ad5a8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -500,7 +500,7 @@ class NeonEnvBuilder: num_safekeepers: int = 1, # Use non-standard SK ids to check for various parsing bugs safekeepers_id_start: int = 0, - pageserver_auth_enabled: bool = False, + auth_enabled: bool = False, rust_log_override: Optional[str] = None, default_branch_name=DEFAULT_BRANCH_NAME): self.repo_dir = repo_dir @@ -513,7 +513,7 @@ class NeonEnvBuilder: self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers self.safekeepers_id_start = safekeepers_id_start - self.pageserver_auth_enabled = pageserver_auth_enabled + self.auth_enabled = auth_enabled self.default_branch_name = default_branch_name self.env: Optional[NeonEnv] = None @@ -639,7 +639,7 @@ class NeonEnv: pg=self.port_distributor.get_port(), http=self.port_distributor.get_port(), ) - pageserver_auth_type = "ZenithJWT" if config.pageserver_auth_enabled else "Trust" + pageserver_auth_type = "ZenithJWT" if config.auth_enabled else "Trust" toml += textwrap.dedent(f""" [pageserver] From a001052cdd1976019dcd81f988adb099b66f6cc7 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 2 Jun 2022 00:15:54 +0300 Subject: [PATCH 0418/1022] test_runner: SafekeeperHttpClient: support auth --- test_runner/fixtures/neon_fixtures.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2c3a3ad5a8..b8fb7fb82d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -667,6 +667,10 @@ class NeonEnv: pg_port = {port.pg} http_port = {port.http} sync = false # Disable fsyncs to make the tests go faster""") + if config.auth_enabled: + toml += textwrap.dedent(f""" + auth_enabled = true + """) if bool(self.remote_storage_users & RemoteStorageUsers.SAFEKEEPER) and self.remote_storage is not None: toml += textwrap.dedent(f""" @@ -1757,7 +1761,6 @@ class Safekeeper: env: NeonEnv port: SafekeeperPort id: int - auth_token: Optional[str] = None running: bool = False def start(self) -> 'Safekeeper': @@ -1813,8 +1816,8 @@ class Safekeeper: assert isinstance(res, dict) return res - def http_client(self) -> SafekeeperHttpClient: - return SafekeeperHttpClient(port=self.port.http) + def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient: + return 
SafekeeperHttpClient(port=self.port.http, auth_token=auth_token) def data_dir(self) -> str: return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}") @@ -1838,9 +1841,15 @@ class SafekeeperMetrics: class SafekeeperHttpClient(requests.Session): - def __init__(self, port: int): + HTTPError = requests.HTTPError + + def __init__(self, port: int, auth_token: Optional[str] = None): super().__init__() self.port = port + self.auth_token = auth_token + + if auth_token is not None: + self.headers['Authorization'] = f'Bearer {auth_token}' def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() From 0ac0fba77aa551c5d83a27579bef533cb50e76c9 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 2 Jun 2022 00:45:10 +0300 Subject: [PATCH 0419/1022] test_runner: test Safekeeper HTTP API Auth All endpoints except for POST /v1/timeline are tested, this one is not tested in any way yet. Three attempts for each endpoint: correctly authenticated, badly authenticated, unauthenticated. --- test_runner/batch_others/test_wal_acceptor.py | 79 ++++++++++++++++--- 1 file changed, 70 insertions(+), 9 deletions(-) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 1932c3e450..e4970272d4 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -16,6 +16,7 @@ from fixtures.neon_fixtures import PgBin, Etcd, Postgres, RemoteStorageUsers, Sa from fixtures.utils import get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex from fixtures.log_helper import log from typing import List, Optional, Any +from uuid import uuid4 @dataclass @@ -349,10 +350,12 @@ def test_broker(neon_env_builder: NeonEnvBuilder): # Test that old WAL consumed by peers and pageserver is removed from safekeepers. -def test_wal_removal(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize('auth_enabled', [False, True]) +def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.num_safekeepers = 2 # to advance remote_consistent_llsn neon_env_builder.enable_local_fs_remote_storage() + neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() env.neon_cli.create_branch('test_safekeepers_wal_removal') @@ -369,7 +372,10 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder): timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] # force checkpoint to advance remote_consistent_lsn - with closing(env.pageserver.connect()) as psconn: + pageserver_conn_options = {} + if auth_enabled: + pageserver_conn_options['password'] = env.auth_keys.generate_tenant_token(tenant_id) + with closing(env.pageserver.connect(**pageserver_conn_options)) as psconn: with psconn.cursor() as pscur: pscur.execute(f"checkpoint {tenant_id} {timeline_id}") @@ -380,9 +386,29 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder): ] assert all(os.path.exists(p) for p in first_segments) - http_cli = env.safekeepers[0].http_client() + if not auth_enabled: + http_cli = env.safekeepers[0].http_client() + else: + http_cli = env.safekeepers[0].http_client( + auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + http_cli_other = env.safekeepers[0].http_client( + auth_token=env.auth_keys.generate_tenant_token(uuid4().hex)) + http_cli_noauth = env.safekeepers[0].http_client() + # Pretend WAL is offloaded to s3. 
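# (record_safekeeper_info with backup_lsn = FFFFFFFF/FEFFFFFF effectively marks
# all WAL as already backed up, so the removal check below can proceed without
# any real S3 upload; with auth enabled the same call is first attempted with a
# wrong-tenant token and with no token at all to verify it is rejected.)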
+ if auth_enabled: + old_backup_lsn = http_cli.timeline_status(tenant_id=tenant_id, + timeline_id=timeline_id).backup_lsn + assert 'FFFFFFFF/FEFFFFFF' != old_backup_lsn + for cli in [http_cli_other, http_cli_noauth]: + with pytest.raises(cli.HTTPError, match='Forbidden|Unauthorized'): + cli.record_safekeeper_info(tenant_id, + timeline_id, {'backup_lsn': 'FFFFFFFF/FEFFFFFF'}) + assert old_backup_lsn == http_cli.timeline_status(tenant_id=tenant_id, + timeline_id=timeline_id).backup_lsn http_cli.record_safekeeper_info(tenant_id, timeline_id, {'backup_lsn': 'FFFFFFFF/FEFFFFFF'}) + assert 'FFFFFFFF/FEFFFFFF' == http_cli.timeline_status(tenant_id=tenant_id, + timeline_id=timeline_id).backup_lsn # wait till first segment is removed on all safekeepers started_at = time.time() @@ -596,25 +622,42 @@ def test_sync_safekeepers(neon_env_builder: NeonEnvBuilder, assert all(lsn_after_sync == lsn for lsn in lsn_after_append) -def test_timeline_status(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize('auth_enabled', [False, True]) +def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): + neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() env.neon_cli.create_branch('test_timeline_status') pg = env.postgres.create_start('test_timeline_status') wa = env.safekeepers[0] - wa_http_cli = wa.http_client() - wa_http_cli.check_status() # learn neon timeline from compute tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + if not auth_enabled: + wa_http_cli = wa.http_client() + wa_http_cli.check_status() + else: + wa_http_cli = wa.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + wa_http_cli.check_status() + wa_http_cli_bad = wa.http_client( + auth_token=env.auth_keys.generate_tenant_token(uuid4().hex)) + wa_http_cli_bad.check_status() + wa_http_cli_noauth = wa.http_client() + wa_http_cli_noauth.check_status() + # fetch something sensible from status tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) epoch = tli_status.acceptor_epoch timeline_start_lsn = tli_status.timeline_start_lsn + if auth_enabled: + for cli in [wa_http_cli_bad, wa_http_cli_noauth]: + with pytest.raises(cli.HTTPError, match='Forbidden|Unauthorized'): + cli.timeline_status(tenant_id, timeline_id) + pg.safe_psql("create table t(i int)") # ensure epoch goes up after reboot @@ -894,8 +937,10 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): assert wal_size_after_checkpoint < 16 * 2.5 -def test_delete_force(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize('auth_enabled', [False, True]) +def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.num_safekeepers = 1 + neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() # Create two tenants: one will be deleted, other should be preserved. 
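# (With auth enabled the second tenant also doubles as a negative test: a token
# scoped to the first tenant, and a client with no token at all, must get
# Forbidden/Unauthorized when touching the other tenant's timelines, as checked
# further down.)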
@@ -921,7 +966,14 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder): cur.execute('CREATE TABLE t(key int primary key)') sk = env.safekeepers[0] sk_data_dir = Path(sk.data_dir()) - sk_http = sk.http_client() + if not auth_enabled: + sk_http = sk.http_client() + sk_http_other = sk_http + else: + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + sk_http_other = sk.http_client( + auth_token=env.auth_keys.generate_tenant_token(tenant_id_other)) + sk_http_noauth = sk.http_client() assert (sk_data_dir / tenant_id / timeline_id_1).is_dir() assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() assert (sk_data_dir / tenant_id / timeline_id_3).is_dir() @@ -961,6 +1013,15 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder): assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + if auth_enabled: + # Ensure we cannot delete the other tenant + for sk_h in [sk_http, sk_http_noauth]: + with pytest.raises(sk_h.HTTPError, match='Forbidden|Unauthorized'): + assert sk_h.timeline_delete_force(tenant_id_other, timeline_id_other) + with pytest.raises(sk_h.HTTPError, match='Forbidden|Unauthorized'): + assert sk_h.tenant_delete_force(tenant_id_other) + assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + # Remove initial tenant's br2 (inactive) assert sk_http.timeline_delete_force(tenant_id, timeline_id_2) == { "dir_existed": True, @@ -1001,7 +1062,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder): assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() # Ensure the other tenant still works - sk_http.timeline_status(tenant_id_other, timeline_id_other) + sk_http_other.timeline_status(tenant_id_other, timeline_id_other) with closing(pg_other.connect()) as conn: with conn.cursor() as cur: cur.execute('INSERT INTO t (key) VALUES (123)') From e2a5a3159556341ef53cfe4a87180c0ffa5dadbd Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Fri, 3 Jun 2022 20:33:47 +0300 Subject: [PATCH 0420/1022] Safekeeper HTTP router: add comment about /v1/timeline --- safekeeper/src/http/routes.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index ca43039d3b..33581c6c31 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -222,6 +222,7 @@ pub fn make_router( "/v1/timeline/:tenant_id/:timeline_id", timeline_status_handler, ) + // Will be used in the future instead of implicit timeline creation .post("/v1/timeline", timeline_create_handler) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id", From a4d8261390a25338f0fa5eca572b28bf9ffb39ff Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 7 Jun 2022 15:18:49 +0400 Subject: [PATCH 0421/1022] Save Postgres log in test_find_end_of_wal_* tests. 
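The change itself is just standard stderr redirection for a spawned child
process. A minimal standalone sketch of the same technique (binary name and
paths below are illustrative, not the ones used by the test helper):

    use std::fs::File;
    use std::process::{Command, Stdio};

    fn main() -> std::io::Result<()> {
        // Create the log file up front, then hand it to the child as stderr.
        let log_file = File::create("/tmp/pg.log")?;
        let child = Command::new("postgres")
            .args(["-D", "/tmp/datadir"])
            // Stdio implements From<File>, so everything the child writes to
            // stderr lands in pg.log instead of being discarded.
            .stderr(Stdio::from(log_file))
            .spawn()?;
        println!("started child with pid {}", child.id());
        Ok(())
    }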
--- libs/postgres_ffi/wal_generate/src/lib.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/libs/postgres_ffi/wal_generate/src/lib.rs b/libs/postgres_ffi/wal_generate/src/lib.rs index 3b19afb826..2b3f5ef703 100644 --- a/libs/postgres_ffi/wal_generate/src/lib.rs +++ b/libs/postgres_ffi/wal_generate/src/lib.rs @@ -4,6 +4,7 @@ use log::*; use postgres::types::PgLsn; use postgres::Client; use std::cmp::Ordering; +use std::fs; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use std::time::Instant; @@ -69,6 +70,12 @@ impl Conf { pub fn start_server(&self) -> Result { info!("Starting Postgres server in {:?}", self.datadir); + let log_file = fs::File::create(self.datadir.join("pg.log")).with_context(|| { + format!( + "Failed to create pg.log file in directory {}", + self.datadir.display() + ) + })?; let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols) let unix_socket_dir_path = unix_socket_dir.path().to_owned(); let server_process = self @@ -84,7 +91,7 @@ impl Conf { // Disable background processes as much as possible .args(&["-c", "wal_writer_delay=10s"]) .args(&["-c", "autovacuum=off"]) - .stderr(Stdio::null()) + .stderr(Stdio::from(log_file)) .spawn()?; let server = PostgresServer { process: server_process, From e1336f451d53c4bffbd201b8b5aa0f17152c48f2 Mon Sep 17 00:00:00 2001 From: chaitanya sharma <86035+phoenix24@users.noreply.github.com> Date: Wed, 1 Jun 2022 19:30:47 +0000 Subject: [PATCH 0422/1022] renamed .zenith data-dir to .neon. --- .circleci/ansible/deploy.yaml | 4 ++-- .circleci/ansible/systemd/pageserver.service | 2 +- .circleci/ansible/systemd/safekeeper.service | 2 +- .dockerignore | 4 ++-- .gitignore | 4 ++-- .yapfignore | 2 +- README.md | 10 +++++----- control_plane/src/local_env.rs | 14 +++++++------- docs/settings.md | 2 +- pageserver/README.md | 4 ++-- pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/layered_repository.rs | 4 ++-- pageserver/src/layered_repository/README.md | 4 ++-- pageserver/src/repository.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 2 +- 15 files changed, 31 insertions(+), 31 deletions(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index a8154ba3b0..b47db6a9b5 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -57,7 +57,7 @@ args: creates: "/storage/pageserver/data/tenants" environment: - ZENITH_REPO_DIR: "/storage/pageserver/data" + NEON_REPO_DIR: "/storage/pageserver/data" LD_LIBRARY_PATH: "/usr/local/lib" become: true tags: @@ -131,7 +131,7 @@ args: creates: "/storage/safekeeper/data/safekeeper.id" environment: - ZENITH_REPO_DIR: "/storage/safekeeper/data" + NEON_REPO_DIR: "/storage/safekeeper/data" LD_LIBRARY_PATH: "/usr/local/lib" become: true tags: diff --git a/.circleci/ansible/systemd/pageserver.service b/.circleci/ansible/systemd/pageserver.service index 54a7b1ba0a..bb78054fa3 100644 --- a/.circleci/ansible/systemd/pageserver.service +++ b/.circleci/ansible/systemd/pageserver.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=pageserver -Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D 
/storage/pageserver/data ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed diff --git a/.circleci/ansible/systemd/safekeeper.service b/.circleci/ansible/systemd/safekeeper.service index e4a395a60e..9b1159d812 100644 --- a/.circleci/ansible/systemd/safekeeper.service +++ b/.circleci/ansible/systemd/safekeeper.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=safekeeper -Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="wal"}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed diff --git a/.dockerignore b/.dockerignore index 352336496f..0667d8870e 100644 --- a/.dockerignore +++ b/.dockerignore @@ -9,8 +9,8 @@ tmp_install tmp_check_cli test_output .vscode -.zenith -integration_tests/.zenith +.neon +integration_tests/.neon .mypy_cache Dockerfile diff --git a/.gitignore b/.gitignore index 291504ea81..ed718c8c79 100644 --- a/.gitignore +++ b/.gitignore @@ -6,8 +6,8 @@ __pycache__/ test_output/ .vscode .idea -/.zenith -/integration_tests/.zenith +/.neon +/integration_tests/.neon # Coverage *.profraw diff --git a/.yapfignore b/.yapfignore index 258f6c59cd..149428e452 100644 --- a/.yapfignore +++ b/.yapfignore @@ -6,5 +6,5 @@ target/ tmp_install/ __pycache__/ test_output/ -.zenith/ +.neon/ .git/ diff --git a/README.md b/README.md index 0866678490..de9070ac0f 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r #### running neon database 1. Start pageserver and postgres on top of it (should be called from repo root): ```sh -# Create repository in .zenith with proper paths to binaries and data +# Create repository in .neon with proper paths to binaries and data # Later that would be responsibility of a package install script > ./target/debug/neon_local init initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c @@ -103,16 +103,16 @@ pageserver init succeeded # start pageserver and safekeeper > ./target/debug/neon_local start -Starting pageserver at '127.0.0.1:64000' in '.zenith' +Starting pageserver at '127.0.0.1:64000' in '.neon' Pageserver started initializing for sk 1 for 7676 -Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/sk1' +Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1' Safekeeper started # start postgres compute node > ./target/debug/neon_local pg start main Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ... 
-Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 +Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres' # check list of running postgres instances @@ -149,7 +149,7 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: # start postgres on that branch > ./target/debug/neon_local pg start migration_check --branch-name migration_check Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ... -Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433 +Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433 Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres' # check the new list of running postgres instances diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 28541c2ece..08389d29ba 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -21,9 +21,9 @@ use utils::{ use crate::safekeeper::SafekeeperNode; // -// This data structures represents zenith CLI config +// This data structures represents neon_local CLI config // -// It is deserialized from the .zenith/config file, or the config file passed +// It is deserialized from the .neon/config file, or the config file passed // to 'zenith init --config=' option. See control_plane/simple.conf for // an example. // @@ -34,8 +34,8 @@ pub struct LocalEnv { // compute nodes). // // This is not stored in the config file. Rather, this is the path where the - // config file itself is. It is read from the ZENITH_REPO_DIR env variable or - // '.zenith' if not given. + // config file itself is. It is read from the NEON_REPO_DIR env variable or + // '.neon' if not given. #[serde(skip)] pub base_data_dir: PathBuf, @@ -339,7 +339,7 @@ impl LocalEnv { pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { // Currently, the user first passes a config file with 'zenith init --config=' // We read that in, in `create_config`, and fill any missing defaults. Then it's saved - // to .zenith/config. TODO: We lose any formatting and comments along the way, which is + // to .neon/config. TODO: We lose any formatting and comments along the way, which is // a bit sad. let mut conf_content = r#"# This file describes a locale deployment of the page server # and safekeeeper node. It is read by the 'zenith' command-line @@ -483,9 +483,9 @@ impl LocalEnv { } fn base_path() -> PathBuf { - match std::env::var_os("ZENITH_REPO_DIR") { + match std::env::var_os("NEON_REPO_DIR") { Some(val) => PathBuf::from(val), - None => PathBuf::from(".zenith"), + None => PathBuf::from(".neon"), } } diff --git a/docs/settings.md b/docs/settings.md index 4d828f22bc..f2aaab75a8 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -154,7 +154,7 @@ The default distrib dir is `./tmp_install/`. #### workdir (-D) A directory in the file system, where pageserver will store its files. -The default is `./.zenith/`. +The default is `./.neon/`. This parameter has a special CLI alias (`-D`) and can not be overridden with regular `-c` way. 
diff --git a/pageserver/README.md b/pageserver/README.md index cf841d1e46..cb752881af 100644 --- a/pageserver/README.md +++ b/pageserver/README.md @@ -69,7 +69,7 @@ Repository The repository stores all the page versions, or WAL records needed to reconstruct them. Each tenant has a separate Repository, which is -stored in the .zenith/tenants/ directory. +stored in the .neon/tenants/ directory. Repository is an abstract trait, defined in `repository.rs`. It is implemented by the LayeredRepository object in @@ -92,7 +92,7 @@ Each repository also has a WAL redo manager associated with it, see records, whenever we need to reconstruct a page version from WAL to satisfy a GetPage@LSN request, or to avoid accumulating too much WAL for a page. The WAL redo manager uses a Postgres process running in -special zenith wal-redo mode to do the actual WAL redo, and +special Neon wal-redo mode to do the actual WAL redo, and communicates with the process using a pipe. diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ac90500b97..1d407a29bc 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -104,7 +104,7 @@ fn main() -> anyhow::Result<()> { return Ok(()); } - let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith")); + let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".neon")); let workdir = workdir .canonicalize() .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?; diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 7696f0d021..5c5b03268a 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -4,7 +4,7 @@ //! The functions here are responsible for locating the correct layer for the //! get/put call, tracing timeline branching history as needed. //! -//! The files are stored in the .zenith/tenants//timelines/ +//! The files are stored in the .neon/tenants//timelines/ //! directory. See layered_repository/README for how the files are managed. //! In addition to the layer files, there is a metadata file in the same //! directory that contains information about the timeline, in particular its @@ -148,7 +148,7 @@ lazy_static! { .expect("failed to define a metric"); } -/// Parts of the `.zenith/tenants//timelines/` directory prefix. +/// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; /// diff --git a/pageserver/src/layered_repository/README.md b/pageserver/src/layered_repository/README.md index 15040d21b2..bd5fa59257 100644 --- a/pageserver/src/layered_repository/README.md +++ b/pageserver/src/layered_repository/README.md @@ -123,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and a range of LSNs (or a single LSN, in case of image layers). You can think of it as a rectangle in the two-dimensional key-LSN space. The layer files for each timeline are stored in the timeline's subdirectory under -`.zenith/tenants//timelines`. +`.neon/tenants//timelines`. There are two kind of layer files: images, and delta layers. An image file contains a snapshot of all keys at a particular LSN, whereas a delta file @@ -178,7 +178,7 @@ version, and how branching and GC works is still valid. 
The full path of a delta file looks like this: ``` - .zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000 + .neon/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000 ``` For simplicity, the examples below use a simplified notation for the diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index f687f24c6e..756c3b8191 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -197,7 +197,7 @@ impl Display for TimelineSyncStatusUpdate { } /// -/// A repository corresponds to one .zenith directory. One repository holds multiple +/// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. pub trait Repository: Send + Sync { type Timeline: Timeline; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b8fb7fb82d..4c0715bac3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1201,7 +1201,7 @@ class NeonCli: log.info(f'Running in "{self.env.repo_dir}"') env_vars = os.environ.copy() - env_vars['ZENITH_REPO_DIR'] = str(self.env.repo_dir) + env_vars['NEON_REPO_DIR'] = str(self.env.repo_dir) env_vars['POSTGRES_DISTRIB_DIR'] = str(pg_distrib_dir) if self.env.rust_log_override is not None: env_vars['RUST_LOG'] = self.env.rust_log_override From d8a37452c86571726fa7815ed6fdb22bb2c5120b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 11 Jun 2022 00:44:05 +0300 Subject: [PATCH 0423/1022] Rename ZenithFeedback (#1912) --- docs/rfcs/cluster-size-limits.md | 10 ++--- libs/utils/src/pq_proto.rs | 44 +++++++++---------- pageserver/src/walreceiver.rs | 8 ++-- .../src/walreceiver/connection_handler.rs | 6 +-- safekeeper/src/metrics.rs | 4 +- safekeeper/src/safekeeper.rs | 10 ++--- safekeeper/src/send_wal.rs | 14 +++--- safekeeper/src/timeline.rs | 26 +++++------ 8 files changed, 61 insertions(+), 61 deletions(-) diff --git a/docs/rfcs/cluster-size-limits.md b/docs/rfcs/cluster-size-limits.md index bd12fb6eee..bd4cb9ef32 100644 --- a/docs/rfcs/cluster-size-limits.md +++ b/docs/rfcs/cluster-size-limits.md @@ -36,12 +36,12 @@ This is how the `LOGICAL_TIMELINE_SIZE` metric is implemented in the pageserver. Alternatively, we could count only relation data. As in pg_database_size(). This approach is somewhat more user-friendly because it is the data that is really affected by the user. On the other hand, it puts us in a weaker position than other services, i.e., RDS. -We will need to refactor the timeline_size counter or add another counter to implement it. +We will need to refactor the timeline_size counter or add another counter to implement it. Timeline size is updated during wal digestion. It is not versioned and is valid at the last_received_lsn moment. Then this size should be reported to compute node. -`current_timeline_size` value is included in the walreceiver's custom feedback message: `ZenithFeedback.` +`current_timeline_size` value is included in the walreceiver's custom feedback message: `ReplicationFeedback.` (PR about protocol changes https://github.com/zenithdb/zenith/pull/1037). @@ -64,11 +64,11 @@ We should warn users if the limit is soon to be reached. ### **Reliability, failure modes and corner cases** 1. 
`current_timeline_size` is valid at the last received and digested by pageserver lsn. - + If pageserver lags behind compute node, `current_timeline_size` will lag too. This lag can be tuned using backpressure, but it is not expected to be 0 all the time. - + So transactions that happen in this lsn range may cause limit overflow. Especially operations that generate (i.e., CREATE DATABASE) or free (i.e., TRUNCATE) a lot of data pages while generating a small amount of WAL. Are there other operations like this? - + Currently, CREATE DATABASE operations are restricted in the console. So this is not an issue. diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index 599af3fc68..0a320f123c 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -926,10 +926,10 @@ impl<'a> BeMessage<'a> { } } -// Zenith extension of postgres replication protocol -// See ZENITH_STATUS_UPDATE_TAG_BYTE +// Neon extension of postgres replication protocol +// See NEON_STATUS_UPDATE_TAG_BYTE #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] -pub struct ZenithFeedback { +pub struct ReplicationFeedback { // Last known size of the timeline. Used to enforce timeline size limit. pub current_timeline_size: u64, // Parts of StandbyStatusUpdate we resend to compute via safekeeper @@ -939,13 +939,13 @@ pub struct ZenithFeedback { pub ps_replytime: SystemTime, } -// NOTE: Do not forget to increment this number when adding new fields to ZenithFeedback. +// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback. // Do not remove previously available fields because this might be backwards incompatible. -pub const ZENITH_FEEDBACK_FIELDS_NUMBER: u8 = 5; +pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5; -impl ZenithFeedback { - pub fn empty() -> ZenithFeedback { - ZenithFeedback { +impl ReplicationFeedback { + pub fn empty() -> ReplicationFeedback { + ReplicationFeedback { current_timeline_size: 0, ps_writelsn: 0, ps_applylsn: 0, @@ -954,7 +954,7 @@ impl ZenithFeedback { } } - // Serialize ZenithFeedback using custom format + // Serialize ReplicationFeedback using custom format // to support protocol extensibility. // // Following layout is used: @@ -965,7 +965,7 @@ impl ZenithFeedback { // uint32 - value length in bytes // value itself pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> { - buf.put_u8(ZENITH_FEEDBACK_FIELDS_NUMBER); // # of keys + buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys write_cstr(&Bytes::from("current_timeline_size"), buf)?; buf.put_i32(8); buf.put_u64(self.current_timeline_size); @@ -992,9 +992,9 @@ impl ZenithFeedback { Ok(()) } - // Deserialize ZenithFeedback message - pub fn parse(mut buf: Bytes) -> ZenithFeedback { - let mut zf = ZenithFeedback::empty(); + // Deserialize ReplicationFeedback message + pub fn parse(mut buf: Bytes) -> ReplicationFeedback { + let mut zf = ReplicationFeedback::empty(); let nfields = buf.get_u8(); let mut i = 0; while i < nfields { @@ -1035,14 +1035,14 @@ impl ZenithFeedback { _ => { let len = buf.get_i32(); warn!( - "ZenithFeedback parse. unknown key {} of len {}. Skip it.", + "ReplicationFeedback parse. unknown key {} of len {}. 
Skip it.", key, len ); buf.advance(len as usize); } } } - trace!("ZenithFeedback parsed is {:?}", zf); + trace!("ReplicationFeedback parsed is {:?}", zf); zf } } @@ -1052,8 +1052,8 @@ mod tests { use super::*; #[test] - fn test_zenithfeedback_serialization() { - let mut zf = ZenithFeedback::empty(); + fn test_replication_feedback_serialization() { + let mut zf = ReplicationFeedback::empty(); // Fill zf with some values zf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, @@ -1062,13 +1062,13 @@ mod tests { let mut data = BytesMut::new(); zf.serialize(&mut data).unwrap(); - let zf_parsed = ZenithFeedback::parse(data.freeze()); + let zf_parsed = ReplicationFeedback::parse(data.freeze()); assert_eq!(zf, zf_parsed); } #[test] - fn test_zenithfeedback_unknown_key() { - let mut zf = ZenithFeedback::empty(); + fn test_replication_feedback_unknown_key() { + let mut zf = ReplicationFeedback::empty(); // Fill zf with some values zf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, @@ -1079,7 +1079,7 @@ mod tests { // Add an extra field to the buffer and adjust number of keys if let Some(first) = data.first_mut() { - *first = ZENITH_FEEDBACK_FIELDS_NUMBER + 1; + *first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1; } write_cstr(&Bytes::from("new_field_one"), &mut data).unwrap(); @@ -1087,7 +1087,7 @@ mod tests { data.put_u64(42); // Parse serialized data and check that new field is not parsed - let zf_parsed = ZenithFeedback::parse(data.freeze()); + let zf_parsed = ReplicationFeedback::parse(data.freeze()); assert_eq!(zf, zf_parsed); } diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 32bd88cf7c..82401e1d8c 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -71,7 +71,7 @@ use tokio::{ use tracing::*; use url::Url; use utils::lsn::Lsn; -use utils::pq_proto::ZenithFeedback; +use utils::pq_proto::ReplicationFeedback; use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}; use self::connection_handler::{WalConnectionEvent, WalReceiverConnection}; @@ -521,7 +521,7 @@ struct WalConnectionData { safekeeper_id: NodeId, connection: WalReceiverConnection, connection_init_time: NaiveDateTime, - last_wal_receiver_data: Option<(ZenithFeedback, NaiveDateTime)>, + last_wal_receiver_data: Option<(ReplicationFeedback, NaiveDateTime)>, } #[derive(Debug, PartialEq, Eq)] @@ -846,7 +846,7 @@ mod tests { .await; let now = Utc::now().naive_utc(); dummy_connection_data.last_wal_receiver_data = Some(( - ZenithFeedback { + ReplicationFeedback { current_timeline_size: 1, ps_writelsn: 1, ps_applylsn: current_lsn, @@ -1017,7 +1017,7 @@ mod tests { let time_over_threshold = Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout; dummy_connection_data.last_wal_receiver_data = Some(( - ZenithFeedback { + ReplicationFeedback { current_timeline_size: 1, ps_writelsn: current_lsn.0, ps_applylsn: 1, diff --git a/pageserver/src/walreceiver/connection_handler.rs b/pageserver/src/walreceiver/connection_handler.rs index aaccee9730..97b9b8cc9b 100644 --- a/pageserver/src/walreceiver/connection_handler.rs +++ b/pageserver/src/walreceiver/connection_handler.rs @@ -19,7 +19,7 @@ use tokio_stream::StreamExt; use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use utils::{ lsn::Lsn, - pq_proto::ZenithFeedback, + pq_proto::ReplicationFeedback, zid::{NodeId, ZTenantTimelineId}, }; @@ -33,7 +33,7 @@ use crate::{ #[derive(Debug, Clone)] pub enum 
WalConnectionEvent { Started, - NewWal(ZenithFeedback), + NewWal(ReplicationFeedback), End(Result<(), String>), } @@ -328,7 +328,7 @@ async fn handle_walreceiver_connection( // Send zenith feedback message. // Regular standby_status_update fields are put into this message. - let zenith_status_update = ZenithFeedback { + let zenith_status_update = ReplicationFeedback { current_timeline_size: timeline.get_current_logical_size() as u64, ps_writelsn: write_lsn, ps_flushlsn: flush_lsn, diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 5a2e5f125f..fe4f9d231c 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -242,9 +242,9 @@ impl Collector for TimelineCollector { let timeline_id = tli.zttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; - let mut most_advanced: Option = None; + let mut most_advanced: Option = None; for replica in tli.replicas.iter() { - if let Some(replica_feedback) = replica.zenith_feedback { + if let Some(replica_feedback) = replica.pageserver_feedback { if let Some(current) = most_advanced { if current.ps_writelsn < replica_feedback.ps_writelsn { most_advanced = Some(replica_feedback); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index eb6316dec2..7986fa5834 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -23,7 +23,7 @@ use postgres_ffi::xlog_utils::MAX_SEND_SIZE; use utils::{ bin_ser::LeSer, lsn::Lsn, - pq_proto::{SystemId, ZenithFeedback}, + pq_proto::{ReplicationFeedback, SystemId}, zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; @@ -348,7 +348,7 @@ pub struct AppendResponse { // a criterion for walproposer --sync mode exit pub commit_lsn: Lsn, pub hs_feedback: HotStandbyFeedback, - pub zenith_feedback: ZenithFeedback, + pub pageserver_feedback: ReplicationFeedback, } impl AppendResponse { @@ -358,7 +358,7 @@ impl AppendResponse { flush_lsn: Lsn(0), commit_lsn: Lsn(0), hs_feedback: HotStandbyFeedback::empty(), - zenith_feedback: ZenithFeedback::empty(), + pageserver_feedback: ReplicationFeedback::empty(), } } } @@ -476,7 +476,7 @@ impl AcceptorProposerMessage { buf.put_u64_le(msg.hs_feedback.xmin); buf.put_u64_le(msg.hs_feedback.catalog_xmin); - msg.zenith_feedback.serialize(buf)? + msg.pageserver_feedback.serialize(buf)? } } @@ -677,7 +677,7 @@ where commit_lsn: self.state.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), - zenith_feedback: ZenithFeedback::empty(), + pageserver_feedback: ReplicationFeedback::empty(), }; trace!("formed AppendResponse {:?}", ar); ar diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index fd82a55efa..11e5b963c9 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -21,7 +21,7 @@ use utils::{ bin_ser::BeSer, lsn::Lsn, postgres_backend::PostgresBackend, - pq_proto::{BeMessage, FeMessage, WalSndKeepAlive, XLogDataBody, ZenithFeedback}, + pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody}, sock_split::ReadStream, }; @@ -29,7 +29,7 @@ use utils::{ const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r'; // zenith extension of replication protocol -const ZENITH_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; +const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; type FullTransactionId = u64; @@ -122,15 +122,15 @@ impl ReplicationConn { warn!("unexpected StandbyReply. 
Read-only postgres replicas are not supported in safekeepers yet."); // timeline.update_replica_state(replica_id, Some(state)); } - Some(ZENITH_STATUS_UPDATE_TAG_BYTE) => { + Some(NEON_STATUS_UPDATE_TAG_BYTE) => { // Note: deserializing is on m[9..] because we skip the tag byte and len bytes. let buf = Bytes::copy_from_slice(&m[9..]); - let reply = ZenithFeedback::parse(buf); + let reply = ReplicationFeedback::parse(buf); - trace!("ZenithFeedback is {:?}", reply); - // Only pageserver sends ZenithFeedback, so set the flag. + trace!("ReplicationFeedback is {:?}", reply); + // Only pageserver sends ReplicationFeedback, so set the flag. // This replica is the source of information to resend to compute. - state.zenith_feedback = Some(reply); + state.pageserver_feedback = Some(reply); timeline.update_replica_state(replica_id, state); } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 42bb02c1ea..39f2593dbc 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -21,7 +21,7 @@ use tracing::*; use utils::{ lsn::Lsn, - pq_proto::ZenithFeedback, + pq_proto::ReplicationFeedback, zid::{NodeId, ZTenantId, ZTenantTimelineId}, }; @@ -48,8 +48,8 @@ pub struct ReplicaState { pub remote_consistent_lsn: Lsn, /// combined hot standby feedback from all replicas pub hs_feedback: HotStandbyFeedback, - /// Zenith specific feedback received from pageserver, if any - pub zenith_feedback: Option, + /// Replication specific feedback received from pageserver, if any + pub pageserver_feedback: Option, } impl Default for ReplicaState { @@ -68,7 +68,7 @@ impl ReplicaState { xmin: u64::MAX, catalog_xmin: u64::MAX, }, - zenith_feedback: None, + pageserver_feedback: None, } } } @@ -221,25 +221,25 @@ impl SharedState { // we need to know which pageserver compute node considers to be main. // See https://github.com/zenithdb/zenith/issues/1171 // - if let Some(zenith_feedback) = state.zenith_feedback { - if let Some(acc_feedback) = acc.zenith_feedback { - if acc_feedback.ps_writelsn < zenith_feedback.ps_writelsn { + if let Some(pageserver_feedback) = state.pageserver_feedback { + if let Some(acc_feedback) = acc.pageserver_feedback { + if acc_feedback.ps_writelsn < pageserver_feedback.ps_writelsn { warn!("More than one pageserver is streaming WAL for the timeline. Feedback resolving is not fully supported yet."); - acc.zenith_feedback = Some(zenith_feedback); + acc.pageserver_feedback = Some(pageserver_feedback); } } else { - acc.zenith_feedback = Some(zenith_feedback); + acc.pageserver_feedback = Some(pageserver_feedback); } // last lsn received by pageserver // FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver. // See https://github.com/zenithdb/zenith/issues/1171 - acc.last_received_lsn = Lsn::from(zenith_feedback.ps_writelsn); + acc.last_received_lsn = Lsn::from(pageserver_feedback.ps_writelsn); // When at least one pageserver has preserved data up to remote_consistent_lsn, // safekeeper is free to delete it, so choose max of all pageservers. 
acc.remote_consistent_lsn = max( - Lsn::from(zenith_feedback.ps_applylsn), + Lsn::from(pageserver_feedback.ps_applylsn), acc.remote_consistent_lsn, ); } @@ -457,8 +457,8 @@ impl Timeline { if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { let state = shared_state.get_replicas_state(); resp.hs_feedback = state.hs_feedback; - if let Some(zenith_feedback) = state.zenith_feedback { - resp.zenith_feedback = zenith_feedback; + if let Some(pageserver_feedback) = state.pageserver_feedback { + resp.pageserver_feedback = pageserver_feedback; } } From d11c9f9fcb950ac263b1fa16651976ca11e96edf Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 15 Jun 2022 18:16:04 +0300 Subject: [PATCH 0424/1022] Use random ports for the proxy and local pg in tests Fixes #1931 Author: Dmitry Ivanov --- test_runner/fixtures/neon_fixtures.py | 25 ++++++++++++++++--------- test_runner/fixtures/utils.py | 2 +- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4c0715bac3..167c3ff60a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -29,7 +29,7 @@ from dataclasses import dataclass # Type-related stuff from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import make_dsn, parse_dsn -from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, TypeVar, cast, Union, Tuple +from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple from typing_extensions import Literal import requests @@ -1379,6 +1379,7 @@ class VanillaPostgres(PgProtocol): self.pg_bin = pg_bin self.running = False self.pg_bin.run_capture(['initdb', '-D', pgdatadir]) + self.configure([f"port = {port}\n"]) def configure(self, options: List[str]): """Append lines into postgresql.conf file.""" @@ -1413,10 +1414,12 @@ class VanillaPostgres(PgProtocol): @pytest.fixture(scope='function') -def vanilla_pg(test_output_dir: str) -> Iterator[VanillaPostgres]: +def vanilla_pg(test_output_dir: str, + port_distributor: PortDistributor) -> Iterator[VanillaPostgres]: pgdatadir = os.path.join(test_output_dir, "pgdata-vanilla") pg_bin = PgBin(test_output_dir) - with VanillaPostgres(pgdatadir, pg_bin, 5432) as vanilla_pg: + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: yield vanilla_pg @@ -1462,7 +1465,7 @@ def remote_pg(test_output_dir: str) -> Iterator[RemotePostgres]: class NeonProxy(PgProtocol): - def __init__(self, port: int): + def __init__(self, port: int, pg_port: int): super().__init__(host="127.0.0.1", user="proxy_user", password="pytest2", @@ -1471,9 +1474,10 @@ class NeonProxy(PgProtocol): self.http_port = 7001 self.host = "127.0.0.1" self.port = port + self.pg_port = pg_port self._popen: Optional[subprocess.Popen[bytes]] = None - def start_static(self, addr="127.0.0.1:5432") -> None: + def start(self) -> None: assert self._popen is None # Start proxy @@ -1482,7 +1486,8 @@ class NeonProxy(PgProtocol): args.extend(["--http", f"{self.host}:{self.http_port}"]) args.extend(["--proxy", f"{self.host}:{self.port}"]) args.extend(["--auth-backend", "postgres"]) - args.extend(["--auth-endpoint", "postgres://proxy_auth:pytest1@localhost:5432/postgres"]) + args.extend( + ["--auth-endpoint", f"postgres://proxy_auth:pytest1@localhost:{self.pg_port}/postgres"]) self._popen = subprocess.Popen(args) self._wait_until_ready() @@ -1501,14 +1506,16 @@ class 
NeonProxy(PgProtocol): @pytest.fixture(scope='function') -def static_proxy(vanilla_pg) -> Iterator[NeonProxy]: +def static_proxy(vanilla_pg, port_distributor) -> Iterator[NeonProxy]: """Neon proxy that routes directly to vanilla postgres.""" vanilla_pg.start() vanilla_pg.safe_psql("create user proxy_auth with password 'pytest1' superuser") vanilla_pg.safe_psql("create user proxy_user with password 'pytest2'") - with NeonProxy(4432) as proxy: - proxy.start_static() + port = port_distributor.get_port() + pg_port = vanilla_pg.default_options['port'] + with NeonProxy(port, pg_port) as proxy: + proxy.start() yield proxy diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index ba9bc6e113..bfa57373b3 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -3,7 +3,7 @@ import shutil import subprocess from pathlib import Path -from typing import Any, List, Optional +from typing import Any, List from fixtures.log_helper import log From 36ee182d260dc01fd592e19a9928f10c3957cd05 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 16 Jun 2022 14:07:11 +0300 Subject: [PATCH 0425/1022] Implement page servise 'fullbackup' endpoint (#1923) * Implement page servise 'fullbackup' endpoint that works like basebackup, but also sends relational files * Add test_runner/batch_others/test_fullbackup.py Co-authored-by: bojanserafimov --- pageserver/src/basebackup.rs | 80 ++++++++++++++++----- pageserver/src/page_service.rs | 31 +++++++- pageserver/src/reltag.rs | 26 ++++++- test_runner/batch_others/test_fullbackup.py | 73 +++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 5 +- 5 files changed, 193 insertions(+), 22 deletions(-) create mode 100644 test_runner/batch_others/test_fullbackup.py diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 46d824b2e2..44a6442522 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -13,6 +13,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{BufMut, BytesMut}; use fail::fail_point; +use itertools::Itertools; use std::fmt::Write as FmtWrite; use std::io; use std::io::Write; @@ -21,7 +22,7 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; -use crate::reltag::SlruKind; +use crate::reltag::{RelTag, SlruKind}; use crate::repository::Timeline; use crate::DatadirTimelineImpl; use postgres_ffi::xlog_utils::*; @@ -39,11 +40,12 @@ where timeline: &'a Arc, pub lsn: Lsn, prev_record_lsn: Lsn, - + full_backup: bool, finished: bool, } -// Create basebackup with non-rel data in it. Omit relational data. +// Create basebackup with non-rel data in it. +// Only include relational data if 'full_backup' is true. // // Currently we use empty lsn in two cases: // * During the basebackup right after timeline creation @@ -58,6 +60,7 @@ where write: W, timeline: &'a Arc, req_lsn: Option, + full_backup: bool, ) -> Result> { // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the @@ -94,8 +97,8 @@ where }; info!( - "taking basebackup lsn={}, prev_lsn={}", - backup_lsn, backup_prev + "taking basebackup lsn={}, prev_lsn={} (full_backup={})", + backup_lsn, backup_prev, full_backup ); Ok(Basebackup { @@ -103,6 +106,7 @@ where timeline, lsn: backup_lsn, prev_record_lsn: backup_prev, + full_backup, finished: false, }) } @@ -140,6 +144,13 @@ where // Create tablespace directories for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? 
{ self.add_dbdir(spcnode, dbnode, has_relmap_file)?; + + // Gather and send relational files in each database if full backup is requested. + if self.full_backup { + for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? { + self.add_rel(rel)?; + } + } } for xid in self.timeline.list_twophase_files(self.lsn)? { self.add_twophase_file(xid)?; @@ -157,6 +168,38 @@ where Ok(()) } + fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { + let nblocks = self.timeline.get_rel_size(tag, self.lsn)?; + + // Function that adds relation segment data to archive + let mut add_file = |segment_index, data: &Vec| -> anyhow::Result<()> { + let file_name = tag.to_segfile_name(segment_index as u32); + let header = new_tar_header(&file_name, data.len() as u64)?; + self.ar.append(&header, data.as_slice())?; + Ok(()) + }; + + // If the relation is empty, create an empty file + if nblocks == 0 { + add_file(0, &vec![])?; + return Ok(()); + } + + // Add a file for each chunk of blocks (aka segment) + let chunks = (0..nblocks).chunks(pg_constants::RELSEG_SIZE as usize); + for (seg, blocks) in chunks.into_iter().enumerate() { + let mut segment_data: Vec = vec![]; + for blknum in blocks { + let img = self.timeline.get_rel_page_at_lsn(tag, blknum, self.lsn)?; + segment_data.extend_from_slice(&img[..]); + } + + add_file(seg, &segment_data)?; + } + + Ok(()) + } + // // Generate SLRU segment files from repository. // @@ -312,21 +355,24 @@ where pg_control.checkPointCopy = checkpoint; pg_control.state = pg_constants::DB_SHUTDOWNED; - // add zenith.signal file - let mut zenith_signal = String::new(); - if self.prev_record_lsn == Lsn(0) { - if self.lsn == self.timeline.tline.get_ancestor_lsn() { - write!(zenith_signal, "PREV LSN: none")?; + // Postgres doesn't recognize the zenith.signal file and doesn't need it. 
+ if !self.full_backup { + // add zenith.signal file + let mut zenith_signal = String::new(); + if self.prev_record_lsn == Lsn(0) { + if self.lsn == self.timeline.tline.get_ancestor_lsn() { + write!(zenith_signal, "PREV LSN: none")?; + } else { + write!(zenith_signal, "PREV LSN: invalid")?; + } } else { - write!(zenith_signal, "PREV LSN: invalid")?; + write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?; } - } else { - write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?; + self.ar.append( + &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, + zenith_signal.as_bytes(), + )?; } - self.ar.append( - &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, - zenith_signal.as_bytes(), - )?; //send pg_control let pg_control_bytes = pg_control.encode(); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 30f0d241d6..406228f034 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -596,6 +596,7 @@ impl PageServerHandler { timelineid: ZTimelineId, lsn: Option, tenantid: ZTenantId, + full_backup: bool, ) -> anyhow::Result<()> { let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty); let _enter = span.enter(); @@ -618,7 +619,7 @@ impl PageServerHandler { { let mut writer = CopyDataSink { pgb }; - let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?; + let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn, full_backup)?; span.record("lsn", &basebackup.lsn.to_string().as_str()); basebackup.send_tarball()?; } @@ -721,7 +722,33 @@ impl postgres_backend::Handler for PageServerHandler { }; // Check that the timeline exists - self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?; + self.handle_basebackup_request(pgb, timelineid, lsn, tenantid, false)?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + } + // same as basebackup, but result includes relational data as well + else if query_string.starts_with("fullbackup ") { + let (_, params_raw) = query_string.split_at("fullbackup ".len()); + let params = params_raw.split_whitespace().collect::>(); + + ensure!( + params.len() == 3, + "invalid param number for fullbackup command" + ); + + let tenantid = ZTenantId::from_str(params[0])?; + let timelineid = ZTimelineId::from_str(params[1])?; + + self.check_permission(Some(tenantid))?; + + // Lsn is required for fullbackup, because otherwise we would not know + // at which lsn to upload this backup. + // + // The caller is responsible for providing a valid lsn + // and using it in the subsequent import. + let lsn = Some(Lsn::from_str(params[2])?); + + // Check that the timeline exists + self.handle_basebackup_request(pgb, timelineid, lsn, tenantid, true)?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" diff --git a/pageserver/src/reltag.rs b/pageserver/src/reltag.rs index 18e26cc37a..fadd41f547 100644 --- a/pageserver/src/reltag.rs +++ b/pageserver/src/reltag.rs @@ -3,7 +3,7 @@ use std::cmp::Ordering; use std::fmt; use postgres_ffi::relfile_utils::forknumber_to_name; -use postgres_ffi::Oid; +use postgres_ffi::{pg_constants, Oid}; /// /// Relation data file segment id throughout the Postgres cluster. 
@@ -75,6 +75,30 @@ impl fmt::Display for RelTag { } } +impl RelTag { + pub fn to_segfile_name(&self, segno: u32) -> String { + let mut name = if self.spcnode == pg_constants::GLOBALTABLESPACE_OID { + "global/".to_string() + } else { + format!("base/{}/", self.dbnode) + }; + + name += &self.relnode.to_string(); + + if let Some(fork_name) = forknumber_to_name(self.forknum) { + name += "_"; + name += fork_name; + } + + if segno != 0 { + name += "."; + name += &segno.to_string(); + } + + name + } +} + /// /// Non-relation transaction status files (clog (a.k.a. pg_xact) and /// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer, diff --git a/test_runner/batch_others/test_fullbackup.py b/test_runner/batch_others/test_fullbackup.py new file mode 100644 index 0000000000..e5d705beab --- /dev/null +++ b/test_runner/batch_others/test_fullbackup.py @@ -0,0 +1,73 @@ +import subprocess +from contextlib import closing + +import psycopg2.extras +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres +from fixtures.neon_fixtures import pg_distrib_dir +import os +from fixtures.utils import mkdir_if_needed, subprocess_capture +import shutil +import getpass +import pwd + +num_rows = 1000 + + +# Ensure that regular postgres can start from fullbackup +def test_fullbackup(neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + port_distributor: PortDistributor): + + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch('test_fullbackup') + pgmain = env.postgres.create_start('test_fullbackup') + log.info("postgres is running on 'test_fullbackup' branch") + + timeline = pgmain.safe_psql("SHOW neon.timeline_id")[0][0] + + with closing(pgmain.connect()) as conn: + with conn.cursor() as cur: + # data loading may take a while, so increase statement timeout + cur.execute("SET statement_timeout='300s'") + cur.execute(f'''CREATE TABLE tbl AS SELECT 'long string to consume some space' || g + from generate_series(1,{num_rows}) g''') + cur.execute("CHECKPOINT") + + cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn = cur.fetchone()[0] + log.info(f"start_backup_lsn = {lsn}") + + # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. + # PgBin sets it automatically, but here we need to pipe psql output to the tar command. 
+ psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')} + + # Get and unpack fullbackup from pageserver + restored_dir_path = os.path.join(env.repo_dir, "restored_datadir") + os.mkdir(restored_dir_path, 0o750) + query = f"fullbackup {env.initial_tenant.hex} {timeline} {lsn}" + cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] + result_basepath = pg_bin.run_capture(cmd, env=psql_env) + tar_output_file = result_basepath + ".stdout" + subprocess_capture(str(env.repo_dir), ["tar", "-xf", tar_output_file, "-C", restored_dir_path]) + + # HACK + # fullbackup returns neon specific pg_control and first WAL segment + # use resetwal to overwrite it + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, 'pg_resetwal') + cmd = [pg_resetwal_path, "-D", restored_dir_path] + pg_bin.run_capture(cmd, env=psql_env) + + # Restore from the backup and find the data we inserted + port = port_distributor.get_port() + with VanillaPostgres(restored_dir_path, pg_bin, port, init=False) as vanilla_pg: + # TODO make port an optional argument + vanilla_pg.configure([ + f"port={port}", + ]) + vanilla_pg.start() + num_rows_found = vanilla_pg.safe_psql('select count(*) from tbl;', user="cloud_admin")[0][0] + assert num_rows == num_rows_found diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 167c3ff60a..fcefaad8fa 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1373,12 +1373,13 @@ def pg_bin(test_output_dir: str) -> PgBin: class VanillaPostgres(PgProtocol): - def __init__(self, pgdatadir: str, pg_bin: PgBin, port: int): + def __init__(self, pgdatadir: str, pg_bin: PgBin, port: int, init=True): super().__init__(host='localhost', port=port, dbname='postgres') self.pgdatadir = pgdatadir self.pg_bin = pg_bin self.running = False - self.pg_bin.run_capture(['initdb', '-D', pgdatadir]) + if init: + self.pg_bin.run_capture(['initdb', '-D', pgdatadir]) self.configure([f"port = {port}\n"]) def configure(self, options: List[str]): From 699f46cd84c23bdfbd382679e087a5b55da87eb6 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 17 Jun 2022 15:33:39 +0300 Subject: [PATCH 0426/1022] Download WAL from S3 if it's not available in safekeeper dir (#1932) `send_wal.rs` and `WalReader` are now async. `test_s3_wal_replay` checks that WAL can be replayed after offloaded. 
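The core of this change is the segment-open fallback in `WalReader`: a requested WAL segment is looked up in the local timeline directory first, and only if it is missing there (and remote reads are enabled) is it streamed from remote storage. The following is a deliberately simplified sketch of that decision, not the code in this patch: `fetch_remote_segment` stands in for the real `wal_backup::read_object`, the `segment_may_be_local` flag corresponds to `pos >= local_start_lsn`, and the real implementation also tries the `.partial` file name and returns a pinned boxed reader.

```rust
// Sketch only: simplified from WalReader::open_segment in the diff below.
use std::io::{ErrorKind, SeekFrom};
use std::path::{Path, PathBuf};

use tokio::fs::File;
use tokio::io::{AsyncRead, AsyncSeekExt};

// Placeholder for wal_backup::read_object, which streams the object from
// remote storage through a tokio duplex pipe.
async fn fetch_remote_segment(
    _path: &Path,
    _offset: u64,
) -> anyhow::Result<Box<dyn AsyncRead + Unpin + Send>> {
    anyhow::bail!("remote storage is not wired up in this sketch")
}

async fn open_segment(
    timeline_dir: &Path,
    file_name: &str,
    offset: u64,
    segment_may_be_local: bool, // pos >= local_start_lsn in the real code
    enable_remote_read: bool,
) -> anyhow::Result<Box<dyn AsyncRead + Unpin + Send>> {
    let path: PathBuf = timeline_dir.join(file_name);

    if segment_may_be_local {
        match File::open(&path).await {
            Ok(mut file) => {
                // Local hit: seek to the requested offset inside the segment.
                file.seek(SeekFrom::Start(offset)).await?;
                return Ok(Box::new(file));
            }
            // Only a missing file triggers the remote fallback;
            // any other I/O error is propagated.
            Err(e) if e.kind() == ErrorKind::NotFound => {}
            Err(e) => return Err(e.into()),
        }
    }

    if enable_remote_read {
        return fetch_remote_segment(&path, offset).await;
    }

    anyhow::bail!(
        "WAL segment {} not found locally and remote read is disabled",
        path.display()
    )
}
```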
--- safekeeper/src/json_ctrl.rs | 2 +- safekeeper/src/send_wal.rs | 225 +++++++++++------- safekeeper/src/timeline.rs | 52 +--- safekeeper/src/wal_backup.rs | 51 +++- safekeeper/src/wal_storage.rs | 125 +++++++--- test_runner/batch_others/test_wal_acceptor.py | 98 +++++++- 6 files changed, 379 insertions(+), 174 deletions(-) diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 43514997d4..97fb3654d2 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -124,7 +124,7 @@ fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: L term, start_streaming_at: lsn, term_history: history, - timeline_start_lsn: Lsn(0), + timeline_start_lsn: lsn, }); spg.timeline.get().process_msg(&proposer_elected_request)?; diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 11e5b963c9..a6b9de2050 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -13,9 +13,11 @@ use serde::{Deserialize, Serialize}; use std::cmp::min; use std::net::Shutdown; use std::sync::Arc; -use std::thread::sleep; use std::time::Duration; use std::{str, thread}; + +use tokio::sync::watch::Receiver; +use tokio::time::timeout; use tracing::*; use utils::{ bin_ser::BeSer, @@ -191,100 +193,143 @@ impl ReplicationConn { } })?; - let mut wal_seg_size: usize; - loop { - wal_seg_size = spg.timeline.get().get_state().1.server.wal_seg_size as usize; - if wal_seg_size == 0 { - error!("Cannot start replication before connecting to wal_proposer"); - sleep(Duration::from_secs(1)); + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; + + runtime.block_on(async move { + let (_, persisted_state) = spg.timeline.get().get_state(); + if persisted_state.server.wal_seg_size == 0 + || persisted_state.timeline_start_lsn == Lsn(0) + { + bail!("Cannot start replication before connecting to walproposer"); + } + + let wal_end = spg.timeline.get().get_end_of_wal(); + // Walproposer gets special handling: safekeeper must give proposer all + // local WAL till the end, whether committed or not (walproposer will + // hang otherwise). That's because walproposer runs the consensus and + // synchronizes safekeepers on the most advanced one. + // + // There is a small risk of this WAL getting concurrently garbaged if + // another compute rises which collects majority and starts fixing log + // on this safekeeper itself. That's ok as (old) proposer will never be + // able to commit such WAL. 
+ let stop_pos: Option = if spg.appname == Some("wal_proposer_recovery".to_string()) + { + Some(wal_end) } else { + None + }; + + info!("Start replication from {:?} till {:?}", start_pos, stop_pos); + + // switch to copy + pgb.write_message(&BeMessage::CopyBothResponse)?; + + let mut end_pos = Lsn(0); + + let mut wal_reader = WalReader::new( + spg.conf.timeline_dir(&spg.timeline.get().zttid), + &persisted_state, + start_pos, + spg.conf.wal_backup_enabled, + )?; + + // buffer for wal sending, limited by MAX_SEND_SIZE + let mut send_buf = vec![0u8; MAX_SEND_SIZE]; + + // watcher for commit_lsn updates + let mut commit_lsn_watch_rx = spg.timeline.get().get_commit_lsn_watch_rx(); + + loop { + if let Some(stop_pos) = stop_pos { + if start_pos >= stop_pos { + break; /* recovery finished */ + } + end_pos = stop_pos; + } else { + /* Wait until we have some data to stream */ + let lsn = wait_for_lsn(&mut commit_lsn_watch_rx, start_pos).await?; + + if let Some(lsn) = lsn { + end_pos = lsn; + } else { + // TODO: also check once in a while whether we are walsender + // to right pageserver. + if spg.timeline.get().stop_walsender(replica_id)? { + // Shut down, timeline is suspended. + // TODO create proper error type for this + bail!("end streaming to {:?}", spg.appname); + } + + // timeout expired: request pageserver status + pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive { + sent_ptr: end_pos.0, + timestamp: get_current_timestamp(), + request_reply: true, + })) + .context("Failed to send KeepAlive message")?; + continue; + } + } + + let send_size = end_pos.checked_sub(start_pos).unwrap().0 as usize; + let send_size = min(send_size, send_buf.len()); + + let send_buf = &mut send_buf[..send_size]; + + // read wal into buffer + let send_size = wal_reader.read(send_buf).await?; + let send_buf = &send_buf[..send_size]; + + // Write some data to the network socket. + pgb.write_message(&BeMessage::XLogData(XLogDataBody { + wal_start: start_pos.0, + wal_end: end_pos.0, + timestamp: get_current_timestamp(), + data: send_buf, + })) + .context("Failed to send XLogData")?; + + start_pos += send_size as u64; + trace!("sent WAL up to {}", start_pos); + } + + Ok(()) + }) + } +} + +const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); + +// Wait until we have commit_lsn > lsn or timeout expires. Returns latest commit_lsn. +async fn wait_for_lsn(rx: &mut Receiver, lsn: Lsn) -> Result> { + let commit_lsn: Lsn = *rx.borrow(); + if commit_lsn > lsn { + return Ok(Some(commit_lsn)); + } + + let res = timeout(POLL_STATE_TIMEOUT, async move { + let mut commit_lsn; + loop { + rx.changed().await?; + commit_lsn = *rx.borrow(); + if commit_lsn > lsn { break; } } - let wal_end = spg.timeline.get().get_end_of_wal(); - // Walproposer gets special handling: safekeeper must give proposer all - // local WAL till the end, whether committed or not (walproposer will - // hang otherwise). That's because walproposer runs the consensus and - // synchronizes safekeepers on the most advanced one. - // - // There is a small risk of this WAL getting concurrently garbaged if - // another compute rises which collects majority and starts fixing log - // on this safekeeper itself. That's ok as (old) proposer will never be - // able to commit such WAL. 
- let stop_pos: Option = if spg.appname == Some("wal_proposer_recovery".to_string()) { - Some(wal_end) - } else { - None - }; - info!("Start replication from {:?} till {:?}", start_pos, stop_pos); - // switch to copy - pgb.write_message(&BeMessage::CopyBothResponse)?; + Ok(commit_lsn) + }) + .await; - let mut end_pos = Lsn(0); - - let mut wal_reader = WalReader::new( - spg.conf.timeline_dir(&spg.timeline.get().zttid), - wal_seg_size, - start_pos, - ); - - // buffer for wal sending, limited by MAX_SEND_SIZE - let mut send_buf = vec![0u8; MAX_SEND_SIZE]; - - loop { - if let Some(stop_pos) = stop_pos { - if start_pos >= stop_pos { - break; /* recovery finished */ - } - end_pos = stop_pos; - } else { - /* Wait until we have some data to stream */ - let lsn = spg.timeline.get().wait_for_lsn(start_pos); - - if let Some(lsn) = lsn { - end_pos = lsn; - } else { - // TODO: also check once in a while whether we are walsender - // to right pageserver. - if spg.timeline.get().stop_walsender(replica_id)? { - // Shut down, timeline is suspended. - // TODO create proper error type for this - bail!("end streaming to {:?}", spg.appname); - } - - // timeout expired: request pageserver status - pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive { - sent_ptr: end_pos.0, - timestamp: get_current_timestamp(), - request_reply: true, - })) - .context("Failed to send KeepAlive message")?; - continue; - } - } - - let send_size = end_pos.checked_sub(start_pos).unwrap().0 as usize; - let send_size = min(send_size, send_buf.len()); - - let send_buf = &mut send_buf[..send_size]; - - // read wal into buffer - let send_size = wal_reader.read(send_buf)?; - let send_buf = &send_buf[..send_size]; - - // Write some data to the network socket. - pgb.write_message(&BeMessage::XLogData(XLogDataBody { - wal_start: start_pos.0, - wal_end: end_pos.0, - timestamp: get_current_timestamp(), - data: send_buf, - })) - .context("Failed to send XLogData")?; - - start_pos += send_size as u64; - trace!("sent WAL up to {}", start_pos); - } - Ok(()) + match res { + // success + Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)), + // error inside closure + Ok(Err(err)) => Err(err), + // timeout + Err(_) => Ok(None), } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 39f2593dbc..2e415a53d0 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -14,8 +14,8 @@ use std::cmp::{max, min}; use std::collections::HashMap; use std::fs::{self}; -use std::sync::{Arc, Condvar, Mutex, MutexGuard}; -use std::time::Duration; +use std::sync::{Arc, Mutex, MutexGuard}; + use tokio::sync::mpsc::Sender; use tracing::*; @@ -37,8 +37,6 @@ use crate::wal_storage; use crate::wal_storage::Storage as wal_storage_iface; use crate::SafeKeeperConf; -const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); - /// Replica status update + hot standby feedback #[derive(Debug, Clone, Copy)] pub struct ReplicaState { @@ -77,9 +75,6 @@ impl ReplicaState { struct SharedState { /// Safekeeper object sk: SafeKeeper, - /// For receiving-sending wal cooperation - /// quorum commit LSN we've notified walsenders about - notified_commit_lsn: Lsn, /// State of replicas replicas: Vec>, /// True when WAL backup launcher oversees the timeline, making sure WAL is @@ -112,7 +107,6 @@ impl SharedState { let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?; Ok(Self { - notified_commit_lsn: Lsn(0), sk, replicas: Vec::new(), wal_backup_active: false, @@ -131,7 +125,6 @@ impl SharedState { info!("timeline {} restored", 
zttid.timeline_id); Ok(Self { - notified_commit_lsn: Lsn(0), sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?, replicas: Vec::new(), wal_backup_active: false, @@ -271,8 +264,6 @@ pub struct Timeline { /// For breeding receivers. commit_lsn_watch_rx: watch::Receiver, mutex: Mutex, - /// conditional variable used to notify wal senders - cond: Condvar, } impl Timeline { @@ -289,7 +280,6 @@ impl Timeline { commit_lsn_watch_tx, commit_lsn_watch_rx, mutex: Mutex::new(shared_state), - cond: Condvar::new(), } } @@ -333,7 +323,7 @@ impl Timeline { let mut shared_state = self.mutex.lock().unwrap(); if shared_state.num_computes == 0 { let replica_state = shared_state.replicas[replica_id].unwrap(); - let stop = shared_state.notified_commit_lsn == Lsn(0) || // no data at all yet + let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet (replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); if stop { @@ -405,39 +395,6 @@ impl Timeline { }) } - /// Timed wait for an LSN to be committed. - /// - /// Returns the last committed LSN, which will be at least - /// as high as the LSN waited for, or None if timeout expired. - /// - pub fn wait_for_lsn(&self, lsn: Lsn) -> Option { - let mut shared_state = self.mutex.lock().unwrap(); - loop { - let commit_lsn = shared_state.notified_commit_lsn; - // This must be `>`, not `>=`. - if commit_lsn > lsn { - return Some(commit_lsn); - } - let result = self - .cond - .wait_timeout(shared_state, POLL_STATE_TIMEOUT) - .unwrap(); - if result.1.timed_out() { - return None; - } - shared_state = result.0 - } - } - - // Notify caught-up WAL senders about new WAL data received - // TODO: replace-unify it with commit_lsn_watch. - fn notify_wal_senders(&self, shared_state: &mut MutexGuard) { - if shared_state.notified_commit_lsn < shared_state.sk.inmem.commit_lsn { - shared_state.notified_commit_lsn = shared_state.sk.inmem.commit_lsn; - self.cond.notify_all(); - } - } - pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { self.commit_lsn_watch_rx.clone() } @@ -462,8 +419,6 @@ impl Timeline { } } - // Ping wal sender that new data might be available. 
- self.notify_wal_senders(&mut shared_state); commit_lsn = shared_state.sk.inmem.commit_lsn; } self.commit_lsn_watch_tx.send(commit_lsn)?; @@ -524,7 +479,6 @@ impl Timeline { return Ok(()); } shared_state.sk.record_safekeeper_info(sk_info)?; - self.notify_wal_senders(&mut shared_state); is_wal_backup_action_pending = shared_state.update_status(self.zttid); commit_lsn = shared_state.sk.inmem.commit_lsn; } diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 1d7c8de3b8..8fada70e8b 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -2,6 +2,7 @@ use anyhow::{Context, Result}; use etcd_broker::subscription_key::{ NodeKind, OperationKind, SkOperationKind, SubscriptionKey, SubscriptionKind, }; +use tokio::io::AsyncRead; use tokio::task::JoinHandle; use std::cmp::min; @@ -10,7 +11,9 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::Duration; -use postgres_ffi::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, PG_TLI}; +use postgres_ffi::xlog_utils::{ + XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, MAX_SEND_SIZE, PG_TLI, +}; use remote_storage::{GenericRemoteStorage, RemoteStorage}; use tokio::fs::File; use tokio::runtime::Builder; @@ -445,3 +448,49 @@ async fn backup_object(source_file: &Path, size: usize) -> Result<()> { Ok(()) } + +pub async fn read_object( + file_path: PathBuf, + offset: u64, +) -> (impl AsyncRead, JoinHandle>) { + let storage = REMOTE_STORAGE.get().expect("failed to get remote storage"); + + let (mut pipe_writer, pipe_reader) = tokio::io::duplex(MAX_SEND_SIZE); + + let copy_result = tokio::spawn(async move { + let res = match storage.as_ref().unwrap() { + GenericRemoteStorage::Local(local_storage) => { + let source = local_storage.remote_object_id(&file_path)?; + + info!( + "local download about to start from {} at offset {}", + source.display(), + offset + ); + local_storage + .download_byte_range(&source, offset, None, &mut pipe_writer) + .await + } + GenericRemoteStorage::S3(s3_storage) => { + let s3key = s3_storage.remote_object_id(&file_path)?; + + info!( + "S3 download about to start from {:?} at offset {}", + s3key, offset + ); + s3_storage + .download_byte_range(&s3key, offset, None, &mut pipe_writer) + .await + } + }; + + if let Err(e) = res { + error!("failed to download WAL segment from remote storage: {}", e); + Err(e) + } else { + Ok(()) + } + }); + + (pipe_reader, copy_result) +} diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index e3f1ce7333..5cfc96c84b 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -8,7 +8,9 @@ //! Note that last file has `.partial` suffix, that's different from postgres. use anyhow::{anyhow, bail, Context, Result}; -use std::io::{Read, Seek, SeekFrom}; +use std::io::{self, Seek, SeekFrom}; +use std::pin::Pin; +use tokio::io::AsyncRead; use lazy_static::lazy_static; use postgres_ffi::xlog_utils::{ @@ -26,6 +28,7 @@ use utils::{lsn::Lsn, zid::ZTenantTimelineId}; use crate::safekeeper::SafeKeeperState; +use crate::wal_backup::read_object; use crate::SafeKeeperConf; use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ}; @@ -33,6 +36,8 @@ use postgres_ffi::waldecoder::WalStreamDecoder; use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; +use tokio::io::{AsyncReadExt, AsyncSeekExt}; + lazy_static! { // The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). // i64 is faster than f64, so update to u64 when available. 
@@ -504,69 +509,125 @@ pub struct WalReader { timeline_dir: PathBuf, wal_seg_size: usize, pos: Lsn, - file: Option, + wal_segment: Option>>, + + enable_remote_read: bool, + // S3 will be used to read WAL if LSN is not available locally + local_start_lsn: Lsn, } impl WalReader { - pub fn new(timeline_dir: PathBuf, wal_seg_size: usize, pos: Lsn) -> Self { - Self { - timeline_dir, - wal_seg_size, - pos, - file: None, + pub fn new( + timeline_dir: PathBuf, + state: &SafeKeeperState, + start_pos: Lsn, + enable_remote_read: bool, + ) -> Result { + if start_pos < state.timeline_start_lsn { + bail!( + "Requested streaming from {}, which is before the start of the timeline {}", + start_pos, + state.timeline_start_lsn + ); } + + if state.server.wal_seg_size == 0 + || state.timeline_start_lsn == Lsn(0) + || state.local_start_lsn == Lsn(0) + { + bail!("state uninitialized, no data to read"); + } + + Ok(Self { + timeline_dir, + wal_seg_size: state.server.wal_seg_size as usize, + pos: start_pos, + wal_segment: None, + enable_remote_read, + local_start_lsn: state.local_start_lsn, + }) } - pub fn read(&mut self, buf: &mut [u8]) -> Result { - // Take the `File` from `wal_file`, or open a new file. - let mut file = match self.file.take() { - Some(file) => file, - None => { - // Open a new file. - let segno = self.pos.segment_number(self.wal_seg_size); - let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); - let wal_file_path = self.timeline_dir.join(wal_file_name); - Self::open_wal_file(&wal_file_path)? - } + pub async fn read(&mut self, buf: &mut [u8]) -> Result { + let mut wal_segment = match self.wal_segment.take() { + Some(reader) => reader, + None => self.open_segment().await?, }; - let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; - // How much to read and send in message? We cannot cross the WAL file // boundary, and we don't want send more than provided buffer. + let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; let send_size = min(buf.len(), self.wal_seg_size - xlogoff); // Read some data from the file. let buf = &mut buf[0..send_size]; - file.seek(SeekFrom::Start(xlogoff as u64)) - .and_then(|_| file.read_exact(buf)) - .context("Failed to read data from WAL file")?; - + let send_size = wal_segment.read_exact(buf).await?; self.pos += send_size as u64; - // Decide whether to reuse this file. If we don't set wal_file here - // a new file will be opened next time. + // Decide whether to reuse this file. If we don't set wal_segment here + // a new reader will be opened next time. if self.pos.segment_offset(self.wal_seg_size) != 0 { - self.file = Some(file); + self.wal_segment = Some(wal_segment); } Ok(send_size) } + /// Open WAL segment at the current position of the reader. 
+ async fn open_segment(&self) -> Result>> { + let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; + let segno = self.pos.segment_number(self.wal_seg_size); + let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); + let wal_file_path = self.timeline_dir.join(wal_file_name); + + // Try to open local file, if we may have WAL locally + if self.pos >= self.local_start_lsn { + let res = Self::open_wal_file(&wal_file_path).await; + match res { + Ok(mut file) => { + file.seek(SeekFrom::Start(xlogoff as u64)).await?; + return Ok(Box::pin(file)); + } + Err(e) => { + let is_not_found = e.chain().any(|e| { + if let Some(e) = e.downcast_ref::() { + e.kind() == io::ErrorKind::NotFound + } else { + false + } + }); + if !is_not_found { + return Err(e); + } + // NotFound is expected, fall through to remote read + } + }; + } + + // Try to open remote file, if remote reads are enabled + if self.enable_remote_read { + let (reader, _) = read_object(wal_file_path, xlogoff as u64).await; + return Ok(Box::pin(reader)); + } + + bail!("WAL segment is not found") + } + /// Helper function for opening a wal file. - fn open_wal_file(wal_file_path: &Path) -> Result { + async fn open_wal_file(wal_file_path: &Path) -> Result { // First try to open the .partial file. let mut partial_path = wal_file_path.to_owned(); partial_path.set_extension("partial"); - if let Ok(opened_file) = File::open(&partial_path) { + if let Ok(opened_file) = tokio::fs::File::open(&partial_path).await { return Ok(opened_file); } // If that failed, try it without the .partial extension. - File::open(&wal_file_path) + tokio::fs::File::open(&wal_file_path) + .await .with_context(|| format!("Failed to open WAL file {:?}", wal_file_path)) .map_err(|e| { - error!("{}", e); + warn!("{}", e); e }) } diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index e4970272d4..05827baf86 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -2,6 +2,7 @@ import pytest import random import time import os +import shutil import signal import subprocess import sys @@ -353,7 +354,7 @@ def test_broker(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize('auth_enabled', [False, True]) def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.num_safekeepers = 2 - # to advance remote_consistent_llsn + # to advance remote_consistent_lsn neon_env_builder.enable_local_fs_remote_storage() neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() @@ -437,6 +438,26 @@ def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end): time.sleep(0.5) +def wait_wal_trim(tenant_id, timeline_id, sk, target_size): + started_at = time.time() + http_cli = sk.http_client() + while True: + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + sk_wal_size = get_dir_size(os.path.join(sk.data_dir(), tenant_id, + timeline_id)) / 1024 / 1024 + log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size:.2f}MB status={tli_status}") + + if sk_wal_size <= target_size: + break + + elapsed = time.time() - started_at + if elapsed > 20: + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s for sk_id={sk.id} to trim WAL to {target_size:.2f}MB, current size is {sk_wal_size:.2f}MB" + ) + time.sleep(0.5) + + @pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs']) def test_wal_backup(neon_env_builder: NeonEnvBuilder, storage_type: str): neon_env_builder.num_safekeepers = 3 @@ 
-485,6 +506,81 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, storage_type: str): wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], '0/5000000') +@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs']) +def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, storage_type: str): + neon_env_builder.num_safekeepers = 3 + if storage_type == 'local_fs': + neon_env_builder.enable_local_fs_remote_storage() + elif storage_type == 'mock_s3': + neon_env_builder.enable_s3_mock_remote_storage('test_s3_wal_replay') + else: + raise RuntimeError(f'Unknown storage type: {storage_type}') + neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER + + env = neon_env_builder.init_start() + env.neon_cli.create_branch('test_s3_wal_replay') + + env.pageserver.stop() + pageserver_tenants_dir = os.path.join(env.repo_dir, 'tenants') + pageserver_fresh_copy = os.path.join(env.repo_dir, 'tenants_fresh') + log.info(f"Creating a copy of pageserver in a fresh state at {pageserver_fresh_copy}") + shutil.copytree(pageserver_tenants_dir, pageserver_fresh_copy) + env.pageserver.start() + + pg = env.postgres.create_start('test_s3_wal_replay') + + # learn neon timeline from compute + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + expected_sum = 0 + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("create table t(key int, value text)") + cur.execute("insert into t values (1, 'payload')") + expected_sum += 1 + + offloaded_seg_end = ['0/3000000'] + for seg_end in offloaded_seg_end: + # roughly fills two segments + cur.execute("insert into t select generate_series(1,500000), 'payload'") + expected_sum += 500000 * 500001 // 2 + + cur.execute("select sum(key) from t") + assert cur.fetchone()[0] == expected_sum + + for sk in env.safekeepers: + wait_segment_offload(tenant_id, timeline_id, sk, seg_end) + + # advance remote_consistent_lsn to trigger WAL trimming + # this LSN should be less than commit_lsn, so timeline will be active=true in safekeepers, to push etcd updates + env.safekeepers[0].http_client().record_safekeeper_info( + tenant_id, timeline_id, {'remote_consistent_lsn': offloaded_seg_end[-1]}) + + for sk in env.safekeepers: + # require WAL to be trimmed, so no more than one segment is left on disk + wait_wal_trim(tenant_id, timeline_id, sk, 16 * 1.5) + + # replace pageserver with a fresh copy + pg.stop_and_destroy() + env.pageserver.stop() + + log.info(f'Removing current pageserver state at {pageserver_tenants_dir}') + shutil.rmtree(pageserver_tenants_dir) + log.info(f'Copying fresh pageserver state from {pageserver_fresh_copy}') + shutil.move(pageserver_fresh_copy, pageserver_tenants_dir) + + # start everything, verify data + env.pageserver.start() + pg.create_start('test_s3_wal_replay') + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("select sum(key) from t") + assert cur.fetchone()[0] == expected_sum + + class ProposerPostgres(PgProtocol): """Object for running postgres without NeonEnv""" def __init__(self, From f862373ac0da301b906f6bbed9eea1c9f47bd0e4 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 17 Jun 2022 20:43:54 +0300 Subject: [PATCH 0427/1022] Fix WAL timeout in test_s3_wal_replay (#1953) --- test_runner/batch_others/test_wal_acceptor.py | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py 
index 05827baf86..2b93dd160a 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -562,6 +562,16 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, storage_type: str): # require WAL to be trimmed, so no more than one segment is left on disk wait_wal_trim(tenant_id, timeline_id, sk, 16 * 1.5) + cur.execute('SELECT pg_current_wal_flush_lsn()') + last_lsn = cur.fetchone()[0] + + pageserver_lsn = env.pageserver.http_client().timeline_detail( + uuid.UUID(tenant_id), uuid.UUID((timeline_id)))["local"]["last_record_lsn"] + lag = lsn_from_hex(last_lsn) - lsn_from_hex(pageserver_lsn) + log.info( + f'Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb' + ) + # replace pageserver with a fresh copy pg.stop_and_destroy() env.pageserver.stop() @@ -571,8 +581,33 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, storage_type: str): log.info(f'Copying fresh pageserver state from {pageserver_fresh_copy}') shutil.move(pageserver_fresh_copy, pageserver_tenants_dir) - # start everything, verify data + # start pageserver and wait for replay env.pageserver.start() + wait_lsn_timeout = 60 * 3 + started_at = time.time() + last_debug_print = 0.0 + + while True: + elapsed = time.time() - started_at + if elapsed > wait_lsn_timeout: + raise RuntimeError(f'Timed out waiting for WAL redo') + + pageserver_lsn = env.pageserver.http_client().timeline_detail( + uuid.UUID(tenant_id), uuid.UUID((timeline_id)))["local"]["last_record_lsn"] + lag = lsn_from_hex(last_lsn) - lsn_from_hex(pageserver_lsn) + + if time.time() > last_debug_print + 10 or lag <= 0: + last_debug_print = time.time() + log.info(f'Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb') + + if lag <= 0: + break + + time.sleep(1) + + log.info(f'WAL redo took {elapsed} s') + + # verify data pg.create_start('test_s3_wal_replay') with closing(pg.connect()) as conn: From 83c7e6ce527f26129d0e49e6d11593e109b06bea Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 20 Jun 2022 15:28:43 +0300 Subject: [PATCH 0428/1022] Bump vendor/postgres. This brings in the change to not use a shared memory in the WAL redo process, to avoid running out of sysv shmem segments in the page server. Also, removal of callmemaybe bits. --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 50b6edfbe0..7faa67c3ca 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 50b6edfbe0c3b171bd6d407652e1e31a4c97aa8b +Subproject commit 7faa67c3ca53fcce51ae8fedf6b1af3b8cefd3e2 From ec0064c4425b606389417bd7c64cf407b5556a1a Mon Sep 17 00:00:00 2001 From: "Joshua D. Drake" Date: Mon, 20 Jun 2022 07:05:10 -0700 Subject: [PATCH 0429/1022] Small README.md changes (#1957) * Update make instructions for release and debug build. Update PostgreSQL glossary to proper version (14) * Continued cleanup of build instructions including removal of redundancies --- README.md | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index de9070ac0f..f63c21459e 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Pageserver consists of: ## Running local installation -#### building on Linux +#### Installing dependencies on Linux 1. 
Install build dependencies and other useful packages * On Ubuntu or Debian this set of packages should be sufficient to build the code: @@ -49,14 +49,7 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh ``` -3. Build neon and patched postgres -```sh -git clone --recursive https://github.com/neondatabase/neon.git -cd neon -make -j`nproc` -``` - -#### building on OSX (12.3.1) +#### Installing dependencies on OSX (12.3.1) 1. Install XCode and dependencies ``` xcode-select --install @@ -76,10 +69,19 @@ brew install libpq brew link --force libpq ``` -4. Build neon and patched postgres -```sh +#### Building on Linux and OSX + +1. Build neon and patched postgres +``` +# Note: The path to the neon sources can not contain a space. + git clone --recursive https://github.com/neondatabase/neon.git cd neon + +# The preferred and default is to make a debug build. This will create a +# demonstrably slower build than a release build. If you want to use a release +# build, utilize "`BUILD_TYPE=release make -j`nproc``" + make -j`nproc` ``` @@ -209,7 +211,7 @@ Same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, wh To get more familiar with this aspect, refer to: - [Neon glossary](/docs/glossary.md) -- [PostgreSQL glossary](https://www.postgresql.org/docs/13/glossary.html) +- [PostgreSQL glossary](https://www.postgresql.org/docs/14/glossary.html) - Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres)) ## Join the development From 37465dafe3c34b586a88ba9ea40ca3de98994780 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Mon, 20 Jun 2022 11:40:55 -0400 Subject: [PATCH 0430/1022] Add wal backpressure tests (#1919) Resolves #1889. This PR adds new tests to measure the WAL backpressure's performance under different workloads. 
## Changes - add new performance tests in `test_wal_backpressure.py` - allow safekeeper's fsync to be configurable when running tests --- test_runner/fixtures/neon_fixtures.py | 5 +- .../performance/test_wal_backpressure.py | 264 ++++++++++++++++++ 2 files changed, 268 insertions(+), 1 deletion(-) create mode 100644 test_runner/performance/test_wal_backpressure.py diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fcefaad8fa..51afd3a03d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -500,6 +500,8 @@ class NeonEnvBuilder: num_safekeepers: int = 1, # Use non-standard SK ids to check for various parsing bugs safekeepers_id_start: int = 0, + # fsync is disabled by default to make the tests go faster + safekeepers_enable_fsync: bool = False, auth_enabled: bool = False, rust_log_override: Optional[str] = None, default_branch_name=DEFAULT_BRANCH_NAME): @@ -513,6 +515,7 @@ class NeonEnvBuilder: self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers self.safekeepers_id_start = safekeepers_id_start + self.safekeepers_enable_fsync = safekeepers_enable_fsync self.auth_enabled = auth_enabled self.default_branch_name = default_branch_name self.env: Optional[NeonEnv] = None @@ -666,7 +669,7 @@ class NeonEnv: id = {id} pg_port = {port.pg} http_port = {port.http} - sync = false # Disable fsyncs to make the tests go faster""") + sync = {'true' if config.safekeepers_enable_fsync else 'false'}""") if config.auth_enabled: toml += textwrap.dedent(f""" auth_enabled = true diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py new file mode 100644 index 0000000000..873d1132a7 --- /dev/null +++ b/test_runner/performance/test_wal_backpressure.py @@ -0,0 +1,264 @@ +import statistics +import threading +import time +import timeit +from typing import Callable + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare +from fixtures.log_helper import log +from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin +from fixtures.utils import lsn_from_hex + +from performance.test_perf_pgbench import (get_durations_matrix, get_scales_matrix) + + +@pytest.fixture(params=["vanilla", "neon_off", "neon_on"]) +# This fixture constructs multiple `PgCompare` interfaces using a builder pattern. +# The builder parameters are encoded in the fixture's param. +# For example, to build a `NeonCompare` interface, the corresponding fixture's param should have +# a format of `neon_{safekeepers_enable_fsync}`. +# Note that, here "_" is used to separate builder parameters. 
+def pg_compare(request) -> PgCompare: + x = request.param.split("_") + + if x[0] == "vanilla": + # `VanillaCompare` interface + fixture = request.getfixturevalue("vanilla_compare") + assert isinstance(fixture, VanillaCompare) + + return fixture + else: + assert len(x) == 2, f"request param ({request.param}) should have a format of \ + `neon_{{safekeepers_enable_fsync}}`" + + # `NeonCompare` interface + neon_env_builder = request.getfixturevalue("neon_env_builder") + assert isinstance(neon_env_builder, NeonEnvBuilder) + + zenbenchmark = request.getfixturevalue("zenbenchmark") + assert isinstance(zenbenchmark, NeonBenchmarker) + + pg_bin = request.getfixturevalue("pg_bin") + assert isinstance(pg_bin, PgBin) + + neon_env_builder.safekeepers_enable_fsync = x[1] == "on" + + env = neon_env_builder.init_start() + env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) + + branch_name = request.node.name + return NeonCompare(zenbenchmark, env, pg_bin, branch_name) + + +def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_iters: int): + """Start an intensive write workload across multiple tables. + + ## Single table workload: + At each step, insert new `new_rows_each_update` rows. + The variable `new_rows_each_update` is equal to `scale * 100_000`. + The number of steps is determined by `num_iters` variable.""" + new_rows_each_update = scale * 100_000 + + def start_single_table_workload(table_id: int): + for _ in range(num_iters): + with env.pg.connect().cursor() as cur: + cur.execute( + f"INSERT INTO t{table_id} SELECT FROM generate_series(1,{new_rows_each_update})" + ) + + with env.record_duration("run_duration"): + threads = [ + threading.Thread(target=start_single_table_workload, args=(i, )) + for i in range(n_tables) + ] + + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + +@pytest.mark.parametrize("n_tables", [5]) +@pytest.mark.parametrize("scale", get_scales_matrix(5)) +@pytest.mark.parametrize("num_iters", [10]) +def test_heavy_write_workload(pg_compare: PgCompare, n_tables: int, scale: int, num_iters: int): + env = pg_compare + + # Initializes test tables + with env.pg.connect().cursor() as cur: + for i in range(n_tables): + cur.execute( + f"CREATE TABLE t{i}(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')" + ) + cur.execute(f"INSERT INTO t{i} (key) VALUES (0)") + + workload_thread = threading.Thread(target=start_heavy_write_workload, + args=(env, n_tables, scale, num_iters)) + workload_thread.start() + + record_thread = threading.Thread(target=record_lsn_write_lag, + args=(env, lambda: workload_thread.is_alive())) + record_thread.start() + + record_read_latency(env, lambda: workload_thread.is_alive(), "SELECT * from t0 where key = 0") + workload_thread.join() + record_thread.join() + + +def start_pgbench_simple_update_workload(env: PgCompare, duration: int): + with env.record_duration("run_duration"): + env.pg_bin.run_capture([ + 'pgbench', + '-j10', + '-c10', + '-N', + f'-T{duration}', + '-Mprepared', + env.pg.connstr(options="-csynchronous_commit=off") + ]) + env.flush() + + +@pytest.mark.parametrize("scale", get_scales_matrix(100)) +@pytest.mark.parametrize("duration", get_durations_matrix()) +def test_pgbench_simple_update_workload(pg_compare: PgCompare, scale: int, duration: int): + env = pg_compare + + # initialize pgbench tables + env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.flush() + + workload_thread = 
threading.Thread(target=start_pgbench_simple_update_workload, + args=(env, duration)) + workload_thread.start() + + record_thread = threading.Thread(target=record_lsn_write_lag, + args=(env, lambda: workload_thread.is_alive())) + record_thread.start() + + record_read_latency(env, + lambda: workload_thread.is_alive(), + "SELECT * from pgbench_accounts where aid = 1") + workload_thread.join() + record_thread.join() + + +def start_pgbench_intensive_initialization(env: PgCompare, scale: int): + with env.record_duration("run_duration"): + # Needs to increase the statement timeout (default: 120s) because the + # initialization step can be slow with a large scale. + env.pg_bin.run_capture([ + 'pgbench', + f'-s{scale}', + '-i', + '-Idtg', + env.pg.connstr(options='-cstatement_timeout=300s') + ]) + + +@pytest.mark.parametrize("scale", get_scales_matrix(1000)) +def test_pgbench_intensive_init_workload(pg_compare: PgCompare, scale: int): + env = pg_compare + with env.pg.connect().cursor() as cur: + cur.execute("CREATE TABLE foo as select generate_series(1,100000)") + + workload_thread = threading.Thread(target=start_pgbench_intensive_initialization, + args=(env, scale)) + workload_thread.start() + + record_thread = threading.Thread(target=record_lsn_write_lag, + args=(env, lambda: workload_thread.is_alive())) + record_thread.start() + + record_read_latency(env, lambda: workload_thread.is_alive(), "SELECT count(*) from foo") + workload_thread.join() + record_thread.join() + + +def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_interval: float = 1.0): + if not isinstance(env, NeonCompare): + return + + lsn_write_lags = [] + last_received_lsn = 0 + last_pg_flush_lsn = 0 + + with env.pg.connect().cursor() as cur: + cur.execute("CREATE EXTENSION neon") + + while run_cond(): + cur.execute(''' + select pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn), + pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn)), + pg_current_wal_flush_lsn(), + received_lsn + from backpressure_lsns(); + ''') + + res = cur.fetchone() + lsn_write_lags.append(res[0]) + + curr_received_lsn = lsn_from_hex(res[3]) + lsn_process_speed = (curr_received_lsn - last_received_lsn) / (1024**2) + last_received_lsn = curr_received_lsn + + curr_pg_flush_lsn = lsn_from_hex(res[2]) + lsn_produce_speed = (curr_pg_flush_lsn - last_pg_flush_lsn) / (1024**2) + last_pg_flush_lsn = curr_pg_flush_lsn + + log.info( + f"received_lsn_lag={res[1]}, pg_flush_lsn={res[2]}, received_lsn={res[3]}, lsn_process_speed={lsn_process_speed:.2f}MB/s, lsn_produce_speed={lsn_produce_speed:.2f}MB/s" + ) + + time.sleep(pool_interval) + + env.zenbenchmark.record("lsn_write_lag_max", + float(max(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER) + env.zenbenchmark.record("lsn_write_lag_avg", + float(statistics.mean(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER) + env.zenbenchmark.record("lsn_write_lag_stdev", + float(statistics.stdev(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER) + + +def record_read_latency(env: PgCompare, + run_cond: Callable[[], bool], + read_query: str, + read_interval: float = 1.0): + read_latencies = [] + + with env.pg.connect().cursor() as cur: + while run_cond(): + try: + t1 = timeit.default_timer() + cur.execute(read_query) + t2 = timeit.default_timer() + + log.info( + f"Executed read query {read_query}, got {cur.fetchall()}, read time {t2-t1:.2f}s" + ) + read_latencies.append(t2 - t1) + except Exception as err: + log.error(f"Got 
error when executing the read query: {err}") + + time.sleep(read_interval) + + env.zenbenchmark.record("read_latency_max", + max(read_latencies), + 's', + MetricReport.LOWER_IS_BETTER) + env.zenbenchmark.record("read_latency_avg", + statistics.mean(read_latencies), + 's', + MetricReport.LOWER_IS_BETTER) + env.zenbenchmark.record("read_latency_stdev", + statistics.stdev(read_latencies), + 's', + MetricReport.LOWER_IS_BETTER) From 6c4d6a218386b7e63890cbe06ad7fabeaff3f801 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 21 Jun 2022 02:02:24 +0300 Subject: [PATCH 0431/1022] Remove timeline_start_lsn check temporary. (#1964) --- safekeeper/src/send_wal.rs | 5 ++--- safekeeper/src/wal_storage.rs | 6 ++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index a6b9de2050..7439d6a8f6 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -199,9 +199,8 @@ impl ReplicationConn { runtime.block_on(async move { let (_, persisted_state) = spg.timeline.get().get_state(); - if persisted_state.server.wal_seg_size == 0 - || persisted_state.timeline_start_lsn == Lsn(0) - { + // add persisted_state.timeline_start_lsn == Lsn(0) check + if persisted_state.server.wal_seg_size == 0 { bail!("Cannot start replication before connecting to walproposer"); } diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 5cfc96c84b..5cb7a8c758 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -531,10 +531,8 @@ impl WalReader { ); } - if state.server.wal_seg_size == 0 - || state.timeline_start_lsn == Lsn(0) - || state.local_start_lsn == Lsn(0) - { + // TODO: add state.timeline_start_lsn == Lsn(0) check + if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) { bail!("state uninitialized, no data to read"); } From 1ca28e6f3cc87840a28afc2a3cd4bcfb064de1c3 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 21 Jun 2022 11:04:10 -0400 Subject: [PATCH 0432/1022] Import basebackup into pageserver (#1925) Allow importing basebackup taken from vanilla postgres or another pageserver via psql copy in protocol. 
--- Cargo.lock | 1 + control_plane/src/storage.rs | 53 ++- neon_local/src/main.rs | 53 ++- pageserver/Cargo.toml | 1 + pageserver/src/basebackup.rs | 29 +- pageserver/src/import_datadir.rs | 493 +++++++++++++++--------- pageserver/src/layered_repository.rs | 16 +- pageserver/src/page_service.rs | 231 ++++++++++- pageserver/src/pgdatadir_mapping.rs | 10 +- test_runner/batch_others/test_import.py | 193 ++++++++++ test_runner/fixtures/neon_fixtures.py | 4 +- 11 files changed, 875 insertions(+), 209 deletions(-) create mode 100644 test_runner/batch_others/test_import.py diff --git a/Cargo.lock b/Cargo.lock index c615766eb8..dca525941d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1842,6 +1842,7 @@ dependencies = [ "tracing", "url", "utils", + "walkdir", "workspace_hack", ] diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index a8f21406fb..f1eaa99904 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; -use std::io::Write; +use std::fs::File; +use std::io::{BufReader, Write}; use std::net::TcpStream; use std::num::NonZeroU64; use std::path::PathBuf; @@ -527,4 +528,54 @@ impl PageServerNode { Ok(timeline_info_response) } + + /// Import a basebackup prepared using either: + /// a) `pg_basebackup -F tar`, or + /// b) The `fullbackup` pageserver endpoint + /// + /// # Arguments + /// * `tenant_id` - tenant to import into. Created if not exists + /// * `timeline_id` - id to assign to imported timeline + /// * `base` - (start lsn of basebackup, path to `base.tar` file) + /// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`) + pub fn timeline_import( + &self, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + base: (Lsn, PathBuf), + pg_wal: Option<(Lsn, PathBuf)>, + ) -> anyhow::Result<()> { + let mut client = self.pg_connection_config.connect(NoTls).unwrap(); + + // Init base reader + let (start_lsn, base_tarfile_path) = base; + let base_tarfile = File::open(base_tarfile_path)?; + let mut base_reader = BufReader::new(base_tarfile); + + // Init wal reader if necessary + let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal { + let wal_tarfile = File::open(wal_tarfile_path)?; + let wal_reader = BufReader::new(wal_tarfile); + (end_lsn, Some(wal_reader)) + } else { + (start_lsn, None) + }; + + // Import base + let import_cmd = + format!("import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn}"); + let mut writer = client.copy_in(&import_cmd)?; + io::copy(&mut base_reader, &mut writer)?; + writer.finish()?; + + // Import wal if necessary + if let Some(mut wal_reader) = wal_reader { + let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"); + let mut writer = client.copy_in(&import_cmd)?; + io::copy(&mut wal_reader, &mut writer)?; + writer.finish()?; + } + + Ok(()) + } } diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index 8d39fe5d0d..35e2d9c9e2 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -14,7 +14,7 @@ use safekeeper::defaults::{ DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; use std::collections::{BTreeSet, HashMap}; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::process::exit; use std::str::FromStr; use utils::{ @@ -159,6 +159,20 @@ fn main() -> Result<()> { .about("Create a new blank timeline") .arg(tenant_id_arg.clone()) .arg(branch_name_arg.clone())) + .subcommand(App::new("import") + .about("Import timeline from basebackup directory") + 
.arg(tenant_id_arg.clone()) + .arg(timeline_id_arg.clone()) + .arg(Arg::new("node-name").long("node-name").takes_value(true) + .help("Name to assign to the imported timeline")) + .arg(Arg::new("base-tarfile").long("base-tarfile").takes_value(true) + .help("Basebackup tarfile to import")) + .arg(Arg::new("base-lsn").long("base-lsn").takes_value(true) + .help("Lsn the basebackup starts at")) + .arg(Arg::new("wal-tarfile").long("wal-tarfile").takes_value(true) + .help("Wal to add after base")) + .arg(Arg::new("end-lsn").long("end-lsn").takes_value(true) + .help("Lsn the basebackup ends at"))) ).subcommand( App::new("tenant") .setting(AppSettings::ArgRequiredElseHelp) @@ -613,6 +627,43 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - timeline.timeline_id, last_record_lsn, tenant_id, ); } + Some(("import", import_match)) => { + let tenant_id = get_tenant_id(import_match, env)?; + let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided"); + let name = import_match + .value_of("node-name") + .ok_or_else(|| anyhow!("No node name provided"))?; + + // Parse base inputs + let base_tarfile = import_match + .value_of("base-tarfile") + .map(|s| PathBuf::from_str(s).unwrap()) + .ok_or_else(|| anyhow!("No base-tarfile provided"))?; + let base_lsn = Lsn::from_str( + import_match + .value_of("base-lsn") + .ok_or_else(|| anyhow!("No base-lsn provided"))?, + )?; + let base = (base_lsn, base_tarfile); + + // Parse pg_wal inputs + let wal_tarfile = import_match + .value_of("wal-tarfile") + .map(|s| PathBuf::from_str(s).unwrap()); + let end_lsn = import_match + .value_of("end-lsn") + .map(|s| Lsn::from_str(s).unwrap()); + // TODO validate both or none are provided + let pg_wal = end_lsn.zip(wal_tarfile); + + let mut cplane = ComputeControlPlane::load(env.clone())?; + println!("Importing timeline into pageserver ..."); + pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal)?; + println!("Creating node for imported timeline ..."); + env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; + cplane.new_node(tenant_id, name, timeline_id, None, None)?; + println!("Done"); + } Some(("branch", branch_match)) => { let tenant_id = get_tenant_id(branch_match, env)?; let new_branch_name = branch_match diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 298addb838..b7d97a67c0 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -61,6 +61,7 @@ utils = { path = "../libs/utils" } remote_storage = { path = "../libs/remote_storage" } workspace_hack = { version = "0.1", path = "../workspace_hack" } close_fds = "0.3.2" +walkdir = "2.3.2" [dev-dependencies] hex-literal = "0.3" diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 44a6442522..ed300b3360 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -112,6 +112,8 @@ where } pub fn send_tarball(mut self) -> anyhow::Result<()> { + // TODO include checksum + // Create pgdata subdirs structure for dir in pg_constants::PGDATA_SUBDIRS.iter() { let header = new_tar_header_dir(*dir)?; @@ -355,24 +357,21 @@ where pg_control.checkPointCopy = checkpoint; pg_control.state = pg_constants::DB_SHUTDOWNED; - // Postgres doesn't recognize the zenith.signal file and doesn't need it. 
- if !self.full_backup { - // add zenith.signal file - let mut zenith_signal = String::new(); - if self.prev_record_lsn == Lsn(0) { - if self.lsn == self.timeline.tline.get_ancestor_lsn() { - write!(zenith_signal, "PREV LSN: none")?; - } else { - write!(zenith_signal, "PREV LSN: invalid")?; - } + // add zenith.signal file + let mut zenith_signal = String::new(); + if self.prev_record_lsn == Lsn(0) { + if self.lsn == self.timeline.tline.get_ancestor_lsn() { + write!(zenith_signal, "PREV LSN: none")?; } else { - write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?; + write!(zenith_signal, "PREV LSN: invalid")?; } - self.ar.append( - &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, - zenith_signal.as_bytes(), - )?; + } else { + write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?; } + self.ar.append( + &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, + zenith_signal.as_bytes(), + )?; //send pg_control let pg_control_bytes = pg_control.encode(); diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 703ee8f1b1..3ede949885 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -2,7 +2,6 @@ //! Import data and WAL from a PostgreSQL data directory and WAL segments into //! a zenith Timeline. //! -use std::fs; use std::fs::File; use std::io::{Read, Seek, SeekFrom}; use std::path::{Path, PathBuf}; @@ -10,16 +9,18 @@ use std::path::{Path, PathBuf}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use tracing::*; +use walkdir::WalkDir; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::repository::Repository; +use crate::repository::Timeline; use crate::walingest::WalIngest; use postgres_ffi::relfile_utils::*; use postgres_ffi::waldecoder::*; use postgres_ffi::xlog_utils::*; +use postgres_ffi::Oid; use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; -use postgres_ffi::{Oid, TransactionId}; use utils::lsn::Lsn; /// @@ -35,100 +36,29 @@ pub fn import_timeline_from_postgres_datadir( ) -> Result<()> { let mut pg_control: Option = None; + // TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn) + // Then fishing out pg_control would be unnecessary let mut modification = tline.begin_modification(lsn); modification.init_empty()?; - // Scan 'global' - let mut relfiles: Vec = Vec::new(); - for direntry in fs::read_dir(path.join("global"))? { - let direntry = direntry?; - match direntry.file_name().to_str() { - None => continue, + // Import all but pg_wal + let all_but_wal = WalkDir::new(path) + .into_iter() + .filter_entry(|entry| !entry.path().ends_with("pg_wal")); + for entry in all_but_wal { + let entry = entry?; + let metadata = entry.metadata().expect("error getting dir entry metadata"); + if metadata.is_file() { + let absolute_path = entry.path(); + let relative_path = absolute_path.strip_prefix(path)?; - Some("pg_control") => { - pg_control = Some(import_control_file(&mut modification, &direntry.path())?); - } - Some("pg_filenode.map") => { - import_relmap_file( - &mut modification, - pg_constants::GLOBALTABLESPACE_OID, - 0, - &direntry.path(), - )?; - } - - // Load any relation files into the page server (but only after the other files) - _ => relfiles.push(direntry.path()), - } - } - for relfile in relfiles { - import_relfile( - &mut modification, - &relfile, - pg_constants::GLOBALTABLESPACE_OID, - 0, - )?; - } - - // Scan 'base'. It contains database dirs, the database OID is the filename. 
- // E.g. 'base/12345', where 12345 is the database OID. - for direntry in fs::read_dir(path.join("base"))? { - let direntry = direntry?; - - //skip all temporary files - if direntry.file_name().to_string_lossy() == "pgsql_tmp" { - continue; - } - - let dboid = direntry.file_name().to_string_lossy().parse::()?; - - let mut relfiles: Vec = Vec::new(); - for direntry in fs::read_dir(direntry.path())? { - let direntry = direntry?; - match direntry.file_name().to_str() { - None => continue, - - Some("PG_VERSION") => { - //modification.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?; - } - Some("pg_filenode.map") => import_relmap_file( - &mut modification, - pg_constants::DEFAULTTABLESPACE_OID, - dboid, - &direntry.path(), - )?, - - // Load any relation files into the page server - _ => relfiles.push(direntry.path()), + let file = File::open(absolute_path)?; + let len = metadata.len() as usize; + if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? { + pg_control = Some(control_file); } } - for relfile in relfiles { - import_relfile( - &mut modification, - &relfile, - pg_constants::DEFAULTTABLESPACE_OID, - dboid, - )?; - } } - for entry in fs::read_dir(path.join("pg_xact"))? { - let entry = entry?; - import_slru_file(&mut modification, SlruKind::Clog, &entry.path())?; - } - for entry in fs::read_dir(path.join("pg_multixact").join("members"))? { - let entry = entry?; - import_slru_file(&mut modification, SlruKind::MultiXactMembers, &entry.path())?; - } - for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? { - let entry = entry?; - import_slru_file(&mut modification, SlruKind::MultiXactOffsets, &entry.path())?; - } - for entry in fs::read_dir(path.join("pg_twophase"))? { - let entry = entry?; - let xid = u32::from_str_radix(&entry.path().to_string_lossy(), 16)?; - import_twophase_file(&mut modification, xid, &entry.path())?; - } - // TODO: Scan pg_tblspc // We're done importing all the data files. modification.commit()?; @@ -158,31 +88,30 @@ pub fn import_timeline_from_postgres_datadir( } // subroutine of import_timeline_from_postgres_datadir(), to load one relation file. -fn import_relfile( +fn import_rel( modification: &mut DatadirModification, path: &Path, spcoid: Oid, dboid: Oid, + mut reader: Reader, + len: usize, ) -> anyhow::Result<()> { // Does it look like a relation file? 
trace!("importing rel file {}", path.display()); - let (relnode, forknum, segno) = parse_relfilename(&path.file_name().unwrap().to_string_lossy()) - .map_err(|e| { - warn!("unrecognized file in postgres datadir: {:?} ({})", path, e); - e - })?; + let filename = &path + .file_name() + .expect("missing rel filename") + .to_string_lossy(); + let (relnode, forknum, segno) = parse_relfilename(filename).map_err(|e| { + warn!("unrecognized file in postgres datadir: {:?} ({})", path, e); + e + })?; - let mut file = File::open(path)?; let mut buf: [u8; 8192] = [0u8; 8192]; - let len = file.metadata().unwrap().len(); - ensure!(len % pg_constants::BLCKSZ as u64 == 0); - let nblocks = len / pg_constants::BLCKSZ as u64; - - if segno != 0 { - todo!(); - } + ensure!(len % pg_constants::BLCKSZ as usize == 0); + let nblocks = len / pg_constants::BLCKSZ as usize; let rel = RelTag { spcnode: spcoid, @@ -190,11 +119,22 @@ fn import_relfile( relnode, forknum, }; - modification.put_rel_creation(rel, nblocks as u32)?; let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32); + + // Call put_rel_creation for every segment of the relation, + // because there is no guarantee about the order in which we are processing segments. + // ignore "relation already exists" error + if let Err(e) = modification.put_rel_creation(rel, nblocks as u32) { + if e.to_string().contains("already exists") { + debug!("relation {} already exists. we must be extending it", rel); + } else { + return Err(e); + } + } + loop { - let r = file.read_exact(&mut buf); + let r = reader.read_exact(&mut buf); match r { Ok(_) => { modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; @@ -204,7 +144,9 @@ fn import_relfile( Err(err) => match err.kind() { std::io::ErrorKind::UnexpectedEof => { // reached EOF. That's expected. - ensure!(blknum == nblocks as u32, "unexpected EOF"); + let relative_blknum = + blknum - segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32); + ensure!(relative_blknum == nblocks as u32, "unexpected EOF"); break; } _ => { @@ -215,96 +157,43 @@ fn import_relfile( blknum += 1; } + // Update relation size + // + // If we process rel segments out of order, + // put_rel_extend will skip the update. + modification.put_rel_extend(rel, blknum)?; + Ok(()) } -/// Import a relmapper (pg_filenode.map) file into the repository -fn import_relmap_file( - modification: &mut DatadirModification, - spcnode: Oid, - dbnode: Oid, - path: &Path, -) -> Result<()> { - let mut file = File::open(path)?; - let mut buffer = Vec::new(); - // read the whole file - file.read_to_end(&mut buffer)?; - - trace!("importing relmap file {}", path.display()); - - modification.put_relmap_file(spcnode, dbnode, Bytes::copy_from_slice(&buffer[..]))?; - Ok(()) -} - -/// Import a twophase state file (pg_twophase/) into the repository -fn import_twophase_file( - modification: &mut DatadirModification, - xid: TransactionId, - path: &Path, -) -> Result<()> { - let mut file = File::open(path)?; - let mut buffer = Vec::new(); - // read the whole file - file.read_to_end(&mut buffer)?; - - trace!("importing non-rel file {}", path.display()); - - modification.put_twophase_file(xid, Bytes::copy_from_slice(&buffer[..]))?; - Ok(()) -} - -/// -/// Import pg_control file into the repository. -/// -/// The control file is imported as is, but we also extract the checkpoint record -/// from it and store it separated. 
-fn import_control_file( - modification: &mut DatadirModification, - path: &Path, -) -> Result { - let mut file = File::open(path)?; - let mut buffer = Vec::new(); - // read the whole file - file.read_to_end(&mut buffer)?; - - trace!("importing control file {}", path.display()); - - // Import it as ControlFile - modification.put_control_file(Bytes::copy_from_slice(&buffer[..]))?; - - // Extract the checkpoint record and import it separately. - let pg_control = ControlFileData::decode(&buffer)?; - let checkpoint_bytes = pg_control.checkPointCopy.encode()?; - modification.put_checkpoint(checkpoint_bytes)?; - - Ok(pg_control) -} - -/// /// Import an SLRU segment file /// -fn import_slru_file( +fn import_slru( modification: &mut DatadirModification, slru: SlruKind, path: &Path, + mut reader: Reader, + len: usize, ) -> Result<()> { trace!("importing slru file {}", path.display()); - let mut file = File::open(path)?; let mut buf: [u8; 8192] = [0u8; 8192]; - let segno = u32::from_str_radix(&path.file_name().unwrap().to_string_lossy(), 16)?; + let filename = &path + .file_name() + .expect("missing slru filename") + .to_string_lossy(); + let segno = u32::from_str_radix(filename, 16)?; - let len = file.metadata().unwrap().len(); - ensure!(len % pg_constants::BLCKSZ as u64 == 0); // we assume SLRU block size is the same as BLCKSZ - let nblocks = len / pg_constants::BLCKSZ as u64; + ensure!(len % pg_constants::BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ + let nblocks = len / pg_constants::BLCKSZ as usize; - ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as u64); + ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize); modification.put_slru_segment_creation(slru, segno, nblocks as u32)?; let mut rpageno = 0; loop { - let r = file.read_exact(&mut buf); + let r = reader.read_exact(&mut buf); match r { Ok(_) => { modification.put_slru_page_image( @@ -396,10 +285,258 @@ fn import_wal( } if last_lsn != startpoint { - debug!("reached end of WAL at {}", last_lsn); + info!("reached end of WAL at {}", last_lsn); } else { info!("no WAL to import at {}", last_lsn); } Ok(()) } + +pub fn import_basebackup_from_tar( + tline: &mut DatadirTimeline, + reader: Reader, + base_lsn: Lsn, +) -> Result<()> { + info!("importing base at {}", base_lsn); + let mut modification = tline.begin_modification(base_lsn); + modification.init_empty()?; + + let mut pg_control: Option = None; + + // Import base + for base_tar_entry in tar::Archive::new(reader).entries()? { + let entry = base_tar_entry?; + let header = entry.header(); + let len = header.entry_size()? as usize; + let file_path = header.path()?.into_owned(); + + match header.entry_type() { + tar::EntryType::Regular => { + if let Some(res) = import_file(&mut modification, file_path.as_ref(), entry, len)? { + // We found the pg_control file. + pg_control = Some(res); + } + } + tar::EntryType::Directory => { + debug!("directory {:?}", file_path); + } + _ => { + panic!("tar::EntryType::?? 
{}", file_path.display()); + } + } + } + + // sanity check: ensure that pg_control is loaded + let _pg_control = pg_control.context("pg_control file not found")?; + + modification.commit()?; + Ok(()) +} + +pub fn import_wal_from_tar( + tline: &mut DatadirTimeline, + reader: Reader, + start_lsn: Lsn, + end_lsn: Lsn, +) -> Result<()> { + // Set up walingest mutable state + let mut waldecoder = WalStreamDecoder::new(start_lsn); + let mut segno = start_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE); + let mut offset = start_lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE); + let mut last_lsn = start_lsn; + let mut walingest = WalIngest::new(tline, start_lsn)?; + + // Ingest wal until end_lsn + info!("importing wal until {}", end_lsn); + let mut pg_wal_tar = tar::Archive::new(reader); + let mut pg_wal_entries_iter = pg_wal_tar.entries()?; + while last_lsn <= end_lsn { + let bytes = { + let entry = pg_wal_entries_iter.next().expect("expected more wal")?; + let header = entry.header(); + let file_path = header.path()?.into_owned(); + + match header.entry_type() { + tar::EntryType::Regular => { + // FIXME: assume postgresql tli 1 for now + let expected_filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE); + let file_name = file_path + .file_name() + .expect("missing wal filename") + .to_string_lossy(); + ensure!(expected_filename == file_name); + + debug!("processing wal file {:?}", file_path); + read_all_bytes(entry)? + } + tar::EntryType::Directory => { + debug!("directory {:?}", file_path); + continue; + } + _ => { + panic!("tar::EntryType::?? {}", file_path.display()); + } + } + }; + + waldecoder.feed_bytes(&bytes[offset..]); + + while last_lsn <= end_lsn { + if let Some((lsn, recdata)) = waldecoder.poll_decode()? { + walingest.ingest_record(tline, recdata, lsn)?; + last_lsn = lsn; + + debug!("imported record at {} (end {})", lsn, end_lsn); + } + } + + debug!("imported records up to {}", last_lsn); + segno += 1; + offset = 0; + } + + if last_lsn != start_lsn { + info!("reached end of WAL at {}", last_lsn); + } else { + info!("there was no WAL to import at {}", last_lsn); + } + + // Log any extra unused files + for e in &mut pg_wal_entries_iter { + let entry = e?; + let header = entry.header(); + let file_path = header.path()?.into_owned(); + info!("skipping {:?}", file_path); + } + + Ok(()) +} + +pub fn import_file( + modification: &mut DatadirModification, + file_path: &Path, + reader: Reader, + len: usize, +) -> Result> { + debug!("looking at {:?}", file_path); + + if file_path.starts_with("global") { + let spcnode = pg_constants::GLOBALTABLESPACE_OID; + let dbnode = 0; + + match file_path + .file_name() + .expect("missing filename") + .to_string_lossy() + .as_ref() + { + "pg_control" => { + let bytes = read_all_bytes(reader)?; + + // Extract the checkpoint record and import it separately. 
+ let pg_control = ControlFileData::decode(&bytes[..])?; + let checkpoint_bytes = pg_control.checkPointCopy.encode()?; + modification.put_checkpoint(checkpoint_bytes)?; + debug!("imported control file"); + + // Import it as ControlFile + modification.put_control_file(bytes)?; + return Ok(Some(pg_control)); + } + "pg_filenode.map" => { + let bytes = read_all_bytes(reader)?; + modification.put_relmap_file(spcnode, dbnode, bytes)?; + debug!("imported relmap file") + } + "PG_VERSION" => { + debug!("ignored"); + } + _ => { + import_rel(modification, file_path, spcnode, dbnode, reader, len)?; + debug!("imported rel creation"); + } + } + } else if file_path.starts_with("base") { + let spcnode = pg_constants::DEFAULTTABLESPACE_OID; + let dbnode: u32 = file_path + .iter() + .nth(1) + .expect("invalid file path, expected dbnode") + .to_string_lossy() + .parse()?; + + match file_path + .file_name() + .expect("missing base filename") + .to_string_lossy() + .as_ref() + { + "pg_filenode.map" => { + let bytes = read_all_bytes(reader)?; + modification.put_relmap_file(spcnode, dbnode, bytes)?; + debug!("imported relmap file") + } + "PG_VERSION" => { + debug!("ignored"); + } + _ => { + import_rel(modification, file_path, spcnode, dbnode, reader, len)?; + debug!("imported rel creation"); + } + } + } else if file_path.starts_with("pg_xact") { + let slru = SlruKind::Clog; + + import_slru(modification, slru, file_path, reader, len)?; + debug!("imported clog slru"); + } else if file_path.starts_with("pg_multixact/offsets") { + let slru = SlruKind::MultiXactOffsets; + + import_slru(modification, slru, file_path, reader, len)?; + debug!("imported multixact offsets slru"); + } else if file_path.starts_with("pg_multixact/members") { + let slru = SlruKind::MultiXactMembers; + + import_slru(modification, slru, file_path, reader, len)?; + debug!("imported multixact members slru"); + } else if file_path.starts_with("pg_twophase") { + let file_name = &file_path + .file_name() + .expect("missing twophase filename") + .to_string_lossy(); + let xid = u32::from_str_radix(file_name, 16)?; + + let bytes = read_all_bytes(reader)?; + modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?; + debug!("imported twophase file"); + } else if file_path.starts_with("pg_wal") { + debug!("found wal file in base section. ignore it"); + } else if file_path.starts_with("zenith.signal") { + // Parse zenith signal file to set correct previous LSN + let bytes = read_all_bytes(reader)?; + // zenith.signal format is "PREV LSN: prev_lsn" + let zenith_signal = std::str::from_utf8(&bytes)?; + let zenith_signal = zenith_signal.split(':').collect::>(); + let prev_lsn = zenith_signal[1].trim().parse::()?; + + let writer = modification.tline.tline.writer(); + writer.finish_write(prev_lsn); + + debug!("imported zenith signal {}", prev_lsn); + } else if file_path.starts_with("pg_tblspc") { + // TODO Backups exported from neon won't have pg_tblspc, but we will need + // this to import arbitrary postgres databases. 
+ bail!("Importing pg_tblspc is not implemented"); + } else { + debug!("ignored"); + } + + Ok(None) +} + +fn read_all_bytes(mut reader: Reader) -> Result { + let mut buf: Vec = vec![]; + reader.read_to_end(&mut buf)?; + Ok(Bytes::copy_from_slice(&buf[..])) +} diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 5c5b03268a..fdd03ecf8b 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -243,15 +243,15 @@ impl Repository for LayeredRepository { ); timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); + // Insert if not exists let timeline = Arc::new(timeline); - let r = timelines.insert( - timelineid, - LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), - ); - ensure!( - r.is_none(), - "assertion failure, inserted duplicate timeline" - ); + match timelines.entry(timelineid) { + Entry::Occupied(_) => bail!("Timeline already exists"), + Entry::Vacant(vacant) => { + vacant.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline))) + } + }; + Ok(timeline) } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 406228f034..079f477f75 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -13,7 +13,7 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use lazy_static::lazy_static; use regex::Regex; -use std::io; +use std::io::{self, Read}; use std::net::TcpListener; use std::str; use std::str::FromStr; @@ -29,6 +29,8 @@ use utils::{ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; +use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; +use crate::layered_repository::LayeredRepository; use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp}; use crate::profiling::profpoint_start; use crate::reltag::RelTag; @@ -200,6 +202,96 @@ impl PagestreamBeMessage { } } +/// Implements Read for the server side of CopyIn +struct CopyInReader<'a> { + pgb: &'a mut PostgresBackend, + + /// Overflow buffer for bytes sent in CopyData messages + /// that the reader (caller of read) hasn't asked for yet. + /// TODO use BytesMut? + buf: Vec, + + /// Bytes before `buf_begin` are considered as dropped. + /// This allows us to implement O(1) pop_front on Vec. + /// The Vec won't grow large because we only add to it + /// when it's empty. 
+ buf_begin: usize, +} + +impl<'a> CopyInReader<'a> { + // NOTE: pgb should be in copy in state already + fn new(pgb: &'a mut PostgresBackend) -> Self { + Self { + pgb, + buf: Vec::<_>::new(), + buf_begin: 0, + } + } +} + +impl<'a> Drop for CopyInReader<'a> { + fn drop(&mut self) { + // Finalize copy protocol so that self.pgb can be reused + // TODO instead, maybe take ownership of pgb and give it back at the end + let mut buf: Vec = vec![]; + let _ = self.read_to_end(&mut buf); + } +} + +impl<'a> Read for CopyInReader<'a> { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + while !thread_mgr::is_shutdown_requested() { + // Return from buffer if nonempty + if self.buf_begin < self.buf.len() { + let bytes_to_read = std::cmp::min(buf.len(), self.buf.len() - self.buf_begin); + buf[..bytes_to_read].copy_from_slice(&self.buf[self.buf_begin..][..bytes_to_read]); + self.buf_begin += bytes_to_read; + return Ok(bytes_to_read); + } + + // Delete garbage + self.buf.clear(); + self.buf_begin = 0; + + // Wait for client to send CopyData bytes + match self.pgb.read_message() { + Ok(Some(message)) => { + let copy_data_bytes = match message { + FeMessage::CopyData(bytes) => bytes, + FeMessage::CopyDone => return Ok(0), + FeMessage::Sync => continue, + m => { + let msg = format!("unexpected message {:?}", m); + self.pgb.write_message(&BeMessage::ErrorResponse(&msg))?; + return Err(io::Error::new(io::ErrorKind::Other, msg)); + } + }; + + // Return as much as we can, saving the rest in self.buf + let mut reader = copy_data_bytes.reader(); + let bytes_read = reader.read(buf)?; + reader.read_to_end(&mut self.buf)?; + return Ok(bytes_read); + } + Ok(None) => { + let msg = "client closed connection"; + self.pgb.write_message(&BeMessage::ErrorResponse(msg))?; + return Err(io::Error::new(io::ErrorKind::Other, msg)); + } + Err(e) => { + if !is_socket_read_timed_out(&e) { + return Err(io::Error::new(io::ErrorKind::Other, e)); + } + } + } + } + + // Shutting down + let msg = "Importer thread was shut down"; + Err(io::Error::new(io::ErrorKind::Other, msg)) + } +} + /////////////////////////////////////////////////////////////////////////////// /// @@ -447,6 +539,98 @@ impl PageServerHandler { Ok(()) } + fn handle_import_basebackup( + &self, + pgb: &mut PostgresBackend, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + base_lsn: Lsn, + _end_lsn: Lsn, + ) -> anyhow::Result<()> { + thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); + let _enter = + info_span!("import basebackup", timeline = %timeline_id, tenant = %tenant_id).entered(); + + // Create empty timeline + info!("creating new timeline"); + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + let timeline = repo.create_empty_timeline(timeline_id, Lsn(0))?; + let repartition_distance = repo.get_checkpoint_distance(); + let mut datadir_timeline = + DatadirTimeline::::new(timeline, repartition_distance); + + // TODO mark timeline as not ready until it reaches end_lsn. + // We might have some wal to import as well, and we should prevent compute + // from connecting before that and writing conflicting wal. + // + // This is not relevant for pageserver->pageserver migrations, since there's + // no wal to import. But should be fixed if we want to import from postgres. + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. 
+ + // Import basebackup provided via CopyData + info!("importing basebackup"); + pgb.write_message(&BeMessage::CopyInResponse)?; + let reader = CopyInReader::new(pgb); + import_basebackup_from_tar(&mut datadir_timeline, reader, base_lsn)?; + + // TODO check checksum + // Meanwhile you can verify client-side by taking fullbackup + // and checking that it matches in size with what was imported. + // It wouldn't work if base came from vanilla postgres though, + // since we discard some log files. + + // Flush data to disk, then upload to s3 + info!("flushing layers"); + datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?; + + info!("done"); + Ok(()) + } + + fn handle_import_wal( + &self, + pgb: &mut PostgresBackend, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + start_lsn: Lsn, + end_lsn: Lsn, + ) -> anyhow::Result<()> { + thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); + let _enter = + info_span!("import wal", timeline = %timeline_id, tenant = %tenant_id).entered(); + + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + let timeline = repo.get_timeline_load(timeline_id)?; + ensure!(timeline.get_last_record_lsn() == start_lsn); + + let repartition_distance = repo.get_checkpoint_distance(); + let mut datadir_timeline = + DatadirTimeline::::new(timeline, repartition_distance); + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. + + // Import wal provided via CopyData + info!("importing wal"); + pgb.write_message(&BeMessage::CopyInResponse)?; + let reader = CopyInReader::new(pgb); + import_wal_from_tar(&mut datadir_timeline, reader, start_lsn, end_lsn)?; + + // TODO Does it make sense to overshoot? + ensure!(datadir_timeline.tline.get_last_record_lsn() >= end_lsn); + + // Flush data to disk, then upload to s3. No need for a forced checkpoint. + // We only want to persist the data, and it doesn't matter if it's in the + // shape of deltas or images. + info!("flushing layers"); + datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?; + + info!("done"); + Ok(()) + } + /// Helper function to handle the LSN from client request. /// /// Each GetPage (and Exists and Nblocks) request includes information about @@ -750,6 +934,51 @@ impl postgres_backend::Handler for PageServerHandler { // Check that the timeline exists self.handle_basebackup_request(pgb, timelineid, lsn, tenantid, true)?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + } else if query_string.starts_with("import basebackup ") { + // Import the `base` section (everything but the wal) of a basebackup. + // Assumes the tenant already exists on this pageserver. + // + // Files are scheduled to be persisted to remote storage, and the + // caller should poll the http api to check when that is done. + // + // Example import command: + // 1. Get start/end LSN from backup_manifest file + // 2. 
Run: + // cat my_backup/base.tar | psql -h $PAGESERVER \ + // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN" + let (_, params_raw) = query_string.split_at("import basebackup ".len()); + let params = params_raw.split_whitespace().collect::>(); + ensure!(params.len() == 4); + let tenant = ZTenantId::from_str(params[0])?; + let timeline = ZTimelineId::from_str(params[1])?; + let base_lsn = Lsn::from_str(params[2])?; + let end_lsn = Lsn::from_str(params[3])?; + + self.check_permission(Some(tenant))?; + + match self.handle_import_basebackup(pgb, tenant, timeline, base_lsn, end_lsn) { + Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + Err(e) => pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?, + }; + } else if query_string.starts_with("import wal ") { + // Import the `pg_wal` section of a basebackup. + // + // Files are scheduled to be persisted to remote storage, and the + // caller should poll the http api to check when that is done. + let (_, params_raw) = query_string.split_at("import wal ".len()); + let params = params_raw.split_whitespace().collect::>(); + ensure!(params.len() == 4); + let tenant = ZTenantId::from_str(params[0])?; + let timeline = ZTimelineId::from_str(params[1])?; + let start_lsn = Lsn::from_str(params[2])?; + let end_lsn = Lsn::from_str(params[3])?; + + self.check_permission(Some(tenant))?; + + match self.handle_import_wal(pgb, tenant, timeline, start_lsn, end_lsn) { + Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + Err(e) => pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?, + }; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 626ed1b0f1..59a53d68a1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -749,6 +749,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { } /// Extend relation + /// If new size is smaller, do nothing. pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { ensure!(rel.relnode != 0, "invalid relnode"); @@ -756,10 +757,13 @@ impl<'a, R: Repository> DatadirModification<'a, R> { let size_key = rel_size_to_key(rel); let old_size = self.get(size_key)?.get_u32_le(); - let buf = nblocks.to_le_bytes(); - self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + // only extend relation here. 
never decrease the size + if nblocks > old_size { + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); - self.pending_nblocks += nblocks as isize - old_size as isize; + self.pending_nblocks += nblocks as isize - old_size as isize; + } Ok(()) } diff --git a/test_runner/batch_others/test_import.py b/test_runner/batch_others/test_import.py new file mode 100644 index 0000000000..e478103313 --- /dev/null +++ b/test_runner/batch_others/test_import.py @@ -0,0 +1,193 @@ +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_upload, wait_for_last_record_lsn +from fixtures.utils import lsn_from_hex, lsn_to_hex +from uuid import UUID, uuid4 +import tarfile +import os +import shutil +from pathlib import Path +import json +from fixtures.utils import subprocess_capture +from fixtures.log_helper import log +from contextlib import closing +from fixtures.neon_fixtures import pg_distrib_dir + + +@pytest.mark.timeout(600) +def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder): + # Put data in vanilla pg + vanilla_pg.start() + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + vanilla_pg.safe_psql('''create table t as select 'long string to consume some space' || g + from generate_series(1,300000) g''') + assert vanilla_pg.safe_psql('select count(*) from t') == [(300000, )] + + # Take basebackup + basebackup_dir = os.path.join(test_output_dir, "basebackup") + base_tar = os.path.join(basebackup_dir, "base.tar") + wal_tar = os.path.join(basebackup_dir, "pg_wal.tar") + os.mkdir(basebackup_dir) + vanilla_pg.safe_psql("CHECKPOINT") + pg_bin.run([ + "pg_basebackup", + "-F", + "tar", + "-d", + vanilla_pg.connstr(), + "-D", + basebackup_dir, + ]) + + # Make corrupt base tar with missing pg_control + unpacked_base = os.path.join(basebackup_dir, "unpacked-base") + corrupt_base_tar = os.path.join(unpacked_base, "corrupt-base.tar") + os.mkdir(unpacked_base, 0o750) + subprocess_capture(str(test_output_dir), ["tar", "-xf", base_tar, "-C", unpacked_base]) + os.remove(os.path.join(unpacked_base, "global/pg_control")) + subprocess_capture(str(test_output_dir), + ["tar", "-cf", "corrupt-base.tar"] + os.listdir(unpacked_base), + cwd=unpacked_base) + + # Get start_lsn and end_lsn + with open(os.path.join(basebackup_dir, "backup_manifest")) as f: + manifest = json.load(f) + start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"] + end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] + + node_name = "import_from_vanilla" + tenant = uuid4() + timeline = uuid4() + + # Set up pageserver for import + neon_env_builder.enable_local_fs_remote_storage() + env = neon_env_builder.init_start() + env.pageserver.http_client().tenant_create(tenant) + + def import_tar(base, wal): + env.neon_cli.raw_cli([ + "timeline", + "import", + "--tenant-id", + tenant.hex, + "--timeline-id", + timeline.hex, + "--node-name", + node_name, + "--base-lsn", + start_lsn, + "--base-tarfile", + base, + "--end-lsn", + end_lsn, + "--wal-tarfile", + wal, + ]) + + # Importing corrupt backup fails + with pytest.raises(Exception): + import_tar(corrupt_base_tar, wal_tar) + + # Clean up + # TODO it should clean itself + client = env.pageserver.http_client() + client.timeline_detach(tenant, timeline) + + # Importing correct backup works + import_tar(base_tar, wal_tar) + + # Wait for data to land in s3 + wait_for_last_record_lsn(client, tenant, timeline, lsn_from_hex(end_lsn)) + wait_for_upload(client, tenant, timeline, lsn_from_hex(end_lsn)) + + # Check it 
worked + pg = env.postgres.create_start(node_name, tenant_id=tenant) + assert pg.safe_psql('select count(*) from t') == [(300000, )] + + +@pytest.mark.timeout(600) +def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_builder): + + num_rows = 3000 + neon_env_builder.num_safekeepers = 1 + neon_env_builder.enable_local_fs_remote_storage() + env = neon_env_builder.init_start() + + env.neon_cli.create_branch('test_import_from_pageserver') + pgmain = env.postgres.create_start('test_import_from_pageserver') + log.info("postgres is running on 'test_import_from_pageserver' branch") + + timeline = pgmain.safe_psql("SHOW neon.timeline_id")[0][0] + + with closing(pgmain.connect()) as conn: + with conn.cursor() as cur: + # data loading may take a while, so increase statement timeout + cur.execute("SET statement_timeout='300s'") + cur.execute(f'''CREATE TABLE tbl AS SELECT 'long string to consume some space' || g + from generate_series(1,{num_rows}) g''') + cur.execute("CHECKPOINT") + + cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn = cur.fetchone()[0] + log.info(f"start_backup_lsn = {lsn}") + + # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. + # PgBin sets it automatically, but here we need to pipe psql output to the tar command. + psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')} + + # Get a fullbackup from pageserver + query = f"fullbackup { env.initial_tenant.hex} {timeline} {lsn}" + cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] + result_basepath = pg_bin.run_capture(cmd, env=psql_env) + tar_output_file = result_basepath + ".stdout" + + # Stop the first pageserver instance, erase all its data + env.postgres.stop_all() + env.pageserver.stop() + + dir_to_clear = Path(env.repo_dir) / 'tenants' + shutil.rmtree(dir_to_clear) + os.mkdir(dir_to_clear) + + #start the pageserver again + env.pageserver.start() + + # Import using another tenantid, because we use the same pageserver. + # TODO Create another pageserver to maeke test more realistic. 
+ tenant = uuid4() + + # Import to pageserver + node_name = "import_from_pageserver" + client = env.pageserver.http_client() + client.tenant_create(tenant) + env.neon_cli.raw_cli([ + "timeline", + "import", + "--tenant-id", + tenant.hex, + "--timeline-id", + timeline, + "--node-name", + node_name, + "--base-lsn", + lsn, + "--base-tarfile", + os.path.join(tar_output_file), + ]) + + # Wait for data to land in s3 + wait_for_last_record_lsn(client, tenant, UUID(timeline), lsn_from_hex(lsn)) + wait_for_upload(client, tenant, UUID(timeline), lsn_from_hex(lsn)) + + # Check it worked + pg = env.postgres.create_start(node_name, tenant_id=tenant) + assert pg.safe_psql('select count(*) from tbl') == [(num_rows, )] + + # Take another fullbackup + query = f"fullbackup { tenant.hex} {timeline} {lsn}" + cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] + result_basepath = pg_bin.run_capture(cmd, env=psql_env) + new_tar_output_file = result_basepath + ".stdout" + + # Check it's the same as the first fullbackup + # TODO pageserver should be checking checksum + assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 51afd3a03d..12edcb8792 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1398,12 +1398,12 @@ class VanillaPostgres(PgProtocol): if log_path is None: log_path = os.path.join(self.pgdatadir, "pg.log") - self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, '-l', log_path, 'start']) + self.pg_bin.run_capture(['pg_ctl', '-w', '-D', self.pgdatadir, '-l', log_path, 'start']) def stop(self): assert self.running self.running = False - self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'stop']) + self.pg_bin.run_capture(['pg_ctl', '-w', '-D', self.pgdatadir, 'stop']) def get_subdir_size(self, subdir) -> int: """Return size of pgdatadir subdirectory in bytes.""" From 6222a0012bf1a856149af618b67e7e362528bf36 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 22 Jun 2022 11:40:59 +0300 Subject: [PATCH 0433/1022] Migrate from CircleCI to Github Actions: python codestyle, build and tests (#1647) Duplicate postgres and neon build and test jobs from CircleCI to Github actions. --- .../actions/run-python-test-set/action.yml | 119 ++++++++ .github/workflows/build_and_test.yml | 276 ++++++++++++++++++ 2 files changed, 395 insertions(+) create mode 100644 .github/actions/run-python-test-set/action.yml create mode 100644 .github/workflows/build_and_test.yml diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml new file mode 100644 index 0000000000..94fac2ee99 --- /dev/null +++ b/.github/actions/run-python-test-set/action.yml @@ -0,0 +1,119 @@ +name: 'Run python test' +description: 'Runs a Neon python test set, performing all the required preparations before' + +inputs: + # Select the type of Rust build. Must be "release" or "debug". + build_type: + required: true + rust_toolchain: + required: true + # This parameter is required, to prevent the mistake of running all tests in one job. + test_selection: + required: true + # Arbitrary parameters to pytest. 
For example "-s" to prevent capturing stdout/stderr + extra_params: + required: false + default: '' + needs_postgres_source: + required: false + default: 'false' + run_in_parallel: + required: false + default: 'true' + save_perf_report: + required: false + default: 'false' + +runs: + using: "composite" + steps: + - name: Get Neon artifact for restoration + uses: actions/download-artifact@v3 + with: + name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact + path: ./neon-artifact/ + + - name: Extract Neon artifact + shell: bash -ex {0} + run: | + mkdir -p /tmp/neon/ + tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/ + rm -rf ./neon-artifact/ + + - name: Checkout + if: inputs.needs_postgres_source == 'true' + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Cache poetry deps + id: cache_poetry + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + shell: bash -ex {0} + run: ./scripts/pysync + + - name: Run pytest + env: + ZENITH_BIN: /tmp/neon/bin + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + TEST_OUTPUT: /tmp/test_output + # this variable will be embedded in perf test report + # and is needed to distinguish different environments + PLATFORM: github-actions-selfhosted + shell: bash -ex {0} + run: | + PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)" + rm -rf $PERF_REPORT_DIR + + TEST_SELECTION="test_runner/${{ inputs.test_selection }}" + EXTRA_PARAMS="${{ inputs.extra_params }}" + if [ -z "$TEST_SELECTION" ]; then + echo "test_selection must be set" + exit 1 + fi + if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then + EXTRA_PARAMS="-n4 $EXTRA_PARAMS" + fi + if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then + if [[ "$GITHUB_REF" == "main" ]]; then + mkdir -p "$PERF_REPORT_DIR" + EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" + fi + fi + + if [[ "${{ inputs.build_type }}" == "debug" ]]; then + cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run) + elif [[ "${{ inputs.build_type }}" == "release" ]]; then + cov_prefix=() + fi + + # Run the tests. + # + # The junit.xml file allows CircleCI to display more fine-grained test information + # in its "Tests" tab in the results page. 
+ # --verbose prints name of each test (helpful when there are + # multiple tests in one file) + # -rA prints summary in the end + # -n4 uses four processes to run tests via pytest-xdist + # -s is not used to prevent pytest from capturing output, because tests are running + # in parallel and logs are mixed between different tests + "${cov_prefix[@]}" ./scripts/pytest \ + --junitxml=$TEST_OUTPUT/junit.xml \ + --tb=short \ + --verbose \ + -m "not remote_cluster" \ + -rA $TEST_SELECTION $EXTRA_PARAMS + + if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then + if [[ "$GITHUB_REF" == "main" ]]; then + export REPORT_FROM="$PERF_REPORT_DIR" + export REPORT_TO=local + scripts/generate_and_push_perf_report.sh + fi + fi diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml new file mode 100644 index 0000000000..5f4dd754d2 --- /dev/null +++ b/.github/workflows/build_and_test.yml @@ -0,0 +1,276 @@ +name: build_and_test +on: [ push ] +defaults: + run: + shell: bash -ex {0} + +jobs: + build-postgres: + runs-on: [ self-hosted, Linux, k8s-runner ] + strategy: + matrix: + build_type: [ debug, release ] + rust_toolchain: [ 1.58 ] + + env: + BUILD_TYPE: ${{ matrix.build_type }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Set pg revision for caching + id: pg_ver + run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres) + + - name: Cache postgres build + id: cache_pg + uses: actions/cache@v3 + with: + path: tmp_install/ + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Build postgres + if: steps.cache_pg.outputs.cache-hit != 'true' + run: COPT='-Werror' mold -run make postgres -j$(nproc) + + # actions/cache@v3 does not allow concurrently using the same cache across job steps, so use a separate cache + - name: Prepare postgres artifact + run: tar -C tmp_install/ -czf ./pg.tgz . 
+ - name: Upload postgres artifact + uses: actions/upload-artifact@v3 + with: + retention-days: 7 + if-no-files-found: error + name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact + path: ./pg.tgz + + + build-neon: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ build-postgres ] + strategy: + matrix: + build_type: [ debug, release ] + rust_toolchain: [ 1.58 ] + + env: + BUILD_TYPE: ${{ matrix.build_type }} + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Get postgres artifact for restoration + uses: actions/download-artifact@v3 + with: + name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact + path: ./postgres-artifact/ + - name: Extract postgres artifact + run: | + mkdir ./tmp_install/ + tar -xf ./postgres-artifact/pg.tgz -C ./tmp_install/ + rm -rf ./postgres-artifact/ + + - name: Cache cargo deps + id: cache_cargo + uses: actions/cache@v3 + with: + path: | + ~/.cargo/registry/ + ~/.cargo/git/ + target/ + key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + + - name: Run cargo build + run: | + if [[ $BUILD_TYPE == "debug" ]]; then + cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run) + CARGO_FLAGS= + elif [[ $BUILD_TYPE == "release" ]]; then + cov_prefix=() + CARGO_FLAGS="--release --features profiling" + fi + + export CACHEPOT_BUCKET=zenith-rust-cachepot + export RUSTC_WRAPPER=cachepot + export AWS_ACCESS_KEY_ID="${{ secrets.AWS_ACCESS_KEY_ID }}" + export AWS_SECRET_ACCESS_KEY="${{ secrets.AWS_SECRET_ACCESS_KEY }}" + export HOME=/home/runner + "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests + cachepot -s + + - name: Run cargo test + run: | + export HOME=/home/runner + if [[ $BUILD_TYPE == "debug" ]]; then + cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run) + CARGO_FLAGS= + elif [[ $BUILD_TYPE == "release" ]]; then + cov_prefix=() + CARGO_FLAGS=--release + fi + + "${cov_prefix[@]}" cargo test $CARGO_FLAGS + + - name: Install rust binaries + run: | + export HOME=/home/runner + if [[ $BUILD_TYPE == "debug" ]]; then + cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run) + elif [[ $BUILD_TYPE == "release" ]]; then + cov_prefix=() + fi + + binaries=$( + "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps | + jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' + ) + + test_exe_paths=$( + "${cov_prefix[@]}" cargo test --message-format=json --no-run | + jq -r '.executable | select(. 
!= null)' + ) + + mkdir -p /tmp/neon/bin + mkdir -p /tmp/neon/test_bin + mkdir -p /tmp/neon/etc + + # Install target binaries + for bin in $binaries; do + SRC=target/$BUILD_TYPE/$bin + DST=/tmp/neon/bin/$bin + cp $SRC $DST + echo $DST >> /tmp/neon/etc/binaries.list + done + + # Install test executables (for code coverage) + if [[ $BUILD_TYPE == "debug" ]]; then + for bin in $test_exe_paths; do + SRC=$bin + DST=/tmp/neon/test_bin/$(basename $bin) + cp $SRC $DST + echo $DST >> /tmp/neon/etc/binaries.list + done + fi + + - name: Install postgres binaries + run: cp -a tmp_install /tmp/neon/pg_install + + - name: Merge coverage data + run: | + export HOME=/home/runner + # This will speed up workspace uploads + if [[ $BUILD_TYPE == "debug" ]]; then + scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage merge + fi + + - name: Prepare neon artifact + run: tar -C /tmp/neon/ -czf ./neon.tgz . + + - name: Upload neon binaries + uses: actions/upload-artifact@v3 + with: + retention-days: 7 + if-no-files-found: error + name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact + path: ./neon.tgz + + check-codestyle-python: + runs-on: [ self-hosted, Linux, k8s-runner ] + strategy: + matrix: + rust_toolchain: [ 1.58 ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Cache poetry deps + id: cache_poetry + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + run: ./scripts/pysync + + - name: Run yapf to ensure code format + run: poetry run yapf --recursive --diff . + + - name: Run mypy to check types + run: poetry run mypy . + + pg_regress-tests: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ build-neon ] + strategy: + matrix: + build_type: [ debug, release ] + rust_toolchain: [ 1.58 ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 2 + + - name: Pytest regress tests + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ matrix.build_type }} + rust_toolchain: ${{ matrix.rust_toolchain }} + test_selection: batch_pg_regress + needs_postgres_source: true + + other-tests: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ build-neon ] + strategy: + matrix: + build_type: [ debug, release ] + rust_toolchain: [ 1.58 ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 2 + + - name: Pytest other tests + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ matrix.build_type }} + rust_toolchain: ${{ matrix.rust_toolchain }} + test_selection: batch_others + + benchmarks: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ build-neon ] + strategy: + matrix: + build_type: [ release ] + rust_toolchain: [ 1.58 ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 2 + + - name: Pytest benchmarks + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ matrix.build_type }} + rust_toolchain: ${{ matrix.rust_toolchain }} + test_selection: performance + run_in_parallel: false + # save_perf_report: true From d059e588a663d0b26992761a11b93acb11ff499d Mon Sep 17 00:00:00 2001 From: KlimentSerafimov Date: Wed, 22 Jun 2022 15:34:24 +0200 Subject: [PATCH 0434/1022] Added invariant check for project name. (#1921) Summary: Added invariant checking for project name. 
Refactored ClientCredentials and TlsConfig. * Added formatting invariant check for project name: **\forall c \in project_name . c \in [alnum] U {'-'}. ** sni_data == . * Added exhaustive tests for get_project_name. * Refactored TlsConfig to contain common_name : Option. * Refactored ClientCredentials construction to construct project_name directly. * Merged ProjectNameError into ClientCredsParseError. * Tweaked proxy tests to accommodate refactored ClientCredentials construction semantics. * [Pytests] Added project option argument to test_proxy_select_1. * Removed project param from Api since now it's contained in creds. * Refactored &Option -> Option<&str>. Co-authored-by: Dmitrii Ivanov . --- Cargo.lock | 119 +++++++++ proxy/Cargo.toml | 2 + proxy/src/auth/backend/console.rs | 15 +- proxy/src/auth/credentials.rs | 339 ++++++++++++++++++++----- proxy/src/config.rs | 42 ++- proxy/src/proxy.rs | 58 +++-- test_runner/batch_others/test_proxy.py | 2 +- 7 files changed, 473 insertions(+), 104 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dca525941d..f4d3743676 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -64,6 +64,45 @@ dependencies = [ "nodrop", ] +[[package]] +name = "asn1-rs" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30ff05a702273012438132f449575dbc804e27b2f3cbe3069aa237d26c98fa33" +dependencies = [ + "asn1-rs-derive", + "asn1-rs-impl", + "displaydoc", + "nom", + "num-traits", + "rusticata-macros", + "thiserror", + "time 0.3.9", +] + +[[package]] +name = "asn1-rs-derive" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db8b7511298d5b7784b40b092d9e9dcd3a627a5707e4b5e507931ab0d44eeebf" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "asn1-rs-impl" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "async-stream" version = "0.3.3" @@ -712,6 +751,12 @@ dependencies = [ "syn", ] +[[package]] +name = "data-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ee2393c4a91429dffb4bedf19f4d6abf27d8a732c8ce4980305d782e5426d57" + [[package]] name = "debugid" version = "0.7.3" @@ -721,6 +766,20 @@ dependencies = [ "uuid", ] +[[package]] +name = "der-parser" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe398ac75057914d7d07307bf67dc7f3f574a26783b4fc7805a20ffa9f506e82" +dependencies = [ + "asn1-rs", + "displaydoc", + "nom", + "num-bigint", + "num-traits", + "rusticata-macros", +] + [[package]] name = "digest" version = "0.9.0" @@ -762,6 +821,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "displaydoc" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bf95dc3f046b9da4f2d51833c0d3547d8564ef6910f5c1ed130306a75b92886" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "either" version = "1.6.1" @@ -1731,6 +1801,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "oid-registry" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38e20717fa0541f39bd146692035c37bedfa532b3e5071b35761082407546b2a" +dependencies = [ + "asn1-rs", +] + [[package]] name = "once_cell" version = "1.10.0" @@ -2250,6 +2329,7 @@ dependencies = [ "url", 
"utils", "workspace_hack", + "x509-parser", ] [[package]] @@ -2621,6 +2701,15 @@ dependencies = [ "semver", ] +[[package]] +name = "rusticata-macros" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632" +dependencies = [ + "nom", +] + [[package]] name = "rustls" version = "0.20.4" @@ -3060,6 +3149,18 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8" +[[package]] +name = "synstructure" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "unicode-xid", +] + [[package]] name = "tar" version = "0.4.38" @@ -3922,6 +4023,24 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "x509-parser" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb9bace5b5589ffead1afb76e43e34cff39cd0f3ce7e170ae0c29e53b88eb1c" +dependencies = [ + "asn1-rs", + "base64", + "data-encoding", + "der-parser", + "lazy_static", + "nom", + "oid-registry", + "rusticata-macros", + "thiserror", + "time 0.3.9", +] + [[package]] name = "xattr" version = "0.2.2" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 4e45698e3e..8c6036f87d 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -39,6 +39,8 @@ utils = { path = "../libs/utils" } metrics = { path = "../libs/metrics" } workspace_hack = { version = "0.1", path = "../workspace_hack" } +x509-parser = "0.13.2" + [dev-dependencies] rcgen = "0.8.14" rstest = "0.12" diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index 252522affb..93462086ea 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -19,7 +19,7 @@ pub type Result = std::result::Result; #[derive(Debug, Error)] pub enum ConsoleAuthError { #[error(transparent)] - BadProjectName(#[from] auth::credentials::ProjectNameError), + BadProjectName(#[from] auth::credentials::ClientCredsParseError), // We shouldn't include the actual secret here. #[error("Bad authentication secret")] @@ -74,18 +74,12 @@ pub enum AuthInfo { pub(super) struct Api<'a> { endpoint: &'a ApiUrl, creds: &'a ClientCredentials, - /// Cache project name, since we'll need it several times. - project: &'a str, } impl<'a> Api<'a> { /// Construct an API object containing the auth parameters. pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result { - Ok(Self { - endpoint, - creds, - project: creds.project_name()?, - }) + Ok(Self { endpoint, creds }) } /// Authenticate the existing user or throw an error. 
@@ -100,7 +94,7 @@ impl<'a> Api<'a> { let mut url = self.endpoint.clone(); url.path_segments_mut().push("proxy_get_role_secret"); url.query_pairs_mut() - .append_pair("project", self.project) + .append_pair("project", &self.creds.project_name) .append_pair("role", &self.creds.user); // TODO: use a proper logger @@ -123,7 +117,8 @@ impl<'a> Api<'a> { async fn wake_compute(&self) -> Result { let mut url = self.endpoint.clone(); url.path_segments_mut().push("proxy_wake_compute"); - url.query_pairs_mut().append_pair("project", self.project); + url.query_pairs_mut() + .append_pair("project", &self.creds.project_name); // TODO: use a proper logger println!("cplane request: {url}"); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 6521162b50..48dc8542ec 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -8,10 +8,32 @@ use std::collections::HashMap; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -#[derive(Debug, Error)] +#[derive(Debug, Error, PartialEq)] pub enum ClientCredsParseError { - #[error("Parameter `{0}` is missing in startup packet")] + #[error("Parameter `{0}` is missing in startup packet.")] MissingKey(&'static str), + + #[error( + "Project name is not specified. \ + EITHER please upgrade the postgres client library (libpq) for SNI support \ + OR pass the project name as a parameter: '&options=project%3D'." + )] + MissingSNIAndProjectName, + + #[error("Inconsistent project name inferred from SNI ('{0}') and project option ('{1}').")] + InconsistentProjectNameAndSNI(String, String), + + #[error("Common name is not set.")] + CommonNameNotSet, + + #[error( + "SNI ('{1}') inconsistently formatted with respect to common name ('{0}'). \ + SNI should be formatted as '.'." + )] + InconsistentCommonNameAndSNI(String, String), + + #[error("Project name ('{0}') must contain only alphanumeric characters and hyphens ('-').")] + ProjectNameContainsIllegalChars(String), } impl UserFacingError for ClientCredsParseError {} @@ -22,15 +44,7 @@ impl UserFacingError for ClientCredsParseError {} pub struct ClientCredentials { pub user: String, pub dbname: String, - - // New console API requires SNI info to determine the cluster name. - // Other Auth backends don't need it. - pub sni_data: Option, - - // project_name is passed as argument from options from url. - // In case sni_data is missing: project_name is used to determine cluster name. - // In case sni_data is available: project_name and sni_data should match (otherwise throws an error). - pub project_name: Option, + pub project_name: String, } impl ClientCredentials { @@ -38,60 +52,14 @@ impl ClientCredentials { // This logic will likely change in the future. self.user.ends_with("@zenith") } -} -#[derive(Debug, Error)] -pub enum ProjectNameError { - #[error("SNI is missing. EITHER please upgrade the postgres client library OR pass the project name as a parameter: '...&options=project%3D...'.")] - Missing, - - #[error("SNI is malformed.")] - Bad, - - #[error("Inconsistent project name inferred from SNI and project option. String from SNI: '{0}', String from project option: '{1}'")] - Inconsistent(String, String), -} - -impl UserFacingError for ProjectNameError {} - -impl ClientCredentials { - /// Determine project name from SNI or from project_name parameter from options argument. 
- pub fn project_name(&self) -> Result<&str, ProjectNameError> { - // Checking that if both sni_data and project_name are set, then they should match - // otherwise, throws a ProjectNameError::Inconsistent error. - if let Some(sni_data) = &self.sni_data { - let project_name_from_sni_data = - sni_data.split_once('.').ok_or(ProjectNameError::Bad)?.0; - if let Some(project_name_from_options) = &self.project_name { - if !project_name_from_options.eq(project_name_from_sni_data) { - return Err(ProjectNameError::Inconsistent( - project_name_from_sni_data.to_string(), - project_name_from_options.to_string(), - )); - } - } - } - // determine the project name from self.sni_data if it exists, otherwise from self.project_name. - let ret = match &self.sni_data { - // if sni_data exists, use it to determine project name - Some(sni_data) => sni_data.split_once('.').ok_or(ProjectNameError::Bad)?.0, - // otherwise use project_option if it was manually set thought options parameter. - None => self - .project_name - .as_ref() - .ok_or(ProjectNameError::Missing)? - .as_str(), - }; - Ok(ret) - } -} - -impl TryFrom> for ClientCredentials { - type Error = ClientCredsParseError; - - fn try_from(mut value: HashMap) -> Result { + pub fn parse( + mut options: HashMap, + sni_data: Option<&str>, + common_name: Option<&str>, + ) -> Result { let mut get_param = |key| { - value + options .remove(key) .ok_or(ClientCredsParseError::MissingKey(key)) }; @@ -99,17 +67,15 @@ impl TryFrom> for ClientCredentials { let user = get_param("user")?; let dbname = get_param("database")?; let project_name = get_param("project").ok(); + let project_name = get_project_name(sni_data, common_name, project_name.as_deref())?; Ok(Self { user, dbname, - sni_data: None, project_name, }) } -} -impl ClientCredentials { /// Use credentials to authenticate the user. pub async fn authenticate( self, @@ -120,3 +86,244 @@ impl ClientCredentials { super::backend::handle_user(config, client, self).await } } + +/// Inferring project name from sni_data. +fn project_name_from_sni_data( + sni_data: &str, + common_name: &str, +) -> Result { + let common_name_with_dot = format!(".{common_name}"); + // check that ".{common_name_with_dot}" is the actual suffix in sni_data + if !sni_data.ends_with(&common_name_with_dot) { + return Err(ClientCredsParseError::InconsistentCommonNameAndSNI( + common_name.to_string(), + sni_data.to_string(), + )); + } + // return sni_data without the common name suffix. 
+ Ok(sni_data + .strip_suffix(&common_name_with_dot) + .unwrap() + .to_string()) +} + +#[cfg(test)] +mod tests_for_project_name_from_sni_data { + use super::*; + + #[test] + fn passing() { + let target_project_name = "my-project-123"; + let common_name = "localtest.me"; + let sni_data = format!("{target_project_name}.{common_name}"); + assert_eq!( + project_name_from_sni_data(&sni_data, common_name), + Ok(target_project_name.to_string()) + ); + } + + #[test] + fn throws_inconsistent_common_name_and_sni_data() { + let target_project_name = "my-project-123"; + let common_name = "localtest.me"; + let wrong_suffix = "wrongtest.me"; + assert_eq!(common_name.len(), wrong_suffix.len()); + let wrong_common_name = format!("wrong{wrong_suffix}"); + let sni_data = format!("{target_project_name}.{wrong_common_name}"); + assert_eq!( + project_name_from_sni_data(&sni_data, common_name), + Err(ClientCredsParseError::InconsistentCommonNameAndSNI( + common_name.to_string(), + sni_data + )) + ); + } +} + +/// Determine project name from SNI or from project_name parameter from options argument. +fn get_project_name( + sni_data: Option<&str>, + common_name: Option<&str>, + project_name: Option<&str>, +) -> Result { + // determine the project name from sni_data if it exists, otherwise from project_name. + let ret = match sni_data { + Some(sni_data) => { + let common_name = common_name.ok_or(ClientCredsParseError::CommonNameNotSet)?; + let project_name_from_sni = project_name_from_sni_data(sni_data, common_name)?; + // check invariant: project name from options and from sni should match + if let Some(project_name) = &project_name { + if !project_name_from_sni.eq(project_name) { + return Err(ClientCredsParseError::InconsistentProjectNameAndSNI( + project_name_from_sni, + project_name.to_string(), + )); + } + } + project_name_from_sni + } + None => project_name + .ok_or(ClientCredsParseError::MissingSNIAndProjectName)? + .to_string(), + }; + + // check formatting invariant: project name must contain only alphanumeric characters and hyphens. 
+ if !ret.chars().all(|x: char| x.is_alphanumeric() || x == '-') { + return Err(ClientCredsParseError::ProjectNameContainsIllegalChars(ret)); + } + + Ok(ret) +} + +#[cfg(test)] +mod tests_for_project_name_only { + use super::*; + + #[test] + fn passing_from_sni_data_only() { + let target_project_name = "my-project-123"; + let common_name = "localtest.me"; + let sni_data = format!("{target_project_name}.{common_name}"); + assert_eq!( + get_project_name(Some(&sni_data), Some(common_name), None), + Ok(target_project_name.to_string()) + ); + } + + #[test] + fn throws_project_name_contains_illegal_chars_from_sni_data_only() { + let project_name_prefix = "my-project"; + let project_name_suffix = "123"; + let common_name = "localtest.me"; + + for illegal_char_id in 0..256 { + let illegal_char = char::from_u32(illegal_char_id).unwrap(); + if !(illegal_char.is_alphanumeric() || illegal_char == '-') + && illegal_char.to_string().len() == 1 + { + let target_project_name = + format!("{project_name_prefix}{illegal_char}{project_name_suffix}"); + let sni_data = format!("{target_project_name}.{common_name}"); + assert_eq!( + get_project_name(Some(&sni_data), Some(common_name), None), + Err(ClientCredsParseError::ProjectNameContainsIllegalChars( + target_project_name + )) + ); + } + } + } + + #[test] + fn passing_from_project_name_only() { + let target_project_name = "my-project-123"; + let common_names = [Some("localtest.me"), None]; + for common_name in common_names { + assert_eq!( + get_project_name(None, common_name, Some(target_project_name)), + Ok(target_project_name.to_string()) + ); + } + } + + #[test] + fn throws_project_name_contains_illegal_chars_from_project_name_only() { + let project_name_prefix = "my-project"; + let project_name_suffix = "123"; + let common_names = [Some("localtest.me"), None]; + + for common_name in common_names { + for illegal_char_id in 0..256 { + let illegal_char: char = char::from_u32(illegal_char_id).unwrap(); + if !(illegal_char.is_alphanumeric() || illegal_char == '-') + && illegal_char.to_string().len() == 1 + { + let target_project_name = + format!("{project_name_prefix}{illegal_char}{project_name_suffix}"); + assert_eq!( + get_project_name(None, common_name, Some(&target_project_name)), + Err(ClientCredsParseError::ProjectNameContainsIllegalChars( + target_project_name + )) + ); + } + } + } + } + + #[test] + fn passing_from_sni_data_and_project_name() { + let target_project_name = "my-project-123"; + let common_name = "localtest.me"; + let sni_data = format!("{target_project_name}.{common_name}"); + assert_eq!( + get_project_name( + Some(&sni_data), + Some(common_name), + Some(target_project_name) + ), + Ok(target_project_name.to_string()) + ); + } + + #[test] + fn throws_inconsistent_project_name_and_sni() { + let project_name_param = "my-project-123"; + let wrong_project_name = "not-my-project-123"; + let common_name = "localtest.me"; + let sni_data = format!("{wrong_project_name}.{common_name}"); + assert_eq!( + get_project_name(Some(&sni_data), Some(common_name), Some(project_name_param)), + Err(ClientCredsParseError::InconsistentProjectNameAndSNI( + wrong_project_name.to_string(), + project_name_param.to_string() + )) + ); + } + + #[test] + fn throws_common_name_not_set() { + let target_project_name = "my-project-123"; + let wrong_project_name = "not-my-project-123"; + let common_name = "localtest.me"; + let sni_datas = [ + Some(format!("{wrong_project_name}.{common_name}")), + Some(format!("{target_project_name}.{common_name}")), + ]; + let project_names = 
[None, Some(target_project_name)]; + for sni_data in sni_datas { + for project_name_param in project_names { + assert_eq!( + get_project_name(sni_data.as_deref(), None, project_name_param), + Err(ClientCredsParseError::CommonNameNotSet) + ); + } + } + } + + #[test] + fn throws_inconsistent_common_name_and_sni_data() { + let target_project_name = "my-project-123"; + let wrong_project_name = "not-my-project-123"; + let common_name = "localtest.me"; + let wrong_suffix = "wrongtest.me"; + assert_eq!(common_name.len(), wrong_suffix.len()); + let wrong_common_name = format!("wrong{wrong_suffix}"); + let sni_datas = [ + Some(format!("{wrong_project_name}.{wrong_common_name}")), + Some(format!("{target_project_name}.{wrong_common_name}")), + ]; + let project_names = [None, Some(target_project_name)]; + for project_name_param in project_names { + for sni_data in &sni_datas { + assert_eq!( + get_project_name(sni_data.as_deref(), Some(common_name), project_name_param), + Err(ClientCredsParseError::InconsistentCommonNameAndSNI( + common_name.to_string(), + sni_data.clone().unwrap().to_string() + )) + ); + } + } + } +} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 4def11aefc..df3923de1a 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -36,23 +36,35 @@ pub struct ProxyConfig { pub auth_link_uri: ApiUrl, } -pub type TlsConfig = Arc; +pub struct TlsConfig { + pub config: Arc, + pub common_name: Option, +} + +impl TlsConfig { + pub fn to_server_config(&self) -> Arc { + self.config.clone() + } +} /// Configure TLS for the main endpoint. pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result { let key = { let key_bytes = std::fs::read(key_path).context("TLS key file")?; let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context("couldn't read TLS keys")?; + .context(format!("Failed to read TLS keys at '{key_path}'"))?; ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); keys.pop().map(rustls::PrivateKey).unwrap() }; + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; let cert_chain = { - let cert_chain_bytes = std::fs::read(cert_path).context("TLS cert file")?; rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .context("couldn't read TLS certificate chain")? + .context(format!( + "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + ))? .into_iter() .map(rustls::Certificate) .collect() @@ -64,7 +76,25 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result x, None => return Ok(()), // it's a cancellation request @@ -99,12 +99,14 @@ async fn handle_client( /// we also take an extra care of propagating only the select handshake errors to client. async fn handshake( stream: S, - mut tls: Option, + mut tls: Option<&TlsConfig>, cancel_map: &CancelMap, ) -> anyhow::Result>, auth::ClientCredentials)>> { // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); + let common_name = tls.and_then(|cfg| cfg.common_name.as_deref()); + let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; @@ -122,7 +124,9 @@ async fn handshake( if let Some(tls) = tls.take() { // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. 
- stream = PqStream::new(stream.into_inner().upgrade(tls).await?); + stream = PqStream::new( + stream.into_inner().upgrade(tls.to_server_config()).await?, + ); } } _ => bail!(ERR_PROTO_VIOLATION), @@ -143,15 +147,16 @@ async fn handshake( stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; } - // Here and forth: `or_else` demands that we use a future here - let mut creds: auth::ClientCredentials = async { params.try_into() } - .or_else(|e| stream.throw_error(e)) - .await?; + // Get SNI info when available + let sni_data = match stream.get_ref() { + Stream::Tls { tls } => tls.get_ref().1.sni_hostname().map(|s| s.to_owned()), + _ => None, + }; - // Set SNI info when available - if let Stream::Tls { tls } = stream.get_ref() { - creds.sni_data = tls.get_ref().1.sni_hostname().map(|s| s.to_owned()); - } + // Construct credentials + let creds = + auth::ClientCredentials::parse(params, sni_data.as_deref(), common_name); + let creds = async { creds }.or_else(|e| stream.throw_error(e)).await?; break Ok(Some((stream, creds))); } @@ -264,12 +269,13 @@ mod tests { } /// Generate TLS certificates and build rustls configs for client and server. - fn generate_tls_config( - hostname: &str, - ) -> anyhow::Result<(ClientConfig<'_>, Arc)> { + fn generate_tls_config<'a>( + hostname: &'a str, + common_name: &'a str, + ) -> anyhow::Result<(ClientConfig<'a>, TlsConfig)> { let (ca, cert, key) = generate_certs(hostname)?; - let server_config = { + let tls_config = { let config = rustls::ServerConfig::builder() .with_safe_defaults() .with_no_client_auth() @@ -291,7 +297,12 @@ mod tests { ClientConfig { config, hostname } }; - Ok((client_config, server_config)) + let tls_config = TlsConfig { + config: tls_config, + common_name: Some(common_name.to_string()), + }; + + Ok((client_config, tls_config)) } #[async_trait] @@ -346,7 +357,7 @@ mod tests { auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let cancel_map = CancelMap::default(); - let (mut stream, _creds) = handshake(client, tls, &cancel_map) + let (mut stream, _creds) = handshake(client, tls.as_ref(), &cancel_map) .await? 
.context("handshake failed")?; @@ -365,7 +376,8 @@ mod tests { async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let (_, server_config) = generate_tls_config("localhost")?; + let (_, server_config) = + generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); let client_err = tokio_postgres::Config::new() @@ -393,7 +405,8 @@ mod tests { async fn handshake_tls() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let (client_config, server_config) = generate_tls_config("localhost")?; + let (client_config, server_config) = + generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); let (_client, _conn) = tokio_postgres::Config::new() @@ -415,6 +428,7 @@ mod tests { let (_client, _conn) = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") + .options("project=generic-project-name") .ssl_mode(SslMode::Prefer) .connect_raw(server, NoTls) .await?; @@ -476,7 +490,8 @@ mod tests { async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let (client_config, server_config) = generate_tls_config("localhost")?; + let (client_config, server_config) = + generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), @@ -498,7 +513,8 @@ mod tests { async fn scram_auth_mock() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let (client_config, server_config) = generate_tls_config("localhost")?; + let (client_config, server_config) = + generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py index a6f828f829..ebeede8df7 100644 --- a/test_runner/batch_others/test_proxy.py +++ b/test_runner/batch_others/test_proxy.py @@ -2,7 +2,7 @@ import pytest def test_proxy_select_1(static_proxy): - static_proxy.safe_psql("select 1;") + static_proxy.safe_psql("select 1;", options="project=generic-project-name") # Pass extra options to the server. From 7c49abe7d1102edc99dd5f490dd10d002d77f74a Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 2 May 2022 23:28:54 +0300 Subject: [PATCH 0435/1022] Rework etcd timeline updates and their handling --- libs/etcd_broker/src/lib.rs | 100 +- pageserver/src/walreceiver.rs | 1087 ++-------------- .../src/walreceiver/connection_manager.rs | 1133 +++++++++++++++++ ...n_handler.rs => walreceiver_connection.rs} | 104 +- safekeeper/src/broker.rs | 13 +- 5 files changed, 1322 insertions(+), 1115 deletions(-) create mode 100644 pageserver/src/walreceiver/connection_manager.rs rename pageserver/src/walreceiver/{connection_handler.rs => walreceiver_connection.rs} (78%) diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 38d4a403c2..8f698977a9 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -6,17 +6,13 @@ pub mod subscription_key; /// All broker values, possible to use when dealing with etcd. 
pub mod subscription_value; -use std::{ - collections::{hash_map, HashMap}, - str::FromStr, -}; +use std::str::FromStr; use serde::de::DeserializeOwned; use subscription_key::SubscriptionKey; use tokio::{sync::mpsc, task::JoinHandle}; use tracing::*; -use utils::zid::{NodeId, ZTenantTimelineId}; use crate::subscription_key::SubscriptionFullKey; @@ -28,18 +24,17 @@ pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon"; /// A way to control the data retrieval from a certain subscription. pub struct BrokerSubscription { - value_updates: mpsc::UnboundedReceiver>>, + /// An unbounded channel to fetch the relevant etcd updates from. + pub value_updates: mpsc::UnboundedReceiver>, key: SubscriptionKey, - watcher_handle: JoinHandle>, + /// A subscription task handle, to allow waiting on it for the task to complete. + /// Both the updates channel and the handle require `&mut`, so it's better to keep + /// both `pub` to allow using both in the same structures without borrow checker complaining. + pub watcher_handle: JoinHandle>, watcher: Watcher, } impl BrokerSubscription { - /// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet. - pub async fn fetch_data(&mut self) -> Option>> { - self.value_updates.recv().await - } - /// Cancels the subscription, stopping the data poller and waiting for it to shut down. pub async fn cancel(mut self) -> Result<(), BrokerError> { self.watcher.cancel().await.map_err(|e| { @@ -48,15 +43,41 @@ impl BrokerSubscription { format!("Failed to cancel broker subscription, kind: {:?}", self.key), ) })?; - self.watcher_handle.await.map_err(|e| { - BrokerError::InternalError(format!( - "Failed to join the broker value updates task, kind: {:?}, error: {e}", - self.key - )) - })? + match (&mut self.watcher_handle).await { + Ok(res) => res, + Err(e) => { + if e.is_cancelled() { + // don't error on the tasks that are cancelled already + Ok(()) + } else { + Err(BrokerError::InternalError(format!( + "Panicked during broker subscription task, kind: {:?}, error: {e}", + self.key + ))) + } + } + } } } +impl Drop for BrokerSubscription { + fn drop(&mut self) { + // we poll data from etcd into the channel in the same struct, so if the whole struct gets dropped, + // no more data is used by the receiver and it's safe to cancel and drop the whole etcd subscription task. + self.watcher_handle.abort(); + } +} + +/// An update from the etcd broker. +pub struct BrokerUpdate { + /// Etcd generation version, the bigger the more actual the data is. + pub etcd_version: i64, + /// Etcd key for the corresponding value, parsed from the broker KV. + pub key: SubscriptionFullKey, + /// Current etcd value, parsed from the broker KV. + pub value: V, +} + #[derive(Debug, thiserror::Error)] pub enum BrokerError { #[error("Etcd client error: {0}. Context: {1}")] @@ -124,41 +145,21 @@ where break; } - let mut value_updates: HashMap> = HashMap::new(); - // Keep track that the timeline data updates from etcd arrive in the right order. - // https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas - // > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering. 
- let mut value_etcd_versions: HashMap = HashMap::new(); - - let events = resp.events(); debug!("Processing {} events", events.len()); for event in events { if EventType::Put == event.event_type() { if let Some(new_etcd_kv) = event.kv() { - let new_kv_version = new_etcd_kv.version(); - match parse_etcd_kv(new_etcd_kv, &value_parser, &key.cluster_prefix) { - Ok(Some((key, value))) => match value_updates - .entry(key.id) - .or_default() - .entry(key.node_id) - { - hash_map::Entry::Occupied(mut o) => { - let old_etcd_kv_version = value_etcd_versions.get(&key.id).copied().unwrap_or(i64::MIN); - if old_etcd_kv_version < new_kv_version { - o.insert(value); - value_etcd_versions.insert(key.id,new_kv_version); - } else { - debug!("Skipping etcd timeline update due to older version compared to one that's already stored"); - } - } - hash_map::Entry::Vacant(v) => { - v.insert(value); - value_etcd_versions.insert(key.id,new_kv_version); - } - }, + Ok(Some((key, value))) => if let Err(e) = value_updates_sender.send(BrokerUpdate { + etcd_version: new_etcd_kv.version(), + key, + value, + }) { + info!("Broker value updates for key {key:?} sender got dropped, exiting: {e}"); + break; + }, Ok(None) => debug!("Ignoring key {key:?} : no value was returned by the parser"), Err(BrokerError::KeyNotParsed(e)) => debug!("Unexpected key {key:?} for timeline update: {e}"), Err(e) => error!("Failed to represent etcd KV {new_etcd_kv:?}: {e}"), @@ -166,13 +167,6 @@ where } } } - - if !value_updates.is_empty() { - if let Err(e) = value_updates_sender.send(value_updates) { - info!("Broker value updates for key {key:?} sender got dropped, exiting: {e}"); - break; - } - } } Ok(()) diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 82401e1d8c..fd9468a101 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -15,66 +15,38 @@ //! //! * handle the actual connection and WAL streaming //! -//! Handle happens dynamically, by portions of WAL being processed and registered in the server. +//! Handling happens dynamically, by portions of WAL being processed and registered in the server. //! Along with the registration, certain metadata is written to show WAL streaming progress and rely on that when considering safekeepers for connection. //! -//! ## Implementation details -//! -//! WAL receiver's implementation consists of 3 kinds of nested loops, separately handling the logic from the bullets above: -//! -//! * [`init_wal_receiver_main_thread`], a wal receiver main thread, containing the control async loop: timeline addition/removal and interruption of a whole thread handling. -//! The loop is infallible, always trying to continue with the new tasks, the only place where it can fail is its initialization. -//! All of the code inside the loop is either async or a spawn_blocking wrapper around the sync code. -//! -//! * [`timeline_wal_broker_loop_step`], a broker task, handling the etcd broker subscription and polling, safekeeper selection logic and [re]connects. -//! On every concequent broker/wal streamer connection attempt, the loop steps are forced to wait for some time before running, -//! increasing with the number of attempts (capped with some fixed value). -//! This is done endlessly, to ensure we don't miss the WAL streaming when it gets available on one of the safekeepers. -//! -//! Apart from the broker management, it keeps the wal streaming connection open, with the safekeeper having the most advanced timeline state. -//! 
The connection could be closed from safekeeper side (with error or not), could be cancelled from pageserver side from time to time. -//! -//! * [`connection_handler::handle_walreceiver_connection`], a wal streaming task, opening the libpq connection and reading the data out of it to the end. -//! Does periodic reporting of the progress, to share some of the data via external HTTP API and to ensure we're able to switch connections when needed. -//! -//! Every task is cancellable via its separate cancellation channel, -//! also every such task's dependency (broker subscription or the data source channel) cancellation/drop triggers the corresponding task cancellation either. +//! The current module contains high-level primitives used in the submodules; general synchronization, timeline acknowledgement and shutdown logic. -mod connection_handler; +mod connection_manager; +mod walreceiver_connection; -use crate::config::PageServerConf; -use crate::http::models::WalReceiverEntry; -use crate::repository::Timeline; -use crate::tenant_mgr::{self, LocalTimelineUpdate, TenantState}; -use crate::thread_mgr::ThreadKind; -use crate::{thread_mgr, DatadirTimelineImpl}; use anyhow::{ensure, Context}; -use chrono::{NaiveDateTime, Utc}; -use etcd_broker::{ - subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription, - Client, -}; +use etcd_broker::Client; use itertools::Itertools; use once_cell::sync::Lazy; use std::cell::Cell; use std::collections::{hash_map, HashMap, HashSet}; +use std::future::Future; use std::num::NonZeroU64; -use std::ops::ControlFlow; use std::sync::Arc; use std::thread_local; use std::time::Duration; -use tokio::select; use tokio::{ + select, sync::{mpsc, watch, RwLock}, task::JoinHandle, }; use tracing::*; use url::Url; -use utils::lsn::Lsn; -use utils::pq_proto::ReplicationFeedback; -use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}; -use self::connection_handler::{WalConnectionEvent, WalReceiverConnection}; +use crate::config::PageServerConf; +use crate::http::models::WalReceiverEntry; +use crate::tenant_mgr::{self, LocalTimelineUpdate, TenantState}; +use crate::thread_mgr::{self, ThreadKind}; +use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; thread_local! 
{ // Boolean that is true only for WAL receiver threads @@ -125,7 +97,7 @@ pub fn init_wal_receiver_main_thread( .build() .context("Failed to create storage sync runtime")?; let etcd_client = runtime - .block_on(etcd_broker::Client::connect(etcd_endpoints, None)) + .block_on(Client::connect(etcd_endpoints, None)) .context("Failed to connect to etcd")?; thread_mgr::spawn( @@ -162,6 +134,97 @@ pub fn init_wal_receiver_main_thread( .context("Failed to spawn wal receiver main thread") } +async fn shutdown_all_wal_connections( + local_timeline_wal_receivers: &mut HashMap>>, +) { + info!("Shutting down all WAL connections"); + let mut broker_join_handles = Vec::new(); + for (tenant_id, timelines) in local_timeline_wal_receivers.drain() { + for (timeline_id, handles) in timelines { + handles.cancellation.send(()).ok(); + broker_join_handles.push(( + ZTenantTimelineId::new(tenant_id, timeline_id), + handles.handle, + )); + } + } + + let mut tenants = HashSet::with_capacity(broker_join_handles.len()); + for (id, broker_join_handle) in broker_join_handles { + tenants.insert(id.tenant_id); + debug!("Waiting for wal broker for timeline {id} to finish"); + if let Err(e) = broker_join_handle.await { + error!("Failed to join on wal broker for timeline {id}: {e}"); + } + } + if let Err(e) = tokio::task::spawn_blocking(move || { + for tenant_id in tenants { + if let Err(e) = tenant_mgr::set_tenant_state(tenant_id, TenantState::Idle) { + error!("Failed to make tenant {tenant_id} idle: {e:?}"); + } + } + }) + .await + { + error!("Failed to await a task to make all tenants idle: {e:?}"); + } +} + +/// A handle of an asynchronous task. +/// The task has a channel that it can use to communicate its lifecycle events in a certain form, see [`TaskEvent`] +/// and a cancellation channel that it can listen to for earlier interrupts. +/// +/// Note that the communication happens via the `watch` channel, that does not accumulate the events, replacing the old one with the never one on submission. +/// That may lead to certain events not being observed by the listener. +#[derive(Debug)] +struct TaskHandle { + handle: JoinHandle<()>, + events_receiver: watch::Receiver>, + cancellation: watch::Sender<()>, +} + +#[derive(Debug, Clone)] +pub enum TaskEvent { + Started, + NewEvent(E), + End(Result<(), String>), +} + +impl TaskHandle { + /// Initializes the task, starting it immediately after the creation. + pub fn spawn( + task: impl FnOnce(Arc>>, watch::Receiver<()>) -> Fut + Send + 'static, + ) -> Self + where + Fut: Future> + Send, + E: Sync + Send + 'static, + { + let (cancellation, cancellation_receiver) = watch::channel(()); + let (events_sender, events_receiver) = watch::channel(TaskEvent::Started); + let events_sender = Arc::new(events_sender); + + let sender = Arc::clone(&events_sender); + let handle = tokio::task::spawn(async move { + let task_result = task(sender, cancellation_receiver).await; + events_sender.send(TaskEvent::End(task_result)).ok(); + }); + + TaskHandle { + handle, + events_receiver, + cancellation, + } + } + + /// Aborts current task, waiting for it to finish. + async fn shutdown(self) { + self.cancellation.send(()).ok(); + if let Err(e) = self.handle.await { + error!("Task failed to shut down: {e}") + } + } +} + /// A step to process timeline attach/detach events to enable/disable the corresponding WAL receiver machinery. /// In addition to WAL streaming management, the step ensures that corresponding tenant has its service threads enabled or disabled. 
/// This is done here, since only walreceiver knows when a certain tenant has no streaming enabled. @@ -171,10 +234,7 @@ async fn wal_receiver_main_thread_loop_step<'a>( broker_prefix: &'a str, etcd_client: &'a Client, timeline_updates_receiver: &'a mut mpsc::UnboundedReceiver, - local_timeline_wal_receivers: &'a mut HashMap< - ZTenantId, - HashMap, - >, + local_timeline_wal_receivers: &'a mut HashMap>>, ) { // Only react on updates from [`tenant_mgr`] on local timeline attach/detach. match timeline_updates_receiver.recv().await { @@ -185,13 +245,8 @@ async fn wal_receiver_main_thread_loop_step<'a>( LocalTimelineUpdate::Detach(id) => { match local_timeline_wal_receivers.get_mut(&id.tenant_id) { Some(wal_receivers) => { - if let hash_map::Entry::Occupied(mut o) = wal_receivers.entry(id.timeline_id) { - if let Err(e) = o.get_mut().shutdown(id).await { - error!("Failed to shut down timeline {id} wal receiver handle: {e:#}"); - return; - } else { - o.remove(); - } + if let hash_map::Entry::Occupied(o) = wal_receivers.entry(id.timeline_id) { + o.remove().shutdown().await } if wal_receivers.is_empty() { if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Idle).await { @@ -207,11 +262,11 @@ async fn wal_receiver_main_thread_loop_step<'a>( } // Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly. LocalTimelineUpdate::Attach(new_id, new_timeline) => { - let timelines = local_timeline_wal_receivers + let timeline_connection_managers = local_timeline_wal_receivers .entry(new_id.tenant_id) .or_default(); - if timelines.is_empty() { + if timeline_connection_managers.is_empty() { if let Err(e) = change_tenant_state(new_id.tenant_id, TenantState::Active).await { @@ -220,13 +275,14 @@ async fn wal_receiver_main_thread_loop_step<'a>( } } - let vacant_timeline_entry = match timelines.entry(new_id.timeline_id) { - hash_map::Entry::Occupied(_) => { - debug!("Attepted to readd an existing timeline {new_id}, ignoring"); - return; - } - hash_map::Entry::Vacant(v) => v, - }; + let vacant_connection_manager_entry = + match timeline_connection_managers.entry(new_id.timeline_id) { + hash_map::Entry::Occupied(_) => { + debug!("Attepted to readd an existing timeline {new_id}, ignoring"); + return; + } + hash_map::Entry::Vacant(v) => v, + }; let (wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag) = match fetch_tenant_settings(new_id.tenant_id).await { @@ -248,48 +304,17 @@ async fn wal_receiver_main_thread_loop_step<'a>( ); } - let (cancellation_sender, mut cancellation_receiver) = watch::channel(()); - let mut wal_connection_manager = WalConnectionManager { - id: new_id, - timeline: Arc::clone(&new_timeline), - wal_connect_timeout, - lagging_wal_timeout, - max_lsn_wal_lag, - wal_connection_data: None, - wal_connection_attempt: 0, - }; - - let broker_prefix = broker_prefix.to_string(); - let mut loop_client = etcd_client.clone(); - let broker_join_handle = tokio::spawn(async move { - info!("WAL receiver broker started, connecting to etcd"); - let mut cancellation = cancellation_receiver.clone(); - loop { - select! 
{ - _ = cancellation.changed() => { - info!("Wal broker loop cancelled, shutting down"); - break; - }, - step_result = timeline_wal_broker_loop_step( - &broker_prefix, - &mut loop_client, - &mut wal_connection_manager, - &mut cancellation_receiver, - ) => match step_result { - Ok(ControlFlow::Break(())) => { - break; - } - Ok(ControlFlow::Continue(())) => {} - Err(e) => warn!("Error during wal receiver main thread step for timeline {new_id}: {e:#}"), - } - } - } - }.instrument(info_span!("timeline", id = %new_id))); - - vacant_timeline_entry.insert(TimelineWalBrokerLoopHandle { - broker_join_handle, - cancellation_sender, - }); + vacant_connection_manager_entry.insert( + connection_manager::spawn_connection_manager_task( + new_id, + broker_prefix.to_owned(), + etcd_client.clone(), + new_timeline, + wal_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + ), + ); } } } @@ -324,859 +349,3 @@ async fn change_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> an .await .with_context(|| format!("Failed to spawn activation task for tenant {tenant_id}"))? } - -async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) { - if n == 0 { - return; - } - let seconds_to_wait = base.powf(f64::from(n) - 1.0).min(max_seconds); - info!("Backoff: waiting {seconds_to_wait} seconds before proceeding with the task"); - tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; -} - -async fn shutdown_all_wal_connections( - local_timeline_wal_receivers: &mut HashMap< - ZTenantId, - HashMap, - >, -) { - info!("Shutting down all WAL connections"); - let mut broker_join_handles = Vec::new(); - for (tenant_id, timelines) in local_timeline_wal_receivers.drain() { - for (timeline_id, handles) in timelines { - handles.cancellation_sender.send(()).ok(); - broker_join_handles.push(( - ZTenantTimelineId::new(tenant_id, timeline_id), - handles.broker_join_handle, - )); - } - } - - let mut tenants = HashSet::with_capacity(broker_join_handles.len()); - for (id, broker_join_handle) in broker_join_handles { - tenants.insert(id.tenant_id); - debug!("Waiting for wal broker for timeline {id} to finish"); - if let Err(e) = broker_join_handle.await { - error!("Failed to join on wal broker for timeline {id}: {e}"); - } - } - if let Err(e) = tokio::task::spawn_blocking(move || { - for tenant_id in tenants { - if let Err(e) = tenant_mgr::set_tenant_state(tenant_id, TenantState::Idle) { - error!("Failed to make tenant {tenant_id} idle: {e:?}"); - } - } - }) - .await - { - error!("Failed to spawn a task to make all tenants idle: {e:?}"); - } -} - -/// Broker WAL loop handle to cancel the loop safely when needed. -struct TimelineWalBrokerLoopHandle { - broker_join_handle: JoinHandle<()>, - cancellation_sender: watch::Sender<()>, -} - -impl TimelineWalBrokerLoopHandle { - /// Stops the broker loop, waiting for its current task to finish. - async fn shutdown(&mut self, id: ZTenantTimelineId) -> anyhow::Result<()> { - self.cancellation_sender.send(()).context( - "Unexpected: cancellation sender is dropped before the receiver in the loop is", - )?; - debug!("Waiting for wal receiver for timeline {id} to finish"); - let handle = &mut self.broker_join_handle; - handle - .await - .with_context(|| format!("Failed to join the wal reveiver broker for timeline {id}")) - } -} - -/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. -/// Based on the updates, desides whether to start, keep or stop a WAL receiver task. 
-async fn timeline_wal_broker_loop_step( - broker_prefix: &str, - etcd_client: &mut Client, - wal_connection_manager: &mut WalConnectionManager, - cancellation: &mut watch::Receiver<()>, -) -> anyhow::Result> { - let id = wal_connection_manager.id; - - // Endlessly try to subscribe for broker updates for a given timeline. - // If there are no safekeepers to maintain the lease, the timeline subscription will be inavailable in the broker and the operation will fail constantly. - // This is ok, pageservers should anyway try subscribing (with some backoff) since it's the only way they can get the timeline WAL anyway. - let mut broker_subscription: BrokerSubscription; - let mut attempt = 0; - loop { - select! { - _ = cancellation.changed() => { - info!("Subscription backoff cancelled, shutting down"); - return Ok(ControlFlow::Break(())); - }, - _ = exponential_backoff(attempt, 2.0, 60.0) => {}, - } - attempt += 1; - - select! { - _ = cancellation.changed() => { - info!("Broker subscription loop cancelled, shutting down"); - return Ok(ControlFlow::Break(())); - }, - new_subscription = etcd_broker::subscribe_for_json_values( - etcd_client, - SubscriptionKey::sk_timeline_info(broker_prefix.to_owned(), id), - ) - .instrument(info_span!("etcd_subscription")) => match new_subscription { - Ok(new_subscription) => { - broker_subscription = new_subscription; - break; - } - Err(e) => { - warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in etcd: {e:#}"); - continue; - } - }, - - } - } - - info!("Subscribed for etcd timeline changes, considering walreceiver connections"); - - loop { - select! { - // the order of the polls is especially important here, since the first task to complete gets selected and the others get dropped (cancelled). - // place more frequetly updated tasks below to ensure the "slow" tasks are also reacted to. - biased; - // first, the cancellations are checked, to ensure we exit eagerly - _ = cancellation.changed() => { - info!("Broker loop cancelled, shutting down"); - break; - } - // then, we check for new events from the WAL connection: the existing connection should either return some progress data, - // or block, allowing other tasks in this `select!` to run first. - // - // We set a "timebomb" in the polling method, that waits long enough and cancels the entire loop if nothing happens during the wait. - // The wait is only initiated when no data (or a "channel closed" data) is received from the loop, ending with the break flow return. - // While waiting, more broker events are expected to be retrieved from etcd (currently, every safekeeper posts ~1 message/second). - // The timebomb ensures that we don't get stuck for too long on any of the WAL/etcd event polling, rather restarting the subscription entirely. - // - // We cannot return here eagerly on no WAL task data, since the result will get selected to early, not allowing etcd tasks to be polled properly. - // We cannot move etcd tasks above this select, since they are very frequent to finish and WAL events might get ignored. - // We need WAL events to periodically update the external data, so we cannot simply await the task result on the handler here. 
- wal_receiver_poll_result = wal_connection_manager.poll_connection_event_or_cancel() => match wal_receiver_poll_result { - ControlFlow::Break(()) => break, - ControlFlow::Continue(()) => {}, - }, - // finally, if no other tasks are completed, get another broker update and possibly reconnect - updates = broker_subscription.fetch_data() => match updates { - Some(mut all_timeline_updates) => { - match all_timeline_updates.remove(&id) { - Some(subscribed_timeline_updates) => { - match wal_connection_manager.select_connection_candidate(subscribed_timeline_updates) { - Some(candidate) => { - info!("Switching to different safekeeper {} for timeline {id}, reason: {:?}", candidate.safekeeper_id, candidate.reason); - wal_connection_manager.change_connection(candidate.safekeeper_id, candidate.wal_producer_connstr).await; - }, - None => debug!("No connection candidate was selected for timeline"), - } - } - // XXX: If we subscribe for a certain timeline, we expect only its data to come. - // But somebody could propagate a new etcd key, that has the same prefix as the subscribed one, then we'll get odd data. - // This is an error, we don't want to have overlapping prefixes for timelines, but we can complain and thow those away instead of panicking, - // since the next poll might bring the correct data. - None => error!("Timeline has an active broker subscription, but got no updates. Other data length: {}", all_timeline_updates.len()), - } - }, - None => { - info!("Subscription source end was dropped, no more updates are possible, shutting down"); - break; - }, - }, - } - } - - info!("Waiting for the current connection to close"); - wal_connection_manager.close_connection().await; - broker_subscription - .cancel() - .await - .with_context(|| format!("Failed to cancel timeline {id} subscription in etcd"))?; - Ok(ControlFlow::Continue(())) -} - -/// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. -struct WalConnectionManager { - id: ZTenantTimelineId, - timeline: Arc, - wal_connect_timeout: Duration, - lagging_wal_timeout: Duration, - max_lsn_wal_lag: NonZeroU64, - wal_connection_attempt: u32, - wal_connection_data: Option, -} - -#[derive(Debug)] -struct WalConnectionData { - safekeeper_id: NodeId, - connection: WalReceiverConnection, - connection_init_time: NaiveDateTime, - last_wal_receiver_data: Option<(ReplicationFeedback, NaiveDateTime)>, -} - -#[derive(Debug, PartialEq, Eq)] -struct NewWalConnectionCandidate { - safekeeper_id: NodeId, - wal_producer_connstr: String, - reason: ReconnectReason, -} - -/// Stores the reason why WAL connection was switched, for furter debugging purposes. -#[derive(Debug, PartialEq, Eq)] -enum ReconnectReason { - NoExistingConnection, - LaggingWal { - current_lsn: Lsn, - new_lsn: Lsn, - threshold: NonZeroU64, - }, - NoWalTimeout { - last_wal_interaction: NaiveDateTime, - check_time: NaiveDateTime, - threshold: Duration, - }, -} - -impl WalConnectionManager { - /// Tries to get more data from the WAL connection. - /// If the WAL connection channel is dropped or no data is retrieved, a "timebomb" future is started to break the existing broker subscription. - /// This future is intended to be used in the `select!` loop, so lengthy future normally gets dropped due to other futures completing. - /// If not, it's better to cancel the entire "stuck" loop and start over. 
- async fn poll_connection_event_or_cancel(&mut self) -> ControlFlow<(), ()> { - let (connection_data, wal_receiver_event) = match self.wal_connection_data.as_mut() { - Some(connection_data) => match connection_data.connection.next_event().await { - Some(event) => (connection_data, event), - None => { - warn!("WAL receiver event source stopped sending messages, waiting for other events to arrive"); - tokio::time::sleep(Duration::from_secs(30)).await; - warn!("WAL receiver without a connection spent sleeping 30s without being interrupted, aborting the loop"); - return ControlFlow::Break(()); - } - }, - None => { - tokio::time::sleep(Duration::from_secs(30)).await; - warn!("WAL receiver without a connection spent sleeping 30s without being interrupted, aborting the loop"); - return ControlFlow::Break(()); - } - }; - - match wal_receiver_event { - WalConnectionEvent::Started => { - self.wal_connection_attempt = 0; - } - WalConnectionEvent::NewWal(new_wal_data) => { - self.wal_connection_attempt = 0; - connection_data.last_wal_receiver_data = - Some((new_wal_data, Utc::now().naive_utc())); - } - WalConnectionEvent::End(wal_receiver_result) => { - match wal_receiver_result { - Ok(()) => { - info!("WAL receiver task finished, reconnecting"); - self.wal_connection_attempt = 0; - } - Err(e) => { - error!("WAL receiver task failed: {e:#}, reconnecting"); - self.wal_connection_attempt += 1; - } - } - self.close_connection().await; - } - } - - ControlFlow::Continue(()) - } - - /// Shuts down current connection (if any), waiting for it to finish. - async fn close_connection(&mut self) { - if let Some(data) = self.wal_connection_data.as_mut() { - match data.connection.shutdown().await { - Err(e) => { - error!("Failed to shutdown wal receiver connection: {e:#}"); - } - Ok(()) => self.wal_connection_data = None, - } - } - } - - /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. - async fn change_connection( - &mut self, - new_safekeeper_id: NodeId, - new_wal_producer_connstr: String, - ) { - self.close_connection().await; - self.wal_connection_data = Some(WalConnectionData { - safekeeper_id: new_safekeeper_id, - connection: WalReceiverConnection::open( - self.id, - new_safekeeper_id, - new_wal_producer_connstr, - self.wal_connect_timeout, - ), - connection_init_time: Utc::now().naive_utc(), - last_wal_receiver_data: None, - }); - } - - /// Checks current state against every fetched safekeeper state of a given timeline. - /// Returns a new candidate, if the current state is somewhat lagging, or `None` otherwise. - /// The current rules for approving new candidates: - /// * pick from the input data from etcd for currently connected safekeeper (if any) - /// * out of the rest input entries, pick one with biggest `commit_lsn` that's after than pageserver's latest Lsn for the timeline - /// * if there's no such entry, no new candidate found, abort - /// * otherwise, check if etcd updates contain currently connected safekeeper - /// * if not, that means no WAL updates happened after certain time (either none since the connection time or none since the last event after the connection) - /// Reconnect if the time exceeds the threshold. - /// * if there's one, compare its Lsn with the other candidate's, reconnect if candidate's over threshold - /// - /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently. - /// Both thresholds are configured per tenant. 
- fn select_connection_candidate( - &self, - mut safekeeper_timelines: HashMap, - ) -> Option { - let current_sk_data_updated = - self.wal_connection_data - .as_ref() - .and_then(|connection_data| { - safekeeper_timelines.remove(&connection_data.safekeeper_id) - }); - - let candidate_sk_data = safekeeper_timelines - .iter() - .filter(|(_, info)| { - info.commit_lsn > Some(self.timeline.tline.get_last_record_lsn()) - }) - .filter_map(|(sk_id, info)| { - match wal_stream_connection_string( - self.id, - info.safekeeper_connstr.as_deref()?, - ) { - Ok(connstr) => Some((sk_id, info, connstr)), - Err(e) => { - error!("Failed to create wal receiver connection string from broker data of safekeeper node {sk_id}: {e:#}"); - None - } - } - }) - .max_by_key(|(_, info, _)| info.commit_lsn); - - match (current_sk_data_updated, candidate_sk_data) { - // No better candidate than one we're already connected to: - // whatever data update comes for the connected one, we don't have a better candidate - (_, None) => None, - - // No updates from the old SK in this batch, but some candidate is available: - // check how long time ago did we receive updates from the current SK, switch connections in case it's over the threshold - (None, Some((&new_sk_id, _, new_wal_producer_connstr))) => { - match self.wal_connection_data.as_ref() { - Some(current_connection) => { - let last_sk_interaction_time = - match current_connection.last_wal_receiver_data.as_ref() { - Some((_, data_submission_time)) => *data_submission_time, - None => current_connection.connection_init_time, - }; - - let now = Utc::now().naive_utc(); - match (now - last_sk_interaction_time).to_std() { - Ok(last_interaction) => { - if last_interaction > self.lagging_wal_timeout { - return Some(NewWalConnectionCandidate { - safekeeper_id: new_sk_id, - wal_producer_connstr: new_wal_producer_connstr, - reason: ReconnectReason::NoWalTimeout { - last_wal_interaction: last_sk_interaction_time, - check_time: now, - threshold: self.lagging_wal_timeout, - }, - }); - } - } - Err(_e) => { - warn!("Last interaction with safekeeper {} happened in the future, ignoring the candidate. 
Interaction time: {last_sk_interaction_time}, now: {now}", current_connection.safekeeper_id); - } - } - None - } - None => Some(NewWalConnectionCandidate { - safekeeper_id: new_sk_id, - wal_producer_connstr: new_wal_producer_connstr, - reason: ReconnectReason::NoExistingConnection, - }), - } - } - // Both current SK got updated via etcd and there's another candidate with suitable Lsn: - // check how bigger the new SK Lsn is in the future compared to the current SK, switch connections in case it's over the threshold - ( - Some(current_sk_timeline), - Some((&new_sk_id, new_sk_timeline, new_wal_producer_connstr)), - ) => { - let new_lsn = new_sk_timeline.commit_lsn.unwrap_or(Lsn(0)); - let current_lsn = current_sk_timeline.commit_lsn.unwrap_or(Lsn(0)); - match new_lsn.0.checked_sub(current_lsn.0) - { - Some(new_sk_lsn_advantage) => { - if new_sk_lsn_advantage >= self.max_lsn_wal_lag.get() { - return Some( - NewWalConnectionCandidate { - safekeeper_id: new_sk_id, - wal_producer_connstr: new_wal_producer_connstr, - reason: ReconnectReason::LaggingWal { current_lsn, new_lsn, threshold: self.max_lsn_wal_lag }, - }); - } - } - None => debug!("Best SK candidate has its commit Lsn behind the current timeline's latest consistent Lsn"), - } - - None - } - } - } -} - -fn wal_stream_connection_string( - ZTenantTimelineId { - tenant_id, - timeline_id, - }: ZTenantTimelineId, - listen_pg_addr_str: &str, -) -> anyhow::Result { - let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db"); - let me_conf = sk_connstr - .parse::() - .with_context(|| { - format!("Failed to parse pageserver connection string '{sk_connstr}' as a postgres one") - })?; - let (host, port) = utils::connstring::connection_host_port(&me_conf); - Ok(format!( - "host={host} port={port} options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'" - )) -} - -#[cfg(test)] -mod tests { - use std::time::SystemTime; - - use crate::repository::{ - repo_harness::{RepoHarness, TIMELINE_ID}, - Repository, - }; - - use super::*; - - #[test] - fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("no_connection_no_candidate")?; - let mut data_manager_with_no_connection = dummy_wal_connection_manager(&harness); - data_manager_with_no_connection.wal_connection_data = None; - - let no_candidate = - data_manager_with_no_connection.select_connection_candidate(HashMap::from([ - ( - NodeId(0), - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(1)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: None, - }, - ), - ( - NodeId(2), - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: None, - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some("no commit_lsn".to_string()), - }, - ), - ( - NodeId(3), - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: None, - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some("no commit_lsn".to_string()), - }, - ), - ])); - - assert!( - no_candidate.is_none(), - "Expected no candidate selected out of non full data options, but got {no_candidate:?}" - ); - - Ok(()) - } - - #[tokio::test] - async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("connection_no_candidate")?; - - let current_lsn = 100_000; - let connected_sk_id = NodeId(0); - let mut data_manager_with_connection = 
dummy_wal_connection_manager(&harness); - let mut dummy_connection_data = dummy_connection_data( - ZTenantTimelineId { - tenant_id: harness.tenant_id, - timeline_id: TIMELINE_ID, - }, - connected_sk_id, - ) - .await; - let now = Utc::now().naive_utc(); - dummy_connection_data.last_wal_receiver_data = Some(( - ReplicationFeedback { - current_timeline_size: 1, - ps_writelsn: 1, - ps_applylsn: current_lsn, - ps_flushlsn: 1, - ps_replytime: SystemTime::now(), - }, - now, - )); - dummy_connection_data.connection_init_time = now; - data_manager_with_connection.wal_connection_data = Some(dummy_connection_data); - - let no_candidate = - data_manager_with_connection.select_connection_candidate(HashMap::from([ - ( - connected_sk_id, - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn( - current_lsn + data_manager_with_connection.max_lsn_wal_lag.get() * 2 - )), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), - }, - ), - ( - NodeId(1), - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(current_lsn)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some("not advanced Lsn".to_string()), - }, - ), - ( - NodeId(2), - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn( - current_lsn + data_manager_with_connection.max_lsn_wal_lag.get() / 2 - )), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some("not enough advanced Lsn".to_string()), - }, - ), - ])); - - assert!( - no_candidate.is_none(), - "Expected no candidate selected out of valid options since candidate Lsn data is ignored and others' was not advanced enough, but got {no_candidate:?}" - ); - - Ok(()) - } - - #[test] - fn no_connection_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("no_connection_candidate")?; - let mut data_manager_with_no_connection = dummy_wal_connection_manager(&harness); - data_manager_with_no_connection.wal_connection_data = None; - - let only_candidate = data_manager_with_no_connection - .select_connection_candidate(HashMap::from([( - NodeId(0), - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(1 + data_manager_with_no_connection - .max_lsn_wal_lag - .get())), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), - }, - )])) - .expect("Expected one candidate selected out of the only data option, but got none"); - assert_eq!(only_candidate.safekeeper_id, NodeId(0)); - assert_eq!( - only_candidate.reason, - ReconnectReason::NoExistingConnection, - "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold" - ); - assert!(only_candidate - .wal_producer_connstr - .contains(DUMMY_SAFEKEEPER_CONNSTR)); - - let selected_lsn = 100_000; - let biggest_wal_candidate = data_manager_with_no_connection - .select_connection_candidate(HashMap::from([ - ( - NodeId(0), - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(selected_lsn - 100)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some("smaller commit_lsn".to_string()), - }, - ), - ( - NodeId(1), - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(selected_lsn)), - backup_lsn: None, - 
remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), - }, - ), - ( - NodeId(2), - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(selected_lsn + 100)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: None, - }, - ), - ])) - .expect( - "Expected one candidate selected out of multiple valid data options, but got none", - ); - - assert_eq!(biggest_wal_candidate.safekeeper_id, NodeId(1)); - assert_eq!( - biggest_wal_candidate.reason, - ReconnectReason::NoExistingConnection, - "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold" - ); - assert!(biggest_wal_candidate - .wal_producer_connstr - .contains(DUMMY_SAFEKEEPER_CONNSTR)); - - Ok(()) - } - - #[tokio::test] - async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("lsn_wal_over_threshcurrent_candidate")?; - let current_lsn = Lsn(100_000).align(); - - let id = ZTenantTimelineId { - tenant_id: harness.tenant_id, - timeline_id: TIMELINE_ID, - }; - - let mut data_manager_with_connection = dummy_wal_connection_manager(&harness); - let connected_sk_id = NodeId(0); - let mut dummy_connection_data = dummy_connection_data(id, connected_sk_id).await; - let lagging_wal_timeout = - chrono::Duration::from_std(data_manager_with_connection.lagging_wal_timeout)?; - let time_over_threshold = - Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout; - dummy_connection_data.last_wal_receiver_data = Some(( - ReplicationFeedback { - current_timeline_size: 1, - ps_writelsn: current_lsn.0, - ps_applylsn: 1, - ps_flushlsn: 1, - ps_replytime: SystemTime::now(), - }, - time_over_threshold, - )); - dummy_connection_data.connection_init_time = time_over_threshold; - data_manager_with_connection.wal_connection_data = Some(dummy_connection_data); - - let new_lsn = Lsn(current_lsn.0 + data_manager_with_connection.max_lsn_wal_lag.get() + 1); - let candidates = HashMap::from([ - ( - connected_sk_id, - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(current_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), - }, - ), - ( - NodeId(1), - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(new_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()), - }, - ), - ]); - - let over_threshcurrent_candidate = data_manager_with_connection - .select_connection_candidate(candidates) - .expect( - "Expected one candidate selected out of multiple valid data options, but got none", - ); - - assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(1)); - assert_eq!( - over_threshcurrent_candidate.reason, - ReconnectReason::LaggingWal { - current_lsn, - new_lsn, - threshold: data_manager_with_connection.max_lsn_wal_lag - }, - "Should select bigger WAL safekeeper if it starts to lag enough" - ); - assert!(over_threshcurrent_candidate - .wal_producer_connstr - .contains("advanced by Lsn safekeeper")); - - Ok(()) - } - - #[tokio::test] - async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("timeout_wal_over_threshhold_current_candidate")?; - let current_lsn = Lsn(100_000).align(); - - let id = 
ZTenantTimelineId { - tenant_id: harness.tenant_id, - timeline_id: TIMELINE_ID, - }; - - let mut data_manager_with_connection = dummy_wal_connection_manager(&harness); - let mut dummy_connection_data = dummy_connection_data(id, NodeId(1)).await; - let lagging_wal_timeout = - chrono::Duration::from_std(data_manager_with_connection.lagging_wal_timeout)?; - let time_over_threshold = - Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout; - dummy_connection_data.last_wal_receiver_data = None; - dummy_connection_data.connection_init_time = time_over_threshold; - data_manager_with_connection.wal_connection_data = Some(dummy_connection_data); - - let over_threshcurrent_candidate = data_manager_with_connection - .select_connection_candidate(HashMap::from([( - NodeId(0), - SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(current_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), - }, - )])) - .expect( - "Expected one candidate selected out of multiple valid data options, but got none", - ); - - assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0)); - match over_threshcurrent_candidate.reason { - ReconnectReason::NoWalTimeout { - last_wal_interaction, - threshold, - .. - } => { - assert_eq!(last_wal_interaction, time_over_threshold); - assert_eq!(threshold, data_manager_with_connection.lagging_wal_timeout); - } - unexpected => panic!("Unexpected reason: {unexpected:?}"), - } - assert!(over_threshcurrent_candidate - .wal_producer_connstr - .contains(DUMMY_SAFEKEEPER_CONNSTR)); - - Ok(()) - } - - fn dummy_wal_connection_manager(harness: &RepoHarness) -> WalConnectionManager { - WalConnectionManager { - id: ZTenantTimelineId { - tenant_id: harness.tenant_id, - timeline_id: TIMELINE_ID, - }, - timeline: Arc::new(DatadirTimelineImpl::new( - harness - .load() - .create_empty_timeline(TIMELINE_ID, Lsn(0)) - .expect("Failed to create an empty timeline for dummy wal connection manager"), - 10_000, - )), - wal_connect_timeout: Duration::from_secs(1), - lagging_wal_timeout: Duration::from_secs(10), - max_lsn_wal_lag: NonZeroU64::new(300_000).unwrap(), - wal_connection_attempt: 0, - wal_connection_data: None, - } - } - - const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr"; - - // the function itself does not need async, but it spawns a tokio::task underneath hence neeed - // a runtime to not to panic - async fn dummy_connection_data( - id: ZTenantTimelineId, - safekeeper_id: NodeId, - ) -> WalConnectionData { - let dummy_connstr = wal_stream_connection_string(id, DUMMY_SAFEKEEPER_CONNSTR) - .expect("Failed to construct dummy wal producer connstr"); - WalConnectionData { - safekeeper_id, - connection: WalReceiverConnection::open( - id, - safekeeper_id, - dummy_connstr, - Duration::from_secs(1), - ), - connection_init_time: Utc::now().naive_utc(), - last_wal_receiver_data: None, - } - } -} diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs new file mode 100644 index 0000000000..d5ca1d5159 --- /dev/null +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -0,0 +1,1133 @@ +//! WAL receiver logic that ensures the pageserver gets connectected to safekeeper, +//! that contains the latest WAL to stream and this connection does not go stale. +//! +//! To achieve that, a etcd broker is used: safekepers propagate their timelines' state in it, +//! 
the manager subscribes for changes and accumulates those to query the one with the biggest Lsn for connection. +//! Current connection state is tracked too, to ensure it's not getting stale. +//! +//! After every connection or etcd update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader, +//! then a [re]connection happens, if necessary. +//! Only WAL streaming task expects to be finished, other loops (etcd, connection management) never exit unless cancelled explicitly via the dedicated channel. + +use std::{ + collections::{hash_map, HashMap}, + num::NonZeroU64, + sync::Arc, + time::Duration, +}; + +use anyhow::Context; +use chrono::{DateTime, Local, NaiveDateTime, Utc}; +use etcd_broker::{ + subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription, + BrokerUpdate, Client, +}; +use tokio::select; +use tracing::*; + +use crate::DatadirTimelineImpl; +use utils::{ + lsn::Lsn, + pq_proto::ReplicationFeedback, + zid::{NodeId, ZTenantTimelineId}, +}; + +use super::{TaskEvent, TaskHandle}; + +/// Spawns the loop to take care of the timeline's WAL streaming connection. +pub(super) fn spawn_connection_manager_task( + id: ZTenantTimelineId, + broker_loop_prefix: String, + mut client: Client, + local_timeline: Arc, + wal_connect_timeout: Duration, + lagging_wal_timeout: Duration, + max_lsn_wal_lag: NonZeroU64, +) -> TaskHandle<()> { + TaskHandle::spawn(move |_, mut cancellation| { + async move { + info!("WAL receiver broker started, connecting to etcd"); + let mut walreceiver_state = WalreceiverState::new( + id, + local_timeline, + wal_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + ); + loop { + select! { + _ = cancellation.changed() => { + info!("Broker subscription init cancelled, shutting down"); + if let Some(wal_connection) = walreceiver_state.wal_connection.take() + { + wal_connection.connection_task.shutdown().await; + } + return Ok(()); + }, + + _ = connection_manager_loop_step( + &broker_loop_prefix, + &mut client, + &mut walreceiver_state, + ) => {}, + } + } + } + .instrument(info_span!("wal_connection_manager", id = %id)) + }) +} + +/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. +/// Based on the updates, desides whether to start, keep or stop a WAL receiver task. +/// If etcd subscription is cancelled, exits. +async fn connection_manager_loop_step( + broker_prefix: &str, + etcd_client: &mut Client, + walreceiver_state: &mut WalreceiverState, +) { + let id = walreceiver_state.id; + + // XXX: We never explicitly cancel etcd task, instead establishing one and never letting it go, + // running the entire loop step as much as possible to an end. + // The task removal happens implicitly on drop, both aborting the etcd subscription task and dropping the receiver channel end, + // forcing the etcd subscription to exit either way. + let mut broker_subscription = + subscribe_for_timeline_updates(etcd_client, broker_prefix, id).await; + info!("Subscribed for etcd timeline changes, waiting for new etcd data"); + + loop { + select! 
{ + broker_connection_result = &mut broker_subscription.watcher_handle => { + cleanup_broker_connection(broker_connection_result, walreceiver_state); + return; + }, + + Some(wal_connection_update) = async { + match walreceiver_state.wal_connection.as_mut() { + Some(wal_connection) => { + let receiver = &mut wal_connection.connection_task.events_receiver; + Some(match receiver.changed().await { + Ok(()) => receiver.borrow().clone(), + Err(_cancellation_error) => TaskEvent::End(Ok(())), + }) + } + None => None, + } + } => { + let (connection_update, reset_connection_attempts) = match &wal_connection_update { + TaskEvent::Started => (Some(Utc::now().naive_utc()), true), + TaskEvent::NewEvent(replication_feedback) => (Some(DateTime::::from(replication_feedback.ps_replytime).naive_utc()), true), + TaskEvent::End(end_result) => { + let should_reset_connection_attempts = match end_result { + Ok(()) => { + debug!("WAL receiving task finished"); + true + }, + Err(e) => { + warn!("WAL receiving task failed: {e}"); + false + }, + }; + walreceiver_state.wal_connection = None; + (None, should_reset_connection_attempts) + }, + }; + + if let Some(connection_update) = connection_update { + match &mut walreceiver_state.wal_connection { + Some(wal_connection) => { + wal_connection.latest_connection_update = connection_update; + + let attempts_entry = walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0); + if reset_connection_attempts { + *attempts_entry = 0; + } else { + *attempts_entry += 1; + } + }, + None => error!("Received connection update for WAL connection that is not active, update: {wal_connection_update:?}"), + } + } + }, + + broker_update = broker_subscription.value_updates.recv() => { + match broker_update { + Some(broker_update) => walreceiver_state.register_timeline_update(broker_update), + None => { + info!("Broker sender end was dropped, ending current broker loop step"); + // Ensure to cancel and wait for the broker subscription task end, to log its result. + // Broker sender end is in the broker subscription task and its drop means abnormal task completion. + // First, ensure that the task is stopped (abort can be done without errors on already stopped tasks and repeated multiple times). + broker_subscription.watcher_handle.abort(); + // Then, wait for the task to finish and print its result. If the task was finished before abort (which we assume in this abnormal case), + // a proper error message will be printed, otherwise an abortion message is printed which is ok, since we're signalled to finish anyway. + cleanup_broker_connection( + (&mut broker_subscription.watcher_handle).await, + walreceiver_state, + ); + return; + } + } + }, + } + + // Fetch more etcd timeline updates, but limit ourselves since they may arrive quickly. 
+ let mut max_events_to_poll = 100_u32; + while max_events_to_poll > 0 { + if let Ok(broker_update) = broker_subscription.value_updates.try_recv() { + walreceiver_state.register_timeline_update(broker_update); + max_events_to_poll -= 1; + } else { + break; + } + } + + if let Some(new_candidate) = walreceiver_state.next_connection_candidate() { + info!("Switching to new connection candidate: {new_candidate:?}"); + walreceiver_state + .change_connection( + new_candidate.safekeeper_id, + new_candidate.wal_producer_connstr, + ) + .await + } + } +} + +fn cleanup_broker_connection( + broker_connection_result: Result, tokio::task::JoinError>, + walreceiver_state: &mut WalreceiverState, +) { + match broker_connection_result { + Ok(Ok(())) => info!("Broker conneciton task finished, ending current broker loop step"), + Ok(Err(broker_error)) => warn!("Broker conneciton ended with error: {broker_error}"), + Err(abort_error) => { + if abort_error.is_panic() { + error!("Broker connection panicked: {abort_error}") + } else { + debug!("Broker connection aborted: {abort_error}") + } + } + } + + walreceiver_state.wal_stream_candidates.clear(); +} + +/// Endlessly try to subscribe for broker updates for a given timeline. +/// If there are no safekeepers to maintain the lease, the timeline subscription will be unavailable in the broker and the operation will fail constantly. +/// This is ok, pageservers should anyway try subscribing (with some backoff) since it's the only way they can get the timeline WAL anyway. +async fn subscribe_for_timeline_updates( + etcd_client: &mut Client, + broker_prefix: &str, + id: ZTenantTimelineId, +) -> BrokerSubscription { + let mut attempt = 0; + loop { + exponential_backoff( + attempt, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ) + .await; + attempt += 1; + + match etcd_broker::subscribe_for_json_values( + etcd_client, + SubscriptionKey::sk_timeline_info(broker_prefix.to_owned(), id), + ) + .instrument(info_span!("etcd_subscription")) + .await + { + Ok(new_subscription) => { + return new_subscription; + } + Err(e) => { + warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in etcd: {e:#}"); + continue; + } + } + } +} + +const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 2.0; +const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 60.0; + +async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) { + if n == 0 { + return; + } + let seconds_to_wait = base.powf(f64::from(n) - 1.0).min(max_seconds); + info!("Backoff: waiting {seconds_to_wait} seconds before proceeding with the task"); + tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; +} + +/// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. +struct WalreceiverState { + id: ZTenantTimelineId, + /// Use pageserver data about the timeline to filter out some of the safekeepers. + local_timeline: Arc, + /// The timeout on the connection to safekeeper for WAL streaming. + wal_connect_timeout: Duration, + /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. + lagging_wal_timeout: Duration, + /// The Lsn lag to use to determine when the current connection is lagging to much behind and reconnect to the other one. + max_lsn_wal_lag: NonZeroU64, + /// Current connection to safekeeper for WAL streaming. 
+ wal_connection: Option, + wal_connection_attempts: HashMap, + /// Data about all timelines, available for connection, fetched from etcd, grouped by their corresponding safekeeper node id. + wal_stream_candidates: HashMap, +} + +/// Current connection data. +#[derive(Debug)] +struct WalConnection { + /// Current safekeeper pageserver is connected to for WAL streaming. + sk_id: NodeId, + /// Connection task start time or the timestamp of a latest connection message received. + latest_connection_update: NaiveDateTime, + /// WAL streaming task handle. + connection_task: TaskHandle, +} + +/// Data about the timeline to connect to, received from etcd. +#[derive(Debug)] +struct EtcdSkTimeline { + timeline: SkTimelineInfo, + /// Etcd generation, the bigger it is, the more up to date the timeline data is. + etcd_version: i64, + /// Time at which the data was fetched from etcd last time, to track the stale data. + latest_update: NaiveDateTime, +} + +impl WalreceiverState { + fn new( + id: ZTenantTimelineId, + local_timeline: Arc, + wal_connect_timeout: Duration, + lagging_wal_timeout: Duration, + max_lsn_wal_lag: NonZeroU64, + ) -> Self { + Self { + id, + local_timeline, + wal_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + wal_connection: None, + wal_stream_candidates: HashMap::new(), + wal_connection_attempts: HashMap::new(), + } + } + + /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. + async fn change_connection(&mut self, new_sk_id: NodeId, new_wal_producer_connstr: String) { + if let Some(old_connection) = self.wal_connection.take() { + old_connection.connection_task.shutdown().await + } + + let id = self.id; + let connect_timeout = self.wal_connect_timeout; + let connection_attempt = self + .wal_connection_attempts + .get(&new_sk_id) + .copied() + .unwrap_or(0); + let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { + async move { + exponential_backoff( + connection_attempt, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ) + .await; + super::walreceiver_connection::handle_walreceiver_connection( + id, + &new_wal_producer_connstr, + events_sender.as_ref(), + cancellation, + connect_timeout, + ) + .await + .map_err(|e| format!("walreceiver connection handling failure: {e:#}")) + } + .instrument(info_span!("walreceiver_connection", id = %id)) + }); + + self.wal_connection = Some(WalConnection { + sk_id: new_sk_id, + latest_connection_update: Utc::now().naive_utc(), + connection_task: connection_handle, + }); + } + + /// Adds another etcd timeline into the state, if its more recent than the one already added there for the same key. + fn register_timeline_update(&mut self, timeline_update: BrokerUpdate) { + match self + .wal_stream_candidates + .entry(timeline_update.key.node_id) + { + hash_map::Entry::Occupied(mut o) => { + let existing_value = o.get_mut(); + if existing_value.etcd_version < timeline_update.etcd_version { + existing_value.etcd_version = timeline_update.etcd_version; + existing_value.timeline = timeline_update.value; + existing_value.latest_update = Utc::now().naive_utc(); + } + } + hash_map::Entry::Vacant(v) => { + v.insert(EtcdSkTimeline { + timeline: timeline_update.value, + etcd_version: timeline_update.etcd_version, + latest_update: Utc::now().naive_utc(), + }); + } + } + } + + /// Cleans up stale etcd records and checks the rest for the new connection candidate. 
+ /// Returns a new candidate, if the current state is absent or somewhat lagging, `None` otherwise. + /// The current rules for approving new candidates: + /// * pick from the input data from etcd for currently connected safekeeper (if any) + /// * out of the rest input entries, pick one with biggest `commit_lsn` that's after than pageserver's latest Lsn for the timeline + /// * if there's no such entry, no new candidate found, abort + /// * check the current connection time data for staleness, reconnect if stale + /// * otherwise, check if etcd updates contain currently connected safekeeper + /// * if not, that means no WAL updates happened after certain time (either none since the connection time or none since the last event after the connection) + /// Reconnect if the time exceeds the threshold. + /// * if there's one, compare its Lsn with the other candidate's, reconnect if candidate's over threshold + /// + /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently. + /// Both thresholds are configured per tenant. + fn next_connection_candidate(&mut self) -> Option { + self.cleanup_old_candidates(); + + match &self.wal_connection { + Some(existing_wal_connection) => { + let connected_sk_node = existing_wal_connection.sk_id; + + let (new_sk_id, new_safekeeper_etcd_data, new_wal_producer_connstr) = self + .applicable_connection_candidates() + .filter(|&(sk_id, _, _)| sk_id != connected_sk_node) + .max_by_key(|(_, info, _)| info.commit_lsn)?; + + let now = Utc::now().naive_utc(); + if let Ok(latest_interaciton) = + (now - existing_wal_connection.latest_connection_update).to_std() + { + if latest_interaciton > self.lagging_wal_timeout { + return Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_producer_connstr: new_wal_producer_connstr, + reason: ReconnectReason::NoWalTimeout { + last_wal_interaction: Some( + existing_wal_connection.latest_connection_update, + ), + check_time: now, + threshold: self.lagging_wal_timeout, + }, + }); + } + } + + match self.wal_stream_candidates.get(&connected_sk_node) { + Some(current_connection_etcd_data) => { + let new_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0)); + let current_lsn = current_connection_etcd_data + .timeline + .commit_lsn + .unwrap_or(Lsn(0)); + match new_lsn.0.checked_sub(current_lsn.0) + { + Some(new_sk_lsn_advantage) => { + if new_sk_lsn_advantage >= self.max_lsn_wal_lag.get() { + return Some( + NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_producer_connstr: new_wal_producer_connstr, + reason: ReconnectReason::LaggingWal { current_lsn, new_lsn, threshold: self.max_lsn_wal_lag }, + }); + } + } + None => debug!("Best SK candidate has its commit Lsn behind the current timeline's latest consistent Lsn"), + } + } + None => { + return Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_producer_connstr: new_wal_producer_connstr, + reason: ReconnectReason::NoEtcdDataForExistingConnection, + }) + } + } + } + None => { + let (new_sk_id, _, new_wal_producer_connstr) = self + .applicable_connection_candidates() + .max_by_key(|(_, info, _)| info.commit_lsn)?; + return Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_producer_connstr: new_wal_producer_connstr, + reason: ReconnectReason::NoExistingConnection, + }); + } + } + + None + } + + fn applicable_connection_candidates( + &self, + ) -> impl Iterator { + self.wal_stream_candidates + .iter() + .filter(|(_, etcd_info)| { + 
etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn()) + }) + .filter_map(|(sk_id, etcd_info)| { + let info = &etcd_info.timeline; + match wal_stream_connection_string( + self.id, + info.safekeeper_connstr.as_deref()?, + ) { + Ok(connstr) => Some((*sk_id, info, connstr)), + Err(e) => { + error!("Failed to create wal receiver connection string from broker data of safekeeper node {}: {e:#}", sk_id); + None + } + } + }) + } + + fn cleanup_old_candidates(&mut self) { + self.wal_stream_candidates.retain(|_, etcd_info| { + if let Ok(time_since_latest_etcd_update) = + (Utc::now().naive_utc() - etcd_info.latest_update).to_std() + { + time_since_latest_etcd_update < self.lagging_wal_timeout + } else { + true + } + }); + } +} + +#[derive(Debug, PartialEq, Eq)] +struct NewWalConnectionCandidate { + safekeeper_id: NodeId, + wal_producer_connstr: String, + reason: ReconnectReason, +} + +/// Stores the reason why WAL connection was switched, for furter debugging purposes. +#[derive(Debug, PartialEq, Eq)] +enum ReconnectReason { + NoExistingConnection, + NoEtcdDataForExistingConnection, + LaggingWal { + current_lsn: Lsn, + new_lsn: Lsn, + threshold: NonZeroU64, + }, + NoWalTimeout { + last_wal_interaction: Option, + check_time: NaiveDateTime, + threshold: Duration, + }, +} + +fn wal_stream_connection_string( + ZTenantTimelineId { + tenant_id, + timeline_id, + }: ZTenantTimelineId, + listen_pg_addr_str: &str, +) -> anyhow::Result { + let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db"); + let me_conf = sk_connstr + .parse::() + .with_context(|| { + format!("Failed to parse pageserver connection string '{sk_connstr}' as a postgres one") + })?; + let (host, port) = utils::connstring::connection_host_port(&me_conf); + Ok(format!( + "host={host} port={port} options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'" + )) +} + +#[cfg(test)] +mod tests { + use std::time::SystemTime; + + use crate::repository::{ + repo_harness::{RepoHarness, TIMELINE_ID}, + Repository, + }; + + use super::*; + + #[test] + fn no_connection_no_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("no_connection_no_candidate")?; + let mut state = dummy_state(&harness); + let now = Utc::now().naive_utc(); + + let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?; + let delay_over_threshold = now - lagging_wal_timeout - lagging_wal_timeout; + + state.wal_connection = None; + state.wal_stream_candidates = HashMap::from([ + ( + NodeId(0), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(1)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: None, + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(1), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: None, + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("no commit_lsn".to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(2), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: None, + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("no commit_lsn".to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(3), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + 
commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: delay_over_threshold, + }, + ), + ]); + + let no_candidate = state.next_connection_candidate(); + assert!( + no_candidate.is_none(), + "Expected no candidate selected out of non full data options, but got {no_candidate:?}" + ); + + Ok(()) + } + + #[tokio::test] + async fn connection_no_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("connection_no_candidate")?; + let mut state = dummy_state(&harness); + let now = Utc::now().naive_utc(); + + let connected_sk_id = NodeId(0); + let current_lsn = 100_000; + + state.max_lsn_wal_lag = NonZeroU64::new(100).unwrap(); + state.wal_connection = Some(WalConnection { + sk_id: connected_sk_id, + latest_connection_update: now, + connection_task: TaskHandle::spawn(move |sender, _| async move { + sender + .send(TaskEvent::NewEvent(ReplicationFeedback { + current_timeline_size: 1, + ps_writelsn: 1, + ps_applylsn: current_lsn, + ps_flushlsn: 1, + ps_replytime: SystemTime::now(), + })) + .ok(); + Ok(()) + }), + }); + state.wal_stream_candidates = HashMap::from([ + ( + connected_sk_id, + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() * 2)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(1), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(current_lsn)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("not advanced Lsn".to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(2), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() / 2)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("not enough advanced Lsn".to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ]); + + let no_candidate = state.next_connection_candidate(); + assert!( + no_candidate.is_none(), + "Expected no candidate selected out of valid options since candidate Lsn data is ignored and others' was not advanced enough, but got {no_candidate:?}" + ); + + Ok(()) + } + + #[test] + fn no_connection_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("no_connection_candidate")?; + let mut state = dummy_state(&harness); + let now = Utc::now().naive_utc(); + + state.wal_connection = None; + state.wal_stream_candidates = HashMap::from([( + NodeId(0), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + )]); + + let only_candidate = state + .next_connection_candidate() + .expect("Expected one candidate selected out of the only data option, but got none"); + assert_eq!(only_candidate.safekeeper_id, NodeId(0)); + assert_eq!( + only_candidate.reason, + 
ReconnectReason::NoExistingConnection, + "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold" + ); + assert!(only_candidate + .wal_producer_connstr + .contains(DUMMY_SAFEKEEPER_CONNSTR)); + + let selected_lsn = 100_000; + state.wal_stream_candidates = HashMap::from([ + ( + NodeId(0), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(selected_lsn - 100)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("smaller commit_lsn".to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(1), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(selected_lsn)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(2), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(selected_lsn + 100)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: None, + }, + etcd_version: 0, + latest_update: now, + }, + ), + ]); + let biggest_wal_candidate = state.next_connection_candidate().expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(biggest_wal_candidate.safekeeper_id, NodeId(1)); + assert_eq!( + biggest_wal_candidate.reason, + ReconnectReason::NoExistingConnection, + "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold" + ); + assert!(biggest_wal_candidate + .wal_producer_connstr + .contains(DUMMY_SAFEKEEPER_CONNSTR)); + + Ok(()) + } + + #[tokio::test] + async fn connection_no_etcd_data_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("connection_no_etcd_data_candidate")?; + let mut state = dummy_state(&harness); + + let now = Utc::now().naive_utc(); + let current_lsn = Lsn(100_000).align(); + let connected_sk_id = NodeId(0); + let other_sk_id = NodeId(connected_sk_id.0 + 1); + + state.wal_connection = Some(WalConnection { + sk_id: connected_sk_id, + latest_connection_update: now, + connection_task: TaskHandle::spawn(move |sender, _| async move { + sender + .send(TaskEvent::NewEvent(ReplicationFeedback { + current_timeline_size: 1, + ps_writelsn: current_lsn.0, + ps_applylsn: 1, + ps_flushlsn: 1, + ps_replytime: SystemTime::now(), + })) + .ok(); + Ok(()) + }), + }); + state.wal_stream_candidates = HashMap::from([( + other_sk_id, + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + )]); + + let only_candidate = state + .next_connection_candidate() + .expect("Expected one candidate selected out of the only data option, but got none"); + assert_eq!(only_candidate.safekeeper_id, other_sk_id); + assert_eq!( + only_candidate.reason, + ReconnectReason::NoEtcdDataForExistingConnection, + "Should select new safekeeper due to missing etcd data, even if there's an existing connection with this safekeeper" + ); + assert!(only_candidate + .wal_producer_connstr + 
.contains(DUMMY_SAFEKEEPER_CONNSTR)); + + Ok(()) + } + + #[tokio::test] + async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let mut state = dummy_state(&harness); + let current_lsn = Lsn(100_000).align(); + let now = Utc::now().naive_utc(); + + let connected_sk_id = NodeId(0); + let new_lsn = Lsn(current_lsn.0 + state.max_lsn_wal_lag.get() + 1); + + state.wal_connection = Some(WalConnection { + sk_id: connected_sk_id, + latest_connection_update: now, + connection_task: TaskHandle::spawn(move |sender, _| async move { + sender + .send(TaskEvent::NewEvent(ReplicationFeedback { + current_timeline_size: 1, + ps_writelsn: current_lsn.0, + ps_applylsn: 1, + ps_flushlsn: 1, + ps_replytime: SystemTime::now(), + })) + .ok(); + Ok(()) + }), + }); + state.wal_stream_candidates = HashMap::from([ + ( + connected_sk_id, + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(current_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(1), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(new_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ]); + + let over_threshcurrent_candidate = state.next_connection_candidate().expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(1)); + assert_eq!( + over_threshcurrent_candidate.reason, + ReconnectReason::LaggingWal { + current_lsn, + new_lsn, + threshold: state.max_lsn_wal_lag + }, + "Should select bigger WAL safekeeper if it starts to lag enough" + ); + assert!(over_threshcurrent_candidate + .wal_producer_connstr + .contains("advanced by Lsn safekeeper")); + + Ok(()) + } + + #[tokio::test] + async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("timeout_wal_over_threshhold_current_candidate")?; + let mut state = dummy_state(&harness); + let current_lsn = Lsn(100_000).align(); + let now = Utc::now().naive_utc(); + + let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?; + let time_over_threshold = + Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout; + + state.wal_connection = Some(WalConnection { + sk_id: NodeId(1), + latest_connection_update: time_over_threshold, + connection_task: TaskHandle::spawn(move |sender, _| async move { + sender + .send(TaskEvent::NewEvent(ReplicationFeedback { + current_timeline_size: 1, + ps_writelsn: current_lsn.0, + ps_applylsn: 1, + ps_flushlsn: 1, + ps_replytime: SystemTime::now(), + })) + .ok(); + Ok(()) + }), + }); + state.wal_stream_candidates = HashMap::from([( + NodeId(0), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(current_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + )]); + + let over_threshcurrent_candidate = state.next_connection_candidate().expect( + 
"Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0)); + match over_threshcurrent_candidate.reason { + ReconnectReason::NoWalTimeout { + last_wal_interaction, + threshold, + .. + } => { + assert_eq!(last_wal_interaction, Some(time_over_threshold)); + assert_eq!(threshold, state.lagging_wal_timeout); + } + unexpected => panic!("Unexpected reason: {unexpected:?}"), + } + assert!(over_threshcurrent_candidate + .wal_producer_connstr + .contains(DUMMY_SAFEKEEPER_CONNSTR)); + + Ok(()) + } + + #[tokio::test] + async fn timeout_connection_over_threshhold_current_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("timeout_connection_over_threshhold_current_candidate")?; + let mut state = dummy_state(&harness); + let current_lsn = Lsn(100_000).align(); + let now = Utc::now().naive_utc(); + + let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?; + let time_over_threshold = + Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout; + + state.wal_connection = Some(WalConnection { + sk_id: NodeId(1), + latest_connection_update: time_over_threshold, + connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), + }); + state.wal_stream_candidates = HashMap::from([( + NodeId(0), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(current_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + )]); + + let over_threshcurrent_candidate = state.next_connection_candidate().expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0)); + match over_threshcurrent_candidate.reason { + ReconnectReason::NoWalTimeout { + last_wal_interaction, + threshold, + .. + } => { + assert_eq!(last_wal_interaction, Some(time_over_threshold)); + assert_eq!(threshold, state.lagging_wal_timeout); + } + unexpected => panic!("Unexpected reason: {unexpected:?}"), + } + assert!(over_threshcurrent_candidate + .wal_producer_connstr + .contains(DUMMY_SAFEKEEPER_CONNSTR)); + + Ok(()) + } + + const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr"; + + fn dummy_state(harness: &RepoHarness) -> WalreceiverState { + WalreceiverState { + id: ZTenantTimelineId { + tenant_id: harness.tenant_id, + timeline_id: TIMELINE_ID, + }, + local_timeline: Arc::new(DatadirTimelineImpl::new( + harness + .load() + .create_empty_timeline(TIMELINE_ID, Lsn(0)) + .expect("Failed to create an empty timeline for dummy wal connection manager"), + 10_000, + )), + wal_connect_timeout: Duration::from_secs(1), + lagging_wal_timeout: Duration::from_secs(1), + max_lsn_wal_lag: NonZeroU64::new(1).unwrap(), + wal_connection: None, + wal_stream_candidates: HashMap::new(), + wal_connection_attempts: HashMap::new(), + } + } +} diff --git a/pageserver/src/walreceiver/connection_handler.rs b/pageserver/src/walreceiver/walreceiver_connection.rs similarity index 78% rename from pageserver/src/walreceiver/connection_handler.rs rename to pageserver/src/walreceiver/walreceiver_connection.rs index 97b9b8cc9b..98b36dfe48 100644 --- a/pageserver/src/walreceiver/connection_handler.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -1,5 +1,5 @@ //! 
Actual Postgres connection handler to stream WAL to the server. -//! Runs as a separate, cancellable Tokio task. + use std::{ str::FromStr, sync::Arc, @@ -10,113 +10,29 @@ use anyhow::{bail, ensure, Context}; use bytes::BytesMut; use fail::fail_point; use postgres::{SimpleQueryMessage, SimpleQueryRow}; -use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_stream::StreamExt; use tracing::{debug, error, info, info_span, trace, warn, Instrument}; -use utils::{ - lsn::Lsn, - pq_proto::ReplicationFeedback, - zid::{NodeId, ZTenantTimelineId}, -}; +use super::TaskEvent; use crate::{ http::models::WalReceiverEntry, repository::{Repository, Timeline}, tenant_mgr, walingest::WalIngest, }; +use postgres_ffi::waldecoder::WalStreamDecoder; +use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; -#[derive(Debug, Clone)] -pub enum WalConnectionEvent { - Started, - NewWal(ReplicationFeedback), - End(Result<(), String>), -} - -/// A wrapper around standalone Tokio task, to poll its updates or cancel the task. -#[derive(Debug)] -pub struct WalReceiverConnection { - handle: tokio::task::JoinHandle<()>, - cancellation: watch::Sender<()>, - events_receiver: watch::Receiver, -} - -impl WalReceiverConnection { - /// Initializes the connection task, returning a set of handles on top of it. - /// The task is started immediately after the creation, fails if no connection is established during the timeout given. - pub fn open( - id: ZTenantTimelineId, - safekeeper_id: NodeId, - wal_producer_connstr: String, - connect_timeout: Duration, - ) -> Self { - let (cancellation, mut cancellation_receiver) = watch::channel(()); - let (events_sender, events_receiver) = watch::channel(WalConnectionEvent::Started); - - let handle = tokio::spawn( - async move { - let connection_result = handle_walreceiver_connection( - id, - &wal_producer_connstr, - &events_sender, - &mut cancellation_receiver, - connect_timeout, - ) - .await - .map_err(|e| { - format!("Walreceiver connection for id {id} failed with error: {e:#}") - }); - - match &connection_result { - Ok(()) => { - debug!("Walreceiver connection for id {id} ended successfully") - } - Err(e) => warn!("{e}"), - } - events_sender - .send(WalConnectionEvent::End(connection_result)) - .ok(); - } - .instrument(info_span!("safekeeper_handle", sk = %safekeeper_id)), - ); - - Self { - handle, - cancellation, - events_receiver, - } - } - - /// Polls for the next WAL receiver event, if there's any available since the last check. - /// Blocks if there's no new event available, returns `None` if no new events will ever occur. - /// Only the last event is returned, all events received between observatins are lost. - pub async fn next_event(&mut self) -> Option { - match self.events_receiver.changed().await { - Ok(()) => Some(self.events_receiver.borrow().clone()), - Err(_cancellation_error) => None, - } - } - - /// Gracefully aborts current WAL streaming task, waiting for the current WAL streamed. - pub async fn shutdown(&mut self) -> anyhow::Result<()> { - self.cancellation.send(()).ok(); - let handle = &mut self.handle; - handle - .await - .context("Failed to join on a walreceiver connection task")?; - Ok(()) - } -} - -async fn handle_walreceiver_connection( +/// Opens a conneciton to the given wal producer and streams the WAL, sending progress messages during streaming. 
+pub async fn handle_walreceiver_connection( id: ZTenantTimelineId, wal_producer_connstr: &str, - events_sender: &watch::Sender, - cancellation: &mut watch::Receiver<()>, + events_sender: &watch::Sender>, + mut cancellation: watch::Receiver<()>, connect_timeout: Duration, ) -> anyhow::Result<()> { // Connect to the database in replication mode. @@ -214,8 +130,6 @@ async fn handle_walreceiver_connection( while let Some(replication_message) = { select! { - // check for shutdown first - biased; _ = cancellation.changed() => { info!("walreceiver interrupted"); None @@ -344,7 +258,7 @@ async fn handle_walreceiver_connection( .as_mut() .zenith_status_update(data.len() as u64, &data) .await?; - if let Err(e) = events_sender.send(WalConnectionEvent::NewWal(zenith_status_update)) { + if let Err(e) = events_sender.send(TaskEvent::NewEvent(zenith_status_update)) { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 169b106aa9..d3f6fb8903 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -221,15 +221,12 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { .await .context("failed to subscribe for safekeeper info")?; loop { - match subscription.fetch_data().await { + match subscription.value_updates.recv().await { Some(new_info) => { - for (zttid, sk_info) in new_info { - // note: there are blocking operations below, but it's considered fine for now - if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { - for (safekeeper_id, info) in sk_info { - tli.record_safekeeper_info(&info, safekeeper_id).await? - } - } + // note: there are blocking operations below, but it's considered fine for now + if let Ok(tli) = GlobalTimelines::get(&conf, new_info.key.id, false) { + tli.record_safekeeper_info(&new_info.value, new_info.key.node_id) + .await? } } None => { From 3c2b03cd8781cdc51d2ad312250eb400d994b56b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 23 Jun 2022 12:28:12 +0300 Subject: [PATCH 0436/1022] Update timeline size on dropdb. Add the test (#1973) In addition, fix database size calculation: count not only main fork of the relation, but also vm and fsm. 
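Neon defines a database's size as the total of the main, vm and fsm forks of its non-shared relations, and the updated test compares that total against pg_database_size(). A small helper in the style of the test suite (the psycopg2-style `connect` factory is an assumption, not part of this patch):

    from contextlib import closing

    # Sum the main, vm and fsm forks of all non-shared relations; the result
    # should match pg_database_size() for the same database.
    def relation_forks_size(connect, dbname):
        with closing(connect(dbname=dbname)) as conn:
            with conn.cursor() as cur:
                cur.execute("""
                    SELECT COALESCE(sum(pg_relation_size(oid, 'main')), 0)
                         + COALESCE(sum(pg_relation_size(oid, 'vm')), 0)
                         + COALESCE(sum(pg_relation_size(oid, 'fsm')), 0)
                    FROM pg_class
                    WHERE relisshared IS FALSE
                """)
                return int(cur.fetchone()[0])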
--- pageserver/src/page_service.rs | 13 ++--- pageserver/src/pgdatadir_mapping.rs | 20 +++++++- test_runner/batch_others/test_createdropdb.py | 11 +++-- .../batch_others/test_timeline_size.py | 47 ++++++++++++++++++- 4 files changed, 75 insertions(+), 16 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 079f477f75..77c320a181 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -733,17 +733,10 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let all_rels = timeline.list_rels(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?; - let mut total_blocks: i64 = 0; + let total_blocks = + timeline.get_db_size(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?; - for rel in all_rels { - if rel.forknum == 0 { - let n_blocks = timeline.get_rel_size(rel, lsn).unwrap_or(0); - total_blocks += n_blocks as i64; - } - } - - let db_size = total_blocks * pg_constants::BLCKSZ as i64; + let db_size = total_blocks as i64 * pg_constants::BLCKSZ as i64; Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { db_size, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 59a53d68a1..ce305a55f4 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -123,6 +123,19 @@ impl DatadirTimeline { self.tline.get(key, lsn) } + // Get size of a database in blocks + pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + let mut total_blocks = 0; + + let rels = self.list_rels(spcnode, dbnode, lsn)?; + + for rel in rels { + let n_blocks = self.get_rel_size(rel, lsn)?; + total_blocks += n_blocks as usize; + } + Ok(total_blocks) + } + /// Get size of a relation file pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); @@ -667,6 +680,10 @@ impl<'a, R: Repository> DatadirModification<'a, R> { } pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> { + let req_lsn = self.tline.get_last_record_lsn(); + + let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn)?; + // Remove entry from dbdir let buf = self.get(DBDIR_KEY)?; let mut dir = DbDirectory::des(&buf)?; @@ -680,7 +697,8 @@ impl<'a, R: Repository> DatadirModification<'a, R> { ); } - // FIXME: update pending_nblocks + // Update logical database size. 
+ self.pending_nblocks -= total_blocks as isize; // Delete all relations and metadata files for the spcnode/dnode self.delete(dbdir_key_range(spcnode, dbnode)); diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/batch_others/test_createdropdb.py index 392e5a6fd4..151ce997ee 100644 --- a/test_runner/batch_others/test_createdropdb.py +++ b/test_runner/batch_others/test_createdropdb.py @@ -35,9 +35,14 @@ def test_createdb(neon_simple_env: NeonEnv): with closing(db.connect(dbname='foodb')) as conn: with conn.cursor() as cur: # Check database size in both branches - cur.execute( - 'select pg_size_pretty(pg_database_size(%s)), pg_size_pretty(sum(pg_relation_size(oid))) from pg_class where relisshared is false;', - ('foodb', )) + cur.execute(""" + select pg_size_pretty(pg_database_size('foodb')), + pg_size_pretty( + sum(pg_relation_size(oid, 'main')) + +sum(pg_relation_size(oid, 'vm')) + +sum(pg_relation_size(oid, 'fsm')) + ) FROM pg_class where relisshared is false + """) res = cur.fetchone() # check that dbsize equals sum of all relation sizes, excluding shared ones # This is how we define dbsize in neon for now diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 70dbe8ac4a..5734091757 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -8,7 +8,6 @@ import time def test_timeline_size(neon_simple_env: NeonEnv): env = neon_simple_env - # Branch at the point where only 100 rows were inserted new_timeline_id = env.neon_cli.create_branch('test_timeline_size', 'empty') client = env.pageserver.http_client() @@ -23,7 +22,6 @@ def test_timeline_size(neon_simple_env: NeonEnv): with conn.cursor() as cur: cur.execute("SHOW neon.timeline_id") - # Create table, and insert the first 100 rows cur.execute("CREATE TABLE foo (t text)") cur.execute(""" INSERT INTO foo @@ -43,6 +41,51 @@ def test_timeline_size(neon_simple_env: NeonEnv): "current_logical_size_non_incremental"] +def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): + env = neon_simple_env + new_timeline_id = env.neon_cli.create_branch('test_timeline_size', 'empty') + + client = env.pageserver.http_client() + timeline_details = assert_local(client, env.initial_tenant, new_timeline_id) + assert timeline_details['local']['current_logical_size'] == timeline_details['local'][ + 'current_logical_size_non_incremental'] + + pgmain = env.postgres.create_start("test_timeline_size") + log.info("postgres is running on 'test_timeline_size' branch") + + with closing(pgmain.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SHOW neon.timeline_id") + + res = assert_local(client, env.initial_tenant, new_timeline_id) + local_details = res['local'] + assert local_details["current_logical_size"] == local_details[ + "current_logical_size_non_incremental"] + + cur.execute('CREATE DATABASE foodb') + with closing(pgmain.connect(dbname='foodb')) as conn: + with conn.cursor() as cur2: + + cur2.execute("CREATE TABLE foo (t text)") + cur2.execute(""" + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 10) g + """) + + res = assert_local(client, env.initial_tenant, new_timeline_id) + local_details = res['local'] + assert local_details["current_logical_size"] == local_details[ + "current_logical_size_non_incremental"] + + cur.execute('DROP DATABASE foodb') + + res = assert_local(client, env.initial_tenant, new_timeline_id) + local_details = res['local'] + assert 
local_details["current_logical_size"] == local_details[ + "current_logical_size_non_incremental"] + + # wait until received_lsn_lag is 0 def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60): started_at = time.time() From 6d7dc384a5501cdcc7b159f2c7f31d16a35b2f7b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 23 Jun 2022 14:27:33 +0300 Subject: [PATCH 0437/1022] Add zenith-us-stage-ps-3 to deploy --- .circleci/ansible/staging.hosts | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index 4273b885e1..29e4efbb19 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -1,6 +1,7 @@ [pageservers] #zenith-us-stage-ps-1 console_region_id=27 zenith-us-stage-ps-2 console_region_id=27 +zenith-us-stage-ps-3 console_region_id=27 [safekeepers] zenith-us-stage-sk-4 console_region_id=27 From 93e050afe38d719cb2bd5a841cda800b1609d01a Mon Sep 17 00:00:00 2001 From: Bojan Serafimov Date: Wed, 22 Jun 2022 18:31:24 -0400 Subject: [PATCH 0438/1022] Don't require project name for link auth --- proxy/src/auth/backend/console.rs | 15 ++++++++++++--- proxy/src/auth/credentials.rs | 6 +++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index 93462086ea..098233c648 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -92,9 +92,14 @@ impl<'a> Api<'a> { async fn get_auth_info(&self) -> Result { let mut url = self.endpoint.clone(); + let project_name = self + .creds + .project_name + .as_ref() + .map_err(|e| ConsoleAuthError::BadProjectName(e.clone()))?; url.path_segments_mut().push("proxy_get_role_secret"); url.query_pairs_mut() - .append_pair("project", &self.creds.project_name) + .append_pair("project", project_name) .append_pair("role", &self.creds.user); // TODO: use a proper logger @@ -116,9 +121,13 @@ impl<'a> Api<'a> { /// Wake up the compute node and return the corresponding connection info. 
async fn wake_compute(&self) -> Result { let mut url = self.endpoint.clone(); + let project_name = self + .creds + .project_name + .as_ref() + .map_err(|e| ConsoleAuthError::BadProjectName(e.clone()))?; url.path_segments_mut().push("proxy_wake_compute"); - url.query_pairs_mut() - .append_pair("project", &self.creds.project_name); + url.query_pairs_mut().append_pair("project", project_name); // TODO: use a proper logger println!("cplane request: {url}"); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 48dc8542ec..b5312fbe1f 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -8,7 +8,7 @@ use std::collections::HashMap; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -#[derive(Debug, Error, PartialEq)] +#[derive(Debug, Error, PartialEq, Eq, Clone)] pub enum ClientCredsParseError { #[error("Parameter `{0}` is missing in startup packet.")] MissingKey(&'static str), @@ -44,7 +44,7 @@ impl UserFacingError for ClientCredsParseError {} pub struct ClientCredentials { pub user: String, pub dbname: String, - pub project_name: String, + pub project_name: Result, } impl ClientCredentials { @@ -67,7 +67,7 @@ impl ClientCredentials { let user = get_param("user")?; let dbname = get_param("database")?; let project_name = get_param("project").ok(); - let project_name = get_project_name(sni_data, common_name, project_name.as_deref())?; + let project_name = get_project_name(sni_data, common_name, project_name.as_deref()); Ok(Self { user, From 84b9fcbbd59d7cfaa4181f2f1b6869d248070be0 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 23 Jun 2022 11:51:56 -0400 Subject: [PATCH 0439/1022] Increase a few test timeouts (#1977) --- test_runner/performance/test_wal_backpressure.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 873d1132a7..862b5e1c5e 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -80,6 +80,7 @@ def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_it thread.join() +@pytest.mark.timeout(1000) @pytest.mark.parametrize("n_tables", [5]) @pytest.mark.parametrize("scale", get_scales_matrix(5)) @pytest.mark.parametrize("num_iters", [10]) @@ -121,6 +122,7 @@ def start_pgbench_simple_update_workload(env: PgCompare, duration: int): env.flush() +@pytest.mark.timeout(1000) @pytest.mark.parametrize("scale", get_scales_matrix(100)) @pytest.mark.parametrize("duration", get_durations_matrix()) def test_pgbench_simple_update_workload(pg_compare: PgCompare, scale: int, duration: int): @@ -158,6 +160,7 @@ def start_pgbench_intensive_initialization(env: PgCompare, scale: int): ]) +@pytest.mark.timeout(1000) @pytest.mark.parametrize("scale", get_scales_matrix(1000)) def test_pgbench_intensive_init_workload(pg_compare: PgCompare, scale: int): env = pg_compare From 3cc531d09349ae7fa9700083574036dab0f49d45 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 20 Jun 2022 22:28:40 +0200 Subject: [PATCH 0440/1022] Fix CREATE EXTENSION for non-db-owner users (#1408) Previously, we were granting create only to db owner, but now we have a dedicated 'web_access' role to connect via web UI and proxy link auth. We anyway grant read / write all data to all roles, so let's grant create to everyone too. This creates some provelege objects in each db, which we need to drop before deleting the role. 
So now we reassign all owned objects to each db owner before deletion. This also fixes deletion of roles that created some data in any db previously. Will be tested by https://github.com/neondatabase/cloud/pull/1673 Later we should stop messing with Postgres ACL that much. --- Cargo.lock | 7 +++ compute_tools/Cargo.toml | 1 + compute_tools/src/compute.rs | 1 + compute_tools/src/spec.rs | 94 +++++++++++++++++++++++++++++++----- 4 files changed, 92 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f4d3743676..ef1b7327c5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -461,6 +461,7 @@ dependencies = [ "tar", "tokio", "tokio-postgres", + "urlencoding", "workspace_hack", ] @@ -3684,6 +3685,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "urlencoding" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68b90931029ab9b034b300b797048cf23723400aa757e8a2bfb9d748102f9821" + [[package]] name = "utils" version = "0.1.0" diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 42db763961..a47f9998e6 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -18,4 +18,5 @@ serde_json = "1" tar = "0.4" tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +urlencoding = "2.1.0" workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a2e6874a28..abf7081cb7 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -289,6 +289,7 @@ impl ComputeNode { handle_roles(&self.spec, &mut client)?; handle_databases(&self.spec, &mut client)?; + handle_role_deletions(self, &mut client)?; handle_grants(&self.spec, &mut client)?; create_writablity_check_data(&mut client)?; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index e88df56a65..d2cfb6d726 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -2,9 +2,11 @@ use std::path::Path; use anyhow::Result; use log::{info, log_enabled, warn, Level}; -use postgres::Client; +use postgres::{Client, NoTls}; use serde::Deserialize; +use urlencoding::encode; +use crate::compute::ComputeNode; use crate::config; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -97,18 +99,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { // Process delta operations first if let Some(ops) = &spec.delta_operations { - info!("processing delta operations on roles"); + info!("processing role renames"); for op in ops { match op.action.as_ref() { - // We do not check either role exists or not, - // Postgres will take care of it for us "delete_role" => { - let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote()); - - warn!("deleting role '{}'", &op.name); - xact.execute(query.as_str(), &[])?; + // no-op now, roles will be deleted at the end of configuration } - // Renaming role drops its password, since tole name is + // Renaming role drops its password, since role name is // used as a salt there. It is important that this role // is recorded with a new `name` in the `roles` list. // Follow up roles update will set the new password. 
@@ -182,7 +179,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { xact.execute(query.as_str(), &[])?; let grant_query = format!( - "grant pg_read_all_data, pg_write_all_data to {}", + "GRANT pg_read_all_data, pg_write_all_data TO {}", name.quote() ); xact.execute(grant_query.as_str(), &[])?; @@ -197,6 +194,68 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { Ok(()) } +/// Reassign all dependent objects and delete requested roles. +pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> { + let spec = &node.spec; + + // First, reassign all dependent objects to db owners. + if let Some(ops) = &spec.delta_operations { + info!("reassigning dependent objects of to-be-deleted roles"); + for op in ops { + if op.action == "delete_role" { + reassign_owned_objects(node, &op.name)?; + } + } + } + + // Second, proceed with role deletions. + let mut xact = client.transaction()?; + if let Some(ops) = &spec.delta_operations { + info!("processing role deletions"); + for op in ops { + // We do not check either role exists or not, + // Postgres will take care of it for us + if op.action == "delete_role" { + let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote()); + + warn!("deleting role '{}'", &op.name); + xact.execute(query.as_str(), &[])?; + } + } + } + + Ok(()) +} + +// Reassign all owned objects in all databases to the owner of the database. +fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> { + for db in &node.spec.cluster.databases { + if db.owner != *role_name { + let db_name_encoded = format!("/{}", encode(&db.name)); + let db_connstr = node.connstr.replacen("/postgres", &db_name_encoded, 1); + let mut client = Client::connect(&db_connstr, NoTls)?; + + // This will reassign all dependent objects to the db owner + let reassign_query = format!( + "REASSIGN OWNED BY {} TO {}", + role_name.quote(), + db.owner.quote() + ); + info!( + "reassigning objects owned by '{}' in db '{}' to '{}'", + role_name, &db.name, &db.owner + ); + client.simple_query(&reassign_query)?; + + // This now will only drop privileges of the role + let drop_query = format!("DROP OWNED BY {}", role_name.quote()); + client.simple_query(&drop_query)?; + } + } + + Ok(()) +} + /// It follows mostly the same logic as `handle_roles()` excepting that we /// does not use an explicit transactions block, since major database operations /// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level @@ -294,13 +353,26 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> { info!("cluster spec grants:"); + // We now have a separate `web_access` role to connect to the database + // via the web interface and proxy link auth. And also we grant a + // read / write all data privilege to every role. So also grant + // create to everyone. + // XXX: later we should stop messing with Postgres ACL in such horrible + // ways. 
+ let roles = spec + .cluster + .roles + .iter() + .map(|r| r.name.quote()) + .collect::>(); + for db in &spec.cluster.databases { let dbname = &db.name; let query: String = format!( "GRANT CREATE ON DATABASE {} TO {}", dbname.quote(), - db.owner.quote() + roles.join(", ") ); info!("grant query {}", &query); From 392cd8b1fc6af7921cb40d06a632748fefcb4aa9 Mon Sep 17 00:00:00 2001 From: KlimentSerafimov Date: Fri, 24 Jun 2022 11:57:33 +0200 Subject: [PATCH 0441/1022] Refactored extracting project_name in console.rs. (#1982) --- proxy/src/auth/backend/console.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index 098233c648..3085f0b0e4 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -49,6 +49,12 @@ impl UserFacingError for ConsoleAuthError { } } +impl From<&auth::credentials::ClientCredsParseError> for ConsoleAuthError { + fn from(e: &auth::credentials::ClientCredsParseError) -> Self { + ConsoleAuthError::BadProjectName(e.clone()) + } +} + // TODO: convert into an enum with "error" #[derive(Serialize, Deserialize, Debug)] struct GetRoleSecretResponse { @@ -92,14 +98,9 @@ impl<'a> Api<'a> { async fn get_auth_info(&self) -> Result { let mut url = self.endpoint.clone(); - let project_name = self - .creds - .project_name - .as_ref() - .map_err(|e| ConsoleAuthError::BadProjectName(e.clone()))?; url.path_segments_mut().push("proxy_get_role_secret"); url.query_pairs_mut() - .append_pair("project", project_name) + .append_pair("project", self.creds.project_name.as_ref()?) .append_pair("role", &self.creds.user); // TODO: use a proper logger @@ -121,12 +122,8 @@ impl<'a> Api<'a> { /// Wake up the compute node and return the corresponding connection info. 
async fn wake_compute(&self) -> Result { let mut url = self.endpoint.clone(); - let project_name = self - .creds - .project_name - .as_ref() - .map_err(|e| ConsoleAuthError::BadProjectName(e.clone()))?; url.path_segments_mut().push("proxy_wake_compute"); + let project_name = self.creds.project_name.as_ref()?; url.query_pairs_mut().append_pair("project", project_name); // TODO: use a proper logger From 55192384c36bb9e2424418bb265ddbef2b3cb0ff Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 24 Jun 2022 13:59:37 +0300 Subject: [PATCH 0442/1022] Fix zero timeline_start_lsn (#1981) * Fix zero timeline_start_lsn * Log more info on control file upgrade * Fix formatting Co-authored-by: Anastasia Lubennikova --- safekeeper/src/control_file_upgrade.rs | 13 +++++++++++++ safekeeper/src/safekeeper.rs | 2 +- safekeeper/src/timeline.rs | 2 ++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index e1740cdcbf..5e749796dd 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -239,6 +239,19 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), }); + } else if version == 5 { + info!("reading safekeeper control file version {}", version); + let mut oldstate = SafeKeeperState::des(&buf[..buf.len()])?; + if oldstate.timeline_start_lsn != Lsn(0) { + return Ok(oldstate); + } + + // set special timeline_start_lsn because we don't know the real one + info!("setting timeline_start_lsn and local_start_lsn to Lsn(1)"); + oldstate.timeline_start_lsn = Lsn(1); + oldstate.local_start_lsn = Lsn(1); + + return Ok(oldstate); } bail!("unsupported safekeeper control file version {}", version) } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 7986fa5834..331baffbca 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -28,7 +28,7 @@ use utils::{ }; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 5; +pub const SK_FORMAT_VERSION: u32 = 6; const SK_PROTOCOL_VERSION: u32 = 2; const UNKNOWN_SERVER_VERSION: u32 = 0; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 2e415a53d0..12cac831f4 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -625,6 +625,8 @@ impl GlobalTimelines { zttid: ZTenantTimelineId, create: bool, ) -> Result> { + let _enter = info_span!("", timeline = %zttid.tenant_id).entered(); + let mut state = TIMELINES_STATE.lock().unwrap(); match state.timelines.get(&zttid) { From 26bca6ddbad38aaa72df25c52464084d787e2278 Mon Sep 17 00:00:00 2001 From: Johan Eliasson Date: Sun, 26 Jun 2022 20:54:07 +0200 Subject: [PATCH 0443/1022] Add `openssl` to OSX dependencies (#1994) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f63c21459e..6a4fc5ce1b 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 1. Install XCode and dependencies ``` xcode-select --install -brew install protobuf etcd +brew install protobuf etcd openssl ``` 2. 
[Install Rust](https://www.rust-lang.org/tools/install) From eb8926083ee7949a027e72281a6ebe71b1343a4b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 27 Jun 2022 13:12:58 +0300 Subject: [PATCH 0444/1022] Use the updated base build Docker image (#1972) --- Dockerfile | 10 +++++----- Dockerfile.compute-tools | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index 62e0de7e15..34f5282c2c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Build Postgres -FROM zimg/rust:1.58 AS pg-build +FROM neondatabase/rust:1.58 AS pg-build WORKDIR /pg USER root @@ -14,7 +14,7 @@ RUN set -e \ && tar -C tmp_install -czf /postgres_install.tar.gz . # Build zenith binaries -FROM zimg/rust:1.58 AS build +FROM neondatabase/rust:1.58 AS build ARG GIT_VERSION=local ARG CACHEPOT_BUCKET=zenith-rust-cachepot @@ -46,9 +46,9 @@ RUN set -e \ && useradd -d /data zenith \ && chown -R zenith:zenith /data -COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/runner/project/target/release/pageserver /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/runner/project/target/release/safekeeper /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/runner/project/target/release/proxy /usr/local/bin COPY --from=pg-build /pg/tmp_install/ /usr/local/ COPY --from=pg-build /postgres_install.tar.gz /data/ diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index f0c9b9d56a..1e7e20eae0 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,6 +1,6 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .circle/config.yml -FROM zimg/rust:1.58 AS rust-build +FROM neondatabase/rust:1.58 AS rust-build ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID @@ -15,4 +15,4 @@ RUN set -e \ # Final image that only has one binary FROM debian:buster-slim -COPY --from=rust-build /home/circleci/project/target/release/compute_ctl /usr/local/bin/compute_ctl +COPY --from=rust-build /home/runner/project/target/release/compute_ctl /usr/local/bin/compute_ctl From 137291dc2475eefa94be5a17ebe089f1e6e5b2f6 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sat, 25 Jun 2022 15:57:28 +0400 Subject: [PATCH 0445/1022] Push to etcd from safekeeper many timelines concurrently. Mitigates latency fee, making push throughput 1-1.5 order of magnitude bigger. Also make leases per timeline, not per whole safekeeper, avoiding storing garbage in etcd for deleted timelines while safekeeper is alive. 
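Concretely, the push loop now spawns one task per active timeline and joins them afterwards, rather than awaiting each etcd PUT in turn. A minimal sketch of that pattern follows (assuming the tokio and anyhow crates; `BrokerClient`, `TimelineId` and `push_all` are illustrative stand-ins, not the safekeeper's real types):

    // Sketch only: push per-timeline info concurrently instead of sequentially,
    // so one slow round-trip does not serialize all the others.
    use std::collections::HashMap;

    #[derive(Clone)]
    struct BrokerClient;

    impl BrokerClient {
        async fn put(&self, _key: String, _value: String) -> anyhow::Result<()> {
            // In the real system this would be an etcd PUT bound to a per-timeline lease.
            Ok(())
        }
    }

    type TimelineId = u64;

    async fn push_all(
        client: BrokerClient,
        infos: HashMap<TimelineId, String>,
    ) -> anyhow::Result<()> {
        // Spawn one task per timeline so the pushes overlap in flight.
        let handles: Vec<_> = infos
            .into_iter()
            .map(|(id, info)| {
                let client = client.clone();
                tokio::spawn(async move { client.put(format!("timeline/{id}"), info).await })
            })
            .collect();

        // Wait for all pushes; surface the first error, if any.
        for handle in handles {
            handle.await??;
        }
        Ok(())
    }

    #[tokio::main]
    async fn main() -> anyhow::Result<()> {
        let infos = HashMap::from([(1, "info-1".to_string()), (2, "info-2".to_string())]);
        push_all(BrokerClient, infos).await
    }

The actual implementation below additionally binds each PUT to a per-timeline lease and refreshes that lease from the same spawned task, so broker entries for timelines that stop being pushed expire on their own.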
--- safekeeper/src/broker.rs | 113 +++++++++++++++++++++++++++---------- safekeeper/src/timeline.rs | 10 ++-- 2 files changed, 88 insertions(+), 35 deletions(-) diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index d3f6fb8903..8e0eb971f3 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -5,6 +5,11 @@ use anyhow::Context; use anyhow::Error; use anyhow::Result; use etcd_broker::subscription_value::SkTimelineInfo; +use etcd_broker::LeaseKeepAliveStream; +use etcd_broker::LeaseKeeper; + +use std::collections::hash_map::Entry; +use std::collections::HashMap; use std::time::Duration; use tokio::spawn; use tokio::task::JoinHandle; @@ -21,7 +26,7 @@ use utils::zid::{NodeId, ZTenantTimelineId}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; -const LEASE_TTL_SEC: i64 = 5; +const LEASE_TTL_SEC: i64 = 10; pub fn thread_main(conf: SafeKeeperConf) { let runtime = runtime::Builder::new_current_thread() @@ -154,13 +159,48 @@ pub fn get_candiate_name(system_id: NodeId) -> String { format!("id_{system_id}") } +async fn push_sk_info( + zttid: ZTenantTimelineId, + mut client: Client, + key: String, + sk_info: SkTimelineInfo, + mut lease: Lease, +) -> anyhow::Result<(ZTenantTimelineId, Lease)> { + let put_opts = PutOptions::new().with_lease(lease.id); + client + .put( + key.clone(), + serde_json::to_string(&sk_info)?, + Some(put_opts), + ) + .await + .with_context(|| format!("failed to push safekeeper info to {}", key))?; + + // revive the lease + lease + .keeper + .keep_alive() + .await + .context("failed to send LeaseKeepAliveRequest")?; + lease + .ka_stream + .message() + .await + .context("failed to receive LeaseKeepAliveResponse")?; + + Ok((zttid, lease)) +} + +struct Lease { + id: i64, + keeper: LeaseKeeper, + ka_stream: LeaseKeepAliveStream, +} + /// Push once in a while data about all active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { let mut client = Client::connect(&conf.broker_endpoints, None).await?; - - // Get and maintain lease to automatically delete obsolete data - let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; - let (mut keeper, mut ka_stream) = client.lease_keep_alive(lease.id()).await?; + let mut leases: HashMap = HashMap::new(); let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); loop { @@ -168,33 +208,46 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // is under plain mutex. That's ok, all this code is not performance // sensitive and there is no risk of deadlock as we don't await while // lock is held. - for zttid in GlobalTimelines::get_active_timelines() { - if let Some(tli) = GlobalTimelines::get_loaded(zttid) { - let sk_info = tli.get_public_info(&conf)?; - let put_opts = PutOptions::new().with_lease(lease.id()); - client - .put( - timeline_safekeeper_path( - conf.broker_etcd_prefix.clone(), - zttid, - conf.my_id, - ), - serde_json::to_string(&sk_info)?, - Some(put_opts), - ) - .await - .context("failed to push safekeeper info")?; + let active_tlis = GlobalTimelines::get_active_timelines(); + + // // Get and maintain (if not yet) per timeline lease to automatically delete obsolete data. 
+ for zttid in active_tlis.iter() { + if let Entry::Vacant(v) = leases.entry(*zttid) { + let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; + let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?; + v.insert(Lease { + id: lease.id(), + keeper, + ka_stream, + }); } } - // revive the lease - keeper - .keep_alive() - .await - .context("failed to send LeaseKeepAliveRequest")?; - ka_stream - .message() - .await - .context("failed to receive LeaseKeepAliveResponse")?; + leases.retain(|zttid, _| active_tlis.contains(zttid)); + + // Push data concurrently to not suffer from latency, with many timelines it can be slow. + let handles = active_tlis + .iter() + .filter_map(|zttid| GlobalTimelines::get_loaded(*zttid)) + .map(|tli| { + let sk_info = tli.get_public_info(&conf); + let key = timeline_safekeeper_path( + conf.broker_etcd_prefix.clone(), + tli.zttid, + conf.my_id, + ); + let lease = leases.remove(&tli.zttid).unwrap(); + tokio::spawn(push_sk_info(tli.zttid, client.clone(), key, sk_info, lease)) + }) + .collect::>(); + for h in handles { + let (zttid, lease) = h.await??; + // It is ugly to pull leases from hash and then put it back, but + // otherwise we have to resort to long living per tli tasks (which + // would generate a lot of errors when etcd is down) as task wants to + // have 'static objects, we can't borrow to it. + leases.insert(zttid, lease); + } + sleep(push_interval).await; } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 12cac831f4..bed6e447d7 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -11,7 +11,7 @@ use serde::Serialize; use tokio::sync::watch; use std::cmp::{max, min}; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fs::{self}; use std::sync::{Arc, Mutex, MutexGuard}; @@ -445,9 +445,9 @@ impl Timeline { } /// Prepare public safekeeper info for reporting. - pub fn get_public_info(&self, conf: &SafeKeeperConf) -> anyhow::Result { + pub fn get_public_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { let shared_state = self.mutex.lock().unwrap(); - Ok(SkTimelineInfo { + SkTimelineInfo { last_log_term: Some(shared_state.sk.get_epoch()), flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()), // note: this value is not flushed to control file yet and can be lost @@ -460,7 +460,7 @@ impl Timeline { peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn), safekeeper_connstr: Some(conf.listen_pg_addr.clone()), backup_lsn: Some(shared_state.sk.inmem.backup_lsn), - }) + } } /// Update timeline state with peer safekeeper data. @@ -669,7 +669,7 @@ impl GlobalTimelines { } /// Get ZTenantTimelineIDs of all active timelines. 
- pub fn get_active_timelines() -> Vec { + pub fn get_active_timelines() -> HashSet { let state = TIMELINES_STATE.lock().unwrap(); state .timelines From 8a714f1ebfea2d7f2dfa58cdcbbfc7b6930eab2f Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 27 Jun 2022 19:15:56 +0300 Subject: [PATCH 0446/1022] Add coverage to GH actions and rework part of them (#1987) --- .circleci/config.yml | 57 +---- .../actions/run-python-test-set/action.yml | 29 ++- .github/actions/save-coverage-data/action.yml | 17 ++ .github/workflows/build_and_test.yml | 207 +++++++++++++----- .../workflows/{testing.yml => codestyle.yml} | 45 +++- Dockerfile | 6 +- Dockerfile.compute-tools | 2 +- test_runner/README.md | 2 +- .../batch_others/test_remote_storage.py | 2 +- test_runner/fixtures/neon_fixtures.py | 4 +- 10 files changed, 246 insertions(+), 125 deletions(-) create mode 100644 .github/actions/save-coverage-data/action.yml rename .github/workflows/{testing.yml => codestyle.yml} (73%) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9aca415dc8..61f551cd03 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -286,7 +286,7 @@ jobs: # no_output_timeout, specified here. no_output_timeout: 10m environment: - - ZENITH_BIN: /tmp/zenith/bin + - NEON_BIN: /tmp/zenith/bin - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install - TEST_OUTPUT: /tmp/test_output # this variable will be embedded in perf test report @@ -688,50 +688,6 @@ jobs: helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait - # Trigger a new remote CI job - remote-ci-trigger: - docker: - - image: cimg/base:2021.04 - parameters: - remote_repo: - type: string - environment: - REMOTE_REPO: << parameters.remote_repo >> - steps: - - run: - name: Set PR's status to pending - command: | - LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME - - curl -f -X POST \ - https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \ - -H "Accept: application/vnd.github.v3+json" \ - --user "$CI_ACCESS_TOKEN" \ - --data \ - "{ - \"state\": \"pending\", - \"context\": \"neon-cloud-e2e\", - \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" - }" - - run: - name: Request a remote CI test - command: | - LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME - - curl -f -X POST \ - https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ - -H "Accept: application/vnd.github.v3+json" \ - --user "$CI_ACCESS_TOKEN" \ - --data \ - "{ - \"ref\": \"main\", - \"inputs\": { - \"ci_job_name\": \"neon-cloud-e2e\", - \"commit_hash\": \"$CIRCLE_SHA1\", - \"remote_repo\": \"$LOCAL_REPO\" - } - }" - workflows: build_and_test: jobs: @@ -880,14 +836,3 @@ workflows: - release requires: - docker-image-release - - remote-ci-trigger: - # Context passes credentials for gh api - context: CI_ACCESS_TOKEN - remote_repo: "neondatabase/cloud" - requires: - # XXX: Successful build doesn't mean everything is OK, but - # the job to be triggered takes so much time to complete (~22 min) - # that it's better not to wait for the commented-out steps - - build-neon-release - # - pg_regress-tests-release - # - other-tests-release diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 94fac2ee99..4831cdaed1 100644 --- 
a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -2,25 +2,29 @@ name: 'Run python test' description: 'Runs a Neon python test set, performing all the required preparations before' inputs: - # Select the type of Rust build. Must be "release" or "debug". build_type: + description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".' required: true rust_toolchain: + description: 'Rust toolchain version to fetch the caches' required: true - # This parameter is required, to prevent the mistake of running all tests in one job. test_selection: + description: 'A python test suite to run' required: true - # Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr extra_params: + description: 'Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr' required: false default: '' needs_postgres_source: + description: 'Set to true if the test suite requires postgres source checked out' required: false default: 'false' run_in_parallel: + description: 'Whether to run tests in parallel' required: false default: 'true' save_perf_report: + description: 'Whether to upload the performance report' required: false default: 'false' @@ -60,7 +64,7 @@ runs: - name: Run pytest env: - ZENITH_BIN: /tmp/neon/bin + NEON_BIN: /tmp/neon/bin POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install TEST_OUTPUT: /tmp/test_output # this variable will be embedded in perf test report @@ -117,3 +121,20 @@ runs: scripts/generate_and_push_perf_report.sh fi fi + + - name: Delete all data but logs + shell: bash -ex {0} + if: always() + run: | + du -sh /tmp/test_output/* + find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! 
-name "*.metrics" -delete + du -sh /tmp/test_output/* + + - name: Upload python test logs + if: always() + uses: actions/upload-artifact@v3 + with: + retention-days: 7 + if-no-files-found: error + name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs + path: /tmp/test_output/ diff --git a/.github/actions/save-coverage-data/action.yml b/.github/actions/save-coverage-data/action.yml new file mode 100644 index 0000000000..7b228f636f --- /dev/null +++ b/.github/actions/save-coverage-data/action.yml @@ -0,0 +1,17 @@ +name: 'Merge and upload coverage data' +description: 'Compresses and uploads the coverage data as an artifact' + +runs: + using: "composite" + steps: + - name: Merge coverage data + shell: bash -ex {0} + run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage/ merge + + - name: Upload coverage data + uses: actions/upload-artifact@v3 + with: + retention-days: 7 + if-no-files-found: error + name: coverage-data-artifact + path: /tmp/neon/coverage/ diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5f4dd754d2..7cbd1103c8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1,13 +1,33 @@ -name: build_and_test -on: [ push ] +name: Test + +on: + push: + branches: + - main + pull_request: + defaults: run: shell: bash -ex {0} +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + AWS_ACCESS_KEY_ID: ${{ secrets.CACHEPOT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY }} + CACHEPOT_BUCKET: zenith-rust-cachepot + RUSTC_WRAPPER: cachepot + + jobs: build-postgres: runs-on: [ self-hosted, Linux, k8s-runner ] strategy: + fail-fast: false matrix: build_type: [ debug, release ] rust_toolchain: [ 1.58 ] @@ -52,6 +72,7 @@ jobs: runs-on: [ self-hosted, Linux, k8s-runner ] needs: [ build-postgres ] strategy: + fail-fast: false matrix: build_type: [ debug, release ] rust_toolchain: [ 1.58 ] @@ -97,17 +118,11 @@ jobs: CARGO_FLAGS="--release --features profiling" fi - export CACHEPOT_BUCKET=zenith-rust-cachepot - export RUSTC_WRAPPER=cachepot - export AWS_ACCESS_KEY_ID="${{ secrets.AWS_ACCESS_KEY_ID }}" - export AWS_SECRET_ACCESS_KEY="${{ secrets.AWS_SECRET_ACCESS_KEY }}" - export HOME=/home/runner "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests cachepot -s - name: Run cargo test run: | - export HOME=/home/runner if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run) CARGO_FLAGS= @@ -115,12 +130,11 @@ jobs: cov_prefix=() CARGO_FLAGS=--release fi - + "${cov_prefix[@]}" cargo test $CARGO_FLAGS - name: Install rust binaries run: | - export HOME=/home/runner if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run) elif [[ $BUILD_TYPE == "release" ]]; then @@ -137,39 +151,34 @@ jobs: jq -r '.executable | select(. 
!= null)' ) - mkdir -p /tmp/neon/bin - mkdir -p /tmp/neon/test_bin - mkdir -p /tmp/neon/etc + mkdir -p /tmp/neon/bin/ + mkdir -p /tmp/neon/test_bin/ + mkdir -p /tmp/neon/etc/ + mkdir -p /tmp/neon/coverage/ # Install target binaries for bin in $binaries; do SRC=target/$BUILD_TYPE/$bin DST=/tmp/neon/bin/$bin - cp $SRC $DST - echo $DST >> /tmp/neon/etc/binaries.list + cp "$SRC" "$DST" done - # Install test executables (for code coverage) + # Install test executables and write list of all binaries (for code coverage) if [[ $BUILD_TYPE == "debug" ]]; then + for bin in $binaries; do + echo "/tmp/neon/bin/$bin" >> /tmp/neon/coverage/binaries.list + done for bin in $test_exe_paths; do SRC=$bin DST=/tmp/neon/test_bin/$(basename $bin) - cp $SRC $DST - echo $DST >> /tmp/neon/etc/binaries.list + cp "$SRC" "$DST" + echo "$DST" >> /tmp/neon/coverage/binaries.list done fi - name: Install postgres binaries run: cp -a tmp_install /tmp/neon/pg_install - - name: Merge coverage data - run: | - export HOME=/home/runner - # This will speed up workspace uploads - if [[ $BUILD_TYPE == "debug" ]]; then - scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage merge - fi - - name: Prepare neon artifact run: tar -C /tmp/neon/ -czf ./neon.tgz . @@ -181,38 +190,17 @@ jobs: name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact path: ./neon.tgz - check-codestyle-python: - runs-on: [ self-hosted, Linux, k8s-runner ] - strategy: - matrix: - rust_toolchain: [ 1.58 ] - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 1 + # XXX: keep this after the binaries.list is formed, so the coverage can properly work later + - name: Merge and upload coverage data + if: matrix.build_type == 'debug' + uses: ./.github/actions/save-coverage-data - - name: Cache poetry deps - id: cache_poetry - uses: actions/cache@v3 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps - run: ./scripts/pysync - - - name: Run yapf to ensure code format - run: poetry run yapf --recursive --diff . - - - name: Run mypy to check types - run: poetry run mypy . 
pg_regress-tests: runs-on: [ self-hosted, Linux, k8s-runner ] needs: [ build-neon ] strategy: + fail-fast: false matrix: build_type: [ debug, release ] rust_toolchain: [ 1.58 ] @@ -231,10 +219,15 @@ jobs: test_selection: batch_pg_regress needs_postgres_source: true + - name: Merge and upload coverage data + if: matrix.build_type == 'debug' + uses: ./.github/actions/save-coverage-data + other-tests: runs-on: [ self-hosted, Linux, k8s-runner ] needs: [ build-neon ] strategy: + fail-fast: false matrix: build_type: [ debug, release ] rust_toolchain: [ 1.58 ] @@ -252,10 +245,15 @@ jobs: rust_toolchain: ${{ matrix.rust_toolchain }} test_selection: batch_others + - name: Merge and upload coverage data + if: matrix.build_type == 'debug' + uses: ./.github/actions/save-coverage-data + benchmarks: runs-on: [ self-hosted, Linux, k8s-runner ] needs: [ build-neon ] strategy: + fail-fast: false matrix: build_type: [ release ] rust_toolchain: [ 1.58 ] @@ -273,4 +271,107 @@ jobs: rust_toolchain: ${{ matrix.rust_toolchain }} test_selection: performance run_in_parallel: false - # save_perf_report: true + save_perf_report: true + # XXX: no coverage data handling here, since benchmarks are run on release builds, + # while coverage is currently collected for the debug ones + + coverage-report: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ other-tests, pg_regress-tests ] + strategy: + fail-fast: false + matrix: + build_type: [ debug ] + rust_toolchain: [ 1.58 ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Get Neon artifact for restoration + uses: actions/download-artifact@v3 + with: + name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact + path: ./neon-artifact/ + + - name: Extract Neon artifact + run: | + mkdir -p /tmp/neon/ + tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/ + rm -rf ./neon-artifact/ + + - name: Restore coverage data + uses: actions/download-artifact@v3 + with: + name: coverage-data-artifact + path: /tmp/neon/coverage/ + + - name: Build and upload coverage report + run: | + COMMIT_SHA=${{ github.event.pull_request.head.sha }} + COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA + + scripts/coverage \ + --dir=/tmp/neon/coverage report \ + --input-objects=/tmp/neon/coverage/binaries.list \ + --commit-url=$COMMIT_URL \ + --format=github + + REPORT_URL=https://${{ github.repository_owner }}.github.io/neon-coverage-data/$COMMIT_SHA + + scripts/git-upload \ + --repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/${{ github.repository_owner }}/neon-coverage-data.git \ + --message="Add code coverage for $COMMIT_URL" \ + copy /tmp/neon/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE + + # Add link to the coverage report to the commit + curl -f -X POST \ + https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"state\": \"success\", + \"context\": \"neon-coverage\", + \"description\": \"Coverage report is ready\", + \"target_url\": \"$REPORT_URL\" + }" + + trigger-e2e-tests: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ build-neon ] + steps: + - name: Set PR's status to pending and request a remote CI test + run: | + COMMIT_SHA=${{ github.event.pull_request.head.sha }} + COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + + REMOTE_REPO="${{ github.repository_owner }}/cloud" + + curl -f -X 
POST \ + https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"state\": \"pending\", + \"context\": \"neon-cloud-e2e\", + \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" + }" + + curl -f -X POST \ + https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"ref\": \"main\", + \"inputs\": { + \"ci_job_name\": \"neon-cloud-e2e\", + \"commit_hash\": \"$COMMIT_SHA\", + \"remote_repo\": \"${{ github.repository }}\" + } + }" diff --git a/.github/workflows/testing.yml b/.github/workflows/codestyle.yml similarity index 73% rename from .github/workflows/testing.yml rename to .github/workflows/codestyle.yml index aa1e152fb2..292c2c903b 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/codestyle.yml @@ -1,4 +1,4 @@ -name: Build and Test +name: Check code style and build on: push: @@ -6,9 +6,21 @@ on: - main pull_request: +defaults: + run: + shell: bash -ex {0} + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + RUST_BACKTRACE: 1 + jobs: - regression-check: + check-codestyle-rust: strategy: + fail-fast: false matrix: # If we want to duplicate this job for different # Rust toolchains (e.g. nightly or 1.37.0), add them here. @@ -92,5 +104,30 @@ jobs: - name: Run cargo clippy run: ./run_clippy.sh - - name: Run cargo test - run: cargo test --all --all-targets + - name: Ensure all project builds + run: cargo build --all --all-targets + + check-codestyle-python: + runs-on: [ self-hosted, Linux, k8s-runner ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + fetch-depth: 1 + + - name: Cache poetry deps + id: cache_poetry + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + run: ./scripts/pysync + + - name: Run yapf to ensure code format + run: poetry run yapf --recursive --diff . + + - name: Run mypy to check types + run: poetry run mypy . 
diff --git a/Dockerfile b/Dockerfile index 34f5282c2c..ad85638af3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -46,9 +46,9 @@ RUN set -e \ && useradd -d /data zenith \ && chown -R zenith:zenith /data -COPY --from=build --chown=zenith:zenith /home/runner/project/target/release/pageserver /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/runner/project/target/release/safekeeper /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/runner/project/target/release/proxy /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/runner/target/release/pageserver /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/runner/target/release/safekeeper /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/runner/target/release/proxy /usr/local/bin COPY --from=pg-build /pg/tmp_install/ /usr/local/ COPY --from=pg-build /postgres_install.tar.gz /data/ diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 1e7e20eae0..71770ae9ed 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -15,4 +15,4 @@ RUN set -e \ # Final image that only has one binary FROM debian:buster-slim -COPY --from=rust-build /home/runner/project/target/release/compute_ctl /usr/local/bin/compute_ctl +COPY --from=rust-build /home/runner/target/release/compute_ctl /usr/local/bin/compute_ctl diff --git a/test_runner/README.md b/test_runner/README.md index f95588462b..4b54c45175 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -45,7 +45,7 @@ If you want to run all tests that have the string "bench" in their names: Useful environment variables: -`ZENITH_BIN`: The directory where zenith binaries can be found. +`NEON_BIN`: The directory where neon binaries can be found. `POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found. `TEST_OUTPUT`: Set the directory where test state and test output files should go. diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 8a2748b880..b0ba8758cc 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -1,5 +1,5 @@ # It's possible to run any regular test with the local fs remote storage via -# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/zenith_zzz/'}" poetry ...... +# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... import shutil, os from contextlib import closing diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 12edcb8792..7506641fcb 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -50,7 +50,7 @@ A fixture is created with the decorator @pytest.fixture decorator. See docs: https://docs.pytest.org/en/6.2.x/fixture.html There are several environment variables that can control the running of tests: -ZENITH_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information. +NEON_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information. There's no need to import this file to use it. It should be declared as a plugin inside conftest.py, and that makes it available to all tests. @@ -151,7 +151,7 @@ def pytest_configure(config): return # Find the neon binaries. 
global neon_binpath - env_neon_bin = os.environ.get('ZENITH_BIN') + env_neon_bin = os.environ.get('NEON_BIN') if env_neon_bin: neon_binpath = env_neon_bin else: From dd61f3558fbb8f5d3d13011a32ed684728caab25 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 27 Jun 2022 20:41:09 +0300 Subject: [PATCH 0447/1022] Fix coverage upload credentials retrieval (#2001) --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7cbd1103c8..9391b29333 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -323,7 +323,7 @@ jobs: REPORT_URL=https://${{ github.repository_owner }}.github.io/neon-coverage-data/$COMMIT_SHA scripts/git-upload \ - --repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/${{ github.repository_owner }}/neon-coverage-data.git \ + --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/neon-coverage-data.git \ --message="Add code coverage for $COMMIT_URL" \ copy /tmp/neon/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE From 4a05413a4c5ada72cb1bb99809bc4cdea482165d Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 27 Jun 2022 22:40:20 +0300 Subject: [PATCH 0448/1022] More code coverage fixes in GH Actions (#2002) --- .github/workflows/build_and_test.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9391b29333..2508c32bff 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -290,6 +290,16 @@ jobs: submodules: true fetch-depth: 1 + - name: Restore cargo deps cache + id: cache_cargo + uses: actions/cache@v3 + with: + path: | + ~/.cargo/registry/ + ~/.cargo/git/ + target/ + key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + - name: Get Neon artifact for restoration uses: actions/download-artifact@v3 with: @@ -320,10 +330,10 @@ jobs: --commit-url=$COMMIT_URL \ --format=github - REPORT_URL=https://${{ github.repository_owner }}.github.io/neon-coverage-data/$COMMIT_SHA + REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA scripts/git-upload \ - --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/neon-coverage-data.git \ + --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \ --message="Add code coverage for $COMMIT_URL" \ copy /tmp/neon/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE From cef90d9220f15bebb72a0be83f9cce7940e72a6c Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 29 Jun 2022 17:56:02 +0300 Subject: [PATCH 0449/1022] Disable cachepot for GH Actions builds (#2007) --- .github/workflows/build_and_test.yml | 11 ++++------- .github/workflows/codestyle.yml | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2508c32bff..40a305a468 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -17,11 +17,6 @@ concurrency: env: RUST_BACKTRACE: 1 COPT: '-Werror' - AWS_ACCESS_KEY_ID: ${{ secrets.CACHEPOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY }} - CACHEPOT_BUCKET: zenith-rust-cachepot - RUSTC_WRAPPER: cachepot - 
jobs: build-postgres: @@ -106,7 +101,10 @@ jobs: ~/.cargo/registry/ ~/.cargo/git/ target/ - key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + # Fall back to older versions of the key, if no cache for current Cargo.lock was found + key: | + v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- - name: Run cargo build run: | @@ -119,7 +117,6 @@ jobs: fi "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests - cachepot -s - name: Run cargo test run: | diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 292c2c903b..2b8a01e94e 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -26,7 +26,7 @@ jobs: # Rust toolchains (e.g. nightly or 1.37.0), add them here. rust_toolchain: [1.58] os: [ubuntu-latest, macos-latest] - timeout-minutes: 30 + timeout-minutes: 50 name: run regression test suite runs-on: ${{ matrix.os }} From 5ee19b075855b1fb0aecbabbd0fe8d1385bcfbb3 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 29 Jun 2022 17:59:19 +0300 Subject: [PATCH 0450/1022] Fix bloated coverage uploads (#2005) Move coverage data to a better directory, merge it better and don't publish it from CircleCI pipeline --- .circleci/config.yml | 110 ++---------------- .../actions/run-python-test-set/action.yml | 2 +- .github/actions/save-coverage-data/action.yml | 4 +- .github/workflows/build_and_test.yml | 27 +++-- 4 files changed, 26 insertions(+), 117 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 61f551cd03..f64ba94cb4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -100,10 +100,8 @@ jobs: name: Rust build << parameters.build_type >> command: | if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) CARGO_FLAGS= elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() CARGO_FLAGS="--release --features profiling" fi @@ -112,7 +110,7 @@ jobs: export RUSTC_WRAPPER=cachepot export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" - "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests + mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests cachepot -s - save_cache: @@ -128,32 +126,24 @@ jobs: name: cargo test command: | if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) CARGO_FLAGS= elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() CARGO_FLAGS=--release fi - "${cov_prefix[@]}" cargo test $CARGO_FLAGS + cargo test $CARGO_FLAGS # Install the rust binaries, for use by test jobs - run: name: Install rust binaries command: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - fi - binaries=$( - "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps | + cargo metadata --format-version=1 --no-deps | jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' ) test_exe_paths=$( - "${cov_prefix[@]}" cargo test --message-format=json --no-run | + cargo test --message-format=json --no-run | jq -r '.executable | select(. 
!= null)' ) @@ -166,34 +156,15 @@ jobs: SRC=target/$BUILD_TYPE/$bin DST=/tmp/zenith/bin/$bin cp $SRC $DST - echo $DST >> /tmp/zenith/etc/binaries.list done - # Install test executables (for code coverage) - if [[ $BUILD_TYPE == "debug" ]]; then - for bin in $test_exe_paths; do - SRC=$bin - DST=/tmp/zenith/test_bin/$(basename $bin) - cp $SRC $DST - echo $DST >> /tmp/zenith/etc/binaries.list - done - fi - # Install the postgres binaries, for use by test jobs - run: name: Install postgres binaries command: | cp -a tmp_install /tmp/zenith/pg_install - - run: - name: Merge coverage data - command: | - # This will speed up workspace uploads - if [[ $BUILD_TYPE == "debug" ]]; then - scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge - fi - - # Save the rust binaries and coverage data for other jobs in this workflow. + # Save rust binaries for other jobs in the workflow - persist_to_workspace: root: /tmp/zenith paths: @@ -314,12 +285,6 @@ jobs: export GITHUB_SHA=$CIRCLE_SHA1 - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - fi - # Run the tests. # # The junit.xml file allows CircleCI to display more fine-grained test information @@ -330,7 +295,7 @@ jobs: # -n4 uses four processes to run tests via pytest-xdist # -s is not used to prevent pytest from capturing output, because tests are running # in parallel and logs are mixed between different tests - "${cov_prefix[@]}" ./scripts/pytest \ + ./scripts/pytest \ --junitxml=$TEST_OUTPUT/junit.xml \ --tb=short \ --verbose \ @@ -359,67 +324,12 @@ jobs: # The store_test_results step tells CircleCI where to find the junit.xml file. - store_test_results: path: /tmp/test_output - - run: - name: Merge coverage data - command: | - # This will speed up workspace uploads - if [[ $BUILD_TYPE == "debug" ]]; then - scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge - fi - # Save coverage data (if any) + # Save data (if any) - persist_to_workspace: root: /tmp/zenith paths: - "*" - coverage-report: - executor: neon-xlarge-executor - steps: - - attach_workspace: - at: /tmp/zenith - - checkout - - restore_cache: - name: Restore rust cache - keys: - # Require an exact match. While an out of date cache might speed up the build, - # there's no way to clean out old packages, so the cache grows every time something - # changes. 
- - v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }} - - run: - name: Build coverage report - command: | - COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1 - - scripts/coverage \ - --dir=/tmp/zenith/coverage report \ - --input-objects=/tmp/zenith/etc/binaries.list \ - --commit-url=$COMMIT_URL \ - --format=github - - run: - name: Upload coverage report - command: | - LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME - REPORT_URL=https://neondatabase.github.io/zenith-coverage-data/$CIRCLE_SHA1 - COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1 - - scripts/git-upload \ - --repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/neondatabase/zenith-coverage-data.git \ - --message="Add code coverage for $COMMIT_URL" \ - copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE - - # Add link to the coverage report to the commit - curl -f -X POST \ - https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \ - -H "Accept: application/vnd.github.v3+json" \ - --user "$CI_ACCESS_TOKEN" \ - --data \ - "{ - \"state\": \"success\", - \"context\": \"zenith-coverage\", - \"description\": \"Coverage report is ready\", - \"target_url\": \"$REPORT_URL\" - }" - # Build neondatabase/neon:latest image and push it to Docker hub docker-image: docker: @@ -730,12 +640,6 @@ workflows: save_perf_report: true requires: - build-neon-release - - coverage-report: - # Context passes credentials for gh api - context: CI_ACCESS_TOKEN - requires: - # TODO: consider adding more - - other-tests-debug - docker-image: # Context gives an ability to login context: Docker Hub diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 4831cdaed1..48c0c2b925 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -92,7 +92,7 @@ runs: fi if [[ "${{ inputs.build_type }}" == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run) + cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) elif [[ "${{ inputs.build_type }}" == "release" ]]; then cov_prefix=() fi diff --git a/.github/actions/save-coverage-data/action.yml b/.github/actions/save-coverage-data/action.yml index 7b228f636f..7ad04cf1fe 100644 --- a/.github/actions/save-coverage-data/action.yml +++ b/.github/actions/save-coverage-data/action.yml @@ -6,7 +6,7 @@ runs: steps: - name: Merge coverage data shell: bash -ex {0} - run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage/ merge + run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge - name: Upload coverage data uses: actions/upload-artifact@v3 @@ -14,4 +14,4 @@ runs: retention-days: 7 if-no-files-found: error name: coverage-data-artifact - path: /tmp/neon/coverage/ + path: /tmp/coverage/ diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 40a305a468..81b4585714 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -49,7 +49,7 @@ jobs: - name: Build postgres if: steps.cache_pg.outputs.cache-hit != 'true' - run: COPT='-Werror' mold -run make postgres -j$(nproc) + run: mold -run make postgres -j$(nproc) # actions/cache@v3 does not allow concurrently using the same cache across job steps, so use a separate cache - name: Prepare postgres artifact @@ -109,7 +109,7 @@ jobs: - name: Run cargo build run: | if [[ $BUILD_TYPE == "debug" ]]; then - 
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run) + cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) CARGO_FLAGS= elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix=() @@ -121,7 +121,7 @@ jobs: - name: Run cargo test run: | if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run) + cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) CARGO_FLAGS= elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix=() @@ -133,7 +133,7 @@ jobs: - name: Install rust binaries run: | if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run) + cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix=() fi @@ -151,7 +151,9 @@ jobs: mkdir -p /tmp/neon/bin/ mkdir -p /tmp/neon/test_bin/ mkdir -p /tmp/neon/etc/ - mkdir -p /tmp/neon/coverage/ + + # Keep bloated coverage data files away from the rest of the artifact + mkdir -p /tmp/coverage/ # Install target binaries for bin in $binaries; do @@ -163,13 +165,13 @@ jobs: # Install test executables and write list of all binaries (for code coverage) if [[ $BUILD_TYPE == "debug" ]]; then for bin in $binaries; do - echo "/tmp/neon/bin/$bin" >> /tmp/neon/coverage/binaries.list + echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list done for bin in $test_exe_paths; do SRC=$bin DST=/tmp/neon/test_bin/$(basename $bin) cp "$SRC" "$DST" - echo "$DST" >> /tmp/neon/coverage/binaries.list + echo "$DST" >> /tmp/coverage/binaries.list done fi @@ -313,7 +315,10 @@ jobs: uses: actions/download-artifact@v3 with: name: coverage-data-artifact - path: /tmp/neon/coverage/ + path: /tmp/coverage/ + + - name: Merge coverage data + run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge - name: Build and upload coverage report run: | @@ -322,8 +327,8 @@ jobs: COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA scripts/coverage \ - --dir=/tmp/neon/coverage report \ - --input-objects=/tmp/neon/coverage/binaries.list \ + --dir=/tmp/coverage report \ + --input-objects=/tmp/coverage/binaries.list \ --commit-url=$COMMIT_URL \ --format=github @@ -332,7 +337,7 @@ jobs: scripts/git-upload \ --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \ --message="Add code coverage for $COMMIT_URL" \ - copy /tmp/neon/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE + copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE # Add link to the coverage report to the commit curl -f -X POST \ From 1d0706cf250b16f2c5052f446807b83fd41013fe Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 24 Jun 2022 21:26:53 +0300 Subject: [PATCH 0451/1022] Fix walreceiver connection selection mechanism * Avoid reconnecting to safekeeper immediately after its failure by limiting candidates to those with fewest connection attempts. Thus we don't have to wait lagging_wal_timeout (10s by default) before switch happens even if no new changes are generated, and current test_restarts_under_load expects some commits to happen within 4s. * Make default max_lsn_wal_lag larger, otherwise we constant reconnections happen during normal work. * Fix wal_connection_attempts maintanance, preventing busy loop of reconnections. 
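
For illustration, the selection rule described above amounts to roughly the
following sketch (illustrative only; the function name and the plain-integer
tuples stand in for the real NodeId/Lsn types and are not code from this patch):

    // Keep only the candidates with the fewest recorded connection attempts,
    // then pick the one with the greatest commit_lsn among those.
    fn pick_candidate(candidates: &[(u64, u64, u32)]) -> Option<u64> {
        // Each tuple is (safekeeper_id, commit_lsn, connection_attempts).
        let min_attempts = candidates.iter().map(|&(_, _, attempts)| attempts).min()?;
        candidates
            .iter()
            .filter(|&&(_, _, attempts)| attempts == min_attempts)
            .max_by_key(|&&(_, commit_lsn, _)| commit_lsn)
            .map(|&(sk_id, _, _)| sk_id)
    }

A candidate's attempt counter is only reset once replication feedback confirms
both nodes are connected, which is what keeps the pageserver from immediately
bouncing back to a safekeeper that just failed.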
--- pageserver/src/tenant_config.rs | 2 +- pageserver/src/walreceiver.rs | 29 ++- .../src/walreceiver/connection_manager.rs | 178 +++++++++++++----- 3 files changed, 160 insertions(+), 49 deletions(-) diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 1722c1a13a..8811009743 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -37,7 +37,7 @@ pub mod defaults { pub const DEFAULT_PITR_INTERVAL: &str = "30 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; - pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10_000; + pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; } /// Per-tenant configuration options diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index fd9468a101..2b5a3123c1 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -178,7 +178,7 @@ async fn shutdown_all_wal_connections( /// That may lead to certain events not being observed by the listener. #[derive(Debug)] struct TaskHandle { - handle: JoinHandle<()>, + handle: JoinHandle>, events_receiver: watch::Receiver>, cancellation: watch::Sender<()>, } @@ -205,8 +205,8 @@ impl TaskHandle { let sender = Arc::clone(&events_sender); let handle = tokio::task::spawn(async move { - let task_result = task(sender, cancellation_receiver).await; - events_sender.send(TaskEvent::End(task_result)).ok(); + events_sender.send(TaskEvent::Started).ok(); + task(sender, cancellation_receiver).await }); TaskHandle { @@ -216,6 +216,16 @@ impl TaskHandle { } } + async fn next_task_event(&mut self) -> TaskEvent { + select! { + next_task_event = self.events_receiver.changed() => match next_task_event { + Ok(()) => self.events_receiver.borrow().clone(), + Err(_task_channel_part_dropped) => join_on_handle(&mut self.handle).await, + }, + task_completion_result = join_on_handle(&mut self.handle) => task_completion_result, + } + } + /// Aborts current task, waiting for it to finish. async fn shutdown(self) { self.cancellation.send(()).ok(); @@ -225,6 +235,19 @@ impl TaskHandle { } } +async fn join_on_handle(handle: &mut JoinHandle>) -> TaskEvent { + match handle.await { + Ok(task_result) => TaskEvent::End(task_result), + Err(e) => { + if e.is_cancelled() { + TaskEvent::End(Ok(())) + } else { + TaskEvent::End(Err(format!("WAL receiver task panicked: {e}"))) + } + } + } +} + /// A step to process timeline attach/detach events to enable/disable the corresponding WAL receiver machinery. /// In addition to WAL streaming management, the step ensures that corresponding tenant has its service threads enabled or disabled. /// This is done here, since only walreceiver knows when a certain tenant has no streaming enabled. 
diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index d5ca1d5159..614bca50ad 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -104,49 +104,29 @@ async fn connection_manager_loop_step( Some(wal_connection_update) = async { match walreceiver_state.wal_connection.as_mut() { - Some(wal_connection) => { - let receiver = &mut wal_connection.connection_task.events_receiver; - Some(match receiver.changed().await { - Ok(()) => receiver.borrow().clone(), - Err(_cancellation_error) => TaskEvent::End(Ok(())), - }) - } + Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await), None => None, } } => { - let (connection_update, reset_connection_attempts) = match &wal_connection_update { - TaskEvent::Started => (Some(Utc::now().naive_utc()), true), - TaskEvent::NewEvent(replication_feedback) => (Some(DateTime::::from(replication_feedback.ps_replytime).naive_utc()), true), + let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! guard"); + match &wal_connection_update { + TaskEvent::Started => { + wal_connection.latest_connection_update = Utc::now().naive_utc(); + *walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0) += 1; + }, + TaskEvent::NewEvent(replication_feedback) => { + wal_connection.latest_connection_update = DateTime::::from(replication_feedback.ps_replytime).naive_utc(); + // reset connection attempts here only, the only place where both nodes + // explicitly confirmn with replication feedback that they are connected to each other + walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id); + }, TaskEvent::End(end_result) => { - let should_reset_connection_attempts = match end_result { - Ok(()) => { - debug!("WAL receiving task finished"); - true - }, - Err(e) => { - warn!("WAL receiving task failed: {e}"); - false - }, + match end_result { + Ok(()) => debug!("WAL receiving task finished"), + Err(e) => warn!("WAL receiving task failed: {e}"), }; walreceiver_state.wal_connection = None; - (None, should_reset_connection_attempts) }, - }; - - if let Some(connection_update) = connection_update { - match &mut walreceiver_state.wal_connection { - Some(wal_connection) => { - wal_connection.latest_connection_update = connection_update; - - let attempts_entry = walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0); - if reset_connection_attempts { - *attempts_entry = 0; - } else { - *attempts_entry += 1; - } - }, - None => error!("Received connection update for WAL connection that is not active, update: {wal_connection_update:?}"), - } } }, @@ -406,10 +386,8 @@ impl WalreceiverState { Some(existing_wal_connection) => { let connected_sk_node = existing_wal_connection.sk_id; - let (new_sk_id, new_safekeeper_etcd_data, new_wal_producer_connstr) = self - .applicable_connection_candidates() - .filter(|&(sk_id, _, _)| sk_id != connected_sk_node) - .max_by_key(|(_, info, _)| info.commit_lsn)?; + let (new_sk_id, new_safekeeper_etcd_data, new_wal_producer_connstr) = + self.select_connection_candidate(Some(connected_sk_node))?; let now = Utc::now().naive_utc(); if let Ok(latest_interaciton) = @@ -462,9 +440,8 @@ impl WalreceiverState { } } None => { - let (new_sk_id, _, new_wal_producer_connstr) = self - .applicable_connection_candidates() - .max_by_key(|(_, info, _)| info.commit_lsn)?; + let 
(new_sk_id, _, new_wal_producer_connstr) = + self.select_connection_candidate(None)?; return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_producer_connstr: new_wal_producer_connstr, @@ -476,6 +453,49 @@ impl WalreceiverState { None } + /// Selects the best possible candidate, based on the data collected from etcd updates about the safekeepers. + /// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another. + /// + /// The candidate that is chosen: + /// * has fewest connection attempts from pageserver to safekeeper node (reset every time the WAL replication feedback is sent) + /// * has greatest data Lsn among the ones that are left + /// + /// NOTE: + /// We evict timeline data received from etcd based on time passed since it was registered, along with its connection attempts values, but + /// otherwise to reset the connection attempts, a successful connection to that node is needed. + /// That won't happen now, before all nodes with less connection attempts are connected to first, which might leave the sk node with more advanced state to be ignored. + fn select_connection_candidate( + &self, + node_to_omit: Option, + ) -> Option<(NodeId, &SkTimelineInfo, String)> { + let all_candidates = self + .applicable_connection_candidates() + .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) + .collect::>(); + + let smallest_attempts_allowed = all_candidates + .iter() + .map(|(sk_id, _, _)| { + self.wal_connection_attempts + .get(sk_id) + .copied() + .unwrap_or(0) + }) + .min()?; + + all_candidates + .into_iter() + .filter(|(sk_id, _, _)| { + smallest_attempts_allowed + >= self + .wal_connection_attempts + .get(sk_id) + .copied() + .unwrap_or(0) + }) + .max_by_key(|(_, info, _)| info.commit_lsn) + } + fn applicable_connection_candidates( &self, ) -> impl Iterator { @@ -500,15 +520,25 @@ impl WalreceiverState { } fn cleanup_old_candidates(&mut self) { - self.wal_stream_candidates.retain(|_, etcd_info| { + let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len()); + + self.wal_stream_candidates.retain(|node_id, etcd_info| { if let Ok(time_since_latest_etcd_update) = (Utc::now().naive_utc() - etcd_info.latest_update).to_std() { - time_since_latest_etcd_update < self.lagging_wal_timeout + let should_retain = time_since_latest_etcd_update < self.lagging_wal_timeout; + if !should_retain { + node_ids_to_remove.push(*node_id); + } + should_retain } else { true } }); + + for node_id in node_ids_to_remove { + self.wal_connection_attempts.remove(&node_id); + } } } @@ -843,6 +873,64 @@ mod tests { Ok(()) } + #[tokio::test] + async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { + let harness = RepoHarness::create("candidate_with_many_connection_failures")?; + let mut state = dummy_state(&harness); + let now = Utc::now().naive_utc(); + + let current_lsn = Lsn(100_000).align(); + let bigger_lsn = Lsn(current_lsn.0 + 100).align(); + + state.wal_connection = None; + state.wal_stream_candidates = HashMap::from([ + ( + NodeId(0), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(bigger_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(1), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(current_lsn), + backup_lsn: None, + 
remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ]); + state.wal_connection_attempts = HashMap::from([(NodeId(0), 1), (NodeId(1), 0)]); + + let candidate_with_less_errors = state + .next_connection_candidate() + .expect("Expected one candidate selected, but got none"); + assert_eq!( + candidate_with_less_errors.safekeeper_id, + NodeId(1), + "Should select the node with less connection errors" + ); + + Ok(()) + } + #[tokio::test] async fn connection_no_etcd_data_candidate() -> anyhow::Result<()> { let harness = RepoHarness::create("connection_no_etcd_data_candidate")?; From 00fc696606a235e9517344d58f6804bdd3eae76e Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 27 Jun 2022 19:08:03 +0300 Subject: [PATCH 0452/1022] replace extra urlencode dependency with already present url library --- Cargo.lock | 8 +------- compute_tools/Cargo.toml | 2 +- compute_tools/src/bin/compute_ctl.rs | 5 +++-- compute_tools/src/checker.rs | 7 ++----- compute_tools/src/compute.rs | 15 ++++++++++----- compute_tools/src/monitor.rs | 10 +++++----- compute_tools/src/spec.rs | 9 +++++---- 7 files changed, 27 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ef1b7327c5..e812ce7eab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -461,7 +461,7 @@ dependencies = [ "tar", "tokio", "tokio-postgres", - "urlencoding", + "url", "workspace_hack", ] @@ -3685,12 +3685,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "urlencoding" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b90931029ab9b034b300b797048cf23723400aa757e8a2bfb9d748102f9821" - [[package]] name = "utils" version = "0.1.0" diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index a47f9998e6..1022438c2e 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -18,5 +18,5 @@ serde_json = "1" tar = "0.4" tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -urlencoding = "2.1.0" +url = "2.2.2" workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index ba116af11b..f535adfd87 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -33,7 +33,7 @@ use std::process::exit; use std::sync::{Arc, RwLock}; use std::{thread, time::Duration}; -use anyhow::Result; +use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; use log::{error, info}; @@ -45,6 +45,7 @@ use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::pg_helpers::*; use compute_tools::spec::*; +use url::Url; fn main() -> Result<()> { // TODO: re-use `utils::logging` later @@ -131,7 +132,7 @@ fn main() -> Result<()> { let compute_state = ComputeNode { start_time: Utc::now(), - connstr: connstr.to_string(), + connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), spec, diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index dbb70a74cf..b6ba1692f9 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,5 +1,3 @@ -use std::sync::Arc; - use anyhow::{anyhow, Result}; use log::error; use postgres::Client; @@ -23,9 +21,8 @@ pub fn 
create_writablity_check_data(client: &mut Client) -> Result<()> { Ok(()) } -pub async fn check_writability(compute: &Arc) -> Result<()> { - let connstr = &compute.connstr; - let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?; +pub async fn check_writability(compute: &ComputeNode) -> Result<()> { + let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?; if client.is_closed() { return Err(anyhow!("connection to postgres closed")); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index abf7081cb7..8bcaf5494a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -35,7 +35,8 @@ use crate::spec::*; /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { pub start_time: DateTime, - pub connstr: String, + // Url type maintains proper escaping + pub connstr: url::Url, pub pgdata: String, pub pgbin: String, pub spec: ComputeSpec, @@ -268,21 +269,25 @@ impl ComputeNode { // In this case we need to connect with old `zenith_admin`name // and create new user. We cannot simply rename connected user, // but we can create a new one and grant it all privileges. - let mut client = match Client::connect(&self.connstr, NoTls) { + let mut client = match Client::connect(self.connstr.as_str(), NoTls) { Err(e) => { info!( "cannot connect to postgres: {}, retrying with `zenith_admin` username", e ); - let zenith_admin_connstr = self.connstr.replacen("cloud_admin", "zenith_admin", 1); + let mut zenith_admin_connstr = self.connstr.clone(); - let mut client = Client::connect(&zenith_admin_connstr, NoTls)?; + zenith_admin_connstr + .set_username("zenith_admin") + .map_err(|_| anyhow::anyhow!("invalid connstr"))?; + + let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?; client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; client.simple_query("GRANT zenith_admin TO cloud_admin")?; drop(client); // reconnect with connsting with expected name - Client::connect(&self.connstr, NoTls)? + Client::connect(self.connstr.as_str(), NoTls)? } Ok(client) => client, }; diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 041b4875bd..58cdf796bc 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -13,11 +13,11 @@ const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds // Spin in a loop and figure out the last activity time in the Postgres. // Then update it in the shared state. This function never errors out. // XXX: the only expected panic is at `RwLock` unwrap(). -fn watch_compute_activity(compute: &Arc) { +fn watch_compute_activity(compute: &ComputeNode) { // Suppose that `connstr` doesn't change - let connstr = compute.connstr.clone(); + let connstr = compute.connstr.as_str(); // Define `client` outside of the loop to reuse existing connection if it's active. - let mut client = Client::connect(&connstr, NoTls); + let mut client = Client::connect(connstr, NoTls); let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL); info!("watching Postgres activity at {}", connstr); @@ -32,7 +32,7 @@ fn watch_compute_activity(compute: &Arc) { info!("connection to postgres closed, trying to reconnect"); // Connection is closed, reconnect and try again. - client = Client::connect(&connstr, NoTls); + client = Client::connect(connstr, NoTls); continue; } @@ -93,7 +93,7 @@ fn watch_compute_activity(compute: &Arc) { debug!("cannot connect to postgres: {}, retrying", e); // Establish a new connection and try again. 
- client = Client::connect(&connstr, NoTls); + client = Client::connect(connstr, NoTls); } } } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index d2cfb6d726..041f42acde 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -4,7 +4,6 @@ use anyhow::Result; use log::{info, log_enabled, warn, Level}; use postgres::{Client, NoTls}; use serde::Deserialize; -use urlencoding::encode; use crate::compute::ComputeNode; use crate::config; @@ -231,9 +230,11 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result< fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> { for db in &node.spec.cluster.databases { if db.owner != *role_name { - let db_name_encoded = format!("/{}", encode(&db.name)); - let db_connstr = node.connstr.replacen("/postgres", &db_name_encoded, 1); - let mut client = Client::connect(&db_connstr, NoTls)?; + let mut connstr = node.connstr.clone(); + // database name is always the last and the only component of the path + connstr.set_path(&db.name); + + let mut client = Client::connect(connstr.as_str(), NoTls)?; // This will reassign all dependent objects to the db owner let reassign_query = format!( From f09c09438a307f88eeb6bbaf85069022623e35d6 Mon Sep 17 00:00:00 2001 From: Bojan Serafimov Date: Thu, 30 Jun 2022 19:00:45 -0400 Subject: [PATCH 0453/1022] Fix gc after import --- pageserver/src/layered_repository.rs | 3 +++ pageserver/src/page_service.rs | 3 ++- pageserver/src/pgdatadir_mapping.rs | 3 ++- test_runner/batch_others/test_import.py | 5 +++++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index fdd03ecf8b..af6c7ba9fd 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -2210,6 +2210,9 @@ impl LayeredTimeline { LsnForTimestamp::Past(lsn) => { debug!("past({})", lsn); } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + } } debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 77c320a181..22002fdbab 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -554,7 +554,7 @@ impl PageServerHandler { // Create empty timeline info!("creating new timeline"); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - let timeline = repo.create_empty_timeline(timeline_id, Lsn(0))?; + let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?; let repartition_distance = repo.get_checkpoint_distance(); let mut datadir_timeline = DatadirTimeline::::new(timeline, repartition_distance); @@ -1151,6 +1151,7 @@ impl postgres_backend::Handler for PageServerHandler { LsnForTimestamp::Present(lsn) => format!("{}", lsn), LsnForTimestamp::Future(_lsn) => "future".into(), LsnForTimestamp::Past(_lsn) => "past".into(), + LsnForTimestamp::NoData(_lsn) => "nodata".into(), }; pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ce305a55f4..9dbae74074 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -51,6 +51,7 @@ pub enum LsnForTimestamp { Present(Lsn), Future(Lsn), Past(Lsn), + NoData(Lsn), } impl DatadirTimeline { @@ -263,7 +264,7 @@ impl DatadirTimeline { (false, false) => { // This can happen if no commit records have 
been processed yet, e.g. // just after importing a cluster. - bail!("no commit timestamps found"); + Ok(LsnForTimestamp::NoData(max_lsn)) } (true, false) => { // Didn't find any commit timestamps larger than the request diff --git a/test_runner/batch_others/test_import.py b/test_runner/batch_others/test_import.py index e478103313..63dc42ee3e 100644 --- a/test_runner/batch_others/test_import.py +++ b/test_runner/batch_others/test_import.py @@ -191,3 +191,8 @@ def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_bu # Check it's the same as the first fullbackup # TODO pageserver should be checking checksum assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file) + + # Check that gc works + psconn = env.pageserver.connect() + pscur = psconn.cursor() + pscur.execute(f"do_gc {tenant.hex} {timeline} 0") From cadaca010cba2337fa7699bd7fb29429d8964a3b Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 1 Jul 2022 13:42:56 +0400 Subject: [PATCH 0454/1022] Make ansible to work with storage nodes through teleport from local box. --- .circleci/ansible/ansible.cfg | 4 +++- .circleci/ansible/ansible.ssh.cfg | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.circleci/ansible/ansible.cfg b/.circleci/ansible/ansible.cfg index e3daf3abe3..5818a64455 100644 --- a/.circleci/ansible/ansible.cfg +++ b/.circleci/ansible/ansible.cfg @@ -6,5 +6,7 @@ timeout = 30 [ssh_connection] ssh_args = -F ./ansible.ssh.cfg -scp_if_ssh = True +# teleport doesn't support sftp yet https://github.com/gravitational/teleport/issues/7127 +# and scp neither worked for me +transfer_method = piped pipelining = True diff --git a/.circleci/ansible/ansible.ssh.cfg b/.circleci/ansible/ansible.ssh.cfg index 91f673718e..55970f8c2b 100644 --- a/.circleci/ansible/ansible.ssh.cfg +++ b/.circleci/ansible/ansible.ssh.cfg @@ -1,3 +1,6 @@ +# Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed +PubkeyAcceptedAlgorithms +ssh-rsa-cert-v01@openssh.com + Host tele.zenith.tech User admin Port 3023 From 97fed38213cfb4cf8a7fd0ebb22c48137f307428 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 1 Jul 2022 19:01:17 +0400 Subject: [PATCH 0455/1022] Fix cadaca010c for older ssh clients. --- .circleci/ansible/ansible.ssh.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/ansible/ansible.ssh.cfg b/.circleci/ansible/ansible.ssh.cfg index 55970f8c2b..cd058b5427 100644 --- a/.circleci/ansible/ansible.ssh.cfg +++ b/.circleci/ansible/ansible.ssh.cfg @@ -1,5 +1,6 @@ # Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed -PubkeyAcceptedAlgorithms +ssh-rsa-cert-v01@openssh.com +# (use pre 8.5 option name to cope with old ssh in CI) +PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com Host tele.zenith.tech User admin From 6100a02d0f7f68aa3e9e757a02f037550299966c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 1 Jul 2022 18:08:34 +0400 Subject: [PATCH 0456/1022] Prefix WAL files in s3 with environment name. It wasn't merged to prod yet, so safe to enable. 
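
For example, rendering the systemd template below with the staging inventory
variables (env_name = us-stage, bucket_name = zenith-staging-storage-us-east-1,
bucket_region = us-east-1) should produce roughly this safekeeper flag:

    --remote-storage='{bucket_name="zenith-staging-storage-us-east-1", bucket_region="us-east-1", prefix_in_bucket="us-stage/wal"}'

so WAL uploaded from different environments that happen to share a bucket lands
under a separate "<env_name>/wal" prefix instead of a single shared "wal" prefix.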
--- .circleci/ansible/production.hosts | 1 + .circleci/ansible/staging.hosts | 1 + .circleci/ansible/systemd/safekeeper.service | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/ansible/production.hosts b/.circleci/ansible/production.hosts index 03c6cf57e0..6a3a7791ad 100644 --- a/.circleci/ansible/production.hosts +++ b/.circleci/ansible/production.hosts @@ -12,6 +12,7 @@ pageservers safekeepers [storage:vars] +env_name = prod-1 console_mgmt_base_url = http://console-release.local bucket_name = zenith-storage-oregon bucket_region = us-west-2 diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index 29e4efbb19..35e77513df 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -13,6 +13,7 @@ pageservers safekeepers [storage:vars] +env_name = us-stage console_mgmt_base_url = http://console-staging.local bucket_name = zenith-staging-storage-us-east-1 bucket_region = us-east-1 diff --git a/.circleci/ansible/systemd/safekeeper.service b/.circleci/ansible/systemd/safekeeper.service index 9b1159d812..d5c6d00017 100644 --- a/.circleci/ansible/systemd/safekeeper.service +++ b/.circleci/ansible/systemd/safekeeper.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=safekeeper Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib -ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="wal"}' +ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT From 65704708fa922b524d3ab75995c39afc5c4f562e Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 24 Jun 2022 21:11:49 +0300 Subject: [PATCH 0457/1022] remove unused imports, make more use of pathlib.Path --- .../batch_others/test_ancestor_branch.py | 3 - test_runner/batch_others/test_auth.py | 4 +- test_runner/batch_others/test_backpressure.py | 4 +- .../batch_others/test_basebackup_error.py | 2 - .../batch_others/test_branch_behind.py | 1 - test_runner/batch_others/test_fullbackup.py | 15 ++--- test_runner/batch_others/test_wal_acceptor.py | 13 +++-- test_runner/batch_others/test_wal_restore.py | 12 ++-- .../batch_pg_regress/test_isolation.py | 9 ++- .../batch_pg_regress/test_neon_regress.py | 9 ++- .../batch_pg_regress/test_pg_regress.py | 11 ++-- test_runner/fixtures/neon_fixtures.py | 55 ++++++++++--------- test_runner/fixtures/utils.py | 12 ---- 13 files changed, 61 insertions(+), 89 deletions(-) diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index 656428e5df..20e63b4e5c 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -1,6 +1,3 @@ -from contextlib import closing - -import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException diff --git 
a/test_runner/batch_others/test_auth.py b/test_runner/batch_others/test_auth.py index b9eb9d7cee..0fd0a5d7e3 100644 --- a/test_runner/batch_others/test_auth.py +++ b/test_runner/batch_others/test_auth.py @@ -1,8 +1,6 @@ from contextlib import closing -from typing import Iterator -from uuid import UUID, uuid4 +from uuid import uuid4 from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException -from requests.exceptions import HTTPError import pytest diff --git a/test_runner/batch_others/test_backpressure.py b/test_runner/batch_others/test_backpressure.py index f89ee14691..4ca03b102b 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/batch_others/test_backpressure.py @@ -1,11 +1,9 @@ from contextlib import closing, contextmanager import psycopg2.extras import pytest -from fixtures.neon_fixtures import PgProtocol, NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log -import os import time -import asyncpg from fixtures.neon_fixtures import Postgres import threading diff --git a/test_runner/batch_others/test_basebackup_error.py b/test_runner/batch_others/test_basebackup_error.py index 29cbe59d2e..0909ed98a7 100644 --- a/test_runner/batch_others/test_basebackup_error.py +++ b/test_runner/batch_others/test_basebackup_error.py @@ -1,8 +1,6 @@ import pytest -from contextlib import closing from fixtures.neon_fixtures import NeonEnv -from fixtures.log_helper import log # diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 4f4c058b61..0274c6c1e0 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -1,4 +1,3 @@ -import subprocess from contextlib import closing import psycopg2.extras diff --git a/test_runner/batch_others/test_fullbackup.py b/test_runner/batch_others/test_fullbackup.py index e5d705beab..cd6c40f56b 100644 --- a/test_runner/batch_others/test_fullbackup.py +++ b/test_runner/batch_others/test_fullbackup.py @@ -1,16 +1,10 @@ -import subprocess from contextlib import closing -import psycopg2.extras -import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres from fixtures.neon_fixtures import pg_distrib_dir import os -from fixtures.utils import mkdir_if_needed, subprocess_capture -import shutil -import getpass -import pwd +from fixtures.utils import subprocess_capture num_rows = 1000 @@ -46,19 +40,20 @@ def test_fullbackup(neon_env_builder: NeonEnvBuilder, psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')} # Get and unpack fullbackup from pageserver - restored_dir_path = os.path.join(env.repo_dir, "restored_datadir") + restored_dir_path = env.repo_dir / "restored_datadir" os.mkdir(restored_dir_path, 0o750) query = f"fullbackup {env.initial_tenant.hex} {timeline} {lsn}" cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] result_basepath = pg_bin.run_capture(cmd, env=psql_env) tar_output_file = result_basepath + ".stdout" - subprocess_capture(str(env.repo_dir), ["tar", "-xf", tar_output_file, "-C", restored_dir_path]) + subprocess_capture(str(env.repo_dir), + ["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)]) # HACK # fullbackup returns neon specific pg_control and first WAL segment # use resetwal to overwrite it pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, 'pg_resetwal') - cmd = [pg_resetwal_path, "-D", restored_dir_path] + cmd = 
[pg_resetwal_path, "-D", str(restored_dir_path)] pg_bin.run_capture(cmd, env=psql_env) # Restore from the backup and find the data we inserted diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 2b93dd160a..9b876f780d 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -1,3 +1,4 @@ +import pathlib import pytest import random import time @@ -14,7 +15,7 @@ from dataclasses import dataclass, field from multiprocessing import Process, Value from pathlib import Path from fixtures.neon_fixtures import PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, neon_binpath, PgProtocol -from fixtures.utils import get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex +from fixtures.utils import get_dir_size, lsn_to_hex, lsn_from_hex from fixtures.log_helper import log from typing import List, Optional, Any from uuid import uuid4 @@ -645,7 +646,7 @@ class ProposerPostgres(PgProtocol): def create_dir_config(self, safekeepers: str): """ Create dir and config for running --sync-safekeepers """ - mkdir_if_needed(self.pg_data_dir_path()) + pathlib.Path(self.pg_data_dir_path()).mkdir(exist_ok=True) with open(self.config_file_path(), "w") as f: cfg = [ "synchronous_standby_names = 'walproposer'\n", @@ -828,7 +829,7 @@ class SafekeeperEnv: self.timeline_id = uuid.uuid4() self.tenant_id = uuid.uuid4() - mkdir_if_needed(str(self.repo_dir)) + self.repo_dir.mkdir(exist_ok=True) # Create config and a Safekeeper object for each safekeeper self.safekeepers = [] @@ -847,8 +848,8 @@ class SafekeeperEnv: http=self.port_distributor.get_port(), ) - safekeeper_dir = os.path.join(self.repo_dir, f"sk{i}") - mkdir_if_needed(safekeeper_dir) + safekeeper_dir = self.repo_dir / f"sk{i}" + safekeeper_dir.mkdir(exist_ok=True) args = [ self.bin_safekeeper, @@ -857,7 +858,7 @@ class SafekeeperEnv: "--listen-http", f"127.0.0.1:{port.http}", "-D", - safekeeper_dir, + str(safekeeper_dir), "--id", str(i), "--broker-endpoints", diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index 8ea64d4fce..809e942415 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -1,19 +1,17 @@ import os -import subprocess +from pathlib import Path from fixtures.neon_fixtures import (NeonEnvBuilder, VanillaPostgres, PortDistributor, PgBin, base_dir, - vanilla_pg, pg_distrib_dir) -from fixtures.log_helper import log def test_wal_restore(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, - test_output_dir, + test_output_dir: Path, port_distributor: PortDistributor): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_wal_restore") @@ -22,13 +20,13 @@ def test_wal_restore(neon_env_builder: NeonEnvBuilder, tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] env.neon_cli.pageserver_stop() port = port_distributor.get_port() - data_dir = os.path.join(test_output_dir, 'pgsql.restored') + data_dir = test_output_dir / 'pgsql.restored' with VanillaPostgres(data_dir, PgBin(test_output_dir), port) as restored: pg_bin.run_capture([ os.path.join(base_dir, 'libs/utils/scripts/restore_from_wal.sh'), os.path.join(pg_distrib_dir, 'bin'), - os.path.join(test_output_dir, 'repo/safekeepers/sk1/{}/*'.format(tenant_id)), - data_dir, + str(test_output_dir / 'repo' / 'safekeepers' / 'sk1' / str(tenant_id) / '*'), + str(data_dir), str(port) ]) restored.start() diff --git 
a/test_runner/batch_pg_regress/test_isolation.py b/test_runner/batch_pg_regress/test_isolation.py index 936b31298e..0124459440 100644 --- a/test_runner/batch_pg_regress/test_isolation.py +++ b/test_runner/batch_pg_regress/test_isolation.py @@ -1,13 +1,13 @@ import os +from pathlib import Path import pytest -from fixtures.utils import mkdir_if_needed from fixtures.neon_fixtures import NeonEnv, base_dir, pg_distrib_dir # The isolation tests run for a long time, especially in debug mode, # so use a larger-than-default timeout. @pytest.mark.timeout(1800) -def test_isolation(neon_simple_env: NeonEnv, test_output_dir, pg_bin, capsys): +def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): env = neon_simple_env env.neon_cli.create_branch("test_isolation", "empty") @@ -17,9 +17,8 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir, pg_bin, capsys): pg.safe_psql('CREATE DATABASE isolation_regression') # Create some local directories for pg_isolation_regress to run in. - runpath = os.path.join(test_output_dir, 'regress') - mkdir_if_needed(runpath) - mkdir_if_needed(os.path.join(runpath, 'testtablespace')) + runpath = test_output_dir / 'regress' + (runpath / 'testtablespace').mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. build_path = os.path.join(pg_distrib_dir, 'build/src/test/isolation') diff --git a/test_runner/batch_pg_regress/test_neon_regress.py b/test_runner/batch_pg_regress/test_neon_regress.py index de3f9705a0..66ea67d9f1 100644 --- a/test_runner/batch_pg_regress/test_neon_regress.py +++ b/test_runner/batch_pg_regress/test_neon_regress.py @@ -1,6 +1,6 @@ import os +from pathlib import Path -from fixtures.utils import mkdir_if_needed from fixtures.neon_fixtures import (NeonEnv, check_restored_datadir_content, base_dir, @@ -8,7 +8,7 @@ from fixtures.neon_fixtures import (NeonEnv, from fixtures.log_helper import log -def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir, pg_bin, capsys): +def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): env = neon_simple_env env.neon_cli.create_branch("test_neon_regress", "empty") @@ -17,9 +17,8 @@ def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir, pg_bin, capsys) pg.safe_psql('CREATE DATABASE regression') # Create some local directories for pg_regress to run in. - runpath = os.path.join(test_output_dir, 'regress') - mkdir_if_needed(runpath) - mkdir_if_needed(os.path.join(runpath, 'testtablespace')) + runpath = test_output_dir / 'regress' + (runpath / 'testtablespace').mkdir(parents=True) # Compute all the file locations that pg_regress will need. # This test runs neon specific tests diff --git a/test_runner/batch_pg_regress/test_pg_regress.py b/test_runner/batch_pg_regress/test_pg_regress.py index fb71d31170..b53bc21ca2 100644 --- a/test_runner/batch_pg_regress/test_pg_regress.py +++ b/test_runner/batch_pg_regress/test_pg_regress.py @@ -1,13 +1,13 @@ import os +import pathlib import pytest -from fixtures.utils import mkdir_if_needed from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content, base_dir, pg_distrib_dir # The pg_regress tests run for a long time, especially in debug mode, # so use a larger-than-default timeout. 
@pytest.mark.timeout(1800) -def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: str, pg_bin, capsys): +def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: pathlib.Path, pg_bin, capsys): env = neon_simple_env env.neon_cli.create_branch("test_pg_regress", "empty") @@ -16,9 +16,8 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: str, pg_bin, caps pg.safe_psql('CREATE DATABASE regression') # Create some local directories for pg_regress to run in. - runpath = os.path.join(test_output_dir, 'regress') - mkdir_if_needed(runpath) - mkdir_if_needed(os.path.join(runpath, 'testtablespace')) + runpath = test_output_dir / 'regress' + (runpath / 'testtablespace').mkdir(parents=True) # Compute all the file locations that pg_regress will need. build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress') @@ -51,7 +50,7 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: str, pg_bin, caps # checkpoint one more time to ensure that the lsn we get is the latest one pg.safe_psql('CHECKPOINT') - lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0] + pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0] # Check that we restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7506641fcb..93efc7d5d2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -35,12 +35,7 @@ from typing_extensions import Literal import requests import backoff # type: ignore -from .utils import (etcd_path, - get_self_dir, - mkdir_if_needed, - subprocess_capture, - lsn_from_hex, - lsn_to_hex) +from .utils import (etcd_path, get_self_dir, subprocess_capture, lsn_from_hex, lsn_to_hex) from fixtures.log_helper import log """ This file contains pytest fixtures. A fixture is a test resource that can be @@ -127,7 +122,7 @@ def pytest_configure(config): top_output_dir = env_test_output else: top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR) - mkdir_if_needed(top_output_dir) + pathlib.Path(top_output_dir).mkdir(exist_ok=True) # Find the postgres installation. 
global pg_distrib_dir @@ -1316,7 +1311,7 @@ def append_pageserver_param_overrides( class PgBin: """ A helper class for executing postgres binaries """ - def __init__(self, log_dir: str): + def __init__(self, log_dir: Path): self.log_dir = log_dir self.pg_bin_path = os.path.join(str(pg_distrib_dir), 'bin') self.env = os.environ.copy() @@ -1367,22 +1362,27 @@ class PgBin: self._fixpath(command) log.info('Running command "{}"'.format(' '.join(command))) env = self._build_env(env) - return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs) + return subprocess_capture(str(self.log_dir), + command, + env=env, + cwd=cwd, + check=True, + **kwargs) @pytest.fixture(scope='function') -def pg_bin(test_output_dir: str) -> PgBin: +def pg_bin(test_output_dir: Path) -> PgBin: return PgBin(test_output_dir) class VanillaPostgres(PgProtocol): - def __init__(self, pgdatadir: str, pg_bin: PgBin, port: int, init=True): + def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): super().__init__(host='localhost', port=port, dbname='postgres') self.pgdatadir = pgdatadir self.pg_bin = pg_bin self.running = False if init: - self.pg_bin.run_capture(['initdb', '-D', pgdatadir]) + self.pg_bin.run_capture(['initdb', '-D', str(pgdatadir)]) self.configure([f"port = {port}\n"]) def configure(self, options: List[str]): @@ -1398,12 +1398,13 @@ class VanillaPostgres(PgProtocol): if log_path is None: log_path = os.path.join(self.pgdatadir, "pg.log") - self.pg_bin.run_capture(['pg_ctl', '-w', '-D', self.pgdatadir, '-l', log_path, 'start']) + self.pg_bin.run_capture( + ['pg_ctl', '-w', '-D', str(self.pgdatadir), '-l', log_path, 'start']) def stop(self): assert self.running self.running = False - self.pg_bin.run_capture(['pg_ctl', '-w', '-D', self.pgdatadir, 'stop']) + self.pg_bin.run_capture(['pg_ctl', '-w', '-D', str(self.pgdatadir), 'stop']) def get_subdir_size(self, subdir) -> int: """Return size of pgdatadir subdirectory in bytes.""" @@ -1418,9 +1419,9 @@ class VanillaPostgres(PgProtocol): @pytest.fixture(scope='function') -def vanilla_pg(test_output_dir: str, +def vanilla_pg(test_output_dir: Path, port_distributor: PortDistributor) -> Iterator[VanillaPostgres]: - pgdatadir = os.path.join(test_output_dir, "pgdata-vanilla") + pgdatadir = test_output_dir / "pgdata-vanilla" pg_bin = PgBin(test_output_dir) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: @@ -1457,7 +1458,7 @@ class RemotePostgres(PgProtocol): @pytest.fixture(scope='function') -def remote_pg(test_output_dir: str) -> Iterator[RemotePostgres]: +def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]: pg_bin = PgBin(test_output_dir) connstr = os.getenv("BENCHMARK_CONNSTR") @@ -1980,11 +1981,13 @@ class Etcd: self.handle.wait() -def get_test_output_dir(request: Any) -> str: +def get_test_output_dir(request: Any) -> pathlib.Path: """ Compute the working directory for an individual test. """ test_name = request.node.name - test_dir = os.path.join(str(top_output_dir), test_name) + test_dir = pathlib.Path(top_output_dir) / test_name log.info(f'get_test_output_dir is {test_dir}') + # make mypy happy + assert isinstance(test_dir, pathlib.Path) return test_dir @@ -1998,14 +2001,14 @@ def get_test_output_dir(request: Any) -> str: # this fixture ensures that the directory exists. That works because # 'autouse' fixtures are run before other fixtures. 
@pytest.fixture(scope='function', autouse=True) -def test_output_dir(request: Any) -> str: +def test_output_dir(request: Any) -> pathlib.Path: """ Create the working directory for an individual test. """ # one directory per test test_dir = get_test_output_dir(request) log.info(f'test_output_dir is {test_dir}') shutil.rmtree(test_dir, ignore_errors=True) - mkdir_if_needed(test_dir) + test_dir.mkdir() return test_dir @@ -2051,7 +2054,7 @@ def should_skip_file(filename: str) -> bool: # # Test helpers # -def list_files_to_compare(pgdata_dir: str): +def list_files_to_compare(pgdata_dir: pathlib.Path): pgdata_files = [] for root, _file, filenames in os.walk(pgdata_dir): for filename in filenames: @@ -2068,7 +2071,7 @@ def list_files_to_compare(pgdata_dir: str): # pg is the existing and running compute node, that we want to compare with a basebackup -def check_restored_datadir_content(test_output_dir: str, env: NeonEnv, pg: Postgres): +def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Postgres): # Get the timeline ID. We need it for the 'basebackup' command with closing(pg.connect()) as conn: @@ -2080,8 +2083,8 @@ def check_restored_datadir_content(test_output_dir: str, env: NeonEnv, pg: Postg pg.stop() # Take a basebackup from pageserver - restored_dir_path = os.path.join(env.repo_dir, f"{pg.node_name}_restored_datadir") - mkdir_if_needed(restored_dir_path) + restored_dir_path = env.repo_dir / f"{pg.node_name}_restored_datadir" + restored_dir_path.mkdir(exist_ok=True) pg_bin = PgBin(test_output_dir) psql_path = os.path.join(pg_bin.pg_bin_path, 'psql') @@ -2108,7 +2111,7 @@ def check_restored_datadir_content(test_output_dir: str, env: NeonEnv, pg: Postg # list files we're going to compare assert pg.pgdata_dir - pgdata_files = list_files_to_compare(pg.pgdata_dir) + pgdata_files = list_files_to_compare(pathlib.Path(pg.pgdata_dir)) restored_files = list_files_to_compare(restored_dir_path) # check that file sets are equal diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index bfa57373b3..05d1a6634d 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -12,18 +12,6 @@ def get_self_dir() -> str: return os.path.dirname(os.path.abspath(__file__)) -def mkdir_if_needed(path: str) -> None: - """ Create a directory if it doesn't already exist - - Note this won't try to create intermediate directories. 
- """ - try: - os.mkdir(path) - except FileExistsError: - pass - assert os.path.isdir(path) - - def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: """ Run a process and capture its output From 7898e72990620fd25bee604e0c632750a8ae5a9a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 4 Jul 2022 10:53:37 +0100 Subject: [PATCH 0458/1022] Remove duplicated checks from LocalEnv --- control_plane/src/local_env.rs | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 08389d29ba..e0b409f32d 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -403,16 +403,6 @@ impl LocalEnv { self.pg_distrib_dir.display() ); } - for binary in ["pageserver", "safekeeper"] { - if !self.zenith_distrib_dir.join(binary).exists() { - bail!( - "Can't find binary '{}' in zenith distrib dir '{}'", - binary, - self.zenith_distrib_dir.display() - ); - } - } - for binary in ["pageserver", "safekeeper"] { if !self.zenith_distrib_dir.join(binary).exists() { bail!( @@ -421,12 +411,6 @@ impl LocalEnv { ); } } - if !self.pg_distrib_dir.join("bin/postgres").exists() { - bail!( - "Can't find postgres binary at {}", - self.pg_distrib_dir.display() - ); - } fs::create_dir(&base_path)?; From 6abdb1272428718cb21e058044e05b572051c08f Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 4 Jul 2022 18:46:29 +0300 Subject: [PATCH 0459/1022] Fix 1.62 Clippy errors --- compute_tools/src/pg_helpers.rs | 10 +++++++--- pageserver/src/layered_repository/blob_io.rs | 2 +- pageserver/src/profiling.rs | 6 ++++++ proxy/src/waiters.rs | 2 +- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 74856eac63..ea3909a029 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -1,3 +1,4 @@ +use std::fmt::Write; use std::fs::File; use std::io::{BufRead, BufReader}; use std::net::{SocketAddr, TcpStream}; @@ -138,9 +139,11 @@ impl Role { // Now we also support SCRAM-SHA-256 and to preserve compatibility // we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256. if pass.starts_with("SCRAM-SHA-256") { - params.push_str(&format!(" PASSWORD '{}'", pass)); + write!(params, " PASSWORD '{pass}'") + .expect("String is documented to not to error during write operations"); } else { - params.push_str(&format!(" PASSWORD 'md5{}'", pass)); + write!(params, " PASSWORD 'md5{pass}'") + .expect("String is documented to not to error during write operations"); } } else { params.push_str(" PASSWORD NULL"); @@ -158,7 +161,8 @@ impl Database { /// it may require a proper quoting too. 
pub fn to_pg_options(&self) -> String { let mut params: String = self.options.as_pg_options(); - params.push_str(&format!(" OWNER {}", &self.owner.quote())); + write!(params, " OWNER {}", &self.owner.quote()) + .expect("String is documented to not to error during write operations"); params } diff --git a/pageserver/src/layered_repository/blob_io.rs b/pageserver/src/layered_repository/blob_io.rs index 3aeeb2b2c8..a4c6186056 100644 --- a/pageserver/src/layered_repository/blob_io.rs +++ b/pageserver/src/layered_repository/blob_io.rs @@ -34,7 +34,7 @@ pub trait BlobCursor { ) -> Result<(), std::io::Error>; } -impl<'a, R> BlobCursor for BlockCursor +impl BlobCursor for BlockCursor where R: BlockReader, { diff --git a/pageserver/src/profiling.rs b/pageserver/src/profiling.rs index 84132659d6..ad896cfa30 100644 --- a/pageserver/src/profiling.rs +++ b/pageserver/src/profiling.rs @@ -81,6 +81,12 @@ mod profiling_impl { pub struct DummyProfilerGuard; + impl Drop for DummyProfilerGuard { + fn drop(&mut self) { + // do nothing, this exists to calm Clippy down + } + } + pub fn profpoint_start( _conf: &PageServerConf, _point: ProfilingConfig, diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 799d45a165..bba5494cfe 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -115,7 +115,7 @@ mod tests { Ok(()) }); - let () = waiter.await?; + waiter.await?; notifier.await? } } From d29c545b5d214413e5fc1199b2aece650561cc44 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 5 Jul 2022 02:06:40 -0400 Subject: [PATCH 0460/1022] Gc/compaction thread pool, take 2 (#1933) Decrease the number of pageserver threads by running gc and compaction in a blocking tokio thread pool --- pageserver/src/bin/pageserver.rs | 2 + pageserver/src/layered_repository.rs | 13 + pageserver/src/lib.rs | 2 +- pageserver/src/tenant_mgr.rs | 50 +-- pageserver/src/tenant_tasks.rs | 288 ++++++++++++++++++ pageserver/src/tenant_threads.rs | 79 ----- pageserver/src/thread_mgr.rs | 7 +- test_runner/batch_others/test_tenant_tasks.py | 70 +++++ test_runner/fixtures/neon_fixtures.py | 2 +- 9 files changed, 386 insertions(+), 127 deletions(-) create mode 100644 pageserver/src/tenant_tasks.rs delete mode 100644 pageserver/src/tenant_threads.rs create mode 100644 test_runner/batch_others/test_tenant_tasks.py diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 1d407a29bc..b539964414 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -263,6 +263,8 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // start profiler (if enabled) let profiler_guard = profiling::init_profiler(conf); + pageserver::tenant_tasks::init_tenant_task_pool()?; + // initialize authentication for incoming connections let auth = match &conf.auth_type { AuthType::Trust | AuthType::MD5 => None, diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index af6c7ba9fd..67f024ef59 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -158,6 +158,18 @@ pub struct LayeredRepository { // Global pageserver config parameters pub conf: &'static PageServerConf, + // Allows us to gracefully cancel operations that edit the directory + // that backs this layered repository. Usage: + // + // Use `let _guard = file_lock.try_read()` while writing any files. + // Use `let _guard = file_lock.write().unwrap()` to wait for all writes to finish. 
+ // + // TODO try_read this lock during checkpoint as well to prevent race + // between checkpoint and detach/delete. + // TODO try_read this lock for all gc/compaction operations, not just + // ones scheduled by the tenant task manager. + pub file_lock: RwLock<()>, + // Overridden tenant-specific config parameters. // We keep TenantConfOpt sturct here to preserve the information // about parameters that are not set. @@ -685,6 +697,7 @@ impl LayeredRepository { ) -> LayeredRepository { LayeredRepository { tenant_id, + file_lock: RwLock::new(()), conf, tenant_conf: Arc::new(RwLock::new(tenant_conf)), timelines: Mutex::new(HashMap::new()), diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index a68c277114..c9c00d75e2 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -13,7 +13,7 @@ pub mod repository; pub mod storage_sync; pub mod tenant_config; pub mod tenant_mgr; -pub mod tenant_threads; +pub mod tenant_tasks; pub mod thread_mgr; pub mod timelines; pub mod virtual_file; diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index c48b021d1f..c73fed140a 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -230,8 +230,6 @@ pub fn shutdown_all_tenants() { drop(m); thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None); - thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None); - thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), None, None); // Ok, no background threads running anymore. Flush any remaining data in // memory to disk. @@ -330,44 +328,12 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow: } (TenantState::Idle, TenantState::Active) => { info!("activating tenant {tenant_id}"); - let compactor_spawn_result = thread_mgr::spawn( - ThreadKind::Compactor, - Some(tenant_id), - None, - "Compactor thread", - false, - move || crate::tenant_threads::compact_loop(tenant_id), - ); - if compactor_spawn_result.is_err() { - let mut m = tenants_state::write_tenants(); - m.get_mut(&tenant_id) - .with_context(|| format!("Tenant not found for id {tenant_id}"))? - .state = old_state; - drop(m); - } - compactor_spawn_result?; - let gc_spawn_result = thread_mgr::spawn( - ThreadKind::GarbageCollector, - Some(tenant_id), - None, - "GC thread", - false, - move || crate::tenant_threads::gc_loop(tenant_id), - ) - .map(|_thread_id| ()) // update the `Result::Ok` type to match the outer function's return signature - .with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}")); - - if let Err(e) = &gc_spawn_result { - let mut m = tenants_state::write_tenants(); - m.get_mut(&tenant_id) - .with_context(|| format!("Tenant not found for id {tenant_id}"))? - .state = old_state; - drop(m); - error!("Failed to start GC thread for tenant {tenant_id}, stopping its checkpointer thread: {e:?}"); - thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None); - return gc_spawn_result; - } + // Spawn gc and compaction loops. The loops will shut themselves + // down when they notice that the tenant is inactive. + // TODO maybe use tokio::sync::watch instead? 
+ crate::tenant_tasks::start_compaction_loop(tenant_id)?; + crate::tenant_tasks::start_gc_loop(tenant_id)?; } (TenantState::Idle, TenantState::Stopping) => { info!("stopping idle tenant {tenant_id}"); @@ -379,8 +345,10 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow: Some(tenant_id), None, ); - thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), Some(tenant_id), None); - thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None); + + // Wait until all gc/compaction tasks finish + let repo = get_repository_for_tenant(tenant_id)?; + let _guard = repo.file_lock.write().unwrap(); } } diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs new file mode 100644 index 0000000000..6871ac3001 --- /dev/null +++ b/pageserver/src/tenant_tasks.rs @@ -0,0 +1,288 @@ +//! This module contains functions to serve per-tenant background processes, +//! such as compaction and GC + +use std::collections::HashMap; +use std::ops::ControlFlow; +use std::time::Duration; + +use crate::repository::Repository; +use crate::tenant_mgr::TenantState; +use crate::thread_mgr::ThreadKind; +use crate::{tenant_mgr, thread_mgr}; +use anyhow::{self, Context}; +use futures::stream::FuturesUnordered; +use futures::StreamExt; +use metrics::{register_int_counter_vec, IntCounterVec}; +use once_cell::sync::{Lazy, OnceCell}; +use tokio::sync::mpsc; +use tokio::sync::watch; +use tracing::*; +use utils::zid::ZTenantId; + +static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_task_events", + "Number of task start/stop/fail events.", + &["event"], + ) + .expect("Failed to register tenant_task_events metric") +}); + +/// +/// Compaction task's main loop +/// +async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { + loop { + trace!("waking up"); + + // Run blocking part of the task + let period: Result, _> = tokio::task::spawn_blocking(move || { + // Break if tenant is not active + if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { + return Ok(ControlFlow::Break(())); + } + + // Break if we're not allowed to write to disk + let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + // TODO do this inside repo.compaction_iteration instead. + let _guard = match repo.file_lock.try_read() { + Ok(g) => g, + Err(_) => return Ok(ControlFlow::Break(())), + }; + + // Run compaction + let compaction_period = repo.get_compaction_period(); + repo.compaction_iteration()?; + Ok(ControlFlow::Continue(compaction_period)) + }) + .await; + + // Decide whether to sleep or break + let sleep_duration = match period { + Ok(Ok(ControlFlow::Continue(period))) => period, + Ok(Ok(ControlFlow::Break(()))) => break, + Ok(Err(e)) => { + error!("Compaction failed, retrying: {}", e); + Duration::from_secs(2) + } + Err(e) => { + error!("Compaction join error, retrying: {}", e); + Duration::from_secs(2) + } + }; + + // Sleep + tokio::select! { + _ = cancel.changed() => { + trace!("received cancellation request"); + break; + }, + _ = tokio::time::sleep(sleep_duration) => {}, + } + } + + trace!( + "compaction loop stopped. State is {:?}", + tenant_mgr::get_tenant_state(tenantid) + ); +} + +static START_GC_LOOP: OnceCell> = OnceCell::new(); +static START_COMPACTION_LOOP: OnceCell> = OnceCell::new(); + +/// Spawn a task that will periodically schedule garbage collection until +/// the tenant becomes inactive. This should be called on tenant +/// activation. 
+pub fn start_gc_loop(tenantid: ZTenantId) -> anyhow::Result<()> { + START_GC_LOOP + .get() + .context("Failed to get START_GC_LOOP")? + .blocking_send(tenantid) + .context("Failed to send to START_GC_LOOP channel")?; + Ok(()) +} + +/// Spawn a task that will periodically schedule compaction until +/// the tenant becomes inactive. This should be called on tenant +/// activation. +pub fn start_compaction_loop(tenantid: ZTenantId) -> anyhow::Result<()> { + START_COMPACTION_LOOP + .get() + .context("failed to get START_COMPACTION_LOOP")? + .blocking_send(tenantid) + .context("failed to send to START_COMPACTION_LOOP")?; + Ok(()) +} + +/// Spawn the TenantTaskManager +/// This needs to be called before start_gc_loop or start_compaction_loop +pub fn init_tenant_task_pool() -> anyhow::Result<()> { + let runtime = tokio::runtime::Builder::new_multi_thread() + .thread_name("tenant-task-worker") + .worker_threads(40) // Way more than necessary + .max_blocking_threads(100) // Way more than necessary + .enable_all() + .build()?; + + let (gc_send, mut gc_recv) = mpsc::channel::(100); + START_GC_LOOP + .set(gc_send) + .expect("Failed to set START_GC_LOOP"); + + let (compaction_send, mut compaction_recv) = mpsc::channel::(100); + START_COMPACTION_LOOP + .set(compaction_send) + .expect("Failed to set START_COMPACTION_LOOP"); + + // TODO this is getting repetitive + let mut gc_loops = HashMap::>::new(); + let mut compaction_loops = HashMap::>::new(); + + thread_mgr::spawn( + ThreadKind::TenantTaskManager, + None, + None, + "Tenant task manager main thread", + true, + move || { + runtime.block_on(async move { + let mut futures = FuturesUnordered::new(); + loop { + tokio::select! { + _ = thread_mgr::shutdown_watcher() => { + // Send cancellation to all tasks + for (_, cancel) in gc_loops.drain() { + cancel.send(()).ok(); + } + for (_, cancel) in compaction_loops.drain() { + cancel.send(()).ok(); + } + + // Exit after all tasks finish + while let Some(result) = futures.next().await { + match result { + Ok(()) => { + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); + }, + Err(e) => { + TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc(); + error!("loop join error {}", e) + }, + } + } + break; + }, + tenantid = gc_recv.recv() => { + let tenantid = tenantid.expect("Gc task channel closed unexpectedly"); + + // Spawn new task, request cancellation of the old one if exists + let (cancel_send, cancel_recv) = watch::channel(()); + let handle = tokio::spawn(gc_loop(tenantid, cancel_recv) + .instrument(info_span!("gc loop", tenant = %tenantid))); + if let Some(old_cancel_send) = gc_loops.insert(tenantid, cancel_send) { + old_cancel_send.send(()).ok(); + } + + // Update metrics, remember handle + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + futures.push(handle); + }, + tenantid = compaction_recv.recv() => { + let tenantid = tenantid.expect("Compaction task channel closed unexpectedly"); + + // Spawn new task, request cancellation of the old one if exists + let (cancel_send, cancel_recv) = watch::channel(()); + let handle = tokio::spawn(compaction_loop(tenantid, cancel_recv) + .instrument(info_span!("compaction loop", tenant = %tenantid))); + if let Some(old_cancel_send) = compaction_loops.insert(tenantid, cancel_send) { + old_cancel_send.send(()).ok(); + } + + // Update metrics, remember handle + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + futures.push(handle); + }, + result = futures.next() => { + // Log and count any unhandled panics + match result { + Some(Ok(())) => { + 
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); + }, + Some(Err(e)) => { + TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc(); + error!("loop join error {}", e) + }, + None => {}, + }; + }, + } + } + }); + Ok(()) + }, + )?; + + Ok(()) +} + +/// +/// GC task's main loop +/// +async fn gc_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { + loop { + trace!("waking up"); + + // Run blocking part of the task + let period: Result, _> = tokio::task::spawn_blocking(move || { + // Break if tenant is not active + if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { + return Ok(ControlFlow::Break(())); + } + + // Break if we're not allowed to write to disk + let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + // TODO do this inside repo.gc_iteration instead. + let _guard = match repo.file_lock.try_read() { + Ok(g) => g, + Err(_) => return Ok(ControlFlow::Break(())), + }; + + // Run gc + let gc_period = repo.get_gc_period(); + let gc_horizon = repo.get_gc_horizon(); + if gc_horizon > 0 { + repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?; + } + + Ok(ControlFlow::Continue(gc_period)) + }) + .await; + + // Decide whether to sleep or break + let sleep_duration = match period { + Ok(Ok(ControlFlow::Continue(period))) => period, + Ok(Ok(ControlFlow::Break(()))) => break, + Ok(Err(e)) => { + error!("Gc failed, retrying: {}", e); + Duration::from_secs(2) + } + Err(e) => { + error!("Gc join error, retrying: {}", e); + Duration::from_secs(2) + } + }; + + // Sleep + tokio::select! { + _ = cancel.changed() => { + trace!("received cancellation request"); + break; + }, + _ = tokio::time::sleep(sleep_duration) => {}, + } + } + trace!( + "GC loop stopped. State is {:?}", + tenant_mgr::get_tenant_state(tenantid) + ); +} diff --git a/pageserver/src/tenant_threads.rs b/pageserver/src/tenant_threads.rs deleted file mode 100644 index b904d9040d..0000000000 --- a/pageserver/src/tenant_threads.rs +++ /dev/null @@ -1,79 +0,0 @@ -//! This module contains functions to serve per-tenant background processes, -//! 
such as compaction and GC -use crate::repository::Repository; -use crate::tenant_mgr; -use crate::tenant_mgr::TenantState; -use anyhow::Result; -use std::time::Duration; -use tracing::*; -use utils::zid::ZTenantId; - -/// -/// Compaction thread's main loop -/// -pub fn compact_loop(tenantid: ZTenantId) -> Result<()> { - if let Err(err) = compact_loop_ext(tenantid) { - error!("compact loop terminated with error: {:?}", err); - Err(err) - } else { - Ok(()) - } -} - -fn compact_loop_ext(tenantid: ZTenantId) -> Result<()> { - loop { - if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { - break; - } - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - let compaction_period = repo.get_compaction_period(); - - std::thread::sleep(compaction_period); - trace!("compaction thread for tenant {} waking up", tenantid); - - // Compact timelines - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - repo.compaction_iteration()?; - } - - trace!( - "compaction thread stopped for tenant {} state is {:?}", - tenantid, - tenant_mgr::get_tenant_state(tenantid) - ); - Ok(()) -} - -/// -/// GC thread's main loop -/// -pub fn gc_loop(tenantid: ZTenantId) -> Result<()> { - loop { - if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { - break; - } - - trace!("gc thread for tenant {} waking up", tenantid); - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - let gc_horizon = repo.get_gc_horizon(); - // Garbage collect old files that are not needed for PITR anymore - if gc_horizon > 0 { - repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?; - } - - // TODO Write it in more adequate way using - // condvar.wait_timeout() or something - let mut sleep_time = repo.get_gc_period().as_secs(); - while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == Some(TenantState::Active) - { - sleep_time -= 1; - std::thread::sleep(Duration::from_secs(1)); - } - } - trace!( - "GC thread stopped for tenant {} state is {:?}", - tenantid, - tenant_mgr::get_tenant_state(tenantid) - ); - Ok(()) -} diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index 6e4bc1a787..ab0d894c70 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -94,11 +94,8 @@ pub enum ThreadKind { // Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL. WalReceiverManager, - // Thread that handles compaction of all timelines for a tenant. - Compactor, - - // Thread that handles GC of a tenant - GarbageCollector, + // Thread that schedules new compaction and gc jobs + TenantTaskManager, // Thread that flushes frozen in-memory layers to disk LayerFlushThread, diff --git a/test_runner/batch_others/test_tenant_tasks.py b/test_runner/batch_others/test_tenant_tasks.py new file mode 100644 index 0000000000..9ce2016a64 --- /dev/null +++ b/test_runner/batch_others/test_tenant_tasks.py @@ -0,0 +1,70 @@ +from fixtures.neon_fixtures import NeonEnvBuilder, wait_until +from uuid import UUID +import time + + +def get_only_element(l): + assert len(l) == 1 + return l[0] + + +# Test that gc and compaction tenant tasks start and stop correctly +def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): + # The gc and compaction loops don't bother to watch for tenant state + # changes while sleeping, so we use small periods to make this test + # run faster. With default settings we'd have to wait longer for tasks + # to notice state changes and shut down. 
+ # TODO fix this behavior in the pageserver + tenant_config = "{gc_period = '1 s', compaction_period = '1 s'}" + neon_env_builder.pageserver_config_override = f"tenant_config={tenant_config}" + name = "test_tenant_tasks" + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + + def get_state(tenant): + all_states = client.tenant_list() + matching = [t for t in all_states if t["id"] == tenant.hex] + return get_only_element(matching)["state"] + + def get_metric_value(name): + metrics = client.get_metrics() + relevant = [line for line in metrics.splitlines() if line.startswith(name)] + if len(relevant) == 0: + return 0 + line = get_only_element(relevant) + value = line.lstrip(name).strip() + return int(value) + + def detach_all_timelines(tenant): + timelines = [UUID(t["timeline_id"]) for t in client.timeline_list(tenant)] + for t in timelines: + client.timeline_detach(tenant, t) + + def assert_idle(tenant): + assert get_state(tenant) == "Idle" + + # Create tenant, start compute + tenant, _ = env.neon_cli.create_tenant() + timeline = env.neon_cli.create_timeline(name, tenant_id=tenant) + pg = env.postgres.create_start(name, tenant_id=tenant) + assert (get_state(tenant) == "Active") + + # Stop compute + pg.stop() + + # Detach all tenants and wait for them to go idle + # TODO they should be already idle since there are no active computes + for tenant_info in client.tenant_list(): + tenant_id = UUID(tenant_info["id"]) + detach_all_timelines(tenant_id) + wait_until(10, 0.2, lambda: assert_idle(tenant_id)) + + # Assert that all tasks finish quickly after tenants go idle + def assert_tasks_finish(): + tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}') + tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}') + tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}') + assert tasks_started == tasks_ended + assert tasks_panicked == 0 + + wait_until(10, 0.2, assert_tasks_finish) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 93efc7d5d2..8df4878039 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2143,7 +2143,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post assert (mismatch, error) == ([], []) -def wait_until(number_of_iterations: int, interval: int, func): +def wait_until(number_of_iterations: int, interval: float, func): """ Wait until 'func' returns successfully, without exception. Returns the last return value from the the function. 
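Stripped of metrics and error text, compaction_loop and gc_loop above share one shape: run a blocking iteration on the blocking pool, then sleep unless cancelled. A distilled sketch of that pattern (one_iteration is a placeholder closure, not a real pageserver function):

    use std::time::Duration;
    use tokio::sync::watch;

    async fn periodic_task<F>(mut cancel: watch::Receiver<()>, one_iteration: F)
    where
        F: Fn() -> anyhow::Result<Option<Duration>> + Send + Clone + 'static,
    {
        loop {
            // Heavy synchronous work goes to the blocking pool so the runtime's
            // worker threads stay free for other tenants' tasks.
            let step = one_iteration.clone();
            let period = match tokio::task::spawn_blocking(step).await {
                Ok(Ok(Some(period))) => period,                // iteration done, sleep for `period`
                Ok(Ok(None)) => break,                         // tenant is no longer active, stop
                Ok(Err(_)) | Err(_) => Duration::from_secs(2), // failure or panic: retry after a pause
            };

            // Sleep, but wake immediately if the manager requests cancellation.
            tokio::select! {
                _ = cancel.changed() => break,
                _ = tokio::time::sleep(period) => {}
            }
        }
    }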
From 844832ffe46f3dd62d8cf063ffa858a941c6a838 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 5 Jul 2022 10:55:03 +0300 Subject: [PATCH 0461/1022] Bump vendor/postgres Contains changes from two PRs in vendor/postgres: - https://github.com/neondatabase/postgres/pull/163 - https://github.com/neondatabase/postgres/pull/176 --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 7faa67c3ca..35ad142301 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 7faa67c3ca53fcce51ae8fedf6b1af3b8cefd3e2 +Subproject commit 35ad142301bde7982aadae5403e9524bf5a7cce1 From 05f6a1394de68b5dbe7100304a3a8bef5e5fa48e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 5 Jul 2022 12:22:58 +0100 Subject: [PATCH 0462/1022] Add tests for different Postgres client libraries (#2008) * Add tests for different postgres clients * test/fixtures: sanitize test name for test_output_dir * test/fixtures: do not look for etcd before runtime * Add workflow for testing Postgres client libraries --- .github/workflows/pg_clients.yml | 74 +++++ setup.cfg | 4 + test_runner/fixtures/neon_fixtures.py | 7 +- .../pg_clients/csharp/npgsql/.dockerignore | 2 + .../pg_clients/csharp/npgsql/.gitignore | 2 + .../pg_clients/csharp/npgsql/Dockerfile | 14 + .../pg_clients/csharp/npgsql/Program.cs | 19 ++ .../csharp/npgsql/csharp-npgsql.csproj | 14 + test_runner/pg_clients/java/jdbc/.gitignore | 1 + test_runner/pg_clients/java/jdbc/Dockerfile | 10 + test_runner/pg_clients/java/jdbc/Example.java | 31 +++ .../pg_clients/python/asyncpg/Dockerfile | 8 + .../python/asyncpg/asyncpg_example.py | 30 ++ .../python/asyncpg/requirements.txt | 1 + .../pg_clients/python/pg8000/Dockerfile | 8 + .../pg_clients/python/pg8000/README.md | 0 .../python/pg8000/pg8000_example.py | 23 ++ .../pg_clients/python/pg8000/requirements.txt | 1 + .../PostgresClientKitExample/.dockerignore | 1 + .../swift/PostgresClientKitExample/.gitignore | 1 + .../swift/PostgresClientKitExample/Dockerfile | 11 + .../PostgresClientKitExample/Package.resolved | 41 +++ .../PostgresClientKitExample/Package.swift | 17 ++ .../PostgresClientKitExample/main.swift | 38 +++ test_runner/pg_clients/test_pg_clients.py | 54 ++++ .../postgresql-client/.dockerignore | 1 + .../typescript/postgresql-client/.gitignore | 1 + .../typescript/postgresql-client/Dockerfile | 7 + .../typescript/postgresql-client/index.js | 25 ++ .../postgresql-client/package-lock.json | 262 ++++++++++++++++++ .../typescript/postgresql-client/package.json | 6 + 31 files changed, 712 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/pg_clients.yml create mode 100644 test_runner/pg_clients/csharp/npgsql/.dockerignore create mode 100644 test_runner/pg_clients/csharp/npgsql/.gitignore create mode 100644 test_runner/pg_clients/csharp/npgsql/Dockerfile create mode 100644 test_runner/pg_clients/csharp/npgsql/Program.cs create mode 100644 test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj create mode 100644 test_runner/pg_clients/java/jdbc/.gitignore create mode 100644 test_runner/pg_clients/java/jdbc/Dockerfile create mode 100644 test_runner/pg_clients/java/jdbc/Example.java create mode 100644 test_runner/pg_clients/python/asyncpg/Dockerfile create mode 100755 test_runner/pg_clients/python/asyncpg/asyncpg_example.py create mode 100644 test_runner/pg_clients/python/asyncpg/requirements.txt create mode 100644 test_runner/pg_clients/python/pg8000/Dockerfile create mode 100644 
test_runner/pg_clients/python/pg8000/README.md create mode 100755 test_runner/pg_clients/python/pg8000/pg8000_example.py create mode 100644 test_runner/pg_clients/python/pg8000/requirements.txt create mode 100644 test_runner/pg_clients/swift/PostgresClientKitExample/.dockerignore create mode 100644 test_runner/pg_clients/swift/PostgresClientKitExample/.gitignore create mode 100644 test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile create mode 100644 test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved create mode 100644 test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift create mode 100644 test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift create mode 100644 test_runner/pg_clients/test_pg_clients.py create mode 100644 test_runner/pg_clients/typescript/postgresql-client/.dockerignore create mode 100644 test_runner/pg_clients/typescript/postgresql-client/.gitignore create mode 100644 test_runner/pg_clients/typescript/postgresql-client/Dockerfile create mode 100755 test_runner/pg_clients/typescript/postgresql-client/index.js create mode 100644 test_runner/pg_clients/typescript/postgresql-client/package-lock.json create mode 100644 test_runner/pg_clients/typescript/postgresql-client/package.json diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml new file mode 100644 index 0000000000..66f259d012 --- /dev/null +++ b/.github/workflows/pg_clients.yml @@ -0,0 +1,74 @@ +name: Test Postgres client libraries + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '23 02 * * *' # run once a day, timezone is utc + + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + test-postgres-client-libs: + runs-on: [ ubuntu-latest ] + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: 3.9 + + - name: Install Poetry + uses: snok/install-poetry@v1 + + - name: Cache poetry deps + id: cache_poetry + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + shell: bash -ex {0} + run: ./scripts/pysync + + - name: Run pytest + env: + REMOTE_ENV: 1 + BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" + TEST_OUTPUT: /tmp/test_output + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + # this variable will be embedded in perf test report + # and is needed to distinguish different environments + PLATFORM: github-actions-selfhosted + shell: bash -ex {0} + run: | + # Test framework expects we have psql binary; + # but since we don't really need it in this test, let's mock it + mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql"; + ./scripts/pytest \ + --junitxml=$TEST_OUTPUT/junit.xml \ + --tb=short \ + --verbose \ + -m "remote_cluster" \ + -rA "test_runner/pg_clients" + + - name: Post to a Slack channel + if: failure() + id: slack + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ 
github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/setup.cfg b/setup.cfg index b3b39fadd7..d1a2f9a359 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,6 +28,10 @@ strict = true # There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577 ignore_missing_imports = true +[mypy-pg8000.*] +# Used only in testing clients +ignore_missing_imports = true + [mypy-cached_property.*] ignore_missing_imports = true diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8df4878039..9eb02b50d0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1925,9 +1925,12 @@ class Etcd: datadir: str port: int peer_port: int - binary_path: Path = etcd_path() + binary_path: Path = field(init=False) handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon + def __post_init__(self): + self.binary_path = etcd_path() + def client_url(self): return f'http://127.0.0.1:{self.port}' @@ -1984,7 +1987,7 @@ class Etcd: def get_test_output_dir(request: Any) -> pathlib.Path: """ Compute the working directory for an individual test. """ test_name = request.node.name - test_dir = pathlib.Path(top_output_dir) / test_name + test_dir = pathlib.Path(top_output_dir) / test_name.replace("/", "-") log.info(f'get_test_output_dir is {test_dir}') # make mypy happy assert isinstance(test_dir, pathlib.Path) diff --git a/test_runner/pg_clients/csharp/npgsql/.dockerignore b/test_runner/pg_clients/csharp/npgsql/.dockerignore new file mode 100644 index 0000000000..cd42ee34e8 --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/.dockerignore @@ -0,0 +1,2 @@ +bin/ +obj/ diff --git a/test_runner/pg_clients/csharp/npgsql/.gitignore b/test_runner/pg_clients/csharp/npgsql/.gitignore new file mode 100644 index 0000000000..cd42ee34e8 --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/.gitignore @@ -0,0 +1,2 @@ +bin/ +obj/ diff --git a/test_runner/pg_clients/csharp/npgsql/Dockerfile b/test_runner/pg_clients/csharp/npgsql/Dockerfile new file mode 100644 index 0000000000..a78bc2f3bc --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/Dockerfile @@ -0,0 +1,14 @@ +FROM mcr.microsoft.com/dotnet/sdk:6.0 AS build +WORKDIR /source + +COPY *.csproj . +RUN dotnet restore + +COPY . . +RUN dotnet publish -c release -o /app --no-restore + +FROM mcr.microsoft.com/dotnet/runtime:6.0 +WORKDIR /app +COPY --from=build /app . 
+ +ENTRYPOINT ["dotnet", "csharp-npgsql.dll"] diff --git a/test_runner/pg_clients/csharp/npgsql/Program.cs b/test_runner/pg_clients/csharp/npgsql/Program.cs new file mode 100644 index 0000000000..17c2d5b81d --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/Program.cs @@ -0,0 +1,19 @@ +using Npgsql; + +var host = Environment.GetEnvironmentVariable("NEON_HOST"); +var database = Environment.GetEnvironmentVariable("NEON_DATABASE"); +var user = Environment.GetEnvironmentVariable("NEON_USER"); +var password = Environment.GetEnvironmentVariable("NEON_PASSWORD"); + +var connString = $"Host={host};Username={user};Password={password};Database={database}"; + +await using var conn = new NpgsqlConnection(connString); +await conn.OpenAsync(); + +await using (var cmd = new NpgsqlCommand("SELECT 1", conn)) +await using (var reader = await cmd.ExecuteReaderAsync()) +{ + while (await reader.ReadAsync()) + Console.WriteLine(reader.GetInt32(0)); +} +await conn.CloseAsync(); diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj new file mode 100644 index 0000000000..7c1f90c1fc --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -0,0 +1,14 @@ + + + + Exe + net6.0 + enable + enable + + + + + + + diff --git a/test_runner/pg_clients/java/jdbc/.gitignore b/test_runner/pg_clients/java/jdbc/.gitignore new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/test_runner/pg_clients/java/jdbc/.gitignore @@ -0,0 +1 @@ + diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile new file mode 100644 index 0000000000..daad99c3a1 --- /dev/null +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -0,0 +1,10 @@ +FROM openjdk:17 +WORKDIR /source + +COPY . . + +WORKDIR /app +RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.4.0.jar && \ + javac -d /app /source/Example.java + +CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"] diff --git a/test_runner/pg_clients/java/jdbc/Example.java b/test_runner/pg_clients/java/jdbc/Example.java new file mode 100644 index 0000000000..410a971649 --- /dev/null +++ b/test_runner/pg_clients/java/jdbc/Example.java @@ -0,0 +1,31 @@ +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.ResultSet; +import java.sql.Statement; +import java.util.Properties; + +public class Example +{ + public static void main( String[] args ) throws Exception + { + String host = System.getenv("NEON_HOST"); + String database = System.getenv("NEON_DATABASE"); + String user = System.getenv("NEON_USER"); + String password = System.getenv("NEON_PASSWORD"); + + String url = "jdbc:postgresql://%s/%s".formatted(host, database); + Properties props = new Properties(); + props.setProperty("user", user); + props.setProperty("password", password); + + Connection conn = DriverManager.getConnection(url, props); + Statement st = conn.createStatement(); + ResultSet rs = st.executeQuery("SELECT 1"); + while (rs.next()) + { + System.out.println(rs.getString(1)); + } + rs.close(); + st.close(); + } +} diff --git a/test_runner/pg_clients/python/asyncpg/Dockerfile b/test_runner/pg_clients/python/asyncpg/Dockerfile new file mode 100644 index 0000000000..10662f92d5 --- /dev/null +++ b/test_runner/pg_clients/python/asyncpg/Dockerfile @@ -0,0 +1,8 @@ +FROM python:3.10 +WORKDIR /source + +COPY . . 
+ +RUN python3 -m pip install --no-cache-dir -r requirements.txt + +CMD ["python3", "asyncpg_example.py"] diff --git a/test_runner/pg_clients/python/asyncpg/asyncpg_example.py b/test_runner/pg_clients/python/asyncpg/asyncpg_example.py new file mode 100755 index 0000000000..7f579ce672 --- /dev/null +++ b/test_runner/pg_clients/python/asyncpg/asyncpg_example.py @@ -0,0 +1,30 @@ +#! /usr/bin/env python3 + +import asyncio +import os + +import asyncpg + + +async def run(**kwargs) -> asyncpg.Record: + conn = await asyncpg.connect( + **kwargs, + statement_cache_size=0, # Prepared statements doesn't work pgbouncer + ) + rv = await conn.fetchrow("SELECT 1") + await conn.close() + + return rv + + +if __name__ == "__main__": + kwargs = { + k.lstrip("NEON_").lower(): v + for k in ("NEON_HOST", "NEON_DATABASE", "NEON_USER", "NEON_PASSWORD") + if (v := os.environ.get(k, None)) is not None + } + + loop = asyncio.new_event_loop() + row = loop.run_until_complete(run(**kwargs)) + + print(row[0]) diff --git a/test_runner/pg_clients/python/asyncpg/requirements.txt b/test_runner/pg_clients/python/asyncpg/requirements.txt new file mode 100644 index 0000000000..edc57ecc81 --- /dev/null +++ b/test_runner/pg_clients/python/asyncpg/requirements.txt @@ -0,0 +1 @@ +asyncpg==0.25.0 diff --git a/test_runner/pg_clients/python/pg8000/Dockerfile b/test_runner/pg_clients/python/pg8000/Dockerfile new file mode 100644 index 0000000000..eddf64df5b --- /dev/null +++ b/test_runner/pg_clients/python/pg8000/Dockerfile @@ -0,0 +1,8 @@ +FROM python:3.10 +WORKDIR /source + +COPY . . + +RUN python3 -m pip install --no-cache-dir -r requirements.txt + +CMD ["python3", "pg8000_example.py"] diff --git a/test_runner/pg_clients/python/pg8000/README.md b/test_runner/pg_clients/python/pg8000/README.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/pg_clients/python/pg8000/pg8000_example.py b/test_runner/pg_clients/python/pg8000/pg8000_example.py new file mode 100755 index 0000000000..f463867f88 --- /dev/null +++ b/test_runner/pg_clients/python/pg8000/pg8000_example.py @@ -0,0 +1,23 @@ +#! 
/usr/bin/env python3 + +import os +import ssl + +import pg8000.dbapi + +if __name__ == "__main__": + kwargs = { + k.lstrip("NEON_").lower(): v + for k in ("NEON_HOST", "NEON_DATABASE", "NEON_USER", "NEON_PASSWORD") + if (v := os.environ.get(k, None)) is not None + } + conn = pg8000.dbapi.connect( + **kwargs, + ssl_context=True, + ) + + cursor = conn.cursor() + cursor.execute("SELECT 1") + row = cursor.fetchone() + print(row[0]) + conn.close() diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt new file mode 100644 index 0000000000..1577712150 --- /dev/null +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -0,0 +1 @@ +pg8000==1.29.1 diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/.dockerignore b/test_runner/pg_clients/swift/PostgresClientKitExample/.dockerignore new file mode 100644 index 0000000000..30bcfa4ed5 --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/.dockerignore @@ -0,0 +1 @@ +.build/ diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/.gitignore b/test_runner/pg_clients/swift/PostgresClientKitExample/.gitignore new file mode 100644 index 0000000000..30bcfa4ed5 --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/.gitignore @@ -0,0 +1 @@ +.build/ diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile new file mode 100644 index 0000000000..8f9477bd6a --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -0,0 +1,11 @@ +FROM swift:5.6 AS build +RUN apt-get -q update && apt-get -q install -y libssl-dev +WORKDIR /source + +COPY . . +RUN swift build --configuration release + +FROM swift:5.6 +WORKDIR /app +COPY --from=build /source/.build/release/release . 
+CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved new file mode 100644 index 0000000000..478e31000e --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved @@ -0,0 +1,41 @@ +{ + "pins" : [ + { + "identity" : "bluesocket", + "kind" : "remoteSourceControl", + "location" : "https://github.com/IBM-Swift/BlueSocket.git", + "state" : { + "revision" : "dd924c3bc2c1c144c42b8dda3896f1a03115ded4", + "version" : "2.0.2" + } + }, + { + "identity" : "bluesslservice", + "kind" : "remoteSourceControl", + "location" : "https://github.com/IBM-Swift/BlueSSLService", + "state" : { + "revision" : "c249988fb748749739144e7f554710552acdc0bd", + "version" : "2.0.1" + } + }, + { + "identity" : "postgresclientkit", + "kind" : "remoteSourceControl", + "location" : "https://github.com/codewinsdotcom/PostgresClientKit.git", + "state" : { + "branch" : "v1.4.3", + "revision" : "beafedaea6dc9f04712e9a8547b77f47c406a47e" + } + }, + { + "identity" : "swift-argument-parser", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-argument-parser", + "state" : { + "revision" : "6b2aa2748a7881eebb9f84fb10c01293e15b52ca", + "version" : "0.5.0" + } + } + ], + "version" : 2 +} diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift new file mode 100644 index 0000000000..0d40b28572 --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift @@ -0,0 +1,17 @@ +// swift-tools-version:5.6 +import PackageDescription + +let package = Package( + name: "PostgresClientKitExample", + dependencies: [ + .package( + url: "https://github.com/codewinsdotcom/PostgresClientKit.git", + revision: "v1.4.3" + ) + ], + targets: [ + .target( + name: "PostgresClientKitExample", + dependencies: [ "PostgresClientKit" ]) + ] +) diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift new file mode 100644 index 0000000000..c7518dd88c --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift @@ -0,0 +1,38 @@ +import Foundation + +import PostgresClientKit + +do { + var configuration = PostgresClientKit.ConnectionConfiguration() + + let env = ProcessInfo.processInfo.environment + if let host = env["NEON_HOST"] { + configuration.host = host + } + if let database = env["NEON_DATABASE"] { + configuration.database = database + } + if let user = env["NEON_USER"] { + configuration.user = user + } + if let password = env["NEON_PASSWORD"] { + configuration.credential = .scramSHA256(password: password) + } + + let connection = try PostgresClientKit.Connection(configuration: configuration) + defer { connection.close() } + + let text = "SELECT 1;" + let statement = try connection.prepareStatement(text: text) + defer { statement.close() } + + let cursor = try statement.execute(parameterValues: [ ]) + defer { cursor.close() } + + for row in cursor { + let columns = try row.get().columns + print(columns[0]) + } +} catch { + print(error) +} diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py new file mode 100644 index 0000000000..7dc7299791 --- /dev/null +++ 
b/test_runner/pg_clients/test_pg_clients.py @@ -0,0 +1,54 @@ +import os +import shutil +import subprocess +from pathlib import Path +from tempfile import NamedTemporaryFile +from urllib.parse import urlparse + +import pytest +from fixtures.neon_fixtures import RemotePostgres + + +@pytest.mark.remote_cluster +@pytest.mark.parametrize( + "client", + [ + "csharp/npgsql", + "java/jdbc", + "python/asyncpg", + pytest.param( + "python/pg8000", # See https://github.com/neondatabase/neon/pull/2008#discussion_r912264281 + marks=pytest.mark.xfail(reason="Handles SSL in incompatible with Neon way")), + pytest.param( + "swift/PostgresClientKit", # See https://github.com/neondatabase/neon/pull/2008#discussion_r911896592 + marks=pytest.mark.xfail(reason="Neither SNI nor parameters is supported")), + "typescript/postgresql-client", + ], +) +def test_pg_clients(remote_pg: RemotePostgres, client: str): + conn_options = remote_pg.conn_options() + + env_file = None + with NamedTemporaryFile(mode="w", delete=False) as f: + env_file = f.name + f.write(f""" + NEON_HOST={conn_options["host"]} + NEON_DATABASE={conn_options["dbname"]} + NEON_USER={conn_options["user"]} + NEON_PASSWORD={conn_options["password"]} + """) + + image_tag = client.lower() + docker_bin = shutil.which("docker") + if docker_bin is None: + raise RuntimeError("docker is required for running this test") + + build_cmd = [ + docker_bin, "build", "--quiet", "--tag", image_tag, f"{Path(__file__).parent / client}" + ] + run_cmd = [docker_bin, "run", "--rm", "--env-file", env_file, image_tag] + + subprocess.run(build_cmd, check=True) + result = subprocess.run(run_cmd, check=True, capture_output=True, text=True) + + assert result.stdout.strip() == "1" diff --git a/test_runner/pg_clients/typescript/postgresql-client/.dockerignore b/test_runner/pg_clients/typescript/postgresql-client/.dockerignore new file mode 100644 index 0000000000..c2658d7d1b --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/.dockerignore @@ -0,0 +1 @@ +node_modules/ diff --git a/test_runner/pg_clients/typescript/postgresql-client/.gitignore b/test_runner/pg_clients/typescript/postgresql-client/.gitignore new file mode 100644 index 0000000000..c2658d7d1b --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/.gitignore @@ -0,0 +1 @@ +node_modules/ diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile new file mode 100644 index 0000000000..b57147503f --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -0,0 +1,7 @@ +FROM node:16 +WORKDIR /source + +COPY . . +RUN npm clean-install + +CMD ["/source/index.js"] \ No newline at end of file diff --git a/test_runner/pg_clients/typescript/postgresql-client/index.js b/test_runner/pg_clients/typescript/postgresql-client/index.js new file mode 100755 index 0000000000..af4899baab --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/index.js @@ -0,0 +1,25 @@ +#! 
/usr/bin/env node + +import {Connection} from 'postgresql-client'; + +const params = { + "host": process.env.NEON_HOST, + "database": process.env.NEON_DATABASE, + "user": process.env.NEON_USER, + "password": process.env.NEON_PASSWORD, + "ssl": true, +} +for (const key in params) { + if (params[key] === undefined) { + delete params[key]; + } +} + +const connection = new Connection(params); +await connection.connect(); +const result = await connection.query( + 'select 1' +); +const rows = result.rows; +await connection.close(); +console.log(rows[0][0]); diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json new file mode 100644 index 0000000000..bb5b4a1378 --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -0,0 +1,262 @@ +{ + "name": "typescript", + "lockfileVersion": 2, + "requires": true, + "packages": { + "": { + "dependencies": { + "postgresql-client": "^2.1.3" + } + }, + "node_modules/debug": { + "version": "4.3.4", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", + "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "dependencies": { + "ms": "2.1.2" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/doublylinked": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.1.tgz", + "integrity": "sha512-Lpqb+qyHpR5Bew8xfKsxVYdjXEYAQ7HLp1IX47kHKmVCZeXErInytonjkL+kE+L4yaKSYEmDNR9MJYr5zwuAKA==", + "engines": { + "node": ">= 10.0" + } + }, + "node_modules/lightning-pool": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-3.1.3.tgz", + "integrity": "sha512-OgWuoh0BBrikWx/mc/XwIKwC9HHTe/GU3XODLMBPibv7jv8u0o2gQFS7KVEg5U8Oufg6N7mkm8Y1RoiLER0zeQ==", + "dependencies": { + "doublylinked": "^2.4.3", + "putil-promisify": "^1.8.2" + }, + "engines": { + "node": ">= 10.0" + } + }, + "node_modules/ms": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", + "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==" + }, + "node_modules/obuf": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", + "integrity": "sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==" + }, + "node_modules/postgres-bytea": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-3.0.0.tgz", + "integrity": "sha512-CNd4jim9RFPkObHSjVHlVrxoVQXz7quwNFpz7RY1okNNme49+sVyiTvTRobiLV548Hx/hb1BG+iE7h9493WzFw==", + "dependencies": { + "obuf": "~1.1.2" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/postgresql-client": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.1.3.tgz", + "integrity": "sha512-36Ga6JzhydsRzcCRcA/Y2hrX9C9sI0wS6sgRNBlOGkOwACXQVybmhDM7mAUbi9cT00N39Ee7btR0eMCyD//5Xg==", + "dependencies": { + "debug": "^4.3.4", + "doublylinked": "^2.5.1", + "lightning-pool": "^3.1.3", + "postgres-bytea": "^3.0.0", + "power-tasks": "^0.8.0", + "putil-merge": "^3.8.0", + "putil-promisify": "^1.8.5", + "putil-varhelpers": "^1.6.4" + }, + "engines": { + "node": ">=14.0", + "npm": ">=7.0.0" + } + }, + "node_modules/power-tasks": { + "version": "0.8.0", + "resolved": 
"https://registry.npmjs.org/power-tasks/-/power-tasks-0.8.0.tgz", + "integrity": "sha512-HhMcx+y5UkzlEmKslruz8uAU2Yq8CODJsFEMFsYMrGp5EzKpkNHGu0RNvBqyewKJDZHPNKtBSULsEAxMqQIBVQ==", + "dependencies": { + "debug": "^4.3.4", + "doublylinked": "^2.5.1", + "strict-typed-events": "^2.2.0" + }, + "engines": { + "node": ">=14.0", + "npm": ">=7.0.0" + } + }, + "node_modules/putil-merge": { + "version": "3.8.0", + "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.8.0.tgz", + "integrity": "sha512-5tXPafJawWFoYZWLhkYXZ7IC/qkI45HgJsgv36lJBeq3qjFZfUITZE01CmWUBIlIn9f1yDiikqgYERARhVmgrg==", + "engines": { + "node": ">= 10.0" + } + }, + "node_modules/putil-promisify": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.8.5.tgz", + "integrity": "sha512-DItclasWWZokvpq3Aiaq0iV7WC8isP/0o/8mhC0yV6CQ781N/7NQHA1VyOm6hfpeFEwIQoo1C4Yjc5eH0q6Jbw==", + "engines": { + "node": ">= 6.0" + } + }, + "node_modules/putil-varhelpers": { + "version": "1.6.4", + "resolved": "https://registry.npmjs.org/putil-varhelpers/-/putil-varhelpers-1.6.4.tgz", + "integrity": "sha512-nM2nO1HS2yJUyPgz0grd2XZAM0Spr6/tt6F4xXeNDjByV00BV2mq6lZ+sDff8WIfQBI9Hn1Czh93H1xBvKESxw==", + "engines": { + "node": ">= 6.0" + } + }, + "node_modules/strict-typed-events": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.2.0.tgz", + "integrity": "sha512-yvHRtEfRRV7TJWi9cLhMt4Mb12JtAwXXONltUlLCA3fRB0LRy94B4E4e2gIlXzT5nZHTZVpOjJNOshri3LZ5bw==", + "dependencies": { + "putil-promisify": "^1.8.5", + "ts-gems": "^2.0.0" + }, + "engines": { + "node": ">=14.0" + } + }, + "node_modules/ts-gems": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.1.0.tgz", + "integrity": "sha512-5IqiG4nq1tsOhYPc4CwxA6bsE+TfU6uAABzf6bH4sdElgXpt/mlStvIYedvvtU7BM1+RRJxCaTLaaVFcCqNaiA==", + "peerDependencies": { + "typescript": ">=4.0.0" + } + }, + "node_modules/typescript": { + "version": "4.7.4", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.7.4.tgz", + "integrity": "sha512-C0WQT0gezHuw6AdY1M2jxUO83Rjf0HP7Sk1DtXj6j1EwkQNZrHAg2XPWlq62oqEhYvONq5pkC2Y9oPljWToLmQ==", + "peer": true, + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=4.2.0" + } + } + }, + "dependencies": { + "debug": { + "version": "4.3.4", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", + "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "requires": { + "ms": "2.1.2" + } + }, + "doublylinked": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.1.tgz", + "integrity": "sha512-Lpqb+qyHpR5Bew8xfKsxVYdjXEYAQ7HLp1IX47kHKmVCZeXErInytonjkL+kE+L4yaKSYEmDNR9MJYr5zwuAKA==" + }, + "lightning-pool": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-3.1.3.tgz", + "integrity": "sha512-OgWuoh0BBrikWx/mc/XwIKwC9HHTe/GU3XODLMBPibv7jv8u0o2gQFS7KVEg5U8Oufg6N7mkm8Y1RoiLER0zeQ==", + "requires": { + "doublylinked": "^2.4.3", + "putil-promisify": "^1.8.2" + } + }, + "ms": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", + "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==" + }, + "obuf": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", + "integrity": 
"sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==" + }, + "postgres-bytea": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-3.0.0.tgz", + "integrity": "sha512-CNd4jim9RFPkObHSjVHlVrxoVQXz7quwNFpz7RY1okNNme49+sVyiTvTRobiLV548Hx/hb1BG+iE7h9493WzFw==", + "requires": { + "obuf": "~1.1.2" + } + }, + "postgresql-client": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.1.3.tgz", + "integrity": "sha512-36Ga6JzhydsRzcCRcA/Y2hrX9C9sI0wS6sgRNBlOGkOwACXQVybmhDM7mAUbi9cT00N39Ee7btR0eMCyD//5Xg==", + "requires": { + "debug": "^4.3.4", + "doublylinked": "^2.5.1", + "lightning-pool": "^3.1.3", + "postgres-bytea": "^3.0.0", + "power-tasks": "^0.8.0", + "putil-merge": "^3.8.0", + "putil-promisify": "^1.8.5", + "putil-varhelpers": "^1.6.4" + } + }, + "power-tasks": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-0.8.0.tgz", + "integrity": "sha512-HhMcx+y5UkzlEmKslruz8uAU2Yq8CODJsFEMFsYMrGp5EzKpkNHGu0RNvBqyewKJDZHPNKtBSULsEAxMqQIBVQ==", + "requires": { + "debug": "^4.3.4", + "doublylinked": "^2.5.1", + "strict-typed-events": "^2.2.0" + } + }, + "putil-merge": { + "version": "3.8.0", + "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.8.0.tgz", + "integrity": "sha512-5tXPafJawWFoYZWLhkYXZ7IC/qkI45HgJsgv36lJBeq3qjFZfUITZE01CmWUBIlIn9f1yDiikqgYERARhVmgrg==" + }, + "putil-promisify": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.8.5.tgz", + "integrity": "sha512-DItclasWWZokvpq3Aiaq0iV7WC8isP/0o/8mhC0yV6CQ781N/7NQHA1VyOm6hfpeFEwIQoo1C4Yjc5eH0q6Jbw==" + }, + "putil-varhelpers": { + "version": "1.6.4", + "resolved": "https://registry.npmjs.org/putil-varhelpers/-/putil-varhelpers-1.6.4.tgz", + "integrity": "sha512-nM2nO1HS2yJUyPgz0grd2XZAM0Spr6/tt6F4xXeNDjByV00BV2mq6lZ+sDff8WIfQBI9Hn1Czh93H1xBvKESxw==" + }, + "strict-typed-events": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.2.0.tgz", + "integrity": "sha512-yvHRtEfRRV7TJWi9cLhMt4Mb12JtAwXXONltUlLCA3fRB0LRy94B4E4e2gIlXzT5nZHTZVpOjJNOshri3LZ5bw==", + "requires": { + "putil-promisify": "^1.8.5", + "ts-gems": "^2.0.0" + } + }, + "ts-gems": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.1.0.tgz", + "integrity": "sha512-5IqiG4nq1tsOhYPc4CwxA6bsE+TfU6uAABzf6bH4sdElgXpt/mlStvIYedvvtU7BM1+RRJxCaTLaaVFcCqNaiA==", + "requires": {} + }, + "typescript": { + "version": "4.7.4", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.7.4.tgz", + "integrity": "sha512-C0WQT0gezHuw6AdY1M2jxUO83Rjf0HP7Sk1DtXj6j1EwkQNZrHAg2XPWlq62oqEhYvONq5pkC2Y9oPljWToLmQ==", + "peer": true + } + } +} diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json new file mode 100644 index 0000000000..5d8ca23a7f --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -0,0 +1,6 @@ +{ + "type": "module", + "dependencies": { + "postgresql-client": "^2.1.3" + } +} From bb69e0920cfb57c28875ef8c811fd01a289569dd Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 5 Jul 2022 11:27:51 +0300 Subject: [PATCH 0463/1022] Do not overwrite an existing image layer. 
See github issues #1594 and #1690 Co-authored-by: Konstantin Knizhnik --- pageserver/src/layered_repository.rs | 29 ++++++++++++++----- .../src/layered_repository/image_layer.rs | 5 +++- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 67f024ef59..b3026fe3a7 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1923,15 +1923,28 @@ impl LayeredTimeline { } else { Lsn(0) }; + // Let's consider an example: + // + // delta layer with LSN range 71-81 + // delta layer with LSN range 81-91 + // delta layer with LSN range 91-101 + // image layer at LSN 100 + // + // If 'lsn' is still 100, i.e. no new WAL has been processed since the last image layer, + // there's no need to create a new one. We check this case explicitly, to avoid passing + // a bogus range to count_deltas below, with start > end. It's even possible that there + // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed + // after we read last_record_lsn, which is passed here in the 'lsn' argument. + if img_lsn < lsn { + let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; - let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; - - debug!( - "range {}-{}, has {} deltas on this timeline", - img_range.start, img_range.end, num_deltas - ); - if num_deltas >= self.get_image_creation_threshold() { - return Ok(true); + debug!( + "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", + img_range.start, img_range.end, num_deltas, img_lsn, lsn + ); + if num_deltas >= self.get_image_creation_threshold() { + return Ok(true); + } } } } diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 905023ecf9..bb24553afd 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -445,7 +445,10 @@ impl ImageLayerWriter { }, ); info!("new image layer {}", path.display()); - let mut file = VirtualFile::create(&path)?; + let mut file = VirtualFile::open_with_options( + &path, + std::fs::OpenOptions::new().write(true).create_new(true), + )?; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64))?; let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64); From 32560e75d22ceb92b9b765be61dc63991e0ddae3 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 5 Jul 2022 08:27:57 -0400 Subject: [PATCH 0464/1022] Enable relocation test (#1974) --- .../batch_others/test_tenant_relocation.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 18ec34b02e..0239b17494 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -101,10 +101,6 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve log.info('load thread stopped') -@pytest.mark.skip( - reason= - "needs to replace callmemaybe call with better idea how to migrate timelines between pageservers" -) @pytest.mark.parametrize('with_load', ['with_load', 'without_load']) def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, port_distributor: PortDistributor, @@ -202,17 +198,6 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, lsn_from_hex(timeline_detail['local']['disk_consistent_lsn']), 0.03) - # callmemaybe to start 
replication from safekeeper to the new pageserver - # when there is no load there is a clean checkpoint and no wal delta - # needs to be streamed to the new pageserver - # TODO (rodionov) use attach to start replication - with pg_cur(PgProtocol(host='localhost', port=new_pageserver_pg_port)) as cur: - # "callmemaybe {} {} host={} port={} options='-c ztimelineid={} ztenantid={}'" - safekeeper_connstring = f"host=localhost port={env.safekeepers[0].port.pg} options='-c ztimelineid={timeline} ztenantid={tenant} pageserver_connstr=postgresql://no_user:@localhost:{new_pageserver_pg_port}'" - cur.execute("callmemaybe {} {} {}".format(tenant.hex, - timeline.hex, - safekeeper_connstring)) - tenant_pg.stop() # rewrite neon cli config to use new pageserver for basebackup to start new compute From cfdf79acebfa3dbbe1fc465329ee2aceae9304a4 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 1 Jul 2022 19:44:48 +0300 Subject: [PATCH 0465/1022] harden create_empty_timeline Reorder checks so it checks whether the timeline exists before writing something to disk, possibly replacing valid content --- pageserver/src/layered_repository.rs | 24 ++++++++++++++---------- pageserver/src/repository.rs | 15 ++++++++++++++- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index b3026fe3a7..db5b77a4d9 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -232,23 +232,32 @@ impl Repository for LayeredRepository { fn create_empty_timeline( &self, - timelineid: ZTimelineId, + timeline_id: ZTimelineId, initdb_lsn: Lsn, ) -> Result> { let mut timelines = self.timelines.lock().unwrap(); + let vacant_timeline_entry = match timelines.entry(timeline_id) { + Entry::Occupied(_) => bail!("Timeline already exists"), + Entry::Vacant(vacant_entry) => vacant_entry, + }; + + let timeline_path = self.conf.timeline_path(&timeline_id, &self.tenant_id); + if timeline_path.exists() { + bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.") + } // Create the timeline directory, and write initial metadata to file. - crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenant_id))?; + crashsafe_dir::create_dir_all(timeline_path)?; let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - Self::save_metadata(self.conf, timelineid, self.tenant_id, &metadata, true)?; + Self::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; let timeline = LayeredTimeline::new( self.conf, Arc::clone(&self.tenant_conf), metadata, None, - timelineid, + timeline_id, self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, @@ -257,12 +266,7 @@ impl Repository for LayeredRepository { // Insert if not exists let timeline = Arc::new(timeline); - match timelines.entry(timelineid) { - Entry::Occupied(_) => bail!("Timeline already exists"), - Entry::Vacant(vacant) => { - vacant.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline))) - } - }; + vacant_timeline_entry.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline))); Ok(timeline) } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 756c3b8191..9501a416b4 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -225,7 +225,7 @@ pub trait Repository: Send + Sync { /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. 
fn create_empty_timeline( &self, - timelineid: ZTimelineId, + timeline_id: ZTimelineId, initdb_lsn: Lsn, ) -> Result>; @@ -636,6 +636,19 @@ mod tests { Ok(()) } + #[test] + fn no_duplicate_timelines() -> Result<()> { + let repo = RepoHarness::create("no_duplicate_timelines")?.load(); + let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) { + Ok(_) => panic!("duplicate timeline creation should fail"), + Err(e) => assert_eq!(e.to_string(), "Timeline already exists"), + } + + Ok(()) + } + /// Convenience function to create a page image with given string as the only content pub fn test_value(s: &str) -> Value { let mut buf = BytesMut::new(); From 68adfe0fc869dca7f611df70ee605397f2b80339 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Tue, 5 Jul 2022 00:55:43 +0300 Subject: [PATCH 0466/1022] inventory file fix for neon-stress env --- .circleci/ansible/neon-stress.hosts | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/ansible/neon-stress.hosts b/.circleci/ansible/neon-stress.hosts index 283ec0e8b3..750fd8106a 100644 --- a/.circleci/ansible/neon-stress.hosts +++ b/.circleci/ansible/neon-stress.hosts @@ -12,6 +12,7 @@ pageservers safekeepers [storage:vars] +env_name = neon-stress console_mgmt_base_url = http://neon-stress-console.local bucket_name = neon-storage-ireland bucket_region = eu-west-1 From 50821c0a3cf1328e44c3023ae0ea17d835235c6a Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 4 Jul 2022 17:53:14 +0300 Subject: [PATCH 0467/1022] Return download stream directly from the remote storage API --- libs/remote_storage/src/lib.rs | 51 ++++- libs/remote_storage/src/local_fs.rs | 291 ++++++++++++------------ libs/remote_storage/src/s3_bucket.rs | 117 +++++----- pageserver/src/storage_sync/download.rs | 34 ++- safekeeper/src/wal_backup.rs | 78 +++---- safekeeper/src/wal_storage.rs | 3 +- 6 files changed, 298 insertions(+), 276 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 0889cb720c..6d47d070c1 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -12,8 +12,10 @@ use std::{ borrow::Cow, collections::HashMap, ffi::OsStr, + fmt::Debug, num::{NonZeroU32, NonZeroUsize}, path::{Path, PathBuf}, + pin::Pin, }; use anyhow::{bail, Context}; @@ -70,11 +72,7 @@ pub trait RemoteStorage: Send + Sync { /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. /// Returns the metadata, if any was stored with the file previously. - async fn download( - &self, - from: &Self::RemoteObjectId, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result>; + async fn download(&self, from: &Self::RemoteObjectId) -> Result; /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. /// Returns the metadata, if any was stored with the file previously. @@ -83,12 +81,49 @@ pub trait RemoteStorage: Send + Sync { from: &Self::RemoteObjectId, start_inclusive: u64, end_exclusive: Option, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result>; + ) -> Result; async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>; } +pub struct Download { + pub download_stream: Pin>, + /// Extra key-value data, associated with the current remote file. 
+ pub metadata: Option, +} + +impl Debug for Download { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Download") + .field("metadata", &self.metadata) + .finish() + } +} + +#[derive(Debug)] +pub enum DownloadError { + /// Validation or other error happened due to user input. + BadInput(anyhow::Error), + /// The file was not found in the remote storage. + NotFound, + /// The file was found in the remote storage, but the download failed. + Other(anyhow::Error), +} + +impl std::fmt::Display for DownloadError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DownloadError::BadInput(e) => { + write!(f, "Failed to download a remote file due to user input: {e}") + } + DownloadError::NotFound => write!(f, "No file found for the remote object id given"), + DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e}"), + } + } +} + +impl std::error::Error for DownloadError {} + /// Every storage, currently supported. /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. pub enum GenericRemoteStorage { @@ -180,7 +215,7 @@ pub struct S3Config { pub concurrency_limit: NonZeroUsize, } -impl std::fmt::Debug for S3Config { +impl Debug for S3Config { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("S3Config") .field("bucket_name", &self.bucket_name) diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 50243352ee..25235200b2 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -17,7 +17,7 @@ use tokio::{ }; use tracing::*; -use crate::path_with_suffix_extension; +use crate::{path_with_suffix_extension, Download, DownloadError}; use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; @@ -192,15 +192,12 @@ impl RemoteStorage for LocalFs { Ok(()) } - async fn download( - &self, - from: &Self::RemoteObjectId, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result> { - let file_path = self.resolve_in_storage(from)?; - - if file_path.exists() && file_path.is_file() { - let mut source = io::BufReader::new( + async fn download(&self, from: &Self::RemoteObjectId) -> Result { + let file_path = self + .resolve_in_storage(from) + .map_err(DownloadError::BadInput)?; + if file_exists(&file_path).map_err(DownloadError::BadInput)? 
{ + let source = io::BufReader::new( fs::OpenOptions::new() .read(true) .open(&file_path) @@ -210,22 +207,20 @@ impl RemoteStorage for LocalFs { "Failed to open source file '{}' to use in the download", file_path.display() ) - })?, + }) + .map_err(DownloadError::Other)?, ); - io::copy(&mut source, to).await.with_context(|| { - format!( - "Failed to download file '{}' from the local storage", - file_path.display() - ) - })?; - source.flush().await?; - self.read_storage_metadata(&file_path).await + let metadata = self + .read_storage_metadata(&file_path) + .await + .map_err(DownloadError::Other)?; + Ok(Download { + metadata, + download_stream: Box::pin(source), + }) } else { - bail!( - "File '{}' either does not exist or is not a file", - file_path.display() - ) + Err(DownloadError::NotFound) } } @@ -234,22 +229,19 @@ impl RemoteStorage for LocalFs { from: &Self::RemoteObjectId, start_inclusive: u64, end_exclusive: Option, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result> { + ) -> Result { if let Some(end_exclusive) = end_exclusive { - ensure!( - end_exclusive > start_inclusive, - "Invalid range, start ({}) is bigger then end ({:?})", - start_inclusive, - end_exclusive - ); + if end_exclusive <= start_inclusive { + return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) is not less than end_exclusive ({end_exclusive:?})"))); + }; if start_inclusive == end_exclusive.saturating_sub(1) { - return Ok(None); + return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes"))); } } - let file_path = self.resolve_in_storage(from)?; - - if file_path.exists() && file_path.is_file() { + let file_path = self + .resolve_in_storage(from) + .map_err(DownloadError::BadInput)?; + if file_exists(&file_path).map_err(DownloadError::BadInput)? 
{ let mut source = io::BufReader::new( fs::OpenOptions::new() .read(true) @@ -260,31 +252,31 @@ impl RemoteStorage for LocalFs { "Failed to open source file '{}' to use in the download", file_path.display() ) - })?, + }) + .map_err(DownloadError::Other)?, ); source .seek(io::SeekFrom::Start(start_inclusive)) .await - .context("Failed to seek to the range start in a local storage file")?; - match end_exclusive { - Some(end_exclusive) => { - io::copy(&mut source.take(end_exclusive - start_inclusive), to).await - } - None => io::copy(&mut source, to).await, - } - .with_context(|| { - format!( - "Failed to download file '{}' range from the local storage", - file_path.display() - ) - })?; + .context("Failed to seek to the range start in a local storage file") + .map_err(DownloadError::Other)?; + let metadata = self + .read_storage_metadata(&file_path) + .await + .map_err(DownloadError::Other)?; - self.read_storage_metadata(&file_path).await + Ok(match end_exclusive { + Some(end_exclusive) => Download { + metadata, + download_stream: Box::pin(source.take(end_exclusive - start_inclusive)), + }, + None => Download { + metadata, + download_stream: Box::pin(source), + }, + }) } else { - bail!( - "File '{}' either does not exist or is not a file", - file_path.display() - ) + Err(DownloadError::NotFound) } } @@ -352,6 +344,19 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> Ok(()) } +fn file_exists(file_path: &Path) -> anyhow::Result { + if file_path.exists() { + ensure!( + file_path.is_file(), + "file path '{}' is not a file", + file_path.display() + ); + Ok(true) + } else { + Ok(false) + } +} + #[cfg(test)] mod pure_tests { use tempfile::tempdir; @@ -518,6 +523,31 @@ mod fs_tests { use std::{collections::HashMap, io::Write}; use tempfile::tempdir; + async fn read_and_assert_remote_file_contents( + storage: &LocalFs, + #[allow(clippy::ptr_arg)] + // have to use &PathBuf due to `storage.local_path` parameter requirements + remote_storage_path: &PathBuf, + expected_metadata: Option<&StorageMetadata>, + ) -> anyhow::Result { + let mut download = storage + .download(remote_storage_path) + .await + .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?; + ensure!( + download.metadata.as_ref() == expected_metadata, + "Unexpected metadata returned for the downloaded file" + ); + + let mut contents = String::new(); + download + .download_stream + .read_to_string(&mut contents) + .await + .context("Failed to read remote file contents into string")?; + Ok(contents) + } + #[tokio::test] async fn upload_file() -> anyhow::Result<()> { let workdir = tempdir()?.path().to_owned(); @@ -568,15 +598,7 @@ mod fs_tests { let upload_name = "upload_1"; let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; - let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let metadata = storage.download(&upload_target, &mut content_bytes).await?; - assert!( - metadata.is_none(), - "No metadata should be returned for no metadata upload" - ); - - content_bytes.flush().await?; - let contents = String::from_utf8(content_bytes.into_inner().into_inner())?; + let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; assert_eq!( dummy_contents(upload_name), contents, @@ -584,13 +606,9 @@ mod fs_tests { ); let non_existing_path = PathBuf::from("somewhere").join("else"); - match storage.download(&non_existing_path, &mut io::sink()).await { - Ok(_) => panic!("Should not allow downloading non-existing storage files"), - 
Err(e) => { - let error_string = e.to_string(); - assert!(error_string.contains("does not exist")); - assert!(error_string.contains(&non_existing_path.display().to_string())); - } + match storage.download(&non_existing_path).await { + Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys + other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), } Ok(()) } @@ -603,58 +621,31 @@ mod fs_tests { let upload_name = "upload_1"; let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; - let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let metadata = storage - .download_byte_range(&upload_target, 0, None, &mut full_range_bytes) - .await?; - assert!( - metadata.is_none(), - "No metadata should be returned for no metadata upload" - ); - full_range_bytes.flush().await?; + let full_range_download_contents = + read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; assert_eq!( dummy_contents(upload_name), - String::from_utf8(full_range_bytes.into_inner().into_inner())?, + full_range_download_contents, "Download full range should return the whole upload" ); - let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let same_byte = 1_000_000_000; - let metadata = storage - .download_byte_range( - &upload_target, - same_byte, - Some(same_byte + 1), // exclusive end - &mut zero_range_bytes, - ) - .await?; - assert!( - metadata.is_none(), - "No metadata should be returned for no metadata upload" - ); - zero_range_bytes.flush().await?; - assert!( - zero_range_bytes.into_inner().into_inner().is_empty(), - "Zero byte range should not download any part of the file" - ); - let uploaded_bytes = dummy_contents(upload_name).into_bytes(); let (first_part_local, second_part_local) = uploaded_bytes.split_at(3); - let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let metadata = storage - .download_byte_range( - &upload_target, - 0, - Some(first_part_local.len() as u64), - &mut first_part_remote, - ) + let mut first_part_download = storage + .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) .await?; assert!( - metadata.is_none(), + first_part_download.metadata.is_none(), "No metadata should be returned for no metadata upload" ); + let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); + io::copy( + &mut first_part_download.download_stream, + &mut first_part_remote, + ) + .await?; first_part_remote.flush().await?; let first_part_remote = first_part_remote.into_inner().into_inner(); assert_eq!( @@ -663,20 +654,24 @@ mod fs_tests { "First part bytes should be returned when requested" ); - let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let metadata = storage + let mut second_part_download = storage .download_byte_range( &upload_target, first_part_local.len() as u64, Some((first_part_local.len() + second_part_local.len()) as u64), - &mut second_part_remote, ) .await?; assert!( - metadata.is_none(), + second_part_download.metadata.is_none(), "No metadata should be returned for no metadata upload" ); + let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); + io::copy( + &mut second_part_download.download_stream, + &mut second_part_remote, + ) + .await?; second_part_remote.flush().await?; let second_part_remote = second_part_remote.into_inner().into_inner(); assert_eq!( @@ -696,11 +691,30 
@@ mod fs_tests { let upload_name = "upload_1"; let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; + let start = 1_000_000_000; + let end = start + 1; + match storage + .download_byte_range( + &upload_target, + start, + Some(end), // exclusive end + ) + .await + { + Ok(_) => panic!("Should not allow downloading wrong ranges"), + Err(e) => { + let error_string = e.to_string(); + assert!(error_string.contains("zero bytes")); + assert!(error_string.contains(&start.to_string())); + assert!(error_string.contains(&end.to_string())); + } + } + let start = 10000; let end = 234; assert!(start > end, "Should test an incorrect range"); match storage - .download_byte_range(&upload_target, start, Some(end), &mut io::sink()) + .download_byte_range(&upload_target, start, Some(end)) .await { Ok(_) => panic!("Should not allow downloading wrong ranges"), @@ -712,18 +726,6 @@ mod fs_tests { } } - let non_existing_path = PathBuf::from("somewhere").join("else"); - match storage - .download_byte_range(&non_existing_path, 1, Some(3), &mut io::sink()) - .await - { - Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"), - Err(e) => { - let error_string = e.to_string(); - assert!(error_string.contains("does not exist")); - assert!(error_string.contains(&non_existing_path.display().to_string())); - } - } Ok(()) } @@ -762,35 +764,26 @@ mod fs_tests { let upload_target = upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?; - let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?; - - content_bytes.flush().await?; - let contents = String::from_utf8(content_bytes.into_inner().into_inner())?; + let full_range_download_contents = + read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?; assert_eq!( dummy_contents(upload_name), - contents, + full_range_download_contents, "We should upload and download the same contents" ); - assert_eq!( - full_download_metadata.as_ref(), - Some(&metadata), - "We should get the same metadata back for full download" - ); - let uploaded_bytes = dummy_contents(upload_name).into_bytes(); let (first_part_local, _) = uploaded_bytes.split_at(3); - let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let partial_download_metadata = storage - .download_byte_range( - &upload_target, - 0, - Some(first_part_local.len() as u64), - &mut first_part_remote, - ) + let mut partial_download_with_metadata = storage + .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) .await?; + let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); + io::copy( + &mut partial_download_with_metadata.download_stream, + &mut first_part_remote, + ) + .await?; first_part_remote.flush().await?; let first_part_remote = first_part_remote.into_inner().into_inner(); assert_eq!( @@ -800,8 +793,8 @@ mod fs_tests { ); assert_eq!( - partial_download_metadata.as_ref(), - Some(&metadata), + partial_download_with_metadata.metadata, + Some(metadata), "We should get the same metadata back for partial download" ); @@ -843,7 +836,7 @@ mod fs_tests { } fn dummy_contents(name: &str) -> String { - format!("contents for {}", name) + format!("contents for {name}") } async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result> { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 
80d6966494..5269d63d09 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -9,17 +9,17 @@ use std::path::{Path, PathBuf}; use anyhow::Context; use rusoto_core::{ credential::{InstanceMetadataProvider, StaticProvider}, - HttpClient, Region, + HttpClient, Region, RusotoError, }; use rusoto_s3::{ - DeleteObjectRequest, GetObjectRequest, ListObjectsV2Request, PutObjectRequest, S3Client, - StreamingBody, S3, + DeleteObjectRequest, GetObjectError, GetObjectRequest, ListObjectsV2Request, PutObjectRequest, + S3Client, StreamingBody, S3, }; use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; use tracing::debug; -use crate::{strip_path_prefix, RemoteStorage, S3Config}; +use crate::{strip_path_prefix, Download, DownloadError, RemoteStorage, S3Config}; use super::StorageMetadata; @@ -187,6 +187,39 @@ impl S3Bucket { concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()), }) } + + async fn download_object(&self, request: GetObjectRequest) -> Result { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 download") + .map_err(DownloadError::Other)?; + + metrics::inc_get_object(); + + match self.client.get_object(request).await { + Ok(object_output) => match object_output.body { + None => { + metrics::inc_get_object_fail(); + Err(DownloadError::Other(anyhow::anyhow!( + "Got no body for the S3 object given" + ))) + } + Some(body) => Ok(Download { + metadata: object_output.metadata.map(StorageMetadata), + download_stream: Box::pin(io::BufReader::new(body.into_async_read())), + }), + }, + Err(RusotoError::Service(GetObjectError::NoSuchKey(_))) => Err(DownloadError::NotFound), + Err(e) => { + metrics::inc_get_object_fail(); + Err(DownloadError::Other(anyhow::anyhow!( + "Failed to download S3 object: {e}" + ))) + } + } + } } #[async_trait::async_trait] @@ -283,38 +316,13 @@ impl RemoteStorage for S3Bucket { Ok(()) } - async fn download( - &self, - from: &Self::RemoteObjectId, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result> { - let _guard = self - .concurrency_limiter - .acquire() - .await - .context("Concurrency limiter semaphore got closed during S3 download")?; - - metrics::inc_get_object(); - - let object_output = self - .client - .get_object(GetObjectRequest { - bucket: self.bucket_name.clone(), - key: from.key().to_owned(), - ..GetObjectRequest::default() - }) - .await - .map_err(|e| { - metrics::inc_get_object_fail(); - e - })?; - - if let Some(body) = object_output.body { - let mut from = io::BufReader::new(body.into_async_read()); - io::copy(&mut from, to).await?; - } - - Ok(object_output.metadata.map(StorageMetadata)) + async fn download(&self, from: &Self::RemoteObjectId) -> Result { + self.download_object(GetObjectRequest { + bucket: self.bucket_name.clone(), + key: from.key().to_owned(), + ..GetObjectRequest::default() + }) + .await } async fn download_byte_range( @@ -322,8 +330,7 @@ impl RemoteStorage for S3Bucket { from: &Self::RemoteObjectId, start_inclusive: u64, end_exclusive: Option, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result> { + ) -> Result { // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 // and needs both ends to be exclusive let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1)); @@ -331,34 +338,14 @@ impl RemoteStorage for S3Bucket { Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive), None => 
format!("bytes={}-", start_inclusive), }); - let _guard = self - .concurrency_limiter - .acquire() - .await - .context("Concurrency limiter semaphore got closed during S3 range download")?; - metrics::inc_get_object(); - - let object_output = self - .client - .get_object(GetObjectRequest { - bucket: self.bucket_name.clone(), - key: from.key().to_owned(), - range, - ..GetObjectRequest::default() - }) - .await - .map_err(|e| { - metrics::inc_get_object_fail(); - e - })?; - - if let Some(body) = object_output.body { - let mut from = io::BufReader::new(body.into_async_read()); - io::copy(&mut from, to).await?; - } - - Ok(object_output.metadata.map(StorageMetadata)) + self.download_object(GetObjectRequest { + bucket: self.bucket_name.clone(), + key: from.key().to_owned(), + range, + ..GetObjectRequest::default() + }) + .await } async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> { diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 99ccf27e1c..b51826fa1e 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -44,13 +44,23 @@ where index_part_path.display() ) })?; + + let mut index_part_download = + storage + .download(&part_storage_path) + .await + .with_context(|| { + format!("Failed to open download stream for for storage path {part_storage_path:?}") + })?; let mut index_part_bytes = Vec::new(); - storage - .download(&part_storage_path, &mut index_part_bytes) - .await - .with_context(|| { - format!("Failed to download an index part from storage path {part_storage_path:?}") - })?; + io::copy( + &mut index_part_download.download_stream, + &mut index_part_bytes, + ) + .await + .with_context(|| { + format!("Failed to download an index part from storage path {part_storage_path:?}") + })?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes).with_context(|| { format!("Failed to deserialize index part file from storage path '{part_storage_path:?}'") @@ -162,15 +172,19 @@ where temp_file_path.display() ) })?; - - storage - .download(&layer_storage_path, &mut destination_file) + let mut download = storage + .download(&layer_storage_path) .await .with_context(|| { format!( - "Failed to download a layer from storage path '{layer_storage_path:?}'" + "Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'" ) })?; + io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { + format!( + "Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display() + ) + })?; // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: // A file will not be closed immediately when it goes out of scope if there are any IO operations diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 8fada70e8b..b2f9d8d4f3 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -2,18 +2,16 @@ use anyhow::{Context, Result}; use etcd_broker::subscription_key::{ NodeKind, OperationKind, SkOperationKind, SubscriptionKey, SubscriptionKind, }; -use tokio::io::AsyncRead; use tokio::task::JoinHandle; use std::cmp::min; use std::collections::HashMap; use std::path::{Path, PathBuf}; +use std::pin::Pin; use std::sync::Arc; use std::time::Duration; -use postgres_ffi::xlog_utils::{ - XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, MAX_SEND_SIZE, PG_TLI, -}; +use postgres_ffi::xlog_utils::{XLogFileName, XLogSegNo, 
XLogSegNoOffsetToRecPtr, PG_TLI}; use remote_storage::{GenericRemoteStorage, RemoteStorage}; use tokio::fs::File; use tokio::runtime::Builder; @@ -452,45 +450,41 @@ async fn backup_object(source_file: &Path, size: usize) -> Result<()> { pub async fn read_object( file_path: PathBuf, offset: u64, -) -> (impl AsyncRead, JoinHandle>) { - let storage = REMOTE_STORAGE.get().expect("failed to get remote storage"); +) -> anyhow::Result>> { + let download = match REMOTE_STORAGE + .get() + .context("Failed to get remote storage")? + .as_ref() + .context("No remote storage configured")? + { + GenericRemoteStorage::Local(local_storage) => { + let source = local_storage.remote_object_id(&file_path)?; - let (mut pipe_writer, pipe_reader) = tokio::io::duplex(MAX_SEND_SIZE); - - let copy_result = tokio::spawn(async move { - let res = match storage.as_ref().unwrap() { - GenericRemoteStorage::Local(local_storage) => { - let source = local_storage.remote_object_id(&file_path)?; - - info!( - "local download about to start from {} at offset {}", - source.display(), - offset - ); - local_storage - .download_byte_range(&source, offset, None, &mut pipe_writer) - .await - } - GenericRemoteStorage::S3(s3_storage) => { - let s3key = s3_storage.remote_object_id(&file_path)?; - - info!( - "S3 download about to start from {:?} at offset {}", - s3key, offset - ); - s3_storage - .download_byte_range(&s3key, offset, None, &mut pipe_writer) - .await - } - }; - - if let Err(e) = res { - error!("failed to download WAL segment from remote storage: {}", e); - Err(e) - } else { - Ok(()) + info!( + "local download about to start from {} at offset {}", + source.display(), + offset + ); + local_storage + .download_byte_range(&source, offset, None) + .await } - }); + GenericRemoteStorage::S3(s3_storage) => { + let s3key = s3_storage.remote_object_id(&file_path)?; - (pipe_reader, copy_result) + info!( + "S3 download about to start from {:?} at offset {}", + s3key, offset + ); + s3_storage.download_byte_range(&s3key, offset, None).await + } + } + .with_context(|| { + format!( + "Failed to open WAL segment download stream for local storage path {}", + file_path.display() + ) + })?; + + Ok(download.download_stream) } diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 5cb7a8c758..9b23e2189c 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -604,8 +604,7 @@ impl WalReader { // Try to open remote file, if remote reads are enabled if self.enable_remote_read { - let (reader, _) = read_object(wal_file_path, xlogoff as u64).await; - return Ok(Box::pin(reader)); + return read_object(wal_file_path, xlogoff as u64).await; } bail!("WAL segment is not found") From 07df7c2edd46739a868451d12e8e746ffc18b7d9 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 6 Jul 2022 13:15:15 +0100 Subject: [PATCH 0468/1022] github/actions: fix storing perf data for main (#2038) --- .github/actions/run-python-test-set/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 48c0c2b925..accb8896de 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -85,7 +85,7 @@ runs: EXTRA_PARAMS="-n4 $EXTRA_PARAMS" fi if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then - if [[ "$GITHUB_REF" == "main" ]]; then + if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then mkdir -p "$PERF_REPORT_DIR" EXTRA_PARAMS="--out-dir 
$PERF_REPORT_DIR $EXTRA_PARAMS" fi @@ -115,7 +115,7 @@ runs: -rA $TEST_SELECTION $EXTRA_PARAMS if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then - if [[ "$GITHUB_REF" == "main" ]]; then + if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then export REPORT_FROM="$PERF_REPORT_DIR" export REPORT_TO=local scripts/generate_and_push_perf_report.sh From 8fabdc67082247558d740127c329d523d8a5eb5a Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 16 Jun 2022 11:15:02 +0000 Subject: [PATCH 0469/1022] Add tests with concurrent computes. Removes test_restart_compute, as added test_compute_restarts is stronger. --- .../batch_others/test_restart_compute.py | 74 --------- .../batch_others/test_wal_acceptor_async.py | 156 +++++++++++++++++- test_runner/fixtures/neon_fixtures.py | 57 ++++--- 3 files changed, 191 insertions(+), 96 deletions(-) delete mode 100644 test_runner/batch_others/test_restart_compute.py diff --git a/test_runner/batch_others/test_restart_compute.py b/test_runner/batch_others/test_restart_compute.py deleted file mode 100644 index af1956e196..0000000000 --- a/test_runner/batch_others/test_restart_compute.py +++ /dev/null @@ -1,74 +0,0 @@ -import pytest - -from contextlib import closing -from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.log_helper import log - - -# -# Test restarting and recreating a postgres instance -# -@pytest.mark.parametrize('with_safekeepers', [False, True]) -def test_restart_compute(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): - neon_env_builder.auth_enabled = True - if with_safekeepers: - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - env.neon_cli.create_branch('test_restart_compute') - pg = env.postgres.create_start('test_restart_compute') - log.info("postgres is running on 'test_restart_compute' branch") - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute('CREATE TABLE t(key int primary key, value text)') - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - cur.execute('SELECT sum(key) FROM t') - r = cur.fetchone() - assert r == (5000050000, ) - log.info(f"res = {r}") - - # Remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute') - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # We can still see the row - cur.execute('SELECT sum(key) FROM t') - r = cur.fetchone() - assert r == (5000050000, ) - log.info(f"res = {r}") - - # Insert another row - cur.execute("INSERT INTO t VALUES (100001, 'payload2')") - cur.execute('SELECT count(*) FROM t') - - r = cur.fetchone() - assert r == (100001, ) - log.info(f"res = {r}") - - # Again remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute') - - # That select causes lots of FPI's and increases probability of wakeepers - # lagging behind after query completion - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # We can still see the rows - cur.execute('SELECT count(*) FROM t') - - r = cur.fetchone() - assert r == (100001, ) - log.info(f"res = {r}") - - # And again remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute') - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # We can still see the rows - cur.execute('SELECT count(*) FROM t') - - r = cur.fetchone() - assert r == (100001, ) - log.info(f"res = {r}") diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py 
index c0967ef6c0..4664c332fc 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -1,5 +1,6 @@ import asyncio import uuid + import asyncpg import random import time @@ -7,7 +8,7 @@ import time from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper from fixtures.log_helper import getLogger from fixtures.utils import lsn_from_hex, lsn_to_hex -from typing import List +from typing import List, Optional log = getLogger('root.safekeeper_async') @@ -234,3 +235,156 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): # we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments # are not removed before broadcasted to all safekeepers, with the help of replication slot asyncio.run(run_restarts_under_load(env, pg, env.safekeepers, period_time=15, iterations=5)) + + +def postgres_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): + pg = Postgres( + env, + tenant_id=env.initial_tenant, + port=env.port_distributor.get_port(), + # In these tests compute has high probability of terminating on its own + # before our stop() due to lost consensus leadership. + check_stop_result=False) + + # embed current time in node name + node_name = pgdir_name or f'pg_node_{time.time()}' + return pg.create_start(branch_name=branch, + node_name=node_name, + config_lines=['log_statement=all']) + + +async def exec_compute_query(env: NeonEnv, + branch: str, + query: str, + pgdir_name: Optional[str] = None): + with postgres_create_start(env, branch=branch, pgdir_name=pgdir_name) as pg: + before_conn = time.time() + conn = await pg.connect_async() + res = await conn.fetch(query) + await conn.close() + after_conn = time.time() + log.info(f'{query} took {after_conn - before_conn}s') + return res + + +async def run_compute_restarts(env: NeonEnv, + queries=16, + batch_insert=10000, + branch='test_compute_restarts'): + cnt = 0 + sum = 0 + + await exec_compute_query(env, branch, 'CREATE TABLE t (i int)') + + for i in range(queries): + if i % 4 == 0: + await exec_compute_query( + env, branch, f'INSERT INTO t SELECT 1 FROM generate_series(1, {batch_insert})') + sum += batch_insert + cnt += batch_insert + elif (i % 4 == 1) or (i % 4 == 3): + # Note that select causes lots of FPI's and increases probability of safekeepers + # standing at different LSNs after compute termination. + actual_sum = (await exec_compute_query(env, branch, 'SELECT SUM(i) FROM t'))[0][0] + assert actual_sum == sum, f'Expected sum={sum}, actual={actual_sum}' + elif i % 4 == 2: + await exec_compute_query(env, branch, 'UPDATE t SET i = i + 1') + sum += cnt + + +# Add a test which creates compute for every query, and then destroys it right after. 
+def test_compute_restarts(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch('test_compute_restarts') + asyncio.run(run_compute_restarts(env)) + + +class BackgroundCompute(object): + def __init__(self, index: int, env: NeonEnv, branch: str): + self.index = index + self.env = env + self.branch = branch + self.running = False + self.stopped = False + self.total_tries = 0 + self.successful_queries: List[int] = [] + + async def run(self): + if self.running: + raise Exception('BackgroundCompute is already running') + + self.running = True + i = 0 + while not self.stopped: + try: + verify_key = (self.index << 16) + i + i += 1 + self.total_tries += 1 + res = await exec_compute_query( + self.env, + self.branch, + f'INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key', + pgdir_name=f'bgcompute{self.index}_key{verify_key}', + ) + log.info(f'result: {res}') + if len(res) != 1: + raise Exception('No result returned') + if res[0][0] != verify_key: + raise Exception('Wrong result returned') + self.successful_queries.append(verify_key) + except Exception as e: + log.info(f'BackgroundCompute {self.index} query failed: {e}') + + # With less sleep, there is a very big chance of not committing + # anything or only 1 xact during test run. + await asyncio.sleep(2 * random.random()) + self.running = False + + +async def run_concurrent_computes(env: NeonEnv, + num_computes=10, + run_seconds=20, + branch='test_concurrent_computes'): + await exec_compute_query( + env, + branch, + 'CREATE TABLE query_log (t timestamp default now(), index int, verify_key int)') + + computes = [BackgroundCompute(i, env, branch) for i in range(num_computes)] + background_tasks = [asyncio.create_task(compute.run()) for compute in computes] + + await asyncio.sleep(run_seconds) + for compute in computes[1:]: + compute.stopped = True + log.info("stopped all tasks but one") + + # work for some time with only one compute -- it should be able to make some xacts + await asyncio.sleep(8) + computes[0].stopped = True + + await asyncio.gather(*background_tasks) + + result = await exec_compute_query(env, branch, 'SELECT * FROM query_log') + # we should have inserted something while single compute was running + assert len(result) >= 4 + log.info(f'Executed {len(result)} queries') + for row in result: + log.info(f'{row[0]} {row[1]} {row[2]}') + + # ensure everything reported as committed wasn't lost + for compute in computes: + for verify_key in compute.successful_queries: + assert verify_key in [row[2] for row in result] + + +# Run multiple computes concurrently, creating-destroying them after single +# query. Ensure we don't lose any xacts reported as committed and be able to +# progress once only one compute remains. 
+def test_concurrent_computes(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch('test_concurrent_computes') + asyncio.run(run_concurrent_computes(env)) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9eb02b50d0..3d4daf5f29 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1160,6 +1160,7 @@ class NeonCli: node_name: str, tenant_id: Optional[uuid.UUID] = None, destroy=False, + check_return_code=True, ) -> 'subprocess.CompletedProcess[str]': args = [ 'pg', @@ -1172,7 +1173,7 @@ class NeonCli: if node_name is not None: args.append(node_name) - return self.raw_cli(args) + return self.raw_cli(args, check_return_code=check_return_code) def raw_cli(self, arguments: List[str], @@ -1188,6 +1189,8 @@ class NeonCli: >>> result = env.neon_cli.raw_cli(...) >>> assert result.stderr == "" >>> log.info(result.stdout) + + If `check_return_code`, on non-zero exit code logs failure and raises. """ assert type(arguments) == list @@ -1213,27 +1216,27 @@ class NeonCli: env_vars[var] = val # Intercept CalledProcessError and print more info - try: - res = subprocess.run(args, - env=env_vars, - check=True, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + res = subprocess.run(args, + env=env_vars, + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if not res.returncode: log.info(f"Run success: {res.stdout}") - except subprocess.CalledProcessError as exc: + elif check_return_code: # this way command output will be in recorded and shown in CI in failure message msg = f"""\ - Run failed: {exc} - stdout: {exc.stdout} - stderr: {exc.stderr} + Run {res.args} failed: + stdout: {res.stdout} + stderr: {res.stderr} """ log.info(msg) + raise Exception(msg) from subprocess.CalledProcessError(res.returncode, + res.args, + res.stdout, + res.stderr) - raise Exception(msg) from exc - - if check_return_code: - res.check_returncode() return res @@ -1526,7 +1529,11 @@ def static_proxy(vanilla_pg, port_distributor) -> Iterator[NeonProxy]: class Postgres(PgProtocol): """ An object representing a running postgres daemon. 
""" - def __init__(self, env: NeonEnv, tenant_id: uuid.UUID, port: int): + def __init__(self, + env: NeonEnv, + tenant_id: uuid.UUID, + port: int, + check_stop_result: bool = True): super().__init__(host='localhost', port=port, user='cloud_admin', dbname='postgres') self.env = env self.running = False @@ -1534,6 +1541,7 @@ class Postgres(PgProtocol): self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA self.tenant_id = tenant_id self.port = port + self.check_stop_result = check_stop_result # path to conf is /pgdatadirs/tenants///postgresql.conf def create( @@ -1585,8 +1593,6 @@ class Postgres(PgProtocol): port=self.port) self.running = True - log.info(f"stdout: {run_result.stdout}") - return self def pg_data_dir_path(self) -> str: @@ -1650,7 +1656,9 @@ class Postgres(PgProtocol): if self.running: assert self.node_name is not None - self.env.neon_cli.pg_stop(self.node_name, self.tenant_id) + self.env.neon_cli.pg_stop(self.node_name, + self.tenant_id, + check_return_code=self.check_stop_result) self.running = False return self @@ -1662,7 +1670,10 @@ class Postgres(PgProtocol): """ assert self.node_name is not None - self.env.neon_cli.pg_stop(self.node_name, self.tenant_id, True) + self.env.neon_cli.pg_stop(self.node_name, + self.tenant_id, + True, + check_return_code=self.check_stop_result) self.node_name = None self.running = False @@ -1681,6 +1692,8 @@ class Postgres(PgProtocol): Returns self. """ + started_at = time.time() + self.create( branch_name=branch_name, node_name=node_name, @@ -1688,6 +1701,8 @@ class Postgres(PgProtocol): lsn=lsn, ).start() + log.info(f"Postgres startup took {time.time() - started_at} seconds") + return self def __enter__(self): From 242af756538365cca504bd28497d407dd4874362 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 6 Jul 2022 13:45:02 -0400 Subject: [PATCH 0470/1022] Fix signal file parsing (#2042) --- pageserver/src/import_datadir.rs | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 3ede949885..1a9aa78d8c 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -516,10 +516,23 @@ pub fn import_file( // Parse zenith signal file to set correct previous LSN let bytes = read_all_bytes(reader)?; // zenith.signal format is "PREV LSN: prev_lsn" - let zenith_signal = std::str::from_utf8(&bytes)?; - let zenith_signal = zenith_signal.split(':').collect::>(); - let prev_lsn = zenith_signal[1].trim().parse::()?; + // TODO write serialization and deserialization in the same place. + let zenith_signal = std::str::from_utf8(&bytes)?.trim(); + let prev_lsn = match zenith_signal { + "PREV LSN: none" => Lsn(0), + "PREV LSN: invalid" => Lsn(0), + other => { + let split = other.split(':').collect::>(); + split[1] + .trim() + .parse::() + .context("can't parse zenith.signal")? + } + }; + // zenith.signal is not necessarily the last file, that we handle + // but it is ok to call `finish_write()`, because final `modification.commit()` + // will update lsn once more to the final one. 
let writer = modification.tline.tline.writer(); writer.finish_write(prev_lsn); From 4a96259bdd1ac2e31fefba0375aaf177644ba199 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 6 Jul 2022 13:45:26 -0400 Subject: [PATCH 0471/1022] Add export/import test (#2036) --- .../batch_others/test_tenant_relocation.py | 77 +++++++++++++++---- 1 file changed, 61 insertions(+), 16 deletions(-) diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 0239b17494..e9c493cad6 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -10,8 +10,8 @@ from typing import Optional import signal import pytest -from fixtures.neon_fixtures import PgProtocol, PortDistributor, Postgres, NeonEnvBuilder, Etcd, NeonPageserverHttpClient, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload, neon_binpath, pg_distrib_dir -from fixtures.utils import lsn_from_hex +from fixtures.neon_fixtures import PgProtocol, PortDistributor, Postgres, NeonEnvBuilder, Etcd, NeonPageserverHttpClient, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload, neon_binpath, pg_distrib_dir, base_dir +from fixtures.utils import lsn_from_hex, subprocess_capture def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @@ -101,9 +101,23 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve log.info('load thread stopped') +@pytest.mark.parametrize( + 'method', + [ + # A minor migration involves no storage breaking changes. + # It is done by attaching the tenant to a new pageserver. + 'minor', + # A major migration involves exporting a postgres datadir + # basebackup and importing it into the new pageserver. + # This kind of migration can tolerate breaking changes + # to storage format + pytest.param('major', marks=pytest.mark.xfail(reason="Not implemented")), + ]) @pytest.mark.parametrize('with_load', ['with_load', 'without_load']) def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, port_distributor: PortDistributor, + test_output_dir, + method: str, with_load: str): neon_env_builder.enable_local_fs_remote_storage() @@ -153,8 +167,11 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, load_stop_event = threading.Event() load_ok_event = threading.Event() - load_thread = threading.Thread(target=load, - args=(tenant_pg, load_stop_event, load_ok_event)) + load_thread = threading.Thread( + target=load, + args=(tenant_pg, load_stop_event, load_ok_event), + daemon=True, # To make sure the child dies when the parent errors + ) load_thread.start() # run checkpoint manually to be sure that data landed in remote storage @@ -184,19 +201,47 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, new_pageserver_http_port, neon_env_builder.broker): - # call to attach timeline to new pageserver - new_pageserver_http.timeline_attach(tenant, timeline) - # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint - new_timeline_detail = wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_local(new_pageserver_http, tenant, timeline)) + # Migrate either by attaching from s3 or import/export basebackup + if method == "major": + cmd = [ + "python", + os.path.join(base_dir, "scripts/export_import_between_pageservers.py"), + "--tenant-id", + tenant.hex, + "--from-host", + "localhost", + "--from-http-port", + str(pageserver_http.port), + 
"--from-pg-port", + str(env.pageserver.service_port.pg), + "--to-host", + "localhost", + "--to-http-port", + str(new_pageserver_http_port), + "--to-pg-port", + str(new_pageserver_pg_port), + "--psql-path", + os.path.join(pg_distrib_dir, "bin", "psql"), + "--work-dir", + os.path.join(test_output_dir), + ] + subprocess_capture(str(env.repo_dir), cmd, check=True) + elif method == "minor": + # call to attach timeline to new pageserver + new_pageserver_http.timeline_attach(tenant, timeline) - # when load is active these checks can break because lsns are not static - # so lets check with some margin - assert_abs_margin_ratio(lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']), - lsn_from_hex(timeline_detail['local']['disk_consistent_lsn']), - 0.03) + # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint + new_timeline_detail = wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_local(new_pageserver_http, tenant, timeline)) + + # when load is active these checks can break because lsns are not static + # so lets check with some margin + assert_abs_margin_ratio( + lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']), + lsn_from_hex(timeline_detail['local']['disk_consistent_lsn']), + 0.03) tenant_pg.stop() From 1faf49da0f138c3d931a77fb3fc16d475939f4e0 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 6 Jul 2022 19:24:06 +0100 Subject: [PATCH 0472/1022] github/actions: set PERF_TEST_RESULT_CONNSTR from secrets (#2040) --- .github/workflows/build_and_test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 81b4585714..cea40a047b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -271,6 +271,8 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: true + env: + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones From 0e3456351fca1184bb4fd45c8cecdc3245ae77a0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 6 Jul 2022 18:18:04 +0300 Subject: [PATCH 0473/1022] Shrink thread pools used for WAL receivers and background tasks. I noticed that the pageserver has a very large virtual memory size, several GB, even though it doesn't actually use that much memory. That's not much of a problem normally, but I hit it because I wanted to run tests with a limited virtual memory size, by calling setrlimit(RLIMIT_AS), but the highest limit you can set is 2 GB. I was not able to start pageserver with a limit of 2 GB. On Linux, each thread allocates 32 MB of virtual memory. I read this on some random forum on the Internet, but unfortunately could not find the source again now. Empirically, reducing the number of threads clearly helps to bring down the virtual memory size. Aside from the virtual memory usage, it seems excessive to launch 40 threads in both of those thread pools. The tokio default is to have as many worker threads as there are CPU cores in the system. That seems like a fine heuristic for us, too, so remove the explicit setting of the pool size and rely on the default. 
Note that the GC and compaction tasks are actually run with tokio spawn_blocking, so the threads that are actually doing the work, and possibly waiting on I/O, are not consuming threads from the thread pool. The WAL receiver work is done in the tokio worker threads, but the WAL receivers are more CPU bound so that seems OK. Also remove the explicit maxinum on blocking tasks. I'm not sure what the right value for that would be, or whether the value we set (100) would be better than the tokio default (512). Since the value was arbitrary, let's just rely on the tokio default for that, too. --- pageserver/src/tenant_tasks.rs | 2 -- pageserver/src/walreceiver.rs | 1 - 2 files changed, 3 deletions(-) diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 6871ac3001..b0bb4953ca 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -119,8 +119,6 @@ pub fn start_compaction_loop(tenantid: ZTenantId) -> anyhow::Result<()> { pub fn init_tenant_task_pool() -> anyhow::Result<()> { let runtime = tokio::runtime::Builder::new_multi_thread() .thread_name("tenant-task-worker") - .worker_threads(40) // Way more than necessary - .max_blocking_threads(100) // Way more than necessary .enable_all() .build()?; diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 2b5a3123c1..9f0f911e0c 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -91,7 +91,6 @@ pub fn init_wal_receiver_main_thread( let runtime = tokio::runtime::Builder::new_multi_thread() .thread_name("wal-receiver-runtime-thread") - .worker_threads(40) .enable_all() .on_thread_start(|| IS_WAL_RECEIVER.with(|c| c.set(true))) .build() From cb5df3c6277c296cf37b934fb3f790d075beff5b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 7 Jul 2022 10:47:03 +0100 Subject: [PATCH 0474/1022] github/actions: set missing VIP_VAP_ACCESS_TOKEN (#2045) --- .github/workflows/build_and_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index cea40a047b..78aa163f3e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -272,6 +272,7 @@ jobs: run_in_parallel: false save_perf_report: true env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones From 747d009bb4b88d54d53f6d2b417e02a85f41d9d1 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 5 Jul 2022 12:27:53 +0200 Subject: [PATCH 0475/1022] Fix panic while waiting for Postgres readiness in the compute_ctl (#2021) We were reading Postgres pid file and looking for the 'ready' status, but it could be empty or we could not read it. So add all the checks. 
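For illustration only, a minimal sketch of the defensive shape of such a check (the function name and signature here are hypothetical and simplified; the actual change follows in the diff below):

    // Sketch: probe a postmaster.pid-style file whose last line may be
    // missing, empty, or half-written, then confirm the port answers.
    use std::fs::File;
    use std::io::{BufRead, BufReader};
    use std::net::{SocketAddr, TcpStream};
    use std::path::Path;
    use std::time::Duration;

    fn postgres_looks_ready(pid_path: &Path, addr: &SocketAddr, timeout: Duration) -> bool {
        // Opening the file can fail while Postgres is still creating it.
        let file = match File::open(pid_path) {
            Ok(f) => BufReader::new(f),
            Err(_) => return false,
        };
        // The file can exist but contain nothing useful yet.
        match file.lines().last() {
            Some(Ok(line)) if line.trim() == "ready" => {
                // Only then check that the socket actually accepts connections.
                TcpStream::connect_timeout(addr, timeout).is_ok()
            }
            _ => false,
        }
    }

The actual diff below applies the same idea inside the existing wait loop.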
--- compute_tools/src/pg_helpers.rs | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index ea3909a029..207d09d76b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -248,18 +248,20 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<() bail!("Postgres exited unexpectedly with code {}", code); } - if pid_path.exists() { - let file = BufReader::new(File::open(&pid_path)?); - let status = file - .lines() - .last() - .unwrap() - .unwrap_or_else(|_| "unknown".to_string()); - let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok(); + // Check that we can open pid file first. + if let Ok(file) = File::open(&pid_path) { + let file = BufReader::new(file); + let last_line = file.lines().last(); - // Now Postgres is ready to accept connections - if status.trim() == "ready" && can_connect { - break; + // Pid file could be there and we could read it, but it could be empty, for example. + if let Some(Ok(line)) = last_line { + let status = line.trim(); + let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok(); + + // Now Postgres is ready to accept connections + if status == "ready" && can_connect { + break; + } } } From e6ea049165dccfd4e804d729c8afcde60e3bde14 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 6 Jul 2022 22:24:08 +0300 Subject: [PATCH 0476/1022] If an error happens during import of base backup or WAL, log it. We only sent the error to the client, with no trace in the pageserver log. Log it, similar to how we log errors in GetPage@LSN requests. --- pageserver/src/page_service.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 22002fdbab..973a631d23 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -951,7 +951,10 @@ impl postgres_backend::Handler for PageServerHandler { match self.handle_import_basebackup(pgb, tenant, timeline, base_lsn, end_lsn) { Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?, + Err(e) => { + error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); + pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))? + } }; } else if query_string.starts_with("import wal ") { // Import the `pg_wal` section of a basebackup. @@ -970,7 +973,10 @@ impl postgres_backend::Handler for PageServerHandler { match self.handle_import_wal(pgb, tenant, timeline, start_lsn, end_lsn) { Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?, + Err(e) => { + error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); + pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))? 
+ } }; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" From ae116ff0a9bbbb85132bc3413512bc67a36650c3 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Thu, 7 Jul 2022 18:09:57 +0300 Subject: [PATCH 0477/1022] update timeout for proxy deploy (#2047) --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index f64ba94cb4..5370e46663 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -495,8 +495,8 @@ jobs: name: Re-deploy proxy command: | DOCKER_TAG=$(git log --oneline|wc -l) - helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait - helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait + helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s deploy-neon-stress: docker: From 4c54e4b37d11c775609ca276e4e35bb4ca6be8a7 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 15 Jun 2022 17:59:24 +0300 Subject: [PATCH 0478/1022] switch to per-tenant attach/detach download operations of all timelines for one tenant are now grouped together so when attach is invoked pageserver downloads all of them and registers them in a single apply_sync_status_update call so branches can be used safely with attach/detach --- libs/remote_storage/src/lib.rs | 14 +- libs/remote_storage/src/local_fs.rs | 29 ++- libs/remote_storage/src/s3_bucket.rs | 112 +++++++- pageserver/src/http/openapi_spec.yml | 47 +++- pageserver/src/http/routes.rs | 145 ++++++----- pageserver/src/layered_repository.rs | 54 ++-- pageserver/src/repository.rs | 5 +- pageserver/src/storage_sync.rs | 107 ++++---- pageserver/src/storage_sync/download.rs | 109 +++++++- pageserver/src/storage_sync/index.rs | 118 +++++++-- pageserver/src/tenant_mgr.rs | 87 ++++--- pageserver/src/timelines.rs | 2 +- pageserver/src/walreceiver.rs | 7 +- .../batch_others/test_ancestor_branch.py | 13 - test_runner/batch_others/test_detach.py | 49 ++++ test_runner/batch_others/test_normal_work.py | 2 +- .../batch_others/test_remote_storage.py | 10 +- .../batch_others/test_tenant_relocation.py | 246 +++++++++++++----- test_runner/fixtures/neon_fixtures.py | 12 +- 19 files changed, 835 insertions(+), 333 deletions(-) create mode 100644 test_runner/batch_others/test_detach.py diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 6d47d070c1..dec79e4580 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -42,13 +42,19 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; +pub trait RemoteObjectName { + // Needed to retrieve last component for RemoteObjectId. + // In other words a file name + fn object_name(&self) -> Option<&str>; +} + /// Storage (potentially remote) API to manage its state. /// This storage tries to be unaware of any layered repository context, /// providing basic CRUD operations for storage files. 
#[async_trait::async_trait] pub trait RemoteStorage: Send + Sync { /// A way to uniquely reference a file in the remote storage. - type RemoteObjectId; + type RemoteObjectId: RemoteObjectName; /// Attempts to derive the storage path out of the local path, if the latter is correct. fn remote_object_id(&self, local_path: &Path) -> anyhow::Result; @@ -59,6 +65,12 @@ pub trait RemoteStorage: Send + Sync { /// Lists all items the storage has right now. async fn list(&self) -> anyhow::Result>; + /// Lists all top level subdirectories for a given prefix + async fn list_prefixes( + &self, + prefix: Option, + ) -> anyhow::Result>; + /// Streams the local file contents into remote into the remote storage entry. async fn upload( &self, diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 25235200b2..df1581fb51 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -5,6 +5,7 @@ //! volume is mounted to the local FS. use std::{ + borrow::Cow, future::Future, path::{Path, PathBuf}, pin::Pin, @@ -17,10 +18,16 @@ use tokio::{ }; use tracing::*; -use crate::{path_with_suffix_extension, Download, DownloadError}; +use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectName}; use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; +impl RemoteObjectName for PathBuf { + fn object_name(&self) -> Option<&str> { + self.file_stem().and_then(|n| n.to_str()) + } +} + pub struct LocalFs { working_directory: PathBuf, storage_root: PathBuf, @@ -101,7 +108,18 @@ impl RemoteStorage for LocalFs { } async fn list(&self) -> anyhow::Result> { - get_all_files(&self.storage_root).await + get_all_files(&self.storage_root, true).await + } + + async fn list_prefixes( + &self, + prefix: Option, + ) -> anyhow::Result> { + let path = match prefix { + Some(prefix) => Cow::Owned(self.storage_root.join(prefix)), + None => Cow::Borrowed(&self.storage_root), + }; + get_all_files(path.as_ref(), false).await } async fn upload( @@ -299,6 +317,7 @@ fn storage_metadata_path(original_path: &Path) -> PathBuf { fn get_all_files<'a, P>( directory_path: P, + recursive: bool, ) -> Pin>> + Send + Sync + 'a>> where P: AsRef + Send + Sync + 'a, @@ -315,7 +334,11 @@ where if file_type.is_symlink() { debug!("{:?} us a symlink, skipping", entry_path) } else if file_type.is_dir() { - paths.extend(get_all_files(entry_path).await?.into_iter()) + if recursive { + paths.extend(get_all_files(entry_path, true).await?.into_iter()) + } else { + paths.push(dir_entry.path()) + } } else { paths.push(dir_entry.path()); } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 5269d63d09..3b413e30ce 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -19,7 +19,9 @@ use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; use tracing::debug; -use crate::{strip_path_prefix, Download, DownloadError, RemoteStorage, S3Config}; +use crate::{ + strip_path_prefix, Download, DownloadError, RemoteObjectName, RemoteStorage, S3Config, +}; use super::StorageMetadata; @@ -117,6 +119,24 @@ impl S3ObjectKey { } } +impl RemoteObjectName for S3ObjectKey { + /// Turn a/b/c or a/b/c/ into c + fn object_name(&self) -> Option<&str> { + // corner case + if &self.0 == "/" { + return None; + } + + if self.0.ends_with(S3_PREFIX_SEPARATOR) { + self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1) + } else { + self.0 + .rsplit_once(S3_PREFIX_SEPARATOR) + .map(|(_, last)| last) + } + } +} + /// AWS S3 
storage. pub struct S3Bucket { workdir: PathBuf, @@ -283,6 +303,77 @@ impl RemoteStorage for S3Bucket { Ok(document_keys) } + /// Note: it wont include empty "directories" + async fn list_prefixes( + &self, + prefix: Option, + ) -> anyhow::Result> { + let list_prefix = match prefix { + Some(prefix) => { + let mut prefix_in_bucket = self.prefix_in_bucket.clone().unwrap_or_default(); + // if there is no trailing / in default prefix and + // supplied prefix does not start with "/" insert it + if !(prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR) + || prefix.0.starts_with(S3_PREFIX_SEPARATOR)) + { + prefix_in_bucket.push(S3_PREFIX_SEPARATOR); + } + + prefix_in_bucket.push_str(&prefix.0); + // required to end with a separator + // otherwise request will return only the entry of a prefix + if !prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR) { + prefix_in_bucket.push(S3_PREFIX_SEPARATOR); + } + Some(prefix_in_bucket) + } + None => self.prefix_in_bucket.clone(), + }; + + let mut document_keys = Vec::new(); + + let mut continuation_token = None; + loop { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 list")?; + + metrics::inc_list_objects(); + + let fetch_response = self + .client + .list_objects_v2(ListObjectsV2Request { + bucket: self.bucket_name.clone(), + prefix: list_prefix.clone(), + continuation_token, + delimiter: Some(S3_PREFIX_SEPARATOR.to_string()), + ..ListObjectsV2Request::default() + }) + .await + .map_err(|e| { + metrics::inc_list_objects_fail(); + e + })?; + + document_keys.extend( + fetch_response + .common_prefixes + .unwrap_or_default() + .into_iter() + .filter_map(|o| Some(S3ObjectKey(o.prefix?))), + ); + + match fetch_response.continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok(document_keys) + } + async fn upload( &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, @@ -378,6 +469,25 @@ mod tests { use super::*; + #[test] + fn object_name() { + let k = S3ObjectKey("a/b/c".to_owned()); + assert_eq!(k.object_name(), Some("c")); + + let k = S3ObjectKey("a/b/c/".to_owned()); + assert_eq!(k.object_name(), Some("c")); + + let k = S3ObjectKey("a/".to_owned()); + assert_eq!(k.object_name(), Some("a")); + + // XXX is it impossible to have an empty key? 
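// Editor's note (illustrative, not part of the patch): the trailing-slash
// cases above are the common ones in practice, because `list_prefixes` sends
// `delimiter: "/"` to ListObjectsV2 and S3 answers with `common_prefixes`
// that end with the separator. For a hypothetical bucket containing
//     tenants/<tenant_id>/timelines/<timeline_a>/metadata
//     tenants/<tenant_id>/timelines/<timeline_b>/<layer file>
// listing with prefix "tenants/<tenant_id>/timelines/" yields only
//     tenants/<tenant_id>/timelines/<timeline_a>/
//     tenants/<tenant_id>/timelines/<timeline_b>/
// which `object_name()` reduces to "<timeline_a>" and "<timeline_b>".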
+ let k = S3ObjectKey("".to_owned()); + assert_eq!(k.object_name(), None); + + let k = S3ObjectKey("/".to_owned()); + assert_eq!(k.object_name(), None); + } + #[test] fn download_destination() -> anyhow::Result<()> { let workdir = tempdir()?.path().to_owned(); diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 55f7b3c5a7..ebbb0d5ced 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -170,7 +170,6 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - /v1/tenant/{tenant_id}/timeline/{timeline_id}/attach: parameters: - name: tenant_id @@ -186,12 +185,27 @@ paths: type: string format: hex post: - description: Attach remote timeline + description: Deprecated responses: - "200": - description: Timeline attaching scheduled + "410": + description: GONE + + + /v1/tenant/{tenant_id}/attach: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Deprecated + responses: + "202": + description: Tenant attaching scheduled "400": - description: Error when no tenant id found in path or no timeline id + description: Error when no tenant id found in path parameters content: application/json: schema: @@ -215,7 +229,7 @@ paths: schema: $ref: "#/components/schemas/NotFoundError" "409": - description: Timeline download is already in progress + description: Tenant download is already in progress content: application/json: schema: @@ -227,7 +241,6 @@ paths: schema: $ref: "#/components/schemas/Error" - /v1/tenant/{tenant_id}/timeline/{timeline_id}/detach: parameters: - name: tenant_id @@ -243,12 +256,26 @@ paths: type: string format: hex post: - description: Detach local timeline + description: Deprecated + responses: + "410": + description: GONE + + /v1/tenant/{tenant_id}/detach: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Detach local tenant responses: "200": - description: Timeline detached + description: Tenant detached "400": - description: Error when no tenant id found in path or no timeline id + description: Error when no tenant id found in path parameters content: application/json: schema: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a1198051a8..41c78210f4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -209,9 +209,9 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { +async fn timeline_attach_handler(_: Request) -> Result, ApiError> { + json_response(StatusCode::GONE, ()) +} + +// TODO makes sense to provide tenant config right away the same way as it handled in tenant_create +async fn tenant_attach_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; - info!( - "Handling timeline {} attach for tenant: {}", - timeline_id, tenant_id, - ); + info!("Handling tenant attach {}", tenant_id,); tokio::task::spawn_blocking(move || { - if tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).is_ok() { - // TODO: maybe answer with 309 Not Modified here? 
- anyhow::bail!("Timeline is already present locally") + if tenant_mgr::get_tenant_state(tenant_id).is_some() { + anyhow::bail!("Tenant is already present locally") }; Ok(()) }) .await .map_err(ApiError::from_err)??; - let sync_id = ZTenantTimelineId { - tenant_id, - timeline_id, - }; let state = get_state(&request); let remote_index = &state.remote_index; let mut index_accessor = remote_index.write().await; - if let Some(remote_timeline) = index_accessor.timeline_entry_mut(&sync_id) { - if remote_timeline.awaits_download { + if let Some(tenant_entry) = index_accessor.tenant_entry_mut(&tenant_id) { + if tenant_entry.has_in_progress_downloads() { return Err(ApiError::Conflict( - "Timeline download is already in progress".to_string(), + "Tenant download is already in progress".to_string(), )); } - remote_timeline.awaits_download = true; - storage_sync::schedule_layer_download(tenant_id, timeline_id); - return json_response(StatusCode::ACCEPTED, ()); - } else { - // no timeline in the index, release the lock to make the potentially lengthy download opetation - drop(index_accessor); - } - - let new_timeline = match try_download_index_part_data(state, sync_id).await { - Ok(Some(mut new_timeline)) => { - tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id)) - .await - .context("Failed to create new timeline directory")?; - new_timeline.awaits_download = true; - new_timeline + for (timeline_id, remote_timeline) in tenant_entry.iter_mut() { + storage_sync::schedule_layer_download(tenant_id, *timeline_id); + remote_timeline.awaits_download = true; } - Ok(None) => return Err(ApiError::NotFound("Unknown remote timeline".to_string())), + return json_response(StatusCode::ACCEPTED, ()); + } + // no tenant in the index, release the lock to make the potentially lengthy download opetation + drop(index_accessor); + + // download index parts for every tenant timeline + let remote_timelines = match try_download_tenant_index(state, tenant_id).await { + Ok(Some(remote_timelines)) => remote_timelines, + Ok(None) => return Err(ApiError::NotFound("Unknown remote tenant".to_string())), Err(e) => { - error!("Failed to retrieve remote timeline data: {:?}", e); + error!("Failed to retrieve remote tenant data: {:?}", e); return Err(ApiError::NotFound( - "Failed to retrieve remote timeline".to_string(), + "Failed to retrieve remote tenant".to_string(), )); } }; + // recheck that download is not in progress because + // we've released the lock to avoid holding it during the download let mut index_accessor = remote_index.write().await; - match index_accessor.timeline_entry_mut(&sync_id) { - Some(remote_timeline) => { - if remote_timeline.awaits_download { + let tenant_entry = match index_accessor.tenant_entry_mut(&tenant_id) { + Some(tenant_entry) => { + if tenant_entry.has_in_progress_downloads() { return Err(ApiError::Conflict( - "Timeline download is already in progress".to_string(), + "Tenant download is already in progress".to_string(), )); } - remote_timeline.awaits_download = true; + tenant_entry } - None => index_accessor.add_timeline_entry(sync_id, new_timeline), + None => index_accessor.add_tenant_entry(tenant_id), + }; + + // populate remote index with the data from index part and create directories on the local filesystem + for (timeline_id, mut remote_timeline) in remote_timelines { + tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id)) + .await + .context("Failed to create new timeline directory")?; + + remote_timeline.awaits_download = true; + 
tenant_entry.insert(timeline_id, remote_timeline); + // schedule actual download + storage_sync::schedule_layer_download(tenant_id, timeline_id); } - storage_sync::schedule_layer_download(tenant_id, timeline_id); + json_response(StatusCode::ACCEPTED, ()) } -async fn try_download_index_part_data( +async fn try_download_tenant_index( state: &State, - sync_id: ZTenantTimelineId, -) -> anyhow::Result> { - let index_part = match state.remote_storage.as_ref() { + tenant_id: ZTenantId, +) -> anyhow::Result>> { + let index_parts = match state.remote_storage.as_ref() { Some(GenericRemoteStorage::Local(local_storage)) => { - storage_sync::download_index_part(state.conf, local_storage, sync_id).await + storage_sync::download_tenant_index_parts(state.conf, local_storage, tenant_id).await } + // FIXME here s3 storage contains its own limits, that are separate from sync storage thread ones + // because it is a different instance. We can move this limit to some global static + // or use one instance everywhere. Some(GenericRemoteStorage::S3(s3_storage)) => { - storage_sync::download_index_part(state.conf, s3_storage, sync_id).await + storage_sync::download_tenant_index_parts(state.conf, s3_storage, tenant_id).await } None => return Ok(None), } - .with_context(|| format!("Failed to download index part for timeline {sync_id}"))?; + .with_context(|| format!("Failed to download index parts for tenant {tenant_id}"))?; - let timeline_path = state - .conf - .timeline_path(&sync_id.timeline_id, &sync_id.tenant_id); - RemoteTimeline::from_index_part(&timeline_path, index_part) - .map(Some) - .with_context(|| { - format!("Failed to convert index part into remote timeline for timeline {sync_id}") - }) + let mut remote_timelines = Vec::with_capacity(index_parts.len()); + for (timeline_id, index_part) in index_parts { + let timeline_path = state.conf.timeline_path(&timeline_id, &tenant_id); + let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part) + .with_context(|| { + format!("Failed to convert index part into remote timeline for timeline {tenant_id}/{timeline_id}") + })?; + remote_timelines.push((timeline_id, remote_timeline)); + } + Ok(Some(remote_timelines)) } -async fn timeline_detach_handler(request: Request) -> Result, ApiError> { +async fn timeline_detach_handler(_: Request) -> Result, ApiError> { + json_response(StatusCode::GONE, ()) +} + +async fn tenant_detach_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; - tokio::task::spawn_blocking(move || { - let _enter = - info_span!("timeline_detach_handler", tenant = %tenant_id, timeline = %timeline_id) - .entered(); + let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered(); let state = get_state(&request); - tenant_mgr::detach_timeline(state.conf, tenant_id, timeline_id) + tenant_mgr::detach_tenant(state.conf, tenant_id) }) .await .map_err(ApiError::from_err)??; @@ -523,6 +534,8 @@ pub fn make_router( .put("/v1/tenant/config", tenant_config_handler) .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) + .post("/v1/tenant/:tenant_id/attach", tenant_attach_handler) + .post("/v1/tenant/:tenant_id/detach", tenant_detach_handler) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler, diff --git 
a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index db5b77a4d9..2369f46c4f 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -331,19 +331,19 @@ impl Repository for LayeredRepository { /// metrics collection. fn gc_iteration( &self, - target_timelineid: Option, + target_timeline_id: Option, horizon: u64, pitr: Duration, checkpoint_before_gc: bool, ) -> Result { - let timeline_str = target_timelineid + let timeline_str = target_timeline_id .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); STORAGE_TIME .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) .observe_closure_duration(|| { - self.gc_iteration_internal(target_timelineid, horizon, pitr, checkpoint_before_gc) + self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc) }) } @@ -410,28 +410,6 @@ impl Repository for LayeredRepository { Ok(()) } - fn detach_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> { - let mut timelines = self.timelines.lock().unwrap(); - // check no child timelines, because detach will remove files, which will brake child branches - // FIXME this can still be violated because we do not guarantee - // that all ancestors are downloaded/attached to the same pageserver - let num_children = timelines - .iter() - .filter(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id)) - .count(); - - ensure!( - num_children == 0, - "Cannot detach timeline which has child timelines" - ); - - ensure!( - timelines.remove(&timeline_id).is_some(), - "Cannot detach timeline {timeline_id} that is not available locally" - ); - Ok(()) - } - fn apply_timeline_remote_sync_status_update( &self, timeline_id: ZTimelineId, @@ -839,13 +817,13 @@ impl LayeredRepository { // we do. fn gc_iteration_internal( &self, - target_timelineid: Option, + target_timeline_id: Option, horizon: u64, pitr: Duration, checkpoint_before_gc: bool, ) -> Result { let _span_guard = - info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timelineid) + info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timeline_id) .entered(); let mut totals: GcResult = Default::default(); let now = Instant::now(); @@ -859,6 +837,12 @@ impl LayeredRepository { let mut timeline_ids = Vec::new(); let mut timelines = self.timelines.lock().unwrap(); + if let Some(target_timeline_id) = target_timeline_id.as_ref() { + if timelines.get(target_timeline_id).is_none() { + bail!("gc target timeline does not exist") + } + }; + for (timeline_id, timeline_entry) in timelines.iter() { timeline_ids.push(*timeline_id); @@ -867,7 +851,7 @@ impl LayeredRepository { // Somewhat related: https://github.com/zenithdb/zenith/issues/999 if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() { // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timelineid) = target_timelineid { + if let Some(timelineid) = target_timeline_id { if ancestor_timeline_id == &timelineid { all_branchpoints .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); @@ -882,7 +866,7 @@ impl LayeredRepository { // Ok, we now know all the branch points. // Perform GC for each timeline. - for timelineid in timeline_ids.into_iter() { + for timeline_id in timeline_ids.into_iter() { if thread_mgr::is_shutdown_requested() { // We were requested to shut down. Stop and return with the progress we // made. 
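// Editor's note (illustrative, not part of the patch): a concrete reading of
// the branchpoint bookkeeping above. With a hypothetical tree
//     main ──┬── child_a   (ancestor_lsn 0/169C3D0)
//            └── child_b   (ancestor_lsn 0/19B2D80)
// a GC pass targeting `main` collects (main, 0/169C3D0) and (main, 0/19B2D80)
// into `all_branchpoints`, so data needed at those LSNs is kept even when it
// falls behind the horizon/PITR cutoff. The new early bail
// ("gc target timeline does not exist") is also what the detach smoke test
// later in this patch relies on to observe that a detached tenant's timelines
// are really gone.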
@@ -891,12 +875,12 @@ impl LayeredRepository { // Timeline is known to be local and loaded. let timeline = self - .get_timeline_load_internal(timelineid, &mut *timelines)? + .get_timeline_load_internal(timeline_id, &mut *timelines)? .expect("checked above that timeline is local and loaded"); // If target_timeline is specified, only GC it - if let Some(target_timelineid) = target_timelineid { - if timelineid != target_timelineid { + if let Some(target_timelineid) = target_timeline_id { + if timeline_id != target_timelineid { continue; } } @@ -905,8 +889,8 @@ impl LayeredRepository { drop(timelines); let branchpoints: Vec = all_branchpoints .range(( - Included((timelineid, Lsn(0))), - Included((timelineid, Lsn(u64::MAX))), + Included((timeline_id, Lsn(0))), + Included((timeline_id, Lsn(u64::MAX))), )) .map(|&x| x.1) .collect(); @@ -916,7 +900,7 @@ impl LayeredRepository { // used in tests, so we want as deterministic results as possible. if checkpoint_before_gc { timeline.checkpoint(CheckpointConfig::Forced)?; - info!("timeline {} checkpoint_before_gc done", timelineid); + info!("timeline {} checkpoint_before_gc done", timeline_id); } timeline.update_gc_info(branchpoints, cutoff, pitr); let result = timeline.gc()?; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 9501a416b4..f9ea4a6ff8 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -260,9 +260,6 @@ pub trait Repository: Send + Sync { /// api's 'compact' command. fn compaction_iteration(&self) -> Result<()>; - /// detaches timeline-related in-memory data. - fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>; - // Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn. fn get_remote_index(&self) -> &RemoteIndex; } @@ -537,7 +534,7 @@ pub mod repo_harness { TenantConfOpt::from(self.tenant_conf), walredo_mgr, self.tenant_id, - RemoteIndex::empty(), + RemoteIndex::default(), false, ); // populate repo with locally available timelines diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 5fe2cde3b7..c52da95945 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -192,6 +192,8 @@ use metrics::{ use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; pub use self::download::download_index_part; +pub use self::download::download_tenant_index_parts; +pub use self::download::try_download_index_parts; pub use self::download::TEMP_DOWNLOAD_EXTENSION; lazy_static! 
{ @@ -301,7 +303,7 @@ pub fn start_local_timeline_sync( } Ok(SyncStartupData { local_timeline_init_statuses, - remote_index: RemoteIndex::empty(), + remote_index: RemoteIndex::default(), }) } } @@ -835,7 +837,7 @@ where .build() .context("Failed to create storage sync runtime")?; - let applicable_index_parts = runtime.block_on(try_fetch_index_parts( + let applicable_index_parts = runtime.block_on(try_download_index_parts( conf, &storage, local_timeline_files.keys().copied().collect(), @@ -918,16 +920,59 @@ fn storage_sync_loop( }); match loop_step { - ControlFlow::Continue(new_timeline_states) => { - if new_timeline_states.is_empty() { - debug!("Sync loop step completed, no new timeline states"); + ControlFlow::Continue(updated_tenants) => { + if updated_tenants.is_empty() { + debug!("Sync loop step completed, no new tenant states"); } else { info!( - "Sync loop step completed, {} new timeline state update(s)", - new_timeline_states.len() + "Sync loop step completed, {} new tenant state update(s)", + updated_tenants.len() ); - // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. - apply_timeline_sync_status_updates(conf, &index, new_timeline_states); + let index_accessor = runtime.block_on(index.write()); + for tenant_id in updated_tenants { + let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { + Some(tenant_entry) => tenant_entry, + None => { + error!( + "cannot find tenant in remote index for timeline sync update" + ); + continue; + } + }; + + if tenant_entry.has_in_progress_downloads() { + info!("Tenant {tenant_id} has pending timeline downloads, skipping repository registration"); + continue; + } else { + info!( + "Tenant {tenant_id} download completed. Registering in repository" + ); + // Here we assume that if tenant has no in-progress downloads that + // means that it is the last completed timeline download that triggered + // sync status update. So we look at the index for available timelines + // and register them all at once in a repository for download + // to be submitted in a single operation to repository + // so it can apply them at once to internal timeline map. + let sync_status_updates: HashMap< + ZTimelineId, + TimelineSyncStatusUpdate, + > = tenant_entry + .keys() + .copied() + .map(|timeline_id| { + (timeline_id, TimelineSyncStatusUpdate::Downloaded) + }) + .collect(); + + // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. 
+ apply_timeline_sync_status_updates( + conf, + &index, + tenant_id, + sync_status_updates, + ); + } + } } } ControlFlow::Break(()) => { @@ -945,7 +990,7 @@ async fn process_batches( index: &RemoteIndex, batched_tasks: HashMap, sync_queue: &SyncQueue, -) -> HashMap> +) -> HashSet where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, @@ -970,18 +1015,13 @@ where }) .collect::>(); - let mut new_timeline_states: HashMap< - ZTenantId, - HashMap, - > = HashMap::new(); + let mut new_timeline_states = HashSet::new(); + // we purposely ignore actual state update, because we're waiting for last timeline download to happen while let Some((sync_id, state_update)) = sync_results.next().await { debug!("Finished storage sync task for sync id {sync_id}"); - if let Some(state_update) = state_update { - new_timeline_states - .entry(sync_id.tenant_id) - .or_default() - .insert(sync_id.timeline_id, state_update); + if state_update.is_some() { + new_timeline_states.insert(sync_id.tenant_id); } } @@ -1458,35 +1498,6 @@ async fn validate_task_retries( ControlFlow::Continue(sync_data) } -async fn try_fetch_index_parts( - conf: &'static PageServerConf, - storage: &S, - keys: HashSet, -) -> HashMap -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ - let mut index_parts = HashMap::with_capacity(keys.len()); - - let mut part_downloads = keys - .into_iter() - .map(|id| async move { (id, download_index_part(conf, storage, id).await) }) - .collect::>(); - - while let Some((id, part_upload_result)) = part_downloads.next().await { - match part_upload_result { - Ok(index_part) => { - debug!("Successfully fetched index part for {id}"); - index_parts.insert(id, index_part); - } - Err(e) => warn!("Failed to fetch index part for {id}: {e}"), - } - } - - index_parts -} - fn schedule_first_sync_tasks( index: &mut RemoteTimelineIndex, sync_queue: &SyncQueue, diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index b51826fa1e..8cb9906e33 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -1,10 +1,14 @@ //! Timeline synchronization logic to fetch the layer files from remote storage into pageserver's local directory. -use std::{collections::HashSet, fmt::Debug, path::Path}; +use std::{ + collections::{HashMap, HashSet}, + fmt::Debug, + path::Path, +}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use remote_storage::{path_with_suffix_extension, RemoteStorage}; +use remote_storage::{path_with_suffix_extension, RemoteObjectName, RemoteStorage}; use tokio::{ fs, io::{self, AsyncWriteExt}, @@ -14,7 +18,7 @@ use tracing::{debug, error, info, warn}; use crate::{ config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, }; -use utils::zid::ZTenantTimelineId; +use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; use super::{ index::{IndexPart, RemoteTimeline}, @@ -23,6 +27,105 @@ use super::{ pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; +/// FIXME: Needs cleanup. Currently it swallows errors. Here we need to ensure that +/// we successfully downloaded all metadata parts for one tenant. +/// And successful includes absence of index_part in the remote. Because it is valid situation +/// when timeline was just created and pageserver restarted before upload of index part was completed. 
+/// But currently RemoteStorage interface does not provide this knowledge because it uses +/// anyhow::Error as an error type. So this needs a refactoring. +/// +/// In other words we need to yield only complete sets of tenant timelines. +/// Failure for one timeline of a tenant should exclude whole tenant from returned hashmap. +/// So there are two requirements: keep everything in one futures unordered +/// to allow higher concurrency. Mark tenants as failed independently. +/// That requires some bookeeping. +pub async fn try_download_index_parts( + conf: &'static PageServerConf, + storage: &S, + keys: HashSet, +) -> HashMap> +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let mut index_parts: HashMap> = HashMap::new(); + + let mut part_downloads = keys + .into_iter() + .map(|id| async move { (id, download_index_part(conf, storage, id).await) }) + .collect::>(); + + while let Some((id, part_upload_result)) = part_downloads.next().await { + match part_upload_result { + Ok(index_part) => { + debug!("Successfully fetched index part for {id}"); + index_parts + .entry(id.tenant_id) + .or_default() + .insert(id.timeline_id, index_part); + } + Err(e) => error!("Failed to fetch index part for {id}: {e}"), + } + } + + index_parts +} + +pub async fn download_tenant_index_parts( + conf: &'static PageServerConf, + storage: &S, + tenant_id: ZTenantId, +) -> anyhow::Result> +where + P: RemoteObjectName + Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let tenant_path = conf.timelines_path(&tenant_id); + let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| { + format!( + "Failed to get tenant storage path for local path '{}'", + tenant_path.display() + ) + })?; + let timelines = storage + .list_prefixes(Some(tenant_storage_path)) + .await + .with_context(|| { + format!( + "Failed to list tenant storage path to get remote timelines to download: {}", + tenant_id + ) + })?; + + let mut sync_ids = HashSet::new(); + + for timeline_remote_storage_key in timelines { + let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { + anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") + })?; + + let timeline_id: ZTimelineId = object_name + .parse() + .with_context(|| { + format!("failed to parse object name into timeline id for tenant {tenant_id} '{object_name}'") + })?; + + sync_ids.insert(ZTenantTimelineId { + tenant_id, + timeline_id, + }); + } + + let index_parts = try_download_index_parts(conf, storage, sync_ids) + .await + .remove(&tenant_id) + .ok_or(anyhow::anyhow!( + "Missing tenant index parts. This is a bug." + ))?; + + Ok(index_parts) +} + /// Retrieves index data from the remote storage for a given timeline. pub async fn download_index_part( conf: &'static PageServerConf, diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 2ba48ddf53..8bc9f6f189 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -2,6 +2,7 @@ //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about //! remote timeline layers and its metadata. 
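// Editor's note: illustrative sketch, not part of this patch. The index below
// is now nested per tenant (entries: HashMap<ZTenantId, TenantEntry>, where
// TenantEntry wraps HashMap<ZTimelineId, RemoteTimeline>), so the questions
// attach/detach need to ask become single lookups. A hypothetical caller:
fn tenant_ready_for_registration(index: &RemoteTimelineIndex, tenant_id: ZTenantId) -> bool {
    index
        .tenant_entry(&tenant_id)
        .map(|tenant| !tenant.has_in_progress_downloads())
        .unwrap_or(false)
}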
+use std::ops::{Deref, DerefMut}; use std::{ collections::{HashMap, HashSet}, path::{Path, PathBuf}, @@ -14,7 +15,10 @@ use serde_with::{serde_as, DisplayFromStr}; use tokio::sync::RwLock; use crate::{config::PageServerConf, layered_repository::metadata::TimelineMetadata}; -use utils::{lsn::Lsn, zid::ZTenantTimelineId}; +use utils::{ + lsn::Lsn, + zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, +}; /// A part of the filesystem path, that needs a root to become a path again. #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] @@ -41,38 +45,68 @@ impl RelativePath { } } +#[derive(Debug, Clone, Default)] +pub struct TenantEntry(HashMap); + +impl TenantEntry { + pub fn has_in_progress_downloads(&self) -> bool { + self.values() + .any(|remote_timeline| remote_timeline.awaits_download) + } +} + +impl Deref for TenantEntry { + type Target = HashMap; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for TenantEntry { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl From> for TenantEntry { + fn from(inner: HashMap) -> Self { + Self(inner) + } +} + /// An index to track tenant files that exist on the remote storage. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct RemoteTimelineIndex { - timeline_entries: HashMap, + entries: HashMap, } /// A wrapper to synchronize the access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`]. +#[derive(Default)] pub struct RemoteIndex(Arc>); impl RemoteIndex { - pub fn empty() -> Self { - Self(Arc::new(RwLock::new(RemoteTimelineIndex { - timeline_entries: HashMap::new(), - }))) - } - pub fn from_parts( conf: &'static PageServerConf, - index_parts: HashMap, + index_parts: HashMap>, ) -> anyhow::Result { - let mut timeline_entries = HashMap::new(); + let mut entries: HashMap = HashMap::new(); - for (sync_id, index_part) in index_parts { - let timeline_path = conf.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id); - let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part) - .context("Failed to restore remote timeline data from index part")?; - timeline_entries.insert(sync_id, remote_timeline); + for (tenant_id, timelines) in index_parts { + for (timeline_id, index_part) in timelines { + let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); + let remote_timeline = + RemoteTimeline::from_index_part(&timeline_path, index_part) + .context("Failed to restore remote timeline data from index part")?; + + entries + .entry(tenant_id) + .or_default() + .insert(timeline_id, remote_timeline); + } } - Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex { - timeline_entries, - })))) + Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex { entries })))) } pub async fn read(&self) -> tokio::sync::RwLockReadGuard<'_, RemoteTimelineIndex> { @@ -91,20 +125,50 @@ impl Clone for RemoteIndex { } impl RemoteTimelineIndex { - pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&RemoteTimeline> { - self.timeline_entries.get(id) + pub fn timeline_entry( + &self, + ZTenantTimelineId { + tenant_id, + timeline_id, + }: &ZTenantTimelineId, + ) -> Option<&RemoteTimeline> { + self.entries.get(tenant_id)?.get(timeline_id) } - pub fn timeline_entry_mut(&mut self, id: &ZTenantTimelineId) -> Option<&mut RemoteTimeline> { - self.timeline_entries.get_mut(id) + pub fn timeline_entry_mut( + &mut self, + ZTenantTimelineId { + tenant_id, + timeline_id, + }: &ZTenantTimelineId, + ) -> Option<&mut RemoteTimeline> { + 
self.entries.get_mut(tenant_id)?.get_mut(timeline_id) } - pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: RemoteTimeline) { - self.timeline_entries.insert(id, entry); + pub fn add_timeline_entry( + &mut self, + ZTenantTimelineId { + tenant_id, + timeline_id, + }: ZTenantTimelineId, + entry: RemoteTimeline, + ) { + self.entries + .entry(tenant_id) + .or_default() + .insert(timeline_id, entry); } - pub fn all_sync_ids(&self) -> impl Iterator + '_ { - self.timeline_entries.keys().copied() + pub fn tenant_entry(&self, tenant_id: &ZTenantId) -> Option<&TenantEntry> { + self.entries.get(tenant_id) + } + + pub fn tenant_entry_mut(&mut self, tenant_id: &ZTenantId) -> Option<&mut TenantEntry> { + self.entries.get_mut(tenant_id) + } + + pub fn add_tenant_entry(&mut self, tenant_id: ZTenantId) -> &mut TenantEntry { + self.entries.entry(tenant_id).or_default() } pub fn set_awaits_download( diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index c73fed140a..c96dc6973b 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -165,14 +165,14 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result), Attach(ZTenantTimelineId, Arc), } impl std::fmt::Debug for LocalTimelineUpdate { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::Detach(ttid) => f.debug_tuple("Remove").field(ttid).finish(), + Self::Detach(ttid, _) => f.debug_tuple("Remove").field(ttid).finish(), Self::Attach(ttid, _) => f.debug_tuple("Add").field(ttid).finish(), } } @@ -182,32 +182,31 @@ impl std::fmt::Debug for LocalTimelineUpdate { pub fn apply_timeline_sync_status_updates( conf: &'static PageServerConf, remote_index: &RemoteIndex, - sync_status_updates: HashMap>, + tenant_id: ZTenantId, + sync_status_updates: HashMap, ) { if sync_status_updates.is_empty() { debug!("no sync status updates to apply"); return; } info!( - "Applying sync status updates for {} timelines", + "Applying sync status updates for tenant {tenant_id} {} timelines", sync_status_updates.len() ); debug!("Sync status updates: {sync_status_updates:?}"); - for (tenant_id, status_updates) in sync_status_updates { - let repo = match load_local_repo(conf, tenant_id, remote_index) { - Ok(repo) => repo, - Err(e) => { - error!("Failed to load repo for tenant {tenant_id} Error: {e:?}",); - continue; - } - }; - match apply_timeline_remote_sync_status_updates(&repo, status_updates) { - Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"), - Err(e) => error!( - "Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}" - ), + let repo = match load_local_repo(conf, tenant_id, remote_index) { + Ok(repo) => repo, + Err(e) => { + error!("Failed to load repo for tenant {tenant_id} Error: {e:?}"); + return; } + }; + match apply_timeline_remote_sync_status_updates(&repo, sync_status_updates) { + Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"), + Err(e) => error!( + "Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}" + ), } } @@ -387,29 +386,49 @@ pub fn get_local_timeline_with_load( } } -pub fn detach_timeline( - conf: &'static PageServerConf, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, -) -> anyhow::Result<()> { - // shutdown the timeline threads (this shuts down the walreceiver) - thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id)); +pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> 
anyhow::Result<()> { + set_tenant_state(tenant_id, TenantState::Stopping)?; + // shutdown the tenant and timeline threads: gc, compaction, page service threads) + thread_mgr::shutdown_threads(None, Some(tenant_id), None); - match tenants_state::write_tenants().get_mut(&tenant_id) { - Some(tenant) => { - tenant - .repo - .detach_timeline(timeline_id) - .context("Failed to detach inmem tenant timeline")?; - tenant.local_timelines.remove(&timeline_id); + // FIXME should we protect somehow from starting new threads/walreceivers when tenant is in stopping state? + // send stop signal to wal receiver and collect join handles while holding the lock + let walreceiver_join_handles = { + let tenants = tenants_state::write_tenants(); + let tenant = tenants.get(&tenant_id).context("tenant not found")?; + let mut walreceiver_join_handles = Vec::with_capacity(tenant.local_timelines.len()); + for timeline_id in tenant.local_timelines.keys() { + let (sender, receiver) = std::sync::mpsc::channel::<()>(); tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach( - ZTenantTimelineId::new(tenant_id, timeline_id), + ZTenantTimelineId::new(tenant_id, *timeline_id), + sender, )); + walreceiver_join_handles.push((*timeline_id, receiver)); } - None => bail!("Tenant {tenant_id} not found in local tenant state"), + // drop the tenants lock + walreceiver_join_handles + }; + + // wait for wal receivers to stop without holding the lock, because walreceiver + // will attempt to change tenant state which is protected by the same global tenants lock. + // TODO do we need a timeout here? how to handle it? + // recv_timeout is broken: https://github.com/rust-lang/rust/issues/94518#issuecomment-1057440631 + // need to use crossbeam-channel + for (timeline_id, join_handle) in walreceiver_join_handles { + info!("waiting for wal receiver to shutdown timeline_id {timeline_id}"); + join_handle.recv().context("failed to join walreceiver")?; + info!("wal receiver shutdown confirmed timeline_id {timeline_id}"); } - let local_timeline_directory = conf.timeline_path(&timeline_id, &tenant_id); + tenants_state::write_tenants().remove(&tenant_id); + + // If removal fails there will be no way to successfully retry detach, + // because tenant no longer exists in in memory map. And it needs to be removed from it + // before we remove files because it contains references to repository + // which references ephemeral files which are deleted on drop. So if we keep these references + // code will attempt to remove files which no longer exist. This can be fixed by having shutdown + // mechanism for repository that will clean temporary data to avoid any references to ephemeral files + let local_timeline_directory = conf.tenant_path(&tenant_id); std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { format!( "Failed to remove local timeline directory '{}'", diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index a3939661c1..e0e79e4166 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -202,7 +202,7 @@ pub fn create_repo( // anymore, but I think that could still happen. 
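// Editor's note (illustrative, not part of the patch), referring back to the
// walreceiver shutdown handshake in `detach_tenant` above: if the TODO about
// a timeout is picked up later, crossbeam-channel (whose `recv_timeout` does
// not suffer from the std issue linked above) would allow something like:
//
//     let (tx, rx) = crossbeam_channel::bounded::<()>(1);
//     // `tx` travels to the walreceiver inside LocalTimelineUpdate::Detach
//     match rx.recv_timeout(std::time::Duration::from_secs(10)) {
//         Ok(()) => info!("wal receiver shutdown confirmed for {timeline_id}"),
//         Err(_) => warn!("timed out waiting for walreceiver shutdown of {timeline_id}"),
//     }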
let wal_redo_manager = Arc::new(crate::walredo::DummyRedoManager {}); - (wal_redo_manager as _, RemoteIndex::empty()) + (wal_redo_manager as _, RemoteIndex::default()) } }; diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 9f0f911e0c..b70350e0da 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -264,7 +264,7 @@ async fn wal_receiver_main_thread_loop_step<'a>( info!("Processing timeline update: {update:?}"); match update { // Timeline got detached, stop all related tasks and remove public timeline data. - LocalTimelineUpdate::Detach(id) => { + LocalTimelineUpdate::Detach(id, join_sender) => { match local_timeline_wal_receivers.get_mut(&id.tenant_id) { Some(wal_receivers) => { if let hash_map::Entry::Occupied(o) = wal_receivers.entry(id.timeline_id) { @@ -280,6 +280,11 @@ async fn wal_receiver_main_thread_loop_step<'a>( }; { WAL_RECEIVER_ENTRIES.write().await.remove(&id); + if let Err(e) = join_sender.send(()) { + warn!("cannot send wal_receiver shutdown confirmation {e}") + } else { + info!("confirm walreceiver shutdown for {id}"); + } } } // Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly. diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index 20e63b4e5c..3e7ba22184 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -105,16 +105,3 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): branch2_cur.execute('SELECT count(*) FROM foo') assert branch2_cur.fetchone() == (300000, ) - - -def test_ancestor_branch_detach(neon_simple_env: NeonEnv): - env = neon_simple_env - - parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_detach_parent", "empty") - - env.neon_cli.create_branch("test_ancestor_branch_detach_branch1", - "test_ancestor_branch_detach_parent") - - ps_http = env.pageserver.http_client() - with pytest.raises(NeonPageserverApiException, match="Failed to detach inmem tenant timeline"): - ps_http.timeline_detach(env.initial_tenant, parent_timeline_id) diff --git a/test_runner/batch_others/test_detach.py b/test_runner/batch_others/test_detach.py new file mode 100644 index 0000000000..105facb656 --- /dev/null +++ b/test_runner/batch_others/test_detach.py @@ -0,0 +1,49 @@ +from threading import Thread +from uuid import uuid4 +import psycopg2 +import pytest + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +def test_detach_smoke(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant() + pg = env.postgres.create_start('main', tenant_id=tenant_id) + # we rely upon autocommit after each statement + pg.safe_psql_many(queries=[ + 'CREATE TABLE t(key int primary key, value text)', + 'INSERT INTO t SELECT generate_series(1,100000), \'payload\'', + ]) + + # gc should try to even start + with pytest.raises(expected_exception=psycopg2.DatabaseError, + match='gc target timeline does not exist'): + env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {uuid4().hex} 0') + + gc_thread = Thread( + target=lambda: env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0'), ) + gc_thread.start() + + last_error = None + for i in range(3): + try: + pageserver_http.tenant_detach(tenant_id) + except Exception as e: + last_error = e + log.error(f"try {i} error 
detaching tenant: {e}") + continue + else: + break + # else is called if the loop finished without reaching "break" + else: + pytest.fail(f"could not detach timeline: {last_error}") + + gc_thread.join(timeout=10) + + with pytest.raises(expected_exception=psycopg2.DatabaseError, + match=f'Tenant {tenant_id.hex} not found'): + env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0') diff --git a/test_runner/batch_others/test_normal_work.py b/test_runner/batch_others/test_normal_work.py index 4635a70de6..5b25691517 100644 --- a/test_runner/batch_others/test_normal_work.py +++ b/test_runner/batch_others/test_normal_work.py @@ -24,7 +24,7 @@ def check_tenant(env: NeonEnv, pageserver_http: NeonPageserverHttpClient): assert res_2[0] == (5000050000, ) pg.stop() - pageserver_http.timeline_detach(tenant_id, timeline_id) + pageserver_http.tenant_detach(tenant_id) @pytest.mark.parametrize('num_timelines,num_safekeepers', [(3, 1)]) diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index b0ba8758cc..ac39c6290b 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -91,14 +91,14 @@ def test_remote_storage_backup_and_restore(neon_env_builder: NeonEnvBuilder, sto # Introduce failpoint in download env.pageserver.safe_psql(f"failpoints remote-storage-download-pre-rename=return") - client.timeline_attach(UUID(tenant_id), UUID(timeline_id)) + client.tenant_attach(UUID(tenant_id)) - # is there a better way to assert that fafilpoint triggered? + # is there a better way to assert that failpoint triggered? time.sleep(10) # assert cannot attach timeline that is scheduled for download - with pytest.raises(Exception, match="Timeline download is already in progress"): - client.timeline_attach(UUID(tenant_id), UUID(timeline_id)) + with pytest.raises(Exception, match="Conflict: Tenant download is already in progress"): + client.tenant_attach(UUID(tenant_id)) detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) log.info("Timeline detail with active failpoint: %s", detail) @@ -109,7 +109,7 @@ def test_remote_storage_backup_and_restore(neon_env_builder: NeonEnvBuilder, sto env.pageserver.stop() env.pageserver.start() - client.timeline_attach(UUID(tenant_id), UUID(timeline_id)) + client.tenant_attach(UUID(tenant_id)) log.info("waiting for timeline redownload") wait_until(number_of_iterations=10, diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index e9c493cad6..0560469ca1 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -3,14 +3,13 @@ import os import pathlib import subprocess import threading -import typing from uuid import UUID from fixtures.log_helper import log -from typing import Optional +from typing import Any, Dict, Optional, Tuple import signal import pytest -from fixtures.neon_fixtures import PgProtocol, PortDistributor, Postgres, NeonEnvBuilder, Etcd, NeonPageserverHttpClient, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload, neon_binpath, pg_distrib_dir, base_dir +from fixtures.neon_fixtures import NeonEnv, PortDistributor, Postgres, NeonEnvBuilder, Etcd, NeonPageserverHttpClient, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload, neon_binpath, pg_distrib_dir, base_dir from fixtures.utils import lsn_from_hex, subprocess_capture @@ -101,6 +100,102 @@ def load(pg: Postgres, stop_event: 
threading.Event, load_ok_event: threading.Eve log.info('load thread stopped') +def populate_branch(pg: Postgres, create_table: bool, + expected_sum: Optional[int]) -> Tuple[UUID, int]: + # insert some data + with pg_cur(pg) as cur: + cur.execute("SHOW neon.timeline_id") + timeline_id = UUID(cur.fetchone()[0]) + log.info("timeline to relocate %s", timeline_id.hex) + + # we rely upon autocommit after each statement + # as waiting for acceptors happens there + if create_table: + cur.execute("CREATE TABLE t(key int, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'some payload'") + if expected_sum is not None: + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (expected_sum, ) + cur.execute("SELECT pg_current_wal_flush_lsn()") + + current_lsn = lsn_from_hex(cur.fetchone()[0]) + return timeline_id, current_lsn + + +def ensure_checkpoint( + pageserver_cur, + pageserver_http: NeonPageserverHttpClient, + tenant_id: UUID, + timeline_id: UUID, + current_lsn: int, +): + # run checkpoint manually to be sure that data landed in remote storage + pageserver_cur.execute(f"checkpoint {tenant_id.hex} {timeline_id.hex}") + + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + +def check_timeline_attached( + new_pageserver_http_client: NeonPageserverHttpClient, + tenant_id: UUID, + timeline_id: UUID, + old_timeline_detail: Dict[str, Any], + old_current_lsn: int, +): + # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint + new_timeline_detail = wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_local(new_pageserver_http_client, tenant_id, timeline_id)) + + # when load is active these checks can break because lsns are not static + # so lets check with some margin + assert_abs_margin_ratio(lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']), + lsn_from_hex(old_timeline_detail['local']['disk_consistent_lsn']), + 0.03) + + assert_abs_margin_ratio(lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']), + old_current_lsn, + 0.03) + + +def switch_pg_to_new_pageserver(env: NeonEnv, + pg: Postgres, + new_pageserver_port: int, + tenant_id: UUID, + timeline_id: UUID) -> pathlib.Path: + pg.stop() + + pg_config_file_path = pathlib.Path(pg.config_file_path()) + pg_config_file_path.open('a').write( + f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_port}'") + + pg.start() + + timeline_to_detach_local_path = env.repo_dir / 'tenants' / tenant_id.hex / 'timelines' / timeline_id.hex + files_before_detach = os.listdir(timeline_to_detach_local_path) + assert 'metadata' in files_before_detach, f'Regular timeline {timeline_to_detach_local_path} should have the metadata file,\ + but got: {files_before_detach}' + assert len(files_before_detach) >= 2, f'Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ + but got {files_before_detach}' + + return timeline_to_detach_local_path + + +def post_migration_check(pg: Postgres, sum_before_migration: int, old_local_path: pathlib.Path): + with pg_cur(pg) as cur: + # check that data is still there + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (sum_before_migration, ) + # check that we can write new data + cur.execute("INSERT INTO t SELECT generate_series(1001,2000), 'some payload'") + cur.execute("SELECT sum(key) FROM t") + assert 
cur.fetchone() == (sum_before_migration + 1500500, ) + + assert not os.path.exists(old_local_path), f'After detach, local timeline dir {old_local_path} should be removed' + + @pytest.mark.parametrize( 'method', [ @@ -126,61 +221,73 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, # create folder for remote storage mock remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage' - tenant, _ = env.neon_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) - log.info("tenant to relocate %s", tenant) + # we use two branches to check that they are both relocated + # first branch is used for load, compute for second one is used to + # check that data is not lost - # attach does not download ancestor branches (should it?), just use root branch for now - env.neon_cli.create_root_branch('test_tenant_relocation', tenant_id=tenant) + tenant_id, initial_timeline_id = env.neon_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) + log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) - tenant_pg = env.postgres.create_start(branch_name='test_tenant_relocation', - node_name='test_tenant_relocation', - tenant_id=tenant) + env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id) + pg_main = env.postgres.create_start(branch_name='test_tenant_relocation_main', + tenant_id=tenant_id) - # insert some data - with closing(tenant_pg.connect()) as conn: - with conn.cursor() as cur: - # save timeline for later gc call - cur.execute("SHOW neon.timeline_id") - timeline = UUID(cur.fetchone()[0]) - log.info("timeline to relocate %s", timeline.hex) + timeline_id_main, current_lsn_main = populate_branch(pg_main, create_table=True, expected_sum=500500) - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - cur.execute("CREATE TABLE t(key int primary key, value text)") - cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'some payload'") - cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (500500, ) - cur.execute("SELECT pg_current_wal_flush_lsn()") + env.neon_cli.create_branch( + new_branch_name="test_tenant_relocation_second", + ancestor_branch_name="test_tenant_relocation_main", + tenant_id=tenant_id, + ) + pg_second = env.postgres.create_start(branch_name='test_tenant_relocation_second', + tenant_id=tenant_id) - current_lsn = lsn_from_hex(cur.fetchone()[0]) + # do not select sum for second branch, this select will wait until wal reaches pageserver + # try to check another case when pageserver didnt receive that wal and needs to get it from safekeeper + timeline_id_second, current_lsn_second = populate_branch(pg_second, create_table=False, expected_sum=1001000) pageserver_http = env.pageserver.http_client() # wait until pageserver receives that data - wait_for_last_record_lsn(pageserver_http, tenant, timeline, current_lsn) - timeline_detail = assert_local(pageserver_http, tenant, timeline) + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_main, current_lsn_main) + timeline_detail_main = assert_local(pageserver_http, tenant_id, timeline_id_main) + + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_second, current_lsn_second) + timeline_detail_second = assert_local(pageserver_http, tenant_id, timeline_id_second) if with_load == 'with_load': # create load table - with pg_cur(tenant_pg) as cur: + with pg_cur(pg_main) as cur: cur.execute("CREATE TABLE load(value text)") load_stop_event = threading.Event() load_ok_event = threading.Event() 
load_thread = threading.Thread( target=load, - args=(tenant_pg, load_stop_event, load_ok_event), + args=(pg_main, load_stop_event, load_ok_event), daemon=True, # To make sure the child dies when the parent errors ) load_thread.start() - # run checkpoint manually to be sure that data landed in remote storage - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor() as pscur: - pscur.execute(f"checkpoint {tenant.hex} {timeline.hex}") + # this requirement introduces a problem + # if user creates a branch during migration + # it wont appear on the new pageserver + with pg_cur(env.pageserver) as cur: + ensure_checkpoint( + cur, + pageserver_http=pageserver_http, + tenant_id=tenant_id, + timeline_id=timeline_id_main, + current_lsn=current_lsn_main, + ) - # wait until pageserver successfully uploaded a checkpoint to remote storage - wait_for_upload(pageserver_http, tenant, timeline, current_lsn) + ensure_checkpoint( + cur, + pageserver_http=pageserver_http, + tenant_id=tenant_id, + timeline_id=timeline_id_second, + current_lsn=current_lsn_second, + ) log.info("inititalizing new pageserver") # bootstrap second pageserver @@ -207,7 +314,7 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, "python", os.path.join(base_dir, "scripts/export_import_between_pageservers.py"), "--tenant-id", - tenant.hex, + tenant_id.hex, "--from-host", "localhost", "--from-http-port", @@ -228,22 +335,23 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, subprocess_capture(str(env.repo_dir), cmd, check=True) elif method == "minor": # call to attach timeline to new pageserver - new_pageserver_http.timeline_attach(tenant, timeline) + new_pageserver_http.tenant_attach(tenant_id) - # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint - new_timeline_detail = wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_local(new_pageserver_http, tenant, timeline)) + check_timeline_attached( + new_pageserver_http, + tenant_id, + timeline_id_main, + timeline_detail_main, + current_lsn_main, + ) - # when load is active these checks can break because lsns are not static - # so lets check with some margin - assert_abs_margin_ratio( - lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']), - lsn_from_hex(timeline_detail['local']['disk_consistent_lsn']), - 0.03) - - tenant_pg.stop() + check_timeline_attached( + new_pageserver_http, + tenant_id, + timeline_id_second, + timeline_detail_second, + current_lsn_second, + ) # rewrite neon cli config to use new pageserver for basebackup to start new compute cli_config_lines = (env.repo_dir / 'config').read_text().splitlines() @@ -251,33 +359,29 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'" (env.repo_dir / 'config').write_text('\n'.join(cli_config_lines)) - tenant_pg_config_file_path = pathlib.Path(tenant_pg.config_file_path()) - tenant_pg_config_file_path.open('a').write( - f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_pg_port}'" + old_local_path_main = switch_pg_to_new_pageserver( + env, + pg_main, + new_pageserver_pg_port, + tenant_id, + timeline_id_main, ) - tenant_pg.start() - - timeline_to_detach_local_path = env.repo_dir / 'tenants' / tenant.hex / 'timelines' / timeline.hex - files_before_detach = os.listdir(timeline_to_detach_local_path) - assert 'metadata' in files_before_detach, f'Regular timeline 
{timeline_to_detach_local_path} should have the metadata file,\ - but got: {files_before_detach}' - assert len(files_before_detach) > 2, f'Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ - but got {files_before_detach}' + old_local_path_second = switch_pg_to_new_pageserver( + env, + pg_second, + new_pageserver_pg_port, + tenant_id, + timeline_id_second, + ) # detach tenant from old pageserver before we check # that all the data is there to be sure that old pageserver # is no longer involved, and if it is, we will see the errors - pageserver_http.timeline_detach(tenant, timeline) + pageserver_http.tenant_detach(tenant_id) - with pg_cur(tenant_pg) as cur: - # check that data is still there - cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (500500, ) - # check that we can write new data - cur.execute("INSERT INTO t SELECT generate_series(1001,2000), 'some payload'") - cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (2001000, ) + post_migration_check(pg_main, 500500, old_local_path_main) + post_migration_check(pg_second, 1001000, old_local_path_second) if with_load == 'with_load': assert load_ok_event.wait(3) @@ -286,8 +390,6 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, load_thread.join(timeout=10) log.info('load thread stopped') - assert not os.path.exists(timeline_to_detach_local_path), f'After detach, local timeline dir {timeline_to_detach_local_path} should be removed' - # bring old pageserver back for clean shutdown via neon cli # new pageserver will be shut down by the context manager cli_config_lines = (env.repo_dir / 'config').read_text().splitlines() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3d4daf5f29..e6967e3682 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -795,16 +795,12 @@ class NeonPageserverHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() - def timeline_attach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/attach", - ) + def tenant_attach(self, tenant_id: uuid.UUID): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/attach") self.verbose_error(res) - def timeline_detach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/detach", - ) + def tenant_detach(self, tenant_id: uuid.UUID): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/detach") self.verbose_error(res) def timeline_create( From e1e24336b7a97d9c5cea4786ebd7f5b35289b7a3 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 24 Jun 2022 18:52:44 +0300 Subject: [PATCH 0479/1022] review adjustments, bring back timeline_detach and rename it to timeline_delete --- libs/remote_storage/src/s3_bucket.rs | 5 +- pageserver/src/http/openapi_spec.yml | 33 ++- pageserver/src/http/routes.rs | 37 +++- pageserver/src/layered_repository.rs | 91 +++++--- pageserver/src/repository.rs | 31 +-- pageserver/src/storage_sync.rs | 69 +++--- pageserver/src/storage_sync/download.rs | 6 +- pageserver/src/storage_sync/index.rs | 17 ++ pageserver/src/tenant_mgr.rs | 196 ++++++++++++------ pageserver/src/walreceiver.rs | 30 +-- .../batch_others/test_ancestor_branch.py | 23 ++ 
.../batch_others/test_broken_timeline.py | 6 +- test_runner/batch_others/test_import.py | 2 +- test_runner/fixtures/neon_fixtures.py | 64 +++--- 14 files changed, 401 insertions(+), 209 deletions(-) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 3b413e30ce..ff52f033d1 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -122,8 +122,9 @@ impl S3ObjectKey { impl RemoteObjectName for S3ObjectKey { /// Turn a/b/c or a/b/c/ into c fn object_name(&self) -> Option<&str> { - // corner case - if &self.0 == "/" { + // corner case, char::to_string is not const, thats why this is more verbose than it needs to be + // see https://github.com/rust-lang/rust/issues/88674 + if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR { return None; } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index ebbb0d5ced..6cfedc0931 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -122,6 +122,35 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + delete: + description: "Attempts to delete specified timeline. On 500 errors should be retried" + responses: + "200": + description: Ok + "400": + description: Error when no tenant id found in path or no timeline id + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver: parameters: @@ -190,7 +219,6 @@ paths: "410": description: GONE - /v1/tenant/{tenant_id}/attach: parameters: - name: tenant_id @@ -200,7 +228,7 @@ paths: type: string format: hex post: - description: Deprecated + description: Schedules attach operation to happen in the background for given tenant responses: "202": description: Tenant attaching scheduled @@ -299,7 +327,6 @@ paths: schema: $ref: "#/components/schemas/Error" - /v1/tenant/{tenant_id}/timeline/: parameters: - name: tenant_id diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 41c78210f4..997e3c9a1f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -353,22 +353,45 @@ async fn try_download_tenant_index( Ok(Some(remote_timelines)) } -async fn timeline_detach_handler(_: Request) -> Result, ApiError> { - json_response(StatusCode::GONE, ()) +async fn timeline_delete_handler(request: Request) -> Result, ApiError> { + let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + + let state = get_state(&request); + tokio::task::spawn_blocking(move || { + let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered(); + tenant_mgr::delete_timeline(tenant_id, timeline_id) + }) + .await + .map_err(ApiError::from_err)??; + + let mut remote_index = state.remote_index.write().await; + remote_index.remove_timeline_entry(ZTenantTimelineId { + tenant_id, + timeline_id, + }); + + json_response(StatusCode::OK, ()) } async fn tenant_detach_handler(request: Request) -> 
Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let state = get_state(&request); + let conf = state.conf; tokio::task::spawn_blocking(move || { let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered(); - let state = get_state(&request); - tenant_mgr::detach_tenant(state.conf, tenant_id) + tenant_mgr::detach_tenant(conf, tenant_id) }) .await .map_err(ApiError::from_err)??; + let mut remote_index = state.remote_index.write().await; + remote_index.remove_tenant_entry(&tenant_id); + json_response(StatusCode::OK, ()) } @@ -540,6 +563,10 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler, ) + .delete( + "/v1/tenant/:tenant_id/timeline/:timeline_id", + timeline_delete_handler, + ) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver", wal_receiver_get_handler, @@ -550,7 +577,7 @@ pub fn make_router( ) .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/detach", - timeline_detach_handler, + timeline_delete_handler, ) .any(handler_404)) } diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 2369f46c4f..a1870703f4 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -38,9 +38,7 @@ use crate::keyspace::KeySpace; use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::repository::{ - GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, TimelineWriter, -}; +use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline, TimelineWriter}; use crate::repository::{Key, Value}; use crate::tenant_mgr; use crate::thread_mgr; @@ -410,28 +408,61 @@ impl Repository for LayeredRepository { Ok(()) } - fn apply_timeline_remote_sync_status_update( - &self, - timeline_id: ZTimelineId, - timeline_sync_status_update: TimelineSyncStatusUpdate, - ) -> Result<()> { - debug!( - "apply_timeline_remote_sync_status_update timeline_id: {} update: {:?}", - timeline_id, timeline_sync_status_update + // in order to be retriable detach needs to be idempotent + fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> { + // in order to be retriable detach needs to be idempotent + let mut timelines = self.timelines.lock().unwrap(); + + // Ensure that there are no child timelines **attached to that pageserver**, + // because detach removes files, which will brake child branches + let num_children = timelines + .iter() + .filter(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id)) + .count(); + + ensure!( + num_children == 0, + "Cannot detach timeline which has child timelines" ); - match timeline_sync_status_update { - TimelineSyncStatusUpdate::Downloaded => { - match self.timelines.lock().unwrap().entry(timeline_id) { - Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. 
This is a bug."), - Entry::Vacant(entry) => { - // we need to get metadata of a timeline, another option is to pass it along with Downloaded status - let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; - // finally we make newly downloaded timeline visible to repository - entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, }) - }, - }; - } - } + let timeline_entry = match timelines.entry(timeline_id) { + Entry::Occupied(e) => e, + Entry::Vacant(_) => bail!("timeline not found"), + }; + + // try to acquire gc and compaction locks to prevent errors from missing files + let _gc_guard = self + .gc_cs + .try_lock() + .map_err(|e| anyhow::anyhow!("cannot acquire gc lock {e}"))?; + + let compaction_guard = timeline_entry.get().compaction_guard()?; + + let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); + std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { + format!( + "Failed to remove local timeline directory '{}'", + local_timeline_directory.display() + ) + })?; + info!("detach removed files"); + + drop(compaction_guard); + timeline_entry.remove(); + + Ok(()) + } + + fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> { + debug!("attach timeline_id: {}", timeline_id,); + match self.timelines.lock().unwrap().entry(timeline_id) { + Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."), + Entry::Vacant(entry) => { + // we need to get metadata of a timeline, another option is to pass it along with Downloaded status + let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; + // finally we make newly downloaded timeline visible to repository + entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, }) + }, + }; Ok(()) } @@ -481,6 +512,18 @@ impl LayeredTimelineEntry { } } } + + fn compaction_guard(&self) -> Result>, anyhow::Error> { + match self { + LayeredTimelineEntry::Loaded(timeline) => timeline + .compaction_cs + .try_lock() + .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) + .map(Some), + + LayeredTimelineEntry::Unloaded { .. } => Ok(None), + } + } } impl From for RepositoryTimeline { diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index f9ea4a6ff8..5b28681b16 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -7,7 +7,6 @@ use byteorder::{ByteOrder, BE}; use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::fmt; -use std::fmt::Display; use std::ops::{AddAssign, Range}; use std::sync::{Arc, RwLockReadGuard}; use std::time::Duration; @@ -182,20 +181,6 @@ impl Value { } } -#[derive(Clone, Copy, Debug)] -pub enum TimelineSyncStatusUpdate { - Downloaded, -} - -impl Display for TimelineSyncStatusUpdate { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let s = match self { - TimelineSyncStatusUpdate::Downloaded => "Downloaded", - }; - f.write_str(s) - } -} - /// /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. @@ -204,11 +189,7 @@ pub trait Repository: Send + Sync { /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization. /// See [`crate::remote_storage`] for more details about the synchronization. 
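Taken together, the layered_repository.rs hunks above and the trait change just below strip out the sync-status plumbing in favor of two direct entry points. A condensed sketch of the resulting surface, abridged from the signatures in this patch (not the full trait):

    pub trait Repository: Send + Sync {
        // registers an already-downloaded timeline in the in-memory timeline map
        fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
        // removes timeline-related in-memory data; the layered implementation
        // also deletes the timeline's local directory
        fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()>;
        // ... remaining methods unchanged ...
    }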
- fn apply_timeline_remote_sync_status_update( - &self, - timeline_id: ZTimelineId, - timeline_sync_status_update: TimelineSyncStatusUpdate, - ) -> Result<()>; + fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>; /// Get Timeline handle for given zenith timeline ID. /// This function is idempotent. It doesn't change internal state in any way. @@ -260,7 +241,10 @@ pub trait Repository: Send + Sync { /// api's 'compact' command. fn compaction_iteration(&self) -> Result<()>; - // Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn. + /// removes timeline-related in-memory data + fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()>; + + /// Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn. fn get_remote_index(&self) -> &RemoteIndex; } @@ -550,10 +534,7 @@ pub mod repo_harness { .parse() .unwrap(); - repo.apply_timeline_remote_sync_status_update( - timeline_id, - TimelineSyncStatusUpdate::Downloaded, - )?; + repo.attach_timeline(timeline_id)?; } Ok(repo) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index c52da95945..6df41d854c 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -178,9 +178,8 @@ use crate::{ metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, LayeredRepository, }, - repository::TimelineSyncStatusUpdate, storage_sync::{self, index::RemoteIndex}, - tenant_mgr::apply_timeline_sync_status_updates, + tenant_mgr::attach_downloaded_tenants, thread_mgr, thread_mgr::ThreadKind, }; @@ -191,9 +190,8 @@ use metrics::{ }; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; -pub use self::download::download_index_part; +use self::download::download_index_parts; pub use self::download::download_tenant_index_parts; -pub use self::download::try_download_index_parts; pub use self::download::TEMP_DOWNLOAD_EXTENSION; lazy_static! { @@ -837,7 +835,7 @@ where .build() .context("Failed to create storage sync runtime")?; - let applicable_index_parts = runtime.block_on(try_download_index_parts( + let applicable_index_parts = runtime.block_on(download_index_parts( conf, &storage, local_timeline_files.keys().copied().collect(), @@ -928,6 +926,8 @@ fn storage_sync_loop( "Sync loop step completed, {} new tenant state update(s)", updated_tenants.len() ); + let mut sync_status_updates: HashMap> = + HashMap::new(); let index_accessor = runtime.block_on(index.write()); for tenant_id in updated_tenants { let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { @@ -945,7 +945,7 @@ fn storage_sync_loop( continue; } else { info!( - "Tenant {tenant_id} download completed. Registering in repository" + "Tenant {tenant_id} download completed. Picking to register in repository" ); // Here we assume that if tenant has no in-progress downloads that // means that it is the last completed timeline download that triggered @@ -953,26 +953,13 @@ fn storage_sync_loop( // and register them all at once in a repository for download // to be submitted in a single operation to repository // so it can apply them at once to internal timeline map. - let sync_status_updates: HashMap< - ZTimelineId, - TimelineSyncStatusUpdate, - > = tenant_entry - .keys() - .copied() - .map(|timeline_id| { - (timeline_id, TimelineSyncStatusUpdate::Downloaded) - }) - .collect(); - - // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. 
- apply_timeline_sync_status_updates( - conf, - &index, - tenant_id, - sync_status_updates, - ); + sync_status_updates + .insert(tenant_id, tenant_entry.keys().copied().collect()); } } + drop(index_accessor); + // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. + attach_downloaded_tenants(conf, &index, sync_status_updates); } } ControlFlow::Break(()) => { @@ -983,6 +970,14 @@ fn storage_sync_loop( } } +// needed to check whether the download happened +// more informative than just a bool +#[derive(Debug)] +enum DownloadMarker { + Downloaded, + Nothing, +} + async fn process_batches( conf: &'static PageServerConf, max_sync_errors: NonZeroU32, @@ -1015,17 +1010,19 @@ where }) .collect::>(); - let mut new_timeline_states = HashSet::new(); + let mut downloaded_timelines = HashSet::new(); - // we purposely ignore actual state update, because we're waiting for last timeline download to happen - while let Some((sync_id, state_update)) = sync_results.next().await { - debug!("Finished storage sync task for sync id {sync_id}"); - if state_update.is_some() { - new_timeline_states.insert(sync_id.tenant_id); + while let Some((sync_id, download_marker)) = sync_results.next().await { + debug!( + "Finished storage sync task for sync id {sync_id} download marker {:?}", + download_marker + ); + if matches!(download_marker, DownloadMarker::Downloaded) { + downloaded_timelines.insert(sync_id.tenant_id); } } - new_timeline_states + downloaded_timelines } async fn process_sync_task_batch( @@ -1034,7 +1031,7 @@ async fn process_sync_task_batch( max_sync_errors: NonZeroU32, sync_id: ZTenantTimelineId, batch: SyncTaskBatch, -) -> Option +) -> DownloadMarker where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, @@ -1119,7 +1116,7 @@ where } } } - None + DownloadMarker::Nothing } .instrument(info_span!("download_timeline_data")), ); @@ -1173,7 +1170,7 @@ async fn download_timeline_data( new_download_data: SyncData, sync_start: Instant, task_name: &str, -) -> Option +) -> DownloadMarker where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, @@ -1202,7 +1199,7 @@ where Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) { Ok(()) => { register_sync_status(sync_id, sync_start, task_name, Some(true)); - return Some(TimelineSyncStatusUpdate::Downloaded); + return DownloadMarker::Downloaded; } Err(e) => { error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}"); @@ -1218,7 +1215,7 @@ where } } - None + DownloadMarker::Nothing } async fn update_local_metadata( diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 8cb9906e33..05a3df166a 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -39,7 +39,7 @@ pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; /// So there are two requirements: keep everything in one futures unordered /// to allow higher concurrency. Mark tenants as failed independently. /// That requires some bookeeping. 
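A minimal sketch of the pattern that comment describes. `TenantId`, `TimelineId`, `Part`, and `download_one` are illustrative stand-ins, not the helpers defined in this module:

    use std::collections::HashMap;
    use futures::stream::{FuturesUnordered, StreamExt};

    async fn gather(
        ids: Vec<(TenantId, TimelineId)>,
    ) -> HashMap<TenantId, Result<Vec<Part>, String>> {
        // one FuturesUnordered over every timeline of every tenant keeps concurrency high
        let mut tasks: FuturesUnordered<_> = ids
            .into_iter()
            .map(|(tenant, timeline)| async move { (tenant, download_one(tenant, timeline).await) })
            .collect();

        let mut out: HashMap<TenantId, Result<Vec<Part>, String>> = HashMap::new();
        while let Some((tenant, res)) = tasks.next().await {
            let slot = out.entry(tenant).or_insert_with(|| Ok(Vec::new()));
            match res {
                // collect successfully downloaded index parts per tenant
                Ok(part) => {
                    if let Ok(parts) = slot {
                        parts.push(part);
                    }
                }
                // the bookkeeping: a single failure poisons only its own tenant
                Err(e) => *slot = Err(e),
            }
        }
        out
    }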
-pub async fn try_download_index_parts( +pub async fn download_index_parts( conf: &'static PageServerConf, storage: &S, keys: HashSet, @@ -116,7 +116,7 @@ where }); } - let index_parts = try_download_index_parts(conf, storage, sync_ids) + let index_parts = download_index_parts(conf, storage, sync_ids) .await .remove(&tenant_id) .ok_or(anyhow::anyhow!( @@ -127,7 +127,7 @@ where } /// Retrieves index data from the remote storage for a given timeline. -pub async fn download_index_part( +async fn download_index_part( conf: &'static PageServerConf, storage: &S, sync_id: ZTenantTimelineId, diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 8bc9f6f189..54be3d0f8c 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -159,6 +159,19 @@ impl RemoteTimelineIndex { .insert(timeline_id, entry); } + pub fn remove_timeline_entry( + &mut self, + ZTenantTimelineId { + tenant_id, + timeline_id, + }: ZTenantTimelineId, + ) -> Option { + self.entries + .entry(tenant_id) + .or_default() + .remove(&timeline_id) + } + pub fn tenant_entry(&self, tenant_id: &ZTenantId) -> Option<&TenantEntry> { self.entries.get(tenant_id) } @@ -171,6 +184,10 @@ impl RemoteTimelineIndex { self.entries.entry(tenant_id).or_default() } + pub fn remove_tenant_entry(&mut self, tenant_id: &ZTenantId) -> Option { + self.entries.remove(tenant_id) + } + pub fn set_awaits_download( &mut self, id: &ZTenantTimelineId, diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index c96dc6973b..84282be63f 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -4,7 +4,7 @@ use crate::config::PageServerConf; use crate::layered_repository::{load_metadata, LayeredRepository}; use crate::pgdatadir_mapping::DatadirTimeline; -use crate::repository::{Repository, TimelineSyncStatusUpdate}; +use crate::repository::Repository; use crate::storage_sync::index::RemoteIndex; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; @@ -17,7 +17,7 @@ use anyhow::{bail, Context}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use std::collections::hash_map::Entry; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fmt; use std::sync::Arc; use tokio::sync::mpsc; @@ -157,7 +157,13 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result anyhow::Result), - Attach(ZTenantTimelineId, Arc), + Detach { + id: ZTenantTimelineId, + // used to signal to the detach caller that walreceiver successfully terminated for specified id + join_confirmation_sender: std::sync::mpsc::Sender<()>, + }, + Attach { + id: ZTenantTimelineId, + datadir: Arc, + }, } impl std::fmt::Debug for LocalTimelineUpdate { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::Detach(ttid, _) => f.debug_tuple("Remove").field(ttid).finish(), - Self::Attach(ttid, _) => f.debug_tuple("Add").field(ttid).finish(), + Self::Detach { id, .. } => f.debug_tuple("Remove").field(id).finish(), + Self::Attach { id, .. } => f.debug_tuple("Add").field(id).finish(), } } } /// Updates tenants' repositories, changing their timelines state in memory. 
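For orientation, the caller-side shape this signature change implies, condensed from the storage_sync.rs hunk earlier in this patch (the per-tenant `tenant_entry` lookup and its error handling are elided):

    let mut sync_status_updates: HashMap<ZTenantId, HashSet<ZTimelineId>> = HashMap::new();
    for tenant_id in updated_tenants {
        // every timeline of this tenant finished downloading, so register them all at once
        sync_status_updates.insert(tenant_id, tenant_entry.keys().copied().collect());
    }
    drop(index_accessor);
    // one batched call instead of one call per tenant
    attach_downloaded_tenants(conf, &index, sync_status_updates);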
-pub fn apply_timeline_sync_status_updates( +pub fn attach_downloaded_tenants( conf: &'static PageServerConf, remote_index: &RemoteIndex, - tenant_id: ZTenantId, - sync_status_updates: HashMap, + sync_status_updates: HashMap>, ) { if sync_status_updates.is_empty() { - debug!("no sync status updates to apply"); + debug!("No sync status updates to apply"); return; } - info!( - "Applying sync status updates for tenant {tenant_id} {} timelines", - sync_status_updates.len() - ); - debug!("Sync status updates: {sync_status_updates:?}"); + for (tenant_id, downloaded_timelines) in sync_status_updates { + info!( + "Registering downlloaded timelines for {tenant_id} {} timelines", + downloaded_timelines.len() + ); + debug!("Downloaded timelines: {downloaded_timelines:?}"); - let repo = match load_local_repo(conf, tenant_id, remote_index) { - Ok(repo) => repo, - Err(e) => { - error!("Failed to load repo for tenant {tenant_id} Error: {e:?}"); - return; + let repo = match load_local_repo(conf, tenant_id, remote_index) { + Ok(repo) => repo, + Err(e) => { + error!("Failed to load repo for tenant {tenant_id} Error: {e:?}"); + continue; + } + }; + match attach_downloaded_tenant(&repo, downloaded_timelines) { + Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"), + Err(e) => error!( + "Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}" + ), } - }; - match apply_timeline_remote_sync_status_updates(&repo, sync_status_updates) { - Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"), - Err(e) => error!( - "Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}" - ), } } @@ -386,6 +400,59 @@ pub fn get_local_timeline_with_load( } } +pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { + // shutdown the timeline threads (this shuts down the walreceiver) + // FIXME it does not shut down wal receiver + + // Things needed to be done + // *. check no ancestors + // *. remove from repo map + // *. remove from global tenant timelines map + // -- no new connections can see the timeline + // *. shutdown threads + // *. join walreceiver (any flushing thread?) + // *. delete files while ensuring that no gc or compaction is in progress + // 7. should we checkpoint before detach? That can be harmful during relocation, + // because it will upload to s3 something that other pageserver didnt see + // TODO put falpoints at every step. Iterate over failpoints + // in detach test and check that timeline is either attached or detached + // verify with a try to start a compute + // TODO adjust remote_index + // what is harder, write whole tenant detach correctly, or fix the timeline based one. + + // TODO bail on active page_service threads? + // TODO what about inprogress downloads or uploads? + // can it be idempotent? 
+ // FAILPOINTS: broken repo.detach_timeline + // broken wal_receiver + // broken rmdir + + let (sender, receiver) = std::sync::mpsc::channel::<()>(); + tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { + id: ZTenantTimelineId::new(tenant_id, timeline_id), + join_confirmation_sender: sender, + }); + + info!("waiting for wal receiver to shutdown"); + let _ = receiver.recv(); + info!("wal receiver shutdown confirmed"); + info!("waiting for threads to shutdown"); + thread_mgr::shutdown_threads(None, None, Some(timeline_id)); + info!("thread shutdown completed"); + match tenants_state::write_tenants().get_mut(&tenant_id) { + Some(tenant) => { + tenant + .repo + .delete_timeline(timeline_id) + .context("Failed to delete tenant timeline from repo")?; + tenant.local_timelines.remove(&timeline_id); + } + None => warn!("Tenant {tenant_id} not found in local tenant state"), + } + + Ok(()) +} + pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> anyhow::Result<()> { set_tenant_state(tenant_id, TenantState::Stopping)?; // shutdown the tenant and timeline threads: gc, compaction, page service threads) @@ -399,10 +466,10 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any let mut walreceiver_join_handles = Vec::with_capacity(tenant.local_timelines.len()); for timeline_id in tenant.local_timelines.keys() { let (sender, receiver) = std::sync::mpsc::channel::<()>(); - tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach( - ZTenantTimelineId::new(tenant_id, *timeline_id), - sender, - )); + tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { + id: ZTenantTimelineId::new(tenant_id, *timeline_id), + join_confirmation_sender: sender, + }); walreceiver_join_handles.push((*timeline_id, receiver)); } // drop the tenants lock @@ -428,11 +495,11 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any // which references ephemeral files which are deleted on drop. So if we keep these references // code will attempt to remove files which no longer exist. 
This can be fixed by having shutdown // mechanism for repository that will clean temporary data to avoid any references to ephemeral files - let local_timeline_directory = conf.tenant_path(&tenant_id); - std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { + let local_tenant_directory = conf.tenant_path(&tenant_id); + std::fs::remove_dir_all(&local_tenant_directory).with_context(|| { format!( "Failed to remove local timeline directory '{}'", - local_timeline_directory.display() + local_tenant_directory.display() ) })?; @@ -453,10 +520,10 @@ fn load_local_timeline( )); page_tline.init_logical_size()?; - tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach( - ZTenantTimelineId::new(repo.tenant_id(), timeline_id), - Arc::clone(&page_tline), - )); + tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach { + id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id), + datadir: Arc::clone(&page_tline), + }); Ok(page_tline) } @@ -486,9 +553,13 @@ pub fn list_tenants() -> Vec { /// A timeline is categorized as broken when any of following conditions is true: /// - failed to load the timeline's metadata /// - the timeline's disk consistent LSN is zero -fn check_broken_timeline(repo: &LayeredRepository, timeline_id: ZTimelineId) -> anyhow::Result<()> { - let metadata = load_metadata(repo.conf, timeline_id, repo.tenant_id()) - .context("failed to load metadata")?; +fn check_broken_timeline( + conf: &'static PageServerConf, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, +) -> anyhow::Result<()> { + let metadata = + load_metadata(conf, timeline_id, tenant_id).context("failed to load metadata")?; // A timeline with zero disk consistent LSN can happen when the page server // failed to checkpoint the timeline import data when creating that timeline. @@ -499,61 +570,56 @@ fn check_broken_timeline(repo: &LayeredRepository, timeline_id: ZTimelineId) -> Ok(()) } +/// Note: all timelines are attached at once if and only if all of them are locally complete fn init_local_repository( conf: &'static PageServerConf, tenant_id: ZTenantId, local_timeline_init_statuses: HashMap, remote_index: &RemoteIndex, ) -> anyhow::Result<(), anyhow::Error> { - // initialize local tenant - let repo = load_local_repo(conf, tenant_id, remote_index) - .with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?; - - let mut status_updates = HashMap::with_capacity(local_timeline_init_statuses.len()); + let mut timelines_to_attach = HashSet::new(); for (timeline_id, init_status) in local_timeline_init_statuses { match init_status { LocalTimelineInitStatus::LocallyComplete => { debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository"); - if let Err(err) = check_broken_timeline(&repo, timeline_id) { - info!( - "Found a broken timeline {timeline_id} (err={err:?}), skip registering it in repository" - ); - } else { - status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded); - } + check_broken_timeline(conf, tenant_id, timeline_id) + .context("found broken timeline")?; + timelines_to_attach.insert(timeline_id); } LocalTimelineInitStatus::NeedsSync => { debug!( "timeline {tenant_id} for tenant {timeline_id} needs sync, \ so skipped for adding into repository until sync is finished" ); + return Ok(()); } } } + // initialize local tenant + let repo = load_local_repo(conf, tenant_id, remote_index) + .with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?; + // Lets fail here loudly to be on the safe side. 
// XXX: It may be a better api to actually distinguish between repository startup // and processing of newly downloaded timelines. - apply_timeline_remote_sync_status_updates(&repo, status_updates) + attach_downloaded_tenant(&repo, timelines_to_attach) .with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))?; Ok(()) } -fn apply_timeline_remote_sync_status_updates( +fn attach_downloaded_tenant( repo: &LayeredRepository, - status_updates: HashMap, + downloaded_timelines: HashSet, ) -> anyhow::Result<()> { - let mut registration_queue = Vec::with_capacity(status_updates.len()); + let mut registration_queue = Vec::with_capacity(downloaded_timelines.len()); // first need to register the in-mem representations, to avoid missing ancestors during the local disk data registration - for (timeline_id, status_update) in status_updates { - repo.apply_timeline_remote_sync_status_update(timeline_id, status_update) - .with_context(|| { - format!("Failed to load timeline {timeline_id} into in-memory repository") - })?; - match status_update { - TimelineSyncStatusUpdate::Downloaded => registration_queue.push(timeline_id), - } + for timeline_id in downloaded_timelines { + repo.attach_timeline(timeline_id).with_context(|| { + format!("Failed to load timeline {timeline_id} into in-memory repository") + })?; + registration_queue.push(timeline_id); } for timeline_id in registration_queue { diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index b70350e0da..c36343db17 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -264,7 +264,10 @@ async fn wal_receiver_main_thread_loop_step<'a>( info!("Processing timeline update: {update:?}"); match update { // Timeline got detached, stop all related tasks and remove public timeline data. - LocalTimelineUpdate::Detach(id, join_sender) => { + LocalTimelineUpdate::Detach { + id, + join_confirmation_sender, + } => { match local_timeline_wal_receivers.get_mut(&id.tenant_id) { Some(wal_receivers) => { if let hash_map::Entry::Occupied(o) = wal_receivers.entry(id.timeline_id) { @@ -280,7 +283,7 @@ async fn wal_receiver_main_thread_loop_step<'a>( }; { WAL_RECEIVER_ENTRIES.write().await.remove(&id); - if let Err(e) = join_sender.send(()) { + if let Err(e) = join_confirmation_sender.send(()) { warn!("cannot send wal_receiver shutdown confirmation {e}") } else { info!("confirm walreceiver shutdown for {id}"); @@ -288,41 +291,40 @@ async fn wal_receiver_main_thread_loop_step<'a>( } } // Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly. 
- LocalTimelineUpdate::Attach(new_id, new_timeline) => { + LocalTimelineUpdate::Attach { id, datadir } => { let timeline_connection_managers = local_timeline_wal_receivers - .entry(new_id.tenant_id) + .entry(id.tenant_id) .or_default(); if timeline_connection_managers.is_empty() { - if let Err(e) = - change_tenant_state(new_id.tenant_id, TenantState::Active).await + if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Active).await { - error!("Failed to make tenant active for id {new_id}: {e:#}"); + error!("Failed to make tenant active for id {id}: {e:#}"); return; } } let vacant_connection_manager_entry = - match timeline_connection_managers.entry(new_id.timeline_id) { + match timeline_connection_managers.entry(id.timeline_id) { hash_map::Entry::Occupied(_) => { - debug!("Attepted to readd an existing timeline {new_id}, ignoring"); + debug!("Attepted to readd an existing timeline {id}, ignoring"); return; } hash_map::Entry::Vacant(v) => v, }; let (wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag) = - match fetch_tenant_settings(new_id.tenant_id).await { + match fetch_tenant_settings(id.tenant_id).await { Ok(settings) => settings, Err(e) => { - error!("Failed to fetch tenant settings for id {new_id}: {e:#}"); + error!("Failed to fetch tenant settings for id {id}: {e:#}"); return; } }; { WAL_RECEIVER_ENTRIES.write().await.insert( - new_id, + id, WalReceiverEntry { wal_producer_connstr: None, last_received_msg_lsn: None, @@ -333,10 +335,10 @@ async fn wal_receiver_main_thread_loop_step<'a>( vacant_connection_manager_entry.insert( connection_manager::spawn_connection_manager_task( - new_id, + id, broker_prefix.to_owned(), etcd_client.clone(), - new_timeline, + datadir, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index 3e7ba22184..96132c14f9 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -105,3 +105,26 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): branch2_cur.execute('SELECT count(*) FROM foo') assert branch2_cur.fetchone() == (300000, ) + + +def test_ancestor_branch_delete(neon_simple_env: NeonEnv): + env = neon_simple_env + + parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty") + + leaf_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_branch1", + "test_ancestor_branch_delete_parent") + + ps_http = env.pageserver.http_client() + with pytest.raises(NeonPageserverApiException, + match="Failed to delete tenant timeline from repo"): + ps_http.timeline_delete(env.initial_tenant, parent_timeline_id) + + ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id) + # check 404 + with pytest.raises(NeonPageserverApiException, + match="is not found neither locally nor remotely"): + ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) + + # FIXME leaves tenant without timelines, should we prevent deletion of root timeline? 
+ ps_http.timeline_delete(env.initial_tenant, parent_timeline_id) diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/batch_others/test_broken_timeline.py index b72f337e06..675236fbd7 100644 --- a/test_runner/batch_others/test_broken_timeline.py +++ b/test_runner/batch_others/test_broken_timeline.py @@ -110,6 +110,6 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): env.neon_cli.pageserver_stop(immediate=True) env.neon_cli.pageserver_start() - # Check that the "broken" timeline is not loaded - timelines = env.neon_cli.list_timelines(tenant_id) - assert len(timelines) == 1 + # Check that tenant with "broken" timeline is not loaded. + with pytest.raises(Exception, match=f"Failed to get repo for tenant {tenant_id.hex}"): + env.neon_cli.list_timelines(tenant_id) diff --git a/test_runner/batch_others/test_import.py b/test_runner/batch_others/test_import.py index 63dc42ee3e..617d4808cc 100644 --- a/test_runner/batch_others/test_import.py +++ b/test_runner/batch_others/test_import.py @@ -90,7 +90,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build # Clean up # TODO it should clean itself client = env.pageserver.http_client() - client.timeline_detach(tenant, timeline) + client.timeline_delete(tenant, timeline) # Importing correct backup works import_tar(base_tar, wal_tar) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e6967e3682..aaccb00399 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -795,6 +795,27 @@ class NeonPageserverHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + def tenant_list(self) -> List[Dict[Any, Any]]: + res = self.get(f"http://localhost:{self.port}/v1/tenant") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, list) + return res_json + + def tenant_create(self, new_tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: + res = self.post( + f"http://localhost:{self.port}/v1/tenant", + json={ + 'new_tenant_id': new_tenant_id.hex if new_tenant_id else None, + }, + ) + self.verbose_error(res) + if res.status_code == 409: + raise Exception(f'could not create tenant: already exists for id {new_tenant_id}') + new_tenant_id = res.json() + assert isinstance(new_tenant_id, str) + return uuid.UUID(new_tenant_id) + def tenant_attach(self, tenant_id: uuid.UUID): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/attach") self.verbose_error(res) @@ -803,6 +824,13 @@ class NeonPageserverHttpClient(requests.Session): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/detach") self.verbose_error(res) + def timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, list) + return res_json + def timeline_create( self, tenant_id: uuid.UUID, @@ -827,34 +855,6 @@ class NeonPageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def tenant_list(self) -> List[Dict[Any, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/tenant") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def tenant_create(self, new_tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: - res = self.post( - 
f"http://localhost:{self.port}/v1/tenant", - json={ - 'new_tenant_id': new_tenant_id.hex if new_tenant_id else None, - }, - ) - self.verbose_error(res) - if res.status_code == 409: - raise Exception(f'could not create tenant: already exists for id {new_tenant_id}') - new_tenant_id = res.json() - assert isinstance(new_tenant_id, str) - return uuid.UUID(new_tenant_id) - - def timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1" @@ -864,6 +864,14 @@ class NeonPageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json + def timeline_delete(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): + res = self.delete( + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + def wal_receiver_get(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/wal_receiver" From d9d4ef12c3112a7461365b032157b791dc6a8db7 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 27 Jun 2022 13:41:19 +0300 Subject: [PATCH 0480/1022] review cleanup --- pageserver/src/http/routes.rs | 12 ++++++--- pageserver/src/storage_sync.rs | 2 +- pageserver/src/storage_sync/download.rs | 5 +++- pageserver/src/tenant_mgr.rs | 34 ++++--------------------- 4 files changed, 18 insertions(+), 35 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 997e3c9a1f..64aaa68e44 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -282,7 +282,7 @@ async fn tenant_attach_handler(request: Request) -> Result, drop(index_accessor); // download index parts for every tenant timeline - let remote_timelines = match try_download_tenant_index(state, tenant_id).await { + let remote_timelines = match gather_tenant_timelines_index_parts(state, tenant_id).await { Ok(Some(remote_timelines)) => remote_timelines, Ok(None) => return Err(ApiError::NotFound("Unknown remote tenant".to_string())), Err(e) => { @@ -323,19 +323,23 @@ async fn tenant_attach_handler(request: Request) -> Result, json_response(StatusCode::ACCEPTED, ()) } -async fn try_download_tenant_index( +/// Note: is expensive from s3 access perspective, +/// for details see comment to `storage_sync::gather_tenant_timelines_index_parts` +async fn gather_tenant_timelines_index_parts( state: &State, tenant_id: ZTenantId, ) -> anyhow::Result>> { let index_parts = match state.remote_storage.as_ref() { Some(GenericRemoteStorage::Local(local_storage)) => { - storage_sync::download_tenant_index_parts(state.conf, local_storage, tenant_id).await + storage_sync::gather_tenant_timelines_index_parts(state.conf, local_storage, tenant_id) + .await } // FIXME here s3 storage contains its own limits, that are separate from sync storage thread ones // because it is a different instance. We can move this limit to some global static // or use one instance everywhere. 
Some(GenericRemoteStorage::S3(s3_storage)) => { - storage_sync::download_tenant_index_parts(state.conf, s3_storage, tenant_id).await + storage_sync::gather_tenant_timelines_index_parts(state.conf, s3_storage, tenant_id) + .await } None => return Ok(None), } diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 6df41d854c..d6e3741bc0 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -191,7 +191,7 @@ use metrics::{ use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; use self::download::download_index_parts; -pub use self::download::download_tenant_index_parts; +pub use self::download::gather_tenant_timelines_index_parts; pub use self::download::TEMP_DOWNLOAD_EXTENSION; lazy_static! { diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 05a3df166a..d023a8ef52 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -71,7 +71,10 @@ where index_parts } -pub async fn download_tenant_index_parts( +/// Note: The function is rather expensive from s3 access point of view, it will execute ceil(N/1000) + N requests. +/// At least one request to obtain a list of tenant timelines (more requests is there are more than 1000 timelines). +/// And then will attempt to download all index files that belong to these timelines. +pub async fn gather_tenant_timelines_index_parts( conf: &'static PageServerConf, storage: &S, tenant_id: ZTenantId, diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 84282be63f..afdca9abbd 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -401,31 +401,7 @@ pub fn get_local_timeline_with_load( } pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { - // shutdown the timeline threads (this shuts down the walreceiver) - // FIXME it does not shut down wal receiver - - // Things needed to be done - // *. check no ancestors - // *. remove from repo map - // *. remove from global tenant timelines map - // -- no new connections can see the timeline - // *. shutdown threads - // *. join walreceiver (any flushing thread?) - // *. delete files while ensuring that no gc or compaction is in progress - // 7. should we checkpoint before detach? That can be harmful during relocation, - // because it will upload to s3 something that other pageserver didnt see - // TODO put falpoints at every step. Iterate over failpoints - // in detach test and check that timeline is either attached or detached - // verify with a try to start a compute - // TODO adjust remote_index - // what is harder, write whole tenant detach correctly, or fix the timeline based one. - - // TODO bail on active page_service threads? - // TODO what about inprogress downloads or uploads? - // can it be idempotent? 
- // FAILPOINTS: broken repo.detach_timeline - // broken wal_receiver - // broken rmdir + // shutdown the timeline tasks (this shuts down the walreceiver) let (sender, receiver) = std::sync::mpsc::channel::<()>(); tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { @@ -433,12 +409,12 @@ pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow join_confirmation_sender: sender, }); - info!("waiting for wal receiver to shutdown"); + debug!("waiting for wal receiver to shutdown"); let _ = receiver.recv(); - info!("wal receiver shutdown confirmed"); - info!("waiting for threads to shutdown"); + debug!("wal receiver shutdown confirmed"); + debug!("waiting for threads to shutdown"); thread_mgr::shutdown_threads(None, None, Some(timeline_id)); - info!("thread shutdown completed"); + debug!("thread shutdown completed"); match tenants_state::write_tenants().get_mut(&tenant_id) { Some(tenant) => { tenant From 168214e0b6e74f9e65892e89b5150b7eccf6d3b9 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 30 Jun 2022 20:36:32 +0300 Subject: [PATCH 0481/1022] use tenant status endpoint to check whether timelines were downloaded or not --- neon_local/src/main.rs | 8 +++- pageserver/src/http/openapi_spec.yml | 47 ++++++++++++++++++- pageserver/src/http/routes.rs | 36 +++++++++++++- pageserver/src/tenant_mgr.rs | 24 +++++++--- .../batch_others/test_remote_storage.py | 4 +- .../batch_others/test_tenant_relocation.py | 46 +++++++++++++----- .../batch_others/test_timeline_size.py | 18 +++---- test_runner/fixtures/neon_fixtures.py | 25 ++++++++-- 8 files changed, 171 insertions(+), 37 deletions(-) diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index 35e2d9c9e2..b29cc6978c 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -537,7 +537,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an match tenant_match.subcommand() { Some(("list", _)) => { for t in pageserver.tenant_list()? 
{ - println!("{} {}", t.id, t.state); + println!( + "{} {}", + t.id, + t.state + .map(|s| s.to_string()) + .unwrap_or_else(|| String::from("")) + ); } } Some(("create", create_match)) => { diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 6cfedc0931..408d066fb4 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -22,6 +22,49 @@ paths: properties: id: type: integer + + /v1/tenant/{tenant_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + get: + description: Get tenant status + responses: + "200": + description: Currently returns the flag whether the tenant has inprogress timeline downloads + content: + application/json: + schema: + $ref: "#/components/schemas/TenantInfo" + "400": + description: Error when no tenant id found in path or no timeline id + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/timeline: parameters: - name: tenant_id @@ -521,12 +564,13 @@ components: type: object required: - id - - state properties: id: type: string state: type: string + has_in_progress_downloads: + type: bool TenantCreateInfo: type: object properties: @@ -621,6 +665,7 @@ components: type: integer current_logical_size_non_incremental: type: integer + WalReceiverEntry: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 64aaa68e44..2cf5e7a828 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -14,6 +14,7 @@ use crate::repository::Repository; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant_config::TenantConfOpt; +use crate::tenant_mgr::TenantInfo; use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; use crate::{config::PageServerConf, tenant_mgr, timelines}; use utils::{ @@ -403,9 +404,13 @@ async fn tenant_list_handler(request: Request) -> Result, A // check for management permission check_permission(&request, None)?; + let state = get_state(&request); + // clone to avoid holding the lock while awaiting for blocking task + let remote_index = state.remote_index.read().await.clone(); + let response_data = tokio::task::spawn_blocking(move || { let _enter = info_span!("tenant_list").entered(); - crate::tenant_mgr::list_tenants() + crate::tenant_mgr::list_tenants(&remote_index) }) .await .map_err(ApiError::from_err)?; @@ -413,6 +418,34 @@ async fn tenant_list_handler(request: Request) -> Result, A json_response(StatusCode::OK, response_data) } +async fn tenant_status(request: Request) -> Result, ApiError> { + let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + // if tenant is in progress of downloading it can be absent in global tenant map + let tenant_state = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant_state(tenant_id)) + .await + .map_err(ApiError::from_err)?; + + let state = get_state(&request); + let remote_index = &state.remote_index; + + let index_accessor = 
remote_index.read().await; + let has_in_progress_downloads = index_accessor + .tenant_entry(&tenant_id) + .ok_or_else(|| ApiError::NotFound("Tenant not found in remote index".to_string()))? + .has_in_progress_downloads(); + + json_response( + StatusCode::OK, + TenantInfo { + id: tenant_id, + state: tenant_state, + has_in_progress_downloads: Some(has_in_progress_downloads), + }, + ) +} + async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { // check for management permission check_permission(&request, None)?; @@ -558,6 +591,7 @@ pub fn make_router( .get("/v1/status", status_handler) .get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) + .get("/v1/tenant/:tenant_id", tenant_status) .put("/v1/tenant/config", tenant_config_handler) .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index afdca9abbd..b3b8d2ce50 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -5,7 +5,7 @@ use crate::config::PageServerConf; use crate::layered_repository::{load_metadata, LayeredRepository}; use crate::pgdatadir_mapping::DatadirTimeline; use crate::repository::Repository; -use crate::storage_sync::index::RemoteIndex; +use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; use crate::thread_mgr::ThreadKind; @@ -509,15 +509,27 @@ fn load_local_timeline( pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] pub id: ZTenantId, - pub state: TenantState, + pub state: Option, + pub has_in_progress_downloads: Option, } -pub fn list_tenants() -> Vec { +pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec { tenants_state::read_tenants() .iter() - .map(|(id, tenant)| TenantInfo { - id: *id, - state: tenant.state, + .map(|(id, tenant)| { + let has_in_progress_downloads = remote_index + .tenant_entry(id) + .map(|entry| entry.has_in_progress_downloads()); + + if has_in_progress_downloads.is_none() { + error!("timeline is not found in remote index while it is present in the tenants registry") + } + + TenantInfo { + id: *id, + state: Some(tenant.state), + has_in_progress_downloads, + } }) .collect() } diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index ac39c6290b..163912690c 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -6,7 +6,7 @@ from contextlib import closing from pathlib import Path import time from uuid import UUID -from fixtures.neon_fixtures import NeonEnvBuilder, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload +from fixtures.neon_fixtures import NeonEnvBuilder, assert_timeline_local, wait_until, wait_for_last_record_lsn, wait_for_upload from fixtures.log_helper import log from fixtures.utils import lsn_from_hex, lsn_to_hex import pytest @@ -114,7 +114,7 @@ def test_remote_storage_backup_and_restore(neon_env_builder: NeonEnvBuilder, sto log.info("waiting for timeline redownload") wait_until(number_of_iterations=10, interval=1, - func=lambda: assert_local(client, UUID(tenant_id), UUID(timeline_id))) + func=lambda: assert_timeline_local(client, UUID(tenant_id), UUID(timeline_id))) detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) assert detail['local'] is not None diff --git 
a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 0560469ca1..f6f2d8ca9d 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -1,15 +1,30 @@ -from contextlib import closing, contextmanager import os import pathlib +import signal import subprocess import threading -from uuid import UUID -from fixtures.log_helper import log +from contextlib import closing, contextmanager from typing import Any, Dict, Optional, Tuple -import signal -import pytest +from uuid import UUID -from fixtures.neon_fixtures import NeonEnv, PortDistributor, Postgres, NeonEnvBuilder, Etcd, NeonPageserverHttpClient, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload, neon_binpath, pg_distrib_dir, base_dir +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + Etcd, + NeonEnv, + NeonEnvBuilder, + NeonPageserverHttpClient, + PortDistributor, + Postgres, + assert_no_in_progress_downloads_for_tenant, + assert_timeline_local, + base_dir, + neon_binpath, + pg_distrib_dir, + wait_for_last_record_lsn, + wait_for_upload, + wait_until, +) from fixtures.utils import lsn_from_hex, subprocess_capture @@ -144,10 +159,7 @@ def check_timeline_attached( old_current_lsn: int, ): # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint - new_timeline_detail = wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_local(new_pageserver_http_client, tenant_id, timeline_id)) + new_timeline_detail = assert_timeline_local(new_pageserver_http_client, tenant_id, timeline_id) # when load is active these checks can break because lsns are not static # so lets check with some margin @@ -250,10 +262,10 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, # wait until pageserver receives that data wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_main, current_lsn_main) - timeline_detail_main = assert_local(pageserver_http, tenant_id, timeline_id_main) + timeline_detail_main = assert_timeline_local(pageserver_http, tenant_id, timeline_id_main) wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_second, current_lsn_second) - timeline_detail_second = assert_local(pageserver_http, tenant_id, timeline_id_second) + timeline_detail_second = assert_timeline_local(pageserver_http, tenant_id, timeline_id_second) if with_load == 'with_load': # create load table @@ -337,6 +349,16 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, # call to attach timeline to new pageserver new_pageserver_http.tenant_attach(tenant_id) + # check that it shows that download is in progress + tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id) + assert tenant_status.get('has_in_progress_downloads'), tenant_status + + # wait until tenant is downloaded + wait_until(number_of_iterations=10, + interval=1, + func=lambda: assert_no_in_progress_downloads_for_tenant( + new_pageserver_http, tenant_id)) + check_timeline_attached( new_pageserver_http, tenant_id, diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 5734091757..7b7b16bcbf 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -1,7 +1,7 @@ from contextlib import closing import psycopg2.extras import psycopg2.errors -from fixtures.neon_fixtures import NeonEnv, 
NeonEnvBuilder, Postgres, assert_local +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local from fixtures.log_helper import log import time @@ -11,7 +11,7 @@ def test_timeline_size(neon_simple_env: NeonEnv): new_timeline_id = env.neon_cli.create_branch('test_timeline_size', 'empty') client = env.pageserver.http_client() - timeline_details = assert_local(client, env.initial_tenant, new_timeline_id) + timeline_details = assert_timeline_local(client, env.initial_tenant, new_timeline_id) assert timeline_details['local']['current_logical_size'] == timeline_details['local'][ 'current_logical_size_non_incremental'] @@ -29,13 +29,13 @@ def test_timeline_size(neon_simple_env: NeonEnv): FROM generate_series(1, 10) g """) - res = assert_local(client, env.initial_tenant, new_timeline_id) + res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) local_details = res['local'] assert local_details["current_logical_size"] == local_details[ "current_logical_size_non_incremental"] cur.execute("TRUNCATE foo") - res = assert_local(client, env.initial_tenant, new_timeline_id) + res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) local_details = res['local'] assert local_details["current_logical_size"] == local_details[ "current_logical_size_non_incremental"] @@ -46,7 +46,7 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): new_timeline_id = env.neon_cli.create_branch('test_timeline_size', 'empty') client = env.pageserver.http_client() - timeline_details = assert_local(client, env.initial_tenant, new_timeline_id) + timeline_details = assert_timeline_local(client, env.initial_tenant, new_timeline_id) assert timeline_details['local']['current_logical_size'] == timeline_details['local'][ 'current_logical_size_non_incremental'] @@ -57,7 +57,7 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): with conn.cursor() as cur: cur.execute("SHOW neon.timeline_id") - res = assert_local(client, env.initial_tenant, new_timeline_id) + res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) local_details = res['local'] assert local_details["current_logical_size"] == local_details[ "current_logical_size_non_incremental"] @@ -73,14 +73,14 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): FROM generate_series(1, 10) g """) - res = assert_local(client, env.initial_tenant, new_timeline_id) + res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) local_details = res['local'] assert local_details["current_logical_size"] == local_details[ "current_logical_size_non_incremental"] cur.execute('DROP DATABASE foodb') - res = assert_local(client, env.initial_tenant, new_timeline_id) + res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) local_details = res['local'] assert local_details["current_logical_size"] == local_details[ "current_logical_size_non_incremental"] @@ -117,7 +117,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): new_timeline_id = env.neon_cli.create_branch('test_timeline_size_quota') client = env.pageserver.http_client() - res = assert_local(client, env.initial_tenant, new_timeline_id) + res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) assert res['local']["current_logical_size"] == res['local'][ "current_logical_size_non_incremental"] diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index aaccb00399..24b234ac25 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ 
b/test_runner/fixtures/neon_fixtures.py @@ -29,7 +29,7 @@ from dataclasses import dataclass # Type-related stuff from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import make_dsn, parse_dsn -from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple +from typing import Any, Callable, Dict, Iterator, List, Optional, Type, TypeVar, cast, Union, Tuple from typing_extensions import Literal import requests @@ -824,7 +824,14 @@ class NeonPageserverHttpClient(requests.Session): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/detach") self.verbose_error(res) - def timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]: + def tenant_status(self, tenant_id: uuid.UUID) -> Dict[Any, Any]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[str, Any]]: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline") self.verbose_error(res) res_json = res.json() @@ -2183,14 +2190,22 @@ def wait_until(number_of_iterations: int, interval: float, func): raise Exception("timed out while waiting for %s" % func) from last_exception -def assert_local(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID): +def assert_timeline_local(pageserver_http_client: NeonPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID): timeline_detail = pageserver_http_client.timeline_detail(tenant, timeline) assert timeline_detail.get('local', {}).get("disk_consistent_lsn"), timeline_detail return timeline_detail +def assert_no_in_progress_downloads_for_tenant( + pageserver_http_client: NeonPageserverHttpClient, + tenant: uuid.UUID, +): + tenant_status = pageserver_http_client.tenant_status(tenant) + assert tenant_status['has_in_progress_downloads'] is False, tenant_status + + def remote_consistent_lsn(pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID) -> int: From 9f2b40645d345671bcdbd6a56ba9b09a5b44127a Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 4 Jul 2022 17:25:32 +0300 Subject: [PATCH 0482/1022] review cleanup, point timeline/detach to timeline/delete --- pageserver/src/http/openapi_spec.yml | 51 ++++++++++++++----------- pageserver/src/http/routes.rs | 17 +++------ pageserver/src/layered_repository.rs | 11 +++--- pageserver/src/storage_sync/download.rs | 6 +-- 4 files changed, 41 insertions(+), 44 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 408d066fb4..02569a3778 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -113,6 +113,7 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/timeline/{timeline_id}: parameters: - name: tenant_id @@ -242,25 +243,6 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - /v1/tenant/{tenant_id}/timeline/{timeline_id}/attach: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - format: hex - - name: timeline_id - in: path - required: true - schema: - type: string - format: hex - post: - description: Deprecated - responses: - "410": - description: GONE /v1/tenant/{tenant_id}/attach: parameters: @@ -327,10 +309,35 @@ paths: type: string format: hex post: - description: Deprecated + 
description: Deprecated, use DELETE /v1/tenant/{tenant_id}/timeline/{timeline_id} instead + deprecated: true responses: - "410": - description: GONE + "200": + description: Ok + "400": + description: Error when no tenant id found in path or no timeline id + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/detach: parameters: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2cf5e7a828..236415cf58 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -242,10 +242,6 @@ async fn wal_receiver_get_handler(request: Request) -> Result) -> Result, ApiError> { - json_response(StatusCode::GONE, ()) -} - // TODO makes sense to provide tenant config right away the same way as it handled in tenant_create async fn tenant_attach_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; @@ -605,17 +601,14 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler, ) - .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver", - wal_receiver_get_handler, - ) - .post( - "/v1/tenant/:tenant_id/timeline/:timeline_id/attach", - timeline_attach_handler, - ) + // for backward compatibility .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/detach", timeline_delete_handler, ) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver", + wal_receiver_get_handler, + ) .any(handler_404)) } diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index a1870703f4..55f01ee962 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -408,20 +408,19 @@ impl Repository for LayeredRepository { Ok(()) } - // in order to be retriable detach needs to be idempotent fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> { // in order to be retriable detach needs to be idempotent + // (or at least to a point that each time the detach is called it can make progress) let mut timelines = self.timelines.lock().unwrap(); // Ensure that there are no child timelines **attached to that pageserver**, - // because detach removes files, which will brake child branches - let num_children = timelines + // because detach removes files, which will break child branches + let children_exist = timelines .iter() - .filter(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id)) - .count(); + .any(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id)); ensure!( - num_children == 0, + !children_exist, "Cannot detach timeline which has child timelines" ); let timeline_entry = match timelines.entry(timeline_id) { diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index d023a8ef52..0f2bdd3bcb 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -119,14 +119,12 @@ where }); } - let index_parts = download_index_parts(conf, storage, sync_ids) + download_index_parts(conf, storage, sync_ids) .await .remove(&tenant_id) .ok_or(anyhow::anyhow!( "Missing tenant 
index parts. This is a bug." - ))?; - - Ok(index_parts) + )) } /// Retrieves index data from the remote storage for a given timeline. From 520ffb341bbf70743ee4d34641c469b25e927197 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 4 Jul 2022 21:01:22 +0300 Subject: [PATCH 0483/1022] fix pageserver openapi spec --- pageserver/src/http/openapi_spec.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 02569a3778..2775a27e0f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -128,13 +128,14 @@ paths: schema: type: string format: hex - - name: include-non-incremental-logical-size - in: query - schema: - type: string - description: Controls calculation of current_logical_size_non_incremental get: description: Get info about the timeline + parameters: + - name: include-non-incremental-logical-size + in: query + schema: + type: string + description: Controls calculation of current_logical_size_non_incremental responses: "200": description: TimelineInfo @@ -577,7 +578,7 @@ components: state: type: string has_in_progress_downloads: - type: bool + type: boolean TenantCreateInfo: type: object properties: From 1a5af6d7a5ab25e39f1a68f411a0566f899f1007 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 6 Jul 2022 13:40:33 +0300 Subject: [PATCH 0484/1022] extend detach/delete tests --- pageserver/src/tenant_mgr.rs | 24 ++++---- .../batch_others/test_ancestor_branch.py | 23 -------- .../{test_detach.py => test_tenant_detach.py} | 21 ++++++- test_runner/batch_others/test_tenant_tasks.py | 6 +- .../batch_others/test_timeline_delete.py | 55 +++++++++++++++++++ 5 files changed, 90 insertions(+), 39 deletions(-) rename test_runner/batch_others/{test_detach.py => test_tenant_detach.py} (68%) create mode 100644 test_runner/batch_others/test_timeline_delete.py diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index b3b8d2ce50..1759d3bbb8 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -13,7 +13,7 @@ use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; use crate::{thread_mgr, timelines, walreceiver}; use crate::{DatadirTimelineImpl, RepositoryImpl}; -use anyhow::{bail, Context}; +use anyhow::Context; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use std::collections::hash_map::Entry; @@ -401,7 +401,14 @@ pub fn get_local_timeline_with_load( } pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { - // shutdown the timeline tasks (this shuts down the walreceiver) + // Start with the shutdown of timeline tasks (this shuts down the walreceiver) + // It is important that we do not take locks here, and do not check whether the timeline exists + // because if we hold tenants_state::write_tenants() while awaiting for the threads to join + // we cannot create new timelines and tenants, and that can take quite some time, + // it can even become stuck due to a bug making whole pageserver unavailable for some operations + // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation + // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests + // will synchronize and either fail with the not found error or succeed let (sender, receiver) = std::sync::mpsc::channel::<()>(); 
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { @@ -417,13 +424,10 @@ pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow debug!("thread shutdown completed"); match tenants_state::write_tenants().get_mut(&tenant_id) { Some(tenant) => { - tenant - .repo - .delete_timeline(timeline_id) - .context("Failed to delete tenant timeline from repo")?; + tenant.repo.delete_timeline(timeline_id)?; tenant.local_timelines.remove(&timeline_id); } - None => warn!("Tenant {tenant_id} not found in local tenant state"), + None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"), } Ok(()) @@ -552,7 +556,7 @@ fn check_broken_timeline( // A timeline with zero disk consistent LSN can happen when the page server // failed to checkpoint the timeline import data when creating that timeline. if metadata.disk_consistent_lsn() == Lsn::INVALID { - bail!("Timeline {timeline_id} has a zero disk consistent LSN."); + anyhow::bail!("Timeline {timeline_id} has a zero disk consistent LSN."); } Ok(()) @@ -615,7 +619,7 @@ fn attach_downloaded_tenant( match tenants_state::write_tenants().get_mut(&tenant_id) { Some(tenant) => match tenant.local_timelines.entry(timeline_id) { Entry::Occupied(_) => { - bail!("Local timeline {timeline_id} already registered") + anyhow::bail!("Local timeline {timeline_id} already registered") } Entry::Vacant(v) => { v.insert(load_local_timeline(repo, timeline_id).with_context(|| { @@ -623,7 +627,7 @@ fn attach_downloaded_tenant( })?); } }, - None => bail!( + None => anyhow::bail!( "Tenant {} not found in local tenant state", repo.tenant_id() ), diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index 96132c14f9..3e7ba22184 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -105,26 +105,3 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): branch2_cur.execute('SELECT count(*) FROM foo') assert branch2_cur.fetchone() == (300000, ) - - -def test_ancestor_branch_delete(neon_simple_env: NeonEnv): - env = neon_simple_env - - parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty") - - leaf_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_branch1", - "test_ancestor_branch_delete_parent") - - ps_http = env.pageserver.http_client() - with pytest.raises(NeonPageserverApiException, - match="Failed to delete tenant timeline from repo"): - ps_http.timeline_delete(env.initial_tenant, parent_timeline_id) - - ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id) - # check 404 - with pytest.raises(NeonPageserverApiException, - match="is not found neither locally nor remotely"): - ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) - - # FIXME leaves tenant without timelines, should we prevent deletion of root timeline? 
- ps_http.timeline_delete(env.initial_tenant, parent_timeline_id) diff --git a/test_runner/batch_others/test_detach.py b/test_runner/batch_others/test_tenant_detach.py similarity index 68% rename from test_runner/batch_others/test_detach.py rename to test_runner/batch_others/test_tenant_detach.py index 105facb656..2df5409b4f 100644 --- a/test_runner/batch_others/test_detach.py +++ b/test_runner/batch_others/test_tenant_detach.py @@ -4,14 +4,25 @@ import psycopg2 import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException -def test_detach_smoke(neon_env_builder: NeonEnvBuilder): +def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() + # first check for non existing tenant + tenant_id = uuid4() + with pytest.raises(expected_exception=NeonPageserverApiException, + match=f'Tenant not found for id {tenant_id.hex}'): + pageserver_http.tenant_detach(tenant_id) + + # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() + + # assert tenant exists on disk + assert (env.repo_dir / "tenants" / tenant_id.hex).exists() + pg = env.postgres.create_start('main', tenant_id=tenant_id) # we rely upon autocommit after each statement pg.safe_psql_many(queries=[ @@ -19,11 +30,12 @@ def test_detach_smoke(neon_env_builder: NeonEnvBuilder): 'INSERT INTO t SELECT generate_series(1,100000), \'payload\'', ]) - # gc should try to even start + # gc should not try to even start with pytest.raises(expected_exception=psycopg2.DatabaseError, match='gc target timeline does not exist'): env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {uuid4().hex} 0') + # try to concurrently run gc and detach gc_thread = Thread( target=lambda: env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0'), ) gc_thread.start() @@ -44,6 +56,9 @@ def test_detach_smoke(neon_env_builder: NeonEnvBuilder): gc_thread.join(timeout=10) + # check that nothing is left on disk for deleted tenant + assert not (env.repo_dir / "tenants" / tenant_id.hex).exists() + with pytest.raises(expected_exception=psycopg2.DatabaseError, match=f'Tenant {tenant_id.hex} not found'): env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0') diff --git a/test_runner/batch_others/test_tenant_tasks.py b/test_runner/batch_others/test_tenant_tasks.py index 9ce2016a64..fae2a2199d 100644 --- a/test_runner/batch_others/test_tenant_tasks.py +++ b/test_runner/batch_others/test_tenant_tasks.py @@ -35,10 +35,10 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): value = line.lstrip(name).strip() return int(value) - def detach_all_timelines(tenant): + def delete_all_timelines(tenant): timelines = [UUID(t["timeline_id"]) for t in client.timeline_list(tenant)] for t in timelines: - client.timeline_detach(tenant, t) + client.timeline_delete(tenant, t) def assert_idle(tenant): assert get_state(tenant) == "Idle" @@ -56,7 +56,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): # TODO they should be already idle since there are no active computes for tenant_info in client.tenant_list(): tenant_id = UUID(tenant_info["id"]) - detach_all_timelines(tenant_id) + delete_all_timelines(tenant_id) wait_until(10, 0.2, lambda: assert_idle(tenant_id)) # Assert that all tasks finish quickly after tenants go idle diff --git a/test_runner/batch_others/test_timeline_delete.py b/test_runner/batch_others/test_timeline_delete.py new file mode 
100644 index 0000000000..2c9a4cd913 --- /dev/null +++ b/test_runner/batch_others/test_timeline_delete.py @@ -0,0 +1,55 @@ +from uuid import uuid4 +import pytest + +from fixtures.neon_fixtures import NeonEnv, NeonPageserverApiException + + +def test_timeline_delete(neon_simple_env: NeonEnv): + env = neon_simple_env + + ps_http = env.pageserver.http_client() + + # first try to delete non existing timeline + # for existing tenant: + invalid_timeline_id = uuid4() + with pytest.raises(NeonPageserverApiException, match="timeline not found"): + ps_http.timeline_delete(tenant_id=env.initial_tenant, timeline_id=invalid_timeline_id) + + # for non existing tenant: + invalid_tenant_id = uuid4() + with pytest.raises(NeonPageserverApiException, + match=f"Tenant {invalid_tenant_id.hex} not found in local tenant state"): + ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) + + # construct pair of branches to validate that pageserver prohibits + # deletion of ancestor timelines when they have child branches + parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty") + + leaf_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_branch1", + "test_ancestor_branch_delete_parent") + + ps_http = env.pageserver.http_client() + with pytest.raises(NeonPageserverApiException, + match="Cannot detach timeline which has child timelines"): + + timeline_path = env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / parent_timeline_id.hex + assert timeline_path.exists() + + ps_http.timeline_delete(env.initial_tenant, parent_timeline_id) + + assert not timeline_path.exists() + + timeline_path = env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / leaf_timeline_id.hex + assert timeline_path.exists() + + ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id) + + assert not timeline_path.exists() + + # check 404 + with pytest.raises(NeonPageserverApiException, + match="is not found neither locally nor remotely"): + ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) + + # FIXME leaves tenant without timelines, should we prevent deletion of root timeline? 
+ ps_http.timeline_delete(env.initial_tenant, parent_timeline_id) From ec0faf3ac6b700fe891c3fd2913e48214bb0c2fa Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 7 Jul 2022 16:26:51 +0300 Subject: [PATCH 0485/1022] retry timeline delete --- test_runner/batch_others/test_timeline_delete.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/test_runner/batch_others/test_timeline_delete.py b/test_runner/batch_others/test_timeline_delete.py index 2c9a4cd913..594475faf4 100644 --- a/test_runner/batch_others/test_timeline_delete.py +++ b/test_runner/batch_others/test_timeline_delete.py @@ -1,7 +1,7 @@ from uuid import uuid4 import pytest -from fixtures.neon_fixtures import NeonEnv, NeonPageserverApiException +from fixtures.neon_fixtures import NeonEnv, NeonPageserverApiException, wait_until def test_timeline_delete(neon_simple_env: NeonEnv): @@ -42,7 +42,10 @@ def test_timeline_delete(neon_simple_env: NeonEnv): timeline_path = env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / leaf_timeline_id.hex assert timeline_path.exists() - ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id) + # retry deletes when compaction or gc is running in pageserver + wait_until(number_of_iterations=3, + interval=0.2, + func=lambda: ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)) assert not timeline_path.exists() @@ -51,5 +54,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): match="is not found neither locally nor remotely"): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) - # FIXME leaves tenant without timelines, should we prevent deletion of root timeline? - ps_http.timeline_delete(env.initial_tenant, parent_timeline_id) + # FIXME leaves tenant without timelines, should we prevent deletion of root timeline? 
+ wait_until(number_of_iterations=3, + interval=0.2, + func=lambda: ps_http.timeline_delete(env.initial_tenant, parent_timeline_id)) From 00c26ff3a3bebcc4f757ee6b475d61e29acf6dbd Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 7 Jul 2022 19:53:23 +0100 Subject: [PATCH 0486/1022] Bring periodic perf tests on GitHub back (#2037) * test/fixtures: fix DeprecationWarning * workflows/benchmarking: increase timeout * test: switch pgbench to default(simple) query mode * test/performance: ensure we don't have tables that we're creating * workflows/pg_clients: remove unused env var * workflows/benchmarking: change platform name --- .github/workflows/benchmarking.yml | 8 +++---- .github/workflows/pg_clients.yml | 3 --- test_runner/fixtures/neon_fixtures.py | 2 +- .../performance/test_compare_pg_stats.py | 24 +++++-------------- test_runner/performance/test_hot_page.py | 1 + test_runner/performance/test_hot_table.py | 1 + test_runner/performance/test_perf_pgbench.py | 6 ++--- .../performance/test_wal_backpressure.py | 1 - 8 files changed, 15 insertions(+), 31 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index adb53c0009..01dd9d00b0 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -26,11 +26,11 @@ jobs: runs-on: [self-hosted, zenith-benchmarker] env: - POSTGRES_DISTRIB_DIR: "/usr/pgsql-13" + POSTGRES_DISTRIB_DIR: "/usr/pgsql-14" steps: - name: Checkout zenith repo - uses: actions/checkout@v2 + uses: actions/checkout@v3 # actions/setup-python@v2 is not working correctly on self-hosted runners # see https://github.com/actions/setup-python/issues/162 @@ -88,7 +88,7 @@ jobs: # Plus time needed to initialize the test databases. TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" - PLATFORM: "zenith-staging" + PLATFORM: "neon-staging" BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally run: | @@ -96,7 +96,7 @@ jobs: # since it might generate duplicates when calling ingest_perf_test_result.py rm -rf perf-report-staging mkdir -p perf-report-staging - ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging + ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600 - name: Submit result env: diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index 66f259d012..fe4dbea8ac 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -48,9 +48,6 @@ jobs: BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" TEST_OUTPUT: /tmp/test_output POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - # this variable will be embedded in perf test report - # and is needed to distinguish different environments - PLATFORM: github-actions-selfhosted shell: bash -ex {0} run: | # Test framework expects we have psql binary; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 24b234ac25..be0322d418 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -324,7 +324,7 @@ class PgProtocol: # Convert options='-c=' to server_settings if 'options' in conn_options: options = conn_options.pop('options') - for match in re.finditer('-c(\w*)=(\w*)', options): + for match in re.finditer(r'-c(\w*)=(\w*)', options): key = 
match.group(1) val = match.group(2) if 'server_options' in conn_options: diff --git a/test_runner/performance/test_compare_pg_stats.py b/test_runner/performance/test_compare_pg_stats.py index 798974eac2..a8a9e3cd4d 100644 --- a/test_runner/performance/test_compare_pg_stats.py +++ b/test_runner/performance/test_compare_pg_stats.py @@ -28,7 +28,7 @@ def test_compare_pg_stats_rw_with_pgbench_default(neon_with_baseline: PgCompare, with env.record_pg_stats(pg_stats_rw): env.pg_bin.run_capture( - ['pgbench', f'-T{duration}', f'--random-seed={seed}', '-Mprepared', env.pg.connstr()]) + ['pgbench', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) env.flush() @@ -46,14 +46,8 @@ def test_compare_pg_stats_wo_with_pgbench_simple_update(neon_with_baseline: PgCo env.flush() with env.record_pg_stats(pg_stats_wo): - env.pg_bin.run_capture([ - 'pgbench', - '-N', - f'-T{duration}', - f'--random-seed={seed}', - '-Mprepared', - env.pg.connstr() - ]) + env.pg_bin.run_capture( + ['pgbench', '-N', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) env.flush() @@ -71,14 +65,8 @@ def test_compare_pg_stats_ro_with_pgbench_select_only(neon_with_baseline: PgComp env.flush() with env.record_pg_stats(pg_stats_ro): - env.pg_bin.run_capture([ - 'pgbench', - '-S', - f'-T{duration}', - f'--random-seed={seed}', - '-Mprepared', - env.pg.connstr() - ]) + env.pg_bin.run_capture( + ['pgbench', '-S', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) env.flush() @@ -97,5 +85,5 @@ def test_compare_pg_stats_wal_with_pgbench_default(neon_with_baseline: PgCompare with env.record_pg_stats(pg_stats_wal): env.pg_bin.run_capture( - ['pgbench', f'-T{duration}', f'--random-seed={seed}', '-Mprepared', env.pg.connstr()]) + ['pgbench', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) env.flush() diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py index a9124b55cf..d3da0310ce 100644 --- a/test_runner/performance/test_hot_page.py +++ b/test_runner/performance/test_hot_page.py @@ -18,6 +18,7 @@ def test_hot_page(env: PgCompare): with closing(env.pg.connect()) as conn: with conn.cursor() as cur: + cur.execute('drop table if exists t, f;') # Write many updates to the same row with env.record_duration('write'): diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py index 229c56122f..997c772f88 100644 --- a/test_runner/performance/test_hot_table.py +++ b/test_runner/performance/test_hot_table.py @@ -20,6 +20,7 @@ def test_hot_table(env: PgCompare): with closing(env.pg.connect()) as conn: with conn.cursor() as cur: + cur.execute('drop table if exists t;') # Write many updates to a small table with env.record_duration('write'): diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 6ebb6d6ecf..8644ced6d9 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -78,13 +78,11 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int): # Run simple-update workload run_pgbench(env, - "simple-update", - ['pgbench', '-N', '-c4', f'-T{duration}', '-P2', '-Mprepared', env.pg.connstr()]) + "simple-update", ['pgbench', '-N', '-c4', f'-T{duration}', '-P2', env.pg.connstr()]) # Run SELECT workload run_pgbench(env, - "select-only", - ['pgbench', '-S', '-c4', f'-T{duration}', '-P2', '-Mprepared', env.pg.connstr()]) + "select-only", ['pgbench', '-S', '-c4', f'-T{duration}', '-P2', env.pg.connstr()]) env.report_size() 
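The re.finditer change in neon_fixtures.py earlier in this patch (the "fix DeprecationWarning" item) switches the option-parsing pattern to a raw string, so the backslash in '\w' is no longer treated as an unrecognized escape sequence by the Python parser. A minimal, self-contained sketch of the same pattern; the options value below is made up for illustration and is not taken from the tests:

    import re

    # options='-c...' settings as they appear in a libpq-style connection string
    options = "-cstatement_timeout=120s -csynchronous_commit=off"

    # The raw-string literal keeps '\w' intact without relying on Python's lenient
    # handling of unknown escape sequences, which newer Python versions warn about.
    for match in re.finditer(r'-c(\w*)=(\w*)', options):
        key, val = match.group(1), match.group(2)
        print(key, val)  # statement_timeout 120s, then synchronous_commit off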
diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 862b5e1c5e..2a79a778fc 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -116,7 +116,6 @@ def start_pgbench_simple_update_workload(env: PgCompare, duration: int): '-c10', '-N', f'-T{duration}', - '-Mprepared', env.pg.connstr(options="-csynchronous_commit=off") ]) env.flush() From c08fa9d5627ec1d3a9a424484fcb018a0613cae2 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Fri, 24 Jun 2022 22:58:09 +0300 Subject: [PATCH 0487/1022] postgres_ffi/wal_generate: support generating WAL for an already running Postgres server * ensure_server_config() function is added to ensure the server does not have background processes which intervene with WAL generation * Rework command line syntax * Add `print-postgres-config` subcommand which prints the required server configuration --- Cargo.lock | 1 + libs/postgres_ffi/wal_generate/Cargo.toml | 1 + .../wal_generate/src/bin/wal_generate.rs | 118 ++++++++++++------ libs/postgres_ffi/wal_generate/src/lib.rs | 38 ++++-- 4 files changed, 110 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e812ce7eab..1f4cb8f3d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3760,6 +3760,7 @@ dependencies = [ "clap 3.0.14", "env_logger", "log", + "once_cell", "postgres", "tempfile", ] diff --git a/libs/postgres_ffi/wal_generate/Cargo.toml b/libs/postgres_ffi/wal_generate/Cargo.toml index a10671dddd..7edb36937d 100644 --- a/libs/postgres_ffi/wal_generate/Cargo.toml +++ b/libs/postgres_ffi/wal_generate/Cargo.toml @@ -10,5 +10,6 @@ anyhow = "1.0" clap = "3.0" env_logger = "0.9" log = "0.4" +once_cell = "1.8.0" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tempfile = "3.2" diff --git a/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs b/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs index 07ceb31c7f..0da47f32c1 100644 --- a/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs +++ b/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs @@ -1,5 +1,6 @@ use anyhow::*; -use clap::{App, Arg}; +use clap::{App, Arg, ArgMatches}; +use std::str::FromStr; use wal_generate::*; fn main() -> Result<()> { @@ -7,52 +8,91 @@ fn main() -> Result<()> { env_logger::Env::default().default_filter_or("wal_generate=info"), ) .init(); + let type_arg = &Arg::new("type") + .takes_value(true) + .help("Type of WAL to generate") + .possible_values([ + "simple", + "last_wal_record_crossing_segment", + "wal_record_crossing_segment_followed_by_small_one", + ]) + .required(true); let arg_matches = App::new("Postgres WAL generator") .about("Generates Postgres databases with specific WAL properties") - .arg( - Arg::new("datadir") - .short('D') - .long("datadir") - .takes_value(true) - .help("Data directory for the Postgres server") - .required(true) + .subcommand( + App::new("print-postgres-config") + .about("Print the configuration required for PostgreSQL server before running this script") ) - .arg( - Arg::new("pg-distrib-dir") - .long("pg-distrib-dir") - .takes_value(true) - .help("Directory with Postgres distribution (bin and lib directories, e.g. 
tmp_install)") - .default_value("/usr/local") + .subcommand( + App::new("with-initdb") + .about("Generate WAL in a new data directory first initialized with initdb") + .arg(type_arg) + .arg( + Arg::new("datadir") + .takes_value(true) + .help("Data directory for the Postgres server") + .required(true) + ) + .arg( + Arg::new("pg-distrib-dir") + .long("pg-distrib-dir") + .takes_value(true) + .help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)") + .default_value("/usr/local") + ) ) - .arg( - Arg::new("type") - .long("type") - .takes_value(true) - .help("Type of WAL to generate") - .possible_values(["simple", "last_wal_record_crossing_segment", "wal_record_crossing_segment_followed_by_small_one"]) - .required(true) + .subcommand( + App::new("in-existing") + .about("Generate WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.") + .arg(type_arg) + .arg( + Arg::new("connection") + .takes_value(true) + .help("Connection string to the Postgres database to populate") + .required(true) + ) ) .get_matches(); - let cfg = Conf { - pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(), - datadir: arg_matches.value_of("datadir").unwrap().into(), + let wal_generate = |arg_matches: &ArgMatches, client| { + let lsn = match arg_matches.value_of("type").unwrap() { + "simple" => generate_simple(client)?, + "last_wal_record_crossing_segment" => { + generate_last_wal_record_crossing_segment(client)? + } + "wal_record_crossing_segment_followed_by_small_one" => { + generate_wal_record_crossing_segment_followed_by_small_one(client)? + } + a => panic!("Unknown --type argument: {}", a), + }; + println!("end_of_wal = {}", lsn); + Ok(()) }; - cfg.initdb()?; - let mut srv = cfg.start_server()?; - let lsn = match arg_matches.value_of("type").unwrap() { - "simple" => generate_simple(&mut srv.connect_with_timeout()?)?, - "last_wal_record_crossing_segment" => { - generate_last_wal_record_crossing_segment(&mut srv.connect_with_timeout()?)? + + match arg_matches.subcommand() { + None => panic!("No subcommand provided"), + Some(("print-postgres-config", _)) => { + for cfg in REQUIRED_POSTGRES_CONFIG.iter() { + println!("{}", cfg); + } + Ok(()) } - "wal_record_crossing_segment_followed_by_small_one" => { - generate_wal_record_crossing_segment_followed_by_small_one( - &mut srv.connect_with_timeout()?, - )? + Some(("with-initdb", arg_matches)) => { + let cfg = Conf { + pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(), + datadir: arg_matches.value_of("datadir").unwrap().into(), + }; + cfg.initdb()?; + let mut srv = cfg.start_server()?; + wal_generate(arg_matches, &mut srv.connect_with_timeout()?)?; + srv.kill(); + Ok(()) } - a => panic!("Unknown --type argument: {}", a), - }; - println!("end_of_wal = {}", lsn); - srv.kill(); - Ok(()) + Some(("in-existing", arg_matches)) => wal_generate( + arg_matches, + &mut postgres::Config::from_str(arg_matches.value_of("connection").unwrap())? 
+ .connect(postgres::NoTls)?, + ), + Some(_) => panic!("Unknown subcommand"), + } } diff --git a/libs/postgres_ffi/wal_generate/src/lib.rs b/libs/postgres_ffi/wal_generate/src/lib.rs index 2b3f5ef703..78ce320515 100644 --- a/libs/postgres_ffi/wal_generate/src/lib.rs +++ b/libs/postgres_ffi/wal_generate/src/lib.rs @@ -1,6 +1,7 @@ use anyhow::*; use core::time::Duration; use log::*; +use once_cell::sync::Lazy; use postgres::types::PgLsn; use postgres::Client; use std::cmp::Ordering; @@ -22,6 +23,16 @@ pub struct PostgresServer { client_config: postgres::Config, } +pub static REQUIRED_POSTGRES_CONFIG: Lazy> = Lazy::new(|| { + vec![ + "wal_keep_size=50MB", // Ensure old WAL is not removed + "shared_preload_libraries=neon", // can only be loaded at startup + // Disable background processes as much as possible + "wal_writer_delay=10s", + "autovacuum=off", + ] +}); + impl Conf { fn pg_bin_dir(&self) -> PathBuf { self.pg_distrib_dir.join("bin") @@ -85,12 +96,8 @@ impl Conf { .arg(unix_socket_dir_path.as_os_str()) .arg("-D") .arg(self.datadir.as_os_str()) - .args(&["-c", "wal_keep_size=50MB"]) // Ensure old WAL is not removed .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output - .args(&["-c", "shared_preload_libraries=neon"]) // can only be loaded at startup - // Disable background processes as much as possible - .args(&["-c", "wal_writer_delay=10s"]) - .args(&["-c", "autovacuum=off"]) + .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg])) .stderr(Stdio::from(log_file)) .spawn()?; let server = PostgresServer { @@ -181,12 +188,16 @@ pub trait PostgresClientExt: postgres::GenericClient { impl PostgresClientExt for C {} -fn generate_internal( - client: &mut C, - f: impl Fn(&mut C, PgLsn) -> Result>, -) -> Result { +pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result<()> { client.execute("create extension if not exists neon_test_utils", &[])?; + let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0); + ensure!(wal_keep_size == "50MB"); + let wal_writer_delay: String = client.query_one("SHOW wal_writer_delay", &[])?.get(0); + ensure!(wal_writer_delay == "10s"); + let autovacuum: String = client.query_one("SHOW autovacuum", &[])?.get(0); + ensure!(autovacuum == "off"); + let wal_segment_size = client.query_one( "select cast(setting as bigint) as setting, unit \ from pg_settings where name = 'wal_segment_size'", @@ -201,6 +212,15 @@ fn generate_internal( "Unexpected wal_segment_size in bytes" ); + Ok(()) +} + +fn generate_internal( + client: &mut C, + f: impl Fn(&mut C, PgLsn) -> Result>, +) -> Result { + ensure_server_config(client)?; + let initial_lsn = client.pg_current_wal_insert_lsn()?; info!("LSN initial = {}", initial_lsn); From bcdee3d3b54eb03e71febc6eac88f6b77b91a4d3 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Sat, 25 Jun 2022 00:38:30 +0300 Subject: [PATCH 0488/1022] test_runner: add test_crafted_wal_end.py For some reason both non-`simple` tests spend about 10 seconds in the post-restart `INSERT INTO` query on my machine, see #2023 --- .../batch_others/test_crafted_wal_end.py | 61 +++++++ test_runner/fixtures/neon_fixtures.py | 159 +++++++++++------- 2 files changed, 157 insertions(+), 63 deletions(-) create mode 100644 test_runner/batch_others/test_crafted_wal_end.py diff --git a/test_runner/batch_others/test_crafted_wal_end.py b/test_runner/batch_others/test_crafted_wal_end.py new file mode 100644 index 0000000000..1ddaadfae9 --- /dev/null +++ 
b/test_runner/batch_others/test_crafted_wal_end.py @@ -0,0 +1,61 @@ +from fixtures.neon_fixtures import NeonEnvBuilder, WalGenerate +from fixtures.log_helper import log +import pytest + +# Restart nodes with WAL end having specially crafted shape, like last record +# crossing segment boundary, to test decoding issues. + + +@pytest.mark.parametrize('wal_type', + [ + 'simple', + 'last_wal_record_crossing_segment', + 'wal_record_crossing_segment_followed_by_small_one', + ]) +def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + env.neon_cli.create_branch('test_crafted_wal_end') + + pg = env.postgres.create('test_crafted_wal_end') + gen = WalGenerate(env) + pg.config(gen.postgres_config()) + pg.start() + res = pg.safe_psql_many(queries=[ + 'CREATE TABLE keys(key int primary key)', + 'INSERT INTO keys SELECT generate_series(1, 100)', + 'SELECT SUM(key) FROM keys' + ]) + assert res[-1][0] == (5050, ) + + gen.in_existing(wal_type, pg.connstr()) + + log.info("Restarting all safekeepers and pageservers") + env.pageserver.stop() + env.safekeepers[0].stop() + env.safekeepers[0].start() + env.pageserver.start() + + log.info("Trying more queries") + res = pg.safe_psql_many(queries=[ + 'SELECT SUM(key) FROM keys', + 'INSERT INTO keys SELECT generate_series(101, 200)', + 'SELECT SUM(key) FROM keys', + ]) + assert res[0][0] == (5050, ) + assert res[-1][0] == (20100, ) + + log.info("Restarting all safekeepers and pageservers (again)") + env.pageserver.stop() + env.safekeepers[0].stop() + env.safekeepers[0].start() + env.pageserver.start() + + log.info("Trying more queries (again)") + res = pg.safe_psql_many(queries=[ + 'SELECT SUM(key) FROM keys', + 'INSERT INTO keys SELECT generate_series(201, 300)', + 'SELECT SUM(key) FROM keys', + ]) + assert res[0][0] == (20100, ) + assert res[-1][0] == (45150, ) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index be0322d418..d91ea398f9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4,6 +4,7 @@ from dataclasses import field from enum import Flag, auto import textwrap from cached_property import cached_property +import abc import asyncpg import os import boto3 @@ -908,14 +909,89 @@ TIMELINE_DATA_EXTRACTOR = re.compile(r"\s(?P[^\s]+)\s\[(?P 'subprocess.CompletedProcess[str]': + """ + Run the command with the specified arguments. + + Arguments must be in list form, e.g. ['pg', 'create'] + + Return both stdout and stderr, which can be accessed as + + >>> result = env.neon_cli.raw_cli(...) + >>> assert result.stderr == "" + >>> log.info(result.stdout) + + If `check_return_code`, on non-zero exit code logs failure and raises. 
+ """ + + assert type(arguments) == list + assert type(self.COMMAND) == str + + bin_neon = os.path.join(str(neon_binpath), self.COMMAND) + + args = [bin_neon] + arguments + log.info('Running command "{}"'.format(' '.join(args))) + log.info(f'Running in "{self.env.repo_dir}"') + + env_vars = os.environ.copy() + env_vars['NEON_REPO_DIR'] = str(self.env.repo_dir) + env_vars['POSTGRES_DISTRIB_DIR'] = str(pg_distrib_dir) + if self.env.rust_log_override is not None: + env_vars['RUST_LOG'] = self.env.rust_log_override + for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): + env_vars[extra_env_key] = extra_env_value + + # Pass coverage settings + var = 'LLVM_PROFILE_FILE' + val = os.environ.get(var) + if val: + env_vars[var] = val + + # Intercept CalledProcessError and print more info + res = subprocess.run(args, + env=env_vars, + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if not res.returncode: + log.info(f"Run success: {res.stdout}") + elif check_return_code: + # this way command output will be in recorded and shown in CI in failure message + msg = f"""\ + Run {res.args} failed: + stdout: {res.stdout} + stderr: {res.stderr} + """ + log.info(msg) + raise Exception(msg) from subprocess.CalledProcessError(res.returncode, + res.args, + res.stdout, + res.stderr) + return res + + +class NeonCli(AbstractNeonCli): """ A typed wrapper around the `neon` CLI tool. Supports main commands via typed methods and a way to run arbitrary command directly via CLI. """ - def __init__(self, env: NeonEnv): - self.env = env - pass + + COMMAND = 'neon_local' def create_tenant(self, tenant_id: Optional[uuid.UUID] = None, @@ -1186,69 +1262,26 @@ class NeonCli: return self.raw_cli(args, check_return_code=check_return_code) - def raw_cli(self, - arguments: List[str], - extra_env_vars: Optional[Dict[str, str]] = None, - check_return_code=True) -> 'subprocess.CompletedProcess[str]': - """ - Run "neon" with the specified arguments. - Arguments must be in list form, e.g. ['pg', 'create'] +class WalGenerate(AbstractNeonCli): + """ + A typed wrapper around the `wal_generate` CLI tool. + Supports main commands via typed methods and a way to run arbitrary command directly via CLI. + """ - Return both stdout and stderr, which can be accessed as + COMMAND = 'wal_generate' - >>> result = env.neon_cli.raw_cli(...) - >>> assert result.stderr == "" - >>> log.info(result.stdout) + def postgres_config(self) -> List[str]: + res = self.raw_cli(["print-postgres-config"]) + res.check_returncode() + return res.stdout.split('\n') - If `check_return_code`, on non-zero exit code logs failure and raises. 
- """ - - assert type(arguments) == list - - bin_neon = os.path.join(str(neon_binpath), 'neon_local') - - args = [bin_neon] + arguments - log.info('Running command "{}"'.format(' '.join(args))) - log.info(f'Running in "{self.env.repo_dir}"') - - env_vars = os.environ.copy() - env_vars['NEON_REPO_DIR'] = str(self.env.repo_dir) - env_vars['POSTGRES_DISTRIB_DIR'] = str(pg_distrib_dir) - if self.env.rust_log_override is not None: - env_vars['RUST_LOG'] = self.env.rust_log_override - for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): - env_vars[extra_env_key] = extra_env_value - - # Pass coverage settings - var = 'LLVM_PROFILE_FILE' - val = os.environ.get(var) - if val: - env_vars[var] = val - - # Intercept CalledProcessError and print more info - res = subprocess.run(args, - env=env_vars, - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - if not res.returncode: - log.info(f"Run success: {res.stdout}") - elif check_return_code: - # this way command output will be in recorded and shown in CI in failure message - msg = f"""\ - Run {res.args} failed: - stdout: {res.stdout} - stderr: {res.stderr} - """ - log.info(msg) - raise Exception(msg) from subprocess.CalledProcessError(res.returncode, - res.args, - res.stdout, - res.stderr) - - return res + def in_existing(self, type: str, connection: str) -> int: + res = self.raw_cli(["in-existing", type, connection]) + res.check_returncode() + m = re.fullmatch(r'end_of_wal = (.*)\n', res.stdout) + assert m + return lsn_from_hex(m.group(1)) class NeonPageserver(PgProtocol): From 52f445094aa722dff377b36995b9562f752302cf Mon Sep 17 00:00:00 2001 From: MMeent Date: Fri, 8 Jul 2022 14:51:44 +0200 Subject: [PATCH 0489/1022] Update vendor/postgres to 14.4 (#2049) Co-authored-by: Matthias van de Meent --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 35ad142301..9c99008445 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 35ad142301bde7982aadae5403e9524bf5a7cce1 +Subproject commit 9c99008445dbccd8204f188e0933def507058eac From 85bda437de51ba568203ca0989919bb5050be4fd Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Sat, 25 Jun 2022 00:53:45 +0300 Subject: [PATCH 0490/1022] postgres_ffi/wal_generate: add last_wal_record_xlog_switch and use it in tests Fix #1190: WalDecoder did not return correct LSN of the next record after processing a XLOG_SWITCH record --- libs/postgres_ffi/src/waldecoder.rs | 8 ++++---- .../wal_generate/src/bin/wal_generate.rs | 2 ++ libs/postgres_ffi/wal_generate/src/lib.rs | 19 +++++++++++++++++++ .../batch_others/test_crafted_wal_end.py | 1 + 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index 91542d268f..f7bd70653c 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -226,10 +226,10 @@ impl WalStreamDecoder { self.padlen = self.lsn.calc_padding(8u32) as u32; } - // Always align resulting LSN on 0x8 boundary -- that is important for getPage() - // and WalReceiver integration. Since this code is used both for WalReceiver and - // initial WAL import let's force alignment right here. - let result = (self.lsn.align(), recordbuf); + // We should return LSN of the next record, not the last byte of this record or + // the byte immediately after. 
Note that this handles both XLOG_SWITCH and usual + // records, the former "spans" until the next WAL segment (see test_xlog_switch). + let result = (self.lsn + self.padlen as u64, recordbuf); Ok(Some(result)) } } diff --git a/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs b/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs index 0da47f32c1..6ed34caf28 100644 --- a/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs +++ b/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs @@ -13,6 +13,7 @@ fn main() -> Result<()> { .help("Type of WAL to generate") .possible_values([ "simple", + "last_wal_record_xlog_switch", "last_wal_record_crossing_segment", "wal_record_crossing_segment_followed_by_small_one", ]) @@ -57,6 +58,7 @@ fn main() -> Result<()> { let wal_generate = |arg_matches: &ArgMatches, client| { let lsn = match arg_matches.value_of("type").unwrap() { "simple" => generate_simple(client)?, + "last_wal_record_xlog_switch" => generate_last_wal_record_xlog_switch(client)?, "last_wal_record_crossing_segment" => { generate_last_wal_record_crossing_segment(client)? } diff --git a/libs/postgres_ffi/wal_generate/src/lib.rs b/libs/postgres_ffi/wal_generate/src/lib.rs index 78ce320515..01639ccfff 100644 --- a/libs/postgres_ffi/wal_generate/src/lib.rs +++ b/libs/postgres_ffi/wal_generate/src/lib.rs @@ -250,6 +250,25 @@ pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result Result { + // Do not use generate_internal because here we end up with flush_lsn exactly on + // the segment boundary and insert_lsn after the initial page header, which is unusual. + ensure_server_config(client)?; + + client.execute("CREATE table t(x int)", &[])?; + let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + let next_segment = PgLsn::from(0x0200_0000); + ensure!( + after_xlog_switch <= next_segment, + "XLOG_SWITCH message ended after the expected segment boundary: {} > {}", + after_xlog_switch, + next_segment + ); + Ok(next_segment) +} + fn generate_single_logical_message( client: &mut impl postgres::GenericClient, transactional: bool, diff --git a/test_runner/batch_others/test_crafted_wal_end.py b/test_runner/batch_others/test_crafted_wal_end.py index 1ddaadfae9..c4674f802e 100644 --- a/test_runner/batch_others/test_crafted_wal_end.py +++ b/test_runner/batch_others/test_crafted_wal_end.py @@ -9,6 +9,7 @@ import pytest @pytest.mark.parametrize('wal_type', [ 'simple', + 'last_wal_record_xlog_switch', 'last_wal_record_crossing_segment', 'wal_record_crossing_segment_followed_by_small_one', ]) From 80b7a3b51afe2bd8895b16c9caba840a9aa65124 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Sat, 25 Jun 2022 02:16:50 +0300 Subject: [PATCH 0491/1022] Test what happens when XLOG_SWITCH ends on page boundary, fix #1991 --- Cargo.lock | 1 + libs/postgres_ffi/src/waldecoder.rs | 21 +++--- libs/postgres_ffi/wal_generate/Cargo.toml | 1 + .../wal_generate/src/bin/wal_generate.rs | 4 ++ libs/postgres_ffi/wal_generate/src/lib.rs | 67 +++++++++++++++++++ .../batch_others/test_crafted_wal_end.py | 1 + 6 files changed, 85 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1f4cb8f3d7..6924c0c74a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3762,6 +3762,7 @@ dependencies = [ "log", "once_cell", "postgres", + "postgres_ffi", "tempfile", ] diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index f7bd70653c..7a69f471d9 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ 
-82,7 +82,17 @@ impl WalStreamDecoder { // that cross page boundaries. loop { // parse and verify page boundaries as we go - if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 { + if self.padlen > 0 { + // We should first skip padding, as we may have to skip some page headers if we're processing the XLOG_SWITCH record. + if self.inputbuf.remaining() < self.padlen as usize { + return Ok(None); + } + + // skip padding + self.inputbuf.advance(self.padlen as usize); + self.lsn += self.padlen as u64; + self.padlen = 0; + } else if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 { // parse long header if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD { @@ -128,15 +138,6 @@ impl WalStreamDecoder { self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; continue; - } else if self.padlen > 0 { - if self.inputbuf.remaining() < self.padlen as usize { - return Ok(None); - } - - // skip padding - self.inputbuf.advance(self.padlen as usize); - self.lsn += self.padlen as u64; - self.padlen = 0; } else if self.contlen == 0 { assert!(self.recordbuf.is_empty()); diff --git a/libs/postgres_ffi/wal_generate/Cargo.toml b/libs/postgres_ffi/wal_generate/Cargo.toml index 7edb36937d..ce1a60c4f8 100644 --- a/libs/postgres_ffi/wal_generate/Cargo.toml +++ b/libs/postgres_ffi/wal_generate/Cargo.toml @@ -12,4 +12,5 @@ env_logger = "0.9" log = "0.4" once_cell = "1.8.0" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres_ffi = { path = "../" } tempfile = "3.2" diff --git a/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs b/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs index 6ed34caf28..1549bfb505 100644 --- a/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs +++ b/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs @@ -14,6 +14,7 @@ fn main() -> Result<()> { .possible_values([ "simple", "last_wal_record_xlog_switch", + "last_wal_record_xlog_switch_ends_on_page_boundary", "last_wal_record_crossing_segment", "wal_record_crossing_segment_followed_by_small_one", ]) @@ -59,6 +60,9 @@ fn main() -> Result<()> { let lsn = match arg_matches.value_of("type").unwrap() { "simple" => generate_simple(client)?, "last_wal_record_xlog_switch" => generate_last_wal_record_xlog_switch(client)?, + "last_wal_record_xlog_switch_ends_on_page_boundary" => { + generate_last_wal_record_xlog_switch_ends_on_page_boundary(client)? + } "last_wal_record_crossing_segment" => { generate_last_wal_record_crossing_segment(client)? } diff --git a/libs/postgres_ffi/wal_generate/src/lib.rs b/libs/postgres_ffi/wal_generate/src/lib.rs index 01639ccfff..ac6fcc441d 100644 --- a/libs/postgres_ffi/wal_generate/src/lib.rs +++ b/libs/postgres_ffi/wal_generate/src/lib.rs @@ -4,6 +4,9 @@ use log::*; use once_cell::sync::Lazy; use postgres::types::PgLsn; use postgres::Client; +use postgres_ffi::xlog_utils::{ + XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; use std::cmp::Ordering; use std::fs; use std::path::{Path, PathBuf}; @@ -269,6 +272,70 @@ pub fn generate_last_wal_record_xlog_switch( Ok(next_segment) } +pub fn generate_last_wal_record_xlog_switch_ends_on_page_boundary( + client: &mut impl postgres::GenericClient, +) -> Result { + // Do not use generate_internal because here we end up with flush_lsn exactly on + // the segment boundary and insert_lsn after the initial page header, which is unusual. 
+ ensure_server_config(client)?; + + client.execute("CREATE table t(x int)", &[])?; + + // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. + // We will use logical message as the padding. We start with detecting how much WAL + // it takes for one logical message, considering all alignments and headers. + let base_wal_advance = { + let before_lsn = client.pg_current_wal_insert_lsn()?; + // Small non-empty message bigger than few bytes is more likely than an empty + // message to have the same format as the big padding message. + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))", + &[], + )?; + // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD. + (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize + + XLOG_SIZE_OF_XLOG_RECORD + }; + let mut remaining_lsn = + XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ; + if remaining_lsn < base_wal_advance { + remaining_lsn += XLOG_BLCKSZ; + } + let repeats = 10 + remaining_lsn - base_wal_advance; + info!( + "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}", + client.pg_current_wal_insert_lsn()?, + remaining_lsn, + base_wal_advance, + repeats + ); + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", + &[&(repeats as i32)], + )?; + info!( + "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", + client.pg_current_wal_insert_lsn()?, + XLOG_SIZE_OF_XLOG_RECORD + ); + + // Emit the XLOG_SWITCH + let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + let next_segment = PgLsn::from(0x0200_0000); + ensure!( + after_xlog_switch < next_segment, + "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}", + after_xlog_switch, + next_segment + ); + ensure!( + u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, + "XLOG_SWITCH message ended not on page boundary: {}", + after_xlog_switch + ); + Ok(next_segment) +} + fn generate_single_logical_message( client: &mut impl postgres::GenericClient, transactional: bool, diff --git a/test_runner/batch_others/test_crafted_wal_end.py b/test_runner/batch_others/test_crafted_wal_end.py index c4674f802e..945dfffe4f 100644 --- a/test_runner/batch_others/test_crafted_wal_end.py +++ b/test_runner/batch_others/test_crafted_wal_end.py @@ -10,6 +10,7 @@ import pytest [ 'simple', 'last_wal_record_xlog_switch', + 'last_wal_record_xlog_switch_ends_on_page_boundary', 'last_wal_record_crossing_segment', 'wal_record_crossing_segment_followed_by_small_one', ]) From 1f5918b36d16a85ffd379d009ffe1d89023562f8 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Fri, 8 Jul 2022 10:29:29 -0400 Subject: [PATCH 0492/1022] Delay calculating the starting LSN when doing timeline branching (#2053) Previously, upon branching, if no starting LSN is specified, we determine the start LSN based on the source timeline's last record LSN in `timelines::create_timeline` function, which then calls `Repository::branch_timeline` to create the timeline. Inside the `LayeredRepository::branch_timeline` function, to start branching, we try to acquire a GC lock to prevent GC from removing data needed for the new timeline. However, a GC iteration takes time, so the GC lock can be held for a long period of time. As a result, the previously determined starting LSN can become invalid because of GC. 
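A toy model of the ordering problem described above, in plain Python rather than the pageserver's Rust code; the names and LSN values are invented for illustration only:

    import threading
    from typing import Optional

    gc_lock = threading.Lock()
    gc_cutoff_lsn = 0        # how far GC has already trimmed history (toy value)
    last_record_lsn = 150    # stand-in for the source timeline's end of WAL

    def gc_iteration(new_cutoff: int) -> None:
        # GC holds the lock for the whole iteration, which can take a long time.
        global gc_cutoff_lsn
        with gc_lock:
            gc_cutoff_lsn = new_cutoff

    def branch_timeline(start_lsn: Optional[int] = None) -> int:
        with gc_lock:
            # Resolving the default branch point only here, under the lock, is the key:
            # a value computed before acquiring the lock could already be below
            # gc_cutoff_lsn by the time the lock is finally obtained.
            if start_lsn is None:
                start_lsn = last_record_lsn
            if start_lsn < gc_cutoff_lsn:
                raise ValueError("invalid branch start lsn")
            return start_lsn

With the default resolved under the lock, a concurrent GC iteration can only run before or after the branch point is chosen, never between choosing it and validating it.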
This PR fixes the above issue by delaying the LSN calculation part and moving it to be inside `LayeredRepository::branch_timeline` function. --- pageserver/src/layered_repository.rs | 19 +++++++++++++++--- pageserver/src/repository.rs | 19 +++++++++++------- pageserver/src/timelines.rs | 30 +++++++++++----------------- 3 files changed, 40 insertions(+), 28 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 55f01ee962..392aa52844 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -270,7 +270,12 @@ impl Repository for LayeredRepository { } /// Branch a timeline - fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()> { + fn branch_timeline( + &self, + src: ZTimelineId, + dst: ZTimelineId, + start_lsn: Option, + ) -> Result<()> { // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn // about timelines, so otherwise a race condition is possible, where we create new timeline and GC // concurrently removes data that is needed by the new timeline. @@ -283,6 +288,14 @@ impl Repository for LayeredRepository { .context("failed to load timeline for branching")? .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {}", &src))?; let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); + + // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN + let start_lsn = start_lsn.unwrap_or_else(|| { + let lsn = src_timeline.get_last_record_lsn(); + info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}"); + lsn + }); + src_timeline .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) .context("invalid branch start lsn")?; @@ -2874,7 +2887,7 @@ pub mod tests { let mut tline_id = TIMELINE_ID; for _ in 0..50 { let new_tline_id = ZTimelineId::generate(); - repo.branch_timeline(tline_id, new_tline_id, lsn)?; + repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?; tline = repo.get_timeline_load(new_tline_id)?; tline_id = new_tline_id; @@ -2933,7 +2946,7 @@ pub mod tests { #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { let new_tline_id = ZTimelineId::generate(); - repo.branch_timeline(tline_id, new_tline_id, lsn)?; + repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?; tline = repo.get_timeline_load(new_tline_id)?; tline_id = new_tline_id; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 5b28681b16..17e8806899 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -211,7 +211,12 @@ pub trait Repository: Send + Sync { ) -> Result>; /// Branch a timeline - fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>; + fn branch_timeline( + &self, + src: ZTimelineId, + dst: ZTimelineId, + start_lsn: Option, + ) -> Result<()>; /// Flush all data to disk. 
/// @@ -662,7 +667,7 @@ mod tests { //assert_current_logical_size(&tline, Lsn(0x40)); // Branch the history, modify relation differently on the new timeline - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; let newtline = repo .get_timeline_load(NEW_TIMELINE_ID) .expect("Should have a local timeline"); @@ -744,7 +749,7 @@ mod tests { repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; // try to branch at lsn 25, should fail because we already garbage collected the data - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { + match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(err.to_string().contains("invalid branch start lsn")); @@ -765,7 +770,7 @@ mod tests { repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { + match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(&err.to_string().contains("invalid branch start lsn")); @@ -810,7 +815,7 @@ mod tests { let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = repo .get_timeline_load(NEW_TIMELINE_ID) .expect("Should have a local timeline"); @@ -826,7 +831,7 @@ mod tests { let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = repo .get_timeline_load(NEW_TIMELINE_ID) .expect("Should have a local timeline"); @@ -884,7 +889,7 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x20))?; tline.checkpoint(CheckpointConfig::Forced)?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = repo .get_timeline_load(NEW_TIMELINE_ID) diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index e0e79e4166..a40e705cb9 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -347,7 +347,7 @@ pub(crate) fn create_timeline( tenant_id: ZTenantId, new_timeline_id: Option, ancestor_timeline_id: Option, - ancestor_start_lsn: Option, + mut ancestor_start_lsn: Option, ) -> Result> { let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; @@ -357,41 +357,35 @@ pub(crate) fn create_timeline( return Ok(None); } - let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0)); - let new_timeline_info = match ancestor_timeline_id { Some(ancestor_timeline_id) => { let ancestor_timeline = repo .get_timeline_load(ancestor_timeline_id) .context("Cannot branch off the timeline that's not present locally")?; - if start_lsn == Lsn(0) { - // Find end of WAL on the old timeline - let end_of_wal = ancestor_timeline.get_last_record_lsn(); - info!("branching at end of WAL: {}", end_of_wal); - start_lsn = end_of_wal; - } else { + if let Some(lsn) = ancestor_start_lsn.as_mut() { // Wait for the WAL to arrive and be processed on 
the parent branch up // to the requested branch point. The repository code itself doesn't // require it, but if we start to receive WAL on the new timeline, // decoding the new WAL might need to look up previous pages, relation // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. - ancestor_timeline.wait_lsn(start_lsn)?; - } - start_lsn = start_lsn.align(); + *lsn = lsn.align(); + ancestor_timeline.wait_lsn(*lsn)?; - let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); - if ancestor_ancestor_lsn > start_lsn { - // can we safely just branch from the ancestor instead? - anyhow::bail!( + let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); + if ancestor_ancestor_lsn > *lsn { + // can we safely just branch from the ancestor instead? + anyhow::bail!( "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", - start_lsn, + lsn, ancestor_timeline_id, ancestor_ancestor_lsn, ); + } } - repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?; + + repo.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?; // load the timeline into memory let loaded_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; From 60e5dc10e6df9e768efc7a982b20b2140316750e Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Fri, 8 Jul 2022 15:19:53 +0300 Subject: [PATCH 0493/1022] postgres_ffi/wal_generate: use 'craft' instead of 'generate' It does very fine-tuned byte-to-byte WAL crafting, not a sloppy generation. Hence 'craft' sounds like a better description. --- Cargo.lock | 4 ++-- libs/postgres_ffi/Cargo.toml | 2 +- libs/postgres_ffi/src/xlog_utils.rs | 10 ++++---- .../{wal_generate => wal_craft}/Cargo.toml | 2 +- .../src/bin/wal_craft.rs} | 24 +++++++++---------- .../{wal_generate => wal_craft}/src/lib.rs | 20 ++++++++-------- .../batch_others/test_crafted_wal_end.py | 8 +++---- test_runner/fixtures/neon_fixtures.py | 6 ++--- 8 files changed, 37 insertions(+), 39 deletions(-) rename libs/postgres_ffi/{wal_generate => wal_craft}/Cargo.toml (94%) rename libs/postgres_ffi/{wal_generate/src/bin/wal_generate.rs => wal_craft/src/bin/wal_craft.rs} (81%) rename libs/postgres_ffi/{wal_generate => wal_craft}/src/lib.rs (96%) diff --git a/Cargo.lock b/Cargo.lock index 6924c0c74a..4f1d727ae1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2151,7 +2151,7 @@ dependencies = [ "serde", "thiserror", "utils", - "wal_generate", + "wal_craft", "workspace_hack", ] @@ -3753,7 +3753,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] -name = "wal_generate" +name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 129c93cf6d..c9cc858ab9 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -23,7 +23,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" } [dev-dependencies] env_logger = "0.9" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -wal_generate = { path = "wal_generate" } +wal_craft = { path = "wal_craft" } [build-dependencies] bindgen = "0.59.1" diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 67541d844e..b707b10fc8 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -597,7 +597,7 @@ 
mod tests { fn init_logging() { let _ = env_logger::Builder::from_env( env_logger::Env::default() - .default_filter_or("wal_generate=info,postgres_ffi::xlog_utils=trace"), + .default_filter_or("wal_craft=info,postgres_ffi::xlog_utils=trace"), ) .is_test(true) .try_init(); @@ -609,7 +609,7 @@ mod tests { expected_end_of_wal_non_partial: Lsn, last_segment: &str, ) { - use wal_generate::*; + use wal_craft::*; // 1. Generate some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("..") @@ -683,7 +683,7 @@ mod tests { init_logging(); test_end_of_wal( "test_find_end_of_wal_simple", - wal_generate::generate_simple, + wal_craft::generate_simple, "0/2000000".parse::().unwrap(), "000000010000000000000001", ); @@ -694,7 +694,7 @@ mod tests { init_logging(); test_end_of_wal( "test_find_end_of_wal_crossing_segment_followed_by_small_one", - wal_generate::generate_wal_record_crossing_segment_followed_by_small_one, + wal_craft::generate_wal_record_crossing_segment_followed_by_small_one, "0/3000000".parse::().unwrap(), "000000010000000000000002", ); @@ -706,7 +706,7 @@ mod tests { init_logging(); test_end_of_wal( "test_find_end_of_wal_last_crossing_segment", - wal_generate::generate_last_wal_record_crossing_segment, + wal_craft::generate_last_wal_record_crossing_segment, "0/3000000".parse::().unwrap(), "000000010000000000000002", ); diff --git a/libs/postgres_ffi/wal_generate/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml similarity index 94% rename from libs/postgres_ffi/wal_generate/Cargo.toml rename to libs/postgres_ffi/wal_craft/Cargo.toml index ce1a60c4f8..374c8e2e55 100644 --- a/libs/postgres_ffi/wal_generate/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "wal_generate" +name = "wal_craft" version = "0.1.0" edition = "2021" diff --git a/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs similarity index 81% rename from libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs rename to libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 1549bfb505..8297ad4391 100644 --- a/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -1,16 +1,14 @@ use anyhow::*; use clap::{App, Arg, ArgMatches}; use std::str::FromStr; -use wal_generate::*; +use wal_craft::*; fn main() -> Result<()> { - env_logger::Builder::from_env( - env_logger::Env::default().default_filter_or("wal_generate=info"), - ) - .init(); + env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("wal_craft=info")) + .init(); let type_arg = &Arg::new("type") .takes_value(true) - .help("Type of WAL to generate") + .help("Type of WAL to craft") .possible_values([ "simple", "last_wal_record_xlog_switch", @@ -19,15 +17,15 @@ fn main() -> Result<()> { "wal_record_crossing_segment_followed_by_small_one", ]) .required(true); - let arg_matches = App::new("Postgres WAL generator") - .about("Generates Postgres databases with specific WAL properties") + let arg_matches = App::new("Postgres WAL crafter") + .about("Crafts Postgres databases with specific WAL properties") .subcommand( App::new("print-postgres-config") .about("Print the configuration required for PostgreSQL server before running this script") ) .subcommand( App::new("with-initdb") - .about("Generate WAL in a new data directory first initialized with initdb") + .about("Craft WAL in a new data directory first initialized with initdb") .arg(type_arg) .arg( Arg::new("datadir") @@ -45,7 +43,7 @@ fn main() -> 
Result<()> { ) .subcommand( App::new("in-existing") - .about("Generate WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.") + .about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.") .arg(type_arg) .arg( Arg::new("connection") @@ -56,7 +54,7 @@ fn main() -> Result<()> { ) .get_matches(); - let wal_generate = |arg_matches: &ArgMatches, client| { + let wal_craft = |arg_matches: &ArgMatches, client| { let lsn = match arg_matches.value_of("type").unwrap() { "simple" => generate_simple(client)?, "last_wal_record_xlog_switch" => generate_last_wal_record_xlog_switch(client)?, @@ -90,11 +88,11 @@ fn main() -> Result<()> { }; cfg.initdb()?; let mut srv = cfg.start_server()?; - wal_generate(arg_matches, &mut srv.connect_with_timeout()?)?; + wal_craft(arg_matches, &mut srv.connect_with_timeout()?)?; srv.kill(); Ok(()) } - Some(("in-existing", arg_matches)) => wal_generate( + Some(("in-existing", arg_matches)) => wal_craft( arg_matches, &mut postgres::Config::from_str(arg_matches.value_of("connection").unwrap())? .connect(postgres::NoTls)?, diff --git a/libs/postgres_ffi/wal_generate/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs similarity index 96% rename from libs/postgres_ffi/wal_generate/src/lib.rs rename to libs/postgres_ffi/wal_craft/src/lib.rs index ac6fcc441d..bd87ab2a19 100644 --- a/libs/postgres_ffi/wal_generate/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -218,7 +218,7 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result Ok(()) } -fn generate_internal( +fn craft_internal( client: &mut C, f: impl Fn(&mut C, PgLsn) -> Result>, ) -> Result { @@ -230,7 +230,7 @@ fn generate_internal( let last_lsn = match f(client, initial_lsn)? { None => client.pg_current_wal_insert_lsn()?, Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) { - Ordering::Less => bail!("Some records were inserted after the generated WAL"), + Ordering::Less => bail!("Some records were inserted after the crafted WAL"), Ordering::Equal => last_lsn, Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"), }, @@ -239,7 +239,7 @@ fn generate_internal( // Some records may be not flushed, e.g. non-transactional logical messages. client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?; match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) { - Ordering::Less => bail!("Some records were flushed after the generated WAL"), + Ordering::Less => bail!("Some records were flushed after the crafted WAL"), Ordering::Equal => {} Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"), } @@ -247,7 +247,7 @@ fn generate_internal( } pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result { - generate_internal(client, |client, _| { + craft_internal(client, |client, _| { client.execute("CREATE table t(x int)", &[])?; Ok(None) }) @@ -256,7 +256,7 @@ pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result Result { - // Do not use generate_internal because here we end up with flush_lsn exactly on + // Do not use craft_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. 
ensure_server_config(client)?; @@ -275,7 +275,7 @@ pub fn generate_last_wal_record_xlog_switch( pub fn generate_last_wal_record_xlog_switch_ends_on_page_boundary( client: &mut impl postgres::GenericClient, ) -> Result { - // Do not use generate_internal because here we end up with flush_lsn exactly on + // Do not use craft_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. ensure_server_config(client)?; @@ -336,11 +336,11 @@ pub fn generate_last_wal_record_xlog_switch_ends_on_page_boundary( Ok(next_segment) } -fn generate_single_logical_message( +fn craft_single_logical_message( client: &mut impl postgres::GenericClient, transactional: bool, ) -> Result { - generate_internal(client, |client, initial_lsn| { + craft_internal(client, |client, initial_lsn| { ensure!( initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), "Initial LSN is too far in the future" @@ -381,11 +381,11 @@ fn generate_single_logical_message( pub fn generate_wal_record_crossing_segment_followed_by_small_one( client: &mut impl postgres::GenericClient, ) -> Result { - generate_single_logical_message(client, true) + craft_single_logical_message(client, true) } pub fn generate_last_wal_record_crossing_segment( client: &mut C, ) -> Result { - generate_single_logical_message(client, false) + craft_single_logical_message(client, false) } diff --git a/test_runner/batch_others/test_crafted_wal_end.py b/test_runner/batch_others/test_crafted_wal_end.py index 945dfffe4f..d1c46fc73a 100644 --- a/test_runner/batch_others/test_crafted_wal_end.py +++ b/test_runner/batch_others/test_crafted_wal_end.py @@ -1,4 +1,4 @@ -from fixtures.neon_fixtures import NeonEnvBuilder, WalGenerate +from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft from fixtures.log_helper import log import pytest @@ -20,8 +20,8 @@ def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): env.neon_cli.create_branch('test_crafted_wal_end') pg = env.postgres.create('test_crafted_wal_end') - gen = WalGenerate(env) - pg.config(gen.postgres_config()) + wal_craft = WalCraft(env) + pg.config(wal_craft.postgres_config()) pg.start() res = pg.safe_psql_many(queries=[ 'CREATE TABLE keys(key int primary key)', @@ -30,7 +30,7 @@ def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): ]) assert res[-1][0] == (5050, ) - gen.in_existing(wal_type, pg.connstr()) + wal_craft.in_existing(wal_type, pg.connstr()) log.info("Restarting all safekeepers and pageservers") env.pageserver.stop() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d91ea398f9..e2bf7da79d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1263,13 +1263,13 @@ class NeonCli(AbstractNeonCli): return self.raw_cli(args, check_return_code=check_return_code) -class WalGenerate(AbstractNeonCli): +class WalCraft(AbstractNeonCli): """ - A typed wrapper around the `wal_generate` CLI tool. + A typed wrapper around the `wal_craft` CLI tool. Supports main commands via typed methods and a way to run arbitrary command directly via CLI. """ - COMMAND = 'wal_generate' + COMMAND = 'wal_craft' def postgres_config(self) -> List[str]: res = self.raw_cli(["print-postgres-config"]) From 0b5b2e8e0b9f789a834d4e9a78b2bc4debc17d26 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 7 Jul 2022 14:49:57 +0300 Subject: [PATCH 0494/1022] postgres_ffi/wal_craft: extract trait Crafter Make the intent of the code clearer. 
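As a rough sketch of the pattern this commit introduces (simplified and without the postgres client types; everything here other than Crafter, NAME and craft is a hypothetical stand-in):

    use anyhow::Result;

    pub struct FakeClient; // stand-in for the postgres client in this sketch

    pub trait Crafter {
        /// Name used to select the scenario on the command line.
        const NAME: &'static str;
        /// Produce the scenario and return the expected end-of-WAL position.
        fn craft(client: &mut FakeClient) -> Result<u64>;
    }

    pub struct Simple;
    impl Crafter for Simple {
        const NAME: &'static str = "simple";
        fn craft(_client: &mut FakeClient) -> Result<u64> {
            Ok(0x0200_0000)
        }
    }

    // Callers and tests can now name a scenario with a type parameter instead
    // of threading a free function around:
    fn run_scenario<C: Crafter>(client: &mut FakeClient) -> Result<()> {
        println!("running {}, end_of_wal = {:#x}", C::NAME, C::craft(client)?);
        Ok(())
    }

The real trait in the diff below takes an `impl postgres::GenericClient` and returns a `PgLsn`; each former `generate_*` function becomes a unit struct implementing it.
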
--- libs/postgres_ffi/src/xlog_utils.rs | 14 +- .../wal_craft/src/bin/wal_craft.rs | 26 +-- libs/postgres_ffi/wal_craft/src/lib.rs | 207 ++++++++++-------- 3 files changed, 130 insertions(+), 117 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index b707b10fc8..b7464da887 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -603,9 +603,8 @@ mod tests { .try_init(); } - fn test_end_of_wal( + fn test_end_of_wal( test_name: &str, - generate_wal: impl Fn(&mut postgres::Client) -> anyhow::Result, expected_end_of_wal_non_partial: Lsn, last_segment: &str, ) { @@ -624,7 +623,7 @@ mod tests { cfg.initdb().unwrap(); let mut srv = cfg.start_server().unwrap(); let expected_wal_end: Lsn = - u64::from(generate_wal(&mut srv.connect_with_timeout().unwrap()).unwrap()).into(); + u64::from(C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap()).into(); srv.kill(); // 2. Pick WAL generated by initdb @@ -681,9 +680,8 @@ mod tests { #[test] pub fn test_find_end_of_wal_simple() { init_logging(); - test_end_of_wal( + test_end_of_wal::( "test_find_end_of_wal_simple", - wal_craft::generate_simple, "0/2000000".parse::().unwrap(), "000000010000000000000001", ); @@ -692,9 +690,8 @@ mod tests { #[test] pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() { init_logging(); - test_end_of_wal( + test_end_of_wal::( "test_find_end_of_wal_crossing_segment_followed_by_small_one", - wal_craft::generate_wal_record_crossing_segment_followed_by_small_one, "0/3000000".parse::().unwrap(), "000000010000000000000002", ); @@ -704,9 +701,8 @@ mod tests { #[ignore = "not yet fixed, needs correct parsing of pre-last segments"] // TODO pub fn test_find_end_of_wal_last_crossing_segment() { init_logging(); - test_end_of_wal( + test_end_of_wal::( "test_find_end_of_wal_last_crossing_segment", - wal_craft::generate_last_wal_record_crossing_segment, "0/3000000".parse::().unwrap(), "000000010000000000000002", ); diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 8297ad4391..2c50bebdd2 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -10,11 +10,11 @@ fn main() -> Result<()> { .takes_value(true) .help("Type of WAL to craft") .possible_values([ - "simple", - "last_wal_record_xlog_switch", - "last_wal_record_xlog_switch_ends_on_page_boundary", - "last_wal_record_crossing_segment", - "wal_record_crossing_segment_followed_by_small_one", + Simple::NAME, + LastWalRecordXlogSwitch::NAME, + LastWalRecordXlogSwitchEndsOnPageBoundary::NAME, + WalRecordCrossingSegmentFollowedBySmallOne::NAME, + LastWalRecordCrossingSegment::NAME, ]) .required(true); let arg_matches = App::new("Postgres WAL crafter") @@ -56,17 +56,15 @@ fn main() -> Result<()> { let wal_craft = |arg_matches: &ArgMatches, client| { let lsn = match arg_matches.value_of("type").unwrap() { - "simple" => generate_simple(client)?, - "last_wal_record_xlog_switch" => generate_last_wal_record_xlog_switch(client)?, - "last_wal_record_xlog_switch_ends_on_page_boundary" => { - generate_last_wal_record_xlog_switch_ends_on_page_boundary(client)? + Simple::NAME => Simple::craft(client)?, + LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?, + LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => { + LastWalRecordXlogSwitchEndsOnPageBoundary::craft(client)? 
} - "last_wal_record_crossing_segment" => { - generate_last_wal_record_crossing_segment(client)? - } - "wal_record_crossing_segment_followed_by_small_one" => { - generate_wal_record_crossing_segment_followed_by_small_one(client)? + WalRecordCrossingSegmentFollowedBySmallOne::NAME => { + WalRecordCrossingSegmentFollowedBySmallOne::craft(client)? } + LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?, a => panic!("Unknown --type argument: {}", a), }; println!("end_of_wal = {}", lsn); diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index bd87ab2a19..52ecae59ff 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -218,6 +218,13 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result Ok(()) } +pub trait Crafter { + const NAME: &'static str; + + /// Generates WAL using the client `client`. Returns the expected end-of-wal LSN. + fn craft(client: &mut impl postgres::GenericClient) -> Result; +} + fn craft_internal( client: &mut C, f: impl Fn(&mut C, PgLsn) -> Result>, @@ -246,94 +253,102 @@ fn craft_internal( Ok(last_lsn) } -pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result { - craft_internal(client, |client, _| { - client.execute("CREATE table t(x int)", &[])?; - Ok(None) - }) -} - -pub fn generate_last_wal_record_xlog_switch( - client: &mut impl postgres::GenericClient, -) -> Result { - // Do not use craft_internal because here we end up with flush_lsn exactly on - // the segment boundary and insert_lsn after the initial page header, which is unusual. - ensure_server_config(client)?; - - client.execute("CREATE table t(x int)", &[])?; - let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); - let next_segment = PgLsn::from(0x0200_0000); - ensure!( - after_xlog_switch <= next_segment, - "XLOG_SWITCH message ended after the expected segment boundary: {} > {}", - after_xlog_switch, - next_segment - ); - Ok(next_segment) -} - -pub fn generate_last_wal_record_xlog_switch_ends_on_page_boundary( - client: &mut impl postgres::GenericClient, -) -> Result { - // Do not use craft_internal because here we end up with flush_lsn exactly on - // the segment boundary and insert_lsn after the initial page header, which is unusual. - ensure_server_config(client)?; - - client.execute("CREATE table t(x int)", &[])?; - - // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. - // We will use logical message as the padding. We start with detecting how much WAL - // it takes for one logical message, considering all alignments and headers. - let base_wal_advance = { - let before_lsn = client.pg_current_wal_insert_lsn()?; - // Small non-empty message bigger than few bytes is more likely than an empty - // message to have the same format as the big padding message. - client.execute( - "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))", - &[], - )?; - // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD. - (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize - + XLOG_SIZE_OF_XLOG_RECORD - }; - let mut remaining_lsn = - XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) 
as usize % XLOG_BLCKSZ; - if remaining_lsn < base_wal_advance { - remaining_lsn += XLOG_BLCKSZ; +pub struct Simple; +impl Crafter for Simple { + const NAME: &'static str = "simple"; + fn craft(client: &mut impl postgres::GenericClient) -> Result { + craft_internal(client, |client, _| { + client.execute("CREATE table t(x int)", &[])?; + Ok(None) + }) } - let repeats = 10 + remaining_lsn - base_wal_advance; - info!( - "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}", - client.pg_current_wal_insert_lsn()?, - remaining_lsn, - base_wal_advance, - repeats - ); - client.execute( - "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", - &[&(repeats as i32)], - )?; - info!( - "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", - client.pg_current_wal_insert_lsn()?, - XLOG_SIZE_OF_XLOG_RECORD - ); +} - // Emit the XLOG_SWITCH - let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); - let next_segment = PgLsn::from(0x0200_0000); - ensure!( - after_xlog_switch < next_segment, - "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}", - after_xlog_switch, - next_segment - ); - ensure!( - u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, - "XLOG_SWITCH message ended not on page boundary: {}", - after_xlog_switch - ); - Ok(next_segment) +pub struct LastWalRecordXlogSwitch; +impl Crafter for LastWalRecordXlogSwitch { + const NAME: &'static str = "last_wal_record_xlog_switch"; + fn craft(client: &mut impl postgres::GenericClient) -> Result { + // Do not use generate_internal because here we end up with flush_lsn exactly on + // the segment boundary and insert_lsn after the initial page header, which is unusual. + ensure_server_config(client)?; + + client.execute("CREATE table t(x int)", &[])?; + let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + let next_segment = PgLsn::from(0x0200_0000); + ensure!( + after_xlog_switch <= next_segment, + "XLOG_SWITCH message ended after the expected segment boundary: {} > {}", + after_xlog_switch, + next_segment + ); + Ok(next_segment) + } +} + +pub struct LastWalRecordXlogSwitchEndsOnPageBoundary; +impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { + const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary"; + fn craft(client: &mut impl postgres::GenericClient) -> Result { + // Do not use generate_internal because here we end up with flush_lsn exactly on + // the segment boundary and insert_lsn after the initial page header, which is unusual. + ensure_server_config(client)?; + + client.execute("CREATE table t(x int)", &[])?; + + // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. + // We will use logical message as the padding. We start with detecting how much WAL + // it takes for one logical message, considering all alignments and headers. + let base_wal_advance = { + let before_lsn = client.pg_current_wal_insert_lsn()?; + // Small non-empty message bigger than few bytes is more likely than an empty + // message to have the same format as the big padding message. + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))", + &[], + )?; + // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD. + (u64::from(client.pg_current_wal_insert_lsn()?) 
- u64::from(before_lsn)) as usize + + XLOG_SIZE_OF_XLOG_RECORD + }; + let mut remaining_lsn = + XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ; + if remaining_lsn < base_wal_advance { + remaining_lsn += XLOG_BLCKSZ; + } + let repeats = 10 + remaining_lsn - base_wal_advance; + info!( + "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}", + client.pg_current_wal_insert_lsn()?, + remaining_lsn, + base_wal_advance, + repeats + ); + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", + &[&(repeats as i32)], + )?; + info!( + "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", + client.pg_current_wal_insert_lsn()?, + XLOG_SIZE_OF_XLOG_RECORD + ); + + // Emit the XLOG_SWITCH + let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + let next_segment = PgLsn::from(0x0200_0000); + ensure!( + after_xlog_switch < next_segment, + "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}", + after_xlog_switch, + next_segment + ); + ensure!( + u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, + "XLOG_SWITCH message ended not on page boundary: {}", + after_xlog_switch + ); + Ok(next_segment) + } } fn craft_single_logical_message( @@ -378,14 +393,18 @@ fn craft_single_logical_message( }) } -pub fn generate_wal_record_crossing_segment_followed_by_small_one( - client: &mut impl postgres::GenericClient, -) -> Result { - craft_single_logical_message(client, true) +pub struct WalRecordCrossingSegmentFollowedBySmallOne; +impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { + const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one"; + fn craft(client: &mut impl postgres::GenericClient) -> Result { + craft_single_logical_message(client, true) + } } -pub fn generate_last_wal_record_crossing_segment( - client: &mut C, -) -> Result { - craft_single_logical_message(client, false) +pub struct LastWalRecordCrossingSegment; +impl Crafter for LastWalRecordCrossingSegment { + const NAME: &'static str = "last_wal_record_crossing_segment"; + fn craft(client: &mut impl postgres::GenericClient) -> Result { + craft_single_logical_message(client, false) + } } From f540f115a364a083700eb5f9e3968d308e3b5291 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 7 Jul 2022 16:53:12 +0300 Subject: [PATCH 0495/1022] postgres_ffi/wal_craft: simplify API --- libs/postgres_ffi/src/xlog_utils.rs | 2 +- libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs | 2 +- libs/postgres_ffi/wal_craft/src/lib.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index b7464da887..17891fb94f 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -621,7 +621,7 @@ mod tests { fs::remove_dir_all(&cfg.datadir).unwrap(); } cfg.initdb().unwrap(); - let mut srv = cfg.start_server().unwrap(); + let srv = cfg.start_server().unwrap(); let expected_wal_end: Lsn = u64::from(C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap()).into(); srv.kill(); diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 2c50bebdd2..13892538d0 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -85,7 +85,7 @@ fn main() -> Result<()> { datadir: arg_matches.value_of("datadir").unwrap().into(), }; 
cfg.initdb()?; - let mut srv = cfg.start_server()?; + let srv = cfg.start_server()?; wal_craft(arg_matches, &mut srv.connect_with_timeout()?)?; srv.kill(); Ok(()) diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 52ecae59ff..51482137c8 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -154,7 +154,7 @@ impl PostgresServer { bail!("Connection timed out"); } - pub fn kill(&mut self) { + pub fn kill(mut self) { self.process.kill().unwrap(); self.process.wait().unwrap(); } From 39d86ed29e9a2887e6dc339fbb0eeb6040c12cc8 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 8 Jul 2022 20:34:46 +0300 Subject: [PATCH 0496/1022] debug branch failure --- .../batch_others/test_tenant_relocation.py | 37 +++++++++++++++---- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index f6f2d8ca9d..73f6f52e72 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -14,6 +14,7 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient, + PageserverPort, PortDistributor, Postgres, assert_no_in_progress_downloads_for_tenant, @@ -115,14 +116,24 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve log.info('load thread stopped') -def populate_branch(pg: Postgres, create_table: bool, - expected_sum: Optional[int]) -> Tuple[UUID, int]: +def populate_branch( + pg: Postgres, + tenant_id: UUID, + ps_http: NeonPageserverHttpClient, + create_table: bool, + expected_sum: Optional[int], +) -> Tuple[UUID, int]: # insert some data with pg_cur(pg) as cur: cur.execute("SHOW neon.timeline_id") timeline_id = UUID(cur.fetchone()[0]) log.info("timeline to relocate %s", timeline_id.hex) + cur.execute("SELECT pg_current_wal_flush_lsn()") + log.info("pg_current_wal_flush_lsn() %s", lsn_from_hex(cur.fetchone()[0])) + log.info("timeline detail %s", + ps_http.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)) + # we rely upon autocommit after each statement # as waiting for acceptors happens there if create_table: @@ -237,6 +248,8 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, # first branch is used for load, compute for second one is used to # check that data is not lost + pageserver_http = env.pageserver.http_client() + tenant_id, initial_timeline_id = env.neon_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) @@ -244,7 +257,13 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, pg_main = env.postgres.create_start(branch_name='test_tenant_relocation_main', tenant_id=tenant_id) - timeline_id_main, current_lsn_main = populate_branch(pg_main, create_table=True, expected_sum=500500) + timeline_id_main, current_lsn_main = populate_branch( + pg_main, + tenant_id=tenant_id, + ps_http=pageserver_http, + create_table=True, + expected_sum=500500, + ) env.neon_cli.create_branch( new_branch_name="test_tenant_relocation_second", @@ -254,11 +273,13 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, pg_second = env.postgres.create_start(branch_name='test_tenant_relocation_second', tenant_id=tenant_id) - # do not select sum for second branch, this select will wait until wal reaches pageserver - # try to check another case when pageserver didnt receive that wal 
and needs to get it from safekeeper - timeline_id_second, current_lsn_second = populate_branch(pg_second, create_table=False, expected_sum=1001000) - - pageserver_http = env.pageserver.http_client() + timeline_id_second, current_lsn_second = populate_branch( + pg_second, + tenant_id=tenant_id, + ps_http=pageserver_http, + create_table=False, + expected_sum=1001000, + ) # wait until pageserver receives that data wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_main, current_lsn_main) From 21da9199fa448767771a09be7ecdaf0378463bca Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 7 Jul 2022 22:51:26 +0300 Subject: [PATCH 0497/1022] take Value by reference to avoid calling .clone --- pageserver/src/layered_repository.rs | 24 +++++++++---------- .../src/layered_repository/inmemory_layer.rs | 4 ++-- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/repository.rs | 24 +++++++++---------- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 392aa52844..cc8f8d6b68 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1640,7 +1640,7 @@ impl LayeredTimeline { Ok(layer) } - fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> { + fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { //info!("PUT: key {} at {}", key, lsn); let layer = self.get_layer_for_write(lsn)?; layer.put_value(key, lsn, val)?; @@ -2555,7 +2555,7 @@ impl Deref for LayeredTimelineWriter<'_> { } impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { - fn put(&self, key: Key, lsn: Lsn, value: Value) -> Result<()> { + fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> { self.tl.put_value(key, lsn, value) } @@ -2697,7 +2697,7 @@ pub mod tests { let TEST_KEY: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?; + writer.put(TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; writer.finish_write(Lsn(0x10)); drop(writer); @@ -2705,7 +2705,7 @@ pub mod tests { tline.compact()?; let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?; + writer.put(TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; writer.finish_write(Lsn(0x20)); drop(writer); @@ -2713,7 +2713,7 @@ pub mod tests { tline.compact()?; let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x30), Value::Image(TEST_IMG("foo at 0x30")))?; + writer.put(TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; writer.finish_write(Lsn(0x30)); drop(writer); @@ -2721,7 +2721,7 @@ pub mod tests { tline.compact()?; let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x40), Value::Image(TEST_IMG("foo at 0x40")))?; + writer.put(TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; writer.finish_write(Lsn(0x40)); drop(writer); @@ -2759,7 +2759,7 @@ pub mod tests { writer.put( test_key, lsn, - Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; writer.finish_write(lsn); drop(writer); @@ -2805,7 +2805,7 @@ pub mod tests { writer.put( test_key, lsn, - Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; writer.finish_write(lsn); updated[blknum] = lsn; @@ -2823,7 +2823,7 @@ pub mod tests { writer.put( test_key, lsn, - 
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; writer.finish_write(lsn); drop(writer); @@ -2875,7 +2875,7 @@ pub mod tests { writer.put( test_key, lsn, - Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; writer.finish_write(lsn); updated[blknum] = lsn; @@ -2899,7 +2899,7 @@ pub mod tests { writer.put( test_key, lsn, - Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; println!("updating {} at {}", blknum, lsn); writer.finish_write(lsn); @@ -2958,7 +2958,7 @@ pub mod tests { writer.put( test_key, lsn, - Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))), + &Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))), )?; println!("updating [{}][{}] at {}", idx, blknum, lsn); writer.finish_write(lsn); diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index bffb946f7e..87e6877520 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -267,13 +267,13 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - pub fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> { + pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timelineid, lsn); let mut inner = self.inner.write().unwrap(); inner.assert_writeable(); - let off = inner.file.write_blob(&Value::ser(&val)?)?; + let off = inner.file.write_blob(&Value::ser(val)?)?; let vec_map = inner.index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, off).unwrap().0; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 9dbae74074..23850169d8 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -912,7 +912,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { let pending_nblocks = self.pending_nblocks; for (key, value) in self.pending_updates { - writer.put(key, self.lsn, value)?; + writer.put(key, self.lsn, &value)?; } for key_range in self.pending_deletions { writer.delete(key_range.clone(), self.lsn)?; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 17e8806899..359c704e81 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -393,7 +393,7 @@ pub trait TimelineWriter<'a> { /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. 
- fn put(&self, key: Key, lsn: Lsn, value: Value) -> Result<()>; + fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()>; fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()>; @@ -603,12 +603,12 @@ mod tests { let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let writer = tline.writer(); - writer.put(*TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?; + writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; writer.finish_write(Lsn(0x10)); drop(writer); let writer = tline.writer(); - writer.put(*TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?; + writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; writer.finish_write(Lsn(0x20)); drop(writer); @@ -655,13 +655,13 @@ mod tests { let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap(); // Insert a value on the timeline - writer.put(TEST_KEY_A, Lsn(0x20), test_value("foo at 0x20"))?; - writer.put(TEST_KEY_B, Lsn(0x20), test_value("foobar at 0x20"))?; + writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?; + writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?; writer.finish_write(Lsn(0x20)); - writer.put(TEST_KEY_A, Lsn(0x30), test_value("foo at 0x30"))?; + writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?; writer.finish_write(Lsn(0x30)); - writer.put(TEST_KEY_A, Lsn(0x40), test_value("foo at 0x40"))?; + writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?; writer.finish_write(Lsn(0x40)); //assert_current_logical_size(&tline, Lsn(0x40)); @@ -672,7 +672,7 @@ mod tests { .get_timeline_load(NEW_TIMELINE_ID) .expect("Should have a local timeline"); let new_writer = newtline.writer(); - new_writer.put(TEST_KEY_A, Lsn(0x40), test_value("bar at 0x40"))?; + new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?; new_writer.finish_write(Lsn(0x40)); // Check page contents on both branches @@ -703,14 +703,14 @@ mod tests { writer.put( *TEST_KEY, lsn, - Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; writer.finish_write(lsn); lsn += 0x10; writer.put( *TEST_KEY, lsn, - Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; writer.finish_write(lsn); lsn += 0x10; @@ -721,14 +721,14 @@ mod tests { writer.put( *TEST_KEY, lsn, - Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; writer.finish_write(lsn); lsn += 0x10; writer.put( *TEST_KEY, lsn, - Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; writer.finish_write(lsn); } From 95452e605ab16a79aee3580a0147c31811ded32a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 8 Jul 2022 13:35:33 +0300 Subject: [PATCH 0498/1022] Optimize importing a physical backup Before this patch, importing a physical backup followed the same path as ingesting any WAL records: 1. All the data pages from the backup are first collected in the DatadirModification object. 2. Then, they are "committed" to the Repository. They are written to the in-memory layer 3. Finally, the in-memory layer is frozen, and flushed to disk as a L0 delta layer file. This was pretty inefficient. In step 1, the whole physical backup was held in memory. If the backup is large, you simply run out of memory. And in step 3, the resulting L0 delta layer file is large, holding all the data again. 
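As a rough, hypothetical sketch (simplified types; the batch threshold is only indicative), this contrasts the buffer-everything path described above with the incremental flushing introduced by the changes below:

    use std::collections::HashMap;

    type Key = u128;
    type PageImage = Vec<u8>;

    struct Modification {
        pending: HashMap<Key, PageImage>,
    }

    impl Modification {
        // Pre-patch behaviour: every page of the backup stays buffered until a
        // single final commit, which then becomes one large L0 delta layer.
        fn put(&mut self, key: Key, img: PageImage) {
            self.pending.insert(key, img);
        }

        // Post-patch idea: data pages are flushed to the repository in batches,
        // so memory stays bounded and no single huge layer file is produced.
        fn put_and_maybe_flush(&mut self, key: Key, img: PageImage) -> std::io::Result<()> {
            self.pending.insert(key, img);
            if self.pending.len() >= 10_000 {
                self.flush()?;
            }
            Ok(())
        }

        fn flush(&mut self) -> std::io::Result<()> {
            // stand-in for writing the buffered data pages into the repository
            self.pending.clear();
            Ok(())
        }
    }
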
That's a problem if the backup is larger than 5 GB: Amazon S3 doesn't allow uploading files larger than 5 GB (without using multi-part upload, see github issue #1910). So we want to avoid that. To alleviate those problems, optimize the codepath for importing a physical backup. The basic flow is the same as before, but step 1 is optimized so that it doesn't accumulate all the data in memory, and step 3 writes the data in image layers instead of one large delta layer. --- pageserver/src/import_datadir.rs | 2 + pageserver/src/layered_repository.rs | 171 ++++++++++++++++++--------- pageserver/src/pgdatadir_mapping.rs | 61 ++++++++++ 3 files changed, 175 insertions(+), 59 deletions(-) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 1a9aa78d8c..f8a41e5b2b 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -57,6 +57,7 @@ pub fn import_timeline_from_postgres_datadir( if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? { pg_control = Some(control_file); } + modification.flush()?; } } @@ -317,6 +318,7 @@ pub fn import_basebackup_from_tar( // We found the pg_control file. pg_control = Some(res); } + modification.flush()?; } tar::EntryType::Directory => { debug!("directory {:?}", file_path); diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index cc8f8d6b68..e977329822 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -34,7 +34,7 @@ use std::time::{Duration, Instant, SystemTime}; use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; use crate::config::PageServerConf; -use crate::keyspace::KeySpace; +use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; @@ -1768,24 +1768,29 @@ impl LayeredTimeline { /// Flush one frozen in-memory layer to disk, as a new delta layer. fn flush_frozen_layer(&self, frozen_layer: Arc) -> Result<()> { - let new_delta = frozen_layer.write_to_disk()?; - let new_delta_path = new_delta.path(); + let layer_paths_to_upload; + + // As a special case, when we have just imported an image into the repository, + // instead of writing out a L0 delta layer, we directly write out image layer + // files instead. This is possible as long as *all* the data imported into the + // repository have the same LSN. + let lsn_range = frozen_layer.get_lsn_range(); + if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { + let pgdir = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)?; + let (partitioning, _lsn) = + pgdir.repartition(self.initdb_lsn, self.get_compaction_target_size())?; + layer_paths_to_upload = + self.create_image_layers(&partitioning, self.initdb_lsn, true)?; + } else { + // normal case, write out a L0 delta layer file. + let delta_path = self.create_delta_layer(&frozen_layer)?; + layer_paths_to_upload = HashSet::from([delta_path]); + } - // Sync the new layer to disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // TODO: If we're running inside 'flush_frozen_layers' and there are multiple - // files to flush, it might be better to first write them all, and then fsync - // them all in parallel. 
- par_fsync::par_fsync(&[ - new_delta_path.clone(), - self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - ])?; fail_point!("flush-frozen-before-sync"); - // Finally, replace the frozen in-memory layer with the new on-disk layer + // The new on-disk layers are now in the layer map. We can remove the + // in-memory layer from the map now. { let mut layers = self.layers.write().unwrap(); let l = layers.frozen_layers.pop_front(); @@ -1795,19 +1800,27 @@ impl LayeredTimeline { // layer to disk at the same time, that would not work. assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer)); - // Add the new delta layer to the LayerMap - layers.insert_historic(Arc::new(new_delta)); - // release lock on 'layers' } + fail_point!("checkpoint-after-sync"); + // Update the metadata file, with new 'disk_consistent_lsn' // // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing // *all* the layers, to avoid fsyncing the file multiple times. - let disk_consistent_lsn = Lsn(frozen_layer.get_lsn_range().end.0 - 1); - fail_point!("checkpoint-after-sync"); + let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); + self.update_disk_consistent_lsn(disk_consistent_lsn, layer_paths_to_upload)?; + Ok(()) + } + + /// Update metadata file + fn update_disk_consistent_lsn( + &self, + disk_consistent_lsn: Lsn, + layer_paths_to_upload: HashSet, + ) -> Result<()> { // If we were able to advance 'disk_consistent_lsn', save it the metadata file. // After crash, we will restart WAL streaming and processing from that point. let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); @@ -1857,14 +1870,11 @@ impl LayeredTimeline { false, )?; - NUM_PERSISTENT_FILES_CREATED.inc_by(1); - PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); - if self.upload_layers.load(atomic::Ordering::Relaxed) { storage_sync::schedule_layer_upload( self.tenant_id, self.timeline_id, - HashSet::from([new_delta_path]), + layer_paths_to_upload, Some(metadata), ); } @@ -1876,6 +1886,37 @@ impl LayeredTimeline { Ok(()) } + // Write out the given frozen in-memory layer as a new L0 delta file + fn create_delta_layer(&self, frozen_layer: &InMemoryLayer) -> Result { + // Write it out + let new_delta = frozen_layer.write_to_disk()?; + let new_delta_path = new_delta.path(); + + // Sync it to disk. + // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // TODO: If we're running inside 'flush_frozen_layers' and there are multiple + // files to flush, it might be better to first write them all, and then fsync + // them all in parallel. + par_fsync::par_fsync(&[ + new_delta_path.clone(), + self.conf.timeline_path(&self.timeline_id, &self.tenant_id), + ])?; + + // Add it to the layer map + { + let mut layers = self.layers.write().unwrap(); + layers.insert_historic(Arc::new(new_delta)); + } + + NUM_PERSISTENT_FILES_CREATED.inc_by(1); + PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); + + Ok(new_delta_path) + } + pub fn compact(&self) -> Result<()> { // // High level strategy for compaction / image creation: @@ -1919,29 +1960,23 @@ impl LayeredTimeline { if let Ok(pgdir) = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) { + // 2. Create new image layers for partitions that have been modified + // "enough". let (partitioning, lsn) = pgdir.repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), )?; - let timer = self.create_images_time_histo.start_timer(); - // 2. 
Create new image layers for partitions that have been modified - // "enough". - let mut layer_paths_to_upload = HashSet::with_capacity(partitioning.parts.len()); - for part in partitioning.parts.iter() { - if self.time_for_new_image_layer(part, lsn)? { - let new_path = self.create_image_layer(part, lsn)?; - layer_paths_to_upload.insert(new_path); - } - } - if self.upload_layers.load(atomic::Ordering::Relaxed) { + let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; + if !layer_paths_to_upload.is_empty() + && self.upload_layers.load(atomic::Ordering::Relaxed) + { storage_sync::schedule_layer_upload( self.tenant_id, self.timeline_id, - layer_paths_to_upload, + HashSet::from_iter(layer_paths_to_upload), None, ); } - timer.stop_and_record(); // 3. Compact let timer = self.compact_time_histo.start_timer(); @@ -1995,21 +2030,40 @@ impl LayeredTimeline { Ok(false) } - fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result { - let img_range = - partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; - let mut image_layer_writer = - ImageLayerWriter::new(self.conf, self.timeline_id, self.tenant_id, &img_range, lsn)?; + fn create_image_layers( + &self, + partitioning: &KeyPartitioning, + lsn: Lsn, + force: bool, + ) -> Result> { + let timer = self.create_images_time_histo.start_timer(); + let mut image_layers: Vec = Vec::new(); + let mut layer_paths_to_upload = HashSet::new(); + for partition in partitioning.parts.iter() { + if force || self.time_for_new_image_layer(partition, lsn)? { + let img_range = + partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_id, + &img_range, + lsn, + )?; - for range in &partition.ranges { - let mut key = range.start; - while key < range.end { - let img = self.get(key, lsn)?; - image_layer_writer.put_image(key, &img)?; - key = key.next(); + for range in &partition.ranges { + let mut key = range.start; + while key < range.end { + let img = self.get(key, lsn)?; + image_layer_writer.put_image(key, &img)?; + key = key.next(); + } + } + let image_layer = image_layer_writer.finish()?; + layer_paths_to_upload.insert(image_layer.path()); + image_layers.push(image_layer); } } - let image_layer = image_layer_writer.finish()?; // Sync the new layer to disk before adding it to the layer map, to make sure // we don't garbage collect something based on the new layer, before it has @@ -2020,19 +2074,18 @@ impl LayeredTimeline { // // Compaction creates multiple image layers. It would be better to create them all // and fsync them all in parallel. - par_fsync::par_fsync(&[ - image_layer.path(), - self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - ])?; - - // FIXME: Do we need to do something to upload it to remote storage here? 
+ let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone()); + all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); + par_fsync::par_fsync(&all_paths)?; let mut layers = self.layers.write().unwrap(); - let new_path = image_layer.path(); - layers.insert_historic(Arc::new(image_layer)); + for l in image_layers { + layers.insert_historic(Arc::new(l)); + } drop(layers); + timer.stop_and_record(); - Ok(new_path) + Ok(layer_paths_to_upload) } /// diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 23850169d8..f696c1f411 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -902,6 +902,57 @@ impl<'a, R: Repository> DatadirModification<'a, R> { Ok(()) } + /// + /// Flush changes accumulated so far to the underlying repository. + /// + /// Usually, changes made in DatadirModification are atomic, but this allows + /// you to flush them to the underlying repository before the final `commit`. + /// That allows to free up the memory used to hold the pending changes. + /// + /// Currently only used during bulk import of a data directory. In that + /// context, breaking the atomicity is OK. If the import is interrupted, the + /// whole import fails and the timeline will be deleted anyway. + /// (Or to be precise, it will be left behind for debugging purposes and + /// ignored, see https://github.com/neondatabase/neon/pull/1809) + /// + /// Note: A consequence of flushing the pending operations is that they + /// won't be visible to subsequent operations until `commit`. The function + /// retains all the metadata, but data pages are flushed. That's again OK + /// for bulk import, where you are just loading data pages and won't try to + /// modify the same pages twice. + pub fn flush(&mut self) -> Result<()> { + // Unless we have accumulated a decent amount of changes, it's not worth it + // to scan through the pending_updates list. + let pending_nblocks = self.pending_nblocks; + if pending_nblocks < 10000 { + return Ok(()); + } + + let writer = self.tline.tline.writer(); + + // Flush relation and SLRU data blocks, keep metadata. + let mut result: Result<()> = Ok(()); + self.pending_updates.retain(|&key, value| { + if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) { + result = writer.put(key, self.lsn, value); + false + } else { + true + } + }); + result?; + + if pending_nblocks != 0 { + self.tline.current_logical_size.fetch_add( + pending_nblocks * pg_constants::BLCKSZ as isize, + Ordering::SeqCst, + ); + self.pending_nblocks = 0; + } + + Ok(()) + } + /// /// Finish this atomic update, writing all the updated keys to the /// underlying timeline. @@ -1317,6 +1368,10 @@ pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> { }) } +fn is_rel_block_key(key: Key) -> bool { + key.field1 == 0x00 && key.field4 != 0 +} + pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { Ok(match key.field1 { 0x01 => { @@ -1335,6 +1390,12 @@ pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { }) } +fn is_slru_block_key(key: Key) -> bool { + key.field1 == 0x01 // SLRU-related + && key.field3 == 0x00000001 // but not SlruDir + && key.field6 != 0xffffffff // and not SlruSegSize +} + // //-- Tests that should work the same with any Repository/Timeline implementation. 
// From 5cf597044d88c6280dd1d4e5c1e8340c4229a755 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Mon, 11 Jul 2022 10:31:14 -0400 Subject: [PATCH 0499/1022] Allow prev_lsn hint for fullbackup (#2052) --- pageserver/src/basebackup.rs | 15 +++++++-- pageserver/src/page_service.rs | 58 +++++++++++++++++++++++++++------- 2 files changed, 60 insertions(+), 13 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index ed300b3360..3ec1ec9243 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -60,6 +60,7 @@ where write: W, timeline: &'a Arc, req_lsn: Option, + prev_lsn: Option, full_backup: bool, ) -> Result> { // Compute postgres doesn't have any previous WAL files, but the first @@ -96,16 +97,26 @@ where (end_of_timeline.prev, end_of_timeline.last) }; + // Consolidate the derived and the provided prev_lsn values + let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { + if backup_prev != Lsn(0) { + ensure!(backup_prev == provided_prev_lsn) + } + provided_prev_lsn + } else { + backup_prev + }; + info!( "taking basebackup lsn={}, prev_lsn={} (full_backup={})", - backup_lsn, backup_prev, full_backup + backup_lsn, prev_lsn, full_backup ); Ok(Basebackup { ar: Builder::new(AbortableWrite::new(write)), timeline, lsn: backup_lsn, - prev_record_lsn: backup_prev, + prev_record_lsn: prev_lsn, full_backup, finished: false, }) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 973a631d23..078edc5c9f 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -772,6 +772,7 @@ impl PageServerHandler { pgb: &mut PostgresBackend, timelineid: ZTimelineId, lsn: Option, + prev_lsn: Option, tenantid: ZTenantId, full_backup: bool, ) -> anyhow::Result<()> { @@ -796,7 +797,8 @@ impl PageServerHandler { { let mut writer = CopyDataSink { pgb }; - let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn, full_backup)?; + let basebackup = + basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?; span.record("lsn", &basebackup.lsn.to_string().as_str()); basebackup.send_tarball()?; } @@ -899,33 +901,67 @@ impl postgres_backend::Handler for PageServerHandler { }; // Check that the timeline exists - self.handle_basebackup_request(pgb, timelineid, lsn, tenantid, false)?; + self.handle_basebackup_request(pgb, timelineid, lsn, None, tenantid, false)?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } + // return pair of prev_lsn and last_lsn + else if query_string.starts_with("get_last_record_rlsn ") { + let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len()); + let params = params_raw.split_whitespace().collect::>(); + + ensure!( + params.len() == 2, + "invalid param number for get_last_record_rlsn command" + ); + + let tenantid = ZTenantId::from_str(params[0])?; + let timelineid = ZTimelineId::from_str(params[1])?; + + self.check_permission(Some(tenantid))?; + let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) + .context("Cannot load local timeline")?; + + let end_of_timeline = timeline.tline.get_last_record_rlsn(); + + pgb.write_message_noflush(&BeMessage::RowDescription(&[ + RowDescriptor::text_col(b"prev_lsn"), + RowDescriptor::text_col(b"last_lsn"), + ]))? + .write_message_noflush(&BeMessage::DataRow(&[ + Some(end_of_timeline.prev.to_string().as_bytes()), + Some(end_of_timeline.last.to_string().as_bytes()), + ]))? 
+ .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + } // same as basebackup, but result includes relational data as well else if query_string.starts_with("fullbackup ") { let (_, params_raw) = query_string.split_at("fullbackup ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!( - params.len() == 3, + params.len() >= 2, "invalid param number for fullbackup command" ); let tenantid = ZTenantId::from_str(params[0])?; let timelineid = ZTimelineId::from_str(params[1])?; + // The caller is responsible for providing correct lsn and prev_lsn. + let lsn = if params.len() > 2 { + Some(Lsn::from_str(params[2])?) + } else { + None + }; + let prev_lsn = if params.len() > 3 { + Some(Lsn::from_str(params[3])?) + } else { + None + }; + self.check_permission(Some(tenantid))?; - // Lsn is required for fullbackup, because otherwise we would not know - // at which lsn to upload this backup. - // - // The caller is responsible for providing a valid lsn - // and using it in the subsequent import. - let lsn = Some(Lsn::from_str(params[2])?); - // Check that the timeline exists - self.handle_basebackup_request(pgb, timelineid, lsn, tenantid, true)?; + self.handle_basebackup_request(pgb, timelineid, lsn, prev_lsn, tenantid, true)?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("import basebackup ") { // Import the `base` section (everything but the wal) of a basebackup. From 5cf94a58484c22baea32fa88aaf9b2f4e763e882 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 12 Jul 2022 22:01:44 +0300 Subject: [PATCH 0500/1022] Add test for cascade/flat branching (#1569) --- test_runner/batch_others/test_branching.py | 78 ++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 test_runner/batch_others/test_branching.py diff --git a/test_runner/batch_others/test_branching.py b/test_runner/batch_others/test_branching.py new file mode 100644 index 0000000000..957d9a33b3 --- /dev/null +++ b/test_runner/batch_others/test_branching.py @@ -0,0 +1,78 @@ +from typing import List +import threading +import pytest +from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres +import time +import random +from fixtures.log_helper import log +from performance.test_perf_pgbench import get_scales_matrix + + +# Test branch creation +# +# This test spawns pgbench in a thread in the background, and creates a branch while +# pgbench is running. Then it launches pgbench on the new branch, and creates another branch. +# Repeat `n_branches` times. +# +# If 'ty' == 'cascade', each branch is created from the previous branch, so that you end +# up with a branch of a branch of a branch ... of a branch. With 'ty' == 'flat', +# each branch is created from the root. 
+@pytest.mark.parametrize("n_branches", [10]) +@pytest.mark.parametrize("scale", get_scales_matrix(1)) +@pytest.mark.parametrize("ty", ["cascade", "flat"]) +def test_branching_with_pgbench(neon_simple_env: NeonEnv, + pg_bin: PgBin, + n_branches: int, + scale: int, + ty: str): + env = neon_simple_env + + # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test + tenant, _ = env.neon_cli.create_tenant( + conf={ + 'gc_period': '5 s', + 'gc_horizon': f'{1024 ** 2}', + 'checkpoint_distance': f'{1024 ** 2}', + 'compaction_target_size': f'{1024 ** 2}', + # set PITR interval to be small, so we can do GC + 'pitr_interval': '5 s' + }) + + def run_pgbench(pg: Postgres): + connstr = pg.connstr() + + log.info(f"Start a pgbench workload on pg {connstr}") + + pg_bin.run_capture(['pgbench', '-i', f'-s{scale}', connstr]) + pg_bin.run_capture(['pgbench', '-c10', '-T15', connstr]) + + env.neon_cli.create_branch('b0', tenant_id=tenant) + pgs: List[Postgres] = [] + pgs.append(env.postgres.create_start('b0', tenant_id=tenant)) + + threads: List[threading.Thread] = [] + threads.append(threading.Thread(target=run_pgbench, args=(pgs[0], ), daemon=True)) + threads[-1].start() + + for i in range(n_branches): + # random a delay between [0, 5] + delay = random.random() * 5 + time.sleep(delay) + log.info(f"Sleep {delay}s") + + if ty == "cascade": + env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(i), tenant_id=tenant) + else: + env.neon_cli.create_branch('b{}'.format(i + 1), 'b0', tenant_id=tenant) + + pgs.append(env.postgres.create_start('b{}'.format(i + 1), tenant_id=tenant)) + + threads.append(threading.Thread(target=run_pgbench, args=(pgs[-1], ), daemon=True)) + threads[-1].start() + + for thread in threads: + thread.join() + + for pg in pgs: + res = pg.safe_psql('SELECT count(*) from pgbench_accounts') + assert res[0] == (100000 * scale, ) From 7f048abf3b6869517e20f744a27d159fc95f9c2c Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Tue, 12 Jul 2022 15:04:40 -0400 Subject: [PATCH 0501/1022] Add `close_fds` for `initdb` command and add close fd test (#2060) This PR adds a test for https://github.com/neondatabase/neon/pull/1834 and fixes the error in https://app.circleci.com/pipelines/github/neondatabase/neon/7753/workflows/94d1b796-10a3-4989-b23c-4c1eb4a49cf5/jobs/79586, which happens because `pageserver.pid` is held by `initdb` command on restart. Because the test requires `lsof` to be installed in the docker image, this PR also updates the caches and docker image specified in CircleCI config file. 
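
(Illustrative note, not part of the patch.) The failure mode here is plain file-descriptor inheritance: a child process spawned without closing the parent's descriptors keeps `pageserver.pid` open even after the parent lets go of it, so `lsof` still reports the file as held by the child. A minimal Python sketch of that mechanism, with hypothetical paths and a `sleep` child standing in for `initdb`:

    import os
    import subprocess
    import tempfile

    with tempfile.TemporaryDirectory() as d:
        pid_path = os.path.join(d, "pageserver.pid")
        pid_file = open(pid_path, "w")      # parent holds the pid file open
        pid_file.write(str(os.getpid()))
        pid_file.flush()

        # Make the descriptor inheritable and keep it open across exec; this is the
        # situation the `.close_fds()` call prevents for the spawned `initdb`.
        os.set_inheritable(pid_file.fileno(), True)
        child = subprocess.Popen(["sleep", "5"], close_fds=False)

        pid_file.close()                    # parent releases the file...
        held = subprocess.run(["lsof", pid_path], capture_output=True, text=True)
        print(held.stdout)                  # ...but lsof can still list the child

        child.terminate()
        child.wait()
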
--- .circleci/config.yml | 21 ++++----- pageserver/src/walredo.rs | 1 + test_runner/batch_others/test_close_fds.py | 51 ++++++++++++++++++++++ 3 files changed, 60 insertions(+), 13 deletions(-) create mode 100644 test_runner/batch_others/test_close_fds.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 5370e46663..941849bb0e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,10 +5,10 @@ executors: resource_class: xlarge docker: # NB: when changed, do not forget to update rust image tag in all Dockerfiles - - image: zimg/rust:1.58 + - image: neondatabase/rust:1.58 neon-executor: docker: - - image: zimg/rust:1.58 + - image: neondatabase/rust:1.58 jobs: # A job to build postgres @@ -37,7 +37,7 @@ jobs: name: Restore postgres cache keys: # Restore ONLY if the rev key matches exactly - - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} + - v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} # Build postgres if the restore_cache didn't find a build. # `make` can't figure out whether the cache is valid, since @@ -54,7 +54,7 @@ jobs: - save_cache: name: Save postgres cache - key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} + key: v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} paths: - tmp_install @@ -85,7 +85,7 @@ jobs: name: Restore postgres cache keys: # Restore ONLY if the rev key matches exactly - - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} + - v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - restore_cache: name: Restore rust cache @@ -93,7 +93,7 @@ jobs: # Require an exact match. While an out of date cache might speed up the build, # there's no way to clean out old packages, so the cache grows every time something # changes. - - v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} + - v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} # Build the rust code, including test binaries - run: @@ -107,7 +107,7 @@ jobs: export CARGO_INCREMENTAL=0 export CACHEPOT_BUCKET=zenith-rust-cachepot - export RUSTC_WRAPPER=cachepot + export RUSTC_WRAPPER="" export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests @@ -115,7 +115,7 @@ jobs: - save_cache: name: Save rust cache - key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} + key: v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} paths: - ~/.cargo/registry - ~/.cargo/git @@ -142,11 +142,6 @@ jobs: jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' ) - test_exe_paths=$( - cargo test --message-format=json --no-run | - jq -r '.executable | select(. 
!= null)' - ) - mkdir -p /tmp/zenith/bin mkdir -p /tmp/zenith/test_bin mkdir -p /tmp/zenith/etc diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index cad211b1bd..db4620417c 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -623,6 +623,7 @@ impl PostgresRedoProcess { .env_clear() .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .close_fds() .output() .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {}", e)))?; diff --git a/test_runner/batch_others/test_close_fds.py b/test_runner/batch_others/test_close_fds.py new file mode 100644 index 0000000000..9521b1bb4a --- /dev/null +++ b/test_runner/batch_others/test_close_fds.py @@ -0,0 +1,51 @@ +from contextlib import closing +import shutil +import time +import subprocess +import os.path + +from cached_property import threading +from fixtures.neon_fixtures import NeonEnv +from fixtures.log_helper import log + + +def lsof_path() -> str: + path_output = shutil.which("lsof") + if path_output is None: + raise RuntimeError('lsof not found in PATH') + else: + return path_output + + +# Makes sure that `pageserver.pid` is only held by `pageserve` command, not other commands. +# This is to test the changes in https://github.com/neondatabase/neon/pull/1834. +def test_lsof_pageserver_pid(neon_simple_env: NeonEnv): + env = neon_simple_env + + def start_workload(): + env.neon_cli.create_branch("test_lsof_pageserver_pid") + pg = env.postgres.create_start("test_lsof_pageserver_pid") + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE foo as SELECT x FROM generate_series(1,100000) x") + cur.execute("update foo set x=x+1") + + workload_thread = threading.Thread(target=start_workload, args=(), daemon=True) + workload_thread.start() + + path = os.path.join(env.repo_dir, "pageserver.pid") + lsof = lsof_path() + while workload_thread.is_alive(): + res = subprocess.run([lsof, path], + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + # parse the `lsof` command's output to get only the list of commands + commands = [line.split(' ')[0] for line in res.stdout.strip().split('\n')[1:]] + if len(commands) > 0: + log.info(f"lsof commands: {commands}") + assert commands == ['pageserve'] + + time.sleep(1.0) From 7c041d9939ae647a72f77afa42a3e3c91be14932 Mon Sep 17 00:00:00 2001 From: dhammika Date: Tue, 12 Jul 2022 12:53:22 -0700 Subject: [PATCH 0502/1022] Add a test for gc dropping active layers (#707) (#1484) This PR adds `test_branch_and_gc` test that reproduces https://github.com/neondatabase/neon/issues/707. It tests GC when running with branching. Co-authored-by: Thang Pham --- .../batch_others/test_branch_and_gc.py | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 test_runner/batch_others/test_branch_and_gc.py diff --git a/test_runner/batch_others/test_branch_and_gc.py b/test_runner/batch_others/test_branch_and_gc.py new file mode 100644 index 0000000000..a6210b9176 --- /dev/null +++ b/test_runner/batch_others/test_branch_and_gc.py @@ -0,0 +1,101 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import lsn_from_hex + + +# Test the GC implementation when running with branching. +# This test reproduces the issue https://github.com/neondatabase/neon/issues/707. +# +# Consider two LSNs `lsn1` and `lsn2` with some delta files as follows: +# ... +# p -> has an image layer xx_p with p < lsn1 +# ... 
+# lsn1 +# ... +# q -> has an image layer yy_q with lsn1 < q < lsn2 +# ... +# lsn2 +# +# Consider running a GC iteration such that the GC horizon is between p and lsn1 +# ... +# p -> has an image layer xx_p with p < lsn1 +# D_start -> is a delta layer D's start (e.g D = '...-...-D_start-D_end') +# ... +# GC_h -> is a gc horizon such that p < GC_h < lsn1 +# ... +# lsn1 +# ... +# D_end -> is a delta layer D's end +# ... +# q -> has an image layer yy_q with lsn1 < q < lsn2 +# ... +# lsn2 +# +# As described in the issue #707, the image layer xx_p will be deleted as +# its range is below the GC horizon and there exists a newer image layer yy_q (q > p). +# However, removing xx_p will corrupt any delta layers that depend on xx_p that +# are not deleted by GC. For example, the delta layer D is corrupted in the +# above example because D depends on the image layer xx_p for value reconstruction. +# +# Because the delta layer D covering lsn1 is corrupted, creating a branch +# starting from lsn1 should return an error as follows: +# could not find data for key ... at LSN ..., for request at LSN ... +def test_branch_and_gc(neon_simple_env: NeonEnv): + env = neon_simple_env + + tenant, _ = env.neon_cli.create_tenant( + conf={ + # disable background GC + 'gc_period': '10 m', + 'gc_horizon': f'{10 * 1024 ** 3}', + + # small checkpoint distance to create more delta layer files + 'checkpoint_distance': f'{1024 ** 2}', + + # set the target size to be large to allow the image layer to cover the whole key space + 'compaction_target_size': f'{1024 ** 3}', + + # tweak the default settings to allow quickly create image layers and L1 layers + 'compaction_period': '1 s', + 'compaction_threshold': '2', + 'image_creation_threshold': '1', + + # set PITR interval to be small, so we can do GC + 'pitr_interval': '1 s' + }) + + timeline_main = env.neon_cli.create_timeline(f'test_main', tenant_id=tenant) + pg_main = env.postgres.create_start('test_main', tenant_id=tenant) + + main_cur = pg_main.connect().cursor() + + main_cur.execute( + "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')" + ) + main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') + main_cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn1 = main_cur.fetchone()[0] + log.info(f'LSN1: {lsn1}') + + main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') + main_cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn2 = main_cur.fetchone()[0] + log.info(f'LSN2: {lsn2}') + + # Set the GC horizon so that lsn1 is inside the horizon, which means + # we can create a new branch starting from lsn1. 
+ env.pageserver.safe_psql( + f'''do_gc {tenant.hex} {timeline_main.hex} {lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024}''' + ) + + env.neon_cli.create_branch('test_branch', + 'test_main', + tenant_id=tenant, + ancestor_start_lsn=lsn1) + pg_branch = env.postgres.create_start('test_branch', tenant_id=tenant) + + branch_cur = pg_branch.connect().cursor() + branch_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') + + branch_cur.execute('SELECT count(*) FROM foo') + assert branch_cur.fetchone() == (200000, ) From 61cc562822eaf5f8f51c231d6c65dcbd42a6f44d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 13 Jul 2022 09:18:11 +0100 Subject: [PATCH 0503/1022] Make POSTGRES_INSTALL_DIR configurable for build (#2067) --- Makefile | 39 ++++++++++++++++------------- libs/postgres_ffi/build.rs | 50 +++++++++++++++++++++++++++++++------- 2 files changed, 63 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 50e2c8ab7f..566f2ecb10 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,8 @@ +ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) + +# Where to install Postgres, default is ./tmp_install, maybe useful for package managers +POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/tmp_install + # Seccomp BPF is only available for Linux UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Linux) @@ -55,55 +60,55 @@ zenith: postgres-headers $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) ### PostgreSQL parts -tmp_install/build/config.status: +$(POSTGRES_INSTALL_DIR)/build/config.status: +@echo "Configuring postgres build" - mkdir -p tmp_install/build - (cd tmp_install/build && \ - ../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \ + mkdir -p $(POSTGRES_INSTALL_DIR)/build + (cd $(POSTGRES_INSTALL_DIR)/build && \ + $(ROOT_PROJECT_DIR)/vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \ $(PG_CONFIGURE_OPTS) \ $(SECCOMP) \ - --prefix=$(abspath tmp_install) > configure.log) + --prefix=$(abspath $(POSTGRES_INSTALL_DIR)) > configure.log) # nicer alias for running 'configure' .PHONY: postgres-configure -postgres-configure: tmp_install/build/config.status +postgres-configure: $(POSTGRES_INSTALL_DIR)/build/config.status -# Install the PostgreSQL header files into tmp_install/include +# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/include .PHONY: postgres-headers postgres-headers: postgres-configure +@echo "Installing PostgreSQL headers" - $(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/include MAKELEVEL=0 install # Compile and install PostgreSQL and contrib/neon .PHONY: postgres postgres: postgres-configure \ postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers` +@echo "Compiling PostgreSQL" - $(MAKE) -C tmp_install/build MAKELEVEL=0 install + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install +@echo "Compiling contrib/neon" - $(MAKE) -C tmp_install/build/contrib/neon install + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon install +@echo "Compiling contrib/neon_test_utils" - $(MAKE) -C tmp_install/build/contrib/neon_test_utils install + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon_test_utils install +@echo "Compiling pg_buffercache" - $(MAKE) -C tmp_install/build/contrib/pg_buffercache install + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install +@echo "Compiling pageinspect" - $(MAKE) -C tmp_install/build/contrib/pageinspect install + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install 
.PHONY: postgres-clean postgres-clean: - $(MAKE) -C tmp_install/build MAKELEVEL=0 clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean # This doesn't remove the effects of 'configure'. .PHONY: clean clean: - cd tmp_install/build && $(MAKE) clean + cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean $(CARGO_CMD_PREFIX) cargo clean # This removes everything .PHONY: distclean distclean: - rm -rf tmp_install + rm -rf $(POSTGRES_INSTALL_DIR) $(CARGO_CMD_PREFIX) cargo clean .PHONY: fmt @@ -112,4 +117,4 @@ fmt: .PHONY: setup-pre-commit-hook setup-pre-commit-hook: - ln -s -f ../../pre-commit.py .git/hooks/pre-commit + ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 0043b9ab58..703d972480 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -2,6 +2,7 @@ extern crate bindgen; use std::env; use std::path::PathBuf; +use std::process::Command; use bindgen::callbacks::ParseCallbacks; @@ -45,6 +46,45 @@ fn main() { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=pg_control_ffi.h"); + // Finding the location of C headers for the Postgres server: + // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/tmp_install` + // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/tmp_install/include/postgresql/server` + let mut pg_install_dir: PathBuf; + let inc_server_path: String; + + if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { + pg_install_dir = postgres_install_dir.into(); + } else { + pg_install_dir = PathBuf::from("tmp_install") + } + + if pg_install_dir.is_relative() { + let cwd = env::current_dir().unwrap(); + pg_install_dir = cwd.join("..").join("..").join(pg_install_dir); + } + + let pg_config_bin = pg_install_dir.join("bin").join("pg_config"); + if pg_config_bin.exists() { + let output = Command::new(pg_config_bin) + .arg("--includedir-server") + .output() + .expect("failed to execute `pg_config --includedir-server`"); + + if !output.status.success() { + panic!("`pg_config --includedir-server` failed") + } + + inc_server_path = String::from_utf8(output.stdout).unwrap().trim_end().into(); + } else { + inc_server_path = pg_install_dir + .join("include") + .join("postgresql") + .join("server") + .into_os_string() + .into_string() + .unwrap(); + } + // The bindgen::Builder is the main entry point // to bindgen, and lets you build up options for // the resulting bindings. @@ -81,15 +121,7 @@ fn main() { // explicit padding fields. .explicit_padding(true) // - // Path the server include dir. It is in tmp_install/include/server, if you did - // "configure --prefix=". But if you used "configure --prefix=/", - // and used DESTDIR to move it into tmp_install, then it's in - // tmp_install/include/postgres/server - // 'pg_config --includedir-server' would perhaps be the more proper way to find it, - // but this will do for now. - // - .clang_arg("-I../../tmp_install/include/server") - .clang_arg("-I../../tmp_install/include/postgresql/server") + .clang_arg(format!("-I{inc_server_path}")) // // Finish the builder and generate the bindings. 
// From 2b21d7b5bc7180f606982e401d3a5c4dd130b635 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 13 Jul 2022 12:51:20 +0300 Subject: [PATCH 0504/1022] Migrate from CircleCI to Github Actions: docker build and deploy (#1986) --- .circleci/config.yml | 368 ------------------ {.circleci => .github}/ansible/.gitignore | 0 {.circleci => .github}/ansible/ansible.cfg | 0 .../ansible/ansible.ssh.cfg | 0 {.circleci => .github}/ansible/deploy.yaml | 0 .../ansible/get_binaries.sh | 0 .../ansible/neon-stress.hosts | 0 .../ansible/production.hosts | 0 .../ansible/scripts/init_pageserver.sh | 0 .../ansible/scripts/init_safekeeper.sh | 0 {.circleci => .github}/ansible/staging.hosts | 0 .../ansible/systemd/pageserver.service | 0 .../ansible/systemd/safekeeper.service | 0 .../helm-values/neon-stress.proxy-scram.yaml | 0 .../helm-values/neon-stress.proxy.yaml | 0 .../helm-values/production.proxy-scram.yaml | 0 .../helm-values/production.proxy.yaml | 0 .../helm-values/staging.proxy-scram.yaml | 0 .../helm-values/staging.proxy.yaml | 0 .github/workflows/build_and_test.yml | 242 ++++++++++++ 20 files changed, 242 insertions(+), 368 deletions(-) rename {.circleci => .github}/ansible/.gitignore (100%) rename {.circleci => .github}/ansible/ansible.cfg (100%) rename {.circleci => .github}/ansible/ansible.ssh.cfg (100%) rename {.circleci => .github}/ansible/deploy.yaml (100%) rename {.circleci => .github}/ansible/get_binaries.sh (100%) rename {.circleci => .github}/ansible/neon-stress.hosts (100%) rename {.circleci => .github}/ansible/production.hosts (100%) rename {.circleci => .github}/ansible/scripts/init_pageserver.sh (100%) rename {.circleci => .github}/ansible/scripts/init_safekeeper.sh (100%) rename {.circleci => .github}/ansible/staging.hosts (100%) rename {.circleci => .github}/ansible/systemd/pageserver.service (100%) rename {.circleci => .github}/ansible/systemd/safekeeper.service (100%) rename {.circleci => .github}/helm-values/neon-stress.proxy-scram.yaml (100%) rename {.circleci => .github}/helm-values/neon-stress.proxy.yaml (100%) rename {.circleci => .github}/helm-values/production.proxy-scram.yaml (100%) rename {.circleci => .github}/helm-values/production.proxy.yaml (100%) rename {.circleci => .github}/helm-values/staging.proxy-scram.yaml (100%) rename {.circleci => .github}/helm-values/staging.proxy.yaml (100%) diff --git a/.circleci/config.yml b/.circleci/config.yml index 941849bb0e..00a51eb906 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -325,274 +325,6 @@ jobs: paths: - "*" - # Build neondatabase/neon:latest image and push it to Docker hub - docker-image: - docker: - - image: cimg/base:2021.04 - steps: - - checkout - - setup_remote_docker: - docker_layer_caching: true - - run: - name: Init postgres submodule - command: git submodule update --init --depth 1 - - run: - name: Build and push Docker image - command: | - echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin - DOCKER_TAG=$(git log --oneline|wc -l) - docker build \ - --pull \ - --build-arg GIT_VERSION=${CIRCLE_SHA1} \ - --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ - --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:latest . 
- docker push neondatabase/neon:${DOCKER_TAG} - docker push neondatabase/neon:latest - - # Build neondatabase/compute-node:latest image and push it to Docker hub - docker-image-compute: - docker: - - image: cimg/base:2021.04 - steps: - - checkout - - setup_remote_docker: - docker_layer_caching: true - - run: - name: Build and push compute-tools Docker image - command: | - echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin - docker build \ - --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ - --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/compute-tools:local \ - --tag neondatabase/compute-tools:latest \ - -f Dockerfile.compute-tools . - # Only push :latest image - docker push neondatabase/compute-tools:latest - - run: - name: Init postgres submodule - command: git submodule update --init --depth 1 - - run: - name: Build and push compute-node Docker image - command: | - echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin - DOCKER_TAG=$(git log --oneline|wc -l) - docker build --tag neondatabase/compute-node:${DOCKER_TAG} \ - --tag neondatabase/compute-node:latest vendor/postgres \ - --build-arg COMPUTE_TOOLS_TAG=local - docker push neondatabase/compute-node:${DOCKER_TAG} - docker push neondatabase/compute-node:latest - - # Build production neondatabase/neon:release image and push it to Docker hub - docker-image-release: - docker: - - image: cimg/base:2021.04 - steps: - - checkout - - setup_remote_docker: - docker_layer_caching: true - - run: - name: Init postgres submodule - command: git submodule update --init --depth 1 - - run: - name: Build and push Docker image - command: | - echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin - DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build \ - --pull \ - --build-arg GIT_VERSION=${CIRCLE_SHA1} \ - --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ - --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:release . - docker push neondatabase/neon:${DOCKER_TAG} - docker push neondatabase/neon:release - - # Build production neondatabase/compute-node:release image and push it to Docker hub - docker-image-compute-release: - docker: - - image: cimg/base:2021.04 - steps: - - checkout - - setup_remote_docker: - docker_layer_caching: true - - run: - name: Build and push compute-tools Docker image - command: | - echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin - docker build \ - --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ - --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/compute-tools:release \ - --tag neondatabase/compute-tools:local \ - -f Dockerfile.compute-tools . 
- # Only push :release image - docker push neondatabase/compute-tools:release - - run: - name: Init postgres submodule - command: git submodule update --init --depth 1 - - run: - name: Build and push compute-node Docker image - command: | - echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin - DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build --tag neondatabase/compute-node:${DOCKER_TAG} \ - --tag neondatabase/compute-node:release vendor/postgres \ - --build-arg COMPUTE_TOOLS_TAG=local - docker push neondatabase/compute-node:${DOCKER_TAG} - docker push neondatabase/compute-node:release - - deploy-staging: - docker: - - image: cimg/python:3.10 - steps: - - checkout - - setup_remote_docker - - run: - name: Setup ansible - command: | - pip install --progress-bar off --user ansible boto3 - - run: - name: Redeploy - command: | - cd "$(pwd)/.circleci/ansible" - - ./get_binaries.sh - - echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key - echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm -f ssh-key ssh-key-cert.pub - - ansible-playbook deploy.yaml -i staging.hosts - rm -f neon_install.tar.gz .neon_current_version - - deploy-staging-proxy: - docker: - - image: cimg/base:2021.04 - environment: - KUBECONFIG: .kubeconfig - steps: - - checkout - - run: - name: Store kubeconfig file - command: | - echo "${STAGING_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - run: - name: Setup helm v3 - command: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - run: - name: Re-deploy proxy - command: | - DOCKER_TAG=$(git log --oneline|wc -l) - helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s - helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s - - deploy-neon-stress: - docker: - - image: cimg/python:3.10 - steps: - - checkout - - setup_remote_docker - - run: - name: Setup ansible - command: | - pip install --progress-bar off --user ansible boto3 - - run: - name: Redeploy - command: | - cd "$(pwd)/.circleci/ansible" - - ./get_binaries.sh - - echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key - echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm -f ssh-key ssh-key-cert.pub - - ansible-playbook deploy.yaml -i neon-stress.hosts - rm -f neon_install.tar.gz .neon_current_version - - deploy-neon-stress-proxy: - docker: - - image: cimg/base:2021.04 - environment: - KUBECONFIG: .kubeconfig - steps: - - checkout - - run: - name: Store kubeconfig file - command: | - echo "${NEON_STRESS_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - run: - name: Setup helm v3 - command: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - run: - name: Re-deploy proxy - command: | - DOCKER_TAG=$(git log --oneline|wc -l) - helm upgrade neon-stress-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait - helm upgrade neon-stress-proxy-scram neondatabase/neon-proxy --install 
-f .circleci/helm-values/neon-stress.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait - - deploy-release: - docker: - - image: cimg/python:3.10 - steps: - - checkout - - setup_remote_docker - - run: - name: Setup ansible - command: | - pip install --progress-bar off --user ansible boto3 - - run: - name: Redeploy - command: | - cd "$(pwd)/.circleci/ansible" - - RELEASE=true ./get_binaries.sh - - echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key - echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm -f ssh-key ssh-key-cert.pub - - ansible-playbook deploy.yaml -i production.hosts - rm -f neon_install.tar.gz .neon_current_version - - deploy-release-proxy: - docker: - - image: cimg/base:2021.04 - environment: - KUBECONFIG: .kubeconfig - steps: - - checkout - - run: - name: Store kubeconfig file - command: | - echo "${PRODUCTION_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - run: - name: Setup helm v3 - command: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - run: - name: Re-deploy proxy - command: | - DOCKER_TAG="release-$(git log --oneline|wc -l)" - helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait - helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait - workflows: build_and_test: jobs: @@ -635,103 +367,3 @@ workflows: save_perf_report: true requires: - build-neon-release - - docker-image: - # Context gives an ability to login - context: Docker Hub - # Build image only for commits to main - filters: - branches: - only: - - main - requires: - - pg_regress-tests-release - - other-tests-release - - docker-image-compute: - # Context gives an ability to login - context: Docker Hub - # Build image only for commits to main - filters: - branches: - only: - - main - requires: - - pg_regress-tests-release - - other-tests-release - - deploy-staging: - # Context gives an ability to login - context: Docker Hub - # deploy only for commits to main - filters: - branches: - only: - - main - requires: - - docker-image - - deploy-staging-proxy: - # deploy only for commits to main - filters: - branches: - only: - - main - requires: - - docker-image - - - deploy-neon-stress: - # Context gives an ability to login - context: Docker Hub - # deploy only for commits to main - filters: - branches: - only: - - main - requires: - - docker-image - - deploy-neon-stress-proxy: - # deploy only for commits to main - filters: - branches: - only: - - main - requires: - - docker-image - - - docker-image-release: - # Context gives an ability to login - context: Docker Hub - # Build image only for commits to main - filters: - branches: - only: - - release - requires: - - pg_regress-tests-release - - other-tests-release - - docker-image-compute-release: - # Context gives an ability to login - context: Docker Hub - # Build image only for commits to main - filters: - branches: - only: - - release - requires: - - pg_regress-tests-release - - other-tests-release - - deploy-release: - # Context gives an ability to login - context: Docker Hub - # deploy only for commits to main - filters: - branches: - only: - - release - requires: - - docker-image-release - - deploy-release-proxy: - # deploy only for commits to 
main - filters: - branches: - only: - - release - requires: - - docker-image-release diff --git a/.circleci/ansible/.gitignore b/.github/ansible/.gitignore similarity index 100% rename from .circleci/ansible/.gitignore rename to .github/ansible/.gitignore diff --git a/.circleci/ansible/ansible.cfg b/.github/ansible/ansible.cfg similarity index 100% rename from .circleci/ansible/ansible.cfg rename to .github/ansible/ansible.cfg diff --git a/.circleci/ansible/ansible.ssh.cfg b/.github/ansible/ansible.ssh.cfg similarity index 100% rename from .circleci/ansible/ansible.ssh.cfg rename to .github/ansible/ansible.ssh.cfg diff --git a/.circleci/ansible/deploy.yaml b/.github/ansible/deploy.yaml similarity index 100% rename from .circleci/ansible/deploy.yaml rename to .github/ansible/deploy.yaml diff --git a/.circleci/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh similarity index 100% rename from .circleci/ansible/get_binaries.sh rename to .github/ansible/get_binaries.sh diff --git a/.circleci/ansible/neon-stress.hosts b/.github/ansible/neon-stress.hosts similarity index 100% rename from .circleci/ansible/neon-stress.hosts rename to .github/ansible/neon-stress.hosts diff --git a/.circleci/ansible/production.hosts b/.github/ansible/production.hosts similarity index 100% rename from .circleci/ansible/production.hosts rename to .github/ansible/production.hosts diff --git a/.circleci/ansible/scripts/init_pageserver.sh b/.github/ansible/scripts/init_pageserver.sh similarity index 100% rename from .circleci/ansible/scripts/init_pageserver.sh rename to .github/ansible/scripts/init_pageserver.sh diff --git a/.circleci/ansible/scripts/init_safekeeper.sh b/.github/ansible/scripts/init_safekeeper.sh similarity index 100% rename from .circleci/ansible/scripts/init_safekeeper.sh rename to .github/ansible/scripts/init_safekeeper.sh diff --git a/.circleci/ansible/staging.hosts b/.github/ansible/staging.hosts similarity index 100% rename from .circleci/ansible/staging.hosts rename to .github/ansible/staging.hosts diff --git a/.circleci/ansible/systemd/pageserver.service b/.github/ansible/systemd/pageserver.service similarity index 100% rename from .circleci/ansible/systemd/pageserver.service rename to .github/ansible/systemd/pageserver.service diff --git a/.circleci/ansible/systemd/safekeeper.service b/.github/ansible/systemd/safekeeper.service similarity index 100% rename from .circleci/ansible/systemd/safekeeper.service rename to .github/ansible/systemd/safekeeper.service diff --git a/.circleci/helm-values/neon-stress.proxy-scram.yaml b/.github/helm-values/neon-stress.proxy-scram.yaml similarity index 100% rename from .circleci/helm-values/neon-stress.proxy-scram.yaml rename to .github/helm-values/neon-stress.proxy-scram.yaml diff --git a/.circleci/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml similarity index 100% rename from .circleci/helm-values/neon-stress.proxy.yaml rename to .github/helm-values/neon-stress.proxy.yaml diff --git a/.circleci/helm-values/production.proxy-scram.yaml b/.github/helm-values/production.proxy-scram.yaml similarity index 100% rename from .circleci/helm-values/production.proxy-scram.yaml rename to .github/helm-values/production.proxy-scram.yaml diff --git a/.circleci/helm-values/production.proxy.yaml b/.github/helm-values/production.proxy.yaml similarity index 100% rename from .circleci/helm-values/production.proxy.yaml rename to .github/helm-values/production.proxy.yaml diff --git a/.circleci/helm-values/staging.proxy-scram.yaml 
b/.github/helm-values/staging.proxy-scram.yaml similarity index 100% rename from .circleci/helm-values/staging.proxy-scram.yaml rename to .github/helm-values/staging.proxy-scram.yaml diff --git a/.circleci/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml similarity index 100% rename from .circleci/helm-values/staging.proxy.yaml rename to .github/helm-values/staging.proxy.yaml diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 78aa163f3e..857e9e3533 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -390,3 +390,245 @@ jobs: \"remote_repo\": \"${{ github.repository }}\" } }" + + docker-image: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ pg_regress-tests, other-tests ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + outputs: + build-tag: ${{steps.build-tag.outputs.tag}} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + with: + driver: docker + + - name: Get build tag + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "::set-output name=tag::$(git rev-list --count HEAD)" + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "::set-output name=tag::release-$(git rev-list --count HEAD)" + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + id: build-tag + + - name: Get legacy build tag + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "::set-output name=tag::latest + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "::set-output name=tag::release + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + id: legacy-build-tag + + - name: Build compute-tools Docker image + uses: docker/build-push-action@v2 + with: + context: . 
+ build-args: | + GIT_VERSION="${GITHUB_SHA}" + AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" + AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" + pull: true + push: true + tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}} + + docker-image-compute: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ pg_regress-tests, other-tests ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + outputs: + build-tag: ${{steps.build-tag.outputs.tag}} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + with: + driver: docker + + - name: Get build tag + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "::set-output name=tag::$(git rev-list --count HEAD)" + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "::set-output name=tag::release-$(git rev-list --count HEAD)" + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + id: build-tag + + - name: Get legacy build tag + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "::set-output name=tag::latest + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "::set-output name=tag::release + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + id: legacy-build-tag + + - name: Build compute-tools Docker image + uses: docker/build-push-action@v2 + with: + context: . + build-args: | + AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" + AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" + push: false + file: Dockerfile.compute-tools + tags: neondatabase/compute-tools:local + + - name: Push compute-tools Docker image + uses: docker/build-push-action@v2 + with: + context: . 
+ build-args: | + AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" + AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" + push: true + file: Dockerfile.compute-tools + tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}} + + - name: Build compute-node Docker image + uses: docker/build-push-action@v2 + with: + context: ./vendor/postgres/ + build-args: + COMPUTE_TOOLS_TAG=local + push: true + tags: neondatabase/compute-node:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/compute-node:${{steps.build-tag.outputs.tag}} + + calculate-deploy-targets: + runs-on: [ self-hosted, Linux, k8s-runner ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + outputs: + matrix-include: ${{ steps.set-matrix.outputs.include }} + steps: + - id: set-matrix + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}' + NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}' + echo "::set-output name=include::[$STAGING, $NEON_STRESS]" + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}' + echo "::set-output name=include::[$PRODUCTION]" + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + deploy: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ docker-image, calculate-deploy-targets ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + strategy: + matrix: + include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Setup ansible + run: | + pip install --progress-bar off --user ansible boto3 + + - name: Redeploy + run: | + cd "$(pwd)/.github/ansible" + + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + ./get_binaries.sh + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + RELEASE=true ./get_binaries.sh + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + eval $(ssh-agent) + echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key + echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub + chmod 0600 ssh-key + ssh-add ssh-key + rm -f ssh-key ssh-key-cert.pub + + ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts + rm -f neon_install.tar.gz .neon_current_version + + deploy-proxy: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ docker-image, calculate-deploy-targets ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + strategy: + matrix: + include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Store kubeconfig file + run: | + echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Setup helm v3 + run: | + curl -s 
https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Re-deploy proxy + run: | + DOCKER_TAG=${{needs.docker-image.outputs.build-tag}} + helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s From 07acd6dddef81e5d67c20ea2b5ce343eaabaa1df Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 13 Jul 2022 14:12:11 +0100 Subject: [PATCH 0505/1022] Fix clippy warnings in postgres_ffi/build.rs (#2081) --- libs/postgres_ffi/build.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 703d972480..c6df4fc0b0 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -50,8 +50,6 @@ fn main() { // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/tmp_install` // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/tmp_install/include/postgresql/server` let mut pg_install_dir: PathBuf; - let inc_server_path: String; - if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { pg_install_dir = postgres_install_dir.into(); } else { @@ -64,7 +62,7 @@ fn main() { } let pg_config_bin = pg_install_dir.join("bin").join("pg_config"); - if pg_config_bin.exists() { + let inc_server_path: String = if pg_config_bin.exists() { let output = Command::new(pg_config_bin) .arg("--includedir-server") .output() @@ -74,16 +72,16 @@ fn main() { panic!("`pg_config --includedir-server` failed") } - inc_server_path = String::from_utf8(output.stdout).unwrap().trim_end().into(); + String::from_utf8(output.stdout).unwrap().trim_end().into() } else { - inc_server_path = pg_install_dir + pg_install_dir .join("include") .join("postgresql") .join("server") .into_os_string() .into_string() - .unwrap(); - } + .unwrap() + }; // The bindgen::Builder is the main entry point // to bindgen, and lets you build up options for From f8a64512df619cd8f73d937a93640d102271312d Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 11 Jul 2022 19:41:08 +0200 Subject: [PATCH 0506/1022] [compute_tools] Set public schema owner to db owner (#2058) Otherwise, it does not have a control on it, which is reasonable thing to have and some users already hit it. 
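
(Illustrative note, not part of the patch.) The per-database adjustment boils down to connecting to each database and handing the `public` schema to that database's owner, while tolerating databases where the schema was dropped. A rough Python/psycopg2 sketch of the same steps, with hypothetical connection parameters and database list (the change itself implements this in Rust in compute_tools/src/spec.rs, driven by the compute spec):

    import psycopg2
    from psycopg2 import errors, sql

    # Hypothetical (database, owner) pairs; the actual values come from the cluster spec.
    databases = [("main", "app_owner"), ("analytics", "analytics_owner")]

    for dbname, owner in databases:
        conn = psycopg2.connect(host="127.0.0.1", port=55432, user="cloud_admin", dbname=dbname)
        conn.autocommit = True
        try:
            with conn.cursor() as cur:
                # Only the schema itself changes owner; objects inside keep their owners.
                cur.execute(sql.SQL("ALTER SCHEMA public OWNER TO {}").format(sql.Identifier(owner)))
        except errors.InvalidSchemaName:
            # No `public` schema in this database (perhaps dropped by the user) -- that's fine.
            pass
        finally:
            conn.close()
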
--- compute_tools/src/compute.rs | 2 +- compute_tools/src/spec.rs | 39 ++++++++++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 8bcaf5494a..1e812f2aa0 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -295,7 +295,7 @@ impl ComputeNode { handle_roles(&self.spec, &mut client)?; handle_databases(&self.spec, &mut client)?; handle_role_deletions(self, &mut client)?; - handle_grants(&self.spec, &mut client)?; + handle_grants(self, &mut client)?; create_writablity_check_data(&mut client)?; // 'Close' connection diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 041f42acde..71ae3f4ca9 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,7 +1,8 @@ use std::path::Path; -use anyhow::Result; +use anyhow::{anyhow, Result}; use log::{info, log_enabled, warn, Level}; +use postgres::error::SqlState; use postgres::{Client, NoTls}; use serde::Deserialize; @@ -349,9 +350,11 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { Ok(()) } -// Grant CREATE ON DATABASE to the database owner -// to allow clients create trusted extensions. -pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> { +/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants +/// to allow users creating trusted extensions and re-creating `public` schema, for example. +pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { + let spec = &node.spec; + info!("cluster spec grants:"); // We now have a separate `web_access` role to connect to the database @@ -380,5 +383,33 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> { client.execute(query.as_str(), &[])?; } + // Do some per-database access adjustments. We'd better do this at db creation time, + // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants + // atomically. + let mut db_connstr = node.connstr.clone(); + for db in &node.spec.cluster.databases { + // database name is always the last and the only component of the path + db_connstr.set_path(&db.name); + + let mut db_client = Client::connect(db_connstr.as_str(), NoTls)?; + + // This will only change ownership on the schema itself, not the objects + // inside it. Without it owner of the `public` schema will be `cloud_admin` + // and database owner cannot do anything with it. + let alter_query = format!("ALTER SCHEMA public OWNER TO {}", db.owner.quote()); + let res = db_client.simple_query(&alter_query); + + if let Err(e) = res { + if e.code() == Some(&SqlState::INVALID_SCHEMA_NAME) { + // This is OK, db just don't have a `public` schema. + // Probably user dropped it manually. 
+ info!("no 'public' schema found in the database {}", db.name); + } else { + // Something different happened, propagate the error + return Err(anyhow!(e)); + } + } + } + Ok(()) } From 968c20ca5f060eccc0930dc98ea358ab67df62c6 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 13 Jul 2022 21:22:44 +0300 Subject: [PATCH 0507/1022] Add zenith-1-ps-3 to prod inventory (#2084) --- .github/ansible/production.hosts | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ansible/production.hosts b/.github/ansible/production.hosts index 6a3a7791ad..d22ce0e37e 100644 --- a/.github/ansible/production.hosts +++ b/.github/ansible/production.hosts @@ -1,6 +1,7 @@ [pageservers] #zenith-1-ps-1 console_region_id=1 zenith-1-ps-2 console_region_id=1 +zenith-1-ps-3 console_region_id=1 [safekeepers] zenith-1-sk-1 console_region_id=1 From 9a7427c203f919891e3a8713a21672e3ed1da03b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 14 Jul 2022 02:15:43 +0300 Subject: [PATCH 0508/1022] Fill build-args for Docker builds via GH Actions context --- .github/workflows/build_and_test.yml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 857e9e3533..e4858c1fe9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -441,14 +441,14 @@ jobs: fi id: legacy-build-tag - - name: Build compute-tools Docker image + - name: Build neon Docker image uses: docker/build-push-action@v2 with: context: . build-args: | - GIT_VERSION="${GITHUB_SHA}" - AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" - AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" + GIT_VERSION="${{github.sha}}" + AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" + AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" pull: true push: true tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}} @@ -508,8 +508,9 @@ jobs: with: context: . build-args: | - AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" - AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" + GIT_VERSION="${{github.sha}}" + AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" + AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" push: false file: Dockerfile.compute-tools tags: neondatabase/compute-tools:local @@ -519,8 +520,9 @@ jobs: with: context: . build-args: | - AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" - AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" + GIT_VERSION="${{github.sha}}" + AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" + AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" push: true file: Dockerfile.compute-tools tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}} From 12bac9c12bdcddfb73fec68fbe65a9013af3e588 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 13 Jul 2022 15:52:04 +0200 Subject: [PATCH 0509/1022] Wait for compute image before deploy in GitHub Action We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly. 
--- .github/workflows/build_and_test.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e4858c1fe9..75f5828ef4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -560,7 +560,11 @@ jobs: deploy: runs-on: [ self-hosted, Linux, k8s-runner ] - needs: [ docker-image, calculate-deploy-targets ] + # We need both storage **and** compute images for deploy, because control plane + # picks the compute version based on the storage version. If it notices a fresh + # storage it may bump the compute version. And if compute image failed to build + # it may break things badly. + needs: [ docker-image, docker-image-compute, calculate-deploy-targets ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' @@ -603,7 +607,9 @@ jobs: deploy-proxy: runs-on: [ self-hosted, Linux, k8s-runner ] - needs: [ docker-image, calculate-deploy-targets ] + # Compute image isn't strictly required for proxy deploy, but let's still wait for it + # to run all deploy jobs consistently. + needs: [ docker-image, docker-image-compute, calculate-deploy-targets ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' From 1b6a80a38f3961f95c8b96361367f9d827e39ae6 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Fri, 8 Jul 2022 20:18:24 +0300 Subject: [PATCH 0510/1022] Fix flaky test_concurrent_computes * Wait for all computes (except one) to complete before proceeding with the single compute. * It previously waited for too few seconds. As the test is randomized, it was not failing all the time, but only in specific unlucky cases. E.g. when there were no successfuly queries by concurrent computes, and the single node had big timeouts and spent lots of time making the transaction. See https://github.com/neondatabase/neon/runs/7234456482?check_suite_focus=true (around line 980). * Wait for exactly one extra transaction by the single compute. --- .../batch_others/test_wal_acceptor_async.py | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index 4664c332fc..d74ef8840a 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -302,6 +302,8 @@ def test_compute_restarts(neon_env_builder: NeonEnvBuilder): class BackgroundCompute(object): + MAX_QUERY_GAP_SECONDS = 2 + def __init__(self, index: int, env: NeonEnv, branch: str): self.index = index self.env = env @@ -339,7 +341,7 @@ class BackgroundCompute(object): # With less sleep, there is a very big chance of not committing # anything or only 1 xact during test run. 
- await asyncio.sleep(2 * random.random()) + await asyncio.sleep(random.uniform(0, self.MAX_QUERY_GAP_SECONDS)) self.running = False @@ -356,20 +358,34 @@ async def run_concurrent_computes(env: NeonEnv, background_tasks = [asyncio.create_task(compute.run()) for compute in computes] await asyncio.sleep(run_seconds) + log.info("stopping all tasks but one") for compute in computes[1:]: compute.stopped = True + await asyncio.gather(*background_tasks[1:]) log.info("stopped all tasks but one") # work for some time with only one compute -- it should be able to make some xacts - await asyncio.sleep(8) + TIMEOUT_SECONDS = computes[0].MAX_QUERY_GAP_SECONDS + 3 + initial_queries_by_0 = len(computes[0].successful_queries) + log.info(f'Waiting for another query by computes[0], ' + f'it already had {initial_queries_by_0}, timeout is {TIMEOUT_SECONDS}s') + for _ in range(10 * TIMEOUT_SECONDS): + current_queries_by_0 = len(computes[0].successful_queries) - initial_queries_by_0 + if current_queries_by_0 >= 1: + log.info(f'Found {current_queries_by_0} successful queries ' + f'by computes[0], completing the test') + break + await asyncio.sleep(0.1) + else: + assert False, "Timed out while waiting for another query by computes[0]" computes[0].stopped = True - await asyncio.gather(*background_tasks) + await asyncio.gather(background_tasks[0]) result = await exec_compute_query(env, branch, 'SELECT * FROM query_log') # we should have inserted something while single compute was running - assert len(result) >= 4 - log.info(f'Executed {len(result)} queries') + log.info(f'Executed {len(result)} queries, {current_queries_by_0} of them ' + f'by computes[0] after we started stopping the others') for row in result: log.info(f'{row[0]} {row[1]} {row[2]}') From c004a6d62fc9b45c4ef6f8ed6ba1016101d4807d Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 14 Jul 2022 12:46:38 +0300 Subject: [PATCH 0511/1022] Do not cancel in-progress checks on the `main` branch See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#concurrency * Previously there was a single concurrency group per each branch. As the `main` branch got pushed into frequently, very few commits got tested to the end. It resulted in "broken" `main` branch as there were no fully successful workflow runs. Now the `main` branch gets a separate concurrency group for each commit. * As GitHub Actions syntax does not have the conditional operator, it is emulated via logical and/or operations. Although undocumented, they return one of their operands instead of plain true/false. * Replace 3-space indentation with 2-space indentation while we are here to be consistent with the rest of the file. --- .github/workflows/build_and_test.yml | 5 +++-- .github/workflows/codestyle.yml | 5 +++-- .github/workflows/pg_clients.yml | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 75f5828ef4..3a12d19428 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -11,8 +11,9 @@ defaults: shell: bash -ex {0} concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + # Allow only one workflow per any non-`main` branch. 
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true env: RUST_BACKTRACE: 1 diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 2b8a01e94e..345c1d5397 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -11,8 +11,9 @@ defaults: shell: bash -ex {0} concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + # Allow only one workflow per any non-`main` branch. + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true env: RUST_BACKTRACE: 1 diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index fe4dbea8ac..4ff31ac508 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -13,8 +13,9 @@ on: workflow_dispatch: concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + # Allow only one workflow per any non-`main` branch. + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true jobs: test-postgres-client-libs: From 79f5685d00284fe78efac25605e9bbd7646c95c7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 14 Jul 2022 14:06:10 +0300 Subject: [PATCH 0512/1022] Enable basic optimizations even in 'dev' builds. Change the build options to enable basic optimizations even in debug mode, and always build dependencies with more optimizations. That makes the debug-mode binaries somewhat faster, without messing up stack traces and line-by-line debugging too much. --- .cargo/config.toml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .cargo/config.toml diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 0000000000..76a2ff549e --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,13 @@ +# The binaries are really slow, if you compile them in 'dev' mode with the defaults. +# Enable some optimizations even in 'dev' mode, to make tests faster. The basic +# optimizations enabled by "opt-level=1" don't affect debuggability too much. +# +# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/ +# +[profile.dev.package."*"] +# Set the default for dependencies in Development mode. +opt-level = 3 + +[profile.dev] +# Turn on a small amount of optimization in Development mode. +opt-level = 1 From a342957aee30ee5cab076ea067bd5996343fb9d0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 14 Jul 2022 20:47:35 +0300 Subject: [PATCH 0513/1022] Use ok_or_else() instead of ok_or(), to silence clippy warnings. "cargo clippy" started to complain about these, after running "cargo update". Not sure why it didn't complain before, but seems reasonable to fix these. (The "cargo update" is not included in this commit) --- pageserver/src/storage_sync/download.rs | 4 +--- safekeeper/src/broker.rs | 4 +++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 0f2bdd3bcb..12c7f4384b 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -122,9 +122,7 @@ where download_index_parts(conf, storage, sync_ids) .await .remove(&tenant_id) - .ok_or(anyhow::anyhow!( - "Missing tenant index parts. This is a bug." 
- )) + .ok_or_else(|| anyhow::anyhow!("Missing tenant index parts. This is a bug.")) } /// Retrieves index data from the remote storage for a given timeline. diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 8e0eb971f3..ce66131700 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -83,7 +83,9 @@ impl ElectionLeader { ) -> Result { let resp = self.client.leader(election_name).await?; - let kv = resp.kv().ok_or(anyhow!("failed to get leader response"))?; + let kv = resp + .kv() + .ok_or_else(|| anyhow!("failed to get leader response"))?; let leader = kv.value_str()?; Ok(leader == candidate_name) From 0886aced86b2b63cab4c75342451e707f8a300c8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 14 Jul 2022 20:47:38 +0300 Subject: [PATCH 0514/1022] Update dependencies. - Updated dependencies with "cargo update" - Updated workspace_hack with "cargo hakari generate" There's no particular reason to do this now, just a periodic refresh. --- Cargo.lock | 776 +++++++++++++++++++------------------- workspace_hack/Cargo.toml | 10 +- 2 files changed, 391 insertions(+), 395 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4f1d727ae1..4f453678e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -48,9 +48,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.53" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94a45b455c14666b85fc40a019e8ab9eb75e3a124e05494f5397122bc9eb06e0" +checksum = "bb07d2053ccdbe10e2af2995a2f116c1330396493dc1269f6a91d0ae82e19704" dependencies = [ "backtrace", ] @@ -77,7 +77,7 @@ dependencies = [ "num-traits", "rusticata-macros", "thiserror", - "time 0.3.9", + "time 0.3.11", ] [[package]] @@ -126,9 +126,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.52" +version = "0.1.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061a7acccaa286c011ddc30970520b98fa40e00c9d644633fb26b5fc63a265e3" +checksum = "96cf8829f67d2eab0b2dfa42c5d0ef737e0724e4a82b01b3e292456202b19716" dependencies = [ "proc-macro2", "quote", @@ -154,9 +154,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "axum" -version = "0.5.4" +version = "0.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4af7447fc1214c1f3a1ace861d0216a6c8bb13965b64bbad9650f375b67689a" +checksum = "d16705af05732b7d3258ec0f7b73c03a658a28925e050d8852d5b568ee8bcf4e" dependencies = [ "async-trait", "axum-core", @@ -166,7 +166,7 @@ dependencies = [ "http", "http-body", "hyper", - "itoa 1.0.1", + "itoa 1.0.2", "matchit", "memchr", "mime", @@ -183,9 +183,9 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.2.3" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bdc19781b16e32f8a7200368a336fa4509d4b72ef15dd4e41df5290855ee1e6" +checksum = "e4f44a0e6200e9d11a1cdc989e4b358f6e3d354fbf48478f345a17f4e43f8635" dependencies = [ "async-trait", "bytes", @@ -197,15 +197,15 @@ dependencies = [ [[package]] name = "backtrace" -version = "0.3.64" +version = "0.3.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e121dee8023ce33ab248d9ce1493df03c3b38a659b240096fcbd7048ff9c31f" +checksum = "cab84319d616cfb654d03394f38ab7e6f0919e181b1b57e1fd15e7fb4077d9a7" dependencies = [ "addr2line", "cc", "cfg-if", "libc", - "miniz_oxide 0.4.4", + "miniz_oxide", "object", "rustc-demangle", ] @@ -292,15 +292,15 @@ dependencies = [ [[package]] name = 
"bumpalo" -version = "3.9.1" +version = "3.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a45a46ab1f2412e53d3a0ade76ffad2025804294569aae387231a0cd6e0899" +checksum = "37ccbd214614c6783386c1af30caf03192f17891059cecc394b4fb119e363de3" [[package]] name = "bytemuck" -version = "1.9.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdead85bdec19c194affaeeb670c0e41fe23de31459efd1c174d049269cf02cc" +checksum = "c53dfa917ec274df8ed3c572698f381a24eef2efba9492d797301b72b6db408a" [[package]] name = "byteorder" @@ -327,10 +327,16 @@ dependencies = [ ] [[package]] -name = "cc" -version = "1.0.72" +name = "cast" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" [[package]] name = "cexpr" @@ -363,9 +369,9 @@ dependencies = [ [[package]] name = "clang-sys" -version = "1.3.1" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cc00842eed744b858222c4c9faf7243aafc6d33f92f96935263ef4d8a41ce21" +checksum = "5a050e2153c5be08febd6734e29298e844fdb0fa21aeddd63b4eb7baa106c69b" dependencies = [ "glob", "libc", @@ -389,17 +395,26 @@ dependencies = [ [[package]] name = "clap" -version = "3.0.14" +version = "3.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b63edc3f163b3c71ec8aa23f9bd6070f77edbf3d1d198b164afa90ff00e4ec62" +checksum = "ab8b79fe3946ceb4a0b1c080b4018992b8d27e9ff363644c1c9b6387c854614d" dependencies = [ "atty", "bitflags", + "clap_lex", "indexmap", - "os_str_bytes", "strsim 0.10.0", "termcolor", - "textwrap 0.14.2", + "textwrap 0.15.0", +] + +[[package]] +name = "clap_lex" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" +dependencies = [ + "os_str_bytes", ] [[package]] @@ -423,9 +438,9 @@ dependencies = [ [[package]] name = "combine" -version = "4.6.3" +version = "4.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50b727aacc797f9fc28e355d21f34709ac4fc9adecfe470ad07b8f4464f53062" +checksum = "2a604e93b79d1808327a6fca85a6f2d69de66461e7620f5a4cbf5fb4d1d7c948" dependencies = [ "bytes", "memchr", @@ -449,7 +464,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 3.0.14", + "clap 3.2.12", "env_logger", "hyper", "libc", @@ -467,9 +482,9 @@ dependencies = [ [[package]] name = "const_format" -version = "0.2.22" +version = "0.2.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22bc6cd49b0ec407b680c3e380182b6ac63b73991cb7602de350352fc309b614" +checksum = "939dc9e2eb9077e0679d2ce32de1ded8531779360b003b4a972a7a39ec263495" dependencies = [ "const_format_proc_macros", ] @@ -534,18 +549,18 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" +checksum = "59a6001667ab124aebae2a495118e11d30984c3a653e99d86d58971708cf5e4b" dependencies = [ "libc", ] [[package]] name = "crc32c" -version = "0.6.1" +version = 
"0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee6b9c9389584bcba988bd0836086789b7f87ad91892d6a83d5291dbb24524b5" +checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" dependencies = [ "rustc_version", ] @@ -561,12 +576,12 @@ dependencies = [ [[package]] name = "criterion" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" +checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f" dependencies = [ "atty", - "cast", + "cast 0.3.0", "clap 2.34.0", "criterion-plot", "csv", @@ -591,15 +606,15 @@ version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" dependencies = [ - "cast", + "cast 0.2.7", "itertools", ] [[package]] name = "crossbeam-channel" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aaa7bd5fb665c6864b5f963dd9097905c54125909c7aa94c9e18507cdbe6c53" +checksum = "4c02a4d71819009c192cf4872265391563fd6a84c81ff2c0f2a7026ca4c1d85c" dependencies = [ "cfg-if", "crossbeam-utils", @@ -618,26 +633,26 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.8" +version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1145cf131a2c6ba0615079ab6a638f7e1973ac9c2634fcbeaaad6114246efe8c" +checksum = "07db9d94cbd326813772c968ccd25999e5f8ae22f4f8d1b11effa37ef6ce281d" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "lazy_static", "memoffset", + "once_cell", "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.7" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e5bed1f1c269533fa816a0a5492b3545209a205ca1a54842be180eb63a16a6" +checksum = "7d82ee10ce34d7bc12c2122495e7593a9c41347ecdd64185af4ecf72cb1a7f83" dependencies = [ "cfg-if", - "lazy_static", + "once_cell", ] [[package]] @@ -650,7 +665,7 @@ dependencies = [ "crossterm_winapi", "libc", "mio", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "signal-hook", "signal-hook-mio", "winapi", @@ -667,9 +682,9 @@ dependencies = [ [[package]] name = "crypto-common" -version = "0.1.3" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57952ca27b5e3606ff4dd79b0020231aaf9d6aa76dc05fd30137538c50bd3ce8" +checksum = "2ccfd8c0ee4cce11e45b3fd6f9d5e69e0cc62912aa6a0cb1bf4617b0eba5a12f" dependencies = [ "generic-array", "typenum", @@ -719,9 +734,9 @@ dependencies = [ [[package]] name = "darling" -version = "0.13.1" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0d720b8683f8dd83c65155f0530560cba68cd2bf395f6513a483caee57ff7f4" +checksum = "a01d95850c592940db9b8194bc39f4bc0e89dee5c4265e4b1807c34a9aba453c" dependencies = [ "darling_core", "darling_macro", @@ -729,9 +744,9 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.13.1" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a340f241d2ceed1deb47ae36c4144b2707ec7dd0b649f894cb39bb595986324" +checksum = "859d65a907b6852c9361e3185c862aae7fafd2887876799fa55f5f99dc40d610" dependencies = [ "fnv", "ident_case", @@ -743,9 +758,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.13.1" +version = "0.13.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "72c41b3b7352feb3211a0d743dc5700a4e3b60f51bd2b368892d1e0f9a95f44b" +checksum = "9c972679f83bdf9c42bd905396b6c3588a843a17f0f16dfcfa3e2c5d57441835" dependencies = [ "darling_core", "quote", @@ -835,15 +850,15 @@ dependencies = [ [[package]] name = "either" -version = "1.6.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +checksum = "3f107b87b6afc2a64fd13cac55fe06d6c8859f12d4b14cbcdd2c67d0976781be" [[package]] name = "encoding_rs" -version = "0.8.30" +version = "0.8.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dc8abb250ffdda33912550faa54c88ec8b998dec0b2c55ab224921ce11df" +checksum = "9852635589dc9f9ea1b6fe9f05b50ef208c85c834a562f0c6abb1c475736ec2b" dependencies = [ "cfg-if", ] @@ -863,9 +878,9 @@ dependencies = [ [[package]] name = "etcd-client" -version = "0.9.1" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c434d2800b273a506b82397aad2f20971636f65e47b27c027f77d498530c5954" +checksum = "9fb8664f6ea68aba5503d42dd1be786b0f1bd9b7972e7f40208c83ef74db91bf" dependencies = [ "http", "prost", @@ -922,14 +937,14 @@ dependencies = [ [[package]] name = "filetime" -version = "0.2.15" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "975ccf83d8d9d0d84682850a38c8169027be83368805971cc4f238c2b245bc98" +checksum = "e94a7bbaa59354bc20dd75b67f23e2797b4490e9d6928203fb105c79e448c86c" dependencies = [ "cfg-if", "libc", "redox_syscall", - "winapi", + "windows-sys", ] [[package]] @@ -946,21 +961,9 @@ dependencies = [ [[package]] name = "fixedbitset" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "279fb028e20b3c4c320317955b77c5e0c9701f05a1d309905d6fc702cdc5053e" - -[[package]] -name = "flate2" -version = "1.0.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b39522e96686d38f4bc984b9198e3a0613264abaebaff2c5c918bfa6b6da09af" -dependencies = [ - "cfg-if", - "crc32fast", - "libc", - "miniz_oxide 0.5.1", -] +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "fnv" @@ -1104,13 +1107,13 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.4" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418d37c8b1d42553c93648be529cb70f920d3baf8ef469b74b9638df426e0b4c" +checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" dependencies = [ "cfg-if", "libc", - "wasi 0.10.0+wasi-snapshot-preview1", + "wasi 0.11.0+wasi-snapshot-preview1", ] [[package]] @@ -1149,9 +1152,9 @@ checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" [[package]] name = "h2" -version = "0.3.11" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9f1f717ddc7b2ba36df7e871fd88db79326551d3d6f1fc406fbfd28b582ff8e" +checksum = "37a82c6d637fc9515a4694bbf1cb2457b79d81ce52b3108bdeea58b07dd34a57" dependencies = [ "bytes", "fnv", @@ -1162,7 +1165,7 @@ dependencies = [ "indexmap", "slab", "tokio", - "tokio-util 0.6.9", + "tokio-util", "tracing", ] @@ -1181,6 +1184,12 @@ dependencies = [ "ahash", ] +[[package]] +name = "hashbrown" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"607c8a29735385251a339424dd462993c0fed8fa09d378f259377df08c126022" + [[package]] name = "heck" version = "0.3.3" @@ -1241,20 +1250,20 @@ dependencies = [ [[package]] name = "http" -version = "0.2.6" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f4c6746584866f0feabcc69893c5b51beef3831656a968ed7ae254cdc4fd03" +checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" dependencies = [ "bytes", "fnv", - "itoa 1.0.1", + "itoa 1.0.2", ] [[package]] name = "http-body" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ff4f84919677303da5f147645dbea6b1881f368d03ac84e1dc09031ebd7b2c6" +checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ "bytes", "http", @@ -1269,9 +1278,9 @@ checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" [[package]] name = "httparse" -version = "1.6.0" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9100414882e15fb7feccb4897e5f0ff0ff1ca7d1a86a23208ada4d7a18e6c6c4" +checksum = "496ce29bb5a52785b44e0f7ca2847ae0bb839c9bd28f69acac9b99d461c0c04c" [[package]] name = "httpdate" @@ -1297,9 +1306,9 @@ dependencies = [ [[package]] name = "hyper" -version = "0.14.17" +version = "0.14.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "043f0e083e9901b6cc658a77d1eb86f4fc650bbb977a4337dd63192826aa85dd" +checksum = "02c929dc5c39e335a03c405292728118860721b10190d98c2a0f0efd5baafbac" dependencies = [ "bytes", "futures-channel", @@ -1310,7 +1319,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 1.0.1", + "itoa 1.0.2", "pin-project-lite", "socket2", "tokio", @@ -1376,12 +1385,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.8.0" +version = "1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282a6247722caba404c065016bbfa522806e51714c34f5dfc3e4a3a46fcb4223" +checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" dependencies = [ "autocfg", - "hashbrown", + "hashbrown 0.12.2", ] [[package]] @@ -1393,7 +1402,7 @@ dependencies = [ "ahash", "atty", "indexmap", - "itoa 1.0.1", + "itoa 1.0.2", "lazy_static", "log", "num-format", @@ -1413,9 +1422,9 @@ dependencies = [ [[package]] name = "ipnet" -version = "2.3.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9" +checksum = "879d54834c8c76457ef4293a689b2a8c59b076067ad77b15efafbb05f92a592b" [[package]] name = "itertools" @@ -1434,24 +1443,24 @@ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" [[package]] name = "itoa" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" +checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d" [[package]] name = "js-sys" -version = "0.3.56" +version = "0.3.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a38fc24e30fd564ce974c02bf1d337caddff65be6cc4735a1f7eab22a7440f04" +checksum = "c3fac17f7123a73ca62df411b1bf727ccc805daa070338fda671c86dac1bdc27" dependencies = [ "wasm-bindgen", ] [[package]] name = "jsonwebtoken" -version = "8.1.0" +version = "8.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cc9051c17f81bae79440afa041b3a278e1de71bfb96d32454b477fd4703ccb6f" +checksum = "1aa4b4af834c6cfd35d8763d359661b90f2e45d8f750a0849156c7f4671af09c" dependencies = [ "base64", "pem", @@ -1484,9 +1493,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.117" +version = "0.2.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c" +checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" [[package]] name = "libloading" @@ -1500,18 +1509,19 @@ dependencies = [ [[package]] name = "lock_api" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88943dd7ef4a2e5a4bfa2753aaab3013e34ce2533d1996fb18ef591e315e2b3b" +checksum = "327fa5b6a6940e4699ec49a9beae1ea4845c6bab9314e4f84ac68742139d8c53" dependencies = [ + "autocfg", "scopeguard", ] [[package]] name = "log" -version = "0.4.14" +version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", "serde", @@ -1566,15 +1576,15 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "memchr" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memmap2" -version = "0.5.3" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "057a3db23999c867821a7a59feb06a578fcb03685e983dff90daf9e7d24ac08f" +checksum = "3a79b39c93a7a5a27eeaf9a23b5ff43f1b9e0ad6b1cdd441140ae53c35613fc7" dependencies = [ "libc", ] @@ -1613,44 +1623,23 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.4.4" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" -dependencies = [ - "adler", - "autocfg", -] - -[[package]] -name = "miniz_oxide" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2b29bd4bc3f33391105ebee3589c19197c4271e3e5a9ec9bfe8127eeff8f082" +checksum = "6f5c75688da582b8ffc1f1799e9db273f32133c49e048f614d22ec3256773ccc" dependencies = [ "adler", ] [[package]] name = "mio" -version = "0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52da4364ffb0e4fe33a9841a98a3f3014fb964045ce4f7a45a398243c8d6b0c9" +checksum = "57ee1c23c7c63b0c9250c339ffdc69255f110b298b901b9f6c82547b7b87caaf" dependencies = [ "libc", "log", - "miow", - "ntapi", "wasi 0.11.0+wasi-snapshot-preview1", - "winapi", -] - -[[package]] -name = "miow" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" -dependencies = [ - "winapi", + "windows-sys", ] [[package]] @@ -1661,9 +1650,9 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" [[package]] name = "native-tls" -version = "0.2.8" +version = "0.2.10" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "48ba9f7719b5a0f42f338907614285fb5fd70e53858141f69898a1fb7203b24d" +checksum = "fd7e2f3618557f980e0b17e8856252eee3c97fa12c54dff0ca290fb6266ca4a9" dependencies = [ "lazy_static", "libc", @@ -1682,7 +1671,7 @@ name = "neon_local" version = "0.1.0" dependencies = [ "anyhow", - "clap 3.0.14", + "clap 3.2.12", "comfy-table", "control_plane", "git-version", @@ -1716,22 +1705,12 @@ checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" [[package]] name = "nom" -version = "7.1.0" +version = "7.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1d11e1ef389c76fe5b81bcaf2ea32cf88b62bc494e19f493d0b30e7a930109" +checksum = "a8903e5a29a317527874d0402f867152a3d21c908bb0b933e416c65e301d4c36" dependencies = [ "memchr", "minimal-lexical", - "version_check", -] - -[[package]] -name = "ntapi" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c28774a7fd2fbb4f0babd8237ce554b73af68021b5f695a3cebd6c59bac0980f" -dependencies = [ - "winapi", ] [[package]] @@ -1757,9 +1736,9 @@ dependencies = [ [[package]] name = "num-integer" -version = "0.1.44" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" dependencies = [ "autocfg", "num-traits", @@ -1767,9 +1746,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", ] @@ -1786,18 +1765,18 @@ dependencies = [ [[package]] name = "num_threads" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aba1801fb138d8e85e11d0fc70baf4fe1cdfffda7c6cd34a854905df588e5ed0" +checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" dependencies = [ "libc", ] [[package]] name = "object" -version = "0.27.1" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67ac1d3f9a1d3616fd9a60c8d74296f22406a238b6a72f5cc1e6f314df4ffbf9" +checksum = "21158b2c33aa6d4561f1c0a6ea283ca92bc54802a93b263e910746d679a7eb53" dependencies = [ "memchr", ] @@ -1813,9 +1792,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.10.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" +checksum = "18a6dbe30758c9f83eb00cbea4ac95966305f5a7772f3f42ebfc7fc7eddbd8e1" [[package]] name = "oorandom" @@ -1831,18 +1810,30 @@ checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" [[package]] name = "openssl" -version = "0.10.38" +version = "0.10.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7ae222234c30df141154f159066c5093ff73b63204dcda7121eb082fc56a95" +checksum = "618febf65336490dfcf20b73f885f5651a0c89c64c2d4a8c3662585a70bf5bd0" dependencies = [ "bitflags", "cfg-if", "foreign-types", "libc", "once_cell", + "openssl-macros", "openssl-sys", ] +[[package]] +name = "openssl-macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "openssl-probe" version = "0.1.5" @@ -1851,9 +1842,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.72" +version = "0.9.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e46109c383602735fa0a2e48dd2b7c892b048e1bf69e5c3b1d804b7d9c203cb" +checksum = "e5f9bd0c2710541a3cda73d6f9ac4f1b240de4ae261065d309dbe73d9dceb42f" dependencies = [ "autocfg", "cc", @@ -1864,12 +1855,9 @@ dependencies = [ [[package]] name = "os_str_bytes" -version = "6.0.0" +version = "6.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" -dependencies = [ - "memchr", -] +checksum = "21326818e99cfe6ce1e524c2a805c189a99b5ae555a35d19f9a284b427d86afa" [[package]] name = "pageserver" @@ -1879,7 +1867,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 3.0.14", + "clap 3.2.12", "close_fds", "const_format", "crc32c", @@ -1939,12 +1927,12 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f5ec2493a61ac0506c0f4199f99070cbe83857b0337006a30f3e6719b8ef58" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", - "parking_lot_core 0.9.2", + "parking_lot_core 0.9.3", ] [[package]] @@ -1963,9 +1951,9 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.2" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "995f667a6c822200b0433ac218e05582f0e2efa1b922a3fd2fbaadc5f87bab37" +checksum = "09a279cbf25cb0757810394fbc1e359949b59e348145c643a939a525692e6929" dependencies = [ "cfg-if", "libc", @@ -1982,9 +1970,9 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "pem" -version = "1.0.2" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9a3b09a20e374558580a4914d3b7d89bd61b954a5a5e1dcbea98753addb1947" +checksum = "03c64931a1a212348ec4f3b4362585eca7159d0d09cbdf4a7f74f02173596fd4" dependencies = [ "base64", ] @@ -1997,9 +1985,9 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "petgraph" -version = "0.6.0" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" +checksum = "e6d5014253a1331579ce62aa67443b4a658c5e7dd03d4bc6d302b94474888143" dependencies = [ "fixedbitset", "indexmap", @@ -2025,18 +2013,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58ad3879ad3baf4e44784bc6a718a8698867bb991f8ce24d1bcbe2cfb4c3a75e" +checksum = "78203e83c48cffbe01e4a2d35d566ca4de445d79a85372fc64e378bfc812a260" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744b6f092ba29c3650faf274db506afd39944f48420f6c86b17cfe0ee1cb36bb" +checksum = "710faf75e1b33345361201d36d04e98ac1ed8909151a017ed384700836104c74" dependencies = [ "proc-macro2", "quote", @@ -2045,9 
+2033,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e280fbe77cc62c91527259e9442153f4688736748d24660126286329742b4c6c" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" [[package]] name = "pin-utils" @@ -2057,15 +2045,15 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58893f751c9b0412871a09abd62ecd2a00298c6c83befa223ef98c52aef40cbe" +checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" [[package]] name = "plotters" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" +checksum = "9428003b84df1496fb9d6eeee9c5f8145cb41ca375eb0dad204328888832811f" dependencies = [ "num-traits", "plotters-backend", @@ -2076,15 +2064,15 @@ dependencies = [ [[package]] name = "plotters-backend" -version = "0.3.2" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" +checksum = "193228616381fecdc1224c62e96946dfbc73ff4384fba576e052ff8c1bea8142" [[package]] name = "plotters-svg" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" +checksum = "e0918736323d1baff32ee0eade54984f6f201ad7e97d5cfb5d6ab4a358529615" dependencies = [ "plotters-backend", ] @@ -2182,9 +2170,9 @@ checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" [[package]] name = "prettyplease" -version = "0.1.10" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9e07e3a46d0771a8a06b5f4441527802830b43e679ba12f44960f48dd4c6803" +checksum = "da6ffbe862780245013cb1c0a48c4e44b7d665548088f91f6b90876d0625e4c2" dependencies = [ "proc-macro2", "syn", @@ -2198,22 +2186,21 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.36" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" +checksum = "dd96a1e8ed2596c337f8eae5f24924ec83f5ad5ab21ea8e455d3566c69fbcaf7" dependencies = [ - "unicode-xid", + "unicode-ident", ] [[package]] name = "procfs" -version = "0.10.1" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95e344cafeaeefe487300c361654bcfc85db3ac53619eeccced29f5ea18c4c70" +checksum = "0941606b9934e2d98a3677759a971756eb821f75764d0e0d26946d08e74d9104" dependencies = [ "bitflags", "byteorder", - "flate2", "hex", "lazy_static", "libc", @@ -2221,25 +2208,25 @@ dependencies = [ [[package]] name = "prometheus" -version = "0.13.0" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7f64969ffd5dd8f39bd57a68ac53c163a095ed9d0fb707146da1b27025a3504" +checksum = "cface98dfa6d645ea4c789839f176e4b072265d085bfcc48eaa8d137f58d3c39" dependencies = [ "cfg-if", "fnv", "lazy_static", "libc", "memchr", - "parking_lot 0.11.2", + "parking_lot 0.12.1", "procfs", "thiserror", ] [[package]] name = "prost" -version = 
"0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc03e116981ff7d8da8e5c220e374587b98d294af7ba7dd7fda761158f00086f" +checksum = "71adf41db68aa0daaefc69bb30bcd68ded9b9abaad5d1fbb6304c4fb390e083e" dependencies = [ "bytes", "prost-derive", @@ -2247,9 +2234,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65a1118354442de7feb8a2a76f3d80ef01426bd45542c8c1fdffca41a758f846" +checksum = "8ae5a4388762d5815a9fc0dea33c56b021cdc8dde0c55e0c9ca57197254b0cab" dependencies = [ "bytes", "cfg-if", @@ -2298,17 +2285,17 @@ dependencies = [ "async-trait", "base64", "bytes", - "clap 3.0.14", + "clap 3.2.12", "futures", "git-version", - "hashbrown", + "hashbrown 0.11.2", "hex", "hmac 0.12.1", "hyper", "lazy_static", "md5", "metrics", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pin-project-lite", "rand", "rcgen", @@ -2316,7 +2303,7 @@ dependencies = [ "routerify", "rstest", "rustls", - "rustls-pemfile", + "rustls-pemfile 0.2.1", "scopeguard", "serde", "serde_json", @@ -2353,9 +2340,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.15" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "864d3e96a899863136fc6e99f3d7cae289dafe43bf2c5ac19b70df7210c0a145" +checksum = "3bcdf212e9776fbcb2d23ab029360416bb1706b1aea2d1a5ba002727cbcab804" dependencies = [ "proc-macro2", ] @@ -2402,9 +2389,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.5.1" +version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" dependencies = [ "autocfg", "crossbeam-deque", @@ -2414,14 +2401,13 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.9.1" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" dependencies = [ "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", - "lazy_static", "num_cpus", ] @@ -2439,28 +2425,29 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.10" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +checksum = "62f25bc4c7e55e0b0b7a1d43fb893f4fa1361d0abe38b9ce4f323c2adfe6ef42" dependencies = [ "bitflags", ] [[package]] name = "redox_users" -version = "0.4.0" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" +checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" dependencies = [ "getrandom", "redox_syscall", + "thiserror", ] [[package]] name = "regex" -version = "1.5.5" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286" +checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" dependencies = [ "aho-corasick", "memchr", @@ -2478,9 +2465,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.25" +version = "0.6.27" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" +checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" [[package]] name = "remote_storage" @@ -2496,7 +2483,7 @@ dependencies = [ "serde_json", "tempfile", "tokio", - "tokio-util 0.7.0", + "tokio-util", "toml_edit", "tracing", "workspace_hack", @@ -2513,9 +2500,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.9" +version = "0.11.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f242f1488a539a79bac6dbe7c8609ae43b7914b7736210f239a37cccb32525" +checksum = "b75aa69a3f06bbcc66ede33af2af253c6f7a86b1ca0033f60c580a27074fbf92" dependencies = [ "base64", "bytes", @@ -2535,12 +2522,13 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rustls", - "rustls-pemfile", + "rustls-pemfile 1.0.0", "serde", "serde_json", "serde_urlencoded", "tokio", "tokio-rustls", + "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", @@ -2551,9 +2539,9 @@ dependencies = [ [[package]] name = "rgb" -version = "0.8.32" +version = "0.8.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e74fdc210d8f24a7dbfedc13b04ba5764f5232754ccebfdf5fff1bad791ccbc6" +checksum = "c3b221de559e4a29df3b957eec92bc0de6bc8eaf6ca9cfed43e5e1d67ff65a34" dependencies = [ "bytemuck", ] @@ -2713,9 +2701,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.20.4" +version = "0.20.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fbfeb8d0ddb84706bc597a5574ab8912817c52a397f819e5b614e2265206921" +checksum = "5aab8ee6c7097ed6057f43c187a62418d0c05a4bd5f18b3571db50ee0f9ce033" dependencies = [ "log", "ring", @@ -2732,6 +2720,15 @@ dependencies = [ "base64", ] +[[package]] +name = "rustls-pemfile" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7522c9de787ff061458fe9a829dc790a3f5b22dc571694fc5883f448b94d9a9" +dependencies = [ + "base64", +] + [[package]] name = "rustls-split" version = "0.3.0" @@ -2743,15 +2740,15 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f" +checksum = "a0a5f7c728f5d284929a1cccb5bc19884422bfe6ef4d6c409da2c41838983fcf" [[package]] name = "ryu" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" +checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695" [[package]] name = "safekeeper" @@ -2761,7 +2758,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "clap 3.0.14", + "clap 3.2.12", "const_format", "crc32c", "daemonize", @@ -2787,7 +2784,7 @@ dependencies = [ "tempfile", "tokio", "tokio-postgres", - "tokio-util 0.7.0", + "tokio-util", "toml_edit", "tracing", "url", @@ -2807,12 +2804,12 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.19" +version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f05ba609c234e60bee0d547fe94a4c7e9da733d1c962cf6e59efa4cd9c8bc75" +checksum = "88d6731146462ea25d9244b2ed5fd1d716d25c52e4d54aa4fb0f3c4e9854dbe2" dependencies = [ "lazy_static", - "winapi", + "windows-sys", ] [[package]] @@ -2856,15 +2853,15 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.5" 
+version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0486718e92ec9a68fbed73bb5ef687d71103b142595b406835649bebd33f72c7" +checksum = "a2333e6df6d6598f2b1974829f853c2b4c5f4a6e503c10af918081aa6f8564e1" [[package]] name = "serde" -version = "1.0.136" +version = "1.0.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce31e24b01e1e524df96f1c2fdd054405f8d7376249a5110886fb4b658484789" +checksum = "0171ebb889e45aa68b44aee0859b3eede84c6f5f5c228e6f140c0b2a0a46cad6" dependencies = [ "serde_derive", ] @@ -2881,9 +2878,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.136" +version = "1.0.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08597e7152fcd306f41838ed3e37be9eaeed2b61c42e2117266a554fab4662f9" +checksum = "dc1d3230c1de7932af58ad8ffbe1d784bd55efd5a9d84ac24f69c72d83543dfb" dependencies = [ "proc-macro2", "quote", @@ -2892,11 +2889,11 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.78" +version = "1.0.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d23c1ba4cf0efd44be32017709280b32d1cea5c3f1275c3b6d9e8bc54f758085" +checksum = "82c2c1fdcd807d1098552c5b9a36e425e42e9fbd7c6a37a8425f390f781f7fa7" dependencies = [ - "itoa 1.0.1", + "itoa 1.0.2", "ryu", "serde", ] @@ -2908,27 +2905,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" dependencies = [ "form_urlencoded", - "itoa 1.0.1", + "itoa 1.0.2", "ryu", "serde", ] [[package]] name = "serde_with" -version = "1.12.0" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec1e6ec4d8950e5b1e894eac0d360742f3b1407a6078a604a731c4b3f49cefbc" +checksum = "678b5a069e50bf00ecd22d0cd8ddf7c236f68581b03db652061ed5eb13a312ff" dependencies = [ - "rustversion", "serde", "serde_with_macros", ] [[package]] name = "serde_with_macros" -version = "1.5.1" +version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12e47be9471c72889ebafb5e14d5ff930d89ae7a67bbdb5f8abb564f845a927e" +checksum = "e182d6ec6f05393cc0e5ed1bf81ad6db3a8feedf8ee515ecdd369809bcce8082" dependencies = [ "darling", "proc-macro2", @@ -2977,9 +2973,9 @@ checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" [[package]] name = "signal-hook" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "647c97df271007dcea485bb74ffdb57f2e683f1306c854f468a0c244badabf2d" +checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d" dependencies = [ "libc", "signal-hook-registry", @@ -3007,33 +3003,33 @@ dependencies = [ [[package]] name = "simple_asn1" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a762b1c38b9b990c694b9c2f8abe3372ce6a9ceaae6bca39cfc46e054f45745" +checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" dependencies = [ "num-bigint", "num-traits", "thiserror", - "time 0.3.9", + "time 0.3.11", ] [[package]] name = "siphasher" -version = "0.3.9" +version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a86232ab60fa71287d7f2ddae4a7073f6b7aac33631c3015abb556f08c6d0a3e" +checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" [[package]] name = "slab" -version = "0.4.5" +version = "0.4.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9def91fd1e018fe007022791f865d0ccc9b3a0d5001e01aabb8b40e46000afb5" +checksum = "eb703cfe953bccee95685111adeedb76fabe4e97549a58d16f03ea7b9367bb32" [[package]] name = "smallvec" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" +checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" [[package]] name = "socket2" @@ -3112,9 +3108,9 @@ checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" [[package]] name = "symbolic-common" -version = "8.7.0" +version = "8.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac6aac7b803adc9ee75344af7681969f76d4b38e4723c6eaacf3b28f5f1d87ff" +checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540" dependencies = [ "debugid", "memmap2", @@ -3124,9 +3120,9 @@ dependencies = [ [[package]] name = "symbolic-demangle" -version = "8.7.0" +version = "8.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8143ea5aa546f86c64f9b9aafdd14223ffad4ecd2d58575c63c21335909c99a7" +checksum = "4564ca7b4e6eb14105aa8bbbce26e080f6b5d9c4373e67167ab31f7b86443750" dependencies = [ "cpp_demangle", "rustc-demangle", @@ -3135,13 +3131,13 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.92" +version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ff7c592601f11445996a06f8ad0c27f094a58857c2f89e97974ab9235b92c52" +checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd" dependencies = [ "proc-macro2", "quote", - "unicode-xid", + "unicode-ident", ] [[package]] @@ -3189,9 +3185,9 @@ dependencies = [ [[package]] name = "termcolor" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" dependencies = [ "winapi-util", ] @@ -3207,24 +3203,24 @@ dependencies = [ [[package]] name = "textwrap" -version = "0.14.2" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0066c8d12af8b5acd21e00547c3797fde4e8677254a7ee429176ccebbe93dd80" +checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" [[package]] name = "thiserror" -version = "1.0.30" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" +checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.30" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" +checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a" dependencies = [ "proc-macro2", "quote", @@ -3253,11 +3249,11 @@ dependencies = [ [[package]] name = "time" -version = "0.3.9" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd" +checksum = "72c91f41dcb2f096c05f0873d667dceec1087ce5bcf984ec8ffb19acddbb3217" dependencies = [ - "itoa 1.0.1", + "itoa 1.0.2", "libc", "num_threads", 
"quickcheck", @@ -3282,9 +3278,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.5.1" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c1c1d5a42b6245520c249549ec267180beaffcc0615401ac8e31853d4b6d8d2" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" dependencies = [ "tinyvec_macros", ] @@ -3297,10 +3293,11 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.17.0" +version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2af73ac49756f3f7c01172e34a23e5d0216f6c32333757c2c61feb2bbff5a5ee" +checksum = "57aec3cfa4c296db7255446efb4928a6be304b431a806216105542a67b6ca82e" dependencies = [ + "autocfg", "bytes", "libc", "memchr", @@ -3326,9 +3323,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b557f72f448c511a979e2564e55d74e6c4432fc96ff4f6241bc6bded342643b7" +checksum = "9724f9a975fb987ef7a3cd9be0350edcbe130698af5b8f7a631e23d42d052484" dependencies = [ "proc-macro2", "quote", @@ -3356,7 +3353,7 @@ dependencies = [ "fallible-iterator", "futures", "log", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "percent-encoding", "phf", "pin-project-lite", @@ -3364,7 +3361,7 @@ dependencies = [ "postgres-types", "socket2", "tokio", - "tokio-util 0.7.0", + "tokio-util", ] [[package]] @@ -3383,9 +3380,9 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.23.3" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4151fda0cf2798550ad0b34bcfc9b9dcc2a9d2471c895c68f3a8818e54f2389e" +checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" dependencies = [ "rustls", "tokio", @@ -3394,9 +3391,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50145484efff8818b5ccd256697f36863f587da82cf8b409c53adf1e840798e3" +checksum = "df54d54117d6fdc4e4fea40fe1e4e566b3505700e148a6827e59b34b0d2600d9" dependencies = [ "futures-core", "pin-project-lite", @@ -3405,37 +3402,23 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.6.9" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e99e1983e5d376cd8eb4b66604d2e99e79f5bd988c3055891dcd8c9e2604cc0" +checksum = "cc463cd8deddc3770d20f9852143d50bf6094e640b485cb2e189a2099085ff45" dependencies = [ "bytes", "futures-core", "futures-sink", - "log", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-util" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64910e1b9c1901aaf5375561e35b9c057d95ff41a44ede043a03e09279eabaf1" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "log", "pin-project-lite", "tokio", + "tracing", ] [[package]] name = "toml" -version = "0.5.8" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" +checksum = "8d82e1a7758622a465f8cee077614c73484dac5b836c02ff6a40d5d1010324d7" dependencies = [ "serde", ] @@ -3477,7 +3460,7 @@ dependencies = [ "prost-derive", "tokio", "tokio-stream", - "tokio-util 0.7.0", + "tokio-util", "tower", "tower-layer", "tower-service", @@ -3500,9 +3483,9 @@ dependencies = [ [[package]] name = 
"tower" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a89fd63ad6adf737582df5db40d286574513c69a11dac5214dc3b5603d6713e" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" dependencies = [ "futures-core", "futures-util", @@ -3512,7 +3495,7 @@ dependencies = [ "rand", "slab", "tokio", - "tokio-util 0.7.0", + "tokio-util", "tower-layer", "tower-service", "tracing", @@ -3520,9 +3503,9 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.3.2" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e980386f06883cf4d0578d6c9178c81f68b45d77d00f2c2c1bc034b3439c2c56" +checksum = "3c530c8675c1dbf98facee631536fa116b5fb6382d7dd6dc1b118d970eafe3ba" dependencies = [ "bitflags", "bytes", @@ -3545,15 +3528,15 @@ checksum = "343bc9466d3fe6b0f960ef45960509f84480bf4fd96f92901afe7ff3df9d3a62" [[package]] name = "tower-service" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.30" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d8d93354fe2a8e50d5953f5ae2e47a3fc2ef03292e7ea46e3cc38f549525fb9" +checksum = "5d0ecdcb44a79f0fe9844f0c4f33a342cbcbb5117de8001e6ba0dc2351327d09" dependencies = [ "cfg-if", "log", @@ -3564,9 +3547,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.19" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8276d9a4a3a558d7b7ad5303ad50b53d58264641b82914b7ada36bd762e7a716" +checksum = "11c75893af559bc8e10716548bdef5cb2b983f8e637db9d0e15126b61b484ee2" dependencies = [ "proc-macro2", "quote", @@ -3575,9 +3558,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.22" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03cfcb51380632a72d3111cb8d3447a8d908e577d31beeac006f836383d29a23" +checksum = "f54c8ca710e81886d498c2fd3331b56c93aa248d49de2222ad2742247c60072f" dependencies = [ "lazy_static", "valuable", @@ -3595,9 +3578,9 @@ dependencies = [ [[package]] name = "tracing-log" -version = "0.1.2" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" +checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922" dependencies = [ "lazy_static", "log", @@ -3606,9 +3589,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.8" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74786ce43333fcf51efe947aed9718fbe46d5c7328ec3f1029e818083966d9aa" +checksum = "4bc28f93baff38037f64e6f43d34cfa1605f27a49c34e8a04c5e78b0babf2596" dependencies = [ "ansi_term", "lazy_static", @@ -3636,15 +3619,21 @@ checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" [[package]] name = "unicode-bidi" -version = "0.3.7" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a01404663e3db436ed2746d9fefef640d868edae3cceb81c3b8d5732fda678f" +checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" + +[[package]] +name = "unicode-ident" +version = "1.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bd2fe26506023ed7b5e1e315add59d6f584c621d037f9368fea9cfb988f368c" [[package]] name = "unicode-normalization" -version = "0.1.19" +version = "0.1.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" +checksum = "854cbdc4f7bc6ae19c820d44abdc3277ac3e1b2b93db20a636825d9322fb60e6" dependencies = [ "tinyvec", ] @@ -3663,9 +3652,9 @@ checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" [[package]] name = "unicode-xid" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" +checksum = "957e51f3646910546462e67d5f7599b9e4fb8acdd304b087a6494730f9eebf04" [[package]] name = "untrusted" @@ -3708,7 +3697,7 @@ dependencies = [ "rand", "routerify", "rustls", - "rustls-pemfile", + "rustls-pemfile 0.2.1", "rustls-split", "serde", "serde_json", @@ -3757,7 +3746,7 @@ name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 3.0.14", + "clap 3.2.12", "env_logger", "log", "once_cell", @@ -3801,9 +3790,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.79" +version = "0.2.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25f1af7423d8588a3d840681122e72e6a24ddbcb3f0ec385cac0d12d24256c06" +checksum = "7c53b543413a17a202f4be280a7e5c62a1c69345f5de525ee64f8cfdbc954994" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -3811,9 +3800,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.79" +version = "0.2.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b21c0df030f5a177f3cba22e9bc4322695ec43e7257d865302900290bcdedca" +checksum = "5491a68ab4500fa6b4d726bd67408630c3dbe9c4fe7bda16d5c82a1fd8c7340a" dependencies = [ "bumpalo", "lazy_static", @@ -3826,9 +3815,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.29" +version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2eb6ec270a31b1d3c7e266b999739109abce8b6c87e4b31fcfcd788b65267395" +checksum = "de9a9cec1733468a8c657e57fa2413d2ae2c0129b95e87c5b72b8ace4d13f31f" dependencies = [ "cfg-if", "js-sys", @@ -3838,9 +3827,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.79" +version = "0.2.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f4203d69e40a52ee523b2529a773d5ffc1dc0071801c87b3d270b471b80ed01" +checksum = "c441e177922bc58f1e12c022624b6216378e5febc2f0533e41ba443d505b80aa" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3848,9 +3837,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.79" +version = "0.2.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa8a30d46208db204854cadbb5d4baf5fcf8071ba5bf48190c3e59937962ebc" +checksum = "7d94ac45fcf608c1f45ef53e748d35660f168490c10b23704c7779ab8f5c3048" dependencies = [ "proc-macro2", "quote", @@ -3861,15 +3850,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.79" +version = "0.2.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d958d035c4438e28c70e4321a2911302f10135ce78a9c7834c0cab4123d06a2" +checksum = "6a89911bd99e5f3659ec4acf9c4d93b0a90fe4a2a11f15328472058edc5261be" 
[[package]] name = "web-sys" -version = "0.3.56" +version = "0.3.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c060b319f29dd25724f09a2ba1418f142f539b2be99fbf4d2d5a8f7330afb8eb" +checksum = "2fed94beee57daf8dd7d51f2b15dc2bcde92d7a72304cdf662a4371008b71b90" dependencies = [ "js-sys", "wasm-bindgen", @@ -3887,18 +3876,18 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "0.22.2" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "552ceb903e957524388c4d3475725ff2c8b7960922063af6ce53c9a43da07449" +checksum = "f1c760f0d366a6c24a02ed7816e23e691f5d92291f94d15e836006fd11b04daf" dependencies = [ "webpki", ] [[package]] name = "which" -version = "4.2.4" +version = "4.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a5a7e487e921cf220206864a94a89b6c6905bfc19f1057fa26a4cb360e5c1d2" +checksum = "5c4fb54e6113b6a8772ee41c3404fb0301ac79604489467e0a9ce1f3e97c24ae" dependencies = [ "either", "lazy_static", @@ -3938,9 +3927,9 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" -version = "0.34.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5acdd78cb4ba54c0045ac14f62d8f94a03d10047904ae2a40afa1e99d8f70825" +checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2" dependencies = [ "windows_aarch64_msvc", "windows_i686_gnu", @@ -3951,39 +3940,39 @@ dependencies = [ [[package]] name = "windows_aarch64_msvc" -version = "0.34.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17cffbe740121affb56fad0fc0e421804adf0ae00891205213b5cecd30db881d" +checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" [[package]] name = "windows_i686_gnu" -version = "0.34.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2564fde759adb79129d9b4f54be42b32c89970c18ebf93124ca8870a498688ed" +checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" [[package]] name = "windows_i686_msvc" -version = "0.34.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cd9d32ba70453522332c14d38814bceeb747d80b3958676007acadd7e166956" +checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" [[package]] name = "windows_x86_64_gnu" -version = "0.34.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfce6deae227ee8d356d19effc141a509cc503dfd1f850622ec4b0f84428e1f4" +checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" [[package]] name = "windows_x86_64_msvc" -version = "0.34.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d19538ccc21819d01deaf88d6a17eae6596a12e9aafdbb97916fb49896d89de9" +checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" [[package]] name = "winreg" -version = "0.7.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" +checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" dependencies = [ "winapi", ] @@ -4003,7 +3992,7 @@ dependencies = [ "futures-task", "futures-util", "generic-array", - "hashbrown", + "hashbrown 0.11.2", "hex", "hyper", "indexmap", @@ -4011,6 +4000,8 @@ dependencies = 
[ "libc", "log", "memchr", + "nom", + "num-bigint", "num-integer", "num-traits", "prost", @@ -4020,8 +4011,9 @@ dependencies = [ "scopeguard", "serde", "syn", + "time 0.3.11", "tokio", - "tokio-util 0.7.0", + "tokio-util", "tracing", "tracing-core", ] @@ -4041,14 +4033,14 @@ dependencies = [ "oid-registry", "rusticata-macros", "thiserror", - "time 0.3.9", + "time 0.3.11", ] [[package]] name = "xattr" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "244c3741f4240ef46274860397c7c74e50eb23624996930e484c16679633a54c" +checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc" dependencies = [ "libc", ] @@ -4070,6 +4062,6 @@ dependencies = [ [[package]] name = "zeroize" -version = "1.5.2" +version = "1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c88870063c39ee00ec285a2f8d6a966e5b6fb2becc4e8dac77ed0d370ed6006" +checksum = "20b578acffd8516a6c3f2a1bdefc1ec37e547bb4e0fb8b6b01a4cafc886b4442" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 92877faef7..4dc7e4e157 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -33,7 +33,9 @@ itoa = { version = "0.4", features = ["i128", "std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } memchr = { version = "2", features = ["std", "use_std"] } -num-integer = { version = "0.1", default-features = false, features = ["i128"] } +nom = { version = "7", features = ["alloc", "std"] } +num-bigint = { version = "0.4", features = ["std"] } +num-integer = { version = "0.1", default-features = false, features = ["i128", "std"] } num-traits = { version = "0.2", features = ["i128", "std"] } prost = { version = "0.10", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } @@ -41,10 +43,11 @@ regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cac regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } -tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } +time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "quickcheck", "quickcheck-dep", "std", "time-macros"] } +tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "winapi"] } tokio-util = { version = "0.7", features = ["codec", "io"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } -tracing-core = { version = "0.1", features = ["lazy_static", "std"] } +tracing-core = { version = "0.1", features = ["lazy_static", "std", "valuable"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] } @@ -57,6 +60,7 @@ indexmap = { version = "1", default-features = false, features = ["std"] } libc = { version = "0.2", 
features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } memchr = { version = "2", features = ["std", "use_std"] } +nom = { version = "7", features = ["alloc", "std"] } prost = { version = "0.10", features = ["prost-derive", "std"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } From c68336a246829fc83b625e2197f56ceaf136ab8a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 14 Jul 2022 20:48:47 +0300 Subject: [PATCH 0515/1022] Strip debug symbols from test binaries, to make the artifact smaller. Uploading large artifacts is slow in github actions. To speed that up, make the artifact smaller. The code coverage tool doesn't require debug symbols, so remove them. We've discussed doing the same for *all* binaries, but it's nice to have debugging symbols for debugging purposes, and so that you get more complete stack traces. The discussion is ongoing, but let's at least do this for the test symbols now. --- .github/workflows/build_and_test.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3a12d19428..f050d25449 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -171,7 +171,10 @@ jobs: for bin in $test_exe_paths; do SRC=$bin DST=/tmp/neon/test_bin/$(basename $bin) - cp "$SRC" "$DST" + + # We don't need debug symbols for code coverage, so strip them out to make + # the artifact smaller. + strip "$SRC" -o "$DST" echo "$DST" >> /tmp/coverage/binaries.list done fi From fe65d1df74ef784ccd546356dced2227ef814c03 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Thu, 14 Jul 2022 14:31:01 -0400 Subject: [PATCH 0516/1022] reduce concurrent tasks in `test_branching_with_pgbench.py` - add thread limit - run `pgbench` with 1 client --- test_runner/batch_others/test_branching.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/test_runner/batch_others/test_branching.py b/test_runner/batch_others/test_branching.py index 957d9a33b3..c61bac7a58 100644 --- a/test_runner/batch_others/test_branching.py +++ b/test_runner/batch_others/test_branching.py @@ -44,7 +44,7 @@ def test_branching_with_pgbench(neon_simple_env: NeonEnv, log.info(f"Start a pgbench workload on pg {connstr}") pg_bin.run_capture(['pgbench', '-i', f'-s{scale}', connstr]) - pg_bin.run_capture(['pgbench', '-c10', '-T15', connstr]) + pg_bin.run_capture(['pgbench', '-T15', connstr]) env.neon_cli.create_branch('b0', tenant_id=tenant) pgs: List[Postgres] = [] @@ -54,12 +54,23 @@ def test_branching_with_pgbench(neon_simple_env: NeonEnv, threads.append(threading.Thread(target=run_pgbench, args=(pgs[0], ), daemon=True)) threads[-1].start() + thread_limit = 4 + for i in range(n_branches): # random a delay between [0, 5] delay = random.random() * 5 time.sleep(delay) log.info(f"Sleep {delay}s") + # If the number of concurrent threads exceeds a threshold, + # wait for all the threads to finish before spawning a new one. 
+ # Because tests defined in `batch_others` are run concurrently in CI, + # we want to avoid the situation that one test exhausts resources for other tests. + if len(threads) >= thread_limit: + for thread in threads: + thread.join() + threads = [] + if ty == "cascade": env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(i), tenant_id=tenant) else: From a490f64a68592dd108f286979a7c796c40b185ff Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 14 Jul 2022 18:49:06 +0300 Subject: [PATCH 0517/1022] Don't include Postgres binaries in neon.tgz neon.tgz artifact in the github workflow included the contents of 'tmp_install', but that seems pointless, because the same files are included earlier already in the pg.tgz artifact. --- .github/actions/run-python-test-set/action.yml | 15 ++++++++++++++- .github/workflows/build_and_test.yml | 3 --- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index accb8896de..f220be2b12 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -37,6 +37,12 @@ runs: name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact path: ./neon-artifact/ + - name: Get Postgres artifact for restoration + uses: actions/download-artifact@v3 + with: + name: postgres-${{ runner.os }}-${{ inputs.build_type }}-artifact + path: ./pg-artifact/ + - name: Extract Neon artifact shell: bash -ex {0} run: | @@ -44,6 +50,13 @@ runs: tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/ rm -rf ./neon-artifact/ + - name: Extract Postgres artifact + shell: bash -ex {0} + run: | + mkdir -p /tmp/neon/tmp_install + tar -xf ./pg-artifact/pg.tgz -C /tmp/neon/tmp_install + rm -rf ./pg-artifact/ + - name: Checkout if: inputs.needs_postgres_source == 'true' uses: actions/checkout@v3 @@ -65,7 +78,7 @@ runs: - name: Run pytest env: NEON_BIN: /tmp/neon/bin - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + POSTGRES_DISTRIB_DIR: /tmp/neon/tmp_install TEST_OUTPUT: /tmp/test_output # this variable will be embedded in perf test report # and is needed to distinguish different environments diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f050d25449..3894cfe98c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -179,9 +179,6 @@ jobs: done fi - - name: Install postgres binaries - run: cp -a tmp_install /tmp/neon/pg_install - - name: Prepare neon artifact run: tar -C /tmp/neon/ -czf ./neon.tgz . From eaa550afcc02d3b89a265fab3ff230d221ccc9bb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 14 Jul 2022 21:37:43 +0300 Subject: [PATCH 0518/1022] Reduce size of cargo deps cache, by excluding ~/.cargo/registry/src. --- .github/workflows/build_and_test.yml | 6 ++++++ .github/workflows/codestyle.yml | 1 + 2 files changed, 7 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3894cfe98c..b836f72da5 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -94,12 +94,17 @@ jobs: tar -xf ./postgres-artifact/pg.tgz -C ./tmp_install/ rm -rf ./postgres-artifact/ + # Don't include the ~/.cargo/registry/src directory. It contains just + # uncompressed versions of the crates in ~/.cargo/registry/cache + # directory, and it's faster to let 'cargo' to rebuild it from the + # compressed crates. 
- name: Cache cargo deps id: cache_cargo uses: actions/cache@v3 with: path: | ~/.cargo/registry/ + !~/.cargo/registry/src ~/.cargo/git/ target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found @@ -299,6 +304,7 @@ jobs: with: path: | ~/.cargo/registry/ + !~/.cargo/registry/src ~/.cargo/git/ target/ key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 345c1d5397..89bfffd4b9 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -98,6 +98,7 @@ jobs: with: path: | ~/.cargo/registry + !~/.cargo/registry/src ~/.cargo/git target key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} From c690522870a358a3089ed76ea66e1012632a509d Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 14 Jul 2022 21:30:11 +0200 Subject: [PATCH 0519/1022] [compute_tools] Change owner of the schema public only once (#2058) Otherwise, we will change it back to the db owner on each restart. Even if user already changed schema owner to some other user. --- compute_tools/src/spec.rs | 45 +++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 71ae3f4ca9..bd47614386 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,8 +1,7 @@ use std::path::Path; -use anyhow::{anyhow, Result}; +use anyhow::Result; use log::{info, log_enabled, warn, Level}; -use postgres::error::SqlState; use postgres::{Client, NoTls}; use serde::Deserialize; @@ -395,20 +394,34 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { // This will only change ownership on the schema itself, not the objects // inside it. Without it owner of the `public` schema will be `cloud_admin` - // and database owner cannot do anything with it. - let alter_query = format!("ALTER SCHEMA public OWNER TO {}", db.owner.quote()); - let res = db_client.simple_query(&alter_query); - - if let Err(e) = res { - if e.code() == Some(&SqlState::INVALID_SCHEMA_NAME) { - // This is OK, db just don't have a `public` schema. - // Probably user dropped it manually. - info!("no 'public' schema found in the database {}", db.name); - } else { - // Something different happened, propagate the error - return Err(anyhow!(e)); - } - } + // and database owner cannot do anything with it. SQL procedure ensures + // that it won't error out if schema `public` doesn't exist. 
+ let alter_query = format!( + "DO $$\n\ + DECLARE\n\ + schema_owner TEXT;\n\ + BEGIN\n\ + IF EXISTS(\n\ + SELECT nspname\n\ + FROM pg_catalog.pg_namespace\n\ + WHERE nspname = 'public'\n\ + )\n\ + THEN\n\ + SELECT nspowner::regrole::text\n\ + FROM pg_catalog.pg_namespace\n\ + WHERE nspname = 'public'\n\ + INTO schema_owner;\n\ + \n\ + IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'\n\ + THEN\n\ + ALTER SCHEMA public OWNER TO {};\n\ + END IF;\n\ + END IF;\n\ + END\n\ + $$;", + db.owner.quote() + ); + db_client.simple_query(&alter_query)?; } Ok(()) From a68d5a0173cf7ce00199a7f5625b7fdddebb4152 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Fri, 15 Jul 2022 13:18:55 +0200 Subject: [PATCH 0520/1022] Run workflow on release branch (#2085) --- .github/workflows/build_and_test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b836f72da5..776c696f59 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1,9 +1,10 @@ -name: Test +name: Test and Deploy on: push: branches: - main + - release pull_request: defaults: From 95c40334b82e070d1e845e7401a806c499cf1d3d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 15 Jul 2022 15:39:49 +0100 Subject: [PATCH 0521/1022] github/workflows: post periodic benchmark failures to slack (#2105) --- .github/workflows/benchmarking.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 01dd9d00b0..d08c3c50bd 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -104,3 +104,12 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" run: | REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} From 19ea486cde7f7261eccf336ccb664ded291b63c7 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Sat, 9 Jul 2022 14:13:11 +0300 Subject: [PATCH 0522/1022] postgres_ffi/xlog_utils: refactor find_end_of_wal test * Deduce `last_segment` automatically * Get rid of local `wal_dir`/`wal_seg_size` variables * Prepare to test parsing of WAL from multiple specific points, not just the start; extract `check_end_of_wal` function to check both partial and non-partial WAL segments. 
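In outline, the refactored test now derives the newest WAL segment from the WAL directory itself and verifies that `find_end_of_wal` reports the same end of WAL regardless of which valid LSN it starts scanning from. A minimal sketch of that flow, assuming the items used in the diff below (`IsXLogFileName`, `find_end_of_wal`, `WAL_SEGMENT_SIZE`, `Lsn`, `wal_craft::Conf`); the `start_lsns` slice and `expected_end_of_wal` value are stand-ins for whatever each test case supplies:

    // Sketch only: relies on the same imports as the test module in the diff below.
    fn check_from_every_start(cfg: &wal_craft::Conf, start_lsns: &[Lsn], expected_end_of_wal: Lsn) {
        // The newest segment is the lexicographically greatest valid WAL file name,
        // so callers no longer have to hard-code it per test case.
        let last_segment = cfg
            .wal_dir()
            .read_dir()
            .unwrap()
            .map(|entry| entry.unwrap().file_name().into_string().unwrap())
            .filter(|name| IsXLogFileName(name))
            .max()
            .expect("at least one WAL segment should exist");

        for &start_lsn in start_lsns {
            // Scanning from any valid starting point must converge on the same end of WAL.
            let (wal_end, _tli) =
                find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap();
            assert_eq!(Lsn(wal_end), expected_end_of_wal, "checked segment {last_segment}");
        }
    }

The real test in the diff additionally renames the last segment to `*.partial` (and back) to exercise both the fully-populated and partial-segment code paths, and zeroes out WAL before each start LSN so earlier records cannot influence the result.
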
--- libs/postgres_ffi/src/xlog_utils.rs | 95 +++++++++++++++++--------- libs/postgres_ffi/wal_craft/src/lib.rs | 7 +- 2 files changed, 70 insertions(+), 32 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 17891fb94f..b9bd922025 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -15,6 +15,7 @@ use crate::XLogPageHeaderData; use crate::XLogRecord; use crate::XLOG_PAGE_MAGIC; +use crate::pg_constants::WAL_SEGMENT_SIZE; use anyhow::{bail, ensure}; use byteorder::{ByteOrder, LittleEndian}; use bytes::BytesMut; @@ -461,8 +462,7 @@ pub fn find_end_of_wal( pub fn main() { let mut data_dir = PathBuf::new(); data_dir.push("."); - let wal_seg_size = 16 * 1024 * 1024; - let (wal_end, tli) = find_end_of_wal(&data_dir, wal_seg_size, true, Lsn(0)).unwrap(); + let (wal_end, tli) = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, true, Lsn(0)).unwrap(); println!( "wal_end={:>08X}{:>08X}, tli={}", (wal_end >> 32) as u32, @@ -606,10 +606,9 @@ mod tests { fn test_end_of_wal( test_name: &str, expected_end_of_wal_non_partial: Lsn, - last_segment: &str, ) { use wal_craft::*; - // 1. Generate some WAL + // Craft some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("..") .join(".."); @@ -622,24 +621,35 @@ mod tests { } cfg.initdb().unwrap(); let srv = cfg.start_server().unwrap(); - let expected_wal_end: Lsn = + let expected_end_of_wal_partial: Lsn = u64::from(C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap()).into(); srv.kill(); - // 2. Pick WAL generated by initdb - let wal_dir = cfg.datadir.join("pg_wal"); - let wal_seg_size = 16 * 1024 * 1024; - - // 3. Check end_of_wal on non-partial WAL segment (we treat it as fully populated) - let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap(); - let wal_end = Lsn(wal_end); - info!( - "find_end_of_wal returned (wal_end={}, tli={})", - wal_end, tli + // Check find_end_of_wal on the initial WAL + let last_segment = cfg + .wal_dir() + .read_dir() + .unwrap() + .map(|f| f.unwrap().file_name().into_string().unwrap()) + .filter(|fname| IsXLogFileName(fname)) + .max() + .unwrap(); + check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal_partial); + check_end_of_wal( + &cfg, + &last_segment, + Lsn(0), // start from the beginning + expected_end_of_wal_non_partial, + expected_end_of_wal_partial, ); - assert_eq!(wal_end, expected_end_of_wal_non_partial); + } - // 4. Get the actual end of WAL by pg_waldump + fn check_pg_waldump_end_of_wal( + cfg: &wal_craft::Conf, + last_segment: &str, + expected_end_of_wal: Lsn, + ) { + // Get the actual end of WAL by pg_waldump let waldump_output = cfg .pg_waldump("000000010000000000000001", last_segment) .unwrap() @@ -658,32 +668,57 @@ mod tests { let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap(); info!( "waldump erred on {}, expected wal end at {}", - waldump_wal_end, expected_wal_end + waldump_wal_end, expected_end_of_wal ); - assert_eq!(waldump_wal_end, expected_wal_end); + assert_eq!(waldump_wal_end, expected_end_of_wal); + } - // 5. 
Rename file to partial to actually find last valid lsn - fs::rename( - wal_dir.join(last_segment), - wal_dir.join(format!("{}.partial", last_segment)), - ) - .unwrap(); - let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap(); + fn check_end_of_wal( + cfg: &wal_craft::Conf, + last_segment: &str, + start_lsn: Lsn, + expected_end_of_wal_non_partial: Lsn, + expected_end_of_wal_partial: Lsn, + ) { + // Check end_of_wal on non-partial WAL segment (we treat it as fully populated) + let (wal_end, tli) = + find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap(); let wal_end = Lsn(wal_end); info!( - "find_end_of_wal returned (wal_end={}, tli={})", + "find_end_of_wal returned (wal_end={}, tli={}) with non-partial WAL segment", wal_end, tli ); - assert_eq!(wal_end, waldump_wal_end); + assert_eq!(wal_end, expected_end_of_wal_non_partial); + + // Rename file to partial to actually find last valid lsn, then rename it back. + fs::rename( + cfg.wal_dir().join(&last_segment), + cfg.wal_dir().join(format!("{}.partial", last_segment)), + ) + .unwrap(); + let (wal_end, tli) = + find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap(); + let wal_end = Lsn(wal_end); + info!( + "find_end_of_wal returned (wal_end={}, tli={}) with partial WAL segment", + wal_end, tli + ); + assert_eq!(wal_end, expected_end_of_wal_partial); + fs::rename( + cfg.wal_dir().join(format!("{}.partial", last_segment)), + cfg.wal_dir().join(last_segment), + ) + .unwrap(); } + const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024); + #[test] pub fn test_find_end_of_wal_simple() { init_logging(); test_end_of_wal::( "test_find_end_of_wal_simple", "0/2000000".parse::().unwrap(), - "000000010000000000000001", ); } @@ -693,7 +728,6 @@ mod tests { test_end_of_wal::( "test_find_end_of_wal_crossing_segment_followed_by_small_one", "0/3000000".parse::().unwrap(), - "000000010000000000000002", ); } @@ -704,7 +738,6 @@ mod tests { test_end_of_wal::( "test_find_end_of_wal_last_crossing_segment", "0/3000000".parse::().unwrap(), - "000000010000000000000002", ); } diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 51482137c8..11e62d7fba 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -4,6 +4,7 @@ use log::*; use once_cell::sync::Lazy; use postgres::types::PgLsn; use postgres::Client; +use postgres_ffi::pg_constants::WAL_SEGMENT_SIZE; use postgres_ffi::xlog_utils::{ XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, }; @@ -45,6 +46,10 @@ impl Conf { self.pg_distrib_dir.join("lib") } + pub fn wal_dir(&self) -> PathBuf { + self.datadir.join("pg_wal") + } + fn new_pg_command(&self, command: impl AsRef) -> Result { let path = self.pg_bin_dir().join(command); ensure!(path.exists(), "Command {:?} does not exist", path); @@ -211,7 +216,7 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result "Unexpected wal_segment_size unit" ); ensure!( - wal_segment_size.get::<_, i64>("setting") == 16 * 1024 * 1024, + wal_segment_size.get::<_, i64>("setting") == WAL_SEGMENT_SIZE as i64, "Unexpected wal_segment_size in bytes" ); From 94003e1ebc4f1699e4ecdc4d0dc59985219b6e6f Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Sat, 9 Jul 2022 14:37:08 +0300 Subject: [PATCH 0523/1022] postgres_ffi: test restoring from intermediate LSNs by wal_craft --- libs/postgres_ffi/src/xlog_utils.rs | 54 +++++++++++++++---- .../wal_craft/src/bin/wal_craft.rs | 7 ++- 
libs/postgres_ffi/wal_craft/src/lib.rs | 43 +++++++++------ test_runner/fixtures/neon_fixtures.py | 5 +- 4 files changed, 77 insertions(+), 32 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index b9bd922025..520870cc53 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -621,8 +621,13 @@ mod tests { } cfg.initdb().unwrap(); let srv = cfg.start_server().unwrap(); - let expected_end_of_wal_partial: Lsn = - u64::from(C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap()).into(); + let (intermediate_lsns, expected_end_of_wal_partial) = + C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap(); + let intermediate_lsns: Vec = intermediate_lsns + .iter() + .map(|&lsn| u64::from(lsn).into()) + .collect(); + let expected_end_of_wal_partial: Lsn = u64::from(expected_end_of_wal_partial).into(); srv.kill(); // Check find_end_of_wal on the initial WAL @@ -635,13 +640,44 @@ mod tests { .max() .unwrap(); check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal_partial); - check_end_of_wal( - &cfg, - &last_segment, - Lsn(0), // start from the beginning - expected_end_of_wal_non_partial, - expected_end_of_wal_partial, - ); + for start_lsn in std::iter::once(Lsn(0)) + .chain(intermediate_lsns) + .chain(std::iter::once(expected_end_of_wal_partial)) + { + // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`. + // We assume that `start_lsn` is non-decreasing. + info!( + "Checking with start_lsn={}, erasing WAL before it", + start_lsn + ); + for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() { + let fname = file.file_name().into_string().unwrap(); + if !IsXLogFileName(&fname) { + continue; + } + let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE); + let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); + if seg_start_lsn > u64::from(start_lsn) { + continue; + } + let mut f = File::options().write(true).open(file.path()).unwrap(); + const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE]; + f.write_all( + &ZEROS[0..min( + WAL_SEGMENT_SIZE, + (u64::from(start_lsn) - seg_start_lsn) as usize, + )], + ) + .unwrap(); + } + check_end_of_wal( + &cfg, + &last_segment, + start_lsn, + expected_end_of_wal_non_partial, + expected_end_of_wal_partial, + ); + } } fn check_pg_waldump_end_of_wal( diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 13892538d0..938f8f421b 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -55,7 +55,7 @@ fn main() -> Result<()> { .get_matches(); let wal_craft = |arg_matches: &ArgMatches, client| { - let lsn = match arg_matches.value_of("type").unwrap() { + let (intermediate_lsns, end_of_wal_lsn) = match arg_matches.value_of("type").unwrap() { Simple::NAME => Simple::craft(client)?, LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?, LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => { @@ -67,7 +67,10 @@ fn main() -> Result<()> { LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?, a => panic!("Unknown --type argument: {}", a), }; - println!("end_of_wal = {}", lsn); + for lsn in intermediate_lsns { + println!("intermediate_lsn = {}", lsn); + } + println!("end_of_wal = {}", end_of_wal_lsn); Ok(()) }; diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 11e62d7fba..e3b666da41 100644 --- 
a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -226,20 +226,24 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result pub trait Crafter { const NAME: &'static str; - /// Generates WAL using the client `client`. Returns the expected end-of-wal LSN. - fn craft(client: &mut impl postgres::GenericClient) -> Result; + /// Generates WAL using the client `client`. Returns a pair of: + /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from. + /// May include or exclude Lsn(0) and the end-of-wal. + /// * The expected end-of-wal LSN. + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)>; } fn craft_internal( client: &mut C, - f: impl Fn(&mut C, PgLsn) -> Result>, -) -> Result { + f: impl Fn(&mut C, PgLsn) -> Result<(Vec, Option)>, +) -> Result<(Vec, PgLsn)> { ensure_server_config(client)?; let initial_lsn = client.pg_current_wal_insert_lsn()?; info!("LSN initial = {}", initial_lsn); - let last_lsn = match f(client, initial_lsn)? { + let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?; + let last_lsn = match last_lsn { None => client.pg_current_wal_insert_lsn()?, Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) { Ordering::Less => bail!("Some records were inserted after the crafted WAL"), @@ -247,6 +251,9 @@ fn craft_internal( Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"), }, }; + if !intermediate_lsns.starts_with(&[initial_lsn]) { + intermediate_lsns.insert(0, initial_lsn); + } // Some records may be not flushed, e.g. non-transactional logical messages. client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?; @@ -255,16 +262,16 @@ fn craft_internal( Ordering::Equal => {} Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"), } - Ok(last_lsn) + Ok((intermediate_lsns, last_lsn)) } pub struct Simple; impl Crafter for Simple { const NAME: &'static str = "simple"; - fn craft(client: &mut impl postgres::GenericClient) -> Result { + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { craft_internal(client, |client, _| { client.execute("CREATE table t(x int)", &[])?; - Ok(None) + Ok((Vec::new(), None)) }) } } @@ -272,12 +279,13 @@ impl Crafter for Simple { pub struct LastWalRecordXlogSwitch; impl Crafter for LastWalRecordXlogSwitch { const NAME: &'static str = "last_wal_record_xlog_switch"; - fn craft(client: &mut impl postgres::GenericClient) -> Result { + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. 
ensure_server_config(client)?; client.execute("CREATE table t(x int)", &[])?; + let before_xlog_switch = client.pg_current_wal_insert_lsn()?; let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); let next_segment = PgLsn::from(0x0200_0000); ensure!( @@ -286,14 +294,14 @@ impl Crafter for LastWalRecordXlogSwitch { after_xlog_switch, next_segment ); - Ok(next_segment) + Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) } } pub struct LastWalRecordXlogSwitchEndsOnPageBoundary; impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary"; - fn craft(client: &mut impl postgres::GenericClient) -> Result { + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. ensure_server_config(client)?; @@ -339,6 +347,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { ); // Emit the XLOG_SWITCH + let before_xlog_switch = client.pg_current_wal_insert_lsn()?; let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); let next_segment = PgLsn::from(0x0200_0000); ensure!( @@ -352,14 +361,14 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { "XLOG_SWITCH message ended not on page boundary: {}", after_xlog_switch ); - Ok(next_segment) + Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) } } fn craft_single_logical_message( client: &mut impl postgres::GenericClient, transactional: bool, -) -> Result { +) -> Result<(Vec, PgLsn)> { craft_internal(client, |client, initial_lsn| { ensure!( initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), @@ -391,9 +400,9 @@ fn craft_single_logical_message( message_lsn < after_message_lsn, "No record found after the emitted message" ); - Ok(Some(after_message_lsn)) + Ok((vec![message_lsn], Some(after_message_lsn))) } else { - Ok(Some(message_lsn)) + Ok((Vec::new(), Some(message_lsn))) } }) } @@ -401,7 +410,7 @@ fn craft_single_logical_message( pub struct WalRecordCrossingSegmentFollowedBySmallOne; impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one"; - fn craft(client: &mut impl postgres::GenericClient) -> Result { + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { craft_single_logical_message(client, true) } } @@ -409,7 +418,7 @@ impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { pub struct LastWalRecordCrossingSegment; impl Crafter for LastWalRecordCrossingSegment { const NAME: &'static str = "last_wal_record_crossing_segment"; - fn craft(client: &mut impl postgres::GenericClient) -> Result { + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { craft_single_logical_message(client, false) } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e2bf7da79d..3a6a233208 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1276,12 +1276,9 @@ class WalCraft(AbstractNeonCli): res.check_returncode() return res.stdout.split('\n') - def in_existing(self, type: str, connection: str) -> int: + def in_existing(self, type: str, connection: str) -> None: res = self.raw_cli(["in-existing", type, connection]) res.check_returncode() - m = re.fullmatch(r'end_of_wal = (.*)\n', 
res.stdout) - assert m - return lsn_from_hex(m.group(1)) class NeonPageserver(PgProtocol): From 373bc59ebe2b4fc0a65e0d66694cea69495f6616 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 16 Jul 2022 16:05:12 +0100 Subject: [PATCH 0524/1022] Bump pywin32 from 227 to 301 (#2102) --- poetry.lock | 52 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/poetry.lock b/poetry.lock index 6e552d2cd3..4963390718 100644 --- a/poetry.lock +++ b/poetry.lock @@ -544,20 +544,21 @@ test = ["pytest (>=6.2.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pr [[package]] name = "docker" -version = "5.0.3" +version = "4.2.2" description = "A Python library for the Docker Engine API." category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [package.dependencies] -pywin32 = {version = "227", markers = "sys_platform == \"win32\""} +pypiwin32 = {version = "223", markers = "sys_platform == \"win32\" and python_version >= \"3.6\""} requests = ">=2.14.2,<2.18.0 || >2.18.0" +six = ">=1.4.0" websocket-client = ">=0.32.0" [package.extras] ssh = ["paramiko (>=2.4.2)"] -tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=3.4.7)", "idna (>=2.0.0)"] +tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=1.3.4)", "idna (>=2.0.0)"] [[package]] name = "ecdsa" @@ -1003,6 +1004,17 @@ python-versions = ">=3.6" [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pypiwin32" +version = "223" +description = "" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +pywin32 = ">=223" + [[package]] name = "pyrsistent" version = "0.18.1" @@ -1124,7 +1136,7 @@ python-versions = "*" [[package]] name = "pywin32" -version = "227" +version = "301" description = "Python for Window Extensions" category = "main" optional = false @@ -1501,8 +1513,8 @@ cryptography = [ {file = "cryptography-36.0.1.tar.gz", hash = "sha256:53e5c1dc3d7a953de055d77bef2ff607ceef7a2aac0353b5d630ab67f7423638"}, ] docker = [ - {file = "docker-5.0.3-py2.py3-none-any.whl", hash = "sha256:7a79bb439e3df59d0a72621775d600bc8bc8b422d285824cb37103eab91d1ce0"}, - {file = "docker-5.0.3.tar.gz", hash = "sha256:d916a26b62970e7c2f554110ed6af04c7ccff8e9f81ad17d0d40c75637e227fb"}, + {file = "docker-4.2.2-py2.py3-none-any.whl", hash = "sha256:03a46400c4080cb6f7aa997f881ddd84fef855499ece219d75fbdb53289c17ab"}, + {file = "docker-4.2.2.tar.gz", hash = "sha256:26eebadce7e298f55b76a88c4f8802476c5eaddbdbe38dbc6cce8781c47c9b54"}, ] ecdsa = [ {file = "ecdsa-0.17.0-py2.py3-none-any.whl", hash = "sha256:5cf31d5b33743abe0dfc28999036c849a69d548f994b535e527ee3cb7f3ef676"}, @@ -1802,6 +1814,10 @@ pyparsing = [ {file = "pyparsing-3.0.6-py3-none-any.whl", hash = "sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4"}, {file = "pyparsing-3.0.6.tar.gz", hash = "sha256:d9bdec0013ef1eb5a84ab39a3b3868911598afa494f5faa038647101504e2b81"}, ] +pypiwin32 = [ + {file = "pypiwin32-223-py3-none-any.whl", hash = "sha256:67adf399debc1d5d14dffc1ab5acacb800da569754fafdc576b2a039485aa775"}, + {file = "pypiwin32-223.tar.gz", hash = "sha256:71be40c1fbd28594214ecaecb58e7aa8b708eabfa0125c8a109ebd51edbd776a"}, +] pyrsistent = [ {file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"}, {file = 
"pyrsistent-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d45866ececf4a5fff8742c25722da6d4c9e180daa7b405dc0a2a2790d668c26"}, @@ -1858,18 +1874,16 @@ pytz = [ {file = "pytz-2021.3.tar.gz", hash = "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"}, ] pywin32 = [ - {file = "pywin32-227-cp27-cp27m-win32.whl", hash = "sha256:371fcc39416d736401f0274dd64c2302728c9e034808e37381b5e1b22be4a6b0"}, - {file = "pywin32-227-cp27-cp27m-win_amd64.whl", hash = "sha256:4cdad3e84191194ea6d0dd1b1b9bdda574ff563177d2adf2b4efec2a244fa116"}, - {file = "pywin32-227-cp35-cp35m-win32.whl", hash = "sha256:f4c5be1a293bae0076d93c88f37ee8da68136744588bc5e2be2f299a34ceb7aa"}, - {file = "pywin32-227-cp35-cp35m-win_amd64.whl", hash = "sha256:a929a4af626e530383a579431b70e512e736e9588106715215bf685a3ea508d4"}, - {file = "pywin32-227-cp36-cp36m-win32.whl", hash = "sha256:300a2db938e98c3e7e2093e4491439e62287d0d493fe07cce110db070b54c0be"}, - {file = "pywin32-227-cp36-cp36m-win_amd64.whl", hash = "sha256:9b31e009564fb95db160f154e2aa195ed66bcc4c058ed72850d047141b36f3a2"}, - {file = "pywin32-227-cp37-cp37m-win32.whl", hash = "sha256:47a3c7551376a865dd8d095a98deba954a98f326c6fe3c72d8726ca6e6b15507"}, - {file = "pywin32-227-cp37-cp37m-win_amd64.whl", hash = "sha256:31f88a89139cb2adc40f8f0e65ee56a8c585f629974f9e07622ba80199057511"}, - {file = "pywin32-227-cp38-cp38-win32.whl", hash = "sha256:7f18199fbf29ca99dff10e1f09451582ae9e372a892ff03a28528a24d55875bc"}, - {file = "pywin32-227-cp38-cp38-win_amd64.whl", hash = "sha256:7c1ae32c489dc012930787f06244426f8356e129184a02c25aef163917ce158e"}, - {file = "pywin32-227-cp39-cp39-win32.whl", hash = "sha256:c054c52ba46e7eb6b7d7dfae4dbd987a1bb48ee86debe3f245a2884ece46e295"}, - {file = "pywin32-227-cp39-cp39-win_amd64.whl", hash = "sha256:f27cec5e7f588c3d1051651830ecc00294f90728d19c3bf6916e6dba93ea357c"}, + {file = "pywin32-301-cp35-cp35m-win32.whl", hash = "sha256:93367c96e3a76dfe5003d8291ae16454ca7d84bb24d721e0b74a07610b7be4a7"}, + {file = "pywin32-301-cp35-cp35m-win_amd64.whl", hash = "sha256:9635df6998a70282bd36e7ac2a5cef9ead1627b0a63b17c731312c7a0daebb72"}, + {file = "pywin32-301-cp36-cp36m-win32.whl", hash = "sha256:c866f04a182a8cb9b7855de065113bbd2e40524f570db73ef1ee99ff0a5cc2f0"}, + {file = "pywin32-301-cp36-cp36m-win_amd64.whl", hash = "sha256:dafa18e95bf2a92f298fe9c582b0e205aca45c55f989937c52c454ce65b93c78"}, + {file = "pywin32-301-cp37-cp37m-win32.whl", hash = "sha256:98f62a3f60aa64894a290fb7494bfa0bfa0a199e9e052e1ac293b2ad3cd2818b"}, + {file = "pywin32-301-cp37-cp37m-win_amd64.whl", hash = "sha256:fb3b4933e0382ba49305cc6cd3fb18525df7fd96aa434de19ce0878133bf8e4a"}, + {file = "pywin32-301-cp38-cp38-win32.whl", hash = "sha256:88981dd3cfb07432625b180f49bf4e179fb8cbb5704cd512e38dd63636af7a17"}, + {file = "pywin32-301-cp38-cp38-win_amd64.whl", hash = "sha256:8c9d33968aa7fcddf44e47750e18f3d034c3e443a707688a008a2e52bbef7e96"}, + {file = "pywin32-301-cp39-cp39-win32.whl", hash = "sha256:595d397df65f1b2e0beaca63a883ae6d8b6df1cdea85c16ae85f6d2e648133fe"}, + {file = "pywin32-301-cp39-cp39-win_amd64.whl", hash = "sha256:87604a4087434cd814ad8973bd47d6524bd1fa9e971ce428e76b62a5e0860fdf"}, ] pyyaml = [ {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, From c4b2347e21688e45cb13d965cac9dfe4d8fcce72 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 16 Jul 2022 13:57:48 +0300 Subject: [PATCH 0525/1022] Use less restricrtive lock guard during storage 
sync --- pageserver/src/layered_repository.rs | 13 ++++++------- pageserver/src/storage_sync.rs | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index e977329822..cead2e9222 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1768,24 +1768,23 @@ impl LayeredTimeline { /// Flush one frozen in-memory layer to disk, as a new delta layer. fn flush_frozen_layer(&self, frozen_layer: Arc) -> Result<()> { - let layer_paths_to_upload; - // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the // repository have the same LSN. let lsn_range = frozen_layer.get_lsn_range(); - if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { + let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn + && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) + { let pgdir = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)?; let (partitioning, _lsn) = pgdir.repartition(self.initdb_lsn, self.get_compaction_target_size())?; - layer_paths_to_upload = - self.create_image_layers(&partitioning, self.initdb_lsn, true)?; + self.create_image_layers(&partitioning, self.initdb_lsn, true)? } else { // normal case, write out a L0 delta layer file. let delta_path = self.create_delta_layer(&frozen_layer)?; - layer_paths_to_upload = HashSet::from([delta_path]); - } + HashSet::from([delta_path]) + }; fail_point!("flush-frozen-before-sync"); diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index d6e3741bc0..1747995d2d 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -928,7 +928,7 @@ fn storage_sync_loop( ); let mut sync_status_updates: HashMap> = HashMap::new(); - let index_accessor = runtime.block_on(index.write()); + let index_accessor = runtime.block_on(index.read()); for tenant_id in updated_tenants { let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { Some(tenant_entry) => tenant_entry, From 912a08317b8c959f4a0965c5925a18430ed93288 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 8 Jul 2022 17:10:57 +0300 Subject: [PATCH 0526/1022] do not ignore errors during downloading of tenant index parts --- pageserver/src/storage_sync/download.rs | 97 ++++++++++++++++--------- pageserver/src/storage_sync/index.rs | 31 +++++--- 2 files changed, 82 insertions(+), 46 deletions(-) diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 12c7f4384b..1700d6c9c9 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -8,7 +8,7 @@ use std::{ use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use remote_storage::{path_with_suffix_extension, RemoteObjectName, RemoteStorage}; +use remote_storage::{path_with_suffix_extension, DownloadError, RemoteObjectName, RemoteStorage}; use tokio::{ fs, io::{self, AsyncWriteExt}, @@ -27,28 +27,31 @@ use super::{ pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; -/// FIXME: Needs cleanup. Currently it swallows errors. Here we need to ensure that -/// we successfully downloaded all metadata parts for one tenant. -/// And successful includes absence of index_part in the remote. 
Because it is valid situation -/// when timeline was just created and pageserver restarted before upload of index part was completed. -/// But currently RemoteStorage interface does not provide this knowledge because it uses -/// anyhow::Error as an error type. So this needs a refactoring. -/// -/// In other words we need to yield only complete sets of tenant timelines. -/// Failure for one timeline of a tenant should exclude whole tenant from returned hashmap. -/// So there are two requirements: keep everything in one futures unordered -/// to allow higher concurrency. Mark tenants as failed independently. -/// That requires some bookeeping. +// We collect timelines remotely available for each tenant +// in case we failed to gather all index parts (due to an error) +// Poisoned variant is returned. +// When data is received succesfully without errors Present variant is used. +pub enum TenantIndexParts { + Poisoned(ZTenantTimelineId), + Present(HashMap), +} + +impl Default for TenantIndexParts { + fn default() -> Self { + TenantIndexParts::Present(HashMap::default()) + } +} + pub async fn download_index_parts( conf: &'static PageServerConf, storage: &S, keys: HashSet, -) -> HashMap> +) -> HashMap where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { - let mut index_parts: HashMap> = HashMap::new(); + let mut index_parts: HashMap = HashMap::new(); let mut part_downloads = keys .into_iter() @@ -59,12 +62,29 @@ where match part_upload_result { Ok(index_part) => { debug!("Successfully fetched index part for {id}"); - index_parts - .entry(id.tenant_id) - .or_default() - .insert(id.timeline_id, index_part); + match index_parts.entry(id.tenant_id).or_default() { + TenantIndexParts::Poisoned(id) => { + warn!("disgarding index part for poisoned tenant, poisoned by id: {id}") + } + TenantIndexParts::Present(parts) => { + parts.insert(id.timeline_id, index_part); + } + } + } + Err(download_error) => { + match download_error { + DownloadError::NotFound => { + // thats ok because it means that we didnt upload something we have locally for example + } + e => { + *index_parts.entry(id.tenant_id).or_default() = + TenantIndexParts::Poisoned(id); + error!( + "Failed to fetch index part for {id}: {e} poisoning tenant index parts" + ); + } + } } - Err(e) => error!("Failed to fetch index part for {id}: {e}"), } } @@ -119,10 +139,16 @@ where }); } - download_index_parts(conf, storage, sync_ids) + match download_index_parts(conf, storage, sync_ids) .await .remove(&tenant_id) - .ok_or_else(|| anyhow::anyhow!("Missing tenant index parts. This is a bug.")) + .ok_or_else(|| anyhow::anyhow!("Missing tenant index parts. This is a bug."))? + { + TenantIndexParts::Poisoned(id) => { + anyhow::bail!("failed to download index parts for all timeline: {id}") + } + TenantIndexParts::Present(parts) => Ok(parts), + } } /// Retrieves index data from the remote storage for a given timeline. 
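A minimal sketch, not part of the patch above, of how a caller might consume the `HashMap<ZTenantId, TenantIndexParts>` returned by `download_index_parts`. Only the type names are taken from the patch; the helper itself and its log message are illustrative.

```rust
use std::collections::HashMap;

// Hypothetical helper (not in the patch): keep only tenants whose full set of
// index parts downloaded successfully, and log the poisoned ones.
fn complete_tenants(
    all_parts: HashMap<ZTenantId, TenantIndexParts>,
) -> HashMap<ZTenantId, HashMap<ZTimelineId, IndexPart>> {
    let mut complete = HashMap::new();
    for (tenant_id, parts) in all_parts {
        match parts {
            // One failed timeline poisons the whole tenant, so it is skipped entirely.
            TenantIndexParts::Poisoned(id) => {
                tracing::warn!("skipping tenant {tenant_id}: index part download failed for {id}")
            }
            TenantIndexParts::Present(timelines) => {
                complete.insert(tenant_id, timelines);
            }
        }
    }
    complete
}
```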
@@ -130,7 +156,7 @@ async fn download_index_part( conf: &'static PageServerConf, storage: &S, sync_id: ZTenantTimelineId, -) -> anyhow::Result +) -> Result where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, @@ -145,15 +171,11 @@ where "Failed to get the index part storage path for local path '{}'", index_part_path.display() ) - })?; + }) + .map_err(DownloadError::BadInput)?; + + let mut index_part_download = storage.download(&part_storage_path).await?; - let mut index_part_download = - storage - .download(&part_storage_path) - .await - .with_context(|| { - format!("Failed to open download stream for for storage path {part_storage_path:?}") - })?; let mut index_part_bytes = Vec::new(); io::copy( &mut index_part_download.download_stream, @@ -162,11 +184,16 @@ where .await .with_context(|| { format!("Failed to download an index part from storage path {part_storage_path:?}") - })?; + }) + .map_err(DownloadError::Other)?; - let index_part: IndexPart = serde_json::from_slice(&index_part_bytes).with_context(|| { - format!("Failed to deserialize index part file from storage path '{part_storage_path:?}'") - })?; + let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) + .with_context(|| { + format!( + "Failed to deserialize index part file from storage path '{part_storage_path:?}'" + ) + }) + .map_err(DownloadError::Other)?; let missing_files = index_part.missing_files(); if !missing_files.is_empty() { diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 54be3d0f8c..4182d58ce6 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -13,6 +13,7 @@ use anyhow::{anyhow, Context, Ok}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use tokio::sync::RwLock; +use tracing::log::warn; use crate::{config::PageServerConf, layered_repository::metadata::TimelineMetadata}; use utils::{ @@ -20,6 +21,8 @@ use utils::{ zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; +use super::download::TenantIndexParts; + /// A part of the filesystem path, that needs a root to become a path again. #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] #[serde(transparent)] @@ -88,21 +91,27 @@ pub struct RemoteIndex(Arc>); impl RemoteIndex { pub fn from_parts( conf: &'static PageServerConf, - index_parts: HashMap>, + index_parts: HashMap, ) -> anyhow::Result { let mut entries: HashMap = HashMap::new(); - for (tenant_id, timelines) in index_parts { - for (timeline_id, index_part) in timelines { - let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); - let remote_timeline = - RemoteTimeline::from_index_part(&timeline_path, index_part) - .context("Failed to restore remote timeline data from index part")?; + for (tenant_id, index_parts) in index_parts { + match index_parts { + // TODO: should we schedule a retry so it can be recovered? 
otherwise there is no way to revive it other restarting whole pageserver + TenantIndexParts::Poisoned(id) => warn!("skipping tenant_id set up for remote index because the index download has failed for timeline {id}"), + TenantIndexParts::Present(timelines) => { + for (timeline_id, index_part) in timelines { + let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); + let remote_timeline = + RemoteTimeline::from_index_part(&timeline_path, index_part) + .context("Failed to restore remote timeline data from index part")?; - entries - .entry(tenant_id) - .or_default() - .insert(timeline_id, remote_timeline); + entries + .entry(tenant_id) + .or_default() + .insert(timeline_id, remote_timeline); + } + }, } } From 7987889cb3f9a38252e50ddc8da66779e5c21df6 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 14 Jul 2022 12:51:55 +0300 Subject: [PATCH 0527/1022] keep successfully downloaded index parts --- pageserver/src/storage_sync.rs | 9 +------ pageserver/src/storage_sync/download.rs | 34 ++++++++++++++++++++----- pageserver/src/storage_sync/index.rs | 4 +-- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 1747995d2d..ac5fb0bc8c 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -1557,6 +1557,7 @@ fn schedule_first_sync_tasks( local_timeline_init_statuses } +/// bool in return value stands for awaits_download fn compare_local_and_remote_timeline( new_sync_tasks: &mut VecDeque<(ZTenantTimelineId, SyncTask)>, sync_id: ZTenantTimelineId, @@ -1566,14 +1567,6 @@ fn compare_local_and_remote_timeline( ) -> (LocalTimelineInitStatus, bool) { let remote_files = remote_entry.stored_files(); - // TODO probably here we need more sophisticated logic, - // if more data is available remotely can we just download what's there? - // without trying to upload something. It may be tricky, needs further investigation. - // For now looks strange that we can request upload - // and download for the same timeline simultaneously. - // (upload needs to be only for previously unsynced files, not whole timeline dir). - // If one of the tasks fails they will be reordered in the queue which can lead - // to timeline being stuck in evicted state let number_of_layers_to_download = remote_files.difference(&local_files).count(); let (initial_timeline_status, awaits_download) = if number_of_layers_to_download > 0 { new_sync_tasks.push_back(( diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 1700d6c9c9..a91eaaa7ca 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -3,6 +3,7 @@ use std::{ collections::{HashMap, HashSet}, fmt::Debug, + mem, path::Path, }; @@ -32,10 +33,29 @@ pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; // Poisoned variant is returned. // When data is received succesfully without errors Present variant is used. pub enum TenantIndexParts { - Poisoned(ZTenantTimelineId), + Poisoned { + present: HashMap, + missing: HashSet, + }, Present(HashMap), } +impl TenantIndexParts { + fn add_poisoned(&mut self, timeline_id: ZTimelineId) { + match self { + TenantIndexParts::Poisoned { missing, .. 
} => { + missing.insert(timeline_id); + } + TenantIndexParts::Present(present) => { + *self = TenantIndexParts::Poisoned { + present: mem::take(present), + missing: HashSet::from([timeline_id]), + } + } + } + } +} + impl Default for TenantIndexParts { fn default() -> Self { TenantIndexParts::Present(HashMap::default()) @@ -63,8 +83,8 @@ where Ok(index_part) => { debug!("Successfully fetched index part for {id}"); match index_parts.entry(id.tenant_id).or_default() { - TenantIndexParts::Poisoned(id) => { - warn!("disgarding index part for poisoned tenant, poisoned by id: {id}") + TenantIndexParts::Poisoned { present, .. } => { + present.insert(id.timeline_id, index_part); } TenantIndexParts::Present(parts) => { parts.insert(id.timeline_id, index_part); @@ -77,8 +97,8 @@ where // thats ok because it means that we didnt upload something we have locally for example } e => { - *index_parts.entry(id.tenant_id).or_default() = - TenantIndexParts::Poisoned(id); + let tenant_parts = index_parts.entry(id.tenant_id).or_default(); + tenant_parts.add_poisoned(id.timeline_id); error!( "Failed to fetch index part for {id}: {e} poisoning tenant index parts" ); @@ -144,8 +164,8 @@ where .remove(&tenant_id) .ok_or_else(|| anyhow::anyhow!("Missing tenant index parts. This is a bug."))? { - TenantIndexParts::Poisoned(id) => { - anyhow::bail!("failed to download index parts for all timeline: {id}") + TenantIndexParts::Poisoned { missing, .. } => { + anyhow::bail!("Failed to download index parts for all timelines. Missing {missing:?}") } TenantIndexParts::Present(parts) => Ok(parts), } diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 4182d58ce6..134ae893bc 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -97,8 +97,8 @@ impl RemoteIndex { for (tenant_id, index_parts) in index_parts { match index_parts { - // TODO: should we schedule a retry so it can be recovered? otherwise there is no way to revive it other restarting whole pageserver - TenantIndexParts::Poisoned(id) => warn!("skipping tenant_id set up for remote index because the index download has failed for timeline {id}"), + // TODO: should we schedule a retry so it can be recovered? otherwise we can revive it only through detach/attach or pageserver restart + TenantIndexParts::Poisoned { missing, ..} => warn!("skipping tenant_id set up for remote index because the index download has failed for timeline(s): {missing:?}"), TenantIndexParts::Present(timelines) => { for (timeline_id, index_part) in timelines { let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); From eeff56aeb7d50717e5f9f475f2be03abc09e28d0 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 18 Jul 2022 13:40:42 +0300 Subject: [PATCH 0528/1022] Make get_dir_size robust to concurrent deletions. 
ref #2055 --- test_runner/fixtures/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 05d1a6634d..c49fa08d77 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -83,6 +83,9 @@ def get_dir_size(path: str) -> int: totalbytes = 0 for root, dirs, files in os.walk(path): for name in files: - totalbytes += os.path.getsize(os.path.join(root, name)) + try: + totalbytes += os.path.getsize(os.path.join(root, name)) + except FileNotFoundError as e: + pass # file could be concurrently removed return totalbytes From a69fdb0e8e64ae9482b1a78a546cdd4d3ec6d4ab Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 18 Jul 2022 11:51:28 +0300 Subject: [PATCH 0529/1022] Fix commit_lsn monotonicity violation. On ProposerElected message receival WAL is truncated at streaming point; this code expected that, once vote is given for the proposer / term switch happened, flush_lsn can be advanced only by this proposer (or higher one). However, that didn't take into account possibility of accumulating written WAL and flushing it after vote is given -- flushing goes without term checks. Which eventually led to the violation in question. ref #2048 --- safekeeper/src/safekeeper.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 331baffbca..fd4761505d 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -637,6 +637,17 @@ where &mut self, msg: &VoteRequest, ) -> Result> { + // Once voted, we won't accept data from older proposers; flush + // everything we've already received so that new proposer starts + // streaming at end of our WAL, without overlap. Currently we truncate + // WAL at streaming point, so this avoids truncating already committed + // WAL. + // + // TODO: it would be smoother to not truncate committed piece at + // handle_elected instead. Currently not a big deal, as proposer is the + // only source of WAL; with peer2peer recovery it would be more + // important. + self.wal_store.flush_wal()?; // initialize with refusal let mut resp = VoteResponse { term: self.state.acceptor_state.term, From 0b14fdb0783b745fbb99e08d49da1253db2b0134 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 18 Jul 2022 13:28:39 +0300 Subject: [PATCH 0530/1022] Reorganize, expand, improve internal documentation Reorganize existing READMEs and other documentation files into mdbook format. The resulting Table of Contents is a mix of placeholders for docs that we should write, and documentation files that we already had, dropped into the most appropriate place. Update the Pageserver overview diagram. Add sections on thread management and WAL redo processes. Add all the RFCs to the mdbook Table of Content too. 
Per github issue #1979 --- docs/.gitignore | 1 + docs/README.md | 14 -- docs/SUMMARY.md | 84 ++++++++++ docs/book.toml | 5 + docs/core_changes.md | 9 + docs/pageserver-page-service.md | 9 + docs/pageserver-pagecache.md | 8 + docs/pageserver-processing-getpage.md | 4 + docs/pageserver-processing-wal.md | 5 + .../README.md => docs/pageserver-services.md | 156 +++++++++--------- .../README.md => docs/pageserver-storage.md | 2 +- docs/pageserver-thread-mgmt.md | 26 +++ docs/pageserver-walredo.md | 77 +++++++++ docs/pageserver.md | 11 ++ .../safekeeper-protocol.md | 0 docs/separation-compute-storage.md | 8 + safekeeper/README.md => docs/walservice.md | 0 17 files changed, 326 insertions(+), 93 deletions(-) create mode 100644 docs/.gitignore delete mode 100644 docs/README.md create mode 100644 docs/SUMMARY.md create mode 100644 docs/book.toml create mode 100644 docs/pageserver-page-service.md create mode 100644 docs/pageserver-pagecache.md create mode 100644 docs/pageserver-processing-getpage.md create mode 100644 docs/pageserver-processing-wal.md rename pageserver/README.md => docs/pageserver-services.md (75%) rename pageserver/src/layered_repository/README.md => docs/pageserver-storage.md (99%) create mode 100644 docs/pageserver-thread-mgmt.md create mode 100644 docs/pageserver-walredo.md create mode 100644 docs/pageserver.md rename safekeeper/README_PROTO.md => docs/safekeeper-protocol.md (100%) create mode 100644 docs/separation-compute-storage.md rename safekeeper/README.md => docs/walservice.md (100%) diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000000..7585238efe --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +book diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 60114c5fd5..0000000000 --- a/docs/README.md +++ /dev/null @@ -1,14 +0,0 @@ -# Zenith documentation - -## Table of contents - -- [authentication.md](authentication.md) — pageserver JWT authentication. -- [docker.md](docker.md) — Docker images and building pipeline. -- [glossary.md](glossary.md) — Glossary of all the terms used in codebase. -- [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI. -- [sourcetree.md](sourcetree.md) — Overview of the source tree layout. -- [pageserver/README.md](/pageserver/README.md) — pageserver overview. -- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview. -- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview. -- [safekeeper/README.md](/safekeeper/README.md) — WAL service overview. 
-- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md new file mode 100644 index 0000000000..cf29ee3c6a --- /dev/null +++ b/docs/SUMMARY.md @@ -0,0 +1,84 @@ +# Summary + +[Introduction]() +- [Separation of Compute and Storage](./separation-compute-storage.md) + +# Architecture + +- [Compute]() + - [WAL proposer]() + - [WAL Backpressure]() + - [Postgres changes](./core_changes.md) + +- [Pageserver](./pageserver.md) + - [Services](./pageserver-services.md) + - [Thread management](./pageserver-thread-mgmt.md) + - [WAL Redo](./pageserver-walredo.md) + - [Page cache](./pageserver-pagecache.md) + - [Storage](./pageserver-storage.md) + - [Datadir mapping]() + - [Layer files]() + - [Branching]() + - [Garbage collection]() + - [Cloud Storage]() + - [Processing a GetPage request](./pageserver-processing-getpage.md) + - [Processing WAL](./pageserver-processing-wal.md) + - [Management API]() + - [Tenant Rebalancing]() + +- [WAL Service](walservice.md) + - [Consensus protocol](safekeeper-protocol.md) + - [Management API]() + - [Rebalancing]() + +- [Control Plane]() + +- [Proxy]() + +- [Source view](./sourcetree.md) + - [docker.md](./docker.md) — Docker images and building pipeline. + - [Error handling and logging]() + - [Testing]() + - [Unit testing]() + - [Integration testing]() + - [Benchmarks]() + + +- [Glossary](./glossary.md) + +# Uncategorized + +- [authentication.md](./authentication.md) +- [multitenancy.md](./multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI. +- [settings.md](./settings.md) +#FIXME: move these under sourcetree.md +#- [pageserver/README.md](/pageserver/README.md) +#- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) +#- [test_runner/README.md](/test_runner/README.md) +#- [safekeeper/README.md](/safekeeper/README.md) + + +# RFCs + +- [RFCs](./rfcs/README.md) + +- [002-storage](rfcs/002-storage.md) +- [003-laptop-cli](rfcs/003-laptop-cli.md) +- [004-durability](rfcs/004-durability.md) +- [005-zenith_local](rfcs/005-zenith_local.md) +- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md) +- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md) +- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md) +- [008-push-pull](rfcs/008-push-pull.md) +- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md) +- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md) +- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md) +- [010-storage_details](rfcs/010-storage_details.md) +- [011-retention-policy](rfcs/011-retention-policy.md) +- [012-background-tasks](rfcs/012-background-tasks.md) +- [013-term-history](rfcs/013-term-history.md) +- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md) +- [014-storage-lsm](rfcs/014-storage-lsm.md) +- [015-storage-messaging](rfcs/015-storage-messaging.md) +- [016-connection-routing](rfcs/016-connection-routing.md) +- [cluster-size-limits](rfcs/cluster-size-limits.md) diff --git a/docs/book.toml b/docs/book.toml new file mode 100644 index 0000000000..f83ac2a6aa --- /dev/null +++ b/docs/book.toml @@ -0,0 +1,5 @@ +[book] +language = "en" +multilingual = false +src = "." 
+title = "Neon architecture" diff --git a/docs/core_changes.md b/docs/core_changes.md index 82c5addd16..86fdc420f7 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -1,3 +1,12 @@ +# Postgres core changes + +This lists all the changes that have been made to the PostgreSQL +source tree, as a somewhat logical set of patches. The long-term goal +is to eliminate all these changes, by submitting patches to upstream +and refactoring code into extensions, so that you can run unmodified +PostgreSQL against Neon storage. + + 1. Add t_cid to XLOG record - Why? The cmin/cmax on a heap page is a real bummer. I don't see any other way to fix that than bite the bullet and modify the WAL-logging routine to include the cmin/cmax. diff --git a/docs/pageserver-page-service.md b/docs/pageserver-page-service.md new file mode 100644 index 0000000000..cea9e5a637 --- /dev/null +++ b/docs/pageserver-page-service.md @@ -0,0 +1,9 @@ +# Page Service + +The Page Service listens for GetPage@LSN requests from the Compute Nodes, +and responds with pages from the repository. On each GetPage@LSN request, +it calls into the Repository function + +A separate thread is spawned for each incoming connection to the page +service. The page service uses the libpq protocol to communicate with +the client. The client is a Compute Postgres instance. diff --git a/docs/pageserver-pagecache.md b/docs/pageserver-pagecache.md new file mode 100644 index 0000000000..d9b120bbb9 --- /dev/null +++ b/docs/pageserver-pagecache.md @@ -0,0 +1,8 @@ +# Page cache + +TODO: + +- shared across tenants +- store pages from layer files +- store pages from "in-memory layer" +- store materialized pages diff --git a/docs/pageserver-processing-getpage.md b/docs/pageserver-processing-getpage.md new file mode 100644 index 0000000000..be99ab82d4 --- /dev/null +++ b/docs/pageserver-processing-getpage.md @@ -0,0 +1,4 @@ +# Processing a GetPage request + +TODO: +- sequence diagram that shows how a GetPage@LSN request is processed diff --git a/docs/pageserver-processing-wal.md b/docs/pageserver-processing-wal.md new file mode 100644 index 0000000000..f8c43b6085 --- /dev/null +++ b/docs/pageserver-processing-wal.md @@ -0,0 +1,5 @@ +# Processing WAL + +TODO: +- diagram that shows how incoming WAL is processed +- explain durability, what is fsync'd when, disk_consistent_lsn diff --git a/pageserver/README.md b/docs/pageserver-services.md similarity index 75% rename from pageserver/README.md rename to docs/pageserver-services.md index cb752881af..4e85413513 100644 --- a/pageserver/README.md +++ b/docs/pageserver-services.md @@ -1,15 +1,4 @@ -## Page server architecture - -The Page Server has a few different duties: - -- Respond to GetPage@LSN requests from the Compute Nodes -- Receive WAL from WAL safekeeper -- Replay WAL that's applicable to the chunks that the Page Server maintains -- Backup to S3 - -S3 is the main fault-tolerant storage of all data, as there are no Page Server -replicas. We use a separate fault-tolerant WAL service to reduce latency. It -keeps track of WAL records which are not synced to S3 yet. +# Services The Page Server consists of multiple threads that operate on a shared repository of page versions: @@ -21,18 +10,22 @@ repository of page versions: | WAL receiver | | | +--------------+ - +----+ - +---------+ .......... | | - | | . . | | - GetPage@LSN | | . backup . -------> | S3 | --------------> | Page | repository . . | | - | Service | .......... | | - page | | +----+ + ...... + +---------+ +--------+ . . + | | | | . . 
+ GetPage@LSN | | | backup | -------> . S3 . +-------------> | Page | repository | | . . + | Service | +--------+ . . + page | | ...... <------------- | | - +---------+ +--------------------+ - | Checkpointing / | - | Garbage collection | - +--------------------+ + +---------+ +-----------+ +--------------------+ + | WAL redo | | Checkpointing, | + +----------+ | processes | | Garbage collection | + | | +-----------+ +--------------------+ + | HTTP | + | mgmt API | + | | + +----------+ Legend: @@ -40,28 +33,77 @@ Legend: | | A thread or multi-threaded service +--+ -.... -. . Component at its early development phase. -.... - ---> Data flow <--- ``` -Page Service ------------- +## Page Service The Page Service listens for GetPage@LSN requests from the Compute Nodes, -and responds with pages from the repository. +and responds with pages from the repository. On each GetPage@LSN request, +it calls into the Repository function + +A separate thread is spawned for each incoming connection to the page +service. The page service uses the libpq protocol to communicate with +the client. The client is a Compute Postgres instance. + +## WAL Receiver + +The WAL receiver connects to the external WAL safekeeping service +using PostgreSQL physical streaming replication, and continuously +receives WAL. It decodes the WAL records, and stores them to the +repository. -WAL Receiver ------------- +## Backup service -The WAL receiver connects to the external WAL safekeeping service (or -directly to the primary) using PostgreSQL physical streaming -replication, and continuously receives WAL. It decodes the WAL records, -and stores them to the repository. +The backup service, responsible for storing pageserver recovery data externally. + +Currently, pageserver stores its files in a filesystem directory it's pointed to. +That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached". +Therefore, the server interacts with external, more reliable storage to back up and restore its state. + +The code for storage support is extensible and can support arbitrary ones as long as they implement a certain Rust trait. +There are the following implementations present: +* local filesystem — to use in tests mainly +* AWS S3 - to use in production + +Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs, parameters documentation can be found at [settings docs](../docs/settings.md). + +The backup service is disabled by default and can be enabled to interact with a single remote storage. + +CLI examples: +* Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"` +* AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"` + +For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS. +For local S3 installations, refer to the their documentation for name format and credentials. + +Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets. 
+Required sections are: + +```toml +[remote_storage] +local_path = '/Users/someonetoignore/Downloads/tmp_dir/' +``` + +or + +```toml +[remote_storage] +bucket_name = 'some-sample-bucket' +bucket_region = 'eu-north-1' +prefix_in_bucket = '/test_prefix/' +``` + +`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed. + + +## Repository background tasks + +The Repository also has a few different background threads and tokio tasks that perform +background duties like dumping accumulated WAL data from memory to disk, reorganizing +files for performance (compaction), and garbage collecting old files. Repository @@ -116,48 +158,6 @@ Remove old on-disk layer files that are no longer needed according to the PITR retention policy -### Backup service - -The backup service, responsible for storing pageserver recovery data externally. - -Currently, pageserver stores its files in a filesystem directory it's pointed to. -That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached". -Therefore, the server interacts with external, more reliable storage to back up and restore its state. - -The code for storage support is extensible and can support arbitrary ones as long as they implement a certain Rust trait. -There are the following implementations present: -* local filesystem — to use in tests mainly -* AWS S3 - to use in production - -Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs, parameters documentation can be found at [settings docs](../docs/settings.md). - -The backup service is disabled by default and can be enabled to interact with a single remote storage. - -CLI examples: -* Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"` -* AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"` - -For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS. -For local S3 installations, refer to the their documentation for name format and credentials. - -Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets. -Required sections are: - -```toml -[remote_storage] -local_path = '/Users/someonetoignore/Downloads/tmp_dir/' -``` - -or - -```toml -[remote_storage] -bucket_name = 'some-sample-bucket' -bucket_region = 'eu-north-1' -prefix_in_bucket = '/test_prefix/' -``` - -`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed. 
TODO: Sharding -------------------- diff --git a/pageserver/src/layered_repository/README.md b/docs/pageserver-storage.md similarity index 99% rename from pageserver/src/layered_repository/README.md rename to docs/pageserver-storage.md index bd5fa59257..8d03e68ac7 100644 --- a/pageserver/src/layered_repository/README.md +++ b/docs/pageserver-storage.md @@ -1,4 +1,4 @@ -# Overview +# Pageserver storage The main responsibility of the Page Server is to process the incoming WAL, and reprocess it into a format that allows reasonably quick access to any page diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md new file mode 100644 index 0000000000..9ee3e40085 --- /dev/null +++ b/docs/pageserver-thread-mgmt.md @@ -0,0 +1,26 @@ +## Thread management + +Each thread in the system is tracked by the `thread_mgr` module. It +maintains a registry of threads, and which tenant or timeline they are +operating on. This is used for safe shutdown of a tenant, or the whole +system. + +### Handling shutdown + +When a tenant or timeline is deleted, we need to shut down all threads +operating on it, before deleting the data on disk. A thread registered +in the thread registry can check if it has been requested to shut down, +by calling `is_shutdown_requested()`. For async operations, there's also +a `shudown_watcher()` async task that can be used to wake up on shutdown. + +### Sync vs async + +The primary programming model in the page server is synchronous, +blocking code. However, there are some places where async code is +used. Be very careful when mixing sync and async code. + +Async is primarily used to wait for incoming data on network +connections. For example, all WAL receivers have a shared thread pool, +with one async Task for each connection. Once a piece of WAL has been +received from the network, the thread calls the blocking functions in +the Repository to process the WAL. diff --git a/docs/pageserver-walredo.md b/docs/pageserver-walredo.md new file mode 100644 index 0000000000..1de9c177cc --- /dev/null +++ b/docs/pageserver-walredo.md @@ -0,0 +1,77 @@ +# WAL Redo + +To reconstruct a particular page version from an image of the page and +some WAL records, the pageserver needs to replay the WAL records. This +happens on-demand, when a GetPage@LSN request comes in, or as part of +background jobs that reorganize data for faster access. + +It's important that data cannot leak from one tenant to another, and +that a corrupt WAL record on one timeline doesn't affect other tenants +or timelines. + +## Multi-tenant security + +If you have direct access to the WAL directory, or if you have +superuser access to a running PostgreSQL server, it's easy to +construct a malicious or corrupt WAL record that causes the WAL redo +functions to crash, or to execute arbitrary code. That is not a +security problem for PostgreSQL; if you have superuser access, you +have full access to the system anyway. + +The Neon pageserver, however, is multi-tenant. It needs to execute WAL +belonging to different tenants in the same system, and malicious WAL +in one tenant must not affect other tenants. + +A separate WAL redo process is launched for each tenant, and the +process uses the seccomp(2) system call to restrict its access to the +bare minimum needed to replay WAL records. The process does not have +access to the filesystem or network. It can only communicate with the +parent pageserver process through a pipe. 
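As a rough illustration of the pipe-only setup described in the paragraph above, spawning one redo process per tenant could look like the sketch below. This is not the pageserver's actual walredo.rs code; only the `--wal-redo` flag is taken from these docs, everything else is an assumption.

```rust
use std::process::{Child, Command, Stdio};

// Sketch only: launch a per-tenant WAL redo process that can talk to the
// pageserver exclusively through its standard streams.
fn launch_walredo_process(pg_bin: &str) -> std::io::Result<Child> {
    Command::new(pg_bin)
        // `--wal-redo` starts postgres in the special WAL redo mode.
        .arg("--wal-redo")
        // Requests go in over stdin, reconstructed pages come back over stdout,
        // and stderr is captured for logging.
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        // Start from a clean environment; the seccomp filter installed by the
        // child further restricts what it can do.
        .env_clear()
        .spawn()
}
```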
+ +If an attacker creates a malicious WAL record and injects it into the +WAL stream of a timeline, he can take control of the WAL redo process +in the pageserver. However, the WAL redo process cannot access the +rest of the system. And because there is a separate WAL redo process +for each tenant, the hijacked WAL redo process can only see WAL and +data belonging to the same tenant, which the attacker would have +access to anyway. + +## WAL-redo process communication + +The WAL redo process runs the 'postgres' executable, launched with a +Neon-specific command-line option to put it into WAL-redo process +mode. The pageserver controls the lifetime of the WAL redo processes, +launching them as needed. If a tenant is detached from the pageserver, +any WAL redo processes for that tenant are killed. + +The pageserver communicates with each WAL redo process over its +stdin/stdout/stderr. It works in request-response model with a simple +custom protocol, described in walredo.rs. To replay a set of WAL +records for a page, the pageserver sends the "before" image of the +page and the WAL records over 'stdin', followed by a command to +perform the replay. The WAL redo process responds with an "after" +image of the page. + +## Special handling of some records + +Some WAL record types are handled directly in the pageserver, by +bespoken Rust code, and are not sent over to the WAL redo process. +This includes SLRU-related WAL records, like commit records. SLRUs +don't use the standard Postgres buffer manager, so dealing with them +in the Neon WAL redo mode would require quite a few changes to +Postgres code and special handling in the protocol anyway. + +Some record types that include a full-page-image (e.g. XLOG_FPI) are +also handled specially when incoming WAL is processed already, and are +stored as page images rather than WAL records. + + +## Records that modify multiple pages + +Some Postgres WAL records modify multiple pages. Such WAL records are +duplicated, so that a copy is stored for each affected page. This is +somewhat wasteful, but because most WAL records only affect one page, +the overhead is acceptable. + +The WAL redo always happens for one particular page. If the WAL record +coantains changes to other pages, they are ignored. diff --git a/docs/pageserver.md b/docs/pageserver.md new file mode 100644 index 0000000000..ee70032396 --- /dev/null +++ b/docs/pageserver.md @@ -0,0 +1,11 @@ +# Page server architecture + +The Page Server has a few different duties: + +- Respond to GetPage@LSN requests from the Compute Nodes +- Receive WAL from WAL safekeeper, and store it +- Upload data to S3 to make it durable, download files from S3 as needed + +S3 is the main fault-tolerant storage of all data, as there are no Page Server +replicas. We use a separate fault-tolerant WAL service to reduce latency. It +keeps track of WAL records which are not synced to S3 yet. 
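To make the request/response model described in pageserver-walredo.md above more concrete, here is a hedged sketch of one replay round trip over the child's pipes. The message tags and the length prefix are invented stand-ins; the real framing is defined in the pageserver's walredo.rs.

```rust
use std::io::{Read, Write};
use std::process::Child;

const BLCKSZ: usize = 8192;

// Sketch of the exchange: push the "before" image and the WAL records over
// stdin, ask for a replay, and read the "after" image back from stdout.
// The 'B'/'W'/'A' tags are illustrative, not the actual protocol.
fn replay_page(
    redo_process: &mut Child,
    before_image: &[u8; BLCKSZ],
    wal_records: &[Vec<u8>],
) -> std::io::Result<[u8; BLCKSZ]> {
    let stdin = redo_process.stdin.as_mut().expect("stdin is piped");
    let stdout = redo_process.stdout.as_mut().expect("stdout is piped");

    stdin.write_all(b"B")?;
    stdin.write_all(before_image)?;

    for rec in wal_records {
        stdin.write_all(b"W")?;
        stdin.write_all(&(rec.len() as u32).to_be_bytes())?;
        stdin.write_all(rec)?;
    }

    stdin.write_all(b"A")?;
    stdin.flush()?;

    let mut after_image = [0u8; BLCKSZ];
    stdout.read_exact(&mut after_image)?;
    Ok(after_image)
}
```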
diff --git a/safekeeper/README_PROTO.md b/docs/safekeeper-protocol.md similarity index 100% rename from safekeeper/README_PROTO.md rename to docs/safekeeper-protocol.md diff --git a/docs/separation-compute-storage.md b/docs/separation-compute-storage.md new file mode 100644 index 0000000000..f07fa8b6dc --- /dev/null +++ b/docs/separation-compute-storage.md @@ -0,0 +1,8 @@ +# Separation of Compute and Storage + +TODO: + +- Read path +- Write path +- Durability model +- API auth diff --git a/safekeeper/README.md b/docs/walservice.md similarity index 100% rename from safekeeper/README.md rename to docs/walservice.md From f384e20d789f7b253b3d7415879f77ef1c681292 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 18 Jul 2022 17:54:43 +0300 Subject: [PATCH 0531/1022] Minor cleanup in layer_repository.rs. --- pageserver/src/layered_repository.rs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index cead2e9222..6459e802f4 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1,8 +1,8 @@ //! -//! Zenith repository implementation that keeps old data in files on disk, and +//! Timeline repository implementation that keeps old data in files on disk, and //! the recent changes in memory. See layered_repository/*_layer.rs files. //! The functions here are responsible for locating the correct layer for the -//! get/put call, tracing timeline branching history as needed. +//! get/put call, walking back the timeline branching history as needed. //! //! The files are stored in the .neon/tenants//timelines/ //! directory. See layered_repository/README for how the files are managed. @@ -300,12 +300,12 @@ impl Repository for LayeredRepository { .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) .context("invalid branch start lsn")?; + // Determine prev-LSN for the new timeline. We can only determine it if + // the timeline was branched at the current end of the source timeline. let RecordLsn { last: src_last, prev: src_prev, } = src_timeline.get_last_record_rlsn(); - - // Use src_prev from the source timeline only if we branched at the last record. let dst_prev = if src_last == start_lsn { Some(src_prev) } else { @@ -314,7 +314,6 @@ impl Repository for LayeredRepository { // create a new timeline directory let timelinedir = self.conf.timeline_path(&dst, &self.tenant_id); - crashsafe_dir::create_dir(&timelinedir)?; // Create the metadata file, noting the ancestor of the new timeline. @@ -759,7 +758,7 @@ impl LayeredRepository { // https://github.com/neondatabase/neon/issues/1555 if !target_config_path.exists() { info!( - "Zenith tenant config is not found in {}", + "tenant config not found in {}", target_config_path.display() ); return Ok(Default::default()); @@ -858,7 +857,7 @@ impl LayeredRepository { // +-----baz--------> // // - // 1. Grab a mutex to prevent new timelines from being created + // 1. Grab 'gc_cs' mutex to prevent new timelines from being created // 2. Scan all timelines, and on each timeline, make note of the // all the points where other timelines have been branched off. // We will refrain from removing page versions at those LSNs. @@ -903,7 +902,7 @@ impl LayeredRepository { // This is unresolved question for now, how to do gc in presence of remote timelines // especially when this is combined with branching. 
- // Somewhat related: https://github.com/zenithdb/zenith/issues/999 + // Somewhat related: https://github.com/neondatabase/neon/issues/999 if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() { // If target_timeline is specified, we only need to know branchpoints of its children if let Some(timelineid) = target_timeline_id { @@ -933,7 +932,7 @@ impl LayeredRepository { .get_timeline_load_internal(timeline_id, &mut *timelines)? .expect("checked above that timeline is local and loaded"); - // If target_timeline is specified, only GC it + // If target_timeline is specified, ignore all other timelines if let Some(target_timelineid) = target_timeline_id { if timeline_id != target_timelineid { continue; From 0b8049c2835891fc40632b2f5d38f89b7c575b27 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 15 Jul 2022 10:19:13 +0300 Subject: [PATCH 0532/1022] Update core_changes.md, describing Postgres changes. I went through "git diff REL_14_2" and updated the doc to list all the changes, categorized into what I think could form a logical set of patches. --- docs/core_changes.md | 610 ++++++++++++++++++++++++++++++++----------- 1 file changed, 459 insertions(+), 151 deletions(-) diff --git a/docs/core_changes.md b/docs/core_changes.md index 86fdc420f7..8f29dd9121 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -6,206 +6,514 @@ is to eliminate all these changes, by submitting patches to upstream and refactoring code into extensions, so that you can run unmodified PostgreSQL against Neon storage. +In Neon, we run PostgreSQL in the compute nodes, but we also run a special WAL redo process in the +page server. We currently use the same binary for both, with --wal-redo runtime flag to launch it in +the WAL redo mode. Some PostgreSQL changes are needed in the compute node, while others are just for +the WAL redo process. -1. Add t_cid to XLOG record -- Why? - The cmin/cmax on a heap page is a real bummer. I don't see any other way to fix that than bite the bullet and modify the WAL-logging routine to include the cmin/cmax. +In addition to core PostgreSQL changes, there is a Neon extension in contrib/neon, to hook into the +smgr interface. Once all the core changes have been submitted to upstream or eliminated some other +way, the extension could live outside the postgres repository and build against vanilla PostgreSQL. - To recap, the problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares abut it anymore. +Below is a list of all the PostgreSQL source code changes, categorized into changes needed for +compute, and changes needed for the WAL redo process: -- Alternatives? - I don't know +# Changes for Compute node -2. Add PD_WAL_LOGGED. -- Why? - Postgres sometimes writes data to the page before it is wal-logged. If such page ais swapped out, we will loose this change. The problem is currently solved by setting PD_WAL_LOGGED bit in page header. When page without this bit set is written to the SMGR, then it is forced to be written to the WAL as FPI using log_newpage_copy() function. +## Add t_cid to heap WAL records - There was wrong assumption that it can happen only during construction of some exotic indexes (like gist). It is not true. 
The same situation can happen with COPY,VACUUM and when record hint bits are set. +``` + src/backend/access/heap/heapam.c | 26 +- + src/include/access/heapam_xlog.h | 6 +- +``` -- Discussion: - https://discord.com/channels/869525774699462656/882681420986851359 +We have added a new t_cid field to heap WAL records. This changes the WAL record format, making Neon WAL format incompatible with vanilla PostgreSQL! -- Alternatives: - Do not store this flag in page header, but associate this bit with shared buffer. Logically it is more correct but in practice we will get not advantages: neither in space, neither in CPU overhead. +### Problem we're trying to solve + +The problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works in PostgreSQL, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares about it anymore. But with Neon, we rely on WAL replay to reconstruct the page, even while the original transaction is still running. + +### How to get rid of the patch + +Bite the bullet and submit the patch to PostgreSQL, to add the t_cid to the WAL records. It makes the WAL records larger, which could make this unpopular in the PostgreSQL community. However, it might simplify some logical decoding code; Andres Freund briefly mentioned in PGCon 2022 discussion on Heikki's Neon presentation that logical decoding currently needs to jump through some hoops to reconstruct the same information. -3. XLogReadBufferForRedo not always loads and pins requested buffer. So we need to add extra checks that buffer is really pinned. Also do not use BufferGetBlockNumber for buffer returned by XLogReadBufferForRedo. -- Why? - XLogReadBufferForRedo is not pinning pages which are not requested by wal-redo. It is specific only for wal-redo Postgres. +### Alternatives +Perhaps we could write an extra WAL record with the t_cid information, when a page is evicted that contains rows that were touched a transaction that's still running. However, that seems very complicated. -- Alternatives? - No +## ginfast.c + +``` +diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c +index e0d9940946..2d964c02e9 100644 +--- a/src/backend/access/gin/ginfast.c ++++ b/src/backend/access/gin/ginfast.c +@@ -285,6 +285,17 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) + memset(&sublist, 0, sizeof(GinMetaPageData)); + makeSublist(index, collector->tuples, collector->ntuples, &sublist); + ++ if (metadata->head != InvalidBlockNumber) ++ { ++ /* ++ * ZENITH: Get buffer before XLogBeginInsert() to avoid recursive call ++ * of XLogBeginInsert(). Reading a new buffer might evict a dirty page from ++ * the buffer cache, and if that page happens to be an FSM or VM page, zenith_write() ++ * will try to WAL-log an image of the page. ++ */ ++ buffer = ReadBuffer(index, metadata->tail); ++ } ++ + if (needWal) + XLogBeginInsert(); + +@@ -316,7 +327,6 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) + data.prevTail = metadata->tail; + data.newRightlink = sublist.head; + +- buffer = ReadBuffer(index, metadata->tail); + LockBuffer(buffer, GIN_EXCLUSIVE); + page = BufferGetPage(buffer); +``` + +The problem is explained in the comment above + +### How to get rid of the patch + +Can we stop WAL-logging FSM or VM pages? 
Or delay the WAL logging until we're out of the critical +section or something. + +Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and VM page images? -4. Eliminate reporting of some warnings related with hint bits, for example -"page is not marked all-visible but visibility map bit is set in relation". -- Why? - Hint bit may be not WAL logged. +## Mark index builds that use buffer manager without logging explicitly -- Alternative? - Always wal log any page changes. +``` + src/backend/access/gin/gininsert.c | 7 + + src/backend/access/gist/gistbuild.c | 15 +- + src/backend/access/spgist/spginsert.c | 8 +- + +also some changes in src/backend/storage/smgr/smgr.c +``` + +When a GIN index is built, for example, it is built by inserting the entries into the index more or +less normally, but without WAL-logging anything. After the index has been built, we iterate through +all pages and write them to the WAL. That doesn't work for Neon, because if a page is not WAL-logged +and is evicted from the buffer cache, it is lost. We have an check to catch that in the Neon +extension. To fix that, we've added a few functions to track explicitly when we're performing such +an operation: `smgr_start_unlogged_build`, `smgr_finish_unlogged_build_phase_1` and +`smgr_end_unlogged_build`. -5. Maintain last written LSN. -- Why? - When compute node requests page from page server, we need to specify LSN. Ideally it should be LSN - of WAL record performing last update of this pages. But we do not know it, because we do not have page. - We can use current WAL flush position, but in this case there is high probability that page server - will be blocked until this peace of WAL is delivered. - As better approximation we can keep max LSN of written page. It will be better to take in account LSNs only of evicted pages, - but SMGR API doesn't provide such knowledge. +### How to get rid of the patch -- Alternatives? - Maintain map of LSNs of evicted pages. +I think it would make sense to be more explicit about that in PostgreSQL too. So extract these +changes to a patch and post to pgsql-hackers. -6. Launching Postgres without WAL. -- Why? - According to Zenith architecture compute node is stateless. So when we are launching - compute node, we need to provide some dummy PG_DATADIR. Relation pages - can be requested on demand from page server. But Postgres still need some non-relational data: - control and configuration files, SLRUs,... - It is currently implemented using basebackup (do not mix with pg_basebackup) which is created - by pageserver. It includes in this tarball config/control files, SLRUs and required directories. - As far as pageserver do not have original (non-scattered) WAL segments, it includes in - this tarball dummy WAL segment which contains only SHUTDOWN_CHECKPOINT record at the beginning of segment, - which redo field points to the end of wal. It allows to load checkpoint record in more or less - standard way with minimal changes of Postgres, but then some special handling is needed, - including restoring previous record position from zenith.signal file. - Also we have to correctly initialize header of last WAL page (pointed by checkpoint.redo) - to pass checks performed by XLogReader. +## Track last-written page LSN -- Alternatives? - We may not include fake WAL segment in tarball at all and modify xlog.c to load checkpoint record - in special way. But it may only increase number of changes in xlog.c +``` + src/backend/commands/dbcommands.c | 17 +- -7. 
Add redo_read_buffer_filter callback to XLogReadBufferForRedoExtended -- Why? - We need a way in wal-redo Postgres to ignore pages which are not requested by pageserver. - So wal-redo Postgres reconstructs only requested page and for all other returns BLK_DONE - which means that recovery for them is not needed. +Also one call to SetLastWrittenPageLSN() in spginsert.c, maybe elsewhere too +``` -- Alternatives? - No +Whenever a page is evicted from the buffer cache, we remember its LSN, so that we can use the same +LSN in the GetPage@LSN request when reading the page back from the page server. The value is +conservative: it would be correct to always use the last-inserted LSN, but it would be slow because +then the page server would need to wait for the recent WAL to be streamed and processed, before +responding to any GetPage@LSN request. -8. Enforce WAL logging of sequence updates. -- Why? - Due to performance reasons Postgres don't want to log each fetching of a value from a sequence, - so we pre-log a few fetches in advance. In the event of crash we can lose - (skip over) as many values as we pre-logged. - But it doesn't work with Zenith because page with sequence value can be evicted from buffer cache - and we will get a gap in sequence values even without crash. +The last-written page LSN is mostly tracked in the smgrwrite() function, without core code changes, +but there are a few exceptions where we've had to add explicit calls to the Neon-specific +SetLastWrittenPageLSN() function. -- Alternatives: - Do not try to preserve sequential order but avoid performance penalty. +There's an open PR to track the LSN in a more-fine grained fashion: +https://github.com/neondatabase/postgres/pull/177 + +PostgreSQL v15 introduces a new method to do CREATE DATABASE that WAL-logs the database instead of +relying copying files and checkpoint. With that method, we probably won't need any special handling. +The old method is still available, though. + +### How to get rid of the patch + +Wait until v15? -9. Treat unlogged tables as normal (permanent) tables. -- Why? - Unlogged tables are not transient, so them have to survive node restart (unlike temporary tables). - But as far as compute node is stateless, we need to persist their data to storage node. - And it can only be done through the WAL. +## Cache relation sizes -- Alternatives? - * Store unlogged tables locally (violates requirement of stateless compute nodes). - * Prohibit unlogged tables at all. +The Neon extension contains a little cache for smgrnblocks() and smgrexists() calls, to avoid going +to the page server every time. It might be useful to cache those in PostgreSQL, maybe in the +relcache? (I think we do cache nblocks in relcache already, check why that's not good enough for +Neon) -10. Support start Postgres in wal-redo mode -- Why? - To be able to apply WAL record and reconstruct pages at page server. +## Misc change in vacuumlazy.c -- Alternatives? - * Rewrite redo handlers in Rust - * Do not reconstruct pages at page server at all and do it at compute node. 
+``` +index 8aab6e324e..c684c4fbee 100644 +--- a/src/backend/access/heap/vacuumlazy.c ++++ b/src/backend/access/heap/vacuumlazy.c +@@ -1487,7 +1487,10 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) + else if (all_visible_according_to_vm && !PageIsAllVisible(page) + && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) + { +- elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", ++ /* ZENITH-XXX: all visible hint is not wal-logged ++ * FIXME: Replay visibilitymap changes in pageserver ++ */ ++ elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + vacrel->relname, blkno); + visibilitymap_clear(vacrel->rel, blkno, vmbuffer, + VISIBILITYMAP_VALID_BITS); +``` -11. WAL proposer -- Why? - WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes. - It is currently implemented as patch to standard WAL sender. - -- Alternatives? - Can be moved to extension if some extra callbacks will be added to wal sender code. +Is this still needed? If that WARNING happens, it looks like potential corruption that we should +fix! -12. Secure Computing BPF API wrapper. -- Why? - Pageserver delegates complex WAL decoding duties to Postgres, - which means that the latter might fall victim to carefully designed - malicious WAL records and start doing harmful things to the system. - To prevent this, it has been decided to limit possible interactions - with the outside world using the Secure Computing BPF mode. +## Use buffer manager when extending VM or FSM -- Alternatives: - * Rewrite redo handlers in Rust. - * Add more checks to guarantee correctness of WAL records. - * Move seccomp.c to extension - * Many other discussed approaches to neutralize incorrect WAL records vulnerabilities. +``` + src/backend/storage/freespace/freespace.c | 14 +- + src/backend/access/heap/visibilitymap.c | 15 +- + +diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c +index e198df65d8..addfe93eac 100644 +--- a/src/backend/access/heap/visibilitymap.c ++++ b/src/backend/access/heap/visibilitymap.c +@@ -652,10 +652,19 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) + /* Now extend the file */ + while (vm_nblocks_now < vm_nblocks) + { +- PageSetChecksumInplace((Page) pg.data, vm_nblocks_now); ++ /* ++ * ZENITH: Initialize VM pages through buffer cache to prevent loading ++ * them from pageserver. ++ */ ++ Buffer buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW, ++ RBM_ZERO_AND_LOCK, NULL); ++ Page page = BufferGetPage(buffer); ++ ++ PageInit((Page) page, BLCKSZ, 0); ++ PageSetChecksumInplace(page, vm_nblocks_now); ++ MarkBufferDirty(buffer); ++ UnlockReleaseBuffer(buffer); + +- smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now, +- pg.data, false); + vm_nblocks_now++; + } +``` + +### Problem we're trying to solve + +??? + +### How to get rid of the patch + +Maybe this would be a reasonable change in PostgreSQL too? -13. Callbacks for replica feedbacks -- Why? - Allowing waproposer to interact with walsender code. +## Allow startup without reading checkpoint record -- Alternatives - Copy walsender code to walproposer. +In Neon, the compute node is stateless. So when we are launching compute node, we need to provide +some dummy PG_DATADIR. Relation pages can be requested on demand from page server. But Postgres +still need some non-relational data: control and configuration files, SLRUs,... 
It is currently
+implemented using basebackup (not to be confused with pg_basebackup), which is created by the
+pageserver. The tarball includes config/control files, SLRUs and the required directories.
+
+As the pageserver does not have the original WAL segments, the basebackup tarball includes an empty WAL
+segment to bootstrap the WAL writing, but it doesn't contain the checkpoint record. There are some
+changes in xlog.c, to allow starting the compute node without reading the last checkpoint record
+from WAL.
+
+This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start
+at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last
+checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo.
-14. Support multiple SMGR implementations.
-- Why?
-  Postgres provides abstract API for storage manager but it has only one implementation
-  and provides no way to replace it with custom storage manager.
+### How to get rid of the patch
-- Alternatives?
-  None.
+???
-15. Calculate database size as sum of all database relations.
-- Why?
-  Postgres is calculating database size by traversing data directory
-  but as far as Zenith compute node is stateless we can not do it.
+### Alternatives
-- Alternatives?
-  Send this request directly to pageserver and calculate real (physical) size
-  of Zenith representation of database/timeline, rather than sum logical size of all relations.
+Include a fake checkpoint record in the tarball. Creating fake WAL is a bit risky, though; I'm
+afraid it might accidentally get streamed to the safekeepers and overwrite or corrupt the real WAL.
+
+## Disable sequence caching
+
+```
+diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
+index 0415df9ccb..9f9db3c8bc 100644
+--- a/src/backend/commands/sequence.c
++++ b/src/backend/commands/sequence.c
+@@ -53,7 +53,9 @@
+  * so we pre-log a few fetches in advance. In the event of
+  * crash we can lose (skip over) as many values as we pre-logged.
+  */
+-#define SEQ_LOG_VALS 32
++/* Zenith XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */
++/* #define SEQ_LOG_VALS 32 */
++#define SEQ_LOG_VALS 0
+```
+
+For performance reasons, Postgres doesn't want to WAL-log each fetch of a value from a sequence, so
+it pre-logs a few fetches in advance. In the event of a crash we can lose (skip over) as many values
+as we pre-logged. But with Neon, because the page holding the sequence value can be evicted from the
+buffer cache, we can get a gap in sequence values even without a crash.
+
+### How to get rid of the patch
+
+Maybe we can just remove it, and accept the gaps. Or add some special handling for sequence
+relations in the Neon extension, to WAL-log the sequence page when it's about to be evicted. It
+would be weird if the sequence moved backwards though, think of PITR.
+
+Or add a GUC for the pre-log amount to PostgreSQL, and force it to 1 in Neon.
------------------------------------------------
-Not currently committed but proposed:
+## Walproposer
-1. Disable ring buffer buffer manager strategies
-- Why?
-  Postgres tries to avoid cache flushing by bulk operations (copy, seqscan, vacuum,...).
-  Even if there are free space in buffer cache, pages may be evicted.
-  Negative effect of it can be somehow compensated by file system cache, but in case of Zenith
-  cost of requesting page from page server is much higher.
+``` + src/Makefile | 1 + + src/backend/replication/libpqwalproposer/Makefile | 37 + + src/backend/replication/libpqwalproposer/libpqwalproposer.c | 416 ++++++++++++ + src/backend/postmaster/bgworker.c | 4 + + src/backend/postmaster/postmaster.c | 6 + + src/backend/replication/Makefile | 4 +- + src/backend/replication/walproposer.c | 2350 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + src/backend/replication/walproposer_utils.c | 402 +++++++++++ + src/backend/replication/walreceiver.c | 7 + + src/backend/replication/walsender.c | 320 ++++++--- + src/backend/storage/ipc/ipci.c | 6 + + src/include/replication/walproposer.h | 565 ++++++++++++++++ +``` -- Alternatives? - Instead of just prohibiting ring buffer we may try to implement more flexible eviction policy, - for example copy evicted page from ring buffer to some other buffer if there is free space - in buffer cache. +WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes. It is +currently implemented as patch to standard WAL sender. -2. Disable marking page as dirty when hint bits are set. -- Why? - Postgres has to modify page twice: first time when some tuple is updated and second time when - hint bits are set. Wal logging hint bits updates requires FPI which significantly increase size of WAL. +### How to get rid of the patch -- Alternatives? - Add special WAL record for setting page hints. +Refactor into an extension. Submit hooks or APIs into upstream if necessary. -3. Prefetching -- Why? - As far as pages in Zenith are loaded on demand, to reduce node startup time - and also speedup some massive queries we need some mechanism for bulk loading to - reduce page request round-trip overhead. +@MMeent did some work on this already: https://github.com/neondatabase/postgres/pull/96 - Currently Postgres is supporting prefetching only for bitmap scan. - In Zenith we also use prefetch for sequential and index scan. For sequential scan we prefetch - some number of following pages. For index scan we prefetch pages of heap relation addressed by TIDs. +## Ignore unexpected data beyond EOF in bufmgr.c -4. Prewarming. -- Why? - Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Zenith. - But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow. - We can capture state of compute node buffer cache and send bulk request for this pages at startup. +``` +@@ -922,11 +928,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, + */ + bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); + if (!PageIsNew((Page) bufBlock)) +- ereport(ERROR, ++ { ++ // XXX-ZENITH ++ MemSet((char *) bufBlock, 0, BLCKSZ); ++ ereport(DEBUG1, + (errmsg("unexpected data beyond EOF in block %u of relation %s", + blockNum, relpath(smgr->smgr_rnode, forkNum)), + errhint("This has been seen to occur with buggy kernels; consider updating your system."))); +- ++ } + /* + * We *must* do smgrextend before succeeding, else the page will not + * be reserved by the kernel, and the next P_NEW call will decide to +``` + +PostgreSQL is a bit sloppy with extending relations. Usually, the relation is extended with zeros +first, then the page is filled, and finally the new page WAL-logged. But if multiple backends extend +a relation at the same time, the pages can be WAL-logged in different order. + +I'm not sure what scenario exactly required this change in Neon, though. 
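
For orientation, the normal extension path looks roughly like the sketch below. This is a simplified
illustration, not the actual PostgreSQL code (the helper name `extend_heap_sketch` is made up); the
point is the window between extending the file with a zero page and WAL-logging it, during which
another backend can extend the relation further and WAL-log its block first.

```c
#include "postgres.h"

#include "access/xloginsert.h"   /* log_newpage_buffer() */
#include "storage/bufmgr.h"      /* ReadBufferExtended(), LockBuffer() */
#include "storage/bufpage.h"     /* PageInit() */
#include "utils/rel.h"

/* Illustrative only: sketch of how a relation normally grows by one block. */
static Buffer
extend_heap_sketch(Relation rel)
{
	/*
	 * 1. Reserve a new block: the buffer manager calls smgrextend() with an
	 *    all-zeros page, so the file grows before anything is WAL-logged.
	 */
	Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, P_NEW,
										 RBM_NORMAL, NULL);

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	/* 2. Initialize the page contents in shared buffers. */
	PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);
	MarkBufferDirty(buf);

	/*
	 * 3. WAL-log the new page (or the first insertion into it). A concurrent
	 *    backend may already have extended the file further and logged its
	 *    block, so redo can see a record for block N+1 before the record for
	 *    block N.
	 */
	log_newpage_buffer(buf, true);

	return buf;
}
```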
+
+### How to get rid of the patch
+
+Submit patches to pgsql-hackers, to tighten up the WAL-logging around relation extension. It's a bit
+confusing even in PostgreSQL. Maybe WAL-log the intention to extend first, then extend the relation,
+and finally WAL-log that the extension succeeded.
+
+## Make smgr interface available to extensions
+
+```
+ src/backend/storage/smgr/smgr.c | 203 +++---
+ src/include/storage/smgr.h | 72 +-
+```
+
+### How to get rid of the patch
+
+Submit to upstream. This could be useful for the Disk Encryption patches too, or for compression.
+
+
+## Added relpersistence argument to smgropen()
+
+```
+ src/backend/access/heap/heapam_handler.c | 2 +-
+ src/backend/catalog/storage.c | 10 +-
+ src/backend/commands/tablecmds.c | 2 +-
+ src/backend/storage/smgr/md.c | 4 +-
+ src/include/utils/rel.h | 3 +-
+```
+
+Neon needs to treat unlogged relations differently from others, so the smgrread(), smgrwrite() etc.
+implementations need to know the 'relpersistence' of the relation. To get that information where
+it's needed, we added the 'relpersistence' field to smgropen().
+
+### How to get rid of the patch
+
+Maybe 'relpersistence' would be useful in PostgreSQL for debugging purposes? Or simply for the
+benefit of extensions like Neon. We should consider this in the patch to make the smgr API usable by
+extensions.
+
+### Alternatives
+
+Currently in Neon, unlogged tables live on local disk in the compute node, and are wiped away on
+compute node restart. One alternative would be to instead WAL-log even unlogged tables, essentially
+ignoring the UNLOGGED option. Or prohibit UNLOGGED tables completely. But would we still need the
+relpersistence argument to handle index builds? See the item on "Mark index builds that use buffer
+manager without logging explicitly".
+
+## Use smgr and dbsize_hook for size calculations
+
+```
+ src/backend/utils/adt/dbsize.c | 61 +-
+```
+
+In PostgreSQL, the rel and db-size functions scan the data directory directly. That won't work in Neon.
+
+### How to get rid of the patch
+
+Send a patch to PostgreSQL, to use smgr API functions for relation size calculation instead. Maybe as
+part of the general smgr API patch.
+
+
+
+# WAL redo process changes
+
+Pageserver delegates complex WAL decoding duties to Postgres, which means that the latter might fall
+victim to carefully designed malicious WAL records and start doing harmful things to the system. To
+prevent this, the redo functions are executed in a separate process that is sandboxed with Linux
+Secure Computing mode (see seccomp(2) man page).
+
+As an alternative to having a separate WAL redo process, we could rewrite all redo handlers in Rust.
+That is not feasible in practice: it would take a lot of effort to rewrite them and to ensure that
+the rewrite is correct, and once that's done, it would be a lot of ongoing maintenance effort to keep
+the rewritten code in sync over time, across new PostgreSQL versions. That's why we want to
+leverage PostgreSQL code.
+
+Another alternative would be to harden all the PostgreSQL WAL redo functions so that it would be
+safe to call them directly from Rust code, without needing the security sandbox. That's not feasible
+for similar reasons as rewriting them in Rust.
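
Concretely, "Secure Computing mode" means the WAL-redo process installs a syscall allowlist before it
starts processing WAL. The snippet below is a minimal, illustrative libseccomp sketch, not the actual
seccomp code from the patch; the function name and the exact syscall list are placeholders.

```c
/* Minimal illustration of a seccomp-bpf allowlist, similar in spirit to what
 * the WAL-redo process sets up. Not the actual code from the patch. */
#include <seccomp.h>
#include <stdlib.h>

static void
enter_seccomp_mode(void)
{
	/* Any syscall that is not explicitly allowed kills the process. */
	scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_KILL);

	if (ctx == NULL)
		abort();

	/* Allow only what is needed to read WAL records from the parent over a
	 * pipe, apply them to pages in memory, and write the result back. */
	if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(read), 0) != 0 ||
		seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(write), 0) != 0 ||
		seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(exit_group), 0) != 0 ||
		seccomp_load(ctx) != 0)
		abort();

	seccomp_release(ctx);
}
```

With a filter like this in place, a malicious WAL record that tricks a redo function into opening a
file or spawning a process gets the process killed instead of letting the syscall through.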
+
+
+## Don't replay changes in XLogReadBufferForRedo that are not for the target page we're replaying
+
+```
+ src/backend/access/gin/ginxlog.c | 19 +-
+
+Also some changes in xlog.c and xlogutils.c
+
+Example:
+
+@@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record)
+ 	if (!isLeaf)
+ 		ginRedoClearIncompleteSplit(record, 3);
+
+-	if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED)
++	action = XLogReadBufferForRedo(record, 0, &lbuffer);
++	if (action != BLK_RESTORED && action != BLK_DONE)
+ 		elog(ERROR, "GIN split record did not contain a full-page image of left page");
+```
+
+### Problem we're trying to solve
+
+In PostgreSQL, if a WAL redo function calls XLogReadBufferForRedo() for a page that has a full-page
+image, it always succeeds. However, the Neon WAL redo process is only concerned with replaying changes
+to a single page, so replaying any changes for other pages is a waste of cycles. We have modified
+XLogReadBufferForRedo() to return BLK_DONE for all other pages, to avoid the overhead. That is
+unexpected by code like the above.
+
+### How to get rid of the patch
+
+Submit the changes to upstream, hope the community accepts them. There's no harm to PostgreSQL from
+these changes, although they don't have any benefit either.
+
+To make these changes useful to upstream PostgreSQL, we could implement a feature to look ahead in the
+WAL and detect truncated relations. Even in PostgreSQL, it is a waste of cycles to replay changes
+to pages that are later truncated away, so we could have XLogReadBufferForRedo() return BLK_DONE or
+BLK_NOTFOUND for pages that are known to be truncated away later in the WAL stream.
+
+### Alternatives
+
+Maybe we could revert this optimization, and restore pages other than the target page too.
+
+## Add predefined_sysidentifier flag to initdb
+
+```
+ src/backend/bootstrap/bootstrap.c | 13 +-
+ src/bin/initdb/initdb.c | 4 +
+
+And some changes in xlog.c
+```
+
+This is used to help with restoring a database when you have all the WAL, all the way back to
+initdb, but no backup. You can reconstruct the missing backup by running initdb again, with the same
+sysidentifier.
+
+
+### How to get rid of the patch
+
+Ignore it. This is only needed for disaster recovery, so once we've eliminated all other Postgres
+patches, we can just keep it around as a patch or as a separate branch in a repo.
+
+
+# Not currently committed but proposed
+
+## Disable ring buffer buffer manager strategies
+
+### Why?
+
+Postgres tries to avoid cache flushing for bulk operations (copy, seqscan, vacuum, ...):
+even if there is free space in the buffer cache, pages may be evicted.
+The negative effect of this can be partly compensated by the file system cache, but in Neon,
+the cost of requesting a page from the page server is much higher.
+
+### Alternatives?
+
+Instead of just prohibiting the ring buffer, we may try to implement a more flexible eviction policy,
+for example copying an evicted page from the ring buffer to some other buffer if there is free space
+in the buffer cache.
+
+## Disable marking pages as dirty when hint bits are set
+
+### Why?
+
+Postgres has to modify a page twice: first when some tuple is updated, and a second time when
+hint bits are set. WAL-logging hint bit updates requires full-page images (FPIs), which significantly
+increase the size of the WAL.
+
+### Alternatives?
+
+Add a special WAL record for setting page hints.
+
+## Prefetching
+
+### Why?
+ +As far as pages in Neon are loaded on demand, to reduce node startup time +and also speedup some massive queries we need some mechanism for bulk loading to +reduce page request round-trip overhead. + +Currently Postgres is supporting prefetching only for bitmap scan. +In Neon we should also use prefetch for sequential and index scans, because the OS is not doing it for us. +For sequential scan we could prefetch some number of following pages. For index scan we could prefetch pages +of heap relation addressed by TIDs. + +## Prewarming + +### Why? + +Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Zenith. +But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow. +We can capture state of compute node buffer cache and send bulk request for this pages at startup. From bf5333544f1093af22701515600cb47314f95fcf Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 19 Jul 2022 10:57:24 +0300 Subject: [PATCH 0533/1022] Fix missing quotes in GitHub Actions (#2116) --- .github/workflows/build_and_test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 776c696f59..01920643ec 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -440,9 +440,9 @@ jobs: - name: Get legacy build tag run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::latest + echo "::set-output name=tag::latest" elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release + echo "::set-output name=tag::release" else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" exit 1 @@ -502,9 +502,9 @@ jobs: - name: Get legacy build tag run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::latest + echo "::set-output name=tag::latest" elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release + echo "::set-output name=tag::release" else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" exit 1 From df7f6448221c6b7786b8f1f63200f79b604c067e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 19 Jul 2022 11:27:06 +0300 Subject: [PATCH 0534/1022] Move things around in github yml file, for clarity. Also, this avoids building the list of test binaries in release mode. They are not included in the neon.tgz tarball in release mode. --- .github/workflows/build_and_test.yml | 34 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 01920643ec..0186232e3e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -145,24 +145,15 @@ jobs: cov_prefix=() fi + # FIXME: What's this for? + mkdir -p /tmp/neon/etc/ + + # Install target binaries + mkdir -p /tmp/neon/bin/ binaries=$( "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps | jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' ) - - test_exe_paths=$( - "${cov_prefix[@]}" cargo test --message-format=json --no-run | - jq -r '.executable | select(. 
!= null)' - ) - - mkdir -p /tmp/neon/bin/ - mkdir -p /tmp/neon/test_bin/ - mkdir -p /tmp/neon/etc/ - - # Keep bloated coverage data files away from the rest of the artifact - mkdir -p /tmp/coverage/ - - # Install target binaries for bin in $binaries; do SRC=target/$BUILD_TYPE/$bin DST=/tmp/neon/bin/$bin @@ -171,9 +162,14 @@ jobs: # Install test executables and write list of all binaries (for code coverage) if [[ $BUILD_TYPE == "debug" ]]; then - for bin in $binaries; do - echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list - done + # Keep bloated coverage data files away from the rest of the artifact + mkdir -p /tmp/coverage/ + + mkdir -p /tmp/neon/test_bin/ + test_exe_paths=$( + "${cov_prefix[@]}" cargo test --message-format=json --no-run | + jq -r '.executable | select(. != null)' + ) for bin in $test_exe_paths; do SRC=$bin DST=/tmp/neon/test_bin/$(basename $bin) @@ -183,6 +179,10 @@ jobs: strip "$SRC" -o "$DST" echo "$DST" >> /tmp/coverage/binaries.list done + + for bin in $binaries; do + echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list + done fi - name: Prepare neon artifact From 3dce39419794627f0502450dc6e5df7dd282031b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 19 Jul 2022 15:29:51 +0300 Subject: [PATCH 0535/1022] Use the same cargo options for every cargo call. The "cargo metadata" and "cargo test --no-run" are used in the workflow to just list names of the final binaries, but unless the same cargo options like --release or --debug are used in those calls, they will in fact recompile everything. --- .github/workflows/build_and_test.yml | 56 ++++++++++++++-------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0186232e3e..99d483ea4a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -84,6 +84,29 @@ jobs: submodules: true fetch-depth: 1 + # Set some environment variables used by all the steps. + # + # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. + # It also includes --features, if any + # + # CARGO_FEATURES is passed to "cargo metadata". 
It is separate from CARGO_FLAGS, + # because "cargo metadata" doesn't accept --release or --debug options + # + - name: Set env variables + run: | + if [[ $BUILD_TYPE == "debug" ]]; then + cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" + CARGO_FEATURES="" + CARGO_FLAGS="" + elif [[ $BUILD_TYPE == "release" ]]; then + cov_prefix="" + CARGO_FEATURES="--features profiling" + CARGO_FLAGS="--release $CARGO_FEATURES" + fi + echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV + echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV + echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV + - name: Get postgres artifact for restoration uses: actions/download-artifact@v3 with: @@ -115,43 +138,18 @@ jobs: - name: Run cargo build run: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) - CARGO_FLAGS= - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - CARGO_FLAGS="--release --features profiling" - fi - - "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests + ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests - name: Run cargo test run: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) - CARGO_FLAGS= - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - CARGO_FLAGS=--release - fi - - "${cov_prefix[@]}" cargo test $CARGO_FLAGS + ${cov_prefix} cargo test $CARGO_FLAGS - name: Install rust binaries run: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - fi - - # FIXME: What's this for? - mkdir -p /tmp/neon/etc/ - # Install target binaries mkdir -p /tmp/neon/bin/ binaries=$( - "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps | + ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' ) for bin in $binaries; do @@ -167,7 +165,7 @@ jobs: mkdir -p /tmp/neon/test_bin/ test_exe_paths=$( - "${cov_prefix[@]}" cargo test --message-format=json --no-run | + ${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run | jq -r '.executable | select(. 
!= null)' ) for bin in $test_exe_paths; do From 5ff7a7dd8bbebde1fc14990a977fbe834f2ed8e0 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 19 Jul 2022 16:33:33 +0100 Subject: [PATCH 0536/1022] github/workflows: run periodic benchmarks earlier (#2121) --- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index d08c3c50bd..cfd54325eb 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,7 +11,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '36 7 * * *' # run once a day, timezone is utc + - cron: '36 4 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually From 4446791397e88156012e704381e329551b404c60 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 19 Jul 2022 17:40:58 +0100 Subject: [PATCH 0537/1022] github/workflows: pause stress env deployment (#2122) --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 99d483ea4a..95da34dc62 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -555,7 +555,7 @@ jobs: if [[ "$GITHUB_REF_NAME" == "main" ]]; then STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}' NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}' - echo "::set-output name=include::[$STAGING, $NEON_STRESS]" + echo "::set-output name=include::[$STAGING]" elif [[ "$GITHUB_REF_NAME" == "release" ]]; then PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}' echo "::set-output name=include::[$PRODUCTION]" From 71753dd947373f5536d2a2d4dbad3d8cf866fb6e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 19 Jul 2022 16:37:37 +0300 Subject: [PATCH 0538/1022] Remove github CI 'build_postgres' job, merging it with 'build_neon' Simplifies the workflow. Makes the overall build a little faster, as the build_postgres step doesn't need to upload the pg.tgz artifact, and the build_neon step doesn't need to download it again. This effectively reverts commit a490f64a68. That commit changed the workflow so that the Postgres binaries were not included in the neon.tgz artifact. With this commit, the pg.tgz artifact is gone, and the Postgres binaries are part of neon.tgz again. 
--- .../actions/run-python-test-set/action.yml | 15 +--- .github/workflows/build_and_test.yml | 70 +++++-------------- 2 files changed, 17 insertions(+), 68 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index f220be2b12..accb8896de 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -37,12 +37,6 @@ runs: name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact path: ./neon-artifact/ - - name: Get Postgres artifact for restoration - uses: actions/download-artifact@v3 - with: - name: postgres-${{ runner.os }}-${{ inputs.build_type }}-artifact - path: ./pg-artifact/ - - name: Extract Neon artifact shell: bash -ex {0} run: | @@ -50,13 +44,6 @@ runs: tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/ rm -rf ./neon-artifact/ - - name: Extract Postgres artifact - shell: bash -ex {0} - run: | - mkdir -p /tmp/neon/tmp_install - tar -xf ./pg-artifact/pg.tgz -C /tmp/neon/tmp_install - rm -rf ./pg-artifact/ - - name: Checkout if: inputs.needs_postgres_source == 'true' uses: actions/checkout@v3 @@ -78,7 +65,7 @@ runs: - name: Run pytest env: NEON_BIN: /tmp/neon/bin - POSTGRES_DISTRIB_DIR: /tmp/neon/tmp_install + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install TEST_OUTPUT: /tmp/test_output # this variable will be embedded in perf test report # and is needed to distinguish different environments diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 95da34dc62..e20dc08697 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -21,7 +21,7 @@ env: COPT: '-Werror' jobs: - build-postgres: + build-neon: runs-on: [ self-hosted, Linux, k8s-runner ] strategy: fail-fast: false @@ -31,6 +31,7 @@ jobs: env: BUILD_TYPE: ${{ matrix.build_type }} + steps: - name: Checkout uses: actions/checkout@v3 @@ -42,48 +43,6 @@ jobs: id: pg_ver run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres) - - name: Cache postgres build - id: cache_pg - uses: actions/cache@v3 - with: - path: tmp_install/ - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Build postgres - if: steps.cache_pg.outputs.cache-hit != 'true' - run: mold -run make postgres -j$(nproc) - - # actions/cache@v3 does not allow concurrently using the same cache across job steps, so use a separate cache - - name: Prepare postgres artifact - run: tar -C tmp_install/ -czf ./pg.tgz . - - name: Upload postgres artifact - uses: actions/upload-artifact@v3 - with: - retention-days: 7 - if-no-files-found: error - name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact - path: ./pg.tgz - - - build-neon: - runs-on: [ self-hosted, Linux, k8s-runner ] - needs: [ build-postgres ] - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - rust_toolchain: [ 1.58 ] - - env: - BUILD_TYPE: ${{ matrix.build_type }} - - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 1 - # Set some environment variables used by all the steps. # # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. 
@@ -107,17 +66,6 @@ jobs: echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV - - name: Get postgres artifact for restoration - uses: actions/download-artifact@v3 - with: - name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact - path: ./postgres-artifact/ - - name: Extract postgres artifact - run: | - mkdir ./tmp_install/ - tar -xf ./postgres-artifact/pg.tgz -C ./tmp_install/ - rm -rf ./postgres-artifact/ - # Don't include the ~/.cargo/registry/src directory. It contains just # uncompressed versions of the crates in ~/.cargo/registry/cache # directory, and it's faster to let 'cargo' to rebuild it from the @@ -136,6 +84,17 @@ jobs: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + - name: Cache postgres build + id: cache_pg + uses: actions/cache@v3 + with: + path: tmp_install/ + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Build postgres + if: steps.cache_pg.outputs.cache-hit != 'true' + run: mold -run make postgres -j$(nproc) + - name: Run cargo build run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests @@ -183,6 +142,9 @@ jobs: done fi + - name: Install postgres binaries + run: cp -a tmp_install /tmp/neon/pg_install + - name: Prepare neon artifact run: tar -C /tmp/neon/ -czf ./neon.tgz . From 98dd2e4f52731f4d0ebdd75591de056da62e0129 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 19 Jul 2022 18:36:46 +0300 Subject: [PATCH 0539/1022] Use zstd and multiple threads to compress artifact tarball. For faster and better compression. --- .github/actions/run-python-test-set/action.yml | 2 +- .github/workflows/build_and_test.yml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index accb8896de..0d058d47c1 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -41,7 +41,7 @@ runs: shell: bash -ex {0} run: | mkdir -p /tmp/neon/ - tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/ + tar -xf ./neon-artifact/neon.tar.zst -C /tmp/neon/ rm -rf ./neon-artifact/ - name: Checkout diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e20dc08697..3fecb2bf67 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -146,7 +146,7 @@ jobs: run: cp -a tmp_install /tmp/neon/pg_install - name: Prepare neon artifact - run: tar -C /tmp/neon/ -czf ./neon.tgz . + run: ZSTD_NBTHREADS=0 tar -C /tmp/neon/ -cf ./neon.tar.zst --zstd . 
- name: Upload neon binaries uses: actions/upload-artifact@v3 @@ -154,7 +154,7 @@ jobs: retention-days: 7 if-no-files-found: error name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact - path: ./neon.tgz + path: ./neon.tar.zst # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data @@ -279,7 +279,7 @@ jobs: - name: Extract Neon artifact run: | mkdir -p /tmp/neon/ - tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/ + tar -xf ./neon-artifact/neon.tar.zst -C /tmp/neon/ rm -rf ./neon-artifact/ - name: Restore coverage data From 160e52ec7e70213cc0e87843886c1e2204bdf60d Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Tue, 19 Jul 2022 14:56:25 -0400 Subject: [PATCH 0540/1022] Optimize branch creation (#2101) Resolves #2054 **Context**: branch creation needs to wait for GC to acquire `gc_cs` lock, which prevents creating new timelines during GC. However, because individual timeline GC iteration also requires `compaction_cs` lock, branch creation may also need to wait for compactions of multiple timelines. This results in large latency when creating a new branch, which we advertised as *"instantly"*. This PR optimizes the latency of branch creation by separating GC into two phases: 1. Collect GC data (branching points, cutoff LSNs, etc) 2. Perform GC for each timeline The GC bottleneck comes from step 2, which must wait for compaction of multiple timelines. This PR modifies the branch creation and GC functions to allow GC to hold the GC lock only in step 1. As a result, branch creation doesn't need to wait for compaction to finish but only needs to wait for GC data collection step, which is fast. --- .github/workflows/build_and_test.yml | 6 +- .github/workflows/codestyle.yml | 2 +- libs/postgres_ffi/build.rs | 10 +- pageserver/src/layered_repository.rs | 291 +++++++++++------- .../batch_others/test_branch_and_gc.py | 66 ++++ .../performance/test_branch_creation.py | 110 +++++++ 6 files changed, 359 insertions(+), 126 deletions(-) create mode 100644 test_runner/performance/test_branch_creation.py diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3fecb2bf67..5874aa9b5c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -81,8 +81,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- - name: Cache postgres build id: cache_pg @@ -268,7 +268,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + key: v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact for restoration uses: actions/download-artifact@v3 diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 89bfffd4b9..8bcaa8f947 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -101,7 +101,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git target - key: 
${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} + key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} - name: Run cargo clippy run: ./run_clippy.sh diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index c6df4fc0b0..7db2c20e34 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -49,12 +49,12 @@ fn main() { // Finding the location of C headers for the Postgres server: // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/tmp_install` // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/tmp_install/include/postgresql/server` - let mut pg_install_dir: PathBuf; - if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { - pg_install_dir = postgres_install_dir.into(); + let mut pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") + { + postgres_install_dir.into() } else { - pg_install_dir = PathBuf::from("tmp_install") - } + PathBuf::from("tmp_install") + }; if pg_install_dir.is_relative() { let cwd = env::current_dir().unwrap(); diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 6459e802f4..93acce912c 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -281,12 +281,22 @@ impl Repository for LayeredRepository { // concurrently removes data that is needed by the new timeline. let _gc_cs = self.gc_cs.lock().unwrap(); + // In order for the branch creation task to not wait for GC/compaction, + // we need to make sure that the starting LSN of the child branch is not out of scope midway by + // + // 1. holding the GC lock to prevent overwritting timeline's GC data + // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline + // + // Step 2 is to avoid initializing the new branch using data removed by past GC iterations + // or in-queue GC iterations. + let mut timelines = self.timelines.lock().unwrap(); let src_timeline = self .get_timeline_load_internal(src, &mut timelines) // message about timeline being remote is one .context up in the stack .context("failed to load timeline for branching")? .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {}", &src))?; + let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN @@ -296,9 +306,23 @@ impl Repository for LayeredRepository { lsn }); + // Check if the starting LSN is out of scope because it is less than + // 1. the latest GC cutoff LSN or + // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. src_timeline .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) - .context("invalid branch start lsn")?; + .context(format!( + "invalid branch start lsn: less than latest GC cutoff {latest_gc_cutoff_lsn}" + ))?; + { + let gc_info = src_timeline.gc_info.read().unwrap(); + let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); + if start_lsn < cutoff { + bail!(format!( + "invalid branch start lsn: less than planned GC cutoff {cutoff}" + )); + } + } // Determine prev-LSN for the new timeline. We can only determine it if // the timeline was branched at the current end of the source timeline. 
@@ -440,13 +464,7 @@ impl Repository for LayeredRepository { Entry::Vacant(_) => bail!("timeline not found"), }; - // try to acquire gc and compaction locks to prevent errors from missing files - let _gc_guard = self - .gc_cs - .try_lock() - .map_err(|e| anyhow::anyhow!("cannot acquire gc lock {e}"))?; - - let compaction_guard = timeline_entry.get().compaction_guard()?; + let layer_removal_guard = timeline_entry.get().layer_removal_guard()?; let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { @@ -457,7 +475,7 @@ impl Repository for LayeredRepository { })?; info!("detach removed files"); - drop(compaction_guard); + drop(layer_removal_guard); timeline_entry.remove(); Ok(()) @@ -524,10 +542,10 @@ impl LayeredTimelineEntry { } } - fn compaction_guard(&self) -> Result>, anyhow::Error> { + fn layer_removal_guard(&self) -> Result>, anyhow::Error> { match self { LayeredTimelineEntry::Loaded(timeline) => timeline - .compaction_cs + .layer_removal_cs .try_lock() .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) .map(Some), @@ -883,50 +901,50 @@ impl LayeredRepository { let now = Instant::now(); // grab mutex to prevent new timelines from being created here. - let _gc_cs = self.gc_cs.lock().unwrap(); + let gc_cs = self.gc_cs.lock().unwrap(); + + let mut timelines = self.timelines.lock().unwrap(); // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. let mut all_branchpoints: BTreeSet<(ZTimelineId, Lsn)> = BTreeSet::new(); - let mut timeline_ids = Vec::new(); - let mut timelines = self.timelines.lock().unwrap(); + let timeline_ids = { + if let Some(target_timeline_id) = target_timeline_id.as_ref() { + if timelines.get(target_timeline_id).is_none() { + bail!("gc target timeline does not exist") + } + }; - if let Some(target_timeline_id) = target_timeline_id.as_ref() { - if timelines.get(target_timeline_id).is_none() { - bail!("gc target timeline does not exist") - } + timelines + .iter() + .map(|(timeline_id, timeline_entry)| { + // This is unresolved question for now, how to do gc in presence of remote timelines + // especially when this is combined with branching. + // Somewhat related: https://github.com/zenithdb/zenith/issues/999 + if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() { + // If target_timeline is specified, we only need to know branchpoints of its children + if let Some(timelineid) = target_timeline_id { + if ancestor_timeline_id == &timelineid { + all_branchpoints + .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); + } + } + // Collect branchpoints for all timelines + else { + all_branchpoints + .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); + } + } + + *timeline_id + }) + .collect::>() }; - for (timeline_id, timeline_entry) in timelines.iter() { - timeline_ids.push(*timeline_id); - - // This is unresolved question for now, how to do gc in presence of remote timelines - // especially when this is combined with branching. 
- // Somewhat related: https://github.com/neondatabase/neon/issues/999 - if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() { - // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timelineid) = target_timeline_id { - if ancestor_timeline_id == &timelineid { - all_branchpoints - .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); - } - } - // Collect branchpoints for all timelines - else { - all_branchpoints.insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); - } - } - } - // Ok, we now know all the branch points. - // Perform GC for each timeline. - for timeline_id in timeline_ids.into_iter() { - if thread_mgr::is_shutdown_requested() { - // We were requested to shut down. Stop and return with the progress we - // made. - break; - } - + // Update the GC information for each timeline. + let mut gc_timelines = Vec::with_capacity(timeline_ids.len()); + for timeline_id in timeline_ids { // Timeline is known to be local and loaded. let timeline = self .get_timeline_load_internal(timeline_id, &mut *timelines)? @@ -940,7 +958,6 @@ impl LayeredRepository { } if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) { - drop(timelines); let branchpoints: Vec = all_branchpoints .range(( Included((timeline_id, Lsn(0))), @@ -948,21 +965,45 @@ impl LayeredRepository { )) .map(|&x| x.1) .collect(); + timeline.update_gc_info(branchpoints, cutoff, pitr)?; - // If requested, force flush all in-memory layers to disk first, - // so that they too can be garbage collected. That's - // used in tests, so we want as deterministic results as possible. - if checkpoint_before_gc { - timeline.checkpoint(CheckpointConfig::Forced)?; - info!("timeline {} checkpoint_before_gc done", timeline_id); - } - timeline.update_gc_info(branchpoints, cutoff, pitr); - let result = timeline.gc()?; - - totals += result; - timelines = self.timelines.lock().unwrap(); + gc_timelines.push(timeline); } } + drop(timelines); + drop(gc_cs); + + // Perform GC for each timeline. + // + // Note that we don't hold the GC lock here because we don't want + // to delay the branch creation task, which requires the GC lock. + // A timeline GC iteration can be slow because it may need to wait for + // compaction (both require `layer_removal_cs` lock), + // but the GC iteration can run concurrently with branch creation. + // + // See comments in [`LayeredRepository::branch_timeline`] for more information + // about why branch creation task can run concurrently with timeline's GC iteration. + for timeline in gc_timelines { + if thread_mgr::is_shutdown_requested() { + // We were requested to shut down. Stop and return with the progress we + // made. + break; + } + + // If requested, force flush all in-memory layers to disk first, + // so that they too can be garbage collected. That's + // used in tests, so we want as deterministic results as possible. + if checkpoint_before_gc { + timeline.checkpoint(CheckpointConfig::Forced)?; + info!( + "timeline {} checkpoint_before_gc done", + timeline.timeline_id + ); + } + + let result = timeline.gc()?; + totals += result; + } totals.elapsed = now.elapsed(); Ok(totals) @@ -1038,11 +1079,11 @@ pub struct LayeredTimeline { /// Used to ensure that there is only one thread layer_flush_lock: Mutex<()>, - // Prevent concurrent compactions. - // Compactions are normally performed by one thread. But compaction can also be manually - // requested by admin (that's used in tests). 
These forced compactions run in a different - // thread and could be triggered at the same time as a normal, timed compaction. - compaction_cs: Mutex<()>, + /// Layer removal lock. + /// A lock to ensure that no layer of the timeline is removed concurrently by other threads. + /// This lock is acquired in [`LayeredTimeline::gc`], [`LayeredTimeline::compact`], + /// and [`LayeredRepository::delete_timeline`]. + layer_removal_cs: Mutex<()>, // Needed to ensure that we can't create a branch at a point that was already garbage collected latest_gc_cutoff_lsn: RwLock, @@ -1079,12 +1120,14 @@ struct GcInfo { /// last-record LSN /// /// FIXME: is this inclusive or exclusive? - cutoff: Lsn, + horizon_cutoff: Lsn, - /// In addition to 'retain_lsns', keep everything newer than 'SystemTime::now()' - /// minus 'pitr_interval' + /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this + /// point. /// - pitr: Duration, + /// This is calculated by finding a number such that a record is needed for PITR + /// if only if its LSN is larger than 'pitr_cutoff'. + pitr_cutoff: Lsn, } /// Public interface functions @@ -1324,12 +1367,12 @@ impl LayeredTimeline { write_lock: Mutex::new(()), layer_flush_lock: Mutex::new(()), - compaction_cs: Mutex::new(()), + layer_removal_cs: Mutex::new(()), gc_info: RwLock::new(GcInfo { retain_lsns: Vec::new(), - cutoff: Lsn(0), - pitr: Duration::ZERO, + horizon_cutoff: Lsn(0), + pitr_cutoff: Lsn(0), }), latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), @@ -1950,7 +1993,7 @@ impl LayeredTimeline { // Below are functions compact_level0() and create_image_layers() // but they are a bit ad hoc and don't quite work like it's explained // above. Rewrite it. - let _compaction_cs = self.compaction_cs.lock().unwrap(); + let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); let target_file_size = self.get_checkpoint_distance(); @@ -2267,46 +2310,34 @@ impl LayeredTimeline { /// TODO: that's wishful thinking, compaction doesn't actually do that /// currently. /// - /// The caller specifies how much history is needed with the two arguments: + /// The caller specifies how much history is needed with the 3 arguments: /// /// retain_lsns: keep a version of each page at these LSNs - /// cutoff: also keep everything newer than this LSN + /// cutoff_horizon: also keep everything newer than this LSN + /// pitr: the time duration required to keep data for PITR /// /// The 'retain_lsns' list is currently used to prevent removing files that /// are needed by child timelines. In the future, the user might be able to /// name additional points in time to retain. The caller is responsible for /// collecting that information. /// - /// The 'cutoff' point is used to retain recent versions that might still be + /// The 'cutoff_horizon' point is used to retain recent versions that might still be /// needed by read-only nodes. (As of this writing, the caller just passes /// the latest LSN subtracted by a constant, and doesn't do anything smart /// to figure out what read-only nodes might actually need.) /// - fn update_gc_info(&self, retain_lsns: Vec, cutoff: Lsn, pitr: Duration) { + /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine + /// whether a record is needed for PITR. 
+ fn update_gc_info( + &self, + retain_lsns: Vec, + cutoff_horizon: Lsn, + pitr: Duration, + ) -> Result<()> { let mut gc_info = self.gc_info.write().unwrap(); + + gc_info.horizon_cutoff = cutoff_horizon; gc_info.retain_lsns = retain_lsns; - gc_info.cutoff = cutoff; - gc_info.pitr = pitr; - } - - /// - /// Garbage collect layer files on a timeline that are no longer needed. - /// - /// Currently, we don't make any attempt at removing unneeded page versions - /// within a layer file. We can only remove the whole file if it's fully - /// obsolete. - /// - fn gc(&self) -> Result { - let now = SystemTime::now(); - let mut result: GcResult = Default::default(); - let disk_consistent_lsn = self.get_disk_consistent_lsn(); - - let _compaction_cs = self.compaction_cs.lock().unwrap(); - - let gc_info = self.gc_info.read().unwrap(); - let retain_lsns = &gc_info.retain_lsns; - let cutoff = min(gc_info.cutoff, disk_consistent_lsn); - let pitr = gc_info.pitr; // Calculate pitr cutoff point. // If we cannot determine a cutoff LSN, be conservative and don't GC anything. @@ -2315,6 +2346,7 @@ impl LayeredTimeline { if let Ok(timeline) = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) { + let now = SystemTime::now(); // First, calculate pitr_cutoff_timestamp and then convert it to LSN. // If we don't have enough data to convert to LSN, // play safe and don't remove any layers. @@ -2325,7 +2357,7 @@ impl LayeredTimeline { LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, LsnForTimestamp::Future(lsn) => { debug!("future({})", lsn); - pitr_cutoff_lsn = cutoff; + pitr_cutoff_lsn = gc_info.horizon_cutoff; } LsnForTimestamp::Past(lsn) => { debug!("past({})", lsn); @@ -2339,22 +2371,47 @@ impl LayeredTimeline { } else if cfg!(test) { // We don't have local timeline in mocked cargo tests. // So, just ignore pitr_interval setting in this case. - pitr_cutoff_lsn = cutoff; + pitr_cutoff_lsn = gc_info.horizon_cutoff; } + gc_info.pitr_cutoff = pitr_cutoff_lsn; - let new_gc_cutoff = Lsn::min(cutoff, pitr_cutoff_lsn); + Ok(()) + } + + /// + /// Garbage collect layer files on a timeline that are no longer needed. + /// + /// Currently, we don't make any attempt at removing unneeded page versions + /// within a layer file. We can only remove the whole file if it's fully + /// obsolete. + /// + fn gc(&self) -> Result { + let mut result: GcResult = Default::default(); + let now = SystemTime::now(); + + fail_point!("before-timeline-gc"); + + let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); + + let gc_info = self.gc_info.read().unwrap(); + + let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); + let pitr_cutoff = gc_info.pitr_cutoff; + let retain_lsns = &gc_info.retain_lsns; + + let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); // Nothing to GC. Return early. - if *self.get_latest_gc_cutoff_lsn() >= new_gc_cutoff { + let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); + if latest_gc_cutoff >= new_gc_cutoff { info!( - "Nothing to GC for timeline {}. 
cutoff_lsn {}", - self.timeline_id, new_gc_cutoff + "Nothing to GC for timeline {}: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", + self.timeline_id ); - result.elapsed = now.elapsed()?; return Ok(result); } - let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %cutoff).entered(); + let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered(); // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. // See branch_timeline() for details. @@ -2388,23 +2445,23 @@ impl LayeredTimeline { result.layers_total += 1; - // 1. Is it newer than cutoff point? - if l.get_lsn_range().end > cutoff { + // 1. Is it newer than GC horizon cutoff point? + if l.get_lsn_range().end > horizon_cutoff { debug!( - "keeping {} because it's newer than cutoff {}", + "keeping {} because it's newer than horizon_cutoff {}", l.filename().display(), - cutoff + horizon_cutoff ); result.layers_needed_by_cutoff += 1; continue 'outer; } // 2. It is newer than PiTR cutoff point? - if l.get_lsn_range().end > pitr_cutoff_lsn { + if l.get_lsn_range().end > pitr_cutoff { debug!( - "keeping {} because it's newer than pitr_cutoff_lsn {}", + "keeping {} because it's newer than pitr_cutoff {}", l.filename().display(), - pitr_cutoff_lsn + pitr_cutoff ); result.layers_needed_by_pitr += 1; continue 'outer; @@ -2823,7 +2880,7 @@ pub mod tests { let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO); + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; tline.gc()?; @@ -2893,7 +2950,7 @@ pub mod tests { // Perform a cycle of checkpoint, compaction, and GC println!("checkpointing {}", lsn); let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO); + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; tline.gc()?; @@ -2970,7 +3027,7 @@ pub mod tests { // Perform a cycle of checkpoint, compaction, and GC println!("checkpointing {}", lsn); let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO); + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; tline.gc()?; diff --git a/test_runner/batch_others/test_branch_and_gc.py b/test_runner/batch_others/test_branch_and_gc.py index a6210b9176..7157386ce2 100644 --- a/test_runner/batch_others/test_branch_and_gc.py +++ b/test_runner/batch_others/test_branch_and_gc.py @@ -1,3 +1,5 @@ +import threading +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.utils import lsn_from_hex @@ -99,3 +101,67 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): branch_cur.execute('SELECT count(*) FROM foo') assert branch_cur.fetchone() == (200000, ) + + +# This test simulates a race condition happening when branch creation and GC are performed concurrently. +# +# Suppose we want to create a new timeline 't' from a source timeline 's' starting +# from a lsn 'lsn'. Upon creating 't', if we don't hold the GC lock and compare 'lsn' with +# the latest GC information carefully, it's possible for GC to accidentally remove data +# needed by the new timeline. 
+# +# In this test, GC is requested before the branch creation but is delayed to happen after branch creation. +# As a result, when doing GC for the source timeline, we don't have any information about +# the upcoming new branches, so it's possible to remove data that may be needed by the new branches. +# It's the branch creation task's job to make sure the starting 'lsn' is not out of scope +# and prevent creating branches with invalid starting LSNs. +# +# For more details, see discussion in https://github.com/neondatabase/neon/pull/2101#issuecomment-1185273447. +def test_branch_creation_before_gc(neon_simple_env: NeonEnv): + env = neon_simple_env + # Disable background GC but set the `pitr_interval` to be small, so GC can delete something + tenant, _ = env.neon_cli.create_tenant( + conf={ + # disable background GC + 'gc_period': '10 m', + 'gc_horizon': f'{10 * 1024 ** 3}', + + # small checkpoint distance to create more delta layer files + 'checkpoint_distance': f'{1024 ** 2}', + + # set the target size to be large to allow the image layer to cover the whole key space + 'compaction_target_size': f'{1024 ** 3}', + + # tweak the default settings to allow quickly create image layers and L1 layers + 'compaction_period': '1 s', + 'compaction_threshold': '2', + 'image_creation_threshold': '1', + + # set PITR interval to be small, so we can do GC + 'pitr_interval': '1 s' + }) + + b0 = env.neon_cli.create_branch('b0', tenant_id=tenant) + pg0 = env.postgres.create_start('b0', tenant_id=tenant) + res = pg0.safe_psql_many(queries=[ + "CREATE TABLE t(key serial primary key)", + "INSERT INTO t SELECT FROM generate_series(1, 100000)", + "SELECT pg_current_wal_insert_lsn()", + "INSERT INTO t SELECT FROM generate_series(1, 100000)", + ]) + lsn = res[2][0][0] + + # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the + # branch creation task but the individual timeline GC iteration happens *after* + # the branch creation task. + env.pageserver.safe_psql(f"failpoints before-timeline-gc=sleep(2000)") + + def do_gc(): + env.pageserver.safe_psql(f"do_gc {tenant.hex} {b0.hex} 0") + + thread = threading.Thread(target=do_gc, daemon=True) + thread.start() + + # The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC. 
+ with pytest.raises(Exception, match="invalid branch start lsn"): + env.neon_cli.create_branch('b1', 'b0', tenant_id=tenant, ancestor_start_lsn=lsn) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py new file mode 100644 index 0000000000..1d39b0830d --- /dev/null +++ b/test_runner/performance/test_branch_creation.py @@ -0,0 +1,110 @@ +import random +import time +import statistics +import threading +import timeit +import pytest +from typing import List +from fixtures.benchmark_fixture import MetricReport +from fixtures.compare_fixtures import NeonCompare +from fixtures.log_helper import log + + +def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]): + neon_compare.zenbenchmark.record("branch_creation_duration_max", + max(durs), + 's', + MetricReport.LOWER_IS_BETTER) + neon_compare.zenbenchmark.record("branch_creation_duration_avg", + statistics.mean(durs), + 's', + MetricReport.LOWER_IS_BETTER) + neon_compare.zenbenchmark.record("branch_creation_duration_stdev", + statistics.stdev(durs), + 's', + MetricReport.LOWER_IS_BETTER) + + +@pytest.mark.parametrize("n_branches", [20]) +# Test measures the latency of branch creation during a heavy [1] workload. +# +# [1]: to simulate a heavy workload, the test tweaks the GC and compaction settings +# to increase the task's frequency. The test runs `pgbench` in each new branch. +# Each branch is created from a randomly picked source branch. +def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int): + env = neon_compare.env + pg_bin = neon_compare.pg_bin + + # Use aggressive GC and checkpoint settings, so GC and compaction happen more often during the test + tenant, _ = env.neon_cli.create_tenant( + conf={ + 'gc_period': '5 s', + 'gc_horizon': f'{4 * 1024 ** 2}', + 'checkpoint_distance': f'{2 * 1024 ** 2}', + 'compaction_target_size': f'{1024 ** 2}', + 'compaction_threshold': '2', + # set PITR interval to be small, so we can do GC + 'pitr_interval': '5 s' + }) + + def run_pgbench(branch: str): + log.info(f"Start a pgbench workload on branch {branch}") + + pg = env.postgres.create_start(branch, tenant_id=tenant) + connstr = pg.connstr() + + pg_bin.run_capture(['pgbench', '-i', connstr]) + pg_bin.run_capture(['pgbench', '-c10', '-T10', connstr]) + + pg.stop() + + env.neon_cli.create_branch('b0', tenant_id=tenant) + + threads: List[threading.Thread] = [] + threads.append(threading.Thread(target=run_pgbench, args=('b0', ), daemon=True)) + threads[-1].start() + + branch_creation_durations = [] + for i in range(n_branches): + time.sleep(1.0) + + # random a source branch + p = random.randint(0, i) + + timer = timeit.default_timer() + env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(p), tenant_id=tenant) + dur = timeit.default_timer() - timer + + log.info(f"Creating branch b{i+1} took {dur}s") + branch_creation_durations.append(dur) + + threads.append(threading.Thread(target=run_pgbench, args=(f'b{i+1}', ), daemon=True)) + threads[-1].start() + + for thread in threads: + thread.join() + + _record_branch_creation_durations(neon_compare, branch_creation_durations) + + +@pytest.mark.parametrize("n_branches", [1024]) +# Test measures the latency of branch creation when creating a lot of branches. 
+def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int):
+    env = neon_compare.env
+
+    env.neon_cli.create_branch('b0')
+
+    pg = env.postgres.create_start('b0')
+    neon_compare.pg_bin.run_capture(['pgbench', '-i', '-s10', pg.connstr()])
+
+    branch_creation_durations = []
+
+    for i in range(n_branches):
+        # pick a random source branch
+        p = random.randint(0, i)
+        timer = timeit.default_timer()
+        env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(p))
+        dur = timeit.default_timer() - timer
+        branch_creation_durations.append(dur)
+
+    _record_branch_creation_durations(neon_compare, branch_creation_durations)

From abff15dd7c2a64ae15d06679080653aa056a3269 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Wed, 20 Jul 2022 15:04:24 +0300
Subject: [PATCH 0541/1022] Fix test to be more robust with slow pageserver.

If the WAL arrives at the pageserver slowly, it's possible that the
branch is created before all the data on the parent branch have
arrived. That results in a failure:

test_runner/batch_others/test_tenant_relocation.py:259: in test_tenant_relocation
    timeline_id_second, current_lsn_second = populate_branch(pg_second, create_table=False, expected_sum=1001000)
test_runner/batch_others/test_tenant_relocation.py:133: in populate_branch
    assert cur.fetchone() == (expected_sum, )
E   assert (500500,) == (1001000,)
E     At index 0 diff: 500500 != 1001000
E     Full diff:
E     - (1001000,)
E     + (500500,)

To fix, specify the LSN to branch at, so that the pageserver will wait
for it to arrive.

See https://github.com/neondatabase/neon/issues/2063
---
 test_runner/batch_others/test_tenant_relocation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py
index 73f6f52e72..d59f28bcc5 100644
--- a/test_runner/batch_others/test_tenant_relocation.py
+++ b/test_runner/batch_others/test_tenant_relocation.py
@@ -26,7 +26,7 @@ from fixtures.neon_fixtures import (
     wait_for_upload,
     wait_until,
 )
-from fixtures.utils import lsn_from_hex, subprocess_capture
+from fixtures.utils import lsn_from_hex, lsn_to_hex, subprocess_capture
 
 
 def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
@@ -268,6 +268,7 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
     env.neon_cli.create_branch(
         new_branch_name="test_tenant_relocation_second",
         ancestor_branch_name="test_tenant_relocation_main",
+        ancestor_start_lsn=lsn_to_hex(current_lsn_main),
         tenant_id=tenant_id,
     )
     pg_second = env.postgres.create_start(branch_name='test_tenant_relocation_second',

From b4c74c0ecd9776d91b973fe00bb647de7f227727 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Wed, 20 Jul 2022 12:12:02 +0300
Subject: [PATCH 0542/1022] Clean up unnecessary dependencies.

Just to be tidy.
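To make the previous patch's fix concrete: the branch is pinned to an LSN captured from the
parent compute before the branch is created, so the pageserver waits for the parent's WAL to
reach that point. A minimal sketch; `lsn_from_hex`/`lsn_to_hex` and the `create_branch`
arguments are as in the diff above, while `pg_main`, `current_lsn_main` and the `safe_psql`
call are illustrative assumptions:

```
# capture the parent's current WAL insert position on the primary compute
current_lsn_main = lsn_from_hex(pg_main.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0])

# pin the new branch to that LSN; the pageserver waits until the WAL has arrived
env.neon_cli.create_branch(
    new_branch_name="test_tenant_relocation_second",
    ancestor_branch_name="test_tenant_relocation_main",
    ancestor_start_lsn=lsn_to_hex(current_lsn_main),
    tenant_id=tenant_id,
)
```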
--- Cargo.lock | 8 -------- compute_tools/Cargo.toml | 1 - control_plane/Cargo.toml | 1 - libs/metrics/Cargo.toml | 1 - neon_local/Cargo.toml | 1 - pageserver/Cargo.toml | 1 - pageserver/src/walreceiver/walreceiver_connection.rs | 2 +- safekeeper/Cargo.toml | 3 --- 8 files changed, 1 insertion(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4f453678e6..5031ae02e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -467,7 +467,6 @@ dependencies = [ "clap 3.2.12", "env_logger", "hyper", - "libc", "log", "postgres", "regex", @@ -517,7 +516,6 @@ dependencies = [ "tar", "thiserror", "toml", - "url", "utils", "workspace_hack", ] @@ -1604,7 +1602,6 @@ version = "0.1.0" dependencies = [ "lazy_static", "libc", - "once_cell", "prometheus", "workspace_hack", ] @@ -1677,7 +1674,6 @@ dependencies = [ "git-version", "pageserver", "postgres", - "postgres_ffi", "safekeeper", "serde_json", "utils", @@ -1905,7 +1901,6 @@ dependencies = [ "thiserror", "tokio", "tokio-postgres", - "tokio-stream", "toml_edit", "tracing", "url", @@ -2764,7 +2759,6 @@ dependencies = [ "daemonize", "etcd_broker", "fs2", - "futures", "git-version", "hex", "humantime", @@ -2784,12 +2778,10 @@ dependencies = [ "tempfile", "tokio", "tokio-postgres", - "tokio-util", "toml_edit", "tracing", "url", "utils", - "walkdir", "workspace_hack", ] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 1022438c2e..78b85d0e79 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" edition = "2021" [dependencies] -libc = "0.2" anyhow = "1.0" chrono = "0.4" clap = "3.0" diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 21311eea9a..26bb577636 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -14,7 +14,6 @@ regex = "1" anyhow = "1.0" thiserror = "1" nix = "0.23" -url = "2.2.2" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } pageserver = { path = "../pageserver" } diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index 8ff5d1d421..2879dfed81 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -7,5 +7,4 @@ edition = "2021" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency libc = "0.2" lazy_static = "1.4" -once_cell = "1.8.0" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/neon_local/Cargo.toml b/neon_local/Cargo.toml index 8ebd7d5c17..2fc38cfe02 100644 --- a/neon_local/Cargo.toml +++ b/neon_local/Cargo.toml @@ -15,6 +15,5 @@ git-version = "0.3.5" pageserver = { path = "../pageserver" } control_plane = { path = "../control_plane" } safekeeper = { path = "../safekeeper" } -postgres_ffi = { path = "../libs/postgres_ffi" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index b7d97a67c0..215fa151a0 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -29,7 +29,6 @@ postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -tokio-stream = "0.1.8" anyhow = { version = 
"1.0", features = ["backtrace"] } crc32c = "0.6.0" thiserror = "1.0" diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 98b36dfe48..0c8c0ae2f6 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -9,12 +9,12 @@ use std::{ use anyhow::{bail, ensure, Context}; use bytes::BytesMut; use fail::fail_point; +use futures::StreamExt; use postgres::{SimpleQueryMessage, SimpleQueryRow}; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; -use tokio_stream::StreamExt; use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 373108c61b..f6ae9e75d7 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -20,7 +20,6 @@ postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8 anyhow = "1.0" crc32c = "0.6.0" humantime = "2.1.0" -walkdir = "2" url = "2.2.2" signal-hook = "0.3.10" serde = { version = "1.0", features = ["derive"] } @@ -28,11 +27,9 @@ serde_with = "1.12.0" hex = "0.4.3" const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -tokio-util = { version = "0.7", features = ["io"] } git-version = "0.3.5" async-trait = "0.1" once_cell = "1.10.0" -futures = "0.3.13" toml_edit = { version = "0.13", features = ["easy"] } postgres_ffi = { path = "../libs/postgres_ffi" } From f4233fde398172d8734dfae036b71367e274c0fd Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 20 Jul 2022 15:19:46 +0300 Subject: [PATCH 0543/1022] Silence "Module already imported" warning in python tests We were getting a warning like this from the pg_regress tests: =================== warnings summary =================== /usr/lib/python3/dist-packages/_pytest/config/__init__.py:663 /usr/lib/python3/dist-packages/_pytest/config/__init__.py:663: PytestAssertRewriteWarning: Module already imported so cannot be rewritten: fixtures.pg_stats self.import_plugin(import_spec) -- Docs: https://docs.pytest.org/en/stable/warnings.html ------------------ Benchmark results ------------------- To fix, reorder the imports in conftest.py. I'm not sure what exactly the problem was or why the order matters, but the warning is gone and that's good enough for me. 
--- test_runner/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index c6e6289a5c..51545d0217 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,5 +1,5 @@ pytest_plugins = ("fixtures.neon_fixtures", "fixtures.benchmark_fixture", + "fixtures.pg_stats", "fixtures.compare_fixtures", - "fixtures.slow", - "fixtures.pg_stats") + "fixtures.slow") From cc680dd81c4d7be96916811fc4de1f859703a0b9 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 20 Jul 2022 15:06:38 +0300 Subject: [PATCH 0544/1022] Explicitly enable cachepot in Docker builds only --- Dockerfile | 4 ++++ Dockerfile.compute-tools | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/Dockerfile b/Dockerfile index ad85638af3..6f017ac5d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,6 +17,10 @@ RUN set -e \ FROM neondatabase/rust:1.58 AS build ARG GIT_VERSION=local +# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. +# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. +# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. +ARG RUSTC_WRAPPER=cachepot ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 71770ae9ed..87b73e139c 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -2,6 +2,10 @@ # NB: keep in sync with rust image version in .circle/config.yml FROM neondatabase/rust:1.58 AS rust-build +# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. +# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. +# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. +ARG RUSTC_WRAPPER=cachepot ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID ARG AWS_SECRET_ACCESS_KEY From b445cf76658808dfbb1c440e663fb8a5b321d7aa Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 20 Jul 2022 22:13:05 +0300 Subject: [PATCH 0545/1022] Refactor test_unavailability (#2134) Now test_unavailability uses async instead of Process. The test is refactored to fix a possible race condition. 
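In outline, the new version issues the INSERT as an asyncio task, checks that it has not
completed while the safekeeper is down, and only then restarts the safekeeper and awaits the
task. A minimal sketch (the helper name is illustrative; the real helper in the diff below also
parametrizes the key and the delay; `sk` is a Safekeeper fixture, `conn` an asyncpg connection):

```
import asyncio

async def insert_blocks_while_majority_down(sk, conn):
    sk.stop()  # stop one of two safekeepers, i.e. lose the majority
    task = asyncio.create_task(conn.execute("INSERT INTO t values (2, 'payload')"))
    await asyncio.sleep(2)
    assert not task.done()  # the commit is frozen, not failed
    sk.start()  # restore the majority
    await task  # the insert now completes
```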
--- test_runner/batch_others/test_wal_acceptor.py | 55 ------------------- .../batch_others/test_wal_acceptor_async.py | 52 ++++++++++++++++++ 2 files changed, 52 insertions(+), 55 deletions(-) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 9b876f780d..5014a7ad4e 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -203,61 +203,6 @@ def test_restarts(neon_env_builder: NeonEnvBuilder): assert cur.fetchone() == (500500, ) -start_delay_sec = 2 - - -def delayed_safekeeper_start(wa): - time.sleep(start_delay_sec) - wa.start() - - -# When majority of acceptors is offline, commits are expected to be frozen -def test_unavailability(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 2 - env = neon_env_builder.init_start() - - env.neon_cli.create_branch('test_safekeepers_unavailability') - pg = env.postgres.create_start('test_safekeepers_unavailability') - - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - pg_conn = pg.connect() - cur = pg_conn.cursor() - - # check basic work with table - cur.execute('CREATE TABLE t(key int primary key, value text)') - cur.execute("INSERT INTO t values (1, 'payload')") - - # shutdown one of two acceptors, that is, majority - env.safekeepers[0].stop() - - proc = Process(target=delayed_safekeeper_start, args=(env.safekeepers[0], )) - proc.start() - - start = time.time() - cur.execute("INSERT INTO t values (2, 'payload')") - # ensure that the query above was hanging while acceptor was down - assert (time.time() - start) >= start_delay_sec - proc.join() - - # for the world's balance, do the same with second acceptor - env.safekeepers[1].stop() - - proc = Process(target=delayed_safekeeper_start, args=(env.safekeepers[1], )) - proc.start() - - start = time.time() - cur.execute("INSERT INTO t values (3, 'payload')") - # ensure that the query above was hanging while acceptor was down - assert (time.time() - start) >= start_delay_sec - proc.join() - - cur.execute("INSERT INTO t values (4, 'payload')") - - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (10, ) - - # shut down random subset of acceptors, sleep, wake them up, rinse, repeat def xmas_garland(acceptors, stop): while not bool(stop.value): diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index d74ef8840a..9577c0980e 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -404,3 +404,55 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder): env.neon_cli.create_branch('test_concurrent_computes') asyncio.run(run_concurrent_computes(env)) + + +# Stop safekeeper and check that query cannot be executed while safekeeper is down. +# Query will insert a single row into a table. 
+async def check_unavailability(sk: Safekeeper,
+                               conn: asyncpg.Connection,
+                               key: int,
+                               start_delay_sec: int = 2):
+    # shut down one of the two acceptors, that is, the majority
+    sk.stop()
+
+    bg_query = asyncio.create_task(conn.execute(f"INSERT INTO t values ({key}, 'payload')"))
+
+    await asyncio.sleep(start_delay_sec)
+    # ensure that the query has not been executed yet
+    assert not bg_query.done()
+
+    # start safekeeper and await the query
+    sk.start()
+    await bg_query
+    assert bg_query.done()
+
+
+async def run_unavailability(env: NeonEnv, pg: Postgres):
+    conn = await pg.connect_async()
+
+    # check basic work with table
+    await conn.execute('CREATE TABLE t(key int primary key, value text)')
+    await conn.execute("INSERT INTO t values (1, 'payload')")
+
+    # stop safekeeper and check that query cannot be executed while safekeeper is down
+    await check_unavailability(env.safekeepers[0], conn, 2)
+
+    # for the world's balance, do the same with the second safekeeper
+    await check_unavailability(env.safekeepers[1], conn, 3)
+
+    # check that we can execute queries after restart
+    await conn.execute("INSERT INTO t values (4, 'payload')")
+
+    result_sum = await conn.fetchval('SELECT sum(key) FROM t')
+    assert result_sum == 10
+
+
+# When majority of acceptors is offline, commits are expected to be frozen
+def test_unavailability(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 2
+    env = neon_env_builder.init_start()
+
+    env.neon_cli.create_branch('test_safekeepers_unavailability')
+    pg = env.postgres.create_start('test_safekeepers_unavailability')
+
+    asyncio.run(run_unavailability(env, pg))

From 572ae743883df19f5ca9f32d7cdce7a7ca5cca4f Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Thu, 21 Jul 2022 07:45:11 +0300
Subject: [PATCH 0546/1022] More precisely control size of inmem layer (#1927)

* More precisely control size of inmem layer

* Force recompaction of L0 layers if they contain large non-WAL-logged BLOBs, to avoid too-large layers

* Add a modified version of the test_hot_update test (test_dup_key.py) which should generate large layers without a large number of tables

* Change test name in test_dup_key

* Add Layer::get_max_key_range function

* Add Layer::key_iter method and implement a new approach to splitting layers during compaction based on the total size of all key values

* Add test_large_schema test for checking layer file size after compaction

* Make clippy happy

* Restore checking LSN distance threshold for checkpoint in-memory layer

* Optimize storage keys iterator

* Update pageserver/src/layered_repository.rs

Co-authored-by: Heikki Linnakangas

* Update pageserver/src/layered_repository.rs

Co-authored-by: Heikki Linnakangas

* Update pageserver/src/layered_repository.rs

Co-authored-by: Heikki Linnakangas

* Update pageserver/src/layered_repository.rs

Co-authored-by: Heikki Linnakangas

* Update pageserver/src/layered_repository.rs

Co-authored-by: Heikki Linnakangas

* Fix code style

* Reduce number of tables in test_large_schema to make it fit in the timeout with a debug build

* Fix style of test_large_schema.py

* Fix handling of duplicate layers

Co-authored-by: Heikki Linnakangas
---
 pageserver/src/layered_repository.rs          | 176 ++++++++++++++----
 .../src/layered_repository/delta_layer.rs     |  84 +++++++++
 .../src/layered_repository/ephemeral_file.rs  |   2 +-
 .../src/layered_repository/inmemory_layer.rs  |   8 +
 .../src/layered_repository/storage_layer.rs   |   6 +
 test_runner/batch_others/test_large_schema.py |  82 ++++++++
 test_runner/performance/test_dup_key.py       |  48 +++++
 7 files changed, 372
insertions(+), 34 deletions(-) create mode 100644 test_runner/batch_others/test_large_schema.py create mode 100644 test_runner/performance/test_dup_key.py diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 93acce912c..3830e4c1bd 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1734,30 +1734,43 @@ impl LayeredTimeline { /// pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { let last_lsn = self.get_last_record_lsn(); + let layers = self.layers.read().unwrap(); + if let Some(open_layer) = &layers.open_layer { + let open_layer_size = open_layer.size()?; + drop(layers); + let distance = last_lsn.widening_sub(self.last_freeze_at.load()); + // Checkpointing the open layer can be triggered by layer size or LSN range. + // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and + // we want to stay below that with a big margin. The LSN distance determines how + // much WAL the safekeepers need to store. + if distance >= self.get_checkpoint_distance().into() + || open_layer_size > self.get_checkpoint_distance() + { + info!( + "check_checkpoint_distance {}, layer size {}", + distance, open_layer_size + ); - // Has more than 'checkpoint_distance' of WAL been accumulated? - let distance = last_lsn.widening_sub(self.last_freeze_at.load()); - if distance >= self.get_checkpoint_distance().into() { - // Yes. Freeze the current in-memory layer. - self.freeze_inmem_layer(true); - self.last_freeze_at.store(last_lsn); + self.freeze_inmem_layer(true); + self.last_freeze_at.store(last_lsn); - // Launch a thread to flush the frozen layer to disk, unless - // a thread was already running. (If the thread was running - // at the time that we froze the layer, it must've seen the - // the layer we just froze before it exited; see comments - // in flush_frozen_layers()) - if let Ok(guard) = self.layer_flush_lock.try_lock() { - drop(guard); - let self_clone = Arc::clone(self); - thread_mgr::spawn( - thread_mgr::ThreadKind::LayerFlushThread, - Some(self.tenant_id), - Some(self.timeline_id), - "layer flush thread", - false, - move || self_clone.flush_frozen_layers(false), - )?; + // Launch a thread to flush the frozen layer to disk, unless + // a thread was already running. (If the thread was running + // at the time that we froze the layer, it must've seen the + // the layer we just froze before it exited; see comments + // in flush_frozen_layers()) + if let Ok(guard) = self.layer_flush_lock.try_lock() { + drop(guard); + let self_clone = Arc::clone(self); + thread_mgr::spawn( + thread_mgr::ThreadKind::LayerFlushThread, + Some(self.tenant_id), + Some(self.timeline_id), + "layer flush thread", + false, + move || self_clone.flush_frozen_layers(false), + )?; + } } } Ok(()) @@ -2211,9 +2224,59 @@ impl LayeredTimeline { } }); + // This iterator walks through all keys and is needed to calculate size used by each key + let mut all_keys_iter = deltas_to_compact + .iter() + .map(|l| l.key_iter()) + .kmerge_by(|a, b| { + let (a_key, a_lsn, _) = a; + let (b_key, b_lsn, _) = b; + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + }); + // Merge the contents of all the input delta layers into a new set // of delta layers, based on the current partitioning. // + // We split the new delta layers on the key dimension. 
We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. + // It's possible that there is a single key with so many page versions that storing all of them in a single layer file + // would be too large. In that case, we also split on the LSN dimension. + // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key // TODO: this actually divides the layers into fixed-size chunks, not // based on the partitioning. // @@ -2222,29 +2285,76 @@ impl LayeredTimeline { let mut new_layers = Vec::new(); let mut prev_key: Option = None; let mut writer: Option = None; + let mut key_values_total_size = 0u64; + let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key + let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key for x in all_values_iter { let (key, lsn, value) = x?; - - if let Some(prev_key) = prev_key { - if key != prev_key && writer.is_some() { - let size = writer.as_mut().unwrap().size(); - if size > target_file_size { - new_layers.push(writer.take().unwrap().finish(prev_key.next())?); + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + // We need to check key boundaries once we reach next key or end of layer with the same key + if !same_key || lsn == dup_end_lsn { + let mut next_key_size = 0u64; + let is_dup_layer = dup_end_lsn.is_valid(); + dup_start_lsn = Lsn::INVALID; + if !same_key { + dup_end_lsn = Lsn::INVALID; + } + // Determine size occupied by this key. 
We stop at next key, or when size becomes larger than target_file_size + for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { + next_key_size = next_size; + if key != next_key { + if dup_end_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + break; + } + key_values_total_size += next_size; + if key_values_total_size > target_file_size { + // split key between multiple layers: such layer can contain only single key + dup_start_lsn = if dup_end_lsn.is_valid() { + dup_end_lsn + } else { + lsn + }; + dup_end_lsn = next_lsn; + break; + } + } + // handle case when loop reaches last key + if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + if writer.is_some() { + let written_size = writer.as_mut().unwrap().size(); + // check if key cause layer overflow + if is_dup_layer + || dup_end_lsn.is_valid() + || written_size + key_values_total_size > target_file_size + { + new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?); writer = None; } } + key_values_total_size = next_key_size; } - if writer.is_none() { writer = Some(DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_id, key, - lsn_range.clone(), + if dup_end_lsn.is_valid() { + // this is a layer containing slice of values of the same key + debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); + dup_start_lsn..dup_end_lsn + } else { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + lsn_range.clone() + }, )?); } - writer.as_mut().unwrap().put_value(key, lsn, value)?; prev_key = Some(key); } @@ -2276,12 +2386,12 @@ impl LayeredTimeline { // Now that we have reshuffled the data to set of new delta layers, we can // delete the old ones let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len()); - for l in deltas_to_compact { + for l in &deltas_to_compact { l.delete()?; if let Some(path) = l.local_path() { layer_paths_do_delete.insert(path); } - layers.remove_historic(l); + layers.remove_historic(l.clone()); } drop(layers); diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index ed342c0cca..d622df531a 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -316,6 +316,18 @@ impl Layer for DeltaLayer { } } + fn key_iter<'a>(&'a self) -> Box + 'a> { + let inner = match self.load() { + Ok(inner) => inner, + Err(e) => panic!("Failed to load a delta layer: {e:?}"), + }; + + match DeltaKeyIter::new(inner) { + Ok(iter) => Box::new(iter), + Err(e) => panic!("Layer index is corrupted: {e:?}"), + } + } + fn delete(&self) -> Result<()> { // delete underlying file fs::remove_file(self.path())?; @@ -822,3 +834,75 @@ impl<'a> DeltaValueIter<'a> { } } } +/// +/// Iterator over all keys stored in a delta layer +/// +/// FIXME: This creates a Vector to hold all keys. +/// That takes up quite a lot of memory. Should do this in a more streaming +/// fashion. 
+/// +struct DeltaKeyIter { + all_keys: Vec<(DeltaKey, u64)>, + next_idx: usize, +} + +impl Iterator for DeltaKeyIter { + type Item = (Key, Lsn, u64); + + fn next(&mut self) -> Option { + if self.next_idx < self.all_keys.len() { + let (delta_key, size) = &self.all_keys[self.next_idx]; + + let key = delta_key.key(); + let lsn = delta_key.lsn(); + + self.next_idx += 1; + Some((key, lsn, *size)) + } else { + None + } + } +} + +impl<'a> DeltaKeyIter { + fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result { + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + file, + ); + + let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new(); + tree_reader.visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |key, value| { + let delta_key = DeltaKey::from_slice(key); + let pos = BlobRef(value).pos(); + if let Some(last) = all_keys.last_mut() { + if last.0.key() == delta_key.key() { + return true; + } else { + // subtract offset of new key BLOB and first blob of this key + // to get total size if values associated with this key + let first_pos = last.1; + last.1 = pos - first_pos; + } + } + all_keys.push((delta_key, pos)); + true + }, + )?; + if let Some(last) = all_keys.last_mut() { + // Last key occupies all space till end of layer + last.1 = std::fs::metadata(&file.file.path)?.len() - last.1; + } + let iter = DeltaKeyIter { + all_keys, + next_idx: 0, + }; + + Ok(iter) + } +} diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/layered_repository/ephemeral_file.rs index cdde9d5d13..299bb4e873 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/layered_repository/ephemeral_file.rs @@ -43,7 +43,7 @@ pub struct EphemeralFile { _timelineid: ZTimelineId, file: Arc, - size: u64, + pub size: u64, } impl EphemeralFile { diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 87e6877520..1f89f333dd 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -233,6 +233,14 @@ impl Layer for InMemoryLayer { } impl InMemoryLayer { + /// + /// Get layer size on the disk + /// + pub fn size(&self) -> Result { + let inner = self.inner.read().unwrap(); + Ok(inner.file.size) + } + /// /// Create a new, empty, in-memory layer /// diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index aaf765b83d..e10330bdd3 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -139,6 +139,12 @@ pub trait Layer: Send + Sync { /// Iterate through all keys and values stored in the layer fn iter(&self) -> Box> + '_>; + /// Iterate through all keys stored in the layer. Returns key, lsn and value size + /// It is used only for compaction and so is currently implemented only for DeltaLayer + fn key_iter(&self) -> Box + '_> { + panic!("Not implemented") + } + /// Permanently remove this layer from disk. 
     fn delete(&self) -> Result<()>;
 
diff --git a/test_runner/batch_others/test_large_schema.py b/test_runner/batch_others/test_large_schema.py
new file mode 100644
index 0000000000..18ae0614a9
--- /dev/null
+++ b/test_runner/batch_others/test_large_schema.py
@@ -0,0 +1,82 @@
+import time
+import os
+from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.log_helper import log
+
+
+# This test creates a large number of tables, which results in a large catalog.
+# Right now Neon serializes a directory as a single key-value storage entry, so
+# this leads to a layer filled mostly by one key.
+# Originally, the Neon implementation of checkpoint and compaction was not able to split a key, which led
+# to large (several gigabytes) layer files (both ephemeral and delta layers).
+# That may cause problems with uploading to S3 and also degrade performance because of ephemeral file swapping.
+#
+def test_large_schema(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+
+    pg = env.postgres.create_start('main')
+
+    conn = pg.connect()
+    cur = conn.cursor()
+
+    tables = 2  # 10 is too much for debug build
+    partitions = 1000
+    for i in range(1, tables + 1):
+        print(f'iteration {i} / {tables}')
+
+        # Restart compute. Restart is actually not strictly needed.
+        # It is done mostly because this test originally tries to model the problem reported by Ketteq.
+        pg.stop()
+        # Kill and restart the pageserver.
+        # env.pageserver.stop(immediate=True)
+        # env.pageserver.start()
+        pg.start()
+
+        retry_sleep = 0.5
+        max_retries = 200
+        retries = 0
+        while True:
+            try:
+                conn = pg.connect()
+                cur = conn.cursor()
+                cur.execute(f"CREATE TABLE if not exists t_{i}(pk integer) partition by range (pk)")
+                for j in range(1, partitions + 1):
+                    cur.execute(
+                        f"create table if not exists p_{i}_{j} partition of t_{i} for values from ({j}) to ({j + 1})"
+                    )
+                cur.execute(f"insert into t_{i} values (generate_series(1,{partitions}))")
+                cur.execute("vacuum full")
+                conn.close()
+
+            except Exception as error:
+                # It's normal that it takes some time for the pageserver to
+                # restart, and for the connection to fail until it does. It
+                # should eventually recover, so retry until it succeeds.
+ print(f'failed: {error}') + if retries < max_retries: + retries += 1 + print(f'retry {retries} / {max_retries}') + time.sleep(retry_sleep) + continue + else: + raise + break + + conn = pg.connect() + cur = conn.cursor() + + for i in range(1, tables + 1): + cur.execute(f"SELECT count(*) FROM t_{i}") + assert cur.fetchone() == (partitions, ) + + cur.execute("set enable_sort=off") + cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid") + + # Check layer file sizes + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant_id, timeline_id) + for filename in os.listdir(timeline_path): + if filename.startswith('00000'): + log.info(f'layer {filename} size is {os.path.getsize(timeline_path + filename)}') + assert os.path.getsize(timeline_path + filename) < 512_000_000 diff --git a/test_runner/performance/test_dup_key.py b/test_runner/performance/test_dup_key.py new file mode 100644 index 0000000000..a8caceb61a --- /dev/null +++ b/test_runner/performance/test_dup_key.py @@ -0,0 +1,48 @@ +import pytest +from contextlib import closing +from fixtures.compare_fixtures import PgCompare +from pytest_lazyfixture import lazy_fixture # type: ignore + + +@pytest.mark.parametrize( + "env", + [ + # The test is too slow to run in CI, but fast enough to run with remote tests + pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), + pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), + pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), + ]) +def test_dup_key(env: PgCompare): + # Update the same page many times, then measure read performance + + with closing(env.pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SET synchronous_commit=off") + cur.execute("SET statement_timeout=0") + + # Write many updates to the same row + with env.record_duration('write'): + cur.execute("create table t (i integer, filler text);") + cur.execute('insert into t values (0);') + cur.execute(""" +do $$ +begin + for ivar in 1..5000000 loop + update t set i = ivar, filler = repeat('a', 50); + update t set i = ivar, filler = repeat('b', 50); + update t set i = ivar, filler = repeat('c', 50); + update t set i = ivar, filler = repeat('d', 50); + rollback; + end loop; +end; +$$; +""") + + # Write 3-4 MB to evict t from compute cache + cur.execute('create table f (i integer);') + cur.execute(f'insert into f values (generate_series(1,100000));') + + # Read + with env.record_duration('read'): + cur.execute('select * from t;') + cur.fetchall() From ed102f44d9cb101da57f6915afbbc96a14d23570 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Thu, 21 Jul 2022 12:08:26 -0400 Subject: [PATCH 0547/1022] Reduce memory allocations for page server (#2010) ## Overview This patch reduces the number of memory allocations when running the page server under a heavy write workload. This mostly helps improve the speed of WAL record ingestion. ## Changes - modified `DatadirModification` to allow reuse the struct's allocated memory after each modification - modified `decode_wal_record` to allow passing a `DecodedWALRecord` reference. 
This helps reuse the struct in each `decode_wal_record` call - added a reusable buffer for serializing object inside the `InMemoryLayer::put_value` function - added a performance test simulating a heavy write workload for testing the changes in this patch ### Semi-related changes - remove redundant serializations when calling `DeltaLayer::put_value` during `InMemoryLayer::write_to_disk` function call [1] - removed the info span `info_span!("processing record", lsn = %lsn)` during each WAL ingestion [2] ## Notes - [1]: in `InMemoryLayer::write_to_disk`, a deserialization is called ``` let val = Value::des(&buf)?; delta_layer_writer.put_value(key, *lsn, val)?; ``` `DeltaLayer::put_value` then creates a serialization based on the previous deserialization ``` let off = self.blob_writer.write_blob(&Value::ser(&val)?)?; ``` - [2]: related: https://github.com/neondatabase/neon/issues/733 --- pageserver/src/import_datadir.rs | 21 ++-- .../src/layered_repository/delta_layer.rs | 14 ++- .../src/layered_repository/inmemory_layer.rs | 22 +++- pageserver/src/pgdatadir_mapping.rs | 38 +++--- pageserver/src/walingest.rs | 112 +++++++++--------- .../src/walreceiver/walreceiver_connection.rs | 25 ++-- pageserver/src/walrecord.rs | 34 ++++-- .../batch_others/test_branch_and_gc.py | 6 + .../performance/test_compare_pg_stats.py | 33 ++++++ 9 files changed, 196 insertions(+), 109 deletions(-) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index f8a41e5b2b..6402657e05 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -16,6 +16,7 @@ use crate::reltag::{RelTag, SlruKind}; use crate::repository::Repository; use crate::repository::Timeline; use crate::walingest::WalIngest; +use crate::walrecord::DecodedWALRecord; use postgres_ffi::relfile_utils::*; use postgres_ffi::waldecoder::*; use postgres_ffi::xlog_utils::*; @@ -38,7 +39,7 @@ pub fn import_timeline_from_postgres_datadir( // TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn) // Then fishing out pg_control would be unnecessary - let mut modification = tline.begin_modification(lsn); + let mut modification = tline.begin_modification(); modification.init_empty()?; // Import all but pg_wal @@ -57,12 +58,12 @@ pub fn import_timeline_from_postgres_datadir( if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? { pg_control = Some(control_file); } - modification.flush()?; + modification.flush(lsn)?; } } // We're done importing all the data files. - modification.commit()?; + modification.commit(lsn)?; // We expect the Postgres server to be shut down cleanly. let pg_control = pg_control.context("pg_control file not found")?; @@ -268,9 +269,11 @@ fn import_wal( waldecoder.feed_bytes(&buf); let mut nrecords = 0; + let mut modification = tline.begin_modification(); + let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - walingest.ingest_record(tline, recdata, lsn)?; + walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; last_lsn = lsn; nrecords += 1; @@ -300,7 +303,7 @@ pub fn import_basebackup_from_tar( base_lsn: Lsn, ) -> Result<()> { info!("importing base at {}", base_lsn); - let mut modification = tline.begin_modification(base_lsn); + let mut modification = tline.begin_modification(); modification.init_empty()?; let mut pg_control: Option = None; @@ -318,7 +321,7 @@ pub fn import_basebackup_from_tar( // We found the pg_control file. 
pg_control = Some(res); } - modification.flush()?; + modification.flush(base_lsn)?; } tar::EntryType::Directory => { debug!("directory {:?}", file_path); @@ -332,7 +335,7 @@ pub fn import_basebackup_from_tar( // sanity check: ensure that pg_control is loaded let _pg_control = pg_control.context("pg_control file not found")?; - modification.commit()?; + modification.commit(base_lsn)?; Ok(()) } @@ -384,9 +387,11 @@ pub fn import_wal_from_tar( waldecoder.feed_bytes(&bytes[offset..]); + let mut modification = tline.begin_modification(); + let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - walingest.ingest_record(tline, recdata, lsn)?; + walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; last_lsn = lsn; debug!("imported record at {} (end {})", lsn, end_lsn); diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index d622df531a..ce5cb57745 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -672,11 +672,21 @@ impl DeltaLayerWriter { /// The values must be appended in key, lsn order. /// pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> { + self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init()) + } + + pub fn put_value_bytes( + &mut self, + key: Key, + lsn: Lsn, + val: &[u8], + will_init: bool, + ) -> Result<()> { assert!(self.lsn_range.start <= lsn); - let off = self.blob_writer.write_blob(&Value::ser(&val)?)?; + let off = self.blob_writer.write_blob(val)?; - let blob_ref = BlobRef::new(off, val.will_init()); + let blob_ref = BlobRef::new(off, will_init); let delta_key = DeltaKey::from_key_lsn(&key, lsn); self.tree.append(&delta_key.0, blob_ref.0)?; diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 1f89f333dd..5f269a868f 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -15,6 +15,7 @@ use crate::layered_repository::storage_layer::{ use crate::repository::{Key, Value}; use crate::walrecord; use anyhow::{bail, ensure, Result}; +use std::cell::RefCell; use std::collections::HashMap; use tracing::*; use utils::{ @@ -30,6 +31,12 @@ use std::ops::Range; use std::path::PathBuf; use std::sync::RwLock; +thread_local! { + /// A buffer for serializing object during [`InMemoryLayer::put_value`]. + /// This buffer is reused for each serialization to avoid additional malloc calls. + static SER_BUFFER: RefCell> = RefCell::new(Vec::new()); +} + pub struct InMemoryLayer { conf: &'static PageServerConf, tenantid: ZTenantId, @@ -278,10 +285,17 @@ impl InMemoryLayer { pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timelineid, lsn); let mut inner = self.inner.write().unwrap(); - inner.assert_writeable(); - let off = inner.file.write_blob(&Value::ser(val)?)?; + let off = { + SER_BUFFER.with(|x| -> Result<_> { + let mut buf = x.borrow_mut(); + buf.clear(); + val.ser_into(&mut (*buf))?; + let off = inner.file.write_blob(&buf)?; + Ok(off) + })? 
+ }; let vec_map = inner.index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, off).unwrap().0; @@ -350,8 +364,8 @@ impl InMemoryLayer { // Write all page versions for (lsn, pos) in vec_map.as_slice() { cursor.read_blob_into_buf(*pos, &mut buf)?; - let val = Value::des(&buf)?; - delta_layer_writer.put_value(key, *lsn, val)?; + let will_init = Value::des(&buf)?.will_init(); + delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?; } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index f696c1f411..788c9de29e 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -80,23 +80,25 @@ impl DatadirTimeline { /// the timeline. /// /// This provides a transaction-like interface to perform a bunch - /// of modifications atomically, all stamped with one LSN. + /// of modifications atomically. /// - /// To ingest a WAL record, call begin_modification(lsn) to get a + /// To ingest a WAL record, call begin_modification() to get a /// DatadirModification object. Use the functions in the object to /// modify the repository state, updating all the pages and metadata - /// that the WAL record affects. When you're done, call commit() to - /// commit the changes. + /// that the WAL record affects. When you're done, call commit(lsn) to + /// commit the changes. All the changes will be stamped with the specified LSN. + /// + /// Calling commit(lsn) will flush all the changes and reset the state, + /// so the `DatadirModification` struct can be reused to perform the next modification. /// /// Note that any pending modifications you make through the /// modification object won't be visible to calls to the 'get' and list /// functions of the timeline until you finish! And if you update the /// same page twice, the last update wins. /// - pub fn begin_modification(&self, lsn: Lsn) -> DatadirModification { + pub fn begin_modification(&self) -> DatadirModification { DatadirModification { tline: self, - lsn, pending_updates: HashMap::new(), pending_deletions: Vec::new(), pending_nblocks: 0, @@ -533,8 +535,6 @@ pub struct DatadirModification<'a, R: Repository> { /// in the state in 'tline' yet. pub tline: &'a DatadirTimeline, - lsn: Lsn, - // The modifications are not applied directly to the underlying key-value store. // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. @@ -920,7 +920,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub fn flush(&mut self) -> Result<()> { + pub fn flush(&mut self, lsn: Lsn) -> Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; @@ -934,7 +934,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { let mut result: Result<()> = Ok(()); self.pending_updates.retain(|&key, value| { if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) { - result = writer.put(key, self.lsn, value); + result = writer.put(key, lsn, value); false } else { true @@ -956,20 +956,22 @@ impl<'a, R: Repository> DatadirModification<'a, R> { /// /// Finish this atomic update, writing all the updated keys to the /// underlying timeline. 
+ /// All the modifications in this atomic update are stamped by the specified LSN. /// - pub fn commit(self) -> Result<()> { + pub fn commit(&mut self, lsn: Lsn) -> Result<()> { let writer = self.tline.tline.writer(); let pending_nblocks = self.pending_nblocks; + self.pending_nblocks = 0; - for (key, value) in self.pending_updates { - writer.put(key, self.lsn, &value)?; + for (key, value) in self.pending_updates.drain() { + writer.put(key, lsn, &value)?; } - for key_range in self.pending_deletions { - writer.delete(key_range.clone(), self.lsn)?; + for key_range in self.pending_deletions.drain(..) { + writer.delete(key_range, lsn)?; } - writer.finish_write(self.lsn); + writer.finish_write(lsn); if pending_nblocks != 0 { self.tline.current_logical_size.fetch_add( @@ -1407,9 +1409,9 @@ pub fn create_test_timeline( ) -> Result>> { let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; let tline = DatadirTimeline::new(tline, 256 * 1024); - let mut m = tline.begin_modification(Lsn(8)); + let mut m = tline.begin_modification(); m.init_empty()?; - m.commit()?; + m.commit(Lsn(8))?; Ok(Arc::new(tline)) } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 2f39007e9f..adc24328ae 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -78,13 +78,13 @@ impl<'a, R: Repository> WalIngest<'a, R> { /// pub fn ingest_record( &mut self, - timeline: &DatadirTimeline, recdata: Bytes, lsn: Lsn, + modification: &mut DatadirModification, + decoded: &mut DecodedWALRecord, ) -> Result<()> { - let mut modification = timeline.begin_modification(lsn); + decode_wal_record(recdata, decoded).context("failed decoding wal record")?; - let mut decoded = decode_wal_record(recdata).context("failed decoding wal record")?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -98,7 +98,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, &mut modification, &mut decoded)?; + self.ingest_heapam_record(&mut buf, modification, decoded)?; } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID @@ -106,19 +106,19 @@ impl<'a, R: Repository> WalIngest<'a, R> { == pg_constants::XLOG_SMGR_CREATE { let create = XlSmgrCreate::decode(&mut buf); - self.ingest_xlog_smgr_create(&mut modification, &create)?; + self.ingest_xlog_smgr_create(modification, &create)?; } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(&mut modification, &truncate)?; + self.ingest_xlog_smgr_truncate(modification, &truncate)?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE { let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(&mut modification, &createdb)?; + self.ingest_xlog_dbase_create(modification, &createdb)?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_DROP { @@ -137,7 +137,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( - &mut modification, + modification, SlruKind::Clog, segno, rpageno, @@ -146,7 +146,7 @@ impl<'a, R: Repository> 
WalIngest<'a, R> { } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(&mut modification, &xlrec)?; + self.ingest_clog_truncate_record(modification, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; @@ -154,7 +154,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); self.ingest_xact_record( - &mut modification, + modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, )?; @@ -164,7 +164,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); self.ingest_xact_record( - &mut modification, + modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, )?; @@ -187,7 +187,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( - &mut modification, + modification, SlruKind::MultiXactOffsets, segno, rpageno, @@ -198,7 +198,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( - &mut modification, + modification, SlruKind::MultiXactMembers, segno, rpageno, @@ -206,14 +206,14 @@ impl<'a, R: Repository> WalIngest<'a, R> { )?; } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { let xlrec = XlMultiXactCreate::decode(&mut buf); - self.ingest_multixact_create_record(&mut modification, &xlrec)?; + self.ingest_multixact_create_record(modification, &xlrec)?; } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(&mut modification, &xlrec)?; + self.ingest_multixact_truncate_record(modification, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(&mut modification, &xlrec, &decoded)?; + self.ingest_relmap_page(modification, &xlrec, decoded)?; } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_NEXTOID { @@ -248,7 +248,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
for blk in decoded.blocks.iter() { - self.ingest_decoded_block(&mut modification, lsn, &decoded, blk)?; + self.ingest_decoded_block(modification, lsn, decoded, blk)?; } // If checkpoint data was updated, store the new version in the repository @@ -261,7 +261,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - modification.commit()?; + modification.commit(lsn)?; Ok(()) } @@ -1069,10 +1069,10 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); fn init_walingest_test(tline: &DatadirTimeline) -> Result> { - let mut m = tline.begin_modification(Lsn(0x10)); + let mut m = tline.begin_modification(); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file - m.commit()?; + m.commit(Lsn(0x10))?; let walingest = WalIngest::new(tline, Lsn(0x10))?; Ok(walingest) @@ -1084,19 +1084,19 @@ mod tests { let tline = create_test_timeline(repo, TIMELINE_ID)?; let mut walingest = init_walingest_test(&tline)?; - let mut m = tline.begin_modification(Lsn(0x20)); + let mut m = tline.begin_modification(); walingest.put_rel_creation(&mut m, TESTREL_A)?; walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; - m.commit()?; - let mut m = tline.begin_modification(Lsn(0x30)); + m.commit(Lsn(0x20))?; + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?; - m.commit()?; - let mut m = tline.begin_modification(Lsn(0x40)); + m.commit(Lsn(0x30))?; + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?; - m.commit()?; - let mut m = tline.begin_modification(Lsn(0x50)); + m.commit(Lsn(0x40))?; + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?; - m.commit()?; + m.commit(Lsn(0x50))?; assert_current_logical_size(&tline, Lsn(0x50)); @@ -1142,9 +1142,9 @@ mod tests { ); // Truncate last block - let mut m = tline.begin_modification(Lsn(0x60)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?; - m.commit()?; + m.commit(Lsn(0x60))?; assert_current_logical_size(&tline, Lsn(0x60)); // Check reported size and contents after truncation @@ -1166,15 +1166,15 @@ mod tests { ); // Truncate to zero length - let mut m = tline.begin_modification(Lsn(0x68)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?; - m.commit()?; + m.commit(Lsn(0x68))?; assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68))?, 0); // Extend from 0 to 2 blocks, leaving a gap - let mut m = tline.begin_modification(Lsn(0x70)); + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?; - m.commit()?; + m.commit(Lsn(0x70))?; assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70))?, 2); assert_eq!( tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, @@ -1186,9 +1186,9 @@ mod tests { ); // Extend a lot more, leaving a big gap that spans across segments - let mut m = tline.begin_modification(Lsn(0x80)); + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?; - m.commit()?; + m.commit(Lsn(0x80))?; assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, 1501); for blk in 2..1500 { assert_eq!( @@ -1212,18 
+1212,18 @@ mod tests { let tline = create_test_timeline(repo, TIMELINE_ID)?; let mut walingest = init_walingest_test(&tline)?; - let mut m = tline.begin_modification(Lsn(0x20)); + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; - m.commit()?; + m.commit(Lsn(0x20))?; // Check that rel exists and size is correct assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1); // Drop rel - let mut m = tline.begin_modification(Lsn(0x30)); + let mut m = tline.begin_modification(); walingest.put_rel_drop(&mut m, TESTREL_A)?; - m.commit()?; + m.commit(Lsn(0x30))?; // Check that rel is not visible anymore assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false); @@ -1232,9 +1232,9 @@ mod tests { //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30))?.is_none()); // Re-create it - let mut m = tline.begin_modification(Lsn(0x40)); + let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?; - m.commit()?; + m.commit(Lsn(0x40))?; // Check that rel exists and size is correct assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true); @@ -1254,12 +1254,12 @@ mod tests { // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; - let mut m = tline.begin_modification(Lsn(0x20)); + let mut m = tline.begin_modification(); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; } - m.commit()?; + m.commit(Lsn(0x20))?; // The relation was created at LSN 20, not visible at LSN 1 yet. assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); @@ -1280,9 +1280,9 @@ mod tests { // Truncate relation so that second segment was dropped // - only leave one page - let mut m = tline.begin_modification(Lsn(0x60)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, 1)?; - m.commit()?; + m.commit(Lsn(0x60))?; // Check reported size and contents after truncation assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 1); @@ -1310,12 +1310,12 @@ mod tests { // Extend relation again. 
// Add enough blocks to create second segment let lsn = Lsn(0x80); - let mut m = tline.begin_modification(lsn); + let mut m = tline.begin_modification(); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; } - m.commit()?; + m.commit(lsn)?; assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true); assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, relsize); @@ -1343,10 +1343,10 @@ mod tests { let mut lsn = 0x10; for blknum in 0..pg_constants::RELSEG_SIZE + 1 { lsn += 0x10; - let mut m = tline.begin_modification(Lsn(lsn)); + let mut m = tline.begin_modification(); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?; - m.commit()?; + m.commit(Lsn(lsn))?; } assert_current_logical_size(&tline, Lsn(lsn)); @@ -1358,9 +1358,9 @@ mod tests { // Truncate one block lsn += 0x10; - let mut m = tline.begin_modification(Lsn(lsn)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE)?; - m.commit()?; + m.commit(Lsn(lsn))?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, pg_constants::RELSEG_SIZE @@ -1369,9 +1369,9 @@ mod tests { // Truncate another block lsn += 0x10; - let mut m = tline.begin_modification(Lsn(lsn)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE - 1)?; - m.commit()?; + m.commit(Lsn(lsn))?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, pg_constants::RELSEG_SIZE - 1 @@ -1383,9 +1383,9 @@ mod tests { let mut size: i32 = 3000; while size >= 0 { lsn += 0x10; - let mut m = tline.begin_modification(Lsn(lsn)); + let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?; - m.commit()?; + m.commit(Lsn(lsn))?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, size as BlockNumber diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 0c8c0ae2f6..cc1a9cc5eb 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -23,6 +23,7 @@ use crate::{ repository::{Repository, Timeline}, tenant_mgr, walingest::WalIngest, + walrecord::DecodedWALRecord, }; use postgres_ffi::waldecoder::WalStreamDecoder; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; @@ -150,19 +151,25 @@ pub async fn handle_walreceiver_connection( waldecoder.feed_bytes(data); - while let Some((lsn, recdata)) = waldecoder.poll_decode()? { - let _enter = info_span!("processing record", lsn = %lsn).entered(); + { + let mut decoded = DecodedWALRecord::default(); + let mut modification = timeline.begin_modification(); + while let Some((lsn, recdata)) = waldecoder.poll_decode()? { + // let _enter = info_span!("processing record", lsn = %lsn).entered(); - // It is important to deal with the aligned records as lsn in getPage@LSN is - // aligned and can be several bytes bigger. Without this alignment we are - // at risk of hitting a deadlock. - ensure!(lsn.is_aligned()); + // It is important to deal with the aligned records as lsn in getPage@LSN is + // aligned and can be several bytes bigger. Without this alignment we are + // at risk of hitting a deadlock. 
+ ensure!(lsn.is_aligned()); - walingest.ingest_record(&timeline, recdata, lsn)?; + walingest + .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .context("could not ingest record at {lsn}")?; - fail_point!("walreceiver-after-ingest"); + fail_point!("walreceiver-after-ingest"); - last_rec_lsn = lsn; + last_rec_lsn = lsn; + } } if !caught_up && endlsn >= end_of_wal { diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 5a384360e2..6b01d52005 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -96,6 +96,7 @@ impl DecodedBkpBlock { } } +#[derive(Default)] pub struct DecodedWALRecord { pub xl_xid: TransactionId, pub xl_info: u8, @@ -505,7 +506,17 @@ impl XlMultiXactTruncate { // block data // ... // main data -pub fn decode_wal_record(record: Bytes) -> Result { +// +// +// For performance reasons, the caller provides the DecodedWALRecord struct and the function just fills it in. +// It would be more natural for this function to return a DecodedWALRecord as return value, +// but reusing the caller-supplied struct avoids an allocation. +// This code is in the hot path for digesting incoming WAL, and is very performance sensitive. +// +pub fn decode_wal_record( + record: Bytes, + decoded: &mut DecodedWALRecord, +) -> Result<(), DeserializeError> { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; @@ -534,7 +545,7 @@ pub fn decode_wal_record(record: Bytes) -> Result = Vec::new(); + decoded.blocks.clear(); // 2. Decode the headers. // XLogRecordBlockHeaders if any, @@ -713,7 +724,7 @@ pub fn decode_wal_record(record: Bytes) -> Result { @@ -724,7 +735,7 @@ pub fn decode_wal_record(record: Bytes) -> Result Result Date: Tue, 12 Jul 2022 23:07:26 +0300 Subject: [PATCH 0548/1022] register tenants task thread pool threads in thread_mgr needed to avoid this warning: is_shutdown_requested() called in an unexpected thread --- pageserver/src/tenant_tasks.rs | 4 ++ pageserver/src/thread_mgr.rs | 70 +++++++++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index b0bb4953ca..e51744d3cc 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -120,6 +120,10 @@ pub fn init_tenant_task_pool() -> anyhow::Result<()> { let runtime = tokio::runtime::Builder::new_multi_thread() .thread_name("tenant-task-worker") .enable_all() + .on_thread_start(|| { + thread_mgr::register(ThreadKind::TenantTaskWorker, "tenant-task-worker") + }) + .on_thread_stop(thread_mgr::deregister) .build()?; let (gc_send, mut gc_recv) = mpsc::channel::(100); diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index ab0d894c70..6dd2e4b00b 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -97,6 +97,9 @@ pub enum ThreadKind { // Thread that schedules new compaction and gc jobs TenantTaskManager, + // Worker thread for tenant tasks thread pool + TenantTaskWorker, + // Thread that flushes frozen in-memory layers to disk LayerFlushThread, @@ -105,18 +108,20 @@ pub enum ThreadKind { StorageSync, } +#[derive(Default)] struct MutableThreadState { /// Tenant and timeline that this thread is associated with. tenant_id: Option, timeline_id: Option, /// Handle for waiting for the thread to exit. It can be None, if the - /// the thread has already exited. + /// the thread has already exited. 
OR if this thread is managed externally + /// and was not spawned through thread_mgr.rs::spawn function. join_handle: Option>, } struct PageServerThread { - _thread_id: u64, + thread_id: u64, kind: ThreadKind, @@ -147,7 +152,7 @@ where let (shutdown_tx, shutdown_rx) = watch::channel(()); let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); let thread = Arc::new(PageServerThread { - _thread_id: thread_id, + thread_id, kind, name: name.to_string(), shutdown_requested: AtomicBool::new(false), @@ -315,8 +320,10 @@ pub fn shutdown_threads( drop(thread_mut); let _ = join_handle.join(); } else { - // The thread had not even fully started yet. Or it was shut down - // concurrently and already exited + // Possibly one of: + // * The thread had not even fully started yet. + // * It was shut down concurrently and already exited + // * Is managed through `register`/`deregister` fns without providing a join handle } } } @@ -348,3 +355,56 @@ pub fn is_shutdown_requested() -> bool { } }) } + +/// Needed to register threads that were not spawned through spawn function. +/// For example tokio blocking threads. This function is expected to be used +/// in tandem with `deregister`. +/// NOTE: threads registered through this function cannot be joined +pub fn register(kind: ThreadKind, name: &str) { + CURRENT_THREAD.with(|ct| { + let mut borrowed = ct.borrow_mut(); + if borrowed.is_some() { + panic!("thread already registered") + }; + let (shutdown_tx, shutdown_rx) = watch::channel(()); + let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); + + let thread = Arc::new(PageServerThread { + thread_id, + kind, + name: name.to_owned(), + shutdown_requested: AtomicBool::new(false), + shutdown_tx, + mutable: Mutex::new(MutableThreadState { + tenant_id: None, + timeline_id: None, + join_handle: None, + }), + }); + + *borrowed = Some(Arc::clone(&thread)); + + SHUTDOWN_RX.with(|rx| { + *rx.borrow_mut() = Some(shutdown_rx); + }); + + THREADS.lock().unwrap().insert(thread_id, thread); + }); +} + +// Expected to be used in tandem with `register`. 
See the doc for `register` for more details +pub fn deregister() { + CURRENT_THREAD.with(|ct| { + let mut borrowed = ct.borrow_mut(); + let thread = match borrowed.take() { + Some(thread) => thread, + None => panic!("calling deregister on unregistered thread"), + }; + + SHUTDOWN_RX.with(|rx| { + *rx.borrow_mut() = None; + }); + + THREADS.lock().unwrap().remove(&thread.thread_id) + }); +} From 9dcb9ca3da358a678daf040eda2c94b0b8dd9fab Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 22 Jul 2022 11:00:05 +0100 Subject: [PATCH 0549/1022] test/performance: ensure we don't have tables that we're creating (#2135) --- test_runner/performance/test_dup_key.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/performance/test_dup_key.py b/test_runner/performance/test_dup_key.py index a8caceb61a..ee867a9845 100644 --- a/test_runner/performance/test_dup_key.py +++ b/test_runner/performance/test_dup_key.py @@ -17,6 +17,8 @@ def test_dup_key(env: PgCompare): with closing(env.pg.connect()) as conn: with conn.cursor() as cur: + cur.execute('drop table if exists t, f;') + cur.execute("SET synchronous_commit=off") cur.execute("SET statement_timeout=0") From 39c59b8df5069efb9364280cf64b8f9ecf4241b3 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Fri, 22 Jul 2022 07:44:20 -0400 Subject: [PATCH 0550/1022] Fix flaky test_branch_creation_before_gc test (#2142) --- test_runner/batch_others/test_branch_and_gc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/batch_others/test_branch_and_gc.py b/test_runner/batch_others/test_branch_and_gc.py index b8ce63b069..901b3f3d0f 100644 --- a/test_runner/batch_others/test_branch_and_gc.py +++ b/test_runner/batch_others/test_branch_and_gc.py @@ -139,7 +139,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): 'image_creation_threshold': '1', # set PITR interval to be small, so we can do GC - 'pitr_interval': '1 s' + 'pitr_interval': '0 s' }) b0 = env.neon_cli.create_branch('b0', tenant_id=tenant) From 5f4ccae5c5d426d8587ac9f91b251f8f842f4333 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Mon, 25 Jul 2022 17:23:10 +0300 Subject: [PATCH 0551/1022] [proxy] Add the `password hack` authentication flow (#2095) [proxy] Add the `password hack` authentication flow This lets us authenticate users which can use neither SNI (due to old libpq) nor connection string `options` (due to restrictions in other client libraries). Note: `PasswordHack` will accept passwords which are not encoded in base64 via the "password" field. The assumption is that most user passwords will be valid utf-8 strings, and the rest may still be passed via "password_". 
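For illustration, a client that cannot send SNI could build the payload like this (a minimal Python sketch mirroring test_password_hack below; it is not part of the patch itself, and the connection hint at the end uses placeholder values):

    import base64
    import json

    def password_hack_payload(project: str, password: str) -> str:
        # The proxy expects the cleartext "password" to be base64-encoded JSON
        # carrying the project name plus the real password.
        payload = {'project': project, 'password': password}
        return base64.b64encode(json.dumps(payload).encode('utf-8')).decode('ascii')

    def password_hack_payload_binary(project: str, password: bytes) -> str:
        # Passwords that are not valid utf-8 go into "password_", base64-encoded.
        payload = {
            'project': project,
            'password_': base64.b64encode(password).decode('ascii'),
        }
        return base64.b64encode(json.dumps(payload).encode('utf-8')).decode('ascii')

    # The result is then passed as the ordinary libpq password, e.g.
    #   psql "host=<proxy-host> user=john_doe password=<payload> sslsni=0"
    # (sslsni=0 is how the test below forces the no-SNI path; clients with an
    # old libpq simply never send SNI in the first place).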
--- libs/utils/src/pq_proto.rs | 4 +- proxy/src/auth.rs | 12 +- proxy/src/auth/backend.rs | 186 ++++++++-- proxy/src/auth/backend/console.rs | 91 ++--- proxy/src/auth/backend/legacy_console.rs | 44 ++- proxy/src/auth/backend/link.rs | 4 +- proxy/src/auth/backend/postgres.rs | 35 +- proxy/src/auth/credentials.rs | 431 ++++++++--------------- proxy/src/auth/flow.rs | 39 +- proxy/src/auth/password_hack.rs | 102 ++++++ proxy/src/compute.rs | 104 ++++-- proxy/src/config.rs | 36 +- proxy/src/error.rs | 7 + proxy/src/main.rs | 8 +- proxy/src/proxy.rs | 91 ++--- proxy/src/stream.rs | 8 + test_runner/batch_others/test_proxy.py | 32 +- test_runner/fixtures/neon_fixtures.py | 66 ++-- 18 files changed, 750 insertions(+), 550 deletions(-) create mode 100644 proxy/src/auth/password_hack.rs diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index 0a320f123c..3dcae4d0af 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -47,10 +47,12 @@ pub enum FeStartupPacket { StartupMessage { major_version: u32, minor_version: u32, - params: HashMap, + params: StartupMessageParams, }, } +pub type StartupMessageParams = HashMap; + #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct CancelKeyData { pub backend_pid: i32, diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 9bddd58fce..61c7458e16 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,11 +1,14 @@ //! Client authentication mechanisms. pub mod backend; -pub use backend::DatabaseInfo; +pub use backend::{BackendType, DatabaseInfo}; mod credentials; pub use credentials::ClientCredentials; +mod password_hack; +use password_hack::PasswordHackPayload; + mod flow; pub use flow::*; @@ -29,9 +32,8 @@ pub enum AuthErrorImpl { #[error(transparent)] Sasl(#[from] crate::sasl::Error), - /// For passwords that couldn't be processed by [`backend::legacy_console::parse_password`]. - #[error("Malformed password message")] - MalformedPassword, + #[error("Malformed password message: {0}")] + MalformedPassword(&'static str), /// Errors produced by [`crate::stream::PqStream`]. #[error(transparent)] @@ -76,7 +78,7 @@ impl UserFacingError for AuthError { Console(e) => e.to_string_client(), GetAuthInfo(e) => e.to_string_client(), Sasl(e) => e.to_string_client(), - MalformedPassword => self.to_string(), + MalformedPassword(_) => self.to_string(), _ => "Internal error".to_string(), } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 1d41f7f932..5e87059c86 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,16 +1,14 @@ -mod legacy_console; mod link; mod postgres; pub mod console; +mod legacy_console; pub use legacy_console::{AuthError, AuthErrorImpl}; -use super::ClientCredentials; use crate::{ - compute, - config::{AuthBackendType, ProxyConfig}, - mgmt, + auth::{self, AuthFlow, ClientCredentials}, + compute, config, mgmt, stream::PqStream, waiters::{self, Waiter, Waiters}, }; @@ -78,32 +76,158 @@ impl From for tokio_postgres::Config { } } -pub(super) async fn handle_user( - config: &ProxyConfig, - client: &mut PqStream, - creds: ClientCredentials, -) -> super::Result { - use AuthBackendType::*; - match config.auth_backend { - LegacyConsole => { - legacy_console::handle_user( - &config.auth_endpoint, - &config.auth_link_uri, - client, - &creds, - ) - .await +/// This type serves two purposes: +/// +/// * When `T` is `()`, it's just a regular auth backend selector +/// which we use in [`crate::config::ProxyConfig`]. 
+/// +/// * However, when we substitute `T` with [`ClientCredentials`], +/// this helps us provide the credentials only to those auth +/// backends which require them for the authentication process. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum BackendType { + /// Legacy Cloud API (V1) + link auth. + LegacyConsole(T), + /// Current Cloud API (V2). + Console(T), + /// Local mock of Cloud API (V2). + Postgres(T), + /// Authentication via a web browser. + Link, +} + +impl BackendType { + /// Very similar to [`std::option::Option::map`]. + /// Maps [`BackendType`] to [`BackendType`] by applying + /// a function to a contained value. + pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType { + use BackendType::*; + match self { + LegacyConsole(x) => LegacyConsole(f(x)), + Console(x) => Console(f(x)), + Postgres(x) => Postgres(f(x)), + Link => Link, + } + } +} + +impl BackendType> { + /// Very similar to [`std::option::Option::transpose`]. + /// This is most useful for error handling. + pub fn transpose(self) -> Result, E> { + use BackendType::*; + match self { + LegacyConsole(x) => x.map(LegacyConsole), + Console(x) => x.map(Console), + Postgres(x) => x.map(Postgres), + Link => Ok(Link), + } + } +} + +impl BackendType { + /// Authenticate the client via the requested backend, possibly using credentials. + pub async fn authenticate( + mut self, + urls: &config::AuthUrls, + client: &mut PqStream, + ) -> super::Result { + use BackendType::*; + + if let Console(creds) | Postgres(creds) = &mut self { + // If there's no project so far, that entails that client doesn't + // support SNI or other means of passing the project name. + // We now expect to see a very specific payload in the place of password. + if creds.project().is_none() { + let payload = AuthFlow::new(client) + .begin(auth::PasswordHack) + .await? + .authenticate() + .await?; + + // Finally we may finish the initialization of `creds`. + // TODO: add missing type safety to ClientCredentials. + creds.project = Some(payload.project); + + let mut config = match &self { + Console(creds) => { + console::Api::new(&urls.auth_endpoint, creds) + .wake_compute() + .await? + } + Postgres(creds) => { + postgres::Api::new(&urls.auth_endpoint, creds) + .wake_compute() + .await? + } + _ => unreachable!("see the patterns above"), + }; + + // We should use a password from payload as well. + config.password(payload.password); + + return Ok(compute::NodeInfo { + reported_auth_ok: false, + config, + }); + } + } + + match self { + LegacyConsole(creds) => { + legacy_console::handle_user( + &urls.auth_endpoint, + &urls.auth_link_uri, + &creds, + client, + ) + .await + } + Console(creds) => { + console::Api::new(&urls.auth_endpoint, &creds) + .handle_user(client) + .await + } + Postgres(creds) => { + postgres::Api::new(&urls.auth_endpoint, &creds) + .handle_user(client) + .await + } + // NOTE: this auth backend doesn't use client credentials. 
+ Link => link::handle_user(&urls.auth_link_uri, client).await, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_backend_type_map() { + let values = [ + BackendType::LegacyConsole(0), + BackendType::Console(0), + BackendType::Postgres(0), + BackendType::Link, + ]; + + for value in values { + assert_eq!(value.map(|x| x), value); + } + } + + #[test] + fn test_backend_type_transpose() { + let values = [ + BackendType::LegacyConsole(Ok::<_, ()>(0)), + BackendType::Console(Ok(0)), + BackendType::Postgres(Ok(0)), + BackendType::Link, + ]; + + for value in values { + assert_eq!(value.map(Result::unwrap), value.transpose().unwrap()); } - Console => { - console::Api::new(&config.auth_endpoint, &creds)? - .handle_user(client) - .await - } - Postgres => { - postgres::Api::new(&config.auth_endpoint, &creds)? - .handle_user(client) - .await - } - Link => link::handle_user(&config.auth_link_uri, client).await, } } diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index 3085f0b0e4..a8ff1a3522 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -1,18 +1,17 @@ //! Cloud API V2. use crate::{ - auth::{self, AuthFlow, ClientCredentials, DatabaseInfo}, - compute, - error::UserFacingError, + auth::{self, AuthFlow, ClientCredentials}, + compute::{self, ComputeConnCfg}, + error::{io_error, UserFacingError}, scram, stream::PqStream, url::ApiUrl, }; use serde::{Deserialize, Serialize}; -use std::{future::Future, io}; +use std::future::Future; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; pub type Result = std::result::Result; @@ -84,8 +83,8 @@ pub(super) struct Api<'a> { impl<'a> Api<'a> { /// Construct an API object containing the auth parameters. - pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result { - Ok(Self { endpoint, creds }) + pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self { + Self { endpoint, creds } } /// Authenticate the existing user or throw an error. @@ -100,7 +99,7 @@ impl<'a> Api<'a> { let mut url = self.endpoint.clone(); url.path_segments_mut().push("proxy_get_role_secret"); url.query_pairs_mut() - .append_pair("project", self.creds.project_name.as_ref()?) + .append_pair("project", self.creds.project().expect("impossible")) .append_pair("role", &self.creds.user); // TODO: use a proper logger @@ -120,11 +119,11 @@ impl<'a> Api<'a> { } /// Wake up the compute node and return the corresponding connection info. - async fn wake_compute(&self) -> Result { + pub(super) async fn wake_compute(&self) -> Result { let mut url = self.endpoint.clone(); url.path_segments_mut().push("proxy_wake_compute"); - let project_name = self.creds.project_name.as_ref()?; - url.query_pairs_mut().append_pair("project", project_name); + url.query_pairs_mut() + .append_pair("project", self.creds.project().expect("impossible")); // TODO: use a proper logger println!("cplane request: {url}"); @@ -137,16 +136,20 @@ impl<'a> Api<'a> { let response: GetWakeComputeResponse = serde_json::from_str(&resp.text().await.map_err(io_error)?)?; - let (host, port) = parse_host_port(&response.address) - .ok_or(ConsoleAuthError::BadComputeAddress(response.address))?; + // Unfortunately, ownership won't let us use `Option::ok_or` here. 
+ let (host, port) = match parse_host_port(&response.address) { + None => return Err(ConsoleAuthError::BadComputeAddress(response.address)), + Some(x) => x, + }; - Ok(DatabaseInfo { - host, - port, - dbname: self.creds.dbname.to_owned(), - user: self.creds.user.to_owned(), - password: None, - }) + let mut config = ComputeConnCfg::new(); + config + .host(host) + .port(port) + .dbname(&self.creds.dbname) + .user(&self.creds.user); + + Ok(config) } } @@ -160,7 +163,7 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>( ) -> auth::Result where GetAuthInfo: Future>, - WakeCompute: Future>, + WakeCompute: Future>, { let auth_info = get_auth_info(endpoint).await?; @@ -179,48 +182,18 @@ where } }; - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; + let mut config = wake_compute(endpoint).await?; + if let Some(keys) = scram_keys { + config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(keys)); + } Ok(compute::NodeInfo { - db_info: wake_compute(endpoint).await?, - scram_keys, + reported_auth_ok: false, + config, }) } -/// Upcast (almost) any error into an opaque [`io::Error`]. -pub(super) fn io_error(e: impl Into>) -> io::Error { - io::Error::new(io::ErrorKind::Other, e) -} - -fn parse_host_port(input: &str) -> Option<(String, u16)> { +fn parse_host_port(input: &str) -> Option<(&str, u16)> { let (host, port) = input.split_once(':')?; - Some((host.to_owned(), port.parse().ok()?)) -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn parse_db_info() -> anyhow::Result<()> { - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "password": "password", - }))?; - - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - }))?; - - Ok(()) - } + Some((host, port.parse().ok()?)) } diff --git a/proxy/src/auth/backend/legacy_console.rs b/proxy/src/auth/backend/legacy_console.rs index 467da63a98..7a5e9b6f62 100644 --- a/proxy/src/auth/backend/legacy_console.rs +++ b/proxy/src/auth/backend/legacy_console.rs @@ -11,7 +11,7 @@ use crate::{ use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; +use utils::pq_proto::BeMessage as Be; #[derive(Debug, Error)] pub enum AuthErrorImpl { @@ -76,6 +76,12 @@ enum ProxyAuthResponse { NotReady { ready: bool }, // TODO: get rid of `ready` } +impl ClientCredentials { + fn is_existing_user(&self) -> bool { + self.user.ends_with("@zenith") + } +} + async fn authenticate_proxy_client( auth_endpoint: &reqwest::Url, creds: &ClientCredentials, @@ -100,7 +106,7 @@ async fn authenticate_proxy_client( } let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?; - println!("got auth info: #{:?}", auth_info); + println!("got auth info: {:?}", auth_info); use ProxyAuthResponse::*; let db_info = match auth_info { @@ -128,7 +134,9 @@ async fn handle_existing_user( // Read client's password hash let msg = client.read_password_message().await?; - let md5_response = parse_password(&msg).ok_or(auth::AuthErrorImpl::MalformedPassword)?; + let md5_response = parse_password(&msg).ok_or(auth::AuthErrorImpl::MalformedPassword( + "the password should be a valid null-terminated utf-8 string", + ))?; let db_info = authenticate_proxy_client( auth_endpoint, @@ 
-139,21 +147,17 @@ async fn handle_existing_user( ) .await?; - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; - Ok(compute::NodeInfo { - db_info, - scram_keys: None, + reported_auth_ok: false, + config: db_info.into(), }) } pub async fn handle_user( auth_endpoint: &reqwest::Url, auth_link_uri: &reqwest::Url, - client: &mut PqStream, creds: &ClientCredentials, + client: &mut PqStream, ) -> auth::Result { if creds.is_existing_user() { handle_existing_user(auth_endpoint, client, creds).await @@ -201,4 +205,24 @@ mod tests { .unwrap(); assert!(matches!(auth, ProxyAuthResponse::NotReady { .. })); } + + #[test] + fn parse_db_info() -> anyhow::Result<()> { + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + }))?; + + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + }))?; + + Ok(()) + } } diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 669c9e00e9..d658a34825 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -41,7 +41,7 @@ pub async fn handle_user( client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; Ok(compute::NodeInfo { - db_info, - scram_keys: None, + reported_auth_ok: true, + config: db_info.into(), }) } diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs index 721b9db095..1d7ab8f249 100644 --- a/proxy/src/auth/backend/postgres.rs +++ b/proxy/src/auth/backend/postgres.rs @@ -3,10 +3,12 @@ use crate::{ auth::{ self, - backend::console::{self, io_error, AuthInfo, Result}, - ClientCredentials, DatabaseInfo, + backend::console::{self, AuthInfo, Result}, + ClientCredentials, }, - compute, scram, + compute::{self, ComputeConnCfg}, + error::io_error, + scram, stream::PqStream, url::ApiUrl, }; @@ -20,8 +22,8 @@ pub(super) struct Api<'a> { impl<'a> Api<'a> { /// Construct an API object containing the auth parameters. - pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result { - Ok(Self { endpoint, creds }) + pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self { + Self { endpoint, creds } } /// Authenticate the existing user or throw an error. @@ -56,7 +58,10 @@ impl<'a> Api<'a> { // We shouldn't get more than one row anyway. [row, ..] => { - let entry = row.try_get(0).map_err(io_error)?; + let entry = row + .try_get("rolpassword") + .map_err(|e| io_error(format!("failed to read user's password: {e}")))?; + scram::ServerSecret::parse(entry) .map(AuthInfo::Scram) .or_else(|| { @@ -75,14 +80,14 @@ impl<'a> Api<'a> { } /// We don't need to wake anything locally, so we just return the connection info. 
- async fn wake_compute(&self) -> Result { - Ok(DatabaseInfo { - // TODO: handle that near CLI params parsing - host: self.endpoint.host_str().unwrap_or("localhost").to_owned(), - port: self.endpoint.port().unwrap_or(5432), - dbname: self.creds.dbname.to_owned(), - user: self.creds.user.to_owned(), - password: None, - }) + pub(super) async fn wake_compute(&self) -> Result { + let mut config = ComputeConnCfg::new(); + config + .host(self.endpoint.host_str().unwrap_or("localhost")) + .port(self.endpoint.port().unwrap_or(5432)) + .dbname(&self.creds.dbname) + .user(&self.creds.user); + + Ok(config) } } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index b5312fbe1f..4c72da1c48 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,39 +1,25 @@ //! User credentials used in authentication. -use crate::compute; -use crate::config::ProxyConfig; use crate::error::UserFacingError; -use crate::stream::PqStream; -use std::collections::HashMap; use thiserror::Error; -use tokio::io::{AsyncRead, AsyncWrite}; +use utils::pq_proto::StartupMessageParams; #[derive(Debug, Error, PartialEq, Eq, Clone)] pub enum ClientCredsParseError { - #[error("Parameter `{0}` is missing in startup packet.")] + #[error("Parameter '{0}' is missing in startup packet.")] MissingKey(&'static str), - #[error( - "Project name is not specified. \ - EITHER please upgrade the postgres client library (libpq) for SNI support \ - OR pass the project name as a parameter: '&options=project%3D'." - )] - MissingSNIAndProjectName, - #[error("Inconsistent project name inferred from SNI ('{0}') and project option ('{1}').")] - InconsistentProjectNameAndSNI(String, String), - - #[error("Common name is not set.")] - CommonNameNotSet, + InconsistentProjectNames(String, String), #[error( "SNI ('{1}') inconsistently formatted with respect to common name ('{0}'). \ - SNI should be formatted as '.'." + SNI should be formatted as '.{0}'." )] - InconsistentCommonNameAndSNI(String, String), + InconsistentSni(String, String), - #[error("Project name ('{0}') must contain only alphanumeric characters and hyphens ('-').")] - ProjectNameContainsIllegalChars(String), + #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] + MalformedProjectName(String), } impl UserFacingError for ClientCredsParseError {} @@ -44,286 +30,171 @@ impl UserFacingError for ClientCredsParseError {} pub struct ClientCredentials { pub user: String, pub dbname: String, - pub project_name: Result, + pub project: Option, } impl ClientCredentials { - pub fn is_existing_user(&self) -> bool { - // This logic will likely change in the future. - self.user.ends_with("@zenith") + pub fn project(&self) -> Option<&str> { + self.project.as_deref() } +} +impl ClientCredentials { pub fn parse( - mut options: HashMap, - sni_data: Option<&str>, + mut options: StartupMessageParams, + sni: Option<&str>, common_name: Option<&str>, ) -> Result { - let mut get_param = |key| { - options - .remove(key) - .ok_or(ClientCredsParseError::MissingKey(key)) - }; + use ClientCredsParseError::*; + // Some parameters are absolutely necessary, others not so much. + let mut get_param = |key| options.remove(key).ok_or(MissingKey(key)); + + // Some parameters are stored in the startup message. 
let user = get_param("user")?; let dbname = get_param("database")?; - let project_name = get_param("project").ok(); - let project_name = get_project_name(sni_data, common_name, project_name.as_deref()); + let project_a = get_param("project").ok(); + + // Alternative project name is in fact a subdomain from SNI. + // NOTE: we do not consider SNI if `common_name` is missing. + let project_b = sni + .zip(common_name) + .map(|(sni, cn)| { + // TODO: what if SNI is present but just a common name? + subdomain_from_sni(sni, cn) + .ok_or_else(|| InconsistentSni(sni.to_owned(), cn.to_owned())) + }) + .transpose()?; + + let project = match (project_a, project_b) { + // Invariant: if we have both project name variants, they should match. + (Some(a), Some(b)) if a != b => Some(Err(InconsistentProjectNames(a, b))), + (a, b) => a.or(b).map(|name| { + // Invariant: project name may not contain certain characters. + check_project_name(name).map_err(MalformedProjectName) + }), + } + .transpose()?; Ok(Self { user, dbname, - project_name, + project, }) } +} - /// Use credentials to authenticate the user. - pub async fn authenticate( - self, - config: &ProxyConfig, - client: &mut PqStream, - ) -> super::Result { - // This method is just a convenient facade for `handle_user` - super::backend::handle_user(config, client, self).await +fn check_project_name(name: String) -> Result { + if name.chars().all(|c| c.is_alphanumeric() || c == '-') { + Ok(name) + } else { + Err(name) } } -/// Inferring project name from sni_data. -fn project_name_from_sni_data( - sni_data: &str, - common_name: &str, -) -> Result { - let common_name_with_dot = format!(".{common_name}"); - // check that ".{common_name_with_dot}" is the actual suffix in sni_data - if !sni_data.ends_with(&common_name_with_dot) { - return Err(ClientCredsParseError::InconsistentCommonNameAndSNI( - common_name.to_string(), - sni_data.to_string(), +fn subdomain_from_sni(sni: &str, common_name: &str) -> Option { + sni.strip_suffix(common_name)? + .strip_suffix('.') + .map(str::to_owned) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_options<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> StartupMessageParams { + StartupMessageParams::from(pairs.map(|(k, v)| (k.to_owned(), v.to_owned()))) + } + + #[test] + #[ignore = "TODO: fix how database is handled"] + fn parse_bare_minimum() -> anyhow::Result<()> { + // According to postgresql, only `user` should be required. + let options = make_options([("user", "john_doe")]); + + // TODO: check that `creds.dbname` is None. 
+ let creds = ClientCredentials::parse(options, None, None)?; + assert_eq!(creds.user, "john_doe"); + + Ok(()) + } + + #[test] + fn parse_missing_project() -> anyhow::Result<()> { + let options = make_options([("user", "john_doe"), ("database", "world")]); + + let creds = ClientCredentials::parse(options, None, None)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project, None); + + Ok(()) + } + + #[test] + fn parse_project_from_sni() -> anyhow::Result<()> { + let options = make_options([("user", "john_doe"), ("database", "world")]); + + let sni = Some("foo.localhost"); + let common_name = Some("localhost"); + + let creds = ClientCredentials::parse(options, sni, common_name)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project.as_deref(), Some("foo")); + + Ok(()) + } + + #[test] + fn parse_project_from_options() -> anyhow::Result<()> { + let options = make_options([ + ("user", "john_doe"), + ("database", "world"), + ("project", "bar"), + ]); + + let creds = ClientCredentials::parse(options, None, None)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project.as_deref(), Some("bar")); + + Ok(()) + } + + #[test] + fn parse_projects_identical() -> anyhow::Result<()> { + let options = make_options([ + ("user", "john_doe"), + ("database", "world"), + ("project", "baz"), + ]); + + let sni = Some("baz.localhost"); + let common_name = Some("localhost"); + + let creds = ClientCredentials::parse(options, sni, common_name)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project.as_deref(), Some("baz")); + + Ok(()) + } + + #[test] + fn parse_projects_different() { + let options = make_options([ + ("user", "john_doe"), + ("database", "world"), + ("project", "first"), + ]); + + let sni = Some("second.localhost"); + let common_name = Some("localhost"); + + assert!(matches!( + ClientCredentials::parse(options, sni, common_name).expect_err("should fail"), + ClientCredsParseError::InconsistentProjectNames(_, _) )); } - // return sni_data without the common name suffix. - Ok(sni_data - .strip_suffix(&common_name_with_dot) - .unwrap() - .to_string()) -} - -#[cfg(test)] -mod tests_for_project_name_from_sni_data { - use super::*; - - #[test] - fn passing() { - let target_project_name = "my-project-123"; - let common_name = "localtest.me"; - let sni_data = format!("{target_project_name}.{common_name}"); - assert_eq!( - project_name_from_sni_data(&sni_data, common_name), - Ok(target_project_name.to_string()) - ); - } - - #[test] - fn throws_inconsistent_common_name_and_sni_data() { - let target_project_name = "my-project-123"; - let common_name = "localtest.me"; - let wrong_suffix = "wrongtest.me"; - assert_eq!(common_name.len(), wrong_suffix.len()); - let wrong_common_name = format!("wrong{wrong_suffix}"); - let sni_data = format!("{target_project_name}.{wrong_common_name}"); - assert_eq!( - project_name_from_sni_data(&sni_data, common_name), - Err(ClientCredsParseError::InconsistentCommonNameAndSNI( - common_name.to_string(), - sni_data - )) - ); - } -} - -/// Determine project name from SNI or from project_name parameter from options argument. -fn get_project_name( - sni_data: Option<&str>, - common_name: Option<&str>, - project_name: Option<&str>, -) -> Result { - // determine the project name from sni_data if it exists, otherwise from project_name. 
- let ret = match sni_data { - Some(sni_data) => { - let common_name = common_name.ok_or(ClientCredsParseError::CommonNameNotSet)?; - let project_name_from_sni = project_name_from_sni_data(sni_data, common_name)?; - // check invariant: project name from options and from sni should match - if let Some(project_name) = &project_name { - if !project_name_from_sni.eq(project_name) { - return Err(ClientCredsParseError::InconsistentProjectNameAndSNI( - project_name_from_sni, - project_name.to_string(), - )); - } - } - project_name_from_sni - } - None => project_name - .ok_or(ClientCredsParseError::MissingSNIAndProjectName)? - .to_string(), - }; - - // check formatting invariant: project name must contain only alphanumeric characters and hyphens. - if !ret.chars().all(|x: char| x.is_alphanumeric() || x == '-') { - return Err(ClientCredsParseError::ProjectNameContainsIllegalChars(ret)); - } - - Ok(ret) -} - -#[cfg(test)] -mod tests_for_project_name_only { - use super::*; - - #[test] - fn passing_from_sni_data_only() { - let target_project_name = "my-project-123"; - let common_name = "localtest.me"; - let sni_data = format!("{target_project_name}.{common_name}"); - assert_eq!( - get_project_name(Some(&sni_data), Some(common_name), None), - Ok(target_project_name.to_string()) - ); - } - - #[test] - fn throws_project_name_contains_illegal_chars_from_sni_data_only() { - let project_name_prefix = "my-project"; - let project_name_suffix = "123"; - let common_name = "localtest.me"; - - for illegal_char_id in 0..256 { - let illegal_char = char::from_u32(illegal_char_id).unwrap(); - if !(illegal_char.is_alphanumeric() || illegal_char == '-') - && illegal_char.to_string().len() == 1 - { - let target_project_name = - format!("{project_name_prefix}{illegal_char}{project_name_suffix}"); - let sni_data = format!("{target_project_name}.{common_name}"); - assert_eq!( - get_project_name(Some(&sni_data), Some(common_name), None), - Err(ClientCredsParseError::ProjectNameContainsIllegalChars( - target_project_name - )) - ); - } - } - } - - #[test] - fn passing_from_project_name_only() { - let target_project_name = "my-project-123"; - let common_names = [Some("localtest.me"), None]; - for common_name in common_names { - assert_eq!( - get_project_name(None, common_name, Some(target_project_name)), - Ok(target_project_name.to_string()) - ); - } - } - - #[test] - fn throws_project_name_contains_illegal_chars_from_project_name_only() { - let project_name_prefix = "my-project"; - let project_name_suffix = "123"; - let common_names = [Some("localtest.me"), None]; - - for common_name in common_names { - for illegal_char_id in 0..256 { - let illegal_char: char = char::from_u32(illegal_char_id).unwrap(); - if !(illegal_char.is_alphanumeric() || illegal_char == '-') - && illegal_char.to_string().len() == 1 - { - let target_project_name = - format!("{project_name_prefix}{illegal_char}{project_name_suffix}"); - assert_eq!( - get_project_name(None, common_name, Some(&target_project_name)), - Err(ClientCredsParseError::ProjectNameContainsIllegalChars( - target_project_name - )) - ); - } - } - } - } - - #[test] - fn passing_from_sni_data_and_project_name() { - let target_project_name = "my-project-123"; - let common_name = "localtest.me"; - let sni_data = format!("{target_project_name}.{common_name}"); - assert_eq!( - get_project_name( - Some(&sni_data), - Some(common_name), - Some(target_project_name) - ), - Ok(target_project_name.to_string()) - ); - } - - #[test] - fn throws_inconsistent_project_name_and_sni() { - let 
project_name_param = "my-project-123"; - let wrong_project_name = "not-my-project-123"; - let common_name = "localtest.me"; - let sni_data = format!("{wrong_project_name}.{common_name}"); - assert_eq!( - get_project_name(Some(&sni_data), Some(common_name), Some(project_name_param)), - Err(ClientCredsParseError::InconsistentProjectNameAndSNI( - wrong_project_name.to_string(), - project_name_param.to_string() - )) - ); - } - - #[test] - fn throws_common_name_not_set() { - let target_project_name = "my-project-123"; - let wrong_project_name = "not-my-project-123"; - let common_name = "localtest.me"; - let sni_datas = [ - Some(format!("{wrong_project_name}.{common_name}")), - Some(format!("{target_project_name}.{common_name}")), - ]; - let project_names = [None, Some(target_project_name)]; - for sni_data in sni_datas { - for project_name_param in project_names { - assert_eq!( - get_project_name(sni_data.as_deref(), None, project_name_param), - Err(ClientCredsParseError::CommonNameNotSet) - ); - } - } - } - - #[test] - fn throws_inconsistent_common_name_and_sni_data() { - let target_project_name = "my-project-123"; - let wrong_project_name = "not-my-project-123"; - let common_name = "localtest.me"; - let wrong_suffix = "wrongtest.me"; - assert_eq!(common_name.len(), wrong_suffix.len()); - let wrong_common_name = format!("wrong{wrong_suffix}"); - let sni_datas = [ - Some(format!("{wrong_project_name}.{wrong_common_name}")), - Some(format!("{target_project_name}.{wrong_common_name}")), - ]; - let project_names = [None, Some(target_project_name)]; - for project_name_param in project_names { - for sni_data in &sni_datas { - assert_eq!( - get_project_name(sni_data.as_deref(), Some(common_name), project_name_param), - Err(ClientCredsParseError::InconsistentCommonNameAndSNI( - common_name.to_string(), - sni_data.clone().unwrap().to_string() - )) - ); - } - } - } } diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 7efff13bfc..705f1e3807 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -1,8 +1,7 @@ //! Main authentication flow. -use super::AuthErrorImpl; -use crate::stream::PqStream; -use crate::{sasl, scram}; +use super::{AuthErrorImpl, PasswordHackPayload}; +use crate::{sasl, scram, stream::PqStream}; use std::io; use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; @@ -27,6 +26,17 @@ impl AuthMethod for Scram<'_> { } } +/// Use an ad hoc auth flow (for clients which don't support SNI) proposed in +/// . +pub struct PasswordHack; + +impl AuthMethod for PasswordHack { + #[inline(always)] + fn first_message(&self) -> BeMessage<'_> { + Be::AuthenticationCleartextPassword + } +} + /// This wrapper for [`PqStream`] performs client authentication. #[must_use] pub struct AuthFlow<'a, Stream, State> { @@ -57,13 +67,34 @@ impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { } } +impl AuthFlow<'_, S, PasswordHack> { + /// Perform user authentication. Raise an error in case authentication failed. + pub async fn authenticate(self) -> super::Result { + let msg = self.stream.read_password_message().await?; + let password = msg + .strip_suffix(&[0]) + .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + + // The so-called "password" should contain a base64-encoded json. + // We will use it later to route the client to their project. 
+ let bytes = base64::decode(password) + .map_err(|_| AuthErrorImpl::MalformedPassword("bad encoding"))?; + + let payload = serde_json::from_slice(&bytes) + .map_err(|_| AuthErrorImpl::MalformedPassword("invalid payload"))?; + + Ok(payload) + } +} + /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. pub async fn authenticate(self) -> super::Result { // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; - let sasl = sasl::FirstMessage::parse(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; + let sasl = sasl::FirstMessage::parse(&msg) + .ok_or(AuthErrorImpl::MalformedPassword("bad sasl message"))?; // Currently, the only supported SASL method is SCRAM. if !scram::METHODS.contains(&sasl.method) { diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs new file mode 100644 index 0000000000..6a1258ab31 --- /dev/null +++ b/proxy/src/auth/password_hack.rs @@ -0,0 +1,102 @@ +//! Payload for ad hoc authentication method for clients that don't support SNI. +//! See the `impl` for [`super::backend::BackendType`]. +//! Read more: . + +use serde::{de, Deserialize, Deserializer}; +use std::fmt; + +#[derive(Deserialize)] +#[serde(untagged)] +pub enum Password { + /// A regular string for utf-8 encoded passwords. + Simple { password: String }, + + /// Password is base64-encoded because it may contain arbitrary byte sequences. + Encoded { + #[serde(rename = "password_", deserialize_with = "deserialize_base64")] + password: Vec, + }, +} + +impl AsRef<[u8]> for Password { + fn as_ref(&self) -> &[u8] { + match self { + Password::Simple { password } => password.as_ref(), + Password::Encoded { password } => password.as_ref(), + } + } +} + +#[derive(Deserialize)] +pub struct PasswordHackPayload { + pub project: String, + + #[serde(flatten)] + pub password: Password, +} + +fn deserialize_base64<'a, D: Deserializer<'a>>(des: D) -> Result, D::Error> { + // It's very tempting to replace this with + // + // ``` + // let base64: &str = Deserialize::deserialize(des)?; + // base64::decode(base64).map_err(serde::de::Error::custom) + // ``` + // + // Unfortunately, we can't always deserialize into `&str`, so we'd + // have to use an allocating `String` instead. Thus, visitor is better. 
+ struct Visitor; + + impl<'de> de::Visitor<'de> for Visitor { + type Value = Vec; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("a string") + } + + fn visit_str(self, v: &str) -> Result { + base64::decode(v).map_err(de::Error::custom) + } + } + + des.deserialize_str(Visitor) +} + +#[cfg(test)] +mod tests { + use super::*; + use rstest::rstest; + use serde_json::json; + + #[test] + fn parse_password() -> anyhow::Result<()> { + let password: Password = serde_json::from_value(json!({ + "password": "foo", + }))?; + assert_eq!(password.as_ref(), "foo".as_bytes()); + + let password: Password = serde_json::from_value(json!({ + "password_": base64::encode("foo"), + }))?; + assert_eq!(password.as_ref(), "foo".as_bytes()); + + Ok(()) + } + + #[rstest] + #[case("password", str::to_owned)] + #[case("password_", base64::encode)] + fn parse(#[case] key: &str, #[case] encode: fn(&'static str) -> String) -> anyhow::Result<()> { + let (password, project) = ("password", "pie-in-the-sky"); + let payload = json!({ + "project": project, + key: encode(password), + }); + + let payload: PasswordHackPayload = serde_json::from_value(payload)?; + assert_eq!(payload.password.as_ref(), password.as_bytes()); + assert_eq!(payload.project, project); + + Ok(()) + } +} diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index cccd6e60d4..896ef3588d 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,8 +1,6 @@ -use crate::auth::DatabaseInfo; -use crate::cancellation::CancelClosure; -use crate::error::UserFacingError; -use std::io; -use std::net::SocketAddr; +use crate::{cancellation::CancelClosure, error::UserFacingError}; +use futures::TryFutureExt; +use std::{io, net::SocketAddr}; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::NoTls; @@ -21,44 +19,96 @@ pub enum ConnectionError { FailedToFetchPgVersion, } -impl UserFacingError for ConnectionError {} - -/// PostgreSQL version as [`String`]. -pub type Version = String; +impl UserFacingError for ConnectionError { + fn to_string_client(&self) -> String { + use ConnectionError::*; + match self { + // This helps us drop irrelevant library-specific prefixes. + // TODO: propagate severity level and other parameters. + Postgres(err) => match err.as_db_error() { + Some(err) => err.message().to_string(), + None => err.to_string(), + }, + other => other.to_string(), + } + } +} /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; -/// Compute node connection params. +pub type ComputeConnCfg = tokio_postgres::Config; + +/// Various compute node info for establishing connection etc. pub struct NodeInfo { - pub db_info: DatabaseInfo, - pub scram_keys: Option, + /// Did we send [`utils::pq_proto::BeMessage::AuthenticationOk`]? + pub reported_auth_ok: bool, + /// Compute node connection params. + pub config: tokio_postgres::Config, } impl NodeInfo { async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { - let host_port = (self.db_info.host.as_str(), self.db_info.port); - let socket = TcpStream::connect(host_port).await?; - let socket_addr = socket.peer_addr()?; - socket2::SockRef::from(&socket).set_keepalive(true)?; + use tokio_postgres::config::Host; - Ok((socket_addr, socket)) + let connect_once = |host, port| { + TcpStream::connect((host, port)).and_then(|socket| async { + let socket_addr = socket.peer_addr()?; + // This prevents load balancer from severing the connection. 
+ socket2::SockRef::from(&socket).set_keepalive(true)?; + Ok((socket_addr, socket)) + }) + }; + + // We can't reuse connection establishing logic from `tokio_postgres` here, + // because it has no means for extracting the underlying socket which we + // require for our business. + let mut connection_error = None; + let ports = self.config.get_ports(); + for (i, host) in self.config.get_hosts().iter().enumerate() { + let port = ports.get(i).or_else(|| ports.get(0)).unwrap_or(&5432); + let host = match host { + Host::Tcp(host) => host.as_str(), + Host::Unix(_) => continue, // unix sockets are not welcome here + }; + + // TODO: maybe we should add a timeout. + match connect_once(host, *port).await { + Ok(socket) => return Ok(socket), + Err(err) => { + // We can't throw an error here, as there might be more hosts to try. + println!("failed to connect to compute `{host}:{port}`: {err}"); + connection_error = Some(err); + } + } + } + + Err(connection_error.unwrap_or_else(|| { + io::Error::new( + io::ErrorKind::Other, + format!("couldn't connect: bad compute config: {:?}", self.config), + ) + })) } +} +pub struct PostgresConnection { + /// Socket connected to a compute node. + pub stream: TcpStream, + /// PostgreSQL version of this instance. + pub version: String, +} + +impl NodeInfo { /// Connect to a corresponding compute node. - pub async fn connect(self) -> Result<(TcpStream, Version, CancelClosure), ConnectionError> { - let (socket_addr, mut socket) = self + pub async fn connect(&self) -> Result<(PostgresConnection, CancelClosure), ConnectionError> { + let (socket_addr, mut stream) = self .connect_raw() .await .map_err(|_| ConnectionError::FailedToConnectToCompute)?; - let mut config = tokio_postgres::Config::from(self.db_info); - if let Some(scram_keys) = self.scram_keys { - config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(scram_keys)); - } - // TODO: establish a secure connection to the DB - let (client, conn) = config.connect_raw(&mut socket, NoTls).await?; + let (client, conn) = self.config.connect_raw(&mut stream, NoTls).await?; let version = conn .parameter("server_version") .ok_or(ConnectionError::FailedToFetchPgVersion)? @@ -66,6 +116,8 @@ impl NodeInfo { let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); - Ok((socket, version, cancel_closure)) + let db = PostgresConnection { stream, version }; + + Ok((db, cancel_closure)) } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index df3923de1a..1f01c25734 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,28 +1,16 @@ -use crate::url::ApiUrl; +use crate::{auth, url::ApiUrl}; use anyhow::{bail, ensure, Context}; use std::{str::FromStr, sync::Arc}; -#[derive(Debug)] -pub enum AuthBackendType { - /// Legacy Cloud API (V1). - LegacyConsole, - /// Authentication via a web browser. - Link, - /// Current Cloud API (V2). - Console, - /// Local mock of Cloud API (V2). 
- Postgres, -} - -impl FromStr for AuthBackendType { +impl FromStr for auth::BackendType<()> { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { - use AuthBackendType::*; + use auth::BackendType::*; Ok(match s { - "legacy" => LegacyConsole, - "console" => Console, - "postgres" => Postgres, + "legacy" => LegacyConsole(()), + "console" => Console(()), + "postgres" => Postgres(()), "link" => Link, _ => bail!("Invalid option `{s}` for auth method"), }) @@ -31,7 +19,11 @@ impl FromStr for AuthBackendType { pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: AuthBackendType, + pub auth_backend: auth::BackendType<()>, + pub auth_urls: AuthUrls, +} + +pub struct AuthUrls { pub auth_endpoint: ApiUrl, pub auth_link_uri: ApiUrl, } @@ -87,10 +79,8 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result>) -> io::Error { + io::Error::new(io::ErrorKind::Other, e) +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index b68b2440dd..2521f2af21 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -118,11 +118,15 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?; let http_address: SocketAddr = arg_matches.value_of("http").unwrap().parse()?; + let auth_urls = config::AuthUrls { + auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, + auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, + }; + let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend: arg_matches.value_of("auth-backend").unwrap().parse()?, - auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, - auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, + auth_urls, })); println!("Version: {GIT_VERSION}"); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 7e364b5e9c..f202782109 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -82,11 +82,22 @@ async fn handle_client( } let tls = config.tls_config.as_ref(); - let (stream, creds) = match handshake(stream, tls, cancel_map).await? { + let (mut stream, params) = match handshake(stream, tls, cancel_map).await? { Some(x) => x, None => return Ok(()), // it's a cancellation request }; + let creds = { + let sni = stream.get_ref().sni_hostname(); + let common_name = tls.and_then(|tls| tls.common_name.as_deref()); + let result = config + .auth_backend + .map(|_| auth::ClientCredentials::parse(params, sni, common_name)) + .transpose(); + + async { result }.or_else(|e| stream.throw_error(e)).await? 
+ }; + let client = Client::new(stream, creds); cancel_map .with_session(|session| client.connect_to_db(config, session)) @@ -101,12 +112,10 @@ async fn handshake( stream: S, mut tls: Option<&TlsConfig>, cancel_map: &CancelMap, -) -> anyhow::Result>, auth::ClientCredentials)>> { +) -> anyhow::Result>, StartupMessageParams)>> { // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); - let common_name = tls.and_then(|cfg| cfg.common_name.as_deref()); - let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; @@ -147,18 +156,7 @@ async fn handshake( stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; } - // Get SNI info when available - let sni_data = match stream.get_ref() { - Stream::Tls { tls } => tls.get_ref().1.sni_hostname().map(|s| s.to_owned()), - _ => None, - }; - - // Construct credentials - let creds = - auth::ClientCredentials::parse(params, sni_data.as_deref(), common_name); - let creds = async { creds }.or_else(|e| stream.throw_error(e)).await?; - - break Ok(Some((stream, creds))); + break Ok(Some((stream, params))); } CancelRequest(cancel_key_data) => { cancel_map.cancel_session(cancel_key_data).await?; @@ -174,12 +172,12 @@ struct Client { /// The underlying libpq protocol stream. stream: PqStream, /// Client credentials that we care about. - creds: auth::ClientCredentials, + creds: auth::BackendType, } impl Client { /// Construct a new connection context. - fn new(stream: PqStream, creds: auth::ClientCredentials) -> Self { + fn new(stream: PqStream, creds: auth::BackendType) -> Self { Self { stream, creds } } } @@ -194,16 +192,22 @@ impl Client { let Self { mut stream, creds } = self; // Authenticate and connect to a compute node. - let auth = creds.authenticate(config, &mut stream).await; + let auth = creds.authenticate(&config.auth_urls, &mut stream).await; let node = async { auth }.or_else(|e| stream.throw_error(e)).await?; - let (db, version, cancel_closure) = - node.connect().or_else(|e| stream.throw_error(e)).await?; + let (db, cancel_closure) = node.connect().or_else(|e| stream.throw_error(e)).await?; let cancel_key_data = session.enable_cancellation(cancel_closure); + // Report authentication success if we haven't done this already. + if !node.reported_auth_ok { + stream + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())?; + } + stream .write_message_noflush(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion(&version), + BeParameterStatusMessage::ServerVersion(&db.version), ))? .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? .write_message(&BeMessage::ReadyForQuery) @@ -217,7 +221,7 @@ impl Client { } // Starting from here we only proxy the client's traffic. - let mut db = MetricsStream::new(db, inc_proxied); + let mut db = MetricsStream::new(db.stream, inc_proxied); let mut client = MetricsStream::new(stream.into_inner(), inc_proxied); let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; @@ -279,9 +283,13 @@ mod tests { let config = rustls::ServerConfig::builder() .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![cert], key)?; + .with_single_cert(vec![cert], key)? 
+ .into(); - config.into() + TlsConfig { + config, + common_name: Some(common_name.to_string()), + } }; let client_config = { @@ -297,11 +305,6 @@ mod tests { ClientConfig { config, hostname } }; - let tls_config = TlsConfig { - config: tls_config, - common_name: Some(common_name.to_string()), - }; - Ok((client_config, tls_config)) } @@ -357,7 +360,7 @@ mod tests { auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let cancel_map = CancelMap::default(); - let (mut stream, _creds) = handshake(client, tls.as_ref(), &cancel_map) + let (mut stream, _params) = handshake(client, tls.as_ref(), &cancel_map) .await? .context("handshake failed")?; @@ -436,32 +439,6 @@ mod tests { proxy.await? } - #[tokio::test] - async fn give_user_an_error_for_bad_creds() -> anyhow::Result<()> { - let (client, server) = tokio::io::duplex(1024); - - let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); - - let client_err = tokio_postgres::Config::new() - .ssl_mode(SslMode::Disable) - .connect_raw(server, NoTls) - .await - .err() // -> Option - .context("client shouldn't be able to connect")?; - - // TODO: this is ugly, but `format!` won't allow us to extract fmt string - assert!(client_err.to_string().contains("missing in startup packet")); - - let server_err = proxy - .await? - .err() // -> Option - .context("server shouldn't accept client")?; - - assert!(client_err.to_string().contains(&server_err.to_string())); - - Ok(()) - } - #[tokio::test] async fn keepalive_is_inherited() -> anyhow::Result<()> { use tokio::net::{TcpListener, TcpStream}; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 42b0185fde..54ff8bcc07 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -145,6 +145,14 @@ impl Stream { pub fn from_raw(raw: S) -> Self { Self::Raw { raw } } + + /// Return SNI hostname when it's available. + pub fn sni_hostname(&self) -> Option<&str> { + match self { + Stream::Raw { .. } => None, + Stream::Tls { tls } => tls.get_ref().1.sni_hostname(), + } + } } #[derive(Debug, Error)] diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py index ebeede8df7..92c8475e69 100644 --- a/test_runner/batch_others/test_proxy.py +++ b/test_runner/batch_others/test_proxy.py @@ -1,8 +1,34 @@ import pytest +import json +import base64 def test_proxy_select_1(static_proxy): - static_proxy.safe_psql("select 1;", options="project=generic-project-name") + static_proxy.safe_psql('select 1', options='project=generic-project-name') + + +def test_password_hack(static_proxy): + user = 'borat' + password = 'password' + static_proxy.safe_psql(f"create role {user} with login password '{password}'", + options='project=irrelevant') + + def encode(s: str) -> str: + return base64.b64encode(s.encode('utf-8')).decode('utf-8') + + magic = encode(json.dumps({ + 'project': 'irrelevant', + 'password': password, + })) + + static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic) + + magic = encode(json.dumps({ + 'project': 'irrelevant', + 'password_': encode(password), + })) + + static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic) # Pass extra options to the server. 
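The test_password_hack case above covers clients that cannot send SNI (sslsni=0): the project name and the real password are packed into a base64-encoded JSON object and smuggled through the password field. The proxy-side decoding is not part of this diff; what follows is only a rough sketch of unpacking such a payload, assuming the base64 and serde_json crates and an illustrative PasswordHackPayload struct (not the proxy's actual types).

    use serde::Deserialize;

    /// Illustrative shape of the payload the test sends instead of a plain
    /// password when SNI is unavailable. The field names mirror the JSON
    /// built in the test; the struct itself is an assumption.
    #[derive(Deserialize)]
    struct PasswordHackPayload {
        project: String,
        password: String,
    }

    /// Decode base64(json({"project": ..., "password": ...})) as produced
    /// by the encode() helper in the test above.
    fn parse_password_hack(raw: &[u8]) -> anyhow::Result<PasswordHackPayload> {
        let json = base64::decode(raw)?; // base64 -> raw JSON bytes
        Ok(serde_json::from_slice(&json)?) // JSON bytes -> struct
    }

The test only asserts that "select 1" succeeds over such a connection; how the proxy routes the extracted project name afterwards is outside this diff.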
@@ -11,8 +37,8 @@ def test_proxy_select_1(static_proxy): # See https://github.com/neondatabase/neon/issues/1287 @pytest.mark.xfail def test_proxy_options(static_proxy): - with static_proxy.connect(options="-cproxytest.option=value") as conn: + with static_proxy.connect(options='-cproxytest.option=value') as conn: with conn.cursor() as cur: - cur.execute("SHOW proxytest.option;") + cur.execute('SHOW proxytest.option') value = cur.fetchall()[0][0] assert value == 'value' diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3a6a233208..b1fba29e3b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -30,7 +30,7 @@ from dataclasses import dataclass # Type-related stuff from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import make_dsn, parse_dsn -from typing import Any, Callable, Dict, Iterator, List, Optional, Type, TypeVar, cast, Union, Tuple +from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple from typing_extensions import Literal import requests @@ -280,20 +280,18 @@ class PgProtocol: return str(make_dsn(**self.conn_options(**kwargs))) def conn_options(self, **kwargs): - conn_options = self.default_options.copy() + result = self.default_options.copy() if 'dsn' in kwargs: - conn_options.update(parse_dsn(kwargs['dsn'])) - conn_options.update(kwargs) + result.update(parse_dsn(kwargs['dsn'])) + result.update(kwargs) # Individual statement timeout in seconds. 2 minutes should be # enough for our tests, but if you need a longer, you can # change it by calling "SET statement_timeout" after # connecting. - if 'options' in conn_options: - conn_options['options'] = f"-cstatement_timeout=120s " + conn_options['options'] - else: - conn_options['options'] = "-cstatement_timeout=120s" - return conn_options + options = result.get('options', '') + result['options'] = f'-cstatement_timeout=120s {options}' + return result # autocommit=True here by default because that's what we need most of the time def connect(self, autocommit=True, **kwargs) -> PgConnection: @@ -1514,29 +1512,25 @@ def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]: class NeonProxy(PgProtocol): - def __init__(self, port: int, pg_port: int): - super().__init__(host="127.0.0.1", - user="proxy_user", - password="pytest2", - port=port, - dbname='postgres') - self.http_port = 7001 - self.host = "127.0.0.1" - self.port = port - self.pg_port = pg_port + def __init__(self, proxy_port: int, http_port: int, auth_endpoint: str): + super().__init__(dsn=auth_endpoint, port=proxy_port) + self.host = '127.0.0.1' + self.http_port = http_port + self.proxy_port = proxy_port + self.auth_endpoint = auth_endpoint self._popen: Optional[subprocess.Popen[bytes]] = None def start(self) -> None: assert self._popen is None # Start proxy - bin_proxy = os.path.join(str(neon_binpath), 'proxy') - args = [bin_proxy] - args.extend(["--http", f"{self.host}:{self.http_port}"]) - args.extend(["--proxy", f"{self.host}:{self.port}"]) - args.extend(["--auth-backend", "postgres"]) - args.extend( - ["--auth-endpoint", f"postgres://proxy_auth:pytest1@localhost:{self.pg_port}/postgres"]) + args = [ + os.path.join(str(neon_binpath), 'proxy'), + *["--http", f"{self.host}:{self.http_port}"], + *["--proxy", f"{self.host}:{self.proxy_port}"], + *["--auth-backend", "postgres"], + *["--auth-endpoint", self.auth_endpoint], + ] self._popen = subprocess.Popen(args) self._wait_until_ready() @@ -1557,13 +1551,21 @@ 
class NeonProxy(PgProtocol): @pytest.fixture(scope='function') def static_proxy(vanilla_pg, port_distributor) -> Iterator[NeonProxy]: """Neon proxy that routes directly to vanilla postgres.""" - vanilla_pg.start() - vanilla_pg.safe_psql("create user proxy_auth with password 'pytest1' superuser") - vanilla_pg.safe_psql("create user proxy_user with password 'pytest2'") - port = port_distributor.get_port() - pg_port = vanilla_pg.default_options['port'] - with NeonProxy(port, pg_port) as proxy: + # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql` + vanilla_pg.start() + vanilla_pg.safe_psql("create user proxy with login superuser password 'password'") + + port = vanilla_pg.default_options['port'] + host = vanilla_pg.default_options['host'] + dbname = vanilla_pg.default_options['dbname'] + auth_endpoint = f'postgres://proxy:password@{host}:{port}/{dbname}' + + proxy_port = port_distributor.get_port() + http_port = port_distributor.get_port() + + with NeonProxy(proxy_port=proxy_port, http_port=http_port, + auth_endpoint=auth_endpoint) as proxy: proxy.start() yield proxy From 45680f9a2d36d3c14ed1daa20565d849a53aa80f Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 25 Jul 2022 18:30:30 +0300 Subject: [PATCH 0552/1022] Drop CircleCI runs (#2082) --- .circleci/config.yml | 369 ------------------ .../actions/run-python-test-set/action.yml | 2 +- Dockerfile.compute-tools | 2 +- .../batch_others/test_wal_acceptor_async.py | 5 +- 4 files changed, 4 insertions(+), 374 deletions(-) delete mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 00a51eb906..0000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,369 +0,0 @@ -version: 2.1 - -executors: - neon-xlarge-executor: - resource_class: xlarge - docker: - # NB: when changed, do not forget to update rust image tag in all Dockerfiles - - image: neondatabase/rust:1.58 - neon-executor: - docker: - - image: neondatabase/rust:1.58 - -jobs: - # A job to build postgres - build-postgres: - executor: neon-xlarge-executor - parameters: - build_type: - type: enum - enum: ["debug", "release"] - environment: - BUILD_TYPE: << parameters.build_type >> - steps: - # Checkout the git repo (circleci doesn't have a flag to enable submodules here) - - checkout - - # Grab the postgres git revision to build a cache key. - # Append makefile as it could change the way postgres is built. - # Note this works even though the submodule hasn't been checkout out yet. - - run: - name: Get postgres cache key - command: | - git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres - cat Makefile >> /tmp/cache-key-postgres - - - restore_cache: - name: Restore postgres cache - keys: - # Restore ONLY if the rev key matches exactly - - v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - - # Build postgres if the restore_cache didn't find a build. - # `make` can't figure out whether the cache is valid, since - # it only compares file timestamps. - - run: - name: build postgres - command: | - if [ ! 
-e tmp_install/bin/postgres ]; then - # "depth 1" saves some time by not cloning the whole repo - git submodule update --init --depth 1 - # bail out on any warnings - COPT='-Werror' mold -run make postgres -j$(nproc) - fi - - - save_cache: - name: Save postgres cache - key: v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - paths: - - tmp_install - - # A job to build Neon rust code - build-neon: - executor: neon-xlarge-executor - parameters: - build_type: - type: enum - enum: ["debug", "release"] - environment: - BUILD_TYPE: << parameters.build_type >> - steps: - # Checkout the git repo (without submodules) - - checkout - - # Grab the postgres git revision to build a cache key. - # Append makefile as it could change the way postgres is built. - # Note this works even though the submodule hasn't been checkout out yet. - - run: - name: Get postgres cache key - command: | - git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres - cat Makefile >> /tmp/cache-key-postgres - - - - restore_cache: - name: Restore postgres cache - keys: - # Restore ONLY if the rev key matches exactly - - v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - - - restore_cache: - name: Restore rust cache - keys: - # Require an exact match. While an out of date cache might speed up the build, - # there's no way to clean out old packages, so the cache grows every time something - # changes. - - v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} - - # Build the rust code, including test binaries - - run: - name: Rust build << parameters.build_type >> - command: | - if [[ $BUILD_TYPE == "debug" ]]; then - CARGO_FLAGS= - elif [[ $BUILD_TYPE == "release" ]]; then - CARGO_FLAGS="--release --features profiling" - fi - - export CARGO_INCREMENTAL=0 - export CACHEPOT_BUCKET=zenith-rust-cachepot - export RUSTC_WRAPPER="" - export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" - export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" - mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests - cachepot -s - - - save_cache: - name: Save rust cache - key: v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} - paths: - - ~/.cargo/registry - - ~/.cargo/git - - target - - # Run rust unit tests - - run: - name: cargo test - command: | - if [[ $BUILD_TYPE == "debug" ]]; then - CARGO_FLAGS= - elif [[ $BUILD_TYPE == "release" ]]; then - CARGO_FLAGS=--release - fi - - cargo test $CARGO_FLAGS - - # Install the rust binaries, for use by test jobs - - run: - name: Install rust binaries - command: | - binaries=$( - cargo metadata --format-version=1 --no-deps | - jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' - ) - - mkdir -p /tmp/zenith/bin - mkdir -p /tmp/zenith/test_bin - mkdir -p /tmp/zenith/etc - - # Install target binaries - for bin in $binaries; do - SRC=target/$BUILD_TYPE/$bin - DST=/tmp/zenith/bin/$bin - cp $SRC $DST - done - - # Install the postgres binaries, for use by test jobs - - run: - name: Install postgres binaries - command: | - cp -a tmp_install /tmp/zenith/pg_install - - # Save rust binaries for other jobs in the workflow - - persist_to_workspace: - root: /tmp/zenith - paths: - - "*" - - check-codestyle-python: - executor: neon-executor - steps: - - checkout - - restore_cache: - keys: - - v2-python-deps-{{ checksum "poetry.lock" }} - - run: - name: Install deps - command: ./scripts/pysync - - save_cache: - key: v2-python-deps-{{ checksum "poetry.lock" 
}} - paths: - - /home/circleci/.cache/pypoetry/virtualenvs - - run: - name: Print versions - when: always - command: | - poetry run python --version - poetry show - - run: - name: Run yapf to ensure code format - when: always - command: poetry run yapf --recursive --diff . - - run: - name: Run mypy to check types - when: always - command: poetry run mypy . - - run-pytest: - executor: neon-executor - parameters: - # pytest args to specify the tests to run. - # - # This can be a test file name, e.g. 'test_pgbench.py, or a subdirectory, - # or '-k foobar' to run tests containing string 'foobar'. See pytest man page - # section SPECIFYING TESTS / SELECTING TESTS for details. - # - # Select the type of Rust build. Must be "release" or "debug". - build_type: - type: string - default: "debug" - # This parameter is required, to prevent the mistake of running all tests in one job. - test_selection: - type: string - default: "" - # Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr - extra_params: - type: string - default: "" - needs_postgres_source: - type: boolean - default: false - run_in_parallel: - type: boolean - default: true - save_perf_report: - type: boolean - default: false - environment: - BUILD_TYPE: << parameters.build_type >> - steps: - - attach_workspace: - at: /tmp/zenith - - checkout - - when: - condition: << parameters.needs_postgres_source >> - steps: - - run: git submodule update --init --depth 1 - - restore_cache: - keys: - - v2-python-deps-{{ checksum "poetry.lock" }} - - run: - name: Install deps - command: ./scripts/pysync - - save_cache: - key: v2-python-deps-{{ checksum "poetry.lock" }} - paths: - - /home/circleci/.cache/pypoetry/virtualenvs - - run: - name: Run pytest - # pytest doesn't output test logs in real time, so CI job may fail with - # `Too long with no output` error, if a test is running for a long time. - # In that case, tests should have internal timeouts that are less than - # no_output_timeout, specified here. - no_output_timeout: 10m - environment: - - NEON_BIN: /tmp/zenith/bin - - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install - - TEST_OUTPUT: /tmp/test_output - # this variable will be embedded in perf test report - # and is needed to distinguish different environments - - PLATFORM: zenith-local-ci - command: | - PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)" - rm -rf $PERF_REPORT_DIR - - TEST_SELECTION="test_runner/<< parameters.test_selection >>" - EXTRA_PARAMS="<< parameters.extra_params >>" - if [ -z "$TEST_SELECTION" ]; then - echo "test_selection must be set" - exit 1 - fi - if << parameters.run_in_parallel >>; then - EXTRA_PARAMS="-n4 $EXTRA_PARAMS" - fi - if << parameters.save_perf_report >>; then - if [[ $CIRCLE_BRANCH == "main" ]]; then - mkdir -p "$PERF_REPORT_DIR" - EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" - fi - fi - - export GITHUB_SHA=$CIRCLE_SHA1 - - # Run the tests. - # - # The junit.xml file allows CircleCI to display more fine-grained test information - # in its "Tests" tab in the results page. 
- # --verbose prints name of each test (helpful when there are - # multiple tests in one file) - # -rA prints summary in the end - # -n4 uses four processes to run tests via pytest-xdist - # -s is not used to prevent pytest from capturing output, because tests are running - # in parallel and logs are mixed between different tests - ./scripts/pytest \ - --junitxml=$TEST_OUTPUT/junit.xml \ - --tb=short \ - --verbose \ - -m "not remote_cluster" \ - -rA $TEST_SELECTION $EXTRA_PARAMS - - if << parameters.save_perf_report >>; then - if [[ $CIRCLE_BRANCH == "main" ]]; then - export REPORT_FROM="$PERF_REPORT_DIR" - export REPORT_TO=local - scripts/generate_and_push_perf_report.sh - fi - fi - - run: - # CircleCI artifacts are preserved one file at a time, so skipping - # this step isn't a good idea. If you want to extract the - # pageserver state, perhaps a tarball would be a better idea. - name: Delete all data but logs - when: always - command: | - du -sh /tmp/test_output/* - find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete - du -sh /tmp/test_output/* - - store_artifacts: - path: /tmp/test_output - # The store_test_results step tells CircleCI where to find the junit.xml file. - - store_test_results: - path: /tmp/test_output - # Save data (if any) - - persist_to_workspace: - root: /tmp/zenith - paths: - - "*" - -workflows: - build_and_test: - jobs: - - check-codestyle-python - - build-postgres: - name: build-postgres-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - - build-neon: - name: build-neon-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - requires: - - build-postgres-<< matrix.build_type >> - - run-pytest: - name: pg_regress-tests-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - test_selection: batch_pg_regress - needs_postgres_source: true - requires: - - build-neon-<< matrix.build_type >> - - run-pytest: - name: other-tests-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - test_selection: batch_others - requires: - - build-neon-<< matrix.build_type >> - - run-pytest: - name: benchmarks - context: PERF_TEST_RESULT_CONNSTR - build_type: release - test_selection: performance - run_in_parallel: false - save_perf_report: true - requires: - - build-neon-release diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 0d058d47c1..a956929d92 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -99,7 +99,7 @@ runs: # Run the tests. # - # The junit.xml file allows CircleCI to display more fine-grained test information + # The junit.xml file allows CI tools to display more fine-grained test information # in its "Tests" tab in the results page. 
# --verbose prints name of each test (helpful when there are # multiple tests in one file) diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 87b73e139c..76cbc2ac30 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,5 +1,5 @@ # First transient image to build compute_tools binaries -# NB: keep in sync with rust image version in .circle/config.yml +# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml FROM neondatabase/rust:1.58 AS rust-build # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index 9577c0980e..bf7d8e3645 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -146,9 +146,8 @@ async def run_restarts_under_load(env: NeonEnv, max_transfer=100, period_time=4, iterations=10): - # Set timeout for this test at 5 minutes. It should be enough for test to complete - # and less than CircleCI's no_output_timeout, taking into account that this timeout - # is checked only at the beginning of every iteration. + # Set timeout for this test at 5 minutes. It should be enough for test to complete, + # taking into account that this timeout is checked only at the beginning of every iteration. test_timeout_at = time.monotonic() + 5 * 60 pg_conn = await pg.connect_async() From 28243d68e60ffc7e69f158522f589f7d2e09186d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 26 Jul 2022 09:11:10 +0300 Subject: [PATCH 0553/1022] Yet another apporach of copying logical timeline size during branch creation (#2139) * Yet another apporach of copying logical timeline size during branch creation * Fix unit tests * Update pageserver/src/layered_repository.rs Co-authored-by: Thang Pham * Update pageserver/src/layered_repository.rs Co-authored-by: Thang Pham * Update pageserver/src/layered_repository.rs Co-authored-by: Thang Pham Co-authored-by: Thang Pham --- pageserver/src/layered_repository.rs | 44 +++++++++++++++++++++++++--- pageserver/src/pgdatadir_mapping.rs | 6 ++++ pageserver/src/tenant_mgr.rs | 8 +++-- 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 3830e4c1bd..5c65b5dc7e 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -259,6 +259,7 @@ impl Repository for LayeredRepository { self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, + None, ); timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); @@ -323,6 +324,20 @@ impl Repository for LayeredRepository { )); } } + // Copy logical size from source timeline if we are branching on the last position. + let init_logical_size = + if let Ok(src_pgdir) = tenant_mgr::get_local_timeline_with_load(self.tenant_id, src) { + let logical_size = src_pgdir.get_current_logical_size(); + // Check LSN after getting logical size to exclude race condition + // when ancestor timeline is concurrently updated + if src_timeline.get_last_record_lsn() == start_lsn { + Some(logical_size) + } else { + None + } + } else { + None + }; // Determine prev-LSN for the new timeline. We can only determine it if // the timeline was branched at the current end of the source timeline. 
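The init_logical_size copy above is an optimistic read: the size is sampled from the source timeline first and is only trusted if the source's last_record_lsn still equals the branch point afterwards, because concurrent WAL ingest could have changed the size in between; otherwise the new branch falls back to recomputing its size on first load. A standalone sketch of that check-then-recheck step (the Lsn type and closures are stand-ins, not the pageserver's actual API):

    /// Stand-in LSN type for the sketch; the pageserver has its own Lsn.
    #[derive(Clone, Copy, PartialEq, Eq)]
    struct Lsn(u64);

    /// Sample a value, then re-check the LSN: only reuse the sample if the
    /// source timeline did not advance while we were reading it.
    fn copy_logical_size_if_unchanged(
        read_size: impl Fn() -> usize,
        read_last_record_lsn: impl Fn() -> Lsn,
        branch_lsn: Lsn,
    ) -> Option<usize> {
        let size = read_size();
        if read_last_record_lsn() == branch_lsn {
            Some(size) // safe to seed the new branch with this size
        } else {
            None // raced with WAL ingest; recompute lazily instead
        }
    }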
@@ -353,7 +368,14 @@ impl Repository for LayeredRepository { ); crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; Self::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; - timelines.insert(dst, LayeredTimelineEntry::Unloaded { id: dst, metadata }); + timelines.insert( + dst, + LayeredTimelineEntry::Unloaded { + id: dst, + metadata, + init_logical_size, + }, + ); info!("branched timeline {} from {} at {}", dst, src, start_lsn); @@ -489,7 +511,7 @@ impl Repository for LayeredRepository { // we need to get metadata of a timeline, another option is to pass it along with Downloaded status let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; // finally we make newly downloaded timeline visible to repository - entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, }) + entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, init_logical_size: None }) }, }; Ok(()) @@ -506,6 +528,7 @@ enum LayeredTimelineEntry { Unloaded { id: ZTimelineId, metadata: TimelineMetadata, + init_logical_size: Option, }, } @@ -673,13 +696,18 @@ impl LayeredRepository { timelineid: ZTimelineId, timelines: &mut HashMap, ) -> anyhow::Result>> { + let logical_size: Option; match timelines.get(&timelineid) { Some(entry) => match entry { LayeredTimelineEntry::Loaded(local_timeline) => { debug!("timeline {} found loaded into memory", &timelineid); return Ok(Some(Arc::clone(local_timeline))); } - LayeredTimelineEntry::Unloaded { .. } => {} + LayeredTimelineEntry::Unloaded { + init_logical_size, .. + } => { + logical_size = *init_logical_size; + } }, None => { debug!("timeline {} not found", &timelineid); @@ -690,7 +718,7 @@ impl LayeredRepository { "timeline {} found on a local disk, but not loaded into the memory, loading", &timelineid ); - let timeline = self.load_local_timeline(timelineid, timelines)?; + let timeline = self.load_local_timeline(timelineid, timelines, logical_size)?; let was_loaded = timelines.insert( timelineid, LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), @@ -707,6 +735,7 @@ impl LayeredRepository { &self, timeline_id: ZTimelineId, timelines: &mut HashMap, + init_logical_size: Option, ) -> anyhow::Result> { let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) .context("failed to load metadata")?; @@ -733,6 +762,7 @@ impl LayeredRepository { self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, + init_logical_size, ); timeline .load_layer_map(disk_consistent_lsn) @@ -1099,6 +1129,10 @@ pub struct LayeredTimeline { // It can be unified with latest_gc_cutoff_lsn under some "first_valid_lsn", // though lets keep them both for better error visibility. initdb_lsn: Lsn, + + // Initial logical size of timeline (if known). 
+ // Logical size can be copied from ancestor timeline when new branch is create at last LSN + pub init_logical_size: Option, } /// @@ -1299,6 +1333,7 @@ impl LayeredTimeline { tenant_id: ZTenantId, walredo_mgr: Arc, upload_layers: bool, + init_logical_size: Option, ) -> LayeredTimeline { let reconstruct_time_histo = RECONSTRUCT_TIME .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) @@ -1377,6 +1412,7 @@ impl LayeredTimeline { latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), + init_logical_size, } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 788c9de29e..f703fa16af 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -76,6 +76,12 @@ impl DatadirTimeline { Ok(()) } + /// Set timeline logical size. + pub fn set_logical_size(&self, size: usize) { + self.current_logical_size + .store(size as isize, Ordering::SeqCst); + } + /// Start ingesting a WAL record, or other atomic modification of /// the timeline. /// diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 1759d3bbb8..a485e7c2cb 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -494,12 +494,16 @@ fn load_local_timeline( format!("Inmem timeline {timeline_id} not found in tenant's repository") })?; let repartition_distance = repo.get_checkpoint_distance() / 10; + let init_logical_size = inmem_timeline.init_logical_size; let page_tline = Arc::new(DatadirTimelineImpl::new( inmem_timeline, repartition_distance, )); - page_tline.init_logical_size()?; - + if let Some(logical_size) = init_logical_size { + page_tline.set_logical_size(logical_size); + } else { + page_tline.init_logical_size()?; + } tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach { id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id), datadir: Arc::clone(&page_tline), From 172314155e4bedd17904cf9eb3b49598fc3abfd1 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 26 Jul 2022 00:59:14 +0300 Subject: [PATCH 0554/1022] Compact only once on psql checkpoint call --- pageserver/src/page_service.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 078edc5c9f..3dba207ab9 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1159,14 +1159,9 @@ impl postgres_backend::Handler for PageServerHandler { let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) .context("Cannot load local timeline")?; + // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). timeline.tline.checkpoint(CheckpointConfig::Forced)?; - // Also compact it. - // - // FIXME: This probably shouldn't be part of a "checkpoint" command, but a - // separate operation. Update the tests if you change this. - timeline.tline.compact()?; - pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("get_lsn_by_timestamp ") { From d301b8364cef3b2884b78ad6369e7587d9389b5f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 25 Jul 2022 23:23:35 +0300 Subject: [PATCH 0555/1022] Move LayeredTimeline and related code to separate source file. The layered_repository.rs file had grown to be very large. Split off the LayeredTimeline struct and related code to a separate source file to make it more manageable. 
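Mechanically the split is a new timeline submodule plus re-exports from layered_repository.rs, so code outside the module keeps importing the same paths. A minimal self-contained illustration of the pattern (the names below are illustrative, not the actual pageserver layout; the real hunks follow later in this patch):

    mod repository {
        // Items move into a private submodule (in the real patch this is
        // layered_repository/timeline.rs, declared with "mod timeline;").
        mod timeline {
            pub struct Timeline {
                pub id: u64,
            }

            pub fn save_metadata(t: &Timeline) -> String {
                format!("metadata for timeline {}", t.id)
            }
        }

        // Re-export the moved items so callers of repository::save_metadata
        // and repository::Timeline do not have to change in the same commit.
        pub use self::timeline::{save_metadata, Timeline};
    }

    fn main() {
        let t = repository::Timeline { id: 7 };
        println!("{}", repository::save_metadata(&t));
    }

The call sites in main() stay untouched, which is the same property the re-exports in this commit preserve for modules such as storage_sync.rs.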
There are plans to move much of the code to track timelines from tenant_mgr.rs to LayeredRepository. That will make layered_repository.rs grow again, so now is a good time to split it. There's a lot more cleanup to do, but this commit intentionally only moves existing code and avoids doing anything else, for easier review. --- pageserver/src/layered_repository.rs | 2032 +---------------- .../src/layered_repository/layer_map.rs | 2 +- pageserver/src/layered_repository/timeline.rs | 2021 ++++++++++++++++ pageserver/src/storage_sync.rs | 9 +- 4 files changed, 2057 insertions(+), 2007 deletions(-) create mode 100644 pageserver/src/layered_repository/timeline.rs diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 5c65b5dc7e..ff230ed3c3 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -11,52 +11,36 @@ //! parent timeline, and the last LSN that has been written to disk. //! -use anyhow::{anyhow, bail, ensure, Context, Result}; -use bytes::Bytes; -use fail::fail_point; -use itertools::Itertools; -use lazy_static::lazy_static; +use anyhow::{bail, ensure, Context, Result}; use tracing::*; -use std::cmp::{max, min, Ordering}; +use std::cmp::min; use std::collections::hash_map::Entry; +use std::collections::BTreeSet; use std::collections::HashMap; -use std::collections::{BTreeSet, HashSet}; use std::fs; -use std::fs::{File, OpenOptions}; -use std::io::Write; +use std::fs::File; use std::num::NonZeroU64; -use std::ops::{Bound::Included, Deref, Range}; -use std::path::{Path, PathBuf}; -use std::sync::atomic::{self, AtomicBool}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError}; -use std::time::{Duration, Instant, SystemTime}; +use std::ops::Bound::Included; +use std::path::Path; +use std::sync::{Arc, Mutex, RwLock}; +use std::time::{Duration, Instant}; -use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; +use self::metadata::{metadata_path, TimelineMetadata}; use crate::config::PageServerConf; -use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline, TimelineWriter}; -use crate::repository::{Key, Value}; +use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline}; use crate::tenant_mgr; use crate::thread_mgr; -use crate::virtual_file::VirtualFile; -use crate::walreceiver::IS_WAL_RECEIVER; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; -use crate::{page_cache, storage_sync}; -use metrics::{ - register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, - Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, -}; use toml_edit; use utils::{ crashsafe_dir, - lsn::{AtomicLsn, Lsn, RecordLsn}, - seqwait::SeqWait, + lsn::{Lsn, RecordLsn}, zid::{ZTenantId, ZTimelineId}, }; @@ -73,78 +57,16 @@ pub mod metadata; mod par_fsync; mod storage_layer; -use crate::pgdatadir_mapping::LsnForTimestamp; -use delta_layer::{DeltaLayer, DeltaLayerWriter}; -use ephemeral_file::is_ephemeral_file; -use filename::{DeltaFileName, ImageFileName}; -use image_layer::{ImageLayer, ImageLayerWriter}; -use inmemory_layer::InMemoryLayer; -use layer_map::LayerMap; -use layer_map::SearchResult; -use postgres_ffi::xlog_utils::to_pg_timestamp; -use storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +mod timeline; + +use 
storage_layer::Layer; +use timeline::{LayeredTimeline, LayeredTimelineEntry}; // re-export this function so that page_cache.rs can use it. pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; -// Metrics collected on operations on the storage repository. -lazy_static! { - static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( - "pageserver_storage_operations_seconds", - "Time spent on storage operations", - &["operation", "tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - -// Metrics collected on operations on the storage repository. -lazy_static! { - static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( - "pageserver_getpage_reconstruct_seconds", - "Time spent in reconstruct_value", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - -lazy_static! { - static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!( - "pageserver_materialized_cache_hits_total", - "Number of cache hits from materialized page cache", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); - static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!( - "pageserver_wait_lsn_seconds", - "Time spent waiting for WAL to arrive", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - -lazy_static! { - static ref LAST_RECORD_LSN: IntGaugeVec = register_int_gauge_vec!( - "pageserver_last_record_lsn", - "Last record LSN grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - -// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, -// or in testing they estimate how much we would upload if we did. -lazy_static! { - static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!( - "pageserver_created_persistent_files_total", - "Number of files created that are meant to be uploaded to cloud storage", - ) - .expect("failed to define a metric"); - static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!( - "pageserver_written_persistent_bytes_total", - "Total bytes written that are meant to be uploaded to cloud storage", - ) - .expect("failed to define a metric"); -} +// re-export for use in storage_sync.rs +pub use crate::layered_repository::timeline::save_metadata; /// Parts of the `.neon/tenants//timelines/` directory prefix. 
pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; @@ -248,7 +170,7 @@ impl Repository for LayeredRepository { crashsafe_dir::create_dir_all(timeline_path)?; let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - Self::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; + timeline::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; let timeline = LayeredTimeline::new( self.conf, @@ -367,7 +289,7 @@ impl Repository for LayeredRepository { src_timeline.initdb_lsn, ); crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; - Self::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; + timeline::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; timelines.insert( dst, LayeredTimelineEntry::Unloaded { @@ -396,7 +318,7 @@ impl Repository for LayeredRepository { .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); - STORAGE_TIME + timeline::STORAGE_TIME .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) .observe_closure_duration(|| { self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc) @@ -522,73 +444,6 @@ impl Repository for LayeredRepository { } } -#[derive(Clone)] -enum LayeredTimelineEntry { - Loaded(Arc), - Unloaded { - id: ZTimelineId, - metadata: TimelineMetadata, - init_logical_size: Option, - }, -} - -impl LayeredTimelineEntry { - fn timeline_id(&self) -> ZTimelineId { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline.timeline_id, - LayeredTimelineEntry::Unloaded { id, .. } => *id, - } - } - - fn ancestor_timeline_id(&self) -> Option { - match self { - LayeredTimelineEntry::Loaded(timeline) => { - timeline.ancestor_timeline.as_ref().map(|t| t.timeline_id()) - } - LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_timeline(), - } - } - - fn ancestor_lsn(&self) -> Lsn { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline.ancestor_lsn, - LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_lsn(), - } - } - - fn ensure_loaded(&self) -> anyhow::Result<&Arc> { - match self { - LayeredTimelineEntry::Loaded(timeline) => Ok(timeline), - LayeredTimelineEntry::Unloaded { .. } => { - anyhow::bail!("timeline is unloaded") - } - } - } - - fn layer_removal_guard(&self) -> Result>, anyhow::Error> { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline - .layer_removal_cs - .try_lock() - .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) - .map(Some), - - LayeredTimelineEntry::Unloaded { .. } => Ok(None), - } - } -} - -impl From for RepositoryTimeline { - fn from(entry: LayeredTimelineEntry) -> Self { - match entry { - LayeredTimelineEntry::Loaded(timeline) => RepositoryTimeline::Loaded(timeline as _), - LayeredTimelineEntry::Unloaded { metadata, .. 
} => { - RepositoryTimeline::Unloaded { metadata } - } - } - } -} - /// Private functions impl LayeredRepository { pub fn get_checkpoint_distance(&self) -> u64 { @@ -857,42 +712,6 @@ impl LayeredRepository { }) } - /// Save timeline metadata to file - pub fn save_metadata( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - data: &TimelineMetadata, - first_save: bool, - ) -> Result<()> { - let _enter = info_span!("saving metadata").entered(); - let path = metadata_path(conf, timelineid, tenantid); - // use OpenOptions to ensure file presence is consistent with first_save - let mut file = VirtualFile::open_with_options( - &path, - OpenOptions::new().write(true).create_new(first_save), - )?; - - let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?; - - if file.write(&metadata_bytes)? != metadata_bytes.len() { - bail!("Could not write all the metadata bytes in a single call"); - } - file.sync_all()?; - - // fsync the parent directory to ensure the directory entry is durable - if first_save { - let timeline_dir = File::open( - &path - .parent() - .expect("Metadata should always have a parent dir"), - )?; - timeline_dir.sync_all()?; - } - - Ok(()) - } - // // How garbage collection works: // @@ -1044,1787 +863,6 @@ impl LayeredRepository { } } -pub struct LayeredTimeline { - conf: &'static PageServerConf, - tenant_conf: Arc>, - - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - - layers: RwLock, - - last_freeze_at: AtomicLsn, - - // WAL redo manager - walredo_mgr: Arc, - - // What page versions do we hold in the repository? If we get a - // request > last_record_lsn, we need to wait until we receive all - // the WAL up to the request. The SeqWait provides functions for - // that. TODO: If we get a request for an old LSN, such that the - // versions have already been garbage collected away, we should - // throw an error, but we don't track that currently. - // - // last_record_lsn.load().last points to the end of last processed WAL record. - // - // We also remember the starting point of the previous record in - // 'last_record_lsn.load().prev'. It's used to set the xl_prev pointer of the - // first WAL record when the node is started up. But here, we just - // keep track of it. - last_record_lsn: SeqWait, - - // All WAL records have been processed and stored durably on files on - // local disk, up to this LSN. On crash and restart, we need to re-process - // the WAL starting from this point. - // - // Some later WAL records might have been processed and also flushed to disk - // already, so don't be surprised to see some, but there's no guarantee on - // them yet. - disk_consistent_lsn: AtomicLsn, - - // Parent timeline that this timeline was branched from, and the LSN - // of the branch point. - ancestor_timeline: Option, - ancestor_lsn: Lsn, - - // Metrics - reconstruct_time_histo: Histogram, - materialized_page_cache_hit_counter: IntCounter, - flush_time_histo: Histogram, - compact_time_histo: Histogram, - create_images_time_histo: Histogram, - last_record_gauge: IntGauge, - wait_lsn_time_histo: Histogram, - - /// If `true`, will backup its files that appear after each checkpointing to the remote storage. - upload_layers: AtomicBool, - - /// Ensures layers aren't frozen by checkpointer between - /// [`LayeredTimeline::get_layer_for_write`] and layer reads. - /// Locked automatically by [`LayeredTimelineWriter`] and checkpointer. - /// Must always be acquired before the layer map/individual layer lock - /// to avoid deadlock. 
- write_lock: Mutex<()>, - - /// Used to ensure that there is only one thread - layer_flush_lock: Mutex<()>, - - /// Layer removal lock. - /// A lock to ensure that no layer of the timeline is removed concurrently by other threads. - /// This lock is acquired in [`LayeredTimeline::gc`], [`LayeredTimeline::compact`], - /// and [`LayeredRepository::delete_timeline`]. - layer_removal_cs: Mutex<()>, - - // Needed to ensure that we can't create a branch at a point that was already garbage collected - latest_gc_cutoff_lsn: RwLock, - - // List of child timelines and their branch points. This is needed to avoid - // garbage collecting data that is still needed by the child timelines. - gc_info: RwLock, - - // It may change across major versions so for simplicity - // keep it after running initdb for a timeline. - // It is needed in checks when we want to error on some operations - // when they are requested for pre-initdb lsn. - // It can be unified with latest_gc_cutoff_lsn under some "first_valid_lsn", - // though lets keep them both for better error visibility. - initdb_lsn: Lsn, - - // Initial logical size of timeline (if known). - // Logical size can be copied from ancestor timeline when new branch is create at last LSN - pub init_logical_size: Option, -} - -/// -/// Information about how much history needs to be retained, needed by -/// Garbage Collection. -/// -struct GcInfo { - /// Specific LSNs that are needed. - /// - /// Currently, this includes all points where child branches have - /// been forked off from. In the future, could also include - /// explicit user-defined snapshot points. - retain_lsns: Vec, - - /// In addition to 'retain_lsns', keep everything newer than this - /// point. - /// - /// This is calculated by subtracting 'gc_horizon' setting from - /// last-record LSN - /// - /// FIXME: is this inclusive or exclusive? - horizon_cutoff: Lsn, - - /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this - /// point. - /// - /// This is calculated by finding a number such that a record is needed for PITR - /// if only if its LSN is larger than 'pitr_cutoff'. - pitr_cutoff: Lsn, -} - -/// Public interface functions -impl Timeline for LayeredTimeline { - fn get_ancestor_lsn(&self) -> Lsn { - self.ancestor_lsn - } - - fn get_ancestor_timeline_id(&self) -> Option { - self.ancestor_timeline - .as_ref() - .map(LayeredTimelineEntry::timeline_id) - } - - /// Wait until WAL has been received up to the given LSN. - fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { - // This should never be called from the WAL receiver thread, because that could lead - // to a deadlock. - ensure!( - !IS_WAL_RECEIVER.with(|c| c.get()), - "wait_lsn called by WAL receiver thread" - ); - - self.wait_lsn_time_histo.observe_closure_duration( - || self.last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .with_context(|| { - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - }))?; - - Ok(()) - } - - fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard { - self.latest_gc_cutoff_lsn.read().unwrap() - } - - /// Look up the value with the given a key - fn get(&self, key: Key, lsn: Lsn) -> Result { - debug_assert!(lsn <= self.get_last_record_lsn()); - - // Check the page cache. We will get back the most recent page with lsn <= `lsn`. 
- // The cached image can be returned directly if there is no WAL between the cached image - // and requested LSN. The cached image can also be used to reduce the amount of WAL needed - // for redo. - let cached_page_img = match self.lookup_cached_page(&key, lsn) { - Some((cached_lsn, cached_img)) => { - match cached_lsn.cmp(&lsn) { - Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image - Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn - } - Some((cached_lsn, cached_img)) - } - None => None, - }; - - let mut reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: cached_page_img, - }; - - self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; - - self.reconstruct_time_histo - .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) - } - - /// Public entry point for checkpoint(). All the logic is in the private - /// checkpoint_internal function, this public facade just wraps it for - /// metrics collection. - fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { - match cconf { - CheckpointConfig::Flush => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers(true) - } - CheckpointConfig::Forced => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers(true)?; - self.compact() - } - } - } - - /// - /// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn. - /// - fn check_lsn_is_in_scope( - &self, - lsn: Lsn, - latest_gc_cutoff_lsn: &RwLockReadGuard, - ) -> Result<()> { - ensure!( - lsn >= **latest_gc_cutoff_lsn, - "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", - lsn, - **latest_gc_cutoff_lsn, - ); - Ok(()) - } - - fn get_last_record_lsn(&self) -> Lsn { - self.last_record_lsn.load().last - } - - fn get_prev_record_lsn(&self) -> Lsn { - self.last_record_lsn.load().prev - } - - fn get_last_record_rlsn(&self) -> RecordLsn { - self.last_record_lsn.load() - } - - fn get_disk_consistent_lsn(&self) -> Lsn { - self.disk_consistent_lsn.load() - } - - fn writer<'a>(&'a self) -> Box { - Box::new(LayeredTimelineWriter { - tl: self, - _write_guard: self.write_lock.lock().unwrap(), - }) - } -} - -impl LayeredTimeline { - fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .checkpoint_distance - .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) - } - - fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .compaction_target_size - .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) - } - - fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .compaction_threshold - .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) - } - - fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .image_creation_threshold - .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) - } - - /// Open a Timeline handle. - /// - /// Loads the metadata for the timeline into memory, but not the layer map. 
- #[allow(clippy::too_many_arguments)] - fn new( - conf: &'static PageServerConf, - tenant_conf: Arc>, - metadata: TimelineMetadata, - ancestor: Option, - timeline_id: ZTimelineId, - tenant_id: ZTenantId, - walredo_mgr: Arc, - upload_layers: bool, - init_logical_size: Option, - ) -> LayeredTimeline { - let reconstruct_time_histo = RECONSTRUCT_TIME - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let flush_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "layer flush", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let compact_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "compact", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let create_images_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "create images", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let last_record_gauge = LAST_RECORD_LSN - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let wait_lsn_time_histo = WAIT_LSN_TIME - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - - LayeredTimeline { - conf, - tenant_conf, - timeline_id, - tenant_id, - layers: RwLock::new(LayerMap::default()), - - walredo_mgr, - - // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. - last_record_lsn: SeqWait::new(RecordLsn { - last: metadata.disk_consistent_lsn(), - prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), - }), - disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), - - last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0), - - ancestor_timeline: ancestor, - ancestor_lsn: metadata.ancestor_lsn(), - - reconstruct_time_histo, - materialized_page_cache_hit_counter, - flush_time_histo, - compact_time_histo, - create_images_time_histo, - last_record_gauge, - wait_lsn_time_histo, - - upload_layers: AtomicBool::new(upload_layers), - - write_lock: Mutex::new(()), - layer_flush_lock: Mutex::new(()), - layer_removal_cs: Mutex::new(()), - - gc_info: RwLock::new(GcInfo { - retain_lsns: Vec::new(), - horizon_cutoff: Lsn(0), - pitr_cutoff: Lsn(0), - }), - - latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), - initdb_lsn: metadata.initdb_lsn(), - init_logical_size, - } - } - - /// - /// Scan the timeline directory to populate the layer map. - /// Returns all timeline-related files that were found and loaded. - /// - fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { - let mut layers = self.layers.write().unwrap(); - let mut num_layers = 0; - - // Scan timeline directory and create ImageFileName and DeltaFilename - // structs representing all files on disk - let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); - - for direntry in fs::read_dir(timeline_path)? { - let direntry = direntry?; - let fname = direntry.file_name(); - let fname = fname.to_string_lossy(); - - if let Some(imgfilename) = ImageFileName::parse_str(&fname) { - // create an ImageLayer struct for each image file. 
- if imgfilename.lsn > disk_consistent_lsn { - warn!( - "found future image layer {} on timeline {} disk_consistent_lsn is {}", - imgfilename, self.timeline_id, disk_consistent_lsn - ); - - rename_to_backup(direntry.path())?; - continue; - } - - let layer = - ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); - - trace!("found layer {}", layer.filename().display()); - layers.insert_historic(Arc::new(layer)); - num_layers += 1; - } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { - // Create a DeltaLayer struct for each delta file. - // The end-LSN is exclusive, while disk_consistent_lsn is - // inclusive. For example, if disk_consistent_lsn is 100, it is - // OK for a delta layer to have end LSN 101, but if the end LSN - // is 102, then it might not have been fully flushed to disk - // before crash. - if deltafilename.lsn_range.end > disk_consistent_lsn + 1 { - warn!( - "found future delta layer {} on timeline {} disk_consistent_lsn is {}", - deltafilename, self.timeline_id, disk_consistent_lsn - ); - - rename_to_backup(direntry.path())?; - continue; - } - - let layer = - DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); - - trace!("found layer {}", layer.filename().display()); - layers.insert_historic(Arc::new(layer)); - num_layers += 1; - } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { - // ignore these - } else if is_ephemeral_file(&fname) { - // Delete any old ephemeral files - trace!("deleting old ephemeral file in timeline dir: {}", fname); - fs::remove_file(direntry.path())?; - } else { - warn!("unrecognized filename in timeline dir: {}", fname); - } - } - - layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); - - info!( - "loaded layer map with {} layers at {}", - num_layers, disk_consistent_lsn - ); - - Ok(()) - } - - /// - /// Get a handle to a Layer for reading. - /// - /// The returned Layer might be from an ancestor timeline, if the - /// segment hasn't been updated on this timeline yet. - /// - /// This function takes the current timeline's locked LayerMap as an argument, - /// so callers can avoid potential race conditions. - fn get_reconstruct_data( - &self, - key: Key, - request_lsn: Lsn, - reconstruct_state: &mut ValueReconstructState, - ) -> anyhow::Result<()> { - // Start from the current timeline. - let mut timeline_owned; - let mut timeline = self; - - // For debugging purposes, collect the path of layers that we traversed - // through. It's included in the error message if we fail to find the key. - let mut traversal_path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); - - let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { - *cached_lsn - } else { - Lsn(0) - }; - - // 'prev_lsn' tracks the last LSN that we were at in our search. It's used - // to check that each iteration make some progress, to break infinite - // looping if something goes wrong. - let mut prev_lsn = Lsn(u64::MAX); - - let mut result = ValueReconstructResult::Continue; - let mut cont_lsn = Lsn(request_lsn.0 + 1); - - 'outer: loop { - // The function should have updated 'state' - //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); - match result { - ValueReconstructResult::Complete => return Ok(()), - ValueReconstructResult::Continue => { - // If we reached an earlier cached page image, we're done. 
- if cont_lsn == cached_lsn + 1 { - self.materialized_page_cache_hit_counter.inc_by(1); - return Ok(()); - } - if prev_lsn <= cont_lsn { - // Didn't make any progress in last iteration. Error out to avoid - // getting stuck in the loop. - return layer_traversal_error(format!( - "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", - key, - Lsn(cont_lsn.0 - 1), - request_lsn, - timeline.ancestor_lsn - ), traversal_path); - } - prev_lsn = cont_lsn; - } - ValueReconstructResult::Missing => { - return layer_traversal_error( - format!( - "could not find data for key {} at LSN {}, for request at LSN {}", - key, cont_lsn, request_lsn - ), - traversal_path, - ); - } - } - - // Recurse into ancestor if needed - if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - trace!( - "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, - cont_lsn - ); - let ancestor = timeline.get_ancestor_timeline()?; - timeline_owned = ancestor; - timeline = &*timeline_owned; - prev_lsn = Lsn(u64::MAX); - continue; - } - - let layers = timeline.layers.read().unwrap(); - - // Check the open and frozen in-memory layers first, in order from newest - // to oldest. - if let Some(open_layer) = &layers.open_layer { - let start_lsn = open_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. - let lsn_floor = max(cached_lsn + 1, start_lsn); - result = open_layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - )?; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, open_layer.clone())); - continue; - } - } - for frozen_layer in layers.frozen_layers.iter().rev() { - let start_lsn = frozen_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); - let lsn_floor = max(cached_lsn + 1, start_lsn); - result = frozen_layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - )?; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, frozen_layer.clone())); - continue 'outer; - } - } - - if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? { - //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); - - let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - )?; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, layer)); - } else if timeline.ancestor_timeline.is_some() { - // Nothing on this timeline. Traverse to parent - result = ValueReconstructResult::Continue; - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); - } else { - // Nothing found - result = ValueReconstructResult::Missing; - } - } - } - - fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> { - let cache = page_cache::get(); - - // FIXME: It's pointless to check the cache for things that are not 8kB pages. 
- // We should look at the key to determine if it's a cacheable object - let (lsn, read_guard) = - cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?; - let img = Bytes::from(read_guard.to_vec()); - Some((lsn, img)) - } - - fn get_ancestor_timeline(&self) -> Result> { - let ancestor = self - .ancestor_timeline - .as_ref() - .with_context(|| { - format!( - "Ancestor is missing. Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })? - .ensure_loaded() - .with_context(|| { - format!( - "Ancestor timeline is not loaded. Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })?; - Ok(Arc::clone(ancestor)) - } - - /// - /// Get a handle to the latest layer for appending. - /// - fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { - let mut layers = self.layers.write().unwrap(); - - ensure!(lsn.is_aligned()); - - let last_record_lsn = self.get_last_record_lsn(); - ensure!( - lsn > last_record_lsn, - "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", - lsn, - last_record_lsn, - ); - - // Do we have a layer open for writing already? - let layer; - if let Some(open_layer) = &layers.open_layer { - if open_layer.get_lsn_range().start > lsn { - bail!("unexpected open layer in the future"); - } - - layer = Arc::clone(open_layer); - } else { - // No writeable layer yet. Create one. - let start_lsn = layers.next_open_layer_at.unwrap(); - - trace!( - "creating layer for write at {}/{} for record at {}", - self.timeline_id, - start_lsn, - lsn - ); - let new_layer = - InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?; - let layer_rc = Arc::new(new_layer); - - layers.open_layer = Some(Arc::clone(&layer_rc)); - layers.next_open_layer_at = None; - - layer = layer_rc; - } - Ok(layer) - } - - fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { - //info!("PUT: key {} at {}", key, lsn); - let layer = self.get_layer_for_write(lsn)?; - layer.put_value(key, lsn, val)?; - Ok(()) - } - - fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> Result<()> { - let layer = self.get_layer_for_write(lsn)?; - layer.put_tombstone(key_range, lsn)?; - - Ok(()) - } - - fn finish_write(&self, new_lsn: Lsn) { - assert!(new_lsn.is_aligned()); - - self.last_record_gauge.set(new_lsn.0 as i64); - self.last_record_lsn.advance(new_lsn); - } - - fn freeze_inmem_layer(&self, write_lock_held: bool) { - // Freeze the current open in-memory layer. It will be written to disk on next - // iteration. - let _write_guard = if write_lock_held { - None - } else { - Some(self.write_lock.lock().unwrap()) - }; - let mut layers = self.layers.write().unwrap(); - if let Some(open_layer) = &layers.open_layer { - let open_layer_rc = Arc::clone(open_layer); - // Does this layer need freezing? - let end_lsn = Lsn(self.get_last_record_lsn().0 + 1); - open_layer.freeze(end_lsn); - - // The layer is no longer open, update the layer map to reflect this. - // We will replace it with on-disk historics below. - layers.frozen_layers.push_back(open_layer_rc); - layers.open_layer = None; - layers.next_open_layer_at = Some(end_lsn); - self.last_freeze_at.store(end_lsn); - } - drop(layers); - } - - /// - /// Check if more than 'checkpoint_distance' of WAL has been accumulated - /// in the in-memory layer, and initiate flushing it if so. 
- /// - pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { - let last_lsn = self.get_last_record_lsn(); - let layers = self.layers.read().unwrap(); - if let Some(open_layer) = &layers.open_layer { - let open_layer_size = open_layer.size()?; - drop(layers); - let distance = last_lsn.widening_sub(self.last_freeze_at.load()); - // Checkpointing the open layer can be triggered by layer size or LSN range. - // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and - // we want to stay below that with a big margin. The LSN distance determines how - // much WAL the safekeepers need to store. - if distance >= self.get_checkpoint_distance().into() - || open_layer_size > self.get_checkpoint_distance() - { - info!( - "check_checkpoint_distance {}, layer size {}", - distance, open_layer_size - ); - - self.freeze_inmem_layer(true); - self.last_freeze_at.store(last_lsn); - - // Launch a thread to flush the frozen layer to disk, unless - // a thread was already running. (If the thread was running - // at the time that we froze the layer, it must've seen the - // the layer we just froze before it exited; see comments - // in flush_frozen_layers()) - if let Ok(guard) = self.layer_flush_lock.try_lock() { - drop(guard); - let self_clone = Arc::clone(self); - thread_mgr::spawn( - thread_mgr::ThreadKind::LayerFlushThread, - Some(self.tenant_id), - Some(self.timeline_id), - "layer flush thread", - false, - move || self_clone.flush_frozen_layers(false), - )?; - } - } - } - Ok(()) - } - - /// Flush all frozen layers to disk. - /// - /// Only one thread at a time can be doing layer-flushing for a - /// given timeline. If 'wait' is true, and another thread is - /// currently doing the flushing, this function will wait for it - /// to finish. If 'wait' is false, this function will return - /// immediately instead. - fn flush_frozen_layers(&self, wait: bool) -> Result<()> { - let flush_lock_guard = if wait { - self.layer_flush_lock.lock().unwrap() - } else { - match self.layer_flush_lock.try_lock() { - Ok(guard) => guard, - Err(TryLockError::WouldBlock) => return Ok(()), - Err(TryLockError::Poisoned(err)) => panic!("{:?}", err), - } - }; - - let timer = self.flush_time_histo.start_timer(); - - loop { - let layers = self.layers.read().unwrap(); - if let Some(frozen_layer) = layers.frozen_layers.front() { - let frozen_layer = Arc::clone(frozen_layer); - drop(layers); // to allow concurrent reads and writes - self.flush_frozen_layer(frozen_layer)?; - } else { - // Drop the 'layer_flush_lock' *before* 'layers'. That - // way, if you freeze a layer, and then call - // flush_frozen_layers(false), it is guaranteed that - // if another thread was busy flushing layers and the - // call therefore returns immediately, the other - // thread will have seen the newly-frozen layer and - // will flush that too (assuming no errors). - drop(flush_lock_guard); - drop(layers); - break; - } - } - - timer.stop_and_record(); - - Ok(()) - } - - /// Flush one frozen in-memory layer to disk, as a new delta layer. - fn flush_frozen_layer(&self, frozen_layer: Arc) -> Result<()> { - // As a special case, when we have just imported an image into the repository, - // instead of writing out a L0 delta layer, we directly write out image layer - // files instead. This is possible as long as *all* the data imported into the - // repository have the same LSN. 
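The locking discipline in the hunk above (check_checkpoint_distance only spawns a flush thread when it can take layer_flush_lock, and flush_frozen_layers with wait == false simply returns on WouldBlock) is easy to lose in the diff noise. Below is a minimal standalone sketch of the same single-flusher pattern using only std; the Flusher type and flush() signature are illustrative stand-ins, not the pageserver's actual API.

use std::sync::{Mutex, TryLockError};

struct Flusher {
    flush_lock: Mutex<()>,
}

impl Flusher {
    /// Flush pending work. With `wait == false`, skip if another thread is
    /// already flushing; that thread will still see any work queued before
    /// it releases the lock, so nothing is lost.
    fn flush(&self, wait: bool) {
        let _guard = if wait {
            self.flush_lock.lock().unwrap()
        } else {
            match self.flush_lock.try_lock() {
                Ok(guard) => guard,
                Err(TryLockError::WouldBlock) => return,
                Err(TryLockError::Poisoned(err)) => panic!("{:?}", err),
            }
        };
        // ... drain the frozen-layer queue to disk while holding the guard ...
    }
}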
- let lsn_range = frozen_layer.get_lsn_range(); - let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn - && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) - { - let pgdir = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)?; - let (partitioning, _lsn) = - pgdir.repartition(self.initdb_lsn, self.get_compaction_target_size())?; - self.create_image_layers(&partitioning, self.initdb_lsn, true)? - } else { - // normal case, write out a L0 delta layer file. - let delta_path = self.create_delta_layer(&frozen_layer)?; - HashSet::from([delta_path]) - }; - - fail_point!("flush-frozen-before-sync"); - - // The new on-disk layers are now in the layer map. We can remove the - // in-memory layer from the map now. - { - let mut layers = self.layers.write().unwrap(); - let l = layers.frozen_layers.pop_front(); - - // Only one thread may call this function at a time (for this - // timeline). If two threads tried to flush the same frozen - // layer to disk at the same time, that would not work. - assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer)); - - // release lock on 'layers' - } - - fail_point!("checkpoint-after-sync"); - - // Update the metadata file, with new 'disk_consistent_lsn' - // - // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing - // *all* the layers, to avoid fsyncing the file multiple times. - let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); - self.update_disk_consistent_lsn(disk_consistent_lsn, layer_paths_to_upload)?; - - Ok(()) - } - - /// Update metadata file - fn update_disk_consistent_lsn( - &self, - disk_consistent_lsn: Lsn, - layer_paths_to_upload: HashSet, - ) -> Result<()> { - // If we were able to advance 'disk_consistent_lsn', save it the metadata file. - // After crash, we will restart WAL streaming and processing from that point. - let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); - if disk_consistent_lsn != old_disk_consistent_lsn { - assert!(disk_consistent_lsn > old_disk_consistent_lsn); - - // We can only save a valid 'prev_record_lsn' value on disk if we - // flushed *all* in-memory changes to disk. We only track - // 'prev_record_lsn' in memory for the latest processed record, so we - // don't remember what the correct value that corresponds to some old - // LSN is. But if we flush everything, then the value corresponding - // current 'last_record_lsn' is correct and we can store it on disk. 
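The comment above has a small arithmetic core: layer LSN ranges are end-exclusive, so after flushing `start..end` the newest durable record is `end - 1`, and `prev_record_lsn` can only be written out when that value matches the in-memory `last_record_lsn`. A simplified sketch of just that decision; the integer `Lsn` alias and the `metadata_lsns` helper are stand-ins for illustration, not the real types.

type Lsn = u64; // stand-in for the real Lsn newtype

struct RecordLsn {
    last: Lsn,
    prev: Lsn,
}

/// Decide what to persist after flushing a layer with the given exclusive end.
fn metadata_lsns(flushed_end_exclusive: Lsn, in_mem: &RecordLsn) -> (Lsn, Option<Lsn>) {
    // The range end is exclusive, so the last record known durable is end - 1.
    let disk_consistent_lsn = flushed_end_exclusive - 1;
    // 'prev' is only tracked for the latest record, so it is only valid on
    // disk when everything up to 'last' has been flushed.
    let ondisk_prev_record_lsn = if disk_consistent_lsn == in_mem.last {
        Some(in_mem.prev)
    } else {
        None
    };
    (disk_consistent_lsn, ondisk_prev_record_lsn)
}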
- let RecordLsn { - last: last_record_lsn, - prev: prev_record_lsn, - } = self.last_record_lsn.load(); - let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn { - Some(prev_record_lsn) - } else { - None - }; - - let ancestor_timelineid = self - .ancestor_timeline - .as_ref() - .map(LayeredTimelineEntry::timeline_id); - - let metadata = TimelineMetadata::new( - disk_consistent_lsn, - ondisk_prev_record_lsn, - ancestor_timelineid, - self.ancestor_lsn, - *self.latest_gc_cutoff_lsn.read().unwrap(), - self.initdb_lsn, - ); - - fail_point!("checkpoint-before-saving-metadata", |x| bail!( - "{}", - x.unwrap() - )); - - LayeredRepository::save_metadata( - self.conf, - self.timeline_id, - self.tenant_id, - &metadata, - false, - )?; - - if self.upload_layers.load(atomic::Ordering::Relaxed) { - storage_sync::schedule_layer_upload( - self.tenant_id, - self.timeline_id, - layer_paths_to_upload, - Some(metadata), - ); - } - - // Also update the in-memory copy - self.disk_consistent_lsn.store(disk_consistent_lsn); - } - - Ok(()) - } - - // Write out the given frozen in-memory layer as a new L0 delta file - fn create_delta_layer(&self, frozen_layer: &InMemoryLayer) -> Result { - // Write it out - let new_delta = frozen_layer.write_to_disk()?; - let new_delta_path = new_delta.path(); - - // Sync it to disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // TODO: If we're running inside 'flush_frozen_layers' and there are multiple - // files to flush, it might be better to first write them all, and then fsync - // them all in parallel. - par_fsync::par_fsync(&[ - new_delta_path.clone(), - self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - ])?; - - // Add it to the layer map - { - let mut layers = self.layers.write().unwrap(); - layers.insert_historic(Arc::new(new_delta)); - } - - NUM_PERSISTENT_FILES_CREATED.inc_by(1); - PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); - - Ok(new_delta_path) - } - - pub fn compact(&self) -> Result<()> { - // - // High level strategy for compaction / image creation: - // - // 1. First, calculate the desired "partitioning" of the - // currently in-use key space. The goal is to partition the - // key space into roughly fixed-size chunks, but also take into - // account any existing image layers, and try to align the - // chunk boundaries with the existing image layers to avoid - // too much churn. Also try to align chunk boundaries with - // relation boundaries. In principle, we don't know about - // relation boundaries here, we just deal with key-value - // pairs, and the code in pgdatadir_mapping.rs knows how to - // map relations into key-value pairs. But in practice we know - // that 'field6' is the block number, and the fields 1-5 - // identify a relation. This is just an optimization, - // though. - // - // 2. Once we know the partitioning, for each partition, - // decide if it's time to create a new image layer. The - // criteria is: there has been too much "churn" since the last - // image layer? The "churn" is fuzzy concept, it's a - // combination of too many delta files, or too much WAL in - // total in the delta file. Or perhaps: if creating an image - // file would allow to delete some older files. - // - // 3. After that, we compact all level0 delta files if there - // are too many of them. While compacting, we also garbage - // collect any page versions that are no longer needed because - // of the new image layers we created in step 2. 
- // - // TODO: This high level strategy hasn't been implemented yet. - // Below are functions compact_level0() and create_image_layers() - // but they are a bit ad hoc and don't quite work like it's explained - // above. Rewrite it. - let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); - - let target_file_size = self.get_checkpoint_distance(); - - // Define partitioning schema if needed - if let Ok(pgdir) = - tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) - { - // 2. Create new image layers for partitions that have been modified - // "enough". - let (partitioning, lsn) = pgdir.repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - )?; - let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; - if !layer_paths_to_upload.is_empty() - && self.upload_layers.load(atomic::Ordering::Relaxed) - { - storage_sync::schedule_layer_upload( - self.tenant_id, - self.timeline_id, - HashSet::from_iter(layer_paths_to_upload), - None, - ); - } - - // 3. Compact - let timer = self.compact_time_histo.start_timer(); - self.compact_level0(target_file_size)?; - timer.stop_and_record(); - } else { - debug!("Could not compact because no partitioning specified yet"); - } - - Ok(()) - } - - // Is it time to create a new image layer for the given partition? - fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result { - let layers = self.layers.read().unwrap(); - - for part_range in &partition.ranges { - let image_coverage = layers.image_coverage(part_range, lsn)?; - for (img_range, last_img) in image_coverage { - let img_lsn = if let Some(last_img) = last_img { - last_img.get_lsn_range().end - } else { - Lsn(0) - }; - // Let's consider an example: - // - // delta layer with LSN range 71-81 - // delta layer with LSN range 81-91 - // delta layer with LSN range 91-101 - // image layer at LSN 100 - // - // If 'lsn' is still 100, i.e. no new WAL has been processed since the last image layer, - // there's no need to create a new one. We check this case explicitly, to avoid passing - // a bogus range to count_deltas below, with start > end. It's even possible that there - // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed - // after we read last_record_lsn, which is passed here in the 'lsn' argument. - if img_lsn < lsn { - let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; - - debug!( - "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", - img_range.start, img_range.end, num_deltas, img_lsn, lsn - ); - if num_deltas >= self.get_image_creation_threshold() { - return Ok(true); - } - } - } - } - - Ok(false) - } - - fn create_image_layers( - &self, - partitioning: &KeyPartitioning, - lsn: Lsn, - force: bool, - ) -> Result> { - let timer = self.create_images_time_histo.start_timer(); - let mut image_layers: Vec = Vec::new(); - let mut layer_paths_to_upload = HashSet::new(); - for partition in partitioning.parts.iter() { - if force || self.time_for_new_image_layer(partition, lsn)? 
{ - let img_range = - partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; - let mut image_layer_writer = ImageLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_id, - &img_range, - lsn, - )?; - - for range in &partition.ranges { - let mut key = range.start; - while key < range.end { - let img = self.get(key, lsn)?; - image_layer_writer.put_image(key, &img)?; - key = key.next(); - } - } - let image_layer = image_layer_writer.finish()?; - layer_paths_to_upload.insert(image_layer.path()); - image_layers.push(image_layer); - } - } - - // Sync the new layer to disk before adding it to the layer map, to make sure - // we don't garbage collect something based on the new layer, before it has - // reached the disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // Compaction creates multiple image layers. It would be better to create them all - // and fsync them all in parallel. - let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone()); - all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); - par_fsync::par_fsync(&all_paths)?; - - let mut layers = self.layers.write().unwrap(); - for l in image_layers { - layers.insert_historic(Arc::new(l)); - } - drop(layers); - timer.stop_and_record(); - - Ok(layer_paths_to_upload) - } - - /// - /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as - /// as Level 1 files. - /// - fn compact_level0(&self, target_file_size: u64) -> Result<()> { - let layers = self.layers.read().unwrap(); - let mut level0_deltas = layers.get_level0_deltas()?; - drop(layers); - - // Only compact if enough layers have accumulated. - if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() { - return Ok(()); - } - - // Gather the files to compact in this iteration. - // - // Start with the oldest Level 0 delta file, and collect any other - // level 0 files that form a contiguous sequence, such that the end - // LSN of previous file matches the start LSN of the next file. - // - // Note that if the files don't form such a sequence, we might - // "compact" just a single file. That's a bit pointless, but it allows - // us to get rid of the level 0 file, and compact the other files on - // the next iteration. This could probably made smarter, but such - // "gaps" in the sequence of level 0 files should only happen in case - // of a crash, partial download from cloud storage, or something like - // that, so it's not a big deal in practice. 
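The selection rule described in the comment above reduces to: sort the level-0 deltas by start LSN and keep extending the run only while each file begins exactly where the previous one ended. A standalone sketch over plain ranges; the contiguous_prefix helper is hypothetical, the real code walks layer objects.

use std::ops::Range;

/// Longest prefix of `sorted` (ordered by range start) that forms a
/// contiguous LSN chain: each range starts where the previous one ended.
fn contiguous_prefix(sorted: &[Range<u64>]) -> Vec<Range<u64>> {
    let mut out = Vec::new();
    let mut prev_end = None;
    for r in sorted {
        if let Some(end) = prev_end {
            if end != r.start {
                break; // gap, e.g. after a crash or partial download
            }
        }
        prev_end = Some(r.end);
        out.push(r.clone());
    }
    out
}

// contiguous_prefix(&[10..20, 20..30, 35..40]) yields [10..20, 20..30];
// the gap at 30..35 leaves 35..40 for a later compaction pass.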
- level0_deltas.sort_by_key(|l| l.get_lsn_range().start); - let mut level0_deltas_iter = level0_deltas.iter(); - - let first_level0_delta = level0_deltas_iter.next().unwrap(); - let mut prev_lsn_end = first_level0_delta.get_lsn_range().end; - let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)]; - for l in level0_deltas_iter { - let lsn_range = l.get_lsn_range(); - - if lsn_range.start != prev_lsn_end { - break; - } - deltas_to_compact.push(Arc::clone(l)); - prev_lsn_end = lsn_range.end; - } - let lsn_range = Range { - start: deltas_to_compact.first().unwrap().get_lsn_range().start, - end: deltas_to_compact.last().unwrap().get_lsn_range().end, - }; - - info!( - "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", - lsn_range.start, - lsn_range.end, - deltas_to_compact.len(), - level0_deltas.len() - ); - for l in deltas_to_compact.iter() { - info!("compact includes {}", l.filename().display()); - } - // We don't need the original list of layers anymore. Drop it so that - // we don't accidentally use it later in the function. - drop(level0_deltas); - - // This iterator walks through all key-value pairs from all the layers - // we're compacting, in key, LSN order. - let all_values_iter = deltas_to_compact - .iter() - .map(|l| l.iter()) - .kmerge_by(|a, b| { - if let Ok((a_key, a_lsn, _)) = a { - if let Ok((b_key, b_lsn, _)) = b { - match a_key.cmp(b_key) { - Ordering::Less => true, - Ordering::Equal => a_lsn <= b_lsn, - Ordering::Greater => false, - } - } else { - false - } - } else { - true - } - }); - - // This iterator walks through all keys and is needed to calculate size used by each key - let mut all_keys_iter = deltas_to_compact - .iter() - .map(|l| l.key_iter()) - .kmerge_by(|a, b| { - let (a_key, a_lsn, _) = a; - let (b_key, b_lsn, _) = b; - match a_key.cmp(b_key) { - Ordering::Less => true, - Ordering::Equal => a_lsn <= b_lsn, - Ordering::Greater => false, - } - }); - - // Merge the contents of all the input delta layers into a new set - // of delta layers, based on the current partitioning. - // - // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. - // It's possible that there is a single key with so many page versions that storing all of them in a single layer file - // would be too large. In that case, we also split on the LSN dimension. - // - // LSN - // ^ - // | - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // - // - // If one key (X) has a lot of page versions: - // - // LSN - // ^ - // | (X) - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | +--+ | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | +--+ | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // TODO: this actually divides the layers into fixed-size chunks, not - // based on the partitioning. - // - // TODO: we should also opportunistically materialize and - // garbage collect what we can. 
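The two kmerge_by pipelines above depend on the merge predicate imposing a single global (key, LSN) order across all input layers. A self-contained illustration on plain tuples; itertools is already a dependency of this file, but the layer_a / layer_b data below is made up for the example.

use itertools::Itertools;

fn main() {
    // Each vector stands in for one delta layer's (key, lsn) stream,
    // already sorted internally.
    let layer_a = vec![(1u32, 10u64), (2, 15), (2, 40)];
    let layer_b = vec![(1u32, 20u64), (2, 30), (3, 5)];

    // kmerge_by pulls from whichever stream currently has the smaller
    // (key, lsn) pair, yielding one globally ordered iterator.
    let merged: Vec<_> = vec![layer_a, layer_b]
        .into_iter()
        .kmerge_by(|a, b| a <= b)
        .collect();

    assert_eq!(
        merged,
        vec![(1, 10), (1, 20), (2, 15), (2, 30), (2, 40), (3, 5)]
    );
}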
- let mut new_layers = Vec::new(); - let mut prev_key: Option = None; - let mut writer: Option = None; - let mut key_values_total_size = 0u64; - let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key - let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key - for x in all_values_iter { - let (key, lsn, value) = x?; - let same_key = prev_key.map_or(false, |prev_key| prev_key == key); - // We need to check key boundaries once we reach next key or end of layer with the same key - if !same_key || lsn == dup_end_lsn { - let mut next_key_size = 0u64; - let is_dup_layer = dup_end_lsn.is_valid(); - dup_start_lsn = Lsn::INVALID; - if !same_key { - dup_end_lsn = Lsn::INVALID; - } - // Determine size occupied by this key. We stop at next key, or when size becomes larger than target_file_size - for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { - next_key_size = next_size; - if key != next_key { - if dup_end_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; - } - break; - } - key_values_total_size += next_size; - if key_values_total_size > target_file_size { - // split key between multiple layers: such layer can contain only single key - dup_start_lsn = if dup_end_lsn.is_valid() { - dup_end_lsn - } else { - lsn - }; - dup_end_lsn = next_lsn; - break; - } - } - // handle case when loop reaches last key - if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; - } - if writer.is_some() { - let written_size = writer.as_mut().unwrap().size(); - // check if key cause layer overflow - if is_dup_layer - || dup_end_lsn.is_valid() - || written_size + key_values_total_size > target_file_size - { - new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?); - writer = None; - } - } - key_values_total_size = next_key_size; - } - if writer.is_none() { - writer = Some(DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_id, - key, - if dup_end_lsn.is_valid() { - // this is a layer containing slice of values of the same key - debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); - dup_start_lsn..dup_end_lsn - } else { - debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); - lsn_range.clone() - }, - )?); - } - writer.as_mut().unwrap().put_value(key, lsn, value)?; - prev_key = Some(key); - } - if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next())?); - } - - // Sync layers - if !new_layers.is_empty() { - let mut layer_paths: Vec = new_layers.iter().map(|l| l.path()).collect(); - - // also sync the directory - layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); - - // Fsync all the layer files and directory using multiple threads to - // minimize latency. 
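The repeated remark about fsyncing the timeline directory reflects a general crash-consistency rule: a newly created file is only durable once both its contents and the directory entry naming it have been synced. A minimal std-only sketch of that rule; write_durably is a hypothetical helper, and the real code batches the syncs through par_fsync.

use std::fs::File;
use std::io::Write;
use std::path::Path;

/// Write `data` to `path` and make both the bytes and the new directory
/// entry durable before returning (Unix semantics).
fn write_durably(path: &Path, data: &[u8]) -> std::io::Result<()> {
    let mut f = File::create(path)?;
    f.write_all(data)?;
    f.sync_all()?; // flush file contents and metadata

    // The file name itself only survives a crash once the parent
    // directory has been synced as well.
    if let Some(dir) = path.parent() {
        File::open(dir)?.sync_all()?;
    }
    Ok(())
}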
- par_fsync::par_fsync(&layer_paths)?; - - layer_paths.pop().unwrap(); - } - - let mut layers = self.layers.write().unwrap(); - let mut new_layer_paths = HashSet::with_capacity(new_layers.len()); - for l in new_layers { - new_layer_paths.insert(l.path()); - layers.insert_historic(Arc::new(l)); - } - - // Now that we have reshuffled the data to set of new delta layers, we can - // delete the old ones - let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len()); - for l in &deltas_to_compact { - l.delete()?; - if let Some(path) = l.local_path() { - layer_paths_do_delete.insert(path); - } - layers.remove_historic(l.clone()); - } - drop(layers); - - if self.upload_layers.load(atomic::Ordering::Relaxed) { - storage_sync::schedule_layer_upload( - self.tenant_id, - self.timeline_id, - new_layer_paths, - None, - ); - storage_sync::schedule_layer_delete( - self.tenant_id, - self.timeline_id, - layer_paths_do_delete, - ); - } - - Ok(()) - } - - /// Update information about which layer files need to be retained on - /// garbage collection. This is separate from actually performing the GC, - /// and is updated more frequently, so that compaction can remove obsolete - /// page versions more aggressively. - /// - /// TODO: that's wishful thinking, compaction doesn't actually do that - /// currently. - /// - /// The caller specifies how much history is needed with the 3 arguments: - /// - /// retain_lsns: keep a version of each page at these LSNs - /// cutoff_horizon: also keep everything newer than this LSN - /// pitr: the time duration required to keep data for PITR - /// - /// The 'retain_lsns' list is currently used to prevent removing files that - /// are needed by child timelines. In the future, the user might be able to - /// name additional points in time to retain. The caller is responsible for - /// collecting that information. - /// - /// The 'cutoff_horizon' point is used to retain recent versions that might still be - /// needed by read-only nodes. (As of this writing, the caller just passes - /// the latest LSN subtracted by a constant, and doesn't do anything smart - /// to figure out what read-only nodes might actually need.) - /// - /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine - /// whether a record is needed for PITR. - fn update_gc_info( - &self, - retain_lsns: Vec, - cutoff_horizon: Lsn, - pitr: Duration, - ) -> Result<()> { - let mut gc_info = self.gc_info.write().unwrap(); - - gc_info.horizon_cutoff = cutoff_horizon; - gc_info.retain_lsns = retain_lsns; - - // Calculate pitr cutoff point. - // If we cannot determine a cutoff LSN, be conservative and don't GC anything. - let mut pitr_cutoff_lsn: Lsn = *self.get_latest_gc_cutoff_lsn(); - - if let Ok(timeline) = - tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) - { - let now = SystemTime::now(); - // First, calculate pitr_cutoff_timestamp and then convert it to LSN. - // If we don't have enough data to convert to LSN, - // play safe and don't remove any layers. - if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { - let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - - match timeline.find_lsn_for_timestamp(pitr_timestamp)? 
{ - LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, - LsnForTimestamp::Future(lsn) => { - debug!("future({})", lsn); - pitr_cutoff_lsn = gc_info.horizon_cutoff; - } - LsnForTimestamp::Past(lsn) => { - debug!("past({})", lsn); - } - LsnForTimestamp::NoData(lsn) => { - debug!("nodata({})", lsn); - } - } - debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) - } - } else if cfg!(test) { - // We don't have local timeline in mocked cargo tests. - // So, just ignore pitr_interval setting in this case. - pitr_cutoff_lsn = gc_info.horizon_cutoff; - } - gc_info.pitr_cutoff = pitr_cutoff_lsn; - - Ok(()) - } - - /// - /// Garbage collect layer files on a timeline that are no longer needed. - /// - /// Currently, we don't make any attempt at removing unneeded page versions - /// within a layer file. We can only remove the whole file if it's fully - /// obsolete. - /// - fn gc(&self) -> Result { - let mut result: GcResult = Default::default(); - let now = SystemTime::now(); - - fail_point!("before-timeline-gc"); - - let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); - - let gc_info = self.gc_info.read().unwrap(); - - let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); - let pitr_cutoff = gc_info.pitr_cutoff; - let retain_lsns = &gc_info.retain_lsns; - - let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); - - // Nothing to GC. Return early. - let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); - if latest_gc_cutoff >= new_gc_cutoff { - info!( - "Nothing to GC for timeline {}: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", - self.timeline_id - ); - return Ok(result); - } - - let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered(); - - // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. - // See branch_timeline() for details. - *self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff; - - info!("GC starting"); - - debug!("retain_lsns: {:?}", retain_lsns); - - let mut layers_to_remove = Vec::new(); - - // Scan all on-disk layers in the timeline. - // - // Garbage collect the layer if all conditions are satisfied: - // 1. it is older than cutoff LSN; - // 2. it is older than PITR interval; - // 3. it doesn't need to be retained for 'retain_lsns'; - // 4. newer on-disk image layers cover the layer's whole key range - // - let mut layers = self.layers.write().unwrap(); - 'outer: for l in layers.iter_historic_layers() { - // This layer is in the process of being flushed to disk. - // It will be swapped out of the layer map, replaced with - // on-disk layers containing the same data. - // We can't GC it, as it's not on disk. We can't remove it - // from the layer map yet, as it would make its data - // inaccessible. - if l.is_in_memory() { - continue; - } - - result.layers_total += 1; - - // 1. Is it newer than GC horizon cutoff point? - if l.get_lsn_range().end > horizon_cutoff { - debug!( - "keeping {} because it's newer than horizon_cutoff {}", - l.filename().display(), - horizon_cutoff - ); - result.layers_needed_by_cutoff += 1; - continue 'outer; - } - - // 2. It is newer than PiTR cutoff point? - if l.get_lsn_range().end > pitr_cutoff { - debug!( - "keeping {} because it's newer than pitr_cutoff {}", - l.filename().display(), - pitr_cutoff - ); - result.layers_needed_by_pitr += 1; - continue 'outer; - } - - // 3. Is it needed by a child branch? 
- // NOTE With that we would keep data that - // might be referenced by child branches forever. - // We can track this in child timeline GC and delete parent layers when - // they are no longer needed. This might be complicated with long inheritance chains. - for retain_lsn in retain_lsns { - // start_lsn is inclusive - if &l.get_lsn_range().start <= retain_lsn { - debug!( - "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", - l.filename().display(), - retain_lsn, - l.is_incremental(), - ); - result.layers_needed_by_branches += 1; - continue 'outer; - } - } - - // 4. Is there a later on-disk layer for this relation? - // - // The end-LSN is exclusive, while disk_consistent_lsn is - // inclusive. For example, if disk_consistent_lsn is 100, it is - // OK for a delta layer to have end LSN 101, but if the end LSN - // is 102, then it might not have been fully flushed to disk - // before crash. - // - // For example, imagine that the following layers exist: - // - // 1000 - image (A) - // 1000-2000 - delta (B) - // 2000 - image (C) - // 2000-3000 - delta (D) - // 3000 - image (E) - // - // If GC horizon is at 2500, we can remove layers A and B, but - // we cannot remove C, even though it's older than 2500, because - // the delta layer 2000-3000 depends on it. - if !layers - .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))? - { - debug!( - "keeping {} because it is the latest layer", - l.filename().display() - ); - result.layers_not_updated += 1; - continue 'outer; - } - - // We didn't find any reason to keep this file, so remove it. - debug!( - "garbage collecting {} is_dropped: xx is_incremental: {}", - l.filename().display(), - l.is_incremental(), - ); - layers_to_remove.push(Arc::clone(l)); - } - - // Actually delete the layers from disk and remove them from the map. - // (couldn't do this in the loop above, because you cannot modify a collection - // while iterating it. BTreeMap::retain() would be another option) - let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len()); - for doomed_layer in layers_to_remove { - doomed_layer.delete()?; - if let Some(path) = doomed_layer.local_path() { - layer_paths_to_delete.insert(path); - } - layers.remove_historic(doomed_layer); - result.layers_removed += 1; - } - - if self.upload_layers.load(atomic::Ordering::Relaxed) { - storage_sync::schedule_layer_delete( - self.tenant_id, - self.timeline_id, - layer_paths_to_delete, - ); - } - - result.elapsed = now.elapsed()?; - Ok(result) - } - - /// - /// Reconstruct a value, using the given base image and WAL records in 'data'. - /// - fn reconstruct_value( - &self, - key: Key, - request_lsn: Lsn, - mut data: ValueReconstructState, - ) -> Result { - // Perform WAL redo if needed - data.records.reverse(); - - // If we have a page image, and no WAL, we're all set - if data.records.is_empty() { - if let Some((img_lsn, img)) = &data.img { - trace!( - "found page image for key {} at {}, no WAL redo required", - key, - img_lsn - ); - Ok(img.clone()) - } else { - bail!("base image for {} at {} not found", key, request_lsn); - } - } else { - // We need to do WAL redo. 
- // - // If we don't have a base image, then the oldest WAL record better initialize - // the page - if data.img.is_none() && !data.records.first().unwrap().1.will_init() { - bail!( - "Base image for {} at {} not found, but got {} WAL records", - key, - request_lsn, - data.records.len() - ); - } else { - let base_img = if let Some((_lsn, img)) = data.img { - trace!( - "found {} WAL records and a base image for {} at {}, performing WAL redo", - data.records.len(), - key, - request_lsn - ); - Some(img) - } else { - trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); - None - }; - - let last_rec_lsn = data.records.last().unwrap().0; - - let img = - self.walredo_mgr - .request_redo(key, request_lsn, base_img, data.records)?; - - if img.len() == page_cache::PAGE_SZ { - let cache = page_cache::get(); - cache.memorize_materialized_page( - self.tenant_id, - self.timeline_id, - key, - last_rec_lsn, - &img, - ); - } - - Ok(img) - } - } - } -} - -/// Helper function for get_reconstruct_data() to add the path of layers traversed -/// to an error, as anyhow context information. -fn layer_traversal_error( - msg: String, - path: Vec<(ValueReconstructResult, Lsn, Arc)>, -) -> anyhow::Result<()> { - // We want the original 'msg' to be the outermost context. The outermost context - // is the most high-level information, which also gets propagated to the client. - let mut msg_iter = path - .iter() - .map(|(r, c, l)| { - format!( - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, - c, - l.filename().display() - ) - }) - .chain(std::iter::once(msg)); - // Construct initial message from the first traversed layer - let err = anyhow!(msg_iter.next().unwrap()); - - // Append all subsequent traversals, and the error message 'msg', as contexts. - Err(msg_iter.fold(err, |err, msg| err.context(msg))) -} - -struct LayeredTimelineWriter<'a> { - tl: &'a LayeredTimeline, - _write_guard: MutexGuard<'a, ()>, -} - -impl Deref for LayeredTimelineWriter<'_> { - type Target = dyn Timeline; - - fn deref(&self) -> &Self::Target { - self.tl - } -} - -impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { - fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> { - self.tl.put_value(key, lsn, value) - } - - fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()> { - self.tl.put_tombstone(key_range, lsn) - } - - /// - /// Remember the (end of) last valid WAL record remembered in the timeline. - /// - fn finish_write(&self, new_lsn: Lsn) { - self.tl.finish_write(new_lsn); - } -} - /// Dump contents of a layer file to stdout. pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { use std::os::unix::fs::FileExt; @@ -2836,34 +874,18 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { file.read_exact_at(&mut header_buf, 0)?; match u16::from_be_bytes(header_buf) { - crate::IMAGE_FILE_MAGIC => ImageLayer::new_for_path(path, file)?.dump(verbose)?, - crate::DELTA_FILE_MAGIC => DeltaLayer::new_for_path(path, file)?.dump(verbose)?, + crate::IMAGE_FILE_MAGIC => { + image_layer::ImageLayer::new_for_path(path, file)?.dump(verbose)? + } + crate::DELTA_FILE_MAGIC => { + delta_layer::DeltaLayer::new_for_path(path, file)?.dump(verbose)? 
+ } magic => bail!("unrecognized magic identifier: {:?}", magic), } Ok(()) } -/// Add a suffix to a layer file's name: .{num}.old -/// Uses the first available num (starts at 0) -fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { - let filename = path - .file_name() - .ok_or_else(|| anyhow!("Path {} don't have a file name", path.display()))? - .to_string_lossy(); - let mut new_path = path.clone(); - - for i in 0u32.. { - new_path.set_file_name(format!("{}.{}.old", filename, i)); - if !new_path.exists() { - std::fs::rename(&path, &new_path)?; - return Ok(()); - } - } - - bail!("couldn't find an unused backup number for {:?}", path) -} - pub fn load_metadata( conf: &'static PageServerConf, timeline_id: ZTimelineId, @@ -2893,9 +915,11 @@ pub fn load_metadata( /// #[cfg(test)] pub mod tests { + use super::metadata::METADATA_FILE_NAME; use super::*; use crate::keyspace::KeySpaceAccum; use crate::repository::repo_harness::*; + use crate::repository::{Key, Value}; use rand::{thread_rng, Rng}; #[test] diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index f7f51bf21f..be590c88c2 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -10,9 +10,9 @@ //! corresponding files are written to disk. //! +use crate::layered_repository::inmemory_layer::InMemoryLayer; use crate::layered_repository::storage_layer::Layer; use crate::layered_repository::storage_layer::{range_eq, range_overlaps}; -use crate::layered_repository::InMemoryLayer; use crate::repository::Key; use anyhow::Result; use lazy_static::lazy_static; diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs new file mode 100644 index 0000000000..e862b7def7 --- /dev/null +++ b/pageserver/src/layered_repository/timeline.rs @@ -0,0 +1,2021 @@ +//! 
+ +use anyhow::{anyhow, bail, ensure, Context, Result}; +use bytes::Bytes; +use fail::fail_point; +use itertools::Itertools; +use lazy_static::lazy_static; +use tracing::*; + +use std::cmp::{max, min, Ordering}; +use std::collections::HashSet; +use std::fs; +use std::fs::{File, OpenOptions}; +use std::io::Write; +use std::ops::{Deref, Range}; +use std::path::PathBuf; +use std::sync::atomic::{self, AtomicBool}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError}; +use std::time::{Duration, SystemTime}; + +use metrics::{ + register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, + Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, +}; + +use crate::layered_repository::{ + delta_layer::{DeltaLayer, DeltaLayerWriter}, + ephemeral_file::is_ephemeral_file, + filename::{DeltaFileName, ImageFileName}, + image_layer::{ImageLayer, ImageLayerWriter}, + inmemory_layer::InMemoryLayer, + layer_map::{LayerMap, SearchResult}, + metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, + par_fsync, + storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}, +}; + +use crate::config::PageServerConf; +use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::tenant_config::TenantConfOpt; + +use postgres_ffi::xlog_utils::to_pg_timestamp; +use utils::{ + lsn::{AtomicLsn, Lsn, RecordLsn}, + seqwait::SeqWait, + zid::{ZTenantId, ZTimelineId}, +}; + +use crate::repository::{GcResult, RepositoryTimeline, Timeline, TimelineWriter}; +use crate::repository::{Key, Value}; +use crate::tenant_mgr; +use crate::thread_mgr; +use crate::virtual_file::VirtualFile; +use crate::walreceiver::IS_WAL_RECEIVER; +use crate::walredo::WalRedoManager; +use crate::CheckpointConfig; +use crate::{page_cache, storage_sync}; + +// Metrics collected on operations on the storage repository. +lazy_static! { + pub static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( + "pageserver_storage_operations_seconds", + "Time spent on storage operations", + &["operation", "tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + +// Metrics collected on operations on the storage repository. +lazy_static! { + static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( + "pageserver_getpage_reconstruct_seconds", + "Time spent in reconstruct_value", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + +lazy_static! { + static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!( + "pageserver_materialized_cache_hits_total", + "Number of cache hits from materialized page cache", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); + static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!( + "pageserver_wait_lsn_seconds", + "Time spent waiting for WAL to arrive", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + +lazy_static! { + static ref LAST_RECORD_LSN: IntGaugeVec = register_int_gauge_vec!( + "pageserver_last_record_lsn", + "Last record LSN grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric"); +} + +// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, +// or in testing they estimate how much we would upload if we did. +lazy_static! 
{ + static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!( + "pageserver_created_persistent_files_total", + "Number of files created that are meant to be uploaded to cloud storage", + ) + .expect("failed to define a metric"); + static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!( + "pageserver_written_persistent_bytes_total", + "Total bytes written that are meant to be uploaded to cloud storage", + ) + .expect("failed to define a metric"); +} + +#[derive(Clone)] +pub enum LayeredTimelineEntry { + Loaded(Arc), + Unloaded { + id: ZTimelineId, + metadata: TimelineMetadata, + init_logical_size: Option, + }, +} + +impl LayeredTimelineEntry { + fn timeline_id(&self) -> ZTimelineId { + match self { + LayeredTimelineEntry::Loaded(timeline) => timeline.timeline_id, + LayeredTimelineEntry::Unloaded { id, .. } => *id, + } + } + + pub fn ancestor_timeline_id(&self) -> Option { + match self { + LayeredTimelineEntry::Loaded(timeline) => { + timeline.ancestor_timeline.as_ref().map(|t| t.timeline_id()) + } + LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_timeline(), + } + } + + pub fn ancestor_lsn(&self) -> Lsn { + match self { + LayeredTimelineEntry::Loaded(timeline) => timeline.ancestor_lsn, + LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_lsn(), + } + } + + fn ensure_loaded(&self) -> anyhow::Result<&Arc> { + match self { + LayeredTimelineEntry::Loaded(timeline) => Ok(timeline), + LayeredTimelineEntry::Unloaded { .. } => { + anyhow::bail!("timeline is unloaded") + } + } + } + + pub fn layer_removal_guard(&self) -> Result>, anyhow::Error> { + match self { + LayeredTimelineEntry::Loaded(timeline) => timeline + .layer_removal_cs + .try_lock() + .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) + .map(Some), + + LayeredTimelineEntry::Unloaded { .. } => Ok(None), + } + } +} + +impl From for RepositoryTimeline { + fn from(entry: LayeredTimelineEntry) -> Self { + match entry { + LayeredTimelineEntry::Loaded(timeline) => RepositoryTimeline::Loaded(timeline as _), + LayeredTimelineEntry::Unloaded { metadata, .. } => { + RepositoryTimeline::Unloaded { metadata } + } + } + } +} + +pub struct LayeredTimeline { + conf: &'static PageServerConf, + tenant_conf: Arc>, + + tenant_id: ZTenantId, + pub timeline_id: ZTimelineId, + + pub layers: RwLock, + + last_freeze_at: AtomicLsn, + + // WAL redo manager + walredo_mgr: Arc, + + // What page versions do we hold in the repository? If we get a + // request > last_record_lsn, we need to wait until we receive all + // the WAL up to the request. The SeqWait provides functions for + // that. TODO: If we get a request for an old LSN, such that the + // versions have already been garbage collected away, we should + // throw an error, but we don't track that currently. + // + // last_record_lsn.load().last points to the end of last processed WAL record. + // + // We also remember the starting point of the previous record in + // 'last_record_lsn.load().prev'. It's used to set the xl_prev pointer of the + // first WAL record when the node is started up. But here, we just + // keep track of it. + last_record_lsn: SeqWait, + + // All WAL records have been processed and stored durably on files on + // local disk, up to this LSN. On crash and restart, we need to re-process + // the WAL starting from this point. 
+ // + // Some later WAL records might have been processed and also flushed to disk + // already, so don't be surprised to see some, but there's no guarantee on + // them yet. + disk_consistent_lsn: AtomicLsn, + + // Parent timeline that this timeline was branched from, and the LSN + // of the branch point. + ancestor_timeline: Option, + ancestor_lsn: Lsn, + + // Metrics + reconstruct_time_histo: Histogram, + materialized_page_cache_hit_counter: IntCounter, + flush_time_histo: Histogram, + compact_time_histo: Histogram, + create_images_time_histo: Histogram, + last_record_gauge: IntGauge, + wait_lsn_time_histo: Histogram, + + /// If `true`, will backup its files that appear after each checkpointing to the remote storage. + upload_layers: AtomicBool, + + /// Ensures layers aren't frozen by checkpointer between + /// [`LayeredTimeline::get_layer_for_write`] and layer reads. + /// Locked automatically by [`LayeredTimelineWriter`] and checkpointer. + /// Must always be acquired before the layer map/individual layer lock + /// to avoid deadlock. + write_lock: Mutex<()>, + + /// Used to ensure that there is only one thread + layer_flush_lock: Mutex<()>, + + /// Layer removal lock. + /// A lock to ensure that no layer of the timeline is removed concurrently by other threads. + /// This lock is acquired in [`LayeredTimeline::gc`], [`LayeredTimeline::compact`], + /// and [`LayeredRepository::delete_timeline`]. + layer_removal_cs: Mutex<()>, + + // Needed to ensure that we can't create a branch at a point that was already garbage collected + pub latest_gc_cutoff_lsn: RwLock, + + // List of child timelines and their branch points. This is needed to avoid + // garbage collecting data that is still needed by the child timelines. + pub gc_info: RwLock, + + // It may change across major versions so for simplicity + // keep it after running initdb for a timeline. + // It is needed in checks when we want to error on some operations + // when they are requested for pre-initdb lsn. + // It can be unified with latest_gc_cutoff_lsn under some "first_valid_lsn", + // though lets keep them both for better error visibility. + pub initdb_lsn: Lsn, + + // Initial logical size of timeline (if known). + // Logical size can be copied from ancestor timeline when new branch is create at last LSN + pub init_logical_size: Option, +} + +/// +/// Information about how much history needs to be retained, needed by +/// Garbage Collection. +/// +pub struct GcInfo { + /// Specific LSNs that are needed. + /// + /// Currently, this includes all points where child branches have + /// been forked off from. In the future, could also include + /// explicit user-defined snapshot points. + pub retain_lsns: Vec, + + /// In addition to 'retain_lsns', keep everything newer than this + /// point. + /// + /// This is calculated by subtracting 'gc_horizon' setting from + /// last-record LSN + /// + /// FIXME: is this inclusive or exclusive? + pub horizon_cutoff: Lsn, + + /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this + /// point. + /// + /// This is calculated by finding a number such that a record is needed for PITR + /// if only if its LSN is larger than 'pitr_cutoff'. 
+ pub pitr_cutoff: Lsn, +} + +/// Public interface functions +impl Timeline for LayeredTimeline { + fn get_ancestor_lsn(&self) -> Lsn { + self.ancestor_lsn + } + + fn get_ancestor_timeline_id(&self) -> Option { + self.ancestor_timeline + .as_ref() + .map(LayeredTimelineEntry::timeline_id) + } + + /// Wait until WAL has been received up to the given LSN. + fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + // This should never be called from the WAL receiver thread, because that could lead + // to a deadlock. + ensure!( + !IS_WAL_RECEIVER.with(|c| c.get()), + "wait_lsn called by WAL receiver thread" + ); + + self.wait_lsn_time_histo.observe_closure_duration( + || self.last_record_lsn + .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) + .with_context(|| { + format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", + lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() + ) + }))?; + + Ok(()) + } + + fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard { + self.latest_gc_cutoff_lsn.read().unwrap() + } + + /// Look up the value with the given a key + fn get(&self, key: Key, lsn: Lsn) -> Result { + debug_assert!(lsn <= self.get_last_record_lsn()); + + // Check the page cache. We will get back the most recent page with lsn <= `lsn`. + // The cached image can be returned directly if there is no WAL between the cached image + // and requested LSN. The cached image can also be used to reduce the amount of WAL needed + // for redo. + let cached_page_img = match self.lookup_cached_page(&key, lsn) { + Some((cached_lsn, cached_img)) => { + match cached_lsn.cmp(&lsn) { + Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check + Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn + } + Some((cached_lsn, cached_img)) + } + None => None, + }; + + let mut reconstruct_state = ValueReconstructState { + records: Vec::new(), + img: cached_page_img, + }; + + self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; + + self.reconstruct_time_histo + .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) + } + + /// Public entry point for checkpoint(). All the logic is in the private + /// checkpoint_internal function, this public facade just wraps it for + /// metrics collection. + fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { + match cconf { + CheckpointConfig::Flush => { + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true) + } + CheckpointConfig::Forced => { + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true)?; + self.compact() + } + } + } + + /// + /// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn. 
+ /// + fn check_lsn_is_in_scope( + &self, + lsn: Lsn, + latest_gc_cutoff_lsn: &RwLockReadGuard, + ) -> Result<()> { + ensure!( + lsn >= **latest_gc_cutoff_lsn, + "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", + lsn, + **latest_gc_cutoff_lsn, + ); + Ok(()) + } + + fn get_last_record_lsn(&self) -> Lsn { + self.last_record_lsn.load().last + } + + fn get_prev_record_lsn(&self) -> Lsn { + self.last_record_lsn.load().prev + } + + fn get_last_record_rlsn(&self) -> RecordLsn { + self.last_record_lsn.load() + } + + fn get_disk_consistent_lsn(&self) -> Lsn { + self.disk_consistent_lsn.load() + } + + fn writer<'a>(&'a self) -> Box { + Box::new(LayeredTimelineWriter { + tl: self, + _write_guard: self.write_lock.lock().unwrap(), + }) + } +} + +impl LayeredTimeline { + fn get_checkpoint_distance(&self) -> u64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .checkpoint_distance + .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) + } + + fn get_compaction_target_size(&self) -> u64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .compaction_target_size + .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) + } + + fn get_compaction_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .compaction_threshold + .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) + } + + fn get_image_creation_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .image_creation_threshold + .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) + } + + /// Open a Timeline handle. + /// + /// Loads the metadata for the timeline into memory, but not the layer map. + #[allow(clippy::too_many_arguments)] + pub fn new( + conf: &'static PageServerConf, + tenant_conf: Arc>, + metadata: TimelineMetadata, + ancestor: Option, + timeline_id: ZTimelineId, + tenant_id: ZTenantId, + walredo_mgr: Arc, + upload_layers: bool, + init_logical_size: Option, + ) -> LayeredTimeline { + let reconstruct_time_histo = RECONSTRUCT_TIME + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) + .unwrap(); + let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) + .unwrap(); + let flush_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "layer flush", + &tenant_id.to_string(), + &timeline_id.to_string(), + ]) + .unwrap(); + let compact_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "compact", + &tenant_id.to_string(), + &timeline_id.to_string(), + ]) + .unwrap(); + let create_images_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "create images", + &tenant_id.to_string(), + &timeline_id.to_string(), + ]) + .unwrap(); + let last_record_gauge = LAST_RECORD_LSN + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) + .unwrap(); + let wait_lsn_time_histo = WAIT_LSN_TIME + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) + .unwrap(); + + LayeredTimeline { + conf, + tenant_conf, + timeline_id, + tenant_id, + layers: RwLock::new(LayerMap::default()), + + walredo_mgr, + + // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. 
+ last_record_lsn: SeqWait::new(RecordLsn { + last: metadata.disk_consistent_lsn(), + prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), + }), + disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), + + last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0), + + ancestor_timeline: ancestor, + ancestor_lsn: metadata.ancestor_lsn(), + + reconstruct_time_histo, + materialized_page_cache_hit_counter, + flush_time_histo, + compact_time_histo, + create_images_time_histo, + last_record_gauge, + wait_lsn_time_histo, + + upload_layers: AtomicBool::new(upload_layers), + + write_lock: Mutex::new(()), + layer_flush_lock: Mutex::new(()), + layer_removal_cs: Mutex::new(()), + + gc_info: RwLock::new(GcInfo { + retain_lsns: Vec::new(), + horizon_cutoff: Lsn(0), + pitr_cutoff: Lsn(0), + }), + + latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), + initdb_lsn: metadata.initdb_lsn(), + init_logical_size, + } + } + + /// + /// Scan the timeline directory to populate the layer map. + /// Returns all timeline-related files that were found and loaded. + /// + pub fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { + let mut layers = self.layers.write().unwrap(); + let mut num_layers = 0; + + // Scan timeline directory and create ImageFileName and DeltaFilename + // structs representing all files on disk + let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + + for direntry in fs::read_dir(timeline_path)? { + let direntry = direntry?; + let fname = direntry.file_name(); + let fname = fname.to_string_lossy(); + + if let Some(imgfilename) = ImageFileName::parse_str(&fname) { + // create an ImageLayer struct for each image file. + if imgfilename.lsn > disk_consistent_lsn { + warn!( + "found future image layer {} on timeline {} disk_consistent_lsn is {}", + imgfilename, self.timeline_id, disk_consistent_lsn + ); + + rename_to_backup(direntry.path())?; + continue; + } + + let layer = + ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); + + trace!("found layer {}", layer.filename().display()); + layers.insert_historic(Arc::new(layer)); + num_layers += 1; + } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { + // Create a DeltaLayer struct for each delta file. + // The end-LSN is exclusive, while disk_consistent_lsn is + // inclusive. For example, if disk_consistent_lsn is 100, it is + // OK for a delta layer to have end LSN 101, but if the end LSN + // is 102, then it might not have been fully flushed to disk + // before crash. 
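The off-by-one in this check deserves a worked example: with disk_consistent_lsn = 100, an image layer at LSN 100 or a delta layer ending at 101 (exclusive) is fully covered, while anything beyond that may be a partial pre-crash flush and gets renamed to a backup. A small sketch of just the two predicates, with a plain integer standing in for Lsn and hypothetical function names.

type Lsn = u64; // stand-in for the real Lsn newtype

/// An image layer is "from the future" if its LSN is past the
/// disk-consistent point.
fn image_is_future(img_lsn: Lsn, disk_consistent_lsn: Lsn) -> bool {
    img_lsn > disk_consistent_lsn
}

/// A delta layer's range is end-exclusive, so end == disk_consistent_lsn + 1
/// is still fully flushed; anything larger may be incomplete on disk.
fn delta_is_future(end_exclusive: Lsn, disk_consistent_lsn: Lsn) -> bool {
    end_exclusive > disk_consistent_lsn + 1
}

fn main() {
    assert!(!delta_is_future(101, 100)); // covers records up to 100: keep
    assert!(delta_is_future(102, 100)); // record 101 may be missing: rename
    assert!(image_is_future(101, 100)); // image past the durable point: rename
}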
+ if deltafilename.lsn_range.end > disk_consistent_lsn + 1 { + warn!( + "found future delta layer {} on timeline {} disk_consistent_lsn is {}", + deltafilename, self.timeline_id, disk_consistent_lsn + ); + + rename_to_backup(direntry.path())?; + continue; + } + + let layer = + DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); + + trace!("found layer {}", layer.filename().display()); + layers.insert_historic(Arc::new(layer)); + num_layers += 1; + } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { + // ignore these + } else if is_ephemeral_file(&fname) { + // Delete any old ephemeral files + trace!("deleting old ephemeral file in timeline dir: {}", fname); + fs::remove_file(direntry.path())?; + } else { + warn!("unrecognized filename in timeline dir: {}", fname); + } + } + + layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); + + info!( + "loaded layer map with {} layers at {}", + num_layers, disk_consistent_lsn + ); + + Ok(()) + } + + /// + /// Get a handle to a Layer for reading. + /// + /// The returned Layer might be from an ancestor timeline, if the + /// segment hasn't been updated on this timeline yet. + /// + /// This function takes the current timeline's locked LayerMap as an argument, + /// so callers can avoid potential race conditions. + fn get_reconstruct_data( + &self, + key: Key, + request_lsn: Lsn, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result<()> { + // Start from the current timeline. + let mut timeline_owned; + let mut timeline = self; + + // For debugging purposes, collect the path of layers that we traversed + // through. It's included in the error message if we fail to find the key. + let mut traversal_path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); + + let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { + *cached_lsn + } else { + Lsn(0) + }; + + // 'prev_lsn' tracks the last LSN that we were at in our search. It's used + // to check that each iteration make some progress, to break infinite + // looping if something goes wrong. + let mut prev_lsn = Lsn(u64::MAX); + + let mut result = ValueReconstructResult::Continue; + let mut cont_lsn = Lsn(request_lsn.0 + 1); + + 'outer: loop { + // The function should have updated 'state' + //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); + match result { + ValueReconstructResult::Complete => return Ok(()), + ValueReconstructResult::Continue => { + // If we reached an earlier cached page image, we're done. + if cont_lsn == cached_lsn + 1 { + self.materialized_page_cache_hit_counter.inc_by(1); + return Ok(()); + } + if prev_lsn <= cont_lsn { + // Didn't make any progress in last iteration. Error out to avoid + // getting stuck in the loop. 
+ return layer_traversal_error(format!( + "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", + key, + Lsn(cont_lsn.0 - 1), + request_lsn, + timeline.ancestor_lsn + ), traversal_path); + } + prev_lsn = cont_lsn; + } + ValueReconstructResult::Missing => { + return layer_traversal_error( + format!( + "could not find data for key {} at LSN {}, for request at LSN {}", + key, cont_lsn, request_lsn + ), + traversal_path, + ); + } + } + + // Recurse into ancestor if needed + if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { + trace!( + "going into ancestor {}, cont_lsn is {}", + timeline.ancestor_lsn, + cont_lsn + ); + let ancestor = timeline.get_ancestor_timeline()?; + timeline_owned = ancestor; + timeline = &*timeline_owned; + prev_lsn = Lsn(u64::MAX); + continue; + } + + let layers = timeline.layers.read().unwrap(); + + // Check the open and frozen in-memory layers first, in order from newest + // to oldest. + if let Some(open_layer) = &layers.open_layer { + let start_lsn = open_layer.get_lsn_range().start; + if cont_lsn > start_lsn { + //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); + // Get all the data needed to reconstruct the page version from this layer. + // But if we have an older cached page image, no need to go past that. + let lsn_floor = max(cached_lsn + 1, start_lsn); + result = open_layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + )?; + cont_lsn = lsn_floor; + traversal_path.push((result, cont_lsn, open_layer.clone())); + continue; + } + } + for frozen_layer in layers.frozen_layers.iter().rev() { + let start_lsn = frozen_layer.get_lsn_range().start; + if cont_lsn > start_lsn { + //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); + let lsn_floor = max(cached_lsn + 1, start_lsn); + result = frozen_layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + )?; + cont_lsn = lsn_floor; + traversal_path.push((result, cont_lsn, frozen_layer.clone())); + continue 'outer; + } + } + + if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? { + //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); + + let lsn_floor = max(cached_lsn + 1, lsn_floor); + result = layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + )?; + cont_lsn = lsn_floor; + traversal_path.push((result, cont_lsn, layer)); + } else if timeline.ancestor_timeline.is_some() { + // Nothing on this timeline. Traverse to parent + result = ValueReconstructResult::Continue; + cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); + } else { + // Nothing found + result = ValueReconstructResult::Missing; + } + } + } + + fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> { + let cache = page_cache::get(); + + // FIXME: It's pointless to check the cache for things that are not 8kB pages. + // We should look at the key to determine if it's a cacheable object + let (lsn, read_guard) = + cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?; + let img = Bytes::from(read_guard.to_vec()); + Some((lsn, img)) + } + + fn get_ancestor_timeline(&self) -> Result> { + let ancestor = self + .ancestor_timeline + .as_ref() + .with_context(|| { + format!( + "Ancestor is missing. Timeline id: {} Ancestor id {:?}", + self.timeline_id, + self.get_ancestor_timeline_id(), + ) + })? 
+ .ensure_loaded() + .with_context(|| { + format!( + "Ancestor timeline is not loaded. Timeline id: {} Ancestor id {:?}", + self.timeline_id, + self.get_ancestor_timeline_id(), + ) + })?; + Ok(Arc::clone(ancestor)) + } + + /// + /// Get a handle to the latest layer for appending. + /// + fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { + let mut layers = self.layers.write().unwrap(); + + ensure!(lsn.is_aligned()); + + let last_record_lsn = self.get_last_record_lsn(); + ensure!( + lsn > last_record_lsn, + "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", + lsn, + last_record_lsn, + ); + + // Do we have a layer open for writing already? + let layer; + if let Some(open_layer) = &layers.open_layer { + if open_layer.get_lsn_range().start > lsn { + bail!("unexpected open layer in the future"); + } + + layer = Arc::clone(open_layer); + } else { + // No writeable layer yet. Create one. + let start_lsn = layers.next_open_layer_at.unwrap(); + + trace!( + "creating layer for write at {}/{} for record at {}", + self.timeline_id, + start_lsn, + lsn + ); + let new_layer = + InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?; + let layer_rc = Arc::new(new_layer); + + layers.open_layer = Some(Arc::clone(&layer_rc)); + layers.next_open_layer_at = None; + + layer = layer_rc; + } + Ok(layer) + } + + fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { + //info!("PUT: key {} at {}", key, lsn); + let layer = self.get_layer_for_write(lsn)?; + layer.put_value(key, lsn, val)?; + Ok(()) + } + + fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> Result<()> { + let layer = self.get_layer_for_write(lsn)?; + layer.put_tombstone(key_range, lsn)?; + + Ok(()) + } + + fn finish_write(&self, new_lsn: Lsn) { + assert!(new_lsn.is_aligned()); + + self.last_record_gauge.set(new_lsn.0 as i64); + self.last_record_lsn.advance(new_lsn); + } + + fn freeze_inmem_layer(&self, write_lock_held: bool) { + // Freeze the current open in-memory layer. It will be written to disk on next + // iteration. + let _write_guard = if write_lock_held { + None + } else { + Some(self.write_lock.lock().unwrap()) + }; + let mut layers = self.layers.write().unwrap(); + if let Some(open_layer) = &layers.open_layer { + let open_layer_rc = Arc::clone(open_layer); + // Does this layer need freezing? + let end_lsn = Lsn(self.get_last_record_lsn().0 + 1); + open_layer.freeze(end_lsn); + + // The layer is no longer open, update the layer map to reflect this. + // We will replace it with on-disk historics below. + layers.frozen_layers.push_back(open_layer_rc); + layers.open_layer = None; + layers.next_open_layer_at = Some(end_lsn); + self.last_freeze_at.store(end_lsn); + } + drop(layers); + } + + /// + /// Check if more than 'checkpoint_distance' of WAL has been accumulated + /// in the in-memory layer, and initiate flushing it if so. + /// + pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { + let last_lsn = self.get_last_record_lsn(); + let layers = self.layers.read().unwrap(); + if let Some(open_layer) = &layers.open_layer { + let open_layer_size = open_layer.size()?; + drop(layers); + let distance = last_lsn.widening_sub(self.last_freeze_at.load()); + // Checkpointing the open layer can be triggered by layer size or LSN range. + // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and + // we want to stay below that with a big margin. The LSN distance determines how + // much WAL the safekeepers need to store. 
+ if distance >= self.get_checkpoint_distance().into() + || open_layer_size > self.get_checkpoint_distance() + { + info!( + "check_checkpoint_distance {}, layer size {}", + distance, open_layer_size + ); + + self.freeze_inmem_layer(true); + self.last_freeze_at.store(last_lsn); + + // Launch a thread to flush the frozen layer to disk, unless + // a thread was already running. (If the thread was running + // at the time that we froze the layer, it must've seen the + // the layer we just froze before it exited; see comments + // in flush_frozen_layers()) + if let Ok(guard) = self.layer_flush_lock.try_lock() { + drop(guard); + let self_clone = Arc::clone(self); + thread_mgr::spawn( + thread_mgr::ThreadKind::LayerFlushThread, + Some(self.tenant_id), + Some(self.timeline_id), + "layer flush thread", + false, + move || self_clone.flush_frozen_layers(false), + )?; + } + } + } + Ok(()) + } + + /// Flush all frozen layers to disk. + /// + /// Only one thread at a time can be doing layer-flushing for a + /// given timeline. If 'wait' is true, and another thread is + /// currently doing the flushing, this function will wait for it + /// to finish. If 'wait' is false, this function will return + /// immediately instead. + fn flush_frozen_layers(&self, wait: bool) -> Result<()> { + let flush_lock_guard = if wait { + self.layer_flush_lock.lock().unwrap() + } else { + match self.layer_flush_lock.try_lock() { + Ok(guard) => guard, + Err(TryLockError::WouldBlock) => return Ok(()), + Err(TryLockError::Poisoned(err)) => panic!("{:?}", err), + } + }; + + let timer = self.flush_time_histo.start_timer(); + + loop { + let layers = self.layers.read().unwrap(); + if let Some(frozen_layer) = layers.frozen_layers.front() { + let frozen_layer = Arc::clone(frozen_layer); + drop(layers); // to allow concurrent reads and writes + self.flush_frozen_layer(frozen_layer)?; + } else { + // Drop the 'layer_flush_lock' *before* 'layers'. That + // way, if you freeze a layer, and then call + // flush_frozen_layers(false), it is guaranteed that + // if another thread was busy flushing layers and the + // call therefore returns immediately, the other + // thread will have seen the newly-frozen layer and + // will flush that too (assuming no errors). + drop(flush_lock_guard); + drop(layers); + break; + } + } + + timer.stop_and_record(); + + Ok(()) + } + + /// Flush one frozen in-memory layer to disk, as a new delta layer. + fn flush_frozen_layer(&self, frozen_layer: Arc) -> Result<()> { + // As a special case, when we have just imported an image into the repository, + // instead of writing out a L0 delta layer, we directly write out image layer + // files instead. This is possible as long as *all* the data imported into the + // repository have the same LSN. + let lsn_range = frozen_layer.get_lsn_range(); + let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn + && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) + { + let pgdir = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)?; + let (partitioning, _lsn) = + pgdir.repartition(self.initdb_lsn, self.get_compaction_target_size())?; + self.create_image_layers(&partitioning, self.initdb_lsn, true)? + } else { + // normal case, write out a L0 delta layer file. + let delta_path = self.create_delta_layer(&frozen_layer)?; + HashSet::from([delta_path]) + }; + + fail_point!("flush-frozen-before-sync"); + + // The new on-disk layers are now in the layer map. We can remove the + // in-memory layer from the map now. 
+ { + let mut layers = self.layers.write().unwrap(); + let l = layers.frozen_layers.pop_front(); + + // Only one thread may call this function at a time (for this + // timeline). If two threads tried to flush the same frozen + // layer to disk at the same time, that would not work. + assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer)); + + // release lock on 'layers' + } + + fail_point!("checkpoint-after-sync"); + + // Update the metadata file, with new 'disk_consistent_lsn' + // + // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing + // *all* the layers, to avoid fsyncing the file multiple times. + let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); + self.update_disk_consistent_lsn(disk_consistent_lsn, layer_paths_to_upload)?; + + Ok(()) + } + + /// Update metadata file + fn update_disk_consistent_lsn( + &self, + disk_consistent_lsn: Lsn, + layer_paths_to_upload: HashSet, + ) -> Result<()> { + // If we were able to advance 'disk_consistent_lsn', save it the metadata file. + // After crash, we will restart WAL streaming and processing from that point. + let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); + if disk_consistent_lsn != old_disk_consistent_lsn { + assert!(disk_consistent_lsn > old_disk_consistent_lsn); + + // We can only save a valid 'prev_record_lsn' value on disk if we + // flushed *all* in-memory changes to disk. We only track + // 'prev_record_lsn' in memory for the latest processed record, so we + // don't remember what the correct value that corresponds to some old + // LSN is. But if we flush everything, then the value corresponding + // current 'last_record_lsn' is correct and we can store it on disk. + let RecordLsn { + last: last_record_lsn, + prev: prev_record_lsn, + } = self.last_record_lsn.load(); + let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn { + Some(prev_record_lsn) + } else { + None + }; + + let ancestor_timelineid = self + .ancestor_timeline + .as_ref() + .map(LayeredTimelineEntry::timeline_id); + + let metadata = TimelineMetadata::new( + disk_consistent_lsn, + ondisk_prev_record_lsn, + ancestor_timelineid, + self.ancestor_lsn, + *self.latest_gc_cutoff_lsn.read().unwrap(), + self.initdb_lsn, + ); + + fail_point!("checkpoint-before-saving-metadata", |x| bail!( + "{}", + x.unwrap() + )); + + save_metadata( + self.conf, + self.timeline_id, + self.tenant_id, + &metadata, + false, + )?; + + if self.upload_layers.load(atomic::Ordering::Relaxed) { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + layer_paths_to_upload, + Some(metadata), + ); + } + + // Also update the in-memory copy + self.disk_consistent_lsn.store(disk_consistent_lsn); + } + + Ok(()) + } + + // Write out the given frozen in-memory layer as a new L0 delta file + fn create_delta_layer(&self, frozen_layer: &InMemoryLayer) -> Result { + // Write it out + let new_delta = frozen_layer.write_to_disk()?; + let new_delta_path = new_delta.path(); + + // Sync it to disk. + // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // TODO: If we're running inside 'flush_frozen_layers' and there are multiple + // files to flush, it might be better to first write them all, and then fsync + // them all in parallel. 
+ par_fsync::par_fsync(&[ + new_delta_path.clone(), + self.conf.timeline_path(&self.timeline_id, &self.tenant_id), + ])?; + + // Add it to the layer map + { + let mut layers = self.layers.write().unwrap(); + layers.insert_historic(Arc::new(new_delta)); + } + + NUM_PERSISTENT_FILES_CREATED.inc_by(1); + PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); + + Ok(new_delta_path) + } + + pub fn compact(&self) -> Result<()> { + // + // High level strategy for compaction / image creation: + // + // 1. First, calculate the desired "partitioning" of the + // currently in-use key space. The goal is to partition the + // key space into roughly fixed-size chunks, but also take into + // account any existing image layers, and try to align the + // chunk boundaries with the existing image layers to avoid + // too much churn. Also try to align chunk boundaries with + // relation boundaries. In principle, we don't know about + // relation boundaries here, we just deal with key-value + // pairs, and the code in pgdatadir_mapping.rs knows how to + // map relations into key-value pairs. But in practice we know + // that 'field6' is the block number, and the fields 1-5 + // identify a relation. This is just an optimization, + // though. + // + // 2. Once we know the partitioning, for each partition, + // decide if it's time to create a new image layer. The + // criteria is: there has been too much "churn" since the last + // image layer? The "churn" is fuzzy concept, it's a + // combination of too many delta files, or too much WAL in + // total in the delta file. Or perhaps: if creating an image + // file would allow to delete some older files. + // + // 3. After that, we compact all level0 delta files if there + // are too many of them. While compacting, we also garbage + // collect any page versions that are no longer needed because + // of the new image layers we created in step 2. + // + // TODO: This high level strategy hasn't been implemented yet. + // Below are functions compact_level0() and create_image_layers() + // but they are a bit ad hoc and don't quite work like it's explained + // above. Rewrite it. + let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); + + let target_file_size = self.get_checkpoint_distance(); + + // Define partitioning schema if needed + if let Ok(pgdir) = + tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) + { + // 2. Create new image layers for partitions that have been modified + // "enough". + let (partitioning, lsn) = pgdir.repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + )?; + let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; + if !layer_paths_to_upload.is_empty() + && self.upload_layers.load(atomic::Ordering::Relaxed) + { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + HashSet::from_iter(layer_paths_to_upload), + None, + ); + } + + // 3. Compact + let timer = self.compact_time_histo.start_timer(); + self.compact_level0(target_file_size)?; + timer.stop_and_record(); + } else { + debug!("Could not compact because no partitioning specified yet"); + } + + Ok(()) + } + + // Is it time to create a new image layer for the given partition? 
+ fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result { + let layers = self.layers.read().unwrap(); + + for part_range in &partition.ranges { + let image_coverage = layers.image_coverage(part_range, lsn)?; + for (img_range, last_img) in image_coverage { + let img_lsn = if let Some(last_img) = last_img { + last_img.get_lsn_range().end + } else { + Lsn(0) + }; + // Let's consider an example: + // + // delta layer with LSN range 71-81 + // delta layer with LSN range 81-91 + // delta layer with LSN range 91-101 + // image layer at LSN 100 + // + // If 'lsn' is still 100, i.e. no new WAL has been processed since the last image layer, + // there's no need to create a new one. We check this case explicitly, to avoid passing + // a bogus range to count_deltas below, with start > end. It's even possible that there + // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed + // after we read last_record_lsn, which is passed here in the 'lsn' argument. + if img_lsn < lsn { + let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; + + debug!( + "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", + img_range.start, img_range.end, num_deltas, img_lsn, lsn + ); + if num_deltas >= self.get_image_creation_threshold() { + return Ok(true); + } + } + } + } + + Ok(false) + } + + fn create_image_layers( + &self, + partitioning: &KeyPartitioning, + lsn: Lsn, + force: bool, + ) -> Result> { + let timer = self.create_images_time_histo.start_timer(); + let mut image_layers: Vec = Vec::new(); + let mut layer_paths_to_upload = HashSet::new(); + for partition in partitioning.parts.iter() { + if force || self.time_for_new_image_layer(partition, lsn)? { + let img_range = + partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_id, + &img_range, + lsn, + )?; + + for range in &partition.ranges { + let mut key = range.start; + while key < range.end { + let img = self.get(key, lsn)?; + image_layer_writer.put_image(key, &img)?; + key = key.next(); + } + } + let image_layer = image_layer_writer.finish()?; + layer_paths_to_upload.insert(image_layer.path()); + image_layers.push(image_layer); + } + } + + // Sync the new layer to disk before adding it to the layer map, to make sure + // we don't garbage collect something based on the new layer, before it has + // reached the disk. + // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // Compaction creates multiple image layers. It would be better to create them all + // and fsync them all in parallel. + let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone()); + all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); + par_fsync::par_fsync(&all_paths)?; + + let mut layers = self.layers.write().unwrap(); + for l in image_layers { + layers.insert_historic(Arc::new(l)); + } + drop(layers); + timer.stop_and_record(); + + Ok(layer_paths_to_upload) + } + + /// + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as + /// as Level 1 files. + /// + fn compact_level0(&self, target_file_size: u64) -> Result<()> { + let layers = self.layers.read().unwrap(); + let mut level0_deltas = layers.get_level0_deltas()?; + drop(layers); + + // Only compact if enough layers have accumulated. 
+ if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() { + return Ok(()); + } + + // Gather the files to compact in this iteration. + // + // Start with the oldest Level 0 delta file, and collect any other + // level 0 files that form a contiguous sequence, such that the end + // LSN of previous file matches the start LSN of the next file. + // + // Note that if the files don't form such a sequence, we might + // "compact" just a single file. That's a bit pointless, but it allows + // us to get rid of the level 0 file, and compact the other files on + // the next iteration. This could probably made smarter, but such + // "gaps" in the sequence of level 0 files should only happen in case + // of a crash, partial download from cloud storage, or something like + // that, so it's not a big deal in practice. + level0_deltas.sort_by_key(|l| l.get_lsn_range().start); + let mut level0_deltas_iter = level0_deltas.iter(); + + let first_level0_delta = level0_deltas_iter.next().unwrap(); + let mut prev_lsn_end = first_level0_delta.get_lsn_range().end; + let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)]; + for l in level0_deltas_iter { + let lsn_range = l.get_lsn_range(); + + if lsn_range.start != prev_lsn_end { + break; + } + deltas_to_compact.push(Arc::clone(l)); + prev_lsn_end = lsn_range.end; + } + let lsn_range = Range { + start: deltas_to_compact.first().unwrap().get_lsn_range().start, + end: deltas_to_compact.last().unwrap().get_lsn_range().end, + }; + + info!( + "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", + lsn_range.start, + lsn_range.end, + deltas_to_compact.len(), + level0_deltas.len() + ); + for l in deltas_to_compact.iter() { + info!("compact includes {}", l.filename().display()); + } + // We don't need the original list of layers anymore. Drop it so that + // we don't accidentally use it later in the function. + drop(level0_deltas); + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. + let all_values_iter = deltas_to_compact + .iter() + .map(|l| l.iter()) + .kmerge_by(|a, b| { + if let Ok((a_key, a_lsn, _)) = a { + if let Ok((b_key, b_lsn, _)) = b { + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + } else { + false + } + } else { + true + } + }); + + // This iterator walks through all keys and is needed to calculate size used by each key + let mut all_keys_iter = deltas_to_compact + .iter() + .map(|l| l.key_iter()) + .kmerge_by(|a, b| { + let (a_key, a_lsn, _) = a; + let (b_key, b_lsn, _) = b; + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + }); + + // Merge the contents of all the input delta layers into a new set + // of delta layers, based on the current partitioning. + // + // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. + // It's possible that there is a single key with so many page versions that storing all of them in a single layer file + // would be too large. In that case, we also split on the LSN dimension. 
+ // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. + let mut new_layers = Vec::new(); + let mut prev_key: Option = None; + let mut writer: Option = None; + let mut key_values_total_size = 0u64; + let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key + let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key + for x in all_values_iter { + let (key, lsn, value) = x?; + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + // We need to check key boundaries once we reach next key or end of layer with the same key + if !same_key || lsn == dup_end_lsn { + let mut next_key_size = 0u64; + let is_dup_layer = dup_end_lsn.is_valid(); + dup_start_lsn = Lsn::INVALID; + if !same_key { + dup_end_lsn = Lsn::INVALID; + } + // Determine size occupied by this key. We stop at next key, or when size becomes larger than target_file_size + for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { + next_key_size = next_size; + if key != next_key { + if dup_end_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + break; + } + key_values_total_size += next_size; + if key_values_total_size > target_file_size { + // split key between multiple layers: such layer can contain only single key + dup_start_lsn = if dup_end_lsn.is_valid() { + dup_end_lsn + } else { + lsn + }; + dup_end_lsn = next_lsn; + break; + } + } + // handle case when loop reaches last key + if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + if writer.is_some() { + let written_size = writer.as_mut().unwrap().size(); + // check if key cause layer overflow + if is_dup_layer + || dup_end_lsn.is_valid() + || written_size + key_values_total_size > target_file_size + { + new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?); + writer = None; + } + } + key_values_total_size = next_key_size; + } + if writer.is_none() { + writer = Some(DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_id, + key, + if dup_end_lsn.is_valid() { + // this is a layer containing slice of values of the same key + debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); + dup_start_lsn..dup_end_lsn + } else { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + lsn_range.clone() + }, + )?); + } + writer.as_mut().unwrap().put_value(key, lsn, value)?; + prev_key = Some(key); + } + if let Some(writer) = writer { + new_layers.push(writer.finish(prev_key.unwrap().next())?); + } + + // Sync layers + if !new_layers.is_empty() { + let mut layer_paths: Vec = new_layers.iter().map(|l| 
l.path()).collect(); + + // also sync the directory + layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); + + // Fsync all the layer files and directory using multiple threads to + // minimize latency. + par_fsync::par_fsync(&layer_paths)?; + + layer_paths.pop().unwrap(); + } + + let mut layers = self.layers.write().unwrap(); + let mut new_layer_paths = HashSet::with_capacity(new_layers.len()); + for l in new_layers { + new_layer_paths.insert(l.path()); + layers.insert_historic(Arc::new(l)); + } + + // Now that we have reshuffled the data to set of new delta layers, we can + // delete the old ones + let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len()); + for l in &deltas_to_compact { + l.delete()?; + if let Some(path) = l.local_path() { + layer_paths_do_delete.insert(path); + } + layers.remove_historic(l.clone()); + } + drop(layers); + + if self.upload_layers.load(atomic::Ordering::Relaxed) { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + new_layer_paths, + None, + ); + storage_sync::schedule_layer_delete( + self.tenant_id, + self.timeline_id, + layer_paths_do_delete, + ); + } + + Ok(()) + } + + /// Update information about which layer files need to be retained on + /// garbage collection. This is separate from actually performing the GC, + /// and is updated more frequently, so that compaction can remove obsolete + /// page versions more aggressively. + /// + /// TODO: that's wishful thinking, compaction doesn't actually do that + /// currently. + /// + /// The caller specifies how much history is needed with the 3 arguments: + /// + /// retain_lsns: keep a version of each page at these LSNs + /// cutoff_horizon: also keep everything newer than this LSN + /// pitr: the time duration required to keep data for PITR + /// + /// The 'retain_lsns' list is currently used to prevent removing files that + /// are needed by child timelines. In the future, the user might be able to + /// name additional points in time to retain. The caller is responsible for + /// collecting that information. + /// + /// The 'cutoff_horizon' point is used to retain recent versions that might still be + /// needed by read-only nodes. (As of this writing, the caller just passes + /// the latest LSN subtracted by a constant, and doesn't do anything smart + /// to figure out what read-only nodes might actually need.) + /// + /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine + /// whether a record is needed for PITR. + pub fn update_gc_info( + &self, + retain_lsns: Vec, + cutoff_horizon: Lsn, + pitr: Duration, + ) -> Result<()> { + let mut gc_info = self.gc_info.write().unwrap(); + + gc_info.horizon_cutoff = cutoff_horizon; + gc_info.retain_lsns = retain_lsns; + + // Calculate pitr cutoff point. + // If we cannot determine a cutoff LSN, be conservative and don't GC anything. + let mut pitr_cutoff_lsn: Lsn = *self.get_latest_gc_cutoff_lsn(); + + if let Ok(timeline) = + tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) + { + let now = SystemTime::now(); + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. + // If we don't have enough data to convert to LSN, + // play safe and don't remove any layers. + if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { + let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); + + match timeline.find_lsn_for_timestamp(pitr_timestamp)? 
{ + LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, + LsnForTimestamp::Future(lsn) => { + debug!("future({})", lsn); + pitr_cutoff_lsn = gc_info.horizon_cutoff; + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + } + } + debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) + } + } else if cfg!(test) { + // We don't have local timeline in mocked cargo tests. + // So, just ignore pitr_interval setting in this case. + pitr_cutoff_lsn = gc_info.horizon_cutoff; + } + gc_info.pitr_cutoff = pitr_cutoff_lsn; + + Ok(()) + } + + /// + /// Garbage collect layer files on a timeline that are no longer needed. + /// + /// Currently, we don't make any attempt at removing unneeded page versions + /// within a layer file. We can only remove the whole file if it's fully + /// obsolete. + /// + pub fn gc(&self) -> Result { + let mut result: GcResult = Default::default(); + let now = SystemTime::now(); + + fail_point!("before-timeline-gc"); + + let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); + + let gc_info = self.gc_info.read().unwrap(); + + let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); + let pitr_cutoff = gc_info.pitr_cutoff; + let retain_lsns = &gc_info.retain_lsns; + + let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + + // Nothing to GC. Return early. + let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); + if latest_gc_cutoff >= new_gc_cutoff { + info!( + "Nothing to GC for timeline {}: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", + self.timeline_id + ); + return Ok(result); + } + + let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered(); + + // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. + // See branch_timeline() for details. + *self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff; + + info!("GC starting"); + + debug!("retain_lsns: {:?}", retain_lsns); + + let mut layers_to_remove = Vec::new(); + + // Scan all on-disk layers in the timeline. + // + // Garbage collect the layer if all conditions are satisfied: + // 1. it is older than cutoff LSN; + // 2. it is older than PITR interval; + // 3. it doesn't need to be retained for 'retain_lsns'; + // 4. newer on-disk image layers cover the layer's whole key range + // + let mut layers = self.layers.write().unwrap(); + 'outer: for l in layers.iter_historic_layers() { + // This layer is in the process of being flushed to disk. + // It will be swapped out of the layer map, replaced with + // on-disk layers containing the same data. + // We can't GC it, as it's not on disk. We can't remove it + // from the layer map yet, as it would make its data + // inaccessible. + if l.is_in_memory() { + continue; + } + + result.layers_total += 1; + + // 1. Is it newer than GC horizon cutoff point? + if l.get_lsn_range().end > horizon_cutoff { + debug!( + "keeping {} because it's newer than horizon_cutoff {}", + l.filename().display(), + horizon_cutoff + ); + result.layers_needed_by_cutoff += 1; + continue 'outer; + } + + // 2. It is newer than PiTR cutoff point? + if l.get_lsn_range().end > pitr_cutoff { + debug!( + "keeping {} because it's newer than pitr_cutoff {}", + l.filename().display(), + pitr_cutoff + ); + result.layers_needed_by_pitr += 1; + continue 'outer; + } + + // 3. Is it needed by a child branch? 
+ // NOTE With that we would keep data that + // might be referenced by child branches forever. + // We can track this in child timeline GC and delete parent layers when + // they are no longer needed. This might be complicated with long inheritance chains. + for retain_lsn in retain_lsns { + // start_lsn is inclusive + if &l.get_lsn_range().start <= retain_lsn { + debug!( + "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", + l.filename().display(), + retain_lsn, + l.is_incremental(), + ); + result.layers_needed_by_branches += 1; + continue 'outer; + } + } + + // 4. Is there a later on-disk layer for this relation? + // + // The end-LSN is exclusive, while disk_consistent_lsn is + // inclusive. For example, if disk_consistent_lsn is 100, it is + // OK for a delta layer to have end LSN 101, but if the end LSN + // is 102, then it might not have been fully flushed to disk + // before crash. + // + // For example, imagine that the following layers exist: + // + // 1000 - image (A) + // 1000-2000 - delta (B) + // 2000 - image (C) + // 2000-3000 - delta (D) + // 3000 - image (E) + // + // If GC horizon is at 2500, we can remove layers A and B, but + // we cannot remove C, even though it's older than 2500, because + // the delta layer 2000-3000 depends on it. + if !layers + .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))? + { + debug!( + "keeping {} because it is the latest layer", + l.filename().display() + ); + result.layers_not_updated += 1; + continue 'outer; + } + + // We didn't find any reason to keep this file, so remove it. + debug!( + "garbage collecting {} is_dropped: xx is_incremental: {}", + l.filename().display(), + l.is_incremental(), + ); + layers_to_remove.push(Arc::clone(l)); + } + + // Actually delete the layers from disk and remove them from the map. + // (couldn't do this in the loop above, because you cannot modify a collection + // while iterating it. BTreeMap::retain() would be another option) + let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len()); + for doomed_layer in layers_to_remove { + doomed_layer.delete()?; + if let Some(path) = doomed_layer.local_path() { + layer_paths_to_delete.insert(path); + } + layers.remove_historic(doomed_layer); + result.layers_removed += 1; + } + + if self.upload_layers.load(atomic::Ordering::Relaxed) { + storage_sync::schedule_layer_delete( + self.tenant_id, + self.timeline_id, + layer_paths_to_delete, + ); + } + + result.elapsed = now.elapsed()?; + Ok(result) + } + + /// + /// Reconstruct a value, using the given base image and WAL records in 'data'. + /// + fn reconstruct_value( + &self, + key: Key, + request_lsn: Lsn, + mut data: ValueReconstructState, + ) -> Result { + // Perform WAL redo if needed + data.records.reverse(); + + // If we have a page image, and no WAL, we're all set + if data.records.is_empty() { + if let Some((img_lsn, img)) = &data.img { + trace!( + "found page image for key {} at {}, no WAL redo required", + key, + img_lsn + ); + Ok(img.clone()) + } else { + bail!("base image for {} at {} not found", key, request_lsn); + } + } else { + // We need to do WAL redo. 
+ // + // If we don't have a base image, then the oldest WAL record better initialize + // the page + if data.img.is_none() && !data.records.first().unwrap().1.will_init() { + bail!( + "Base image for {} at {} not found, but got {} WAL records", + key, + request_lsn, + data.records.len() + ); + } else { + let base_img = if let Some((_lsn, img)) = data.img { + trace!( + "found {} WAL records and a base image for {} at {}, performing WAL redo", + data.records.len(), + key, + request_lsn + ); + Some(img) + } else { + trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); + None + }; + + let last_rec_lsn = data.records.last().unwrap().0; + + let img = + self.walredo_mgr + .request_redo(key, request_lsn, base_img, data.records)?; + + if img.len() == page_cache::PAGE_SZ { + let cache = page_cache::get(); + cache.memorize_materialized_page( + self.tenant_id, + self.timeline_id, + key, + last_rec_lsn, + &img, + ); + } + + Ok(img) + } + } + } +} + +/// Helper function for get_reconstruct_data() to add the path of layers traversed +/// to an error, as anyhow context information. +fn layer_traversal_error( + msg: String, + path: Vec<(ValueReconstructResult, Lsn, Arc)>, +) -> anyhow::Result<()> { + // We want the original 'msg' to be the outermost context. The outermost context + // is the most high-level information, which also gets propagated to the client. + let mut msg_iter = path + .iter() + .map(|(r, c, l)| { + format!( + "layer traversal: result {:?}, cont_lsn {}, layer: {}", + r, + c, + l.filename().display() + ) + }) + .chain(std::iter::once(msg)); + // Construct initial message from the first traversed layer + let err = anyhow!(msg_iter.next().unwrap()); + + // Append all subsequent traversals, and the error message 'msg', as contexts. + Err(msg_iter.fold(err, |err, msg| err.context(msg))) +} + +struct LayeredTimelineWriter<'a> { + tl: &'a LayeredTimeline, + _write_guard: MutexGuard<'a, ()>, +} + +impl Deref for LayeredTimelineWriter<'_> { + type Target = dyn Timeline; + + fn deref(&self) -> &Self::Target { + self.tl + } +} + +impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { + fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> { + self.tl.put_value(key, lsn, value) + } + + fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()> { + self.tl.put_tombstone(key_range, lsn) + } + + /// + /// Remember the (end of) last valid WAL record remembered in the timeline. + /// + fn finish_write(&self, new_lsn: Lsn) { + self.tl.finish_write(new_lsn); + } +} + +/// Add a suffix to a layer file's name: .{num}.old +/// Uses the first available num (starts at 0) +fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { + let filename = path + .file_name() + .ok_or_else(|| anyhow!("Path {} don't have a file name", path.display()))? + .to_string_lossy(); + let mut new_path = path.clone(); + + for i in 0u32.. 
{ + new_path.set_file_name(format!("{}.{}.old", filename, i)); + if !new_path.exists() { + std::fs::rename(&path, &new_path)?; + return Ok(()); + } + } + + bail!("couldn't find an unused backup number for {:?}", path) +} + +/// Save timeline metadata to file +pub fn save_metadata( + conf: &'static PageServerConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, + data: &TimelineMetadata, + first_save: bool, +) -> Result<()> { + let _enter = info_span!("saving metadata").entered(); + let path = metadata_path(conf, timelineid, tenantid); + // use OpenOptions to ensure file presence is consistent with first_save + let mut file = VirtualFile::open_with_options( + &path, + OpenOptions::new().write(true).create_new(first_save), + )?; + + let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?; + + if file.write(&metadata_bytes)? != metadata_bytes.len() { + bail!("Could not write all the metadata bytes in a single call"); + } + file.sync_all()?; + + // fsync the parent directory to ensure the directory entry is durable + if first_save { + let timeline_dir = File::open( + &path + .parent() + .expect("Metadata should always have a parent dir"), + )?; + timeline_dir.sync_all()?; + } + + Ok(()) +} diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index ac5fb0bc8c..fe1ba4b5bb 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -176,7 +176,6 @@ use crate::{ layered_repository::{ ephemeral_file::is_ephemeral_file, metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, - LayeredRepository, }, storage_sync::{self, index::RemoteIndex}, tenant_mgr::attach_downloaded_tenants, @@ -1257,7 +1256,13 @@ async fn update_local_metadata( timeline_id, } = sync_id; tokio::task::spawn_blocking(move || { - LayeredRepository::save_metadata(conf, timeline_id, tenant_id, &cloned_metadata, true) + crate::layered_repository::save_metadata( + conf, + timeline_id, + tenant_id, + &cloned_metadata, + true, + ) }) .await .with_context(|| { From 5a4394a8df1a573a3e3fa27bcd7a792f4f00139d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 26 Jul 2022 22:21:05 +0300 Subject: [PATCH 0556/1022] Do not hold timelines lock while calling update_gc_info to avoid recusrive mutex lock and so deadlock (#2163) --- pageserver/src/layered_repository.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index ff230ed3c3..d770e736e9 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -752,7 +752,7 @@ impl LayeredRepository { // grab mutex to prevent new timelines from being created here. let gc_cs = self.gc_cs.lock().unwrap(); - let mut timelines = self.timelines.lock().unwrap(); + let timelines = self.timelines.lock().unwrap(); // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. @@ -789,15 +789,14 @@ impl LayeredRepository { }) .collect::>() }; + drop(timelines); // Ok, we now know all the branch points. // Update the GC information for each timeline. let mut gc_timelines = Vec::with_capacity(timeline_ids.len()); for timeline_id in timeline_ids { // Timeline is known to be local and loaded. - let timeline = self - .get_timeline_load_internal(timeline_id, &mut *timelines)? 
-                .expect("checked above that timeline is local and loaded");
+            let timeline = self.get_timeline_load(timeline_id)?;
 
             // If target_timeline is specified, ignore all other timelines
             if let Some(target_timelineid) = target_timeline_id {
@@ -819,7 +818,6 @@ impl LayeredRepository {
                 gc_timelines.push(timeline);
             }
         }
-        drop(timelines);
         drop(gc_cs);
 
         // Perform GC for each timeline.

From d6f12cff8e1dd858a961369d6caf1e8ea8345759 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Wed, 27 Jul 2022 10:26:21 +0300
Subject: [PATCH 0557/1022] Make DatadirTimeline a trait, implemented by
 LayeredTimeline.

Previously, DatadirTimeline was a separate struct, with a 1:1 relationship
between each DatadirTimeline and its LayeredTimeline. That was a bit
awkward: whenever you created a timeline, you also needed to create the
DatadirTimeline wrapper around it, and if you only had a reference to the
LayeredTimeline, you had to look up the corresponding DatadirTimeline
struct through tenant_mgr::get_local_timeline_with_load(). There were a
couple of calls like that from LayeredTimeline itself.

Refactor DatadirTimeline into a trait, and implement it for
LayeredTimeline. That way there is only one object, LayeredTimeline, and
both the Timeline and DatadirTimeline functions can be called on it.
DatadirTimeline functions can now also be called from LayeredTimeline
itself.

I considered moving all the functions from DatadirTimeline directly to
Timeline/LayeredTimeline, but I still like having some separation:
Timeline provides a simple key-value API and handles durably storing
key/value pairs and branching, whereas DatadirTimeline is stateless and
provides an abstraction over the key-value store, presenting an interface
built around Postgres concepts such as relations and databases.

This simplified the logical size calculation fast-path for branch
creation, introduced in commit 28243d68e6. LayeredTimeline can now access
the ancestor's logical size directly, so the caller no longer needs to
pass it in. I moved the fast-path into the init_logical_size() function
itself: it checks whether the ancestor's last LSN is the same as the
branch point, i.e. whether there have been any changes on the ancestor
after the branch, and if not, copies the size from there.

An additional bonus is that the optimization now works any time you have a
branch of another branch with no changes from the ancestor, not only at a
create-branch command.
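To make the shape of this refactoring concrete, here is a minimal,
self-contained sketch of the pattern (illustration only: the names
KvTimeline, DirTimeline and MemTimeline are invented, and the real
Timeline/DatadirTimeline traits have different signatures and error
handling). The idea is a low-level key-value trait plus a higher-level
trait whose methods all have default bodies, so a concrete type opts in
with an empty impl block and exposes both APIs on the same object:

    // Simplified illustration only; not the actual pageserver traits.

    /// Low-level key-value API, analogous to `Timeline`.
    trait KvTimeline {
        fn get(&self, key: u64) -> Option<Vec<u8>>;
        fn last_lsn(&self) -> u64;
    }

    /// Higher-level, stateless API layered on top, analogous to
    /// `DatadirTimeline`. All methods have default bodies, so an
    /// implementor only needs `impl DirTimeline for T {}`.
    trait DirTimeline: KvTimeline {
        fn get_rel_size(&self, rel_key: u64) -> usize {
            // Provided method: expressed entirely through the lower-level API.
            self.get(rel_key).map_or(0, |v| v.len())
        }
    }

    struct MemTimeline {
        data: std::collections::HashMap<u64, Vec<u8>>,
        lsn: u64,
    }

    impl KvTimeline for MemTimeline {
        fn get(&self, key: u64) -> Option<Vec<u8>> {
            self.data.get(&key).cloned()
        }
        fn last_lsn(&self) -> u64 {
            self.lsn
        }
    }

    // One line exposes the higher-level API on the same object, mirroring
    // `impl DatadirTimeline for LayeredTimeline {}` in the diff below.
    impl DirTimeline for MemTimeline {}

    fn main() {
        let tl = MemTimeline {
            data: [(1u64, vec![0u8; 8192])].into_iter().collect(),
            lsn: 42,
        };
        // Both APIs are callable on one value; no wrapper struct is needed.
        println!("lsn={} rel size={}", tl.last_lsn(), tl.get_rel_size(1));
    }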
--- pageserver/src/basebackup.rs | 26 +- pageserver/src/import_datadir.rs | 32 ++- pageserver/src/layered_repository.rs | 38 +-- pageserver/src/layered_repository/timeline.rs | 206 ++++++++++----- pageserver/src/lib.rs | 3 +- pageserver/src/page_service.rs | 58 ++--- pageserver/src/pgdatadir_mapping.rs | 243 ++++++------------ pageserver/src/repository.rs | 4 +- pageserver/src/tenant_mgr.rs | 28 +- pageserver/src/timelines.rs | 38 ++- pageserver/src/walingest.rs | 72 +++--- .../src/walreceiver/connection_manager.rs | 20 +- .../src/walreceiver/walreceiver_connection.rs | 5 +- 13 files changed, 361 insertions(+), 412 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 3ec1ec9243..5837447ce8 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -23,8 +23,7 @@ use tar::{Builder, EntryType, Header}; use tracing::*; use crate::reltag::{RelTag, SlruKind}; -use crate::repository::Timeline; -use crate::DatadirTimelineImpl; +use crate::DatadirTimeline; use postgres_ffi::xlog_utils::*; use postgres_ffi::*; use utils::lsn::Lsn; @@ -32,12 +31,13 @@ use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. -pub struct Basebackup<'a, W> +pub struct Basebackup<'a, W, T> where W: Write, + T: DatadirTimeline, { ar: Builder>, - timeline: &'a Arc, + timeline: &'a Arc, pub lsn: Lsn, prev_record_lsn: Lsn, full_backup: bool, @@ -52,17 +52,18 @@ where // * When working without safekeepers. In this situation it is important to match the lsn // we are taking basebackup on with the lsn that is used in pageserver's walreceiver // to start the replication. -impl<'a, W> Basebackup<'a, W> +impl<'a, W, T> Basebackup<'a, W, T> where W: Write, + T: DatadirTimeline, { pub fn new( write: W, - timeline: &'a Arc, + timeline: &'a Arc, req_lsn: Option, prev_lsn: Option, full_backup: bool, - ) -> Result> { + ) -> Result> { // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev). We include prev_record_lsn in the @@ -79,13 +80,13 @@ where let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", req_lsn); - timeline.tline.wait_lsn(req_lsn)?; + timeline.wait_lsn(req_lsn)?; // If the requested point is the end of the timeline, we can // provide prev_lsn. (get_last_record_rlsn() might return it as // zero, though, if no WAL has been generated on this timeline // yet.) - let end_of_timeline = timeline.tline.get_last_record_rlsn(); + let end_of_timeline = timeline.get_last_record_rlsn(); if req_lsn == end_of_timeline.last { (end_of_timeline.prev, req_lsn) } else { @@ -93,7 +94,7 @@ where } } else { // Backup was requested at end of the timeline. 
- let end_of_timeline = timeline.tline.get_last_record_rlsn(); + let end_of_timeline = timeline.get_last_record_rlsn(); (end_of_timeline.prev, end_of_timeline.last) }; @@ -371,7 +372,7 @@ where // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { - if self.lsn == self.timeline.tline.get_ancestor_lsn() { + if self.lsn == self.timeline.get_ancestor_lsn() { write!(zenith_signal, "PREV LSN: none")?; } else { write!(zenith_signal, "PREV LSN: invalid")?; @@ -402,9 +403,10 @@ where } } -impl<'a, W> Drop for Basebackup<'a, W> +impl<'a, W, T> Drop for Basebackup<'a, W, T> where W: Write, + T: DatadirTimeline, { /// If the basebackup was not finished, prevent the Archive::drop() from /// writing the end-of-archive marker. diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 6402657e05..ccfd83400a 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -13,8 +13,6 @@ use walkdir::WalkDir; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; -use crate::repository::Repository; -use crate::repository::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; use postgres_ffi::relfile_utils::*; @@ -30,9 +28,9 @@ use utils::lsn::Lsn; /// This is currently only used to import a cluster freshly created by initdb. /// The code that deals with the checkpoint would not work right if the /// cluster was not shut down cleanly. -pub fn import_timeline_from_postgres_datadir( +pub fn import_timeline_from_postgres_datadir( path: &Path, - tline: &mut DatadirTimeline, + tline: &T, lsn: Lsn, ) -> Result<()> { let mut pg_control: Option = None; @@ -90,8 +88,8 @@ pub fn import_timeline_from_postgres_datadir( } // subroutine of import_timeline_from_postgres_datadir(), to load one relation file. -fn import_rel( - modification: &mut DatadirModification, +fn import_rel( + modification: &mut DatadirModification, path: &Path, spcoid: Oid, dboid: Oid, @@ -170,8 +168,8 @@ fn import_rel( /// Import an SLRU segment file /// -fn import_slru( - modification: &mut DatadirModification, +fn import_slru( + modification: &mut DatadirModification, slru: SlruKind, path: &Path, mut reader: Reader, @@ -226,9 +224,9 @@ fn import_slru( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. -fn import_wal( +fn import_wal( walpath: &Path, - tline: &mut DatadirTimeline, + tline: &T, startpoint: Lsn, endpoint: Lsn, ) -> Result<()> { @@ -297,8 +295,8 @@ fn import_wal( Ok(()) } -pub fn import_basebackup_from_tar( - tline: &mut DatadirTimeline, +pub fn import_basebackup_from_tar( + tline: &T, reader: Reader, base_lsn: Lsn, ) -> Result<()> { @@ -339,8 +337,8 @@ pub fn import_basebackup_from_tar( Ok(()) } -pub fn import_wal_from_tar( - tline: &mut DatadirTimeline, +pub fn import_wal_from_tar( + tline: &T, reader: Reader, start_lsn: Lsn, end_lsn: Lsn, @@ -420,8 +418,8 @@ pub fn import_wal_from_tar( Ok(()) } -pub fn import_file( - modification: &mut DatadirModification, +pub fn import_file( + modification: &mut DatadirModification, file_path: &Path, reader: Reader, len: usize, @@ -540,7 +538,7 @@ pub fn import_file( // zenith.signal is not necessarily the last file, that we handle // but it is ok to call `finish_write()`, because final `modification.commit()` // will update lsn once more to the final one. 
- let writer = modification.tline.tline.writer(); + let writer = modification.tline.writer(); writer.finish_write(prev_lsn); debug!("imported zenith signal {}", prev_lsn); diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index d770e736e9..c500b05e66 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -32,7 +32,6 @@ use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline}; -use crate::tenant_mgr; use crate::thread_mgr; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; @@ -181,7 +180,6 @@ impl Repository for LayeredRepository { self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, - None, ); timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); @@ -246,20 +244,6 @@ impl Repository for LayeredRepository { )); } } - // Copy logical size from source timeline if we are branching on the last position. - let init_logical_size = - if let Ok(src_pgdir) = tenant_mgr::get_local_timeline_with_load(self.tenant_id, src) { - let logical_size = src_pgdir.get_current_logical_size(); - // Check LSN after getting logical size to exclude race condition - // when ancestor timeline is concurrently updated - if src_timeline.get_last_record_lsn() == start_lsn { - Some(logical_size) - } else { - None - } - } else { - None - }; // Determine prev-LSN for the new timeline. We can only determine it if // the timeline was branched at the current end of the source timeline. @@ -290,14 +274,7 @@ impl Repository for LayeredRepository { ); crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; timeline::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; - timelines.insert( - dst, - LayeredTimelineEntry::Unloaded { - id: dst, - metadata, - init_logical_size, - }, - ); + timelines.insert(dst, LayeredTimelineEntry::Unloaded { id: dst, metadata }); info!("branched timeline {} from {} at {}", dst, src, start_lsn); @@ -433,7 +410,7 @@ impl Repository for LayeredRepository { // we need to get metadata of a timeline, another option is to pass it along with Downloaded status let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; // finally we make newly downloaded timeline visible to repository - entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, init_logical_size: None }) + entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata }) }, }; Ok(()) @@ -551,18 +528,13 @@ impl LayeredRepository { timelineid: ZTimelineId, timelines: &mut HashMap, ) -> anyhow::Result>> { - let logical_size: Option; match timelines.get(&timelineid) { Some(entry) => match entry { LayeredTimelineEntry::Loaded(local_timeline) => { debug!("timeline {} found loaded into memory", &timelineid); return Ok(Some(Arc::clone(local_timeline))); } - LayeredTimelineEntry::Unloaded { - init_logical_size, .. - } => { - logical_size = *init_logical_size; - } + LayeredTimelineEntry::Unloaded { .. 
} => {} }, None => { debug!("timeline {} not found", &timelineid); @@ -573,7 +545,7 @@ impl LayeredRepository { "timeline {} found on a local disk, but not loaded into the memory, loading", &timelineid ); - let timeline = self.load_local_timeline(timelineid, timelines, logical_size)?; + let timeline = self.load_local_timeline(timelineid, timelines)?; let was_loaded = timelines.insert( timelineid, LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), @@ -590,7 +562,6 @@ impl LayeredRepository { &self, timeline_id: ZTimelineId, timelines: &mut HashMap, - init_logical_size: Option, ) -> anyhow::Result> { let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) .context("failed to load metadata")?; @@ -617,7 +588,6 @@ impl LayeredRepository { self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, - init_logical_size, ); timeline .load_layer_map(disk_consistent_lsn) diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index e862b7def7..bdc74160aa 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -14,7 +14,7 @@ use std::fs::{File, OpenOptions}; use std::io::Write; use std::ops::{Deref, Range}; use std::path::PathBuf; -use std::sync::atomic::{self, AtomicBool}; +use std::sync::atomic::{self, AtomicBool, AtomicIsize, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError}; use std::time::{Duration, SystemTime}; @@ -39,6 +39,7 @@ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::tenant_config::TenantConfOpt; +use crate::DatadirTimeline; use postgres_ffi::xlog_utils::to_pg_timestamp; use utils::{ @@ -49,7 +50,6 @@ use utils::{ use crate::repository::{GcResult, RepositoryTimeline, Timeline, TimelineWriter}; use crate::repository::{Key, Value}; -use crate::tenant_mgr; use crate::thread_mgr; use crate::virtual_file::VirtualFile; use crate::walreceiver::IS_WAL_RECEIVER; @@ -122,7 +122,6 @@ pub enum LayeredTimelineEntry { Unloaded { id: ZTimelineId, metadata: TimelineMetadata, - init_logical_size: Option, }, } @@ -269,11 +268,21 @@ pub struct LayeredTimeline { // though lets keep them both for better error visibility. pub initdb_lsn: Lsn, - // Initial logical size of timeline (if known). - // Logical size can be copied from ancestor timeline when new branch is create at last LSN - pub init_logical_size: Option, + /// When did we last calculate the partitioning? + partitioning: Mutex<(KeyPartitioning, Lsn)>, + + /// Configuration: how often should the partitioning be recalculated. + repartition_threshold: u64, + + /// Current logical size of the "datadir", at the last LSN. + current_logical_size: AtomicIsize, } +/// Inherit all the functions from DatadirTimeline, to provide the +/// functionality to store PostgreSQL relations, SLRUs, etc. in a +/// LayeredTimeline. +impl DatadirTimeline for LayeredTimeline {} + /// /// Information about how much history needs to be retained, needed by /// Garbage Collection. 
@@ -472,7 +481,6 @@ impl LayeredTimeline { tenant_id: ZTenantId, walredo_mgr: Arc, upload_layers: bool, - init_logical_size: Option, ) -> LayeredTimeline { let reconstruct_time_histo = RECONSTRUCT_TIME .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) @@ -508,7 +516,7 @@ impl LayeredTimeline { .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) .unwrap(); - LayeredTimeline { + let mut result = LayeredTimeline { conf, tenant_conf, timeline_id, @@ -551,8 +559,13 @@ impl LayeredTimeline { latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), - init_logical_size, - } + + current_logical_size: AtomicIsize::new(0), + partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), + repartition_threshold: 0, + }; + result.repartition_threshold = result.get_checkpoint_distance() / 10; + result } /// @@ -634,6 +647,58 @@ impl LayeredTimeline { Ok(()) } + /// (Re-)calculate the logical size of the database at the latest LSN. + /// + /// This can be a slow operation. + pub fn init_logical_size(&self) -> Result<()> { + // Try a fast-path first: + // Copy logical size from ancestor timeline if there has been no changes on this + // branch, and no changes on the ancestor branch since the branch point. + if self.get_ancestor_lsn() == self.get_last_record_lsn() && self.ancestor_timeline.is_some() + { + let ancestor = self.get_ancestor_timeline()?; + let ancestor_logical_size = ancestor.get_current_logical_size(); + // Check LSN after getting logical size to exclude race condition + // when ancestor timeline is concurrently updated. + // + // Logical size 0 means that it was not initialized, so don't believe that. + if ancestor_logical_size != 0 && ancestor.get_last_record_lsn() == self.ancestor_lsn { + self.current_logical_size + .store(ancestor_logical_size as isize, AtomicOrdering::SeqCst); + debug!( + "logical size copied from ancestor: {}", + ancestor_logical_size + ); + return Ok(()); + } + } + + // Have to calculate it the hard way + let last_lsn = self.get_last_record_lsn(); + let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?; + self.current_logical_size + .store(logical_size as isize, AtomicOrdering::SeqCst); + debug!("calculated logical size the hard way: {}", logical_size); + Ok(()) + } + + /// Retrieve current logical size of the timeline + /// + /// NOTE: counted incrementally, includes ancestors, + pub fn get_current_logical_size(&self) -> usize { + let current_logical_size = self.current_logical_size.load(AtomicOrdering::Acquire); + match usize::try_from(current_logical_size) { + Ok(sz) => sz, + Err(_) => { + error!( + "current_logical_size is out of range: {}", + current_logical_size + ); + 0 + } + } + } + /// /// Get a handle to a Layer for reading. /// @@ -1003,18 +1068,16 @@ impl LayeredTimeline { // files instead. This is possible as long as *all* the data imported into the // repository have the same LSN. let lsn_range = frozen_layer.get_lsn_range(); - let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn - && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) - { - let pgdir = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)?; - let (partitioning, _lsn) = - pgdir.repartition(self.initdb_lsn, self.get_compaction_target_size())?; - self.create_image_layers(&partitioning, self.initdb_lsn, true)? - } else { - // normal case, write out a L0 delta layer file. 
- let delta_path = self.create_delta_layer(&frozen_layer)?; - HashSet::from([delta_path]) - }; + let layer_paths_to_upload = + if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { + let (partitioning, _lsn) = + self.repartition(self.initdb_lsn, self.get_compaction_target_size())?; + self.create_image_layers(&partitioning, self.initdb_lsn, true)? + } else { + // normal case, write out a L0 delta layer file. + let delta_path = self.create_delta_layer(&frozen_layer)?; + HashSet::from([delta_path]) + }; fail_point!("flush-frozen-before-sync"); @@ -1186,38 +1249,56 @@ impl LayeredTimeline { let target_file_size = self.get_checkpoint_distance(); // Define partitioning schema if needed - if let Ok(pgdir) = - tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) - { - // 2. Create new image layers for partitions that have been modified - // "enough". - let (partitioning, lsn) = pgdir.repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - )?; - let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; - if !layer_paths_to_upload.is_empty() - && self.upload_layers.load(atomic::Ordering::Relaxed) - { - storage_sync::schedule_layer_upload( - self.tenant_id, - self.timeline_id, - HashSet::from_iter(layer_paths_to_upload), - None, - ); - } - // 3. Compact - let timer = self.compact_time_histo.start_timer(); - self.compact_level0(target_file_size)?; - timer.stop_and_record(); - } else { - debug!("Could not compact because no partitioning specified yet"); - } + match self.repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + ) { + Ok((partitioning, lsn)) => { + // 2. Create new image layers for partitions that have been modified + // "enough". + let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; + if !layer_paths_to_upload.is_empty() + && self.upload_layers.load(atomic::Ordering::Relaxed) + { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + HashSet::from_iter(layer_paths_to_upload), + None, + ); + } + + // 3. Compact + let timer = self.compact_time_histo.start_timer(); + self.compact_level0(target_file_size)?; + timer.stop_and_record(); + } + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. + error!("could not compact, repartitioning keyspace failed: {err:?}"); + } + }; Ok(()) } + fn repartition(&self, lsn: Lsn, partition_size: u64) -> Result<(KeyPartitioning, Lsn)> { + let mut partitioning_guard = self.partitioning.lock().unwrap(); + if partitioning_guard.1 == Lsn(0) + || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold + { + let keyspace = self.collect_keyspace(lsn)?; + let partitioning = keyspace.partition(partition_size); + *partitioning_guard = (partitioning, lsn); + return Ok((partitioning_guard.0.clone(), lsn)); + } + Ok((partitioning_guard.0.clone(), partitioning_guard.1)) + } + // Is it time to create a new image layer for the given partition? fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result { let layers = self.layers.read().unwrap(); @@ -1626,19 +1707,21 @@ impl LayeredTimeline { // Calculate pitr cutoff point. // If we cannot determine a cutoff LSN, be conservative and don't GC anything. 
- let mut pitr_cutoff_lsn: Lsn = *self.get_latest_gc_cutoff_lsn(); + let mut pitr_cutoff_lsn: Lsn; + + if pitr != Duration::ZERO { + // conservative, safe default is to remove nothing, when we have no + // commit timestamp data available + pitr_cutoff_lsn = *self.get_latest_gc_cutoff_lsn(); - if let Ok(timeline) = - tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) - { - let now = SystemTime::now(); // First, calculate pitr_cutoff_timestamp and then convert it to LSN. // If we don't have enough data to convert to LSN, // play safe and don't remove any layers. + let now = SystemTime::now(); if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - match timeline.find_lsn_for_timestamp(pitr_timestamp)? { + match self.find_lsn_for_timestamp(pitr_timestamp)? { LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, LsnForTimestamp::Future(lsn) => { debug!("future({})", lsn); @@ -1653,9 +1736,10 @@ impl LayeredTimeline { } debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) } - } else if cfg!(test) { - // We don't have local timeline in mocked cargo tests. - // So, just ignore pitr_interval setting in this case. + } else { + // No time-based retention. (Some unit tests depend on garbage-collection + // working even when CLOG data is missing, so that find_lsn_for_timestamp() + // above doesn't work.) pitr_cutoff_lsn = gc_info.horizon_cutoff; } gc_info.pitr_cutoff = pitr_cutoff_lsn; @@ -1962,6 +2046,12 @@ impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { fn finish_write(&self, new_lsn: Lsn) { self.tl.finish_write(new_lsn); } + + fn update_current_logical_size(&self, delta: isize) { + self.tl + .current_logical_size + .fetch_add(delta, AtomicOrdering::SeqCst); + } } /// Add a suffix to a layer file's name: .{num}.old diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c9c00d75e2..4ecb181553 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -63,8 +63,7 @@ pub enum CheckpointConfig { } pub type RepositoryImpl = LayeredRepository; - -pub type DatadirTimelineImpl = DatadirTimeline; +pub type TimelineImpl = ::Timeline; pub fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint thread. This prevents new connections from diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 3dba207ab9..c8aa4b35e8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -30,7 +30,6 @@ use utils::{ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; -use crate::layered_repository::LayeredRepository; use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp}; use crate::profiling::profpoint_start; use crate::reltag::RelTag; @@ -555,9 +554,6 @@ impl PageServerHandler { info!("creating new timeline"); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?; - let repartition_distance = repo.get_checkpoint_distance(); - let mut datadir_timeline = - DatadirTimeline::::new(timeline, repartition_distance); // TODO mark timeline as not ready until it reaches end_lsn. 
// We might have some wal to import as well, and we should prevent compute @@ -573,7 +569,7 @@ impl PageServerHandler { info!("importing basebackup"); pgb.write_message(&BeMessage::CopyInResponse)?; let reader = CopyInReader::new(pgb); - import_basebackup_from_tar(&mut datadir_timeline, reader, base_lsn)?; + import_basebackup_from_tar(&*timeline, reader, base_lsn)?; // TODO check checksum // Meanwhile you can verify client-side by taking fullbackup @@ -583,7 +579,7 @@ impl PageServerHandler { // Flush data to disk, then upload to s3 info!("flushing layers"); - datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?; + timeline.checkpoint(CheckpointConfig::Flush)?; info!("done"); Ok(()) @@ -605,10 +601,6 @@ impl PageServerHandler { let timeline = repo.get_timeline_load(timeline_id)?; ensure!(timeline.get_last_record_lsn() == start_lsn); - let repartition_distance = repo.get_checkpoint_distance(); - let mut datadir_timeline = - DatadirTimeline::::new(timeline, repartition_distance); - // TODO leave clean state on error. For now you can use detach to clean // up broken state from a failed import. @@ -616,16 +608,16 @@ impl PageServerHandler { info!("importing wal"); pgb.write_message(&BeMessage::CopyInResponse)?; let reader = CopyInReader::new(pgb); - import_wal_from_tar(&mut datadir_timeline, reader, start_lsn, end_lsn)?; + import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn)?; // TODO Does it make sense to overshoot? - ensure!(datadir_timeline.tline.get_last_record_lsn() >= end_lsn); + ensure!(timeline.get_last_record_lsn() >= end_lsn); // Flush data to disk, then upload to s3. No need for a forced checkpoint. // We only want to persist the data, and it doesn't matter if it's in the // shape of deltas or images. info!("flushing layers"); - datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?; + timeline.checkpoint(CheckpointConfig::Flush)?; info!("done"); Ok(()) @@ -643,8 +635,8 @@ impl PageServerHandler { /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. - fn wait_or_get_last_lsn( - timeline: &DatadirTimeline, + fn wait_or_get_last_lsn( + timeline: &T, mut lsn: Lsn, latest: bool, latest_gc_cutoff_lsn: &RwLockReadGuard, @@ -671,7 +663,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.tline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn)?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. 
(Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -681,7 +673,7 @@ impl PageServerHandler { if lsn == Lsn(0) { bail!("invalid LSN(0) in request"); } - timeline.tline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn)?; } ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -691,14 +683,14 @@ impl PageServerHandler { Ok(lsn) } - fn handle_get_rel_exists_request( + fn handle_get_rel_exists_request( &self, - timeline: &DatadirTimeline, + timeline: &T, req: &PagestreamExistsRequest, ) -> Result { let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; let exists = timeline.get_rel_exists(req.rel, lsn)?; @@ -708,13 +700,13 @@ impl PageServerHandler { })) } - fn handle_get_nblocks_request( + fn handle_get_nblocks_request( &self, - timeline: &DatadirTimeline, + timeline: &T, req: &PagestreamNblocksRequest, ) -> Result { let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered(); - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; let n_blocks = timeline.get_rel_size(req.rel, lsn)?; @@ -724,13 +716,13 @@ impl PageServerHandler { })) } - fn handle_db_size_request( + fn handle_db_size_request( &self, - timeline: &DatadirTimeline, + timeline: &T, req: &PagestreamDbSizeRequest, ) -> Result { let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered(); - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; let total_blocks = @@ -743,14 +735,14 @@ impl PageServerHandler { })) } - fn handle_get_page_at_lsn_request( + fn handle_get_page_at_lsn_request( &self, - timeline: &DatadirTimeline, + timeline: &T, req: &PagestreamGetPageRequest, ) -> Result { let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) .entered(); - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; /* // Add a 1s delay to some requests. 
The delayed causes the requests to @@ -783,7 +775,7 @@ impl PageServerHandler { // check that the timeline exists let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) .context("Cannot load local timeline")?; - let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) @@ -921,7 +913,7 @@ impl postgres_backend::Handler for PageServerHandler { let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) .context("Cannot load local timeline")?; - let end_of_timeline = timeline.tline.get_last_record_rlsn(); + let end_of_timeline = timeline.get_last_record_rlsn(); pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor::text_col(b"prev_lsn"), @@ -1139,7 +1131,7 @@ impl postgres_backend::Handler for PageServerHandler { let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) .context("Couldn't load timeline")?; - timeline.tline.compact()?; + timeline.compact()?; pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; @@ -1160,7 +1152,7 @@ impl postgres_backend::Handler for PageServerHandler { .context("Cannot load local timeline")?; // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). - timeline.tline.checkpoint(CheckpointConfig::Forced)?; + timeline.checkpoint(CheckpointConfig::Forced)?; pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index f703fa16af..61aca8d4ba 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,10 +6,10 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! -use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceAccum}; +use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::reltag::{RelTag, SlruKind}; +use crate::repository::Timeline; use crate::repository::*; -use crate::repository::{Repository, Timeline}; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; @@ -18,34 +18,12 @@ use postgres_ffi::{pg_constants, Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::ops::Range; -use std::sync::atomic::{AtomicIsize, Ordering}; -use std::sync::{Arc, Mutex, RwLockReadGuard}; -use tracing::{debug, error, trace, warn}; +use tracing::{debug, trace, warn}; use utils::{bin_ser::BeSer, lsn::Lsn}; /// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type. pub type BlockNumber = u32; -pub struct DatadirTimeline -where - R: Repository, -{ - /// The underlying key-value store. Callers should not read or modify the - /// data in the underlying store directly. However, it is exposed to have - /// access to information like last-LSN, ancestor, and operations like - /// compaction. - pub tline: Arc, - - /// When did we last calculate the partitioning? - partitioning: Mutex<(KeyPartitioning, Lsn)>, - - /// Configuration: how often should the partitioning be recalculated. - repartition_threshold: u64, - - /// Current logical size of the "datadir", at the last LSN. 
- current_logical_size: AtomicIsize, -} - #[derive(Debug)] pub enum LsnForTimestamp { Present(Lsn), @@ -54,34 +32,24 @@ pub enum LsnForTimestamp { NoData(Lsn), } -impl DatadirTimeline { - pub fn new(tline: Arc, repartition_threshold: u64) -> Self { - DatadirTimeline { - tline, - partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), - current_logical_size: AtomicIsize::new(0), - repartition_threshold, - } - } - - /// (Re-)calculate the logical size of the database at the latest LSN. - /// - /// This can be a slow operation. - pub fn init_logical_size(&self) -> Result<()> { - let last_lsn = self.tline.get_last_record_lsn(); - self.current_logical_size.store( - self.get_current_logical_size_non_incremental(last_lsn)? as isize, - Ordering::SeqCst, - ); - Ok(()) - } - - /// Set timeline logical size. - pub fn set_logical_size(&self, size: usize) { - self.current_logical_size - .store(size as isize, Ordering::SeqCst); - } - +/// +/// This trait provides all the functionality to store PostgreSQL relations, SLRUs, +/// and other special kinds of files, in a versioned key-value store. The +/// Timeline trait provides the key-value store. +/// +/// This is a trait, so that we can easily include all these functions in a Timeline +/// implementation. You're not expected to have different implementations of this trait, +/// rather, this provides an interface and implementation, over Timeline. +/// +/// If you wanted to store other kinds of data in the Neon repository, e.g. +/// flat files or MySQL, you would create a new trait like this, with all the +/// functions that make sense for the kind of data you're storing. For flat files, +/// for example, you might have a function like "fn read(path, offset, size)". +/// We might also have that situation in the future, to support multiple PostgreSQL +/// versions, if there are big changes in how the data is organized in the data +/// directory, or if new special files are introduced. +/// +pub trait DatadirTimeline: Timeline { /// Start ingesting a WAL record, or other atomic modification of /// the timeline. /// @@ -102,7 +70,10 @@ impl DatadirTimeline { /// functions of the timeline until you finish! And if you update the /// same page twice, the last update wins. /// - pub fn begin_modification(&self) -> DatadirModification { + fn begin_modification(&self) -> DatadirModification + where + Self: Sized, + { DatadirModification { tline: self, pending_updates: HashMap::new(), @@ -116,7 +87,7 @@ impl DatadirTimeline { //------------------------------------------------------------------------------ /// Look up given page version. 
- pub fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result { + fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); let nblocks = self.get_rel_size(tag, lsn)?; @@ -129,11 +100,11 @@ impl DatadirTimeline { } let key = rel_block_to_key(tag, blknum); - self.tline.get(key, lsn) + self.get(key, lsn) } // Get size of a database in blocks - pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { let mut total_blocks = 0; let rels = self.list_rels(spcnode, dbnode, lsn)?; @@ -146,7 +117,7 @@ impl DatadirTimeline { } /// Get size of a relation file - pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { + fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); if (tag.forknum == pg_constants::FSM_FORKNUM @@ -161,17 +132,17 @@ impl DatadirTimeline { } let key = rel_size_to_key(tag); - let mut buf = self.tline.get(key, lsn)?; + let mut buf = self.get(key, lsn)?; Ok(buf.get_u32_le()) } /// Does relation exist? - pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { + fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; let dir = RelDirectory::des(&buf)?; let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); @@ -180,10 +151,10 @@ impl DatadirTimeline { } /// Get a list of all existing relations in given tablespace and database. - pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { + fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; let dir = RelDirectory::des(&buf)?; let rels: HashSet = @@ -198,7 +169,7 @@ impl DatadirTimeline { } /// Look up given SLRU page version. - pub fn get_slru_page_at_lsn( + fn get_slru_page_at_lsn( &self, kind: SlruKind, segno: u32, @@ -206,26 +177,21 @@ impl DatadirTimeline { lsn: Lsn, ) -> Result { let key = slru_block_to_key(kind, segno, blknum); - self.tline.get(key, lsn) + self.get(key, lsn) } /// Get size of an SLRU segment - pub fn get_slru_segment_size( - &self, - kind: SlruKind, - segno: u32, - lsn: Lsn, - ) -> Result { + fn get_slru_segment_size(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.tline.get(key, lsn)?; + let mut buf = self.get(key, lsn)?; Ok(buf.get_u32_le()) } /// Get size of an SLRU segment - pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { + fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; let dir = SlruSegmentDirectory::des(&buf)?; let exists = dir.segments.get(&segno).is_some(); @@ -239,10 +205,10 @@ impl DatadirTimeline { /// so it's not well defined which LSN you get if there were multiple commits /// "in flight" at that point in time. 
/// - pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { - let gc_cutoff_lsn_guard = self.tline.get_latest_gc_cutoff_lsn(); + fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { + let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; - let max_lsn = self.tline.get_last_record_lsn(); + let max_lsn = self.get_last_record_lsn(); // LSNs are always 8-byte aligned. low/mid/high represent the // LSN divided by 8. @@ -333,88 +299,51 @@ impl DatadirTimeline { } /// Get a list of SLRU segments - pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { + fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; let dir = SlruSegmentDirectory::des(&buf)?; Ok(dir.segments) } - pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { let key = relmap_file_key(spcnode, dbnode); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; Ok(buf) } - pub fn list_dbdirs(&self, lsn: Lsn) -> Result> { + fn list_dbdirs(&self, lsn: Lsn) -> Result> { // fetch directory entry - let buf = self.tline.get(DBDIR_KEY, lsn)?; + let buf = self.get(DBDIR_KEY, lsn)?; let dir = DbDirectory::des(&buf)?; Ok(dir.dbdirs) } - pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { + fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { let key = twophase_file_key(xid); - let buf = self.tline.get(key, lsn)?; + let buf = self.get(key, lsn)?; Ok(buf) } - pub fn list_twophase_files(&self, lsn: Lsn) -> Result> { + fn list_twophase_files(&self, lsn: Lsn) -> Result> { // fetch directory entry - let buf = self.tline.get(TWOPHASEDIR_KEY, lsn)?; + let buf = self.get(TWOPHASEDIR_KEY, lsn)?; let dir = TwoPhaseDirectory::des(&buf)?; Ok(dir.xids) } - pub fn get_control_file(&self, lsn: Lsn) -> Result { - self.tline.get(CONTROLFILE_KEY, lsn) + fn get_control_file(&self, lsn: Lsn) -> Result { + self.get(CONTROLFILE_KEY, lsn) } - pub fn get_checkpoint(&self, lsn: Lsn) -> Result { - self.tline.get(CHECKPOINT_KEY, lsn) - } - - /// Get the LSN of the last ingested WAL record. - /// - /// This is just a convenience wrapper that calls through to the underlying - /// repository. - pub fn get_last_record_lsn(&self) -> Lsn { - self.tline.get_last_record_lsn() - } - - /// Check that it is valid to request operations with that lsn. - /// - /// This is just a convenience wrapper that calls through to the underlying - /// repository. - pub fn check_lsn_is_in_scope( - &self, - lsn: Lsn, - latest_gc_cutoff_lsn: &RwLockReadGuard, - ) -> Result<()> { - self.tline.check_lsn_is_in_scope(lsn, latest_gc_cutoff_lsn) - } - - /// Retrieve current logical size of the timeline - /// - /// NOTE: counted incrementally, includes ancestors, - pub fn get_current_logical_size(&self) -> usize { - let current_logical_size = self.current_logical_size.load(Ordering::Acquire); - match usize::try_from(current_logical_size) { - Ok(sz) => sz, - Err(_) => { - error!( - "current_logical_size is out of range: {}", - current_logical_size - ); - 0 - } - } + fn get_checkpoint(&self, lsn: Lsn) -> Result { + self.get(CHECKPOINT_KEY, lsn) } /// Does the same as get_current_logical_size but counted on demand. @@ -422,16 +351,16 @@ impl DatadirTimeline { /// /// Only relation blocks are counted currently. 
That excludes metadata, /// SLRUs, twophase files etc. - pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { + fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { // Fetch list of database dirs and iterate them - let buf = self.tline.get(DBDIR_KEY, lsn)?; + let buf = self.get(DBDIR_KEY, lsn)?; let dbdir = DbDirectory::des(&buf)?; let mut total_size: usize = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { for rel in self.list_rels(*spcnode, *dbnode, lsn)? { let relsize_key = rel_size_to_key(rel); - let mut buf = self.tline.get(relsize_key, lsn)?; + let mut buf = self.get(relsize_key, lsn)?; let relsize = buf.get_u32_le(); total_size += relsize as usize; @@ -452,7 +381,7 @@ impl DatadirTimeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.tline.get(DBDIR_KEY, lsn)?; + let buf = self.get(DBDIR_KEY, lsn)?; let dbdir = DbDirectory::des(&buf)?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); @@ -469,7 +398,7 @@ impl DatadirTimeline { rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); - let mut buf = self.tline.get(relsize_key, lsn)?; + let mut buf = self.get(relsize_key, lsn)?; let relsize = buf.get_u32_le(); result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); @@ -485,13 +414,13 @@ impl DatadirTimeline { ] { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); - let buf = self.tline.get(slrudir_key, lsn)?; + let buf = self.get(slrudir_key, lsn)?; let dir = SlruSegmentDirectory::des(&buf)?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.tline.get(segsize_key, lsn)?; + let mut buf = self.get(segsize_key, lsn)?; let segsize = buf.get_u32_le(); result.add_range( @@ -503,7 +432,7 @@ impl DatadirTimeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.tline.get(TWOPHASEDIR_KEY, lsn)?; + let buf = self.get(TWOPHASEDIR_KEY, lsn)?; let twophase_dir = TwoPhaseDirectory::des(&buf)?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); @@ -516,30 +445,17 @@ impl DatadirTimeline { Ok(result.to_keyspace()) } - - pub fn repartition(&self, lsn: Lsn, partition_size: u64) -> Result<(KeyPartitioning, Lsn)> { - let mut partitioning_guard = self.partitioning.lock().unwrap(); - if partitioning_guard.1 == Lsn(0) - || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold - { - let keyspace = self.collect_keyspace(lsn)?; - let partitioning = keyspace.partition(partition_size); - *partitioning_guard = (partitioning, lsn); - return Ok((partitioning_guard.0.clone(), lsn)); - } - Ok((partitioning_guard.0.clone(), partitioning_guard.1)) - } } /// DatadirModification represents an operation to ingest an atomic set of /// updates to the repository. It is created by the 'begin_record' /// function. It is called for each WAL record, so that all the modifications /// by a one WAL record appear atomic. -pub struct DatadirModification<'a, R: Repository> { +pub struct DatadirModification<'a, T: DatadirTimeline> { /// The timeline this modification applies to. You can access this to /// read the state, but note that any pending updates are *not* reflected /// in the state in 'tline' yet. - pub tline: &'a DatadirTimeline, + pub tline: &'a T, // The modifications are not applied directly to the underlying key-value store. 
// The put-functions add the modifications here, and they are flushed to the @@ -549,7 +465,7 @@ pub struct DatadirModification<'a, R: Repository> { pending_nblocks: isize, } -impl<'a, R: Repository> DatadirModification<'a, R> { +impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { /// Initialize a completely new repository. /// /// This inserts the directory metadata entries that are assumed to @@ -934,7 +850,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { return Ok(()); } - let writer = self.tline.tline.writer(); + let writer = self.tline.writer(); // Flush relation and SLRU data blocks, keep metadata. let mut result: Result<()> = Ok(()); @@ -949,10 +865,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { result?; if pending_nblocks != 0 { - self.tline.current_logical_size.fetch_add( - pending_nblocks * pg_constants::BLCKSZ as isize, - Ordering::SeqCst, - ); + writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize); self.pending_nblocks = 0; } @@ -965,7 +878,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { /// All the modifications in this atomic update are stamped by the specified LSN. /// pub fn commit(&mut self, lsn: Lsn) -> Result<()> { - let writer = self.tline.tline.writer(); + let writer = self.tline.writer(); let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; @@ -980,10 +893,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { writer.finish_write(lsn); if pending_nblocks != 0 { - self.tline.current_logical_size.fetch_add( - pending_nblocks * pg_constants::BLCKSZ as isize, - Ordering::SeqCst, - ); + writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize); } Ok(()) @@ -1010,7 +920,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { } } else { let last_lsn = self.tline.get_last_record_lsn(); - self.tline.tline.get(key, last_lsn) + self.tline.get(key, last_lsn) } } @@ -1412,13 +1322,12 @@ fn is_slru_block_key(key: Key) -> bool { pub fn create_test_timeline( repo: R, timeline_id: utils::zid::ZTimelineId, -) -> Result>> { +) -> Result> { let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; - let tline = DatadirTimeline::new(tline, 256 * 1024); let mut m = tline.begin_modification(); m.init_empty()?; m.commit(Lsn(8))?; - Ok(Arc::new(tline)) + Ok(tline) } #[allow(clippy::bool_assert_comparison)] @@ -1491,7 +1400,7 @@ mod tests { .contains(&TESTREL_A)); // Run checkpoint and garbage collection and check that it's still not visible - newtline.tline.checkpoint(CheckpointConfig::Forced)?; + newtline.checkpoint(CheckpointConfig::Forced)?; repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?; assert!(!newtline diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 359c704e81..61058a7806 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -185,7 +185,7 @@ impl Value { /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. pub trait Repository: Send + Sync { - type Timeline: Timeline; + type Timeline: crate::DatadirTimeline; /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization. /// See [`crate::remote_storage`] for more details about the synchronization. @@ -405,6 +405,8 @@ pub trait TimelineWriter<'a> { /// the 'lsn' or anything older. The previous last record LSN is stored alongside /// the latest and can be read. 
fn finish_write(&self, lsn: Lsn); + + fn update_current_logical_size(&self, delta: isize); } #[cfg(test)] diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index a485e7c2cb..640dfa623a 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,7 +3,6 @@ use crate::config::PageServerConf; use crate::layered_repository::{load_metadata, LayeredRepository}; -use crate::pgdatadir_mapping::DatadirTimeline; use crate::repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; @@ -12,7 +11,7 @@ use crate::thread_mgr::ThreadKind; use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; use crate::{thread_mgr, timelines, walreceiver}; -use crate::{DatadirTimelineImpl, RepositoryImpl}; +use crate::{RepositoryImpl, TimelineImpl}; use anyhow::Context; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; @@ -101,7 +100,7 @@ struct Tenant { /// /// Local timelines have more metadata that's loaded into memory, /// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`]. - local_timelines: HashMap>, + local_timelines: HashMap::Timeline>>, } #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -178,7 +177,7 @@ pub enum LocalTimelineUpdate { }, Attach { id: ZTenantTimelineId, - datadir: Arc, + datadir: Arc<::Timeline>, }, } @@ -382,7 +381,7 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result anyhow::Result> { +) -> anyhow::Result> { let mut m = tenants_state::write_tenants(); let tenant = m .get_mut(&tenant_id) @@ -489,27 +488,18 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any fn load_local_timeline( repo: &RepositoryImpl, timeline_id: ZTimelineId, -) -> anyhow::Result>> { +) -> anyhow::Result> { let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| { format!("Inmem timeline {timeline_id} not found in tenant's repository") })?; - let repartition_distance = repo.get_checkpoint_distance() / 10; - let init_logical_size = inmem_timeline.init_logical_size; - let page_tline = Arc::new(DatadirTimelineImpl::new( - inmem_timeline, - repartition_distance, - )); - if let Some(logical_size) = init_logical_size { - page_tline.set_logical_size(logical_size); - } else { - page_tline.init_logical_size()?; - } + inmem_timeline.init_logical_size()?; + tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach { id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id), - datadir: Arc::clone(&page_tline), + datadir: Arc::clone(&inmem_timeline), }); - Ok(page_tline) + Ok(inmem_timeline) } #[serde_as] diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index a40e705cb9..984276bad2 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -26,7 +26,7 @@ use crate::{ repository::{LocalTimelineState, Repository}, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, - DatadirTimeline, RepositoryImpl, + DatadirTimeline, RepositoryImpl, TimelineImpl, }; use crate::{import_datadir, LOG_FILE_NAME}; use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager}; @@ -54,27 +54,27 @@ pub struct LocalTimelineInfo { } impl LocalTimelineInfo { - pub fn from_loaded_timeline( - datadir_tline: &DatadirTimeline, + pub fn from_loaded_timeline( + timeline: &TimelineImpl, include_non_incremental_logical_size: bool, ) -> anyhow::Result { - let 
last_record_lsn = datadir_tline.tline.get_last_record_lsn(); + let last_record_lsn = timeline.get_last_record_lsn(); let info = LocalTimelineInfo { - ancestor_timeline_id: datadir_tline.tline.get_ancestor_timeline_id(), + ancestor_timeline_id: timeline.get_ancestor_timeline_id(), ancestor_lsn: { - match datadir_tline.tline.get_ancestor_lsn() { + match timeline.get_ancestor_lsn() { Lsn(0) => None, lsn @ Lsn(_) => Some(lsn), } }, - disk_consistent_lsn: datadir_tline.tline.get_disk_consistent_lsn(), + disk_consistent_lsn: timeline.get_disk_consistent_lsn(), last_record_lsn, - prev_record_lsn: Some(datadir_tline.tline.get_prev_record_lsn()), - latest_gc_cutoff_lsn: *datadir_tline.tline.get_latest_gc_cutoff_lsn(), + prev_record_lsn: Some(timeline.get_prev_record_lsn()), + latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), timeline_state: LocalTimelineState::Loaded, - current_logical_size: Some(datadir_tline.get_current_logical_size()), + current_logical_size: Some(timeline.get_current_logical_size()), current_logical_size_non_incremental: if include_non_incremental_logical_size { - Some(datadir_tline.get_current_logical_size_non_incremental(last_record_lsn)?) + Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?) } else { None }, @@ -109,9 +109,8 @@ impl LocalTimelineInfo { ) -> anyhow::Result { match repo_timeline { RepositoryTimeline::Loaded(_) => { - let datadir_tline = - tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id)?; - Self::from_loaded_timeline(&datadir_tline, include_non_incremental_logical_size) + let timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id)?; + Self::from_loaded_timeline(&*timeline, include_non_incremental_logical_size) } RepositoryTimeline::Unloaded { metadata } => Ok(Self::from_unloaded_timeline(metadata)), } @@ -298,19 +297,18 @@ fn bootstrap_timeline( // Initdb lsn will be equal to last_record_lsn which will be set after import. // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. let timeline = repo.create_empty_timeline(tli, lsn)?; - let mut page_tline: DatadirTimeline = DatadirTimeline::new(timeline, u64::MAX); - import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?; + import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; fail::fail_point!("before-checkpoint-new-timeline", |_| { bail!("failpoint before-checkpoint-new-timeline"); }); - page_tline.tline.checkpoint(CheckpointConfig::Forced)?; + timeline.checkpoint(CheckpointConfig::Forced)?; info!( "created root timeline {} timeline.lsn {}", tli, - page_tline.tline.get_last_record_lsn() + timeline.get_last_record_lsn() ); // Remove temp dir. We don't need it anymore @@ -389,7 +387,7 @@ pub(crate) fn create_timeline( // load the timeline into memory let loaded_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; - LocalTimelineInfo::from_loaded_timeline(&loaded_timeline, false) + LocalTimelineInfo::from_loaded_timeline(&*loaded_timeline, false) .context("cannot fill timeline info")? } None => { @@ -397,7 +395,7 @@ pub(crate) fn create_timeline( // load the timeline into memory let new_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; - LocalTimelineInfo::from_loaded_timeline(&new_timeline, false) + LocalTimelineInfo::from_loaded_timeline(&*new_timeline, false) .context("cannot fill timeline info")? 
} }; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index adc24328ae..8dd14ec177 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -34,7 +34,6 @@ use std::collections::HashMap; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; -use crate::repository::Repository; use crate::walrecord::*; use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment; use postgres_ffi::xlog_utils::*; @@ -44,8 +43,8 @@ use utils::lsn::Lsn; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); -pub struct WalIngest<'a, R: Repository> { - timeline: &'a DatadirTimeline, +pub struct WalIngest<'a, T: DatadirTimeline> { + timeline: &'a T, checkpoint: CheckPoint, checkpoint_modified: bool, @@ -53,8 +52,8 @@ pub struct WalIngest<'a, R: Repository> { relsize_cache: HashMap, } -impl<'a, R: Repository> WalIngest<'a, R> { - pub fn new(timeline: &DatadirTimeline, startpoint: Lsn) -> Result> { +impl<'a, T: DatadirTimeline> WalIngest<'a, T> { + pub fn new(timeline: &T, startpoint: Lsn) -> Result> { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. let checkpoint_bytes = timeline.get_checkpoint(startpoint)?; @@ -80,7 +79,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { &mut self, recdata: Bytes, lsn: Lsn, - modification: &mut DatadirModification, + modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, ) -> Result<()> { decode_wal_record(recdata, decoded).context("failed decoding wal record")?; @@ -268,7 +267,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_decoded_block( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, @@ -328,7 +327,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_heapam_record( &mut self, buf: &mut Bytes, - modification: &mut DatadirModification, + modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, ) -> Result<()> { // Handle VM bit updates that are implicitly part of heap records. @@ -472,7 +471,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record. fn ingest_xlog_dbase_create( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlCreateDatabase, ) -> Result<()> { let db_id = rec.db_id; @@ -539,7 +538,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_xlog_smgr_create( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlSmgrCreate, ) -> Result<()> { let rel = RelTag { @@ -557,7 +556,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { /// This is the same logic as in PostgreSQL's smgr_redo() function. 
fn ingest_xlog_smgr_truncate( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlSmgrTruncate, ) -> Result<()> { let spcnode = rec.rnode.spcnode; @@ -622,7 +621,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { /// fn ingest_xact_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, parsed: &XlXactParsedRecord, is_commit: bool, ) -> Result<()> { @@ -691,7 +690,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_clog_truncate_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlClogTruncate, ) -> Result<()> { info!( @@ -749,7 +748,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_multixact_create_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlMultiXactCreate, ) -> Result<()> { // Create WAL record for updating the multixact-offsets page @@ -828,7 +827,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_multixact_truncate_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlMultiXactTruncate, ) -> Result<()> { self.checkpoint.oldestMulti = xlrec.end_trunc_off; @@ -862,7 +861,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn ingest_relmap_page( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlRelmapUpdate, decoded: &DecodedWALRecord, ) -> Result<()> { @@ -878,7 +877,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_creation( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, ) -> Result<()> { self.relsize_cache.insert(rel, 0); @@ -888,7 +887,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_page_image( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, img: Bytes, @@ -900,7 +899,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_wal_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, rec: ZenithWalRecord, @@ -912,7 +911,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_truncation( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, nblocks: BlockNumber, ) -> Result<()> { @@ -923,7 +922,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_rel_drop( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, ) -> Result<()> { modification.put_rel_drop(rel)?; @@ -948,7 +947,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn handle_rel_extend( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, ) -> Result<()> { @@ -986,7 +985,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn put_slru_page_image( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, kind: SlruKind, segno: u32, blknum: BlockNumber, @@ -999,7 +998,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { fn handle_slru_extend( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, kind: SlruKind, segno: u32, blknum: BlockNumber, @@ -1052,6 +1051,7 @@ mod tests { use super::*; use crate::pgdatadir_mapping::create_test_timeline; use crate::repository::repo_harness::*; + use 
crate::repository::Timeline; use postgres_ffi::pg_constants; /// Arbitrary relation tag, for testing. @@ -1062,13 +1062,13 @@ mod tests { forknum: 0, }; - fn assert_current_logical_size(_timeline: &DatadirTimeline, _lsn: Lsn) { + fn assert_current_logical_size(_timeline: &T, _lsn: Lsn) { // TODO } static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - fn init_walingest_test(tline: &DatadirTimeline) -> Result> { + fn init_walingest_test(tline: &T) -> Result> { let mut m = tline.begin_modification(); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file @@ -1082,7 +1082,7 @@ mod tests { fn test_relsize() -> Result<()> { let repo = RepoHarness::create("test_relsize")?.load(); let tline = create_test_timeline(repo, TIMELINE_ID)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(); walingest.put_rel_creation(&mut m, TESTREL_A)?; @@ -1098,7 +1098,7 @@ mod tests { walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?; m.commit(Lsn(0x50))?; - assert_current_logical_size(&tline, Lsn(0x50)); + assert_current_logical_size(&*tline, Lsn(0x50)); // The relation was created at LSN 2, not visible at LSN 1 yet. assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); @@ -1145,7 +1145,7 @@ mod tests { let mut m = tline.begin_modification(); walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?; m.commit(Lsn(0x60))?; - assert_current_logical_size(&tline, Lsn(0x60)); + assert_current_logical_size(&*tline, Lsn(0x60)); // Check reported size and contents after truncation assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 2); @@ -1210,7 +1210,7 @@ mod tests { fn test_drop_extend() -> Result<()> { let repo = RepoHarness::create("test_drop_extend")?.load(); let tline = create_test_timeline(repo, TIMELINE_ID)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(); walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; @@ -1250,7 +1250,7 @@ mod tests { fn test_truncate_extend() -> Result<()> { let repo = RepoHarness::create("test_truncate_extend")?.load(); let tline = create_test_timeline(repo, TIMELINE_ID)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&*tline)?; // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; @@ -1338,7 +1338,7 @@ mod tests { fn test_large_rel() -> Result<()> { let repo = RepoHarness::create("test_large_rel")?.load(); let tline = create_test_timeline(repo, TIMELINE_ID)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; for blknum in 0..pg_constants::RELSEG_SIZE + 1 { @@ -1349,7 +1349,7 @@ mod tests { m.commit(Lsn(lsn))?; } - assert_current_logical_size(&tline, Lsn(lsn)); + assert_current_logical_size(&*tline, Lsn(lsn)); assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, @@ -1365,7 +1365,7 @@ mod tests { tline.get_rel_size(TESTREL_A, Lsn(lsn))?, pg_constants::RELSEG_SIZE ); - assert_current_logical_size(&tline, Lsn(lsn)); + assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate another block lsn += 0x10; @@ -1376,7 +1376,7 @@ mod tests { tline.get_rel_size(TESTREL_A, Lsn(lsn))?, pg_constants::RELSEG_SIZE - 1 ); - assert_current_logical_size(&tline, Lsn(lsn)); + assert_current_logical_size(&*tline, 
Lsn(lsn)); // Truncate to 1500, and then truncate all the way down to 0, one block at a time // This tests the behavior at segment boundaries @@ -1393,7 +1393,7 @@ mod tests { size -= 1; } - assert_current_logical_size(&tline, Lsn(lsn)); + assert_current_logical_size(&*tline, Lsn(lsn)); Ok(()) } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 614bca50ad..f2aa7ce2cf 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -25,7 +25,8 @@ use etcd_broker::{ use tokio::select; use tracing::*; -use crate::DatadirTimelineImpl; +use crate::repository::{Repository, Timeline}; +use crate::{RepositoryImpl, TimelineImpl}; use utils::{ lsn::Lsn, pq_proto::ReplicationFeedback, @@ -39,7 +40,7 @@ pub(super) fn spawn_connection_manager_task( id: ZTenantTimelineId, broker_loop_prefix: String, mut client: Client, - local_timeline: Arc, + local_timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, @@ -245,7 +246,7 @@ async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) { struct WalreceiverState { id: ZTenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. - local_timeline: Arc, + local_timeline: Arc, /// The timeout on the connection to safekeeper for WAL streaming. wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. @@ -283,7 +284,7 @@ struct EtcdSkTimeline { impl WalreceiverState { fn new( id: ZTenantTimelineId, - local_timeline: Arc, + local_timeline: Arc<::Timeline>, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, @@ -1203,13 +1204,10 @@ mod tests { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, - local_timeline: Arc::new(DatadirTimelineImpl::new( - harness - .load() - .create_empty_timeline(TIMELINE_ID, Lsn(0)) - .expect("Failed to create an empty timeline for dummy wal connection manager"), - 10_000, - )), + local_timeline: harness + .load() + .create_empty_timeline(TIMELINE_ID, Lsn(0)) + .expect("Failed to create an empty timeline for dummy wal connection manager"), wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), max_lsn_wal_lag: NonZeroU64::new(1).unwrap(), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index cc1a9cc5eb..ca29c00771 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -20,6 +20,7 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; use crate::{ http::models::WalReceiverEntry, + pgdatadir_mapping::DatadirTimeline, repository::{Repository, Timeline}, tenant_mgr, walingest::WalIngest, @@ -177,7 +178,7 @@ pub async fn handle_walreceiver_connection( caught_up = true; } - let timeline_to_check = Arc::clone(&timeline.tline); + let timeline_to_check = Arc::clone(&timeline); tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) .await .with_context(|| { @@ -225,7 +226,7 @@ pub async fn handle_walreceiver_connection( // The last LSN we processed. It is not guaranteed to survive pageserver crash. 
let write_lsn = u64::from(last_lsn); // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data - let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn()); + let flush_lsn = u64::from(timeline.get_disk_consistent_lsn()); // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. let apply_lsn = u64::from(timeline_remote_consistent_lsn); From fd46e52e00dfdaa1e63c40cf0e8836d9f10470cb Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 27 Jul 2022 12:28:05 +0300 Subject: [PATCH 0558/1022] Switch staging storage to dedicated etcd (#2164) --- .github/ansible/staging.hosts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/staging.hosts b/.github/ansible/staging.hosts index 35e77513df..2bb28f1972 100644 --- a/.github/ansible/staging.hosts +++ b/.github/ansible/staging.hosts @@ -17,4 +17,4 @@ env_name = us-stage console_mgmt_base_url = http://console-staging.local bucket_name = zenith-staging-storage-us-east-1 bucket_region = us-east-1 -etcd_endpoints = etcd-staging.local:2379 +etcd_endpoints = zenith-us-stage-etcd.local:2379 From f6f29f58cd8178ac452b276ececa692e79dda85e Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 27 Jul 2022 16:41:25 +0300 Subject: [PATCH 0559/1022] Switch production storage to dedicated etcd (#2169) --- .github/ansible/production.hosts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/production.hosts b/.github/ansible/production.hosts index d22ce0e37e..364e8ed50e 100644 --- a/.github/ansible/production.hosts +++ b/.github/ansible/production.hosts @@ -17,4 +17,4 @@ env_name = prod-1 console_mgmt_base_url = http://console-release.local bucket_name = zenith-storage-oregon bucket_region = us-west-2 -etcd_endpoints = etcd-release.local:2379 +etcd_endpoints = zenith-1-etcd.local:2379 From 6a664629fa4834a8c9c1a00d3c729d924f19ad45 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Wed, 27 Jul 2022 12:36:46 -0400 Subject: [PATCH 0560/1022] Add timeline physical size tracking (#2126) Ref #1902. - Track the layered timeline's `physical_size` using `pageserver_current_physical_size` metric when updating the layer map. - Report the local timeline's `physical_size` in timeline GET APIs. - Add `include-non-incremental-physical-size` URL flag to also report the local timeline's `physical_size_non_incremental` (similar to `logical_size_non_incremental`) - Add a `UIntGaugeVec` and `UIntGauge` to represent `u64` prometheus metrics Co-authored-by: Dmitry Rodionov --- libs/metrics/src/lib.rs | 14 ++ pageserver/src/http/openapi_spec.yml | 14 ++ pageserver/src/http/routes.rs | 25 ++-- pageserver/src/layered_repository/timeline.rs | 80 +++++++++-- pageserver/src/repository.rs | 5 + pageserver/src/timelines.rs | 24 +++- .../batch_others/test_timeline_size.py | 131 ++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 8 +- test_runner/fixtures/utils.py | 37 ++++- 9 files changed, 315 insertions(+), 23 deletions(-) diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 3b5da9f7ff..ea24b3fe7e 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -3,6 +3,9 @@ //! Otherwise, we might not see all metrics registered via //! a default registry. 
use lazy_static::lazy_static; +use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec}; +pub use prometheus::opts; +pub use prometheus::register; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_gauge, Gauge}; @@ -18,6 +21,17 @@ pub use prometheus::{Encoder, TextEncoder}; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; +pub type UIntGauge = GenericGauge; +pub type UIntGaugeVec = GenericGaugeVec; + +#[macro_export] +macro_rules! register_uint_gauge_vec { + ($NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{ + let gauge_vec = UIntGaugeVec::new($crate::opts!($NAME, $HELP), $LABELS_NAMES).unwrap(); + $crate::register(Box::new(gauge_vec.clone())).map(|_| gauge_vec) + }}; +} + /// Gathers all Prometheus metrics and records the I/O stats just before that. /// /// Metrics gathering is a relatively simple and standalone operation, so diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 2775a27e0f..46305a4855 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -78,6 +78,11 @@ paths: schema: type: string description: Controls calculation of current_logical_size_non_incremental + - name: include-non-incremental-physical-size + in: query + schema: + type: string + description: Controls calculation of current_physical_size_non_incremental get: description: Get timelines for tenant responses: @@ -136,6 +141,11 @@ paths: schema: type: string description: Controls calculation of current_logical_size_non_incremental + - name: include-non-incremental-physical-size + in: query + schema: + type: string + description: Controls calculation of current_physical_size_non_incremental responses: "200": description: TimelineInfo @@ -671,8 +681,12 @@ components: format: hex current_logical_size: type: integer + current_physical_size: + type: integer current_logical_size_non_incremental: type: integer + current_physical_size_non_incremental: + type: integer WalReceiverEntry: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 236415cf58..8ac3faca7a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -113,10 +113,17 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); + let include_non_incremental_logical_size = + query_param_present(&request, "include-non-incremental-logical-size"); + let include_non_incremental_physical_size = + query_param_present(&request, "include-non-incremental-physical-size"); let local_timeline_infos = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); - crate::timelines::get_local_timelines(tenant_id, include_non_incremental_logical_size) + crate::timelines::get_local_timelines( + tenant_id, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) }) .await .map_err(ApiError::from_err)??; @@ -145,17 +152,15 @@ async fn timeline_list_handler(request: Request) -> Result, json_response(StatusCode::OK, response_data) } -// Gate non incremental logical size calculation behind a flag -// after pgbench -i -s100 calculation took 28ms so if multiplied by the number of timelines -// and 
tenants it can take noticeable amount of time. Also the value currently used only in tests -fn get_include_non_incremental_logical_size(request: &Request) -> bool { +/// Checks if a query param is present in the request's URL +fn query_param_present(request: &Request, param: &str) -> bool { request .uri() .query() .map(|v| { url::form_urlencoded::parse(v.as_bytes()) .into_owned() - .any(|(param, _)| param == "include-non-incremental-logical-size") + .any(|(p, _)| p == param) }) .unwrap_or(false) } @@ -165,7 +170,10 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result u64 { + self.current_physical_size_gauge.get() + } + + fn get_physical_size_non_incremental(&self) -> anyhow::Result { + let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + // total size of layer files in the current timeline directory + let mut total_physical_size = 0; + + for direntry in fs::read_dir(timeline_path)? { + let direntry = direntry?; + let fname = direntry.file_name(); + let fname = fname.to_string_lossy(); + + if ImageFileName::parse_str(&fname).is_some() + || DeltaFileName::parse_str(&fname).is_some() + { + total_physical_size += direntry.metadata()?.len(); + } + } + + Ok(total_physical_size) + } } impl LayeredTimeline { @@ -515,6 +553,9 @@ impl LayeredTimeline { let wait_lsn_time_histo = WAIT_LSN_TIME .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) .unwrap(); + let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) + .unwrap(); let mut result = LayeredTimeline { conf, @@ -544,6 +585,7 @@ impl LayeredTimeline { create_images_time_histo, last_record_gauge, wait_lsn_time_histo, + current_physical_size_gauge, upload_layers: AtomicBool::new(upload_layers), @@ -579,6 +621,8 @@ impl LayeredTimeline { // Scan timeline directory and create ImageFileName and DeltaFilename // structs representing all files on disk let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + // total size of layer files in the current timeline directory + let mut total_physical_size = 0; for direntry in fs::read_dir(timeline_path)? 
{ let direntry = direntry?; @@ -601,6 +645,7 @@ impl LayeredTimeline { ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); trace!("found layer {}", layer.filename().display()); + total_physical_size += layer.path().metadata()?.len(); layers.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { @@ -624,6 +669,7 @@ impl LayeredTimeline { DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); trace!("found layer {}", layer.filename().display()); + total_physical_size += layer.path().metadata()?.len(); layers.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { @@ -640,9 +686,10 @@ impl LayeredTimeline { layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); info!( - "loaded layer map with {} layers at {}", - num_layers, disk_consistent_lsn + "loaded layer map with {} layers at {}, total physical size: {}", + num_layers, disk_consistent_lsn, total_physical_size ); + self.current_physical_size_gauge.set(total_physical_size); Ok(()) } @@ -1203,8 +1250,12 @@ impl LayeredTimeline { layers.insert_historic(Arc::new(new_delta)); } + // update the timeline's physical size + let sz = new_delta_path.metadata()?.len(); + self.current_physical_size_gauge.add(sz); + // update metrics NUM_PERSISTENT_FILES_CREATED.inc_by(1); - PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); + PERSISTENT_BYTES_WRITTEN.inc_by(sz); Ok(new_delta_path) } @@ -1390,6 +1441,8 @@ impl LayeredTimeline { let mut layers = self.layers.write().unwrap(); for l in image_layers { + self.current_physical_size_gauge + .add(l.path().metadata()?.len()); layers.insert_historic(Arc::new(l)); } drop(layers); @@ -1635,19 +1688,27 @@ impl LayeredTimeline { let mut layers = self.layers.write().unwrap(); let mut new_layer_paths = HashSet::with_capacity(new_layers.len()); for l in new_layers { - new_layer_paths.insert(l.path()); + let new_delta_path = l.path(); + + // update the timeline's physical size + self.current_physical_size_gauge + .add(new_delta_path.metadata()?.len()); + + new_layer_paths.insert(new_delta_path); layers.insert_historic(Arc::new(l)); } // Now that we have reshuffled the data to set of new delta layers, we can // delete the old ones let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len()); - for l in &deltas_to_compact { - l.delete()?; + drop(all_keys_iter); + for l in deltas_to_compact { if let Some(path) = l.local_path() { + self.current_physical_size_gauge.sub(path.metadata()?.len()); layer_paths_do_delete.insert(path); } - layers.remove_historic(l.clone()); + l.delete()?; + layers.remove_historic(l); } drop(layers); @@ -1899,10 +1960,11 @@ impl LayeredTimeline { // while iterating it. 
BTreeMap::retain() would be another option) let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len()); for doomed_layer in layers_to_remove { - doomed_layer.delete()?; if let Some(path) = doomed_layer.local_path() { + self.current_physical_size_gauge.sub(path.metadata()?.len()); layer_paths_to_delete.insert(path); } + doomed_layer.delete()?; layers.remove_historic(doomed_layer); result.layers_removed += 1; } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 61058a7806..0ca8c6150c 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -382,6 +382,11 @@ pub trait Timeline: Send + Sync { lsn: Lsn, latest_gc_cutoff_lsn: &RwLockReadGuard, ) -> Result<()>; + + /// Get the physical size of the timeline at the latest LSN + fn get_physical_size(&self) -> u64; + /// Get the physical size of the timeline at the latest LSN non incrementally + fn get_physical_size_non_incremental(&self) -> Result; } /// Various functions to mutate the timeline. diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 984276bad2..1088e516aa 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -49,7 +49,9 @@ pub struct LocalTimelineInfo { #[serde_as(as = "DisplayFromStr")] pub disk_consistent_lsn: Lsn, pub current_logical_size: Option, // is None when timeline is Unloaded + pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, + pub current_physical_size_non_incremental: Option, pub timeline_state: LocalTimelineState, } @@ -57,6 +59,7 @@ impl LocalTimelineInfo { pub fn from_loaded_timeline( timeline: &TimelineImpl, include_non_incremental_logical_size: bool, + include_non_incremental_physical_size: bool, ) -> anyhow::Result { let last_record_lsn = timeline.get_last_record_lsn(); let info = LocalTimelineInfo { @@ -72,12 +75,18 @@ impl LocalTimelineInfo { prev_record_lsn: Some(timeline.get_prev_record_lsn()), latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), timeline_state: LocalTimelineState::Loaded, + current_physical_size: Some(timeline.get_physical_size()), current_logical_size: Some(timeline.get_current_logical_size()), current_logical_size_non_incremental: if include_non_incremental_logical_size { Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?) } else { None }, + current_physical_size_non_incremental: if include_non_incremental_physical_size { + Some(timeline.get_physical_size_non_incremental()?) 
+ } else { + None + }, }; Ok(info) } @@ -97,7 +106,9 @@ impl LocalTimelineInfo { latest_gc_cutoff_lsn: metadata.latest_gc_cutoff_lsn(), timeline_state: LocalTimelineState::Unloaded, current_logical_size: None, + current_physical_size: None, current_logical_size_non_incremental: None, + current_physical_size_non_incremental: None, } } @@ -106,11 +117,16 @@ impl LocalTimelineInfo { timeline_id: ZTimelineId, repo_timeline: &RepositoryTimeline, include_non_incremental_logical_size: bool, + include_non_incremental_physical_size: bool, ) -> anyhow::Result { match repo_timeline { RepositoryTimeline::Loaded(_) => { let timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id)?; - Self::from_loaded_timeline(&*timeline, include_non_incremental_logical_size) + Self::from_loaded_timeline( + &*timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) } RepositoryTimeline::Unloaded { metadata } => Ok(Self::from_unloaded_timeline(metadata)), } @@ -320,6 +336,7 @@ fn bootstrap_timeline( pub(crate) fn get_local_timelines( tenant_id: ZTenantId, include_non_incremental_logical_size: bool, + include_non_incremental_physical_size: bool, ) -> Result> { let repo = tenant_mgr::get_repository_for_tenant(tenant_id) .with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?; @@ -334,6 +351,7 @@ pub(crate) fn get_local_timelines( timeline_id, &repository_timeline, include_non_incremental_logical_size, + include_non_incremental_physical_size, )?, )) } @@ -387,7 +405,7 @@ pub(crate) fn create_timeline( // load the timeline into memory let loaded_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; - LocalTimelineInfo::from_loaded_timeline(&*loaded_timeline, false) + LocalTimelineInfo::from_loaded_timeline(&*loaded_timeline, false, false) .context("cannot fill timeline info")? } None => { @@ -395,7 +413,7 @@ pub(crate) fn create_timeline( // load the timeline into memory let new_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; - LocalTimelineInfo::from_loaded_timeline(&*new_timeline, false) + LocalTimelineInfo::from_loaded_timeline(&*new_timeline, false, false) .context("cannot fill timeline info")? 
} }; diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 7b7b16bcbf..c3788a0e9b 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -1,10 +1,15 @@ from contextlib import closing +import pathlib +from uuid import UUID +import re import psycopg2.extras import psycopg2.errors from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local from fixtures.log_helper import log import time +from fixtures.utils import get_timeline_dir_size + def test_timeline_size(neon_simple_env: NeonEnv): env = neon_simple_env @@ -176,3 +181,129 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): cur.execute("SELECT * from pg_size_pretty(pg_cluster_size())") pg_cluster_size = cur.fetchone() log.info(f"pg_cluster_size = {pg_cluster_size}") + + +def test_timeline_physical_size_init(neon_simple_env: NeonEnv): + env = neon_simple_env + new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_init') + pg = env.postgres.create_start("test_timeline_physical_size_init") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 1000) g""", + ]) + + # restart the pageserer to force calculating timeline's initial physical size + env.pageserver.stop() + env.pageserver.start() + + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): + env = neon_simple_env + new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_post_checkpoint') + pg = env.postgres.create_start("test_timeline_physical_size_post_checkpoint") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 1000) g""", + ]) + + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): + # Disable background compaction as we don't want it to happen after `get_physical_size` request + # and before checking the expected size on disk, which makes the assertion failed + neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='10m'}" + + env = neon_env_builder.init_start() + + new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_post_compaction') + pg = env.postgres.create_start("test_timeline_physical_size_post_compaction") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""", + ]) + + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + env.pageserver.safe_psql(f"compact {env.initial_tenant.hex} {new_timeline_id.hex}") + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): + # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request + # and before checking the expected size on disk, which makes the assertion failed + neon_env_builder.pageserver_config_override = \ + "tenant_config={checkpoint_distance=100000, compaction_period='10m', gc_period='10m', 
pitr_interval='1s'}" + + env = neon_env_builder.init_start() + + new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_post_gc') + pg = env.postgres.create_start("test_timeline_physical_size_post_gc") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""", + ]) + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + pg.safe_psql(""" + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """) + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + + env.pageserver.safe_psql(f"do_gc {env.initial_tenant.hex} {new_timeline_id.hex} 0") + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_metric(neon_simple_env: NeonEnv): + env = neon_simple_env + + new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_metric') + pg = env.postgres.create_start("test_timeline_physical_size_metric") + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""", + ]) + + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + + # get the metrics and parse the metric for the current timeline's physical size + metrics = env.pageserver.http_client().get_metrics() + matches = re.search( + f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$', + metrics, + re.MULTILINE) + assert matches + + # assert that the metric matches the actual physical size on disk + tl_physical_size_metric = int(matches.group(1)) + timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id) + assert tl_physical_size_metric == get_timeline_dir_size(timeline_path) + + +def assert_physical_size(env: NeonEnv, tenant_id: UUID, timeline_id: UUID): + """Check the current physical size returned from timeline API + matches the total physical size of the timeline on disk""" + client = env.pageserver.http_client() + res = assert_timeline_local(client, tenant_id, timeline_id) + timeline_path = env.timeline_dir(tenant_id, timeline_id) + assert res["local"]["current_physical_size"] == res["local"][ + "current_physical_size_non_incremental"] + assert res["local"]["current_physical_size"] == get_timeline_dir_size(timeline_path) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b1fba29e3b..4913f0b456 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -691,6 +691,10 @@ class NeonEnv: """ Get list of safekeeper endpoints suitable for safekeepers GUC """ return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers]) + def timeline_dir(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Path: + """Get a timeline directory's path based on the repo directory of the test environment""" + return self.repo_dir / "tenants" / tenant_id.hex / "timelines" / timeline_id.hex + @cached_property def auth_keys(self) -> AuthKeys: pub = (Path(self.repo_dir) / 'auth_public_key.pem').read_bytes() @@ -863,8 +867,8 @@ class NeonPageserverHttpClient(requests.Session): def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: res = self.get( - 
f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1" - ) + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}" + + "?include-non-incremental-logical-size=1&include-non-incremental-physical-size=1") self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index c49fa08d77..bc50a43ada 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,9 +1,11 @@ +import contextlib import os +import pathlib import shutil import subprocess from pathlib import Path -from typing import Any, List +from typing import Any, List, Tuple from fixtures.log_helper import log @@ -89,3 +91,36 @@ def get_dir_size(path: str) -> int: pass # file could be concurrently removed return totalbytes + + +def get_timeline_dir_size(path: pathlib.Path) -> int: + """Get the timeline directory's total size, which only counts the layer files' size.""" + sz = 0 + for dir_entry in path.iterdir(): + with contextlib.suppress(Exception): + # file is an image layer + _ = parse_image_layer(dir_entry.name) + sz += dir_entry.stat().st_size + continue + + with contextlib.suppress(Exception): + # file is a delta layer + _ = parse_delta_layer(dir_entry.name) + sz += dir_entry.stat().st_size + continue + return sz + + +def parse_image_layer(f_name: str) -> Tuple[int, int, int]: + """Parse an image layer file name. Return key start, key end, and snapshot lsn""" + parts = f_name.split("__") + key_parts = parts[0].split("-") + return int(key_parts[0], 16), int(key_parts[1], 16), int(parts[1], 16) + + +def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: + """Parse a delta layer file name. Return key start, key end, lsn start, and lsn end""" + parts = f_name.split("__") + key_parts = parts[0].split("-") + lsn_parts = parts[1].split("-") + return int(key_parts[0], 16), int(key_parts[1], 16), int(lsn_parts[0], 16), int(lsn_parts[1], 16) From 01f1f1c1bfcbb1d1ef0e28782fcec138ddd9ac05 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 27 Jul 2022 20:29:22 +0200 Subject: [PATCH 0561/1022] Add OpenAPI spec for safekeeper HTTP API (neondatabase/cloud#1264, #2061) This spec is used in the `cloud` repo to generate HTTP client. 
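
As an illustration only (not part of this patch): a rough, hand-written sketch of how a client built against this spec could query a safekeeper, assuming the default server URL http://localhost:7676 and the JWT bearer auth declared in the spec. The token value and the tenant/timeline ids below are placeholders, and the `cloud` repo would normally go through its generated client rather than raw requests.

    # Hypothetical usage sketch against the endpoints defined in the spec below;
    # the ids and the token are placeholders.
    import requests

    SAFEKEEPER_URL = "http://localhost:7676"      # default server URL from the spec
    headers = {"Authorization": "Bearer <jwt>"}   # spec declares JWT bearer security

    # GET /v1/status returns SafekeeperStatus with the node id
    status = requests.get(f"{SAFEKEEPER_URL}/v1/status", headers=headers)
    status.raise_for_status()
    print("safekeeper id:", status.json()["id"])

    # GET /v1/tenant/{tenant_id}/timeline/{timeline_id} returns TimelineStatus
    tenant_id = "<tenant hex id>"
    timeline_id = "<timeline hex id>"
    tl = requests.get(f"{SAFEKEEPER_URL}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
                      headers=headers)
    tl.raise_for_status()
    info = tl.json()
    print("flush_lsn:", info.get("flush_lsn"), "commit_lsn:", info.get("commit_lsn"))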
--- .github/ansible/scripts/init_safekeeper.sh | 5 +- control_plane/src/safekeeper.rs | 3 +- safekeeper/src/http/models.rs | 3 +- safekeeper/src/http/openapi_spec.yaml | 365 +++++++++++++++++++++ safekeeper/src/http/routes.rs | 11 +- test_runner/fixtures/neon_fixtures.py | 2 +- 6 files changed, 377 insertions(+), 12 deletions(-) create mode 100644 safekeeper/src/http/openapi_spec.yaml diff --git a/.github/ansible/scripts/init_safekeeper.sh b/.github/ansible/scripts/init_safekeeper.sh index 2297788f59..a9b5025562 100644 --- a/.github/ansible/scripts/init_safekeeper.sh +++ b/.github/ansible/scripts/init_safekeeper.sh @@ -12,10 +12,9 @@ cat <, } diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml new file mode 100644 index 0000000000..da225f244b --- /dev/null +++ b/safekeeper/src/http/openapi_spec.yaml @@ -0,0 +1,365 @@ +openapi: "3.0.2" +info: + title: Safekeeper control API + version: "1.0" + + +servers: + - url: "http://localhost:7676" + + +paths: + /v1/status: + get: + tags: + - "Info" + summary: Get safekeeper status + description: "" + operationId: v1GetSafekeeperStatus + responses: + "200": + description: Safekeeper status + content: + application/json: + schema: + $ref: "#/components/schemas/SafekeeperStatus" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/tenant/{tenant_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + + delete: + tags: + - "Tenant" + summary: Delete tenant and all its timelines + description: "Deletes tenant and returns a map of timelines that were deleted along with their statuses" + operationId: v1DeleteTenant + responses: + "200": + description: Tenant deleted + content: + application/json: + schema: + $ref: "#/components/schemas/TenantDeleteResult" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/tenant/{tenant_id}/timeline: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + + post: + tags: + - "Timeline" + summary: Register new timeline + description: "" + operationId: v1CreateTenantTimeline + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineCreateRequest" + responses: + "201": + description: Timeline created + # TODO: return timeline info? 
+ "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/tenant/{tenant_id}/timeline/{timeline_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + + get: + tags: + - "Timeline" + summary: Get timeline information and status + description: "" + operationId: v1GetTenantTimeline + responses: + "200": + description: Timeline status + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineStatus" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + delete: + tags: + - "Timeline" + summary: Delete timeline + description: "" + operationId: v1DeleteTenantTimeline + responses: + "200": + description: Timeline deleted + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineDeleteResult" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/record_safekeeper_info/{tenant_id}/{timeline_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + + post: + tags: + - "Tests" + summary: Used only in tests to hand craft required data + description: "" + operationId: v1RecordSafekeeperInfo + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/SkTimelineInfo" + responses: + "200": + description: Timeline info posted + # TODO: return timeline info? + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + +components: + securitySchemes: + JWT: + type: http + scheme: bearer + bearerFormat: JWT + + + schemas: + + # + # Requests + # + + TimelineCreateRequest: + type: object + required: + - timeline_id + - peer_ids + properties: + timeline_id: + type: string + format: hex + peer_ids: + type: array + items: + type: integer + minimum: 0 + + SkTimelineInfo: + type: object + required: + - last_log_term + - flush_lsn + - commit_lsn + - backup_lsn + - remote_consistent_lsn + - peer_horizon_lsn + - safekeeper_connstr + properties: + last_log_term: + type: integer + minimum: 0 + flush_lsn: + type: string + commit_lsn: + type: string + backup_lsn: + type: string + remote_consistent_lsn: + type: string + peer_horizon_lsn: + type: string + safekeeper_connstr: + type: string + + # + # Responses + # + + SafekeeperStatus: + type: object + required: + - id + properties: + id: + type: integer + minimum: 0 # kind of unsigned integer + + TimelineStatus: + type: object + required: + - timeline_id + - tenant_id + properties: + timeline_id: + type: string + format: hex + tenant_id: + type: string + format: hex + acceptor_state: + $ref: '#/components/schemas/AcceptorStateStatus' + flush_lsn: + type: string + timeline_start_lsn: + type: string + local_start_lsn: + type: string + commit_lsn: + type: string + backup_lsn: + type: string + peer_horizon_lsn: + type: string + remote_consistent_lsn: + type: string + + AcceptorStateStatus: + type: object + required: + - term + - epoch + properties: + term: + type: integer + minimum: 0 # kind of unsigned integer + epoch: + type: integer + minimum: 0 # kind of unsigned integer + term_history: + type: array + items: + $ref: '#/components/schemas/TermSwitchEntry' + + TermSwitchEntry: + 
type: object + required: + - term + - lsn + properties: + term: + type: integer + minimum: 0 # kind of unsigned integer + lsn: + type: string + + TimelineDeleteResult: + type: object + required: + - dir_existed + - was_active + properties: + dir_existed: + type: boolean + was_active: + type: boolean + + TenantDeleteResult: + type: object + additionalProperties: + $ref: "#/components/schemas/TimelineDeleteResult" + example: + 57fd1b39f23704a63423de0a8435d85c: + dir_existed: true + was_active: false + 67fd1b39f23704a63423gb8435d85c33: + dir_existed: false + was_active: false + + # + # Errors + # + + GenericErrorContent: + type: object + properties: + msg: + type: string + + responses: + + # + # Errors + # + + GenericError: + description: Generic error response + content: + application/json: + schema: + $ref: "#/components/schemas/GenericErrorContent" + + ForbiddenError: + description: Forbidden error response + content: + application/json: + schema: + type: object + required: + - msg + properties: + msg: + type: string + + +security: + - JWT: [] diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 33581c6c31..13356c5921 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -126,7 +126,7 @@ async fn timeline_create_handler(mut request: Request) -> Result SafekeeperTimelineStatus: - res = self.get(f"http://localhost:{self.port}/v1/timeline/{tenant_id}/{timeline_id}") + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() resj = res.json() return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'], From 58b04438f0fad05e78e661e1643b6653af092dd7 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 27 Jul 2022 20:04:34 +0300 Subject: [PATCH 0562/1022] Tweak backoff numbers to avoid no wal connection threshold trigger --- pageserver/src/walreceiver/connection_manager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index f2aa7ce2cf..f2b1671eb4 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -230,8 +230,8 @@ async fn subscribe_for_timeline_updates( } } -const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 2.0; -const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 60.0; +const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; +const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) { if n == 0 { From aeb3f0ea07979070c504ca69f664d352dc9d5d0f Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 28 Jul 2022 14:38:37 +0300 Subject: [PATCH 0563/1022] Refactor test_race_conditions (#2162) Do not use python multiprocessing, make the test async --- test_runner/batch_others/test_wal_acceptor.py | 52 --------------- .../batch_others/test_wal_acceptor_async.py | 65 +++++++++++++++++++ 2 files changed, 65 insertions(+), 52 deletions(-) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 5014a7ad4e..74b2f2657f 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -203,58 +203,6 @@ def test_restarts(neon_env_builder: NeonEnvBuilder): assert cur.fetchone() == (500500, ) -# shut down random subset of acceptors, sleep, wake them up, rinse, repeat -def xmas_garland(acceptors, stop): - while not bool(stop.value): - victims = [] - for wa 
in acceptors: - if random.random() >= 0.5: - victims.append(wa) - for v in victims: - v.stop() - time.sleep(1) - for v in victims: - v.start() - time.sleep(1) - - -# value which gets unset on exit -@pytest.fixture -def stop_value(): - stop = Value('i', 0) - yield stop - stop.value = 1 - - -# do inserts while concurrently getting up/down subsets of acceptors -def test_race_conditions(neon_env_builder: NeonEnvBuilder, stop_value): - - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - env.neon_cli.create_branch('test_safekeepers_race_conditions') - pg = env.postgres.create_start('test_safekeepers_race_conditions') - - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - pg_conn = pg.connect() - cur = pg_conn.cursor() - - cur.execute('CREATE TABLE t(key int primary key, value text)') - - proc = Process(target=xmas_garland, args=(env.safekeepers, stop_value)) - proc.start() - - for i in range(1000): - cur.execute("INSERT INTO t values (%s, 'payload');", (i + 1, )) - - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (500500, ) - - stop_value.value = 1 - proc.join() - - # Test that safekeepers push their info to the broker and learn peer status from it def test_broker(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index bf7d8e3645..5c0cb56880 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -9,6 +9,7 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper from fixtures.log_helper import getLogger from fixtures.utils import lsn_from_hex, lsn_to_hex from typing import List, Optional +from dataclasses import dataclass log = getLogger('root.safekeeper_async') @@ -455,3 +456,67 @@ def test_unavailability(neon_env_builder: NeonEnvBuilder): pg = env.postgres.create_start('test_safekeepers_unavailability') asyncio.run(run_unavailability(env, pg)) + + +@dataclass +class RaceConditionTest: + iteration: int + is_stopped: bool + + +# shut down random subset of safekeeper, sleep, wake them up, rinse, repeat +async def xmas_garland(safekeepers: List[Safekeeper], data: RaceConditionTest): + while not data.is_stopped: + data.iteration += 1 + victims = [] + for sk in safekeepers: + if random.random() >= 0.5: + victims.append(sk) + log.info( + f'Iteration {data.iteration}: stopping {list(map(lambda sk: sk.id, victims))} safekeepers' + ) + for v in victims: + v.stop() + await asyncio.sleep(1) + for v in victims: + v.start() + log.info(f'Iteration {data.iteration} finished') + await asyncio.sleep(1) + + +async def run_race_conditions(env: NeonEnv, pg: Postgres): + conn = await pg.connect_async() + await conn.execute('CREATE TABLE t(key int primary key, value text)') + + data = RaceConditionTest(0, False) + bg_xmas = asyncio.create_task(xmas_garland(env.safekeepers, data)) + + n_iterations = 5 + expected_sum = 0 + i = 1 + + while data.iteration <= n_iterations: + await asyncio.sleep(0.005) + await conn.execute(f"INSERT INTO t values ({i}, 'payload')") + expected_sum += i + i += 1 + + log.info(f'Executed {i-1} queries') + + res = await conn.fetchval('SELECT sum(key) FROM t') + assert res == expected_sum + + data.is_stopped = True + await bg_xmas + + +# do inserts while concurrently getting up/down subsets of acceptors +def test_race_conditions(neon_env_builder: NeonEnvBuilder): + + 
neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch('test_safekeepers_race_conditions') + pg = env.postgres.create_start('test_safekeepers_race_conditions') + + asyncio.run(run_race_conditions(env, pg)) From 09ddd34b2af6850b207b91ba9eb4d42cce3d9edd Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 28 Jul 2022 15:44:02 +0300 Subject: [PATCH 0564/1022] Fix checkpoints race condition in safekeeper tests (#2175) We should wait for WAL to arrive to pageserver before calling CHECKPOINT --- test_runner/batch_others/test_wal_acceptor.py | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 74b2f2657f..844ef3ebe1 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -14,13 +14,43 @@ from contextlib import closing from dataclasses import dataclass, field from multiprocessing import Process, Value from pathlib import Path -from fixtures.neon_fixtures import PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, neon_binpath, PgProtocol +from fixtures.neon_fixtures import NeonPageserver, PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, neon_binpath, PgProtocol, wait_for_last_record_lsn, wait_for_upload from fixtures.utils import get_dir_size, lsn_to_hex, lsn_from_hex from fixtures.log_helper import log from typing import List, Optional, Any from uuid import uuid4 +def wait_lsn_force_checkpoint(tenant_id: str, + timeline_id: str, + pg: Postgres, + ps: NeonPageserver, + pageserver_conn_options={}): + lsn = lsn_from_hex(pg.safe_psql('SELECT pg_current_wal_flush_lsn()')[0][0]) + log.info(f"pg_current_wal_flush_lsn is {lsn_to_hex(lsn)}, waiting for it on pageserver") + + auth_token = None + if 'password' in pageserver_conn_options: + auth_token = pageserver_conn_options['password'] + + # wait for the pageserver to catch up + wait_for_last_record_lsn(ps.http_client(auth_token=auth_token), + uuid.UUID(hex=tenant_id), + uuid.UUID(hex=timeline_id), + lsn) + + # force checkpoint to advance remote_consistent_lsn + with closing(ps.connect(**pageserver_conn_options)) as psconn: + with psconn.cursor() as pscur: + pscur.execute(f"checkpoint {tenant_id} {timeline_id}") + + # ensure that remote_consistent_lsn is advanced + wait_for_upload(ps.http_client(auth_token=auth_token), + uuid.UUID(hex=tenant_id), + uuid.UUID(hex=timeline_id), + lsn) + + @dataclass class TimelineMetrics: timeline_id: str @@ -223,10 +253,10 @@ def test_broker(neon_env_builder: NeonEnvBuilder): log.info(f"statuses is {stat_before}") pg.safe_psql("INSERT INTO t SELECT generate_series(1,100), 'payload'") - # force checkpoint to advance remote_consistent_lsn - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor() as pscur: - pscur.execute(f"checkpoint {tenant_id} {timeline_id}") + + # force checkpoint in pageserver to advance remote_consistent_lsn + wait_lsn_force_checkpoint(tenant_id, timeline_id, pg, env.pageserver) + # and wait till remote_consistent_lsn propagates to all safekeepers started_at = time.time() while True: @@ -270,9 +300,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): pageserver_conn_options = {} if auth_enabled: pageserver_conn_options['password'] = env.auth_keys.generate_tenant_token(tenant_id) - with 
closing(env.pageserver.connect(**pageserver_conn_options)) as psconn: - with psconn.cursor() as pscur: - pscur.execute(f"checkpoint {tenant_id} {timeline_id}") + wait_lsn_force_checkpoint(tenant_id, timeline_id, pg, env.pageserver, pageserver_conn_options) # We will wait for first segment removal. Make sure they exist for starter. first_segments = [ From 14a027cce5fa9ec1220e2309f45b5c479e86ed8b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 28 Jul 2022 17:05:30 +0100 Subject: [PATCH 0565/1022] Makefile: get openssl prefix dynamically (#2179) --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 566f2ecb10..fc75e9fc5e 100644 --- a/Makefile +++ b/Makefile @@ -29,9 +29,11 @@ else endif # macOS with brew-installed openssl requires explicit paths +# It can be configured with OPENSSL_PREFIX variable UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - PG_CONFIGURE_OPTS += --with-includes=$(HOMEBREW_PREFIX)/opt/openssl/include --with-libraries=$(HOMEBREW_PREFIX)/opt/openssl/lib + OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) + PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib endif # Choose whether we should be silent or verbose From 6ace347175402786a06e794d2795b0f750dff8c2 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 28 Jul 2022 18:37:21 +0100 Subject: [PATCH 0566/1022] github/workflows: unpause stress env deployment (#2180) This reverts commit 4446791397e88156012e704381e329551b404c60. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5874aa9b5c..ba36c62156 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -517,7 +517,7 @@ jobs: if [[ "$GITHUB_REF_NAME" == "main" ]]; then STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}' NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}' - echo "::set-output name=include::[$STAGING]" + echo "::set-output name=include::[$STAGING, $NEON_STRESS]" elif [[ "$GITHUB_REF_NAME" == "release" ]]; then PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}' echo "::set-output name=include::[$PRODUCTION]" From 417d9e9db2e3ec45a1ec181e23f19144bea34e08 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Thu, 28 Jul 2022 13:59:20 -0400 Subject: [PATCH 0567/1022] Add current physical size to tenant status endpoint (#2173) Ref #1902 --- pageserver/src/http/openapi_spec.yml | 2 ++ pageserver/src/http/routes.rs | 27 ++++++++++++-- pageserver/src/tenant_mgr.rs | 2 ++ .../batch_others/test_timeline_size.py | 36 ++++++++++++++++++- 4 files changed, 64 insertions(+), 3 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 46305a4855..ed190db43a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -587,6 +587,8 @@ components: type: string state: type: string + current_physical_size: + type: integer has_in_progress_downloads: type: boolean TenantCreateInfo: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8ac3faca7a..1582e8a2a4 100644 --- 
a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -438,14 +438,37 @@ async fn tenant_status(request: Request) -> Result, ApiErro let index_accessor = remote_index.read().await; let has_in_progress_downloads = index_accessor .tenant_entry(&tenant_id) - .ok_or_else(|| ApiError::NotFound("Tenant not found in remote index".to_string()))? - .has_in_progress_downloads(); + .map(|t| t.has_in_progress_downloads()) + .unwrap_or_else(|| { + info!("Tenant {tenant_id} not found in remote index"); + false + }); + + let current_physical_size = match tokio::task::spawn_blocking(move || { + crate::timelines::get_local_timelines(tenant_id, false, false) + }) + .await + .map_err(ApiError::from_err)? + { + Err(err) => { + // Getting local timelines can fail when no local repo is on disk (e.g, when tenant data is being downloaded). + // In that case, put a warning message into log and operate normally. + warn!("Failed to get local timelines for tenant {tenant_id}: {err}"); + None + } + Ok(local_timeline_infos) => Some( + local_timeline_infos + .into_iter() + .fold(0, |acc, x| acc + x.1.current_physical_size.unwrap()), + ), + }; json_response( StatusCode::OK, TenantInfo { id: tenant_id, state: tenant_state, + current_physical_size, has_in_progress_downloads: Some(has_in_progress_downloads), }, ) diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 640dfa623a..3f88ab1be2 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -508,6 +508,7 @@ pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] pub id: ZTenantId, pub state: Option, + pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub has_in_progress_downloads: Option, } @@ -526,6 +527,7 @@ pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec { TenantInfo { id: *id, state: Some(tenant.state), + current_physical_size: None, has_in_progress_downloads, } }) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index c3788a0e9b..f76529f1f7 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -1,5 +1,5 @@ from contextlib import closing -import pathlib +import random from uuid import UUID import re import psycopg2.extras @@ -298,6 +298,40 @@ def test_timeline_physical_size_metric(neon_simple_env: NeonEnv): assert tl_physical_size_metric == get_timeline_dir_size(timeline_path) +def test_tenant_physical_size(neon_simple_env: NeonEnv): + random.seed(100) + + env = neon_simple_env + client = env.pageserver.http_client() + + tenant, timeline = env.neon_cli.create_tenant() + + def get_timeline_physical_size(timeline: UUID): + res = client.timeline_detail(tenant, timeline) + return res['local']['current_physical_size_non_incremental'] + + timeline_total_size = get_timeline_physical_size(timeline) + for i in range(10): + n_rows = random.randint(100, 1000) + + timeline = env.neon_cli.create_branch(f"test_tenant_physical_size_{i}", tenant_id=tenant) + pg = env.postgres.create_start(f"test_tenant_physical_size_{i}", tenant_id=tenant) + + pg.safe_psql_many([ + "CREATE TABLE foo (t text)", + f"INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, {n_rows}) g", + ]) + + env.pageserver.safe_psql(f"checkpoint {tenant.hex} {timeline.hex}") + + timeline_total_size += get_timeline_physical_size(timeline) + + pg.stop() + + tenant_physical_size = 
int(client.tenant_status(tenant_id=tenant)['current_physical_size']) + assert tenant_physical_size == timeline_total_size + + def assert_physical_size(env: NeonEnv, tenant_id: UUID, timeline_id: UUID): """Check the current physical size returned from timeline API matches the total physical size of the timeline on disk""" From d903dd61bd243d37d086f50df19eae5d619b4b40 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 29 Jul 2022 09:09:22 +0300 Subject: [PATCH 0568/1022] Rename 'wal_producer_connstr' to 'wal_source_connstr'. What the WAL receiver really connects to is the safekeeper. The "producer" term is a bit misleading, as the safekeeper doesn't produce the WAL, the compute node does. This change also applies to the name of the field used in the mgmt API in in the response of the '/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver' endpoint. AFAICS that's not used anywhere else than one python test, so it should be OK to change it. --- pageserver/src/http/models.rs | 2 +- pageserver/src/http/openapi_spec.yml | 4 +-- pageserver/src/walreceiver.rs | 2 +- .../src/walreceiver/connection_manager.rs | 32 +++++++++---------- .../src/walreceiver/walreceiver_connection.rs | 12 +++---- .../batch_others/test_pageserver_api.py | 4 +-- 6 files changed, 28 insertions(+), 28 deletions(-) diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index c947cebcb6..dc9db3a62f 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -102,7 +102,7 @@ impl TenantConfigRequest { #[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] pub struct WalReceiverEntry { - pub wal_producer_connstr: Option, + pub wal_source_connstr: Option, #[serde_as(as = "Option")] pub last_received_msg_lsn: Option, /// the timestamp (in microseconds) of the last received message diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index ed190db43a..2109fcbe5b 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -694,11 +694,11 @@ components: type: object required: - thread_id - - wal_producer_connstr + - wal_source_connstr properties: thread_id: type: integer - wal_producer_connstr: + wal_source_connstr: type: string last_received_msg_lsn: type: string diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index c36343db17..946230b0d3 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -326,7 +326,7 @@ async fn wal_receiver_main_thread_loop_step<'a>( WAL_RECEIVER_ENTRIES.write().await.insert( id, WalReceiverEntry { - wal_producer_connstr: None, + wal_source_connstr: None, last_received_msg_lsn: None, last_received_msg_ts: None, }, diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index f2b1671eb4..ae1c787517 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -168,7 +168,7 @@ async fn connection_manager_loop_step( walreceiver_state .change_connection( new_candidate.safekeeper_id, - new_candidate.wal_producer_connstr, + new_candidate.wal_source_connstr, ) .await } @@ -302,7 +302,7 @@ impl WalreceiverState { } /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. 
- async fn change_connection(&mut self, new_sk_id: NodeId, new_wal_producer_connstr: String) { + async fn change_connection(&mut self, new_sk_id: NodeId, new_wal_source_connstr: String) { if let Some(old_connection) = self.wal_connection.take() { old_connection.connection_task.shutdown().await } @@ -324,7 +324,7 @@ impl WalreceiverState { .await; super::walreceiver_connection::handle_walreceiver_connection( id, - &new_wal_producer_connstr, + &new_wal_source_connstr, events_sender.as_ref(), cancellation, connect_timeout, @@ -387,7 +387,7 @@ impl WalreceiverState { Some(existing_wal_connection) => { let connected_sk_node = existing_wal_connection.sk_id; - let (new_sk_id, new_safekeeper_etcd_data, new_wal_producer_connstr) = + let (new_sk_id, new_safekeeper_etcd_data, new_wal_source_connstr) = self.select_connection_candidate(Some(connected_sk_node))?; let now = Utc::now().naive_utc(); @@ -397,7 +397,7 @@ impl WalreceiverState { if latest_interaciton > self.lagging_wal_timeout { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, - wal_producer_connstr: new_wal_producer_connstr, + wal_source_connstr: new_wal_source_connstr, reason: ReconnectReason::NoWalTimeout { last_wal_interaction: Some( existing_wal_connection.latest_connection_update, @@ -423,7 +423,7 @@ impl WalreceiverState { return Some( NewWalConnectionCandidate { safekeeper_id: new_sk_id, - wal_producer_connstr: new_wal_producer_connstr, + wal_source_connstr: new_wal_source_connstr, reason: ReconnectReason::LaggingWal { current_lsn, new_lsn, threshold: self.max_lsn_wal_lag }, }); } @@ -434,18 +434,18 @@ impl WalreceiverState { None => { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, - wal_producer_connstr: new_wal_producer_connstr, + wal_source_connstr: new_wal_source_connstr, reason: ReconnectReason::NoEtcdDataForExistingConnection, }) } } } None => { - let (new_sk_id, _, new_wal_producer_connstr) = + let (new_sk_id, _, new_wal_source_connstr) = self.select_connection_candidate(None)?; return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, - wal_producer_connstr: new_wal_producer_connstr, + wal_source_connstr: new_wal_source_connstr, reason: ReconnectReason::NoExistingConnection, }); } @@ -546,7 +546,7 @@ impl WalreceiverState { #[derive(Debug, PartialEq, Eq)] struct NewWalConnectionCandidate { safekeeper_id: NodeId, - wal_producer_connstr: String, + wal_source_connstr: String, reason: ReconnectReason, } @@ -803,7 +803,7 @@ mod tests { "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold" ); assert!(only_candidate - .wal_producer_connstr + .wal_source_connstr .contains(DUMMY_SAFEKEEPER_CONNSTR)); let selected_lsn = 100_000; @@ -868,7 +868,7 @@ mod tests { "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold" ); assert!(biggest_wal_candidate - .wal_producer_connstr + .wal_source_connstr .contains(DUMMY_SAFEKEEPER_CONNSTR)); Ok(()) @@ -985,7 +985,7 @@ mod tests { "Should select new safekeeper due to missing etcd data, even if there's an existing connection with this safekeeper" ); assert!(only_candidate - .wal_producer_connstr + .wal_source_connstr .contains(DUMMY_SAFEKEEPER_CONNSTR)); Ok(()) @@ -1067,7 +1067,7 @@ mod tests { "Should select bigger WAL safekeeper if it starts to lag enough" ); assert!(over_threshcurrent_candidate - .wal_producer_connstr + .wal_source_connstr .contains("advanced by Lsn safekeeper")); Ok(()) @@ -1134,7 +1134,7 @@ mod tests { unexpected => 
panic!("Unexpected reason: {unexpected:?}"), } assert!(over_threshcurrent_candidate - .wal_producer_connstr + .wal_source_connstr .contains(DUMMY_SAFEKEEPER_CONNSTR)); Ok(()) @@ -1190,7 +1190,7 @@ mod tests { unexpected => panic!("Unexpected reason: {unexpected:?}"), } assert!(over_threshcurrent_candidate - .wal_producer_connstr + .wal_source_connstr .contains(DUMMY_SAFEKEEPER_CONNSTR)); Ok(()) diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index ca29c00771..8846e27256 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -29,18 +29,18 @@ use crate::{ use postgres_ffi::waldecoder::WalStreamDecoder; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; -/// Opens a conneciton to the given wal producer and streams the WAL, sending progress messages during streaming. +/// Open a connection to the given safekeeper and receive WAL, sending back progress +/// messages as we go. pub async fn handle_walreceiver_connection( id: ZTenantTimelineId, - wal_producer_connstr: &str, + wal_source_connstr: &str, events_sender: &watch::Sender>, mut cancellation: watch::Receiver<()>, connect_timeout: Duration, ) -> anyhow::Result<()> { // Connect to the database in replication mode. - info!("connecting to {wal_producer_connstr}"); - let connect_cfg = - format!("{wal_producer_connstr} application_name=pageserver replication=true"); + info!("connecting to {wal_source_connstr}"); + let connect_cfg = format!("{wal_source_connstr} application_name=pageserver replication=true"); let (mut replication_client, connection) = time::timeout( connect_timeout, @@ -237,7 +237,7 @@ pub async fn handle_walreceiver_connection( super::WAL_RECEIVER_ENTRIES.write().await.insert( id, WalReceiverEntry { - wal_producer_connstr: Some(wal_producer_connstr.to_owned()), + wal_source_connstr: Some(wal_source_connstr.to_owned()), last_received_msg_lsn: Some(last_lsn), last_received_msg_ts: Some( ts.duration_since(SystemTime::UNIX_EPOCH) diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 7f9cb9493d..a298f1d701 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -65,7 +65,7 @@ def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): empty_response = client.wal_receiver_get(tenant_id, timeline_id) - assert empty_response.get('wal_producer_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running' + assert empty_response.get('wal_source_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running' assert empty_response.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running' assert empty_response.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running' @@ -82,7 +82,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): # a successful `wal_receiver_get` response must contain the below fields assert list(res.keys()) == [ - "wal_producer_connstr", + "wal_source_connstr", "last_received_msg_lsn", "last_received_msg_ts", ] From 02afa2762c20009c34d9933c20e13d15634f3bee Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 29 Jul 2022 12:00:54 +0300 Subject: [PATCH 0569/1022] Move Tenant- and 
TimelineInfo structs to models.rs. They are part of the management API response structs. Let's try to concentrate everything that's part of the API in models.rs. --- control_plane/src/storage.rs | 6 +- neon_local/src/main.rs | 3 +- pageserver/src/http/models.rs | 55 ++++++++++ pageserver/src/http/routes.rs | 171 ++++++++++++++++++++++++----- pageserver/src/repository.rs | 9 -- pageserver/src/tenant_mgr.rs | 15 +-- pageserver/src/timelines.rs | 196 ++++------------------------------ 7 files changed, 226 insertions(+), 229 deletions(-) diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index f1eaa99904..13d64a79f0 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -12,9 +12,9 @@ use anyhow::{bail, Context}; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; -use pageserver::http::models::{TenantConfigRequest, TenantCreateRequest, TimelineCreateRequest}; -use pageserver::tenant_mgr::TenantInfo; -use pageserver::timelines::TimelineInfo; +use pageserver::http::models::{ + TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo, +}; use postgres::{Config, NoTls}; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index b29cc6978c..e6f5c6125d 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -9,6 +9,7 @@ use pageserver::config::defaults::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, }; +use pageserver::http::models::TimelineInfo; use safekeeper::defaults::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, @@ -25,8 +26,6 @@ use utils::{ zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; -use pageserver::timelines::TimelineInfo; - // Default id of a safekeeper node, if not specified on the command line. const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1); const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index dc9db3a62f..31c205b3a8 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -7,6 +7,10 @@ use utils::{ zid::{NodeId, ZTenantId, ZTimelineId}, }; +// These enums are used in the API response fields. 
+use crate::repository::LocalTimelineState; +use crate::tenant_mgr::TenantState; + #[serde_as] #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { @@ -108,3 +112,54 @@ pub struct WalReceiverEntry { /// the timestamp (in microseconds) of the last received message pub last_received_msg_ts: Option, } + +#[serde_as] +#[derive(Serialize, Deserialize, Clone)] +pub struct TenantInfo { + #[serde_as(as = "DisplayFromStr")] + pub id: ZTenantId, + pub state: Option, + pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint + pub has_in_progress_downloads: Option, +} + +#[serde_as] +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct LocalTimelineInfo { + #[serde_as(as = "Option")] + pub ancestor_timeline_id: Option, + #[serde_as(as = "Option")] + pub ancestor_lsn: Option, + #[serde_as(as = "DisplayFromStr")] + pub last_record_lsn: Lsn, + #[serde_as(as = "Option")] + pub prev_record_lsn: Option, + #[serde_as(as = "DisplayFromStr")] + pub latest_gc_cutoff_lsn: Lsn, + #[serde_as(as = "DisplayFromStr")] + pub disk_consistent_lsn: Lsn, + pub current_logical_size: Option, // is None when timeline is Unloaded + pub current_physical_size: Option, // is None when timeline is Unloaded + pub current_logical_size_non_incremental: Option, + pub current_physical_size_non_incremental: Option, + pub timeline_state: LocalTimelineState, +} + +#[serde_as] +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct RemoteTimelineInfo { + #[serde_as(as = "DisplayFromStr")] + pub remote_consistent_lsn: Lsn, + pub awaits_download: bool, +} + +#[serde_as] +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct TimelineInfo { + #[serde_as(as = "DisplayFromStr")] + pub tenant_id: ZTenantId, + #[serde_as(as = "DisplayFromStr")] + pub timeline_id: ZTimelineId, + pub local: Option, + pub remote: Option, +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1582e8a2a4..9d284405ec 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -6,16 +6,19 @@ use hyper::{Body, Request, Response, Uri}; use remote_storage::GenericRemoteStorage; use tracing::*; +use super::models::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; use super::models::{ - StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, + StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, }; -use crate::repository::Repository; +use crate::layered_repository::metadata::TimelineMetadata; +use crate::pgdatadir_mapping::DatadirTimeline; +use crate::repository::{LocalTimelineState, RepositoryTimeline}; +use crate::repository::{Repository, Timeline}; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant_config::TenantConfOpt; -use crate::tenant_mgr::TenantInfo; -use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; +use crate::TimelineImpl; use crate::{config::PageServerConf, tenant_mgr, timelines}; use utils::{ auth::JwtAuth, @@ -26,6 +29,7 @@ use utils::{ request::parse_request_param, RequestExt, RouterBuilder, }, + lsn::Lsn, zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; @@ -79,6 +83,104 @@ fn get_config(request: &Request) -> &'static PageServerConf { get_state(request).conf } +// Helper functions to construct a LocalTimelineInfo struct for a timeline + +fn local_timeline_info_from_loaded_timeline( + timeline: &TimelineImpl, + include_non_incremental_logical_size: bool, + 
include_non_incremental_physical_size: bool, +) -> anyhow::Result { + let last_record_lsn = timeline.get_last_record_lsn(); + let info = LocalTimelineInfo { + ancestor_timeline_id: timeline.get_ancestor_timeline_id(), + ancestor_lsn: { + match timeline.get_ancestor_lsn() { + Lsn(0) => None, + lsn @ Lsn(_) => Some(lsn), + } + }, + disk_consistent_lsn: timeline.get_disk_consistent_lsn(), + last_record_lsn, + prev_record_lsn: Some(timeline.get_prev_record_lsn()), + latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), + timeline_state: LocalTimelineState::Loaded, + current_logical_size: Some(timeline.get_current_logical_size()), + current_physical_size: Some(timeline.get_physical_size()), + current_logical_size_non_incremental: if include_non_incremental_logical_size { + Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?) + } else { + None + }, + current_physical_size_non_incremental: if include_non_incremental_physical_size { + Some(timeline.get_physical_size_non_incremental()?) + } else { + None + }, + }; + Ok(info) +} + +fn local_timeline_info_from_unloaded_timeline(metadata: &TimelineMetadata) -> LocalTimelineInfo { + LocalTimelineInfo { + ancestor_timeline_id: metadata.ancestor_timeline(), + ancestor_lsn: { + match metadata.ancestor_lsn() { + Lsn(0) => None, + lsn @ Lsn(_) => Some(lsn), + } + }, + disk_consistent_lsn: metadata.disk_consistent_lsn(), + last_record_lsn: metadata.disk_consistent_lsn(), + prev_record_lsn: metadata.prev_record_lsn(), + latest_gc_cutoff_lsn: metadata.latest_gc_cutoff_lsn(), + timeline_state: LocalTimelineState::Unloaded, + current_logical_size: None, + current_physical_size: None, + current_logical_size_non_incremental: None, + current_physical_size_non_incremental: None, + } +} + +fn local_timeline_info_from_repo_timeline( + repo_timeline: &RepositoryTimeline, + include_non_incremental_logical_size: bool, + include_non_incremental_physical_size: bool, +) -> anyhow::Result { + match repo_timeline { + RepositoryTimeline::Loaded(timeline) => local_timeline_info_from_loaded_timeline( + &*timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ), + RepositoryTimeline::Unloaded { metadata } => { + Ok(local_timeline_info_from_unloaded_timeline(metadata)) + } + } +} + +fn list_local_timelines( + tenant_id: ZTenantId, + include_non_incremental_logical_size: bool, + include_non_incremental_physical_size: bool, +) -> Result> { + let repo = tenant_mgr::get_repository_for_tenant(tenant_id) + .with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?; + let repo_timelines = repo.list_timelines(); + + let mut local_timeline_info = Vec::with_capacity(repo_timelines.len()); + for (timeline_id, repository_timeline) in repo_timelines { + local_timeline_info.push(( + timeline_id, + local_timeline_info_from_repo_timeline( + &repository_timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + )?, + )) + } + Ok(local_timeline_info) +} + // healthcheck handler async fn status_handler(request: Request) -> Result, ApiError> { let config = get_config(&request); @@ -93,16 +195,30 @@ async fn timeline_create_handler(mut request: Request) -> Result { + // Created. Construct a TimelineInfo for it. 
+ let local_info = local_timeline_info_from_loaded_timeline(new_timeline.as_ref(), false, false)?; + Ok(Some(TimelineInfo { + tenant_id, + timeline_id: new_timeline_id, + local: Some(local_info), + remote: None, + })) + } + Ok(None) => Ok(None), // timeline already exists + Err(err) => Err(err), + } }) .await - .map_err(ApiError::from_err)??; + .map_err(ApiError::from_err)??; Ok(match new_timeline_info { Some(info) => json_response(StatusCode::CREATED, info)?, @@ -119,7 +235,7 @@ async fn timeline_list_handler(request: Request) -> Result, query_param_present(&request, "include-non-incremental-physical-size"); let local_timeline_infos = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); - crate::timelines::get_local_timelines( + list_local_timelines( tenant_id, include_non_incremental_logical_size, include_non_incremental_physical_size, @@ -184,9 +300,7 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result, ApiErro false }); - let current_physical_size = match tokio::task::spawn_blocking(move || { - crate::timelines::get_local_timelines(tenant_id, false, false) - }) - .await - .map_err(ApiError::from_err)? - { - Err(err) => { - // Getting local timelines can fail when no local repo is on disk (e.g, when tenant data is being downloaded). - // In that case, put a warning message into log and operate normally. - warn!("Failed to get local timelines for tenant {tenant_id}: {err}"); - None - } - Ok(local_timeline_infos) => Some( - local_timeline_infos - .into_iter() - .fold(0, |acc, x| acc + x.1.current_physical_size.unwrap()), - ), - }; + let current_physical_size = + match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false)) + .await + .map_err(ApiError::from_err)? + { + Err(err) => { + // Getting local timelines can fail when no local repo is on disk (e.g, when tenant data is being downloaded). + // In that case, put a warning message into log and operate normally. + warn!("Failed to get local timelines for tenant {tenant_id}: {err}"); + None + } + Ok(local_timeline_infos) => Some( + local_timeline_infos + .into_iter() + .fold(0, |acc, x| acc + x.1.current_physical_size.unwrap()), + ), + }; json_response( StatusCode::OK, diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 0ca8c6150c..6467231e08 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -277,15 +277,6 @@ pub enum LocalTimelineState { Unloaded, } -impl<'a, T> From<&'a RepositoryTimeline> for LocalTimelineState { - fn from(local_timeline_entry: &'a RepositoryTimeline) -> Self { - match local_timeline_entry { - RepositoryTimeline::Loaded(_) => LocalTimelineState::Loaded, - RepositoryTimeline::Unloaded { .. } => LocalTimelineState::Unloaded, - } - } -} - /// /// Result of performing GC /// diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 3f88ab1be2..dfdbc4c318 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -2,6 +2,7 @@ //! page server. 
use crate::config::PageServerConf; +use crate::http::models::TenantInfo; use crate::layered_repository::{load_metadata, LayeredRepository}; use crate::repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; @@ -14,7 +15,6 @@ use crate::{thread_mgr, timelines, walreceiver}; use crate::{RepositoryImpl, TimelineImpl}; use anyhow::Context; use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr}; use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::fmt; @@ -502,16 +502,9 @@ fn load_local_timeline( Ok(inmem_timeline) } -#[serde_as] -#[derive(Serialize, Deserialize, Clone)] -pub struct TenantInfo { - #[serde_as(as = "DisplayFromStr")] - pub id: ZTenantId, - pub state: Option, - pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint - pub has_in_progress_downloads: Option, -} - +/// +/// Get list of tenants, for the mgmt API +/// pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec { tenants_state::read_tenants() .iter() diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 1088e516aa..42cb6cb156 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -4,8 +4,6 @@ use anyhow::{bail, ensure, Context, Result}; use postgres_ffi::ControlFileData; -use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr}; use std::{ fs, path::Path, @@ -20,138 +18,15 @@ use utils::{ zid::{ZTenantId, ZTimelineId}, }; +use crate::tenant_mgr; use crate::{ - config::PageServerConf, - layered_repository::metadata::TimelineMetadata, - repository::{LocalTimelineState, Repository}, - storage_sync::index::RemoteIndex, - tenant_config::TenantConfOpt, - DatadirTimeline, RepositoryImpl, TimelineImpl, + config::PageServerConf, repository::Repository, storage_sync::index::RemoteIndex, + tenant_config::TenantConfOpt, RepositoryImpl, TimelineImpl, }; use crate::{import_datadir, LOG_FILE_NAME}; use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager}; -use crate::{repository::RepositoryTimeline, tenant_mgr}; use crate::{repository::Timeline, CheckpointConfig}; -#[serde_as] -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct LocalTimelineInfo { - #[serde_as(as = "Option")] - pub ancestor_timeline_id: Option, - #[serde_as(as = "Option")] - pub ancestor_lsn: Option, - #[serde_as(as = "DisplayFromStr")] - pub last_record_lsn: Lsn, - #[serde_as(as = "Option")] - pub prev_record_lsn: Option, - #[serde_as(as = "DisplayFromStr")] - pub latest_gc_cutoff_lsn: Lsn, - #[serde_as(as = "DisplayFromStr")] - pub disk_consistent_lsn: Lsn, - pub current_logical_size: Option, // is None when timeline is Unloaded - pub current_physical_size: Option, // is None when timeline is Unloaded - pub current_logical_size_non_incremental: Option, - pub current_physical_size_non_incremental: Option, - pub timeline_state: LocalTimelineState, -} - -impl LocalTimelineInfo { - pub fn from_loaded_timeline( - timeline: &TimelineImpl, - include_non_incremental_logical_size: bool, - include_non_incremental_physical_size: bool, - ) -> anyhow::Result { - let last_record_lsn = timeline.get_last_record_lsn(); - let info = LocalTimelineInfo { - ancestor_timeline_id: timeline.get_ancestor_timeline_id(), - ancestor_lsn: { - match timeline.get_ancestor_lsn() { - Lsn(0) => None, - lsn @ Lsn(_) => Some(lsn), - } - }, - disk_consistent_lsn: timeline.get_disk_consistent_lsn(), - last_record_lsn, - prev_record_lsn: 
Some(timeline.get_prev_record_lsn()), - latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), - timeline_state: LocalTimelineState::Loaded, - current_physical_size: Some(timeline.get_physical_size()), - current_logical_size: Some(timeline.get_current_logical_size()), - current_logical_size_non_incremental: if include_non_incremental_logical_size { - Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?) - } else { - None - }, - current_physical_size_non_incremental: if include_non_incremental_physical_size { - Some(timeline.get_physical_size_non_incremental()?) - } else { - None - }, - }; - Ok(info) - } - - pub fn from_unloaded_timeline(metadata: &TimelineMetadata) -> Self { - LocalTimelineInfo { - ancestor_timeline_id: metadata.ancestor_timeline(), - ancestor_lsn: { - match metadata.ancestor_lsn() { - Lsn(0) => None, - lsn @ Lsn(_) => Some(lsn), - } - }, - disk_consistent_lsn: metadata.disk_consistent_lsn(), - last_record_lsn: metadata.disk_consistent_lsn(), - prev_record_lsn: metadata.prev_record_lsn(), - latest_gc_cutoff_lsn: metadata.latest_gc_cutoff_lsn(), - timeline_state: LocalTimelineState::Unloaded, - current_logical_size: None, - current_physical_size: None, - current_logical_size_non_incremental: None, - current_physical_size_non_incremental: None, - } - } - - pub fn from_repo_timeline( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - repo_timeline: &RepositoryTimeline, - include_non_incremental_logical_size: bool, - include_non_incremental_physical_size: bool, - ) -> anyhow::Result { - match repo_timeline { - RepositoryTimeline::Loaded(_) => { - let timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id)?; - Self::from_loaded_timeline( - &*timeline, - include_non_incremental_logical_size, - include_non_incremental_physical_size, - ) - } - RepositoryTimeline::Unloaded { metadata } => Ok(Self::from_unloaded_timeline(metadata)), - } - } -} - -#[serde_as] -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct RemoteTimelineInfo { - #[serde_as(as = "DisplayFromStr")] - pub remote_consistent_lsn: Lsn, - pub awaits_download: bool, -} - -#[serde_as] -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct TimelineInfo { - #[serde_as(as = "DisplayFromStr")] - pub tenant_id: ZTenantId, - #[serde_as(as = "DisplayFromStr")] - pub timeline_id: ZTimelineId, - pub local: Option, - pub remote: Option, -} - #[derive(Debug, Clone, Copy)] pub struct PointInTime { pub timeline_id: ZTimelineId, @@ -333,38 +208,22 @@ fn bootstrap_timeline( Ok(()) } -pub(crate) fn get_local_timelines( - tenant_id: ZTenantId, - include_non_incremental_logical_size: bool, - include_non_incremental_physical_size: bool, -) -> Result> { - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?; - let repo_timelines = repo.list_timelines(); - - let mut local_timeline_info = Vec::with_capacity(repo_timelines.len()); - for (timeline_id, repository_timeline) in repo_timelines { - local_timeline_info.push(( - timeline_id, - LocalTimelineInfo::from_repo_timeline( - tenant_id, - timeline_id, - &repository_timeline, - include_non_incremental_logical_size, - include_non_incremental_physical_size, - )?, - )) - } - Ok(local_timeline_info) -} - +/// +/// Create a new timeline. +/// +/// Returns the new timeline ID and reference to its Timeline object. 
+/// +/// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with +/// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, +/// a new unique ID is generated. +/// pub(crate) fn create_timeline( conf: &'static PageServerConf, tenant_id: ZTenantId, new_timeline_id: Option, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, -) -> Result> { +) -> Result)>> { let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; @@ -373,7 +232,7 @@ pub(crate) fn create_timeline( return Ok(None); } - let new_timeline_info = match ancestor_timeline_id { + let _new_timeline = match ancestor_timeline_id { Some(ancestor_timeline_id) => { let ancestor_timeline = repo .get_timeline_load(ancestor_timeline_id) @@ -401,26 +260,13 @@ pub(crate) fn create_timeline( } } - repo.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?; - // load the timeline into memory - let loaded_timeline = - tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; - LocalTimelineInfo::from_loaded_timeline(&*loaded_timeline, false, false) - .context("cannot fill timeline info")? - } - None => { - bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?; - // load the timeline into memory - let new_timeline = - tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; - LocalTimelineInfo::from_loaded_timeline(&*new_timeline, false, false) - .context("cannot fill timeline info")? + repo.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? } + None => bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?, }; - Ok(Some(TimelineInfo { - tenant_id, - timeline_id: new_timeline_id, - local: Some(new_timeline_info), - remote: None, - })) + + // load the timeline into memory + let loaded_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; + + Ok(Some((new_timeline_id, loaded_timeline))) } From a0f76253f8fd3865d257c47c9ac4fb3a9e9c427a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 29 Jul 2022 14:32:29 +0300 Subject: [PATCH 0570/1022] Bump Postgres version. This brings in the inclusion of 'uuid-ossp' extension. --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 9c99008445..bc6dcc493c 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 9c99008445dbccd8204f188e0933def507058eac +Subproject commit bc6dcc493c977f3b06ad95abf493273a693b0e12 From d865892a06ea465d2e40118158e15cbaa6d35635 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 29 Jul 2022 14:07:14 +0300 Subject: [PATCH 0571/1022] Print full error with stacktrace, if compute node startup fails. It failed in staging environment a few times, and all we got in the logs was: ERROR could not start the compute node: failed to get basebackup@0/2D6194F8 from pageserver host=zenith-us-stage-ps-2.local port=6400 giving control plane 30s to collect the error before shutdown That's missing all the detail on *why* it failed. 
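As a rough illustration of why the format specifier matters (this sketch is not part of the change; the helper and error text are made up), anyhow's Display formatting prints only the outermost message, while Debug formatting prints the whole context chain and, with RUST_BACKTRACE=1, a backtrace:

    use anyhow::{Context, Result};
    use std::io;

    // Hypothetical stand-in for the basebackup request that failed in staging.
    fn get_basebackup() -> Result<()> {
        Err(io::Error::new(
            io::ErrorKind::ConnectionRefused,
            "connection refused",
        ))
        .context("failed to get basebackup from pageserver")
    }

    fn main() {
        if let Err(error) = get_basebackup() {
            // "{}" (Display): only the outermost message, as seen in the staging logs.
            println!("{}", error);
            // "{:?}" (Debug): the full error chain, roughly:
            //   failed to get basebackup from pageserver
            //   Caused by:
            //       connection refused
            println!("{:?}", error);
        }
    }

Switching the log line to "{:?}" therefore records the underlying cause as well, not just the top-level message.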
--- compute_tools/src/bin/compute_ctl.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index f535adfd87..fc5bbc5fd2 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -157,7 +157,7 @@ fn main() -> Result<()> { exit(code) } Err(error) => { - error!("could not start the compute node: {}", error); + error!("could not start the compute node: {:?}", error); let mut state = compute.state.write().unwrap(); state.error = Some(format!("{:?}", error)); From 9733b24f4ad4b47b2a997cd4272d338f3106a772 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Fri, 29 Jul 2022 14:30:18 +0200 Subject: [PATCH 0572/1022] Fix README.md: Fixed several typos and changed a bit documentation for OSX --- README.md | 57 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 6a4fc5ce1b..f4c86bd6a0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Neon -Neon is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes PostgreSQL storage layer by redistributing data across a cluster of nodes. +Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes. The project used to be called "Zenith". Many of the commands and code comments still refer to "zenith", but we are in the process of renaming things. @@ -12,32 +12,31 @@ Alternatively, compile and run the project [locally](#running-local-installation ## Architecture overview -A Neon installation consists of compute nodes and Neon storage engine. +A Neon installation consists of compute nodes and a Neon storage engine. -Compute nodes are stateless PostgreSQL nodes, backed by Neon storage engine. +Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine. -Neon storage engine consists of two major components: -- Pageserver. Scalable storage backend for compute nodes. -- WAL service. The service that receives WAL from compute node and ensures that it is stored durably. +The Neon storage engine consists of two major components: +- Pageserver. Scalable storage backend for the compute nodes. +- WAL service. The service receives WAL from the compute node and ensures that it is stored durably. Pageserver consists of: - Repository - Neon storage implementation. - WAL receiver - service that receives WAL from WAL service and stores it in the repository. - Page service - service that communicates with compute nodes and responds with pages from the repository. -- WAL redo - service that builds pages from base images and WAL records on Page service request. - +- WAL redo - service that builds pages from base images and WAL records on Page service request ## Running local installation #### Installing dependencies on Linux -1. Install build dependencies and other useful packages +1. 
Install build dependencies and other applicable packages -* On Ubuntu or Debian this set of packages should be sufficient to build the code: +* On Ubuntu or Debian, this set of packages should be sufficient to build the code: ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client ``` -* On Fedora these packages are needed: +* On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib @@ -69,7 +68,7 @@ brew install libpq brew link --force libpq ``` -#### Building on Linux and OSX +#### Building on Linux 1. Build neon and patched postgres ``` @@ -80,19 +79,35 @@ cd neon # The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. If you want to use a release -# build, utilize "`BUILD_TYPE=release make -j`nproc``" +# build, utilize "BUILD_TYPE=release make -j`nproc`" make -j`nproc` ``` -#### dependency installation notes +#### Building on OSX + +1. Build neon and patched postgres +``` +# Note: The path to the neon sources can not contain a space. + +git clone --recursive https://github.com/neondatabase/neon.git +cd neon + +# The preferred and default is to make a debug build. This will create a +# demonstrably slower build than a release build. If you want to use a release +# build, utilize "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" + +make -j`sysctl -n hw.logicalcpu` +``` + +#### Dependency installation notes To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively. To run the integration tests or Python scripts (not required to use the code), install -Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory. +Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry](https://python-poetry.org/)) in the project directory. -#### running neon database +#### Running neon database 1. Start pageserver and postgres on top of it (should be called from repo root): ```sh # Create repository in .neon with proper paths to binaries and data @@ -123,7 +138,7 @@ Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=pos main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running ``` -2. Now it is possible to connect to postgres and run some queries: +2. Now, it is possible to connect to postgres and run some queries: ```text > psql -p55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# CREATE TABLE t(key int primary key, value text); @@ -181,8 +196,8 @@ postgres=# select * from t; (1 row) ``` -4. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances - you have just started. You can stop them all with one command: +4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances + you have just started. 
You can terminate them all with one command: ```sh > ./target/debug/neon_local stop ``` @@ -205,8 +220,8 @@ To view your `rustdoc` documentation in a browser, try running `cargo doc --no-d ### Postgres-specific terms -Due to Neon's very close relation with PostgreSQL internals, there are numerous specific terms used. -Same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use. +Due to Neon's very close relation with PostgreSQL internals, numerous specific terms are used. +The same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use. To get more familiar with this aspect, refer to: From 2af5a96f0d7b4ffc5a93769cbbb11320dca7e1e0 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 29 Jul 2022 17:22:52 +0300 Subject: [PATCH 0573/1022] Back off when reenqueueing delete tasks --- pageserver/src/storage_sync.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index fe1ba4b5bb..c60d3dccc0 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -1120,7 +1120,7 @@ where .instrument(info_span!("download_timeline_data")), ); - if let Some(delete_data) = batch.delete { + if let Some(mut delete_data) = batch.delete { if upload_result.is_some() { match validate_task_retries(delete_data, max_sync_errors) .instrument(info_span!("retries_validation")) @@ -1153,6 +1153,7 @@ where } } } else { + delete_data.retries += 1; sync_queue.push(sync_id, SyncTask::Delete(delete_data)); warn!("Skipping delete task due to failed upload tasks, reenqueuing"); } From d0494c391aca02ce2be1992c2d2399ca2e495f4f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 29 Jul 2022 12:05:32 +0300 Subject: [PATCH 0574/1022] Remove wal_receiver mgmt API endpoint Move all the fields that were returned by the wal_receiver endpoint into timeline_detail. Internally, move those fields from the separate global WAL_RECEIVERS hash into the LayeredTimeline struct. That way, all the information about a timeline is kept in one place. In the passing, I noted that the 'thread_id' field was removed from WalReceiverEntry in commit e5cb727572, but it forgot to update openapi_spec.yml. This commit removes that too. --- pageserver/src/http/models.rs | 18 ++---- pageserver/src/http/openapi_spec.yml | 57 ------------------- pageserver/src/http/routes.rs | 40 +++++++------ pageserver/src/layered_repository.rs | 3 + pageserver/src/layered_repository/timeline.rs | 13 +++++ pageserver/src/walreceiver.rs | 43 ++------------ .../src/walreceiver/walreceiver_connection.rs | 27 ++++----- .../batch_others/test_pageserver_api.py | 36 +++++++----- test_runner/fixtures/neon_fixtures.py | 27 +++++---- 9 files changed, 94 insertions(+), 170 deletions(-) diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 31c205b3a8..aee31f14a7 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -101,18 +101,6 @@ impl TenantConfigRequest { } } -/// A WAL receiver's data stored inside the global `WAL_RECEIVERS`. -/// We keep one WAL receiver active per timeline. 
-#[serde_as] -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct WalReceiverEntry { - pub wal_source_connstr: Option, - #[serde_as(as = "Option")] - pub last_received_msg_lsn: Option, - /// the timestamp (in microseconds) of the last received message - pub last_received_msg_ts: Option, -} - #[serde_as] #[derive(Serialize, Deserialize, Clone)] pub struct TenantInfo { @@ -143,6 +131,12 @@ pub struct LocalTimelineInfo { pub current_logical_size_non_incremental: Option, pub current_physical_size_non_incremental: Option, pub timeline_state: LocalTimelineState, + + pub wal_source_connstr: Option, + #[serde_as(as = "Option")] + pub last_received_msg_lsn: Option, + /// the timestamp (in microseconds) of the last received message + pub last_received_msg_ts: Option, } #[serde_as] diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 2109fcbe5b..106c14fbc8 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -207,54 +207,6 @@ paths: schema: $ref: "#/components/schemas/Error" - /v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - format: hex - - name: timeline_id - in: path - required: true - schema: - type: string - format: hex - get: - description: Get wal receiver's data attached to the timeline - responses: - "200": - description: WalReceiverEntry - content: - application/json: - schema: - $ref: "#/components/schemas/WalReceiverEntry" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Error when no wal receiver is running or found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - /v1/tenant/{tenant_id}/attach: parameters: - name: tenant_id @@ -689,15 +641,6 @@ components: type: integer current_physical_size_non_incremental: type: integer - - WalReceiverEntry: - type: object - required: - - thread_id - - wal_source_connstr - properties: - thread_id: - type: integer wal_source_connstr: type: string last_received_msg_lsn: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9d284405ec..fa598de402 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -91,6 +91,19 @@ fn local_timeline_info_from_loaded_timeline( include_non_incremental_physical_size: bool, ) -> anyhow::Result { let last_record_lsn = timeline.get_last_record_lsn(); + let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { + let guard = timeline.last_received_wal.lock().unwrap(); + if let Some(info) = guard.as_ref() { + ( + Some(info.wal_source_connstr.clone()), + Some(info.last_received_msg_lsn), + Some(info.last_received_msg_ts), + ) + } else { + (None, None, None) + } + }; + let info = LocalTimelineInfo { ancestor_timeline_id: timeline.get_ancestor_timeline_id(), ancestor_lsn: { @@ -116,6 +129,9 @@ fn local_timeline_info_from_loaded_timeline( } else { None }, + wal_source_connstr, + last_received_msg_lsn, + last_received_msg_ts, }; Ok(info) } @@ -138,6 +154,9 @@ fn local_timeline_info_from_unloaded_timeline(metadata: &TimelineMetadata) -> Lo current_physical_size: None, 
current_logical_size_non_incremental: None, current_physical_size_non_incremental: None, + wal_source_connstr: None, + last_received_msg_lsn: None, + last_received_msg_ts: None, } } @@ -348,23 +367,6 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; - let wal_receiver_entry = crate::walreceiver::get_wal_receiver_entry(tenant_id, timeline_id) - .instrument(info_span!("wal_receiver_get", tenant = %tenant_id, timeline = %timeline_id)) - .await - .ok_or_else(|| { - ApiError::NotFound(format!( - "WAL receiver data not found for tenant {tenant_id} and timeline {timeline_id}" - )) - })?; - - json_response(StatusCode::OK, &wal_receiver_entry) -} - // TODO makes sense to provide tenant config right away the same way as it handled in tenant_create async fn tenant_attach_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; @@ -751,9 +753,5 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/detach", timeline_delete_handler, ) - .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver", - wal_receiver_get_handler, - ) .any(handler_404)) } diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index c500b05e66..79a180c4cf 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -67,6 +67,9 @@ pub use crate::layered_repository::ephemeral_file::writeback as writeback_epheme // re-export for use in storage_sync.rs pub use crate::layered_repository::timeline::save_metadata; +// re-export for use in walreceiver +pub use crate::layered_repository::timeline::WalReceiverInfo; + /// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 703e1993e5..6ed1efd3d1 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -290,6 +290,17 @@ pub struct LayeredTimeline { /// Current logical size of the "datadir", at the last LSN. current_logical_size: AtomicIsize, + + /// Information about the last processed message by the WAL receiver, + /// or None if WAL receiver has not received anything for this timeline + /// yet. 
+ pub last_received_wal: Mutex>, +} + +pub struct WalReceiverInfo { + pub wal_source_connstr: String, + pub last_received_msg_lsn: Lsn, + pub last_received_msg_ts: u128, } /// Inherit all the functions from DatadirTimeline, to provide the @@ -605,6 +616,8 @@ impl LayeredTimeline { current_logical_size: AtomicIsize::new(0), partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, + + last_received_wal: Mutex::new(None), }; result.repartition_threshold = result.get_checkpoint_distance() / 10; result diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 946230b0d3..43bb3fa971 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -26,7 +26,6 @@ mod walreceiver_connection; use anyhow::{ensure, Context}; use etcd_broker::Client; use itertools::Itertools; -use once_cell::sync::Lazy; use std::cell::Cell; use std::collections::{hash_map, HashMap, HashSet}; use std::future::Future; @@ -36,14 +35,13 @@ use std::thread_local; use std::time::Duration; use tokio::{ select, - sync::{mpsc, watch, RwLock}, + sync::{mpsc, watch}, task::JoinHandle, }; use tracing::*; use url::Url; use crate::config::PageServerConf; -use crate::http::models::WalReceiverEntry; use crate::tenant_mgr::{self, LocalTimelineUpdate, TenantState}; use crate::thread_mgr::{self, ThreadKind}; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; @@ -55,23 +53,6 @@ thread_local! { pub(crate) static IS_WAL_RECEIVER: Cell = Cell::new(false); } -/// WAL receiver state for sharing with the outside world. -/// Only entries for timelines currently available in pageserver are stored. -static WAL_RECEIVER_ENTRIES: Lazy>> = - Lazy::new(|| RwLock::new(HashMap::new())); - -/// Gets the public WAL streaming entry for a certain timeline. -pub async fn get_wal_receiver_entry( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, -) -> Option { - WAL_RECEIVER_ENTRIES - .read() - .await - .get(&ZTenantTimelineId::new(tenant_id, timeline_id)) - .cloned() -} - /// Sets up the main WAL receiver thread that manages the rest of the subtasks inside of it, per timeline. /// See comments in [`wal_receiver_main_thread_loop_step`] for more details on per timeline activities. pub fn init_wal_receiver_main_thread( @@ -281,13 +262,10 @@ async fn wal_receiver_main_thread_loop_step<'a>( } None => warn!("Timeline {id} does not have a tenant entry in wal receiver main thread"), }; - { - WAL_RECEIVER_ENTRIES.write().await.remove(&id); - if let Err(e) = join_confirmation_sender.send(()) { - warn!("cannot send wal_receiver shutdown confirmation {e}") - } else { - info!("confirm walreceiver shutdown for {id}"); - } + if let Err(e) = join_confirmation_sender.send(()) { + warn!("cannot send wal_receiver shutdown confirmation {e}") + } else { + info!("confirm walreceiver shutdown for {id}"); } } // Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly. 
@@ -322,17 +300,6 @@ async fn wal_receiver_main_thread_loop_step<'a>( } }; - { - WAL_RECEIVER_ENTRIES.write().await.insert( - id, - WalReceiverEntry { - wal_source_connstr: None, - last_received_msg_lsn: None, - last_received_msg_ts: None, - }, - ); - } - vacant_connection_manager_entry.insert( connection_manager::spawn_connection_manager_task( id, diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 8846e27256..fbd9ccd3c5 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -19,7 +19,7 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; use crate::{ - http::models::WalReceiverEntry, + layered_repository::WalReceiverInfo, pgdatadir_mapping::DatadirTimeline, repository::{Repository, Timeline}, tenant_mgr, @@ -232,21 +232,16 @@ pub async fn handle_walreceiver_connection( let apply_lsn = u64::from(timeline_remote_consistent_lsn); let ts = SystemTime::now(); - // Update the current WAL receiver's data stored inside the global hash table `WAL_RECEIVERS` - { - super::WAL_RECEIVER_ENTRIES.write().await.insert( - id, - WalReceiverEntry { - wal_source_connstr: Some(wal_source_connstr.to_owned()), - last_received_msg_lsn: Some(last_lsn), - last_received_msg_ts: Some( - ts.duration_since(SystemTime::UNIX_EPOCH) - .expect("Received message time should be before UNIX EPOCH!") - .as_micros(), - ), - }, - ); - } + // Update the status about what we just received. This is shown in the mgmt API. + let last_received_wal = WalReceiverInfo { + wal_source_connstr: wal_source_connstr.to_owned(), + last_received_msg_lsn: last_lsn, + last_received_msg_ts: ts + .duration_since(SystemTime::UNIX_EPOCH) + .expect("Received message time should be before UNIX EPOCH!") + .as_micros(), + }; + *timeline.last_received_wal.lock().unwrap() = Some(last_received_wal); // Send zenith feedback message. // Regular standby_status_update fields are put into this message. 
diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index a298f1d701..95791888a5 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -47,7 +47,8 @@ def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID): for timeline in timelines: timeline_id_str = str(timeline['timeline_id']) timeline_details = client.timeline_detail(tenant_id=tenant_id, - timeline_id=UUID(timeline_id_str)) + timeline_id=UUID(timeline_id_str), + include_non_incremental_logical_size=True) assert timeline_details['tenant_id'] == tenant_id.hex assert timeline_details['timeline_id'] == timeline_id_str @@ -63,13 +64,19 @@ def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): tenant_id, timeline_id = env.neon_cli.create_tenant() - empty_response = client.wal_receiver_get(tenant_id, timeline_id) + timeline_details = client.timeline_detail(tenant_id=tenant_id, + timeline_id=timeline_id, + include_non_incremental_logical_size=True) - assert empty_response.get('wal_source_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running' - assert empty_response.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running' - assert empty_response.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running' + assert timeline_details.get('wal_source_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running' + assert timeline_details.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running' + assert timeline_details.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running' +# Test the WAL-receiver related fields in the response to `timeline_details` API call +# +# These fields used to be returned by a separate API call, but they're part of +# `timeline_details` now. 
def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): env = neon_simple_env client = env.pageserver.http_client() @@ -78,18 +85,17 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) def expect_updated_msg_lsn(prev_msg_lsn: Optional[int]) -> int: - res = client.wal_receiver_get(tenant_id, timeline_id) + timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) - # a successful `wal_receiver_get` response must contain the below fields - assert list(res.keys()) == [ - "wal_source_connstr", - "last_received_msg_lsn", - "last_received_msg_ts", - ] + # a successful `timeline_details` response must contain the below fields + local_timeline_details = timeline_details['local'] + assert "wal_source_connstr" in local_timeline_details.keys() + assert "last_received_msg_lsn" in local_timeline_details.keys() + assert "last_received_msg_ts" in local_timeline_details.keys() - assert res["last_received_msg_lsn"] is not None, "the last received message's LSN is empty" + assert local_timeline_details["last_received_msg_lsn"] is not None, "the last received message's LSN is empty" - last_msg_lsn = lsn_from_hex(res["last_received_msg_lsn"]) + last_msg_lsn = lsn_from_hex(local_timeline_details["last_received_msg_lsn"]) assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, \ f"the last received message's LSN {last_msg_lsn} hasn't been updated \ compared to the previous message's LSN {prev_msg_lsn}" @@ -98,7 +104,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): # Wait to make sure that we get a latest WAL receiver data. # We need to wait here because it's possible that we don't have access to - # the latest WAL during the time the `wal_receiver_get` API is called. + # the latest WAL yet, when the `timeline_detail` API is first called. # See: https://github.com/neondatabase/neon/issues/1768. 
lsn = wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(None)) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 397b932ec9..8fa9e4a2ea 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -865,10 +865,24 @@ class NeonPageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: + def timeline_detail(self, + tenant_id: uuid.UUID, + timeline_id: uuid.UUID, + include_non_incremental_logical_size: bool = False, + include_non_incremental_physical_size: bool = False) -> Dict[Any, Any]: + + include_non_incremental_logical_size_str = "0" + if include_non_incremental_logical_size: + include_non_incremental_logical_size_str = "1" + + include_non_incremental_physical_size_str = "0" + if include_non_incremental_physical_size: + include_non_incremental_physical_size_str = "1" + res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}" + - "?include-non-incremental-logical-size=1&include-non-incremental-physical-size=1") + "?include-non-incremental-logical-size={include_non_incremental_logical_size_str}" + + "&include-non-incremental-physical-size={include_non_incremental_physical_size_str}") self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) @@ -882,15 +896,6 @@ class NeonPageserverHttpClient(requests.Session): assert res_json is None return res_json - def wal_receiver_get(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/wal_receiver" - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) From 539007c1732d30e58aeb869aa3b7aee64fabfa5e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 1 Aug 2022 12:54:39 +0100 Subject: [PATCH 0575/1022] github/workflows: make bash more strict (#2197) --- .github/actions/run-python-test-set/action.yml | 8 ++++---- .github/actions/save-coverage-data/action.yml | 2 +- .github/workflows/benchmarking.yml | 2 +- .github/workflows/build_and_test.yml | 2 +- .github/workflows/codestyle.yml | 2 +- .github/workflows/pg_clients.yml | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index a956929d92..6dc377a809 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -38,7 +38,7 @@ runs: path: ./neon-artifact/ - name: Extract Neon artifact - shell: bash -ex {0} + shell: bash -euxo pipefail {0} run: | mkdir -p /tmp/neon/ tar -xf ./neon-artifact/neon.tar.zst -C /tmp/neon/ @@ -59,7 +59,7 @@ runs: key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps - shell: bash -ex {0} + shell: bash -euxo pipefail {0} run: ./scripts/pysync - name: Run pytest @@ -70,7 +70,7 @@ runs: # this variable will be embedded in perf test report # and is needed to distinguish different environments PLATFORM: github-actions-selfhosted - shell: bash -ex {0} + shell: bash -euxo pipefail {0} run: | PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)" rm -rf $PERF_REPORT_DIR @@ -123,7 
+123,7 @@ runs: fi - name: Delete all data but logs - shell: bash -ex {0} + shell: bash -euxo pipefail {0} if: always() run: | du -sh /tmp/test_output/* diff --git a/.github/actions/save-coverage-data/action.yml b/.github/actions/save-coverage-data/action.yml index 7ad04cf1fe..bcfd7cb47e 100644 --- a/.github/actions/save-coverage-data/action.yml +++ b/.github/actions/save-coverage-data/action.yml @@ -5,7 +5,7 @@ runs: using: "composite" steps: - name: Merge coverage data - shell: bash -ex {0} + shell: bash -euxo pipefail {0} run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge - name: Upload coverage data diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index cfd54325eb..427441f330 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -60,7 +60,7 @@ jobs: - name: Setup cluster env: BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" - shell: bash + shell: bash -euxo pipefail {0} run: | set -e diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ba36c62156..312b4d1f46 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -9,7 +9,7 @@ on: defaults: run: - shell: bash -ex {0} + shell: bash -euxo pipefail {0} concurrency: # Allow only one workflow per any non-`main` branch. diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 8bcaa8f947..3acbeae9c2 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -8,7 +8,7 @@ on: defaults: run: - shell: bash -ex {0} + shell: bash -euxo pipefail {0} concurrency: # Allow only one workflow per any non-`main` branch. diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index 4ff31ac508..ba2d5cf666 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -40,7 +40,7 @@ jobs: key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps - shell: bash -ex {0} + shell: bash -euxo pipefail {0} run: ./scripts/pysync - name: Run pytest @@ -49,7 +49,7 @@ jobs: BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" TEST_OUTPUT: /tmp/test_output POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - shell: bash -ex {0} + shell: bash -euxo pipefail {0} run: | # Test framework expects we have psql binary; # but since we don't really need it in this test, let's mock it From e73b95a09d7d8d14280c577b9c6412b9530309a0 Mon Sep 17 00:00:00 2001 From: Ankur Srivastava Date: Sun, 31 Jul 2022 17:40:59 +0200 Subject: [PATCH 0576/1022] docs: linked poetry related step in tests section Added the link to the dependencies which should be installed before running the tests. --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index f4c86bd6a0..f557b19987 100644 --- a/README.md +++ b/README.md @@ -204,6 +204,8 @@ postgres=# select * from t; ## Running tests +Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes). + ```sh git clone --recursive https://github.com/neondatabase/neon.git make # builds also postgres and installs it to ./tmp_install From 092a9b74d3d060678c53c4077bf67331e11c5a8a Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 28 Jul 2022 19:46:26 +0300 Subject: [PATCH 0577/1022] use only s3 in boto3-stubs and update mypy Newer version of mypy fixes buggy error when trying to update only boto3 stubs. 
However it brings new checks and starts to yell when we index into cusror.fetchone without checking for None first. So this introduces a wrapper to simplify quering for scalar values. I tried to use cursor_factory connection argument but without success. There can be a better way to do that, but this looks the simplest --- poetry.lock | 1729 ++++++----------- pyproject.toml | 10 +- .../batch_others/test_ancestor_branch.py | 43 +- .../batch_others/test_branch_and_gc.py | 14 +- .../batch_others/test_branch_behind.py | 62 +- .../batch_others/test_broken_timeline.py | 23 +- .../batch_others/test_clog_truncate.py | 26 +- test_runner/batch_others/test_createdropdb.py | 64 +- test_runner/batch_others/test_createuser.py | 15 +- test_runner/batch_others/test_fullbackup.py | 24 +- .../batch_others/test_gc_aggressive.py | 32 +- test_runner/batch_others/test_lsn_mapping.py | 29 +- test_runner/batch_others/test_multixact.py | 17 +- .../batch_others/test_old_request_lsn.py | 6 +- .../batch_others/test_pageserver_restart.py | 1 + test_runner/batch_others/test_pitr_gc.py | 11 +- .../batch_others/test_read_validation.py | 27 +- .../batch_others/test_readonly_node.py | 9 +- .../batch_others/test_remote_storage.py | 26 +- .../test_tenants_with_remote_storage.py | 16 +- .../batch_others/test_timeline_size.py | 19 +- test_runner/batch_others/test_wal_acceptor.py | 35 +- .../batch_pg_regress/test_pg_regress.py | 1 - test_runner/fixtures/compare_fixtures.py | 1 + test_runner/fixtures/neon_fixtures.py | 18 +- test_runner/fixtures/utils.py | 16 + test_runner/performance/test_random_writes.py | 6 +- test_runner/performance/test_seqscans.py | 1 + 28 files changed, 871 insertions(+), 1410 deletions(-) diff --git a/poetry.lock b/poetry.lock index 4963390718..f55cfda000 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,6 +1,6 @@ [[package]] name = "aiopg" -version = "1.3.3" +version = "1.3.4" description = "Postgres integration with asyncio." category = "main" optional = false @@ -36,7 +36,7 @@ test = ["pycodestyle (>=2.7.0,<2.8.0)", "flake8 (>=3.9.2,<3.10.0)", "uvloop (>=0 [[package]] name = "atomicwrites" -version = "1.4.0" +version = "1.4.1" description = "Atomic file writes." 
category = "main" optional = false @@ -58,23 +58,22 @@ tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (> [[package]] name = "aws-sam-translator" -version = "1.42.0" +version = "1.48.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" category = "main" optional = false -python-versions = "*" +python-versions = ">=3.7, <=4.0, !=4.0" [package.dependencies] -boto3 = ">=1.5,<2.0" +boto3 = ">=1.19.5,<2.0.0" jsonschema = ">=3.2,<4.0" -six = ">=1.15,<2.0" [package.extras] -dev = ["coverage (>=5.3,<6.0)", "flake8 (>=3.8.4,<3.9.0)", "tox (>=3.20.1,<3.21.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pylint (>=1.7.2,<2.0)", "pyyaml (>=5.4,<6.0)", "mock (>=3.0.5,<4.0.0)", "parameterized (>=0.7.4,<0.8.0)", "click (>=7.1,<8.0)", "dateparser (>=0.7,<1.0)", "boto3 (>=1.17,<2.0)", "requests (>=2.24.0,<2.25.0)", "docopt (>=0.6.2,<0.7.0)", "pathlib2 (>=2.3.5)", "pytest (>=4.6.11,<4.7.0)", "pytest (>=6.1.1,<6.2.0)", "black (==20.8b1)"] +dev = ["coverage (>=5.3,<6.0)", "flake8 (>=3.8.4,<3.9.0)", "tox (>=3.24,<4.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-xdist (>=2.5,<3.0)", "pytest-env (>=0.6.2,<0.7.0)", "pylint (>=2.9.0,<2.10.0)", "pyyaml (>=5.4,<6.0)", "pytest (>=6.2.5,<6.3.0)", "parameterized (>=0.7.4,<0.8.0)", "click (>=7.1,<8.0)", "dateparser (>=0.7,<1.0)", "boto3 (>=1.23,<2)", "tenacity (>=7.0.0,<7.1.0)", "requests (>=2.24.0,<2.25.0)", "docopt (>=0.6.2,<0.7.0)", "black (==20.8b1)"] [[package]] name = "aws-xray-sdk" -version = "2.9.0" +version = "2.10.0" description = "The AWS X-Ray SDK for Python (the SDK) enables Python developers to record and emit information from within their applications to the AWS X-Ray service." category = "main" optional = false @@ -82,7 +81,6 @@ python-versions = "*" [package.dependencies] botocore = ">=1.11.3" -future = "*" wrapt = "*" [[package]] @@ -95,357 +93,374 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "boto3" -version = "1.20.40" +version = "1.24.38" description = "The AWS SDK for Python" category = "main" optional = false -python-versions = ">= 3.6" +python-versions = ">= 3.7" [package.dependencies] -botocore = ">=1.23.40,<1.24.0" -jmespath = ">=0.7.1,<1.0.0" -s3transfer = ">=0.5.0,<0.6.0" +botocore = ">=1.27.38,<1.28.0" +jmespath = ">=0.7.1,<2.0.0" +s3transfer = ">=0.6.0,<0.7.0" [package.extras] crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "boto3-stubs" -version = "1.20.40" -description = "Type annotations for boto3 1.20.40, generated by mypy-boto3-builder 6.3.2" +version = "1.24.38" +description = "Type annotations for boto3 1.24.38 generated with mypy-boto3-builder 7.10.1" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.dependencies] botocore-stubs = "*" +mypy-boto3-s3 = {version = ">=1.24.0,<1.25.0", optional = true, markers = "extra == \"s3\""} +types-s3transfer = "*" +typing-extensions = ">=4.1.0" [package.extras] -accessanalyzer = ["mypy-boto3-accessanalyzer (>=1.20.0)"] -account = ["mypy-boto3-account (>=1.20.0)"] -acm = ["mypy-boto3-acm (>=1.20.0)"] -acm-pca = ["mypy-boto3-acm-pca (>=1.20.0)"] -alexaforbusiness = ["mypy-boto3-alexaforbusiness (>=1.20.0)"] -all = ["mypy-boto3-accessanalyzer (>=1.20.0)", "mypy-boto3-account (>=1.20.0)", "mypy-boto3-acm (>=1.20.0)", "mypy-boto3-acm-pca (>=1.20.0)", "mypy-boto3-alexaforbusiness (>=1.20.0)", "mypy-boto3-amp (>=1.20.0)", "mypy-boto3-amplify (>=1.20.0)", "mypy-boto3-amplifybackend (>=1.20.0)", 
"mypy-boto3-amplifyuibuilder (>=1.20.0)", "mypy-boto3-apigateway (>=1.20.0)", "mypy-boto3-apigatewaymanagementapi (>=1.20.0)", "mypy-boto3-apigatewayv2 (>=1.20.0)", "mypy-boto3-appconfig (>=1.20.0)", "mypy-boto3-appconfigdata (>=1.20.0)", "mypy-boto3-appflow (>=1.20.0)", "mypy-boto3-appintegrations (>=1.20.0)", "mypy-boto3-application-autoscaling (>=1.20.0)", "mypy-boto3-application-insights (>=1.20.0)", "mypy-boto3-applicationcostprofiler (>=1.20.0)", "mypy-boto3-appmesh (>=1.20.0)", "mypy-boto3-apprunner (>=1.20.0)", "mypy-boto3-appstream (>=1.20.0)", "mypy-boto3-appsync (>=1.20.0)", "mypy-boto3-athena (>=1.20.0)", "mypy-boto3-auditmanager (>=1.20.0)", "mypy-boto3-autoscaling (>=1.20.0)", "mypy-boto3-autoscaling-plans (>=1.20.0)", "mypy-boto3-backup (>=1.20.0)", "mypy-boto3-backup-gateway (>=1.20.0)", "mypy-boto3-batch (>=1.20.0)", "mypy-boto3-braket (>=1.20.0)", "mypy-boto3-budgets (>=1.20.0)", "mypy-boto3-ce (>=1.20.0)", "mypy-boto3-chime (>=1.20.0)", "mypy-boto3-chime-sdk-identity (>=1.20.0)", "mypy-boto3-chime-sdk-meetings (>=1.20.0)", "mypy-boto3-chime-sdk-messaging (>=1.20.0)", "mypy-boto3-cloud9 (>=1.20.0)", "mypy-boto3-cloudcontrol (>=1.20.0)", "mypy-boto3-clouddirectory (>=1.20.0)", "mypy-boto3-cloudformation (>=1.20.0)", "mypy-boto3-cloudfront (>=1.20.0)", "mypy-boto3-cloudhsm (>=1.20.0)", "mypy-boto3-cloudhsmv2 (>=1.20.0)", "mypy-boto3-cloudsearch (>=1.20.0)", "mypy-boto3-cloudsearchdomain (>=1.20.0)", "mypy-boto3-cloudtrail (>=1.20.0)", "mypy-boto3-cloudwatch (>=1.20.0)", "mypy-boto3-codeartifact (>=1.20.0)", "mypy-boto3-codebuild (>=1.20.0)", "mypy-boto3-codecommit (>=1.20.0)", "mypy-boto3-codedeploy (>=1.20.0)", "mypy-boto3-codeguru-reviewer (>=1.20.0)", "mypy-boto3-codeguruprofiler (>=1.20.0)", "mypy-boto3-codepipeline (>=1.20.0)", "mypy-boto3-codestar (>=1.20.0)", "mypy-boto3-codestar-connections (>=1.20.0)", "mypy-boto3-codestar-notifications (>=1.20.0)", "mypy-boto3-cognito-identity (>=1.20.0)", "mypy-boto3-cognito-idp (>=1.20.0)", "mypy-boto3-cognito-sync (>=1.20.0)", "mypy-boto3-comprehend (>=1.20.0)", "mypy-boto3-comprehendmedical (>=1.20.0)", "mypy-boto3-compute-optimizer (>=1.20.0)", "mypy-boto3-config (>=1.20.0)", "mypy-boto3-connect (>=1.20.0)", "mypy-boto3-connect-contact-lens (>=1.20.0)", "mypy-boto3-connectparticipant (>=1.20.0)", "mypy-boto3-cur (>=1.20.0)", "mypy-boto3-customer-profiles (>=1.20.0)", "mypy-boto3-databrew (>=1.20.0)", "mypy-boto3-dataexchange (>=1.20.0)", "mypy-boto3-datapipeline (>=1.20.0)", "mypy-boto3-datasync (>=1.20.0)", "mypy-boto3-dax (>=1.20.0)", "mypy-boto3-detective (>=1.20.0)", "mypy-boto3-devicefarm (>=1.20.0)", "mypy-boto3-devops-guru (>=1.20.0)", "mypy-boto3-directconnect (>=1.20.0)", "mypy-boto3-discovery (>=1.20.0)", "mypy-boto3-dlm (>=1.20.0)", "mypy-boto3-dms (>=1.20.0)", "mypy-boto3-docdb (>=1.20.0)", "mypy-boto3-drs (>=1.20.0)", "mypy-boto3-ds (>=1.20.0)", "mypy-boto3-dynamodb (>=1.20.0)", "mypy-boto3-dynamodbstreams (>=1.20.0)", "mypy-boto3-ebs (>=1.20.0)", "mypy-boto3-ec2 (>=1.20.0)", "mypy-boto3-ec2-instance-connect (>=1.20.0)", "mypy-boto3-ecr (>=1.20.0)", "mypy-boto3-ecr-public (>=1.20.0)", "mypy-boto3-ecs (>=1.20.0)", "mypy-boto3-efs (>=1.20.0)", "mypy-boto3-eks (>=1.20.0)", "mypy-boto3-elastic-inference (>=1.20.0)", "mypy-boto3-elasticache (>=1.20.0)", "mypy-boto3-elasticbeanstalk (>=1.20.0)", "mypy-boto3-elastictranscoder (>=1.20.0)", "mypy-boto3-elb (>=1.20.0)", "mypy-boto3-elbv2 (>=1.20.0)", "mypy-boto3-emr (>=1.20.0)", "mypy-boto3-emr-containers (>=1.20.0)", "mypy-boto3-es (>=1.20.0)", "mypy-boto3-events 
(>=1.20.0)", "mypy-boto3-evidently (>=1.20.0)", "mypy-boto3-finspace (>=1.20.0)", "mypy-boto3-finspace-data (>=1.20.0)", "mypy-boto3-firehose (>=1.20.0)", "mypy-boto3-fis (>=1.20.0)", "mypy-boto3-fms (>=1.20.0)", "mypy-boto3-forecast (>=1.20.0)", "mypy-boto3-forecastquery (>=1.20.0)", "mypy-boto3-frauddetector (>=1.20.0)", "mypy-boto3-fsx (>=1.20.0)", "mypy-boto3-gamelift (>=1.20.0)", "mypy-boto3-glacier (>=1.20.0)", "mypy-boto3-globalaccelerator (>=1.20.0)", "mypy-boto3-glue (>=1.20.0)", "mypy-boto3-grafana (>=1.20.0)", "mypy-boto3-greengrass (>=1.20.0)", "mypy-boto3-greengrassv2 (>=1.20.0)", "mypy-boto3-groundstation (>=1.20.0)", "mypy-boto3-guardduty (>=1.20.0)", "mypy-boto3-health (>=1.20.0)", "mypy-boto3-healthlake (>=1.20.0)", "mypy-boto3-honeycode (>=1.20.0)", "mypy-boto3-iam (>=1.20.0)", "mypy-boto3-identitystore (>=1.20.0)", "mypy-boto3-imagebuilder (>=1.20.0)", "mypy-boto3-importexport (>=1.20.0)", "mypy-boto3-inspector (>=1.20.0)", "mypy-boto3-inspector2 (>=1.20.0)", "mypy-boto3-iot (>=1.20.0)", "mypy-boto3-iot-data (>=1.20.0)", "mypy-boto3-iot-jobs-data (>=1.20.0)", "mypy-boto3-iot1click-devices (>=1.20.0)", "mypy-boto3-iot1click-projects (>=1.20.0)", "mypy-boto3-iotanalytics (>=1.20.0)", "mypy-boto3-iotdeviceadvisor (>=1.20.0)", "mypy-boto3-iotevents (>=1.20.0)", "mypy-boto3-iotevents-data (>=1.20.0)", "mypy-boto3-iotfleethub (>=1.20.0)", "mypy-boto3-iotsecuretunneling (>=1.20.0)", "mypy-boto3-iotsitewise (>=1.20.0)", "mypy-boto3-iotthingsgraph (>=1.20.0)", "mypy-boto3-iottwinmaker (>=1.20.0)", "mypy-boto3-iotwireless (>=1.20.0)", "mypy-boto3-ivs (>=1.20.0)", "mypy-boto3-kafka (>=1.20.0)", "mypy-boto3-kafkaconnect (>=1.20.0)", "mypy-boto3-kendra (>=1.20.0)", "mypy-boto3-kinesis (>=1.20.0)", "mypy-boto3-kinesis-video-archived-media (>=1.20.0)", "mypy-boto3-kinesis-video-media (>=1.20.0)", "mypy-boto3-kinesis-video-signaling (>=1.20.0)", "mypy-boto3-kinesisanalytics (>=1.20.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.20.0)", "mypy-boto3-kinesisvideo (>=1.20.0)", "mypy-boto3-kms (>=1.20.0)", "mypy-boto3-lakeformation (>=1.20.0)", "mypy-boto3-lambda (>=1.20.0)", "mypy-boto3-lex-models (>=1.20.0)", "mypy-boto3-lex-runtime (>=1.20.0)", "mypy-boto3-lexv2-models (>=1.20.0)", "mypy-boto3-lexv2-runtime (>=1.20.0)", "mypy-boto3-license-manager (>=1.20.0)", "mypy-boto3-lightsail (>=1.20.0)", "mypy-boto3-location (>=1.20.0)", "mypy-boto3-logs (>=1.20.0)", "mypy-boto3-lookoutequipment (>=1.20.0)", "mypy-boto3-lookoutmetrics (>=1.20.0)", "mypy-boto3-lookoutvision (>=1.20.0)", "mypy-boto3-machinelearning (>=1.20.0)", "mypy-boto3-macie (>=1.20.0)", "mypy-boto3-macie2 (>=1.20.0)", "mypy-boto3-managedblockchain (>=1.20.0)", "mypy-boto3-marketplace-catalog (>=1.20.0)", "mypy-boto3-marketplace-entitlement (>=1.20.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.20.0)", "mypy-boto3-mediaconnect (>=1.20.0)", "mypy-boto3-mediaconvert (>=1.20.0)", "mypy-boto3-medialive (>=1.20.0)", "mypy-boto3-mediapackage (>=1.20.0)", "mypy-boto3-mediapackage-vod (>=1.20.0)", "mypy-boto3-mediastore (>=1.20.0)", "mypy-boto3-mediastore-data (>=1.20.0)", "mypy-boto3-mediatailor (>=1.20.0)", "mypy-boto3-memorydb (>=1.20.0)", "mypy-boto3-meteringmarketplace (>=1.20.0)", "mypy-boto3-mgh (>=1.20.0)", "mypy-boto3-mgn (>=1.20.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.20.0)", "mypy-boto3-migrationhub-config (>=1.20.0)", "mypy-boto3-migrationhubstrategy (>=1.20.0)", "mypy-boto3-mobile (>=1.20.0)", "mypy-boto3-mq (>=1.20.0)", "mypy-boto3-mturk (>=1.20.0)", "mypy-boto3-mwaa (>=1.20.0)", "mypy-boto3-neptune (>=1.20.0)", 
"mypy-boto3-network-firewall (>=1.20.0)", "mypy-boto3-networkmanager (>=1.20.0)", "mypy-boto3-nimble (>=1.20.0)", "mypy-boto3-opensearch (>=1.20.0)", "mypy-boto3-opsworks (>=1.20.0)", "mypy-boto3-opsworkscm (>=1.20.0)", "mypy-boto3-organizations (>=1.20.0)", "mypy-boto3-outposts (>=1.20.0)", "mypy-boto3-panorama (>=1.20.0)", "mypy-boto3-personalize (>=1.20.0)", "mypy-boto3-personalize-events (>=1.20.0)", "mypy-boto3-personalize-runtime (>=1.20.0)", "mypy-boto3-pi (>=1.20.0)", "mypy-boto3-pinpoint (>=1.20.0)", "mypy-boto3-pinpoint-email (>=1.20.0)", "mypy-boto3-pinpoint-sms-voice (>=1.20.0)", "mypy-boto3-polly (>=1.20.0)", "mypy-boto3-pricing (>=1.20.0)", "mypy-boto3-proton (>=1.20.0)", "mypy-boto3-qldb (>=1.20.0)", "mypy-boto3-qldb-session (>=1.20.0)", "mypy-boto3-quicksight (>=1.20.0)", "mypy-boto3-ram (>=1.20.0)", "mypy-boto3-rbin (>=1.20.0)", "mypy-boto3-rds (>=1.20.0)", "mypy-boto3-rds-data (>=1.20.0)", "mypy-boto3-redshift (>=1.20.0)", "mypy-boto3-redshift-data (>=1.20.0)", "mypy-boto3-rekognition (>=1.20.0)", "mypy-boto3-resiliencehub (>=1.20.0)", "mypy-boto3-resource-groups (>=1.20.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.20.0)", "mypy-boto3-robomaker (>=1.20.0)", "mypy-boto3-route53 (>=1.20.0)", "mypy-boto3-route53-recovery-cluster (>=1.20.0)", "mypy-boto3-route53-recovery-control-config (>=1.20.0)", "mypy-boto3-route53-recovery-readiness (>=1.20.0)", "mypy-boto3-route53domains (>=1.20.0)", "mypy-boto3-route53resolver (>=1.20.0)", "mypy-boto3-rum (>=1.20.0)", "mypy-boto3-s3 (>=1.20.0)", "mypy-boto3-s3control (>=1.20.0)", "mypy-boto3-s3outposts (>=1.20.0)", "mypy-boto3-sagemaker (>=1.20.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.20.0)", "mypy-boto3-sagemaker-edge (>=1.20.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.20.0)", "mypy-boto3-sagemaker-runtime (>=1.20.0)", "mypy-boto3-savingsplans (>=1.20.0)", "mypy-boto3-schemas (>=1.20.0)", "mypy-boto3-sdb (>=1.20.0)", "mypy-boto3-secretsmanager (>=1.20.0)", "mypy-boto3-securityhub (>=1.20.0)", "mypy-boto3-serverlessrepo (>=1.20.0)", "mypy-boto3-service-quotas (>=1.20.0)", "mypy-boto3-servicecatalog (>=1.20.0)", "mypy-boto3-servicecatalog-appregistry (>=1.20.0)", "mypy-boto3-servicediscovery (>=1.20.0)", "mypy-boto3-ses (>=1.20.0)", "mypy-boto3-sesv2 (>=1.20.0)", "mypy-boto3-shield (>=1.20.0)", "mypy-boto3-signer (>=1.20.0)", "mypy-boto3-sms (>=1.20.0)", "mypy-boto3-sms-voice (>=1.20.0)", "mypy-boto3-snow-device-management (>=1.20.0)", "mypy-boto3-snowball (>=1.20.0)", "mypy-boto3-sns (>=1.20.0)", "mypy-boto3-sqs (>=1.20.0)", "mypy-boto3-ssm (>=1.20.0)", "mypy-boto3-ssm-contacts (>=1.20.0)", "mypy-boto3-ssm-incidents (>=1.20.0)", "mypy-boto3-sso (>=1.20.0)", "mypy-boto3-sso-admin (>=1.20.0)", "mypy-boto3-sso-oidc (>=1.20.0)", "mypy-boto3-stepfunctions (>=1.20.0)", "mypy-boto3-storagegateway (>=1.20.0)", "mypy-boto3-sts (>=1.20.0)", "mypy-boto3-support (>=1.20.0)", "mypy-boto3-swf (>=1.20.0)", "mypy-boto3-synthetics (>=1.20.0)", "mypy-boto3-textract (>=1.20.0)", "mypy-boto3-timestream-query (>=1.20.0)", "mypy-boto3-timestream-write (>=1.20.0)", "mypy-boto3-transcribe (>=1.20.0)", "mypy-boto3-transfer (>=1.20.0)", "mypy-boto3-translate (>=1.20.0)", "mypy-boto3-voice-id (>=1.20.0)", "mypy-boto3-waf (>=1.20.0)", "mypy-boto3-waf-regional (>=1.20.0)", "mypy-boto3-wafv2 (>=1.20.0)", "mypy-boto3-wellarchitected (>=1.20.0)", "mypy-boto3-wisdom (>=1.20.0)", "mypy-boto3-workdocs (>=1.20.0)", "mypy-boto3-worklink (>=1.20.0)", "mypy-boto3-workmail (>=1.20.0)", "mypy-boto3-workmailmessageflow (>=1.20.0)", "mypy-boto3-workspaces 
(>=1.20.0)", "mypy-boto3-workspaces-web (>=1.20.0)", "mypy-boto3-xray (>=1.20.0)"] -amp = ["mypy-boto3-amp (>=1.20.0)"] -amplify = ["mypy-boto3-amplify (>=1.20.0)"] -amplifybackend = ["mypy-boto3-amplifybackend (>=1.20.0)"] -amplifyuibuilder = ["mypy-boto3-amplifyuibuilder (>=1.20.0)"] -apigateway = ["mypy-boto3-apigateway (>=1.20.0)"] -apigatewaymanagementapi = ["mypy-boto3-apigatewaymanagementapi (>=1.20.0)"] -apigatewayv2 = ["mypy-boto3-apigatewayv2 (>=1.20.0)"] -appconfig = ["mypy-boto3-appconfig (>=1.20.0)"] -appconfigdata = ["mypy-boto3-appconfigdata (>=1.20.0)"] -appflow = ["mypy-boto3-appflow (>=1.20.0)"] -appintegrations = ["mypy-boto3-appintegrations (>=1.20.0)"] -application-autoscaling = ["mypy-boto3-application-autoscaling (>=1.20.0)"] -application-insights = ["mypy-boto3-application-insights (>=1.20.0)"] -applicationcostprofiler = ["mypy-boto3-applicationcostprofiler (>=1.20.0)"] -appmesh = ["mypy-boto3-appmesh (>=1.20.0)"] -apprunner = ["mypy-boto3-apprunner (>=1.20.0)"] -appstream = ["mypy-boto3-appstream (>=1.20.0)"] -appsync = ["mypy-boto3-appsync (>=1.20.0)"] -athena = ["mypy-boto3-athena (>=1.20.0)"] -auditmanager = ["mypy-boto3-auditmanager (>=1.20.0)"] -autoscaling = ["mypy-boto3-autoscaling (>=1.20.0)"] -autoscaling-plans = ["mypy-boto3-autoscaling-plans (>=1.20.0)"] -backup = ["mypy-boto3-backup (>=1.20.0)"] -backup-gateway = ["mypy-boto3-backup-gateway (>=1.20.0)"] -batch = ["mypy-boto3-batch (>=1.20.0)"] -braket = ["mypy-boto3-braket (>=1.20.0)"] -budgets = ["mypy-boto3-budgets (>=1.20.0)"] -ce = ["mypy-boto3-ce (>=1.20.0)"] -chime = ["mypy-boto3-chime (>=1.20.0)"] -chime-sdk-identity = ["mypy-boto3-chime-sdk-identity (>=1.20.0)"] -chime-sdk-meetings = ["mypy-boto3-chime-sdk-meetings (>=1.20.0)"] -chime-sdk-messaging = ["mypy-boto3-chime-sdk-messaging (>=1.20.0)"] -cloud9 = ["mypy-boto3-cloud9 (>=1.20.0)"] -cloudcontrol = ["mypy-boto3-cloudcontrol (>=1.20.0)"] -clouddirectory = ["mypy-boto3-clouddirectory (>=1.20.0)"] -cloudformation = ["mypy-boto3-cloudformation (>=1.20.0)"] -cloudfront = ["mypy-boto3-cloudfront (>=1.20.0)"] -cloudhsm = ["mypy-boto3-cloudhsm (>=1.20.0)"] -cloudhsmv2 = ["mypy-boto3-cloudhsmv2 (>=1.20.0)"] -cloudsearch = ["mypy-boto3-cloudsearch (>=1.20.0)"] -cloudsearchdomain = ["mypy-boto3-cloudsearchdomain (>=1.20.0)"] -cloudtrail = ["mypy-boto3-cloudtrail (>=1.20.0)"] -cloudwatch = ["mypy-boto3-cloudwatch (>=1.20.0)"] -codeartifact = ["mypy-boto3-codeartifact (>=1.20.0)"] -codebuild = ["mypy-boto3-codebuild (>=1.20.0)"] -codecommit = ["mypy-boto3-codecommit (>=1.20.0)"] -codedeploy = ["mypy-boto3-codedeploy (>=1.20.0)"] -codeguru-reviewer = ["mypy-boto3-codeguru-reviewer (>=1.20.0)"] -codeguruprofiler = ["mypy-boto3-codeguruprofiler (>=1.20.0)"] -codepipeline = ["mypy-boto3-codepipeline (>=1.20.0)"] -codestar = ["mypy-boto3-codestar (>=1.20.0)"] -codestar-connections = ["mypy-boto3-codestar-connections (>=1.20.0)"] -codestar-notifications = ["mypy-boto3-codestar-notifications (>=1.20.0)"] -cognito-identity = ["mypy-boto3-cognito-identity (>=1.20.0)"] -cognito-idp = ["mypy-boto3-cognito-idp (>=1.20.0)"] -cognito-sync = ["mypy-boto3-cognito-sync (>=1.20.0)"] -comprehend = ["mypy-boto3-comprehend (>=1.20.0)"] -comprehendmedical = ["mypy-boto3-comprehendmedical (>=1.20.0)"] -compute-optimizer = ["mypy-boto3-compute-optimizer (>=1.20.0)"] -config = ["mypy-boto3-config (>=1.20.0)"] -connect = ["mypy-boto3-connect (>=1.20.0)"] -connect-contact-lens = ["mypy-boto3-connect-contact-lens (>=1.20.0)"] -connectparticipant = ["mypy-boto3-connectparticipant 
(>=1.20.0)"] -cur = ["mypy-boto3-cur (>=1.20.0)"] -customer-profiles = ["mypy-boto3-customer-profiles (>=1.20.0)"] -databrew = ["mypy-boto3-databrew (>=1.20.0)"] -dataexchange = ["mypy-boto3-dataexchange (>=1.20.0)"] -datapipeline = ["mypy-boto3-datapipeline (>=1.20.0)"] -datasync = ["mypy-boto3-datasync (>=1.20.0)"] -dax = ["mypy-boto3-dax (>=1.20.0)"] -detective = ["mypy-boto3-detective (>=1.20.0)"] -devicefarm = ["mypy-boto3-devicefarm (>=1.20.0)"] -devops-guru = ["mypy-boto3-devops-guru (>=1.20.0)"] -directconnect = ["mypy-boto3-directconnect (>=1.20.0)"] -discovery = ["mypy-boto3-discovery (>=1.20.0)"] -dlm = ["mypy-boto3-dlm (>=1.20.0)"] -dms = ["mypy-boto3-dms (>=1.20.0)"] -docdb = ["mypy-boto3-docdb (>=1.20.0)"] -drs = ["mypy-boto3-drs (>=1.20.0)"] -ds = ["mypy-boto3-ds (>=1.20.0)"] -dynamodb = ["mypy-boto3-dynamodb (>=1.20.0)"] -dynamodbstreams = ["mypy-boto3-dynamodbstreams (>=1.20.0)"] -ebs = ["mypy-boto3-ebs (>=1.20.0)"] -ec2 = ["mypy-boto3-ec2 (>=1.20.0)"] -ec2-instance-connect = ["mypy-boto3-ec2-instance-connect (>=1.20.0)"] -ecr = ["mypy-boto3-ecr (>=1.20.0)"] -ecr-public = ["mypy-boto3-ecr-public (>=1.20.0)"] -ecs = ["mypy-boto3-ecs (>=1.20.0)"] -efs = ["mypy-boto3-efs (>=1.20.0)"] -eks = ["mypy-boto3-eks (>=1.20.0)"] -elastic-inference = ["mypy-boto3-elastic-inference (>=1.20.0)"] -elasticache = ["mypy-boto3-elasticache (>=1.20.0)"] -elasticbeanstalk = ["mypy-boto3-elasticbeanstalk (>=1.20.0)"] -elastictranscoder = ["mypy-boto3-elastictranscoder (>=1.20.0)"] -elb = ["mypy-boto3-elb (>=1.20.0)"] -elbv2 = ["mypy-boto3-elbv2 (>=1.20.0)"] -emr = ["mypy-boto3-emr (>=1.20.0)"] -emr-containers = ["mypy-boto3-emr-containers (>=1.20.0)"] -es = ["mypy-boto3-es (>=1.20.0)"] -essential = ["mypy-boto3-cloudformation (>=1.20.0)", "mypy-boto3-dynamodb (>=1.20.0)", "mypy-boto3-ec2 (>=1.20.0)", "mypy-boto3-lambda (>=1.20.0)", "mypy-boto3-rds (>=1.20.0)", "mypy-boto3-s3 (>=1.20.0)", "mypy-boto3-sqs (>=1.20.0)"] -events = ["mypy-boto3-events (>=1.20.0)"] -evidently = ["mypy-boto3-evidently (>=1.20.0)"] -finspace = ["mypy-boto3-finspace (>=1.20.0)"] -finspace-data = ["mypy-boto3-finspace-data (>=1.20.0)"] -firehose = ["mypy-boto3-firehose (>=1.20.0)"] -fis = ["mypy-boto3-fis (>=1.20.0)"] -fms = ["mypy-boto3-fms (>=1.20.0)"] -forecast = ["mypy-boto3-forecast (>=1.20.0)"] -forecastquery = ["mypy-boto3-forecastquery (>=1.20.0)"] -frauddetector = ["mypy-boto3-frauddetector (>=1.20.0)"] -fsx = ["mypy-boto3-fsx (>=1.20.0)"] -gamelift = ["mypy-boto3-gamelift (>=1.20.0)"] -glacier = ["mypy-boto3-glacier (>=1.20.0)"] -globalaccelerator = ["mypy-boto3-globalaccelerator (>=1.20.0)"] -glue = ["mypy-boto3-glue (>=1.20.0)"] -grafana = ["mypy-boto3-grafana (>=1.20.0)"] -greengrass = ["mypy-boto3-greengrass (>=1.20.0)"] -greengrassv2 = ["mypy-boto3-greengrassv2 (>=1.20.0)"] -groundstation = ["mypy-boto3-groundstation (>=1.20.0)"] -guardduty = ["mypy-boto3-guardduty (>=1.20.0)"] -health = ["mypy-boto3-health (>=1.20.0)"] -healthlake = ["mypy-boto3-healthlake (>=1.20.0)"] -honeycode = ["mypy-boto3-honeycode (>=1.20.0)"] -iam = ["mypy-boto3-iam (>=1.20.0)"] -identitystore = ["mypy-boto3-identitystore (>=1.20.0)"] -imagebuilder = ["mypy-boto3-imagebuilder (>=1.20.0)"] -importexport = ["mypy-boto3-importexport (>=1.20.0)"] -inspector = ["mypy-boto3-inspector (>=1.20.0)"] -inspector2 = ["mypy-boto3-inspector2 (>=1.20.0)"] -iot = ["mypy-boto3-iot (>=1.20.0)"] -iot-data = ["mypy-boto3-iot-data (>=1.20.0)"] -iot-jobs-data = ["mypy-boto3-iot-jobs-data (>=1.20.0)"] -iot1click-devices = ["mypy-boto3-iot1click-devices 
(>=1.20.0)"] -iot1click-projects = ["mypy-boto3-iot1click-projects (>=1.20.0)"] -iotanalytics = ["mypy-boto3-iotanalytics (>=1.20.0)"] -iotdeviceadvisor = ["mypy-boto3-iotdeviceadvisor (>=1.20.0)"] -iotevents = ["mypy-boto3-iotevents (>=1.20.0)"] -iotevents-data = ["mypy-boto3-iotevents-data (>=1.20.0)"] -iotfleethub = ["mypy-boto3-iotfleethub (>=1.20.0)"] -iotsecuretunneling = ["mypy-boto3-iotsecuretunneling (>=1.20.0)"] -iotsitewise = ["mypy-boto3-iotsitewise (>=1.20.0)"] -iotthingsgraph = ["mypy-boto3-iotthingsgraph (>=1.20.0)"] -iottwinmaker = ["mypy-boto3-iottwinmaker (>=1.20.0)"] -iotwireless = ["mypy-boto3-iotwireless (>=1.20.0)"] -ivs = ["mypy-boto3-ivs (>=1.20.0)"] -kafka = ["mypy-boto3-kafka (>=1.20.0)"] -kafkaconnect = ["mypy-boto3-kafkaconnect (>=1.20.0)"] -kendra = ["mypy-boto3-kendra (>=1.20.0)"] -kinesis = ["mypy-boto3-kinesis (>=1.20.0)"] -kinesis-video-archived-media = ["mypy-boto3-kinesis-video-archived-media (>=1.20.0)"] -kinesis-video-media = ["mypy-boto3-kinesis-video-media (>=1.20.0)"] -kinesis-video-signaling = ["mypy-boto3-kinesis-video-signaling (>=1.20.0)"] -kinesisanalytics = ["mypy-boto3-kinesisanalytics (>=1.20.0)"] -kinesisanalyticsv2 = ["mypy-boto3-kinesisanalyticsv2 (>=1.20.0)"] -kinesisvideo = ["mypy-boto3-kinesisvideo (>=1.20.0)"] -kms = ["mypy-boto3-kms (>=1.20.0)"] -lakeformation = ["mypy-boto3-lakeformation (>=1.20.0)"] -lambda = ["mypy-boto3-lambda (>=1.20.0)"] -lex-models = ["mypy-boto3-lex-models (>=1.20.0)"] -lex-runtime = ["mypy-boto3-lex-runtime (>=1.20.0)"] -lexv2-models = ["mypy-boto3-lexv2-models (>=1.20.0)"] -lexv2-runtime = ["mypy-boto3-lexv2-runtime (>=1.20.0)"] -license-manager = ["mypy-boto3-license-manager (>=1.20.0)"] -lightsail = ["mypy-boto3-lightsail (>=1.20.0)"] -location = ["mypy-boto3-location (>=1.20.0)"] -logs = ["mypy-boto3-logs (>=1.20.0)"] -lookoutequipment = ["mypy-boto3-lookoutequipment (>=1.20.0)"] -lookoutmetrics = ["mypy-boto3-lookoutmetrics (>=1.20.0)"] -lookoutvision = ["mypy-boto3-lookoutvision (>=1.20.0)"] -machinelearning = ["mypy-boto3-machinelearning (>=1.20.0)"] -macie = ["mypy-boto3-macie (>=1.20.0)"] -macie2 = ["mypy-boto3-macie2 (>=1.20.0)"] -managedblockchain = ["mypy-boto3-managedblockchain (>=1.20.0)"] -marketplace-catalog = ["mypy-boto3-marketplace-catalog (>=1.20.0)"] -marketplace-entitlement = ["mypy-boto3-marketplace-entitlement (>=1.20.0)"] -marketplacecommerceanalytics = ["mypy-boto3-marketplacecommerceanalytics (>=1.20.0)"] -mediaconnect = ["mypy-boto3-mediaconnect (>=1.20.0)"] -mediaconvert = ["mypy-boto3-mediaconvert (>=1.20.0)"] -medialive = ["mypy-boto3-medialive (>=1.20.0)"] -mediapackage = ["mypy-boto3-mediapackage (>=1.20.0)"] -mediapackage-vod = ["mypy-boto3-mediapackage-vod (>=1.20.0)"] -mediastore = ["mypy-boto3-mediastore (>=1.20.0)"] -mediastore-data = ["mypy-boto3-mediastore-data (>=1.20.0)"] -mediatailor = ["mypy-boto3-mediatailor (>=1.20.0)"] -memorydb = ["mypy-boto3-memorydb (>=1.20.0)"] -meteringmarketplace = ["mypy-boto3-meteringmarketplace (>=1.20.0)"] -mgh = ["mypy-boto3-mgh (>=1.20.0)"] -mgn = ["mypy-boto3-mgn (>=1.20.0)"] -migration-hub-refactor-spaces = ["mypy-boto3-migration-hub-refactor-spaces (>=1.20.0)"] -migrationhub-config = ["mypy-boto3-migrationhub-config (>=1.20.0)"] -migrationhubstrategy = ["mypy-boto3-migrationhubstrategy (>=1.20.0)"] -mobile = ["mypy-boto3-mobile (>=1.20.0)"] -mq = ["mypy-boto3-mq (>=1.20.0)"] -mturk = ["mypy-boto3-mturk (>=1.20.0)"] -mwaa = ["mypy-boto3-mwaa (>=1.20.0)"] -neptune = ["mypy-boto3-neptune (>=1.20.0)"] -network-firewall = 
["mypy-boto3-network-firewall (>=1.20.0)"] -networkmanager = ["mypy-boto3-networkmanager (>=1.20.0)"] -nimble = ["mypy-boto3-nimble (>=1.20.0)"] -opensearch = ["mypy-boto3-opensearch (>=1.20.0)"] -opsworks = ["mypy-boto3-opsworks (>=1.20.0)"] -opsworkscm = ["mypy-boto3-opsworkscm (>=1.20.0)"] -organizations = ["mypy-boto3-organizations (>=1.20.0)"] -outposts = ["mypy-boto3-outposts (>=1.20.0)"] -panorama = ["mypy-boto3-panorama (>=1.20.0)"] -personalize = ["mypy-boto3-personalize (>=1.20.0)"] -personalize-events = ["mypy-boto3-personalize-events (>=1.20.0)"] -personalize-runtime = ["mypy-boto3-personalize-runtime (>=1.20.0)"] -pi = ["mypy-boto3-pi (>=1.20.0)"] -pinpoint = ["mypy-boto3-pinpoint (>=1.20.0)"] -pinpoint-email = ["mypy-boto3-pinpoint-email (>=1.20.0)"] -pinpoint-sms-voice = ["mypy-boto3-pinpoint-sms-voice (>=1.20.0)"] -polly = ["mypy-boto3-polly (>=1.20.0)"] -pricing = ["mypy-boto3-pricing (>=1.20.0)"] -proton = ["mypy-boto3-proton (>=1.20.0)"] -qldb = ["mypy-boto3-qldb (>=1.20.0)"] -qldb-session = ["mypy-boto3-qldb-session (>=1.20.0)"] -quicksight = ["mypy-boto3-quicksight (>=1.20.0)"] -ram = ["mypy-boto3-ram (>=1.20.0)"] -rbin = ["mypy-boto3-rbin (>=1.20.0)"] -rds = ["mypy-boto3-rds (>=1.20.0)"] -rds-data = ["mypy-boto3-rds-data (>=1.20.0)"] -redshift = ["mypy-boto3-redshift (>=1.20.0)"] -redshift-data = ["mypy-boto3-redshift-data (>=1.20.0)"] -rekognition = ["mypy-boto3-rekognition (>=1.20.0)"] -resiliencehub = ["mypy-boto3-resiliencehub (>=1.20.0)"] -resource-groups = ["mypy-boto3-resource-groups (>=1.20.0)"] -resourcegroupstaggingapi = ["mypy-boto3-resourcegroupstaggingapi (>=1.20.0)"] -robomaker = ["mypy-boto3-robomaker (>=1.20.0)"] -route53 = ["mypy-boto3-route53 (>=1.20.0)"] -route53-recovery-cluster = ["mypy-boto3-route53-recovery-cluster (>=1.20.0)"] -route53-recovery-control-config = ["mypy-boto3-route53-recovery-control-config (>=1.20.0)"] -route53-recovery-readiness = ["mypy-boto3-route53-recovery-readiness (>=1.20.0)"] -route53domains = ["mypy-boto3-route53domains (>=1.20.0)"] -route53resolver = ["mypy-boto3-route53resolver (>=1.20.0)"] -rum = ["mypy-boto3-rum (>=1.20.0)"] -s3 = ["mypy-boto3-s3 (>=1.20.0)"] -s3control = ["mypy-boto3-s3control (>=1.20.0)"] -s3outposts = ["mypy-boto3-s3outposts (>=1.20.0)"] -sagemaker = ["mypy-boto3-sagemaker (>=1.20.0)"] -sagemaker-a2i-runtime = ["mypy-boto3-sagemaker-a2i-runtime (>=1.20.0)"] -sagemaker-edge = ["mypy-boto3-sagemaker-edge (>=1.20.0)"] -sagemaker-featurestore-runtime = ["mypy-boto3-sagemaker-featurestore-runtime (>=1.20.0)"] -sagemaker-runtime = ["mypy-boto3-sagemaker-runtime (>=1.20.0)"] -savingsplans = ["mypy-boto3-savingsplans (>=1.20.0)"] -schemas = ["mypy-boto3-schemas (>=1.20.0)"] -sdb = ["mypy-boto3-sdb (>=1.20.0)"] -secretsmanager = ["mypy-boto3-secretsmanager (>=1.20.0)"] -securityhub = ["mypy-boto3-securityhub (>=1.20.0)"] -serverlessrepo = ["mypy-boto3-serverlessrepo (>=1.20.0)"] -service-quotas = ["mypy-boto3-service-quotas (>=1.20.0)"] -servicecatalog = ["mypy-boto3-servicecatalog (>=1.20.0)"] -servicecatalog-appregistry = ["mypy-boto3-servicecatalog-appregistry (>=1.20.0)"] -servicediscovery = ["mypy-boto3-servicediscovery (>=1.20.0)"] -ses = ["mypy-boto3-ses (>=1.20.0)"] -sesv2 = ["mypy-boto3-sesv2 (>=1.20.0)"] -shield = ["mypy-boto3-shield (>=1.20.0)"] -signer = ["mypy-boto3-signer (>=1.20.0)"] -sms = ["mypy-boto3-sms (>=1.20.0)"] -sms-voice = ["mypy-boto3-sms-voice (>=1.20.0)"] -snow-device-management = ["mypy-boto3-snow-device-management (>=1.20.0)"] -snowball = ["mypy-boto3-snowball (>=1.20.0)"] 
-sns = ["mypy-boto3-sns (>=1.20.0)"] -sqs = ["mypy-boto3-sqs (>=1.20.0)"] -ssm = ["mypy-boto3-ssm (>=1.20.0)"] -ssm-contacts = ["mypy-boto3-ssm-contacts (>=1.20.0)"] -ssm-incidents = ["mypy-boto3-ssm-incidents (>=1.20.0)"] -sso = ["mypy-boto3-sso (>=1.20.0)"] -sso-admin = ["mypy-boto3-sso-admin (>=1.20.0)"] -sso-oidc = ["mypy-boto3-sso-oidc (>=1.20.0)"] -stepfunctions = ["mypy-boto3-stepfunctions (>=1.20.0)"] -storagegateway = ["mypy-boto3-storagegateway (>=1.20.0)"] -sts = ["mypy-boto3-sts (>=1.20.0)"] -support = ["mypy-boto3-support (>=1.20.0)"] -swf = ["mypy-boto3-swf (>=1.20.0)"] -synthetics = ["mypy-boto3-synthetics (>=1.20.0)"] -textract = ["mypy-boto3-textract (>=1.20.0)"] -timestream-query = ["mypy-boto3-timestream-query (>=1.20.0)"] -timestream-write = ["mypy-boto3-timestream-write (>=1.20.0)"] -transcribe = ["mypy-boto3-transcribe (>=1.20.0)"] -transfer = ["mypy-boto3-transfer (>=1.20.0)"] -translate = ["mypy-boto3-translate (>=1.20.0)"] -voice-id = ["mypy-boto3-voice-id (>=1.20.0)"] -waf = ["mypy-boto3-waf (>=1.20.0)"] -waf-regional = ["mypy-boto3-waf-regional (>=1.20.0)"] -wafv2 = ["mypy-boto3-wafv2 (>=1.20.0)"] -wellarchitected = ["mypy-boto3-wellarchitected (>=1.20.0)"] -wisdom = ["mypy-boto3-wisdom (>=1.20.0)"] -workdocs = ["mypy-boto3-workdocs (>=1.20.0)"] -worklink = ["mypy-boto3-worklink (>=1.20.0)"] -workmail = ["mypy-boto3-workmail (>=1.20.0)"] -workmailmessageflow = ["mypy-boto3-workmailmessageflow (>=1.20.0)"] -workspaces = ["mypy-boto3-workspaces (>=1.20.0)"] -workspaces-web = ["mypy-boto3-workspaces-web (>=1.20.0)"] -xray = ["mypy-boto3-xray (>=1.20.0)"] +accessanalyzer = ["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)"] +account = ["mypy-boto3-account (>=1.24.0,<1.25.0)"] +acm = ["mypy-boto3-acm (>=1.24.0,<1.25.0)"] +acm-pca = ["mypy-boto3-acm-pca (>=1.24.0,<1.25.0)"] +alexaforbusiness = ["mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)"] +all = ["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)", "mypy-boto3-account (>=1.24.0,<1.25.0)", "mypy-boto3-acm (>=1.24.0,<1.25.0)", "mypy-boto3-acm-pca (>=1.24.0,<1.25.0)", "mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)", "mypy-boto3-amp (>=1.24.0,<1.25.0)", "mypy-boto3-amplify (>=1.24.0,<1.25.0)", "mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)", "mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)", "mypy-boto3-apigateway (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)", "mypy-boto3-appconfig (>=1.24.0,<1.25.0)", "mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)", "mypy-boto3-appflow (>=1.24.0,<1.25.0)", "mypy-boto3-appintegrations (>=1.24.0,<1.25.0)", "mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-application-insights (>=1.24.0,<1.25.0)", "mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-appmesh (>=1.24.0,<1.25.0)", "mypy-boto3-apprunner (>=1.24.0,<1.25.0)", "mypy-boto3-appstream (>=1.24.0,<1.25.0)", "mypy-boto3-appsync (>=1.24.0,<1.25.0)", "mypy-boto3-athena (>=1.24.0,<1.25.0)", "mypy-boto3-auditmanager (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)", "mypy-boto3-backup (>=1.24.0,<1.25.0)", "mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)", "mypy-boto3-batch (>=1.24.0,<1.25.0)", "mypy-boto3-billingconductor (>=1.24.0,<1.25.0)", "mypy-boto3-braket (>=1.24.0,<1.25.0)", "mypy-boto3-budgets (>=1.24.0,<1.25.0)", "mypy-boto3-ce (>=1.24.0,<1.25.0)", "mypy-boto3-chime (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)", 
"mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)", "mypy-boto3-cloud9 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)", "mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)", "mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-cloudfront (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)", "mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)", "mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)", "mypy-boto3-codeartifact (>=1.24.0,<1.25.0)", "mypy-boto3-codebuild (>=1.24.0,<1.25.0)", "mypy-boto3-codecommit (>=1.24.0,<1.25.0)", "mypy-boto3-codedeploy (>=1.24.0,<1.25.0)", "mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)", "mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-codepipeline (>=1.24.0,<1.25.0)", "mypy-boto3-codestar (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)", "mypy-boto3-comprehend (>=1.24.0,<1.25.0)", "mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)", "mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)", "mypy-boto3-config (>=1.24.0,<1.25.0)", "mypy-boto3-connect (>=1.24.0,<1.25.0)", "mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)", "mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)", "mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)", "mypy-boto3-cur (>=1.24.0,<1.25.0)", "mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)", "mypy-boto3-databrew (>=1.24.0,<1.25.0)", "mypy-boto3-dataexchange (>=1.24.0,<1.25.0)", "mypy-boto3-datapipeline (>=1.24.0,<1.25.0)", "mypy-boto3-datasync (>=1.24.0,<1.25.0)", "mypy-boto3-dax (>=1.24.0,<1.25.0)", "mypy-boto3-detective (>=1.24.0,<1.25.0)", "mypy-boto3-devicefarm (>=1.24.0,<1.25.0)", "mypy-boto3-devops-guru (>=1.24.0,<1.25.0)", "mypy-boto3-directconnect (>=1.24.0,<1.25.0)", "mypy-boto3-discovery (>=1.24.0,<1.25.0)", "mypy-boto3-dlm (>=1.24.0,<1.25.0)", "mypy-boto3-dms (>=1.24.0,<1.25.0)", "mypy-boto3-docdb (>=1.24.0,<1.25.0)", "mypy-boto3-drs (>=1.24.0,<1.25.0)", "mypy-boto3-ds (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)", "mypy-boto3-ebs (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)", "mypy-boto3-ecr (>=1.24.0,<1.25.0)", "mypy-boto3-ecr-public (>=1.24.0,<1.25.0)", "mypy-boto3-ecs (>=1.24.0,<1.25.0)", "mypy-boto3-efs (>=1.24.0,<1.25.0)", "mypy-boto3-eks (>=1.24.0,<1.25.0)", "mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)", "mypy-boto3-elasticache (>=1.24.0,<1.25.0)", "mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)", "mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)", "mypy-boto3-elb (>=1.24.0,<1.25.0)", "mypy-boto3-elbv2 (>=1.24.0,<1.25.0)", "mypy-boto3-emr (>=1.24.0,<1.25.0)", "mypy-boto3-emr-containers (>=1.24.0,<1.25.0)", "mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-es (>=1.24.0,<1.25.0)", "mypy-boto3-events (>=1.24.0,<1.25.0)", "mypy-boto3-evidently (>=1.24.0,<1.25.0)", "mypy-boto3-finspace (>=1.24.0,<1.25.0)", "mypy-boto3-finspace-data (>=1.24.0,<1.25.0)", "mypy-boto3-firehose (>=1.24.0,<1.25.0)", "mypy-boto3-fis (>=1.24.0,<1.25.0)", "mypy-boto3-fms (>=1.24.0,<1.25.0)", "mypy-boto3-forecast (>=1.24.0,<1.25.0)", 
"mypy-boto3-forecastquery (>=1.24.0,<1.25.0)", "mypy-boto3-frauddetector (>=1.24.0,<1.25.0)", "mypy-boto3-fsx (>=1.24.0,<1.25.0)", "mypy-boto3-gamelift (>=1.24.0,<1.25.0)", "mypy-boto3-gamesparks (>=1.24.0,<1.25.0)", "mypy-boto3-glacier (>=1.24.0,<1.25.0)", "mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)", "mypy-boto3-glue (>=1.24.0,<1.25.0)", "mypy-boto3-grafana (>=1.24.0,<1.25.0)", "mypy-boto3-greengrass (>=1.24.0,<1.25.0)", "mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)", "mypy-boto3-groundstation (>=1.24.0,<1.25.0)", "mypy-boto3-guardduty (>=1.24.0,<1.25.0)", "mypy-boto3-health (>=1.24.0,<1.25.0)", "mypy-boto3-healthlake (>=1.24.0,<1.25.0)", "mypy-boto3-honeycode (>=1.24.0,<1.25.0)", "mypy-boto3-iam (>=1.24.0,<1.25.0)", "mypy-boto3-identitystore (>=1.24.0,<1.25.0)", "mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)", "mypy-boto3-importexport (>=1.24.0,<1.25.0)", "mypy-boto3-inspector (>=1.24.0,<1.25.0)", "mypy-boto3-inspector2 (>=1.24.0,<1.25.0)", "mypy-boto3-iot (>=1.24.0,<1.25.0)", "mypy-boto3-iot-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)", "mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)", "mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)", "mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)", "mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)", "mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)", "mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)", "mypy-boto3-iotwireless (>=1.24.0,<1.25.0)", "mypy-boto3-ivs (>=1.24.0,<1.25.0)", "mypy-boto3-ivschat (>=1.24.0,<1.25.0)", "mypy-boto3-kafka (>=1.24.0,<1.25.0)", "mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-kendra (>=1.24.0,<1.25.0)", "mypy-boto3-keyspaces (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)", "mypy-boto3-kms (>=1.24.0,<1.25.0)", "mypy-boto3-lakeformation (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-lex-models (>=1.24.0,<1.25.0)", "mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager (>=1.24.0,<1.25.0)", "mypy-boto3-lightsail (>=1.24.0,<1.25.0)", "mypy-boto3-location (>=1.24.0,<1.25.0)", "mypy-boto3-logs (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)", "mypy-boto3-m2 (>=1.24.0,<1.25.0)", "mypy-boto3-machinelearning (>=1.24.0,<1.25.0)", "mypy-boto3-macie (>=1.24.0,<1.25.0)", "mypy-boto3-macie2 (>=1.24.0,<1.25.0)", "mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)", "mypy-boto3-medialive (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore (>=1.24.0,<1.25.0)", 
"mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)", "mypy-boto3-mediatailor (>=1.24.0,<1.25.0)", "mypy-boto3-memorydb (>=1.24.0,<1.25.0)", "mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)", "mypy-boto3-mgh (>=1.24.0,<1.25.0)", "mypy-boto3-mgn (>=1.24.0,<1.25.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)", "mypy-boto3-mobile (>=1.24.0,<1.25.0)", "mypy-boto3-mq (>=1.24.0,<1.25.0)", "mypy-boto3-mturk (>=1.24.0,<1.25.0)", "mypy-boto3-mwaa (>=1.24.0,<1.25.0)", "mypy-boto3-neptune (>=1.24.0,<1.25.0)", "mypy-boto3-network-firewall (>=1.24.0,<1.25.0)", "mypy-boto3-networkmanager (>=1.24.0,<1.25.0)", "mypy-boto3-nimble (>=1.24.0,<1.25.0)", "mypy-boto3-opensearch (>=1.24.0,<1.25.0)", "mypy-boto3-opsworks (>=1.24.0,<1.25.0)", "mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)", "mypy-boto3-organizations (>=1.24.0,<1.25.0)", "mypy-boto3-outposts (>=1.24.0,<1.25.0)", "mypy-boto3-panorama (>=1.24.0,<1.25.0)", "mypy-boto3-personalize (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-events (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-pi (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)", "mypy-boto3-polly (>=1.24.0,<1.25.0)", "mypy-boto3-pricing (>=1.24.0,<1.25.0)", "mypy-boto3-proton (>=1.24.0,<1.25.0)", "mypy-boto3-qldb (>=1.24.0,<1.25.0)", "mypy-boto3-qldb-session (>=1.24.0,<1.25.0)", "mypy-boto3-quicksight (>=1.24.0,<1.25.0)", "mypy-boto3-ram (>=1.24.0,<1.25.0)", "mypy-boto3-rbin (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-rds-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-rekognition (>=1.24.0,<1.25.0)", "mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)", "mypy-boto3-resource-groups (>=1.24.0,<1.25.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)", "mypy-boto3-robomaker (>=1.24.0,<1.25.0)", "mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)", "mypy-boto3-route53 (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)", "mypy-boto3-route53domains (>=1.24.0,<1.25.0)", "mypy-boto3-route53resolver (>=1.24.0,<1.25.0)", "mypy-boto3-rum (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-s3control (>=1.24.0,<1.25.0)", "mypy-boto3-s3outposts (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-savingsplans (>=1.24.0,<1.25.0)", "mypy-boto3-schemas (>=1.24.0,<1.25.0)", "mypy-boto3-sdb (>=1.24.0,<1.25.0)", "mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)", "mypy-boto3-securityhub (>=1.24.0,<1.25.0)", "mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)", "mypy-boto3-service-quotas (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)", "mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)", "mypy-boto3-ses (>=1.24.0,<1.25.0)", "mypy-boto3-sesv2 (>=1.24.0,<1.25.0)", "mypy-boto3-shield 
(>=1.24.0,<1.25.0)", "mypy-boto3-signer (>=1.24.0,<1.25.0)", "mypy-boto3-sms (>=1.24.0,<1.25.0)", "mypy-boto3-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)", "mypy-boto3-snowball (>=1.24.0,<1.25.0)", "mypy-boto3-sns (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)", "mypy-boto3-ssm (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)", "mypy-boto3-sso (>=1.24.0,<1.25.0)", "mypy-boto3-sso-admin (>=1.24.0,<1.25.0)", "mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)", "mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)", "mypy-boto3-storagegateway (>=1.24.0,<1.25.0)", "mypy-boto3-sts (>=1.24.0,<1.25.0)", "mypy-boto3-support (>=1.24.0,<1.25.0)", "mypy-boto3-swf (>=1.24.0,<1.25.0)", "mypy-boto3-synthetics (>=1.24.0,<1.25.0)", "mypy-boto3-textract (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-query (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-write (>=1.24.0,<1.25.0)", "mypy-boto3-transcribe (>=1.24.0,<1.25.0)", "mypy-boto3-transfer (>=1.24.0,<1.25.0)", "mypy-boto3-translate (>=1.24.0,<1.25.0)", "mypy-boto3-voice-id (>=1.24.0,<1.25.0)", "mypy-boto3-waf (>=1.24.0,<1.25.0)", "mypy-boto3-waf-regional (>=1.24.0,<1.25.0)", "mypy-boto3-wafv2 (>=1.24.0,<1.25.0)", "mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)", "mypy-boto3-wisdom (>=1.24.0,<1.25.0)", "mypy-boto3-workdocs (>=1.24.0,<1.25.0)", "mypy-boto3-worklink (>=1.24.0,<1.25.0)", "mypy-boto3-workmail (>=1.24.0,<1.25.0)", "mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)", "mypy-boto3-xray (>=1.24.0,<1.25.0)"] +amp = ["mypy-boto3-amp (>=1.24.0,<1.25.0)"] +amplify = ["mypy-boto3-amplify (>=1.24.0,<1.25.0)"] +amplifybackend = ["mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)"] +amplifyuibuilder = ["mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)"] +apigateway = ["mypy-boto3-apigateway (>=1.24.0,<1.25.0)"] +apigatewaymanagementapi = ["mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)"] +apigatewayv2 = ["mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)"] +appconfig = ["mypy-boto3-appconfig (>=1.24.0,<1.25.0)"] +appconfigdata = ["mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)"] +appflow = ["mypy-boto3-appflow (>=1.24.0,<1.25.0)"] +appintegrations = ["mypy-boto3-appintegrations (>=1.24.0,<1.25.0)"] +application-autoscaling = ["mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)"] +application-insights = ["mypy-boto3-application-insights (>=1.24.0,<1.25.0)"] +applicationcostprofiler = ["mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)"] +appmesh = ["mypy-boto3-appmesh (>=1.24.0,<1.25.0)"] +apprunner = ["mypy-boto3-apprunner (>=1.24.0,<1.25.0)"] +appstream = ["mypy-boto3-appstream (>=1.24.0,<1.25.0)"] +appsync = ["mypy-boto3-appsync (>=1.24.0,<1.25.0)"] +athena = ["mypy-boto3-athena (>=1.24.0,<1.25.0)"] +auditmanager = ["mypy-boto3-auditmanager (>=1.24.0,<1.25.0)"] +autoscaling = ["mypy-boto3-autoscaling (>=1.24.0,<1.25.0)"] +autoscaling-plans = ["mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)"] +backup = ["mypy-boto3-backup (>=1.24.0,<1.25.0)"] +backup-gateway = ["mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)"] +batch = ["mypy-boto3-batch (>=1.24.0,<1.25.0)"] +billingconductor = ["mypy-boto3-billingconductor (>=1.24.0,<1.25.0)"] +braket = ["mypy-boto3-braket (>=1.24.0,<1.25.0)"] +budgets = ["mypy-boto3-budgets (>=1.24.0,<1.25.0)"] +ce = ["mypy-boto3-ce (>=1.24.0,<1.25.0)"] +chime = ["mypy-boto3-chime (>=1.24.0,<1.25.0)"] +chime-sdk-identity = ["mypy-boto3-chime-sdk-identity 
(>=1.24.0,<1.25.0)"] +chime-sdk-media-pipelines = ["mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)"] +chime-sdk-meetings = ["mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)"] +chime-sdk-messaging = ["mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)"] +cloud9 = ["mypy-boto3-cloud9 (>=1.24.0,<1.25.0)"] +cloudcontrol = ["mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)"] +clouddirectory = ["mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)"] +cloudformation = ["mypy-boto3-cloudformation (>=1.24.0,<1.25.0)"] +cloudfront = ["mypy-boto3-cloudfront (>=1.24.0,<1.25.0)"] +cloudhsm = ["mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)"] +cloudhsmv2 = ["mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)"] +cloudsearch = ["mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)"] +cloudsearchdomain = ["mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)"] +cloudtrail = ["mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)"] +cloudwatch = ["mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)"] +codeartifact = ["mypy-boto3-codeartifact (>=1.24.0,<1.25.0)"] +codebuild = ["mypy-boto3-codebuild (>=1.24.0,<1.25.0)"] +codecommit = ["mypy-boto3-codecommit (>=1.24.0,<1.25.0)"] +codedeploy = ["mypy-boto3-codedeploy (>=1.24.0,<1.25.0)"] +codeguru-reviewer = ["mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)"] +codeguruprofiler = ["mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)"] +codepipeline = ["mypy-boto3-codepipeline (>=1.24.0,<1.25.0)"] +codestar = ["mypy-boto3-codestar (>=1.24.0,<1.25.0)"] +codestar-connections = ["mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)"] +codestar-notifications = ["mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)"] +cognito-identity = ["mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)"] +cognito-idp = ["mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)"] +cognito-sync = ["mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)"] +comprehend = ["mypy-boto3-comprehend (>=1.24.0,<1.25.0)"] +comprehendmedical = ["mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)"] +compute-optimizer = ["mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)"] +config = ["mypy-boto3-config (>=1.24.0,<1.25.0)"] +connect = ["mypy-boto3-connect (>=1.24.0,<1.25.0)"] +connect-contact-lens = ["mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)"] +connectcampaigns = ["mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)"] +connectparticipant = ["mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)"] +cur = ["mypy-boto3-cur (>=1.24.0,<1.25.0)"] +customer-profiles = ["mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)"] +databrew = ["mypy-boto3-databrew (>=1.24.0,<1.25.0)"] +dataexchange = ["mypy-boto3-dataexchange (>=1.24.0,<1.25.0)"] +datapipeline = ["mypy-boto3-datapipeline (>=1.24.0,<1.25.0)"] +datasync = ["mypy-boto3-datasync (>=1.24.0,<1.25.0)"] +dax = ["mypy-boto3-dax (>=1.24.0,<1.25.0)"] +detective = ["mypy-boto3-detective (>=1.24.0,<1.25.0)"] +devicefarm = ["mypy-boto3-devicefarm (>=1.24.0,<1.25.0)"] +devops-guru = ["mypy-boto3-devops-guru (>=1.24.0,<1.25.0)"] +directconnect = ["mypy-boto3-directconnect (>=1.24.0,<1.25.0)"] +discovery = ["mypy-boto3-discovery (>=1.24.0,<1.25.0)"] +dlm = ["mypy-boto3-dlm (>=1.24.0,<1.25.0)"] +dms = ["mypy-boto3-dms (>=1.24.0,<1.25.0)"] +docdb = ["mypy-boto3-docdb (>=1.24.0,<1.25.0)"] +drs = ["mypy-boto3-drs (>=1.24.0,<1.25.0)"] +ds = ["mypy-boto3-ds (>=1.24.0,<1.25.0)"] +dynamodb = ["mypy-boto3-dynamodb (>=1.24.0,<1.25.0)"] +dynamodbstreams = ["mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)"] +ebs = ["mypy-boto3-ebs (>=1.24.0,<1.25.0)"] +ec2 = ["mypy-boto3-ec2 (>=1.24.0,<1.25.0)"] +ec2-instance-connect = ["mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)"] +ecr = 
["mypy-boto3-ecr (>=1.24.0,<1.25.0)"] +ecr-public = ["mypy-boto3-ecr-public (>=1.24.0,<1.25.0)"] +ecs = ["mypy-boto3-ecs (>=1.24.0,<1.25.0)"] +efs = ["mypy-boto3-efs (>=1.24.0,<1.25.0)"] +eks = ["mypy-boto3-eks (>=1.24.0,<1.25.0)"] +elastic-inference = ["mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)"] +elasticache = ["mypy-boto3-elasticache (>=1.24.0,<1.25.0)"] +elasticbeanstalk = ["mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)"] +elastictranscoder = ["mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)"] +elb = ["mypy-boto3-elb (>=1.24.0,<1.25.0)"] +elbv2 = ["mypy-boto3-elbv2 (>=1.24.0,<1.25.0)"] +emr = ["mypy-boto3-emr (>=1.24.0,<1.25.0)"] +emr-containers = ["mypy-boto3-emr-containers (>=1.24.0,<1.25.0)"] +emr-serverless = ["mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)"] +es = ["mypy-boto3-es (>=1.24.0,<1.25.0)"] +essential = ["mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)"] +events = ["mypy-boto3-events (>=1.24.0,<1.25.0)"] +evidently = ["mypy-boto3-evidently (>=1.24.0,<1.25.0)"] +finspace = ["mypy-boto3-finspace (>=1.24.0,<1.25.0)"] +finspace-data = ["mypy-boto3-finspace-data (>=1.24.0,<1.25.0)"] +firehose = ["mypy-boto3-firehose (>=1.24.0,<1.25.0)"] +fis = ["mypy-boto3-fis (>=1.24.0,<1.25.0)"] +fms = ["mypy-boto3-fms (>=1.24.0,<1.25.0)"] +forecast = ["mypy-boto3-forecast (>=1.24.0,<1.25.0)"] +forecastquery = ["mypy-boto3-forecastquery (>=1.24.0,<1.25.0)"] +frauddetector = ["mypy-boto3-frauddetector (>=1.24.0,<1.25.0)"] +fsx = ["mypy-boto3-fsx (>=1.24.0,<1.25.0)"] +gamelift = ["mypy-boto3-gamelift (>=1.24.0,<1.25.0)"] +gamesparks = ["mypy-boto3-gamesparks (>=1.24.0,<1.25.0)"] +glacier = ["mypy-boto3-glacier (>=1.24.0,<1.25.0)"] +globalaccelerator = ["mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)"] +glue = ["mypy-boto3-glue (>=1.24.0,<1.25.0)"] +grafana = ["mypy-boto3-grafana (>=1.24.0,<1.25.0)"] +greengrass = ["mypy-boto3-greengrass (>=1.24.0,<1.25.0)"] +greengrassv2 = ["mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)"] +groundstation = ["mypy-boto3-groundstation (>=1.24.0,<1.25.0)"] +guardduty = ["mypy-boto3-guardduty (>=1.24.0,<1.25.0)"] +health = ["mypy-boto3-health (>=1.24.0,<1.25.0)"] +healthlake = ["mypy-boto3-healthlake (>=1.24.0,<1.25.0)"] +honeycode = ["mypy-boto3-honeycode (>=1.24.0,<1.25.0)"] +iam = ["mypy-boto3-iam (>=1.24.0,<1.25.0)"] +identitystore = ["mypy-boto3-identitystore (>=1.24.0,<1.25.0)"] +imagebuilder = ["mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)"] +importexport = ["mypy-boto3-importexport (>=1.24.0,<1.25.0)"] +inspector = ["mypy-boto3-inspector (>=1.24.0,<1.25.0)"] +inspector2 = ["mypy-boto3-inspector2 (>=1.24.0,<1.25.0)"] +iot = ["mypy-boto3-iot (>=1.24.0,<1.25.0)"] +iot-data = ["mypy-boto3-iot-data (>=1.24.0,<1.25.0)"] +iot-jobs-data = ["mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)"] +iot1click-devices = ["mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)"] +iot1click-projects = ["mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)"] +iotanalytics = ["mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)"] +iotdeviceadvisor = ["mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)"] +iotevents = ["mypy-boto3-iotevents (>=1.24.0,<1.25.0)"] +iotevents-data = ["mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)"] +iotfleethub = ["mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)"] +iotsecuretunneling = ["mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)"] +iotsitewise = 
["mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)"] +iotthingsgraph = ["mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)"] +iottwinmaker = ["mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)"] +iotwireless = ["mypy-boto3-iotwireless (>=1.24.0,<1.25.0)"] +ivs = ["mypy-boto3-ivs (>=1.24.0,<1.25.0)"] +ivschat = ["mypy-boto3-ivschat (>=1.24.0,<1.25.0)"] +kafka = ["mypy-boto3-kafka (>=1.24.0,<1.25.0)"] +kafkaconnect = ["mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)"] +kendra = ["mypy-boto3-kendra (>=1.24.0,<1.25.0)"] +keyspaces = ["mypy-boto3-keyspaces (>=1.24.0,<1.25.0)"] +kinesis = ["mypy-boto3-kinesis (>=1.24.0,<1.25.0)"] +kinesis-video-archived-media = ["mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)"] +kinesis-video-media = ["mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)"] +kinesis-video-signaling = ["mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)"] +kinesisanalytics = ["mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)"] +kinesisanalyticsv2 = ["mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)"] +kinesisvideo = ["mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)"] +kms = ["mypy-boto3-kms (>=1.24.0,<1.25.0)"] +lakeformation = ["mypy-boto3-lakeformation (>=1.24.0,<1.25.0)"] +lambda = ["mypy-boto3-lambda (>=1.24.0,<1.25.0)"] +lex-models = ["mypy-boto3-lex-models (>=1.24.0,<1.25.0)"] +lex-runtime = ["mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)"] +lexv2-models = ["mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)"] +lexv2-runtime = ["mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)"] +license-manager = ["mypy-boto3-license-manager (>=1.24.0,<1.25.0)"] +lightsail = ["mypy-boto3-lightsail (>=1.24.0,<1.25.0)"] +location = ["mypy-boto3-location (>=1.24.0,<1.25.0)"] +logs = ["mypy-boto3-logs (>=1.24.0,<1.25.0)"] +lookoutequipment = ["mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)"] +lookoutmetrics = ["mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)"] +lookoutvision = ["mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)"] +m2 = ["mypy-boto3-m2 (>=1.24.0,<1.25.0)"] +machinelearning = ["mypy-boto3-machinelearning (>=1.24.0,<1.25.0)"] +macie = ["mypy-boto3-macie (>=1.24.0,<1.25.0)"] +macie2 = ["mypy-boto3-macie2 (>=1.24.0,<1.25.0)"] +managedblockchain = ["mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)"] +marketplace-catalog = ["mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)"] +marketplace-entitlement = ["mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)"] +marketplacecommerceanalytics = ["mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)"] +mediaconnect = ["mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)"] +mediaconvert = ["mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)"] +medialive = ["mypy-boto3-medialive (>=1.24.0,<1.25.0)"] +mediapackage = ["mypy-boto3-mediapackage (>=1.24.0,<1.25.0)"] +mediapackage-vod = ["mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)"] +mediastore = ["mypy-boto3-mediastore (>=1.24.0,<1.25.0)"] +mediastore-data = ["mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)"] +mediatailor = ["mypy-boto3-mediatailor (>=1.24.0,<1.25.0)"] +memorydb = ["mypy-boto3-memorydb (>=1.24.0,<1.25.0)"] +meteringmarketplace = ["mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)"] +mgh = ["mypy-boto3-mgh (>=1.24.0,<1.25.0)"] +mgn = ["mypy-boto3-mgn (>=1.24.0,<1.25.0)"] +migration-hub-refactor-spaces = ["mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)"] +migrationhub-config = ["mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)"] +migrationhubstrategy = ["mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)"] +mobile = ["mypy-boto3-mobile (>=1.24.0,<1.25.0)"] +mq = ["mypy-boto3-mq (>=1.24.0,<1.25.0)"] +mturk = 
["mypy-boto3-mturk (>=1.24.0,<1.25.0)"] +mwaa = ["mypy-boto3-mwaa (>=1.24.0,<1.25.0)"] +neptune = ["mypy-boto3-neptune (>=1.24.0,<1.25.0)"] +network-firewall = ["mypy-boto3-network-firewall (>=1.24.0,<1.25.0)"] +networkmanager = ["mypy-boto3-networkmanager (>=1.24.0,<1.25.0)"] +nimble = ["mypy-boto3-nimble (>=1.24.0,<1.25.0)"] +opensearch = ["mypy-boto3-opensearch (>=1.24.0,<1.25.0)"] +opsworks = ["mypy-boto3-opsworks (>=1.24.0,<1.25.0)"] +opsworkscm = ["mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)"] +organizations = ["mypy-boto3-organizations (>=1.24.0,<1.25.0)"] +outposts = ["mypy-boto3-outposts (>=1.24.0,<1.25.0)"] +panorama = ["mypy-boto3-panorama (>=1.24.0,<1.25.0)"] +personalize = ["mypy-boto3-personalize (>=1.24.0,<1.25.0)"] +personalize-events = ["mypy-boto3-personalize-events (>=1.24.0,<1.25.0)"] +personalize-runtime = ["mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)"] +pi = ["mypy-boto3-pi (>=1.24.0,<1.25.0)"] +pinpoint = ["mypy-boto3-pinpoint (>=1.24.0,<1.25.0)"] +pinpoint-email = ["mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)"] +pinpoint-sms-voice = ["mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)"] +pinpoint-sms-voice-v2 = ["mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)"] +polly = ["mypy-boto3-polly (>=1.24.0,<1.25.0)"] +pricing = ["mypy-boto3-pricing (>=1.24.0,<1.25.0)"] +proton = ["mypy-boto3-proton (>=1.24.0,<1.25.0)"] +qldb = ["mypy-boto3-qldb (>=1.24.0,<1.25.0)"] +qldb-session = ["mypy-boto3-qldb-session (>=1.24.0,<1.25.0)"] +quicksight = ["mypy-boto3-quicksight (>=1.24.0,<1.25.0)"] +ram = ["mypy-boto3-ram (>=1.24.0,<1.25.0)"] +rbin = ["mypy-boto3-rbin (>=1.24.0,<1.25.0)"] +rds = ["mypy-boto3-rds (>=1.24.0,<1.25.0)"] +rds-data = ["mypy-boto3-rds-data (>=1.24.0,<1.25.0)"] +redshift = ["mypy-boto3-redshift (>=1.24.0,<1.25.0)"] +redshift-data = ["mypy-boto3-redshift-data (>=1.24.0,<1.25.0)"] +redshift-serverless = ["mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)"] +rekognition = ["mypy-boto3-rekognition (>=1.24.0,<1.25.0)"] +resiliencehub = ["mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)"] +resource-groups = ["mypy-boto3-resource-groups (>=1.24.0,<1.25.0)"] +resourcegroupstaggingapi = ["mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)"] +robomaker = ["mypy-boto3-robomaker (>=1.24.0,<1.25.0)"] +rolesanywhere = ["mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)"] +route53 = ["mypy-boto3-route53 (>=1.24.0,<1.25.0)"] +route53-recovery-cluster = ["mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)"] +route53-recovery-control-config = ["mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)"] +route53-recovery-readiness = ["mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)"] +route53domains = ["mypy-boto3-route53domains (>=1.24.0,<1.25.0)"] +route53resolver = ["mypy-boto3-route53resolver (>=1.24.0,<1.25.0)"] +rum = ["mypy-boto3-rum (>=1.24.0,<1.25.0)"] +s3 = ["mypy-boto3-s3 (>=1.24.0,<1.25.0)"] +s3control = ["mypy-boto3-s3control (>=1.24.0,<1.25.0)"] +s3outposts = ["mypy-boto3-s3outposts (>=1.24.0,<1.25.0)"] +sagemaker = ["mypy-boto3-sagemaker (>=1.24.0,<1.25.0)"] +sagemaker-a2i-runtime = ["mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)"] +sagemaker-edge = ["mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)"] +sagemaker-featurestore-runtime = ["mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)"] +sagemaker-runtime = ["mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)"] +savingsplans = ["mypy-boto3-savingsplans (>=1.24.0,<1.25.0)"] +schemas = ["mypy-boto3-schemas (>=1.24.0,<1.25.0)"] +sdb = ["mypy-boto3-sdb (>=1.24.0,<1.25.0)"] +secretsmanager = 
["mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)"] +securityhub = ["mypy-boto3-securityhub (>=1.24.0,<1.25.0)"] +serverlessrepo = ["mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)"] +service-quotas = ["mypy-boto3-service-quotas (>=1.24.0,<1.25.0)"] +servicecatalog = ["mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)"] +servicecatalog-appregistry = ["mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)"] +servicediscovery = ["mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)"] +ses = ["mypy-boto3-ses (>=1.24.0,<1.25.0)"] +sesv2 = ["mypy-boto3-sesv2 (>=1.24.0,<1.25.0)"] +shield = ["mypy-boto3-shield (>=1.24.0,<1.25.0)"] +signer = ["mypy-boto3-signer (>=1.24.0,<1.25.0)"] +sms = ["mypy-boto3-sms (>=1.24.0,<1.25.0)"] +sms-voice = ["mypy-boto3-sms-voice (>=1.24.0,<1.25.0)"] +snow-device-management = ["mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)"] +snowball = ["mypy-boto3-snowball (>=1.24.0,<1.25.0)"] +sns = ["mypy-boto3-sns (>=1.24.0,<1.25.0)"] +sqs = ["mypy-boto3-sqs (>=1.24.0,<1.25.0)"] +ssm = ["mypy-boto3-ssm (>=1.24.0,<1.25.0)"] +ssm-contacts = ["mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)"] +ssm-incidents = ["mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)"] +sso = ["mypy-boto3-sso (>=1.24.0,<1.25.0)"] +sso-admin = ["mypy-boto3-sso-admin (>=1.24.0,<1.25.0)"] +sso-oidc = ["mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)"] +stepfunctions = ["mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)"] +storagegateway = ["mypy-boto3-storagegateway (>=1.24.0,<1.25.0)"] +sts = ["mypy-boto3-sts (>=1.24.0,<1.25.0)"] +support = ["mypy-boto3-support (>=1.24.0,<1.25.0)"] +swf = ["mypy-boto3-swf (>=1.24.0,<1.25.0)"] +synthetics = ["mypy-boto3-synthetics (>=1.24.0,<1.25.0)"] +textract = ["mypy-boto3-textract (>=1.24.0,<1.25.0)"] +timestream-query = ["mypy-boto3-timestream-query (>=1.24.0,<1.25.0)"] +timestream-write = ["mypy-boto3-timestream-write (>=1.24.0,<1.25.0)"] +transcribe = ["mypy-boto3-transcribe (>=1.24.0,<1.25.0)"] +transfer = ["mypy-boto3-transfer (>=1.24.0,<1.25.0)"] +translate = ["mypy-boto3-translate (>=1.24.0,<1.25.0)"] +voice-id = ["mypy-boto3-voice-id (>=1.24.0,<1.25.0)"] +waf = ["mypy-boto3-waf (>=1.24.0,<1.25.0)"] +waf-regional = ["mypy-boto3-waf-regional (>=1.24.0,<1.25.0)"] +wafv2 = ["mypy-boto3-wafv2 (>=1.24.0,<1.25.0)"] +wellarchitected = ["mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)"] +wisdom = ["mypy-boto3-wisdom (>=1.24.0,<1.25.0)"] +workdocs = ["mypy-boto3-workdocs (>=1.24.0,<1.25.0)"] +worklink = ["mypy-boto3-worklink (>=1.24.0,<1.25.0)"] +workmail = ["mypy-boto3-workmail (>=1.24.0,<1.25.0)"] +workmailmessageflow = ["mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)"] +workspaces = ["mypy-boto3-workspaces (>=1.24.0,<1.25.0)"] +workspaces-web = ["mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)"] +xray = ["mypy-boto3-xray (>=1.24.0,<1.25.0)"] [[package]] name = "botocore" -version = "1.23.40" +version = "1.27.38" description = "Low-level, data-driven core of boto 3." 
category = "main" optional = false -python-versions = ">= 3.6" +python-versions = ">= 3.7" [package.dependencies] -jmespath = ">=0.7.1,<1.0.0" +jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" urllib3 = ">=1.25.4,<1.27" [package.extras] -crt = ["awscrt (==0.12.5)"] +crt = ["awscrt (==0.13.8)"] [[package]] name = "botocore-stubs" -version = "1.23.40" -description = "Type annotations for botocore 1.23.40, generated by mypy-boto3-builder 6.3.2" +version = "1.27.38" +description = "Type annotations for botocore 1.27.38 generated with mypy-boto3-builder 7.10.1" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" + +[package.dependencies] +typing-extensions = ">=4.1.0" [[package]] name = "cached-property" @@ -457,15 +472,15 @@ python-versions = "*" [[package]] name = "certifi" -version = "2021.10.8" +version = "2022.6.15" description = "Python package for providing Mozilla's CA Bundle." category = "main" optional = false -python-versions = "*" +python-versions = ">=3.6" [[package]] name = "cffi" -version = "1.15.0" +version = "1.15.1" description = "Foreign Function Interface for Python calling C code." category = "main" optional = false @@ -476,14 +491,14 @@ pycparser = "*" [[package]] name = "cfn-lint" -version = "0.57.0" +version = "0.61.3" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" category = "main" optional = false python-versions = ">=3.6, <=4.0, !=4.0" [package.dependencies] -aws-sam-translator = ">=1.42.0" +aws-sam-translator = ">=1.47.0" jschema-to-python = ">=1.2.3,<1.3.0" jsonpatch = "*" jsonschema = ">=3.0,<4.0" @@ -491,33 +506,32 @@ junit-xml = ">=1.9,<2.0" networkx = ">=2.4,<3.0" pyyaml = ">5.4" sarif-om = ">=1.0.4,<1.1.0" -six = ">=1.11" [[package]] name = "charset-normalizer" -version = "2.0.10" +version = "2.1.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." category = "main" optional = false -python-versions = ">=3.5.0" +python-versions = ">=3.6.0" [package.extras] unicode_backport = ["unicodedata2"] [[package]] name = "click" -version = "8.0.3" +version = "8.1.3" description = "Composable command line interface toolkit" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} [[package]] name = "colorama" -version = "0.4.4" +version = "0.4.5" description = "Cross-platform colored terminal text." category = "main" optional = false @@ -525,7 +539,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "cryptography" -version = "36.0.1" +version = "37.0.4" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
category = "main" optional = false @@ -540,29 +554,28 @@ docstest = ["pyenchant (>=1.6.11)", "twine (>=1.12.0)", "sphinxcontrib-spelling pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] sdist = ["setuptools_rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["pytest (>=6.2.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] +test = ["pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] [[package]] name = "docker" -version = "4.2.2" +version = "5.0.3" description = "A Python library for the Docker Engine API." category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.6" [package.dependencies] -pypiwin32 = {version = "223", markers = "sys_platform == \"win32\" and python_version >= \"3.6\""} +pywin32 = {version = "227", markers = "sys_platform == \"win32\""} requests = ">=2.14.2,<2.18.0 || >2.18.0" -six = ">=1.4.0" websocket-client = ">=0.32.0" [package.extras] ssh = ["paramiko (>=2.4.2)"] -tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=1.3.4)", "idna (>=2.0.0)"] +tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=3.4.7)", "idna (>=2.0.0)"] [[package]] name = "ecdsa" -version = "0.17.0" +version = "0.18.0" description = "ECDSA cryptographic signature library (pure python)" category = "main" optional = false @@ -601,14 +614,15 @@ pyflakes = ">=2.3.0,<2.4.0" [[package]] name = "flask" -version = "2.0.2" +version = "2.1.3" description = "A simple framework for building complex web applications." category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.dependencies] -click = ">=7.1.2" +click = ">=8.0" +importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""} itsdangerous = ">=2.0" Jinja2 = ">=3.0" Werkzeug = ">=2.0" @@ -629,17 +643,9 @@ python-versions = "*" Flask = ">=0.9" Six = "*" -[[package]] -name = "future" -version = "0.18.2" -description = "Clean single-source support for Python 3 and 2" -category = "main" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" - [[package]] name = "graphql-core" -version = "3.2.0" +version = "3.2.1" description = "GraphQL implementation for Python, a port of GraphQL.js, the JavaScript reference implementation for GraphQL." category = "main" optional = false @@ -653,6 +659,22 @@ category = "main" optional = false python-versions = ">=3.5" +[[package]] +name = "importlib-metadata" +version = "4.12.0" +description = "Read metadata from Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +zipp = ">=0.5" + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"] +perf = ["ipython"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] + [[package]] name = "iniconfig" version = "1.1.1" @@ -663,19 +685,19 @@ python-versions = "*" [[package]] name = "itsdangerous" -version = "2.0.1" +version = "2.1.2" description = "Safely pass data to untrusted environments and back." 
category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [[package]] name = "jinja2" -version = "3.0.3" +version = "3.1.2" description = "A very fast and expressive template engine." category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.dependencies] MarkupSafe = ">=2.0" @@ -685,11 +707,11 @@ i18n = ["Babel (>=2.7)"] [[package]] name = "jmespath" -version = "0.10.0" +version = "1.0.1" description = "JSON Matching Expressions" category = "main" optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +python-versions = ">=3.7" [[package]] name = "jschema-to-python" @@ -706,7 +728,7 @@ pbr = "*" [[package]] name = "jsondiff" -version = "1.3.0" +version = "2.0.0" description = "Diff JSON and JSON-like structures in Python" category = "main" optional = false @@ -725,7 +747,7 @@ jsonpointer = ">=1.9" [[package]] name = "jsonpickle" -version = "2.1.0" +version = "2.2.0" description = "Python library for serializing any arbitrary object graph into JSON" category = "main" optional = false @@ -733,12 +755,12 @@ python-versions = ">=2.7" [package.extras] docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] -testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-black-multipy", "pytest-cov", "ecdsa", "feedparser", "numpy", "pandas", "pymongo", "scikit-learn", "sqlalchemy", "enum34", "jsonlib"] -"testing.libs" = ["demjson", "simplejson", "ujson", "yajl"] +testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-black-multipy", "pytest-cov", "ecdsa", "feedparser", "numpy", "pandas", "pymongo", "scikit-learn", "sqlalchemy", "pytest-flake8 (<1.1.0)", "enum34", "jsonlib", "pytest-flake8 (>=1.1.1)"] +"testing.libs" = ["simplejson", "ujson", "yajl"] [[package]] name = "jsonpointer" -version = "2.2" +version = "2.3" description = "Identify specific nodes in a JSON document (RFC 6901)" category = "main" optional = false @@ -774,11 +796,11 @@ six = "*" [[package]] name = "markupsafe" -version = "2.0.1" +version = "2.1.1" description = "Safely add untrusted strings to HTML/XML markup." 
category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [[package]] name = "mccabe" @@ -790,7 +812,7 @@ python-versions = "*" [[package]] name = "moto" -version = "3.1.9" +version = "3.1.16" description = "A library that allows your python tests to easily mock out the boto library" category = "main" optional = false @@ -811,7 +833,8 @@ idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""} Jinja2 = ">=2.10.1" jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} MarkupSafe = "!=2.0.0a1" -pyparsing = {version = ">=3.0.0", optional = true, markers = "extra == \"server\""} +openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""} +pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""} pytz = "*" @@ -819,17 +842,17 @@ PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" responses = ">=0.9.0" sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} -werkzeug = "*" +werkzeug = ">=0.5" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.0)", "setuptools"] -apigateway = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)"] +all = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.7)", "openapi-spec-validator (>=0.2.8)", "setuptools"] +apigateway = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)"] apigatewayv2 = ["PyYAML (>=5.1)"] appsync = ["graphql-core"] awslambda = ["docker (>=2.5.1)"] batch = ["docker (>=2.5.1)"] -cloudformation = ["docker (>=2.5.1)", "PyYAML (>=5.1)", "cfn-lint (>=0.4.0)"] +cloudformation = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.7)", "openapi-spec-validator (>=0.2.8)", "setuptools"] cognitoidp = ["python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)"] ds = ["sshpubkeys (>=3.1.0)"] dynamodb = ["docker (>=2.5.1)"] @@ -838,30 +861,42 @@ dynamodbstreams = ["docker (>=2.5.1)"] ebs = ["sshpubkeys (>=3.1.0)"] ec2 = ["sshpubkeys (>=3.1.0)"] efs = ["sshpubkeys (>=3.1.0)"] -glue = ["pyparsing (>=3.0.0)"] +glue = ["pyparsing (>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] route53resolver = ["sshpubkeys (>=3.1.0)"] s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.0)", "setuptools", "flask", "flask-cors"] +server = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", 
"graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.7)", "openapi-spec-validator (>=0.2.8)", "setuptools", "flask", "flask-cors"] ssm = ["PyYAML (>=5.1)", "dataclasses"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] [[package]] name = "mypy" -version = "0.910" +version = "0.971" description = "Optional static typing for Python" category = "dev" optional = false -python-versions = ">=3.5" +python-versions = ">=3.6" [package.dependencies] -mypy-extensions = ">=0.4.3,<0.5.0" -toml = "*" -typing-extensions = ">=3.7.4" +mypy-extensions = ">=0.4.3" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = ">=3.10" [package.extras] dmypy = ["psutil (>=4.0)"] -python2 = ["typed-ast (>=1.4.0,<1.5.0)"] +python2 = ["typed-ast (>=1.4.0,<2)"] +reports = ["lxml"] + +[[package]] +name = "mypy-boto3-s3" +version = "1.24.36.post1" +description = "Type annotations for boto3.S3 1.24.36 service generated with mypy-boto3-builder 7.10.0" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +typing-extensions = ">=4.1.0" [[package]] name = "mypy-extensions" @@ -873,18 +908,50 @@ python-versions = "*" [[package]] name = "networkx" -version = "2.6.3" +version = "2.8.5" description = "Python package for creating and manipulating graphs and networks" category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" [package.extras] -default = ["numpy (>=1.19)", "scipy (>=1.5,!=1.6.1)", "matplotlib (>=3.3)", "pandas (>=1.1)"] -developer = ["black (==21.5b1)", "pre-commit (>=2.12)"] -doc = ["sphinx (>=4.0,<5.0)", "pydata-sphinx-theme (>=0.6,<1.0)", "sphinx-gallery (>=0.9,<1.0)", "numpydoc (>=1.1)", "pillow (>=8.2)", "nb2plots (>=0.6)", "texext (>=0.6.6)"] -extra = ["lxml (>=4.5)", "pygraphviz (>=1.7)", "pydot (>=1.4.1)"] -test = ["pytest (>=6.2)", "pytest-cov (>=2.12)", "codecov (>=2.1)"] +default = ["numpy (>=1.19)", "scipy (>=1.8)", "matplotlib (>=3.4)", "pandas (>=1.3)"] +developer = ["pre-commit (>=2.19)", "mypy (>=0.960)"] +doc = ["sphinx (>=5)", "pydata-sphinx-theme (>=0.9)", "sphinx-gallery (>=0.10)", "numpydoc (>=1.4)", "pillow (>=9.1)", "nb2plots (>=0.6)", "texext (>=0.6.6)"] +extra = ["lxml (>=4.6)", "pygraphviz (>=1.9)", "pydot (>=1.4.2)", "sympy (>=1.10)"] +test = ["pytest (>=7.1)", "pytest-cov (>=3.0)", "codecov (>=2.1)"] + +[[package]] +name = "openapi-schema-validator" +version = "0.2.3" +description = "OpenAPI schema validation for Python" +category = "main" +optional = false +python-versions = ">=3.7.0,<4.0.0" + +[package.dependencies] +jsonschema = ">=3.0.0,<5.0.0" + +[package.extras] +rfc3339-validator = ["rfc3339-validator"] +strict-rfc3339 = ["strict-rfc3339"] +isodate = ["isodate"] + +[[package]] +name = "openapi-spec-validator" +version = "0.4.0" +description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3.0 spec validator" +category = "main" +optional = false +python-versions = ">=3.7.0,<4.0.0" + +[package.dependencies] +jsonschema = ">=3.2.0,<5.0.0" +openapi-schema-validator = ">=0.2.0,<0.3.0" +PyYAML = ">=5.1" + +[package.extras] +requests = ["requests"] [[package]] name = "packaging" @@ -899,7 +966,7 @@ pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" [[package]] name = "pbr" -version = "5.8.0" +version = "5.9.0" description = "Python Build Reasonableness" category = "main" optional = false @@ -995,25 +1062,14 @@ tests = ["pytest (>=6.0.0,<7.0.0)", "coverage[toml] (==5.0.4)"] [[package]] name = "pyparsing" 
-version = "3.0.6" -description = "Python parsing module" +version = "3.0.9" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.6.8" [package.extras] -diagrams = ["jinja2", "railroad-diagrams"] - -[[package]] -name = "pypiwin32" -version = "223" -description = "" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -pywin32 = ">=223" +diagrams = ["railroad-diagrams", "jinja2"] [[package]] name = "pyrsistent" @@ -1128,7 +1184,7 @@ pycryptodome = ["pycryptodome (>=3.3.1,<4.0.0)", "pyasn1"] [[package]] name = "pytz" -version = "2021.3" +version = "2022.1" description = "World timezone definitions, modern and historical" category = "main" optional = false @@ -1136,7 +1192,7 @@ python-versions = "*" [[package]] name = "pywin32" -version = "301" +version = "227" description = "Python for Window Extensions" category = "main" optional = false @@ -1152,41 +1208,40 @@ python-versions = ">=3.6" [[package]] name = "requests" -version = "2.27.1" +version = "2.28.1" description = "Python HTTP for Humans." category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +python-versions = ">=3.7, <4" [package.dependencies] certifi = ">=2017.4.17" -charset-normalizer = {version = ">=2.0.0,<2.1.0", markers = "python_version >= \"3\""} -idna = {version = ">=2.5,<4", markers = "python_version >= \"3\""} +charset-normalizer = ">=2,<3" +idna = ">=2.5,<4" urllib3 = ">=1.21.1,<1.27" [package.extras] -socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] -use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "responses" -version = "0.17.0" +version = "0.21.0" description = "A utility library for mocking out the `requests` Python library." 
category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.7" [package.dependencies] -requests = ">=2.0" -six = "*" +requests = ">=2.0,<3.0" urllib3 = ">=1.25.10" [package.extras] -tests = ["coverage (>=3.7.1,<6.0.0)", "pytest-cov", "pytest-localserver", "flake8", "types-mock", "types-requests", "types-six", "pytest (>=4.6,<5.0)", "pytest (>=4.6)", "mypy"] +tests = ["pytest (>=7.0.0)", "coverage (>=6.0.0)", "pytest-cov", "pytest-asyncio", "pytest-localserver", "flake8", "types-mock", "types-requests", "mypy"] [[package]] name = "rsa" -version = "4.8" +version = "4.9" description = "Pure-Python RSA implementation" category = "main" optional = false @@ -1197,11 +1252,11 @@ pyasn1 = ">=0.1.3" [[package]] name = "s3transfer" -version = "0.5.0" +version = "0.6.0" description = "An Amazon S3 Transfer Manager" category = "main" optional = false -python-versions = ">= 3.6" +python-versions = ">= 3.7" [package.dependencies] botocore = ">=1.12.36,<2.0a.0" @@ -1252,9 +1307,17 @@ category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +category = "dev" +optional = false +python-versions = ">=3.7" + [[package]] name = "types-psycopg2" -version = "2.9.6" +version = "2.9.18" description = "Typing stubs for psycopg2" category = "main" optional = false @@ -1262,7 +1325,7 @@ python-versions = "*" [[package]] name = "types-requests" -version = "2.27.7" +version = "2.28.5" description = "Typing stubs for requests" category = "main" optional = false @@ -1271,9 +1334,17 @@ python-versions = "*" [package.dependencies] types-urllib3 = "<1.27" +[[package]] +name = "types-s3transfer" +version = "0.6.0.post3" +description = "Type annotations and code completion for s3transfer" +category = "main" +optional = false +python-versions = ">=3.7,<4.0" + [[package]] name = "types-urllib3" -version = "1.26.7" +version = "1.26.17" description = "Typing stubs for urllib3" category = "main" optional = false @@ -1281,32 +1352,32 @@ python-versions = "*" [[package]] name = "typing-extensions" -version = "3.10.0.2" -description = "Backported and Experimental Type Hints for Python 3.5+" +version = "4.3.0" +description = "Backported and Experimental Type Hints for Python 3.7+" category = "main" optional = false -python-versions = "*" +python-versions = ">=3.7" [[package]] name = "urllib3" -version = "1.26.8" +version = "1.26.11" description = "HTTP library with thread-safe connection pooling, file post, and more." category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" [package.extras] -brotli = ["brotlipy (>=0.6.0)"] +brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] name = "websocket-client" -version = "1.2.3" +version = "1.3.3" description = "WebSocket client for Python with low level API options" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.extras] docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"] @@ -1315,18 +1386,21 @@ test = ["websockets"] [[package]] name = "werkzeug" -version = "2.0.2" +version = "2.2.0" description = "The comprehensive WSGI web application library." 
category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" + +[package.dependencies] +MarkupSafe = ">=2.1.1" [package.extras] watchdog = ["watchdog"] [[package]] name = "wrapt" -version = "1.13.3" +version = "1.14.1" description = "Module for decorators, wrappers and monkey patching." category = "main" optional = false @@ -1334,11 +1408,11 @@ python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" [[package]] name = "xmltodict" -version = "0.12.0" +version = "0.13.0" description = "Makes working with XML feel like you are working with JSON" category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.4" [[package]] name = "yapf" @@ -1348,697 +1422,132 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "zipp" +version = "3.8.1" +description = "Backport of pathlib-compatible object wrapper for zip files" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] + [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "be9c00bb5081535805824242fea2a03b2f82fa9466856d618e24b3140c7da6a0" +content-hash = "17e901dca9680c6ead56661492431cfce65cb20508be419599f0862ff2d1d827" [metadata.files] -aiopg = [ - {file = "aiopg-1.3.3-py3-none-any.whl", hash = "sha256:2842dd8741460eeef940032dcb577bfba4d4115205dd82a73ce13b3271f5bf0a"}, - {file = "aiopg-1.3.3.tar.gz", hash = "sha256:547c6ba4ea0d73c2a11a2f44387d7133cc01d3c6f3b8ed976c0ac1eff4f595d7"}, -] -async-timeout = [ - {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, - {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, -] -asyncpg = [ - {file = "asyncpg-0.24.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1"}, - {file = "asyncpg-0.24.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843"}, - {file = "asyncpg-0.24.0-cp310-cp310-win_amd64.whl", hash = "sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d"}, - {file = "asyncpg-0.24.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5"}, - {file = "asyncpg-0.24.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83"}, - {file = "asyncpg-0.24.0-cp37-cp37m-win_amd64.whl", hash = "sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18"}, - {file = "asyncpg-0.24.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20"}, - {file = "asyncpg-0.24.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"}, - {file = "asyncpg-0.24.0-cp38-cp38-win_amd64.whl", hash = 
"sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317"}, - {file = "asyncpg-0.24.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c"}, - {file = "asyncpg-0.24.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03"}, - {file = "asyncpg-0.24.0-cp39-cp39-win_amd64.whl", hash = "sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853"}, - {file = "asyncpg-0.24.0.tar.gz", hash = "sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6"}, -] -atomicwrites = [ - {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, - {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, -] -attrs = [ - {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, - {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, -] -aws-sam-translator = [ - {file = "aws-sam-translator-1.42.0.tar.gz", hash = "sha256:8a7976c0ee2fca004a590e17d3551a49c8d8ba14ed0cb3674ea270d41d0dcd5b"}, - {file = "aws_sam_translator-1.42.0-py2-none-any.whl", hash = "sha256:4f5d3d5d0567fe728e75c5c8dff599f7c88313b3b8e85b9b17a2c00cb046b2e4"}, - {file = "aws_sam_translator-1.42.0-py3-none-any.whl", hash = "sha256:31875e4f639511f506d0c757a2a50756bd846440724079e867aafb12c534ac23"}, -] -aws-xray-sdk = [ - {file = "aws-xray-sdk-2.9.0.tar.gz", hash = "sha256:b0cd972db218d4d8f7b53ad806fc6184626b924c4997ae58fc9f2a8cd1281568"}, - {file = "aws_xray_sdk-2.9.0-py2.py3-none-any.whl", hash = "sha256:98216b3ac8281b51b59a8703f8ec561c460807d9d0679838f5c0179d381d7e58"}, -] -backoff = [ - {file = "backoff-1.11.1-py2.py3-none-any.whl", hash = "sha256:61928f8fa48d52e4faa81875eecf308eccfb1016b018bb6bd21e05b5d90a96c5"}, - {file = "backoff-1.11.1.tar.gz", hash = "sha256:ccb962a2378418c667b3c979b504fdeb7d9e0d29c0579e3b13b86467177728cb"}, -] -boto3 = [ - {file = "boto3-1.20.40-py3-none-any.whl", hash = "sha256:cfe85589e4a0a997c7b9ae7432400b03fa6fa5fea29fdc48db3099a903b76998"}, - {file = "boto3-1.20.40.tar.gz", hash = "sha256:66aef9a6d8cad393f69166112ba49e14e2c6766f9278c96134101314a9af2992"}, -] -boto3-stubs = [ - {file = "boto3-stubs-1.20.40.tar.gz", hash = "sha256:24f23e14de15d29a85e301b5beb144d2c778ed350e0c08a2136a978c8105e3c9"}, - {file = "boto3_stubs-1.20.40-py3-none-any.whl", hash = "sha256:2e940afd4a47688bb536155b10bdc65cc99390217bfcb392f4fc8c188646a65f"}, -] -botocore = [ - {file = "botocore-1.23.40-py3-none-any.whl", hash = "sha256:88a314fe27cd97a0c731094c5b34db01ebe930801700e5d1b68485ebde746c3c"}, - {file = "botocore-1.23.40.tar.gz", hash = "sha256:49baa1fca4483b24769f0743fbf72afe4db391f41f1fc12ea34e06036db642a4"}, -] -botocore-stubs = [ - {file = "botocore-stubs-1.23.40.tar.gz", hash = "sha256:48529a2b7e14c6e3dd4544c21d4cf342ad512e2a526f5262c565357683d78787"}, - {file = "botocore_stubs-1.23.40-py3-none-any.whl", hash = "sha256:b5762895175cbacfa989b7ff313ca20f30f82137fcfd8a389cfe4a920cb57e73"}, -] -cached-property = [ - {file = "cached-property-1.5.2.tar.gz", hash = "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130"}, - {file = "cached_property-1.5.2-py2.py3-none-any.whl", hash = 
"sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"}, -] -certifi = [ - {file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"}, - {file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"}, -] -cffi = [ - {file = "cffi-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962"}, - {file = "cffi-1.15.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0"}, - {file = "cffi-1.15.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14"}, - {file = "cffi-1.15.0-cp27-cp27m-win32.whl", hash = "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474"}, - {file = "cffi-1.15.0-cp27-cp27m-win_amd64.whl", hash = "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6"}, - {file = "cffi-1.15.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27"}, - {file = "cffi-1.15.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023"}, - {file = "cffi-1.15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2"}, - {file = "cffi-1.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e"}, - {file = "cffi-1.15.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7"}, - {file = "cffi-1.15.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3"}, - {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c"}, - {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962"}, - {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382"}, - {file = "cffi-1.15.0-cp310-cp310-win32.whl", hash = "sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55"}, - {file = "cffi-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0"}, - {file = "cffi-1.15.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e"}, - {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39"}, - {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc"}, - {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032"}, - {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8"}, - {file = 
"cffi-1.15.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605"}, - {file = "cffi-1.15.0-cp36-cp36m-win32.whl", hash = "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e"}, - {file = "cffi-1.15.0-cp36-cp36m-win_amd64.whl", hash = "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc"}, - {file = "cffi-1.15.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636"}, - {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4"}, - {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997"}, - {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b"}, - {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2"}, - {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7"}, - {file = "cffi-1.15.0-cp37-cp37m-win32.whl", hash = "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66"}, - {file = "cffi-1.15.0-cp37-cp37m-win_amd64.whl", hash = "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029"}, - {file = "cffi-1.15.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880"}, - {file = "cffi-1.15.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20"}, - {file = "cffi-1.15.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024"}, - {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e"}, - {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728"}, - {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6"}, - {file = "cffi-1.15.0-cp38-cp38-win32.whl", hash = "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c"}, - {file = "cffi-1.15.0-cp38-cp38-win_amd64.whl", hash = "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443"}, - {file = "cffi-1.15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a"}, - {file = "cffi-1.15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37"}, - {file = "cffi-1.15.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a"}, - {file = "cffi-1.15.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e"}, - {file = 
"cffi-1.15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"}, - {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df"}, - {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8"}, - {file = "cffi-1.15.0-cp39-cp39-win32.whl", hash = "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a"}, - {file = "cffi-1.15.0-cp39-cp39-win_amd64.whl", hash = "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139"}, - {file = "cffi-1.15.0.tar.gz", hash = "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954"}, -] -cfn-lint = [ - {file = "cfn-lint-0.57.0.tar.gz", hash = "sha256:17c2e3ba693ae259c868e221d159dc4aa9c7e60a970cdc1d1309150c9250faf4"}, - {file = "cfn_lint-0.57.0-py3-none-any.whl", hash = "sha256:71b5e23b6a5101416c13275baa0f172c935f679fac6956ae768c467a117913c2"}, -] -charset-normalizer = [ - {file = "charset-normalizer-2.0.10.tar.gz", hash = "sha256:876d180e9d7432c5d1dfd4c5d26b72f099d503e8fcc0feb7532c9289be60fcbd"}, - {file = "charset_normalizer-2.0.10-py3-none-any.whl", hash = "sha256:cb957888737fc0bbcd78e3df769addb41fd1ff8cf950dc9e7ad7793f1bf44455"}, -] -click = [ - {file = "click-8.0.3-py3-none-any.whl", hash = "sha256:353f466495adaeb40b6b5f592f9f91cb22372351c84caeb068132442a4518ef3"}, - {file = "click-8.0.3.tar.gz", hash = "sha256:410e932b050f5eed773c4cda94de75971c89cdb3155a72a0831139a79e5ecb5b"}, -] -colorama = [ - {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, - {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, -] -cryptography = [ - {file = "cryptography-36.0.1-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:73bc2d3f2444bcfeac67dd130ff2ea598ea5f20b40e36d19821b4df8c9c5037b"}, - {file = "cryptography-36.0.1-cp36-abi3-macosx_10_10_x86_64.whl", hash = "sha256:2d87cdcb378d3cfed944dac30596da1968f88fb96d7fc34fdae30a99054b2e31"}, - {file = "cryptography-36.0.1-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:74d6c7e80609c0f4c2434b97b80c7f8fdfaa072ca4baab7e239a15d6d70ed73a"}, - {file = "cryptography-36.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:6c0c021f35b421ebf5976abf2daacc47e235f8b6082d3396a2fe3ccd537ab173"}, - {file = "cryptography-36.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d59a9d55027a8b88fd9fd2826c4392bd487d74bf628bb9d39beecc62a644c12"}, - {file = "cryptography-36.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a817b961b46894c5ca8a66b599c745b9a3d9f822725221f0e0fe49dc043a3a3"}, - {file = "cryptography-36.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:94ae132f0e40fe48f310bba63f477f14a43116f05ddb69d6fa31e93f05848ae2"}, - {file = "cryptography-36.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:7be0eec337359c155df191d6ae00a5e8bbb63933883f4f5dffc439dac5348c3f"}, - {file = "cryptography-36.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:e0344c14c9cb89e76eb6a060e67980c9e35b3f36691e15e1b7a9e58a0a6c6dc3"}, - {file = "cryptography-36.0.1-cp36-abi3-win32.whl", hash = 
"sha256:4caa4b893d8fad33cf1964d3e51842cd78ba87401ab1d2e44556826df849a8ca"}, - {file = "cryptography-36.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:391432971a66cfaf94b21c24ab465a4cc3e8bf4a939c1ca5c3e3a6e0abebdbcf"}, - {file = "cryptography-36.0.1-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:bb5829d027ff82aa872d76158919045a7c1e91fbf241aec32cb07956e9ebd3c9"}, - {file = "cryptography-36.0.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc15b1c22e55c4d5566e3ca4db8689470a0ca2babef8e3a9ee057a8b82ce4b1"}, - {file = "cryptography-36.0.1-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:596f3cd67e1b950bc372c33f1a28a0692080625592ea6392987dba7f09f17a94"}, - {file = "cryptography-36.0.1-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:30ee1eb3ebe1644d1c3f183d115a8c04e4e603ed6ce8e394ed39eea4a98469ac"}, - {file = "cryptography-36.0.1-pp38-pypy38_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ec63da4e7e4a5f924b90af42eddf20b698a70e58d86a72d943857c4c6045b3ee"}, - {file = "cryptography-36.0.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca238ceb7ba0bdf6ce88c1b74a87bffcee5afbfa1e41e173b1ceb095b39add46"}, - {file = "cryptography-36.0.1-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:ca28641954f767f9822c24e927ad894d45d5a1e501767599647259cbf030b903"}, - {file = "cryptography-36.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:39bdf8e70eee6b1c7b289ec6e5d84d49a6bfa11f8b8646b5b3dfe41219153316"}, - {file = "cryptography-36.0.1.tar.gz", hash = "sha256:53e5c1dc3d7a953de055d77bef2ff607ceef7a2aac0353b5d630ab67f7423638"}, -] -docker = [ - {file = "docker-4.2.2-py2.py3-none-any.whl", hash = "sha256:03a46400c4080cb6f7aa997f881ddd84fef855499ece219d75fbdb53289c17ab"}, - {file = "docker-4.2.2.tar.gz", hash = "sha256:26eebadce7e298f55b76a88c4f8802476c5eaddbdbe38dbc6cce8781c47c9b54"}, -] -ecdsa = [ - {file = "ecdsa-0.17.0-py2.py3-none-any.whl", hash = "sha256:5cf31d5b33743abe0dfc28999036c849a69d548f994b535e527ee3cb7f3ef676"}, - {file = "ecdsa-0.17.0.tar.gz", hash = "sha256:b9f500bb439e4153d0330610f5d26baaf18d17b8ced1bc54410d189385ea68aa"}, -] -execnet = [ - {file = "execnet-1.9.0-py2.py3-none-any.whl", hash = "sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"}, - {file = "execnet-1.9.0.tar.gz", hash = "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5"}, -] -flake8 = [ - {file = "flake8-3.9.2-py2.py3-none-any.whl", hash = "sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"}, - {file = "flake8-3.9.2.tar.gz", hash = "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b"}, -] -flask = [ - {file = "Flask-2.0.2-py3-none-any.whl", hash = "sha256:cb90f62f1d8e4dc4621f52106613488b5ba826b2e1e10a33eac92f723093ab6a"}, - {file = "Flask-2.0.2.tar.gz", hash = "sha256:7b2fb8e934ddd50731893bdcdb00fc8c0315916f9fcd50d22c7cc1a95ab634e2"}, -] -flask-cors = [ - {file = "Flask-Cors-3.0.10.tar.gz", hash = "sha256:b60839393f3b84a0f3746f6cdca56c1ad7426aa738b70d6c61375857823181de"}, - {file = "Flask_Cors-3.0.10-py2.py3-none-any.whl", hash = "sha256:74efc975af1194fc7891ff5cd85b0f7478be4f7f59fe158102e91abb72bb4438"}, -] -future = [ - {file = "future-0.18.2.tar.gz", hash = "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"}, -] -graphql-core = [ - {file = "graphql-core-3.2.0.tar.gz", hash = "sha256:86e2a0be008bfde19ef78388de8a725a1d942a9190ca431c24a60837973803ce"}, - {file = 
"graphql_core-3.2.0-py3-none-any.whl", hash = "sha256:0dda7e63676f119bb3d814621190fedad72fda07a8e9ab780bedd9f1957c6dc6"}, -] -idna = [ - {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, - {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, -] -iniconfig = [ - {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, - {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, -] -itsdangerous = [ - {file = "itsdangerous-2.0.1-py3-none-any.whl", hash = "sha256:5174094b9637652bdb841a3029700391451bd092ba3db90600dea710ba28e97c"}, - {file = "itsdangerous-2.0.1.tar.gz", hash = "sha256:9e724d68fc22902a1435351f84c3fb8623f303fffcc566a4cb952df8c572cff0"}, -] -jinja2 = [ - {file = "Jinja2-3.0.3-py3-none-any.whl", hash = "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8"}, - {file = "Jinja2-3.0.3.tar.gz", hash = "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"}, -] -jmespath = [ - {file = "jmespath-0.10.0-py2.py3-none-any.whl", hash = "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"}, - {file = "jmespath-0.10.0.tar.gz", hash = "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9"}, -] -jschema-to-python = [ - {file = "jschema_to_python-1.2.3-py3-none-any.whl", hash = "sha256:8a703ca7604d42d74b2815eecf99a33359a8dccbb80806cce386d5e2dd992b05"}, - {file = "jschema_to_python-1.2.3.tar.gz", hash = "sha256:76ff14fe5d304708ccad1284e4b11f96a658949a31ee7faed9e0995279549b91"}, -] -jsondiff = [ - {file = "jsondiff-1.3.0.tar.gz", hash = "sha256:5122bf4708a031b02db029366184a87c5d0ddd5a327a5884ee6cf0193e599d71"}, -] -jsonpatch = [ - {file = "jsonpatch-1.32-py2.py3-none-any.whl", hash = "sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397"}, - {file = "jsonpatch-1.32.tar.gz", hash = "sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2"}, -] -jsonpickle = [ - {file = "jsonpickle-2.1.0-py2.py3-none-any.whl", hash = "sha256:1dee77ddc5d652dfdabc33d33cff9d7e131d428007007da4fd6f7071ae774b0f"}, - {file = "jsonpickle-2.1.0.tar.gz", hash = "sha256:84684cfc5338a534173c8dd69809e40f2865d0be1f8a2b7af8465e5b968dcfa9"}, -] -jsonpointer = [ - {file = "jsonpointer-2.2-py2.py3-none-any.whl", hash = "sha256:26d9a47a72d4dc3e3ae72c4c6cd432afd73c680164cd2540772eab53cb3823b6"}, - {file = "jsonpointer-2.2.tar.gz", hash = "sha256:f09f8deecaaa5aea65b5eb4f67ca4e54e1a61f7a11c75085e360fe6feb6a48bf"}, -] -jsonschema = [ - {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"}, - {file = "jsonschema-3.2.0.tar.gz", hash = "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"}, -] -junit-xml = [ - {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"}, -] -markupsafe = [ - {file = "MarkupSafe-2.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-win32.whl", hash = "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-win32.whl", hash = "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567"}, - {file 
= "MarkupSafe-2.0.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-win32.whl", hash = "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a"}, - {file = 
"MarkupSafe-2.0.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-win32.whl", hash = "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-win32.whl", hash = "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8"}, - {file = "MarkupSafe-2.0.1.tar.gz", hash = "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a"}, -] -mccabe = [ - {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, - {file = "mccabe-0.6.1.tar.gz", hash = 
"sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, -] -moto = [ - {file = "moto-3.1.9-py3-none-any.whl", hash = "sha256:8928ec168e5fd88b1127413b2fa570a80d45f25182cdad793edd208d07825269"}, - {file = "moto-3.1.9.tar.gz", hash = "sha256:ba683e70950b6579189bc12d74c1477aa036c090c6ad8b151a22f5896c005113"}, -] -mypy = [ - {file = "mypy-0.910-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457"}, - {file = "mypy-0.910-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb"}, - {file = "mypy-0.910-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9"}, - {file = "mypy-0.910-cp35-cp35m-win_amd64.whl", hash = "sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e"}, - {file = "mypy-0.910-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921"}, - {file = "mypy-0.910-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6"}, - {file = "mypy-0.910-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212"}, - {file = "mypy-0.910-cp36-cp36m-win_amd64.whl", hash = "sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885"}, - {file = "mypy-0.910-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0"}, - {file = "mypy-0.910-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de"}, - {file = "mypy-0.910-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703"}, - {file = "mypy-0.910-cp37-cp37m-win_amd64.whl", hash = "sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a"}, - {file = "mypy-0.910-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504"}, - {file = "mypy-0.910-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9"}, - {file = "mypy-0.910-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072"}, - {file = "mypy-0.910-cp38-cp38-win_amd64.whl", hash = "sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811"}, - {file = "mypy-0.910-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e"}, - {file = "mypy-0.910-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b"}, - {file = "mypy-0.910-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2"}, - {file = "mypy-0.910-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97"}, - {file = "mypy-0.910-cp39-cp39-win_amd64.whl", hash = "sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8"}, - {file = "mypy-0.910-py3-none-any.whl", hash = "sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"}, - {file = "mypy-0.910.tar.gz", hash = "sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150"}, -] -mypy-extensions = [ - {file = 
"mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, - {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, -] -networkx = [ - {file = "networkx-2.6.3-py3-none-any.whl", hash = "sha256:80b6b89c77d1dfb64a4c7854981b60aeea6360ac02c6d4e4913319e0a313abef"}, - {file = "networkx-2.6.3.tar.gz", hash = "sha256:c0946ed31d71f1b732b5aaa6da5a0388a345019af232ce2f49c766e2d6795c51"}, -] +aiopg = [] +async-timeout = [] +asyncpg = [] +atomicwrites = [] +attrs = [] +aws-sam-translator = [] +aws-xray-sdk = [] +backoff = [] +boto3 = [] +boto3-stubs = [] +botocore = [] +botocore-stubs = [] +cached-property = [] +certifi = [] +cffi = [] +cfn-lint = [] +charset-normalizer = [] +click = [] +colorama = [] +cryptography = [] +docker = [] +ecdsa = [] +execnet = [] +flake8 = [] +flask = [] +flask-cors = [] +graphql-core = [] +idna = [] +importlib-metadata = [] +iniconfig = [] +itsdangerous = [] +jinja2 = [] +jmespath = [] +jschema-to-python = [] +jsondiff = [] +jsonpatch = [] +jsonpickle = [] +jsonpointer = [] +jsonschema = [] +junit-xml = [] +markupsafe = [] +mccabe = [] +moto = [] +mypy = [] +mypy-boto3-s3 = [] +mypy-extensions = [] +networkx = [] +openapi-schema-validator = [] +openapi-spec-validator = [] packaging = [ {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, ] -pbr = [ - {file = "pbr-5.8.0-py2.py3-none-any.whl", hash = "sha256:176e8560eaf61e127817ef93d8a844803abb27a4d4637f0ff3bb783129be2e0a"}, - {file = "pbr-5.8.0.tar.gz", hash = "sha256:672d8ebee84921862110f23fcec2acea191ef58543d34dfe9ef3d9f13c31cddf"}, -] -pluggy = [ - {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, - {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, -] -prometheus-client = [ - {file = "prometheus_client-0.14.1-py3-none-any.whl", hash = "sha256:522fded625282822a89e2773452f42df14b5a8e84a86433e3f8a189c1d54dc01"}, - {file = "prometheus_client-0.14.1.tar.gz", hash = "sha256:5459c427624961076277fdc6dc50540e2bacb98eebde99886e59ec55ed92093a"}, -] -psycopg2-binary = [ - {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_ppc64le.whl", hash = "sha256:0a29729145aaaf1ad8bafe663131890e2111f13416b60e460dae0a96af5905c9"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = 
"sha256:3a79d622f5206d695d7824cbf609a4f5b88ea6d6dab5f7c147fc6d333a8787e4"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:090f3348c0ab2cceb6dfbe6bf721ef61262ddf518cd6cc6ecc7d334996d64efa"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:a9e1f75f96ea388fbcef36c70640c4efbe4650658f3d6a2967b4cc70e907352e"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c3ae8e75eb7160851e59adc77b3a19a976e50622e44fd4fd47b8b18208189d42"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-win32.whl", hash = "sha256:7b1e9b80afca7b7a386ef087db614faebbf8839b7f4db5eb107d0f1a53225029"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:8b344adbb9a862de0c635f4f0425b7958bf5a4b927c8594e6e8d261775796d53"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:e847774f8ffd5b398a75bc1c18fbb56564cda3d629fe68fd81971fece2d3c67e"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:68641a34023d306be959101b345732360fc2ea4938982309b786f7be1b43a4a1"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3303f8807f342641851578ee7ed1f3efc9802d00a6f83c101d21c608cb864460"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_24_aarch64.whl", hash = "sha256:e3699852e22aa68c10de06524a3721ade969abf382da95884e6a10ff798f9281"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_24_ppc64le.whl", hash = "sha256:526ea0378246d9b080148f2d6681229f4b5964543c170dd10bf4faaab6e0d27f"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:b1c8068513f5b158cf7e29c43a77eb34b407db29aca749d3eb9293ee0d3103ca"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:15803fa813ea05bef089fa78835118b5434204f3a17cb9f1e5dbfd0b9deea5af"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:152f09f57417b831418304c7f30d727dc83a12761627bb826951692cc6491e57"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:404224e5fef3b193f892abdbf8961ce20e0b6642886cfe1fe1923f41aaa75c9d"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-win32.whl", hash = "sha256:1f6b813106a3abdf7b03640d36e24669234120c72e91d5cbaeb87c5f7c36c65b"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:2d872e3c9d5d075a2e104540965a1cf898b52274a5923936e5bfddb58c59c7c2"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:10bb90fb4d523a2aa67773d4ff2b833ec00857f5912bafcfd5f5414e45280fb1"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:874a52ecab70af13e899f7847b3e074eeb16ebac5615665db33bce8a1009cf33"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a29b3ca4ec9defec6d42bf5feb36bb5817ba3c0230dd83b4edf4bf02684cd0ae"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:12b11322ea00ad8db8c46f18b7dfc47ae215e4df55b46c67a94b4effbaec7094"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_24_ppc64le.whl", hash = "sha256:53293533fcbb94c202b7c800a12c873cfe24599656b341f56e71dd2b557be063"}, - {file = 
"psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c381bda330ddf2fccbafab789d83ebc6c53db126e4383e73794c74eedce855ef"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:9d29409b625a143649d03d0fd7b57e4b92e0ecad9726ba682244b73be91d2fdb"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:183a517a3a63503f70f808b58bfbf962f23d73b6dccddae5aa56152ef2bcb232"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:15c4e4cfa45f5a60599d9cec5f46cd7b1b29d86a6390ec23e8eebaae84e64554"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_ppc64le.whl", hash = "sha256:63638d875be8c2784cfc952c9ac34e2b50e43f9f0a0660b65e2a87d656b3116c"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ffb7a888a047696e7f8240d649b43fb3644f14f0ee229077e7f6b9f9081635bd"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0c9d5450c566c80c396b7402895c4369a410cab5a82707b11aee1e624da7d004"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:d1c1b569ecafe3a69380a94e6ae09a4789bbb23666f3d3a08d06bbd2451f5ef1"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8fc53f9af09426a61db9ba357865c77f26076d48669f2e1bb24d85a22fb52307"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_ppc64le.whl", hash = "sha256:7af0dd86ddb2f8af5da57a976d27cd2cd15510518d582b478fbb2292428710b4"}, - 
{file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:93cd1967a18aa0edd4b95b1dfd554cf15af657cb606280996d393dadc88c3c35"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bda845b664bb6c91446ca9609fc69f7db6c334ec5e4adc87571c34e4f47b7ddb"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:01310cf4cf26db9aea5158c217caa92d291f0500051a6469ac52166e1a16f5b7"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:99485cab9ba0fa9b84f1f9e1fef106f44a46ef6afdeec8885e0b88d0772b49e8"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-win32.whl", hash = "sha256:46f0e0a6b5fa5851bbd9ab1bc805eef362d3a230fbdfbc209f4a236d0a7a990d"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:accfe7e982411da3178ec690baaceaad3c278652998b2c45828aaac66cd8285f"}, -] -py = [ - {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, - {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, -] -pyasn1 = [ - {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, - {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, - {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, - {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, - {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, - {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, - {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, - {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, - {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, - {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, - {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, - {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, -] -pycodestyle = [ - {file = "pycodestyle-2.7.0-py2.py3-none-any.whl", hash = "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068"}, - {file = "pycodestyle-2.7.0.tar.gz", hash = "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"}, -] -pycparser = [ - {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, - {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, -] -pyflakes = [ - {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, - {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, -] -pyjwt = [ - {file = 
"PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, - {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, -] +pbr = [] +pluggy = [] +prometheus-client = [] +psycopg2-binary = [] +py = [] +pyasn1 = [] +pycodestyle = [] +pycparser = [] +pyflakes = [] +pyjwt = [] pyparsing = [ - {file = "pyparsing-3.0.6-py3-none-any.whl", hash = "sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4"}, - {file = "pyparsing-3.0.6.tar.gz", hash = "sha256:d9bdec0013ef1eb5a84ab39a3b3868911598afa494f5faa038647101504e2b81"}, -] -pypiwin32 = [ - {file = "pypiwin32-223-py3-none-any.whl", hash = "sha256:67adf399debc1d5d14dffc1ab5acacb800da569754fafdc576b2a039485aa775"}, - {file = "pypiwin32-223.tar.gz", hash = "sha256:71be40c1fbd28594214ecaecb58e7aa8b708eabfa0125c8a109ebd51edbd776a"}, -] -pyrsistent = [ - {file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"}, - {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d45866ececf4a5fff8742c25722da6d4c9e180daa7b405dc0a2a2790d668c26"}, - {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4ed6784ceac462a7d6fcb7e9b663e93b9a6fb373b7f43594f9ff68875788e01e"}, - {file = "pyrsistent-0.18.1-cp310-cp310-win32.whl", hash = "sha256:e4f3149fd5eb9b285d6bfb54d2e5173f6a116fe19172686797c056672689daf6"}, - {file = "pyrsistent-0.18.1-cp310-cp310-win_amd64.whl", hash = "sha256:636ce2dc235046ccd3d8c56a7ad54e99d5c1cd0ef07d9ae847306c91d11b5fec"}, - {file = "pyrsistent-0.18.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e92a52c166426efbe0d1ec1332ee9119b6d32fc1f0bbfd55d5c1088070e7fc1b"}, - {file = "pyrsistent-0.18.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7a096646eab884bf8bed965bad63ea327e0d0c38989fc83c5ea7b8a87037bfc"}, - {file = "pyrsistent-0.18.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cdfd2c361b8a8e5d9499b9082b501c452ade8bbf42aef97ea04854f4a3f43b22"}, - {file = "pyrsistent-0.18.1-cp37-cp37m-win32.whl", hash = "sha256:7ec335fc998faa4febe75cc5268a9eac0478b3f681602c1f27befaf2a1abe1d8"}, - {file = "pyrsistent-0.18.1-cp37-cp37m-win_amd64.whl", hash = "sha256:6455fc599df93d1f60e1c5c4fe471499f08d190d57eca040c0ea182301321286"}, - {file = "pyrsistent-0.18.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fd8da6d0124efa2f67d86fa70c851022f87c98e205f0594e1fae044e7119a5a6"}, - {file = "pyrsistent-0.18.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bfe2388663fd18bd8ce7db2c91c7400bf3e1a9e8bd7d63bf7e77d39051b85ec"}, - {file = "pyrsistent-0.18.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0e3e1fcc45199df76053026a51cc59ab2ea3fc7c094c6627e93b7b44cdae2c8c"}, - {file = "pyrsistent-0.18.1-cp38-cp38-win32.whl", hash = "sha256:b568f35ad53a7b07ed9b1b2bae09eb15cdd671a5ba5d2c66caee40dbf91c68ca"}, - {file = "pyrsistent-0.18.1-cp38-cp38-win_amd64.whl", hash = "sha256:d1b96547410f76078eaf66d282ddca2e4baae8964364abb4f4dcdde855cd123a"}, - {file = "pyrsistent-0.18.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f87cc2863ef33c709e237d4b5f4502a62a00fab450c9e020892e8e2ede5847f5"}, - {file = 
"pyrsistent-0.18.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bc66318fb7ee012071b2792024564973ecc80e9522842eb4e17743604b5e045"}, - {file = "pyrsistent-0.18.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:914474c9f1d93080338ace89cb2acee74f4f666fb0424896fcfb8d86058bf17c"}, - {file = "pyrsistent-0.18.1-cp39-cp39-win32.whl", hash = "sha256:1b34eedd6812bf4d33814fca1b66005805d3640ce53140ab8bbb1e2651b0d9bc"}, - {file = "pyrsistent-0.18.1-cp39-cp39-win_amd64.whl", hash = "sha256:e24a828f57e0c337c8d8bb9f6b12f09dfdf0273da25fda9e314f0b684b415a07"}, - {file = "pyrsistent-0.18.1.tar.gz", hash = "sha256:d4d61f8b993a7255ba714df3aca52700f8125289f84f704cf80916517c46eb96"}, -] -pytest = [ - {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, - {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, -] -pytest-forked = [ - {file = "pytest-forked-1.4.0.tar.gz", hash = "sha256:8b67587c8f98cbbadfdd804539ed5455b6ed03802203485dd2f53c1422d7440e"}, - {file = "pytest_forked-1.4.0-py3-none-any.whl", hash = "sha256:bbbb6717efc886b9d64537b41fb1497cfaf3c9601276be8da2cccfea5a3c8ad8"}, -] -pytest-lazy-fixture = [ - {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, - {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, -] -pytest-timeout = [ - {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, - {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, -] -pytest-xdist = [ - {file = "pytest-xdist-2.5.0.tar.gz", hash = "sha256:4580deca3ff04ddb2ac53eba39d76cb5dd5edeac050cb6fbc768b0dd712b4edf"}, - {file = "pytest_xdist-2.5.0-py3-none-any.whl", hash = "sha256:6fe5c74fec98906deb8f2d2b616b5c782022744978e7bd4695d39c8f42d0ce65"}, + {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, + {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, ] +pyrsistent = [] +pytest = [] +pytest-forked = [] +pytest-lazy-fixture = [] +pytest-timeout = [] +pytest-xdist = [] python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, ] -python-jose = [ - {file = "python-jose-3.3.0.tar.gz", hash = "sha256:55779b5e6ad599c6336191246e95eb2293a9ddebd555f796a65f838f07e5d78a"}, - {file = "python_jose-3.3.0-py2.py3-none-any.whl", hash = "sha256:9b1376b023f8b298536eedd47ae1089bcdb848f1535ab30555cd92002d78923a"}, -] +python-jose = [] pytz = [ - {file = "pytz-2021.3-py2.py3-none-any.whl", hash = "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c"}, - {file = "pytz-2021.3.tar.gz", hash = "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"}, -] -pywin32 = [ - {file = "pywin32-301-cp35-cp35m-win32.whl", hash = "sha256:93367c96e3a76dfe5003d8291ae16454ca7d84bb24d721e0b74a07610b7be4a7"}, - {file = "pywin32-301-cp35-cp35m-win_amd64.whl", hash = 
"sha256:9635df6998a70282bd36e7ac2a5cef9ead1627b0a63b17c731312c7a0daebb72"}, - {file = "pywin32-301-cp36-cp36m-win32.whl", hash = "sha256:c866f04a182a8cb9b7855de065113bbd2e40524f570db73ef1ee99ff0a5cc2f0"}, - {file = "pywin32-301-cp36-cp36m-win_amd64.whl", hash = "sha256:dafa18e95bf2a92f298fe9c582b0e205aca45c55f989937c52c454ce65b93c78"}, - {file = "pywin32-301-cp37-cp37m-win32.whl", hash = "sha256:98f62a3f60aa64894a290fb7494bfa0bfa0a199e9e052e1ac293b2ad3cd2818b"}, - {file = "pywin32-301-cp37-cp37m-win_amd64.whl", hash = "sha256:fb3b4933e0382ba49305cc6cd3fb18525df7fd96aa434de19ce0878133bf8e4a"}, - {file = "pywin32-301-cp38-cp38-win32.whl", hash = "sha256:88981dd3cfb07432625b180f49bf4e179fb8cbb5704cd512e38dd63636af7a17"}, - {file = "pywin32-301-cp38-cp38-win_amd64.whl", hash = "sha256:8c9d33968aa7fcddf44e47750e18f3d034c3e443a707688a008a2e52bbef7e96"}, - {file = "pywin32-301-cp39-cp39-win32.whl", hash = "sha256:595d397df65f1b2e0beaca63a883ae6d8b6df1cdea85c16ae85f6d2e648133fe"}, - {file = "pywin32-301-cp39-cp39-win_amd64.whl", hash = "sha256:87604a4087434cd814ad8973bd47d6524bd1fa9e971ce428e76b62a5e0860fdf"}, -] -pyyaml = [ - {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, - {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, - {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, - {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, - {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"}, - {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"}, - {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"}, - {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"}, - {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"}, - {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"}, - {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"}, - {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"}, - {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"}, - {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"}, - {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"}, - {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"}, - {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, - {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, -] -requests = [ - {file = "requests-2.27.1-py2.py3-none-any.whl", hash = "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"}, - {file = "requests-2.27.1.tar.gz", hash = "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61"}, -] -responses = [ - {file = "responses-0.17.0-py2.py3-none-any.whl", hash = "sha256:e4fc472fb7374fb8f84fcefa51c515ca4351f198852b4eb7fc88223780b472ea"}, - {file = "responses-0.17.0.tar.gz", hash = "sha256:ec675e080d06bf8d1fb5e5a68a1e5cd0df46b09c78230315f650af5e4036bec7"}, -] -rsa = [ - {file = "rsa-4.8-py3-none-any.whl", hash = "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"}, - {file = "rsa-4.8.tar.gz", hash = "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17"}, -] -s3transfer = [ - {file = "s3transfer-0.5.0-py3-none-any.whl", hash = 
"sha256:9c1dc369814391a6bda20ebbf4b70a0f34630592c9aa520856bf384916af2803"}, - {file = "s3transfer-0.5.0.tar.gz", hash = "sha256:50ed823e1dc5868ad40c8dc92072f757aa0e653a192845c94a3b676f4a62da4c"}, -] -sarif-om = [ - {file = "sarif_om-1.0.4-py3-none-any.whl", hash = "sha256:539ef47a662329b1c8502388ad92457425e95dc0aaaf995fe46f4984c4771911"}, - {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"}, + {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"}, + {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, ] +pywin32 = [] +pyyaml = [] +requests = [] +responses = [] +rsa = [] +s3transfer = [] +sarif-om = [] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] -sshpubkeys = [ - {file = "sshpubkeys-3.3.1-py2.py3-none-any.whl", hash = "sha256:946f76b8fe86704b0e7c56a00d80294e39bc2305999844f079a217885060b1ac"}, - {file = "sshpubkeys-3.3.1.tar.gz", hash = "sha256:3020ed4f8c846849299370fbe98ff4157b0ccc1accec105e07cfa9ae4bb55064"}, -] -toml = [ - {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, - {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, -] -types-psycopg2 = [ - {file = "types-psycopg2-2.9.6.tar.gz", hash = "sha256:753b50b38da0e61bc8f89d149f2c4420c7e18535a87963d17b72343eb98f7c32"}, - {file = "types_psycopg2-2.9.6-py3-none-any.whl", hash = "sha256:2cfd855e1562ebb5da595ee9401da93a308d69121ccd359cb8341f94ba4b6d1c"}, -] -types-requests = [ - {file = "types-requests-2.27.7.tar.gz", hash = "sha256:f38bd488528cdcbce5b01dc953972f3cead0d060cfd9ee35b363066c25bab13c"}, - {file = "types_requests-2.27.7-py3-none-any.whl", hash = "sha256:2e0e100dd489f83870d4f61949d3a7eae4821e7bfbf46c57e463c38f92d473d4"}, -] -types-urllib3 = [ - {file = "types-urllib3-1.26.7.tar.gz", hash = "sha256:cfd1fbbe4ba9a605ed148294008aac8a7b8b7472651d1cc357d507ae5962e3d2"}, - {file = "types_urllib3-1.26.7-py3-none-any.whl", hash = "sha256:3adcf2cb5981809091dbff456e6999fe55f201652d8c360f99997de5ac2f556e"}, -] -typing-extensions = [ - {file = "typing_extensions-3.10.0.2-py2-none-any.whl", hash = "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7"}, - {file = "typing_extensions-3.10.0.2-py3-none-any.whl", hash = "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"}, - {file = "typing_extensions-3.10.0.2.tar.gz", hash = "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e"}, -] -urllib3 = [ - {file = "urllib3-1.26.8-py2.py3-none-any.whl", hash = "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed"}, - {file = "urllib3-1.26.8.tar.gz", hash = "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"}, -] -websocket-client = [ - {file = "websocket-client-1.2.3.tar.gz", hash = "sha256:1315816c0acc508997eb3ae03b9d3ff619c9d12d544c9a9b553704b1cc4f6af5"}, - {file = "websocket_client-1.2.3-py3-none-any.whl", hash = "sha256:2eed4cc58e4d65613ed6114af2f380f7910ff416fc8c46947f6e76b6815f56c0"}, -] -werkzeug = [ - {file = "Werkzeug-2.0.2-py3-none-any.whl", hash = 
"sha256:63d3dc1cf60e7b7e35e97fa9861f7397283b75d765afcaefd993d6046899de8f"}, - {file = "Werkzeug-2.0.2.tar.gz", hash = "sha256:aa2bb6fc8dee8d6c504c0ac1e7f5f7dc5810a9903e793b6f715a9f015bdadb9a"}, -] -wrapt = [ - {file = "wrapt-1.13.3-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:e05e60ff3b2b0342153be4d1b597bbcfd8330890056b9619f4ad6b8d5c96a81a"}, - {file = "wrapt-1.13.3-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:85148f4225287b6a0665eef08a178c15097366d46b210574a658c1ff5b377489"}, - {file = "wrapt-1.13.3-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:2dded5496e8f1592ec27079b28b6ad2a1ef0b9296d270f77b8e4a3a796cf6909"}, - {file = "wrapt-1.13.3-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:e94b7d9deaa4cc7bac9198a58a7240aaf87fe56c6277ee25fa5b3aa1edebd229"}, - {file = "wrapt-1.13.3-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:498e6217523111d07cd67e87a791f5e9ee769f9241fcf8a379696e25806965af"}, - {file = "wrapt-1.13.3-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:ec7e20258ecc5174029a0f391e1b948bf2906cd64c198a9b8b281b811cbc04de"}, - {file = "wrapt-1.13.3-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:87883690cae293541e08ba2da22cacaae0a092e0ed56bbba8d018cc486fbafbb"}, - {file = "wrapt-1.13.3-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:f99c0489258086308aad4ae57da9e8ecf9e1f3f30fa35d5e170b4d4896554d80"}, - {file = "wrapt-1.13.3-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:6a03d9917aee887690aa3f1747ce634e610f6db6f6b332b35c2dd89412912bca"}, - {file = "wrapt-1.13.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:936503cb0a6ed28dbfa87e8fcd0a56458822144e9d11a49ccee6d9a8adb2ac44"}, - {file = "wrapt-1.13.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f9c51d9af9abb899bd34ace878fbec8bf357b3194a10c4e8e0a25512826ef056"}, - {file = "wrapt-1.13.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:220a869982ea9023e163ba915077816ca439489de6d2c09089b219f4e11b6785"}, - {file = "wrapt-1.13.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:0877fe981fd76b183711d767500e6b3111378ed2043c145e21816ee589d91096"}, - {file = "wrapt-1.13.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:43e69ffe47e3609a6aec0fe723001c60c65305784d964f5007d5b4fb1bc6bf33"}, - {file = "wrapt-1.13.3-cp310-cp310-win32.whl", hash = "sha256:78dea98c81915bbf510eb6a3c9c24915e4660302937b9ae05a0947164248020f"}, - {file = "wrapt-1.13.3-cp310-cp310-win_amd64.whl", hash = "sha256:ea3e746e29d4000cd98d572f3ee2a6050a4f784bb536f4ac1f035987fc1ed83e"}, - {file = "wrapt-1.13.3-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:8c73c1a2ec7c98d7eaded149f6d225a692caa1bd7b2401a14125446e9e90410d"}, - {file = "wrapt-1.13.3-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:086218a72ec7d986a3eddb7707c8c4526d677c7b35e355875a0fe2918b059179"}, - {file = "wrapt-1.13.3-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:e92d0d4fa68ea0c02d39f1e2f9cb5bc4b4a71e8c442207433d8db47ee79d7aa3"}, - {file = "wrapt-1.13.3-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:d4a5f6146cfa5c7ba0134249665acd322a70d1ea61732723c7d3e8cc0fa80755"}, - {file = "wrapt-1.13.3-cp35-cp35m-win32.whl", hash = "sha256:8aab36778fa9bba1a8f06a4919556f9f8c7b33102bd71b3ab307bb3fecb21851"}, - {file = "wrapt-1.13.3-cp35-cp35m-win_amd64.whl", hash = "sha256:944b180f61f5e36c0634d3202ba8509b986b5fbaf57db3e94df11abee244ba13"}, - {file = "wrapt-1.13.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = 
"sha256:2ebdde19cd3c8cdf8df3fc165bc7827334bc4e353465048b36f7deeae8ee0918"}, - {file = "wrapt-1.13.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:610f5f83dd1e0ad40254c306f4764fcdc846641f120c3cf424ff57a19d5f7ade"}, - {file = "wrapt-1.13.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:5601f44a0f38fed36cc07db004f0eedeaadbdcec90e4e90509480e7e6060a5bc"}, - {file = "wrapt-1.13.3-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:e6906d6f48437dfd80464f7d7af1740eadc572b9f7a4301e7dd3d65db285cacf"}, - {file = "wrapt-1.13.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:766b32c762e07e26f50d8a3468e3b4228b3736c805018e4b0ec8cc01ecd88125"}, - {file = "wrapt-1.13.3-cp36-cp36m-win32.whl", hash = "sha256:5f223101f21cfd41deec8ce3889dc59f88a59b409db028c469c9b20cfeefbe36"}, - {file = "wrapt-1.13.3-cp36-cp36m-win_amd64.whl", hash = "sha256:f122ccd12fdc69628786d0c947bdd9cb2733be8f800d88b5a37c57f1f1d73c10"}, - {file = "wrapt-1.13.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:46f7f3af321a573fc0c3586612db4decb7eb37172af1bc6173d81f5b66c2e068"}, - {file = "wrapt-1.13.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:778fd096ee96890c10ce96187c76b3e99b2da44e08c9e24d5652f356873f6709"}, - {file = "wrapt-1.13.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0cb23d36ed03bf46b894cfec777eec754146d68429c30431c99ef28482b5c1df"}, - {file = "wrapt-1.13.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:96b81ae75591a795d8c90edc0bfaab44d3d41ffc1aae4d994c5aa21d9b8e19a2"}, - {file = "wrapt-1.13.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:7dd215e4e8514004c8d810a73e342c536547038fb130205ec4bba9f5de35d45b"}, - {file = "wrapt-1.13.3-cp37-cp37m-win32.whl", hash = "sha256:47f0a183743e7f71f29e4e21574ad3fa95676136f45b91afcf83f6a050914829"}, - {file = "wrapt-1.13.3-cp37-cp37m-win_amd64.whl", hash = "sha256:fd76c47f20984b43d93de9a82011bb6e5f8325df6c9ed4d8310029a55fa361ea"}, - {file = "wrapt-1.13.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b73d4b78807bd299b38e4598b8e7bd34ed55d480160d2e7fdaabd9931afa65f9"}, - {file = "wrapt-1.13.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ec9465dd69d5657b5d2fa6133b3e1e989ae27d29471a672416fd729b429eb554"}, - {file = "wrapt-1.13.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:dd91006848eb55af2159375134d724032a2d1d13bcc6f81cd8d3ed9f2b8e846c"}, - {file = "wrapt-1.13.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ae9de71eb60940e58207f8e71fe113c639da42adb02fb2bcbcaccc1ccecd092b"}, - {file = "wrapt-1.13.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:51799ca950cfee9396a87f4a1240622ac38973b6df5ef7a41e7f0b98797099ce"}, - {file = "wrapt-1.13.3-cp38-cp38-win32.whl", hash = "sha256:4b9c458732450ec42578b5642ac53e312092acf8c0bfce140ada5ca1ac556f79"}, - {file = "wrapt-1.13.3-cp38-cp38-win_amd64.whl", hash = "sha256:7dde79d007cd6dfa65afe404766057c2409316135cb892be4b1c768e3f3a11cb"}, - {file = "wrapt-1.13.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:981da26722bebb9247a0601e2922cedf8bb7a600e89c852d063313102de6f2cb"}, - {file = "wrapt-1.13.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = 
"sha256:705e2af1f7be4707e49ced9153f8d72131090e52be9278b5dbb1498c749a1e32"}, - {file = "wrapt-1.13.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:25b1b1d5df495d82be1c9d2fad408f7ce5ca8a38085e2da41bb63c914baadff7"}, - {file = "wrapt-1.13.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:77416e6b17926d953b5c666a3cb718d5945df63ecf922af0ee576206d7033b5e"}, - {file = "wrapt-1.13.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:865c0b50003616f05858b22174c40ffc27a38e67359fa1495605f96125f76640"}, - {file = "wrapt-1.13.3-cp39-cp39-win32.whl", hash = "sha256:0a017a667d1f7411816e4bf214646d0ad5b1da2c1ea13dec6c162736ff25a374"}, - {file = "wrapt-1.13.3-cp39-cp39-win_amd64.whl", hash = "sha256:81bd7c90d28a4b2e1df135bfbd7c23aee3050078ca6441bead44c42483f9ebfb"}, - {file = "wrapt-1.13.3.tar.gz", hash = "sha256:1fea9cd438686e6682271d36f3481a9f3636195578bab9ca3382e2f5f01fc185"}, -] -xmltodict = [ - {file = "xmltodict-0.12.0-py2.py3-none-any.whl", hash = "sha256:8bbcb45cc982f48b2ca8fe7e7827c5d792f217ecf1792626f808bf41c3b86051"}, - {file = "xmltodict-0.12.0.tar.gz", hash = "sha256:50d8c638ed7ecb88d90561beedbf720c9b4e851a9fa6c47ebd64e99d166d8a21"}, -] -yapf = [ - {file = "yapf-0.31.0-py2.py3-none-any.whl", hash = "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"}, - {file = "yapf-0.31.0.tar.gz", hash = "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d"}, +sshpubkeys = [] +toml = [] +tomli = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +types-psycopg2 = [] +types-requests = [] +types-s3transfer = [] +types-urllib3 = [] +typing-extensions = [] +urllib3 = [] +websocket-client = [] +werkzeug = [] +wrapt = [] +xmltodict = [] +yapf = [] +zipp = [] diff --git a/pyproject.toml b/pyproject.toml index c965535049..2807881d71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ authors = [] python = "^3.9" pytest = "^6.2.5" psycopg2-binary = "^2.9.1" -typing-extensions = "^3.10.0" +typing-extensions = "^4.1.0" PyJWT = {version = "^2.1.0", extras = ["crypto"]} requests = "^2.26.0" pytest-xdist = "^2.3.0" @@ -16,10 +16,10 @@ asyncpg = "^0.24.0" aiopg = "^1.3.1" cached-property = "^1.5.2" Jinja2 = "^3.0.2" -types-requests = "^2.27.7" -types-psycopg2 = "^2.9.6" +types-requests = "^2.28.5" +types-psycopg2 = "^2.9.18" boto3 = "^1.20.40" -boto3-stubs = "^1.20.40" +boto3-stubs = {version = "^1.23.38", extras = ["s3"]} moto = {version = "^3.0.0", extras = ["server"]} backoff = "^1.11.1" pytest-lazy-fixture = "^0.6.3" @@ -29,7 +29,7 @@ pytest-timeout = "^2.1.0" [tool.poetry.dev-dependencies] yapf = "==0.31.0" flake8 = "^3.9.2" -mypy = "==0.910" +mypy = "==0.971" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index 3e7ba22184..d8ba0a1b06 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -1,6 +1,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException +from fixtures.utils import query_scalar # @@ -25,13 +26,11 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) 
branch0_cur = pg_branch0.connect().cursor() - branch0_cur.execute("SHOW neon.timeline_id") - branch0_timeline = branch0_cur.fetchone()[0] + branch0_timeline = query_scalar(branch0_cur, "SHOW neon.timeline_id") log.info(f"b0 timeline {branch0_timeline}") # Create table, and insert 100k rows. - branch0_cur.execute('SELECT pg_current_wal_insert_lsn()') - branch0_lsn = branch0_cur.fetchone()[0] + branch0_lsn = query_scalar(branch0_cur, 'SELECT pg_current_wal_insert_lsn()') log.info(f"b0 at lsn {branch0_lsn}") branch0_cur.execute('CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)') @@ -40,8 +39,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): SELECT '00112233445566778899AABBCCDDEEFF' || ':branch0:' || g FROM generate_series(1, 100000) g ''') - branch0_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_100 = branch0_cur.fetchone()[0] + lsn_100 = query_scalar(branch0_cur, 'SELECT pg_current_wal_insert_lsn()') log.info(f'LSN after 100k rows: {lsn_100}') # Create branch1. @@ -50,12 +48,10 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on 'branch1' branch") branch1_cur = pg_branch1.connect().cursor() - branch1_cur.execute("SHOW neon.timeline_id") - branch1_timeline = branch1_cur.fetchone()[0] + branch1_timeline = query_scalar(branch1_cur, "SHOW neon.timeline_id") log.info(f"b1 timeline {branch1_timeline}") - branch1_cur.execute('SELECT pg_current_wal_insert_lsn()') - branch1_lsn = branch1_cur.fetchone()[0] + branch1_lsn = query_scalar(branch1_cur, 'SELECT pg_current_wal_insert_lsn()') log.info(f"b1 at lsn {branch1_lsn}") # Insert 100k rows. @@ -64,8 +60,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): SELECT '00112233445566778899AABBCCDDEEFF' || ':branch1:' || g FROM generate_series(1, 100000) g ''') - branch1_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_200 = branch1_cur.fetchone()[0] + lsn_200 = query_scalar(branch1_cur, 'SELECT pg_current_wal_insert_lsn()') log.info(f'LSN after 200k rows: {lsn_200}') # Create branch2. @@ -74,12 +69,10 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() - branch2_cur.execute("SHOW neon.timeline_id") - branch2_timeline = branch2_cur.fetchone()[0] + branch2_timeline = query_scalar(branch2_cur, "SHOW neon.timeline_id") log.info(f"b2 timeline {branch2_timeline}") - branch2_cur.execute('SELECT pg_current_wal_insert_lsn()') - branch2_lsn = branch2_cur.fetchone()[0] + branch2_lsn = query_scalar(branch2_cur, 'SELECT pg_current_wal_insert_lsn()') log.info(f"b2 at lsn {branch2_lsn}") # Insert 100k rows. @@ -88,20 +81,16 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): SELECT '00112233445566778899AABBCCDDEEFF' || ':branch2:' || g FROM generate_series(1, 100000) g ''') - branch2_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_300 = branch2_cur.fetchone()[0] + lsn_300 = query_scalar(branch2_cur, 'SELECT pg_current_wal_insert_lsn()') log.info(f'LSN after 300k rows: {lsn_300}') # Run compaction on branch1. 
- psconn = env.pageserver.connect() - log.info(f'compact {tenant.hex} {branch1_timeline} {lsn_200}') - psconn.cursor().execute(f'''compact {tenant.hex} {branch1_timeline} {lsn_200}''') + compact = f'compact {tenant.hex} {branch1_timeline} {lsn_200}' + log.info(compact) + env.pageserver.safe_psql(compact) - branch0_cur.execute('SELECT count(*) FROM foo') - assert branch0_cur.fetchone() == (100000, ) + assert query_scalar(branch0_cur, 'SELECT count(*) FROM foo') == 100000 - branch1_cur.execute('SELECT count(*) FROM foo') - assert branch1_cur.fetchone() == (200000, ) + assert query_scalar(branch1_cur, 'SELECT count(*) FROM foo') == 200000 - branch2_cur.execute('SELECT count(*) FROM foo') - assert branch2_cur.fetchone() == (300000, ) + assert query_scalar(branch2_cur, 'SELECT count(*) FROM foo') == 300000 diff --git a/test_runner/batch_others/test_branch_and_gc.py b/test_runner/batch_others/test_branch_and_gc.py index 901b3f3d0f..76a77357ae 100644 --- a/test_runner/batch_others/test_branch_and_gc.py +++ b/test_runner/batch_others/test_branch_and_gc.py @@ -3,7 +3,7 @@ import pytest import time from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv -from fixtures.utils import lsn_from_hex +from fixtures.utils import lsn_from_hex, query_scalar # Test the GC implementation when running with branching. @@ -76,20 +76,17 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')" ) main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn1 = main_cur.fetchone()[0] + lsn1 = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') log.info(f'LSN1: {lsn1}') main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn2 = main_cur.fetchone()[0] + lsn2 = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') log.info(f'LSN2: {lsn2}') # Set the GC horizon so that lsn1 is inside the horizon, which means # we can create a new branch starting from lsn1. env.pageserver.safe_psql( - f'''do_gc {tenant.hex} {timeline_main.hex} {lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024}''' - ) + f'do_gc {tenant.hex} {timeline_main.hex} {lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024}') env.neon_cli.create_branch('test_branch', 'test_main', @@ -100,8 +97,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): branch_cur = pg_branch.connect().cursor() branch_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') - branch_cur.execute('SELECT count(*) FROM foo') - assert branch_cur.fetchone() == (200000, ) + assert query_scalar(branch_cur, 'SELECT count(*) FROM foo') == 200000 # This test simulates a race condition happening when branch creation and GC are performed concurrently. 
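The test diffs above and below lean on a small query_scalar helper imported from fixtures.utils; its definition is not part of this patch. A minimal sketch of what such a helper could look like, assuming it simply returns the first column of the first row (illustrative only, not the fixture's actual code):

    # Hypothetical sketch of fixtures.utils.query_scalar
    from typing import Any

    def query_scalar(cur: Any, query: str) -> Any:
        # Run the query and return the single scalar value it produces.
        cur.execute(query)
        row = cur.fetchone()
        assert row is not None, f"query returned no rows: {query}"
        return row[0]

With a helper of this shape, the repeated cur.execute('SELECT pg_current_wal_insert_lsn()') / cur.fetchone()[0] pairs collapse to the one-liners used throughout these hunks.
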
diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 0274c6c1e0..95f478dda8 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -1,9 +1,7 @@ -from contextlib import closing - import psycopg2.extras import pytest from fixtures.log_helper import log -from fixtures.utils import print_gc_result +from fixtures.utils import print_gc_result, query_scalar from fixtures.neon_fixtures import NeonEnvBuilder @@ -27,26 +25,22 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): pgmain = env.postgres.create_start('test_branch_behind') log.info("postgres is running on 'test_branch_behind' branch") - main_pg_conn = pgmain.connect() - main_cur = main_pg_conn.cursor() + main_cur = pgmain.connect().cursor() - main_cur.execute("SHOW neon.timeline_id") - timeline = main_cur.fetchone()[0] + timeline = query_scalar(main_cur, "SHOW neon.timeline_id") # Create table, and insert the first 100 rows main_cur.execute('CREATE TABLE foo (t text)') # keep some early lsn to test branch creation on out of date lsn - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - gced_lsn = main_cur.fetchone()[0] + gced_lsn = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') main_cur.execute(''' INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100) g ''') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_a = main_cur.fetchone()[0] + lsn_a = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') log.info(f'LSN after 100 rows: {lsn_a}') # Insert some more rows. (This generates enough WAL to fill a few segments.) @@ -55,8 +49,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): SELECT 'long string to consume some space' || g FROM generate_series(1, 200000) g ''') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_b = main_cur.fetchone()[0] + lsn_b = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') log.info(f'LSN after 200100 rows: {lsn_b}') # Branch at the point where only 100 rows were inserted @@ -70,10 +63,8 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): SELECT 'long string to consume some space' || g FROM generate_series(1, 200000) g ''') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn_c = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_c = main_cur.fetchone()[0] log.info(f'LSN after 400100 rows: {lsn_c}') # Branch at the point where only 200100 rows were inserted @@ -85,20 +76,15 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): pg_more = env.postgres.create_start('test_branch_behind_more') # On the 'hundred' branch, we should see only 100 rows - hundred_pg_conn = pg_hundred.connect() - hundred_cur = hundred_pg_conn.cursor() - hundred_cur.execute('SELECT count(*) FROM foo') - assert hundred_cur.fetchone() == (100, ) + hundred_cur = pg_hundred.connect().cursor() + assert query_scalar(hundred_cur, 'SELECT count(*) FROM foo') == 100 # On the 'more' branch, we should see 100200 rows - more_pg_conn = pg_more.connect() - more_cur = more_pg_conn.cursor() - more_cur.execute('SELECT count(*) FROM foo') - assert more_cur.fetchone() == (200100, ) + more_cur = pg_more.connect().cursor() + assert query_scalar(more_cur, 'SELECT count(*) FROM foo') == 200100 # All the rows are visible on the main branch - main_cur.execute('SELECT count(*) FROM foo') - assert main_cur.fetchone() == (400100, ) + 
assert query_scalar(main_cur, 'SELECT count(*) FROM foo') == 400100
 
     # Check bad lsn's for branching
@@ -107,9 +93,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
                                'test_branch_behind',
                                ancestor_start_lsn="0/3000000")
     pg = env.postgres.create_start('test_branch_segment_boundary')
-    cur = pg.connect().cursor()
-    cur.execute('SELECT 1')
-    assert cur.fetchone() == (1, )
+    assert pg.safe_psql('SELECT 1')[0][0] == 1
 
     # branch at pre-initdb lsn
     with pytest.raises(Exception, match="invalid branch start lsn"):
@@ -122,12 +106,11 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
                                    ancestor_start_lsn="0/42")
 
     # check that we cannot create branch based on garbage collected data
-    with closing(env.pageserver.connect()) as psconn:
-        with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
-            # call gc to advance latest_gc_cutoff_lsn
-            pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
-            row = pscur.fetchone()
-            print_gc_result(row)
+    with env.pageserver.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
+        # call gc to advance latest_gc_cutoff_lsn
+        pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
+        row = pscur.fetchone()
+        print_gc_result(row)
 
     with pytest.raises(Exception, match="invalid branch start lsn"):
         # this gced_lsn is pretty random, so if gc is disabled this wouldn't fail
@@ -136,11 +119,8 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
                                    ancestor_start_lsn=gced_lsn)
 
     # check that after gc everything is still there
-    hundred_cur.execute('SELECT count(*) FROM foo')
-    assert hundred_cur.fetchone() == (100, )
+    assert query_scalar(hundred_cur, 'SELECT count(*) FROM foo') == 100
 
-    more_cur.execute('SELECT count(*) FROM foo')
-    assert more_cur.fetchone() == (200100, )
+    assert query_scalar(more_cur, 'SELECT count(*) FROM foo') == 200100
 
-    main_cur.execute('SELECT count(*) FROM foo')
-    assert main_cur.fetchone() == (400100, )
+    assert query_scalar(main_cur, 'SELECT count(*) FROM foo') == 400100
diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/batch_others/test_broken_timeline.py
index 675236fbd7..b9e5f637ab 100644
--- a/test_runner/batch_others/test_broken_timeline.py
+++ b/test_runner/batch_others/test_broken_timeline.py
@@ -1,10 +1,14 @@
+from typing import List, Tuple
+from uuid import UUID
 import pytest
 import concurrent.futures
 from contextlib import closing
-from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv
+from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres
 from fixtures.log_helper import log
 import os
+from fixtures.utils import query_scalar
+
 
 
 # Test restarting page server, while safekeeper and compute node keep
 # running.
@@ -13,7 +17,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 3
     env = neon_env_builder.init_start()
 
-    tenant_timelines = []
+    tenant_timelines: List[Tuple[str, str, Postgres]] = []
 
     for n in range(4):
         tenant_id_uuid, timeline_id_uuid = env.neon_cli.create_tenant()
@@ -21,13 +25,11 @@
         timeline_id = timeline_id_uuid.hex
 
         pg = env.postgres.create_start(f'main', tenant_id=tenant_id_uuid)
-        with closing(pg.connect()) as conn:
-            with conn.cursor() as cur:
-                cur.execute("CREATE TABLE t(key int primary key, value text)")
-                cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
+        with pg.cursor() as cur:
+            cur.execute("CREATE TABLE t(key int primary key, value text)")
+            cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
 
-                cur.execute("SHOW neon.timeline_id")
-                timeline_id = cur.fetchone()[0]
+            timeline_id = query_scalar(cur, "SHOW neon.timeline_id")
         pg.stop()
 
         tenant_timelines.append((tenant_id, timeline_id, pg))
@@ -68,10 +70,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
 
     # Tenant 0 should still work
    pg0.start()
-    with closing(pg0.connect()) as conn:
-        with conn.cursor() as cur:
-            cur.execute("SELECT COUNT(*) FROM t")
-            assert cur.fetchone()[0] == 100
+    assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100
 
     # But all others are broken
     for n in range(1, 4):
diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py
index cbf55e9fc1..cdb577f480 100644
--- a/test_runner/batch_others/test_clog_truncate.py
+++ b/test_runner/batch_others/test_clog_truncate.py
@@ -5,6 +5,7 @@ from contextlib import closing
 
 from fixtures.neon_fixtures import NeonEnv
 from fixtures.log_helper import log
+from fixtures.utils import query_scalar
 
 
 #
@@ -32,17 +33,16 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
     pg.safe_psql('CREATE EXTENSION neon_test_utils')
 
     # Consume many xids to advance clog
-    with closing(pg.connect()) as conn:
-        with conn.cursor() as cur:
-            cur.execute('select test_consume_xids(1000*1000*10);')
-            log.info('xids consumed')
-
-            # call a checkpoint to trigger TruncateSubtrans
-            cur.execute('CHECKPOINT;')
-
-            # ensure WAL flush
-            cur.execute('select txid_current()')
-            log.info(cur.fetchone())
+    with pg.cursor() as cur:
+        cur.execute('select test_consume_xids(1000*1000*10);')
+        log.info('xids consumed')
+
+        # call a checkpoint to trigger TruncateSubtrans
+        cur.execute('CHECKPOINT;')
+
+        # ensure WAL flush
+        cur.execute('select txid_current()')
+        log.info(cur.fetchone())
 
     # wait for autovacuum to truncate the pg_xact
     # XXX Is it worth adding a timeout here?
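These hunks also replace the nested closing(pg.connect()) / conn.cursor() blocks with a single with pg.cursor() as cur:, which presumes a context-manager shortcut on the Postgres fixture that the patch itself does not show. A rough sketch under that assumption (the class name and the autocommit default here are illustrative, not taken from the fixture):

    # Hypothetical sketch of a cursor() convenience on a Postgres-like fixture
    import contextlib
    import psycopg2

    class PostgresFixtureSketch:
        def __init__(self, connstr: str):
            self.connstr = connstr

        def connect(self, **kwargs):
            # Keyword arguments (e.g. dbname=...) override fields of the base connstr.
            conn = psycopg2.connect(self.connstr, **kwargs)
            conn.autocommit = True  # assumed default, so tests don't leave transactions open
            return conn

        @contextlib.contextmanager
        def cursor(self, **kwargs):
            # Open a connection, hand out a cursor, close the connection on exit.
            with contextlib.closing(self.connect(**kwargs)) as conn:
                yield conn.cursor()

Used as with pg.cursor() as cur: ..., which is the shape the refactored tests rely on.
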
@@ -54,11 +54,9 @@ def test_clog_truncate(neon_simple_env: NeonEnv): time.sleep(5) # checkpoint to advance latest lsn - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute('CHECKPOINT;') - cur.execute('select pg_current_wal_insert_lsn()') - lsn_after_truncation = cur.fetchone()[0] + with pg.cursor() as cur: + cur.execute('CHECKPOINT;') + lsn_after_truncation = query_scalar(cur, 'select pg_current_wal_insert_lsn()') # create new branch after clog truncation and start a compute node on it log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}') diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/batch_others/test_createdropdb.py index 151ce997ee..0fbf6e2a47 100644 --- a/test_runner/batch_others/test_createdropdb.py +++ b/test_runner/batch_others/test_createdropdb.py @@ -4,6 +4,7 @@ import pathlib from contextlib import closing from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.log_helper import log +from fixtures.utils import query_scalar # @@ -16,15 +17,13 @@ def test_createdb(neon_simple_env: NeonEnv): pg = env.postgres.create_start('test_createdb') log.info("postgres is running on 'test_createdb' branch") - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # Cause a 'relmapper' change in the original branch - cur.execute('VACUUM FULL pg_class') + with pg.cursor() as cur: + # Cause a 'relmapper' change in the original branch + cur.execute('VACUUM FULL pg_class') - cur.execute('CREATE DATABASE foodb') + cur.execute('CREATE DATABASE foodb') - cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn = cur.fetchone()[0] + lsn = query_scalar(cur, 'SELECT pg_current_wal_insert_lsn()') # Create a branch env.neon_cli.create_branch('test_createdb2', 'test_createdb', ancestor_start_lsn=lsn) @@ -32,21 +31,21 @@ def test_createdb(neon_simple_env: NeonEnv): # Test that you can connect to the new database on both branches for db in (pg, pg2): - with closing(db.connect(dbname='foodb')) as conn: - with conn.cursor() as cur: - # Check database size in both branches - cur.execute(""" - select pg_size_pretty(pg_database_size('foodb')), - pg_size_pretty( - sum(pg_relation_size(oid, 'main')) - +sum(pg_relation_size(oid, 'vm')) - +sum(pg_relation_size(oid, 'fsm')) - ) FROM pg_class where relisshared is false - """) - res = cur.fetchone() - # check that dbsize equals sum of all relation sizes, excluding shared ones - # This is how we define dbsize in neon for now - assert res[0] == res[1] + with db.cursor(dbname='foodb') as cur: + # Check database size in both branches + cur.execute(""" + select pg_size_pretty(pg_database_size('foodb')), + pg_size_pretty( + sum(pg_relation_size(oid, 'main')) + +sum(pg_relation_size(oid, 'vm')) + +sum(pg_relation_size(oid, 'fsm')) + ) FROM pg_class where relisshared is false + """) + res = cur.fetchone() + assert res is not None + # check that dbsize equals sum of all relation sizes, excluding shared ones + # This is how we define dbsize in neon for now + assert res[0] == res[1] # @@ -58,24 +57,19 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): pg = env.postgres.create_start('test_dropdb') log.info("postgres is running on 'test_dropdb' branch") - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute('CREATE DATABASE foodb') + with pg.cursor() as cur: + cur.execute('CREATE DATABASE foodb') - cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_before_drop = cur.fetchone()[0] + lsn_before_drop = query_scalar(cur, 'SELECT 
pg_current_wal_insert_lsn()') - cur.execute("SELECT oid FROM pg_database WHERE datname='foodb';") - dboid = cur.fetchone()[0] + dboid = query_scalar(cur, "SELECT oid FROM pg_database WHERE datname='foodb';") - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute('DROP DATABASE foodb') + with pg.cursor() as cur: + cur.execute('DROP DATABASE foodb') - cur.execute('CHECKPOINT') + cur.execute('CHECKPOINT') - cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_after_drop = cur.fetchone()[0] + lsn_after_drop = query_scalar(cur, 'SELECT pg_current_wal_insert_lsn()') # Create two branches before and after database drop. env.neon_cli.create_branch('test_before_dropdb', diff --git a/test_runner/batch_others/test_createuser.py b/test_runner/batch_others/test_createuser.py index cbfe496e19..d48db05395 100644 --- a/test_runner/batch_others/test_createuser.py +++ b/test_runner/batch_others/test_createuser.py @@ -1,7 +1,6 @@ -from contextlib import closing - from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log +from fixtures.utils import query_scalar # @@ -13,15 +12,13 @@ def test_createuser(neon_simple_env: NeonEnv): pg = env.postgres.create_start('test_createuser') log.info("postgres is running on 'test_createuser' branch") - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # Cause a 'relmapper' change in the original branch - cur.execute('CREATE USER testuser with password %s', ('testpwd', )) + with pg.cursor() as cur: + # Cause a 'relmapper' change in the original branch + cur.execute('CREATE USER testuser with password %s', ('testpwd', )) - cur.execute('CHECKPOINT') + cur.execute('CHECKPOINT') - cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn = cur.fetchone()[0] + lsn = query_scalar(cur, 'SELECT pg_current_wal_insert_lsn()') # Create a branch env.neon_cli.create_branch('test_createuser2', 'test_createuser', ancestor_start_lsn=lsn) diff --git a/test_runner/batch_others/test_fullbackup.py b/test_runner/batch_others/test_fullbackup.py index cd6c40f56b..bce085c157 100644 --- a/test_runner/batch_others/test_fullbackup.py +++ b/test_runner/batch_others/test_fullbackup.py @@ -1,10 +1,8 @@ -from contextlib import closing - from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres from fixtures.neon_fixtures import pg_distrib_dir import os -from fixtures.utils import subprocess_capture +from fixtures.utils import query_scalar, subprocess_capture num_rows = 1000 @@ -21,19 +19,17 @@ def test_fullbackup(neon_env_builder: NeonEnvBuilder, pgmain = env.postgres.create_start('test_fullbackup') log.info("postgres is running on 'test_fullbackup' branch") - timeline = pgmain.safe_psql("SHOW neon.timeline_id")[0][0] + with pgmain.cursor() as cur: + timeline = query_scalar(cur, "SHOW neon.timeline_id") - with closing(pgmain.connect()) as conn: - with conn.cursor() as cur: - # data loading may take a while, so increase statement timeout - cur.execute("SET statement_timeout='300s'") - cur.execute(f'''CREATE TABLE tbl AS SELECT 'long string to consume some space' || g - from generate_series(1,{num_rows}) g''') - cur.execute("CHECKPOINT") + # data loading may take a while, so increase statement timeout + cur.execute("SET statement_timeout='300s'") + cur.execute(f'''CREATE TABLE tbl AS SELECT 'long string to consume some space' || g + from generate_series(1,{num_rows}) g''') + cur.execute("CHECKPOINT") - cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn = 
cur.fetchone()[0] - log.info(f"start_backup_lsn = {lsn}") + lsn = query_scalar(cur, 'SELECT pg_current_wal_insert_lsn()') + log.info(f"start_backup_lsn = {lsn}") # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. # PgBin sets it automatically, but here we need to pipe psql output to the tar command. diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py index bffeedfdc0..d7f6308182 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/batch_others/test_gc_aggressive.py @@ -3,6 +3,7 @@ import random from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres from fixtures.log_helper import log +from fixtures.utils import query_scalar # Test configuration # @@ -59,22 +60,21 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): pg = env.postgres.create_start('test_gc_aggressive') log.info('postgres is running on test_gc_aggressive branch') - conn = pg.connect() - cur = conn.cursor() + with pg.cursor() as cur: + timeline = query_scalar(cur, "SHOW neon.timeline_id") - cur.execute("SHOW neon.timeline_id") - timeline = cur.fetchone()[0] + # Create table, and insert the first 100 rows + cur.execute('CREATE TABLE foo (id int, counter int, t text)') + cur.execute(f''' + INSERT INTO foo + SELECT g, 0, 'long string to consume some space' || g + FROM generate_series(1, {num_rows}) g + ''') + cur.execute('CREATE INDEX ON foo(id)') - # Create table, and insert the first 100 rows - cur.execute('CREATE TABLE foo (id int, counter int, t text)') - cur.execute(f''' - INSERT INTO foo - SELECT g, 0, 'long string to consume some space' || g - FROM generate_series(1, {num_rows}) g - ''') - cur.execute('CREATE INDEX ON foo(id)') + asyncio.run(update_and_gc(env, pg, timeline)) - asyncio.run(update_and_gc(env, pg, timeline)) - - cur.execute('SELECT COUNT(*), SUM(counter) FROM foo') - assert cur.fetchone() == (num_rows, updates_to_perform) + cur.execute('SELECT COUNT(*), SUM(counter) FROM foo') + r = cur.fetchone() + assert r is not None + assert r == (num_rows, updates_to_perform) diff --git a/test_runner/batch_others/test_lsn_mapping.py b/test_runner/batch_others/test_lsn_mapping.py index 1eca92ed58..d8b207135e 100644 --- a/test_runner/batch_others/test_lsn_mapping.py +++ b/test_runner/batch_others/test_lsn_mapping.py @@ -8,6 +8,8 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres from fixtures.log_helper import log import time +from fixtures.utils import query_scalar + # # Test pageserver get_lsn_by_timestamp API @@ -20,11 +22,8 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): pgmain = env.postgres.create_start("test_lsn_mapping") log.info("postgres is running on 'test_lsn_mapping' branch") - ps_conn = env.pageserver.connect() - ps_cur = ps_conn.cursor() - conn = pgmain.connect() - cur = conn.cursor() - + ps_cur = env.pageserver.connect().cursor() + cur = pgmain.connect().cursor() # Create table, and insert rows, each in a separate transaction # Disable synchronous_commit to make this initialization go faster. 
# @@ -35,9 +34,8 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): tbl = [] for i in range(1000): cur.execute(f"INSERT INTO foo VALUES({i})") - cur.execute(f'SELECT clock_timestamp()') # Get the timestamp at UTC - after_timestamp = cur.fetchone()[0].replace(tzinfo=None) + after_timestamp = query_scalar(cur, 'SELECT clock_timestamp()').replace(tzinfo=None) tbl.append([i, after_timestamp]) # Execute one more transaction with synchronous_commit enabled, to flush @@ -47,18 +45,18 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Check edge cases: timestamp in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) - ps_cur.execute( + result = query_scalar( + ps_cur, f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" ) - result = ps_cur.fetchone()[0] assert result == 'future' # timestamp too the far history probe_timestamp = tbl[0][1] - timedelta(hours=10) - ps_cur.execute( + result = query_scalar( + ps_cur, f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" ) - result = ps_cur.fetchone()[0] assert result == 'past' # Probe a bunch of timestamps in the valid range @@ -66,19 +64,16 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): probe_timestamp = tbl[i][1] # Call get_lsn_by_timestamp to get the LSN - ps_cur.execute( + lsn = query_scalar( + ps_cur, f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" ) - lsn = ps_cur.fetchone()[0] # Launch a new read-only node at that LSN, and check that only the rows # that were supposed to be committed at that point in time are visible. pg_here = env.postgres.create_start(branch_name='test_lsn_mapping', node_name='test_lsn_mapping_read', lsn=lsn) - with closing(pg_here.connect()) as conn_here: - with conn_here.cursor() as cur_here: - cur_here.execute("SELECT max(x) FROM foo") - assert cur_here.fetchone()[0] == i + assert pg_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i pg_here.stop_and_destroy() diff --git a/test_runner/batch_others/test_multixact.py b/test_runner/batch_others/test_multixact.py index b17676658b..dd00066092 100644 --- a/test_runner/batch_others/test_multixact.py +++ b/test_runner/batch_others/test_multixact.py @@ -1,5 +1,6 @@ from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.log_helper import log +from fixtures.utils import query_scalar # @@ -14,16 +15,14 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): pg = env.postgres.create_start('test_multixact') log.info("postgres is running on 'test_multixact' branch") - pg_conn = pg.connect() - cur = pg_conn.cursor() - + cur = pg.connect().cursor() cur.execute(''' CREATE TABLE t1(i int primary key); INSERT INTO t1 select * from generate_series(1, 100); ''') - cur.execute('SELECT next_multixact_id FROM pg_control_checkpoint()') - next_multixact_id_old = cur.fetchone()[0] + next_multixact_id_old = query_scalar(cur, + 'SELECT next_multixact_id FROM pg_control_checkpoint()') # Lock entries using parallel connections in a round-robin fashion. 
nclients = 20 @@ -53,6 +52,7 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): cur.execute( 'SELECT next_multixact_id, pg_current_wal_insert_lsn() FROM pg_control_checkpoint()') res = cur.fetchone() + assert res is not None next_multixact_id = res[0] lsn = res[1] @@ -64,11 +64,8 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): pg_new = env.postgres.create_start('test_multixact_new') log.info("postgres is running on 'test_multixact_new' branch") - pg_new_conn = pg_new.connect() - cur_new = pg_new_conn.cursor() - - cur_new.execute('SELECT next_multixact_id FROM pg_control_checkpoint()') - next_multixact_id_new = cur_new.fetchone()[0] + next_multixact_id_new = pg_new.safe_psql( + 'SELECT next_multixact_id FROM pg_control_checkpoint()')[0][0] # Check that we restored pg_controlfile correctly assert next_multixact_id_new == next_multixact_id diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py index 1e96c0a1fa..78a936af19 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -1,6 +1,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log -from fixtures.utils import print_gc_result +from fixtures.utils import print_gc_result, query_scalar import psycopg2.extras @@ -26,8 +26,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): cur = pg_conn.cursor() # Get the timeline ID of our branch. We need it for the 'do_gc' command - cur.execute("SHOW neon.timeline_id") - timeline = cur.fetchone()[0] + timeline = query_scalar(cur, "SHOW neon.timeline_id") psconn = env.pageserver.connect() pscur = psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) @@ -48,6 +47,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): from pg_settings where name = 'shared_buffers' ''') row = cur.fetchone() + assert row is not None log.info(f'shared_buffers is {row[0]}, table size {row[1]}') assert int(row[0]) < int(row[1]) diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/batch_others/test_pageserver_restart.py index 403ff7b305..c656469cb7 100644 --- a/test_runner/batch_others/test_pageserver_restart.py +++ b/test_runner/batch_others/test_pageserver_restart.py @@ -30,6 +30,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): from pg_settings where name = 'shared_buffers' ''') row = cur.fetchone() + assert row is not None log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/batch_others/test_pitr_gc.py index 161f628429..d63fc4b584 100644 --- a/test_runner/batch_others/test_pitr_gc.py +++ b/test_runner/batch_others/test_pitr_gc.py @@ -1,10 +1,8 @@ -import subprocess from contextlib import closing import psycopg2.extras -import pytest from fixtures.log_helper import log -from fixtures.utils import print_gc_result +from fixtures.utils import print_gc_result, query_scalar from fixtures.neon_fixtures import NeonEnvBuilder @@ -24,9 +22,7 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - - main_cur.execute("SHOW neon.timeline_id") - timeline = main_cur.fetchone()[0] + timeline = query_scalar(main_cur, "SHOW neon.timeline_id") # Create table main_cur.execute('CREATE TABLE foo (t text)') @@ -41,12 +37,15 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # keep some early lsn to test branch 
creation after GC main_cur.execute('SELECT pg_current_wal_insert_lsn(), txid_current()') res = main_cur.fetchone() + assert res is not None lsn_a = res[0] xid_a = res[1] log.info(f'LSN after 100 rows: {lsn_a} xid {xid_a}') main_cur.execute('SELECT pg_current_wal_insert_lsn(), txid_current()') res = main_cur.fetchone() + assert res is not None + debug_lsn = res[0] debug_xid = res[1] log.info(f'LSN after 10000 rows: {debug_lsn} xid {debug_xid}') diff --git a/test_runner/batch_others/test_read_validation.py b/test_runner/batch_others/test_read_validation.py index 6b8a154865..4be7af4c10 100644 --- a/test_runner/batch_others/test_read_validation.py +++ b/test_runner/batch_others/test_read_validation.py @@ -6,6 +6,8 @@ from fixtures.log_helper import log from psycopg2.errors import UndefinedTable from psycopg2.errors import IoError +from fixtures.utils import query_scalar + pytest_plugins = ("fixtures.neon_fixtures") extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"] @@ -32,9 +34,9 @@ def test_read_validation(neon_simple_env: NeonEnv): c.execute("select lsn, lower, upper from page_header(get_raw_page('foo', 'main', 0));") first = c.fetchone() + assert first is not None - c.execute("select relfilenode from pg_class where relname = 'foo'") - relfilenode = c.fetchone()[0] + relfilenode = query_scalar(c, "select relfilenode from pg_class where relname = 'foo'") c.execute("insert into foo values (2);") c.execute("select lsn, lower, upper from page_header(get_raw_page('foo', 'main', 0));") @@ -44,22 +46,25 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Test table is populated, validating buffer cache") - c.execute( + cache_entries = query_scalar( + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) - assert c.fetchone()[0] > 0, "No buffers cached for the test relation" + assert cache_entries > 0, "No buffers cached for the test relation" c.execute( "select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}" .format(relfilenode)) reln = c.fetchone() + assert reln is not None log.info("Clear buffer cache to ensure no stale pages are brought into the cache") c.execute("select clear_buffer_cache()") - c.execute( + cache_entries = query_scalar( + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) - assert c.fetchone()[0] == 0, "Failed to clear buffer cache" + assert cache_entries == 0, "Failed to clear buffer cache" log.info("Cache is clear, reading stale page version") @@ -69,9 +74,10 @@ def test_read_validation(neon_simple_env: NeonEnv): direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn" - c.execute( + cache_entries = query_scalar( + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) - assert c.fetchone()[0] == 0, "relation buffers detected after invalidation" + assert cache_entries == 0, "relation buffers detected after invalidation" log.info("Cache is clear, reading latest page version without cache") @@ -81,9 +87,10 @@ def test_read_validation(neon_simple_env: NeonEnv): direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" - c.execute( + cache_entries = query_scalar( + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) - assert c.fetchone()[0] == 0, "relation buffers detected after invalidation" + assert cache_entries == 0, "relation buffers detected after invalidation" log.info( "Cache is clear, reading stale page 
version without cache using relation identifiers" diff --git a/test_runner/batch_others/test_readonly_node.py b/test_runner/batch_others/test_readonly_node.py index 286c756a5e..82fc6329cf 100644 --- a/test_runner/batch_others/test_readonly_node.py +++ b/test_runner/batch_others/test_readonly_node.py @@ -1,6 +1,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import query_scalar # @@ -27,7 +28,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): FROM generate_series(1, 100) g ''') main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_a = main_cur.fetchone()[0] + lsn_a = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') log.info('LSN after 100 rows: ' + lsn_a) # Insert some more rows. (This generates enough WAL to fill a few segments.) @@ -36,8 +37,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): SELECT 'long string to consume some space' || g FROM generate_series(1, 200000) g ''') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_b = main_cur.fetchone()[0] + lsn_b = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') log.info('LSN after 200100 rows: ' + lsn_b) # Insert many more rows. This generates enough WAL to fill a few segments. @@ -47,8 +47,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): FROM generate_series(1, 200000) g ''') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_c = main_cur.fetchone()[0] + lsn_c = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') log.info('LSN after 400100 rows: ' + lsn_c) # Create first read-only node at the point where only 100 rows were inserted diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 163912690c..6a8497a559 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -8,7 +8,7 @@ import time from uuid import UUID from fixtures.neon_fixtures import NeonEnvBuilder, assert_timeline_local, wait_until, wait_for_last_record_lsn, wait_for_upload from fixtures.log_helper import log -from fixtures.utils import lsn_from_hex, lsn_to_hex +from fixtures.utils import lsn_from_hex, query_scalar import pytest @@ -57,14 +57,12 @@ def test_remote_storage_backup_and_restore(neon_env_builder: NeonEnvBuilder, sto checkpoint_numbers = range(1, 3) for checkpoint_number in checkpoint_numbers: - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute(f''' - CREATE TABLE t{checkpoint_number}(id int primary key, secret text); - INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); - ''') - cur.execute("SELECT pg_current_wal_flush_lsn()") - current_lsn = lsn_from_hex(cur.fetchone()[0]) + with pg.cursor() as cur: + cur.execute(f''' + CREATE TABLE t{checkpoint_number}(id int primary key, secret text); + INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); + ''') + current_lsn = lsn_from_hex(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) # wait until pageserver receives that data wait_for_last_record_lsn(client, UUID(tenant_id), UUID(timeline_id), current_lsn) @@ -123,8 +121,8 @@ def test_remote_storage_backup_and_restore(neon_env_builder: NeonEnvBuilder, sto assert not detail['remote']['awaits_download'] pg = env.postgres.create_start('main') - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - for checkpoint_number in checkpoint_numbers: - cur.execute(f'SELECT secret FROM 
t{checkpoint_number} WHERE id = {data_id};') - assert cur.fetchone() == (f'{data_secret}|{checkpoint_number}', ) + with pg.cursor() as cur: + for checkpoint_number in checkpoint_numbers: + assert query_scalar(cur, + f'SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};' + ) == f'{data_secret}|{checkpoint_number}' diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/batch_others/test_tenants_with_remote_storage.py index 41506ad920..8ddb4d1b92 100644 --- a/test_runner/batch_others/test_tenants_with_remote_storage.py +++ b/test_runner/batch_others/test_tenants_with_remote_storage.py @@ -8,6 +8,7 @@ import asyncio from contextlib import closing +from typing import List, Tuple from uuid import UUID import pytest @@ -59,7 +60,7 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, storage_type: str): env = neon_env_builder.init_start() - tenants_pgs = [] + tenants_pgs: List[Tuple[UUID, Postgres]] = [] for i in range(1, 5): # Use a tiny checkpoint distance, to create a lot of layers quickly @@ -80,14 +81,11 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, storage_type: str): # Wait for the remote storage uploads to finish pageserver_http = env.pageserver.http_client() for tenant, pg in tenants_pgs: - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute("show neon.tenant_id") - tenant_id = cur.fetchone()[0] - cur.execute("show neon.timeline_id") - timeline_id = cur.fetchone()[0] - cur.execute("SELECT pg_current_wal_flush_lsn()") - current_lsn = lsn_from_hex(cur.fetchone()[0]) + res = pg.safe_psql_many( + ["SHOW neon.tenant_id", "SHOW neon.timeline_id", "SELECT pg_current_wal_flush_lsn()"]) + tenant_id = res[0][0][0] + timeline_id = res[1][0][0] + current_lsn = lsn_from_hex(res[2][0][0]) # wait until pageserver receives all the data wait_for_last_record_lsn(pageserver_http, UUID(tenant_id), UUID(timeline_id), current_lsn) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index f76529f1f7..c736893f99 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -102,17 +102,14 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 raise RuntimeError( f"timed out waiting for pageserver to reach pg_current_wal_flush_lsn()") - with closing(pgmain.connect()) as conn: - with conn.cursor() as cur: - - cur.execute(''' - select pg_size_pretty(pg_cluster_size()), - pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn) as received_lsn_lag - FROM backpressure_lsns(); - ''') - res = cur.fetchone() - log.info(f"pg_cluster_size = {res[0]}, received_lsn_lag = {res[1]}") - received_lsn_lag = res[1] + res = pgmain.safe_psql(''' + SELECT + pg_size_pretty(pg_cluster_size()), + pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag + FROM backpressure_lsns(); + ''')[0] + log.info(f"pg_cluster_size = {res[0]}, received_lsn_lag = {res[1]}") + received_lsn_lag = res[1] time.sleep(polling_interval) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 844ef3ebe1..da861bb9f3 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -15,7 +15,7 @@ from dataclasses import dataclass, field from multiprocessing import Process, Value from pathlib import Path from fixtures.neon_fixtures import NeonPageserver, PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, 
NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, neon_binpath, PgProtocol, wait_for_last_record_lsn, wait_for_upload -from fixtures.utils import get_dir_size, lsn_to_hex, lsn_from_hex +from fixtures.utils import get_dir_size, lsn_to_hex, lsn_from_hex, query_scalar from fixtures.log_helper import log from typing import List, Optional, Any from uuid import uuid4 @@ -229,8 +229,7 @@ def test_restarts(neon_env_builder: NeonEnvBuilder): else: failed_node.start() failed_node = None - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (500500, ) + assert query_scalar(cur, 'SELECT sum(key) FROM t') == 500500 # Test that safekeepers push their info to the broker and learn peer status from it @@ -286,12 +285,10 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): env.neon_cli.create_branch('test_safekeepers_wal_removal') pg = env.postgres.create_start('test_safekeepers_wal_removal') - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - cur.execute('CREATE TABLE t(key int primary key, value text)') - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + pg.safe_psql_many([ + 'CREATE TABLE t(key int primary key, value text)', + "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + ]) tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] @@ -469,8 +466,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, storage_type: str): cur.execute("insert into t select generate_series(1,500000), 'payload'") expected_sum += 500000 * 500001 // 2 - cur.execute("select sum(key) from t") - assert cur.fetchone()[0] == expected_sum + assert query_scalar(cur, "select sum(key) from t") == expected_sum for sk in env.safekeepers: wait_segment_offload(tenant_id, timeline_id, sk, seg_end) @@ -484,8 +480,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, storage_type: str): # require WAL to be trimmed, so no more than one segment is left on disk wait_wal_trim(tenant_id, timeline_id, sk, 16 * 1.5) - cur.execute('SELECT pg_current_wal_flush_lsn()') - last_lsn = cur.fetchone()[0] + last_lsn = query_scalar(cur, 'SELECT pg_current_wal_flush_lsn()') pageserver_lsn = env.pageserver.http_client().timeline_detail( uuid.UUID(tenant_id), uuid.UUID((timeline_id)))["local"]["last_record_lsn"] @@ -532,10 +527,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, storage_type: str): # verify data pg.create_start('test_s3_wal_replay') - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute("select sum(key) from t") - assert cur.fetchone()[0] == expected_sum + assert pg.safe_psql("select sum(key) from t")[0][0] == expected_sum class ProposerPostgres(PgProtocol): @@ -860,12 +852,10 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): # as waiting for acceptors happens there cur.execute('CREATE TABLE IF NOT EXISTS t(key int, value text)') cur.execute("INSERT INTO t VALUES (0, 'something')") - cur.execute('SELECT SUM(key) FROM t') - sum_before = cur.fetchone()[0] + sum_before = query_scalar(cur, 'SELECT SUM(key) FROM t') cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - cur.execute('SELECT SUM(key) FROM t') - sum_after = cur.fetchone()[0] + sum_after = query_scalar(cur, 'SELECT SUM(key) FROM t') assert sum_after == sum_before + 5000050000 def show_statuses(safekeepers: List[Safekeeper], tenant_id: str, timeline_id: 
str): @@ -950,8 +940,7 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): assert pg.pgdata_dir is not None log.info('executing INSERT to generate WAL') - cur.execute("select pg_current_wal_lsn()") - current_lsn = lsn_from_hex(cur.fetchone()[0]) / 1024 / 1024 + current_lsn = lsn_from_hex(query_scalar(cur, "select pg_current_wal_lsn()")) / 1024 / 1024 pg_wal_size = get_dir_size(os.path.join(pg.pgdata_dir, 'pg_wal')) / 1024 / 1024 if enable_logs: log.info(f"LSN delta: {current_lsn - last_lsn} MB, current WAL size: {pg_wal_size} MB") diff --git a/test_runner/batch_pg_regress/test_pg_regress.py b/test_runner/batch_pg_regress/test_pg_regress.py index b53bc21ca2..28066d7a32 100644 --- a/test_runner/batch_pg_regress/test_pg_regress.py +++ b/test_runner/batch_pg_regress/test_pg_regress.py @@ -50,7 +50,6 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: pathlib.Path, pg_ # checkpoint one more time to ensure that the lsn we get is the latest one pg.safe_psql('CHECKPOINT') - pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0] # Check that we restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 9808d83492..e6c3a79697 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -70,6 +70,7 @@ class PgCompare(ABC): for pg_stat in pg_stats: cur.execute(pg_stat.query) row = cur.fetchone() + assert row is not None assert len(row) == len(pg_stat.columns) for col, val in zip(pg_stat.columns, row): diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8fa9e4a2ea..6783ab710b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1,6 +1,7 @@ from __future__ import annotations from dataclasses import field +from contextlib import contextmanager from enum import Flag, auto import textwrap from cached_property import cached_property @@ -306,6 +307,15 @@ class PgProtocol: conn.autocommit = autocommit return conn + @contextmanager + def cursor(self, autocommit=True, **kwargs): + """ + Shorthand for pg.connect().cursor(). + The cursor and connection are closed when the context is exited. + """ + with closing(self.connect(autocommit=autocommit, **kwargs)) as conn: + yield conn.cursor() + async def connect_async(self, **kwargs) -> asyncpg.Connection: """ Connect to the node from async python. @@ -354,7 +364,7 @@ class PgProtocol: if cur.description is None: result.append([]) # query didn't return data else: - result.append(cast(List[Any], cur.fetchall())) + result.append(cur.fetchall()) return result @@ -2142,12 +2152,8 @@ def list_files_to_compare(pgdata_dir: pathlib.Path): # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Postgres): - # Get the timeline ID. 
We need it for the 'basebackup' command
-    with closing(pg.connect()) as conn:
-        with conn.cursor() as cur:
-            cur.execute("SHOW neon.timeline_id")
-            timeline = cur.fetchone()[0]
+    timeline = pg.safe_psql("SHOW neon.timeline_id")[0][0]
 
     # stop postgres to ensure that files won't change
     pg.stop()
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index bc50a43ada..a86c5ad923 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -6,6 +6,8 @@ import subprocess
 
 from pathlib import Path
 from typing import Any, List, Tuple
+
+from psycopg2.extensions import cursor
 from fixtures.log_helper import log
 
 
@@ -79,6 +81,20 @@ def etcd_path() -> Path:
     return Path(path_output)
 
 
+def query_scalar(cur: cursor, query: str) -> Any:
+    """
+    It is a convenience wrapper to avoid repetitions
+    of cur.execute(); cur.fetchone()[0]
+
+    And this is mypy friendly, because without None
+    check mypy says that Optional is not indexable.
+    """
+    cur.execute(query)
+    var = cur.fetchone()
+    assert var is not None
+    return var[0]
+
+
 # Traverse directory to get total size.
 def get_dir_size(path: str) -> int:
     """Return size in bytes."""
diff --git a/test_runner/performance/test_random_writes.py b/test_runner/performance/test_random_writes.py
index 4350386dd0..8931234c51 100644
--- a/test_runner/performance/test_random_writes.py
+++ b/test_runner/performance/test_random_writes.py
@@ -9,6 +9,8 @@ import psycopg2.extras
 
 import random
 import time
+from fixtures.utils import query_scalar
+
 
 # This is a clear-box test that demonstrates the worst case scenario for the
 # "1 segment per layer" implementation of the pageserver. It writes to random
@@ -59,9 +61,7 @@ def test_random_writes(neon_with_baseline: PgCompare):
            rows_inserted += rows_to_insert
 
        # Get table size (can't be predicted because padding and alignment)
-       cur.execute("SELECT pg_relation_size('Big');")
-       row = cur.fetchone()
-       table_size = row[0]
+       table_size = query_scalar(cur, "SELECT pg_relation_size('Big')")
        env.zenbenchmark.record("table_size", table_size, 'bytes', MetricReport.TEST_PARAM)
 
        # Decide how much to write, based on knowledge of pageserver implementation.
diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py
index 8ed31cb480..8d7ad46c1a 100644
--- a/test_runner/performance/test_seqscans.py
+++ b/test_runner/performance/test_seqscans.py
@@ -34,6 +34,7 @@ def test_seqscans(neon_with_baseline: PgCompare, rows: int, iters: int, workers:
     from pg_settings where name = 'shared_buffers'
     ''')
     row = cur.fetchone()
+    assert row is not None
     shared_buffers = row[0]
     table_size = row[1]
     log.info(f"shared_buffers is {shared_buffers}, table size {table_size}")

From 0ebb6bc4b0fd920a7a597367e558254c12f9a882 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov
Date: Fri, 29 Jul 2022 22:48:06 +0300
Subject: [PATCH 0578/1022] Temporarily pin Werkzeug version because moto hangs with newer one.

See https://github.com/spulec/moto/issues/5341
---
 poetry.lock    | 11 ++++-------
 pyproject.toml |  1 +
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index f55cfda000..d679b29cba 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -109,8 +109,8 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
 
 [[package]]
 name = "boto3-stubs"
-version = "1.24.38"
-description = "Type annotations for boto3 1.24.38 generated with mypy-boto3-builder 7.10.1"
+version = "1.24.41"
+description = "Type annotations for boto3 1.24.41 generated with mypy-boto3-builder 7.10.2"
 category = "main"
 optional = false
 python-versions = ">=3.7"
@@ -1386,15 +1386,12 @@ test = ["websockets"]
 
 [[package]]
 name = "werkzeug"
-version = "2.2.0"
+version = "2.1.2"
 description = "The comprehensive WSGI web application library."
 category = "main"
 optional = false
 python-versions = ">=3.7"
 
-[package.dependencies]
-MarkupSafe = ">=2.1.1"
-
 [package.extras]
 watchdog = ["watchdog"]
 
@@ -1437,7 +1434,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
-content-hash = "17e901dca9680c6ead56661492431cfce65cb20508be419599f0862ff2d1d827"
+content-hash = "5f7be77c7757a27bae28d39f31cd6f3a7a04e9dab53a200a6021a5af8ad02f37"
 
 [metadata.files]
 aiopg = []
diff --git a/pyproject.toml b/pyproject.toml
index 2807881d71..da47ecefaf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ backoff = "^1.11.1"
 pytest-lazy-fixture = "^0.6.3"
 prometheus-client = "^0.14.1"
 pytest-timeout = "^2.1.0"
+Werkzeug = "2.1.2"
 
 [tool.poetry.dev-dependencies]
 yapf = "==0.31.0"

From 1edf3eb2c8af6054f79e79f1f202be24b489c559 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov
Date: Mon, 1 Aug 2022 15:53:03 +0300
Subject: [PATCH 0579/1022] Increase timeout so the macOS job can finish the build with all cache misses

---
 .github/workflows/codestyle.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml
index 3acbeae9c2..aa37167a19 100644
--- a/.github/workflows/codestyle.yml
+++ b/.github/workflows/codestyle.yml
@@ -27,7 +27,7 @@ jobs:
         # Rust toolchains (e.g. nightly or 1.37.0), add them here.
         rust_toolchain: [1.58]
         os: [ubuntu-latest, macos-latest]
-    timeout-minutes: 50
+    timeout-minutes: 60
     name: run regression test suite
     runs-on: ${{ matrix.os }}

From 8ba41b8c18aecd81db206c2c38055d9c9761656b Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 1 Aug 2022 19:08:09 +0100
Subject: [PATCH 0580/1022] Bump pywin32 from 227 to 301 (#2202)

---
 poetry.lock | 817 +++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 715 insertions(+), 102 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index d679b29cba..2563054b0b 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -558,20 +558,21 @@ test = ["pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests",
 
 [[package]]
 name = "docker"
-version = "5.0.3"
+version = "4.2.2"
 description = "A Python library for the Docker Engine API."
category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [package.dependencies] -pywin32 = {version = "227", markers = "sys_platform == \"win32\""} +pypiwin32 = {version = "223", markers = "sys_platform == \"win32\" and python_version >= \"3.6\""} requests = ">=2.14.2,<2.18.0 || >2.18.0" +six = ">=1.4.0" websocket-client = ">=0.32.0" [package.extras] ssh = ["paramiko (>=2.4.2)"] -tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=3.4.7)", "idna (>=2.0.0)"] +tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=1.3.4)", "idna (>=2.0.0)"] [[package]] name = "ecdsa" @@ -754,9 +755,9 @@ optional = false python-versions = ">=2.7" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] -testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-black-multipy", "pytest-cov", "ecdsa", "feedparser", "numpy", "pandas", "pymongo", "scikit-learn", "sqlalchemy", "pytest-flake8 (<1.1.0)", "enum34", "jsonlib", "pytest-flake8 (>=1.1.1)"] -"testing.libs" = ["simplejson", "ujson", "yajl"] +testing = ["pytest-flake8 (>=1.1.1)", "jsonlib", "enum34", "pytest-flake8 (<1.1.0)", "sqlalchemy", "scikit-learn", "pymongo", "pandas", "numpy", "feedparser", "ecdsa", "pytest-cov", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest (>=3.5,!=3.7.3)"] +"testing.libs" = ["yajl", "ujson", "simplejson"] +docs = ["rst.linker (>=1.9)", "jaraco.packaging (>=3.2)", "sphinx"] [[package]] name = "jsonpointer" @@ -933,9 +934,9 @@ python-versions = ">=3.7.0,<4.0.0" jsonschema = ">=3.0.0,<5.0.0" [package.extras] -rfc3339-validator = ["rfc3339-validator"] -strict-rfc3339 = ["strict-rfc3339"] isodate = ["isodate"] +strict-rfc3339 = ["strict-rfc3339"] +rfc3339-validator = ["rfc3339-validator"] [[package]] name = "openapi-spec-validator" @@ -981,8 +982,8 @@ optional = false python-versions = ">=3.6" [package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] +testing = ["pytest-benchmark", "pytest"] +dev = ["tox", "pre-commit"] [[package]] name = "prometheus-client" @@ -1055,10 +1056,10 @@ python-versions = ">=3.6" cryptography = {version = ">=3.3.1", optional = true, markers = "extra == \"crypto\""} [package.extras] +tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] +docs = ["zope.interface", "sphinx-rtd-theme", "sphinx"] +dev = ["pre-commit", "mypy", "coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)", "cryptography (>=3.3.1)", "zope.interface", "sphinx-rtd-theme", "sphinx"] crypto = ["cryptography (>=3.3.1)"] -dev = ["sphinx", "sphinx-rtd-theme", "zope.interface", "cryptography (>=3.3.1)", "pytest (>=6.0.0,<7.0.0)", "coverage[toml] (==5.0.4)", "mypy", "pre-commit"] -docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] -tests = ["pytest (>=6.0.0,<7.0.0)", "coverage[toml] (==5.0.4)"] [[package]] name = "pyparsing" @@ -1071,6 +1072,17 @@ python-versions = ">=3.6.8" [package.extras] diagrams = ["railroad-diagrams", "jinja2"] +[[package]] +name = "pypiwin32" +version = "223" +description = "" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +pywin32 = ">=223" + [[package]] name = "pyrsistent" version = "0.18.1" @@ -1192,7 +1204,7 @@ python-versions = "*" [[package]] name = "pywin32" -version = "227" +version = "301" description = "Python for Window Extensions" category = "main" optional = false @@ -1437,114 +1449,715 @@ python-versions = "^3.9" content-hash = "5f7be77c7757a27bae28d39f31cd6f3a7a04e9dab53a200a6021a5af8ad02f37" 
[metadata.files] -aiopg = [] -async-timeout = [] -asyncpg = [] -atomicwrites = [] -attrs = [] -aws-sam-translator = [] -aws-xray-sdk = [] -backoff = [] -boto3 = [] -boto3-stubs = [] -botocore = [] -botocore-stubs = [] -cached-property = [] -certifi = [] -cffi = [] -cfn-lint = [] -charset-normalizer = [] -click = [] -colorama = [] -cryptography = [] -docker = [] -ecdsa = [] -execnet = [] -flake8 = [] -flask = [] -flask-cors = [] -graphql-core = [] -idna = [] -importlib-metadata = [] -iniconfig = [] -itsdangerous = [] -jinja2 = [] -jmespath = [] -jschema-to-python = [] -jsondiff = [] -jsonpatch = [] -jsonpickle = [] -jsonpointer = [] -jsonschema = [] -junit-xml = [] -markupsafe = [] -mccabe = [] -moto = [] -mypy = [] -mypy-boto3-s3 = [] -mypy-extensions = [] -networkx = [] -openapi-schema-validator = [] -openapi-spec-validator = [] +aiopg = [ + {file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"}, + {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"}, +] +async-timeout = [ + {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, + {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, +] +asyncpg = [ + {file = "asyncpg-0.24.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1"}, + {file = "asyncpg-0.24.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843"}, + {file = "asyncpg-0.24.0-cp310-cp310-win_amd64.whl", hash = "sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d"}, + {file = "asyncpg-0.24.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5"}, + {file = "asyncpg-0.24.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83"}, + {file = "asyncpg-0.24.0-cp37-cp37m-win_amd64.whl", hash = "sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18"}, + {file = "asyncpg-0.24.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20"}, + {file = "asyncpg-0.24.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"}, + {file = "asyncpg-0.24.0-cp38-cp38-win_amd64.whl", hash = "sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317"}, + {file = "asyncpg-0.24.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c"}, + {file = "asyncpg-0.24.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03"}, + {file = "asyncpg-0.24.0-cp39-cp39-win_amd64.whl", hash = "sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853"}, + {file = "asyncpg-0.24.0.tar.gz", hash = "sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6"}, +] +atomicwrites = [ + {file = "atomicwrites-1.4.1.tar.gz", hash = 
"sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"}, +] +attrs = [ + {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, + {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, +] +aws-sam-translator = [ + {file = "aws-sam-translator-1.48.0.tar.gz", hash = "sha256:7171037323dfa30f8f73e9bccb9210e4c384a585e087219a9518a5204f0a2c44"}, + {file = "aws_sam_translator-1.48.0-py2-none-any.whl", hash = "sha256:be18dfa3dfe7ab291d281667c5f73ac62dbe6bfe86df7d122e4258b906b736f0"}, + {file = "aws_sam_translator-1.48.0-py3-none-any.whl", hash = "sha256:ca4f8f9910d7713aeaba59346775bfb3198f6acb47c6704572f9bd3fc0fb5bf0"}, +] +aws-xray-sdk = [ + {file = "aws-xray-sdk-2.10.0.tar.gz", hash = "sha256:9b14924fd0628cf92936055864655354003f0b1acc3e1c3ffde6403d0799dd7a"}, + {file = "aws_xray_sdk-2.10.0-py2.py3-none-any.whl", hash = "sha256:7551e81a796e1a5471ebe84844c40e8edf7c218db33506d046fec61f7495eda4"}, +] +backoff = [ + {file = "backoff-1.11.1-py2.py3-none-any.whl", hash = "sha256:61928f8fa48d52e4faa81875eecf308eccfb1016b018bb6bd21e05b5d90a96c5"}, + {file = "backoff-1.11.1.tar.gz", hash = "sha256:ccb962a2378418c667b3c979b504fdeb7d9e0d29c0579e3b13b86467177728cb"}, +] +boto3 = [ + {file = "boto3-1.24.38-py3-none-any.whl", hash = "sha256:bcf97fd7c494f4e2bbbe2511625500654179c0a6b3bea977d46f97af764e85a4"}, + {file = "boto3-1.24.38.tar.gz", hash = "sha256:f4c6b025f392c934338c7f01badfddbd0d3cf2397ff5df35c31409798dce33f5"}, +] +boto3-stubs = [ + {file = "boto3-stubs-1.24.41.tar.gz", hash = "sha256:8655d64981a7202aeb46a56a893ddcd23f59013894792e0e9a6f5350f7012674"}, + {file = "boto3_stubs-1.24.41-py3-none-any.whl", hash = "sha256:4579b2d28c5a0cd7d36a36cbdfcc872695f88eeaeadc8092f0b058049e9e08c7"}, +] +botocore = [ + {file = "botocore-1.27.38-py3-none-any.whl", hash = "sha256:46a0264ff3335496bd9cb404f83ec0d8eb7bfdef8f74a830c13e6a6b9612adea"}, + {file = "botocore-1.27.38.tar.gz", hash = "sha256:56a7682564ea57ceecfef5648f77b77e0543b9c904212fc9ef4416517d24fa45"}, +] +botocore-stubs = [ + {file = "botocore-stubs-1.27.38.tar.gz", hash = "sha256:408e8b86b5d171b58f81c74ca9d3b5317a5a8e2d3bc2073aa841ac13b8939e56"}, + {file = "botocore_stubs-1.27.38-py3-none-any.whl", hash = "sha256:7add7641e9a479a9c8366893bb522fd9ca3d58714201e43662a200a148a1bc38"}, +] +cached-property = [ + {file = "cached-property-1.5.2.tar.gz", hash = "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130"}, + {file = "cached_property-1.5.2-py2.py3-none-any.whl", hash = "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"}, +] +certifi = [ + {file = "certifi-2022.6.15-py3-none-any.whl", hash = "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"}, + {file = "certifi-2022.6.15.tar.gz", hash = "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d"}, +] +cffi = [ + {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, + {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, + {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, + {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, + {file = 
"cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, + {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, + {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, + {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, + {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, + {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, + {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, + {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, + {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, + {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, + {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, + {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, + {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, + {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, + {file = 
"cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, + {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, + {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, + {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, + {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, + {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, + {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, + {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, + {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, + {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, + {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, + {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, + {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, + {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, + {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, + {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, + {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, +] +cfn-lint = [ + {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"}, + {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"}, +] +charset-normalizer = [ + {file = "charset-normalizer-2.1.0.tar.gz", hash = "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"}, + {file = "charset_normalizer-2.1.0-py3-none-any.whl", hash = "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5"}, +] +click = [ + {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, + {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, +] +colorama = [ + {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, + {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, +] +cryptography = [ + {file = "cryptography-37.0.4-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:549153378611c0cca1042f20fd9c5030d37a72f634c9326e225c9f666d472884"}, + {file = "cryptography-37.0.4-cp36-abi3-macosx_10_10_x86_64.whl", hash = "sha256:a958c52505c8adf0d3822703078580d2c0456dd1d27fabfb6f76fe63d2971cd6"}, + {file = 
"cryptography-37.0.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f721d1885ecae9078c3f6bbe8a88bc0786b6e749bf32ccec1ef2b18929a05046"}, + {file = "cryptography-37.0.4-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:3d41b965b3380f10e4611dbae366f6dc3cefc7c9ac4e8842a806b9672ae9add5"}, + {file = "cryptography-37.0.4-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:80f49023dd13ba35f7c34072fa17f604d2f19bf0989f292cedf7ab5770b87a0b"}, + {file = "cryptography-37.0.4-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2dcb0b3b63afb6df7fd94ec6fbddac81b5492513f7b0436210d390c14d46ee8"}, + {file = "cryptography-37.0.4-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:b7f8dd0d4c1f21759695c05a5ec8536c12f31611541f8904083f3dc582604280"}, + {file = "cryptography-37.0.4-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:30788e070800fec9bbcf9faa71ea6d8068f5136f60029759fd8c3efec3c9dcb3"}, + {file = "cryptography-37.0.4-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:190f82f3e87033821828f60787cfa42bff98404483577b591429ed99bed39d59"}, + {file = "cryptography-37.0.4-cp36-abi3-win32.whl", hash = "sha256:b62439d7cd1222f3da897e9a9fe53bbf5c104fff4d60893ad1355d4c14a24157"}, + {file = "cryptography-37.0.4-cp36-abi3-win_amd64.whl", hash = "sha256:f7a6de3e98771e183645181b3627e2563dcde3ce94a9e42a3f427d2255190327"}, + {file = "cryptography-37.0.4-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bc95ed67b6741b2607298f9ea4932ff157e570ef456ef7ff0ef4884a134cc4b"}, + {file = "cryptography-37.0.4-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:f8c0a6e9e1dd3eb0414ba320f85da6b0dcbd543126e30fcc546e7372a7fbf3b9"}, + {file = "cryptography-37.0.4-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:e007f052ed10cc316df59bc90fbb7ff7950d7e2919c9757fd42a2b8ecf8a5f67"}, + {file = "cryptography-37.0.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bc997818309f56c0038a33b8da5c0bfbb3f1f067f315f9abd6fc07ad359398d"}, + {file = "cryptography-37.0.4-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:d204833f3c8a33bbe11eda63a54b1aad7aa7456ed769a982f21ec599ba5fa282"}, + {file = "cryptography-37.0.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:75976c217f10d48a8b5a8de3d70c454c249e4b91851f6838a4e48b8f41eb71aa"}, + {file = "cryptography-37.0.4-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:7099a8d55cd49b737ffc99c17de504f2257e3787e02abe6d1a6d136574873441"}, + {file = "cryptography-37.0.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2be53f9f5505673eeda5f2736bea736c40f051a739bfae2f92d18aed1eb54596"}, + {file = "cryptography-37.0.4-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:91ce48d35f4e3d3f1d83e29ef4a9267246e6a3be51864a5b7d2247d5086fa99a"}, + {file = "cryptography-37.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:4c590ec31550a724ef893c50f9a97a0c14e9c851c85621c5650d699a7b88f7ab"}, + {file = "cryptography-37.0.4.tar.gz", hash = "sha256:63f9c17c0e2474ccbebc9302ce2f07b55b3b3fcb211ded18a42d5764f5c10a82"}, +] +docker = [ + {file = "docker-4.2.2-py2.py3-none-any.whl", hash = "sha256:03a46400c4080cb6f7aa997f881ddd84fef855499ece219d75fbdb53289c17ab"}, + {file = "docker-4.2.2.tar.gz", hash = "sha256:26eebadce7e298f55b76a88c4f8802476c5eaddbdbe38dbc6cce8781c47c9b54"}, +] +ecdsa = [ + {file = "ecdsa-0.18.0-py2.py3-none-any.whl", hash = 
"sha256:80600258e7ed2f16b9aa1d7c295bd70194109ad5a30fdee0eaeefef1d4c559dd"}, + {file = "ecdsa-0.18.0.tar.gz", hash = "sha256:190348041559e21b22a1d65cee485282ca11a6f81d503fddb84d5017e9ed1e49"}, +] +execnet = [ + {file = "execnet-1.9.0-py2.py3-none-any.whl", hash = "sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"}, + {file = "execnet-1.9.0.tar.gz", hash = "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5"}, +] +flake8 = [ + {file = "flake8-3.9.2-py2.py3-none-any.whl", hash = "sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"}, + {file = "flake8-3.9.2.tar.gz", hash = "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b"}, +] +flask = [ + {file = "Flask-2.1.3-py3-none-any.whl", hash = "sha256:9013281a7402ad527f8fd56375164f3aa021ecfaff89bfe3825346c24f87e04c"}, + {file = "Flask-2.1.3.tar.gz", hash = "sha256:15972e5017df0575c3d6c090ba168b6db90259e620ac8d7ea813a396bad5b6cb"}, +] +flask-cors = [ + {file = "Flask-Cors-3.0.10.tar.gz", hash = "sha256:b60839393f3b84a0f3746f6cdca56c1ad7426aa738b70d6c61375857823181de"}, + {file = "Flask_Cors-3.0.10-py2.py3-none-any.whl", hash = "sha256:74efc975af1194fc7891ff5cd85b0f7478be4f7f59fe158102e91abb72bb4438"}, +] +graphql-core = [ + {file = "graphql-core-3.2.1.tar.gz", hash = "sha256:9d1bf141427b7d54be944587c8349df791ce60ade2e3cccaf9c56368c133c201"}, + {file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"}, +] +idna = [ + {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, + {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, +] +importlib-metadata = [ + {file = "importlib_metadata-4.12.0-py3-none-any.whl", hash = "sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23"}, + {file = "importlib_metadata-4.12.0.tar.gz", hash = "sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670"}, +] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] +itsdangerous = [ + {file = "itsdangerous-2.1.2-py3-none-any.whl", hash = "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44"}, + {file = "itsdangerous-2.1.2.tar.gz", hash = "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a"}, +] +jinja2 = [ + {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, + {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, +] +jmespath = [ + {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, + {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, +] +jschema-to-python = [ + {file = "jschema_to_python-1.2.3-py3-none-any.whl", hash = "sha256:8a703ca7604d42d74b2815eecf99a33359a8dccbb80806cce386d5e2dd992b05"}, + {file = "jschema_to_python-1.2.3.tar.gz", hash = "sha256:76ff14fe5d304708ccad1284e4b11f96a658949a31ee7faed9e0995279549b91"}, +] +jsondiff = [ + {file = "jsondiff-2.0.0-py3-none-any.whl", hash = 
"sha256:689841d66273fc88fc79f7d33f4c074774f4f214b6466e3aff0e5adaf889d1e0"}, + {file = "jsondiff-2.0.0.tar.gz", hash = "sha256:2795844ef075ec8a2b8d385c4d59f5ea48b08e7180fce3cb2787be0db00b1fb4"}, +] +jsonpatch = [ + {file = "jsonpatch-1.32-py2.py3-none-any.whl", hash = "sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397"}, + {file = "jsonpatch-1.32.tar.gz", hash = "sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2"}, +] +jsonpickle = [ + {file = "jsonpickle-2.2.0-py2.py3-none-any.whl", hash = "sha256:de7f2613818aa4f234138ca11243d6359ff83ae528b2185efdd474f62bcf9ae1"}, + {file = "jsonpickle-2.2.0.tar.gz", hash = "sha256:7b272918b0554182e53dc340ddd62d9b7f902fec7e7b05620c04f3ccef479a0e"}, +] +jsonpointer = [ + {file = "jsonpointer-2.3-py2.py3-none-any.whl", hash = "sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9"}, + {file = "jsonpointer-2.3.tar.gz", hash = "sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"}, +] +jsonschema = [ + {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"}, + {file = "jsonschema-3.2.0.tar.gz", hash = "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"}, +] +junit-xml = [ + {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"}, +] +markupsafe = [ + {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-win32.whl", hash = "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37"}, + {file = 
"MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-win32.whl", hash = "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-win32.whl", hash = "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_i686.whl", hash = 
"sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-win32.whl", hash = "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247"}, + {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"}, +] +mccabe = [ + {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, + {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, +] +moto = [ + {file = "moto-3.1.16-py3-none-any.whl", hash = "sha256:8bb8e267d9b948509d4739d81d995615a193d2c459f5c0a979aaeb0d3bd4b381"}, + {file = "moto-3.1.16.tar.gz", hash = "sha256:cbe8ad8a949f519771e5d25b670738604757fb67cd474d75d14c20677582e81f"}, +] +mypy = [ + {file = "mypy-0.971-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f2899a3cbd394da157194f913a931edfd4be5f274a88041c9dc2d9cdcb1c315c"}, + {file = "mypy-0.971-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:98e02d56ebe93981c41211c05adb630d1d26c14195d04d95e49cd97dbc046dc5"}, + {file = "mypy-0.971-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:19830b7dba7d5356d3e26e2427a2ec91c994cd92d983142cbd025ebe81d69cf3"}, + {file = "mypy-0.971-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:02ef476f6dcb86e6f502ae39a16b93285fef97e7f1ff22932b657d1ef1f28655"}, + {file = "mypy-0.971-cp310-cp310-win_amd64.whl", hash = "sha256:25c5750ba5609a0c7550b73a33deb314ecfb559c350bb050b655505e8aed4103"}, + {file = "mypy-0.971-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d3348e7eb2eea2472db611486846742d5d52d1290576de99d59edeb7cd4a42ca"}, + {file = "mypy-0.971-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3fa7a477b9900be9b7dd4bab30a12759e5abe9586574ceb944bc29cddf8f0417"}, + {file = "mypy-0.971-cp36-cp36m-win_amd64.whl", hash = "sha256:2ad53cf9c3adc43cf3bea0a7d01a2f2e86db9fe7596dfecb4496a5dda63cbb09"}, + {file = "mypy-0.971-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:855048b6feb6dfe09d3353466004490b1872887150c5bb5caad7838b57328cc8"}, + {file = "mypy-0.971-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:23488a14a83bca6e54402c2e6435467a4138785df93ec85aeff64c6170077fb0"}, + {file = "mypy-0.971-cp37-cp37m-win_amd64.whl", hash = "sha256:4b21e5b1a70dfb972490035128f305c39bc4bc253f34e96a4adf9127cf943eb2"}, + {file = "mypy-0.971-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:9796a2ba7b4b538649caa5cecd398d873f4022ed2333ffde58eaf604c4d2cb27"}, + {file = "mypy-0.971-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5a361d92635ad4ada1b1b2d3630fc2f53f2127d51cf2def9db83cba32e47c856"}, + {file = "mypy-0.971-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b793b899f7cf563b1e7044a5c97361196b938e92f0a4343a5d27966a53d2ec71"}, + {file = "mypy-0.971-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d1ea5d12c8e2d266b5fb8c7a5d2e9c0219fedfeb493b7ed60cd350322384ac27"}, + {file = "mypy-0.971-cp38-cp38-win_amd64.whl", hash = 
"sha256:23c7ff43fff4b0df93a186581885c8512bc50fc4d4910e0f838e35d6bb6b5e58"}, + {file = "mypy-0.971-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1f7656b69974a6933e987ee8ffb951d836272d6c0f81d727f1d0e2696074d9e6"}, + {file = "mypy-0.971-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d2022bfadb7a5c2ef410d6a7c9763188afdb7f3533f22a0a32be10d571ee4bbe"}, + {file = "mypy-0.971-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef943c72a786b0f8d90fd76e9b39ce81fb7171172daf84bf43eaf937e9f220a9"}, + {file = "mypy-0.971-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d744f72eb39f69312bc6c2abf8ff6656973120e2eb3f3ec4f758ed47e414a4bf"}, + {file = "mypy-0.971-cp39-cp39-win_amd64.whl", hash = "sha256:77a514ea15d3007d33a9e2157b0ba9c267496acf12a7f2b9b9f8446337aac5b0"}, + {file = "mypy-0.971-py3-none-any.whl", hash = "sha256:0d054ef16b071149917085f51f89555a576e2618d5d9dd70bd6eea6410af3ac9"}, + {file = "mypy-0.971.tar.gz", hash = "sha256:40b0f21484238269ae6a57200c807d80debc6459d444c0489a102d7c6a75fa56"}, +] +mypy-boto3-s3 = [ + {file = "mypy-boto3-s3-1.24.36.post1.tar.gz", hash = "sha256:3bd7e06f9ade5059eae2181d7a9f1a41e7fa807ad3e94c01c9901838e87e0abe"}, + {file = "mypy_boto3_s3-1.24.36.post1-py3-none-any.whl", hash = "sha256:30ae59b33c55f8b7b693170f9519ea5b91a2fbf31a73de79cdef57a27d784e5a"}, +] +mypy-extensions = [ + {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, + {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, +] +networkx = [ + {file = "networkx-2.8.5-py3-none-any.whl", hash = "sha256:a762f4b385692d9c3a6f2912d058d76d29a827deaedf9e63ed14d397b8030687"}, + {file = "networkx-2.8.5.tar.gz", hash = "sha256:15a7b81a360791c458c55a417418ea136c13378cfdc06a2dcdc12bd2f9cf09c1"}, +] +openapi-schema-validator = [ + {file = "openapi-schema-validator-0.2.3.tar.gz", hash = "sha256:2c64907728c3ef78e23711c8840a423f0b241588c9ed929855e4b2d1bb0cf5f2"}, + {file = "openapi_schema_validator-0.2.3-py3-none-any.whl", hash = "sha256:9bae709212a19222892cabcc60cafd903cbf4b220223f48583afa3c0e3cc6fc4"}, +] +openapi-spec-validator = [ + {file = "openapi-spec-validator-0.4.0.tar.gz", hash = "sha256:97f258850afc97b048f7c2653855e0f88fa66ac103c2be5077c7960aca2ad49a"}, + {file = "openapi_spec_validator-0.4.0-py3-none-any.whl", hash = "sha256:06900ac4d546a1df3642a779da0055be58869c598e3042a2fef067cfd99d04d0"}, +] packaging = [ {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, ] -pbr = [] -pluggy = [] -prometheus-client = [] -psycopg2-binary = [] -py = [] -pyasn1 = [] -pycodestyle = [] -pycparser = [] -pyflakes = [] -pyjwt = [] +pbr = [ + {file = "pbr-5.9.0-py2.py3-none-any.whl", hash = "sha256:e547125940bcc052856ded43be8e101f63828c2d94239ffbe2b327ba3d5ccf0a"}, + {file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"}, +] +pluggy = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] +prometheus-client = [ + {file = "prometheus_client-0.14.1-py3-none-any.whl", hash = 
"sha256:522fded625282822a89e2773452f42df14b5a8e84a86433e3f8a189c1d54dc01"}, + {file = "prometheus_client-0.14.1.tar.gz", hash = "sha256:5459c427624961076277fdc6dc50540e2bacb98eebde99886e59ec55ed92093a"}, +] +psycopg2-binary = [ + {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_ppc64le.whl", hash = "sha256:0a29729145aaaf1ad8bafe663131890e2111f13416b60e460dae0a96af5905c9"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3a79d622f5206d695d7824cbf609a4f5b88ea6d6dab5f7c147fc6d333a8787e4"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:090f3348c0ab2cceb6dfbe6bf721ef61262ddf518cd6cc6ecc7d334996d64efa"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:a9e1f75f96ea388fbcef36c70640c4efbe4650658f3d6a2967b4cc70e907352e"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c3ae8e75eb7160851e59adc77b3a19a976e50622e44fd4fd47b8b18208189d42"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-win32.whl", hash = "sha256:7b1e9b80afca7b7a386ef087db614faebbf8839b7f4db5eb107d0f1a53225029"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:8b344adbb9a862de0c635f4f0425b7958bf5a4b927c8594e6e8d261775796d53"}, + {file = "psycopg2_binary-2.9.3-cp36-cp36m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:e847774f8ffd5b398a75bc1c18fbb56564cda3d629fe68fd81971fece2d3c67e"}, + {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:68641a34023d306be959101b345732360fc2ea4938982309b786f7be1b43a4a1"}, + {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3303f8807f342641851578ee7ed1f3efc9802d00a6f83c101d21c608cb864460"}, + {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_24_aarch64.whl", hash = "sha256:e3699852e22aa68c10de06524a3721ade969abf382da95884e6a10ff798f9281"}, + {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_24_ppc64le.whl", hash = "sha256:526ea0378246d9b080148f2d6681229f4b5964543c170dd10bf4faaab6e0d27f"}, + {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:b1c8068513f5b158cf7e29c43a77eb34b407db29aca749d3eb9293ee0d3103ca"}, + {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:15803fa813ea05bef089fa78835118b5434204f3a17cb9f1e5dbfd0b9deea5af"}, + {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:152f09f57417b831418304c7f30d727dc83a12761627bb826951692cc6491e57"}, + {file = 
"psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:404224e5fef3b193f892abdbf8961ce20e0b6642886cfe1fe1923f41aaa75c9d"}, + {file = "psycopg2_binary-2.9.3-cp36-cp36m-win32.whl", hash = "sha256:1f6b813106a3abdf7b03640d36e24669234120c72e91d5cbaeb87c5f7c36c65b"}, + {file = "psycopg2_binary-2.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:2d872e3c9d5d075a2e104540965a1cf898b52274a5923936e5bfddb58c59c7c2"}, + {file = "psycopg2_binary-2.9.3-cp37-cp37m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:10bb90fb4d523a2aa67773d4ff2b833ec00857f5912bafcfd5f5414e45280fb1"}, + {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:874a52ecab70af13e899f7847b3e074eeb16ebac5615665db33bce8a1009cf33"}, + {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a29b3ca4ec9defec6d42bf5feb36bb5817ba3c0230dd83b4edf4bf02684cd0ae"}, + {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:12b11322ea00ad8db8c46f18b7dfc47ae215e4df55b46c67a94b4effbaec7094"}, + {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_24_ppc64le.whl", hash = "sha256:53293533fcbb94c202b7c800a12c873cfe24599656b341f56e71dd2b557be063"}, + {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c381bda330ddf2fccbafab789d83ebc6c53db126e4383e73794c74eedce855ef"}, + {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:9d29409b625a143649d03d0fd7b57e4b92e0ecad9726ba682244b73be91d2fdb"}, + {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:183a517a3a63503f70f808b58bfbf962f23d73b6dccddae5aa56152ef2bcb232"}, + {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:15c4e4cfa45f5a60599d9cec5f46cd7b1b29d86a6390ec23e8eebaae84e64554"}, + {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, + {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_ppc64le.whl", hash = "sha256:63638d875be8c2784cfc952c9ac34e2b50e43f9f0a0660b65e2a87d656b3116c"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ffb7a888a047696e7f8240d649b43fb3644f14f0ee229077e7f6b9f9081635bd"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0c9d5450c566c80c396b7402895c4369a410cab5a82707b11aee1e624da7d004"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = 
"sha256:d1c1b569ecafe3a69380a94e6ae09a4789bbb23666f3d3a08d06bbd2451f5ef1"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8fc53f9af09426a61db9ba357865c77f26076d48669f2e1bb24d85a22fb52307"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_ppc64le.whl", hash = "sha256:7af0dd86ddb2f8af5da57a976d27cd2cd15510518d582b478fbb2292428710b4"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:93cd1967a18aa0edd4b95b1dfd554cf15af657cb606280996d393dadc88c3c35"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bda845b664bb6c91446ca9609fc69f7db6c334ec5e4adc87571c34e4f47b7ddb"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:01310cf4cf26db9aea5158c217caa92d291f0500051a6469ac52166e1a16f5b7"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:99485cab9ba0fa9b84f1f9e1fef106f44a46ef6afdeec8885e0b88d0772b49e8"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-win32.whl", hash = "sha256:46f0e0a6b5fa5851bbd9ab1bc805eef362d3a230fbdfbc209f4a236d0a7a990d"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:accfe7e982411da3178ec690baaceaad3c278652998b2c45828aaac66cd8285f"}, +] +py = [ + {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, + {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, +] +pyasn1 = [ + {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, + {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, + {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, + {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, + {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, + {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, + {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, + {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, + {file = "pyasn1-0.4.8-py3.4.egg", hash 
= "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, + {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, + {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, + {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, + {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, +] +pycodestyle = [ + {file = "pycodestyle-2.7.0-py2.py3-none-any.whl", hash = "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068"}, + {file = "pycodestyle-2.7.0.tar.gz", hash = "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"}, +] +pycparser = [ + {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, + {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, +] +pyflakes = [ + {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, + {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, +] +pyjwt = [ + {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, + {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, +] pyparsing = [ {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, ] -pyrsistent = [] -pytest = [] -pytest-forked = [] -pytest-lazy-fixture = [] -pytest-timeout = [] -pytest-xdist = [] +pypiwin32 = [ + {file = "pypiwin32-223-py3-none-any.whl", hash = "sha256:67adf399debc1d5d14dffc1ab5acacb800da569754fafdc576b2a039485aa775"}, + {file = "pypiwin32-223.tar.gz", hash = "sha256:71be40c1fbd28594214ecaecb58e7aa8b708eabfa0125c8a109ebd51edbd776a"}, +] +pyrsistent = [ + {file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"}, + {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d45866ececf4a5fff8742c25722da6d4c9e180daa7b405dc0a2a2790d668c26"}, + {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4ed6784ceac462a7d6fcb7e9b663e93b9a6fb373b7f43594f9ff68875788e01e"}, + {file = "pyrsistent-0.18.1-cp310-cp310-win32.whl", hash = "sha256:e4f3149fd5eb9b285d6bfb54d2e5173f6a116fe19172686797c056672689daf6"}, + {file = "pyrsistent-0.18.1-cp310-cp310-win_amd64.whl", hash = "sha256:636ce2dc235046ccd3d8c56a7ad54e99d5c1cd0ef07d9ae847306c91d11b5fec"}, + {file = "pyrsistent-0.18.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e92a52c166426efbe0d1ec1332ee9119b6d32fc1f0bbfd55d5c1088070e7fc1b"}, + {file = "pyrsistent-0.18.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7a096646eab884bf8bed965bad63ea327e0d0c38989fc83c5ea7b8a87037bfc"}, + {file = "pyrsistent-0.18.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:cdfd2c361b8a8e5d9499b9082b501c452ade8bbf42aef97ea04854f4a3f43b22"}, + {file = "pyrsistent-0.18.1-cp37-cp37m-win32.whl", hash = "sha256:7ec335fc998faa4febe75cc5268a9eac0478b3f681602c1f27befaf2a1abe1d8"}, + {file = "pyrsistent-0.18.1-cp37-cp37m-win_amd64.whl", hash = "sha256:6455fc599df93d1f60e1c5c4fe471499f08d190d57eca040c0ea182301321286"}, + {file = "pyrsistent-0.18.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fd8da6d0124efa2f67d86fa70c851022f87c98e205f0594e1fae044e7119a5a6"}, + {file = "pyrsistent-0.18.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bfe2388663fd18bd8ce7db2c91c7400bf3e1a9e8bd7d63bf7e77d39051b85ec"}, + {file = "pyrsistent-0.18.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0e3e1fcc45199df76053026a51cc59ab2ea3fc7c094c6627e93b7b44cdae2c8c"}, + {file = "pyrsistent-0.18.1-cp38-cp38-win32.whl", hash = "sha256:b568f35ad53a7b07ed9b1b2bae09eb15cdd671a5ba5d2c66caee40dbf91c68ca"}, + {file = "pyrsistent-0.18.1-cp38-cp38-win_amd64.whl", hash = "sha256:d1b96547410f76078eaf66d282ddca2e4baae8964364abb4f4dcdde855cd123a"}, + {file = "pyrsistent-0.18.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f87cc2863ef33c709e237d4b5f4502a62a00fab450c9e020892e8e2ede5847f5"}, + {file = "pyrsistent-0.18.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bc66318fb7ee012071b2792024564973ecc80e9522842eb4e17743604b5e045"}, + {file = "pyrsistent-0.18.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:914474c9f1d93080338ace89cb2acee74f4f666fb0424896fcfb8d86058bf17c"}, + {file = "pyrsistent-0.18.1-cp39-cp39-win32.whl", hash = "sha256:1b34eedd6812bf4d33814fca1b66005805d3640ce53140ab8bbb1e2651b0d9bc"}, + {file = "pyrsistent-0.18.1-cp39-cp39-win_amd64.whl", hash = "sha256:e24a828f57e0c337c8d8bb9f6b12f09dfdf0273da25fda9e314f0b684b415a07"}, + {file = "pyrsistent-0.18.1.tar.gz", hash = "sha256:d4d61f8b993a7255ba714df3aca52700f8125289f84f704cf80916517c46eb96"}, +] +pytest = [ + {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, + {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, +] +pytest-forked = [ + {file = "pytest-forked-1.4.0.tar.gz", hash = "sha256:8b67587c8f98cbbadfdd804539ed5455b6ed03802203485dd2f53c1422d7440e"}, + {file = "pytest_forked-1.4.0-py3-none-any.whl", hash = "sha256:bbbb6717efc886b9d64537b41fb1497cfaf3c9601276be8da2cccfea5a3c8ad8"}, +] +pytest-lazy-fixture = [ + {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, + {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, +] +pytest-timeout = [ + {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, + {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, +] +pytest-xdist = [ + {file = "pytest-xdist-2.5.0.tar.gz", hash = "sha256:4580deca3ff04ddb2ac53eba39d76cb5dd5edeac050cb6fbc768b0dd712b4edf"}, + {file = "pytest_xdist-2.5.0-py3-none-any.whl", hash = "sha256:6fe5c74fec98906deb8f2d2b616b5c782022744978e7bd4695d39c8f42d0ce65"}, +] python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = 
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, ] -python-jose = [] +python-jose = [ + {file = "python-jose-3.3.0.tar.gz", hash = "sha256:55779b5e6ad599c6336191246e95eb2293a9ddebd555f796a65f838f07e5d78a"}, + {file = "python_jose-3.3.0-py2.py3-none-any.whl", hash = "sha256:9b1376b023f8b298536eedd47ae1089bcdb848f1535ab30555cd92002d78923a"}, +] pytz = [ {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"}, {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, ] -pywin32 = [] -pyyaml = [] -requests = [] -responses = [] -rsa = [] -s3transfer = [] -sarif-om = [] +pywin32 = [ + {file = "pywin32-301-cp35-cp35m-win32.whl", hash = "sha256:93367c96e3a76dfe5003d8291ae16454ca7d84bb24d721e0b74a07610b7be4a7"}, + {file = "pywin32-301-cp35-cp35m-win_amd64.whl", hash = "sha256:9635df6998a70282bd36e7ac2a5cef9ead1627b0a63b17c731312c7a0daebb72"}, + {file = "pywin32-301-cp36-cp36m-win32.whl", hash = "sha256:c866f04a182a8cb9b7855de065113bbd2e40524f570db73ef1ee99ff0a5cc2f0"}, + {file = "pywin32-301-cp36-cp36m-win_amd64.whl", hash = "sha256:dafa18e95bf2a92f298fe9c582b0e205aca45c55f989937c52c454ce65b93c78"}, + {file = "pywin32-301-cp37-cp37m-win32.whl", hash = "sha256:98f62a3f60aa64894a290fb7494bfa0bfa0a199e9e052e1ac293b2ad3cd2818b"}, + {file = "pywin32-301-cp37-cp37m-win_amd64.whl", hash = "sha256:fb3b4933e0382ba49305cc6cd3fb18525df7fd96aa434de19ce0878133bf8e4a"}, + {file = "pywin32-301-cp38-cp38-win32.whl", hash = "sha256:88981dd3cfb07432625b180f49bf4e179fb8cbb5704cd512e38dd63636af7a17"}, + {file = "pywin32-301-cp38-cp38-win_amd64.whl", hash = "sha256:8c9d33968aa7fcddf44e47750e18f3d034c3e443a707688a008a2e52bbef7e96"}, + {file = "pywin32-301-cp39-cp39-win32.whl", hash = "sha256:595d397df65f1b2e0beaca63a883ae6d8b6df1cdea85c16ae85f6d2e648133fe"}, + {file = "pywin32-301-cp39-cp39-win_amd64.whl", hash = "sha256:87604a4087434cd814ad8973bd47d6524bd1fa9e971ce428e76b62a5e0860fdf"}, +] +pyyaml = [ + {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, + {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, + {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"}, + {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"}, + {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, + {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, + {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, + {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, + {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, + {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, + {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"}, + {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"}, + {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"}, + {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"}, + {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"}, + {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"}, + {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"}, + {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"}, + {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"}, + {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"}, + {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"}, + {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"}, + {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"}, + {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"}, + {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"}, + {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"}, + {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"}, + {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"}, + {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"}, + {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"}, + {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"}, + {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", 
hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, + {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, +] +requests = [ + {file = "requests-2.28.1-py3-none-any.whl", hash = "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"}, + {file = "requests-2.28.1.tar.gz", hash = "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983"}, +] +responses = [ + {file = "responses-0.21.0-py3-none-any.whl", hash = "sha256:2dcc863ba63963c0c3d9ee3fa9507cbe36b7d7b0fccb4f0bdfd9e96c539b1487"}, + {file = "responses-0.21.0.tar.gz", hash = "sha256:b82502eb5f09a0289d8e209e7bad71ef3978334f56d09b444253d5ad67bf5253"}, +] +rsa = [ + {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, + {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, +] +s3transfer = [ + {file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"}, + {file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"}, +] +sarif-om = [ + {file = "sarif_om-1.0.4-py3-none-any.whl", hash = "sha256:539ef47a662329b1c8502388ad92457425e95dc0aaaf995fe46f4984c4771911"}, + {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"}, +] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] -sshpubkeys = [] -toml = [] +sshpubkeys = [ + {file = "sshpubkeys-3.3.1-py2.py3-none-any.whl", hash = "sha256:946f76b8fe86704b0e7c56a00d80294e39bc2305999844f079a217885060b1ac"}, + {file = "sshpubkeys-3.3.1.tar.gz", hash = "sha256:3020ed4f8c846849299370fbe98ff4157b0ccc1accec105e07cfa9ae4bb55064"}, +] +toml = [ + {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, + {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, +] tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] -types-psycopg2 = [] -types-requests = [] -types-s3transfer = [] -types-urllib3 = [] -typing-extensions = [] -urllib3 = [] -websocket-client = [] -werkzeug = [] -wrapt = [] -xmltodict = [] -yapf = [] -zipp = [] +types-psycopg2 = [ + {file = "types-psycopg2-2.9.18.tar.gz", hash = "sha256:9b0e9e1f097b15cd9fa8aad2596a9e3082fd72f8d9cfe52b190cfa709105b6c0"}, + {file = "types_psycopg2-2.9.18-py3-none-any.whl", hash = "sha256:14c779dcab18c31453fa1cad3cf4b1601d33540a344adead3c47a6b8091cd2fa"}, +] +types-requests = [ + {file = "types-requests-2.28.5.tar.gz", hash = "sha256:ac618bfefcb3742eaf97c961e13e9e5a226e545eda4a3dbe293b898d40933ad1"}, + {file = "types_requests-2.28.5-py3-none-any.whl", hash = "sha256:98ab647ae88b5e2c41d6d20cfcb5117da1bea561110000b6fdeeea07b3e89877"}, +] +types-s3transfer = [ + {file = "types-s3transfer-0.6.0.post3.tar.gz", hash = "sha256:92c3704e5d041202bfb5ddb79d083fd1a02de2c5dfec6a91576823e6b5c93993"}, + {file = "types_s3transfer-0.6.0.post3-py3-none-any.whl", 
hash = "sha256:eedc5117275565b3c83662c0ccc81662a34da5dda8bd502b89d296b6d5cb091d"}, +] +types-urllib3 = [ + {file = "types-urllib3-1.26.17.tar.gz", hash = "sha256:73fd274524c3fc7cd8cd9ceb0cb67ed99b45f9cb2831013e46d50c1451044800"}, + {file = "types_urllib3-1.26.17-py3-none-any.whl", hash = "sha256:0d027fcd27dbb3cb532453b4d977e05bc1e13aefd70519866af211b3003d895d"}, +] +typing-extensions = [ + {file = "typing_extensions-4.3.0-py3-none-any.whl", hash = "sha256:25642c956049920a5aa49edcdd6ab1e06d7e5d467fc00e0506c44ac86fbfca02"}, + {file = "typing_extensions-4.3.0.tar.gz", hash = "sha256:e6d2677a32f47fc7eb2795db1dd15c1f34eff616bcaf2cfb5e997f854fa1c4a6"}, +] +urllib3 = [ + {file = "urllib3-1.26.11-py2.py3-none-any.whl", hash = "sha256:c33ccba33c819596124764c23a97d25f32b28433ba0dedeb77d873a38722c9bc"}, + {file = "urllib3-1.26.11.tar.gz", hash = "sha256:ea6e8fb210b19d950fab93b60c9009226c63a28808bc8386e05301e25883ac0a"}, +] +websocket-client = [ + {file = "websocket-client-1.3.3.tar.gz", hash = "sha256:d58c5f284d6a9bf8379dab423259fe8f85b70d5fa5d2916d5791a84594b122b1"}, + {file = "websocket_client-1.3.3-py3-none-any.whl", hash = "sha256:5d55652dc1d0b3c734f044337d929aaf83f4f9138816ec680c1aefefb4dc4877"}, +] +werkzeug = [ + {file = "Werkzeug-2.1.2-py3-none-any.whl", hash = "sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255"}, + {file = "Werkzeug-2.1.2.tar.gz", hash = "sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6"}, +] +wrapt = [ + {file = "wrapt-1.14.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3"}, + {file = "wrapt-1.14.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef"}, + {file = "wrapt-1.14.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28"}, + {file = "wrapt-1.14.1-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59"}, + {file = "wrapt-1.14.1-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87"}, + {file = "wrapt-1.14.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1"}, + {file = "wrapt-1.14.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b"}, + {file = "wrapt-1.14.1-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462"}, + {file = "wrapt-1.14.1-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1"}, + {file = "wrapt-1.14.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320"}, + {file = "wrapt-1.14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2"}, + {file = "wrapt-1.14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4"}, + {file = "wrapt-1.14.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069"}, + {file = "wrapt-1.14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310"}, + {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f"}, + {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656"}, + {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, + {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, + {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, + {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, + {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, + {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d"}, + {file = "wrapt-1.14.1-cp35-cp35m-win32.whl", hash = "sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7"}, + {file = "wrapt-1.14.1-cp35-cp35m-win_amd64.whl", hash = "sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00"}, + {file = "wrapt-1.14.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4"}, + {file = "wrapt-1.14.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1"}, + {file = "wrapt-1.14.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1"}, + {file = "wrapt-1.14.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff"}, + {file = "wrapt-1.14.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d"}, + {file = "wrapt-1.14.1-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1"}, + {file = "wrapt-1.14.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569"}, + {file = "wrapt-1.14.1-cp36-cp36m-win32.whl", hash = "sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed"}, + {file = "wrapt-1.14.1-cp36-cp36m-win_amd64.whl", hash = "sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471"}, + {file = "wrapt-1.14.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248"}, + {file = "wrapt-1.14.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68"}, + {file = "wrapt-1.14.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d"}, + {file = 
"wrapt-1.14.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77"}, + {file = "wrapt-1.14.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7"}, + {file = "wrapt-1.14.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015"}, + {file = "wrapt-1.14.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a"}, + {file = "wrapt-1.14.1-cp37-cp37m-win32.whl", hash = "sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853"}, + {file = "wrapt-1.14.1-cp37-cp37m-win_amd64.whl", hash = "sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c"}, + {file = "wrapt-1.14.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456"}, + {file = "wrapt-1.14.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f"}, + {file = "wrapt-1.14.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc"}, + {file = "wrapt-1.14.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1"}, + {file = "wrapt-1.14.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af"}, + {file = "wrapt-1.14.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b"}, + {file = "wrapt-1.14.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0"}, + {file = "wrapt-1.14.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57"}, + {file = "wrapt-1.14.1-cp38-cp38-win32.whl", hash = "sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5"}, + {file = "wrapt-1.14.1-cp38-cp38-win_amd64.whl", hash = "sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d"}, + {file = "wrapt-1.14.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383"}, + {file = "wrapt-1.14.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7"}, + {file = "wrapt-1.14.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86"}, + {file = "wrapt-1.14.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735"}, + {file = "wrapt-1.14.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b"}, + {file = "wrapt-1.14.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3"}, + {file = "wrapt-1.14.1-cp39-cp39-musllinux_1_1_i686.whl", hash = 
"sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3"}, + {file = "wrapt-1.14.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe"}, + {file = "wrapt-1.14.1-cp39-cp39-win32.whl", hash = "sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5"}, + {file = "wrapt-1.14.1-cp39-cp39-win_amd64.whl", hash = "sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb"}, + {file = "wrapt-1.14.1.tar.gz", hash = "sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d"}, +] +xmltodict = [ + {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, + {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, +] +yapf = [ + {file = "yapf-0.31.0-py2.py3-none-any.whl", hash = "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"}, + {file = "yapf-0.31.0.tar.gz", hash = "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d"}, +] +zipp = [ + {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, + {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, +] From 177d5b1f228c774741ffb71206388a8d49ffbb4b Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 1 Aug 2022 22:36:00 +0300 Subject: [PATCH 0581/1022] Bump postgres to get uuid extension --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index bc6dcc493c..5280b6fe10 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit bc6dcc493c977f3b06ad95abf493273a693b0e12 +Subproject commit 5280b6fe1027afd5a7e14c142913d9fdf9e2b442 From 71f39bac3d489abcc4f07a06e4fa89c8af546cbb Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 2 Aug 2022 13:57:26 +0100 Subject: [PATCH 0582/1022] github/workflows: upload artifacts to S3 (#2071) --- .github/actions/download/action.yml | 56 +++++++++ .../actions/run-python-test-set/action.yml | 17 +-- .github/actions/save-coverage-data/action.yml | 15 ++- .github/actions/upload/action.yml | 51 ++++++++ .github/workflows/build_and_test.yml | 113 +++++++++--------- 5 files changed, 175 insertions(+), 77 deletions(-) create mode 100644 .github/actions/download/action.yml create mode 100644 .github/actions/upload/action.yml diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml new file mode 100644 index 0000000000..5aa45164e7 --- /dev/null +++ b/.github/actions/download/action.yml @@ -0,0 +1,56 @@ +name: "Download an artifact" +description: "Custom download action" +inputs: + name: + description: "Artifact name" + required: true + path: + description: "A directory to put artifact into" + default: "." 
+ required: false + skip-if-does-not-exist: + description: "Allow to skip if file doesn't exist, fail otherwise" + default: false + required: false + +runs: + using: "composite" + steps: + - name: Download artifact + id: download-artifact + shell: bash -euxo pipefail {0} + env: + TARGET: ${{ inputs.path }} + ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst + SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }} + run: | + BUCKET=neon-github-public-dev + PREFIX=artifacts/${GITHUB_RUN_ID} + FILENAME=$(basename $ARCHIVE) + + S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) + if [ -z "${S3_KEY}" ]; then + if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then + echo '::set-output name=SKIPPED::true' + exit 0 + else + echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} nor its version from previous attempts exist" + exit 1 + fi + fi + + echo '::set-output name=SKIPPED::false' + + mkdir -p $(dirname $ARCHIVE) + time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} ${ARCHIVE} + + - name: Extract artifact + if: ${{ steps.download-artifact.outputs.SKIPPED == 'false' }} + shell: bash -euxo pipefail {0} + env: + TARGET: ${{ inputs.path }} + ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst + run: | + mkdir -p ${TARGET} + time tar -xf ${ARCHIVE} -C ${TARGET} + rm -f ${ARCHIVE} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 6dc377a809..c9987053ce 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -31,18 +31,11 @@ inputs: runs: using: "composite" steps: - - name: Get Neon artifact for restoration - uses: actions/download-artifact@v3 + - name: Get Neon artifact + uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact - path: ./neon-artifact/ - - - name: Extract Neon artifact - shell: bash -euxo pipefail {0} - run: | - mkdir -p /tmp/neon/ - tar -xf ./neon-artifact/neon.tar.zst -C /tmp/neon/ - rm -rf ./neon-artifact/ + path: /tmp/neon - name: Checkout if: inputs.needs_postgres_source == 'true' @@ -132,9 +125,7 @@ runs: - name: Upload python test logs if: always() - uses: actions/upload-artifact@v3 + uses: ./.github/actions/upload with: - retention-days: 7 - if-no-files-found: error name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs path: /tmp/test_output/ diff --git a/.github/actions/save-coverage-data/action.yml b/.github/actions/save-coverage-data/action.yml index bcfd7cb47e..6fbe19a96e 100644 --- a/.github/actions/save-coverage-data/action.yml +++ b/.github/actions/save-coverage-data/action.yml @@ -8,10 +8,15 @@ runs: shell: bash -euxo pipefail {0} run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge - - name: Upload coverage data - uses: actions/upload-artifact@v3 + - name: Download previous coverage data into the same directory + uses: ./.github/actions/download with: - retention-days: 7 - if-no-files-found: error name: coverage-data-artifact - path: /tmp/coverage/ + path: /tmp/coverage + skip-if-does-not-exist: true # skip if there's no previous coverage to download + + - name: Upload coverage data + uses: ./.github/actions/upload + with: + name: coverage-data-artifact + path: /tmp/coverage diff --git a/.github/actions/upload/action.yml 
b/.github/actions/upload/action.yml new file mode 100644 index 0000000000..28e7d1fb1a --- /dev/null +++ b/.github/actions/upload/action.yml @@ -0,0 +1,51 @@ +name: "Upload an artifact" +description: "Custom upload action" +inputs: + name: + description: "Artifact name" + required: true + path: + description: "A directory or file to upload" + required: true + +runs: + using: "composite" + steps: + - name: Prepare artifact + shell: bash -euxo pipefail {0} + env: + SOURCE: ${{ inputs.path }} + ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst + run: | + mkdir -p $(dirname $ARCHIVE) + + if [ -f ${ARCHIVE} ]; then + echo 2>&1 "File ${ARCHIVE} already exist. Something went wrong before" + exit 1 + fi + + ZSTD_NBTHREADS=0 + if [ -d ${SOURCE} ]; then + time tar -C ${SOURCE} -cf ${ARCHIVE} --zstd . + elif [ -f ${SOURCE} ]; then + time tar -cf ${ARCHIVE} --zstd ${SOURCE} + else + echo 2>&1 "${SOURCE} neither directory nor file, don't know how to handle it" + fi + + - name: Upload artifact + shell: bash -euxo pipefail {0} + env: + SOURCE: ${{ inputs.path }} + ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst + run: | + BUCKET=neon-github-public-dev + PREFIX=artifacts/${GITHUB_RUN_ID} + FILENAME=$(basename $ARCHIVE) + + FILESIZE=$(du -sh ${ARCHIVE} | cut -f1) + + time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} + + # Ref https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#adding-a-job-summary + echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 312b4d1f46..0be108400c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -3,8 +3,8 @@ name: Test and Deploy on: push: branches: - - main - - release + - main + - release pull_request: defaults: @@ -22,7 +22,8 @@ env: jobs: build-neon: - runs-on: [ self-hosted, Linux, k8s-runner ] + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 strategy: fail-fast: false matrix: @@ -31,6 +32,7 @@ jobs: env: BUILD_TYPE: ${{ matrix.build_type }} + GIT_VERSION: ${{ github.sha }} steps: - name: Checkout @@ -123,6 +125,7 @@ jobs: mkdir -p /tmp/coverage/ mkdir -p /tmp/neon/test_bin/ + test_exe_paths=$( ${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run | jq -r '.executable | select(. != null)' @@ -145,25 +148,20 @@ jobs: - name: Install postgres binaries run: cp -a tmp_install /tmp/neon/pg_install - - name: Prepare neon artifact - run: ZSTD_NBTHREADS=0 tar -C /tmp/neon/ -cf ./neon.tar.zst --zstd . 
- - - name: Upload neon binaries - uses: actions/upload-artifact@v3 + - name: Upload Neon artifact + uses: ./.github/actions/upload with: - retention-days: 7 - if-no-files-found: error name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact - path: ./neon.tar.zst + path: /tmp/neon # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data if: matrix.build_type == 'debug' uses: ./.github/actions/save-coverage-data - pg_regress-tests: - runs-on: [ self-hosted, Linux, k8s-runner ] + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 needs: [ build-neon ] strategy: fail-fast: false @@ -190,7 +188,8 @@ jobs: uses: ./.github/actions/save-coverage-data other-tests: - runs-on: [ self-hosted, Linux, k8s-runner ] + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 needs: [ build-neon ] strategy: fail-fast: false @@ -216,7 +215,8 @@ jobs: uses: ./.github/actions/save-coverage-data benchmarks: - runs-on: [ self-hosted, Linux, k8s-runner ] + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 needs: [ build-neon ] strategy: fail-fast: false @@ -245,7 +245,8 @@ jobs: # while coverage is currently collected for the debug ones coverage-report: - runs-on: [ self-hosted, Linux, k8s-runner ] + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 needs: [ other-tests, pg_regress-tests ] strategy: fail-fast: false @@ -270,23 +271,17 @@ jobs: target/ key: v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - - name: Get Neon artifact for restoration - uses: actions/download-artifact@v3 + - name: Get Neon artifact + uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact - path: ./neon-artifact/ + path: /tmp/neon - - name: Extract Neon artifact - run: | - mkdir -p /tmp/neon/ - tar -xf ./neon-artifact/neon.tar.zst -C /tmp/neon/ - rm -rf ./neon-artifact/ - - - name: Restore coverage data - uses: actions/download-artifact@v3 + - name: Get coverage artifact + uses: ./.github/actions/download with: name: coverage-data-artifact - path: /tmp/coverage/ + path: /tmp/coverage - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge @@ -324,40 +319,40 @@ jobs: }" trigger-e2e-tests: - runs-on: [ self-hosted, Linux, k8s-runner ] - needs: [ build-neon ] - steps: - - name: Set PR's status to pending and request a remote CI test - run: | - COMMIT_SHA=${{ github.event.pull_request.head.sha }} - COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ build-neon ] + steps: + - name: Set PR's status to pending and request a remote CI test + run: | + COMMIT_SHA=${{ github.event.pull_request.head.sha }} + COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} - REMOTE_REPO="${{ github.repository_owner }}/cloud" + REMOTE_REPO="${{ github.repository_owner }}/cloud" - curl -f -X POST \ - https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"state\": \"pending\", - \"context\": \"neon-cloud-e2e\", - \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" - }" + curl -f -X POST \ + 
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"state\": \"pending\", + \"context\": \"neon-cloud-e2e\", + \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" + }" - curl -f -X POST \ - https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"ref\": \"main\", - \"inputs\": { - \"ci_job_name\": \"neon-cloud-e2e\", - \"commit_hash\": \"$COMMIT_SHA\", - \"remote_repo\": \"${{ github.repository }}\" - } - }" + curl -f -X POST \ + https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"ref\": \"main\", + \"inputs\": { + \"ci_job_name\": \"neon-cloud-e2e\", + \"commit_hash\": \"$COMMIT_SHA\", + \"remote_repo\": \"${{ github.repository }}\" + } + }" docker-image: runs-on: [ self-hosted, Linux, k8s-runner ] From b4f2c5b51448c5cf7ba1930edb4c3b3de06483c2 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 1 Aug 2022 17:57:46 +0300 Subject: [PATCH 0583/1022] run benchmarks conditionally, on main or if run_benchmarks label is set --- .github/workflows/build_and_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0be108400c..ec7579a0d1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -218,6 +218,7 @@ jobs: runs-on: dev container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 needs: [ build-neon ] + if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: fail-fast: false matrix: From 5f71aa09d313f829875012752f65ae99e67fc3fe Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 21 Jul 2022 20:59:07 +0300 Subject: [PATCH 0584/1022] support running tests against real s3 implementation without mocking --- Cargo.lock | 47 ++- control_plane/src/lib.rs | 6 +- libs/remote_storage/src/lib.rs | 3 + libs/remote_storage/src/local_fs.rs | 2 +- libs/remote_storage/src/s3_bucket.rs | 40 +-- neon_local/src/main.rs | 4 +- pageserver/src/storage_sync/download.rs | 10 + .../batch_others/test_ancestor_branch.py | 3 +- .../batch_others/test_remote_storage.py | 20 +- .../test_tenants_with_remote_storage.py | 23 +- test_runner/batch_others/test_wal_acceptor.py | 35 ++- test_runner/fixtures/neon_fixtures.py | 278 ++++++++++++++---- 12 files changed, 315 insertions(+), 156 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5031ae02e3..4a78b2e504 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -154,9 +154,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "axum" -version = "0.5.12" +version = "0.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d16705af05732b7d3258ec0f7b73c03a658a28925e050d8852d5b568ee8bcf4e" +checksum = "6b9496f0c1d1afb7a2af4338bbe1d969cddfead41d87a9fb3aaa6d0bbc7af648" dependencies = [ "async-trait", "axum-core", @@ -317,15 +317,6 @@ dependencies = [ "serde", ] -[[package]] -name = "cast" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" -dependencies = [ - 
"rustc_version", -] - [[package]] name = "cast" version = "0.3.0" @@ -579,7 +570,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f" dependencies = [ "atty", - "cast 0.3.0", + "cast", "clap 2.34.0", "criterion-plot", "csv", @@ -600,11 +591,11 @@ dependencies = [ [[package]] name = "criterion-plot" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" +checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876" dependencies = [ - "cast 0.2.7", + "cast", "itertools", ] @@ -680,9 +671,9 @@ dependencies = [ [[package]] name = "crypto-common" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ccfd8c0ee4cce11e45b3fd6f9d5e69e0cc62912aa6a0cb1bf4617b0eba5a12f" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" dependencies = [ "generic-array", "typenum", @@ -1116,9 +1107,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.26.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78cc372d058dcf6d5ecd98510e7fbc9e5aec4d21de70f65fea8fecebcd881bd4" +checksum = "22030e2c5a68ec659fde1e949a745124b48e6fa8b045b7ed5bd1fe4ccc5c4e5d" [[package]] name = "git-version" @@ -1184,9 +1175,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.12.2" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "607c8a29735385251a339424dd462993c0fed8fa09d378f259377df08c126022" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" [[package]] name = "heck" @@ -1388,7 +1379,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" dependencies = [ "autocfg", - "hashbrown 0.12.2", + "hashbrown 0.12.3", ] [[package]] @@ -1851,9 +1842,9 @@ dependencies = [ [[package]] name = "os_str_bytes" -version = "6.1.0" +version = "6.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21326818e99cfe6ce1e524c2a805c189a99b5ae555a35d19f9a284b427d86afa" +checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4" [[package]] name = "pageserver" @@ -2735,9 +2726,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0a5f7c728f5d284929a1cccb5bc19884422bfe6ef4d6c409da2c41838983fcf" +checksum = "24c8ad4f0c00e1eb5bc7614d236a7f1300e3dbd76b68cac8e06fb00b015ad8d8" [[package]] name = "ryu" @@ -3617,9 +3608,9 @@ checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" [[package]] name = "unicode-ident" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bd2fe26506023ed7b5e1e315add59d6f584c621d037f9368fea9cfb988f368c" +checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7" [[package]] name = "unicode-normalization" diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index 4dfca588ad..17232ccf45 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -51,7 +51,11 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { } fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { - for env_key in 
["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] { + for env_key in [ + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "AWS_SESSION_TOKEN", + ] { if let Ok(value) = std::env::var(env_key) { cmd = cmd.env(env_key, value); } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index dec79e4580..07f8cb08aa 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -66,6 +66,9 @@ pub trait RemoteStorage: Send + Sync { async fn list(&self) -> anyhow::Result>; /// Lists all top level subdirectories for a given prefix + /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id + /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS) + /// so this method doesnt need to. async fn list_prefixes( &self, prefix: Option, diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index df1581fb51..07b04084b9 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -116,7 +116,7 @@ impl RemoteStorage for LocalFs { prefix: Option, ) -> anyhow::Result> { let path = match prefix { - Some(prefix) => Cow::Owned(self.storage_root.join(prefix)), + Some(prefix) => Cow::Owned(prefix), None => Cow::Borrowed(&self.storage_root), }; get_all_files(path.as_ref(), false).await diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index ff52f033d1..1b241fe4ed 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -171,17 +171,25 @@ impl S3Bucket { let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok(); let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok(); + // session token is used when authorizing through sso + // which is typically the case when testing locally on developer machine + let session_token = std::env::var("AWS_SESSION_TOKEN").ok(); let client = if access_key_id.is_none() && secret_access_key.is_none() { debug!("Using IAM-based AWS access"); S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region) } else { - debug!("Using credentials-based AWS access"); + debug!( + "Using credentials-based AWS access. 
Session token is set: {}", + session_token.is_some() + ); S3Client::new_with( request_dispatcher, - StaticProvider::new_minimal( + StaticProvider::new( access_key_id.unwrap_or_default(), secret_access_key.unwrap_or_default(), + session_token, + None, ), region, ) @@ -304,32 +312,24 @@ impl RemoteStorage for S3Bucket { Ok(document_keys) } + /// See the doc for `RemoteStorage::list_prefixes` /// Note: it wont include empty "directories" async fn list_prefixes( &self, prefix: Option, ) -> anyhow::Result> { - let list_prefix = match prefix { - Some(prefix) => { - let mut prefix_in_bucket = self.prefix_in_bucket.clone().unwrap_or_default(); - // if there is no trailing / in default prefix and - // supplied prefix does not start with "/" insert it - if !(prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR) - || prefix.0.starts_with(S3_PREFIX_SEPARATOR)) - { - prefix_in_bucket.push(S3_PREFIX_SEPARATOR); - } - - prefix_in_bucket.push_str(&prefix.0); + // get the passed prefix or if it is not set use prefix_in_bucket value + let list_prefix = prefix + .map(|p| p.0) + .or_else(|| self.prefix_in_bucket.clone()) + .map(|mut p| { // required to end with a separator // otherwise request will return only the entry of a prefix - if !prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR) { - prefix_in_bucket.push(S3_PREFIX_SEPARATOR); + if !p.ends_with(S3_PREFIX_SEPARATOR) { + p.push(S3_PREFIX_SEPARATOR); } - Some(prefix_in_bucket) - } - None => self.prefix_in_bucket.clone(), - }; + p + }); let mut document_keys = Vec::new(); diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index e6f5c6125d..24b40b72d6 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -884,7 +884,7 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul match sub_match.subcommand() { Some(("start", start_match)) => { if let Err(e) = pageserver.start(&pageserver_config_overrides(start_match)) { - eprintln!("pageserver start failed: {}", e); + eprintln!("pageserver start failed: {e}"); exit(1); } } @@ -906,7 +906,7 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul } if let Err(e) = pageserver.start(&pageserver_config_overrides(restart_match)) { - eprintln!("pageserver start failed: {}", e); + eprintln!("pageserver start failed: {e}"); exit(1); } } diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index a91eaaa7ca..441d5e563e 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -130,6 +130,7 @@ where tenant_path.display() ) })?; + let timelines = storage .list_prefixes(Some(tenant_storage_path)) .await @@ -140,6 +141,13 @@ where ) })?; + if timelines.is_empty() { + anyhow::bail!( + "no timelines found on the remote storage for tenant {}", + tenant_id + ) + } + let mut sync_ids = HashSet::new(); for timeline_remote_storage_key in timelines { @@ -194,6 +202,8 @@ where }) .map_err(DownloadError::BadInput)?; + warn!("part_storage_path {:?}", part_storage_path); + let mut index_part_download = storage.download(&part_storage_path).await?; let mut index_part_bytes = Vec::new(); diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index d8ba0a1b06..c4d36da043 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -1,6 +1,5 @@ -import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, 
NeonPageserverApiException +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import query_scalar diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 6a8497a559..72963ffe21 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -2,11 +2,10 @@ # env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... import shutil, os -from contextlib import closing from pathlib import Path import time from uuid import UUID -from fixtures.neon_fixtures import NeonEnvBuilder, assert_timeline_local, wait_until, wait_for_last_record_lsn, wait_for_upload +from fixtures.neon_fixtures import NeonEnvBuilder, RemoteStorageKind, assert_timeline_local, available_remote_storages, wait_until, wait_for_last_record_lsn, wait_for_upload from fixtures.log_helper import log from fixtures.utils import lsn_from_hex, query_scalar import pytest @@ -29,18 +28,19 @@ import pytest # * queries the specific data, ensuring that it matches the one stored before # # The tests are done for all types of remote storage pageserver supports. -@pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3']) -def test_remote_storage_backup_and_restore(neon_env_builder: NeonEnvBuilder, storage_type: str): +@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages()) +def test_remote_storage_backup_and_restore( + neon_env_builder: NeonEnvBuilder, + remote_storatge_kind: RemoteStorageKind, +): # Use this test to check more realistic SK ids: some etcd key parsing bugs were related, # and this test needs SK to write data to pageserver, so it will be visible neon_env_builder.safekeepers_id_start = 12 - if storage_type == 'local_fs': - neon_env_builder.enable_local_fs_remote_storage() - elif storage_type == 'mock_s3': - neon_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore') - else: - raise RuntimeError(f'Unknown storage type: {storage_type}') + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storatge_kind, + test_name='test_remote_storage_backup_and_restore', + ) data_id = 1 data_secret = 'very secret secret' diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/batch_others/test_tenants_with_remote_storage.py index 8ddb4d1b92..636616a45b 100644 --- a/test_runner/batch_others/test_tenants_with_remote_storage.py +++ b/test_runner/batch_others/test_tenants_with_remote_storage.py @@ -13,7 +13,7 @@ from uuid import UUID import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres, wait_for_last_record_lsn, wait_for_upload +from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres, RemoteStorageKind, available_remote_storages, wait_for_last_record_lsn, wait_for_upload from fixtures.utils import lsn_from_hex @@ -38,7 +38,7 @@ async def tenant_workload(env: NeonEnv, pg: Postgres): async def all_tenants_workload(env: NeonEnv, tenants_pgs): workers = [] - for tenant, pg in tenants_pgs: + for _, pg in tenants_pgs: worker = tenant_workload(env, pg) workers.append(asyncio.create_task(worker)) @@ -46,23 +46,18 @@ async def all_tenants_workload(env: NeonEnv, tenants_pgs): await asyncio.gather(*workers) -@pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3']) -def test_tenants_many(neon_env_builder: NeonEnvBuilder, storage_type: str): - - if storage_type == 'local_fs': - neon_env_builder.enable_local_fs_remote_storage() - elif 
storage_type == 'mock_s3': - neon_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore') - else: - raise RuntimeError(f'Unknown storage type: {storage_type}') - - neon_env_builder.enable_local_fs_remote_storage() +@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages()) +def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storatge_kind, + test_name='test_tenants_many', + ) env = neon_env_builder.init_start() tenants_pgs: List[Tuple[UUID, Postgres]] = [] - for i in range(1, 5): + for _ in range(1, 5): # Use a tiny checkpoint distance, to create a lot of layers quickly tenant, _ = env.neon_cli.create_tenant( conf={ diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index da861bb9f3..6544681bb0 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -12,9 +12,8 @@ import uuid from contextlib import closing from dataclasses import dataclass, field -from multiprocessing import Process, Value from pathlib import Path -from fixtures.neon_fixtures import NeonPageserver, PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, neon_binpath, PgProtocol, wait_for_last_record_lsn, wait_for_upload +from fixtures.neon_fixtures import NeonPageserver, PgBin, Etcd, Postgres, RemoteStorageKind, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, available_remote_storages, neon_binpath, PgProtocol, wait_for_last_record_lsn, wait_for_upload from fixtures.utils import get_dir_size, lsn_to_hex, lsn_from_hex, query_scalar from fixtures.log_helper import log from typing import List, Optional, Any @@ -377,15 +376,15 @@ def wait_wal_trim(tenant_id, timeline_id, sk, target_size): time.sleep(0.5) -@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs']) -def test_wal_backup(neon_env_builder: NeonEnvBuilder, storage_type: str): +@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages()) +def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind): neon_env_builder.num_safekeepers = 3 - if storage_type == 'local_fs': - neon_env_builder.enable_local_fs_remote_storage() - elif storage_type == 'mock_s3': - neon_env_builder.enable_s3_mock_remote_storage('test_safekeepers_wal_backup') - else: - raise RuntimeError(f'Unknown storage type: {storage_type}') + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storatge_kind, + test_name='test_safekeepers_wal_backup', + ) + neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER env = neon_env_builder.init_start() @@ -425,15 +424,15 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, storage_type: str): wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], '0/5000000') -@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs']) -def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, storage_type: str): +@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages()) +def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind): neon_env_builder.num_safekeepers = 3 - if storage_type == 'local_fs': - neon_env_builder.enable_local_fs_remote_storage() - elif storage_type == 'mock_s3': - 
neon_env_builder.enable_s3_mock_remote_storage('test_s3_wal_replay') - else: - raise RuntimeError(f'Unknown storage type: {storage_type}') + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storatge_kind, + test_name='test_s3_wal_replay', + ) + neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER env = neon_env_builder.init_start() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6783ab710b..87a598b387 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3,6 +3,7 @@ from __future__ import annotations from dataclasses import field from contextlib import contextmanager from enum import Flag, auto +import enum import textwrap from cached_property import cached_property import abc @@ -262,6 +263,11 @@ def default_broker(request: Any, port_distributor: PortDistributor): broker.stop() +@pytest.fixture(scope='session') +def run_id(): + yield uuid.uuid4() + + @pytest.fixture(scope='session') def mock_s3_server(port_distributor: PortDistributor): mock_s3_server = MockS3Server(port_distributor.get_port()) @@ -438,26 +444,43 @@ class MockS3Server: def secret_key(self) -> str: return 'test' - def access_env_vars(self) -> Dict[Any, Any]: - return { - 'AWS_ACCESS_KEY_ID': self.access_key(), - 'AWS_SECRET_ACCESS_KEY': self.secret_key(), - } - def kill(self): self.subprocess.kill() +@enum.unique +class RemoteStorageKind(enum.Enum): + LOCAL_FS = "local_fs" + MOCK_S3 = "mock_s3" + REAL_S3 = "real_s3" + + +def available_remote_storages() -> List[RemoteStorageKind]: + remote_storages = [RemoteStorageKind.LOCAL_FS, RemoteStorageKind.MOCK_S3] + if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"): + remote_storages.append(RemoteStorageKind.REAL_S3) + return remote_storages + + @dataclass class LocalFsStorage: - local_path: Path + root: Path @dataclass class S3Storage: bucket_name: str bucket_region: str - endpoint: Optional[str] + access_key: str + secret_key: str + endpoint: Optional[str] = None + prefix_in_bucket: Optional[str] = None + + def access_env_vars(self) -> Dict[str, str]: + return { + 'AWS_ACCESS_KEY_ID': self.access_key, + 'AWS_SECRET_ACCESS_KEY': self.secret_key, + } RemoteStorage = Union[LocalFsStorage, S3Storage] @@ -466,16 +489,20 @@ RemoteStorage = Union[LocalFsStorage, S3Storage] # serialize as toml inline table def remote_storage_to_toml_inline_table(remote_storage): if isinstance(remote_storage, LocalFsStorage): - res = f"local_path='{remote_storage.local_path}'" + remote_storage_config = f"local_path='{remote_storage.root}'" elif isinstance(remote_storage, S3Storage): - res = f"bucket_name='{remote_storage.bucket_name}', bucket_region='{remote_storage.bucket_region}'" + remote_storage_config = f"bucket_name='{remote_storage.bucket_name}',\ + bucket_region='{remote_storage.bucket_region}'" + + if remote_storage.prefix_in_bucket is not None: + remote_storage_config += f",prefix_in_bucket='{remote_storage.prefix_in_bucket}'" + if remote_storage.endpoint is not None: - res += f", endpoint='{remote_storage.endpoint}'" - else: - raise Exception(f'Unknown storage configuration {remote_storage}') + remote_storage_config += f",endpoint='{remote_storage.endpoint}'" else: raise Exception("invalid remote storage type") - return f"{{{res}}}" + + return f"{{{remote_storage_config}}}" class RemoteStorageUsers(Flag): @@ -493,28 +520,31 @@ class NeonEnvBuilder: cleaned up after the test has finished. 
""" def __init__( - self, - repo_dir: Path, - port_distributor: PortDistributor, - broker: Etcd, - mock_s3_server: MockS3Server, - remote_storage: Optional[RemoteStorage] = None, - remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, - pageserver_config_override: Optional[str] = None, - num_safekeepers: int = 1, - # Use non-standard SK ids to check for various parsing bugs - safekeepers_id_start: int = 0, - # fsync is disabled by default to make the tests go faster - safekeepers_enable_fsync: bool = False, - auth_enabled: bool = False, - rust_log_override: Optional[str] = None, - default_branch_name=DEFAULT_BRANCH_NAME): + self, + repo_dir: Path, + port_distributor: PortDistributor, + broker: Etcd, + run_id: uuid.UUID, + mock_s3_server: MockS3Server, + remote_storage: Optional[RemoteStorage] = None, + remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, + pageserver_config_override: Optional[str] = None, + num_safekeepers: int = 1, + # Use non-standard SK ids to check for various parsing bugs + safekeepers_id_start: int = 0, + # fsync is disabled by default to make the tests go faster + safekeepers_enable_fsync: bool = False, + auth_enabled: bool = False, + rust_log_override: Optional[str] = None, + default_branch_name=DEFAULT_BRANCH_NAME, + ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override self.port_distributor = port_distributor self.remote_storage = remote_storage self.remote_storage_users = remote_storage_users self.broker = broker + self.run_id = run_id self.mock_s3_server = mock_s3_server self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers @@ -523,6 +553,8 @@ class NeonEnvBuilder: self.auth_enabled = auth_enabled self.default_branch_name = default_branch_name self.env: Optional[NeonEnv] = None + self.remote_storage_prefix: Optional[str] = None + self.keep_remote_storage_contents: bool = True def init(self) -> NeonEnv: # Cannot create more than one environment from one builder @@ -538,41 +570,142 @@ class NeonEnvBuilder: self.start() return env - """ - Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path. - Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. - """ + def enable_remote_storage( + self, + remote_storage_kind: RemoteStorageKind, + test_name: str, + force_enable: bool = True, + ): + if remote_storage_kind == RemoteStorageKind.LOCAL_FS: + self.enable_local_fs_remote_storage(force_enable=force_enable) + elif remote_storage_kind == RemoteStorageKind.MOCK_S3: + self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable) + elif remote_storage_kind == RemoteStorageKind.REAL_S3: + self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable) + else: + raise RuntimeError(f'Unknown storage type: {remote_storage_kind}') def enable_local_fs_remote_storage(self, force_enable=True): + """ + Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path. + Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. + """ assert force_enable or self.remote_storage is None, "remote storage is enabled already" self.remote_storage = LocalFsStorage(Path(self.repo_dir / 'local_fs_remote_storage')) - """ - Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already. - Starts up the mock server, if that does not run yet. 
- Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. - """ - - def enable_s3_mock_remote_storage(self, bucket_name: str, force_enable=True): + def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable=True): + """ + Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already. + Starts up the mock server, if that does not run yet. + Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. + """ assert force_enable or self.remote_storage is None, "remote storage is enabled already" mock_endpoint = self.mock_s3_server.endpoint() mock_region = self.mock_s3_server.region() - boto3.client( + + self.remote_storage_client = boto3.client( 's3', endpoint_url=mock_endpoint, region_name=mock_region, aws_access_key_id=self.mock_s3_server.access_key(), aws_secret_access_key=self.mock_s3_server.secret_key(), ).create_bucket(Bucket=bucket_name) + + self.remote_storage = S3Storage( + bucket_name=bucket_name, + endpoint=mock_endpoint, + bucket_region=mock_region, + access_key=self.mock_s3_server.access_key(), + secret_key=self.mock_s3_server.secret_key(), + ) + + def enable_real_s3_remote_storage(self, test_name: str, force_enable=True): + """ + Sets up configuration to use real s3 endpoint without mock server + """ + assert force_enable or self.remote_storage is None, "remote storage is enabled already" + + access_key = os.getenv("AWS_ACCESS_KEY_ID") + assert access_key, "no aws access key provided" + secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") + assert secret_key, "no aws access key provided" + + # session token is needed for local runs with sso auth + session_token = os.getenv("AWS_SESSION_TOKEN") + + bucket_name = os.getenv("REMOTE_STORAGE_S3_BUCKET") + assert bucket_name, "no remote storage bucket name provided" + region = os.getenv("REMOTE_STORAGE_S3_REGION") + assert region, "no remote storage region provided" + + # do not leave data in real s3 + self.keep_remote_storage_contents = False + + # construct a prefix inside bucket for the particular test case and test run + self.remote_storage_prefix = f'{self.run_id}/{test_name}' + + self.remote_storage_client = boto3.client( + 's3', + region_name=region, + aws_access_key_id=access_key, + aws_secret_access_key=secret_key, + aws_session_token=session_token, + ) self.remote_storage = S3Storage(bucket_name=bucket_name, - endpoint=mock_endpoint, - bucket_region=mock_region) + bucket_region=region, + access_key=access_key, + secret_key=secret_key, + prefix_in_bucket=self.remote_storage_prefix) + + def cleanup_remote_storage(self): + # here wee check for true remote storage, no the local one + # local cleanup is not needed after test because in ci all env will be destroyed anyway + if self.remote_storage_prefix is None: + log.info("no remote storage was set up, skipping cleanup") + return + + if self.keep_remote_storage_contents: + log.info("keep_remote_storage_contents skipping remote storage cleanup") + return + + log.info("removing data from test s3 bucket %s by prefix %s", + self.remote_storage.bucket_name, + self.remote_storage_prefix) + paginator = self.remote_storage_client.get_paginator('list_objects_v2') + pages = paginator.paginate( + Bucket=self.remote_storage.bucket_name, + Prefix=self.remote_storage_prefix, + ) + + objects_to_delete = {'Objects': []} + cnt = 0 + for item in pages.search('Contents'): + # weirdly when nothing is found it returns [None] + if item is None: 
+ break + + objects_to_delete['Objects'].append({'Key': item['Key']}) + + # flush once aws limit reached + if len(objects_to_delete['Objects']) >= 1000: + self.remote_storage_client.delete_objects( + Bucket=self.remote_storage.bucket_name, + Delete=objects_to_delete, + ) + objects_to_delete = dict(Objects=[]) + cnt += 1 + + # flush rest + if len(objects_to_delete['Objects']): + self.remote_storage_client.delete_objects(Bucket=self.remote_storage.bucket_name, + Delete=objects_to_delete) + + log.info("deleted %s objects from remote storage", cnt) def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): - # Stop all the nodes. if self.env: log.info('Cleaning up all storage and compute nodes') @@ -581,6 +714,8 @@ class NeonEnvBuilder: sk.stop(immediate=True) self.env.pageserver.stop(immediate=True) + self.cleanup_remote_storage() + class NeonEnv: """ @@ -713,10 +848,13 @@ class NeonEnv: @pytest.fixture(scope=shareable_scope) -def _shared_simple_env(request: Any, - port_distributor: PortDistributor, - mock_s3_server: MockS3Server, - default_broker: Etcd) -> Iterator[NeonEnv]: +def _shared_simple_env( + request: Any, + port_distributor: PortDistributor, + mock_s3_server: MockS3Server, + default_broker: Etcd, + run_id: uuid.UUID, +) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES is set, this is shared by all tests using `neon_simple_env`. @@ -730,8 +868,13 @@ def _shared_simple_env(request: Any, repo_dir = os.path.join(str(top_output_dir), "shared_repo") shutil.rmtree(repo_dir, ignore_errors=True) - with NeonEnvBuilder(Path(repo_dir), port_distributor, default_broker, - mock_s3_server) as builder: + with NeonEnvBuilder( + repo_dir=Path(repo_dir), + port_distributor=port_distributor, + broker=default_broker, + mock_s3_server=mock_s3_server, + run_id=run_id, + ) as builder: env = builder.init_start() # For convenience in tests, create a branch from the freshly-initialized cluster. @@ -756,10 +899,13 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: @pytest.fixture(scope='function') -def neon_env_builder(test_output_dir, - port_distributor: PortDistributor, - mock_s3_server: MockS3Server, - default_broker: Etcd) -> Iterator[NeonEnvBuilder]: +def neon_env_builder( + test_output_dir, + port_distributor: PortDistributor, + mock_s3_server: MockS3Server, + default_broker: Etcd, + run_id: uuid.UUID, +) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
@@ -777,8 +923,13 @@ def neon_env_builder(test_output_dir, repo_dir = os.path.join(test_output_dir, "repo") # Return the builder to the caller - with NeonEnvBuilder(Path(repo_dir), port_distributor, default_broker, - mock_s3_server) as builder: + with NeonEnvBuilder( + repo_dir=Path(repo_dir), + port_distributor=port_distributor, + mock_s3_server=mock_s3_server, + broker=default_broker, + run_id=run_id, + ) as builder: yield builder @@ -1183,7 +1334,10 @@ class NeonCli(AbstractNeonCli): remote_storage_users=self.env.remote_storage_users, pageserver_config_override=self.env.pageserver.config_override) - s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None + s3_env_vars = None + if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage): + s3_env_vars = self.env.remote_storage.access_env_vars() + return self.raw_cli(start_args, extra_env_vars=s3_env_vars) def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]': @@ -1195,7 +1349,10 @@ class NeonCli(AbstractNeonCli): return self.raw_cli(cmd) def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]': - s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None + s3_env_vars = None + if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage): + s3_env_vars = self.env.remote_storage.access_env_vars() + return self.raw_cli(['safekeeper', 'start', str(id)], extra_env_vars=s3_env_vars) def safekeeper_stop(self, @@ -1337,7 +1494,7 @@ class NeonPageserver(PgProtocol): return self def __exit__(self, exc_type, exc, tb): - self.stop(True) + self.stop(immediate=True) def http_client(self, auth_token: Optional[str] = None) -> NeonPageserverHttpClient: return NeonPageserverHttpClient( @@ -1354,6 +1511,7 @@ def append_pageserver_param_overrides( ): if bool(remote_storage_users & RemoteStorageUsers.PAGESERVER) and remote_storage is not None: remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) + params_to_update.append( f'--pageserver-config-override=remote_storage={remote_storage_toml_table}') From bc2cb5382b3b7101d029a2b780d07ee81f898a97 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 27 Jul 2022 16:03:09 +0300 Subject: [PATCH 0585/1022] run real s3 tests in CI --- .../actions/run-python-test-set/action.yml | 32 ++++++++++++++++++- .github/workflows/build_and_test.yml | 6 +++- test_runner/fixtures/neon_fixtures.py | 8 +++-- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index c9987053ce..fcc8983e40 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -27,6 +27,26 @@ inputs: description: 'Whether to upload the performance report' required: false default: 'false' + run_with_real_s3: + description: 'Whether to pass real s3 credentials to the test suite' + required: false + default: 'false' + real_s3_bucket: + description: 'Bucket name for real s3 tests' + required: false + default: '' + real_s3_region: + description: 'Region name for real s3 tests' + required: false + default: '' + real_s3_access_key_id: + description: 'Access key id' + required: false + default: '' + real_s3_secret_access_key: + description: 'Secret access key' + required: false + default: '' runs: using: "composite" @@ -63,7 +83,9 @@ runs: # this variable will be embedded in perf test report # and is needed to 
distinguish different environments PLATFORM: github-actions-selfhosted - shell: bash -euxo pipefail {0} + AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }} + AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} + shell: bash -euxo pipefail {0} {0} run: | PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)" rm -rf $PERF_REPORT_DIR @@ -77,6 +99,14 @@ runs: if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then EXTRA_PARAMS="-n4 $EXTRA_PARAMS" fi + + if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then + echo "REAL S3 ENABLED" + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=${{ inputs.real_s3_bucket }} + export REMOTE_STORAGE_S3_REGION=${{ inputs.real_s3_region }} + fi + if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then mkdir -p "$PERF_REPORT_DIR" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ec7579a0d1..4e784f0920 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -209,7 +209,11 @@ jobs: build_type: ${{ matrix.build_type }} rust_toolchain: ${{ matrix.rust_toolchain }} test_selection: batch_others - + run_with_real_s3: true + real_s3_bucket: ci-tests-s3 + real_s3_region: us-west-2 + real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}" + real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}" - name: Merge and upload coverage data if: matrix.build_type == 'debug' uses: ./.github/actions/save-coverage-data diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 87a598b387..9b39bf2b39 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -457,8 +457,11 @@ class RemoteStorageKind(enum.Enum): def available_remote_storages() -> List[RemoteStorageKind]: remote_storages = [RemoteStorageKind.LOCAL_FS, RemoteStorageKind.MOCK_S3] - if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"): + if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None: remote_storages.append(RemoteStorageKind.REAL_S3) + log.info("Enabling real s3 storage for tests") + else: + log.info("Using mock implementations to test remote storage") return remote_storages @@ -609,7 +612,8 @@ class NeonEnvBuilder: region_name=mock_region, aws_access_key_id=self.mock_s3_server.access_key(), aws_secret_access_key=self.mock_s3_server.secret_key(), - ).create_bucket(Bucket=bucket_name) + ) + self.remote_storage_client.create_bucket(Bucket=bucket_name) self.remote_storage = S3Storage( bucket_name=bucket_name, From 52ce1c9d5352d13ffedf6f2a0a261abe07785c3c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 4 Aug 2022 12:57:15 +0300 Subject: [PATCH 0586/1022] Speed up test shutdown, by polling more frequently. A fair amount of the time in our python tests is spent waiting for the pageserver and safekeeper processes to shut down. It doesn't matter so much when you're running a lot of tests in parallel, but it's quite noticeable when running them sequentially. A big part of the slowness is that is that after sending the SIGTERM signal, we poll to see if the process is still running, and the polling happened at 1 s interval. Reduce it to 0.1 s. 
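
[Editor's note] For illustration only, here is a minimal sketch of the polling-loop shape this commit moves to. `wait_for_shutdown` and `process_has_exited` are simplified stand-ins, not APIs from this patch: the real control_plane code probes a TCP address and the child's exit status, while the stub below just pretends the process exits about 0.5 s after the wait starts. The point of the sketch is the timing: 600 polls at 100 ms keep roughly the same overall timeout as 100 polls at 1 s, a fast exit is now noticed within 0.1 s, and the progress dot is printed only every 10th iteration so the output volume stays the same.

    use std::io::{self, Write};
    use std::thread;
    use std::time::{Duration, Instant};

    // Stand-in for the real check (a TCP connect attempt plus a signal probe);
    // here it simply pretends the process exits ~0.5 s after we start waiting.
    fn process_has_exited(started_at: Instant) -> bool {
        started_at.elapsed() > Duration::from_millis(500)
    }

    fn wait_for_shutdown(pid: u32) -> Result<(), String> {
        let started_at = Instant::now();
        // 600 polls at 100 ms each keeps the old ~60 s budget,
        // but a quick exit is noticed within 0.1 s instead of 1 s.
        for i in 0..600 {
            if process_has_exited(started_at) {
                return Ok(());
            }
            // Print a progress dot only on every 10th iteration (~once per second)
            // so the faster polling does not flood stdout.
            if i % 10 == 0 {
                print!(".");
                io::stdout().flush().unwrap();
            }
            thread::sleep(Duration::from_millis(100));
        }
        Err(format!("Failed to stop process with pid {}", pid))
    }

    fn main() {
        // Usage example with a made-up pid; with the stub above this returns
        // successfully after roughly half a second of dots.
        match wait_for_shutdown(12345) {
            Ok(()) => println!("\nprocess stopped"),
            Err(e) => eprintln!("\n{}", e),
        }
    }
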
--- control_plane/src/safekeeper.rs | 10 ++++++---- control_plane/src/storage.rs | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index d87be95b82..0cae479d71 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -247,7 +247,7 @@ impl SafekeeperNode { // Shutting down may take a long time, // if safekeeper flushes a lot of data let mut tcp_stopped = false; - for _ in 0..100 { + for i in 0..600 { if !tcp_stopped { if let Err(err) = TcpStream::connect(&address) { tcp_stopped = true; @@ -272,9 +272,11 @@ impl SafekeeperNode { } } } - print!("."); - io::stdout().flush().unwrap(); - thread::sleep(Duration::from_secs(1)); + if i % 10 == 0 { + print!("."); + io::stdout().flush().unwrap(); + } + thread::sleep(Duration::from_millis(100)); } bail!("Failed to stop safekeeper with pid {}", pid); diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 13d64a79f0..c2ed3fc824 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -318,7 +318,7 @@ impl PageServerNode { // Shutting down may take a long time, // if pageserver checkpoints a lot of data let mut tcp_stopped = false; - for _ in 0..100 { + for i in 0..600 { if !tcp_stopped { if let Err(err) = TcpStream::connect(&address) { tcp_stopped = true; @@ -344,9 +344,11 @@ impl PageServerNode { } } } - print!("."); - io::stdout().flush().unwrap(); - thread::sleep(Duration::from_secs(1)); + if i % 10 == 0 { + print!("."); + io::stdout().flush().unwrap(); + } + thread::sleep(Duration::from_millis(100)); } bail!("Failed to stop pageserver with pid {}", pid); From e54941b8118a81b069de5613c5ef60a93a5b51ef Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 3 Aug 2022 12:25:20 +0300 Subject: [PATCH 0587/1022] treat pytest warnings as errors --- pytest.ini | 3 + .../batch_others/test_branch_and_gc.py | 2 + .../batch_others/test_pageserver_api.py | 90 ++++++++++--------- test_runner/fixtures/neon_fixtures.py | 14 +-- 4 files changed, 61 insertions(+), 48 deletions(-) diff --git a/pytest.ini b/pytest.ini index da9ab8c12f..104d0e0244 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,7 @@ [pytest] +filterwarnings = + error::pytest.PytestUnhandledThreadExceptionWarning + error::UserWarning addopts = -m 'not remote_cluster' markers = diff --git a/test_runner/batch_others/test_branch_and_gc.py b/test_runner/batch_others/test_branch_and_gc.py index 76a77357ae..8e433f65ad 100644 --- a/test_runner/batch_others/test_branch_and_gc.py +++ b/test_runner/batch_others/test_branch_and_gc.py @@ -167,3 +167,5 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): # The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC. 
with pytest.raises(Exception, match="invalid branch start lsn"): env.neon_cli.create_branch('b1', 'b0', tenant_id=tenant, ancestor_start_lsn=lsn) + + thread.join() diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 95791888a5..51df41699a 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -60,17 +60,38 @@ def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID): def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): env = neon_simple_env - client = env.pageserver.http_client() + with env.pageserver.http_client() as client: + tenant_id, timeline_id = env.neon_cli.create_tenant() - tenant_id, timeline_id = env.neon_cli.create_tenant() + timeline_details = client.timeline_detail(tenant_id=tenant_id, + timeline_id=timeline_id, + include_non_incremental_logical_size=True) - timeline_details = client.timeline_detail(tenant_id=tenant_id, - timeline_id=timeline_id, - include_non_incremental_logical_size=True) + assert timeline_details.get('wal_source_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running' + assert timeline_details.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running' + assert timeline_details.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running' - assert timeline_details.get('wal_source_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running' - assert timeline_details.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running' - assert timeline_details.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running' + +def expect_updated_msg_lsn(client: NeonPageserverHttpClient, + tenant_id: UUID, + timeline_id: UUID, + prev_msg_lsn: Optional[int]) -> int: + timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) + + # a successful `timeline_details` response must contain the below fields + local_timeline_details = timeline_details['local'] + assert "wal_source_connstr" in local_timeline_details.keys() + assert "last_received_msg_lsn" in local_timeline_details.keys() + assert "last_received_msg_ts" in local_timeline_details.keys() + + assert local_timeline_details["last_received_msg_lsn"] is not None, "the last received message's LSN is empty" + + last_msg_lsn = lsn_from_hex(local_timeline_details["last_received_msg_lsn"]) + assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, \ + f"the last received message's LSN {last_msg_lsn} hasn't been updated \ + compared to the previous message's LSN {prev_msg_lsn}" + + return last_msg_lsn # Test the WAL-receiver related fields in the response to `timeline_details` API call @@ -79,44 +100,29 @@ def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): # `timeline_details` now. 
def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): env = neon_simple_env - client = env.pageserver.http_client() + with env.pageserver.http_client() as client: + tenant_id, timeline_id = env.neon_cli.create_tenant() + pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) - tenant_id, timeline_id = env.neon_cli.create_tenant() - pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) + # Wait to make sure that we get a latest WAL receiver data. + # We need to wait here because it's possible that we don't have access to + # the latest WAL yet, when the `timeline_detail` API is first called. + # See: https://github.com/neondatabase/neon/issues/1768. + lsn = wait_until(number_of_iterations=5, + interval=1, + func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None)) - def expect_updated_msg_lsn(prev_msg_lsn: Optional[int]) -> int: - timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) - - # a successful `timeline_details` response must contain the below fields - local_timeline_details = timeline_details['local'] - assert "wal_source_connstr" in local_timeline_details.keys() - assert "last_received_msg_lsn" in local_timeline_details.keys() - assert "last_received_msg_ts" in local_timeline_details.keys() - - assert local_timeline_details["last_received_msg_lsn"] is not None, "the last received message's LSN is empty" - - last_msg_lsn = lsn_from_hex(local_timeline_details["last_received_msg_lsn"]) - assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, \ - f"the last received message's LSN {last_msg_lsn} hasn't been updated \ - compared to the previous message's LSN {prev_msg_lsn}" - - return last_msg_lsn - - # Wait to make sure that we get a latest WAL receiver data. - # We need to wait here because it's possible that we don't have access to - # the latest WAL yet, when the `timeline_detail` API is first called. - # See: https://github.com/neondatabase/neon/issues/1768. - lsn = wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(None)) - - # Make a DB modification then expect getting a new WAL receiver's data. - pg.safe_psql("CREATE TABLE t(key int primary key, value text)") - wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(lsn)) + # Make a DB modification then expect getting a new WAL receiver's data. 
+ pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + wait_until(number_of_iterations=5, + interval=1, + func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn)) def test_pageserver_http_api_client(neon_simple_env: NeonEnv): env = neon_simple_env - client = env.pageserver.http_client() - check_client(client, env.initial_tenant) + with env.pageserver.http_client() as client: + check_client(client, env.initial_tenant) def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder): @@ -125,5 +131,5 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde management_token = env.auth_keys.generate_management_token() - client = env.pageserver.http_client(auth_token=management_token) - check_client(client, env.initial_tenant) + with env.pageserver.http_client(auth_token=management_token) as client: + check_client(client, env.initial_tenant) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9b39bf2b39..3848aee05a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -222,7 +222,7 @@ def can_bind(host: str, port: int) -> bool: # moment. If that changes, we should use start using SO_REUSEADDR here # too, to allow reusing ports more quickly. # See https://github.com/neondatabase/neon/issues/801 - #sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + # sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) try: sock.bind((host, port)) @@ -231,6 +231,8 @@ def can_bind(host: str, port: int) -> bool: except socket.error: log.info(f"Port {port} is in use, skipping") return False + finally: + sock.close() class PortDistributor: @@ -2022,8 +2024,8 @@ class Safekeeper: started_at = time.time() while True: try: - http_cli = self.http_client() - http_cli.check_status() + with self.http_client() as http_cli: + http_cli.check_status() except Exception as e: elapsed = time.time() - started_at if elapsed > 3: @@ -2174,9 +2176,9 @@ class Etcd: return f'http://127.0.0.1:{self.port}' def check_status(self): - s = requests.Session() - s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry - s.get(f"{self.client_url()}/health").raise_for_status() + with requests.Session() as s: + s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry + s.get(f"{self.client_url()}/health").raise_for_status() def try_start(self): if self.handle is not None: From f7d8db7e3990a3c151859b39ef103d124a61453f Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 3 Aug 2022 17:40:01 +0300 Subject: [PATCH 0588/1022] silence https://github.com/neondatabase/neon/issues/2211 --- test_runner/batch_others/test_tenant_detach.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/test_runner/batch_others/test_tenant_detach.py b/test_runner/batch_others/test_tenant_detach.py index 2df5409b4f..afc4f89bbf 100644 --- a/test_runner/batch_others/test_tenant_detach.py +++ b/test_runner/batch_others/test_tenant_detach.py @@ -1,10 +1,19 @@ from threading import Thread from uuid import uuid4 +import uuid import psycopg2 import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException + + +def do_gc_target(env: NeonEnv, tenant_id: uuid.UUID, timeline_id: uuid.UUID): + """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" + try: 
+ env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0') + except Exception as e: + log.error("do_gc failed: %s", e) def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): @@ -36,8 +45,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {uuid4().hex} 0') # try to concurrently run gc and detach - gc_thread = Thread( - target=lambda: env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0'), ) + gc_thread = Thread(target=lambda: do_gc_target(env, tenant_id, timeline_id)) gc_thread.start() last_error = None From 1bbc8090f3d7b750d8f87bd79664bade2f7bc345 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Mon, 1 Aug 2022 12:53:52 +0200 Subject: [PATCH 0589/1022] [issue #1591] Add `neon_local pageserver status` handler --- neon_local/src/main.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index 24b40b72d6..c4dd52e183 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -910,6 +910,15 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul exit(1); } } + + Some(("status", _)) => match PageServerNode::from_env(env).check_status() { + Ok(_) => println!("Page server is up and running"), + Err(err) => { + eprintln!("Page server is not available: {}", err); + exit(1); + } + }, + Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name), None => bail!("no pageserver subcommand provided"), } From 0a958b0ea1da2cf9f097cdb45b6e1740fccc4484 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 4 Aug 2022 11:39:52 +0000 Subject: [PATCH 0590/1022] Check find_end_of_wal errors instead of unwrap --- libs/postgres_ffi/src/xlog_utils.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 520870cc53..8cdfd92fc1 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -16,7 +16,7 @@ use crate::XLogRecord; use crate::XLOG_PAGE_MAGIC; use crate::pg_constants::WAL_SEGMENT_SIZE; -use anyhow::{bail, ensure}; +use anyhow::{anyhow, bail, ensure}; use byteorder::{ByteOrder, LittleEndian}; use bytes::BytesMut; use bytes::{Buf, Bytes}; @@ -159,7 +159,7 @@ fn find_end_of_wal_segment( let mut buf = [0u8; XLOG_BLCKSZ]; let file_name = XLogFileName(tli, segno, wal_seg_size); let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record - let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap(); + let mut file = File::open(data_dir.join(file_name.clone() + ".partial"))?; file.seek(SeekFrom::Start(offs as u64))?; // xl_crc is the last field in XLogRecord, will not be read into rec_hdr const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD); @@ -396,10 +396,13 @@ pub fn find_end_of_wal( let mut high_tli: TimeLineID = 0; let mut high_ispartial = false; - for entry in fs::read_dir(data_dir).unwrap().flatten() { + for entry in fs::read_dir(data_dir)?.flatten() { let ispartial: bool; let entry_name = entry.file_name(); - let fname = entry_name.to_str().unwrap(); + let fname = entry_name + .to_str() + .ok_or_else(|| anyhow!("Invalid file name"))?; + /* * Check if the filename looks like an xlog file, or a .partial file. 
*/ @@ -411,7 +414,7 @@ pub fn find_end_of_wal( continue; } let (segno, tli) = XLogFromFileName(fname, wal_seg_size); - if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 { + if !ispartial && entry.metadata()?.len() != wal_seg_size as u64 { continue; } if segno > high_segno From 4cb1074fe59637b1d2d3afc426f225384621b266 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 5 Aug 2022 13:44:57 +0100 Subject: [PATCH 0591/1022] github/workflows: Fix git dubious ownership (#2223) --- .github/actions/run-python-test-set/action.yml | 2 +- .github/workflows/build_and_test.yml | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index fcc8983e40..41f68d63e1 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -85,7 +85,7 @@ runs: PLATFORM: github-actions-selfhosted AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} - shell: bash -euxo pipefail {0} {0} + shell: bash -euxo pipefail {0} run: | PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)" rm -rf $PERF_REPORT_DIR diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4e784f0920..d28da92d11 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -35,6 +35,16 @@ jobs: GIT_VERSION: ${{ github.sha }} steps: + - name: Fix git ownerwhip + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + - name: Checkout uses: actions/checkout@v3 with: From 5133db44e191e71b99020f38e2bfe14966daff89 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 5 Aug 2022 16:28:59 +0300 Subject: [PATCH 0592/1022] Move relation size cache from WalIngest to DatadirTimeline (#2094) * Move relation sie cache to layered timeline * Fix obtaining current LSN for relation size cache * Resolve merge conflicts * Resolve merge conflicts * Reestore 'lsn' field in DatadirModification * adjust DatadirModification lsn in ingest_record * Fix formatting * Pass lsn to get_relsize * Fix merge conflict * Update pageserver/src/pgdatadir_mapping.rs Co-authored-by: Heikki Linnakangas * Update pageserver/src/pgdatadir_mapping.rs Co-authored-by: Heikki Linnakangas Co-authored-by: Heikki Linnakangas --- pageserver/src/import_datadir.rs | 16 +-- pageserver/src/layered_repository/timeline.rs | 47 ++++++- pageserver/src/pgdatadir_mapping.rs | 68 +++++++-- pageserver/src/walingest.rs | 133 ++++++++---------- .../src/walreceiver/walreceiver_connection.rs | 2 +- 5 files changed, 164 insertions(+), 102 deletions(-) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index ccfd83400a..7d1e8e43aa 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -37,7 +37,7 @@ pub fn import_timeline_from_postgres_datadir( // TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn) // Then fishing out pg_control would be unnecessary - let mut modification = tline.begin_modification(); + let mut modification = tline.begin_modification(lsn); 
modification.init_empty()?; // Import all but pg_wal @@ -56,12 +56,12 @@ pub fn import_timeline_from_postgres_datadir( if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? { pg_control = Some(control_file); } - modification.flush(lsn)?; + modification.flush()?; } } // We're done importing all the data files. - modification.commit(lsn)?; + modification.commit()?; // We expect the Postgres server to be shut down cleanly. let pg_control = pg_control.context("pg_control file not found")?; @@ -267,7 +267,7 @@ fn import_wal( waldecoder.feed_bytes(&buf); let mut nrecords = 0; - let mut modification = tline.begin_modification(); + let mut modification = tline.begin_modification(endpoint); let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { @@ -301,7 +301,7 @@ pub fn import_basebackup_from_tar( base_lsn: Lsn, ) -> Result<()> { info!("importing base at {}", base_lsn); - let mut modification = tline.begin_modification(); + let mut modification = tline.begin_modification(base_lsn); modification.init_empty()?; let mut pg_control: Option = None; @@ -319,7 +319,7 @@ pub fn import_basebackup_from_tar( // We found the pg_control file. pg_control = Some(res); } - modification.flush(base_lsn)?; + modification.flush()?; } tar::EntryType::Directory => { debug!("directory {:?}", file_path); @@ -333,7 +333,7 @@ pub fn import_basebackup_from_tar( // sanity check: ensure that pg_control is loaded let _pg_control = pg_control.context("pg_control file not found")?; - modification.commit(base_lsn)?; + modification.commit()?; Ok(()) } @@ -385,7 +385,7 @@ pub fn import_wal_from_tar( waldecoder.feed_bytes(&bytes[offset..]); - let mut modification = tline.begin_modification(); + let mut modification = tline.begin_modification(end_lsn); let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 6ed1efd3d1..095f3d3861 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -8,7 +8,7 @@ use lazy_static::lazy_static; use tracing::*; use std::cmp::{max, min, Ordering}; -use std::collections::HashSet; +use std::collections::{hash_map::Entry, HashMap, HashSet}; use std::fs; use std::fs::{File, OpenOptions}; use std::io::Write; @@ -38,7 +38,9 @@ use crate::layered_repository::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::pgdatadir_mapping::BlockNumber; use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; use crate::DatadirTimeline; @@ -295,6 +297,9 @@ pub struct LayeredTimeline { /// or None if WAL receiver has not received anything for this timeline /// yet. pub last_received_wal: Mutex>, + + /// Relation size cache + rel_size_cache: RwLock>, } pub struct WalReceiverInfo { @@ -306,7 +311,42 @@ pub struct WalReceiverInfo { /// Inherit all the functions from DatadirTimeline, to provide the /// functionality to store PostgreSQL relations, SLRUs, etc. in a /// LayeredTimeline. 
-impl DatadirTimeline for LayeredTimeline {} +impl DatadirTimeline for LayeredTimeline { + fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option { + let rel_size_cache = self.rel_size_cache.read().unwrap(); + if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) { + if lsn >= *cached_lsn { + return Some(*nblocks); + } + } + None + } + + fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { + let mut rel_size_cache = self.rel_size_cache.write().unwrap(); + match rel_size_cache.entry(tag) { + Entry::Occupied(mut entry) => { + let cached_lsn = entry.get_mut(); + if lsn >= cached_lsn.0 { + *cached_lsn = (lsn, nblocks); + } + } + Entry::Vacant(entry) => { + entry.insert((lsn, nblocks)); + } + } + } + + fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { + let mut rel_size_cache = self.rel_size_cache.write().unwrap(); + rel_size_cache.insert(tag, (lsn, nblocks)); + } + + fn remove_cached_rel_size(&self, tag: &RelTag) { + let mut rel_size_cache = self.rel_size_cache.write().unwrap(); + rel_size_cache.remove(tag); + } +} /// /// Information about how much history needs to be retained, needed by @@ -377,8 +417,6 @@ impl Timeline for LayeredTimeline { /// Look up the value with the given a key fn get(&self, key: Key, lsn: Lsn) -> Result { - debug_assert!(lsn <= self.get_last_record_lsn()); - // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image // and requested LSN. The cached image can also be used to reduce the amount of WAL needed @@ -618,6 +656,7 @@ impl LayeredTimeline { repartition_threshold: 0, last_received_wal: Mutex::new(None), + rel_size_cache: RwLock::new(HashMap::new()), }; result.repartition_threshold = result.get_checkpoint_distance() / 10; result diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 61aca8d4ba..9097a08d05 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -56,13 +56,16 @@ pub trait DatadirTimeline: Timeline { /// This provides a transaction-like interface to perform a bunch /// of modifications atomically. /// - /// To ingest a WAL record, call begin_modification() to get a + /// To ingest a WAL record, call begin_modification(lsn) to get a /// DatadirModification object. Use the functions in the object to /// modify the repository state, updating all the pages and metadata - /// that the WAL record affects. When you're done, call commit(lsn) to - /// commit the changes. All the changes will be stamped with the specified LSN. + /// that the WAL record affects. When you're done, call commit() to + /// commit the changes. /// - /// Calling commit(lsn) will flush all the changes and reset the state, + /// Lsn stored in modification is advanced by `ingest_record` and + /// is used by `commit()` to update `last_record_lsn`. + /// + /// Calling commit() will flush all the changes and reset the state, /// so the `DatadirModification` struct can be reused to perform the next modification. /// /// Note that any pending modifications you make through the @@ -70,7 +73,7 @@ pub trait DatadirTimeline: Timeline { /// functions of the timeline until you finish! And if you update the /// same page twice, the last update wins. 
/// - fn begin_modification(&self) -> DatadirModification + fn begin_modification(&self, lsn: Lsn) -> DatadirModification where Self: Sized, { @@ -79,6 +82,7 @@ pub trait DatadirTimeline: Timeline { pending_updates: HashMap::new(), pending_deletions: Vec::new(), pending_nblocks: 0, + lsn, } } @@ -120,6 +124,10 @@ pub trait DatadirTimeline: Timeline { fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); + if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { + return Ok(nblocks); + } + if (tag.forknum == pg_constants::FSM_FORKNUM || tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM) && !self.get_rel_exists(tag, lsn)? @@ -133,13 +141,21 @@ pub trait DatadirTimeline: Timeline { let key = rel_size_to_key(tag); let mut buf = self.get(key, lsn)?; - Ok(buf.get_u32_le()) + let nblocks = buf.get_u32_le(); + + // Update relation size cache + self.update_cached_rel_size(tag, lsn, nblocks); + Ok(nblocks) } /// Does relation exist? fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); + // first try to lookup relation in cache + if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) { + return Ok(true); + } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); let buf = self.get(key, lsn)?; @@ -445,6 +461,18 @@ pub trait DatadirTimeline: Timeline { Ok(result.to_keyspace()) } + + /// Get cached size of relation if it not updated after specified LSN + fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option; + + /// Update cached relation size if there is no more recent update + fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber); + + /// Store cached relation size + fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber); + + /// Remove cached relation size + fn remove_cached_rel_size(&self, tag: &RelTag); } /// DatadirModification represents an operation to ingest an atomic set of @@ -457,6 +485,9 @@ pub struct DatadirModification<'a, T: DatadirTimeline> { /// in the state in 'tline' yet. pub tline: &'a T, + /// Lsn assigned by begin_modification + pub lsn: Lsn, + // The modifications are not applied directly to the underlying key-value store. // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. @@ -666,9 +697,11 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { self.pending_nblocks += nblocks as isize; + // Update relation size cache + self.tline.set_cached_rel_size(rel, self.lsn, nblocks); + // Even if nblocks > 0, we don't insert any actual blocks here. That's up to the // caller. - Ok(()) } @@ -684,6 +717,9 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { let buf = nblocks.to_le_bytes(); self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + // Update relation size cache + self.tline.set_cached_rel_size(rel, self.lsn, nblocks); + // Update logical database size. 
self.pending_nblocks -= old_size as isize - nblocks as isize; Ok(()) @@ -703,6 +739,9 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { let buf = nblocks.to_le_bytes(); self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + // Update relation size cache + self.tline.set_cached_rel_size(rel, self.lsn, nblocks); + self.pending_nblocks += nblocks as isize - old_size as isize; } Ok(()) @@ -728,6 +767,9 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { let old_size = self.get(size_key)?.get_u32_le(); self.pending_nblocks -= old_size as isize; + // Remove enty from relation size cache + self.tline.remove_cached_rel_size(&rel); + // Delete size entry, as well as all blocks self.delete(rel_key_range(rel)); @@ -842,7 +884,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub fn flush(&mut self, lsn: Lsn) -> Result<()> { + pub fn flush(&mut self) -> Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; @@ -856,7 +898,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { let mut result: Result<()> = Ok(()); self.pending_updates.retain(|&key, value| { if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) { - result = writer.put(key, lsn, value); + result = writer.put(key, self.lsn, value); false } else { true @@ -877,9 +919,9 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { /// underlying timeline. /// All the modifications in this atomic update are stamped by the specified LSN. /// - pub fn commit(&mut self, lsn: Lsn) -> Result<()> { + pub fn commit(&mut self) -> Result<()> { let writer = self.tline.writer(); - + let lsn = self.lsn; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; @@ -1324,9 +1366,9 @@ pub fn create_test_timeline( timeline_id: utils::zid::ZTimelineId, ) -> Result> { let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; - m.commit(Lsn(8))?; + m.commit()?; Ok(tline) } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 8dd14ec177..b8064849e0 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -30,8 +30,6 @@ use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; -use std::collections::HashMap; - use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::walrecord::*; @@ -48,8 +46,6 @@ pub struct WalIngest<'a, T: DatadirTimeline> { checkpoint: CheckPoint, checkpoint_modified: bool, - - relsize_cache: HashMap, } impl<'a, T: DatadirTimeline> WalIngest<'a, T> { @@ -64,13 +60,13 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { timeline, checkpoint, checkpoint_modified: false, - relsize_cache: HashMap::new(), }) } /// /// Decode a PostgreSQL WAL record and store it in the repository, in the given timeline. /// + /// This function updates `lsn` field of `DatadirModification` /// /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the /// relations/pages that the record affects. 
@@ -82,6 +78,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, ) -> Result<()> { + modification.lsn = lsn; decode_wal_record(recdata, decoded).context("failed decoding wal record")?; let mut buf = decoded.record.clone(); @@ -260,7 +257,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - modification.commit(lsn)?; + modification.commit()?; Ok(()) } @@ -408,7 +405,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel)?; + let vm_size = self.get_relsize(vm_rel, modification.lsn)?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -880,7 +877,6 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { modification: &mut DatadirModification, rel: RelTag, ) -> Result<()> { - self.relsize_cache.insert(rel, 0); modification.put_rel_creation(rel, 0)?; Ok(()) } @@ -916,7 +912,6 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { nblocks: BlockNumber, ) -> Result<()> { modification.put_rel_truncation(rel, nblocks)?; - self.relsize_cache.insert(rel, nblocks); Ok(()) } @@ -926,23 +921,16 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { rel: RelTag, ) -> Result<()> { modification.put_rel_drop(rel)?; - self.relsize_cache.remove(&rel); Ok(()) } - fn get_relsize(&mut self, rel: RelTag) -> Result { - if let Some(nblocks) = self.relsize_cache.get(&rel) { - Ok(*nblocks) + fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result { + let nblocks = if !self.timeline.get_rel_exists(rel, lsn)? { + 0 } else { - let last_lsn = self.timeline.get_last_record_lsn(); - let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? { - 0 - } else { - self.timeline.get_rel_size(rel, last_lsn)? - }; - self.relsize_cache.insert(rel, nblocks); - Ok(nblocks) - } + self.timeline.get_rel_size(rel, lsn)? + }; + Ok(nblocks) } fn handle_rel_extend( @@ -952,22 +940,16 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { blknum: BlockNumber, ) -> Result<()> { let new_nblocks = blknum + 1; - let old_nblocks = if let Some(nblocks) = self.relsize_cache.get(&rel) { - *nblocks + // Check if the relation exists. We implicitly create relations on first + // record. + // TODO: would be nice if to be more explicit about it + let last_lsn = modification.lsn; + let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? { + // create it with 0 size initially, the logic below will extend it + modification.put_rel_creation(rel, 0)?; + 0 } else { - // Check if the relation exists. We implicitly create relations on first - // record. - // TODO: would be nice if to be more explicit about it - let last_lsn = self.timeline.get_last_record_lsn(); - let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? { - // create it with 0 size initially, the logic below will extend it - modification.put_rel_creation(rel, 0)?; - 0 - } else { - self.timeline.get_rel_size(rel, last_lsn)? - }; - self.relsize_cache.insert(rel, nblocks); - nblocks + self.timeline.get_rel_size(rel, last_lsn)? 
}; if new_nblocks > old_nblocks { @@ -978,7 +960,6 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { for gap_blknum in old_nblocks..blknum { modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?; } - self.relsize_cache.insert(rel, new_nblocks); } Ok(()) } @@ -1069,10 +1050,10 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); fn init_walingest_test(tline: &T) -> Result> { - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file - m.commit(Lsn(0x10))?; + m.commit()?; let walingest = WalIngest::new(tline, Lsn(0x10))?; Ok(walingest) @@ -1084,19 +1065,19 @@ mod tests { let tline = create_test_timeline(repo, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(0x20)); walingest.put_rel_creation(&mut m, TESTREL_A)?; walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; - m.commit(Lsn(0x20))?; - let mut m = tline.begin_modification(); + m.commit()?; + let mut m = tline.begin_modification(Lsn(0x30)); walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?; - m.commit(Lsn(0x30))?; - let mut m = tline.begin_modification(); + m.commit()?; + let mut m = tline.begin_modification(Lsn(0x40)); walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?; - m.commit(Lsn(0x40))?; - let mut m = tline.begin_modification(); + m.commit()?; + let mut m = tline.begin_modification(Lsn(0x50)); walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?; - m.commit(Lsn(0x50))?; + m.commit()?; assert_current_logical_size(&*tline, Lsn(0x50)); @@ -1142,9 +1123,9 @@ mod tests { ); // Truncate last block - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(0x60)); walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?; - m.commit(Lsn(0x60))?; + m.commit()?; assert_current_logical_size(&*tline, Lsn(0x60)); // Check reported size and contents after truncation @@ -1166,15 +1147,15 @@ mod tests { ); // Truncate to zero length - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(0x68)); walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?; - m.commit(Lsn(0x68))?; + m.commit()?; assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68))?, 0); // Extend from 0 to 2 blocks, leaving a gap - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(0x70)); walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?; - m.commit(Lsn(0x70))?; + m.commit()?; assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70))?, 2); assert_eq!( tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, @@ -1186,9 +1167,9 @@ mod tests { ); // Extend a lot more, leaving a big gap that spans across segments - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(0x80)); walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?; - m.commit(Lsn(0x80))?; + m.commit()?; assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, 1501); for blk in 2..1500 { assert_eq!( @@ -1212,18 +1193,18 @@ mod tests { let tline = create_test_timeline(repo, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(0x20)); walingest.put_rel_page_image(&mut m, 
TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; - m.commit(Lsn(0x20))?; + m.commit()?; // Check that rel exists and size is correct assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1); // Drop rel - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(0x30)); walingest.put_rel_drop(&mut m, TESTREL_A)?; - m.commit(Lsn(0x30))?; + m.commit()?; // Check that rel is not visible anymore assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false); @@ -1232,9 +1213,9 @@ mod tests { //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30))?.is_none()); // Re-create it - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(0x40)); walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?; - m.commit(Lsn(0x40))?; + m.commit()?; // Check that rel exists and size is correct assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true); @@ -1254,12 +1235,12 @@ mod tests { // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(0x20)); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; } - m.commit(Lsn(0x20))?; + m.commit()?; // The relation was created at LSN 20, not visible at LSN 1 yet. assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); @@ -1280,9 +1261,9 @@ mod tests { // Truncate relation so that second segment was dropped // - only leave one page - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(0x60)); walingest.put_rel_truncation(&mut m, TESTREL_A, 1)?; - m.commit(Lsn(0x60))?; + m.commit()?; // Check reported size and contents after truncation assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 1); @@ -1310,12 +1291,12 @@ mod tests { // Extend relation again. 
// Add enough blocks to create second segment let lsn = Lsn(0x80); - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(lsn); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; } - m.commit(lsn)?; + m.commit()?; assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true); assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, relsize); @@ -1343,10 +1324,10 @@ mod tests { let mut lsn = 0x10; for blknum in 0..pg_constants::RELSEG_SIZE + 1 { lsn += 0x10; - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(lsn)); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?; - m.commit(Lsn(lsn))?; + m.commit()?; } assert_current_logical_size(&*tline, Lsn(lsn)); @@ -1358,9 +1339,9 @@ mod tests { // Truncate one block lsn += 0x10; - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(lsn)); walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE)?; - m.commit(Lsn(lsn))?; + m.commit()?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, pg_constants::RELSEG_SIZE @@ -1369,9 +1350,9 @@ mod tests { // Truncate another block lsn += 0x10; - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(lsn)); walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE - 1)?; - m.commit(Lsn(lsn))?; + m.commit()?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, pg_constants::RELSEG_SIZE - 1 @@ -1383,9 +1364,9 @@ mod tests { let mut size: i32 = 3000; while size >= 0 { lsn += 0x10; - let mut m = tline.begin_modification(); + let mut m = tline.begin_modification(Lsn(lsn)); walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?; - m.commit(Lsn(lsn))?; + m.commit()?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn))?, size as BlockNumber diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index fbd9ccd3c5..c4e66bdb95 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -154,7 +154,7 @@ pub async fn handle_walreceiver_connection( { let mut decoded = DecodedWALRecord::default(); - let mut modification = timeline.begin_modification(); + let mut modification = timeline.begin_modification(endlsn); while let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ // let _enter = info_span!("processing record", lsn = %lsn).entered(); From 84d1bc06a93de64488a6da7b06c471a9505076e8 Mon Sep 17 00:00:00 2001 From: Ankur Srivastava Date: Fri, 5 Aug 2022 19:34:04 +0200 Subject: [PATCH 0593/1022] refactor: replace lazy-static with once-cell (#2195) - Replacing all the occurrences of lazy-static with `once-cell::sync::Lazy` - fixes #1147 Signed-off-by: Ankur Srivastava --- Cargo.lock | 12 ++-- control_plane/Cargo.toml | 2 +- control_plane/src/postgresql_conf.rs | 14 ++-- libs/etcd_broker/Cargo.toml | 2 +- libs/metrics/Cargo.toml | 2 +- libs/metrics/src/lib.rs | 17 +++-- libs/metrics/src/wrappers.rs | 16 ++--- libs/postgres_ffi/Cargo.toml | 2 +- libs/postgres_ffi/src/relfile_utils.rs | 13 ++-- libs/postgres_ffi/wal_craft/Cargo.toml | 2 +- libs/remote_storage/Cargo.toml | 2 +- libs/utils/Cargo.toml | 3 +- libs/utils/src/http/endpoint.rs | 10 +-- libs/utils/tests/ssl_test.rs | 21 +++--- pageserver/Cargo.toml | 3 +- pageserver/src/layered_repository/block_io.rs | 6 +- .../src/layered_repository/ephemeral_file.rs | 16 ++--- .../src/layered_repository/layer_map.rs | 11 ++-- pageserver/src/layered_repository/timeline.rs | 64 ++++++++++--------- pageserver/src/lib.rs | 10 +-- pageserver/src/page_service.rs | 10 +-- pageserver/src/repository.rs | 13 ++-- pageserver/src/storage_sync.rs | 34 +++++----- pageserver/src/storage_sync/upload.rs | 10 +-- pageserver/src/tenant_mgr.rs | 16 +++-- pageserver/src/thread_mgr.rs | 17 +++-- pageserver/src/virtual_file.rs | 19 +++--- pageserver/src/walredo.rs | 26 +++++--- proxy/Cargo.toml | 2 +- proxy/src/auth/backend.rs | 7 +- proxy/src/proxy.rs | 24 ++++--- safekeeper/Cargo.toml | 3 +- safekeeper/src/control_file.rs | 10 +-- safekeeper/src/timeline.rs | 10 +-- safekeeper/src/wal_storage.rs | 37 +++++++---- 35 files changed, 246 insertions(+), 220 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4a78b2e504..d850d3bd89 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -495,8 +495,8 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "lazy_static", "nix", + "once_cell", "pageserver", "postgres", "regex", @@ -1591,8 +1591,8 @@ dependencies = [ name = "metrics" version = "0.1.0" dependencies = [ - "lazy_static", "libc", + "once_cell", "prometheus", "workspace_hack", ] @@ -1870,7 +1870,6 @@ dependencies = [ "humantime-serde", "hyper", "itertools", - "lazy_static", "metrics", "nix", "once_cell", @@ -2116,9 +2115,9 @@ dependencies = [ "crc32c", "env_logger", "hex", - "lazy_static", "log", "memoffset", + "once_cell", "postgres", "rand", "regex", @@ -2278,9 +2277,9 @@ dependencies = [ "hex", "hmac 0.12.1", "hyper", - "lazy_static", "md5", "metrics", + "once_cell", "parking_lot 0.12.1", "pin-project-lite", "rand", @@ -2754,7 +2753,6 @@ dependencies = [ "hex", "humantime", "hyper", - "lazy_static", "metrics", "once_cell", "postgres", @@ -3671,9 +3669,9 @@ dependencies = [ "hex-literal", "hyper", "jsonwebtoken", - "lazy_static", "metrics", "nix", + "once_cell", "pin-project-lite", "postgres", "postgres-protocol", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 26bb577636..425eb332c3 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -9,7 +9,7 @@ postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8 serde = { version = "1.0", features = ["derive"] } serde_with = "1.12.0" toml = "0.5" -lazy_static = "1.4" +once_cell = "1.13.0" regex = "1" anyhow = "1.0" thiserror = "1" diff --git a/control_plane/src/postgresql_conf.rs 
b/control_plane/src/postgresql_conf.rs index 83765b2c95..a71108da01 100644 --- a/control_plane/src/postgresql_conf.rs +++ b/control_plane/src/postgresql_conf.rs @@ -5,7 +5,7 @@ /// enough to extract a few settings we need in Zenith, assuming you don't do /// funny stuff like include-directives or funny escaping. use anyhow::{bail, Context, Result}; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use regex::Regex; use std::collections::HashMap; use std::fmt; @@ -19,9 +19,7 @@ pub struct PostgresConf { hash: HashMap, } -lazy_static! { - static ref CONF_LINE_RE: Regex = Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap(); -} +static CONF_LINE_RE: Lazy = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap()); impl PostgresConf { pub fn new() -> PostgresConf { @@ -139,10 +137,10 @@ fn escape_str(s: &str) -> String { // // This regex is a bit more conservative than the rules in guc-file.l, so we quote some // strings that PostgreSQL would accept without quoting, but that's OK. - lazy_static! { - static ref UNQUOTED_RE: Regex = - Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap(); - } + + static UNQUOTED_RE: Lazy = + Lazy::new(|| Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap()); + if UNQUOTED_RE.is_match(s) { s.to_string() } else { diff --git a/libs/etcd_broker/Cargo.toml b/libs/etcd_broker/Cargo.toml index 49be7ad207..f7bfbad4ba 100644 --- a/libs/etcd_broker/Cargo.toml +++ b/libs/etcd_broker/Cargo.toml @@ -9,7 +9,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_with = "1.12.0" - once_cell = "1.8.0" + once_cell = "1.13.0" utils = { path = "../utils" } workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index 2879dfed81..d0cd46d2a9 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -6,5 +6,5 @@ edition = "2021" [dependencies] prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency libc = "0.2" -lazy_static = "1.4" +once_cell = "1.13.0" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index ea24b3fe7e..920d3fd17e 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -2,7 +2,7 @@ //! make sure that we use the same dep version everywhere. //! Otherwise, we might not see all metrics registered via //! a default registry. -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec}; pub use prometheus::opts; pub use prometheus::register; @@ -41,19 +41,22 @@ pub fn gather() -> Vec { prometheus::gather() } -lazy_static! 
{ - static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!( +static DISK_IO_BYTES: Lazy = Lazy::new(|| { + register_int_gauge_vec!( "libmetrics_disk_io_bytes_total", "Bytes written and read from disk, grouped by the operation (read|write)", &["io_operation"] ) - .expect("Failed to register disk i/o bytes int gauge vec"); - static ref MAXRSS_KB: IntGauge = register_int_gauge!( + .expect("Failed to register disk i/o bytes int gauge vec") +}); + +static MAXRSS_KB: Lazy = Lazy::new(|| { + register_int_gauge!( "libmetrics_maxrss_kb", "Memory usage (Maximum Resident Set Size)" ) - .expect("Failed to register maxrss_kb int gauge"); -} + .expect("Failed to register maxrss_kb int gauge") +}); pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, diff --git a/libs/metrics/src/wrappers.rs b/libs/metrics/src/wrappers.rs index de334add99..1bf1ea0753 100644 --- a/libs/metrics/src/wrappers.rs +++ b/libs/metrics/src/wrappers.rs @@ -10,13 +10,13 @@ use std::io::{Read, Result, Write}; /// # use std::io::{Result, Read}; /// # use metrics::{register_int_counter, IntCounter}; /// # use metrics::CountedReader; +/// # use once_cell::sync::Lazy; /// # -/// # lazy_static::lazy_static! { -/// # static ref INT_COUNTER: IntCounter = register_int_counter!( +/// # static INT_COUNTER: Lazy = Lazy::new( || { register_int_counter!( /// # "int_counter", /// # "let's count something!" -/// # ).unwrap(); -/// # } +/// # ).unwrap() +/// # }); /// # /// fn do_some_reads(stream: impl Read, count: usize) -> Result> { /// let mut reader = CountedReader::new(stream, |cnt| { @@ -85,13 +85,13 @@ impl Read for CountedReader<'_, T> { /// # use std::io::{Result, Write}; /// # use metrics::{register_int_counter, IntCounter}; /// # use metrics::CountedWriter; +/// # use once_cell::sync::Lazy; /// # -/// # lazy_static::lazy_static! { -/// # static ref INT_COUNTER: IntCounter = register_int_counter!( +/// # static INT_COUNTER: Lazy = Lazy::new( || { register_int_counter!( /// # "int_counter", /// # "let's count something!" -/// # ).unwrap(); -/// # } +/// # ).unwrap() +/// # }); /// # /// fn do_some_writes(stream: impl Write, payload: &[u8]) -> Result<()> { /// let mut writer = CountedWriter::new(stream, |cnt| { diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index c9cc858ab9..0118701a7e 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -12,7 +12,7 @@ byteorder = "1.4.3" anyhow = "1.0" crc32c = "0.6.0" hex = "0.4.3" -lazy_static = "1.4" +once_cell = "1.13.0" log = "0.4.14" memoffset = "0.6.2" thiserror = "1.0" diff --git a/libs/postgres_ffi/src/relfile_utils.rs b/libs/postgres_ffi/src/relfile_utils.rs index 97c8f0afea..94498ee9a9 100644 --- a/libs/postgres_ffi/src/relfile_utils.rs +++ b/libs/postgres_ffi/src/relfile_utils.rs @@ -2,7 +2,7 @@ //! Common utilities for dealing with PostgreSQL relation files. //! use crate::pg_constants; -use lazy_static::lazy_static; +use once_cell::sync::OnceCell; use regex::Regex; #[derive(Debug, Clone, thiserror::Error, PartialEq)] @@ -54,11 +54,14 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> { /// See functions relpath() and _mdfd_segpath() in PostgreSQL sources. /// pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> { - lazy_static! 
{ - static ref RELFILE_RE: Regex = - Regex::new(r"^(?P\d+)(_(?P[a-z]+))?(\.(?P\d+))?$").unwrap(); - } + static RELFILE_RE: OnceCell = OnceCell::new(); + RELFILE_RE.get_or_init(|| { + Regex::new(r"^(?P\d+)(_(?P[a-z]+))?(\.(?P\d+))?$").unwrap() + }); + let caps = RELFILE_RE + .get() + .unwrap() .captures(fname) .ok_or(FilePathError::InvalidFileName)?; diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index 374c8e2e55..114f08113b 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -10,7 +10,7 @@ anyhow = "1.0" clap = "3.0" env_logger = "0.9" log = "0.4" -once_cell = "1.8.0" +once_cell = "1.13.0" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres_ffi = { path = "../" } tempfile = "3.2" diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index b11b3cf371..b3485f274a 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -7,7 +7,7 @@ edition = "2021" anyhow = { version = "1.0", features = ["backtrace"] } async-trait = "0.1" metrics = { version = "0.1", path = "../metrics" } -once_cell = "1.8.0" +once_cell = "1.13.0" rusoto_core = "0.48" rusoto_s3 = "0.48" serde = { version = "1.0", features = ["derive"] } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index d83b02d7ae..e3e78ec68f 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -8,7 +8,6 @@ anyhow = "1.0" bincode = "1.3" bytes = "1.0.1" hyper = { version = "0.14.7", features = ["full"] } -lazy_static = "1.4.0" pin-project-lite = "0.2.7" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } @@ -28,6 +27,8 @@ rustls = "0.20.2" rustls-split = "0.3.0" git-version = "0.3.5" serde_with = "1.12.0" +once_cell = "1.13.0" + metrics = { path = "../metrics" } workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 51bff5f6eb..69bf5ef87a 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -4,8 +4,8 @@ use crate::zid::ZTenantId; use anyhow::anyhow; use hyper::header::AUTHORIZATION; use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server}; -use lazy_static::lazy_static; use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; +use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::RequestInfo; use routerify::{Middleware, Router, RouterBuilder, RouterService}; @@ -16,13 +16,13 @@ use std::net::TcpListener; use super::error::ApiError; -lazy_static! 
{ - static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!( +static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { + register_int_counter!( "libmetrics_metric_handler_requests_total", "Number of metric requests made" ) - .expect("failed to define a metric"); -} + .expect("failed to define a metric") +}); async fn logger(res: Response, info: RequestInfo) -> Result, ApiError> { info!("{} {} {}", info.method(), info.uri().path(), res.status(),); diff --git a/libs/utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs index 002361667b..907ef98aec 100644 --- a/libs/utils/tests/ssl_test.rs +++ b/libs/utils/tests/ssl_test.rs @@ -7,7 +7,7 @@ use std::{ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use utils::postgres_backend::{AuthType, Handler, PostgresBackend}; @@ -19,16 +19,15 @@ fn make_tcp_pair() -> (TcpStream, TcpStream) { (server_stream, client_stream) } -lazy_static! { - static ref KEY: rustls::PrivateKey = { - let mut cursor = Cursor::new(include_bytes!("key.pem")); - rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone()) - }; - static ref CERT: rustls::Certificate = { - let mut cursor = Cursor::new(include_bytes!("cert.pem")); - rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone()) - }; -} +static KEY: Lazy = Lazy::new(|| { + let mut cursor = Cursor::new(include_bytes!("key.pem")); + rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone()) +}); + +static CERT: Lazy = Lazy::new(|| { + let mut cursor = Cursor::new(include_bytes!("cert.pem")); + rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone()) +}); #[test] fn ssl() { diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 215fa151a0..63a2263ae0 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -21,7 +21,6 @@ futures = "0.3.13" hex = "0.4.3" hyper = "0.14" itertools = "0.10.3" -lazy_static = "1.4.0" clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } @@ -48,7 +47,7 @@ tracing = "0.1.27" signal-hook = "0.3.10" url = "2" nix = "0.23" -once_cell = "1.8.0" +once_cell = "1.13.0" crossbeam-utils = "0.8.5" fail = "0.5.0" git-version = "0.3.5" diff --git a/pageserver/src/layered_repository/block_io.rs b/pageserver/src/layered_repository/block_io.rs index d027b2f0e7..bc3bc082a0 100644 --- a/pageserver/src/layered_repository/block_io.rs +++ b/pageserver/src/layered_repository/block_io.rs @@ -5,7 +5,7 @@ use crate::page_cache; use crate::page_cache::{ReadBufResult, PAGE_SZ}; use bytes::Bytes; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use std::ops::{Deref, DerefMut}; use std::os::unix::fs::FileExt; use std::sync::atomic::AtomicU64; @@ -117,9 +117,7 @@ where } } -lazy_static! { - static ref NEXT_ID: AtomicU64 = AtomicU64::new(1); -} +static NEXT_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); /// An adapter for reading a (virtual) file using the page cache. 
/// diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/layered_repository/ephemeral_file.rs index 299bb4e873..1776946e7a 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/layered_repository/ephemeral_file.rs @@ -8,7 +8,7 @@ use crate::page_cache; use crate::page_cache::PAGE_SZ; use crate::page_cache::{ReadBufResult, WriteBufResult}; use crate::virtual_file::VirtualFile; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use std::cmp::min; use std::collections::HashMap; use std::fs::OpenOptions; @@ -21,15 +21,15 @@ use utils::zid::{ZTenantId, ZTimelineId}; use std::os::unix::fs::FileExt; -lazy_static! { - /// - /// This is the global cache of file descriptors (File objects). - /// - static ref EPHEMERAL_FILES: RwLock = RwLock::new(EphemeralFiles { +/// +/// This is the global cache of file descriptors (File objects). +/// +static EPHEMERAL_FILES: Lazy> = Lazy::new(|| { + RwLock::new(EphemeralFiles { next_file_id: 1, files: HashMap::new(), - }); -} + }) +}); pub struct EphemeralFiles { next_file_id: u64, diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index be590c88c2..8363d6314f 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -15,19 +15,18 @@ use crate::layered_repository::storage_layer::Layer; use crate::layered_repository::storage_layer::{range_eq, range_overlaps}; use crate::repository::Key; use anyhow::Result; -use lazy_static::lazy_static; use metrics::{register_int_gauge, IntGauge}; +use once_cell::sync::Lazy; use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; use tracing::*; use utils::lsn::Lsn; -lazy_static! { - static ref NUM_ONDISK_LAYERS: IntGauge = - register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk") - .expect("failed to define a metric"); -} +static NUM_ONDISK_LAYERS: Lazy = Lazy::new(|| { + register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk") + .expect("failed to define a metric") +}); /// /// LayerMap tracks what layers exist on a timeline. diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 095f3d3861..181adc2bcc 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -4,7 +4,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::Bytes; use fail::fail_point; use itertools::Itertools; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use tracing::*; use std::cmp::{max, min, Ordering}; @@ -61,75 +61,81 @@ use crate::CheckpointConfig; use crate::{page_cache, storage_sync}; // Metrics collected on operations on the storage repository. -lazy_static! { - pub static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( +pub static STORAGE_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( "pageserver_storage_operations_seconds", "Time spent on storage operations", &["operation", "tenant_id", "timeline_id"] ) - .expect("failed to define a metric"); -} + .expect("failed to define a metric") +}); // Metrics collected on operations on the storage repository. -lazy_static! 
{ - static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( +static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( "pageserver_getpage_reconstruct_seconds", "Time spent in reconstruct_value", &["tenant_id", "timeline_id"] ) - .expect("failed to define a metric"); -} + .expect("failed to define a metric") +}); -lazy_static! { - static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!( +static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { + register_int_counter_vec!( "pageserver_materialized_cache_hits_total", "Number of cache hits from materialized page cache", &["tenant_id", "timeline_id"] ) - .expect("failed to define a metric"); - static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!( + .expect("failed to define a metric") +}); + +static WAIT_LSN_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( "pageserver_wait_lsn_seconds", "Time spent waiting for WAL to arrive", &["tenant_id", "timeline_id"] ) - .expect("failed to define a metric"); -} + .expect("failed to define a metric") +}); -lazy_static! { - static ref LAST_RECORD_LSN: IntGaugeVec = register_int_gauge_vec!( +static LAST_RECORD_LSN: Lazy = Lazy::new(|| { + register_int_gauge_vec!( "pageserver_last_record_lsn", "Last record LSN grouped by timeline", &["tenant_id", "timeline_id"] ) - .expect("failed to define a metric"); -} + .expect("failed to define a metric") +}); // Metrics for determining timeline's physical size. // A layered timeline's physical is defined as the total size of // (delta/image) layer files on disk. -lazy_static! { - static ref CURRENT_PHYSICAL_SIZE: UIntGaugeVec = register_uint_gauge_vec!( +static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( "pageserver_current_physical_size", "Current physical size grouped by timeline", &["tenant_id", "timeline_id"] ) - .expect("failed to define a metric"); -} + .expect("failed to define a metric") +}); // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, // or in testing they estimate how much we would upload if we did. -lazy_static! { - static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!( +static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { + register_int_counter!( "pageserver_created_persistent_files_total", "Number of files created that are meant to be uploaded to cloud storage", ) - .expect("failed to define a metric"); - static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!( + .expect("failed to define a metric") +}); + +static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { + register_int_counter!( "pageserver_written_persistent_bytes_total", "Total bytes written that are meant to be uploaded to cloud storage", ) - .expect("failed to define a metric"); -} + .expect("failed to define a metric") +}); #[derive(Clone)] pub enum LayeredTimelineEntry { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 4ecb181553..ba912a3702 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -22,7 +22,7 @@ pub mod walreceiver; pub mod walrecord; pub mod walredo; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use tracing::info; use crate::thread_mgr::ThreadKind; @@ -42,14 +42,14 @@ pub const STORAGE_FORMAT_VERSION: u16 = 3; pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; pub const DELTA_FILE_MAGIC: u16 = 0x5A61; -lazy_static! 
{ - static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!( +static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { + register_int_gauge_vec!( "pageserver_live_connections", "Number of live network connections", &["pageserver_connection_kind"] ) - .expect("failed to define a metric"); -} + .expect("failed to define a metric") +}); pub const LOG_FILE_NAME: &str = "pageserver.log"; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index c8aa4b35e8..75df744014 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -11,7 +11,7 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use regex::Regex; use std::io::{self, Read}; use std::net::TcpListener; @@ -434,15 +434,15 @@ const TIME_BUCKETS: &[f64] = &[ 0.1, // 1/10 s ]; -lazy_static! { - static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!( +static SMGR_QUERY_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( "pageserver_smgr_query_seconds", "Time spent on smgr query handling", &["smgr_query_type", "tenant_id", "timeline_id"], TIME_BUCKETS.into() ) - .expect("failed to define a metric"); -} + .expect("failed to define a metric") +}); impl PageServerHandler { pub fn new(conf: &'static PageServerConf, auth: Option>) -> Self { diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 6467231e08..3fae0184f9 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -408,7 +408,7 @@ pub trait TimelineWriter<'a> { #[cfg(test)] pub mod repo_harness { use bytes::BytesMut; - use lazy_static::lazy_static; + use once_cell::sync::Lazy; use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::{fs, path::PathBuf}; @@ -439,9 +439,7 @@ pub mod repo_harness { buf.freeze() } - lazy_static! { - static ref LOCK: RwLock<()> = RwLock::new(()); - } + static LOCK: Lazy> = Lazy::new(|| RwLock::new(())); impl From for TenantConfOpt { fn from(tenant_conf: TenantConf) -> Self { @@ -589,11 +587,10 @@ mod tests { //use std::sync::Arc; use bytes::BytesMut; use hex_literal::hex; - use lazy_static::lazy_static; + use once_cell::sync::Lazy; - lazy_static! { - static ref TEST_KEY: Key = Key::from_slice(&hex!("112222222233333333444444445500000001")); - } + static TEST_KEY: Lazy = + Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001"))); #[test] fn test_basic() -> Result<()> { diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index c60d3dccc0..222a406c81 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -155,8 +155,7 @@ use std::{ use anyhow::{anyhow, bail, Context}; use futures::stream::{FuturesUnordered, StreamExt}; -use lazy_static::lazy_static; -use once_cell::sync::OnceCell; +use once_cell::sync::{Lazy, OnceCell}; use remote_storage::{GenericRemoteStorage, RemoteStorage}; use tokio::{ fs, @@ -184,8 +183,8 @@ use crate::{ }; use metrics::{ - register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge, - HistogramVec, IntCounter, IntCounterVec, IntGauge, + register_histogram_vec, register_int_counter_vec, register_int_gauge, HistogramVec, + IntCounterVec, IntGauge, }; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; @@ -193,32 +192,33 @@ use self::download::download_index_parts; pub use self::download::gather_tenant_timelines_index_parts; pub use self::download::TEMP_DOWNLOAD_EXTENSION; -lazy_static! 
{ - static ref REMAINING_SYNC_ITEMS: IntGauge = register_int_gauge!( +static REMAINING_SYNC_ITEMS: Lazy = Lazy::new(|| { + register_int_gauge!( "pageserver_remote_storage_remaining_sync_items", "Number of storage sync items left in the queue" ) - .expect("failed to register pageserver remote storage remaining sync items int gauge"); - static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!( - "pageserver_remote_storage_fatal_task_failures_total", - "Number of critically failed tasks" - ) - .expect("failed to register pageserver remote storage remaining sync items int gauge"); - static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!( + .expect("failed to register pageserver remote storage remaining sync items int gauge") +}); + +static IMAGE_SYNC_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( "pageserver_remote_storage_image_sync_seconds", "Time took to synchronize (download or upload) a whole pageserver image. \ Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)", &["tenant_id", "timeline_id", "operation_kind", "status"], vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0] ) - .expect("failed to register pageserver image sync time histogram vec"); - static ref REMOTE_INDEX_UPLOAD: IntCounterVec = register_int_counter_vec!( + .expect("failed to register pageserver image sync time histogram vec") +}); + +static REMOTE_INDEX_UPLOAD: Lazy = Lazy::new(|| { + register_int_counter_vec!( "pageserver_remote_storage_remote_index_uploads_total", "Number of remote index uploads", &["tenant_id", "timeline_id"], ) - .expect("failed to register pageserver remote index upload vec"); -} + .expect("failed to register pageserver remote index upload vec") +}); static SYNC_QUEUE: OnceCell = OnceCell::new(); diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index f9ab3b7471..671ea45202 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -4,7 +4,7 @@ use std::{fmt::Debug, path::PathBuf}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use remote_storage::RemoteStorage; use tokio::fs; use tracing::{debug, error, info, warn}; @@ -20,14 +20,14 @@ use crate::{ }; use metrics::{register_int_counter_vec, IntCounterVec}; -lazy_static! { - static ref NO_LAYERS_UPLOAD: IntCounterVec = register_int_counter_vec!( +static NO_LAYERS_UPLOAD: Lazy = Lazy::new(|| { + register_int_counter_vec!( "pageserver_remote_storage_no_layers_uploads_total", "Number of skipped uploads due to no layers", &["tenant_id", "timeline_id"], ) - .expect("failed to register pageserver no layers upload vec"); -} + .expect("failed to register pageserver no layers upload vec") +}); /// Serializes and uploads the given index part data to the remote storage. pub(super) async fn upload_index_part( diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index dfdbc4c318..5a5cea9a4b 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -27,23 +27,25 @@ use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; mod tenants_state { use anyhow::ensure; + use once_cell::sync::Lazy; use std::{ collections::HashMap, sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}, }; use tokio::sync::mpsc; use tracing::{debug, error}; - use utils::zid::ZTenantId; use crate::tenant_mgr::{LocalTimelineUpdate, Tenant}; - lazy_static::lazy_static! 
{ - static ref TENANTS: RwLock> = RwLock::new(HashMap::new()); - /// Sends updates to the local timelines (creation and deletion) to the WAL receiver, - /// so that it can enable/disable corresponding processes. - static ref TIMELINE_UPDATE_SENDER: RwLock>> = RwLock::new(None); - } + static TENANTS: Lazy>> = + Lazy::new(|| RwLock::new(HashMap::new())); + + /// Sends updates to the local timelines (creation and deletion) to the WAL receiver, + /// so that it can enable/disable corresponding processes. + static TIMELINE_UPDATE_SENDER: Lazy< + RwLock>>, + > = Lazy::new(|| RwLock::new(None)); pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap> { TENANTS diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index 6dd2e4b00b..cdd38febbc 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -45,21 +45,20 @@ use tokio::sync::watch; use tracing::{debug, error, info, warn}; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use utils::zid::{ZTenantId, ZTimelineId}; use crate::shutdown_pageserver; -lazy_static! { - /// Each thread that we track is associated with a "thread ID". It's just - /// an increasing number that we assign, not related to any system thread - /// id. - static ref NEXT_THREAD_ID: AtomicU64 = AtomicU64::new(1); +/// Each thread that we track is associated with a "thread ID". It's just +/// an increasing number that we assign, not related to any system thread +/// id. +static NEXT_THREAD_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); - /// Global registry of threads - static ref THREADS: Mutex>> = Mutex::new(HashMap::new()); -} +/// Global registry of threads +static THREADS: Lazy>>> = + Lazy::new(|| Mutex::new(HashMap::new())); // There is a Tokio watch channel for each thread, which can be used to signal the // thread that it needs to shut down. This thread local variable holds the receiving diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index a16e772238..5b24b848ad 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -10,7 +10,7 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use once_cell::sync::OnceCell; use std::fs::{File, OpenOptions}; use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write}; @@ -32,23 +32,24 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ 1.0, // 1 sec ]; -lazy_static! { - static ref STORAGE_IO_TIME: HistogramVec = register_histogram_vec!( +static STORAGE_IO_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( "pageserver_io_operations_seconds", "Time spent in IO operations", &["operation", "tenant_id", "timeline_id"], STORAGE_IO_TIME_BUCKETS.into() ) - .expect("failed to define a metric"); -} -lazy_static! { - static ref STORAGE_IO_SIZE: IntGaugeVec = register_int_gauge_vec!( + .expect("failed to define a metric") +}); + +static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( "pageserver_io_operations_bytes_total", "Total amount of bytes read/written in IO operations", &["operation", "tenant_id", "timeline_id"] ) - .expect("failed to define a metric"); -} + .expect("failed to define a metric") +}); /// /// A virtual file descriptor. You can use this just like std::fs::File, but internally diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index db4620417c..85f970a941 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -20,8 +20,8 @@ //! 
use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; -use lazy_static::lazy_static; use nix::poll::*; +use once_cell::sync::Lazy; use serde::Serialize; use std::fs; use std::fs::OpenOptions; @@ -105,21 +105,27 @@ impl crate::walredo::WalRedoManager for DummyRedoManager { // We collect the time spent in actual WAL redo ('redo'), and time waiting // for access to the postgres process ('wait') since there is only one for // each tenant. -lazy_static! { - static ref WAL_REDO_TIME: Histogram = - register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo") - .expect("failed to define a metric"); - static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!( + +static WAL_REDO_TIME: Lazy = Lazy::new(|| { + register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo") + .expect("failed to define a metric") +}); + +static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { + register_histogram!( "pageserver_wal_redo_wait_seconds", "Time spent waiting for access to the WAL redo process" ) - .expect("failed to define a metric"); - static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!( + .expect("failed to define a metric") +}); + +static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { + register_int_counter!( "pageserver_replayed_wal_records_total", "Number of WAL records replayed in WAL redo process" ) - .unwrap(); -} + .unwrap() +}); /// /// This is the real implementation that uses a Postgres process to diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 8c6036f87d..d9d43c3325 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -14,7 +14,7 @@ hashbrown = "0.11.2" hex = "0.4.3" hmac = "0.12.1" hyper = "0.14" -lazy_static = "1.4.0" +once_cell = "1.13.0" md5 = "0.7.0" parking_lot = "0.12" pin-project-lite = "0.2.7" diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 5e87059c86..a67865e08c 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -12,13 +12,12 @@ use crate::{ stream::PqStream, waiters::{self, Waiter, Waiters}, }; -use lazy_static::lazy_static; + +use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; -lazy_static! { - static ref CPLANE_WAITERS: Waiters = Default::default(); -} +static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); /// Give caller an opportunity to wait for the cloud's reply. pub async fn with_waiter( diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index f202782109..29be79c886 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -4,8 +4,8 @@ use crate::config::{ProxyConfig, TlsConfig}; use crate::stream::{MetricsStream, PqStream, Stream}; use anyhow::{bail, Context}; use futures::TryFutureExt; -use lazy_static::lazy_static; use metrics::{register_int_counter, IntCounter}; +use once_cell::sync::Lazy; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::{BeMessage as Be, *}; @@ -13,23 +13,29 @@ use utils::pq_proto::{BeMessage as Be, *}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; -lazy_static! { - static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!( +static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter!( "proxy_accepted_connections_total", "Number of TCP client connections accepted." 
) - .unwrap(); - static ref NUM_CONNECTIONS_CLOSED_COUNTER: IntCounter = register_int_counter!( + .unwrap() +}); + +static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter!( "proxy_closed_connections_total", "Number of TCP client connections closed." ) - .unwrap(); - static ref NUM_BYTES_PROXIED_COUNTER: IntCounter = register_int_counter!( + .unwrap() +}); + +static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter!( "proxy_io_bytes_total", "Number of bytes sent/received between any client and backend." ) - .unwrap(); -} + .unwrap() +}); /// A small combinator for pluggable error logging. async fn log_error(future: F) -> F::Output diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index f6ae9e75d7..4ed30413e2 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -9,7 +9,6 @@ bytes = "1.0.1" byteorder = "1.4.3" hyper = "0.14" fs2 = "0.4.3" -lazy_static = "1.4.0" serde_json = "1" tracing = "0.1.27" clap = "3.0" @@ -29,7 +28,7 @@ const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } git-version = "0.3.5" async-trait = "0.1" -once_cell = "1.10.0" +once_cell = "1.13.0" toml_edit = { version = "0.13", features = ["easy"] } postgres_ffi = { path = "../libs/postgres_ffi" } diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index c49b4c058a..7fc75246e1 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -2,7 +2,7 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use std::fs::{self, File, OpenOptions}; use std::io::{Read, Write}; @@ -26,15 +26,15 @@ const CONTROL_FILE_NAME: &str = "safekeeper.control"; const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); -lazy_static! { - static ref PERSIST_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!( +static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( "safekeeper_persist_control_file_seconds", "Seconds to persist and sync control file, grouped by timeline", &["tenant_id", "timeline_id"], DISK_WRITE_SECONDS_BUCKETS.to_vec() ) - .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec"); -} + .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") +}); /// Storage should keep actual state inside of it. It should implement Deref /// trait to access state fields and have persist method for updating that state. diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index bed6e447d7..ee642408f2 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,7 +4,7 @@ use anyhow::{bail, Context, Result}; use etcd_broker::subscription_value::SkTimelineInfo; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use postgres_ffi::xlog_utils::XLogSegNo; use serde::Serialize; @@ -559,12 +559,12 @@ struct GlobalTimelinesState { wal_backup_launcher_tx: Option>, } -lazy_static! 
{ - static ref TIMELINES_STATE: Mutex = Mutex::new(GlobalTimelinesState { +static TIMELINES_STATE: Lazy> = Lazy::new(|| { + Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), wal_backup_launcher_tx: None, - }); -} + }) +}); #[derive(Clone, Copy, Serialize)] pub struct TimelineDeleteForceResult { diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 9b23e2189c..2a36d5c04c 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -12,7 +12,7 @@ use std::io::{self, Seek, SeekFrom}; use std::pin::Pin; use tokio::io::AsyncRead; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use postgres_ffi::xlog_utils::{ find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, XLogSegNo, PG_TLI, }; @@ -38,31 +38,44 @@ use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECOND use tokio::io::{AsyncReadExt, AsyncSeekExt}; -lazy_static! { - // The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). - // i64 is faster than f64, so update to u64 when available. - static ref WRITE_WAL_BYTES: HistogramVec = register_histogram_vec!( +// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). +// i64 is faster than f64, so update to u64 when available. +static WRITE_WAL_BYTES: Lazy = Lazy::new(|| { + register_histogram_vec!( "safekeeper_write_wal_bytes", "Bytes written to WAL in a single request, grouped by timeline", &["tenant_id", "timeline_id"], - vec![1.0, 10.0, 100.0, 1024.0, 8192.0, 128.0 * 1024.0, 1024.0 * 1024.0, 10.0 * 1024.0 * 1024.0] + vec![ + 1.0, + 10.0, + 100.0, + 1024.0, + 8192.0, + 128.0 * 1024.0, + 1024.0 * 1024.0, + 10.0 * 1024.0 * 1024.0 + ] ) - .expect("Failed to register safekeeper_write_wal_bytes histogram vec"); - static ref WRITE_WAL_SECONDS: HistogramVec = register_histogram_vec!( + .expect("Failed to register safekeeper_write_wal_bytes histogram vec") +}); +static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( "safekeeper_write_wal_seconds", "Seconds spent writing and syncing WAL to a disk in a single request, grouped by timeline", &["tenant_id", "timeline_id"], DISK_WRITE_SECONDS_BUCKETS.to_vec() ) - .expect("Failed to register safekeeper_write_wal_seconds histogram vec"); - static ref FLUSH_WAL_SECONDS: HistogramVec = register_histogram_vec!( + .expect("Failed to register safekeeper_write_wal_seconds histogram vec") +}); +static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( "safekeeper_flush_wal_seconds", "Seconds spent syncing WAL to a disk, grouped by timeline", &["tenant_id", "timeline_id"], DISK_WRITE_SECONDS_BUCKETS.to_vec() ) - .expect("Failed to register safekeeper_flush_wal_seconds histogram vec"); -} + .expect("Failed to register safekeeper_flush_wal_seconds histogram vec") +}); struct WalStorageMetrics { write_wal_bytes: Histogram, From 8c8431ebc63f001efa117982c18b56bcf8551ad6 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Sat, 6 Aug 2022 11:45:47 +0200 Subject: [PATCH 0594/1022] Add more buckets to pageserver latency metrics (#2225) --- pageserver/src/layered_repository/timeline.rs | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 181adc2bcc..73877a6656 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -60,12 +60,30 @@ use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use 
crate::{page_cache, storage_sync}; +/// Prometheus histogram buckets (in seconds) that capture the majority of +/// latencies in the microsecond range but also extend far enough up to distinguish +/// "bad" from "really bad". +fn get_buckets_for_critical_operations() -> Vec { + let buckets_per_digit = 5; + let min_exponent = -6; + let max_exponent = 2; + + let mut buckets = vec![]; + // Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp + // because it's more numerically stable and doesn't result in numbers like 9.999999 + for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) { + buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64)) + } + buckets +} + // Metrics collected on operations on the storage repository. pub static STORAGE_TIME: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_storage_operations_seconds", "Time spent on storage operations", - &["operation", "tenant_id", "timeline_id"] + &["operation", "tenant_id", "timeline_id"], + get_buckets_for_critical_operations(), ) .expect("failed to define a metric") }); @@ -75,7 +93,8 @@ static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_getpage_reconstruct_seconds", "Time spent in reconstruct_value", - &["tenant_id", "timeline_id"] + &["tenant_id", "timeline_id"], + get_buckets_for_critical_operations(), ) .expect("failed to define a metric") }); @@ -93,7 +112,8 @@ static WAIT_LSN_TIME: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_wait_lsn_seconds", "Time spent waiting for WAL to arrive", - &["tenant_id", "timeline_id"] + &["tenant_id", "timeline_id"], + get_buckets_for_critical_operations(), ) .expect("failed to define a metric") }); From 0d14d4a1a80d5b2c5aed35c7a4b6851d73c65559 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 4 Aug 2022 19:33:19 +0300 Subject: [PATCH 0595/1022] ignore record property warning to fix benchmarks --- pytest.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/pytest.ini b/pytest.ini index 104d0e0244..bfa07e520b 100644 --- a/pytest.ini +++ b/pytest.ini @@ -2,6 +2,7 @@ filterwarnings = error::pytest.PytestUnhandledThreadExceptionWarning error::UserWarning + ignore:record_property is incompatible with junit_family:pytest.PytestWarning addopts = -m 'not remote_cluster' markers = From 4da4c7f76960e642dd07c1e448ce3ddcf841020f Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 5 Aug 2022 11:59:38 +0300 Subject: [PATCH 0596/1022] increase statement timeout --- test_runner/performance/test_wal_backpressure.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 2a79a778fc..dc78ed77b0 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -155,7 +155,7 @@ def start_pgbench_intensive_initialization(env: PgCompare, scale: int): f'-s{scale}', '-i', '-Idtg', - env.pg.connstr(options='-cstatement_timeout=300s') + env.pg.connstr(options='-cstatement_timeout=360s') ]) From 9430abae053b0a7494358932fa726f3e84b1ebe3 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 5 Aug 2022 12:05:46 +0300 Subject: [PATCH 0597/1022] use event so it fires only if workload thread successfully finished --- test_runner/performance/test_wal_backpressure.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test_runner/performance/test_wal_backpressure.py 
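A note on the `get_buckets_for_critical_operations()` helper added in the patch above: with 5 buckets per decade between 10^-6 s and 10^2 s it yields 41 log-spaced upper bounds, and computing `10^(exp / buckets_per_digit)` directly avoids the rounding drift of repeated multiplication. The short Python sketch below is not part of any patch; it simply mirrors the Rust helper to show the values it generates:

# Illustrative sketch only: mirrors get_buckets_for_critical_operations()
# from the patch above; 5 log-spaced buckets per decade, 1 microsecond .. 100 s.
buckets_per_digit = 5
min_exponent, max_exponent = -6, 2

buckets = [
    10.0 ** (exp / buckets_per_digit)
    for exp in range(min_exponent * buckets_per_digit, max_exponent * buckets_per_digit + 1)
]

print(len(buckets))   # 41 upper bounds
print(buckets[:6])    # [1e-06, ~1.58e-06, ~2.51e-06, ~3.98e-06, ~6.31e-06, 1e-05]
print(buckets[-1])    # 100.0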
b/test_runner/performance/test_wal_backpressure.py index dc78ed77b0..61c5bea64a 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -146,7 +146,7 @@ def test_pgbench_simple_update_workload(pg_compare: PgCompare, scale: int, durat record_thread.join() -def start_pgbench_intensive_initialization(env: PgCompare, scale: int): +def start_pgbench_intensive_initialization(env: PgCompare, scale: int, done_event: threading.Event): with env.record_duration("run_duration"): # Needs to increase the statement timeout (default: 120s) because the # initialization step can be slow with a large scale. @@ -158,6 +158,8 @@ def start_pgbench_intensive_initialization(env: PgCompare, scale: int): env.pg.connstr(options='-cstatement_timeout=360s') ]) + done_event.set() + @pytest.mark.timeout(1000) @pytest.mark.parametrize("scale", get_scales_matrix(1000)) @@ -166,15 +168,17 @@ def test_pgbench_intensive_init_workload(pg_compare: PgCompare, scale: int): with env.pg.connect().cursor() as cur: cur.execute("CREATE TABLE foo as select generate_series(1,100000)") + workload_done_event = threading.Event() + workload_thread = threading.Thread(target=start_pgbench_intensive_initialization, - args=(env, scale)) + args=(env, scale, workload_done_event)) workload_thread.start() record_thread = threading.Thread(target=record_lsn_write_lag, - args=(env, lambda: workload_thread.is_alive())) + args=(env, lambda: not workload_done_event.is_set())) record_thread.start() - record_read_latency(env, lambda: workload_thread.is_alive(), "SELECT count(*) from foo") + record_read_latency(env, lambda: not workload_done_event.is_set(), "SELECT count(*) from foo") workload_thread.join() record_thread.join() From beaa991f811775c863da273e7904769018a746f3 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 5 Aug 2022 13:05:02 +0300 Subject: [PATCH 0598/1022] remove debug log --- pageserver/src/storage_sync/download.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 441d5e563e..f714888d9a 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -202,8 +202,6 @@ where }) .map_err(DownloadError::BadInput)?; - warn!("part_storage_path {:?}", part_storage_path); - let mut index_part_download = storage.download(&part_storage_path).await?; let mut index_part_bytes = Vec::new(); From 7cd68a0c278517ad6d067012cbe814150b028bec Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 5 Aug 2022 13:08:01 +0300 Subject: [PATCH 0599/1022] increase timeout to pass test with real s3 --- test_runner/batch_others/test_remote_storage.py | 2 +- test_runner/batch_others/test_wal_acceptor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 72963ffe21..ca46010dca 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -110,7 +110,7 @@ def test_remote_storage_backup_and_restore( client.tenant_attach(UUID(tenant_id)) log.info("waiting for timeline redownload") - wait_until(number_of_iterations=10, + wait_until(number_of_iterations=20, interval=1, func=lambda: assert_timeline_local(client, UUID(tenant_id), UUID(timeline_id))) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 6544681bb0..f7aeb0abeb 100644 --- 
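The point of the `threading.Event` change above is that the lag/latency recorders now stop only when the pgbench initialization thread actually reaches the end of its work and calls `done_event.set()`; if the workload raises, the event stays unset, so a dead thread is no longer silently treated as a finished one. A minimal, self-contained sketch of the pattern (illustrative only; `run_workload` stands in for the real pgbench call):

import threading
import time

def run_workload(done_event: threading.Event) -> None:
    time.sleep(1)        # stand-in for the real work (pgbench initialization)
    done_event.set()     # only reached if the work finished without raising

def monitor(done: threading.Event) -> None:
    # Keep sampling until the workload reports successful completion.
    while not done.wait(timeout=0.1):
        pass             # e.g. record lsn write lag or read latency here

done = threading.Event()
worker = threading.Thread(target=run_workload, args=(done,))
watcher = threading.Thread(target=monitor, args=(done,), daemon=True)
worker.start()
watcher.start()
worker.join()
# The real test bounds the whole run with a pytest timeout; here the watcher is
# a daemon thread, so an unset event (a failed workload) cannot hang the sketch.
watcher.join(timeout=5)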
a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -350,7 +350,7 @@ def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end): if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end): break elapsed = time.time() - started_at - if elapsed > 20: + if elapsed > 30: raise RuntimeError( f"timed out waiting {elapsed:.0f}s for segment ending at {seg_end} get offloaded") time.sleep(0.5) From cdfa9fe7058a77a568eb24e8469e20c96dfb5b90 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 5 Aug 2022 19:02:16 +0300 Subject: [PATCH 0600/1022] avoid duplicate parameter, increase timeout --- test_runner/fixtures/neon_fixtures.py | 4 +++- test_runner/performance/test_wal_backpressure.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3848aee05a..3b87f290b8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -299,7 +299,9 @@ class PgProtocol: # change it by calling "SET statement_timeout" after # connecting. options = result.get('options', '') - result['options'] = f'-cstatement_timeout=120s {options}' + if "statement_timeout" not in options: + options = f'-cstatement_timeout=120s {options}' + result['options'] = options return result # autocommit=True here by default because that's what we need most of the time diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 61c5bea64a..bbb5ddecab 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -155,7 +155,7 @@ def start_pgbench_intensive_initialization(env: PgCompare, scale: int, done_even f'-s{scale}', '-i', '-Idtg', - env.pg.connstr(options='-cstatement_timeout=360s') + env.pg.connstr(options='-cstatement_timeout=600s') ]) done_event.set() From 743370de98cb9e0b60f8c67453363b39aca7aa81 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Mon, 8 Aug 2022 17:52:28 +0200 Subject: [PATCH 0601/1022] Major migration script (#2073) This script can be used to migrate a tenant across breaking storage versions, or (in the future) upgrading postgres versions. See the comment at the top for an overview. Co-authored-by: Anastasia Lubennikova --- scripts/export_import_between_pageservers.py | 708 ++++++++++++++++++ .../batch_others/test_tenant_relocation.py | 26 +- 2 files changed, 730 insertions(+), 4 deletions(-) create mode 100755 scripts/export_import_between_pageservers.py diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py new file mode 100755 index 0000000000..96f1d36ddb --- /dev/null +++ b/scripts/export_import_between_pageservers.py @@ -0,0 +1,708 @@ +# +# Script to export tenants from one pageserver and import them into another page server. +# +# Outline of steps: +# 1. Get `(last_lsn, prev_lsn)` from old pageserver +# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file +# 3. This tar file might be missing relation files for empty relations, if the pageserver +# is old enough (we didn't always store those). So to recreate them, we start a local +# vanilla postgres on this basebackup and ask it what relations should exist, then touch +# any missing files and re-pack the tar. +# TODO This functionality is no longer needed, so we can delete it later if we don't +# end up using the same utils for the pg 15 upgrade. Not sure. +# 4. 
We import the patched basebackup into a new pageserver +# 5. We export again via fullbackup, now from the new pageserver and compare the returned +# tar file with the one we imported. This confirms that we imported everything that was +# exported, but doesn't guarantee correctness (what if we didn't **export** everything +# initially?) +# 6. We wait for the new pageserver's remote_consistent_lsn to catch up +# +# For more context on how to use this, see: +# https://github.com/neondatabase/cloud/wiki/Storage-format-migration + +import os +from os import path +import shutil +from pathlib import Path +import tempfile +from contextlib import closing +import psycopg2 +import subprocess +import argparse +import time +import requests +import uuid +from psycopg2.extensions import connection as PgConnection +from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple + +############################################### +### client-side utils copied from test fixtures +############################################### + +Env = Dict[str, str] + +_global_counter = 0 + + +def global_counter() -> int: + """ A really dumb global counter. + This is useful for giving output files a unique number, so if we run the + same command multiple times we can keep their output separate. + """ + global _global_counter + _global_counter += 1 + return _global_counter + + +def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: + """ Run a process and capture its output + Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" + where "cmd" is the name of the program and NNN is an incrementing + counter. + If those files already exist, we will overwrite them. + Returns basepath for files with captured output. + """ + assert type(cmd) is list + base = os.path.basename(cmd[0]) + '_{}'.format(global_counter()) + basepath = os.path.join(capture_dir, base) + stdout_filename = basepath + '.stdout' + stderr_filename = basepath + '.stderr' + + with open(stdout_filename, 'w') as stdout_f: + with open(stderr_filename, 'w') as stderr_f: + print('(capturing output to "{}.stdout")'.format(base)) + subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) + + return basepath + + +class PgBin: + """ A helper class for executing postgres binaries """ + def __init__(self, log_dir: Path, pg_distrib_dir): + self.log_dir = log_dir + self.pg_bin_path = os.path.join(str(pg_distrib_dir), 'bin') + self.env = os.environ.copy() + self.env['LD_LIBRARY_PATH'] = os.path.join(str(pg_distrib_dir), 'lib') + + def _fixpath(self, command: List[str]): + if '/' not in command[0]: + command[0] = os.path.join(self.pg_bin_path, command[0]) + + def _build_env(self, env_add: Optional[Env]) -> Env: + if env_add is None: + return self.env + env = self.env.copy() + env.update(env_add) + return env + + def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): + """ + Run one of the postgres binaries. + The command should be in list form, e.g. ['pgbench', '-p', '55432'] + All the necessary environment variables will be set. + If the first argument (the command name) doesn't include a path (no '/' + characters present), then it will be edited to include the correct path. + If you want stdout/stderr captured to files, use `run_capture` instead. 
+ """ + + self._fixpath(command) + print('Running command "{}"'.format(' '.join(command))) + env = self._build_env(env) + subprocess.run(command, env=env, cwd=cwd, check=True) + + def run_capture(self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[str] = None, + **kwargs: Any) -> str: + """ + Run one of the postgres binaries, with stderr and stdout redirected to a file. + This is just like `run`, but for chatty programs. Returns basepath for files + with captured output. + """ + + self._fixpath(command) + print('Running command "{}"'.format(' '.join(command))) + env = self._build_env(env) + return subprocess_capture(str(self.log_dir), + command, + env=env, + cwd=cwd, + check=True, + **kwargs) + + +class PgProtocol: + """ Reusable connection logic """ + def __init__(self, **kwargs): + self.default_options = kwargs + + def conn_options(self, **kwargs): + conn_options = self.default_options.copy() + if 'dsn' in kwargs: + conn_options.update(parse_dsn(kwargs['dsn'])) + conn_options.update(kwargs) + + # Individual statement timeout in seconds. 2 minutes should be + # enough for our tests, but if you need a longer, you can + # change it by calling "SET statement_timeout" after + # connecting. + if 'options' in conn_options: + conn_options['options'] = f"-cstatement_timeout=120s " + conn_options['options'] + else: + conn_options['options'] = "-cstatement_timeout=120s" + return conn_options + + # autocommit=True here by default because that's what we need most of the time + def connect(self, autocommit=True, **kwargs) -> PgConnection: + """ + Connect to the node. + Returns psycopg2's connection object. + This method passes all extra params to connstr. + """ + conn = psycopg2.connect(**self.conn_options(**kwargs)) + + # WARNING: this setting affects *all* tests! + conn.autocommit = autocommit + return conn + + def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: + """ + Execute query against the node and return all rows. + This method passes all extra params to connstr. + """ + return self.safe_psql_many([query], **kwargs)[0] + + def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: + """ + Execute queries against the node and return all rows. + This method passes all extra params to connstr. 
+ """ + result: List[List[Any]] = [] + with closing(self.connect(**kwargs)) as conn: + with conn.cursor() as cur: + for query in queries: + print(f"Executing query: {query}") + cur.execute(query) + + if cur.description is None: + result.append([]) # query didn't return data + else: + result.append(cast(List[Any], cur.fetchall())) + return result + + +class VanillaPostgres(PgProtocol): + def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): + super().__init__(host='localhost', port=port, dbname='postgres') + self.pgdatadir = pgdatadir + self.pg_bin = pg_bin + self.running = False + if init: + self.pg_bin.run_capture(['initdb', '-D', str(pgdatadir)]) + self.configure([f"port = {port}\n"]) + + def configure(self, options: List[str]): + """Append lines into postgresql.conf file.""" + assert not self.running + with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file: + conf_file.write("\n".join(options)) + + def start(self, log_path: Optional[str] = None): + assert not self.running + self.running = True + + if log_path is None: + log_path = os.path.join(self.pgdatadir, "pg.log") + + self.pg_bin.run_capture( + ['pg_ctl', '-w', '-D', str(self.pgdatadir), '-l', log_path, 'start']) + + def stop(self): + assert self.running + self.running = False + self.pg_bin.run_capture(['pg_ctl', '-w', '-D', str(self.pgdatadir), 'stop']) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + if self.running: + self.stop() + + +class NeonPageserverApiException(Exception): + pass + + +class NeonPageserverHttpClient(requests.Session): + def __init__(self, host, port): + super().__init__() + self.host = host + self.port = port + + def verbose_error(self, res: requests.Response): + try: + res.raise_for_status() + except requests.RequestException as e: + try: + msg = res.json()['msg'] + except: + msg = '' + raise NeonPageserverApiException(msg) from e + + def check_status(self): + self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status() + + def tenant_list(self): + res = self.get(f"http://{self.host}:{self.port}/v1/tenant") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, list) + return res_json + + def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists): + res = self.post( + f"http://{self.host}:{self.port}/v1/tenant", + json={ + 'new_tenant_id': new_tenant_id.hex, + }, + ) + + if res.status_code == 409: + if ok_if_exists: + print(f'could not create tenant: already exists for id {new_tenant_id}') + else: + res.raise_for_status() + elif res.status_code == 201: + print(f'created tenant {new_tenant_id}') + else: + self.verbose_error(res) + + return new_tenant_id + + def timeline_list(self, tenant_id: uuid.UUID): + res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, list) + return res_json + + def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1" + ) + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + +def lsn_to_hex(num: int) -> str: + """ Convert lsn from int to standard hex notation. """ + return "{:X}/{:X}".format(num >> 32, num & 0xffffffff) + + +def lsn_from_hex(lsn_hex: str) -> int: + """ Convert lsn from hex notation to int. 
""" + l, r = lsn_hex.split('/') + return (int(l, 16) << 32) + int(r, 16) + + +def remote_consistent_lsn(pageserver_http_client: NeonPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID) -> int: + detail = pageserver_http_client.timeline_detail(tenant, timeline) + + if detail['remote'] is None: + # No remote information at all. This happens right after creating + # a timeline, before any part of it has been uploaded to remote + # storage yet. + return 0 + else: + lsn_str = detail['remote']['remote_consistent_lsn'] + assert isinstance(lsn_str, str) + return lsn_from_hex(lsn_str) + + +def wait_for_upload(pageserver_http_client: NeonPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID, + lsn: int): + """waits for local timeline upload up to specified lsn""" + for i in range(10): + current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) + if current_lsn >= lsn: + return + print("waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1)) + time.sleep(1) + + raise Exception("timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn))) + + +############## +# End of utils +############## + + +def pack_base(log_dir, restored_dir, output_tar): + """Create tar file from basebackup, being careful to produce relative filenames.""" + tmp_tar_name = "tmp.tar" + tmp_tar_path = os.path.join(restored_dir, tmp_tar_name) + cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir) + # We actually cd into the dir and call tar from there. If we call tar from + # outside we won't encode filenames as relative, and they won't parse well + # on import. + subprocess_capture(log_dir, cmd, cwd=restored_dir) + shutil.move(tmp_tar_path, output_tar) + + +def reconstruct_paths(log_dir, pg_bin, base_tar): + """Reconstruct what relation files should exist in the datadir by querying postgres.""" + with tempfile.TemporaryDirectory() as restored_dir: + # Unpack the base tar + subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir]) + + # Start a vanilla postgres from the given datadir and query it to find + # what relfiles should exist, but possibly don't. 
+ port = "55439" # Probably free + with VanillaPostgres(restored_dir, pg_bin, port, init=False) as vanilla_pg: + vanilla_pg.configure([f"port={port}"]) + vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log")) + + # Create database based on template0 because we can't connect to template0 + query = "create database template0copy template template0" + vanilla_pg.safe_psql(query, user="cloud_admin") + vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin") + + # Get all databases + query = "select oid, datname from pg_database" + oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin") + template0_oid = [ + oid for (oid, database) in oid_dbname_pairs if database == "template0" + ][0] + + # Get rel paths for each database + for oid, database in oid_dbname_pairs: + if database == "template0": + # We can't connect to template0 + continue + + query = "select relname, pg_relation_filepath(oid) from pg_class" + result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database) + for relname, filepath in result: + if filepath is not None: + + if database == "template0copy": + # Add all template0copy paths to template0 + prefix = f"base/{oid}/" + if filepath.startswith(prefix): + suffix = filepath[len(prefix):] + yield f"base/{template0_oid}/{suffix}" + elif filepath.startswith("global"): + print(f"skipping {database} global file {filepath}") + else: + raise AssertionError + else: + yield filepath + + +def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths): + """Add the appropriate empty files to a basebadkup tar.""" + with tempfile.TemporaryDirectory() as restored_dir: + # Unpack the base tar + subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir]) + + # Touch files that don't exist + for path in paths: + absolute_path = os.path.join(restored_dir, path) + exists = os.path.exists(absolute_path) + if not exists: + print(f"File {absolute_path} didn't exist. Creating..") + Path(absolute_path).touch() + + # Repackage + pack_base(log_dir, restored_dir, output_tar) + + +# HACK This is a workaround for exporting from old pageservers that +# can't export empty relations. In this case we need to start +# a vanilla postgres from the exported datadir, and query it +# to see what empty relations are missing, and then create +# those empty files before importing. 
+def add_missing_rels(base_tar, output_tar, log_dir, pg_bin): + reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar)) + touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths) + + +def get_rlsn(pageserver_connstr, tenant_id, timeline_id): + conn = psycopg2.connect(pageserver_connstr) + conn.autocommit = True + with conn.cursor() as cur: + cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" + cur.execute(cmd) + res = cur.fetchone() + prev_lsn = res[0] + last_lsn = res[1] + conn.close() + + return last_lsn, prev_lsn + + +def import_timeline(args, + psql_path, + pageserver_connstr, + pageserver_http, + tenant_id, + timeline_id, + last_lsn, + prev_lsn, + tar_filename): + # Import timelines to new pageserver + import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn}" + full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ + + stderr_filename2 = path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") + stdout_filename = path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout") + + print(f"Running: {full_cmd}") + + with open(stdout_filename, 'w') as stdout_f: + with open(stderr_filename2, 'w') as stderr_f: + print(f"(capturing output to {stdout_filename})") + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + subprocess.run(full_cmd, + stdout=stdout_f, + stderr=stderr_f, + env=pg_bin._build_env(None), + shell=True, + check=True) + + print(f"Done import") + + # Wait until pageserver persists the files + wait_for_upload(pageserver_http, + uuid.UUID(tenant_id), + uuid.UUID(timeline_id), + lsn_from_hex(last_lsn)) + + +def export_timeline(args, + psql_path, + pageserver_connstr, + tenant_id, + timeline_id, + last_lsn, + prev_lsn, + tar_filename): + # Choose filenames + incomplete_filename = tar_filename + ".incomplete" + stderr_filename = path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr") + + # Construct export command + query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}" + cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query] + + # Run export command + print(f"Running: {cmd}") + with open(incomplete_filename, 'w') as stdout_f: + with open(stderr_filename, 'w') as stderr_f: + print(f"(capturing output to {incomplete_filename})") + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + subprocess.run(cmd, + stdout=stdout_f, + stderr=stderr_f, + env=pg_bin._build_env(None), + check=True) + + # Add missing rels + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin) + + # Log more info + file_size = os.path.getsize(tar_filename) + print(f"Done export: {tar_filename}, size {file_size}") + + +def main(args: argparse.Namespace): + psql_path = str(Path(args.pg_distrib_dir) / "bin" / "psql") + + old_pageserver_host = args.old_pageserver_host + new_pageserver_host = args.new_pageserver_host + + old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port) + old_http_client.check_status() + old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}" + + new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port) + new_http_client.check_status() + new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}" + + for tenant_id in args.tenants: + print(f"Tenant: {tenant_id}") + timelines = old_http_client.timeline_list(uuid.UUID(tenant_id)) + 
print(f"Timelines: {timelines}") + + # Create tenant in new pageserver + if args.only_import is False and not args.timelines: + new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists) + + for timeline in timelines: + # Skip timelines we don't need to export + if args.timelines and timeline['timeline_id'] not in args.timelines: + print(f"Skipping timeline {timeline['timeline_id']}") + continue + + # Choose filenames + tar_filename = path.join(args.work_dir, + f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar") + + # Export timeline from old pageserver + if args.only_import is False: + last_lsn, prev_lsn = get_rlsn( + old_pageserver_connstr, + timeline['tenant_id'], + timeline['timeline_id'], + ) + export_timeline( + args, + psql_path, + old_pageserver_connstr, + timeline['tenant_id'], + timeline['timeline_id'], + last_lsn, + prev_lsn, + tar_filename, + ) + + # Import into new pageserver + import_timeline( + args, + psql_path, + new_pageserver_connstr, + new_http_client, + timeline['tenant_id'], + timeline['timeline_id'], + last_lsn, + prev_lsn, + tar_filename, + ) + + # Re-export and compare + re_export_filename = tar_filename + ".reexport" + export_timeline(args, + psql_path, + new_pageserver_connstr, + timeline['tenant_id'], + timeline['timeline_id'], + last_lsn, + prev_lsn, + re_export_filename) + + # Check the size is the same + old_size = os.path.getsize(tar_filename), + new_size = os.path.getsize(re_export_filename), + if old_size != new_size: + raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--tenant-id', + dest='tenants', + required=True, + nargs='+', + help='Id of the tenant to migrate. You can pass multiple arguments', + ) + parser.add_argument( + '--timeline-id', + dest='timelines', + required=False, + nargs='+', + help='Id of the timeline to migrate. You can pass multiple arguments', + ) + parser.add_argument( + '--from-host', + dest='old_pageserver_host', + required=True, + help='Host of the pageserver to migrate data from', + ) + parser.add_argument( + '--from-http-port', + dest='old_pageserver_http_port', + required=False, + type=int, + default=9898, + help='HTTP port of the pageserver to migrate data from. Default: 9898', + ) + parser.add_argument( + '--from-pg-port', + dest='old_pageserver_pg_port', + required=False, + type=int, + default=6400, + help='pg port of the pageserver to migrate data from. Default: 6400', + ) + parser.add_argument( + '--to-host', + dest='new_pageserver_host', + required=True, + help='Host of the pageserver to migrate data to', + ) + parser.add_argument( + '--to-http-port', + dest='new_pageserver_http_port', + required=False, + default=9898, + type=int, + help='HTTP port of the pageserver to migrate data to. Default: 9898', + ) + parser.add_argument( + '--to-pg-port', + dest='new_pageserver_pg_port', + required=False, + default=6400, + type=int, + help='pg port of the pageserver to migrate data to. Default: 6400', + ) + parser.add_argument( + '--ignore-tenant-exists', + dest='ok_if_exists', + required=False, + help= + 'Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.', + ) + parser.add_argument( + '--pg-distrib-dir', + dest='pg_distrib_dir', + required=False, + default='/usr/local/', + help='Path where postgres binaries are installed. 
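For orientation, the per-timeline flow that main() drives (and that the argument list below configures) boils down to roughly the following condensed sketch. It is illustrative only: it reuses the helpers defined above, and the hosts, ports, tenant/timeline ids and `args` values are placeholders, not real defaults:

# Condensed, illustrative sketch of one tenant/timeline migration using the
# helpers above; all ids, hosts and paths here are placeholders.
import argparse, os, uuid

args = argparse.Namespace(work_dir="/tmp/migration-work", pg_distrib_dir="/usr/local/")
psql_path = os.path.join(args.pg_distrib_dir, "bin", "psql")
old_connstr = "postgresql://old-pageserver:6400"
new_connstr = "postgresql://new-pageserver:6400"
tenant_id = timeline_id = "0" * 32  # placeholder hex ids

new_http = NeonPageserverHttpClient("new-pageserver", 9898)
new_http.tenant_create(uuid.UUID(tenant_id), ok_if_exists=True)

# Steps 1-3: read (last_lsn, prev_lsn), take a fullbackup tar from the old
# pageserver and patch in any missing empty-relation files.
last_lsn, prev_lsn = get_rlsn(old_connstr, tenant_id, timeline_id)
tar = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.tar")
export_timeline(args, psql_path, old_connstr, tenant_id, timeline_id, last_lsn, prev_lsn, tar)

# Steps 4 and 6: import into the new pageserver and wait until its
# remote_consistent_lsn catches up (import_timeline does the waiting).
import_timeline(args, psql_path, new_connstr, new_http, tenant_id, timeline_id,
                last_lsn, prev_lsn, tar)

# Step 5: re-export from the new pageserver and compare with what was imported.
export_timeline(args, psql_path, new_connstr, tenant_id, timeline_id,
                last_lsn, prev_lsn, tar + ".reexport")
assert os.path.getsize(tar) == os.path.getsize(tar + ".reexport")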
Default: /usr/local/', + ) + parser.add_argument( + '--psql-path', + dest='psql_path', + required=False, + default='/usr/local/bin/psql', + help='Path to the psql binary. Default: /usr/local/bin/psql', + ) + parser.add_argument( + '--only-import', + dest='only_import', + required=False, + default=False, + action='store_true', + help='Skip export and tenant creation part', + ) + parser.add_argument( + '--work-dir', + dest='work_dir', + required=True, + default=False, + help='directory where temporary tar files are stored', + ) + args = parser.parse_args() + main(args) diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index d59f28bcc5..176ca740fe 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -229,7 +229,7 @@ def post_migration_check(pg: Postgres, sum_before_migration: int, old_local_path # basebackup and importing it into the new pageserver. # This kind of migration can tolerate breaking changes # to storage format - pytest.param('major', marks=pytest.mark.xfail(reason="Not implemented")), + 'major', ]) @pytest.mark.parametrize('with_load', ['with_load', 'without_load']) def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, @@ -345,6 +345,8 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, # Migrate either by attaching from s3 or import/export basebackup if method == "major": cmd = [ + "poetry", + "run", "python", os.path.join(base_dir, "scripts/export_import_between_pageservers.py"), "--tenant-id", @@ -361,12 +363,12 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, str(new_pageserver_http_port), "--to-pg-port", str(new_pageserver_pg_port), - "--psql-path", - os.path.join(pg_distrib_dir, "bin", "psql"), + "--pg-distrib-dir", + pg_distrib_dir, "--work-dir", os.path.join(test_output_dir), ] - subprocess_capture(str(env.repo_dir), cmd, check=True) + subprocess_capture(test_output_dir, cmd, check=True) elif method == "minor": # call to attach timeline to new pageserver new_pageserver_http.tenant_attach(tenant_id) @@ -427,6 +429,22 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, post_migration_check(pg_main, 500500, old_local_path_main) post_migration_check(pg_second, 1001000, old_local_path_second) + # ensure that we can successfully read all relations on the new pageserver + with pg_cur(pg_second) as cur: + cur.execute(''' + DO $$ + DECLARE + r RECORD; + BEGIN + FOR r IN + SELECT relname FROM pg_class WHERE relkind='r' + LOOP + RAISE NOTICE '%', r.relname; + EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname; + END LOOP; + END$$; + ''') + if with_load == 'with_load': assert load_ok_event.wait(3) log.info('stopping load thread') From 3a9bff81db02c5e225451536a8cdd1cbf2edb47e Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 8 Aug 2022 17:53:46 +0300 Subject: [PATCH 0602/1022] Fix etcd typos --- control_plane/src/etcd.rs | 4 ++-- pageserver/src/walreceiver.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs index 0123d9c491..ccadfa8ce7 100644 --- a/control_plane/src/etcd.rs +++ b/control_plane/src/etcd.rs @@ -30,14 +30,14 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { let etcd_stdout_file = fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| { format!( - "Failed to create ectd stout file in directory {}", + "Failed to create etcd stout file in directory {}", 
etcd_data_dir.display() ) })?; let etcd_stderr_file = fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| { format!( - "Failed to create ectd stderr file in directory {}", + "Failed to create etcd stderr file in directory {}", etcd_data_dir.display() ) })?; diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 43bb3fa971..8a466a8a67 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -66,7 +66,7 @@ pub fn init_wal_receiver_main_thread( ); let broker_prefix = &conf.broker_etcd_prefix; info!( - "Starting wal receiver main thread, etdc endpoints: {}", + "Starting wal receiver main thread, etcd endpoints: {}", etcd_endpoints.iter().map(Url::to_string).join(", ") ); From 32fd709b34dda98db617e4b579e3a732d0e40ef0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 9 Aug 2022 06:19:18 +0300 Subject: [PATCH 0603/1022] Fix links to safekeeper protocol docs. (#2188) safekeeper/README_PROTO.md was moved to docs/safekeeper-protocol.md in commit 0b14fdb078, as part of reorganizing the docs into 'mdbook' format. Fixes issue #1475. Thanks to @banks for spotting the outdated references. In addition to fixing the above issue, this patch also fixes other broken links as a result of 0b14fdb078. See https://github.com/neondatabase/neon/pull/2188#pullrequestreview-1055918480. Co-authored-by: Heikki Linnakangas Co-authored-by: Thang Pham --- docs/SUMMARY.md | 2 -- docs/glossary.md | 4 ++-- docs/pageserver-services.md | 4 +--- docs/sourcetree.md | 4 ++-- docs/walservice.md | 4 ++-- pageserver/src/layered_repository.rs | 2 +- 6 files changed, 8 insertions(+), 12 deletions(-) diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index cf29ee3c6a..95ac512ea8 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -52,10 +52,8 @@ - [multitenancy.md](./multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI. - [settings.md](./settings.md) #FIXME: move these under sourcetree.md -#- [pageserver/README.md](/pageserver/README.md) #- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) #- [test_runner/README.md](/test_runner/README.md) -#- [safekeeper/README.md](/safekeeper/README.md) # RFCs diff --git a/docs/glossary.md b/docs/glossary.md index 7aeae27a39..665596c68d 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -75,7 +75,7 @@ layer's Segment and range of LSNs. There are two kinds of layers, in-memory and on-disk layers. In-memory layers are used to ingest incoming WAL, and provide fast access to the recent page versions. On-disk layers are stored as files on disk, and -are immutable. See pageserver/src/layered_repository/README.md for more. +are immutable. See [pageserver-storage.md](./pageserver-storage.md) for more. ### Layer file (on-disk layer) @@ -111,7 +111,7 @@ PostgreSQL LSNs and functions to monitor them: * `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically. [source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html): -Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md) +Neon safekeeper LSNs. See [safekeeper protocol section](safekeeper-protocol.md) for more information. * `CommitLSN`: position in WAL confirmed by quorum safekeepers. * `RestartLSN`: position in WAL confirmed by all safekeepers. * `FlushLSN`: part of WAL persisted to the disk by safekeeper. 
diff --git a/docs/pageserver-services.md b/docs/pageserver-services.md index 4e85413513..07a91f543d 100644 --- a/docs/pageserver-services.md +++ b/docs/pageserver-services.md @@ -68,8 +68,6 @@ There are the following implementations present: * local filesystem — to use in tests mainly * AWS S3 - to use in production -Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs, parameters documentation can be found at [settings docs](../docs/settings.md). - The backup service is disabled by default and can be enabled to interact with a single remote storage. CLI examples: @@ -118,7 +116,7 @@ implemented by the LayeredRepository object in `layered_repository.rs`. There is only that one implementation of the Repository trait, but it's still a useful abstraction that keeps the interface for the low-level storage functionality clean. The layered -storage format is described in layered_repository/README.md. +storage format is described in [pageserver-storage.md](./pageserver-storage.md). Each repository consists of multiple Timelines. Timeline is a workhorse that accepts page changes from the WAL, and serves diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 05eaa96938..39f7be89a0 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -28,7 +28,7 @@ The pageserver has a few different duties: - Receive WAL from the WAL service and decode it. - Replay WAL that's applicable to the chunks that the Page Server maintains -For more detailed info, see [/pageserver/README](/pageserver/README.md) +For more detailed info, see [pageserver-services.md](./pageserver-services.md) `/proxy`: @@ -57,7 +57,7 @@ PostgreSQL extension that contains functions needed for testing and debugging. The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver. It acts as a holding area and redistribution center for recently generated WAL. -For more detailed info, see [/safekeeper/README](/safekeeper/README.md) +For more detailed info, see [walservice.md](./walservice.md) `/workspace_hack`: The workspace_hack crate exists only to pin down some dependencies. diff --git a/docs/walservice.md b/docs/walservice.md index 7b217ddbec..4e6db0c5a4 100644 --- a/docs/walservice.md +++ b/docs/walservice.md @@ -75,8 +75,8 @@ safekeepers. The Paxos and crash recovery algorithm ensures that only one primary node can be actively streaming WAL to the quorum of safekeepers. -See README_PROTO.md for a more detailed description of the consensus -protocol. spec/ contains TLA+ specification of it. +See [this section](safekeeper-protocol.md) for a more detailed description of +the consensus protocol. spec/ contains TLA+ specification of it. # Q&A diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 79a180c4cf..271e62a4f9 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -5,7 +5,7 @@ //! get/put call, walking back the timeline branching history as needed. //! //! The files are stored in the .neon/tenants//timelines/ -//! directory. See layered_repository/README for how the files are managed. +//! directory. See docs/pageserver-storage.md for how the files are managed. //! In addition to the layer files, there is a metadata file in the same //! directory that contains information about the timeline, in particular its //! parent timeline, and the last LSN that has been written to disk. 
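
The glossary hunk above lists the PostgreSQL functions used to monitor WAL positions from a compute node. For illustration only, the following Python sketch (not part of any patch in this series) queries them with psycopg2; the connection string is a placeholder assumption, and the safekeeper-side LSNs (CommitLSN, RestartLSN, FlushLSN) are tracked by the safekeepers themselves and are not visible through these functions.

import psycopg2

def show_wal_positions(connstr: str = "postgresql://localhost:5432/postgres"):
    # Placeholder connection string; point it at a running compute node.
    with psycopg2.connect(connstr) as conn:
        with conn.cursor() as cur:
            # On a primary: current insert and flush positions of the WAL.
            cur.execute("SELECT pg_current_wal_insert_lsn(), pg_current_wal_flush_lsn()")
            print("insert_lsn, flush_lsn:", cur.fetchone())
            # On a standby (returns NULL on a primary): last replayed position.
            cur.execute("SELECT pg_last_wal_replay_lsn()")
            print("replay_lsn:", cur.fetchone()[0])

if __name__ == "__main__":
    show_wal_positions()
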
From 0290893bcccfdfba1753a25a20bda0755ce16332 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Fri, 5 Aug 2022 14:56:34 +0300 Subject: [PATCH 0604/1022] Update CONTRIBUTING.md --- CONTRIBUTING.md | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a03cfdda48..f1fba50491 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,18 +11,13 @@ than it was before. ## Submitting changes -1. Make a PR for every change. - - Even seemingly trivial patches can break things in surprising ways. -Use of common sense is OK. If you're only fixing a typo in a comment, -it's probably fine to just push it. But if in doubt, open a PR. - -2. Get at least one +1 on your PR before you push. +1. Get at least one +1 on your PR before you push. For simple patches, it will only take a minute for someone to review -it. +it. PR force-push breaks unadressed review comments, so it is better to +just commit follow ups and squash later during the merge. -3. Always keep the CI green. +2. Always keep the CI green. Do not push, if the CI failed on your PR. Even if you think it's not your patch's fault. Help to fix the root cause if something else has From 227d47d2f3e5c69e7cd400dcbdfdfc38730b7969 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Fri, 5 Aug 2022 16:14:33 +0300 Subject: [PATCH 0605/1022] Update CONTRIBUTING.md --- CONTRIBUTING.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f1fba50491..43ebefc477 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,10 +14,13 @@ than it was before. 1. Get at least one +1 on your PR before you push. For simple patches, it will only take a minute for someone to review -it. PR force-push breaks unadressed review comments, so it is better to -just commit follow ups and squash later during the merge. +it. -2. Always keep the CI green. +2. Don't force push small changes after making the PR ready for review. +Doing so will force readers to re-read your entire PR, which will delay +the review process. + +3. Always keep the CI green. Do not push, if the CI failed on your PR. Even if you think it's not your patch's fault. Help to fix the root cause if something else has From 1fc761983f0e2fdd01b72dc5e97fcabcae3fa14f Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 1 Aug 2022 22:37:26 +0300 Subject: [PATCH 0606/1022] support node id and remote storage params in docker_entrypoint.sh --- docker-entrypoint.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 6bcbc76551..75dbdaed7a 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -1,6 +1,8 @@ #!/bin/sh set -eux +pageserver_id_param="${NODE_ID:-10}" + broker_endpoints_param="${BROKER_ENDPOINT:-absent}" if [ "$broker_endpoints_param" != "absent" ]; then broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']" @@ -8,10 +10,12 @@ else broker_endpoints_param='' fi +remote_storage_param="${REMOTE_STORAGE:-}" + if [ "$1" = 'pageserver' ]; then if [ ! 
-d "/data/tenants" ]; then echo "Initializing pageserver data directory" - pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" $broker_endpoints_param + pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=${pageserver_id_param}" $broker_endpoints_param $remote_storage_param fi echo "Staring pageserver at 0.0.0.0:6400" pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data From 4227cfc96eda8ddde3c4b4df51efac6f0f1fc8c0 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 9 Aug 2022 22:45:33 +0300 Subject: [PATCH 0607/1022] Safe truncate (#2218) * Move relation sie cache to layered timeline * Fix obtaining current LSN for relation size cache * Resolve merge conflicts * Resolve merge conflicts * Reestore 'lsn' field in DatadirModification * adjust DatadirModification lsn in ingest_record * Fix formatting * Pass lsn to get_relsize * Fix merge conflict * Update pageserver/src/pgdatadir_mapping.rs Co-authored-by: Heikki Linnakangas * Update pageserver/src/pgdatadir_mapping.rs Co-authored-by: Heikki Linnakangas * Check if relation exists before trying to truncat it refer #1932 * Add test reporducing FSM truncate problem Co-authored-by: Heikki Linnakangas --- pageserver/src/pgdatadir_mapping.rs | 25 +++++++++++-------- test_runner/batch_others/test_fsm_truncate.py | 11 ++++++++ vendor/postgres | 2 +- 3 files changed, 27 insertions(+), 11 deletions(-) create mode 100644 test_runner/batch_others/test_fsm_truncate.py diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 9097a08d05..827bd29ded 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -708,20 +708,25 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { /// Truncate relation pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { ensure!(rel.relnode != 0, "invalid relnode"); - let size_key = rel_size_to_key(rel); + let last_lsn = self.tline.get_last_record_lsn(); + if self.tline.get_rel_exists(rel, last_lsn)? { + let size_key = rel_size_to_key(rel); + // Fetch the old size first + let old_size = self.get(size_key)?.get_u32_le(); - // Fetch the old size first - let old_size = self.get(size_key)?.get_u32_le(); + // Update the entry with the new size. + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); - // Update the entry with the new size. - let buf = nblocks.to_le_bytes(); - self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + // Update relation size cache + self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - // Update relation size cache - self.tline.set_cached_rel_size(rel, self.lsn, nblocks); + // Update relation size cache + self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - // Update logical database size. - self.pending_nblocks -= old_size as isize - nblocks as isize; + // Update logical database size. 
+ self.pending_nblocks -= old_size as isize - nblocks as isize; + } Ok(()) } diff --git a/test_runner/batch_others/test_fsm_truncate.py b/test_runner/batch_others/test_fsm_truncate.py new file mode 100644 index 0000000000..0f85942598 --- /dev/null +++ b/test_runner/batch_others/test_fsm_truncate.py @@ -0,0 +1,11 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient +import pytest + + +def test_fsm_truncate(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_fsm_truncate") + pg = env.postgres.create_start('test_fsm_truncate') + pg.safe_psql( + 'CREATE TABLE t1(key int); CREATE TABLE t2(key int); TRUNCATE TABLE t1; TRUNCATE TABLE t2;') diff --git a/vendor/postgres b/vendor/postgres index 5280b6fe10..bc6dcc493c 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 5280b6fe1027afd5a7e14c142913d9fdf9e2b442 +Subproject commit bc6dcc493c977f3b06ad95abf493273a693b0e12 From 7a36d06cc212e8cd615709695fb3f670c7e53174 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 10 Aug 2022 23:47:14 +0300 Subject: [PATCH 0608/1022] Fix exponential backoff values --- pageserver/src/timelines.rs | 2 +- .../src/walreceiver/connection_manager.rs | 51 +++++++++++++++++-- 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 42cb6cb156..6002e8b2d9 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -232,7 +232,7 @@ pub(crate) fn create_timeline( return Ok(None); } - let _new_timeline = match ancestor_timeline_id { + match ancestor_timeline_id { Some(ancestor_timeline_id) => { let ancestor_timeline = repo .get_timeline_load(ancestor_timeline_id) diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index ae1c787517..5b6a211566 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -233,13 +233,23 @@ async fn subscribe_for_timeline_updates( const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; -async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) { +fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { if n == 0 { - return; + 0.0 + } else { + (1.0 + base_increment).powf(f64::from(n)).min(max_seconds) + } +} + +async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { + let backoff_duration_seconds = + exponential_backoff_duration_seconds(n, base_increment, max_seconds); + if backoff_duration_seconds > 0.0 { + info!( + "Backoff: waiting {backoff_duration_seconds} seconds before proceeding with the task", + ); + tokio::time::sleep(Duration::from_secs_f64(backoff_duration_seconds)).await; } - let seconds_to_wait = base.powf(f64::from(n) - 1.0).min(max_seconds); - info!("Backoff: waiting {seconds_to_wait} seconds before proceeding with the task"); - tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; } /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. 
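
The hunks above and below replace the old backoff computation: the delay for retry n is now (1 + base_increment)^n seconds, capped at max_seconds, with no wait at all for the first attempt (n = 0). The Python snippet below is only a transcription of that arithmetic to show how the delays grow and saturate under the defaults (base 0.1, cap 3.0 seconds); the Rust code in connection_manager.rs remains the authoritative implementation.

def exponential_backoff_duration_seconds(n: int, base_increment: float, max_seconds: float) -> float:
    # n == 0 is the first attempt: no waiting at all.
    if n == 0:
        return 0.0
    # Each further retry multiplies the delay by (1 + base_increment), capped at max_seconds.
    return min((1.0 + base_increment) ** n, max_seconds)

if __name__ == "__main__":
    BASE, MAX = 0.1, 3.0  # mirrors DEFAULT_BASE_BACKOFF_SECONDS / DEFAULT_MAX_BACKOFF_SECONDS
    delays = [exponential_backoff_duration_seconds(n, BASE, MAX) for n in range(15)]
    # Prints a non-decreasing sequence that reaches the 3.0 s cap, which is exactly
    # the property asserted by backoff_defaults_produce_growing_backoff_sequence below.
    print(delays)
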
@@ -1217,3 +1227,34 @@ mod tests { } } } + +#[cfg(test)] +mod backoff_defaults_tests { + use super::*; + + #[test] + fn backoff_defaults_produce_growing_backoff_sequence() { + let mut current_backoff_value = None; + + for i in 0..10_000 { + let new_backoff_value = exponential_backoff_duration_seconds( + i, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ); + + if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) { + assert!( + old_backoff_value <= new_backoff_value, + "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}" + ) + } + } + + assert_eq!( + current_backoff_value.expect("Should have produced backoff values to compare"), + DEFAULT_MAX_BACKOFF_SECONDS, + "Given big enough of retries, backoff should reach its allowed max value" + ); + } +} From 4b9e02be45b0e115ebeba78ea2f3c9b7fbe5c3b7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 11 Aug 2022 19:25:08 +0300 Subject: [PATCH 0609/1022] Update back `vendor/postgres` back; it was changed accidentally. (#2251) Commit 4227cfc96e accidentally reverted vendor/postgres to an older version. Update it back. --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index bc6dcc493c..0a9045c9ff 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit bc6dcc493c977f3b06ad95abf493273a693b0e12 +Subproject commit 0a9045c9ff2c0833fd7f32571833ebbdf037353d From e593cbaabafafb6f54c9ddcd5d1d9f04d1bd4490 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 8 Aug 2022 12:47:50 +0300 Subject: [PATCH 0610/1022] Add pageserver checkpoint_timeout option. To flush inmemory layer eventually when no new data arrives, which helps safekeepers to suspend activity (stop pushing to the broker). Default 10m should be ok. 
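
A condensed sketch of the flush-trigger rule this patch adds to check_checkpoint_distance() may help (see the timeline.rs hunk below): a flush is started when enough WAL has accumulated since the last frozen layer, when the open in-memory layer has grown too large, or when there is some new data and checkpoint_timeout has elapsed since the last flush. The Python helper below is illustrative only; the class and its method are not part of the pageserver.

from dataclasses import dataclass

@dataclass
class FlushPolicy:
    checkpoint_distance: int   # bytes of WAL between forced flushes (default 256 MiB)
    checkpoint_timeout: float  # seconds of inactivity before flushing anyway (default 600)

    def should_flush(self, wal_bytes_since_last_flush: int,
                     open_layer_size: int,
                     seconds_since_last_flush: float) -> bool:
        # Flush when enough WAL has accumulated since the last frozen layer ...
        if wal_bytes_since_last_flush >= self.checkpoint_distance:
            return True
        # ... or the in-memory layer itself has grown too large ...
        if open_layer_size > self.checkpoint_distance:
            return True
        # ... or there is *some* new data and the timeout has elapsed, so WAL is
        # eventually flushed (and uploaded) even when activity has stopped and
        # safekeepers can suspend.
        return (wal_bytes_since_last_flush > 0
                and seconds_since_last_flush >= self.checkpoint_timeout)

policy = FlushPolicy(checkpoint_distance=256 * 1024 * 1024, checkpoint_timeout=600.0)
assert policy.should_flush(4096, 4096, 601.0)   # idle-timeout path
assert not policy.should_flush(0, 0, 10_000.0)  # nothing new: never flush
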
--- control_plane/src/storage.rs | 2 ++ docs/settings.md | 12 +++++-- pageserver/src/config.rs | 8 +++++ pageserver/src/http/models.rs | 3 ++ pageserver/src/http/openapi_spec.yml | 4 +++ pageserver/src/http/routes.rs | 9 ++++++ pageserver/src/layered_repository.rs | 7 +++++ pageserver/src/layered_repository/timeline.rs | 31 +++++++++++++++---- pageserver/src/page_service.rs | 7 +++++ pageserver/src/repository.rs | 1 + pageserver/src/tenant_config.rs | 14 +++++++++ .../src/walreceiver/walreceiver_connection.rs | 16 ++++------ safekeeper/src/safekeeper.rs | 6 +++- safekeeper/src/timeline.rs | 2 +- test_runner/batch_others/test_wal_acceptor.py | 5 ++- 15 files changed, 106 insertions(+), 21 deletions(-) diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index c2ed3fc824..d2742e84bb 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -401,6 +401,7 @@ impl PageServerNode { .get("checkpoint_distance") .map(|x| x.parse::()) .transpose()?, + checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()), compaction_target_size: settings .get("compaction_target_size") .map(|x| x.parse::()) @@ -455,6 +456,7 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'checkpoint_distance' as an integer")?, + checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()), compaction_target_size: settings .get("compaction_target_size") .map(|x| x.parse::()) diff --git a/docs/settings.md b/docs/settings.md index f2aaab75a8..5a0e976b47 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -15,7 +15,7 @@ listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' checkpoint_distance = '268435456' # in bytes -checkpoint_period = '1 s' +checkpoint_timeout = '10m' gc_period = '100 s' gc_horizon = '67108864' @@ -46,7 +46,7 @@ Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#ta All values can be passed as an argument to the pageserver binary, using the `-c` parameter and specified as a valid TOML string. All tables should be passed in the inline form. -Example: `${PAGESERVER_BIN} -c "checkpoint_period = '100 s'" -c "remote_storage={local_path='/some/local/path/'}"` +Example: `${PAGESERVER_BIN} -c "checkpoint_timeout = '10 m'" -c "remote_storage={local_path='/some/local/path/'}"` Note that TOML distinguishes between strings and integers, the former require single or double quotes around them. @@ -82,6 +82,14 @@ S3. The unit is # of bytes. +#### checkpoint_timeout + +Apart from `checkpoint_distance`, open layer flushing is also triggered +`checkpoint_timeout` after the last flush. This makes WAL eventually uploaded to +s3 when activity is stopped. + +The default is 10m. 
+ #### compaction_period Every `compaction_period` seconds, the page server checks if diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 01b626e046..c1c4169e14 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -59,6 +59,7 @@ pub mod defaults { # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes +#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} #compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes #compaction_period = '{DEFAULT_COMPACTION_PERIOD}' #compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}' @@ -452,6 +453,13 @@ impl PageServerConf { Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?); } + if let Some(checkpoint_timeout) = item.get("checkpoint_timeout") { + t_conf.checkpoint_timeout = Some(parse_toml_duration( + "checkpoint_timeout", + checkpoint_timeout, + )?); + } + if let Some(compaction_target_size) = item.get("compaction_target_size") { t_conf.compaction_target_size = Some(parse_toml_u64( "compaction_target_size", diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index aee31f14a7..a4f270580f 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -32,6 +32,7 @@ pub struct TenantCreateRequest { #[serde_as(as = "Option")] pub new_tenant_id: Option, pub checkpoint_distance: Option, + pub checkpoint_timeout: Option, pub compaction_target_size: Option, pub compaction_period: Option, pub compaction_threshold: Option, @@ -70,6 +71,7 @@ pub struct TenantConfigRequest { #[serde(default)] #[serde_as(as = "Option")] pub checkpoint_distance: Option, + pub checkpoint_timeout: Option, pub compaction_target_size: Option, pub compaction_period: Option, pub compaction_threshold: Option, @@ -87,6 +89,7 @@ impl TenantConfigRequest { TenantConfigRequest { tenant_id, checkpoint_distance: None, + checkpoint_timeout: None, compaction_target_size: None, compaction_period: None, compaction_threshold: None, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 106c14fbc8..fc3e80ba19 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -560,6 +560,8 @@ components: type: string checkpoint_distance: type: integer + checkpoint_timeout: + type: string compaction_period: type: string compaction_threshold: @@ -578,6 +580,8 @@ components: type: string checkpoint_distance: type: integer + checkpoint_timeout: + type: string compaction_period: type: string compaction_threshold: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index fa598de402..1b1b4f99cb 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -623,6 +623,11 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .checkpoint_timeout + .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) + } + pub fn get_compaction_target_size(&self) -> u64 { let tenant_conf = self.tenant_conf.read().unwrap(); tenant_conf diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 73877a6656..2d396024a0 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -16,7 +16,7 @@ use std::ops::{Deref, Range}; use std::path::PathBuf; use std::sync::atomic::{self, AtomicBool, AtomicIsize, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, 
RwLockReadGuard, TryLockError}; -use std::time::{Duration, SystemTime}; +use std::time::{Duration, Instant, SystemTime}; use metrics::{ register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, @@ -233,6 +233,8 @@ pub struct LayeredTimeline { pub layers: RwLock, last_freeze_at: AtomicLsn, + // Atomic would be more appropriate here. + last_freeze_ts: RwLock, // WAL redo manager walredo_mgr: Arc, @@ -560,6 +562,13 @@ impl LayeredTimeline { .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } + fn get_checkpoint_timeout(&self) -> Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .checkpoint_timeout + .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) + } + fn get_compaction_target_size(&self) -> u64 { let tenant_conf = self.tenant_conf.read().unwrap(); tenant_conf @@ -649,6 +658,7 @@ impl LayeredTimeline { disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0), + last_freeze_ts: RwLock::new(Instant::now()), ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), @@ -1094,8 +1104,11 @@ impl LayeredTimeline { } /// - /// Check if more than 'checkpoint_distance' of WAL has been accumulated - /// in the in-memory layer, and initiate flushing it if so. + /// Check if more than 'checkpoint_distance' of WAL has been accumulated in + /// the in-memory layer, and initiate flushing it if so. + /// + /// Also flush after a period of time without new data -- it helps + /// safekeepers to regard pageserver as caught up and suspend activity. /// pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { let last_lsn = self.get_last_record_lsn(); @@ -1103,21 +1116,27 @@ impl LayeredTimeline { if let Some(open_layer) = &layers.open_layer { let open_layer_size = open_layer.size()?; drop(layers); - let distance = last_lsn.widening_sub(self.last_freeze_at.load()); + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); + let distance = last_lsn.widening_sub(last_freeze_at); // Checkpointing the open layer can be triggered by layer size or LSN range. // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and // we want to stay below that with a big margin. The LSN distance determines how // much WAL the safekeepers need to store. if distance >= self.get_checkpoint_distance().into() || open_layer_size > self.get_checkpoint_distance() + || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) { info!( - "check_checkpoint_distance {}, layer size {}", - distance, open_layer_size + "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", + distance, + open_layer_size, + last_freeze_ts.elapsed() ); self.freeze_inmem_layer(true); self.last_freeze_at.store(last_lsn); + *(self.last_freeze_ts.write().unwrap()) = Instant::now(); // Launch a thread to flush the frozen layer to disk, unless // a thread was already running. 
(If the thread was running diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 75df744014..3c5ea5267e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1044,6 +1044,7 @@ impl postgres_backend::Handler for PageServerHandler { let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), + RowDescriptor::int8_col(b"checkpoint_timeout"), RowDescriptor::int8_col(b"compaction_target_size"), RowDescriptor::int8_col(b"compaction_period"), RowDescriptor::int8_col(b"compaction_threshold"), @@ -1054,6 +1055,12 @@ impl postgres_backend::Handler for PageServerHandler { ]))? .write_message_noflush(&BeMessage::DataRow(&[ Some(repo.get_checkpoint_distance().to_string().as_bytes()), + Some( + repo.get_checkpoint_timeout() + .as_secs() + .to_string() + .as_bytes(), + ), Some(repo.get_compaction_target_size().to_string().as_bytes()), Some( repo.get_compaction_period() diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 3fae0184f9..a1a08b11d5 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -445,6 +445,7 @@ pub mod repo_harness { fn from(tenant_conf: TenantConf) -> Self { Self { checkpoint_distance: Some(tenant_conf.checkpoint_distance), + checkpoint_timeout: Some(tenant_conf.checkpoint_timeout), compaction_target_size: Some(tenant_conf.compaction_target_size), compaction_period: Some(tenant_conf.compaction_period), compaction_threshold: Some(tenant_conf.compaction_threshold), diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 8811009743..eff5272837 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -23,6 +23,7 @@ pub mod defaults { // which is good for now to trigger bugs. // This parameter actually determines L0 layer file size. pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; + pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; // Target file size, when creating image and delta layers. // This parameter determines L1 layer file size. @@ -48,6 +49,9 @@ pub struct TenantConf { // page server crashes. // This parameter actually determines L0 layer file size. pub checkpoint_distance: u64, + // Inmemory layer is also flushed at least once in checkpoint_timeout to + // eventually upload WAL after activity is stopped. + pub checkpoint_timeout: Duration, // Target file size, when creating image and delta layers. // This parameter determines L1 layer file size. 
pub compaction_target_size: u64, @@ -90,6 +94,7 @@ pub struct TenantConf { #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] pub struct TenantConfOpt { pub checkpoint_distance: Option, + pub checkpoint_timeout: Option, pub compaction_target_size: Option, #[serde(with = "humantime_serde")] pub compaction_period: Option, @@ -113,6 +118,9 @@ impl TenantConfOpt { checkpoint_distance: self .checkpoint_distance .unwrap_or(global_conf.checkpoint_distance), + checkpoint_timeout: self + .checkpoint_timeout + .unwrap_or(global_conf.checkpoint_timeout), compaction_target_size: self .compaction_target_size .unwrap_or(global_conf.compaction_target_size), @@ -142,6 +150,9 @@ impl TenantConfOpt { if let Some(checkpoint_distance) = other.checkpoint_distance { self.checkpoint_distance = Some(checkpoint_distance); } + if let Some(checkpoint_timeout) = other.checkpoint_timeout { + self.checkpoint_timeout = Some(checkpoint_timeout); + } if let Some(compaction_target_size) = other.compaction_target_size { self.compaction_target_size = Some(compaction_target_size); } @@ -181,6 +192,8 @@ impl TenantConf { TenantConf { checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, + checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) + .expect("cannot parse default checkpoint timeout"), compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE, compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period"), @@ -212,6 +225,7 @@ impl TenantConf { pub fn dummy_conf() -> Self { TenantConf { checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, + checkpoint_timeout: Duration::from_secs(600), compaction_target_size: 4 * 1024 * 1024, compaction_period: Duration::from_secs(10), compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD, diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index c4e66bdb95..538ebfe30e 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -178,16 +178,6 @@ pub async fn handle_walreceiver_connection( caught_up = true; } - let timeline_to_check = Arc::clone(&timeline); - tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) - .await - .with_context(|| { - format!("Spawned checkpoint check task panicked for timeline {id}") - })? - .with_context(|| { - format!("Failed to check checkpoint distance for timeline {id}") - })?; - Some(endlsn) } @@ -208,6 +198,12 @@ pub async fn handle_walreceiver_connection( _ => None, }; + let timeline_to_check = Arc::clone(&timeline); + tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) + .await + .with_context(|| format!("Spawned checkpoint check task panicked for timeline {id}"))? 
+ .with_context(|| format!("Failed to check checkpoint distance for timeline {id}"))?; + if let Some(last_lsn) = status_update { let remote_index = repo.get_remote_index(); let timeline_remote_consistent_lsn = remote_index diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index fd4761505d..a9373cb584 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -727,7 +727,7 @@ where info!("setting local_start_lsn to {:?}", state.local_start_lsn); } // Initializing commit_lsn before acking first flushed record is - // important to let find_end_of_wal skip the whole in the beginning + // important to let find_end_of_wal skip the hole in the beginning // of the first segment. // // NB: on new clusters, this happens at the same time as @@ -738,6 +738,10 @@ where // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); + // Initializing remote_consistent_lsn sets that we have nothing to + // stream to pageserver(s) immediately after creation. + self.inmem.remote_consistent_lsn = + max(self.inmem.remote_consistent_lsn, state.timeline_start_lsn); state.acceptor_state.term_history = msg.term_history.clone(); self.persist_control_file(state)?; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index ee642408f2..161fca3595 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -137,7 +137,7 @@ impl SharedState { self.is_wal_backup_required() // FIXME: add tracking of relevant pageservers and check them here individually, // otherwise migration won't work (we suspend too early). - || self.sk.inmem.remote_consistent_lsn <= self.sk.inmem.commit_lsn + || self.sk.inmem.remote_consistent_lsn < self.sk.inmem.commit_lsn } /// Mark timeline active/inactive and return whether s3 offloading requires diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index f7aeb0abeb..b55ba84756 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -284,9 +284,12 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): env.neon_cli.create_branch('test_safekeepers_wal_removal') pg = env.postgres.create_start('test_safekeepers_wal_removal') + # Note: it is important to insert at least two segments, as currently + # control file is synced roughly once in segment range and WAL is not + # removed until all horizons are persisted. 
pg.safe_psql_many([ 'CREATE TABLE t(key int primary key, value text)', - "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + "INSERT INTO t SELECT generate_series(1,200000), 'payload'", ]) tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] From 995a2de21ea66eb1c8d49e1953fb95385baecdf0 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 11 Aug 2022 23:21:06 +0300 Subject: [PATCH 0611/1022] Share exponential backoff code and fix logic for delete task failure (#2252) --- pageserver/src/lib.rs | 53 +++++++++ pageserver/src/storage_sync.rs | 107 ++++++++++-------- .../src/walreceiver/connection_manager.rs | 59 +--------- 3 files changed, 115 insertions(+), 104 deletions(-) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ba912a3702..140260e0d0 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -93,3 +93,56 @@ pub fn shutdown_pageserver(exit_code: i32) { info!("Shut down successfully completed"); std::process::exit(exit_code); } + +const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; +const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; + +async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { + let backoff_duration_seconds = + exponential_backoff_duration_seconds(n, base_increment, max_seconds); + if backoff_duration_seconds > 0.0 { + info!( + "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task", + ); + tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await; + } +} + +fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { + if n == 0 { + 0.0 + } else { + (1.0 + base_increment).powf(f64::from(n)).min(max_seconds) + } +} + +#[cfg(test)] +mod backoff_defaults_tests { + use super::*; + + #[test] + fn backoff_defaults_produce_growing_backoff_sequence() { + let mut current_backoff_value = None; + + for i in 0..10_000 { + let new_backoff_value = exponential_backoff_duration_seconds( + i, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ); + + if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) { + assert!( + old_backoff_value <= new_backoff_value, + "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}" + ) + } + } + + assert_eq!( + current_backoff_value.expect("Should have produced backoff values to compare"), + DEFAULT_MAX_BACKOFF_SECONDS, + "Given big enough of retries, backoff should reach its allowed max value" + ); + } +} diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 222a406c81..d1c8922259 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -172,6 +172,7 @@ use self::{ }; use crate::{ config::PageServerConf, + exponential_backoff, layered_repository::{ ephemeral_file::is_ephemeral_file, metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, @@ -969,14 +970,19 @@ fn storage_sync_loop( } } -// needed to check whether the download happened -// more informative than just a bool #[derive(Debug)] -enum DownloadMarker { +enum DownloadStatus { Downloaded, Nothing, } +#[derive(Debug)] +enum UploadStatus { + Uploaded, + Failed, + Nothing, +} + async fn process_batches( conf: &'static PageServerConf, max_sync_errors: NonZeroU32, @@ -1016,7 +1022,7 @@ where "Finished storage sync task for sync id {sync_id} download marker {:?}", download_marker ); - if matches!(download_marker, DownloadMarker::Downloaded) { + if matches!(download_marker, DownloadStatus::Downloaded) { 
downloaded_timelines.insert(sync_id.tenant_id); } } @@ -1030,7 +1036,7 @@ async fn process_sync_task_batch( max_sync_errors: NonZeroU32, sync_id: ZTenantTimelineId, batch: SyncTaskBatch, -) -> DownloadMarker +) -> DownloadStatus where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, @@ -1047,7 +1053,7 @@ where // When operating in a system without tasks failing over the error threshold, // current batching and task processing systems aim to update the layer set and metadata files (remote and local), // without "losing" such layer files. - let (upload_result, status_update) = tokio::join!( + let (upload_status, download_status) = tokio::join!( async { if let Some(upload_data) = upload_data { match validate_task_retries(upload_data, max_sync_errors) @@ -1065,7 +1071,7 @@ where "upload", ) .await; - return Some(()); + UploadStatus::Uploaded } ControlFlow::Break(failed_upload_data) => { if let Err(e) = update_remote_data( @@ -1082,10 +1088,13 @@ where { error!("Failed to update remote timeline {sync_id}: {e:?}"); } + + UploadStatus::Failed } } + } else { + UploadStatus::Nothing } - None } .instrument(info_span!("upload_timeline_data")), async { @@ -1115,51 +1124,53 @@ where } } } - DownloadMarker::Nothing + DownloadStatus::Nothing } .instrument(info_span!("download_timeline_data")), ); - if let Some(mut delete_data) = batch.delete { - if upload_result.is_some() { - match validate_task_retries(delete_data, max_sync_errors) - .instrument(info_span!("retries_validation")) - .await - { - ControlFlow::Continue(new_delete_data) => { - delete_timeline_data( - conf, - (storage.as_ref(), &index, sync_queue), - sync_id, - new_delete_data, - sync_start, - "delete", - ) - .instrument(info_span!("delete_timeline_data")) - .await; - } - ControlFlow::Break(failed_delete_data) => { - if let Err(e) = update_remote_data( - conf, - storage.as_ref(), - &index, - sync_id, - RemoteDataUpdate::Delete(&failed_delete_data.data.deleted_layers), - ) + if let Some(delete_data) = batch.delete { + match upload_status { + UploadStatus::Uploaded | UploadStatus::Nothing => { + match validate_task_retries(delete_data, max_sync_errors) + .instrument(info_span!("retries_validation")) .await - { - error!("Failed to update remote timeline {sync_id}: {e:?}"); + { + ControlFlow::Continue(new_delete_data) => { + delete_timeline_data( + conf, + (storage.as_ref(), &index, sync_queue), + sync_id, + new_delete_data, + sync_start, + "delete", + ) + .instrument(info_span!("delete_timeline_data")) + .await; + } + ControlFlow::Break(failed_delete_data) => { + if let Err(e) = update_remote_data( + conf, + storage.as_ref(), + &index, + sync_id, + RemoteDataUpdate::Delete(&failed_delete_data.data.deleted_layers), + ) + .await + { + error!("Failed to update remote timeline {sync_id}: {e:?}"); + } } } } - } else { - delete_data.retries += 1; - sync_queue.push(sync_id, SyncTask::Delete(delete_data)); - warn!("Skipping delete task due to failed upload tasks, reenqueuing"); + UploadStatus::Failed => { + warn!("Skipping delete task due to failed upload tasks, reenqueuing"); + sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + } } } - status_update + download_status } async fn download_timeline_data( @@ -1170,7 +1181,7 @@ async fn download_timeline_data( new_download_data: SyncData, sync_start: Instant, task_name: &str, -) -> DownloadMarker +) -> DownloadStatus where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, @@ -1199,7 +1210,7 @@ where Ok(()) => match 
index.write().await.set_awaits_download(&sync_id, false) { Ok(()) => { register_sync_status(sync_id, sync_start, task_name, Some(true)); - return DownloadMarker::Downloaded; + return DownloadStatus::Downloaded; } Err(e) => { error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}"); @@ -1215,7 +1226,7 @@ where } } - DownloadMarker::Nothing + DownloadStatus::Nothing } async fn update_local_metadata( @@ -1493,11 +1504,7 @@ async fn validate_task_retries( return ControlFlow::Break(sync_data); } - if current_attempt > 0 { - let seconds_to_wait = 2.0_f64.powf(current_attempt as f64 - 1.0).min(30.0); - info!("Waiting {seconds_to_wait} seconds before starting the task"); - tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; - } + exponential_backoff(current_attempt, 1.0, 30.0).await; ControlFlow::Continue(sync_data) } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 5b6a211566..09142c4d44 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -25,7 +25,11 @@ use etcd_broker::{ use tokio::select; use tracing::*; -use crate::repository::{Repository, Timeline}; +use crate::{ + exponential_backoff, + repository::{Repository, Timeline}, + DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, +}; use crate::{RepositoryImpl, TimelineImpl}; use utils::{ lsn::Lsn, @@ -230,28 +234,6 @@ async fn subscribe_for_timeline_updates( } } -const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; -const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; - -fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { - if n == 0 { - 0.0 - } else { - (1.0 + base_increment).powf(f64::from(n)).min(max_seconds) - } -} - -async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { - let backoff_duration_seconds = - exponential_backoff_duration_seconds(n, base_increment, max_seconds); - if backoff_duration_seconds > 0.0 { - info!( - "Backoff: waiting {backoff_duration_seconds} seconds before proceeding with the task", - ); - tokio::time::sleep(Duration::from_secs_f64(backoff_duration_seconds)).await; - } -} - /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. 
struct WalreceiverState { id: ZTenantTimelineId, @@ -1227,34 +1209,3 @@ mod tests { } } } - -#[cfg(test)] -mod backoff_defaults_tests { - use super::*; - - #[test] - fn backoff_defaults_produce_growing_backoff_sequence() { - let mut current_backoff_value = None; - - for i in 0..10_000 { - let new_backoff_value = exponential_backoff_duration_seconds( - i, - DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, - ); - - if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) { - assert!( - old_backoff_value <= new_backoff_value, - "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}" - ) - } - } - - assert_eq!( - current_backoff_value.expect("Should have produced backoff values to compare"), - DEFAULT_MAX_BACKOFF_SECONDS, - "Given big enough of retries, backoff should reach its allowed max value" - ); - } -} From dc52436a8f376342b981885e1afb1d5b0fe320aa Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Fri, 12 Aug 2022 09:24:20 +0700 Subject: [PATCH 0612/1022] Fix bug when import large (>1GB) relations (#2172) Resolves #2097 - use timeline modification's `lsn` and timeline's `last_record_lsn` to determine the corresponding LSN to query data in `DatadirModification::get` - update `test_import_from_pageserver`. Split the test into 2 variants: `small` and `multisegment`. + `small` is the old test + `multisegment` is to simulate #2097 by using a larger number of inserted rows to create multiple segment files of a relation. `multisegment` is configured to only run with a `release` build --- .../actions/run-python-test-set/action.yml | 1 + pageserver/src/pgdatadir_mapping.rs | 4 +- test_runner/batch_others/test_import.py | 98 +++++++++++++++---- 3 files changed, 80 insertions(+), 23 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 41f68d63e1..3900f93ee4 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -83,6 +83,7 @@ runs: # this variable will be embedded in perf test report # and is needed to distinguish different environments PLATFORM: github-actions-selfhosted + BUILD_TYPE: ${{ inputs.build_type }} AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} shell: bash -euxo pipefail {0} diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 827bd29ded..113f40302a 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -966,8 +966,8 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { bail!("unexpected pending WAL record"); } } else { - let last_lsn = self.tline.get_last_record_lsn(); - self.tline.get(key, last_lsn) + let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); + self.tline.get(key, lsn) } } diff --git a/test_runner/batch_others/test_import.py b/test_runner/batch_others/test_import.py index 617d4808cc..d4b8b7a153 100644 --- a/test_runner/batch_others/test_import.py +++ b/test_runner/batch_others/test_import.py @@ -1,9 +1,10 @@ +import re import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_upload, wait_for_last_record_lsn -from fixtures.utils import lsn_from_hex, lsn_to_hex +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, Postgres, wait_for_upload, wait_for_last_record_lsn +from fixtures.utils import lsn_from_hex from uuid import UUID, uuid4 -import tarfile import os +import tarfile 
import shutil from pathlib import Path import json @@ -105,20 +106,60 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build @pytest.mark.timeout(600) -def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_builder): - - num_rows = 3000 +def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_import_from_pageserver') - pgmain = env.postgres.create_start('test_import_from_pageserver') - log.info("postgres is running on 'test_import_from_pageserver' branch") + timeline = env.neon_cli.create_branch('test_import_from_pageserver_small') + pg = env.postgres.create_start('test_import_from_pageserver_small') - timeline = pgmain.safe_psql("SHOW neon.timeline_id")[0][0] + num_rows = 3000 + lsn = _generate_data(num_rows, pg) + _import(num_rows, lsn, env, pg_bin, timeline) - with closing(pgmain.connect()) as conn: + +@pytest.mark.timeout(1800) +@pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build") +def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + neon_env_builder.enable_local_fs_remote_storage() + env = neon_env_builder.init_start() + + timeline = env.neon_cli.create_branch('test_import_from_pageserver_multisegment') + pg = env.postgres.create_start('test_import_from_pageserver_multisegment') + + # For `test_import_from_pageserver_multisegment`, we want to make sure that the data + # is large enough to create multi-segment files. Typically, a segment file's size is + # at most 1GB. A large number of inserted rows (`30000000`) is used to increase the + # DB size to above 1GB. Related: https://github.com/neondatabase/neon/issues/2097. + num_rows = 30000000 + lsn = _generate_data(num_rows, pg) + + logical_size = env.pageserver.http_client().timeline_detail( + env.initial_tenant, timeline)['local']['current_logical_size'] + log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB") + assert logical_size > 1024**3 # = 1GB + + tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline) + + # Check if the backup data contains multiple segment files + cnt_seg_files = 0 + segfile_re = re.compile('[0-9]+\\.[0-9]+') + with tarfile.open(tar_output_file, "r") as tar_f: + for f in tar_f.getnames(): + if segfile_re.search(f) is not None: + cnt_seg_files += 1 + log.info(f"Found a segment file: {f} in the backup archive file") + assert cnt_seg_files > 0 + + +def _generate_data(num_rows: int, pg: Postgres) -> str: + """Generate a table with `num_rows` rows. + + Returns: + the latest insert WAL's LSN""" + with closing(pg.connect()) as conn: with conn.cursor() as cur: # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") @@ -127,15 +168,28 @@ def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_bu cur.execute("CHECKPOINT") cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn = cur.fetchone()[0] - log.info(f"start_backup_lsn = {lsn}") + res = cur.fetchone() + assert res is not None and isinstance(res[0], str) + return res[0] + + +def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timeline: UUID) -> str: + """Test importing backup data to the pageserver. 
+ + Args: + expected_num_rows: the expected number of rows of the test table in the backup data + lsn: the backup's base LSN + + Returns: + path to the backup archive file""" + log.info(f"start_backup_lsn = {lsn}") # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. # PgBin sets it automatically, but here we need to pipe psql output to the tar command. psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')} # Get a fullbackup from pageserver - query = f"fullbackup { env.initial_tenant.hex} {timeline} {lsn}" + query = f"fullbackup { env.initial_tenant.hex} {timeline.hex} {lsn}" cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] result_basepath = pg_bin.run_capture(cmd, env=psql_env) tar_output_file = result_basepath + ".stdout" @@ -152,7 +206,7 @@ def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_bu env.pageserver.start() # Import using another tenantid, because we use the same pageserver. - # TODO Create another pageserver to maeke test more realistic. + # TODO Create another pageserver to make test more realistic. tenant = uuid4() # Import to pageserver @@ -165,7 +219,7 @@ def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_bu "--tenant-id", tenant.hex, "--timeline-id", - timeline, + timeline.hex, "--node-name", node_name, "--base-lsn", @@ -175,15 +229,15 @@ def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_bu ]) # Wait for data to land in s3 - wait_for_last_record_lsn(client, tenant, UUID(timeline), lsn_from_hex(lsn)) - wait_for_upload(client, tenant, UUID(timeline), lsn_from_hex(lsn)) + wait_for_last_record_lsn(client, tenant, timeline, lsn_from_hex(lsn)) + wait_for_upload(client, tenant, timeline, lsn_from_hex(lsn)) # Check it worked pg = env.postgres.create_start(node_name, tenant_id=tenant) - assert pg.safe_psql('select count(*) from tbl') == [(num_rows, )] + assert pg.safe_psql('select count(*) from tbl') == [(expected_num_rows, )] # Take another fullbackup - query = f"fullbackup { tenant.hex} {timeline} {lsn}" + query = f"fullbackup { tenant.hex} {timeline.hex} {lsn}" cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] result_basepath = pg_bin.run_capture(cmd, env=psql_env) new_tar_output_file = result_basepath + ".stdout" @@ -195,4 +249,6 @@ def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_bu # Check that gc works psconn = env.pageserver.connect() pscur = psconn.cursor() - pscur.execute(f"do_gc {tenant.hex} {timeline} 0") + pscur.execute(f"do_gc {tenant.hex} {timeline.hex} 0") + + return tar_output_file From 7da47d8a0aa65785c90fb1cdd096f3416b1e49ab Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Fri, 12 Aug 2022 14:28:50 +0700 Subject: [PATCH 0613/1022] Fix timeline physical size flaky tests (#2244) Resolves #2212. - use `wait_for_last_flush_lsn` in `test_timeline_physical_size_*` tests ## Context Need to wait for the pageserver to catch up with the compute's last flush LSN because during the timeline physical size API call, it's possible that there are running `LayerFlushThread` threads. These threads flush new layers into disk and hence update the physical size. This results in a mismatch between the physical size reported by the API and the actual physical size on disk. ### Note The `LayerFlushThread` threads are processed **concurrently**, so it's possible that the above error still persists even with this patch. 
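
Condensing the pattern the diff below applies to the physical-size tests: flush data on the compute, wait for the pageserver to reach the compute's flush LSN, and only then ask the pageserver to checkpoint and compare sizes. The sketch assumes the usual test harness objects and an existing table `foo`, as in test_timeline_size.py; `wait_for_last_flush_lsn` is the fixture added in this patch.

import uuid
from fixtures.neon_fixtures import NeonEnv, Postgres, wait_for_last_flush_lsn

def checkpoint_after_ingest(env: NeonEnv, pg: Postgres, tenant: uuid.UUID, timeline: uuid.UUID):
    # Assumes the table `foo` was created earlier, as in test_timeline_size.py.
    pg.safe_psql("INSERT INTO foo SELECT 'some longer string' || g FROM generate_series(1, 1000) g")
    # Without this wait, a LayerFlushThread may still be writing layers while the
    # size is queried, so the reported physical size can disagree with the on-disk size.
    wait_for_last_flush_lsn(env, pg, tenant, timeline)
    # Only checkpoint (and later compare sizes) once the pageserver has caught up.
    env.pageserver.safe_psql(f"checkpoint {tenant.hex} {timeline.hex}")
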
However, making the tests wait to finish processing all the WALs (not flushing) before calculating the physical size should help reduce the "flakiness" significantly --- test_runner/batch_others/test_timeline_size.py | 16 +++++++++++++++- test_runner/fixtures/neon_fixtures.py | 6 ++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index c736893f99..6e1168e38f 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -4,7 +4,7 @@ from uuid import UUID import re import psycopg2.extras import psycopg2.errors -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local, wait_for_last_flush_lsn from fixtures.log_helper import log import time @@ -192,6 +192,8 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv): FROM generate_series(1, 1000) g""", ]) + wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) + # restart the pageserer to force calculating timeline's initial physical size env.pageserver.stop() env.pageserver.start() @@ -211,7 +213,9 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): FROM generate_series(1, 1000) g""", ]) + wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -232,8 +236,10 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder FROM generate_series(1, 100000) g""", ]) + wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") env.pageserver.safe_psql(f"compact {env.initial_tenant.hex} {new_timeline_id.hex}") + assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -254,15 +260,21 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g""", ]) + + wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + pg.safe_psql(""" INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g """) + + wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") env.pageserver.safe_psql(f"do_gc {env.initial_tenant.hex} {new_timeline_id.hex} 0") + assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -279,6 +291,7 @@ def test_timeline_physical_size_metric(neon_simple_env: NeonEnv): FROM generate_series(1, 100000) g""", ]) + wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") # get the metrics and parse the metric for the current timeline's physical size @@ -319,6 +332,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): f"INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, {n_rows}) g", ]) + wait_for_last_flush_lsn(env, pg, tenant, timeline) env.pageserver.safe_psql(f"checkpoint {tenant.hex} {timeline.hex}") timeline_total_size += 
get_timeline_physical_size(timeline) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3b87f290b8..d5b0af3813 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2475,3 +2475,9 @@ def wait_for_last_record_lsn(pageserver_http_client: NeonPageserverHttpClient, time.sleep(1) raise Exception("timed out while waiting for last_record_lsn to reach {}, was {}".format( lsn_to_hex(lsn), lsn_to_hex(current_lsn))) + + +def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: uuid.UUID, timeline: uuid.UUID): + """Wait for pageserver to catch up the latest flush LSN""" + last_flush_lsn = lsn_from_hex(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) From 142e247e8542edc7eec2f85ed9b79623ea33a9eb Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 23 Jun 2022 16:53:29 +0300 Subject: [PATCH 0614/1022] postgres_ffi/waldecoder: validate more header fields --- libs/postgres_ffi/src/waldecoder.rs | 56 +++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index 7a69f471d9..db69fb3954 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -13,6 +13,7 @@ use super::xlog_utils::*; use super::XLogLongPageHeaderData; use super::XLogPageHeaderData; use super::XLogRecord; +use super::XLOG_PAGE_MAGIC; use bytes::{Buf, BufMut, Bytes, BytesMut}; use crc32c::*; use log::*; @@ -67,6 +68,45 @@ impl WalStreamDecoder { self.inputbuf.extend_from_slice(buf); } + fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> { + let validate_impl = || { + if hdr.xlp_magic != XLOG_PAGE_MAGIC as u16 { + return Err(format!( + "invalid xlog page header: xlp_magic={}, expected {}", + hdr.xlp_magic, XLOG_PAGE_MAGIC + )); + } + if hdr.xlp_pageaddr != self.lsn.0 { + return Err(format!( + "invalid xlog page header: xlp_pageaddr={}, expected {}", + hdr.xlp_pageaddr, self.lsn + )); + } + if self.contlen == 0 { + if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD != 0 { + return Err( + "invalid xlog page header: unexpected XLP_FIRST_IS_CONTRECORD".into(), + ); + } + } else { + if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD == 0 { + return Err( + "invalid xlog page header: XLP_FIRST_IS_CONTRECORD expected, not found" + .into(), + ); + } + } + if hdr.xlp_rem_len != self.contlen { + return Err(format!( + "invalid xlog page header: xlp_rem_len={}, expected {}", + hdr.xlp_rem_len, self.contlen + )); + } + Ok(()) + }; + validate_impl().map_err(|msg| WalDecodeError { msg, lsn: self.lsn }) + } + /// Attempt to decode another WAL record from the input that has been fed to the /// decoder so far. 
/// @@ -106,13 +146,7 @@ impl WalStreamDecoder { } })?; - if hdr.std.xlp_pageaddr != self.lsn.0 { - return Err(WalDecodeError { - msg: "invalid xlog segment header".into(), - lsn: self.lsn, - }); - } - // TODO: verify the remaining fields in the header + self.validate_page_header(&hdr.std)?; self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64; continue; @@ -128,13 +162,7 @@ impl WalStreamDecoder { } })?; - if hdr.xlp_pageaddr != self.lsn.0 { - return Err(WalDecodeError { - msg: "invalid xlog page header".into(), - lsn: self.lsn, - }); - } - // TODO: verify the remaining fields in the header + self.validate_page_header(&hdr)?; self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; continue; From 07bb7a2afeb7ca188f8efc1601551cf87647abaf Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Fri, 24 Jun 2022 20:39:45 +0300 Subject: [PATCH 0615/1022] postgres_ffi/waldecoder: remove unused startlsn --- libs/postgres_ffi/src/waldecoder.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index db69fb3954..d4b7efbac4 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -24,7 +24,6 @@ use utils::lsn::Lsn; pub struct WalStreamDecoder { lsn: Lsn, - startlsn: Lsn, // LSN where this record starts contlen: u32, padlen: u32, @@ -50,7 +49,6 @@ impl WalStreamDecoder { WalStreamDecoder { lsn, - startlsn: Lsn(0), contlen: 0, padlen: 0, @@ -176,7 +174,6 @@ impl WalStreamDecoder { // peek xl_tot_len at the beginning of the record. // FIXME: assumes little-endian - self.startlsn = self.lsn; let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le(); if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD { return Err(WalDecodeError { From a7bf60631f4dec85476abb262564cb99e6bde07b Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Sat, 25 Jun 2022 02:40:42 +0300 Subject: [PATCH 0616/1022] postgres_ffi/waldecoder: introduce explicit `enum State` Previously it was emulated with a combination of nullable fields. This change should make the logic more readable. --- libs/postgres_ffi/src/waldecoder.rs | 277 +++++++++++++++------------- 1 file changed, 150 insertions(+), 127 deletions(-) diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index d4b7efbac4..cbb761236c 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -18,19 +18,25 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; use crc32c::*; use log::*; use std::cmp::min; +use std::num::NonZeroU32; use thiserror::Error; use utils::lsn::Lsn; +enum State { + WaitingForRecord, + ReassemblingRecord { + recordbuf: BytesMut, + contlen: NonZeroU32, + }, + SkippingEverything { + skip_until_lsn: Lsn, + }, +} + pub struct WalStreamDecoder { lsn: Lsn, - - contlen: u32, - padlen: u32, - inputbuf: BytesMut, - - /// buffer used to reassemble records that cross page boundaries. 
- recordbuf: BytesMut, + state: State, } #[derive(Error, Debug, Clone)] @@ -48,12 +54,8 @@ impl WalStreamDecoder { pub fn new(lsn: Lsn) -> WalStreamDecoder { WalStreamDecoder { lsn, - - contlen: 0, - padlen: 0, - inputbuf: BytesMut::new(), - recordbuf: BytesMut::new(), + state: State::WaitingForRecord, } } @@ -80,26 +82,39 @@ impl WalStreamDecoder { hdr.xlp_pageaddr, self.lsn )); } - if self.contlen == 0 { - if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD != 0 { - return Err( - "invalid xlog page header: unexpected XLP_FIRST_IS_CONTRECORD".into(), - ); + match self.state { + State::WaitingForRecord => { + if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD != 0 { + return Err( + "invalid xlog page header: unexpected XLP_FIRST_IS_CONTRECORD".into(), + ); + } + if hdr.xlp_rem_len != 0 { + return Err(format!( + "invalid xlog page header: xlp_rem_len={}, but it's not a contrecord", + hdr.xlp_rem_len + )); + } } - } else { - if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD == 0 { - return Err( - "invalid xlog page header: XLP_FIRST_IS_CONTRECORD expected, not found" - .into(), - ); + State::ReassemblingRecord { contlen, .. } => { + if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD == 0 { + return Err( + "invalid xlog page header: XLP_FIRST_IS_CONTRECORD expected, not found" + .into(), + ); + } + if hdr.xlp_rem_len != contlen.get() { + return Err(format!( + "invalid xlog page header: xlp_rem_len={}, expected {}", + hdr.xlp_rem_len, + contlen.get() + )); + } } - } - if hdr.xlp_rem_len != self.contlen { - return Err(format!( - "invalid xlog page header: xlp_rem_len={}, expected {}", - hdr.xlp_rem_len, self.contlen - )); - } + State::SkippingEverything { .. } => { + panic!("Should not be validating page header in the SkippingEverything state"); + } + }; Ok(()) }; validate_impl().map_err(|msg| WalDecodeError { msg, lsn: self.lsn }) @@ -114,115 +129,121 @@ impl WalStreamDecoder { /// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid. /// pub fn poll_decode(&mut self) -> Result, WalDecodeError> { - let recordbuf; - // Run state machine that validates page headers, and reassembles records // that cross page boundaries. loop { // parse and verify page boundaries as we go - if self.padlen > 0 { - // We should first skip padding, as we may have to skip some page headers if we're processing the XLOG_SWITCH record. - if self.inputbuf.remaining() < self.padlen as usize { - return Ok(None); - } + // However, we may have to skip some page headers if we're processing the XLOG_SWITCH record or skipping padding for whatever reason. + match self.state { + State::WaitingForRecord | State::ReassemblingRecord { .. 
} => { + if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 { + // parse long header - // skip padding - self.inputbuf.advance(self.padlen as usize); - self.lsn += self.padlen as u64; - self.padlen = 0; - } else if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 { - // parse long header + if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD { + return Ok(None); + } - if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD { - return Ok(None); - } + let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err( + |e| WalDecodeError { + msg: format!("long header deserialization failed {}", e), + lsn: self.lsn, + }, + )?; - let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| { - WalDecodeError { - msg: format!("long header deserialization failed {}", e), - lsn: self.lsn, + self.validate_page_header(&hdr.std)?; + + self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64; + } else if self.lsn.block_offset() == 0 { + if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD { + return Ok(None); + } + + let hdr = + XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| { + WalDecodeError { + msg: format!("header deserialization failed {}", e), + lsn: self.lsn, + } + })?; + + self.validate_page_header(&hdr)?; + + self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; } - })?; - - self.validate_page_header(&hdr.std)?; - - self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64; - continue; - } else if self.lsn.block_offset() == 0 { - if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD { - return Ok(None); } - - let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| { - WalDecodeError { - msg: format!("header deserialization failed {}", e), - lsn: self.lsn, + State::SkippingEverything { .. } => {} + } + match &mut self.state { + State::WaitingForRecord => { + // need to have at least the xl_tot_len field + if self.inputbuf.remaining() < 4 { + return Ok(None); } - })?; - self.validate_page_header(&hdr)?; - - self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; - continue; - } else if self.contlen == 0 { - assert!(self.recordbuf.is_empty()); - - // need to have at least the xl_tot_len field - if self.inputbuf.remaining() < 4 { - return Ok(None); + // peek xl_tot_len at the beginning of the record. + // FIXME: assumes little-endian + let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le(); + if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD { + return Err(WalDecodeError { + msg: format!("invalid xl_tot_len {}", xl_tot_len), + lsn: self.lsn, + }); + } + // Fast path for the common case that the whole record fits on the page. + let pageleft = self.lsn.remaining_in_block() as u32; + if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft { + self.lsn += xl_tot_len as u64; + let recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize); + return Ok(Some(self.complete_record(recordbuf)?)); + } else { + // Need to assemble the record from pieces. Remember the size of the + // record, and loop back. On next iteration, we will reach the 'else' + // branch below, and copy the part of the record that was on this page + // to 'recordbuf'. Subsequent iterations will skip page headers, and + // append the continuations from the next pages to 'recordbuf'. 
+ self.state = State::ReassemblingRecord { + recordbuf: BytesMut::with_capacity(xl_tot_len as usize), + contlen: NonZeroU32::new(xl_tot_len).unwrap(), + } + } } + State::ReassemblingRecord { recordbuf, contlen } => { + // we're continuing a record, possibly from previous page. + let pageleft = self.lsn.remaining_in_block() as u32; - // peek xl_tot_len at the beginning of the record. - // FIXME: assumes little-endian - let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le(); - if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD { - return Err(WalDecodeError { - msg: format!("invalid xl_tot_len {}", xl_tot_len), - lsn: self.lsn, - }); + // read the rest of the record, or as much as fits on this page. + let n = min(contlen.get(), pageleft) as usize; + + if self.inputbuf.remaining() < n { + return Ok(None); + } + + recordbuf.put(self.inputbuf.split_to(n)); + self.lsn += n as u64; + *contlen = match NonZeroU32::new(contlen.get() - n as u32) { + Some(x) => x, + None => { + // The record is now complete. + let recordbuf = std::mem::replace(recordbuf, BytesMut::new()).freeze(); + return Ok(Some(self.complete_record(recordbuf)?)); + } + } } - - // Fast path for the common case that the whole record fits on the page. - let pageleft = self.lsn.remaining_in_block() as u32; - if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft { - // Take the record from the 'inputbuf', and validate it. - recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize); - self.lsn += xl_tot_len as u64; - break; - } else { - // Need to assemble the record from pieces. Remember the size of the - // record, and loop back. On next iteration, we will reach the 'else' - // branch below, and copy the part of the record that was on this page - // to 'recordbuf'. Subsequent iterations will skip page headers, and - // append the continuations from the next pages to 'recordbuf'. - self.recordbuf.reserve(xl_tot_len as usize); - self.contlen = xl_tot_len; - continue; + State::SkippingEverything { skip_until_lsn } => { + assert!(*skip_until_lsn >= self.lsn); + let n = skip_until_lsn.0 - self.lsn.0; + if self.inputbuf.remaining() < n as usize { + return Ok(None); + } + self.inputbuf.advance(n as usize); + self.lsn += n; + self.state = State::WaitingForRecord; } - } else { - // we're continuing a record, possibly from previous page. - let pageleft = self.lsn.remaining_in_block() as u32; - - // read the rest of the record, or as much as fits on this page. - let n = min(self.contlen, pageleft) as usize; - - if self.inputbuf.remaining() < n { - return Ok(None); - } - - self.recordbuf.put(self.inputbuf.split_to(n)); - self.lsn += n as u64; - self.contlen -= n as u32; - - if self.contlen == 0 { - // The record is now complete. - recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze(); - break; - } - continue; } } + } + fn complete_record(&mut self, recordbuf: Bytes) -> Result<(Lsn, Bytes), WalDecodeError> { // We now have a record in the 'recordbuf' local variable. let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| { @@ -244,18 +265,20 @@ impl WalStreamDecoder { // XLOG_SWITCH records are special. If we see one, we need to skip // to the next WAL segment. 
- if xlogrec.is_xlog_switch_record() { + let next_lsn = if xlogrec.is_xlog_switch_record() { trace!("saw xlog switch record at {}", self.lsn); - self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32; + self.lsn + self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) } else { // Pad to an 8-byte boundary - self.padlen = self.lsn.calc_padding(8u32) as u32; - } + self.lsn.align() + }; + self.state = State::SkippingEverything { + skip_until_lsn: next_lsn, + }; // We should return LSN of the next record, not the last byte of this record or // the byte immediately after. Note that this handles both XLOG_SWITCH and usual // records, the former "spans" until the next WAL segment (see test_xlog_switch). - let result = (self.lsn + self.padlen as u64, recordbuf); - Ok(Some(result)) + Ok((next_lsn, recordbuf)) } } From 6d99b4f1d8b3a78163f0171fee1eb26d37dba260 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Fri, 12 Aug 2022 19:13:42 +0700 Subject: [PATCH 0617/1022] disable `test_import_from_pageserver_multisegment` (#2258) This test failed consistently on `main` now. It's better to temporarily disable it to avoid blocking others' PRs while investigating the root cause for the test failure. See: #2255, #2256 --- test_runner/batch_others/test_import.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test_runner/batch_others/test_import.py b/test_runner/batch_others/test_import.py index d4b8b7a153..039945e5e4 100644 --- a/test_runner/batch_others/test_import.py +++ b/test_runner/batch_others/test_import.py @@ -120,7 +120,10 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu @pytest.mark.timeout(1800) -@pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build") +# TODO: temporarily disable `test_import_from_pageserver_multisegment` test, enable +# the test back after finding the failure cause. +# @pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build") +@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2255") def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 neon_env_builder.enable_local_fs_remote_storage() From 7f97269277b4cc02e424fb8ce024731d1ff0bd2d Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Fri, 12 Aug 2022 16:01:22 +0300 Subject: [PATCH 0618/1022] get_binaries uses DOCKER_TAG taken from docker image build step (#2260) --- .github/ansible/get_binaries.sh | 47 ++++++++++++++++------------ .github/workflows/build_and_test.yml | 1 + 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index c9cbe91f34..777262e26f 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -2,30 +2,37 @@ set -e -RELEASE=${RELEASE:-false} +if [ -z "${DOCKER_TAG}" ]; then + # DOCKER_TAG absent, trying to find latest one in docker hub + + RELEASE=${RELEASE:-false} + # look at docker hub for latest tag for neon docker image + if [ "${RELEASE}" = "true" ]; then + echo "search latest release tag" + VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1) + if [ -z "${VERSION}" ]; then + echo "no any docker tags found, exiting..." 
+ exit 1 + else + TAG="release-${VERSION}" + fi + else + echo "search latest dev tag" + VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1) + if [ -z "${VERSION}" ]; then + echo "no any docker tags found, exiting..." + exit 1 + else + TAG="${VERSION}" + fi + fi + echo "found ${VERSION}" -# look at docker hub for latest tag for neon docker image -if [ "${RELEASE}" = "true" ]; then - echo "search latest release tag" - VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1) - if [ -z "${VERSION}" ]; then - echo "no any docker tags found, exiting..." - exit 1 - else - TAG="release-${VERSION}" - fi else - echo "search latest dev tag" - VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1) - if [ -z "${VERSION}" ]; then - echo "no any docker tags found, exiting..." - exit 1 - else - TAG="${VERSION}" - fi + # DOCKER_TAG present, using it + TAG=${DOCKER_TAG} fi -echo "found ${VERSION}" # do initial cleanup rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d28da92d11..635c6126cc 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -562,6 +562,7 @@ jobs: - name: Redeploy run: | + export DOCKER_TAG=${{needs.docker-image.outputs.build-tag}} cd "$(pwd)/.github/ansible" if [[ "$GITHUB_REF_NAME" == "main" ]]; then From ad08c273d36343c157e015cc058fea7ac05a57d6 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 12 Aug 2022 17:38:43 +0300 Subject: [PATCH 0619/1022] [proxy] Rework wire format of the password hack and some errors (#2236) The new format has a few benefits: it's shorter, simpler and human-readable as well. We don't use base64 anymore, since url encoding got us covered. We also show a better error in case we couldn't parse the payload; the users should know it's all about passing the correct project name. 
--- Cargo.lock | 1 + proxy/Cargo.toml | 1 + proxy/src/auth.rs | 65 +++++++------ proxy/src/auth/backend.rs | 9 +- proxy/src/auth/backend/console.rs | 113 +++++++++++++++-------- proxy/src/auth/backend/legacy_console.rs | 42 +++------ proxy/src/auth/backend/link.rs | 31 ++++++- proxy/src/auth/backend/postgres.rs | 24 ++--- proxy/src/auth/flow.rs | 15 ++- proxy/src/auth/password_hack.rs | 102 +++++--------------- test_runner/batch_others/test_proxy.py | 23 ++--- 11 files changed, 211 insertions(+), 215 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d850d3bd89..a70b2b7dc9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2269,6 +2269,7 @@ dependencies = [ "anyhow", "async-trait", "base64", + "bstr", "bytes", "clap 3.2.12", "futures", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index d9d43c3325..230fc8a253 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" anyhow = "1.0" async-trait = "0.1" base64 = "0.13.0" +bstr = "0.2.17" bytes = { version = "1.0.1", features = ['serde'] } clap = "3.0" futures = "0.3.13" diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 61c7458e16..4e78c576e2 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -12,7 +12,7 @@ use password_hack::PasswordHackPayload; mod flow; pub use flow::*; -use crate::{error::UserFacingError, waiters}; +use crate::error::UserFacingError; use std::io; use thiserror::Error; @@ -22,51 +22,54 @@ pub type Result = std::result::Result; /// Common authentication error. #[derive(Debug, Error)] pub enum AuthErrorImpl { - /// Authentication error reported by the console. + // This will be dropped in the future. #[error(transparent)] - Console(#[from] backend::AuthError), + Legacy(#[from] backend::LegacyAuthError), #[error(transparent)] - GetAuthInfo(#[from] backend::console::ConsoleAuthError), + Link(#[from] backend::LinkAuthError), + #[error(transparent)] + GetAuthInfo(#[from] backend::GetAuthInfoError), + + #[error(transparent)] + WakeCompute(#[from] backend::WakeComputeError), + + /// SASL protocol errors (includes [SCRAM](crate::scram)). #[error(transparent)] Sasl(#[from] crate::sasl::Error), + #[error("Unsupported authentication method: {0}")] + BadAuthMethod(Box), + #[error("Malformed password message: {0}")] MalformedPassword(&'static str), - /// Errors produced by [`crate::stream::PqStream`]. + #[error( + "Project name is not specified. \ + Either please upgrade the postgres client library (libpq) for SNI support \ + or pass the project name as a parameter: '&options=project%3D'. \ + See more at https://neon.tech/sni" + )] + MissingProjectName, + + /// Errors produced by e.g. [`crate::stream::PqStream`]. 
#[error(transparent)] Io(#[from] io::Error), } -impl AuthErrorImpl { - pub fn auth_failed(msg: impl Into) -> Self { - Self::Console(backend::AuthError::auth_failed(msg)) - } -} - -impl From for AuthErrorImpl { - fn from(e: waiters::RegisterError) -> Self { - Self::Console(backend::AuthError::from(e)) - } -} - -impl From for AuthErrorImpl { - fn from(e: waiters::WaitError) -> Self { - Self::Console(backend::AuthError::from(e)) - } -} - #[derive(Debug, Error)] #[error(transparent)] pub struct AuthError(Box); -impl From for AuthError -where - AuthErrorImpl: From, -{ - fn from(e: T) -> Self { +impl AuthError { + pub fn bad_auth_method(name: impl Into>) -> Self { + AuthErrorImpl::BadAuthMethod(name.into()).into() + } +} + +impl> From for AuthError { + fn from(e: E) -> Self { Self(Box::new(e.into())) } } @@ -75,10 +78,14 @@ impl UserFacingError for AuthError { fn to_string_client(&self) -> String { use AuthErrorImpl::*; match self.0.as_ref() { - Console(e) => e.to_string_client(), + Legacy(e) => e.to_string_client(), + Link(e) => e.to_string_client(), GetAuthInfo(e) => e.to_string_client(), + WakeCompute(e) => e.to_string_client(), Sasl(e) => e.to_string_client(), + BadAuthMethod(_) => self.to_string(), MalformedPassword(_) => self.to_string(), + MissingProjectName => self.to_string(), _ => "Internal error".to_string(), } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index a67865e08c..b10ede8d5e 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,10 +1,13 @@ -mod link; mod postgres; -pub mod console; +mod link; +pub use link::LinkAuthError; + +mod console; +pub use console::{GetAuthInfoError, WakeComputeError}; mod legacy_console; -pub use legacy_console::{AuthError, AuthErrorImpl}; +pub use legacy_console::LegacyAuthError; use crate::{ auth::{self, AuthFlow, ClientCredentials}, diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index a8ff1a3522..87906679ea 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -13,21 +13,11 @@ use std::future::Future; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -pub type Result = std::result::Result; +const REQUEST_FAILED: &str = "Console request failed"; #[derive(Debug, Error)] -pub enum ConsoleAuthError { - #[error(transparent)] - BadProjectName(#[from] auth::credentials::ClientCredsParseError), - - // We shouldn't include the actual secret here. - #[error("Bad authentication secret")] - BadSecret, - - #[error("Console responded with a malformed compute address: '{0}'")] - BadComputeAddress(String), - - #[error("Console responded with a malformed JSON: '{0}'")] +pub enum TransportError { + #[error("Console responded with a malformed JSON: {0}")] BadResponse(#[from] serde_json::Error), /// HTTP status (other than 200) returned by the console. @@ -38,19 +28,72 @@ pub enum ConsoleAuthError { Io(#[from] std::io::Error), } -impl UserFacingError for ConsoleAuthError { +impl UserFacingError for TransportError { fn to_string_client(&self) -> String { - use ConsoleAuthError::*; + use TransportError::*; match self { - BadProjectName(e) => e.to_string_client(), - _ => "Internal error".to_string(), + HttpStatus(_) => self.to_string(), + _ => REQUEST_FAILED.to_owned(), } } } -impl From<&auth::credentials::ClientCredsParseError> for ConsoleAuthError { - fn from(e: &auth::credentials::ClientCredsParseError) -> Self { - ConsoleAuthError::BadProjectName(e.clone()) +// Helps eliminate graceless `.map_err` calls without introducing another ctor. 
+impl From for TransportError { + fn from(e: reqwest::Error) -> Self { + io_error(e).into() + } +} + +#[derive(Debug, Error)] +pub enum GetAuthInfoError { + // We shouldn't include the actual secret here. + #[error("Console responded with a malformed auth secret")] + BadSecret, + + #[error(transparent)] + Transport(TransportError), +} + +impl UserFacingError for GetAuthInfoError { + fn to_string_client(&self) -> String { + use GetAuthInfoError::*; + match self { + BadSecret => REQUEST_FAILED.to_owned(), + Transport(e) => e.to_string_client(), + } + } +} + +impl> From for GetAuthInfoError { + fn from(e: E) -> Self { + Self::Transport(e.into()) + } +} + +#[derive(Debug, Error)] +pub enum WakeComputeError { + // We shouldn't show users the address even if it's broken. + #[error("Console responded with a malformed compute address: {0}")] + BadComputeAddress(String), + + #[error(transparent)] + Transport(TransportError), +} + +impl UserFacingError for WakeComputeError { + fn to_string_client(&self) -> String { + use WakeComputeError::*; + match self { + BadComputeAddress(_) => REQUEST_FAILED.to_owned(), + Transport(e) => e.to_string_client(), + } + } +} + +impl> From for WakeComputeError { + fn from(e: E) -> Self { + Self::Transport(e.into()) } } @@ -95,7 +138,7 @@ impl<'a> Api<'a> { handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await } - async fn get_auth_info(&self) -> Result { + async fn get_auth_info(&self) -> Result { let mut url = self.endpoint.clone(); url.path_segments_mut().push("proxy_get_role_secret"); url.query_pairs_mut() @@ -105,21 +148,20 @@ impl<'a> Api<'a> { // TODO: use a proper logger println!("cplane request: {url}"); - let resp = reqwest::get(url.into_inner()).await.map_err(io_error)?; + let resp = reqwest::get(url.into_inner()).await?; if !resp.status().is_success() { - return Err(ConsoleAuthError::HttpStatus(resp.status())); + return Err(TransportError::HttpStatus(resp.status()).into()); } - let response: GetRoleSecretResponse = - serde_json::from_str(&resp.text().await.map_err(io_error)?)?; + let response: GetRoleSecretResponse = serde_json::from_str(&resp.text().await?)?; - scram::ServerSecret::parse(response.role_secret.as_str()) + scram::ServerSecret::parse(&response.role_secret) .map(AuthInfo::Scram) - .ok_or(ConsoleAuthError::BadSecret) + .ok_or(GetAuthInfoError::BadSecret) } /// Wake up the compute node and return the corresponding connection info. - pub(super) async fn wake_compute(&self) -> Result { + pub(super) async fn wake_compute(&self) -> Result { let mut url = self.endpoint.clone(); url.path_segments_mut().push("proxy_wake_compute"); url.query_pairs_mut() @@ -128,17 +170,16 @@ impl<'a> Api<'a> { // TODO: use a proper logger println!("cplane request: {url}"); - let resp = reqwest::get(url.into_inner()).await.map_err(io_error)?; + let resp = reqwest::get(url.into_inner()).await?; if !resp.status().is_success() { - return Err(ConsoleAuthError::HttpStatus(resp.status())); + return Err(TransportError::HttpStatus(resp.status()).into()); } - let response: GetWakeComputeResponse = - serde_json::from_str(&resp.text().await.map_err(io_error)?)?; + let response: GetWakeComputeResponse = serde_json::from_str(&resp.text().await?)?; // Unfortunately, ownership won't let us use `Option::ok_or` here. 
let (host, port) = match parse_host_port(&response.address) { - None => return Err(ConsoleAuthError::BadComputeAddress(response.address)), + None => return Err(WakeComputeError::BadComputeAddress(response.address)), Some(x) => x, }; @@ -162,8 +203,8 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>( wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute, ) -> auth::Result where - GetAuthInfo: Future>, - WakeCompute: Future>, + GetAuthInfo: Future>, + WakeCompute: Future>, { let auth_info = get_auth_info(endpoint).await?; @@ -171,7 +212,7 @@ where let scram_keys = match auth_info { AuthInfo::Md5(_) => { // TODO: decide if we should support MD5 in api v2 - return Err(auth::AuthErrorImpl::auth_failed("MD5 is not supported").into()); + return Err(auth::AuthError::bad_auth_method("MD5")); } AuthInfo::Scram(secret) => { let scram = auth::Scram(&secret); diff --git a/proxy/src/auth/backend/legacy_console.rs b/proxy/src/auth/backend/legacy_console.rs index 7a5e9b6f62..17ba44e833 100644 --- a/proxy/src/auth/backend/legacy_console.rs +++ b/proxy/src/auth/backend/legacy_console.rs @@ -14,7 +14,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::BeMessage as Be; #[derive(Debug, Error)] -pub enum AuthErrorImpl { +pub enum LegacyAuthError { /// Authentication error reported by the console. #[error("Authentication failed: {0}")] AuthFailed(String), @@ -24,7 +24,7 @@ pub enum AuthErrorImpl { HttpStatus(reqwest::StatusCode), #[error("Console responded with a malformed JSON: {0}")] - MalformedResponse(#[from] serde_json::Error), + BadResponse(#[from] serde_json::Error), #[error(transparent)] Transport(#[from] reqwest::Error), @@ -36,30 +36,10 @@ pub enum AuthErrorImpl { WaiterWait(#[from] waiters::WaitError), } -#[derive(Debug, Error)] -#[error(transparent)] -pub struct AuthError(Box); - -impl AuthError { - /// Smart constructor for authentication error reported by `mgmt`. - pub fn auth_failed(msg: impl Into) -> Self { - Self(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) - } -} - -impl From for AuthError -where - AuthErrorImpl: From, -{ - fn from(e: T) -> Self { - Self(Box::new(e.into())) - } -} - -impl UserFacingError for AuthError { +impl UserFacingError for LegacyAuthError { fn to_string_client(&self) -> String { - use AuthErrorImpl::*; - match self.0.as_ref() { + use LegacyAuthError::*; + match self { AuthFailed(_) | HttpStatus(_) => self.to_string(), _ => "Internal error".to_string(), } @@ -88,7 +68,7 @@ async fn authenticate_proxy_client( md5_response: &str, salt: &[u8; 4], psql_session_id: &str, -) -> Result { +) -> Result { let mut url = auth_endpoint.clone(); url.query_pairs_mut() .append_pair("login", &creds.user) @@ -102,17 +82,17 @@ async fn authenticate_proxy_client( // TODO: leverage `reqwest::Client` to reuse connections let resp = reqwest::get(url).await?; if !resp.status().is_success() { - return Err(AuthErrorImpl::HttpStatus(resp.status()).into()); + return Err(LegacyAuthError::HttpStatus(resp.status())); } - let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?; + let auth_info = serde_json::from_str(resp.text().await?.as_str())?; println!("got auth info: {:?}", auth_info); use ProxyAuthResponse::*; let db_info = match auth_info { Ready { conn_info } => conn_info, - Error { error } => return Err(AuthErrorImpl::AuthFailed(error).into()), - NotReady { .. } => waiter.await?.map_err(AuthErrorImpl::AuthFailed)?, + Error { error } => return Err(LegacyAuthError::AuthFailed(error)), + NotReady { .. 
} => waiter.await?.map_err(LegacyAuthError::AuthFailed)?, }; Ok(db_info) @@ -124,7 +104,7 @@ async fn handle_existing_user( auth_endpoint: &reqwest::Url, client: &mut PqStream, creds: &ClientCredentials, -) -> Result { +) -> auth::Result { let psql_session_id = super::link::new_psql_session_id(); let md5_salt = rand::random(); diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index d658a34825..d740a4c5c4 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -1,7 +1,34 @@ -use crate::{auth, compute, stream::PqStream}; +use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters}; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; +#[derive(Debug, Error)] +pub enum LinkAuthError { + /// Authentication error reported by the console. + #[error("Authentication failed: {0}")] + AuthFailed(String), + + #[error(transparent)] + WaiterRegister(#[from] waiters::RegisterError), + + #[error(transparent)] + WaiterWait(#[from] waiters::WaitError), + + #[error(transparent)] + Io(#[from] std::io::Error), +} + +impl UserFacingError for LinkAuthError { + fn to_string_client(&self) -> String { + use LinkAuthError::*; + match self { + AuthFailed(_) => self.to_string(), + _ => "Internal error".to_string(), + } + } +} + fn hello_message(redirect_uri: &str, session_id: &str) -> String { format!( concat![ @@ -34,7 +61,7 @@ pub async fn handle_user( .await?; // Wait for web console response (see `mgmt`) - waiter.await?.map_err(auth::AuthErrorImpl::auth_failed) + waiter.await?.map_err(LinkAuthError::AuthFailed) }) .await?; diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs index 1d7ab8f249..183fa52ec1 100644 --- a/proxy/src/auth/backend/postgres.rs +++ b/proxy/src/auth/backend/postgres.rs @@ -3,7 +3,7 @@ use crate::{ auth::{ self, - backend::console::{self, AuthInfo, Result}, + backend::console::{self, AuthInfo, GetAuthInfoError, TransportError, WakeComputeError}, ClientCredentials, }, compute::{self, ComputeConnCfg}, @@ -20,6 +20,13 @@ pub(super) struct Api<'a> { creds: &'a ClientCredentials, } +// Helps eliminate graceless `.map_err` calls without introducing another ctor. +impl From for TransportError { + fn from(e: tokio_postgres::Error) -> Self { + io_error(e).into() + } +} + impl<'a> Api<'a> { /// Construct an API object containing the auth parameters. pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self { @@ -36,21 +43,16 @@ impl<'a> Api<'a> { } /// This implementation fetches the auth info from a local postgres instance. - async fn get_auth_info(&self) -> Result { + async fn get_auth_info(&self) -> Result { // Perhaps we could persist this connection, but then we'd have to // write more code for reopening it if it got closed, which doesn't // seem worth it. let (client, connection) = - tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls) - .await - .map_err(io_error)?; + tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; tokio::spawn(connection); let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; - let rows = client - .query(query, &[&self.creds.user]) - .await - .map_err(io_error)?; + let rows = client.query(query, &[&self.creds.user]).await?; match &rows[..] { // We can't get a secret if there's no such user. @@ -74,13 +76,13 @@ impl<'a> Api<'a> { })) }) // Putting the secret into this message is a security hazard! 
- .ok_or(console::ConsoleAuthError::BadSecret) + .ok_or(GetAuthInfoError::BadSecret) } } } /// We don't need to wake anything locally, so we just return the connection info. - pub(super) async fn wake_compute(&self) -> Result { + pub(super) async fn wake_compute(&self) -> Result { let mut config = ComputeConnCfg::new(); config .host(self.endpoint.host_str().unwrap_or("localhost")) diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 705f1e3807..5a516fdc30 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -75,13 +75,12 @@ impl AuthFlow<'_, S, PasswordHack> { .strip_suffix(&[0]) .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; - // The so-called "password" should contain a base64-encoded json. - // We will use it later to route the client to their project. - let bytes = base64::decode(password) - .map_err(|_| AuthErrorImpl::MalformedPassword("bad encoding"))?; - - let payload = serde_json::from_slice(&bytes) - .map_err(|_| AuthErrorImpl::MalformedPassword("invalid payload"))?; + let payload = PasswordHackPayload::parse(password) + // If we ended up here and the payload is malformed, it means that + // the user neither enabled SNI nor resorted to any other method + // for passing the project name we rely on. We should show them + // the most helpful error message and point to the documentation. + .ok_or(AuthErrorImpl::MissingProjectName)?; Ok(payload) } @@ -98,7 +97,7 @@ impl AuthFlow<'_, S, Scram<'_>> { // Currently, the only supported SASL method is SCRAM. if !scram::METHODS.contains(&sasl.method) { - return Err(AuthErrorImpl::auth_failed("method not supported").into()); + return Err(super::AuthError::bad_auth_method(sasl.method)); } let secret = self.state.0; diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs index 6a1258ab31..639809e18a 100644 --- a/proxy/src/auth/password_hack.rs +++ b/proxy/src/auth/password_hack.rs @@ -1,102 +1,46 @@ //! Payload for ad hoc authentication method for clients that don't support SNI. //! See the `impl` for [`super::backend::BackendType`]. //! Read more: . +//! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified. -use serde::{de, Deserialize, Deserializer}; -use std::fmt; +use bstr::ByteSlice; -#[derive(Deserialize)] -#[serde(untagged)] -pub enum Password { - /// A regular string for utf-8 encoded passwords. - Simple { password: String }, - - /// Password is base64-encoded because it may contain arbitrary byte sequences. - Encoded { - #[serde(rename = "password_", deserialize_with = "deserialize_base64")] - password: Vec, - }, -} - -impl AsRef<[u8]> for Password { - fn as_ref(&self) -> &[u8] { - match self { - Password::Simple { password } => password.as_ref(), - Password::Encoded { password } => password.as_ref(), - } - } -} - -#[derive(Deserialize)] pub struct PasswordHackPayload { pub project: String, - - #[serde(flatten)] - pub password: Password, + pub password: Vec, } -fn deserialize_base64<'a, D: Deserializer<'a>>(des: D) -> Result, D::Error> { - // It's very tempting to replace this with - // - // ``` - // let base64: &str = Deserialize::deserialize(des)?; - // base64::decode(base64).map_err(serde::de::Error::custom) - // ``` - // - // Unfortunately, we can't always deserialize into `&str`, so we'd - // have to use an allocating `String` instead. Thus, visitor is better. - struct Visitor; +impl PasswordHackPayload { + pub fn parse(bytes: &[u8]) -> Option { + // The format is `project=;`. 
+ let mut iter = bytes.strip_prefix(b"project=")?.splitn_str(2, ";"); + let project = iter.next()?.to_str().ok()?.to_owned(); + let password = iter.next()?.to_owned(); - impl<'de> de::Visitor<'de> for Visitor { - type Value = Vec; - - fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { - formatter.write_str("a string") - } - - fn visit_str(self, v: &str) -> Result { - base64::decode(v).map_err(de::Error::custom) - } + Some(Self { project, password }) } - - des.deserialize_str(Visitor) } #[cfg(test)] mod tests { use super::*; - use rstest::rstest; - use serde_json::json; #[test] - fn parse_password() -> anyhow::Result<()> { - let password: Password = serde_json::from_value(json!({ - "password": "foo", - }))?; - assert_eq!(password.as_ref(), "foo".as_bytes()); + fn parse_password_hack_payload() { + let bytes = b""; + assert!(PasswordHackPayload::parse(bytes).is_none()); - let password: Password = serde_json::from_value(json!({ - "password_": base64::encode("foo"), - }))?; - assert_eq!(password.as_ref(), "foo".as_bytes()); + let bytes = b"project="; + assert!(PasswordHackPayload::parse(bytes).is_none()); - Ok(()) - } + let bytes = b"project=;"; + let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); + assert_eq!(payload.project, ""); + assert_eq!(payload.password, b""); - #[rstest] - #[case("password", str::to_owned)] - #[case("password_", base64::encode)] - fn parse(#[case] key: &str, #[case] encode: fn(&'static str) -> String) -> anyhow::Result<()> { - let (password, project) = ("password", "pie-in-the-sky"); - let payload = json!({ - "project": project, - key: encode(password), - }); - - let payload: PasswordHackPayload = serde_json::from_value(payload)?; - assert_eq!(payload.password.as_ref(), password.as_bytes()); - assert_eq!(payload.project, project); - - Ok(()) + let bytes = b"project=foobar;pass;word"; + let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); + assert_eq!(payload.project, "foobar"); + assert_eq!(payload.password, b"pass;word"); } } diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py index 92c8475e69..2d9957fc38 100644 --- a/test_runner/batch_others/test_proxy.py +++ b/test_runner/batch_others/test_proxy.py @@ -1,6 +1,5 @@ import pytest -import json -import base64 +import psycopg2 def test_proxy_select_1(static_proxy): @@ -13,22 +12,14 @@ def test_password_hack(static_proxy): static_proxy.safe_psql(f"create role {user} with login password '{password}'", options='project=irrelevant') - def encode(s: str) -> str: - return base64.b64encode(s.encode('utf-8')).decode('utf-8') - - magic = encode(json.dumps({ - 'project': 'irrelevant', - 'password': password, - })) - + # Note the format of `magic`! + magic = f"project=irrelevant;{password}" static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic) - magic = encode(json.dumps({ - 'project': 'irrelevant', - 'password_': encode(password), - })) - - static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic) + # Must also check that invalid magic won't be accepted. + with pytest.raises(psycopg2.errors.OperationalError): + magic = "broken" + static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic) # Pass extra options to the server. 
From da5f8486cec6efcc17e93de6244ed23024c8b44e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 12 Aug 2022 17:03:09 +0100 Subject: [PATCH 0620/1022] test_runner/pg_clients: collect docker logs (#2259) --- .github/workflows/pg_clients.yml | 19 ++++++++++++++++--- test_runner/fixtures/utils.py | 14 ++++++++++---- test_runner/pg_clients/test_pg_clients.py | 16 +++++++--------- 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index ba2d5cf666..95052619cd 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -19,8 +19,12 @@ concurrency: jobs: test-postgres-client-libs: + # TODO: switch to gen2 runner, requires docker runs-on: [ ubuntu-latest ] + env: + TEST_OUTPUT: /tmp/test_output + steps: - name: Checkout uses: actions/checkout@v3 @@ -47,7 +51,7 @@ jobs: env: REMOTE_ENV: 1 BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" - TEST_OUTPUT: /tmp/test_output + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install shell: bash -euxo pipefail {0} run: | @@ -61,9 +65,18 @@ jobs: -m "remote_cluster" \ -rA "test_runner/pg_clients" + # We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI. + # It will be fixed after switching to gen2 runner + - name: Upload python test logs + if: always() + uses: actions/upload-artifact@v3 + with: + retention-days: 7 + name: python-test-pg_clients-${{ runner.os }}-stage-logs + path: ${{ env.TEST_OUTPUT }} + - name: Post to a Slack channel - if: failure() - id: slack + if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index a86c5ad923..36b88e9485 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -32,10 +32,16 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: stdout_filename = basepath + '.stdout' stderr_filename = basepath + '.stderr' - with open(stdout_filename, 'w') as stdout_f: - with open(stderr_filename, 'w') as stderr_f: - log.info('(capturing output to "{}.stdout")'.format(base)) - subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) + try: + with open(stdout_filename, 'w') as stdout_f: + with open(stderr_filename, 'w') as stderr_f: + log.info(f'Capturing stdout to "{base}.stdout" and stderr to "{base}.stderr"') + subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) + finally: + # Remove empty files if there is no output + for filename in (stdout_filename, stderr_filename): + if os.stat(filename).st_size == 0: + os.remove(filename) return basepath diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py index 7dc7299791..a117616358 100644 --- a/test_runner/pg_clients/test_pg_clients.py +++ b/test_runner/pg_clients/test_pg_clients.py @@ -3,10 +3,10 @@ import shutil import subprocess from pathlib import Path from tempfile import NamedTemporaryFile -from urllib.parse import urlparse import pytest from fixtures.neon_fixtures import RemotePostgres +from fixtures.utils import subprocess_capture @pytest.mark.remote_cluster @@ -25,7 +25,7 @@ from fixtures.neon_fixtures import RemotePostgres "typescript/postgresql-client", ], ) -def test_pg_clients(remote_pg: RemotePostgres, client: str): +def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: str): conn_options = remote_pg.conn_options() env_file 
= None @@ -43,12 +43,10 @@ def test_pg_clients(remote_pg: RemotePostgres, client: str): if docker_bin is None: raise RuntimeError("docker is required for running this test") - build_cmd = [ - docker_bin, "build", "--quiet", "--tag", image_tag, f"{Path(__file__).parent / client}" - ] + build_cmd = [docker_bin, "build", "--tag", image_tag, f"{Path(__file__).parent / client}"] + subprocess_capture(str(test_output_dir), build_cmd, check=True) + run_cmd = [docker_bin, "run", "--rm", "--env-file", env_file, image_tag] + basepath = subprocess_capture(str(test_output_dir), run_cmd, check=True) - subprocess.run(build_cmd, check=True) - result = subprocess.run(run_cmd, check=True, capture_output=True, text=True) - - assert result.stdout.strip() == "1" + assert Path(f"{basepath}.stdout").read_text().strip() == "1" From a5154dce3ea2b16b4d13f5e4475bbcdb92617c00 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Fri, 12 Aug 2022 20:35:26 +0300 Subject: [PATCH 0621/1022] get_binaries script fix (#2263) * get_binaries uses DOCKER_TAG taken from docker image build step * remove docker tag discovery at all and fix get_binaries for version variable --- .github/ansible/get_binaries.sh | 37 +++++++-------------------------- 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index 777262e26f..f44a1ca50a 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -2,35 +2,12 @@ set -e -if [ -z "${DOCKER_TAG}" ]; then - # DOCKER_TAG absent, trying to find latest one in docker hub - - RELEASE=${RELEASE:-false} - # look at docker hub for latest tag for neon docker image - if [ "${RELEASE}" = "true" ]; then - echo "search latest release tag" - VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1) - if [ -z "${VERSION}" ]; then - echo "no any docker tags found, exiting..." - exit 1 - else - TAG="release-${VERSION}" - fi - else - echo "search latest dev tag" - VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1) - if [ -z "${VERSION}" ]; then - echo "no any docker tags found, exiting..." - exit 1 - else - TAG="${VERSION}" - fi - fi - echo "found ${VERSION}" - +if [ -n "${DOCKER_TAG}" ]; then + # Verson is DOCKER_TAG but without prefix + VERSION=$(echo $DOCKER_TAG | sed 's/^.*-//g') else - # DOCKER_TAG present, using it - TAG=${DOCKER_TAG} + echo "Please set DOCKER_TAG environment variable" + exit 1 fi @@ -40,8 +17,8 @@ mkdir neon_install # retrieve binaries from docker image echo "getting binaries from docker image" -docker pull --quiet neondatabase/neon:${TAG} -ID=$(docker create neondatabase/neon:${TAG}) +docker pull --quiet neondatabase/neon:${DOCKER_TAG} +ID=$(docker create neondatabase/neon:${DOCKER_TAG}) docker cp ${ID}:/data/postgres_install.tar.gz . 
tar -xzf postgres_install.tar.gz -C neon_install docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ From f38f45b01d4fdc3aa6b682f98a2ccad571b79a2b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 13 Aug 2022 10:58:14 +0300 Subject: [PATCH 0622/1022] Better storage sync logs (#2268) --- pageserver/src/storage_sync.rs | 93 ++++++++++++++------------- pageserver/src/storage_sync/delete.rs | 2 + pageserver/src/storage_sync/upload.rs | 21 +++--- 3 files changed, 61 insertions(+), 55 deletions(-) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index d1c8922259..15f24d7e24 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -979,7 +979,7 @@ enum DownloadStatus { #[derive(Debug)] enum UploadStatus { Uploaded, - Failed, + Failed(anyhow::Error), Nothing, } @@ -1056,41 +1056,43 @@ where let (upload_status, download_status) = tokio::join!( async { if let Some(upload_data) = upload_data { - match validate_task_retries(upload_data, max_sync_errors) + let upload_retries = upload_data.retries; + match validate_task_retries(upload_retries, max_sync_errors) .instrument(info_span!("retries_validation")) .await { - ControlFlow::Continue(new_upload_data) => { + ControlFlow::Continue(()) => { upload_timeline_data( conf, (storage.as_ref(), &index, sync_queue), current_remote_timeline.as_ref(), sync_id, - new_upload_data, + upload_data, sync_start, "upload", ) - .await; - UploadStatus::Uploaded - } - ControlFlow::Break(failed_upload_data) => { - if let Err(e) = update_remote_data( - conf, - storage.as_ref(), - &index, - sync_id, - RemoteDataUpdate::Upload { - uploaded_data: failed_upload_data.data, - upload_failed: true, - }, - ) .await - { - error!("Failed to update remote timeline {sync_id}: {e:?}"); - } - - UploadStatus::Failed } + ControlFlow::Break(()) => match update_remote_data( + conf, + storage.as_ref(), + &index, + sync_id, + RemoteDataUpdate::Upload { + uploaded_data: upload_data.data, + upload_failed: true, + }, + ) + .await + { + Ok(()) => UploadStatus::Failed(anyhow::anyhow!( + "Aborted after retries validation, current retries: {upload_retries}, max retries allowed: {max_sync_errors}" + )), + Err(e) => { + error!("Failed to update remote timeline {sync_id}: {e:?}"); + UploadStatus::Failed(e) + } + }, } } else { UploadStatus::Nothing @@ -1099,23 +1101,23 @@ where .instrument(info_span!("upload_timeline_data")), async { if let Some(download_data) = download_data { - match validate_task_retries(download_data, max_sync_errors) + match validate_task_retries(download_data.retries, max_sync_errors) .instrument(info_span!("retries_validation")) .await { - ControlFlow::Continue(new_download_data) => { + ControlFlow::Continue(()) => { return download_timeline_data( conf, (storage.as_ref(), &index, sync_queue), current_remote_timeline.as_ref(), sync_id, - new_download_data, + download_data, sync_start, "download", ) .await; } - ControlFlow::Break(_) => { + ControlFlow::Break(()) => { index .write() .await @@ -1132,29 +1134,29 @@ where if let Some(delete_data) = batch.delete { match upload_status { UploadStatus::Uploaded | UploadStatus::Nothing => { - match validate_task_retries(delete_data, max_sync_errors) + match validate_task_retries(delete_data.retries, max_sync_errors) .instrument(info_span!("retries_validation")) .await { - ControlFlow::Continue(new_delete_data) => { + ControlFlow::Continue(()) => { delete_timeline_data( conf, (storage.as_ref(), &index, sync_queue), sync_id, - new_delete_data, + delete_data, sync_start, 
"delete", ) .instrument(info_span!("delete_timeline_data")) .await; } - ControlFlow::Break(failed_delete_data) => { + ControlFlow::Break(()) => { if let Err(e) = update_remote_data( conf, storage.as_ref(), &index, sync_id, - RemoteDataUpdate::Delete(&failed_delete_data.data.deleted_layers), + RemoteDataUpdate::Delete(&delete_data.data.deleted_layers), ) .await { @@ -1163,8 +1165,8 @@ where } } } - UploadStatus::Failed => { - warn!("Skipping delete task due to failed upload tasks, reenqueuing"); + UploadStatus::Failed(e) => { + warn!("Skipping delete task due to failed upload tasks, reenqueuing. Upload data: {:?}, delete data: {delete_data:?}. Upload failure: {e:#}", batch.upload); sync_queue.push(sync_id, SyncTask::Delete(delete_data)); } } @@ -1349,7 +1351,8 @@ async fn upload_timeline_data( new_upload_data: SyncData, sync_start: Instant, task_name: &str, -) where +) -> UploadStatus +where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, { @@ -1362,9 +1365,9 @@ async fn upload_timeline_data( ) .await { - UploadedTimeline::FailedAndRescheduled => { + UploadedTimeline::FailedAndRescheduled(e) => { register_sync_status(sync_id, sync_start, task_name, Some(false)); - return; + return UploadStatus::Failed(e); } UploadedTimeline::Successful(upload_data) => upload_data, }; @@ -1383,12 +1386,14 @@ async fn upload_timeline_data( { Ok(()) => { register_sync_status(sync_id, sync_start, task_name, Some(true)); + UploadStatus::Uploaded } Err(e) => { error!("Failed to update remote timeline {sync_id}: {e:?}"); uploaded_data.retries += 1; sync_queue.push(sync_id, SyncTask::Upload(uploaded_data)); register_sync_status(sync_id, sync_start, task_name, Some(false)); + UploadStatus::Failed(e) } } } @@ -1491,21 +1496,17 @@ where .context("Failed to upload new index part") } -async fn validate_task_retries( - sync_data: SyncData, +async fn validate_task_retries( + current_attempt: u32, max_sync_errors: NonZeroU32, -) -> ControlFlow, SyncData> { - let current_attempt = sync_data.retries; +) -> ControlFlow<(), ()> { let max_sync_errors = max_sync_errors.get(); if current_attempt >= max_sync_errors { - error!( - "Aborting task that failed {current_attempt} times, exceeding retries threshold of {max_sync_errors}", - ); - return ControlFlow::Break(sync_data); + return ControlFlow::Break(()); } exponential_backoff(current_attempt, 1.0, 30.0).await; - ControlFlow::Continue(sync_data) + ControlFlow::Continue(()) } fn schedule_first_sync_tasks( diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 0dcd9c97fc..a1b26ee9a2 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -95,6 +95,8 @@ where debug!("Reenqueuing failed delete task for timeline {sync_id}"); delete_data.retries += 1; sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + } else { + info!("Successfully deleted all layers"); } errored } diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 671ea45202..2c41f58721 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -75,7 +75,7 @@ where #[derive(Debug)] pub(super) enum UploadedTimeline { /// Upload failed due to some error, the upload task is rescheduled for another retry. - FailedAndRescheduled, + FailedAndRescheduled(anyhow::Error), /// No issues happened during the upload, all task files were put into the remote storage. 
Successful(SyncData), } @@ -179,7 +179,7 @@ where }) .collect::>(); - let mut errors_happened = false; + let mut errors = Vec::new(); while let Some(upload_result) = upload_tasks.next().await { match upload_result { Ok(uploaded_path) => { @@ -188,13 +188,13 @@ where } Err(e) => match e { UploadError::Other(e) => { - errors_happened = true; error!("Failed to upload a layer for timeline {sync_id}: {e:?}"); + errors.push(format!("{e:#}")); } UploadError::MissingLocalFile(source_path, e) => { if source_path.exists() { - errors_happened = true; error!("Failed to upload a layer for timeline {sync_id}: {e:?}"); + errors.push(format!("{e:#}")); } else { // We have run the upload sync task, but the file we wanted to upload is gone. // This is "fine" due the asynchronous nature of the sync loop: it only reacts to events and might need to @@ -217,14 +217,17 @@ where } } - if errors_happened { + if errors.is_empty() { + info!("Successfully uploaded all layers"); + UploadedTimeline::Successful(upload_data) + } else { debug!("Reenqueuing failed upload task for timeline {sync_id}"); upload_data.retries += 1; sync_queue.push(sync_id, SyncTask::Upload(upload_data)); - UploadedTimeline::FailedAndRescheduled - } else { - info!("Successfully uploaded all layers"); - UploadedTimeline::Successful(upload_data) + UploadedTimeline::FailedAndRescheduled(anyhow::anyhow!( + "Errors appeared during layer uploads: {:?}", + errors + )) } } From 431393e3610c1e058056c604980b23b2aeb3ceac Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 12 Aug 2022 18:23:12 +0300 Subject: [PATCH 0623/1022] Find end of WAL on safekeepers using WalStreamDecoder. We could make it inside wal_storage.rs, but taking into account that - wal_storage.rs reading is async - we don't need s3 here - error handling is different; error during decoding is normal I decided to put it separately. Test cargo test test_find_end_of_wal_last_crossing_segment prepared earlier by @yeputons passes now. Fixes https://github.com/neondatabase/neon/issues/544 https://github.com/neondatabase/cloud/issues/2004 Supersedes https://github.com/neondatabase/neon/pull/2066 --- libs/postgres_ffi/src/xlog_utils.rs | 472 +++++++--------------------- safekeeper/src/wal_storage.rs | 2 +- 2 files changed, 106 insertions(+), 368 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 8cdfd92fc1..29b00c8d36 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -16,22 +16,22 @@ use crate::XLogRecord; use crate::XLOG_PAGE_MAGIC; use crate::pg_constants::WAL_SEGMENT_SIZE; -use anyhow::{anyhow, bail, ensure}; -use byteorder::{ByteOrder, LittleEndian}; +use crate::waldecoder::WalStreamDecoder; + use bytes::BytesMut; use bytes::{Buf, Bytes}; -use crc32c::*; + use log::*; -use std::cmp::max; -use std::cmp::min; -use std::fs::{self, File}; + +use std::fs::File; use std::io::prelude::*; +use std::io::ErrorKind; use std::io::SeekFrom; use std::path::{Path, PathBuf}; use std::time::SystemTime; use utils::bin_ser::DeserializeError; use utils::bin_ser::SerializeError; -use utils::const_assert; + use utils::lsn::Lsn; pub const XLOG_FNAME_LEN: usize = 24; @@ -140,338 +140,93 @@ pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz { } } -/// Return offset of the last valid record in the segment segno, starting -/// looking at start_offset. Returns start_offset if no records found. 
-fn find_end_of_wal_segment( - data_dir: &Path, - segno: XLogSegNo, - tli: TimeLineID, - wal_seg_size: usize, - start_offset: usize, // start reading at this point -) -> anyhow::Result { - // step back to the beginning of the page to read it in... - let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ; - let mut skipping_first_contrecord: bool = false; - let mut contlen: usize = 0; - let mut xl_crc: u32 = 0; - let mut crc: u32 = 0; - let mut rec_offs: usize = 0; - let mut buf = [0u8; XLOG_BLCKSZ]; - let file_name = XLogFileName(tli, segno, wal_seg_size); - let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record - let mut file = File::open(data_dir.join(file_name.clone() + ".partial"))?; - file.seek(SeekFrom::Start(offs as u64))?; - // xl_crc is the last field in XLogRecord, will not be read into rec_hdr - const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD); - let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS]; - - trace!("find_end_of_wal_segment(data_dir={}, segno={}, tli={}, wal_seg_size={}, start_offset=0x{:x})", data_dir.display(), segno, tli, wal_seg_size, start_offset); - while offs < wal_seg_size { - // we are at the beginning of the page; read it in - if offs % XLOG_BLCKSZ == 0 { - trace!("offs=0x{:x}: new page", offs); - let bytes_read = file.read(&mut buf)?; - if bytes_read != buf.len() { - bail!( - "failed to read {} bytes from {} at {}", - XLOG_BLCKSZ, - file_name, - offs - ); - } - - let xlp_magic = LittleEndian::read_u16(&buf[0..2]); - let xlp_info = LittleEndian::read_u16(&buf[2..4]); - let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]); - trace!( - " xlp_magic=0x{:x}, xlp_info=0x{:x}, xlp_rem_len={}", - xlp_magic, - xlp_info, - xlp_rem_len - ); - // this is expected in current usage when valid WAL starts after page header - if xlp_magic != XLOG_PAGE_MAGIC as u16 { - trace!( - " invalid WAL file {}.partial magic {} at {:?}", - file_name, - xlp_magic, - Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)), - ); - } - if offs == 0 { - offs += XLOG_SIZE_OF_XLOG_LONG_PHD; - if (xlp_info & XLP_FIRST_IS_CONTRECORD) != 0 { - trace!(" first record is contrecord"); - skipping_first_contrecord = true; - contlen = xlp_rem_len as usize; - if offs < start_offset { - // Pre-condition failed: the beginning of the segment is unexpectedly corrupted. - ensure!(start_offset - offs >= contlen, - "start_offset is in the middle of the first record (which happens to be a contrecord), \ - expected to be on a record boundary. Is beginning of the segment corrupted?"); - contlen = 0; - // keep skipping_first_contrecord to avoid counting the contrecord as valid, we did not check it. - } - } else { - trace!(" first record is not contrecord"); - } - } else { - offs += XLOG_SIZE_OF_XLOG_SHORT_PHD; - } - // ... 
and step forward again if asked - trace!(" skipped header to 0x{:x}", offs); - offs = max(offs, start_offset); - // beginning of the next record - } else if contlen == 0 { - let page_offs = offs % XLOG_BLCKSZ; - let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize; - trace!("offs=0x{:x}: new record, xl_tot_len={}", offs, xl_tot_len); - if xl_tot_len == 0 { - info!( - "find_end_of_wal_segment reached zeros at {:?}, last records ends at {:?}", - Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)), - Lsn(XLogSegNoOffsetToRecPtr( - segno, - last_valid_rec_pos as u32, - wal_seg_size - )) - ); - break; // zeros, reached the end - } - if skipping_first_contrecord { - skipping_first_contrecord = false; - trace!(" first contrecord has been just completed"); - } else { - trace!( - " updating last_valid_rec_pos: 0x{:x} --> 0x{:x}", - last_valid_rec_pos, - offs - ); - last_valid_rec_pos = offs; - } - offs += 4; - rec_offs = 4; - contlen = xl_tot_len - 4; - trace!( - " reading rec_hdr[0..4] <-- [0x{:x}; 0x{:x})", - page_offs, - page_offs + 4 - ); - rec_hdr[0..4].copy_from_slice(&buf[page_offs..page_offs + 4]); - } else { - // we're continuing a record, possibly from previous page. - let page_offs = offs % XLOG_BLCKSZ; - let pageleft = XLOG_BLCKSZ - page_offs; - - // read the rest of the record, or as much as fits on this page. - let n = min(contlen, pageleft); - trace!( - "offs=0x{:x}, record continuation, pageleft={}, contlen={}", - offs, - pageleft, - contlen - ); - // fill rec_hdr header up to (but not including) xl_crc field - trace!( - " rec_offs={}, XLOG_RECORD_CRC_OFFS={}, XLOG_SIZE_OF_XLOG_RECORD={}", - rec_offs, - XLOG_RECORD_CRC_OFFS, - XLOG_SIZE_OF_XLOG_RECORD - ); - if rec_offs < XLOG_RECORD_CRC_OFFS { - let len = min(XLOG_RECORD_CRC_OFFS - rec_offs, n); - trace!( - " reading rec_hdr[{}..{}] <-- [0x{:x}; 0x{:x})", - rec_offs, - rec_offs + len, - page_offs, - page_offs + len - ); - rec_hdr[rec_offs..rec_offs + len].copy_from_slice(&buf[page_offs..page_offs + len]); - } - if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD { - let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS; - // All records are aligned on 8-byte boundary, so their 8-byte frames - // cannot be split between pages. As xl_crc is the last field, - // its content is always on the same page. - const_assert!(XLOG_RECORD_CRC_OFFS % 8 == 4); - // We should always start reading aligned records even in incorrect WALs so if - // the condition is false it is likely a bug. However, it is localized somewhere - // in this function, hence we do not crash and just report failure instead. - ensure!(crc_offs % 8 == 4, "Record is not aligned properly (bug?)"); - xl_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]); - trace!( - " reading xl_crc: [0x{:x}; 0x{:x}) = 0x{:x}", - crc_offs, - crc_offs + 4, - xl_crc - ); - crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]); - trace!( - " initializing crc: [0x{:x}; 0x{:x}); crc = 0x{:x}", - crc_offs + 4, - page_offs + n, - crc - ); - } else if rec_offs > XLOG_RECORD_CRC_OFFS { - // As all records are 8-byte aligned, the header is already fully read and `crc` is initialized in the branch above. - ensure!(rec_offs >= XLOG_SIZE_OF_XLOG_RECORD); - let old_crc = crc; - crc = crc32c_append(crc, &buf[page_offs..page_offs + n]); - trace!( - " appending to crc: [0x{:x}; 0x{:x}); 0x{:x} --> 0x{:x}", - page_offs, - page_offs + n, - old_crc, - crc - ); - } else { - // Correct because of the way conditions are written above. 
- assert!(rec_offs + n < XLOG_SIZE_OF_XLOG_RECORD); - // If `skipping_first_contrecord == true`, we may be reading from a middle of a record - // which started in the previous segment. Hence there is no point in validating the header. - if !skipping_first_contrecord && rec_offs + n > XLOG_RECORD_CRC_OFFS { - info!( - "Curiously corrupted WAL: a record stops inside the header; \ - offs=0x{:x}, record continuation, pageleft={}, contlen={}", - offs, pageleft, contlen - ); - break; - } - // Do nothing: we are still reading the header. It's accounted in CRC in the end of the record. - } - rec_offs += n; - offs += n; - contlen -= n; - - if contlen == 0 { - trace!(" record completed at 0x{:x}", offs); - crc = crc32c_append(crc, &rec_hdr); - offs = (offs + 7) & !7; // pad on 8 bytes boundary */ - trace!( - " padded offs to 0x{:x}, crc is {:x}, expected crc is {:x}", - offs, - crc, - xl_crc - ); - if skipping_first_contrecord { - // do nothing, the flag will go down on next iteration when we're reading new record - trace!(" first conrecord has been just completed"); - } else if crc == xl_crc { - // record is valid, advance the result to its end (with - // alignment to the next record taken into account) - trace!( - " updating last_valid_rec_pos: 0x{:x} --> 0x{:x}", - last_valid_rec_pos, - offs - ); - last_valid_rec_pos = offs; - } else { - info!( - "CRC mismatch {} vs {} at {}", - crc, xl_crc, last_valid_rec_pos - ); - break; - } - } - } - } - trace!("last_valid_rec_pos=0x{:x}", last_valid_rec_pos); - Ok(last_valid_rec_pos as u32) -} - -/// -/// Scan a directory that contains PostgreSQL WAL files, for the end of WAL. -/// If precise, returns end LSN (next insertion point, basically); -/// otherwise, start of the last segment. -/// Returns (0, 0) if there is no WAL. -/// +// Returns (aligned) end_lsn of the last record in data_dir with WAL segments. +// start_lsn must point to some previously known record boundary (beginning of +// the next record). If no valid record after is found, start_lsn is returned +// back. pub fn find_end_of_wal( data_dir: &Path, wal_seg_size: usize, - precise: bool, - start_lsn: Lsn, // start reading WAL at this point or later -) -> anyhow::Result<(XLogRecPtr, TimeLineID)> { - let mut high_segno: XLogSegNo = 0; - let mut high_tli: TimeLineID = 0; - let mut high_ispartial = false; + start_lsn: Lsn, // start reading WAL at this point; must point at record start_lsn. +) -> anyhow::Result { + let mut result = start_lsn; + let mut curr_lsn = start_lsn; + let mut buf = [0u8; XLOG_BLCKSZ]; + let mut decoder = WalStreamDecoder::new(start_lsn); - for entry in fs::read_dir(data_dir)?.flatten() { - let ispartial: bool; - let entry_name = entry.file_name(); - let fname = entry_name - .to_str() - .ok_or_else(|| anyhow!("Invalid file name"))?; - - /* - * Check if the filename looks like an xlog file, or a .partial file. - */ - if IsXLogFileName(fname) { - ispartial = false; - } else if IsPartialXLogFileName(fname) { - ispartial = true; - } else { - continue; - } - let (segno, tli) = XLogFromFileName(fname, wal_seg_size); - if !ispartial && entry.metadata()?.len() != wal_seg_size as u64 { - continue; - } - if segno > high_segno - || (segno == high_segno && tli > high_tli) - || (segno == high_segno && tli == high_tli && high_ispartial && !ispartial) - { - high_segno = segno; - high_tli = tli; - high_ispartial = ispartial; - } - } - if high_segno > 0 { - let mut high_offs = 0; - /* - * Move the starting pointer to the start of the next segment, if the - * highest one we saw was completed. 
- */ - if !high_ispartial { - high_segno += 1; - } else if precise { - /* otherwise locate last record in last partial segment */ - if start_lsn.segment_number(wal_seg_size) > high_segno { - bail!( - "provided start_lsn {:?} is beyond highest segno {:?} available", - start_lsn, - high_segno, + // loop over segments + loop { + let segno = curr_lsn.segment_number(wal_seg_size); + let seg_file_name = XLogFileName(PG_TLI, segno, wal_seg_size); + let seg_file_path = data_dir.join(seg_file_name); + match open_wal_segment(&seg_file_path)? { + None => { + // no more segments + info!( + "find_end_of_wal reached end at {:?}, segment {:?} doesn't exist", + result, seg_file_path ); + return Ok(result); + } + Some(mut segment) => { + let seg_offs = curr_lsn.segment_offset(wal_seg_size); + segment.seek(SeekFrom::Start(seg_offs as u64))?; + // loop inside segment + loop { + let bytes_read = segment.read(&mut buf)?; + if bytes_read == 0 { + break; // EOF + } + curr_lsn += bytes_read as u64; + decoder.feed_bytes(&buf[0..bytes_read]); + + // advance result past all completely read records + loop { + match decoder.poll_decode() { + Ok(Some(record)) => result = record.0, + Err(e) => { + info!( + "find_end_of_wal reached end at {:?}, decode error: {:?}", + result, e + ); + return Ok(result); + } + Ok(None) => break, // need more data + } + } + } } - let start_offset = if start_lsn.segment_number(wal_seg_size) == high_segno { - start_lsn.segment_offset(wal_seg_size) - } else { - 0 - }; - high_offs = find_end_of_wal_segment( - data_dir, - high_segno, - high_tli, - wal_seg_size, - start_offset, - )?; } - let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size); - return Ok((high_ptr, high_tli)); } - Ok((0, 0)) +} + +// Open .partial or full WAL segment file, if present. 
+fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result> { + let mut partial_path = seg_file_path.to_owned(); + partial_path.set_extension("partial"); + match File::open(partial_path) { + Ok(file) => Ok(Some(file)), + Err(e) => match e.kind() { + ErrorKind::NotFound => { + // .partial not found, try full + match File::open(seg_file_path) { + Ok(file) => Ok(Some(file)), + Err(e) => match e.kind() { + ErrorKind::NotFound => Ok(None), + _ => Err(e.into()), + }, + } + } + _ => Err(e.into()), + }, + } } pub fn main() { let mut data_dir = PathBuf::new(); data_dir.push("."); - let (wal_end, tli) = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, true, Lsn(0)).unwrap(); - println!( - "wal_end={:>08X}{:>08X}, tli={}", - (wal_end >> 32) as u32, - wal_end as u32, - tli - ); + let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap(); + println!("wal_end={:?}", wal_end); } impl XLogRecord { @@ -595,7 +350,10 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result( - test_name: &str, - expected_end_of_wal_non_partial: Lsn, - ) { + fn test_end_of_wal(test_name: &str) { use wal_craft::*; // Craft some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) @@ -630,7 +385,7 @@ mod tests { .iter() .map(|&lsn| u64::from(lsn).into()) .collect(); - let expected_end_of_wal_partial: Lsn = u64::from(expected_end_of_wal_partial).into(); + let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into(); srv.kill(); // Check find_end_of_wal on the initial WAL @@ -642,10 +397,10 @@ mod tests { .filter(|fname| IsXLogFileName(fname)) .max() .unwrap(); - check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal_partial); - for start_lsn in std::iter::once(Lsn(0)) - .chain(intermediate_lsns) - .chain(std::iter::once(expected_end_of_wal_partial)) + check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal); + for start_lsn in intermediate_lsns + .iter() + .chain(std::iter::once(&expected_end_of_wal)) { // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`. // We assume that `start_lsn` is non-decreasing. 
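// NOTE: illustrative sketch, not part of the patch above. It shows how the
// reworked find_end_of_wal() is intended to be called after this commit: the
// `precise` flag and the TimeLineID return value are gone, the caller passes a
// known record boundary as `start_lsn`, and the function returns the end LSN of
// the last record it can decode via WalStreamDecoder (or `start_lsn` itself if
// no valid record follows). The module paths and helper name below are
// assumptions made for the example, not taken from the patch.
use postgres_ffi::pg_constants::WAL_SEGMENT_SIZE;
use postgres_ffi::xlog_utils::find_end_of_wal;
use std::path::Path;
use utils::lsn::Lsn;

fn locate_wal_end(timeline_dir: &Path, commit_lsn: Lsn) -> anyhow::Result<Lsn> {
    // `commit_lsn` must point at the start of a record; decoding then proceeds
    // segment by segment (preferring `.partial` files) from that point on.
    find_end_of_wal(timeline_dir, WAL_SEGMENT_SIZE, commit_lsn)
}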
@@ -660,7 +415,7 @@ mod tests { } let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE); let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); - if seg_start_lsn > u64::from(start_lsn) { + if seg_start_lsn > u64::from(*start_lsn) { continue; } let mut f = File::options().write(true).open(file.path()).unwrap(); @@ -668,18 +423,12 @@ mod tests { f.write_all( &ZEROS[0..min( WAL_SEGMENT_SIZE, - (u64::from(start_lsn) - seg_start_lsn) as usize, + (u64::from(*start_lsn) - seg_start_lsn) as usize, )], ) .unwrap(); } - check_end_of_wal( - &cfg, - &last_segment, - start_lsn, - expected_end_of_wal_non_partial, - expected_end_of_wal_partial, - ); + check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal); } } @@ -716,18 +465,15 @@ mod tests { cfg: &wal_craft::Conf, last_segment: &str, start_lsn: Lsn, - expected_end_of_wal_non_partial: Lsn, - expected_end_of_wal_partial: Lsn, + expected_end_of_wal: Lsn, ) { // Check end_of_wal on non-partial WAL segment (we treat it as fully populated) - let (wal_end, tli) = - find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap(); - let wal_end = Lsn(wal_end); - info!( - "find_end_of_wal returned (wal_end={}, tli={}) with non-partial WAL segment", - wal_end, tli - ); - assert_eq!(wal_end, expected_end_of_wal_non_partial); + // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap(); + // info!( + // "find_end_of_wal returned wal_end={} with non-partial WAL segment", + // wal_end + // ); + // assert_eq!(wal_end, expected_end_of_wal_non_partial); // Rename file to partial to actually find last valid lsn, then rename it back. fs::rename( @@ -735,14 +481,12 @@ mod tests { cfg.wal_dir().join(format!("{}.partial", last_segment)), ) .unwrap(); - let (wal_end, tli) = - find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap(); - let wal_end = Lsn(wal_end); + let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap(); info!( - "find_end_of_wal returned (wal_end={}, tli={}) with partial WAL segment", - wal_end, tli + "find_end_of_wal returned wal_end={} with partial WAL segment", + wal_end ); - assert_eq!(wal_end, expected_end_of_wal_partial); + assert_eq!(wal_end, expected_end_of_wal); fs::rename( cfg.wal_dir().join(format!("{}.partial", last_segment)), cfg.wal_dir().join(last_segment), @@ -755,10 +499,7 @@ mod tests { #[test] pub fn test_find_end_of_wal_simple() { init_logging(); - test_end_of_wal::( - "test_find_end_of_wal_simple", - "0/2000000".parse::().unwrap(), - ); + test_end_of_wal::("test_find_end_of_wal_simple"); } #[test] @@ -766,17 +507,14 @@ mod tests { init_logging(); test_end_of_wal::( "test_find_end_of_wal_crossing_segment_followed_by_small_one", - "0/3000000".parse::().unwrap(), ); } #[test] - #[ignore = "not yet fixed, needs correct parsing of pre-last segments"] // TODO pub fn test_find_end_of_wal_last_crossing_segment() { init_logging(); test_end_of_wal::( "test_find_end_of_wal_last_crossing_segment", - "0/3000000".parse::().unwrap(), ); } diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 2a36d5c04c..5f4bf588c7 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -332,7 +332,7 @@ impl Storage for PhysicalStorage { self.write_lsn = if state.commit_lsn == Lsn(0) { Lsn(0) } else { - Lsn(find_end_of_wal(&self.timeline_dir, wal_seg_size, true, state.commit_lsn)?.0) + find_end_of_wal(&self.timeline_dir, wal_seg_size, state.commit_lsn)? 
}; self.write_record_lsn = self.write_lsn; From 116ecdf87a94d486b60911c0a95ec3e949f03202 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 15 Aug 2022 13:31:26 +0300 Subject: [PATCH 0624/1022] Improve walreceiver logic (#2253) This patch makes walreceiver logic more complicated, but it should work better in most cases. Added `test_wal_lagging` to test scenarios where alive safekeepers can lag behind other alive safekeepers. - There was a bug which looks like `etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn())` filtered all safekeepers in some strange cases. I removed this filter, it should probably help with #2237 - Now walreceiver_connection reports status, including commit_lsn. This allows keeping safekeeper connection even when etcd is down. - Safekeeper connection now fails if pageserver doesn't receive safekeeper messages for some time. Usually safekeeper sends messages at least once per second. - `LaggingWal` check now uses `commit_lsn` directly from safekeeper. This fixes the issue with often reconnects, when compute generates WAL really fast. - `NoWalTimeout` is rewritten to trigger only when we know about the new WAL and the connected safekeeper doesn't stream any WAL. This allows setting a small `lagging_wal_timeout` because it will trigger only when we observe that the connected safekeeper has stuck. --- pageserver/src/tenant_config.rs | 2 +- .../src/walreceiver/connection_manager.rs | 402 +++++++++++------- .../src/walreceiver/walreceiver_connection.rs | 77 +++- test_runner/batch_others/test_wal_acceptor.py | 8 +- .../batch_others/test_wal_acceptor_async.py | 65 +++ 5 files changed, 380 insertions(+), 174 deletions(-) diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index eff5272837..73bf3636d2 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -37,7 +37,7 @@ pub mod defaults { pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "30 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; - pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; + pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds"; pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 09142c4d44..2722bc7320 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -17,7 +17,7 @@ use std::{ }; use anyhow::Context; -use chrono::{DateTime, Local, NaiveDateTime, Utc}; +use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription, BrokerUpdate, Client, @@ -33,11 +33,10 @@ use crate::{ use crate::{RepositoryImpl, TimelineImpl}; use utils::{ lsn::Lsn, - pq_proto::ReplicationFeedback, zid::{NodeId, ZTenantTimelineId}, }; -use super::{TaskEvent, TaskHandle}; +use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle}; /// Spawns the loop to take care of the timeline's WAL streaming connection. pub(super) fn spawn_connection_manager_task( @@ -114,21 +113,26 @@ async fn connection_manager_loop_step( } } => { let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! 
guard"); - match &wal_connection_update { + match wal_connection_update { TaskEvent::Started => { - wal_connection.latest_connection_update = Utc::now().naive_utc(); *walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0) += 1; }, - TaskEvent::NewEvent(replication_feedback) => { - wal_connection.latest_connection_update = DateTime::::from(replication_feedback.ps_replytime).naive_utc(); - // reset connection attempts here only, the only place where both nodes - // explicitly confirmn with replication feedback that they are connected to each other - walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id); + TaskEvent::NewEvent(status) => { + if status.has_received_wal { + // Reset connection attempts here only, we know that safekeeper is healthy + // because it can send us a WAL update. + walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id); + } + wal_connection.status = status; }, TaskEvent::End(end_result) => { match end_result { Ok(()) => debug!("WAL receiving task finished"), - Err(e) => warn!("WAL receiving task failed: {e}"), + Err(e) => { + warn!("WAL receiving task failed: {e}"); + // If the task failed, set the connection attempts to at least 1, to try other safekeepers. + let _ = *walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(1); + } }; walreceiver_state.wal_connection = None; }, @@ -257,10 +261,21 @@ struct WalreceiverState { struct WalConnection { /// Current safekeeper pageserver is connected to for WAL streaming. sk_id: NodeId, - /// Connection task start time or the timestamp of a latest connection message received. - latest_connection_update: NaiveDateTime, + /// Status of the connection. + status: WalConnectionStatus, /// WAL streaming task handle. - connection_task: TaskHandle, + connection_task: TaskHandle, + /// Have we discovered that other safekeeper has more recent WAL than we do? + discovered_new_wal: Option, +} + +/// Notion of a new committed WAL, which exists on other safekeeper. +#[derive(Debug, Clone, Copy)] +struct NewCommittedWAL { + /// LSN of the new committed WAL. + lsn: Lsn, + /// When we discovered that the new committed WAL exists on other safekeeper. + discovered_at: NaiveDateTime, } /// Data about the timeline to connect to, received from etcd. @@ -327,10 +342,19 @@ impl WalreceiverState { .instrument(info_span!("walreceiver_connection", id = %id)) }); + let now = Utc::now().naive_utc(); self.wal_connection = Some(WalConnection { sk_id: new_sk_id, - latest_connection_update: Utc::now().naive_utc(), + status: WalConnectionStatus { + is_connected: false, + has_received_wal: false, + latest_connection_update: now, + latest_wal_update: now, + streaming_lsn: None, + commit_lsn: None, + }, connection_task: connection_handle, + discovered_new_wal: None, }); } @@ -361,14 +385,16 @@ impl WalreceiverState { /// Cleans up stale etcd records and checks the rest for the new connection candidate. /// Returns a new candidate, if the current state is absent or somewhat lagging, `None` otherwise. 
/// The current rules for approving new candidates: - /// * pick from the input data from etcd for currently connected safekeeper (if any) - /// * out of the rest input entries, pick one with biggest `commit_lsn` that's after than pageserver's latest Lsn for the timeline + /// * pick a candidate different from the connected safekeeper with biggest `commit_lsn` and lowest failed connection attemps /// * if there's no such entry, no new candidate found, abort - /// * check the current connection time data for staleness, reconnect if stale - /// * otherwise, check if etcd updates contain currently connected safekeeper - /// * if not, that means no WAL updates happened after certain time (either none since the connection time or none since the last event after the connection) - /// Reconnect if the time exceeds the threshold. - /// * if there's one, compare its Lsn with the other candidate's, reconnect if candidate's over threshold + /// * otherwise check if the candidate is much better than the current one + /// + /// To understand exact rules for determining if the candidate is better than the current one, refer to this function's implementation. + /// General rules are following: + /// * if connected safekeeper is not present, pick the candidate + /// * if we haven't received any updates for some time, pick the candidate + /// * if the candidate commit_lsn is much higher than the current one, pick the candidate + /// * if connected safekeeper stopped sending us new WAL which is available on other safekeeper, pick the candidate /// /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently. /// Both thresholds are configured per tenant. @@ -384,53 +410,128 @@ impl WalreceiverState { let now = Utc::now().naive_utc(); if let Ok(latest_interaciton) = - (now - existing_wal_connection.latest_connection_update).to_std() + (now - existing_wal_connection.status.latest_connection_update).to_std() { - if latest_interaciton > self.lagging_wal_timeout { + // Drop connection if we haven't received keepalive message for a while. + if latest_interaciton > self.wal_connect_timeout { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connstr: new_wal_source_connstr, - reason: ReconnectReason::NoWalTimeout { - last_wal_interaction: Some( - existing_wal_connection.latest_connection_update, + reason: ReconnectReason::NoKeepAlives { + last_keep_alive: Some( + existing_wal_connection.status.latest_connection_update, ), check_time: now, - threshold: self.lagging_wal_timeout, + threshold: self.wal_connect_timeout, }, }); } } - match self.wal_stream_candidates.get(&connected_sk_node) { - Some(current_connection_etcd_data) => { - let new_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0)); - let current_lsn = current_connection_etcd_data - .timeline - .commit_lsn - .unwrap_or(Lsn(0)); - match new_lsn.0.checked_sub(current_lsn.0) - { - Some(new_sk_lsn_advantage) => { - if new_sk_lsn_advantage >= self.max_lsn_wal_lag.get() { - return Some( - NewWalConnectionCandidate { - safekeeper_id: new_sk_id, - wal_source_connstr: new_wal_source_connstr, - reason: ReconnectReason::LaggingWal { current_lsn, new_lsn, threshold: self.max_lsn_wal_lag }, - }); - } - } - None => debug!("Best SK candidate has its commit Lsn behind the current timeline's latest consistent Lsn"), + if !existing_wal_connection.status.is_connected { + // We haven't connected yet and we shouldn't switch until connection timeout (condition above). 
+ return None; + } + + if let Some(current_commit_lsn) = existing_wal_connection.status.commit_lsn { + let new_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0)); + // Check if the new candidate has much more WAL than the current one. + match new_commit_lsn.0.checked_sub(current_commit_lsn.0) { + Some(new_sk_lsn_advantage) => { + if new_sk_lsn_advantage >= self.max_lsn_wal_lag.get() { + return Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_source_connstr: new_wal_source_connstr, + reason: ReconnectReason::LaggingWal { + current_commit_lsn, + new_commit_lsn, + threshold: self.max_lsn_wal_lag, + }, + }); } - } - None => { - return Some(NewWalConnectionCandidate { - safekeeper_id: new_sk_id, - wal_source_connstr: new_wal_source_connstr, - reason: ReconnectReason::NoEtcdDataForExistingConnection, - }) + } + None => debug!( + "Best SK candidate has its commit_lsn behind connected SK's commit_lsn" + ), } } + + let current_lsn = match existing_wal_connection.status.streaming_lsn { + Some(lsn) => lsn, + None => self.local_timeline.get_last_record_lsn(), + }; + let current_commit_lsn = existing_wal_connection + .status + .commit_lsn + .unwrap_or(current_lsn); + let candidate_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0)); + + // Keep discovered_new_wal only if connected safekeeper has not caught up yet. + let mut discovered_new_wal = existing_wal_connection + .discovered_new_wal + .filter(|new_wal| new_wal.lsn > current_commit_lsn); + + if discovered_new_wal.is_none() { + // Check if the new candidate has more WAL than the current one. + // If the new candidate has more WAL than the current one, we consider switching to the new candidate. + discovered_new_wal = if candidate_commit_lsn > current_commit_lsn { + trace!( + "New candidate has commit_lsn {}, higher than current_commit_lsn {}", + candidate_commit_lsn, + current_commit_lsn + ); + Some(NewCommittedWAL { + lsn: candidate_commit_lsn, + discovered_at: Utc::now().naive_utc(), + }) + } else { + None + }; + } + + let waiting_for_new_lsn_since = if current_lsn < current_commit_lsn { + // Connected safekeeper has more WAL, but we haven't received updates for some time. + trace!( + "Connected safekeeper has more WAL, but we haven't received updates for {:?}. current_lsn: {}, current_commit_lsn: {}", + (now - existing_wal_connection.status.latest_wal_update).to_std(), + current_lsn, + current_commit_lsn + ); + Some(existing_wal_connection.status.latest_wal_update) + } else { + discovered_new_wal.as_ref().map(|new_wal| { + // We know that new WAL is available on other safekeeper, but connected safekeeper don't have it. + new_wal + .discovered_at + .max(existing_wal_connection.status.latest_wal_update) + }) + }; + + // If we haven't received any WAL updates for a while and candidate has more WAL, switch to it. 
+ if let Some(waiting_for_new_lsn_since) = waiting_for_new_lsn_since { + if let Ok(waiting_for_new_wal) = (now - waiting_for_new_lsn_since).to_std() { + if candidate_commit_lsn > current_commit_lsn + && waiting_for_new_wal > self.lagging_wal_timeout + { + return Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_source_connstr: new_wal_source_connstr, + reason: ReconnectReason::NoWalTimeout { + current_lsn, + current_commit_lsn, + candidate_commit_lsn, + last_wal_interaction: Some( + existing_wal_connection.status.latest_wal_update, + ), + check_time: now, + threshold: self.lagging_wal_timeout, + }, + }); + } + } + } + + self.wal_connection.as_mut().unwrap().discovered_new_wal = discovered_new_wal; } None => { let (new_sk_id, _, new_wal_source_connstr) = @@ -450,7 +551,7 @@ impl WalreceiverState { /// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another. /// /// The candidate that is chosen: - /// * has fewest connection attempts from pageserver to safekeeper node (reset every time the WAL replication feedback is sent) + /// * has fewest connection attempts from pageserver to safekeeper node (reset every time we receive a WAL message from the node) /// * has greatest data Lsn among the ones that are left /// /// NOTE: @@ -489,14 +590,13 @@ impl WalreceiverState { .max_by_key(|(_, info, _)| info.commit_lsn) } + /// Returns a list of safekeepers that have valid info and ready for connection. fn applicable_connection_candidates( &self, ) -> impl Iterator { self.wal_stream_candidates .iter() - .filter(|(_, etcd_info)| { - etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn()) - }) + .filter(|(_, info)| info.timeline.commit_lsn.is_some()) .filter_map(|(sk_id, etcd_info)| { let info = &etcd_info.timeline; match wal_stream_connection_string( @@ -512,6 +612,7 @@ impl WalreceiverState { }) } + /// Remove candidates which haven't sent etcd updates for a while. 
fn cleanup_old_candidates(&mut self) { let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len()); @@ -546,17 +647,24 @@ struct NewWalConnectionCandidate { #[derive(Debug, PartialEq, Eq)] enum ReconnectReason { NoExistingConnection, - NoEtcdDataForExistingConnection, LaggingWal { - current_lsn: Lsn, - new_lsn: Lsn, + current_commit_lsn: Lsn, + new_commit_lsn: Lsn, threshold: NonZeroU64, }, NoWalTimeout { + current_lsn: Lsn, + current_commit_lsn: Lsn, + candidate_commit_lsn: Lsn, last_wal_interaction: Option, check_time: NaiveDateTime, threshold: Duration, }, + NoKeepAlives { + last_keep_alive: Option, + check_time: NaiveDateTime, + threshold: Duration, + }, } fn wal_stream_connection_string( @@ -580,7 +688,6 @@ fn wal_stream_connection_string( #[cfg(test)] mod tests { - use std::time::SystemTime; use crate::repository::{ repo_harness::{RepoHarness, TIMELINE_ID}, @@ -658,7 +765,7 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + safekeeper_connstr: None, }, etcd_version: 0, latest_update: delay_over_threshold, @@ -684,22 +791,26 @@ mod tests { let connected_sk_id = NodeId(0); let current_lsn = 100_000; + let connection_status = WalConnectionStatus { + is_connected: true, + has_received_wal: true, + latest_connection_update: now, + latest_wal_update: now, + commit_lsn: Some(Lsn(current_lsn)), + streaming_lsn: Some(Lsn(current_lsn)), + }; + state.max_lsn_wal_lag = NonZeroU64::new(100).unwrap(); state.wal_connection = Some(WalConnection { sk_id: connected_sk_id, - latest_connection_update: now, + status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(ReplicationFeedback { - current_timeline_size: 1, - ps_writelsn: 1, - ps_applylsn: current_lsn, - ps_flushlsn: 1, - ps_replytime: SystemTime::now(), - })) + .send(TaskEvent::NewEvent(connection_status.clone())) .ok(); Ok(()) }), + discovered_new_wal: None, }); state.wal_stream_candidates = HashMap::from([ ( @@ -924,65 +1035,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn connection_no_etcd_data_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("connection_no_etcd_data_candidate")?; - let mut state = dummy_state(&harness); - - let now = Utc::now().naive_utc(); - let current_lsn = Lsn(100_000).align(); - let connected_sk_id = NodeId(0); - let other_sk_id = NodeId(connected_sk_id.0 + 1); - - state.wal_connection = Some(WalConnection { - sk_id: connected_sk_id, - latest_connection_update: now, - connection_task: TaskHandle::spawn(move |sender, _| async move { - sender - .send(TaskEvent::NewEvent(ReplicationFeedback { - current_timeline_size: 1, - ps_writelsn: current_lsn.0, - ps_applylsn: 1, - ps_flushlsn: 1, - ps_replytime: SystemTime::now(), - })) - .ok(); - Ok(()) - }), - }); - state.wal_stream_candidates = HashMap::from([( - other_sk_id, - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, - )]); - - let only_candidate = state - .next_connection_candidate() - .expect("Expected one candidate selected out of the only data option, but got none"); - assert_eq!(only_candidate.safekeeper_id, other_sk_id); - assert_eq!( - 
only_candidate.reason, - ReconnectReason::NoEtcdDataForExistingConnection, - "Should select new safekeeper due to missing etcd data, even if there's an existing connection with this safekeeper" - ); - assert!(only_candidate - .wal_source_connstr - .contains(DUMMY_SAFEKEEPER_CONNSTR)); - - Ok(()) - } - #[tokio::test] async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { let harness = RepoHarness::create("lsn_wal_over_threshcurrent_candidate")?; @@ -993,21 +1045,25 @@ mod tests { let connected_sk_id = NodeId(0); let new_lsn = Lsn(current_lsn.0 + state.max_lsn_wal_lag.get() + 1); + let connection_status = WalConnectionStatus { + is_connected: true, + has_received_wal: true, + latest_connection_update: now, + latest_wal_update: now, + commit_lsn: Some(current_lsn), + streaming_lsn: Some(current_lsn), + }; + state.wal_connection = Some(WalConnection { sk_id: connected_sk_id, - latest_connection_update: now, + status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(ReplicationFeedback { - current_timeline_size: 1, - ps_writelsn: current_lsn.0, - ps_applylsn: 1, - ps_flushlsn: 1, - ps_replytime: SystemTime::now(), - })) + .send(TaskEvent::NewEvent(connection_status.clone())) .ok(); Ok(()) }), + discovered_new_wal: None, }); state.wal_stream_candidates = HashMap::from([ ( @@ -1052,8 +1108,8 @@ mod tests { assert_eq!( over_threshcurrent_candidate.reason, ReconnectReason::LaggingWal { - current_lsn, - new_lsn, + current_commit_lsn: current_lsn, + new_commit_lsn: new_lsn, threshold: state.max_lsn_wal_lag }, "Should select bigger WAL safekeeper if it starts to lag enough" @@ -1066,31 +1122,35 @@ mod tests { } #[tokio::test] - async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("timeout_wal_over_threshhold_current_candidate")?; + async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("timeout_connection_threshhold_current_candidate")?; let mut state = dummy_state(&harness); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); - let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?; + let wal_connect_timeout = chrono::Duration::from_std(state.wal_connect_timeout)?; let time_over_threshold = - Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout; + Utc::now().naive_utc() - wal_connect_timeout - wal_connect_timeout; + + let connection_status = WalConnectionStatus { + is_connected: true, + has_received_wal: true, + latest_connection_update: time_over_threshold, + latest_wal_update: time_over_threshold, + commit_lsn: Some(current_lsn), + streaming_lsn: Some(current_lsn), + }; state.wal_connection = Some(WalConnection { sk_id: NodeId(1), - latest_connection_update: time_over_threshold, + status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(ReplicationFeedback { - current_timeline_size: 1, - ps_writelsn: current_lsn.0, - ps_applylsn: 1, - ps_flushlsn: 1, - ps_replytime: SystemTime::now(), - })) + .send(TaskEvent::NewEvent(connection_status.clone())) .ok(); Ok(()) }), + discovered_new_wal: None, }); state.wal_stream_candidates = HashMap::from([( NodeId(0), @@ -1115,12 +1175,12 @@ mod tests { assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0)); match over_threshcurrent_candidate.reason { - ReconnectReason::NoWalTimeout { - 
last_wal_interaction, + ReconnectReason::NoKeepAlives { + last_keep_alive, threshold, .. } => { - assert_eq!(last_wal_interaction, Some(time_over_threshold)); + assert_eq!(last_keep_alive, Some(time_over_threshold)); assert_eq!(threshold, state.lagging_wal_timeout); } unexpected => panic!("Unexpected reason: {unexpected:?}"), @@ -1133,20 +1193,34 @@ mod tests { } #[tokio::test] - async fn timeout_connection_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("timeout_connection_over_threshhold_current_candidate")?; + async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("timeout_wal_over_threshhold_current_candidate")?; let mut state = dummy_state(&harness); let current_lsn = Lsn(100_000).align(); + let new_lsn = Lsn(100_100).align(); let now = Utc::now().naive_utc(); let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?; let time_over_threshold = Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout; + let connection_status = WalConnectionStatus { + is_connected: true, + has_received_wal: true, + latest_connection_update: now, + latest_wal_update: time_over_threshold, + commit_lsn: Some(current_lsn), + streaming_lsn: Some(current_lsn), + }; + state.wal_connection = Some(WalConnection { sk_id: NodeId(1), - latest_connection_update: time_over_threshold, + status: connection_status, connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), + discovered_new_wal: Some(NewCommittedWAL { + discovered_at: time_over_threshold, + lsn: new_lsn, + }), }); state.wal_stream_candidates = HashMap::from([( NodeId(0), @@ -1154,7 +1228,7 @@ mod tests { timeline: SkTimelineInfo { last_log_term: None, flush_lsn: None, - commit_lsn: Some(current_lsn), + commit_lsn: Some(new_lsn), backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, @@ -1172,10 +1246,16 @@ mod tests { assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0)); match over_threshcurrent_candidate.reason { ReconnectReason::NoWalTimeout { + current_lsn, + current_commit_lsn, + candidate_commit_lsn, last_wal_interaction, threshold, .. } => { + assert_eq!(current_lsn, current_lsn); + assert_eq!(current_commit_lsn, current_lsn); + assert_eq!(candidate_commit_lsn, new_lsn); assert_eq!(last_wal_interaction, Some(time_over_threshold)); assert_eq!(threshold, state.lagging_wal_timeout); } @@ -1202,7 +1282,7 @@ mod tests { .expect("Failed to create an empty timeline for dummy wal connection manager"), wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), - max_lsn_wal_lag: NonZeroU64::new(1).unwrap(), + max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), wal_connection: None, wal_stream_candidates: HashMap::new(), wal_connection_attempts: HashMap::new(), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 538ebfe30e..16a1f232e3 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -8,6 +8,7 @@ use std::{ use anyhow::{bail, ensure, Context}; use bytes::BytesMut; +use chrono::{NaiveDateTime, Utc}; use fail::fail_point; use futures::StreamExt; use postgres::{SimpleQueryMessage, SimpleQueryRow}; @@ -29,12 +30,29 @@ use crate::{ use postgres_ffi::waldecoder::WalStreamDecoder; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; +/// Status of the connection. 
+#[derive(Debug, Clone)] +pub struct WalConnectionStatus { + /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running. + pub is_connected: bool, + /// Defines a healthy connection as one on which we have received at least some WAL bytes. + pub has_received_wal: bool, + /// Connection establishment time or the timestamp of a latest connection message received. + pub latest_connection_update: NaiveDateTime, + /// Time of the latest WAL message received. + pub latest_wal_update: NaiveDateTime, + /// Latest WAL update contained WAL up to this LSN. Next WAL message with start from that LSN. + pub streaming_lsn: Option, + /// Latest commit_lsn received from the safekeeper. Can be zero if no message has been received yet. + pub commit_lsn: Option, +} + /// Open a connection to the given safekeeper and receive WAL, sending back progress /// messages as we go. pub async fn handle_walreceiver_connection( id: ZTenantTimelineId, wal_source_connstr: &str, - events_sender: &watch::Sender>, + events_sender: &watch::Sender>, mut cancellation: watch::Receiver<()>, connect_timeout: Duration, ) -> anyhow::Result<()> { @@ -49,12 +67,26 @@ pub async fn handle_walreceiver_connection( .await .context("Timed out while waiting for walreceiver connection to open")? .context("Failed to open walreceiver conection")?; + + info!("connected!"); + let mut connection_status = WalConnectionStatus { + is_connected: true, + has_received_wal: false, + latest_connection_update: Utc::now().naive_utc(), + latest_wal_update: Utc::now().naive_utc(), + streaming_lsn: None, + commit_lsn: None, + }; + if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); + return Ok(()); + } + // The connection object performs the actual communication with the database, // so spawn it off to run on its own. let mut connection_cancellation = cancellation.clone(); tokio::spawn( async move { - info!("connected!"); select! { connection_result = connection => match connection_result{ Ok(()) => info!("Walreceiver db connection closed"), @@ -84,6 +116,14 @@ pub async fn handle_walreceiver_connection( let identify = identify_system(&mut replication_client).await?; info!("{identify:?}"); + + connection_status.latest_connection_update = Utc::now().naive_utc(); + if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); + return Ok(()); + } + + // NB: this is a flush_lsn, not a commit_lsn. let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); let mut caught_up = false; let ZTenantTimelineId { @@ -118,7 +158,7 @@ pub async fn handle_walreceiver_connection( // There might be some padding after the last full record, skip it. startpoint += startpoint.calc_padding(8u32); - info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, server is at {end_of_wal}..."); + info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..."); let query = format!("START_REPLICATION PHYSICAL {startpoint}"); @@ -140,6 +180,33 @@ pub async fn handle_walreceiver_connection( } } { let replication_message = replication_message?; + let now = Utc::now().naive_utc(); + + // Update the connection status before processing the message. If the message processing + // fails (e.g. 
in walingest), we still want to know latests LSNs from the safekeeper. + match &replication_message { + ReplicationMessage::XLogData(xlog_data) => { + connection_status.latest_connection_update = now; + connection_status.commit_lsn = Some(Lsn::from(xlog_data.wal_end())); + connection_status.streaming_lsn = Some(Lsn::from( + xlog_data.wal_start() + xlog_data.data().len() as u64, + )); + if !xlog_data.data().is_empty() { + connection_status.latest_wal_update = now; + connection_status.has_received_wal = true; + } + } + ReplicationMessage::PrimaryKeepAlive(keepalive) => { + connection_status.latest_connection_update = now; + connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end())); + } + &_ => {} + }; + if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + warn!("Wal connection event listener dropped, aborting the connection: {e}"); + return Ok(()); + } + let status_update = match replication_message { ReplicationMessage::XLogData(xlog_data) => { // Pass the WAL data to the decoder, and see if we can decode @@ -257,10 +324,6 @@ pub async fn handle_walreceiver_connection( .as_mut() .zenith_status_update(data.len() as u64, &data) .await?; - if let Err(e) = events_sender.send(TaskEvent::NewEvent(zenith_status_update)) { - warn!("Wal connection event listener dropped, aborting the connection: {e}"); - return Ok(()); - } } } diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index b55ba84756..b6f914858e 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -1090,11 +1090,9 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # Remove initial tenant fully (two branches are active) response = sk_http.tenant_delete_force(tenant_id) - assert response == { - timeline_id_3: { - "dir_existed": True, - "was_active": True, - } + assert response[timeline_id_3] == { + "dir_existed": True, + "was_active": True, } assert not (sk_data_dir / tenant_id).exists() assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index 5c0cb56880..e1d3ba0919 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -520,3 +520,68 @@ def test_race_conditions(neon_env_builder: NeonEnvBuilder): pg = env.postgres.create_start('test_safekeepers_race_conditions') asyncio.run(run_race_conditions(env, pg)) + + +# Check that pageserver can select safekeeper with largest commit_lsn +# and switch if LSN is not updated for some time (NoWalTimeout). 
+async def run_wal_lagging(env: NeonEnv, pg: Postgres): + def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str: + # use ports 10, 11 and 12 to simulate unavailable safekeepers + return ','.join([ + f'localhost:{sk.port.pg if active else 10 + i}' + for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk)) + ]) + + conn = await pg.connect_async() + await conn.execute('CREATE TABLE t(key int primary key, value text)') + await conn.close() + pg.stop() + + n_iterations = 20 + n_txes = 10000 + expected_sum = 0 + i = 1 + quorum = len(env.safekeepers) // 2 + 1 + + for it in range(n_iterations): + active_sk = list(map(lambda _: random.random() >= 0.5, env.safekeepers)) + active_count = sum(active_sk) + + if active_count < quorum: + it -= 1 + continue + + pg.adjust_for_safekeepers(safekeepers_guc(env, active_sk)) + log.info(f'Iteration {it}: {active_sk}') + + pg.start() + conn = await pg.connect_async() + + for _ in range(n_txes): + await conn.execute(f"INSERT INTO t values ({i}, 'payload')") + expected_sum += i + i += 1 + + await conn.close() + pg.stop() + + pg.adjust_for_safekeepers(safekeepers_guc(env, [True] * len(env.safekeepers))) + pg.start() + conn = await pg.connect_async() + + log.info(f'Executed {i-1} queries') + + res = await conn.fetchval('SELECT sum(key) FROM t') + assert res == expected_sum + + +# do inserts while restarting postgres and messing with safekeeper addresses +def test_wal_lagging(neon_env_builder: NeonEnvBuilder): + + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch('test_wal_lagging') + pg = env.postgres.create_start('test_wal_lagging') + + asyncio.run(run_wal_lagging(env, pg)) From 63a72d99bb473043eeca78690ee09a4a94eaba6d Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 11 Aug 2022 11:32:23 +0300 Subject: [PATCH 0625/1022] increase timeout in wait_for_upload to avoid spurious failures when testing with real s3 --- test_runner/fixtures/neon_fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d5b0af3813..fe0a3193c1 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2440,7 +2440,7 @@ def wait_for_upload(pageserver_http_client: NeonPageserverHttpClient, timeline: uuid.UUID, lsn: int): """waits for local timeline upload up to specified lsn""" - for i in range(10): + for i in range(20): current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) if current_lsn >= lsn: return From 7b12deead7d4024ffe55c09269915480a0f82bad Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 15 Aug 2022 18:24:24 +0300 Subject: [PATCH 0626/1022] Bump vendor/postgres to include XLP_FIRST_IS_CONTRECORD fix. 
(#2274) --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 0a9045c9ff..49015ce98f 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 0a9045c9ff2c0833fd7f32571833ebbdf037353d +Subproject commit 49015ce98f550d4fc08d3c1fe348faa71a15f51b From 4cddb0f1a4930cb8bafcbb61bf6bc5b8563421df Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 15 Aug 2022 18:54:31 +0100 Subject: [PATCH 0627/1022] Set up a workflow to run pgbench against captest (#2077) --- .github/actions/upload/action.yml | 6 +- .github/workflows/benchmarking.yml | 119 +++++++++++++- poetry.lock | 49 ++++-- pyproject.toml | 1 + test_runner/fixtures/benchmark_fixture.py | 100 ++++++++++-- test_runner/fixtures/utils.py | 9 ++ test_runner/performance/README.md | 2 +- test_runner/performance/test_perf_pgbench.py | 159 ++++++++++++++----- 8 files changed, 371 insertions(+), 74 deletions(-) diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index 28e7d1fb1a..de8df3230f 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -29,8 +29,12 @@ runs: time tar -C ${SOURCE} -cf ${ARCHIVE} --zstd . elif [ -f ${SOURCE} ]; then time tar -cf ${ARCHIVE} --zstd ${SOURCE} + elif ! ls ${SOURCE} > /dev/null 2>&1; then + echo 2>&1 "${SOURCE} does not exist" + exit 2 else - echo 2>&1 "${SOURCE} neither directory nor file, don't know how to handle it" + echo 2>&1 "${SOURCE} is neither a directory nor a file, do not know how to handle it" + exit 3 fi - name: Upload artifact diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 427441f330..a6b2ca34e8 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -1,4 +1,4 @@ -name: benchmarking +name: Benchmarking on: # uncomment to run on push for debugging your PR @@ -15,6 +15,15 @@ on: workflow_dispatch: # adds ability to run this manually +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow per any non-`main` branch. 
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true + jobs: bench: # this workflow runs on self hosteed runner @@ -60,7 +69,6 @@ jobs: - name: Setup cluster env: BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" - shell: bash -euxo pipefail {0} run: | set -e @@ -96,7 +104,9 @@ jobs: # since it might generate duplicates when calling ingest_perf_test_result.py rm -rf perf-report-staging mkdir -p perf-report-staging - ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600 + # Set --sparse-ordering option of pytest-order plugin to ensure tests are running in order of appears in the file, + # it's important for test_perf_pgbench.py::test_pgbench_remote_* tests + ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600 - name: Submit result env: @@ -113,3 +123,106 @@ jobs: slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + pgbench-compare: + env: + TEST_PG_BENCH_DURATIONS_MATRIX: "60m" + TEST_PG_BENCH_SCALES_MATRIX: "10gb" + REMOTE_ENV: "1" + POSTGRES_DISTRIB_DIR: /usr + TEST_OUTPUT: /tmp/test_output + + strategy: + fail-fast: false + matrix: + connstr: [ BENCHMARK_CAPTEST_CONNSTR, BENCHMARK_RDS_CONNSTR ] + + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2817580636 + + timeout-minutes: 360 # 6h + + steps: + - uses: actions/checkout@v3 + + - name: Cache poetry deps + id: cache_poetry + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + run: ./scripts/pysync + + - name: Calculate platform + id: calculate-platform + env: + CONNSTR: ${{ matrix.connstr }} + run: | + if [ "${CONNSTR}" = "BENCHMARK_CAPTEST_CONNSTR" ]; then + PLATFORM=neon-captest + elif [ "${CONNSTR}" = "BENCHMARK_RDS_CONNSTR" ]; then + PLATFORM=rds-aurora + else + echo 2>&1 "Unknown CONNSTR=${CONNSTR}. 
Allowed are BENCHMARK_CAPTEST_CONNSTR, and BENCHMARK_RDS_CONNSTR only" + exit 1 + fi + + echo "::set-output name=PLATFORM::${PLATFORM}" + + - name: Install Deps + run: | + echo "deb http://apt.postgresql.org/pub/repos/apt focal-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list + wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - + sudo apt -y update + sudo apt install -y postgresql-14 postgresql-client-14 + + - name: Benchmark init + env: + PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} + BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + run: | + mkdir -p perf-report-captest + + psql $BENCHMARK_CONNSTR -c "SELECT 1;" + ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_init -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600 + + - name: Benchmark simple-update + env: + PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} + BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + run: | + psql $BENCHMARK_CONNSTR -c "SELECT 1;" + ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_simple_update -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600 + + - name: Benchmark select-only + env: + PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} + BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + run: | + psql $BENCHMARK_CONNSTR -c "SELECT 1;" + ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_select_only -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600 + + - name: Submit result + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + run: | + REPORT_FROM=$(realpath perf-report-captest) REPORT_TO=staging scripts/generate_and_push_perf_report.sh + + - name: Upload logs + if: always() + uses: ./.github/actions/upload + with: + name: bench-captest-${{ steps.calculate-platform.outputs.PLATFORM }} + path: /tmp/test_output/ + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/poetry.lock b/poetry.lock index 2563054b0b..6ab6bb0e20 100644 --- a/poetry.lock +++ b/poetry.lock @@ -109,8 +109,8 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "boto3-stubs" -version = "1.24.41" -description = "Type annotations for boto3 1.24.41 generated with mypy-boto3-builder 7.10.2" +version = "1.24.46" +description = "Type annotations for boto3 1.24.46 generated with mypy-boto3-builder 7.11.3" category = "main" optional = false python-versions = ">=3.7" @@ -127,7 +127,7 @@ account = ["mypy-boto3-account (>=1.24.0,<1.25.0)"] acm = ["mypy-boto3-acm (>=1.24.0,<1.25.0)"] acm-pca = ["mypy-boto3-acm-pca (>=1.24.0,<1.25.0)"] alexaforbusiness = ["mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)"] -all = ["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)", "mypy-boto3-account (>=1.24.0,<1.25.0)", "mypy-boto3-acm (>=1.24.0,<1.25.0)", "mypy-boto3-acm-pca (>=1.24.0,<1.25.0)", "mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)", "mypy-boto3-amp (>=1.24.0,<1.25.0)", "mypy-boto3-amplify (>=1.24.0,<1.25.0)", 
"mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)", "mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)", "mypy-boto3-apigateway (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)", "mypy-boto3-appconfig (>=1.24.0,<1.25.0)", "mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)", "mypy-boto3-appflow (>=1.24.0,<1.25.0)", "mypy-boto3-appintegrations (>=1.24.0,<1.25.0)", "mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-application-insights (>=1.24.0,<1.25.0)", "mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-appmesh (>=1.24.0,<1.25.0)", "mypy-boto3-apprunner (>=1.24.0,<1.25.0)", "mypy-boto3-appstream (>=1.24.0,<1.25.0)", "mypy-boto3-appsync (>=1.24.0,<1.25.0)", "mypy-boto3-athena (>=1.24.0,<1.25.0)", "mypy-boto3-auditmanager (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)", "mypy-boto3-backup (>=1.24.0,<1.25.0)", "mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)", "mypy-boto3-batch (>=1.24.0,<1.25.0)", "mypy-boto3-billingconductor (>=1.24.0,<1.25.0)", "mypy-boto3-braket (>=1.24.0,<1.25.0)", "mypy-boto3-budgets (>=1.24.0,<1.25.0)", "mypy-boto3-ce (>=1.24.0,<1.25.0)", "mypy-boto3-chime (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)", "mypy-boto3-cloud9 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)", "mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)", "mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-cloudfront (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)", "mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)", "mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)", "mypy-boto3-codeartifact (>=1.24.0,<1.25.0)", "mypy-boto3-codebuild (>=1.24.0,<1.25.0)", "mypy-boto3-codecommit (>=1.24.0,<1.25.0)", "mypy-boto3-codedeploy (>=1.24.0,<1.25.0)", "mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)", "mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-codepipeline (>=1.24.0,<1.25.0)", "mypy-boto3-codestar (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)", "mypy-boto3-comprehend (>=1.24.0,<1.25.0)", "mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)", "mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)", "mypy-boto3-config (>=1.24.0,<1.25.0)", "mypy-boto3-connect (>=1.24.0,<1.25.0)", "mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)", "mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)", "mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)", "mypy-boto3-cur (>=1.24.0,<1.25.0)", "mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)", "mypy-boto3-databrew (>=1.24.0,<1.25.0)", "mypy-boto3-dataexchange (>=1.24.0,<1.25.0)", "mypy-boto3-datapipeline (>=1.24.0,<1.25.0)", "mypy-boto3-datasync (>=1.24.0,<1.25.0)", "mypy-boto3-dax (>=1.24.0,<1.25.0)", "mypy-boto3-detective (>=1.24.0,<1.25.0)", "mypy-boto3-devicefarm (>=1.24.0,<1.25.0)", "mypy-boto3-devops-guru (>=1.24.0,<1.25.0)", "mypy-boto3-directconnect (>=1.24.0,<1.25.0)", "mypy-boto3-discovery (>=1.24.0,<1.25.0)", "mypy-boto3-dlm (>=1.24.0,<1.25.0)", 
"mypy-boto3-dms (>=1.24.0,<1.25.0)", "mypy-boto3-docdb (>=1.24.0,<1.25.0)", "mypy-boto3-drs (>=1.24.0,<1.25.0)", "mypy-boto3-ds (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)", "mypy-boto3-ebs (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)", "mypy-boto3-ecr (>=1.24.0,<1.25.0)", "mypy-boto3-ecr-public (>=1.24.0,<1.25.0)", "mypy-boto3-ecs (>=1.24.0,<1.25.0)", "mypy-boto3-efs (>=1.24.0,<1.25.0)", "mypy-boto3-eks (>=1.24.0,<1.25.0)", "mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)", "mypy-boto3-elasticache (>=1.24.0,<1.25.0)", "mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)", "mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)", "mypy-boto3-elb (>=1.24.0,<1.25.0)", "mypy-boto3-elbv2 (>=1.24.0,<1.25.0)", "mypy-boto3-emr (>=1.24.0,<1.25.0)", "mypy-boto3-emr-containers (>=1.24.0,<1.25.0)", "mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-es (>=1.24.0,<1.25.0)", "mypy-boto3-events (>=1.24.0,<1.25.0)", "mypy-boto3-evidently (>=1.24.0,<1.25.0)", "mypy-boto3-finspace (>=1.24.0,<1.25.0)", "mypy-boto3-finspace-data (>=1.24.0,<1.25.0)", "mypy-boto3-firehose (>=1.24.0,<1.25.0)", "mypy-boto3-fis (>=1.24.0,<1.25.0)", "mypy-boto3-fms (>=1.24.0,<1.25.0)", "mypy-boto3-forecast (>=1.24.0,<1.25.0)", "mypy-boto3-forecastquery (>=1.24.0,<1.25.0)", "mypy-boto3-frauddetector (>=1.24.0,<1.25.0)", "mypy-boto3-fsx (>=1.24.0,<1.25.0)", "mypy-boto3-gamelift (>=1.24.0,<1.25.0)", "mypy-boto3-gamesparks (>=1.24.0,<1.25.0)", "mypy-boto3-glacier (>=1.24.0,<1.25.0)", "mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)", "mypy-boto3-glue (>=1.24.0,<1.25.0)", "mypy-boto3-grafana (>=1.24.0,<1.25.0)", "mypy-boto3-greengrass (>=1.24.0,<1.25.0)", "mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)", "mypy-boto3-groundstation (>=1.24.0,<1.25.0)", "mypy-boto3-guardduty (>=1.24.0,<1.25.0)", "mypy-boto3-health (>=1.24.0,<1.25.0)", "mypy-boto3-healthlake (>=1.24.0,<1.25.0)", "mypy-boto3-honeycode (>=1.24.0,<1.25.0)", "mypy-boto3-iam (>=1.24.0,<1.25.0)", "mypy-boto3-identitystore (>=1.24.0,<1.25.0)", "mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)", "mypy-boto3-importexport (>=1.24.0,<1.25.0)", "mypy-boto3-inspector (>=1.24.0,<1.25.0)", "mypy-boto3-inspector2 (>=1.24.0,<1.25.0)", "mypy-boto3-iot (>=1.24.0,<1.25.0)", "mypy-boto3-iot-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)", "mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)", "mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)", "mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)", "mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)", "mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)", "mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)", "mypy-boto3-iotwireless (>=1.24.0,<1.25.0)", "mypy-boto3-ivs (>=1.24.0,<1.25.0)", "mypy-boto3-ivschat (>=1.24.0,<1.25.0)", "mypy-boto3-kafka (>=1.24.0,<1.25.0)", "mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-kendra (>=1.24.0,<1.25.0)", "mypy-boto3-keyspaces (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)", 
"mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)", "mypy-boto3-kms (>=1.24.0,<1.25.0)", "mypy-boto3-lakeformation (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-lex-models (>=1.24.0,<1.25.0)", "mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager (>=1.24.0,<1.25.0)", "mypy-boto3-lightsail (>=1.24.0,<1.25.0)", "mypy-boto3-location (>=1.24.0,<1.25.0)", "mypy-boto3-logs (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)", "mypy-boto3-m2 (>=1.24.0,<1.25.0)", "mypy-boto3-machinelearning (>=1.24.0,<1.25.0)", "mypy-boto3-macie (>=1.24.0,<1.25.0)", "mypy-boto3-macie2 (>=1.24.0,<1.25.0)", "mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)", "mypy-boto3-medialive (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)", "mypy-boto3-mediatailor (>=1.24.0,<1.25.0)", "mypy-boto3-memorydb (>=1.24.0,<1.25.0)", "mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)", "mypy-boto3-mgh (>=1.24.0,<1.25.0)", "mypy-boto3-mgn (>=1.24.0,<1.25.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)", "mypy-boto3-mobile (>=1.24.0,<1.25.0)", "mypy-boto3-mq (>=1.24.0,<1.25.0)", "mypy-boto3-mturk (>=1.24.0,<1.25.0)", "mypy-boto3-mwaa (>=1.24.0,<1.25.0)", "mypy-boto3-neptune (>=1.24.0,<1.25.0)", "mypy-boto3-network-firewall (>=1.24.0,<1.25.0)", "mypy-boto3-networkmanager (>=1.24.0,<1.25.0)", "mypy-boto3-nimble (>=1.24.0,<1.25.0)", "mypy-boto3-opensearch (>=1.24.0,<1.25.0)", "mypy-boto3-opsworks (>=1.24.0,<1.25.0)", "mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)", "mypy-boto3-organizations (>=1.24.0,<1.25.0)", "mypy-boto3-outposts (>=1.24.0,<1.25.0)", "mypy-boto3-panorama (>=1.24.0,<1.25.0)", "mypy-boto3-personalize (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-events (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-pi (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)", "mypy-boto3-polly (>=1.24.0,<1.25.0)", "mypy-boto3-pricing (>=1.24.0,<1.25.0)", "mypy-boto3-proton (>=1.24.0,<1.25.0)", "mypy-boto3-qldb (>=1.24.0,<1.25.0)", "mypy-boto3-qldb-session (>=1.24.0,<1.25.0)", "mypy-boto3-quicksight (>=1.24.0,<1.25.0)", "mypy-boto3-ram (>=1.24.0,<1.25.0)", "mypy-boto3-rbin (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-rds-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-rekognition (>=1.24.0,<1.25.0)", "mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)", "mypy-boto3-resource-groups (>=1.24.0,<1.25.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)", "mypy-boto3-robomaker (>=1.24.0,<1.25.0)", "mypy-boto3-rolesanywhere 
(>=1.24.0,<1.25.0)", "mypy-boto3-route53 (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)", "mypy-boto3-route53domains (>=1.24.0,<1.25.0)", "mypy-boto3-route53resolver (>=1.24.0,<1.25.0)", "mypy-boto3-rum (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-s3control (>=1.24.0,<1.25.0)", "mypy-boto3-s3outposts (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-savingsplans (>=1.24.0,<1.25.0)", "mypy-boto3-schemas (>=1.24.0,<1.25.0)", "mypy-boto3-sdb (>=1.24.0,<1.25.0)", "mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)", "mypy-boto3-securityhub (>=1.24.0,<1.25.0)", "mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)", "mypy-boto3-service-quotas (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)", "mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)", "mypy-boto3-ses (>=1.24.0,<1.25.0)", "mypy-boto3-sesv2 (>=1.24.0,<1.25.0)", "mypy-boto3-shield (>=1.24.0,<1.25.0)", "mypy-boto3-signer (>=1.24.0,<1.25.0)", "mypy-boto3-sms (>=1.24.0,<1.25.0)", "mypy-boto3-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)", "mypy-boto3-snowball (>=1.24.0,<1.25.0)", "mypy-boto3-sns (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)", "mypy-boto3-ssm (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)", "mypy-boto3-sso (>=1.24.0,<1.25.0)", "mypy-boto3-sso-admin (>=1.24.0,<1.25.0)", "mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)", "mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)", "mypy-boto3-storagegateway (>=1.24.0,<1.25.0)", "mypy-boto3-sts (>=1.24.0,<1.25.0)", "mypy-boto3-support (>=1.24.0,<1.25.0)", "mypy-boto3-swf (>=1.24.0,<1.25.0)", "mypy-boto3-synthetics (>=1.24.0,<1.25.0)", "mypy-boto3-textract (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-query (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-write (>=1.24.0,<1.25.0)", "mypy-boto3-transcribe (>=1.24.0,<1.25.0)", "mypy-boto3-transfer (>=1.24.0,<1.25.0)", "mypy-boto3-translate (>=1.24.0,<1.25.0)", "mypy-boto3-voice-id (>=1.24.0,<1.25.0)", "mypy-boto3-waf (>=1.24.0,<1.25.0)", "mypy-boto3-waf-regional (>=1.24.0,<1.25.0)", "mypy-boto3-wafv2 (>=1.24.0,<1.25.0)", "mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)", "mypy-boto3-wisdom (>=1.24.0,<1.25.0)", "mypy-boto3-workdocs (>=1.24.0,<1.25.0)", "mypy-boto3-worklink (>=1.24.0,<1.25.0)", "mypy-boto3-workmail (>=1.24.0,<1.25.0)", "mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)", "mypy-boto3-xray (>=1.24.0,<1.25.0)"] +all = ["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)", "mypy-boto3-account (>=1.24.0,<1.25.0)", "mypy-boto3-acm (>=1.24.0,<1.25.0)", "mypy-boto3-acm-pca (>=1.24.0,<1.25.0)", "mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)", "mypy-boto3-amp (>=1.24.0,<1.25.0)", "mypy-boto3-amplify (>=1.24.0,<1.25.0)", "mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)", "mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)", "mypy-boto3-apigateway (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)", "mypy-boto3-appconfig 
(>=1.24.0,<1.25.0)", "mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)", "mypy-boto3-appflow (>=1.24.0,<1.25.0)", "mypy-boto3-appintegrations (>=1.24.0,<1.25.0)", "mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-application-insights (>=1.24.0,<1.25.0)", "mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-appmesh (>=1.24.0,<1.25.0)", "mypy-boto3-apprunner (>=1.24.0,<1.25.0)", "mypy-boto3-appstream (>=1.24.0,<1.25.0)", "mypy-boto3-appsync (>=1.24.0,<1.25.0)", "mypy-boto3-athena (>=1.24.0,<1.25.0)", "mypy-boto3-auditmanager (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)", "mypy-boto3-backup (>=1.24.0,<1.25.0)", "mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)", "mypy-boto3-batch (>=1.24.0,<1.25.0)", "mypy-boto3-billingconductor (>=1.24.0,<1.25.0)", "mypy-boto3-braket (>=1.24.0,<1.25.0)", "mypy-boto3-budgets (>=1.24.0,<1.25.0)", "mypy-boto3-ce (>=1.24.0,<1.25.0)", "mypy-boto3-chime (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)", "mypy-boto3-cloud9 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)", "mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)", "mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-cloudfront (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)", "mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)", "mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)", "mypy-boto3-codeartifact (>=1.24.0,<1.25.0)", "mypy-boto3-codebuild (>=1.24.0,<1.25.0)", "mypy-boto3-codecommit (>=1.24.0,<1.25.0)", "mypy-boto3-codedeploy (>=1.24.0,<1.25.0)", "mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)", "mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-codepipeline (>=1.24.0,<1.25.0)", "mypy-boto3-codestar (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)", "mypy-boto3-comprehend (>=1.24.0,<1.25.0)", "mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)", "mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)", "mypy-boto3-config (>=1.24.0,<1.25.0)", "mypy-boto3-connect (>=1.24.0,<1.25.0)", "mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)", "mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)", "mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)", "mypy-boto3-cur (>=1.24.0,<1.25.0)", "mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)", "mypy-boto3-databrew (>=1.24.0,<1.25.0)", "mypy-boto3-dataexchange (>=1.24.0,<1.25.0)", "mypy-boto3-datapipeline (>=1.24.0,<1.25.0)", "mypy-boto3-datasync (>=1.24.0,<1.25.0)", "mypy-boto3-dax (>=1.24.0,<1.25.0)", "mypy-boto3-detective (>=1.24.0,<1.25.0)", "mypy-boto3-devicefarm (>=1.24.0,<1.25.0)", "mypy-boto3-devops-guru (>=1.24.0,<1.25.0)", "mypy-boto3-directconnect (>=1.24.0,<1.25.0)", "mypy-boto3-discovery (>=1.24.0,<1.25.0)", "mypy-boto3-dlm (>=1.24.0,<1.25.0)", "mypy-boto3-dms (>=1.24.0,<1.25.0)", "mypy-boto3-docdb (>=1.24.0,<1.25.0)", "mypy-boto3-drs (>=1.24.0,<1.25.0)", "mypy-boto3-ds (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)", "mypy-boto3-ebs 
(>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)", "mypy-boto3-ecr (>=1.24.0,<1.25.0)", "mypy-boto3-ecr-public (>=1.24.0,<1.25.0)", "mypy-boto3-ecs (>=1.24.0,<1.25.0)", "mypy-boto3-efs (>=1.24.0,<1.25.0)", "mypy-boto3-eks (>=1.24.0,<1.25.0)", "mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)", "mypy-boto3-elasticache (>=1.24.0,<1.25.0)", "mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)", "mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)", "mypy-boto3-elb (>=1.24.0,<1.25.0)", "mypy-boto3-elbv2 (>=1.24.0,<1.25.0)", "mypy-boto3-emr (>=1.24.0,<1.25.0)", "mypy-boto3-emr-containers (>=1.24.0,<1.25.0)", "mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-es (>=1.24.0,<1.25.0)", "mypy-boto3-events (>=1.24.0,<1.25.0)", "mypy-boto3-evidently (>=1.24.0,<1.25.0)", "mypy-boto3-finspace (>=1.24.0,<1.25.0)", "mypy-boto3-finspace-data (>=1.24.0,<1.25.0)", "mypy-boto3-firehose (>=1.24.0,<1.25.0)", "mypy-boto3-fis (>=1.24.0,<1.25.0)", "mypy-boto3-fms (>=1.24.0,<1.25.0)", "mypy-boto3-forecast (>=1.24.0,<1.25.0)", "mypy-boto3-forecastquery (>=1.24.0,<1.25.0)", "mypy-boto3-frauddetector (>=1.24.0,<1.25.0)", "mypy-boto3-fsx (>=1.24.0,<1.25.0)", "mypy-boto3-gamelift (>=1.24.0,<1.25.0)", "mypy-boto3-gamesparks (>=1.24.0,<1.25.0)", "mypy-boto3-glacier (>=1.24.0,<1.25.0)", "mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)", "mypy-boto3-glue (>=1.24.0,<1.25.0)", "mypy-boto3-grafana (>=1.24.0,<1.25.0)", "mypy-boto3-greengrass (>=1.24.0,<1.25.0)", "mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)", "mypy-boto3-groundstation (>=1.24.0,<1.25.0)", "mypy-boto3-guardduty (>=1.24.0,<1.25.0)", "mypy-boto3-health (>=1.24.0,<1.25.0)", "mypy-boto3-healthlake (>=1.24.0,<1.25.0)", "mypy-boto3-honeycode (>=1.24.0,<1.25.0)", "mypy-boto3-iam (>=1.24.0,<1.25.0)", "mypy-boto3-identitystore (>=1.24.0,<1.25.0)", "mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)", "mypy-boto3-importexport (>=1.24.0,<1.25.0)", "mypy-boto3-inspector (>=1.24.0,<1.25.0)", "mypy-boto3-inspector2 (>=1.24.0,<1.25.0)", "mypy-boto3-iot (>=1.24.0,<1.25.0)", "mypy-boto3-iot-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)", "mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)", "mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)", "mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)", "mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)", "mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)", "mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)", "mypy-boto3-iotwireless (>=1.24.0,<1.25.0)", "mypy-boto3-ivs (>=1.24.0,<1.25.0)", "mypy-boto3-ivschat (>=1.24.0,<1.25.0)", "mypy-boto3-kafka (>=1.24.0,<1.25.0)", "mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-kendra (>=1.24.0,<1.25.0)", "mypy-boto3-keyspaces (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)", "mypy-boto3-kms (>=1.24.0,<1.25.0)", "mypy-boto3-lakeformation (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-lex-models (>=1.24.0,<1.25.0)", "mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)", 
"mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)", "mypy-boto3-lightsail (>=1.24.0,<1.25.0)", "mypy-boto3-location (>=1.24.0,<1.25.0)", "mypy-boto3-logs (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)", "mypy-boto3-m2 (>=1.24.0,<1.25.0)", "mypy-boto3-machinelearning (>=1.24.0,<1.25.0)", "mypy-boto3-macie (>=1.24.0,<1.25.0)", "mypy-boto3-macie2 (>=1.24.0,<1.25.0)", "mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)", "mypy-boto3-medialive (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)", "mypy-boto3-mediatailor (>=1.24.0,<1.25.0)", "mypy-boto3-memorydb (>=1.24.0,<1.25.0)", "mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)", "mypy-boto3-mgh (>=1.24.0,<1.25.0)", "mypy-boto3-mgn (>=1.24.0,<1.25.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)", "mypy-boto3-mobile (>=1.24.0,<1.25.0)", "mypy-boto3-mq (>=1.24.0,<1.25.0)", "mypy-boto3-mturk (>=1.24.0,<1.25.0)", "mypy-boto3-mwaa (>=1.24.0,<1.25.0)", "mypy-boto3-neptune (>=1.24.0,<1.25.0)", "mypy-boto3-network-firewall (>=1.24.0,<1.25.0)", "mypy-boto3-networkmanager (>=1.24.0,<1.25.0)", "mypy-boto3-nimble (>=1.24.0,<1.25.0)", "mypy-boto3-opensearch (>=1.24.0,<1.25.0)", "mypy-boto3-opsworks (>=1.24.0,<1.25.0)", "mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)", "mypy-boto3-organizations (>=1.24.0,<1.25.0)", "mypy-boto3-outposts (>=1.24.0,<1.25.0)", "mypy-boto3-panorama (>=1.24.0,<1.25.0)", "mypy-boto3-personalize (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-events (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-pi (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)", "mypy-boto3-polly (>=1.24.0,<1.25.0)", "mypy-boto3-pricing (>=1.24.0,<1.25.0)", "mypy-boto3-proton (>=1.24.0,<1.25.0)", "mypy-boto3-qldb (>=1.24.0,<1.25.0)", "mypy-boto3-qldb-session (>=1.24.0,<1.25.0)", "mypy-boto3-quicksight (>=1.24.0,<1.25.0)", "mypy-boto3-ram (>=1.24.0,<1.25.0)", "mypy-boto3-rbin (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-rds-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-rekognition (>=1.24.0,<1.25.0)", "mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)", "mypy-boto3-resource-groups (>=1.24.0,<1.25.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)", "mypy-boto3-robomaker (>=1.24.0,<1.25.0)", "mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)", "mypy-boto3-route53 (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)", 
"mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)", "mypy-boto3-route53domains (>=1.24.0,<1.25.0)", "mypy-boto3-route53resolver (>=1.24.0,<1.25.0)", "mypy-boto3-rum (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-s3control (>=1.24.0,<1.25.0)", "mypy-boto3-s3outposts (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-savingsplans (>=1.24.0,<1.25.0)", "mypy-boto3-schemas (>=1.24.0,<1.25.0)", "mypy-boto3-sdb (>=1.24.0,<1.25.0)", "mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)", "mypy-boto3-securityhub (>=1.24.0,<1.25.0)", "mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)", "mypy-boto3-service-quotas (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)", "mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)", "mypy-boto3-ses (>=1.24.0,<1.25.0)", "mypy-boto3-sesv2 (>=1.24.0,<1.25.0)", "mypy-boto3-shield (>=1.24.0,<1.25.0)", "mypy-boto3-signer (>=1.24.0,<1.25.0)", "mypy-boto3-sms (>=1.24.0,<1.25.0)", "mypy-boto3-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)", "mypy-boto3-snowball (>=1.24.0,<1.25.0)", "mypy-boto3-sns (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)", "mypy-boto3-ssm (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)", "mypy-boto3-sso (>=1.24.0,<1.25.0)", "mypy-boto3-sso-admin (>=1.24.0,<1.25.0)", "mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)", "mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)", "mypy-boto3-storagegateway (>=1.24.0,<1.25.0)", "mypy-boto3-sts (>=1.24.0,<1.25.0)", "mypy-boto3-support (>=1.24.0,<1.25.0)", "mypy-boto3-swf (>=1.24.0,<1.25.0)", "mypy-boto3-synthetics (>=1.24.0,<1.25.0)", "mypy-boto3-textract (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-query (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-write (>=1.24.0,<1.25.0)", "mypy-boto3-transcribe (>=1.24.0,<1.25.0)", "mypy-boto3-transfer (>=1.24.0,<1.25.0)", "mypy-boto3-translate (>=1.24.0,<1.25.0)", "mypy-boto3-voice-id (>=1.24.0,<1.25.0)", "mypy-boto3-waf (>=1.24.0,<1.25.0)", "mypy-boto3-waf-regional (>=1.24.0,<1.25.0)", "mypy-boto3-wafv2 (>=1.24.0,<1.25.0)", "mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)", "mypy-boto3-wisdom (>=1.24.0,<1.25.0)", "mypy-boto3-workdocs (>=1.24.0,<1.25.0)", "mypy-boto3-worklink (>=1.24.0,<1.25.0)", "mypy-boto3-workmail (>=1.24.0,<1.25.0)", "mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)", "mypy-boto3-xray (>=1.24.0,<1.25.0)"] amp = ["mypy-boto3-amp (>=1.24.0,<1.25.0)"] amplify = ["mypy-boto3-amplify (>=1.24.0,<1.25.0)"] amplifybackend = ["mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)"] @@ -298,6 +298,7 @@ lex-runtime = ["mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)"] lexv2-models = ["mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)"] lexv2-runtime = ["mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)"] license-manager = ["mypy-boto3-license-manager (>=1.24.0,<1.25.0)"] +license-manager-user-subscriptions = ["mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)"] lightsail = ["mypy-boto3-lightsail (>=1.24.0,<1.25.0)"] location = ["mypy-boto3-location (>=1.24.0,<1.25.0)"] logs = ["mypy-boto3-logs (>=1.24.0,<1.25.0)"] @@ -813,7 +814,7 @@ python-versions = "*" [[package]] name 
= "moto" -version = "3.1.16" +version = "3.1.17" description = "A library that allows your python tests to easily mock out the boto library" category = "main" optional = false @@ -843,7 +844,7 @@ PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" responses = ">=0.9.0" sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} -werkzeug = ">=0.5" +werkzeug = ">=0.5,<2.2.0" xmltodict = "*" [package.extras] @@ -982,8 +983,8 @@ optional = false python-versions = ">=3.6" [package.extras] -testing = ["pytest-benchmark", "pytest"] -dev = ["tox", "pre-commit"] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] [[package]] name = "prometheus-client" @@ -1056,10 +1057,10 @@ python-versions = ">=3.6" cryptography = {version = ">=3.3.1", optional = true, markers = "extra == \"crypto\""} [package.extras] -tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] -docs = ["zope.interface", "sphinx-rtd-theme", "sphinx"] -dev = ["pre-commit", "mypy", "coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)", "cryptography (>=3.3.1)", "zope.interface", "sphinx-rtd-theme", "sphinx"] crypto = ["cryptography (>=3.3.1)"] +dev = ["sphinx", "sphinx-rtd-theme", "zope.interface", "cryptography (>=3.3.1)", "pytest (>=6.0.0,<7.0.0)", "coverage[toml] (==5.0.4)", "mypy", "pre-commit"] +docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] +tests = ["pytest (>=6.0.0,<7.0.0)", "coverage[toml] (==5.0.4)"] [[package]] name = "pyparsing" @@ -1135,6 +1136,20 @@ python-versions = "*" [package.dependencies] pytest = ">=3.2.5" +[[package]] +name = "pytest-order" +version = "1.0.1" +description = "pytest plugin to run your tests in a specific order" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pytest = [ + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, + {version = ">=5.0", markers = "python_version < \"3.10\""}, +] + [[package]] name = "pytest-timeout" version = "2.1.0" @@ -1446,7 +1461,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "5f7be77c7757a27bae28d39f31cd6f3a7a04e9dab53a200a6021a5af8ad02f37" +content-hash = "e58b30774603aa0f31579899a6c78579329c580f2f4bbaec209b0f9d52079fc6" [metadata.files] aiopg = [ @@ -1497,8 +1512,8 @@ boto3 = [ {file = "boto3-1.24.38.tar.gz", hash = "sha256:f4c6b025f392c934338c7f01badfddbd0d3cf2397ff5df35c31409798dce33f5"}, ] boto3-stubs = [ - {file = "boto3-stubs-1.24.41.tar.gz", hash = "sha256:8655d64981a7202aeb46a56a893ddcd23f59013894792e0e9a6f5350f7012674"}, - {file = "boto3_stubs-1.24.41-py3-none-any.whl", hash = "sha256:4579b2d28c5a0cd7d36a36cbdfcc872695f88eeaeadc8092f0b058049e9e08c7"}, + {file = "boto3-stubs-1.24.46.tar.gz", hash = "sha256:9482238ed9ea7794e6e66a41376bf75d5950f0328de09fac9d224906dcc624ef"}, + {file = "boto3_stubs-1.24.46-py3-none-any.whl", hash = "sha256:3aa84f2925b4b50b7f47ac41a11ac05302e744cdf460cb7bcf6488319393d8a4"}, ] botocore = [ {file = "botocore-1.27.38-py3-none-any.whl", hash = "sha256:46a0264ff3335496bd9cb404f83ec0d8eb7bfdef8f74a830c13e6a6b9612adea"}, @@ -1748,8 +1763,8 @@ mccabe = [ {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, ] moto = [ - {file = "moto-3.1.16-py3-none-any.whl", hash = "sha256:8bb8e267d9b948509d4739d81d995615a193d2c459f5c0a979aaeb0d3bd4b381"}, - {file = "moto-3.1.16.tar.gz", hash = 
"sha256:cbe8ad8a949f519771e5d25b670738604757fb67cd474d75d14c20677582e81f"}, + {file = "moto-3.1.17-py3-none-any.whl", hash = "sha256:84797321fad9a9e924c1c0385b302c80ec23429724c016b504f4bfca9d40d33a"}, + {file = "moto-3.1.17.tar.gz", hash = "sha256:f2e5b32e8910c51c0b0de5b73f902bc53e06fb1c1d077d2b848d27e0b0cbe65e"}, ] mypy = [ {file = "mypy-0.971-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f2899a3cbd394da157194f913a931edfd4be5f274a88041c9dc2d9cdcb1c315c"}, @@ -1948,6 +1963,10 @@ pytest-lazy-fixture = [ {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, ] +pytest-order = [ + {file = "pytest-order-1.0.1.tar.gz", hash = "sha256:5dd6b929fbd7eaa6d0ee07586f65c623babb0afe72b4843c5f15055d6b3b1b1f"}, + {file = "pytest_order-1.0.1-py3-none-any.whl", hash = "sha256:bbe6e63a8e23741ab3e810d458d1ea7317e797b70f9550512d77d6e9e8fd1bbb"}, +] pytest-timeout = [ {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, diff --git a/pyproject.toml b/pyproject.toml index da47ecefaf..8a3d22f088 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" Werkzeug = "2.1.2" +pytest-order = "^1.0.1" [tool.poetry.dev-dependencies] yapf = "==0.31.0" diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 3a679cc705..cca4f7ce17 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -1,23 +1,21 @@ +import calendar import dataclasses +import enum import json import os -from pathlib import Path import re -import subprocess import timeit -import calendar -import enum -from datetime import datetime import uuid +import warnings +from contextlib import contextmanager +from datetime import datetime +from pathlib import Path +# Type-related stuff +from typing import Iterator, Optional + import pytest from _pytest.config import Config from _pytest.terminal import TerminalReporter -import warnings - -from contextlib import contextmanager - -# Type-related stuff -from typing import Iterator, Optional """ This file contains fixtures for micro-benchmarks. @@ -77,7 +75,7 @@ class PgBenchRunResult: # we know significant parts of these values from test input # but to be precise take them from output - for line in stdout.splitlines(): + for line in stdout_lines: # scaling factor: 5 if line.startswith("scaling factor:"): scale = int(line.split()[-1]) @@ -131,6 +129,58 @@ class PgBenchRunResult: ) +@dataclasses.dataclass +class PgBenchInitResult: + total: float + drop_tables: Optional[float] + create_tables: Optional[float] + client_side_generate: Optional[float] + vacuum: Optional[float] + primary_keys: Optional[float] + duration: float + start_timestamp: int + end_timestamp: int + + @classmethod + def parse_from_stderr( + cls, + stderr: str, + duration: float, + start_timestamp: int, + end_timestamp: int, + ): + # Parses pgbench initialize output for default initialization steps (dtgvp) + # Example: done in 5.66 s (drop tables 0.05 s, create tables 0.31 s, client-side generate 2.01 s, vacuum 0.53 s, primary keys 0.38 s). 
+ + last_line = stderr.splitlines()[-1] + + regex = re.compile(r"done in (\d+\.\d+) s " + r"\(" + r"(?:drop tables (\d+\.\d+) s)?(?:, )?" + r"(?:create tables (\d+\.\d+) s)?(?:, )?" + r"(?:client-side generate (\d+\.\d+) s)?(?:, )?" + r"(?:vacuum (\d+\.\d+) s)?(?:, )?" + r"(?:primary keys (\d+\.\d+) s)?(?:, )?" + r"\)\.") + + if (m := regex.match(last_line)) is not None: + total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [float(v) for v in m.groups() if v is not None] + else: + raise RuntimeError(f"can't parse pgbench initialize results from `{last_line}`") + + return cls( + total=total, + drop_tables=drop_tables, + create_tables=create_tables, + client_side_generate=client_side_generate, + vacuum=vacuum, + primary_keys=primary_keys, + duration=duration, + start_timestamp=start_timestamp, + end_timestamp=end_timestamp, + ) + + @enum.unique class MetricReport(str, enum.Enum): # str is a hack to make it json serializable # this means that this is a constant test parameter @@ -232,6 +282,32 @@ class NeonBenchmarker: '', MetricReport.TEST_PARAM) + def record_pg_bench_init_result(self, prefix: str, result: PgBenchInitResult): + test_params = [ + "start_timestamp", + "end_timestamp", + ] + for test_param in test_params: + self.record(f"{prefix}.{test_param}", + getattr(result, test_param), + '', + MetricReport.TEST_PARAM) + + metrics = [ + "duration", + "drop_tables", + "create_tables", + "client_side_generate", + "vacuum", + "primary_keys", + ] + for metric in metrics: + if (value := getattr(result, metric)) is not None: + self.record(f"{prefix}.{metric}", + value, + unit="s", + report=MetricReport.LOWER_IS_BETTER) + def get_io_writes(self, pageserver) -> int: """ Fetch the "cumulative # of bytes written" metric from the pageserver diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 36b88e9485..a37d40014c 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -146,3 +146,12 @@ def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: key_parts = parts[0].split("-") lsn_parts = parts[1].split("-") return int(key_parts[0], 16), int(key_parts[1], 16), int(lsn_parts[0], 16), int(lsn_parts[1], 16) + + +def get_scale_for_db(size_mb: int) -> int: + """Returns pgbench scale factor for given target db size in MB. + + Ref https://www.cybertec-postgresql.com/en/a-formula-to-calculate-pgbench-scaling-factor-for-target-db-size/ + """ + + return round(0.06689 * size_mb - 0.5) diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 776565b679..8bac8080db 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -10,7 +10,7 @@ In the CI, the performance tests are run in the same environment as the other in ## Remote tests -There are a few tests that marked with `pytest.mark.remote_cluster`. These tests do not set up a local environment, and instead require a libpq connection string to connect to. So they can be run on any Postgres compatible database. Currently, the CI runs these tests our staging environment daily. Staging is not an isolated environment, so there can be noise in the results due to activity of other clusters. +There are a few tests that marked with `pytest.mark.remote_cluster`. These tests do not set up a local environment, and instead require a libpq connection string to connect to. So they can be run on any Postgres compatible database. Currently, the CI runs these tests on our staging and captest environments daily. 
Those are not an isolated environments, so there can be noise in the results due to activity of other clusters. ## Noise diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 8644ced6d9..89c510e76e 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -1,17 +1,23 @@ -from contextlib import closing -from fixtures.neon_fixtures import PgBin, VanillaPostgres, NeonEnv, profiling_supported -from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare - -from fixtures.benchmark_fixture import PgBenchRunResult, MetricReport, NeonBenchmarker -from fixtures.log_helper import log - -from pathlib import Path - -import pytest -from datetime import datetime import calendar +import enum import os import timeit +from datetime import datetime +from pathlib import Path +from typing import List + +import pytest +from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult +from fixtures.compare_fixtures import NeonCompare, PgCompare +from fixtures.neon_fixtures import profiling_supported +from fixtures.utils import get_scale_for_db + + +@enum.unique +class PgBenchLoadType(enum.Enum): + INIT = "init" + SIMPLE_UPDATE = "simple_update" + SELECT_ONLY = "select-only" def utc_now_timestamp() -> int: @@ -22,23 +28,24 @@ def init_pgbench(env: PgCompare, cmdline): # calculate timestamps and durations separately # timestamp is intended to be used for linking to grafana and logs # duration is actually a metric and uses float instead of int for timestamp - init_start_timestamp = utc_now_timestamp() + start_timestamp = utc_now_timestamp() t0 = timeit.default_timer() with env.record_pageserver_writes('init.pageserver_writes'): - env.pg_bin.run_capture(cmdline) + out = env.pg_bin.run_capture(cmdline) env.flush() - init_duration = timeit.default_timer() - t0 - init_end_timestamp = utc_now_timestamp() - env.zenbenchmark.record("init.duration", - init_duration, - unit="s", - report=MetricReport.LOWER_IS_BETTER) - env.zenbenchmark.record("init.start_timestamp", - init_start_timestamp, - '', - MetricReport.TEST_PARAM) - env.zenbenchmark.record("init.end_timestamp", init_end_timestamp, '', MetricReport.TEST_PARAM) + duration = timeit.default_timer() - t0 + end_timestamp = utc_now_timestamp() + + stderr = Path(f"{out}.stderr").read_text() + + res = PgBenchInitResult.parse_from_stderr( + stderr=stderr, + duration=duration, + start_timestamp=start_timestamp, + end_timestamp=end_timestamp, + ) + env.zenbenchmark.record_pg_bench_init_result("init", res) def run_pgbench(env: PgCompare, prefix: str, cmdline): @@ -70,38 +77,84 @@ def run_pgbench(env: PgCompare, prefix: str, cmdline): # the test database. 
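For a sense of the sizes involved, a small worked example of the get_scale_for_db() helper added in fixtures/utils.py above, which the size-suffixed TEST_PG_BENCH_SCALES_MATRIX entries (such as "10gb") are converted through; the numbers come from the linear formula only, not from measuring a real database:

    def get_scale_for_db(size_mb: int) -> int:
        # Same linear fit as the helper in fixtures/utils.py: one pgbench scale unit
        # is roughly 15 MB of data, per the referenced cybertec-postgresql.com formula.
        return round(0.06689 * size_mb - 0.5)

    # "1gb" and "10gb" scale-matrix entries resolve to these pgbench scales:
    assert get_scale_for_db(1 * 1024) == 68
    assert get_scale_for_db(10 * 1024) == 684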
# # Currently, the # of connections is hardcoded at 4 -def run_test_pgbench(env: PgCompare, scale: int, duration: int): - - # Record the scale and initialize +def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: PgBenchLoadType): env.zenbenchmark.record("scale", scale, '', MetricReport.TEST_PARAM) - init_pgbench(env, ['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) - # Run simple-update workload - run_pgbench(env, - "simple-update", ['pgbench', '-N', '-c4', f'-T{duration}', '-P2', env.pg.connstr()]) + if workload_type == PgBenchLoadType.INIT: + # Run initialize + init_pgbench( + env, ['pgbench', f'-s{scale}', '-i', env.pg.connstr(options='-cstatement_timeout=1h')]) - # Run SELECT workload - run_pgbench(env, - "select-only", ['pgbench', '-S', '-c4', f'-T{duration}', '-P2', env.pg.connstr()]) + if workload_type == PgBenchLoadType.SIMPLE_UPDATE: + # Run simple-update workload + run_pgbench(env, + "simple-update", + [ + 'pgbench', + '-N', + '-c4', + f'-T{duration}', + '-P2', + '--progress-timestamp', + env.pg.connstr(), + ]) + + if workload_type == PgBenchLoadType.SELECT_ONLY: + # Run SELECT workload + run_pgbench(env, + "select-only", + [ + 'pgbench', + '-S', + '-c4', + f'-T{duration}', + '-P2', + '--progress-timestamp', + env.pg.connstr(), + ]) env.report_size() -def get_durations_matrix(default: int = 45): +def get_durations_matrix(default: int = 45) -> List[int]: durations = os.getenv("TEST_PG_BENCH_DURATIONS_MATRIX", default=str(default)) - return list(map(int, durations.split(","))) + rv = [] + for d in durations.split(","): + d = d.strip().lower() + if d.endswith('h'): + duration = int(d.removesuffix('h')) * 60 * 60 + elif d.endswith('m'): + duration = int(d.removesuffix('m')) * 60 + else: + duration = int(d.removesuffix('s')) + rv.append(duration) + + return rv -def get_scales_matrix(default: int = 10): +def get_scales_matrix(default: int = 10) -> List[int]: scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX", default=str(default)) - return list(map(int, scales.split(","))) + rv = [] + for s in scales.split(","): + s = s.strip().lower() + if s.endswith('mb'): + scale = get_scale_for_db(int(s.removesuffix('mb'))) + elif s.endswith('gb'): + scale = get_scale_for_db(int(s.removesuffix('gb')) * 1024) + else: + scale = int(s) + rv.append(scale) + + return rv # Run the pgbench tests against vanilla Postgres and neon @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix()) def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int): - run_test_pgbench(neon_with_baseline, scale, duration) + run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.INIT) + run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.SIMPLE_UPDATE) + run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.SELECT_ONLY) # Run the pgbench tests, and generate a flamegraph from it @@ -123,12 +176,34 @@ profiling="page_requests" env = neon_env_builder.init_start() env.neon_cli.create_branch("empty", "main") - run_test_pgbench(NeonCompare(zenbenchmark, env, pg_bin, "pgbench"), scale, duration) + neon_compare = NeonCompare(zenbenchmark, env, pg_bin, "pgbench") + run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.INIT) + run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SIMPLE_UPDATE) + run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SELECT_ONLY) +# The following 3 tests run on an existing database as it was set up by previous tests, +# and leaves 
the database in a state that would be used in the next tests. +# Modifying the definition order of these functions or adding other remote tests in between will alter results. +# See usage of --sparse-ordering flag in the pytest invocation in the CI workflow +# # Run the pgbench tests against an existing Postgres cluster @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix()) @pytest.mark.remote_cluster -def test_pgbench_remote(remote_compare: PgCompare, scale: int, duration: int): - run_test_pgbench(remote_compare, scale, duration) +def test_pgbench_remote_init(remote_compare: PgCompare, scale: int, duration: int): + run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.INIT) + + +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_pgbench_remote_simple_update(remote_compare: PgCompare, scale: int, duration: int): + run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.SIMPLE_UPDATE) + + +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_pgbench_remote_select_only(remote_compare: PgCompare, scale: int, duration: int): + run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.SELECT_ONLY) From 18f251384d68b1d39fc885b4d3416f363e85fe27 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 16 Aug 2022 11:10:38 +0300 Subject: [PATCH 0628/1022] Check for entire range during sasl validation (#2281) --- proxy/src/scram/messages.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index f6e6133adf..05855e74df 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -14,7 +14,7 @@ pub const SCRAM_RAW_NONCE_LEN: usize = 18; fn validate_sasl_extensions<'a>(parts: impl Iterator) -> Option<()> { for mut chars in parts.map(|s| s.chars()) { let attr = chars.next()?; - if !('a'..'z').contains(&attr) && !('A'..'Z').contains(&attr) { + if !('a'..='z').contains(&attr) && !('A'..='Z').contains(&attr) { return None; } let eq = chars.next()?; From b8f0f37de22034363ac895104eebbe0ccb0bbfd8 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Tue, 16 Aug 2022 11:15:35 +0200 Subject: [PATCH 0629/1022] Gen2 GH runner (#2128) * Re-add rustup override * Try s3 bucket * Set git version * Use v4 cache key to prevent problems * Switch to v5 for key * Add second rustup fix * Rebase * Add kaniko steps * Fix typo and set compress level * Disable global run default * Specify shell for step * Change approach with kaniko * Try less verbose shell spec * Add submodule pull * Add promote step * Adjust dependency chain * Try default swap again * Use env * Don't override aws key * Make kaniko build conditional * Specify runs on * Try without dependency link * Try soft fail * Use image with git * Try passing to next step * Fix duplicate * Try other approach * Try other approach * Fix typo * Try other syntax * Set env * Adjust setup * Try step 1 * Add link * Try global env * Fix mistake * Debug * Try other syntax * Try other approach * Change order * Move output one step down * Put output up one level * Try other syntax * Skip build * Try output * Re-enable build * Try other syntax * Skip middle step * Update check * Try first step of dockerhub push * Update needs dependency * Try explicit dir * Add 
missing package * Try other approach * Try other approach * Specify region * Use with * Try other approach * Add debug * Try other approach * Set region * Follow AWS example * Try github approach * Skip Qemu * Try stdin * Missing steps * Add missing close * Add echo debug * Try v2 endpoint * Use v1 endpoint * Try without quotes * Revert * Try crane * Add debug * Split steps * Fix duplicate * Add shell step * Conform to options * Add verbose flag * Try single step * Try workaround * First request fails hunch * Try bullseye image * Try other approach * Adjust verbose level * Try previous step * Add more debug * Remove debug step * Remove rogue indent * Try with larger image * Add build tag step * Update workflow for testing * Add tag step for test * Remove unused * Update dependency chain * Add ownership fix * Use matrix for promote * Force update * Force build * Remove unused * Add new image * Add missing argument * Update dockerfile copy * Update Dockerfile * Update clone * Update dockerfile * Go to correct folder * Use correct format * Update dockerfile * Remove cd * Debug find where we are * Add debug on first step * Changedir to postgres * Set workdir * Use v1 approach * Use other dependency * Try other approach * Try other approach * Update dockerfile * Update approach * Update dockerfile * Update approach * Update dockerfile * Update dockerfile * Add workspace hack * Update Dockerfile * Update Dockerfile * Update Dockerfile * Change last step * Cleanup pull in prep for review * Force build images * Add condition for latest tagging * Use pinned version * Try without name value * Remove more names * Shorten names * Add kaniko comments * Pin kaniko * Pin crane and ecr helper * Up one level * Switch to pinned tag for rust image * Force update for test Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 312 +++++++++++++++------------ 1 file changed, 171 insertions(+), 141 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 635c6126cc..99859197a1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -7,10 +7,6 @@ on: - release pull_request: -defaults: - run: - shell: bash -euxo pipefail {0} - concurrency: # Allow only one workflow per any non-`main` branch. group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} @@ -23,7 +19,7 @@ env: jobs: build-neon: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned strategy: fail-fast: false matrix: @@ -35,7 +31,7 @@ jobs: GIT_VERSION: ${{ github.sha }} steps: - - name: Fix git ownerwhip + - name: Fix git ownership run: | # Workaround for `fatal: detected dubious ownership in repository at ...` # @@ -54,6 +50,7 @@ jobs: - name: Set pg revision for caching id: pg_ver run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres) + shell: bash -euxo pipefail {0} # Set some environment variables used by all the steps. 
# @@ -77,6 +74,7 @@ jobs: echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV + shell: bash -euxo pipefail {0} # Don't include the ~/.cargo/registry/src directory. It contains just # uncompressed versions of the crates in ~/.cargo/registry/cache @@ -93,8 +91,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- - name: Cache postgres build id: cache_pg @@ -106,14 +104,17 @@ jobs: - name: Build postgres if: steps.cache_pg.outputs.cache-hit != 'true' run: mold -run make postgres -j$(nproc) + shell: bash -euxo pipefail {0} - name: Run cargo build run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests + shell: bash -euxo pipefail {0} - name: Run cargo test run: | ${cov_prefix} cargo test $CARGO_FLAGS + shell: bash -euxo pipefail {0} - name: Install rust binaries run: | @@ -154,9 +155,11 @@ jobs: echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list done fi + shell: bash -euxo pipefail {0} - name: Install postgres binaries run: cp -a tmp_install /tmp/neon/pg_install + shell: bash -euxo pipefail {0} - name: Upload Neon artifact uses: ./.github/actions/upload @@ -171,7 +174,7 @@ jobs: pg_regress-tests: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned needs: [ build-neon ] strategy: fail-fast: false @@ -199,7 +202,7 @@ jobs: other-tests: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned needs: [ build-neon ] strategy: fail-fast: false @@ -230,7 +233,7 @@ jobs: benchmarks: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned needs: [ build-neon ] if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: @@ -261,7 +264,7 @@ jobs: coverage-report: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned needs: [ other-tests, pg_regress-tests ] strategy: fail-fast: false @@ -284,7 +287,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + key: v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download @@ -300,6 +303,7 @@ jobs: - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge + shell: bash -euxo pipefail {0} - name: Build and upload coverage report run: | @@ -332,9 +336,11 @@ jobs: \"description\": \"Coverage report is ready\", \"target_url\": \"$REPORT_URL\" }" + shell: bash -euxo pipefail {0} trigger-e2e-tests: - runs-on: [ 
self-hosted, Linux, k8s-runner ] + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned needs: [ build-neon ] steps: - name: Set PR's status to pending and request a remote CI test @@ -369,32 +375,111 @@ jobs: } }" - docker-image: - runs-on: [ self-hosted, Linux, k8s-runner ] - needs: [ pg_regress-tests, other-tests ] - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + dockerfile-check: + if: github.event_name != 'workflow_dispatch' + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest outputs: - build-tag: ${{steps.build-tag.outputs.tag}} + value: ${{ steps.dockerfile-check.outputs.any_changed }} steps: - name: Checkout uses: actions/checkout@v3 + + - name: Get specific changed files + id: dockerfile-check + uses: tj-actions/changed-files@802732316a11c01531ea72773ec7998155238e31 # v25 + with: + files: | + Dockerfile + Dockerfile.compute-tools + ./vendor/postgres/Dockerfile + + neon-image: + # force building for all 3 images + if: needs.dockerfile-check.outputs.value != 'true' + runs-on: dev + needs: [ dockerfile-check ] + container: gcr.io/kaniko-project/executor:v1.9.0-debug + environment: dev + + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko with: submodules: true fetch-depth: 0 - - name: Login to DockerHub - uses: docker/login-action@v1 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - with: - driver: docker + - name: Kaniko build console + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID + compute-tools-image: + if: needs.dockerfile-check.outputs.value != 'true' + runs-on: dev + needs: [ dockerfile-check ] + container: gcr.io/kaniko-project/executor:v1.9.0-debug + environment: dev + + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build console + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID + + compute-node-image: + if: needs.dockerfile-check.outputs.value != 'true' + runs-on: dev + needs: [ dockerfile-check ] + container: gcr.io/kaniko-project/executor:v1.9.0-debug + environment: dev + + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko + with: + submodules: true + fetch-depth: 0 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build console + working-directory: ./vendor/postgres/ + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . 
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID + + promote-images: + runs-on: dev + needs: [ neon-image, compute-tools-image, compute-node-image ] + if: github.event_name != 'workflow_dispatch' + container: amazon/aws-cli + strategy: + fail-fast: false + matrix: + name: [ neon, compute-tools, compute-node ] + + steps: + - name: Promote image to latest + run: + MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=$GITHUB_RUN_ID --query 'images[].imageManifest' --output text) && aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST" + + push-docker-hub: + runs-on: dev + needs: [ promote-images ] + container: golang:1.19-bullseye + environment: dev + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + - name: Get build tag run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then @@ -402,117 +487,48 @@ jobs: elif [[ "$GITHUB_REF_NAME" == "release" ]]; then echo "::set-output name=tag::release-$(git rev-list --count HEAD)" else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " + echo "::set-output name=tag::$GITHUB_RUN_ID" fi id: build-tag - - name: Get legacy build tag + - name: Configure ECR login run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::latest" - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release" - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - id: legacy-build-tag + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - name: Build neon Docker image - uses: docker/build-push-action@v2 - with: - context: . 
- build-args: | - GIT_VERSION="${{github.sha}}" - AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" - AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" - pull: true - push: true - tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}} + - name: Pull neon image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:latest neon - docker-image-compute: - runs-on: [ self-hosted, Linux, k8s-runner ] - needs: [ pg_regress-tests, other-tests ] - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - outputs: - build-tag: ${{steps.build-tag.outputs.tag}} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 + - name: Pull compute tools image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest compute-tools - - name: Login to DockerHub - uses: docker/login-action@v1 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - name: Pull compute node image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - with: - driver: docker - - - name: Get build tag + - name: Configure docker login run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::$(git rev-list --count HEAD)" - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release-$(git rev-list --count HEAD)" - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - id: build-tag + # ECR Credential Helper & Docker Hub don't work together in config, hence reset + echo "" > /github/home/.docker/config.json + crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io - - name: Get legacy build tag + - name: Push neon image to Docker Hub + run: crane push neon neondatabase/neon:${{steps.build-tag.outputs.tag}} + + - name: Push compute tools image to Docker Hub + run: crane push compute-tools neondatabase/compute-tools:${{steps.build-tag.outputs.tag}} + + - name: Push compute node image to Docker Hub + run: crane push compute-node neondatabase/compute-node:${{steps.build-tag.outputs.tag}} + + - name: Add latest tag to images + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::latest" - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release" - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - id: legacy-build-tag - - - name: Build compute-tools Docker image - uses: docker/build-push-action@v2 - with: - context: . - build-args: | - GIT_VERSION="${{github.sha}}" - AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" - AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" - push: false - file: Dockerfile.compute-tools - tags: neondatabase/compute-tools:local - - - name: Push compute-tools Docker image - uses: docker/build-push-action@v2 - with: - context: . 
- build-args: | - GIT_VERSION="${{github.sha}}" - AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" - AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" - push: true - file: Dockerfile.compute-tools - tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}} - - - name: Build compute-node Docker image - uses: docker/build-push-action@v2 - with: - context: ./vendor/postgres/ - build-args: - COMPUTE_TOOLS_TAG=local - push: true - tags: neondatabase/compute-node:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/compute-node:${{steps.build-tag.outputs.tag}} + crane tag neondatabase/neon:${{steps.build-tag.outputs.tag}} latest + crane tag neondatabase/compute-tools:${{steps.build-tag.outputs.tag}} latest + crane tag neondatabase/compute-node:${{steps.build-tag.outputs.tag}} latest calculate-deploy-targets: runs-on: [ self-hosted, Linux, k8s-runner ] @@ -537,15 +553,17 @@ jobs: fi deploy: - runs-on: [ self-hosted, Linux, k8s-runner ] - # We need both storage **and** compute images for deploy, because control plane - # picks the compute version based on the storage version. If it notices a fresh - # storage it may bump the compute version. And if compute image failed to build - # it may break things badly. - needs: [ docker-image, docker-image-compute, calculate-deploy-targets ] + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. + # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly + needs: [ push-docker-hub, calculate-deploy-targets ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash strategy: matrix: include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} @@ -556,8 +574,14 @@ jobs: submodules: true fetch-depth: 0 + - name: Setup python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Setup ansible run: | + export PATH="/root/.local/bin:$PATH" pip install --progress-bar off --user ansible boto3 - name: Redeploy @@ -585,13 +609,16 @@ jobs: rm -f neon_install.tar.gz .neon_current_version deploy-proxy: - runs-on: [ self-hosted, Linux, k8s-runner ] - # Compute image isn't strictly required for proxy deploy, but let's still wait for it - # to run all deploy jobs consistently. - needs: [ docker-image, docker-image-compute, calculate-deploy-targets ] + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
+ needs: [ push-docker-hub, calculate-deploy-targets ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash strategy: matrix: include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} @@ -604,6 +631,9 @@ jobs: submodules: true fetch-depth: 0 + - name: Add curl + run: apt update && apt install curl -y + - name: Store kubeconfig file run: | echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} @@ -618,4 +648,4 @@ jobs: run: | DOCKER_TAG=${{needs.docker-image.outputs.build-tag}} helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s \ No newline at end of file From 83f7b8ed2298f4d94b14501ec6c3d2bc2f9e8c80 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Tue, 16 Aug 2022 13:41:51 +0200 Subject: [PATCH 0630/1022] Add missing step output, revert one deploy step (#2285) * Add missing step output, revert one deploy step * Conform to syntax * Update approach * Add missing value * Add missing needs Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 66 ++++++++++++++++++---------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 99859197a1..57214e0bfe 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -17,6 +17,24 @@ env: COPT: '-Werror' jobs: + tag: + runs-on: dev + outputs: + build-tag: ${{steps.build-tag.outputs.tag}} + + steps: + - name: Get build tag + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "::set-output name=tag::$(git rev-list --count HEAD)" + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "::set-output name=tag::release-$(git rev-list --count HEAD)" + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " + echo "::set-output name=tag::$GITHUB_RUN_ID" + fi + id: build-tag + build-neon: runs-on: dev container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned @@ -470,7 +488,7 @@ jobs: push-docker-hub: runs-on: dev - needs: [ promote-images ] + needs: [ promote-images, tag ] container: golang:1.19-bullseye environment: dev @@ -480,17 +498,17 @@ jobs: go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 - - name: Get build tag - run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::$(git rev-list --count HEAD)" - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release-$(git rev-list --count HEAD)" - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " - echo "::set-output name=tag::$GITHUB_RUN_ID" - fi - 
id: build-tag +# - name: Get build tag +# run: | +# if [[ "$GITHUB_REF_NAME" == "main" ]]; then +# echo "::set-output name=tag::$(git rev-list --count HEAD)" +# elif [[ "$GITHUB_REF_NAME" == "release" ]]; then +# echo "::set-output name=tag::release-$(git rev-list --count HEAD)" +# else +# echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " +# echo "::set-output name=tag::$GITHUB_RUN_ID" +# fi +# id: build-tag - name: Configure ECR login run: | @@ -513,22 +531,22 @@ jobs: crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io - name: Push neon image to Docker Hub - run: crane push neon neondatabase/neon:${{steps.build-tag.outputs.tag}} + run: crane push neon neondatabase/neon:${{needs.tag.outputs.build-tag}} - name: Push compute tools image to Docker Hub - run: crane push compute-tools neondatabase/compute-tools:${{steps.build-tag.outputs.tag}} + run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} - name: Push compute node image to Docker Hub - run: crane push compute-node neondatabase/compute-node:${{steps.build-tag.outputs.tag}} + run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}} - name: Add latest tag to images if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' run: | - crane tag neondatabase/neon:${{steps.build-tag.outputs.tag}} latest - crane tag neondatabase/compute-tools:${{steps.build-tag.outputs.tag}} latest - crane tag neondatabase/compute-node:${{steps.build-tag.outputs.tag}} latest + crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest calculate-deploy-targets: runs-on: [ self-hosted, Linux, k8s-runner ] @@ -553,11 +571,11 @@ jobs: fi deploy: - runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + runs-on: [ self-hosted, Linux, k8s-runner ] + #container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly - needs: [ push-docker-hub, calculate-deploy-targets ] + needs: [ push-docker-hub, calculate-deploy-targets, tag ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' @@ -586,7 +604,7 @@ jobs: - name: Redeploy run: | - export DOCKER_TAG=${{needs.docker-image.outputs.build-tag}} + export DOCKER_TAG=${{needs.tag.outputs.build-tag}} cd "$(pwd)/.github/ansible" if [[ "$GITHUB_REF_NAME" == "main" ]]; then @@ -612,7 +630,7 @@ jobs: runs-on: dev container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, calculate-deploy-targets ] + needs: [ push-docker-hub, calculate-deploy-targets, tag ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' @@ -646,6 +664,6 @@ jobs: - name: Re-deploy proxy run: | - DOCKER_TAG=${{needs.docker-image.outputs.build-tag}} + DOCKER_TAG=${{needs.tag.outputs.build-tag}} helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s \ No newline at end of file From 4cde0e7a37d15ec1c6a5ca3a074a0a950a08cd60 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Tue, 16 Aug 2022 13:59:41 +0200 Subject: [PATCH 0631/1022] Error for fatal not git repo (#2286) Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 57214e0bfe..004797e502 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -19,10 +19,14 @@ env: jobs: tag: runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest outputs: build-tag: ${{steps.build-tag.outputs.tag}} steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Get build tag run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then @@ -33,6 +37,7 @@ jobs: echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " echo "::set-output name=tag::$GITHUB_RUN_ID" fi + shell: bash id: build-tag build-neon: From 1d4114183c09b01069370a19c1d4a855f8bf2571 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Tue, 16 Aug 2022 15:41:31 +0200 Subject: [PATCH 0632/1022] Use main, not branch for ref check (#2288) * Use main, not branch for ref check * Add more debug * Count main, not head * Try new approach * Conform to syntax * Update approach * Get full history * Skip checkout * Cleanup debug * Remove more debug Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 004797e502..2f5c03f794 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -26,15 +26,20 @@ jobs: steps: - name: Checkout uses: actions/checkout@v3 + with: + fetch-depth: 0 - name: Get build tag run: | + echo run:$GITHUB_RUN_ID + echo ref:$GITHUB_REF_NAME + echo rev:$(git rev-list --count HEAD) if [[ "$GITHUB_REF_NAME" == "main" ]]; then echo "::set-output name=tag::$(git rev-list --count HEAD)" elif [[ "$GITHUB_REF_NAME" == "release" ]]; then echo "::set-output name=tag::release-$(git rev-list --count HEAD)" else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" echo "::set-output name=tag::$GITHUB_RUN_ID" fi shell: bash From 9218426e41417975d817fb5464ea83e2cf2f5aba Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Tue, 16 Aug 2022 17:24:58 +0200 Subject: 
[PATCH 0633/1022] Fix docker zombie process issue (#2289) * Fix docker zombie process issue * Init everywhere Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2f5c03f794..9425ceb536 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -47,7 +47,9 @@ jobs: build-neon: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init strategy: fail-fast: false matrix: @@ -202,7 +204,9 @@ jobs: pg_regress-tests: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ build-neon ] strategy: fail-fast: false @@ -230,7 +234,9 @@ jobs: other-tests: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ build-neon ] strategy: fail-fast: false @@ -261,7 +267,9 @@ jobs: benchmarks: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ build-neon ] if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: @@ -292,7 +300,9 @@ jobs: coverage-report: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ other-tests, pg_regress-tests ] strategy: fail-fast: false @@ -368,7 +378,9 @@ jobs: trigger-e2e-tests: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ build-neon ] steps: - name: Set PR's status to pending and request a remote CI test From 648e8bbefeadcf8c81d12f3598cca0794c57a284 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 16 Aug 2022 18:49:22 +0300 Subject: [PATCH 0634/1022] Fix 1.63 clippy lints (#2282) --- libs/postgres_ffi/src/relfile_utils.rs | 2 +- libs/postgres_ffi/src/xlog_utils.rs | 4 ++-- libs/utils/src/bin_ser.rs | 4 ++-- libs/utils/src/http/request.rs | 10 ++++------ libs/utils/src/lsn.rs | 2 +- libs/utils/src/postgres_backend.rs | 2 +- libs/utils/src/pq_proto.rs | 2 +- libs/utils/src/seqwait.rs | 2 +- libs/utils/tests/bin_ser_test.rs | 2 +- libs/utils/tests/ssl_test.rs | 3 +++ pageserver/src/http/routes.rs | 2 +- pageserver/src/layered_repository/disk_btree.rs | 2 +- proxy/src/auth/backend.rs | 2 +- proxy/src/compute.rs | 13 +++++++++++-- safekeeper/src/control_file_upgrade.rs | 4 ++-- safekeeper/src/safekeeper.rs | 2 +- safekeeper/src/send_wal.rs | 2 +- 17 files changed, 35 insertions(+), 25 deletions(-) diff --git a/libs/postgres_ffi/src/relfile_utils.rs b/libs/postgres_ffi/src/relfile_utils.rs index 94498ee9a9..cc9d6470c0 100644 --- a/libs/postgres_ffi/src/relfile_utils.rs +++ b/libs/postgres_ffi/src/relfile_utils.rs @@ -5,7 +5,7 @@ use crate::pg_constants; use once_cell::sync::OnceCell; use regex::Regex; -#[derive(Debug, Clone, thiserror::Error, PartialEq)] +#[derive(Debug, 
Clone, thiserror::Error, PartialEq, Eq)] pub enum FilePathError { #[error("invalid relation fork name")] InvalidForkName, diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 29b00c8d36..956f53ce85 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -80,12 +80,12 @@ pub fn XLogSegNoOffsetToRecPtr( #[allow(non_snake_case)] pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { - return format!( + format!( "{:>08X}{:>08X}{:>08X}", tli, logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes), logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes) - ); + ) } #[allow(non_snake_case)] diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 70f54ea02f..42b45eeea0 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -265,7 +265,7 @@ mod tests { use serde::{Deserialize, Serialize}; use std::io::Cursor; - #[derive(Debug, PartialEq, Serialize, Deserialize)] + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct ShortStruct { a: u8, b: u32, @@ -286,7 +286,7 @@ mod tests { const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7]; const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff]; - #[derive(Debug, PartialEq, Serialize, Deserialize)] + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct LongMsg { pub tag: u8, pub blockpos: u32, diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 8e3d357397..4984d695fd 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -10,12 +10,10 @@ pub fn get_request_param<'a>( ) -> Result<&'a str, ApiError> { match request.param(param_name) { Some(arg) => Ok(arg), - None => { - return Err(ApiError::BadRequest(format!( - "no {} specified in path param", - param_name - ))) - } + None => Err(ApiError::BadRequest(format!( + "no {} specified in path param", + param_name + ))), } } diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 3dab2a625c..1090f4c679 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -18,7 +18,7 @@ pub const XLOG_BLCKSZ: u32 = 8192; pub struct Lsn(pub u64); /// We tried to parse an LSN from a string, but failed -#[derive(Debug, PartialEq, thiserror::Error)] +#[derive(Debug, PartialEq, Eq, thiserror::Error)] #[error("LsnParseError")] pub struct LsnParseError; diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 79dca96fcf..4d873bd5ac 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -50,7 +50,7 @@ pub trait Handler { /// PostgresBackend protocol state. /// XXX: The order of the constructors matters. -#[derive(Clone, Copy, PartialEq, PartialOrd)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] pub enum ProtoState { Initialization, Encrypted, diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index 3dcae4d0af..3f14acd50d 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -930,7 +930,7 @@ impl<'a> BeMessage<'a> { // Neon extension of postgres replication protocol // See NEON_STATUS_UPDATE_TAG_BYTE -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct ReplicationFeedback { // Last known size of the timeline. Used to enforce timeline size limit. 
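// Aside: a minimal, hypothetical illustration of the pattern behind these 1.63
// clippy fixes (the lint appears to be `derive_partial_eq_without_eq`). When
// every field is itself `Eq`, deriving `Eq` next to `PartialEq` records that
// equality is total and lets the type be used where `Eq` is required, e.g. as
// a `HashSet`/`HashMap` key. The type below is illustrative, not repository code.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct ExampleKey {
    tenant: u64,
    timeline: u64,
}

fn example_usage() {
    use std::collections::HashSet;
    // HashSet insert/contains require Eq + Hash; PartialEq alone would not compile here.
    let mut seen: HashSet<ExampleKey> = HashSet::new();
    seen.insert(ExampleKey { tenant: 1, timeline: 2 });
    assert!(seen.contains(&ExampleKey { tenant: 1, timeline: 2 }));
}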
pub current_timeline_size: u64, diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index bc32f51b13..a531975d60 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -9,7 +9,7 @@ use std::sync::Mutex; use std::time::Duration; /// An error happened while waiting for a number -#[derive(Debug, PartialEq, thiserror::Error)] +#[derive(Debug, PartialEq, Eq, thiserror::Error)] #[error("SeqWaitError")] pub enum SeqWaitError { /// The wait timeout was reached diff --git a/libs/utils/tests/bin_ser_test.rs b/libs/utils/tests/bin_ser_test.rs index f357837a55..b995b61b78 100644 --- a/libs/utils/tests/bin_ser_test.rs +++ b/libs/utils/tests/bin_ser_test.rs @@ -4,7 +4,7 @@ use serde::Deserialize; use std::io::Read; use utils::bin_ser::LeSer; -#[derive(Debug, PartialEq, Deserialize)] +#[derive(Debug, PartialEq, Eq, Deserialize)] pub struct HeaderData { magic: u16, info: u16, diff --git a/libs/utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs index 907ef98aec..248400c2c1 100644 --- a/libs/utils/tests/ssl_test.rs +++ b/libs/utils/tests/ssl_test.rs @@ -30,6 +30,9 @@ static CERT: Lazy = Lazy::new(|| { }); #[test] +// [false-positive](https://github.com/rust-lang/rust-clippy/issues/9274), +// we resize the vector so doing some modifications after all +#[allow(clippy::read_zero_byte_vec)] fn ssl() { let (mut client_sock, server_sock) = make_tcp_pair(); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1b1b4f99cb..81ecdb7404 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -167,7 +167,7 @@ fn local_timeline_info_from_repo_timeline( ) -> anyhow::Result { match repo_timeline { RepositoryTimeline::Loaded(timeline) => local_timeline_info_from_loaded_timeline( - &*timeline, + timeline, include_non_incremental_logical_size, include_non_incremental_physical_size, ), diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/layered_repository/disk_btree.rs index dc8d7a2ad3..c130a42a8e 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/layered_repository/disk_btree.rs @@ -209,7 +209,7 @@ where reader: R, } -#[derive(Clone, Copy, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum VisitDirection { Forwards, Backwards, diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index b10ede8d5e..bb7e7ef67b 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -86,7 +86,7 @@ impl From for tokio_postgres::Config { /// * However, when we substitute `T` with [`ClientCredentials`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum BackendType { /// Legacy Cloud API (V1) + link auth. LegacyConsole(T), diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 896ef3588d..3bad36661b 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -65,8 +65,17 @@ impl NodeInfo { // require for our business. 
let mut connection_error = None; let ports = self.config.get_ports(); - for (i, host) in self.config.get_hosts().iter().enumerate() { - let port = ports.get(i).or_else(|| ports.get(0)).unwrap_or(&5432); + let hosts = self.config.get_hosts(); + // the ports array is supposed to have 0 entries, 1 entry, or as many entries as in the hosts array + if ports.len() > 1 && ports.len() != hosts.len() { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("couldn't connect: bad compute config, ports and hosts entries' count does not match: {:?}", self.config), + )); + } + + for (i, host) in hosts.iter().enumerate() { + let port = ports.get(i).or_else(|| ports.first()).unwrap_or(&5432); let host = match host { Host::Tcp(host) => host.as_str(), Host::Unix(_) => continue, // unix sockets are not welcome here diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 5e749796dd..91d2f61c10 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -40,7 +40,7 @@ struct SafeKeeperStateV1 { wal_start_lsn: Lsn, } -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfoV2 { /// Postgres server version pub pg_version: u32, @@ -70,7 +70,7 @@ pub struct SafeKeeperStateV2 { pub wal_start_lsn: Lsn, } -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfoV3 { /// Postgres server version pub pg_version: u32, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index a9373cb584..88747f14e5 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -127,7 +127,7 @@ impl AcceptorState { /// Information about Postgres. Safekeeper gets it once and then verifies /// all further connections from computes match. 
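// Aside: a standalone sketch (simplified types, not repository code) of the
// host/port pairing rule enforced by the proxy/src/compute.rs hunk above: the
// port list may be empty, hold a single shared port, or hold exactly one port
// per host; any other shape is rejected as a bad compute config.
fn port_for_host(hosts: &[String], ports: &[u16], i: usize) -> Result<u16, String> {
    // With more than one port, the counts must match exactly.
    if ports.len() > 1 && ports.len() != hosts.len() {
        return Err(format!(
            "bad compute config: {} ports for {} hosts",
            ports.len(),
            hosts.len()
        ));
    }
    // Per-host port if present, else the single shared one, else Postgres' default.
    Ok(ports
        .get(i)
        .or_else(|| ports.first())
        .copied()
        .unwrap_or(5432))
}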
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfo { /// Postgres server version pub pg_version: u32, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 7439d6a8f6..4a9c56859f 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -36,7 +36,7 @@ const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; type FullTransactionId = u64; /// Hot standby feedback received from replica -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct HotStandbyFeedback { pub ts: TimestampTz, pub xmin: FullTransactionId, From b21f7382ccdc5512eb11f3467fe0f5468e0fa543 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 15 Aug 2022 23:16:35 +0300 Subject: [PATCH 0635/1022] split out timeline metrics, track layer map loading and size calculation --- pageserver/src/layered_repository/timeline.rs | 165 +++++++++++------- 1 file changed, 98 insertions(+), 67 deletions(-) diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 2d396024a0..e27619cc83 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -4,6 +4,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::Bytes; use fail::fail_point; use itertools::Itertools; +use metrics::core::{AtomicU64, GenericCounter}; use once_cell::sync::Lazy; use tracing::*; @@ -223,6 +224,70 @@ impl From for RepositoryTimeline { } } +struct TimelineMetrics { + pub reconstruct_time_histo: Histogram, + pub materialized_page_cache_hit_counter: GenericCounter, + pub flush_time_histo: Histogram, + pub compact_time_histo: Histogram, + pub create_images_time_histo: Histogram, + pub init_logical_size_histo: Histogram, + pub load_layer_map_histo: Histogram, + pub last_record_gauge: IntGauge, + pub wait_lsn_time_histo: Histogram, + pub current_physical_size_gauge: UIntGauge, +} + +impl TimelineMetrics { + fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self { + let tenant_id = tenant_id.to_string(); + let timeline_id = timeline_id.to_string(); + + let reconstruct_time_histo = RECONSTRUCT_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let flush_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id]) + .unwrap(); + let compact_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id]) + .unwrap(); + let create_images_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id]) + .unwrap(); + let init_logical_size_histo = STORAGE_TIME + .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id]) + .unwrap(); + let load_layer_map_histo = STORAGE_TIME + .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id]) + .unwrap(); + let last_record_gauge = LAST_RECORD_LSN + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let wait_lsn_time_histo = WAIT_LSN_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + + 
TimelineMetrics { + reconstruct_time_histo, + materialized_page_cache_hit_counter, + flush_time_histo, + compact_time_histo, + create_images_time_histo, + init_logical_size_histo, + load_layer_map_histo, + last_record_gauge, + wait_lsn_time_histo, + current_physical_size_gauge, + } + } +} + pub struct LayeredTimeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -269,14 +334,7 @@ pub struct LayeredTimeline { ancestor_lsn: Lsn, // Metrics - reconstruct_time_histo: Histogram, - materialized_page_cache_hit_counter: IntCounter, - flush_time_histo: Histogram, - compact_time_histo: Histogram, - create_images_time_histo: Histogram, - last_record_gauge: IntGauge, - wait_lsn_time_histo: Histogram, - current_physical_size_gauge: UIntGauge, + metrics: TimelineMetrics, /// If `true`, will backup its files that appear after each checkpointing to the remote storage. upload_layers: AtomicBool, @@ -426,7 +484,7 @@ impl Timeline for LayeredTimeline { "wait_lsn called by WAL receiver thread" ); - self.wait_lsn_time_histo.observe_closure_duration( + self.metrics.wait_lsn_time_histo.observe_closure_duration( || self.last_record_lsn .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) .with_context(|| { @@ -468,7 +526,8 @@ impl Timeline for LayeredTimeline { self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; - self.reconstruct_time_histo + self.metrics + .reconstruct_time_histo .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) } @@ -530,7 +589,7 @@ impl Timeline for LayeredTimeline { } fn get_physical_size(&self) -> u64 { - self.current_physical_size_gauge.get() + self.metrics.current_physical_size_gauge.get() } fn get_physical_size_non_incremental(&self) -> anyhow::Result { @@ -604,43 +663,6 @@ impl LayeredTimeline { walredo_mgr: Arc, upload_layers: bool, ) -> LayeredTimeline { - let reconstruct_time_histo = RECONSTRUCT_TIME - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let flush_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "layer flush", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let compact_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "compact", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let create_images_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "create images", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let last_record_gauge = LAST_RECORD_LSN - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let wait_lsn_time_histo = WAIT_LSN_TIME - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let mut result = LayeredTimeline { conf, tenant_conf, @@ -663,14 +685,7 @@ impl LayeredTimeline { ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), - reconstruct_time_histo, - materialized_page_cache_hit_counter, - flush_time_histo, - compact_time_histo, - create_images_time_histo, - last_record_gauge, - wait_lsn_time_histo, - current_physical_size_gauge, + metrics: TimelineMetrics::new(&tenant_id, &timeline_id), upload_layers: 
AtomicBool::new(upload_layers), @@ -706,6 +721,8 @@ impl LayeredTimeline { let mut layers = self.layers.write().unwrap(); let mut num_layers = 0; + let timer = self.metrics.load_layer_map_histo.start_timer(); + // Scan timeline directory and create ImageFileName and DeltaFilename // structs representing all files on disk let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); @@ -777,7 +794,11 @@ impl LayeredTimeline { "loaded layer map with {} layers at {}, total physical size: {}", num_layers, disk_consistent_lsn, total_physical_size ); - self.current_physical_size_gauge.set(total_physical_size); + self.metrics + .current_physical_size_gauge + .set(total_physical_size); + + timer.stop_and_record(); Ok(()) } @@ -808,12 +829,16 @@ impl LayeredTimeline { } } + let timer = self.metrics.init_logical_size_histo.start_timer(); + // Have to calculate it the hard way let last_lsn = self.get_last_record_lsn(); let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?; self.current_logical_size .store(logical_size as isize, AtomicOrdering::SeqCst); debug!("calculated logical size the hard way: {}", logical_size); + + timer.stop_and_record(); Ok(()) } @@ -878,7 +903,7 @@ impl LayeredTimeline { ValueReconstructResult::Continue => { // If we reached an earlier cached page image, we're done. if cont_lsn == cached_lsn + 1 { - self.materialized_page_cache_hit_counter.inc_by(1); + self.metrics.materialized_page_cache_hit_counter.inc_by(1); return Ok(()); } if prev_lsn <= cont_lsn { @@ -1074,7 +1099,7 @@ impl LayeredTimeline { fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); - self.last_record_gauge.set(new_lsn.0 as i64); + self.metrics.last_record_gauge.set(new_lsn.0 as i64); self.last_record_lsn.advance(new_lsn); } @@ -1178,7 +1203,7 @@ impl LayeredTimeline { } }; - let timer = self.flush_time_histo.start_timer(); + let timer = self.metrics.flush_time_histo.start_timer(); loop { let layers = self.layers.read().unwrap(); @@ -1349,7 +1374,7 @@ impl LayeredTimeline { // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); - self.current_physical_size_gauge.add(sz); + self.metrics.current_physical_size_gauge.add(sz); // update metrics NUM_PERSISTENT_FILES_CREATED.inc_by(1); PERSISTENT_BYTES_WRITTEN.inc_by(sz); @@ -1418,7 +1443,7 @@ impl LayeredTimeline { } // 3. 
Compact - let timer = self.compact_time_histo.start_timer(); + let timer = self.metrics.compact_time_histo.start_timer(); self.compact_level0(target_file_size)?; timer.stop_and_record(); } @@ -1494,7 +1519,7 @@ impl LayeredTimeline { lsn: Lsn, force: bool, ) -> Result> { - let timer = self.create_images_time_histo.start_timer(); + let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); let mut layer_paths_to_upload = HashSet::new(); for partition in partitioning.parts.iter() { @@ -1538,7 +1563,8 @@ impl LayeredTimeline { let mut layers = self.layers.write().unwrap(); for l in image_layers { - self.current_physical_size_gauge + self.metrics + .current_physical_size_gauge .add(l.path().metadata()?.len()); layers.insert_historic(Arc::new(l)); } @@ -1788,7 +1814,8 @@ impl LayeredTimeline { let new_delta_path = l.path(); // update the timeline's physical size - self.current_physical_size_gauge + self.metrics + .current_physical_size_gauge .add(new_delta_path.metadata()?.len()); new_layer_paths.insert(new_delta_path); @@ -1801,7 +1828,9 @@ impl LayeredTimeline { drop(all_keys_iter); for l in deltas_to_compact { if let Some(path) = l.local_path() { - self.current_physical_size_gauge.sub(path.metadata()?.len()); + self.metrics + .current_physical_size_gauge + .sub(path.metadata()?.len()); layer_paths_do_delete.insert(path); } l.delete()?; @@ -2058,7 +2087,9 @@ impl LayeredTimeline { let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len()); for doomed_layer in layers_to_remove { if let Some(path) = doomed_layer.local_path() { - self.current_physical_size_gauge.sub(path.metadata()?.len()); + self.metrics + .current_physical_size_gauge + .sub(path.metadata()?.len()); layer_paths_to_delete.insert(path); } doomed_layer.delete()?; From d5ec84b87bc5d11c4e77df77a515356ef33a4e32 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 16 Aug 2022 01:03:17 +0300 Subject: [PATCH 0636/1022] reset rust cache for clippy run to avoid an ICE additionally remove trailing whitespaces --- .github/workflows/build_and_test.yml | 6 +++--- .github/workflows/codestyle.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9425ceb536..884187cec2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -121,8 +121,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- - name: Cache postgres build id: cache_pg @@ -688,4 +688,4 @@ jobs: run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s \ No newline at end of file + helm upgrade ${{ matrix.proxy_job }}-scram 
neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index aa37167a19..6f13a38dea 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -101,7 +101,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git target - key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} + key: v2-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} - name: Run cargo clippy run: ./run_clippy.sh From e94a5ce3606ecd6b8fddf85c087d33d4e90c11a4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 29 Jul 2022 16:43:30 +0300 Subject: [PATCH 0637/1022] Rename pg_control_ffi.h to bindgen_deps.h, for clarity. The pg_control_ffi.h name implies that it only includes stuff related to pg_control.h. That's mostly true currently, but really the point of the file is to include everything that we need to generate Rust definitions from. --- libs/postgres_ffi/{pg_control_ffi.h => bindgen_deps.h} | 0 libs/postgres_ffi/build.rs | 6 +++--- 2 files changed, 3 insertions(+), 3 deletions(-) rename libs/postgres_ffi/{pg_control_ffi.h => bindgen_deps.h} (100%) diff --git a/libs/postgres_ffi/pg_control_ffi.h b/libs/postgres_ffi/bindgen_deps.h similarity index 100% rename from libs/postgres_ffi/pg_control_ffi.h rename to libs/postgres_ffi/bindgen_deps.h diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 7db2c20e34..69b2711c22 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -44,7 +44,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { fn main() { // Tell cargo to invalidate the built crate whenever the wrapper changes - println!("cargo:rerun-if-changed=pg_control_ffi.h"); + println!("cargo:rerun-if-changed=bindgen_deps.h"); // Finding the location of C headers for the Postgres server: // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/tmp_install` @@ -88,9 +88,9 @@ fn main() { // the resulting bindings. let bindings = bindgen::Builder::default() // - // All the needed PostgreSQL headers are included from 'pg_control_ffi.h' + // All the needed PostgreSQL headers are included from 'bindgen_deps.h' // - .header("pg_control_ffi.h") + .header("bindgen_deps.h") // // Tell cargo to invalidate the built crate whenever any of the // included header files changed. From 3414feae037e954a4630309114f196a84c1198a8 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 17 Aug 2022 08:17:09 -0400 Subject: [PATCH 0638/1022] Make local mypy behave like CI mypy (#2291) --- setup.cfg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup.cfg b/setup.cfg index d1a2f9a359..7f8c45c8c3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,6 +18,10 @@ exclude = ^vendor/ # some tests don't typecheck when this flag is set check_untyped_defs = false +# Help mypy find imports when running against list of individual files. +# Without this line it would behave differently when executed on the entire project. 
+mypy_path = $MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner + disallow_incomplete_defs = false disallow_untyped_calls = false disallow_untyped_decorators = false From e9a3499e87c2f87632849b501282387e2a9dcbeb Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 17 Aug 2022 08:17:35 -0400 Subject: [PATCH 0639/1022] Fix flaky pageserver restarts in tests (#2261) --- control_plane/src/safekeeper.rs | 45 +++++++++----------------- control_plane/src/storage.rs | 44 ++++++++----------------- test_runner/batch_others/test_setup.py | 17 ++++++++++ 3 files changed, 46 insertions(+), 60 deletions(-) create mode 100644 test_runner/batch_others/test_setup.py diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 0cae479d71..3fda856e13 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -1,5 +1,4 @@ use std::io::Write; -use std::net::TcpStream; use std::path::PathBuf; use std::process::Command; use std::sync::Arc; @@ -241,37 +240,23 @@ impl SafekeeperNode { ), } - let address = connection_address(&self.pg_connection_config); - - // TODO Remove this "timeout" and handle it on caller side instead. - // Shutting down may take a long time, - // if safekeeper flushes a lot of data - let mut tcp_stopped = false; + // Wait until process is gone for i in 0..600 { - if !tcp_stopped { - if let Err(err) = TcpStream::connect(&address) { - tcp_stopped = true; - if err.kind() != io::ErrorKind::ConnectionRefused { - eprintln!("\nSafekeeper connection failed with error: {err}"); - } + let signal = None; // Send no signal, just get the error code + match kill(pid, signal) { + Ok(_) => (), // Process exists, keep waiting + Err(Errno::ESRCH) => { + // Process not found, we're done + println!("done!"); + return Ok(()); } - } - if tcp_stopped { - // Also check status on the HTTP port - match self.check_status() { - Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => { - println!("done!"); - return Ok(()); - } - Err(err) => { - eprintln!("\nSafekeeper status check failed with error: {err}"); - return Ok(()); - } - Ok(()) => { - // keep waiting - } - } - } + Err(err) => bail!( + "Failed to send signal to pageserver with pid {}: {}", + pid, + err.desc() + ), + }; + if i % 10 == 0 { print!("."); io::stdout().flush().unwrap(); diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index d2742e84bb..31858278d3 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,7 +1,6 @@ use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, Write}; -use std::net::TcpStream; use std::num::NonZeroU64; use std::path::PathBuf; use std::process::Command; @@ -312,38 +311,23 @@ impl PageServerNode { ), } - let address = connection_address(&self.pg_connection_config); - - // TODO Remove this "timeout" and handle it on caller side instead. 
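// Aside: a minimal sketch of the wait-for-exit loop this patch switches both
// daemons to, assuming the `nix` crate already in use here. `kill(pid, None)`
// sends signal 0, which only checks that the process exists; `ESRCH` means it
// is gone. Names, retry count, and sleep interval below are illustrative.
use std::{thread, time::Duration};

use nix::errno::Errno;
use nix::sys::signal::kill;
use nix::unistd::Pid;

fn wait_until_gone(pid: Pid, attempts: u32) -> Result<(), String> {
    for _ in 0..attempts {
        match kill(pid, None) {
            // Process still exists, keep polling.
            Ok(_) => thread::sleep(Duration::from_millis(100)),
            // No such process: it has exited.
            Err(Errno::ESRCH) => return Ok(()),
            Err(e) => return Err(format!("kill({}) failed: {}", pid.as_raw(), e)),
        }
    }
    Err(format!("process {} did not exit in time", pid.as_raw()))
}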
- // Shutting down may take a long time, - // if pageserver checkpoints a lot of data - let mut tcp_stopped = false; + // Wait until process is gone for i in 0..600 { - if !tcp_stopped { - if let Err(err) = TcpStream::connect(&address) { - tcp_stopped = true; - if err.kind() != io::ErrorKind::ConnectionRefused { - eprintln!("\nPageserver connection failed with error: {err}"); - } + let signal = None; // Send no signal, just get the error code + match kill(pid, signal) { + Ok(_) => (), // Process exists, keep waiting + Err(Errno::ESRCH) => { + // Process not found, we're done + println!("done!"); + return Ok(()); } - } - if tcp_stopped { - // Also check status on the HTTP port + Err(err) => bail!( + "Failed to send signal to pageserver with pid {}: {}", + pid, + err.desc() + ), + }; - match self.check_status() { - Err(PageserverHttpError::Transport(err)) if err.is_connect() => { - println!("done!"); - return Ok(()); - } - Err(err) => { - eprintln!("\nPageserver status check failed with error: {err}"); - return Ok(()); - } - Ok(()) => { - // keep waiting - } - } - } if i % 10 == 0 { print!("."); io::stdout().flush().unwrap(); diff --git a/test_runner/batch_others/test_setup.py b/test_runner/batch_others/test_setup.py new file mode 100644 index 0000000000..3d1471621b --- /dev/null +++ b/test_runner/batch_others/test_setup.py @@ -0,0 +1,17 @@ +"""Tests for the code in test fixtures""" + +from fixtures.neon_fixtures import NeonEnvBuilder + + +# Test that pageserver and safekeeper can restart quickly. +# This is a regression test, see https://github.com/neondatabase/neon/issues/2247 +def test_fixture_restart(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + for i in range(3): + env.pageserver.stop() + env.pageserver.start() + + for i in range(3): + env.safekeepers[0].stop() + env.safekeepers[0].start() From 3b819ee159d9f363c2cb354423f14c808399ab56 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 17 Aug 2022 17:51:53 +0300 Subject: [PATCH 0640/1022] Remove extra type aliases (#2280) --- pageserver/src/http/routes.rs | 7 +++---- pageserver/src/layered_repository.rs | 4 +++- pageserver/src/lib.rs | 4 ---- pageserver/src/repository.rs | 5 ++--- pageserver/src/tenant_mgr.rs | 19 +++++++++---------- pageserver/src/timelines.rs | 11 +++++++---- .../src/walreceiver/connection_manager.rs | 14 +++++--------- 7 files changed, 29 insertions(+), 35 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 81ecdb7404..1d0adec63d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -11,14 +11,13 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, }; -use crate::layered_repository::metadata::TimelineMetadata; +use crate::layered_repository::{metadata::TimelineMetadata, LayeredTimeline}; use crate::pgdatadir_mapping::DatadirTimeline; use crate::repository::{LocalTimelineState, RepositoryTimeline}; use crate::repository::{Repository, Timeline}; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant_config::TenantConfOpt; -use crate::TimelineImpl; use crate::{config::PageServerConf, tenant_mgr, timelines}; use utils::{ auth::JwtAuth, @@ -86,7 +85,7 @@ fn get_config(request: &Request) -> &'static PageServerConf { // Helper functions to construct a LocalTimelineInfo struct for a timeline fn local_timeline_info_from_loaded_timeline( - timeline: &TimelineImpl, + timeline: 
&LayeredTimeline, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> anyhow::Result { @@ -161,7 +160,7 @@ fn local_timeline_info_from_unloaded_timeline(metadata: &TimelineMetadata) -> Lo } fn local_timeline_info_from_repo_timeline( - repo_timeline: &RepositoryTimeline, + repo_timeline: &RepositoryTimeline, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> anyhow::Result { diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 8e75cbd4a2..6bf2e71852 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -59,7 +59,9 @@ mod storage_layer; mod timeline; use storage_layer::Layer; -use timeline::{LayeredTimeline, LayeredTimelineEntry}; +use timeline::LayeredTimelineEntry; + +pub use timeline::LayeredTimeline; // re-export this function so that page_cache.rs can use it. pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 140260e0d0..47fd8a84cf 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -28,7 +28,6 @@ use tracing::info; use crate::thread_mgr::ThreadKind; use metrics::{register_int_gauge_vec, IntGaugeVec}; -use layered_repository::LayeredRepository; use pgdatadir_mapping::DatadirTimeline; /// Current storage format version @@ -62,9 +61,6 @@ pub enum CheckpointConfig { Forced, } -pub type RepositoryImpl = LayeredRepository; -pub type TimelineImpl = ::Timeline; - pub fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint thread. This prevents new connections from // being accepted. diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index a1a08b11d5..d09b01437c 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -412,7 +412,6 @@ pub mod repo_harness { use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::{fs, path::PathBuf}; - use crate::RepositoryImpl; use crate::{ config::PageServerConf, layered_repository::LayeredRepository, @@ -508,11 +507,11 @@ pub mod repo_harness { }) } - pub fn load(&self) -> RepositoryImpl { + pub fn load(&self) -> LayeredRepository { self.try_load().expect("failed to load test repo") } - pub fn try_load(&self) -> Result { + pub fn try_load(&self) -> Result { let walredo_mgr = Arc::new(TestRedoManager); let repo = LayeredRepository::new( diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 5a5cea9a4b..d90cd7371a 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,7 +3,7 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; -use crate::layered_repository::{load_metadata, LayeredRepository}; +use crate::layered_repository::{load_metadata, LayeredRepository, LayeredTimeline}; use crate::repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; @@ -12,7 +12,6 @@ use crate::thread_mgr::ThreadKind; use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; use crate::{thread_mgr, timelines, walreceiver}; -use crate::{RepositoryImpl, TimelineImpl}; use anyhow::Context; use serde::{Deserialize, Serialize}; use std::collections::hash_map::Entry; @@ -96,13 +95,13 @@ mod tenants_state { struct Tenant { state: TenantState, /// Contains in-memory state, including the timeline that might not yet flushed on disk or 
loaded form disk. - repo: Arc, + repo: Arc, /// Timelines, located locally in the pageserver's datadir. /// Timelines can entirely be removed entirely by the `detach` operation only. /// /// Local timelines have more metadata that's loaded into memory, /// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`]. - local_timelines: HashMap::Timeline>>, + local_timelines: HashMap>, } #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -179,7 +178,7 @@ pub enum LocalTimelineUpdate { }, Attach { id: ZTenantTimelineId, - datadir: Arc<::Timeline>, + datadir: Arc, }, } @@ -369,7 +368,7 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow: Ok(()) } -pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result> { +pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result> { let m = tenants_state::read_tenants(); let tenant = m .get(&tenant_id) @@ -383,7 +382,7 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result anyhow::Result> { +) -> anyhow::Result> { let mut m = tenants_state::write_tenants(); let tenant = m .get_mut(&tenant_id) @@ -488,9 +487,9 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any } fn load_local_timeline( - repo: &RepositoryImpl, + repo: &LayeredRepository, timeline_id: ZTimelineId, -) -> anyhow::Result> { +) -> anyhow::Result> { let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| { format!("Inmem timeline {timeline_id} not found in tenant's repository") })?; @@ -634,7 +633,7 @@ fn load_local_repo( conf: &'static PageServerConf, tenant_id: ZTenantId, remote_index: &RemoteIndex, -) -> anyhow::Result> { +) -> anyhow::Result> { let mut m = tenants_state::write_tenants(); let tenant = m.entry(tenant_id).or_insert_with(|| { // Set up a WAL redo manager, for applying WAL records. 
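
All the hunks in this patch make the same mechanical substitution, so a stripped-down sketch may be easier to read than the diffs: the crate-level aliases RepositoryImpl and TimelineImpl always resolved to the same concrete types, and the commit simply names those types at the call sites. Everything below is a placeholder, not the real structs:

trait Repository {
    type Timeline;
}

struct LayeredTimeline;
struct LayeredRepository;

impl Repository for LayeredRepository {
    type Timeline = LayeredTimeline;
}

// The old crate-level aliases, kept here to show what they expanded to:
type RepositoryImpl = LayeredRepository;
type TimelineImpl = <RepositoryImpl as Repository>::Timeline;

// Before: signatures written against the aliases.
fn load_before(_repo: &RepositoryImpl) -> TimelineImpl {
    LayeredTimeline
}

// After: the aliases are deleted and the concrete types are named directly.
fn load_after(_repo: &LayeredRepository) -> LayeredTimeline {
    LayeredTimeline
}

fn main() {
    let _ = load_before(&LayeredRepository);
    let _ = load_after(&LayeredRepository);
}
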
diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 6002e8b2d9..c5b938c5fe 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -21,10 +21,13 @@ use utils::{ use crate::tenant_mgr; use crate::{ config::PageServerConf, repository::Repository, storage_sync::index::RemoteIndex, - tenant_config::TenantConfOpt, RepositoryImpl, TimelineImpl, + tenant_config::TenantConfOpt, }; use crate::{import_datadir, LOG_FILE_NAME}; -use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager}; +use crate::{ + layered_repository::{LayeredRepository, LayeredTimeline}, + walredo::WalRedoManager, +}; use crate::{repository::Timeline, CheckpointConfig}; #[derive(Debug, Clone, Copy)] @@ -73,7 +76,7 @@ pub fn create_repo( tenant_conf: TenantConfOpt, tenant_id: ZTenantId, create_repo: CreateRepo, -) -> Result> { +) -> Result> { let (wal_redo_manager, remote_index) = match create_repo { CreateRepo::Real { wal_redo_manager, @@ -223,7 +226,7 @@ pub(crate) fn create_timeline( new_timeline_id: Option, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, -) -> Result)>> { +) -> Result)>> { let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 2722bc7320..0f11a2197a 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -16,6 +16,7 @@ use std::{ time::Duration, }; +use crate::{layered_repository::LayeredTimeline, repository::Timeline}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -25,12 +26,7 @@ use etcd_broker::{ use tokio::select; use tracing::*; -use crate::{ - exponential_backoff, - repository::{Repository, Timeline}, - DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, -}; -use crate::{RepositoryImpl, TimelineImpl}; +use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use utils::{ lsn::Lsn, zid::{NodeId, ZTenantTimelineId}, @@ -43,7 +39,7 @@ pub(super) fn spawn_connection_manager_task( id: ZTenantTimelineId, broker_loop_prefix: String, mut client: Client, - local_timeline: Arc, + local_timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, @@ -242,7 +238,7 @@ async fn subscribe_for_timeline_updates( struct WalreceiverState { id: ZTenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. - local_timeline: Arc, + local_timeline: Arc, /// The timeout on the connection to safekeeper for WAL streaming. wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. 
@@ -291,7 +287,7 @@ struct EtcdSkTimeline { impl WalreceiverState { fn new( id: ZTenantTimelineId, - local_timeline: Arc<::Timeline>, + local_timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, From 262cdf83442eea512a0a0a812017c73f5ce1df9b Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Wed, 17 Aug 2022 18:02:03 +0200 Subject: [PATCH 0641/1022] Update cachepot endpoint (#2290) * Update cachepot endpoint * Update dockerfile & remove env * Update image building process * Cannot use metadata endpoint for this * Update workflow * Conform to kaniko syntax * Update syntax * Update approach * Update dockerfiles * Force update * Update dockerfiles * Update dockerfile * Cleanup dockerfiles * Update s3 test location * Revert s3 experiment * Add more debug * Specify aws region * Remove debug, add prefix * Remove one more debug Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 16 +++++------- Dockerfile | 39 ++++++++++++++-------------- Dockerfile.compute-tools | 17 +++++++----- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 884187cec2..9b46d36c92 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -436,11 +436,10 @@ jobs: neon-image: # force building for all 3 images - if: needs.dockerfile-check.outputs.value != 'true' + if: needs.dockerfile-check.outputs.value == 'true' runs-on: dev needs: [ dockerfile-check ] container: gcr.io/kaniko-project/executor:v1.9.0-debug - environment: dev steps: - name: Checkout @@ -452,15 +451,14 @@ jobs: - name: Configure ECR login run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Kaniko build console + - name: Kaniko build neon run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID compute-tools-image: - if: needs.dockerfile-check.outputs.value != 'true' + if: needs.dockerfile-check.outputs.value == 'true' runs-on: dev needs: [ dockerfile-check ] container: gcr.io/kaniko-project/executor:v1.9.0-debug - environment: dev steps: - name: Checkout @@ -469,15 +467,14 @@ jobs: - name: Configure ECR login run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Kaniko build console + - name: Kaniko build compute tools run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID compute-node-image: - if: needs.dockerfile-check.outputs.value != 'true' + if: needs.dockerfile-check.outputs.value == 'true' runs-on: dev needs: [ dockerfile-check ] container: gcr.io/kaniko-project/executor:v1.9.0-debug - environment: dev steps: - name: Checkout @@ -489,7 +486,7 @@ jobs: - name: Configure ECR login run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Kaniko build console + - name: Kaniko build compute node working-directory: ./vendor/postgres/ run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . 
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID @@ -512,7 +509,6 @@ jobs: runs-on: dev needs: [ promote-images, tag ] container: golang:1.19-bullseye - environment: dev steps: - name: Install Crane & ECR helper diff --git a/Dockerfile b/Dockerfile index 6f017ac5d4..1afaa41fb4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,6 @@ # Build Postgres -FROM neondatabase/rust:1.58 AS pg-build -WORKDIR /pg - -USER root +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS pg-build +WORKDIR /home/nonroot COPY vendor/postgres vendor/postgres COPY Makefile Makefile @@ -11,27 +9,30 @@ ENV BUILD_TYPE release RUN set -e \ && mold -run make -j $(nproc) -s postgres \ && rm -rf tmp_install/build \ - && tar -C tmp_install -czf /postgres_install.tar.gz . + && tar -C tmp_install -czf /home/nonroot/postgres_install.tar.gz . # Build zenith binaries -FROM neondatabase/rust:1.58 AS build +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS build +WORKDIR /home/nonroot ARG GIT_VERSION=local # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. -# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. +# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build ARG RUSTC_WRAPPER=cachepot -ARG CACHEPOT_BUCKET=zenith-rust-cachepot -ARG AWS_ACCESS_KEY_ID -ARG AWS_SECRET_ACCESS_KEY +ENV AWS_REGION=eu-central-1 +ENV CACHEPOT_S3_KEY_PREFIX=cachepot +ARG CACHEPOT_BUCKET=neon-github-dev +#ARG AWS_ACCESS_KEY_ID +#ARG AWS_SECRET_ACCESS_KEY -COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server +COPY --from=pg-build /home/nonroot/tmp_install/include/postgresql/server tmp_install/include/postgresql/server COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. 
RUN set -e \ - && sudo -E "PATH=$PATH" mold -run cargo build --release \ + && mold -run cargo build --release \ && cachepot -s # Build final image @@ -40,8 +41,8 @@ FROM debian:bullseye-slim WORKDIR /data RUN set -e \ - && apt-get update \ - && apt-get install -y \ + && apt update \ + && apt install -y \ libreadline-dev \ libseccomp-dev \ openssl \ @@ -50,12 +51,12 @@ RUN set -e \ && useradd -d /data zenith \ && chown -R zenith:zenith /data -COPY --from=build --chown=zenith:zenith /home/runner/target/release/pageserver /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/runner/target/release/safekeeper /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/runner/target/release/proxy /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/pageserver /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/safekeeper /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy /usr/local/bin -COPY --from=pg-build /pg/tmp_install/ /usr/local/ -COPY --from=pg-build /postgres_install.tar.gz /data/ +COPY --from=pg-build /home/nonroot/tmp_install/ /usr/local/ +COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ COPY docker-entrypoint.sh /docker-entrypoint.sh diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 76cbc2ac30..05393021c2 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,22 +1,25 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml -FROM neondatabase/rust:1.58 AS rust-build +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS rust-build +WORKDIR /home/nonroot # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. # cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. ARG RUSTC_WRAPPER=cachepot -ARG CACHEPOT_BUCKET=zenith-rust-cachepot -ARG AWS_ACCESS_KEY_ID -ARG AWS_SECRET_ACCESS_KEY +ENV AWS_REGION=eu-central-1 +ENV CACHEPOT_S3_KEY_PREFIX=cachepot +ARG CACHEPOT_BUCKET=neon-github-dev +#ARG AWS_ACCESS_KEY_ID +#ARG AWS_SECRET_ACCESS_KEY COPY . . 
RUN set -e \ - && sudo -E "PATH=$PATH" mold -run cargo build -p compute_tools --release \ + && mold -run cargo build -p compute_tools --release \ && cachepot -s # Final image that only has one binary -FROM debian:buster-slim +FROM debian:bullseye-slim -COPY --from=rust-build /home/runner/target/release/compute_ctl /usr/local/bin/compute_ctl +COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl From dc102197df0469da544463253e636395d1d33789 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 17 Aug 2022 17:16:26 +0100 Subject: [PATCH 0642/1022] workflows/benchmarking: increase timeout (#2294) --- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index a6b2ca34e8..8080d6b7db 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -106,7 +106,7 @@ jobs: mkdir -p perf-report-staging # Set --sparse-ordering option of pytest-order plugin to ensure tests are running in order of appears in the file, # it's important for test_perf_pgbench.py::test_pgbench_remote_* tests - ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600 + ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --skip-interfering-proc-check --out-dir perf-report-staging --timeout 5400 - name: Submit result env: From 67e091c906a2863e32a8599d4f0aca90fc756b74 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 17 Aug 2022 23:24:47 +0300 Subject: [PATCH 0643/1022] Rework `init` in pageserver CLI (#2272) * Do not create initial tenant and timeline (adjust Python tests for that) * Rework config handling during init, add --update-config to manage local config updates --- Dockerfile | 3 - control_plane/src/local_env.rs | 10 +- control_plane/src/safekeeper.rs | 2 +- control_plane/src/storage.rs | 204 +++++++++--------- docker-entrypoint.sh | 24 --- neon_local/src/main.rs | 62 +++--- pageserver/src/bin/pageserver.rs | 180 +++++++++------- pageserver/src/tenant_mgr.rs | 7 +- pageserver/src/timelines.rs | 64 +----- pageserver/src/walredo.rs | 18 -- .../batch_others/test_pageserver_api.py | 45 +++- .../batch_others/test_tenant_relocation.py | 26 +-- 12 files changed, 285 insertions(+), 360 deletions(-) delete mode 100755 docker-entrypoint.sh diff --git a/Dockerfile b/Dockerfile index 1afaa41fb4..17aa0025e8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -58,10 +58,7 @@ COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy COPY --from=pg-build /home/nonroot/tmp_install/ /usr/local/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ -COPY docker-entrypoint.sh /docker-entrypoint.sh - VOLUME ["/data"] USER zenith EXPOSE 6400 -ENTRYPOINT ["/docker-entrypoint.sh"] CMD ["pageserver"] diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index e0b409f32d..75e552f6cc 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -24,7 +24,7 @@ use crate::safekeeper::SafekeeperNode; // This data structures represents neon_local CLI config // // It is deserialized from the .neon/config file, or the config file passed -// to 'zenith init --config=' option. See control_plane/simple.conf for +// to 'neon_local init --config=' option. See control_plane/simple.conf for // an example. 
// #[serde_as] @@ -320,7 +320,7 @@ impl LocalEnv { if !repopath.exists() { bail!( - "Zenith config is not found in {}. You need to run 'zenith init' first", + "Zenith config is not found in {}. You need to run 'neon_local init' first", repopath.to_str().unwrap() ); } @@ -337,12 +337,12 @@ impl LocalEnv { } pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { - // Currently, the user first passes a config file with 'zenith init --config=' + // Currently, the user first passes a config file with 'neon_local init --config=' // We read that in, in `create_config`, and fill any missing defaults. Then it's saved // to .neon/config. TODO: We lose any formatting and comments along the way, which is // a bit sad. let mut conf_content = r#"# This file describes a locale deployment of the page server -# and safekeeeper node. It is read by the 'zenith' command-line +# and safekeeeper node. It is read by the 'neon_local' command-line # utility. "# .to_string(); @@ -382,7 +382,7 @@ impl LocalEnv { } // - // Initialize a new Zenith repository + // Initialize a new Neon repository // pub fn init(&mut self) -> anyhow::Result<()> { // check if config already exists diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 3fda856e13..652736058a 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -51,7 +51,7 @@ impl ResponseErrorMessageExt for Response { Err(SafekeeperHttpError::Response( match self.json::() { Ok(err_body) => format!("Error: {}", err_body.msg), - Err(_) => format!("Http error ({}) at {url}.", status.as_u16()), + Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }, )) } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 31858278d3..aab29628e3 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, Write}; use std::num::NonZeroU64; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::process::Command; use std::time::Duration; use std::{io, result, thread}; @@ -102,23 +102,19 @@ impl PageServerNode { /// Construct libpq connection string for connecting to the pageserver. fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config { - format!("postgresql://no_user:{}@{}/no_db", password, listen_addr) + format!("postgresql://no_user:{password}@{listen_addr}/no_db") .parse() .unwrap() } - pub fn init( + pub fn initialize( &self, create_tenant: Option, initial_timeline_id: Option, config_overrides: &[&str], ) -> anyhow::Result { - let mut cmd = Command::new(self.env.pageserver_bin()?); - let id = format!("id={}", self.env.pageserver.id); - // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. 
- let base_data_dir_param = self.env.base_data_dir.display().to_string(); let pg_distrib_dir_param = format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display()); let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); @@ -138,67 +134,52 @@ impl PageServerNode { .collect::>() .join(",") ); - let mut args = Vec::with_capacity(20); - - args.push("--init"); - args.extend(["-D", &base_data_dir_param]); - args.extend(["-c", &pg_distrib_dir_param]); - args.extend(["-c", &authg_type_param]); - args.extend(["-c", &listen_http_addr_param]); - args.extend(["-c", &listen_pg_addr_param]); - args.extend(["-c", &broker_endpoints_param]); - args.extend(["-c", &id]); - let broker_etcd_prefix_param = self .env .etcd_broker .broker_etcd_prefix .as_ref() .map(|prefix| format!("broker_etcd_prefix='{prefix}'")); - if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() { - args.extend(["-c", broker_etcd_prefix_param]); - } - for config_override in config_overrides { - args.extend(["-c", config_override]); + let mut init_config_overrides = config_overrides.to_vec(); + init_config_overrides.push(&id); + init_config_overrides.push(&pg_distrib_dir_param); + init_config_overrides.push(&authg_type_param); + init_config_overrides.push(&listen_http_addr_param); + init_config_overrides.push(&listen_pg_addr_param); + init_config_overrides.push(&broker_endpoints_param); + + if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() { + init_config_overrides.push(broker_etcd_prefix_param); } if self.env.pageserver.auth_type != AuthType::Trust { - args.extend([ - "-c", - "auth_validation_public_key_path='auth_public_key.pem'", - ]); + init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'"); } - let create_tenant = create_tenant.map(|id| id.to_string()); - if let Some(tenant_id) = create_tenant.as_deref() { - args.extend(["--create-tenant", tenant_id]) + self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?; + let init_result = self + .try_init_timeline(create_tenant, initial_timeline_id) + .context("Failed to create initial tenant and timeline for pageserver"); + match &init_result { + Ok(initial_timeline_id) => { + println!("Successfully initialized timeline {initial_timeline_id}") + } + Err(e) => eprintln!("{e:#}"), } + self.stop(false)?; + init_result + } - let initial_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); - let initial_timeline_id_string = initial_timeline_id.to_string(); - args.extend(["--initial-timeline-id", &initial_timeline_id_string]); - - let cmd_with_args = cmd.args(args); - let init_output = fill_rust_env_vars(cmd_with_args) - .output() - .with_context(|| { - format!("failed to init pageserver with command {:?}", cmd_with_args) - })?; - - if !init_output.status.success() { - bail!( - "init invocation failed, {}\nStdout: {}\nStderr: {}", - init_output.status, - String::from_utf8_lossy(&init_output.stdout), - String::from_utf8_lossy(&init_output.stderr) - ); - } - - // echo the captured output of the init command - println!("{}", String::from_utf8_lossy(&init_output.stdout)); - - Ok(initial_timeline_id) + fn try_init_timeline( + &self, + new_tenant_id: Option, + new_timeline_id: Option, + ) -> anyhow::Result { + let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?; + let initial_timeline_info = + self.timeline_create(initial_tenant_id, new_timeline_id, None, None)?; + Ok(initial_timeline_info.timeline_id) } pub fn repo_path(&self) -> PathBuf { 
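
Both initialize() above and start_node() in the next hunk follow the same command-line convention: one -D flag for the data directory, an optional --init or --update-config switch, and every configuration override rendered as a key='value' TOML fragment behind its own -c flag. A minimal sketch of that convention, with a placeholder binary lookup and made-up override values:

use std::process::Command;

/// Hypothetical illustration only (not part of the patch): launch a pageserver
/// binary with a data directory and a list of `-c` configuration overrides.
fn spawn_pageserver(
    datadir: &str,
    update_config: bool,
    overrides: &[String],
) -> std::io::Result<std::process::ExitStatus> {
    let mut cmd = Command::new("pageserver"); // assumes the binary is on PATH
    cmd.args(["-D", datadir]);
    if update_config {
        // persist the merged overrides back into pageserver.toml
        cmd.arg("--update-config");
    }
    for config_override in overrides {
        // every override is a self-contained TOML fragment, e.g. "id=1"
        cmd.arg("-c").arg(config_override);
    }
    cmd.status()
}

// Example call (values are placeholders):
// spawn_pageserver(".neon", true, &["id=1".into(), "listen_pg_addr='127.0.0.1:6400'".into()])
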
@@ -210,15 +191,35 @@ impl PageServerNode { } pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> { - print!( + self.start_node(config_overrides, &self.repo_path(), false) + } + + fn start_node( + &self, + config_overrides: &[&str], + datadir: &Path, + update_config: bool, + ) -> anyhow::Result<()> { + println!( "Starting pageserver at '{}' in '{}'", connection_address(&self.pg_connection_config), - self.repo_path().display() + datadir.display() ); - io::stdout().flush().unwrap(); + io::stdout().flush()?; - let repo_path = self.repo_path(); - let mut args = vec!["-D", repo_path.to_str().unwrap()]; + let mut args = vec![ + "-D", + datadir.to_str().with_context(|| { + format!( + "Datadir path '{}' cannot be represented as a unicode string", + datadir.display() + ) + })?, + ]; + + if update_config { + args.push("--update-config"); + } for config_override in config_overrides { args.extend(["-c", config_override]); @@ -230,8 +231,8 @@ impl PageServerNode { if !filled_cmd.status()?.success() { bail!( - "Pageserver failed to start. See '{}' for details.", - self.repo_path().join("pageserver.log").display() + "Pageserver failed to start. See console output and '{}' for details.", + datadir.join("pageserver.log").display() ); } @@ -240,7 +241,7 @@ impl PageServerNode { const RETRIES: i8 = 15; for retries in 1..RETRIES { match self.check_status() { - Ok(_) => { + Ok(()) => { println!("\nPageserver started"); return Ok(()); } @@ -254,21 +255,18 @@ impl PageServerNode { if retries == 5 { println!() // put a line break after dots for second message } - println!( - "Pageserver not responding yet, err {} retrying ({})...", - err, retries - ); + println!("Pageserver not responding yet, err {err} retrying ({retries})..."); } } PageserverHttpError::Response(msg) => { - bail!("pageserver failed to start: {} ", msg) + bail!("pageserver failed to start: {msg} ") } } thread::sleep(Duration::from_secs(1)); } } } - bail!("pageserver failed to start in {} seconds", RETRIES); + bail!("pageserver failed to start in {RETRIES} seconds"); } /// @@ -298,15 +296,11 @@ impl PageServerNode { match kill(pid, sig) { Ok(_) => (), Err(Errno::ESRCH) => { - println!( - "Pageserver with pid {} does not exist, but a PID file was found", - pid - ); + println!("Pageserver with pid {pid} does not exist, but a PID file was found"); return Ok(()); } Err(err) => bail!( - "Failed to send signal to pageserver with pid {}: {}", - pid, + "Failed to send signal to pageserver with pid {pid}: {}", err.desc() ), } @@ -335,13 +329,13 @@ impl PageServerNode { thread::sleep(Duration::from_millis(100)); } - bail!("Failed to stop pageserver with pid {}", pid); + bail!("Failed to stop pageserver with pid {pid}"); } pub fn page_server_psql(&self, sql: &str) -> Vec { let mut client = self.pg_connection_config.connect(NoTls).unwrap(); - println!("Pageserver query: '{}'", sql); + println!("Pageserver query: '{sql}'"); client.simple_query(sql).unwrap() } @@ -376,9 +370,8 @@ impl PageServerNode { &self, new_tenant_id: Option, settings: HashMap<&str, &str>, - ) -> anyhow::Result> { - let tenant_id_string = self - .http_request(Method::POST, format!("{}/tenant", self.http_base_url)) + ) -> anyhow::Result { + self.http_request(Method::POST, format!("{}/tenant", self.http_base_url)) .json(&TenantCreateRequest { new_tenant_id, checkpoint_distance: settings @@ -417,18 +410,16 @@ impl PageServerNode { }) .send()? .error_from_body()? 
- .json::>()?; - - tenant_id_string - .map(|id| { - id.parse().with_context(|| { - format!( - "Failed to parse tennat creation response as tenant id: {}", - id - ) + .json::>() + .with_context(|| { + format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}") + })? + .context("No tenant id was found in the tenant creation response") + .and_then(|tenant_id_string| { + tenant_id_string.parse().with_context(|| { + format!("Failed to parse response string as tenant id: '{tenant_id_string}'") }) }) - .transpose() } pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> { @@ -499,22 +490,27 @@ impl PageServerNode { new_timeline_id: Option, ancestor_start_lsn: Option, ancestor_timeline_id: Option, - ) -> anyhow::Result> { - let timeline_info_response = self - .http_request( - Method::POST, - format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), + ) -> anyhow::Result { + self.http_request( + Method::POST, + format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), + ) + .json(&TimelineCreateRequest { + new_timeline_id, + ancestor_start_lsn, + ancestor_timeline_id, + }) + .send()? + .error_from_body()? + .json::>() + .with_context(|| { + format!("Failed to parse timeline creation response for tenant id: {tenant_id}") + })? + .with_context(|| { + format!( + "No timeline id was found in the timeline creation response for tenant {tenant_id}" ) - .json(&TimelineCreateRequest { - new_timeline_id, - ancestor_start_lsn, - ancestor_timeline_id, - }) - .send()? - .error_from_body()? - .json::>()?; - - Ok(timeline_info_response) + }) } /// Import a basebackup prepared using either: diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh deleted file mode 100755 index 75dbdaed7a..0000000000 --- a/docker-entrypoint.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh -set -eux - -pageserver_id_param="${NODE_ID:-10}" - -broker_endpoints_param="${BROKER_ENDPOINT:-absent}" -if [ "$broker_endpoints_param" != "absent" ]; then - broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']" -else - broker_endpoints_param='' -fi - -remote_storage_param="${REMOTE_STORAGE:-}" - -if [ "$1" = 'pageserver' ]; then - if [ ! -d "/data/tenants" ]; then - echo "Initializing pageserver data directory" - pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=${pageserver_id_param}" $broker_endpoints_param $remote_storage_param - fi - echo "Staring pageserver at 0.0.0.0:6400" - pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data -else - "$@" -fi diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index c4dd52e183..78a465539a 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -501,10 +501,10 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { // default_tenantid was generated by the `env.init()` call above let initial_tenant_id = env.default_tenant_id.unwrap(); - // Call 'pageserver init'. + // Initialize pageserver, create initial tenant and timeline. let pageserver = PageServerNode::from_env(&env); let initial_timeline_id = pageserver - .init( + .initialize( Some(initial_tenant_id), initial_timeline_id_arg, &pageserver_config_overrides(init_match), @@ -551,25 +551,15 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an .values_of("config") .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) .unwrap_or_default(); - let new_tenant_id = pageserver - .tenant_create(initial_tenant_id, tenant_conf)? 
- .ok_or_else(|| { - anyhow!("Tenant with id {:?} was already created", initial_tenant_id) - })?; - println!( - "tenant {} successfully created on the pageserver", - new_tenant_id - ); + let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?; + println!("tenant {new_tenant_id} successfully created on the pageserver"); // Create an initial timeline for the new tenant let new_timeline_id = parse_timeline_id(create_match)?; - let timeline = pageserver - .timeline_create(new_tenant_id, new_timeline_id, None, None)? - .context(format!( - "Failed to create initial timeline for tenant {new_tenant_id}" - ))?; - let new_timeline_id = timeline.timeline_id; - let last_record_lsn = timeline + let timeline_info = + pageserver.timeline_create(new_tenant_id, new_timeline_id, None, None)?; + let new_timeline_id = timeline_info.timeline_id; + let last_record_lsn = timeline_info .local .context(format!("Failed to get last record LSN: no local timeline info for timeline {new_timeline_id}"))? .last_record_lsn; @@ -616,20 +606,18 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let new_branch_name = create_match .value_of("branch-name") .ok_or_else(|| anyhow!("No branch name provided"))?; - let timeline = pageserver - .timeline_create(tenant_id, None, None, None)? - .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; - let new_timeline_id = timeline.timeline_id; + let timeline_info = pageserver.timeline_create(tenant_id, None, None, None)?; + let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline + let last_record_lsn = timeline_info .local .expect("no local timeline info") .last_record_lsn; env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( - "Created timeline '{}' at Lsn {} for tenant: {}", - timeline.timeline_id, last_record_lsn, tenant_id, + "Created timeline '{}' at Lsn {last_record_lsn} for tenant: {tenant_id}", + timeline_info.timeline_id ); } Some(("import", import_match)) => { @@ -680,10 +668,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let ancestor_timeline_id = env .get_branch_timeline_id(ancestor_branch_name, tenant_id) .ok_or_else(|| { - anyhow!( - "Found no timeline id for branch name '{}'", - ancestor_branch_name - ) + anyhow!("Found no timeline id for branch name '{ancestor_branch_name}'") })?; let start_lsn = branch_match @@ -691,12 +676,15 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - .map(Lsn::from_str) .transpose() .context("Failed to parse ancestor start Lsn from the request")?; - let timeline = pageserver - .timeline_create(tenant_id, None, start_lsn, Some(ancestor_timeline_id))? - .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; - let new_timeline_id = timeline.timeline_id; + let timeline_info = pageserver.timeline_create( + tenant_id, + None, + start_lsn, + Some(ancestor_timeline_id), + )?; + let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline + let last_record_lsn = timeline_info .local .expect("no local timeline info") .last_record_lsn; @@ -704,11 +692,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( - "Created timeline '{}' at Lsn {} for tenant: {}. 
Ancestor timeline: '{}'", - timeline.timeline_id, last_record_lsn, tenant_id, ancestor_branch_name, + "Created timeline '{}' at Lsn {last_record_lsn} for tenant: {tenant_id}. Ancestor timeline: '{ancestor_branch_name}'", + timeline_info.timeline_id ); } - Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), + Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{sub_name}'"), None => bail!("no tenant subcommand provided"), } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index b539964414..1a13147f42 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,6 +1,6 @@ //! Main entry point for the Page Server executable. -use std::{env, path::Path, str::FromStr}; +use std::{env, ops::ControlFlow, path::Path, str::FromStr}; use tracing::*; use anyhow::{bail, Context, Result}; @@ -13,7 +13,7 @@ use pageserver::{ config::{defaults::*, PageServerConf}, http, page_cache, page_service, profiling, tenant_mgr, thread_mgr, thread_mgr::ThreadKind, - timelines, virtual_file, LOG_FILE_NAME, + virtual_file, LOG_FILE_NAME, }; use utils::{ auth::JwtAuth, @@ -24,7 +24,6 @@ use utils::{ shutdown::exit_now, signals::{self, Signal}, tcp_listener, - zid::{ZTenantId, ZTimelineId}, }; project_git_version!(GIT_VERSION); @@ -42,6 +41,7 @@ fn main() -> anyhow::Result<()> { .about("Materializes WAL stream to pages and serves them to the postgres") .version(&*version()) .arg( + Arg::new("daemonize") .short('d') .long("daemonize") @@ -52,7 +52,7 @@ fn main() -> anyhow::Result<()> { Arg::new("init") .long("init") .takes_value(false) - .help("Initialize pageserver service: creates an initial config, tenant and timeline, if specified"), + .help("Initialize pageserver with all given config overrides"), ) .arg( Arg::new("workdir") @@ -61,20 +61,6 @@ fn main() -> anyhow::Result<()> { .takes_value(true) .help("Working directory for the pageserver"), ) - .arg( - Arg::new("create-tenant") - .long("create-tenant") - .takes_value(true) - .help("Create tenant during init") - .requires("init"), - ) - .arg( - Arg::new("initial-timeline-id") - .long("initial-timeline-id") - .takes_value(true) - .help("Use a specific timeline id during init and tenant creation") - .requires("create-tenant"), - ) // See `settings.md` for more details on the extra configuration patameters pageserver can process .arg( Arg::new("config-override") @@ -85,6 +71,9 @@ fn main() -> anyhow::Result<()> { .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). 
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), ) + .arg(Arg::new("update-config").long("update-config").takes_value(false).help( + "Update the config file when started", + )) .arg( Arg::new("enabled-features") .long("enabled-features") @@ -110,18 +99,6 @@ fn main() -> anyhow::Result<()> { .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?; let cfg_file_path = workdir.join("pageserver.toml"); - let init = arg_matches.is_present("init"); - let create_tenant = arg_matches - .value_of("create-tenant") - .map(ZTenantId::from_str) - .transpose() - .context("Failed to parse tenant id from the arguments")?; - let initial_timeline_id = arg_matches - .value_of("initial-timeline-id") - .map(ZTimelineId::from_str) - .transpose() - .context("Failed to parse timeline id from the arguments")?; - // Set CWD to workdir for non-daemon modes env::set_current_dir(&workdir).with_context(|| { format!( @@ -131,30 +108,86 @@ fn main() -> anyhow::Result<()> { })?; let daemonize = arg_matches.is_present("daemonize"); - if init && daemonize { - bail!("--daemonize cannot be used with --init") - } - let mut toml = if init { - // We're initializing the repo, so there's no config file yet - DEFAULT_CONFIG_FILE - .parse::() - .context("could not parse built-in config file")? - } else { - // Supplement the CLI arguments with the config file - let cfg_file_contents = std::fs::read_to_string(&cfg_file_path) - .with_context(|| format!("No pageserver config at '{}'", cfg_file_path.display()))?; - cfg_file_contents - .parse::() - .with_context(|| { - format!( - "Failed to read '{}' as pageserver config", - cfg_file_path.display() - ) - })? + let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? 
{ + ControlFlow::Continue(conf) => conf, + ControlFlow::Break(()) => { + info!("Pageserver config init successful"); + return Ok(()); + } + }; + + let tenants_path = conf.tenants_path(); + if !tenants_path.exists() { + utils::crashsafe_dir::create_dir_all(conf.tenants_path()).with_context(|| { + format!( + "Failed to create tenants root dir at '{}'", + tenants_path.display() + ) + })?; + } + + // Initialize up failpoints support + let scenario = FailScenario::setup(); + + // Basic initialization of things that don't change after startup + virtual_file::init(conf.max_file_descriptors); + page_cache::init(conf.page_cache_size); + + start_pageserver(conf, daemonize).context("Failed to start pageserver")?; + + scenario.teardown(); + Ok(()) +} + +fn initialize_config( + cfg_file_path: &Path, + arg_matches: clap::ArgMatches, + workdir: &Path, +) -> anyhow::Result> { + let init = arg_matches.is_present("init"); + let update_config = init || arg_matches.is_present("update-config"); + + let (mut toml, config_file_exists) = if cfg_file_path.is_file() { + if init { + anyhow::bail!( + "Config file '{}' already exists, cannot init it, use --update-config to update it", + cfg_file_path.display() + ); + } + // Supplement the CLI arguments with the config file + let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| { + format!( + "Failed to read pageserver config at '{}'", + cfg_file_path.display() + ) + })?; + ( + cfg_file_contents + .parse::() + .with_context(|| { + format!( + "Failed to parse '{}' as pageserver config", + cfg_file_path.display() + ) + })?, + true, + ) + } else if cfg_file_path.exists() { + anyhow::bail!( + "Config file '{}' exists but is not a regular file", + cfg_file_path.display() + ); + } else { + // We're initializing the repo, so there's no config file yet + ( + DEFAULT_CONFIG_FILE + .parse::() + .context("could not parse built-in config file")?, + false, + ) }; - // Process any extra options given with -c if let Some(values) = arg_matches.values_of("config-override") { for option_line in values { let doc = toml_edit::Document::from_str(option_line).with_context(|| { @@ -165,49 +198,38 @@ fn main() -> anyhow::Result<()> { })?; for (key, item) in doc.iter() { - if key == "id" { - anyhow::ensure!( - init, - "node id can only be set during pageserver init and cannot be overridden" - ); + if config_file_exists && update_config && key == "id" && toml.contains_key(key) { + anyhow::bail!("Pageserver config file exists at '{}' and has node id already, it cannot be overridden", cfg_file_path.display()); } toml.insert(key, item.clone()); } } } - trace!("Resulting toml: {}", toml); - let conf = PageServerConf::parse_and_validate(&toml, &workdir) + + debug!("Resulting toml: {toml}"); + let conf = PageServerConf::parse_and_validate(&toml, workdir) .context("Failed to parse pageserver configuration")?; - // The configuration is all set up now. Turn it into a 'static - // that can be freely stored in structs and passed across threads - // as a ref. 
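
The removed lines here are where the old main() leaked the parsed configuration into a &'static reference; initialize_config() keeps the same Box::leak trick but wraps the result in ControlFlow so that --init can stop before the server starts. A compact sketch of that shape, with a placeholder config type:

use std::ops::ControlFlow;

struct PageServerConf {
    id: u32,
}

/// Sketch only: `init_only` stands in for the `--init` flag.
fn finish_init(conf: PageServerConf, init_only: bool) -> ControlFlow<(), &'static PageServerConf> {
    if init_only {
        // `--init` writes the config file and stops; nothing to hand back.
        ControlFlow::Break(())
    } else {
        // Leak the config so every thread can share a plain &'static reference
        // for the lifetime of the process.
        ControlFlow::Continue(Box::leak(Box::new(conf)))
    }
}

fn main() {
    match finish_init(PageServerConf { id: 1 }, false) {
        ControlFlow::Continue(conf) => println!("starting pageserver id={}", conf.id),
        ControlFlow::Break(()) => println!("config init successful"),
    }
}

Leaking a handful of bytes once at startup is a deliberate trade: the configuration lives for the whole process anyway, and a plain &'static avoids threading an Arc through every component.
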
- let conf: &'static PageServerConf = Box::leak(Box::new(conf)); + if update_config { + info!("Writing pageserver config to '{}'", cfg_file_path.display()); - // Initialize up failpoints support - let scenario = FailScenario::setup(); - - // Basic initialization of things that don't change after startup - virtual_file::init(conf.max_file_descriptors); - page_cache::init(conf.page_cache_size); - - // Create repo and exit if init was requested - if init { - timelines::init_pageserver(conf, create_tenant, initial_timeline_id) - .context("Failed to init pageserver")?; - // write the config file std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| { format!( - "Failed to initialize pageserver config at '{}'", + "Failed to write pageserver config to '{}'", cfg_file_path.display() ) })?; - } else { - start_pageserver(conf, daemonize).context("Failed to start pageserver")?; + info!( + "Config successfully written to '{}'", + cfg_file_path.display() + ) } - scenario.teardown(); - Ok(()) + Ok(if init { + ControlFlow::Break(()) + } else { + ControlFlow::Continue(Box::leak(Box::new(conf))) + }) } fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index d90cd7371a..64f1caa542 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -9,7 +9,6 @@ use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; use crate::thread_mgr::ThreadKind; -use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; use crate::{thread_mgr, timelines, walreceiver}; use anyhow::Context; @@ -284,10 +283,8 @@ pub fn create_tenant_repository( conf, tenant_conf, tenant_id, - CreateRepo::Real { - wal_redo_manager, - remote_index, - }, + wal_redo_manager, + remote_index, )?; v.insert(Tenant { state: TenantState::Idle, diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index c5b938c5fe..ed5975d3bd 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -13,17 +13,17 @@ use std::{ use tracing::*; use utils::{ - crashsafe_dir, logging, + crashsafe_dir, lsn::Lsn, zid::{ZTenantId, ZTimelineId}, }; +use crate::import_datadir; use crate::tenant_mgr; use crate::{ config::PageServerConf, repository::Repository, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, }; -use crate::{import_datadir, LOG_FILE_NAME}; use crate::{ layered_repository::{LayeredRepository, LayeredTimeline}, walredo::WalRedoManager, @@ -36,69 +36,13 @@ pub struct PointInTime { pub lsn: Lsn, } -pub fn init_pageserver( - conf: &'static PageServerConf, - create_tenant: Option, - initial_timeline_id: Option, -) -> anyhow::Result<()> { - // Initialize logger - // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages - let _log_file = logging::init(LOG_FILE_NAME, true)?; - - crashsafe_dir::create_dir_all(conf.tenants_path())?; - - if let Some(tenant_id) = create_tenant { - println!("initializing tenantid {}", tenant_id); - let repo = create_repo(conf, TenantConfOpt::default(), tenant_id, CreateRepo::Dummy) - .context("failed to create repo")?; - let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); - bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref()) - .context("failed to create initial timeline")?; - println!("initial timeline {} 
created", new_timeline_id) - } else if initial_timeline_id.is_some() { - println!("Ignoring initial timeline parameter, due to no tenant id to create given"); - } - - println!("pageserver init succeeded"); - Ok(()) -} - -pub enum CreateRepo { - Real { - wal_redo_manager: Arc, - remote_index: RemoteIndex, - }, - Dummy, -} - pub fn create_repo( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: ZTenantId, - create_repo: CreateRepo, + wal_redo_manager: Arc, + remote_index: RemoteIndex, ) -> Result> { - let (wal_redo_manager, remote_index) = match create_repo { - CreateRepo::Real { - wal_redo_manager, - remote_index, - } => (wal_redo_manager, remote_index), - CreateRepo::Dummy => { - // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo - // process during repository initialization. - // - // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched - // initdb in the background, and it kept running even after the "zenith init" had exited. - // In tests, we started the page server immediately after that, so that initdb was still - // running in the background, and we failed to run initdb again in the same directory. This - // has been solved for the rapid init+start case now, but the general race condition remains - // if you restart the server quickly. The WAL redo manager doesn't use a separate thread - // anymore, but I think that could still happen. - let wal_redo_manager = Arc::new(crate::walredo::DummyRedoManager {}); - - (wal_redo_manager as _, RemoteIndex::default()) - } - }; - let repo_dir = conf.tenant_path(&tenant_id); ensure!( !repo_dir.exists(), diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 85f970a941..57817dbc9c 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -82,24 +82,6 @@ pub trait WalRedoManager: Send + Sync { ) -> Result; } -/// -/// A dummy WAL Redo Manager implementation that doesn't allow replaying -/// anything. Currently used during bootstrapping (zenith init), to create -/// a Repository object without launching the real WAL redo process. 
-/// -pub struct DummyRedoManager {} -impl crate::walredo::WalRedoManager for DummyRedoManager { - fn request_redo( - &self, - _key: Key, - _lsn: Lsn, - _base_img: Option, - _records: Vec<(Lsn, ZenithWalRecord)>, - ) -> Result { - Err(WalRedoError::InvalidState) - } -} - // Metrics collected on WAL redo operations // // We collect the time spent in actual WAL redo ('redo'), and time waiting diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 51df41699a..710b220ae8 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,7 +1,11 @@ from typing import Optional from uuid import uuid4, UUID import pytest +import pathlib +import os +import subprocess from fixtures.utils import lsn_from_hex +from fixtures.log_helper import log from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, @@ -9,16 +13,43 @@ from fixtures.neon_fixtures import ( NeonPageserverHttpClient, NeonPageserverApiException, wait_until, + neon_binpath, + pg_distrib_dir, ) -# test that we cannot override node id -def test_pageserver_init_node_id(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init() - with pytest.raises( - Exception, - match="node id can only be set during pageserver init and cannot be overridden"): - env.pageserver.start(overrides=['--pageserver-config-override=id=10']) +# test that we cannot override node id after init +def test_pageserver_init_node_id(neon_simple_env: NeonEnv): + repo_dir = neon_simple_env.repo_dir + pageserver_config = repo_dir / 'pageserver.toml' + pageserver_bin = pathlib.Path(neon_binpath) / 'pageserver' + run_pageserver = lambda args: subprocess.run([str(pageserver_bin), '-D', str(repo_dir), *args], + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + # remove initial config + pageserver_config.unlink() + + bad_init = run_pageserver(['--init', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) + assert bad_init.returncode == 1, 'pageserver should not be able to init new config without the node id' + assert "missing id" in bad_init.stderr + assert not pageserver_config.exists(), 'config file should not be created after init error' + + completed_init = run_pageserver( + ['--init', '-c', 'id = 12345', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) + assert completed_init.returncode == 0, 'pageserver should be able to create a new config with the node id given' + assert pageserver_config.exists(), 'config file should be created successfully' + + bad_reinit = run_pageserver( + ['--init', '-c', 'id = 12345', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) + assert bad_reinit.returncode == 1, 'pageserver should not be able to init new config without the node id' + assert "already exists, cannot init it" in bad_reinit.stderr + + bad_update = run_pageserver(['--update-config', '-c', 'id = 3']) + assert bad_update.returncode == 1, 'pageserver should not allow updating node id' + assert "has node id already, it cannot be overridden" in bad_update.stderr def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID): diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 176ca740fe..eb65e2e3b5 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -44,30 +44,22 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, cannot use NeonPageserver yet because it depends on 
neon cli which currently lacks support for multiple pageservers """ - cmd = [ - str(pageserver_bin), - '--init', - '--workdir', - str(new_pageserver_dir), - f"-c listen_pg_addr='localhost:{pg_port}'", - f"-c listen_http_addr='localhost:{http_port}'", - f"-c pg_distrib_dir='{pg_distrib_dir}'", - f"-c id=2", - f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", - ] - - if broker is not None: - cmd.append(f"-c broker_endpoints=['{broker.client_url()}']", ) - - subprocess.check_output(cmd, text=True) - # actually run new pageserver cmd = [ str(pageserver_bin), '--workdir', str(new_pageserver_dir), '--daemonize', + '--update-config', + f"-c listen_pg_addr='localhost:{pg_port}'", + f"-c listen_http_addr='localhost:{http_port}'", + f"-c pg_distrib_dir='{pg_distrib_dir}'", + f"-c id=2", + f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", ] + if broker is not None: + cmd.append(f"-c broker_endpoints=['{broker.client_url()}']", ) + log.info("starting new pageserver %s", cmd) out = subprocess.check_output(cmd, text=True) log.info("started new pageserver %s", out) From 92bdf04758f6c024737927a4db76769458c8f091 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Thu, 18 Aug 2022 09:41:24 +0200 Subject: [PATCH 0644/1022] Fix: Always build images (#2296) * Always build images * Remove unused Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9b46d36c92..6b76b6e5fc 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -415,30 +415,8 @@ jobs: } }" - dockerfile-check: - if: github.event_name != 'workflow_dispatch' - runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest - outputs: - value: ${{ steps.dockerfile-check.outputs.any_changed }} - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Get specific changed files - id: dockerfile-check - uses: tj-actions/changed-files@802732316a11c01531ea72773ec7998155238e31 # v25 - with: - files: | - Dockerfile - Dockerfile.compute-tools - ./vendor/postgres/Dockerfile - neon-image: - # force building for all 3 images - if: needs.dockerfile-check.outputs.value == 'true' runs-on: dev - needs: [ dockerfile-check ] container: gcr.io/kaniko-project/executor:v1.9.0-debug steps: @@ -455,9 +433,7 @@ jobs: run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID compute-tools-image: - if: needs.dockerfile-check.outputs.value == 'true' runs-on: dev - needs: [ dockerfile-check ] container: gcr.io/kaniko-project/executor:v1.9.0-debug steps: @@ -471,9 +447,7 @@ jobs: run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . 
--dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID compute-node-image: - if: needs.dockerfile-check.outputs.value == 'true' runs-on: dev - needs: [ dockerfile-check ] container: gcr.io/kaniko-project/executor:v1.9.0-debug steps: From 9bc12f7444e8a574d4ac5cfe2b213a91390342c8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 26 Jul 2022 13:45:28 +0300 Subject: [PATCH 0645/1022] Move auto-generated 'bindings' to a separate inner module. Re-export only things that are used by other modules. In the future, I'm imagining that we run bindgen twice, for Postgres v14 and v15. The two sets of bindings would go into separate 'bindings_v14' and 'bindings_v15' modules. Rearrange postgres_ffi modules. Move function, to avoid Postgres version dependency in timelines.rs Move function to generate a logical-message WAL record to postgres_ffi. --- libs/postgres_ffi/src/controlfile_utils.rs | 2 +- libs/postgres_ffi/src/lib.rs | 59 +++++++-- libs/postgres_ffi/src/nonrelfile_utils.rs | 5 +- libs/postgres_ffi/src/pg_constants.rs | 9 +- libs/postgres_ffi/src/relfile_utils.rs | 2 +- libs/postgres_ffi/src/waldecoder.rs | 5 +- libs/postgres_ffi/src/xlog_utils.rs | 113 ++++++++++++++++-- libs/postgres_ffi/wal_craft/src/lib.rs | 4 +- pageserver/src/basebackup.rs | 23 ++-- pageserver/src/import_datadir.rs | 32 +++-- pageserver/src/keyspace.rs | 4 +- pageserver/src/layered_repository/timeline.rs | 2 +- pageserver/src/page_cache.rs | 2 +- pageserver/src/page_service.rs | 10 +- pageserver/src/pgdatadir_mapping.rs | 18 +-- pageserver/src/reltag.rs | 5 +- pageserver/src/timelines.rs | 14 +-- pageserver/src/walingest.rs | 16 +-- .../src/walreceiver/walreceiver_connection.rs | 2 +- pageserver/src/walrecord.rs | 17 ++- pageserver/src/walredo.rs | 21 ++-- safekeeper/src/handler.rs | 2 +- safekeeper/src/json_ctrl.rs | 97 +-------------- safekeeper/src/metrics.rs | 2 +- safekeeper/src/safekeeper.rs | 5 +- safekeeper/src/send_wal.rs | 2 +- safekeeper/src/timeline.rs | 3 +- safekeeper/src/wal_backup.rs | 3 +- safekeeper/src/wal_storage.rs | 10 +- 29 files changed, 265 insertions(+), 224 deletions(-) diff --git a/libs/postgres_ffi/src/controlfile_utils.rs b/libs/postgres_ffi/src/controlfile_utils.rs index 4df2342b90..0918d15001 100644 --- a/libs/postgres_ffi/src/controlfile_utils.rs +++ b/libs/postgres_ffi/src/controlfile_utils.rs @@ -23,7 +23,7 @@ //! information. You can use PostgreSQL's pg_controldata utility to view its //! contents. //! -use crate::{ControlFileData, PG_CONTROL_FILE_SIZE}; +use super::bindings::{ControlFileData, PG_CONTROL_FILE_SIZE}; use anyhow::{bail, Result}; use bytes::{Bytes, BytesMut}; diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 28d9a13dbf..022355329c 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -7,21 +7,62 @@ // https://github.com/rust-lang/rust-bindgen/issues/1651 #![allow(deref_nullptr)] -use serde::{Deserialize, Serialize}; use utils::lsn::Lsn; -include!(concat!(env!("OUT_DIR"), "/bindings.rs")); +macro_rules! postgres_ffi { + ($version:ident) => { + #[path = "."] + pub mod $version { + // fixme: does this have to be 'pub'? 
+ pub mod bindings { + // bindgen generates bindings for a lot of stuff we don't need + #![allow(dead_code)] -pub mod controlfile_utils; -pub mod nonrelfile_utils; -pub mod pg_constants; -pub mod relfile_utils; -pub mod waldecoder; -pub mod xlog_utils; + use serde::{Deserialize, Serialize}; + include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + } + pub mod controlfile_utils; + pub mod nonrelfile_utils; + pub mod pg_constants; + pub mod relfile_utils; + pub mod waldecoder; + pub mod xlog_utils; + + // Re-export some symbols from bindings + pub use bindings::DBState_DB_SHUTDOWNED; + pub use bindings::{CheckPoint, ControlFileData, XLogRecord}; + } + }; +} + +postgres_ffi!(v14); + +// Export some widely used datatypes that are unlikely to change across Postgres versions +pub use v14::bindings::{uint32, uint64, Oid}; +pub use v14::bindings::{BlockNumber, OffsetNumber}; +pub use v14::bindings::{MultiXactId, TransactionId}; + +// Likewise for these, although the assumption that these don't change is a little more iffy. +pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; + +// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and +// --with-segsize=SEGSIZE, but assume the defaults for now. +pub const BLCKSZ: u16 = 8192; +pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32); +pub const XLOG_BLCKSZ: usize = 8192; + +// PG timeline is always 1, changing it doesn't have any useful meaning in Neon. +// +// NOTE: this is not to be confused with Neon timelines; different concept! +// +// It's a shaky assumption, that it's always 1. We might import a +// PostgreSQL data directory that has gone through timeline bumps, +// for example. FIXME later. +pub const PG_TLI: u32 = 1; // See TransactionIdIsNormal in transam.h pub const fn transaction_id_is_normal(id: TransactionId) -> bool { - id > pg_constants::FIRST_NORMAL_TRANSACTION_ID + id > v14::pg_constants::FIRST_NORMAL_TRANSACTION_ID } // See TransactionIdPrecedes in transam.c diff --git a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs index b92207cd81..04ef346d88 100644 --- a/libs/postgres_ffi/src/nonrelfile_utils.rs +++ b/libs/postgres_ffi/src/nonrelfile_utils.rs @@ -1,11 +1,12 @@ //! //! Common utilities for dealing with PostgreSQL non-relation files. //! -use crate::{pg_constants, transaction_id_precedes}; +use crate::transaction_id_precedes; +use super::pg_constants; use bytes::BytesMut; use log::*; -use crate::MultiXactId; +use super::bindings::MultiXactId; pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) { trace!( diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 7230b841f5..42b5c5d842 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -7,7 +7,8 @@ //! comments on them. //! -use crate::PageHeaderData; +use super::bindings::PageHeaderData; +use crate::BLCKSZ; // // From pg_tablespace_d.h @@ -31,11 +32,6 @@ pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001; pub const SMGR_TRUNCATE_VM: u32 = 0x0002; pub const SMGR_TRUNCATE_FSM: u32 = 0x0004; -// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and -// --with-segsize=SEGSIZE, but assume the defaults for now. 
-pub const BLCKSZ: u16 = 8192; -pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32); - // // From bufpage.h // @@ -213,7 +209,6 @@ pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; /* FIXME: pageserver should request wal_seg_size from compute node */ pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; -pub const XLOG_BLCKSZ: usize = 8192; pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; pub const XLP_LONG_HEADER: u16 = 0x0002; diff --git a/libs/postgres_ffi/src/relfile_utils.rs b/libs/postgres_ffi/src/relfile_utils.rs index cc9d6470c0..f3476acc9c 100644 --- a/libs/postgres_ffi/src/relfile_utils.rs +++ b/libs/postgres_ffi/src/relfile_utils.rs @@ -1,7 +1,7 @@ //! //! Common utilities for dealing with PostgreSQL relation files. //! -use crate::pg_constants; +use super::pg_constants; use once_cell::sync::OnceCell; use regex::Regex; diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index cbb761236c..0e1c9567cb 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -10,10 +10,7 @@ //! use super::pg_constants; use super::xlog_utils::*; -use super::XLogLongPageHeaderData; -use super::XLogPageHeaderData; -use super::XLogRecord; -use super::XLOG_PAGE_MAGIC; +use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use crc32c::*; use log::*; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 956f53ce85..e7838c3f2c 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -7,22 +7,24 @@ // have been named the same as the corresponding PostgreSQL functions instead. // -use crate::pg_constants; -use crate::CheckPoint; -use crate::FullTransactionId; -use crate::XLogLongPageHeaderData; -use crate::XLogPageHeaderData; -use crate::XLogRecord; -use crate::XLOG_PAGE_MAGIC; +use crc32c::crc32c_append; -use crate::pg_constants::WAL_SEGMENT_SIZE; -use crate::waldecoder::WalStreamDecoder; +use super::bindings::{ + CheckPoint, FullTransactionId, XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, + XLOG_PAGE_MAGIC, +}; +use super::pg_constants; +use super::pg_constants::WAL_SEGMENT_SIZE; +use crate::v14::waldecoder::WalStreamDecoder; +use crate::PG_TLI; +use crate::{uint32, uint64, Oid}; use bytes::BytesMut; use bytes::{Buf, Bytes}; use log::*; +use serde::Serialize; use std::fs::File; use std::io::prelude::*; use std::io::ErrorKind; @@ -47,9 +49,6 @@ pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::(); #[allow(clippy::identity_op)] pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; -// PG timeline is always 1, changing it doesn't have useful meaning in Zenith. -pub const PG_TLI: u32 = 1; - pub type XLogRecPtr = u64; pub type TimeLineID = u32; pub type TimestampTz = i64; @@ -346,6 +345,85 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result Bytes { + use utils::bin_ser::LeSer; + self.ser().unwrap().into() + } +} + +/// Create new WAL record for non-transactional logical message. +/// Used for creating artificial WAL for tests, as LogicalMessage +/// record is basically no-op. +/// +/// NOTE: This leaves the xl_prev field zero. The safekeeper and +/// pageserver tolerate that, but PostgreSQL does not. 
+pub fn encode_logical_message(prefix: &str, message: &str) -> Vec { + let mut prefix_bytes: Vec = Vec::with_capacity(prefix.len() + 1); + prefix_bytes.write_all(prefix.as_bytes()).unwrap(); + prefix_bytes.push(0); + + let message_bytes = message.as_bytes(); + + let logical_message = XlLogicalMessage { + db_id: 0, + transactional: 0, + prefix_size: prefix_bytes.len() as u64, + message_size: message_bytes.len() as u64, + }; + + let mainrdata = logical_message.encode(); + let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len(); + // only short mainrdata is supported for now + assert!(mainrdata_len <= 255); + let mainrdata_len = mainrdata_len as u8; + + let mut data: Vec = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len]; + data.extend_from_slice(&mainrdata); + data.extend_from_slice(&prefix_bytes); + data.extend_from_slice(message_bytes); + + let total_len = XLOG_SIZE_OF_XLOG_RECORD + data.len(); + + let mut header = XLogRecord { + xl_tot_len: total_len as u32, + xl_xid: 0, + xl_prev: 0, + xl_info: 0, + xl_rmid: 21, + __bindgen_padding_0: [0u8; 2usize], + xl_crc: 0, // crc will be calculated later + }; + + let header_bytes = header.encode().expect("failed to encode header"); + let crc = crc32c_append(0, &data); + let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]); + header.xl_crc = crc; + + let mut wal: Vec = Vec::new(); + wal.extend_from_slice(&header.encode().expect("failed to encode header")); + wal.extend_from_slice(&data); + + // WAL start position must be aligned at 8 bytes, + // this will add padding for the next WAL record. + const PADDING: usize = 8; + let padding_rem = wal.len() % PADDING; + if padding_rem != 0 { + wal.resize(wal.len() + PADDING - padding_rem, 0); + } + + wal +} + #[cfg(test)] mod tests { use super::*; @@ -547,4 +625,15 @@ mod tests { checkpoint.update_next_xid(1024); assert_eq!(checkpoint.nextXid.value, 2048); } + + #[test] + pub fn test_encode_logical_message() { + let expected = [ + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, + 38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, + 101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, + ]; + let actual = encode_logical_message("prefix", "message"); + assert_eq!(expected, actual[..]); + } } diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index e3b666da41..6ac5afb27f 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -4,8 +4,8 @@ use log::*; use once_cell::sync::Lazy; use postgres::types::PgLsn; use postgres::Client; -use postgres_ffi::pg_constants::WAL_SEGMENT_SIZE; -use postgres_ffi::xlog_utils::{ +use postgres_ffi::v14::pg_constants::WAL_SEGMENT_SIZE; +use postgres_ffi::v14::xlog_utils::{ XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, }; use std::cmp::Ordering; diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 5837447ce8..33f072553f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -24,8 +24,13 @@ use tracing::*; use crate::reltag::{RelTag, SlruKind}; use crate::DatadirTimeline; -use postgres_ffi::xlog_utils::*; -use postgres_ffi::*; + +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName}; +use postgres_ffi::v14::{CheckPoint, ControlFileData}; +use postgres_ffi::TransactionId; +use postgres_ffi::PG_TLI; +use postgres_ffi::{BLCKSZ, 
RELSEG_SIZE}; use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, @@ -200,7 +205,7 @@ where } // Add a file for each chunk of blocks (aka segment) - let chunks = (0..nblocks).chunks(pg_constants::RELSEG_SIZE as usize); + let chunks = (0..nblocks).chunks(RELSEG_SIZE as usize); for (seg, blocks) in chunks.into_iter().enumerate() { let mut segment_data: Vec = vec![]; for blknum in blocks { @@ -220,23 +225,19 @@ where fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?; - let mut slru_buf: Vec = - Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize); + let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { let img = self .timeline .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?; if slru == SlruKind::Clog { - ensure!( - img.len() == pg_constants::BLCKSZ as usize - || img.len() == pg_constants::BLCKSZ as usize + 8 - ); + ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8); } else { - ensure!(img.len() == pg_constants::BLCKSZ as usize); + ensure!(img.len() == BLCKSZ as usize); } - slru_buf.extend_from_slice(&img[..pg_constants::BLCKSZ as usize]); + slru_buf.extend_from_slice(&img[..BLCKSZ as usize]); } let segname = format!("{}/{:>04X}", slru.to_str(), segno); diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 7d1e8e43aa..729829c5e8 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -15,13 +15,24 @@ use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; -use postgres_ffi::relfile_utils::*; -use postgres_ffi::waldecoder::*; -use postgres_ffi::xlog_utils::*; +use postgres_ffi::v14::relfile_utils::*; +use postgres_ffi::v14::waldecoder::*; +use postgres_ffi::v14::xlog_utils::*; +use postgres_ffi::v14::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; use postgres_ffi::Oid; -use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; +use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; +// Returns checkpoint LSN from controlfile +pub fn get_lsn_from_controlfile(path: &Path) -> Result { + // Read control file to extract the LSN + let controlfile_path = path.join("global").join("pg_control"); + let controlfile = ControlFileData::decode(&std::fs::read(controlfile_path)?)?; + let lsn = controlfile.checkPoint; + + Ok(Lsn(lsn)) +} + /// /// Import all relation data pages from local disk into the repository. /// @@ -110,8 +121,8 @@ fn import_rel( let mut buf: [u8; 8192] = [0u8; 8192]; - ensure!(len % pg_constants::BLCKSZ as usize == 0); - let nblocks = len / pg_constants::BLCKSZ as usize; + ensure!(len % BLCKSZ as usize == 0); + let nblocks = len / BLCKSZ as usize; let rel = RelTag { spcnode: spcoid, @@ -120,7 +131,7 @@ fn import_rel( forknum, }; - let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32); + let mut blknum: u32 = segno * (1024 * 1024 * 1024 / BLCKSZ as u32); // Call put_rel_creation for every segment of the relation, // because there is no guarantee about the order in which we are processing segments. @@ -144,8 +155,7 @@ fn import_rel( Err(err) => match err.kind() { std::io::ErrorKind::UnexpectedEof => { // reached EOF. That's expected. 
- let relative_blknum = - blknum - segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32); + let relative_blknum = blknum - segno * (1024 * 1024 * 1024 / BLCKSZ as u32); ensure!(relative_blknum == nblocks as u32, "unexpected EOF"); break; } @@ -184,8 +194,8 @@ fn import_slru( .to_string_lossy(); let segno = u32::from_str_radix(filename, 16)?; - ensure!(len % pg_constants::BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ - let nblocks = len / pg_constants::BLCKSZ as usize; + ensure!(len % BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ + let nblocks = len / BLCKSZ as usize; ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize); diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs index da213704f3..64024a2d8d 100644 --- a/pageserver/src/keyspace.rs +++ b/pageserver/src/keyspace.rs @@ -1,5 +1,5 @@ use crate::repository::{key_range_size, singleton_range, Key}; -use postgres_ffi::pg_constants; +use postgres_ffi::BLCKSZ; use std::ops::Range; /// @@ -19,7 +19,7 @@ impl KeySpace { /// pub fn partition(&self, target_size: u64) -> KeyPartitioning { // Assume that each value is 8k in size. - let target_nblocks = (target_size / pg_constants::BLCKSZ as u64) as usize; + let target_nblocks = (target_size / BLCKSZ as u64) as usize; let mut parts = Vec::new(); let mut current_part = Vec::new(); diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index e27619cc83..6ef4915bdb 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -45,7 +45,7 @@ use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; use crate::DatadirTimeline; -use postgres_ffi::xlog_utils::to_pg_timestamp; +use postgres_ffi::v14::xlog_utils::to_pg_timestamp; use utils::{ lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 716df0f749..818eaf1b8f 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -83,7 +83,7 @@ pub fn get() -> &'static PageCache { } } -pub const PAGE_SZ: usize = postgres_ffi::pg_constants::BLCKSZ as usize; +pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize; const MAX_USAGE_COUNT: u8 = 5; /// diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 3c5ea5267e..b63bb90be1 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -40,9 +40,10 @@ use crate::thread_mgr; use crate::thread_mgr::ThreadKind; use crate::CheckpointConfig; use metrics::{register_histogram_vec, HistogramVec}; -use postgres_ffi::xlog_utils::to_pg_timestamp; +use postgres_ffi::v14::xlog_utils::to_pg_timestamp; -use postgres_ffi::pg_constants; +use postgres_ffi::v14::pg_constants::DEFAULTTABLESPACE_OID; +use postgres_ffi::BLCKSZ; // Wrapped in libpq CopyData enum PagestreamFeMessage { @@ -725,10 +726,9 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let total_blocks = - timeline.get_db_size(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?; + let total_blocks = timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn)?; - let db_size = total_blocks as i64 * pg_constants::BLCKSZ as i64; + let db_size = total_blocks as i64 * BLCKSZ as i64; Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { db_size, diff --git 
a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 113f40302a..88fac0ad5a 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -13,8 +13,10 @@ use crate::repository::*; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; -use postgres_ffi::xlog_utils::TimestampTz; -use postgres_ffi::{pg_constants, Oid, TransactionId}; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils::TimestampTz; +use postgres_ffi::BLCKSZ; +use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::ops::Range; @@ -297,9 +299,9 @@ pub trait DatadirTimeline: Timeline { let clog_page = self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?; - if clog_page.len() == pg_constants::BLCKSZ as usize + 8 { + if clog_page.len() == BLCKSZ as usize + 8 { let mut timestamp_bytes = [0u8; 8]; - timestamp_bytes.copy_from_slice(&clog_page[pg_constants::BLCKSZ as usize..]); + timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]); let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); if timestamp >= search_timestamp { @@ -382,7 +384,7 @@ pub trait DatadirTimeline: Timeline { total_size += relsize as usize; } } - Ok(total_size * pg_constants::BLCKSZ as usize) + Ok(total_size * BLCKSZ as usize) } /// @@ -912,7 +914,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { result?; if pending_nblocks != 0 { - writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize); + writer.update_current_logical_size(pending_nblocks * BLCKSZ as isize); self.pending_nblocks = 0; } @@ -940,7 +942,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { writer.finish_write(lsn); if pending_nblocks != 0 { - writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize); + writer.update_current_logical_size(pending_nblocks * BLCKSZ as isize); } Ok(()) @@ -1014,7 +1016,7 @@ struct SlruSegmentDirectory { segments: HashSet, } -static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; pg_constants::BLCKSZ as usize]); +static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); // Layout of the Key address space // diff --git a/pageserver/src/reltag.rs b/pageserver/src/reltag.rs index fadd41f547..e3d08f8b3d 100644 --- a/pageserver/src/reltag.rs +++ b/pageserver/src/reltag.rs @@ -2,8 +2,9 @@ use serde::{Deserialize, Serialize}; use std::cmp::Ordering; use std::fmt; -use postgres_ffi::relfile_utils::forknumber_to_name; -use postgres_ffi::{pg_constants, Oid}; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::relfile_utils::forknumber_to_name; +use postgres_ffi::Oid; /// /// Relation data file segment id throughout the Postgres cluster. 
diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index ed5975d3bd..0d35195691 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -3,7 +3,7 @@ // use anyhow::{bail, ensure, Context, Result}; -use postgres_ffi::ControlFileData; + use std::{ fs, path::Path, @@ -69,16 +69,6 @@ pub fn create_repo( ))) } -// Returns checkpoint LSN from controlfile -fn get_lsn_from_controlfile(path: &Path) -> Result { - // Read control file to extract the LSN - let controlfile_path = path.join("global").join("pg_control"); - let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?; - let lsn = controlfile.checkPoint; - - Ok(Lsn(lsn)) -} - // Create the cluster temporarily in 'initdbpath' directory inside the repository // to get bootstrap data for timeline initialization. // @@ -128,7 +118,7 @@ fn bootstrap_timeline( run_initdb(conf, &initdb_path)?; let pgdata_path = initdb_path; - let lsn = get_lsn_from_controlfile(&pgdata_path)?.align(); + let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); // Import the contents of the data directory at the initial checkpoint // LSN, and any WAL after that. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index b8064849e0..1b046b9f33 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -22,8 +22,8 @@ //! bespoken Rust code. use anyhow::Context; -use postgres_ffi::nonrelfile_utils::clogpage_precedes; -use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment; +use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; +use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; use postgres_ffi::{page_is_new, page_set_lsn}; use anyhow::Result; @@ -33,10 +33,12 @@ use tracing::*; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::walrecord::*; -use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment; -use postgres_ffi::xlog_utils::*; +use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils::*; +use postgres_ffi::v14::CheckPoint; use postgres_ffi::TransactionId; -use postgres_ffi::{pg_constants, CheckPoint}; +use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); @@ -293,7 +295,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { // Extract page image from FPI record let img_len = blk.bimg_len as usize; let img_offs = blk.bimg_offset as usize; - let mut image = BytesMut::with_capacity(pg_constants::BLCKSZ as usize); + let mut image = BytesMut::with_capacity(BLCKSZ as usize); image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); if blk.hole_length != 0 { @@ -309,7 +311,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { if !page_is_new(&image) { page_set_lsn(&mut image, lsn) } - assert_eq!(image.len(), pg_constants::BLCKSZ as usize); + assert_eq!(image.len(), BLCKSZ as usize); self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; } else { let rec = ZenithWalRecord::Postgres { diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 16a1f232e3..0688086117 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -27,7 +27,7 @@ use crate::{ walingest::WalIngest, walrecord::DecodedWALRecord, }; -use postgres_ffi::waldecoder::WalStreamDecoder; +use 
postgres_ffi::v14::waldecoder::WalStreamDecoder; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; /// Status of the connection. diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 6b01d52005..c56b1c6c0c 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -3,9 +3,10 @@ //! use anyhow::Result; use bytes::{Buf, Bytes}; -use postgres_ffi::pg_constants; -use postgres_ffi::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD}; -use postgres_ffi::XLogRecord; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD}; +use postgres_ffi::v14::XLogRecord; +use postgres_ffi::BLCKSZ; use postgres_ffi::{BlockNumber, OffsetNumber}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; use serde::{Deserialize, Serialize}; @@ -618,7 +619,7 @@ pub fn decode_wal_record( blk.hole_length = 0; } } else { - blk.hole_length = pg_constants::BLCKSZ - blk.bimg_len; + blk.hole_length = BLCKSZ - blk.bimg_len; } datatotal += blk.bimg_len as u32; blocks_total_len += blk.bimg_len as u32; @@ -628,9 +629,7 @@ pub fn decode_wal_record( * bimg_len < BLCKSZ if the HAS_HOLE flag is set. */ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 - && (blk.hole_offset == 0 - || blk.hole_length == 0 - || blk.bimg_len == pg_constants::BLCKSZ) + && (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ) { // TODO /* @@ -667,7 +666,7 @@ pub fn decode_wal_record( * flag is set. */ if (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0) - && blk.bimg_len == pg_constants::BLCKSZ + && blk.bimg_len == BLCKSZ { // TODO /* @@ -685,7 +684,7 @@ pub fn decode_wal_record( */ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 && blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0 - && blk.bimg_len != pg_constants::BLCKSZ + && blk.bimg_len != BLCKSZ { // TODO /* diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 57817dbc9c..9cf347573a 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -44,11 +44,12 @@ use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::ZenithWalRecord; use metrics::{register_histogram, register_int_counter, Histogram, IntCounter}; -use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift; -use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset; -use postgres_ffi::nonrelfile_utils::mx_offset_to_member_offset; -use postgres_ffi::nonrelfile_utils::transaction_id_set_status; -use postgres_ffi::pg_constants; +use postgres_ffi::v14::nonrelfile_utils::{ + mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, + transaction_id_set_status, +}; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::BLCKSZ; /// /// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster. @@ -417,10 +418,10 @@ impl PostgresRedoManager { } // Append the timestamp - if page.len() == pg_constants::BLCKSZ as usize + 8 { - page.truncate(pg_constants::BLCKSZ as usize); + if page.len() == BLCKSZ as usize + 8 { + page.truncate(BLCKSZ as usize); } - if page.len() == pg_constants::BLCKSZ as usize { + if page.len() == BLCKSZ as usize { page.extend_from_slice(×tamp.to_be_bytes()); } else { warn!( @@ -741,7 +742,7 @@ impl PostgresRedoProcess { // We expect the WAL redo process to respond with an 8k page image. We read it // into this buffer. 
- let mut resultbuf = vec![0; pg_constants::BLCKSZ.into()]; + let mut resultbuf = vec![0; BLCKSZ.into()]; let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far // Prepare for calling poll() @@ -754,7 +755,7 @@ impl PostgresRedoProcess { // We do three things simultaneously: send the old base image and WAL records to // the child process's stdin, read the result from child's stdout, and forward any logging // information that the child writes to its stderr to the page server's log. - while nresult < pg_constants::BLCKSZ.into() { + while nresult < BLCKSZ.into() { // If we have more data to write, wake up if 'stdin' becomes writeable or // we have data to read. Otherwise only wake up if there's data to read. let nfds = if nwrite < writebuf.len() { 3 } else { 2 }; diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index a8121e829e..63bc9bd517 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -9,7 +9,7 @@ use crate::timeline::{Timeline, TimelineTools}; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; -use postgres_ffi::xlog_utils::PG_TLI; +use postgres_ffi::PG_TLI; use regex::Regex; use std::str::FromStr; use std::sync::Arc; diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 97fb3654d2..3f84e7b183 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -7,8 +7,7 @@ //! use anyhow::Result; -use bytes::{BufMut, Bytes, BytesMut}; -use crc32c::crc32c_append; +use bytes::Bytes; use serde::{Deserialize, Serialize}; use tracing::*; @@ -19,9 +18,8 @@ use crate::safekeeper::{ }; use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; use crate::timeline::TimelineTools; -use postgres_ffi::pg_constants; -use postgres_ffi::xlog_utils; -use postgres_ffi::{uint32, uint64, Oid, XLogRecord}; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils; use utils::{ lsn::Lsn, postgres_backend::PostgresBackend, @@ -144,7 +142,7 @@ fn append_logical_message( spg: &mut SafekeeperPostgresHandler, msg: &AppendLogicalMessage, ) -> Result { - let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); + let wal_data = xlog_utils::encode_logical_message(&msg.lm_prefix, &msg.lm_message); let sk_state = spg.timeline.get().get_state().1; let begin_lsn = msg.begin_lsn; @@ -182,90 +180,3 @@ fn append_logical_message( append_response, }) } - -#[repr(C)] -#[derive(Debug, Clone, Default, Serialize, Deserialize)] -struct XlLogicalMessage { - db_id: Oid, - transactional: uint32, // bool, takes 4 bytes due to alignment in C structures - prefix_size: uint64, - message_size: uint64, -} - -impl XlLogicalMessage { - pub fn encode(&self) -> Bytes { - use utils::bin_ser::LeSer; - self.ser().unwrap().into() - } -} - -/// Create new WAL record for non-transactional logical message. -/// Used for creating artificial WAL for tests, as LogicalMessage -/// record is basically no-op. 
-fn encode_logical_message(prefix: &str, message: &str) -> Vec { - let mut prefix_bytes = BytesMut::with_capacity(prefix.len() + 1); - prefix_bytes.put(prefix.as_bytes()); - prefix_bytes.put_u8(0); - - let message_bytes = message.as_bytes(); - - let logical_message = XlLogicalMessage { - db_id: 0, - transactional: 0, - prefix_size: prefix_bytes.len() as u64, - message_size: message_bytes.len() as u64, - }; - - let mainrdata = logical_message.encode(); - let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len(); - // only short mainrdata is supported for now - assert!(mainrdata_len <= 255); - let mainrdata_len = mainrdata_len as u8; - - let mut data: Vec = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len]; - data.extend_from_slice(&mainrdata); - data.extend_from_slice(&prefix_bytes); - data.extend_from_slice(message_bytes); - - let total_len = xlog_utils::XLOG_SIZE_OF_XLOG_RECORD + data.len(); - - let mut header = XLogRecord { - xl_tot_len: total_len as u32, - xl_xid: 0, - xl_prev: 0, - xl_info: 0, - xl_rmid: 21, - __bindgen_padding_0: [0u8; 2usize], - xl_crc: 0, // crc will be calculated later - }; - - let header_bytes = header.encode().expect("failed to encode header"); - let crc = crc32c_append(0, &data); - let crc = crc32c_append(crc, &header_bytes[0..xlog_utils::XLOG_RECORD_CRC_OFFS]); - header.xl_crc = crc; - - let mut wal: Vec = Vec::new(); - wal.extend_from_slice(&header.encode().expect("failed to encode header")); - wal.extend_from_slice(&data); - - // WAL start position must be aligned at 8 bytes, - // this will add padding for the next WAL record. - const PADDING: usize = 8; - let padding_rem = wal.len() % PADDING; - if padding_rem != 0 { - wal.resize(wal.len() + PADDING - padding_rem, 0); - } - - wal -} - -#[test] -fn test_encode_logical_message() { - let expected = [ - 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38, - 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102, - 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, - ]; - let actual = encode_logical_message("prefix", "message"); - assert_eq!(expected, actual[..]); -} diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index fe4f9d231c..648f0634f8 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -7,7 +7,7 @@ use metrics::{ proto::MetricFamily, Gauge, IntGaugeVec, }; -use postgres_ffi::xlog_utils::XLogSegNo; +use postgres_ffi::v14::xlog_utils::XLogSegNo; use utils::{lsn::Lsn, zid::ZTenantTimelineId}; use crate::{ diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 88747f14e5..22f8ca2de4 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -5,9 +5,7 @@ use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use etcd_broker::subscription_value::SkTimelineInfo; -use postgres_ffi::xlog_utils::TimeLineID; - -use postgres_ffi::xlog_utils::XLogSegNo; +use postgres_ffi::v14::xlog_utils::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; @@ -19,7 +17,6 @@ use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; -use postgres_ffi::xlog_utils::MAX_SEND_SIZE; use utils::{ bin_ser::LeSer, lsn::Lsn, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 4a9c56859f..243d7bf7d0 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -6,7 +6,7 @@ use 
crate::timeline::{ReplicaState, Timeline, TimelineTools}; use crate::wal_storage::WalReader; use anyhow::{bail, Context, Result}; -use postgres_ffi::xlog_utils::{get_current_timestamp, TimestampTz, MAX_SEND_SIZE}; +use postgres_ffi::v14::xlog_utils::{get_current_timestamp, TimestampTz, MAX_SEND_SIZE}; use bytes::Bytes; use serde::{Deserialize, Serialize}; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 161fca3595..3a10c5d59e 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,8 +4,9 @@ use anyhow::{bail, Context, Result}; use etcd_broker::subscription_value::SkTimelineInfo; + use once_cell::sync::Lazy; -use postgres_ffi::xlog_utils::XLogSegNo; +use postgres_ffi::v14::xlog_utils::XLogSegNo; use serde::Serialize; use tokio::sync::watch; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index b2f9d8d4f3..3552452470 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -11,7 +11,8 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Duration; -use postgres_ffi::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, PG_TLI}; +use postgres_ffi::v14::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr}; +use postgres_ffi::PG_TLI; use remote_storage::{GenericRemoteStorage, RemoteStorage}; use tokio::fs::File; use tokio::runtime::Builder; diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 5f4bf588c7..6a45ae1411 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -13,9 +13,10 @@ use std::pin::Pin; use tokio::io::AsyncRead; use once_cell::sync::Lazy; -use postgres_ffi::xlog_utils::{ - find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, XLogSegNo, PG_TLI, +use postgres_ffi::v14::xlog_utils::{ + find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, XLogSegNo, }; +use postgres_ffi::PG_TLI; use std::cmp::min; use std::fs::{self, remove_file, File, OpenOptions}; @@ -30,9 +31,10 @@ use crate::safekeeper::SafeKeeperState; use crate::wal_backup::read_object; use crate::SafeKeeperConf; -use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ}; +use postgres_ffi::v14::xlog_utils::XLogFileName; +use postgres_ffi::XLOG_BLCKSZ; -use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::v14::waldecoder::WalStreamDecoder; use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; From 1a07ddae5fa5b2096a5a356b8df6bb8d2bc30896 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 18 Aug 2022 00:25:15 +0300 Subject: [PATCH 0646/1022] fix cargo test --- pageserver/src/walingest.rs | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 1b046b9f33..05afe4ba3e 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1035,7 +1035,8 @@ mod tests { use crate::pgdatadir_mapping::create_test_timeline; use crate::repository::repo_harness::*; use crate::repository::Timeline; - use postgres_ffi::pg_constants; + use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; + use postgres_ffi::RELSEG_SIZE; /// Arbitrary relation tag, for testing. 
const TESTREL_A: RelTag = RelTag { @@ -1324,7 +1325,7 @@ mod tests { let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; - for blknum in 0..pg_constants::RELSEG_SIZE + 1 { + for blknum in 0..RELSEG_SIZE + 1 { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); @@ -1334,31 +1335,22 @@ mod tests { assert_current_logical_size(&*tline, Lsn(lsn)); - assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn))?, - pg_constants::RELSEG_SIZE + 1 - ); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE + 1); // Truncate one block lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); - walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE)?; + walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)?; m.commit()?; - assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn))?, - pg_constants::RELSEG_SIZE - ); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE); assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate another block lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); - walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE - 1)?; + walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)?; m.commit()?; - assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn))?, - pg_constants::RELSEG_SIZE - 1 - ); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE - 1); assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate to 1500, and then truncate all the way down to 0, one block at a time From 976576ae599f1fa742c467f22798bc3c39be049d Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 18 Aug 2022 13:38:23 +0300 Subject: [PATCH 0647/1022] Fix walreceiver and safekeeper bugs (#2295) - There was an issue with zero commit_lsn `reason: LaggingWal { current_commit_lsn: 0/0, new_commit_lsn: 1/6FD90D38, threshold: 10485760 } }`. The problem was in `send_wal.rs`, where we initialized `end_pos = Lsn(0)` and in some cases sent it to the pageserver. - IDENTIFY_SYSTEM previously returned `flush_lsn` as a physical end of WAL. Now it returns `flush_lsn` (as it was) to walproposer and `commit_lsn` to everyone else including pageserver. - There was an issue with backoff where connection was cancelled right after initialization: `connected!` -> `safekeeper_handle_db: Connection cancelled` -> `Backoff: waiting 3 seconds`. The problem was in sleeping before establishing the connection. This is fixed by reworking retry logic. - There was an issue with getting `NoKeepAlives` reason in a loop. The issue is probably the same as the previous. - There was an issue with filtering safekeepers based on retry attempts, which could filter some safekeepers indefinetely. This is fixed by using retry cooldown duration instead of retry attempts. - Some `send_wal.rs` connections failed with errors without context. This is fixed by adding a timeline to safekeepers errors. New retry logic works like this: - Every candidate has a `next_retry_at` timestamp and is not considered for connection until that moment - When walreceiver connection is closed, we update `next_retry_at` using exponential backoff, increasing the cooldown on every disconnect. - When `last_record_lsn` was advanced using the WAL from the safekeeper, we reset the retry cooldown and exponential backoff, allowing walreceiver to reconnect to the same safekeeper instantly. 
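
As a minimal, illustrative sketch of this retry bookkeeping (hypothetical names, std::time instead of the chrono timestamps used in connection_manager.rs below; the constants mirror the WALCONNECTION_RETRY_* values introduced there):

    use std::time::{Duration, Instant};

    // Mirror the WALCONNECTION_RETRY_* constants introduced in the diff below.
    const MIN_BACKOFF_SECONDS: f64 = 0.1;
    const MAX_BACKOFF_SECONDS: f64 = 15.0;
    const BACKOFF_MULTIPLIER: f64 = 1.5;

    /// Per-safekeeper retry state: when we may reconnect and how long the next cooldown is.
    struct RetryInfo {
        next_retry_at: Option<Instant>,
        retry_duration_seconds: f64,
    }

    impl RetryInfo {
        fn new() -> Self {
            RetryInfo {
                next_retry_at: None,
                retry_duration_seconds: MIN_BACKOFF_SECONDS,
            }
        }

        /// On disconnect: schedule the next attempt relative to when the connection
        /// *started*, so a long-lived connection may be retried immediately, and grow
        /// the cooldown exponentially, clamped to [MIN, MAX].
        fn on_disconnect(&mut self, connection_started_at: Instant) {
            self.next_retry_at =
                Some(connection_started_at + Duration::from_secs_f64(self.retry_duration_seconds));
            self.retry_duration_seconds = (self.retry_duration_seconds * BACKOFF_MULTIPLIER)
                .clamp(MIN_BACKOFF_SECONDS, MAX_BACKOFF_SECONDS);
        }

        /// Once WAL from this safekeeper has advanced last_record_lsn, drop the cooldown
        /// so walreceiver can reconnect to it right away.
        fn on_progress(&mut self) {
            *self = RetryInfo::new();
        }

        /// A candidate is eligible for connection only after its cooldown has expired.
        fn is_eligible(&self, now: Instant) -> bool {
            self.next_retry_at.map_or(true, |at| at <= now)
        }
    }

Anchoring the cooldown to the connection start time (rather than the disconnect time) is what lets a safekeeper that streamed WAL for a while be retried without delay, while a flapping one backs off toward the 15 second cap.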
--- .../src/walreceiver/connection_manager.rs | 200 ++++++++++++------ .../src/walreceiver/walreceiver_connection.rs | 33 ++- safekeeper/src/handler.rs | 49 +++-- safekeeper/src/send_wal.rs | 12 +- 4 files changed, 189 insertions(+), 105 deletions(-) diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 0f11a2197a..e8e0a7c52b 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -96,6 +96,8 @@ async fn connection_manager_loop_step( info!("Subscribed for etcd timeline changes, waiting for new etcd data"); loop { + let time_until_next_retry = walreceiver_state.time_until_next_retry(); + select! { broker_connection_result = &mut broker_subscription.watcher_handle => { cleanup_broker_connection(broker_connection_result, walreceiver_state); @@ -110,27 +112,23 @@ async fn connection_manager_loop_step( } => { let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! guard"); match wal_connection_update { - TaskEvent::Started => { - *walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0) += 1; - }, + TaskEvent::Started => {}, TaskEvent::NewEvent(status) => { - if status.has_received_wal { - // Reset connection attempts here only, we know that safekeeper is healthy - // because it can send us a WAL update. - walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id); + if status.has_processed_wal { + // We have advanced last_record_lsn by processing the WAL received + // from this safekeeper. This is good enough to clean unsuccessful + // retries history and allow reconnecting to this safekeeper without + // sleeping for a long time. + walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); } wal_connection.status = status; }, TaskEvent::End(end_result) => { match end_result { Ok(()) => debug!("WAL receiving task finished"), - Err(e) => { - warn!("WAL receiving task failed: {e}"); - // If the task failed, set the connection attempts to at least 1, to try other safekeepers. - let _ = *walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(1); - } + Err(e) => warn!("WAL receiving task failed: {e}"), }; - walreceiver_state.wal_connection = None; + walreceiver_state.drop_old_connection(false).await; }, } }, @@ -154,6 +152,8 @@ async fn connection_manager_loop_step( } } }, + + _ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {} } // Fetch more etcd timeline updates, but limit ourselves since they may arrive quickly. @@ -234,6 +234,10 @@ async fn subscribe_for_timeline_updates( } } +const WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS: f64 = 0.1; +const WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS: f64 = 15.0; +const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; + /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. struct WalreceiverState { id: ZTenantTimelineId, @@ -247,7 +251,8 @@ struct WalreceiverState { max_lsn_wal_lag: NonZeroU64, /// Current connection to safekeeper for WAL streaming. wal_connection: Option, - wal_connection_attempts: HashMap, + /// Info about retries and unsuccessful attempts to connect to safekeepers. + wal_connection_retries: HashMap, /// Data about all timelines, available for connection, fetched from etcd, grouped by their corresponding safekeeper node id. 
wal_stream_candidates: HashMap, } @@ -255,6 +260,8 @@ struct WalreceiverState { /// Current connection data. #[derive(Debug)] struct WalConnection { + /// Time when the connection was initiated. + started_at: NaiveDateTime, /// Current safekeeper pageserver is connected to for WAL streaming. sk_id: NodeId, /// Status of the connection. @@ -274,6 +281,12 @@ struct NewCommittedWAL { discovered_at: NaiveDateTime, } +#[derive(Debug)] +struct RetryInfo { + next_retry_at: Option, + retry_duration_seconds: f64, +} + /// Data about the timeline to connect to, received from etcd. #[derive(Debug)] struct EtcdSkTimeline { @@ -300,31 +313,18 @@ impl WalreceiverState { max_lsn_wal_lag, wal_connection: None, wal_stream_candidates: HashMap::new(), - wal_connection_attempts: HashMap::new(), + wal_connection_retries: HashMap::new(), } } /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. async fn change_connection(&mut self, new_sk_id: NodeId, new_wal_source_connstr: String) { - if let Some(old_connection) = self.wal_connection.take() { - old_connection.connection_task.shutdown().await - } + self.drop_old_connection(true).await; let id = self.id; let connect_timeout = self.wal_connect_timeout; - let connection_attempt = self - .wal_connection_attempts - .get(&new_sk_id) - .copied() - .unwrap_or(0); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { - exponential_backoff( - connection_attempt, - DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, - ) - .await; super::walreceiver_connection::handle_walreceiver_connection( id, &new_wal_source_connstr, @@ -340,10 +340,11 @@ impl WalreceiverState { let now = Utc::now().naive_utc(); self.wal_connection = Some(WalConnection { + started_at: now, sk_id: new_sk_id, status: WalConnectionStatus { is_connected: false, - has_received_wal: false, + has_processed_wal: false, latest_connection_update: now, latest_wal_update: now, streaming_lsn: None, @@ -354,6 +355,71 @@ impl WalreceiverState { }); } + /// Drops the current connection (if any) and updates retry timeout for the next + /// connection attempt to the same safekeeper. + async fn drop_old_connection(&mut self, needs_shutdown: bool) { + let wal_connection = match self.wal_connection.take() { + Some(wal_connection) => wal_connection, + None => return, + }; + + if needs_shutdown { + wal_connection.connection_task.shutdown().await; + } + + let retry = self + .wal_connection_retries + .entry(wal_connection.sk_id) + .or_insert(RetryInfo { + next_retry_at: None, + retry_duration_seconds: WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS, + }); + + let now = Utc::now().naive_utc(); + + // Schedule the next retry attempt. We want to have exponential backoff for connection attempts, + // and we add backoff to the time when we started the connection attempt. If the connection + // was active for a long time, then next_retry_at will be in the past. + retry.next_retry_at = + wal_connection + .started_at + .checked_add_signed(chrono::Duration::milliseconds( + (retry.retry_duration_seconds * 1000.0) as i64, + )); + + if let Some(next) = &retry.next_retry_at { + if next > &now { + info!( + "Next connection retry to {:?} is at {}", + wal_connection.sk_id, next + ); + } + } + + let next_retry_duration = + retry.retry_duration_seconds * WALCONNECTION_RETRY_BACKOFF_MULTIPLIER; + // Clamp the next retry duration to the maximum allowed. 
+ let next_retry_duration = next_retry_duration.min(WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS); + // Clamp the next retry duration to the minimum allowed. + let next_retry_duration = next_retry_duration.max(WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS); + + retry.retry_duration_seconds = next_retry_duration; + } + + /// Returns time needed to wait to have a new candidate for WAL streaming. + fn time_until_next_retry(&self) -> Option { + let now = Utc::now().naive_utc(); + + let next_retry_at = self + .wal_connection_retries + .values() + .filter_map(|retry| retry.next_retry_at) + .filter(|next_retry_at| next_retry_at > &now) + .min(); + + next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok()) + } + /// Adds another etcd timeline into the state, if its more recent than the one already added there for the same key. fn register_timeline_update(&mut self, timeline_update: BrokerUpdate) { match self @@ -547,52 +613,37 @@ impl WalreceiverState { /// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another. /// /// The candidate that is chosen: - /// * has fewest connection attempts from pageserver to safekeeper node (reset every time we receive a WAL message from the node) - /// * has greatest data Lsn among the ones that are left - /// - /// NOTE: - /// We evict timeline data received from etcd based on time passed since it was registered, along with its connection attempts values, but - /// otherwise to reset the connection attempts, a successful connection to that node is needed. - /// That won't happen now, before all nodes with less connection attempts are connected to first, which might leave the sk node with more advanced state to be ignored. + /// * has no pending retry cooldown + /// * has greatest commit_lsn among the ones that are left fn select_connection_candidate( &self, node_to_omit: Option, ) -> Option<(NodeId, &SkTimelineInfo, String)> { - let all_candidates = self - .applicable_connection_candidates() + self.applicable_connection_candidates() .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) - .collect::>(); - - let smallest_attempts_allowed = all_candidates - .iter() - .map(|(sk_id, _, _)| { - self.wal_connection_attempts - .get(sk_id) - .copied() - .unwrap_or(0) - }) - .min()?; - - all_candidates - .into_iter() - .filter(|(sk_id, _, _)| { - smallest_attempts_allowed - >= self - .wal_connection_attempts - .get(sk_id) - .copied() - .unwrap_or(0) - }) .max_by_key(|(_, info, _)| info.commit_lsn) } /// Returns a list of safekeepers that have valid info and ready for connection. + /// Some safekeepers are filtered by the retry cooldown. 
fn applicable_connection_candidates( &self, ) -> impl Iterator { + let now = Utc::now().naive_utc(); + self.wal_stream_candidates .iter() .filter(|(_, info)| info.timeline.commit_lsn.is_some()) + .filter(move |(sk_id, _)| { + let next_retry_at = self + .wal_connection_retries + .get(sk_id) + .and_then(|retry_info| { + retry_info.next_retry_at + }); + + next_retry_at.is_none() || next_retry_at.unwrap() <= now + }) .filter_map(|(sk_id, etcd_info)| { let info = &etcd_info.timeline; match wal_stream_connection_string( @@ -627,7 +678,7 @@ impl WalreceiverState { }); for node_id in node_ids_to_remove { - self.wal_connection_attempts.remove(&node_id); + self.wal_connection_retries.remove(&node_id); } } } @@ -684,7 +735,6 @@ fn wal_stream_connection_string( #[cfg(test)] mod tests { - use crate::repository::{ repo_harness::{RepoHarness, TIMELINE_ID}, Repository, @@ -789,7 +839,7 @@ mod tests { let connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: true, + has_processed_wal: true, latest_connection_update: now, latest_wal_update: now, commit_lsn: Some(Lsn(current_lsn)), @@ -798,6 +848,7 @@ mod tests { state.max_lsn_wal_lag = NonZeroU64::new(100).unwrap(); state.wal_connection = Some(WalConnection { + started_at: now, sk_id: connected_sk_id, status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { @@ -1017,7 +1068,13 @@ mod tests { }, ), ]); - state.wal_connection_attempts = HashMap::from([(NodeId(0), 1), (NodeId(1), 0)]); + state.wal_connection_retries = HashMap::from([( + NodeId(0), + RetryInfo { + next_retry_at: now.checked_add_signed(chrono::Duration::hours(1)), + retry_duration_seconds: WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS, + }, + )]); let candidate_with_less_errors = state .next_connection_candidate() @@ -1025,7 +1082,7 @@ mod tests { assert_eq!( candidate_with_less_errors.safekeeper_id, NodeId(1), - "Should select the node with less connection errors" + "Should select the node with no pending retry cooldown" ); Ok(()) @@ -1043,7 +1100,7 @@ mod tests { let connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: true, + has_processed_wal: true, latest_connection_update: now, latest_wal_update: now, commit_lsn: Some(current_lsn), @@ -1051,6 +1108,7 @@ mod tests { }; state.wal_connection = Some(WalConnection { + started_at: now, sk_id: connected_sk_id, status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { @@ -1130,7 +1188,7 @@ mod tests { let connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: true, + has_processed_wal: true, latest_connection_update: time_over_threshold, latest_wal_update: time_over_threshold, commit_lsn: Some(current_lsn), @@ -1138,6 +1196,7 @@ mod tests { }; state.wal_connection = Some(WalConnection { + started_at: now, sk_id: NodeId(1), status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { @@ -1202,7 +1261,7 @@ mod tests { let connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: true, + has_processed_wal: true, latest_connection_update: now, latest_wal_update: time_over_threshold, commit_lsn: Some(current_lsn), @@ -1210,6 +1269,7 @@ mod tests { }; state.wal_connection = Some(WalConnection { + started_at: now, sk_id: NodeId(1), status: connection_status, connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), @@ -1281,7 +1341,7 @@ mod tests { max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), wal_connection: 
None, wal_stream_candidates: HashMap::new(), - wal_connection_attempts: HashMap::new(), + wal_connection_retries: HashMap::new(), } } } diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 0688086117..025bfeb506 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -35,8 +35,9 @@ use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; pub struct WalConnectionStatus { /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running. pub is_connected: bool, - /// Defines a healthy connection as one on which we have received at least some WAL bytes. - pub has_received_wal: bool, + /// Defines a healthy connection as one on which pageserver received WAL from safekeeper + /// and is able to process it in walingest without errors. + pub has_processed_wal: bool, /// Connection establishment time or the timestamp of a latest connection message received. pub latest_connection_update: NaiveDateTime, /// Time of the latest WAL message received. @@ -71,7 +72,7 @@ pub async fn handle_walreceiver_connection( info!("connected!"); let mut connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: false, + has_processed_wal: false, latest_connection_update: Utc::now().naive_utc(), latest_wal_update: Utc::now().naive_utc(), streaming_lsn: None, @@ -117,13 +118,6 @@ pub async fn handle_walreceiver_connection( let identify = identify_system(&mut replication_client).await?; info!("{identify:?}"); - connection_status.latest_connection_update = Utc::now().naive_utc(); - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { - warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); - return Ok(()); - } - - // NB: this is a flush_lsn, not a commit_lsn. let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); let mut caught_up = false; let ZTenantTimelineId { @@ -131,6 +125,14 @@ pub async fn handle_walreceiver_connection( timeline_id, } = id; + connection_status.latest_connection_update = Utc::now().naive_utc(); + connection_status.latest_wal_update = Utc::now().naive_utc(); + connection_status.commit_lsn = Some(end_of_wal); + if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); + return Ok(()); + } + let (repo, timeline) = tokio::task::spawn_blocking(move || { let repo = tenant_mgr::get_repository_for_tenant(tenant_id) .with_context(|| format!("no repository found for tenant {tenant_id}"))?; @@ -181,6 +183,7 @@ pub async fn handle_walreceiver_connection( } { let replication_message = replication_message?; let now = Utc::now().naive_utc(); + let last_rec_lsn_before_msg = last_rec_lsn; // Update the connection status before processing the message. If the message processing // fails (e.g. in walingest), we still want to know latests LSNs from the safekeeper. 
@@ -193,7 +196,6 @@ pub async fn handle_walreceiver_connection( )); if !xlog_data.data().is_empty() { connection_status.latest_wal_update = now; - connection_status.has_received_wal = true; } } ReplicationMessage::PrimaryKeepAlive(keepalive) => { @@ -265,6 +267,15 @@ pub async fn handle_walreceiver_connection( _ => None, }; + if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg { + // We have successfully processed at least one WAL record. + connection_status.has_processed_wal = true; + if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + warn!("Wal connection event listener dropped, aborting the connection: {e}"); + return Ok(()); + } + } + let timeline_to_check = Arc::clone(&timeline); tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) .await diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 63bc9bd517..c90c2a0446 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -90,7 +90,10 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> { let cmd = parse_cmd(query_string)?; - info!("got query {:?}", query_string); + info!( + "got query {:?} in timeline {:?}", + query_string, self.ztimelineid + ); let create = !(matches!(cmd, SafekeeperPostgresCommand::StartReplication { .. }) || matches!(cmd, SafekeeperPostgresCommand::IdentifySystem)); @@ -106,23 +109,17 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { } match cmd { - SafekeeperPostgresCommand::StartWalPush => { - ReceiveWalConn::new(pgb) - .run(self) - .context("failed to run ReceiveWalConn")?; - } - SafekeeperPostgresCommand::StartReplication { start_lsn } => { - ReplicationConn::new(pgb) - .run(self, pgb, start_lsn) - .context("failed to run ReplicationConn")?; - } - SafekeeperPostgresCommand::IdentifySystem => { - self.handle_identify_system(pgb)?; - } - SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { - handle_json_ctrl(self, pgb, cmd)?; - } + SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb) + .run(self) + .context("failed to run ReceiveWalConn"), + SafekeeperPostgresCommand::StartReplication { start_lsn } => ReplicationConn::new(pgb) + .run(self, pgb, start_lsn) + .context("failed to run ReplicationConn"), + SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), + SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), } + .context(format!("timeline {timelineid}"))?; + Ok(()) } } @@ -153,8 +150,15 @@ impl SafekeeperPostgresHandler { /// Handle IDENTIFY_SYSTEM replication command /// fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> { - let start_pos = self.timeline.get().get_end_of_wal(); - let lsn = start_pos.to_string(); + let lsn = if self.is_walproposer_recovery() { + // walproposer should get all local WAL until flush_lsn + self.timeline.get().get_end_of_wal() + } else { + // other clients shouldn't get any uncommitted WAL + self.timeline.get().get_state().0.commit_lsn + } + .to_string(); + let sysid = self .timeline .get() @@ -203,4 +207,11 @@ impl SafekeeperPostgresHandler { .write_message(&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"))?; Ok(()) } + + /// Returns true if current connection is a replication connection, originating + /// from a walproposer recovery function. 
This connection gets a special handling: + /// safekeeper must stream all local WAL till the flush_lsn, whether committed or not. + pub fn is_walproposer_recovery(&self) -> bool { + self.appname == Some("wal_proposer_recovery".to_string()) + } } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 243d7bf7d0..97ec945c3e 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -170,6 +170,7 @@ impl ReplicationConn { // spawn the background thread which receives HotStandbyFeedback messages. let bg_timeline = Arc::clone(spg.timeline.get()); let bg_stream_in = self.stream_in.take().unwrap(); + let bg_timeline_id = spg.ztimelineid.unwrap(); let state = ReplicaState::new(); // This replica_id is used below to check if it's time to stop replication. @@ -188,6 +189,8 @@ impl ReplicationConn { let _ = thread::Builder::new() .name("HotStandbyFeedback thread".into()) .spawn(move || { + let _enter = + info_span!("HotStandbyFeedback thread", timeline = %bg_timeline_id).entered(); if let Err(err) = Self::background_thread(bg_stream_in, bg_replica_guard) { error!("Replication background thread failed: {}", err); } @@ -198,13 +201,12 @@ impl ReplicationConn { .build()?; runtime.block_on(async move { - let (_, persisted_state) = spg.timeline.get().get_state(); + let (inmem_state, persisted_state) = spg.timeline.get().get_state(); // add persisted_state.timeline_start_lsn == Lsn(0) check if persisted_state.server.wal_seg_size == 0 { bail!("Cannot start replication before connecting to walproposer"); } - let wal_end = spg.timeline.get().get_end_of_wal(); // Walproposer gets special handling: safekeeper must give proposer all // local WAL till the end, whether committed or not (walproposer will // hang otherwise). That's because walproposer runs the consensus and @@ -214,8 +216,8 @@ impl ReplicationConn { // another compute rises which collects majority and starts fixing log // on this safekeeper itself. That's ok as (old) proposer will never be // able to commit such WAL. 
- let stop_pos: Option = if spg.appname == Some("wal_proposer_recovery".to_string()) - { + let stop_pos: Option = if spg.is_walproposer_recovery() { + let wal_end = spg.timeline.get().get_end_of_wal(); Some(wal_end) } else { None @@ -226,7 +228,7 @@ impl ReplicationConn { // switch to copy pgb.write_message(&BeMessage::CopyBothResponse)?; - let mut end_pos = Lsn(0); + let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn); let mut wal_reader = WalReader::new( spg.conf.timeline_dir(&spg.timeline.get().zttid), From 77a2bdf3d7714765b1673bec59bdace149332c47 Mon Sep 17 00:00:00 2001 From: Anton Galitsyn Date: Thu, 18 Aug 2022 19:05:40 +0700 Subject: [PATCH 0648/1022] on safekeeper registration pass availability zone param (#2292) --- .github/ansible/scripts/init_safekeeper.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/ansible/scripts/init_safekeeper.sh b/.github/ansible/scripts/init_safekeeper.sh index a9b5025562..879891e7a3 100644 --- a/.github/ansible/scripts/init_safekeeper.sh +++ b/.github/ansible/scripts/init_safekeeper.sh @@ -1,7 +1,8 @@ #!/bin/sh -# get instance id from meta-data service +# fetch params from meta-data service INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) +AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone) # store fqdn hostname in var HOST=$(hostname -f) @@ -14,7 +15,8 @@ cat < Date: Thu, 18 Aug 2022 15:18:59 +0200 Subject: [PATCH 0649/1022] Re-enable test dependency for deploy (#2300) Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6b76b6e5fc..c9b696e409 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -567,7 +567,7 @@ jobs: #container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly - needs: [ push-docker-hub, calculate-deploy-targets, tag ] + needs: [ push-docker-hub, calculate-deploy-targets, tag, other-tests, pg_regress-tests ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' @@ -622,7 +622,7 @@ jobs: runs-on: dev container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. - needs: [ push-docker-hub, calculate-deploy-targets, tag ] + needs: [ push-docker-hub, calculate-deploy-targets, tag, other-tests, pg_regress-tests ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' From f99ccb5041eebcdb89efc24f5e0cc501fb1a9039 Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 18 Aug 2022 17:12:28 +0200 Subject: [PATCH 0650/1022] Extract WalProposer into the neon extension (#2217) Including, but not limited to: * Fixes to neon management code to support walproposer-as-an-extension * Fix issue in expected output of pg settings serialization. 
* Show the logs of a failed --sync-safekeepers process in CI * Add compat layer for renamed GUCs in postgres.conf * Update vendor/postgres to the latest origin/main --- Cargo.lock | 202 +++++++++--------- compute_tools/src/compute.rs | 6 +- compute_tools/src/pg_helpers.rs | 11 +- compute_tools/tests/cluster_spec.json | 3 +- compute_tools/tests/pg_helpers_tests.rs | 2 +- control_plane/src/compute.rs | 4 +- test_runner/batch_others/test_wal_acceptor.py | 2 +- test_runner/fixtures/neon_fixtures.py | 4 +- vendor/postgres | 2 +- 9 files changed, 125 insertions(+), 111 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a70b2b7dc9..505cbb66c3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -48,9 +48,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.58" +version = "1.0.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb07d2053ccdbe10e2af2995a2f116c1330396493dc1269f6a91d0ae82e19704" +checksum = "c91f1f46651137be86f3a2b9a8359f9ab421d04d941c62b5982e1ca21113adf9" dependencies = [ "backtrace", ] @@ -77,7 +77,7 @@ dependencies = [ "num-traits", "rusticata-macros", "thiserror", - "time 0.3.11", + "time 0.3.12", ] [[package]] @@ -126,9 +126,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.56" +version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96cf8829f67d2eab0b2dfa42c5d0ef737e0724e4a82b01b3e292456202b19716" +checksum = "76464446b8bc32758d7e88ee1a804d9914cd9b1cb264c029899680b0be29826f" dependencies = [ "proc-macro2", "quote", @@ -166,7 +166,7 @@ dependencies = [ "http", "http-body", "hyper", - "itoa 1.0.2", + "itoa 1.0.3", "matchit", "memchr", "mime", @@ -298,9 +298,9 @@ checksum = "37ccbd214614c6783386c1af30caf03192f17891059cecc394b4fb119e363de3" [[package]] name = "bytemuck" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c53dfa917ec274df8ed3c572698f381a24eef2efba9492d797301b72b6db408a" +checksum = "a5377c8865e74a160d21f29c2d40669f53286db6eab59b88540cbb12ffc8b835" [[package]] name = "byteorder" @@ -310,9 +310,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.1.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" +checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db" dependencies = [ "serde", ] @@ -386,9 +386,9 @@ dependencies = [ [[package]] name = "clap" -version = "3.2.12" +version = "3.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab8b79fe3946ceb4a0b1c080b4018992b8d27e9ff363644c1c9b6387c854614d" +checksum = "a3dbbb6653e7c55cc8595ad3e1f7be8f32aba4eb7ff7f0fd1163d4f3d137c0a9" dependencies = [ "atty", "bitflags", @@ -455,7 +455,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 3.2.12", + "clap 3.2.16", "env_logger", "hyper", "log", @@ -601,9 +601,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.5" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c02a4d71819009c192cf4872265391563fd6a84c81ff2c0f2a7026ca4c1d85c" +checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" dependencies = [ "cfg-if", "crossbeam-utils", @@ -611,9 +611,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.1" +version = "0.8.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" dependencies = [ "cfg-if", "crossbeam-epoch", @@ -622,9 +622,9 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07db9d94cbd326813772c968ccd25999e5f8ae22f4f8d1b11effa37ef6ce281d" +checksum = "045ebe27666471bb549370b4b0b3e51b07f56325befa4284db65fc89c02511b1" dependencies = [ "autocfg", "cfg-if", @@ -636,9 +636,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d82ee10ce34d7bc12c2122495e7593a9c41347ecdd64185af4ecf72cb1a7f83" +checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc" dependencies = [ "cfg-if", "once_cell", @@ -917,9 +917,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3fcf0cee53519c866c09b5de1f6c56ff9d647101f81c1964fa632e148896cdf" +checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" dependencies = [ "instant", ] @@ -1086,9 +1086,9 @@ dependencies = [ [[package]] name = "generic-array" -version = "0.14.5" +version = "0.14.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd48d33ec7f05fbfa152300fdad764757cbded343c1aa1cff2fbaf4134851803" +checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" dependencies = [ "typenum", "version_check", @@ -1245,7 +1245,7 @@ checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" dependencies = [ "bytes", "fnv", - "itoa 1.0.2", + "itoa 1.0.3", ] [[package]] @@ -1308,7 +1308,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 1.0.2", + "itoa 1.0.3", "pin-project-lite", "socket2", "tokio", @@ -1391,7 +1391,7 @@ dependencies = [ "ahash", "atty", "indexmap", - "itoa 1.0.2", + "itoa 1.0.3", "lazy_static", "log", "num-format", @@ -1432,15 +1432,15 @@ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" [[package]] name = "itoa" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d" +checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754" [[package]] name = "js-sys" -version = "0.3.58" +version = "0.3.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3fac17f7123a73ca62df411b1bf727ccc805daa070338fda671c86dac1bdc27" +checksum = "258451ab10b34f8af53416d1fdab72c22e805f0c92a1136d59470ec0b11138b2" dependencies = [ "wasm-bindgen", ] @@ -1482,9 +1482,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.126" +version = "0.2.127" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" +checksum = "505e71a4706fa491e9b1b55f51b95d4037d0821ee40131190475f692b35b009b" [[package]] name = "libloading" @@ -1659,7 +1659,7 @@ name = "neon_local" version = "0.1.0" dependencies = [ "anyhow", - "clap 3.2.12", + "clap 3.2.16", "comfy-table", 
"control_plane", "git-version", @@ -1854,7 +1854,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 3.2.12", + "clap 3.2.16", "close_fds", "const_format", "crc32c", @@ -2155,9 +2155,9 @@ checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" [[package]] name = "prettyplease" -version = "0.1.16" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da6ffbe862780245013cb1c0a48c4e44b7d665548088f91f6b90876d0625e4c2" +checksum = "697ae720ee02011f439e0701db107ffe2916d83f718342d65d7f8bf7b8a5fee9" dependencies = [ "proc-macro2", "syn", @@ -2171,9 +2171,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.40" +version = "1.0.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd96a1e8ed2596c337f8eae5f24924ec83f5ad5ab21ea8e455d3566c69fbcaf7" +checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab" dependencies = [ "unicode-ident", ] @@ -2271,7 +2271,7 @@ dependencies = [ "base64", "bstr", "bytes", - "clap 3.2.12", + "clap 3.2.16", "futures", "git-version", "hashbrown 0.11.2", @@ -2326,9 +2326,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.20" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bcdf212e9776fbcb2d23ab029360416bb1706b1aea2d1a5ba002727cbcab804" +checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" dependencies = [ "proc-macro2", ] @@ -2411,9 +2411,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.13" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62f25bc4c7e55e0b0b7a1d43fb893f4fa1361d0abe38b9ce4f323c2adfe6ef42" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ "bitflags", ] @@ -2508,7 +2508,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rustls", - "rustls-pemfile 1.0.0", + "rustls-pemfile 1.0.1", "serde", "serde_json", "serde_urlencoded", @@ -2708,9 +2708,9 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7522c9de787ff061458fe9a829dc790a3f5b22dc571694fc5883f448b94d9a9" +checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55" dependencies = [ "base64", ] @@ -2726,15 +2726,15 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.8" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24c8ad4f0c00e1eb5bc7614d236a7f1300e3dbd76b68cac8e06fb00b015ad8d8" +checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8" [[package]] name = "ryu" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695" +checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" [[package]] name = "safekeeper" @@ -2744,7 +2744,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "clap 3.2.12", + "clap 3.2.16", "const_format", "crc32c", "daemonize", @@ -2835,15 +2835,15 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.12" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2333e6df6d6598f2b1974829f853c2b4c5f4a6e503c10af918081aa6f8564e1" +checksum = 
"93f6841e709003d68bb2deee8c343572bf446003ec20a583e76f7b15cebf3711" [[package]] name = "serde" -version = "1.0.139" +version = "1.0.142" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0171ebb889e45aa68b44aee0859b3eede84c6f5f5c228e6f140c0b2a0a46cad6" +checksum = "e590c437916fb6b221e1d00df6e3294f3fccd70ca7e92541c475d6ed6ef5fee2" dependencies = [ "serde_derive", ] @@ -2860,9 +2860,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.139" +version = "1.0.142" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc1d3230c1de7932af58ad8ffbe1d784bd55efd5a9d84ac24f69c72d83543dfb" +checksum = "34b5b8d809babe02f538c2cfec6f2c1ed10804c0e5a6a041a049a4f5588ccc2e" dependencies = [ "proc-macro2", "quote", @@ -2871,11 +2871,11 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.82" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82c2c1fdcd807d1098552c5b9a36e425e42e9fbd7c6a37a8425f390f781f7fa7" +checksum = "38dd04e3c8279e75b31ef29dbdceebfe5ad89f4d0937213c53f7d49d01b3d5a7" dependencies = [ - "itoa 1.0.2", + "itoa 1.0.3", "ryu", "serde", ] @@ -2887,7 +2887,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" dependencies = [ "form_urlencoded", - "itoa 1.0.2", + "itoa 1.0.3", "ryu", "serde", ] @@ -2992,7 +2992,7 @@ dependencies = [ "num-bigint", "num-traits", "thiserror", - "time 0.3.11", + "time 0.3.12", ] [[package]] @@ -3003,9 +3003,12 @@ checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" [[package]] name = "slab" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb703cfe953bccee95685111adeedb76fabe4e97549a58d16f03ea7b9367bb32" +checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" +dependencies = [ + "autocfg", +] [[package]] name = "smallvec" @@ -3113,9 +3116,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.98" +version = "1.0.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd" +checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13" dependencies = [ "proc-macro2", "quote", @@ -3191,18 +3194,18 @@ checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" [[package]] name = "thiserror" -version = "1.0.31" +version = "1.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a" +checksum = "f5f6586b7f764adc0231f4c79be7b920e766bb2f3e51b3661cdb263828f19994" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.31" +version = "1.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a" +checksum = "12bafc5b54507e0149cdf1b145a5d80ab80a90bcd9275df43d4fff68460f6c21" dependencies = [ "proc-macro2", "quote", @@ -3231,11 +3234,12 @@ dependencies = [ [[package]] name = "time" -version = "0.3.11" +version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72c91f41dcb2f096c05f0873d667dceec1087ce5bcf984ec8ffb19acddbb3217" +checksum = "74b7cc93fc23ba97fde84f7eea56c55d1ba183f495c6715defdfc7b9cb8c870f" dependencies = [ - "itoa 1.0.2", + "itoa 1.0.3", + "js-sys", "libc", 
"num_threads", "quickcheck", @@ -3275,9 +3279,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.20.0" +version = "1.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57aec3cfa4c296db7255446efb4928a6be304b431a806216105542a67b6ca82e" +checksum = "7a8325f63a7d4774dd041e363b2409ed1c5cbbd0f867795e661df066b2b0a581" dependencies = [ "autocfg", "bytes", @@ -3607,9 +3611,9 @@ checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" [[package]] name = "unicode-ident" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7" +checksum = "c4f5b37a154999a8f3f98cc23a628d850e154479cd94decf3414696e12e31aaf" [[package]] name = "unicode-normalization" @@ -3728,7 +3732,7 @@ name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 3.2.12", + "clap 3.2.16", "env_logger", "log", "once_cell", @@ -3772,9 +3776,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.81" +version = "0.2.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c53b543413a17a202f4be280a7e5c62a1c69345f5de525ee64f8cfdbc954994" +checksum = "fc7652e3f6c4706c8d9cd54832c4a4ccb9b5336e2c3bd154d5cccfbf1c1f5f7d" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -3782,13 +3786,13 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.81" +version = "0.2.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5491a68ab4500fa6b4d726bd67408630c3dbe9c4fe7bda16d5c82a1fd8c7340a" +checksum = "662cd44805586bd52971b9586b1df85cdbbd9112e4ef4d8f41559c334dc6ac3f" dependencies = [ "bumpalo", - "lazy_static", "log", + "once_cell", "proc-macro2", "quote", "syn", @@ -3797,9 +3801,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.31" +version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de9a9cec1733468a8c657e57fa2413d2ae2c0129b95e87c5b72b8ace4d13f31f" +checksum = "fa76fb221a1f8acddf5b54ace85912606980ad661ac7a503b4570ffd3a624dad" dependencies = [ "cfg-if", "js-sys", @@ -3809,9 +3813,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.81" +version = "0.2.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c441e177922bc58f1e12c022624b6216378e5febc2f0533e41ba443d505b80aa" +checksum = "b260f13d3012071dfb1512849c033b1925038373aea48ced3012c09df952c602" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3819,9 +3823,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.81" +version = "0.2.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d94ac45fcf608c1f45ef53e748d35660f168490c10b23704c7779ab8f5c3048" +checksum = "5be8e654bdd9b79216c2929ab90721aa82faf65c48cdf08bdc4e7f51357b80da" dependencies = [ "proc-macro2", "quote", @@ -3832,15 +3836,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.81" +version = "0.2.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a89911bd99e5f3659ec4acf9c4d93b0a90fe4a2a11f15328472058edc5261be" +checksum = "6598dd0bd3c7d51095ff6531a5b23e02acdc81804e30d8f07afb77b7215a140a" [[package]] name = "web-sys" -version = "0.3.58" +version = "0.3.59" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fed94beee57daf8dd7d51f2b15dc2bcde92d7a72304cdf662a4371008b71b90" +checksum = "ed055ab27f941423197eb86b2035720b1a3ce40504df082cac2ecc6ed73335a1" dependencies = [ "js-sys", "wasm-bindgen", @@ -3993,7 +3997,7 @@ dependencies = [ "scopeguard", "serde", "syn", - "time 0.3.11", + "time 0.3.12", "tokio", "tokio-util", "tracing", @@ -4015,7 +4019,7 @@ dependencies = [ "oid-registry", "rusticata-macros", "thiserror", - "time 0.3.11", + "time 0.3.12", ] [[package]] @@ -4044,6 +4048,6 @@ dependencies = [ [[package]] name = "zeroize" -version = "1.5.6" +version = "1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20b578acffd8516a6c3f2a1bdefc1ec37e547bb4e0fb8b6b01a4cafc886b4442" +checksum = "c394b5bd0c6f669e7275d9c20aa90ae064cb22e75a1cad54e1b34088034b149f" diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1e812f2aa0..58469b1c97 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -178,6 +178,7 @@ impl ComputeNode { .args(&["--sync-safekeepers"]) .env("PGDATA", &self.pgdata) // we cannot use -D in this mode .stdout(Stdio::piped()) + .stderr(Stdio::piped()) .spawn() .expect("postgres --sync-safekeepers failed to start"); @@ -187,10 +188,13 @@ impl ComputeNode { let sync_output = sync_handle .wait_with_output() .expect("postgres --sync-safekeepers failed"); + if !sync_output.status.success() { anyhow::bail!( - "postgres --sync-safekeepers exited with non-zero status: {}", + "postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}, stderr: {}", sync_output.status, + String::from_utf8(sync_output.stdout).expect("postgres --sync-safekeepers exited, and stdout is not utf-8"), + String::from_utf8(sync_output.stderr).expect("postgres --sync-safekeepers exited, and stderr is not utf-8"), ); } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 207d09d76b..ac065fa60c 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -62,9 +62,16 @@ impl GenericOption { /// Represent `GenericOption` as configuration option. 
pub fn to_pg_setting(&self) -> String { if let Some(val) = &self.value { + let name = match self.name.as_str() { + "safekeepers" => "neon.safekeepers", + "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout", + "wal_acceptor_connect_timeout" => "neon.safekeeper_connect_timeout", + it => it, + }; + match self.vartype.as_ref() { - "string" => format!("{} = '{}'", self.name, val), - _ => format!("{} = {}", self.name, val), + "string" => format!("{} = '{}'", name, val), + _ => format!("{} = {}", name, val), } } else { self.name.to_owned() diff --git a/compute_tools/tests/cluster_spec.json b/compute_tools/tests/cluster_spec.json index bdd6e60a69..c29416d9c4 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/compute_tools/tests/cluster_spec.json @@ -85,7 +85,7 @@ "vartype": "bool" }, { - "name": "safekeepers", + "name": "neon.safekeepers", "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501", "vartype": "string" }, @@ -181,7 +181,6 @@ } ] }, - "delta_operations": [ { "action": "delete_db", diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 1f2e188398..bae944440e 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -28,7 +28,7 @@ mod pg_helpers_tests { assert_eq!( spec.cluster.settings.as_pg_settings(), - "fsync = off\nwal_level = replica\nhot_standby = on\nsafekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" + "fsync = off\nwal_level = replica\nhot_standby = on\nneon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" ); } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index e78f96074e..57b5e1e10a 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -150,7 +150,7 @@ impl PostgresNode { let port: u16 = conf.parse_field("port", &context)?; let timeline_id: ZTimelineId = conf.parse_field("neon.timeline_id", &context)?; let tenant_id: ZTenantId = conf.parse_field("neon.tenant_id", &context)?; - let uses_wal_proposer = conf.get("safekeepers").is_some(); + let uses_wal_proposer = conf.get("neon.safekeepers").is_some(); // parse recovery_target_lsn, if any let recovery_target_lsn: Option = @@ -341,7 +341,7 @@ impl PostgresNode { .map(|sk| format!("localhost:{}", sk.pg_port)) .collect::>() .join(","); - conf.append("safekeepers", &safekeepers); + conf.append("neon.safekeepers", &safekeepers); } else { // We only use 
setup without safekeepers for tests, // and don't care about data durability on pageserver, diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index b6f914858e..d922dd0cb4 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -569,7 +569,7 @@ class ProposerPostgres(PgProtocol): f"neon.timeline_id = '{self.timeline_id.hex}'\n", f"neon.tenant_id = '{self.tenant_id.hex}'\n", f"neon.pageserver_connstring = ''\n", - f"safekeepers = '{safekeepers}'\n", + f"neon.safekeepers = '{safekeepers}'\n", f"listen_addresses = '{self.listen_addr}'\n", f"port = '{self.port}'\n", ] diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fe0a3193c1..5292bc1789 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1855,11 +1855,11 @@ class Postgres(PgProtocol): # walproposer uses different application_name if ("synchronous_standby_names" in cfg_line or # don't repeat safekeepers/wal_acceptors multiple times - "safekeepers" in cfg_line): + "neon.safekeepers" in cfg_line): continue f.write(cfg_line) f.write("synchronous_standby_names = 'walproposer'\n") - f.write("safekeepers = '{}'\n".format(safekeepers)) + f.write("neon.safekeepers = '{}'\n".format(safekeepers)) return self def config(self, lines: List[str]) -> 'Postgres': diff --git a/vendor/postgres b/vendor/postgres index 49015ce98f..7e32bba2aa 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 49015ce98f550d4fc08d3c1fe348faa71a15f51b +Subproject commit 7e32bba2aa2a1752996586bfaf35754f1f0a4d53 From a185821d6f47956e39c275995248f47c5987b2ec Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 18 Aug 2022 22:37:20 +0300 Subject: [PATCH 0651/1022] Explicitly error on cache issues during I/O (#2303) --- pageserver/src/layered_repository/block_io.rs | 9 +++- .../src/layered_repository/ephemeral_file.rs | 54 ++++++++++++------- pageserver/src/layered_repository/timeline.rs | 18 ++++--- pageserver/src/page_cache.rs | 43 ++++++++------- 4 files changed, 77 insertions(+), 47 deletions(-) diff --git a/pageserver/src/layered_repository/block_io.rs b/pageserver/src/layered_repository/block_io.rs index bc3bc082a0..5e32b8833a 100644 --- a/pageserver/src/layered_repository/block_io.rs +++ b/pageserver/src/layered_repository/block_io.rs @@ -157,7 +157,14 @@ where // Look up the right page let cache = page_cache::get(); loop { - match cache.read_immutable_buf(self.file_id, blknum) { + match cache + .read_immutable_buf(self.file_id, blknum) + .map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::Other, + format!("Failed to read immutable buf: {e:#}"), + ) + })? 
{ ReadBufResult::Found(guard) => break Ok(guard), ReadBufResult::NotFound(mut write_guard) => { // Read the page from disk into the buffer diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/layered_repository/ephemeral_file.rs index 1776946e7a..a1b2d68cd5 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/layered_repository/ephemeral_file.rs @@ -12,7 +12,7 @@ use once_cell::sync::Lazy; use std::cmp::min; use std::collections::HashMap; use std::fs::OpenOptions; -use std::io::{Error, ErrorKind}; +use std::io::{self, ErrorKind}; use std::ops::DerefMut; use std::path::PathBuf; use std::sync::{Arc, RwLock}; @@ -51,7 +51,7 @@ impl EphemeralFile { conf: &PageServerConf, tenantid: ZTenantId, timelineid: ZTimelineId, - ) -> Result { + ) -> Result { let mut l = EPHEMERAL_FILES.write().unwrap(); let file_id = l.next_file_id; l.next_file_id += 1; @@ -76,7 +76,7 @@ impl EphemeralFile { }) } - fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> { + fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> { let mut off = 0; while off < PAGE_SZ { let n = self @@ -96,10 +96,13 @@ impl EphemeralFile { Ok(()) } - fn get_buf_for_write(&self, blkno: u32) -> Result { + fn get_buf_for_write(&self, blkno: u32) -> Result { // Look up the right page let cache = page_cache::get(); - let mut write_guard = match cache.write_ephemeral_buf(self.file_id, blkno) { + let mut write_guard = match cache + .write_ephemeral_buf(self.file_id, blkno) + .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))? + { WriteBufResult::Found(guard) => guard, WriteBufResult::NotFound(mut guard) => { // Read the page from disk into the buffer @@ -127,7 +130,7 @@ pub fn is_ephemeral_file(filename: &str) -> bool { } impl FileExt for EphemeralFile { - fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result { + fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result { // Look up the right page let blkno = (offset / PAGE_SZ as u64) as u32; let off = offset as usize % PAGE_SZ; @@ -137,7 +140,10 @@ impl FileExt for EphemeralFile { let mut write_guard; let cache = page_cache::get(); - let buf = match cache.read_ephemeral_buf(self.file_id, blkno) { + let buf = match cache + .read_ephemeral_buf(self.file_id, blkno) + .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))? + { ReadBufResult::Found(guard) => { read_guard = guard; read_guard.as_ref() @@ -158,7 +164,7 @@ impl FileExt for EphemeralFile { Ok(len) } - fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result { + fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result { // Look up the right page let blkno = (offset / PAGE_SZ as u64) as u32; let off = offset as usize % PAGE_SZ; @@ -166,7 +172,10 @@ impl FileExt for EphemeralFile { let mut write_guard; let cache = page_cache::get(); - let buf = match cache.write_ephemeral_buf(self.file_id, blkno) { + let buf = match cache + .write_ephemeral_buf(self.file_id, blkno) + .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))? 
+ { WriteBufResult::Found(guard) => { write_guard = guard; write_guard.deref_mut() @@ -190,7 +199,7 @@ impl FileExt for EphemeralFile { } impl BlobWriter for EphemeralFile { - fn write_blob(&mut self, srcbuf: &[u8]) -> Result { + fn write_blob(&mut self, srcbuf: &[u8]) -> Result { let pos = self.size; let mut blknum = (self.size / PAGE_SZ as u64) as u32; @@ -268,11 +277,11 @@ impl Drop for EphemeralFile { } } -pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> { +pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> { if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) { match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) { Ok(_) => Ok(()), - Err(e) => Err(std::io::Error::new( + Err(e) => Err(io::Error::new( ErrorKind::Other, format!( "failed to write back to ephemeral file at {} error: {}", @@ -282,7 +291,7 @@ pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Er )), } } else { - Err(std::io::Error::new( + Err(io::Error::new( ErrorKind::Other, "could not write back page, not found in ephemeral files hash", )) @@ -292,11 +301,14 @@ pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Er impl BlockReader for EphemeralFile { type BlockLease = page_cache::PageReadGuard<'static>; - fn read_blk(&self, blknum: u32) -> Result { + fn read_blk(&self, blknum: u32) -> Result { // Look up the right page let cache = page_cache::get(); loop { - match cache.read_ephemeral_buf(self.file_id, blknum) { + match cache + .read_ephemeral_buf(self.file_id, blknum) + .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))? + { ReadBufResult::Found(guard) => return Ok(guard), ReadBufResult::NotFound(mut write_guard) => { // Read the page from disk into the buffer @@ -311,6 +323,10 @@ impl BlockReader for EphemeralFile { } } +fn to_io_error(e: anyhow::Error, context: &str) -> io::Error { + io::Error::new(ErrorKind::Other, format!("{context}: {e:#}")) +} + #[cfg(test)] mod tests { use super::*; @@ -322,7 +338,7 @@ mod tests { fn repo_harness( test_name: &str, - ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), Error> { + ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), io::Error> { let repo_dir = PageServerConf::test_repo_dir(test_name); let _ = fs::remove_dir_all(&repo_dir); let conf = PageServerConf::dummy_conf(repo_dir); @@ -339,7 +355,7 @@ mod tests { // Helper function to slurp contents of a file, starting at the current position, // into a string - fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result { + fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result { let mut buf = Vec::new(); buf.resize(len, 0u8); @@ -351,7 +367,7 @@ mod tests { } #[test] - fn test_ephemeral_files() -> Result<(), Error> { + fn test_ephemeral_files() -> Result<(), io::Error> { let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?; let file_a = EphemeralFile::create(conf, tenantid, timelineid)?; @@ -382,7 +398,7 @@ mod tests { } #[test] - fn test_ephemeral_blobs() -> Result<(), Error> { + fn test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenantid, timelineid) = repo_harness("ephemeral_blobs")?; let mut file = EphemeralFile::create(conf, tenantid, timelineid)?; diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 6ef4915bdb..910fc9e9fc 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs 
@@ -2117,7 +2117,7 @@ impl LayeredTimeline { key: Key, request_lsn: Lsn, mut data: ValueReconstructState, - ) -> Result { + ) -> anyhow::Result { // Perform WAL redo if needed data.records.reverse(); @@ -2167,13 +2167,15 @@ impl LayeredTimeline { if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); - cache.memorize_materialized_page( - self.tenant_id, - self.timeline_id, - key, - last_rec_lsn, - &img, - ); + cache + .memorize_materialized_page( + self.tenant_id, + self.timeline_id, + key, + last_rec_lsn, + &img, + ) + .context("Materialized page memoization failed")?; } Ok(img) diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 818eaf1b8f..27b1400243 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -45,6 +45,7 @@ use std::{ }, }; +use anyhow::Context; use once_cell::sync::OnceCell; use tracing::error; use utils::{ @@ -342,7 +343,7 @@ impl PageCache { key: Key, lsn: Lsn, img: &[u8], - ) { + ) -> anyhow::Result<()> { let cache_key = CacheKey::MaterializedPage { hash_key: MaterializedPageHashKey { tenant_id, @@ -352,7 +353,7 @@ impl PageCache { lsn, }; - match self.lock_for_write(&cache_key) { + match self.lock_for_write(&cache_key)? { WriteBufResult::Found(write_guard) => { // We already had it in cache. Another thread must've put it there // concurrently. Check that it had the same contents that we @@ -364,17 +365,19 @@ impl PageCache { write_guard.mark_valid(); } } + + Ok(()) } // Section 1.2: Public interface functions for working with Ephemeral pages. - pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult { + pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result { let mut cache_key = CacheKey::EphemeralPage { file_id, blkno }; self.lock_for_read(&mut cache_key) } - pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> WriteBufResult { + pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result { let cache_key = CacheKey::EphemeralPage { file_id, blkno }; self.lock_for_write(&cache_key) @@ -402,7 +405,7 @@ impl PageCache { // Section 1.3: Public interface functions for working with immutable file pages. - pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult { + pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result { let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno }; self.lock_for_read(&mut cache_key) @@ -495,15 +498,16 @@ impl PageCache { /// } /// ``` /// - fn lock_for_read(&self, cache_key: &mut CacheKey) -> ReadBufResult { + fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result { loop { // First check if the key already exists in the cache. if let Some(read_guard) = self.try_lock_for_read(cache_key) { - return ReadBufResult::Found(read_guard); + return Ok(ReadBufResult::Found(read_guard)); } // Not found. Find a victim buffer - let (slot_idx, mut inner) = self.find_victim(); + let (slot_idx, mut inner) = + self.find_victim().context("Failed to find evict victim")?; // Insert mapping for this. At this point, we may find that another // thread did the same thing concurrently. 
In that case, we evicted @@ -526,10 +530,10 @@ impl PageCache { inner.dirty = false; slot.usage_count.store(1, Ordering::Relaxed); - return ReadBufResult::NotFound(PageWriteGuard { + return Ok(ReadBufResult::NotFound(PageWriteGuard { inner, valid: false, - }); + })); } } @@ -556,15 +560,16 @@ impl PageCache { /// /// Similar to lock_for_read(), but the returned buffer is write-locked and /// may be modified by the caller even if it's already found in the cache. - fn lock_for_write(&self, cache_key: &CacheKey) -> WriteBufResult { + fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result { loop { // First check if the key already exists in the cache. if let Some(write_guard) = self.try_lock_for_write(cache_key) { - return WriteBufResult::Found(write_guard); + return Ok(WriteBufResult::Found(write_guard)); } // Not found. Find a victim buffer - let (slot_idx, mut inner) = self.find_victim(); + let (slot_idx, mut inner) = + self.find_victim().context("Failed to find evict victim")?; // Insert mapping for this. At this point, we may find that another // thread did the same thing concurrently. In that case, we evicted @@ -587,10 +592,10 @@ impl PageCache { inner.dirty = false; slot.usage_count.store(1, Ordering::Relaxed); - return WriteBufResult::NotFound(PageWriteGuard { + return Ok(WriteBufResult::NotFound(PageWriteGuard { inner, valid: false, - }); + })); } } @@ -754,7 +759,7 @@ impl PageCache { /// Find a slot to evict. /// /// On return, the slot is empty and write-locked. - fn find_victim(&self) -> (usize, RwLockWriteGuard) { + fn find_victim(&self) -> anyhow::Result<(usize, RwLockWriteGuard)> { let iter_limit = self.slots.len() * 10; let mut iters = 0; loop { @@ -767,7 +772,7 @@ impl PageCache { let mut inner = match slot.inner.try_write() { Ok(inner) => inner, Err(TryLockError::Poisoned(err)) => { - panic!("buffer lock was poisoned: {:?}", err) + anyhow::bail!("buffer lock was poisoned: {err:?}") } Err(TryLockError::WouldBlock) => { // If we have looped through the whole buffer pool 10 times @@ -777,7 +782,7 @@ impl PageCache { // there are buffers in the pool. In practice, with a reasonably // large buffer pool it really shouldn't happen. if iters > iter_limit { - panic!("could not find a victim buffer to evict"); + anyhow::bail!("exceeded evict iter limit"); } continue; } @@ -804,7 +809,7 @@ impl PageCache { inner.dirty = false; inner.key = None; } - return (slot_idx, inner); + return Ok((slot_idx, inner)); } } } From 37d90dc3b30d480006f3389baa5248cd47e75137 Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 18 Aug 2022 21:51:33 +0200 Subject: [PATCH 0652/1022] Fix dependencies issue between compute-tools and compute node docker images (#2304) Compute node docker image requires compute-tools to build, but this dependency (and the argument for which image to pick) weren't described in the workflow file. This lead to out-of-date binaries in latest builds, which subsequently broke these images. --- .github/workflows/build_and_test.yml | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c9b696e409..1e71a53f99 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -446,9 +446,29 @@ jobs: - name: Kaniko build compute tools run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . 
--dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID + promote-image-compute-tools: + runs-on: dev + needs: [ compute-tools-image ] + if: github.event_name != 'workflow_dispatch' + container: amazon/aws-cli + strategy: + fail-fast: false + matrix: + name: [ compute-tools ] + + steps: + - name: Promote image to latest + run: + MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=$GITHUB_RUN_ID --query 'images[].imageManifest' --output text) && aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST" + compute-node-image: runs-on: dev container: gcr.io/kaniko-project/executor:v1.9.0-debug + # note: This image depends on neondatabase/compute-tools:latest (or :thisversion), + # which isn't available until after the image is promoted. + # Ergo, we must explicitly build and promote compute-tools separately. + needs: + - promote-image-compute-tools steps: - name: Checkout @@ -462,17 +482,17 @@ jobs: - name: Kaniko build compute node working-directory: ./vendor/postgres/ - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg=COMPUTE_TOOLS_TAG=$GITHUB_RUN_ID --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID promote-images: runs-on: dev - needs: [ neon-image, compute-tools-image, compute-node-image ] + needs: [ neon-image, compute-node-image ] if: github.event_name != 'workflow_dispatch' container: amazon/aws-cli strategy: fail-fast: false matrix: - name: [ neon, compute-tools, compute-node ] + name: [ neon, compute-node ] steps: - name: Promote image to latest From 6b9cef02a1a84b2d02671fd4c596d4fd1cd54b19 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 18 Aug 2022 16:55:48 +0300 Subject: [PATCH 0653/1022] Use better defaults for pageserver Docker image --- Dockerfile | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 17aa0025e8..dccf7b6c54 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,3 +1,8 @@ +### Creates a storage Docker image with postgres, pageserver, safekeeper and proxy binaries. +### The image itself is mainly used as a container for the binaries and for starting e2e tests with custom parameters. +### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used +### inside this image in the real deployments. + # Build Postgres FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS pg-build WORKDIR /home/nonroot @@ -58,7 +63,18 @@ COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy COPY --from=pg-build /home/nonroot/tmp_install/ /usr/local/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ +# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. +# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values. 
+RUN mkdir -p /data/.neon/ && chown -R zenith:zenith /data/.neon/ \ + && /usr/local/bin/pageserver -D /data/.neon/ --init \ + -c "id=1234" \ + -c "broker_endpoints=['http://etcd:2379']" \ + -c "pg_distrib_dir='/usr/local'" \ + -c "listen_pg_addr='0.0.0.0:6400'" \ + -c "listen_http_addr='0.0.0.0:9898'" + VOLUME ["/data"] USER zenith EXPOSE 6400 -CMD ["pageserver"] +EXPOSE 9898 +CMD ["/bin/bash"] From 12e87f0df3f39dfa85c2346695941f4128612866 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Fri, 19 Aug 2022 12:07:46 +0200 Subject: [PATCH 0654/1022] Update workflow to fix dependency issue (#2309) * Update workflow to fix dependency issue * Update workflow * Update workflow and dockerfile * Specify tag * Update main dockerfile as well * Mirror rust image to docker hub * Update submodule ref Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 42 +++++++--------------------- Dockerfile | 7 +++-- Dockerfile.compute-tools | 6 +++- vendor/postgres | 2 +- 4 files changed, 21 insertions(+), 36 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1e71a53f99..3a2e8bad64 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -446,29 +446,13 @@ jobs: - name: Kaniko build compute tools run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID - promote-image-compute-tools: - runs-on: dev - needs: [ compute-tools-image ] - if: github.event_name != 'workflow_dispatch' - container: amazon/aws-cli - strategy: - fail-fast: false - matrix: - name: [ compute-tools ] - - steps: - - name: Promote image to latest - run: - MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=$GITHUB_RUN_ID --query 'images[].imageManifest' --output text) && aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST" - compute-node-image: runs-on: dev container: gcr.io/kaniko-project/executor:v1.9.0-debug # note: This image depends on neondatabase/compute-tools:latest (or :thisversion), # which isn't available until after the image is promoted. # Ergo, we must explicitly build and promote compute-tools separately. - needs: - - promote-image-compute-tools + needs: [ compute-tools-image ] steps: - name: Checkout @@ -482,17 +466,17 @@ jobs: - name: Kaniko build compute node working-directory: ./vendor/postgres/ - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg=COMPUTE_TOOLS_TAG=$GITHUB_RUN_ID --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . 
--build-arg=TAG=$GITHUB_RUN_ID --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID promote-images: runs-on: dev - needs: [ neon-image, compute-node-image ] + needs: [ neon-image, compute-node-image, compute-tools-image ] if: github.event_name != 'workflow_dispatch' container: amazon/aws-cli strategy: fail-fast: false matrix: - name: [ neon, compute-node ] + name: [ neon, compute-node, compute-tools ] steps: - name: Promote image to latest @@ -509,18 +493,6 @@ jobs: run: | go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 - -# - name: Get build tag -# run: | -# if [[ "$GITHUB_REF_NAME" == "main" ]]; then -# echo "::set-output name=tag::$(git rev-list --count HEAD)" -# elif [[ "$GITHUB_REF_NAME" == "release" ]]; then -# echo "::set-output name=tag::release-$(git rev-list --count HEAD)" -# else -# echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " -# echo "::set-output name=tag::$GITHUB_RUN_ID" -# fi -# id: build-tag - name: Configure ECR login run: | @@ -536,6 +508,9 @@ jobs: - name: Pull compute node image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node + - name: Pull rust image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust + - name: Configure docker login run: | # ECR Credential Helper & Docker Hub don't work together in config, hence reset @@ -551,6 +526,9 @@ jobs: - name: Push compute node image to Docker Hub run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}} + - name: Push rust image to Docker Hub + run: crane push rust neondatabase/rust:pinned + - name: Add latest tag to images if: | (github.ref_name == 'main' || github.ref_name == 'release') && diff --git a/Dockerfile b/Dockerfile index dccf7b6c54..77598fd086 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,9 +2,12 @@ ### The image itself is mainly used as a container for the binaries and for starting e2e tests with custom parameters. ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used ### inside this image in the real deployments. +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG IMAGE=rust +ARG TAG=pinned # Build Postgres -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS pg-build +FROM $REPOSITORY/$IMAGE:$TAG AS pg-build WORKDIR /home/nonroot COPY vendor/postgres vendor/postgres @@ -17,7 +20,7 @@ RUN set -e \ && tar -C tmp_install -czf /home/nonroot/postgres_install.tar.gz . 
# Build zenith binaries -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS build +FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot ARG GIT_VERSION=local diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 05393021c2..47c408bbf2 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,6 +1,10 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS rust-build +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG IMAGE=rust +ARG TAG=pinned + +FROM $REPOSITORY/$IMAGE:$TAG AS rust-build WORKDIR /home/nonroot # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. diff --git a/vendor/postgres b/vendor/postgres index 7e32bba2aa..3f315a1ec3 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 7e32bba2aa2a1752996586bfaf35754f1f0a4d53 +Subproject commit 3f315a1ec336b3a22a09d2015ce91697def4904e From 80436123349d1020d0193ffc62710c63e588bad2 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 18 Aug 2022 16:03:57 +0300 Subject: [PATCH 0655/1022] Remove Timeline trait, rename LayeredTimeline struct into Timeline --- pageserver/src/basebackup.rs | 17 +- pageserver/src/http/routes.rs | 9 +- pageserver/src/import_datadir.rs | 34 ++- pageserver/src/layered_repository.rs | 22 +- pageserver/src/layered_repository/timeline.rs | 254 +++++++++--------- pageserver/src/lib.rs | 2 - pageserver/src/page_service.rs | 24 +- pageserver/src/pgdatadir_mapping.rs | 112 ++++---- pageserver/src/repository.rs | 99 +------ pageserver/src/tenant_mgr.rs | 10 +- pageserver/src/timelines.rs | 6 +- pageserver/src/walingest.rs | 57 ++-- .../src/walreceiver/connection_manager.rs | 8 +- .../src/walreceiver/walreceiver_connection.rs | 6 +- 14 files changed, 290 insertions(+), 370 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 33f072553f..864c5b8ac8 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -22,8 +22,8 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; +use crate::layered_repository::Timeline; use crate::reltag::{RelTag, SlruKind}; -use crate::DatadirTimeline; use postgres_ffi::v14::pg_constants; use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName}; @@ -36,13 +36,12 @@ use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. -pub struct Basebackup<'a, W, T> +pub struct Basebackup<'a, W> where W: Write, - T: DatadirTimeline, { ar: Builder>, - timeline: &'a Arc, + timeline: &'a Arc, pub lsn: Lsn, prev_record_lsn: Lsn, full_backup: bool, @@ -57,18 +56,17 @@ where // * When working without safekeepers. In this situation it is important to match the lsn // we are taking basebackup on with the lsn that is used in pageserver's walreceiver // to start the replication. 
-impl<'a, W, T> Basebackup<'a, W, T> +impl<'a, W> Basebackup<'a, W> where W: Write, - T: DatadirTimeline, { pub fn new( write: W, - timeline: &'a Arc, + timeline: &'a Arc, req_lsn: Option, prev_lsn: Option, full_backup: bool, - ) -> Result> { + ) -> Result> { // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev). We include prev_record_lsn in the @@ -404,10 +402,9 @@ where } } -impl<'a, W, T> Drop for Basebackup<'a, W, T> +impl<'a, W> Drop for Basebackup<'a, W> where W: Write, - T: DatadirTimeline, { /// If the basebackup was not finished, prevent the Archive::drop() from /// writing the end-of-archive marker. diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1d0adec63d..8d300e554a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -11,10 +11,9 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, }; -use crate::layered_repository::{metadata::TimelineMetadata, LayeredTimeline}; -use crate::pgdatadir_mapping::DatadirTimeline; +use crate::layered_repository::{metadata::TimelineMetadata, Timeline}; +use crate::repository::Repository; use crate::repository::{LocalTimelineState, RepositoryTimeline}; -use crate::repository::{Repository, Timeline}; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant_config::TenantConfOpt; @@ -85,7 +84,7 @@ fn get_config(request: &Request) -> &'static PageServerConf { // Helper functions to construct a LocalTimelineInfo struct for a timeline fn local_timeline_info_from_loaded_timeline( - timeline: &LayeredTimeline, + timeline: &Timeline, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> anyhow::Result { @@ -160,7 +159,7 @@ fn local_timeline_info_from_unloaded_timeline(metadata: &TimelineMetadata) -> Lo } fn local_timeline_info_from_repo_timeline( - repo_timeline: &RepositoryTimeline, + repo_timeline: &RepositoryTimeline, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> anyhow::Result { diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 729829c5e8..54e791e5b5 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -11,6 +11,7 @@ use bytes::Bytes; use tracing::*; use walkdir::WalkDir; +use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::walingest::WalIngest; @@ -39,9 +40,9 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result { /// This is currently only used to import a cluster freshly created by initdb. /// The code that deals with the checkpoint would not work right if the /// cluster was not shut down cleanly. -pub fn import_timeline_from_postgres_datadir( +pub fn import_timeline_from_postgres_datadir( path: &Path, - tline: &T, + tline: &Timeline, lsn: Lsn, ) -> Result<()> { let mut pg_control: Option = None; @@ -99,8 +100,8 @@ pub fn import_timeline_from_postgres_datadir( } // subroutine of import_timeline_from_postgres_datadir(), to load one relation file. 
-fn import_rel( - modification: &mut DatadirModification, +fn import_rel( + modification: &mut DatadirModification, path: &Path, spcoid: Oid, dboid: Oid, @@ -178,8 +179,8 @@ fn import_rel( /// Import an SLRU segment file /// -fn import_slru( - modification: &mut DatadirModification, +fn import_slru( + modification: &mut DatadirModification, slru: SlruKind, path: &Path, mut reader: Reader, @@ -234,12 +235,7 @@ fn import_slru( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. -fn import_wal( - walpath: &Path, - tline: &T, - startpoint: Lsn, - endpoint: Lsn, -) -> Result<()> { +fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> { let mut waldecoder = WalStreamDecoder::new(startpoint); let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE); @@ -305,12 +301,12 @@ fn import_wal( Ok(()) } -pub fn import_basebackup_from_tar( - tline: &T, +pub fn import_basebackup_from_tar( + tline: &Timeline, reader: Reader, base_lsn: Lsn, ) -> Result<()> { - info!("importing base at {}", base_lsn); + info!("importing base at {base_lsn}"); let mut modification = tline.begin_modification(base_lsn); modification.init_empty()?; @@ -347,8 +343,8 @@ pub fn import_basebackup_from_tar( Ok(()) } -pub fn import_wal_from_tar( - tline: &T, +pub fn import_wal_from_tar( + tline: &Timeline, reader: Reader, start_lsn: Lsn, end_lsn: Lsn, @@ -428,8 +424,8 @@ pub fn import_wal_from_tar( Ok(()) } -pub fn import_file( - modification: &mut DatadirModification, +pub fn import_file( + modification: &mut DatadirModification, file_path: &Path, reader: Reader, len: usize, diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 6bf2e71852..c0f4aece54 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -31,7 +31,7 @@ use crate::config::PageServerConf; use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline}; +use crate::repository::{GcResult, Repository, RepositoryTimeline}; use crate::thread_mgr; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; @@ -61,7 +61,7 @@ mod timeline; use storage_layer::Layer; use timeline::LayeredTimelineEntry; -pub use timeline::LayeredTimeline; +pub use timeline::Timeline; // re-export this function so that page_cache.rs can use it. pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; @@ -121,15 +121,13 @@ pub struct LayeredRepository { /// Public interface impl Repository for LayeredRepository { - type Timeline = LayeredTimeline; - - fn get_timeline(&self, timelineid: ZTimelineId) -> Option> { + fn get_timeline(&self, timelineid: ZTimelineId) -> Option> { let timelines = self.timelines.lock().unwrap(); self.get_timeline_internal(timelineid, &timelines) .map(RepositoryTimeline::from) } - fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result> { + fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result> { let mut timelines = self.timelines.lock().unwrap(); match self.get_timeline_load_internal(timelineid, &mut timelines)? 
{ Some(local_loaded_timeline) => Ok(local_loaded_timeline), @@ -140,7 +138,7 @@ impl Repository for LayeredRepository { } } - fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)> { + fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)> { self.timelines .lock() .unwrap() @@ -158,7 +156,7 @@ impl Repository for LayeredRepository { &self, timeline_id: ZTimelineId, initdb_lsn: Lsn, - ) -> Result> { + ) -> Result> { let mut timelines = self.timelines.lock().unwrap(); let vacant_timeline_entry = match timelines.entry(timeline_id) { Entry::Occupied(_) => bail!("Timeline already exists"), @@ -176,7 +174,7 @@ impl Repository for LayeredRepository { let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); timeline::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; - let timeline = LayeredTimeline::new( + let timeline = Timeline::new( self.conf, Arc::clone(&self.tenant_conf), metadata, @@ -539,7 +537,7 @@ impl LayeredRepository { &self, timelineid: ZTimelineId, timelines: &mut HashMap, - ) -> anyhow::Result>> { + ) -> anyhow::Result>> { match timelines.get(&timelineid) { Some(entry) => match entry { LayeredTimelineEntry::Loaded(local_timeline) => { @@ -574,7 +572,7 @@ impl LayeredRepository { &self, timeline_id: ZTimelineId, timelines: &mut HashMap, - ) -> anyhow::Result> { + ) -> anyhow::Result> { let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) .context("failed to load metadata")?; let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -591,7 +589,7 @@ impl LayeredRepository { .map(LayeredTimelineEntry::Loaded); let _enter = info_span!("loading local timeline").entered(); - let timeline = LayeredTimeline::new( + let timeline = Timeline::new( self.conf, Arc::clone(&self.tenant_conf), metadata, diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 910fc9e9fc..da3a6981da 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -9,7 +9,7 @@ use once_cell::sync::Lazy; use tracing::*; use std::cmp::{max, min, Ordering}; -use std::collections::{hash_map::Entry, HashMap, HashSet}; +use std::collections::{HashMap, HashSet}; use std::fs; use std::fs::{File, OpenOptions}; use std::io::Write; @@ -43,7 +43,6 @@ use crate::pgdatadir_mapping::BlockNumber; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; -use crate::DatadirTimeline; use postgres_ffi::v14::xlog_utils::to_pg_timestamp; use utils::{ @@ -52,7 +51,7 @@ use utils::{ zid::{ZTenantId, ZTimelineId}, }; -use crate::repository::{GcResult, RepositoryTimeline, Timeline, TimelineWriter}; +use crate::repository::{GcResult, RepositoryTimeline, TimelineWriter}; use crate::repository::{Key, Value}; use crate::thread_mgr; use crate::virtual_file::VirtualFile; @@ -160,7 +159,7 @@ static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { #[derive(Clone)] pub enum LayeredTimelineEntry { - Loaded(Arc), + Loaded(Arc), Unloaded { id: ZTimelineId, metadata: TimelineMetadata, @@ -191,7 +190,7 @@ impl LayeredTimelineEntry { } } - fn ensure_loaded(&self) -> anyhow::Result<&Arc> { + fn ensure_loaded(&self) -> anyhow::Result<&Arc> { match self { LayeredTimelineEntry::Loaded(timeline) => Ok(timeline), LayeredTimelineEntry::Unloaded { .. 
} => { @@ -213,7 +212,7 @@ impl LayeredTimelineEntry { } } -impl From for RepositoryTimeline { +impl From for RepositoryTimeline { fn from(entry: LayeredTimelineEntry) -> Self { match entry { LayeredTimelineEntry::Loaded(timeline) => RepositoryTimeline::Loaded(timeline as _), @@ -288,7 +287,7 @@ impl TimelineMetrics { } } -pub struct LayeredTimeline { +pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -385,7 +384,7 @@ pub struct LayeredTimeline { pub last_received_wal: Mutex>, /// Relation size cache - rel_size_cache: RwLock>, + pub rel_size_cache: RwLock>, } pub struct WalReceiverInfo { @@ -394,46 +393,6 @@ pub struct WalReceiverInfo { pub last_received_msg_ts: u128, } -/// Inherit all the functions from DatadirTimeline, to provide the -/// functionality to store PostgreSQL relations, SLRUs, etc. in a -/// LayeredTimeline. -impl DatadirTimeline for LayeredTimeline { - fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option { - let rel_size_cache = self.rel_size_cache.read().unwrap(); - if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) { - if lsn >= *cached_lsn { - return Some(*nblocks); - } - } - None - } - - fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { - let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - match rel_size_cache.entry(tag) { - Entry::Occupied(mut entry) => { - let cached_lsn = entry.get_mut(); - if lsn >= cached_lsn.0 { - *cached_lsn = (lsn, nblocks); - } - } - Entry::Vacant(entry) => { - entry.insert((lsn, nblocks)); - } - } - } - - fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { - let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.insert(tag, (lsn, nblocks)); - } - - fn remove_cached_rel_size(&self, tag: &RelTag) { - let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.remove(tag); - } -} - /// /// Information about how much history needs to be retained, needed by /// Garbage Collection. @@ -464,45 +423,37 @@ pub struct GcInfo { } /// Public interface functions -impl Timeline for LayeredTimeline { - fn get_ancestor_lsn(&self) -> Lsn { +impl Timeline { + //------------------------------------------------------------------------------ + // Public GET functions + //------------------------------------------------------------------------------ + + /// Get the LSN where this branch was created + pub fn get_ancestor_lsn(&self) -> Lsn { self.ancestor_lsn } - fn get_ancestor_timeline_id(&self) -> Option { + /// Get the ancestor's timeline id + pub fn get_ancestor_timeline_id(&self) -> Option { self.ancestor_timeline .as_ref() .map(LayeredTimelineEntry::timeline_id) } - /// Wait until WAL has been received up to the given LSN. - fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { - // This should never be called from the WAL receiver thread, because that could lead - // to a deadlock. 
- ensure!( - !IS_WAL_RECEIVER.with(|c| c.get()), - "wait_lsn called by WAL receiver thread" - ); - - self.metrics.wait_lsn_time_histo.observe_closure_duration( - || self.last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .with_context(|| { - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - }))?; - - Ok(()) - } - - fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard { + /// Lock and get timeline's GC cuttof + pub fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard { self.latest_gc_cutoff_lsn.read().unwrap() } - /// Look up the value with the given a key - fn get(&self, key: Key, lsn: Lsn) -> Result { + /// Look up given page version. + /// + /// NOTE: It is considered an error to 'get' a key that doesn't exist. The abstraction + /// above this needs to store suitable metadata to track what data exists with + /// what keys, in separate metadata entries. If a non-existent key is requested, + /// the Repository implementation may incorrectly return a value from an ancestor + /// branch, for example, or waste a lot of cycles chasing the non-existing key. + /// + pub fn get(&self, key: Key, lsn: Lsn) -> Result { // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image // and requested LSN. The cached image can also be used to reduce the amount of WAL needed @@ -531,68 +482,31 @@ impl Timeline for LayeredTimeline { .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) } - /// Public entry point for checkpoint(). All the logic is in the private - /// checkpoint_internal function, this public facade just wraps it for - /// metrics collection. - fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { - match cconf { - CheckpointConfig::Flush => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers(true) - } - CheckpointConfig::Forced => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers(true)?; - self.compact() - } - } - } - - /// - /// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn. - /// - fn check_lsn_is_in_scope( - &self, - lsn: Lsn, - latest_gc_cutoff_lsn: &RwLockReadGuard, - ) -> Result<()> { - ensure!( - lsn >= **latest_gc_cutoff_lsn, - "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", - lsn, - **latest_gc_cutoff_lsn, - ); - Ok(()) - } - - fn get_last_record_lsn(&self) -> Lsn { + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. + pub fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last } - fn get_prev_record_lsn(&self) -> Lsn { + pub fn get_prev_record_lsn(&self) -> Lsn { self.last_record_lsn.load().prev } - fn get_last_record_rlsn(&self) -> RecordLsn { + /// Atomically get both last and prev. 
+ pub fn get_last_record_rlsn(&self) -> RecordLsn { self.last_record_lsn.load() } - fn get_disk_consistent_lsn(&self) -> Lsn { + pub fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() } - fn writer<'a>(&'a self) -> Box { - Box::new(LayeredTimelineWriter { - tl: self, - _write_guard: self.write_lock.lock().unwrap(), - }) - } - - fn get_physical_size(&self) -> u64 { + /// Get the physical size of the timeline at the latest LSN + pub fn get_physical_size(&self) -> u64 { self.metrics.current_physical_size_gauge.get() } - fn get_physical_size_non_incremental(&self) -> anyhow::Result { + /// Get the physical size of the timeline at the latest LSN non incrementally + pub fn get_physical_size_non_incremental(&self) -> anyhow::Result { let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); // total size of layer files in the current timeline directory let mut total_physical_size = 0; @@ -611,9 +525,89 @@ impl Timeline for LayeredTimeline { Ok(total_physical_size) } + + /// + /// Wait until WAL has been received and processed up to this LSN. + /// + /// You should call this before any of the other get_* or list_* functions. Calling + /// those functions with an LSN that has been processed yet is an error. + /// + pub fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + // This should never be called from the WAL receiver thread, because that could lead + // to a deadlock. + ensure!( + !IS_WAL_RECEIVER.with(|c| c.get()), + "wait_lsn called by WAL receiver thread" + ); + + self.metrics.wait_lsn_time_histo.observe_closure_duration( + || self.last_record_lsn + .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) + .with_context(|| { + format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", + lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() + ) + }))?; + + Ok(()) + } + + /// Check that it is valid to request operations with that lsn. + pub fn check_lsn_is_in_scope( + &self, + lsn: Lsn, + latest_gc_cutoff_lsn: &RwLockReadGuard, + ) -> Result<()> { + ensure!( + lsn >= **latest_gc_cutoff_lsn, + "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", + lsn, + **latest_gc_cutoff_lsn, + ); + Ok(()) + } + + //------------------------------------------------------------------------------ + // Public PUT functions, to update the repository with new page versions. + // + // These are called by the WAL receiver to digest WAL records. + //------------------------------------------------------------------------------ + + /// Flush to disk all data that was written with the put_* functions + /// + /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't + /// know anything about them here in the repository. + pub fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { + match cconf { + CheckpointConfig::Flush => { + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true) + } + CheckpointConfig::Forced => { + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true)?; + self.compact() + } + } + } + + /// Mutate the timeline with a [`TimelineWriter`]. + /// + /// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter + /// is a generic type in this trait. 
But that doesn't currently work in + /// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html + /// TODO kb replace with the concrete type + pub fn writer<'a>(&'a self) -> Box { + Box::new(LayeredTimelineWriter { + tl: self, + _write_guard: self.write_lock.lock().unwrap(), + }) + } } -impl LayeredTimeline { +// Private functions +impl Timeline { fn get_checkpoint_distance(&self) -> u64 { let tenant_conf = self.tenant_conf.read().unwrap(); tenant_conf @@ -662,8 +656,8 @@ impl LayeredTimeline { tenant_id: ZTenantId, walredo_mgr: Arc, upload_layers: bool, - ) -> LayeredTimeline { - let mut result = LayeredTimeline { + ) -> Timeline { + let mut result = Timeline { conf, tenant_conf, timeline_id, @@ -1014,7 +1008,7 @@ impl LayeredTimeline { Some((lsn, img)) } - fn get_ancestor_timeline(&self) -> Result> { + fn get_ancestor_timeline(&self) -> Result> { let ancestor = self .ancestor_timeline .as_ref() @@ -1135,7 +1129,7 @@ impl LayeredTimeline { /// Also flush after a period of time without new data -- it helps /// safekeepers to regard pageserver as caught up and suspend activity. /// - pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { + pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { let last_lsn = self.get_last_record_lsn(); let layers = self.layers.read().unwrap(); if let Some(open_layer) = &layers.open_layer { @@ -2211,12 +2205,12 @@ fn layer_traversal_error( } struct LayeredTimelineWriter<'a> { - tl: &'a LayeredTimeline, + tl: &'a Timeline, _write_guard: MutexGuard<'a, ()>, } impl Deref for LayeredTimelineWriter<'_> { - type Target = dyn Timeline; + type Target = Timeline; fn deref(&self) -> &Self::Target { self.tl diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 47fd8a84cf..06c5f552a4 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -28,8 +28,6 @@ use tracing::info; use crate::thread_mgr::ThreadKind; use metrics::{register_int_gauge_vec, IntGaugeVec}; -use pgdatadir_mapping::DatadirTimeline; - /// Current storage format version /// /// This is embedded in the metadata file, and also in the header of all the diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b63bb90be1..f5f1e4d7bd 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -30,11 +30,11 @@ use utils::{ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; -use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp}; +use crate::layered_repository::Timeline; +use crate::pgdatadir_mapping::LsnForTimestamp; use crate::profiling::profpoint_start; use crate::reltag::RelTag; use crate::repository::Repository; -use crate::repository::Timeline; use crate::tenant_mgr; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; @@ -636,8 +636,8 @@ impl PageServerHandler { /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. 
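// A condensed sketch of the rule described above (hypothetical code, not the actual
// function body; the real implementation additionally validates the LSN against the
// GC cutoff passed in below):
//
//     let effective_lsn = if latest {
//         // serve the newest data available, but never older than the request
//         let last = timeline.get_last_record_lsn();
//         if lsn <= last { last } else { timeline.wait_lsn(lsn)?; lsn }
//     } else {
//         timeline.wait_lsn(lsn)?;
//         lsn
//     };
//
// `get_last_record_lsn` and `wait_lsn` are the concrete `Timeline` methods shown
// earlier in this patch.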
- fn wait_or_get_last_lsn( - timeline: &T, + fn wait_or_get_last_lsn( + timeline: &Timeline, mut lsn: Lsn, latest: bool, latest_gc_cutoff_lsn: &RwLockReadGuard, @@ -684,9 +684,9 @@ impl PageServerHandler { Ok(lsn) } - fn handle_get_rel_exists_request( + fn handle_get_rel_exists_request( &self, - timeline: &T, + timeline: &Timeline, req: &PagestreamExistsRequest, ) -> Result { let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); @@ -701,9 +701,9 @@ impl PageServerHandler { })) } - fn handle_get_nblocks_request( + fn handle_get_nblocks_request( &self, - timeline: &T, + timeline: &Timeline, req: &PagestreamNblocksRequest, ) -> Result { let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered(); @@ -717,9 +717,9 @@ impl PageServerHandler { })) } - fn handle_db_size_request( + fn handle_db_size_request( &self, - timeline: &T, + timeline: &Timeline, req: &PagestreamDbSizeRequest, ) -> Result { let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered(); @@ -735,9 +735,9 @@ impl PageServerHandler { })) } - fn handle_get_page_at_lsn_request( + fn handle_get_page_at_lsn_request( &self, - timeline: &T, + timeline: &Timeline, req: &PagestreamGetPageRequest, ) -> Result { let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 88fac0ad5a..d10e48393c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,8 +7,8 @@ //! Clarify that) //! use crate::keyspace::{KeySpace, KeySpaceAccum}; +use crate::layered_repository::Timeline; use crate::reltag::{RelTag, SlruKind}; -use crate::repository::Timeline; use crate::repository::*; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; @@ -18,7 +18,7 @@ use postgres_ffi::v14::xlog_utils::TimestampTz; use postgres_ffi::BLCKSZ; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, HashSet}; +use std::collections::{hash_map, HashMap, HashSet}; use std::ops::Range; use tracing::{debug, trace, warn}; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -35,23 +35,13 @@ pub enum LsnForTimestamp { } /// -/// This trait provides all the functionality to store PostgreSQL relations, SLRUs, +/// This impl provides all the functionality to store PostgreSQL relations, SLRUs, /// and other special kinds of files, in a versioned key-value store. The -/// Timeline trait provides the key-value store. +/// Timeline struct provides the key-value store. /// -/// This is a trait, so that we can easily include all these functions in a Timeline -/// implementation. You're not expected to have different implementations of this trait, -/// rather, this provides an interface and implementation, over Timeline. -/// -/// If you wanted to store other kinds of data in the Neon repository, e.g. -/// flat files or MySQL, you would create a new trait like this, with all the -/// functions that make sense for the kind of data you're storing. For flat files, -/// for example, you might have a function like "fn read(path, offset, size)". -/// We might also have that situation in the future, to support multiple PostgreSQL -/// versions, if there are big changes in how the data is organized in the data -/// directory, or if new special files are introduced. 
-/// -pub trait DatadirTimeline: Timeline { +/// This is a separate impl, so that we can easily include all these functions in a Timeline +/// implementation, and might be moved into a separate struct later. +impl Timeline { /// Start ingesting a WAL record, or other atomic modification of /// the timeline. /// @@ -75,7 +65,7 @@ pub trait DatadirTimeline: Timeline { /// functions of the timeline until you finish! And if you update the /// same page twice, the last update wins. /// - fn begin_modification(&self, lsn: Lsn) -> DatadirModification + pub fn begin_modification(&self, lsn: Lsn) -> DatadirModification where Self: Sized, { @@ -93,7 +83,7 @@ pub trait DatadirTimeline: Timeline { //------------------------------------------------------------------------------ /// Look up given page version. - fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result { + pub fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); let nblocks = self.get_rel_size(tag, lsn)?; @@ -110,7 +100,7 @@ pub trait DatadirTimeline: Timeline { } // Get size of a database in blocks - fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { let mut total_blocks = 0; let rels = self.list_rels(spcnode, dbnode, lsn)?; @@ -123,7 +113,7 @@ pub trait DatadirTimeline: Timeline { } /// Get size of a relation file - fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { + pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { @@ -151,7 +141,7 @@ pub trait DatadirTimeline: Timeline { } /// Does relation exist? - fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { + pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); // first try to lookup relation in cache @@ -169,7 +159,7 @@ pub trait DatadirTimeline: Timeline { } /// Get a list of all existing relations in given tablespace and database. - fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { + pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); let buf = self.get(key, lsn)?; @@ -187,7 +177,7 @@ pub trait DatadirTimeline: Timeline { } /// Look up given SLRU page version. - fn get_slru_page_at_lsn( + pub fn get_slru_page_at_lsn( &self, kind: SlruKind, segno: u32, @@ -199,14 +189,19 @@ pub trait DatadirTimeline: Timeline { } /// Get size of an SLRU segment - fn get_slru_segment_size(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { + pub fn get_slru_segment_size( + &self, + kind: SlruKind, + segno: u32, + lsn: Lsn, + ) -> Result { let key = slru_segment_size_to_key(kind, segno); let mut buf = self.get(key, lsn)?; Ok(buf.get_u32_le()) } /// Get size of an SLRU segment - fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { + pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); let buf = self.get(key, lsn)?; @@ -223,7 +218,7 @@ pub trait DatadirTimeline: Timeline { /// so it's not well defined which LSN you get if there were multiple commits /// "in flight" at that point in time. 
/// - fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { + pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; let max_lsn = self.get_last_record_lsn(); @@ -286,7 +281,7 @@ pub trait DatadirTimeline: Timeline { /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits /// with a smaller/larger timestamp. /// - fn is_latest_commit_timestamp_ge_than( + pub fn is_latest_commit_timestamp_ge_than( &self, search_timestamp: TimestampTz, probe_lsn: Lsn, @@ -317,7 +312,7 @@ pub trait DatadirTimeline: Timeline { } /// Get a list of SLRU segments - fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { + pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { // fetch directory entry let key = slru_dir_to_key(kind); @@ -327,14 +322,14 @@ pub trait DatadirTimeline: Timeline { Ok(dir.segments) } - fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { let key = relmap_file_key(spcnode, dbnode); let buf = self.get(key, lsn)?; Ok(buf) } - fn list_dbdirs(&self, lsn: Lsn) -> Result> { + pub fn list_dbdirs(&self, lsn: Lsn) -> Result> { // fetch directory entry let buf = self.get(DBDIR_KEY, lsn)?; let dir = DbDirectory::des(&buf)?; @@ -342,13 +337,13 @@ pub trait DatadirTimeline: Timeline { Ok(dir.dbdirs) } - fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { + pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { let key = twophase_file_key(xid); let buf = self.get(key, lsn)?; Ok(buf) } - fn list_twophase_files(&self, lsn: Lsn) -> Result> { + pub fn list_twophase_files(&self, lsn: Lsn) -> Result> { // fetch directory entry let buf = self.get(TWOPHASEDIR_KEY, lsn)?; let dir = TwoPhaseDirectory::des(&buf)?; @@ -356,11 +351,11 @@ pub trait DatadirTimeline: Timeline { Ok(dir.xids) } - fn get_control_file(&self, lsn: Lsn) -> Result { + pub fn get_control_file(&self, lsn: Lsn) -> Result { self.get(CONTROLFILE_KEY, lsn) } - fn get_checkpoint(&self, lsn: Lsn) -> Result { + pub fn get_checkpoint(&self, lsn: Lsn) -> Result { self.get(CHECKPOINT_KEY, lsn) } @@ -369,7 +364,7 @@ pub trait DatadirTimeline: Timeline { /// /// Only relation blocks are counted currently. That excludes metadata, /// SLRUs, twophase files etc. - fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { + pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn)?; let dbdir = DbDirectory::des(&buf)?; @@ -391,7 +386,7 @@ pub trait DatadirTimeline: Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). 
- fn collect_keyspace(&self, lsn: Lsn) -> Result { + pub fn collect_keyspace(&self, lsn: Lsn) -> Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -465,27 +460,54 @@ pub trait DatadirTimeline: Timeline { } /// Get cached size of relation if it not updated after specified LSN - fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option; + pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option { + let rel_size_cache = self.rel_size_cache.read().unwrap(); + if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) { + if lsn >= *cached_lsn { + return Some(*nblocks); + } + } + None + } /// Update cached relation size if there is no more recent update - fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber); + pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { + let mut rel_size_cache = self.rel_size_cache.write().unwrap(); + match rel_size_cache.entry(tag) { + hash_map::Entry::Occupied(mut entry) => { + let cached_lsn = entry.get_mut(); + if lsn >= cached_lsn.0 { + *cached_lsn = (lsn, nblocks); + } + } + hash_map::Entry::Vacant(entry) => { + entry.insert((lsn, nblocks)); + } + } + } /// Store cached relation size - fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber); + pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { + let mut rel_size_cache = self.rel_size_cache.write().unwrap(); + rel_size_cache.insert(tag, (lsn, nblocks)); + } /// Remove cached relation size - fn remove_cached_rel_size(&self, tag: &RelTag); + pub fn remove_cached_rel_size(&self, tag: &RelTag) { + let mut rel_size_cache = self.rel_size_cache.write().unwrap(); + rel_size_cache.remove(tag); + } } /// DatadirModification represents an operation to ingest an atomic set of /// updates to the repository. It is created by the 'begin_record' /// function. It is called for each WAL record, so that all the modifications /// by a one WAL record appear atomic. -pub struct DatadirModification<'a, T: DatadirTimeline> { +pub struct DatadirModification<'a> { /// The timeline this modification applies to. You can access this to /// read the state, but note that any pending updates are *not* reflected /// in the state in 'tline' yet. - pub tline: &'a T, + pub tline: &'a Timeline, /// Lsn assigned by begin_modification pub lsn: Lsn, @@ -498,7 +520,7 @@ pub struct DatadirModification<'a, T: DatadirTimeline> { pending_nblocks: isize, } -impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { +impl<'a> DatadirModification<'a> { /// Initialize a completely new repository. 
/// /// This inserts the directory metadata entries that are assumed to @@ -1371,7 +1393,7 @@ fn is_slru_block_key(key: Key) -> bool { pub fn create_test_timeline( repo: R, timeline_id: utils::zid::ZTimelineId, -) -> Result> { +) -> Result> { let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index d09b01437c..5cdc27a846 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,19 +1,16 @@ use crate::layered_repository::metadata::TimelineMetadata; +use crate::layered_repository::Timeline; use crate::storage_sync::index::RemoteIndex; use crate::walrecord::ZenithWalRecord; -use crate::CheckpointConfig; use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::fmt; use std::ops::{AddAssign, Range}; -use std::sync::{Arc, RwLockReadGuard}; +use std::sync::Arc; use std::time::Duration; -use utils::{ - lsn::{Lsn, RecordLsn}, - zid::ZTimelineId, -}; +use utils::{lsn::Lsn, zid::ZTimelineId}; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] /// Key used in the Repository kv-store. @@ -185,22 +182,20 @@ impl Value { /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. pub trait Repository: Send + Sync { - type Timeline: crate::DatadirTimeline; - /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization. /// See [`crate::remote_storage`] for more details about the synchronization. fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>; /// Get Timeline handle for given zenith timeline ID. /// This function is idempotent. It doesn't change internal state in any way. - fn get_timeline(&self, timelineid: ZTimelineId) -> Option>; + fn get_timeline(&self, timelineid: ZTimelineId) -> Option>; /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded. - fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result>; + fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result>; /// Lists timelines the repository contains. /// Up to repository's implementation to omit certain timelines that ar not considered ready for use. - fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)>; + fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)>; /// Create a new, empty timeline. The caller is responsible for loading data into it /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. @@ -208,7 +203,7 @@ pub trait Repository: Send + Sync { &self, timeline_id: ZTimelineId, initdb_lsn: Lsn, - ) -> Result>; + ) -> Result>; /// Branch a timeline fn branch_timeline( @@ -305,81 +300,6 @@ impl AddAssign for GcResult { } } -pub trait Timeline: Send + Sync { - //------------------------------------------------------------------------------ - // Public GET functions - //------------------------------------------------------------------------------ - - /// - /// Wait until WAL has been received and processed up to this LSN. - /// - /// You should call this before any of the other get_* or list_* functions. Calling - /// those functions with an LSN that has been processed yet is an error. 
- /// - fn wait_lsn(&self, lsn: Lsn) -> Result<()>; - - /// Lock and get timeline's GC cuttof - fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard; - - /// Look up given page version. - /// - /// NOTE: It is considered an error to 'get' a key that doesn't exist. The abstraction - /// above this needs to store suitable metadata to track what data exists with - /// what keys, in separate metadata entries. If a non-existent key is requested, - /// the Repository implementation may incorrectly return a value from an ancestor - /// branch, for example, or waste a lot of cycles chasing the non-existing key. - /// - fn get(&self, key: Key, lsn: Lsn) -> Result; - - /// Get the ancestor's timeline id - fn get_ancestor_timeline_id(&self) -> Option; - - /// Get the LSN where this branch was created - fn get_ancestor_lsn(&self) -> Lsn; - - //------------------------------------------------------------------------------ - // Public PUT functions, to update the repository with new page versions. - // - // These are called by the WAL receiver to digest WAL records. - //------------------------------------------------------------------------------ - /// Atomically get both last and prev. - fn get_last_record_rlsn(&self) -> RecordLsn; - - /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. - fn get_last_record_lsn(&self) -> Lsn; - - fn get_prev_record_lsn(&self) -> Lsn; - - fn get_disk_consistent_lsn(&self) -> Lsn; - - /// Mutate the timeline with a [`TimelineWriter`]. - /// - /// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter - /// is a generic type in this trait. But that doesn't currently work in - /// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html - fn writer<'a>(&'a self) -> Box; - - /// - /// Flush to disk all data that was written with the put_* functions - /// - /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't - /// know anything about them here in the repository. - fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>; - - /// - /// Check that it is valid to request operations with that lsn. - fn check_lsn_is_in_scope( - &self, - lsn: Lsn, - latest_gc_cutoff_lsn: &RwLockReadGuard, - ) -> Result<()>; - - /// Get the physical size of the timeline at the latest LSN - fn get_physical_size(&self) -> u64; - /// Get the physical size of the timeline at the latest LSN non incrementally - fn get_physical_size_non_incremental(&self) -> Result; -} - /// Various functions to mutate the timeline. // TODO Currently, Deref is used to allow easy access to read methods from this trait. 
// This is probably considered a bad practice in Rust and should be fixed eventually, @@ -581,6 +501,9 @@ pub mod repo_harness { #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { + use crate::layered_repository::Timeline; + use crate::CheckpointConfig; + use super::repo_harness::*; use super::*; //use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT}; @@ -689,7 +612,7 @@ mod tests { Ok(()) } - fn make_some_layers(tline: &T, start_lsn: Lsn) -> Result<()> { + fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> Result<()> { let mut lsn = start_lsn; #[allow(non_snake_case)] { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 64f1caa542..36c3e569a6 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,7 +3,7 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; -use crate::layered_repository::{load_metadata, LayeredRepository, LayeredTimeline}; +use crate::layered_repository::{load_metadata, LayeredRepository, Timeline}; use crate::repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; @@ -100,7 +100,7 @@ struct Tenant { /// /// Local timelines have more metadata that's loaded into memory, /// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`]. - local_timelines: HashMap>, + local_timelines: HashMap>, } #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -177,7 +177,7 @@ pub enum LocalTimelineUpdate { }, Attach { id: ZTenantTimelineId, - datadir: Arc, + datadir: Arc, }, } @@ -379,7 +379,7 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result anyhow::Result> { +) -> anyhow::Result> { let mut m = tenants_state::write_tenants(); let tenant = m .get_mut(&tenant_id) @@ -486,7 +486,7 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any fn load_local_timeline( repo: &LayeredRepository, timeline_id: ZTimelineId, -) -> anyhow::Result> { +) -> anyhow::Result> { let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| { format!("Inmem timeline {timeline_id} not found in tenant's repository") })?; diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 0d35195691..6a55dd286e 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -20,15 +20,15 @@ use utils::{ use crate::import_datadir; use crate::tenant_mgr; +use crate::CheckpointConfig; use crate::{ config::PageServerConf, repository::Repository, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, }; use crate::{ - layered_repository::{LayeredRepository, LayeredTimeline}, + layered_repository::{LayeredRepository, Timeline}, walredo::WalRedoManager, }; -use crate::{repository::Timeline, CheckpointConfig}; #[derive(Debug, Clone, Copy)] pub struct PointInTime { @@ -160,7 +160,7 @@ pub(crate) fn create_timeline( new_timeline_id: Option, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, -) -> Result)>> { +) -> Result)>> { let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 05afe4ba3e..c24ffc49de 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -30,6 +30,7 @@ use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; +use 
crate::layered_repository::Timeline; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::walrecord::*; @@ -43,15 +44,15 @@ use utils::lsn::Lsn; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); -pub struct WalIngest<'a, T: DatadirTimeline> { - timeline: &'a T, +pub struct WalIngest<'a> { + timeline: &'a Timeline, checkpoint: CheckPoint, checkpoint_modified: bool, } -impl<'a, T: DatadirTimeline> WalIngest<'a, T> { - pub fn new(timeline: &T, startpoint: Lsn) -> Result> { +impl<'a> WalIngest<'a> { + pub fn new(timeline: &Timeline, startpoint: Lsn) -> Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. let checkpoint_bytes = timeline.get_checkpoint(startpoint)?; @@ -77,7 +78,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { &mut self, recdata: Bytes, lsn: Lsn, - modification: &mut DatadirModification, + modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, ) -> Result<()> { modification.lsn = lsn; @@ -266,7 +267,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn ingest_decoded_block( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, @@ -326,7 +327,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn ingest_heapam_record( &mut self, buf: &mut Bytes, - modification: &mut DatadirModification, + modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, ) -> Result<()> { // Handle VM bit updates that are implicitly part of heap records. @@ -470,7 +471,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record. fn ingest_xlog_dbase_create( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlCreateDatabase, ) -> Result<()> { let db_id = rec.db_id; @@ -537,7 +538,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn ingest_xlog_smgr_create( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlSmgrCreate, ) -> Result<()> { let rel = RelTag { @@ -555,7 +556,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { /// This is the same logic as in PostgreSQL's smgr_redo() function. 
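// A minimal sketch of driving the now non-generic WAL ingest path (hypothetical snippet,
// not from this patch; `WalIngest::new`, `begin_modification` and `ingest_record` appear
// in this diff, `DecodedWALRecord::default()` is assumed):
//
//     let mut walingest = WalIngest::new(&timeline, startpoint)?;
//     let mut modification = timeline.begin_modification(lsn);
//     let mut decoded = DecodedWALRecord::default();
//     walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
//
// Every `DatadirModification<'_, T>` in the old code collapses to the plain
// `DatadirModification<'_>` borrowing the concrete `Timeline`.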
fn ingest_xlog_smgr_truncate( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlSmgrTruncate, ) -> Result<()> { let spcnode = rec.rnode.spcnode; @@ -620,7 +621,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { /// fn ingest_xact_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, parsed: &XlXactParsedRecord, is_commit: bool, ) -> Result<()> { @@ -689,7 +690,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn ingest_clog_truncate_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlClogTruncate, ) -> Result<()> { info!( @@ -747,7 +748,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn ingest_multixact_create_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlMultiXactCreate, ) -> Result<()> { // Create WAL record for updating the multixact-offsets page @@ -826,7 +827,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn ingest_multixact_truncate_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlMultiXactTruncate, ) -> Result<()> { self.checkpoint.oldestMulti = xlrec.end_trunc_off; @@ -860,7 +861,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn ingest_relmap_page( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlRelmapUpdate, decoded: &DecodedWALRecord, ) -> Result<()> { @@ -876,7 +877,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn put_rel_creation( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, ) -> Result<()> { modification.put_rel_creation(rel, 0)?; @@ -885,7 +886,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn put_rel_page_image( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, img: Bytes, @@ -897,7 +898,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn put_rel_wal_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, rec: ZenithWalRecord, @@ -909,7 +910,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn put_rel_truncation( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, nblocks: BlockNumber, ) -> Result<()> { @@ -917,11 +918,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { Ok(()) } - fn put_rel_drop( - &mut self, - modification: &mut DatadirModification, - rel: RelTag, - ) -> Result<()> { + fn put_rel_drop(&mut self, modification: &mut DatadirModification, rel: RelTag) -> Result<()> { modification.put_rel_drop(rel)?; Ok(()) } @@ -937,7 +934,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn handle_rel_extend( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, ) -> Result<()> { @@ -968,7 +965,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn put_slru_page_image( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, kind: SlruKind, segno: u32, blknum: BlockNumber, @@ -981,7 +978,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn handle_slru_extend( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, kind: SlruKind, segno: u32, blknum: 
BlockNumber, @@ -1032,9 +1029,9 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { #[cfg(test)] mod tests { use super::*; + use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::create_test_timeline; use crate::repository::repo_harness::*; - use crate::repository::Timeline; use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; @@ -1046,13 +1043,13 @@ mod tests { forknum: 0, }; - fn assert_current_logical_size(_timeline: &T, _lsn: Lsn) { + fn assert_current_logical_size(_timeline: &Timeline, _lsn: Lsn) { // TODO } static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - fn init_walingest_test(tline: &T) -> Result> { + fn init_walingest_test(tline: &Timeline) -> Result { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index e8e0a7c52b..2fc44cb26a 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -16,7 +16,7 @@ use std::{ time::Duration, }; -use crate::{layered_repository::LayeredTimeline, repository::Timeline}; +use crate::layered_repository::Timeline; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -39,7 +39,7 @@ pub(super) fn spawn_connection_manager_task( id: ZTenantTimelineId, broker_loop_prefix: String, mut client: Client, - local_timeline: Arc, + local_timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, @@ -242,7 +242,7 @@ const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; struct WalreceiverState { id: ZTenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. - local_timeline: Arc, + local_timeline: Arc, /// The timeout on the connection to safekeeper for WAL streaming. wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. 
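// A call-site sketch after the rename (hypothetical; only the `Arc<Timeline>` parameter
// type and `get_timeline_load` are taken from the diffs above, the other argument names
// are placeholders):
//
//     let local_timeline: Arc<Timeline> = repo.get_timeline_load(timeline_id)?;
//     spawn_connection_manager_task(
//         id, broker_loop_prefix, client, local_timeline,
//         wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, /* ... */
//     )?;
//
// The trailing comment stands in for any remaining parameters not shown in this hunk.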
@@ -300,7 +300,7 @@ struct EtcdSkTimeline { impl WalreceiverState { fn new( id: ZTenantTimelineId, - local_timeline: Arc, + local_timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 025bfeb506..283cc76e66 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -20,11 +20,7 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; use crate::{ - layered_repository::WalReceiverInfo, - pgdatadir_mapping::DatadirTimeline, - repository::{Repository, Timeline}, - tenant_mgr, - walingest::WalIngest, + layered_repository::WalReceiverInfo, repository::Repository, tenant_mgr, walingest::WalIngest, walrecord::DecodedWALRecord, }; use postgres_ffi::v14::waldecoder::WalStreamDecoder; From c19b4a65f96062e1aeb521e17fde204b27ca2158 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 18 Aug 2022 16:20:52 +0300 Subject: [PATCH 0656/1022] Remove Repository trait, rename LayeredRepository struct into Repository --- pageserver/src/http/routes.rs | 1 - pageserver/src/layered_repository.rs | 69 +++++++++------ pageserver/src/page_service.rs | 1 - pageserver/src/pgdatadir_mapping.rs | 4 +- pageserver/src/repository.rs | 85 ++----------------- pageserver/src/tenant_mgr.rs | 17 ++-- pageserver/src/tenant_tasks.rs | 1 - pageserver/src/timelines.rs | 15 ++-- .../src/walreceiver/connection_manager.rs | 5 +- .../src/walreceiver/walreceiver_connection.rs | 2 +- 10 files changed, 71 insertions(+), 129 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8d300e554a..da21f6883a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -12,7 +12,6 @@ use super::models::{ TimelineCreateRequest, }; use crate::layered_repository::{metadata::TimelineMetadata, Timeline}; -use crate::repository::Repository; use crate::repository::{LocalTimelineState, RepositoryTimeline}; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index c0f4aece54..a5877c8482 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -31,7 +31,7 @@ use crate::config::PageServerConf; use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::repository::{GcResult, Repository, RepositoryTimeline}; +use crate::repository::{GcResult, RepositoryTimeline}; use crate::thread_mgr; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; @@ -78,7 +78,7 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; /// /// Repository consists of multiple timelines. Keep them in a hash table. /// -pub struct LayeredRepository { +pub struct Repository { // Global pageserver config parameters pub conf: &'static PageServerConf, @@ -119,15 +119,19 @@ pub struct LayeredRepository { upload_layers: bool, } -/// Public interface -impl Repository for LayeredRepository { - fn get_timeline(&self, timelineid: ZTimelineId) -> Option> { +/// A repository corresponds to one .neon directory. One repository holds multiple +/// timelines, forked off from the same initial call to 'initdb'. +impl Repository { + /// Get Timeline handle for given zenith timeline ID. + /// This function is idempotent. 
It doesn't change internal state in any way. + pub fn get_timeline(&self, timelineid: ZTimelineId) -> Option> { let timelines = self.timelines.lock().unwrap(); self.get_timeline_internal(timelineid, &timelines) .map(RepositoryTimeline::from) } - fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result> { + /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded. + pub fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result> { let mut timelines = self.timelines.lock().unwrap(); match self.get_timeline_load_internal(timelineid, &mut timelines)? { Some(local_loaded_timeline) => Ok(local_loaded_timeline), @@ -138,7 +142,9 @@ impl Repository for LayeredRepository { } } - fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)> { + /// Lists timelines the repository contains. + /// Up to repository's implementation to omit certain timelines that ar not considered ready for use. + pub fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)> { self.timelines .lock() .unwrap() @@ -152,7 +158,9 @@ impl Repository for LayeredRepository { .collect() } - fn create_empty_timeline( + /// Create a new, empty timeline. The caller is responsible for loading data into it + /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. + pub fn create_empty_timeline( &self, timeline_id: ZTimelineId, initdb_lsn: Lsn, @@ -194,7 +202,7 @@ impl Repository for LayeredRepository { } /// Branch a timeline - fn branch_timeline( + pub fn branch_timeline( &self, src: ZTimelineId, dst: ZTimelineId, @@ -284,10 +292,16 @@ impl Repository for LayeredRepository { Ok(()) } - /// Public entry point to GC. All the logic is in the private - /// gc_iteration_internal function, this public facade just wraps it for - /// metrics collection. - fn gc_iteration( + /// perform one garbage collection iteration, removing old data files from disk. + /// this function is periodically called by gc thread. + /// also it can be explicitly requested through page server api 'do_gc' command. + /// + /// 'timelineid' specifies the timeline to GC, or None for all. + /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). + /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC + /// to make tests more deterministic. + /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? + pub fn gc_iteration( &self, target_timeline_id: Option, horizon: u64, @@ -305,7 +319,11 @@ impl Repository for LayeredRepository { }) } - fn compaction_iteration(&self) -> Result<()> { + /// Perform one compaction iteration. + /// This function is periodically called by compactor thread. + /// Also it can be explicitly requested per timeline through page server + /// api's 'compact' command. + pub fn compaction_iteration(&self) -> Result<()> { // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // compactions. We don't want to block everything else while the @@ -333,12 +351,11 @@ impl Repository for LayeredRepository { Ok(()) } - /// /// Flush all in-memory data to disk. /// - /// Used at shutdown. + /// Used at graceful shutdown. /// - fn checkpoint(&self) -> Result<()> { + pub fn checkpoint(&self) -> Result<()> { // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // checkpoints. 
We don't want to block everything else while the @@ -368,7 +385,8 @@ impl Repository for LayeredRepository { Ok(()) } - fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> { + /// Removes timeline-related in-memory data + pub fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> { // in order to be retriable detach needs to be idempotent // (or at least to a point that each time the detach is called it can make progress) let mut timelines = self.timelines.lock().unwrap(); @@ -405,7 +423,9 @@ impl Repository for LayeredRepository { Ok(()) } - fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> { + /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization. + /// See [`crate::remote_storage`] for more details about the synchronization. + pub fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> { debug!("attach timeline_id: {}", timeline_id,); match self.timelines.lock().unwrap().entry(timeline_id) { Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."), @@ -419,13 +439,14 @@ impl Repository for LayeredRepository { Ok(()) } - fn get_remote_index(&self) -> &RemoteIndex { + /// Allows to retrieve remote timeline index from the tenant. Used in walreceiver to grab remote consistent lsn. + pub fn get_remote_index(&self) -> &RemoteIndex { &self.remote_index } } /// Private functions -impl LayeredRepository { +impl Repository { pub fn get_checkpoint_distance(&self) -> u64 { let tenant_conf = self.tenant_conf.read().unwrap(); tenant_conf @@ -515,7 +536,7 @@ impl LayeredRepository { tenant_conf.update(&new_tenant_conf); - LayeredRepository::persist_tenant_config(self.conf, self.tenant_id, *tenant_conf)?; + Repository::persist_tenant_config(self.conf, self.tenant_id, *tenant_conf)?; Ok(()) } @@ -613,8 +634,8 @@ impl LayeredRepository { tenant_id: ZTenantId, remote_index: RemoteIndex, upload_layers: bool, - ) -> LayeredRepository { - LayeredRepository { + ) -> Repository { + Repository { tenant_id, file_lock: RwLock::new(()), conf, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f5f1e4d7bd..e6114c0fc5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -34,7 +34,6 @@ use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::profiling::profpoint_start; use crate::reltag::RelTag; -use crate::repository::Repository; use crate::tenant_mgr; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d10e48393c..beaac292ec 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1390,8 +1390,8 @@ fn is_slru_block_key(key: Key) -> bool { // #[cfg(test)] -pub fn create_test_timeline( - repo: R, +pub fn create_test_timeline( + repo: crate::layered_repository::Repository, timeline_id: utils::zid::ZTimelineId, ) -> Result> { let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 5cdc27a846..d0e1ed24b6 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,6 +1,4 @@ use crate::layered_repository::metadata::TimelineMetadata; -use crate::layered_repository::Timeline; -use crate::storage_sync::index::RemoteIndex; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, Result}; use 
byteorder::{ByteOrder, BE}; @@ -10,7 +8,7 @@ use std::fmt; use std::ops::{AddAssign, Range}; use std::sync::Arc; use std::time::Duration; -use utils::{lsn::Lsn, zid::ZTimelineId}; +use utils::lsn::Lsn; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] /// Key used in the Repository kv-store. @@ -178,76 +176,6 @@ impl Value { } } -/// -/// A repository corresponds to one .neon directory. One repository holds multiple -/// timelines, forked off from the same initial call to 'initdb'. -pub trait Repository: Send + Sync { - /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization. - /// See [`crate::remote_storage`] for more details about the synchronization. - fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>; - - /// Get Timeline handle for given zenith timeline ID. - /// This function is idempotent. It doesn't change internal state in any way. - fn get_timeline(&self, timelineid: ZTimelineId) -> Option>; - - /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded. - fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result>; - - /// Lists timelines the repository contains. - /// Up to repository's implementation to omit certain timelines that ar not considered ready for use. - fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)>; - - /// Create a new, empty timeline. The caller is responsible for loading data into it - /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. - fn create_empty_timeline( - &self, - timeline_id: ZTimelineId, - initdb_lsn: Lsn, - ) -> Result>; - - /// Branch a timeline - fn branch_timeline( - &self, - src: ZTimelineId, - dst: ZTimelineId, - start_lsn: Option, - ) -> Result<()>; - - /// Flush all data to disk. - /// - /// this is used at graceful shutdown. - fn checkpoint(&self) -> Result<()>; - - /// perform one garbage collection iteration, removing old data files from disk. - /// this function is periodically called by gc thread. - /// also it can be explicitly requested through page server api 'do_gc' command. - /// - /// 'timelineid' specifies the timeline to GC, or None for all. - /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). - /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC - /// to make tests more deterministic. - /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? - fn gc_iteration( - &self, - timelineid: Option, - horizon: u64, - pitr: Duration, - checkpoint_before_gc: bool, - ) -> Result; - - /// Perform one compaction iteration. - /// This function is periodically called by compactor thread. - /// Also it can be explicitly requested per timeline through page server - /// api's 'compact' command. - fn compaction_iteration(&self) -> Result<()>; - - /// removes timeline-related in-memory data - fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()>; - - /// Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn. - fn get_remote_index(&self) -> &RemoteIndex; -} - /// A timeline, that belongs to the current repository. pub enum RepositoryTimeline { /// Timeline, with its files present locally in pageserver's working directory. 
@@ -332,16 +260,17 @@ pub mod repo_harness { use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::{fs, path::PathBuf}; + use crate::storage_sync::index::RemoteIndex; use crate::{ config::PageServerConf, - layered_repository::LayeredRepository, + layered_repository::Repository, walredo::{WalRedoError, WalRedoManager}, }; use super::*; use crate::tenant_config::{TenantConf, TenantConfOpt}; use hex_literal::hex; - use utils::zid::ZTenantId; + use utils::zid::{ZTenantId, ZTimelineId}; pub const TIMELINE_ID: ZTimelineId = ZTimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -427,14 +356,14 @@ pub mod repo_harness { }) } - pub fn load(&self) -> LayeredRepository { + pub fn load(&self) -> Repository { self.try_load().expect("failed to load test repo") } - pub fn try_load(&self) -> Result { + pub fn try_load(&self) -> Result { let walredo_mgr = Arc::new(TestRedoManager); - let repo = LayeredRepository::new( + let repo = Repository::new( self.conf, TenantConfOpt::from(self.tenant_conf), walredo_mgr, diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 36c3e569a6..5afa38c926 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,8 +3,7 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; -use crate::layered_repository::{load_metadata, LayeredRepository, Timeline}; -use crate::repository::Repository; +use crate::layered_repository::{load_metadata, Repository, Timeline}; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; @@ -94,7 +93,7 @@ mod tenants_state { struct Tenant { state: TenantState, /// Contains in-memory state, including the timeline that might not yet flushed on disk or loaded form disk. - repo: Arc, + repo: Arc, /// Timelines, located locally in the pageserver's datadir. /// Timelines can entirely be removed entirely by the `detach` operation only. /// @@ -365,7 +364,7 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow: Ok(()) } -pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result> { +pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result> { let m = tenants_state::read_tenants(); let tenant = m .get(&tenant_id) @@ -484,7 +483,7 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any } fn load_local_timeline( - repo: &LayeredRepository, + repo: &Repository, timeline_id: ZTimelineId, ) -> anyhow::Result> { let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| { @@ -588,7 +587,7 @@ fn init_local_repository( } fn attach_downloaded_tenant( - repo: &LayeredRepository, + repo: &Repository, downloaded_timelines: HashSet, ) -> anyhow::Result<()> { let mut registration_queue = Vec::with_capacity(downloaded_timelines.len()); @@ -630,14 +629,14 @@ fn load_local_repo( conf: &'static PageServerConf, tenant_id: ZTenantId, remote_index: &RemoteIndex, -) -> anyhow::Result> { +) -> anyhow::Result> { let mut m = tenants_state::write_tenants(); let tenant = m.entry(tenant_id).or_insert_with(|| { // Set up a WAL redo manager, for applying WAL records. let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); // Set up an object repository, for actual data storage. 
- let repo: Arc = Arc::new(LayeredRepository::new( + let repo: Arc = Arc::new(Repository::new( conf, TenantConfOpt::default(), Arc::new(walredo_mgr), @@ -653,7 +652,7 @@ fn load_local_repo( }); // Restore tenant config - let tenant_conf = LayeredRepository::load_tenant_config(conf, tenant_id)?; + let tenant_conf = Repository::load_tenant_config(conf, tenant_id)?; tenant.repo.update_tenant_config(tenant_conf)?; Ok(Arc::clone(&tenant.repo)) diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index e51744d3cc..ca239ae254 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -5,7 +5,6 @@ use std::collections::HashMap; use std::ops::ControlFlow; use std::time::Duration; -use crate::repository::Repository; use crate::tenant_mgr::TenantState; use crate::thread_mgr::ThreadKind; use crate::{tenant_mgr, thread_mgr}; diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 6a55dd286e..4f760751db 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -22,11 +22,10 @@ use crate::import_datadir; use crate::tenant_mgr; use crate::CheckpointConfig; use crate::{ - config::PageServerConf, repository::Repository, storage_sync::index::RemoteIndex, - tenant_config::TenantConfOpt, + config::PageServerConf, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, }; use crate::{ - layered_repository::{LayeredRepository, Timeline}, + layered_repository::{Repository, Timeline}, walredo::WalRedoManager, }; @@ -42,7 +41,7 @@ pub fn create_repo( tenant_id: ZTenantId, wal_redo_manager: Arc, remote_index: RemoteIndex, -) -> Result> { +) -> Result> { let repo_dir = conf.tenant_path(&tenant_id); ensure!( !repo_dir.exists(), @@ -57,9 +56,9 @@ pub fn create_repo( info!("created directory structure in {}", repo_dir.display()); // Save tenant's config - LayeredRepository::persist_tenant_config(conf, tenant_id, tenant_conf)?; + Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; - Ok(Arc::new(LayeredRepository::new( + Ok(Arc::new(Repository::new( conf, tenant_conf, wal_redo_manager, @@ -104,11 +103,11 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { // - run initdb to init temporary instance and get bootstrap data // - after initialization complete, remove the temp dir. 
// -fn bootstrap_timeline( +fn bootstrap_timeline( conf: &'static PageServerConf, tenantid: ZTenantId, tli: ZTimelineId, - repo: &R, + repo: &Repository, ) -> Result<()> { let initdb_path = conf .tenant_path(&tenantid) diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 2fc44cb26a..912073a731 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -735,10 +735,7 @@ fn wal_stream_connection_string( #[cfg(test)] mod tests { - use crate::repository::{ - repo_harness::{RepoHarness, TIMELINE_ID}, - Repository, - }; + use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID}; use super::*; diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 283cc76e66..b5f266614e 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -20,7 +20,7 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; use crate::{ - layered_repository::WalReceiverInfo, repository::Repository, tenant_mgr, walingest::WalIngest, + layered_repository::WalReceiverInfo, tenant_mgr, walingest::WalIngest, walrecord::DecodedWALRecord, }; use postgres_ffi::v14::waldecoder::WalStreamDecoder; From c634cb1d36b503cf03a0891c8be7b9e844525b76 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 18 Aug 2022 16:27:49 +0300 Subject: [PATCH 0657/1022] Remove TimelineWriter trait, rename LayeredTimelineWriter struct into TimelineWriter --- pageserver/src/layered_repository/timeline.rs | 38 ++++++++++++------- pageserver/src/repository.rs | 28 +------------- 2 files changed, 27 insertions(+), 39 deletions(-) diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index da3a6981da..1b77f1fab4 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -51,7 +51,7 @@ use utils::{ zid::{ZTenantId, ZTimelineId}, }; -use crate::repository::{GcResult, RepositoryTimeline, TimelineWriter}; +use crate::repository::{GcResult, RepositoryTimeline}; use crate::repository::{Key, Value}; use crate::thread_mgr; use crate::virtual_file::VirtualFile; @@ -597,12 +597,11 @@ impl Timeline { /// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter /// is a generic type in this trait. But that doesn't currently work in /// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html - /// TODO kb replace with the concrete type - pub fn writer<'a>(&'a self) -> Box { - Box::new(LayeredTimelineWriter { + pub fn writer(&self) -> TimelineWriter<'_> { + TimelineWriter { tl: self, _write_guard: self.write_lock.lock().unwrap(), - }) + } } } @@ -2204,12 +2203,16 @@ fn layer_traversal_error( Err(msg_iter.fold(err, |err, msg| err.context(msg))) } -struct LayeredTimelineWriter<'a> { +/// Various functions to mutate the timeline. +// TODO Currently, Deref is used to allow easy access to read methods from this trait. +// This is probably considered a bad practice in Rust and should be fixed eventually, +// but will cause large code changes. 
+pub struct TimelineWriter<'a> { tl: &'a Timeline, _write_guard: MutexGuard<'a, ()>, } -impl Deref for LayeredTimelineWriter<'_> { +impl Deref for TimelineWriter<'_> { type Target = Timeline; fn deref(&self) -> &Self::Target { @@ -2217,23 +2220,32 @@ impl Deref for LayeredTimelineWriter<'_> { } } -impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { - fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> { +impl<'a> TimelineWriter<'a> { + /// Put a new page version that can be constructed from a WAL record + /// + /// This will implicitly extend the relation, if the page is beyond the + /// current end-of-file. + pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> { self.tl.put_value(key, lsn, value) } - fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()> { + pub fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()> { self.tl.put_tombstone(key_range, lsn) } - /// + /// Track the end of the latest digested WAL record. /// Remember the (end of) last valid WAL record remembered in the timeline. /// - fn finish_write(&self, new_lsn: Lsn) { + /// Call this after you have finished writing all the WAL up to 'lsn'. + /// + /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for + /// the 'lsn' or anything older. The previous last record LSN is stored alongside + /// the latest and can be read. + pub fn finish_write(&self, new_lsn: Lsn) { self.tl.finish_write(new_lsn); } - fn update_current_logical_size(&self, delta: isize) { + pub fn update_current_logical_size(&self, delta: isize) { self.tl .current_logical_size .fetch_add(delta, AtomicOrdering::SeqCst); diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index d0e1ed24b6..dc031c03ee 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -8,7 +8,6 @@ use std::fmt; use std::ops::{AddAssign, Range}; use std::sync::Arc; use std::time::Duration; -use utils::lsn::Lsn; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] /// Key used in the Repository kv-store. @@ -228,37 +227,13 @@ impl AddAssign for GcResult { } } -/// Various functions to mutate the timeline. -// TODO Currently, Deref is used to allow easy access to read methods from this trait. -// This is probably considered a bad practice in Rust and should be fixed eventually, -// but will cause large code changes. -pub trait TimelineWriter<'a> { - /// Put a new page version that can be constructed from a WAL record - /// - /// This will implicitly extend the relation, if the page is beyond the - /// current end-of-file. - fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()>; - - fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()>; - - /// Track the end of the latest digested WAL record. - /// - /// Call this after you have finished writing all the WAL up to 'lsn'. - /// - /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for - /// the 'lsn' or anything older. The previous last record LSN is stored alongside - /// the latest and can be read. 
- fn finish_write(&self, lsn: Lsn); - - fn update_current_logical_size(&self, delta: isize); -} - #[cfg(test)] pub mod repo_harness { use bytes::BytesMut; use once_cell::sync::Lazy; use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::{fs, path::PathBuf}; + use utils::lsn::Lsn; use crate::storage_sync::index::RemoteIndex; use crate::{ @@ -440,6 +415,7 @@ mod tests { use bytes::BytesMut; use hex_literal::hex; use once_cell::sync::Lazy; + use utils::lsn::Lsn; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001"))); From 187a7604099f5a003b6d8d2b85955ee24bb1bd1b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 18 Aug 2022 16:51:56 +0300 Subject: [PATCH 0658/1022] Reset codestyle cargo cache --- .github/workflows/build_and_test.yml | 6 +++--- .github/workflows/codestyle.yml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3a2e8bad64..4cabd3d672 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -121,8 +121,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- - name: Cache postgres build id: cache_pg @@ -325,7 +325,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + key: v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 6f13a38dea..d0685f8fd2 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -65,7 +65,7 @@ jobs: - name: Cache postgres build id: cache_pg - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: | tmp_install/ @@ -94,14 +94,14 @@ jobs: - name: Cache cargo deps id: cache_cargo - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: | ~/.cargo/registry !~/.cargo/registry/src ~/.cargo/git target - key: v2-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} + key: v3-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} - name: Run cargo clippy run: ./run_clippy.sh From aaa60c92ca18a1d4504a56a1b283a560afdb3af9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 19 Aug 2022 16:24:47 +0300 Subject: [PATCH 0659/1022] Use u64/i64 for logical size, comment on why to use signed i64. usize/isize type corresponds to the CPU architecture's pointer width, i.e. 64 bits on a 64-bit platform and 32 bits on a 32-bit platform. The logical size of a database has nothing to do with the that, so u64/i64 is more appropriate. It doesn't make any difference in practice as long as you're on a 64-bit platform, and it's hard to imagine anyone wanting to run the pageserver on a 32-bit platform, but let's be tidy. 
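To make the intent concrete, here is a minimal, self-contained sketch of the pattern (illustrative names only, not the pageserver's actual types): keep the counter as a fixed-width signed integer so deltas can transiently drive it negative, and convert to `u64` on read, clamping and logging rather than wrapping.

```rust
use std::sync::atomic::{AtomicI64, Ordering};

/// Standalone sketch of a signed size accumulator that is read as u64.
struct LogicalSize {
    bytes: AtomicI64,
}

impl LogicalSize {
    fn new() -> Self {
        Self { bytes: AtomicI64::new(0) }
    }

    /// WAL ingestion applies positive or negative deltas.
    fn update(&self, delta: i64) {
        self.bytes.fetch_add(delta, Ordering::SeqCst);
    }

    /// Readers always see a u64; a negative value indicates a bookkeeping bug
    /// (or a delta applied before the baseline was initialized) and is
    /// reported as zero instead of wrapping around.
    fn get(&self) -> u64 {
        let v = self.bytes.load(Ordering::SeqCst);
        u64::try_from(v).unwrap_or_else(|_| {
            eprintln!("negative logical size {v}, clamping to 0");
            0
        })
    }
}

fn main() {
    let size = LogicalSize::new();
    size.update(8192);
    size.update(-16384); // e.g. a decrement arriving before the baseline size
    assert_eq!(size.get(), 0);
}
```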
Also add a comment on why we use signed i64 for the logical size variable, even though size should never be negative. I'm not sure the reasons are very good, but at least this documents them, and hints at some possible better solutions. --- pageserver/src/http/models.rs | 6 ++-- pageserver/src/layered_repository/timeline.rs | 33 ++++++++++++++----- pageserver/src/pgdatadir_mapping.rs | 24 +++++++------- 3 files changed, 39 insertions(+), 24 deletions(-) diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index a4f270580f..232c202ed9 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -129,9 +129,9 @@ pub struct LocalTimelineInfo { pub latest_gc_cutoff_lsn: Lsn, #[serde_as(as = "DisplayFromStr")] pub disk_consistent_lsn: Lsn, - pub current_logical_size: Option, // is None when timeline is Unloaded - pub current_physical_size: Option, // is None when timeline is Unloaded - pub current_logical_size_non_incremental: Option, + pub current_logical_size: Option, // is None when timeline is Unloaded + pub current_physical_size: Option, // is None when timeline is Unloaded + pub current_logical_size_non_incremental: Option, pub current_physical_size_non_incremental: Option, pub timeline_state: LocalTimelineState, diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 1b77f1fab4..8f3004af98 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -15,7 +15,7 @@ use std::fs::{File, OpenOptions}; use std::io::Write; use std::ops::{Deref, Range}; use std::path::PathBuf; -use std::sync::atomic::{self, AtomicBool, AtomicIsize, Ordering as AtomicOrdering}; +use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError}; use std::time::{Duration, Instant, SystemTime}; @@ -376,7 +376,22 @@ pub struct Timeline { repartition_threshold: u64, /// Current logical size of the "datadir", at the last LSN. - current_logical_size: AtomicIsize, + /// + /// Size shouldn't ever be negative, but this is signed for two reasons: + /// + /// 1. If we initialized the "baseline" size lazily, while we already + /// process incoming WAL, the incoming WAL records could decrement the + /// variable and temporarily make it negative. (This is just future-proofing; + /// the initialization is currently not done lazily.) + /// + /// 2. If there is a bug and we e.g. forget to increment it in some cases + /// when size grows, but remember to decrement it when it shrinks again, the + /// variable could go negative. In that case, it seems better to at least + /// try to keep tracking it, rather than clamp or overflow it. Note that + /// get_current_logical_size() will clamp the returned value to zero if it's + /// negative, and log an error. Could set it permanently to zero or some + /// special value to indicate "broken" instead, but this will do for now. 
+ current_logical_size: AtomicI64, /// Information about the last processed message by the WAL receiver, /// or None if WAL receiver has not received anything for this timeline @@ -695,7 +710,7 @@ impl Timeline { latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), - current_logical_size: AtomicIsize::new(0), + current_logical_size: AtomicI64::new(0), partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, @@ -813,7 +828,7 @@ impl Timeline { // Logical size 0 means that it was not initialized, so don't believe that. if ancestor_logical_size != 0 && ancestor.get_last_record_lsn() == self.ancestor_lsn { self.current_logical_size - .store(ancestor_logical_size as isize, AtomicOrdering::SeqCst); + .store(ancestor_logical_size as i64, AtomicOrdering::SeqCst); debug!( "logical size copied from ancestor: {}", ancestor_logical_size @@ -828,7 +843,7 @@ impl Timeline { let last_lsn = self.get_last_record_lsn(); let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?; self.current_logical_size - .store(logical_size as isize, AtomicOrdering::SeqCst); + .store(logical_size as i64, AtomicOrdering::SeqCst); debug!("calculated logical size the hard way: {}", logical_size); timer.stop_and_record(); @@ -837,10 +852,10 @@ impl Timeline { /// Retrieve current logical size of the timeline /// - /// NOTE: counted incrementally, includes ancestors, - pub fn get_current_logical_size(&self) -> usize { + /// NOTE: counted incrementally, includes ancestors. + pub fn get_current_logical_size(&self) -> u64 { let current_logical_size = self.current_logical_size.load(AtomicOrdering::Acquire); - match usize::try_from(current_logical_size) { + match u64::try_from(current_logical_size) { Ok(sz) => sz, Err(_) => { error!( @@ -2245,7 +2260,7 @@ impl<'a> TimelineWriter<'a> { self.tl.finish_write(new_lsn); } - pub fn update_current_logical_size(&self, delta: isize) { + pub fn update_current_logical_size(&self, delta: i64) { self.tl .current_logical_size .fetch_add(delta, AtomicOrdering::SeqCst); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index beaac292ec..0ace850a82 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -364,22 +364,22 @@ impl Timeline { /// /// Only relation blocks are counted currently. That excludes metadata, /// SLRUs, twophase files etc. - pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { + pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn)?; let dbdir = DbDirectory::des(&buf)?; - let mut total_size: usize = 0; + let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { for rel in self.list_rels(*spcnode, *dbnode, lsn)? { let relsize_key = rel_size_to_key(rel); let mut buf = self.get(relsize_key, lsn)?; let relsize = buf.get_u32_le(); - total_size += relsize as usize; + total_size += relsize as u64; } } - Ok(total_size * BLCKSZ as usize) + Ok(total_size * BLCKSZ as u64) } /// @@ -517,7 +517,7 @@ pub struct DatadirModification<'a> { // underlying key-value store by the 'finish' function. pending_updates: HashMap, pending_deletions: Vec>, - pending_nblocks: isize, + pending_nblocks: i64, } impl<'a> DatadirModification<'a> { @@ -676,7 +676,7 @@ impl<'a> DatadirModification<'a> { } // Update logical database size. 
- self.pending_nblocks -= total_blocks as isize; + self.pending_nblocks -= total_blocks as i64; // Delete all relations and metadata files for the spcnode/dnode self.delete(dbdir_key_range(spcnode, dbnode)); @@ -719,7 +719,7 @@ impl<'a> DatadirModification<'a> { let buf = nblocks.to_le_bytes(); self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); - self.pending_nblocks += nblocks as isize; + self.pending_nblocks += nblocks as i64; // Update relation size cache self.tline.set_cached_rel_size(rel, self.lsn, nblocks); @@ -749,7 +749,7 @@ impl<'a> DatadirModification<'a> { self.tline.set_cached_rel_size(rel, self.lsn, nblocks); // Update logical database size. - self.pending_nblocks -= old_size as isize - nblocks as isize; + self.pending_nblocks -= old_size as i64 - nblocks as i64; } Ok(()) } @@ -771,7 +771,7 @@ impl<'a> DatadirModification<'a> { // Update relation size cache self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - self.pending_nblocks += nblocks as isize - old_size as isize; + self.pending_nblocks += nblocks as i64 - old_size as i64; } Ok(()) } @@ -794,7 +794,7 @@ impl<'a> DatadirModification<'a> { // update logical size let size_key = rel_size_to_key(rel); let old_size = self.get(size_key)?.get_u32_le(); - self.pending_nblocks -= old_size as isize; + self.pending_nblocks -= old_size as i64; // Remove enty from relation size cache self.tline.remove_cached_rel_size(&rel); @@ -936,7 +936,7 @@ impl<'a> DatadirModification<'a> { result?; if pending_nblocks != 0 { - writer.update_current_logical_size(pending_nblocks * BLCKSZ as isize); + writer.update_current_logical_size(pending_nblocks * BLCKSZ as i64); self.pending_nblocks = 0; } @@ -964,7 +964,7 @@ impl<'a> DatadirModification<'a> { writer.finish_write(lsn); if pending_nblocks != 0 { - writer.update_current_logical_size(pending_nblocks * BLCKSZ as isize); + writer.update_current_logical_size(pending_nblocks * BLCKSZ as i64); } Ok(()) From 8ac5a285a14d501123b99b5f1dc78d3e3852243a Mon Sep 17 00:00:00 2001 From: MMeent Date: Fri, 19 Aug 2022 20:02:36 +0200 Subject: [PATCH 0660/1022] Update vendor/postgres to one that is rebased onto REL_14_5 (#2312) This was previously based on REL_14_4 Protected tag of main before rebase is at main-before-rebase-REL_14_5 --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 3f315a1ec3..a479855158 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 3f315a1ec336b3a22a09d2015ce91697def4904e +Subproject commit a4798551587fb5a52740687a341af83b28733dc6 From daba4c7405b132eb22d753bc727353cf740c9bfa Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 19 Aug 2022 21:57:00 +0300 Subject: [PATCH 0661/1022] Add a section in glossary to explain what "logical size" means. (#2306) --- docs/glossary.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/glossary.md b/docs/glossary.md index 665596c68d..25c66828c0 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -92,6 +92,7 @@ The layer map tracks what layers exist in a timeline. ### Layered repository Neon repository implementation that keeps data in layers. + ### LSN The Log Sequence Number (LSN) is a unique identifier of the WAL record[] in the WAL log. @@ -125,6 +126,26 @@ TODO: use this name consistently in remote storage code. 
Now `disk_consistent_ls * `ancestor_lsn` - LSN of the branch point (the LSN at which this branch was created) TODO: add table that describes mapping between PostgreSQL (compute), safekeeper and pageserver LSNs. + +### Logical size + +The pageserver tracks the "logical size" of a timeline. It is the +total size of all relations in all Postgres databases on the +timeline. It includes all user and system tables, including their FSM +and VM forks. But it does not include SLRUs, twophase files or any +other such data or metadata that lives outside relations. + +The logical size is calculated by the pageserver, and is sent to +PostgreSQL via feedback messages to the safekeepers. PostgreSQL uses +the logical size to enforce the size limit in the free tier. The +logical size is also shown to users in the web console. + +The logical size is not affected by branches or the physical layout of +layer files in the pageserver. If you have a database with 1 GB +logical size and you create a branch of it, both branches will have 1 +GB logical size, even though the branch is copy-on-write and won't +consume any extra physical disk space until you make changes to it. + ### Page (block) The basic structure used to store relation data. All pages are of the same size. From 84cd40b4162fc692e359c646da9ec9d74a19c4d8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 19 Aug 2022 22:21:15 +0300 Subject: [PATCH 0662/1022] rustfmt fixes. Not sure why these don't show up as CI failures, but on my laptop, rustfmt insists. --- libs/postgres_ffi/src/nonrelfile_utils.rs | 2 +- libs/postgres_ffi/src/waldecoder.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs index 04ef346d88..1de1d367e0 100644 --- a/libs/postgres_ffi/src/nonrelfile_utils.rs +++ b/libs/postgres_ffi/src/nonrelfile_utils.rs @@ -1,8 +1,8 @@ //! //! Common utilities for dealing with PostgreSQL non-relation files. //! -use crate::transaction_id_precedes; use super::pg_constants; +use crate::transaction_id_precedes; use bytes::BytesMut; use log::*; diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index 0e1c9567cb..768e79621d 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -8,9 +8,9 @@ //! to look deeper into the WAL records to also understand which blocks they modify, the code //! for that is in pageserver/src/walrecord.rs //! +use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; use super::pg_constants; use super::xlog_utils::*; -use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use crc32c::*; use log::*; From d48177d0d809e11cd43f3f3a13799f63d98e617a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 19 Aug 2022 22:21:33 +0300 Subject: [PATCH 0663/1022] Expose timeline logical size as a prometheus metric. Physical size was already exposed, and it'd be nice to show both logical and physical size side by side in our graphana dashboards. 
--- pageserver/src/layered_repository/timeline.rs | 58 ++++++++++++++++--- .../batch_others/test_timeline_size.py | 33 +++++++++-- 2 files changed, 79 insertions(+), 12 deletions(-) diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 8f3004af98..7bbde53dbd 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -139,6 +139,15 @@ static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_current_logical_size", + "Current logical size grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, // or in testing they estimate how much we would upload if we did. static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { @@ -234,6 +243,8 @@ struct TimelineMetrics { pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, pub current_physical_size_gauge: UIntGauge, + /// copy of LayeredTimeline.current_logical_size + pub current_logical_size_gauge: IntGauge, } impl TimelineMetrics { @@ -271,6 +282,9 @@ impl TimelineMetrics { let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); + let current_logical_size_gauge = CURRENT_LOGICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); TimelineMetrics { reconstruct_time_histo, @@ -283,6 +297,7 @@ impl TimelineMetrics { last_record_gauge, wait_lsn_time_histo, current_physical_size_gauge, + current_logical_size_gauge, } } } @@ -391,6 +406,11 @@ pub struct Timeline { /// get_current_logical_size() will clamp the returned value to zero if it's /// negative, and log an error. Could set it permanently to zero or some /// special value to indicate "broken" instead, but this will do for now. + /// + /// Note that we also expose a copy of this value as a prometheus metric, + /// see `current_logical_size_gauge`. Use the `update_current_logical_size` + /// and `set_current_logical_size` functions to modify this, they will + /// also keep the prometheus metric in sync. current_logical_size: AtomicI64, /// Information about the last processed message by the WAL receiver, @@ -827,8 +847,7 @@ impl Timeline { // // Logical size 0 means that it was not initialized, so don't believe that. if ancestor_logical_size != 0 && ancestor.get_last_record_lsn() == self.ancestor_lsn { - self.current_logical_size - .store(ancestor_logical_size as i64, AtomicOrdering::SeqCst); + self.set_current_logical_size(ancestor_logical_size); debug!( "logical size copied from ancestor: {}", ancestor_logical_size @@ -842,8 +861,7 @@ impl Timeline { // Have to calculate it the hard way let last_lsn = self.get_last_record_lsn(); let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?; - self.current_logical_size - .store(logical_size as i64, AtomicOrdering::SeqCst); + self.set_current_logical_size(logical_size); debug!("calculated logical size the hard way: {}", logical_size); timer.stop_and_record(); @@ -867,6 +885,34 @@ impl Timeline { } } + /// Update current logical size, adding `delta' to the old value. + fn update_current_logical_size(&self, delta: i64) { + let new_size = self + .current_logical_size + .fetch_add(delta, AtomicOrdering::SeqCst); + + // Also set the value in the prometheus gauge. 
Note that + // there is a race condition here: if this is is called by two + // threads concurrently, the prometheus gauge might be set to + // one value while current_logical_size is set to the + // other. Currently, only initialization and the WAL receiver + // updates the logical size, and they don't run concurrently, + // so it cannot happen. And even if it did, it wouldn't be + // very serious, the metrics would just be slightly off until + // the next update. + self.metrics.current_logical_size_gauge.set(new_size); + } + + /// Set current logical size. + fn set_current_logical_size(&self, new_size: u64) { + self.current_logical_size + .store(new_size as i64, AtomicOrdering::SeqCst); + + // Also set the value in the prometheus gauge. Same race condition + // here as in `update_current_logical_size`. + self.metrics.current_logical_size_gauge.set(new_size as i64); + } + /// /// Get a handle to a Layer for reading. /// @@ -2261,9 +2307,7 @@ impl<'a> TimelineWriter<'a> { } pub fn update_current_logical_size(&self, delta: i64) { - self.tl - .current_logical_size - .fetch_add(delta, AtomicOrdering::SeqCst); + self.tl.update_current_logical_size(delta) } } diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 6e1168e38f..4a9359cf43 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -1,4 +1,5 @@ from contextlib import closing +import math import random from uuid import UUID import re @@ -278,11 +279,13 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): assert_physical_size(env, env.initial_tenant, new_timeline_id) -def test_timeline_physical_size_metric(neon_simple_env: NeonEnv): +# The timeline logical and physical sizes are also exposed as prometheus metrics. +# Test the metrics. +def test_timeline_size_metrics(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_metric') - pg = env.postgres.create_start("test_timeline_physical_size_metric") + new_timeline_id = env.neon_cli.create_branch('test_timeline_size_metrics') + pg = env.postgres.create_start("test_timeline_size_metrics") pg.safe_psql_many([ "CREATE TABLE foo (t text)", @@ -301,12 +304,32 @@ def test_timeline_physical_size_metric(neon_simple_env: NeonEnv): metrics, re.MULTILINE) assert matches - - # assert that the metric matches the actual physical size on disk tl_physical_size_metric = int(matches.group(1)) + + # assert that the physical size metric matches the actual physical size on disk timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id) assert tl_physical_size_metric == get_timeline_dir_size(timeline_path) + # Check that the logical size metric is sane, and matches + matches = re.search( + f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$', + metrics, + re.MULTILINE) + assert matches + tl_logical_size_metric = int(matches.group(1)) + + # An empty database is around 8 MB. There at least 3 databases, 'postgres', + # 'template0', 'template1'. So the total size should be about 32 MB. This isn't + # very accurate and can change with different PostgreSQL versions, so allow a + # couple of MB of slack. + assert math.isclose(tl_logical_size_metric, 32 * 1024 * 1024, abs_tol=2 * 1024 * 1024) + + # The sum of the sizes of all databases, as seen by pg_database_size(), should also + # be close. 
Again allow some slack, the logical size metric includes some things like + # the SLRUs that are not included in pg_database_size(). + dbsize_sum = pg.safe_psql("select sum(pg_database_size(oid)) from pg_database")[0][0] + assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024) + def test_tenant_physical_size(neon_simple_env: NeonEnv): random.seed(100) From 5522fbab25f1cd7cfaa36cf674e462172f24eff8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 20 Aug 2022 01:21:18 +0300 Subject: [PATCH 0664/1022] Move all unit tests related to Repository/Timeline to layered_repository.rs There was a nominal split between the tests in layered_repository.rs and repository.rs, such that tests specific to the layered implementation were supposed to be in layered_repository.rs, and tests that should work with any implementation of the traits were supposed to be in repository.rs. In practice, the line was quite muddled. With minor tweaks, many of the tests in layered_repository.rs should work with other implementations too, and vice versa. And in practice we only have one implementation, so it's more straightforward to gather all unit tests in one place. --- pageserver/src/layered_repository.rs | 540 +++++++++++++++++- pageserver/src/layered_repository/metadata.rs | 3 +- pageserver/src/repository.rs | 524 ----------------- pageserver/src/storage_sync.rs | 4 +- pageserver/src/storage_sync/delete.rs | 2 +- pageserver/src/storage_sync/download.rs | 2 +- pageserver/src/storage_sync/index.rs | 2 +- pageserver/src/storage_sync/upload.rs | 2 +- pageserver/src/walingest.rs | 2 +- .../src/walreceiver/connection_manager.rs | 3 +- 10 files changed, 529 insertions(+), 555 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index a5877c8482..42474dac0b 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -905,22 +905,525 @@ pub fn load_metadata( }) } -/// -/// Tests that are specific to the layered storage format. -/// -/// There are more unit tests in repository.rs that work through the -/// Repository interface and are expected to work regardless of the -/// file format and directory layout. The test here are more low level. 
-/// +#[cfg(test)] +pub mod repo_harness { + use bytes::{Bytes, BytesMut}; + use once_cell::sync::Lazy; + use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; + use std::{fs, path::PathBuf}; + use utils::lsn::Lsn; + + use crate::storage_sync::index::RemoteIndex; + use crate::{ + config::PageServerConf, + layered_repository::Repository, + repository::Key, + walrecord::ZenithWalRecord, + walredo::{WalRedoError, WalRedoManager}, + }; + + use super::*; + use crate::tenant_config::{TenantConf, TenantConfOpt}; + use hex_literal::hex; + use utils::zid::{ZTenantId, ZTimelineId}; + + pub const TIMELINE_ID: ZTimelineId = + ZTimelineId::from_array(hex!("11223344556677881122334455667788")); + pub const NEW_TIMELINE_ID: ZTimelineId = + ZTimelineId::from_array(hex!("AA223344556677881122334455667788")); + + /// Convenience function to create a page image with given string as the only content + #[allow(non_snake_case)] + pub fn TEST_IMG(s: &str) -> Bytes { + let mut buf = BytesMut::new(); + buf.extend_from_slice(s.as_bytes()); + buf.resize(64, 0); + + buf.freeze() + } + + static LOCK: Lazy> = Lazy::new(|| RwLock::new(())); + + impl From for TenantConfOpt { + fn from(tenant_conf: TenantConf) -> Self { + Self { + checkpoint_distance: Some(tenant_conf.checkpoint_distance), + checkpoint_timeout: Some(tenant_conf.checkpoint_timeout), + compaction_target_size: Some(tenant_conf.compaction_target_size), + compaction_period: Some(tenant_conf.compaction_period), + compaction_threshold: Some(tenant_conf.compaction_threshold), + gc_horizon: Some(tenant_conf.gc_horizon), + gc_period: Some(tenant_conf.gc_period), + image_creation_threshold: Some(tenant_conf.image_creation_threshold), + pitr_interval: Some(tenant_conf.pitr_interval), + walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), + lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), + max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), + } + } + } + + pub struct RepoHarness<'a> { + pub conf: &'static PageServerConf, + pub tenant_conf: TenantConf, + pub tenant_id: ZTenantId, + + pub lock_guard: ( + Option>, + Option>, + ), + } + + impl<'a> RepoHarness<'a> { + pub fn create(test_name: &'static str) -> Result { + Self::create_internal(test_name, false) + } + pub fn create_exclusive(test_name: &'static str) -> Result { + Self::create_internal(test_name, true) + } + fn create_internal(test_name: &'static str, exclusive: bool) -> Result { + let lock_guard = if exclusive { + (None, Some(LOCK.write().unwrap())) + } else { + (Some(LOCK.read().unwrap()), None) + }; + + let repo_dir = PageServerConf::test_repo_dir(test_name); + let _ = fs::remove_dir_all(&repo_dir); + fs::create_dir_all(&repo_dir)?; + + let conf = PageServerConf::dummy_conf(repo_dir); + // Make a static copy of the config. This can never be free'd, but that's + // OK in a test. 
+ let conf: &'static PageServerConf = Box::leak(Box::new(conf)); + + let tenant_conf = TenantConf::dummy_conf(); + + let tenant_id = ZTenantId::generate(); + fs::create_dir_all(conf.tenant_path(&tenant_id))?; + fs::create_dir_all(conf.timelines_path(&tenant_id))?; + + Ok(Self { + conf, + tenant_conf, + tenant_id, + lock_guard, + }) + } + + pub fn load(&self) -> Repository { + self.try_load().expect("failed to load test repo") + } + + pub fn try_load(&self) -> Result { + let walredo_mgr = Arc::new(TestRedoManager); + + let repo = Repository::new( + self.conf, + TenantConfOpt::from(self.tenant_conf), + walredo_mgr, + self.tenant_id, + RemoteIndex::default(), + false, + ); + // populate repo with locally available timelines + for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) + .expect("should be able to read timelines dir") + { + let timeline_dir_entry = timeline_dir_entry.unwrap(); + let timeline_id: ZTimelineId = timeline_dir_entry + .path() + .file_name() + .unwrap() + .to_string_lossy() + .parse() + .unwrap(); + + repo.attach_timeline(timeline_id)?; + } + + Ok(repo) + } + + pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { + self.conf.timeline_path(timeline_id, &self.tenant_id) + } + } + + // Mock WAL redo manager that doesn't do much + pub struct TestRedoManager; + + impl WalRedoManager for TestRedoManager { + fn request_redo( + &self, + key: Key, + lsn: Lsn, + base_img: Option, + records: Vec<(Lsn, ZenithWalRecord)>, + ) -> Result { + let s = format!( + "redo for {} to get to {}, with {} and {} records", + key, + lsn, + if base_img.is_some() { + "base image" + } else { + "no base image" + }, + records.len() + ); + println!("{}", s); + + Ok(TEST_IMG(&s)) + } + } +} + #[cfg(test)] pub mod tests { use super::metadata::METADATA_FILE_NAME; use super::*; use crate::keyspace::KeySpaceAccum; - use crate::repository::repo_harness::*; + use crate::layered_repository::repo_harness::*; use crate::repository::{Key, Value}; + use bytes::BytesMut; + use hex_literal::hex; + use once_cell::sync::Lazy; use rand::{thread_rng, Rng}; + static TEST_KEY: Lazy = + Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001"))); + + #[test] + fn test_basic() -> Result<()> { + let repo = RepoHarness::create("test_basic")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + let writer = tline.writer(); + writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; + writer.finish_write(Lsn(0x10)); + drop(writer); + + let writer = tline.writer(); + writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; + writer.finish_write(Lsn(0x20)); + drop(writer); + + assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); + + Ok(()) + } + + #[test] + fn no_duplicate_timelines() -> Result<()> { + let repo = RepoHarness::create("no_duplicate_timelines")?.load(); + let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) { + Ok(_) => panic!("duplicate timeline creation should fail"), + Err(e) => assert_eq!(e.to_string(), "Timeline already exists"), + } + + Ok(()) + } + + /// Convenience function to create a page image with given string as the only content + pub fn test_value(s: &str) -> Value { + let mut buf = BytesMut::new(); + buf.extend_from_slice(s.as_bytes()); + Value::Image(buf.freeze()) 
+ } + + /// + /// Test branch creation + /// + #[test] + fn test_branch() -> Result<()> { + let repo = RepoHarness::create("test_branch")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let writer = tline.writer(); + use std::str::from_utf8; + + #[allow(non_snake_case)] + let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); + #[allow(non_snake_case)] + let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap(); + + // Insert a value on the timeline + writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?; + writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?; + writer.finish_write(Lsn(0x20)); + + writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?; + writer.finish_write(Lsn(0x30)); + writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?; + writer.finish_write(Lsn(0x40)); + + //assert_current_logical_size(&tline, Lsn(0x40)); + + // Branch the history, modify relation differently on the new timeline + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; + let newtline = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("Should have a local timeline"); + let new_writer = newtline.writer(); + new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?; + new_writer.finish_write(Lsn(0x40)); + + // Check page contents on both branches + assert_eq!( + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?, + "foo at 0x40" + ); + assert_eq!( + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?, + "bar at 0x40" + ); + assert_eq!( + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?, + "foobar at 0x20" + ); + + //assert_current_logical_size(&tline, Lsn(0x40)); + + Ok(()) + } + + fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> Result<()> { + let mut lsn = start_lsn; + #[allow(non_snake_case)] + { + let writer = tline.writer(); + // Create a relation on the timeline + writer.put( + *TEST_KEY, + lsn, + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; + writer.finish_write(lsn); + lsn += 0x10; + writer.put( + *TEST_KEY, + lsn, + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; + writer.finish_write(lsn); + lsn += 0x10; + } + tline.checkpoint(CheckpointConfig::Forced)?; + { + let writer = tline.writer(); + writer.put( + *TEST_KEY, + lsn, + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; + writer.finish_write(lsn); + lsn += 0x10; + writer.put( + *TEST_KEY, + lsn, + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; + writer.finish_write(lsn); + } + tline.checkpoint(CheckpointConfig::Forced) + } + + #[test] + fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> { + let repo = + RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; + + // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 + // FIXME: this doesn't actually remove any layer currently, given how the checkpointing + // and compaction works. But it does set the 'cutoff' point so that the cross check + // below should fail. 
+ repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + + // try to branch at lsn 25, should fail because we already garbage collected the data + match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + Ok(_) => panic!("branching should have failed"), + Err(err) => { + assert!(err.to_string().contains("invalid branch start lsn")); + assert!(err + .source() + .unwrap() + .to_string() + .contains("we might've already garbage collected needed data")) + } + } + + Ok(()) + } + + #[test] + fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> { + let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); + + repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; + // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 + match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + Ok(_) => panic!("branching should have failed"), + Err(err) => { + assert!(&err.to_string().contains("invalid branch start lsn")); + assert!(&err + .source() + .unwrap() + .to_string() + .contains("is earlier than latest GC horizon")); + } + } + + Ok(()) + } + + /* + // FIXME: This currently fails to error out. Calling GC doesn't currently + // remove the old value, we'd need to work a little harder + #[test] + fn test_prohibit_get_for_garbage_collected_data() -> Result<()> { + let repo = + RepoHarness::create("test_prohibit_get_for_garbage_collected_data")? + .load(); + + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; + + repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); + assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); + match tline.get(*TEST_KEY, Lsn(0x25)) { + Ok(_) => panic!("request for page should have failed"), + Err(err) => assert!(err.to_string().contains("not found at")), + } + Ok(()) + } + */ + + #[test] + fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { + let repo = + RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; + + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + let newtline = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("Should have a local timeline"); + // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 + repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); + + Ok(()) + } + #[test] + fn test_parent_keeps_data_forever_after_branching() -> Result<()> { + let repo = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; + + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + let newtline = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("Should have a local timeline"); + + make_some_layers(newtline.as_ref(), Lsn(0x60))?; + + // run gc on parent + repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + + // Check that the data is still accessible on the branch. 
+ assert_eq!( + newtline.get(*TEST_KEY, Lsn(0x50))?, + TEST_IMG(&format!("foo at {}", Lsn(0x40))) + ); + + Ok(()) + } + + #[test] + fn timeline_load() -> Result<()> { + const TEST_NAME: &str = "timeline_load"; + let harness = RepoHarness::create(TEST_NAME)?; + { + let repo = harness.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; + make_some_layers(tline.as_ref(), Lsn(0x8000))?; + tline.checkpoint(CheckpointConfig::Forced)?; + } + + let repo = harness.load(); + let tline = repo + .get_timeline(TIMELINE_ID) + .expect("cannot load timeline"); + assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); + + assert!(repo.get_timeline_load(TIMELINE_ID).is_ok()); + + let tline = repo + .get_timeline(TIMELINE_ID) + .expect("cannot load timeline"); + assert!(matches!(tline, RepositoryTimeline::Loaded(_))); + + Ok(()) + } + + #[test] + fn timeline_load_with_ancestor() -> Result<()> { + const TEST_NAME: &str = "timeline_load_with_ancestor"; + let harness = RepoHarness::create(TEST_NAME)?; + // create two timelines + { + let repo = harness.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + make_some_layers(tline.as_ref(), Lsn(0x20))?; + tline.checkpoint(CheckpointConfig::Forced)?; + + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + + let newtline = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("Should have a local timeline"); + + make_some_layers(newtline.as_ref(), Lsn(0x60))?; + tline.checkpoint(CheckpointConfig::Forced)?; + } + + // check that both of them are initially unloaded + let repo = harness.load(); + { + let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline"); + assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); + + let tline = repo + .get_timeline(NEW_TIMELINE_ID) + .expect("cannot get timeline"); + assert!(matches!(tline, RepositoryTimeline::Unloaded { .. 
})); + } + // load only child timeline + let _ = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("cannot load timeline"); + + // check that both, child and ancestor are loaded + let tline = repo + .get_timeline(NEW_TIMELINE_ID) + .expect("cannot get timeline"); + assert!(matches!(tline, RepositoryTimeline::Loaded(_))); + + let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline"); + assert!(matches!(tline, RepositoryTimeline::Loaded(_))); + + Ok(()) + } + #[test] fn corrupt_metadata() -> Result<()> { const TEST_NAME: &str = "corrupt_metadata"; @@ -970,11 +1473,8 @@ pub mod tests { let repo = RepoHarness::create("test_images")?.load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - #[allow(non_snake_case)] - let TEST_KEY: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); - let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; + writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; writer.finish_write(Lsn(0x10)); drop(writer); @@ -982,7 +1482,7 @@ pub mod tests { tline.compact()?; let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; + writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; writer.finish_write(Lsn(0x20)); drop(writer); @@ -990,7 +1490,7 @@ pub mod tests { tline.compact()?; let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; + writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; writer.finish_write(Lsn(0x30)); drop(writer); @@ -998,18 +1498,18 @@ pub mod tests { tline.compact()?; let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; + writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; writer.finish_write(Lsn(0x40)); drop(writer); tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; - assert_eq!(tline.get(TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); - assert_eq!(tline.get(TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30")); - assert_eq!(tline.get(TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40")); Ok(()) } diff --git a/pageserver/src/layered_repository/metadata.rs b/pageserver/src/layered_repository/metadata.rs index 0b47f8d697..74679cb43a 100644 --- a/pageserver/src/layered_repository/metadata.rs +++ b/pageserver/src/layered_repository/metadata.rs @@ -175,9 +175,8 @@ impl TimelineMetadata { #[cfg(test)] mod tests { - use crate::repository::repo_harness::TIMELINE_ID; - use super::*; + use crate::layered_repository::repo_harness::TIMELINE_ID; #[test] fn metadata_serializes_correctly() { diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index dc031c03ee..e46a39436d 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -226,527 +226,3 @@ impl AddAssign for GcResult { self.elapsed += other.elapsed; } } - -#[cfg(test)] -pub mod repo_harness { - use bytes::BytesMut; - use 
once_cell::sync::Lazy; - use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; - use std::{fs, path::PathBuf}; - use utils::lsn::Lsn; - - use crate::storage_sync::index::RemoteIndex; - use crate::{ - config::PageServerConf, - layered_repository::Repository, - walredo::{WalRedoError, WalRedoManager}, - }; - - use super::*; - use crate::tenant_config::{TenantConf, TenantConfOpt}; - use hex_literal::hex; - use utils::zid::{ZTenantId, ZTimelineId}; - - pub const TIMELINE_ID: ZTimelineId = - ZTimelineId::from_array(hex!("11223344556677881122334455667788")); - pub const NEW_TIMELINE_ID: ZTimelineId = - ZTimelineId::from_array(hex!("AA223344556677881122334455667788")); - - /// Convenience function to create a page image with given string as the only content - #[allow(non_snake_case)] - pub fn TEST_IMG(s: &str) -> Bytes { - let mut buf = BytesMut::new(); - buf.extend_from_slice(s.as_bytes()); - buf.resize(64, 0); - - buf.freeze() - } - - static LOCK: Lazy> = Lazy::new(|| RwLock::new(())); - - impl From for TenantConfOpt { - fn from(tenant_conf: TenantConf) -> Self { - Self { - checkpoint_distance: Some(tenant_conf.checkpoint_distance), - checkpoint_timeout: Some(tenant_conf.checkpoint_timeout), - compaction_target_size: Some(tenant_conf.compaction_target_size), - compaction_period: Some(tenant_conf.compaction_period), - compaction_threshold: Some(tenant_conf.compaction_threshold), - gc_horizon: Some(tenant_conf.gc_horizon), - gc_period: Some(tenant_conf.gc_period), - image_creation_threshold: Some(tenant_conf.image_creation_threshold), - pitr_interval: Some(tenant_conf.pitr_interval), - walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), - lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), - max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), - } - } - } - - pub struct RepoHarness<'a> { - pub conf: &'static PageServerConf, - pub tenant_conf: TenantConf, - pub tenant_id: ZTenantId, - - pub lock_guard: ( - Option>, - Option>, - ), - } - - impl<'a> RepoHarness<'a> { - pub fn create(test_name: &'static str) -> Result { - Self::create_internal(test_name, false) - } - pub fn create_exclusive(test_name: &'static str) -> Result { - Self::create_internal(test_name, true) - } - fn create_internal(test_name: &'static str, exclusive: bool) -> Result { - let lock_guard = if exclusive { - (None, Some(LOCK.write().unwrap())) - } else { - (Some(LOCK.read().unwrap()), None) - }; - - let repo_dir = PageServerConf::test_repo_dir(test_name); - let _ = fs::remove_dir_all(&repo_dir); - fs::create_dir_all(&repo_dir)?; - - let conf = PageServerConf::dummy_conf(repo_dir); - // Make a static copy of the config. This can never be free'd, but that's - // OK in a test. 
- let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - - let tenant_conf = TenantConf::dummy_conf(); - - let tenant_id = ZTenantId::generate(); - fs::create_dir_all(conf.tenant_path(&tenant_id))?; - fs::create_dir_all(conf.timelines_path(&tenant_id))?; - - Ok(Self { - conf, - tenant_conf, - tenant_id, - lock_guard, - }) - } - - pub fn load(&self) -> Repository { - self.try_load().expect("failed to load test repo") - } - - pub fn try_load(&self) -> Result { - let walredo_mgr = Arc::new(TestRedoManager); - - let repo = Repository::new( - self.conf, - TenantConfOpt::from(self.tenant_conf), - walredo_mgr, - self.tenant_id, - RemoteIndex::default(), - false, - ); - // populate repo with locally available timelines - for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) - .expect("should be able to read timelines dir") - { - let timeline_dir_entry = timeline_dir_entry.unwrap(); - let timeline_id: ZTimelineId = timeline_dir_entry - .path() - .file_name() - .unwrap() - .to_string_lossy() - .parse() - .unwrap(); - - repo.attach_timeline(timeline_id)?; - } - - Ok(repo) - } - - pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { - self.conf.timeline_path(timeline_id, &self.tenant_id) - } - } - - // Mock WAL redo manager that doesn't do much - pub struct TestRedoManager; - - impl WalRedoManager for TestRedoManager { - fn request_redo( - &self, - key: Key, - lsn: Lsn, - base_img: Option, - records: Vec<(Lsn, ZenithWalRecord)>, - ) -> Result { - let s = format!( - "redo for {} to get to {}, with {} and {} records", - key, - lsn, - if base_img.is_some() { - "base image" - } else { - "no base image" - }, - records.len() - ); - println!("{}", s); - - Ok(TEST_IMG(&s)) - } - } -} - -/// -/// Tests that should work the same with any Repository/Timeline implementation. 
-/// -#[allow(clippy::bool_assert_comparison)] -#[cfg(test)] -mod tests { - use crate::layered_repository::Timeline; - use crate::CheckpointConfig; - - use super::repo_harness::*; - use super::*; - //use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT}; - //use std::sync::Arc; - use bytes::BytesMut; - use hex_literal::hex; - use once_cell::sync::Lazy; - use utils::lsn::Lsn; - - static TEST_KEY: Lazy = - Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001"))); - - #[test] - fn test_basic() -> Result<()> { - let repo = RepoHarness::create("test_basic")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - let writer = tline.writer(); - writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; - writer.finish_write(Lsn(0x10)); - drop(writer); - - let writer = tline.writer(); - writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; - writer.finish_write(Lsn(0x20)); - drop(writer); - - assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); - - Ok(()) - } - - #[test] - fn no_duplicate_timelines() -> Result<()> { - let repo = RepoHarness::create("no_duplicate_timelines")?.load(); - let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) { - Ok(_) => panic!("duplicate timeline creation should fail"), - Err(e) => assert_eq!(e.to_string(), "Timeline already exists"), - } - - Ok(()) - } - - /// Convenience function to create a page image with given string as the only content - pub fn test_value(s: &str) -> Value { - let mut buf = BytesMut::new(); - buf.extend_from_slice(s.as_bytes()); - Value::Image(buf.freeze()) - } - - /// - /// Test branch creation - /// - #[test] - fn test_branch() -> Result<()> { - let repo = RepoHarness::create("test_branch")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - use std::str::from_utf8; - - #[allow(non_snake_case)] - let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); - #[allow(non_snake_case)] - let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap(); - - // Insert a value on the timeline - writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?; - writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?; - writer.finish_write(Lsn(0x20)); - - writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?; - writer.finish_write(Lsn(0x30)); - writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?; - writer.finish_write(Lsn(0x40)); - - //assert_current_logical_size(&tline, Lsn(0x40)); - - // Branch the history, modify relation differently on the new timeline - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; - let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) - .expect("Should have a local timeline"); - let new_writer = newtline.writer(); - new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?; - new_writer.finish_write(Lsn(0x40)); - - // Check page contents on both branches - assert_eq!( - from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?, - "foo at 0x40" - ); - assert_eq!( - from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?, - "bar at 0x40" - ); - assert_eq!( - from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?, - "foobar at 0x20" - ); - - //assert_current_logical_size(&tline, 
Lsn(0x40)); - - Ok(()) - } - - fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> Result<()> { - let mut lsn = start_lsn; - #[allow(non_snake_case)] - { - let writer = tline.writer(); - // Create a relation on the timeline - writer.put( - *TEST_KEY, - lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), - )?; - writer.finish_write(lsn); - lsn += 0x10; - writer.put( - *TEST_KEY, - lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), - )?; - writer.finish_write(lsn); - lsn += 0x10; - } - tline.checkpoint(CheckpointConfig::Forced)?; - { - let writer = tline.writer(); - writer.put( - *TEST_KEY, - lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), - )?; - writer.finish_write(lsn); - lsn += 0x10; - writer.put( - *TEST_KEY, - lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), - )?; - writer.finish_write(lsn); - } - tline.checkpoint(CheckpointConfig::Forced) - } - - #[test] - fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> { - let repo = - RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(tline.as_ref(), Lsn(0x20))?; - - // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - // FIXME: this doesn't actually remove any layer currently, given how the checkpointing - // and compaction works. But it does set the 'cutoff' point so that the cross check - // below should fail. - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; - - // try to branch at lsn 25, should fail because we already garbage collected the data - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { - Ok(_) => panic!("branching should have failed"), - Err(err) => { - assert!(err.to_string().contains("invalid branch start lsn")); - assert!(err - .source() - .unwrap() - .to_string() - .contains("we might've already garbage collected needed data")) - } - } - - Ok(()) - } - - #[test] - fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> { - let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); - - repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; - // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { - Ok(_) => panic!("branching should have failed"), - Err(err) => { - assert!(&err.to_string().contains("invalid branch start lsn")); - assert!(&err - .source() - .unwrap() - .to_string() - .contains("is earlier than latest GC horizon")); - } - } - - Ok(()) - } - - /* - // FIXME: This currently fails to error out. Calling GC doesn't currently - // remove the old value, we'd need to work a little harder - #[test] - fn test_prohibit_get_for_garbage_collected_data() -> Result<()> { - let repo = - RepoHarness::create("test_prohibit_get_for_garbage_collected_data")? 
- .load(); - - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(tline.as_ref(), Lsn(0x20))?; - - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; - let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); - assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); - match tline.get(*TEST_KEY, Lsn(0x25)) { - Ok(_) => panic!("request for page should have failed"), - Err(err) => assert!(err.to_string().contains("not found at")), - } - Ok(()) - } - */ - - #[test] - fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { - let repo = - RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(tline.as_ref(), Lsn(0x20))?; - - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) - .expect("Should have a local timeline"); - // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; - assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); - - Ok(()) - } - #[test] - fn test_parent_keeps_data_forever_after_branching() -> Result<()> { - let repo = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(tline.as_ref(), Lsn(0x20))?; - - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) - .expect("Should have a local timeline"); - - make_some_layers(newtline.as_ref(), Lsn(0x60))?; - - // run gc on parent - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; - - // Check that the data is still accessible on the branch. - assert_eq!( - newtline.get(*TEST_KEY, Lsn(0x50))?, - TEST_IMG(&format!("foo at {}", Lsn(0x40))) - ); - - Ok(()) - } - - #[test] - fn timeline_load() -> Result<()> { - const TEST_NAME: &str = "timeline_load"; - let harness = RepoHarness::create(TEST_NAME)?; - { - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; - make_some_layers(tline.as_ref(), Lsn(0x8000))?; - tline.checkpoint(CheckpointConfig::Forced)?; - } - - let repo = harness.load(); - let tline = repo - .get_timeline(TIMELINE_ID) - .expect("cannot load timeline"); - assert!(matches!(tline, RepositoryTimeline::Unloaded { .. 
})); - - assert!(repo.get_timeline_load(TIMELINE_ID).is_ok()); - - let tline = repo - .get_timeline(TIMELINE_ID) - .expect("cannot load timeline"); - assert!(matches!(tline, RepositoryTimeline::Loaded(_))); - - Ok(()) - } - - #[test] - fn timeline_load_with_ancestor() -> Result<()> { - const TEST_NAME: &str = "timeline_load_with_ancestor"; - let harness = RepoHarness::create(TEST_NAME)?; - // create two timelines - { - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - make_some_layers(tline.as_ref(), Lsn(0x20))?; - tline.checkpoint(CheckpointConfig::Forced)?; - - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - - let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) - .expect("Should have a local timeline"); - - make_some_layers(newtline.as_ref(), Lsn(0x60))?; - tline.checkpoint(CheckpointConfig::Forced)?; - } - - // check that both of them are initially unloaded - let repo = harness.load(); - { - let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); - - let tline = repo - .get_timeline(NEW_TIMELINE_ID) - .expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); - } - // load only child timeline - let _ = repo - .get_timeline_load(NEW_TIMELINE_ID) - .expect("cannot load timeline"); - - // check that both, child and ancestor are loaded - let tline = repo - .get_timeline(NEW_TIMELINE_ID) - .expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Loaded(_))); - - let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Loaded(_))); - - Ok(()) - } -} diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 15f24d7e24..52d544b28c 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -1642,7 +1642,7 @@ fn register_sync_status( mod test_utils { use utils::lsn::Lsn; - use crate::repository::repo_harness::RepoHarness; + use crate::layered_repository::repo_harness::RepoHarness; use super::*; @@ -1687,7 +1687,7 @@ mod test_utils { #[cfg(test)] mod tests { use super::test_utils::dummy_metadata; - use crate::repository::repo_harness::TIMELINE_ID; + use crate::layered_repository::repo_harness::TIMELINE_ID; use hex_literal::hex; use utils::lsn::Lsn; diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index a1b26ee9a2..2e39ed073f 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -111,7 +111,7 @@ mod tests { use utils::lsn::Lsn; use crate::{ - repository::repo_harness::{RepoHarness, TIMELINE_ID}, + layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::test_utils::{create_local_timeline, dummy_metadata}, }; use remote_storage::LocalFs; diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index f714888d9a..98c45bf9af 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -445,7 +445,7 @@ mod tests { use utils::lsn::Lsn; use crate::{ - repository::repo_harness::{RepoHarness, TIMELINE_ID}, + layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::{ index::RelativePath, test_utils::{create_local_timeline, dummy_metadata}, diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 134ae893bc..3dddda09bf 100644 --- 
a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -341,7 +341,7 @@ mod tests { use std::collections::BTreeSet; use super::*; - use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID}; + use crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}; #[test] fn index_part_conversion() { diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 2c41f58721..2acc935537 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -248,7 +248,7 @@ mod tests { use utils::lsn::Lsn; use crate::{ - repository::repo_harness::{RepoHarness, TIMELINE_ID}, + layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::{ index::RelativePath, test_utils::{create_local_timeline, dummy_metadata}, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index c24ffc49de..f3789d43e3 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1029,9 +1029,9 @@ impl<'a> WalIngest<'a> { #[cfg(test)] mod tests { use super::*; + use crate::layered_repository::repo_harness::*; use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::create_test_timeline; - use crate::repository::repo_harness::*; use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 912073a731..0261203049 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -735,9 +735,8 @@ fn wal_stream_connection_string( #[cfg(test)] mod tests { - use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID}; - use super::*; + use crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}; #[test] fn no_connection_no_candidate() -> anyhow::Result<()> { From 631cbf5b1ba013ae48cbb463b198837af489efe8 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 19 Aug 2022 14:42:35 +0300 Subject: [PATCH 0665/1022] Use single map to manage timeline data --- pageserver/src/tenant_mgr.rs | 91 ++++++++++-------------------------- 1 file changed, 25 insertions(+), 66 deletions(-) diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 5afa38c926..4025d6706e 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -4,6 +4,7 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; use crate::layered_repository::{load_metadata, Repository, Timeline}; +use crate::repository::RepositoryTimeline; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; @@ -94,12 +95,6 @@ struct Tenant { state: TenantState, /// Contains in-memory state, including the timeline that might not yet flushed on disk or loaded form disk. repo: Arc, - /// Timelines, located locally in the pageserver's datadir. - /// Timelines can entirely be removed entirely by the `detach` operation only. - /// - /// Local timelines have more metadata that's loaded into memory, - /// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`]. 
- local_timelines: HashMap>, } #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -288,7 +283,6 @@ pub fn create_tenant_repository( v.insert(Tenant { state: TenantState::Idle, repo, - local_timelines: HashMap::new(), }); Ok(Some(tenant_id)) } @@ -379,20 +373,11 @@ pub fn get_local_timeline_with_load( tenant_id: ZTenantId, timeline_id: ZTimelineId, ) -> anyhow::Result> { - let mut m = tenants_state::write_tenants(); - let tenant = m - .get_mut(&tenant_id) - .with_context(|| format!("Tenant {tenant_id} not found"))?; - - if let Some(page_tline) = tenant.local_timelines.get(&timeline_id) { - Ok(Arc::clone(page_tline)) - } else { - let page_tline = load_local_timeline(&tenant.repo, timeline_id) - .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?; - tenant - .local_timelines - .insert(timeline_id, Arc::clone(&page_tline)); - Ok(page_tline) + let repository = get_repository_for_tenant(tenant_id)?; + match repository.get_timeline(timeline_id) { + Some(RepositoryTimeline::Loaded(loaded_timeline)) => Ok(loaded_timeline), + _ => load_local_timeline(&repository, timeline_id) + .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}")), } } @@ -419,10 +404,7 @@ pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow thread_mgr::shutdown_threads(None, None, Some(timeline_id)); debug!("thread shutdown completed"); match tenants_state::write_tenants().get_mut(&tenant_id) { - Some(tenant) => { - tenant.repo.delete_timeline(timeline_id)?; - tenant.local_timelines.remove(&timeline_id); - } + Some(tenant) => tenant.repo.delete_timeline(timeline_id)?, None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"), } @@ -434,37 +416,31 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any // shutdown the tenant and timeline threads: gc, compaction, page service threads) thread_mgr::shutdown_threads(None, Some(tenant_id), None); - // FIXME should we protect somehow from starting new threads/walreceivers when tenant is in stopping state? - // send stop signal to wal receiver and collect join handles while holding the lock - let walreceiver_join_handles = { - let tenants = tenants_state::write_tenants(); - let tenant = tenants.get(&tenant_id).context("tenant not found")?; - let mut walreceiver_join_handles = Vec::with_capacity(tenant.local_timelines.len()); - for timeline_id in tenant.local_timelines.keys() { + let mut walreceiver_join_handles = Vec::new(); + let removed_tenant = { + let mut tenants_accessor = tenants_state::write_tenants(); + tenants_accessor.remove(&tenant_id) + }; + if let Some(tenant) = removed_tenant { + for (timeline_id, _) in tenant.repo.list_timelines() { let (sender, receiver) = std::sync::mpsc::channel::<()>(); tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { - id: ZTenantTimelineId::new(tenant_id, *timeline_id), + id: ZTenantTimelineId::new(tenant_id, timeline_id), join_confirmation_sender: sender, }); - walreceiver_join_handles.push((*timeline_id, receiver)); + walreceiver_join_handles.push((timeline_id, receiver)); } - // drop the tenants lock - walreceiver_join_handles - }; + } // wait for wal receivers to stop without holding the lock, because walreceiver // will attempt to change tenant state which is protected by the same global tenants lock. - // TODO do we need a timeout here? how to handle it? 
// recv_timeout is broken: https://github.com/rust-lang/rust/issues/94518#issuecomment-1057440631 - // need to use crossbeam-channel for (timeline_id, join_handle) in walreceiver_join_handles { info!("waiting for wal receiver to shutdown timeline_id {timeline_id}"); join_handle.recv().context("failed to join walreceiver")?; info!("wal receiver shutdown confirmed timeline_id {timeline_id}"); } - tenants_state::write_tenants().remove(&tenant_id); - // If removal fails there will be no way to successfully retry detach, // because tenant no longer exists in in memory map. And it needs to be removed from it // before we remove files because it contains references to repository @@ -590,34 +566,18 @@ fn attach_downloaded_tenant( repo: &Repository, downloaded_timelines: HashSet, ) -> anyhow::Result<()> { - let mut registration_queue = Vec::with_capacity(downloaded_timelines.len()); - - // first need to register the in-mem representations, to avoid missing ancestors during the local disk data registration for timeline_id in downloaded_timelines { + // first, register timeline metadata repo.attach_timeline(timeline_id).with_context(|| { format!("Failed to load timeline {timeline_id} into in-memory repository") })?; - registration_queue.push(timeline_id); - } - - for timeline_id in registration_queue { - let tenant_id = repo.tenant_id(); - match tenants_state::write_tenants().get_mut(&tenant_id) { - Some(tenant) => match tenant.local_timelines.entry(timeline_id) { - Entry::Occupied(_) => { - anyhow::bail!("Local timeline {timeline_id} already registered") - } - Entry::Vacant(v) => { - v.insert(load_local_timeline(repo, timeline_id).with_context(|| { - format!("Failed to register add local timeline for tenant {tenant_id}") - })?); - } - }, - None => anyhow::bail!( - "Tenant {} not found in local tenant state", - repo.tenant_id() - ), - } + // and then load its layers in memory + let _ = load_local_timeline(repo, timeline_id).with_context(|| { + format!( + "Failed to register add local timeline for tenant {}", + repo.tenant_id(), + ) + })?; } Ok(()) @@ -647,7 +607,6 @@ fn load_local_repo( Tenant { state: TenantState::Idle, repo, - local_timelines: HashMap::new(), } }); From 32be8739b9d3d4d9abb87dbf99a1836d195e283c Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 19 Aug 2022 16:02:26 +0300 Subject: [PATCH 0666/1022] Move walreceiver timeline registration into layered_repository --- pageserver/src/layered_repository.rs | 43 +++++++++++++++------------- pageserver/src/tenant_mgr.rs | 9 ++---- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 42474dac0b..d67b1b0130 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -13,6 +13,7 @@ use anyhow::{bail, ensure, Context, Result}; use tracing::*; +use utils::zid::ZTenantTimelineId; use std::cmp::min; use std::collections::hash_map::Entry; @@ -32,6 +33,7 @@ use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; use crate::repository::{GcResult, RepositoryTimeline}; +use crate::tenant_mgr::LocalTimelineUpdate; use crate::thread_mgr; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; @@ -125,8 +127,11 @@ impl Repository { /// Get Timeline handle for given zenith timeline ID. /// This function is idempotent. It doesn't change internal state in any way. 
pub fn get_timeline(&self, timelineid: ZTimelineId) -> Option> { - let timelines = self.timelines.lock().unwrap(); - self.get_timeline_internal(timelineid, &timelines) + self.timelines + .lock() + .unwrap() + .get(&timelineid) + .cloned() .map(RepositoryTimeline::from) } @@ -198,6 +203,11 @@ impl Repository { let timeline = Arc::new(timeline); vacant_timeline_entry.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline))); + crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { + id: ZTenantTimelineId::new(self.tenant_id(), timeline_id), + datadir: Arc::clone(&timeline), + }); + Ok(timeline) } @@ -540,45 +550,34 @@ impl Repository { Ok(()) } - // Implementation of the public `get_timeline` function. - // Differences from the public: - // * interface in that the caller must already hold the mutex on the 'timelines' hashmap. - fn get_timeline_internal( - &self, - timelineid: ZTimelineId, - timelines: &HashMap, - ) -> Option { - timelines.get(&timelineid).cloned() - } - // Implementation of the public `get_timeline_load` function. // Differences from the public: // * interface in that the caller must already hold the mutex on the 'timelines' hashmap. fn get_timeline_load_internal( &self, - timelineid: ZTimelineId, + timeline_id: ZTimelineId, timelines: &mut HashMap, ) -> anyhow::Result>> { - match timelines.get(&timelineid) { + match timelines.get(&timeline_id) { Some(entry) => match entry { LayeredTimelineEntry::Loaded(local_timeline) => { - debug!("timeline {} found loaded into memory", &timelineid); + debug!("timeline {timeline_id} found loaded into memory"); return Ok(Some(Arc::clone(local_timeline))); } LayeredTimelineEntry::Unloaded { .. } => {} }, None => { - debug!("timeline {} not found", &timelineid); + debug!("timeline {timeline_id} not found"); return Ok(None); } }; debug!( "timeline {} found on a local disk, but not loaded into the memory, loading", - &timelineid + &timeline_id ); - let timeline = self.load_local_timeline(timelineid, timelines)?; + let timeline = self.load_local_timeline(timeline_id, timelines)?; let was_loaded = timelines.insert( - timelineid, + timeline_id, LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), ); ensure!( @@ -586,6 +585,10 @@ impl Repository { || matches!(was_loaded, Some(LayeredTimelineEntry::Unloaded { .. 
})), "assertion failure, inserted wrong timeline in an incorrect state" ); + crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { + id: ZTenantTimelineId::new(self.tenant_id(), timeline_id), + datadir: Arc::clone(&timeline), + }); Ok(Some(timeline)) } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 4025d6706e..f5b4308067 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -21,6 +21,7 @@ use tokio::sync::mpsc; use tracing::*; use utils::lsn::Lsn; +pub use tenants_state::try_send_timeline_update; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; mod tenants_state { @@ -68,7 +69,7 @@ mod tenants_state { Ok(()) } - pub(super) fn try_send_timeline_update(update: LocalTimelineUpdate) { + pub fn try_send_timeline_update(update: LocalTimelineUpdate) { match TIMELINE_UPDATE_SENDER .read() .expect("Failed to read() timeline_update_sender lock, it got poisoned") @@ -466,12 +467,6 @@ fn load_local_timeline( format!("Inmem timeline {timeline_id} not found in tenant's repository") })?; inmem_timeline.init_logical_size()?; - - tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach { - id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id), - datadir: Arc::clone(&inmem_timeline), - }); - Ok(inmem_timeline) } From 777930898580110e8800c3e94b41aceea27e6063 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 19 Aug 2022 17:59:06 +0300 Subject: [PATCH 0667/1022] Ensure timeline logical size is initialized once --- pageserver/src/layered_repository.rs | 18 +++++++------- pageserver/src/layered_repository/timeline.rs | 12 ++++++++++ pageserver/src/tenant_mgr.rs | 24 +++++++++++++------ pageserver/src/walreceiver.rs | 4 ++-- 4 files changed, 41 insertions(+), 17 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index d67b1b0130..dd173498b9 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -205,7 +205,7 @@ impl Repository { crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { id: ZTenantTimelineId::new(self.tenant_id(), timeline_id), - datadir: Arc::clone(&timeline), + timeline: Arc::clone(&timeline), }); Ok(timeline) @@ -572,8 +572,7 @@ impl Repository { } }; debug!( - "timeline {} found on a local disk, but not loaded into the memory, loading", - &timeline_id + "timeline {timeline_id} found on a local disk, but not loaded into the memory, loading" ); let timeline = self.load_local_timeline(timeline_id, timelines)?; let was_loaded = timelines.insert( @@ -585,10 +584,6 @@ impl Repository { || matches!(was_loaded, Some(LayeredTimelineEntry::Unloaded { .. 
})), "assertion failure, inserted wrong timeline in an incorrect state" ); - crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { - id: ZTenantTimelineId::new(self.tenant_id(), timeline_id), - datadir: Arc::clone(&timeline), - }); Ok(Some(timeline)) } @@ -627,7 +622,14 @@ impl Repository { .load_layer_map(disk_consistent_lsn) .context("failed to load layermap")?; - Ok(Arc::new(timeline)) + let timeline = Arc::new(timeline); + + crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { + id: ZTenantTimelineId::new(self.tenant_id(), timeline_id), + timeline: Arc::clone(&timeline), + }); + + Ok(timeline) } pub fn new( diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 7bbde53dbd..fb5a4d0b83 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -412,6 +412,11 @@ pub struct Timeline { /// and `set_current_logical_size` functions to modify this, they will /// also keep the prometheus metric in sync. current_logical_size: AtomicI64, + // TODO we don't have a good, API to ensure on a compilation level + // that the timeline passes all initialization. + // Hence we ensure that we init at least once for every timeline + // and keep this flag to avoid potentually long recomputes. + logical_size_initialized: AtomicBool, /// Information about the last processed message by the WAL receiver, /// or None if WAL receiver has not received anything for this timeline @@ -731,6 +736,7 @@ impl Timeline { initdb_lsn: metadata.initdb_lsn(), current_logical_size: AtomicI64::new(0), + logical_size_initialized: AtomicBool::new(false), partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, @@ -835,6 +841,10 @@ impl Timeline { /// /// This can be a slow operation. pub fn init_logical_size(&self) -> Result<()> { + if self.logical_size_initialized.load(AtomicOrdering::Acquire) { + return Ok(()); + } + // Try a fast-path first: // Copy logical size from ancestor timeline if there has been no changes on this // branch, and no changes on the ancestor branch since the branch point. @@ -907,6 +917,8 @@ impl Timeline { fn set_current_logical_size(&self, new_size: u64) { self.current_logical_size .store(new_size as i64, AtomicOrdering::SeqCst); + self.logical_size_initialized + .store(true, AtomicOrdering::SeqCst); // Also set the value in the prometheus gauge. Same race condition // here as in `update_current_logical_size`. diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index f5b4308067..921d973a41 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -172,15 +172,15 @@ pub enum LocalTimelineUpdate { }, Attach { id: ZTenantTimelineId, - datadir: Arc, + timeline: Arc, }, } impl std::fmt::Debug for LocalTimelineUpdate { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::Detach { id, .. } => f.debug_tuple("Remove").field(id).finish(), - Self::Attach { id, .. } => f.debug_tuple("Add").field(id).finish(), + Self::Detach { id, .. } => f.debug_tuple("Detach").field(id).finish(), + Self::Attach { id, .. 
} => f.debug_tuple("Attach").field(id).finish(), } } } @@ -376,7 +376,10 @@ pub fn get_local_timeline_with_load( ) -> anyhow::Result> { let repository = get_repository_for_tenant(tenant_id)?; match repository.get_timeline(timeline_id) { - Some(RepositoryTimeline::Loaded(loaded_timeline)) => Ok(loaded_timeline), + Some(RepositoryTimeline::Loaded(loaded_timeline)) => { + loaded_timeline.init_logical_size()?; + Ok(loaded_timeline) + } _ => load_local_timeline(&repository, timeline_id) .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}")), } @@ -435,13 +438,17 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any // wait for wal receivers to stop without holding the lock, because walreceiver // will attempt to change tenant state which is protected by the same global tenants lock. + // TODO do we need a timeout here? how to handle it? // recv_timeout is broken: https://github.com/rust-lang/rust/issues/94518#issuecomment-1057440631 + // need to use crossbeam-channel for (timeline_id, join_handle) in walreceiver_join_handles { info!("waiting for wal receiver to shutdown timeline_id {timeline_id}"); join_handle.recv().context("failed to join walreceiver")?; info!("wal receiver shutdown confirmed timeline_id {timeline_id}"); } + tenants_state::write_tenants().remove(&tenant_id); + // If removal fails there will be no way to successfully retry detach, // because tenant no longer exists in in memory map. And it needs to be removed from it // before we remove files because it contains references to repository @@ -561,12 +568,15 @@ fn attach_downloaded_tenant( repo: &Repository, downloaded_timelines: HashSet, ) -> anyhow::Result<()> { - for timeline_id in downloaded_timelines { - // first, register timeline metadata + // first, register timeline metadata to ensure ancestors will be found later during layer load + for &timeline_id in &downloaded_timelines { repo.attach_timeline(timeline_id).with_context(|| { format!("Failed to load timeline {timeline_id} into in-memory repository") })?; - // and then load its layers in memory + } + + // and then load its layers in memory + for timeline_id in downloaded_timelines { let _ = load_local_timeline(repo, timeline_id).with_context(|| { format!( "Failed to register add local timeline for tenant {}", diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 8a466a8a67..d6420e1d18 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -269,7 +269,7 @@ async fn wal_receiver_main_thread_loop_step<'a>( } } // Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly. 
- LocalTimelineUpdate::Attach { id, datadir } => { + LocalTimelineUpdate::Attach { id, timeline } => { let timeline_connection_managers = local_timeline_wal_receivers .entry(id.tenant_id) .or_default(); @@ -305,7 +305,7 @@ async fn wal_receiver_main_thread_loop_step<'a>( id, broker_prefix.to_owned(), etcd_client.clone(), - datadir, + timeline, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, From 277f2d6d3d55f6e6391997a9c163193eb1d1a964 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 22 Aug 2022 11:21:50 +0100 Subject: [PATCH 0668/1022] Report test results to Allure (#2229) --- .github/actions/allure-report/action.yml | 219 ++++++ .../actions/run-python-test-set/action.yml | 10 + .github/workflows/build_and_test.yml | 23 + poetry.lock | 718 +++++++++--------- pyproject.toml | 1 + test_runner/fixtures/neon_fixtures.py | 33 +- 6 files changed, 661 insertions(+), 343 deletions(-) create mode 100644 .github/actions/allure-report/action.yml diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml new file mode 100644 index 0000000000..2e52bd7695 --- /dev/null +++ b/.github/actions/allure-report/action.yml @@ -0,0 +1,219 @@ +name: 'Create Allure report' +description: 'Create and publish Allure report' + +inputs: + action: + desctiption: 'generate or store' + required: true + build_type: + description: '`build_type` from run-python-test-set action' + required: true + test_selection: + description: '`test_selector` from run-python-test-set action' + required: false + +runs: + using: "composite" + steps: + - name: Validate input parameters + shell: bash -euxo pipefail {0} + run: | + if [ "${{ inputs.action }}" != "store"] && [ "${{ inputs.action }}" != "generate" ]; then + echo 2>&1 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only" + exit 1 + fi + + if [ -z "${{ inputs.test_selection }}" ] && [ "${{ inputs.action }}" == "store" ]; then + echo 2>&1 "inputs.test_selection must be set for 'store' action" + exit 2 + fi + + - name: Calculate key + id: calculate-key + shell: bash -euxo pipefail {0} + run: | + # TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key + + pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) + if [ "${pr_number}" != "null" ]; then + key=pr-${pr_number} + elif [ "${GITHUB_REF}" = "refs/heads/main" ]; then + # Shortcut for a special branch + key=main + else + key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -cd "[:alnum:]._-") + fi + echo "::set-output name=KEY::${key}" + + - uses: actions/setup-java@v3 + if: ${{ inputs.action == 'generate' }} + with: + distribution: 'temurin' + java-version: '17' + + - name: Install Allure + if: ${{ inputs.action == 'generate' }} + shell: bash -euxo pipefail {0} + run: | + if ! 
which allure; then + ALLURE_ZIP=allure-${ALLURE_VERSION}.zip + wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP} + echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c + unzip -q ${ALLURE_ZIP} + echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH + rm -f ${ALLURE_ZIP} + fi + env: + ALLURE_VERSION: 2.19.0 + ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464 + + - name: Upload Allure results + if: ${{ inputs.action == 'store' }} + env: + REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }} + RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }} + TEST_OUTPUT: /tmp/test_output + BUCKET: neon-github-public-dev + shell: bash -euxo pipefail {0} + run: | + # Add metadata + cat < $TEST_OUTPUT/allure/results/executor.json + { + "name": "GitHub Actions", + "type": "github", + "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html", + "buildOrder": ${GITHUB_RUN_ID}, + "buildName": "GitHub Actions Run #${{ github.run_number }}/${GITHUB_RUN_ATTEMPT}", + "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}", + "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html", + "reportName": "Allure Report" + } + EOF + cat < $TEST_OUTPUT/allure/results/environment.properties + TEST_SELECTION=${{ inputs.test_selection }} + BUILD_TYPE=${{ inputs.build_type }} + EOF + + ARCHIVE="${GITHUB_RUN_ID}-${{ inputs.test_selection }}-${GITHUB_RUN_ATTEMPT}.tar.zst" + ZSTD_NBTHREADS=0 + + tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd . + aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}" + + # Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this + - name: Acquire Allure lock + if: ${{ inputs.action == 'generate' }} + shell: bash -euxo pipefail {0} + env: + LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt + BUCKET: neon-github-public-dev + run: | + LOCK_TIMEOUT=300 # seconds + + for _ in $(seq 1 5); do + for i in $(seq 1 ${LOCK_TIMEOUT}); do + LOCK_ADDED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true) + # `date --date="..."` is supported only by gnu date (i.e. 
it doesn't work on BSD/macOS) + if [ -z "${LOCK_ADDED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ADDED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then + break + fi + sleep 1 + done + echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" > lock.txt + aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}" + + # A double-check that exactly WE have acquired the lock + aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt + if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then + break + fi + done + + - name: Generate and publish final Allure report + if: ${{ inputs.action == 'generate' }} + id: generate-report + env: + REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }} + RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }} + TEST_OUTPUT: /tmp/test_output + BUCKET: neon-github-public-dev + shell: bash -euxo pipefail {0} + run: | + # Get previously uploaded data for this run + ZSTD_NBTHREADS=0 + + s3_filepaths=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/${GITHUB_RUN_ID}- | jq --raw-output '.Contents[].Key') + if [ -z "$s3_filepaths" ]; then + # There's no previously uploaded data for this run + exit 0 + fi + for s3_filepath in ${s3_filepaths}; do + aws s3 cp --only-show-errors "s3://${BUCKET}/${s3_filepath}" "${TEST_OUTPUT}/allure/" + + archive=${TEST_OUTPUT}/allure/$(basename $s3_filepath) + mkdir -p ${archive%.tar.zst} + tar -xf ${archive} -C ${archive%.tar.zst} + rm -f ${archive} + done + + # Get history trend + aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${TEST_OUTPUT}/allure/latest/history" || true + + # Generate report + allure generate --clean --output $TEST_OUTPUT/allure/report $TEST_OUTPUT/allure/* + + # Replace a logo link with a redirect to the latest version of the report + sed -i 's| ./index.html + + + + Redirecting to ${REPORT_URL} + + EOF + aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" + + echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY} + echo "::set-output name=REPORT_URL::${REPORT_URL}" + + - name: Release Allure lock + if: ${{ inputs.action == 'generate' && always() }} + shell: bash -euxo pipefail {0} + env: + LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt + BUCKET: neon-github-public-dev + run: | + aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0 + + if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then + aws s3 rm "s3://${BUCKET}/${LOCK_FILE}" + fi + + - uses: actions/github-script@v6 + if: ${{ inputs.action == 'generate' && always() }} + env: + REPORT_URL: ${{ steps.generate-report.outputs.REPORT_URL }} + BUILD_TYPE: ${{ inputs.build_type }} + SHA: ${{ github.event.pull_request.head.sha || github.sha }} + with: + script: | + const { REPORT_URL, BUILD_TYPE, SHA } = process.env + + result = await github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: `${SHA}`, + state: 'success', + target_url: `${REPORT_URL}`, + context: `Allure report / ${BUILD_TYPE}`, + }) + + console.log(result); diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 3900f93ee4..22447025cb 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ 
-131,8 +131,10 @@ runs: # -n4 uses four processes to run tests via pytest-xdist # -s is not used to prevent pytest from capturing output, because tests are running # in parallel and logs are mixed between different tests + mkdir -p $TEST_OUTPUT/allure/results "${cov_prefix[@]}" ./scripts/pytest \ --junitxml=$TEST_OUTPUT/junit.xml \ + --alluredir=$TEST_OUTPUT/allure/results \ --tb=short \ --verbose \ -m "not remote_cluster" \ @@ -146,6 +148,14 @@ runs: fi fi + - name: Upload Allure results + if: ${{ always() && (inputs.test_selection == 'batch_others' || inputs.test_selection == 'batch_pg_regress') }} + uses: ./.github/actions/allure-report + with: + action: store + build_type: ${{ inputs.build_type }} + test_selection: ${{ inputs.test_selection }} + - name: Delete all data but logs shell: bash -euxo pipefail {0} if: always() diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4cabd3d672..dab34c84bc 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -298,6 +298,29 @@ jobs: # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones + merge-allure-report: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ other-tests, pg_regress-tests ] + if: always() + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + + - name: Merge and Allure results + uses: ./.github/actions/allure-report + with: + action: generate + build_type: ${{ matrix.build_type }} + coverage-report: runs-on: dev container: diff --git a/poetry.lock b/poetry.lock index 6ab6bb0e20..17b59852f4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -13,6 +13,32 @@ psycopg2-binary = ">=2.8.4" [package.extras] sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"] +[[package]] +name = "allure-pytest" +version = "2.9.45" +description = "Allure pytest integration" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +allure-python-commons = "2.9.45" +pytest = ">=4.5.0" +six = ">=1.9.0" + +[[package]] +name = "allure-python-commons" +version = "2.9.45" +description = "Common module for integrate allure with python-based frameworks" +category = "main" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +attrs = ">=16.0.0" +pluggy = ">=0.4.0" +six = ">=1.9.0" + [[package]] name = "async-timeout" version = "4.0.2" @@ -109,8 +135,8 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "boto3-stubs" -version = "1.24.46" -description = "Type annotations for boto3 1.24.46 generated with mypy-boto3-builder 7.11.3" +version = "1.24.51" +description = "Type annotations for boto3 1.24.51 generated with mypy-boto3-builder 7.11.6" category = "main" optional = false python-versions = ">=3.7" @@ -122,319 +148,321 @@ types-s3transfer = "*" typing-extensions = ">=4.1.0" [package.extras] -accessanalyzer = ["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)"] -account = ["mypy-boto3-account (>=1.24.0,<1.25.0)"] -acm = ["mypy-boto3-acm (>=1.24.0,<1.25.0)"] -acm-pca = ["mypy-boto3-acm-pca (>=1.24.0,<1.25.0)"] -alexaforbusiness = ["mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)"] -all = ["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)", "mypy-boto3-account (>=1.24.0,<1.25.0)", "mypy-boto3-acm (>=1.24.0,<1.25.0)", "mypy-boto3-acm-pca (>=1.24.0,<1.25.0)", 
"mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)", "mypy-boto3-amp (>=1.24.0,<1.25.0)", "mypy-boto3-amplify (>=1.24.0,<1.25.0)", "mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)", "mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)", "mypy-boto3-apigateway (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)", "mypy-boto3-appconfig (>=1.24.0,<1.25.0)", "mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)", "mypy-boto3-appflow (>=1.24.0,<1.25.0)", "mypy-boto3-appintegrations (>=1.24.0,<1.25.0)", "mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-application-insights (>=1.24.0,<1.25.0)", "mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-appmesh (>=1.24.0,<1.25.0)", "mypy-boto3-apprunner (>=1.24.0,<1.25.0)", "mypy-boto3-appstream (>=1.24.0,<1.25.0)", "mypy-boto3-appsync (>=1.24.0,<1.25.0)", "mypy-boto3-athena (>=1.24.0,<1.25.0)", "mypy-boto3-auditmanager (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)", "mypy-boto3-backup (>=1.24.0,<1.25.0)", "mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)", "mypy-boto3-batch (>=1.24.0,<1.25.0)", "mypy-boto3-billingconductor (>=1.24.0,<1.25.0)", "mypy-boto3-braket (>=1.24.0,<1.25.0)", "mypy-boto3-budgets (>=1.24.0,<1.25.0)", "mypy-boto3-ce (>=1.24.0,<1.25.0)", "mypy-boto3-chime (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)", "mypy-boto3-cloud9 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)", "mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)", "mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-cloudfront (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)", "mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)", "mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)", "mypy-boto3-codeartifact (>=1.24.0,<1.25.0)", "mypy-boto3-codebuild (>=1.24.0,<1.25.0)", "mypy-boto3-codecommit (>=1.24.0,<1.25.0)", "mypy-boto3-codedeploy (>=1.24.0,<1.25.0)", "mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)", "mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-codepipeline (>=1.24.0,<1.25.0)", "mypy-boto3-codestar (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)", "mypy-boto3-comprehend (>=1.24.0,<1.25.0)", "mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)", "mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)", "mypy-boto3-config (>=1.24.0,<1.25.0)", "mypy-boto3-connect (>=1.24.0,<1.25.0)", "mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)", "mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)", "mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)", "mypy-boto3-cur (>=1.24.0,<1.25.0)", "mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)", "mypy-boto3-databrew (>=1.24.0,<1.25.0)", "mypy-boto3-dataexchange (>=1.24.0,<1.25.0)", "mypy-boto3-datapipeline (>=1.24.0,<1.25.0)", "mypy-boto3-datasync (>=1.24.0,<1.25.0)", "mypy-boto3-dax (>=1.24.0,<1.25.0)", "mypy-boto3-detective (>=1.24.0,<1.25.0)", "mypy-boto3-devicefarm (>=1.24.0,<1.25.0)", "mypy-boto3-devops-guru (>=1.24.0,<1.25.0)", 
"mypy-boto3-directconnect (>=1.24.0,<1.25.0)", "mypy-boto3-discovery (>=1.24.0,<1.25.0)", "mypy-boto3-dlm (>=1.24.0,<1.25.0)", "mypy-boto3-dms (>=1.24.0,<1.25.0)", "mypy-boto3-docdb (>=1.24.0,<1.25.0)", "mypy-boto3-drs (>=1.24.0,<1.25.0)", "mypy-boto3-ds (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)", "mypy-boto3-ebs (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)", "mypy-boto3-ecr (>=1.24.0,<1.25.0)", "mypy-boto3-ecr-public (>=1.24.0,<1.25.0)", "mypy-boto3-ecs (>=1.24.0,<1.25.0)", "mypy-boto3-efs (>=1.24.0,<1.25.0)", "mypy-boto3-eks (>=1.24.0,<1.25.0)", "mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)", "mypy-boto3-elasticache (>=1.24.0,<1.25.0)", "mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)", "mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)", "mypy-boto3-elb (>=1.24.0,<1.25.0)", "mypy-boto3-elbv2 (>=1.24.0,<1.25.0)", "mypy-boto3-emr (>=1.24.0,<1.25.0)", "mypy-boto3-emr-containers (>=1.24.0,<1.25.0)", "mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-es (>=1.24.0,<1.25.0)", "mypy-boto3-events (>=1.24.0,<1.25.0)", "mypy-boto3-evidently (>=1.24.0,<1.25.0)", "mypy-boto3-finspace (>=1.24.0,<1.25.0)", "mypy-boto3-finspace-data (>=1.24.0,<1.25.0)", "mypy-boto3-firehose (>=1.24.0,<1.25.0)", "mypy-boto3-fis (>=1.24.0,<1.25.0)", "mypy-boto3-fms (>=1.24.0,<1.25.0)", "mypy-boto3-forecast (>=1.24.0,<1.25.0)", "mypy-boto3-forecastquery (>=1.24.0,<1.25.0)", "mypy-boto3-frauddetector (>=1.24.0,<1.25.0)", "mypy-boto3-fsx (>=1.24.0,<1.25.0)", "mypy-boto3-gamelift (>=1.24.0,<1.25.0)", "mypy-boto3-gamesparks (>=1.24.0,<1.25.0)", "mypy-boto3-glacier (>=1.24.0,<1.25.0)", "mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)", "mypy-boto3-glue (>=1.24.0,<1.25.0)", "mypy-boto3-grafana (>=1.24.0,<1.25.0)", "mypy-boto3-greengrass (>=1.24.0,<1.25.0)", "mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)", "mypy-boto3-groundstation (>=1.24.0,<1.25.0)", "mypy-boto3-guardduty (>=1.24.0,<1.25.0)", "mypy-boto3-health (>=1.24.0,<1.25.0)", "mypy-boto3-healthlake (>=1.24.0,<1.25.0)", "mypy-boto3-honeycode (>=1.24.0,<1.25.0)", "mypy-boto3-iam (>=1.24.0,<1.25.0)", "mypy-boto3-identitystore (>=1.24.0,<1.25.0)", "mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)", "mypy-boto3-importexport (>=1.24.0,<1.25.0)", "mypy-boto3-inspector (>=1.24.0,<1.25.0)", "mypy-boto3-inspector2 (>=1.24.0,<1.25.0)", "mypy-boto3-iot (>=1.24.0,<1.25.0)", "mypy-boto3-iot-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)", "mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)", "mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)", "mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)", "mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)", "mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)", "mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)", "mypy-boto3-iotwireless (>=1.24.0,<1.25.0)", "mypy-boto3-ivs (>=1.24.0,<1.25.0)", "mypy-boto3-ivschat (>=1.24.0,<1.25.0)", "mypy-boto3-kafka (>=1.24.0,<1.25.0)", "mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-kendra (>=1.24.0,<1.25.0)", "mypy-boto3-keyspaces (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-signaling 
(>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)", "mypy-boto3-kms (>=1.24.0,<1.25.0)", "mypy-boto3-lakeformation (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-lex-models (>=1.24.0,<1.25.0)", "mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)", "mypy-boto3-lightsail (>=1.24.0,<1.25.0)", "mypy-boto3-location (>=1.24.0,<1.25.0)", "mypy-boto3-logs (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)", "mypy-boto3-m2 (>=1.24.0,<1.25.0)", "mypy-boto3-machinelearning (>=1.24.0,<1.25.0)", "mypy-boto3-macie (>=1.24.0,<1.25.0)", "mypy-boto3-macie2 (>=1.24.0,<1.25.0)", "mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)", "mypy-boto3-medialive (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)", "mypy-boto3-mediatailor (>=1.24.0,<1.25.0)", "mypy-boto3-memorydb (>=1.24.0,<1.25.0)", "mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)", "mypy-boto3-mgh (>=1.24.0,<1.25.0)", "mypy-boto3-mgn (>=1.24.0,<1.25.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)", "mypy-boto3-mobile (>=1.24.0,<1.25.0)", "mypy-boto3-mq (>=1.24.0,<1.25.0)", "mypy-boto3-mturk (>=1.24.0,<1.25.0)", "mypy-boto3-mwaa (>=1.24.0,<1.25.0)", "mypy-boto3-neptune (>=1.24.0,<1.25.0)", "mypy-boto3-network-firewall (>=1.24.0,<1.25.0)", "mypy-boto3-networkmanager (>=1.24.0,<1.25.0)", "mypy-boto3-nimble (>=1.24.0,<1.25.0)", "mypy-boto3-opensearch (>=1.24.0,<1.25.0)", "mypy-boto3-opsworks (>=1.24.0,<1.25.0)", "mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)", "mypy-boto3-organizations (>=1.24.0,<1.25.0)", "mypy-boto3-outposts (>=1.24.0,<1.25.0)", "mypy-boto3-panorama (>=1.24.0,<1.25.0)", "mypy-boto3-personalize (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-events (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-pi (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)", "mypy-boto3-polly (>=1.24.0,<1.25.0)", "mypy-boto3-pricing (>=1.24.0,<1.25.0)", "mypy-boto3-proton (>=1.24.0,<1.25.0)", "mypy-boto3-qldb (>=1.24.0,<1.25.0)", "mypy-boto3-qldb-session (>=1.24.0,<1.25.0)", "mypy-boto3-quicksight (>=1.24.0,<1.25.0)", "mypy-boto3-ram (>=1.24.0,<1.25.0)", "mypy-boto3-rbin (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-rds-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-rekognition (>=1.24.0,<1.25.0)", "mypy-boto3-resiliencehub 
(>=1.24.0,<1.25.0)", "mypy-boto3-resource-groups (>=1.24.0,<1.25.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)", "mypy-boto3-robomaker (>=1.24.0,<1.25.0)", "mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)", "mypy-boto3-route53 (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)", "mypy-boto3-route53domains (>=1.24.0,<1.25.0)", "mypy-boto3-route53resolver (>=1.24.0,<1.25.0)", "mypy-boto3-rum (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-s3control (>=1.24.0,<1.25.0)", "mypy-boto3-s3outposts (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-savingsplans (>=1.24.0,<1.25.0)", "mypy-boto3-schemas (>=1.24.0,<1.25.0)", "mypy-boto3-sdb (>=1.24.0,<1.25.0)", "mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)", "mypy-boto3-securityhub (>=1.24.0,<1.25.0)", "mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)", "mypy-boto3-service-quotas (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)", "mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)", "mypy-boto3-ses (>=1.24.0,<1.25.0)", "mypy-boto3-sesv2 (>=1.24.0,<1.25.0)", "mypy-boto3-shield (>=1.24.0,<1.25.0)", "mypy-boto3-signer (>=1.24.0,<1.25.0)", "mypy-boto3-sms (>=1.24.0,<1.25.0)", "mypy-boto3-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)", "mypy-boto3-snowball (>=1.24.0,<1.25.0)", "mypy-boto3-sns (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)", "mypy-boto3-ssm (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)", "mypy-boto3-sso (>=1.24.0,<1.25.0)", "mypy-boto3-sso-admin (>=1.24.0,<1.25.0)", "mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)", "mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)", "mypy-boto3-storagegateway (>=1.24.0,<1.25.0)", "mypy-boto3-sts (>=1.24.0,<1.25.0)", "mypy-boto3-support (>=1.24.0,<1.25.0)", "mypy-boto3-swf (>=1.24.0,<1.25.0)", "mypy-boto3-synthetics (>=1.24.0,<1.25.0)", "mypy-boto3-textract (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-query (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-write (>=1.24.0,<1.25.0)", "mypy-boto3-transcribe (>=1.24.0,<1.25.0)", "mypy-boto3-transfer (>=1.24.0,<1.25.0)", "mypy-boto3-translate (>=1.24.0,<1.25.0)", "mypy-boto3-voice-id (>=1.24.0,<1.25.0)", "mypy-boto3-waf (>=1.24.0,<1.25.0)", "mypy-boto3-waf-regional (>=1.24.0,<1.25.0)", "mypy-boto3-wafv2 (>=1.24.0,<1.25.0)", "mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)", "mypy-boto3-wisdom (>=1.24.0,<1.25.0)", "mypy-boto3-workdocs (>=1.24.0,<1.25.0)", "mypy-boto3-worklink (>=1.24.0,<1.25.0)", "mypy-boto3-workmail (>=1.24.0,<1.25.0)", "mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)", "mypy-boto3-xray (>=1.24.0,<1.25.0)"] -amp = ["mypy-boto3-amp (>=1.24.0,<1.25.0)"] -amplify = ["mypy-boto3-amplify (>=1.24.0,<1.25.0)"] -amplifybackend = ["mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)"] -amplifyuibuilder = ["mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)"] -apigateway = ["mypy-boto3-apigateway (>=1.24.0,<1.25.0)"] -apigatewaymanagementapi = ["mypy-boto3-apigatewaymanagementapi 
(>=1.24.0,<1.25.0)"] -apigatewayv2 = ["mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)"] -appconfig = ["mypy-boto3-appconfig (>=1.24.0,<1.25.0)"] -appconfigdata = ["mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)"] -appflow = ["mypy-boto3-appflow (>=1.24.0,<1.25.0)"] -appintegrations = ["mypy-boto3-appintegrations (>=1.24.0,<1.25.0)"] -application-autoscaling = ["mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)"] -application-insights = ["mypy-boto3-application-insights (>=1.24.0,<1.25.0)"] -applicationcostprofiler = ["mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)"] -appmesh = ["mypy-boto3-appmesh (>=1.24.0,<1.25.0)"] -apprunner = ["mypy-boto3-apprunner (>=1.24.0,<1.25.0)"] -appstream = ["mypy-boto3-appstream (>=1.24.0,<1.25.0)"] -appsync = ["mypy-boto3-appsync (>=1.24.0,<1.25.0)"] -athena = ["mypy-boto3-athena (>=1.24.0,<1.25.0)"] -auditmanager = ["mypy-boto3-auditmanager (>=1.24.0,<1.25.0)"] -autoscaling = ["mypy-boto3-autoscaling (>=1.24.0,<1.25.0)"] -autoscaling-plans = ["mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)"] -backup = ["mypy-boto3-backup (>=1.24.0,<1.25.0)"] -backup-gateway = ["mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)"] -batch = ["mypy-boto3-batch (>=1.24.0,<1.25.0)"] -billingconductor = ["mypy-boto3-billingconductor (>=1.24.0,<1.25.0)"] -braket = ["mypy-boto3-braket (>=1.24.0,<1.25.0)"] -budgets = ["mypy-boto3-budgets (>=1.24.0,<1.25.0)"] -ce = ["mypy-boto3-ce (>=1.24.0,<1.25.0)"] -chime = ["mypy-boto3-chime (>=1.24.0,<1.25.0)"] -chime-sdk-identity = ["mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)"] -chime-sdk-media-pipelines = ["mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)"] -chime-sdk-meetings = ["mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)"] -chime-sdk-messaging = ["mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)"] -cloud9 = ["mypy-boto3-cloud9 (>=1.24.0,<1.25.0)"] -cloudcontrol = ["mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)"] -clouddirectory = ["mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)"] -cloudformation = ["mypy-boto3-cloudformation (>=1.24.0,<1.25.0)"] -cloudfront = ["mypy-boto3-cloudfront (>=1.24.0,<1.25.0)"] -cloudhsm = ["mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)"] -cloudhsmv2 = ["mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)"] -cloudsearch = ["mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)"] -cloudsearchdomain = ["mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)"] -cloudtrail = ["mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)"] -cloudwatch = ["mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)"] -codeartifact = ["mypy-boto3-codeartifact (>=1.24.0,<1.25.0)"] -codebuild = ["mypy-boto3-codebuild (>=1.24.0,<1.25.0)"] -codecommit = ["mypy-boto3-codecommit (>=1.24.0,<1.25.0)"] -codedeploy = ["mypy-boto3-codedeploy (>=1.24.0,<1.25.0)"] -codeguru-reviewer = ["mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)"] -codeguruprofiler = ["mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)"] -codepipeline = ["mypy-boto3-codepipeline (>=1.24.0,<1.25.0)"] -codestar = ["mypy-boto3-codestar (>=1.24.0,<1.25.0)"] -codestar-connections = ["mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)"] -codestar-notifications = ["mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)"] -cognito-identity = ["mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)"] -cognito-idp = ["mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)"] -cognito-sync = ["mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)"] -comprehend = ["mypy-boto3-comprehend (>=1.24.0,<1.25.0)"] -comprehendmedical = ["mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)"] -compute-optimizer = ["mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)"] -config = 
["mypy-boto3-config (>=1.24.0,<1.25.0)"] -connect = ["mypy-boto3-connect (>=1.24.0,<1.25.0)"] -connect-contact-lens = ["mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)"] -connectcampaigns = ["mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)"] -connectparticipant = ["mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)"] -cur = ["mypy-boto3-cur (>=1.24.0,<1.25.0)"] -customer-profiles = ["mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)"] -databrew = ["mypy-boto3-databrew (>=1.24.0,<1.25.0)"] -dataexchange = ["mypy-boto3-dataexchange (>=1.24.0,<1.25.0)"] -datapipeline = ["mypy-boto3-datapipeline (>=1.24.0,<1.25.0)"] -datasync = ["mypy-boto3-datasync (>=1.24.0,<1.25.0)"] -dax = ["mypy-boto3-dax (>=1.24.0,<1.25.0)"] -detective = ["mypy-boto3-detective (>=1.24.0,<1.25.0)"] -devicefarm = ["mypy-boto3-devicefarm (>=1.24.0,<1.25.0)"] -devops-guru = ["mypy-boto3-devops-guru (>=1.24.0,<1.25.0)"] -directconnect = ["mypy-boto3-directconnect (>=1.24.0,<1.25.0)"] -discovery = ["mypy-boto3-discovery (>=1.24.0,<1.25.0)"] -dlm = ["mypy-boto3-dlm (>=1.24.0,<1.25.0)"] -dms = ["mypy-boto3-dms (>=1.24.0,<1.25.0)"] -docdb = ["mypy-boto3-docdb (>=1.24.0,<1.25.0)"] -drs = ["mypy-boto3-drs (>=1.24.0,<1.25.0)"] -ds = ["mypy-boto3-ds (>=1.24.0,<1.25.0)"] -dynamodb = ["mypy-boto3-dynamodb (>=1.24.0,<1.25.0)"] -dynamodbstreams = ["mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)"] -ebs = ["mypy-boto3-ebs (>=1.24.0,<1.25.0)"] -ec2 = ["mypy-boto3-ec2 (>=1.24.0,<1.25.0)"] -ec2-instance-connect = ["mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)"] -ecr = ["mypy-boto3-ecr (>=1.24.0,<1.25.0)"] -ecr-public = ["mypy-boto3-ecr-public (>=1.24.0,<1.25.0)"] -ecs = ["mypy-boto3-ecs (>=1.24.0,<1.25.0)"] -efs = ["mypy-boto3-efs (>=1.24.0,<1.25.0)"] -eks = ["mypy-boto3-eks (>=1.24.0,<1.25.0)"] -elastic-inference = ["mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)"] -elasticache = ["mypy-boto3-elasticache (>=1.24.0,<1.25.0)"] -elasticbeanstalk = ["mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)"] -elastictranscoder = ["mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)"] -elb = ["mypy-boto3-elb (>=1.24.0,<1.25.0)"] -elbv2 = ["mypy-boto3-elbv2 (>=1.24.0,<1.25.0)"] -emr = ["mypy-boto3-emr (>=1.24.0,<1.25.0)"] -emr-containers = ["mypy-boto3-emr-containers (>=1.24.0,<1.25.0)"] -emr-serverless = ["mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)"] -es = ["mypy-boto3-es (>=1.24.0,<1.25.0)"] -essential = ["mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)"] -events = ["mypy-boto3-events (>=1.24.0,<1.25.0)"] -evidently = ["mypy-boto3-evidently (>=1.24.0,<1.25.0)"] -finspace = ["mypy-boto3-finspace (>=1.24.0,<1.25.0)"] -finspace-data = ["mypy-boto3-finspace-data (>=1.24.0,<1.25.0)"] -firehose = ["mypy-boto3-firehose (>=1.24.0,<1.25.0)"] -fis = ["mypy-boto3-fis (>=1.24.0,<1.25.0)"] -fms = ["mypy-boto3-fms (>=1.24.0,<1.25.0)"] -forecast = ["mypy-boto3-forecast (>=1.24.0,<1.25.0)"] -forecastquery = ["mypy-boto3-forecastquery (>=1.24.0,<1.25.0)"] -frauddetector = ["mypy-boto3-frauddetector (>=1.24.0,<1.25.0)"] -fsx = ["mypy-boto3-fsx (>=1.24.0,<1.25.0)"] -gamelift = ["mypy-boto3-gamelift (>=1.24.0,<1.25.0)"] -gamesparks = ["mypy-boto3-gamesparks (>=1.24.0,<1.25.0)"] -glacier = ["mypy-boto3-glacier (>=1.24.0,<1.25.0)"] -globalaccelerator = ["mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)"] -glue = ["mypy-boto3-glue (>=1.24.0,<1.25.0)"] 
-grafana = ["mypy-boto3-grafana (>=1.24.0,<1.25.0)"] -greengrass = ["mypy-boto3-greengrass (>=1.24.0,<1.25.0)"] -greengrassv2 = ["mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)"] -groundstation = ["mypy-boto3-groundstation (>=1.24.0,<1.25.0)"] -guardduty = ["mypy-boto3-guardduty (>=1.24.0,<1.25.0)"] -health = ["mypy-boto3-health (>=1.24.0,<1.25.0)"] -healthlake = ["mypy-boto3-healthlake (>=1.24.0,<1.25.0)"] -honeycode = ["mypy-boto3-honeycode (>=1.24.0,<1.25.0)"] -iam = ["mypy-boto3-iam (>=1.24.0,<1.25.0)"] -identitystore = ["mypy-boto3-identitystore (>=1.24.0,<1.25.0)"] -imagebuilder = ["mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)"] -importexport = ["mypy-boto3-importexport (>=1.24.0,<1.25.0)"] -inspector = ["mypy-boto3-inspector (>=1.24.0,<1.25.0)"] -inspector2 = ["mypy-boto3-inspector2 (>=1.24.0,<1.25.0)"] -iot = ["mypy-boto3-iot (>=1.24.0,<1.25.0)"] -iot-data = ["mypy-boto3-iot-data (>=1.24.0,<1.25.0)"] -iot-jobs-data = ["mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)"] -iot1click-devices = ["mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)"] -iot1click-projects = ["mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)"] -iotanalytics = ["mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)"] -iotdeviceadvisor = ["mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)"] -iotevents = ["mypy-boto3-iotevents (>=1.24.0,<1.25.0)"] -iotevents-data = ["mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)"] -iotfleethub = ["mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)"] -iotsecuretunneling = ["mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)"] -iotsitewise = ["mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)"] -iotthingsgraph = ["mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)"] -iottwinmaker = ["mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)"] -iotwireless = ["mypy-boto3-iotwireless (>=1.24.0,<1.25.0)"] -ivs = ["mypy-boto3-ivs (>=1.24.0,<1.25.0)"] -ivschat = ["mypy-boto3-ivschat (>=1.24.0,<1.25.0)"] -kafka = ["mypy-boto3-kafka (>=1.24.0,<1.25.0)"] -kafkaconnect = ["mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)"] -kendra = ["mypy-boto3-kendra (>=1.24.0,<1.25.0)"] -keyspaces = ["mypy-boto3-keyspaces (>=1.24.0,<1.25.0)"] -kinesis = ["mypy-boto3-kinesis (>=1.24.0,<1.25.0)"] -kinesis-video-archived-media = ["mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)"] -kinesis-video-media = ["mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)"] -kinesis-video-signaling = ["mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)"] -kinesisanalytics = ["mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)"] -kinesisanalyticsv2 = ["mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)"] -kinesisvideo = ["mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)"] -kms = ["mypy-boto3-kms (>=1.24.0,<1.25.0)"] -lakeformation = ["mypy-boto3-lakeformation (>=1.24.0,<1.25.0)"] -lambda = ["mypy-boto3-lambda (>=1.24.0,<1.25.0)"] -lex-models = ["mypy-boto3-lex-models (>=1.24.0,<1.25.0)"] -lex-runtime = ["mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)"] -lexv2-models = ["mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)"] -lexv2-runtime = ["mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)"] -license-manager = ["mypy-boto3-license-manager (>=1.24.0,<1.25.0)"] -license-manager-user-subscriptions = ["mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)"] -lightsail = ["mypy-boto3-lightsail (>=1.24.0,<1.25.0)"] -location = ["mypy-boto3-location (>=1.24.0,<1.25.0)"] -logs = ["mypy-boto3-logs (>=1.24.0,<1.25.0)"] -lookoutequipment = ["mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)"] -lookoutmetrics = ["mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)"] -lookoutvision = ["mypy-boto3-lookoutvision 
(>=1.24.0,<1.25.0)"] -m2 = ["mypy-boto3-m2 (>=1.24.0,<1.25.0)"] -machinelearning = ["mypy-boto3-machinelearning (>=1.24.0,<1.25.0)"] -macie = ["mypy-boto3-macie (>=1.24.0,<1.25.0)"] -macie2 = ["mypy-boto3-macie2 (>=1.24.0,<1.25.0)"] -managedblockchain = ["mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)"] -marketplace-catalog = ["mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)"] -marketplace-entitlement = ["mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)"] -marketplacecommerceanalytics = ["mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)"] -mediaconnect = ["mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)"] -mediaconvert = ["mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)"] -medialive = ["mypy-boto3-medialive (>=1.24.0,<1.25.0)"] -mediapackage = ["mypy-boto3-mediapackage (>=1.24.0,<1.25.0)"] -mediapackage-vod = ["mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)"] -mediastore = ["mypy-boto3-mediastore (>=1.24.0,<1.25.0)"] -mediastore-data = ["mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)"] -mediatailor = ["mypy-boto3-mediatailor (>=1.24.0,<1.25.0)"] -memorydb = ["mypy-boto3-memorydb (>=1.24.0,<1.25.0)"] -meteringmarketplace = ["mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)"] -mgh = ["mypy-boto3-mgh (>=1.24.0,<1.25.0)"] -mgn = ["mypy-boto3-mgn (>=1.24.0,<1.25.0)"] -migration-hub-refactor-spaces = ["mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)"] -migrationhub-config = ["mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)"] -migrationhubstrategy = ["mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)"] -mobile = ["mypy-boto3-mobile (>=1.24.0,<1.25.0)"] -mq = ["mypy-boto3-mq (>=1.24.0,<1.25.0)"] -mturk = ["mypy-boto3-mturk (>=1.24.0,<1.25.0)"] -mwaa = ["mypy-boto3-mwaa (>=1.24.0,<1.25.0)"] -neptune = ["mypy-boto3-neptune (>=1.24.0,<1.25.0)"] -network-firewall = ["mypy-boto3-network-firewall (>=1.24.0,<1.25.0)"] -networkmanager = ["mypy-boto3-networkmanager (>=1.24.0,<1.25.0)"] -nimble = ["mypy-boto3-nimble (>=1.24.0,<1.25.0)"] -opensearch = ["mypy-boto3-opensearch (>=1.24.0,<1.25.0)"] -opsworks = ["mypy-boto3-opsworks (>=1.24.0,<1.25.0)"] -opsworkscm = ["mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)"] -organizations = ["mypy-boto3-organizations (>=1.24.0,<1.25.0)"] -outposts = ["mypy-boto3-outposts (>=1.24.0,<1.25.0)"] -panorama = ["mypy-boto3-panorama (>=1.24.0,<1.25.0)"] -personalize = ["mypy-boto3-personalize (>=1.24.0,<1.25.0)"] -personalize-events = ["mypy-boto3-personalize-events (>=1.24.0,<1.25.0)"] -personalize-runtime = ["mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)"] -pi = ["mypy-boto3-pi (>=1.24.0,<1.25.0)"] -pinpoint = ["mypy-boto3-pinpoint (>=1.24.0,<1.25.0)"] -pinpoint-email = ["mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)"] -pinpoint-sms-voice = ["mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)"] -pinpoint-sms-voice-v2 = ["mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)"] -polly = ["mypy-boto3-polly (>=1.24.0,<1.25.0)"] -pricing = ["mypy-boto3-pricing (>=1.24.0,<1.25.0)"] -proton = ["mypy-boto3-proton (>=1.24.0,<1.25.0)"] -qldb = ["mypy-boto3-qldb (>=1.24.0,<1.25.0)"] -qldb-session = ["mypy-boto3-qldb-session (>=1.24.0,<1.25.0)"] -quicksight = ["mypy-boto3-quicksight (>=1.24.0,<1.25.0)"] -ram = ["mypy-boto3-ram (>=1.24.0,<1.25.0)"] -rbin = ["mypy-boto3-rbin (>=1.24.0,<1.25.0)"] -rds = ["mypy-boto3-rds (>=1.24.0,<1.25.0)"] -rds-data = ["mypy-boto3-rds-data (>=1.24.0,<1.25.0)"] -redshift = ["mypy-boto3-redshift (>=1.24.0,<1.25.0)"] -redshift-data = ["mypy-boto3-redshift-data (>=1.24.0,<1.25.0)"] -redshift-serverless = ["mypy-boto3-redshift-serverless 
(>=1.24.0,<1.25.0)"] -rekognition = ["mypy-boto3-rekognition (>=1.24.0,<1.25.0)"] -resiliencehub = ["mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)"] -resource-groups = ["mypy-boto3-resource-groups (>=1.24.0,<1.25.0)"] -resourcegroupstaggingapi = ["mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)"] -robomaker = ["mypy-boto3-robomaker (>=1.24.0,<1.25.0)"] -rolesanywhere = ["mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)"] -route53 = ["mypy-boto3-route53 (>=1.24.0,<1.25.0)"] -route53-recovery-cluster = ["mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)"] -route53-recovery-control-config = ["mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)"] -route53-recovery-readiness = ["mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)"] -route53domains = ["mypy-boto3-route53domains (>=1.24.0,<1.25.0)"] -route53resolver = ["mypy-boto3-route53resolver (>=1.24.0,<1.25.0)"] -rum = ["mypy-boto3-rum (>=1.24.0,<1.25.0)"] -s3 = ["mypy-boto3-s3 (>=1.24.0,<1.25.0)"] -s3control = ["mypy-boto3-s3control (>=1.24.0,<1.25.0)"] -s3outposts = ["mypy-boto3-s3outposts (>=1.24.0,<1.25.0)"] -sagemaker = ["mypy-boto3-sagemaker (>=1.24.0,<1.25.0)"] -sagemaker-a2i-runtime = ["mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)"] -sagemaker-edge = ["mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)"] -sagemaker-featurestore-runtime = ["mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)"] -sagemaker-runtime = ["mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)"] -savingsplans = ["mypy-boto3-savingsplans (>=1.24.0,<1.25.0)"] -schemas = ["mypy-boto3-schemas (>=1.24.0,<1.25.0)"] -sdb = ["mypy-boto3-sdb (>=1.24.0,<1.25.0)"] -secretsmanager = ["mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)"] -securityhub = ["mypy-boto3-securityhub (>=1.24.0,<1.25.0)"] -serverlessrepo = ["mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)"] -service-quotas = ["mypy-boto3-service-quotas (>=1.24.0,<1.25.0)"] -servicecatalog = ["mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)"] -servicecatalog-appregistry = ["mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)"] -servicediscovery = ["mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)"] -ses = ["mypy-boto3-ses (>=1.24.0,<1.25.0)"] -sesv2 = ["mypy-boto3-sesv2 (>=1.24.0,<1.25.0)"] -shield = ["mypy-boto3-shield (>=1.24.0,<1.25.0)"] -signer = ["mypy-boto3-signer (>=1.24.0,<1.25.0)"] -sms = ["mypy-boto3-sms (>=1.24.0,<1.25.0)"] -sms-voice = ["mypy-boto3-sms-voice (>=1.24.0,<1.25.0)"] -snow-device-management = ["mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)"] -snowball = ["mypy-boto3-snowball (>=1.24.0,<1.25.0)"] -sns = ["mypy-boto3-sns (>=1.24.0,<1.25.0)"] -sqs = ["mypy-boto3-sqs (>=1.24.0,<1.25.0)"] -ssm = ["mypy-boto3-ssm (>=1.24.0,<1.25.0)"] -ssm-contacts = ["mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)"] -ssm-incidents = ["mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)"] -sso = ["mypy-boto3-sso (>=1.24.0,<1.25.0)"] -sso-admin = ["mypy-boto3-sso-admin (>=1.24.0,<1.25.0)"] -sso-oidc = ["mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)"] -stepfunctions = ["mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)"] -storagegateway = ["mypy-boto3-storagegateway (>=1.24.0,<1.25.0)"] -sts = ["mypy-boto3-sts (>=1.24.0,<1.25.0)"] -support = ["mypy-boto3-support (>=1.24.0,<1.25.0)"] -swf = ["mypy-boto3-swf (>=1.24.0,<1.25.0)"] -synthetics = ["mypy-boto3-synthetics (>=1.24.0,<1.25.0)"] -textract = ["mypy-boto3-textract (>=1.24.0,<1.25.0)"] -timestream-query = ["mypy-boto3-timestream-query (>=1.24.0,<1.25.0)"] -timestream-write = ["mypy-boto3-timestream-write (>=1.24.0,<1.25.0)"] -transcribe = ["mypy-boto3-transcribe 
(>=1.24.0,<1.25.0)"] -transfer = ["mypy-boto3-transfer (>=1.24.0,<1.25.0)"] -translate = ["mypy-boto3-translate (>=1.24.0,<1.25.0)"] -voice-id = ["mypy-boto3-voice-id (>=1.24.0,<1.25.0)"] -waf = ["mypy-boto3-waf (>=1.24.0,<1.25.0)"] -waf-regional = ["mypy-boto3-waf-regional (>=1.24.0,<1.25.0)"] -wafv2 = ["mypy-boto3-wafv2 (>=1.24.0,<1.25.0)"] -wellarchitected = ["mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)"] -wisdom = ["mypy-boto3-wisdom (>=1.24.0,<1.25.0)"] -workdocs = ["mypy-boto3-workdocs (>=1.24.0,<1.25.0)"] worklink = ["mypy-boto3-worklink (>=1.24.0,<1.25.0)"] -workmail = ["mypy-boto3-workmail (>=1.24.0,<1.25.0)"] -workmailmessageflow = ["mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)"] -workspaces = ["mypy-boto3-workspaces (>=1.24.0,<1.25.0)"] -workspaces-web = ["mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)"] +workdocs = ["mypy-boto3-workdocs (>=1.24.0,<1.25.0)"] +wisdom = ["mypy-boto3-wisdom (>=1.24.0,<1.25.0)"] +wellarchitected = ["mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)"] +wafv2 = ["mypy-boto3-wafv2 (>=1.24.0,<1.25.0)"] +waf-regional = ["mypy-boto3-waf-regional (>=1.24.0,<1.25.0)"] +waf = ["mypy-boto3-waf (>=1.24.0,<1.25.0)"] +voice-id = ["mypy-boto3-voice-id (>=1.24.0,<1.25.0)"] +translate = ["mypy-boto3-translate (>=1.24.0,<1.25.0)"] +transfer = ["mypy-boto3-transfer (>=1.24.0,<1.25.0)"] +transcribe = ["mypy-boto3-transcribe (>=1.24.0,<1.25.0)"] +timestream-write = ["mypy-boto3-timestream-write (>=1.24.0,<1.25.0)"] +timestream-query = ["mypy-boto3-timestream-query (>=1.24.0,<1.25.0)"] +textract = ["mypy-boto3-textract (>=1.24.0,<1.25.0)"] +synthetics = ["mypy-boto3-synthetics (>=1.24.0,<1.25.0)"] +swf = ["mypy-boto3-swf (>=1.24.0,<1.25.0)"] +support = ["mypy-boto3-support (>=1.24.0,<1.25.0)"] +sts = ["mypy-boto3-sts (>=1.24.0,<1.25.0)"] +storagegateway = ["mypy-boto3-storagegateway (>=1.24.0,<1.25.0)"] +stepfunctions = ["mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)"] +sso-oidc = ["mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)"] +sso-admin = ["mypy-boto3-sso-admin (>=1.24.0,<1.25.0)"] +sso = ["mypy-boto3-sso (>=1.24.0,<1.25.0)"] +ssm-incidents = ["mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)"] +ssm-contacts = ["mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)"] +ssm = ["mypy-boto3-ssm (>=1.24.0,<1.25.0)"] +sqs = ["mypy-boto3-sqs (>=1.24.0,<1.25.0)"] +sns = ["mypy-boto3-sns (>=1.24.0,<1.25.0)"] +snowball = ["mypy-boto3-snowball (>=1.24.0,<1.25.0)"] +snow-device-management = ["mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)"] +sms-voice = ["mypy-boto3-sms-voice (>=1.24.0,<1.25.0)"] +sms = ["mypy-boto3-sms (>=1.24.0,<1.25.0)"] +signer = ["mypy-boto3-signer (>=1.24.0,<1.25.0)"] +shield = ["mypy-boto3-shield (>=1.24.0,<1.25.0)"] +sesv2 = ["mypy-boto3-sesv2 (>=1.24.0,<1.25.0)"] +ses = ["mypy-boto3-ses (>=1.24.0,<1.25.0)"] +servicediscovery = ["mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)"] +servicecatalog-appregistry = ["mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)"] +servicecatalog = ["mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)"] +service-quotas = ["mypy-boto3-service-quotas (>=1.24.0,<1.25.0)"] +serverlessrepo = ["mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)"] +securityhub = ["mypy-boto3-securityhub (>=1.24.0,<1.25.0)"] +secretsmanager = ["mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)"] +sdb = ["mypy-boto3-sdb (>=1.24.0,<1.25.0)"] +schemas = ["mypy-boto3-schemas (>=1.24.0,<1.25.0)"] +savingsplans = ["mypy-boto3-savingsplans (>=1.24.0,<1.25.0)"] +sagemaker-runtime = ["mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)"] +sagemaker-featurestore-runtime = 
["mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)"] +sagemaker-edge = ["mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)"] +sagemaker-a2i-runtime = ["mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)"] +sagemaker = ["mypy-boto3-sagemaker (>=1.24.0,<1.25.0)"] +s3outposts = ["mypy-boto3-s3outposts (>=1.24.0,<1.25.0)"] +s3control = ["mypy-boto3-s3control (>=1.24.0,<1.25.0)"] +s3 = ["mypy-boto3-s3 (>=1.24.0,<1.25.0)"] +rum = ["mypy-boto3-rum (>=1.24.0,<1.25.0)"] +route53resolver = ["mypy-boto3-route53resolver (>=1.24.0,<1.25.0)"] +route53domains = ["mypy-boto3-route53domains (>=1.24.0,<1.25.0)"] +route53-recovery-readiness = ["mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)"] +route53-recovery-control-config = ["mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)"] +route53-recovery-cluster = ["mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)"] +route53 = ["mypy-boto3-route53 (>=1.24.0,<1.25.0)"] +rolesanywhere = ["mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)"] +robomaker = ["mypy-boto3-robomaker (>=1.24.0,<1.25.0)"] +resourcegroupstaggingapi = ["mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)"] +resource-groups = ["mypy-boto3-resource-groups (>=1.24.0,<1.25.0)"] +resiliencehub = ["mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)"] +rekognition = ["mypy-boto3-rekognition (>=1.24.0,<1.25.0)"] +redshift-serverless = ["mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)"] +redshift-data = ["mypy-boto3-redshift-data (>=1.24.0,<1.25.0)"] +redshift = ["mypy-boto3-redshift (>=1.24.0,<1.25.0)"] +rds-data = ["mypy-boto3-rds-data (>=1.24.0,<1.25.0)"] +rds = ["mypy-boto3-rds (>=1.24.0,<1.25.0)"] +rbin = ["mypy-boto3-rbin (>=1.24.0,<1.25.0)"] +ram = ["mypy-boto3-ram (>=1.24.0,<1.25.0)"] +quicksight = ["mypy-boto3-quicksight (>=1.24.0,<1.25.0)"] +qldb-session = ["mypy-boto3-qldb-session (>=1.24.0,<1.25.0)"] +qldb = ["mypy-boto3-qldb (>=1.24.0,<1.25.0)"] +proton = ["mypy-boto3-proton (>=1.24.0,<1.25.0)"] +privatenetworks = ["mypy-boto3-privatenetworks (>=1.24.0,<1.25.0)"] +pricing = ["mypy-boto3-pricing (>=1.24.0,<1.25.0)"] +polly = ["mypy-boto3-polly (>=1.24.0,<1.25.0)"] +pinpoint-sms-voice-v2 = ["mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)"] +pinpoint-sms-voice = ["mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)"] +pinpoint-email = ["mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)"] +pinpoint = ["mypy-boto3-pinpoint (>=1.24.0,<1.25.0)"] +pi = ["mypy-boto3-pi (>=1.24.0,<1.25.0)"] +personalize-runtime = ["mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)"] +personalize-events = ["mypy-boto3-personalize-events (>=1.24.0,<1.25.0)"] +personalize = ["mypy-boto3-personalize (>=1.24.0,<1.25.0)"] +panorama = ["mypy-boto3-panorama (>=1.24.0,<1.25.0)"] +outposts = ["mypy-boto3-outposts (>=1.24.0,<1.25.0)"] +organizations = ["mypy-boto3-organizations (>=1.24.0,<1.25.0)"] +opsworkscm = ["mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)"] +opsworks = ["mypy-boto3-opsworks (>=1.24.0,<1.25.0)"] +opensearch = ["mypy-boto3-opensearch (>=1.24.0,<1.25.0)"] +nimble = ["mypy-boto3-nimble (>=1.24.0,<1.25.0)"] +networkmanager = ["mypy-boto3-networkmanager (>=1.24.0,<1.25.0)"] +network-firewall = ["mypy-boto3-network-firewall (>=1.24.0,<1.25.0)"] +neptune = ["mypy-boto3-neptune (>=1.24.0,<1.25.0)"] +mwaa = ["mypy-boto3-mwaa (>=1.24.0,<1.25.0)"] +mturk = ["mypy-boto3-mturk (>=1.24.0,<1.25.0)"] +mq = ["mypy-boto3-mq (>=1.24.0,<1.25.0)"] +mobile = ["mypy-boto3-mobile (>=1.24.0,<1.25.0)"] +migrationhubstrategy = ["mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)"] +migrationhub-config = 
["mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)"] +migration-hub-refactor-spaces = ["mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)"] +mgn = ["mypy-boto3-mgn (>=1.24.0,<1.25.0)"] +mgh = ["mypy-boto3-mgh (>=1.24.0,<1.25.0)"] +meteringmarketplace = ["mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)"] +memorydb = ["mypy-boto3-memorydb (>=1.24.0,<1.25.0)"] +mediatailor = ["mypy-boto3-mediatailor (>=1.24.0,<1.25.0)"] +mediastore-data = ["mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)"] +mediastore = ["mypy-boto3-mediastore (>=1.24.0,<1.25.0)"] +mediapackage-vod = ["mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)"] +mediapackage = ["mypy-boto3-mediapackage (>=1.24.0,<1.25.0)"] xray = ["mypy-boto3-xray (>=1.24.0,<1.25.0)"] +workspaces-web = ["mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)"] +workspaces = ["mypy-boto3-workspaces (>=1.24.0,<1.25.0)"] +workmailmessageflow = ["mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)"] +workmail = ["mypy-boto3-workmail (>=1.24.0,<1.25.0)"] +medialive = ["mypy-boto3-medialive (>=1.24.0,<1.25.0)"] +kinesisanalytics = ["mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)"] +kinesis-video-signaling = ["mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)"] +kinesis-video-media = ["mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)"] +kinesis-video-archived-media = ["mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)"] +kinesis = ["mypy-boto3-kinesis (>=1.24.0,<1.25.0)"] +keyspaces = ["mypy-boto3-keyspaces (>=1.24.0,<1.25.0)"] +kendra = ["mypy-boto3-kendra (>=1.24.0,<1.25.0)"] +kafkaconnect = ["mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)"] +kafka = ["mypy-boto3-kafka (>=1.24.0,<1.25.0)"] +ivschat = ["mypy-boto3-ivschat (>=1.24.0,<1.25.0)"] +ivs = ["mypy-boto3-ivs (>=1.24.0,<1.25.0)"] +iotwireless = ["mypy-boto3-iotwireless (>=1.24.0,<1.25.0)"] +iottwinmaker = ["mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)"] +iotthingsgraph = ["mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)"] +iotsitewise = ["mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)"] +iotsecuretunneling = ["mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)"] +iotfleethub = ["mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)"] +iotevents-data = ["mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)"] +iotevents = ["mypy-boto3-iotevents (>=1.24.0,<1.25.0)"] +iotdeviceadvisor = ["mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)"] +iotanalytics = ["mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)"] +iot1click-projects = ["mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)"] +iot1click-devices = ["mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)"] +iot-jobs-data = ["mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)"] +iot-data = ["mypy-boto3-iot-data (>=1.24.0,<1.25.0)"] +iot = ["mypy-boto3-iot (>=1.24.0,<1.25.0)"] +inspector2 = ["mypy-boto3-inspector2 (>=1.24.0,<1.25.0)"] +inspector = ["mypy-boto3-inspector (>=1.24.0,<1.25.0)"] +importexport = ["mypy-boto3-importexport (>=1.24.0,<1.25.0)"] +imagebuilder = ["mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)"] +identitystore = ["mypy-boto3-identitystore (>=1.24.0,<1.25.0)"] +iam = ["mypy-boto3-iam (>=1.24.0,<1.25.0)"] +honeycode = ["mypy-boto3-honeycode (>=1.24.0,<1.25.0)"] +healthlake = ["mypy-boto3-healthlake (>=1.24.0,<1.25.0)"] +health = ["mypy-boto3-health (>=1.24.0,<1.25.0)"] +guardduty = ["mypy-boto3-guardduty (>=1.24.0,<1.25.0)"] +groundstation = ["mypy-boto3-groundstation (>=1.24.0,<1.25.0)"] +greengrassv2 = ["mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)"] +greengrass = ["mypy-boto3-greengrass (>=1.24.0,<1.25.0)"] +grafana = ["mypy-boto3-grafana (>=1.24.0,<1.25.0)"] +glue = 
["mypy-boto3-glue (>=1.24.0,<1.25.0)"] +globalaccelerator = ["mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)"] +glacier = ["mypy-boto3-glacier (>=1.24.0,<1.25.0)"] +gamesparks = ["mypy-boto3-gamesparks (>=1.24.0,<1.25.0)"] +gamelift = ["mypy-boto3-gamelift (>=1.24.0,<1.25.0)"] +fsx = ["mypy-boto3-fsx (>=1.24.0,<1.25.0)"] +frauddetector = ["mypy-boto3-frauddetector (>=1.24.0,<1.25.0)"] +forecastquery = ["mypy-boto3-forecastquery (>=1.24.0,<1.25.0)"] +forecast = ["mypy-boto3-forecast (>=1.24.0,<1.25.0)"] +fms = ["mypy-boto3-fms (>=1.24.0,<1.25.0)"] +fis = ["mypy-boto3-fis (>=1.24.0,<1.25.0)"] +firehose = ["mypy-boto3-firehose (>=1.24.0,<1.25.0)"] +finspace-data = ["mypy-boto3-finspace-data (>=1.24.0,<1.25.0)"] +finspace = ["mypy-boto3-finspace (>=1.24.0,<1.25.0)"] +evidently = ["mypy-boto3-evidently (>=1.24.0,<1.25.0)"] +events = ["mypy-boto3-events (>=1.24.0,<1.25.0)"] +essential = ["mypy-boto3-sqs (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-cloudformation (>=1.24.0,<1.25.0)"] +es = ["mypy-boto3-es (>=1.24.0,<1.25.0)"] +emr-serverless = ["mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)"] +emr-containers = ["mypy-boto3-emr-containers (>=1.24.0,<1.25.0)"] +emr = ["mypy-boto3-emr (>=1.24.0,<1.25.0)"] +elbv2 = ["mypy-boto3-elbv2 (>=1.24.0,<1.25.0)"] +elb = ["mypy-boto3-elb (>=1.24.0,<1.25.0)"] +elastictranscoder = ["mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)"] +elasticbeanstalk = ["mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)"] +elasticache = ["mypy-boto3-elasticache (>=1.24.0,<1.25.0)"] +elastic-inference = ["mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)"] +eks = ["mypy-boto3-eks (>=1.24.0,<1.25.0)"] +efs = ["mypy-boto3-efs (>=1.24.0,<1.25.0)"] +ecs = ["mypy-boto3-ecs (>=1.24.0,<1.25.0)"] +ecr-public = ["mypy-boto3-ecr-public (>=1.24.0,<1.25.0)"] +ecr = ["mypy-boto3-ecr (>=1.24.0,<1.25.0)"] +ec2-instance-connect = ["mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)"] +ec2 = ["mypy-boto3-ec2 (>=1.24.0,<1.25.0)"] +ebs = ["mypy-boto3-ebs (>=1.24.0,<1.25.0)"] +dynamodbstreams = ["mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)"] +dynamodb = ["mypy-boto3-dynamodb (>=1.24.0,<1.25.0)"] +ds = ["mypy-boto3-ds (>=1.24.0,<1.25.0)"] +drs = ["mypy-boto3-drs (>=1.24.0,<1.25.0)"] +docdb = ["mypy-boto3-docdb (>=1.24.0,<1.25.0)"] +dms = ["mypy-boto3-dms (>=1.24.0,<1.25.0)"] +dlm = ["mypy-boto3-dlm (>=1.24.0,<1.25.0)"] +discovery = ["mypy-boto3-discovery (>=1.24.0,<1.25.0)"] +directconnect = ["mypy-boto3-directconnect (>=1.24.0,<1.25.0)"] +devops-guru = ["mypy-boto3-devops-guru (>=1.24.0,<1.25.0)"] +devicefarm = ["mypy-boto3-devicefarm (>=1.24.0,<1.25.0)"] +detective = ["mypy-boto3-detective (>=1.24.0,<1.25.0)"] +dax = ["mypy-boto3-dax (>=1.24.0,<1.25.0)"] +datasync = ["mypy-boto3-datasync (>=1.24.0,<1.25.0)"] +datapipeline = ["mypy-boto3-datapipeline (>=1.24.0,<1.25.0)"] +dataexchange = ["mypy-boto3-dataexchange (>=1.24.0,<1.25.0)"] +databrew = ["mypy-boto3-databrew (>=1.24.0,<1.25.0)"] +customer-profiles = ["mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)"] +cur = ["mypy-boto3-cur (>=1.24.0,<1.25.0)"] +connectparticipant = ["mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)"] +connectcampaigns = ["mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)"] +connect-contact-lens = ["mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)"] +connect = ["mypy-boto3-connect (>=1.24.0,<1.25.0)"] +config = ["mypy-boto3-config (>=1.24.0,<1.25.0)"] 
+compute-optimizer = ["mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)"] +comprehendmedical = ["mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)"] +comprehend = ["mypy-boto3-comprehend (>=1.24.0,<1.25.0)"] +cognito-sync = ["mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)"] +cognito-idp = ["mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)"] +cognito-identity = ["mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)"] +codestar-notifications = ["mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)"] +codestar-connections = ["mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)"] +codestar = ["mypy-boto3-codestar (>=1.24.0,<1.25.0)"] +codepipeline = ["mypy-boto3-codepipeline (>=1.24.0,<1.25.0)"] +mediaconvert = ["mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)"] +mediaconnect = ["mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)"] +marketplacecommerceanalytics = ["mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)"] +marketplace-entitlement = ["mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)"] +marketplace-catalog = ["mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)"] +managedblockchain = ["mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)"] +macie2 = ["mypy-boto3-macie2 (>=1.24.0,<1.25.0)"] +macie = ["mypy-boto3-macie (>=1.24.0,<1.25.0)"] +machinelearning = ["mypy-boto3-machinelearning (>=1.24.0,<1.25.0)"] +m2 = ["mypy-boto3-m2 (>=1.24.0,<1.25.0)"] +lookoutvision = ["mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)"] +lookoutmetrics = ["mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)"] +lookoutequipment = ["mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)"] +logs = ["mypy-boto3-logs (>=1.24.0,<1.25.0)"] +location = ["mypy-boto3-location (>=1.24.0,<1.25.0)"] +lightsail = ["mypy-boto3-lightsail (>=1.24.0,<1.25.0)"] +license-manager-user-subscriptions = ["mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)"] +license-manager = ["mypy-boto3-license-manager (>=1.24.0,<1.25.0)"] +lexv2-runtime = ["mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)"] +lexv2-models = ["mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)"] +lex-runtime = ["mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)"] +lex-models = ["mypy-boto3-lex-models (>=1.24.0,<1.25.0)"] +lambda = ["mypy-boto3-lambda (>=1.24.0,<1.25.0)"] +lakeformation = ["mypy-boto3-lakeformation (>=1.24.0,<1.25.0)"] +kms = ["mypy-boto3-kms (>=1.24.0,<1.25.0)"] +kinesisvideo = ["mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)"] +kinesisanalyticsv2 = ["mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)"] +codeguruprofiler = ["mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)"] +all = ["mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-kafka (>=1.24.0,<1.25.0)", "mypy-boto3-ivschat (>=1.24.0,<1.25.0)", "mypy-boto3-ivs (>=1.24.0,<1.25.0)", "mypy-boto3-iotwireless (>=1.24.0,<1.25.0)", "mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)", "mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)", "mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)", "mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)", "mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents (>=1.24.0,<1.25.0)", "mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)", "mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)", "mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot (>=1.24.0,<1.25.0)", "mypy-boto3-inspector2 (>=1.24.0,<1.25.0)", "mypy-boto3-inspector (>=1.24.0,<1.25.0)", "mypy-boto3-importexport (>=1.24.0,<1.25.0)", "mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)", 
"mypy-boto3-identitystore (>=1.24.0,<1.25.0)", "mypy-boto3-iam (>=1.24.0,<1.25.0)", "mypy-boto3-honeycode (>=1.24.0,<1.25.0)", "mypy-boto3-healthlake (>=1.24.0,<1.25.0)", "mypy-boto3-health (>=1.24.0,<1.25.0)", "mypy-boto3-guardduty (>=1.24.0,<1.25.0)", "mypy-boto3-groundstation (>=1.24.0,<1.25.0)", "mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)", "mypy-boto3-greengrass (>=1.24.0,<1.25.0)", "mypy-boto3-grafana (>=1.24.0,<1.25.0)", "mypy-boto3-glue (>=1.24.0,<1.25.0)", "mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)", "mypy-boto3-glacier (>=1.24.0,<1.25.0)", "mypy-boto3-gamesparks (>=1.24.0,<1.25.0)", "mypy-boto3-gamelift (>=1.24.0,<1.25.0)", "mypy-boto3-fsx (>=1.24.0,<1.25.0)", "mypy-boto3-frauddetector (>=1.24.0,<1.25.0)", "mypy-boto3-forecastquery (>=1.24.0,<1.25.0)", "mypy-boto3-forecast (>=1.24.0,<1.25.0)", "mypy-boto3-fms (>=1.24.0,<1.25.0)", "mypy-boto3-fis (>=1.24.0,<1.25.0)", "mypy-boto3-firehose (>=1.24.0,<1.25.0)", "mypy-boto3-finspace-data (>=1.24.0,<1.25.0)", "mypy-boto3-finspace (>=1.24.0,<1.25.0)", "mypy-boto3-evidently (>=1.24.0,<1.25.0)", "mypy-boto3-events (>=1.24.0,<1.25.0)", "mypy-boto3-es (>=1.24.0,<1.25.0)", "mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-emr-containers (>=1.24.0,<1.25.0)", "mypy-boto3-emr (>=1.24.0,<1.25.0)", "mypy-boto3-elbv2 (>=1.24.0,<1.25.0)", "mypy-boto3-elb (>=1.24.0,<1.25.0)", "mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)", "mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)", "mypy-boto3-elasticache (>=1.24.0,<1.25.0)", "mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)", "mypy-boto3-eks (>=1.24.0,<1.25.0)", "mypy-boto3-efs (>=1.24.0,<1.25.0)", "mypy-boto3-ecs (>=1.24.0,<1.25.0)", "mypy-boto3-ecr-public (>=1.24.0,<1.25.0)", "mypy-boto3-ecr (>=1.24.0,<1.25.0)", "mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-ebs (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-ds (>=1.24.0,<1.25.0)", "mypy-boto3-drs (>=1.24.0,<1.25.0)", "mypy-boto3-docdb (>=1.24.0,<1.25.0)", "mypy-boto3-dms (>=1.24.0,<1.25.0)", "mypy-boto3-dlm (>=1.24.0,<1.25.0)", "mypy-boto3-discovery (>=1.24.0,<1.25.0)", "mypy-boto3-directconnect (>=1.24.0,<1.25.0)", "mypy-boto3-devops-guru (>=1.24.0,<1.25.0)", "mypy-boto3-devicefarm (>=1.24.0,<1.25.0)", "mypy-boto3-detective (>=1.24.0,<1.25.0)", "mypy-boto3-dax (>=1.24.0,<1.25.0)", "mypy-boto3-datasync (>=1.24.0,<1.25.0)", "mypy-boto3-datapipeline (>=1.24.0,<1.25.0)", "mypy-boto3-dataexchange (>=1.24.0,<1.25.0)", "mypy-boto3-databrew (>=1.24.0,<1.25.0)", "mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)", "mypy-boto3-cur (>=1.24.0,<1.25.0)", "mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)", "mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)", "mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)", "mypy-boto3-connect (>=1.24.0,<1.25.0)", "mypy-boto3-config (>=1.24.0,<1.25.0)", "mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)", "mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)", "mypy-boto3-comprehend (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)", "mypy-boto3-codestar (>=1.24.0,<1.25.0)", "mypy-boto3-codepipeline (>=1.24.0,<1.25.0)", "mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)", "mypy-boto3-codedeploy (>=1.24.0,<1.25.0)", 
"mypy-boto3-codecommit (>=1.24.0,<1.25.0)", "mypy-boto3-codebuild (>=1.24.0,<1.25.0)", "mypy-boto3-codeartifact (>=1.24.0,<1.25.0)", "mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)", "mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)", "mypy-boto3-cloudfront (>=1.24.0,<1.25.0)", "mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-xray (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces (>=1.24.0,<1.25.0)", "mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)", "mypy-boto3-workmail (>=1.24.0,<1.25.0)", "mypy-boto3-worklink (>=1.24.0,<1.25.0)", "mypy-boto3-workdocs (>=1.24.0,<1.25.0)", "mypy-boto3-wisdom (>=1.24.0,<1.25.0)", "mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)", "mypy-boto3-wafv2 (>=1.24.0,<1.25.0)", "mypy-boto3-waf-regional (>=1.24.0,<1.25.0)", "mypy-boto3-waf (>=1.24.0,<1.25.0)", "mypy-boto3-voice-id (>=1.24.0,<1.25.0)", "mypy-boto3-translate (>=1.24.0,<1.25.0)", "mypy-boto3-transfer (>=1.24.0,<1.25.0)", "mypy-boto3-transcribe (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-write (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-query (>=1.24.0,<1.25.0)", "mypy-boto3-textract (>=1.24.0,<1.25.0)", "mypy-boto3-synthetics (>=1.24.0,<1.25.0)", "mypy-boto3-swf (>=1.24.0,<1.25.0)", "mypy-boto3-support (>=1.24.0,<1.25.0)", "mypy-boto3-sts (>=1.24.0,<1.25.0)", "mypy-boto3-storagegateway (>=1.24.0,<1.25.0)", "mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)", "mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)", "mypy-boto3-sso-admin (>=1.24.0,<1.25.0)", "mypy-boto3-sso (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)", "mypy-boto3-ssm (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)", "mypy-boto3-sns (>=1.24.0,<1.25.0)", "mypy-boto3-snowball (>=1.24.0,<1.25.0)", "mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)", "mypy-boto3-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-sms (>=1.24.0,<1.25.0)", "mypy-boto3-signer (>=1.24.0,<1.25.0)", "mypy-boto3-shield (>=1.24.0,<1.25.0)", "mypy-boto3-sesv2 (>=1.24.0,<1.25.0)", "mypy-boto3-ses (>=1.24.0,<1.25.0)", "mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)", "mypy-boto3-service-quotas (>=1.24.0,<1.25.0)", "mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)", "mypy-boto3-securityhub (>=1.24.0,<1.25.0)", "mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)", "mypy-boto3-sdb (>=1.24.0,<1.25.0)", "mypy-boto3-schemas (>=1.24.0,<1.25.0)", "mypy-boto3-savingsplans (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker (>=1.24.0,<1.25.0)", "mypy-boto3-s3outposts (>=1.24.0,<1.25.0)", "mypy-boto3-s3control (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-rum (>=1.24.0,<1.25.0)", "mypy-boto3-route53resolver (>=1.24.0,<1.25.0)", "mypy-boto3-route53domains (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)", "mypy-boto3-route53 (>=1.24.0,<1.25.0)", "mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)", "mypy-boto3-robomaker (>=1.24.0,<1.25.0)", 
"mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)", "mypy-boto3-resource-groups (>=1.24.0,<1.25.0)", "mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)", "mypy-boto3-rekognition (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift (>=1.24.0,<1.25.0)", "mypy-boto3-rds-data (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-rbin (>=1.24.0,<1.25.0)", "mypy-boto3-ram (>=1.24.0,<1.25.0)", "mypy-boto3-quicksight (>=1.24.0,<1.25.0)", "mypy-boto3-qldb-session (>=1.24.0,<1.25.0)", "mypy-boto3-qldb (>=1.24.0,<1.25.0)", "mypy-boto3-proton (>=1.24.0,<1.25.0)", "mypy-boto3-privatenetworks (>=1.24.0,<1.25.0)", "mypy-boto3-pricing (>=1.24.0,<1.25.0)", "mypy-boto3-polly (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint (>=1.24.0,<1.25.0)", "mypy-boto3-pi (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-events (>=1.24.0,<1.25.0)", "mypy-boto3-personalize (>=1.24.0,<1.25.0)", "mypy-boto3-panorama (>=1.24.0,<1.25.0)", "mypy-boto3-outposts (>=1.24.0,<1.25.0)", "mypy-boto3-organizations (>=1.24.0,<1.25.0)", "mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)", "mypy-boto3-opsworks (>=1.24.0,<1.25.0)", "mypy-boto3-opensearch (>=1.24.0,<1.25.0)", "mypy-boto3-nimble (>=1.24.0,<1.25.0)", "mypy-boto3-networkmanager (>=1.24.0,<1.25.0)", "mypy-boto3-network-firewall (>=1.24.0,<1.25.0)", "mypy-boto3-neptune (>=1.24.0,<1.25.0)", "mypy-boto3-mwaa (>=1.24.0,<1.25.0)", "mypy-boto3-mturk (>=1.24.0,<1.25.0)", "mypy-boto3-mq (>=1.24.0,<1.25.0)", "mypy-boto3-mobile (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)", "mypy-boto3-mgn (>=1.24.0,<1.25.0)", "mypy-boto3-mgh (>=1.24.0,<1.25.0)", "mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)", "mypy-boto3-memorydb (>=1.24.0,<1.25.0)", "mypy-boto3-mediatailor (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage (>=1.24.0,<1.25.0)", "mypy-boto3-medialive (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)", "mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)", "mypy-boto3-macie2 (>=1.24.0,<1.25.0)", "mypy-boto3-macie (>=1.24.0,<1.25.0)", "mypy-boto3-machinelearning (>=1.24.0,<1.25.0)", "mypy-boto3-m2 (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)", "mypy-boto3-logs (>=1.24.0,<1.25.0)", "mypy-boto3-location (>=1.24.0,<1.25.0)", "mypy-boto3-lightsail (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)", "mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-lex-models (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-lakeformation (>=1.24.0,<1.25.0)", "mypy-boto3-kms (>=1.24.0,<1.25.0)", 
"mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis (>=1.24.0,<1.25.0)", "mypy-boto3-keyspaces (>=1.24.0,<1.25.0)", "mypy-boto3-kendra (>=1.24.0,<1.25.0)", "mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)", "mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)", "mypy-boto3-cloud9 (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)", "mypy-boto3-chime (>=1.24.0,<1.25.0)", "mypy-boto3-ce (>=1.24.0,<1.25.0)", "mypy-boto3-budgets (>=1.24.0,<1.25.0)", "mypy-boto3-braket (>=1.24.0,<1.25.0)", "mypy-boto3-billingconductor (>=1.24.0,<1.25.0)", "mypy-boto3-batch (>=1.24.0,<1.25.0)", "mypy-boto3-backupstorage (>=1.24.0,<1.25.0)", "mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)", "mypy-boto3-backup (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-auditmanager (>=1.24.0,<1.25.0)", "mypy-boto3-athena (>=1.24.0,<1.25.0)", "mypy-boto3-appsync (>=1.24.0,<1.25.0)", "mypy-boto3-appstream (>=1.24.0,<1.25.0)", "mypy-boto3-apprunner (>=1.24.0,<1.25.0)", "mypy-boto3-appmesh (>=1.24.0,<1.25.0)", "mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-application-insights (>=1.24.0,<1.25.0)", "mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-appintegrations (>=1.24.0,<1.25.0)", "mypy-boto3-appflow (>=1.24.0,<1.25.0)", "mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)", "mypy-boto3-appconfig (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)", "mypy-boto3-apigateway (>=1.24.0,<1.25.0)", "mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)", "mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)", "mypy-boto3-amplify (>=1.24.0,<1.25.0)", "mypy-boto3-amp (>=1.24.0,<1.25.0)", "mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)", "mypy-boto3-acm-pca (>=1.24.0,<1.25.0)", "mypy-boto3-acm (>=1.24.0,<1.25.0)", "mypy-boto3-account (>=1.24.0,<1.25.0)", "mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)"] +budgets = ["mypy-boto3-budgets (>=1.24.0,<1.25.0)"] +braket = ["mypy-boto3-braket (>=1.24.0,<1.25.0)"] +billingconductor = ["mypy-boto3-billingconductor (>=1.24.0,<1.25.0)"] +batch = ["mypy-boto3-batch (>=1.24.0,<1.25.0)"] +backupstorage = ["mypy-boto3-backupstorage (>=1.24.0,<1.25.0)"] +backup-gateway = ["mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)"] +backup = ["mypy-boto3-backup (>=1.24.0,<1.25.0)"] +autoscaling-plans = ["mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)"] +autoscaling = ["mypy-boto3-autoscaling (>=1.24.0,<1.25.0)"] +auditmanager = ["mypy-boto3-auditmanager (>=1.24.0,<1.25.0)"] +athena = ["mypy-boto3-athena (>=1.24.0,<1.25.0)"] +appsync = ["mypy-boto3-appsync (>=1.24.0,<1.25.0)"] +appstream = ["mypy-boto3-appstream (>=1.24.0,<1.25.0)"] +apprunner = ["mypy-boto3-apprunner (>=1.24.0,<1.25.0)"] +appmesh = ["mypy-boto3-appmesh (>=1.24.0,<1.25.0)"] +applicationcostprofiler = ["mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)"] +application-insights = ["mypy-boto3-application-insights (>=1.24.0,<1.25.0)"] +application-autoscaling = ["mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)"] +appintegrations = 
["mypy-boto3-appintegrations (>=1.24.0,<1.25.0)"] +appflow = ["mypy-boto3-appflow (>=1.24.0,<1.25.0)"] +appconfigdata = ["mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)"] +appconfig = ["mypy-boto3-appconfig (>=1.24.0,<1.25.0)"] +apigatewayv2 = ["mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)"] +apigatewaymanagementapi = ["mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)"] +apigateway = ["mypy-boto3-apigateway (>=1.24.0,<1.25.0)"] +amplifyuibuilder = ["mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)"] +amplifybackend = ["mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)"] +amplify = ["mypy-boto3-amplify (>=1.24.0,<1.25.0)"] +amp = ["mypy-boto3-amp (>=1.24.0,<1.25.0)"] +codeguru-reviewer = ["mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)"] +codedeploy = ["mypy-boto3-codedeploy (>=1.24.0,<1.25.0)"] +codecommit = ["mypy-boto3-codecommit (>=1.24.0,<1.25.0)"] +codebuild = ["mypy-boto3-codebuild (>=1.24.0,<1.25.0)"] +codeartifact = ["mypy-boto3-codeartifact (>=1.24.0,<1.25.0)"] +cloudwatch = ["mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)"] +cloudtrail = ["mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)"] +cloudsearchdomain = ["mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)"] +cloudsearch = ["mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)"] +cloudhsmv2 = ["mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)"] +cloudhsm = ["mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)"] +cloudfront = ["mypy-boto3-cloudfront (>=1.24.0,<1.25.0)"] +cloudformation = ["mypy-boto3-cloudformation (>=1.24.0,<1.25.0)"] +clouddirectory = ["mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)"] +cloudcontrol = ["mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)"] +cloud9 = ["mypy-boto3-cloud9 (>=1.24.0,<1.25.0)"] +chime-sdk-messaging = ["mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)"] +chime-sdk-meetings = ["mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)"] +chime-sdk-media-pipelines = ["mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)"] +chime-sdk-identity = ["mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)"] +chime = ["mypy-boto3-chime (>=1.24.0,<1.25.0)"] +ce = ["mypy-boto3-ce (>=1.24.0,<1.25.0)"] +alexaforbusiness = ["mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)"] +acm-pca = ["mypy-boto3-acm-pca (>=1.24.0,<1.25.0)"] +acm = ["mypy-boto3-acm (>=1.24.0,<1.25.0)"] +account = ["mypy-boto3-account (>=1.24.0,<1.25.0)"] +accessanalyzer = ["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)"] [[package]] name = "botocore" @@ -814,7 +842,7 @@ python-versions = "*" [[package]] name = "moto" -version = "3.1.17" +version = "3.1.18" description = "A library that allows your python tests to easily mock out the boto library" category = "main" optional = false @@ -828,7 +856,7 @@ cfn-lint = {version = ">=0.4.0", optional = true, markers = "extra == \"server\" cryptography = ">=3.3.1" docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""} ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""} -flask = {version = "*", optional = true, markers = "extra == \"server\""} +flask = {version = "<2.2.0", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""} @@ -848,28 +876,28 @@ werkzeug = ">=0.5,<2.2.0" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk 
(>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.7)", "openapi-spec-validator (>=0.2.8)", "setuptools"] -apigateway = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)"] -apigatewayv2 = ["PyYAML (>=5.1)"] -appsync = ["graphql-core"] -awslambda = ["docker (>=2.5.1)"] -batch = ["docker (>=2.5.1)"] -cloudformation = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.7)", "openapi-spec-validator (>=0.2.8)", "setuptools"] -cognitoidp = ["python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)"] -ds = ["sshpubkeys (>=3.1.0)"] -dynamodb = ["docker (>=2.5.1)"] -dynamodb2 = ["docker (>=2.5.1)"] -dynamodbstreams = ["docker (>=2.5.1)"] -ebs = ["sshpubkeys (>=3.1.0)"] -ec2 = ["sshpubkeys (>=3.1.0)"] -efs = ["sshpubkeys (>=3.1.0)"] -glue = ["pyparsing (>=3.0.7)"] -iotdata = ["jsondiff (>=1.1.2)"] -route53resolver = ["sshpubkeys (>=3.1.0)"] +xray = ["setuptools", "aws-xray-sdk (>=0.93,!=0.96)"] +ssm = ["dataclasses", "PyYAML (>=5.1)"] +server = ["flask-cors", "flask (<2.2.0)", "setuptools", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "sshpubkeys (>=3.1.0)", "cfn-lint (>=0.4.0)", "idna (>=2.5,<4)", "aws-xray-sdk (>=0.93,!=0.96)", "jsondiff (>=1.1.2)", "graphql-core", "docker (>=2.5.1)", "ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "PyYAML (>=5.1)"] s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.7)", "openapi-spec-validator (>=0.2.8)", "setuptools", "flask", "flask-cors"] -ssm = ["PyYAML (>=5.1)", "dataclasses"] -xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] +route53resolver = ["sshpubkeys (>=3.1.0)"] +iotdata = ["jsondiff (>=1.1.2)"] +glue = ["pyparsing (>=3.0.7)"] +efs = ["sshpubkeys (>=3.1.0)"] +ec2 = ["sshpubkeys (>=3.1.0)"] +ebs = ["sshpubkeys (>=3.1.0)"] +dynamodbstreams = ["docker (>=2.5.1)"] +dynamodb2 = ["docker (>=2.5.1)"] +dynamodb = ["docker (>=2.5.1)"] +ds = ["sshpubkeys (>=3.1.0)"] +cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] +cloudformation = ["setuptools", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "sshpubkeys (>=3.1.0)", "cfn-lint (>=0.4.0)", "idna (>=2.5,<4)", "aws-xray-sdk (>=0.93,!=0.96)", "jsondiff (>=1.1.2)", "graphql-core", "docker (>=2.5.1)", "ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "PyYAML (>=5.1)"] +batch = ["docker (>=2.5.1)"] +awslambda = ["docker (>=2.5.1)"] +appsync = ["graphql-core"] +apigatewayv2 = ["PyYAML (>=5.1)"] +apigateway = ["openapi-spec-validator (>=0.2.8)", "ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "PyYAML (>=5.1)"] +all = ["setuptools", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "sshpubkeys (>=3.1.0)", "cfn-lint (>=0.4.0)", "idna (>=2.5,<4)", "aws-xray-sdk (>=0.93,!=0.96)", "jsondiff (>=1.1.2)", "graphql-core", "docker (>=2.5.1)", "ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "PyYAML (>=5.1)"] [[package]] name = "mypy" @@ -1461,13 +1489,21 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] 
lock-version = "1.1" python-versions = "^3.9" -content-hash = "e58b30774603aa0f31579899a6c78579329c580f2f4bbaec209b0f9d52079fc6" +content-hash = "453b90e40481ca6e4395e84beb73489b58c0983e826e369eb0f412ef633ea5e1" [metadata.files] aiopg = [ {file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"}, {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"}, ] +allure-pytest = [ + {file = "allure-pytest-2.9.45.tar.gz", hash = "sha256:20620fde08a597578b157a60ff38bdcc300e312d12eaa38cf28e4a62e22bdaa3"}, + {file = "allure_pytest-2.9.45-py3-none-any.whl", hash = "sha256:9b0325e06f8f79cf03289d4f4d741e57607d0fa12d9c094e243cbb042283f083"}, +] +allure-python-commons = [ + {file = "allure-python-commons-2.9.45.tar.gz", hash = "sha256:c238d28aeac35e8c7c517d8a2327e25ae5bbf2c30b5e2313d20ef11d75f5549d"}, + {file = "allure_python_commons-2.9.45-py3-none-any.whl", hash = "sha256:3572f0526db3946fb14470c58b0b41d343483aad91d37d414e4641815e13691a"}, +] async-timeout = [ {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, @@ -1512,8 +1548,8 @@ boto3 = [ {file = "boto3-1.24.38.tar.gz", hash = "sha256:f4c6b025f392c934338c7f01badfddbd0d3cf2397ff5df35c31409798dce33f5"}, ] boto3-stubs = [ - {file = "boto3-stubs-1.24.46.tar.gz", hash = "sha256:9482238ed9ea7794e6e66a41376bf75d5950f0328de09fac9d224906dcc624ef"}, - {file = "boto3_stubs-1.24.46-py3-none-any.whl", hash = "sha256:3aa84f2925b4b50b7f47ac41a11ac05302e744cdf460cb7bcf6488319393d8a4"}, + {file = "boto3-stubs-1.24.51.tar.gz", hash = "sha256:ea69c707e9ceab7c11cab1f11fb4bbe98fa5ff8da593f888946d297daa083870"}, + {file = "boto3_stubs-1.24.51-py3-none-any.whl", hash = "sha256:432aebdb18e7c26bf2b148e04eb33e145976cb932bfe0f72b2d512e945927e57"}, ] botocore = [ {file = "botocore-1.27.38-py3-none-any.whl", hash = "sha256:46a0264ff3335496bd9cb404f83ec0d8eb7bfdef8f74a830c13e6a6b9612adea"}, @@ -1763,8 +1799,8 @@ mccabe = [ {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, ] moto = [ - {file = "moto-3.1.17-py3-none-any.whl", hash = "sha256:84797321fad9a9e924c1c0385b302c80ec23429724c016b504f4bfca9d40d33a"}, - {file = "moto-3.1.17.tar.gz", hash = "sha256:f2e5b32e8910c51c0b0de5b73f902bc53e06fb1c1d077d2b848d27e0b0cbe65e"}, + {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"}, + {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, ] mypy = [ {file = "mypy-0.971-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f2899a3cbd394da157194f913a931edfd4be5f274a88041c9dc2d9cdcb1c315c"}, diff --git a/pyproject.toml b/pyproject.toml index 8a3d22f088..a54dbe9ebd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" Werkzeug = "2.1.2" pytest-order = "^1.0.1" +allure-pytest = "^2.9.45" [tool.poetry.dev-dependencies] yapf = "==0.31.0" diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5292bc1789..4483355c4c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -24,6 +24,7 @@ import subprocess import time import filecmp import tempfile 
+import tarfile from contextlib import closing from pathlib import Path @@ -35,6 +36,7 @@ from psycopg2.extensions import make_dsn, parse_dsn from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple from typing_extensions import Literal +import allure # type: ignore import requests import backoff # type: ignore @@ -2237,6 +2239,14 @@ def get_test_output_dir(request: Any) -> pathlib.Path: return test_dir +ATTACHMENT_SUFFIXES = frozenset(( + '.log', + '.stderr', + '.stdout', + '.diffs', +)) + + # This is autouse, so the test output directory always gets created, even # if a test doesn't put anything there. It also solves a problem with the # neon_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it @@ -2247,7 +2257,7 @@ def get_test_output_dir(request: Any) -> pathlib.Path: # this fixture ensures that the directory exists. That works because # 'autouse' fixtures are run before other fixtures. @pytest.fixture(scope='function', autouse=True) -def test_output_dir(request: Any) -> pathlib.Path: +def test_output_dir(request: Any) -> Iterator[pathlib.Path]: """ Create the working directory for an individual test. """ # one directory per test @@ -2255,7 +2265,26 @@ def test_output_dir(request: Any) -> pathlib.Path: log.info(f'test_output_dir is {test_dir}') shutil.rmtree(test_dir, ignore_errors=True) test_dir.mkdir() - return test_dir + + yield test_dir + + for attachment in test_dir.glob('**/*'): + if attachment.suffix in ATTACHMENT_SUFFIXES: + source = str(attachment) + name = str(attachment.relative_to(test_dir)) + attachment_type = 'text/plain' + extension = attachment.suffix.removeprefix('.') + + # compress files larger than 1Mb, they're hardly readable in a browser + if attachment.stat().st_size > 1024 * 1024: + source = f'{attachment}.tar.gz' + with tarfile.open(source, 'w:gz') as tar: + tar.add(attachment, arcname=attachment.name) + name = f'{name}.tar.gz' + attachment_type = 'application/gzip' + extension = 'tar.gz' + + allure.attach.file(source, name, attachment_type, extension) SKIP_DIRS = frozenset(('pg_wal', From 6b2e1d9065eb39d6533362bbbba53cec2e77ac7d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 17 Aug 2022 15:05:37 +0100 Subject: [PATCH 0669/1022] test_runner: replace yapf with black and isort --- .github/workflows/codestyle.yml | 7 +- .yapfignore | 10 - docs/sourcetree.md | 7 +- poetry.lock | 743 ++++++++++++++++++-------------- pre-commit.py | 53 ++- pyproject.toml | 42 +- setup.cfg | 43 -- 7 files changed, 496 insertions(+), 409 deletions(-) delete mode 100644 .yapfignore delete mode 100644 setup.cfg diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index d0685f8fd2..bd0f368499 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -128,8 +128,11 @@ jobs: - name: Install Python deps run: ./scripts/pysync - - name: Run yapf to ensure code format - run: poetry run yapf --recursive --diff . + - name: Run isort to ensure code format + run: poetry run isort --diff --check . + + - name: Run black to ensure code format + run: poetry run black --diff --check . - name: Run mypy to check types run: poetry run mypy . diff --git a/.yapfignore b/.yapfignore deleted file mode 100644 index 149428e452..0000000000 --- a/.yapfignore +++ /dev/null @@ -1,10 +0,0 @@ -# This file is only read when `yapf` is run from this directory. -# Hence we only top-level directories here to avoid confusion. 
-# See source code for the exact file format: https://github.com/google/yapf/blob/c6077954245bc3add82dafd853a1c7305a6ebd20/yapf/yapflib/file_resources.py#L40-L43 -vendor/ -target/ -tmp_install/ -__pycache__/ -test_output/ -.neon/ -.git/ diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 39f7be89a0..f189134865 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -112,11 +112,12 @@ Run `poetry shell` to activate the virtual environment. Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`. ### Obligatory checks -We force code formatting via `yapf` and type hints via `mypy`. -Run the following commands in the repository's root (next to `setup.cfg`): +We force code formatting via `black`, `isort` and type hints via `mypy`. +Run the following commands in the repository's root (next to `pyproject.toml`): ```bash -poetry run yapf -ri . # All code is reformatted +poetry run isort . # Imports are reformatted +poetry run black . # All code is reformatted poetry run mypy . # Ensure there are no typing errors ``` diff --git a/poetry.lock b/poetry.lock index 17b59852f4..cd24641a4f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -117,6 +117,28 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "black" +version = "22.6.0" +description = "The uncompromising code formatter." +category = "dev" +optional = false +python-versions = ">=3.6.2" + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +pathspec = ">=0.9.0" +platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""} +typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.7.4)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + [[package]] name = "boto3" version = "1.24.38" @@ -135,8 +157,8 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "boto3-stubs" -version = "1.24.51" -description = "Type annotations for boto3 1.24.51 generated with mypy-boto3-builder 7.11.6" +version = "1.24.56" +description = "Type annotations for boto3 1.24.56 generated with mypy-boto3-builder 7.11.7" category = "main" optional = false python-versions = ">=3.7" @@ -148,321 +170,321 @@ types-s3transfer = "*" typing-extensions = ">=4.1.0" [package.extras] -worklink = ["mypy-boto3-worklink (>=1.24.0,<1.25.0)"] -workdocs = ["mypy-boto3-workdocs (>=1.24.0,<1.25.0)"] -wisdom = ["mypy-boto3-wisdom (>=1.24.0,<1.25.0)"] -wellarchitected = ["mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)"] -wafv2 = ["mypy-boto3-wafv2 (>=1.24.0,<1.25.0)"] -waf-regional = ["mypy-boto3-waf-regional (>=1.24.0,<1.25.0)"] -waf = ["mypy-boto3-waf (>=1.24.0,<1.25.0)"] -voice-id = ["mypy-boto3-voice-id (>=1.24.0,<1.25.0)"] -translate = ["mypy-boto3-translate (>=1.24.0,<1.25.0)"] -transfer = ["mypy-boto3-transfer (>=1.24.0,<1.25.0)"] -transcribe = ["mypy-boto3-transcribe (>=1.24.0,<1.25.0)"] -timestream-write = ["mypy-boto3-timestream-write (>=1.24.0,<1.25.0)"] -timestream-query = ["mypy-boto3-timestream-query (>=1.24.0,<1.25.0)"] -textract = ["mypy-boto3-textract (>=1.24.0,<1.25.0)"] -synthetics = ["mypy-boto3-synthetics (>=1.24.0,<1.25.0)"] -swf = ["mypy-boto3-swf (>=1.24.0,<1.25.0)"] -support = ["mypy-boto3-support (>=1.24.0,<1.25.0)"] -sts = ["mypy-boto3-sts (>=1.24.0,<1.25.0)"] -storagegateway = ["mypy-boto3-storagegateway (>=1.24.0,<1.25.0)"] -stepfunctions = 
["mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)"] -sso-oidc = ["mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)"] -sso-admin = ["mypy-boto3-sso-admin (>=1.24.0,<1.25.0)"] -sso = ["mypy-boto3-sso (>=1.24.0,<1.25.0)"] -ssm-incidents = ["mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)"] -ssm-contacts = ["mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)"] -ssm = ["mypy-boto3-ssm (>=1.24.0,<1.25.0)"] -sqs = ["mypy-boto3-sqs (>=1.24.0,<1.25.0)"] -sns = ["mypy-boto3-sns (>=1.24.0,<1.25.0)"] -snowball = ["mypy-boto3-snowball (>=1.24.0,<1.25.0)"] -snow-device-management = ["mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)"] -sms-voice = ["mypy-boto3-sms-voice (>=1.24.0,<1.25.0)"] -sms = ["mypy-boto3-sms (>=1.24.0,<1.25.0)"] -signer = ["mypy-boto3-signer (>=1.24.0,<1.25.0)"] -shield = ["mypy-boto3-shield (>=1.24.0,<1.25.0)"] -sesv2 = ["mypy-boto3-sesv2 (>=1.24.0,<1.25.0)"] -ses = ["mypy-boto3-ses (>=1.24.0,<1.25.0)"] -servicediscovery = ["mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)"] -servicecatalog-appregistry = ["mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)"] -servicecatalog = ["mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)"] -service-quotas = ["mypy-boto3-service-quotas (>=1.24.0,<1.25.0)"] -serverlessrepo = ["mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)"] -securityhub = ["mypy-boto3-securityhub (>=1.24.0,<1.25.0)"] -secretsmanager = ["mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)"] -sdb = ["mypy-boto3-sdb (>=1.24.0,<1.25.0)"] -schemas = ["mypy-boto3-schemas (>=1.24.0,<1.25.0)"] -savingsplans = ["mypy-boto3-savingsplans (>=1.24.0,<1.25.0)"] -sagemaker-runtime = ["mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)"] -sagemaker-featurestore-runtime = ["mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)"] -sagemaker-edge = ["mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)"] -sagemaker-a2i-runtime = ["mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)"] -sagemaker = ["mypy-boto3-sagemaker (>=1.24.0,<1.25.0)"] -s3outposts = ["mypy-boto3-s3outposts (>=1.24.0,<1.25.0)"] -s3control = ["mypy-boto3-s3control (>=1.24.0,<1.25.0)"] -s3 = ["mypy-boto3-s3 (>=1.24.0,<1.25.0)"] -rum = ["mypy-boto3-rum (>=1.24.0,<1.25.0)"] -route53resolver = ["mypy-boto3-route53resolver (>=1.24.0,<1.25.0)"] -route53domains = ["mypy-boto3-route53domains (>=1.24.0,<1.25.0)"] -route53-recovery-readiness = ["mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)"] -route53-recovery-control-config = ["mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)"] -route53-recovery-cluster = ["mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)"] -route53 = ["mypy-boto3-route53 (>=1.24.0,<1.25.0)"] -rolesanywhere = ["mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)"] -robomaker = ["mypy-boto3-robomaker (>=1.24.0,<1.25.0)"] -resourcegroupstaggingapi = ["mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)"] -resource-groups = ["mypy-boto3-resource-groups (>=1.24.0,<1.25.0)"] -resiliencehub = ["mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)"] -rekognition = ["mypy-boto3-rekognition (>=1.24.0,<1.25.0)"] -redshift-serverless = ["mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)"] -redshift-data = ["mypy-boto3-redshift-data (>=1.24.0,<1.25.0)"] -redshift = ["mypy-boto3-redshift (>=1.24.0,<1.25.0)"] -rds-data = ["mypy-boto3-rds-data (>=1.24.0,<1.25.0)"] -rds = ["mypy-boto3-rds (>=1.24.0,<1.25.0)"] -rbin = ["mypy-boto3-rbin (>=1.24.0,<1.25.0)"] -ram = ["mypy-boto3-ram (>=1.24.0,<1.25.0)"] -quicksight = ["mypy-boto3-quicksight (>=1.24.0,<1.25.0)"] -qldb-session = ["mypy-boto3-qldb-session (>=1.24.0,<1.25.0)"] -qldb = ["mypy-boto3-qldb 
(>=1.24.0,<1.25.0)"] -proton = ["mypy-boto3-proton (>=1.24.0,<1.25.0)"] -privatenetworks = ["mypy-boto3-privatenetworks (>=1.24.0,<1.25.0)"] -pricing = ["mypy-boto3-pricing (>=1.24.0,<1.25.0)"] -polly = ["mypy-boto3-polly (>=1.24.0,<1.25.0)"] -pinpoint-sms-voice-v2 = ["mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)"] -pinpoint-sms-voice = ["mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)"] -pinpoint-email = ["mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)"] -pinpoint = ["mypy-boto3-pinpoint (>=1.24.0,<1.25.0)"] -pi = ["mypy-boto3-pi (>=1.24.0,<1.25.0)"] -personalize-runtime = ["mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)"] -personalize-events = ["mypy-boto3-personalize-events (>=1.24.0,<1.25.0)"] -personalize = ["mypy-boto3-personalize (>=1.24.0,<1.25.0)"] -panorama = ["mypy-boto3-panorama (>=1.24.0,<1.25.0)"] -outposts = ["mypy-boto3-outposts (>=1.24.0,<1.25.0)"] -organizations = ["mypy-boto3-organizations (>=1.24.0,<1.25.0)"] -opsworkscm = ["mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)"] -opsworks = ["mypy-boto3-opsworks (>=1.24.0,<1.25.0)"] -opensearch = ["mypy-boto3-opensearch (>=1.24.0,<1.25.0)"] -nimble = ["mypy-boto3-nimble (>=1.24.0,<1.25.0)"] -networkmanager = ["mypy-boto3-networkmanager (>=1.24.0,<1.25.0)"] -network-firewall = ["mypy-boto3-network-firewall (>=1.24.0,<1.25.0)"] -neptune = ["mypy-boto3-neptune (>=1.24.0,<1.25.0)"] -mwaa = ["mypy-boto3-mwaa (>=1.24.0,<1.25.0)"] -mturk = ["mypy-boto3-mturk (>=1.24.0,<1.25.0)"] -mq = ["mypy-boto3-mq (>=1.24.0,<1.25.0)"] -mobile = ["mypy-boto3-mobile (>=1.24.0,<1.25.0)"] -migrationhubstrategy = ["mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)"] -migrationhub-config = ["mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)"] -migration-hub-refactor-spaces = ["mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)"] -mgn = ["mypy-boto3-mgn (>=1.24.0,<1.25.0)"] -mgh = ["mypy-boto3-mgh (>=1.24.0,<1.25.0)"] -meteringmarketplace = ["mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)"] -memorydb = ["mypy-boto3-memorydb (>=1.24.0,<1.25.0)"] -mediatailor = ["mypy-boto3-mediatailor (>=1.24.0,<1.25.0)"] -mediastore-data = ["mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)"] -mediastore = ["mypy-boto3-mediastore (>=1.24.0,<1.25.0)"] -mediapackage-vod = ["mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)"] -mediapackage = ["mypy-boto3-mediapackage (>=1.24.0,<1.25.0)"] -xray = ["mypy-boto3-xray (>=1.24.0,<1.25.0)"] -workspaces-web = ["mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)"] -workspaces = ["mypy-boto3-workspaces (>=1.24.0,<1.25.0)"] -workmailmessageflow = ["mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)"] -workmail = ["mypy-boto3-workmail (>=1.24.0,<1.25.0)"] -medialive = ["mypy-boto3-medialive (>=1.24.0,<1.25.0)"] -kinesisanalytics = ["mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)"] -kinesis-video-signaling = ["mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)"] -kinesis-video-media = ["mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)"] -kinesis-video-archived-media = ["mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)"] -kinesis = ["mypy-boto3-kinesis (>=1.24.0,<1.25.0)"] -keyspaces = ["mypy-boto3-keyspaces (>=1.24.0,<1.25.0)"] -kendra = ["mypy-boto3-kendra (>=1.24.0,<1.25.0)"] -kafkaconnect = ["mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)"] -kafka = ["mypy-boto3-kafka (>=1.24.0,<1.25.0)"] -ivschat = ["mypy-boto3-ivschat (>=1.24.0,<1.25.0)"] -ivs = ["mypy-boto3-ivs (>=1.24.0,<1.25.0)"] -iotwireless = ["mypy-boto3-iotwireless (>=1.24.0,<1.25.0)"] -iottwinmaker = ["mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)"] 
-iotthingsgraph = ["mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)"] -iotsitewise = ["mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)"] -iotsecuretunneling = ["mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)"] -iotfleethub = ["mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)"] -iotevents-data = ["mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)"] -iotevents = ["mypy-boto3-iotevents (>=1.24.0,<1.25.0)"] -iotdeviceadvisor = ["mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)"] -iotanalytics = ["mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)"] -iot1click-projects = ["mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)"] -iot1click-devices = ["mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)"] -iot-jobs-data = ["mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)"] -iot-data = ["mypy-boto3-iot-data (>=1.24.0,<1.25.0)"] -iot = ["mypy-boto3-iot (>=1.24.0,<1.25.0)"] -inspector2 = ["mypy-boto3-inspector2 (>=1.24.0,<1.25.0)"] -inspector = ["mypy-boto3-inspector (>=1.24.0,<1.25.0)"] -importexport = ["mypy-boto3-importexport (>=1.24.0,<1.25.0)"] -imagebuilder = ["mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)"] -identitystore = ["mypy-boto3-identitystore (>=1.24.0,<1.25.0)"] -iam = ["mypy-boto3-iam (>=1.24.0,<1.25.0)"] -honeycode = ["mypy-boto3-honeycode (>=1.24.0,<1.25.0)"] -healthlake = ["mypy-boto3-healthlake (>=1.24.0,<1.25.0)"] -health = ["mypy-boto3-health (>=1.24.0,<1.25.0)"] -guardduty = ["mypy-boto3-guardduty (>=1.24.0,<1.25.0)"] -groundstation = ["mypy-boto3-groundstation (>=1.24.0,<1.25.0)"] -greengrassv2 = ["mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)"] -greengrass = ["mypy-boto3-greengrass (>=1.24.0,<1.25.0)"] -grafana = ["mypy-boto3-grafana (>=1.24.0,<1.25.0)"] -glue = ["mypy-boto3-glue (>=1.24.0,<1.25.0)"] -globalaccelerator = ["mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)"] -glacier = ["mypy-boto3-glacier (>=1.24.0,<1.25.0)"] -gamesparks = ["mypy-boto3-gamesparks (>=1.24.0,<1.25.0)"] -gamelift = ["mypy-boto3-gamelift (>=1.24.0,<1.25.0)"] -fsx = ["mypy-boto3-fsx (>=1.24.0,<1.25.0)"] -frauddetector = ["mypy-boto3-frauddetector (>=1.24.0,<1.25.0)"] -forecastquery = ["mypy-boto3-forecastquery (>=1.24.0,<1.25.0)"] -forecast = ["mypy-boto3-forecast (>=1.24.0,<1.25.0)"] -fms = ["mypy-boto3-fms (>=1.24.0,<1.25.0)"] -fis = ["mypy-boto3-fis (>=1.24.0,<1.25.0)"] -firehose = ["mypy-boto3-firehose (>=1.24.0,<1.25.0)"] -finspace-data = ["mypy-boto3-finspace-data (>=1.24.0,<1.25.0)"] -finspace = ["mypy-boto3-finspace (>=1.24.0,<1.25.0)"] -evidently = ["mypy-boto3-evidently (>=1.24.0,<1.25.0)"] -events = ["mypy-boto3-events (>=1.24.0,<1.25.0)"] -essential = ["mypy-boto3-sqs (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-cloudformation (>=1.24.0,<1.25.0)"] -es = ["mypy-boto3-es (>=1.24.0,<1.25.0)"] -emr-serverless = ["mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)"] -emr-containers = ["mypy-boto3-emr-containers (>=1.24.0,<1.25.0)"] -emr = ["mypy-boto3-emr (>=1.24.0,<1.25.0)"] -elbv2 = ["mypy-boto3-elbv2 (>=1.24.0,<1.25.0)"] -elb = ["mypy-boto3-elb (>=1.24.0,<1.25.0)"] -elastictranscoder = ["mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)"] -elasticbeanstalk = ["mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)"] -elasticache = ["mypy-boto3-elasticache (>=1.24.0,<1.25.0)"] -elastic-inference = ["mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)"] -eks = ["mypy-boto3-eks (>=1.24.0,<1.25.0)"] -efs = ["mypy-boto3-efs (>=1.24.0,<1.25.0)"] -ecs = ["mypy-boto3-ecs 
(>=1.24.0,<1.25.0)"] -ecr-public = ["mypy-boto3-ecr-public (>=1.24.0,<1.25.0)"] -ecr = ["mypy-boto3-ecr (>=1.24.0,<1.25.0)"] -ec2-instance-connect = ["mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)"] -ec2 = ["mypy-boto3-ec2 (>=1.24.0,<1.25.0)"] -ebs = ["mypy-boto3-ebs (>=1.24.0,<1.25.0)"] -dynamodbstreams = ["mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)"] -dynamodb = ["mypy-boto3-dynamodb (>=1.24.0,<1.25.0)"] -ds = ["mypy-boto3-ds (>=1.24.0,<1.25.0)"] -drs = ["mypy-boto3-drs (>=1.24.0,<1.25.0)"] -docdb = ["mypy-boto3-docdb (>=1.24.0,<1.25.0)"] -dms = ["mypy-boto3-dms (>=1.24.0,<1.25.0)"] -dlm = ["mypy-boto3-dlm (>=1.24.0,<1.25.0)"] -discovery = ["mypy-boto3-discovery (>=1.24.0,<1.25.0)"] -directconnect = ["mypy-boto3-directconnect (>=1.24.0,<1.25.0)"] -devops-guru = ["mypy-boto3-devops-guru (>=1.24.0,<1.25.0)"] -devicefarm = ["mypy-boto3-devicefarm (>=1.24.0,<1.25.0)"] -detective = ["mypy-boto3-detective (>=1.24.0,<1.25.0)"] -dax = ["mypy-boto3-dax (>=1.24.0,<1.25.0)"] -datasync = ["mypy-boto3-datasync (>=1.24.0,<1.25.0)"] -datapipeline = ["mypy-boto3-datapipeline (>=1.24.0,<1.25.0)"] -dataexchange = ["mypy-boto3-dataexchange (>=1.24.0,<1.25.0)"] -databrew = ["mypy-boto3-databrew (>=1.24.0,<1.25.0)"] -customer-profiles = ["mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)"] -cur = ["mypy-boto3-cur (>=1.24.0,<1.25.0)"] -connectparticipant = ["mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)"] -connectcampaigns = ["mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)"] -connect-contact-lens = ["mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)"] -connect = ["mypy-boto3-connect (>=1.24.0,<1.25.0)"] -config = ["mypy-boto3-config (>=1.24.0,<1.25.0)"] -compute-optimizer = ["mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)"] -comprehendmedical = ["mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)"] -comprehend = ["mypy-boto3-comprehend (>=1.24.0,<1.25.0)"] -cognito-sync = ["mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)"] -cognito-idp = ["mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)"] -cognito-identity = ["mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)"] -codestar-notifications = ["mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)"] -codestar-connections = ["mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)"] -codestar = ["mypy-boto3-codestar (>=1.24.0,<1.25.0)"] -codepipeline = ["mypy-boto3-codepipeline (>=1.24.0,<1.25.0)"] -mediaconvert = ["mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)"] -mediaconnect = ["mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)"] -marketplacecommerceanalytics = ["mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)"] -marketplace-entitlement = ["mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)"] -marketplace-catalog = ["mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)"] -managedblockchain = ["mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)"] -macie2 = ["mypy-boto3-macie2 (>=1.24.0,<1.25.0)"] -macie = ["mypy-boto3-macie (>=1.24.0,<1.25.0)"] -machinelearning = ["mypy-boto3-machinelearning (>=1.24.0,<1.25.0)"] -m2 = ["mypy-boto3-m2 (>=1.24.0,<1.25.0)"] -lookoutvision = ["mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)"] -lookoutmetrics = ["mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)"] -lookoutequipment = ["mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)"] -logs = ["mypy-boto3-logs (>=1.24.0,<1.25.0)"] -location = ["mypy-boto3-location (>=1.24.0,<1.25.0)"] -lightsail = ["mypy-boto3-lightsail (>=1.24.0,<1.25.0)"] -license-manager-user-subscriptions = ["mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)"] -license-manager = ["mypy-boto3-license-manager 
(>=1.24.0,<1.25.0)"] -lexv2-runtime = ["mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)"] -lexv2-models = ["mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)"] -lex-runtime = ["mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)"] -lex-models = ["mypy-boto3-lex-models (>=1.24.0,<1.25.0)"] -lambda = ["mypy-boto3-lambda (>=1.24.0,<1.25.0)"] -lakeformation = ["mypy-boto3-lakeformation (>=1.24.0,<1.25.0)"] -kms = ["mypy-boto3-kms (>=1.24.0,<1.25.0)"] -kinesisvideo = ["mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)"] -kinesisanalyticsv2 = ["mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)"] -codeguruprofiler = ["mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)"] -all = ["mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-kafka (>=1.24.0,<1.25.0)", "mypy-boto3-ivschat (>=1.24.0,<1.25.0)", "mypy-boto3-ivs (>=1.24.0,<1.25.0)", "mypy-boto3-iotwireless (>=1.24.0,<1.25.0)", "mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)", "mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)", "mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)", "mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)", "mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents (>=1.24.0,<1.25.0)", "mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)", "mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)", "mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot (>=1.24.0,<1.25.0)", "mypy-boto3-inspector2 (>=1.24.0,<1.25.0)", "mypy-boto3-inspector (>=1.24.0,<1.25.0)", "mypy-boto3-importexport (>=1.24.0,<1.25.0)", "mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)", "mypy-boto3-identitystore (>=1.24.0,<1.25.0)", "mypy-boto3-iam (>=1.24.0,<1.25.0)", "mypy-boto3-honeycode (>=1.24.0,<1.25.0)", "mypy-boto3-healthlake (>=1.24.0,<1.25.0)", "mypy-boto3-health (>=1.24.0,<1.25.0)", "mypy-boto3-guardduty (>=1.24.0,<1.25.0)", "mypy-boto3-groundstation (>=1.24.0,<1.25.0)", "mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)", "mypy-boto3-greengrass (>=1.24.0,<1.25.0)", "mypy-boto3-grafana (>=1.24.0,<1.25.0)", "mypy-boto3-glue (>=1.24.0,<1.25.0)", "mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)", "mypy-boto3-glacier (>=1.24.0,<1.25.0)", "mypy-boto3-gamesparks (>=1.24.0,<1.25.0)", "mypy-boto3-gamelift (>=1.24.0,<1.25.0)", "mypy-boto3-fsx (>=1.24.0,<1.25.0)", "mypy-boto3-frauddetector (>=1.24.0,<1.25.0)", "mypy-boto3-forecastquery (>=1.24.0,<1.25.0)", "mypy-boto3-forecast (>=1.24.0,<1.25.0)", "mypy-boto3-fms (>=1.24.0,<1.25.0)", "mypy-boto3-fis (>=1.24.0,<1.25.0)", "mypy-boto3-firehose (>=1.24.0,<1.25.0)", "mypy-boto3-finspace-data (>=1.24.0,<1.25.0)", "mypy-boto3-finspace (>=1.24.0,<1.25.0)", "mypy-boto3-evidently (>=1.24.0,<1.25.0)", "mypy-boto3-events (>=1.24.0,<1.25.0)", "mypy-boto3-es (>=1.24.0,<1.25.0)", "mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-emr-containers (>=1.24.0,<1.25.0)", "mypy-boto3-emr (>=1.24.0,<1.25.0)", "mypy-boto3-elbv2 (>=1.24.0,<1.25.0)", "mypy-boto3-elb (>=1.24.0,<1.25.0)", "mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)", "mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)", "mypy-boto3-elasticache (>=1.24.0,<1.25.0)", "mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)", "mypy-boto3-eks (>=1.24.0,<1.25.0)", "mypy-boto3-efs (>=1.24.0,<1.25.0)", "mypy-boto3-ecs (>=1.24.0,<1.25.0)", "mypy-boto3-ecr-public (>=1.24.0,<1.25.0)", "mypy-boto3-ecr (>=1.24.0,<1.25.0)", "mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-ebs 
(>=1.24.0,<1.25.0)", "mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-ds (>=1.24.0,<1.25.0)", "mypy-boto3-drs (>=1.24.0,<1.25.0)", "mypy-boto3-docdb (>=1.24.0,<1.25.0)", "mypy-boto3-dms (>=1.24.0,<1.25.0)", "mypy-boto3-dlm (>=1.24.0,<1.25.0)", "mypy-boto3-discovery (>=1.24.0,<1.25.0)", "mypy-boto3-directconnect (>=1.24.0,<1.25.0)", "mypy-boto3-devops-guru (>=1.24.0,<1.25.0)", "mypy-boto3-devicefarm (>=1.24.0,<1.25.0)", "mypy-boto3-detective (>=1.24.0,<1.25.0)", "mypy-boto3-dax (>=1.24.0,<1.25.0)", "mypy-boto3-datasync (>=1.24.0,<1.25.0)", "mypy-boto3-datapipeline (>=1.24.0,<1.25.0)", "mypy-boto3-dataexchange (>=1.24.0,<1.25.0)", "mypy-boto3-databrew (>=1.24.0,<1.25.0)", "mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)", "mypy-boto3-cur (>=1.24.0,<1.25.0)", "mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)", "mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)", "mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)", "mypy-boto3-connect (>=1.24.0,<1.25.0)", "mypy-boto3-config (>=1.24.0,<1.25.0)", "mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)", "mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)", "mypy-boto3-comprehend (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)", "mypy-boto3-codestar (>=1.24.0,<1.25.0)", "mypy-boto3-codepipeline (>=1.24.0,<1.25.0)", "mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)", "mypy-boto3-codedeploy (>=1.24.0,<1.25.0)", "mypy-boto3-codecommit (>=1.24.0,<1.25.0)", "mypy-boto3-codebuild (>=1.24.0,<1.25.0)", "mypy-boto3-codeartifact (>=1.24.0,<1.25.0)", "mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)", "mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)", "mypy-boto3-cloudfront (>=1.24.0,<1.25.0)", "mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-xray (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces (>=1.24.0,<1.25.0)", "mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)", "mypy-boto3-workmail (>=1.24.0,<1.25.0)", "mypy-boto3-worklink (>=1.24.0,<1.25.0)", "mypy-boto3-workdocs (>=1.24.0,<1.25.0)", "mypy-boto3-wisdom (>=1.24.0,<1.25.0)", "mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)", "mypy-boto3-wafv2 (>=1.24.0,<1.25.0)", "mypy-boto3-waf-regional (>=1.24.0,<1.25.0)", "mypy-boto3-waf (>=1.24.0,<1.25.0)", "mypy-boto3-voice-id (>=1.24.0,<1.25.0)", "mypy-boto3-translate (>=1.24.0,<1.25.0)", "mypy-boto3-transfer (>=1.24.0,<1.25.0)", "mypy-boto3-transcribe (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-write (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-query (>=1.24.0,<1.25.0)", "mypy-boto3-textract (>=1.24.0,<1.25.0)", "mypy-boto3-synthetics (>=1.24.0,<1.25.0)", "mypy-boto3-swf (>=1.24.0,<1.25.0)", "mypy-boto3-support (>=1.24.0,<1.25.0)", "mypy-boto3-sts (>=1.24.0,<1.25.0)", "mypy-boto3-storagegateway (>=1.24.0,<1.25.0)", "mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)", "mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)", "mypy-boto3-sso-admin (>=1.24.0,<1.25.0)", "mypy-boto3-sso (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)", "mypy-boto3-ssm (>=1.24.0,<1.25.0)", "mypy-boto3-sqs 
(>=1.24.0,<1.25.0)", "mypy-boto3-sns (>=1.24.0,<1.25.0)", "mypy-boto3-snowball (>=1.24.0,<1.25.0)", "mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)", "mypy-boto3-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-sms (>=1.24.0,<1.25.0)", "mypy-boto3-signer (>=1.24.0,<1.25.0)", "mypy-boto3-shield (>=1.24.0,<1.25.0)", "mypy-boto3-sesv2 (>=1.24.0,<1.25.0)", "mypy-boto3-ses (>=1.24.0,<1.25.0)", "mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)", "mypy-boto3-service-quotas (>=1.24.0,<1.25.0)", "mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)", "mypy-boto3-securityhub (>=1.24.0,<1.25.0)", "mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)", "mypy-boto3-sdb (>=1.24.0,<1.25.0)", "mypy-boto3-schemas (>=1.24.0,<1.25.0)", "mypy-boto3-savingsplans (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker (>=1.24.0,<1.25.0)", "mypy-boto3-s3outposts (>=1.24.0,<1.25.0)", "mypy-boto3-s3control (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-rum (>=1.24.0,<1.25.0)", "mypy-boto3-route53resolver (>=1.24.0,<1.25.0)", "mypy-boto3-route53domains (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)", "mypy-boto3-route53 (>=1.24.0,<1.25.0)", "mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)", "mypy-boto3-robomaker (>=1.24.0,<1.25.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)", "mypy-boto3-resource-groups (>=1.24.0,<1.25.0)", "mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)", "mypy-boto3-rekognition (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift (>=1.24.0,<1.25.0)", "mypy-boto3-rds-data (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-rbin (>=1.24.0,<1.25.0)", "mypy-boto3-ram (>=1.24.0,<1.25.0)", "mypy-boto3-quicksight (>=1.24.0,<1.25.0)", "mypy-boto3-qldb-session (>=1.24.0,<1.25.0)", "mypy-boto3-qldb (>=1.24.0,<1.25.0)", "mypy-boto3-proton (>=1.24.0,<1.25.0)", "mypy-boto3-privatenetworks (>=1.24.0,<1.25.0)", "mypy-boto3-pricing (>=1.24.0,<1.25.0)", "mypy-boto3-polly (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint (>=1.24.0,<1.25.0)", "mypy-boto3-pi (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-events (>=1.24.0,<1.25.0)", "mypy-boto3-personalize (>=1.24.0,<1.25.0)", "mypy-boto3-panorama (>=1.24.0,<1.25.0)", "mypy-boto3-outposts (>=1.24.0,<1.25.0)", "mypy-boto3-organizations (>=1.24.0,<1.25.0)", "mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)", "mypy-boto3-opsworks (>=1.24.0,<1.25.0)", "mypy-boto3-opensearch (>=1.24.0,<1.25.0)", "mypy-boto3-nimble (>=1.24.0,<1.25.0)", "mypy-boto3-networkmanager (>=1.24.0,<1.25.0)", "mypy-boto3-network-firewall (>=1.24.0,<1.25.0)", "mypy-boto3-neptune (>=1.24.0,<1.25.0)", "mypy-boto3-mwaa (>=1.24.0,<1.25.0)", "mypy-boto3-mturk (>=1.24.0,<1.25.0)", "mypy-boto3-mq (>=1.24.0,<1.25.0)", "mypy-boto3-mobile (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhub-config 
(>=1.24.0,<1.25.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)", "mypy-boto3-mgn (>=1.24.0,<1.25.0)", "mypy-boto3-mgh (>=1.24.0,<1.25.0)", "mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)", "mypy-boto3-memorydb (>=1.24.0,<1.25.0)", "mypy-boto3-mediatailor (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage (>=1.24.0,<1.25.0)", "mypy-boto3-medialive (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)", "mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)", "mypy-boto3-macie2 (>=1.24.0,<1.25.0)", "mypy-boto3-macie (>=1.24.0,<1.25.0)", "mypy-boto3-machinelearning (>=1.24.0,<1.25.0)", "mypy-boto3-m2 (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)", "mypy-boto3-logs (>=1.24.0,<1.25.0)", "mypy-boto3-location (>=1.24.0,<1.25.0)", "mypy-boto3-lightsail (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)", "mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-lex-models (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-lakeformation (>=1.24.0,<1.25.0)", "mypy-boto3-kms (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis (>=1.24.0,<1.25.0)", "mypy-boto3-keyspaces (>=1.24.0,<1.25.0)", "mypy-boto3-kendra (>=1.24.0,<1.25.0)", "mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)", "mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)", "mypy-boto3-cloud9 (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)", "mypy-boto3-chime (>=1.24.0,<1.25.0)", "mypy-boto3-ce (>=1.24.0,<1.25.0)", "mypy-boto3-budgets (>=1.24.0,<1.25.0)", "mypy-boto3-braket (>=1.24.0,<1.25.0)", "mypy-boto3-billingconductor (>=1.24.0,<1.25.0)", "mypy-boto3-batch (>=1.24.0,<1.25.0)", "mypy-boto3-backupstorage (>=1.24.0,<1.25.0)", "mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)", "mypy-boto3-backup (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-auditmanager (>=1.24.0,<1.25.0)", "mypy-boto3-athena (>=1.24.0,<1.25.0)", "mypy-boto3-appsync (>=1.24.0,<1.25.0)", "mypy-boto3-appstream (>=1.24.0,<1.25.0)", "mypy-boto3-apprunner (>=1.24.0,<1.25.0)", "mypy-boto3-appmesh (>=1.24.0,<1.25.0)", "mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-application-insights (>=1.24.0,<1.25.0)", "mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-appintegrations (>=1.24.0,<1.25.0)", "mypy-boto3-appflow (>=1.24.0,<1.25.0)", "mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)", "mypy-boto3-appconfig 
(>=1.24.0,<1.25.0)", "mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)", "mypy-boto3-apigateway (>=1.24.0,<1.25.0)", "mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)", "mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)", "mypy-boto3-amplify (>=1.24.0,<1.25.0)", "mypy-boto3-amp (>=1.24.0,<1.25.0)", "mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)", "mypy-boto3-acm-pca (>=1.24.0,<1.25.0)", "mypy-boto3-acm (>=1.24.0,<1.25.0)", "mypy-boto3-account (>=1.24.0,<1.25.0)", "mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)"] -budgets = ["mypy-boto3-budgets (>=1.24.0,<1.25.0)"] -braket = ["mypy-boto3-braket (>=1.24.0,<1.25.0)"] -billingconductor = ["mypy-boto3-billingconductor (>=1.24.0,<1.25.0)"] -batch = ["mypy-boto3-batch (>=1.24.0,<1.25.0)"] -backupstorage = ["mypy-boto3-backupstorage (>=1.24.0,<1.25.0)"] -backup-gateway = ["mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)"] -backup = ["mypy-boto3-backup (>=1.24.0,<1.25.0)"] -autoscaling-plans = ["mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)"] -autoscaling = ["mypy-boto3-autoscaling (>=1.24.0,<1.25.0)"] -auditmanager = ["mypy-boto3-auditmanager (>=1.24.0,<1.25.0)"] -athena = ["mypy-boto3-athena (>=1.24.0,<1.25.0)"] -appsync = ["mypy-boto3-appsync (>=1.24.0,<1.25.0)"] -appstream = ["mypy-boto3-appstream (>=1.24.0,<1.25.0)"] -apprunner = ["mypy-boto3-apprunner (>=1.24.0,<1.25.0)"] -appmesh = ["mypy-boto3-appmesh (>=1.24.0,<1.25.0)"] -applicationcostprofiler = ["mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)"] -application-insights = ["mypy-boto3-application-insights (>=1.24.0,<1.25.0)"] -application-autoscaling = ["mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)"] -appintegrations = ["mypy-boto3-appintegrations (>=1.24.0,<1.25.0)"] -appflow = ["mypy-boto3-appflow (>=1.24.0,<1.25.0)"] -appconfigdata = ["mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)"] -appconfig = ["mypy-boto3-appconfig (>=1.24.0,<1.25.0)"] -apigatewayv2 = ["mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)"] -apigatewaymanagementapi = ["mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)"] -apigateway = ["mypy-boto3-apigateway (>=1.24.0,<1.25.0)"] -amplifyuibuilder = ["mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)"] -amplifybackend = ["mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)"] -amplify = ["mypy-boto3-amplify (>=1.24.0,<1.25.0)"] -amp = ["mypy-boto3-amp (>=1.24.0,<1.25.0)"] -codeguru-reviewer = ["mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)"] -codedeploy = ["mypy-boto3-codedeploy (>=1.24.0,<1.25.0)"] -codecommit = ["mypy-boto3-codecommit (>=1.24.0,<1.25.0)"] -codebuild = ["mypy-boto3-codebuild (>=1.24.0,<1.25.0)"] -codeartifact = ["mypy-boto3-codeartifact (>=1.24.0,<1.25.0)"] -cloudwatch = ["mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)"] -cloudtrail = ["mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)"] -cloudsearchdomain = ["mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)"] -cloudsearch = ["mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)"] -cloudhsmv2 = ["mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)"] -cloudhsm = ["mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)"] -cloudfront = ["mypy-boto3-cloudfront (>=1.24.0,<1.25.0)"] -cloudformation = ["mypy-boto3-cloudformation (>=1.24.0,<1.25.0)"] -clouddirectory = ["mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)"] -cloudcontrol = ["mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)"] -cloud9 = ["mypy-boto3-cloud9 (>=1.24.0,<1.25.0)"] -chime-sdk-messaging = ["mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)"] -chime-sdk-meetings = ["mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)"] -chime-sdk-media-pipelines = 
["mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)"] -chime-sdk-identity = ["mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)"] -chime = ["mypy-boto3-chime (>=1.24.0,<1.25.0)"] -ce = ["mypy-boto3-ce (>=1.24.0,<1.25.0)"] -alexaforbusiness = ["mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)"] -acm-pca = ["mypy-boto3-acm-pca (>=1.24.0,<1.25.0)"] -acm = ["mypy-boto3-acm (>=1.24.0,<1.25.0)"] -account = ["mypy-boto3-account (>=1.24.0,<1.25.0)"] accessanalyzer = ["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)"] +account = ["mypy-boto3-account (>=1.24.0,<1.25.0)"] +acm = ["mypy-boto3-acm (>=1.24.0,<1.25.0)"] +acm-pca = ["mypy-boto3-acm-pca (>=1.24.0,<1.25.0)"] +alexaforbusiness = ["mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)"] +all = ["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)", "mypy-boto3-account (>=1.24.0,<1.25.0)", "mypy-boto3-acm (>=1.24.0,<1.25.0)", "mypy-boto3-acm-pca (>=1.24.0,<1.25.0)", "mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)", "mypy-boto3-amp (>=1.24.0,<1.25.0)", "mypy-boto3-amplify (>=1.24.0,<1.25.0)", "mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)", "mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)", "mypy-boto3-apigateway (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)", "mypy-boto3-appconfig (>=1.24.0,<1.25.0)", "mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)", "mypy-boto3-appflow (>=1.24.0,<1.25.0)", "mypy-boto3-appintegrations (>=1.24.0,<1.25.0)", "mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-application-insights (>=1.24.0,<1.25.0)", "mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-appmesh (>=1.24.0,<1.25.0)", "mypy-boto3-apprunner (>=1.24.0,<1.25.0)", "mypy-boto3-appstream (>=1.24.0,<1.25.0)", "mypy-boto3-appsync (>=1.24.0,<1.25.0)", "mypy-boto3-athena (>=1.24.0,<1.25.0)", "mypy-boto3-auditmanager (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)", "mypy-boto3-backup (>=1.24.0,<1.25.0)", "mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)", "mypy-boto3-backupstorage (>=1.24.0,<1.25.0)", "mypy-boto3-batch (>=1.24.0,<1.25.0)", "mypy-boto3-billingconductor (>=1.24.0,<1.25.0)", "mypy-boto3-braket (>=1.24.0,<1.25.0)", "mypy-boto3-budgets (>=1.24.0,<1.25.0)", "mypy-boto3-ce (>=1.24.0,<1.25.0)", "mypy-boto3-chime (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)", "mypy-boto3-cloud9 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)", "mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)", "mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-cloudfront (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)", "mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)", "mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)", "mypy-boto3-codeartifact (>=1.24.0,<1.25.0)", "mypy-boto3-codebuild (>=1.24.0,<1.25.0)", "mypy-boto3-codecommit (>=1.24.0,<1.25.0)", "mypy-boto3-codedeploy (>=1.24.0,<1.25.0)", "mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)", "mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-codepipeline (>=1.24.0,<1.25.0)", "mypy-boto3-codestar (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)", 
"mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)", "mypy-boto3-comprehend (>=1.24.0,<1.25.0)", "mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)", "mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)", "mypy-boto3-config (>=1.24.0,<1.25.0)", "mypy-boto3-connect (>=1.24.0,<1.25.0)", "mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)", "mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)", "mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)", "mypy-boto3-cur (>=1.24.0,<1.25.0)", "mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)", "mypy-boto3-databrew (>=1.24.0,<1.25.0)", "mypy-boto3-dataexchange (>=1.24.0,<1.25.0)", "mypy-boto3-datapipeline (>=1.24.0,<1.25.0)", "mypy-boto3-datasync (>=1.24.0,<1.25.0)", "mypy-boto3-dax (>=1.24.0,<1.25.0)", "mypy-boto3-detective (>=1.24.0,<1.25.0)", "mypy-boto3-devicefarm (>=1.24.0,<1.25.0)", "mypy-boto3-devops-guru (>=1.24.0,<1.25.0)", "mypy-boto3-directconnect (>=1.24.0,<1.25.0)", "mypy-boto3-discovery (>=1.24.0,<1.25.0)", "mypy-boto3-dlm (>=1.24.0,<1.25.0)", "mypy-boto3-dms (>=1.24.0,<1.25.0)", "mypy-boto3-docdb (>=1.24.0,<1.25.0)", "mypy-boto3-drs (>=1.24.0,<1.25.0)", "mypy-boto3-ds (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)", "mypy-boto3-ebs (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)", "mypy-boto3-ecr (>=1.24.0,<1.25.0)", "mypy-boto3-ecr-public (>=1.24.0,<1.25.0)", "mypy-boto3-ecs (>=1.24.0,<1.25.0)", "mypy-boto3-efs (>=1.24.0,<1.25.0)", "mypy-boto3-eks (>=1.24.0,<1.25.0)", "mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)", "mypy-boto3-elasticache (>=1.24.0,<1.25.0)", "mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)", "mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)", "mypy-boto3-elb (>=1.24.0,<1.25.0)", "mypy-boto3-elbv2 (>=1.24.0,<1.25.0)", "mypy-boto3-emr (>=1.24.0,<1.25.0)", "mypy-boto3-emr-containers (>=1.24.0,<1.25.0)", "mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-es (>=1.24.0,<1.25.0)", "mypy-boto3-events (>=1.24.0,<1.25.0)", "mypy-boto3-evidently (>=1.24.0,<1.25.0)", "mypy-boto3-finspace (>=1.24.0,<1.25.0)", "mypy-boto3-finspace-data (>=1.24.0,<1.25.0)", "mypy-boto3-firehose (>=1.24.0,<1.25.0)", "mypy-boto3-fis (>=1.24.0,<1.25.0)", "mypy-boto3-fms (>=1.24.0,<1.25.0)", "mypy-boto3-forecast (>=1.24.0,<1.25.0)", "mypy-boto3-forecastquery (>=1.24.0,<1.25.0)", "mypy-boto3-frauddetector (>=1.24.0,<1.25.0)", "mypy-boto3-fsx (>=1.24.0,<1.25.0)", "mypy-boto3-gamelift (>=1.24.0,<1.25.0)", "mypy-boto3-gamesparks (>=1.24.0,<1.25.0)", "mypy-boto3-glacier (>=1.24.0,<1.25.0)", "mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)", "mypy-boto3-glue (>=1.24.0,<1.25.0)", "mypy-boto3-grafana (>=1.24.0,<1.25.0)", "mypy-boto3-greengrass (>=1.24.0,<1.25.0)", "mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)", "mypy-boto3-groundstation (>=1.24.0,<1.25.0)", "mypy-boto3-guardduty (>=1.24.0,<1.25.0)", "mypy-boto3-health (>=1.24.0,<1.25.0)", "mypy-boto3-healthlake (>=1.24.0,<1.25.0)", "mypy-boto3-honeycode (>=1.24.0,<1.25.0)", "mypy-boto3-iam (>=1.24.0,<1.25.0)", "mypy-boto3-identitystore (>=1.24.0,<1.25.0)", "mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)", "mypy-boto3-importexport (>=1.24.0,<1.25.0)", "mypy-boto3-inspector (>=1.24.0,<1.25.0)", "mypy-boto3-inspector2 (>=1.24.0,<1.25.0)", "mypy-boto3-iot (>=1.24.0,<1.25.0)", "mypy-boto3-iot-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)", 
"mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)", "mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)", "mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)", "mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)", "mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)", "mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)", "mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)", "mypy-boto3-iotwireless (>=1.24.0,<1.25.0)", "mypy-boto3-ivs (>=1.24.0,<1.25.0)", "mypy-boto3-ivschat (>=1.24.0,<1.25.0)", "mypy-boto3-kafka (>=1.24.0,<1.25.0)", "mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-kendra (>=1.24.0,<1.25.0)", "mypy-boto3-keyspaces (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)", "mypy-boto3-kms (>=1.24.0,<1.25.0)", "mypy-boto3-lakeformation (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-lex-models (>=1.24.0,<1.25.0)", "mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)", "mypy-boto3-lightsail (>=1.24.0,<1.25.0)", "mypy-boto3-location (>=1.24.0,<1.25.0)", "mypy-boto3-logs (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)", "mypy-boto3-m2 (>=1.24.0,<1.25.0)", "mypy-boto3-machinelearning (>=1.24.0,<1.25.0)", "mypy-boto3-macie (>=1.24.0,<1.25.0)", "mypy-boto3-macie2 (>=1.24.0,<1.25.0)", "mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)", "mypy-boto3-medialive (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)", "mypy-boto3-mediatailor (>=1.24.0,<1.25.0)", "mypy-boto3-memorydb (>=1.24.0,<1.25.0)", "mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)", "mypy-boto3-mgh (>=1.24.0,<1.25.0)", "mypy-boto3-mgn (>=1.24.0,<1.25.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)", "mypy-boto3-mobile (>=1.24.0,<1.25.0)", "mypy-boto3-mq (>=1.24.0,<1.25.0)", "mypy-boto3-mturk (>=1.24.0,<1.25.0)", "mypy-boto3-mwaa (>=1.24.0,<1.25.0)", "mypy-boto3-neptune (>=1.24.0,<1.25.0)", "mypy-boto3-network-firewall (>=1.24.0,<1.25.0)", "mypy-boto3-networkmanager (>=1.24.0,<1.25.0)", "mypy-boto3-nimble (>=1.24.0,<1.25.0)", "mypy-boto3-opensearch (>=1.24.0,<1.25.0)", "mypy-boto3-opsworks (>=1.24.0,<1.25.0)", "mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)", "mypy-boto3-organizations (>=1.24.0,<1.25.0)", "mypy-boto3-outposts (>=1.24.0,<1.25.0)", "mypy-boto3-panorama (>=1.24.0,<1.25.0)", "mypy-boto3-personalize (>=1.24.0,<1.25.0)", 
"mypy-boto3-personalize-events (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-pi (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)", "mypy-boto3-polly (>=1.24.0,<1.25.0)", "mypy-boto3-pricing (>=1.24.0,<1.25.0)", "mypy-boto3-privatenetworks (>=1.24.0,<1.25.0)", "mypy-boto3-proton (>=1.24.0,<1.25.0)", "mypy-boto3-qldb (>=1.24.0,<1.25.0)", "mypy-boto3-qldb-session (>=1.24.0,<1.25.0)", "mypy-boto3-quicksight (>=1.24.0,<1.25.0)", "mypy-boto3-ram (>=1.24.0,<1.25.0)", "mypy-boto3-rbin (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-rds-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-rekognition (>=1.24.0,<1.25.0)", "mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)", "mypy-boto3-resource-groups (>=1.24.0,<1.25.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)", "mypy-boto3-robomaker (>=1.24.0,<1.25.0)", "mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)", "mypy-boto3-route53 (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)", "mypy-boto3-route53domains (>=1.24.0,<1.25.0)", "mypy-boto3-route53resolver (>=1.24.0,<1.25.0)", "mypy-boto3-rum (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-s3control (>=1.24.0,<1.25.0)", "mypy-boto3-s3outposts (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-savingsplans (>=1.24.0,<1.25.0)", "mypy-boto3-schemas (>=1.24.0,<1.25.0)", "mypy-boto3-sdb (>=1.24.0,<1.25.0)", "mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)", "mypy-boto3-securityhub (>=1.24.0,<1.25.0)", "mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)", "mypy-boto3-service-quotas (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)", "mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)", "mypy-boto3-ses (>=1.24.0,<1.25.0)", "mypy-boto3-sesv2 (>=1.24.0,<1.25.0)", "mypy-boto3-shield (>=1.24.0,<1.25.0)", "mypy-boto3-signer (>=1.24.0,<1.25.0)", "mypy-boto3-sms (>=1.24.0,<1.25.0)", "mypy-boto3-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)", "mypy-boto3-snowball (>=1.24.0,<1.25.0)", "mypy-boto3-sns (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)", "mypy-boto3-ssm (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)", "mypy-boto3-sso (>=1.24.0,<1.25.0)", "mypy-boto3-sso-admin (>=1.24.0,<1.25.0)", "mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)", "mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)", "mypy-boto3-storagegateway (>=1.24.0,<1.25.0)", "mypy-boto3-sts (>=1.24.0,<1.25.0)", "mypy-boto3-support (>=1.24.0,<1.25.0)", "mypy-boto3-swf (>=1.24.0,<1.25.0)", "mypy-boto3-synthetics (>=1.24.0,<1.25.0)", "mypy-boto3-textract (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-query (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-write (>=1.24.0,<1.25.0)", "mypy-boto3-transcribe (>=1.24.0,<1.25.0)", "mypy-boto3-transfer 
(>=1.24.0,<1.25.0)", "mypy-boto3-translate (>=1.24.0,<1.25.0)", "mypy-boto3-voice-id (>=1.24.0,<1.25.0)", "mypy-boto3-waf (>=1.24.0,<1.25.0)", "mypy-boto3-waf-regional (>=1.24.0,<1.25.0)", "mypy-boto3-wafv2 (>=1.24.0,<1.25.0)", "mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)", "mypy-boto3-wisdom (>=1.24.0,<1.25.0)", "mypy-boto3-workdocs (>=1.24.0,<1.25.0)", "mypy-boto3-worklink (>=1.24.0,<1.25.0)", "mypy-boto3-workmail (>=1.24.0,<1.25.0)", "mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)", "mypy-boto3-xray (>=1.24.0,<1.25.0)"] +amp = ["mypy-boto3-amp (>=1.24.0,<1.25.0)"] +amplify = ["mypy-boto3-amplify (>=1.24.0,<1.25.0)"] +amplifybackend = ["mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)"] +amplifyuibuilder = ["mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)"] +apigateway = ["mypy-boto3-apigateway (>=1.24.0,<1.25.0)"] +apigatewaymanagementapi = ["mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)"] +apigatewayv2 = ["mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)"] +appconfig = ["mypy-boto3-appconfig (>=1.24.0,<1.25.0)"] +appconfigdata = ["mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)"] +appflow = ["mypy-boto3-appflow (>=1.24.0,<1.25.0)"] +appintegrations = ["mypy-boto3-appintegrations (>=1.24.0,<1.25.0)"] +application-autoscaling = ["mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)"] +application-insights = ["mypy-boto3-application-insights (>=1.24.0,<1.25.0)"] +applicationcostprofiler = ["mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)"] +appmesh = ["mypy-boto3-appmesh (>=1.24.0,<1.25.0)"] +apprunner = ["mypy-boto3-apprunner (>=1.24.0,<1.25.0)"] +appstream = ["mypy-boto3-appstream (>=1.24.0,<1.25.0)"] +appsync = ["mypy-boto3-appsync (>=1.24.0,<1.25.0)"] +athena = ["mypy-boto3-athena (>=1.24.0,<1.25.0)"] +auditmanager = ["mypy-boto3-auditmanager (>=1.24.0,<1.25.0)"] +autoscaling = ["mypy-boto3-autoscaling (>=1.24.0,<1.25.0)"] +autoscaling-plans = ["mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)"] +backup = ["mypy-boto3-backup (>=1.24.0,<1.25.0)"] +backup-gateway = ["mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)"] +backupstorage = ["mypy-boto3-backupstorage (>=1.24.0,<1.25.0)"] +batch = ["mypy-boto3-batch (>=1.24.0,<1.25.0)"] +billingconductor = ["mypy-boto3-billingconductor (>=1.24.0,<1.25.0)"] +braket = ["mypy-boto3-braket (>=1.24.0,<1.25.0)"] +budgets = ["mypy-boto3-budgets (>=1.24.0,<1.25.0)"] +ce = ["mypy-boto3-ce (>=1.24.0,<1.25.0)"] +chime = ["mypy-boto3-chime (>=1.24.0,<1.25.0)"] +chime-sdk-identity = ["mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)"] +chime-sdk-media-pipelines = ["mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)"] +chime-sdk-meetings = ["mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)"] +chime-sdk-messaging = ["mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)"] +cloud9 = ["mypy-boto3-cloud9 (>=1.24.0,<1.25.0)"] +cloudcontrol = ["mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)"] +clouddirectory = ["mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)"] +cloudformation = ["mypy-boto3-cloudformation (>=1.24.0,<1.25.0)"] +cloudfront = ["mypy-boto3-cloudfront (>=1.24.0,<1.25.0)"] +cloudhsm = ["mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)"] +cloudhsmv2 = ["mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)"] +cloudsearch = ["mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)"] +cloudsearchdomain = ["mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)"] +cloudtrail = ["mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)"] +cloudwatch = ["mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)"] +codeartifact = 
["mypy-boto3-codeartifact (>=1.24.0,<1.25.0)"] +codebuild = ["mypy-boto3-codebuild (>=1.24.0,<1.25.0)"] +codecommit = ["mypy-boto3-codecommit (>=1.24.0,<1.25.0)"] +codedeploy = ["mypy-boto3-codedeploy (>=1.24.0,<1.25.0)"] +codeguru-reviewer = ["mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)"] +codeguruprofiler = ["mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)"] +codepipeline = ["mypy-boto3-codepipeline (>=1.24.0,<1.25.0)"] +codestar = ["mypy-boto3-codestar (>=1.24.0,<1.25.0)"] +codestar-connections = ["mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)"] +codestar-notifications = ["mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)"] +cognito-identity = ["mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)"] +cognito-idp = ["mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)"] +cognito-sync = ["mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)"] +comprehend = ["mypy-boto3-comprehend (>=1.24.0,<1.25.0)"] +comprehendmedical = ["mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)"] +compute-optimizer = ["mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)"] +config = ["mypy-boto3-config (>=1.24.0,<1.25.0)"] +connect = ["mypy-boto3-connect (>=1.24.0,<1.25.0)"] +connect-contact-lens = ["mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)"] +connectcampaigns = ["mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)"] +connectparticipant = ["mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)"] +cur = ["mypy-boto3-cur (>=1.24.0,<1.25.0)"] +customer-profiles = ["mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)"] +databrew = ["mypy-boto3-databrew (>=1.24.0,<1.25.0)"] +dataexchange = ["mypy-boto3-dataexchange (>=1.24.0,<1.25.0)"] +datapipeline = ["mypy-boto3-datapipeline (>=1.24.0,<1.25.0)"] +datasync = ["mypy-boto3-datasync (>=1.24.0,<1.25.0)"] +dax = ["mypy-boto3-dax (>=1.24.0,<1.25.0)"] +detective = ["mypy-boto3-detective (>=1.24.0,<1.25.0)"] +devicefarm = ["mypy-boto3-devicefarm (>=1.24.0,<1.25.0)"] +devops-guru = ["mypy-boto3-devops-guru (>=1.24.0,<1.25.0)"] +directconnect = ["mypy-boto3-directconnect (>=1.24.0,<1.25.0)"] +discovery = ["mypy-boto3-discovery (>=1.24.0,<1.25.0)"] +dlm = ["mypy-boto3-dlm (>=1.24.0,<1.25.0)"] +dms = ["mypy-boto3-dms (>=1.24.0,<1.25.0)"] +docdb = ["mypy-boto3-docdb (>=1.24.0,<1.25.0)"] +drs = ["mypy-boto3-drs (>=1.24.0,<1.25.0)"] +ds = ["mypy-boto3-ds (>=1.24.0,<1.25.0)"] +dynamodb = ["mypy-boto3-dynamodb (>=1.24.0,<1.25.0)"] +dynamodbstreams = ["mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)"] +ebs = ["mypy-boto3-ebs (>=1.24.0,<1.25.0)"] +ec2 = ["mypy-boto3-ec2 (>=1.24.0,<1.25.0)"] +ec2-instance-connect = ["mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)"] +ecr = ["mypy-boto3-ecr (>=1.24.0,<1.25.0)"] +ecr-public = ["mypy-boto3-ecr-public (>=1.24.0,<1.25.0)"] +ecs = ["mypy-boto3-ecs (>=1.24.0,<1.25.0)"] +efs = ["mypy-boto3-efs (>=1.24.0,<1.25.0)"] +eks = ["mypy-boto3-eks (>=1.24.0,<1.25.0)"] +elastic-inference = ["mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)"] +elasticache = ["mypy-boto3-elasticache (>=1.24.0,<1.25.0)"] +elasticbeanstalk = ["mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)"] +elastictranscoder = ["mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)"] +elb = ["mypy-boto3-elb (>=1.24.0,<1.25.0)"] +elbv2 = ["mypy-boto3-elbv2 (>=1.24.0,<1.25.0)"] +emr = ["mypy-boto3-emr (>=1.24.0,<1.25.0)"] +emr-containers = ["mypy-boto3-emr-containers (>=1.24.0,<1.25.0)"] +emr-serverless = ["mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)"] +es = ["mypy-boto3-es (>=1.24.0,<1.25.0)"] +essential = ["mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 
(>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)"] +events = ["mypy-boto3-events (>=1.24.0,<1.25.0)"] +evidently = ["mypy-boto3-evidently (>=1.24.0,<1.25.0)"] +finspace = ["mypy-boto3-finspace (>=1.24.0,<1.25.0)"] +finspace-data = ["mypy-boto3-finspace-data (>=1.24.0,<1.25.0)"] +firehose = ["mypy-boto3-firehose (>=1.24.0,<1.25.0)"] +fis = ["mypy-boto3-fis (>=1.24.0,<1.25.0)"] +fms = ["mypy-boto3-fms (>=1.24.0,<1.25.0)"] +forecast = ["mypy-boto3-forecast (>=1.24.0,<1.25.0)"] +forecastquery = ["mypy-boto3-forecastquery (>=1.24.0,<1.25.0)"] +frauddetector = ["mypy-boto3-frauddetector (>=1.24.0,<1.25.0)"] +fsx = ["mypy-boto3-fsx (>=1.24.0,<1.25.0)"] +gamelift = ["mypy-boto3-gamelift (>=1.24.0,<1.25.0)"] +gamesparks = ["mypy-boto3-gamesparks (>=1.24.0,<1.25.0)"] +glacier = ["mypy-boto3-glacier (>=1.24.0,<1.25.0)"] +globalaccelerator = ["mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)"] +glue = ["mypy-boto3-glue (>=1.24.0,<1.25.0)"] +grafana = ["mypy-boto3-grafana (>=1.24.0,<1.25.0)"] +greengrass = ["mypy-boto3-greengrass (>=1.24.0,<1.25.0)"] +greengrassv2 = ["mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)"] +groundstation = ["mypy-boto3-groundstation (>=1.24.0,<1.25.0)"] +guardduty = ["mypy-boto3-guardduty (>=1.24.0,<1.25.0)"] +health = ["mypy-boto3-health (>=1.24.0,<1.25.0)"] +healthlake = ["mypy-boto3-healthlake (>=1.24.0,<1.25.0)"] +honeycode = ["mypy-boto3-honeycode (>=1.24.0,<1.25.0)"] +iam = ["mypy-boto3-iam (>=1.24.0,<1.25.0)"] +identitystore = ["mypy-boto3-identitystore (>=1.24.0,<1.25.0)"] +imagebuilder = ["mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)"] +importexport = ["mypy-boto3-importexport (>=1.24.0,<1.25.0)"] +inspector = ["mypy-boto3-inspector (>=1.24.0,<1.25.0)"] +inspector2 = ["mypy-boto3-inspector2 (>=1.24.0,<1.25.0)"] +iot = ["mypy-boto3-iot (>=1.24.0,<1.25.0)"] +iot-data = ["mypy-boto3-iot-data (>=1.24.0,<1.25.0)"] +iot-jobs-data = ["mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)"] +iot1click-devices = ["mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)"] +iot1click-projects = ["mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)"] +iotanalytics = ["mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)"] +iotdeviceadvisor = ["mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)"] +iotevents = ["mypy-boto3-iotevents (>=1.24.0,<1.25.0)"] +iotevents-data = ["mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)"] +iotfleethub = ["mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)"] +iotsecuretunneling = ["mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)"] +iotsitewise = ["mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)"] +iotthingsgraph = ["mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)"] +iottwinmaker = ["mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)"] +iotwireless = ["mypy-boto3-iotwireless (>=1.24.0,<1.25.0)"] +ivs = ["mypy-boto3-ivs (>=1.24.0,<1.25.0)"] +ivschat = ["mypy-boto3-ivschat (>=1.24.0,<1.25.0)"] +kafka = ["mypy-boto3-kafka (>=1.24.0,<1.25.0)"] +kafkaconnect = ["mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)"] +kendra = ["mypy-boto3-kendra (>=1.24.0,<1.25.0)"] +keyspaces = ["mypy-boto3-keyspaces (>=1.24.0,<1.25.0)"] +kinesis = ["mypy-boto3-kinesis (>=1.24.0,<1.25.0)"] +kinesis-video-archived-media = ["mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)"] +kinesis-video-media = ["mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)"] +kinesis-video-signaling = ["mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)"] +kinesisanalytics = ["mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)"] +kinesisanalyticsv2 
= ["mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)"] +kinesisvideo = ["mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)"] +kms = ["mypy-boto3-kms (>=1.24.0,<1.25.0)"] +lakeformation = ["mypy-boto3-lakeformation (>=1.24.0,<1.25.0)"] +lambda = ["mypy-boto3-lambda (>=1.24.0,<1.25.0)"] +lex-models = ["mypy-boto3-lex-models (>=1.24.0,<1.25.0)"] +lex-runtime = ["mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)"] +lexv2-models = ["mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)"] +lexv2-runtime = ["mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)"] +license-manager = ["mypy-boto3-license-manager (>=1.24.0,<1.25.0)"] +license-manager-user-subscriptions = ["mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)"] +lightsail = ["mypy-boto3-lightsail (>=1.24.0,<1.25.0)"] +location = ["mypy-boto3-location (>=1.24.0,<1.25.0)"] +logs = ["mypy-boto3-logs (>=1.24.0,<1.25.0)"] +lookoutequipment = ["mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)"] +lookoutmetrics = ["mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)"] +lookoutvision = ["mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)"] +m2 = ["mypy-boto3-m2 (>=1.24.0,<1.25.0)"] +machinelearning = ["mypy-boto3-machinelearning (>=1.24.0,<1.25.0)"] +macie = ["mypy-boto3-macie (>=1.24.0,<1.25.0)"] +macie2 = ["mypy-boto3-macie2 (>=1.24.0,<1.25.0)"] +managedblockchain = ["mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)"] +marketplace-catalog = ["mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)"] +marketplace-entitlement = ["mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)"] +marketplacecommerceanalytics = ["mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)"] +mediaconnect = ["mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)"] +mediaconvert = ["mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)"] +medialive = ["mypy-boto3-medialive (>=1.24.0,<1.25.0)"] +mediapackage = ["mypy-boto3-mediapackage (>=1.24.0,<1.25.0)"] +mediapackage-vod = ["mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)"] +mediastore = ["mypy-boto3-mediastore (>=1.24.0,<1.25.0)"] +mediastore-data = ["mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)"] +mediatailor = ["mypy-boto3-mediatailor (>=1.24.0,<1.25.0)"] +memorydb = ["mypy-boto3-memorydb (>=1.24.0,<1.25.0)"] +meteringmarketplace = ["mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)"] +mgh = ["mypy-boto3-mgh (>=1.24.0,<1.25.0)"] +mgn = ["mypy-boto3-mgn (>=1.24.0,<1.25.0)"] +migration-hub-refactor-spaces = ["mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)"] +migrationhub-config = ["mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)"] +migrationhubstrategy = ["mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)"] +mobile = ["mypy-boto3-mobile (>=1.24.0,<1.25.0)"] +mq = ["mypy-boto3-mq (>=1.24.0,<1.25.0)"] +mturk = ["mypy-boto3-mturk (>=1.24.0,<1.25.0)"] +mwaa = ["mypy-boto3-mwaa (>=1.24.0,<1.25.0)"] +neptune = ["mypy-boto3-neptune (>=1.24.0,<1.25.0)"] +network-firewall = ["mypy-boto3-network-firewall (>=1.24.0,<1.25.0)"] +networkmanager = ["mypy-boto3-networkmanager (>=1.24.0,<1.25.0)"] +nimble = ["mypy-boto3-nimble (>=1.24.0,<1.25.0)"] +opensearch = ["mypy-boto3-opensearch (>=1.24.0,<1.25.0)"] +opsworks = ["mypy-boto3-opsworks (>=1.24.0,<1.25.0)"] +opsworkscm = ["mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)"] +organizations = ["mypy-boto3-organizations (>=1.24.0,<1.25.0)"] +outposts = ["mypy-boto3-outposts (>=1.24.0,<1.25.0)"] +panorama = ["mypy-boto3-panorama (>=1.24.0,<1.25.0)"] +personalize = ["mypy-boto3-personalize (>=1.24.0,<1.25.0)"] +personalize-events = ["mypy-boto3-personalize-events (>=1.24.0,<1.25.0)"] +personalize-runtime = 
["mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)"] +pi = ["mypy-boto3-pi (>=1.24.0,<1.25.0)"] +pinpoint = ["mypy-boto3-pinpoint (>=1.24.0,<1.25.0)"] +pinpoint-email = ["mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)"] +pinpoint-sms-voice = ["mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)"] +pinpoint-sms-voice-v2 = ["mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)"] +polly = ["mypy-boto3-polly (>=1.24.0,<1.25.0)"] +pricing = ["mypy-boto3-pricing (>=1.24.0,<1.25.0)"] +privatenetworks = ["mypy-boto3-privatenetworks (>=1.24.0,<1.25.0)"] +proton = ["mypy-boto3-proton (>=1.24.0,<1.25.0)"] +qldb = ["mypy-boto3-qldb (>=1.24.0,<1.25.0)"] +qldb-session = ["mypy-boto3-qldb-session (>=1.24.0,<1.25.0)"] +quicksight = ["mypy-boto3-quicksight (>=1.24.0,<1.25.0)"] +ram = ["mypy-boto3-ram (>=1.24.0,<1.25.0)"] +rbin = ["mypy-boto3-rbin (>=1.24.0,<1.25.0)"] +rds = ["mypy-boto3-rds (>=1.24.0,<1.25.0)"] +rds-data = ["mypy-boto3-rds-data (>=1.24.0,<1.25.0)"] +redshift = ["mypy-boto3-redshift (>=1.24.0,<1.25.0)"] +redshift-data = ["mypy-boto3-redshift-data (>=1.24.0,<1.25.0)"] +redshift-serverless = ["mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)"] +rekognition = ["mypy-boto3-rekognition (>=1.24.0,<1.25.0)"] +resiliencehub = ["mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)"] +resource-groups = ["mypy-boto3-resource-groups (>=1.24.0,<1.25.0)"] +resourcegroupstaggingapi = ["mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)"] +robomaker = ["mypy-boto3-robomaker (>=1.24.0,<1.25.0)"] +rolesanywhere = ["mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)"] +route53 = ["mypy-boto3-route53 (>=1.24.0,<1.25.0)"] +route53-recovery-cluster = ["mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)"] +route53-recovery-control-config = ["mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)"] +route53-recovery-readiness = ["mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)"] +route53domains = ["mypy-boto3-route53domains (>=1.24.0,<1.25.0)"] +route53resolver = ["mypy-boto3-route53resolver (>=1.24.0,<1.25.0)"] +rum = ["mypy-boto3-rum (>=1.24.0,<1.25.0)"] +s3 = ["mypy-boto3-s3 (>=1.24.0,<1.25.0)"] +s3control = ["mypy-boto3-s3control (>=1.24.0,<1.25.0)"] +s3outposts = ["mypy-boto3-s3outposts (>=1.24.0,<1.25.0)"] +sagemaker = ["mypy-boto3-sagemaker (>=1.24.0,<1.25.0)"] +sagemaker-a2i-runtime = ["mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)"] +sagemaker-edge = ["mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)"] +sagemaker-featurestore-runtime = ["mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)"] +sagemaker-runtime = ["mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)"] +savingsplans = ["mypy-boto3-savingsplans (>=1.24.0,<1.25.0)"] +schemas = ["mypy-boto3-schemas (>=1.24.0,<1.25.0)"] +sdb = ["mypy-boto3-sdb (>=1.24.0,<1.25.0)"] +secretsmanager = ["mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)"] +securityhub = ["mypy-boto3-securityhub (>=1.24.0,<1.25.0)"] +serverlessrepo = ["mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)"] +service-quotas = ["mypy-boto3-service-quotas (>=1.24.0,<1.25.0)"] +servicecatalog = ["mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)"] +servicecatalog-appregistry = ["mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)"] +servicediscovery = ["mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)"] +ses = ["mypy-boto3-ses (>=1.24.0,<1.25.0)"] +sesv2 = ["mypy-boto3-sesv2 (>=1.24.0,<1.25.0)"] +shield = ["mypy-boto3-shield (>=1.24.0,<1.25.0)"] +signer = ["mypy-boto3-signer (>=1.24.0,<1.25.0)"] +sms = ["mypy-boto3-sms (>=1.24.0,<1.25.0)"] +sms-voice = ["mypy-boto3-sms-voice 
(>=1.24.0,<1.25.0)"] +snow-device-management = ["mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)"] +snowball = ["mypy-boto3-snowball (>=1.24.0,<1.25.0)"] +sns = ["mypy-boto3-sns (>=1.24.0,<1.25.0)"] +sqs = ["mypy-boto3-sqs (>=1.24.0,<1.25.0)"] +ssm = ["mypy-boto3-ssm (>=1.24.0,<1.25.0)"] +ssm-contacts = ["mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)"] +ssm-incidents = ["mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)"] +sso = ["mypy-boto3-sso (>=1.24.0,<1.25.0)"] +sso-admin = ["mypy-boto3-sso-admin (>=1.24.0,<1.25.0)"] +sso-oidc = ["mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)"] +stepfunctions = ["mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)"] +storagegateway = ["mypy-boto3-storagegateway (>=1.24.0,<1.25.0)"] +sts = ["mypy-boto3-sts (>=1.24.0,<1.25.0)"] +support = ["mypy-boto3-support (>=1.24.0,<1.25.0)"] +swf = ["mypy-boto3-swf (>=1.24.0,<1.25.0)"] +synthetics = ["mypy-boto3-synthetics (>=1.24.0,<1.25.0)"] +textract = ["mypy-boto3-textract (>=1.24.0,<1.25.0)"] +timestream-query = ["mypy-boto3-timestream-query (>=1.24.0,<1.25.0)"] +timestream-write = ["mypy-boto3-timestream-write (>=1.24.0,<1.25.0)"] +transcribe = ["mypy-boto3-transcribe (>=1.24.0,<1.25.0)"] +transfer = ["mypy-boto3-transfer (>=1.24.0,<1.25.0)"] +translate = ["mypy-boto3-translate (>=1.24.0,<1.25.0)"] +voice-id = ["mypy-boto3-voice-id (>=1.24.0,<1.25.0)"] +waf = ["mypy-boto3-waf (>=1.24.0,<1.25.0)"] +waf-regional = ["mypy-boto3-waf-regional (>=1.24.0,<1.25.0)"] +wafv2 = ["mypy-boto3-wafv2 (>=1.24.0,<1.25.0)"] +wellarchitected = ["mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)"] +wisdom = ["mypy-boto3-wisdom (>=1.24.0,<1.25.0)"] +workdocs = ["mypy-boto3-workdocs (>=1.24.0,<1.25.0)"] +worklink = ["mypy-boto3-worklink (>=1.24.0,<1.25.0)"] +workmail = ["mypy-boto3-workmail (>=1.24.0,<1.25.0)"] +workmailmessageflow = ["mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)"] +workspaces = ["mypy-boto3-workspaces (>=1.24.0,<1.25.0)"] +workspaces-web = ["mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)"] +xray = ["mypy-boto3-xray (>=1.24.0,<1.25.0)"] [[package]] name = "botocore" @@ -713,6 +735,20 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "isort" +version = "5.10.1" +description = "A Python utility / library to sort Python imports." +category = "dev" +optional = false +python-versions = ">=3.6.1,<4.0" + +[package.extras] +pipfile_deprecated_finder = ["pipreqs", "requirementslib"] +requirements_deprecated_finder = ["pipreqs", "pip-api"] +colors = ["colorama (>=0.4.3,<0.5.0)"] +plugins = ["setuptools"] + [[package]] name = "itsdangerous" version = "2.1.2" @@ -994,6 +1030,14 @@ python-versions = ">=3.6" [package.dependencies] pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" +[[package]] +name = "pathspec" +version = "0.9.0" +description = "Utility library for gitignore style pattern matching of file paths." +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" + [[package]] name = "pbr" version = "5.9.0" @@ -1002,6 +1046,18 @@ category = "main" optional = false python-versions = ">=2.6" +[[package]] +name = "platformdirs" +version = "2.5.2" +description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
+category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx-autodoc-typehints (>=1.12)", "sphinx (>=4)"] +test = ["appdirs (==1.4.4)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)", "pytest (>=6)"] + [[package]] name = "pluggy" version = "1.0.0" @@ -1466,14 +1522,6 @@ category = "main" optional = false python-versions = ">=3.4" -[[package]] -name = "yapf" -version = "0.31.0" -description = "A formatter for Python code." -category = "dev" -optional = false -python-versions = "*" - [[package]] name = "zipp" version = "3.8.1" @@ -1489,7 +1537,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "453b90e40481ca6e4395e84beb73489b58c0983e826e369eb0f412ef633ea5e1" +content-hash = "497b963e7a2f80a751ccd201e950cf533caddb6c7c96163c94cea69874840843" [metadata.files] aiopg = [ @@ -1543,13 +1591,38 @@ backoff = [ {file = "backoff-1.11.1-py2.py3-none-any.whl", hash = "sha256:61928f8fa48d52e4faa81875eecf308eccfb1016b018bb6bd21e05b5d90a96c5"}, {file = "backoff-1.11.1.tar.gz", hash = "sha256:ccb962a2378418c667b3c979b504fdeb7d9e0d29c0579e3b13b86467177728cb"}, ] +black = [ + {file = "black-22.6.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f586c26118bc6e714ec58c09df0157fe2d9ee195c764f630eb0d8e7ccce72e69"}, + {file = "black-22.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b270a168d69edb8b7ed32c193ef10fd27844e5c60852039599f9184460ce0807"}, + {file = "black-22.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6797f58943fceb1c461fb572edbe828d811e719c24e03375fd25170ada53825e"}, + {file = "black-22.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c85928b9d5f83b23cee7d0efcb310172412fbf7cb9d9ce963bd67fd141781def"}, + {file = "black-22.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:f6fe02afde060bbeef044af7996f335fbe90b039ccf3f5eb8f16df8b20f77666"}, + {file = "black-22.6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cfaf3895a9634e882bf9d2363fed5af8888802d670f58b279b0bece00e9a872d"}, + {file = "black-22.6.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94783f636bca89f11eb5d50437e8e17fbc6a929a628d82304c80fa9cd945f256"}, + {file = "black-22.6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:2ea29072e954a4d55a2ff58971b83365eba5d3d357352a07a7a4df0d95f51c78"}, + {file = "black-22.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e439798f819d49ba1c0bd9664427a05aab79bfba777a6db94fd4e56fae0cb849"}, + {file = "black-22.6.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:187d96c5e713f441a5829e77120c269b6514418f4513a390b0499b0987f2ff1c"}, + {file = "black-22.6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:074458dc2f6e0d3dab7928d4417bb6957bb834434516f21514138437accdbe90"}, + {file = "black-22.6.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a218d7e5856f91d20f04e931b6f16d15356db1c846ee55f01bac297a705ca24f"}, + {file = "black-22.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:568ac3c465b1c8b34b61cd7a4e349e93f91abf0f9371eda1cf87194663ab684e"}, + {file = "black-22.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6c1734ab264b8f7929cef8ae5f900b85d579e6cbfde09d7387da8f04771b51c6"}, + {file = "black-22.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c9a3ac16efe9ec7d7381ddebcc022119794872abce99475345c5a61aa18c45ad"}, + {file = "black-22.6.0-cp38-cp38-win_amd64.whl", hash = 
"sha256:b9fd45787ba8aa3f5e0a0a98920c1012c884622c6c920dbe98dbd05bc7c70fbf"}, + {file = "black-22.6.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7ba9be198ecca5031cd78745780d65a3f75a34b2ff9be5837045dce55db83d1c"}, + {file = "black-22.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a3db5b6409b96d9bd543323b23ef32a1a2b06416d525d27e0f67e74f1446c8f2"}, + {file = "black-22.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:560558527e52ce8afba936fcce93a7411ab40c7d5fe8c2463e279e843c0328ee"}, + {file = "black-22.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b154e6bbde1e79ea3260c4b40c0b7b3109ffcdf7bc4ebf8859169a6af72cd70b"}, + {file = "black-22.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:4af5bc0e1f96be5ae9bd7aaec219c901a94d6caa2484c21983d043371c733fc4"}, + {file = "black-22.6.0-py3-none-any.whl", hash = "sha256:ac609cf8ef5e7115ddd07d85d988d074ed00e10fbc3445aee393e70164a2219c"}, + {file = "black-22.6.0.tar.gz", hash = "sha256:6c6d39e28aed379aec40da1c65434c77d75e65bb59a1e1c283de545fb4e7c6c9"}, +] boto3 = [ {file = "boto3-1.24.38-py3-none-any.whl", hash = "sha256:bcf97fd7c494f4e2bbbe2511625500654179c0a6b3bea977d46f97af764e85a4"}, {file = "boto3-1.24.38.tar.gz", hash = "sha256:f4c6b025f392c934338c7f01badfddbd0d3cf2397ff5df35c31409798dce33f5"}, ] boto3-stubs = [ - {file = "boto3-stubs-1.24.51.tar.gz", hash = "sha256:ea69c707e9ceab7c11cab1f11fb4bbe98fa5ff8da593f888946d297daa083870"}, - {file = "boto3_stubs-1.24.51-py3-none-any.whl", hash = "sha256:432aebdb18e7c26bf2b148e04eb33e145976cb932bfe0f72b2d512e945927e57"}, + {file = "boto3-stubs-1.24.56.tar.gz", hash = "sha256:02e11b3669481469b45eee53fa5e0b587e5710f86bb95bd40667d1353d1e4bf6"}, + {file = "boto3_stubs-1.24.56-py3-none-any.whl", hash = "sha256:e5df3a68ddb8299404f63d19decc1f706ebdac64f3133c1e1cab747820337a75"}, ] botocore = [ {file = "botocore-1.27.38-py3-none-any.whl", hash = "sha256:46a0264ff3335496bd9cb404f83ec0d8eb7bfdef8f74a830c13e6a6b9612adea"}, @@ -1713,6 +1786,10 @@ iniconfig = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, ] +isort = [ + {file = "isort-5.10.1-py3-none-any.whl", hash = "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7"}, + {file = "isort-5.10.1.tar.gz", hash = "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951"}, +] itsdangerous = [ {file = "itsdangerous-2.1.2-py3-none-any.whl", hash = "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44"}, {file = "itsdangerous-2.1.2.tar.gz", hash = "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a"}, @@ -1851,10 +1928,18 @@ packaging = [ {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, ] +pathspec = [ + {file = "pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"}, + {file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"}, +] pbr = [ {file = "pbr-5.9.0-py2.py3-none-any.whl", hash = "sha256:e547125940bcc052856ded43be8e101f63828c2d94239ffbe2b327ba3d5ccf0a"}, {file = "pbr-5.9.0.tar.gz", hash = 
"sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"}, ] +platformdirs = [ + {file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"}, + {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"}, +] pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, @@ -2208,10 +2293,6 @@ xmltodict = [ {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, ] -yapf = [ - {file = "yapf-0.31.0-py2.py3-none-any.whl", hash = "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"}, - {file = "yapf-0.31.0.tar.gz", hash = "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d"}, -] zipp = [ {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, diff --git a/pre-commit.py b/pre-commit.py index ea6a22a7fe..45f140d43a 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 -from typing import List +import argparse +import enum import subprocess import sys -import enum -import argparse -import os +from typing import List @enum.unique @@ -37,12 +36,17 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str: return cmd -def yapf(fix_inplace: bool) -> str: - cmd = "poetry run yapf --recursive" - if fix_inplace: - cmd += " --in-place" - else: - cmd += " --diff" +def black(fix_inplace: bool) -> str: + cmd = "poetry run black" + if not fix_inplace: + cmd += " --diff --check" + return cmd + + +def isort(fix_inplace: bool) -> str: + cmd = "poetry run isort" + if not fix_inplace: + cmd += " --diff --check" return cmd @@ -71,11 +75,13 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color: else: print("Please inspect the output below and run make fmt to fix automatically.") if suffix == ".py": - print("If the output is empty, ensure that you've installed Python tooling by\n" - "running './scripts/pysync' in the current directory (no root needed)") + print( + "If the output is empty, ensure that you've installed Python tooling by\n" + "running './scripts/pysync' in the current directory (no root needed)" + ) print() print(res.stdout.decode()) - exit(1) + sys.exit(1) print(colorify("[OK]", Color.GREEN, no_color)) @@ -83,10 +89,12 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color: if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--fix-inplace", action="store_true", help="apply fixes inplace") - parser.add_argument("--no-color", - action="store_true", - help="disable colored output", - default=not sys.stdout.isatty()) + parser.add_argument( + "--no-color", + action="store_true", + help="disable colored output", + default=not sys.stdout.isatty(), + ) args = parser.parse_args() files = get_commit_files() @@ -101,9 +109,16 @@ if __name__ == "__main__": no_color=args.no_color, ) check( - name="yapf", + name="isort", suffix=".py", - 
cmd=yapf(fix_inplace=args.fix_inplace), + cmd=isort(fix_inplace=args.fix_inplace), + changed_files=files, + no_color=args.no_color, + ) + check( + name="black", + suffix=".py", + cmd=black(fix_inplace=args.fix_inplace), changed_files=files, no_color=args.no_color, ) diff --git a/pyproject.toml b/pyproject.toml index a54dbe9ebd..4f8a49a024 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,10 +30,50 @@ pytest-order = "^1.0.1" allure-pytest = "^2.9.45" [tool.poetry.dev-dependencies] -yapf = "==0.31.0" flake8 = "^3.9.2" mypy = "==0.971" +black = "^22.6.0" +isort = "^5.10.1" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" + +[tool.black] +line-length = 100 +extend-exclude = ''' +/( + vendor +)/ +''' + +[tool.isort] +profile = "black" +line_length = 100 +skip_gitignore = true +skip = [ + "vendor", +] + +[tool.mypy] +# mypy uses regex +exclude = "^vendor/" +# some tests don't typecheck when this flag is set +check_untyped_defs = false +# Help mypy find imports when running against list of individual files. +# Without this line it would behave differently when executed on the entire project. +mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner" + +disallow_incomplete_defs = false +disallow_untyped_calls = false +disallow_untyped_decorators = false +disallow_untyped_defs = false +strict = true + +[[tool.mypy.overrides]] +module = [ + "asyncpg.*", + "cached_property.*", + "pg8000.*", +] +ignore_missing_imports = true diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 7f8c45c8c3..0000000000 --- a/setup.cfg +++ /dev/null @@ -1,43 +0,0 @@ -# Just trying to gather linter settings in one file. -# I wonder if there's a way to de-duplicate them... - -[flake8] -max-line-length = 100 - -[pycodestyle] -max-line-length = 100 - -[yapf] -based_on_style = pep8 -column_limit = 100 -split_all_top_level_comma_separated_values = true - -[mypy] -# mypy uses regex -exclude = ^vendor/ -# some tests don't typecheck when this flag is set -check_untyped_defs = false - -# Help mypy find imports when running against list of individual files. -# Without this line it would behave differently when executed on the entire project. 
-mypy_path = $MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner - -disallow_incomplete_defs = false -disallow_untyped_calls = false -disallow_untyped_decorators = false -disallow_untyped_defs = false -strict = true - -[mypy-asyncpg.*] -# There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577 -ignore_missing_imports = true - -[mypy-pg8000.*] -# Used only in testing clients -ignore_missing_imports = true - -[mypy-cached_property.*] -ignore_missing_imports = true - -[mypy-pytest.*] -ignore_missing_imports = true From 4c2bb43775947775401cbb9d774823c5723a91f8 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 18 Aug 2022 13:37:28 +0100 Subject: [PATCH 0670/1022] Reformat all python files by black & isort --- scripts/coverage | 13 +- scripts/export_import_between_pageservers.py | 347 +++--- scripts/generate_perf_report_page.py | 152 +-- scripts/git-upload | 11 +- scripts/ingest_perf_test_result.py | 82 +- .../batch_others/test_ancestor_branch.py | 71 +- test_runner/batch_others/test_auth.py | 42 +- test_runner/batch_others/test_backpressure.py | 59 +- .../batch_others/test_basebackup_error.py | 3 +- .../batch_others/test_branch_and_gc.py | 101 +- .../batch_others/test_branch_behind.py | 92 +- test_runner/batch_others/test_branching.py | 58 +- .../batch_others/test_broken_timeline.py | 33 +- .../batch_others/test_clog_truncate.py | 55 +- test_runner/batch_others/test_close_fds.py | 28 +- test_runner/batch_others/test_config.py | 14 +- .../batch_others/test_crafted_wal_end.py | 72 +- test_runner/batch_others/test_createdropdb.py | 62 +- test_runner/batch_others/test_createuser.py | 18 +- test_runner/batch_others/test_fsm_truncate.py | 7 +- test_runner/batch_others/test_fullbackup.py | 49 +- .../batch_others/test_gc_aggressive.py | 20 +- test_runner/batch_others/test_import.py | 168 +-- test_runner/batch_others/test_large_schema.py | 19 +- test_runner/batch_others/test_lsn_mapping.py | 32 +- test_runner/batch_others/test_multixact.py | 31 +- test_runner/batch_others/test_neon_cli.py | 35 +- test_runner/batch_others/test_next_xid.py | 12 +- test_runner/batch_others/test_normal_work.py | 24 +- .../batch_others/test_old_request_lsn.py | 30 +- .../batch_others/test_pageserver_api.py | 138 ++- .../batch_others/test_pageserver_catchup.py | 27 +- .../batch_others/test_pageserver_restart.py | 22 +- .../batch_others/test_parallel_copy.py | 14 +- test_runner/batch_others/test_pitr_gc.py | 36 +- test_runner/batch_others/test_proxy.py | 23 +- .../batch_others/test_read_validation.py | 57 +- .../batch_others/test_readonly_node.py | 78 +- test_runner/batch_others/test_recovery.py | 15 +- .../batch_others/test_remote_storage.py | 69 +- test_runner/batch_others/test_subxacts.py | 22 +- test_runner/batch_others/test_tenant_conf.py | 65 +- .../batch_others/test_tenant_detach.py | 38 +- .../batch_others/test_tenant_relocation.py | 179 +-- test_runner/batch_others/test_tenant_tasks.py | 7 +- test_runner/batch_others/test_tenants.py | 52 +- .../test_tenants_with_remote_storage.py | 30 +- .../batch_others/test_timeline_delete.py | 47 +- .../batch_others/test_timeline_size.py | 230 ++-- test_runner/batch_others/test_twophase.py | 30 +- test_runner/batch_others/test_vm_bits.py | 54 +- test_runner/batch_others/test_wal_acceptor.py | 418 ++++--- .../batch_others/test_wal_acceptor_async.py | 267 ++-- test_runner/batch_others/test_wal_restore.py | 46 +- .../batch_pg_regress/test_isolation.py | 35 +- .../batch_pg_regress/test_neon_regress.py | 43 +- 
.../batch_pg_regress/test_pg_regress.py | 39 +- test_runner/conftest.py | 12 +- test_runner/fixtures/benchmark_fixture.py | 161 ++- test_runner/fixtures/compare_fixtures.py | 105 +- test_runner/fixtures/log_helper.py | 15 +- test_runner/fixtures/metrics.py | 8 +- test_runner/fixtures/neon_fixtures.py | 1093 +++++++++-------- test_runner/fixtures/pg_stats.py | 36 +- test_runner/fixtures/slow.py | 1 + test_runner/fixtures/utils.py | 42 +- .../performance/test_branch_creation.py | 65 +- test_runner/performance/test_bulk_insert.py | 11 +- .../performance/test_bulk_tenant_create.py | 28 +- .../performance/test_compare_pg_stats.py | 79 +- test_runner/performance/test_copy.py | 31 +- test_runner/performance/test_dup_key.py | 26 +- test_runner/performance/test_gist_build.py | 9 +- test_runner/performance/test_hot_page.py | 24 +- test_runner/performance/test_hot_table.py | 20 +- .../performance/test_parallel_copy_to.py | 28 +- test_runner/performance/test_perf_pgbench.py | 83 +- test_runner/performance/test_random_writes.py | 35 +- test_runner/performance/test_seqscans.py | 30 +- test_runner/performance/test_startup.py | 17 +- .../performance/test_wal_backpressure.py | 146 ++- .../performance/test_write_amplification.py | 15 +- test_runner/pg_clients/test_pg_clients.py | 12 +- test_runner/test_broken.py | 16 +- 84 files changed, 3282 insertions(+), 2687 deletions(-) diff --git a/scripts/coverage b/scripts/coverage index f2c46d9ae9..af0d067419 100755 --- a/scripts/coverage +++ b/scripts/coverage @@ -9,13 +9,6 @@ # * https://github.com/taiki-e/cargo-llvm-cov # * https://github.com/llvm/llvm-project/tree/main/llvm/test/tools/llvm-cov -from abc import ABC, abstractmethod -from dataclasses import dataclass -from pathlib import Path -from tempfile import TemporaryDirectory -from textwrap import dedent -from typing import Any, Dict, Iterator, Iterable, List, Optional - import argparse import hashlib import json @@ -24,6 +17,12 @@ import shutil import socket import subprocess import sys +from abc import ABC, abstractmethod +from dataclasses import dataclass +from pathlib import Path +from tempfile import TemporaryDirectory +from textwrap import dedent +from typing import Any, Dict, Iterable, Iterator, List, Optional def file_mtime_or_zero(path: Path) -> int: diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 96f1d36ddb..5b9fc76768 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -20,20 +20,21 @@ # For more context on how to use this, see: # https://github.com/neondatabase/cloud/wiki/Storage-format-migration -import os -from os import path -import shutil -from pathlib import Path -import tempfile -from contextlib import closing -import psycopg2 -import subprocess import argparse +import os +import shutil +import subprocess +import tempfile import time -import requests import uuid +from contextlib import closing +from os import path +from pathlib import Path +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, TypeVar, Union, cast + +import psycopg2 +import requests from psycopg2.extensions import connection as PgConnection -from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple ############################################### ### client-side utils copied from test fixtures @@ -45,7 +46,7 @@ _global_counter = 0 def global_counter() -> int: - """ A really dumb global counter. + """A really dumb global counter. 
This is useful for giving output files a unique number, so if we run the same command multiple times we can keep their output separate. """ @@ -55,7 +56,7 @@ def global_counter() -> int: def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: - """ Run a process and capture its output + """Run a process and capture its output Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" where "cmd" is the name of the program and NNN is an incrementing counter. @@ -63,13 +64,13 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: Returns basepath for files with captured output. """ assert type(cmd) is list - base = os.path.basename(cmd[0]) + '_{}'.format(global_counter()) + base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) basepath = os.path.join(capture_dir, base) - stdout_filename = basepath + '.stdout' - stderr_filename = basepath + '.stderr' + stdout_filename = basepath + ".stdout" + stderr_filename = basepath + ".stderr" - with open(stdout_filename, 'w') as stdout_f: - with open(stderr_filename, 'w') as stderr_f: + with open(stdout_filename, "w") as stdout_f: + with open(stderr_filename, "w") as stderr_f: print('(capturing output to "{}.stdout")'.format(base)) subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) @@ -77,15 +78,16 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: class PgBin: - """ A helper class for executing postgres binaries """ + """A helper class for executing postgres binaries""" + def __init__(self, log_dir: Path, pg_distrib_dir): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), 'bin') + self.pg_bin_path = os.path.join(str(pg_distrib_dir), "bin") self.env = os.environ.copy() - self.env['LD_LIBRARY_PATH'] = os.path.join(str(pg_distrib_dir), 'lib') + self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), "lib") def _fixpath(self, command: List[str]): - if '/' not in command[0]: + if "/" not in command[0]: command[0] = os.path.join(self.pg_bin_path, command[0]) def _build_env(self, env_add: Optional[Env]) -> Env: @@ -106,15 +108,17 @@ class PgBin: """ self._fixpath(command) - print('Running command "{}"'.format(' '.join(command))) + print('Running command "{}"'.format(" ".join(command))) env = self._build_env(env) subprocess.run(command, env=env, cwd=cwd, check=True) - def run_capture(self, - command: List[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, - **kwargs: Any) -> str: + def run_capture( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[str] = None, + **kwargs: Any, + ) -> str: """ Run one of the postgres binaries, with stderr and stdout redirected to a file. This is just like `run`, but for chatty programs. 
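
As a concrete illustration of the PgBin helper's two entry points, here is a small usage sketch (not from the patch itself); it assumes the PgBin class from this script is in scope and a Postgres install under the script's default /usr/local prefix, with a temporary directory standing in for the real --work-dir argument.

import tempfile
from pathlib import Path

# Hypothetical scratch directory; the real script uses args.work_dir.
log_dir = Path(tempfile.mkdtemp())
pg_bin = PgBin(log_dir, "/usr/local")

# run() resolves a bare binary name against <pg_distrib_dir>/bin, points
# LD_LIBRARY_PATH at <pg_distrib_dir>/lib, and streams output to the console.
pg_bin.run(["psql", "--version"])

# run_capture() redirects stdout/stderr to numbered psql_N.stdout / psql_N.stderr
# files under log_dir and returns their common basepath.
base = pg_bin.run_capture(["psql", "--version"])
print(open(base + ".stdout").read())
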
Returns basepath for files @@ -122,35 +126,33 @@ class PgBin: """ self._fixpath(command) - print('Running command "{}"'.format(' '.join(command))) + print('Running command "{}"'.format(" ".join(command))) env = self._build_env(env) - return subprocess_capture(str(self.log_dir), - command, - env=env, - cwd=cwd, - check=True, - **kwargs) + return subprocess_capture( + str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs + ) class PgProtocol: - """ Reusable connection logic """ + """Reusable connection logic""" + def __init__(self, **kwargs): self.default_options = kwargs def conn_options(self, **kwargs): conn_options = self.default_options.copy() - if 'dsn' in kwargs: - conn_options.update(parse_dsn(kwargs['dsn'])) + if "dsn" in kwargs: + conn_options.update(parse_dsn(kwargs["dsn"])) conn_options.update(kwargs) # Individual statement timeout in seconds. 2 minutes should be # enough for our tests, but if you need a longer, you can # change it by calling "SET statement_timeout" after # connecting. - if 'options' in conn_options: - conn_options['options'] = f"-cstatement_timeout=120s " + conn_options['options'] + if "options" in conn_options: + conn_options["options"] = f"-cstatement_timeout=120s " + conn_options["options"] else: - conn_options['options'] = "-cstatement_timeout=120s" + conn_options["options"] = "-cstatement_timeout=120s" return conn_options # autocommit=True here by default because that's what we need most of the time @@ -194,18 +196,18 @@ class PgProtocol: class VanillaPostgres(PgProtocol): def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): - super().__init__(host='localhost', port=port, dbname='postgres') + super().__init__(host="localhost", port=port, dbname="postgres") self.pgdatadir = pgdatadir self.pg_bin = pg_bin self.running = False if init: - self.pg_bin.run_capture(['initdb', '-D', str(pgdatadir)]) + self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) self.configure([f"port = {port}\n"]) def configure(self, options: List[str]): """Append lines into postgresql.conf file.""" assert not self.running - with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file: + with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: conf_file.write("\n".join(options)) def start(self, log_path: Optional[str] = None): @@ -216,12 +218,13 @@ class VanillaPostgres(PgProtocol): log_path = os.path.join(self.pgdatadir, "pg.log") self.pg_bin.run_capture( - ['pg_ctl', '-w', '-D', str(self.pgdatadir), '-l', log_path, 'start']) + ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] + ) def stop(self): assert self.running self.running = False - self.pg_bin.run_capture(['pg_ctl', '-w', '-D', str(self.pgdatadir), 'stop']) + self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) def __enter__(self): return self @@ -246,9 +249,9 @@ class NeonPageserverHttpClient(requests.Session): res.raise_for_status() except requests.RequestException as e: try: - msg = res.json()['msg'] + msg = res.json()["msg"] except: - msg = '' + msg = "" raise NeonPageserverApiException(msg) from e def check_status(self): @@ -265,17 +268,17 @@ class NeonPageserverHttpClient(requests.Session): res = self.post( f"http://{self.host}:{self.port}/v1/tenant", json={ - 'new_tenant_id': new_tenant_id.hex, + "new_tenant_id": new_tenant_id.hex, }, ) if res.status_code == 409: if ok_if_exists: - print(f'could not create tenant: already exists for id {new_tenant_id}') + print(f"could not create tenant: already exists for id 
{new_tenant_id}") else: res.raise_for_status() elif res.status_code == 201: - print(f'created tenant {new_tenant_id}') + print(f"created tenant {new_tenant_id}") else: self.verbose_error(res) @@ -299,47 +302,55 @@ class NeonPageserverHttpClient(requests.Session): def lsn_to_hex(num: int) -> str: - """ Convert lsn from int to standard hex notation. """ - return "{:X}/{:X}".format(num >> 32, num & 0xffffffff) + """Convert lsn from int to standard hex notation.""" + return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF) def lsn_from_hex(lsn_hex: str) -> int: - """ Convert lsn from hex notation to int. """ - l, r = lsn_hex.split('/') + """Convert lsn from hex notation to int.""" + l, r = lsn_hex.split("/") return (int(l, 16) << 32) + int(r, 16) -def remote_consistent_lsn(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID) -> int: +def remote_consistent_lsn( + pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID +) -> int: detail = pageserver_http_client.timeline_detail(tenant, timeline) - if detail['remote'] is None: + if detail["remote"] is None: # No remote information at all. This happens right after creating # a timeline, before any part of it has been uploaded to remote # storage yet. return 0 else: - lsn_str = detail['remote']['remote_consistent_lsn'] + lsn_str = detail["remote"]["remote_consistent_lsn"] assert isinstance(lsn_str, str) return lsn_from_hex(lsn_str) -def wait_for_upload(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int): +def wait_for_upload( + pageserver_http_client: NeonPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID, + lsn: int, +): """waits for local timeline upload up to specified lsn""" for i in range(10): current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) if current_lsn >= lsn: return - print("waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1)) + print( + "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 + ) + ) time.sleep(1) - raise Exception("timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn))) + raise Exception( + "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn) + ) + ) ############## @@ -399,7 +410,7 @@ def reconstruct_paths(log_dir, pg_bin, base_tar): # Add all template0copy paths to template0 prefix = f"base/{oid}/" if filepath.startswith(prefix): - suffix = filepath[len(prefix):] + suffix = filepath[len(prefix) :] yield f"base/{template0_oid}/{suffix}" elif filepath.startswith("global"): print(f"skipping {database} global file {filepath}") @@ -451,15 +462,17 @@ def get_rlsn(pageserver_connstr, tenant_id, timeline_id): return last_lsn, prev_lsn -def import_timeline(args, - psql_path, - pageserver_connstr, - pageserver_http, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename): +def import_timeline( + args, + psql_path, + pageserver_connstr, + pageserver_http, + tenant_id, + timeline_id, + last_lsn, + prev_lsn, + tar_filename, +): # Import timelines to new pageserver import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn}" full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ @@ -469,34 +482,30 @@ def 
import_timeline(args, print(f"Running: {full_cmd}") - with open(stdout_filename, 'w') as stdout_f: - with open(stderr_filename2, 'w') as stderr_f: + with open(stdout_filename, "w") as stdout_f: + with open(stderr_filename2, "w") as stderr_f: print(f"(capturing output to {stdout_filename})") pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) - subprocess.run(full_cmd, - stdout=stdout_f, - stderr=stderr_f, - env=pg_bin._build_env(None), - shell=True, - check=True) + subprocess.run( + full_cmd, + stdout=stdout_f, + stderr=stderr_f, + env=pg_bin._build_env(None), + shell=True, + check=True, + ) print(f"Done import") # Wait until pageserver persists the files - wait_for_upload(pageserver_http, - uuid.UUID(tenant_id), - uuid.UUID(timeline_id), - lsn_from_hex(last_lsn)) + wait_for_upload( + pageserver_http, uuid.UUID(tenant_id), uuid.UUID(timeline_id), lsn_from_hex(last_lsn) + ) -def export_timeline(args, - psql_path, - pageserver_connstr, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename): +def export_timeline( + args, psql_path, pageserver_connstr, tenant_id, timeline_id, last_lsn, prev_lsn, tar_filename +): # Choose filenames incomplete_filename = tar_filename + ".incomplete" stderr_filename = path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr") @@ -507,15 +516,13 @@ def export_timeline(args, # Run export command print(f"Running: {cmd}") - with open(incomplete_filename, 'w') as stdout_f: - with open(stderr_filename, 'w') as stderr_f: + with open(incomplete_filename, "w") as stdout_f: + with open(stderr_filename, "w") as stderr_f: print(f"(capturing output to {incomplete_filename})") pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) - subprocess.run(cmd, - stdout=stdout_f, - stderr=stderr_f, - env=pg_bin._build_env(None), - check=True) + subprocess.run( + cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True + ) # Add missing rels pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) @@ -551,27 +558,28 @@ def main(args: argparse.Namespace): for timeline in timelines: # Skip timelines we don't need to export - if args.timelines and timeline['timeline_id'] not in args.timelines: + if args.timelines and timeline["timeline_id"] not in args.timelines: print(f"Skipping timeline {timeline['timeline_id']}") continue # Choose filenames - tar_filename = path.join(args.work_dir, - f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar") + tar_filename = path.join( + args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" + ) # Export timeline from old pageserver if args.only_import is False: last_lsn, prev_lsn = get_rlsn( old_pageserver_connstr, - timeline['tenant_id'], - timeline['timeline_id'], + timeline["tenant_id"], + timeline["timeline_id"], ) export_timeline( args, psql_path, old_pageserver_connstr, - timeline['tenant_id'], - timeline['timeline_id'], + timeline["tenant_id"], + timeline["timeline_id"], last_lsn, prev_lsn, tar_filename, @@ -583,8 +591,8 @@ def main(args: argparse.Namespace): psql_path, new_pageserver_connstr, new_http_client, - timeline['tenant_id'], - timeline['timeline_id'], + timeline["tenant_id"], + timeline["timeline_id"], last_lsn, prev_lsn, tar_filename, @@ -592,117 +600,118 @@ def main(args: argparse.Namespace): # Re-export and compare re_export_filename = tar_filename + ".reexport" - export_timeline(args, - psql_path, - new_pageserver_connstr, - timeline['tenant_id'], - timeline['timeline_id'], - last_lsn, - prev_lsn, - re_export_filename) + export_timeline( + args, + psql_path, + new_pageserver_connstr, + 
timeline["tenant_id"], + timeline["timeline_id"], + last_lsn, + prev_lsn, + re_export_filename, + ) # Check the size is the same - old_size = os.path.getsize(tar_filename), - new_size = os.path.getsize(re_export_filename), + old_size = (os.path.getsize(tar_filename),) + new_size = (os.path.getsize(re_export_filename),) if old_size != new_size: raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}") -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - '--tenant-id', - dest='tenants', + "--tenant-id", + dest="tenants", required=True, - nargs='+', - help='Id of the tenant to migrate. You can pass multiple arguments', + nargs="+", + help="Id of the tenant to migrate. You can pass multiple arguments", ) parser.add_argument( - '--timeline-id', - dest='timelines', + "--timeline-id", + dest="timelines", required=False, - nargs='+', - help='Id of the timeline to migrate. You can pass multiple arguments', + nargs="+", + help="Id of the timeline to migrate. You can pass multiple arguments", ) parser.add_argument( - '--from-host', - dest='old_pageserver_host', + "--from-host", + dest="old_pageserver_host", required=True, - help='Host of the pageserver to migrate data from', + help="Host of the pageserver to migrate data from", ) parser.add_argument( - '--from-http-port', - dest='old_pageserver_http_port', + "--from-http-port", + dest="old_pageserver_http_port", required=False, type=int, default=9898, - help='HTTP port of the pageserver to migrate data from. Default: 9898', + help="HTTP port of the pageserver to migrate data from. Default: 9898", ) parser.add_argument( - '--from-pg-port', - dest='old_pageserver_pg_port', + "--from-pg-port", + dest="old_pageserver_pg_port", required=False, type=int, default=6400, - help='pg port of the pageserver to migrate data from. Default: 6400', + help="pg port of the pageserver to migrate data from. Default: 6400", ) parser.add_argument( - '--to-host', - dest='new_pageserver_host', + "--to-host", + dest="new_pageserver_host", required=True, - help='Host of the pageserver to migrate data to', + help="Host of the pageserver to migrate data to", ) parser.add_argument( - '--to-http-port', - dest='new_pageserver_http_port', + "--to-http-port", + dest="new_pageserver_http_port", required=False, default=9898, type=int, - help='HTTP port of the pageserver to migrate data to. Default: 9898', + help="HTTP port of the pageserver to migrate data to. Default: 9898", ) parser.add_argument( - '--to-pg-port', - dest='new_pageserver_pg_port', + "--to-pg-port", + dest="new_pageserver_pg_port", required=False, default=6400, type=int, - help='pg port of the pageserver to migrate data to. Default: 6400', + help="pg port of the pageserver to migrate data to. Default: 6400", ) parser.add_argument( - '--ignore-tenant-exists', - dest='ok_if_exists', + "--ignore-tenant-exists", + dest="ok_if_exists", required=False, - help= - 'Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.', + help="Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.", ) parser.add_argument( - '--pg-distrib-dir', - dest='pg_distrib_dir', + "--pg-distrib-dir", + dest="pg_distrib_dir", required=False, - default='/usr/local/', - help='Path where postgres binaries are installed. 
Default: /usr/local/', + default="/usr/local/", + help="Path where postgres binaries are installed. Default: /usr/local/", ) parser.add_argument( - '--psql-path', - dest='psql_path', + "--psql-path", + dest="psql_path", required=False, - default='/usr/local/bin/psql', - help='Path to the psql binary. Default: /usr/local/bin/psql', + default="/usr/local/bin/psql", + help="Path to the psql binary. Default: /usr/local/bin/psql", ) parser.add_argument( - '--only-import', - dest='only_import', + "--only-import", + dest="only_import", required=False, default=False, - action='store_true', - help='Skip export and tenant creation part', + action="store_true", + help="Skip export and tenant creation part", ) parser.add_argument( - '--work-dir', - dest='work_dir', + "--work-dir", + dest="work_dir", required=True, default=False, - help='directory where temporary tar files are stored', + help="directory where temporary tar files are stored", ) args = parser.parse_args() main(args) diff --git a/scripts/generate_perf_report_page.py b/scripts/generate_perf_report_page.py index 23fa4b76a3..b5b49bb600 100755 --- a/scripts/generate_perf_report_page.py +++ b/scripts/generate_perf_report_page.py @@ -1,31 +1,36 @@ #!/usr/bin/env python3 import argparse +import json from dataclasses import dataclass from pathlib import Path -import json from typing import Any, Dict, List, Optional, Tuple, cast + from jinja2 import Template # skip 'input' columns. They are included in the header and just blow the table -EXCLUDE_COLUMNS = frozenset({ - 'scale', - 'duration', - 'number_of_clients', - 'number_of_threads', - 'init_start_timestamp', - 'init_end_timestamp', - 'run_start_timestamp', - 'run_end_timestamp', -}) +EXCLUDE_COLUMNS = frozenset( + { + "scale", + "duration", + "number_of_clients", + "number_of_threads", + "init_start_timestamp", + "init_end_timestamp", + "run_start_timestamp", + "run_end_timestamp", + } +) -KEY_EXCLUDE_FIELDS = frozenset({ - 'init_start_timestamp', - 'init_end_timestamp', - 'run_start_timestamp', - 'run_end_timestamp', -}) -NEGATIVE_COLOR = 'negative' -POSITIVE_COLOR = 'positive' +KEY_EXCLUDE_FIELDS = frozenset( + { + "init_start_timestamp", + "init_end_timestamp", + "run_start_timestamp", + "run_end_timestamp", + } +) +NEGATIVE_COLOR = "negative" +POSITIVE_COLOR = "positive" EPS = 1e-6 @@ -55,75 +60,76 @@ def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], Li value_columns = [] common_columns = [] for item in values: - if item['name'] in KEY_EXCLUDE_FIELDS: + if item["name"] in KEY_EXCLUDE_FIELDS: continue - if item['report'] != 'test_param': - value_columns.append(cast(str, item['name'])) + if item["report"] != "test_param": + value_columns.append(cast(str, item["name"])) else: - common_columns.append((cast(str, item['name']), cast(str, item['value']))) + common_columns.append((cast(str, item["name"]), cast(str, item["value"]))) value_columns.sort() common_columns.sort(key=lambda x: x[0]) # sort by name return common_columns, value_columns def format_ratio(ratio: float, report: str) -> Tuple[str, str]: - color = '' - sign = '+' if ratio > 0 else '' + color = "" + sign = "+" if ratio > 0 else "" if abs(ratio) < 0.05: - return f' ({sign}{ratio:.2f})', color + return f" ({sign}{ratio:.2f})", color - if report not in {'test_param', 'higher_is_better', 'lower_is_better'}: - raise ValueError(f'Unknown report type: {report}') + if report not in {"test_param", "higher_is_better", "lower_is_better"}: + raise ValueError(f"Unknown report type: {report}") - if report == 
'test_param': - return f'{ratio:.2f}', color + if report == "test_param": + return f"{ratio:.2f}", color if ratio > 0: - if report == 'higher_is_better': + if report == "higher_is_better": color = POSITIVE_COLOR - elif report == 'lower_is_better': + elif report == "lower_is_better": color = NEGATIVE_COLOR elif ratio < 0: - if report == 'higher_is_better': + if report == "higher_is_better": color = NEGATIVE_COLOR - elif report == 'lower_is_better': + elif report == "lower_is_better": color = POSITIVE_COLOR - return f' ({sign}{ratio:.2f})', color + return f" ({sign}{ratio:.2f})", color def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]: - for item in suit_run.values['data']: - if item['name'] == name: + for item in suit_run.values["data"]: + if item["name"] == name: return cast(Dict[str, Any], item) return None -def get_row_values(columns: List[str], run_result: SuitRun, - prev_result: Optional[SuitRun]) -> List[RowValue]: +def get_row_values( + columns: List[str], run_result: SuitRun, prev_result: Optional[SuitRun] +) -> List[RowValue]: row_values = [] for column in columns: current_value = extract_value(column, run_result) if current_value is None: # should never happen - raise ValueError(f'{column} not found in {run_result.values}') + raise ValueError(f"{column} not found in {run_result.values}") value = current_value["value"] if isinstance(value, float): - value = f'{value:.2f}' + value = f"{value:.2f}" if prev_result is None: - row_values.append(RowValue(value, '', '')) + row_values.append(RowValue(value, "", "")) continue prev_value = extract_value(column, prev_result) if prev_value is None: # this might happen when new metric is added and there is no value for it in previous run # let this be here, TODO add proper handling when this actually happens - raise ValueError(f'{column} not found in previous result') + raise ValueError(f"{column} not found in previous result") # adding `EPS` to each term to avoid ZeroDivisionError when the denominator is zero - ratio = (float(value) + EPS) / (float(prev_value['value']) + EPS) - 1 - ratio_display, color = format_ratio(ratio, current_value['report']) + ratio = (float(value) + EPS) / (float(prev_value["value"]) + EPS) - 1 + ratio_display, color = format_ratio(ratio, current_value["report"]) row_values.append(RowValue(value, color, ratio_display)) return row_values @@ -139,8 +145,10 @@ def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> Lis prev_run = None for run in runs: rows.append( - SuiteRunTableRow(revision=run.revision, - values=get_row_values(value_columns, run, prev_run))) + SuiteRunTableRow( + revision=run.revision, values=get_row_values(value_columns, run, prev_run) + ) + ) prev_run = run return rows @@ -152,27 +160,29 @@ def main(args: argparse.Namespace) -> None: # we have files in form: _.json # fill them in the hashmap so we have grouped items for the # same run configuration (scale, duration etc.) ordered by counter. 
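# ---------------------------------------------------------------------------
# Illustrative aside, not part of the patch: the EPS guard used in
# get_row_values() above keeps the relative-change ratio finite when the
# previous value is exactly zero, instead of raising ZeroDivisionError.
# A minimal self-contained sketch of the same arithmetic; `_relative_change`
# is a hypothetical name, not from the sources.
def _relative_change(current: float, previous: float, eps: float = 1e-6) -> float:
    # (current + eps) / (previous + eps) - 1, mirroring get_row_values()
    return (current + eps) / (previous + eps) - 1

assert abs(_relative_change(110.0, 100.0) - 0.1) < 1e-4
assert _relative_change(5.0, 0.0) > 0  # previous == 0 no longer divides by zero
# ---------------------------------------------------------------------------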
- for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split('_')[0])): + for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split("_")[0])): run_data = json.loads(item.read_text()) - revision = run_data['revision'] + revision = run_data["revision"] - for suit_result in run_data['result']: - key = "{}{}".format(run_data['platform'], suit_result['suit']) + for suit_result in run_data["result"]: + key = "{}{}".format(run_data["platform"], suit_result["suit"]) # pack total duration as a synthetic value - total_duration = suit_result['total_duration'] - suit_result['data'].append({ - 'name': 'total_duration', - 'value': total_duration, - 'unit': 's', - 'report': 'lower_is_better', - }) - common_columns, value_columns = get_columns(suit_result['data']) + total_duration = suit_result["total_duration"] + suit_result["data"].append( + { + "name": "total_duration", + "value": total_duration, + "unit": "s", + "report": "lower_is_better", + } + ) + common_columns, value_columns = get_columns(suit_result["data"]) grouped_runs.setdefault( key, SuitRuns( - platform=run_data['platform'], - suit=suit_result['suit'], + platform=run_data["platform"], + suit=suit_result["suit"], common_columns=common_columns, value_columns=value_columns, runs=[], @@ -184,26 +194,26 @@ def main(args: argparse.Namespace) -> None: for result in grouped_runs.values(): suit = result.suit context[suit] = { - 'common_columns': result.common_columns, - 'value_columns': result.value_columns, - 'platform': result.platform, + "common_columns": result.common_columns, + "value_columns": result.value_columns, + "platform": result.platform, # reverse the order so newest results are on top of the table - 'rows': reversed(prepare_rows_from_runs(result.value_columns, result.runs)), + "rows": reversed(prepare_rows_from_runs(result.value_columns, result.runs)), } - template = Template((Path(__file__).parent / 'perf_report_template.html').read_text()) + template = Template((Path(__file__).parent / "perf_report_template.html").read_text()) Path(args.out).write_text(template.render(context=context)) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - '--input-dir', - dest='input_dir', + "--input-dir", + dest="input_dir", required=True, - help='Directory with jsons generated by the test suite', + help="Directory with jsons generated by the test suite", ) - parser.add_argument('--out', required=True, help='Output html file path') + parser.add_argument("--out", required=True, help="Output html file path") args = parser.parse_args() main(args) diff --git a/scripts/git-upload b/scripts/git-upload index a53987894a..d56c0f8e94 100755 --- a/scripts/git-upload +++ b/scripts/git-upload @@ -1,17 +1,16 @@ #!/usr/bin/env python3 -from contextlib import contextmanager -import shlex -from tempfile import TemporaryDirectory -from distutils.dir_util import copy_tree -from pathlib import Path - import argparse import os +import shlex import shutil import subprocess import sys import textwrap +from contextlib import contextmanager +from distutils.dir_util import copy_tree +from pathlib import Path +from tempfile import TemporaryDirectory from typing import Optional diff --git a/scripts/ingest_perf_test_result.py b/scripts/ingest_perf_test_result.py index 89463c986a..71f7ad3262 100644 --- a/scripts/ingest_perf_test_result.py +++ b/scripts/ingest_perf_test_result.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 import argparse -from contextlib import contextmanager import json import os 
+from contextlib import contextmanager +from datetime import datetime +from pathlib import Path + import psycopg2 import psycopg2.extras -from pathlib import Path -from datetime import datetime CREATE_TABLE = """ CREATE TABLE IF NOT EXISTS perf_test_results ( @@ -24,15 +25,15 @@ CREATE TABLE IF NOT EXISTS perf_test_results ( def err(msg): - print(f'error: {msg}') + print(f"error: {msg}") exit(1) @contextmanager def get_connection_cursor(): - connstr = os.getenv('DATABASE_URL') + connstr = os.getenv("DATABASE_URL") if not connstr: - err('DATABASE_URL environment variable is not set') + err("DATABASE_URL environment variable is not set") with psycopg2.connect(connstr) as conn: with conn.cursor() as cur: yield cur @@ -44,33 +45,35 @@ def create_table(cur): def ingest_perf_test_result(cursor, data_dile: Path, recorded_at_timestamp: int) -> int: run_data = json.loads(data_dile.read_text()) - revision = run_data['revision'] - platform = run_data['platform'] + revision = run_data["revision"] + platform = run_data["platform"] - run_result = run_data['result'] + run_result = run_data["result"] args_list = [] for suit_result in run_result: - suit = suit_result['suit'] - total_duration = suit_result['total_duration'] + suit = suit_result["suit"] + total_duration = suit_result["total_duration"] - suit_result['data'].append({ - 'name': 'total_duration', - 'value': total_duration, - 'unit': 's', - 'report': 'lower_is_better', - }) + suit_result["data"].append( + { + "name": "total_duration", + "value": total_duration, + "unit": "s", + "report": "lower_is_better", + } + ) - for metric in suit_result['data']: + for metric in suit_result["data"]: values = { - 'suit': suit, - 'revision': revision, - 'platform': platform, - 'metric_name': metric['name'], - 'metric_value': metric['value'], - 'metric_unit': metric['unit'], - 'metric_report_type': metric['report'], - 'recorded_at_timestamp': datetime.utcfromtimestamp(recorded_at_timestamp), + "suit": suit, + "revision": revision, + "platform": platform, + "metric_name": metric["name"], + "metric_value": metric["value"], + "metric_unit": metric["unit"], + "metric_report_type": metric["report"], + "recorded_at_timestamp": datetime.utcfromtimestamp(recorded_at_timestamp), } args_list.append(values) @@ -104,13 +107,16 @@ def ingest_perf_test_result(cursor, data_dile: Path, recorded_at_timestamp: int) def main(): - parser = argparse.ArgumentParser(description='Perf test result uploader. \ - Database connection string should be provided via DATABASE_URL environment variable', ) + parser = argparse.ArgumentParser( + description="Perf test result uploader. 
\ + Database connection string should be provided via DATABASE_URL environment variable", + ) parser.add_argument( - '--ingest', + "--ingest", type=Path, - help='Path to perf test result file, or directory with perf test result files') - parser.add_argument('--initdb', action='store_true', help='Initialuze database') + help="Path to perf test result file, or directory with perf test result files", + ) + parser.add_argument("--initdb", action="store_true", help="Initialuze database") args = parser.parse_args() with get_connection_cursor() as cur: @@ -118,19 +124,19 @@ def main(): create_table(cur) if not args.ingest.exists(): - err(f'ingest path {args.ingest} does not exist') + err(f"ingest path {args.ingest} does not exist") if args.ingest: if args.ingest.is_dir(): - for item in sorted(args.ingest.iterdir(), key=lambda x: int(x.name.split('_')[0])): - recorded_at_timestamp = int(item.name.split('_')[0]) + for item in sorted(args.ingest.iterdir(), key=lambda x: int(x.name.split("_")[0])): + recorded_at_timestamp = int(item.name.split("_")[0]) ingested = ingest_perf_test_result(cur, item, recorded_at_timestamp) - print(f'Ingested {ingested} metric values from {item}') + print(f"Ingested {ingested} metric values from {item}") else: - recorded_at_timestamp = int(args.ingest.name.split('_')[0]) + recorded_at_timestamp = int(args.ingest.name.split("_")[0]) ingested = ingest_perf_test_result(cur, args.ingest, recorded_at_timestamp) - print(f'Ingested {ingested} metric values from {args.ingest}') + print(f"Ingested {ingested} metric values from {args.ingest}") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index c4d36da043..96612a8aef 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -13,83 +13,90 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): # Extend compaction_period and gc_period to disable background compaction and gc. tenant, _ = env.neon_cli.create_tenant( conf={ - 'gc_period': '10 m', - 'gc_horizon': '1048576', - 'checkpoint_distance': '4194304', - 'compaction_period': '10 m', - 'compaction_threshold': '2', - 'compaction_target_size': '4194304', - }) + "gc_period": "10 m", + "gc_horizon": "1048576", + "checkpoint_distance": "4194304", + "compaction_period": "10 m", + "compaction_threshold": "2", + "compaction_target_size": "4194304", + } + ) env.pageserver.safe_psql("failpoints flush-frozen-before-sync=sleep(10000)") - pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) + pg_branch0 = env.postgres.create_start("main", tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() branch0_timeline = query_scalar(branch0_cur, "SHOW neon.timeline_id") log.info(f"b0 timeline {branch0_timeline}") # Create table, and insert 100k rows. 
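# ---------------------------------------------------------------------------
# Illustrative aside, not part of the patch: the LSNs captured below are the
# "X/Y" hex strings returned by pg_current_wal_insert_lsn(); the lsn_to_hex()
# and lsn_from_hex() helpers reformatted earlier in this patch convert between
# that notation and a plain integer. A minimal round-trip check with a made-up
# example LSN:
_lsn_hex = "16/B374D848"
_hi, _lo = _lsn_hex.split("/")
_lsn_int = (int(_hi, 16) << 32) + int(_lo, 16)
assert "{:X}/{:X}".format(_lsn_int >> 32, _lsn_int & 0xFFFFFFFF) == _lsn_hex
# ---------------------------------------------------------------------------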
- branch0_lsn = query_scalar(branch0_cur, 'SELECT pg_current_wal_insert_lsn()') + branch0_lsn = query_scalar(branch0_cur, "SELECT pg_current_wal_insert_lsn()") log.info(f"b0 at lsn {branch0_lsn}") - branch0_cur.execute('CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)') - branch0_cur.execute(''' + branch0_cur.execute("CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)") + branch0_cur.execute( + """ INSERT INTO foo SELECT '00112233445566778899AABBCCDDEEFF' || ':branch0:' || g FROM generate_series(1, 100000) g - ''') - lsn_100 = query_scalar(branch0_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info(f'LSN after 100k rows: {lsn_100}') + """ + ) + lsn_100 = query_scalar(branch0_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN after 100k rows: {lsn_100}") # Create branch1. - env.neon_cli.create_branch('branch1', 'main', tenant_id=tenant, ancestor_start_lsn=lsn_100) - pg_branch1 = env.postgres.create_start('branch1', tenant_id=tenant) + env.neon_cli.create_branch("branch1", "main", tenant_id=tenant, ancestor_start_lsn=lsn_100) + pg_branch1 = env.postgres.create_start("branch1", tenant_id=tenant) log.info("postgres is running on 'branch1' branch") branch1_cur = pg_branch1.connect().cursor() branch1_timeline = query_scalar(branch1_cur, "SHOW neon.timeline_id") log.info(f"b1 timeline {branch1_timeline}") - branch1_lsn = query_scalar(branch1_cur, 'SELECT pg_current_wal_insert_lsn()') + branch1_lsn = query_scalar(branch1_cur, "SELECT pg_current_wal_insert_lsn()") log.info(f"b1 at lsn {branch1_lsn}") # Insert 100k rows. - branch1_cur.execute(''' + branch1_cur.execute( + """ INSERT INTO foo SELECT '00112233445566778899AABBCCDDEEFF' || ':branch1:' || g FROM generate_series(1, 100000) g - ''') - lsn_200 = query_scalar(branch1_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info(f'LSN after 200k rows: {lsn_200}') + """ + ) + lsn_200 = query_scalar(branch1_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN after 200k rows: {lsn_200}") # Create branch2. - env.neon_cli.create_branch('branch2', 'branch1', tenant_id=tenant, ancestor_start_lsn=lsn_200) - pg_branch2 = env.postgres.create_start('branch2', tenant_id=tenant) + env.neon_cli.create_branch("branch2", "branch1", tenant_id=tenant, ancestor_start_lsn=lsn_200) + pg_branch2 = env.postgres.create_start("branch2", tenant_id=tenant) log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() branch2_timeline = query_scalar(branch2_cur, "SHOW neon.timeline_id") log.info(f"b2 timeline {branch2_timeline}") - branch2_lsn = query_scalar(branch2_cur, 'SELECT pg_current_wal_insert_lsn()') + branch2_lsn = query_scalar(branch2_cur, "SELECT pg_current_wal_insert_lsn()") log.info(f"b2 at lsn {branch2_lsn}") # Insert 100k rows. - branch2_cur.execute(''' + branch2_cur.execute( + """ INSERT INTO foo SELECT '00112233445566778899AABBCCDDEEFF' || ':branch2:' || g FROM generate_series(1, 100000) g - ''') - lsn_300 = query_scalar(branch2_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info(f'LSN after 300k rows: {lsn_300}') + """ + ) + lsn_300 = query_scalar(branch2_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN after 300k rows: {lsn_300}") # Run compaction on branch1. 
- compact = f'compact {tenant.hex} {branch1_timeline} {lsn_200}' + compact = f"compact {tenant.hex} {branch1_timeline} {lsn_200}" log.info(compact) env.pageserver.safe_psql(compact) - assert query_scalar(branch0_cur, 'SELECT count(*) FROM foo') == 100000 + assert query_scalar(branch0_cur, "SELECT count(*) FROM foo") == 100000 - assert query_scalar(branch1_cur, 'SELECT count(*) FROM foo') == 200000 + assert query_scalar(branch1_cur, "SELECT count(*) FROM foo") == 200000 - assert query_scalar(branch2_cur, 'SELECT count(*) FROM foo') == 300000 + assert query_scalar(branch2_cur, "SELECT count(*) FROM foo") == 300000 diff --git a/test_runner/batch_others/test_auth.py b/test_runner/batch_others/test_auth.py index 0fd0a5d7e3..16d6ae45c3 100644 --- a/test_runner/batch_others/test_auth.py +++ b/test_runner/batch_others/test_auth.py @@ -1,7 +1,8 @@ from contextlib import closing from uuid import uuid4 -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException + import pytest +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): @@ -23,41 +24,46 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): ps.safe_psql("set FOO", password=tenant_token) ps.safe_psql("set FOO", password=management_token) - new_timeline_id = env.neon_cli.create_branch('test_pageserver_auth', - tenant_id=env.initial_tenant) + new_timeline_id = env.neon_cli.create_branch( + "test_pageserver_auth", tenant_id=env.initial_tenant + ) # tenant can create branches - tenant_http_client.timeline_create(tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id) + tenant_http_client.timeline_create( + tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + ) # console can create branches for tenant - management_http_client.timeline_create(tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id) + management_http_client.timeline_create( + tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + ) # fail to create branch using token with different tenant_id - with pytest.raises(NeonPageserverApiException, - match='Forbidden: Tenant id mismatch. Permission denied'): - invalid_tenant_http_client.timeline_create(tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id) + with pytest.raises( + NeonPageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied" + ): + invalid_tenant_http_client.timeline_create( + tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + ) # create tenant using management token management_http_client.tenant_create() # fail to create tenant using tenant token with pytest.raises( - NeonPageserverApiException, - match='Forbidden: Attempt to access management api with tenant scope. Permission denied' + NeonPageserverApiException, + match="Forbidden: Attempt to access management api with tenant scope. 
Permission denied", ): tenant_http_client.tenant_create() -@pytest.mark.parametrize('with_safekeepers', [False, True]) +@pytest.mark.parametrize("with_safekeepers", [False, True]) def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): neon_env_builder.auth_enabled = True if with_safekeepers: neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - branch = f'test_compute_auth_to_pageserver{with_safekeepers}' + branch = f"test_compute_auth_to_pageserver{with_safekeepers}" env.neon_cli.create_branch(branch) pg = env.postgres.create_start(branch) @@ -65,7 +71,7 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder, with_safek with conn.cursor() as cur: # we rely upon autocommit after each statement # as waiting for acceptors happens there - cur.execute('CREATE TABLE t(key int primary key, value text)') + cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (5000050000, ) + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (5000050000,) diff --git a/test_runner/batch_others/test_backpressure.py b/test_runner/batch_others/test_backpressure.py index 4ca03b102b..a81fa380a9 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/batch_others/test_backpressure.py @@ -1,13 +1,13 @@ +import threading +import time from contextlib import closing, contextmanager + import psycopg2.extras import pytest -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log -import time -from fixtures.neon_fixtures import Postgres -import threading +from fixtures.neon_fixtures import NeonEnvBuilder, Postgres -pytest_plugins = ("fixtures.neon_fixtures") +pytest_plugins = "fixtures.neon_fixtures" @contextmanager @@ -44,7 +44,8 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv with pg_cur(pg) as cur: while not stop_event.is_set(): try: - cur.execute(''' + cur.execute( + """ select pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn) as received_lsn_lag, pg_wal_lsn_diff(pg_current_wal_flush_lsn(),disk_consistent_lsn) as disk_consistent_lsn_lag, pg_wal_lsn_diff(pg_current_wal_flush_lsn(),remote_consistent_lsn) as remote_consistent_lsn_lag, @@ -52,16 +53,19 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_flush_lsn(),disk_consistent_lsn)), pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_flush_lsn(),remote_consistent_lsn)) from backpressure_lsns(); - ''') + """ + ) res = cur.fetchone() received_lsn_lag = res[0] disk_consistent_lsn_lag = res[1] remote_consistent_lsn_lag = res[2] - log.info(f"received_lsn_lag = {received_lsn_lag} ({res[3]}), " - f"disk_consistent_lsn_lag = {disk_consistent_lsn_lag} ({res[4]}), " - f"remote_consistent_lsn_lag = {remote_consistent_lsn_lag} ({res[5]})") + log.info( + f"received_lsn_lag = {received_lsn_lag} ({res[3]}), " + f"disk_consistent_lsn_lag = {disk_consistent_lsn_lag} ({res[4]}), " + f"remote_consistent_lsn_lag = {remote_consistent_lsn_lag} ({res[5]})" + ) # Since feedback from pageserver is not immediate, we should allow some lag overflow lag_overflow = 5 * 1024 * 1024 # 5MB @@ -71,7 +75,9 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv if max_replication_flush_lag_bytes > 0: assert disk_consistent_lsn_lag < max_replication_flush_lag_bytes + lag_overflow 
if max_replication_apply_lag_bytes > 0: - assert remote_consistent_lsn_lag < max_replication_apply_lag_bytes + lag_overflow + assert ( + remote_consistent_lsn_lag < max_replication_apply_lag_bytes + lag_overflow + ) time.sleep(polling_interval) @@ -79,7 +85,7 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv log.info(f"backpressure check query failed: {e}") stop_event.set() - log.info('check thread stopped') + log.info("check thread stopped") # This test illustrates how to tune backpressure to control the lag @@ -94,10 +100,11 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() # Create a branch for us - env.neon_cli.create_branch('test_backpressure') + env.neon_cli.create_branch("test_backpressure") - pg = env.postgres.create_start('test_backpressure', - config_lines=['max_replication_write_lag=30MB']) + pg = env.postgres.create_start( + "test_backpressure", config_lines=["max_replication_write_lag=30MB"] + ) log.info("postgres is running on 'test_backpressure' branch") # setup check thread @@ -131,23 +138,29 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): rows_inserted += 100000 except Exception as e: if check_thread.is_alive(): - log.info('stopping check thread') + log.info("stopping check thread") check_stop_event.set() check_thread.join() - assert False, f"Exception {e} while inserting rows, but WAL lag is within configured threshold. That means backpressure is not tuned properly" + assert ( + False + ), f"Exception {e} while inserting rows, but WAL lag is within configured threshold. That means backpressure is not tuned properly" else: - assert False, f"Exception {e} while inserting rows and WAL lag overflowed configured threshold. That means backpressure doesn't work." + assert ( + False + ), f"Exception {e} while inserting rows and WAL lag overflowed configured threshold. That means backpressure doesn't work." log.info(f"inserted {rows_inserted} rows") if check_thread.is_alive(): - log.info('stopping check thread') + log.info("stopping check thread") check_stop_event.set() check_thread.join() - log.info('check thread stopped') + log.info("check thread stopped") else: - assert False, "WAL lag overflowed configured threshold. That means backpressure doesn't work." + assert ( + False + ), "WAL lag overflowed configured threshold. That means backpressure doesn't work." -#TODO test_backpressure_disk_consistent_lsn_lag. Play with pageserver's checkpoint settings -#TODO test_backpressure_remote_consistent_lsn_lag +# TODO test_backpressure_disk_consistent_lsn_lag. 
Play with pageserver's checkpoint settings +# TODO test_backpressure_remote_consistent_lsn_lag diff --git a/test_runner/batch_others/test_basebackup_error.py b/test_runner/batch_others/test_basebackup_error.py index 0909ed98a7..9960f3afbf 100644 --- a/test_runner/batch_others/test_basebackup_error.py +++ b/test_runner/batch_others/test_basebackup_error.py @@ -1,5 +1,4 @@ import pytest - from fixtures.neon_fixtures import NeonEnv @@ -15,4 +14,4 @@ def test_basebackup_error(neon_simple_env: NeonEnv): env.pageserver.safe_psql(f"failpoints basebackup-before-control-file=return") with pytest.raises(Exception, match="basebackup-before-control-file"): - pg = env.postgres.create_start('test_basebackup_error') + pg = env.postgres.create_start("test_basebackup_error") diff --git a/test_runner/batch_others/test_branch_and_gc.py b/test_runner/batch_others/test_branch_and_gc.py index 8e433f65ad..bc8374543f 100644 --- a/test_runner/batch_others/test_branch_and_gc.py +++ b/test_runner/batch_others/test_branch_and_gc.py @@ -1,6 +1,7 @@ import threading -import pytest import time + +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.utils import lsn_from_hex, query_scalar @@ -49,55 +50,52 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): tenant, _ = env.neon_cli.create_tenant( conf={ # disable background GC - 'gc_period': '10 m', - 'gc_horizon': f'{10 * 1024 ** 3}', - + "gc_period": "10 m", + "gc_horizon": f"{10 * 1024 ** 3}", # small checkpoint distance to create more delta layer files - 'checkpoint_distance': f'{1024 ** 2}', - + "checkpoint_distance": f"{1024 ** 2}", # set the target size to be large to allow the image layer to cover the whole key space - 'compaction_target_size': f'{1024 ** 3}', - + "compaction_target_size": f"{1024 ** 3}", # tweak the default settings to allow quickly create image layers and L1 layers - 'compaction_period': '1 s', - 'compaction_threshold': '2', - 'image_creation_threshold': '1', - + "compaction_period": "1 s", + "compaction_threshold": "2", + "image_creation_threshold": "1", # set PITR interval to be small, so we can do GC - 'pitr_interval': '1 s' - }) + "pitr_interval": "1 s", + } + ) - timeline_main = env.neon_cli.create_timeline(f'test_main', tenant_id=tenant) - pg_main = env.postgres.create_start('test_main', tenant_id=tenant) + timeline_main = env.neon_cli.create_timeline(f"test_main", tenant_id=tenant) + pg_main = env.postgres.create_start("test_main", tenant_id=tenant) main_cur = pg_main.connect().cursor() main_cur.execute( "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')" ) - main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') - lsn1 = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info(f'LSN1: {lsn1}') + main_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)") + lsn1 = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN1: {lsn1}") - main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') - lsn2 = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info(f'LSN2: {lsn2}') + main_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)") + lsn2 = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN2: {lsn2}") # Set the GC horizon so that lsn1 is inside the horizon, which means # we can create a new branch starting from lsn1. 
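# ---------------------------------------------------------------------------
# Illustrative aside, not part of the patch: the do_gc call below passes
# lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024 as the horizon. Assuming the
# pageserver retains everything newer than (latest LSN - horizon), that choice
# leaves lsn1 just inside the retained range. Illustrative integers only:
_lsn1 = 0x16_0000_0000 + 0x0100_0000
_lsn2 = 0x16_0000_0000 + 0x0200_0000   # latest LSN in this sketch
_horizon = _lsn2 - _lsn1 + 1024
assert _lsn2 - _horizon < _lsn1        # GC cutoff falls below lsn1, so branching at lsn1 stays valid
# ---------------------------------------------------------------------------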
env.pageserver.safe_psql( - f'do_gc {tenant.hex} {timeline_main.hex} {lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024}') + f"do_gc {tenant.hex} {timeline_main.hex} {lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024}" + ) - env.neon_cli.create_branch('test_branch', - 'test_main', - tenant_id=tenant, - ancestor_start_lsn=lsn1) - pg_branch = env.postgres.create_start('test_branch', tenant_id=tenant) + env.neon_cli.create_branch( + "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1 + ) + pg_branch = env.postgres.create_start("test_branch", tenant_id=tenant) branch_cur = pg_branch.connect().cursor() - branch_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') + branch_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)") - assert query_scalar(branch_cur, 'SELECT count(*) FROM foo') == 200000 + assert query_scalar(branch_cur, "SELECT count(*) FROM foo") == 200000 # This test simulates a race condition happening when branch creation and GC are performed concurrently. @@ -120,32 +118,31 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): tenant, _ = env.neon_cli.create_tenant( conf={ # disable background GC - 'gc_period': '10 m', - 'gc_horizon': f'{10 * 1024 ** 3}', - + "gc_period": "10 m", + "gc_horizon": f"{10 * 1024 ** 3}", # small checkpoint distance to create more delta layer files - 'checkpoint_distance': f'{1024 ** 2}', - + "checkpoint_distance": f"{1024 ** 2}", # set the target size to be large to allow the image layer to cover the whole key space - 'compaction_target_size': f'{1024 ** 3}', - + "compaction_target_size": f"{1024 ** 3}", # tweak the default settings to allow quickly create image layers and L1 layers - 'compaction_period': '1 s', - 'compaction_threshold': '2', - 'image_creation_threshold': '1', - + "compaction_period": "1 s", + "compaction_threshold": "2", + "image_creation_threshold": "1", # set PITR interval to be small, so we can do GC - 'pitr_interval': '0 s' - }) + "pitr_interval": "0 s", + } + ) - b0 = env.neon_cli.create_branch('b0', tenant_id=tenant) - pg0 = env.postgres.create_start('b0', tenant_id=tenant) - res = pg0.safe_psql_many(queries=[ - "CREATE TABLE t(key serial primary key)", - "INSERT INTO t SELECT FROM generate_series(1, 100000)", - "SELECT pg_current_wal_insert_lsn()", - "INSERT INTO t SELECT FROM generate_series(1, 100000)", - ]) + b0 = env.neon_cli.create_branch("b0", tenant_id=tenant) + pg0 = env.postgres.create_start("b0", tenant_id=tenant) + res = pg0.safe_psql_many( + queries=[ + "CREATE TABLE t(key serial primary key)", + "INSERT INTO t SELECT FROM generate_series(1, 100000)", + "SELECT pg_current_wal_insert_lsn()", + "INSERT INTO t SELECT FROM generate_series(1, 100000)", + ] + ) lsn = res[2][0][0] # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the @@ -166,6 +163,6 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): # The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC. 
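# ---------------------------------------------------------------------------
# Illustrative aside, not part of the patch: the race set up above follows a
# common pattern - run the slow operation (here, the failpoint-delayed GC
# iteration) in a background thread, perform the conflicting foreground action,
# then join. A minimal, self-contained version; the sleep merely stands in for
# the failpoint, and the underscore names are hypothetical:
import threading as _threading
import time as _time

_done: list = []

def _delayed_gc() -> None:
    _time.sleep(0.1)        # placeholder for the sleep(...) failpoint delay
    _done.append("gc")

_t = _threading.Thread(target=_delayed_gc)
_t.start()
# ... foreground action that races with the background GC would go here ...
_t.join()
assert _done == ["gc"]
# ---------------------------------------------------------------------------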
with pytest.raises(Exception, match="invalid branch start lsn"): - env.neon_cli.create_branch('b1', 'b0', tenant_id=tenant, ancestor_start_lsn=lsn) + env.neon_cli.create_branch("b1", "b0", tenant_id=tenant, ancestor_start_lsn=lsn) thread.join() diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 95f478dda8..51946380d2 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -1,8 +1,8 @@ import psycopg2.extras import pytest from fixtures.log_helper import log -from fixtures.utils import print_gc_result, query_scalar from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import print_gc_result, query_scalar # @@ -21,8 +21,8 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() # Branch at the point where only 100 rows were inserted - env.neon_cli.create_branch('test_branch_behind') - pgmain = env.postgres.create_start('test_branch_behind') + env.neon_cli.create_branch("test_branch_behind") + pgmain = env.postgres.create_start("test_branch_behind") log.info("postgres is running on 'test_branch_behind' branch") main_cur = pgmain.connect().cursor() @@ -30,80 +30,86 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): timeline = query_scalar(main_cur, "SHOW neon.timeline_id") # Create table, and insert the first 100 rows - main_cur.execute('CREATE TABLE foo (t text)') + main_cur.execute("CREATE TABLE foo (t text)") # keep some early lsn to test branch creation on out of date lsn - gced_lsn = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') + gced_lsn = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") - main_cur.execute(''' + main_cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100) g - ''') - lsn_a = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info(f'LSN after 100 rows: {lsn_a}') + """ + ) + lsn_a = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN after 100 rows: {lsn_a}") # Insert some more rows. (This generates enough WAL to fill a few segments.) - main_cur.execute(''' + main_cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 200000) g - ''') - lsn_b = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info(f'LSN after 200100 rows: {lsn_b}') + """ + ) + lsn_b = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN after 200100 rows: {lsn_b}") # Branch at the point where only 100 rows were inserted - env.neon_cli.create_branch('test_branch_behind_hundred', - 'test_branch_behind', - ancestor_start_lsn=lsn_a) + env.neon_cli.create_branch( + "test_branch_behind_hundred", "test_branch_behind", ancestor_start_lsn=lsn_a + ) # Insert many more rows. This generates enough WAL to fill a few segments. 
- main_cur.execute(''' + main_cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 200000) g - ''') - lsn_c = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') + """ + ) + lsn_c = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") - log.info(f'LSN after 400100 rows: {lsn_c}') + log.info(f"LSN after 400100 rows: {lsn_c}") # Branch at the point where only 200100 rows were inserted - env.neon_cli.create_branch('test_branch_behind_more', - 'test_branch_behind', - ancestor_start_lsn=lsn_b) + env.neon_cli.create_branch( + "test_branch_behind_more", "test_branch_behind", ancestor_start_lsn=lsn_b + ) - pg_hundred = env.postgres.create_start('test_branch_behind_hundred') - pg_more = env.postgres.create_start('test_branch_behind_more') + pg_hundred = env.postgres.create_start("test_branch_behind_hundred") + pg_more = env.postgres.create_start("test_branch_behind_more") # On the 'hundred' branch, we should see only 100 rows hundred_cur = pg_hundred.connect().cursor() - assert query_scalar(hundred_cur, 'SELECT count(*) FROM foo') == 100 + assert query_scalar(hundred_cur, "SELECT count(*) FROM foo") == 100 # On the 'more' branch, we should see 100200 rows more_cur = pg_more.connect().cursor() - assert query_scalar(more_cur, 'SELECT count(*) FROM foo') == 200100 + assert query_scalar(more_cur, "SELECT count(*) FROM foo") == 200100 # All the rows are visible on the main branch - assert query_scalar(main_cur, 'SELECT count(*) FROM foo') == 400100 + assert query_scalar(main_cur, "SELECT count(*) FROM foo") == 400100 # Check bad lsn's for branching # branch at segment boundary - env.neon_cli.create_branch('test_branch_segment_boundary', - 'test_branch_behind', - ancestor_start_lsn="0/3000000") - pg = env.postgres.create_start('test_branch_segment_boundary') - assert pg.safe_psql('SELECT 1')[0][0] == 1 + env.neon_cli.create_branch( + "test_branch_segment_boundary", "test_branch_behind", ancestor_start_lsn="0/3000000" + ) + pg = env.postgres.create_start("test_branch_segment_boundary") + assert pg.safe_psql("SELECT 1")[0][0] == 1 # branch at pre-initdb lsn with pytest.raises(Exception, match="invalid branch start lsn"): - env.neon_cli.create_branch('test_branch_preinitdb', ancestor_start_lsn="0/42") + env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn="0/42") # branch at pre-ancestor lsn with pytest.raises(Exception, match="less than timeline ancestor lsn"): - env.neon_cli.create_branch('test_branch_preinitdb', - 'test_branch_behind', - ancestor_start_lsn="0/42") + env.neon_cli.create_branch( + "test_branch_preinitdb", "test_branch_behind", ancestor_start_lsn="0/42" + ) # check that we cannot create branch based on garbage collected data with env.pageserver.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: @@ -114,13 +120,13 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): with pytest.raises(Exception, match="invalid branch start lsn"): # this gced_lsn is pretty random, so if gc is disabled this woudln't fail - env.neon_cli.create_branch('test_branch_create_fail', - 'test_branch_behind', - ancestor_start_lsn=gced_lsn) + env.neon_cli.create_branch( + "test_branch_create_fail", "test_branch_behind", ancestor_start_lsn=gced_lsn + ) # check that after gc everything is still there - assert query_scalar(hundred_cur, 'SELECT count(*) FROM foo') == 100 + assert query_scalar(hundred_cur, "SELECT count(*) FROM foo") == 100 - assert query_scalar(more_cur, 'SELECT count(*) FROM foo') == 200100 + assert 
query_scalar(more_cur, "SELECT count(*) FROM foo") == 200100 - assert query_scalar(main_cur, 'SELECT count(*) FROM foo') == 400100 + assert query_scalar(main_cur, "SELECT count(*) FROM foo") == 400100 diff --git a/test_runner/batch_others/test_branching.py b/test_runner/batch_others/test_branching.py index c61bac7a58..2d08b07f82 100644 --- a/test_runner/batch_others/test_branching.py +++ b/test_runner/batch_others/test_branching.py @@ -1,10 +1,11 @@ -from typing import List -import threading -import pytest -from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres -import time import random +import threading +import time +from typing import List + +import pytest from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres from performance.test_perf_pgbench import get_scales_matrix @@ -20,38 +21,37 @@ from performance.test_perf_pgbench import get_scales_matrix @pytest.mark.parametrize("n_branches", [10]) @pytest.mark.parametrize("scale", get_scales_matrix(1)) @pytest.mark.parametrize("ty", ["cascade", "flat"]) -def test_branching_with_pgbench(neon_simple_env: NeonEnv, - pg_bin: PgBin, - n_branches: int, - scale: int, - ty: str): +def test_branching_with_pgbench( + neon_simple_env: NeonEnv, pg_bin: PgBin, n_branches: int, scale: int, ty: str +): env = neon_simple_env # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test tenant, _ = env.neon_cli.create_tenant( - conf={ - 'gc_period': '5 s', - 'gc_horizon': f'{1024 ** 2}', - 'checkpoint_distance': f'{1024 ** 2}', - 'compaction_target_size': f'{1024 ** 2}', - # set PITR interval to be small, so we can do GC - 'pitr_interval': '5 s' - }) + conf={ + "gc_period": "5 s", + "gc_horizon": f"{1024 ** 2}", + "checkpoint_distance": f"{1024 ** 2}", + "compaction_target_size": f"{1024 ** 2}", + # set PITR interval to be small, so we can do GC + "pitr_interval": "5 s", + } + ) def run_pgbench(pg: Postgres): connstr = pg.connstr() log.info(f"Start a pgbench workload on pg {connstr}") - pg_bin.run_capture(['pgbench', '-i', f'-s{scale}', connstr]) - pg_bin.run_capture(['pgbench', '-T15', connstr]) + pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", "-T15", connstr]) - env.neon_cli.create_branch('b0', tenant_id=tenant) + env.neon_cli.create_branch("b0", tenant_id=tenant) pgs: List[Postgres] = [] - pgs.append(env.postgres.create_start('b0', tenant_id=tenant)) + pgs.append(env.postgres.create_start("b0", tenant_id=tenant)) threads: List[threading.Thread] = [] - threads.append(threading.Thread(target=run_pgbench, args=(pgs[0], ), daemon=True)) + threads.append(threading.Thread(target=run_pgbench, args=(pgs[0],), daemon=True)) threads[-1].start() thread_limit = 4 @@ -72,18 +72,18 @@ def test_branching_with_pgbench(neon_simple_env: NeonEnv, threads = [] if ty == "cascade": - env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(i), tenant_id=tenant) + env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(i), tenant_id=tenant) else: - env.neon_cli.create_branch('b{}'.format(i + 1), 'b0', tenant_id=tenant) + env.neon_cli.create_branch("b{}".format(i + 1), "b0", tenant_id=tenant) - pgs.append(env.postgres.create_start('b{}'.format(i + 1), tenant_id=tenant)) + pgs.append(env.postgres.create_start("b{}".format(i + 1), tenant_id=tenant)) - threads.append(threading.Thread(target=run_pgbench, args=(pgs[-1], ), daemon=True)) + threads.append(threading.Thread(target=run_pgbench, args=(pgs[-1],), daemon=True)) threads[-1].start() for thread in 
threads: thread.join() for pg in pgs: - res = pg.safe_psql('SELECT count(*) from pgbench_accounts') - assert res[0] == (100000 * scale, ) + res = pg.safe_psql("SELECT count(*) from pgbench_accounts") + assert res[0] == (100000 * scale,) diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/batch_others/test_broken_timeline.py index b9e5f637ab..b96a7895eb 100644 --- a/test_runner/batch_others/test_broken_timeline.py +++ b/test_runner/batch_others/test_broken_timeline.py @@ -1,12 +1,12 @@ +import concurrent.futures +import os +from contextlib import closing from typing import List, Tuple from uuid import UUID -import pytest -import concurrent.futures -from contextlib import closing -from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres -from fixtures.log_helper import log -import os +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres from fixtures.utils import query_scalar @@ -24,7 +24,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): tenant_id = tenant_id_uuid.hex timeline_id = timeline_id_uuid.hex - pg = env.postgres.create_start(f'main', tenant_id=tenant_id_uuid) + pg = env.postgres.create_start(f"main", tenant_id=tenant_id_uuid) with pg.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'") @@ -42,7 +42,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # Corrupt metadata file on timeline 1 (tenant1, timeline1, pg1) = tenant_timelines[1] metadata_path = "{}/tenants/{}/timelines/{}/metadata".format(env.repo_dir, tenant1, timeline1) - print(f'overwriting metadata file at {metadata_path}') + print(f"overwriting metadata file at {metadata_path}") f = open(metadata_path, "w") f.write("overwritten with garbage!") f.close() @@ -52,17 +52,17 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): (tenant2, timeline2, pg2) = tenant_timelines[2] timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant2, timeline2) for filename in os.listdir(timeline_path): - if filename.startswith('00000'): + if filename.startswith("00000"): # Looks like a layer file. Remove it - os.remove(f'{timeline_path}/{filename}') + os.remove(f"{timeline_path}/{filename}") # Corrupt layer files file on timeline 3 (tenant3, timeline3, pg3) = tenant_timelines[3] timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant3, timeline3) for filename in os.listdir(timeline_path): - if filename.startswith('00000'): + if filename.startswith("00000"): # Looks like a layer file. 
Corrupt it - f = open(f'{timeline_path}/{filename}', "w") + f = open(f"{timeline_path}/{filename}", "w") f.write("overwritten with garbage!") f.close() @@ -77,7 +77,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): (tenant, timeline, pg) = tenant_timelines[n] with pytest.raises(Exception, match="Cannot load local timeline") as err: pg.start() - log.info(f'compute startup failed as expected: {err}') + log.info(f"compute startup failed as expected: {err}") def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): @@ -87,9 +87,10 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: futures = [ - executor.submit(env.neon_cli.create_timeline, - f"test-create-multiple-timelines-{i}", - tenant_id) for i in range(4) + executor.submit( + env.neon_cli.create_timeline, f"test-create-multiple-timelines-{i}", tenant_id + ) + for i in range(4) ] for future in futures: future.result() diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py index cdb577f480..1f5df1c130 100644 --- a/test_runner/batch_others/test_clog_truncate.py +++ b/test_runner/batch_others/test_clog_truncate.py @@ -1,10 +1,9 @@ -import time import os - +import time from contextlib import closing -from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv from fixtures.utils import query_scalar @@ -13,40 +12,40 @@ from fixtures.utils import query_scalar # def test_clog_truncate(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch('test_clog_truncate', 'empty') + env.neon_cli.create_branch("test_clog_truncate", "empty") # set aggressive autovacuum to make sure that truncation will happen config = [ - 'autovacuum_max_workers=10', - 'autovacuum_vacuum_threshold=0', - 'autovacuum_vacuum_insert_threshold=0', - 'autovacuum_vacuum_cost_delay=0', - 'autovacuum_vacuum_cost_limit=10000', - 'autovacuum_naptime =1s', - 'autovacuum_freeze_max_age=100000' + "autovacuum_max_workers=10", + "autovacuum_vacuum_threshold=0", + "autovacuum_vacuum_insert_threshold=0", + "autovacuum_vacuum_cost_delay=0", + "autovacuum_vacuum_cost_limit=10000", + "autovacuum_naptime =1s", + "autovacuum_freeze_max_age=100000", ] - pg = env.postgres.create_start('test_clog_truncate', config_lines=config) - log.info('postgres is running on test_clog_truncate branch') + pg = env.postgres.create_start("test_clog_truncate", config_lines=config) + log.info("postgres is running on test_clog_truncate branch") # Install extension containing function needed for test - pg.safe_psql('CREATE EXTENSION neon_test_utils') + pg.safe_psql("CREATE EXTENSION neon_test_utils") # Consume many xids to advance clog with pg.cursor() as cur: - cur.execute('select test_consume_xids(1000*1000*10);') - log.info('xids consumed') + cur.execute("select test_consume_xids(1000*1000*10);") + log.info("xids consumed") # call a checkpoint to trigger TruncateSubtrans - cur.execute('CHECKPOINT;') + cur.execute("CHECKPOINT;") # ensure WAL flush - cur.execute('select txid_current()') + cur.execute("select txid_current()") log.info(cur.fetchone()) # wait for autovacuum to truncate the pg_xact # XXX Is it worth to add a timeout here? 
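# ---------------------------------------------------------------------------
# Illustrative aside, not part of the patch, answering the XXX above: the
# polling loop below could be bounded so a stuck autovacuum fails the test
# instead of hanging it. `_wait_until_gone` and its arguments are hypothetical
# names used only for this sketch:
import time as _time

def _wait_until_gone(still_exists, timeout_s: float = 120.0, poll_s: float = 5.0) -> None:
    deadline = _time.monotonic() + timeout_s
    while still_exists():
        if _time.monotonic() > deadline:
            raise TimeoutError(f"condition still true after {timeout_s} seconds")
        _time.sleep(poll_s)

# usage sketch: _wait_until_gone(lambda: os.path.isfile(pg_xact_0000_path))
# ---------------------------------------------------------------------------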
- pg_xact_0000_path = os.path.join(pg.pg_xact_dir_path(), '0000') + pg_xact_0000_path = os.path.join(pg.pg_xact_dir_path(), "0000") log.info(f"pg_xact_0000_path = {pg_xact_0000_path}") while os.path.isfile(pg_xact_0000_path): @@ -55,18 +54,18 @@ def test_clog_truncate(neon_simple_env: NeonEnv): # checkpoint to advance latest lsn with pg.cursor() as cur: - cur.execute('CHECKPOINT;') - lsn_after_truncation = query_scalar(cur, 'select pg_current_wal_insert_lsn()') + cur.execute("CHECKPOINT;") + lsn_after_truncation = query_scalar(cur, "select pg_current_wal_insert_lsn()") # create new branch after clog truncation and start a compute node on it - log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}') - env.neon_cli.create_branch('test_clog_truncate_new', - 'test_clog_truncate', - ancestor_start_lsn=lsn_after_truncation) - pg2 = env.postgres.create_start('test_clog_truncate_new') - log.info('postgres is running on test_clog_truncate_new branch') + log.info(f"create branch at lsn_after_truncation {lsn_after_truncation}") + env.neon_cli.create_branch( + "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation + ) + pg2 = env.postgres.create_start("test_clog_truncate_new") + log.info("postgres is running on test_clog_truncate_new branch") # check that new node doesn't contain truncated segment - pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), '0000') + pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), "0000") log.info(f"pg_xact_0000_path_new = {pg_xact_0000_path_new}") assert os.path.isfile(pg_xact_0000_path_new) is False diff --git a/test_runner/batch_others/test_close_fds.py b/test_runner/batch_others/test_close_fds.py index 9521b1bb4a..c7ea37f9c8 100644 --- a/test_runner/batch_others/test_close_fds.py +++ b/test_runner/batch_others/test_close_fds.py @@ -1,18 +1,18 @@ -from contextlib import closing -import shutil -import time -import subprocess import os.path +import shutil +import subprocess +import time +from contextlib import closing from cached_property import threading -from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv def lsof_path() -> str: path_output = shutil.which("lsof") if path_output is None: - raise RuntimeError('lsof not found in PATH') + raise RuntimeError("lsof not found in PATH") else: return path_output @@ -36,16 +36,18 @@ def test_lsof_pageserver_pid(neon_simple_env: NeonEnv): path = os.path.join(env.repo_dir, "pageserver.pid") lsof = lsof_path() while workload_thread.is_alive(): - res = subprocess.run([lsof, path], - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + res = subprocess.run( + [lsof, path], + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) # parse the `lsof` command's output to get only the list of commands - commands = [line.split(' ')[0] for line in res.stdout.strip().split('\n')[1:]] + commands = [line.split(" ")[0] for line in res.stdout.strip().split("\n")[1:]] if len(commands) > 0: log.info(f"lsof commands: {commands}") - assert commands == ['pageserve'] + assert commands == ["pageserve"] time.sleep(1.0) diff --git a/test_runner/batch_others/test_config.py b/test_runner/batch_others/test_config.py index 51deeebeed..3477d96b89 100644 --- a/test_runner/batch_others/test_config.py +++ b/test_runner/batch_others/test_config.py @@ -1,7 +1,7 @@ from contextlib import closing -from fixtures.neon_fixtures import NeonEnv from 
fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv # @@ -12,19 +12,21 @@ def test_config(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_config", "empty") # change config - pg = env.postgres.create_start('test_config', config_lines=['log_min_messages=debug1']) - log.info('postgres is running on test_config branch') + pg = env.postgres.create_start("test_config", config_lines=["log_min_messages=debug1"]) + log.info("postgres is running on test_config branch") with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute(''' + cur.execute( + """ SELECT setting FROM pg_settings WHERE source != 'default' AND source != 'override' AND name = 'log_min_messages' - ''') + """ + ) # check that config change was applied - assert cur.fetchone() == ('debug1', ) + assert cur.fetchone() == ("debug1",) diff --git a/test_runner/batch_others/test_crafted_wal_end.py b/test_runner/batch_others/test_crafted_wal_end.py index d1c46fc73a..32e5366945 100644 --- a/test_runner/batch_others/test_crafted_wal_end.py +++ b/test_runner/batch_others/test_crafted_wal_end.py @@ -1,34 +1,38 @@ -from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft -from fixtures.log_helper import log import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft # Restart nodes with WAL end having specially crafted shape, like last record # crossing segment boundary, to test decoding issues. -@pytest.mark.parametrize('wal_type', - [ - 'simple', - 'last_wal_record_xlog_switch', - 'last_wal_record_xlog_switch_ends_on_page_boundary', - 'last_wal_record_crossing_segment', - 'wal_record_crossing_segment_followed_by_small_one', - ]) +@pytest.mark.parametrize( + "wal_type", + [ + "simple", + "last_wal_record_xlog_switch", + "last_wal_record_xlog_switch_ends_on_page_boundary", + "last_wal_record_crossing_segment", + "wal_record_crossing_segment_followed_by_small_one", + ], +) def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_crafted_wal_end') + env.neon_cli.create_branch("test_crafted_wal_end") - pg = env.postgres.create('test_crafted_wal_end') + pg = env.postgres.create("test_crafted_wal_end") wal_craft = WalCraft(env) pg.config(wal_craft.postgres_config()) pg.start() - res = pg.safe_psql_many(queries=[ - 'CREATE TABLE keys(key int primary key)', - 'INSERT INTO keys SELECT generate_series(1, 100)', - 'SELECT SUM(key) FROM keys' - ]) - assert res[-1][0] == (5050, ) + res = pg.safe_psql_many( + queries=[ + "CREATE TABLE keys(key int primary key)", + "INSERT INTO keys SELECT generate_series(1, 100)", + "SELECT SUM(key) FROM keys", + ] + ) + assert res[-1][0] == (5050,) wal_craft.in_existing(wal_type, pg.connstr()) @@ -39,13 +43,15 @@ def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): env.pageserver.start() log.info("Trying more queries") - res = pg.safe_psql_many(queries=[ - 'SELECT SUM(key) FROM keys', - 'INSERT INTO keys SELECT generate_series(101, 200)', - 'SELECT SUM(key) FROM keys', - ]) - assert res[0][0] == (5050, ) - assert res[-1][0] == (20100, ) + res = pg.safe_psql_many( + queries=[ + "SELECT SUM(key) FROM keys", + "INSERT INTO keys SELECT generate_series(101, 200)", + "SELECT SUM(key) FROM keys", + ] + ) + assert res[0][0] == (5050,) + assert res[-1][0] == (20100,) log.info("Restarting all safekeepers and pageservers (again)") env.pageserver.stop() @@ -54,10 +60,12 @@ def 
test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): env.pageserver.start() log.info("Trying more queries (again)") - res = pg.safe_psql_many(queries=[ - 'SELECT SUM(key) FROM keys', - 'INSERT INTO keys SELECT generate_series(201, 300)', - 'SELECT SUM(key) FROM keys', - ]) - assert res[0][0] == (20100, ) - assert res[-1][0] == (45150, ) + res = pg.safe_psql_many( + queries=[ + "SELECT SUM(key) FROM keys", + "INSERT INTO keys SELECT generate_series(201, 300)", + "SELECT SUM(key) FROM keys", + ] + ) + assert res[0][0] == (20100,) + assert res[-1][0] == (45150,) diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/batch_others/test_createdropdb.py index 0fbf6e2a47..fdb704ff15 100644 --- a/test_runner/batch_others/test_createdropdb.py +++ b/test_runner/batch_others/test_createdropdb.py @@ -1,9 +1,9 @@ import os import pathlib - from contextlib import closing -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content + from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.utils import query_scalar @@ -12,35 +12,37 @@ from fixtures.utils import query_scalar # def test_createdb(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch('test_createdb', 'empty') + env.neon_cli.create_branch("test_createdb", "empty") - pg = env.postgres.create_start('test_createdb') + pg = env.postgres.create_start("test_createdb") log.info("postgres is running on 'test_createdb' branch") with pg.cursor() as cur: # Cause a 'relmapper' change in the original branch - cur.execute('VACUUM FULL pg_class') + cur.execute("VACUUM FULL pg_class") - cur.execute('CREATE DATABASE foodb') + cur.execute("CREATE DATABASE foodb") - lsn = query_scalar(cur, 'SELECT pg_current_wal_insert_lsn()') + lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create a branch - env.neon_cli.create_branch('test_createdb2', 'test_createdb', ancestor_start_lsn=lsn) - pg2 = env.postgres.create_start('test_createdb2') + env.neon_cli.create_branch("test_createdb2", "test_createdb", ancestor_start_lsn=lsn) + pg2 = env.postgres.create_start("test_createdb2") # Test that you can connect to the new database on both branches for db in (pg, pg2): - with db.cursor(dbname='foodb') as cur: + with db.cursor(dbname="foodb") as cur: # Check database size in both branches - cur.execute(""" + cur.execute( + """ select pg_size_pretty(pg_database_size('foodb')), pg_size_pretty( sum(pg_relation_size(oid, 'main')) +sum(pg_relation_size(oid, 'vm')) +sum(pg_relation_size(oid, 'fsm')) ) FROM pg_class where relisshared is false - """) + """ + ) res = cur.fetchone() assert res is not None # check that dbsize equals sum of all relation sizes, excluding shared ones @@ -53,48 +55,48 @@ def test_createdb(neon_simple_env: NeonEnv): # def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch('test_dropdb', 'empty') - pg = env.postgres.create_start('test_dropdb') + env.neon_cli.create_branch("test_dropdb", "empty") + pg = env.postgres.create_start("test_dropdb") log.info("postgres is running on 'test_dropdb' branch") with pg.cursor() as cur: - cur.execute('CREATE DATABASE foodb') + cur.execute("CREATE DATABASE foodb") - lsn_before_drop = query_scalar(cur, 'SELECT pg_current_wal_insert_lsn()') + lsn_before_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") dboid = query_scalar(cur, "SELECT oid FROM pg_database WHERE datname='foodb';") with pg.cursor() as cur: - 
cur.execute('DROP DATABASE foodb') + cur.execute("DROP DATABASE foodb") - cur.execute('CHECKPOINT') + cur.execute("CHECKPOINT") - lsn_after_drop = query_scalar(cur, 'SELECT pg_current_wal_insert_lsn()') + lsn_after_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create two branches before and after database drop. - env.neon_cli.create_branch('test_before_dropdb', - 'test_dropdb', - ancestor_start_lsn=lsn_before_drop) - pg_before = env.postgres.create_start('test_before_dropdb') + env.neon_cli.create_branch( + "test_before_dropdb", "test_dropdb", ancestor_start_lsn=lsn_before_drop + ) + pg_before = env.postgres.create_start("test_before_dropdb") - env.neon_cli.create_branch('test_after_dropdb', - 'test_dropdb', - ancestor_start_lsn=lsn_after_drop) - pg_after = env.postgres.create_start('test_after_dropdb') + env.neon_cli.create_branch( + "test_after_dropdb", "test_dropdb", ancestor_start_lsn=lsn_after_drop + ) + pg_after = env.postgres.create_start("test_after_dropdb") # Test that database exists on the branch before drop - pg_before.connect(dbname='foodb').close() + pg_before.connect(dbname="foodb").close() # Test that database subdir exists on the branch before drop assert pg_before.pgdata_dir - dbpath = pathlib.Path(pg_before.pgdata_dir) / 'base' / str(dboid) + dbpath = pathlib.Path(pg_before.pgdata_dir) / "base" / str(dboid) log.info(dbpath) assert os.path.isdir(dbpath) == True # Test that database subdir doesn't exist on the branch after drop assert pg_after.pgdata_dir - dbpath = pathlib.Path(pg_after.pgdata_dir) / 'base' / str(dboid) + dbpath = pathlib.Path(pg_after.pgdata_dir) / "base" / str(dboid) log.info(dbpath) assert os.path.isdir(dbpath) == False diff --git a/test_runner/batch_others/test_createuser.py b/test_runner/batch_others/test_createuser.py index d48db05395..c5f8246f5b 100644 --- a/test_runner/batch_others/test_createuser.py +++ b/test_runner/batch_others/test_createuser.py @@ -1,5 +1,5 @@ -from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv from fixtures.utils import query_scalar @@ -8,21 +8,21 @@ from fixtures.utils import query_scalar # def test_createuser(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch('test_createuser', 'empty') - pg = env.postgres.create_start('test_createuser') + env.neon_cli.create_branch("test_createuser", "empty") + pg = env.postgres.create_start("test_createuser") log.info("postgres is running on 'test_createuser' branch") with pg.cursor() as cur: # Cause a 'relmapper' change in the original branch - cur.execute('CREATE USER testuser with password %s', ('testpwd', )) + cur.execute("CREATE USER testuser with password %s", ("testpwd",)) - cur.execute('CHECKPOINT') + cur.execute("CHECKPOINT") - lsn = query_scalar(cur, 'SELECT pg_current_wal_insert_lsn()') + lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create a branch - env.neon_cli.create_branch('test_createuser2', 'test_createuser', ancestor_start_lsn=lsn) - pg2 = env.postgres.create_start('test_createuser2') + env.neon_cli.create_branch("test_createuser2", "test_createuser", ancestor_start_lsn=lsn) + pg2 = env.postgres.create_start("test_createuser2") # Test that you can connect to new branch as a new user - assert pg2.safe_psql('select current_user', user='testuser') == [('testuser', )] + assert pg2.safe_psql("select current_user", user="testuser") == [("testuser",)] diff --git a/test_runner/batch_others/test_fsm_truncate.py 
b/test_runner/batch_others/test_fsm_truncate.py index 0f85942598..54ad2ffa34 100644 --- a/test_runner/batch_others/test_fsm_truncate.py +++ b/test_runner/batch_others/test_fsm_truncate.py @@ -1,11 +1,12 @@ +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient -import pytest def test_fsm_truncate(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_fsm_truncate") - pg = env.postgres.create_start('test_fsm_truncate') + pg = env.postgres.create_start("test_fsm_truncate") pg.safe_psql( - 'CREATE TABLE t1(key int); CREATE TABLE t2(key int); TRUNCATE TABLE t1; TRUNCATE TABLE t2;') + "CREATE TABLE t1(key int); CREATE TABLE t2(key int); TRUNCATE TABLE t1; TRUNCATE TABLE t2;" + ) diff --git a/test_runner/batch_others/test_fullbackup.py b/test_runner/batch_others/test_fullbackup.py index bce085c157..8155f52060 100644 --- a/test_runner/batch_others/test_fullbackup.py +++ b/test_runner/batch_others/test_fullbackup.py @@ -1,22 +1,28 @@ -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres -from fixtures.neon_fixtures import pg_distrib_dir import os + +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + PortDistributor, + VanillaPostgres, + pg_distrib_dir, +) from fixtures.utils import query_scalar, subprocess_capture num_rows = 1000 # Ensure that regular postgres can start from fullbackup -def test_fullbackup(neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, - port_distributor: PortDistributor): +def test_fullbackup( + neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor +): neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_fullbackup') - pgmain = env.postgres.create_start('test_fullbackup') + env.neon_cli.create_branch("test_fullbackup") + pgmain = env.postgres.create_start("test_fullbackup") log.info("postgres is running on 'test_fullbackup' branch") with pgmain.cursor() as cur: @@ -24,16 +30,18 @@ def test_fullbackup(neon_env_builder: NeonEnvBuilder, # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") - cur.execute(f'''CREATE TABLE tbl AS SELECT 'long string to consume some space' || g - from generate_series(1,{num_rows}) g''') + cur.execute( + f"""CREATE TABLE tbl AS SELECT 'long string to consume some space' || g + from generate_series(1,{num_rows}) g""" + ) cur.execute("CHECKPOINT") - lsn = query_scalar(cur, 'SELECT pg_current_wal_insert_lsn()') + lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") log.info(f"start_backup_lsn = {lsn}") # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. # PgBin sets it automatically, but here we need to pipe psql output to the tar command. 
- psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')} + psql_env = {"LD_LIBRARY_PATH": os.path.join(str(pg_distrib_dir), "lib")} # Get and unpack fullbackup from pageserver restored_dir_path = env.repo_dir / "restored_datadir" @@ -42,13 +50,14 @@ def test_fullbackup(neon_env_builder: NeonEnvBuilder, cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] result_basepath = pg_bin.run_capture(cmd, env=psql_env) tar_output_file = result_basepath + ".stdout" - subprocess_capture(str(env.repo_dir), - ["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)]) + subprocess_capture( + str(env.repo_dir), ["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)] + ) # HACK # fullbackup returns neon specific pg_control and first WAL segment # use resetwal to overwrite it - pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, 'pg_resetwal') + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") cmd = [pg_resetwal_path, "-D", str(restored_dir_path)] pg_bin.run_capture(cmd, env=psql_env) @@ -56,9 +65,11 @@ def test_fullbackup(neon_env_builder: NeonEnvBuilder, port = port_distributor.get_port() with VanillaPostgres(restored_dir_path, pg_bin, port, init=False) as vanilla_pg: # TODO make port an optional argument - vanilla_pg.configure([ - f"port={port}", - ]) + vanilla_pg.configure( + [ + f"port={port}", + ] + ) vanilla_pg.start() - num_rows_found = vanilla_pg.safe_psql('select count(*) from tbl;', user="cloud_admin")[0][0] + num_rows_found = vanilla_pg.safe_psql("select count(*) from tbl;", user="cloud_admin")[0][0] assert num_rows == num_rows_found diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py index d7f6308182..be6b437e30 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/batch_others/test_gc_aggressive.py @@ -1,8 +1,8 @@ import asyncio import random -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres from fixtures.utils import query_scalar # Test configuration @@ -24,7 +24,7 @@ async def update_table(pg: Postgres): while updates_performed < updates_to_perform: updates_performed += 1 id = random.randrange(1, num_rows) - row = await pg_conn.fetchrow(f'UPDATE foo SET counter = counter + 1 WHERE id = {id}') + row = await pg_conn.fetchrow(f"UPDATE foo SET counter = counter + 1 WHERE id = {id}") # Perform aggressive GC with 0 horizon @@ -57,24 +57,26 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() env.neon_cli.create_branch("test_gc_aggressive", "main") - pg = env.postgres.create_start('test_gc_aggressive') - log.info('postgres is running on test_gc_aggressive branch') + pg = env.postgres.create_start("test_gc_aggressive") + log.info("postgres is running on test_gc_aggressive branch") with pg.cursor() as cur: timeline = query_scalar(cur, "SHOW neon.timeline_id") # Create table, and insert the first 100 rows - cur.execute('CREATE TABLE foo (id int, counter int, t text)') - cur.execute(f''' + cur.execute("CREATE TABLE foo (id int, counter int, t text)") + cur.execute( + f""" INSERT INTO foo SELECT g, 0, 'long string to consume some space' || g FROM generate_series(1, {num_rows}) g - ''') - cur.execute('CREATE INDEX ON foo(id)') + """ + ) + cur.execute("CREATE INDEX ON foo(id)") asyncio.run(update_and_gc(env, pg, 
timeline)) - cur.execute('SELECT COUNT(*), SUM(counter) FROM foo') + cur.execute("SELECT COUNT(*), SUM(counter) FROM foo") r = cur.fetchone() assert r is not None assert r == (num_rows, updates_to_perform) diff --git a/test_runner/batch_others/test_import.py b/test_runner/batch_others/test_import.py index 039945e5e4..a2671727f7 100644 --- a/test_runner/batch_others/test_import.py +++ b/test_runner/batch_others/test_import.py @@ -1,17 +1,24 @@ -import re -import pytest -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, Postgres, wait_for_upload, wait_for_last_record_lsn -from fixtures.utils import lsn_from_hex -from uuid import UUID, uuid4 -import os -import tarfile -import shutil -from pathlib import Path import json -from fixtures.utils import subprocess_capture -from fixtures.log_helper import log +import os +import re +import shutil +import tarfile from contextlib import closing -from fixtures.neon_fixtures import pg_distrib_dir +from pathlib import Path +from uuid import UUID, uuid4 + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + Postgres, + pg_distrib_dir, + wait_for_last_record_lsn, + wait_for_upload, +) +from fixtures.utils import lsn_from_hex, subprocess_capture @pytest.mark.timeout(600) @@ -19,9 +26,11 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build # Put data in vanilla pg vanilla_pg.start() vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") - vanilla_pg.safe_psql('''create table t as select 'long string to consume some space' || g - from generate_series(1,300000) g''') - assert vanilla_pg.safe_psql('select count(*) from t') == [(300000, )] + vanilla_pg.safe_psql( + """create table t as select 'long string to consume some space' || g + from generate_series(1,300000) g""" + ) + assert vanilla_pg.safe_psql("select count(*) from t") == [(300000,)] # Take basebackup basebackup_dir = os.path.join(test_output_dir, "basebackup") @@ -29,15 +38,17 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build wal_tar = os.path.join(basebackup_dir, "pg_wal.tar") os.mkdir(basebackup_dir) vanilla_pg.safe_psql("CHECKPOINT") - pg_bin.run([ - "pg_basebackup", - "-F", - "tar", - "-d", - vanilla_pg.connstr(), - "-D", - basebackup_dir, - ]) + pg_bin.run( + [ + "pg_basebackup", + "-F", + "tar", + "-d", + vanilla_pg.connstr(), + "-D", + basebackup_dir, + ] + ) # Make corrupt base tar with missing pg_control unpacked_base = os.path.join(basebackup_dir, "unpacked-base") @@ -45,9 +56,11 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build os.mkdir(unpacked_base, 0o750) subprocess_capture(str(test_output_dir), ["tar", "-xf", base_tar, "-C", unpacked_base]) os.remove(os.path.join(unpacked_base, "global/pg_control")) - subprocess_capture(str(test_output_dir), - ["tar", "-cf", "corrupt-base.tar"] + os.listdir(unpacked_base), - cwd=unpacked_base) + subprocess_capture( + str(test_output_dir), + ["tar", "-cf", "corrupt-base.tar"] + os.listdir(unpacked_base), + cwd=unpacked_base, + ) # Get start_lsn and end_lsn with open(os.path.join(basebackup_dir, "backup_manifest")) as f: @@ -65,24 +78,26 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build env.pageserver.http_client().tenant_create(tenant) def import_tar(base, wal): - env.neon_cli.raw_cli([ - "timeline", - "import", - "--tenant-id", - tenant.hex, - "--timeline-id", - timeline.hex, - "--node-name", - 
node_name, - "--base-lsn", - start_lsn, - "--base-tarfile", - base, - "--end-lsn", - end_lsn, - "--wal-tarfile", - wal, - ]) + env.neon_cli.raw_cli( + [ + "timeline", + "import", + "--tenant-id", + tenant.hex, + "--timeline-id", + timeline.hex, + "--node-name", + node_name, + "--base-lsn", + start_lsn, + "--base-tarfile", + base, + "--end-lsn", + end_lsn, + "--wal-tarfile", + wal, + ] + ) # Importing corrupt backup fails with pytest.raises(Exception): @@ -102,7 +117,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build # Check it worked pg = env.postgres.create_start(node_name, tenant_id=tenant) - assert pg.safe_psql('select count(*) from t') == [(300000, )] + assert pg.safe_psql("select count(*) from t") == [(300000,)] @pytest.mark.timeout(600) @@ -111,8 +126,8 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() - timeline = env.neon_cli.create_branch('test_import_from_pageserver_small') - pg = env.postgres.create_start('test_import_from_pageserver_small') + timeline = env.neon_cli.create_branch("test_import_from_pageserver_small") + pg = env.postgres.create_start("test_import_from_pageserver_small") num_rows = 3000 lsn = _generate_data(num_rows, pg) @@ -129,8 +144,8 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() - timeline = env.neon_cli.create_branch('test_import_from_pageserver_multisegment') - pg = env.postgres.create_start('test_import_from_pageserver_multisegment') + timeline = env.neon_cli.create_branch("test_import_from_pageserver_multisegment") + pg = env.postgres.create_start("test_import_from_pageserver_multisegment") # For `test_import_from_pageserver_multisegment`, we want to make sure that the data # is large enough to create multi-segment files. 
Typically, a segment file's size is @@ -139,8 +154,9 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne num_rows = 30000000 lsn = _generate_data(num_rows, pg) - logical_size = env.pageserver.http_client().timeline_detail( - env.initial_tenant, timeline)['local']['current_logical_size'] + logical_size = env.pageserver.http_client().timeline_detail(env.initial_tenant, timeline)[ + "local" + ]["current_logical_size"] log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB") assert logical_size > 1024**3 # = 1GB @@ -148,7 +164,7 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne # Check if the backup data contains multiple segment files cnt_seg_files = 0 - segfile_re = re.compile('[0-9]+\\.[0-9]+') + segfile_re = re.compile("[0-9]+\\.[0-9]+") with tarfile.open(tar_output_file, "r") as tar_f: for f in tar_f.getnames(): if segfile_re.search(f) is not None: @@ -166,11 +182,13 @@ def _generate_data(num_rows: int, pg: Postgres) -> str: with conn.cursor() as cur: # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") - cur.execute(f'''CREATE TABLE tbl AS SELECT 'long string to consume some space' || g - from generate_series(1,{num_rows}) g''') + cur.execute( + f"""CREATE TABLE tbl AS SELECT 'long string to consume some space' || g + from generate_series(1,{num_rows}) g""" + ) cur.execute("CHECKPOINT") - cur.execute('SELECT pg_current_wal_insert_lsn()') + cur.execute("SELECT pg_current_wal_insert_lsn()") res = cur.fetchone() assert res is not None and isinstance(res[0], str) return res[0] @@ -189,7 +207,7 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')} + psql_env = {"LD_LIBRARY_PATH": os.path.join(str(pg_distrib_dir), "lib")} # Get a fullbackup from pageserver query = f"fullbackup { env.initial_tenant.hex} {timeline.hex} {lsn}" @@ -201,11 +219,11 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel env.postgres.stop_all() env.pageserver.stop() - dir_to_clear = Path(env.repo_dir) / 'tenants' + dir_to_clear = Path(env.repo_dir) / "tenants" shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) - #start the pageserver again + # start the pageserver again env.pageserver.start() # Import using another tenantid, because we use the same pageserver. 
@@ -216,20 +234,22 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel node_name = "import_from_pageserver" client = env.pageserver.http_client() client.tenant_create(tenant) - env.neon_cli.raw_cli([ - "timeline", - "import", - "--tenant-id", - tenant.hex, - "--timeline-id", - timeline.hex, - "--node-name", - node_name, - "--base-lsn", - lsn, - "--base-tarfile", - os.path.join(tar_output_file), - ]) + env.neon_cli.raw_cli( + [ + "timeline", + "import", + "--tenant-id", + tenant.hex, + "--timeline-id", + timeline.hex, + "--node-name", + node_name, + "--base-lsn", + lsn, + "--base-tarfile", + os.path.join(tar_output_file), + ] + ) # Wait for data to land in s3 wait_for_last_record_lsn(client, tenant, timeline, lsn_from_hex(lsn)) @@ -237,7 +257,7 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel # Check it worked pg = env.postgres.create_start(node_name, tenant_id=tenant) - assert pg.safe_psql('select count(*) from tbl') == [(expected_num_rows, )] + assert pg.safe_psql("select count(*) from tbl") == [(expected_num_rows,)] # Take another fullbackup query = f"fullbackup { tenant.hex} {timeline.hex} {lsn}" diff --git a/test_runner/batch_others/test_large_schema.py b/test_runner/batch_others/test_large_schema.py index 18ae0614a9..f14265f6fd 100644 --- a/test_runner/batch_others/test_large_schema.py +++ b/test_runner/batch_others/test_large_schema.py @@ -1,7 +1,8 @@ -import time import os -from fixtures.neon_fixtures import NeonEnvBuilder +import time + from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder # This test creates large number of tables which cause large catalog. @@ -14,7 +15,7 @@ from fixtures.log_helper import log def test_large_schema(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - pg = env.postgres.create_start('main') + pg = env.postgres.create_start("main") conn = pg.connect() cur = conn.cursor() @@ -22,7 +23,7 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): tables = 2 # 10 is too much for debug build partitions = 1000 for i in range(1, tables + 1): - print(f'iteration {i} / {tables}') + print(f"iteration {i} / {tables}") # Restart compute. Restart is actually not strictly needed. # It is done mostly because this test originally tries to model the problem reported by Ketteq. @@ -52,10 +53,10 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): # It's normal that it takes some time for the pageserver to # restart, and for the connection to fail until it does. It # should eventually recover, so retry until it succeeds. 
- print(f'failed: {error}') + print(f"failed: {error}") if retries < max_retries: retries += 1 - print(f'retry {retries} / {max_retries}') + print(f"retry {retries} / {max_retries}") time.sleep(retry_sleep) continue else: @@ -67,7 +68,7 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): for i in range(1, tables + 1): cur.execute(f"SELECT count(*) FROM t_{i}") - assert cur.fetchone() == (partitions, ) + assert cur.fetchone() == (partitions,) cur.execute("set enable_sort=off") cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid") @@ -77,6 +78,6 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant_id, timeline_id) for filename in os.listdir(timeline_path): - if filename.startswith('00000'): - log.info(f'layer {filename} size is {os.path.getsize(timeline_path + filename)}') + if filename.startswith("00000"): + log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}") assert os.path.getsize(timeline_path + filename) < 512_000_000 diff --git a/test_runner/batch_others/test_lsn_mapping.py b/test_runner/batch_others/test_lsn_mapping.py index d8b207135e..4db6951b42 100644 --- a/test_runner/batch_others/test_lsn_mapping.py +++ b/test_runner/batch_others/test_lsn_mapping.py @@ -1,13 +1,13 @@ +import math +import time from contextlib import closing from datetime import timedelta, timezone, tzinfo -import math from uuid import UUID -import psycopg2.extras -import psycopg2.errors -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres -from fixtures.log_helper import log -import time +import psycopg2.errors +import psycopg2.extras +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres from fixtures.utils import query_scalar @@ -18,7 +18,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() - new_timeline_id = env.neon_cli.create_branch('test_lsn_mapping') + new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping") pgmain = env.postgres.create_start("test_lsn_mapping") log.info("postgres is running on 'test_lsn_mapping' branch") @@ -35,7 +35,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): for i in range(1000): cur.execute(f"INSERT INTO foo VALUES({i})") # Get the timestamp at UTC - after_timestamp = query_scalar(cur, 'SELECT clock_timestamp()').replace(tzinfo=None) + after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None) tbl.append([i, after_timestamp]) # Execute one more transaction with synchronous_commit enabled, to flush @@ -47,17 +47,17 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): probe_timestamp = tbl[-1][1] + timedelta(hours=1) result = query_scalar( ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" + f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'", ) - assert result == 'future' + assert result == "future" # timestamp too the far history probe_timestamp = tbl[0][1] - timedelta(hours=10) result = query_scalar( ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" + f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'", ) - assert result == 'past' + assert result == 
"past" # Probe a bunch of timestamps in the valid range for i in range(1, len(tbl), 100): @@ -66,14 +66,14 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Call get_lsn_by_timestamp to get the LSN lsn = query_scalar( ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" + f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'", ) # Launch a new read-only node at that LSN, and check that only the rows # that were supposed to be committed at that point in time are visible. - pg_here = env.postgres.create_start(branch_name='test_lsn_mapping', - node_name='test_lsn_mapping_read', - lsn=lsn) + pg_here = env.postgres.create_start( + branch_name="test_lsn_mapping", node_name="test_lsn_mapping_read", lsn=lsn + ) assert pg_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i pg_here.stop_and_destroy() diff --git a/test_runner/batch_others/test_multixact.py b/test_runner/batch_others/test_multixact.py index dd00066092..635beb16b7 100644 --- a/test_runner/batch_others/test_multixact.py +++ b/test_runner/batch_others/test_multixact.py @@ -1,5 +1,5 @@ -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.utils import query_scalar @@ -11,18 +11,21 @@ from fixtures.utils import query_scalar # def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch('test_multixact', 'empty') - pg = env.postgres.create_start('test_multixact') + env.neon_cli.create_branch("test_multixact", "empty") + pg = env.postgres.create_start("test_multixact") log.info("postgres is running on 'test_multixact' branch") cur = pg.connect().cursor() - cur.execute(''' + cur.execute( + """ CREATE TABLE t1(i int primary key); INSERT INTO t1 select * from generate_series(1, 100); - ''') + """ + ) - next_multixact_id_old = query_scalar(cur, - 'SELECT next_multixact_id FROM pg_control_checkpoint()') + next_multixact_id_old = query_scalar( + cur, "SELECT next_multixact_id FROM pg_control_checkpoint()" + ) # Lock entries using parallel connections in a round-robin fashion. nclients = 20 @@ -40,17 +43,18 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): for i in range(5000): conn = connections[i % nclients] conn.commit() - conn.cursor().execute('select * from t1 for key share') + conn.cursor().execute("select * from t1 for key share") # We have multixacts now. We can close the connections. 
for c in connections: c.close() # force wal flush - cur.execute('checkpoint') + cur.execute("checkpoint") cur.execute( - 'SELECT next_multixact_id, pg_current_wal_insert_lsn() FROM pg_control_checkpoint()') + "SELECT next_multixact_id, pg_current_wal_insert_lsn() FROM pg_control_checkpoint()" + ) res = cur.fetchone() assert res is not None next_multixact_id = res[0] @@ -60,12 +64,13 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): assert int(next_multixact_id) > int(next_multixact_id_old) # Branch at this point - env.neon_cli.create_branch('test_multixact_new', 'test_multixact', ancestor_start_lsn=lsn) - pg_new = env.postgres.create_start('test_multixact_new') + env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn) + pg_new = env.postgres.create_start("test_multixact_new") log.info("postgres is running on 'test_multixact_new' branch") next_multixact_id_new = pg_new.safe_psql( - 'SELECT next_multixact_id FROM pg_control_checkpoint()')[0][0] + "SELECT next_multixact_id FROM pg_control_checkpoint()" + )[0][0] # Check that we restored pg_controlfile correctly assert next_multixact_id_new == next_multixact_id diff --git a/test_runner/batch_others/test_neon_cli.py b/test_runner/batch_others/test_neon_cli.py index 728bc7b894..1acfa72127 100644 --- a/test_runner/batch_others/test_neon_cli.py +++ b/test_runner/batch_others/test_neon_cli.py @@ -1,21 +1,29 @@ import uuid -import requests - -from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient from typing import cast +import requests +from fixtures.neon_fixtures import ( + DEFAULT_BRANCH_NAME, + NeonEnv, + NeonEnvBuilder, + NeonPageserverHttpClient, +) -def helper_compare_timeline_list(pageserver_http_client: NeonPageserverHttpClient, - env: NeonEnv, - initial_tenant: uuid.UUID): + +def helper_compare_timeline_list( + pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv, initial_tenant: uuid.UUID +): """ Compare timelines list returned by CLI and directly via API. Filters out timelines created by other tests. 
""" timelines_api = sorted( - map(lambda t: cast(str, t['timeline_id']), - pageserver_http_client.timeline_list(initial_tenant))) + map( + lambda t: cast(str, t["timeline_id"]), + pageserver_http_client.timeline_list(initial_tenant), + ) + ) timelines_cli = env.neon_cli.list_timelines() assert timelines_cli == env.neon_cli.list_timelines(initial_tenant) @@ -32,12 +40,13 @@ def test_cli_timeline_list(neon_simple_env: NeonEnv): helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a branch for us - main_timeline_id = env.neon_cli.create_branch('test_cli_branch_list_main') + main_timeline_id = env.neon_cli.create_branch("test_cli_branch_list_main") helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a nested branch - nested_timeline_id = env.neon_cli.create_branch('test_cli_branch_list_nested', - 'test_cli_branch_list_main') + nested_timeline_id = env.neon_cli.create_branch( + "test_cli_branch_list_nested", "test_cli_branch_list_main" + ) helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Check that all new branches are visible via CLI @@ -49,7 +58,7 @@ def test_cli_timeline_list(neon_simple_env: NeonEnv): def helper_compare_tenant_list(pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv): tenants = pageserver_http_client.tenant_list() - tenants_api = sorted(map(lambda t: cast(str, t['id']), tenants)) + tenants_api = sorted(map(lambda t: cast(str, t["id"]), tenants)) res = env.neon_cli.list_tenants() tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) @@ -97,7 +106,7 @@ def test_cli_ipv4_listeners(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() # Connect to sk port on v4 loopback - res = requests.get(f'http://127.0.0.1:{env.safekeepers[0].port.http}/v1/status') + res = requests.get(f"http://127.0.0.1:{env.safekeepers[0].port.http}/v1/status") assert res.ok # FIXME Test setup is using localhost:xx in ps config. diff --git a/test_runner/batch_others/test_next_xid.py b/test_runner/batch_others/test_next_xid.py index f8d11a9381..698ea0e1d3 100644 --- a/test_runner/batch_others/test_next_xid.py +++ b/test_runner/batch_others/test_next_xid.py @@ -8,15 +8,15 @@ from fixtures.neon_fixtures import NeonEnvBuilder def test_next_xid(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - pg = env.postgres.create_start('main') + pg = env.postgres.create_start("main") conn = pg.connect() cur = conn.cursor() - cur.execute('CREATE TABLE t(x integer)') + cur.execute("CREATE TABLE t(x integer)") iterations = 32 for i in range(1, iterations + 1): - print(f'iteration {i} / {iterations}') + print(f"iteration {i} / {iterations}") # Kill and restart the pageserver. pg.stop() @@ -38,10 +38,10 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder): # It's normal that it takes some time for the pageserver to # restart, and for the connection to fail until it does. It # should eventually recover, so retry until it succeeds. 
- print(f'failed: {error}') + print(f"failed: {error}") if retries < max_retries: retries += 1 - print(f'retry {retries} / {max_retries}') + print(f"retry {retries} / {max_retries}") time.sleep(retry_sleep) continue else: @@ -51,4 +51,4 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder): conn = pg.connect() cur = conn.cursor() cur.execute("SELECT count(*) FROM t") - assert cur.fetchone() == (iterations, ) + assert cur.fetchone() == (iterations,) diff --git a/test_runner/batch_others/test_normal_work.py b/test_runner/batch_others/test_normal_work.py index 5b25691517..002d697288 100644 --- a/test_runner/batch_others/test_normal_work.py +++ b/test_runner/batch_others/test_normal_work.py @@ -1,33 +1,35 @@ +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient -import pytest def check_tenant(env: NeonEnv, pageserver_http: NeonPageserverHttpClient): tenant_id, timeline_id = env.neon_cli.create_tenant() - pg = env.postgres.create_start('main', tenant_id=tenant_id) + pg = env.postgres.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement - res_1 = pg.safe_psql_many(queries=[ - 'CREATE TABLE t(key int primary key, value text)', - 'INSERT INTO t SELECT generate_series(1,100000), \'payload\'', - 'SELECT sum(key) FROM t', - ]) + res_1 = pg.safe_psql_many( + queries=[ + "CREATE TABLE t(key int primary key, value text)", + "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + "SELECT sum(key) FROM t", + ] + ) - assert res_1[-1][0] == (5000050000, ) + assert res_1[-1][0] == (5000050000,) # TODO check detach on live instance log.info("stopping compute") pg.stop() log.info("compute stopped") pg.start() - res_2 = pg.safe_psql('SELECT sum(key) FROM t') - assert res_2[0] == (5000050000, ) + res_2 = pg.safe_psql("SELECT sum(key) FROM t") + assert res_2[0] == (5000050000,) pg.stop() pageserver_http.tenant_detach(tenant_id) -@pytest.mark.parametrize('num_timelines,num_safekeepers', [(3, 1)]) +@pytest.mark.parametrize("num_timelines,num_safekeepers", [(3, 1)]) def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_safekeepers: int): """ Basic test: diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py index 78a936af19..257913ef3f 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -1,7 +1,7 @@ -from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.log_helper import log -from fixtures.utils import print_gc_result, query_scalar import psycopg2.extras +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import print_gc_result, query_scalar # @@ -19,8 +19,8 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() env.neon_cli.create_branch("test_old_request_lsn", "main") - pg = env.postgres.create_start('test_old_request_lsn') - log.info('postgres is running on test_old_request_lsn branch') + pg = env.postgres.create_start("test_old_request_lsn") + log.info("postgres is running on test_old_request_lsn branch") pg_conn = pg.connect() cur = pg_conn.cursor() @@ -33,25 +33,29 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers. 
- cur.execute('CREATE TABLE foo (id int4 PRIMARY KEY, val int, t text)') - cur.execute(''' + cur.execute("CREATE TABLE foo (id int4 PRIMARY KEY, val int, t text)") + cur.execute( + """ INSERT INTO foo SELECT g, 1, 'long string to consume some space' || g FROM generate_series(1, 100000) g - ''') + """ + ) # Verify that the table is larger than shared_buffers, so that the SELECT below # will cause GetPage requests. - cur.execute(''' + cur.execute( + """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize from pg_settings where name = 'shared_buffers' - ''') + """ + ) row = cur.fetchone() assert row is not None - log.info(f'shared_buffers is {row[0]}, table size {row[1]}') + log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) - cur.execute('VACUUM foo') + cur.execute("VACUUM foo") # Make a lot of updates on a single row, generating a lot of WAL. Trigger # garbage collections so that the page server will remove old page versions. @@ -61,7 +65,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): print_gc_result(row) for j in range(100): - cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;') + cur.execute("UPDATE foo SET val = val + 1 WHERE id = 1;") # All (or at least most of) the updates should've been on the same page, so # that we haven't had to evict any dirty pages for a long time. Now run diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 710b220ae8..5d7619c1b2 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,54 +1,65 @@ -from typing import Optional -from uuid import uuid4, UUID -import pytest -import pathlib import os +import pathlib import subprocess -from fixtures.utils import lsn_from_hex +from typing import Optional +from uuid import UUID, uuid4 + +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, - NeonPageserverHttpClient, NeonPageserverApiException, - wait_until, + NeonPageserverHttpClient, neon_binpath, pg_distrib_dir, + wait_until, ) +from fixtures.utils import lsn_from_hex # test that we cannot override node id after init def test_pageserver_init_node_id(neon_simple_env: NeonEnv): repo_dir = neon_simple_env.repo_dir - pageserver_config = repo_dir / 'pageserver.toml' - pageserver_bin = pathlib.Path(neon_binpath) / 'pageserver' - run_pageserver = lambda args: subprocess.run([str(pageserver_bin), '-D', str(repo_dir), *args], - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + pageserver_config = repo_dir / "pageserver.toml" + pageserver_bin = pathlib.Path(neon_binpath) / "pageserver" + run_pageserver = lambda args: subprocess.run( + [str(pageserver_bin), "-D", str(repo_dir), *args], + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) # remove initial config pageserver_config.unlink() - bad_init = run_pageserver(['--init', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) - assert bad_init.returncode == 1, 'pageserver should not be able to init new config without the node id' + bad_init = run_pageserver(["--init", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']) + assert ( + bad_init.returncode == 1 + ), "pageserver should not be able to init new config without the node id" assert "missing id" in bad_init.stderr - assert not pageserver_config.exists(), 'config file should not be 
created after init error' + assert not pageserver_config.exists(), "config file should not be created after init error" completed_init = run_pageserver( - ['--init', '-c', 'id = 12345', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) - assert completed_init.returncode == 0, 'pageserver should be able to create a new config with the node id given' - assert pageserver_config.exists(), 'config file should be created successfully' + ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] + ) + assert ( + completed_init.returncode == 0 + ), "pageserver should be able to create a new config with the node id given" + assert pageserver_config.exists(), "config file should be created successfully" bad_reinit = run_pageserver( - ['--init', '-c', 'id = 12345', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) - assert bad_reinit.returncode == 1, 'pageserver should not be able to init new config without the node id' + ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] + ) + assert ( + bad_reinit.returncode == 1 + ), "pageserver should not be able to init new config without the node id" assert "already exists, cannot init it" in bad_reinit.stderr - bad_update = run_pageserver(['--update-config', '-c', 'id = 3']) - assert bad_update.returncode == 1, 'pageserver should not allow updating node id' + bad_update = run_pageserver(["--update-config", "-c", "id = 3"]) + assert bad_update.returncode == 1, "pageserver should not allow updating node id" assert "has node id already, it cannot be overridden" in bad_update.stderr @@ -56,12 +67,12 @@ def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID): client.check_status() # check initial tenant is there - assert initial_tenant.hex in {t['id'] for t in client.tenant_list()} + assert initial_tenant.hex in {t["id"] for t in client.tenant_list()} # create new tenant and check it is also there tenant_id = uuid4() client.tenant_create(tenant_id) - assert tenant_id.hex in {t['id'] for t in client.tenant_list()} + assert tenant_id.hex in {t["id"] for t in client.tenant_list()} timelines = client.timeline_list(tenant_id) assert len(timelines) == 0, "initial tenant should not have any timelines" @@ -74,19 +85,21 @@ def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID): assert len(timelines) > 0 # check it is there - assert timeline_id.hex in {b['timeline_id'] for b in client.timeline_list(tenant_id)} + assert timeline_id.hex in {b["timeline_id"] for b in client.timeline_list(tenant_id)} for timeline in timelines: - timeline_id_str = str(timeline['timeline_id']) - timeline_details = client.timeline_detail(tenant_id=tenant_id, - timeline_id=UUID(timeline_id_str), - include_non_incremental_logical_size=True) + timeline_id_str = str(timeline["timeline_id"]) + timeline_details = client.timeline_detail( + tenant_id=tenant_id, + timeline_id=UUID(timeline_id_str), + include_non_incremental_logical_size=True, + ) - assert timeline_details['tenant_id'] == tenant_id.hex - assert timeline_details['timeline_id'] == timeline_id_str + assert timeline_details["tenant_id"] == tenant_id.hex + assert timeline_details["timeline_id"] == timeline_id_str - local_timeline_details = timeline_details.get('local') + local_timeline_details = timeline_details.get("local") assert local_timeline_details is not None - assert local_timeline_details['timeline_state'] == 'Loaded' + assert local_timeline_details["timeline_state"] == "Loaded" def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): @@ -94,32 +107,43 
@@ def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): with env.pageserver.http_client() as client: tenant_id, timeline_id = env.neon_cli.create_tenant() - timeline_details = client.timeline_detail(tenant_id=tenant_id, - timeline_id=timeline_id, - include_non_incremental_logical_size=True) + timeline_details = client.timeline_detail( + tenant_id=tenant_id, timeline_id=timeline_id, include_non_incremental_logical_size=True + ) - assert timeline_details.get('wal_source_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running' - assert timeline_details.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running' - assert timeline_details.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running' + assert ( + timeline_details.get("wal_source_connstr") is None + ), "Should not be able to connect to WAL streaming without PG compute node running" + assert ( + timeline_details.get("last_received_msg_lsn") is None + ), "Should not be able to connect to WAL streaming without PG compute node running" + assert ( + timeline_details.get("last_received_msg_ts") is None + ), "Should not be able to connect to WAL streaming without PG compute node running" -def expect_updated_msg_lsn(client: NeonPageserverHttpClient, - tenant_id: UUID, - timeline_id: UUID, - prev_msg_lsn: Optional[int]) -> int: +def expect_updated_msg_lsn( + client: NeonPageserverHttpClient, + tenant_id: UUID, + timeline_id: UUID, + prev_msg_lsn: Optional[int], +) -> int: timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) # a successful `timeline_details` response must contain the below fields - local_timeline_details = timeline_details['local'] + local_timeline_details = timeline_details["local"] assert "wal_source_connstr" in local_timeline_details.keys() assert "last_received_msg_lsn" in local_timeline_details.keys() assert "last_received_msg_ts" in local_timeline_details.keys() - assert local_timeline_details["last_received_msg_lsn"] is not None, "the last received message's LSN is empty" + assert ( + local_timeline_details["last_received_msg_lsn"] is not None + ), "the last received message's LSN is empty" last_msg_lsn = lsn_from_hex(local_timeline_details["last_received_msg_lsn"]) - assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, \ - f"the last received message's LSN {last_msg_lsn} hasn't been updated \ + assert ( + prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn + ), f"the last received message's LSN {last_msg_lsn} hasn't been updated \ compared to the previous message's LSN {prev_msg_lsn}" return last_msg_lsn @@ -139,15 +163,19 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): # We need to wait here because it's possible that we don't have access to # the latest WAL yet, when the `timeline_detail` API is first called. # See: https://github.com/neondatabase/neon/issues/1768. - lsn = wait_until(number_of_iterations=5, - interval=1, - func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None)) + lsn = wait_until( + number_of_iterations=5, + interval=1, + func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None), + ) # Make a DB modification then expect getting a new WAL receiver's data. 
pg.safe_psql("CREATE TABLE t(key int primary key, value text)") - wait_until(number_of_iterations=5, - interval=1, - func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn)) + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn), + ) def test_pageserver_http_api_client(neon_simple_env: NeonEnv): diff --git a/test_runner/batch_others/test_pageserver_catchup.py b/test_runner/batch_others/test_pageserver_catchup.py index dd24351e17..cba3203591 100644 --- a/test_runner/batch_others/test_pageserver_catchup.py +++ b/test_runner/batch_others/test_pageserver_catchup.py @@ -9,24 +9,27 @@ def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder) neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_pageserver_catchup_while_compute_down') + env.neon_cli.create_branch("test_pageserver_catchup_while_compute_down") # Make shared_buffers large to ensure we won't query pageserver while it is down. - pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down', - config_lines=['shared_buffers=512MB']) + pg = env.postgres.create_start( + "test_pageserver_catchup_while_compute_down", config_lines=["shared_buffers=512MB"] + ) pg_conn = pg.connect() cur = pg_conn.cursor() # Create table, and insert some rows. - cur.execute('CREATE TABLE foo (t text)') - cur.execute(''' + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 10000) g - ''') + """ + ) cur.execute("SELECT count(*) FROM foo") - assert cur.fetchone() == (10000, ) + assert cur.fetchone() == (10000,) # Stop and restart pageserver. This is a more or less graceful shutdown, although # the page server doesn't currently have a shutdown routine so there's no difference @@ -35,11 +38,13 @@ def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder) # insert some more rows # since pageserver is shut down, these will be only on safekeepers - cur.execute(''' + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 10000) g - ''') + """ + ) # stop safekeepers gracefully env.safekeepers[0].stop() @@ -54,11 +59,11 @@ def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder) env.safekeepers[2].start() # restart compute node - pg.stop_and_destroy().create_start('test_pageserver_catchup_while_compute_down') + pg.stop_and_destroy().create_start("test_pageserver_catchup_while_compute_down") # Ensure that basebackup went correct and pageserver returned all data pg_conn = pg.connect() cur = pg_conn.cursor() cur.execute("SELECT count(*) FROM foo") - assert cur.fetchone() == (20000, ) + assert cur.fetchone() == (20000,) diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/batch_others/test_pageserver_restart.py index c656469cb7..e2bd8be9b7 100644 --- a/test_runner/batch_others/test_pageserver_restart.py +++ b/test_runner/batch_others/test_pageserver_restart.py @@ -1,5 +1,5 @@ -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder # Test restarting page server, while safekeeper and compute node keep @@ -7,8 +7,8 @@ from fixtures.log_helper import log def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - 
env.neon_cli.create_branch('test_pageserver_restart') - pg = env.postgres.create_start('test_pageserver_restart') + env.neon_cli.create_branch("test_pageserver_restart") + pg = env.postgres.create_start("test_pageserver_restart") pg_conn = pg.connect() cur = pg_conn.cursor() @@ -17,18 +17,22 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point # of this test. - cur.execute('CREATE TABLE foo (t text)') - cur.execute(''' + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g - ''') + """ + ) # Verify that the table is larger than shared_buffers - cur.execute(''' + cur.execute( + """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize from pg_settings where name = 'shared_buffers' - ''') + """ + ) row = cur.fetchone() assert row is not None log.info(f"shared_buffers is {row[0]}, table size {row[1]}") @@ -49,7 +53,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): cur = pg_conn.cursor() cur.execute("SELECT count(*) FROM foo") - assert cur.fetchone() == (100000, ) + assert cur.fetchone() == (100000,) # Stop the page server by force, and restart it env.pageserver.stop() diff --git a/test_runner/batch_others/test_parallel_copy.py b/test_runner/batch_others/test_parallel_copy.py index 55947fe427..6b7fe4fdda 100644 --- a/test_runner/batch_others/test_parallel_copy.py +++ b/test_runner/batch_others/test_parallel_copy.py @@ -1,7 +1,8 @@ -from io import BytesIO import asyncio -from fixtures.neon_fixtures import NeonEnv, Postgres +from io import BytesIO + from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, Postgres async def repeat_bytes(buf, repetitions: int): @@ -13,7 +14,8 @@ async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str) buf = BytesIO() for i in range(1000): buf.write( - f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode()) + f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode() + ) buf.seek(0) copy_input = repeat_bytes(buf.read(), 5000) @@ -30,7 +32,7 @@ async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str) async def parallel_load_same_table(pg: Postgres, n_parallel: int): workers = [] for worker_id in range(n_parallel): - worker = copy_test_data_to_table(pg, worker_id, f'copytest') + worker = copy_test_data_to_table(pg, worker_id, f"copytest") workers.append(asyncio.create_task(worker)) # await all workers @@ -41,13 +43,13 @@ async def parallel_load_same_table(pg: Postgres, n_parallel: int): def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): env = neon_simple_env env.neon_cli.create_branch("test_parallel_copy", "empty") - pg = env.postgres.create_start('test_parallel_copy') + pg = env.postgres.create_start("test_parallel_copy") log.info("postgres is running on 'test_parallel_copy' branch") # Create test table conn = pg.connect() cur = conn.cursor() - cur.execute(f'CREATE TABLE copytest (i int, t text)') + cur.execute(f"CREATE TABLE copytest (i int, t text)") # Run COPY TO to load the table with parallel connections. 
asyncio.run(parallel_load_same_table(pg, n_parallel)) diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/batch_others/test_pitr_gc.py index d63fc4b584..1fc18ebbc4 100644 --- a/test_runner/batch_others/test_pitr_gc.py +++ b/test_runner/batch_others/test_pitr_gc.py @@ -2,8 +2,8 @@ from contextlib import closing import psycopg2.extras from fixtures.log_helper import log -from fixtures.utils import print_gc_result, query_scalar from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import print_gc_result, query_scalar # @@ -14,10 +14,12 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 # Set pitr interval such that we need to keep the data - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" + neon_env_builder.pageserver_config_override = ( + "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" + ) env = neon_env_builder.init_start() - pgmain = env.postgres.create_start('main') + pgmain = env.postgres.create_start("main") log.info("postgres is running on 'main' branch") main_pg_conn = pgmain.connect() @@ -25,30 +27,32 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): timeline = query_scalar(main_cur, "SHOW neon.timeline_id") # Create table - main_cur.execute('CREATE TABLE foo (t text)') + main_cur.execute("CREATE TABLE foo (t text)") for i in range(10000): - main_cur.execute(''' + main_cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space'; - ''') + """ + ) if i == 99: # keep some early lsn to test branch creation after GC - main_cur.execute('SELECT pg_current_wal_insert_lsn(), txid_current()') + main_cur.execute("SELECT pg_current_wal_insert_lsn(), txid_current()") res = main_cur.fetchone() assert res is not None lsn_a = res[0] xid_a = res[1] - log.info(f'LSN after 100 rows: {lsn_a} xid {xid_a}') + log.info(f"LSN after 100 rows: {lsn_a} xid {xid_a}") - main_cur.execute('SELECT pg_current_wal_insert_lsn(), txid_current()') + main_cur.execute("SELECT pg_current_wal_insert_lsn(), txid_current()") res = main_cur.fetchone() assert res is not None debug_lsn = res[0] debug_xid = res[1] - log.info(f'LSN after 10000 rows: {debug_lsn} xid {debug_xid}') + log.info(f"LSN after 10000 rows: {debug_lsn} xid {debug_xid}") # run GC with closing(env.pageserver.connect()) as psconn: @@ -61,16 +65,16 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # Branch at the point where only 100 rows were inserted # It must have been preserved by PITR setting - env.neon_cli.create_branch('test_pitr_gc_hundred', 'main', ancestor_start_lsn=lsn_a) + env.neon_cli.create_branch("test_pitr_gc_hundred", "main", ancestor_start_lsn=lsn_a) - pg_hundred = env.postgres.create_start('test_pitr_gc_hundred') + pg_hundred = env.postgres.create_start("test_pitr_gc_hundred") # On the 'hundred' branch, we should see only 100 rows hundred_pg_conn = pg_hundred.connect() hundred_cur = hundred_pg_conn.cursor() - hundred_cur.execute('SELECT count(*) FROM foo') - assert hundred_cur.fetchone() == (100, ) + hundred_cur.execute("SELECT count(*) FROM foo") + assert hundred_cur.fetchone() == (100,) # All the rows are visible on the main branch - main_cur.execute('SELECT count(*) FROM foo') - assert main_cur.fetchone() == (10000, ) + main_cur.execute("SELECT count(*) FROM foo") + assert main_cur.fetchone() == (10000,) diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py index 2d9957fc38..dcff177044 100644 --- a/test_runner/batch_others/test_proxy.py 
+++ b/test_runner/batch_others/test_proxy.py @@ -1,25 +1,26 @@ -import pytest import psycopg2 +import pytest def test_proxy_select_1(static_proxy): - static_proxy.safe_psql('select 1', options='project=generic-project-name') + static_proxy.safe_psql("select 1", options="project=generic-project-name") def test_password_hack(static_proxy): - user = 'borat' - password = 'password' - static_proxy.safe_psql(f"create role {user} with login password '{password}'", - options='project=irrelevant') + user = "borat" + password = "password" + static_proxy.safe_psql( + f"create role {user} with login password '{password}'", options="project=irrelevant" + ) # Note the format of `magic`! magic = f"project=irrelevant;{password}" - static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic) + static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) # Must also check that invalid magic won't be accepted. with pytest.raises(psycopg2.errors.OperationalError): magic = "broken" - static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic) + static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) # Pass extra options to the server. @@ -28,8 +29,8 @@ def test_password_hack(static_proxy): # See https://github.com/neondatabase/neon/issues/1287 @pytest.mark.xfail def test_proxy_options(static_proxy): - with static_proxy.connect(options='-cproxytest.option=value') as conn: + with static_proxy.connect(options="-cproxytest.option=value") as conn: with conn.cursor() as cur: - cur.execute('SHOW proxytest.option') + cur.execute("SHOW proxytest.option") value = cur.fetchall()[0][0] - assert value == 'value' + assert value == "value" diff --git a/test_runner/batch_others/test_read_validation.py b/test_runner/batch_others/test_read_validation.py index 4be7af4c10..beaae0351b 100644 --- a/test_runner/batch_others/test_read_validation.py +++ b/test_runner/batch_others/test_read_validation.py @@ -1,14 +1,11 @@ from contextlib import closing -from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log - -from psycopg2.errors import UndefinedTable -from psycopg2.errors import IoError - +from fixtures.neon_fixtures import NeonEnv from fixtures.utils import query_scalar +from psycopg2.errors import IoError, UndefinedTable -pytest_plugins = ("fixtures.neon_fixtures") +pytest_plugins = "fixtures.neon_fixtures" extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"] @@ -47,13 +44,15 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Test table is populated, validating buffer cache") cache_entries = query_scalar( - c, - "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + ) assert cache_entries > 0, "No buffers cached for the test relation" c.execute( - "select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}" - .format(relfilenode)) + "select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}".format( + relfilenode + ) + ) reln = c.fetchone() assert reln is not None @@ -62,21 +61,23 @@ def test_read_validation(neon_simple_env: NeonEnv): c.execute("select clear_buffer_cache()") cache_entries = query_scalar( - c, - "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + ) assert cache_entries == 0, "Failed to clear buffer cache" log.info("Cache is 
clear, reading stale page version") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{}'))" - .format(first[0])) + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{}'))".format( + first[0] + ) + ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn" cache_entries = query_scalar( - c, - "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + ) assert cache_entries == 0, "relation buffers detected after invalidation" log.info("Cache is clear, reading latest page version without cache") @@ -88,8 +89,8 @@ def test_read_validation(neon_simple_env: NeonEnv): assert second == direct_latest, "Failed fetch page at latest lsn" cache_entries = query_scalar( - c, - "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + ) assert cache_entries == 0, "relation buffers detected after invalidation" log.info( @@ -97,8 +98,10 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))" - .format(reln[0], reln[1], reln[2], first[0])) + "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( + reln[0], reln[1], reln[2], first[0] + ) + ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -107,20 +110,24 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, NULL ))" - .format(reln[0], reln[1], reln[2])) + "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, NULL ))".format( + reln[0], reln[1], reln[2] + ) + ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" - c.execute('drop table foo;') + c.execute("drop table foo;") log.info( "Relation dropped, attempting reading stale page version without cache using relation identifiers" ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))" - .format(reln[0], reln[1], reln[2], first[0])) + "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( + reln[0], reln[1], reln[2], first[0] + ) + ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" diff --git a/test_runner/batch_others/test_readonly_node.py b/test_runner/batch_others/test_readonly_node.py index 82fc6329cf..0bd78c62a3 100644 --- a/test_runner/batch_others/test_readonly_node.py +++ b/test_runner/batch_others/test_readonly_node.py @@ -12,81 +12,87 @@ from fixtures.utils import query_scalar # def test_readonly_node(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch('test_readonly_node', 'empty') - pgmain = env.postgres.create_start('test_readonly_node') + env.neon_cli.create_branch("test_readonly_node", "empty") + pgmain = env.postgres.create_start("test_readonly_node") log.info("postgres is running on 'test_readonly_node' branch") main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() # Create table, and insert the first 100 rows - main_cur.execute('CREATE TABLE foo (t text)') + main_cur.execute("CREATE TABLE foo (t text)") - 
main_cur.execute(''' + main_cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100) g - ''') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_a = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info('LSN after 100 rows: ' + lsn_a) + """ + ) + main_cur.execute("SELECT pg_current_wal_insert_lsn()") + lsn_a = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info("LSN after 100 rows: " + lsn_a) # Insert some more rows. (This generates enough WAL to fill a few segments.) - main_cur.execute(''' + main_cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 200000) g - ''') - lsn_b = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info('LSN after 200100 rows: ' + lsn_b) + """ + ) + lsn_b = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info("LSN after 200100 rows: " + lsn_b) # Insert many more rows. This generates enough WAL to fill a few segments. - main_cur.execute(''' + main_cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 200000) g - ''') + """ + ) - lsn_c = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info('LSN after 400100 rows: ' + lsn_c) + lsn_c = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info("LSN after 400100 rows: " + lsn_c) # Create first read-only node at the point where only 100 rows were inserted - pg_hundred = env.postgres.create_start(branch_name='test_readonly_node', - node_name='test_readonly_node_hundred', - lsn=lsn_a) + pg_hundred = env.postgres.create_start( + branch_name="test_readonly_node", node_name="test_readonly_node_hundred", lsn=lsn_a + ) # And another at the point where 200100 rows were inserted - pg_more = env.postgres.create_start(branch_name='test_readonly_node', - node_name='test_readonly_node_more', - lsn=lsn_b) + pg_more = env.postgres.create_start( + branch_name="test_readonly_node", node_name="test_readonly_node_more", lsn=lsn_b + ) # On the 'hundred' node, we should see only 100 rows hundred_pg_conn = pg_hundred.connect() hundred_cur = hundred_pg_conn.cursor() - hundred_cur.execute('SELECT count(*) FROM foo') - assert hundred_cur.fetchone() == (100, ) + hundred_cur.execute("SELECT count(*) FROM foo") + assert hundred_cur.fetchone() == (100,) # On the 'more' node, we should see 100200 rows more_pg_conn = pg_more.connect() more_cur = more_pg_conn.cursor() - more_cur.execute('SELECT count(*) FROM foo') - assert more_cur.fetchone() == (200100, ) + more_cur.execute("SELECT count(*) FROM foo") + assert more_cur.fetchone() == (200100,) # All the rows are visible on the main branch - main_cur.execute('SELECT count(*) FROM foo') - assert main_cur.fetchone() == (400100, ) + main_cur.execute("SELECT count(*) FROM foo") + assert main_cur.fetchone() == (400100,) # Check creating a node at segment boundary - pg = env.postgres.create_start(branch_name='test_readonly_node', - node_name='test_branch_segment_boundary', - lsn='0/3000000') + pg = env.postgres.create_start( + branch_name="test_readonly_node", node_name="test_branch_segment_boundary", lsn="0/3000000" + ) cur = pg.connect().cursor() - cur.execute('SELECT 1') - assert cur.fetchone() == (1, ) + cur.execute("SELECT 1") + assert cur.fetchone() == (1,) # Create node at pre-initdb lsn with pytest.raises(Exception, match="invalid basebackup lsn"): # compute node startup with invalid LSN should fail - 
env.postgres.create_start(branch_name='test_readonly_node', - node_name='test_readonly_node_preinitdb', - lsn='0/42') + env.postgres.create_start( + branch_name="test_readonly_node", node_name="test_readonly_node_preinitdb", lsn="0/42" + ) diff --git a/test_runner/batch_others/test_recovery.py b/test_runner/batch_others/test_recovery.py index 5ba783b802..5220aa6c2e 100644 --- a/test_runner/batch_others/test_recovery.py +++ b/test_runner/batch_others/test_recovery.py @@ -1,11 +1,12 @@ +import json import os import time -import psycopg2.extras -import json from ast import Assert from contextlib import closing -from fixtures.neon_fixtures import NeonEnvBuilder + +import psycopg2.extras from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder # @@ -21,13 +22,15 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): # Check if failpoints enables. Otherwise the test doesn't make sense f = env.neon_cli.pageserver_enabled_features() - assert "failpoints" in f["features"], "Build pageserver with --features=failpoints option to run this test" + assert ( + "failpoints" in f["features"] + ), "Build pageserver with --features=failpoints option to run this test" neon_env_builder.start() # Create a branch for us env.neon_cli.create_branch("test_pageserver_recovery", "main") - pg = env.postgres.create_start('test_pageserver_recovery') + pg = env.postgres.create_start("test_pageserver_recovery") log.info("postgres is running on 'test_pageserver_recovery' branch") connstr = pg.connstr() @@ -62,4 +65,4 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute("select count(*) from foo") - assert cur.fetchone() == (100000, ) + assert cur.fetchone() == (100000,) diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index ca46010dca..974d3402f6 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -1,14 +1,24 @@ # It's possible to run any regular test with the local fs remote storage via # env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... -import shutil, os -from pathlib import Path +import os +import shutil import time +from pathlib import Path from uuid import UUID -from fixtures.neon_fixtures import NeonEnvBuilder, RemoteStorageKind, assert_timeline_local, available_remote_storages, wait_until, wait_for_last_record_lsn, wait_for_upload -from fixtures.log_helper import log -from fixtures.utils import lsn_from_hex, query_scalar + import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + RemoteStorageKind, + assert_timeline_local, + available_remote_storages, + wait_for_last_record_lsn, + wait_for_upload, + wait_until, +) +from fixtures.utils import lsn_from_hex, query_scalar # @@ -28,7 +38,7 @@ import pytest # * queries the specific data, ensuring that it matches the one stored before # # The tests are done for all types of remote storage pageserver supports. 
-@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages()) +@pytest.mark.parametrize("remote_storatge_kind", available_remote_storages()) def test_remote_storage_backup_and_restore( neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind, @@ -39,15 +49,15 @@ def test_remote_storage_backup_and_restore( neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storatge_kind, - test_name='test_remote_storage_backup_and_restore', + test_name="test_remote_storage_backup_and_restore", ) data_id = 1 - data_secret = 'very secret secret' + data_secret = "very secret secret" ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() - pg = env.postgres.create_start('main') + pg = env.postgres.create_start("main") client = env.pageserver.http_client() @@ -58,10 +68,12 @@ def test_remote_storage_backup_and_restore( for checkpoint_number in checkpoint_numbers: with pg.cursor() as cur: - cur.execute(f''' + cur.execute( + f""" CREATE TABLE t{checkpoint_number}(id int primary key, secret text); INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); - ''') + """ + ) current_lsn = lsn_from_hex(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) # wait until pageserver receives that data @@ -70,16 +82,16 @@ def test_remote_storage_backup_and_restore( # run checkpoint manually to be sure that data landed in remote storage env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") - log.info(f'waiting for checkpoint {checkpoint_number} upload') + log.info(f"waiting for checkpoint {checkpoint_number} upload") # wait until pageserver successfully uploaded a checkpoint to remote storage wait_for_upload(client, UUID(tenant_id), UUID(timeline_id), current_lsn) - log.info(f'upload of checkpoint {checkpoint_number} is done') + log.info(f"upload of checkpoint {checkpoint_number} is done") ##### Stop the first pageserver instance, erase all its data env.postgres.stop_all() env.pageserver.stop() - dir_to_clear = Path(env.repo_dir) / 'tenants' + dir_to_clear = Path(env.repo_dir) / "tenants" shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) @@ -100,8 +112,8 @@ def test_remote_storage_backup_and_restore( detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) log.info("Timeline detail with active failpoint: %s", detail) - assert detail['local'] is None - assert detail['remote']['awaits_download'] + assert detail["local"] is None + assert detail["remote"]["awaits_download"] # trigger temporary download files removal env.pageserver.stop() @@ -110,19 +122,24 @@ def test_remote_storage_backup_and_restore( client.tenant_attach(UUID(tenant_id)) log.info("waiting for timeline redownload") - wait_until(number_of_iterations=20, - interval=1, - func=lambda: assert_timeline_local(client, UUID(tenant_id), UUID(timeline_id))) + wait_until( + number_of_iterations=20, + interval=1, + func=lambda: assert_timeline_local(client, UUID(tenant_id), UUID(timeline_id)), + ) detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) - assert detail['local'] is not None + assert detail["local"] is not None log.info("Timeline detail after attach completed: %s", detail) - assert lsn_from_hex(detail['local']['last_record_lsn']) >= current_lsn, 'current db Lsn should should not be less than the one stored on remote storage' - assert not detail['remote']['awaits_download'] + assert ( + lsn_from_hex(detail["local"]["last_record_lsn"]) >= current_lsn + ), "current db Lsn should should not be 
less than the one stored on remote storage" + assert not detail["remote"]["awaits_download"] - pg = env.postgres.create_start('main') + pg = env.postgres.create_start("main") with pg.cursor() as cur: for checkpoint_number in checkpoint_numbers: - assert query_scalar(cur, - f'SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};' - ) == f'{data_secret}|{checkpoint_number}' + assert ( + query_scalar(cur, f"SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};") + == f"{data_secret}|{checkpoint_number}" + ) diff --git a/test_runner/batch_others/test_subxacts.py b/test_runner/batch_others/test_subxacts.py index d06877825e..42234bf535 100644 --- a/test_runner/batch_others/test_subxacts.py +++ b/test_runner/batch_others/test_subxacts.py @@ -1,5 +1,5 @@ -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content # Test subtransactions @@ -11,28 +11,30 @@ from fixtures.log_helper import log def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env env.neon_cli.create_branch("test_subxacts", "empty") - pg = env.postgres.create_start('test_subxacts') + pg = env.postgres.create_start("test_subxacts") log.info("postgres is running on 'test_subxacts' branch") pg_conn = pg.connect() cur = pg_conn.cursor() - cur.execute(''' + cur.execute( + """ CREATE TABLE t1(i int, j int); - ''') + """ + ) - cur.execute('select pg_switch_wal();') + cur.execute("select pg_switch_wal();") # Issue 100 transactions, with 1000 subtransactions in each. for i in range(100): - cur.execute('begin') + cur.execute("begin") for j in range(1000): - cur.execute(f'savepoint sp{j}') - cur.execute(f'insert into t1 values ({i}, {j})') - cur.execute('commit') + cur.execute(f"savepoint sp{j}") + cur.execute(f"insert into t1 values ({i}, {j})") + cur.execute("commit") # force wal flush - cur.execute('checkpoint') + cur.execute("checkpoint") # Check that we can restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/batch_others/test_tenant_conf.py b/test_runner/batch_others/test_tenant_conf.py index d25aad742e..1e09ae8db7 100644 --- a/test_runner/batch_others/test_tenant_conf.py +++ b/test_runner/batch_others/test_tenant_conf.py @@ -1,27 +1,28 @@ from contextlib import closing -import pytest import psycopg2.extras - -from fixtures.neon_fixtures import NeonEnvBuilder +import pytest from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder def test_tenant_config(neon_env_builder: NeonEnvBuilder): # set some non-default global config - neon_env_builder.pageserver_config_override = ''' + neon_env_builder.pageserver_config_override = """ page_cache_size=444; wait_lsn_timeout='111 s'; -tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' +tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" env = neon_env_builder.init_start() """Test per tenant configuration""" - tenant, _ = env.neon_cli.create_tenant(conf={ - 'checkpoint_distance': '20000', - 'gc_period': '30sec', - }) + tenant, _ = env.neon_cli.create_tenant( + conf={ + "checkpoint_distance": "20000", + "gc_period": "30sec", + } + ) - env.neon_cli.create_timeline(f'test_tenant_conf', tenant_id=tenant) + env.neon_cli.create_timeline(f"test_tenant_conf", tenant_id=tenant) pg = env.postgres.create_start( "test_tenant_conf", "main", @@ -36,7 +37,8 @@ 
tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' pscur.execute(f"show {env.initial_tenant.hex}") res = pscur.fetchone() assert all( - i in res.items() for i in { + i in res.items() + for i in { "checkpoint_distance": 10000, "compaction_target_size": 1048576, "compaction_period": 1, @@ -44,8 +46,9 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' "gc_horizon": 67108864, "gc_period": 100, "image_creation_threshold": 3, - "pitr_interval": 2592000 - }.items()) + "pitr_interval": 2592000, + }.items() + ) # check the configuration of the new tenant with closing(env.pageserver.connect()) as psconn: @@ -54,7 +57,8 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' res = pscur.fetchone() log.info(f"res: {res}") assert all( - i in res.items() for i in { + i in res.items() + for i in { "checkpoint_distance": 20000, "compaction_target_size": 1048576, "compaction_period": 1, @@ -62,15 +66,18 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' "gc_horizon": 67108864, "gc_period": 30, "image_creation_threshold": 3, - "pitr_interval": 2592000 - }.items()) + "pitr_interval": 2592000, + }.items() + ) # update the config and ensure that it has changed - env.neon_cli.config_tenant(tenant_id=tenant, - conf={ - 'checkpoint_distance': '15000', - 'gc_period': '80sec', - }) + env.neon_cli.config_tenant( + tenant_id=tenant, + conf={ + "checkpoint_distance": "15000", + "gc_period": "80sec", + }, + ) with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: @@ -78,7 +85,8 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' res = pscur.fetchone() log.info(f"after config res: {res}") assert all( - i in res.items() for i in { + i in res.items() + for i in { "checkpoint_distance": 15000, "compaction_target_size": 1048576, "compaction_period": 1, @@ -86,8 +94,9 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' "gc_horizon": 67108864, "gc_period": 80, "image_creation_threshold": 3, - "pitr_interval": 2592000 - }.items()) + "pitr_interval": 2592000, + }.items() + ) # restart the pageserver and ensure that the config is still correct env.pageserver.stop() @@ -99,7 +108,8 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' res = pscur.fetchone() log.info(f"after restart res: {res}") assert all( - i in res.items() for i in { + i in res.items() + for i in { "checkpoint_distance": 15000, "compaction_target_size": 1048576, "compaction_period": 1, @@ -107,5 +117,6 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' "gc_horizon": 67108864, "gc_period": 80, "image_creation_threshold": 3, - "pitr_interval": 2592000 - }.items()) + "pitr_interval": 2592000, + }.items() + ) diff --git a/test_runner/batch_others/test_tenant_detach.py b/test_runner/batch_others/test_tenant_detach.py index afc4f89bbf..f1b30429bf 100644 --- a/test_runner/batch_others/test_tenant_detach.py +++ b/test_runner/batch_others/test_tenant_detach.py @@ -1,9 +1,9 @@ +import uuid from threading import Thread from uuid import uuid4 -import uuid + import psycopg2 import pytest - from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException @@ -11,7 +11,7 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiExc def do_gc_target(env: NeonEnv, tenant_id: 
uuid.UUID, timeline_id: uuid.UUID): """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: - env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0') + env.pageserver.safe_psql(f"do_gc {tenant_id.hex} {timeline_id.hex} 0") except Exception as e: log.error("do_gc failed: %s", e) @@ -22,8 +22,10 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # first check for non existing tenant tenant_id = uuid4() - with pytest.raises(expected_exception=NeonPageserverApiException, - match=f'Tenant not found for id {tenant_id.hex}'): + with pytest.raises( + expected_exception=NeonPageserverApiException, + match=f"Tenant not found for id {tenant_id.hex}", + ): pageserver_http.tenant_detach(tenant_id) # create new nenant @@ -32,17 +34,20 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # assert tenant exists on disk assert (env.repo_dir / "tenants" / tenant_id.hex).exists() - pg = env.postgres.create_start('main', tenant_id=tenant_id) + pg = env.postgres.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement - pg.safe_psql_many(queries=[ - 'CREATE TABLE t(key int primary key, value text)', - 'INSERT INTO t SELECT generate_series(1,100000), \'payload\'', - ]) + pg.safe_psql_many( + queries=[ + "CREATE TABLE t(key int primary key, value text)", + "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + ] + ) # gc should not try to even start - with pytest.raises(expected_exception=psycopg2.DatabaseError, - match='gc target timeline does not exist'): - env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {uuid4().hex} 0') + with pytest.raises( + expected_exception=psycopg2.DatabaseError, match="gc target timeline does not exist" + ): + env.pageserver.safe_psql(f"do_gc {tenant_id.hex} {uuid4().hex} 0") # try to concurrently run gc and detach gc_thread = Thread(target=lambda: do_gc_target(env, tenant_id, timeline_id)) @@ -67,6 +72,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # check that nothing is left on disk for deleted tenant assert not (env.repo_dir / "tenants" / tenant_id.hex).exists() - with pytest.raises(expected_exception=psycopg2.DatabaseError, - match=f'Tenant {tenant_id.hex} not found'): - env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0') + with pytest.raises( + expected_exception=psycopg2.DatabaseError, match=f"Tenant {tenant_id.hex} not found" + ): + env.pageserver.safe_psql(f"do_gc {tenant_id.hex} {timeline_id.hex} 0") diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index eb65e2e3b5..a30804ee8e 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -34,12 +34,14 @@ def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @contextmanager -def new_pageserver_helper(new_pageserver_dir: pathlib.Path, - pageserver_bin: pathlib.Path, - remote_storage_mock_path: pathlib.Path, - pg_port: int, - http_port: int, - broker: Optional[Etcd]): +def new_pageserver_helper( + new_pageserver_dir: pathlib.Path, + pageserver_bin: pathlib.Path, + remote_storage_mock_path: pathlib.Path, + pg_port: int, + http_port: int, + broker: Optional[Etcd], +): """ cannot use NeonPageserver yet because it depends on neon cli which currently lacks support for multiple pageservers @@ -47,10 +49,10 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, # actually run new pageserver cmd = [ str(pageserver_bin), - 
'--workdir', + "--workdir", str(new_pageserver_dir), - '--daemonize', - '--update-config', + "--daemonize", + "--update-config", f"-c listen_pg_addr='localhost:{pg_port}'", f"-c listen_http_addr='localhost:{http_port}'", f"-c pg_distrib_dir='{pg_distrib_dir}'", @@ -58,7 +60,9 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", ] if broker is not None: - cmd.append(f"-c broker_endpoints=['{broker.client_url()}']", ) + cmd.append( + f"-c broker_endpoints=['{broker.client_url()}']", + ) log.info("starting new pageserver %s", cmd) out = subprocess.check_output(cmd, text=True) @@ -67,7 +71,7 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, yield finally: log.info("stopping new pageserver") - pid = int((new_pageserver_dir / 'pageserver.pid').read_text()) + pid = int((new_pageserver_dir / "pageserver.pid").read_text()) os.kill(pid, signal.SIGQUIT) @@ -105,7 +109,7 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve log.info("successfully recovered %s", inserted_ctr) failed = False load_ok_event.set() - log.info('load thread stopped') + log.info("load thread stopped") def populate_branch( @@ -123,8 +127,10 @@ def populate_branch( cur.execute("SELECT pg_current_wal_flush_lsn()") log.info("pg_current_wal_flush_lsn() %s", lsn_from_hex(cur.fetchone()[0])) - log.info("timeline detail %s", - ps_http.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)) + log.info( + "timeline detail %s", + ps_http.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id), + ) # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -133,7 +139,7 @@ def populate_branch( cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'some payload'") if expected_sum is not None: cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (expected_sum, ) + assert cur.fetchone() == (expected_sum,) cur.execute("SELECT pg_current_wal_flush_lsn()") current_lsn = lsn_from_hex(cur.fetchone()[0]) @@ -166,34 +172,41 @@ def check_timeline_attached( # when load is active these checks can break because lsns are not static # so lets check with some margin - assert_abs_margin_ratio(lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']), - lsn_from_hex(old_timeline_detail['local']['disk_consistent_lsn']), - 0.03) + assert_abs_margin_ratio( + lsn_from_hex(new_timeline_detail["local"]["disk_consistent_lsn"]), + lsn_from_hex(old_timeline_detail["local"]["disk_consistent_lsn"]), + 0.03, + ) - assert_abs_margin_ratio(lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']), - old_current_lsn, - 0.03) + assert_abs_margin_ratio( + lsn_from_hex(new_timeline_detail["local"]["disk_consistent_lsn"]), old_current_lsn, 0.03 + ) -def switch_pg_to_new_pageserver(env: NeonEnv, - pg: Postgres, - new_pageserver_port: int, - tenant_id: UUID, - timeline_id: UUID) -> pathlib.Path: +def switch_pg_to_new_pageserver( + env: NeonEnv, pg: Postgres, new_pageserver_port: int, tenant_id: UUID, timeline_id: UUID +) -> pathlib.Path: pg.stop() pg_config_file_path = pathlib.Path(pg.config_file_path()) - pg_config_file_path.open('a').write( - f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_port}'") + pg_config_file_path.open("a").write( + f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_port}'" + ) pg.start() - timeline_to_detach_local_path = env.repo_dir / 'tenants' / tenant_id.hex / 'timelines' / timeline_id.hex + 
timeline_to_detach_local_path = ( + env.repo_dir / "tenants" / tenant_id.hex / "timelines" / timeline_id.hex + ) files_before_detach = os.listdir(timeline_to_detach_local_path) - assert 'metadata' in files_before_detach, f'Regular timeline {timeline_to_detach_local_path} should have the metadata file,\ - but got: {files_before_detach}' - assert len(files_before_detach) >= 2, f'Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ - but got {files_before_detach}' + assert ( + "metadata" in files_before_detach + ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file,\ + but got: {files_before_detach}" + assert ( + len(files_before_detach) >= 2 + ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ + but got {files_before_detach}" return timeline_to_detach_local_path @@ -202,39 +215,44 @@ def post_migration_check(pg: Postgres, sum_before_migration: int, old_local_path with pg_cur(pg) as cur: # check that data is still there cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (sum_before_migration, ) + assert cur.fetchone() == (sum_before_migration,) # check that we can write new data cur.execute("INSERT INTO t SELECT generate_series(1001,2000), 'some payload'") cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (sum_before_migration + 1500500, ) + assert cur.fetchone() == (sum_before_migration + 1500500,) - assert not os.path.exists(old_local_path), f'After detach, local timeline dir {old_local_path} should be removed' + assert not os.path.exists( + old_local_path + ), f"After detach, local timeline dir {old_local_path} should be removed" @pytest.mark.parametrize( - 'method', + "method", [ # A minor migration involves no storage breaking changes. # It is done by attaching the tenant to a new pageserver. - 'minor', + "minor", # A major migration involves exporting a postgres datadir # basebackup and importing it into the new pageserver. 
# This kind of migration can tolerate breaking changes # to storage format - 'major', - ]) -@pytest.mark.parametrize('with_load', ['with_load', 'without_load']) -def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, - port_distributor: PortDistributor, - test_output_dir, - method: str, - with_load: str): + "major", + ], +) +@pytest.mark.parametrize("with_load", ["with_load", "without_load"]) +def test_tenant_relocation( + neon_env_builder: NeonEnvBuilder, + port_distributor: PortDistributor, + test_output_dir, + method: str, + with_load: str, +): neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() # create folder for remote storage mock - remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage' + remote_storage_mock_path = env.repo_dir / "local_fs_remote_storage" # we use two branches to check that they are both relocated # first branch is used for load, compute for second one is used to @@ -242,12 +260,15 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, pageserver_http = env.pageserver.http_client() - tenant_id, initial_timeline_id = env.neon_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) + tenant_id, initial_timeline_id = env.neon_cli.create_tenant( + UUID("74ee8b079a0e437eb0afea7d26a07209") + ) log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id) - pg_main = env.postgres.create_start(branch_name='test_tenant_relocation_main', - tenant_id=tenant_id) + pg_main = env.postgres.create_start( + branch_name="test_tenant_relocation_main", tenant_id=tenant_id + ) timeline_id_main, current_lsn_main = populate_branch( pg_main, @@ -263,8 +284,9 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, ancestor_start_lsn=lsn_to_hex(current_lsn_main), tenant_id=tenant_id, ) - pg_second = env.postgres.create_start(branch_name='test_tenant_relocation_second', - tenant_id=tenant_id) + pg_second = env.postgres.create_start( + branch_name="test_tenant_relocation_second", tenant_id=tenant_id + ) timeline_id_second, current_lsn_second = populate_branch( pg_second, @@ -281,7 +303,7 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_second, current_lsn_second) timeline_detail_second = assert_timeline_local(pageserver_http, tenant_id, timeline_id_second) - if with_load == 'with_load': + if with_load == "with_load": # create load table with pg_cur(pg_main) as cur: cur.execute("CREATE TABLE load(value text)") @@ -317,22 +339,24 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, log.info("inititalizing new pageserver") # bootstrap second pageserver - new_pageserver_dir = env.repo_dir / 'new_pageserver' + new_pageserver_dir = env.repo_dir / "new_pageserver" new_pageserver_dir.mkdir() new_pageserver_pg_port = port_distributor.get_port() new_pageserver_http_port = port_distributor.get_port() log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port) - pageserver_bin = pathlib.Path(neon_binpath) / 'pageserver' + pageserver_bin = pathlib.Path(neon_binpath) / "pageserver" new_pageserver_http = NeonPageserverHttpClient(port=new_pageserver_http_port, auth_token=None) - with new_pageserver_helper(new_pageserver_dir, - pageserver_bin, - remote_storage_mock_path, - new_pageserver_pg_port, - new_pageserver_http_port, - neon_env_builder.broker): + with new_pageserver_helper( + new_pageserver_dir, + 
pageserver_bin, + remote_storage_mock_path, + new_pageserver_pg_port, + new_pageserver_http_port, + neon_env_builder.broker, + ): # Migrate either by attaching from s3 or import/export basebackup if method == "major": @@ -367,13 +391,16 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, # check that it shows that download is in progress tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id) - assert tenant_status.get('has_in_progress_downloads'), tenant_status + assert tenant_status.get("has_in_progress_downloads"), tenant_status # wait until tenant is downloaded - wait_until(number_of_iterations=10, - interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant( - new_pageserver_http, tenant_id)) + wait_until( + number_of_iterations=10, + interval=1, + func=lambda: assert_no_in_progress_downloads_for_tenant( + new_pageserver_http, tenant_id + ), + ) check_timeline_attached( new_pageserver_http, @@ -392,10 +419,10 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, ) # rewrite neon cli config to use new pageserver for basebackup to start new compute - cli_config_lines = (env.repo_dir / 'config').read_text().splitlines() + cli_config_lines = (env.repo_dir / "config").read_text().splitlines() cli_config_lines[-2] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'" cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'" - (env.repo_dir / 'config').write_text('\n'.join(cli_config_lines)) + (env.repo_dir / "config").write_text("\n".join(cli_config_lines)) old_local_path_main = switch_pg_to_new_pageserver( env, @@ -423,7 +450,8 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, # ensure that we can successfully read all relations on the new pageserver with pg_cur(pg_second) as cur: - cur.execute(''' + cur.execute( + """ DO $$ DECLARE r RECORD; @@ -435,18 +463,19 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname; END LOOP; END$$; - ''') + """ + ) - if with_load == 'with_load': + if with_load == "with_load": assert load_ok_event.wait(3) - log.info('stopping load thread') + log.info("stopping load thread") load_stop_event.set() load_thread.join(timeout=10) - log.info('load thread stopped') + log.info("load thread stopped") # bring old pageserver back for clean shutdown via neon cli # new pageserver will be shut down by the context manager - cli_config_lines = (env.repo_dir / 'config').read_text().splitlines() + cli_config_lines = (env.repo_dir / "config").read_text().splitlines() cli_config_lines[-2] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'" cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'" - (env.repo_dir / 'config').write_text('\n'.join(cli_config_lines)) + (env.repo_dir / "config").write_text("\n".join(cli_config_lines)) diff --git a/test_runner/batch_others/test_tenant_tasks.py b/test_runner/batch_others/test_tenant_tasks.py index fae2a2199d..8075756ffb 100644 --- a/test_runner/batch_others/test_tenant_tasks.py +++ b/test_runner/batch_others/test_tenant_tasks.py @@ -1,6 +1,7 @@ -from fixtures.neon_fixtures import NeonEnvBuilder, wait_until -from uuid import UUID import time +from uuid import UUID + +from fixtures.neon_fixtures import NeonEnvBuilder, wait_until def get_only_element(l): @@ -47,7 +48,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): tenant, _ = env.neon_cli.create_tenant() timeline = env.neon_cli.create_timeline(name, 
tenant_id=tenant) pg = env.postgres.create_start(name, tenant_id=tenant) - assert (get_state(tenant) == "Active") + assert get_state(tenant) == "Active" # Stop compute pg.stop() diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py index 8d73d8185c..0e0cd44471 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/batch_others/test_tenants.py @@ -1,15 +1,15 @@ +import os from contextlib import closing from datetime import datetime -import os -import pytest -from fixtures.neon_fixtures import NeonEnvBuilder +import pytest from fixtures.log_helper import log from fixtures.metrics import parse_metrics +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import lsn_to_hex -@pytest.mark.parametrize('with_safekeepers', [False, True]) +@pytest.mark.parametrize("with_safekeepers", [False, True]) def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): if with_safekeepers: neon_env_builder.num_safekeepers = 3 @@ -19,17 +19,19 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder, with_safekeepers: tenant_1, _ = env.neon_cli.create_tenant() tenant_2, _ = env.neon_cli.create_tenant() - env.neon_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', - tenant_id=tenant_1) - env.neon_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', - tenant_id=tenant_2) + env.neon_cli.create_timeline( + f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", tenant_id=tenant_1 + ) + env.neon_cli.create_timeline( + f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", tenant_id=tenant_2 + ) pg_tenant1 = env.postgres.create_start( - f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', + f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", tenant_id=tenant_1, ) pg_tenant2 = env.postgres.create_start( - f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', + f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", tenant_id=tenant_2, ) @@ -41,7 +43,7 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder, with_safekeepers: cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (5000050000, ) + assert cur.fetchone() == (5000050000,) def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): @@ -51,11 +53,11 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): tenant_1, _ = env.neon_cli.create_tenant() tenant_2, _ = env.neon_cli.create_tenant() - timeline_1 = env.neon_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_1) - timeline_2 = env.neon_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_2) + timeline_1 = env.neon_cli.create_timeline("test_metrics_normal_work", tenant_id=tenant_1) + timeline_2 = env.neon_cli.create_timeline("test_metrics_normal_work", tenant_id=tenant_2) - pg_tenant1 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_1) - pg_tenant2 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_2) + pg_tenant1 = env.postgres.create_start("test_metrics_normal_work", tenant_id=tenant_1) + pg_tenant2 = env.postgres.create_start("test_metrics_normal_work", tenant_id=tenant_2) for pg in [pg_tenant1, pg_tenant2]: with closing(pg.connect()) as conn: @@ -63,29 +65,28 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): 
cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (5000050000, ) + assert cur.fetchone() == (5000050000,) collected_metrics = { "pageserver": env.pageserver.http_client().get_metrics(), } for sk in env.safekeepers: - collected_metrics[f'safekeeper{sk.id}'] = sk.http_client().get_metrics_str() + collected_metrics[f"safekeeper{sk.id}"] = sk.http_client().get_metrics_str() for name in collected_metrics: - basepath = os.path.join(neon_env_builder.repo_dir, f'{name}.metrics') + basepath = os.path.join(neon_env_builder.repo_dir, f"{name}.metrics") - with open(basepath, 'w') as stdout_f: + with open(basepath, "w") as stdout_f: print(collected_metrics[name], file=stdout_f, flush=True) all_metrics = [parse_metrics(m, name) for name, m in collected_metrics.items()] ps_metrics = all_metrics[0] sk_metrics = all_metrics[1:] - ttids = [{ - 'tenant_id': tenant_1.hex, 'timeline_id': timeline_1.hex - }, { - 'tenant_id': tenant_2.hex, 'timeline_id': timeline_2.hex - }] + ttids = [ + {"tenant_id": tenant_1.hex, "timeline_id": timeline_1.hex}, + {"tenant_id": tenant_2.hex, "timeline_id": timeline_2.hex}, + ] # Test metrics per timeline for tt in ttids: @@ -105,7 +106,8 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): log.info(f"Checking common metrics for {metrics.name}") log.info( - f"process_cpu_seconds_total: {metrics.query_one('process_cpu_seconds_total').value}") + f"process_cpu_seconds_total: {metrics.query_one('process_cpu_seconds_total').value}" + ) log.info(f"process_threads: {int(metrics.query_one('process_threads').value)}") log.info( f"process_resident_memory_bytes (MB): {metrics.query_one('process_resident_memory_bytes').value / 1024 / 1024}" diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/batch_others/test_tenants_with_remote_storage.py index 636616a45b..a127693c32 100644 --- a/test_runner/batch_others/test_tenants_with_remote_storage.py +++ b/test_runner/batch_others/test_tenants_with_remote_storage.py @@ -12,8 +12,15 @@ from typing import List, Tuple from uuid import UUID import pytest - -from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres, RemoteStorageKind, available_remote_storages, wait_for_last_record_lsn, wait_for_upload +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + Postgres, + RemoteStorageKind, + available_remote_storages, + wait_for_last_record_lsn, + wait_for_upload, +) from fixtures.utils import lsn_from_hex @@ -28,7 +35,8 @@ async def tenant_workload(env: NeonEnv, pg: Postgres): await pg_conn.execute("CREATE TABLE t(key int primary key, value text)") for i in range(1, 100): await pg_conn.execute( - f"INSERT INTO t SELECT {i}*1000 + g, 'payload' from generate_series(1,1000) g") + f"INSERT INTO t SELECT {i}*1000 + g, 'payload' from generate_series(1,1000) g" + ) # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -46,11 +54,11 @@ async def all_tenants_workload(env: NeonEnv, tenants_pgs): await asyncio.gather(*workers) -@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages()) +@pytest.mark.parametrize("remote_storatge_kind", available_remote_storages()) def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storatge_kind, - test_name='test_tenants_many', + 
test_name="test_tenants_many", ) env = neon_env_builder.init_start() @@ -61,12 +69,13 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: Re # Use a tiny checkpoint distance, to create a lot of layers quickly tenant, _ = env.neon_cli.create_tenant( conf={ - 'checkpoint_distance': '5000000', - }) - env.neon_cli.create_timeline(f'test_tenants_many', tenant_id=tenant) + "checkpoint_distance": "5000000", + } + ) + env.neon_cli.create_timeline(f"test_tenants_many", tenant_id=tenant) pg = env.postgres.create_start( - f'test_tenants_many', + f"test_tenants_many", tenant_id=tenant, ) tenants_pgs.append((tenant, pg)) @@ -77,7 +86,8 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: Re pageserver_http = env.pageserver.http_client() for tenant, pg in tenants_pgs: res = pg.safe_psql_many( - ["SHOW neon.tenant_id", "SHOW neon.timeline_id", "SELECT pg_current_wal_flush_lsn()"]) + ["SHOW neon.tenant_id", "SHOW neon.timeline_id", "SELECT pg_current_wal_flush_lsn()"] + ) tenant_id = res[0][0][0] timeline_id = res[1][0][0] current_lsn = lsn_from_hex(res[2][0][0]) diff --git a/test_runner/batch_others/test_timeline_delete.py b/test_runner/batch_others/test_timeline_delete.py index 594475faf4..7a55ffb769 100644 --- a/test_runner/batch_others/test_timeline_delete.py +++ b/test_runner/batch_others/test_timeline_delete.py @@ -1,6 +1,6 @@ from uuid import uuid4 -import pytest +import pytest from fixtures.neon_fixtures import NeonEnv, NeonPageserverApiException, wait_until @@ -17,44 +17,57 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # for non existing tenant: invalid_tenant_id = uuid4() - with pytest.raises(NeonPageserverApiException, - match=f"Tenant {invalid_tenant_id.hex} not found in local tenant state"): + with pytest.raises( + NeonPageserverApiException, + match=f"Tenant {invalid_tenant_id.hex} not found in local tenant state", + ): ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) # construct pair of branches to validate that pageserver prohibits # deletion of ancestor timelines when they have child branches parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty") - leaf_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_branch1", - "test_ancestor_branch_delete_parent") + leaf_timeline_id = env.neon_cli.create_branch( + "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent" + ) ps_http = env.pageserver.http_client() - with pytest.raises(NeonPageserverApiException, - match="Cannot detach timeline which has child timelines"): + with pytest.raises( + NeonPageserverApiException, match="Cannot detach timeline which has child timelines" + ): - timeline_path = env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / parent_timeline_id.hex + timeline_path = ( + env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / parent_timeline_id.hex + ) assert timeline_path.exists() ps_http.timeline_delete(env.initial_tenant, parent_timeline_id) assert not timeline_path.exists() - timeline_path = env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / leaf_timeline_id.hex + timeline_path = ( + env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / leaf_timeline_id.hex + ) assert timeline_path.exists() # retry deletes when compaction or gc is running in pageserver - wait_until(number_of_iterations=3, - interval=0.2, - func=lambda: ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)) + wait_until( 
+ number_of_iterations=3, + interval=0.2, + func=lambda: ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id), + ) assert not timeline_path.exists() # check 404 - with pytest.raises(NeonPageserverApiException, - match="is not found neither locally nor remotely"): + with pytest.raises( + NeonPageserverApiException, match="is not found neither locally nor remotely" + ): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) # FIXME leaves tenant without timelines, should we prevent deletion of root timeline? - wait_until(number_of_iterations=3, - interval=0.2, - func=lambda: ps_http.timeline_delete(env.initial_tenant, parent_timeline_id)) + wait_until( + number_of_iterations=3, + interval=0.2, + func=lambda: ps_http.timeline_delete(env.initial_tenant, parent_timeline_id), + ) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 4a9359cf43..76342cdf98 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -1,25 +1,33 @@ -from contextlib import closing import math import random -from uuid import UUID import re -import psycopg2.extras -import psycopg2.errors -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local, wait_for_last_flush_lsn -from fixtures.log_helper import log import time +from contextlib import closing +from uuid import UUID +import psycopg2.errors +import psycopg2.extras +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + Postgres, + assert_timeline_local, + wait_for_last_flush_lsn, +) from fixtures.utils import get_timeline_dir_size def test_timeline_size(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch('test_timeline_size', 'empty') + new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") client = env.pageserver.http_client() timeline_details = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - assert timeline_details['local']['current_logical_size'] == timeline_details['local'][ - 'current_logical_size_non_incremental'] + assert ( + timeline_details["local"]["current_logical_size"] + == timeline_details["local"]["current_logical_size_non_incremental"] + ) pgmain = env.postgres.create_start("test_timeline_size") log.info("postgres is running on 'test_timeline_size' branch") @@ -29,32 +37,40 @@ def test_timeline_size(neon_simple_env: NeonEnv): cur.execute("SHOW neon.timeline_id") cur.execute("CREATE TABLE foo (t text)") - cur.execute(""" + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 10) g - """) + """ + ) res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - local_details = res['local'] - assert local_details["current_logical_size"] == local_details[ - "current_logical_size_non_incremental"] + local_details = res["local"] + assert ( + local_details["current_logical_size"] + == local_details["current_logical_size_non_incremental"] + ) cur.execute("TRUNCATE foo") res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - local_details = res['local'] - assert local_details["current_logical_size"] == local_details[ - "current_logical_size_non_incremental"] + local_details = res["local"] + assert ( + local_details["current_logical_size"] + == local_details["current_logical_size_non_incremental"] + ) def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): env = 
neon_simple_env - new_timeline_id = env.neon_cli.create_branch('test_timeline_size', 'empty') + new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") client = env.pageserver.http_client() timeline_details = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - assert timeline_details['local']['current_logical_size'] == timeline_details['local'][ - 'current_logical_size_non_incremental'] + assert ( + timeline_details["local"]["current_logical_size"] + == timeline_details["local"]["current_logical_size_non_incremental"] + ) pgmain = env.postgres.create_start("test_timeline_size") log.info("postgres is running on 'test_timeline_size' branch") @@ -64,32 +80,40 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): cur.execute("SHOW neon.timeline_id") res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - local_details = res['local'] - assert local_details["current_logical_size"] == local_details[ - "current_logical_size_non_incremental"] + local_details = res["local"] + assert ( + local_details["current_logical_size"] + == local_details["current_logical_size_non_incremental"] + ) - cur.execute('CREATE DATABASE foodb') - with closing(pgmain.connect(dbname='foodb')) as conn: + cur.execute("CREATE DATABASE foodb") + with closing(pgmain.connect(dbname="foodb")) as conn: with conn.cursor() as cur2: cur2.execute("CREATE TABLE foo (t text)") - cur2.execute(""" + cur2.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 10) g - """) + """ + ) res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - local_details = res['local'] - assert local_details["current_logical_size"] == local_details[ - "current_logical_size_non_incremental"] + local_details = res["local"] + assert ( + local_details["current_logical_size"] + == local_details["current_logical_size_non_incremental"] + ) - cur.execute('DROP DATABASE foodb') + cur.execute("DROP DATABASE foodb") res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - local_details = res['local'] - assert local_details["current_logical_size"] == local_details[ - "current_logical_size_non_incremental"] + local_details = res["local"] + assert ( + local_details["current_logical_size"] + == local_details["current_logical_size_non_incremental"] + ) # wait until received_lsn_lag is 0 @@ -101,14 +125,17 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 elapsed = time.time() - started_at if elapsed > timeout: raise RuntimeError( - f"timed out waiting for pageserver to reach pg_current_wal_flush_lsn()") + f"timed out waiting for pageserver to reach pg_current_wal_flush_lsn()" + ) - res = pgmain.safe_psql(''' + res = pgmain.safe_psql( + """ SELECT pg_size_pretty(pg_cluster_size()), pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag FROM backpressure_lsns(); - ''')[0] + """ + )[0] log.info(f"pg_cluster_size = {res[0]}, received_lsn_lag = {res[1]}") received_lsn_lag = res[1] @@ -117,17 +144,19 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - new_timeline_id = env.neon_cli.create_branch('test_timeline_size_quota') + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota") client = env.pageserver.http_client() res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - assert 
res['local']["current_logical_size"] == res['local'][ - "current_logical_size_non_incremental"] + assert ( + res["local"]["current_logical_size"] == res["local"]["current_logical_size_non_incremental"] + ) pgmain = env.postgres.create_start( "test_timeline_size_quota", # Set small limit for the test - config_lines=['neon.max_cluster_size=30MB']) + config_lines=["neon.max_cluster_size=30MB"], + ) log.info("postgres is running on 'test_timeline_size_quota' branch") with closing(pgmain.connect()) as conn: @@ -140,19 +169,23 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): # Insert many rows. This query must fail because of space limit try: - cur.execute(''' + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g - ''') + """ + ) wait_for_pageserver_catchup(pgmain) - cur.execute(''' + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 500000) g - ''') + """ + ) # If we get here, the timeline size limit failed log.error("Query unexpectedly succeeded") @@ -162,17 +195,19 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): log.info(f"Query expectedly failed with: {err}") # drop table to free space - cur.execute('DROP TABLE foo') + cur.execute("DROP TABLE foo") wait_for_pageserver_catchup(pgmain) # create it again and insert some rows. This query must succeed cur.execute("CREATE TABLE foo (t text)") - cur.execute(''' + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 10000) g - ''') + """ + ) wait_for_pageserver_catchup(pgmain) @@ -183,15 +218,17 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): def test_timeline_physical_size_init(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_init') + new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init") pg = env.postgres.create_start("test_timeline_physical_size_init") - pg.safe_psql_many([ - "CREATE TABLE foo (t text)", - """INSERT INTO foo + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + """INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 1000) g""", - ]) + ] + ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) @@ -204,15 +241,17 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv): def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_post_checkpoint') + new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_checkpoint") pg = env.postgres.create_start("test_timeline_physical_size_post_checkpoint") - pg.safe_psql_many([ - "CREATE TABLE foo (t text)", - """INSERT INTO foo + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + """INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 1000) g""", - ]) + ] + ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") @@ -223,19 +262,23 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): # Disable background compaction as we don't want it to happen after `get_physical_size` request # and before checking the expected size on 
disk, which makes the assertion failed - neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='10m'}" + neon_env_builder.pageserver_config_override = ( + "tenant_config={checkpoint_distance=100000, compaction_period='10m'}" + ) env = neon_env_builder.init_start() - new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_post_compaction') + new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction") pg = env.postgres.create_start("test_timeline_physical_size_post_compaction") - pg.safe_psql_many([ - "CREATE TABLE foo (t text)", - """INSERT INTO foo + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + """INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g""", - ]) + ] + ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") @@ -247,29 +290,32 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed - neon_env_builder.pageserver_config_override = \ - "tenant_config={checkpoint_distance=100000, compaction_period='10m', gc_period='10m', pitr_interval='1s'}" + neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='10m', gc_period='10m', pitr_interval='1s'}" env = neon_env_builder.init_start() - new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_post_gc') + new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc") pg = env.postgres.create_start("test_timeline_physical_size_post_gc") - pg.safe_psql_many([ - "CREATE TABLE foo (t text)", - """INSERT INTO foo + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + """INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g""", - ]) + ] + ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") - pg.safe_psql(""" + pg.safe_psql( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g - """) + """ + ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") @@ -284,15 +330,17 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): def test_timeline_size_metrics(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch('test_timeline_size_metrics') + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_metrics") pg = env.postgres.create_start("test_timeline_size_metrics") - pg.safe_psql_many([ - "CREATE TABLE foo (t text)", - """INSERT INTO foo + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + """INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g""", - ]) + ] + ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") @@ -302,7 +350,8 @@ def test_timeline_size_metrics(neon_simple_env: NeonEnv): matches = 
re.search( f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$', metrics, - re.MULTILINE) + re.MULTILINE, + ) assert matches tl_physical_size_metric = int(matches.group(1)) @@ -314,7 +363,8 @@ def test_timeline_size_metrics(neon_simple_env: NeonEnv): matches = re.search( f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$', metrics, - re.MULTILINE) + re.MULTILINE, + ) assert matches tl_logical_size_metric = int(matches.group(1)) @@ -341,7 +391,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): def get_timeline_physical_size(timeline: UUID): res = client.timeline_detail(tenant, timeline) - return res['local']['current_physical_size_non_incremental'] + return res["local"]["current_physical_size_non_incremental"] timeline_total_size = get_timeline_physical_size(timeline) for i in range(10): @@ -350,10 +400,12 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): timeline = env.neon_cli.create_branch(f"test_tenant_physical_size_{i}", tenant_id=tenant) pg = env.postgres.create_start(f"test_tenant_physical_size_{i}", tenant_id=tenant) - pg.safe_psql_many([ - "CREATE TABLE foo (t text)", - f"INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, {n_rows}) g", - ]) + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + f"INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, {n_rows}) g", + ] + ) wait_for_last_flush_lsn(env, pg, tenant, timeline) env.pageserver.safe_psql(f"checkpoint {tenant.hex} {timeline.hex}") @@ -362,7 +414,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): pg.stop() - tenant_physical_size = int(client.tenant_status(tenant_id=tenant)['current_physical_size']) + tenant_physical_size = int(client.tenant_status(tenant_id=tenant)["current_physical_size"]) assert tenant_physical_size == timeline_total_size @@ -372,6 +424,8 @@ def assert_physical_size(env: NeonEnv, tenant_id: UUID, timeline_id: UUID): client = env.pageserver.http_client() res = assert_timeline_local(client, tenant_id, timeline_id) timeline_path = env.timeline_dir(tenant_id, timeline_id) - assert res["local"]["current_physical_size"] == res["local"][ - "current_physical_size_non_incremental"] + assert ( + res["local"]["current_physical_size"] + == res["local"]["current_physical_size_non_incremental"] + ) assert res["local"]["current_physical_size"] == get_timeline_dir_size(timeline_path) diff --git a/test_runner/batch_others/test_twophase.py b/test_runner/batch_others/test_twophase.py index 04e3d0b7bc..e01ba7caef 100644 --- a/test_runner/batch_others/test_twophase.py +++ b/test_runner/batch_others/test_twophase.py @@ -1,7 +1,7 @@ import os -from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv # @@ -10,37 +10,37 @@ from fixtures.log_helper import log def test_twophase(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_twophase", "empty") - pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5']) + pg = env.postgres.create_start("test_twophase", config_lines=["max_prepared_transactions=5"]) log.info("postgres is running on 'test_twophase' branch") conn = pg.connect() cur = conn.cursor() - cur.execute('CREATE TABLE foo (t text)') + cur.execute("CREATE TABLE foo (t text)") # Prepare a transaction that will insert a row - cur.execute('BEGIN') + 
cur.execute("BEGIN") cur.execute("INSERT INTO foo VALUES ('one')") cur.execute("PREPARE TRANSACTION 'insert_one'") # Prepare another transaction that will insert a row - cur.execute('BEGIN') + cur.execute("BEGIN") cur.execute("INSERT INTO foo VALUES ('two')") cur.execute("PREPARE TRANSACTION 'insert_two'") # Prepare a transaction that will insert a row - cur.execute('BEGIN') + cur.execute("BEGIN") cur.execute("INSERT INTO foo VALUES ('three')") cur.execute("PREPARE TRANSACTION 'insert_three'") # Prepare another transaction that will insert a row - cur.execute('BEGIN') + cur.execute("BEGIN") cur.execute("INSERT INTO foo VALUES ('four')") cur.execute("PREPARE TRANSACTION 'insert_four'") # On checkpoint state data copied to files in # pg_twophase directory and fsynced - cur.execute('CHECKPOINT') + cur.execute("CHECKPOINT") twophase_files = os.listdir(pg.pg_twophase_dir_path()) log.info(twophase_files) @@ -48,7 +48,7 @@ def test_twophase(neon_simple_env: NeonEnv): cur.execute("COMMIT PREPARED 'insert_three'") cur.execute("ROLLBACK PREPARED 'insert_four'") - cur.execute('CHECKPOINT') + cur.execute("CHECKPOINT") twophase_files = os.listdir(pg.pg_twophase_dir_path()) log.info(twophase_files) @@ -59,8 +59,8 @@ def test_twophase(neon_simple_env: NeonEnv): # Start compute on the new branch pg2 = env.postgres.create_start( - 'test_twophase_prepared', - config_lines=['max_prepared_transactions=5'], + "test_twophase_prepared", + config_lines=["max_prepared_transactions=5"], ) # Check that we restored only needed twophase files @@ -76,9 +76,9 @@ def test_twophase(neon_simple_env: NeonEnv): cur2.execute("COMMIT PREPARED 'insert_one'") cur2.execute("ROLLBACK PREPARED 'insert_two'") - cur2.execute('SELECT * FROM foo') - assert cur2.fetchall() == [('one', ), ('three', )] + cur2.execute("SELECT * FROM foo") + assert cur2.fetchall() == [("one",), ("three",)] # Only one committed insert is visible on the original branch - cur.execute('SELECT * FROM foo') - assert cur.fetchall() == [('three', )] + cur.execute("SELECT * FROM foo") + assert cur.fetchall() == [("three",)] diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/batch_others/test_vm_bits.py index 29b55f5b8c..c147c6dff5 100644 --- a/test_runner/batch_others/test_vm_bits.py +++ b/test_runner/batch_others/test_vm_bits.py @@ -1,5 +1,5 @@ -from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv # @@ -10,48 +10,50 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_vm_bit_clear", "empty") - pg = env.postgres.create_start('test_vm_bit_clear') + pg = env.postgres.create_start("test_vm_bit_clear") log.info("postgres is running on 'test_vm_bit_clear' branch") pg_conn = pg.connect() cur = pg_conn.cursor() # Install extension containing function needed for test - cur.execute('CREATE EXTENSION neon_test_utils') + cur.execute("CREATE EXTENSION neon_test_utils") # Create a test table and freeze it to set the VM bit. 
- cur.execute('CREATE TABLE vmtest_delete (id integer PRIMARY KEY)') - cur.execute('INSERT INTO vmtest_delete VALUES (1)') - cur.execute('VACUUM FREEZE vmtest_delete') + cur.execute("CREATE TABLE vmtest_delete (id integer PRIMARY KEY)") + cur.execute("INSERT INTO vmtest_delete VALUES (1)") + cur.execute("VACUUM FREEZE vmtest_delete") - cur.execute('CREATE TABLE vmtest_update (id integer PRIMARY KEY)') - cur.execute('INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g') - cur.execute('VACUUM FREEZE vmtest_update') + cur.execute("CREATE TABLE vmtest_update (id integer PRIMARY KEY)") + cur.execute("INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g") + cur.execute("VACUUM FREEZE vmtest_update") # DELETE and UPDATE the rows. - cur.execute('DELETE FROM vmtest_delete WHERE id = 1') - cur.execute('UPDATE vmtest_update SET id = 5000 WHERE id = 1') + cur.execute("DELETE FROM vmtest_delete WHERE id = 1") + cur.execute("UPDATE vmtest_update SET id = 5000 WHERE id = 1") # Branch at this point, to test that later env.neon_cli.create_branch("test_vm_bit_clear_new", "test_vm_bit_clear") # Clear the buffer cache, to force the VM page to be re-fetched from # the page server - cur.execute('SELECT clear_buffer_cache()') + cur.execute("SELECT clear_buffer_cache()") # Check that an index-only scan doesn't see the deleted row. If the # clearing of the VM bit was not replayed correctly, this would incorrectly # return deleted row. - cur.execute(''' + cur.execute( + """ set enable_seqscan=off; set enable_indexscan=on; set enable_bitmapscan=off; - ''') + """ + ) - cur.execute('SELECT * FROM vmtest_delete WHERE id = 1') - assert (cur.fetchall() == []) - cur.execute('SELECT * FROM vmtest_update WHERE id = 1') - assert (cur.fetchall() == []) + cur.execute("SELECT * FROM vmtest_delete WHERE id = 1") + assert cur.fetchall() == [] + cur.execute("SELECT * FROM vmtest_update WHERE id = 1") + assert cur.fetchall() == [] cur.close() @@ -61,19 +63,21 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # a dirty VM page is evicted. If the VM bit was not correctly cleared by the # earlier WAL record, the full-page image hides the problem. Starting a new # server at the right point-in-time avoids that full-page image. 
- pg_new = env.postgres.create_start('test_vm_bit_clear_new') + pg_new = env.postgres.create_start("test_vm_bit_clear_new") log.info("postgres is running on 'test_vm_bit_clear_new' branch") pg_new_conn = pg_new.connect() cur_new = pg_new_conn.cursor() - cur_new.execute(''' + cur_new.execute( + """ set enable_seqscan=off; set enable_indexscan=on; set enable_bitmapscan=off; - ''') + """ + ) - cur_new.execute('SELECT * FROM vmtest_delete WHERE id = 1') - assert (cur_new.fetchall() == []) - cur_new.execute('SELECT * FROM vmtest_update WHERE id = 1') - assert (cur_new.fetchall() == []) + cur_new.execute("SELECT * FROM vmtest_delete WHERE id = 1") + assert cur_new.fetchall() == [] + cur_new.execute("SELECT * FROM vmtest_update WHERE id = 1") + assert cur_new.fetchall() == [] diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index d922dd0cb4..7710ef86cd 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -1,42 +1,59 @@ -import pathlib -import pytest -import random -import time import os +import pathlib +import random import shutil import signal import subprocess import sys import threading +import time import uuid - from contextlib import closing from dataclasses import dataclass, field from pathlib import Path -from fixtures.neon_fixtures import NeonPageserver, PgBin, Etcd, Postgres, RemoteStorageKind, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, available_remote_storages, neon_binpath, PgProtocol, wait_for_last_record_lsn, wait_for_upload -from fixtures.utils import get_dir_size, lsn_to_hex, lsn_from_hex, query_scalar -from fixtures.log_helper import log -from typing import List, Optional, Any +from typing import Any, List, Optional from uuid import uuid4 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + Etcd, + NeonEnv, + NeonEnvBuilder, + NeonPageserver, + PgBin, + PgProtocol, + PortDistributor, + Postgres, + RemoteStorageKind, + RemoteStorageUsers, + Safekeeper, + SafekeeperPort, + available_remote_storages, + neon_binpath, + wait_for_last_record_lsn, + wait_for_upload, +) +from fixtures.utils import get_dir_size, lsn_from_hex, lsn_to_hex, query_scalar -def wait_lsn_force_checkpoint(tenant_id: str, - timeline_id: str, - pg: Postgres, - ps: NeonPageserver, - pageserver_conn_options={}): - lsn = lsn_from_hex(pg.safe_psql('SELECT pg_current_wal_flush_lsn()')[0][0]) + +def wait_lsn_force_checkpoint( + tenant_id: str, timeline_id: str, pg: Postgres, ps: NeonPageserver, pageserver_conn_options={} +): + lsn = lsn_from_hex(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) log.info(f"pg_current_wal_flush_lsn is {lsn_to_hex(lsn)}, waiting for it on pageserver") auth_token = None - if 'password' in pageserver_conn_options: - auth_token = pageserver_conn_options['password'] + if "password" in pageserver_conn_options: + auth_token = pageserver_conn_options["password"] # wait for the pageserver to catch up - wait_for_last_record_lsn(ps.http_client(auth_token=auth_token), - uuid.UUID(hex=tenant_id), - uuid.UUID(hex=timeline_id), - lsn) + wait_for_last_record_lsn( + ps.http_client(auth_token=auth_token), + uuid.UUID(hex=tenant_id), + uuid.UUID(hex=timeline_id), + lsn, + ) # force checkpoint to advance remote_consistent_lsn with closing(ps.connect(**pageserver_conn_options)) as psconn: @@ -44,10 +61,12 @@ def wait_lsn_force_checkpoint(tenant_id: str, pscur.execute(f"checkpoint {tenant_id} 
{timeline_id}") # ensure that remote_consistent_lsn is advanced - wait_for_upload(ps.http_client(auth_token=auth_token), - uuid.UUID(hex=tenant_id), - uuid.UUID(hex=timeline_id), - lsn) + wait_for_upload( + ps.http_client(auth_token=auth_token), + uuid.UUID(hex=tenant_id), + uuid.UUID(hex=timeline_id), + lsn, + ) @dataclass @@ -89,7 +108,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): with env.pageserver.http_client() as pageserver_http: timeline_details = [ pageserver_http.timeline_detail( - tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name]) + tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name] + ) for branch_name in branch_names ] # All changes visible to pageserver (last_record_lsn) should be @@ -105,14 +125,14 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): for timeline_detail in timeline_details: timeline_id: str = timeline_detail["timeline_id"] - local_timeline_detail = timeline_detail.get('local') + local_timeline_detail = timeline_detail.get("local") if local_timeline_detail is None: log.debug(f"Timeline {timeline_id} is not present locally, skipping") continue m = TimelineMetrics( timeline_id=timeline_id, - last_record_lsn=lsn_from_hex(local_timeline_detail['last_record_lsn']), + last_record_lsn=lsn_from_hex(local_timeline_detail["last_record_lsn"]), ) for sk_m in sk_metrics: m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)]) @@ -120,14 +140,20 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): # Invariant. May be < when transaction is in progress. - assert commit_lsn <= flush_lsn, f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + assert ( + commit_lsn <= flush_lsn + ), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" # We only call collect_metrics() after a transaction is confirmed by # the compute node, which only happens after a consensus of safekeepers # has confirmed the transaction. We assume majority consensus here. 
- assert (2 * sum(m.last_record_lsn <= lsn - for lsn in m.flush_lsns) > neon_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" - assert (2 * sum(m.last_record_lsn <= lsn - for lsn in m.commit_lsns) > neon_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + assert ( + 2 * sum(m.last_record_lsn <= lsn for lsn in m.flush_lsns) + > neon_env_builder.num_safekeepers + ), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + assert ( + 2 * sum(m.last_record_lsn <= lsn for lsn in m.commit_lsns) + > neon_env_builder.num_safekeepers + ), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" timeline_metrics.append(m) log.info(f"{message}: {timeline_metrics}") return timeline_metrics @@ -155,8 +181,10 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): collect_metrics("during INSERT INTO") time.sleep(1) except: - log.error("MetricsChecker's thread failed, the test will be failed on .stop() call", - exc_info=True) + log.error( + "MetricsChecker's thread failed, the test will be failed on .stop() call", + exc_info=True, + ) # We want to preserve traceback as well as the exception exc_type, exc_value, exc_tb = sys.exc_info() assert exc_type @@ -183,7 +211,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): # Check data for 2/3 timelines for pg in pgs[:-1]: res = pg.safe_psql("SELECT sum(key) FROM t") - assert res[0] == (5000050000, ) + assert res[0] == (5000050000,) final_m = collect_metrics("after SELECT") # Assume that LSNs (a) behave similarly in all timelines; and (b) INSERT INTO alters LSN significantly. @@ -208,8 +236,8 @@ def test_restarts(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = n_acceptors env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_safekeepers_restarts') - pg = env.postgres.create_start('test_safekeepers_restarts') + env.neon_cli.create_branch("test_safekeepers_restarts") + pg = env.postgres.create_start("test_safekeepers_restarts") # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -217,9 +245,9 @@ def test_restarts(neon_env_builder: NeonEnvBuilder): cur = pg_conn.cursor() failed_node = None - cur.execute('CREATE TABLE t(key int primary key, value text)') + cur.execute("CREATE TABLE t(key int primary key, value text)") for i in range(n_inserts): - cur.execute("INSERT INTO t values (%s, 'payload');", (i + 1, )) + cur.execute("INSERT INTO t values (%s, 'payload');", (i + 1,)) if random.random() <= fault_probability: if failed_node is None: @@ -228,7 +256,7 @@ def test_restarts(neon_env_builder: NeonEnvBuilder): else: failed_node.start() failed_node = None - assert query_scalar(cur, 'SELECT sum(key) FROM t') == 500500 + assert query_scalar(cur, "SELECT sum(key) FROM t") == 500500 # Test that safekeepers push their info to the broker and learn peer status from it @@ -238,7 +266,7 @@ def test_broker(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_broker", "main") - pg = env.postgres.create_start('test_broker') + pg = env.postgres.create_start("test_broker") pg.safe_psql("CREATE TABLE t(key int primary key, value text)") # learn neon timeline from compute @@ -260,9 +288,10 @@ def test_broker(neon_env_builder: NeonEnvBuilder): while True: stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] if all( 
- lsn_from_hex(s_after.remote_consistent_lsn) > lsn_from_hex( - s_before.remote_consistent_lsn) for s_after, - s_before in zip(stat_after, stat_before)): + lsn_from_hex(s_after.remote_consistent_lsn) + > lsn_from_hex(s_before.remote_consistent_lsn) + for s_after, s_before in zip(stat_after, stat_before) + ): break elapsed = time.time() - started_at if elapsed > 20: @@ -273,7 +302,7 @@ def test_broker(neon_env_builder: NeonEnvBuilder): # Test that old WAL consumed by peers and pageserver is removed from safekeepers. -@pytest.mark.parametrize('auth_enabled', [False, True]) +@pytest.mark.parametrize("auth_enabled", [False, True]) def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.num_safekeepers = 2 # to advance remote_consistent_lsn @@ -281,16 +310,18 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_safekeepers_wal_removal') - pg = env.postgres.create_start('test_safekeepers_wal_removal') + env.neon_cli.create_branch("test_safekeepers_wal_removal") + pg = env.postgres.create_start("test_safekeepers_wal_removal") # Note: it is important to insert at least two segments, as currently # control file is synced roughly once in segment range and WAL is not # removed until all horizons are persisted. - pg.safe_psql_many([ - 'CREATE TABLE t(key int primary key, value text)', - "INSERT INTO t SELECT generate_series(1,200000), 'payload'", - ]) + pg.safe_psql_many( + [ + "CREATE TABLE t(key int primary key, value text)", + "INSERT INTO t SELECT generate_series(1,200000), 'payload'", + ] + ) tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] @@ -298,12 +329,12 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # force checkpoint to advance remote_consistent_lsn pageserver_conn_options = {} if auth_enabled: - pageserver_conn_options['password'] = env.auth_keys.generate_tenant_token(tenant_id) + pageserver_conn_options["password"] = env.auth_keys.generate_tenant_token(tenant_id) wait_lsn_force_checkpoint(tenant_id, timeline_id, pg, env.pageserver, pageserver_conn_options) # We will wait for first segment removal. Make sure they exist for starter. first_segments = [ - os.path.join(sk.data_dir(), tenant_id, timeline_id, '000000010000000000000001') + os.path.join(sk.data_dir(), tenant_id, timeline_id, "000000010000000000000001") for sk in env.safekeepers ] assert all(os.path.exists(p) for p in first_segments) @@ -312,25 +343,33 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): http_cli = env.safekeepers[0].http_client() else: http_cli = env.safekeepers[0].http_client( - auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + auth_token=env.auth_keys.generate_tenant_token(tenant_id) + ) http_cli_other = env.safekeepers[0].http_client( - auth_token=env.auth_keys.generate_tenant_token(uuid4().hex)) + auth_token=env.auth_keys.generate_tenant_token(uuid4().hex) + ) http_cli_noauth = env.safekeepers[0].http_client() # Pretend WAL is offloaded to s3. 
if auth_enabled: - old_backup_lsn = http_cli.timeline_status(tenant_id=tenant_id, - timeline_id=timeline_id).backup_lsn - assert 'FFFFFFFF/FEFFFFFF' != old_backup_lsn + old_backup_lsn = http_cli.timeline_status( + tenant_id=tenant_id, timeline_id=timeline_id + ).backup_lsn + assert "FFFFFFFF/FEFFFFFF" != old_backup_lsn for cli in [http_cli_other, http_cli_noauth]: - with pytest.raises(cli.HTTPError, match='Forbidden|Unauthorized'): - cli.record_safekeeper_info(tenant_id, - timeline_id, {'backup_lsn': 'FFFFFFFF/FEFFFFFF'}) - assert old_backup_lsn == http_cli.timeline_status(tenant_id=tenant_id, - timeline_id=timeline_id).backup_lsn - http_cli.record_safekeeper_info(tenant_id, timeline_id, {'backup_lsn': 'FFFFFFFF/FEFFFFFF'}) - assert 'FFFFFFFF/FEFFFFFF' == http_cli.timeline_status(tenant_id=tenant_id, - timeline_id=timeline_id).backup_lsn + with pytest.raises(cli.HTTPError, match="Forbidden|Unauthorized"): + cli.record_safekeeper_info( + tenant_id, timeline_id, {"backup_lsn": "FFFFFFFF/FEFFFFFF"} + ) + assert ( + old_backup_lsn + == http_cli.timeline_status(tenant_id=tenant_id, timeline_id=timeline_id).backup_lsn + ) + http_cli.record_safekeeper_info(tenant_id, timeline_id, {"backup_lsn": "FFFFFFFF/FEFFFFFF"}) + assert ( + "FFFFFFFF/FEFFFFFF" + == http_cli.timeline_status(tenant_id=tenant_id, timeline_id=timeline_id).backup_lsn + ) # wait till first segment is removed on all safekeepers started_at = time.time() @@ -355,7 +394,8 @@ def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end): elapsed = time.time() - started_at if elapsed > 30: raise RuntimeError( - f"timed out waiting {elapsed:.0f}s for segment ending at {seg_end} get offloaded") + f"timed out waiting {elapsed:.0f}s for segment ending at {seg_end} get offloaded" + ) time.sleep(0.5) @@ -364,8 +404,9 @@ def wait_wal_trim(tenant_id, timeline_id, sk, target_size): http_cli = sk.http_client() while True: tli_status = http_cli.timeline_status(tenant_id, timeline_id) - sk_wal_size = get_dir_size(os.path.join(sk.data_dir(), tenant_id, - timeline_id)) / 1024 / 1024 + sk_wal_size = ( + get_dir_size(os.path.join(sk.data_dir(), tenant_id, timeline_id)) / 1024 / 1024 + ) log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size:.2f}MB status={tli_status}") if sk_wal_size <= target_size: @@ -379,21 +420,21 @@ def wait_wal_trim(tenant_id, timeline_id, sk, target_size): time.sleep(0.5) -@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages()) +@pytest.mark.parametrize("remote_storatge_kind", available_remote_storages()) def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind): neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storatge_kind, - test_name='test_safekeepers_wal_backup', + test_name="test_safekeepers_wal_backup", ) neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_safekeepers_wal_backup') - pg = env.postgres.create_start('test_safekeepers_wal_backup') + env.neon_cli.create_branch("test_safekeepers_wal_backup") + pg = env.postgres.create_start("test_safekeepers_wal_backup") # learn neon timeline from compute tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] @@ -401,11 +442,11 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: Remo pg_conn = pg.connect() cur = pg_conn.cursor() - cur.execute('create table t(key int, value text)') + cur.execute("create table t(key int, value text)") # Shut down 
subsequently each of safekeepers and fill a segment while sk is # down; ensure segment gets offloaded by others. - offloaded_seg_end = ['0/2000000', '0/3000000', '0/4000000'] + offloaded_seg_end = ["0/2000000", "0/3000000", "0/4000000"] for victim, seg_end in zip(env.safekeepers, offloaded_seg_end): victim.stop() # roughly fills one segment @@ -419,36 +460,36 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: Remo # put one of safekeepers down again env.safekeepers[0].stop() # restart postgres - pg.stop_and_destroy().create_start('test_safekeepers_wal_backup') + pg.stop_and_destroy().create_start("test_safekeepers_wal_backup") # and ensure offloading still works with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute("insert into t select generate_series(1,250000), 'payload'") - wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], '0/5000000') + wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], "0/5000000") -@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages()) +@pytest.mark.parametrize("remote_storatge_kind", available_remote_storages()) def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind): neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storatge_kind, - test_name='test_s3_wal_replay', + test_name="test_s3_wal_replay", ) neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_s3_wal_replay') + env.neon_cli.create_branch("test_s3_wal_replay") env.pageserver.stop() - pageserver_tenants_dir = os.path.join(env.repo_dir, 'tenants') - pageserver_fresh_copy = os.path.join(env.repo_dir, 'tenants_fresh') + pageserver_tenants_dir = os.path.join(env.repo_dir, "tenants") + pageserver_fresh_copy = os.path.join(env.repo_dir, "tenants_fresh") log.info(f"Creating a copy of pageserver in a fresh state at {pageserver_fresh_copy}") shutil.copytree(pageserver_tenants_dir, pageserver_fresh_copy) env.pageserver.start() - pg = env.postgres.create_start('test_s3_wal_replay') + pg = env.postgres.create_start("test_s3_wal_replay") # learn neon timeline from compute tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] @@ -462,7 +503,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: R cur.execute("insert into t values (1, 'payload')") expected_sum += 1 - offloaded_seg_end = ['0/3000000'] + offloaded_seg_end = ["0/3000000"] for seg_end in offloaded_seg_end: # roughly fills two segments cur.execute("insert into t select generate_series(1,500000), 'payload'") @@ -476,28 +517,30 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: R # advance remote_consistent_lsn to trigger WAL trimming # this LSN should be less than commit_lsn, so timeline will be active=true in safekeepers, to push etcd updates env.safekeepers[0].http_client().record_safekeeper_info( - tenant_id, timeline_id, {'remote_consistent_lsn': offloaded_seg_end[-1]}) + tenant_id, timeline_id, {"remote_consistent_lsn": offloaded_seg_end[-1]} + ) for sk in env.safekeepers: # require WAL to be trimmed, so no more than one segment is left on disk wait_wal_trim(tenant_id, timeline_id, sk, 16 * 1.5) - last_lsn = query_scalar(cur, 'SELECT pg_current_wal_flush_lsn()') + last_lsn = query_scalar(cur, "SELECT pg_current_wal_flush_lsn()") pageserver_lsn = env.pageserver.http_client().timeline_detail( - uuid.UUID(tenant_id), 
uuid.UUID((timeline_id)))["local"]["last_record_lsn"] + uuid.UUID(tenant_id), uuid.UUID((timeline_id)) + )["local"]["last_record_lsn"] lag = lsn_from_hex(last_lsn) - lsn_from_hex(pageserver_lsn) log.info( - f'Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb' + f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb" ) # replace pageserver with a fresh copy pg.stop_and_destroy() env.pageserver.stop() - log.info(f'Removing current pageserver state at {pageserver_tenants_dir}') + log.info(f"Removing current pageserver state at {pageserver_tenants_dir}") shutil.rmtree(pageserver_tenants_dir) - log.info(f'Copying fresh pageserver state from {pageserver_fresh_copy}') + log.info(f"Copying fresh pageserver state from {pageserver_fresh_copy}") shutil.move(pageserver_fresh_copy, pageserver_tenants_dir) # start pageserver and wait for replay @@ -509,39 +552,43 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: R while True: elapsed = time.time() - started_at if elapsed > wait_lsn_timeout: - raise RuntimeError(f'Timed out waiting for WAL redo') + raise RuntimeError(f"Timed out waiting for WAL redo") pageserver_lsn = env.pageserver.http_client().timeline_detail( - uuid.UUID(tenant_id), uuid.UUID((timeline_id)))["local"]["last_record_lsn"] + uuid.UUID(tenant_id), uuid.UUID((timeline_id)) + )["local"]["last_record_lsn"] lag = lsn_from_hex(last_lsn) - lsn_from_hex(pageserver_lsn) if time.time() > last_debug_print + 10 or lag <= 0: last_debug_print = time.time() - log.info(f'Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb') + log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb") if lag <= 0: break time.sleep(1) - log.info(f'WAL redo took {elapsed} s') + log.info(f"WAL redo took {elapsed} s") # verify data - pg.create_start('test_s3_wal_replay') + pg.create_start("test_s3_wal_replay") assert pg.safe_psql("select sum(key) from t")[0][0] == expected_sum class ProposerPostgres(PgProtocol): """Object for running postgres without NeonEnv""" - def __init__(self, - pgdata_dir: str, - pg_bin, - timeline_id: uuid.UUID, - tenant_id: uuid.UUID, - listen_addr: str, - port: int): - super().__init__(host=listen_addr, port=port, user='cloud_admin', dbname='postgres') + + def __init__( + self, + pgdata_dir: str, + pg_bin, + timeline_id: uuid.UUID, + tenant_id: uuid.UUID, + listen_addr: str, + port: int, + ): + super().__init__(host=listen_addr, port=port, user="cloud_admin", dbname="postgres") self.pgdata_dir: str = pgdata_dir self.pg_bin: PgBin = pg_bin @@ -551,15 +598,15 @@ class ProposerPostgres(PgProtocol): self.port: int = port def pg_data_dir_path(self) -> str: - """ Path to data directory """ + """Path to data directory""" return self.pgdata_dir def config_file_path(self) -> str: - """ Path to postgresql.conf """ - return os.path.join(self.pgdata_dir, 'postgresql.conf') + """Path to postgresql.conf""" + return os.path.join(self.pgdata_dir, "postgresql.conf") def create_dir_config(self, safekeepers: str): - """ Create dir and config for running --sync-safekeepers """ + """Create dir and config for running --sync-safekeepers""" pathlib.Path(self.pg_data_dir_path()).mkdir(exist_ok=True) with open(self.config_file_path(), "w") as f: @@ -588,36 +635,36 @@ class ProposerPostgres(PgProtocol): } basepath = self.pg_bin.run_capture(command, env) - stdout_filename = basepath + '.stdout' + stdout_filename = basepath + ".stdout" - with 
open(stdout_filename, 'r') as stdout_f: + with open(stdout_filename, "r") as stdout_f: stdout = stdout_f.read() return stdout.strip("\n ") def initdb(self): - """ Run initdb """ + """Run initdb""" args = ["initdb", "-U", "cloud_admin", "-D", self.pg_data_dir_path()] self.pg_bin.run(args) def start(self): - """ Start postgres with pg_ctl """ + """Start postgres with pg_ctl""" log_path = os.path.join(self.pg_data_dir_path(), "pg.log") args = ["pg_ctl", "-D", self.pg_data_dir_path(), "-l", log_path, "-w", "start"] self.pg_bin.run(args) def stop(self): - """ Stop postgres with pg_ctl """ + """Stop postgres with pg_ctl""" args = ["pg_ctl", "-D", self.pg_data_dir_path(), "-m", "immediate", "-w", "stop"] self.pg_bin.run(args) # insert wal in all safekeepers and run sync on proposer -def test_sync_safekeepers(neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, - port_distributor: PortDistributor): +def test_sync_safekeepers( + neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor +): # We don't really need the full environment for this test, just the # safekeepers would be enough. @@ -629,12 +676,9 @@ def test_sync_safekeepers(neon_env_builder: NeonEnvBuilder, # write config for proposer pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata") - pg = ProposerPostgres(pgdata_dir, - pg_bin, - timeline_id, - tenant_id, - '127.0.0.1', - port_distributor.get_port()) + pg = ProposerPostgres( + pgdata_dir, pg_bin, timeline_id, tenant_id, "127.0.0.1", port_distributor.get_port() + ) pg.create_dir_config(env.get_safekeeper_connstrs()) # valid lsn, which is not in the segment start, nor in zero segment @@ -669,13 +713,13 @@ def test_sync_safekeepers(neon_env_builder: NeonEnvBuilder, assert all(lsn_after_sync == lsn for lsn in lsn_after_append) -@pytest.mark.parametrize('auth_enabled', [False, True]) +@pytest.mark.parametrize("auth_enabled", [False, True]) def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_timeline_status') - pg = env.postgres.create_start('test_timeline_status') + env.neon_cli.create_branch("test_timeline_status") + pg = env.postgres.create_start("test_timeline_status") wa = env.safekeepers[0] @@ -690,7 +734,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): wa_http_cli = wa.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) wa_http_cli.check_status() wa_http_cli_bad = wa.http_client( - auth_token=env.auth_keys.generate_tenant_token(uuid4().hex)) + auth_token=env.auth_keys.generate_tenant_token(uuid4().hex) + ) wa_http_cli_bad.check_status() wa_http_cli_noauth = wa.http_client() wa_http_cli_noauth.check_status() @@ -702,7 +747,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): if auth_enabled: for cli in [wa_http_cli_bad, wa_http_cli_noauth]: - with pytest.raises(cli.HTTPError, match='Forbidden|Unauthorized'): + with pytest.raises(cli.HTTPError, match="Forbidden|Unauthorized"): cli.timeline_status(tenant_id, timeline_id) pg.safe_psql("create table t(i int)") @@ -720,19 +765,23 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): class SafekeeperEnv: - def __init__(self, - repo_dir: Path, - port_distributor: PortDistributor, - pg_bin: PgBin, - num_safekeepers: int = 1): + def __init__( + self, + repo_dir: Path, + port_distributor: PortDistributor, + pg_bin: PgBin, + num_safekeepers: int = 1, + ): 
self.repo_dir = repo_dir self.port_distributor = port_distributor - self.broker = Etcd(datadir=os.path.join(self.repo_dir, "etcd"), - port=self.port_distributor.get_port(), - peer_port=self.port_distributor.get_port()) + self.broker = Etcd( + datadir=os.path.join(self.repo_dir, "etcd"), + port=self.port_distributor.get_port(), + peer_port=self.port_distributor.get_port(), + ) self.pg_bin = pg_bin self.num_safekeepers = num_safekeepers - self.bin_safekeeper = os.path.join(str(neon_binpath), 'safekeeper') + self.bin_safekeeper = os.path.join(str(neon_binpath), "safekeeper") self.safekeepers: Optional[List[subprocess.CompletedProcess[Any]]] = None self.postgres: Optional[ProposerPostgres] = None self.tenant_id: Optional[uuid.UUID] = None @@ -778,23 +827,25 @@ class SafekeeperEnv: str(i), "--broker-endpoints", self.broker.client_url(), - "--daemonize" + "--daemonize", ] log.info(f'Running command "{" ".join(args)}"') return subprocess.run(args, check=True) def get_safekeeper_connstrs(self): - return ','.join([sk_proc.args[2] for sk_proc in self.safekeepers]) + return ",".join([sk_proc.args[2] for sk_proc in self.safekeepers]) def create_postgres(self): pgdata_dir = os.path.join(self.repo_dir, "proposer_pgdata") - pg = ProposerPostgres(pgdata_dir, - self.pg_bin, - self.timeline_id, - self.tenant_id, - "127.0.0.1", - self.port_distributor.get_port()) + pg = ProposerPostgres( + pgdata_dir, + self.pg_bin, + self.timeline_id, + self.tenant_id, + "127.0.0.1", + self.port_distributor.get_port(), + ) pg.initdb() pg.create_dir_config(self.get_safekeeper_connstrs()) return pg @@ -811,7 +862,7 @@ class SafekeeperEnv: return self def __exit__(self, exc_type, exc_value, traceback): - log.info('Cleaning up all safekeeper and compute nodes') + log.info("Cleaning up all safekeeper and compute nodes") # Stop all the nodes if self.postgres is not None: @@ -821,9 +872,9 @@ class SafekeeperEnv: self.kill_safekeeper(sk_proc.args[6]) -def test_safekeeper_without_pageserver(test_output_dir: str, - port_distributor: PortDistributor, - pg_bin: PgBin): +def test_safekeeper_without_pageserver( + test_output_dir: str, port_distributor: PortDistributor, pg_bin: PgBin +): # Create the environment in the test-specific output dir repo_dir = Path(os.path.join(test_output_dir, "repo")) @@ -845,19 +896,19 @@ def test_safekeeper_without_pageserver(test_output_dir: str, def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str: - return ','.join([f'localhost:{sk.port.pg}' for sk in env.safekeepers if sk.id in sk_names]) + return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names]) def execute_payload(pg: Postgres): with closing(pg.connect()) as conn: with conn.cursor() as cur: # we rely upon autocommit after each statement # as waiting for acceptors happens there - cur.execute('CREATE TABLE IF NOT EXISTS t(key int, value text)') + cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") cur.execute("INSERT INTO t VALUES (0, 'something')") - sum_before = query_scalar(cur, 'SELECT SUM(key) FROM t') + sum_before = query_scalar(cur, "SELECT SUM(key) FROM t") cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - sum_after = query_scalar(cur, 'SELECT SUM(key) FROM t') + sum_after = query_scalar(cur, "SELECT SUM(key) FROM t") assert sum_after == sum_before + 5000050000 def show_statuses(safekeepers: List[Safekeeper], tenant_id: str, timeline_id: str): @@ -871,12 +922,12 @@ def 
test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 4 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_replace_safekeeper') + env.neon_cli.create_branch("test_replace_safekeeper") log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() active_safekeepers = [1, 2, 3] - pg = env.postgres.create('test_replace_safekeeper') + pg = env.postgres.create("test_replace_safekeeper") pg.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) pg.start() @@ -914,7 +965,7 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) log.info("Recreate postgres to replace failed sk1 with new sk4") - pg.stop_and_destroy().create('test_replace_safekeeper') + pg.stop_and_destroy().create("test_replace_safekeeper") active_safekeepers = [2, 3, 4] env.safekeepers[3].start() pg.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) @@ -934,16 +985,16 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): # of WAL segments. def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): # used to calculate delta in collect_stats - last_lsn = .0 + last_lsn = 0.0 # returns LSN and pg_wal size, all in MB def collect_stats(pg: Postgres, cur, enable_logs=True): nonlocal last_lsn assert pg.pgdata_dir is not None - log.info('executing INSERT to generate WAL') + log.info("executing INSERT to generate WAL") current_lsn = lsn_from_hex(query_scalar(cur, "select pg_current_wal_lsn()")) / 1024 / 1024 - pg_wal_size = get_dir_size(os.path.join(pg.pgdata_dir, 'pg_wal')) / 1024 / 1024 + pg_wal_size = get_dir_size(os.path.join(pg.pgdata_dir, "pg_wal")) / 1024 / 1024 if enable_logs: log.info(f"LSN delta: {current_lsn - last_lsn} MB, current WAL size: {pg_wal_size} MB") last_lsn = current_lsn @@ -956,15 +1007,16 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_wal_deleted_after_broadcast') + env.neon_cli.create_branch("test_wal_deleted_after_broadcast") # Adjust checkpoint config to prevent keeping old WAL segments pg = env.postgres.create_start( - 'test_wal_deleted_after_broadcast', - config_lines=['min_wal_size=32MB', 'max_wal_size=32MB', 'log_checkpoints=on']) + "test_wal_deleted_after_broadcast", + config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"], + ) pg_conn = pg.connect() cur = pg_conn.cursor() - cur.execute('CREATE TABLE t(key int, value text)') + cur.execute("CREATE TABLE t(key int, value text)") collect_stats(pg, cur) @@ -973,15 +1025,15 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): generate_wal(cur) collect_stats(pg, cur) - log.info('executing checkpoint') - cur.execute('CHECKPOINT') + log.info("executing checkpoint") + cur.execute("CHECKPOINT") wal_size_after_checkpoint = collect_stats(pg, cur)[1] # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) assert wal_size_after_checkpoint < 16 * 2.5 -@pytest.mark.parametrize('auth_enabled', [False, True]) +@pytest.mark.parametrize("auth_enabled", [False, True]) def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.num_safekeepers = 1 neon_env_builder.auth_enabled = auth_enabled @@ -989,25 +1041,25 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # Create two tenants: one will be deleted, other should be preserved. 
tenant_id = env.initial_tenant.hex - timeline_id_1 = env.neon_cli.create_branch('br1').hex # Active, delete explicitly - timeline_id_2 = env.neon_cli.create_branch('br2').hex # Inactive, delete explicitly - timeline_id_3 = env.neon_cli.create_branch('br3').hex # Active, delete with the tenant - timeline_id_4 = env.neon_cli.create_branch('br4').hex # Inactive, delete with the tenant + timeline_id_1 = env.neon_cli.create_branch("br1").hex # Active, delete explicitly + timeline_id_2 = env.neon_cli.create_branch("br2").hex # Inactive, delete explicitly + timeline_id_3 = env.neon_cli.create_branch("br3").hex # Active, delete with the tenant + timeline_id_4 = env.neon_cli.create_branch("br4").hex # Inactive, delete with the tenant tenant_id_other_uuid, timeline_id_other_uuid = env.neon_cli.create_tenant() tenant_id_other = tenant_id_other_uuid.hex timeline_id_other = timeline_id_other_uuid.hex # Populate branches - pg_1 = env.postgres.create_start('br1') - pg_2 = env.postgres.create_start('br2') - pg_3 = env.postgres.create_start('br3') - pg_4 = env.postgres.create_start('br4') - pg_other = env.postgres.create_start('main', tenant_id=uuid.UUID(hex=tenant_id_other)) + pg_1 = env.postgres.create_start("br1") + pg_2 = env.postgres.create_start("br2") + pg_3 = env.postgres.create_start("br3") + pg_4 = env.postgres.create_start("br4") + pg_other = env.postgres.create_start("main", tenant_id=uuid.UUID(hex=tenant_id_other)) for pg in [pg_1, pg_2, pg_3, pg_4, pg_other]: with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute('CREATE TABLE t(key int primary key)') + cur.execute("CREATE TABLE t(key int primary key)") sk = env.safekeepers[0] sk_data_dir = Path(sk.data_dir()) if not auth_enabled: @@ -1016,7 +1068,8 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): else: sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) sk_http_other = sk.http_client( - auth_token=env.auth_keys.generate_tenant_token(tenant_id_other)) + auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) + ) sk_http_noauth = sk.http_client() assert (sk_data_dir / tenant_id / timeline_id_1).is_dir() assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() @@ -1034,7 +1087,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): for pg in [pg_1, pg_3, pg_other]: with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute('INSERT INTO t (key) VALUES (1)') + cur.execute("INSERT INTO t (key) VALUES (1)") # Remove initial tenant's br1 (active) assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == { @@ -1049,7 +1102,8 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # Ensure repeated deletion succeeds assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == { - "dir_existed": False, "was_active": False + "dir_existed": False, + "was_active": False, } assert not (sk_data_dir / tenant_id / timeline_id_1).exists() assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() @@ -1060,9 +1114,9 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): if auth_enabled: # Ensure we cannot delete the other tenant for sk_h in [sk_http, sk_http_noauth]: - with pytest.raises(sk_h.HTTPError, match='Forbidden|Unauthorized'): + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): assert sk_h.timeline_delete_force(tenant_id_other, timeline_id_other) - with pytest.raises(sk_h.HTTPError, match='Forbidden|Unauthorized'): + with 
pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): assert sk_h.tenant_delete_force(tenant_id_other) assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() @@ -1078,7 +1132,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() # Remove non-existing branch, should succeed - assert sk_http.timeline_delete_force(tenant_id, '00' * 16) == { + assert sk_http.timeline_delete_force(tenant_id, "00" * 16) == { "dir_existed": False, "was_active": False, } @@ -1107,4 +1161,4 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): sk_http_other.timeline_status(tenant_id_other, timeline_id_other) with closing(pg_other.connect()) as conn: with conn.cursor() as cur: - cur.execute('INSERT INTO t (key) VALUES (123)') + cur.execute("INSERT INTO t (key) VALUES (123)") diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index e1d3ba0919..83285e0cbe 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -1,17 +1,16 @@ import asyncio -import uuid - -import asyncpg import random import time - -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper -from fixtures.log_helper import getLogger -from fixtures.utils import lsn_from_hex, lsn_to_hex -from typing import List, Optional +import uuid from dataclasses import dataclass +from typing import List, Optional -log = getLogger('root.safekeeper_async') +import asyncpg +from fixtures.log_helper import getLogger +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper +from fixtures.utils import lsn_from_hex, lsn_to_hex + +log = getLogger("root.safekeeper_async") class BankClient(object): @@ -21,21 +20,22 @@ class BankClient(object): self.init_amount = init_amount async def initdb(self): - await self.conn.execute('DROP TABLE IF EXISTS bank_accs') - await self.conn.execute('CREATE TABLE bank_accs(uid int primary key, amount int)') + await self.conn.execute("DROP TABLE IF EXISTS bank_accs") + await self.conn.execute("CREATE TABLE bank_accs(uid int primary key, amount int)") await self.conn.execute( - ''' + """ INSERT INTO bank_accs SELECT *, $1 FROM generate_series(0, $2) - ''', + """, self.init_amount, - self.n_accounts - 1) - await self.conn.execute('DROP TABLE IF EXISTS bank_log') - await self.conn.execute('CREATE TABLE bank_log(from_uid int, to_uid int, amount int)') + self.n_accounts - 1, + ) + await self.conn.execute("DROP TABLE IF EXISTS bank_log") + await self.conn.execute("CREATE TABLE bank_log(from_uid int, to_uid int, amount int)") async def check_invariant(self): - row = await self.conn.fetchrow('SELECT sum(amount) AS sum FROM bank_accs') - assert row['sum'] == self.n_accounts * self.init_amount + row = await self.conn.fetchrow("SELECT sum(amount) AS sum FROM bank_accs") + assert row["sum"] == self.n_accounts * self.init_amount async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount): @@ -45,17 +45,17 @@ async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount): async with conn.transaction(): await conn.execute( - 'UPDATE bank_accs SET amount = amount + ($1) WHERE uid = $2', + "UPDATE bank_accs SET amount = amount + ($1) WHERE uid = $2", amount, to_uid, ) await conn.execute( - 'UPDATE bank_accs SET amount = amount - ($1) WHERE uid = $2', + "UPDATE bank_accs SET amount = amount - ($1) WHERE uid = 
$2", amount, from_uid, ) await conn.execute( - 'INSERT INTO bank_log VALUES ($1, $2, $3)', + "INSERT INTO bank_log VALUES ($1, $2, $3)", from_uid, to_uid, amount, @@ -80,12 +80,12 @@ class WorkerStats(object): assert all(cnt > 0 for cnt in self.counters) progress = sum(self.counters) - log.info('All workers made {} transactions'.format(progress)) + log.info("All workers made {} transactions".format(progress)) async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accounts, max_transfer): pg_conn = await pg.connect_async() - log.debug('Started worker {}'.format(worker_id)) + log.debug("Started worker {}".format(worker_id)) while stats.running: from_uid = random.randint(0, n_accounts - 1) @@ -95,19 +95,21 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou await bank_transfer(pg_conn, from_uid, to_uid, amount) stats.inc_progress(worker_id) - log.debug('Executed transfer({}) {} => {}'.format(amount, from_uid, to_uid)) + log.debug("Executed transfer({}) {} => {}".format(amount, from_uid, to_uid)) - log.debug('Finished worker {}'.format(worker_id)) + log.debug("Finished worker {}".format(worker_id)) await pg_conn.close() -async def wait_for_lsn(safekeeper: Safekeeper, - tenant_id: str, - timeline_id: str, - wait_lsn: str, - polling_interval=1, - timeout=60): +async def wait_for_lsn( + safekeeper: Safekeeper, + tenant_id: str, + timeline_id: str, + wait_lsn: str, + polling_interval=1, + timeout=60, +): """ Poll flush_lsn from safekeeper until it's greater or equal than provided wait_lsn. To do that, timeline_status is fetched from @@ -119,7 +121,7 @@ async def wait_for_lsn(safekeeper: Safekeeper, flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn log.info( - f'Safekeeper at port {safekeeper.port.pg} has flush_lsn {flush_lsn}, waiting for lsn {wait_lsn}' + f"Safekeeper at port {safekeeper.port.pg} has flush_lsn {flush_lsn}, waiting for lsn {wait_lsn}" ) while lsn_from_hex(wait_lsn) > lsn_from_hex(flush_lsn): @@ -131,22 +133,24 @@ async def wait_for_lsn(safekeeper: Safekeeper, await asyncio.sleep(polling_interval) flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn - log.debug(f'safekeeper port={safekeeper.port.pg} flush_lsn={flush_lsn} wait_lsn={wait_lsn}') + log.debug(f"safekeeper port={safekeeper.port.pg} flush_lsn={flush_lsn} wait_lsn={wait_lsn}") # This test will run several iterations and check progress in each of them. # On each iteration 1 acceptor is stopped, and 2 others should allow # background workers execute transactions. In the end, state should remain # consistent. -async def run_restarts_under_load(env: NeonEnv, - pg: Postgres, - acceptors: List[Safekeeper], - n_workers=10, - n_accounts=100, - init_amount=100000, - max_transfer=100, - period_time=4, - iterations=10): +async def run_restarts_under_load( + env: NeonEnv, + pg: Postgres, + acceptors: List[Safekeeper], + n_workers=10, + n_accounts=100, + init_amount=100000, + max_transfer=100, + period_time=4, + iterations=10, +): # Set timeout for this test at 5 minutes. It should be enough for test to complete, # taking into account that this timeout is checked only at the beginning of every iteration. 
test_timeout_at = time.monotonic() + 5 * 60 @@ -166,20 +170,21 @@ async def run_restarts_under_load(env: NeonEnv, workers.append(asyncio.create_task(worker)) for it in range(iterations): - assert time.monotonic() < test_timeout_at, 'test timed out' + assert time.monotonic() < test_timeout_at, "test timed out" victim_idx = it % len(acceptors) victim = acceptors[victim_idx] victim.stop() - flush_lsn = await pg_conn.fetchval('SELECT pg_current_wal_flush_lsn()') + flush_lsn = await pg_conn.fetchval("SELECT pg_current_wal_flush_lsn()") flush_lsn = lsn_to_hex(flush_lsn) - log.info(f'Postgres flush_lsn {flush_lsn}') + log.info(f"Postgres flush_lsn {flush_lsn}") pageserver_lsn = env.pageserver.http_client().timeline_detail( - uuid.UUID(tenant_id), uuid.UUID((timeline_id)))["local"]["last_record_lsn"] + uuid.UUID(tenant_id), uuid.UUID((timeline_id)) + )["local"]["last_record_lsn"] sk_ps_lag = lsn_from_hex(flush_lsn) - lsn_from_hex(pageserver_lsn) - log.info(f'Pageserver last_record_lsn={pageserver_lsn} lag={sk_ps_lag / 1024}kb') + log.info(f"Pageserver last_record_lsn={pageserver_lsn} lag={sk_ps_lag / 1024}kb") # Wait until alive safekeepers catch up with postgres for idx, safekeeper in enumerate(acceptors): @@ -193,7 +198,7 @@ async def run_restarts_under_load(env: NeonEnv, victim.start() - log.info('Iterations are finished, exiting coroutines...') + log.info("Iterations are finished, exiting coroutines...") stats.running = False # await all workers await asyncio.gather(*workers) @@ -207,10 +212,11 @@ def test_restarts_under_load(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_safekeepers_restarts_under_load') + env.neon_cli.create_branch("test_safekeepers_restarts_under_load") # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long - pg = env.postgres.create_start('test_safekeepers_restarts_under_load', - config_lines=['max_replication_write_lag=1MB']) + pg = env.postgres.create_start( + "test_safekeepers_restarts_under_load", config_lines=["max_replication_write_lag=1MB"] + ) asyncio.run(run_restarts_under_load(env, pg, env.safekeepers)) @@ -222,15 +228,17 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_restarts_frequent_checkpoints') + env.neon_cli.create_branch("test_restarts_frequent_checkpoints") # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long - pg = env.postgres.create_start('test_restarts_frequent_checkpoints', - config_lines=[ - 'max_replication_write_lag=1MB', - 'min_wal_size=32MB', - 'max_wal_size=32MB', - 'log_checkpoints=on' - ]) + pg = env.postgres.create_start( + "test_restarts_frequent_checkpoints", + config_lines=[ + "max_replication_write_lag=1MB", + "min_wal_size=32MB", + "max_wal_size=32MB", + "log_checkpoints=on", + ], + ) # we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments # are not removed before broadcasted to all safekeepers, with the help of replication slot @@ -244,51 +252,51 @@ def postgres_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): port=env.port_distributor.get_port(), # In these tests compute has high probability of terminating on its own # before our stop() due to lost consensus leadership. 
- check_stop_result=False) + check_stop_result=False, + ) # embed current time in node name - node_name = pgdir_name or f'pg_node_{time.time()}' - return pg.create_start(branch_name=branch, - node_name=node_name, - config_lines=['log_statement=all']) + node_name = pgdir_name or f"pg_node_{time.time()}" + return pg.create_start( + branch_name=branch, node_name=node_name, config_lines=["log_statement=all"] + ) -async def exec_compute_query(env: NeonEnv, - branch: str, - query: str, - pgdir_name: Optional[str] = None): +async def exec_compute_query( + env: NeonEnv, branch: str, query: str, pgdir_name: Optional[str] = None +): with postgres_create_start(env, branch=branch, pgdir_name=pgdir_name) as pg: before_conn = time.time() conn = await pg.connect_async() res = await conn.fetch(query) await conn.close() after_conn = time.time() - log.info(f'{query} took {after_conn - before_conn}s') + log.info(f"{query} took {after_conn - before_conn}s") return res -async def run_compute_restarts(env: NeonEnv, - queries=16, - batch_insert=10000, - branch='test_compute_restarts'): +async def run_compute_restarts( + env: NeonEnv, queries=16, batch_insert=10000, branch="test_compute_restarts" +): cnt = 0 sum = 0 - await exec_compute_query(env, branch, 'CREATE TABLE t (i int)') + await exec_compute_query(env, branch, "CREATE TABLE t (i int)") for i in range(queries): if i % 4 == 0: await exec_compute_query( - env, branch, f'INSERT INTO t SELECT 1 FROM generate_series(1, {batch_insert})') + env, branch, f"INSERT INTO t SELECT 1 FROM generate_series(1, {batch_insert})" + ) sum += batch_insert cnt += batch_insert elif (i % 4 == 1) or (i % 4 == 3): # Note that select causes lots of FPI's and increases probability of safekeepers # standing at different LSNs after compute termination. 
- actual_sum = (await exec_compute_query(env, branch, 'SELECT SUM(i) FROM t'))[0][0] - assert actual_sum == sum, f'Expected sum={sum}, actual={actual_sum}' + actual_sum = (await exec_compute_query(env, branch, "SELECT SUM(i) FROM t"))[0][0] + assert actual_sum == sum, f"Expected sum={sum}, actual={actual_sum}" elif i % 4 == 2: - await exec_compute_query(env, branch, 'UPDATE t SET i = i + 1') + await exec_compute_query(env, branch, "UPDATE t SET i = i + 1") sum += cnt @@ -297,7 +305,7 @@ def test_compute_restarts(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_compute_restarts') + env.neon_cli.create_branch("test_compute_restarts") asyncio.run(run_compute_restarts(env)) @@ -315,7 +323,7 @@ class BackgroundCompute(object): async def run(self): if self.running: - raise Exception('BackgroundCompute is already running') + raise Exception("BackgroundCompute is already running") self.running = True i = 0 @@ -327,17 +335,17 @@ class BackgroundCompute(object): res = await exec_compute_query( self.env, self.branch, - f'INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key', - pgdir_name=f'bgcompute{self.index}_key{verify_key}', + f"INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key", + pgdir_name=f"bgcompute{self.index}_key{verify_key}", ) - log.info(f'result: {res}') + log.info(f"result: {res}") if len(res) != 1: - raise Exception('No result returned') + raise Exception("No result returned") if res[0][0] != verify_key: - raise Exception('Wrong result returned') + raise Exception("Wrong result returned") self.successful_queries.append(verify_key) except Exception as e: - log.info(f'BackgroundCompute {self.index} query failed: {e}') + log.info(f"BackgroundCompute {self.index} query failed: {e}") # With less sleep, there is a very big chance of not committing # anything or only 1 xact during test run. 
@@ -345,14 +353,12 @@ class BackgroundCompute(object): self.running = False -async def run_concurrent_computes(env: NeonEnv, - num_computes=10, - run_seconds=20, - branch='test_concurrent_computes'): +async def run_concurrent_computes( + env: NeonEnv, num_computes=10, run_seconds=20, branch="test_concurrent_computes" +): await exec_compute_query( - env, - branch, - 'CREATE TABLE query_log (t timestamp default now(), index int, verify_key int)') + env, branch, "CREATE TABLE query_log (t timestamp default now(), index int, verify_key int)" + ) computes = [BackgroundCompute(i, env, branch) for i in range(num_computes)] background_tasks = [asyncio.create_task(compute.run()) for compute in computes] @@ -367,13 +373,17 @@ async def run_concurrent_computes(env: NeonEnv, # work for some time with only one compute -- it should be able to make some xacts TIMEOUT_SECONDS = computes[0].MAX_QUERY_GAP_SECONDS + 3 initial_queries_by_0 = len(computes[0].successful_queries) - log.info(f'Waiting for another query by computes[0], ' - f'it already had {initial_queries_by_0}, timeout is {TIMEOUT_SECONDS}s') + log.info( + f"Waiting for another query by computes[0], " + f"it already had {initial_queries_by_0}, timeout is {TIMEOUT_SECONDS}s" + ) for _ in range(10 * TIMEOUT_SECONDS): current_queries_by_0 = len(computes[0].successful_queries) - initial_queries_by_0 if current_queries_by_0 >= 1: - log.info(f'Found {current_queries_by_0} successful queries ' - f'by computes[0], completing the test') + log.info( + f"Found {current_queries_by_0} successful queries " + f"by computes[0], completing the test" + ) break await asyncio.sleep(0.1) else: @@ -382,12 +392,14 @@ async def run_concurrent_computes(env: NeonEnv, await asyncio.gather(background_tasks[0]) - result = await exec_compute_query(env, branch, 'SELECT * FROM query_log') + result = await exec_compute_query(env, branch, "SELECT * FROM query_log") # we should have inserted something while single compute was running - log.info(f'Executed {len(result)} queries, {current_queries_by_0} of them ' - f'by computes[0] after we started stopping the others') + log.info( + f"Executed {len(result)} queries, {current_queries_by_0} of them " + f"by computes[0] after we started stopping the others" + ) for row in result: - log.info(f'{row[0]} {row[1]} {row[2]}') + log.info(f"{row[0]} {row[1]} {row[2]}") # ensure everything reported as committed wasn't lost for compute in computes: @@ -402,16 +414,15 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_concurrent_computes') + env.neon_cli.create_branch("test_concurrent_computes") asyncio.run(run_concurrent_computes(env)) # Stop safekeeper and check that query cannot be executed while safekeeper is down. # Query will insert a single row into a table. 
-async def check_unavailability(sk: Safekeeper, - conn: asyncpg.Connection, - key: int, - start_delay_sec: int = 2): +async def check_unavailability( + sk: Safekeeper, conn: asyncpg.Connection, key: int, start_delay_sec: int = 2 +): # shutdown one of two acceptors, that is, majority sk.stop() @@ -431,7 +442,7 @@ async def run_unavailability(env: NeonEnv, pg: Postgres): conn = await pg.connect_async() # check basic work with table - await conn.execute('CREATE TABLE t(key int primary key, value text)') + await conn.execute("CREATE TABLE t(key int primary key, value text)") await conn.execute("INSERT INTO t values (1, 'payload')") # stop safekeeper and check that query cannot be executed while safekeeper is down @@ -443,7 +454,7 @@ async def run_unavailability(env: NeonEnv, pg: Postgres): # check that we can execute queries after restart await conn.execute("INSERT INTO t values (4, 'payload')") - result_sum = await conn.fetchval('SELECT sum(key) FROM t') + result_sum = await conn.fetchval("SELECT sum(key) FROM t") assert result_sum == 10 @@ -452,8 +463,8 @@ def test_unavailability(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 2 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_safekeepers_unavailability') - pg = env.postgres.create_start('test_safekeepers_unavailability') + env.neon_cli.create_branch("test_safekeepers_unavailability") + pg = env.postgres.create_start("test_safekeepers_unavailability") asyncio.run(run_unavailability(env, pg)) @@ -473,20 +484,20 @@ async def xmas_garland(safekeepers: List[Safekeeper], data: RaceConditionTest): if random.random() >= 0.5: victims.append(sk) log.info( - f'Iteration {data.iteration}: stopping {list(map(lambda sk: sk.id, victims))} safekeepers' + f"Iteration {data.iteration}: stopping {list(map(lambda sk: sk.id, victims))} safekeepers" ) for v in victims: v.stop() await asyncio.sleep(1) for v in victims: v.start() - log.info(f'Iteration {data.iteration} finished') + log.info(f"Iteration {data.iteration} finished") await asyncio.sleep(1) async def run_race_conditions(env: NeonEnv, pg: Postgres): conn = await pg.connect_async() - await conn.execute('CREATE TABLE t(key int primary key, value text)') + await conn.execute("CREATE TABLE t(key int primary key, value text)") data = RaceConditionTest(0, False) bg_xmas = asyncio.create_task(xmas_garland(env.safekeepers, data)) @@ -501,9 +512,9 @@ async def run_race_conditions(env: NeonEnv, pg: Postgres): expected_sum += i i += 1 - log.info(f'Executed {i-1} queries') + log.info(f"Executed {i-1} queries") - res = await conn.fetchval('SELECT sum(key) FROM t') + res = await conn.fetchval("SELECT sum(key) FROM t") assert res == expected_sum data.is_stopped = True @@ -516,8 +527,8 @@ def test_race_conditions(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_safekeepers_race_conditions') - pg = env.postgres.create_start('test_safekeepers_race_conditions') + env.neon_cli.create_branch("test_safekeepers_race_conditions") + pg = env.postgres.create_start("test_safekeepers_race_conditions") asyncio.run(run_race_conditions(env, pg)) @@ -527,13 +538,15 @@ def test_race_conditions(neon_env_builder: NeonEnvBuilder): async def run_wal_lagging(env: NeonEnv, pg: Postgres): def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str: # use ports 10, 11 and 12 to simulate unavailable safekeepers - return ','.join([ - f'localhost:{sk.port.pg if active else 10 + i}' - for i, (sk, active) in 
enumerate(zip(env.safekeepers, active_sk)) - ]) + return ",".join( + [ + f"localhost:{sk.port.pg if active else 10 + i}" + for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk)) + ] + ) conn = await pg.connect_async() - await conn.execute('CREATE TABLE t(key int primary key, value text)') + await conn.execute("CREATE TABLE t(key int primary key, value text)") await conn.close() pg.stop() @@ -552,7 +565,7 @@ async def run_wal_lagging(env: NeonEnv, pg: Postgres): continue pg.adjust_for_safekeepers(safekeepers_guc(env, active_sk)) - log.info(f'Iteration {it}: {active_sk}') + log.info(f"Iteration {it}: {active_sk}") pg.start() conn = await pg.connect_async() @@ -569,9 +582,9 @@ async def run_wal_lagging(env: NeonEnv, pg: Postgres): pg.start() conn = await pg.connect_async() - log.info(f'Executed {i-1} queries') + log.info(f"Executed {i-1} queries") - res = await conn.fetchval('SELECT sum(key) FROM t') + res = await conn.fetchval("SELECT sum(key) FROM t") assert res == expected_sum @@ -581,7 +594,7 @@ def test_wal_lagging(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_wal_lagging') - pg = env.postgres.create_start('test_wal_lagging') + env.neon_cli.create_branch("test_wal_lagging") + pg = env.postgres.create_start("test_wal_lagging") asyncio.run(run_wal_lagging(env, pg)) diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index 809e942415..0847b5a505 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -1,33 +1,39 @@ import os from pathlib import Path -from fixtures.neon_fixtures import (NeonEnvBuilder, - VanillaPostgres, - PortDistributor, - PgBin, - base_dir, - pg_distrib_dir) +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + PortDistributor, + VanillaPostgres, + base_dir, + pg_distrib_dir, +) -def test_wal_restore(neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, - test_output_dir: Path, - port_distributor: PortDistributor): +def test_wal_restore( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + test_output_dir: Path, + port_distributor: PortDistributor, +): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_wal_restore") - pg = env.postgres.create_start('test_wal_restore') + pg = env.postgres.create_start("test_wal_restore") pg.safe_psql("create table t as select generate_series(1,300000)") tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] env.neon_cli.pageserver_stop() port = port_distributor.get_port() - data_dir = test_output_dir / 'pgsql.restored' + data_dir = test_output_dir / "pgsql.restored" with VanillaPostgres(data_dir, PgBin(test_output_dir), port) as restored: - pg_bin.run_capture([ - os.path.join(base_dir, 'libs/utils/scripts/restore_from_wal.sh'), - os.path.join(pg_distrib_dir, 'bin'), - str(test_output_dir / 'repo' / 'safekeepers' / 'sk1' / str(tenant_id) / '*'), - str(data_dir), - str(port) - ]) + pg_bin.run_capture( + [ + os.path.join(base_dir, "libs/utils/scripts/restore_from_wal.sh"), + os.path.join(pg_distrib_dir, "bin"), + str(test_output_dir / "repo" / "safekeepers" / "sk1" / str(tenant_id) / "*"), + str(data_dir), + str(port), + ] + ) restored.start() - assert restored.safe_psql('select count(*) from t', user='cloud_admin') == [(300000, )] + assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)] diff --git a/test_runner/batch_pg_regress/test_isolation.py 
b/test_runner/batch_pg_regress/test_isolation.py index 0124459440..7127a069b0 100644 --- a/test_runner/batch_pg_regress/test_isolation.py +++ b/test_runner/batch_pg_regress/test_isolation.py @@ -1,5 +1,6 @@ import os from pathlib import Path + import pytest from fixtures.neon_fixtures import NeonEnv, base_dir, pg_distrib_dir @@ -13,33 +14,33 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, caps env.neon_cli.create_branch("test_isolation", "empty") # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them - pg = env.postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100']) - pg.safe_psql('CREATE DATABASE isolation_regression') + pg = env.postgres.create_start("test_isolation", config_lines=["max_prepared_transactions=100"]) + pg.safe_psql("CREATE DATABASE isolation_regression") # Create some local directories for pg_isolation_regress to run in. - runpath = test_output_dir / 'regress' - (runpath / 'testtablespace').mkdir(parents=True) + runpath = test_output_dir / "regress" + (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. - build_path = os.path.join(pg_distrib_dir, 'build/src/test/isolation') - src_path = os.path.join(base_dir, 'vendor/postgres/src/test/isolation') - bindir = os.path.join(pg_distrib_dir, 'bin') - schedule = os.path.join(src_path, 'isolation_schedule') - pg_isolation_regress = os.path.join(build_path, 'pg_isolation_regress') + build_path = os.path.join(pg_distrib_dir, "build/src/test/isolation") + src_path = os.path.join(base_dir, "vendor/postgres/src/test/isolation") + bindir = os.path.join(pg_distrib_dir, "bin") + schedule = os.path.join(src_path, "isolation_schedule") + pg_isolation_regress = os.path.join(build_path, "pg_isolation_regress") pg_isolation_regress_command = [ pg_isolation_regress, - '--use-existing', - '--bindir={}'.format(bindir), - '--dlpath={}'.format(build_path), - '--inputdir={}'.format(src_path), - '--schedule={}'.format(schedule), + "--use-existing", + "--bindir={}".format(bindir), + "--dlpath={}".format(build_path), + "--inputdir={}".format(src_path), + "--schedule={}".format(schedule), ] env_vars = { - 'PGPORT': str(pg.default_options['port']), - 'PGUSER': pg.default_options['user'], - 'PGHOST': pg.default_options['host'], + "PGPORT": str(pg.default_options["port"]), + "PGUSER": pg.default_options["user"], + "PGHOST": pg.default_options["host"], } # Run the command. diff --git a/test_runner/batch_pg_regress/test_neon_regress.py b/test_runner/batch_pg_regress/test_neon_regress.py index 66ea67d9f1..5f13e6b2de 100644 --- a/test_runner/batch_pg_regress/test_neon_regress.py +++ b/test_runner/batch_pg_regress/test_neon_regress.py @@ -1,11 +1,8 @@ import os from pathlib import Path -from fixtures.neon_fixtures import (NeonEnv, - check_restored_datadir_content, - base_dir, - pg_distrib_dir) from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, base_dir, check_restored_datadir_content, pg_distrib_dir def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): @@ -13,35 +10,35 @@ def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, c env.neon_cli.create_branch("test_neon_regress", "empty") # Connect to postgres and create a database called "regression". 
- pg = env.postgres.create_start('test_neon_regress') - pg.safe_psql('CREATE DATABASE regression') + pg = env.postgres.create_start("test_neon_regress") + pg.safe_psql("CREATE DATABASE regression") # Create some local directories for pg_regress to run in. - runpath = test_output_dir / 'regress' - (runpath / 'testtablespace').mkdir(parents=True) + runpath = test_output_dir / "regress" + (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. # This test runs neon specific tests - build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress') - src_path = os.path.join(base_dir, 'test_runner/neon_regress') - bindir = os.path.join(pg_distrib_dir, 'bin') - schedule = os.path.join(src_path, 'parallel_schedule') - pg_regress = os.path.join(build_path, 'pg_regress') + build_path = os.path.join(pg_distrib_dir, "build/src/test/regress") + src_path = os.path.join(base_dir, "test_runner/neon_regress") + bindir = os.path.join(pg_distrib_dir, "bin") + schedule = os.path.join(src_path, "parallel_schedule") + pg_regress = os.path.join(build_path, "pg_regress") pg_regress_command = [ pg_regress, - '--use-existing', - '--bindir={}'.format(bindir), - '--dlpath={}'.format(build_path), - '--schedule={}'.format(schedule), - '--inputdir={}'.format(src_path), + "--use-existing", + "--bindir={}".format(bindir), + "--dlpath={}".format(build_path), + "--schedule={}".format(schedule), + "--inputdir={}".format(src_path), ] log.info(pg_regress_command) env_vars = { - 'PGPORT': str(pg.default_options['port']), - 'PGUSER': pg.default_options['user'], - 'PGHOST': pg.default_options['host'], + "PGPORT": str(pg.default_options["port"]), + "PGUSER": pg.default_options["user"], + "PGHOST": pg.default_options["host"], } # Run the command. @@ -51,8 +48,8 @@ def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, c pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) # checkpoint one more time to ensure that the lsn we get is the latest one - pg.safe_psql('CHECKPOINT') - lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0] + pg.safe_psql("CHECKPOINT") + lsn = pg.safe_psql("select pg_current_wal_insert_lsn()")[0][0] # Check that we restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/batch_pg_regress/test_pg_regress.py b/test_runner/batch_pg_regress/test_pg_regress.py index 28066d7a32..478dbf0a91 100644 --- a/test_runner/batch_pg_regress/test_pg_regress.py +++ b/test_runner/batch_pg_regress/test_pg_regress.py @@ -1,7 +1,8 @@ import os import pathlib + import pytest -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content, base_dir, pg_distrib_dir +from fixtures.neon_fixtures import NeonEnv, base_dir, check_restored_datadir_content, pg_distrib_dir # The pg_regress tests run for a long time, especially in debug mode, @@ -12,34 +13,34 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: pathlib.Path, pg_ env.neon_cli.create_branch("test_pg_regress", "empty") # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start('test_pg_regress') - pg.safe_psql('CREATE DATABASE regression') + pg = env.postgres.create_start("test_pg_regress") + pg.safe_psql("CREATE DATABASE regression") # Create some local directories for pg_regress to run in. 
- runpath = test_output_dir / 'regress' - (runpath / 'testtablespace').mkdir(parents=True) + runpath = test_output_dir / "regress" + (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. - build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress') - src_path = os.path.join(base_dir, 'vendor/postgres/src/test/regress') - bindir = os.path.join(pg_distrib_dir, 'bin') - schedule = os.path.join(src_path, 'parallel_schedule') - pg_regress = os.path.join(build_path, 'pg_regress') + build_path = os.path.join(pg_distrib_dir, "build/src/test/regress") + src_path = os.path.join(base_dir, "vendor/postgres/src/test/regress") + bindir = os.path.join(pg_distrib_dir, "bin") + schedule = os.path.join(src_path, "parallel_schedule") + pg_regress = os.path.join(build_path, "pg_regress") pg_regress_command = [ pg_regress, '--bindir=""', - '--use-existing', - '--bindir={}'.format(bindir), - '--dlpath={}'.format(build_path), - '--schedule={}'.format(schedule), - '--inputdir={}'.format(src_path), + "--use-existing", + "--bindir={}".format(bindir), + "--dlpath={}".format(build_path), + "--schedule={}".format(schedule), + "--inputdir={}".format(src_path), ] env_vars = { - 'PGPORT': str(pg.default_options['port']), - 'PGUSER': pg.default_options['user'], - 'PGHOST': pg.default_options['host'], + "PGPORT": str(pg.default_options["port"]), + "PGUSER": pg.default_options["user"], + "PGHOST": pg.default_options["host"], } # Run the command. @@ -49,7 +50,7 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: pathlib.Path, pg_ pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) # checkpoint one more time to ensure that the lsn we get is the latest one - pg.safe_psql('CHECKPOINT') + pg.safe_psql("CHECKPOINT") # Check that we restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 51545d0217..8b7f6a2eea 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,5 +1,7 @@ -pytest_plugins = ("fixtures.neon_fixtures", - "fixtures.benchmark_fixture", - "fixtures.pg_stats", - "fixtures.compare_fixtures", - "fixtures.slow") +pytest_plugins = ( + "fixtures.neon_fixtures", + "fixtures.benchmark_fixture", + "fixtures.pg_stats", + "fixtures.compare_fixtures", + "fixtures.slow", +) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index cca4f7ce17..cec46f9f6d 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -10,12 +10,14 @@ import warnings from contextlib import contextmanager from datetime import datetime from pathlib import Path + # Type-related stuff from typing import Iterator, Optional import pytest from _pytest.config import Config from _pytest.terminal import TerminalReporter + """ This file contains fixtures for micro-benchmarks. 
@@ -112,8 +114,10 @@ class PgBenchRunResult: # pgbench v14: # initial connection time = 3.858 ms # tps = 309.281539 (without initial connection time) - if (line.startswith("tps = ") and ("(excluding connections establishing)" in line - or "(without initial connection time)")): + if line.startswith("tps = ") and ( + "(excluding connections establishing)" in line + or "(without initial connection time)" + ): tps = float(line.split()[2]) return cls( @@ -154,17 +158,21 @@ class PgBenchInitResult: last_line = stderr.splitlines()[-1] - regex = re.compile(r"done in (\d+\.\d+) s " - r"\(" - r"(?:drop tables (\d+\.\d+) s)?(?:, )?" - r"(?:create tables (\d+\.\d+) s)?(?:, )?" - r"(?:client-side generate (\d+\.\d+) s)?(?:, )?" - r"(?:vacuum (\d+\.\d+) s)?(?:, )?" - r"(?:primary keys (\d+\.\d+) s)?(?:, )?" - r"\)\.") + regex = re.compile( + r"done in (\d+\.\d+) s " + r"\(" + r"(?:drop tables (\d+\.\d+) s)?(?:, )?" + r"(?:create tables (\d+\.\d+) s)?(?:, )?" + r"(?:client-side generate (\d+\.\d+) s)?(?:, )?" + r"(?:vacuum (\d+\.\d+) s)?(?:, )?" + r"(?:primary keys (\d+\.\d+) s)?(?:, )?" + r"\)\." + ) if (m := regex.match(last_line)) is not None: - total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [float(v) for v in m.groups() if v is not None] + total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [ + float(v) for v in m.groups() if v is not None + ] else: raise RuntimeError(f"can't parse pgbench initialize results from `{last_line}`") @@ -185,11 +193,11 @@ class PgBenchInitResult: class MetricReport(str, enum.Enum): # str is a hack to make it json serializable # this means that this is a constant test parameter # like number of transactions, or number of clients - TEST_PARAM = 'test_param' + TEST_PARAM = "test_param" # reporter can use it to mark test runs with higher values as improvements - HIGHER_IS_BETTER = 'higher_is_better' + HIGHER_IS_BETTER = "higher_is_better" # the same but for lower values - LOWER_IS_BETTER = 'lower_is_better' + LOWER_IS_BETTER = "lower_is_better" class NeonBenchmarker: @@ -197,6 +205,7 @@ class NeonBenchmarker: An object for recording benchmark results. 
This is created for each test function by the zenbenchmark fixture """ + def __init__(self, property_recorder): # property recorder here is a pytest fixture provided by junitxml module # https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property @@ -244,43 +253,57 @@ class NeonBenchmarker: ) def record_pg_bench_result(self, prefix: str, pg_bench_result: PgBenchRunResult): - self.record(f"{prefix}.number_of_clients", - pg_bench_result.number_of_clients, - '', - MetricReport.TEST_PARAM) - self.record(f"{prefix}.number_of_threads", - pg_bench_result.number_of_threads, - '', - MetricReport.TEST_PARAM) + self.record( + f"{prefix}.number_of_clients", + pg_bench_result.number_of_clients, + "", + MetricReport.TEST_PARAM, + ) + self.record( + f"{prefix}.number_of_threads", + pg_bench_result.number_of_threads, + "", + MetricReport.TEST_PARAM, + ) self.record( f"{prefix}.number_of_transactions_actually_processed", pg_bench_result.number_of_transactions_actually_processed, - '', + "", # that's because this is predefined by test matrix and doesn't change across runs report=MetricReport.TEST_PARAM, ) - self.record(f"{prefix}.latency_average", - pg_bench_result.latency_average, - unit="ms", - report=MetricReport.LOWER_IS_BETTER) + self.record( + f"{prefix}.latency_average", + pg_bench_result.latency_average, + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) if pg_bench_result.latency_stddev is not None: - self.record(f"{prefix}.latency_stddev", - pg_bench_result.latency_stddev, - unit="ms", - report=MetricReport.LOWER_IS_BETTER) - self.record(f"{prefix}.tps", pg_bench_result.tps, '', report=MetricReport.HIGHER_IS_BETTER) - self.record(f"{prefix}.run_duration", - pg_bench_result.run_duration, - unit="s", - report=MetricReport.LOWER_IS_BETTER) - self.record(f"{prefix}.run_start_timestamp", - pg_bench_result.run_start_timestamp, - '', - MetricReport.TEST_PARAM) - self.record(f"{prefix}.run_end_timestamp", - pg_bench_result.run_end_timestamp, - '', - MetricReport.TEST_PARAM) + self.record( + f"{prefix}.latency_stddev", + pg_bench_result.latency_stddev, + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + self.record(f"{prefix}.tps", pg_bench_result.tps, "", report=MetricReport.HIGHER_IS_BETTER) + self.record( + f"{prefix}.run_duration", + pg_bench_result.run_duration, + unit="s", + report=MetricReport.LOWER_IS_BETTER, + ) + self.record( + f"{prefix}.run_start_timestamp", + pg_bench_result.run_start_timestamp, + "", + MetricReport.TEST_PARAM, + ) + self.record( + f"{prefix}.run_end_timestamp", + pg_bench_result.run_end_timestamp, + "", + MetricReport.TEST_PARAM, + ) def record_pg_bench_init_result(self, prefix: str, result: PgBenchInitResult): test_params = [ @@ -288,10 +311,9 @@ class NeonBenchmarker: "end_timestamp", ] for test_param in test_params: - self.record(f"{prefix}.{test_param}", - getattr(result, test_param), - '', - MetricReport.TEST_PARAM) + self.record( + f"{prefix}.{test_param}", getattr(result, test_param), "", MetricReport.TEST_PARAM + ) metrics = [ "duration", @@ -303,10 +325,9 @@ class NeonBenchmarker: ] for metric in metrics: if (value := getattr(result, metric)) is not None: - self.record(f"{prefix}.{metric}", - value, - unit="s", - report=MetricReport.LOWER_IS_BETTER) + self.record( + f"{prefix}.{metric}", value, unit="s", report=MetricReport.LOWER_IS_BETTER + ) def get_io_writes(self, pageserver) -> int: """ @@ -319,7 +340,7 @@ class NeonBenchmarker: """ Fetch the "maxrss" metric from the pageserver """ - metric_name = r'libmetrics_maxrss_kb' + 
metric_name = r"libmetrics_maxrss_kb" return self.get_int_counter_value(pageserver, metric_name) def get_int_counter_value(self, pageserver, metric_name) -> int: @@ -332,7 +353,7 @@ class NeonBenchmarker: # all prometheus metrics are floats. So to be pedantic, read it as a float # and round to integer. all_metrics = pageserver.http_client().get_metrics() - matches = re.search(fr'^{metric_name} (\S+)$', all_metrics, re.MULTILINE) + matches = re.search(rf"^{metric_name} (\S+)$", all_metrics, re.MULTILINE) assert matches return int(round(float(matches.group(1)))) @@ -358,10 +379,12 @@ class NeonBenchmarker: yield after = self.get_io_writes(pageserver) - self.record(metric_name, - round((after - before) / (1024 * 1024)), - "MB", - report=MetricReport.LOWER_IS_BETTER) + self.record( + metric_name, + round((after - before) / (1024 * 1024)), + "MB", + report=MetricReport.LOWER_IS_BETTER, + ) @pytest.fixture(scope="function") @@ -410,8 +433,9 @@ def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, result_entry = [] for _, recorded_property in test_report.user_properties: - terminalreporter.write("{}.{}: ".format(test_report.head_line, - recorded_property["name"])) + terminalreporter.write( + "{}.{}: ".format(test_report.head_line, recorded_property["name"]) + ) unit = recorded_property["unit"] value = recorded_property["value"] if unit == "MB": @@ -426,11 +450,13 @@ def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, result_entry.append(recorded_property) - result.append({ - "suit": test_report.nodeid, - "total_duration": test_report.duration, - "data": result_entry, - }) + result.append( + { + "suit": test_report.nodeid, + "total_duration": test_report.duration, + "data": result_entry, + } + ) out_dir = config.getoption("out_dir") if out_dir is None: @@ -442,6 +468,5 @@ def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, return get_out_path(Path(out_dir), revision=revision).write_text( - json.dumps({ - "revision": revision, "platform": platform, "result": result - }, indent=4)) + json.dumps({"revision": revision, "platform": platform, "result": result}, indent=4) + ) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index e6c3a79697..6bca5be335 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -1,14 +1,14 @@ -import pytest -from contextlib import contextmanager from abc import ABC, abstractmethod -from fixtures.pg_stats import PgStatTable - -from fixtures.neon_fixtures import PgBin, PgProtocol, VanillaPostgres, RemotePostgres, NeonEnv -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from contextlib import contextmanager # Type-related stuff from typing import Dict, List +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.neon_fixtures import NeonEnv, PgBin, PgProtocol, RemotePostgres, VanillaPostgres +from fixtures.pg_stats import PgStatTable + class PgCompare(ABC): """Common interface of all postgres implementations, useful for benchmarks. @@ -16,6 +16,7 @@ class PgCompare(ABC): This class is a helper class for the neon_with_baseline fixture. See its documentation for more details. 
""" + @property @abstractmethod def pg(self) -> PgProtocol: @@ -61,7 +62,7 @@ class PgCompare(ABC): data = self._retrieve_pg_stats(pg_stats) for k in set(init_data) & set(data): - self.zenbenchmark.record(k, data[k] - init_data[k], '', MetricReport.HIGHER_IS_BETTER) + self.zenbenchmark.record(k, data[k] - init_data[k], "", MetricReport.HIGHER_IS_BETTER) def _retrieve_pg_stats(self, pg_stats: List[PgStatTable]) -> Dict[str, int]: results: Dict[str, int] = {} @@ -81,17 +82,16 @@ class PgCompare(ABC): class NeonCompare(PgCompare): """PgCompare interface for the neon stack.""" - def __init__(self, - zenbenchmark: NeonBenchmarker, - neon_simple_env: NeonEnv, - pg_bin: PgBin, - branch_name): + + def __init__( + self, zenbenchmark: NeonBenchmarker, neon_simple_env: NeonEnv, pg_bin: PgBin, branch_name + ): self.env = neon_simple_env self._zenbenchmark = zenbenchmark self._pg_bin = pg_bin # We only use one branch and one timeline - self.env.neon_cli.create_branch(branch_name, 'empty') + self.env.neon_cli.create_branch(branch_name, "empty") self._pg = self.env.postgres.create_start(branch_name) self.timeline = self.pg.safe_psql("SHOW neon.timeline_id")[0][0] @@ -118,32 +118,33 @@ class NeonCompare(PgCompare): self.pscur.execute(f"compact {self.env.initial_tenant.hex} {self.timeline}") def report_peak_memory_use(self) -> None: - self.zenbenchmark.record("peak_mem", - self.zenbenchmark.get_peak_mem(self.env.pageserver) / 1024, - 'MB', - report=MetricReport.LOWER_IS_BETTER) + self.zenbenchmark.record( + "peak_mem", + self.zenbenchmark.get_peak_mem(self.env.pageserver) / 1024, + "MB", + report=MetricReport.LOWER_IS_BETTER, + ) def report_size(self) -> None: - timeline_size = self.zenbenchmark.get_timeline_size(self.env.repo_dir, - self.env.initial_tenant, - self.timeline) - self.zenbenchmark.record('size', - timeline_size / (1024 * 1024), - 'MB', - report=MetricReport.LOWER_IS_BETTER) + timeline_size = self.zenbenchmark.get_timeline_size( + self.env.repo_dir, self.env.initial_tenant, self.timeline + ) + self.zenbenchmark.record( + "size", timeline_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER + ) total_files = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_created_persistent_files_total") + self.env.pageserver, "pageserver_created_persistent_files_total" + ) total_bytes = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_written_persistent_bytes_total") - self.zenbenchmark.record("data_uploaded", - total_bytes / (1024 * 1024), - "MB", - report=MetricReport.LOWER_IS_BETTER) - self.zenbenchmark.record("num_files_uploaded", - total_files, - "", - report=MetricReport.LOWER_IS_BETTER) + self.env.pageserver, "pageserver_written_persistent_bytes_total" + ) + self.zenbenchmark.record( + "data_uploaded", total_bytes / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER + ) + self.zenbenchmark.record( + "num_files_uploaded", total_files, "", report=MetricReport.LOWER_IS_BETTER + ) def record_pageserver_writes(self, out_name): return self.zenbenchmark.record_pageserver_writes(self.env.pageserver, out_name) @@ -154,13 +155,16 @@ class NeonCompare(PgCompare): class VanillaCompare(PgCompare): """PgCompare interface for vanilla postgres.""" + def __init__(self, zenbenchmark, vanilla_pg: VanillaPostgres): self._pg = vanilla_pg self._zenbenchmark = zenbenchmark - vanilla_pg.configure([ - 'shared_buffers=1MB', - 'synchronous_commit=off', - ]) + vanilla_pg.configure( + [ + "shared_buffers=1MB", + "synchronous_commit=off", + ] + ) 
vanilla_pg.start() # Long-lived cursor, useful for flushing @@ -186,16 +190,14 @@ class VanillaCompare(PgCompare): pass # TODO find something def report_size(self) -> None: - data_size = self.pg.get_subdir_size('base') - self.zenbenchmark.record('data_size', - data_size / (1024 * 1024), - 'MB', - report=MetricReport.LOWER_IS_BETTER) - wal_size = self.pg.get_subdir_size('pg_wal') - self.zenbenchmark.record('wal_size', - wal_size / (1024 * 1024), - 'MB', - report=MetricReport.LOWER_IS_BETTER) + data_size = self.pg.get_subdir_size("base") + self.zenbenchmark.record( + "data_size", data_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER + ) + wal_size = self.pg.get_subdir_size("pg_wal") + self.zenbenchmark.record( + "wal_size", wal_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER + ) @contextmanager def record_pageserver_writes(self, out_name): @@ -207,6 +209,7 @@ class VanillaCompare(PgCompare): class RemoteCompare(PgCompare): """PgCompare interface for a remote postgres instance.""" + def __init__(self, zenbenchmark, remote_pg: RemotePostgres): self._pg = remote_pg self._zenbenchmark = zenbenchmark @@ -247,18 +250,18 @@ class RemoteCompare(PgCompare): return self.zenbenchmark.record_duration(out_name) -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def neon_compare(request, zenbenchmark, pg_bin, neon_simple_env) -> NeonCompare: branch_name = request.node.name return NeonCompare(zenbenchmark, neon_simple_env, pg_bin, branch_name) -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def vanilla_compare(zenbenchmark, vanilla_pg) -> VanillaCompare: return VanillaCompare(zenbenchmark, vanilla_pg) -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def remote_compare(zenbenchmark, remote_pg) -> RemoteCompare: return RemoteCompare(zenbenchmark, remote_pg) diff --git a/test_runner/fixtures/log_helper.py b/test_runner/fixtures/log_helper.py index 7c2d83d4e3..17f2402391 100644 --- a/test_runner/fixtures/log_helper.py +++ b/test_runner/fixtures/log_helper.py @@ -1,5 +1,6 @@ import logging import logging.config + """ This file configures logging to use in python tests. Logs are automatically captured and shown in their @@ -22,20 +23,16 @@ https://docs.pytest.org/en/6.2.x/logging.html LOGGING = { "version": 1, "loggers": { - "root": { - "level": "INFO" - }, - "root.safekeeper_async": { - "level": "INFO" # a lot of logs on DEBUG level - } - } + "root": {"level": "INFO"}, + "root.safekeeper_async": {"level": "INFO"}, # a lot of logs on DEBUG level + }, } -def getLogger(name='root') -> logging.Logger: +def getLogger(name="root") -> logging.Logger: """Method to get logger for tests. - Should be used to get correctly initialized logger. 
""" + Should be used to get correctly initialized logger.""" return logging.getLogger(name) diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 6fc62c6ea9..6159e273c0 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -1,10 +1,10 @@ -from dataclasses import dataclass -from prometheus_client.parser import text_string_to_metric_families -from prometheus_client.samples import Sample -from typing import Dict, List from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, List from fixtures.log_helper import log +from prometheus_client.parser import text_string_to_metric_families +from prometheus_client.samples import Sample class Metrics: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4483355c4c..388cc34182 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1,47 +1,45 @@ from __future__ import annotations -from dataclasses import field -from contextlib import contextmanager -from enum import Flag, auto -import enum -import textwrap -from cached_property import cached_property import abc -import asyncpg -import os -import boto3 -import pathlib -import uuid -import warnings -import jwt +import enum +import filecmp import json -import psycopg2 -import pytest +import os +import pathlib import re import shutil import socket import subprocess -import time -import filecmp -import tempfile import tarfile - -from contextlib import closing +import tempfile +import textwrap +import time +import uuid +import warnings +from contextlib import closing, contextmanager +from dataclasses import dataclass, field +from enum import Flag, auto from pathlib import Path -from dataclasses import dataclass +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, TypeVar, Union, cast + +import allure # type: ignore +import asyncpg +import backoff # type: ignore +import boto3 +import jwt +import psycopg2 +import pytest +import requests +from cached_property import cached_property +from fixtures.log_helper import log # Type-related stuff from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import make_dsn, parse_dsn -from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple from typing_extensions import Literal -import allure # type: ignore -import requests -import backoff # type: ignore +from .utils import etcd_path, get_self_dir, lsn_from_hex, lsn_to_hex, subprocess_capture -from .utils import (etcd_path, get_self_dir, subprocess_capture, lsn_from_hex, lsn_to_hex) -from fixtures.log_helper import log """ This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. @@ -60,11 +58,11 @@ put directly-importable functions into utils.py or another separate file. 
""" Env = Dict[str, str] -Fn = TypeVar('Fn', bound=Callable[..., Any]) +Fn = TypeVar("Fn", bound=Callable[..., Any]) -DEFAULT_OUTPUT_DIR = 'test_output' -DEFAULT_POSTGRES_DIR = 'tmp_install' -DEFAULT_BRANCH_NAME = 'main' +DEFAULT_OUTPUT_DIR = "test_output" +DEFAULT_POSTGRES_DIR = "tmp_install" +DEFAULT_BRANCH_NAME = "main" BASE_PORT = 15000 WORKER_PORT_NUM = 1000 @@ -92,7 +90,7 @@ def check_interferring_processes(config): return # does not use -c as it is not supported on macOS - cmd = ['pgrep', 'pageserver|postgres|safekeeper'] + cmd = ["pgrep", "pageserver|postgres|safekeeper"] result = subprocess.run(cmd, stdout=subprocess.DEVNULL) if result.returncode == 0: # returncode of 0 means it found something. @@ -100,7 +98,7 @@ def check_interferring_processes(config): # result of the test. # NOTE this shows as an internal pytest error, there might be a better way raise Exception( - 'Found interfering processes running. Stop all Neon pageservers, nodes, safekeepers, as well as stand-alone Postgres.' + "Found interfering processes running. Stop all Neon pageservers, nodes, safekeepers, as well as stand-alone Postgres." ) @@ -111,18 +109,20 @@ def pytest_configure(config): """ check_interferring_processes(config) - numprocesses = config.getoption('numprocesses') - if numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768: # do not use ephemeral ports - raise Exception('Too many workers configured. Cannot distribute ports for services.') + numprocesses = config.getoption("numprocesses") + if ( + numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768 + ): # do not use ephemeral ports + raise Exception("Too many workers configured. Cannot distribute ports for services.") # find the base directory (currently this is the git root) global base_dir - base_dir = os.path.normpath(os.path.join(get_self_dir(), '../..')) - log.info(f'base_dir is {base_dir}') + base_dir = os.path.normpath(os.path.join(get_self_dir(), "../..")) + log.info(f"base_dir is {base_dir}") # Compute the top-level directory for all tests. global top_output_dir - env_test_output = os.environ.get('TEST_OUTPUT') + env_test_output = os.environ.get("TEST_OUTPUT") if env_test_output is not None: top_output_dir = env_test_output else: @@ -131,18 +131,18 @@ def pytest_configure(config): # Find the postgres installation. global pg_distrib_dir - env_postgres_bin = os.environ.get('POSTGRES_DISTRIB_DIR') + env_postgres_bin = os.environ.get("POSTGRES_DISTRIB_DIR") if env_postgres_bin: pg_distrib_dir = env_postgres_bin else: pg_distrib_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR)) - log.info(f'pg_distrib_dir is {pg_distrib_dir}') + log.info(f"pg_distrib_dir is {pg_distrib_dir}") if os.getenv("REMOTE_ENV"): # When testing against a remote server, we only need the client binary. - if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/psql')): + if not os.path.exists(os.path.join(pg_distrib_dir, "bin/psql")): raise Exception('psql not found at "{}"'.format(pg_distrib_dir)) else: - if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/postgres')): + if not os.path.exists(os.path.join(pg_distrib_dir, "bin/postgres")): raise Exception('postgres not found at "{}"'.format(pg_distrib_dir)) if os.getenv("REMOTE_ENV"): @@ -151,25 +151,26 @@ def pytest_configure(config): return # Find the neon binaries. 
global neon_binpath - env_neon_bin = os.environ.get('NEON_BIN') + env_neon_bin = os.environ.get("NEON_BIN") if env_neon_bin: neon_binpath = env_neon_bin else: - neon_binpath = os.path.join(base_dir, 'target/debug') - log.info(f'neon_binpath is {neon_binpath}') - if not os.path.exists(os.path.join(neon_binpath, 'pageserver')): + neon_binpath = os.path.join(base_dir, "target/debug") + log.info(f"neon_binpath is {neon_binpath}") + if not os.path.exists(os.path.join(neon_binpath, "pageserver")): raise Exception('neon binaries not found at "{}"'.format(neon_binpath)) def profiling_supported(): - """Return True if the pageserver was compiled with the 'profiling' feature - """ - bin_pageserver = os.path.join(str(neon_binpath), 'pageserver') - res = subprocess.run([bin_pageserver, '--version'], - check=True, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + """Return True if the pageserver was compiled with the 'profiling' feature""" + bin_pageserver = os.path.join(str(neon_binpath), "pageserver") + res = subprocess.run( + [bin_pageserver, "--version"], + check=True, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) return "profiling:true" in res.stdout @@ -181,21 +182,21 @@ def shareable_scope(fixture_name, config) -> Literal["session", "function"]: def myfixture(...) ... """ - return 'function' if os.environ.get('TEST_SHARED_FIXTURES') is None else 'session' + return "function" if os.environ.get("TEST_SHARED_FIXTURES") is None else "session" -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def worker_seq_no(worker_id: str): # worker_id is a pytest-xdist fixture # it can be master or gw # parse it to always get a number - if worker_id == 'master': + if worker_id == "master": return 0 - assert worker_id.startswith('gw') + assert worker_id.startswith("gw") return int(worker_id[2:]) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def worker_base_port(worker_seq_no: int): # so we divide ports in ranges of 100 ports # so workers have disjoint set of ports for services @@ -247,15 +248,16 @@ class PortDistributor: return port else: raise RuntimeError( - 'port range configured for test is exhausted, consider enlarging the range') + "port range configured for test is exhausted, consider enlarging the range" + ) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def port_distributor(worker_base_port): return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def default_broker(request: Any, port_distributor: PortDistributor): client_port = port_distributor.get_port() # multiple pytest sessions could get launched in parallel, get them different datadirs @@ -267,12 +269,12 @@ def default_broker(request: Any, port_distributor: PortDistributor): broker.stop() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def run_id(): yield uuid.uuid4() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def mock_s3_server(port_distributor: PortDistributor): mock_s3_server = MockS3Server(port_distributor.get_port()) yield mock_s3_server @@ -280,7 +282,8 @@ def mock_s3_server(port_distributor: PortDistributor): class PgProtocol: - """ Reusable connection logic """ + """Reusable connection logic""" + def __init__(self, **kwargs): self.default_options = kwargs @@ -292,18 +295,18 @@ class PgProtocol: def conn_options(self, **kwargs): result = self.default_options.copy() - if 'dsn' in 
kwargs: - result.update(parse_dsn(kwargs['dsn'])) + if "dsn" in kwargs: + result.update(parse_dsn(kwargs["dsn"])) result.update(kwargs) # Individual statement timeout in seconds. 2 minutes should be # enough for our tests, but if you need a longer, you can # change it by calling "SET statement_timeout" after # connecting. - options = result.get('options', '') + options = result.get("options", "") if "statement_timeout" not in options: - options = f'-cstatement_timeout=120s {options}' - result['options'] = options + options = f"-cstatement_timeout=120s {options}" + result["options"] = options return result # autocommit=True here by default because that's what we need most of the time @@ -339,19 +342,19 @@ class PgProtocol: # The psycopg2 option 'dbname' is called 'database' is asyncpg conn_options = self.conn_options(**kwargs) - if 'dbname' in conn_options: - conn_options['database'] = conn_options.pop('dbname') + if "dbname" in conn_options: + conn_options["database"] = conn_options.pop("dbname") # Convert options='-c=' to server_settings - if 'options' in conn_options: - options = conn_options.pop('options') - for match in re.finditer(r'-c(\w*)=(\w*)', options): + if "options" in conn_options: + options = conn_options.pop("options") + for match in re.finditer(r"-c(\w*)=(\w*)", options): key = match.group(1) val = match.group(2) - if 'server_options' in conn_options: - conn_options['server_settings'].update({key: val}) + if "server_options" in conn_options: + conn_options["server_settings"].update({key: val}) else: - conn_options['server_settings'] = {key: val} + conn_options["server_settings"] = {key: val} return await asyncpg.connect(**conn_options) def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: @@ -397,11 +400,9 @@ class AuthKeys: return token def generate_tenant_token(self, tenant_id): - token = jwt.encode({ - "scope": "tenant", "tenant_id": tenant_id - }, - self.priv, - algorithm="RS256") + token = jwt.encode( + {"scope": "tenant", "tenant_id": tenant_id}, self.priv, algorithm="RS256" + ) if isinstance(token, bytes): token = token.decode() @@ -416,6 +417,7 @@ class MockS3Server: Also provides a set of methods to derive the connection properties from and the method to kill the underlying server. """ + def __init__( self, port: int, @@ -425,7 +427,7 @@ class MockS3Server: # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux # if a process is started from the shell process. - self.subprocess = subprocess.Popen(['poetry', 'run', 'moto_server', 's3', f'-p{port}']) + self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"]) error = None try: return_code = self.subprocess.poll() @@ -442,13 +444,13 @@ class MockS3Server: return f"http://127.0.0.1:{self.port}" def region(self) -> str: - return 'us-east-1' + return "us-east-1" def access_key(self) -> str: - return 'test' + return "test" def secret_key(self) -> str: - return 'test' + return "test" def kill(self): self.subprocess.kill() @@ -487,8 +489,8 @@ class S3Storage: def access_env_vars(self) -> Dict[str, str]: return { - 'AWS_ACCESS_KEY_ID': self.access_key, - 'AWS_SECRET_ACCESS_KEY': self.secret_key, + "AWS_ACCESS_KEY_ID": self.access_key, + "AWS_SECRET_ACCESS_KEY": self.secret_key, } @@ -528,6 +530,7 @@ class NeonEnvBuilder: created in the right directory, based on the test name, and it's properly cleaned up after the test has finished. 
""" + def __init__( self, repo_dir: Path, @@ -592,7 +595,7 @@ class NeonEnvBuilder: elif remote_storage_kind == RemoteStorageKind.REAL_S3: self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable) else: - raise RuntimeError(f'Unknown storage type: {remote_storage_kind}') + raise RuntimeError(f"Unknown storage type: {remote_storage_kind}") def enable_local_fs_remote_storage(self, force_enable=True): """ @@ -600,7 +603,7 @@ class NeonEnvBuilder: Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. """ assert force_enable or self.remote_storage is None, "remote storage is enabled already" - self.remote_storage = LocalFsStorage(Path(self.repo_dir / 'local_fs_remote_storage')) + self.remote_storage = LocalFsStorage(Path(self.repo_dir / "local_fs_remote_storage")) def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable=True): """ @@ -613,7 +616,7 @@ class NeonEnvBuilder: mock_region = self.mock_s3_server.region() self.remote_storage_client = boto3.client( - 's3', + "s3", endpoint_url=mock_endpoint, region_name=mock_region, aws_access_key_id=self.mock_s3_server.access_key(), @@ -652,20 +655,22 @@ class NeonEnvBuilder: self.keep_remote_storage_contents = False # construct a prefix inside bucket for the particular test case and test run - self.remote_storage_prefix = f'{self.run_id}/{test_name}' + self.remote_storage_prefix = f"{self.run_id}/{test_name}" self.remote_storage_client = boto3.client( - 's3', + "s3", region_name=region, aws_access_key_id=access_key, aws_secret_access_key=secret_key, aws_session_token=session_token, ) - self.remote_storage = S3Storage(bucket_name=bucket_name, - bucket_region=region, - access_key=access_key, - secret_key=secret_key, - prefix_in_bucket=self.remote_storage_prefix) + self.remote_storage = S3Storage( + bucket_name=bucket_name, + bucket_region=region, + access_key=access_key, + secret_key=secret_key, + prefix_in_bucket=self.remote_storage_prefix, + ) def cleanup_remote_storage(self): # here wee check for true remote storage, no the local one @@ -678,26 +683,28 @@ class NeonEnvBuilder: log.info("keep_remote_storage_contents skipping remote storage cleanup") return - log.info("removing data from test s3 bucket %s by prefix %s", - self.remote_storage.bucket_name, - self.remote_storage_prefix) - paginator = self.remote_storage_client.get_paginator('list_objects_v2') + log.info( + "removing data from test s3 bucket %s by prefix %s", + self.remote_storage.bucket_name, + self.remote_storage_prefix, + ) + paginator = self.remote_storage_client.get_paginator("list_objects_v2") pages = paginator.paginate( Bucket=self.remote_storage.bucket_name, Prefix=self.remote_storage_prefix, ) - objects_to_delete = {'Objects': []} + objects_to_delete = {"Objects": []} cnt = 0 - for item in pages.search('Contents'): + for item in pages.search("Contents"): # weirdly when nothing is found it returns [None] if item is None: break - objects_to_delete['Objects'].append({'Key': item['Key']}) + objects_to_delete["Objects"].append({"Key": item["Key"]}) # flush once aws limit reached - if len(objects_to_delete['Objects']) >= 1000: + if len(objects_to_delete["Objects"]) >= 1000: self.remote_storage_client.delete_objects( Bucket=self.remote_storage.bucket_name, Delete=objects_to_delete, @@ -706,9 +713,10 @@ class NeonEnvBuilder: cnt += 1 # flush rest - if len(objects_to_delete['Objects']): - self.remote_storage_client.delete_objects(Bucket=self.remote_storage.bucket_name, - 
Delete=objects_to_delete) + if len(objects_to_delete["Objects"]): + self.remote_storage_client.delete_objects( + Bucket=self.remote_storage.bucket_name, Delete=objects_to_delete + ) log.info("deleted %s objects from remote storage", cnt) @@ -718,7 +726,7 @@ class NeonEnvBuilder: def __exit__(self, exc_type, exc_value, traceback): # Stop all the nodes. if self.env: - log.info('Cleaning up all storage and compute nodes') + log.info("Cleaning up all storage and compute nodes") self.env.postgres.stop_all() for sk in self.env.safekeepers: sk.stop(immediate=True) @@ -759,6 +767,7 @@ class NeonEnv: create_tenant() - initializes a new tenant in the page server, returns the tenant id """ + def __init__(self, config: NeonEnvBuilder): self.repo_dir = config.repo_dir self.rust_log_override = config.rust_log_override @@ -776,15 +785,19 @@ class NeonEnv: self.initial_tenant = uuid.uuid4() # Create a config file corresponding to the options - toml = textwrap.dedent(f""" + toml = textwrap.dedent( + f""" default_tenant_id = '{self.initial_tenant.hex}' - """) + """ + ) - toml += textwrap.dedent(f""" + toml += textwrap.dedent( + f""" [etcd_broker] broker_endpoints = ['{self.broker.client_url()}'] etcd_binary_path = '{self.broker.binary_path}' - """) + """ + ) # Create config for pageserver pageserver_port = PageserverPort( @@ -793,18 +806,20 @@ class NeonEnv: ) pageserver_auth_type = "ZenithJWT" if config.auth_enabled else "Trust" - toml += textwrap.dedent(f""" + toml += textwrap.dedent( + f""" [pageserver] id=1 listen_pg_addr = 'localhost:{pageserver_port.pg}' listen_http_addr = 'localhost:{pageserver_port.http}' auth_type = '{pageserver_auth_type}' - """) + """ + ) # Create a corresponding NeonPageserver object - self.pageserver = NeonPageserver(self, - port=pageserver_port, - config_override=config.pageserver_config_override) + self.pageserver = NeonPageserver( + self, port=pageserver_port, config_override=config.pageserver_config_override + ) # Create config and a Safekeeper object for each safekeeper for i in range(1, config.num_safekeepers + 1): @@ -813,21 +828,29 @@ class NeonEnv: http=self.port_distributor.get_port(), ) id = config.safekeepers_id_start + i # assign ids sequentially - toml += textwrap.dedent(f""" + toml += textwrap.dedent( + f""" [[safekeepers]] id = {id} pg_port = {port.pg} http_port = {port.http} - sync = {'true' if config.safekeepers_enable_fsync else 'false'}""") + sync = {'true' if config.safekeepers_enable_fsync else 'false'}""" + ) if config.auth_enabled: - toml += textwrap.dedent(f""" + toml += textwrap.dedent( + f""" auth_enabled = true - """) - if bool(self.remote_storage_users - & RemoteStorageUsers.SAFEKEEPER) and self.remote_storage is not None: - toml += textwrap.dedent(f""" + """ + ) + if ( + bool(self.remote_storage_users & RemoteStorageUsers.SAFEKEEPER) + and self.remote_storage is not None + ): + toml += textwrap.dedent( + f""" remote_storage = "{remote_storage_to_toml_inline_table(self.remote_storage)}" - """) + """ + ) safekeeper = Safekeeper(env=self, id=id, port=port) self.safekeepers.append(safekeeper) @@ -843,8 +866,8 @@ class NeonEnv: safekeeper.start() def get_safekeeper_connstrs(self) -> str: - """ Get list of safekeeper endpoints suitable for safekeepers GUC """ - return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers]) + """Get list of safekeeper endpoints suitable for safekeepers GUC""" + return ",".join([f"localhost:{wa.port.pg}" for wa in self.safekeepers]) def timeline_dir(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Path: 
"""Get a timeline directory's path based on the repo directory of the test environment""" @@ -852,8 +875,8 @@ class NeonEnv: @cached_property def auth_keys(self) -> AuthKeys: - pub = (Path(self.repo_dir) / 'auth_public_key.pem').read_bytes() - priv = (Path(self.repo_dir) / 'auth_private_key.pem').read_bytes() + pub = (Path(self.repo_dir) / "auth_public_key.pem").read_bytes() + priv = (Path(self.repo_dir) / "auth_private_key.pem").read_bytes() return AuthKeys(pub=pub, priv=priv) @@ -866,11 +889,11 @@ def _shared_simple_env( run_id: uuid.UUID, ) -> Iterator[NeonEnv]: """ - # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES - is set, this is shared by all tests using `neon_simple_env`. + # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES + is set, this is shared by all tests using `neon_simple_env`. """ - if os.environ.get('TEST_SHARED_FIXTURES') is None: + if os.environ.get("TEST_SHARED_FIXTURES") is None: # Create the environment in the per-test output directory repo_dir = os.path.join(get_test_output_dir(request), "repo") else: @@ -879,21 +902,21 @@ def _shared_simple_env( shutil.rmtree(repo_dir, ignore_errors=True) with NeonEnvBuilder( - repo_dir=Path(repo_dir), - port_distributor=port_distributor, - broker=default_broker, - mock_s3_server=mock_s3_server, - run_id=run_id, + repo_dir=Path(repo_dir), + port_distributor=port_distributor, + broker=default_broker, + mock_s3_server=mock_s3_server, + run_id=run_id, ) as builder: env = builder.init_start() # For convenience in tests, create a branch from the freshly-initialized cluster. - env.neon_cli.create_branch('empty', ancestor_branch_name=DEFAULT_BRANCH_NAME) + env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) yield env -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: """ Simple Neon environment, with no authentication and no safekeepers. 
@@ -908,7 +931,7 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: _shared_simple_env.postgres.stop_all() -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def neon_env_builder( test_output_dir, port_distributor: PortDistributor, @@ -934,11 +957,11 @@ def neon_env_builder( # Return the builder to the caller with NeonEnvBuilder( - repo_dir=Path(repo_dir), - port_distributor=port_distributor, - mock_s3_server=mock_s3_server, - broker=default_broker, - run_id=run_id, + repo_dir=Path(repo_dir), + port_distributor=port_distributor, + mock_s3_server=mock_s3_server, + broker=default_broker, + run_id=run_id, ) as builder: yield builder @@ -954,16 +977,16 @@ class NeonPageserverHttpClient(requests.Session): self.auth_token = auth_token if auth_token is not None: - self.headers['Authorization'] = f'Bearer {auth_token}' + self.headers["Authorization"] = f"Bearer {auth_token}" def verbose_error(self, res: requests.Response): try: res.raise_for_status() except requests.RequestException as e: try: - msg = res.json()['msg'] + msg = res.json()["msg"] except: - msg = '' + msg = "" raise NeonPageserverApiException(msg) from e def check_status(self): @@ -980,12 +1003,12 @@ class NeonPageserverHttpClient(requests.Session): res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ - 'new_tenant_id': new_tenant_id.hex if new_tenant_id else None, + "new_tenant_id": new_tenant_id.hex if new_tenant_id else None, }, ) self.verbose_error(res) if res.status_code == 409: - raise Exception(f'could not create tenant: already exists for id {new_tenant_id}') + raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") new_tenant_id = res.json() assert isinstance(new_tenant_id, str) return uuid.UUID(new_tenant_id) @@ -1019,28 +1042,29 @@ class NeonPageserverHttpClient(requests.Session): ancestor_timeline_id: Optional[uuid.UUID] = None, ancestor_start_lsn: Optional[str] = None, ) -> Dict[Any, Any]: - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline", - json={ - 'new_timeline_id': - new_timeline_id.hex if new_timeline_id else None, - 'ancestor_start_lsn': - ancestor_start_lsn, - 'ancestor_timeline_id': - ancestor_timeline_id.hex if ancestor_timeline_id else None, - }) + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline", + json={ + "new_timeline_id": new_timeline_id.hex if new_timeline_id else None, + "ancestor_start_lsn": ancestor_start_lsn, + "ancestor_timeline_id": ancestor_timeline_id.hex if ancestor_timeline_id else None, + }, + ) self.verbose_error(res) if res.status_code == 409: - raise Exception(f'could not create timeline: already exists for id {new_timeline_id}') + raise Exception(f"could not create timeline: already exists for id {new_timeline_id}") res_json = res.json() assert isinstance(res_json, dict) return res_json - def timeline_detail(self, - tenant_id: uuid.UUID, - timeline_id: uuid.UUID, - include_non_incremental_logical_size: bool = False, - include_non_incremental_physical_size: bool = False) -> Dict[Any, Any]: + def timeline_detail( + self, + tenant_id: uuid.UUID, + timeline_id: uuid.UUID, + include_non_incremental_logical_size: bool = False, + include_non_incremental_physical_size: bool = False, + ) -> Dict[Any, Any]: include_non_incremental_logical_size_str = "0" if include_non_incremental_logical_size: @@ -1051,9 +1075,10 @@ class NeonPageserverHttpClient(requests.Session): include_non_incremental_physical_size_str = "1" res = self.get( - 
f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}" + - "?include-non-incremental-logical-size={include_non_incremental_logical_size_str}" + - "&include-non-incremental-physical-size={include_non_incremental_physical_size_str}") + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}" + + "?include-non-incremental-logical-size={include_non_incremental_logical_size_str}" + + "&include-non-incremental-physical-size={include_non_incremental_physical_size_str}" + ) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) @@ -1061,7 +1086,8 @@ class NeonPageserverHttpClient(requests.Session): def timeline_delete(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}") + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}" + ) self.verbose_error(res) res_json = res.json() assert res_json is None @@ -1079,12 +1105,15 @@ class PageserverPort: http: int -CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P[^']+)'", - re.MULTILINE) -CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P[^']+)'", - re.MULTILINE) -TIMELINE_DATA_EXTRACTOR = re.compile(r"\s(?P[^\s]+)\s\[(?P[^\]]+)\]", - re.MULTILINE) +CREATE_TIMELINE_ID_EXTRACTOR = re.compile( + r"^Created timeline '(?P[^']+)'", re.MULTILINE +) +CREATE_TIMELINE_ID_EXTRACTOR = re.compile( + r"^Created timeline '(?P[^']+)'", re.MULTILINE +) +TIMELINE_DATA_EXTRACTOR = re.compile( + r"\s(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE +) class AbstractNeonCli(abc.ABC): @@ -1093,15 +1122,18 @@ class AbstractNeonCli(abc.ABC): Supports a way to run arbitrary command directly via CLI. Do not use directly, use specific subclasses instead. """ + def __init__(self, env: NeonEnv): self.env = env COMMAND: str = cast(str, None) # To be overwritten by the derived class. - def raw_cli(self, - arguments: List[str], - extra_env_vars: Optional[Dict[str, str]] = None, - check_return_code=True) -> 'subprocess.CompletedProcess[str]': + def raw_cli( + self, + arguments: List[str], + extra_env_vars: Optional[Dict[str, str]] = None, + check_return_code=True, + ) -> "subprocess.CompletedProcess[str]": """ Run the command with the specified arguments. 
@@ -1122,30 +1154,32 @@ class AbstractNeonCli(abc.ABC): bin_neon = os.path.join(str(neon_binpath), self.COMMAND) args = [bin_neon] + arguments - log.info('Running command "{}"'.format(' '.join(args))) + log.info('Running command "{}"'.format(" ".join(args))) log.info(f'Running in "{self.env.repo_dir}"') env_vars = os.environ.copy() - env_vars['NEON_REPO_DIR'] = str(self.env.repo_dir) - env_vars['POSTGRES_DISTRIB_DIR'] = str(pg_distrib_dir) + env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir) + env_vars["POSTGRES_DISTRIB_DIR"] = str(pg_distrib_dir) if self.env.rust_log_override is not None: - env_vars['RUST_LOG'] = self.env.rust_log_override + env_vars["RUST_LOG"] = self.env.rust_log_override for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): env_vars[extra_env_key] = extra_env_value # Pass coverage settings - var = 'LLVM_PROFILE_FILE' + var = "LLVM_PROFILE_FILE" val = os.environ.get(var) if val: env_vars[var] = val # Intercept CalledProcessError and print more info - res = subprocess.run(args, - env=env_vars, - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + res = subprocess.run( + args, + env=env_vars, + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) if not res.returncode: log.info(f"Run success: {res.stdout}") elif check_return_code: @@ -1156,10 +1190,9 @@ class AbstractNeonCli(abc.ABC): stderr: {res.stderr} """ log.info(msg) - raise Exception(msg) from subprocess.CalledProcessError(res.returncode, - res.args, - res.stdout, - res.stderr) + raise Exception(msg) from subprocess.CalledProcessError( + res.returncode, res.args, res.stdout, res.stderr + ) return res @@ -1169,12 +1202,14 @@ class NeonCli(AbstractNeonCli): Supports main commands via typed methods and a way to run arbitrary command directly via CLI. """ - COMMAND = 'neon_local' + COMMAND = "neon_local" - def create_tenant(self, - tenant_id: Optional[uuid.UUID] = None, - timeline_id: Optional[uuid.UUID] = None, - conf: Optional[Dict[str, str]] = None) -> Tuple[uuid.UUID, uuid.UUID]: + def create_tenant( + self, + tenant_id: Optional[uuid.UUID] = None, + timeline_id: Optional[uuid.UUID] = None, + conf: Optional[Dict[str, str]] = None, + ) -> Tuple[uuid.UUID, uuid.UUID]: """ Creates a new tenant, returns its id and its initial timeline's id. """ @@ -1183,13 +1218,14 @@ class NeonCli(AbstractNeonCli): if timeline_id is None: timeline_id = uuid.uuid4() if conf is None: - res = self.raw_cli([ - 'tenant', 'create', '--tenant-id', tenant_id.hex, '--timeline-id', timeline_id.hex - ]) + res = self.raw_cli( + ["tenant", "create", "--tenant-id", tenant_id.hex, "--timeline-id", timeline_id.hex] + ) else: - res = self.raw_cli([ - 'tenant', 'create', '--tenant-id', tenant_id.hex, '--timeline-id', timeline_id.hex - ] + sum(list(map(lambda kv: (['-c', kv[0] + ':' + kv[1]]), conf.items())), [])) + res = self.raw_cli( + ["tenant", "create", "--tenant-id", tenant_id.hex, "--timeline-id", timeline_id.hex] + + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) + ) res.check_returncode() return tenant_id, timeline_id @@ -1198,27 +1234,28 @@ class NeonCli(AbstractNeonCli): Update tenant config. 
""" if conf is None: - res = self.raw_cli(['tenant', 'config', '--tenant-id', tenant_id.hex]) + res = self.raw_cli(["tenant", "config", "--tenant-id", tenant_id.hex]) else: res = self.raw_cli( - ['tenant', 'config', '--tenant-id', tenant_id.hex] + - sum(list(map(lambda kv: (['-c', kv[0] + ':' + kv[1]]), conf.items())), [])) + ["tenant", "config", "--tenant-id", tenant_id.hex] + + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) + ) res.check_returncode() - def list_tenants(self) -> 'subprocess.CompletedProcess[str]': - res = self.raw_cli(['tenant', 'list']) + def list_tenants(self) -> "subprocess.CompletedProcess[str]": + res = self.raw_cli(["tenant", "list"]) res.check_returncode() return res - def create_timeline(self, - new_branch_name: str, - tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: + def create_timeline( + self, new_branch_name: str, tenant_id: Optional[uuid.UUID] = None + ) -> uuid.UUID: cmd = [ - 'timeline', - 'create', - '--branch-name', + "timeline", + "create", + "--branch-name", new_branch_name, - '--tenant-id', + "--tenant-id", (tenant_id or self.env.initial_tenant).hex, ] @@ -1229,17 +1266,17 @@ class NeonCli(AbstractNeonCli): created_timeline_id = None if matches is not None: - created_timeline_id = matches.group('timeline_id') + created_timeline_id = matches.group("timeline_id") return uuid.UUID(created_timeline_id) def create_root_branch(self, branch_name: str, tenant_id: Optional[uuid.UUID] = None): cmd = [ - 'timeline', - 'create', - '--branch-name', + "timeline", + "create", + "--branch-name", branch_name, - '--tenant-id', + "--tenant-id", (tenant_id or self.env.initial_tenant).hex, ] @@ -1250,30 +1287,32 @@ class NeonCli(AbstractNeonCli): created_timeline_id = None if matches is not None: - created_timeline_id = matches.group('timeline_id') + created_timeline_id = matches.group("timeline_id") if created_timeline_id is None: - raise Exception('could not find timeline id after `neon timeline create` invocation') + raise Exception("could not find timeline id after `neon timeline create` invocation") else: return uuid.UUID(created_timeline_id) - def create_branch(self, - new_branch_name: str = DEFAULT_BRANCH_NAME, - ancestor_branch_name: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - ancestor_start_lsn: Optional[str] = None) -> uuid.UUID: + def create_branch( + self, + new_branch_name: str = DEFAULT_BRANCH_NAME, + ancestor_branch_name: Optional[str] = None, + tenant_id: Optional[uuid.UUID] = None, + ancestor_start_lsn: Optional[str] = None, + ) -> uuid.UUID: cmd = [ - 'timeline', - 'branch', - '--branch-name', + "timeline", + "branch", + "--branch-name", new_branch_name, - '--tenant-id', + "--tenant-id", (tenant_id or self.env.initial_tenant).hex, ] if ancestor_branch_name is not None: - cmd.extend(['--ancestor-branch-name', ancestor_branch_name]) + cmd.extend(["--ancestor-branch-name", ancestor_branch_name]) if ancestor_start_lsn is not None: - cmd.extend(['--ancestor-start-lsn', ancestor_start_lsn]) + cmd.extend(["--ancestor-start-lsn", ancestor_start_lsn]) res = self.raw_cli(cmd) res.check_returncode() @@ -1282,10 +1321,10 @@ class NeonCli(AbstractNeonCli): created_timeline_id = None if matches is not None: - created_timeline_id = matches.group('timeline_id') + created_timeline_id = matches.group("timeline_id") if created_timeline_id is None: - raise Exception('could not find timeline id after `neon timeline create` invocation') + raise Exception("could not find timeline id after `neon timeline create` invocation") 
else: return uuid.UUID(created_timeline_id) @@ -1297,52 +1336,60 @@ class NeonCli(AbstractNeonCli): # (L) main [b49f7954224a0ad25cc0013ea107b54b] # (L) ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] res = self.raw_cli( - ['timeline', 'list', '--tenant-id', (tenant_id or self.env.initial_tenant).hex]) + ["timeline", "list", "--tenant-id", (tenant_id or self.env.initial_tenant).hex] + ) timelines_cli = sorted( - map(lambda branch_and_id: (branch_and_id[0], branch_and_id[1]), - TIMELINE_DATA_EXTRACTOR.findall(res.stdout))) + map( + lambda branch_and_id: (branch_and_id[0], branch_and_id[1]), + TIMELINE_DATA_EXTRACTOR.findall(res.stdout), + ) + ) return timelines_cli - def init(self, - config_toml: str, - initial_timeline_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]': - with tempfile.NamedTemporaryFile(mode='w+') as tmp: + def init( + self, config_toml: str, initial_timeline_id: Optional[uuid.UUID] = None + ) -> "subprocess.CompletedProcess[str]": + with tempfile.NamedTemporaryFile(mode="w+") as tmp: tmp.write(config_toml) tmp.flush() - cmd = ['init', f'--config={tmp.name}'] + cmd = ["init", f"--config={tmp.name}"] if initial_timeline_id: - cmd.extend(['--timeline-id', initial_timeline_id.hex]) + cmd.extend(["--timeline-id", initial_timeline_id.hex]) append_pageserver_param_overrides( params_to_update=cmd, remote_storage=self.env.remote_storage, remote_storage_users=self.env.remote_storage_users, - pageserver_config_override=self.env.pageserver.config_override) + pageserver_config_override=self.env.pageserver.config_override, + ) res = self.raw_cli(cmd) res.check_returncode() return res def pageserver_enabled_features(self) -> Any: - bin_pageserver = os.path.join(str(neon_binpath), 'pageserver') - args = [bin_pageserver, '--enabled-features'] - log.info('Running command "{}"'.format(' '.join(args))) + bin_pageserver = os.path.join(str(neon_binpath), "pageserver") + args = [bin_pageserver, "--enabled-features"] + log.info('Running command "{}"'.format(" ".join(args))) - res = subprocess.run(args, - check=True, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + res = subprocess.run( + args, + check=True, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) log.info(f"pageserver_enabled_features success: {res.stdout}") return json.loads(res.stdout) - def pageserver_start(self, overrides=()) -> 'subprocess.CompletedProcess[str]': - start_args = ['pageserver', 'start', *overrides] + def pageserver_start(self, overrides=()) -> "subprocess.CompletedProcess[str]": + start_args = ["pageserver", "start", *overrides] append_pageserver_param_overrides( params_to_update=start_args, remote_storage=self.env.remote_storage, remote_storage_users=self.env.remote_storage_users, - pageserver_config_override=self.env.pageserver.config_override) + pageserver_config_override=self.env.pageserver.config_override, + ) s3_env_vars = None if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage): @@ -1350,29 +1397,29 @@ class NeonCli(AbstractNeonCli): return self.raw_cli(start_args, extra_env_vars=s3_env_vars) - def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]': - cmd = ['pageserver', 'stop'] + def pageserver_stop(self, immediate=False) -> "subprocess.CompletedProcess[str]": + cmd = ["pageserver", "stop"] if immediate: - cmd.extend(['-m', 'immediate']) + cmd.extend(["-m", "immediate"]) log.info(f"Stopping pageserver with {cmd}") return 
self.raw_cli(cmd) - def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]': + def safekeeper_start(self, id: int) -> "subprocess.CompletedProcess[str]": s3_env_vars = None if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage): s3_env_vars = self.env.remote_storage.access_env_vars() - return self.raw_cli(['safekeeper', 'start', str(id)], extra_env_vars=s3_env_vars) + return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars) - def safekeeper_stop(self, - id: Optional[int] = None, - immediate=False) -> 'subprocess.CompletedProcess[str]': - args = ['safekeeper', 'stop'] + def safekeeper_stop( + self, id: Optional[int] = None, immediate=False + ) -> "subprocess.CompletedProcess[str]": + args = ["safekeeper", "stop"] if id is not None: args.append(str(id)) if immediate: - args.extend(['-m', 'immediate']) + args.extend(["-m", "immediate"]) return self.raw_cli(args) def pg_create( @@ -1382,19 +1429,19 @@ class NeonCli(AbstractNeonCli): tenant_id: Optional[uuid.UUID] = None, lsn: Optional[str] = None, port: Optional[int] = None, - ) -> 'subprocess.CompletedProcess[str]': + ) -> "subprocess.CompletedProcess[str]": args = [ - 'pg', - 'create', - '--tenant-id', + "pg", + "create", + "--tenant-id", (tenant_id or self.env.initial_tenant).hex, - '--branch-name', + "--branch-name", branch_name, ] if lsn is not None: - args.extend(['--lsn', lsn]) + args.extend(["--lsn", lsn]) if port is not None: - args.extend(['--port', str(port)]) + args.extend(["--port", str(port)]) if node_name is not None: args.append(node_name) @@ -1408,17 +1455,17 @@ class NeonCli(AbstractNeonCli): tenant_id: Optional[uuid.UUID] = None, lsn: Optional[str] = None, port: Optional[int] = None, - ) -> 'subprocess.CompletedProcess[str]': + ) -> "subprocess.CompletedProcess[str]": args = [ - 'pg', - 'start', - '--tenant-id', + "pg", + "start", + "--tenant-id", (tenant_id or self.env.initial_tenant).hex, ] if lsn is not None: - args.append(f'--lsn={lsn}') + args.append(f"--lsn={lsn}") if port is not None: - args.append(f'--port={port}') + args.append(f"--port={port}") if node_name is not None: args.append(node_name) @@ -1432,15 +1479,15 @@ class NeonCli(AbstractNeonCli): tenant_id: Optional[uuid.UUID] = None, destroy=False, check_return_code=True, - ) -> 'subprocess.CompletedProcess[str]': + ) -> "subprocess.CompletedProcess[str]": args = [ - 'pg', - 'stop', - '--tenant-id', + "pg", + "stop", + "--tenant-id", (tenant_id or self.env.initial_tenant).hex, ] if destroy: - args.append('--destroy') + args.append("--destroy") if node_name is not None: args.append(node_name) @@ -1453,12 +1500,12 @@ class WalCraft(AbstractNeonCli): Supports main commands via typed methods and a way to run arbitrary command directly via CLI. """ - COMMAND = 'wal_craft' + COMMAND = "wal_craft" def postgres_config(self) -> List[str]: res = self.raw_cli(["print-postgres-config"]) res.check_returncode() - return res.stdout.split('\n') + return res.stdout.split("\n") def in_existing(self, type: str, connection: str) -> None: res = self.raw_cli(["in-existing", type, connection]) @@ -1471,14 +1518,15 @@ class NeonPageserver(PgProtocol): Initializes the repository via `neon init`. 
""" + def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional[str] = None): - super().__init__(host='localhost', port=port.pg, user='cloud_admin') + super().__init__(host="localhost", port=port.pg, user="cloud_admin") self.env = env self.running = False self.service_port = port self.config_override = config_override - def start(self, overrides=()) -> 'NeonPageserver': + def start(self, overrides=()) -> "NeonPageserver": """ Start the page server. `overrides` allows to add some config to this pageserver start. @@ -1490,7 +1538,7 @@ class NeonPageserver(PgProtocol): self.running = True return self - def stop(self, immediate=False) -> 'NeonPageserver': + def stop(self, immediate=False) -> "NeonPageserver": """ Stop the page server. Returns self. @@ -1523,31 +1571,33 @@ def append_pageserver_param_overrides( remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) params_to_update.append( - f'--pageserver-config-override=remote_storage={remote_storage_toml_table}') + f"--pageserver-config-override=remote_storage={remote_storage_toml_table}" + ) - env_overrides = os.getenv('ZENITH_PAGESERVER_OVERRIDES') + env_overrides = os.getenv("ZENITH_PAGESERVER_OVERRIDES") if env_overrides is not None: params_to_update += [ - f'--pageserver-config-override={o.strip()}' for o in env_overrides.split(';') + f"--pageserver-config-override={o.strip()}" for o in env_overrides.split(";") ] if pageserver_config_override is not None: params_to_update += [ - f'--pageserver-config-override={o.strip()}' - for o in pageserver_config_override.split(';') + f"--pageserver-config-override={o.strip()}" + for o in pageserver_config_override.split(";") ] class PgBin: - """ A helper class for executing postgres binaries """ + """A helper class for executing postgres binaries""" + def __init__(self, log_dir: Path): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), 'bin') + self.pg_bin_path = os.path.join(str(pg_distrib_dir), "bin") self.env = os.environ.copy() - self.env['LD_LIBRARY_PATH'] = os.path.join(str(pg_distrib_dir), 'lib') + self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), "lib") def _fixpath(self, command: List[str]): - if '/' not in command[0]: + if "/" not in command[0]: command[0] = os.path.join(self.pg_bin_path, command[0]) def _build_env(self, env_add: Optional[Env]) -> Env: @@ -1572,15 +1622,17 @@ class PgBin: """ self._fixpath(command) - log.info('Running command "{}"'.format(' '.join(command))) + log.info('Running command "{}"'.format(" ".join(command))) env = self._build_env(env) subprocess.run(command, env=env, cwd=cwd, check=True) - def run_capture(self, - command: List[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, - **kwargs: Any) -> str: + def run_capture( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[str] = None, + **kwargs: Any, + ) -> str: """ Run one of the postgres binaries, with stderr and stdout redirected to a file. 
@@ -1589,35 +1641,32 @@ class PgBin: """ self._fixpath(command) - log.info('Running command "{}"'.format(' '.join(command))) + log.info('Running command "{}"'.format(" ".join(command))) env = self._build_env(env) - return subprocess_capture(str(self.log_dir), - command, - env=env, - cwd=cwd, - check=True, - **kwargs) + return subprocess_capture( + str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs + ) -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def pg_bin(test_output_dir: Path) -> PgBin: return PgBin(test_output_dir) class VanillaPostgres(PgProtocol): def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): - super().__init__(host='localhost', port=port, dbname='postgres') + super().__init__(host="localhost", port=port, dbname="postgres") self.pgdatadir = pgdatadir self.pg_bin = pg_bin self.running = False if init: - self.pg_bin.run_capture(['initdb', '-D', str(pgdatadir)]) + self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) self.configure([f"port = {port}\n"]) def configure(self, options: List[str]): """Append lines into postgresql.conf file.""" assert not self.running - with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file: + with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: conf_file.write("\n".join(options)) def start(self, log_path: Optional[str] = None): @@ -1628,12 +1677,13 @@ class VanillaPostgres(PgProtocol): log_path = os.path.join(self.pgdatadir, "pg.log") self.pg_bin.run_capture( - ['pg_ctl', '-w', '-D', str(self.pgdatadir), '-l', log_path, 'start']) + ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] + ) def stop(self): assert self.running self.running = False - self.pg_bin.run_capture(['pg_ctl', '-w', '-D', str(self.pgdatadir), 'stop']) + self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) def get_subdir_size(self, subdir) -> int: """Return size of pgdatadir subdirectory in bytes.""" @@ -1647,9 +1697,10 @@ class VanillaPostgres(PgProtocol): self.stop() -@pytest.fixture(scope='function') -def vanilla_pg(test_output_dir: Path, - port_distributor: PortDistributor) -> Iterator[VanillaPostgres]: +@pytest.fixture(scope="function") +def vanilla_pg( + test_output_dir: Path, port_distributor: PortDistributor +) -> Iterator[VanillaPostgres]: pgdatadir = test_output_dir / "pgdata-vanilla" pg_bin = PgBin(test_output_dir) port = port_distributor.get_port() @@ -1665,18 +1716,18 @@ class RemotePostgres(PgProtocol): self.running = True def configure(self, options: List[str]): - raise Exception('cannot change configuration of remote Posgres instance') + raise Exception("cannot change configuration of remote Posgres instance") def start(self): - raise Exception('cannot start a remote Postgres instance') + raise Exception("cannot start a remote Postgres instance") def stop(self): - raise Exception('cannot stop a remote Postgres instance') + raise Exception("cannot stop a remote Postgres instance") def get_subdir_size(self, subdir) -> int: # TODO: Could use the server's Generic File Access functions if superuser. 
# See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE - raise Exception('cannot get size of a Postgres instance') + raise Exception("cannot get size of a Postgres instance") def __enter__(self): return self @@ -1686,7 +1737,7 @@ class RemotePostgres(PgProtocol): pass -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]: pg_bin = PgBin(test_output_dir) @@ -1701,7 +1752,7 @@ def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]: class NeonProxy(PgProtocol): def __init__(self, proxy_port: int, http_port: int, auth_endpoint: str): super().__init__(dsn=auth_endpoint, port=proxy_port) - self.host = '127.0.0.1' + self.host = "127.0.0.1" self.http_port = http_port self.proxy_port = proxy_port self.auth_endpoint = auth_endpoint @@ -1712,7 +1763,7 @@ class NeonProxy(PgProtocol): # Start proxy args = [ - os.path.join(str(neon_binpath), 'proxy'), + os.path.join(str(neon_binpath), "proxy"), *["--http", f"{self.host}:{self.http_port}"], *["--proxy", f"{self.host}:{self.proxy_port}"], *["--auth-backend", "postgres"], @@ -1735,7 +1786,7 @@ class NeonProxy(PgProtocol): self._popen.kill() -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def static_proxy(vanilla_pg, port_distributor) -> Iterator[NeonProxy]: """Neon proxy that routes directly to vanilla postgres.""" @@ -1743,28 +1794,28 @@ def static_proxy(vanilla_pg, port_distributor) -> Iterator[NeonProxy]: vanilla_pg.start() vanilla_pg.safe_psql("create user proxy with login superuser password 'password'") - port = vanilla_pg.default_options['port'] - host = vanilla_pg.default_options['host'] - dbname = vanilla_pg.default_options['dbname'] - auth_endpoint = f'postgres://proxy:password@{host}:{port}/{dbname}' + port = vanilla_pg.default_options["port"] + host = vanilla_pg.default_options["host"] + dbname = vanilla_pg.default_options["dbname"] + auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}" proxy_port = port_distributor.get_port() http_port = port_distributor.get_port() - with NeonProxy(proxy_port=proxy_port, http_port=http_port, - auth_endpoint=auth_endpoint) as proxy: + with NeonProxy( + proxy_port=proxy_port, http_port=http_port, auth_endpoint=auth_endpoint + ) as proxy: proxy.start() yield proxy class Postgres(PgProtocol): - """ An object representing a running postgres daemon. """ - def __init__(self, - env: NeonEnv, - tenant_id: uuid.UUID, - port: int, - check_stop_result: bool = True): - super().__init__(host='localhost', port=port, user='cloud_admin', dbname='postgres') + """An object representing a running postgres daemon.""" + + def __init__( + self, env: NeonEnv, tenant_id: uuid.UUID, port: int, check_stop_result: bool = True + ): + super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres") self.env = env self.running = False self.node_name: Optional[str] = None # dubious, see asserts below @@ -1780,7 +1831,7 @@ class Postgres(PgProtocol): node_name: Optional[str] = None, lsn: Optional[str] = None, config_lines: Optional[List[str]] = None, - ) -> 'Postgres': + ) -> "Postgres": """ Create the pg data directory. Returns self. 
@@ -1789,13 +1840,11 @@ class Postgres(PgProtocol): if not config_lines: config_lines = [] - self.node_name = node_name or f'{branch_name}_pg_node' - self.env.neon_cli.pg_create(branch_name, - node_name=self.node_name, - tenant_id=self.tenant_id, - lsn=lsn, - port=self.port) - path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name + self.node_name = node_name or f"{branch_name}_pg_node" + self.env.neon_cli.pg_create( + branch_name, node_name=self.node_name, tenant_id=self.tenant_id, lsn=lsn, port=self.port + ) + path = pathlib.Path("pgdatadirs") / "tenants" / self.tenant_id.hex / self.node_name self.pgdata_dir = os.path.join(self.env.repo_dir, path) if config_lines is None: @@ -1803,12 +1852,12 @@ class Postgres(PgProtocol): # set small 'max_replication_write_lag' to enable backpressure # and make tests more stable. - config_lines = ['max_replication_write_lag=15MB'] + config_lines + config_lines = ["max_replication_write_lag=15MB"] + config_lines self.config(config_lines) return self - def start(self) -> 'Postgres': + def start(self) -> "Postgres": """ Start the Postgres instance. Returns self. @@ -1818,32 +1867,32 @@ class Postgres(PgProtocol): log.info(f"Starting postgres node {self.node_name}") - run_result = self.env.neon_cli.pg_start(self.node_name, - tenant_id=self.tenant_id, - port=self.port) + run_result = self.env.neon_cli.pg_start( + self.node_name, tenant_id=self.tenant_id, port=self.port + ) self.running = True return self def pg_data_dir_path(self) -> str: - """ Path to data directory """ + """Path to data directory""" assert self.node_name - path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name + path = pathlib.Path("pgdatadirs") / "tenants" / self.tenant_id.hex / self.node_name return os.path.join(self.env.repo_dir, path) def pg_xact_dir_path(self) -> str: - """ Path to pg_xact dir """ - return os.path.join(self.pg_data_dir_path(), 'pg_xact') + """Path to pg_xact dir""" + return os.path.join(self.pg_data_dir_path(), "pg_xact") def pg_twophase_dir_path(self) -> str: - """ Path to pg_twophase dir """ - return os.path.join(self.pg_data_dir_path(), 'pg_twophase') + """Path to pg_twophase dir""" + return os.path.join(self.pg_data_dir_path(), "pg_twophase") def config_file_path(self) -> str: - """ Path to postgresql.conf """ - return os.path.join(self.pg_data_dir_path(), 'postgresql.conf') + """Path to postgresql.conf""" + return os.path.join(self.pg_data_dir_path(), "postgresql.conf") - def adjust_for_safekeepers(self, safekeepers: str) -> 'Postgres': + def adjust_for_safekeepers(self, safekeepers: str) -> "Postgres": """ Adjust instance config for working with wal acceptors instead of pageserver (pre-configured by CLI) directly. @@ -1855,30 +1904,33 @@ class Postgres(PgProtocol): with open(self.config_file_path(), "w") as f: for cfg_line in cfg_lines: # walproposer uses different application_name - if ("synchronous_standby_names" in cfg_line or - # don't repeat safekeepers/wal_acceptors multiple times - "neon.safekeepers" in cfg_line): + if ( + "synchronous_standby_names" in cfg_line + or + # don't repeat safekeepers/wal_acceptors multiple times + "neon.safekeepers" in cfg_line + ): continue f.write(cfg_line) f.write("synchronous_standby_names = 'walproposer'\n") f.write("neon.safekeepers = '{}'\n".format(safekeepers)) return self - def config(self, lines: List[str]) -> 'Postgres': + def config(self, lines: List[str]) -> "Postgres": """ Add lines to postgresql.conf. 
Lines should be an array of valid postgresql.conf rows. Returns self. """ - with open(self.config_file_path(), 'a') as conf: + with open(self.config_file_path(), "a") as conf: for line in lines: conf.write(line) - conf.write('\n') + conf.write("\n") return self - def stop(self) -> 'Postgres': + def stop(self) -> "Postgres": """ Stop the Postgres instance if it's running. Returns self. @@ -1886,24 +1938,23 @@ class Postgres(PgProtocol): if self.running: assert self.node_name is not None - self.env.neon_cli.pg_stop(self.node_name, - self.tenant_id, - check_return_code=self.check_stop_result) + self.env.neon_cli.pg_stop( + self.node_name, self.tenant_id, check_return_code=self.check_stop_result + ) self.running = False return self - def stop_and_destroy(self) -> 'Postgres': + def stop_and_destroy(self) -> "Postgres": """ Stop the Postgres instance, then destroy it. Returns self. """ assert self.node_name is not None - self.env.neon_cli.pg_stop(self.node_name, - self.tenant_id, - True, - check_return_code=self.check_stop_result) + self.env.neon_cli.pg_stop( + self.node_name, self.tenant_id, True, check_return_code=self.check_stop_result + ) self.node_name = None self.running = False @@ -1915,7 +1966,7 @@ class Postgres(PgProtocol): node_name: Optional[str] = None, lsn: Optional[str] = None, config_lines: Optional[List[str]] = None, - ) -> 'Postgres': + ) -> "Postgres": """ Create a Postgres instance, apply config and then start it. @@ -1943,18 +1994,21 @@ class Postgres(PgProtocol): class PostgresFactory: - """ An object representing multiple running postgres daemons. """ + """An object representing multiple running postgres daemons.""" + def __init__(self, env: NeonEnv): self.env = env self.num_instances = 0 self.instances: List[Postgres] = [] - def create_start(self, - branch_name: str, - node_name: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - lsn: Optional[str] = None, - config_lines: Optional[List[str]] = None) -> Postgres: + def create_start( + self, + branch_name: str, + node_name: Optional[str] = None, + tenant_id: Optional[uuid.UUID] = None, + lsn: Optional[str] = None, + config_lines: Optional[List[str]] = None, + ) -> Postgres: pg = Postgres( self.env, @@ -1971,12 +2025,14 @@ class PostgresFactory: lsn=lsn, ) - def create(self, - branch_name: str, - node_name: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - lsn: Optional[str] = None, - config_lines: Optional[List[str]] = None) -> Postgres: + def create( + self, + branch_name: str, + node_name: Optional[str] = None, + tenant_id: Optional[uuid.UUID] = None, + lsn: Optional[str] = None, + config_lines: Optional[List[str]] = None, + ) -> Postgres: pg = Postgres( self.env, @@ -1994,7 +2050,7 @@ class PostgresFactory: config_lines=config_lines, ) - def stop_all(self) -> 'PostgresFactory': + def stop_all(self) -> "PostgresFactory": for pg in self.instances: pg.stop() @@ -2002,7 +2058,7 @@ class PostgresFactory: def read_pid(path: Path) -> int: - """ Read content of file into number """ + """Read content of file into number""" return int(path.read_text()) @@ -2014,13 +2070,14 @@ class SafekeeperPort: @dataclass class Safekeeper: - """ An object representing a running safekeeper daemon. 
""" + """An object representing a running safekeeper daemon.""" + env: NeonEnv port: SafekeeperPort id: int running: bool = False - def start(self) -> 'Safekeeper': + def start(self) -> "Safekeeper": assert self.running == False self.env.neon_cli.safekeeper_start(self.id) self.running = True @@ -2034,22 +2091,22 @@ class Safekeeper: elapsed = time.time() - started_at if elapsed > 3: raise RuntimeError( - f"timed out waiting {elapsed:.0f}s for wal acceptor start: {e}") + f"timed out waiting {elapsed:.0f}s for wal acceptor start: {e}" + ) time.sleep(0.5) else: break # success return self - def stop(self, immediate=False) -> 'Safekeeper': - log.info('Stopping safekeeper {}'.format(self.id)) + def stop(self, immediate=False) -> "Safekeeper": + log.info("Stopping safekeeper {}".format(self.id)) self.env.neon_cli.safekeeper_stop(self.id, immediate) self.running = False return self - def append_logical_message(self, - tenant_id: uuid.UUID, - timeline_id: uuid.UUID, - request: Dict[str, Any]) -> Dict[str, Any]: + def append_logical_message( + self, tenant_id: uuid.UUID, timeline_id: uuid.UUID, request: Dict[str, Any] + ) -> Dict[str, Any]: """ Send JSON_CTRL query to append LogicalMessage to WAL and modify safekeeper state. It will construct LogicalMessage from provided @@ -2106,7 +2163,7 @@ class SafekeeperHttpClient(requests.Session): self.auth_token = auth_token if auth_token is not None: - self.headers['Authorization'] = f'Bearer {auth_token}' + self.headers["Authorization"] = f"Bearer {auth_token}" def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() @@ -2115,21 +2172,25 @@ class SafekeeperHttpClient(requests.Session): res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() resj = res.json() - return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'], - flush_lsn=resj['flush_lsn'], - timeline_start_lsn=resj['timeline_start_lsn'], - backup_lsn=resj['backup_lsn'], - remote_consistent_lsn=resj['remote_consistent_lsn']) + return SafekeeperTimelineStatus( + acceptor_epoch=resj["acceptor_state"]["epoch"], + flush_lsn=resj["flush_lsn"], + timeline_start_lsn=resj["timeline_start_lsn"], + backup_lsn=resj["backup_lsn"], + remote_consistent_lsn=resj["remote_consistent_lsn"], + ) def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body): res = self.post( f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", - json=body) + json=body, + ) res.raise_for_status() def timeline_delete_force(self, tenant_id: str, timeline_id: str) -> Dict[Any, Any]: res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" + ) res.raise_for_status() res_json = res.json() assert isinstance(res_json, dict) @@ -2152,21 +2213,24 @@ class SafekeeperHttpClient(requests.Session): metrics = SafekeeperMetrics() for match in re.finditer( - r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE): + r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', + all_metrics_text, + re.MULTILINE, + ): metrics.flush_lsn_inexact[(match.group(1), match.group(2))] = int(match.group(3)) for match in re.finditer( - r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE): + 
r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', + all_metrics_text, + re.MULTILINE, + ): metrics.commit_lsn_inexact[(match.group(1), match.group(2))] = int(match.group(3)) return metrics @dataclass class Etcd: - """ An object managing etcd instance """ + """An object managing etcd instance""" + datadir: str port: int peer_port: int @@ -2177,16 +2241,16 @@ class Etcd: self.binary_path = etcd_path() def client_url(self): - return f'http://127.0.0.1:{self.port}' + return f"http://127.0.0.1:{self.port}" def check_status(self): with requests.Session() as s: - s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry + s.mount("http://", requests.adapters.HTTPAdapter(max_retries=1)) # do not retry s.get(f"{self.client_url()}/health").raise_for_status() def try_start(self): if self.handle is not None: - log.debug(f'etcd is already running on port {self.port}') + log.debug(f"etcd is already running on port {self.port}") return pathlib.Path(self.datadir).mkdir(exist_ok=True) @@ -2206,7 +2270,7 @@ class Etcd: # Set --quota-backend-bytes to keep the etcd virtual memory # size smaller. Our test etcd clusters are very small. # See https://github.com/etcd-io/etcd/issues/7910 - f"--quota-backend-bytes=100000000" + f"--quota-backend-bytes=100000000", ] self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file) @@ -2230,21 +2294,23 @@ class Etcd: def get_test_output_dir(request: Any) -> pathlib.Path: - """ Compute the working directory for an individual test. """ + """Compute the working directory for an individual test.""" test_name = request.node.name test_dir = pathlib.Path(top_output_dir) / test_name.replace("/", "-") - log.info(f'get_test_output_dir is {test_dir}') + log.info(f"get_test_output_dir is {test_dir}") # make mypy happy assert isinstance(test_dir, pathlib.Path) return test_dir -ATTACHMENT_SUFFIXES = frozenset(( - '.log', - '.stderr', - '.stdout', - '.diffs', -)) +ATTACHMENT_SUFFIXES = frozenset( + ( + ".log", + ".stderr", + ".stdout", + ".diffs", + ) +) # This is autouse, so the test output directory always gets created, even @@ -2256,51 +2322,59 @@ ATTACHMENT_SUFFIXES = frozenset(( # scope. So it uses the get_test_output_dir() function to get the path, and # this fixture ensures that the directory exists. That works because # 'autouse' fixtures are run before other fixtures. -@pytest.fixture(scope='function', autouse=True) +@pytest.fixture(scope="function", autouse=True) def test_output_dir(request: Any) -> Iterator[pathlib.Path]: - """ Create the working directory for an individual test. 
""" + """Create the working directory for an individual test.""" # one directory per test test_dir = get_test_output_dir(request) - log.info(f'test_output_dir is {test_dir}') + log.info(f"test_output_dir is {test_dir}") shutil.rmtree(test_dir, ignore_errors=True) test_dir.mkdir() yield test_dir - for attachment in test_dir.glob('**/*'): + for attachment in test_dir.glob("**/*"): if attachment.suffix in ATTACHMENT_SUFFIXES: source = str(attachment) name = str(attachment.relative_to(test_dir)) - attachment_type = 'text/plain' - extension = attachment.suffix.removeprefix('.') + attachment_type = "text/plain" + extension = attachment.suffix.removeprefix(".") # compress files larger than 1Mb, they're hardly readable in a browser if attachment.stat().st_size > 1024 * 1024: - source = f'{attachment}.tar.gz' - with tarfile.open(source, 'w:gz') as tar: + source = f"{attachment}.tar.gz" + with tarfile.open(source, "w:gz") as tar: tar.add(attachment, arcname=attachment.name) - name = f'{name}.tar.gz' - attachment_type = 'application/gzip' - extension = 'tar.gz' + name = f"{name}.tar.gz" + attachment_type = "application/gzip" + extension = "tar.gz" allure.attach.file(source, name, attachment_type, extension) -SKIP_DIRS = frozenset(('pg_wal', - 'pg_stat', - 'pg_stat_tmp', - 'pg_subtrans', - 'pg_logical', - 'pg_replslot/wal_proposer_slot')) +SKIP_DIRS = frozenset( + ( + "pg_wal", + "pg_stat", + "pg_stat_tmp", + "pg_subtrans", + "pg_logical", + "pg_replslot/wal_proposer_slot", + ) +) -SKIP_FILES = frozenset(('pg_internal.init', - 'pg.log', - 'zenith.signal', - 'postgresql.conf', - 'postmaster.opts', - 'postmaster.pid', - 'pg_control')) +SKIP_FILES = frozenset( + ( + "pg_internal.init", + "pg.log", + "zenith.signal", + "postgresql.conf", + "postmaster.opts", + "postmaster.pid", + "pg_control", + ) +) def should_skip_dir(dirname: str) -> bool: @@ -2312,10 +2386,10 @@ def should_skip_file(filename: str) -> bool: return True # check for temp table files according to https://www.postgresql.org/docs/current/storage-file-layout.html # i e "tBBB_FFF" - if not filename.startswith('t'): + if not filename.startswith("t"): return False - tmp_name = filename[1:].split('_') + tmp_name = filename[1:].split("_") if len(tmp_name) != 2: return False @@ -2358,7 +2432,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post restored_dir_path.mkdir(exist_ok=True) pg_bin = PgBin(test_output_dir) - psql_path = os.path.join(pg_bin.pg_bin_path, 'psql') + psql_path = os.path.join(pg_bin.pg_bin_path, "psql") cmd = rf""" {psql_path} \ @@ -2370,12 +2444,12 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')} + psql_env = {"LD_LIBRARY_PATH": os.path.join(str(pg_distrib_dir), "lib")} result = subprocess.run(cmd, env=psql_env, capture_output=True, text=True, shell=True) # Print captured stdout/stderr if basebackup cmd failed. 
if result.returncode != 0: - log.error('Basebackup shell command failed with:') + log.error("Basebackup shell command failed with:") log.error(result.stdout) log.error(result.stderr) assert result.returncode == 0 @@ -2392,11 +2466,10 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post # filecmp returns (match, mismatch, error) lists # We've already filtered all mismatching files in list_files_to_compare(), # so here expect that the content is identical - (match, mismatch, error) = filecmp.cmpfiles(pg.pgdata_dir, - restored_dir_path, - pgdata_files, - shallow=False) - log.info(f'filecmp result mismatch and error lists:\n\t mismatch={mismatch}\n\t error={error}') + (match, mismatch, error) = filecmp.cmpfiles( + pg.pgdata_dir, restored_dir_path, pgdata_files, shallow=False + ) + log.info(f"filecmp result mismatch and error lists:\n\t mismatch={mismatch}\n\t error={error}") for f in mismatch: @@ -2404,11 +2477,11 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post f2 = os.path.join(restored_dir_path, f) stdout_filename = "{}.filediff".format(f2) - with open(stdout_filename, 'w') as stdout_f: + with open(stdout_filename, "w") as stdout_f: subprocess.run("xxd -b {} > {}.hex ".format(f1, f1), shell=True) subprocess.run("xxd -b {} > {}.hex ".format(f2, f2), shell=True) - cmd = 'diff {}.hex {}.hex'.format(f1, f2) + cmd = "diff {}.hex {}.hex".format(f1, f2) subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, error) == ([], []) @@ -2432,11 +2505,11 @@ def wait_until(number_of_iterations: int, interval: float, func): raise Exception("timed out while waiting for %s" % func) from last_exception -def assert_timeline_local(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID): +def assert_timeline_local( + pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID +): timeline_detail = pageserver_http_client.timeline_detail(tenant, timeline) - assert timeline_detail.get('local', {}).get("disk_consistent_lsn"), timeline_detail + assert timeline_detail.get("local", {}).get("disk_consistent_lsn"), timeline_detail return timeline_detail @@ -2445,65 +2518,81 @@ def assert_no_in_progress_downloads_for_tenant( tenant: uuid.UUID, ): tenant_status = pageserver_http_client.tenant_status(tenant) - assert tenant_status['has_in_progress_downloads'] is False, tenant_status + assert tenant_status["has_in_progress_downloads"] is False, tenant_status -def remote_consistent_lsn(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID) -> int: +def remote_consistent_lsn( + pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID +) -> int: detail = pageserver_http_client.timeline_detail(tenant, timeline) - if detail['remote'] is None: + if detail["remote"] is None: # No remote information at all. This happens right after creating # a timeline, before any part of it has been uploaded to remote # storage yet. 
return 0 else: - lsn_str = detail['remote']['remote_consistent_lsn'] + lsn_str = detail["remote"]["remote_consistent_lsn"] assert isinstance(lsn_str, str) return lsn_from_hex(lsn_str) -def wait_for_upload(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int): +def wait_for_upload( + pageserver_http_client: NeonPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID, + lsn: int, +): """waits for local timeline upload up to specified lsn""" for i in range(20): current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) if current_lsn >= lsn: return - log.info("waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1)) + log.info( + "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 + ) + ) time.sleep(1) - raise Exception("timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn))) + raise Exception( + "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn) + ) + ) -def last_record_lsn(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID) -> int: +def last_record_lsn( + pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID +) -> int: detail = pageserver_http_client.timeline_detail(tenant, timeline) - lsn_str = detail['local']['last_record_lsn'] + lsn_str = detail["local"]["last_record_lsn"] assert isinstance(lsn_str, str) return lsn_from_hex(lsn_str) -def wait_for_last_record_lsn(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int): +def wait_for_last_record_lsn( + pageserver_http_client: NeonPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID, + lsn: int, +): """waits for pageserver to catch up to a certain lsn""" for i in range(10): current_lsn = last_record_lsn(pageserver_http_client, tenant, timeline) if current_lsn >= lsn: return - log.info("waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1)) + log.info( + "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 + ) + ) time.sleep(1) - raise Exception("timed out while waiting for last_record_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn))) + raise Exception( + "timed out while waiting for last_record_lsn to reach {}, was {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn) + ) + ) def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: uuid.UUID, timeline: uuid.UUID): diff --git a/test_runner/fixtures/pg_stats.py b/test_runner/fixtures/pg_stats.py index e113d37248..b2e6886eb3 100644 --- a/test_runner/fixtures/pg_stats.py +++ b/test_runner/fixtures/pg_stats.py @@ -18,35 +18,43 @@ class PgStatTable: return f"SELECT {','.join(self.columns)} FROM {self.table} {self.additional_query}" -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def pg_stats_rw() -> List[PgStatTable]: return [ - PgStatTable("pg_stat_database", - ["tup_returned", "tup_fetched", "tup_inserted", "tup_updated", "tup_deleted"], - "WHERE datname='postgres'"), + PgStatTable( + "pg_stat_database", + ["tup_returned", "tup_fetched", "tup_inserted", "tup_updated", "tup_deleted"], + "WHERE 
datname='postgres'", + ), ] -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def pg_stats_ro() -> List[PgStatTable]: return [ - PgStatTable("pg_stat_database", ["tup_returned", "tup_fetched"], - "WHERE datname='postgres'"), + PgStatTable( + "pg_stat_database", ["tup_returned", "tup_fetched"], "WHERE datname='postgres'" + ), ] -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def pg_stats_wo() -> List[PgStatTable]: return [ - PgStatTable("pg_stat_database", ["tup_inserted", "tup_updated", "tup_deleted"], - "WHERE datname='postgres'"), + PgStatTable( + "pg_stat_database", + ["tup_inserted", "tup_updated", "tup_deleted"], + "WHERE datname='postgres'", + ), ] -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def pg_stats_wal() -> List[PgStatTable]: return [ - PgStatTable("pg_stat_wal", - ["wal_records", "wal_fpi", "wal_bytes", "wal_buffers_full", "wal_write"], - "") + PgStatTable( + "pg_stat_wal", + ["wal_records", "wal_fpi", "wal_bytes", "wal_buffers_full", "wal_write"], + "", + ) ] diff --git a/test_runner/fixtures/slow.py b/test_runner/fixtures/slow.py index c20b766a93..94199ae785 100644 --- a/test_runner/fixtures/slow.py +++ b/test_runner/fixtures/slow.py @@ -1,4 +1,5 @@ import pytest + """ This plugin allows tests to be marked as slow using pytest.mark.slow. By default slow tests are excluded. They need to be specifically requested with the --runslow flag in diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index a37d40014c..48889a8697 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -4,20 +4,19 @@ import pathlib import shutil import subprocess from pathlib import Path - from typing import Any, List, Tuple -from psycopg2.extensions import cursor from fixtures.log_helper import log +from psycopg2.extensions import cursor def get_self_dir() -> str: - """ Get the path to the directory where this script lives. """ + """Get the path to the directory where this script lives.""" return os.path.dirname(os.path.abspath(__file__)) def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: - """ Run a process and capture its output + """Run a process and capture its output Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" where "cmd" is the name of the program and NNN is an incrementing @@ -27,14 +26,14 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: Returns basepath for files with captured output. """ assert type(cmd) is list - base = os.path.basename(cmd[0]) + '_{}'.format(global_counter()) + base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) basepath = os.path.join(capture_dir, base) - stdout_filename = basepath + '.stdout' - stderr_filename = basepath + '.stderr' + stdout_filename = basepath + ".stdout" + stderr_filename = basepath + ".stderr" try: - with open(stdout_filename, 'w') as stdout_f: - with open(stderr_filename, 'w') as stderr_f: + with open(stdout_filename, "w") as stdout_f: + with open(stderr_filename, "w") as stderr_f: log.info(f'Capturing stdout to "{base}.stdout" and stderr to "{base}.stderr"') subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) finally: @@ -50,7 +49,7 @@ _global_counter = 0 def global_counter() -> int: - """ A really dumb global counter. + """A really dumb global counter. This is useful for giving output files a unique number, so if we run the same command multiple times we can keep their output separate. 
@@ -61,13 +60,13 @@ def global_counter() -> int: def lsn_to_hex(num: int) -> str: - """ Convert lsn from int to standard hex notation. """ - return "{:X}/{:X}".format(num >> 32, num & 0xffffffff) + """Convert lsn from int to standard hex notation.""" + return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF) def lsn_from_hex(lsn_hex: str) -> int: - """ Convert lsn from hex notation to int. """ - l, r = lsn_hex.split('/') + """Convert lsn from hex notation to int.""" + l, r = lsn_hex.split("/") return (int(l, 16) << 32) + int(r, 16) @@ -75,14 +74,16 @@ def print_gc_result(row): log.info("GC duration {elapsed} ms".format_map(row)) log.info( " total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_pitr {layers_needed_by_pitr}" - " needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}" - .format_map(row)) + " needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}".format_map( + row + ) + ) def etcd_path() -> Path: path_output = shutil.which("etcd") if path_output is None: - raise RuntimeError('etcd not found in PATH') + raise RuntimeError("etcd not found in PATH") else: return Path(path_output) @@ -145,7 +146,12 @@ def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: parts = f_name.split("__") key_parts = parts[0].split("-") lsn_parts = parts[1].split("-") - return int(key_parts[0], 16), int(key_parts[1], 16), int(lsn_parts[0], 16), int(lsn_parts[1], 16) + return ( + int(key_parts[0], 16), + int(key_parts[1], 16), + int(lsn_parts[0], 16), + int(lsn_parts[1], 16), + ) def get_scale_for_db(size_mb: int) -> int: diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 1d39b0830d..9cb346de47 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -1,28 +1,26 @@ import random -import time import statistics import threading +import time import timeit -import pytest from typing import List + +import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]): - neon_compare.zenbenchmark.record("branch_creation_duration_max", - max(durs), - 's', - MetricReport.LOWER_IS_BETTER) - neon_compare.zenbenchmark.record("branch_creation_duration_avg", - statistics.mean(durs), - 's', - MetricReport.LOWER_IS_BETTER) - neon_compare.zenbenchmark.record("branch_creation_duration_stdev", - statistics.stdev(durs), - 's', - MetricReport.LOWER_IS_BETTER) + neon_compare.zenbenchmark.record( + "branch_creation_duration_max", max(durs), "s", MetricReport.LOWER_IS_BETTER + ) + neon_compare.zenbenchmark.record( + "branch_creation_duration_avg", statistics.mean(durs), "s", MetricReport.LOWER_IS_BETTER + ) + neon_compare.zenbenchmark.record( + "branch_creation_duration_stdev", statistics.stdev(durs), "s", MetricReport.LOWER_IS_BETTER + ) @pytest.mark.parametrize("n_branches", [20]) @@ -37,15 +35,16 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) # Use aggressive GC and checkpoint settings, so GC and compaction happen more often during the test tenant, _ = env.neon_cli.create_tenant( - conf={ - 'gc_period': '5 s', - 'gc_horizon': f'{4 * 1024 ** 2}', - 'checkpoint_distance': f'{2 * 1024 ** 2}', - 'compaction_target_size': f'{1024 ** 2}', - 'compaction_threshold': 
'2', - # set PITR interval to be small, so we can do GC - 'pitr_interval': '5 s' - }) + conf={ + "gc_period": "5 s", + "gc_horizon": f"{4 * 1024 ** 2}", + "checkpoint_distance": f"{2 * 1024 ** 2}", + "compaction_target_size": f"{1024 ** 2}", + "compaction_threshold": "2", + # set PITR interval to be small, so we can do GC + "pitr_interval": "5 s", + } + ) def run_pgbench(branch: str): log.info(f"Start a pgbench workload on branch {branch}") @@ -53,15 +52,15 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) pg = env.postgres.create_start(branch, tenant_id=tenant) connstr = pg.connstr() - pg_bin.run_capture(['pgbench', '-i', connstr]) - pg_bin.run_capture(['pgbench', '-c10', '-T10', connstr]) + pg_bin.run_capture(["pgbench", "-i", connstr]) + pg_bin.run_capture(["pgbench", "-c10", "-T10", connstr]) pg.stop() - env.neon_cli.create_branch('b0', tenant_id=tenant) + env.neon_cli.create_branch("b0", tenant_id=tenant) threads: List[threading.Thread] = [] - threads.append(threading.Thread(target=run_pgbench, args=('b0', ), daemon=True)) + threads.append(threading.Thread(target=run_pgbench, args=("b0",), daemon=True)) threads[-1].start() branch_creation_durations = [] @@ -72,13 +71,13 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) p = random.randint(0, i) timer = timeit.default_timer() - env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(p), tenant_id=tenant) + env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p), tenant_id=tenant) dur = timeit.default_timer() - timer log.info(f"Creating branch b{i+1} took {dur}s") branch_creation_durations.append(dur) - threads.append(threading.Thread(target=run_pgbench, args=(f'b{i+1}', ), daemon=True)) + threads.append(threading.Thread(target=run_pgbench, args=(f"b{i+1}",), daemon=True)) threads[-1].start() for thread in threads: @@ -92,10 +91,10 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): env = neon_compare.env - env.neon_cli.create_branch('b0') + env.neon_cli.create_branch("b0") - pg = env.postgres.create_start('b0') - neon_compare.pg_bin.run_capture(['pgbench', '-i', '-s10', pg.connstr()]) + pg = env.postgres.create_start("b0") + neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", pg.connstr()]) branch_creation_durations = [] @@ -103,7 +102,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): # random a source branch p = random.randint(0, i) timer = timeit.default_timer() - env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(p)) + env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p)) dur = timeit.default_timer() - timer branch_creation_durations.append(dur) diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 6a5bad8757..9aaf0cbc77 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,8 +1,9 @@ from contextlib import closing -from fixtures.neon_fixtures import NeonEnv -from fixtures.log_helper import log + from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare +from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv # @@ -23,8 +24,8 @@ def test_bulk_insert(neon_with_baseline: PgCompare): 
cur.execute("create table huge (i int, j int);") # Run INSERT, recording the time and I/O it takes - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('insert'): + with env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("insert"): cur.execute("insert into huge values (generate_series(1, 5000000), 0);") env.flush() diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index fe3c3afe37..cef7ce0c6b 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -1,7 +1,7 @@ import timeit -from fixtures.benchmark_fixture import MetricReport -import pytest +import pytest +from fixtures.benchmark_fixture import MetricReport from fixtures.neon_fixtures import NeonEnvBuilder # Run bulk tenant creation test. @@ -12,7 +12,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder # 2. Average creation time per tenant -@pytest.mark.parametrize('tenants_count', [1, 5, 10]) +@pytest.mark.parametrize("tenants_count", [1, 5, 10]) def test_bulk_tenant_create( neon_env_builder: NeonEnvBuilder, tenants_count: int, @@ -27,22 +27,26 @@ def test_bulk_tenant_create( start = timeit.default_timer() tenant, _ = env.neon_cli.create_tenant() - env.neon_cli.create_timeline(f'test_bulk_tenant_create_{tenants_count}_{i}', - tenant_id=tenant) + env.neon_cli.create_timeline( + f"test_bulk_tenant_create_{tenants_count}_{i}", tenant_id=tenant + ) # FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now? - #if use_safekeepers == 'with_sa': + # if use_safekeepers == 'with_sa': # wa_factory.start_n_new(3) - pg_tenant = env.postgres.create_start(f'test_bulk_tenant_create_{tenants_count}_{i}', - tenant_id=tenant) + pg_tenant = env.postgres.create_start( + f"test_bulk_tenant_create_{tenants_count}_{i}", tenant_id=tenant + ) end = timeit.default_timer() time_slices.append(end - start) pg_tenant.stop() - zenbenchmark.record('tenant_creation_time', - sum(time_slices) / len(time_slices), - 's', - report=MetricReport.LOWER_IS_BETTER) + zenbenchmark.record( + "tenant_creation_time", + sum(time_slices) / len(time_slices), + "s", + report=MetricReport.LOWER_IS_BETTER, + ) diff --git a/test_runner/performance/test_compare_pg_stats.py b/test_runner/performance/test_compare_pg_stats.py index b9bca90231..d39ea55fbb 100644 --- a/test_runner/performance/test_compare_pg_stats.py +++ b/test_runner/performance/test_compare_pg_stats.py @@ -6,7 +6,6 @@ from typing import List import pytest from fixtures.compare_fixtures import PgCompare from fixtures.pg_stats import PgStatTable - from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix @@ -18,85 +17,96 @@ def get_seeds_matrix(default: int = 100): @pytest.mark.parametrize("seed", get_seeds_matrix()) @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix(5)) -def test_compare_pg_stats_rw_with_pgbench_default(neon_with_baseline: PgCompare, - seed: int, - scale: int, - duration: int, - pg_stats_rw: List[PgStatTable]): +def test_compare_pg_stats_rw_with_pgbench_default( + neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_rw: List[PgStatTable], +): env = neon_with_baseline # initialize pgbench - env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) env.flush() with 
env.record_pg_stats(pg_stats_rw): env.pg_bin.run_capture( - ['pgbench', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) + ["pgbench", f"-T{duration}", f"--random-seed={seed}", env.pg.connstr()] + ) env.flush() @pytest.mark.parametrize("seed", get_seeds_matrix()) @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix(5)) -def test_compare_pg_stats_wo_with_pgbench_simple_update(neon_with_baseline: PgCompare, - seed: int, - scale: int, - duration: int, - pg_stats_wo: List[PgStatTable]): +def test_compare_pg_stats_wo_with_pgbench_simple_update( + neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_wo: List[PgStatTable], +): env = neon_with_baseline # initialize pgbench - env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) env.flush() with env.record_pg_stats(pg_stats_wo): env.pg_bin.run_capture( - ['pgbench', '-N', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) + ["pgbench", "-N", f"-T{duration}", f"--random-seed={seed}", env.pg.connstr()] + ) env.flush() @pytest.mark.parametrize("seed", get_seeds_matrix()) @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix(5)) -def test_compare_pg_stats_ro_with_pgbench_select_only(neon_with_baseline: PgCompare, - seed: int, - scale: int, - duration: int, - pg_stats_ro: List[PgStatTable]): +def test_compare_pg_stats_ro_with_pgbench_select_only( + neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_ro: List[PgStatTable], +): env = neon_with_baseline # initialize pgbench - env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) env.flush() with env.record_pg_stats(pg_stats_ro): env.pg_bin.run_capture( - ['pgbench', '-S', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) + ["pgbench", "-S", f"-T{duration}", f"--random-seed={seed}", env.pg.connstr()] + ) env.flush() @pytest.mark.parametrize("seed", get_seeds_matrix()) @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix(5)) -def test_compare_pg_stats_wal_with_pgbench_default(neon_with_baseline: PgCompare, - seed: int, - scale: int, - duration: int, - pg_stats_wal: List[PgStatTable]): +def test_compare_pg_stats_wal_with_pgbench_default( + neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_wal: List[PgStatTable], +): env = neon_with_baseline # initialize pgbench - env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) env.flush() with env.record_pg_stats(pg_stats_wal): env.pg_bin.run_capture( - ['pgbench', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) + ["pgbench", f"-T{duration}", f"--random-seed={seed}", env.pg.connstr()] + ) env.flush() @pytest.mark.parametrize("n_tables", [1, 10]) @pytest.mark.parametrize("duration", get_durations_matrix(10)) -def test_compare_pg_stats_wo_with_heavy_write(neon_with_baseline: PgCompare, - n_tables: int, - duration: int, - pg_stats_wo: List[PgStatTable]): +def test_compare_pg_stats_wo_with_heavy_write( + neon_with_baseline: PgCompare, n_tables: int, duration: int, pg_stats_wo: List[PgStatTable] +): env = neon_with_baseline with env.pg.connect().cursor() as cur: for i in 
range(n_tables): @@ -112,8 +122,7 @@ def test_compare_pg_stats_wo_with_heavy_write(neon_with_baseline: PgCompare, with env.record_pg_stats(pg_stats_wo): threads = [ - threading.Thread(target=start_single_table_workload, args=(i, )) - for i in range(n_tables) + threading.Thread(target=start_single_table_workload, args=(i,)) for i in range(n_tables) ] for thread in threads: diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index ad088684d5..bf4804fc07 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -1,11 +1,12 @@ from contextlib import closing -from fixtures.neon_fixtures import NeonEnv -from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare from io import BufferedReader, RawIOBase from itertools import repeat +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv + class CopyTestData(RawIOBase): def __init__(self, rows: int): @@ -29,7 +30,7 @@ class CopyTestData(RawIOBase): # Number of bytes to read in this call l = min(len(self.linebuf) - self.ptr, len(b)) - b[:l] = self.linebuf[self.ptr:(self.ptr + l)] + b[:l] = self.linebuf[self.ptr : (self.ptr + l)] self.ptr += l return l @@ -52,19 +53,19 @@ def test_copy(neon_with_baseline: PgCompare): # Load data with COPY, recording the time and I/O it takes. # # Since there's no data in the table previously, this extends it. - with env.record_pageserver_writes('copy_extend_pageserver_writes'): - with env.record_duration('copy_extend'): - cur.copy_from(copy_test_data(1000000), 'copytest') + with env.record_pageserver_writes("copy_extend_pageserver_writes"): + with env.record_duration("copy_extend"): + cur.copy_from(copy_test_data(1000000), "copytest") env.flush() # Delete most rows, and VACUUM to make the space available for reuse. - with env.record_pageserver_writes('delete_pageserver_writes'): - with env.record_duration('delete'): + with env.record_pageserver_writes("delete_pageserver_writes"): + with env.record_duration("delete"): cur.execute("delete from copytest where i % 100 <> 0;") env.flush() - with env.record_pageserver_writes('vacuum_pageserver_writes'): - with env.record_duration('vacuum'): + with env.record_pageserver_writes("vacuum_pageserver_writes"): + with env.record_duration("vacuum"): cur.execute("vacuum copytest") env.flush() @@ -72,9 +73,9 @@ def test_copy(neon_with_baseline: PgCompare): # by the VACUUM. # # This will also clear all the VM bits. 
- with env.record_pageserver_writes('copy_reuse_pageserver_writes'): - with env.record_duration('copy_reuse'): - cur.copy_from(copy_test_data(1000000), 'copytest') + with env.record_pageserver_writes("copy_reuse_pageserver_writes"): + with env.record_duration("copy_reuse"): + cur.copy_from(copy_test_data(1000000), "copytest") env.flush() env.report_peak_memory_use() diff --git a/test_runner/performance/test_dup_key.py b/test_runner/performance/test_dup_key.py index ee867a9845..60fe3014ba 100644 --- a/test_runner/performance/test_dup_key.py +++ b/test_runner/performance/test_dup_key.py @@ -1,5 +1,6 @@ -import pytest from contextlib import closing + +import pytest from fixtures.compare_fixtures import PgCompare from pytest_lazyfixture import lazy_fixture # type: ignore @@ -11,22 +12,24 @@ from pytest_lazyfixture import lazy_fixture # type: ignore pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), - ]) + ], +) def test_dup_key(env: PgCompare): # Update the same page many times, then measure read performance with closing(env.pg.connect()) as conn: with conn.cursor() as cur: - cur.execute('drop table if exists t, f;') + cur.execute("drop table if exists t, f;") cur.execute("SET synchronous_commit=off") cur.execute("SET statement_timeout=0") # Write many updates to the same row - with env.record_duration('write'): + with env.record_duration("write"): cur.execute("create table t (i integer, filler text);") - cur.execute('insert into t values (0);') - cur.execute(""" + cur.execute("insert into t values (0);") + cur.execute( + """ do $$ begin for ivar in 1..5000000 loop @@ -38,13 +41,14 @@ begin end loop; end; $$; -""") +""" + ) # Write 3-4 MB to evict t from compute cache - cur.execute('create table f (i integer);') - cur.execute(f'insert into f values (generate_series(1,100000));') + cur.execute("create table f (i integer);") + cur.execute(f"insert into f values (generate_series(1,100000));") # Read - with env.record_duration('read'): - cur.execute('select * from t;') + with env.record_duration("read"): + cur.execute("select * from t;") cur.fetchall() diff --git a/test_runner/performance/test_gist_build.py b/test_runner/performance/test_gist_build.py index 839eb3f57d..d8fa97fbbf 100644 --- a/test_runner/performance/test_gist_build.py +++ b/test_runner/performance/test_gist_build.py @@ -1,9 +1,10 @@ import os from contextlib import closing + from fixtures.benchmark_fixture import MetricReport -from fixtures.neon_fixtures import NeonEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare +from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv # @@ -24,8 +25,8 @@ def test_gist_buffering_build(neon_with_baseline: PgCompare): ) # Build the index. 
- with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('build'): + with env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("build"): cur.execute( "create index gist_pointidx2 on gist_point_tbl using gist(p) with (buffering = on)" ) diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py index d3da0310ce..8e8ab9849a 100644 --- a/test_runner/performance/test_hot_page.py +++ b/test_runner/performance/test_hot_page.py @@ -1,5 +1,6 @@ -import pytest from contextlib import closing + +import pytest from fixtures.compare_fixtures import PgCompare from pytest_lazyfixture import lazy_fixture # type: ignore @@ -11,27 +12,28 @@ from pytest_lazyfixture import lazy_fixture # type: ignore pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), - ]) + ], +) def test_hot_page(env: PgCompare): # Update the same page many times, then measure read performance num_writes = 1000000 with closing(env.pg.connect()) as conn: with conn.cursor() as cur: - cur.execute('drop table if exists t, f;') + cur.execute("drop table if exists t, f;") # Write many updates to the same row - with env.record_duration('write'): - cur.execute('create table t (i integer);') - cur.execute('insert into t values (0);') + with env.record_duration("write"): + cur.execute("create table t (i integer);") + cur.execute("insert into t values (0);") for i in range(num_writes): - cur.execute(f'update t set i = {i};') + cur.execute(f"update t set i = {i};") # Write 3-4 MB to evict t from compute cache - cur.execute('create table f (i integer);') - cur.execute(f'insert into f values (generate_series(1,100000));') + cur.execute("create table f (i integer);") + cur.execute(f"insert into f values (generate_series(1,100000));") # Read - with env.record_duration('read'): - cur.execute('select * from t;') + with env.record_duration("read"): + cur.execute("select * from t;") cur.fetchall() diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py index 997c772f88..2f519e152c 100644 --- a/test_runner/performance/test_hot_table.py +++ b/test_runner/performance/test_hot_table.py @@ -1,5 +1,6 @@ -import pytest from contextlib import closing + +import pytest from fixtures.compare_fixtures import PgCompare from pytest_lazyfixture import lazy_fixture # type: ignore @@ -11,7 +12,8 @@ from pytest_lazyfixture import lazy_fixture # type: ignore pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), - ]) + ], +) def test_hot_table(env: PgCompare): # Update a small table many times, then measure read performance num_rows = 100000 # Slightly larger than shared buffers size TODO validate @@ -20,17 +22,17 @@ def test_hot_table(env: PgCompare): with closing(env.pg.connect()) as conn: with conn.cursor() as cur: - cur.execute('drop table if exists t;') + cur.execute("drop table if exists t;") # Write many updates to a small table - with env.record_duration('write'): - cur.execute('create table t (i integer primary key);') - cur.execute(f'insert into t values (generate_series(1,{num_rows}));') + with env.record_duration("write"): + 
cur.execute("create table t (i integer primary key);") + cur.execute(f"insert into t values (generate_series(1,{num_rows}));") for i in range(num_writes): - cur.execute(f'update t set i = {i + num_rows} WHERE i = {i};') + cur.execute(f"update t set i = {i + num_rows} WHERE i = {i};") # Read the table - with env.record_duration('read'): + with env.record_duration("read"): for i in range(num_reads): - cur.execute('select * from t;') + cur.execute("select * from t;") cur.fetchall() diff --git a/test_runner/performance/test_parallel_copy_to.py b/test_runner/performance/test_parallel_copy_to.py index d4e74ce195..c1883dec7b 100644 --- a/test_runner/performance/test_parallel_copy_to.py +++ b/test_runner/performance/test_parallel_copy_to.py @@ -1,10 +1,11 @@ -from io import BytesIO import asyncio +from io import BytesIO + import asyncpg -from fixtures.neon_fixtures import NeonEnv, Postgres, PgProtocol -from fixtures.log_helper import log from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare +from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, PgProtocol, Postgres async def repeat_bytes(buf, repetitions: int): @@ -16,7 +17,8 @@ async def copy_test_data_to_table(pg: PgProtocol, worker_id: int, table_name: st buf = BytesIO() for i in range(1000): buf.write( - f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode()) + f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode() + ) buf.seek(0) copy_input = repeat_bytes(buf.read(), 5000) @@ -28,7 +30,7 @@ async def copy_test_data_to_table(pg: PgProtocol, worker_id: int, table_name: st async def parallel_load_different_tables(pg: PgProtocol, n_parallel: int): workers = [] for worker_id in range(n_parallel): - worker = copy_test_data_to_table(pg, worker_id, f'copytest_{worker_id}') + worker = copy_test_data_to_table(pg, worker_id, f"copytest_{worker_id}") workers.append(asyncio.create_task(worker)) # await all workers @@ -43,10 +45,10 @@ def test_parallel_copy_different_tables(neon_with_baseline: PgCompare, n_paralle cur = conn.cursor() for worker_id in range(n_parallel): - cur.execute(f'CREATE TABLE copytest_{worker_id} (i int, t text)') + cur.execute(f"CREATE TABLE copytest_{worker_id} (i int, t text)") - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('load'): + with env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("load"): asyncio.run(parallel_load_different_tables(env.pg, n_parallel)) env.flush() @@ -57,7 +59,7 @@ def test_parallel_copy_different_tables(neon_with_baseline: PgCompare, n_paralle async def parallel_load_same_table(pg: PgProtocol, n_parallel: int): workers = [] for worker_id in range(n_parallel): - worker = copy_test_data_to_table(pg, worker_id, f'copytest') + worker = copy_test_data_to_table(pg, worker_id, f"copytest") workers.append(asyncio.create_task(worker)) # await all workers @@ -70,10 +72,10 @@ def test_parallel_copy_same_table(neon_with_baseline: PgCompare, n_parallel=5): conn = env.pg.connect() cur = conn.cursor() - cur.execute(f'CREATE TABLE copytest (i int, t text)') + cur.execute(f"CREATE TABLE copytest (i int, t text)") - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('load'): + with env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("load"): 
asyncio.run(parallel_load_same_table(env.pg, n_parallel)) env.flush() diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 89c510e76e..934642d095 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -30,7 +30,7 @@ def init_pgbench(env: PgCompare, cmdline): # duration is actually a metric and uses float instead of int for timestamp start_timestamp = utc_now_timestamp() t0 = timeit.default_timer() - with env.record_pageserver_writes('init.pageserver_writes'): + with env.record_pageserver_writes("init.pageserver_writes"): out = env.pg_bin.run_capture(cmdline) env.flush() @@ -49,10 +49,12 @@ def init_pgbench(env: PgCompare, cmdline): def run_pgbench(env: PgCompare, prefix: str, cmdline): - with env.record_pageserver_writes(f'{prefix}.pageserver_writes'): + with env.record_pageserver_writes(f"{prefix}.pageserver_writes"): run_start_timestamp = utc_now_timestamp() t0 = timeit.default_timer() - out = env.pg_bin.run_capture(cmdline, ) + out = env.pg_bin.run_capture( + cmdline, + ) run_duration = timeit.default_timer() - t0 run_end_timestamp = utc_now_timestamp() env.flush() @@ -78,40 +80,45 @@ def run_pgbench(env: PgCompare, prefix: str, cmdline): # # Currently, the # of connections is hardcoded at 4 def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: PgBenchLoadType): - env.zenbenchmark.record("scale", scale, '', MetricReport.TEST_PARAM) + env.zenbenchmark.record("scale", scale, "", MetricReport.TEST_PARAM) if workload_type == PgBenchLoadType.INIT: # Run initialize init_pgbench( - env, ['pgbench', f'-s{scale}', '-i', env.pg.connstr(options='-cstatement_timeout=1h')]) + env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options="-cstatement_timeout=1h")] + ) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: # Run simple-update workload - run_pgbench(env, - "simple-update", - [ - 'pgbench', - '-N', - '-c4', - f'-T{duration}', - '-P2', - '--progress-timestamp', - env.pg.connstr(), - ]) + run_pgbench( + env, + "simple-update", + [ + "pgbench", + "-N", + "-c4", + f"-T{duration}", + "-P2", + "--progress-timestamp", + env.pg.connstr(), + ], + ) if workload_type == PgBenchLoadType.SELECT_ONLY: # Run SELECT workload - run_pgbench(env, - "select-only", - [ - 'pgbench', - '-S', - '-c4', - f'-T{duration}', - '-P2', - '--progress-timestamp', - env.pg.connstr(), - ]) + run_pgbench( + env, + "select-only", + [ + "pgbench", + "-S", + "-c4", + f"-T{duration}", + "-P2", + "--progress-timestamp", + env.pg.connstr(), + ], + ) env.report_size() @@ -121,12 +128,12 @@ def get_durations_matrix(default: int = 45) -> List[int]: rv = [] for d in durations.split(","): d = d.strip().lower() - if d.endswith('h'): - duration = int(d.removesuffix('h')) * 60 * 60 - elif d.endswith('m'): - duration = int(d.removesuffix('m')) * 60 + if d.endswith("h"): + duration = int(d.removesuffix("h")) * 60 * 60 + elif d.endswith("m"): + duration = int(d.removesuffix("m")) * 60 else: - duration = int(d.removesuffix('s')) + duration = int(d.removesuffix("s")) rv.append(duration) return rv @@ -137,10 +144,10 @@ def get_scales_matrix(default: int = 10) -> List[int]: rv = [] for s in scales.split(","): s = s.strip().lower() - if s.endswith('mb'): - scale = get_scale_for_db(int(s.removesuffix('mb'))) - elif s.endswith('gb'): - scale = get_scale_for_db(int(s.removesuffix('gb')) * 1024) + if s.endswith("mb"): + scale = get_scale_for_db(int(s.removesuffix("mb"))) + elif s.endswith("gb"): + scale = 
get_scale_for_db(int(s.removesuffix("gb")) * 1024) else: scale = int(s) rv.append(scale) @@ -167,9 +174,9 @@ def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int): @pytest.mark.parametrize("duration", get_durations_matrix()) def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, duration: int): neon_env_builder.num_safekeepers = 1 - neon_env_builder.pageserver_config_override = ''' + neon_env_builder.pageserver_config_override = """ profiling="page_requests" -''' +""" if not profiling_supported(): pytest.skip("pageserver was built without 'profiling' feature") diff --git a/test_runner/performance/test_random_writes.py b/test_runner/performance/test_random_writes.py index 8931234c51..8ed684af16 100644 --- a/test_runner/performance/test_random_writes.py +++ b/test_runner/performance/test_random_writes.py @@ -1,14 +1,13 @@ import os -from contextlib import closing -from fixtures.benchmark_fixture import MetricReport -from fixtures.neon_fixtures import NeonEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare -from fixtures.log_helper import log - -import psycopg2.extras import random import time +from contextlib import closing +import psycopg2.extras +from fixtures.benchmark_fixture import MetricReport +from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv from fixtures.utils import query_scalar @@ -43,13 +42,15 @@ def test_random_writes(neon_with_baseline: PgCompare): with closing(env.pg.connect()) as conn: with conn.cursor() as cur: # Create the test table - with env.record_duration('init'): - cur.execute(""" + with env.record_duration("init"): + cur.execute( + """ CREATE TABLE Big( pk integer primary key, count integer default 0 ); - """) + """ + ) # Insert n_rows in batches to avoid query timeouts rows_inserted = 0 @@ -62,7 +63,7 @@ def test_random_writes(neon_with_baseline: PgCompare): # Get table size (can't be predicted because padding and alignment) table_size = query_scalar(cur, "SELECT pg_relation_size('Big')") - env.zenbenchmark.record("table_size", table_size, 'bytes', MetricReport.TEST_PARAM) + env.zenbenchmark.record("table_size", table_size, "bytes", MetricReport.TEST_PARAM) # Decide how much to write, based on knowledge of pageserver implementation. # Avoiding segment collisions maximizes (neon_runtime / vanilla_runtime). @@ -72,13 +73,15 @@ def test_random_writes(neon_with_baseline: PgCompare): # The closer this is to 250 MB, the more realistic the test is. 
effective_checkpoint_distance = table_size * n_writes // n_rows - env.zenbenchmark.record("effective_checkpoint_distance", - effective_checkpoint_distance, - 'bytes', - MetricReport.TEST_PARAM) + env.zenbenchmark.record( + "effective_checkpoint_distance", + effective_checkpoint_distance, + "bytes", + MetricReport.TEST_PARAM, + ) # Update random keys - with env.record_duration('run'): + with env.record_duration("run"): for it in range(n_iterations): for i in range(n_writes): key = random.randint(1, n_rows) diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py index 8d7ad46c1a..6094ed38e5 100644 --- a/test_runner/performance/test_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -2,15 +2,16 @@ # from contextlib import closing from dataclasses import dataclass -from fixtures.neon_fixtures import NeonEnv -from fixtures.log_helper import log + +import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.compare_fixtures import PgCompare -import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv @pytest.mark.parametrize( - 'rows,iters,workers', + "rows,iters,workers", [ # The test table is large enough (3-4 MB) that it doesn't fit in the compute node # cache, so the seqscans go to the page server. But small enough that it fits @@ -18,31 +19,34 @@ import pytest pytest.param(100000, 100, 0), # Also test with a larger table, with and without parallelism pytest.param(10000000, 1, 0), - pytest.param(10000000, 1, 4) - ]) + pytest.param(10000000, 1, 4), + ], +) def test_seqscans(neon_with_baseline: PgCompare, rows: int, iters: int, workers: int): env = neon_with_baseline with closing(env.pg.connect()) as conn: with conn.cursor() as cur: - cur.execute('create table t (i integer);') - cur.execute(f'insert into t values (generate_series(1,{rows}));') + cur.execute("create table t (i integer);") + cur.execute(f"insert into t values (generate_series(1,{rows}));") # Verify that the table is larger than shared_buffers - cur.execute(''' + cur.execute( + """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_ize from pg_settings where name = 'shared_buffers' - ''') + """ + ) row = cur.fetchone() assert row is not None shared_buffers = row[0] table_size = row[1] log.info(f"shared_buffers is {shared_buffers}, table size {table_size}") assert int(shared_buffers) < int(table_size) - env.zenbenchmark.record("table_size", table_size, 'bytes', MetricReport.TEST_PARAM) + env.zenbenchmark.record("table_size", table_size, "bytes", MetricReport.TEST_PARAM) cur.execute(f"set max_parallel_workers_per_gather = {workers}") - with env.record_duration('run'): + with env.record_duration("run"): for i in range(iters): - cur.execute('select count(*) from t;') + cur.execute("select count(*) from t;") diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index 1cfd128e9b..e91b180154 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -1,7 +1,8 @@ -import pytest from contextlib import closing -from fixtures.neon_fixtures import NeonEnvBuilder + +import pytest from fixtures.benchmark_fixture import NeonBenchmarker +from fixtures.neon_fixtures import NeonEnvBuilder # This test sometimes runs for longer than the global 5 minute timeout. 
@@ -11,15 +12,15 @@ def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker env = neon_env_builder.init_start() # Start - env.neon_cli.create_branch('test_startup') + env.neon_cli.create_branch("test_startup") with zenbenchmark.record_duration("startup_time"): - pg = env.postgres.create_start('test_startup') + pg = env.postgres.create_start("test_startup") pg.safe_psql("select 1;") # Restart pg.stop_and_destroy() with zenbenchmark.record_duration("restart_time"): - pg.create_start('test_startup') + pg.create_start("test_startup") pg.safe_psql("select 1;") # Fill up @@ -28,8 +29,8 @@ def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker with closing(pg.connect()) as conn: with conn.cursor() as cur: for i in range(num_tables): - cur.execute(f'create table t_{i} (i integer);') - cur.execute(f'insert into t_{i} values (generate_series(1,{num_rows}));') + cur.execute(f"create table t_{i} (i integer);") + cur.execute(f"insert into t_{i} values (generate_series(1,{num_rows}));") # Read with zenbenchmark.record_duration("read_time"): @@ -42,7 +43,7 @@ def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker # Restart pg.stop_and_destroy() with zenbenchmark.record_duration("restart_with_data"): - pg.create_start('test_startup') + pg.create_start("test_startup") pg.safe_psql("select 1;") # Read diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index bbb5ddecab..03d5ba208a 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -10,8 +10,7 @@ from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin from fixtures.utils import lsn_from_hex - -from performance.test_perf_pgbench import (get_durations_matrix, get_scales_matrix) +from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix @pytest.fixture(params=["vanilla", "neon_off", "neon_on"]) @@ -30,7 +29,9 @@ def pg_compare(request) -> PgCompare: return fixture else: - assert len(x) == 2, f"request param ({request.param}) should have a format of \ + assert ( + len(x) == 2 + ), f"request param ({request.param}) should have a format of \ `neon_{{safekeepers_enable_fsync}}`" # `NeonCompare` interface @@ -70,8 +71,7 @@ def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_it with env.record_duration("run_duration"): threads = [ - threading.Thread(target=start_single_table_workload, args=(i, )) - for i in range(n_tables) + threading.Thread(target=start_single_table_workload, args=(i,)) for i in range(n_tables) ] for thread in threads: @@ -95,12 +95,14 @@ def test_heavy_write_workload(pg_compare: PgCompare, n_tables: int, scale: int, ) cur.execute(f"INSERT INTO t{i} (key) VALUES (0)") - workload_thread = threading.Thread(target=start_heavy_write_workload, - args=(env, n_tables, scale, num_iters)) + workload_thread = threading.Thread( + target=start_heavy_write_workload, args=(env, n_tables, scale, num_iters) + ) workload_thread.start() - record_thread = threading.Thread(target=record_lsn_write_lag, - args=(env, lambda: workload_thread.is_alive())) + record_thread = threading.Thread( + target=record_lsn_write_lag, args=(env, lambda: workload_thread.is_alive()) + ) record_thread.start() record_read_latency(env, lambda: workload_thread.is_alive(), "SELECT * from t0 where key = 0") @@ 
-110,14 +112,16 @@ def test_heavy_write_workload(pg_compare: PgCompare, n_tables: int, scale: int, def start_pgbench_simple_update_workload(env: PgCompare, duration: int): with env.record_duration("run_duration"): - env.pg_bin.run_capture([ - 'pgbench', - '-j10', - '-c10', - '-N', - f'-T{duration}', - env.pg.connstr(options="-csynchronous_commit=off") - ]) + env.pg_bin.run_capture( + [ + "pgbench", + "-j10", + "-c10", + "-N", + f"-T{duration}", + env.pg.connstr(options="-csynchronous_commit=off"), + ] + ) env.flush() @@ -128,20 +132,22 @@ def test_pgbench_simple_update_workload(pg_compare: PgCompare, scale: int, durat env = pg_compare # initialize pgbench tables - env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) env.flush() - workload_thread = threading.Thread(target=start_pgbench_simple_update_workload, - args=(env, duration)) + workload_thread = threading.Thread( + target=start_pgbench_simple_update_workload, args=(env, duration) + ) workload_thread.start() - record_thread = threading.Thread(target=record_lsn_write_lag, - args=(env, lambda: workload_thread.is_alive())) + record_thread = threading.Thread( + target=record_lsn_write_lag, args=(env, lambda: workload_thread.is_alive()) + ) record_thread.start() - record_read_latency(env, - lambda: workload_thread.is_alive(), - "SELECT * from pgbench_accounts where aid = 1") + record_read_latency( + env, lambda: workload_thread.is_alive(), "SELECT * from pgbench_accounts where aid = 1" + ) workload_thread.join() record_thread.join() @@ -150,13 +156,15 @@ def start_pgbench_intensive_initialization(env: PgCompare, scale: int, done_even with env.record_duration("run_duration"): # Needs to increase the statement timeout (default: 120s) because the # initialization step can be slow with a large scale. 
- env.pg_bin.run_capture([ - 'pgbench', - f'-s{scale}', - '-i', - '-Idtg', - env.pg.connstr(options='-cstatement_timeout=600s') - ]) + env.pg_bin.run_capture( + [ + "pgbench", + f"-s{scale}", + "-i", + "-Idtg", + env.pg.connstr(options="-cstatement_timeout=600s"), + ] + ) done_event.set() @@ -170,12 +178,14 @@ def test_pgbench_intensive_init_workload(pg_compare: PgCompare, scale: int): workload_done_event = threading.Event() - workload_thread = threading.Thread(target=start_pgbench_intensive_initialization, - args=(env, scale, workload_done_event)) + workload_thread = threading.Thread( + target=start_pgbench_intensive_initialization, args=(env, scale, workload_done_event) + ) workload_thread.start() - record_thread = threading.Thread(target=record_lsn_write_lag, - args=(env, lambda: not workload_done_event.is_set())) + record_thread = threading.Thread( + target=record_lsn_write_lag, args=(env, lambda: not workload_done_event.is_set()) + ) record_thread.start() record_read_latency(env, lambda: not workload_done_event.is_set(), "SELECT count(*) from foo") @@ -195,13 +205,15 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte cur.execute("CREATE EXTENSION neon") while run_cond(): - cur.execute(''' + cur.execute( + """ select pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn), pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn)), pg_current_wal_flush_lsn(), received_lsn from backpressure_lsns(); - ''') + """ + ) res = cur.fetchone() lsn_write_lags.append(res[0]) @@ -220,24 +232,29 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte time.sleep(pool_interval) - env.zenbenchmark.record("lsn_write_lag_max", - float(max(lsn_write_lags) / (1024**2)), - "MB", - MetricReport.LOWER_IS_BETTER) - env.zenbenchmark.record("lsn_write_lag_avg", - float(statistics.mean(lsn_write_lags) / (1024**2)), - "MB", - MetricReport.LOWER_IS_BETTER) - env.zenbenchmark.record("lsn_write_lag_stdev", - float(statistics.stdev(lsn_write_lags) / (1024**2)), - "MB", - MetricReport.LOWER_IS_BETTER) + env.zenbenchmark.record( + "lsn_write_lag_max", + float(max(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER, + ) + env.zenbenchmark.record( + "lsn_write_lag_avg", + float(statistics.mean(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER, + ) + env.zenbenchmark.record( + "lsn_write_lag_stdev", + float(statistics.stdev(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER, + ) -def record_read_latency(env: PgCompare, - run_cond: Callable[[], bool], - read_query: str, - read_interval: float = 1.0): +def record_read_latency( + env: PgCompare, run_cond: Callable[[], bool], read_query: str, read_interval: float = 1.0 +): read_latencies = [] with env.pg.connect().cursor() as cur: @@ -256,15 +273,12 @@ def record_read_latency(env: PgCompare, time.sleep(read_interval) - env.zenbenchmark.record("read_latency_max", - max(read_latencies), - 's', - MetricReport.LOWER_IS_BETTER) - env.zenbenchmark.record("read_latency_avg", - statistics.mean(read_latencies), - 's', - MetricReport.LOWER_IS_BETTER) - env.zenbenchmark.record("read_latency_stdev", - statistics.stdev(read_latencies), - 's', - MetricReport.LOWER_IS_BETTER) + env.zenbenchmark.record( + "read_latency_max", max(read_latencies), "s", MetricReport.LOWER_IS_BETTER + ) + env.zenbenchmark.record( + "read_latency_avg", statistics.mean(read_latencies), "s", MetricReport.LOWER_IS_BETTER + ) + env.zenbenchmark.record( + "read_latency_stdev", 
statistics.stdev(read_latencies), "s", MetricReport.LOWER_IS_BETTER + ) diff --git a/test_runner/performance/test_write_amplification.py b/test_runner/performance/test_write_amplification.py index 1d729fd78f..7aab469387 100644 --- a/test_runner/performance/test_write_amplification.py +++ b/test_runner/performance/test_write_amplification.py @@ -12,10 +12,11 @@ # Amplification problem at its finest. import os from contextlib import closing + from fixtures.benchmark_fixture import MetricReport -from fixtures.neon_fixtures import NeonEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare +from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv def test_write_amplification(neon_with_baseline: PgCompare): @@ -23,18 +24,20 @@ def test_write_amplification(neon_with_baseline: PgCompare): with closing(env.pg.connect()) as conn: with conn.cursor() as cur: - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('run'): + with env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("run"): # NOTE: Because each iteration updates every table already created, # the runtime and write amplification is O(n^2), where n is the # number of iterations. for i in range(25): - cur.execute(f''' + cur.execute( + f""" CREATE TABLE tbl{i} AS SELECT g as i, 'long string to consume some space' || g as t FROM generate_series(1, 100000) g - ''') + """ + ) cur.execute(f"create index on tbl{i} (i);") for j in range(1, i): cur.execute(f"delete from tbl{j} where i = {i}") diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py index a117616358..f91a2adf7d 100644 --- a/test_runner/pg_clients/test_pg_clients.py +++ b/test_runner/pg_clients/test_pg_clients.py @@ -18,10 +18,12 @@ from fixtures.utils import subprocess_capture "python/asyncpg", pytest.param( "python/pg8000", # See https://github.com/neondatabase/neon/pull/2008#discussion_r912264281 - marks=pytest.mark.xfail(reason="Handles SSL in incompatible with Neon way")), + marks=pytest.mark.xfail(reason="Handles SSL in incompatible with Neon way"), + ), pytest.param( "swift/PostgresClientKit", # See https://github.com/neondatabase/neon/pull/2008#discussion_r911896592 - marks=pytest.mark.xfail(reason="Neither SNI nor parameters is supported")), + marks=pytest.mark.xfail(reason="Neither SNI nor parameters is supported"), + ), "typescript/postgresql-client", ], ) @@ -31,12 +33,14 @@ def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: st env_file = None with NamedTemporaryFile(mode="w", delete=False) as f: env_file = f.name - f.write(f""" + f.write( + f""" NEON_HOST={conn_options["host"]} NEON_DATABASE={conn_options["dbname"]} NEON_USER={conn_options["user"]} NEON_PASSWORD={conn_options["password"]} - """) + """ + ) image_tag = client.lower() docker_bin = shutil.which("docker") diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py index 3960546689..0281f4f48b 100644 --- a/test_runner/test_broken.py +++ b/test_runner/test_broken.py @@ -1,8 +1,9 @@ -import pytest import os -from fixtures.neon_fixtures import NeonEnv +import pytest from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv + """ Use this test to see what happens when tests fail. @@ -13,8 +14,9 @@ Set the environment variable RUN_BROKEN to see this test run (and fail, and hopefully not leave any server processes behind). 
""" -run_broken = pytest.mark.skipif(os.environ.get('RUN_BROKEN') is None, - reason="only used for testing the fixtures") +run_broken = pytest.mark.skipif( + os.environ.get("RUN_BROKEN") is None, reason="only used for testing the fixtures" +) @run_broken @@ -23,7 +25,7 @@ def test_broken(neon_simple_env: NeonEnv, pg_bin): env.neon_cli.create_branch("test_broken", "empty") env.postgres.create_start("test_broken") - log.info('postgres is running') + log.info("postgres is running") - log.info('THIS NEXT COMMAND WILL FAIL:') - pg_bin.run('pgbench -i_am_a_broken_test'.split()) + log.info("THIS NEXT COMMAND WILL FAIL:") + pg_bin.run("pgbench -i_am_a_broken_test".split()) From ae3227509c36ae4e6529fdc397933c8b2372c47a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 18 Aug 2022 13:42:06 +0100 Subject: [PATCH 0671/1022] test_runner: revive flake8 --- .github/workflows/codestyle.yml | 3 +++ docs/sourcetree.md | 3 ++- poetry.lock | 40 ++++++++++++++++----------------- pre-commit.py | 11 +++++++++ pyproject.toml | 2 +- setup.cfg | 8 +++++++ 6 files changed, 45 insertions(+), 22 deletions(-) create mode 100644 setup.cfg diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index bd0f368499..029beba351 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -134,5 +134,8 @@ jobs: - name: Run black to ensure code format run: poetry run black --diff --check . + - name: Run flake8 to ensure code format + run: poetry run flake8 . + - name: Run mypy to check types run: poetry run mypy . diff --git a/docs/sourcetree.md b/docs/sourcetree.md index f189134865..88f4b0e559 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -118,6 +118,7 @@ Run the following commands in the repository's root (next to `pyproject.toml`): ```bash poetry run isort . # Imports are reformatted poetry run black . # All code is reformatted +poetry run flake8 . # Python linter poetry run mypy . # Ensure there are no typing errors ``` @@ -126,7 +127,7 @@ Otherwise it will not find its configuration. Also consider: -* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any. +* Running `pycodestyle` (or a linter of your choice) and fixing possible defects, if any. * Adding more type hints to your code to avoid `Any`. 
### Changing dependencies diff --git a/poetry.lock b/poetry.lock index cd24641a4f..e1f2e576eb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -653,16 +653,16 @@ testing = ["pre-commit"] [[package]] name = "flake8" -version = "3.9.2" +version = "5.0.4" description = "the modular source code checker: pep8 pyflakes and co" category = "dev" optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +python-versions = ">=3.6.1" [package.dependencies] -mccabe = ">=0.6.0,<0.7.0" -pycodestyle = ">=2.7.0,<2.8.0" -pyflakes = ">=2.3.0,<2.4.0" +mccabe = ">=0.7.0,<0.8.0" +pycodestyle = ">=2.9.0,<2.10.0" +pyflakes = ">=2.5.0,<2.6.0" [[package]] name = "flask" @@ -870,11 +870,11 @@ python-versions = ">=3.7" [[package]] name = "mccabe" -version = "0.6.1" +version = "0.7.0" description = "McCabe checker, plugin for flake8" category = "dev" optional = false -python-versions = "*" +python-versions = ">=3.6" [[package]] name = "moto" @@ -1107,11 +1107,11 @@ python-versions = "*" [[package]] name = "pycodestyle" -version = "2.7.0" +version = "2.9.1" description = "Python style guide checker" category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.6" [[package]] name = "pycparser" @@ -1123,11 +1123,11 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [[package]] name = "pyflakes" -version = "2.3.1" +version = "2.5.0" description = "passive checker of Python programs" category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.6" [[package]] name = "pyjwt" @@ -1537,7 +1537,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "497b963e7a2f80a751ccd201e950cf533caddb6c7c96163c94cea69874840843" +content-hash = "2112382a6723ed3b77d242db926c7445fa809fafcf11da127b5292565d2ba798" [metadata.files] aiopg = [ @@ -1759,8 +1759,8 @@ execnet = [ {file = "execnet-1.9.0.tar.gz", hash = "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5"}, ] flake8 = [ - {file = "flake8-3.9.2-py2.py3-none-any.whl", hash = "sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"}, - {file = "flake8-3.9.2.tar.gz", hash = "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b"}, + {file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"}, + {file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"}, ] flask = [ {file = "Flask-2.1.3-py3-none-any.whl", hash = "sha256:9013281a7402ad527f8fd56375164f3aa021ecfaff89bfe3825346c24f87e04c"}, @@ -1872,8 +1872,8 @@ markupsafe = [ {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"}, ] mccabe = [ - {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, - {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] moto = [ {file = "moto-3.1.18-py3-none-any.whl", hash = 
"sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"}, @@ -2026,16 +2026,16 @@ pyasn1 = [ {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] pycodestyle = [ - {file = "pycodestyle-2.7.0-py2.py3-none-any.whl", hash = "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068"}, - {file = "pycodestyle-2.7.0.tar.gz", hash = "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"}, + {file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"}, + {file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"}, ] pycparser = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] pyflakes = [ - {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, - {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, + {file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"}, + {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"}, ] pyjwt = [ {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, diff --git a/pre-commit.py b/pre-commit.py index 45f140d43a..560df6cd0c 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -50,6 +50,10 @@ def isort(fix_inplace: bool) -> str: return cmd +def flake8() -> str: + return "poetry run flake8" + + def mypy() -> str: return "poetry run mypy" @@ -122,6 +126,13 @@ if __name__ == "__main__": changed_files=files, no_color=args.no_color, ) + check( + name="flake8", + suffix=".py", + cmd=flake8(), + changed_files=files, + no_color=args.no_color, + ) check( name="mypy", suffix=".py", diff --git a/pyproject.toml b/pyproject.toml index 4f8a49a024..d648d1050a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ pytest-order = "^1.0.1" allure-pytest = "^2.9.45" [tool.poetry.dev-dependencies] -flake8 = "^3.9.2" +flake8 = "^5.0.4" mypy = "==0.971" black = "^22.6.0" isort = "^5.10.1" diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000..a067ee731d --- /dev/null +++ b/setup.cfg @@ -0,0 +1,8 @@ +[flake8] +# Move config to pyproject.toml as soon as flake8 supports it +# https://github.com/PyCQA/flake8/issues/234 +extend-ignore = + E203, # Whitespace before ':' -- conflicts with black + E266, # Too many leading '#' for block comment -- we use it for formatting sometimes + E501 # Line too long -- black sorts it out +extend-exclude = vendor/ From 39a3bcac360220b1e406f3616a9d1570bb7bb9b3 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 18 Aug 2022 20:41:13 +0100 Subject: [PATCH 0672/1022] test_runner: fix flake8 warnings --- scripts/export_import_between_pageservers.py | 22 +++++------ .../batch_others/test_basebackup_error.py | 4 +- .../batch_others/test_branch_and_gc.py | 4 +- .../batch_others/test_broken_timeline.py | 6 +-- .../batch_others/test_clog_truncate.py | 3 +- test_runner/batch_others/test_createdropdb.py | 5 +-- test_runner/batch_others/test_fsm_truncate.py | 4 +- 
.../batch_others/test_gc_aggressive.py | 2 +- test_runner/batch_others/test_lsn_mapping.py | 10 +---- .../batch_others/test_pageserver_api.py | 20 +++++----- .../batch_others/test_parallel_copy.py | 4 +- test_runner/batch_others/test_recovery.py | 5 --- .../batch_others/test_remote_storage.py | 2 +- test_runner/batch_others/test_tenant_conf.py | 5 +-- .../batch_others/test_tenant_relocation.py | 5 +-- test_runner/batch_others/test_tenant_tasks.py | 5 +-- .../test_tenants_with_remote_storage.py | 11 +++--- .../batch_others/test_timeline_size.py | 4 +- test_runner/batch_others/test_wal_acceptor.py | 6 +-- .../batch_pg_regress/test_neon_regress.py | 2 +- test_runner/fixtures/benchmark_fixture.py | 8 ++++ test_runner/fixtures/metrics.py | 2 - test_runner/fixtures/neon_fixtures.py | 37 +++++++++---------- test_runner/fixtures/utils.py | 2 +- test_runner/performance/test_bulk_insert.py | 5 +-- test_runner/performance/test_copy.py | 8 +--- test_runner/performance/test_dup_key.py | 2 +- test_runner/performance/test_gist_build.py | 6 +-- test_runner/performance/test_hot_page.py | 2 +- .../performance/test_parallel_copy_to.py | 11 ++---- test_runner/performance/test_random_writes.py | 7 +--- test_runner/performance/test_seqscans.py | 4 +- .../performance/test_write_amplification.py | 6 +-- .../python/pg8000/pg8000_example.py | 1 - test_runner/pg_clients/test_pg_clients.py | 2 - 35 files changed, 92 insertions(+), 140 deletions(-) diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 5b9fc76768..af847be49e 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -28,13 +28,13 @@ import tempfile import time import uuid from contextlib import closing -from os import path from pathlib import Path -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, TypeVar, Union, cast +from typing import Any, Dict, List, Optional, Tuple, cast import psycopg2 import requests from psycopg2.extensions import connection as PgConnection +from psycopg2.extensions import parse_dsn ############################################### ### client-side utils copied from test fixtures @@ -149,10 +149,8 @@ class PgProtocol: # enough for our tests, but if you need a longer, you can # change it by calling "SET statement_timeout" after # connecting. 
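Most of the changes in this commit are mechanical flake8 fixes: unused imports dropped, f-strings without placeholders turned into plain strings, and targeted suppressions added where a warning is deliberate. As a rough standalone sketch (not part of the patch; the function and rule choices are illustrative only), the per-line `# noqa` comments used below work at a different scope than the repository-wide `extend-ignore` list added to setup.cfg in the previous commit:

```python
# Illustrative only -- not taken from the patch.
#
# Repository-wide suppression lives in setup.cfg:
#   [flake8]
#   extend-ignore = E501   # long lines are never reported anywhere
#
# A per-line "# noqa" comment exempts a single occurrence instead.
def read_or_none(path: str):
    """Deliberately swallow any error while reading, as some tests above do."""
    try:
        with open(path) as f:
            return f.read()
    except:  # noqa: E722
        return None
```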
- if "options" in conn_options: - conn_options["options"] = f"-cstatement_timeout=120s " + conn_options["options"] - else: - conn_options["options"] = "-cstatement_timeout=120s" + conn_options["options"] = f"-cstatement_timeout=120s {conn_options.get('options', '')}" + return conn_options # autocommit=True here by default because that's what we need most of the time @@ -250,7 +248,7 @@ class NeonPageserverHttpClient(requests.Session): except requests.RequestException as e: try: msg = res.json()["msg"] - except: + except: # noqa: E722 msg = "" raise NeonPageserverApiException(msg) from e @@ -477,8 +475,8 @@ def import_timeline( import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn}" full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ - stderr_filename2 = path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") - stdout_filename = path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout") + stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") + stdout_filename = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout") print(f"Running: {full_cmd}") @@ -495,7 +493,7 @@ def import_timeline( check=True, ) - print(f"Done import") + print("Done import") # Wait until pageserver persists the files wait_for_upload( @@ -508,7 +506,7 @@ def export_timeline( ): # Choose filenames incomplete_filename = tar_filename + ".incomplete" - stderr_filename = path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr") + stderr_filename = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr") # Construct export command query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}" @@ -563,7 +561,7 @@ def main(args: argparse.Namespace): continue # Choose filenames - tar_filename = path.join( + tar_filename = os.path.join( args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" ) diff --git a/test_runner/batch_others/test_basebackup_error.py b/test_runner/batch_others/test_basebackup_error.py index 9960f3afbf..81a46ee2f0 100644 --- a/test_runner/batch_others/test_basebackup_error.py +++ b/test_runner/batch_others/test_basebackup_error.py @@ -11,7 +11,7 @@ def test_basebackup_error(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_basebackup_error", "empty") # Introduce failpoint - env.pageserver.safe_psql(f"failpoints basebackup-before-control-file=return") + env.pageserver.safe_psql("failpoints basebackup-before-control-file=return") with pytest.raises(Exception, match="basebackup-before-control-file"): - pg = env.postgres.create_start("test_basebackup_error") + env.postgres.create_start("test_basebackup_error") diff --git a/test_runner/batch_others/test_branch_and_gc.py b/test_runner/batch_others/test_branch_and_gc.py index bc8374543f..deb041b5d1 100644 --- a/test_runner/batch_others/test_branch_and_gc.py +++ b/test_runner/batch_others/test_branch_and_gc.py @@ -65,7 +65,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): } ) - timeline_main = env.neon_cli.create_timeline(f"test_main", tenant_id=tenant) + timeline_main = env.neon_cli.create_timeline("test_main", tenant_id=tenant) pg_main = env.postgres.create_start("test_main", tenant_id=tenant) main_cur = pg_main.connect().cursor() @@ -148,7 +148,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the # branch creation task but the individual timeline GC iteration happens *after* # 
the branch creation task. - env.pageserver.safe_psql(f"failpoints before-timeline-gc=sleep(2000)") + env.pageserver.safe_psql("failpoints before-timeline-gc=sleep(2000)") def do_gc(): env.pageserver.safe_psql(f"do_gc {tenant.hex} {b0.hex} 0") diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/batch_others/test_broken_timeline.py index b96a7895eb..c4b23c24b8 100644 --- a/test_runner/batch_others/test_broken_timeline.py +++ b/test_runner/batch_others/test_broken_timeline.py @@ -1,8 +1,6 @@ import concurrent.futures import os -from contextlib import closing from typing import List, Tuple -from uuid import UUID import pytest from fixtures.log_helper import log @@ -24,7 +22,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): tenant_id = tenant_id_uuid.hex timeline_id = timeline_id_uuid.hex - pg = env.postgres.create_start(f"main", tenant_id=tenant_id_uuid) + pg = env.postgres.create_start("main", tenant_id=tenant_id_uuid) with pg.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'") @@ -102,7 +100,7 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): tenant_id, _ = env.neon_cli.create_tenant() # Introduce failpoint when creating a new timeline - env.pageserver.safe_psql(f"failpoints before-checkpoint-new-timeline=return") + env.pageserver.safe_psql("failpoints before-checkpoint-new-timeline=return") with pytest.raises(Exception, match="before-checkpoint-new-timeline"): _ = env.neon_cli.create_timeline("test_fix_broken_timelines", tenant_id) diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py index 1f5df1c130..f47e4a99bf 100644 --- a/test_runner/batch_others/test_clog_truncate.py +++ b/test_runner/batch_others/test_clog_truncate.py @@ -1,6 +1,5 @@ import os import time -from contextlib import closing from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv @@ -49,7 +48,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv): log.info(f"pg_xact_0000_path = {pg_xact_0000_path}") while os.path.isfile(pg_xact_0000_path): - log.info(f"file exists. wait for truncation. " "pg_xact_0000_path = {pg_xact_0000_path}") + log.info(f"file exists. 
wait for truncation: {pg_xact_0000_path=}") time.sleep(5) # checkpoint to advance latest lsn diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/batch_others/test_createdropdb.py index fdb704ff15..036e50e6e8 100644 --- a/test_runner/batch_others/test_createdropdb.py +++ b/test_runner/batch_others/test_createdropdb.py @@ -1,6 +1,5 @@ import os import pathlib -from contextlib import closing from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content @@ -92,14 +91,14 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): dbpath = pathlib.Path(pg_before.pgdata_dir) / "base" / str(dboid) log.info(dbpath) - assert os.path.isdir(dbpath) == True + assert os.path.isdir(dbpath) is True # Test that database subdir doesn't exist on the branch after drop assert pg_after.pgdata_dir dbpath = pathlib.Path(pg_after.pgdata_dir) / "base" / str(dboid) log.info(dbpath) - assert os.path.isdir(dbpath) == False + assert os.path.isdir(dbpath) is False # Check that we restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/batch_others/test_fsm_truncate.py b/test_runner/batch_others/test_fsm_truncate.py index 54ad2ffa34..4551ff97e0 100644 --- a/test_runner/batch_others/test_fsm_truncate.py +++ b/test_runner/batch_others/test_fsm_truncate.py @@ -1,6 +1,4 @@ -import pytest -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient +from fixtures.neon_fixtures import NeonEnvBuilder def test_fsm_truncate(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py index be6b437e30..90824f882a 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/batch_others/test_gc_aggressive.py @@ -24,7 +24,7 @@ async def update_table(pg: Postgres): while updates_performed < updates_to_perform: updates_performed += 1 id = random.randrange(1, num_rows) - row = await pg_conn.fetchrow(f"UPDATE foo SET counter = counter + 1 WHERE id = {id}") + await pg_conn.fetchrow(f"UPDATE foo SET counter = counter + 1 WHERE id = {id}") # Perform aggressive GC with 0 horizon diff --git a/test_runner/batch_others/test_lsn_mapping.py b/test_runner/batch_others/test_lsn_mapping.py index 4db6951b42..0c1d3648f2 100644 --- a/test_runner/batch_others/test_lsn_mapping.py +++ b/test_runner/batch_others/test_lsn_mapping.py @@ -1,13 +1,7 @@ -import math -import time -from contextlib import closing -from datetime import timedelta, timezone, tzinfo -from uuid import UUID +from datetime import timedelta -import psycopg2.errors -import psycopg2.extras from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import query_scalar diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 5d7619c1b2..869f53ac0a 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,16 +1,12 @@ -import os import pathlib import subprocess from typing import Optional from uuid import UUID, uuid4 -import pytest -from fixtures.log_helper import log from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, - NeonPageserverApiException, NeonPageserverHttpClient, neon_binpath, pg_distrib_dir, @@ -24,13 +20,15 @@ def 
test_pageserver_init_node_id(neon_simple_env: NeonEnv): repo_dir = neon_simple_env.repo_dir pageserver_config = repo_dir / "pageserver.toml" pageserver_bin = pathlib.Path(neon_binpath) / "pageserver" - run_pageserver = lambda args: subprocess.run( - [str(pageserver_bin), "-D", str(repo_dir), *args], - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) + + def run_pageserver(args): + return subprocess.run( + [str(pageserver_bin), "-D", str(repo_dir), *args], + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) # remove initial config pageserver_config.unlink() diff --git a/test_runner/batch_others/test_parallel_copy.py b/test_runner/batch_others/test_parallel_copy.py index 6b7fe4fdda..59f19026cc 100644 --- a/test_runner/batch_others/test_parallel_copy.py +++ b/test_runner/batch_others/test_parallel_copy.py @@ -32,7 +32,7 @@ async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str) async def parallel_load_same_table(pg: Postgres, n_parallel: int): workers = [] for worker_id in range(n_parallel): - worker = copy_test_data_to_table(pg, worker_id, f"copytest") + worker = copy_test_data_to_table(pg, worker_id, "copytest") workers.append(asyncio.create_task(worker)) # await all workers @@ -49,7 +49,7 @@ def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): # Create test table conn = pg.connect() cur = conn.cursor() - cur.execute(f"CREATE TABLE copytest (i int, t text)") + cur.execute("CREATE TABLE copytest (i int, t text)") # Run COPY TO to load the table with parallel connections. asyncio.run(parallel_load_same_table(pg, n_parallel)) diff --git a/test_runner/batch_others/test_recovery.py b/test_runner/batch_others/test_recovery.py index 5220aa6c2e..6aa8b4e9be 100644 --- a/test_runner/batch_others/test_recovery.py +++ b/test_runner/batch_others/test_recovery.py @@ -1,7 +1,4 @@ -import json -import os import time -from ast import Assert from contextlib import closing import psycopg2.extras @@ -33,8 +30,6 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): pg = env.postgres.create_start("test_pageserver_recovery") log.info("postgres is running on 'test_pageserver_recovery' branch") - connstr = pg.connstr() - with closing(pg.connect()) as conn: with conn.cursor() as cur: with closing(env.pageserver.connect()) as psconn: diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 974d3402f6..1e4fdc8602 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -99,7 +99,7 @@ def test_remote_storage_backup_and_restore( env.pageserver.start() # Introduce failpoint in download - env.pageserver.safe_psql(f"failpoints remote-storage-download-pre-rename=return") + env.pageserver.safe_psql("failpoints remote-storage-download-pre-rename=return") client.tenant_attach(UUID(tenant_id)) diff --git a/test_runner/batch_others/test_tenant_conf.py b/test_runner/batch_others/test_tenant_conf.py index 1e09ae8db7..d496edd6dc 100644 --- a/test_runner/batch_others/test_tenant_conf.py +++ b/test_runner/batch_others/test_tenant_conf.py @@ -1,7 +1,6 @@ from contextlib import closing import psycopg2.extras -import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -22,8 +21,8 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" } ) - env.neon_cli.create_timeline(f"test_tenant_conf", tenant_id=tenant) 
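Earlier in this commit, test_pageserver_api.py replaces a lambda assigned to run_pageserver with a small nested def. That is the shape flake8's E731 check ("do not assign a lambda expression, use a def") asks for; naming the rule here is my inference, not stated in the commit. A minimal before/after with made-up names:

```python
# Illustrative only; "scale_by_two" is a made-up name.

# Flagged by flake8 E731: a lambda bound to a name.
scale_by_two = lambda x: 2 * x  # noqa: E731


# Preferred form: a def keeps the real name in tracebacks and allows
# annotations, with no change in behaviour.
def scale_by_two_fn(x: int) -> int:
    return 2 * x
```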
- pg = env.postgres.create_start( + env.neon_cli.create_timeline("test_tenant_conf", tenant_id=tenant) + env.postgres.create_start( "test_tenant_conf", "main", tenant, diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index a30804ee8e..4d949e0c13 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -14,7 +14,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient, - PageserverPort, PortDistributor, Postgres, assert_no_in_progress_downloads_for_tenant, @@ -56,7 +55,7 @@ def new_pageserver_helper( f"-c listen_pg_addr='localhost:{pg_port}'", f"-c listen_http_addr='localhost:{http_port}'", f"-c pg_distrib_dir='{pg_distrib_dir}'", - f"-c id=2", + "-c id=2", f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", ] if broker is not None: @@ -92,7 +91,7 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve with pg_cur(pg) as cur: cur.execute("INSERT INTO load VALUES ('some payload')") inserted_ctr += 1 - except: + except: # noqa: E722 if not failed: log.info("load failed") failed = True diff --git a/test_runner/batch_others/test_tenant_tasks.py b/test_runner/batch_others/test_tenant_tasks.py index 8075756ffb..8617bc8ea9 100644 --- a/test_runner/batch_others/test_tenant_tasks.py +++ b/test_runner/batch_others/test_tenant_tasks.py @@ -1,10 +1,9 @@ -import time from uuid import UUID from fixtures.neon_fixtures import NeonEnvBuilder, wait_until -def get_only_element(l): +def get_only_element(l): # noqa: E741 assert len(l) == 1 return l[0] @@ -46,7 +45,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() - timeline = env.neon_cli.create_timeline(name, tenant_id=tenant) + env.neon_cli.create_timeline(name, tenant_id=tenant) pg = env.postgres.create_start(name, tenant_id=tenant) assert get_state(tenant) == "Active" diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/batch_others/test_tenants_with_remote_storage.py index a127693c32..7db58c2a70 100644 --- a/test_runner/batch_others/test_tenants_with_remote_storage.py +++ b/test_runner/batch_others/test_tenants_with_remote_storage.py @@ -7,7 +7,6 @@ # import asyncio -from contextlib import closing from typing import List, Tuple from uuid import UUID @@ -25,12 +24,12 @@ from fixtures.utils import lsn_from_hex async def tenant_workload(env: NeonEnv, pg: Postgres): - pageserver_conn = await env.pageserver.connect_async() + await env.pageserver.connect_async() pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show neon.tenant_id") - timeline_id = await pg_conn.fetchval("show neon.timeline_id") + await pg_conn.fetchval("show neon.tenant_id") + await pg_conn.fetchval("show neon.timeline_id") await pg_conn.execute("CREATE TABLE t(key int primary key, value text)") for i in range(1, 100): @@ -72,10 +71,10 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: Re "checkpoint_distance": "5000000", } ) - env.neon_cli.create_timeline(f"test_tenants_many", tenant_id=tenant) + env.neon_cli.create_timeline("test_tenants_many", tenant_id=tenant) pg = env.postgres.create_start( - f"test_tenants_many", + "test_tenants_many", tenant_id=tenant, ) tenants_pgs.append((tenant, pg)) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 
76342cdf98..f6b665ec8c 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -125,7 +125,7 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 elapsed = time.time() - started_at if elapsed > timeout: raise RuntimeError( - f"timed out waiting for pageserver to reach pg_current_wal_flush_lsn()" + "timed out waiting for pageserver to reach pg_current_wal_flush_lsn()" ) res = pgmain.safe_psql( @@ -390,7 +390,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): tenant, timeline = env.neon_cli.create_tenant() def get_timeline_physical_size(timeline: UUID): - res = client.timeline_detail(tenant, timeline) + res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True) return res["local"]["current_physical_size_non_incremental"] timeline_total_size = get_timeline_physical_size(timeline) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 7710ef86cd..47838ddb76 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -180,7 +180,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): while not self.should_stop.is_set(): collect_metrics("during INSERT INTO") time.sleep(1) - except: + except: # noqa: E722 log.error( "MetricsChecker's thread failed, the test will be failed on .stop() call", exc_info=True, @@ -552,7 +552,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: R while True: elapsed = time.time() - started_at if elapsed > wait_lsn_timeout: - raise RuntimeError(f"Timed out waiting for WAL redo") + raise RuntimeError("Timed out waiting for WAL redo") pageserver_lsn = env.pageserver.http_client().timeline_detail( uuid.UUID(tenant_id), uuid.UUID((timeline_id)) @@ -615,7 +615,7 @@ class ProposerPostgres(PgProtocol): "shared_preload_libraries = 'neon'\n", f"neon.timeline_id = '{self.timeline_id.hex}'\n", f"neon.tenant_id = '{self.tenant_id.hex}'\n", - f"neon.pageserver_connstring = ''\n", + "neon.pageserver_connstring = ''\n", f"neon.safekeepers = '{safekeepers}'\n", f"listen_addresses = '{self.listen_addr}'\n", f"port = '{self.port}'\n", diff --git a/test_runner/batch_pg_regress/test_neon_regress.py b/test_runner/batch_pg_regress/test_neon_regress.py index 5f13e6b2de..4619647084 100644 --- a/test_runner/batch_pg_regress/test_neon_regress.py +++ b/test_runner/batch_pg_regress/test_neon_regress.py @@ -49,7 +49,7 @@ def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, c # checkpoint one more time to ensure that the lsn we get is the latest one pg.safe_psql("CHECKPOINT") - lsn = pg.safe_psql("select pg_current_wal_insert_lsn()")[0][0] + pg.safe_psql("select pg_current_wal_insert_lsn()")[0][0] # Check that we restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index cec46f9f6d..655ffed90d 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -60,6 +60,7 @@ class PgBenchRunResult: run_duration: float run_start_timestamp: int run_end_timestamp: int + scale: int # TODO progress @@ -130,6 +131,7 @@ class PgBenchRunResult: run_duration=run_duration, run_start_timestamp=run_start_timestamp, run_end_timestamp=run_end_timestamp, + scale=scale, ) @@ -304,6 +306,12 @@ class NeonBenchmarker: "", 
MetricReport.TEST_PARAM, ) + self.record( + f"{prefix}.scale", + pg_bench_result.scale, + "", + MetricReport.TEST_PARAM, + ) def record_pg_bench_init_result(self, prefix: str, result: PgBenchInitResult): test_params = [ diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 6159e273c0..b51c7250e0 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -1,8 +1,6 @@ from collections import defaultdict -from dataclasses import dataclass from typing import Dict, List -from fixtures.log_helper import log from prometheus_client.parser import text_string_to_metric_families from prometheus_client.samples import Sample diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 388cc34182..f4ed937f02 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -838,7 +838,7 @@ class NeonEnv: ) if config.auth_enabled: toml += textwrap.dedent( - f""" + """ auth_enabled = true """ ) @@ -985,7 +985,7 @@ class NeonPageserverHttpClient(requests.Session): except requests.RequestException as e: try: msg = res.json()["msg"] - except: + except: # noqa: E722 msg = "" raise NeonPageserverApiException(msg) from e @@ -1065,19 +1065,15 @@ class NeonPageserverHttpClient(requests.Session): include_non_incremental_logical_size: bool = False, include_non_incremental_physical_size: bool = False, ) -> Dict[Any, Any]: - - include_non_incremental_logical_size_str = "0" + params = {} if include_non_incremental_logical_size: - include_non_incremental_logical_size_str = "1" - - include_non_incremental_physical_size_str = "0" + params["include-non-incremental-logical-size"] = "yes" if include_non_incremental_physical_size: - include_non_incremental_physical_size_str = "1" + params["include-non-incremental-physical-size"] = "yes" res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}" - + "?include-non-incremental-logical-size={include_non_incremental_logical_size_str}" - + "&include-non-incremental-physical-size={include_non_incremental_physical_size_str}" + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}", + params=params, ) self.verbose_error(res) res_json = res.json() @@ -1532,7 +1528,7 @@ class NeonPageserver(PgProtocol): `overrides` allows to add some config to this pageserver start. Returns self. """ - assert self.running == False + assert self.running is False self.env.neon_cli.pageserver_start(overrides=overrides) self.running = True @@ -1867,9 +1863,7 @@ class Postgres(PgProtocol): log.info(f"Starting postgres node {self.node_name}") - run_result = self.env.neon_cli.pg_start( - self.node_name, tenant_id=self.tenant_id, port=self.port - ) + self.env.neon_cli.pg_start(self.node_name, tenant_id=self.tenant_id, port=self.port) self.running = True return self @@ -2078,7 +2072,7 @@ class Safekeeper: running: bool = False def start(self) -> "Safekeeper": - assert self.running == False + assert self.running is False self.env.neon_cli.safekeeper_start(self.id) self.running = True # wait for wal acceptor start by checking its status @@ -2270,7 +2264,7 @@ class Etcd: # Set --quota-backend-bytes to keep the etcd virtual memory # size smaller. Our test etcd clusters are very small. 
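The timeline_detail change earlier in this file replaces a hand-assembled query string -- whose `{...}` placeholders were never actually interpolated -- with a plain dict handed to requests via params=, which also takes care of URL encoding. A minimal standalone sketch of the same pattern (the endpoint path and flag name below are placeholders, not the real pageserver API):

```python
# Sketch of the `params=` idiom used above; endpoint and flag names are
# placeholders only.
import requests


def get_timeline_detail(base_url: str, include_logical_size: bool = False) -> dict:
    params = {}
    if include_logical_size:
        params["include-logical-size"] = "yes"
    # requests builds "?include-logical-size=yes" and percent-encodes values.
    res = requests.get(f"{base_url}/v1/timeline/detail", params=params)
    res.raise_for_status()
    return res.json()
```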
# See https://github.com/etcd-io/etcd/issues/7910 - f"--quota-backend-bytes=100000000", + "--quota-backend-bytes=100000000", ] self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file) @@ -2395,7 +2389,7 @@ def should_skip_file(filename: str) -> bool: try: list(map(int, tmp_name)) - except: + except: # noqa: E722 return False return True @@ -2508,7 +2502,12 @@ def wait_until(number_of_iterations: int, interval: float, func): def assert_timeline_local( pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID ): - timeline_detail = pageserver_http_client.timeline_detail(tenant, timeline) + timeline_detail = pageserver_http_client.timeline_detail( + tenant, + timeline, + include_non_incremental_logical_size=True, + include_non_incremental_physical_size=True, + ) assert timeline_detail.get("local", {}).get("disk_consistent_lsn"), timeline_detail return timeline_detail diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 48889a8697..324c62170b 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -110,7 +110,7 @@ def get_dir_size(path: str) -> int: for name in files: try: totalbytes += os.path.getsize(os.path.join(root, name)) - except FileNotFoundError as e: + except FileNotFoundError: pass # file could be concurrently removed return totalbytes diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 9aaf0cbc77..d6e67aa361 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,9 +1,6 @@ from contextlib import closing -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.compare_fixtures import PgCompare # diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index bf4804fc07..01b2097112 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -1,11 +1,7 @@ from contextlib import closing from io import BufferedReader, RawIOBase -from itertools import repeat -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.compare_fixtures import PgCompare class CopyTestData(RawIOBase): @@ -28,7 +24,7 @@ class CopyTestData(RawIOBase): self.rownum += 1 # Number of bytes to read in this call - l = min(len(self.linebuf) - self.ptr, len(b)) + l = min(len(self.linebuf) - self.ptr, len(b)) # noqa: E741 b[:l] = self.linebuf[self.ptr : (self.ptr + l)] self.ptr += l diff --git a/test_runner/performance/test_dup_key.py b/test_runner/performance/test_dup_key.py index 60fe3014ba..81752ae740 100644 --- a/test_runner/performance/test_dup_key.py +++ b/test_runner/performance/test_dup_key.py @@ -46,7 +46,7 @@ $$; # Write 3-4 MB to evict t from compute cache cur.execute("create table f (i integer);") - cur.execute(f"insert into f values (generate_series(1,100000));") + cur.execute("insert into f values (generate_series(1,100000));") # Read with env.record_duration("read"): diff --git a/test_runner/performance/test_gist_build.py b/test_runner/performance/test_gist_build.py index d8fa97fbbf..311030b99d 100644 --- a/test_runner/performance/test_gist_build.py +++ 
b/test_runner/performance/test_gist_build.py @@ -1,10 +1,6 @@ -import os from contextlib import closing -from fixtures.benchmark_fixture import MetricReport -from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.compare_fixtures import PgCompare # diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py index 8e8ab9849a..aad6ee667a 100644 --- a/test_runner/performance/test_hot_page.py +++ b/test_runner/performance/test_hot_page.py @@ -31,7 +31,7 @@ def test_hot_page(env: PgCompare): # Write 3-4 MB to evict t from compute cache cur.execute("create table f (i integer);") - cur.execute(f"insert into f values (generate_series(1,100000));") + cur.execute("insert into f values (generate_series(1,100000));") # Read with env.record_duration("read"): diff --git a/test_runner/performance/test_parallel_copy_to.py b/test_runner/performance/test_parallel_copy_to.py index c1883dec7b..b4a25e0edc 100644 --- a/test_runner/performance/test_parallel_copy_to.py +++ b/test_runner/performance/test_parallel_copy_to.py @@ -1,11 +1,8 @@ import asyncio from io import BytesIO -import asyncpg -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, PgProtocol, Postgres +from fixtures.compare_fixtures import PgCompare +from fixtures.neon_fixtures import PgProtocol async def repeat_bytes(buf, repetitions: int): @@ -59,7 +56,7 @@ def test_parallel_copy_different_tables(neon_with_baseline: PgCompare, n_paralle async def parallel_load_same_table(pg: PgProtocol, n_parallel: int): workers = [] for worker_id in range(n_parallel): - worker = copy_test_data_to_table(pg, worker_id, f"copytest") + worker = copy_test_data_to_table(pg, worker_id, "copytest") workers.append(asyncio.create_task(worker)) # await all workers @@ -72,7 +69,7 @@ def test_parallel_copy_same_table(neon_with_baseline: PgCompare, n_parallel=5): conn = env.pg.connect() cur = conn.cursor() - cur.execute(f"CREATE TABLE copytest (i int, t text)") + cur.execute("CREATE TABLE copytest (i int, t text)") with env.record_pageserver_writes("pageserver_writes"): with env.record_duration("load"): diff --git a/test_runner/performance/test_random_writes.py b/test_runner/performance/test_random_writes.py index 8ed684af16..df766d52da 100644 --- a/test_runner/performance/test_random_writes.py +++ b/test_runner/performance/test_random_writes.py @@ -1,13 +1,8 @@ -import os import random -import time from contextlib import closing -import psycopg2.extras from fixtures.benchmark_fixture import MetricReport -from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.compare_fixtures import PgCompare from fixtures.utils import query_scalar diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py index 6094ed38e5..c681c50ff5 100644 --- a/test_runner/performance/test_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -1,13 +1,11 @@ # Test sequential scan speed # from contextlib import closing -from dataclasses import dataclass import pytest -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.benchmark_fixture import MetricReport from fixtures.compare_fixtures 
import PgCompare from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv @pytest.mark.parametrize( diff --git a/test_runner/performance/test_write_amplification.py b/test_runner/performance/test_write_amplification.py index 7aab469387..30c217e392 100644 --- a/test_runner/performance/test_write_amplification.py +++ b/test_runner/performance/test_write_amplification.py @@ -10,13 +10,9 @@ # in LSN order, writing the oldest layer first. That creates a new 10 MB image # layer to be created for each of those small updates. This is the Write # Amplification problem at its finest. -import os from contextlib import closing -from fixtures.benchmark_fixture import MetricReport -from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.compare_fixtures import PgCompare def test_write_amplification(neon_with_baseline: PgCompare): diff --git a/test_runner/pg_clients/python/pg8000/pg8000_example.py b/test_runner/pg_clients/python/pg8000/pg8000_example.py index f463867f88..b1d77af5bb 100755 --- a/test_runner/pg_clients/python/pg8000/pg8000_example.py +++ b/test_runner/pg_clients/python/pg8000/pg8000_example.py @@ -1,7 +1,6 @@ #! /usr/bin/env python3 import os -import ssl import pg8000.dbapi diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py index f91a2adf7d..2dbab19e7a 100644 --- a/test_runner/pg_clients/test_pg_clients.py +++ b/test_runner/pg_clients/test_pg_clients.py @@ -1,6 +1,4 @@ -import os import shutil -import subprocess from pathlib import Path from tempfile import NamedTemporaryFile From 6dc56a9be112d10e9fe2c05babe9d24b0590499b Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 19 Aug 2022 23:49:51 +0200 Subject: [PATCH 0673/1022] Add GitHub templates for epics, bugs and release PRs (neondatabase/cloud#2079) After merging this we will be able to: - Pick Epic or Bug template in the GitHub UI, when creating an issue - Use this link to open a release PR formatted in a unified way and containing a checklist with useful links: https://github.com/neondatabase/neon/compare/release...main?template=release-pr.md&title=Release%20202Y-MM-DD --- .github/ISSUE_TEMPLATE/bug-template.md | 23 +++++++++++++++++++ .github/ISSUE_TEMPLATE/epic-template.md | 25 +++++++++++++++++++++ .github/PULL_REQUEST_TEMPLATE/release-pr.md | 20 +++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug-template.md create mode 100644 .github/ISSUE_TEMPLATE/epic-template.md create mode 100644 .github/PULL_REQUEST_TEMPLATE/release-pr.md diff --git a/.github/ISSUE_TEMPLATE/bug-template.md b/.github/ISSUE_TEMPLATE/bug-template.md new file mode 100644 index 0000000000..d33eec3cde --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-template.md @@ -0,0 +1,23 @@ +--- +name: Bug Template +about: Used for describing bugs +title: '' +labels: t/bug +assignees: '' + +--- + +## Steps to reproduce + + +## Expected result + + +## Actual result + + +## Environment + + +## Logs, links +- diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md new file mode 100644 index 0000000000..33ad7b1ef5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -0,0 +1,25 @@ +--- +name: Epic Template +about: A set of related tasks contributing towards specific outcome, comprizing of + more than 1 week of work. 
+title: 'Epic: ' +labels: t/Epic +assignees: '' + +--- + +## Motivation + + +## DoD + + +## Implementation ideas + + +## Tasks +- [ ] + + +## Other related tasks and Epics +- diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md new file mode 100644 index 0000000000..6f86114060 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md @@ -0,0 +1,20 @@ +## Release 202Y-MM-DD + +**NB: this PR must be merged only by 'Create a merge commit'!** + +### Checklist when preparing for release +- [ ] Read or refresh [the release flow guide](https://github.com/neondatabase/cloud/wiki/Release:-general-flow) +- [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers? +- [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan? + + + +### Checklist after release +- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/120/files)) +- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel +- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true) +- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some) +- [ ] Check [cloud SLO dashboard](https://observer.zenith.tech/d/_oWcBMJ7k/cloud-slos?orgId=1) +- [ ] Check [compute startup metrics dashboard](https://observer.zenith.tech/d/5OkYJEmVz/compute-startup-time) + + From 832e60c2b4fe700ba703cea3fb0740a37abeb39a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 22 Aug 2022 16:38:31 +0100 Subject: [PATCH 0674/1022] Add .git-blame-ignore-revs file (#2318) --- .git-blame-ignore-revs | 1 + 1 file changed, 1 insertion(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000000..3afa4b683c --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1 @@ +4c2bb43775947775401cbb9d774823c5723a91f8 From 9dd19ec397b27d2766f6a66d5d4000647607a7e7 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 22 Aug 2022 17:54:03 +0300 Subject: [PATCH 0675/1022] Remove interferring proc check We do not need it anymore because ports_distributor checks whether the port can be used before giving it to service --- .github/workflows/benchmarking.yml | 8 ++++---- test_runner/fixtures/neon_fixtures.py | 29 --------------------------- 2 files changed, 4 insertions(+), 33 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 8080d6b7db..4ed6ac80fd 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -106,7 +106,7 @@ jobs: mkdir -p perf-report-staging # Set --sparse-ordering option of pytest-order plugin to ensure tests are running in order of appears in the file, # it's important for test_perf_pgbench.py::test_pgbench_remote_* tests - ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --skip-interfering-proc-check --out-dir perf-report-staging --timeout 5400 + ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --out-dir perf-report-staging --timeout 5400 - name: Submit result 
env: @@ -186,7 +186,7 @@ jobs: mkdir -p perf-report-captest psql $BENCHMARK_CONNSTR -c "SELECT 1;" - ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_init -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600 + ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_init -v -m "remote_cluster" --out-dir perf-report-captest --timeout 21600 - name: Benchmark simple-update env: @@ -194,7 +194,7 @@ jobs: BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} run: | psql $BENCHMARK_CONNSTR -c "SELECT 1;" - ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_simple_update -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600 + ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_simple_update -v -m "remote_cluster" --out-dir perf-report-captest --timeout 21600 - name: Benchmark select-only env: @@ -202,7 +202,7 @@ jobs: BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} run: | psql $BENCHMARK_CONNSTR -c "SELECT 1;" - ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_select_only -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600 + ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_select_only -v -m "remote_cluster" --out-dir perf-report-captest --timeout 21600 - name: Submit result env: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f4ed937f02..f1cffbe5ef 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -15,7 +15,6 @@ import tempfile import textwrap import time import uuid -import warnings from contextlib import closing, contextmanager from dataclasses import dataclass, field from enum import Flag, auto @@ -68,15 +67,6 @@ BASE_PORT = 15000 WORKER_PORT_NUM = 1000 -def pytest_addoption(parser): - parser.addoption( - "--skip-interfering-proc-check", - dest="skip_interfering_proc_check", - action="store_true", - help="skip check for interfering processes", - ) - - # These are set in pytest_configure() base_dir = "" neon_binpath = "" @@ -84,30 +74,11 @@ pg_distrib_dir = "" top_output_dir = "" -def check_interferring_processes(config): - if config.getoption("skip_interfering_proc_check"): - warnings.warn("interfering process check is skipped") - return - - # does not use -c as it is not supported on macOS - cmd = ["pgrep", "pageserver|postgres|safekeeper"] - result = subprocess.run(cmd, stdout=subprocess.DEVNULL) - if result.returncode == 0: - # returncode of 0 means it found something. - # This is bad; we don't want any of those processes polluting the - # result of the test. - # NOTE this shows as an internal pytest error, there might be a better way - raise Exception( - "Found interfering processes running. Stop all Neon pageservers, nodes, safekeepers, as well as stand-alone Postgres." - ) - - def pytest_configure(config): """ Ensure that no unwanted daemons are running before we start testing. Check that we do not overflow available ports range. """ - check_interferring_processes(config) numprocesses = config.getoption("numprocesses") if ( From b98fa5d6b0b2e8151f9b1385dcaadc8c2f329618 Mon Sep 17 00:00:00 2001 From: KlimentSerafimov Date: Mon, 22 Aug 2022 20:02:45 -0400 Subject: [PATCH 0676/1022] Added a new test for making sure the proxy displays a session_id when using link auth. 
(#2039) Added pytest to check correctness of the link authentication pipeline. Context: this PR is the first step towards refactoring the link authentication pipeline to use https (instead of psql) to send the db info to the proxy. There was a test missing for this pipeline in this repo, so this PR adds that test as preparation for the actual change of psql -> https. Co-authored-by: Bojan Serafimov Co-authored-by: Dmitry Rodionov Co-authored-by: Stas Kelvic Co-authored-by: Dimitrii Ivanov --- poetry.lock | 30 ++++-- pyproject.toml | 1 + test_runner/batch_others/test_proxy.py | 121 +++++++++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 72 ++++++++++++++- 4 files changed, 216 insertions(+), 8 deletions(-) diff --git a/poetry.lock b/poetry.lock index e1f2e576eb..6bce17008e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -622,8 +622,8 @@ six = ">=1.4.0" websocket-client = ">=0.32.0" [package.extras] +tls = ["idna (>=2.0.0)", "cryptography (>=1.3.4)", "pyOpenSSL (>=17.5.0)"] ssh = ["paramiko (>=2.4.2)"] -tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=1.3.4)", "idna (>=2.0.0)"] [[package]] name = "ecdsa" @@ -1055,8 +1055,8 @@ optional = false python-versions = ">=3.7" [package.extras] -docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx-autodoc-typehints (>=1.12)", "sphinx (>=4)"] -test = ["appdirs (==1.4.4)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)", "pytest (>=6)"] +test = ["pytest (>=6)", "pytest-mock (>=3.6)", "pytest-cov (>=2.7)", "appdirs (==1.4.4)"] +docs = ["sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)", "proselint (>=0.10.2)", "furo (>=2021.7.5b38)"] [[package]] name = "pluggy" @@ -1067,8 +1067,8 @@ optional = false python-versions = ">=3.6" [package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] +testing = ["pytest-benchmark", "pytest"] +dev = ["tox", "pre-commit"] [[package]] name = "prometheus-client" @@ -1197,6 +1197,20 @@ toml = "*" [package.extras] testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] +[[package]] +name = "pytest-asyncio" +version = "0.19.0" +description = "Pytest support for asyncio" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +pytest = ">=6.1.0" + +[package.extras] +testing = ["pytest-trio (>=0.7.0)", "mypy (>=0.931)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "coverage (>=6.2)"] + [[package]] name = "pytest-forked" version = "1.4.0" @@ -1537,7 +1551,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "2112382a6723ed3b77d242db926c7445fa809fafcf11da127b5292565d2ba798" +content-hash = "badfeff521c68277b10555ab4174847b7315d82818ef5841e600299fb6128698" [metadata.files] aiopg = [ @@ -2076,6 +2090,10 @@ pytest = [ {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, ] +pytest-asyncio = [ + {file = "pytest-asyncio-0.19.0.tar.gz", hash = "sha256:ac4ebf3b6207259750bc32f4c1d8fcd7e79739edbc67ad0c58dd150b1d072fed"}, + {file = "pytest_asyncio-0.19.0-py3-none-any.whl", hash = "sha256:7a97e37cfe1ed296e2e84941384bdd37c376453912d397ed39293e0916f521fa"}, +] pytest-forked = [ {file = "pytest-forked-1.4.0.tar.gz", hash = "sha256:8b67587c8f98cbbadfdd804539ed5455b6ed03802203485dd2f53c1422d7440e"}, {file = "pytest_forked-1.4.0-py3-none-any.whl", hash = 
"sha256:bbbb6717efc886b9d64537b41fb1497cfaf3c9601276be8da2cccfea5a3c8ad8"}, diff --git a/pyproject.toml b/pyproject.toml index d648d1050a..2c9270934d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ pytest-timeout = "^2.1.0" Werkzeug = "2.1.2" pytest-order = "^1.0.1" allure-pytest = "^2.9.45" +pytest-asyncio = "^0.19.0" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py index dcff177044..4ffd458b22 100644 --- a/test_runner/batch_others/test_proxy.py +++ b/test_runner/batch_others/test_proxy.py @@ -1,5 +1,11 @@ +import json +import subprocess +from urllib.parse import urlparse + import psycopg2 import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres def test_proxy_select_1(static_proxy): @@ -23,6 +29,121 @@ def test_password_hack(static_proxy): static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) +def get_session_id_from_uri_line(uri_prefix, uri_line): + assert uri_prefix in uri_line + + url_parts = urlparse(uri_line) + psql_session_id = url_parts.path[1:] + assert psql_session_id.isalnum(), "session_id should only contain alphanumeric chars." + link_auth_uri_prefix = uri_line[: -len(url_parts.path)] + # invariant: the prefix must match the uri_prefix. + assert ( + link_auth_uri_prefix == uri_prefix + ), f"Line='{uri_line}' should contain a http auth link of form '{uri_prefix}/'." + # invariant: the entire link_auth_uri should be on its own line, module spaces. + assert " ".join(uri_line.split(" ")) == f"{uri_prefix}/{psql_session_id}" + + return psql_session_id + + +def create_and_send_db_info(local_vanilla_pg, psql_session_id, mgmt_port): + pg_user = "proxy" + pg_password = "password" + + local_vanilla_pg.start() + query = f"create user {pg_user} with login superuser password '{pg_password}'" + local_vanilla_pg.safe_psql(query) + + port = local_vanilla_pg.default_options["port"] + host = local_vanilla_pg.default_options["host"] + dbname = local_vanilla_pg.default_options["dbname"] + + db_info_dict = { + "session_id": psql_session_id, + "result": { + "Success": { + "host": host, + "port": port, + "dbname": dbname, + "user": pg_user, + "password": pg_password, + } + }, + } + db_info_str = json.dumps(db_info_dict) + cmd_args = [ + "psql", + "-h", + "127.0.0.1", # localhost + "-p", + f"{mgmt_port}", + "-c", + db_info_str, + ] + + log.info(f"Sending to proxy the user and db info: {' '.join(cmd_args)}") + p = subprocess.Popen(cmd_args, stdout=subprocess.PIPE) + out, err = p.communicate() + assert "ok" in str(out) + + +async def get_uri_line_from_process_welcome_notice(link_auth_uri_prefix, proc): + """ + Returns the line from the welcome notice from proc containing link_auth_uri_prefix. + :param link_auth_uri_prefix: the uri prefix used to indicate the line of interest + :param proc: the process to read the welcome message from. + :return: a line containing the full link authentication uri. 
+ """ + max_num_lines_of_welcome_message = 15 + for attempt in range(max_num_lines_of_welcome_message): + raw_line = await proc.stderr.readline() + line = raw_line.decode("utf-8").strip() + if link_auth_uri_prefix in line: + return line + assert False, f"did not find line containing '{link_auth_uri_prefix}'" + + +@pytest.mark.asyncio +async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProxy): + """ + Test copied and modified from: test_project_psql_link_auth test from cloud/tests_e2e/tests/test_project.py + Step 1. establish connection to the proxy + Step 2. retrieve session_id: + Step 2.1: read welcome message + Step 2.2: parse session_id + Step 3. create a vanilla_pg and send user and db info via command line (using Popen) a psql query via mgmt port to proxy. + Step 4. assert that select 1 has been executed correctly. + """ + + # Step 1. + psql = PSQL( + host=link_proxy.host, + port=link_proxy.proxy_port, + ) + proc = await psql.run("select 1") + + # Step 2.1 + uri_prefix = link_proxy.link_auth_uri_prefix + line_str = await get_uri_line_from_process_welcome_notice(uri_prefix, proc) + + # step 2.2 + psql_session_id = get_session_id_from_uri_line(uri_prefix, line_str) + log.info(f"Parsed psql_session_id='{psql_session_id}' from Neon welcome message.") + + # Step 3. + create_and_send_db_info(vanilla_pg, psql_session_id, link_proxy.mgmt_port) + + # Step 4. + # Expecting proxy output:: + # b' ?column? \n' + # b'----------\n' + # b' 1\n' + # b'(1 row)\n' + out_bytes = await proc.stdout.read() + expected_out_bytes = b" ?column? \n----------\n 1\n(1 row)\n\n" + assert out_bytes == expected_out_bytes + + # Pass extra options to the server. # # Currently, proxy eats the extra connection options, so this fails. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f1cffbe5ef..3af0cf4dcb 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1,6 +1,7 @@ from __future__ import annotations import abc +import asyncio import enum import filecmp import json @@ -1716,21 +1717,58 @@ def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]: yield remote_pg +class PSQL: + """ + Helper class to make it easier to run psql in the proxy tests. 
+ Copied and modified from PSQL from cloud/tests_e2e/common/psql.py + """ + + path: str + database_url: str + + def __init__( + self, + path: str = "psql", + host: str = "127.0.0.1", + port: int = 5432, + ): + assert shutil.which(path) + + self.path = path + self.database_url = f"postgres://{host}:{port}/main?options=project%3Dgeneric-project-name" + + async def run(self, query=None): + run_args = [self.path, self.database_url] + run_args += ["--command", query] if query is not None else [] + + cmd_line = subprocess.list2cmdline(run_args) + log.info(f"Run psql: {cmd_line}") + return await asyncio.create_subprocess_exec( + *run_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + class NeonProxy(PgProtocol): - def __init__(self, proxy_port: int, http_port: int, auth_endpoint: str): + def __init__(self, proxy_port: int, http_port: int, auth_endpoint=None, mgmt_port=None): super().__init__(dsn=auth_endpoint, port=proxy_port) self.host = "127.0.0.1" self.http_port = http_port self.proxy_port = proxy_port + self.mgmt_port = mgmt_port self.auth_endpoint = auth_endpoint self._popen: Optional[subprocess.Popen[bytes]] = None + self.link_auth_uri_prefix = "http://dummy-uri" def start(self) -> None: + """ + Starts a proxy with option '--auth-backend postgres' and a postgres instance already provided though '--auth-endpoint '." + """ assert self._popen is None + assert self.auth_endpoint is not None # Start proxy args = [ - os.path.join(str(neon_binpath), "proxy"), + os.path.join(neon_binpath, "proxy"), *["--http", f"{self.host}:{self.http_port}"], *["--proxy", f"{self.host}:{self.proxy_port}"], *["--auth-backend", "postgres"], @@ -1739,6 +1777,25 @@ class NeonProxy(PgProtocol): self._popen = subprocess.Popen(args) self._wait_until_ready() + def start_with_link_auth(self) -> None: + """ + Starts a proxy with option '--auth-backend link' and a dummy authentication link '--uri dummy-auth-link'." + """ + assert self._popen is None + + # Start proxy + bin_proxy = os.path.join(str(neon_binpath), "proxy") + args = [bin_proxy] + args.extend(["--http", f"{self.host}:{self.http_port}"]) + args.extend(["--proxy", f"{self.host}:{self.proxy_port}"]) + args.extend(["--mgmt", f"{self.host}:{self.mgmt_port}"]) + args.extend(["--auth-backend", "link"]) + args.extend(["--uri", self.link_auth_uri_prefix]) + arg_str = " ".join(args) + log.info(f"starting proxy with command line ::: {arg_str}") + self._popen = subprocess.Popen(args, stdout=subprocess.PIPE) + self._wait_until_ready() + @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) def _wait_until_ready(self): requests.get(f"http://{self.host}:{self.http_port}/v1/status") @@ -1753,6 +1810,17 @@ class NeonProxy(PgProtocol): self._popen.kill() +@pytest.fixture(scope="function") +def link_proxy(port_distributor) -> Iterator[NeonProxy]: + """Neon proxy that routes through link auth.""" + http_port = port_distributor.get_port() + proxy_port = port_distributor.get_port() + mgmt_port = port_distributor.get_port() + with NeonProxy(proxy_port, http_port, mgmt_port=mgmt_port) as proxy: + proxy.start_with_link_auth() + yield proxy + + @pytest.fixture(scope="function") def static_proxy(vanilla_pg, port_distributor) -> Iterator[NeonProxy]: """Neon proxy that routes directly to vanilla postgres.""" From d110d2c2fddf461cd85bb2d49b86bbe9f7f6998b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 23 Aug 2022 12:14:06 +0300 Subject: [PATCH 0677/1022] Reorder permission checks in HTTP API call handlers. 
Every handler function now follows the same pattern: 1. extract parameters from the call 2. check permissions 3. execute command. Previously, we extracted some parameters before permission check and some after. Let's be consistent. --- pageserver/src/http/routes.rs | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index da21f6883a..2bb181dd9a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -206,7 +206,6 @@ async fn status_handler(request: Request) -> Result, ApiErr async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; let request_data: TimelineCreateRequest = json_request(&mut request).await?; - check_permission(&request, Some(tenant_id))?; let new_timeline_info = tokio::task::spawn_blocking(move || { @@ -244,11 +243,12 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); let include_non_incremental_physical_size = query_param_present(&request, "include-non-incremental-physical-size"); + check_permission(&request, Some(tenant_id))?; + let local_timeline_infos = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); list_local_timelines( @@ -299,13 +299,12 @@ fn query_param_present(request: &Request, param: &str) -> bool { async fn timeline_detail_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); let include_non_incremental_physical_size = query_param_present(&request, "include-non-incremental-physical-size"); + check_permission(&request, Some(tenant_id))?; let (local_timeline_info, remote_timeline_info) = async { // any error here will render local timeline as None @@ -369,7 +368,7 @@ async fn tenant_attach_handler(request: Request) -> Result, let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - info!("Handling tenant attach {}", tenant_id,); + info!("Handling tenant attach {}", tenant_id); tokio::task::spawn_blocking(move || { if tenant_mgr::get_tenant_state(tenant_id).is_some() { @@ -478,9 +477,8 @@ async fn gather_tenant_timelines_index_parts( async fn timeline_delete_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; let state = get_state(&request); tokio::task::spawn_blocking(move || { @@ -519,7 +517,6 @@ async fn tenant_detach_handler(request: Request) -> Result, } async fn tenant_list_handler(request: Request) -> Result, ApiError> { - // check for management permission check_permission(&request, None)?; let state = get_state(&request); @@ -587,7 +584,6 @@ async fn tenant_status(request: Request) -> Result, ApiErro } async fn 
tenant_create_handler(mut request: Request) -> Result, ApiError> { - // check for management permission check_permission(&request, None)?; let request_data: TenantCreateRequest = json_request(&mut request).await?; @@ -656,7 +652,6 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result, ApiError> { let request_data: TenantConfigRequest = json_request(&mut request).await?; let tenant_id = request_data.tenant_id; - // check for management permission check_permission(&request, Some(tenant_id))?; let mut tenant_conf: TenantConfOpt = Default::default(); From 1a666a01d672298a1da12771c4b68a28c1d5ebed Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 23 Aug 2022 12:17:20 +0300 Subject: [PATCH 0678/1022] Improve comments a little. --- pageserver/src/http/models.rs | 3 +++ pageserver/src/layered_repository.rs | 10 +++++----- pageserver/src/layered_repository/timeline.rs | 6 +++--- pageserver/src/page_service.rs | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 232c202ed9..654f45a95d 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -150,6 +150,9 @@ pub struct RemoteTimelineInfo { pub awaits_download: bool, } +/// +/// This represents the output of the "timeline_detail" API call. +/// #[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index dd173498b9..0bfa1cd268 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -656,9 +656,9 @@ impl Repository { /// Locate and load config pub fn load_tenant_config( conf: &'static PageServerConf, - tenantid: ZTenantId, + tenant_id: ZTenantId, ) -> anyhow::Result { - let target_config_path = TenantConf::path(conf, tenantid); + let target_config_path = TenantConf::path(conf, tenant_id); info!("load tenantconf from {}", target_config_path.display()); @@ -693,11 +693,11 @@ impl Repository { pub fn persist_tenant_config( conf: &'static PageServerConf, - tenantid: ZTenantId, + tenant_id: ZTenantId, tenant_conf: TenantConfOpt, ) -> anyhow::Result<()> { let _enter = info_span!("saving tenantconf").entered(); - let target_config_path = TenantConf::path(conf, tenantid); + let target_config_path = TenantConf::path(conf, tenant_id); info!("save tenantconf to {}", target_config_path.display()); let mut conf_content = r#"# This file contains a specific per-tenant's config. @@ -834,7 +834,7 @@ impl Repository { // compaction (both require `layer_removal_cs` lock), // but the GC iteration can run concurrently with branch creation. // - // See comments in [`LayeredRepository::branch_timeline`] for more information + // See comments in [`Repository::branch_timeline`] for more information // about why branch creation task can run concurrently with timeline's GC iteration. for timeline in gc_timelines { if thread_mgr::is_shutdown_requested() { diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index fb5a4d0b83..a909dcb5a1 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -354,8 +354,8 @@ pub struct Timeline { upload_layers: AtomicBool, /// Ensures layers aren't frozen by checkpointer between - /// [`LayeredTimeline::get_layer_for_write`] and layer reads. - /// Locked automatically by [`LayeredTimelineWriter`] and checkpointer. 
+ /// [`Timeline::get_layer_for_write`] and layer reads. + /// Locked automatically by [`TimelineWriter`] and checkpointer. /// Must always be acquired before the layer map/individual layer lock /// to avoid deadlock. write_lock: Mutex<()>, @@ -365,7 +365,7 @@ pub struct Timeline { /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other threads. - /// This lock is acquired in [`LayeredTimeline::gc`], [`LayeredTimeline::compact`], + /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], /// and [`LayeredRepository::delete_timeline`]. layer_removal_cs: Mutex<()>, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index e6114c0fc5..c21d5a6acc 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -744,7 +744,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; /* - // Add a 1s delay to some requests. The delayed causes the requests to + // Add a 1s delay to some requests. The delay helps the requests to // hit the race condition from github issue #1047 more easily. use rand::Rng; if rand::thread_rng().gen::() < 5 { From 63b9dfb2f21be88eef74734b210698463acb8701 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 23 Aug 2022 12:17:48 +0300 Subject: [PATCH 0679/1022] Remove unnecessary 'pub' from test module, and remove dead constant. After making the test module private, the compiler noticed and warned that the constant is unused. --- pageserver/src/layered_repository.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 0bfa1cd268..fae52c3daf 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1085,7 +1085,7 @@ pub mod repo_harness { } #[cfg(test)] -pub mod tests { +mod tests { use super::metadata::METADATA_FILE_NAME; use super::*; use crate::keyspace::KeySpaceAccum; @@ -1467,12 +1467,6 @@ pub mod tests { Ok(()) } - // Target file size in the unit tests. In production, the target - // file size is much larger, maybe 1 GB. But a small size makes it - // much faster to exercise all the logic for creating the files, - // garbage collection, compaction etc. - pub const TEST_FILE_SIZE: u64 = 4 * 1024 * 1024; - #[test] fn test_images() -> Result<()> { let repo = RepoHarness::create("test_images")?.load(); From 5f0c95182d7584c4c84c21c51dd80cd5b9c075c0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 23 Aug 2022 12:18:43 +0300 Subject: [PATCH 0680/1022] Minor cleanup, to pass by reference where possible. 
--- pageserver/src/page_service.rs | 8 ++++---- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/walingest.rs | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index c21d5a6acc..ebcff1f2ac 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -494,22 +494,22 @@ impl PageServerHandler { PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME .with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]) .observe_closure_duration(|| { - self.handle_get_rel_exists_request(timeline.as_ref(), &req) + self.handle_get_rel_exists_request(&timeline, &req) }), PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME .with_label_values(&["get_rel_size", &tenant_id, &timeline_id]) .observe_closure_duration(|| { - self.handle_get_nblocks_request(timeline.as_ref(), &req) + self.handle_get_nblocks_request(&timeline, &req) }), PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME .with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]) .observe_closure_duration(|| { - self.handle_get_page_at_lsn_request(timeline.as_ref(), &req) + self.handle_get_page_at_lsn_request(&timeline, &req) }), PagestreamFeMessage::DbSize(req) => SMGR_QUERY_TIME .with_label_values(&["get_db_size", &tenant_id, &timeline_id]) .observe_closure_duration(|| { - self.handle_db_size_request(timeline.as_ref(), &req) + self.handle_db_size_request(&timeline, &req) }), }; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0ace850a82..0f0bb1ed53 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1391,7 +1391,7 @@ fn is_slru_block_key(key: Key) -> bool { #[cfg(test)] pub fn create_test_timeline( - repo: crate::layered_repository::Repository, + repo: &crate::layered_repository::Repository, timeline_id: utils::zid::ZTimelineId, ) -> Result> { let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index f3789d43e3..c0965e7a22 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1062,7 +1062,7 @@ mod tests { #[test] fn test_relsize() -> Result<()> { let repo = RepoHarness::create("test_relsize")?.load(); - let tline = create_test_timeline(repo, TIMELINE_ID)?; + let tline = create_test_timeline(&repo, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1190,7 +1190,7 @@ mod tests { #[test] fn test_drop_extend() -> Result<()> { let repo = RepoHarness::create("test_drop_extend")?.load(); - let tline = create_test_timeline(repo, TIMELINE_ID)?; + let tline = create_test_timeline(&repo, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1230,7 +1230,7 @@ mod tests { #[test] fn test_truncate_extend() -> Result<()> { let repo = RepoHarness::create("test_truncate_extend")?.load(); - let tline = create_test_timeline(repo, TIMELINE_ID)?; + let tline = create_test_timeline(&repo, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; // Create a 20 MB relation (the size is arbitrary) @@ -1318,7 +1318,7 @@ mod tests { #[test] fn test_large_rel() -> Result<()> { let repo = RepoHarness::create("test_large_rel")?.load(); - let tline = create_test_timeline(repo, TIMELINE_ID)?; + let tline = create_test_timeline(&repo, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; 
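Editor's sketch for the pass-by-reference cleanup above: the point of changing `create_test_timeline(repo, ...)` to `create_test_timeline(&repo, ...)` is the usual Rust ownership rule — a by-value parameter moves the caller's handle into the helper, while a reference only borrows it, so the test can keep using `repo` afterwards. The snippet below is a minimal, self-contained illustration with hypothetical `Repo`/`Timeline` types, not the pageserver's real API (the real helper takes a shared `&Repository`; the sketch uses `&mut` only so the toy type can store the new timeline).

    // Hypothetical types, for illustration only.
    struct Timeline {
        id: u32,
    }

    struct Repo {
        timelines: Vec<Timeline>,
    }

    // Borrowing the repo mirrors the patched `create_test_timeline(&repo, ...)` call
    // style: the helper can register a timeline without consuming the repo.
    fn create_test_timeline(repo: &mut Repo, id: u32) -> &Timeline {
        repo.timelines.push(Timeline { id });
        repo.timelines.last().unwrap()
    }

    fn main() {
        let mut repo = Repo { timelines: Vec::new() };
        let id = create_test_timeline(&mut repo, 1).id;
        // `repo` was only borrowed, so it is still usable here; a by-value
        // parameter would have moved it into the helper and this line would
        // not compile.
        println!("created timeline {}, repo now holds {}", id, repo.timelines.len());
    }

The `timeline.as_ref()` -> `&timeline` changes in page_service.rs follow the same idea: passing a plain reference (presumably relying on deref coercion from the `Arc`) avoids spelling out `as_ref()` at every call site.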
From 4013290508f3aa266ccb04dc4eff1d488f8ca482 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 23 Aug 2022 12:51:49 +0300 Subject: [PATCH 0681/1022] Fix module doc comment. `///` is used for comments on the *next* code that follows, so the comment actually applied to the `use std::collections::BTreeMap;` line that follows. rustfmt complained about that: error: an inner attribute is not permitted following an outer doc comment --> /home/heikki/git-sandbox/neon/libs/utils/src/seqwait_async.rs:7:1 | 5 | /// | --- previous doc comment 6 | 7 | #![warn(missing_docs)] | ^^^^^^^^^^^^^^^^^^^^^^ not permitted following an outer attribute 8 | 9 | use std::collections::BTreeMap; | ------------------------------- the inner attribute doesn't annotate this `use` import | = note: inner attributes, like `#![no_std]`, annotate the item enclosing them, and are usually found at the beginning of source files help: to annotate the `use` import, change the attribute from inner to outer style | 7 - #![warn(missing_docs)] 7 + #[warn(missing_docs)] | `//!` is the correct syntax for comments that apply to the whole file. --- libs/utils/src/seqwait_async.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libs/utils/src/seqwait_async.rs b/libs/utils/src/seqwait_async.rs index 09138e9dd4..f685e2b569 100644 --- a/libs/utils/src/seqwait_async.rs +++ b/libs/utils/src/seqwait_async.rs @@ -1,8 +1,8 @@ -/// -/// Async version of 'seqwait.rs' -/// -/// NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs. -/// +//! +//! Async version of 'seqwait.rs' +//! +//! NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs. +//! #![warn(missing_docs)] From 8e1d6dd848da5006f63a4a8088954ee39a3f5a05 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Tue, 23 Aug 2022 18:00:02 +0300 Subject: [PATCH 0682/1022] Minor cleanup in pq_proto (#2322) --- libs/utils/src/postgres_backend.rs | 15 +- libs/utils/src/pq_proto.rs | 330 +++++++++-------------------- 2 files changed, 107 insertions(+), 238 deletions(-) diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 4d873bd5ac..604eb75aaf 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -163,14 +163,9 @@ pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool { false } -// Truncate 0 from C string in Bytes and stringify it (returns slice, no allocations) -// PG protocol strings are always C strings. -fn cstr_to_str(b: &Bytes) -> Result<&str> { - let without_null = if b.last() == Some(&0) { - &b[..b.len() - 1] - } else { - &b[..] - }; +// Cast a byte slice to a string slice, dropping null terminator if there's one. +fn cstr_to_str(bytes: &[u8]) -> Result<&str> { + let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); std::str::from_utf8(without_null).map_err(|e| e.into()) } @@ -423,9 +418,9 @@ impl PostgresBackend { self.state = ProtoState::Established; } - FeMessage::Query(m) => { + FeMessage::Query(body) => { // remove null terminator - let query_string = cstr_to_str(&m.body)?; + let query_string = cstr_to_str(&body)?; trace!("got query {:?}", query_string); // xxx distinguish fatal and recoverable errors? 
diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index 3f14acd50d..2f8dcf31d3 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -25,8 +25,10 @@ pub const TEXT_OID: Oid = 25; #[derive(Debug)] pub enum FeMessage { StartupPacket(FeStartupPacket), - Query(FeQueryMessage), // Simple query - Parse(FeParseMessage), // Extended query protocol + // Simple query. + Query(Bytes), + // Extended query protocol. + Parse(FeParseMessage), Describe(FeDescribeMessage), Bind(FeBindMessage), Execute(FeExecuteMessage), @@ -69,11 +71,6 @@ impl Distribution for Standard { } } -#[derive(Debug)] -pub struct FeQueryMessage { - pub body: Bytes, -} - // We only support the simple case of Parse on unnamed prepared statement and // no params #[derive(Debug)] @@ -89,7 +86,7 @@ pub struct FeDescribeMessage { // we only support unnamed prepared stmt and portal #[derive(Debug)] -pub struct FeBindMessage {} +pub struct FeBindMessage; // we only support unnamed prepared stmt or portal #[derive(Debug)] @@ -100,7 +97,7 @@ pub struct FeExecuteMessage { // we only support unnamed prepared stmt and portal #[derive(Debug)] -pub struct FeCloseMessage {} +pub struct FeCloseMessage; /// Retry a read on EINTR /// @@ -163,22 +160,20 @@ impl FeMessage { Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), Err(e) => return Err(e.into()), }; - let len = retry_read!(stream.read_u32().await)?; - // The message length includes itself, so it better be at least 4 - let bodylen = len + // The message length includes itself, so it better be at least 4. + let len = retry_read!(stream.read_u32().await)? .checked_sub(4) - .context("invalid message length: parsing u32")?; + .context("invalid message length")?; - // Read message body - let mut body_buf: Vec = vec![0; bodylen as usize]; - stream.read_exact(&mut body_buf).await?; + let body = { + let mut buffer = vec![0u8; len as usize]; + stream.read_exact(&mut buffer).await?; + Bytes::from(buffer) + }; - let body = Bytes::from(body_buf); - - // Parse it match tag { - b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { body }))), + b'Q' => Ok(Some(FeMessage::Query(body))), b'P' => Ok(Some(FeParseMessage::parse(body)?)), b'D' => Ok(Some(FeDescribeMessage::parse(body)?)), b'E' => Ok(Some(FeExecuteMessage::parse(body)?)), @@ -302,124 +297,71 @@ impl FeStartupPacket { } impl FeParseMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { - let _pstmt_name = read_null_terminated(&mut buf)?; - let query_string = read_null_terminated(&mut buf)?; - let nparams = buf.get_i16(); - + fn parse(mut buf: Bytes) -> anyhow::Result { // FIXME: the rust-postgres driver uses a named prepared statement // for copy_out(). We're not prepared to handle that correctly. For // now, just ignore the statement name, assuming that the client never // uses more than one prepared statement at a time. 
- /* - if !pstmt_name.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "named prepared statements not implemented in Parse", - )); - } - */ - if nparams != 0 { - bail!("query params not implemented"); - } + let _pstmt_name = read_cstr(&mut buf)?; + let query_string = read_cstr(&mut buf)?; + let nparams = buf.get_i16(); + + ensure!(nparams == 0, "query params not implemented"); Ok(FeMessage::Parse(FeParseMessage { query_string })) } } impl FeDescribeMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> anyhow::Result { let kind = buf.get_u8(); - let _pstmt_name = read_null_terminated(&mut buf)?; + let _pstmt_name = read_cstr(&mut buf)?; // FIXME: see FeParseMessage::parse - /* - if !pstmt_name.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "named prepared statements not implemented in Describe", - )); - } - */ - - if kind != b'S' { - bail!("only prepared statmement Describe is implemented"); - } + ensure!( + kind == b'S', + "only prepared statemement Describe is implemented" + ); Ok(FeMessage::Describe(FeDescribeMessage { kind })) } } impl FeExecuteMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { - let portal_name = read_null_terminated(&mut buf)?; + fn parse(mut buf: Bytes) -> anyhow::Result { + let portal_name = read_cstr(&mut buf)?; let maxrows = buf.get_i32(); - if !portal_name.is_empty() { - bail!("named portals not implemented"); - } - - if maxrows != 0 { - bail!("row limit in Execute message not supported"); - } + ensure!(portal_name.is_empty(), "named portals not implemented"); + ensure!(maxrows == 0, "row limit in Execute message not implemented"); Ok(FeMessage::Execute(FeExecuteMessage { maxrows })) } } impl FeBindMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { - let portal_name = read_null_terminated(&mut buf)?; - let _pstmt_name = read_null_terminated(&mut buf)?; - - if !portal_name.is_empty() { - bail!("named portals not implemented"); - } + fn parse(mut buf: Bytes) -> anyhow::Result { + let portal_name = read_cstr(&mut buf)?; + let _pstmt_name = read_cstr(&mut buf)?; // FIXME: see FeParseMessage::parse - /* - if !pstmt_name.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "named prepared statements not implemented", - )); - } - */ + ensure!(portal_name.is_empty(), "named portals not implemented"); - Ok(FeMessage::Bind(FeBindMessage {})) + Ok(FeMessage::Bind(FeBindMessage)) } } impl FeCloseMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> anyhow::Result { let _kind = buf.get_u8(); - let _pstmt_or_portal_name = read_null_terminated(&mut buf)?; + let _pstmt_or_portal_name = read_cstr(&mut buf)?; // FIXME: we do nothing with Close - - Ok(FeMessage::Close(FeCloseMessage {})) + Ok(FeMessage::Close(FeCloseMessage)) } } -fn read_null_terminated(buf: &mut Bytes) -> anyhow::Result { - let mut result = BytesMut::new(); - - loop { - if !buf.has_remaining() { - bail!("no null-terminator in string"); - } - - let byte = buf.get_u8(); - - if byte == 0 { - break; - } - result.put_u8(byte); - } - Ok(result.freeze()) -} - // Backend #[derive(Debug)] @@ -441,7 +383,7 @@ pub enum BeMessage<'a> { // None means column is NULL DataRow(&'a [Option<&'a [u8]>]), ErrorResponse(&'a str), - // single byte - used in response to SSLRequest/GSSENCRequest + /// Single byte - used in response to SSLRequest/GSSENCRequest. 
EncryptionResponse(bool), NoData, ParameterDescription, @@ -554,49 +496,22 @@ pub static SINGLE_COL_ROWDESC: BeMessage = BeMessage::RowDescription(&[RowDescri formatcode: 0, }]); -// Safe usize -> i32|i16 conversion, from rust-postgres -trait FromUsize: Sized { - fn from_usize(x: usize) -> Result; -} - -macro_rules! from_usize { - ($t:ty) => { - impl FromUsize for $t { - #[inline] - fn from_usize(x: usize) -> io::Result<$t> { - if x > <$t>::max_value() as usize { - Err(io::Error::new( - io::ErrorKind::InvalidInput, - "value too large to transmit", - )) - } else { - Ok(x as $t) - } - } - } - }; -} - -from_usize!(i32); - /// Call f() to write body of the message and prepend it with 4-byte len as /// prescribed by the protocol. -fn write_body(buf: &mut BytesMut, f: F) -> io::Result<()> -where - F: FnOnce(&mut BytesMut) -> io::Result<()>, -{ +fn write_body(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R { let base = buf.len(); buf.extend_from_slice(&[0; 4]); - f(buf)?; + let res = f(buf); - let size = i32::from_usize(buf.len() - base)?; + let size = i32::try_from(buf.len() - base).expect("message too big to transmit"); (&mut buf[base..]).put_slice(&size.to_be_bytes()); - Ok(()) + + res } /// Safe write of s into buf as cstring (String in the protocol). -pub fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { +fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { if s.contains(&0) { return Err(io::Error::new( io::ErrorKind::InvalidInput, @@ -608,15 +523,11 @@ pub fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { Ok(()) } -// Truncate 0 from C string in Bytes and stringify it (returns slice, no allocations) -// PG protocol strings are always C strings. -fn cstr_to_str(b: &Bytes) -> Result<&str> { - let without_null = if b.last() == Some(&0) { - &b[..b.len() - 1] - } else { - &b[..] - }; - std::str::from_utf8(without_null).map_err(|e| e.into()) +fn read_cstr(buf: &mut Bytes) -> anyhow::Result { + let pos = buf.iter().position(|x| *x == 0); + let result = buf.split_to(pos.context("missing terminator")?); + buf.advance(1); // drop the null terminator + Ok(result) } impl<'a> BeMessage<'a> { @@ -631,18 +542,14 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'R'); write_body(buf, |buf| { buf.put_i32(0); // Specifies that the authentication was successful. - Ok::<_, io::Error>(()) - }) - .unwrap(); // write into BytesMut can't fail + }); } BeMessage::AuthenticationCleartextPassword => { buf.put_u8(b'R'); write_body(buf, |buf| { buf.put_i32(3); // Specifies that clear text password is required. - Ok::<_, io::Error>(()) - }) - .unwrap(); // write into BytesMut can't fail + }); } BeMessage::AuthenticationMD5Password(salt) => { @@ -650,9 +557,7 @@ impl<'a> BeMessage<'a> { write_body(buf, |buf| { buf.put_i32(5); // Specifies that an MD5-encrypted password is required. 
buf.put_slice(&salt[..]); - Ok::<_, io::Error>(()) - }) - .unwrap(); // write into BytesMut can't fail + }); } BeMessage::AuthenticationSasl(msg) => { @@ -677,8 +582,7 @@ impl<'a> BeMessage<'a> { } } Ok::<_, io::Error>(()) - }) - .unwrap() + })?; } BeMessage::BackendKeyData(key_data) => { @@ -686,77 +590,64 @@ impl<'a> BeMessage<'a> { write_body(buf, |buf| { buf.put_i32(key_data.backend_pid); buf.put_i32(key_data.cancel_key); - Ok(()) - }) - .unwrap(); + }); } BeMessage::BindComplete => { buf.put_u8(b'2'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::CloseComplete => { buf.put_u8(b'3'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::CommandComplete(cmd) => { buf.put_u8(b'C'); - write_body(buf, |buf| { - write_cstr(cmd, buf)?; - Ok::<_, io::Error>(()) - })?; + write_body(buf, |buf| write_cstr(cmd, buf))?; } BeMessage::CopyData(data) => { buf.put_u8(b'd'); write_body(buf, |buf| { buf.put_slice(data); - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::CopyDone => { buf.put_u8(b'c'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::CopyFail => { buf.put_u8(b'f'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::CopyInResponse => { buf.put_u8(b'G'); write_body(buf, |buf| { - buf.put_u8(1); /* copy_is_binary */ - buf.put_i16(0); /* numAttributes */ - Ok::<_, io::Error>(()) - }) - .unwrap(); + buf.put_u8(1); // copy_is_binary + buf.put_i16(0); // numAttributes + }); } BeMessage::CopyOutResponse => { buf.put_u8(b'H'); write_body(buf, |buf| { - buf.put_u8(0); /* copy_is_binary */ - buf.put_i16(0); /* numAttributes */ - Ok::<_, io::Error>(()) - }) - .unwrap(); + buf.put_u8(0); // copy_is_binary + buf.put_i16(0); // numAttributes + }); } BeMessage::CopyBothResponse => { buf.put_u8(b'W'); write_body(buf, |buf| { // doesn't matter, used only for replication - buf.put_u8(0); /* copy_is_binary */ - buf.put_i16(0); /* numAttributes */ - Ok::<_, io::Error>(()) - }) - .unwrap(); + buf.put_u8(0); // copy_is_binary + buf.put_i16(0); // numAttributes + }); } BeMessage::DataRow(vals) => { @@ -771,9 +662,7 @@ impl<'a> BeMessage<'a> { buf.put_i32(-1); } } - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } // ErrorResponse is a zero-terminated array of zero-terminated fields. @@ -788,18 +677,17 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'E'); write_body(buf, |buf| { buf.put_u8(b'S'); // severity - write_cstr(&Bytes::from("ERROR"), buf)?; + buf.put_slice(b"ERROR\0"); buf.put_u8(b'C'); // SQLSTATE error code - write_cstr(&Bytes::from("CXX000"), buf)?; + buf.put_slice(b"CXX000\0"); buf.put_u8(b'M'); // the message write_cstr(error_msg.as_bytes(), buf)?; buf.put_u8(0); // terminator Ok::<_, io::Error>(()) - }) - .unwrap(); + })?; } // NoticeResponse has the same format as ErrorResponse. 
From doc: "The frontend should display the @@ -812,23 +700,22 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'N'); write_body(buf, |buf| { buf.put_u8(b'S'); // severity - write_cstr(&Bytes::from("NOTICE"), buf)?; + buf.put_slice(b"NOTICE\0"); buf.put_u8(b'C'); // SQLSTATE error code - write_cstr(&Bytes::from("CXX000"), buf)?; + buf.put_slice(b"CXX000\0"); buf.put_u8(b'M'); // the message write_cstr(error_msg.as_bytes(), buf)?; buf.put_u8(0); // terminator Ok::<_, io::Error>(()) - }) - .unwrap(); + })?; } BeMessage::NoData => { buf.put_u8(b'n'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::EncryptionResponse(should_negotiate) => { @@ -853,9 +740,7 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'S'); write_body(buf, |buf| { buf.put_slice(&buffer[..cnt]); - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::ParameterDescription => { @@ -863,23 +748,19 @@ impl<'a> BeMessage<'a> { write_body(buf, |buf| { // we don't support params, so always 0 buf.put_i16(0); - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::ParseComplete => { buf.put_u8(b'1'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::ReadyForQuery => { buf.put_u8(b'Z'); write_body(buf, |buf| { buf.put_u8(b'I'); - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::RowDescription(rows) => { @@ -907,9 +788,7 @@ impl<'a> BeMessage<'a> { buf.put_u64(body.wal_end); buf.put_i64(body.timestamp); buf.put_slice(body.data); - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::KeepAlive(req) => { @@ -918,10 +797,8 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'k'); buf.put_u64(req.sent_ptr); buf.put_i64(req.timestamp); - buf.put_u8(if req.request_reply { 1u8 } else { 0u8 }); - Ok::<_, io::Error>(()) - }) - .unwrap(); + buf.put_u8(if req.request_reply { 1 } else { 0 }); + }); } } Ok(()) @@ -968,17 +845,17 @@ impl ReplicationFeedback { // value itself pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> { buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys - write_cstr(&Bytes::from("current_timeline_size"), buf)?; + buf.put_slice(b"current_timeline_size\0"); buf.put_i32(8); buf.put_u64(self.current_timeline_size); - write_cstr(&Bytes::from("ps_writelsn"), buf)?; + buf.put_slice(b"ps_writelsn\0"); buf.put_i32(8); buf.put_u64(self.ps_writelsn); - write_cstr(&Bytes::from("ps_flushlsn"), buf)?; + buf.put_slice(b"ps_flushlsn\0"); buf.put_i32(8); buf.put_u64(self.ps_flushlsn); - write_cstr(&Bytes::from("ps_applylsn"), buf)?; + buf.put_slice(b"ps_applylsn\0"); buf.put_i32(8); buf.put_u64(self.ps_applylsn); @@ -988,7 +865,7 @@ impl ReplicationFeedback { .expect("failed to serialize pg_replytime earlier than PG_EPOCH") .as_micros() as i64; - write_cstr(&Bytes::from("ps_replytime"), buf)?; + buf.put_slice(b"ps_replytime\0"); buf.put_i32(8); buf.put_i64(timestamp); Ok(()) @@ -998,33 +875,30 @@ impl ReplicationFeedback { pub fn parse(mut buf: Bytes) -> ReplicationFeedback { let mut zf = ReplicationFeedback::empty(); let nfields = buf.get_u8(); - let mut i = 0; - while i < nfields { - i += 1; - let key_cstr = read_null_terminated(&mut buf).unwrap(); - let key = cstr_to_str(&key_cstr).unwrap(); - match key { - "current_timeline_size" => { + for _ in 0..nfields { + let key = read_cstr(&mut buf).unwrap(); + match key.as_ref() { + b"current_timeline_size" => { let len = buf.get_i32(); assert_eq!(len, 8); zf.current_timeline_size = buf.get_u64(); } - "ps_writelsn" => { + b"ps_writelsn" => { let len = buf.get_i32(); assert_eq!(len, 8); 
zf.ps_writelsn = buf.get_u64(); } - "ps_flushlsn" => { + b"ps_flushlsn" => { let len = buf.get_i32(); assert_eq!(len, 8); zf.ps_flushlsn = buf.get_u64(); } - "ps_applylsn" => { + b"ps_applylsn" => { let len = buf.get_i32(); assert_eq!(len, 8); zf.ps_applylsn = buf.get_u64(); } - "ps_replytime" => { + b"ps_replytime" => { let len = buf.get_i32(); assert_eq!(len, 8); let raw_time = buf.get_i64(); @@ -1037,8 +911,8 @@ impl ReplicationFeedback { _ => { let len = buf.get_i32(); warn!( - "ReplicationFeedback parse. unknown key {} of len {}. Skip it.", - key, len + "ReplicationFeedback parse. unknown key {} of len {len}. Skip it.", + String::from_utf8_lossy(key.as_ref()) ); buf.advance(len as usize); } @@ -1084,7 +958,7 @@ mod tests { *first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1; } - write_cstr(&Bytes::from("new_field_one"), &mut data).unwrap(); + data.put_slice(b"new_field_one\0"); data.put_i32(8); data.put_u64(42); From 0c8ee6bd1d2ffd8f16fe0c34b4b16c8266b4fb9a Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Thu, 25 Aug 2022 09:46:52 +0200 Subject: [PATCH 0683/1022] Add postgis & plv8 extensions (#2298) * Add postgis & plv8 extensions * Update Dockerfile & Fix typo's * Update dockerfile * Update Dockerfile * Update dockerfile * Use plv8 step * Reduce giga layer * Reduce layer size further * Prepare for rollout * Fix dependency * Pass on correct build tag * No longer dependent on building tools * Use version from vendor * Revert "Use version from vendor" This reverts commit 7c6670c477efa0822907b853df1221909213cf88. * Revert and push correct set * Add configure step for new approach * Re-add configure flags Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 9 +-- Dockerfile.compute-node | 93 ++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 7 deletions(-) create mode 100644 Dockerfile.compute-node diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index dab34c84bc..71b9e8d803 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -472,10 +472,6 @@ jobs: compute-node-image: runs-on: dev container: gcr.io/kaniko-project/executor:v1.9.0-debug - # note: This image depends on neondatabase/compute-tools:latest (or :thisversion), - # which isn't available until after the image is promoted. - # Ergo, we must explicitly build and promote compute-tools separately. - needs: [ compute-tools-image ] steps: - name: Checkout @@ -487,9 +483,8 @@ jobs: - name: Configure ECR login run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Kaniko build compute node - working-directory: ./vendor/postgres/ - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg=TAG=$GITHUB_RUN_ID --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID + - name: Kaniko build compute node with extensions + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . 
--dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID promote-images: runs-on: dev diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node new file mode 100644 index 0000000000..97c070d11e --- /dev/null +++ b/Dockerfile.compute-node @@ -0,0 +1,93 @@ +ARG TAG=pinned + +FROM debian:bullseye-slim AS build-deps +RUN apt update && \ + apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ + libcurl4-openssl-dev libossp-uuid-dev + +# Build Postgres from the neon postgres repository. +FROM build-deps AS pg-build +COPY vendor/postgres postgres +RUN cd postgres && \ + ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ + # Install headers + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install + +# Build PostGIS from the upstream PostGIS mirror. PostGIS compiles against neon postgres sources without changes. +# Perhaps we could even use the upstream binaries, compiled against vanilla Postgres, but it would require some +# investigation to check that it works, and also keeps working in the future. So for now, we compile our own binaries. +FROM build-deps AS postgis-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +RUN apt update && \ + apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget + +RUN wget https://download.osgeo.org/postgis/source/postgis-3.2.3.tar.gz && \ + tar xvzf postgis-3.2.3.tar.gz && \ + cd postgis-3.2.3 && \ + ./autogen.sh && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + ./configure && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + cd extensions/postgis && \ + make clean && \ + make -j $(getconf _NPROCESSORS_ONLN) install + +# Build plv8 +FROM build-deps AS plv8-build +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +RUN apt update && \ + apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev + +# https://github.com/plv8/plv8/issues/475 +# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary +RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ + echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + apt update && \ + apt install -y --no-install-recommends -t testing binutils + +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.3.tar.gz && \ + tar xvzf v3.1.3.tar.gz && \ + cd plv8-3.1.3 && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + make && \ + make install && \ + rm -rf /plv8-* + +# Compile and run the Neon-specific `compute_ctl` binary +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools +USER nonroot +COPY --chown=nonroot compute_tools compute_tools +COPY --chown=nonroot workspace_hack workspace_hack +RUN cd compute_tools && cargo build --release + +# Put it all together into the final image +FROM debian:bullseye-slim +# Add user postgres +RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ + echo "postgres:test_console_pass" | chpasswd && \ + mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ + chown -R postgres:postgres /var/db/postgres && \ + chmod 0750 /var/db/postgres/compute && \ + echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig + +# TODO: 
Check if we can make the extension setup more modular versus a linear build +# currently plv8-build copies the output /usr/local/pgsql from postgis-build# +COPY --from=plv8-build --chown=postgres /usr/local/pgsql /usr/local/pgsql +COPY --from=compute-tools --chown=postgres /home/nonroot/compute_tools/target/release/compute_ctl /usr/local/bin/compute_ctl + +RUN apt update && \ + apt install -y libreadline-dev libossp-uuid-dev gdal-bin libgdal-dev libprotobuf-c-dev && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# Debian bullseye provides GLIBC 2.31 when 2.34 is necessary as we compiled plv8 with that version +RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ + echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + apt update && \ + apt install -y --no-install-recommends -t testing binutils && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +ENV PATH=/usr/local/pgsql/bin:$PATH +USER postgres +ENTRYPOINT ["/usr/local/bin/compute_ctl"] \ No newline at end of file From 344db0b4aa5aaecc9b23479eefc17c234e00acfa Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Thu, 25 Aug 2022 11:17:09 +0200 Subject: [PATCH 0684/1022] Re-add temporary symlink (#2331) Co-authored-by: Rory de Zoete --- Dockerfile.compute-node | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 97c070d11e..b5e639d5d6 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -88,6 +88,9 @@ RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.lis apt install -y --no-install-recommends -t testing binutils && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +# "temporary" symlink for compatibility with old control-plane +RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl + ENV PATH=/usr/local/pgsql/bin:$PATH USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] \ No newline at end of file From f67d109e6ea0dd554c0d6288362a7be0ddc60460 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Thu, 25 Aug 2022 14:35:01 +0200 Subject: [PATCH 0685/1022] Copy binaries to /usr/local (#2335) * Add extra symlink * Take other approach Co-authored-by: Rory de Zoete --- Dockerfile.compute-node | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index b5e639d5d6..117a4155cd 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -74,7 +74,7 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ # TODO: Check if we can make the extension setup more modular versus a linear build # currently plv8-build copies the output /usr/local/pgsql from postgis-build# -COPY --from=plv8-build --chown=postgres /usr/local/pgsql /usr/local/pgsql +COPY --from=plv8-build --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/compute_tools/target/release/compute_ctl /usr/local/bin/compute_ctl RUN apt update && \ @@ -88,9 +88,8 @@ RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.lis apt install -y --no-install-recommends -t testing binutils && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* -# "temporary" symlink for compatibility with old control-plane +# "temporary" symlink for old control-plane RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl -ENV PATH=/usr/local/pgsql/bin:$PATH USER postgres ENTRYPOINT 
["/usr/local/bin/compute_ctl"] \ No newline at end of file From c952f022bb4b3703ee8bc20604e2cda34c84128d Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 25 Aug 2022 13:29:37 +0300 Subject: [PATCH 0686/1022] waldecoder: fix comment --- libs/postgres_ffi/src/waldecoder.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index 768e79621d..b509fc87a5 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -170,6 +170,7 @@ impl WalStreamDecoder { } State::SkippingEverything { .. } => {} } + // now read page contents match &mut self.state { State::WaitingForRecord => { // need to have at least the xl_tot_len field @@ -194,8 +195,8 @@ impl WalStreamDecoder { return Ok(Some(self.complete_record(recordbuf)?)); } else { // Need to assemble the record from pieces. Remember the size of the - // record, and loop back. On next iteration, we will reach the 'else' - // branch below, and copy the part of the record that was on this page + // record, and loop back. On next iterations, we will reach the branch + // below, and copy the part of the record that was on this or next page(s) // to 'recordbuf'. Subsequent iterations will skip page headers, and // append the continuations from the next pages to 'recordbuf'. self.state = State::ReassemblingRecord { From bc588f3a533b7e39f144875a8ac38775204ce2dc Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 25 Aug 2022 16:17:32 +0200 Subject: [PATCH 0687/1022] Update WAL redo histograms (#2323) Previously, it could only distinguish REDO task durations down to 5ms, which equates to approx. 200pages/sec or 1.6MB/sec getpage@LSN traffic. This patch improves to 200'000 pages/sec or 1.6GB/sec, allowing for much more precise performance measurement of the redo process. --- pageserver/src/walredo.rs | 46 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 9cf347573a..bf48bd1759 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -89,15 +89,52 @@ pub trait WalRedoManager: Send + Sync { // for access to the postgres process ('wait') since there is only one for // each tenant. +/// Time buckets are small because we want to be able to measure the +/// smallest redo processing times. These buckets allow us to measure down +/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec. +/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec. +macro_rules! redo_histogram_time_buckets { + () => { + vec![ + 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000, + 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, + ] + }; +} + +/// While we're at it, also measure the amount of records replayed in each +/// operation. We have a global 'total replayed' counter, but that's not +/// as useful as 'what is the skew for how many records we replay in one +/// operation'. +macro_rules! 
redo_histogram_count_buckets { + () => { + vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0] + }; +} + static WAL_REDO_TIME: Lazy = Lazy::new(|| { - register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo") - .expect("failed to define a metric") + register_histogram!( + "pageserver_wal_redo_seconds", + "Time spent on WAL redo", + redo_histogram_time_buckets!() + ) + .expect("failed to define a metric") }); static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_wait_seconds", - "Time spent waiting for access to the WAL redo process" + "Time spent waiting for access to the WAL redo process", + redo_histogram_time_buckets!(), + ) + .expect("failed to define a metric") +}); + +static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_wal_redo_records_histogram", + "Histogram of number of records replayed per redo", + redo_histogram_count_buckets!(), ) .expect("failed to define a metric") }); @@ -262,7 +299,10 @@ impl PostgresRedoManager { let end_time = Instant::now(); let duration = end_time.duration_since(lock_time); + WAL_REDO_TIME.observe(duration.as_secs_f64()); + WAL_REDO_RECORDS_HISTOGRAM.observe(records.len() as f64); + debug!( "postgres applied {} WAL records in {} us to reconstruct page image at LSN {}", records.len(), From 04a018a5b12735ef3e4a80bdaa2cdb619024cb0e Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 25 Aug 2022 18:48:09 +0200 Subject: [PATCH 0688/1022] Extract neon and neon_test_utils from postgres repo (#2325) * Extract neon and neon_test_utils from postgres repo * Remove neon from vendored postgres repo, and fix build_and_test.yml * Move EmitWarningsOnPlaceholders to end of _PG_init in neon.c (from libpagestore.c) * Fix Makefile location comments * remove Makefile EXTRA_INSTALL flag * Update Dockerfile.compute-node to build and include the neon extension --- .github/workflows/build_and_test.yml | 4 + .github/workflows/codestyle.yml | 3 + Dockerfile.compute-node | 17 +- Makefile | 26 +- pgxn/neon/Makefile | 26 + pgxn/neon/inmem_smgr.c | 286 ++ pgxn/neon/libpagestore.c | 432 +++ pgxn/neon/libpqwalproposer.c | 413 +++ pgxn/neon/neon--1.0.sql | 17 + pgxn/neon/neon.c | 82 + pgxn/neon/neon.control | 4 + pgxn/neon/neon.h | 19 + pgxn/neon/pagestore_client.h | 221 ++ pgxn/neon/pagestore_smgr.c | 1696 ++++++++++++ pgxn/neon/relsize_cache.c | 167 ++ pgxn/neon/walproposer.c | 2403 +++++++++++++++++ pgxn/neon/walproposer.h | 540 ++++ pgxn/neon/walproposer_utils.c | 1110 ++++++++ pgxn/neon/walproposer_utils.h | 19 + pgxn/neon_test_utils/Makefile | 15 + pgxn/neon_test_utils/neon_test_utils--1.0.sql | 29 + pgxn/neon_test_utils/neon_test_utils.control | 5 + pgxn/neon_test_utils/neontest.c | 304 +++ vendor/postgres | 2 +- 24 files changed, 7830 insertions(+), 10 deletions(-) create mode 100644 pgxn/neon/Makefile create mode 100644 pgxn/neon/inmem_smgr.c create mode 100644 pgxn/neon/libpagestore.c create mode 100644 pgxn/neon/libpqwalproposer.c create mode 100644 pgxn/neon/neon--1.0.sql create mode 100644 pgxn/neon/neon.c create mode 100644 pgxn/neon/neon.control create mode 100644 pgxn/neon/neon.h create mode 100644 pgxn/neon/pagestore_client.h create mode 100644 pgxn/neon/pagestore_smgr.c create mode 100644 pgxn/neon/relsize_cache.c create mode 100644 pgxn/neon/walproposer.c create mode 100644 pgxn/neon/walproposer.h create mode 100644 pgxn/neon/walproposer_utils.c create mode 100644 pgxn/neon/walproposer_utils.h create mode 100644 pgxn/neon_test_utils/Makefile create mode 100644 
pgxn/neon_test_utils/neon_test_utils--1.0.sql create mode 100644 pgxn/neon_test_utils/neon_test_utils.control create mode 100644 pgxn/neon_test_utils/neontest.c diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 71b9e8d803..6e570b22d4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -136,6 +136,10 @@ jobs: run: mold -run make postgres -j$(nproc) shell: bash -euxo pipefail {0} + - name: Build neon extensions + run: mold -run make neon-pg-ext -j$(nproc) + shell: bash -euxo pipefail {0} + - name: Run cargo build run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 029beba351..eddfee88fc 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -81,6 +81,9 @@ jobs: if: steps.cache_pg.outputs.cache-hit != 'true' run: make postgres + - name: Build neon extensions + run: make neon-pg-ext + # Plain configure output can contain weird errors like 'error: C compiler cannot create executables' # and the real cause will be inside config.log - name: Print configure logs in case of failure diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 117a4155cd..4527fb9ece 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -13,7 +13,8 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ # Install headers - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install # Build PostGIS from the upstream PostGIS mirror. PostGIS compiles against neon postgres sources without changes. 
# Perhaps we could even use the upstream binaries, compiled against vanilla Postgres, but it would require some @@ -55,6 +56,16 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.3.tar.gz && \ make install && \ rm -rf /plv8-* +# compile neon extensions +FROM build-deps AS neon-pg-ext-build +COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY pgxn/ pgxn/ + +RUN make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/neon \ + -s install + # Compile and run the Neon-specific `compute_ctl` binary FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools USER nonroot @@ -73,8 +84,8 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig # TODO: Check if we can make the extension setup more modular versus a linear build -# currently plv8-build copies the output /usr/local/pgsql from postgis-build# -COPY --from=plv8-build --chown=postgres /usr/local/pgsql /usr/local +# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc# +COPY --from=neon-pg-ext-build --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/compute_tools/target/release/compute_ctl /usr/local/bin/compute_ctl RUN apt update && \ diff --git a/Makefile b/Makefile index fc75e9fc5e..9d7e1497e5 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,7 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 # Top level Makefile to build Zenith and PostgreSQL # .PHONY: all -all: zenith postgres +all: zenith postgres neon-pg-ext ### Zenith Rust bits # @@ -87,25 +87,39 @@ postgres: postgres-configure \ postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers` +@echo "Compiling PostgreSQL" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install - +@echo "Compiling contrib/neon" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon install - +@echo "Compiling contrib/neon_test_utils" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon_test_utils install + +@echo "Compiling libpq" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq install +@echo "Compiling pg_buffercache" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install +@echo "Compiling pageinspect" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install - .PHONY: postgres-clean postgres-clean: $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq clean + +neon-pg-ext: postgres + +@echo "Compiling neon" + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \ + -C $(ROOT_PROJECT_DIR)/pgxn/neon install + +@echo "Compiling neon_test_utils" + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \ + -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils install + +.PHONY: neon-pg-ext-clean + $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean + $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean # This doesn't remove the effects of 'configure'. 
.PHONY: clean clean: cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean $(CARGO_CMD_PREFIX) cargo clean + cd pgxn/neon && $(MAKE) clean + cd pgxn/neon_test_utils && $(MAKE) clean # This removes everything .PHONY: distclean diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile new file mode 100644 index 0000000000..a6ce611974 --- /dev/null +++ b/pgxn/neon/Makefile @@ -0,0 +1,26 @@ +# pgxs/neon/Makefile + + +MODULE_big = neon +OBJS = \ + $(WIN32RES) \ + inmem_smgr.o \ + libpagestore.o \ + libpqwalproposer.o \ + pagestore_smgr.o \ + relsize_cache.o \ + neon.o \ + walproposer.o \ + walproposer_utils.o + +PG_CPPFLAGS = -I$(libpq_srcdir) +SHLIB_LINK_INTERNAL = $(libpq) + +EXTENSION = neon +DATA = neon--1.0.sql +PGFILEDESC = "neon - cloud storage for PostgreSQL" + + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/pgxn/neon/inmem_smgr.c b/pgxn/neon/inmem_smgr.c new file mode 100644 index 0000000000..7840292b08 --- /dev/null +++ b/pgxn/neon/inmem_smgr.c @@ -0,0 +1,286 @@ +/*------------------------------------------------------------------------- + * + * inmem_smgr.c + * + * This is an implementation of the SMGR interface, used in the WAL redo + * process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent + * storage, the pages that are written out are kept in a small number of + * in-memory buffers. + * + * Normally, replaying a WAL record only needs to access a handful of + * buffers, which fit in the normal buffer cache, so this is just for + * "overflow" storage when the buffer cache is not large enough. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * contrib/neon/inmem_smgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "pagestore_client.h" +#include "storage/block.h" +#include "storage/buf_internals.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" + +/* Size of the in-memory smgr */ +#define MAX_PAGES 64 + +/* If more than WARN_PAGES are used, print a warning in the log */ +#define WARN_PAGES 32 + +static BufferTag page_tag[MAX_PAGES]; +static char page_body[MAX_PAGES][BLCKSZ]; +static int used_pages; + +static int +locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno) +{ + /* We only hold a small number of pages, so linear search */ + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum + && blkno == page_tag[i].blockNum) + { + return i; + } + } + return -1; +} + +/* + * inmem_init() -- Initialize private state + */ +void +inmem_init(void) +{ + used_pages = 0; +} + +/* + * inmem_exists() -- Does the physical file exist? + */ +bool +inmem_exists(SMgrRelation reln, ForkNumber forknum) +{ + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum) + { + return true; + } + } + return false; +} + +/* + * inmem_create() -- Create a new relation on zenithd storage + * + * If isRedo is true, it's okay for the relation to exist already. + */ +void +inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_unlink() -- Unlink a relation. 
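/*
 * To make the limits above concrete (assuming the default 8 kB BLCKSZ;
 * the block size is configurable at build time):
 *
 *   MAX_PAGES  * BLCKSZ = 64 * 8192 bytes = 512 kB of overflow storage total
 *   WARN_PAGES * BLCKSZ = 32 * 8192 bytes = 256 kB in use before writes
 *                                           start being logged at WARNING
 */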
+ */ +void +inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_extend() -- Add a block to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +void +inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer, bool skipFsync) +{ + /* same as smgwrite() for us */ + inmem_write(reln, forknum, blkno, buffer, skipFsync); +} + +/* + * inmem_open() -- Initialize newly-opened relation. + */ +void +inmem_open(SMgrRelation reln) +{ +} + +/* + * inmem_close() -- Close the specified relation, if it isn't closed already. + */ +void +inmem_close(SMgrRelation reln, ForkNumber forknum) +{ +} + +/* + * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + return true; +} + +/* + * inmem_writeback() -- Tell the kernel to write pages back to storage. + */ +void +inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ +} + +/* + * inmem_read() -- Read the specified block from a relation. + */ +void +inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer) +{ + int pg; + + pg = locate_page(reln, forknum, blkno); + if (pg < 0) + memset(buffer, 0, BLCKSZ); + else + memcpy(buffer, page_body[pg], BLCKSZ); +} + +/* + * inmem_write() -- Write the supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). + */ +void +inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) +{ + int pg; + + pg = locate_page(reln, forknum, blocknum); + if (pg < 0) + { + /* + * We assume the buffer cache is large enough to hold all the buffers + * needed for most operations. Overflowing to this "in-mem smgr" in rare + * cases is OK. But if we find that we're using more than WARN_PAGES, + * print a warning so that we get alerted and get to investigate why + * we're accessing so many buffers. + */ + elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1, + "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); + if (used_pages == MAX_PAGES) + elog(ERROR, "Inmem storage overflow"); + + pg = used_pages; + used_pages++; + INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); + } else { + elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); + } + memcpy(page_body[pg], buffer, BLCKSZ); +} + +/* + * inmem_nblocks() -- Get the number of blocks stored in a relation. + */ +BlockNumber +inmem_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + /* + * It's not clear why a WAL redo function would call smgrnblocks(). 
+ * During recovery, at least before reaching consistency, the size of a + * relation could be arbitrarily small, if it was truncated after the + * record being replayed, or arbitrarily large if it was extended + * afterwards. But one place where it's called is in + * XLogReadBufferExtended(): it extends the relation, if it's smaller than + * the requested page. That's a waste of time in the WAL redo + * process. Pretend that all relations are maximally sized to avoid it. + */ + return MaxBlockNumber; +} + +/* + * inmem_truncate() -- Truncate relation to specified number of blocks. + */ +void +inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ +} + +/* + * inmem_immedsync() -- Immediately sync a relation to stable storage. + */ +void +inmem_immedsync(SMgrRelation reln, ForkNumber forknum) +{ +} + +static const struct f_smgr inmem_smgr = +{ + .smgr_init = inmem_init, + .smgr_shutdown = NULL, + .smgr_open = inmem_open, + .smgr_close = inmem_close, + .smgr_create = inmem_create, + .smgr_exists = inmem_exists, + .smgr_unlink = inmem_unlink, + .smgr_extend = inmem_extend, + .smgr_prefetch = inmem_prefetch, + .smgr_read = inmem_read, + .smgr_write = inmem_write, + .smgr_writeback = inmem_writeback, + .smgr_nblocks = inmem_nblocks, + .smgr_truncate = inmem_truncate, + .smgr_immedsync = inmem_immedsync, +}; + +const f_smgr * +smgr_inmem(BackendId backend, RelFileNode rnode) +{ + Assert(InRecovery); + if (backend != InvalidBackendId) + return smgr_standard(backend, rnode); + else + return &inmem_smgr; +} + +void +smgr_init_inmem() +{ + inmem_init(); +} diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c new file mode 100644 index 0000000000..649fc1037e --- /dev/null +++ b/pgxn/neon/libpagestore.c @@ -0,0 +1,432 @@ +/*------------------------------------------------------------------------- + * + * libpagestore.c + * Handles network communications with the remote pagestore. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/libpqpagestore.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pagestore_client.h" +#include "fmgr.h" +#include "access/xlog.h" + +#include "libpq-fe.h" +#include "libpq/pqformat.h" +#include "libpq/libpq.h" + +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/guc.h" + +#include "neon.h" +#include "walproposer.h" +#include "walproposer_utils.h" + + +#define PageStoreTrace DEBUG5 + +#define NEON_TAG "[NEON_SMGR] " +#define neon_log(tag, fmt, ...) 
ereport(tag, \ + (errmsg(NEON_TAG fmt, ## __VA_ARGS__), \ + errhidestmt(true), errhidecontext(true))) + +bool connected = false; +PGconn *pageserver_conn = NULL; + +char *page_server_connstring_raw; + +static ZenithResponse *pageserver_call(ZenithRequest *request); +page_server_api api = { + .request = pageserver_call +}; + +static void +pageserver_connect() +{ + char *query; + int ret; + + Assert(!connected); + + pageserver_conn = PQconnectdb(page_server_connstring); + + if (PQstatus(pageserver_conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + pageserver_conn = NULL; + ereport(ERROR, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "could not establish connection to pageserver"), + errdetail_internal("%s", msg))); + } + + query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); + ret = PQsendQuery(pageserver_conn, query); + if (ret != 1) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + neon_log(ERROR, "could not send pagestream command to pageserver"); + } + + while (PQisBusy(pageserver_conn)) + { + int wc; + + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_EXIT_ON_PM_DEATH, + PQsocket(pageserver_conn), + -1L, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(pageserver_conn)) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + pageserver_conn = NULL; + + neon_log(ERROR, "could not complete handshake with pageserver: %s", + msg); + } + } + } + + neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw); + + connected = true; +} + +/* + * A wrapper around PQgetCopyData that checks for interrupts while sleeping. + */ +static int +call_PQgetCopyData(PGconn *conn, char **buffer) +{ + int ret; + +retry: + ret = PQgetCopyData(conn, buffer, 1 /* async */ ); + + if (ret == 0) + { + int wc; + + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_EXIT_ON_PM_DEATH, + PQsocket(conn), + -1L, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(conn)) + neon_log(ERROR, "could not get response from pageserver: %s", + PQerrorMessage(conn)); + } + + goto retry; + } + + return ret; +} + + +static ZenithResponse * +pageserver_call(ZenithRequest *request) +{ + StringInfoData req_buff; + StringInfoData resp_buff; + ZenithResponse *resp; + + PG_TRY(); + { + /* If the connection was lost for some reason, reconnect */ + if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + + if (!connected) + pageserver_connect(); + + req_buff = zm_pack_request(request); + + /* + * Send request. + * + * In principle, this could block if the output buffer is full, and we + * should use async mode and check for interrupts while waiting. In + * practice, our requests are small enough to always fit in the output + * and TCP buffer. 
+ */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) + { + neon_log(ERROR, "failed to send page request: %s", + PQerrorMessage(pageserver_conn)); + } + pfree(req_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) request); + + neon_log(PageStoreTrace, "sent request: %s", msg); + pfree(msg); + } + + /* read response */ + resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data); + resp_buff.cursor = 0; + + if (resp_buff.len == -1) + neon_log(ERROR, "end of COPY"); + else if (resp_buff.len == -2) + neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + + resp = zm_unpack_response(&resp_buff); + PQfreemem(resp_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) resp); + + neon_log(PageStoreTrace, "got response: %s", msg); + pfree(msg); + } + } + PG_CATCH(); + { + /* + * If anything goes wrong while we were sending a request, it's not + * clear what state the connection is in. For example, if we sent the + * request but didn't receive a response yet, we might receive the + * response some time later after we have already sent a new unrelated + * request. Close the connection to avoid getting confused. + */ + if (connected) + { + neon_log(LOG, "dropping connection to page server due to error"); + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + PG_RE_THROW(); + } + PG_END_TRY(); + + return (ZenithResponse *) resp; +} + + +static bool +check_zenith_id(char **newval, void **extra, GucSource source) +{ + uint8 zid[16]; + + return **newval == '\0' || HexDecodeString(zid, *newval, 16); +} + +static char * +substitute_pageserver_password(const char *page_server_connstring_raw) +{ + char *host = NULL; + char *port = NULL; + char *user = NULL; + char *auth_token = NULL; + char *err = NULL; + char *page_server_connstring = NULL; + PQconninfoOption *conn_options; + PQconninfoOption *conn_option; + MemoryContext oldcontext; + + /* + * Here we substitute password in connection string with an environment + * variable. To simplify things we construct a connection string back with + * only known options. In particular: host port user and password. We do + * not currently use other options and constructing full connstring in an + * URI shape is quite messy. + */ + + if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0') + return NULL; + + /* extract the auth token from the connection string */ + conn_options = PQconninfoParse(page_server_connstring_raw, &err); + if (conn_options == NULL) + { + /* The error string is malloc'd, so we must free it explicitly */ + char *errcopy = err ? pstrdup(err) : "out of memory"; + + PQfreemem(err); + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid connection string syntax: %s", errcopy))); + } + + /* + * Trying to populate pageserver connection string with auth token from + * environment. We are looking for password in with placeholder value like + * $ENV_VAR_NAME, so if password field is present and starts with $ we try + * to fetch environment variable value and fail loudly if it is not set. 
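/*
 * A concrete illustration of the substitution this function performs, using
 * hypothetical values for the host, port, user and variable name: given
 *
 *     neon.pageserver_connstring = 'host=pageserver port=6400 user=cloud_admin password=$NEON_AUTH_TOKEN'
 *
 * and NEON_AUTH_TOKEN=sometoken set in the environment, the string rebuilt
 * by the psprintf() call at the end of this function is
 *
 *     postgresql://cloud_admin:sometoken@pageserver:6400
 */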
+ */ + for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) + { + if (strcmp(conn_option->keyword, "host") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + host = conn_option->val; + } + else if (strcmp(conn_option->keyword, "port") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + port = conn_option->val; + } + else if (strcmp(conn_option->keyword, "user") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + user = conn_option->val; + } + else if (strcmp(conn_option->keyword, "password") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + { + /* ensure that this is a template */ + if (strncmp(conn_option->val, "$", 1) != 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); + + neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]); + auth_token = getenv(&conn_option->val[1]); + if (!auth_token) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]))); + } + else + { + neon_log(LOG, "using auth token from environment passed via env"); + } + } + } + } + + /* + * allocate connection string in TopMemoryContext to make sure it is not + * freed + */ + oldcontext = CurrentMemoryContext; + MemoryContextSwitchTo(TopMemoryContext); + page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? auth_token : "", host, port); + MemoryContextSwitchTo(oldcontext); + + PQconninfoFree(conn_options); + return page_server_connstring; +} + +/* + * Module initialization function + */ +void +pg_init_libpagestore(void) +{ + DefineCustomStringVariable("neon.pageserver_connstring", + "connection string to the page server", + NULL, + &page_server_connstring_raw, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + NULL, NULL, NULL); + + DefineCustomStringVariable("neon.timeline_id", + "Zenith timelineid the server is running on", + NULL, + &zenith_timeline, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_zenith_id, NULL, NULL); + + DefineCustomStringVariable("neon.tenant_id", + "Neon tenantid the server is running on", + NULL, + &zenith_tenant, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_zenith_id, NULL, NULL); + + DefineCustomBoolVariable("neon.wal_redo", + "start in wal-redo mode", + NULL, + &wal_redo, + false, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + + DefineCustomIntVariable("neon.max_cluster_size", + "cluster size limit", + NULL, + &max_cluster_size, + -1, -1, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MB, + NULL, NULL, NULL); + + relsize_hash_init(); + + if (page_server != NULL) + neon_log(ERROR, "libpagestore already loaded"); + + neon_log(PageStoreTrace, "libpagestore already loaded"); + page_server = &api; + + /* substitute password in pageserver_connstring */ + page_server_connstring = substitute_pageserver_password(page_server_connstring_raw); + + /* Is there more correct way to pass CustomGUC to postgres code? 
*/ + zenith_timeline_walproposer = zenith_timeline; + zenith_tenant_walproposer = zenith_tenant; + + if (wal_redo) + { + neon_log(PageStoreTrace, "set inmem_smgr hook"); + smgr_hook = smgr_inmem; + smgr_init_hook = smgr_init_inmem; + } + else if (page_server_connstring && page_server_connstring[0]) + { + neon_log(PageStoreTrace, "set neon_smgr hook"); + smgr_hook = smgr_zenith; + smgr_init_hook = smgr_init_zenith; + dbsize_hook = zenith_dbsize; + } +} diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c new file mode 100644 index 0000000000..2b2b7a1a6a --- /dev/null +++ b/pgxn/neon/libpqwalproposer.c @@ -0,0 +1,413 @@ +#include "postgres.h" + +#include "libpq-fe.h" +#include "neon.h" +#include "walproposer.h" + +/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ +struct WalProposerConn +{ + PGconn* pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received data from libpqprop_async_read */ +}; + +/* Prototypes for exported functions */ +static char* libpqprop_error_message(WalProposerConn* conn); +static WalProposerConnStatusType libpqprop_status(WalProposerConn* conn); +static WalProposerConn* libpqprop_connect_start(char* conninfo); +static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn); +static bool libpqprop_send_query(WalProposerConn* conn, char* query); +static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); +static pgsocket libpqprop_socket(WalProposerConn* conn); +static int libpqprop_flush(WalProposerConn* conn); +static void libpqprop_finish(WalProposerConn* conn); +static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); +static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); +static bool libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size); + +static WalProposerFunctionsType PQWalProposerFunctions = { + libpqprop_error_message, + libpqprop_status, + libpqprop_connect_start, + libpqprop_connect_poll, + libpqprop_send_query, + libpqprop_get_query_result, + libpqprop_socket, + libpqprop_flush, + libpqprop_finish, + libpqprop_async_read, + libpqprop_async_write, + libpqprop_blocking_write, +}; + +/* Module initialization */ +void +pg_init_libpqwalproposer(void) +{ + if (WalProposerFunctions != NULL) + elog(ERROR, "libpqwalproposer already loaded"); + WalProposerFunctions = &PQWalProposerFunctions; +} + +/* Helper function */ +static bool +ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) +{ + /* If we're already correctly blocking or nonblocking, all good */ + if (is_nonblocking == conn->is_nonblocking) + return true; + + /* Otherwise, set it appropriately */ + if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1) + return false; + + conn->is_nonblocking = is_nonblocking; + return true; +} + +/* Exported function definitions */ +static char* +libpqprop_error_message(WalProposerConn* conn) +{ + return PQerrorMessage(conn->pg_conn); +} + +static WalProposerConnStatusType +libpqprop_status(WalProposerConn* conn) +{ + switch (PQstatus(conn->pg_conn)) + { + case CONNECTION_OK: + return WP_CONNECTION_OK; + case CONNECTION_BAD: + return WP_CONNECTION_BAD; + default: + return WP_CONNECTION_IN_PROGRESS; + } +} + +static WalProposerConn* +libpqprop_connect_start(char* conninfo) +{ + WalProposerConn* conn; + PGconn* pg_conn; + + pg_conn = PQconnectStart(conninfo); + /* + * Allocation of a PQconn 
can fail, and will return NULL. We want to fully replicate the + * behavior of PQconnectStart here. + */ + if (!pg_conn) + return NULL; + + /* + * And in theory this allocation can fail as well, but it's incredibly unlikely if we just + * successfully allocated a PGconn. + * + * palloc will exit on failure though, so there's not much we could do if it *did* fail. + */ + conn = palloc(sizeof(WalProposerConn)); + conn->pg_conn = pg_conn; + conn->is_nonblocking = false; /* connections always start in blocking mode */ + conn->recvbuf = NULL; + return conn; +} + +static WalProposerConnectPollStatusType +libpqprop_connect_poll(WalProposerConn* conn) +{ + WalProposerConnectPollStatusType return_val; + + switch (PQconnectPoll(conn->pg_conn)) + { + case PGRES_POLLING_FAILED: + return_val = WP_CONN_POLLING_FAILED; + break; + case PGRES_POLLING_READING: + return_val = WP_CONN_POLLING_READING; + break; + case PGRES_POLLING_WRITING: + return_val = WP_CONN_POLLING_WRITING; + break; + case PGRES_POLLING_OK: + return_val = WP_CONN_POLLING_OK; + break; + + /* There's a comment at its source about this constant being unused. We'll expect it's never + * returned. */ + case PGRES_POLLING_ACTIVE: + elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); + /* This return is never actually reached, but it's here to make the compiler happy */ + return WP_CONN_POLLING_FAILED; + + default: + Assert(false); + return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ + } + + return return_val; +} + +static bool +libpqprop_send_query(WalProposerConn* conn, char* query) +{ + /* We need to be in blocking mode for sending the query to run without + * requiring a call to PQflush */ + if (!ensure_nonblocking_status(conn, false)) + return false; + + /* PQsendQuery returns 1 on success, 0 on failure */ + if (!PQsendQuery(conn->pg_conn, query)) + return false; + + return true; +} + +static WalProposerExecStatusType +libpqprop_get_query_result(WalProposerConn* conn) +{ + PGresult* result; + WalProposerExecStatusType return_val; + + /* Marker variable if we need to log an unexpected success result */ + char* unexpected_success = NULL; + + /* Consume any input that we might be missing */ + if (!PQconsumeInput(conn->pg_conn)) + return WP_EXEC_FAILED; + + if (PQisBusy(conn->pg_conn)) + return WP_EXEC_NEEDS_INPUT; + + + result = PQgetResult(conn->pg_conn); + /* PQgetResult returns NULL only if getting the result was successful & there's no more of the + * result to get. 
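/*
 * A minimal sketch (hypothetical helper, not used elsewhere in this file)
 * of how the functions above are intended to be driven: start the
 * connection, poll until it settles, then send a query and expect the
 * server to switch into CopyBoth mode. The waits between polls are elided
 * here; a real caller blocks on libpqprop_socket() with the usual latch
 * machinery before polling or consuming input again.
 */
static bool
sketch_establish_connection(char *conninfo, char *query)
{
	WalProposerConn *conn = libpqprop_connect_start(conninfo);

	if (conn == NULL || libpqprop_status(conn) == WP_CONNECTION_BAD)
		return false;

	for (;;)
	{
		WalProposerConnectPollStatusType st = libpqprop_connect_poll(conn);

		if (st == WP_CONN_POLLING_OK)
			break;
		if (st == WP_CONN_POLLING_FAILED)
			return false;
		/* WP_CONN_POLLING_READING / _WRITING: wait on the socket, then retry */
	}

	if (!libpqprop_send_query(conn, query))
		return false;

	for (;;)
	{
		WalProposerExecStatusType est = libpqprop_get_query_result(conn);

		if (est == WP_EXEC_NEEDS_INPUT)
			continue;			/* wait for socket readability first */
		return est == WP_EXEC_SUCCESS_COPYBOTH;
	}
}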
*/ + if (!result) + { + elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); + return WP_EXEC_UNEXPECTED_SUCCESS; + } + + /* Helper macro to reduce boilerplate */ + #define UNEXPECTED_SUCCESS(msg) \ + return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ + unexpected_success = msg; \ + break; + + + switch (PQresultStatus(result)) + { + /* "true" success case */ + case PGRES_COPY_BOTH: + return_val = WP_EXEC_SUCCESS_COPYBOTH; + break; + + /* Unexpected success case */ + case PGRES_EMPTY_QUERY: + UNEXPECTED_SUCCESS("empty query return"); + case PGRES_COMMAND_OK: + UNEXPECTED_SUCCESS("data-less command end"); + case PGRES_TUPLES_OK: + UNEXPECTED_SUCCESS("tuples return"); + case PGRES_COPY_OUT: + UNEXPECTED_SUCCESS("'Copy Out' response"); + case PGRES_COPY_IN: + UNEXPECTED_SUCCESS("'Copy In' response"); + case PGRES_SINGLE_TUPLE: + UNEXPECTED_SUCCESS("single tuple return"); + case PGRES_PIPELINE_SYNC: + UNEXPECTED_SUCCESS("pipeline sync point"); + + /* Failure cases */ + case PGRES_BAD_RESPONSE: + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + case PGRES_PIPELINE_ABORTED: + return_val = WP_EXEC_FAILED; + break; + + default: + Assert(false); + return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ + } + + if (unexpected_success) + elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); + + return return_val; +} + +static pgsocket +libpqprop_socket(WalProposerConn* conn) +{ + return PQsocket(conn->pg_conn); +} + +static int +libpqprop_flush(WalProposerConn* conn) +{ + return (PQflush(conn->pg_conn)); +} + +static void +libpqprop_finish(WalProposerConn* conn) +{ + if (conn->recvbuf != NULL) + PQfreemem(conn->recvbuf); + PQfinish(conn->pg_conn); + pfree(conn); +} + +/* + * Receive a message from the safekeeper. + * + * On success, the data is placed in *buf. It is valid until the next call + * to this function. + */ +static PGAsyncReadResult +libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) +{ + int result; + + if (conn->recvbuf != NULL) + { + PQfreemem(conn->recvbuf); + conn->recvbuf = NULL; + } + + /* Call PQconsumeInput so that we have the data we need */ + if (!PQconsumeInput(conn->pg_conn)) + { + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + + /* The docs for PQgetCopyData list the return values as: + * 0 if the copy is still in progress, but no "complete row" is + * available + * -1 if the copy is done + * -2 if an error occured + * (> 0) if it was successful; that value is the amount transferred. + * + * The protocol we use between walproposer and safekeeper means that we + * *usually* wouldn't expect to see that the copy is done, but this can + * sometimes be triggered by the server returning an ErrorResponse (which + * also happens to have the effect that the copy is done). + */ + switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true)) + { + case 0: + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_TRY_AGAIN; + case -1: + { + /* + * If we get -1, it's probably because of a server error; the + * safekeeper won't normally send a CopyDone message. 
+ * + * We can check PQgetResult to make sure that the server failed; + * it'll always result in PGRES_FATAL_ERROR + */ + ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); + + if (status != PGRES_FATAL_ERROR) + elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + + /* If there was actually an error, it'll be properly reported by + * calls to PQerrorMessage -- we don't have to do anything else */ + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + case -2: + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + default: + /* Positive values indicate the size of the returned result */ + *amount = result; + *buf = conn->recvbuf; + return PG_ASYNC_READ_SUCCESS; + } +} + +static PGAsyncWriteResult +libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) +{ + int result; + + /* If we aren't in non-blocking mode, switch to it. */ + if (!ensure_nonblocking_status(conn, true)) + return PG_ASYNC_WRITE_FAIL; + + /* The docs for PQputcopyData list the return values as: + * 1 if the data was queued, + * 0 if it was not queued because of full buffers, or + * -1 if an error occured + */ + result = PQputCopyData(conn->pg_conn, buf, size); + + /* We won't get a result of zero because walproposer always empties the + * connection's buffers before sending more */ + Assert(result != 0); + + switch (result) + { + case 1: + /* good -- continue */ + break; + case -1: + return PG_ASYNC_WRITE_FAIL; + default: + elog(FATAL, "invalid return %d from PQputCopyData", result); + } + + /* After queueing the data, we still need to flush to get it to send. + * This might take multiple tries, but we don't want to wait around + * until it's done. + * + * PQflush has the following returns (directly quoting the docs): + * 0 if sucessful, + * 1 if it was unable to send all the data in the send queue yet + * -1 if it failed for some reason + */ + switch (result = PQflush(conn->pg_conn)) { + case 0: + return PG_ASYNC_WRITE_SUCCESS; + case 1: + return PG_ASYNC_WRITE_TRY_FLUSH; + case -1: + return PG_ASYNC_WRITE_FAIL; + default: + elog(FATAL, "invalid return %d from PQflush", result); + } +} + +static bool +libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size) +{ + int result; + + /* If we are in non-blocking mode, switch out of it. */ + if (!ensure_nonblocking_status(conn, false)) + return false; + + /* Ths function is very similar to libpqprop_async_write. For more + * information, refer to the comments there */ + if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1) + return false; + + Assert(result == 1); + + /* Because the connection is non-blocking, flushing returns 0 or -1 */ + + if ((result = PQflush(conn->pg_conn)) == -1) + return false; + + Assert(result == 0); + return true; +} diff --git a/pgxn/neon/neon--1.0.sql b/pgxn/neon/neon--1.0.sql new file mode 100644 index 0000000000..34f1ba78d4 --- /dev/null +++ b/pgxn/neon/neon--1.0.sql @@ -0,0 +1,17 @@ +\echo Use "CREATE EXTENSION neon" to load this file. 
\quit + +CREATE FUNCTION pg_cluster_size() +RETURNS bigint +AS 'MODULE_PATHNAME', 'pg_cluster_size' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION backpressure_lsns( + OUT received_lsn pg_lsn, + OUT disk_consistent_lsn pg_lsn, + OUT remote_consistent_lsn pg_lsn +) +RETURNS record +AS 'MODULE_PATHNAME', 'backpressure_lsns' +LANGUAGE C STRICT +PARALLEL UNSAFE; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c new file mode 100644 index 0000000000..595a126f04 --- /dev/null +++ b/pgxn/neon/neon.c @@ -0,0 +1,82 @@ +/*------------------------------------------------------------------------- + * + * neon.c + * Utility functions to expose neon specific information to user + * + * IDENTIFICATION + * contrib/neon/neon.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "fmgr.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "catalog/pg_type.h" +#include "replication/walsender.h" +#include "funcapi.h" +#include "access/htup_details.h" +#include "utils/pg_lsn.h" +#include "utils/guc.h" + +#include "neon.h" +#include "walproposer.h" + +PG_MODULE_MAGIC; +void _PG_init(void); + + +void _PG_init(void) +{ + pg_init_libpagestore(); + pg_init_libpqwalproposer(); + pg_init_walproposer(); + + EmitWarningsOnPlaceholders("neon"); +} + +PG_FUNCTION_INFO_V1(pg_cluster_size); +PG_FUNCTION_INFO_V1(backpressure_lsns); + +Datum +pg_cluster_size(PG_FUNCTION_ARGS) +{ + int64 size; + + size = GetZenithCurrentClusterSize(); + + if (size == 0) + PG_RETURN_NULL(); + + PG_RETURN_INT64(size); +} + + +Datum +backpressure_lsns(PG_FUNCTION_ARGS) +{ + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + Datum values[3]; + bool nulls[3]; + TupleDesc tupdesc; + + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); + + tupdesc = CreateTemplateTupleDesc(3); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "received_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "disk_consistent_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "remote_consistent_lsn", PG_LSNOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + MemSet(nulls, 0, sizeof(nulls)); + values[0] = LSNGetDatum(writePtr); + values[1] = LSNGetDatum(flushPtr); + values[2] = LSNGetDatum(applyPtr); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control new file mode 100644 index 0000000000..84f79881c1 --- /dev/null +++ b/pgxn/neon/neon.control @@ -0,0 +1,4 @@ +# neon extension +comment = 'cloud storage for PostgreSQL' +default_version = '1.0' +module_pathname = '$libdir/neon' diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h new file mode 100644 index 0000000000..2c66bc7bf0 --- /dev/null +++ b/pgxn/neon/neon.h @@ -0,0 +1,19 @@ +/*------------------------------------------------------------------------- + * + * neon.h + * Functions used in the initialization of this extension. 
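For reference, once the extension above is installed with CREATE EXTENSION neon, the two SQL-callable functions it defines are invoked as SELECT pg_cluster_size(); and SELECT * FROM backpressure_lsns();. The first returns NULL while GetZenithCurrentClusterSize() still reports 0, and the second returns the received, disk-consistent and remote-consistent LSNs obtained from replication_feedback_get_lsns(), as implemented in neon.c above.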
+ * + * IDENTIFICATION + * contrib/neon/neon.h + * + *------------------------------------------------------------------------- + */ + +#ifndef NEON_H +#define NEON_H + +extern void pg_init_libpagestore(void); +extern void pg_init_libpqwalproposer(void); +extern void pg_init_walproposer(void); + +#endif /* NEON_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h new file mode 100644 index 0000000000..f79a3c9142 --- /dev/null +++ b/pgxn/neon/pagestore_client.h @@ -0,0 +1,221 @@ +/*------------------------------------------------------------------------- + * + * pagestore_client.h + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * contrib/neon/pagestore_client.h + * + *------------------------------------------------------------------------- + */ +#ifndef pageserver_h +#define pageserver_h + +#include "postgres.h" + +#include "access/xlogdefs.h" +#include "storage/relfilenode.h" +#include "storage/block.h" +#include "storage/smgr.h" +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "utils/memutils.h" + +#include "pg_config.h" + +typedef enum +{ + /* pagestore_client -> pagestore */ + T_ZenithExistsRequest = 0, + T_ZenithNblocksRequest, + T_ZenithGetPageRequest, + T_ZenithDbSizeRequest, + + /* pagestore -> pagestore_client */ + T_ZenithExistsResponse = 100, + T_ZenithNblocksResponse, + T_ZenithGetPageResponse, + T_ZenithErrorResponse, + T_ZenithDbSizeResponse, +} ZenithMessageTag; + + + +/* base struct for c-style inheritance */ +typedef struct +{ + ZenithMessageTag tag; +} ZenithMessage; + +#define messageTag(m) (((const ZenithMessage *)(m))->tag) + +/* + * supertype of all the Zenith*Request structs below + * + * If 'latest' is true, we are requesting the latest page version, and 'lsn' + * is just a hint to the server that we know there are no versions of the page + * (or relation size, for exists/nblocks requests) later than the 'lsn'. 
+ */ +typedef struct +{ + ZenithMessageTag tag; + bool latest; /* if true, request latest page version */ + XLogRecPtr lsn; /* request page version @ this LSN */ +} ZenithRequest; + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; +} ZenithExistsRequest; + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; +} ZenithNblocksRequest; + + +typedef struct +{ + ZenithRequest req; + Oid dbNode; +} ZenithDbSizeRequest; + + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; +} ZenithGetPageRequest; + +/* supertype of all the Zenith*Response structs below */ +typedef struct +{ + ZenithMessageTag tag; +} ZenithResponse; + +typedef struct +{ + ZenithMessageTag tag; + bool exists; +} ZenithExistsResponse; + +typedef struct +{ + ZenithMessageTag tag; + uint32 n_blocks; +} ZenithNblocksResponse; + +typedef struct +{ + ZenithMessageTag tag; + char page[FLEXIBLE_ARRAY_MEMBER]; +} ZenithGetPageResponse; + +typedef struct +{ + ZenithMessageTag tag; + int64 db_size; +} ZenithDbSizeResponse; + +typedef struct +{ + ZenithMessageTag tag; + char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error message */ +} ZenithErrorResponse; + +extern StringInfoData zm_pack_request(ZenithRequest *msg); +extern ZenithResponse *zm_unpack_response(StringInfo s); +extern char *zm_to_string(ZenithMessage *msg); + +/* + * API + */ + +typedef struct +{ + ZenithResponse *(*request) (ZenithRequest *request); +} page_server_api; + +extern page_server_api *page_server; + +extern char *page_server_connstring; +extern char *zenith_timeline; +extern char *zenith_tenant; +extern bool wal_redo; +extern int32 max_cluster_size; + +extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); +extern void smgr_init_zenith(void); + +extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); +extern void smgr_init_inmem(void); +extern void smgr_shutdown_inmem(void); + +/* zenith storage manager functionality */ + +extern void zenith_init(void); +extern void zenith_open(SMgrRelation reln); +extern void zenith_close(SMgrRelation reln, ForkNumber forknum); +extern void zenith_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool zenith_exists(SMgrRelation reln, ForkNumber forknum); +extern void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); + +extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); + +extern void zenith_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); +extern const int64 zenith_dbsize(Oid dbNode); +extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); + +/* zenith wal-redo storage manager functionality */ + +extern void inmem_init(void); +extern void inmem_open(SMgrRelation reln); +extern void 
inmem_close(SMgrRelation reln, ForkNumber forknum); +extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum); +extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void inmem_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); +extern void inmem_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); + + +/* utils for zenith relsize cache */ +extern void relsize_hash_init(void); +extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size); +extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); + +#endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c new file mode 100644 index 0000000000..3e1b74dba7 --- /dev/null +++ b/pgxn/neon/pagestore_smgr.c @@ -0,0 +1,1696 @@ +/*------------------------------------------------------------------------- + * + * pagestore_smgr.c + * + * + * + * Temporary and unlogged rels + * --------------------------- + * + * Temporary and unlogged tables are stored locally, by md.c. The functions + * here just pass the calls through to corresponding md.c functions. + * + * Index build operations that use the buffer cache are also handled locally, + * just like unlogged tables. Such operations must be marked by calling + * smgr_start_unlogged_build() and friends. + * + * In order to know what relations are permanent and which ones are not, we + * have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set + * by smgropen() callers, when they have the relcache entry at hand. However, + * sometimes we need to open an SmgrRelation for a relation without the + * relcache. That is needed when we evict a buffer; we might not have the + * SmgrRelation for that relation open yet. To deal with that, the + * 'relpersistence' can be left to zero, meaning we don't know if it's + * permanent or not. Most operations are not allowed with relpersistence==0, + * but smgrwrite() does work, which is what we need for buffer eviction. and + * smgrunlink() so that a backend doesn't need to have the relcache entry at + * transaction commit, where relations that were dropped in the transaction + * are unlinked. + * + * If smgrwrite() is called and smgr_relpersistence == 0, we check if the + * relation file exists locally or not. If it does exist, we assume it's an + * unlogged relation and write the page there. Otherwise it must be a + * permanent relation, WAL-logged and stored on the page server, and we ignore + * the write like we do for permanent relations. 
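/*
 * A minimal sketch (simplified; error handling and pfree() of the response
 * are omitted) of the round trip the smgr routines in this file make to the
 * page server: fill in one of the request structs from pagestore_client.h,
 * hand it to page_server->request(), and interpret the response by tag.
 */
static BlockNumber
sketch_request_nblocks(SMgrRelation reln, ForkNumber forknum,
					   XLogRecPtr request_lsn, bool latest)
{
	ZenithNblocksRequest request = {
		.req.tag = T_ZenithNblocksRequest,
		.req.latest = latest,
		.req.lsn = request_lsn,
		.rnode = reln->smgr_rnode.node,
		.forknum = forknum,
	};
	ZenithResponse *resp = page_server->request((ZenithRequest *) &request);

	if (resp->tag == T_ZenithNblocksResponse)
		return ((ZenithNblocksResponse *) resp)->n_blocks;

	if (resp->tag == T_ZenithErrorResponse)
		elog(ERROR, "could not read relation size from page server: %s",
			 ((ZenithErrorResponse *) resp)->message);

	elog(ERROR, "unexpected response tag %d from page server", resp->tag);
	return InvalidBlockNumber;	/* keep the compiler quiet */
}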
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/pagestore_smgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlog_internal.h" +#include "catalog/pg_class.h" +#include "pagestore_client.h" +#include "pagestore_client.h" +#include "storage/smgr.h" +#include "access/xlogdefs.h" +#include "postmaster/interrupt.h" +#include "replication/walsender.h" +#include "storage/bufmgr.h" +#include "storage/md.h" +#include "fmgr.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "catalog/pg_tablespace_d.h" +#include "postmaster/autovacuum.h" + +/* + * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API + * calls to md.c, and *also* do the calls to the Page Server. On every + * read, compare the versions we read from local disk and Page Server, + * and Assert that they are identical. + */ +/* #define DEBUG_COMPARE_LOCAL */ + +#ifdef DEBUG_COMPARE_LOCAL +#include "access/nbtree.h" +#include "storage/bufpage.h" +#include "access/xlog_internal.h" + +static char *hexdump_page(char *page); +#endif + +#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId) + +const int SmgrTrace = DEBUG5; + +page_server_api *page_server; + +/* GUCs */ +char *page_server_connstring; // with substituted password +char *zenith_timeline; +char *zenith_tenant; +bool wal_redo = false; +int32 max_cluster_size; + +/* unlogged relation build states */ +typedef enum +{ + UNLOGGED_BUILD_NOT_IN_PROGRESS = 0, + UNLOGGED_BUILD_PHASE_1, + UNLOGGED_BUILD_PHASE_2, + UNLOGGED_BUILD_NOT_PERMANENT +} UnloggedBuildPhase; + +static SMgrRelation unlogged_build_rel = NULL; +static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + +StringInfoData +zm_pack_request(ZenithRequest *msg) +{ + StringInfoData s; + + initStringInfo(&s); + pq_sendbyte(&s, msg->tag); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_ZenithNblocksRequest: + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_ZenithDbSizeRequest: + { + ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->dbNode); + + break; + } + case T_ZenithGetPageRequest: + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + pq_sendint32(&s, 
msg_req->blkno); + + break; + } + + /* pagestore -> pagestore_client. We never need to create these. */ + case T_ZenithExistsResponse: + case T_ZenithNblocksResponse: + case T_ZenithGetPageResponse: + case T_ZenithErrorResponse: + case T_ZenithDbSizeResponse: + default: + elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); + break; + } + return s; +} + +ZenithResponse * +zm_unpack_response(StringInfo s) +{ + ZenithMessageTag tag = pq_getmsgbyte(s); + ZenithResponse *resp = NULL; + + switch (tag) + { + /* pagestore -> pagestore_client */ + case T_ZenithExistsResponse: + { + ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); + + msg_resp->tag = tag; + msg_resp->exists = pq_getmsgbyte(s); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithNblocksResponse: + { + ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); + + msg_resp->tag = tag; + msg_resp->n_blocks = pq_getmsgint(s, 4); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithGetPageResponse: + { + ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); + + msg_resp->tag = tag; + /* XXX: should be varlena */ + memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithDbSizeResponse: + { + ZenithDbSizeResponse *msg_resp = palloc0(sizeof(ZenithDbSizeResponse)); + + msg_resp->tag = tag; + msg_resp->db_size = pq_getmsgint64(s); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithErrorResponse: + { + ZenithErrorResponse *msg_resp; + size_t msglen; + const char *msgtext; + + msgtext = pq_getmsgrawstring(s); + msglen = strlen(msgtext); + + msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); + msg_resp->tag = tag; + memcpy(msg_resp->message, msgtext, msglen + 1); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + /* + * pagestore_client -> pagestore + * + * We create these ourselves, and don't need to decode them. 
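/*
 * For reference, the full wire image of a T_ZenithGetPageRequest produced
 * by zm_pack_request() above (integers are big-endian, per the pq_send*
 * routines), 27 bytes in total:
 *
 *   byte   tag       = T_ZenithGetPageRequest
 *   byte   latest
 *   int64  lsn
 *   int32  spcNode
 *   int32  dbNode
 *   int32  relNode
 *   byte   forknum
 *   int32  blkno
 *
 * The corresponding T_ZenithGetPageResponse is a tag byte followed by a raw
 * BLCKSZ-sized page image, as decoded in zm_unpack_response().
 */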
+ */ + case T_ZenithExistsRequest: + case T_ZenithNblocksRequest: + case T_ZenithGetPageRequest: + case T_ZenithDbSizeRequest: + default: + elog(ERROR, "unexpected zenith message tag 0x%02x", tag); + break; + } + + return resp; +} + +/* dump to json for debugging / error reporting purposes */ +char * +zm_to_string(ZenithMessage *msg) +{ + StringInfoData s; + + initStringInfo(&s); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + case T_ZenithNblocksRequest: + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + case T_ZenithGetPageRequest: + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithDbSizeRequest: + { + ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeRequest\""); + appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + + /* pagestore -> pagestore_client */ + case T_ZenithExistsResponse: + { + ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); + appendStringInfo(&s, ", \"exists\": %d}", + msg_resp->exists + ); + appendStringInfoChar(&s, '}'); + + break; + } + case T_ZenithNblocksResponse: + { + ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks + ); + appendStringInfoChar(&s, '}'); + + break; + } + case T_ZenithGetPageResponse: + { +#if 0 + ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; +#endif + + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageResponse\""); + appendStringInfo(&s, ", \"page\": \"XXX\"}"); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithErrorResponse: + { + 
ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg; + + /* FIXME: escape double-quotes in the message */ + appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\""); + appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithDbSizeResponse: + { + ZenithDbSizeResponse *msg_resp = (ZenithDbSizeResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeResponse\""); + appendStringInfo(&s, ", \"db_size\": %ld}", + msg_resp->db_size + ); + appendStringInfoChar(&s, '}'); + + break; + } + + default: + appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); + } + return s.data; +} + +/* + * Wrapper around log_newpage() that makes a temporary copy of the block and + * WAL-logs that. This makes it safe to use while holding only a shared lock + * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint + * directly because it skips the logging if the LSN is new enough. + */ +static XLogRecPtr +log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, + Page page, bool page_std) +{ + PGAlignedBlock copied_buffer; + + memcpy(copied_buffer.data, page, BLCKSZ); + return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std); +} + +/* + * Is 'buffer' identical to a freshly initialized empty heap page? + */ +static bool +PageIsEmptyHeapPage(char *buffer) +{ + PGAlignedBlock empty_page; + + PageInit((Page) empty_page.data, BLCKSZ, 0); + + return memcmp(buffer, empty_page.data, BLCKSZ) == 0; +} + +static void +zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) +{ + XLogRecPtr lsn = PageGetLSN(buffer); + + if (ShutdownRequestPending) + return; + + /* + * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM + * changes are not WAL-logged when the changes are made, so this is our + * last chance to log them, otherwise they're lost. That's OK for + * correctness, the non-logged updates are not critical. But we want to + * have a reasonably up-to-date VM and FSM in the page server. + */ + if (forknum == FSM_FORKNUM && !RecoveryInProgress()) + { + /* FSM is never WAL-logged and we don't care. */ + XLogRecPtr recptr; + + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + XLogFlush(recptr); + lsn = recptr; + ereport(SmgrTrace, + (errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); + } + else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress()) + { + /* + * Always WAL-log vm. We should never miss clearing visibility map + * bits. + * + * TODO Is it too bad for performance? Hopefully we do not evict + * actively used vm too often. + */ + XLogRecPtr recptr; + + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + XLogFlush(recptr); + lsn = recptr; + + ereport(SmgrTrace, + (errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); + } + else if (lsn == InvalidXLogRecPtr) + { + /* + * When PostgreSQL extends a relation, it calls smgrextend() with an all-zeros pages, + * and we can just ignore that in Zenith. 
We do need to remember the new size, + * though, so that smgrnblocks() returns the right answer after the rel has + * been extended. We rely on the relsize cache for that. + * + * A completely empty heap page doesn't need to be WAL-logged, either. The + * heapam can leave such a page behind, if e.g. an insert errors out after + * initializing the page, but before it has inserted the tuple and WAL-logged + * the change. When we read the page from the page server, it will come back + * as all-zeros. That's OK, the heapam will initialize an all-zeros page on + * first use. + * + * In other scenarios, evicting a dirty page with no LSN is a bad sign: it implies + * that the page was not WAL-logged, and its contents will be lost when it's + * evicted. + */ + if (PageIsNew(buffer)) + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is all-zeros", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + else if (PageIsEmptyHeapPage(buffer)) + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + else + { + ereport(PANIC, + (errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + } + else + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); + } + + /* + * Remember the LSN on this page. When we read the page again, we must + * read the same or newer version of it. + */ + SetLastWrittenPageLSN(lsn); +} + + +/* + * zenith_init() -- Initialize private state + */ +void +zenith_init(void) +{ + /* noop */ +#ifdef DEBUG_COMPARE_LOCAL + mdinit(); +#endif +} + +/* + * GetXLogInsertRecPtr uses XLogBytePosToRecPtr to convert logical insert (reserved) position + * to physical position in WAL. It always adds SizeOfXLogShortPHD: + * seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + * so even if there are no records on the page, offset will be SizeOfXLogShortPHD. + * It may cause problems with XLogFlush. So return pointer backward to the origin of the page. + */ +static XLogRecPtr +zm_adjust_lsn(XLogRecPtr lsn) +{ + /* + * If lsn points to the beging of first record on page or segment, then + * "return" it back to the page origin + */ + if ((lsn & (XLOG_BLCKSZ - 1)) == SizeOfXLogShortPHD) + { + lsn -= SizeOfXLogShortPHD; + } + else if ((lsn & (wal_segment_size - 1)) == SizeOfXLogLongPHD) + { + lsn -= SizeOfXLogLongPHD; + } + return lsn; +} + +/* + * Return LSN for requesting pages and number of blocks from page server + */ +static XLogRecPtr +zenith_get_request_lsn(bool *latest) +{ + XLogRecPtr lsn; + + if (RecoveryInProgress()) + { + *latest = false; + lsn = GetXLogReplayRecPtr(NULL); + elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", + (uint32) ((lsn) >> 32), (uint32) (lsn)); + } + else if (am_walsender) + { + *latest = true; + lsn = InvalidXLogRecPtr; + elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 "); + } + else + { + XLogRecPtr flushlsn; + + /* + * Use the latest LSN that was evicted from the buffer cache. 
Any + * pages modified by later WAL records must still in the buffer cache, + * so our request cannot concern those. + */ + *latest = true; + lsn = GetLastWrittenPageLSN(); + Assert(lsn != InvalidXLogRecPtr); + elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", + (uint32) ((lsn) >> 32), (uint32) (lsn)); + + lsn = zm_adjust_lsn(lsn); + + /* + * Is it possible that the last-written LSN is ahead of last flush + * LSN? Generally not, we shouldn't evict a page from the buffer cache + * before all its modifications have been safely flushed. That's the + * "WAL before data" rule. However, such case does exist at index building, + * _bt_blwritepage logs the full page without flushing WAL before + * smgrextend (files are fsynced before build ends). + */ + flushlsn = GetFlushRecPtr(); + if (lsn > flushlsn) + { + elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", + (uint32) (lsn >> 32), (uint32) lsn, + (uint32) (flushlsn >> 32), (uint32) flushlsn); + XLogFlush(lsn); + } + } + + return lsn; +} + + +/* + * zenith_exists() -- Does the physical file exist? + */ +bool +zenith_exists(SMgrRelation reln, ForkNumber forkNum) +{ + bool exists; + ZenithResponse *resp; + BlockNumber n_blocks; + bool latest; + XLogRecPtr request_lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + /* + * We don't know if it's an unlogged rel stored locally, or permanent + * rel stored in the page server. First check if it exists locally. + * If it does, great. Otherwise check if it exists in the page server. + */ + if (mdexists(reln, forkNum)) + return true; + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdexists(reln, forkNum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks)) + { + return true; + } + + /* + * \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server + * will error out if you check that, because the whole dbdir for tablespace + * 0, db 0 doesn't exists. We possibly should change the page server to + * accept that and return 'false', to be consistent with mdexists(). But + * we probably also should fix pg_table_size() to not call smgrexists() + * with bogus relfilenode. + * + * For now, handle that special case here. 
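+	 * (Concretely, the check below short-circuits the all-zero relfilenode
+	 * and returns false without contacting the page server, which matches
+	 * what mdexists() would report for a file that cannot exist.)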
+ */ + if (reln->smgr_rnode.node.spcNode == 0 && + reln->smgr_rnode.node.dbNode == 0 && + reln->smgr_rnode.node.relNode == 0) + { + return false; + } + + request_lsn = zenith_get_request_lsn(&latest); + { + ZenithExistsRequest request = { + .req.tag = T_ZenithExistsRequest, + .req.latest = latest, + .req.lsn = request_lsn, + .rnode = reln->smgr_rnode.node, + .forknum = forkNum + }; + + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) + { + case T_ZenithExistsResponse: + exists = ((ZenithExistsResponse *) resp)->exists; + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + pfree(resp); + return exists; +} + +/* + * zenith_create() -- Create a new relation on zenithd storage + * + * If isRedo is true, it's okay for the relation to exist already. + */ +void +zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) +{ + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdcreate(reln, forkNum, isRedo); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + elog(SmgrTrace, "Create relation %u/%u/%u.%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum); + + /* + * Newly created relation is empty, remember that in the relsize cache. + * + * FIXME: This is currently not just an optimization, but required for + * correctness. Postgres can call smgrnblocks() on the newly-created + * relation. Currently, we don't call SetLastWrittenPageLSN() when a new + * relation created, so if we didn't remember the size in the relsize + * cache, we might call smgrnblocks() on the newly-created relation before + * the creation WAL record hass been received by the page server. + */ + set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdcreate(reln, forkNum, isRedo); +#endif +} + +/* + * zenith_unlink() -- Unlink a relation. + * + * Note that we're passed a RelFileNodeBackend --- by the time this is called, + * there won't be an SMgrRelation hashtable entry anymore. + * + * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber + * to delete all forks. + * + * + * If isRedo is true, it's unsurprising for the relation to be already gone. + * Also, we should remove the file immediately instead of queuing a request + * for later, since during redo there's no possibility of creating a + * conflicting relation. + * + * Note: any failure should be reported as WARNING not ERROR, because + * we are usually not in a transaction anymore when this is called. + */ +void +zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) +{ + /* + * Might or might not exist locally, depending on whether it's + * an unlogged or permanent relation (or if DEBUG_COMPARE_LOCAL is + * set). 
Try to unlink, it won't do any harm if the file doesn't + * exist. + */ + mdunlink(rnode, forkNum, isRedo); + if (!RelFileNodeBackendIsTemp(rnode)) { + forget_cached_relsize(rnode.node, forkNum); + } +} + +/* + * zenith_extend() -- Add a block to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +void +zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer, bool skipFsync) +{ + XLogRecPtr lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdextend(reln, forkNum, blkno, buffer, skipFsync); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + /* + * Check that the cluster size limit has not been exceeded. + * + * Temporary and unlogged relations are not included in the cluster size measured + * by the page server, so ignore those. Autovacuum processes are also exempt. + */ + if (max_cluster_size > 0 && + reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && + !IsAutoVacuumWorkerProcess()) + { + uint64 current_size = GetZenithCurrentClusterSize(); + + if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", + max_cluster_size), + errhint("This limit is defined by neon.max_cluster_size GUC"))); + } + + zenith_wallog_page(reln, forkNum, blkno, buffer); + set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); + + lsn = PageGetLSN(buffer); + elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, blkno, + (uint32) (lsn >> 32), (uint32) lsn); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdextend(reln, forkNum, blkno, buffer, skipFsync); +#endif +} + +/* + * zenith_open() -- Initialize newly-opened relation. + */ +void +zenith_open(SMgrRelation reln) +{ + /* + * We don't have anything special to do here. Call mdopen() to let md.c + * initialize itself. That's only needed for temporary or unlogged + * relations, but it's dirt cheap so do it always to make sure the md + * fields are initialized, for debugging purposes if nothing else. + */ + mdopen(reln); + + /* no work */ + elog(SmgrTrace, "[ZENITH_SMGR] open noop"); +} + +/* + * zenith_close() -- Close the specified relation, if it isn't closed already. + */ +void +zenith_close(SMgrRelation reln, ForkNumber forknum) +{ + /* + * Let md.c close it, if it had it open. Doesn't hurt to do this + * even for permanent relations that have no local storage. 
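+	 * (mdclose() only closes segment files that are actually open, so for a
+	 * relation with no local storage this is effectively a no-op.)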
+ */ + mdclose(reln, forknum); +} + +/* + * zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + switch (reln->smgr_relpersistence) + { + case 0: + /* probably shouldn't happen, but ignore it */ + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdprefetch(reln, forknum, blocknum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + /* not implemented */ + elog(SmgrTrace, "[ZENITH_SMGR] prefetch noop"); + return true; +} + +/* + * zenith_writeback() -- Tell the kernel to write pages back to storage. + * + * This accepts a range of blocks because flushing several pages at once is + * considerably more efficient than doing so individually. + */ +void +zenith_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ + switch (reln->smgr_relpersistence) + { + case 0: + /* mdwriteback() does nothing if the file doesn't exist */ + mdwriteback(reln, forknum, blocknum, nblocks); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdwriteback(reln, forknum, blocknum, nblocks); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + /* not implemented */ + elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdwriteback(reln, forknum, blocknum, nblocks); +#endif +} + +/* + * While function is defined in the zenith extension it's used within neon_test_utils directly. + * To avoid breaking tests in the runtime please keep function signature in sync. + */ +void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer) +{ + ZenithResponse *resp; + + { + ZenithGetPageRequest request = { + .req.tag = T_ZenithGetPageRequest, + .req.latest = request_latest, + .req.lsn = request_lsn, + .rnode = rnode, + .forknum = forkNum, + .blkno = blkno + }; + + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) + { + case T_ZenithGetPageResponse: + memcpy(buffer, ((ZenithGetPageResponse *) resp)->page, BLCKSZ); + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + blkno, + rnode.spcNode, + rnode.dbNode, + rnode.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + + pfree(resp); +} + +/* + * zenith_read() -- Read the specified block from a relation. 
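+ *
+ * For permanent relations this boils down to (sketch):
+ *
+ *     request_lsn = zenith_get_request_lsn(&latest);
+ *     zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno,
+ *                        request_lsn, latest, buffer);
+ *
+ * i.e. pick a safe LSN to read at, then fetch the page from the page server.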
+ */ +void +zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer) +{ + bool latest; + XLogRecPtr request_lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrread() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdread(reln, forkNum, blkno, buffer); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + request_lsn = zenith_get_request_lsn(&latest); + zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); + +#ifdef DEBUG_COMPARE_LOCAL + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + { + char pageserver_masked[BLCKSZ]; + char mdbuf[BLCKSZ]; + char mdbuf_masked[BLCKSZ]; + + mdread(reln, forkNum, blkno, mdbuf); + + memcpy(pageserver_masked, buffer, BLCKSZ); + memcpy(mdbuf_masked, mdbuf, BLCKSZ); + + if (PageIsNew(mdbuf)) + { + if (!PageIsNew(pageserver_masked)) + { + elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(buffer)); + } + } + else if (PageIsNew(buffer)) + { + elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf)); + } + else if (PageGetSpecialSize(mdbuf) == 0) + { + /* assume heap */ + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + { + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + { + /* assume btree */ + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + } + } +#endif +} + +#ifdef DEBUG_COMPARE_LOCAL +static char * +hexdump_page(char *page) +{ + StringInfoData result; + + initStringInfo(&result); + + for (int i = 0; i < BLCKSZ; i++) + { + if (i % 8 == 0) + appendStringInfo(&result, " "); + if (i % 40 == 0) + appendStringInfo(&result, "\n"); + appendStringInfo(&result, "%02x", (unsigned char) (page[i])); + } + + return result.data; +} +#endif + +/* + * zenith_write() -- Write the 
supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). + */ +void +zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) +{ + XLogRecPtr lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + /* This is a bit tricky. Check if the relation exists locally */ + if (mdexists(reln, forknum)) + { + /* It exists locally. Guess it's unlogged then. */ + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + + /* + * We could set relpersistence now that we have determined + * that it's local. But we don't dare to do it, because that + * would immediately allow reads as well, which shouldn't + * happen. We could cache it with a different 'relpersistence' + * value, but this isn't performance critical. + */ + return; + } + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + zenith_wallog_page(reln, forknum, blocknum, buffer); + + lsn = PageGetLSN(buffer); + elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, blocknum, + (uint32) (lsn >> 32), (uint32) lsn); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdwrite(reln, forknum, blocknum, buffer, skipFsync); +#endif +} + +/* + * zenith_nblocks() -- Get the number of blocks stored in a relation. + */ +BlockNumber +zenith_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + ZenithResponse *resp; + BlockNumber n_blocks; + bool latest; + XLogRecPtr request_lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdnblocks(reln, forknum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) + { + elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, n_blocks); + return n_blocks; + } + + request_lsn = zenith_get_request_lsn(&latest); + { + ZenithNblocksRequest request = { + .req.tag = T_ZenithNblocksRequest, + .req.latest = latest, + .req.lsn = request_lsn, + .rnode = reln->smgr_rnode.node, + .forknum = forknum, + }; + + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) + { + case T_ZenithNblocksResponse: + n_blocks = ((ZenithNblocksResponse *) resp)->n_blocks; + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + update_cached_relsize(reln->smgr_rnode.node, 
forknum, n_blocks); + + elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + n_blocks); + + pfree(resp); + return n_blocks; +} + +/* + * zenith_db_size() -- Get the size of the database in bytes. + */ +const int64 +zenith_dbsize(Oid dbNode) +{ + ZenithResponse *resp; + int64 db_size; + XLogRecPtr request_lsn; + bool latest; + + request_lsn = zenith_get_request_lsn(&latest); + { + ZenithDbSizeRequest request = { + .req.tag = T_ZenithDbSizeRequest, + .req.latest = latest, + .req.lsn = request_lsn, + .dbNode = dbNode, + }; + + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) + { + case T_ZenithDbSizeResponse: + db_size = ((ZenithDbSizeResponse *) resp)->db_size; + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read db size of db %u from page server at lsn %X/%08X", + dbNode, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + + elog(SmgrTrace, "zenith_dbsize: db %u (request LSN %X/%08X): %ld bytes", + dbNode, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + db_size); + + pfree(resp); + return db_size; +} + +/* + * zenith_truncate() -- Truncate relation to specified number of blocks. + */ +void +zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ + XLogRecPtr lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdtruncate(reln, forknum, nblocks); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks); + + /* + * Truncating a relation drops all its buffers from the buffer cache + * without calling smgrwrite() on them. But we must account for that in + * our tracking of last-written-LSN all the same: any future smgrnblocks() + * request must return the new size after the truncation. We don't know + * what the LSN of the truncation record was, so be conservative and use + * the most recently inserted WAL record's LSN. + */ + lsn = GetXLogInsertRecPtr(); + + lsn = zm_adjust_lsn(lsn); + + /* + * Flush it, too. We don't actually care about it here, but let's uphold + * the invariant that last-written LSN <= flush LSN. + */ + XLogFlush(lsn); + + SetLastWrittenPageLSN(lsn); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdtruncate(reln, forknum, nblocks); +#endif +} + +/* + * zenith_immedsync() -- Immediately sync a relation to stable storage. + * + * Note that only writes already issued are synced; this routine knows + * nothing of dirty buffers that may exist inside the buffer manager. We + * sync active and inactive segments; smgrDoPendingSyncs() relies on this. + * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of + * some segment, then mdtruncate() renders that segment inactive. If we + * crash before the next checkpoint syncs the newly-inactive segment, that + * segment may survive recovery, reintroducing unwanted data into the table. 
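+ *
+ * In Zenith, permanent relations have no local segment files to sync, so
+ * this is a no-op for them; only temporary and unlogged relations fall
+ * through to mdimmedsync() below.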
+ */ +void +zenith_immedsync(SMgrRelation reln, ForkNumber forknum) +{ + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdimmedsync(reln, forknum); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop"); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdimmedsync(reln, forknum); +#endif +} + +/* + * zenith_start_unlogged_build() -- Starting build operation on a rel. + * + * Some indexes are built in two phases, by first populating the table with + * regular inserts, using the shared buffer cache but skipping WAL-logging, + * and WAL-logging the whole relation after it's done. Zenith relies on the + * WAL to reconstruct pages, so we cannot use the page server in the + * first phase when the changes are not logged. + */ +static void +zenith_start_unlogged_build(SMgrRelation reln) +{ + /* + * Currently, there can be only one unlogged relation build operation in + * progress at a time. That's enough for the current usage. + */ + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) + elog(ERROR, "unlogged relation build is already in progress"); + Assert(unlogged_build_rel == NULL); + + ereport(SmgrTrace, + (errmsg("starting unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + unlogged_build_rel = reln; + unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (smgrnblocks(reln, MAIN_FORKNUM) != 0) + elog(ERROR, "cannot perform unlogged index build, index is not empty "); + + unlogged_build_rel = reln; + unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; + + /* Make the relation look like it's unlogged */ + reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; + + /* + * FIXME: should we pass isRedo true to create the tablespace dir if it + * doesn't exist? Is it needed? + */ + mdcreate(reln, MAIN_FORKNUM, false); +} + +/* + * zenith_finish_unlogged_build_phase_1() + * + * Call this after you have finished populating a relation in unlogged mode, + * before you start WAL-logging it. + */ +static void +zenith_finish_unlogged_build_phase_1(SMgrRelation reln) +{ + Assert(unlogged_build_rel == reln); + + ereport(SmgrTrace, + (errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) + return; + + Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); + Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); + + unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; +} + +/* + * zenith_end_unlogged_build() -- Finish an unlogged rel build. + * + * Call this after you have finished WAL-logging an relation that was + * first populated without WAL-logging. + * + * This removes the local copy of the rel, since it's now been fully + * WAL-logged and is present in the page server. 
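+ *
+ * Putting the three hooks together, an unlogged build runs roughly as
+ * follows (sketch):
+ *
+ *     start_unlogged_build            -- rel is built locally, WAL skipped
+ *     ... populate the relation through shared buffers ...
+ *     finish_unlogged_build_phase_1   -- population done
+ *     ... WAL-log the finished relation, e.g. with log_newpage() ...
+ *     end_unlogged_build              -- drop the local copy; the page
+ *                                        server now has the data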
+ */ +static void +zenith_end_unlogged_build(SMgrRelation reln) +{ + Assert(unlogged_build_rel == reln); + + ereport(SmgrTrace, + (errmsg("ending unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) + { + RelFileNodeBackend rnode; + + Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); + Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); + + /* Make the relation look permanent again */ + reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT; + + /* Remove local copy */ + rnode = reln->smgr_rnode; + for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { + elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u", + rnode.node.spcNode, + rnode.node.dbNode, + rnode.node.relNode, + forknum); + + forget_cached_relsize(rnode.node, forknum); + mdclose(reln, forknum); + /* use isRedo == true, so that we drop it immediately */ + mdunlink(rnode, forknum, true); + } + } + + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; +} + +static void +AtEOXact_zenith(XactEvent event, void *arg) +{ + switch (event) + { + case XACT_EVENT_ABORT: + case XACT_EVENT_PARALLEL_ABORT: + + /* + * Forget about any build we might have had in progress. The local + * file will be unlinked by smgrDoPendingDeletes() + */ + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + break; + + case XACT_EVENT_COMMIT: + case XACT_EVENT_PARALLEL_COMMIT: + case XACT_EVENT_PREPARE: + case XACT_EVENT_PRE_COMMIT: + case XACT_EVENT_PARALLEL_PRE_COMMIT: + case XACT_EVENT_PRE_PREPARE: + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) + { + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + (errmsg("unlogged index build was not properly finished")))); + } + break; + } +} + +static const struct f_smgr zenith_smgr = +{ + .smgr_init = zenith_init, + .smgr_shutdown = NULL, + .smgr_open = zenith_open, + .smgr_close = zenith_close, + .smgr_create = zenith_create, + .smgr_exists = zenith_exists, + .smgr_unlink = zenith_unlink, + .smgr_extend = zenith_extend, + .smgr_prefetch = zenith_prefetch, + .smgr_read = zenith_read, + .smgr_write = zenith_write, + .smgr_writeback = zenith_writeback, + .smgr_nblocks = zenith_nblocks, + .smgr_truncate = zenith_truncate, + .smgr_immedsync = zenith_immedsync, + + .smgr_start_unlogged_build = zenith_start_unlogged_build, + .smgr_finish_unlogged_build_phase_1 = zenith_finish_unlogged_build_phase_1, + .smgr_end_unlogged_build = zenith_end_unlogged_build, +}; + + +const f_smgr * +smgr_zenith(BackendId backend, RelFileNode rnode) +{ + + /* Don't use page server for temp relations */ + if (backend != InvalidBackendId) + return smgr_standard(backend, rnode); + else + return &zenith_smgr; +} + +void +smgr_init_zenith(void) +{ + RegisterXactCallback(AtEOXact_zenith, NULL); + + smgr_init_standard(); + zenith_init(); +} diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c new file mode 100644 index 0000000000..8dfcffe1d1 --- /dev/null +++ b/pgxn/neon/relsize_cache.c @@ -0,0 +1,167 @@ +/*------------------------------------------------------------------------- + * + * relsize_cache.c + * Relation size cache for better zentih performance. 
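+ * The cache is consulted before asking the page server for a relation
+ * size; on a miss the caller fetches the size and stores it, along these
+ * lines (sketch):
+ *
+ *     BlockNumber n_blocks;
+ *
+ *     if (!get_cached_relsize(rnode, forknum, &n_blocks))
+ *     {
+ *         n_blocks = ... ask the page server ...;
+ *         update_cached_relsize(rnode, forknum, n_blocks);
+ *     }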
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/relsize_cache.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pagestore_client.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" +#include "storage/lwlock.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "catalog/pg_tablespace_d.h" +#include "utils/dynahash.h" +#include "utils/guc.h" + + +typedef struct +{ + RelFileNode rnode; + ForkNumber forknum; +} RelTag; + +typedef struct +{ + RelTag tag; + BlockNumber size; +} RelSizeEntry; + +static HTAB *relsize_hash; +static LWLockId relsize_lock; +static int relsize_hash_size; +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; + +/* + * Size of a cache entry is 20 bytes. So this default will take about 1.2 MB, + * which seems reasonable. + */ +#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) + +static void +zenith_smgr_shmem_startup(void) +{ + static HASHCTL info; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize"); + info.keysize = sizeof(RelTag); + info.entrysize = sizeof(RelSizeEntry); + relsize_hash = ShmemInitHash("neon_relsize", + relsize_hash_size, relsize_hash_size, + &info, + HASH_ELEM | HASH_BLOBS); + LWLockRelease(AddinShmemInitLock); +} + +bool +get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size) +{ + bool found = false; + + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry *entry; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_SHARED); + entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); + if (entry != NULL) + { + *size = entry->size; + found = true; + } + LWLockRelease(relsize_lock); + } + return found; +} + +void +set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry *entry; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); + entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL); + entry->size = size; + LWLockRelease(relsize_lock); + } +} + +void +update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry *entry; + bool found; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); + entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found); + if (!found || entry->size < size) + entry->size = size; + LWLockRelease(relsize_lock); + } +} + +void +forget_cached_relsize(RelFileNode rnode, ForkNumber forknum) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); + hash_search(relsize_hash, &tag, HASH_REMOVE, NULL); + LWLockRelease(relsize_lock); + } +} + +void +relsize_hash_init(void) +{ + DefineCustomIntVariable("neon.relsize_hash_size", + "Sets the maximum number of cached relation sizes for neon", + NULL, + &relsize_hash_size, + DEFAULT_RELSIZE_HASH_SIZE, + 0, + INT_MAX, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + + if (relsize_hash_size > 0) + { + RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); + RequestNamedLWLockTranche("neon_relsize", 1); + + 
prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = zenith_smgr_shmem_startup; + } +} diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c new file mode 100644 index 0000000000..9625325c0a --- /dev/null +++ b/pgxn/neon/walproposer.c @@ -0,0 +1,2403 @@ +/*------------------------------------------------------------------------- + * + * walproposer.c + * + * Proposer/leader part of the total order broadcast protocol between postgres + * and WAL safekeepers. + * + * We have two ways of launching WalProposer: + * + * 1. As a background worker which will run physical WalSender with + * am_wal_proposer flag set to true. WalSender in turn would handle WAL + * reading part and call WalProposer when ready to scatter WAL. + * + * 2. As a standalone utility by running `postgres --sync-safekeepers`. That + * is needed to create LSN from which it is safe to start postgres. More + * specifically it addresses following problems: + * + * a) Chicken-or-the-egg problem: compute postgres needs data directory + * with non-rel files that are downloaded from pageserver by calling + * basebackup@LSN. This LSN is not arbitrary, it must include all + * previously committed transactions and defined through consensus + * voting, which happens... in walproposer, a part of compute node. + * + * b) Just warranting such LSN is not enough, we must also actually commit + * it and make sure there is a safekeeper who knows this LSN is + * committed so WAL before it can be streamed to pageserver -- otherwise + * basebackup will hang waiting for WAL. Advancing commit_lsn without + * playing consensus game is impossible, so speculative 'let's just poll + * safekeepers, learn start LSN of future epoch and run basebackup' + * won't work. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include +#include "access/xlogdefs.h" +#include "access/xlogutils.h" +#include "storage/latch.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "access/xlog.h" +#include "libpq/pqformat.h" +#include "replication/slot.h" +#include "replication/walreceiver.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/timestamp.h" + +#include "neon.h" +#include "walproposer.h" +#include "walproposer_utils.h" +#include "replication/walpropshim.h" + + +char *wal_acceptors_list; +int wal_acceptor_reconnect_timeout; +int wal_acceptor_connect_timeout; +bool am_wal_proposer; + +char *zenith_timeline_walproposer = NULL; +char *zenith_tenant_walproposer = NULL; + +/* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ +WalProposerFunctionsType *WalProposerFunctions = NULL; + +#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" + +static int n_safekeepers = 0; +static int quorum = 0; +static Safekeeper safekeeper[MAX_SAFEKEEPERS]; +static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to safekeepers */ +static ProposerGreeting greetRequest; +static VoteRequest voteRequest; /* Vote request for safekeeper */ +static WaitEventSet *waitEvents; +static AppendResponse quorumFeedback; +/* + * Minimal 
LSN which may be needed for recovery of some safekeeper, + * record-aligned (first record which might not yet received by someone). + */ +static XLogRecPtr truncateLsn; +/* + * Term of the proposer. We want our term to be highest and unique, + * so we collect terms from safekeepers quorum, choose max and +1. + * After that our term is fixed and must not change. If we observe + * that some safekeeper has higher term, it means that we have another + * running compute, so we must stop immediately. + */ +static term_t propTerm; +static TermHistory propTermHistory; /* term history of the proposer */ +static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ +static term_t donorEpoch; /* Most advanced acceptor epoch */ +static int donor; /* Most advanced acceptor */ +static XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ +static int n_votes = 0; +static int n_connected = 0; +static TimestampTz last_reconnect_attempt; + +static WalproposerShmemState *walprop_shared; + +/* Prototypes for private functions */ +static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId); +static void WalProposerStartImpl(void); +static void WalProposerLoop(void); +static void InitEventSet(void); +static void UpdateEventSet(Safekeeper *sk, uint32 events); +static void HackyRemoveWalProposerEvent(Safekeeper *to_remove); +static void ShutdownConnection(Safekeeper *sk); +static void ResetConnection(Safekeeper *sk); +static long TimeToReconnect(TimestampTz now); +static void ReconnectSafekeepers(void); +static void AdvancePollState(Safekeeper *sk, uint32 events); +static void HandleConnectionEvent(Safekeeper *sk); +static void SendStartWALPush(Safekeeper *sk); +static void RecvStartWALPushResult(Safekeeper *sk); +static void SendProposerGreeting(Safekeeper *sk); +static void RecvAcceptorGreeting(Safekeeper *sk); +static void SendVoteRequest(Safekeeper *sk); +static void RecvVoteResponse(Safekeeper *sk); +static void HandleElectedProposer(void); +static term_t GetHighestTerm(TermHistory *th); +static term_t GetEpoch(Safekeeper *sk); +static void DetermineEpochStartLsn(void); +static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); +static void SendProposerElected(Safekeeper *sk); +static void WalProposerStartStreaming(XLogRecPtr startpos); +static void StartStreaming(Safekeeper *sk); +static void SendMessageToNode(Safekeeper *sk); +static void BroadcastAppendRequest(void); +static void HandleActiveState(Safekeeper *sk, uint32 events); +static bool SendAppendRequests(Safekeeper *sk); +static bool RecvAppendResponses(Safekeeper *sk); +static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs); +static XLogRecPtr CalculateMinFlushLsn(void); +static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); +static void HandleSafekeeperResponse(void); +static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); +static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); +static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); +static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); +static bool AsyncFlush(Safekeeper *sk); + + +static void nwp_shmem_startup_hook(void); +static void nwp_register_gucs(void); +static void nwp_prepare_shmem(void); +static uint64 backpressure_lag_impl(void); + + +static shmem_startup_hook_type prev_shmem_startup_hook_type; + + + +void pg_init_walproposer(void) +{ + if 
(!process_shared_preload_libraries_in_progress) + return; + + nwp_register_gucs(); + + nwp_prepare_shmem(); + + delay_backend_us = &backpressure_lag_impl; + + WalProposerRegister(); + + WalProposerInit = &WalProposerInitImpl; + WalProposerStart = &WalProposerStartImpl; +} + +static void nwp_register_gucs(void) +{ + DefineCustomStringVariable( + "neon.safekeepers", + "List of Neon WAL acceptors (host:port)", + NULL, /* long_desc */ + &wal_acceptors_list, /* valueAddr */ + "", /* bootValue */ + PGC_POSTMASTER, + GUC_LIST_INPUT, /* extensions can't use GUC_LIST_QUOTE */ + NULL, NULL, NULL + ); + + DefineCustomIntVariable( + "neon.safekeeper_reconnect_timeout", + "Timeout for reconnecting to offline wal acceptor.", + NULL, + &wal_acceptor_reconnect_timeout, + 1000, 0, INT_MAX, /* default, min, max */ + PGC_SIGHUP, /* context */ + GUC_UNIT_MS, /* flags */ + NULL, NULL, NULL + ); + + DefineCustomIntVariable( + "neon.safekeeper_connect_timeout", + "Timeout after which give up connection attempt to safekeeper.", + NULL, + &wal_acceptor_connect_timeout, + 5000, 0, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MS, + NULL, NULL, NULL + ); + +} + +/* shmem handling */ + +static void nwp_prepare_shmem(void) +{ + RequestAddinShmemSpace(WalproposerShmemSize()); + + prev_shmem_startup_hook_type = shmem_startup_hook; + shmem_startup_hook = nwp_shmem_startup_hook; +} + +static void nwp_shmem_startup_hook(void) +{ + if (prev_shmem_startup_hook_type) + prev_shmem_startup_hook_type(); + + WalproposerShmemInit(); +} + +/* + * WAL proposer bgworker entry point. + */ +void +WalProposerMain(Datum main_arg) +{ + /* Establish signal handlers. */ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + GetXLogReplayRecPtr(&ThisTimeLineID); + + WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier()); + + last_reconnect_attempt = GetCurrentTimestamp(); + + application_name = (char *) "walproposer"; /* for + * synchronous_standby_names */ + am_wal_proposer = true; + am_walsender = true; + InitWalSender(); + InitProcessPhase2(); + + /* Create replication slot for WAL proposer if not exists */ + if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) + { + ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); + ReplicationSlotReserveWal(); + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + ReplicationSlotRelease(); + } + + WalProposerStart(); +} + +/* + * Create new AppendRequest message and start sending it. This function is + * called from walsender every time the new WAL is available. + */ +void +WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos) +{ + Assert(startpos == availableLsn && endpos >= availableLsn); + availableLsn = endpos; + BroadcastAppendRequest(); +} + +/* + * Advance the WAL proposer state machine, waiting each time for events to occur. + * Will exit only when latch is set, i.e. new WAL should be pushed from walsender + * to walproposer. + */ +void +WalProposerPoll(void) +{ + while (true) + { + Safekeeper *sk; + int rc; + WaitEvent event; + TimestampTz now = GetCurrentTimestamp(); + + rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), + &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); + sk = (Safekeeper *) event.user_data; + + /* + * If the event contains something that one of our safekeeper states + * was waiting for, we'll advance its state. 
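+		 *
+		 * (WaitEventSetWait can wake us up for three reasons, all handled in
+		 * turn below: a safekeeper socket became ready, the walsender latch
+		 * was set because new WAL is available, or the timeout expired.)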
+ */ + if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) + AdvancePollState(sk, event.events); + + /* + * If the timeout expired, attempt to reconnect to any safekeepers that + * we dropped + */ + ReconnectSafekeepers(); + + /* + * If wait is terminated by latch set (walsenders' latch is set on + * each wal flush), then exit loop. (no need for pm death check due to + * WL_EXIT_ON_PM_DEATH) + */ + if (rc != 0 && (event.events & WL_LATCH_SET)) + { + ResetLatch(MyLatch); + break; + } + if (rc == 0) /* timeout expired: poll state */ + { + TimestampTz now; + + /* + * If no WAL was generated during timeout (and we have already + * collected the quorum), then send pool message + */ + if (availableLsn != InvalidXLogRecPtr) + { + BroadcastAppendRequest(); + } + + /* + * Abandon connection attempts which take too long. + */ + now = GetCurrentTimestamp(); + for (int i = 0; i < n_safekeepers; i++) + { + Safekeeper *sk = &safekeeper[i]; + + if ((sk->state == SS_CONNECTING_WRITE || + sk->state == SS_CONNECTING_READ) && + TimestampDifferenceExceeds(sk->startedConnAt, now, + wal_acceptor_connect_timeout)) + { + elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms", + sk->host, sk->port, wal_acceptor_connect_timeout); + ShutdownConnection(sk); + } + } + } + } +} + +/* + * Register a background worker proposing WAL to wal acceptors. + */ +void +WalProposerRegister(void) +{ + BackgroundWorker bgw; + + if (*wal_acceptors_list == '\0') + return; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + +static void +WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) +{ + char *host; + char *sep; + char *port; + + /* Load the libpq-specific functions */ + if (WalProposerFunctions == NULL) + elog(ERROR, "libpqwalproposer didn't initialize correctly"); + + load_file("libpqwalreceiver", false); + if (WalReceiverFunctions == NULL) + elog(ERROR, "libpqwalreceiver didn't initialize correctly"); + + for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) + { + port = strchr(host, ':'); + if (port == NULL) + { + elog(FATAL, "port is not specified"); + } + *port++ = '\0'; + sep = strchr(port, ','); + if (sep != NULL) + *sep++ = '\0'; + if (n_safekeepers + 1 >= MAX_SAFEKEEPERS) + { + elog(FATAL, "Too many safekeepers"); + } + safekeeper[n_safekeepers].host = host; + safekeeper[n_safekeepers].port = port; + safekeeper[n_safekeepers].state = SS_OFFLINE; + safekeeper[n_safekeepers].conn = NULL; + + /* + * Set conninfo to empty. 
We'll fill it out once later, in + * `ResetConnection` as needed + */ + safekeeper[n_safekeepers].conninfo[0] = '\0'; + initStringInfo(&safekeeper[n_safekeepers].outbuf); + safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open, .segment_close = wal_segment_close), NULL); + if (safekeeper[n_safekeepers].xlogreader == NULL) + elog(FATAL, "Failed to allocate xlog reader"); + safekeeper[n_safekeepers].flushWrite = false; + safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr; + safekeeper[n_safekeepers].streamingAt = InvalidXLogRecPtr; + n_safekeepers += 1; + } + if (n_safekeepers < 1) + { + elog(FATAL, "Safekeepers addresses are not specified"); + } + quorum = n_safekeepers / 2 + 1; + + /* Fill the greeting package */ + greetRequest.tag = 'g'; + greetRequest.protocolVersion = SK_PROTOCOL_VERSION; + greetRequest.pgVersion = PG_VERSION_NUM; + pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); + greetRequest.systemId = systemId; + if (!zenith_timeline_walproposer) + elog(FATAL, "neon.timeline_id is not provided"); + if (*zenith_timeline_walproposer != '\0' && + !HexDecodeString(greetRequest.ztimelineid, zenith_timeline_walproposer, 16)) + elog(FATAL, "Could not parse neon.timeline_id, %s", zenith_timeline_walproposer); + if (!zenith_tenant_walproposer) + elog(FATAL, "neon.tenant_id is not provided"); + if (*zenith_tenant_walproposer != '\0' && + !HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16)) + elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer); + + greetRequest.timeline = ThisTimeLineID; + greetRequest.walSegSize = wal_segment_size; + + InitEventSet(); +} + +static void +WalProposerStartImpl(void) +{ + + /* Initiate connections to all safekeeper nodes */ + for (int i = 0; i < n_safekeepers; i++) + { + ResetConnection(&safekeeper[i]); + } + + WalProposerLoop(); +} + +static void +WalProposerLoop(void) +{ + while (true) + WalProposerPoll(); +} + +/* Initializes the internal event set, provided that it is currently null */ +static void +InitEventSet(void) +{ + if (waitEvents) + elog(FATAL, "double-initialization of event set"); + + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers); + AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); +} + +/* + * Updates the events we're already waiting on for the safekeeper, setting it to + * the provided `events` + * + * This function is called any time the safekeeper's state switches to one where + * it has to wait to continue. This includes the full body of AdvancePollState + * and calls to IO helper functions. + */ +static void +UpdateEventSet(Safekeeper *sk, uint32 events) +{ + /* eventPos = -1 when we don't have an event */ + Assert(sk->eventPos != -1); + + ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); +} + +/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. + * + * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. + */ +static void +HackyRemoveWalProposerEvent(Safekeeper *to_remove) +{ + /* Remove the existing event set */ + if (waitEvents) + { + FreeWaitEventSet(waitEvents); + waitEvents = NULL; + } + /* Re-initialize it without adding any safekeeper events */ + InitEventSet(); + + /* + * loop through the existing safekeepers. 
If they aren't the one we're + * removing, and if they have a socket we can use, re-add the applicable + * events. + */ + for (int i = 0; i < n_safekeepers; i++) + { + uint32 desired_events = WL_NO_EVENTS; + Safekeeper *sk = &safekeeper[i]; + + sk->eventPos = -1; + + if (sk == to_remove) + continue; + + /* If this safekeeper isn't offline, add an event for it! */ + if (sk->conn != NULL) + { + desired_events = SafekeeperStateDesiredEvents(sk->state); + sk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(sk->conn), NULL, sk); + } + } +} + +/* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */ +static void +ShutdownConnection(Safekeeper *sk) +{ + if (sk->conn) + walprop_finish(sk->conn); + sk->conn = NULL; + sk->state = SS_OFFLINE; + sk->flushWrite = false; + sk->streamingAt = InvalidXLogRecPtr; + + if (sk->voteResponse.termHistory.entries) + pfree(sk->voteResponse.termHistory.entries); + sk->voteResponse.termHistory.entries = NULL; + + HackyRemoveWalProposerEvent(sk); +} + +/* + * This function is called to establish new connection or to reestablish + * connection in case of connection failure. + * + * On success, sets the state to SS_CONNECTING_WRITE. + */ +static void +ResetConnection(Safekeeper *sk) +{ + pgsocket sock; /* socket of the new connection */ + + if (sk->state != SS_OFFLINE) + { + ShutdownConnection(sk); + } + + /* + * Try to establish new connection + * + * If the connection information hasn't been filled out, we need to do + * that here. + */ + if (sk->conninfo[0] == '\0') + { + int written = 0; + written = snprintf((char *) &sk->conninfo, MAXCONNINFO, + "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); + // currently connection string is not that long, but once we pass something like jwt we might overflow the buffer, + // so it is better to be defensive and check that everything aligns well + if (written > MAXCONNINFO || written < 0) + elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); + } + + sk->conn = walprop_connect_start((char *) &sk->conninfo); + + /* + * "If the result is null, then libpq has been unable to allocate a new + * PGconn structure" + */ + if (!sk->conn) + elog(FATAL, "failed to allocate new PGconn object"); + + /* + * PQconnectStart won't actually start connecting until we run + * PQconnectPoll. Before we do that though, we need to check that it + * didn't immediately fail. + */ + if (walprop_status(sk->conn) == WP_CONNECTION_BAD) + { + /*--- + * According to libpq docs: + * "If the result is CONNECTION_BAD, the connection attempt has already failed, + * typically because of invalid connection parameters." + * We should report this failure. + * + * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS + */ + elog(WARNING, "Immediate failure to connect with node:\n\t%s\n\terror: %s", + sk->conninfo, walprop_error_message(sk->conn)); + + /* + * Even though the connection failed, we still need to clean up the + * object + */ + walprop_finish(sk->conn); + sk->conn = NULL; + return; + } + + /* + * The documentation for PQconnectStart states that we should call + * PQconnectPoll in a loop until it returns PGRES_POLLING_OK or + * PGRES_POLLING_FAILED. The other two possible returns indicate whether + * we should wait for reading or writing on the socket. 
For the first + * iteration of the loop, we're expected to wait until the socket becomes + * writable. + * + * The wording of the documentation is a little ambiguous; thankfully + * there's an example in the postgres source itself showing this behavior. + * (see libpqrcv_connect, defined in + * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) + */ + elog(LOG, "connecting with node %s:%s", sk->host, sk->port); + + sk->state = SS_CONNECTING_WRITE; + sk->startedConnAt = GetCurrentTimestamp(); + + sock = walprop_socket(sk->conn); + sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk); + return; +} + +/* + * How much milliseconds left till we should attempt reconnection to + * safekeepers? Returns 0 if it is already high time, -1 if we never reconnect + * (do we actually need this?). + */ +static long +TimeToReconnect(TimestampTz now) +{ + TimestampTz passed; + TimestampTz till_reconnect; + + if (wal_acceptor_reconnect_timeout <= 0) + return -1; + + passed = now - last_reconnect_attempt; + till_reconnect = wal_acceptor_reconnect_timeout * 1000 - passed; + if (till_reconnect <= 0) + return 0; + return (long) (till_reconnect / 1000); +} + +/* If the timeout has expired, attempt to reconnect to all offline safekeepers */ +static void +ReconnectSafekeepers(void) +{ + TimestampTz now = GetCurrentTimestamp(); + + if (TimeToReconnect(now) == 0) + { + last_reconnect_attempt = now; + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].state == SS_OFFLINE) + ResetConnection(&safekeeper[i]); + } + } +} + +/* + * Performs the logic for advancing the state machine of the specified safekeeper, + * given that a certain set of events has occured. + */ +static void +AdvancePollState(Safekeeper *sk, uint32 events) +{ + /* + * Sanity check. We assume further down that the operations don't + * block because the socket is ready. + */ + AssertEventsOkForState(events, sk); + + /* Execute the code corresponding to the current state */ + switch (sk->state) + { + /* + * safekeepers are only taken out of SS_OFFLINE by calls to + * ResetConnection + */ + case SS_OFFLINE: + elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", + sk->host, sk->port); + break; /* actually unreachable, but prevents + * -Wimplicit-fallthrough */ + + /* + * Both connecting states run the same logic. The only + * difference is the events they're expecting + */ + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: + HandleConnectionEvent(sk); + break; + + /* + * Waiting for a successful CopyBoth response. + */ + case SS_WAIT_EXEC_RESULT: + RecvStartWALPushResult(sk); + break; + + /* + * Finish handshake comms: receive information about the safekeeper. + */ + case SS_HANDSHAKE_RECV: + RecvAcceptorGreeting(sk); + break; + + /* + * Voting is an idle state - we don't expect any events to trigger. + * Refer to the execution of SS_HANDSHAKE_RECV to see how nodes are + * transferred from SS_VOTING to sending actual vote requests. + */ + case SS_VOTING: + elog(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); + return; + + /* Read the safekeeper response for our candidate */ + case SS_WAIT_VERDICT: + RecvVoteResponse(sk); + break; + + /* Flush proposer announcement message */ + case SS_SEND_ELECTED_FLUSH: + + /* + * AsyncFlush ensures we only move on to SS_ACTIVE once the flush + * completes. If we still have more to do, we'll wait until the next + * poll comes along. 
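+			 * AsyncFlush also returns false when the connection had to be
+			 * reset; in that case the state is no longer
+			 * SS_SEND_ELECTED_FLUSH, so simply returning is correct in both
+			 * situations.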
+ */ + if (!AsyncFlush(sk)) + return; + + /* flush is done, event set and state will be updated later */ + StartStreaming(sk); + break; + + /* + * Idle state for waiting votes from quorum. + */ + case SS_IDLE: + elog(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); + return; + + /* + * Active state is used for streaming WAL and receiving feedback. + */ + case SS_ACTIVE: + HandleActiveState(sk, events); + break; + } +} + +static void +HandleConnectionEvent(Safekeeper *sk) +{ + WalProposerConnectPollStatusType result = walprop_connect_poll(sk->conn); + + /* The new set of events we'll wait on, after updating */ + uint32 new_events = WL_NO_EVENTS; + + switch (result) + { + case WP_CONN_POLLING_OK: + elog(LOG, "connected with node %s:%s", sk->host, + sk->port); + + /* + * We have to pick some event to update event set. + * We'll eventually need the socket to be readable, + * so we go with that. + */ + new_events = WL_SOCKET_READABLE; + break; + + /* + * If we need to poll to finish connecting, + * continue doing that + */ + case WP_CONN_POLLING_READING: + sk->state = SS_CONNECTING_READ; + new_events = WL_SOCKET_READABLE; + break; + case WP_CONN_POLLING_WRITING: + sk->state = SS_CONNECTING_WRITE; + new_events = WL_SOCKET_WRITEABLE; + break; + + case WP_CONN_POLLING_FAILED: + elog(WARNING, "failed to connect to node '%s:%s': %s", + sk->host, sk->port, walprop_error_message(sk->conn)); + + /* + * If connecting failed, we don't want to restart + * the connection because that might run us into a + * loop. Instead, shut it down -- it'll naturally + * restart at a slower interval on calls to + * ReconnectSafekeepers. + */ + ShutdownConnection(sk); + return; + } + + /* + * Because PQconnectPoll can change the socket, we have to + * un-register the old event and re-register an event on + * the new socket. + */ + HackyRemoveWalProposerEvent(sk); + sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk); + + /* If we successfully connected, send START_WAL_PUSH query */ + if (result == WP_CONN_POLLING_OK) + SendStartWALPush(sk); +} + +/* + * Send "START_WAL_PUSH" message as an empty query to the safekeeper. Performs + * a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. If something + * goes wrong, change state to SS_OFFLINE and shutdown the connection. + */ +static void +SendStartWALPush(Safekeeper *sk) +{ + if (!walprop_send_query(sk->conn, "START_WAL_PUSH")) + { + elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", + sk->host, sk->port, walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return; + } + sk->state = SS_WAIT_EXEC_RESULT; + UpdateEventSet(sk, WL_SOCKET_READABLE); +} + +static void +RecvStartWALPushResult(Safekeeper *sk) +{ + switch (walprop_get_query_result(sk->conn)) + { + /* + * Successful result, move on to starting the + * handshake + */ + case WP_EXEC_SUCCESS_COPYBOTH: + + SendProposerGreeting(sk); + break; + + /* + * Needs repeated calls to finish. 
Wait until the + * socket is readable + */ + case WP_EXEC_NEEDS_INPUT: + + /* + * SS_WAIT_EXEC_RESULT is always reached through an + * event, so we don't need to update the event set + */ + break; + + case WP_EXEC_FAILED: + elog(WARNING, "Failed to send query to safekeeper %s:%s: %s", + sk->host, sk->port, walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return; + + /* + * Unexpected result -- funamdentally an error, but we + * want to produce a custom message, rather than a + * generic "something went wrong" + */ + case WP_EXEC_UNEXPECTED_SUCCESS: + elog(WARNING, "Received bad response from safekeeper %s:%s query execution", + sk->host, sk->port); + ShutdownConnection(sk); + return; + } +} + +/* + * Start handshake: first of all send information about the + * safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for + * a response to finish the handshake. + */ +static void +SendProposerGreeting(Safekeeper *sk) +{ + /* + * On failure, logging & resetting the connection is handled. + * We just need to handle the control flow. + */ + BlockingWrite(sk, &greetRequest, sizeof(greetRequest), SS_HANDSHAKE_RECV); +} + +static void +RecvAcceptorGreeting(Safekeeper *sk) +{ + /* + * If our reading doesn't immediately succeed, any necessary + * error handling or state setting is taken care of. We can + * leave any other work until later. + */ + sk->greetResponse.apm.tag = 'g'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) + return; + + /* Protocol is all good, move to voting. */ + sk->state = SS_VOTING; + + ++n_connected; + if (n_connected <= quorum) + { + /* We're still collecting terms from the majority. */ + propTerm = Max(sk->greetResponse.term, propTerm); + + /* Quorum is acquried, prepare the vote request. */ + if (n_connected == quorum) + { + propTerm++; + elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm); + + voteRequest = (VoteRequest) + { + .tag = 'v', + .term = propTerm + }; + memcpy(voteRequest.proposerId.data, greetRequest.proposerId.data, UUID_LEN); + } + } + else if (sk->greetResponse.term > propTerm) + { + /* Another compute with higher term is running. */ + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->greetResponse.term, propTerm); + } + + /* + * Check if we have quorum. If there aren't enough safekeepers, + * wait and do nothing. We'll eventually get a task when the + * election starts. + * + * If we do have quorum, we can start an election. + */ + if (n_connected < quorum) + { + /* + * SS_VOTING is an idle state; read-ready indicates the + * connection closed. + */ + UpdateEventSet(sk, WL_SOCKET_READABLE); + } + else + { + /* + * Now send voting request to the cohort and wait + * responses + */ + for (int j = 0; j < n_safekeepers; j++) + { + /* + * Remember: SS_VOTING indicates that the safekeeper is + * participating in voting, but hasn't sent anything + * yet. 
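As a side note on the term arithmetic in RecvAcceptorGreeting above: the proposer keeps the highest term reported by the first quorum of greetings and then campaigns with that term plus one. A minimal sketch, for illustration only and not part of the patch (the helper name and its inputs are hypothetical):

	/* Hypothetical helper mirroring the propTerm logic in RecvAcceptorGreeting above. */
	static term_t
	choose_proposer_term(const term_t *greeted_terms, int quorum)
	{
		term_t	t = 0;

		/* highest term seen among the first `quorum` acceptors to greet us */
		for (int i = 0; i < quorum; i++)
			t = Max(t, greeted_terms[i]);

		/* propose with a strictly newer term */
		return t + 1;
	}

For example, with quorum = 2 and greeted terms 3 and 5 the proposer campaigns with term 6; a greeting that arrives later carrying a term above propTerm means another compute is running and is treated as fatal.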
+ */ + if (safekeeper[j].state == SS_VOTING) + SendVoteRequest(&safekeeper[j]); + } + } +} + +static void +SendVoteRequest(Safekeeper *sk) +{ + /* We have quorum for voting, send our vote request */ + elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term); + /* On failure, logging & resetting is handled */ + if (!BlockingWrite(sk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) + return; + + /* If successful, wait for read-ready with SS_WAIT_VERDICT */ +} + +static void +RecvVoteResponse(Safekeeper *sk) +{ + sk->voteResponse.apm.tag = 'v'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) + return; + + elog(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + + /* + * In case of acceptor rejecting our vote, bail out, but only + * if either it already lives in strictly higher term + * (concurrent compute spotted) or we are not elected yet and + * thus need the vote. + */ + if ((!sk->voteResponse.voteGiven) && + (sk->voteResponse.term > propTerm || n_votes < quorum)) + { + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->voteResponse.term, propTerm); + } + Assert(sk->voteResponse.term == propTerm); + + /* Handshake completed, do we have quorum? */ + n_votes++; + if (n_votes < quorum) + { + sk->state = SS_IDLE; /* can't do much yet, no quorum */ + } + else if (n_votes > quorum) + { + /* recovery already performed, just start streaming */ + SendProposerElected(sk); + } + else + { + sk->state = SS_IDLE; + UpdateEventSet(sk, WL_SOCKET_READABLE); /* Idle states wait for + * read-ready */ + + HandleElectedProposer(); + } +} + +/* + * Called once a majority of acceptors have voted for us and current proposer + * has been elected. + * + * Sends ProposerElected message to all acceptors in SS_IDLE state and starts + * replication from walsender. + */ +static void +HandleElectedProposer(void) +{ + DetermineEpochStartLsn(); + + /* + * Check if not all safekeepers are up-to-date, we need to + * download WAL needed to synchronize them + */ + if (truncateLsn < propEpochStartLsn) + { + elog(LOG, + "start recovery because truncateLsn=%X/%X is not " + "equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), + LSN_FORMAT_ARGS(propEpochStartLsn)); + /* Perform recovery */ + if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn)) + elog(FATAL, "Failed to recover state"); + } + else if (syncSafekeepers) + { + /* Sync is not needed: just exit */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].state == SS_IDLE) + SendProposerElected(&safekeeper[i]); + } + + /* + * The proposer has been elected, and there will be no quorum waiting + * after this point. There will be no safekeeper with state SS_IDLE + * also, because that state is used only for quorum waiting. 
+ */ + + if (syncSafekeepers) + { + /* + * Send empty message to enforce receiving feedback + * even from nodes who are fully recovered; this is + * required to learn they switched epoch which finishes + * sync-safeekepers who doesn't generate any real new + * records. Will go away once we switch to async acks. + */ + BroadcastAppendRequest(); + + /* keep polling until all safekeepers are synced */ + return; + } + + WalProposerStartStreaming(propEpochStartLsn); + /* Should not return here */ +} + +/* latest term in TermHistory, or 0 is there is no entries */ +static term_t +GetHighestTerm(TermHistory *th) +{ + return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0; +} + +/* safekeeper's epoch is the term of the highest entry in the log */ +static term_t +GetEpoch(Safekeeper *sk) +{ + return GetHighestTerm(&sk->voteResponse.termHistory); +} + +/* If LSN points to the page header, skip it */ +static XLogRecPtr +SkipXLogPageHeader(XLogRecPtr lsn) +{ + if (XLogSegmentOffset(lsn, wal_segment_size) == 0) + { + lsn += SizeOfXLogLongPHD; + } + else if (lsn % XLOG_BLCKSZ == 0) + { + lsn += SizeOfXLogShortPHD; + } + return lsn; +} + +/* + * Called after majority of acceptors gave votes, it calculates the most + * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since + * which we'll write WAL in our term. + * + * Sets truncateLsn along the way (though it is not of much use at this point -- + * only for skipping recovery). + */ +static void +DetermineEpochStartLsn(void) +{ + TermHistory *dth; + + propEpochStartLsn = InvalidXLogRecPtr; + donorEpoch = 0; + truncateLsn = InvalidXLogRecPtr; + timelineStartLsn = InvalidXLogRecPtr; + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].state == SS_IDLE) + { + if (GetEpoch(&safekeeper[i]) > donorEpoch || + (GetEpoch(&safekeeper[i]) == donorEpoch && + safekeeper[i].voteResponse.flushLsn > propEpochStartLsn)) + { + donorEpoch = GetEpoch(&safekeeper[i]); + propEpochStartLsn = safekeeper[i].voteResponse.flushLsn; + donor = i; + } + truncateLsn = Max(safekeeper[i].voteResponse.truncateLsn, truncateLsn); + + if (safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) + { + /* timelineStartLsn should be the same everywhere or unknown */ + if (timelineStartLsn != InvalidXLogRecPtr && + timelineStartLsn != safekeeper[i].voteResponse.timelineStartLsn) + { + elog(WARNING, + "inconsistent timelineStartLsn: current %X/%X, received %X/%X", + LSN_FORMAT_ARGS(timelineStartLsn), + LSN_FORMAT_ARGS(safekeeper[i].voteResponse.timelineStartLsn)); + } + timelineStartLsn = safekeeper[i].voteResponse.timelineStartLsn; + } + } + } + + /* + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing was + * committed yet. Start streaming then from the basebackup LSN. + */ + if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) + { + propEpochStartLsn = truncateLsn = GetRedoStartLsn(); + if (timelineStartLsn == InvalidXLogRecPtr) + { + timelineStartLsn = GetRedoStartLsn(); + } + elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); + } + + /* + * If propEpochStartLsn is not 0, at least one msg with WAL was sent to + * some connected safekeeper; it must have carried truncateLsn pointing to + * the first record. + */ + Assert((truncateLsn != InvalidXLogRecPtr) || + (syncSafekeepers && truncateLsn == propEpochStartLsn)); + + /* + * We will be generating WAL since propEpochStartLsn, so we should set + * availableLsn to mark this LSN as the latest available position. 
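To make the donor choice in DetermineEpochStartLsn above concrete: voters are ordered by (epoch, flushLsn). A minimal sketch, for illustration only and not part of the patch (the helper is hypothetical):

	/* Hypothetical helper mirroring the donor comparison in DetermineEpochStartLsn above. */
	static bool
	is_better_donor(term_t epoch, XLogRecPtr flushLsn,
					term_t donorEpoch, XLogRecPtr donorFlushLsn)
	{
		/* higher epoch wins; within the same epoch, more flushed WAL wins */
		return epoch > donorEpoch ||
			(epoch == donorEpoch && flushLsn > donorFlushLsn);
	}

For example, voters reporting (epoch 2, flushLsn 0/5000), (epoch 3, 0/4000) and (epoch 3, 0/4800) elect the third one as donor, so propEpochStartLsn becomes 0/4800, while truncateLsn is simply the maximum of the reported truncateLsn values.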
+ */ + availableLsn = propEpochStartLsn; + + /* + * Proposer's term history is the donor's + its own entry. + */ + dth = &safekeeper[donor].voteResponse.termHistory; + propTermHistory.n_entries = dth->n_entries + 1; + propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); + memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); + propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm; + propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn; + + elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", + quorum, + propTerm, + LSN_FORMAT_ARGS(propEpochStartLsn), + safekeeper[donor].host, safekeeper[donor].port, + LSN_FORMAT_ARGS(truncateLsn) + ); + + /* + * Ensure the basebackup we are running (at RedoStartLsn) matches LSN since + * which we are going to write according to the consensus. If not, we must + * bail out, as clog and other non rel data is inconsistent. + */ + if (!syncSafekeepers) + { + /* + * Basebackup LSN always points to the beginning of the record (not the + * page), as StartupXLOG most probably wants it this way. Safekeepers + * don't skip header as they need continious stream of data, so + * correct LSN for comparison. + */ + if (SkipXLogPageHeader(propEpochStartLsn) != GetRedoStartLsn()) + { + /* + * However, allow to proceed if previously elected leader was me; plain + * restart of walproposer not intervened by concurrent compute (who could + * generate WAL) is ok. + */ + if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == + walprop_shared->mineLastElectedTerm))) + { + elog(PANIC, + "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", + LSN_FORMAT_ARGS(propEpochStartLsn), + LSN_FORMAT_ARGS(GetRedoStartLsn())); + } + } + walprop_shared->mineLastElectedTerm = propTerm; + } +} + +/* + * Receive WAL from most advanced safekeeper + */ +static bool +WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +{ + char conninfo[MAXCONNINFO]; + char *err; + WalReceiverConn *wrconn; + WalRcvStreamOptions options; + + sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + safekeeper[donor].host, safekeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); + wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); + if (!wrconn) + { + ereport(WARNING, + (errmsg("could not connect to WAL acceptor %s:%s: %s", + safekeeper[donor].host, safekeeper[donor].port, + err))); + return false; + } + elog(LOG, + "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " + "%d", + safekeeper[donor].host, safekeeper[donor].port, (uint32) (startpos >> 32), + (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); + + options.logical = false; + options.startpoint = startpos; + options.slotname = NULL; + options.proto.physical.startpointTLI = timeline; + + if (walrcv_startstreaming(wrconn, &options)) + { + XLogRecPtr rec_start_lsn; + XLogRecPtr rec_end_lsn = 0; + int len; + char *buf; + pgsocket wait_fd = PGINVALID_SOCKET; + + while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) + { + if (len == 0) + { + (void) WaitLatchOrSocket( + MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, + -1, WAIT_EVENT_WAL_RECEIVER_MAIN); + } + else + { + Assert(buf[0] == 'w' || buf[0] == 'k'); + if (buf[0] == 'k') + continue; /* keepalive */ + 
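+				/*
+				 * 'w' (XLogData) messages are framed as a 1-byte tag followed
+				 * by three 8-byte fields: the starting LSN of the payload,
+				 * the sender's current WAL end, and a send timestamp
+				 * (XLOG_HDR_SIZE bytes in total); the WAL bytes themselves
+				 * follow the header.
+				 */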
memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], + sizeof rec_start_lsn); + rec_start_lsn = pg_ntoh64(rec_start_lsn); + rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; + + /* write WAL to disk */ + XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); + + ereport(DEBUG1, + (errmsg("Recover message %X/%X length %d", + LSN_FORMAT_ARGS(rec_start_lsn), len))); + if (rec_end_lsn >= endpos) + break; + } + } + ereport(LOG, + (errmsg("end of replication stream at %X/%X: %m", + LSN_FORMAT_ARGS(rec_end_lsn)))); + walrcv_disconnect(wrconn); + + /* failed to receive all WAL till endpos */ + if (rec_end_lsn < endpos) + return false; + } + else + { + ereport(LOG, + (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", + timeline, (uint32) (startpos >> 32), (uint32) startpos))); + return false; + } + + return true; +} + +/* + * Determine for sk the starting streaming point and send it message + * 1) Announcing we are elected proposer (which immediately advances epoch if + * safekeeper is synced, being important for sync-safekeepers) + * 2) Communicating starting streaming point -- safekeeper must truncate its WAL + * beyond it -- and history of term switching. + * + * Sets sk->startStreamingAt. + */ +static void +SendProposerElected(Safekeeper *sk) +{ + ProposerElected msg; + TermHistory *th; + term_t lastCommonTerm; + int i; + + /* + * Determine start LSN by comparing safekeeper's log term switch history and + * proposer's, searching for the divergence point. + * + * Note: there is a vanishingly small chance of no common point even if + * there is some WAL on safekeeper, if immediately after bootstrap compute + * wrote some WAL on single sk and died; we stream since the beginning then. + */ + th = &sk->voteResponse.termHistory; + /* + * If any WAL is present on the sk, it must be authorized by some term. + * OTOH, without any WAL there are no term swiches in the log. + */ + Assert((th->n_entries == 0) == + (sk->voteResponse.flushLsn == InvalidXLogRecPtr)); + /* We must start somewhere. */ + Assert(propTermHistory.n_entries >= 1); + + for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++) + { + if (propTermHistory.entries[i].term != th->entries[i].term) + break; + /* term must begin everywhere at the same point */ + Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); + } + i--; /* step back to the last common term */ + if (i < 0) + { + /* safekeeper is empty or no common point, start from the beginning */ + sk->startStreamingAt = propTermHistory.entries[0].lsn; + + if (sk->startStreamingAt < truncateLsn) + { + /* + * There's a gap between the WAL starting point and a truncateLsn, + * which can't appear in a normal working cluster. That gap means + * that all safekeepers reported that they have persisted WAL up + * to the truncateLsn before, but now current safekeeper tells + * otherwise. + * + * Also we have a special condition here, which is empty safekeeper + * with no history. In combination with a gap, that can happen when + * we introduce a new safekeeper to the cluster. This is a rare case, + * which is triggered manually for now, and should be treated with + * care. + */ + + /* + * truncateLsn will not change without ack from current safekeeper, + * and it's aligned to the WAL record, so we can safely start + * streaming from this point. 
+ */ + sk->startStreamingAt = truncateLsn; + + elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", + sk->host, sk->port, LSN_FORMAT_ARGS(propTermHistory.entries[0].lsn), + LSN_FORMAT_ARGS(sk->startStreamingAt)); + } + } + else + { + /* + * End of (common) term is the start of the next except it is the last + * one; there it is flush_lsn in case of safekeeper or, in case of + * proposer, LSN it is currently writing, but then we just pick + * safekeeper pos as it obviously can't be higher. + */ + if (propTermHistory.entries[i].term == propTerm) + { + sk->startStreamingAt = sk->voteResponse.flushLsn; + } + else + { + XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : + sk->voteResponse.flushLsn); + sk->startStreamingAt = Min(propEndLsn, skEndLsn); + } + } + + Assert(sk->startStreamingAt >= truncateLsn && sk->startStreamingAt <= availableLsn); + + msg.tag = 'e'; + msg.term = propTerm; + msg.startStreamingAt = sk->startStreamingAt; + msg.termHistory = &propTermHistory; + msg.timelineStartLsn = timelineStartLsn; + + lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0; + elog(LOG, + "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", + sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); + + resetStringInfo(&sk->outbuf); + pq_sendint64_le(&sk->outbuf, msg.tag); + pq_sendint64_le(&sk->outbuf, msg.term); + pq_sendint64_le(&sk->outbuf, msg.startStreamingAt); + pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries); + for (int i = 0; i < msg.termHistory->n_entries; i++) + { + pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); + pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); + } + pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn); + + if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) + return; + + StartStreaming(sk); +} + +/* + * Start walsender streaming replication + */ +static void +WalProposerStartStreaming(XLogRecPtr startpos) +{ + StartReplicationCmd cmd; + + elog(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); + cmd.slotname = WAL_PROPOSER_SLOT_NAME; + cmd.timeline = greetRequest.timeline; + cmd.startpoint = startpos; + StartProposerReplication(&cmd); +} + +/* + * Start streaming to safekeeper sk, always updates state to SS_ACTIVE and sets + * correct event set. + */ +static void +StartStreaming(Safekeeper *sk) +{ + /* + * This is the only entrypoint to state SS_ACTIVE. It's executed + * exactly once for a connection. + */ + sk->state = SS_ACTIVE; + sk->streamingAt = sk->startStreamingAt; + + /* event set will be updated inside SendMessageToNode */ + SendMessageToNode(sk); +} + +/* + * Try to send message to the particular node. Always updates event set. Will + * send at least one message, if socket is ready. + * + * Can be used only for safekeepers in SS_ACTIVE state. State can be changed + * in case of errors. 
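A worked example of the divergence search at the top of SendProposerElected above, with hypothetical numbers: suppose the proposer's term history is (term 1, 0/100), (term 2, 0/300), (term 4, 0/500) with propTerm = 4, and the safekeeper voted with history (term 1, 0/100), (term 3, 0/400) and flushLsn 0/450. The histories agree at index 0 and differ at index 1, so the last common term is term 1; since that is not propTerm, streaming starts at Min(0/300, 0/400) = 0/300, and the safekeeper truncates whatever it holds beyond that point before accepting the new stream.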
+ */ +static void +SendMessageToNode(Safekeeper *sk) +{ + Assert(sk->state == SS_ACTIVE); + + /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ + HandleActiveState(sk, WL_SOCKET_WRITEABLE); +} + +/* + * Broadcast new message to all caught-up safekeepers + */ +static void +BroadcastAppendRequest() +{ + for (int i = 0; i < n_safekeepers; i++) + if (safekeeper[i].state == SS_ACTIVE) + SendMessageToNode(&safekeeper[i]); +} + +static void +PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) +{ + Assert(endLsn >= beginLsn); + req->tag = 'a'; + req->term = propTerm; + req->epochStartLsn = propEpochStartLsn; + req->beginLsn = beginLsn; + req->endLsn = endLsn; + req->commitLsn = GetAcknowledgedByQuorumWALPosition(); + req->truncateLsn = truncateLsn; + req->proposerId = greetRequest.proposerId; +} + +/* + * Process all events happened in SS_ACTIVE state, update event set after that. + */ +static void +HandleActiveState(Safekeeper *sk, uint32 events) +{ + uint32 newEvents = WL_SOCKET_READABLE; + + if (events & WL_SOCKET_WRITEABLE) + if (!SendAppendRequests(sk)) + return; + + if (events & WL_SOCKET_READABLE) + if (!RecvAppendResponses(sk)) + return; + + /* + * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data + * in the buffer. + * + * LSN comparison checks if we have pending unsent messages. This check isn't + * necessary now, because we always send append messages immediately after + * arrival. But it's good to have it here in case we change this behavior + * in the future. + */ + if (sk->streamingAt != availableLsn || sk->flushWrite) + newEvents |= WL_SOCKET_WRITEABLE; + + UpdateEventSet(sk, newEvents); +} + +/* + * Send WAL messages starting from sk->streamingAt until the end or non-writable + * socket, whichever comes first. Caller should take care of updating event set. + * Even if no unsent WAL is available, at least one empty message will be sent + * as a heartbeat, if socket is ready. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + */ +static bool +SendAppendRequests(Safekeeper *sk) +{ + XLogRecPtr endLsn; + AppendRequestHeader *req; + PGAsyncWriteResult writeResult; + WALReadError errinfo; + bool sentAnything = false; + + if (sk->flushWrite) + { + if (!AsyncFlush(sk)) + /* + * AsyncFlush failed, that could happen if the socket is closed or + * we have nothing to write and should wait for writeable socket. 
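+			 * In the first case sk->state is still SS_ACTIVE and sk->flushWrite
+			 * stays set, so the return below reports success and
+			 * HandleActiveState keeps waiting for WL_SOCKET_WRITEABLE; if the
+			 * connection was reset, the state has changed and we report
+			 * failure instead.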
+ */ + return sk->state == SS_ACTIVE; + + /* Event set will be updated in the end of HandleActiveState */ + sk->flushWrite = false; + } + + while (sk->streamingAt != availableLsn || !sentAnything) + { + sentAnything = true; + + endLsn = sk->streamingAt; + endLsn += MAX_SEND_SIZE; + + /* if we went beyond available WAL, back off */ + if (endLsn > availableLsn) { + endLsn = availableLsn; + } + + req = &sk->appendRequest; + PrepareAppendRequest(&sk->appendRequest, sk->streamingAt, endLsn); + + ereport(DEBUG2, + (errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port))); + + resetStringInfo(&sk->outbuf); + + /* write AppendRequest header */ + appendBinaryStringInfo(&sk->outbuf, (char*) req, sizeof(AppendRequestHeader)); + + /* write the WAL itself */ + enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); + if (!WALRead(sk->xlogreader, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, + ThisTimeLineID, + &errinfo)) + { + WALReadRaiseError(&errinfo); + } + sk->outbuf.len += req->endLsn - req->beginLsn; + + writeResult = walprop_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len); + + /* Mark current message as sent, whatever the result is */ + sk->streamingAt = endLsn; + + switch (writeResult) + { + case PG_ASYNC_WRITE_SUCCESS: + /* Continue writing the next message */ + break; + + case PG_ASYNC_WRITE_TRY_FLUSH: + /* + * We still need to call PQflush some more to finish the job. + * Caller function will handle this by setting right event set. + */ + sk->flushWrite = true; + return true; + + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } + } + + return true; +} + +/* + * Receive and process all available feedback. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + * + * NB: This function can call SendMessageToNode and produce new messages. + */ +static bool +RecvAppendResponses(Safekeeper *sk) +{ + XLogRecPtr minQuorumLsn; + bool readAnything = false; + + while (true) + { + /* + * If our reading doesn't immediately succeed, any + * necessary error handling or state setting is taken care + * of. We can leave any other work until later. + */ + sk->appendResponse.apm.tag = 'a'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) + break; + + ereport(DEBUG2, + (errmsg("received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", + sk->appendResponse.term, + LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), + LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), + sk->host, sk->port))); + + if (sk->appendResponse.term > propTerm) + { + /* Another compute with higher term is running. */ + elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", + sk->host, sk->port, + sk->appendResponse.term, propTerm); + } + + readAnything = true; + } + + if (!readAnything) + return sk->state == SS_ACTIVE; + + HandleSafekeeperResponse(); + + /* + * Also send the new commit lsn to all the safekeepers. 
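+	 * The commit lsn is the position acknowledged by a quorum of safekeepers,
+	 * as computed by GetAcknowledgedByQuorumWALPosition below: for example,
+	 * with three safekeepers and quorum = 2, flush positions 0/80, 0/100 and
+	 * 0/120 give a commit lsn of 0/100, the highest position persisted on at
+	 * least two nodes (positions before epochStartLsn count as 0). We
+	 * broadcast only when it actually advances.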
+ */ + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + if (minQuorumLsn > lastSentCommitLsn) + { + BroadcastAppendRequest(); + lastSentCommitLsn = minQuorumLsn; + } + + return sk->state == SS_ACTIVE; +} + +/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ +void +ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *rf) +{ + uint8 nkeys; + int i; + int32 len; + + /* get number of custom keys */ + nkeys = pq_getmsgbyte(reply_message); + + for (i = 0; i < nkeys; i++) + { + const char *key = pq_getmsgstring(reply_message); + if (strcmp(key, "current_timeline_size") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->currentClusterSize = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); + } + else if (strcmp(key, "ps_writelsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_writelsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_writelsn)); + } + else if (strcmp(key, "ps_flushlsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_flushlsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_flushlsn)); + } + else if (strcmp(key, "ps_applylsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_applylsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_applylsn)); + } + else if (strcmp(key, "ps_replytime") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_replytime = pq_getmsgint64(reply_message); + { + char *replyTimeStr; + + /* Copy because timestamptz_to_str returns a static buffer */ + replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", + rf->ps_replytime, replyTimeStr); + + pfree(replyTimeStr); + } + } + else + { + len = pq_getmsgint(reply_message, sizeof(int32)); // read value length + // Skip unknown keys to support backward compatibile protocol changes + elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + pq_getmsgbytes(reply_message, len); + }; + } +} + +/* + * Combine hot standby feedbacks from all safekeepers. + */ +static void +CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) +{ + hs->ts = 0; + hs->xmin.value = ~0; /* largest unsigned value */ + hs->catalog_xmin.value = ~0; /* largest unsigned value */ + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.hs.ts != 0) + { + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin)) + { + hs->xmin = safekeeper[i].appendResponse.hs.xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; + } + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin)) + { + hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; + } + } + } +} + + +/* + * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the + * last WAL record that can be safely discarded. + */ +static XLogRecPtr +CalculateMinFlushLsn(void) +{ + XLogRecPtr lsn = n_safekeepers > 0 + ? 
safekeeper[0].appendResponse.flushLsn + : InvalidXLogRecPtr; + for (int i = 1; i < n_safekeepers; i++) + { + lsn = Min(lsn, safekeeper[i].appendResponse.flushLsn); + } + return lsn; +} + +/* + * Calculate WAL position acknowledged by quorum + */ +static XLogRecPtr +GetAcknowledgedByQuorumWALPosition(void) +{ + XLogRecPtr responses[MAX_SAFEKEEPERS]; + + /* + * Sort acknowledged LSNs + */ + for (int i = 0; i < n_safekeepers; i++) + { + /* + * Like in Raft, we aren't allowed to commit entries from previous + * terms, so ignore reported LSN until it gets to epochStartLsn. + */ + responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? + safekeeper[i].appendResponse.flushLsn : 0; + } + qsort(responses, n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + + /* + * Get the smallest LSN committed by quorum + */ + return responses[n_safekeepers - quorum]; +} + +/* + * ReplicationFeedbackShmemSize --- report amount of shared memory space needed + */ +Size +WalproposerShmemSize(void) +{ + return sizeof(WalproposerShmemState); +} + +bool +WalproposerShmemInit(void) +{ + bool found; + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + walprop_shared = ShmemInitStruct("Walproposer shared state", + sizeof(WalproposerShmemState), + &found); + + if (!found) + { + memset(walprop_shared, 0, WalproposerShmemSize()); + SpinLockInit(&walprop_shared->mutex); + } + LWLockRelease(AddinShmemInitLock); + + return found; +} + +void +replication_feedback_set(ReplicationFeedback *rf) +{ + SpinLockAcquire(&walprop_shared->mutex); + memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); + SpinLockRelease(&walprop_shared->mutex); +} + + +void +replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) +{ + SpinLockAcquire(&walprop_shared->mutex); + *writeLsn = walprop_shared->feedback.ps_writelsn; + *flushLsn = walprop_shared->feedback.ps_flushlsn; + *applyLsn = walprop_shared->feedback.ps_applylsn; + SpinLockRelease(&walprop_shared->mutex); +} + + +/* + * Get ReplicationFeedback fields from the most advanced safekeeper + */ +static void +GetLatestZentihFeedback(ReplicationFeedback *rf) +{ + int latest_safekeeper = 0; + XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) + { + latest_safekeeper = i; + ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn; + } + } + + rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; + rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn; + rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn; + rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; + rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; + + elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," + " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->ps_writelsn), + LSN_FORMAT_ARGS(rf->ps_flushlsn), + LSN_FORMAT_ARGS(rf->ps_applylsn), + rf->ps_replytime); + + replication_feedback_set(rf); +} + +static void +HandleSafekeeperResponse(void) +{ + HotStandbyFeedback hsFeedback; + XLogRecPtr minQuorumLsn; + XLogRecPtr diskConsistentLsn; + XLogRecPtr minFlushLsn; + + + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; + + if (!syncSafekeepers) + { + // Get ReplicationFeedback fields 
from the most advanced safekeeper + GetLatestZentihFeedback(&quorumFeedback.rf); + SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); + } + + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn) + { + + if (minQuorumLsn > quorumFeedback.flushLsn) + quorumFeedback.flushLsn = minQuorumLsn; + + /* advance the replication slot */ + if (!syncSafekeepers) + ProcessStandbyReply( + // write_lsn - This is what durably stored in WAL service. + quorumFeedback.flushLsn, + //flush_lsn - This is what durably stored in WAL service. + quorumFeedback.flushLsn, + //apply_lsn - This is what processed and durably saved at pageserver. + quorumFeedback.rf.ps_flushlsn, + GetCurrentTimestamp(), false); + } + + CombineHotStanbyFeedbacks(&hsFeedback); + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) + { + quorumFeedback.hs = hsFeedback; + if (!syncSafekeepers) + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + } + + /* + * Try to advance truncateLsn to minFlushLsn, which is the last record + * flushed to all safekeepers. We must always start streaming from the + * beginning of the record, which simplifies decoding on the far end. + * + * Advanced truncateLsn should be not further than nearest commitLsn. + * This prevents surprising violation of truncateLsn <= commitLsn + * invariant which might occur because 1) truncateLsn can be advanced + * immediately once chunk is broadcast to all safekeepers, and + * commitLsn generally can't be advanced based on feedback from + * safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft); 2) chunks we + * read from WAL and send are plain sheets of bytes, but safekeepers + * ack only on record boundaries. + */ + minFlushLsn = CalculateMinFlushLsn(); + if (minFlushLsn > truncateLsn) + { + truncateLsn = minFlushLsn; + + /* + * Advance the replication slot to free up old WAL files. Note + * that slot doesn't exist if we are in syncSafekeepers mode. + */ + if (MyReplicationSlot) + PhysicalConfirmReceivedLocation(truncateLsn); + } + + /* + * Generally sync is done when majority switched the epoch so we committed + * epochStartLsn and made the majority aware of it, ensuring they are + * ready to give all WAL to pageserver. It would mean whichever majority + * is alive, there will be at least one safekeeper who is able to stream + * WAL to pageserver to make basebackup possible. However, since at the + * moment we don't have any good mechanism of defining the healthy and + * most advanced safekeeper who should push the wal into pageserver and + * basically the random one gets connected, to prevent hanging basebackup + * (due to pageserver connecting to not-synced-safekeeper) we currently + * wait for all seemingly alive safekeepers to get synced. + */ + if (syncSafekeepers) + { + int n_synced; + + n_synced = 0; + for (int i = 0; i < n_safekeepers; i++) + { + Safekeeper *sk = &safekeeper[i]; + bool synced = sk->appendResponse.commitLsn >= propEpochStartLsn; + + /* alive safekeeper which is not synced yet; wait for it */ + if (sk->state != SS_OFFLINE && !synced) + return; + if (synced) + n_synced++; + } + if (n_synced >= quorum) + { + /* All safekeepers synced! 
*/ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + } +} + +/* + * Try to read CopyData message from i'th safekeeper, resetting connection on + * failure. + */ +static bool +AsyncRead(Safekeeper *sk, char **buf, int *buf_size) +{ + switch (walprop_async_read(sk->conn, buf, buf_size)) + { + case PG_ASYNC_READ_SUCCESS: + return true; + + case PG_ASYNC_READ_TRY_AGAIN: + /* WL_SOCKET_READABLE is always set during copyboth */ + return false; + + case PG_ASYNC_READ_FAIL: + elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, + sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + } + Assert(false); + return false; +} + +/* + * Read next message with known type into provided struct, by reading a CopyData + * block from the safekeeper's postgres connection, returning whether the read + * was successful. + * + * If the read needs more polling, we return 'false' and keep the state + * unmodified, waiting until it becomes read-ready to try again. If it fully + * failed, a warning is emitted and the connection is reset. + */ +static bool +AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) +{ + char *buf; + int buf_size; + uint64 tag; + StringInfoData s; + + if (!(AsyncRead(sk, &buf, &buf_size))) + return false; + + /* parse it */ + s.data = buf; + s.len = buf_size; + s.cursor = 0; + + tag = pq_getmsgint64_le(&s); + if (tag != anymsg->tag) + { + elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); + return false; + } + + switch (tag) + { + case 'g': + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + msg->timelineStartLsn = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) + ParseReplicationFeedbackMessage(&s, &msg->rf); + pq_getmsgend(&s); + return true; + } + + default: + { + Assert(false); + return false; + } + } +} + +/* + * Blocking equivalent to AsyncWrite. + * + * We use this everywhere messages are small enough that they should fit in a + * single packet. 
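A note on the AppendResponse parsing in AsyncReadMessage above: on the wire the fixed part of an 'a' reply is seven little-endian 8-byte fields, namely the tag, term, flushLsn, commitLsn and the three hot-standby-feedback values. Every member of AppendResponse up to rf is 8 bytes wide, so APPENDRESPONSE_FIXEDPART_SIZE is exactly that 56-byte prefix, and a CopyData payload longer than that must carry the extensible key/value ReplicationFeedback, which is handed to ParseReplicationFeedbackMessage.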
+ */ +static bool +BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) +{ + uint32 events; + + if (!walprop_blocking_write(sk->conn, msg, msg_size)) + { + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + } + + sk->state = success_state; + + /* + * If the new state will be waiting for events to happen, update the event + * set to wait for those + */ + events = SafekeeperStateDesiredEvents(success_state); + if (events) + UpdateEventSet(sk, events); + + return true; +} + +/* + * Starts a write into the 'i'th safekeeper's postgres connection, moving to + * flush_state (adjusting eventset) if write still needs flushing. + * + * Returns false if sending is unfinished (requires flushing or conn failed). + * Upon failure, a warning is emitted and the connection is reset. + */ +static bool +AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state) +{ + switch (walprop_async_write(sk->conn, msg, msg_size)) + { + case PG_ASYNC_WRITE_SUCCESS: + return true; + case PG_ASYNC_WRITE_TRY_FLUSH: + + /* + * We still need to call PQflush some more to finish the job; go + * to the appropriate state. Update the event set at the bottom of + * this function + */ + sk->state = flush_state; + UpdateEventSet(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); + return false; + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } +} + +/* + * Flushes a previous call to AsyncWrite. This only needs to be called when the + * socket becomes read or write ready *after* calling AsyncWrite. + * + * If flushing successfully completes returns true, otherwise false. Event set + * is updated only if connection fails, otherwise caller should manually unset + * WL_SOCKET_WRITEABLE. + */ +static bool +AsyncFlush(Safekeeper *sk) +{ + /*--- + * PQflush returns: + * 0 if successful [we're good to move on] + * 1 if unable to send everything yet [call PQflush again] + * -1 if it failed [emit an error] + */ + switch (walprop_flush(sk->conn)) + { + case 0: + /* flush is done */ + return true; + case 1: + /* Nothing to do; try again when the socket's ready */ + return false; + case -1: + elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ResetConnection(sk); + return false; + default: + Assert(false); + return false; + } +} + +// Check if we need to suspend inserts because of lagging replication. 
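+// The lag is measured against the write/flush/apply LSNs last reported by the
+// pageserver (see replication_feedback_get_lsns above): for whichever enabled
+// max_replication_*_lag limit is exceeded first, the function below returns
+// how many bytes the local flush position has run ahead of that feedback LSN
+// beyond the allowed number of megabytes; 0 means no throttling is needed.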
+static uint64 +backpressure_lag_impl(void) +{ + if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) + { + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + XLogRecPtr myFlushLsn = GetFlushRecPtr(); + + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); +#define MB ((XLogRecPtr)1024*1024) + + elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", + LSN_FORMAT_ARGS(myFlushLsn), + LSN_FORMAT_ARGS(writePtr), + LSN_FORMAT_ARGS(flushPtr), + LSN_FORMAT_ARGS(applyPtr)); + + if ((writePtr != InvalidXLogRecPtr + && max_replication_write_lag > 0 + && myFlushLsn > writePtr + max_replication_write_lag*MB)) + { + return (myFlushLsn - writePtr - max_replication_write_lag*MB); + } + + if ((flushPtr != InvalidXLogRecPtr + && max_replication_flush_lag > 0 + && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) + { + return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); + } + + if ((applyPtr != InvalidXLogRecPtr + && max_replication_apply_lag > 0 + && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) + { + return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); + } + } + return 0; +} diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h new file mode 100644 index 0000000000..b684d5264f --- /dev/null +++ b/pgxn/neon/walproposer.h @@ -0,0 +1,540 @@ +#ifndef __NEON_WALPROPOSER_H__ +#define __NEON_WALPROPOSER_H__ + +#include "access/xlogdefs.h" +#include "postgres.h" +#include "port.h" +#include "access/xlog_internal.h" +#include "access/transam.h" +#include "nodes/replnodes.h" +#include "utils/uuid.h" +#include "replication/walreceiver.h" + +#define SK_MAGIC 0xCafeCeefu +#define SK_PROTOCOL_VERSION 2 + +#define MAX_SAFEKEEPERS 32 +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */ +#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ +#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ + +/* + * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, + * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 + */ +#define WL_NO_EVENTS 0 + +extern char* wal_acceptors_list; +extern int wal_acceptor_reconnect_timeout; +extern int wal_acceptor_connect_timeout; +extern bool am_wal_proposer; + +struct WalProposerConn; /* Defined in libpqwalproposer */ +typedef struct WalProposerConn WalProposerConn; + +struct WalMessage; +typedef struct WalMessage WalMessage; + +extern char *zenith_timeline_walproposer; +extern char *zenith_tenant_walproposer; + +/* Possible return values from ReadPGAsync */ +typedef enum +{ + /* The full read was successful. buf now points to the data */ + PG_ASYNC_READ_SUCCESS, + /* The read is ongoing. Wait until the connection is read-ready, then try + * again. */ + PG_ASYNC_READ_TRY_AGAIN, + /* Reading failed. Check PQerrorMessage(conn) */ + PG_ASYNC_READ_FAIL, +} PGAsyncReadResult; + +/* Possible return values from WritePGAsync */ +typedef enum +{ + /* The write fully completed */ + PG_ASYNC_WRITE_SUCCESS, + /* The write started, but you'll need to call PQflush some more times + * to finish it off. We just tried, so it's best to wait until the + * connection is read- or write-ready to try again. + * + * If it becomes read-ready, call PQconsumeInput and flush again. 
If it + * becomes write-ready, just call PQflush. + */ + PG_ASYNC_WRITE_TRY_FLUSH, + /* Writing failed. Check PQerrorMessage(conn) */ + PG_ASYNC_WRITE_FAIL, +} PGAsyncWriteResult; + +/* + * WAL safekeeper state, which is used to wait for some event. + * + * States are listed here in the order that they're executed. + * + * Most states, upon failure, will move back to SS_OFFLINE by calls to + * ResetConnection or ShutdownConnection. + */ +typedef enum +{ + /* + * Does not have an active connection and will stay that way until + * further notice. + * + * Moves to SS_CONNECTING_WRITE by calls to ResetConnection. + */ + SS_OFFLINE, + + /* + * Connecting states. "_READ" waits for the socket to be available for + * reading, "_WRITE" waits for writing. There's no difference in the code + * they execute when polled, but we have this distinction in order to + * recreate the event set in HackyRemoveWalProposerEvent. + * + * After the connection is made, "START_WAL_PUSH" query is sent. + */ + SS_CONNECTING_WRITE, + SS_CONNECTING_READ, + + /* + * Waiting for the result of the "START_WAL_PUSH" command. + * + * After we get a successful result, sends handshake to safekeeper. + */ + SS_WAIT_EXEC_RESULT, + + /* + * Executing the receiving half of the handshake. After receiving, moves to + * SS_VOTING. + */ + SS_HANDSHAKE_RECV, + + /* + * Waiting to participate in voting, but a quorum hasn't yet been reached. + * This is an idle state - we do not expect AdvancePollState to be called. + * + * Moved externally by execution of SS_HANDSHAKE_RECV, when we received a + * quorum of handshakes. + */ + SS_VOTING, + + /* + * Already sent voting information, waiting to receive confirmation from the + * node. After receiving, moves to SS_IDLE, if the quorum isn't reached yet. + */ + SS_WAIT_VERDICT, + + /* Need to flush ProposerElected message. */ + SS_SEND_ELECTED_FLUSH, + + /* + * Waiting for quorum to send WAL. Idle state. If the socket becomes + * read-ready, the connection has been closed. + * + * Moves to SS_ACTIVE only by call to StartStreaming. + */ + SS_IDLE, + + /* + * Active phase, when we acquired quorum and have WAL to send or feedback + * to read. + */ + SS_ACTIVE, +} SafekeeperState; + +/* Consensus logical timestamp. */ +typedef uint64 term_t; + +/* neon storage node id */ +typedef uint64 NNodeId; + +/* + * Proposer <-> Acceptor messaging. + */ + +/* Initial Proposer -> Acceptor message */ +typedef struct ProposerGreeting +{ + uint64 tag; /* message tag */ + uint32 protocolVersion; /* proposer-safekeeper protocol version */ + uint32 pgVersion; + pg_uuid_t proposerId; + uint64 systemId; /* Postgres system identifier */ + uint8 ztimelineid[16]; /* Zenith timeline id */ + uint8 ztenantid[16]; + TimeLineID timeline; + uint32 walSegSize; +} ProposerGreeting; + +typedef struct AcceptorProposerMessage +{ + uint64 tag; +} AcceptorProposerMessage; + +/* + * Acceptor -> Proposer initial response: the highest term acceptor voted for. + */ +typedef struct AcceptorGreeting +{ + AcceptorProposerMessage apm; + term_t term; + NNodeId nodeId; +} AcceptorGreeting; + +/* + * Proposer -> Acceptor vote request. + */ +typedef struct VoteRequest +{ + uint64 tag; + term_t term; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} VoteRequest; + +/* Element of term switching chain. 
*/ +typedef struct TermSwitchEntry +{ + term_t term; + XLogRecPtr lsn; +} TermSwitchEntry; + +typedef struct TermHistory +{ + uint32 n_entries; + TermSwitchEntry *entries; +} TermHistory; + +/* Vote itself, sent from safekeeper to proposer */ +typedef struct VoteResponse { + AcceptorProposerMessage apm; + term_t term; + uint64 voteGiven; + /* + * Safekeeper flush_lsn (end of WAL) + history of term switches allow + * proposer to choose the most advanced one. + */ + XLogRecPtr flushLsn; + XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some safekeeper */ + TermHistory termHistory; + XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ +} VoteResponse; + +/* + * Proposer -> Acceptor message announcing proposer is elected and communicating + * epoch history to it. + */ +typedef struct ProposerElected +{ + uint64 tag; + term_t term; + /* proposer will send since this point */ + XLogRecPtr startStreamingAt; + /* history of term switches up to this proposer */ + TermHistory *termHistory; + /* timeline globally starts at this LSN */ + XLogRecPtr timelineStartLsn; +} ProposerElected; + +/* + * Header of request with WAL message sent from proposer to safekeeper. + */ +typedef struct AppendRequestHeader +{ + uint64 tag; + term_t term; /* term of the proposer */ + /* + * LSN since which current proposer appends WAL (begin_lsn of its first + * record); determines epoch switch point. + */ + XLogRecPtr epochStartLsn; + XLogRecPtr beginLsn; /* start position of message in WAL */ + XLogRecPtr endLsn; /* end position of message in WAL */ + XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + /* + * minimal LSN which may be needed for recovery of some safekeeper (end lsn + * + 1 of last chunk streamed to everyone) + */ + XLogRecPtr truncateLsn; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} AppendRequestHeader; + +/* + * Hot standby feedback received from replica + */ +typedef struct HotStandbyFeedback +{ + TimestampTz ts; + FullTransactionId xmin; + FullTransactionId catalog_xmin; +} HotStandbyFeedback; + + +typedef struct ReplicationFeedback +{ + // current size of the timeline on pageserver + uint64 currentClusterSize; + // standby_status_update fields that safekeeper received from pageserver + XLogRecPtr ps_writelsn; + XLogRecPtr ps_flushlsn; + XLogRecPtr ps_applylsn; + TimestampTz ps_replytime; +} ReplicationFeedback; + + +typedef struct WalproposerShmemState +{ + slock_t mutex; + ReplicationFeedback feedback; + term_t mineLastElectedTerm; +} WalproposerShmemState; + +/* + * Report safekeeper state to proposer + */ +typedef struct AppendResponse +{ + AcceptorProposerMessage apm; + /* + * Current term of the safekeeper; if it is higher than proposer's, the + * compute is out of date. + */ + term_t term; + // TODO: add comment + XLogRecPtr flushLsn; + // Safekeeper reports back his awareness about which WAL is committed, as + // this is a criterion for walproposer --sync mode exit + XLogRecPtr commitLsn; + HotStandbyFeedback hs; + // Feedback recieved from pageserver includes standby_status_update fields + // and custom zenith feedback. + // This part of the message is extensible. 
+ ReplicationFeedback rf; +} AppendResponse; + +// ReplicationFeedback is extensible part of the message that is parsed separately +// Other fields are fixed part +#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) + + +/* + * Descriptor of safekeeper + */ +typedef struct Safekeeper +{ + char const* host; + char const* port; + char conninfo[MAXCONNINFO]; /* connection info for connecting/reconnecting */ + + /* + * postgres protocol connection to the WAL acceptor + * + * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we + * reach SS_ACTIVE; not before. + */ + WalProposerConn* conn; + /* + * Temporary buffer for the message being sent to the safekeeper. + */ + StringInfoData outbuf; + /* + * WAL reader, allocated for each safekeeper. + */ + XLogReaderState* xlogreader; + + /* + * Streaming will start here; must be record boundary. + */ + XLogRecPtr startStreamingAt; + + bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ + XLogRecPtr streamingAt; /* current streaming position */ + AppendRequestHeader appendRequest; /* request for sending to safekeeper */ + + int eventPos; /* position in wait event set. Equal to -1 if no event */ + SafekeeperState state; /* safekeeper state machine state */ + TimestampTz startedConnAt; /* when connection attempt started */ + AcceptorGreeting greetResponse; /* acceptor greeting */ + VoteResponse voteResponse; /* the vote */ + AppendResponse appendResponse; /* feedback for master */ +} Safekeeper; + + +extern PGDLLIMPORT void WalProposerMain(Datum main_arg); +void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); +void WalProposerPoll(void); +void WalProposerRegister(void); +void ParseReplicationFeedbackMessage(StringInfo reply_message, + ReplicationFeedback *rf); +extern void StartProposerReplication(StartReplicationCmd *cmd); + +Size WalproposerShmemSize(void); +bool WalproposerShmemInit(void); +void replication_feedback_set(ReplicationFeedback *rf); +void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); + +/* libpqwalproposer hooks & helper type */ + +/* Re-exported PostgresPollingStatusType */ +typedef enum +{ + WP_CONN_POLLING_FAILED = 0, + WP_CONN_POLLING_READING, + WP_CONN_POLLING_WRITING, + WP_CONN_POLLING_OK, + /* + * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused. + * We've removed it here to avoid clutter. + */ +} WalProposerConnectPollStatusType; + +/* Re-exported and modified ExecStatusType */ +typedef enum +{ + /* We received a single CopyBoth result */ + WP_EXEC_SUCCESS_COPYBOTH, + /* Any success result other than a single CopyBoth was received. The specifics of the result + * were already logged, but it may be useful to provide an error message indicating which + * safekeeper messed up. + * + * Do not expect PQerrorMessage to be appropriately set. */ + WP_EXEC_UNEXPECTED_SUCCESS, + /* No result available at this time. Wait until read-ready, then call again. Internally, this is + * returned when PQisBusy indicates that PQgetResult would block. */ + WP_EXEC_NEEDS_INPUT, + /* Catch-all failure. Check PQerrorMessage. */ + WP_EXEC_FAILED, +} WalProposerExecStatusType; + +/* Re-exported ConnStatusType */ +typedef enum +{ + WP_CONNECTION_OK, + WP_CONNECTION_BAD, + + /* + * The original ConnStatusType has many more tags, but requests that + * they not be relied upon (except for displaying to the user). We + * don't need that extra functionality, so we collect them into a + * single tag here. 
+ */
+ WP_CONNECTION_IN_PROGRESS,
+} WalProposerConnStatusType;
+
+/* Re-exported PQerrorMessage */
+typedef char* (*walprop_error_message_fn) (WalProposerConn* conn);
+
+/* Re-exported PQstatus */
+typedef WalProposerConnStatusType (*walprop_status_fn) (WalProposerConn* conn);
+
+/* Re-exported PQconnectStart */
+typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo);
+
+/* Re-exported PQconnectPoll */
+typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn);
+
+/* Blocking wrapper around PQsendQuery */
+typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query);
+
+/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */
+typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn);
+
+/* Re-exported PQsocket */
+typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn);
+
+/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */
+typedef int (*walprop_flush_fn) (WalProposerConn* conn);
+
+/* Re-exported PQfinish */
+typedef void (*walprop_finish_fn) (WalProposerConn* conn);
+
+/*
+ * Ergonomic wrapper around PQgetCopyData
+ *
+ * Reads a CopyData block from a safekeeper, setting *amount to the number
+ * of bytes returned.
+ *
+ * This function is allowed to assume certain properties specific to the
+ * protocol with the safekeepers, so it should not be used as-is for any
+ * other purpose.
+ *
+ * Note: If possible, using this wrapper is generally preferred, because it
+ * performs a bit of extra checking work that's always required and is normally
+ * somewhat verbose.
+ */
+typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn,
+ char** buf,
+ int* amount);
+
+/*
+ * Ergonomic wrapper around PQputCopyData + PQflush
+ *
+ * Starts to write a CopyData block to a safekeeper.
+ *
+ * For information on the meaning of return codes, refer to PGAsyncWriteResult.
+ */
+typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn,
+ void const* buf,
+ size_t size);
+
+/*
+ * Blocking equivalent to walprop_async_write_fn
+ *
+ * Returns 'true' if successful, 'false' on failure.
+ */
+typedef bool (*walprop_blocking_write_fn) (WalProposerConn* conn, void const* buf, size_t size);
+
+/* All libpqwalproposer exported functions collected together.
*/ +typedef struct WalProposerFunctionsType +{ + walprop_error_message_fn walprop_error_message; + walprop_status_fn walprop_status; + walprop_connect_start_fn walprop_connect_start; + walprop_connect_poll_fn walprop_connect_poll; + walprop_send_query_fn walprop_send_query; + walprop_get_query_result_fn walprop_get_query_result; + walprop_socket_fn walprop_socket; + walprop_flush_fn walprop_flush; + walprop_finish_fn walprop_finish; + walprop_async_read_fn walprop_async_read; + walprop_async_write_fn walprop_async_write; + walprop_blocking_write_fn walprop_blocking_write; +} WalProposerFunctionsType; + +/* Allow the above functions to be "called" with normal syntax */ +#define walprop_error_message(conn) \ + WalProposerFunctions->walprop_error_message(conn) +#define walprop_status(conn) \ + WalProposerFunctions->walprop_status(conn) +#define walprop_connect_start(conninfo) \ + WalProposerFunctions->walprop_connect_start(conninfo) +#define walprop_connect_poll(conn) \ + WalProposerFunctions->walprop_connect_poll(conn) +#define walprop_send_query(conn, query) \ + WalProposerFunctions->walprop_send_query(conn, query) +#define walprop_get_query_result(conn) \ + WalProposerFunctions->walprop_get_query_result(conn) +#define walprop_set_nonblocking(conn, arg) \ + WalProposerFunctions->walprop_set_nonblocking(conn, arg) +#define walprop_socket(conn) \ + WalProposerFunctions->walprop_socket(conn) +#define walprop_flush(conn) \ + WalProposerFunctions->walprop_flush(conn) +#define walprop_finish(conn) \ + WalProposerFunctions->walprop_finish(conn) +#define walprop_async_read(conn, buf, amount) \ + WalProposerFunctions->walprop_async_read(conn, buf, amount) +#define walprop_async_write(conn, buf, size) \ + WalProposerFunctions->walprop_async_write(conn, buf, size) +#define walprop_blocking_write(conn, buf, size) \ + WalProposerFunctions->walprop_blocking_write(conn, buf, size) + +/* + * The runtime location of the libpqwalproposer functions. + * + * This pointer is set by the initializer in libpqwalproposer, so that we + * can use it later. + */ +extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions; + +#endif /* __NEON_WALPROPOSER_H__ */ diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c new file mode 100644 index 0000000000..7b96fd580c --- /dev/null +++ b/pgxn/neon/walproposer_utils.c @@ -0,0 +1,1110 @@ +#include "postgres.h" + +#include "access/timeline.h" +#include "access/xlogutils.h" +#include "common/logging.h" +#include "common/ip.h" +#include "funcapi.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "postmaster/interrupt.h" +#include "replication/slot.h" +#include "walproposer_utils.h" +#include "replication/walsender_private.h" + +#include "storage/ipc.h" +#include "utils/builtins.h" +#include "utils/ps_status.h" + +#include "libpq-fe.h" +#include +#include + +/* + * These variables are used similarly to openLogFile/SegNo, + * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID + * corresponding the filename of walpropFile. + */ +static int walpropFile = -1; +static TimeLineID walpropFileTLI = 0; +static XLogSegNo walpropSegNo = 0; + +/* START cloned file-local variables and functions from walsender.c */ + +/* + * xlogreader used for replication. Note that a WAL sender doing physical + * replication does not need xlogreader to read WAL, but it needs one to + * keep a state of its work. 
+ */
+static XLogReaderState *xlogreader = NULL;
+
+/*
+ * These variables keep track of the state of the timeline we're currently
+ * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
+ * the timeline is not the latest timeline on this server, and the server's
+ * history forked off from that timeline at sendTimeLineValidUpto.
+ */
+static TimeLineID sendTimeLine = 0;
+static TimeLineID sendTimeLineNextTLI = 0;
+static bool sendTimeLineIsHistoric = false;
+static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr;
+
+/*
+ * Timestamp of last ProcessRepliesIfAny() that saw a reply from the
+ * standby. Set to 0 if wal_sender_timeout doesn't need to be active.
+ */
+static TimestampTz last_reply_timestamp = 0;
+
+/* Have we sent a heartbeat message asking for reply, since last reply? */
+static bool waiting_for_ping_response = false;
+
+static bool streamingDoneSending;
+static bool streamingDoneReceiving;
+
+/* Are we there yet? */
+static bool WalSndCaughtUp = false;
+
+/* Flags set by signal handlers for later service in main loop */
+static volatile sig_atomic_t got_STOPPING = false;
+
+/*
+ * How far have we sent WAL already? This is also advertised in
+ * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.)
+ */
+static XLogRecPtr sentPtr = InvalidXLogRecPtr;
+
+/*
+ * This is set while we are streaming. When not set
+ * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set,
+ * the main loop is responsible for checking got_STOPPING and terminating when
+ * it's set (after streaming any remaining WAL).
+ */
+static volatile sig_atomic_t replication_active = false;
+
+typedef void (*WalSndSendDataCallback) (void);
+static void WalSndLoop(WalSndSendDataCallback send_data);
+static void XLogSendPhysical(void);
+static XLogRecPtr GetStandbyFlushRecPtr(void);
+
+static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
+ TimeLineID *tli_p);
+
+/* END cloned file-level variables and functions from walsender.c */
+
+int
+CompareLsn(const void *a, const void *b)
+{
+ XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
+ XLogRecPtr lsn2 = *((const XLogRecPtr *) b);
+
+ if (lsn1 < lsn2)
+ return -1;
+ else if (lsn1 == lsn2)
+ return 0;
+ else
+ return 1;
+}
+
+/* Returns a human-readable string corresponding to the SafekeeperState
+ *
+ * The string should not be freed.
+ * + * The strings are intended to be used as a prefix to "state", e.g.: + * + * elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); + * + * If this sort of phrasing doesn't fit the message, instead use something like: + * + * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); + */ +char* +FormatSafekeeperState(SafekeeperState state) +{ + char* return_val = NULL; + + switch (state) + { + case SS_OFFLINE: + return_val = "offline"; + break; + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: + return_val = "connecting"; + break; + case SS_WAIT_EXEC_RESULT: + return_val = "receiving query result"; + break; + case SS_HANDSHAKE_RECV: + return_val = "handshake (receiving)"; + break; + case SS_VOTING: + return_val = "voting"; + break; + case SS_WAIT_VERDICT: + return_val = "wait-for-verdict"; + break; + case SS_SEND_ELECTED_FLUSH: + return_val = "send-announcement-flush"; + break; + case SS_IDLE: + return_val = "idle"; + break; + case SS_ACTIVE: + return_val = "active"; + break; + } + + Assert(return_val != NULL); + + return return_val; +} + +/* Asserts that the provided events are expected for given safekeeper's state */ +void +AssertEventsOkForState(uint32 events, Safekeeper* sk) +{ + uint32 expected = SafekeeperStateDesiredEvents(sk->state); + + /* The events are in-line with what we're expecting, under two conditions: + * (a) if we aren't expecting anything, `events` has no read- or + * write-ready component. + * (b) if we are expecting something, there's overlap + * (i.e. `events & expected != 0`) + */ + bool events_ok_for_state; /* long name so the `Assert` is more clear later */ + + if (expected == WL_NO_EVENTS) + events_ok_for_state = ((events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) == 0); + else + events_ok_for_state = ((events & expected) != 0); + + if (!events_ok_for_state) + { + /* To give a descriptive message in the case of failure, we use elog and + * then an assertion that's guaranteed to fail. */ + elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", + FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); + Assert(events_ok_for_state); + } +} + +/* Returns the set of events a safekeeper in this state should be waiting on + * + * This will return WL_NO_EVENTS (= 0) for some events. */ +uint32 +SafekeeperStateDesiredEvents(SafekeeperState state) +{ + uint32 result = WL_NO_EVENTS; + + /* If the state doesn't have a modifier, we can check the base state */ + switch (state) + { + /* Connecting states say what they want in the name */ + case SS_CONNECTING_READ: + result = WL_SOCKET_READABLE; + break; + case SS_CONNECTING_WRITE: + result = WL_SOCKET_WRITEABLE; + break; + + /* Reading states need the socket to be read-ready to continue */ + case SS_WAIT_EXEC_RESULT: + case SS_HANDSHAKE_RECV: + case SS_WAIT_VERDICT: + result = WL_SOCKET_READABLE; + break; + + /* Idle states use read-readiness as a sign that the connection has been + * disconnected. */ + case SS_VOTING: + case SS_IDLE: + result = WL_SOCKET_READABLE; + break; + + /* + * Flush states require write-ready for flushing. + * Active state does both reading and writing. + * + * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should + * check sk->flushWrite here to set WL_SOCKET_WRITEABLE. + */ + case SS_SEND_ELECTED_FLUSH: + case SS_ACTIVE: + result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + break; + + /* The offline state expects no events. 
*/ + case SS_OFFLINE: + result = WL_NO_EVENTS; + break; + + default: + Assert(false); + break; + } + + return result; +} + +/* Returns a human-readable string corresponding to the event set + * + * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the + * returned string may be meaingless. + * + * The string should not be freed. It should also not be expected to remain the same between + * function calls. */ +char* +FormatEvents(uint32 events) +{ + static char return_str[8]; + + /* Helper variable to check if there's extra bits */ + uint32 all_flags = WL_LATCH_SET + | WL_SOCKET_READABLE + | WL_SOCKET_WRITEABLE + | WL_TIMEOUT + | WL_POSTMASTER_DEATH + | WL_EXIT_ON_PM_DEATH + | WL_SOCKET_CONNECTED; + + /* The formatting here isn't supposed to be *particularly* useful -- it's just to give an + * sense of what events have been triggered without needing to remember your powers of two. */ + + return_str[0] = (events & WL_LATCH_SET ) ? 'L' : '_'; + return_str[1] = (events & WL_SOCKET_READABLE ) ? 'R' : '_'; + return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_'; + return_str[3] = (events & WL_TIMEOUT ) ? 'T' : '_'; + return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_'; + return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_'; + return_str[5] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_'; + + if (events & (~all_flags)) + { + elog(WARNING, "Event formatting found unexpected component %d", + events & (~all_flags)); + return_str[6] = '*'; + return_str[7] = '\0'; + } + else + return_str[6] = '\0'; + + return (char *) &return_str; +} + +/* + * Convert a character which represents a hexadecimal digit to an integer. + * + * Returns -1 if the character is not a hexadecimal digit. + */ +static int +HexDecodeChar(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + + return -1; +} + +/* + * Decode a hex string into a byte string, 2 hex chars per byte. + * + * Returns false if invalid characters are encountered; otherwise true. 
+ */ +bool +HexDecodeString(uint8 *result, char *input, int nbytes) +{ + int i; + + for (i = 0; i < nbytes; ++i) + { + int n1 = HexDecodeChar(input[i * 2]); + int n2 = HexDecodeChar(input[i * 2 + 1]); + + if (n1 < 0 || n2 < 0) + return false; + result[i] = n1 * 16 + n2; + } + + return true; +} + +/* -------------------------------- + * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint32 +pq_getmsgint32_le(StringInfo msg) +{ + uint32 n32; + + pq_copymsgbytes(msg, (char *) &n32, sizeof(n32)); + + return n32; +} + +/* -------------------------------- + * pq_getmsgint64 - get a binary 8-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint64 +pq_getmsgint64_le(StringInfo msg) +{ + uint64 n64; + + pq_copymsgbytes(msg, (char *) &n64, sizeof(n64)); + + return n64; +} + +/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */ +void +pq_sendint32_le(StringInfo buf, uint32 i) +{ + enlargeStringInfo(buf, sizeof(uint32)); + memcpy(buf->data + buf->len, &i, sizeof(uint32)); + buf->len += sizeof(uint32); +} + +/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */ +void +pq_sendint64_le(StringInfo buf, uint64 i) +{ + enlargeStringInfo(buf, sizeof(uint64)); + memcpy(buf->data + buf->len, &i, sizeof(uint64)); + buf->len += sizeof(uint64); +} + +/* + * Write XLOG data to disk. + */ +void +XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) +{ + int startoff; + int byteswritten; + + while (nbytes > 0) + { + int segbytes; + + /* Close the current segment if it's completed */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + XLogWalPropClose(recptr); + + if (walpropFile < 0) + { + bool use_existent = true; + + /* Create/use new log file */ + XLByteToSeg(recptr, walpropSegNo, wal_segment_size); + walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); + walpropFileTLI = ThisTimeLineID; + } + + /* Calculate the start offset of the received logs */ + startoff = XLogSegmentOffset(recptr, wal_segment_size); + + if (startoff + nbytes > wal_segment_size) + segbytes = wal_segment_size - startoff; + else + segbytes = nbytes; + + /* OK to write the logs */ + errno = 0; + + byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); + if (byteswritten <= 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + /* if write didn't set errno, assume no disk space */ + if (errno == 0) + errno = ENOSPC; + + save_errno = errno; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to log segment %s " + "at offset %u, length %lu: %m", + xlogfname, startoff, (unsigned long) segbytes))); + } + + /* Update state for write */ + recptr += byteswritten; + + nbytes -= byteswritten; + buf += byteswritten; + } + + /* + * Close the current segment if it's fully written up in the last cycle of + * the loop. + */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + { + XLogWalPropClose(recptr); + } +} + +/* + * Close the current segment. 
+ */ +void +XLogWalPropClose(XLogRecPtr recptr) +{ + Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)); + + if (close(walpropFile) != 0) + { + char xlogfname[MAXFNAMELEN]; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close log segment %s: %m", + xlogfname))); + } + + walpropFile = -1; +} + +/* START of cloned functions from walsender.c */ + +/* + * Handle START_REPLICATION command. + * + * At the moment, this never returns, but an ereport(ERROR) will take us back + * to the main loop. + */ +void +StartProposerReplication(StartReplicationCmd *cmd) +{ + XLogRecPtr FlushPtr; + + if (ThisTimeLineID == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); + + /* create xlogreader for physical replication */ + xlogreader = + XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.segment_open = WalSndSegmentOpen, + .segment_close = wal_segment_close), + NULL); + + if (!xlogreader) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + /* + * We assume here that we're logging enough information in the WAL for + * log-shipping, since this is checked in PostmasterMain(). + * + * NOTE: wal_level can only change at shutdown, so in most cases it is + * difficult for there to be WAL data that we can still see that was + * written at wal_level='minimal'. + */ + + if (cmd->slotname) + { + ReplicationSlotAcquire(cmd->slotname, true); + if (SlotIsLogical(MyReplicationSlot)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot use a logical replication slot for physical replication"))); + + /* + * We don't need to verify the slot's restart_lsn here; instead we + * rely on the caller requesting the starting point to use. If the + * WAL segment doesn't exist, we'll fail later. + */ + } + + /* + * Select the timeline. If it was given explicitly by the client, use + * that. Otherwise use the timeline of the last replayed record, which is + * kept in ThisTimeLineID. + * + * Neon doesn't currently use PG Timelines, but it may in the future, so + * we keep this code around to lighten the load for when we need it. + */ + if (am_cascading_walsender) + { + /* this also updates ThisTimeLineID */ + FlushPtr = GetStandbyFlushRecPtr(); + } + else + FlushPtr = GetFlushRecPtr(); + + if (cmd->timeline != 0) + { + XLogRecPtr switchpoint; + + sendTimeLine = cmd->timeline; + if (sendTimeLine == ThisTimeLineID) + { + sendTimeLineIsHistoric = false; + sendTimeLineValidUpto = InvalidXLogRecPtr; + } + else + { + List *timeLineHistory; + + sendTimeLineIsHistoric = true; + + /* + * Check that the timeline the client requested exists, and the + * requested start location is on that timeline. + */ + timeLineHistory = readTimeLineHistory(ThisTimeLineID); + switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory, + &sendTimeLineNextTLI); + list_free_deep(timeLineHistory); + + /* + * Found the requested timeline in the history. Check that + * requested startpoint is on that timeline in our history. + * + * This is quite loose on purpose. We only check that we didn't + * fork off the requested timeline before the switchpoint. We + * don't check that we switched *to* it before the requested + * starting point. 
This is because the client can legitimately + * request to start replication from the beginning of the WAL + * segment that contains switchpoint, but on the new timeline, so + * that it doesn't end up with a partial segment. If you ask for + * too old a starting point, you'll get an error later when we + * fail to find the requested WAL segment in pg_wal. + * + * XXX: we could be more strict here and only allow a startpoint + * that's older than the switchpoint, if it's still in the same + * WAL segment. + */ + if (!XLogRecPtrIsInvalid(switchpoint) && + switchpoint < cmd->startpoint) + { + ereport(ERROR, + (errmsg("requested starting point %X/%X on timeline %u is not in this server's history", + LSN_FORMAT_ARGS(cmd->startpoint), + cmd->timeline), + errdetail("This server's history forked from timeline %u at %X/%X.", + cmd->timeline, + LSN_FORMAT_ARGS(switchpoint)))); + } + sendTimeLineValidUpto = switchpoint; + } + } + else + { + sendTimeLine = ThisTimeLineID; + sendTimeLineValidUpto = InvalidXLogRecPtr; + sendTimeLineIsHistoric = false; + } + + streamingDoneSending = streamingDoneReceiving = false; + + /* If there is nothing to stream, don't even enter COPY mode */ + if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto) + { + /* + * When we first start replication the standby will be behind the + * primary. For some applications, for example synchronous + * replication, it is important to have a clear state for this initial + * catchup mode, so we can trigger actions when we change streaming + * state later. We may stay in this state for a long time, which is + * exactly why we want to be able to monitor whether or not we are + * still here. + */ + WalSndSetState(WALSNDSTATE_CATCHUP); + + /* + * Don't allow a request to stream from a future point in WAL that + * hasn't been flushed to disk in this server yet. + */ + if (FlushPtr < cmd->startpoint) + { + ereport(ERROR, + (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", + LSN_FORMAT_ARGS(cmd->startpoint), + LSN_FORMAT_ARGS(FlushPtr)))); + } + + /* Start streaming from the requested point */ + sentPtr = cmd->startpoint; + + /* Initialize shared memory status, too */ + SpinLockAcquire(&MyWalSnd->mutex); + MyWalSnd->sentPtr = sentPtr; + SpinLockRelease(&MyWalSnd->mutex); + + SyncRepInitConfig(); + + /* Main loop of walsender */ + replication_active = true; + + WalSndLoop(XLogSendPhysical); + + replication_active = false; + if (got_STOPPING) + proc_exit(0); + WalSndSetState(WALSNDSTATE_STARTUP); + + Assert(streamingDoneSending && streamingDoneReceiving); + } + + if (cmd->slotname) + ReplicationSlotRelease(); + + /* + * Copy is finished now. Send a single-row result set indicating the next + * timeline. + */ + if (sendTimeLineIsHistoric) + { + char startpos_str[8 + 1 + 8 + 1]; + DestReceiver *dest; + TupOutputState *tstate; + TupleDesc tupdesc; + Datum values[2]; + bool nulls[2]; + + snprintf(startpos_str, sizeof(startpos_str), "%X/%X", + LSN_FORMAT_ARGS(sendTimeLineValidUpto)); + + dest = CreateDestReceiver(DestRemoteSimple); + MemSet(nulls, false, sizeof(nulls)); + + /* + * Need a tuple descriptor representing two columns. int8 may seem + * like a surprising data type for this, but in theory int4 would not + * be wide enough for this, as TimeLineID is unsigned. 
+ */ + tupdesc = CreateTemplateTupleDesc(2); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli", + INT8OID, -1, 0); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos", + TEXTOID, -1, 0); + + /* prepare for projection of tuple */ + tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); + + values[0] = Int64GetDatum((int64) sendTimeLineNextTLI); + values[1] = CStringGetTextDatum(startpos_str); + + /* send it to dest */ + do_tup_output(tstate, values, nulls); + + end_tup_output(tstate); + } + + /* Send CommandComplete message */ + EndReplicationCommand("START_STREAMING"); +} + +/* + * Returns the latest point in WAL that has been safely flushed to disk, and + * can be sent to the standby. This should only be called when in recovery, + * ie. we're streaming to a cascaded standby. + * + * As a side-effect, ThisTimeLineID is updated to the TLI of the last + * replayed WAL record. + */ +static XLogRecPtr +GetStandbyFlushRecPtr(void) +{ + XLogRecPtr replayPtr; + TimeLineID replayTLI; + XLogRecPtr receivePtr; + TimeLineID receiveTLI; + XLogRecPtr result; + + /* + * We can safely send what's already been replayed. Also, if walreceiver + * is streaming WAL from the same timeline, we can send anything that it + * has streamed, but hasn't been replayed yet. + */ + + receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI); + replayPtr = GetXLogReplayRecPtr(&replayTLI); + + ThisTimeLineID = replayTLI; + + result = replayPtr; + if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr) + result = receivePtr; + + return result; +} + +/* XLogReaderRoutine->segment_open callback */ +static void +WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, + TimeLineID *tli_p) +{ + char path[MAXPGPATH]; + + /*------- + * When reading from a historic timeline, and there is a timeline switch + * within this segment, read from the WAL segment belonging to the new + * timeline. + * + * For example, imagine that this server is currently on timeline 5, and + * we're streaming timeline 4. The switch from timeline 4 to 5 happened at + * 0/13002088. In pg_wal, we have these files: + * + * ... + * 000000040000000000000012 + * 000000040000000000000013 + * 000000050000000000000013 + * 000000050000000000000014 + * ... + * + * In this situation, when requested to send the WAL from segment 0x13, on + * timeline 4, we read the WAL from file 000000050000000000000013. Archive + * recovery prefers files from newer timelines, so if the segment was + * restored from the archive on this server, the file belonging to the old + * timeline, 000000040000000000000013, might not exist. Their contents are + * equal up to the switchpoint, because at a timeline switch, the used + * portion of the old segment is copied to the new file. ------- + */ + *tli_p = sendTimeLine; + if (sendTimeLineIsHistoric) + { + XLogSegNo endSegNo; + + XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize); + if (nextSegNo == endSegNo) + *tli_p = sendTimeLineNextTLI; + } + + XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize); + state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (state->seg.ws_file >= 0) + return; + + /* + * If the file is not found, assume it's because the standby asked for a + * too old WAL segment that has already been removed or recycled. 
+ */ + if (errno == ENOENT) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("requested WAL segment %s has already been removed", + xlogfname))); + } + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); +} + + +/* Main loop of walsender process that streams the WAL over Copy messages. */ +static void +WalSndLoop(WalSndSendDataCallback send_data) +{ + /* + * Initialize the last reply timestamp. That enables timeout processing + * from hereon. + */ + last_reply_timestamp = GetCurrentTimestamp(); + waiting_for_ping_response = false; + + /* + * Loop until we reach the end of this timeline or the client requests to + * stop streaming. + */ + for (;;) + { + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Process any requests or signals received recently */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + SyncRepInitConfig(); + } + + /* always true */ + if (am_wal_proposer) + { + send_data(); + if (WalSndCaughtUp) + { + if (MyWalSnd->state == WALSNDSTATE_CATCHUP) + WalSndSetState(WALSNDSTATE_STREAMING); + WalProposerPoll(); + WalSndCaughtUp = false; + } + continue; + } + } +} + +/* + * Send out the WAL in its normal physical/stored form. + * + * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk, + * but not yet sent to the client, and buffer it in the libpq output + * buffer. + * + * If there is no unsent WAL remaining, WalSndCaughtUp is set to true, + * otherwise WalSndCaughtUp is set to false. + */ +static void +XLogSendPhysical(void) +{ + XLogRecPtr SendRqstPtr; + XLogRecPtr startptr; + XLogRecPtr endptr; + Size nbytes PG_USED_FOR_ASSERTS_ONLY; + + /* If requested switch the WAL sender to the stopping state. */ + if (got_STOPPING) + WalSndSetState(WALSNDSTATE_STOPPING); + + if (streamingDoneSending) + { + WalSndCaughtUp = true; + return; + } + + /* Figure out how far we can safely send the WAL. */ + if (sendTimeLineIsHistoric) + { + /* + * Streaming an old timeline that's in this server's history, but is + * not the one we're currently inserting or replaying. It can be + * streamed up to the point where we switched off that timeline. + */ + SendRqstPtr = sendTimeLineValidUpto; + } + else if (am_cascading_walsender) + { + /* + * Streaming the latest timeline on a standby. + * + * Attempt to send all WAL that has already been replayed, so that we + * know it's valid. If we're receiving WAL through streaming + * replication, it's also OK to send any WAL that has been received + * but not replayed. + * + * The timeline we're recovering from can change, or we can be + * promoted. In either case, the current timeline becomes historic. We + * need to detect that so that we don't try to stream past the point + * where we switched to another timeline. We check for promotion or + * timeline switch after calculating FlushPtr, to avoid a race + * condition: if the timeline becomes historic just after we checked + * that it was still current, it's still be OK to stream it up to the + * FlushPtr that was calculated before it became historic. + */ + bool becameHistoric = false; + + SendRqstPtr = GetStandbyFlushRecPtr(); + + if (!RecoveryInProgress()) + { + /* + * We have been promoted. RecoveryInProgress() updated + * ThisTimeLineID to the new current timeline. 
+ */ + am_cascading_walsender = false; + becameHistoric = true; + } + else + { + /* + * Still a cascading standby. But is the timeline we're sending + * still the one recovery is recovering from? ThisTimeLineID was + * updated by the GetStandbyFlushRecPtr() call above. + */ + if (sendTimeLine != ThisTimeLineID) + becameHistoric = true; + } + + if (becameHistoric) + { + /* + * The timeline we were sending has become historic. Read the + * timeline history file of the new timeline to see where exactly + * we forked off from the timeline we were sending. + */ + List *history; + + history = readTimeLineHistory(ThisTimeLineID); + sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI); + + Assert(sendTimeLine < sendTimeLineNextTLI); + list_free_deep(history); + + sendTimeLineIsHistoric = true; + + SendRqstPtr = sendTimeLineValidUpto; + } + } + else + { + /* + * Streaming the current timeline on a primary. + * + * Attempt to send all data that's already been written out and + * fsync'd to disk. We cannot go further than what's been written out + * given the current implementation of WALRead(). And in any case + * it's unsafe to send WAL that is not securely down to disk on the + * primary: if the primary subsequently crashes and restarts, standbys + * must not have applied any WAL that got lost on the primary. + */ + SendRqstPtr = GetFlushRecPtr(); + } + + /* + * Record the current system time as an approximation of the time at which + * this WAL location was written for the purposes of lag tracking. + * + * In theory we could make XLogFlush() record a time in shmem whenever WAL + * is flushed and we could get that time as well as the LSN when we call + * GetFlushRecPtr() above (and likewise for the cascading standby + * equivalent), but rather than putting any new code into the hot WAL path + * it seems good enough to capture the time here. We should reach this + * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that + * may take some time, we read the WAL flush pointer and take the time + * very close to together here so that we'll get a later position if it is + * still moving. + * + * Because LagTrackerWrite ignores samples when the LSN hasn't advanced, + * this gives us a cheap approximation for the WAL flush time for this + * LSN. + * + * Note that the LSN is not necessarily the LSN for the data contained in + * the present message; it's the end of the WAL, which might be further + * ahead. All the lag tracking machinery cares about is finding out when + * that arbitrary LSN is eventually reported as written, flushed and + * applied, so that it can measure the elapsed time. + */ + LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp()); + + /* + * If this is a historic timeline and we've reached the point where we + * forked to the next timeline, stop streaming. + * + * Note: We might already have sent WAL > sendTimeLineValidUpto. The + * startup process will normally replay all WAL that has been received + * from the primary, before promoting, but if the WAL streaming is + * terminated at a WAL page boundary, the valid portion of the timeline + * might end in the middle of a WAL record. We might've already sent the + * first half of that partial WAL record to the cascading standby, so that + * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't + * replay the partial WAL record either, so it can still follow our + * timeline switch. 
+ */ + if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr) + { + /* close the current file. */ + if (xlogreader->seg.ws_file >= 0) + wal_segment_close(xlogreader); + + /* Send CopyDone */ + pq_putmessage_noblock('c', NULL, 0); + streamingDoneSending = true; + + WalSndCaughtUp = true; + + elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)", + LSN_FORMAT_ARGS(sendTimeLineValidUpto), + LSN_FORMAT_ARGS(sentPtr)); + return; + } + + /* Do we have any work to do? */ + Assert(sentPtr <= SendRqstPtr); + if (SendRqstPtr <= sentPtr) + { + WalSndCaughtUp = true; + return; + } + + /* + * Figure out how much to send in one message. If there's no more than + * MAX_SEND_SIZE bytes to send, send everything. Otherwise send + * MAX_SEND_SIZE bytes, but round back to logfile or page boundary. + * + * The rounding is not only for performance reasons. Walreceiver relies on + * the fact that we never split a WAL record across two messages. Since a + * long WAL record is split at page boundary into continuation records, + * page boundary is always a safe cut-off point. We also assume that + * SendRqstPtr never points to the middle of a WAL record. + */ + startptr = sentPtr; + endptr = startptr; + endptr += MAX_SEND_SIZE; + + /* if we went beyond SendRqstPtr, back off */ + if (SendRqstPtr <= endptr) + { + endptr = SendRqstPtr; + if (sendTimeLineIsHistoric) + WalSndCaughtUp = false; + else + WalSndCaughtUp = true; + } + else + { + /* round down to page boundary. */ + endptr -= (endptr % XLOG_BLCKSZ); + WalSndCaughtUp = false; + } + + nbytes = endptr - startptr; + Assert(nbytes <= MAX_SEND_SIZE); + + /* always true */ + if (am_wal_proposer) + { + WalProposerBroadcast(startptr, endptr); + } + else + { + /* code removed for brevity */ + } + sentPtr = endptr; + + /* Update shared memory status */ + { + WalSnd *walsnd = MyWalSnd; + + SpinLockAcquire(&walsnd->mutex); + walsnd->sentPtr = sentPtr; + SpinLockRelease(&walsnd->mutex); + } + + /* Report progress of XLOG streaming in PS display */ + if (update_process_title) + { + char activitymsg[50]; + + snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", + LSN_FORMAT_ARGS(sentPtr)); + set_ps_display(activitymsg); + } +} + diff --git a/pgxn/neon/walproposer_utils.h b/pgxn/neon/walproposer_utils.h new file mode 100644 index 0000000000..4771d3ff82 --- /dev/null +++ b/pgxn/neon/walproposer_utils.h @@ -0,0 +1,19 @@ +#ifndef __NEON_WALPROPOSER_UTILS_H__ +#define __NEON_WALPROPOSER_UTILS_H__ + +#include "walproposer.h" + +int CompareLsn(const void *a, const void *b); +char* FormatSafekeeperState(SafekeeperState state); +void AssertEventsOkForState(uint32 events, Safekeeper* sk); +uint32 SafekeeperStateDesiredEvents(SafekeeperState state); +char* FormatEvents(uint32 events); +bool HexDecodeString(uint8 *result, char *input, int nbytes); +uint32 pq_getmsgint32_le(StringInfo msg); +uint64 pq_getmsgint64_le(StringInfo msg); +void pq_sendint32_le(StringInfo buf, uint32 i); +void pq_sendint64_le(StringInfo buf, uint64 i); +void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); +void XLogWalPropClose(XLogRecPtr recptr); + +#endif /* __NEON_WALPROPOSER_UTILS_H__ */ diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile new file mode 100644 index 0000000000..9c774ec185 --- /dev/null +++ b/pgxn/neon_test_utils/Makefile @@ -0,0 +1,15 @@ +# pgxs/neon_test_utils/Makefile + + +MODULE_big = neon_test_utils +OBJS = \ + $(WIN32RES) \ + neontest.o + +EXTENSION = neon_test_utils +DATA = neon_test_utils--1.0.sql +PGFILEDESC 
= "neon_test_utils - helpers for neon testing and debugging" + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.0.sql new file mode 100644 index 0000000000..402981a9a6 --- /dev/null +++ b/pgxn/neon_test_utils/neon_test_utils--1.0.sql @@ -0,0 +1,29 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION neon_test_utils" to load this file. \quit + +CREATE FUNCTION test_consume_xids(nxids int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_xids' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION clear_buffer_cache() +RETURNS VOID +AS 'MODULE_PATHNAME', 'clear_buffer_cache' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION neon_xlogflush(lsn pg_lsn) +RETURNS VOID +AS 'MODULE_PATHNAME', 'neon_xlogflush' +LANGUAGE C PARALLEL UNSAFE; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control new file mode 100644 index 0000000000..94e6720503 --- /dev/null +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -0,0 +1,5 @@ +# neon_test_utils extension +comment = 'helpers for neon testing and debugging' +default_version = '1.0' +module_pathname = '$libdir/neon_test_utils' +relocatable = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c new file mode 100644 index 0000000000..3e30065cd3 --- /dev/null +++ b/pgxn/neon_test_utils/neontest.c @@ -0,0 +1,304 @@ +/*------------------------------------------------------------------------- + * + * neontest.c + * Helpers for neon testing and debugging + * + * IDENTIFICATION + * contrib/neon_test_utils/neontest.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relation.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/namespace.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "utils/builtins.h" +#include "utils/pg_lsn.h" +#include "utils/rel.h" +#include "utils/varlena.h" +#include "../neon/pagestore_client.h" + +PG_MODULE_MAGIC; + +extern void _PG_init(void); + +PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(clear_buffer_cache); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); +PG_FUNCTION_INFO_V1(neon_xlogflush); + +/* + * Linkage to functions in zenith module. + * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c + */ +typedef void (*zenith_read_at_lsn_type)(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); + +static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; + +/* + * Module initialize function: fetch function pointers for cross-module calls. 
+ */ +void +_PG_init(void) +{ + /* Asserts verify that typedefs above match original declarations */ + AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type); + zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type) + load_external_function("$libdir/neon", "zenith_read_at_lsn", + true, NULL); +} + +#define zenith_read_at_lsn zenith_read_at_lsn_ptr + +/* + * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. + */ +Datum +test_consume_xids(PG_FUNCTION_ARGS) +{ + int32 nxids = PG_GETARG_INT32(0); + TransactionId topxid; + FullTransactionId fullxid; + TransactionId xid; + TransactionId targetxid; + + /* make sure we have a top-XID first */ + topxid = GetTopTransactionId(); + + xid = ReadNextTransactionId(); + + targetxid = xid + nxids; + while (targetxid < FirstNormalTransactionId) + targetxid++; + + while (TransactionIdPrecedes(xid, targetxid)) + { + fullxid = GetNewTransactionId(true); + xid = XidFromFullTransactionId(fullxid); + elog(DEBUG1, "topxid: %u xid: %u", topxid, xid); + } + + PG_RETURN_VOID(); +} + +/* + * Flush the buffer cache, evicting all pages that are not currently pinned. + */ +Datum +clear_buffer_cache(PG_FUNCTION_ARGS) +{ + bool save_zenith_test_evict; + + /* + * Temporarily set the zenith_test_evict GUC, so that when we pin and + * unpin a buffer, the buffer is evicted. We use that hack to evict all + * buffers, as there is no explicit "evict this buffer" function in the + * buffer manager. + */ + save_zenith_test_evict = zenith_test_evict; + zenith_test_evict = true; + PG_TRY(); + { + /* Scan through all the buffers */ + for (int i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr; + uint32 buf_state; + Buffer bufferid; + bool isvalid; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blocknum; + + /* Peek into the buffer header to see what page it holds. */ + bufHdr = GetBufferDescriptor(i); + buf_state = LockBufHdr(bufHdr); + + if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) + isvalid = true; + else + isvalid = false; + bufferid = BufferDescriptorGetBuffer(bufHdr); + rnode = bufHdr->tag.rnode; + forknum = bufHdr->tag.forkNum; + blocknum = bufHdr->tag.blockNum; + + UnlockBufHdr(bufHdr, buf_state); + + /* + * Pin the buffer, and release it again. Because we have + * zenith_test_evict==true, this will evict the page from + * the buffer cache if no one else is holding a pin on it. + */ + if (isvalid) + { + if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid)) + ReleaseBuffer(bufferid); + } + } + } + PG_FINALLY(); + { + /* restore the GUC */ + zenith_test_evict = save_zenith_test_evict; + } + PG_END_TRY(); + + PG_RETURN_VOID(); +} + + +/* + * Reads the page from page server without buffer cache + * usage mimics get_raw_page() in pageinspect, but offers reading versions at specific LSN + * NULL read lsn will result in reading the latest version. + * + * Note: reading latest version will result in waiting for latest changes to reach the page server, + * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page + */ +Datum +get_raw_page_at_lsn(PG_FUNCTION_ARGS) +{ + bytea *raw_page; + ForkNumber forknum; + RangeVar *relrv; + Relation rel; + char *raw_page_data; + text *relname; + text *forkname; + uint32 blkno; + + bool request_latest = PG_ARGISNULL(3); + uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) + PG_RETURN_NULL(); + + relname = PG_GETARG_TEXT_PP(0); + forkname = PG_GETARG_TEXT_PP(1); + blkno = PG_GETARG_UINT32(2); + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to use raw page functions"))); + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); + + /* Check that this relation has storage */ + if (rel->rd_rel->relkind == RELKIND_VIEW) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from view \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from composite type \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from foreign table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned index \"%s\"", + RelationGetRelationName(rel)))); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. + */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + + forknum = forkname_to_number(text_to_cstring(forkname)); + + /* Initialize buffer to copy to */ + raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); + raw_page_data = VARDATA(raw_page); + + zenith_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); + + relation_close(rel, AccessShareLock); + + PG_RETURN_BYTEA_P(raw_page); +} + +/* + * Another option to read a relation page from page server without cache + * this version doesn't validate input and allows reading blocks of dropped relations + * + * Note: reading latest version will result in waiting for latest changes to reach the page server, + * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page + */ +Datum +get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) +{ + char *raw_page_data; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to use raw page functions"))); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || + PG_ARGISNULL(3) || PG_ARGISNULL(4)) + PG_RETURN_NULL(); + + { + RelFileNode rnode = { + .spcNode = PG_GETARG_OID(0), + .dbNode = PG_GETARG_OID(1), + .relNode = PG_GETARG_OID(2) + }; + + ForkNumber forknum = PG_GETARG_UINT32(3); + + uint32 blkno = PG_GETARG_UINT32(4); + bool request_latest = PG_ARGISNULL(5); + uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(5);
+
+
+ /* Initialize buffer to copy to */
+ bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
+ SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+ raw_page_data = VARDATA(raw_page);
+
+ zenith_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data);
+ PG_RETURN_BYTEA_P(raw_page);
+ }
+}
+
+/*
+ * Directly calls XLogFlush(lsn) to flush WAL buffers.
+ */
+Datum
+neon_xlogflush(PG_FUNCTION_ARGS)
+{
+ XLogRecPtr lsn = PG_GETARG_LSN(0);
+ XLogFlush(lsn);
+ PG_RETURN_VOID();
+}
diff --git a/vendor/postgres b/vendor/postgres
index a479855158..8f132d968c 160000
--- a/vendor/postgres
+++ b/vendor/postgres
@@ -1 +1 @@
-Subproject commit a4798551587fb5a52740687a341af83b28733dc6
+Subproject commit 8f132d968cd44068fc6f72e4047f7d3d6320f4bb

From a5ca6a9d2b69a8d4a67900901710c21167451a54 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov
Date: Fri, 26 Aug 2022 13:59:04 +0200
Subject: [PATCH 0689/1022] Move legacy version of compute-node Dockerfile from
 postgres repo (#2339)

It's used by e2e CI. Building Dockerfile.compute-node will take
an unreasonable amount of time without v2 runners.

TODO: remove once cloud repo CI is moved to v2 runners.
---
 Dockerfile.compute-node.legacy | 87 ++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 Dockerfile.compute-node.legacy

diff --git a/Dockerfile.compute-node.legacy b/Dockerfile.compute-node.legacy
new file mode 100644
index 0000000000..ba34e2486f
--- /dev/null
+++ b/Dockerfile.compute-node.legacy
@@ -0,0 +1,87 @@
+#
+# Legacy version of the Dockerfile for the compute node.
+# Used by e2e CI. Building Dockerfile.compute-node will take
+# an unreasonable amount of time without v2 runners.
+#
+# TODO: remove once cloud repo CI is moved to v2 runners.
+#
+
+
+# Allow specifying different compute-tools tag and image repo, so we are
+# able to use different images
+ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG IMAGE=compute-tools
+ARG TAG=latest
+
+#
+# Image with pre-built tools
+#
+FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps
+# Only to get ready compute_ctl binary as a dependency
+
+#
+# Image with Postgres build deps
+#
+FROM debian:buster-slim AS build-deps
+
+RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
+ libcurl4-openssl-dev libossp-uuid-dev
+
+#
+# Image with built Postgres
+#
+FROM build-deps AS pg-build
+
+# Add user postgres
+RUN adduser postgres
+RUN mkdir /pg && chown postgres:postgres /pg
+
+# Copy source files
+COPY ./vendor/postgres /pg/
+COPY ./pgxn /pg/
+
+# Build and install Postgres locally
+RUN mkdir /pg/compute_build && cd /pg/compute_build && \
+ ../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \
+ # Install main binaries and contribs
+ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
+ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
+ # Install headers
+ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install
+
+# Install neon contrib
+RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install
+
+USER postgres
+WORKDIR /pg
+
+#
+# Final compute node image to be exported
+#
+FROM debian:buster-slim
+
+# libreadline-dev is required to run psql
+RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev
+
+# Add user postgres
+RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
+ echo "postgres:test_console_pass" | chpasswd && \
+ mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
+ chown -R postgres:postgres /var/db/postgres && \
+ chmod 0750 /var/db/postgres/compute
+
+# Copy ready Postgres binaries
+COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local
+
+# Copy binaries from compute-tools
+COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl
+
+# XXX: temporary symlink for compatibility with old control-plane
+RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
+
+# Add postgres shared objects to the search path
+RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
+
+USER postgres
+
+ENTRYPOINT ["/usr/local/bin/compute_ctl"]

From a56ae15edf448534d00a6a21504cbc556d927cfd Mon Sep 17 00:00:00 2001
From: Kirill Bulatov
Date: Fri, 26 Aug 2022 15:40:22 +0300
Subject: [PATCH 0690/1022] Lock cargo dependencies during CI builds

---
 .dockerignore | 1 +
 .github/workflows/build_and_test.yml | 4 ++--
 .github/workflows/codestyle.yml | 2 +-
 Dockerfile | 2 +-
 Dockerfile.compute-node | 10 +++++-----
 Dockerfile.compute-tools | 2 +-
 run_clippy.sh | 4 ++--
 7 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/.dockerignore b/.dockerignore
index 0667d8870e..8a3d32e6d2 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -3,6 +3,7 @@
 **/.pytest_cache

 .git
+.github
 target
 tmp_check
 tmp_install
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 6e570b22d4..bf6eb69930 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -95,11 +95,11 @@ jobs:
 if [[ $BUILD_TYPE == "debug" ]]; then
 cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FEATURES="" - CARGO_FLAGS="" + CARGO_FLAGS="--locked" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" CARGO_FEATURES="--features profiling" - CARGO_FLAGS="--release $CARGO_FEATURES" + CARGO_FLAGS="--locked --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index eddfee88fc..b64ea8a01f 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -110,7 +110,7 @@ jobs: run: ./run_clippy.sh - name: Ensure all project builds - run: cargo build --all --all-targets + run: cargo build --locked --all --all-targets check-codestyle-python: runs-on: [ self-hosted, Linux, k8s-runner ] diff --git a/Dockerfile b/Dockerfile index 77598fd086..2dbe71f1ad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -40,7 +40,7 @@ COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ - && mold -run cargo build --release \ +&& mold -run cargo build --locked --release \ && cachepot -s # Build final image diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 4527fb9ece..057441e730 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -69,9 +69,9 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ # Compile and run the Neon-specific `compute_ctl` binary FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools USER nonroot -COPY --chown=nonroot compute_tools compute_tools -COPY --chown=nonroot workspace_hack workspace_hack -RUN cd compute_tools && cargo build --release +# Copy entire project to get Cargo.* files with proper dependencies for the whole project +COPY --chown=nonroot . . +RUN cd compute_tools && cargo build --locked --release # Put it all together into the final image FROM debian:bullseye-slim @@ -86,7 +86,7 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ # TODO: Check if we can make the extension setup more modular versus a linear build # currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc# COPY --from=neon-pg-ext-build --chown=postgres /usr/local/pgsql /usr/local -COPY --from=compute-tools --chown=postgres /home/nonroot/compute_tools/target/release/compute_ctl /usr/local/bin/compute_ctl +COPY --from=compute-tools --chown=postgres /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl RUN apt update && \ apt install -y libreadline-dev libossp-uuid-dev gdal-bin libgdal-dev libprotobuf-c-dev && \ @@ -103,4 +103,4 @@ RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.lis RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl USER postgres -ENTRYPOINT ["/usr/local/bin/compute_ctl"] \ No newline at end of file +ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 47c408bbf2..8231cd0ebb 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -20,7 +20,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev COPY . . 
RUN set -e \ - && mold -run cargo build -p compute_tools --release \ + && mold -run cargo build -p compute_tools --locked --release \ && cachepot -s # Final image that only has one binary diff --git a/run_clippy.sh b/run_clippy.sh index 13af3fd2c5..9feb8de4ea 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -13,10 +13,10 @@ # avoid running regular linting script that checks every feature. if [[ "$OSTYPE" == "darwin"* ]]; then # no extra features to test currently, add more here when needed - cargo clippy --all --all-targets -- -A unknown_lints -D warnings + cargo clippy --locked --all --all-targets -- -A unknown_lints -D warnings else # * `-A unknown_lints` – do not warn about unknown lint suppressions # that people with newer toolchains might use # * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) - cargo clippy --all --all-targets --all-features -- -A unknown_lints -D warnings + cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings fi From 6d30e21a326ed3d323bd563cbdecb15ee1d95ec9 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 26 Aug 2022 20:42:32 +0300 Subject: [PATCH 0691/1022] Fix proxy tests (#2343) There might be different psql & locale configurations, therefore we should explicitly reset them to defaults. --- test_runner/batch_others/test_proxy.py | 17 +++-------------- test_runner/fixtures/neon_fixtures.py | 13 ++++++++----- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py index 4ffd458b22..1efb795140 100644 --- a/test_runner/batch_others/test_proxy.py +++ b/test_runner/batch_others/test_proxy.py @@ -115,33 +115,22 @@ async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProx Step 4. assert that select 1 has been executed correctly. """ - # Step 1. psql = PSQL( host=link_proxy.host, port=link_proxy.proxy_port, ) - proc = await psql.run("select 1") + proc = await psql.run("select 42") - # Step 2.1 uri_prefix = link_proxy.link_auth_uri_prefix line_str = await get_uri_line_from_process_welcome_notice(uri_prefix, proc) - # step 2.2 psql_session_id = get_session_id_from_uri_line(uri_prefix, line_str) log.info(f"Parsed psql_session_id='{psql_session_id}' from Neon welcome message.") - # Step 3. create_and_send_db_info(vanilla_pg, psql_session_id, link_proxy.mgmt_port) - # Step 4. - # Expecting proxy output:: - # b' ?column? \n' - # b'----------\n' - # b' 1\n' - # b'(1 row)\n' - out_bytes = await proc.stdout.read() - expected_out_bytes = b" ?column? \n----------\n 1\n(1 row)\n\n" - assert out_bytes == expected_out_bytes + out = (await proc.stdout.read()).decode("utf-8").strip() + assert out == "42" # Pass extra options to the server. 
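The fixture change in the next hunk is what makes the bare "42" comparison above possible: psqlrc files, locale settings and psql's default aligned output are all taken out of the equation. As a rough standalone sketch of the same idea (assuming only that a `psql` binary is on PATH and that the connection URL below is reachable; neither the URL nor the helper name comes from this patch):

import asyncio
import os

async def run_psql(database_url: str, query: str) -> str:
    # --no-psqlrc / --quiet / --tuples-only strip banners, headers and row-count
    # footers, and LC_ALL=C pins the locale, so stdout is just the raw value.
    args = ["psql", "--no-psqlrc", "--quiet", "--tuples-only",
            database_url, "--command", query]
    proc = await asyncio.create_subprocess_exec(
        *args,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
        env={"LC_ALL": "C", **os.environ},
    )
    out, _ = await proc.communicate()
    return out.decode("utf-8").strip()

# e.g. asyncio.run(run_psql("postgres://localhost:4432/main", "select 42")) == "42"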
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3af0cf4dcb..ad686e1fce 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1738,13 +1738,16 @@ class PSQL: self.database_url = f"postgres://{host}:{port}/main?options=project%3Dgeneric-project-name" async def run(self, query=None): - run_args = [self.path, self.database_url] - run_args += ["--command", query] if query is not None else [] + run_args = [self.path, "--no-psqlrc", "--quiet", "--tuples-only", self.database_url] + if query is not None: + run_args += ["--command", query] - cmd_line = subprocess.list2cmdline(run_args) - log.info(f"Run psql: {cmd_line}") + log.info(f"Run psql: {subprocess.list2cmdline(run_args)}") return await asyncio.create_subprocess_exec( - *run_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE + *run_args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env={"LC_ALL": "C", **os.environ}, # one locale to rule them all ) From c0a867d86fc76ae43d15b3111701cef046c3cc9a Mon Sep 17 00:00:00 2001 From: MMeent Date: Fri, 26 Aug 2022 19:58:08 +0200 Subject: [PATCH 0692/1022] Include neon extensions in the main neon images (#2341) Oversight in #2325 - apparently this area wasn't well-covered by tests in the neon repo. Fixes #2340 --- Dockerfile | 7 ++++--- vendor/postgres | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2dbe71f1ad..aa31e227da 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,12 +10,13 @@ ARG TAG=pinned FROM $REPOSITORY/$IMAGE:$TAG AS pg-build WORKDIR /home/nonroot -COPY vendor/postgres vendor/postgres -COPY Makefile Makefile +COPY --chown=nonroot vendor/postgres vendor/postgres +COPY --chown=nonroot pgxn pgxn +COPY --chown=nonroot Makefile Makefile ENV BUILD_TYPE release RUN set -e \ - && mold -run make -j $(nproc) -s postgres \ + && mold -run make -j $(nproc) -s neon-pg-ext \ && rm -rf tmp_install/build \ && tar -C tmp_install -czf /home/nonroot/postgres_install.tar.gz . diff --git a/vendor/postgres b/vendor/postgres index 8f132d968c..22d9ead36b 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 8f132d968cd44068fc6f72e4047f7d3d6320f4bb +Subproject commit 22d9ead36beeab6b6a99c64f9b0b1576927ad91b From ec20534173b06bc89e2c5ee604e63b256713d9ac Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 27 Aug 2022 17:54:56 +0300 Subject: [PATCH 0693/1022] Fix minor typos and leftover comments. 
--- pageserver/src/layered_repository/filename.rs | 2 +- pageserver/src/layered_repository/metadata.rs | 4 ++-- pageserver/src/layered_repository/timeline.rs | 2 +- pageserver/src/walreceiver/walreceiver_connection.rs | 2 +- test_runner/batch_others/test_remote_storage.py | 6 +++--- .../batch_others/test_tenants_with_remote_storage.py | 6 +++--- test_runner/batch_others/test_wal_acceptor.py | 12 ++++++------ 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pageserver/src/layered_repository/filename.rs b/pageserver/src/layered_repository/filename.rs index f088088277..5ebac2332d 100644 --- a/pageserver/src/layered_repository/filename.rs +++ b/pageserver/src/layered_repository/filename.rs @@ -10,7 +10,7 @@ use std::path::PathBuf; use utils::lsn::Lsn; -// Note: LayeredTimeline::load_layer_map() relies on this sort order +// Note: Timeline::load_layer_map() relies on this sort order #[derive(Debug, PartialEq, Eq, Clone)] pub struct DeltaFileName { pub key_range: Range, diff --git a/pageserver/src/layered_repository/metadata.rs b/pageserver/src/layered_repository/metadata.rs index 74679cb43a..f3ddd42e76 100644 --- a/pageserver/src/layered_repository/metadata.rs +++ b/pageserver/src/layered_repository/metadata.rs @@ -1,4 +1,4 @@ -//! Every image of a certain timeline from [`crate::layered_repository::LayeredRepository`] +//! Every image of a certain timeline from [`crate::layered_repository::Repository`] //! has a metadata that needs to be stored persistently. //! //! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of @@ -30,7 +30,7 @@ pub const METADATA_FILE_NAME: &str = "metadata"; /// Metadata stored on disk for each timeline /// -/// The fields correspond to the values we hold in memory, in LayeredTimeline. +/// The fields correspond to the values we hold in memory, in Timeline. #[derive(Debug, Clone, PartialEq, Eq)] pub struct TimelineMetadata { hdr: TimelineMetadataHeader, diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index a909dcb5a1..ecf9a87500 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -366,7 +366,7 @@ pub struct Timeline { /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other threads. /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], - /// and [`LayeredRepository::delete_timeline`]. + /// and [`Repository::delete_timeline`]. layer_removal_cs: Mutex<()>, // Needed to ensure that we can't create a branch at a point that was already garbage collected diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index b5f266614e..f816198eda 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -63,7 +63,7 @@ pub async fn handle_walreceiver_connection( ) .await .context("Timed out while waiting for walreceiver connection to open")? 
- .context("Failed to open walreceiver conection")?; + .context("Failed to open walreceiver connection")?; info!("connected!"); let mut connection_status = WalConnectionStatus { diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 1e4fdc8602..0015c75670 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -38,17 +38,17 @@ from fixtures.utils import lsn_from_hex, query_scalar # * queries the specific data, ensuring that it matches the one stored before # # The tests are done for all types of remote storage pageserver supports. -@pytest.mark.parametrize("remote_storatge_kind", available_remote_storages()) +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) def test_remote_storage_backup_and_restore( neon_env_builder: NeonEnvBuilder, - remote_storatge_kind: RemoteStorageKind, + remote_storage_kind: RemoteStorageKind, ): # Use this test to check more realistic SK ids: some etcd key parsing bugs were related, # and this test needs SK to write data to pageserver, so it will be visible neon_env_builder.safekeepers_id_start = 12 neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storatge_kind, + remote_storage_kind=remote_storage_kind, test_name="test_remote_storage_backup_and_restore", ) diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/batch_others/test_tenants_with_remote_storage.py index 7db58c2a70..083150e12a 100644 --- a/test_runner/batch_others/test_tenants_with_remote_storage.py +++ b/test_runner/batch_others/test_tenants_with_remote_storage.py @@ -53,10 +53,10 @@ async def all_tenants_workload(env: NeonEnv, tenants_pgs): await asyncio.gather(*workers) -@pytest.mark.parametrize("remote_storatge_kind", available_remote_storages()) -def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind): +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storatge_kind, + remote_storage_kind=remote_storage_kind, test_name="test_tenants_many", ) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 47838ddb76..28daeb18ed 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -420,12 +420,12 @@ def wait_wal_trim(tenant_id, timeline_id, sk, target_size): time.sleep(0.5) -@pytest.mark.parametrize("remote_storatge_kind", available_remote_storages()) -def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind): +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storatge_kind, + remote_storage_kind=remote_storage_kind, test_name="test_safekeepers_wal_backup", ) @@ -468,12 +468,12 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: Remo wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], "0/5000000") -@pytest.mark.parametrize("remote_storatge_kind", available_remote_storages()) -def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: 
RemoteStorageKind): +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storatge_kind, + remote_storage_kind=remote_storage_kind, test_name="test_s3_wal_replay", ) From 88a339ed73b82bdb3aa9afcb5facdd5e63c20f99 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 27 Aug 2022 18:14:30 +0300 Subject: [PATCH 0694/1022] Update a few crates "cargo tree -d" showed that we're building multiple versions of some crates. Update some crates, to avoid depending on multiple versions. --- Cargo.lock | 45 ++++++++++----------------------------- libs/utils/Cargo.toml | 2 +- proxy/Cargo.toml | 4 ++-- workspace_hack/Cargo.toml | 10 +++++---- 4 files changed, 20 insertions(+), 41 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 505cbb66c3..73b9c318ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1164,20 +1164,14 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" -[[package]] -name = "hashbrown" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" -dependencies = [ - "ahash", -] - [[package]] name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +dependencies = [ + "ahash", +] [[package]] name = "heck" @@ -1379,7 +1373,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" dependencies = [ "autocfg", - "hashbrown 0.12.3", + "hashbrown", ] [[package]] @@ -2274,7 +2268,7 @@ dependencies = [ "clap 3.2.16", "futures", "git-version", - "hashbrown 0.11.2", + "hashbrown", "hex", "hmac 0.12.1", "hyper", @@ -2289,7 +2283,7 @@ dependencies = [ "routerify", "rstest", "rustls", - "rustls-pemfile 0.2.1", + "rustls-pemfile", "scopeguard", "serde", "serde_json", @@ -2315,15 +2309,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "quickcheck" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" -dependencies = [ - "rand", -] - [[package]] name = "quote" version = "1.0.21" @@ -2508,7 +2493,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rustls", - "rustls-pemfile 1.0.1", + "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", @@ -2697,15 +2682,6 @@ dependencies = [ "webpki", ] -[[package]] -name = "rustls-pemfile" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eebeaeb360c87bfb72e84abdb3447159c0eaececf1bef2aecd65a8be949d1c9" -dependencies = [ - "base64", -] - [[package]] name = "rustls-pemfile" version = "1.0.1" @@ -3242,7 +3218,6 @@ dependencies = [ "js-sys", "libc", "num_threads", - "quickcheck", "time-macros", ] @@ -3683,7 +3658,7 @@ dependencies = [ "rand", "routerify", "rustls", - "rustls-pemfile 0.2.1", + "rustls-pemfile", "rustls-split", "serde", "serde_json", @@ -3969,6 +3944,7 @@ version = "0.1.0" dependencies = [ "ahash", "anyhow", + "bstr", "bytes", "chrono", "clap 2.34.0", @@ -3978,7 +3954,7 @@ dependencies = [ "futures-task", "futures-util", 
"generic-array", - "hashbrown 0.11.2", + "hashbrown", "hex", "hyper", "indexmap", @@ -3993,6 +3969,7 @@ dependencies = [ "prost", "rand", "regex", + "regex-automata", "regex-syntax", "scopeguard", "serde", diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index e3e78ec68f..28ad658de4 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -39,7 +39,7 @@ bytes = "1.0.1" hex-literal = "0.3" tempfile = "3.2" criterion = "0.3" -rustls-pemfile = "0.2.1" +rustls-pemfile = "1" [[bench]] name = "benchmarks" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 230fc8a253..d3f7ea5fdc 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -11,7 +11,7 @@ bstr = "0.2.17" bytes = { version = "1.0.1", features = ['serde'] } clap = "3.0" futures = "0.3.13" -hashbrown = "0.11.2" +hashbrown = "0.12" hex = "0.4.3" hmac = "0.12.1" hyper = "0.14" @@ -23,7 +23,7 @@ rand = "0.8.3" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } routerify = "3" rustls = "0.20.0" -rustls-pemfile = "0.2.1" +rustls-pemfile = "1" scopeguard = "1.1.0" serde = "1" serde_json = "1" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 4dc7e4e157..bfe61b9ced 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -16,6 +16,7 @@ publish = false [dependencies] ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } +bstr = { version = "0.2", features = ["lazy_static", "regex-automata", "serde", "serde1", "serde1-nostd", "std", "unicode"] } bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } @@ -25,7 +26,7 @@ futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink" futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] } futures-util = { version = "0.3", default-features = false, features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", "futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] } generic-array = { version = "0.14", default-features = false, features = ["more_lengths"] } -hashbrown = { version = "0.11", features = ["ahash", "inline-more", "raw"] } +hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } hex = { version = "0.4", features = ["alloc", "serde", "std"] } hyper = { version = "0.14", features = ["client", "full", "h2", "http1", "http2", "runtime", "server", "socket2", "stream", "tcp"] } indexmap = { version = "1", default-features = false, features = ["std"] } @@ -40,12 +41,13 @@ num-traits = { version = "0.2", features = ["i128", "std"] } prost = { version = "0.10", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } +regex-automata = { version = "0.1", features = ["regex-syntax", "std"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", 
"unicode-script", "unicode-segment"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } -time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "quickcheck", "quickcheck-dep", "std", "time-macros"] } +time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "winapi"] } -tokio-util = { version = "0.7", features = ["codec", "io"] } +tokio-util = { version = "0.7", features = ["codec", "io", "tracing"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } tracing-core = { version = "0.1", features = ["lazy_static", "std", "valuable"] } @@ -55,7 +57,7 @@ anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } -hashbrown = { version = "0.11", features = ["ahash", "inline-more", "raw"] } +hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } From 34b5d7aa9f59d12e753a1d00ce410350b439d7a0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 27 Aug 2022 18:14:33 +0300 Subject: [PATCH 0695/1022] Remove unused dependency --- Cargo.lock | 1 - libs/postgres_ffi/Cargo.toml | 1 - pageserver/Cargo.toml | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 73b9c318ea..603e034ed3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2105,7 +2105,6 @@ dependencies = [ "bindgen", "byteorder", "bytes", - "chrono", "crc32c", "env_logger", "hex", diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 0118701a7e..5b9ecb7394 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" edition = "2021" [dependencies] -chrono = "0.4.19" rand = "0.8.3" regex = "1.4.5" bytes = "1.0.1" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 63a2263ae0..902765f424 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -15,7 +15,7 @@ failpoints = ["fail/failpoints"] chrono = "0.4.19" rand = "0.8.3" regex = "1.4.5" -bytes = { version = "1.0.1", features = ['serde'] } +bytes = "1.0.1" byteorder = "1.4.3" futures = "0.3.13" hex = "0.4.3" From f8188e679c51dcd07e90a4152d720e323748c412 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 27 Aug 2022 18:14:35 +0300 Subject: [PATCH 0696/1022] Downgrade a few panics into plain errors. Let's not bring down the whole pageserver if you import a bogus tar archive to one timeline. 
--- pageserver/src/import_datadir.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 54e791e5b5..4cc3aafb0e 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -331,7 +331,11 @@ pub fn import_basebackup_from_tar( debug!("directory {:?}", file_path); } _ => { - panic!("tar::EntryType::?? {}", file_path.display()); + bail!( + "entry {} in backup tar archive is of unexpected type: {:?}", + file_path.display(), + header.entry_type() + ); } } } @@ -384,7 +388,11 @@ pub fn import_wal_from_tar( continue; } _ => { - panic!("tar::EntryType::?? {}", file_path.display()); + bail!( + "entry {} in WAL tar archive is of unexpected type: {:?}", + file_path.display(), + header.entry_type() + ); } } }; @@ -424,14 +432,12 @@ pub fn import_wal_from_tar( Ok(()) } -pub fn import_file( +fn import_file( modification: &mut DatadirModification, file_path: &Path, reader: Reader, len: usize, ) -> Result> { - debug!("looking at {:?}", file_path); - if file_path.starts_with("global") { let spcnode = pg_constants::GLOBALTABLESPACE_OID; let dbnode = 0; @@ -553,7 +559,10 @@ pub fn import_file( // this to import arbitrary postgres databases. bail!("Importing pg_tblspc is not implemented"); } else { - debug!("ignored"); + debug!( + "ignoring unrecognized file \"{}\" in tar archive", + file_path.display() + ); } Ok(None) From 5f189cd3855c3526d52f6184363e2026458025ae Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 27 Aug 2022 18:14:38 +0300 Subject: [PATCH 0697/1022] Remove some unnecessary derives. Doesn't make much difference, but let's be tidy. --- pageserver/src/storage_sync/index.rs | 2 +- safekeeper/src/safekeeper.rs | 2 +- safekeeper/src/send_wal.rs | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 3dddda09bf..7e644da412 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -210,7 +210,7 @@ impl RemoteTimelineIndex { } /// Restored index part data about the timeline, stored in the remote index. -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, Clone)] pub struct RemoteTimeline { timeline_layers: HashSet, missing_layers: HashSet, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 22f8ca2de4..ed34669dde 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -332,7 +332,7 @@ pub struct AppendRequestHeader { } /// Report safekeeper state to proposer -#[derive(Debug, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] pub struct AppendResponse { // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. 
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 97ec945c3e..38523f9f82 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -36,7 +36,7 @@ const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; type FullTransactionId = u64; /// Hot standby feedback received from replica -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct HotStandbyFeedback { pub ts: TimestampTz, pub xmin: FullTransactionId, @@ -54,7 +54,7 @@ impl HotStandbyFeedback { } /// Standby status update -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Deserialize)] pub struct StandbyReply { pub write_lsn: Lsn, // last lsn received by pageserver pub flush_lsn: Lsn, // pageserver's disk consistent lSN From 7a840ec60ca7248625a9e88fbddcf14ca67207ed Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 27 Aug 2022 18:14:40 +0300 Subject: [PATCH 0698/1022] Move save_metadata function. `timeline.rs` seems like a better home for it. --- pageserver/src/layered_repository.rs | 6 +- pageserver/src/layered_repository/metadata.rs | 64 +++++++++++++++---- pageserver/src/layered_repository/timeline.rs | 41 +----------- 3 files changed, 56 insertions(+), 55 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index fae52c3daf..36b8e3eb9e 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -69,7 +69,7 @@ pub use timeline::Timeline; pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; // re-export for use in storage_sync.rs -pub use crate::layered_repository::timeline::save_metadata; +pub use crate::layered_repository::metadata::save_metadata; // re-export for use in walreceiver pub use crate::layered_repository::timeline::WalReceiverInfo; @@ -185,7 +185,7 @@ impl Repository { crashsafe_dir::create_dir_all(timeline_path)?; let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - timeline::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; + save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; let timeline = Timeline::new( self.conf, @@ -294,7 +294,7 @@ impl Repository { src_timeline.initdb_lsn, ); crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; - timeline::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; + save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; timelines.insert(dst, LayeredTimelineEntry::Unloaded { id: dst, metadata }); info!("branched timeline {} from {} at {}", dst, src, start_lsn); diff --git a/pageserver/src/layered_repository/metadata.rs b/pageserver/src/layered_repository/metadata.rs index f3ddd42e76..910dba4644 100644 --- a/pageserver/src/layered_repository/metadata.rs +++ b/pageserver/src/layered_repository/metadata.rs @@ -6,10 +6,13 @@ //! //! The module contains all structs and related helper methods related to timeline metadata. +use std::fs::{File, OpenOptions}; +use std::io::Write; use std::path::PathBuf; -use anyhow::ensure; +use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; +use tracing::info_span; use utils::{ bin_ser::BeSer, lsn::Lsn, @@ -17,6 +20,7 @@ use utils::{ }; use crate::config::PageServerConf; +use crate::virtual_file::VirtualFile; use crate::STORAGE_FORMAT_VERSION; /// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic. 
@@ -65,17 +69,6 @@ struct TimelineMetadataBody { initdb_lsn: Lsn, } -/// Points to a place in pageserver's local directory, -/// where certain timeline's metadata file should be located. -pub fn metadata_path( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, -) -> PathBuf { - conf.timeline_path(&timelineid, &tenantid) - .join(METADATA_FILE_NAME) -} - impl TimelineMetadata { pub fn new( disk_consistent_lsn: Lsn, @@ -173,6 +166,53 @@ impl TimelineMetadata { } } +/// Points to a place in pageserver's local directory, +/// where certain timeline's metadata file should be located. +pub fn metadata_path( + conf: &'static PageServerConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, +) -> PathBuf { + conf.timeline_path(&timelineid, &tenantid) + .join(METADATA_FILE_NAME) +} + +/// Save timeline metadata to file +pub fn save_metadata( + conf: &'static PageServerConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, + data: &TimelineMetadata, + first_save: bool, +) -> anyhow::Result<()> { + let _enter = info_span!("saving metadata").entered(); + let path = metadata_path(conf, timelineid, tenantid); + // use OpenOptions to ensure file presence is consistent with first_save + let mut file = VirtualFile::open_with_options( + &path, + OpenOptions::new().write(true).create_new(first_save), + )?; + + let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?; + + if file.write(&metadata_bytes)? != metadata_bytes.len() { + bail!("Could not write all the metadata bytes in a single call"); + } + file.sync_all()?; + + // fsync the parent directory to ensure the directory entry is durable + if first_save { + let timeline_dir = File::open( + &path + .parent() + .expect("Metadata should always have a parent dir"), + )?; + timeline_dir.sync_all()?; + } + + Ok(()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index ecf9a87500..5f3d669dc1 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -11,8 +11,6 @@ use tracing::*; use std::cmp::{max, min, Ordering}; use std::collections::{HashMap, HashSet}; use std::fs; -use std::fs::{File, OpenOptions}; -use std::io::Write; use std::ops::{Deref, Range}; use std::path::PathBuf; use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering}; @@ -32,7 +30,7 @@ use crate::layered_repository::{ image_layer::{ImageLayer, ImageLayerWriter}, inmemory_layer::InMemoryLayer, layer_map::{LayerMap, SearchResult}, - metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, + metadata::{save_metadata, TimelineMetadata, METADATA_FILE_NAME}, par_fsync, storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}, }; @@ -54,7 +52,6 @@ use utils::{ use crate::repository::{GcResult, RepositoryTimeline}; use crate::repository::{Key, Value}; use crate::thread_mgr; -use crate::virtual_file::VirtualFile; use crate::walreceiver::IS_WAL_RECEIVER; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; @@ -2342,39 +2339,3 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { bail!("couldn't find an unused backup number for {:?}", path) } - -/// Save timeline metadata to file -pub fn save_metadata( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - data: &TimelineMetadata, - first_save: bool, -) -> Result<()> { - let _enter = info_span!("saving metadata").entered(); - let path = metadata_path(conf, 
timelineid, tenantid); - // use OpenOptions to ensure file presence is consistent with first_save - let mut file = VirtualFile::open_with_options( - &path, - OpenOptions::new().write(true).create_new(first_save), - )?; - - let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?; - - if file.write(&metadata_bytes)? != metadata_bytes.len() { - bail!("Could not write all the metadata bytes in a single call"); - } - file.sync_all()?; - - // fsync the parent directory to ensure the directory entry is durable - if first_save { - let timeline_dir = File::open( - &path - .parent() - .expect("Metadata should always have a parent dir"), - )?; - timeline_dir.sync_all()?; - } - - Ok(()) -} From bfa1d916124962f0079c25ab7c17fd6fb5d698a1 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 29 Aug 2022 11:23:37 +0300 Subject: [PATCH 0699/1022] Introduce RCU, and use it to protect latest_gc_cutoff_lsn. `latest_gc_cutoff_lsn` tracks the cutoff point where GC has been performed. Anything older than the cutoff might already have been GC'd away, and cannot be queried by get_page_at_lsn requests. It's protected by an RWLock. Whenever a get_page_at_lsn requests comes in, it first grabs the lock and reads the current `latest_gc_cutoff`, and holds the lock it until the request has been served. The lock ensures that GC doesn't start concurrently and remove page versions that we still need to satisfy the request. With the lock, get_page_at_lsn request could potentially be blocked for a long time. GC only holds the lock in exclusive mode for a short duration, but depending on how whether the RWLock is "fair", a read request might be queued behind the GC's exclusive request, which in turn might be queued behind a long-running read operation, like a basebackup. If the lock implementation is not fair, i.e. if a reader can always jump the queue if the lock is already held in read mode, then another problem arises: GC might be starved if a constant stream of GetPage requests comes in. To avoid the long wait or starvation, introduce a Read-Copy-Update mechanism to replace the lock on `latest_gc_cutoff_lsn`. With the RCU, reader can always read the latest value without blocking (except for a very short duration if the lock protecting the RCU is contended; that's comparable to a spinlock). And a writer can always write a new value without waiting for readers to finish using the old value. The old readers will continue to see the old value through their guard object, while new readers will see the new value. This is purely theoretical ATM, we don't have any reports of either starvation or blocking behind GC happening in practice. But it's simple to fix, so let's nip that problem in the bud. --- libs/utils/src/lib.rs | 3 + libs/utils/src/simple_rcu.rs | 217 ++++++++++++++++++ pageserver/src/layered_repository.rs | 5 +- pageserver/src/layered_repository/timeline.rs | 33 ++- pageserver/src/page_service.rs | 5 +- 5 files changed, 249 insertions(+), 14 deletions(-) create mode 100644 libs/utils/src/simple_rcu.rs diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 1b011bb73a..fa7a37adf1 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -8,6 +8,9 @@ pub mod lsn; /// SeqWait allows waiting for a future sequence number to arrive pub mod seqwait; +/// A simple Read-Copy-Update implementation. 
+pub mod simple_rcu; + /// append only ordered map implemented with a Vec pub mod vec_map; diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs new file mode 100644 index 0000000000..24423815ab --- /dev/null +++ b/libs/utils/src/simple_rcu.rs @@ -0,0 +1,217 @@ +//! +//! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat +//! similar to a lock, but it allows readers to "hold on" to an old value of RCU +//! without blocking writers, and allows writing a new values without blocking +//! readers. When you update the new value, the new value is immediately visible +//! to new readers, but the update waits until all existing readers have +//! finishe, so that no one sees the old value anymore. +//! +//! This implementation isn't wait-free; it uses an RwLock that is held for a +//! short duration when the value is read or updated. +//! +#![warn(missing_docs)] + +use std::ops::Deref; +use std::sync::mpsc::{sync_channel, Receiver, SyncSender}; +use std::sync::{Arc, Weak}; +use std::sync::{Mutex, RwLock, RwLockWriteGuard}; + +/// +/// Rcu allows multiple readers to read and hold onto a value without blocking +/// (for very long). Storing to the Rcu updates the value, making new readers +/// immediately see the new value, but it also waits for all current readers to +/// finish. +/// +pub struct Rcu { + inner: RwLock>, +} + +struct RcuInner { + current_cell: Arc>, + old_cells: Vec>>, +} + +/// +/// RcuCell holds one value. It can be the latest one, or an old one. +/// +struct RcuCell { + value: V, + + /// A dummy channel. We never send anything to this channel. The point is + /// that when the RcuCell is dropped, any cloned Senders will be notified + /// that the channel is closed. Updaters can use this to wait out until the + /// RcuCell has been dropped, i.e. until the old value is no longer in use. + /// + /// We never do anything with the receiver, we just need to hold onto it so + /// that the Senders will be notified when it's dropped. But because it's + /// not Sync, we need a Mutex on it. + watch: (SyncSender<()>, Mutex>), +} + +impl RcuCell { + fn new(value: V) -> Self { + let (watch_sender, watch_receiver) = sync_channel(0); + RcuCell { + value, + watch: (watch_sender, Mutex::new(watch_receiver)), + } + } +} + +impl Rcu { + /// Create a new `Rcu`, initialized to `starting_val` + pub fn new(starting_val: V) -> Self { + let inner = RcuInner { + current_cell: Arc::new(RcuCell::new(starting_val)), + old_cells: Vec::new(), + }; + Self { + inner: RwLock::new(inner), + } + } + + /// + /// Read current value. Any store() calls will block until the returned + /// guard object is dropped. + /// + pub fn read(&self) -> RcuReadGuard { + let current_cell = Arc::clone(&self.inner.read().unwrap().current_cell); + RcuReadGuard { cell: current_cell } + } + + /// + /// Lock the current value for updating. Returns a guard object that can be + /// used to read the current value, and to store a new value. + /// + /// Note: holding the write-guard blocks concurrent readers, so you should + /// finish the update and drop the guard quickly! 
+ /// + pub fn write(&self) -> RcuWriteGuard<'_, V> { + let inner = self.inner.write().unwrap(); + RcuWriteGuard { inner } + } +} + +/// +/// Read guard returned by `read` +/// +pub struct RcuReadGuard { + cell: Arc>, +} + +impl Deref for RcuReadGuard { + type Target = V; + + fn deref(&self) -> &V { + &self.cell.value + } +} + +/// +/// Read guard returned by `read` +/// +pub struct RcuWriteGuard<'a, V> { + inner: RwLockWriteGuard<'a, RcuInner>, +} + +impl<'a, V> Deref for RcuWriteGuard<'a, V> { + type Target = V; + + fn deref(&self) -> &V { + &self.inner.current_cell.value + } +} + +impl<'a, V> RcuWriteGuard<'a, V> { + /// + /// Store a new value. The new value will be written to the Rcu immediately, + /// and will be immediately seen by any `read` calls that start afterwards. + /// But if there are any readers still holding onto the old value, or any + /// even older values, this will await until they have been released. + /// + /// This will drop the write-guard before it starts waiting for the reads to + /// finish, so a new write operation can begin before this functio returns. + /// + pub fn store(mut self, new_val: V) { + let new_cell = Arc::new(RcuCell::new(new_val)); + + let mut watches = Vec::new(); + { + let old = std::mem::replace(&mut self.inner.current_cell, new_cell); + self.inner.old_cells.push(Arc::downgrade(&old)); + + // cleanup old cells that no longer have any readers, and collect + // the watches for any that do. + self.inner.old_cells.retain(|weak| { + if let Some(cell) = weak.upgrade() { + watches.push(cell.watch.0.clone()); + true + } else { + false + } + }); + } + drop(self); + + // after all the old_cells are no longer in use, we're done + for w in watches.iter_mut() { + // This will block until the Receiver is closed. That happens then + // the RcuCell is dropped. + #[allow(clippy::single_match)] + match w.send(()) { + Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"), + Err(_) => { + // closed, which means that the cell has been dropped, and + // its value is no longer in use + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::{Arc, Mutex}; + use std::thread::{sleep, spawn}; + use std::time::Duration; + + #[test] + fn basic() { + let rcu = Arc::new(Rcu::new(1)); + let log = Arc::new(Mutex::new(Vec::new())); + + let a = rcu.read(); + assert_eq!(*a, 1); + log.lock().unwrap().push("one"); + + let (rcu_clone, log_clone) = (Arc::clone(&rcu), Arc::clone(&log)); + let thread = spawn(move || { + log_clone.lock().unwrap().push("store two start"); + let write_guard = rcu_clone.write(); + assert_eq!(*write_guard, 1); + write_guard.store(2); + log_clone.lock().unwrap().push("store two done"); + }); + // without this sleep the test can pass on accident if the writer is slow + sleep(Duration::from_secs(1)); + + // new read should see the new value + let b = rcu.read(); + assert_eq!(*b, 2); + + // old guard still sees the old value + assert_eq!(*a, 1); + + // Release the old guard. This lets the store in the thread to finish. 
+ log.lock().unwrap().push("release a"); + drop(a); + + thread.join().unwrap(); + + assert_eq!( + log.lock().unwrap().as_slice(), + &["one", "store two start", "release a", "store two done",] + ); + } +} diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 36b8e3eb9e..73c30b51b8 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -254,7 +254,8 @@ impl Repository { src_timeline .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) .context(format!( - "invalid branch start lsn: less than latest GC cutoff {latest_gc_cutoff_lsn}" + "invalid branch start lsn: less than latest GC cutoff {}", + *latest_gc_cutoff_lsn ))?; { let gc_info = src_timeline.gc_info.read().unwrap(); @@ -290,7 +291,7 @@ impl Repository { dst_prev, Some(src), start_lsn, - *src_timeline.latest_gc_cutoff_lsn.read().unwrap(), + *src_timeline.latest_gc_cutoff_lsn.read(), src_timeline.initdb_lsn, ); crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 5f3d669dc1..1a941affe5 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -14,7 +14,7 @@ use std::fs; use std::ops::{Deref, Range}; use std::path::PathBuf; use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError}; use std::time::{Duration, Instant, SystemTime}; use metrics::{ @@ -46,6 +46,7 @@ use postgres_ffi::v14::xlog_utils::to_pg_timestamp; use utils::{ lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, + simple_rcu::{Rcu, RcuReadGuard}, zid::{ZTenantId, ZTimelineId}, }; @@ -367,7 +368,7 @@ pub struct Timeline { layer_removal_cs: Mutex<()>, // Needed to ensure that we can't create a branch at a point that was already garbage collected - pub latest_gc_cutoff_lsn: RwLock, + pub latest_gc_cutoff_lsn: Rcu, // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. @@ -478,8 +479,8 @@ impl Timeline { } /// Lock and get timeline's GC cuttof - pub fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard { - self.latest_gc_cutoff_lsn.read().unwrap() + pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { + self.latest_gc_cutoff_lsn.read() } /// Look up given page version. @@ -594,7 +595,7 @@ impl Timeline { pub fn check_lsn_is_in_scope( &self, lsn: Lsn, - latest_gc_cutoff_lsn: &RwLockReadGuard, + latest_gc_cutoff_lsn: &RcuReadGuard, ) -> Result<()> { ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -729,7 +730,7 @@ impl Timeline { pitr_cutoff: Lsn(0), }), - latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), + latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), current_logical_size: AtomicI64::new(0), @@ -1377,7 +1378,7 @@ impl Timeline { ondisk_prev_record_lsn, ancestor_timelineid, self.ancestor_lsn, - *self.latest_gc_cutoff_lsn.read().unwrap(), + *self.latest_gc_cutoff_lsn.read(), self.initdb_lsn, ); @@ -2032,9 +2033,21 @@ impl Timeline { let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered(); - // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. 
- // See branch_timeline() for details. - *self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff; + // We need to ensure that no one tries to read page versions or create + // branches at a point before latest_gc_cutoff_lsn. See branch_timeline() + // for details. This will block until the old value is no longer in use. + // + // The GC cutoff should only ever move forwards. + { + let write_guard = self.latest_gc_cutoff_lsn.write(); + ensure!( + *write_guard <= new_gc_cutoff, + "Cannot move GC cutoff LSN backwards (was {}, new {})", + *write_guard, + new_gc_cutoff + ); + write_guard.store(new_gc_cutoff); + } info!("GC starting"); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index ebcff1f2ac..fbc70f7690 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -17,13 +17,14 @@ use std::io::{self, Read}; use std::net::TcpListener; use std::str; use std::str::FromStr; -use std::sync::{Arc, RwLockReadGuard}; +use std::sync::Arc; use tracing::*; use utils::{ auth::{self, Claims, JwtAuth, Scope}, lsn::Lsn, postgres_backend::{self, is_socket_read_timed_out, AuthType, PostgresBackend}, pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}, + simple_rcu::RcuReadGuard, zid::{ZTenantId, ZTimelineId}, }; @@ -639,7 +640,7 @@ impl PageServerHandler { timeline: &Timeline, mut lsn: Lsn, latest: bool, - latest_gc_cutoff_lsn: &RwLockReadGuard, + latest_gc_cutoff_lsn: &RcuReadGuard, ) -> Result { if latest { // Latest page version was requested. If LSN is given, it is a hint From 1324dd89ed612e709fc8c84206d0a32936382789 Mon Sep 17 00:00:00 2001 From: MMeent Date: Mon, 29 Aug 2022 13:44:56 +0200 Subject: [PATCH 0700/1022] Mark PostGIS and PLV8 as trusted extensions (#2355) Now, users can install these extensions themselves if they are owner of the database they try to install the extension in. 
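With `trusted = true` in the control files, CREATE EXTENSION no longer requires
superuser, only ownership of the target database. A minimal sketch of what that
enables, assuming psycopg2 is available and a compute is reachable at the
made-up DSN below, where `db_owner` is an ordinary role that owns `mydb`:

import psycopg2

# An ordinary database owner, not a superuser, installs a trusted extension.
conn = psycopg2.connect("host=localhost port=55432 dbname=mydb user=db_owner password=secret")
conn.autocommit = True
with conn.cursor() as cur:
    cur.execute("CREATE EXTENSION IF NOT EXISTS postgis")
    cur.execute("SELECT postgis_version()")
    print(cur.fetchone()[0])
conn.close()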
--- Dockerfile.compute-node | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 057441e730..950ec16016 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -33,7 +33,11 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.2.3.tar.gz && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ cd extensions/postgis && \ make clean && \ - make -j $(getconf _NPROCESSORS_ONLN) install + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control # Build plv8 FROM build-deps AS plv8-build @@ -54,7 +58,8 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.3.tar.gz && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make && \ make install && \ - rm -rf /plv8-* + rm -rf /plv8-* && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control # compile neon extensions FROM build-deps AS neon-pg-ext-build From ee8b5f967dab4ffac35f8c920b4b2d37567c3105 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 29 Aug 2022 17:59:04 +0300 Subject: [PATCH 0701/1022] Add fork_at_current_lsn function which creates branch at current LSN (#2344) * Add fork_at_current_lsn function which creates branch at current LSN * Undo use of fork_at_current_lsn in test_branching because of short GC period * Add missed return in fork_at_current_lsn * Add missed return in fork_at_current_lsn * Update test_runner/fixtures/neon_fixtures.py Co-authored-by: Heikki Linnakangas * Update test_runner/fixtures/neon_fixtures.py Co-authored-by: Heikki Linnakangas * Update test_runner/fixtures/neon_fixtures.py Co-authored-by: Heikki Linnakangas Co-authored-by: Heikki Linnakangas --- test_runner/batch_others/test_twophase.py | 4 ++-- test_runner/batch_others/test_vm_bits.py | 4 ++-- test_runner/fixtures/neon_fixtures.py | 16 ++++++++++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/test_runner/batch_others/test_twophase.py b/test_runner/batch_others/test_twophase.py index e01ba7caef..f3b0f9ca06 100644 --- a/test_runner/batch_others/test_twophase.py +++ b/test_runner/batch_others/test_twophase.py @@ -1,7 +1,7 @@ import os from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn # @@ -55,7 +55,7 @@ def test_twophase(neon_simple_env: NeonEnv): assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - env.neon_cli.create_branch("test_twophase_prepared", "test_twophase") + fork_at_current_lsn(env, pg, "test_twophase_prepared", "test_twophase") # Start compute on the new branch pg2 = env.postgres.create_start( diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/batch_others/test_vm_bits.py index c147c6dff5..16a870471b 100644 --- a/test_runner/batch_others/test_vm_bits.py +++ b/test_runner/batch_others/test_vm_bits.py @@ -1,5 +1,5 @@ from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn # @@ -33,7 +33,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): cur.execute("UPDATE vmtest_update SET id = 5000 WHERE id = 1") # Branch at this 
point, to test that later - env.neon_cli.create_branch("test_vm_bit_clear_new", "test_vm_bit_clear") + fork_at_current_lsn(env, pg, "test_vm_bit_clear_new", "test_vm_bit_clear") # Clear the buffer cache, to force the VM page to be re-fetched from # the page server diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ad686e1fce..32fd6f19c3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2640,3 +2640,19 @@ def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: uuid.UUID, timel """Wait for pageserver to catch up the latest flush LSN""" last_flush_lsn = lsn_from_hex(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) + + +def fork_at_current_lsn( + env: NeonEnv, + pg: Postgres, + new_branch_name: str, + ancestor_branch_name: str, + tenant_id: Optional[uuid.UUID] = None, +) -> uuid.UUID: + """ + Create new branch at the last LSN of an existing branch. + The "last LSN" is taken from the given Postgres instance. The pageserver will wait for all the + the WAL up to that LSN to arrive in the pageserver before creating the branch. + """ + current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0] + return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn) From 07b4ace52fd6097e74d982ba4dbd74dd28a4f8dc Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 27 Aug 2022 01:50:18 +0300 Subject: [PATCH 0702/1022] Use more restrictive .dockerignore --- .dockerignore | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/.dockerignore b/.dockerignore index 8a3d32e6d2..2c78951923 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,19 +1,18 @@ -**/.git/ -**/__pycache__ -**/.pytest_cache +* -.git -.github -target -tmp_check -tmp_install -tmp_check_cli -test_output -.vscode -.neon -integration_tests/.neon -.mypy_cache - -Dockerfile -.dockerignore +!Cargo.toml +!Cargo.lock +!Makefile +!.cargo/ +!.config/ +!control_plane/ +!compute_tools/ +!libs/ +!pageserver/ +!pgxn/ +!proxy/ +!safekeeper/ +!vendor/postgres/ +!workspace_hack/ +!neon_local/ From 60408db101b2ddcf877759405acb6ab6f6af7505 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 30 Aug 2022 10:52:58 +0300 Subject: [PATCH 0703/1022] Fix logging scopes in safekeeper. --- safekeeper/src/timeline.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 3a10c5d59e..f482dbb3aa 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -529,7 +529,7 @@ impl Timeline { // release the lock before removing } let _enter = - info_span!("", timeline = %self.zttid.tenant_id, tenant = %self.zttid.timeline_id) + info_span!("", tenant = %self.zttid.tenant_id, timeline = %self.zttid.timeline_id) .entered(); remover(horizon_segno - 1)?; self.mutex.lock().unwrap().last_removed_segno = horizon_segno; @@ -626,7 +626,7 @@ impl GlobalTimelines { zttid: ZTenantTimelineId, create: bool, ) -> Result> { - let _enter = info_span!("", timeline = %zttid.tenant_id).entered(); + let _enter = info_span!("", timeline = %zttid.timeline_id).entered(); let mut state = TIMELINES_STATE.lock().unwrap(); From 96a50e99cf1b6800207570962e206a65db8215de Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Tue, 30 Aug 2022 17:36:21 +0300 Subject: [PATCH 0704/1022] Forward various connection params to compute nodes. 
(#2336) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, proxy didn't forward auxiliary `options` parameter and other ones to the client's compute node, e.g. ``` $ psql "user=john host=localhost dbname=postgres options='-cgeqo=off'" postgres=# show geqo; ┌──────┐ │ geqo │ ├──────┤ │ on │ └──────┘ (1 row) ``` With this patch we now forward `options`, `application_name` and `replication`. Further reading: https://www.postgresql.org/docs/current/libpq-connect.html Fixes #1287. --- Cargo.lock | 1 + libs/utils/src/pq_proto.rs | 157 +++++++++++++++++------ proxy/Cargo.toml | 1 + proxy/src/auth/backend.rs | 4 +- proxy/src/auth/backend/console.rs | 8 +- proxy/src/auth/backend/legacy_console.rs | 12 +- proxy/src/auth/backend/postgres.rs | 6 +- proxy/src/auth/credentials.rs | 85 ++++++------ proxy/src/cancellation.rs | 2 +- proxy/src/compute.rs | 39 +++++- proxy/src/proxy.rs | 52 +++++--- safekeeper/src/handler.rs | 25 ++-- test_runner/batch_others/test_proxy.py | 6 +- 13 files changed, 271 insertions(+), 127 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 603e034ed3..2e300e46f5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2271,6 +2271,7 @@ dependencies = [ "hex", "hmac 0.12.1", "hyper", + "itertools", "md5", "metrics", "once_cell", diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index 2f8dcf31d3..dde76039d7 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -7,11 +7,14 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_protocol::PG_EPOCH; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::future::Future; -use std::io::{self, Cursor}; -use std::str; -use std::time::{Duration, SystemTime}; +use std::{ + borrow::Cow, + collections::HashMap, + future::Future, + io::{self, Cursor}, + str, + time::{Duration, SystemTime}, +}; use tokio::io::AsyncReadExt; use tracing::{trace, warn}; @@ -53,7 +56,67 @@ pub enum FeStartupPacket { }, } -pub type StartupMessageParams = HashMap; +#[derive(Debug)] +pub struct StartupMessageParams { + params: HashMap, +} + +impl StartupMessageParams { + /// Get parameter's value by its name. + pub fn get(&self, name: &str) -> Option<&str> { + self.params.get(name).map(|s| s.as_str()) + } + + /// Split command-line options according to PostgreSQL's logic, + /// taking into account all escape sequences but leaving them as-is. + /// [`None`] means that there's no `options` in [`Self`]. + pub fn options_raw(&self) -> Option> { + // See `postgres: pg_split_opts`. + let mut last_was_escape = false; + let iter = self + .get("options")? + .split(move |c: char| { + // We split by non-escaped whitespace symbols. + let should_split = c.is_ascii_whitespace() && !last_was_escape; + last_was_escape = c == '\\' && !last_was_escape; + should_split + }) + .filter(|s| !s.is_empty()); + + Some(iter) + } + + /// Split command-line options according to PostgreSQL's logic, + /// applying all escape sequences (using owned strings as needed). + /// [`None`] means that there's no `options` in [`Self`]. + pub fn options_escaped(&self) -> Option>> { + // See `postgres: pg_split_opts`. + let iter = self.options_raw()?.map(|s| { + let mut preserve_next_escape = false; + let escape = |c| { + // We should remove '\\' unless it's preceded by '\\'. 
+ let should_remove = c == '\\' && !preserve_next_escape; + preserve_next_escape = should_remove; + should_remove + }; + + match s.contains('\\') { + true => Cow::Owned(s.replace(escape, "")), + false => Cow::Borrowed(s), + } + }); + + Some(iter) + } + + // This function is mostly useful in tests. + #[doc(hidden)] + pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self { + Self { + params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(), + } + } +} #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct CancelKeyData { @@ -237,9 +300,9 @@ impl FeStartupPacket { stream.read_exact(params_bytes.as_mut()).await?; // Parse params depending on request code - let most_sig_16_bits = request_code >> 16; - let least_sig_16_bits = request_code & ((1 << 16) - 1); - let message = match (most_sig_16_bits, least_sig_16_bits) { + let req_hi = request_code >> 16; + let req_lo = request_code & ((1 << 16) - 1); + let message = match (req_hi, req_lo) { (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { ensure!(params_len == 8, "expected 8 bytes for CancelRequest params"); let mut cursor = Cursor::new(params_bytes); @@ -248,49 +311,44 @@ impl FeStartupPacket { cancel_key: cursor.read_i32().await?, }) } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => FeStartupPacket::SslRequest, + (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { + // Requested upgrade to SSL (aka TLS) + FeStartupPacket::SslRequest + } (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => { + // Requested upgrade to GSSAPI FeStartupPacket::GssEncRequest } (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { bail!("Unrecognized request code {}", unrecognized_code) } + // TODO bail if protocol major_version is not 3? (major_version, minor_version) => { - // TODO bail if protocol major_version is not 3? - // Parse null-terminated (String) pairs of param name / param value - let params_str = str::from_utf8(¶ms_bytes).unwrap(); - let mut params_tokens = params_str.split('\0'); - let mut params: HashMap = HashMap::new(); - while let Some(name) = params_tokens.next() { - let value = params_tokens + // Parse pairs of null-terminated strings (key, value). + // See `postgres: ProcessStartupPacket, build_startup_packet`. + let mut tokens = str::from_utf8(¶ms_bytes) + .context("StartupMessage params: invalid utf-8")? + .strip_suffix('\0') // drop packet's own null terminator + .context("StartupMessage params: missing null terminator")? + .split_terminator('\0'); + + let mut params = HashMap::new(); + while let Some(name) = tokens.next() { + let value = tokens .next() - .context("expected even number of params in StartupMessage")?; - if name == "options" { - // parsing options arguments "...&options=%3D+=..." - // '%3D' is '=' and '+' is ' ' + .context("StartupMessage params: key without value")?; - // Note: we allow users that don't have SNI capabilities, - // to pass a special keyword argument 'project' - // to be used to determine the cluster name by the proxy. - - //TODO: write unit test for this and refactor in its own function. 
- for cmdopt in value.split(' ') { - let nameval: Vec<&str> = cmdopt.split('=').collect(); - if nameval.len() == 2 { - params.insert(nameval[0].to_string(), nameval[1].to_string()); - } - } - } else { - params.insert(name.to_string(), value.to_string()); - } + params.insert(name.to_owned(), value.to_owned()); } + FeStartupPacket::StartupMessage { major_version, minor_version, - params, + params: StartupMessageParams { params }, } } }; + Ok(Some(FeMessage::StartupPacket(message))) }) } @@ -967,6 +1025,33 @@ mod tests { assert_eq!(zf, zf_parsed); } + #[test] + fn test_startup_message_params_options_escaped() { + fn split_options(params: &StartupMessageParams) -> Vec> { + params + .options_escaped() + .expect("options are None") + .collect() + } + + let make_params = |options| StartupMessageParams::new([("options", options)]); + + let params = StartupMessageParams::new([]); + assert!(matches!(params.options_escaped(), None)); + + let params = make_params(""); + assert!(split_options(¶ms).is_empty()); + + let params = make_params("foo"); + assert_eq!(split_options(¶ms), ["foo"]); + + let params = make_params(" foo bar "); + assert_eq!(split_options(¶ms), ["foo", "bar"]); + + let params = make_params("foo\\ bar \\ \\\\ baz\\ lol"); + assert_eq!(split_options(¶ms), ["foo bar", " \\", "baz ", "lol"]); + } + // Make sure that `read` is sync/async callable async fn _assert(stream: &mut (impl tokio::io::AsyncRead + Unpin)) { let _ = FeMessage::read(&mut [].as_ref()); diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index d3f7ea5fdc..5a450793f1 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -15,6 +15,7 @@ hashbrown = "0.12" hex = "0.4.3" hmac = "0.12.1" hyper = "0.14" +itertools = "0.10.3" once_cell = "1.13.0" md5 = "0.7.0" parking_lot = "0.12" diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index bb7e7ef67b..9c43620ffb 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -127,7 +127,7 @@ impl BackendType> { } } -impl BackendType { +impl BackendType> { /// Authenticate the client via the requested backend, possibly using credentials. pub async fn authenticate( mut self, @@ -149,7 +149,7 @@ impl BackendType { // Finally we may finish the initialization of `creds`. // TODO: add missing type safety to ClientCredentials. 
- creds.project = Some(payload.project); + creds.project = Some(payload.project.into()); let mut config = match &self { Console(creds) => { diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index 87906679ea..e239320e9b 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -121,7 +121,7 @@ pub enum AuthInfo { #[must_use] pub(super) struct Api<'a> { endpoint: &'a ApiUrl, - creds: &'a ClientCredentials, + creds: &'a ClientCredentials<'a>, } impl<'a> Api<'a> { @@ -143,7 +143,7 @@ impl<'a> Api<'a> { url.path_segments_mut().push("proxy_get_role_secret"); url.query_pairs_mut() .append_pair("project", self.creds.project().expect("impossible")) - .append_pair("role", &self.creds.user); + .append_pair("role", self.creds.user); // TODO: use a proper logger println!("cplane request: {url}"); @@ -187,8 +187,8 @@ impl<'a> Api<'a> { config .host(host) .port(port) - .dbname(&self.creds.dbname) - .user(&self.creds.user); + .dbname(self.creds.dbname) + .user(self.creds.user); Ok(config) } diff --git a/proxy/src/auth/backend/legacy_console.rs b/proxy/src/auth/backend/legacy_console.rs index 17ba44e833..b99a004dcd 100644 --- a/proxy/src/auth/backend/legacy_console.rs +++ b/proxy/src/auth/backend/legacy_console.rs @@ -56,7 +56,7 @@ enum ProxyAuthResponse { NotReady { ready: bool }, // TODO: get rid of `ready` } -impl ClientCredentials { +impl ClientCredentials<'_> { fn is_existing_user(&self) -> bool { self.user.ends_with("@zenith") } @@ -64,15 +64,15 @@ impl ClientCredentials { async fn authenticate_proxy_client( auth_endpoint: &reqwest::Url, - creds: &ClientCredentials, + creds: &ClientCredentials<'_>, md5_response: &str, salt: &[u8; 4], psql_session_id: &str, ) -> Result { let mut url = auth_endpoint.clone(); url.query_pairs_mut() - .append_pair("login", &creds.user) - .append_pair("database", &creds.dbname) + .append_pair("login", creds.user) + .append_pair("database", creds.dbname) .append_pair("md5response", md5_response) .append_pair("salt", &hex::encode(salt)) .append_pair("psql_session_id", psql_session_id); @@ -103,7 +103,7 @@ async fn authenticate_proxy_client( async fn handle_existing_user( auth_endpoint: &reqwest::Url, client: &mut PqStream, - creds: &ClientCredentials, + creds: &ClientCredentials<'_>, ) -> auth::Result { let psql_session_id = super::link::new_psql_session_id(); let md5_salt = rand::random(); @@ -136,7 +136,7 @@ async fn handle_existing_user( pub async fn handle_user( auth_endpoint: &reqwest::Url, auth_link_uri: &reqwest::Url, - creds: &ClientCredentials, + creds: &ClientCredentials<'_>, client: &mut PqStream, ) -> auth::Result { if creds.is_existing_user() { diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs index 183fa52ec1..2055ee14c8 100644 --- a/proxy/src/auth/backend/postgres.rs +++ b/proxy/src/auth/backend/postgres.rs @@ -17,7 +17,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; #[must_use] pub(super) struct Api<'a> { endpoint: &'a ApiUrl, - creds: &'a ClientCredentials, + creds: &'a ClientCredentials<'a>, } // Helps eliminate graceless `.map_err` calls without introducing another ctor. 
@@ -87,8 +87,8 @@ impl<'a> Api<'a> { config .host(self.endpoint.host_str().unwrap_or("localhost")) .port(self.endpoint.port().unwrap_or(5432)) - .dbname(&self.creds.dbname) - .user(&self.creds.user); + .dbname(self.creds.dbname) + .user(self.creds.user); Ok(config) } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 4c72da1c48..ea71eba010 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,6 +1,7 @@ //! User credentials used in authentication. use crate::error::UserFacingError; +use std::borrow::Cow; use thiserror::Error; use utils::pq_proto::StartupMessageParams; @@ -27,51 +28,59 @@ impl UserFacingError for ClientCredsParseError {} /// Various client credentials which we use for authentication. /// Note that we don't store any kind of client key or password here. #[derive(Debug, Clone, PartialEq, Eq)] -pub struct ClientCredentials { - pub user: String, - pub dbname: String, - pub project: Option, +pub struct ClientCredentials<'a> { + pub user: &'a str, + pub dbname: &'a str, + pub project: Option>, } -impl ClientCredentials { +impl ClientCredentials<'_> { pub fn project(&self) -> Option<&str> { self.project.as_deref() } } -impl ClientCredentials { +impl<'a> ClientCredentials<'a> { pub fn parse( - mut options: StartupMessageParams, + params: &'a StartupMessageParams, sni: Option<&str>, common_name: Option<&str>, ) -> Result { use ClientCredsParseError::*; - // Some parameters are absolutely necessary, others not so much. - let mut get_param = |key| options.remove(key).ok_or(MissingKey(key)); - // Some parameters are stored in the startup message. + let get_param = |key| params.get(key).ok_or(MissingKey(key)); let user = get_param("user")?; let dbname = get_param("database")?; - let project_a = get_param("project").ok(); + + // Project name might be passed via PG's command-line options. + let project_a = params.options_raw().and_then(|options| { + for opt in options { + if let Some(value) = opt.strip_prefix("project=") { + return Some(Cow::Borrowed(value)); + } + } + None + }); // Alternative project name is in fact a subdomain from SNI. // NOTE: we do not consider SNI if `common_name` is missing. let project_b = sni .zip(common_name) .map(|(sni, cn)| { - // TODO: what if SNI is present but just a common name? subdomain_from_sni(sni, cn) - .ok_or_else(|| InconsistentSni(sni.to_owned(), cn.to_owned())) + .ok_or_else(|| InconsistentSni(sni.into(), cn.into())) + .map(Cow::<'static, str>::Owned) }) .transpose()?; let project = match (project_a, project_b) { // Invariant: if we have both project name variants, they should match. - (Some(a), Some(b)) if a != b => Some(Err(InconsistentProjectNames(a, b))), - (a, b) => a.or(b).map(|name| { - // Invariant: project name may not contain certain characters. - check_project_name(name).map_err(MalformedProjectName) + (Some(a), Some(b)) if a != b => Some(Err(InconsistentProjectNames(a.into(), b.into()))), + // Invariant: project name may not contain certain characters. 
+ (a, b) => a.or(b).map(|name| match project_name_valid(&name) { + false => Err(MalformedProjectName(name.into())), + true => Ok(name), }), } .transpose()?; @@ -84,12 +93,8 @@ impl ClientCredentials { } } -fn check_project_name(name: String) -> Result { - if name.chars().all(|c| c.is_alphanumeric() || c == '-') { - Ok(name) - } else { - Err(name) - } +fn project_name_valid(name: &str) -> bool { + name.chars().all(|c| c.is_alphanumeric() || c == '-') } fn subdomain_from_sni(sni: &str, common_name: &str) -> Option { @@ -102,18 +107,14 @@ fn subdomain_from_sni(sni: &str, common_name: &str) -> Option { mod tests { use super::*; - fn make_options<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> StartupMessageParams { - StartupMessageParams::from(pairs.map(|(k, v)| (k.to_owned(), v.to_owned()))) - } - #[test] #[ignore = "TODO: fix how database is handled"] fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. - let options = make_options([("user", "john_doe")]); + let options = StartupMessageParams::new([("user", "john_doe")]); // TODO: check that `creds.dbname` is None. - let creds = ClientCredentials::parse(options, None, None)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); Ok(()) @@ -121,9 +122,9 @@ mod tests { #[test] fn parse_missing_project() -> anyhow::Result<()> { - let options = make_options([("user", "john_doe"), ("database", "world")]); + let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]); - let creds = ClientCredentials::parse(options, None, None)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project, None); @@ -133,12 +134,12 @@ mod tests { #[test] fn parse_project_from_sni() -> anyhow::Result<()> { - let options = make_options([("user", "john_doe"), ("database", "world")]); + let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]); let sni = Some("foo.localhost"); let common_name = Some("localhost"); - let creds = ClientCredentials::parse(options, sni, common_name)?; + let creds = ClientCredentials::parse(&options, sni, common_name)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("foo")); @@ -148,13 +149,13 @@ mod tests { #[test] fn parse_project_from_options() -> anyhow::Result<()> { - let options = make_options([ + let options = StartupMessageParams::new([ ("user", "john_doe"), ("database", "world"), - ("project", "bar"), + ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let creds = ClientCredentials::parse(options, None, None)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("bar")); @@ -164,16 +165,16 @@ mod tests { #[test] fn parse_projects_identical() -> anyhow::Result<()> { - let options = make_options([ + let options = StartupMessageParams::new([ ("user", "john_doe"), ("database", "world"), - ("project", "baz"), + ("options", "project=baz"), ]); let sni = Some("baz.localhost"); let common_name = Some("localhost"); - let creds = ClientCredentials::parse(options, sni, common_name)?; + let creds = ClientCredentials::parse(&options, sni, common_name)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("baz")); @@ -183,17 +184,17 @@ mod 
tests { #[test] fn parse_projects_different() { - let options = make_options([ + let options = StartupMessageParams::new([ ("user", "john_doe"), ("database", "world"), - ("project", "first"), + ("options", "project=first"), ]); let sni = Some("second.localhost"); let common_name = Some("localhost"); assert!(matches!( - ClientCredentials::parse(options, sni, common_name).expect_err("should fail"), + ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"), ClientCredsParseError::InconsistentProjectNames(_, _) )); } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index a801313635..b7412b6f5b 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -95,7 +95,7 @@ impl<'a> Session<'a> { /// Store the cancel token for the given session. /// This enables query cancellation in [`crate::proxy::handshake`]. - pub fn enable_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData { + pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData { self.cancel_map .0 .lock() diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 3bad36661b..4ae44ded57 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,9 +1,11 @@ use crate::{cancellation::CancelClosure, error::UserFacingError}; use futures::TryFutureExt; +use itertools::Itertools; use std::{io, net::SocketAddr}; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::NoTls; +use utils::pq_proto::StartupMessageParams; #[derive(Debug, Error)] pub enum ConnectionError { @@ -110,7 +112,42 @@ pub struct PostgresConnection { impl NodeInfo { /// Connect to a corresponding compute node. - pub async fn connect(&self) -> Result<(PostgresConnection, CancelClosure), ConnectionError> { + pub async fn connect( + mut self, + params: &StartupMessageParams, + ) -> Result<(PostgresConnection, CancelClosure), ConnectionError> { + if let Some(options) = params.options_raw() { + // We must drop all proxy-specific parameters. + #[allow(unstable_name_collisions)] + let options: String = options + .filter(|opt| !opt.starts_with("project=")) + .intersperse(" ") // TODO: use impl from std once it's stabilized + .collect(); + + self.config.options(&options); + } + + if let Some(app_name) = params.get("application_name") { + self.config.application_name(app_name); + } + + if let Some(replication) = params.get("replication") { + use tokio_postgres::config::ReplicationMode; + match replication { + "true" | "on" | "yes" | "1" => { + self.config.replication_mode(ReplicationMode::Physical); + } + "database" => { + self.config.replication_mode(ReplicationMode::Logical); + } + _other => {} + } + } + + // TODO: extend the list of the forwarded startup parameters. + // Currently, tokio-postgres doesn't allow us to pass + // arbitrary parameters, but the ones above are a good start. + let (socket_addr, mut stream) = self .connect_raw() .await diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 29be79c886..72cb822910 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,6 +1,6 @@ use crate::auth; use crate::cancellation::{self, CancelMap}; -use crate::config::{ProxyConfig, TlsConfig}; +use crate::config::{AuthUrls, ProxyConfig, TlsConfig}; use crate::stream::{MetricsStream, PqStream, Stream}; use anyhow::{bail, Context}; use futures::TryFutureExt; @@ -93,20 +93,21 @@ async fn handle_client( None => return Ok(()), // it's a cancellation request }; + // Extract credentials which we're going to use for auth. 
let creds = { let sni = stream.get_ref().sni_hostname(); let common_name = tls.and_then(|tls| tls.common_name.as_deref()); let result = config .auth_backend - .map(|_| auth::ClientCredentials::parse(params, sni, common_name)) + .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? }; - let client = Client::new(stream, creds); + let client = Client::new(stream, creds, ¶ms); cancel_map - .with_session(|session| client.connect_to_db(config, session)) + .with_session(|session| client.connect_to_db(&config.auth_urls, session)) .await } @@ -174,38 +175,57 @@ async fn handshake( } /// Thin connection context. -struct Client { +struct Client<'a, S> { /// The underlying libpq protocol stream. stream: PqStream, /// Client credentials that we care about. - creds: auth::BackendType, + creds: auth::BackendType>, + /// KV-dictionary with PostgreSQL connection params. + params: &'a StartupMessageParams, } -impl Client { +impl<'a, S> Client<'a, S> { /// Construct a new connection context. - fn new(stream: PqStream, creds: auth::BackendType) -> Self { - Self { stream, creds } + fn new( + stream: PqStream, + creds: auth::BackendType>, + params: &'a StartupMessageParams, + ) -> Self { + Self { + stream, + creds, + params, + } } } -impl Client { +impl Client<'_, S> { /// Let the client authenticate and connect to the designated compute node. async fn connect_to_db( self, - config: &ProxyConfig, + urls: &AuthUrls, session: cancellation::Session<'_>, ) -> anyhow::Result<()> { - let Self { mut stream, creds } = self; + let Self { + mut stream, + creds, + params, + } = self; // Authenticate and connect to a compute node. - let auth = creds.authenticate(&config.auth_urls, &mut stream).await; + let auth = creds.authenticate(urls, &mut stream).await; let node = async { auth }.or_else(|e| stream.throw_error(e)).await?; + let reported_auth_ok = node.reported_auth_ok; - let (db, cancel_closure) = node.connect().or_else(|e| stream.throw_error(e)).await?; - let cancel_key_data = session.enable_cancellation(cancel_closure); + let (db, cancel_closure) = node + .connect(params) + .or_else(|e| stream.throw_error(e)) + .await?; + + let cancel_key_data = session.enable_query_cancellation(cancel_closure); // Report authentication success if we haven't done this already. - if !node.reported_auth_ok { + if !reported_auth_ok { stream .write_message_noflush(&Be::AuthenticationOk)? .write_message_noflush(&BeParameterStatusMessage::encoding())?; diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index c90c2a0446..3e301259ed 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -11,7 +11,6 @@ use anyhow::{bail, Context, Result}; use postgres_ffi::PG_TLI; use regex::Regex; -use std::str::FromStr; use std::sync::Arc; use tracing::info; use utils::{ @@ -67,18 +66,22 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { // ztenant id and ztimeline id are passed in connection string params fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> { if let FeStartupPacket::StartupMessage { params, .. } = sm { - self.ztenantid = match params.get("ztenantid") { - Some(z) => Some(ZTenantId::from_str(z)?), // just curious, can I do that from .map? 
- _ => None, - }; - - self.ztimelineid = match params.get("ztimelineid") { - Some(z) => Some(ZTimelineId::from_str(z)?), - _ => None, - }; + if let Some(options) = params.options_raw() { + for opt in options { + match opt.split_once('=') { + Some(("ztenantid", value)) => { + self.ztenantid = Some(value.parse()?); + } + Some(("ztimelineid", value)) => { + self.ztimelineid = Some(value.parse()?); + } + _ => continue, + } + } + } if let Some(app_name) = params.get("application_name") { - self.appname = Some(app_name.clone()); + self.appname = Some(app_name.to_owned()); } Ok(()) diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py index 1efb795140..bd02841dc0 100644 --- a/test_runner/batch_others/test_proxy.py +++ b/test_runner/batch_others/test_proxy.py @@ -134,12 +134,8 @@ async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProx # Pass extra options to the server. -# -# Currently, proxy eats the extra connection options, so this fails. -# See https://github.com/neondatabase/neon/issues/1287 -@pytest.mark.xfail def test_proxy_options(static_proxy): - with static_proxy.connect(options="-cproxytest.option=value") as conn: + with static_proxy.connect(options="project=irrelevant -cproxytest.option=value") as conn: with conn.cursor() as cur: cur.execute("SHOW proxytest.option") value = cur.fetchall()[0][0] From 3aca717f3d994875d2dd2a4a09568ced9b9de4c5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 30 Aug 2022 18:25:38 +0300 Subject: [PATCH 0705/1022] Reorganize python tests. Merge batch_others and batch_pg_regress. The original idea was to split all the python tests into multiple "batches" and run each batch in parallel as a separate CI job. However, the batch_pg_regress batch was pretty short compared to all the tests in batch_others. We could split batch_others into multiple batches, but it actually seems better to just treat them as one big pool of tests and let pytest handle the parallelism on its own. If we need to split them across multiple nodes in the future, we could use pytest-shard or something else, instead of managing the batches ourselves. Merge test_neon_regress.py, test_pg_regress.py and test_isolation.py into one file, test_pg_regress.py. Seems more clear to group all pg_regress-based tests into one file, now that they would all be in the same directory.
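
To illustrate the "one big pool" idea (not part of this patch): with everything merged under test_runner/regress, a run can be parallelized simply by pointing pytest at that one directory. The sketch below assumes pytest-xdist is installed (it provides the `-n` option); the worker count and path are only examples.

```python
# Hedged sketch, not from this patch: run the merged regression pool and let
# pytest-xdist spread the tests across workers instead of hand-made batches.
import sys

import pytest

if __name__ == "__main__":
    # "-n auto" needs the pytest-xdist plugin; "auto" picks one worker per CPU.
    sys.exit(pytest.main(["-n", "auto", "test_runner/regress"]))
```
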
--- .../actions/run-python-test-set/action.yml | 2 +- .github/workflows/build_and_test.yml | 43 +---- pageserver/src/page_service.rs | 2 +- test_runner/README.md | 20 ++- .../batch_pg_regress/test_isolation.py | 50 ------ .../batch_pg_regress/test_neon_regress.py | 55 ------ .../batch_pg_regress/test_pg_regress.py | 56 ------ test_runner/neon_regress/README.md | 8 - .../test_ancestor_branch.py | 0 .../{batch_others => regress}/test_auth.py | 0 .../test_backpressure.py | 0 .../test_basebackup_error.py | 0 .../test_branch_and_gc.py | 0 .../test_branch_behind.py | 0 .../test_branching.py | 9 +- .../test_broken_timeline.py | 0 .../test_clog_truncate.py | 0 .../test_close_fds.py | 0 .../{batch_others => regress}/test_config.py | 0 .../test_crafted_wal_end.py | 0 .../test_createdropdb.py | 0 .../test_createuser.py | 0 .../test_fsm_truncate.py | 0 .../test_fullbackup.py | 0 .../test_gc_aggressive.py | 0 .../{batch_others => regress}/test_import.py | 0 .../test_large_schema.py | 0 .../test_lsn_mapping.py | 0 .../test_multixact.py | 0 .../test_neon_cli.py | 0 .../test_next_xid.py | 0 .../test_normal_work.py | 0 .../test_old_request_lsn.py | 0 .../test_pageserver_api.py | 0 .../test_pageserver_catchup.py | 0 .../test_pageserver_restart.py | 0 .../test_parallel_copy.py | 0 test_runner/regress/test_pg_regress.py | 159 ++++++++++++++++++ .../{batch_others => regress}/test_pitr_gc.py | 0 .../{batch_others => regress}/test_proxy.py | 0 .../test_read_validation.py | 0 .../test_readonly_node.py | 0 .../test_recovery.py | 0 .../test_remote_storage.py | 0 .../{batch_others => regress}/test_setup.py | 0 .../test_subxacts.py | 0 .../test_tenant_conf.py | 0 .../test_tenant_detach.py | 0 .../test_tenant_relocation.py | 0 .../test_tenant_tasks.py | 0 .../{batch_others => regress}/test_tenants.py | 0 .../test_tenants_with_remote_storage.py | 0 .../test_timeline_delete.py | 0 .../test_timeline_size.py | 0 .../test_twophase.py | 0 .../{batch_others => regress}/test_vm_bits.py | 0 .../test_wal_acceptor.py | 0 .../test_wal_acceptor_async.py | 0 .../test_wal_restore.py | 0 .../{neon_regress => sql_regress}/.gitignore | 0 test_runner/sql_regress/README.md | 13 ++ .../expected/.gitignore | 0 .../expected/neon-cid.out | 0 .../expected/neon-clog.out | 0 .../expected/neon-rel-truncate.out | 0 .../expected/neon-vacuum-full.out | 0 .../parallel_schedule | 0 .../sql/.gitignore | 0 .../sql/neon-cid.sql | 0 .../sql/neon-clog.sql | 0 .../sql/neon-rel-truncate.sql | 0 .../sql/neon-vacuum-full.sql | 0 72 files changed, 201 insertions(+), 216 deletions(-) delete mode 100644 test_runner/batch_pg_regress/test_isolation.py delete mode 100644 test_runner/batch_pg_regress/test_neon_regress.py delete mode 100644 test_runner/batch_pg_regress/test_pg_regress.py delete mode 100644 test_runner/neon_regress/README.md rename test_runner/{batch_others => regress}/test_ancestor_branch.py (100%) rename test_runner/{batch_others => regress}/test_auth.py (100%) rename test_runner/{batch_others => regress}/test_backpressure.py (100%) rename test_runner/{batch_others => regress}/test_basebackup_error.py (100%) rename test_runner/{batch_others => regress}/test_branch_and_gc.py (100%) rename test_runner/{batch_others => regress}/test_branch_behind.py (100%) rename test_runner/{batch_others => regress}/test_branching.py (91%) rename test_runner/{batch_others => regress}/test_broken_timeline.py (100%) rename test_runner/{batch_others => regress}/test_clog_truncate.py (100%) rename test_runner/{batch_others => regress}/test_close_fds.py (100%) rename 
test_runner/{batch_others => regress}/test_config.py (100%) rename test_runner/{batch_others => regress}/test_crafted_wal_end.py (100%) rename test_runner/{batch_others => regress}/test_createdropdb.py (100%) rename test_runner/{batch_others => regress}/test_createuser.py (100%) rename test_runner/{batch_others => regress}/test_fsm_truncate.py (100%) rename test_runner/{batch_others => regress}/test_fullbackup.py (100%) rename test_runner/{batch_others => regress}/test_gc_aggressive.py (100%) rename test_runner/{batch_others => regress}/test_import.py (100%) rename test_runner/{batch_others => regress}/test_large_schema.py (100%) rename test_runner/{batch_others => regress}/test_lsn_mapping.py (100%) rename test_runner/{batch_others => regress}/test_multixact.py (100%) rename test_runner/{batch_others => regress}/test_neon_cli.py (100%) rename test_runner/{batch_others => regress}/test_next_xid.py (100%) rename test_runner/{batch_others => regress}/test_normal_work.py (100%) rename test_runner/{batch_others => regress}/test_old_request_lsn.py (100%) rename test_runner/{batch_others => regress}/test_pageserver_api.py (100%) rename test_runner/{batch_others => regress}/test_pageserver_catchup.py (100%) rename test_runner/{batch_others => regress}/test_pageserver_restart.py (100%) rename test_runner/{batch_others => regress}/test_parallel_copy.py (100%) create mode 100644 test_runner/regress/test_pg_regress.py rename test_runner/{batch_others => regress}/test_pitr_gc.py (100%) rename test_runner/{batch_others => regress}/test_proxy.py (100%) rename test_runner/{batch_others => regress}/test_read_validation.py (100%) rename test_runner/{batch_others => regress}/test_readonly_node.py (100%) rename test_runner/{batch_others => regress}/test_recovery.py (100%) rename test_runner/{batch_others => regress}/test_remote_storage.py (100%) rename test_runner/{batch_others => regress}/test_setup.py (100%) rename test_runner/{batch_others => regress}/test_subxacts.py (100%) rename test_runner/{batch_others => regress}/test_tenant_conf.py (100%) rename test_runner/{batch_others => regress}/test_tenant_detach.py (100%) rename test_runner/{batch_others => regress}/test_tenant_relocation.py (100%) rename test_runner/{batch_others => regress}/test_tenant_tasks.py (100%) rename test_runner/{batch_others => regress}/test_tenants.py (100%) rename test_runner/{batch_others => regress}/test_tenants_with_remote_storage.py (100%) rename test_runner/{batch_others => regress}/test_timeline_delete.py (100%) rename test_runner/{batch_others => regress}/test_timeline_size.py (100%) rename test_runner/{batch_others => regress}/test_twophase.py (100%) rename test_runner/{batch_others => regress}/test_vm_bits.py (100%) rename test_runner/{batch_others => regress}/test_wal_acceptor.py (100%) rename test_runner/{batch_others => regress}/test_wal_acceptor_async.py (100%) rename test_runner/{batch_others => regress}/test_wal_restore.py (100%) rename test_runner/{neon_regress => sql_regress}/.gitignore (100%) create mode 100644 test_runner/sql_regress/README.md rename test_runner/{neon_regress => sql_regress}/expected/.gitignore (100%) rename test_runner/{neon_regress => sql_regress}/expected/neon-cid.out (100%) rename test_runner/{neon_regress => sql_regress}/expected/neon-clog.out (100%) rename test_runner/{neon_regress => sql_regress}/expected/neon-rel-truncate.out (100%) rename test_runner/{neon_regress => sql_regress}/expected/neon-vacuum-full.out (100%) rename test_runner/{neon_regress => sql_regress}/parallel_schedule 
(100%) rename test_runner/{neon_regress => sql_regress}/sql/.gitignore (100%) rename test_runner/{neon_regress => sql_regress}/sql/neon-cid.sql (100%) rename test_runner/{neon_regress => sql_regress}/sql/neon-clog.sql (100%) rename test_runner/{neon_regress => sql_regress}/sql/neon-rel-truncate.sql (100%) rename test_runner/{neon_regress => sql_regress}/sql/neon-vacuum-full.sql (100%) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 22447025cb..a4bcaff56d 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -149,7 +149,7 @@ runs: fi - name: Upload Allure results - if: ${{ always() && (inputs.test_selection == 'batch_others' || inputs.test_selection == 'batch_pg_regress') }} + if: ${{ always() && (inputs.test_selection == 'regress') }} uses: ./.github/actions/allure-report with: action: store diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bf6eb69930..8b1dc3a9c4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -206,7 +206,7 @@ jobs: if: matrix.build_type == 'debug' uses: ./.github/actions/save-coverage-data - pg_regress-tests: + regress-tests: runs-on: dev container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned @@ -224,42 +224,13 @@ jobs: submodules: true fetch-depth: 2 - - name: Pytest regress tests + - name: Pytest regression tests uses: ./.github/actions/run-python-test-set with: build_type: ${{ matrix.build_type }} rust_toolchain: ${{ matrix.rust_toolchain }} - test_selection: batch_pg_regress + test_selection: regress needs_postgres_source: true - - - name: Merge and upload coverage data - if: matrix.build_type == 'debug' - uses: ./.github/actions/save-coverage-data - - other-tests: - runs-on: dev - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned - options: --init - needs: [ build-neon ] - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - rust_toolchain: [ 1.58 ] - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 2 - - - name: Pytest other tests - uses: ./.github/actions/run-python-test-set - with: - build_type: ${{ matrix.build_type }} - rust_toolchain: ${{ matrix.rust_toolchain }} - test_selection: batch_others run_with_real_s3: true real_s3_bucket: ci-tests-s3 real_s3_region: us-west-2 @@ -307,7 +278,7 @@ jobs: container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - needs: [ other-tests, pg_regress-tests ] + needs: [ regress-tests ] if: always() strategy: fail-fast: false @@ -330,7 +301,7 @@ jobs: container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - needs: [ other-tests, pg_regress-tests ] + needs: [ regress-tests ] strategy: fail-fast: false matrix: @@ -587,7 +558,7 @@ jobs: #container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly - needs: [ push-docker-hub, calculate-deploy-targets, tag, other-tests, pg_regress-tests ] + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' @@ -642,7 +613,7 @@ jobs: runs-on: dev container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. - needs: [ push-docker-hub, calculate-deploy-targets, tag, other-tests, pg_regress-tests ] + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index fbc70f7690..d59a82d488 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1077,7 +1077,7 @@ impl postgres_backend::Handler for PageServerHandler { .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("do_gc ") { // Run GC immediately on given timeline. - // FIXME: This is just for tests. See test_runner/batch_others/test_gc.py. + // FIXME: This is just for tests. See test_runner/regress/test_gc.py. // This probably should require special authentication or a global flag to // enable, I don't think we want to or need to allow regular clients to invoke // GC. diff --git a/test_runner/README.md b/test_runner/README.md index 4b54c45175..c7ec361d65 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -15,12 +15,22 @@ Prerequisites: ### Test Organization -The tests are divided into a few batches, such that each batch takes roughly -the same amount of time. The batches can be run in parallel, to minimize total -runtime. Currently, there are only two batches: +Regression tests are in the 'regress' directory. They can be run in +parallel to minimize total runtime. Most regression test sets up their +environment with its own pageservers and safekeepers (but see +`TEST_SHARED_FIXTURES`). -- test_batch_pg_regress: Runs PostgreSQL regression tests -- test_others: All other tests +'pg_clients' contains tests for connecting with various client +libraries. Each client test uses a Dockerfile that pulls an image that +contains the client, and connects to PostgreSQL with it. The client +tests can be run against an existing PostgreSQL or Neon installation. + +'performance' contains performance regression tests. Each test +exercises a particular scenario or workload, and outputs +measurements. They should be run serially, to avoid the tests +interfering with the performance of each other. Some performance tests +set up their own Neon environment, while others can be run against an +existing PostgreSQL or Neon environment. ### Running the tests diff --git a/test_runner/batch_pg_regress/test_isolation.py b/test_runner/batch_pg_regress/test_isolation.py deleted file mode 100644 index 7127a069b0..0000000000 --- a/test_runner/batch_pg_regress/test_isolation.py +++ /dev/null @@ -1,50 +0,0 @@ -import os -from pathlib import Path - -import pytest -from fixtures.neon_fixtures import NeonEnv, base_dir, pg_distrib_dir - - -# The isolation tests run for a long time, especially in debug mode, -# so use a larger-than-default timeout. 
-@pytest.mark.timeout(1800) -def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): - env = neon_simple_env - - env.neon_cli.create_branch("test_isolation", "empty") - # Connect to postgres and create a database called "regression". - # isolation tests use prepared transactions, so enable them - pg = env.postgres.create_start("test_isolation", config_lines=["max_prepared_transactions=100"]) - pg.safe_psql("CREATE DATABASE isolation_regression") - - # Create some local directories for pg_isolation_regress to run in. - runpath = test_output_dir / "regress" - (runpath / "testtablespace").mkdir(parents=True) - - # Compute all the file locations that pg_isolation_regress will need. - build_path = os.path.join(pg_distrib_dir, "build/src/test/isolation") - src_path = os.path.join(base_dir, "vendor/postgres/src/test/isolation") - bindir = os.path.join(pg_distrib_dir, "bin") - schedule = os.path.join(src_path, "isolation_schedule") - pg_isolation_regress = os.path.join(build_path, "pg_isolation_regress") - - pg_isolation_regress_command = [ - pg_isolation_regress, - "--use-existing", - "--bindir={}".format(bindir), - "--dlpath={}".format(build_path), - "--inputdir={}".format(src_path), - "--schedule={}".format(schedule), - ] - - env_vars = { - "PGPORT": str(pg.default_options["port"]), - "PGUSER": pg.default_options["user"], - "PGHOST": pg.default_options["host"], - } - - # Run the command. - # We don't capture the output. It's not too chatty, and it always - # logs the exact same data to `regression.out` anyway. - with capsys.disabled(): - pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) diff --git a/test_runner/batch_pg_regress/test_neon_regress.py b/test_runner/batch_pg_regress/test_neon_regress.py deleted file mode 100644 index 4619647084..0000000000 --- a/test_runner/batch_pg_regress/test_neon_regress.py +++ /dev/null @@ -1,55 +0,0 @@ -import os -from pathlib import Path - -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, base_dir, check_restored_datadir_content, pg_distrib_dir - - -def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): - env = neon_simple_env - - env.neon_cli.create_branch("test_neon_regress", "empty") - # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start("test_neon_regress") - pg.safe_psql("CREATE DATABASE regression") - - # Create some local directories for pg_regress to run in. - runpath = test_output_dir / "regress" - (runpath / "testtablespace").mkdir(parents=True) - - # Compute all the file locations that pg_regress will need. - # This test runs neon specific tests - build_path = os.path.join(pg_distrib_dir, "build/src/test/regress") - src_path = os.path.join(base_dir, "test_runner/neon_regress") - bindir = os.path.join(pg_distrib_dir, "bin") - schedule = os.path.join(src_path, "parallel_schedule") - pg_regress = os.path.join(build_path, "pg_regress") - - pg_regress_command = [ - pg_regress, - "--use-existing", - "--bindir={}".format(bindir), - "--dlpath={}".format(build_path), - "--schedule={}".format(schedule), - "--inputdir={}".format(src_path), - ] - - log.info(pg_regress_command) - env_vars = { - "PGPORT": str(pg.default_options["port"]), - "PGUSER": pg.default_options["user"], - "PGHOST": pg.default_options["host"], - } - - # Run the command. - # We don't capture the output. It's not too chatty, and it always - # logs the exact same data to `regression.out` anyway. 
- with capsys.disabled(): - pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - - # checkpoint one more time to ensure that the lsn we get is the latest one - pg.safe_psql("CHECKPOINT") - pg.safe_psql("select pg_current_wal_insert_lsn()")[0][0] - - # Check that we restore the content of the datadir correctly - check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/batch_pg_regress/test_pg_regress.py b/test_runner/batch_pg_regress/test_pg_regress.py deleted file mode 100644 index 478dbf0a91..0000000000 --- a/test_runner/batch_pg_regress/test_pg_regress.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -import pathlib - -import pytest -from fixtures.neon_fixtures import NeonEnv, base_dir, check_restored_datadir_content, pg_distrib_dir - - -# The pg_regress tests run for a long time, especially in debug mode, -# so use a larger-than-default timeout. -@pytest.mark.timeout(1800) -def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: pathlib.Path, pg_bin, capsys): - env = neon_simple_env - - env.neon_cli.create_branch("test_pg_regress", "empty") - # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start("test_pg_regress") - pg.safe_psql("CREATE DATABASE regression") - - # Create some local directories for pg_regress to run in. - runpath = test_output_dir / "regress" - (runpath / "testtablespace").mkdir(parents=True) - - # Compute all the file locations that pg_regress will need. - build_path = os.path.join(pg_distrib_dir, "build/src/test/regress") - src_path = os.path.join(base_dir, "vendor/postgres/src/test/regress") - bindir = os.path.join(pg_distrib_dir, "bin") - schedule = os.path.join(src_path, "parallel_schedule") - pg_regress = os.path.join(build_path, "pg_regress") - - pg_regress_command = [ - pg_regress, - '--bindir=""', - "--use-existing", - "--bindir={}".format(bindir), - "--dlpath={}".format(build_path), - "--schedule={}".format(schedule), - "--inputdir={}".format(src_path), - ] - - env_vars = { - "PGPORT": str(pg.default_options["port"]), - "PGUSER": pg.default_options["user"], - "PGHOST": pg.default_options["host"], - } - - # Run the command. - # We don't capture the output. It's not too chatty, and it always - # logs the exact same data to `regression.out` anyway. - with capsys.disabled(): - pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - - # checkpoint one more time to ensure that the lsn we get is the latest one - pg.safe_psql("CHECKPOINT") - - # Check that we restore the content of the datadir correctly - check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/neon_regress/README.md b/test_runner/neon_regress/README.md deleted file mode 100644 index b23a55462e..0000000000 --- a/test_runner/neon_regress/README.md +++ /dev/null @@ -1,8 +0,0 @@ -To add a new SQL test - -- add sql script to run to neon_regress/sql/testname.sql -- add expected output to neon_regress/expected/testname.out -- add testname to parallel_schedule - -That's it. -For more complex tests see PostgreSQL regression tests. These works basically the same. 
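
The wrappers deleted above (and re-added below in the merged test_pg_regress.py) all rely on the same mechanism: pg_regress is pointed at an already-running, Neon-backed Postgres through `--use-existing` plus libpq environment variables, so it never runs initdb itself. A minimal sketch of that pattern, with placeholder arguments (the real tests derive the paths from pg_distrib_dir and base_dir):

```python
# Hedged sketch, not from this patch: the core pg_regress invocation pattern
# used by the pg_regress-based test wrappers.
import os
import subprocess


def run_pg_regress(pg_regress: str, bindir: str, schedule: str, inputdir: str,
                   host: str, port: int, user: str) -> None:
    # libpq reads PGHOST/PGPORT/PGUSER, so pg_regress connects to the existing
    # endpoint instead of setting up its own cluster.
    env = dict(os.environ, PGHOST=host, PGPORT=str(port), PGUSER=user)
    subprocess.run(
        [
            pg_regress,
            "--use-existing",          # reuse the running endpoint, skip initdb
            f"--bindir={bindir}",
            f"--schedule={schedule}",
            f"--inputdir={inputdir}",
        ],
        env=env,
        check=True,
    )
```
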
diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py similarity index 100% rename from test_runner/batch_others/test_ancestor_branch.py rename to test_runner/regress/test_ancestor_branch.py diff --git a/test_runner/batch_others/test_auth.py b/test_runner/regress/test_auth.py similarity index 100% rename from test_runner/batch_others/test_auth.py rename to test_runner/regress/test_auth.py diff --git a/test_runner/batch_others/test_backpressure.py b/test_runner/regress/test_backpressure.py similarity index 100% rename from test_runner/batch_others/test_backpressure.py rename to test_runner/regress/test_backpressure.py diff --git a/test_runner/batch_others/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py similarity index 100% rename from test_runner/batch_others/test_basebackup_error.py rename to test_runner/regress/test_basebackup_error.py diff --git a/test_runner/batch_others/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py similarity index 100% rename from test_runner/batch_others/test_branch_and_gc.py rename to test_runner/regress/test_branch_and_gc.py diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/regress/test_branch_behind.py similarity index 100% rename from test_runner/batch_others/test_branch_behind.py rename to test_runner/regress/test_branch_behind.py diff --git a/test_runner/batch_others/test_branching.py b/test_runner/regress/test_branching.py similarity index 91% rename from test_runner/batch_others/test_branching.py rename to test_runner/regress/test_branching.py index 2d08b07f82..0c1490294d 100644 --- a/test_runner/batch_others/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -62,10 +62,11 @@ def test_branching_with_pgbench( time.sleep(delay) log.info(f"Sleep {delay}s") - # If the number of concurrent threads exceeds a threshold, - # wait for all the threads to finish before spawning a new one. - # Because tests defined in `batch_others` are run concurrently in CI, - # we want to avoid the situation that one test exhausts resources for other tests. + # If the number of concurrent threads exceeds a threshold, wait for + # all the threads to finish before spawning a new one. Because the + # regression tests in this directory are run concurrently in CI, we + # want to avoid the situation that one test exhausts resources for + # other tests. 
if len(threads) >= thread_limit: for thread in threads: thread.join() diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py similarity index 100% rename from test_runner/batch_others/test_broken_timeline.py rename to test_runner/regress/test_broken_timeline.py diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py similarity index 100% rename from test_runner/batch_others/test_clog_truncate.py rename to test_runner/regress/test_clog_truncate.py diff --git a/test_runner/batch_others/test_close_fds.py b/test_runner/regress/test_close_fds.py similarity index 100% rename from test_runner/batch_others/test_close_fds.py rename to test_runner/regress/test_close_fds.py diff --git a/test_runner/batch_others/test_config.py b/test_runner/regress/test_config.py similarity index 100% rename from test_runner/batch_others/test_config.py rename to test_runner/regress/test_config.py diff --git a/test_runner/batch_others/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py similarity index 100% rename from test_runner/batch_others/test_crafted_wal_end.py rename to test_runner/regress/test_crafted_wal_end.py diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/regress/test_createdropdb.py similarity index 100% rename from test_runner/batch_others/test_createdropdb.py rename to test_runner/regress/test_createdropdb.py diff --git a/test_runner/batch_others/test_createuser.py b/test_runner/regress/test_createuser.py similarity index 100% rename from test_runner/batch_others/test_createuser.py rename to test_runner/regress/test_createuser.py diff --git a/test_runner/batch_others/test_fsm_truncate.py b/test_runner/regress/test_fsm_truncate.py similarity index 100% rename from test_runner/batch_others/test_fsm_truncate.py rename to test_runner/regress/test_fsm_truncate.py diff --git a/test_runner/batch_others/test_fullbackup.py b/test_runner/regress/test_fullbackup.py similarity index 100% rename from test_runner/batch_others/test_fullbackup.py rename to test_runner/regress/test_fullbackup.py diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py similarity index 100% rename from test_runner/batch_others/test_gc_aggressive.py rename to test_runner/regress/test_gc_aggressive.py diff --git a/test_runner/batch_others/test_import.py b/test_runner/regress/test_import.py similarity index 100% rename from test_runner/batch_others/test_import.py rename to test_runner/regress/test_import.py diff --git a/test_runner/batch_others/test_large_schema.py b/test_runner/regress/test_large_schema.py similarity index 100% rename from test_runner/batch_others/test_large_schema.py rename to test_runner/regress/test_large_schema.py diff --git a/test_runner/batch_others/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py similarity index 100% rename from test_runner/batch_others/test_lsn_mapping.py rename to test_runner/regress/test_lsn_mapping.py diff --git a/test_runner/batch_others/test_multixact.py b/test_runner/regress/test_multixact.py similarity index 100% rename from test_runner/batch_others/test_multixact.py rename to test_runner/regress/test_multixact.py diff --git a/test_runner/batch_others/test_neon_cli.py b/test_runner/regress/test_neon_cli.py similarity index 100% rename from test_runner/batch_others/test_neon_cli.py rename to test_runner/regress/test_neon_cli.py diff --git a/test_runner/batch_others/test_next_xid.py 
b/test_runner/regress/test_next_xid.py similarity index 100% rename from test_runner/batch_others/test_next_xid.py rename to test_runner/regress/test_next_xid.py diff --git a/test_runner/batch_others/test_normal_work.py b/test_runner/regress/test_normal_work.py similarity index 100% rename from test_runner/batch_others/test_normal_work.py rename to test_runner/regress/test_normal_work.py diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py similarity index 100% rename from test_runner/batch_others/test_old_request_lsn.py rename to test_runner/regress/test_old_request_lsn.py diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py similarity index 100% rename from test_runner/batch_others/test_pageserver_api.py rename to test_runner/regress/test_pageserver_api.py diff --git a/test_runner/batch_others/test_pageserver_catchup.py b/test_runner/regress/test_pageserver_catchup.py similarity index 100% rename from test_runner/batch_others/test_pageserver_catchup.py rename to test_runner/regress/test_pageserver_catchup.py diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py similarity index 100% rename from test_runner/batch_others/test_pageserver_restart.py rename to test_runner/regress/test_pageserver_restart.py diff --git a/test_runner/batch_others/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py similarity index 100% rename from test_runner/batch_others/test_parallel_copy.py rename to test_runner/regress/test_parallel_copy.py diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py new file mode 100644 index 0000000000..119528b8f9 --- /dev/null +++ b/test_runner/regress/test_pg_regress.py @@ -0,0 +1,159 @@ +# +# This file runs pg_regress-based tests. +# +import os +from pathlib import Path + +import pytest +from fixtures.neon_fixtures import NeonEnv, base_dir, check_restored_datadir_content, pg_distrib_dir + + +# Run the main PostgreSQL regression tests, in src/test/regress. +# +# This runs for a long time, especially in debug mode, so use a larger-than-default +# timeout. +@pytest.mark.timeout(1800) +def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): + env = neon_simple_env + + env.neon_cli.create_branch("test_pg_regress", "empty") + # Connect to postgres and create a database called "regression". + pg = env.postgres.create_start("test_pg_regress") + pg.safe_psql("CREATE DATABASE regression") + + # Create some local directories for pg_regress to run in. + runpath = test_output_dir / "regress" + (runpath / "testtablespace").mkdir(parents=True) + + # Compute all the file locations that pg_regress will need. + build_path = os.path.join(pg_distrib_dir, "build/src/test/regress") + src_path = os.path.join(base_dir, "vendor/postgres/src/test/regress") + bindir = os.path.join(pg_distrib_dir, "bin") + schedule = os.path.join(src_path, "parallel_schedule") + pg_regress = os.path.join(build_path, "pg_regress") + + pg_regress_command = [ + pg_regress, + '--bindir=""', + "--use-existing", + "--bindir={}".format(bindir), + "--dlpath={}".format(build_path), + "--schedule={}".format(schedule), + "--inputdir={}".format(src_path), + ] + + env_vars = { + "PGPORT": str(pg.default_options["port"]), + "PGUSER": pg.default_options["user"], + "PGHOST": pg.default_options["host"], + } + + # Run the command. + # We don't capture the output. 
It's not too chatty, and it always + # logs the exact same data to `regression.out` anyway. + with capsys.disabled(): + pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) + + # checkpoint one more time to ensure that the lsn we get is the latest one + pg.safe_psql("CHECKPOINT") + + # Check that we restore the content of the datadir correctly + check_restored_datadir_content(test_output_dir, env, pg) + + +# Run the PostgreSQL "isolation" tests, in src/test/isolation. +# +# This runs for a long time, especially in debug mode, so use a larger-than-default +# timeout. +@pytest.mark.timeout(1800) +def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): + env = neon_simple_env + + env.neon_cli.create_branch("test_isolation", "empty") + # Connect to postgres and create a database called "regression". + # isolation tests use prepared transactions, so enable them + pg = env.postgres.create_start("test_isolation", config_lines=["max_prepared_transactions=100"]) + pg.safe_psql("CREATE DATABASE isolation_regression") + + # Create some local directories for pg_isolation_regress to run in. + runpath = test_output_dir / "regress" + (runpath / "testtablespace").mkdir(parents=True) + + # Compute all the file locations that pg_isolation_regress will need. + build_path = os.path.join(pg_distrib_dir, "build/src/test/isolation") + src_path = os.path.join(base_dir, "vendor/postgres/src/test/isolation") + bindir = os.path.join(pg_distrib_dir, "bin") + schedule = os.path.join(src_path, "isolation_schedule") + pg_isolation_regress = os.path.join(build_path, "pg_isolation_regress") + + pg_isolation_regress_command = [ + pg_isolation_regress, + "--use-existing", + "--bindir={}".format(bindir), + "--dlpath={}".format(build_path), + "--inputdir={}".format(src_path), + "--schedule={}".format(schedule), + ] + + env_vars = { + "PGPORT": str(pg.default_options["port"]), + "PGUSER": pg.default_options["user"], + "PGHOST": pg.default_options["host"], + } + + # Run the command. + # We don't capture the output. It's not too chatty, and it always + # logs the exact same data to `regression.out` anyway. + with capsys.disabled(): + pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) + + +# Run extra Neon-specific pg_regress-based tests. The tests and their +# schedule file are in the sql_regress/ directory. +def test_sql_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): + env = neon_simple_env + + env.neon_cli.create_branch("test_sql_regress", "empty") + # Connect to postgres and create a database called "regression". + pg = env.postgres.create_start("test_sql_regress") + pg.safe_psql("CREATE DATABASE regression") + + # Create some local directories for pg_regress to run in. + runpath = test_output_dir / "regress" + (runpath / "testtablespace").mkdir(parents=True) + + # Compute all the file locations that pg_regress will need. 
+ # This test runs neon specific tests + build_path = os.path.join(pg_distrib_dir, "build/src/test/regress") + src_path = os.path.join(base_dir, "test_runner/sql_regress") + bindir = os.path.join(pg_distrib_dir, "bin") + schedule = os.path.join(src_path, "parallel_schedule") + pg_regress = os.path.join(build_path, "pg_regress") + + pg_regress_command = [ + pg_regress, + "--use-existing", + "--bindir={}".format(bindir), + "--dlpath={}".format(build_path), + "--schedule={}".format(schedule), + "--inputdir={}".format(src_path), + ] + + env_vars = { + "PGPORT": str(pg.default_options["port"]), + "PGUSER": pg.default_options["user"], + "PGHOST": pg.default_options["host"], + } + + # Run the command. + # We don't capture the output. It's not too chatty, and it always + # logs the exact same data to `regression.out` anyway. + with capsys.disabled(): + pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) + + # checkpoint one more time to ensure that the lsn we get is the latest one + pg.safe_psql("CHECKPOINT") + pg.safe_psql("select pg_current_wal_insert_lsn()")[0][0] + + # Check that we restore the content of the datadir correctly + check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py similarity index 100% rename from test_runner/batch_others/test_pitr_gc.py rename to test_runner/regress/test_pitr_gc.py diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/regress/test_proxy.py similarity index 100% rename from test_runner/batch_others/test_proxy.py rename to test_runner/regress/test_proxy.py diff --git a/test_runner/batch_others/test_read_validation.py b/test_runner/regress/test_read_validation.py similarity index 100% rename from test_runner/batch_others/test_read_validation.py rename to test_runner/regress/test_read_validation.py diff --git a/test_runner/batch_others/test_readonly_node.py b/test_runner/regress/test_readonly_node.py similarity index 100% rename from test_runner/batch_others/test_readonly_node.py rename to test_runner/regress/test_readonly_node.py diff --git a/test_runner/batch_others/test_recovery.py b/test_runner/regress/test_recovery.py similarity index 100% rename from test_runner/batch_others/test_recovery.py rename to test_runner/regress/test_recovery.py diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/regress/test_remote_storage.py similarity index 100% rename from test_runner/batch_others/test_remote_storage.py rename to test_runner/regress/test_remote_storage.py diff --git a/test_runner/batch_others/test_setup.py b/test_runner/regress/test_setup.py similarity index 100% rename from test_runner/batch_others/test_setup.py rename to test_runner/regress/test_setup.py diff --git a/test_runner/batch_others/test_subxacts.py b/test_runner/regress/test_subxacts.py similarity index 100% rename from test_runner/batch_others/test_subxacts.py rename to test_runner/regress/test_subxacts.py diff --git a/test_runner/batch_others/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py similarity index 100% rename from test_runner/batch_others/test_tenant_conf.py rename to test_runner/regress/test_tenant_conf.py diff --git a/test_runner/batch_others/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py similarity index 100% rename from test_runner/batch_others/test_tenant_detach.py rename to test_runner/regress/test_tenant_detach.py diff --git a/test_runner/batch_others/test_tenant_relocation.py 
b/test_runner/regress/test_tenant_relocation.py similarity index 100% rename from test_runner/batch_others/test_tenant_relocation.py rename to test_runner/regress/test_tenant_relocation.py diff --git a/test_runner/batch_others/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py similarity index 100% rename from test_runner/batch_others/test_tenant_tasks.py rename to test_runner/regress/test_tenant_tasks.py diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/regress/test_tenants.py similarity index 100% rename from test_runner/batch_others/test_tenants.py rename to test_runner/regress/test_tenants.py diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py similarity index 100% rename from test_runner/batch_others/test_tenants_with_remote_storage.py rename to test_runner/regress/test_tenants_with_remote_storage.py diff --git a/test_runner/batch_others/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py similarity index 100% rename from test_runner/batch_others/test_timeline_delete.py rename to test_runner/regress/test_timeline_delete.py diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/regress/test_timeline_size.py similarity index 100% rename from test_runner/batch_others/test_timeline_size.py rename to test_runner/regress/test_timeline_size.py diff --git a/test_runner/batch_others/test_twophase.py b/test_runner/regress/test_twophase.py similarity index 100% rename from test_runner/batch_others/test_twophase.py rename to test_runner/regress/test_twophase.py diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/regress/test_vm_bits.py similarity index 100% rename from test_runner/batch_others/test_vm_bits.py rename to test_runner/regress/test_vm_bits.py diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py similarity index 100% rename from test_runner/batch_others/test_wal_acceptor.py rename to test_runner/regress/test_wal_acceptor.py diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py similarity index 100% rename from test_runner/batch_others/test_wal_acceptor_async.py rename to test_runner/regress/test_wal_acceptor_async.py diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/regress/test_wal_restore.py similarity index 100% rename from test_runner/batch_others/test_wal_restore.py rename to test_runner/regress/test_wal_restore.py diff --git a/test_runner/neon_regress/.gitignore b/test_runner/sql_regress/.gitignore similarity index 100% rename from test_runner/neon_regress/.gitignore rename to test_runner/sql_regress/.gitignore diff --git a/test_runner/sql_regress/README.md b/test_runner/sql_regress/README.md new file mode 100644 index 0000000000..1ae8aaf61a --- /dev/null +++ b/test_runner/sql_regress/README.md @@ -0,0 +1,13 @@ +Simple tests that only need a PostgreSQL connection to run. +These are run by the regress/test_pg_regress.py test, which uses +the PostgreSQL pg_regress utility. + +To add a new SQL test: + +- add sql script to run to neon_regress/sql/testname.sql +- add expected output to neon_regress/expected/testname.out +- add testname to parallel_schedule + +That's it. +For more complex tests see PostgreSQL regression tests in src/test/regress. +These work basically the same. 
diff --git a/test_runner/neon_regress/expected/.gitignore b/test_runner/sql_regress/expected/.gitignore similarity index 100% rename from test_runner/neon_regress/expected/.gitignore rename to test_runner/sql_regress/expected/.gitignore diff --git a/test_runner/neon_regress/expected/neon-cid.out b/test_runner/sql_regress/expected/neon-cid.out similarity index 100% rename from test_runner/neon_regress/expected/neon-cid.out rename to test_runner/sql_regress/expected/neon-cid.out diff --git a/test_runner/neon_regress/expected/neon-clog.out b/test_runner/sql_regress/expected/neon-clog.out similarity index 100% rename from test_runner/neon_regress/expected/neon-clog.out rename to test_runner/sql_regress/expected/neon-clog.out diff --git a/test_runner/neon_regress/expected/neon-rel-truncate.out b/test_runner/sql_regress/expected/neon-rel-truncate.out similarity index 100% rename from test_runner/neon_regress/expected/neon-rel-truncate.out rename to test_runner/sql_regress/expected/neon-rel-truncate.out diff --git a/test_runner/neon_regress/expected/neon-vacuum-full.out b/test_runner/sql_regress/expected/neon-vacuum-full.out similarity index 100% rename from test_runner/neon_regress/expected/neon-vacuum-full.out rename to test_runner/sql_regress/expected/neon-vacuum-full.out diff --git a/test_runner/neon_regress/parallel_schedule b/test_runner/sql_regress/parallel_schedule similarity index 100% rename from test_runner/neon_regress/parallel_schedule rename to test_runner/sql_regress/parallel_schedule diff --git a/test_runner/neon_regress/sql/.gitignore b/test_runner/sql_regress/sql/.gitignore similarity index 100% rename from test_runner/neon_regress/sql/.gitignore rename to test_runner/sql_regress/sql/.gitignore diff --git a/test_runner/neon_regress/sql/neon-cid.sql b/test_runner/sql_regress/sql/neon-cid.sql similarity index 100% rename from test_runner/neon_regress/sql/neon-cid.sql rename to test_runner/sql_regress/sql/neon-cid.sql diff --git a/test_runner/neon_regress/sql/neon-clog.sql b/test_runner/sql_regress/sql/neon-clog.sql similarity index 100% rename from test_runner/neon_regress/sql/neon-clog.sql rename to test_runner/sql_regress/sql/neon-clog.sql diff --git a/test_runner/neon_regress/sql/neon-rel-truncate.sql b/test_runner/sql_regress/sql/neon-rel-truncate.sql similarity index 100% rename from test_runner/neon_regress/sql/neon-rel-truncate.sql rename to test_runner/sql_regress/sql/neon-rel-truncate.sql diff --git a/test_runner/neon_regress/sql/neon-vacuum-full.sql b/test_runner/sql_regress/sql/neon-vacuum-full.sql similarity index 100% rename from test_runner/neon_regress/sql/neon-vacuum-full.sql rename to test_runner/sql_regress/sql/neon-vacuum-full.sql From f09bd6bc887c8370f6f82b4e942b504213eb8164 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 30 Aug 2022 18:44:06 +0300 Subject: [PATCH 0706/1022] Fix size checks in the "local" remote storage implementation. The code correctly detected too short and too long inputs, but the error message was bogus for the case the input stream was too long: Error: Provided stream has actual size 5 fthat is smaller than the given stream size 4 That check was only supposed to check for too small inputs, but it in fact caught too long inputs too. That was good, because the check below that that was supposed to check for too long inputs was in fact broken, and never did anything. It tried to read input a buffer of size 0, to check if there is any extra data, but reading to a zero-sized buffer always returns 0. 
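The corrected pattern, sketched here as a standalone helper (the name copy_exact_size and the free-standing function are illustrative only, not code from this patch): cap the copy at the declared size, then probe for one extra byte with a non-empty buffer — reading into a zero-sized buffer always returns Ok(0), so it can never detect trailing data.

    // Minimal sketch of the corrected size check; names are illustrative.
    use anyhow::{bail, ensure, Result};
    use tokio::io::{self, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};

    async fn copy_exact_size<R, W>(from: R, expected: u64, mut destination: W) -> Result<()>
    where
        R: AsyncRead + Unpin,
        W: AsyncWrite + Unpin,
    {
        // `take` caps the copy at `expected` bytes, so an oversized stream
        // cannot write extra data into the destination.
        let mut limited = from.take(expected);
        let bytes_read = io::copy(&mut limited, &mut destination).await?;
        if bytes_read < expected {
            bail!("Provided stream was shorter than expected: {bytes_read} vs {expected} bytes");
        }

        // Probe with a one-byte buffer; a zero-sized buffer would always read 0
        // bytes and hide any extra data.
        let mut probe = [0u8; 1];
        let mut rest = limited.into_inner();
        let extra_read = rest.read(&mut probe).await?;
        ensure!(
            extra_read == 0,
            "Provided stream was larger than expected: expected {expected} bytes"
        );

        destination.flush().await?;
        Ok(())
    }

The patch below applies these same steps inline in the LocalFs upload path, and the new upload_file_negatives test exercises the too-short, too-long, and exact-size cases.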
--- libs/remote_storage/src/local_fs.rs | 49 ++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 07b04084b9..a65d0887af 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -150,8 +150,7 @@ impl RemoteStorage for LocalFs { ); let from_size_bytes = from_size_bytes as u64; - // Require to read 1 byte more than the expected to check later, that the stream and its size match. - let mut buffer_to_read = from.take(from_size_bytes + 1); + let mut buffer_to_read = from.take(from_size_bytes); let bytes_read = io::copy(&mut buffer_to_read, &mut destination) .await @@ -162,17 +161,15 @@ impl RemoteStorage for LocalFs { ) })?; + if bytes_read < from_size_bytes { + bail!("Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes"); + } + // Check if there is any extra data after the given size. + let mut from = buffer_to_read.into_inner(); + let extra_read = from.read(&mut [1]).await?; ensure!( - bytes_read == from_size_bytes, - "Provided stream has actual size {} fthat is smaller than the given stream size {}", - bytes_read, - from_size_bytes - ); - - ensure!( - buffer_to_read.read(&mut [0]).await? == 0, - "Provided stream has bigger size than the given stream size {}", - from_size_bytes + extra_read == 0, + "Provided stream was larger than expected: expected {from_size_bytes} bytes", ); destination.flush().await.with_context(|| { @@ -609,6 +606,34 @@ mod fs_tests { Ok(()) } + #[tokio::test] + async fn upload_file_negatives() -> anyhow::Result<()> { + let storage = create_storage()?; + + let id = storage.remote_object_id(&storage.working_directory.join("dummy"))?; + let content = std::io::Cursor::new(b"12345"); + + // Check that you get an error if the size parameter doesn't match the actual + // size of the stream. + storage + .upload(content.clone(), 0, &id, None) + .await + .expect_err("upload with zero size succeeded"); + storage + .upload(content.clone(), 4, &id, None) + .await + .expect_err("upload with too short size succeeded"); + storage + .upload(content.clone(), 6, &id, None) + .await + .expect_err("upload with too large size succeeded"); + + // Correct size is 5, this should succeed. 
+ storage.upload(content, 5, &id, None).await?; + + Ok(()) + } + fn create_storage() -> anyhow::Result { LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned()) } From a4803233bbf825449c2481aa78bf37c929cc0411 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 30 Aug 2022 22:19:52 +0300 Subject: [PATCH 0707/1022] Remove `RemoteObjectName` and many remote storage generics in pageserver (#2360) --- libs/remote_storage/src/lib.rs | 17 +- libs/remote_storage/src/local_fs.rs | 18 +- libs/remote_storage/src/s3_bucket.rs | 44 ++-- pageserver/src/bin/pageserver.rs | 14 +- pageserver/src/http/routes.rs | 28 +-- pageserver/src/storage_sync.rs | 118 ++++------ pageserver/src/storage_sync/delete.rs | 86 ++++---- pageserver/src/storage_sync/download.rs | 276 ++++++++++++++---------- pageserver/src/storage_sync/upload.rs | 152 +++++++------ pageserver/src/tenant_mgr.rs | 8 +- 10 files changed, 392 insertions(+), 369 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 07f8cb08aa..d5ad2f8633 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -42,19 +42,13 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; -pub trait RemoteObjectName { - // Needed to retrieve last component for RemoteObjectId. - // In other words a file name - fn object_name(&self) -> Option<&str>; -} - /// Storage (potentially remote) API to manage its state. /// This storage tries to be unaware of any layered repository context, /// providing basic CRUD operations for storage files. #[async_trait::async_trait] pub trait RemoteStorage: Send + Sync { /// A way to uniquely reference a file in the remote storage. - type RemoteObjectId: RemoteObjectName; + type RemoteObjectId; /// Attempts to derive the storage path out of the local path, if the latter is correct. fn remote_object_id(&self, local_path: &Path) -> anyhow::Result; @@ -71,7 +65,7 @@ pub trait RemoteStorage: Send + Sync { /// so this method doesnt need to. async fn list_prefixes( &self, - prefix: Option, + prefix: Option<&Self::RemoteObjectId>, ) -> anyhow::Result>; /// Streams the local file contents into remote into the remote storage entry. @@ -163,6 +157,13 @@ impl GenericRemoteStorage { } } } + + pub fn as_local(&self) -> Option<&LocalFs> { + match self { + Self::Local(local_fs) => Some(local_fs), + _ => None, + } + } } /// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index a65d0887af..ddf6c01759 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -5,7 +5,6 @@ //! volume is mounted to the local FS. 
use std::{ - borrow::Cow, future::Future, path::{Path, PathBuf}, pin::Pin, @@ -18,16 +17,10 @@ use tokio::{ }; use tracing::*; -use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectName}; +use crate::{path_with_suffix_extension, Download, DownloadError}; use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; -impl RemoteObjectName for PathBuf { - fn object_name(&self) -> Option<&str> { - self.file_stem().and_then(|n| n.to_str()) - } -} - pub struct LocalFs { working_directory: PathBuf, storage_root: PathBuf, @@ -113,13 +106,10 @@ impl RemoteStorage for LocalFs { async fn list_prefixes( &self, - prefix: Option, + prefix: Option<&Self::RemoteObjectId>, ) -> anyhow::Result> { - let path = match prefix { - Some(prefix) => Cow::Owned(prefix), - None => Cow::Borrowed(&self.storage_root), - }; - get_all_files(path.as_ref(), false).await + let path = prefix.unwrap_or(&self.storage_root); + get_all_files(path, false).await } async fn upload( diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 1b241fe4ed..db31200c36 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -19,9 +19,7 @@ use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; use tracing::debug; -use crate::{ - strip_path_prefix, Download, DownloadError, RemoteObjectName, RemoteStorage, S3Config, -}; +use crate::{strip_path_prefix, Download, DownloadError, RemoteStorage, S3Config}; use super::StorageMetadata; @@ -96,6 +94,23 @@ const S3_PREFIX_SEPARATOR: char = '/'; pub struct S3ObjectKey(String); impl S3ObjectKey { + /// Turn a/b/c or a/b/c/ into c + pub fn object_name(&self) -> Option<&str> { + // corner case, char::to_string is not const, thats why this is more verbose than it needs to be + // see https://github.com/rust-lang/rust/issues/88674 + if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR { + return None; + } + + if self.0.ends_with(S3_PREFIX_SEPARATOR) { + self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1) + } else { + self.0 + .rsplit_once(S3_PREFIX_SEPARATOR) + .map(|(_, last)| last) + } + } + fn key(&self) -> &str { &self.0 } @@ -119,25 +134,6 @@ impl S3ObjectKey { } } -impl RemoteObjectName for S3ObjectKey { - /// Turn a/b/c or a/b/c/ into c - fn object_name(&self) -> Option<&str> { - // corner case, char::to_string is not const, thats why this is more verbose than it needs to be - // see https://github.com/rust-lang/rust/issues/88674 - if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR { - return None; - } - - if self.0.ends_with(S3_PREFIX_SEPARATOR) { - self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1) - } else { - self.0 - .rsplit_once(S3_PREFIX_SEPARATOR) - .map(|(_, last)| last) - } - } -} - /// AWS S3 storage. pub struct S3Bucket { workdir: PathBuf, @@ -316,11 +312,11 @@ impl RemoteStorage for S3Bucket { /// Note: it wont include empty "directories" async fn list_prefixes( &self, - prefix: Option, + prefix: Option<&Self::RemoteObjectId>, ) -> anyhow::Result> { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix - .map(|p| p.0) + .map(|p| p.0.clone()) .or_else(|| self.prefix_in_bucket.clone()) .map(|mut p| { // required to end with a separator diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 1a13147f42..7a33a548e7 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,6 +1,7 @@ //! Main entry point for the Page Server executable. 
-use std::{env, ops::ControlFlow, path::Path, str::FromStr}; +use remote_storage::GenericRemoteStorage; +use std::{env, ops::ControlFlow, path::Path, str::FromStr, sync::Arc}; use tracing::*; use anyhow::{bail, Context, Result}; @@ -298,7 +299,14 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() }; info!("Using auth: {:#?}", conf.auth_type); - let remote_index = tenant_mgr::init_tenant_mgr(conf)?; + let remote_storage = conf + .remote_storage_config + .as_ref() + .map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config)) + .transpose() + .context("Failed to init generic remote storage")? + .map(Arc::new); + let remote_index = tenant_mgr::init_tenant_mgr(conf, remote_storage.as_ref().map(Arc::clone))?; // Spawn a new thread for the http endpoint // bind before launching separate thread so the error reported before startup exits @@ -310,7 +318,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() "http_endpoint_thread", true, move || { - let router = http::make_router(conf, auth_cloned, remote_index)?; + let router = http::make_router(conf, auth_cloned, remote_index, remote_storage)?; endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher()) }, )?; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2bb181dd9a..ef18129504 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -35,7 +35,7 @@ struct State { auth: Option>, remote_index: RemoteIndex, allowlist_routes: Vec, - remote_storage: Option, + remote_storage: Option>, } impl State { @@ -43,20 +43,12 @@ impl State { conf: &'static PageServerConf, auth: Option>, remote_index: RemoteIndex, + remote_storage: Option>, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() .map(|v| v.parse().unwrap()) .collect::>(); - // Note that this remote storage is created separately from the main one in the sync_loop. - // It's fine since it's stateless and some code duplication saves us from bloating the code around with generics. - let remote_storage = conf - .remote_storage_config - .as_ref() - .map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config)) - .transpose() - .context("Failed to init generic remote storage")?; - Ok(Self { conf, auth, @@ -448,16 +440,8 @@ async fn gather_tenant_timelines_index_parts( tenant_id: ZTenantId, ) -> anyhow::Result>> { let index_parts = match state.remote_storage.as_ref() { - Some(GenericRemoteStorage::Local(local_storage)) => { - storage_sync::gather_tenant_timelines_index_parts(state.conf, local_storage, tenant_id) - .await - } - // FIXME here s3 storage contains its own limits, that are separate from sync storage thread ones - // because it is a different instance. We can move this limit to some global static - // or use one instance everywhere. 
- Some(GenericRemoteStorage::S3(s3_storage)) => { - storage_sync::gather_tenant_timelines_index_parts(state.conf, s3_storage, tenant_id) - .await + Some(storage) => { + storage_sync::gather_tenant_timelines_index_parts(state.conf, storage, tenant_id).await } None => return Ok(None), } @@ -714,6 +698,7 @@ pub fn make_router( conf: &'static PageServerConf, auth: Option>, remote_index: RemoteIndex, + remote_storage: Option>, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); @@ -730,7 +715,8 @@ pub fn make_router( Ok(router .data(Arc::new( - State::new(conf, auth, remote_index).context("Failed to initialize router state")?, + State::new(conf, auth, remote_index, remote_storage) + .context("Failed to initialize router state")?, )) .get("/v1/status", status_handler) .get("/v1/tenant", tenant_list_handler) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 52d544b28c..a52cde7286 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -156,7 +156,7 @@ use std::{ use anyhow::{anyhow, bail, Context}; use futures::stream::{FuturesUnordered, StreamExt}; use once_cell::sync::{Lazy, OnceCell}; -use remote_storage::{GenericRemoteStorage, RemoteStorage}; +use remote_storage::GenericRemoteStorage; use tokio::{ fs, runtime::Runtime, @@ -253,36 +253,20 @@ pub struct SyncStartupData { /// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states. pub fn start_local_timeline_sync( config: &'static PageServerConf, + storage: Option>, ) -> anyhow::Result { let local_timeline_files = local_tenant_timeline_files(config) .context("Failed to collect local tenant timeline files")?; - match config.remote_storage_config.as_ref() { - Some(storage_config) => { - match GenericRemoteStorage::new(config.workdir.clone(), storage_config) - .context("Failed to init the generic remote storage")? - { - GenericRemoteStorage::Local(local_fs_storage) => { - storage_sync::spawn_storage_sync_thread( - config, - local_timeline_files, - local_fs_storage, - storage_config.max_concurrent_syncs, - storage_config.max_sync_errors, - ) - } - GenericRemoteStorage::S3(s3_bucket_storage) => { - storage_sync::spawn_storage_sync_thread( - config, - local_timeline_files, - s3_bucket_storage, - storage_config.max_concurrent_syncs, - storage_config.max_sync_errors, - ) - } - } - .context("Failed to spawn the storage sync thread") - } + match storage.zip(config.remote_storage_config.as_ref()) { + Some((storage, storage_config)) => storage_sync::spawn_storage_sync_thread( + config, + local_timeline_files, + storage, + storage_config.max_concurrent_syncs, + storage_config.max_sync_errors, + ) + .context("Failed to spawn the storage sync thread"), None => { info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); @@ -810,17 +794,13 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { /// Launch a thread to perform remote storage sync tasks. /// See module docs for loop step description. 
-pub(super) fn spawn_storage_sync_thread( +pub(super) fn spawn_storage_sync_thread( conf: &'static PageServerConf, local_timeline_files: HashMap)>, - storage: S, + storage: Arc, max_concurrent_timelines_sync: NonZeroUsize, max_sync_errors: NonZeroU32, -) -> anyhow::Result -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> anyhow::Result { let sync_queue = SyncQueue::new(max_concurrent_timelines_sync); SYNC_QUEUE .set(sync_queue) @@ -860,7 +840,7 @@ where storage_sync_loop( runtime, conf, - (Arc::new(storage), remote_index_clone, sync_queue), + (storage, remote_index_clone, sync_queue), max_sync_errors, ); Ok(()) @@ -873,15 +853,12 @@ where }) } -fn storage_sync_loop( +fn storage_sync_loop( runtime: Runtime, conf: &'static PageServerConf, - (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), + (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, -) where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) { info!("Starting remote storage sync loop"); loop { let loop_storage = Arc::clone(&storage); @@ -983,18 +960,14 @@ enum UploadStatus { Nothing, } -async fn process_batches( +async fn process_batches( conf: &'static PageServerConf, max_sync_errors: NonZeroU32, - storage: Arc, + storage: Arc, index: &RemoteIndex, batched_tasks: HashMap, sync_queue: &SyncQueue, -) -> HashSet -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> HashSet { let mut sync_results = batched_tasks .into_iter() .map(|(sync_id, batch)| { @@ -1030,17 +1003,13 @@ where downloaded_timelines } -async fn process_sync_task_batch( +async fn process_sync_task_batch( conf: &'static PageServerConf, - (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), + (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, sync_id: ZTenantTimelineId, batch: SyncTaskBatch, -) -> DownloadStatus -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> DownloadStatus { let sync_start = Instant::now(); let current_remote_timeline = { index.read().await.timeline_entry(&sync_id).cloned() }; @@ -1175,19 +1144,15 @@ where download_status } -async fn download_timeline_data( +async fn download_timeline_data( conf: &'static PageServerConf, - (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue), + (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), current_remote_timeline: Option<&RemoteTimeline>, sync_id: ZTenantTimelineId, new_download_data: SyncData, sync_start: Instant, task_name: &str, -) -> DownloadStatus -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> DownloadStatus { match download_timeline_layers( conf, storage, @@ -1298,17 +1263,14 @@ async fn update_local_metadata( Ok(()) } -async fn delete_timeline_data( +async fn delete_timeline_data( conf: &'static PageServerConf, - (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue), + (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), sync_id: ZTenantTimelineId, mut new_delete_data: SyncData, sync_start: Instant, task_name: &str, -) where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) { let timeline_delete = &mut new_delete_data.data; if !timeline_delete.deletion_registered { @@ -1343,19 +1305,15 @@ async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result( +async fn 
upload_timeline_data( conf: &'static PageServerConf, - (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue), + (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), current_remote_timeline: Option<&RemoteTimeline>, sync_id: ZTenantTimelineId, new_upload_data: SyncData, sync_start: Instant, task_name: &str, -) -> UploadStatus -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> UploadStatus { let mut uploaded_data = match upload_timeline_layers( storage, sync_queue, @@ -1406,17 +1364,13 @@ enum RemoteDataUpdate<'a> { Delete(&'a HashSet), } -async fn update_remote_data( +async fn update_remote_data( conf: &'static PageServerConf, - storage: &S, + storage: &GenericRemoteStorage, index: &RemoteIndex, sync_id: ZTenantTimelineId, update: RemoteDataUpdate<'_>, -) -> anyhow::Result<()> -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> anyhow::Result<()> { let updated_remote_timeline = { let mut index_accessor = index.write().await; diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 2e39ed073f..d80a082d0c 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -1,27 +1,25 @@ //! Timeline synchronization logic to delete a bulk of timeline's remote files from the remote storage. +use std::path::Path; + use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; use tracing::{debug, error, info}; use crate::storage_sync::{SyncQueue, SyncTask}; -use remote_storage::RemoteStorage; +use remote_storage::{GenericRemoteStorage, RemoteStorage}; use utils::zid::ZTenantTimelineId; use super::{LayersDeletion, SyncData}; /// Attempts to remove the timleline layers from the remote storage. /// If the task had not adjusted the metadata before, the deletion will fail. 
-pub(super) async fn delete_timeline_layers<'a, P, S>( - storage: &'a S, +pub(super) async fn delete_timeline_layers<'a>( + storage: &'a GenericRemoteStorage, sync_queue: &SyncQueue, sync_id: ZTenantTimelineId, mut delete_data: SyncData, -) -> bool -where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> bool { if !delete_data.data.deletion_registered { error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing"); delete_data.retries += 1; @@ -45,25 +43,14 @@ where let mut delete_tasks = layers_to_delete .into_iter() .map(|local_layer_path| async { - let storage_path = - match storage - .remote_object_id(&local_layer_path) - .with_context(|| { - format!( - "Failed to get the layer storage path for local path '{}'", - local_layer_path.display() - ) - }) { - Ok(path) => path, - Err(e) => return Err((e, local_layer_path)), - }; - - match storage.delete(&storage_path).await.with_context(|| { - format!( - "Failed to delete remote layer from storage at '{:?}'", - storage_path - ) - }) { + match match storage { + GenericRemoteStorage::Local(storage) => { + remove_storage_object(storage, &local_layer_path).await + } + GenericRemoteStorage::S3(storage) => { + remove_storage_object(storage, &local_layer_path).await + } + } { Ok(()) => Ok(local_layer_path), Err(e) => Err((e, local_layer_path)), } @@ -101,6 +88,28 @@ where errored } +async fn remove_storage_object(storage: &S, local_layer_path: &Path) -> anyhow::Result<()> +where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let storage_path = storage + .remote_object_id(local_layer_path) + .with_context(|| { + format!( + "Failed to get the layer storage path for local path '{}'", + local_layer_path.display() + ) + })?; + + storage.delete(&storage_path).await.with_context(|| { + format!( + "Failed to delete remote layer from storage at '{:?}'", + storage_path + ) + }) +} + #[cfg(test)] mod tests { use std::{collections::HashSet, num::NonZeroUsize}; @@ -114,7 +123,7 @@ mod tests { layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::test_utils::{create_local_timeline, dummy_metadata}, }; - use remote_storage::LocalFs; + use remote_storage::{LocalFs, RemoteStorage}; use super::*; @@ -123,10 +132,10 @@ mod tests { let harness = RepoHarness::create("delete_timeline_negative")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new( + let storage = GenericRemoteStorage::Local(LocalFs::new( tempdir()?.path().to_path_buf(), harness.conf.workdir.clone(), - )?; + )?); let deleted = delete_timeline_layers( &storage, @@ -158,17 +167,20 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "c", "d"]; - let storage = LocalFs::new( + let storage = GenericRemoteStorage::Local(LocalFs::new( tempdir()?.path().to_path_buf(), harness.conf.workdir.clone(), - )?; + )?); + + let local_storage = storage.as_local().unwrap(); + let current_retries = 3; let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); let timeline_upload = create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; for local_path in timeline_upload.layers_to_upload { - let remote_path = storage.remote_object_id(&local_path)?; + let remote_path = local_storage.remote_object_id(&local_path)?; let 
remote_parent_dir = remote_path.parent().unwrap(); if !remote_parent_dir.exists() { fs::create_dir_all(&remote_parent_dir).await?; @@ -176,11 +188,11 @@ mod tests { fs::copy(&local_path, &remote_path).await?; } assert_eq!( - storage + local_storage .list() .await? .into_iter() - .map(|remote_path| storage.local_path(&remote_path).unwrap()) + .map(|remote_path| local_storage.local_path(&remote_path).unwrap()) .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) .sorted() .collect::>(), @@ -213,11 +225,11 @@ mod tests { assert!(deleted, "Should be able to delete timeline files"); assert_eq!( - storage + local_storage .list() .await? .into_iter() - .map(|remote_path| storage.local_path(&remote_path).unwrap()) + .map(|remote_path| local_storage.local_path(&remote_path).unwrap()) .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) .sorted() .collect::>(), diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 98c45bf9af..8e6aa47c88 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -9,7 +9,9 @@ use std::{ use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use remote_storage::{path_with_suffix_extension, DownloadError, RemoteObjectName, RemoteStorage}; +use remote_storage::{ + path_with_suffix_extension, Download, DownloadError, GenericRemoteStorage, RemoteStorage, +}; use tokio::{ fs, io::{self, AsyncWriteExt}, @@ -62,15 +64,11 @@ impl Default for TenantIndexParts { } } -pub async fn download_index_parts( +pub async fn download_index_parts( conf: &'static PageServerConf, - storage: &S, + storage: &GenericRemoteStorage, keys: HashSet, -) -> HashMap -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> HashMap { let mut index_parts: HashMap = HashMap::new(); let mut part_downloads = keys @@ -114,60 +112,17 @@ where /// Note: The function is rather expensive from s3 access point of view, it will execute ceil(N/1000) + N requests. /// At least one request to obtain a list of tenant timelines (more requests is there are more than 1000 timelines). /// And then will attempt to download all index files that belong to these timelines. 
-pub async fn gather_tenant_timelines_index_parts( +pub async fn gather_tenant_timelines_index_parts( conf: &'static PageServerConf, - storage: &S, + storage: &GenericRemoteStorage, tenant_id: ZTenantId, -) -> anyhow::Result> -where - P: RemoteObjectName + Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> anyhow::Result> { let tenant_path = conf.timelines_path(&tenant_id); - let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| { - format!( - "Failed to get tenant storage path for local path '{}'", - tenant_path.display() - ) - })?; - - let timelines = storage - .list_prefixes(Some(tenant_storage_path)) + let timeline_sync_ids = get_timeline_sync_ids(storage, &tenant_path, tenant_id) .await - .with_context(|| { - format!( - "Failed to list tenant storage path to get remote timelines to download: {}", - tenant_id - ) - })?; + .with_context(|| format!("Failed to list timeline sync ids for tenat {tenant_id}"))?; - if timelines.is_empty() { - anyhow::bail!( - "no timelines found on the remote storage for tenant {}", - tenant_id - ) - } - - let mut sync_ids = HashSet::new(); - - for timeline_remote_storage_key in timelines { - let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { - anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") - })?; - - let timeline_id: ZTimelineId = object_name - .parse() - .with_context(|| { - format!("failed to parse object name into timeline id for tenant {tenant_id} '{object_name}'") - })?; - - sync_ids.insert(ZTenantTimelineId { - tenant_id, - timeline_id, - }); - } - - match download_index_parts(conf, storage, sync_ids) + match download_index_parts(conf, storage, timeline_sync_ids) .await .remove(&tenant_id) .ok_or_else(|| anyhow::anyhow!("Missing tenant index parts. This is a bug."))? @@ -180,29 +135,15 @@ where } /// Retrieves index data from the remote storage for a given timeline. 
-async fn download_index_part( +async fn download_index_part( conf: &'static PageServerConf, - storage: &S, + storage: &GenericRemoteStorage, sync_id: ZTenantTimelineId, -) -> Result -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> Result { let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - let part_storage_path = storage - .remote_object_id(&index_part_path) - .with_context(|| { - format!( - "Failed to get the index part storage path for local path '{}'", - index_part_path.display() - ) - }) - .map_err(DownloadError::BadInput)?; - - let mut index_part_download = storage.download(&part_storage_path).await?; + let mut index_part_download = download_storage_object(storage, &index_part_path).await?; let mut index_part_bytes = Vec::new(); io::copy( @@ -211,14 +152,18 @@ where ) .await .with_context(|| { - format!("Failed to download an index part from storage path {part_storage_path:?}") + format!( + "Failed to download an index part into file '{}'", + index_part_path.display() + ) }) .map_err(DownloadError::Other)?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) .with_context(|| { format!( - "Failed to deserialize index part file from storage path '{part_storage_path:?}'" + "Failed to deserialize index part file into file '{}'", + index_part_path.display() ) }) .map_err(DownloadError::Other)?; @@ -249,18 +194,14 @@ pub(super) enum DownloadedTimeline { /// updated in the end, if the remote one contains a newer disk_consistent_lsn. /// /// On an error, bumps the retries count and updates the files to skip with successful downloads, rescheduling the task. -pub(super) async fn download_timeline_layers<'a, P, S>( +pub(super) async fn download_timeline_layers<'a>( conf: &'static PageServerConf, - storage: &'a S, + storage: &'a GenericRemoteStorage, sync_queue: &'a SyncQueue, remote_timeline: Option<&'a RemoteTimeline>, sync_id: ZTenantTimelineId, mut download_data: SyncData, -) -> DownloadedTimeline -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> DownloadedTimeline { let remote_timeline = match remote_timeline { Some(remote_timeline) => { if !remote_timeline.awaits_download { @@ -300,15 +241,6 @@ where layer_desination_path.display() ); } else { - let layer_storage_path = storage - .remote_object_id(&layer_desination_path) - .with_context(|| { - format!( - "Failed to get the layer storage path for local path '{}'", - layer_desination_path.display() - ) - })?; - // Perform a rename inspired by durable_rename from file_utils.c. 
// The sequence: // write(tmp) @@ -329,19 +261,23 @@ where temp_file_path.display() ) })?; - let mut download = storage - .download(&layer_storage_path) + + let mut layer_download = download_storage_object(storage, &layer_desination_path) .await .with_context(|| { format!( - "Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'" + "Failed to initiate the download the layer for {sync_id} into file '{}'", + temp_file_path.display() + ) + })?; + io::copy(&mut layer_download.download_stream, &mut destination_file) + .await + .with_context(|| { + format!( + "Failed to download the layer for {sync_id} into file '{}'", + temp_file_path.display() ) })?; - io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { - format!( - "Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display() - ) - })?; // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: // A file will not be closed immediately when it goes out of scope if there are any IO operations @@ -429,6 +365,121 @@ where } } +async fn download_storage_object( + storage: &GenericRemoteStorage, + to_path: &Path, +) -> Result { + async fn do_download_storage_object( + storage: &S, + to_path: &Path, + ) -> Result + where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, + { + let remote_object_path = storage + .remote_object_id(to_path) + .with_context(|| { + format!( + "Failed to get the storage path for target local path '{}'", + to_path.display() + ) + }) + .map_err(DownloadError::BadInput)?; + + storage.download(&remote_object_path).await + } + + match storage { + GenericRemoteStorage::Local(storage) => do_download_storage_object(storage, to_path).await, + GenericRemoteStorage::S3(storage) => do_download_storage_object(storage, to_path).await, + } +} + +async fn get_timeline_sync_ids( + storage: &GenericRemoteStorage, + tenant_path: &Path, + tenant_id: ZTenantId, +) -> anyhow::Result> { + let timeline_ids: Vec = match storage { + GenericRemoteStorage::Local(storage) => list_prefixes(storage, tenant_path) + .await? + .into_iter() + .map(|timeline_directory_path| { + timeline_directory_path + .file_stem() + .with_context(|| { + format!( + "Failed to get timeline id string from file '{}'", + timeline_directory_path.display() + ) + })? + .to_string_lossy() + .as_ref() + .parse() + .with_context(|| { + format!( + "failed to parse directory name '{}' as timeline id", + timeline_directory_path.display() + ) + }) + }) + .collect::>(), + GenericRemoteStorage::S3(storage) => list_prefixes(storage, tenant_path) + .await? + .into_iter() + .map(|s3_path| { + s3_path + .object_name() + .with_context(|| { + format!("Failed to get object name out of S3 path {s3_path:?}") + })? 
+ .parse() + .with_context(|| { + format!("failed to parse object name '{s3_path:?}' as timeline id") + }) + }) + .collect::>(), + } + .with_context(|| { + format!("Tenant {tenant_id} has at least one incorrect timeline subdirectory") + })?; + + if timeline_ids.is_empty() { + anyhow::bail!("no timelines found on the remote storage for tenant {tenant_id}") + } + + Ok(timeline_ids + .into_iter() + .map(|timeline_id| ZTenantTimelineId { + tenant_id, + timeline_id, + }) + .collect()) +} + +async fn list_prefixes(storage: &S, tenant_path: &Path) -> anyhow::Result> +where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let tenant_storage_path = storage.remote_object_id(tenant_path).with_context(|| { + format!( + "Failed to get tenant storage path for local path '{}'", + tenant_path.display() + ) + })?; + + storage + .list_prefixes(Some(&tenant_storage_path)) + .await + .with_context(|| { + format!( + "Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download" + ) + }) +} + async fn fsync_path(path: impl AsRef) -> Result<(), io::Error> { fs::File::open(path).await?.sync_all().await } @@ -461,10 +512,11 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"]; - let storage = LocalFs::new( - tempdir()?.path().to_path_buf(), + let storage = GenericRemoteStorage::Local(LocalFs::new( + tempdir()?.path().to_owned(), harness.conf.workdir.clone(), - )?; + )?); + let local_storage = storage.as_local().unwrap(); let current_retries = 3; let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); @@ -472,7 +524,7 @@ mod tests { create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; for local_path in timeline_upload.layers_to_upload { - let remote_path = storage.remote_object_id(&local_path)?; + let remote_path = local_storage.remote_object_id(&local_path)?; let remote_parent_dir = remote_path.parent().unwrap(); if !remote_parent_dir.exists() { fs::create_dir_all(&remote_parent_dir).await?; @@ -558,7 +610,10 @@ mod tests { let harness = RepoHarness::create("download_timeline_negatives")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; + let storage = GenericRemoteStorage::Local(LocalFs::new( + tempdir()?.path().to_owned(), + harness.conf.workdir.clone(), + )?); let empty_remote_timeline_download = download_timeline_layers( harness.conf, @@ -614,10 +669,11 @@ mod tests { let harness = RepoHarness::create("test_download_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new( - tempdir()?.path().to_path_buf(), + let storage = GenericRemoteStorage::Local(LocalFs::new( + tempdir()?.path().to_owned(), harness.conf.workdir.clone(), - )?; + )?); + let local_storage = storage.as_local().unwrap(); let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); @@ -638,7 +694,7 @@ mod tests { metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - let storage_path = storage.remote_object_id(&local_index_part_path)?; + let storage_path = local_storage.remote_object_id(&local_index_part_path)?; 
fs::create_dir_all(storage_path.parent().unwrap()).await?; fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?; diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 2acc935537..a8c768e0ae 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -1,11 +1,14 @@ //! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints. -use std::{fmt::Debug, path::PathBuf}; +use std::{ + fmt::Debug, + path::{Path, PathBuf}, +}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; use once_cell::sync::Lazy; -use remote_storage::RemoteStorage; +use remote_storage::{GenericRemoteStorage, RemoteStorage}; use tokio::fs; use tracing::{debug, error, info, warn}; @@ -30,16 +33,12 @@ static NO_LAYERS_UPLOAD: Lazy = Lazy::new(|| { }); /// Serializes and uploads the given index part data to the remote storage. -pub(super) async fn upload_index_part( +pub(super) async fn upload_index_part( conf: &'static PageServerConf, - storage: &S, + storage: &GenericRemoteStorage, sync_id: ZTenantTimelineId, index_part: IndexPart, -) -> anyhow::Result<()> -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> anyhow::Result<()> { let index_part_bytes = serde_json::to_vec(&index_part) .context("Failed to serialize index part file into bytes")?; let index_part_size = index_part_bytes.len(); @@ -48,27 +47,9 @@ where let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - let index_part_storage_path = - storage - .remote_object_id(&index_part_path) - .with_context(|| { - format!( - "Failed to get the index part storage path for local path '{}'", - index_part_path.display() - ) - })?; - - storage - .upload( - index_part_bytes, - index_part_size, - &index_part_storage_path, - None, - ) + upload_storage_object(storage, index_part_bytes, index_part_size, &index_part_path) .await - .with_context(|| { - format!("Failed to upload index part to the storage path '{index_part_storage_path:?}'") - }) + .with_context(|| format!("Failed to upload index part for '{sync_id}'")) } /// Timeline upload result, with extra data, needed for uploading. @@ -84,17 +65,13 @@ pub(super) enum UploadedTimeline { /// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload. /// /// On an error, bumps the retries count and reschedules the entire task. 
-pub(super) async fn upload_timeline_layers<'a, P, S>( - storage: &'a S, +pub(super) async fn upload_timeline_layers<'a>( + storage: &'a GenericRemoteStorage, sync_queue: &SyncQueue, remote_timeline: Option<&'a RemoteTimeline>, sync_id: ZTenantTimelineId, mut upload_data: SyncData, -) -> UploadedTimeline -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> UploadedTimeline { let upload = &mut upload_data.data; let new_upload_lsn = upload .metadata @@ -132,16 +109,6 @@ where let mut upload_tasks = layers_to_upload .into_iter() .map(|source_path| async move { - let storage_path = storage - .remote_object_id(&source_path) - .with_context(|| { - format!( - "Failed to get the layer storage path for local path '{}'", - source_path.display() - ) - }) - .map_err(UploadError::Other)?; - let source_file = match fs::File::open(&source_path).await.with_context(|| { format!( "Failed to upen a source file for layer '{}'", @@ -164,15 +131,10 @@ where .map_err(UploadError::Other)? .len() as usize; - match storage - .upload(source_file, source_size, &storage_path, None) + match upload_storage_object(storage, source_file, source_size, &source_path) .await - .with_context(|| { - format!( - "Failed to upload a layer from local path '{}'", - source_path.display() - ) - }) { + .with_context(|| format!("Failed to upload layer file for {sync_id}")) + { Ok(()) => Ok(source_path), Err(e) => Err(UploadError::MissingLocalFile(source_path, e)), } @@ -231,6 +193,51 @@ where } } +async fn upload_storage_object( + storage: &GenericRemoteStorage, + from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, + from_size_bytes: usize, + from_path: &Path, +) -> anyhow::Result<()> { + async fn do_upload_storage_object( + storage: &S, + from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, + from_size_bytes: usize, + from_path: &Path, + ) -> anyhow::Result<()> + where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, + { + let target_storage_path = storage.remote_object_id(from_path).with_context(|| { + format!( + "Failed to get the storage path for source local path '{}'", + from_path.display() + ) + })?; + + storage + .upload(from, from_size_bytes, &target_storage_path, None) + .await + .with_context(|| { + format!( + "Failed to upload from '{}' to storage path '{:?}'", + from_path.display(), + target_storage_path + ) + }) + } + + match storage { + GenericRemoteStorage::Local(storage) => { + do_upload_storage_object(storage, from, from_size_bytes, from_path).await + } + GenericRemoteStorage::S3(storage) => { + do_upload_storage_object(storage, from, from_size_bytes, from_path).await + } + } +} + enum UploadError { MissingLocalFile(PathBuf, anyhow::Error), Other(anyhow::Error), @@ -243,7 +250,7 @@ mod tests { num::NonZeroUsize, }; - use remote_storage::LocalFs; + use remote_storage::{LocalFs, RemoteStorage}; use tempfile::tempdir; use utils::lsn::Lsn; @@ -264,10 +271,11 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b"]; - let storage = LocalFs::new( - tempdir()?.path().to_path_buf(), + let storage = GenericRemoteStorage::Local(LocalFs::new( + tempdir()?.path().to_owned(), harness.conf.workdir.clone(), - )?; + )?); + let local_storage = storage.as_local().unwrap(); let current_retries = 3; let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); @@ -276,7 +284,7 @@ mod tests { timeline_upload.metadata = None; assert!( - 
storage.list().await?.is_empty(), + local_storage.list().await?.is_empty(), "Storage should be empty before any uploads are made" ); @@ -322,7 +330,7 @@ mod tests { "Successful upload without metadata should not have it returned either" ); - let storage_files = storage.list().await?; + let storage_files = local_storage.list().await?; assert_eq!( storage_files.len(), layer_files.len(), @@ -331,7 +339,7 @@ mod tests { assert_eq!( storage_files .into_iter() - .map(|storage_path| storage.local_path(&storage_path)) + .map(|storage_path| local_storage.local_path(&storage_path)) .collect::>>()?, layer_files .into_iter() @@ -351,7 +359,11 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a1", "b1"]; - let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; + let storage = GenericRemoteStorage::Local(LocalFs::new( + tempdir()?.path().to_owned(), + harness.conf.workdir.clone(), + )?); + let local_storage = storage.as_local().unwrap(); let current_retries = 5; let metadata = dummy_metadata(Lsn(0x40)); @@ -365,7 +377,7 @@ mod tests { create_local_timeline(&harness, TIMELINE_ID, &layers_to_upload, metadata.clone()) .await?; assert!( - storage.list().await?.is_empty(), + local_storage.list().await?.is_empty(), "Storage should be empty before any uploads are made" ); @@ -414,7 +426,7 @@ mod tests { "Successful upload should not change its metadata" ); - let storage_files = storage.list().await?; + let storage_files = local_storage.list().await?; assert_eq!( storage_files.len(), layer_files.len(), @@ -423,7 +435,7 @@ mod tests { assert_eq!( storage_files .into_iter() - .map(|storage_path| storage.local_path(&storage_path)) + .map(|storage_path| local_storage.local_path(&storage_path)) .collect::>>()?, layer_files .into_iter() @@ -440,7 +452,11 @@ mod tests { let harness = RepoHarness::create("test_upload_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; + let storage = GenericRemoteStorage::Local(LocalFs::new( + tempdir()?.path().to_owned(), + harness.conf.workdir.clone(), + )?); + let local_storage = storage.as_local().unwrap(); let metadata = dummy_metadata(Lsn(0x40)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); @@ -458,12 +474,12 @@ mod tests { ); assert!( - storage.list().await?.is_empty(), + local_storage.list().await?.is_empty(), "Storage should be empty before any uploads are made" ); upload_index_part(harness.conf, &storage, sync_id, index_part.clone()).await?; - let storage_files = storage.list().await?; + let storage_files = local_storage.list().await?; assert_eq!( storage_files.len(), 1, diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 921d973a41..4a907ac0e1 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -12,6 +12,7 @@ use crate::thread_mgr::ThreadKind; use crate::walredo::PostgresRedoManager; use crate::{thread_mgr, timelines, walreceiver}; use anyhow::Context; +use remote_storage::GenericRemoteStorage; use serde::{Deserialize, Serialize}; use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; @@ -131,7 +132,10 @@ impl fmt::Display for TenantState { /// Initialize repositories with locally available timelines. 
/// Timelines that are only partially available locally (remote storage has more data than this pageserver) /// are scheduled for download and added to the repository once download is completed. -pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result { +pub fn init_tenant_mgr( + conf: &'static PageServerConf, + remote_storage: Option>, +) -> anyhow::Result { let (timeline_updates_sender, timeline_updates_receiver) = mpsc::unbounded_channel::(); tenants_state::set_timeline_update_sender(timeline_updates_sender)?; @@ -140,7 +144,7 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result Date: Wed, 31 Aug 2022 14:36:24 +0200 Subject: [PATCH 0708/1022] Remove deprecated notification channel (#2330) Co-authored-by: Rory de Zoete --- .github/workflows/notifications.yml | 45 ----------------------------- 1 file changed, 45 deletions(-) delete mode 100644 .github/workflows/notifications.yml diff --git a/.github/workflows/notifications.yml b/.github/workflows/notifications.yml deleted file mode 100644 index 55dc979896..0000000000 --- a/.github/workflows/notifications.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: Send Notifications - -on: - push: - branches: [ main ] - -jobs: - send-notifications: - timeout-minutes: 30 - name: send commit notifications - runs-on: ubuntu-latest - - steps: - - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: true - fetch-depth: 2 - - - name: Form variables for notification message - id: git_info_grab - run: | - git_stat=$(git show --stat=50) - git_stat="${git_stat//'%'/'%25'}" - git_stat="${git_stat//$'\n'/'%0A'}" - git_stat="${git_stat//$'\r'/'%0D'}" - git_stat="${git_stat// / }" # space -> 'Space En', as github tends to eat ordinary spaces - echo "::set-output name=git_stat::$git_stat" - echo "::set-output name=sha_short::$(git rev-parse --short HEAD)" - echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})" - - - name: Send notification - uses: appleboy/telegram-action@master - with: - to: ${{ secrets.TELEGRAM_TO }} - token: ${{ secrets.TELEGRAM_TOKEN }} - format: markdown - args: | - *@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }}) - - ``` - ${{ steps.git_info_grab.outputs.git_stat }} - ``` - From d7c9cfe7bb30c0908a74dd5a80a437ec3ab36571 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 31 Aug 2022 16:15:26 +0100 Subject: [PATCH 0709/1022] Create Allure report for perf tests (#2326) --- .github/actions/allure-report/action.yml | 10 +- .../actions/run-python-test-set/action.yml | 39 +++-- .github/workflows/benchmarking.yml | 78 +++++---- .github/workflows/build_and_test.yml | 4 +- poetry.lock | 149 +++++++++--------- pyproject.toml | 2 +- test_runner/fixtures/neon_fixtures.py | 63 +++----- test_runner/fixtures/utils.py | 39 ++++- 8 files changed, 195 insertions(+), 189 deletions(-) diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml index 2e52bd7695..34761f8df1 100644 --- a/.github/actions/allure-report/action.yml +++ b/.github/actions/allure-report/action.yml @@ -18,7 +18,7 @@ runs: - name: Validate input parameters shell: bash -euxo pipefail {0} run: | - if [ "${{ inputs.action }}" != "store"] && [ "${{ inputs.action }}" != "generate" ]; then + if [ "${{ inputs.action }}" != "store" ] && [ "${{ inputs.action }}" != "generate" ]; then echo 2>&1 "Unknown inputs.action type '${{ inputs.action }}'; allowed 
'generate' or 'store' only" exit 1 fi @@ -41,7 +41,7 @@ runs: # Shortcut for a special branch key=main else - key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -cd "[:alnum:]._-") + key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -c "[:alnum:]._-" "-") fi echo "::set-output name=KEY::${key}" @@ -94,7 +94,7 @@ runs: BUILD_TYPE=${{ inputs.build_type }} EOF - ARCHIVE="${GITHUB_RUN_ID}-${{ inputs.test_selection }}-${GITHUB_RUN_ATTEMPT}.tar.zst" + ARCHIVE="${GITHUB_RUN_ID}-${{ inputs.test_selection }}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst" ZSTD_NBTHREADS=0 tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd . @@ -207,7 +207,7 @@ runs: script: | const { REPORT_URL, BUILD_TYPE, SHA } = process.env - result = await github.rest.repos.createCommitStatus({ + await github.rest.repos.createCommitStatus({ owner: context.repo.owner, repo: context.repo.repo, sha: `${SHA}`, @@ -215,5 +215,3 @@ runs: target_url: `${REPORT_URL}`, context: `Allure report / ${BUILD_TYPE}`, }) - - console.log(result); diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index a4bcaff56d..1cc65b4286 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -3,11 +3,11 @@ description: 'Runs a Neon python test set, performing all the required preparati inputs: build_type: - description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".' + description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug", or "remote" for the remote cluster' required: true rust_toolchain: description: 'Rust toolchain version to fetch the caches' - required: true + required: false test_selection: description: 'A python test suite to run' required: true @@ -52,6 +52,7 @@ runs: using: "composite" steps: - name: Get Neon artifact + if: inputs.build_type != 'remote' uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact @@ -78,7 +79,6 @@ runs: - name: Run pytest env: NEON_BIN: /tmp/neon/bin - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install TEST_OUTPUT: /tmp/test_output # this variable will be embedded in perf test report # and is needed to distinguish different environments @@ -88,6 +88,12 @@ runs: AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} shell: bash -euxo pipefail {0} run: | + export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} + + if [ "${BUILD_TYPE}" = "remote" ]; then + export REMOTE_ENV=1 + fi + PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)" rm -rf $PERF_REPORT_DIR @@ -119,6 +125,13 @@ runs: cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) elif [[ "${{ inputs.build_type }}" == "release" ]]; then cov_prefix=() + else + cov_prefix=() + fi + + # Wake up the cluster if we use remote neon instance + if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then + ${POSTGRES_DISTRIB_DIR}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" fi # Run the tests. 
@@ -137,7 +150,6 @@ runs: --alluredir=$TEST_OUTPUT/allure/results \ --tb=short \ --verbose \ - -m "not remote_cluster" \ -rA $TEST_SELECTION $EXTRA_PARAMS if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then @@ -148,25 +160,10 @@ runs: fi fi - - name: Upload Allure results - if: ${{ always() && (inputs.test_selection == 'regress') }} + - name: Create Allure report + if: always() uses: ./.github/actions/allure-report with: action: store build_type: ${{ inputs.build_type }} test_selection: ${{ inputs.test_selection }} - - - name: Delete all data but logs - shell: bash -euxo pipefail {0} - if: always() - run: | - du -sh /tmp/test_output/* - find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete - du -sh /tmp/test_output/* - - - name: Upload python test logs - if: always() - uses: ./.github/actions/upload - with: - name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs - path: /tmp/test_output/ diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 4ed6ac80fd..1370917377 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -128,9 +128,9 @@ jobs: env: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: "10gb" - REMOTE_ENV: "1" POSTGRES_DISTRIB_DIR: /usr TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote strategy: fail-fast: false @@ -138,23 +138,15 @@ jobs: connstr: [ BENCHMARK_CAPTEST_CONNSTR, BENCHMARK_RDS_CONNSTR ] runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2817580636 + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned + options: --init timeout-minutes: 360 # 6h steps: - uses: actions/checkout@v3 - - name: Cache poetry deps - id: cache_poetry - uses: actions/cache@v3 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps - run: ./scripts/pysync - - name: Calculate platform id: calculate-platform env: @@ -173,50 +165,54 @@ jobs: - name: Install Deps run: | - echo "deb http://apt.postgresql.org/pub/repos/apt focal-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list - wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - sudo apt -y update - sudo apt install -y postgresql-14 postgresql-client-14 + sudo apt install -y postgresql-14 - name: Benchmark init + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: true + extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init env: PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} - run: | - mkdir -p perf-report-captest - - psql $BENCHMARK_CONNSTR -c "SELECT 1;" - ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_init -v -m "remote_cluster" --out-dir perf-report-captest --timeout 21600 - name: Benchmark simple-update + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: true + extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update env: PLATFORM: ${{ 
steps.calculate-platform.outputs.PLATFORM }} BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} - run: | - psql $BENCHMARK_CONNSTR -c "SELECT 1;" - ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_simple_update -v -m "remote_cluster" --out-dir perf-report-captest --timeout 21600 - - - name: Benchmark select-only - env: - PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} - BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} - run: | - psql $BENCHMARK_CONNSTR -c "SELECT 1;" - ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_select_only -v -m "remote_cluster" --out-dir perf-report-captest --timeout 21600 - - - name: Submit result - env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - run: | - REPORT_FROM=$(realpath perf-report-captest) REPORT_TO=staging scripts/generate_and_push_perf_report.sh - - name: Upload logs - if: always() - uses: ./.github/actions/upload + - name: Benchmark simple-update + uses: ./.github/actions/run-python-test-set with: - name: bench-captest-${{ steps.calculate-platform.outputs.PLATFORM }} - path: /tmp/test_output/ + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: true + extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only + env: + PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} + BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Create Allure report + uses: ./.github/actions/allure-report + with: + action: generate + build_type: ${{ env.BUILD_TYPE }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8b1dc3a9c4..a3314738fa 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -278,7 +278,7 @@ jobs: container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - needs: [ regress-tests ] + needs: [ regress-tests, benchmarks ] if: always() strategy: fail-fast: false @@ -290,7 +290,7 @@ jobs: with: submodules: false - - name: Merge and Allure results + - name: Create Allure report uses: ./.github/actions/allure-report with: action: generate diff --git a/poetry.lock b/poetry.lock index 6bce17008e..2af0d97511 100644 --- a/poetry.lock +++ b/poetry.lock @@ -15,20 +15,20 @@ sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"] [[package]] name = "allure-pytest" -version = "2.9.45" +version = "2.10.0" description = "Allure pytest integration" category = "main" optional = false python-versions = "*" [package.dependencies] -allure-python-commons = "2.9.45" +allure-python-commons = "2.10.0" pytest = ">=4.5.0" six = ">=1.9.0" [[package]] name = "allure-python-commons" -version = "2.9.45" +version = "2.10.0" description = "Common module for integrate allure with python-based frameworks" category = "main" optional = false @@ -56,9 +56,9 @@ optional = false python-versions = ">=3.6.0" [package.extras] -dev = ["Cython (>=0.29.24,<0.30.0)", "pytest (>=6.0)", "Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "pycodestyle (>=2.7.0,<2.8.0)", "flake8 (>=3.9.2,<3.10.0)", "uvloop (>=0.15.3)"] -docs = ["Sphinx (>=4.1.2,<4.2.0)", 
"sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)"] -test = ["pycodestyle (>=2.7.0,<2.8.0)", "flake8 (>=3.9.2,<3.10.0)", "uvloop (>=0.15.3)"] +dev = ["Cython (>=0.29.24,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "flake8 (>=3.9.2,<3.10.0)", "pycodestyle (>=2.7.0,<2.8.0)", "pytest (>=6.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "uvloop (>=0.15.3)"] +docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] +test = ["flake8 (>=3.9.2,<3.10.0)", "pycodestyle (>=2.7.0,<2.8.0)", "uvloop (>=0.15.3)"] [[package]] name = "atomicwrites" @@ -77,10 +77,10 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [package.extras] -dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"] -docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] -tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] -tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"] +dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] +docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] +tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] +tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "aws-sam-translator" @@ -95,7 +95,7 @@ boto3 = ">=1.19.5,<2.0.0" jsonschema = ">=3.2,<4.0" [package.extras] -dev = ["coverage (>=5.3,<6.0)", "flake8 (>=3.8.4,<3.9.0)", "tox (>=3.24,<4.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-xdist (>=2.5,<3.0)", "pytest-env (>=0.6.2,<0.7.0)", "pylint (>=2.9.0,<2.10.0)", "pyyaml (>=5.4,<6.0)", "pytest (>=6.2.5,<6.3.0)", "parameterized (>=0.7.4,<0.8.0)", "click (>=7.1,<8.0)", "dateparser (>=0.7,<1.0)", "boto3 (>=1.23,<2)", "tenacity (>=7.0.0,<7.1.0)", "requests (>=2.24.0,<2.25.0)", "docopt (>=0.6.2,<0.7.0)", "black (==20.8b1)"] +dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "click (>=7.1,<8.0)", "coverage (>=5.3,<6.0)", "dateparser (>=0.7,<1.0)", "docopt (>=0.6.2,<0.7.0)", "flake8 (>=3.8.4,<3.9.0)", "parameterized (>=0.7.4,<0.8.0)", "pylint (>=2.9.0,<2.10.0)", "pytest (>=6.2.5,<6.3.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-xdist (>=2.5,<3.0)", "pyyaml (>=5.4,<6.0)", "requests (>=2.24.0,<2.25.0)", "tenacity (>=7.0.0,<7.1.0)", "tox (>=3.24,<4.0)"] [[package]] name = "aws-xray-sdk" @@ -157,8 +157,8 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "boto3-stubs" -version = "1.24.56" -description = "Type annotations for boto3 1.24.56 generated with mypy-boto3-builder 7.11.7" +version = "1.24.58" +description = "Type annotations for boto3 1.24.58 generated with mypy-boto3-builder 7.11.7" category = "main" optional = false python-versions = ">=3.7" @@ -175,7 +175,7 @@ account = ["mypy-boto3-account (>=1.24.0,<1.25.0)"] acm = ["mypy-boto3-acm (>=1.24.0,<1.25.0)"] acm-pca = ["mypy-boto3-acm-pca 
(>=1.24.0,<1.25.0)"] alexaforbusiness = ["mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)"] -all = ["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)", "mypy-boto3-account (>=1.24.0,<1.25.0)", "mypy-boto3-acm (>=1.24.0,<1.25.0)", "mypy-boto3-acm-pca (>=1.24.0,<1.25.0)", "mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)", "mypy-boto3-amp (>=1.24.0,<1.25.0)", "mypy-boto3-amplify (>=1.24.0,<1.25.0)", "mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)", "mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)", "mypy-boto3-apigateway (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)", "mypy-boto3-appconfig (>=1.24.0,<1.25.0)", "mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)", "mypy-boto3-appflow (>=1.24.0,<1.25.0)", "mypy-boto3-appintegrations (>=1.24.0,<1.25.0)", "mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-application-insights (>=1.24.0,<1.25.0)", "mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-appmesh (>=1.24.0,<1.25.0)", "mypy-boto3-apprunner (>=1.24.0,<1.25.0)", "mypy-boto3-appstream (>=1.24.0,<1.25.0)", "mypy-boto3-appsync (>=1.24.0,<1.25.0)", "mypy-boto3-athena (>=1.24.0,<1.25.0)", "mypy-boto3-auditmanager (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)", "mypy-boto3-backup (>=1.24.0,<1.25.0)", "mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)", "mypy-boto3-backupstorage (>=1.24.0,<1.25.0)", "mypy-boto3-batch (>=1.24.0,<1.25.0)", "mypy-boto3-billingconductor (>=1.24.0,<1.25.0)", "mypy-boto3-braket (>=1.24.0,<1.25.0)", "mypy-boto3-budgets (>=1.24.0,<1.25.0)", "mypy-boto3-ce (>=1.24.0,<1.25.0)", "mypy-boto3-chime (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)", "mypy-boto3-cloud9 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)", "mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)", "mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-cloudfront (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)", "mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)", "mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)", "mypy-boto3-codeartifact (>=1.24.0,<1.25.0)", "mypy-boto3-codebuild (>=1.24.0,<1.25.0)", "mypy-boto3-codecommit (>=1.24.0,<1.25.0)", "mypy-boto3-codedeploy (>=1.24.0,<1.25.0)", "mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)", "mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-codepipeline (>=1.24.0,<1.25.0)", "mypy-boto3-codestar (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)", "mypy-boto3-comprehend (>=1.24.0,<1.25.0)", "mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)", "mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)", "mypy-boto3-config (>=1.24.0,<1.25.0)", "mypy-boto3-connect (>=1.24.0,<1.25.0)", "mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)", "mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)", "mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)", "mypy-boto3-cur (>=1.24.0,<1.25.0)", "mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)", "mypy-boto3-databrew 
(>=1.24.0,<1.25.0)", "mypy-boto3-dataexchange (>=1.24.0,<1.25.0)", "mypy-boto3-datapipeline (>=1.24.0,<1.25.0)", "mypy-boto3-datasync (>=1.24.0,<1.25.0)", "mypy-boto3-dax (>=1.24.0,<1.25.0)", "mypy-boto3-detective (>=1.24.0,<1.25.0)", "mypy-boto3-devicefarm (>=1.24.0,<1.25.0)", "mypy-boto3-devops-guru (>=1.24.0,<1.25.0)", "mypy-boto3-directconnect (>=1.24.0,<1.25.0)", "mypy-boto3-discovery (>=1.24.0,<1.25.0)", "mypy-boto3-dlm (>=1.24.0,<1.25.0)", "mypy-boto3-dms (>=1.24.0,<1.25.0)", "mypy-boto3-docdb (>=1.24.0,<1.25.0)", "mypy-boto3-drs (>=1.24.0,<1.25.0)", "mypy-boto3-ds (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)", "mypy-boto3-ebs (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)", "mypy-boto3-ecr (>=1.24.0,<1.25.0)", "mypy-boto3-ecr-public (>=1.24.0,<1.25.0)", "mypy-boto3-ecs (>=1.24.0,<1.25.0)", "mypy-boto3-efs (>=1.24.0,<1.25.0)", "mypy-boto3-eks (>=1.24.0,<1.25.0)", "mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)", "mypy-boto3-elasticache (>=1.24.0,<1.25.0)", "mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)", "mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)", "mypy-boto3-elb (>=1.24.0,<1.25.0)", "mypy-boto3-elbv2 (>=1.24.0,<1.25.0)", "mypy-boto3-emr (>=1.24.0,<1.25.0)", "mypy-boto3-emr-containers (>=1.24.0,<1.25.0)", "mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-es (>=1.24.0,<1.25.0)", "mypy-boto3-events (>=1.24.0,<1.25.0)", "mypy-boto3-evidently (>=1.24.0,<1.25.0)", "mypy-boto3-finspace (>=1.24.0,<1.25.0)", "mypy-boto3-finspace-data (>=1.24.0,<1.25.0)", "mypy-boto3-firehose (>=1.24.0,<1.25.0)", "mypy-boto3-fis (>=1.24.0,<1.25.0)", "mypy-boto3-fms (>=1.24.0,<1.25.0)", "mypy-boto3-forecast (>=1.24.0,<1.25.0)", "mypy-boto3-forecastquery (>=1.24.0,<1.25.0)", "mypy-boto3-frauddetector (>=1.24.0,<1.25.0)", "mypy-boto3-fsx (>=1.24.0,<1.25.0)", "mypy-boto3-gamelift (>=1.24.0,<1.25.0)", "mypy-boto3-gamesparks (>=1.24.0,<1.25.0)", "mypy-boto3-glacier (>=1.24.0,<1.25.0)", "mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)", "mypy-boto3-glue (>=1.24.0,<1.25.0)", "mypy-boto3-grafana (>=1.24.0,<1.25.0)", "mypy-boto3-greengrass (>=1.24.0,<1.25.0)", "mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)", "mypy-boto3-groundstation (>=1.24.0,<1.25.0)", "mypy-boto3-guardduty (>=1.24.0,<1.25.0)", "mypy-boto3-health (>=1.24.0,<1.25.0)", "mypy-boto3-healthlake (>=1.24.0,<1.25.0)", "mypy-boto3-honeycode (>=1.24.0,<1.25.0)", "mypy-boto3-iam (>=1.24.0,<1.25.0)", "mypy-boto3-identitystore (>=1.24.0,<1.25.0)", "mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)", "mypy-boto3-importexport (>=1.24.0,<1.25.0)", "mypy-boto3-inspector (>=1.24.0,<1.25.0)", "mypy-boto3-inspector2 (>=1.24.0,<1.25.0)", "mypy-boto3-iot (>=1.24.0,<1.25.0)", "mypy-boto3-iot-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)", "mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)", "mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)", "mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)", "mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)", "mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)", "mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)", "mypy-boto3-iotwireless (>=1.24.0,<1.25.0)", "mypy-boto3-ivs (>=1.24.0,<1.25.0)", "mypy-boto3-ivschat (>=1.24.0,<1.25.0)", "mypy-boto3-kafka (>=1.24.0,<1.25.0)", 
"mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-kendra (>=1.24.0,<1.25.0)", "mypy-boto3-keyspaces (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)", "mypy-boto3-kms (>=1.24.0,<1.25.0)", "mypy-boto3-lakeformation (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-lex-models (>=1.24.0,<1.25.0)", "mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)", "mypy-boto3-lightsail (>=1.24.0,<1.25.0)", "mypy-boto3-location (>=1.24.0,<1.25.0)", "mypy-boto3-logs (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)", "mypy-boto3-m2 (>=1.24.0,<1.25.0)", "mypy-boto3-machinelearning (>=1.24.0,<1.25.0)", "mypy-boto3-macie (>=1.24.0,<1.25.0)", "mypy-boto3-macie2 (>=1.24.0,<1.25.0)", "mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)", "mypy-boto3-medialive (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)", "mypy-boto3-mediatailor (>=1.24.0,<1.25.0)", "mypy-boto3-memorydb (>=1.24.0,<1.25.0)", "mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)", "mypy-boto3-mgh (>=1.24.0,<1.25.0)", "mypy-boto3-mgn (>=1.24.0,<1.25.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)", "mypy-boto3-mobile (>=1.24.0,<1.25.0)", "mypy-boto3-mq (>=1.24.0,<1.25.0)", "mypy-boto3-mturk (>=1.24.0,<1.25.0)", "mypy-boto3-mwaa (>=1.24.0,<1.25.0)", "mypy-boto3-neptune (>=1.24.0,<1.25.0)", "mypy-boto3-network-firewall (>=1.24.0,<1.25.0)", "mypy-boto3-networkmanager (>=1.24.0,<1.25.0)", "mypy-boto3-nimble (>=1.24.0,<1.25.0)", "mypy-boto3-opensearch (>=1.24.0,<1.25.0)", "mypy-boto3-opsworks (>=1.24.0,<1.25.0)", "mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)", "mypy-boto3-organizations (>=1.24.0,<1.25.0)", "mypy-boto3-outposts (>=1.24.0,<1.25.0)", "mypy-boto3-panorama (>=1.24.0,<1.25.0)", "mypy-boto3-personalize (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-events (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-pi (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)", "mypy-boto3-polly (>=1.24.0,<1.25.0)", "mypy-boto3-pricing (>=1.24.0,<1.25.0)", "mypy-boto3-privatenetworks (>=1.24.0,<1.25.0)", "mypy-boto3-proton (>=1.24.0,<1.25.0)", "mypy-boto3-qldb (>=1.24.0,<1.25.0)", "mypy-boto3-qldb-session (>=1.24.0,<1.25.0)", "mypy-boto3-quicksight (>=1.24.0,<1.25.0)", 
"mypy-boto3-ram (>=1.24.0,<1.25.0)", "mypy-boto3-rbin (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-rds-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-rekognition (>=1.24.0,<1.25.0)", "mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)", "mypy-boto3-resource-groups (>=1.24.0,<1.25.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)", "mypy-boto3-robomaker (>=1.24.0,<1.25.0)", "mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)", "mypy-boto3-route53 (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)", "mypy-boto3-route53domains (>=1.24.0,<1.25.0)", "mypy-boto3-route53resolver (>=1.24.0,<1.25.0)", "mypy-boto3-rum (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-s3control (>=1.24.0,<1.25.0)", "mypy-boto3-s3outposts (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-savingsplans (>=1.24.0,<1.25.0)", "mypy-boto3-schemas (>=1.24.0,<1.25.0)", "mypy-boto3-sdb (>=1.24.0,<1.25.0)", "mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)", "mypy-boto3-securityhub (>=1.24.0,<1.25.0)", "mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)", "mypy-boto3-service-quotas (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)", "mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)", "mypy-boto3-ses (>=1.24.0,<1.25.0)", "mypy-boto3-sesv2 (>=1.24.0,<1.25.0)", "mypy-boto3-shield (>=1.24.0,<1.25.0)", "mypy-boto3-signer (>=1.24.0,<1.25.0)", "mypy-boto3-sms (>=1.24.0,<1.25.0)", "mypy-boto3-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)", "mypy-boto3-snowball (>=1.24.0,<1.25.0)", "mypy-boto3-sns (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)", "mypy-boto3-ssm (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)", "mypy-boto3-sso (>=1.24.0,<1.25.0)", "mypy-boto3-sso-admin (>=1.24.0,<1.25.0)", "mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)", "mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)", "mypy-boto3-storagegateway (>=1.24.0,<1.25.0)", "mypy-boto3-sts (>=1.24.0,<1.25.0)", "mypy-boto3-support (>=1.24.0,<1.25.0)", "mypy-boto3-swf (>=1.24.0,<1.25.0)", "mypy-boto3-synthetics (>=1.24.0,<1.25.0)", "mypy-boto3-textract (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-query (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-write (>=1.24.0,<1.25.0)", "mypy-boto3-transcribe (>=1.24.0,<1.25.0)", "mypy-boto3-transfer (>=1.24.0,<1.25.0)", "mypy-boto3-translate (>=1.24.0,<1.25.0)", "mypy-boto3-voice-id (>=1.24.0,<1.25.0)", "mypy-boto3-waf (>=1.24.0,<1.25.0)", "mypy-boto3-waf-regional (>=1.24.0,<1.25.0)", "mypy-boto3-wafv2 (>=1.24.0,<1.25.0)", "mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)", "mypy-boto3-wisdom (>=1.24.0,<1.25.0)", "mypy-boto3-workdocs (>=1.24.0,<1.25.0)", "mypy-boto3-worklink (>=1.24.0,<1.25.0)", "mypy-boto3-workmail (>=1.24.0,<1.25.0)", "mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)", "mypy-boto3-xray (>=1.24.0,<1.25.0)"] +all = 
["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)", "mypy-boto3-account (>=1.24.0,<1.25.0)", "mypy-boto3-acm (>=1.24.0,<1.25.0)", "mypy-boto3-acm-pca (>=1.24.0,<1.25.0)", "mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)", "mypy-boto3-amp (>=1.24.0,<1.25.0)", "mypy-boto3-amplify (>=1.24.0,<1.25.0)", "mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)", "mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)", "mypy-boto3-apigateway (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)", "mypy-boto3-appconfig (>=1.24.0,<1.25.0)", "mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)", "mypy-boto3-appflow (>=1.24.0,<1.25.0)", "mypy-boto3-appintegrations (>=1.24.0,<1.25.0)", "mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-application-insights (>=1.24.0,<1.25.0)", "mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-appmesh (>=1.24.0,<1.25.0)", "mypy-boto3-apprunner (>=1.24.0,<1.25.0)", "mypy-boto3-appstream (>=1.24.0,<1.25.0)", "mypy-boto3-appsync (>=1.24.0,<1.25.0)", "mypy-boto3-athena (>=1.24.0,<1.25.0)", "mypy-boto3-auditmanager (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)", "mypy-boto3-backup (>=1.24.0,<1.25.0)", "mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)", "mypy-boto3-backupstorage (>=1.24.0,<1.25.0)", "mypy-boto3-batch (>=1.24.0,<1.25.0)", "mypy-boto3-billingconductor (>=1.24.0,<1.25.0)", "mypy-boto3-braket (>=1.24.0,<1.25.0)", "mypy-boto3-budgets (>=1.24.0,<1.25.0)", "mypy-boto3-ce (>=1.24.0,<1.25.0)", "mypy-boto3-chime (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)", "mypy-boto3-cloud9 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)", "mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)", "mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-cloudfront (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)", "mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)", "mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)", "mypy-boto3-codeartifact (>=1.24.0,<1.25.0)", "mypy-boto3-codebuild (>=1.24.0,<1.25.0)", "mypy-boto3-codecommit (>=1.24.0,<1.25.0)", "mypy-boto3-codedeploy (>=1.24.0,<1.25.0)", "mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)", "mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-codepipeline (>=1.24.0,<1.25.0)", "mypy-boto3-codestar (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)", "mypy-boto3-comprehend (>=1.24.0,<1.25.0)", "mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)", "mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)", "mypy-boto3-config (>=1.24.0,<1.25.0)", "mypy-boto3-connect (>=1.24.0,<1.25.0)", "mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)", "mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)", "mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)", "mypy-boto3-cur (>=1.24.0,<1.25.0)", "mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)", "mypy-boto3-databrew (>=1.24.0,<1.25.0)", "mypy-boto3-dataexchange (>=1.24.0,<1.25.0)", "mypy-boto3-datapipeline (>=1.24.0,<1.25.0)", 
"mypy-boto3-datasync (>=1.24.0,<1.25.0)", "mypy-boto3-dax (>=1.24.0,<1.25.0)", "mypy-boto3-detective (>=1.24.0,<1.25.0)", "mypy-boto3-devicefarm (>=1.24.0,<1.25.0)", "mypy-boto3-devops-guru (>=1.24.0,<1.25.0)", "mypy-boto3-directconnect (>=1.24.0,<1.25.0)", "mypy-boto3-discovery (>=1.24.0,<1.25.0)", "mypy-boto3-dlm (>=1.24.0,<1.25.0)", "mypy-boto3-dms (>=1.24.0,<1.25.0)", "mypy-boto3-docdb (>=1.24.0,<1.25.0)", "mypy-boto3-drs (>=1.24.0,<1.25.0)", "mypy-boto3-ds (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)", "mypy-boto3-ebs (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)", "mypy-boto3-ecr (>=1.24.0,<1.25.0)", "mypy-boto3-ecr-public (>=1.24.0,<1.25.0)", "mypy-boto3-ecs (>=1.24.0,<1.25.0)", "mypy-boto3-efs (>=1.24.0,<1.25.0)", "mypy-boto3-eks (>=1.24.0,<1.25.0)", "mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)", "mypy-boto3-elasticache (>=1.24.0,<1.25.0)", "mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)", "mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)", "mypy-boto3-elb (>=1.24.0,<1.25.0)", "mypy-boto3-elbv2 (>=1.24.0,<1.25.0)", "mypy-boto3-emr (>=1.24.0,<1.25.0)", "mypy-boto3-emr-containers (>=1.24.0,<1.25.0)", "mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-es (>=1.24.0,<1.25.0)", "mypy-boto3-events (>=1.24.0,<1.25.0)", "mypy-boto3-evidently (>=1.24.0,<1.25.0)", "mypy-boto3-finspace (>=1.24.0,<1.25.0)", "mypy-boto3-finspace-data (>=1.24.0,<1.25.0)", "mypy-boto3-firehose (>=1.24.0,<1.25.0)", "mypy-boto3-fis (>=1.24.0,<1.25.0)", "mypy-boto3-fms (>=1.24.0,<1.25.0)", "mypy-boto3-forecast (>=1.24.0,<1.25.0)", "mypy-boto3-forecastquery (>=1.24.0,<1.25.0)", "mypy-boto3-frauddetector (>=1.24.0,<1.25.0)", "mypy-boto3-fsx (>=1.24.0,<1.25.0)", "mypy-boto3-gamelift (>=1.24.0,<1.25.0)", "mypy-boto3-gamesparks (>=1.24.0,<1.25.0)", "mypy-boto3-glacier (>=1.24.0,<1.25.0)", "mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)", "mypy-boto3-glue (>=1.24.0,<1.25.0)", "mypy-boto3-grafana (>=1.24.0,<1.25.0)", "mypy-boto3-greengrass (>=1.24.0,<1.25.0)", "mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)", "mypy-boto3-groundstation (>=1.24.0,<1.25.0)", "mypy-boto3-guardduty (>=1.24.0,<1.25.0)", "mypy-boto3-health (>=1.24.0,<1.25.0)", "mypy-boto3-healthlake (>=1.24.0,<1.25.0)", "mypy-boto3-honeycode (>=1.24.0,<1.25.0)", "mypy-boto3-iam (>=1.24.0,<1.25.0)", "mypy-boto3-identitystore (>=1.24.0,<1.25.0)", "mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)", "mypy-boto3-importexport (>=1.24.0,<1.25.0)", "mypy-boto3-inspector (>=1.24.0,<1.25.0)", "mypy-boto3-inspector2 (>=1.24.0,<1.25.0)", "mypy-boto3-iot (>=1.24.0,<1.25.0)", "mypy-boto3-iot-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)", "mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)", "mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)", "mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)", "mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)", "mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)", "mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)", "mypy-boto3-iotwireless (>=1.24.0,<1.25.0)", "mypy-boto3-ivs (>=1.24.0,<1.25.0)", "mypy-boto3-ivschat (>=1.24.0,<1.25.0)", "mypy-boto3-kafka (>=1.24.0,<1.25.0)", "mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-kendra (>=1.24.0,<1.25.0)", "mypy-boto3-keyspaces 
(>=1.24.0,<1.25.0)", "mypy-boto3-kinesis (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)", "mypy-boto3-kms (>=1.24.0,<1.25.0)", "mypy-boto3-lakeformation (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-lex-models (>=1.24.0,<1.25.0)", "mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)", "mypy-boto3-lightsail (>=1.24.0,<1.25.0)", "mypy-boto3-location (>=1.24.0,<1.25.0)", "mypy-boto3-logs (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)", "mypy-boto3-m2 (>=1.24.0,<1.25.0)", "mypy-boto3-machinelearning (>=1.24.0,<1.25.0)", "mypy-boto3-macie (>=1.24.0,<1.25.0)", "mypy-boto3-macie2 (>=1.24.0,<1.25.0)", "mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)", "mypy-boto3-medialive (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)", "mypy-boto3-mediatailor (>=1.24.0,<1.25.0)", "mypy-boto3-memorydb (>=1.24.0,<1.25.0)", "mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)", "mypy-boto3-mgh (>=1.24.0,<1.25.0)", "mypy-boto3-mgn (>=1.24.0,<1.25.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)", "mypy-boto3-mobile (>=1.24.0,<1.25.0)", "mypy-boto3-mq (>=1.24.0,<1.25.0)", "mypy-boto3-mturk (>=1.24.0,<1.25.0)", "mypy-boto3-mwaa (>=1.24.0,<1.25.0)", "mypy-boto3-neptune (>=1.24.0,<1.25.0)", "mypy-boto3-network-firewall (>=1.24.0,<1.25.0)", "mypy-boto3-networkmanager (>=1.24.0,<1.25.0)", "mypy-boto3-nimble (>=1.24.0,<1.25.0)", "mypy-boto3-opensearch (>=1.24.0,<1.25.0)", "mypy-boto3-opsworks (>=1.24.0,<1.25.0)", "mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)", "mypy-boto3-organizations (>=1.24.0,<1.25.0)", "mypy-boto3-outposts (>=1.24.0,<1.25.0)", "mypy-boto3-panorama (>=1.24.0,<1.25.0)", "mypy-boto3-personalize (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-events (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-pi (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)", "mypy-boto3-polly (>=1.24.0,<1.25.0)", "mypy-boto3-pricing (>=1.24.0,<1.25.0)", "mypy-boto3-privatenetworks (>=1.24.0,<1.25.0)", "mypy-boto3-proton (>=1.24.0,<1.25.0)", "mypy-boto3-qldb (>=1.24.0,<1.25.0)", "mypy-boto3-qldb-session (>=1.24.0,<1.25.0)", "mypy-boto3-quicksight (>=1.24.0,<1.25.0)", "mypy-boto3-ram (>=1.24.0,<1.25.0)", "mypy-boto3-rbin (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", 
"mypy-boto3-rds-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-rekognition (>=1.24.0,<1.25.0)", "mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)", "mypy-boto3-resource-groups (>=1.24.0,<1.25.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)", "mypy-boto3-robomaker (>=1.24.0,<1.25.0)", "mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)", "mypy-boto3-route53 (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)", "mypy-boto3-route53domains (>=1.24.0,<1.25.0)", "mypy-boto3-route53resolver (>=1.24.0,<1.25.0)", "mypy-boto3-rum (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-s3control (>=1.24.0,<1.25.0)", "mypy-boto3-s3outposts (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-savingsplans (>=1.24.0,<1.25.0)", "mypy-boto3-schemas (>=1.24.0,<1.25.0)", "mypy-boto3-sdb (>=1.24.0,<1.25.0)", "mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)", "mypy-boto3-securityhub (>=1.24.0,<1.25.0)", "mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)", "mypy-boto3-service-quotas (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)", "mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)", "mypy-boto3-ses (>=1.24.0,<1.25.0)", "mypy-boto3-sesv2 (>=1.24.0,<1.25.0)", "mypy-boto3-shield (>=1.24.0,<1.25.0)", "mypy-boto3-signer (>=1.24.0,<1.25.0)", "mypy-boto3-sms (>=1.24.0,<1.25.0)", "mypy-boto3-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)", "mypy-boto3-snowball (>=1.24.0,<1.25.0)", "mypy-boto3-sns (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)", "mypy-boto3-ssm (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)", "mypy-boto3-sso (>=1.24.0,<1.25.0)", "mypy-boto3-sso-admin (>=1.24.0,<1.25.0)", "mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)", "mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)", "mypy-boto3-storagegateway (>=1.24.0,<1.25.0)", "mypy-boto3-sts (>=1.24.0,<1.25.0)", "mypy-boto3-support (>=1.24.0,<1.25.0)", "mypy-boto3-support-app (>=1.24.0,<1.25.0)", "mypy-boto3-swf (>=1.24.0,<1.25.0)", "mypy-boto3-synthetics (>=1.24.0,<1.25.0)", "mypy-boto3-textract (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-query (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-write (>=1.24.0,<1.25.0)", "mypy-boto3-transcribe (>=1.24.0,<1.25.0)", "mypy-boto3-transfer (>=1.24.0,<1.25.0)", "mypy-boto3-translate (>=1.24.0,<1.25.0)", "mypy-boto3-voice-id (>=1.24.0,<1.25.0)", "mypy-boto3-waf (>=1.24.0,<1.25.0)", "mypy-boto3-waf-regional (>=1.24.0,<1.25.0)", "mypy-boto3-wafv2 (>=1.24.0,<1.25.0)", "mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)", "mypy-boto3-wisdom (>=1.24.0,<1.25.0)", "mypy-boto3-workdocs (>=1.24.0,<1.25.0)", "mypy-boto3-worklink (>=1.24.0,<1.25.0)", "mypy-boto3-workmail (>=1.24.0,<1.25.0)", "mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)", "mypy-boto3-xray (>=1.24.0,<1.25.0)"] amp = ["mypy-boto3-amp (>=1.24.0,<1.25.0)"] amplify = ["mypy-boto3-amplify 
(>=1.24.0,<1.25.0)"] amplifybackend = ["mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)"] @@ -464,6 +464,7 @@ stepfunctions = ["mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)"] storagegateway = ["mypy-boto3-storagegateway (>=1.24.0,<1.25.0)"] sts = ["mypy-boto3-sts (>=1.24.0,<1.25.0)"] support = ["mypy-boto3-support (>=1.24.0,<1.25.0)"] +support-app = ["mypy-boto3-support-app (>=1.24.0,<1.25.0)"] swf = ["mypy-boto3-swf (>=1.24.0,<1.25.0)"] synthetics = ["mypy-boto3-synthetics (>=1.24.0,<1.25.0)"] textract = ["mypy-boto3-textract (>=1.24.0,<1.25.0)"] @@ -601,11 +602,11 @@ cffi = ">=1.12" [package.extras] docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] -docstest = ["pyenchant (>=1.6.11)", "twine (>=1.12.0)", "sphinxcontrib-spelling (>=4.0.1)"] +docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] sdist = ["setuptools_rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] +test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pytz"] [[package]] name = "docker" @@ -622,8 +623,8 @@ six = ">=1.4.0" websocket-client = ">=0.32.0" [package.extras] -tls = ["idna (>=2.0.0)", "cryptography (>=1.3.4)", "pyOpenSSL (>=17.5.0)"] ssh = ["paramiko (>=2.4.2)"] +tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] [[package]] name = "ecdsa" @@ -723,9 +724,9 @@ python-versions = ">=3.7" zipp = ">=0.5" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"] +docs = ["jaraco.packaging (>=9)", "rst.linker (>=1.9)", "sphinx"] perf = ["ipython"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] +testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] [[package]] name = "iniconfig" @@ -744,10 +745,10 @@ optional = false python-versions = ">=3.6.1,<4.0" [package.extras] -pipfile_deprecated_finder = ["pipreqs", "requirementslib"] -requirements_deprecated_finder = ["pipreqs", "pip-api"] colors = ["colorama (>=0.4.3,<0.5.0)"] +pipfile_deprecated_finder = ["pipreqs", "requirementslib"] plugins = ["setuptools"] +requirements_deprecated_finder = ["pip-api", "pipreqs"] [[package]] name = "itsdangerous" @@ -820,9 +821,9 @@ optional = false python-versions = ">=2.7" [package.extras] -testing = ["pytest-flake8 (>=1.1.1)", "jsonlib", "enum34", "pytest-flake8 (<1.1.0)", "sqlalchemy", "scikit-learn", "pymongo", "pandas", "numpy", "feedparser", "ecdsa", "pytest-cov", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest (>=3.5,!=3.7.3)"] -"testing.libs" = ["yajl", "ujson", "simplejson"] -docs = ["rst.linker (>=1.9)", "jaraco.packaging (>=3.2)", "sphinx"] +docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] +testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", 
"pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] +"testing.libs" = ["simplejson", "ujson", "yajl"] [[package]] name = "jsonpointer" @@ -847,7 +848,7 @@ six = ">=1.11.0" [package.extras] format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format_nongpl = ["idna", "jsonpointer (>1.13)", "webcolors", "rfc3986-validator (>0.1.0)", "rfc3339-validator"] +format_nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] [[package]] name = "junit-xml" @@ -912,28 +913,28 @@ werkzeug = ">=0.5,<2.2.0" xmltodict = "*" [package.extras] -xray = ["setuptools", "aws-xray-sdk (>=0.93,!=0.96)"] -ssm = ["dataclasses", "PyYAML (>=5.1)"] -server = ["flask-cors", "flask (<2.2.0)", "setuptools", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "sshpubkeys (>=3.1.0)", "cfn-lint (>=0.4.0)", "idna (>=2.5,<4)", "aws-xray-sdk (>=0.93,!=0.96)", "jsondiff (>=1.1.2)", "graphql-core", "docker (>=2.5.1)", "ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "PyYAML (>=5.1)"] -s3 = ["PyYAML (>=5.1)"] -route53resolver = ["sshpubkeys (>=3.1.0)"] -iotdata = ["jsondiff (>=1.1.2)"] -glue = ["pyparsing (>=3.0.7)"] -efs = ["sshpubkeys (>=3.1.0)"] -ec2 = ["sshpubkeys (>=3.1.0)"] -ebs = ["sshpubkeys (>=3.1.0)"] -dynamodbstreams = ["docker (>=2.5.1)"] -dynamodb2 = ["docker (>=2.5.1)"] -dynamodb = ["docker (>=2.5.1)"] -ds = ["sshpubkeys (>=3.1.0)"] -cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -cloudformation = ["setuptools", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "sshpubkeys (>=3.1.0)", "cfn-lint (>=0.4.0)", "idna (>=2.5,<4)", "aws-xray-sdk (>=0.93,!=0.96)", "jsondiff (>=1.1.2)", "graphql-core", "docker (>=2.5.1)", "ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "PyYAML (>=5.1)"] -batch = ["docker (>=2.5.1)"] -awslambda = ["docker (>=2.5.1)"] -appsync = ["graphql-core"] +all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] apigatewayv2 = ["PyYAML (>=5.1)"] -apigateway = ["openapi-spec-validator (>=0.2.8)", "ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "PyYAML (>=5.1)"] -all = ["setuptools", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "sshpubkeys (>=3.1.0)", "cfn-lint (>=0.4.0)", "idna (>=2.5,<4)", "aws-xray-sdk (>=0.93,!=0.96)", "jsondiff (>=1.1.2)", "graphql-core", "docker (>=2.5.1)", "ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "PyYAML (>=5.1)"] +appsync = ["graphql-core"] +awslambda = ["docker (>=2.5.1)"] +batch = ["docker (>=2.5.1)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] +ds = ["sshpubkeys (>=3.1.0)"] +dynamodb = ["docker (>=2.5.1)"] +dynamodb2 = ["docker (>=2.5.1)"] +dynamodbstreams = ["docker (>=2.5.1)"] +ebs = 
["sshpubkeys (>=3.1.0)"] +ec2 = ["sshpubkeys (>=3.1.0)"] +efs = ["sshpubkeys (>=3.1.0)"] +glue = ["pyparsing (>=3.0.7)"] +iotdata = ["jsondiff (>=1.1.2)"] +route53resolver = ["sshpubkeys (>=3.1.0)"] +s3 = ["PyYAML (>=5.1)"] +server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (<2.2.0)", "flask-cors", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +ssm = ["PyYAML (>=5.1)", "dataclasses"] +xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] [[package]] name = "mypy" @@ -981,11 +982,11 @@ optional = false python-versions = ">=3.8" [package.extras] -default = ["numpy (>=1.19)", "scipy (>=1.8)", "matplotlib (>=3.4)", "pandas (>=1.3)"] -developer = ["pre-commit (>=2.19)", "mypy (>=0.960)"] -doc = ["sphinx (>=5)", "pydata-sphinx-theme (>=0.9)", "sphinx-gallery (>=0.10)", "numpydoc (>=1.4)", "pillow (>=9.1)", "nb2plots (>=0.6)", "texext (>=0.6.6)"] -extra = ["lxml (>=4.6)", "pygraphviz (>=1.9)", "pydot (>=1.4.2)", "sympy (>=1.10)"] -test = ["pytest (>=7.1)", "pytest-cov (>=3.0)", "codecov (>=2.1)"] +default = ["matplotlib (>=3.4)", "numpy (>=1.19)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=0.960)", "pre-commit (>=2.19)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.4)", "pillow (>=9.1)", "pydata-sphinx-theme (>=0.9)", "sphinx (>=5)", "sphinx-gallery (>=0.10)", "texext (>=0.6.6)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.9)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"] [[package]] name = "openapi-schema-validator" @@ -1000,8 +1001,8 @@ jsonschema = ">=3.0.0,<5.0.0" [package.extras] isodate = ["isodate"] -strict-rfc3339 = ["strict-rfc3339"] rfc3339-validator = ["rfc3339-validator"] +strict-rfc3339 = ["strict-rfc3339"] [[package]] name = "openapi-spec-validator" @@ -1055,8 +1056,8 @@ optional = false python-versions = ">=3.7" [package.extras] -test = ["pytest (>=6)", "pytest-mock (>=3.6)", "pytest-cov (>=2.7)", "appdirs (==1.4.4)"] -docs = ["sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)", "proselint (>=0.10.2)", "furo (>=2021.7.5b38)"] +docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"] +test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"] [[package]] name = "pluggy" @@ -1067,8 +1068,8 @@ optional = false python-versions = ">=3.6" [package.extras] -testing = ["pytest-benchmark", "pytest"] -dev = ["tox", "pre-commit"] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] [[package]] name = "prometheus-client" @@ -1142,9 +1143,9 @@ cryptography = {version = ">=3.3.1", optional = true, markers = "extra == \"cryp [package.extras] crypto = ["cryptography (>=3.3.1)"] -dev = ["sphinx", "sphinx-rtd-theme", "zope.interface", "cryptography (>=3.3.1)", "pytest (>=6.0.0,<7.0.0)", "coverage[toml] (==5.0.4)", "mypy", "pre-commit"] +dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.3.1)", "mypy", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx", "sphinx-rtd-theme", "zope.interface"] docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] -tests = ["pytest (>=6.0.0,<7.0.0)", "coverage[toml] (==5.0.4)"] +tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] [[package]] name = "pyparsing" @@ -1155,7 +1156,7 @@ optional = false python-versions = ">=3.6.8" [package.extras] -diagrams = 
["railroad-diagrams", "jinja2"] +diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pypiwin32" @@ -1209,7 +1210,7 @@ python-versions = ">=3.7" pytest = ">=6.1.0" [package.extras] -testing = ["pytest-trio (>=0.7.0)", "mypy (>=0.931)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "coverage (>=6.2)"] +testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"] [[package]] name = "pytest-forked" @@ -1304,8 +1305,8 @@ rsa = "*" [package.extras] cryptography = ["cryptography (>=3.4.0)"] -pycrypto = ["pycrypto (>=2.6.0,<2.7.0)", "pyasn1"] -pycryptodome = ["pycryptodome (>=3.3.1,<4.0.0)", "pyasn1"] +pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"] +pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] [[package]] name = "pytz" @@ -1362,7 +1363,7 @@ requests = ">=2.0,<3.0" urllib3 = ">=1.25.10" [package.extras] -tests = ["pytest (>=7.0.0)", "coverage (>=6.0.0)", "pytest-cov", "pytest-asyncio", "pytest-localserver", "flake8", "types-mock", "types-requests", "mypy"] +tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-localserver", "types-mock", "types-requests"] [[package]] name = "rsa" @@ -1492,8 +1493,8 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" [package.extras] -brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] -secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -1545,13 +1546,13 @@ optional = false python-versions = ">=3.7" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] +docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"] +testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "badfeff521c68277b10555ab4174847b7315d82818ef5841e600299fb6128698" +content-hash = "ead1495454ee6d880bb240447025db93a25ebe263c2709de5f144cc2d85dc975" [metadata.files] aiopg = [ @@ -1559,12 +1560,12 @@ aiopg = [ {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"}, ] allure-pytest = [ - {file = "allure-pytest-2.9.45.tar.gz", hash = "sha256:20620fde08a597578b157a60ff38bdcc300e312d12eaa38cf28e4a62e22bdaa3"}, - {file = "allure_pytest-2.9.45-py3-none-any.whl", hash = "sha256:9b0325e06f8f79cf03289d4f4d741e57607d0fa12d9c094e243cbb042283f083"}, + {file = "allure-pytest-2.10.0.tar.gz", hash = "sha256:3b2ab67629f4cbd8617abd817d2b22292c6eb7efd5584f992d1af8143aea6ee7"}, + {file = "allure_pytest-2.10.0-py3-none-any.whl", hash = "sha256:08274096594758447db54c3b2c382526ee04f1fe12119cdaee92d2d93c84b530"}, ] allure-python-commons = [ - {file = "allure-python-commons-2.9.45.tar.gz", hash = "sha256:c238d28aeac35e8c7c517d8a2327e25ae5bbf2c30b5e2313d20ef11d75f5549d"}, - {file = 
"allure_python_commons-2.9.45-py3-none-any.whl", hash = "sha256:3572f0526db3946fb14470c58b0b41d343483aad91d37d414e4641815e13691a"}, + {file = "allure-python-commons-2.10.0.tar.gz", hash = "sha256:d4d31344b0f0037a4a11e16b91b28cf0eeb23ffa0e50c27fcfc6aabe72212d3c"}, + {file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"}, ] async-timeout = [ {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, @@ -1635,8 +1636,8 @@ boto3 = [ {file = "boto3-1.24.38.tar.gz", hash = "sha256:f4c6b025f392c934338c7f01badfddbd0d3cf2397ff5df35c31409798dce33f5"}, ] boto3-stubs = [ - {file = "boto3-stubs-1.24.56.tar.gz", hash = "sha256:02e11b3669481469b45eee53fa5e0b587e5710f86bb95bd40667d1353d1e4bf6"}, - {file = "boto3_stubs-1.24.56-py3-none-any.whl", hash = "sha256:e5df3a68ddb8299404f63d19decc1f706ebdac64f3133c1e1cab747820337a75"}, + {file = "boto3-stubs-1.24.58.tar.gz", hash = "sha256:95ab521a9a931cc21d48c97c5bd7de0e37370d9b6a298e3905ec621db9243897"}, + {file = "boto3_stubs-1.24.58-py3-none-any.whl", hash = "sha256:a16940df2a347f7890075af8c0b202b06057bc18ff4c640ef94e09ce4176adb9"}, ] botocore = [ {file = "botocore-1.27.38-py3-none-any.whl", hash = "sha256:46a0264ff3335496bd9cb404f83ec0d8eb7bfdef8f74a830c13e6a6b9612adea"}, diff --git a/pyproject.toml b/pyproject.toml index 2c9270934d..ec166ea7cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" Werkzeug = "2.1.2" pytest-order = "^1.0.1" -allure-pytest = "^2.9.45" +allure-pytest = "^2.10.0" pytest-asyncio = "^0.19.0" [tool.poetry.dev-dependencies] diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 32fd6f19c3..bbc35736bc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -6,12 +6,10 @@ import enum import filecmp import json import os -import pathlib import re import shutil import socket import subprocess -import tarfile import tempfile import textwrap import time @@ -22,7 +20,6 @@ from enum import Flag, auto from pathlib import Path from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, TypeVar, Union, cast -import allure # type: ignore import asyncpg import backoff # type: ignore import boto3 @@ -38,7 +35,14 @@ from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import make_dsn, parse_dsn from typing_extensions import Literal -from .utils import etcd_path, get_self_dir, lsn_from_hex, lsn_to_hex, subprocess_capture +from .utils import ( + allure_attach_from_dir, + etcd_path, + get_self_dir, + lsn_from_hex, + lsn_to_hex, + subprocess_capture, +) """ This file contains pytest fixtures. A fixture is a test resource that can be @@ -99,7 +103,7 @@ def pytest_configure(config): top_output_dir = env_test_output else: top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR) - pathlib.Path(top_output_dir).mkdir(exist_ok=True) + Path(top_output_dir).mkdir(exist_ok=True) # Find the postgres installation. 
global pg_distrib_dir @@ -234,11 +238,12 @@ def default_broker(request: Any, port_distributor: PortDistributor): client_port = port_distributor.get_port() # multiple pytest sessions could get launched in parallel, get them different datadirs etcd_datadir = os.path.join(get_test_output_dir(request), f"etcd_datadir_{client_port}") - pathlib.Path(etcd_datadir).mkdir(exist_ok=True, parents=True) + Path(etcd_datadir).mkdir(exist_ok=True, parents=True) broker = Etcd(datadir=etcd_datadir, port=client_port, peer_port=port_distributor.get_port()) yield broker broker.stop() + allure_attach_from_dir(Path(etcd_datadir)) @pytest.fixture(scope="session") @@ -1882,7 +1887,7 @@ class Postgres(PgProtocol): self.env.neon_cli.pg_create( branch_name, node_name=self.node_name, tenant_id=self.tenant_id, lsn=lsn, port=self.port ) - path = pathlib.Path("pgdatadirs") / "tenants" / self.tenant_id.hex / self.node_name + path = Path("pgdatadirs") / "tenants" / self.tenant_id.hex / self.node_name self.pgdata_dir = os.path.join(self.env.repo_dir, path) if config_lines is None: @@ -1913,7 +1918,7 @@ class Postgres(PgProtocol): def pg_data_dir_path(self) -> str: """Path to data directory""" assert self.node_name - path = pathlib.Path("pgdatadirs") / "tenants" / self.tenant_id.hex / self.node_name + path = Path("pgdatadirs") / "tenants" / self.tenant_id.hex / self.node_name return os.path.join(self.env.repo_dir, path) def pg_xact_dir_path(self) -> str: @@ -2289,7 +2294,7 @@ class Etcd: log.debug(f"etcd is already running on port {self.port}") return - pathlib.Path(self.datadir).mkdir(exist_ok=True) + Path(self.datadir).mkdir(exist_ok=True) if not self.binary_path.is_file(): raise RuntimeError(f"etcd broker binary '{self.binary_path}' is not a file") @@ -2329,26 +2334,16 @@ class Etcd: self.handle.wait() -def get_test_output_dir(request: Any) -> pathlib.Path: +def get_test_output_dir(request: Any) -> Path: """Compute the working directory for an individual test.""" test_name = request.node.name - test_dir = pathlib.Path(top_output_dir) / test_name.replace("/", "-") + test_dir = Path(top_output_dir) / test_name.replace("/", "-") log.info(f"get_test_output_dir is {test_dir}") # make mypy happy - assert isinstance(test_dir, pathlib.Path) + assert isinstance(test_dir, Path) return test_dir -ATTACHMENT_SUFFIXES = frozenset( - ( - ".log", - ".stderr", - ".stdout", - ".diffs", - ) -) - - # This is autouse, so the test output directory always gets created, even # if a test doesn't put anything there. It also solves a problem with the # neon_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it @@ -2359,7 +2354,7 @@ ATTACHMENT_SUFFIXES = frozenset( # this fixture ensures that the directory exists. That works because # 'autouse' fixtures are run before other fixtures. 
@pytest.fixture(scope="function", autouse=True) -def test_output_dir(request: Any) -> Iterator[pathlib.Path]: +def test_output_dir(request: Any) -> Iterator[Path]: """Create the working directory for an individual test.""" # one directory per test @@ -2370,23 +2365,7 @@ def test_output_dir(request: Any) -> Iterator[pathlib.Path]: yield test_dir - for attachment in test_dir.glob("**/*"): - if attachment.suffix in ATTACHMENT_SUFFIXES: - source = str(attachment) - name = str(attachment.relative_to(test_dir)) - attachment_type = "text/plain" - extension = attachment.suffix.removeprefix(".") - - # compress files larger than 1Mb, they're hardly readable in a browser - if attachment.stat().st_size > 1024 * 1024: - source = f"{attachment}.tar.gz" - with tarfile.open(source, "w:gz") as tar: - tar.add(attachment, arcname=attachment.name) - name = f"{name}.tar.gz" - attachment_type = "application/gzip" - extension = "tar.gz" - - allure.attach.file(source, name, attachment_type, extension) + allure_attach_from_dir(test_dir) SKIP_DIRS = frozenset( @@ -2439,7 +2418,7 @@ def should_skip_file(filename: str) -> bool: # # Test helpers # -def list_files_to_compare(pgdata_dir: pathlib.Path): +def list_files_to_compare(pgdata_dir: Path): pgdata_files = [] for root, _file, filenames in os.walk(pgdata_dir): for filename in filenames: @@ -2492,7 +2471,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post # list files we're going to compare assert pg.pgdata_dir - pgdata_files = list_files_to_compare(pathlib.Path(pg.pgdata_dir)) + pgdata_files = list_files_to_compare(Path(pg.pgdata_dir)) restored_files = list_files_to_compare(restored_dir_path) # check that file sets are equal diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 324c62170b..88bf6d634d 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,11 +1,13 @@ import contextlib import os -import pathlib +import re import shutil import subprocess +import tarfile from pathlib import Path from typing import Any, List, Tuple +import allure # type: ignore from fixtures.log_helper import log from psycopg2.extensions import cursor @@ -116,7 +118,7 @@ def get_dir_size(path: str) -> int: return totalbytes -def get_timeline_dir_size(path: pathlib.Path) -> int: +def get_timeline_dir_size(path: Path) -> int: """Get the timeline directory's total size, which only counts the layer files' size.""" sz = 0 for dir_entry in path.iterdir(): @@ -161,3 +163,36 @@ def get_scale_for_db(size_mb: int) -> int: """ return round(0.06689 * size_mb - 0.5) + + +ATTACHMENT_NAME_REGEX = re.compile( + r".+\.log|.+\.stderr|.+\.stdout|.+\.filediff|.+\.metrics|flamegraph\.svg|regression\.diffs" +) + + +def allure_attach_from_dir(dir: Path): + """Attach all non-empty files from `dir` that matches `ATTACHMENT_NAME_REGEX` to Allure report""" + + for attachment in Path(dir).glob("**/*"): + if ATTACHMENT_NAME_REGEX.fullmatch(attachment.name) and attachment.stat().st_size > 0: + source = str(attachment) + name = str(attachment.relative_to(dir)) + + # compress files larger than 1Mb, they're hardly readable in a browser + if attachment.stat().st_size > 1024 * 1024: + source = f"{attachment}.tar.gz" + with tarfile.open(source, "w:gz") as tar: + tar.add(attachment, arcname=attachment.name) + name = f"{name}.tar.gz" + + if source.endswith(".tar.gz"): + attachment_type = "application/gzip" + extension = "tar.gz" + elif source.endswith(".svg"): + attachment_type = "image/svg+xml" + extension = "svg" + else: + 
attachment_type = "text/plain" + extension = attachment.suffix.removeprefix(".") + + allure.attach.file(source, name, attachment_type, extension) From 13beeb59cd0da7b9482a5631cfd74ab23c33c48c Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 1 Sep 2022 12:53:17 +0200 Subject: [PATCH 0710/1022] Update extensions included in compute-node Update PLV8 to 3.1.4 - which is the latest release. Update PostGIS to 3.3.0 Remove PLV8 from the final image -- there is an issue we hit when installing PLV8, and we don't quite know what it is yet. --- Dockerfile.compute-node | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 950ec16016..2e031b17da 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -1,4 +1,7 @@ ARG TAG=pinned +# apparently, ARGs don't get replaced in RUN commands in kaniko +# ARG POSTGIS_VERSION=3.3.0 +# ARG PLV8_VERSION=3.1.4 FROM debian:bullseye-slim AS build-deps RUN apt update && \ @@ -24,9 +27,9 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget -RUN wget https://download.osgeo.org/postgis/source/postgis-3.2.3.tar.gz && \ - tar xvzf postgis-3.2.3.tar.gz && \ - cd postgis-3.2.3 && \ +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ + tar xvzf postgis-3.3.0.tar.gz && \ + cd postgis-3.3.0 && \ ./autogen.sh && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ ./configure && \ @@ -52,18 +55,18 @@ RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.lis apt update && \ apt install -y --no-install-recommends -t testing binutils -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.3.tar.gz && \ - tar xvzf v3.1.3.tar.gz && \ - cd plv8-3.1.3 && \ +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ + tar xvzf v3.1.4.tar.gz && \ + cd plv8-3.1.4 && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ - make && \ - make install && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control # compile neon extensions FROM build-deps AS neon-pg-ext-build -COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 46c8a93976873da6199c0c128969129e2751f9b6 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Thu, 1 Sep 2022 15:06:52 +0300 Subject: [PATCH 0711/1022] Fix PERF_TEST_RESULT_CONNSTR for benchmark init (#2375) --- .github/actions/run-python-test-set/action.yml | 2 +- .github/workflows/benchmarking.yml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 1cc65b4286..2344fba13c 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -24,7 +24,7 @@ inputs: required: false default: 'true' save_perf_report: - description: 'Whether to upload the performance report' + description: 'Whether to upload the performance report, if true PERF_TEST_RESULT_CONNSTR env variable should be set' required: false default: 'false' run_with_real_s3: diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 1370917377..4c58dda6b6 100644 --- 
a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -179,6 +179,8 @@ jobs: env: PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Benchmark simple-update uses: ./.github/actions/run-python-test-set From 15c5f3e6cfe86774c55ced328bd507266d464f0f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 30 Aug 2022 22:18:01 +0300 Subject: [PATCH 0712/1022] Fix misc typos in comments and variable names. --- control_plane/src/safekeeper.rs | 2 +- control_plane/src/storage.rs | 2 +- pageserver/src/layered_repository/timeline.rs | 2 +- pageserver/src/storage_sync/download.rs | 24 +++++++++---------- safekeeper/src/bin/safekeeper.rs | 2 +- test_runner/regress/test_tenant_relocation.py | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 652736058a..2cc1ae7853 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -46,7 +46,7 @@ impl ResponseErrorMessageExt for Response { return Ok(self); } - // reqwest do not export it's error construction utility functions, so lets craft the message ourselves + // reqwest does not export its error construction utility functions, so let's craft the message ourselves let url = self.url().to_owned(); Err(SafekeeperHttpError::Response( match self.json::() { diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index aab29628e3..9fdab5f88c 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -57,7 +57,7 @@ impl ResponseErrorMessageExt for Response { return Ok(self); } - // reqwest do not export it's error construction utility functions, so lets craft the message ourselves + // reqwest does not export its error construction utility functions, so let's craft the message ourselves let url = self.url().to_owned(); Err(PageserverHttpError::Response( match self.json::() { diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 1a941affe5..81bc975272 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -379,7 +379,7 @@ pub struct Timeline { // It is needed in checks when we want to error on some operations // when they are requested for pre-initdb lsn. // It can be unified with latest_gc_cutoff_lsn under some "first_valid_lsn", - // though lets keep them both for better error visibility. + // though let's keep them both for better error visibility. pub initdb_lsn: Lsn, /// When did we last calculate the partitioning? diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 8e6aa47c88..ded4c042c4 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -234,11 +234,11 @@ pub(super) async fn download_timeline_layers<'a>( let mut download_tasks = layers_to_download .into_iter() - .map(|layer_desination_path| async move { - if layer_desination_path.exists() { + .map(|layer_destination_path| async move { + if layer_destination_path.exists() { debug!( "Layer already exists locally, skipping download: {}", - layer_desination_path.display() + layer_destination_path.display() ); } else { // Perform a rename inspired by durable_rename from file_utils.c. 
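For readers unfamiliar with the durable_rename pattern named in the comment above, here is a minimal standalone sketch of the idea. It is not the pageserver's actual code: it uses blocking std::fs instead of the async calls in this diff, and the function and variable names (durable_write, payload, tmp) are illustrative only. The payload goes to a temporary file, which is fsynced, renamed over the final path, and then the parent directory is fsynced so the rename itself survives a crash.

    use std::fs::{self, File};
    use std::io::Write;
    use std::path::Path;

    // Illustrative sketch only: crash-safe write via temp file + rename.
    fn durable_write(dst: &Path, payload: &[u8]) -> std::io::Result<()> {
        let tmp = dst.with_extension("temp");
        let mut file = File::create(&tmp)?;
        file.write_all(payload)?;
        file.sync_all()?; // make sure the temp file's contents reach disk
        drop(file);
        fs::rename(&tmp, dst)?; // atomic replacement on POSIX filesystems
        if let Some(dir) = dst.parent() {
            File::open(dir)?.sync_all()?; // persist the directory entry created by the rename
        }
        Ok(())
    }

The hunks below keep the same shape with async I/O; as the comments in the diff note, a temp file left behind by a crash is simply deleted on startup and the layer is re-downloaded.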
@@ -252,7 +252,7 @@ pub(super) async fn download_timeline_layers<'a>( // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com // If pageserver crashes the temp file will be deleted on startup and re-downloaded. let temp_file_path = - path_with_suffix_extension(&layer_desination_path, TEMP_DOWNLOAD_EXTENSION); + path_with_suffix_extension(&layer_destination_path, TEMP_DOWNLOAD_EXTENSION); let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { @@ -262,7 +262,7 @@ pub(super) async fn download_timeline_layers<'a>( ) })?; - let mut layer_download = download_storage_object(storage, &layer_desination_path) + let mut layer_download = download_storage_object(storage, &layer_destination_path) .await .with_context(|| { format!( @@ -284,9 +284,9 @@ pub(super) async fn download_timeline_layers<'a>( // that have not yet completed. To ensure that a file is closed immediately when it is dropped, // you should call flush before dropping it. // - // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because - // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. - // But for additional safety lets check/wait for any pending operations. + // From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because + // we assume that `destination_file` file is fully written. I.e there is no pending .write(...).await operations. + // But for additional safety let's check/wait for any pending operations. destination_file.flush().await.with_context(|| { format!( "failed to flush source file at {}", @@ -307,16 +307,16 @@ pub(super) async fn download_timeline_layers<'a>( anyhow::bail!("remote-storage-download-pre-rename failpoint triggered") }); - fs::rename(&temp_file_path, &layer_desination_path).await?; + fs::rename(&temp_file_path, &layer_destination_path).await?; - fsync_path(&layer_desination_path).await.with_context(|| { + fsync_path(&layer_destination_path).await.with_context(|| { format!( "Cannot fsync layer destination path {}", - layer_desination_path.display(), + layer_destination_path.display(), ) })?; } - Ok::<_, anyhow::Error>(layer_desination_path) + Ok::<_, anyhow::Error>(layer_destination_path) }) .collect::>(); diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 6c9c59c76b..244c793250 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -70,7 +70,7 @@ fn main() -> anyhow::Result<()> { .help(formatcp!("http endpoint address for metrics on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")), ) // FIXME this argument is no longer needed since pageserver address is forwarded from compute. - // However because this argument is in use by console's e2e tests lets keep it for now and remove separately. + // However because this argument is in use by console's e2e tests let's keep it for now and remove separately. // So currently it is a noop. 
.arg( Arg::new("pageserver") diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 4d949e0c13..19b0ec05a7 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -170,7 +170,7 @@ def check_timeline_attached( new_timeline_detail = assert_timeline_local(new_pageserver_http_client, tenant_id, timeline_id) # when load is active these checks can break because lsns are not static - # so lets check with some margin + # so let's check with some margin assert_abs_margin_ratio( lsn_from_hex(new_timeline_detail["local"]["disk_consistent_lsn"]), lsn_from_hex(old_timeline_detail["local"]["disk_consistent_lsn"]), From 40813adba2515372eb6f3a612c5a453a0ab84d03 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 1 Sep 2022 21:51:48 +0300 Subject: [PATCH 0713/1022] Prevent creation of empty layers with duplicates (#2327) * Prevent creation of empty layers with duplicates * Add comments --- pageserver/src/layered_repository/timeline.rs | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 81bc975272..a624a3ccf5 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -1795,47 +1795,54 @@ impl Timeline { if !same_key { dup_end_lsn = Lsn::INVALID; } - // Determine size occupied by this key. We stop at next key, or when size becomes larger than target_file_size + // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { next_key_size = next_size; if key != next_key { if dup_end_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; + // We are writing a segment with duplicates: + // place all remaining values of this key in separate segment dup_start_lsn = dup_end_lsn; // new segment starts where old stops + dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range } break; } key_values_total_size += next_size; - if key_values_total_size > target_file_size { - // split key between multiple layers: such layer can contain only single key + // Check if it is time to split segment: if total keys size is larger than target file size. + // We need to avoid generation of empty segments if next_size > target_file_size. + if key_values_total_size > target_file_size && lsn != next_lsn { + // Split key between multiple layers: such layer can contain only single key dup_start_lsn = if dup_end_lsn.is_valid() { - dup_end_lsn + dup_end_lsn // new segment with duplicates starts where old one stops } else { - lsn + lsn // start with the first LSN for this key }; - dup_end_lsn = next_lsn; + dup_end_lsn = next_lsn; // upper LSN boundary is exclusive break; } } - // handle case when loop reaches last key + // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { dup_start_lsn = dup_end_lsn; dup_end_lsn = lsn_range.end; } if writer.is_some() { let written_size = writer.as_mut().unwrap().size(); - // check if key cause layer overflow + // check if key cause layer overflow... if is_dup_layer || dup_end_lsn.is_valid() || written_size + key_values_total_size > target_file_size { + // ... 
if so, flush previous layer and prepare to write new one new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?); writer = None; } } + // Remember size of key value because at next iteration we will access next item key_values_total_size = next_key_size; } if writer.is_none() { + // Create writer if not initiaized yet writer = Some(DeltaLayerWriter::new( self.conf, self.timeline_id, From f0a0d7bb7ad470e72cba404a7e857e20a8b6de55 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 2 Sep 2022 00:34:37 +0300 Subject: [PATCH 0714/1022] Split RcuWriteGuard::store() into two stages: store and wait. This makes it easier to explain which stages allow concurrent readers and writers. Expand the comments with examples, too. --- libs/utils/src/simple_rcu.rs | 146 +++++++++++++----- pageserver/src/layered_repository/timeline.rs | 4 +- 2 files changed, 111 insertions(+), 39 deletions(-) diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index 24423815ab..177a839d75 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -9,6 +9,36 @@ //! This implementation isn't wait-free; it uses an RwLock that is held for a //! short duration when the value is read or updated. //! +//! # Examples +//! +//! Read a value and do things with it while holding the guard: +//! +//! ``` +//! # let rcu = utils::simple_rcu::Rcu::new(1); +//! { +//! let read = rcu.read(); +//! println!("the current value is {}", *read); +//! // exiting the scope drops the read-guard, and allows concurrent writers +//! // to finish. +//! } +//! ``` +//! +//! Increment the value by one, and wait for old readers to finish: +//! +//! ``` +//! # let rcu = utils::simple_rcu::Rcu::new(1); +//! let write_guard = rcu.lock_for_write(); +//! +//! // NB: holding `write_guard` blocks new readers and writers. Keep this section short! +//! let new_value = *write_guard + 1; +//! +//! let waitlist = write_guard.store_and_unlock(new_value); // consumes `write_guard` +//! +//! // Concurrent reads and writes are now possible again. Wait for all the readers +//! // that still observe the old value to finish. +//! waitlist.wait(); +//! ``` +//! #![warn(missing_docs)] use std::ops::Deref; @@ -84,9 +114,10 @@ impl Rcu { /// used to read the current value, and to store a new value. /// /// Note: holding the write-guard blocks concurrent readers, so you should - /// finish the update and drop the guard quickly! + /// finish the update and drop the guard quickly! Multiple writers can be + /// waiting on the RcuWriteGuard::store step at the same time, however. /// - pub fn write(&self) -> RcuWriteGuard<'_, V> { + pub fn lock_for_write(&self) -> RcuWriteGuard<'_, V> { let inner = self.inner.write().unwrap(); RcuWriteGuard { inner } } @@ -108,7 +139,13 @@ impl Deref for RcuReadGuard { } /// -/// Read guard returned by `read` +/// Write guard returned by `write` +/// +/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so +/// it should only be held for a short duration! +/// +/// Calling `store` consumes the guard, making new reads and new writes possible +/// again. /// pub struct RcuWriteGuard<'a, V> { inner: RwLockWriteGuard<'a, RcuInner>, @@ -126,13 +163,11 @@ impl<'a, V> RcuWriteGuard<'a, V> { /// /// Store a new value. The new value will be written to the Rcu immediately, /// and will be immediately seen by any `read` calls that start afterwards. 
- /// But if there are any readers still holding onto the old value, or any - /// even older values, this will await until they have been released. /// - /// This will drop the write-guard before it starts waiting for the reads to - /// finish, so a new write operation can begin before this functio returns. + /// Returns a list of readers that can see old values. You can call `wait()` + /// on it to wait for them to finish. /// - pub fn store(mut self, new_val: V) { + pub fn store_and_unlock(mut self, new_val: V) -> RcuWaitList { let new_cell = Arc::new(RcuCell::new(new_val)); let mut watches = Vec::new(); @@ -151,11 +186,23 @@ impl<'a, V> RcuWriteGuard<'a, V> { } }); } - drop(self); + RcuWaitList(watches) + } +} +/// +/// List of readers who can still see old values. +/// +pub struct RcuWaitList(Vec>); + +impl RcuWaitList { + /// + /// Wait for old readers to finish. + /// + pub fn wait(mut self) { // after all the old_cells are no longer in use, we're done - for w in watches.iter_mut() { - // This will block until the Receiver is closed. That happens then + for w in self.0.iter_mut() { + // This will block until the Receiver is closed. That happens when // the RcuCell is dropped. #[allow(clippy::single_match)] match w.send(()) { @@ -177,41 +224,66 @@ mod tests { use std::time::Duration; #[test] - fn basic() { - let rcu = Arc::new(Rcu::new(1)); + fn two_writers() { + let rcu = Rcu::new(1); + + let read1 = rcu.read(); + assert_eq!(*read1, 1); + + let write2 = rcu.lock_for_write(); + assert_eq!(*write2, 1); + let wait2 = write2.store_and_unlock(2); + + let read2 = rcu.read(); + assert_eq!(*read2, 2); + + let write3 = rcu.lock_for_write(); + assert_eq!(*write3, 2); + let wait3 = write3.store_and_unlock(3); + + // new reader can see the new value, and old readers continue to see the old values. + let read3 = rcu.read(); + assert_eq!(*read3, 3); + assert_eq!(*read2, 2); + assert_eq!(*read1, 1); + let log = Arc::new(Mutex::new(Vec::new())); - - let a = rcu.read(); - assert_eq!(*a, 1); - log.lock().unwrap().push("one"); - - let (rcu_clone, log_clone) = (Arc::clone(&rcu), Arc::clone(&log)); - let thread = spawn(move || { - log_clone.lock().unwrap().push("store two start"); - let write_guard = rcu_clone.write(); - assert_eq!(*write_guard, 1); - write_guard.store(2); - log_clone.lock().unwrap().push("store two done"); + // Wait for the old readers to finish in separate threads. + let log_clone = Arc::clone(&log); + let thread2 = spawn(move || { + wait2.wait(); + log_clone.lock().unwrap().push("wait2 done"); }); + let log_clone = Arc::clone(&log); + let thread3 = spawn(move || { + wait3.wait(); + log_clone.lock().unwrap().push("wait3 done"); + }); + // without this sleep the test can pass on accident if the writer is slow - sleep(Duration::from_secs(1)); + sleep(Duration::from_millis(500)); - // new read should see the new value - let b = rcu.read(); - assert_eq!(*b, 2); + // Release first reader. This allows first write to finish, but calling + // wait() on the second one would still block. + log.lock().unwrap().push("dropping read1"); + drop(read1); + thread2.join().unwrap(); - // old guard still sees the old value - assert_eq!(*a, 1); + sleep(Duration::from_millis(500)); - // Release the old guard. This lets the store in the thread to finish. - log.lock().unwrap().push("release a"); - drop(a); - - thread.join().unwrap(); + // Release second reader, and finish second writer. 
+ log.lock().unwrap().push("dropping read2"); + drop(read2); + thread3.join().unwrap(); assert_eq!( log.lock().unwrap().as_slice(), - &["one", "store two start", "release a", "store two done",] + &[ + "dropping read1", + "wait2 done", + "dropping read2", + "wait3 done" + ] ); } } diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index a624a3ccf5..8b90cc4e6b 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -2046,14 +2046,14 @@ impl Timeline { // // The GC cutoff should only ever move forwards. { - let write_guard = self.latest_gc_cutoff_lsn.write(); + let write_guard = self.latest_gc_cutoff_lsn.lock_for_write(); ensure!( *write_guard <= new_gc_cutoff, "Cannot move GC cutoff LSN backwards (was {}, new {})", *write_guard, new_gc_cutoff ); - write_guard.store(new_gc_cutoff); + write_guard.store_and_unlock(new_gc_cutoff).wait(); } info!("GC starting"); From 47bd307cb8c2941bf66405b4580a11099f4dfe3f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 2 Sep 2022 10:16:47 +0300 Subject: [PATCH 0715/1022] Add python types to represent LSNs, tenant IDs and timeline IDs. (#2351) For better ergonomics. I always found it weird that we used UUID to actually mean a tenant or timeline ID. It worked because it happened to have the same length, 16 bytes, but it was hacky. --- test_runner/fixtures/benchmark_fixture.py | 6 +- test_runner/fixtures/neon_fixtures.py | 272 +++++++++--------- test_runner/fixtures/types.py | 89 ++++++ test_runner/fixtures/utils.py | 11 - .../performance/test_wal_backpressure.py | 10 +- test_runner/regress/test_ancestor_branch.py | 9 +- test_runner/regress/test_auth.py | 6 +- test_runner/regress/test_branch_and_gc.py | 15 +- test_runner/regress/test_branch_behind.py | 19 +- test_runner/regress/test_broken_timeline.py | 14 +- test_runner/regress/test_fullbackup.py | 7 +- test_runner/regress/test_gc_aggressive.py | 9 +- test_runner/regress/test_import.py | 42 +-- test_runner/regress/test_lsn_mapping.py | 6 +- test_runner/regress/test_neon_cli.py | 18 +- test_runner/regress/test_old_request_lsn.py | 9 +- test_runner/regress/test_pageserver_api.py | 33 ++- test_runner/regress/test_pitr_gc.py | 7 +- test_runner/regress/test_readonly_node.py | 9 +- test_runner/regress/test_remote_storage.py | 28 +- test_runner/regress/test_tenant_conf.py | 10 +- test_runner/regress/test_tenant_detach.py | 22 +- test_runner/regress/test_tenant_relocation.py | 55 ++-- test_runner/regress/test_tenant_tasks.py | 11 +- test_runner/regress/test_tenants.py | 18 +- .../test_tenants_with_remote_storage.py | 18 +- test_runner/regress/test_timeline_delete.py | 17 +- test_runner/regress/test_timeline_size.py | 30 +- test_runner/regress/test_wal_acceptor.py | 262 ++++++++--------- .../regress/test_wal_acceptor_async.py | 28 +- test_runner/regress/test_wal_restore.py | 3 +- 31 files changed, 599 insertions(+), 494 deletions(-) create mode 100644 test_runner/fixtures/types.py diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 655ffed90d..338cc47ea2 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -5,7 +5,6 @@ import json import os import re import timeit -import uuid import warnings from contextlib import contextmanager from datetime import datetime @@ -17,6 +16,7 @@ from typing import Iterator, Optional import pytest from _pytest.config import Config from _pytest.terminal import 
TerminalReporter +from fixtures.types import ZTenantId, ZTimelineId """ This file contains fixtures for micro-benchmarks. @@ -365,11 +365,11 @@ class NeonBenchmarker: assert matches return int(round(float(matches.group(1)))) - def get_timeline_size(self, repo_dir: Path, tenantid: uuid.UUID, timelineid: str): + def get_timeline_size(self, repo_dir: Path, tenantid: ZTenantId, timelineid: ZTimelineId): """ Calculate the on-disk size of a timeline """ - path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid.hex, timelineid) + path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid) totalbytes = 0 for root, dirs, files in os.walk(path): diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index bbc35736bc..9ad9c0cd2f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -29,20 +29,14 @@ import pytest import requests from cached_property import cached_property from fixtures.log_helper import log +from fixtures.types import Lsn, ZTenantId, ZTimelineId # Type-related stuff from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import make_dsn, parse_dsn from typing_extensions import Literal -from .utils import ( - allure_attach_from_dir, - etcd_path, - get_self_dir, - lsn_from_hex, - lsn_to_hex, - subprocess_capture, -) +from .utils import allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture """ This file contains pytest fixtures. A fixture is a test resource that can be @@ -378,7 +372,7 @@ class AuthKeys: def generate_tenant_token(self, tenant_id): token = jwt.encode( - {"scope": "tenant", "tenant_id": tenant_id}, self.priv, algorithm="RS256" + {"scope": "tenant", "tenant_id": str(tenant_id)}, self.priv, algorithm="RS256" ) if isinstance(token, bytes): @@ -759,12 +753,12 @@ class NeonEnv: # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. 
- self.initial_tenant = uuid.uuid4() + self.initial_tenant = ZTenantId.generate() # Create a config file corresponding to the options toml = textwrap.dedent( f""" - default_tenant_id = '{self.initial_tenant.hex}' + default_tenant_id = '{self.initial_tenant}' """ ) @@ -846,9 +840,9 @@ class NeonEnv: """Get list of safekeeper endpoints suitable for safekeepers GUC""" return ",".join([f"localhost:{wa.port.pg}" for wa in self.safekeepers]) - def timeline_dir(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Path: + def timeline_dir(self, tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" - return self.repo_dir / "tenants" / tenant_id.hex / "timelines" / timeline_id.hex + return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) @cached_property def auth_keys(self) -> AuthKeys: @@ -976,11 +970,11 @@ class NeonPageserverHttpClient(requests.Session): assert isinstance(res_json, list) return res_json - def tenant_create(self, new_tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: + def tenant_create(self, new_tenant_id: Optional[ZTenantId] = None) -> ZTenantId: res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ - "new_tenant_id": new_tenant_id.hex if new_tenant_id else None, + "new_tenant_id": str(new_tenant_id) if new_tenant_id else None, }, ) self.verbose_error(res) @@ -988,25 +982,25 @@ class NeonPageserverHttpClient(requests.Session): raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") new_tenant_id = res.json() assert isinstance(new_tenant_id, str) - return uuid.UUID(new_tenant_id) + return ZTenantId(new_tenant_id) - def tenant_attach(self, tenant_id: uuid.UUID): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/attach") + def tenant_attach(self, tenant_id: ZTenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach") self.verbose_error(res) - def tenant_detach(self, tenant_id: uuid.UUID): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/detach") + def tenant_detach(self, tenant_id: ZTenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach") self.verbose_error(res) - def tenant_status(self, tenant_id: uuid.UUID) -> Dict[Any, Any]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}") + def tenant_status(self, tenant_id: ZTenantId) -> Dict[Any, Any]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) return res_json - def timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[str, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline") + def timeline_list(self, tenant_id: ZTenantId) -> List[Dict[str, Any]]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline") self.verbose_error(res) res_json = res.json() assert isinstance(res_json, list) @@ -1014,17 +1008,17 @@ class NeonPageserverHttpClient(requests.Session): def timeline_create( self, - tenant_id: uuid.UUID, - new_timeline_id: Optional[uuid.UUID] = None, - ancestor_timeline_id: Optional[uuid.UUID] = None, - ancestor_start_lsn: Optional[str] = None, + tenant_id: ZTenantId, + new_timeline_id: Optional[ZTimelineId] = None, + ancestor_timeline_id: Optional[ZTimelineId] = None, + ancestor_start_lsn: Optional[Lsn] = None, ) -> Dict[Any, Any]: res = self.post( - 
f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json={ - "new_timeline_id": new_timeline_id.hex if new_timeline_id else None, - "ancestor_start_lsn": ancestor_start_lsn, - "ancestor_timeline_id": ancestor_timeline_id.hex if ancestor_timeline_id else None, + "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, + "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, + "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, }, ) self.verbose_error(res) @@ -1037,8 +1031,8 @@ class NeonPageserverHttpClient(requests.Session): def timeline_detail( self, - tenant_id: uuid.UUID, - timeline_id: uuid.UUID, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, include_non_incremental_logical_size: bool = False, include_non_incremental_physical_size: bool = False, ) -> Dict[Any, Any]: @@ -1049,7 +1043,7 @@ class NeonPageserverHttpClient(requests.Session): params["include-non-incremental-physical-size"] = "yes" res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", params=params, ) self.verbose_error(res) @@ -1057,9 +1051,9 @@ class NeonPageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def timeline_delete(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): + def timeline_delete(self, tenant_id: ZTenantId, timeline_id: ZTimelineId): res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}" + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" ) self.verbose_error(res) res_json = res.json() @@ -1179,38 +1173,52 @@ class NeonCli(AbstractNeonCli): def create_tenant( self, - tenant_id: Optional[uuid.UUID] = None, - timeline_id: Optional[uuid.UUID] = None, + tenant_id: Optional[ZTenantId] = None, + timeline_id: Optional[ZTimelineId] = None, conf: Optional[Dict[str, str]] = None, - ) -> Tuple[uuid.UUID, uuid.UUID]: + ) -> Tuple[ZTenantId, ZTimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. """ if tenant_id is None: - tenant_id = uuid.uuid4() + tenant_id = ZTenantId.generate() if timeline_id is None: - timeline_id = uuid.uuid4() + timeline_id = ZTimelineId.generate() if conf is None: res = self.raw_cli( - ["tenant", "create", "--tenant-id", tenant_id.hex, "--timeline-id", timeline_id.hex] + [ + "tenant", + "create", + "--tenant-id", + str(tenant_id), + "--timeline-id", + str(timeline_id), + ] ) else: res = self.raw_cli( - ["tenant", "create", "--tenant-id", tenant_id.hex, "--timeline-id", timeline_id.hex] + [ + "tenant", + "create", + "--tenant-id", + str(tenant_id), + "--timeline-id", + str(timeline_id), + ] + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) ) res.check_returncode() return tenant_id, timeline_id - def config_tenant(self, tenant_id: uuid.UUID, conf: Dict[str, str]): + def config_tenant(self, tenant_id: ZTenantId, conf: Dict[str, str]): """ Update tenant config. 
""" if conf is None: - res = self.raw_cli(["tenant", "config", "--tenant-id", tenant_id.hex]) + res = self.raw_cli(["tenant", "config", "--tenant-id", str(tenant_id)]) else: res = self.raw_cli( - ["tenant", "config", "--tenant-id", tenant_id.hex] + ["tenant", "config", "--tenant-id", str(tenant_id)] + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) ) res.check_returncode() @@ -1221,15 +1229,15 @@ class NeonCli(AbstractNeonCli): return res def create_timeline( - self, new_branch_name: str, tenant_id: Optional[uuid.UUID] = None - ) -> uuid.UUID: + self, new_branch_name: str, tenant_id: Optional[ZTenantId] = None + ) -> ZTimelineId: cmd = [ "timeline", "create", "--branch-name", new_branch_name, "--tenant-id", - (tenant_id or self.env.initial_tenant).hex, + str(tenant_id or self.env.initial_tenant), ] res = self.raw_cli(cmd) @@ -1241,16 +1249,16 @@ class NeonCli(AbstractNeonCli): if matches is not None: created_timeline_id = matches.group("timeline_id") - return uuid.UUID(created_timeline_id) + return ZTimelineId(str(created_timeline_id)) - def create_root_branch(self, branch_name: str, tenant_id: Optional[uuid.UUID] = None): + def create_root_branch(self, branch_name: str, tenant_id: Optional[ZTenantId] = None): cmd = [ "timeline", "create", "--branch-name", branch_name, "--tenant-id", - (tenant_id or self.env.initial_tenant).hex, + str(tenant_id or self.env.initial_tenant), ] res = self.raw_cli(cmd) @@ -1265,27 +1273,27 @@ class NeonCli(AbstractNeonCli): if created_timeline_id is None: raise Exception("could not find timeline id after `neon timeline create` invocation") else: - return uuid.UUID(created_timeline_id) + return ZTimelineId(created_timeline_id) def create_branch( self, new_branch_name: str = DEFAULT_BRANCH_NAME, ancestor_branch_name: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - ancestor_start_lsn: Optional[str] = None, - ) -> uuid.UUID: + tenant_id: Optional[ZTenantId] = None, + ancestor_start_lsn: Optional[Lsn] = None, + ) -> ZTimelineId: cmd = [ "timeline", "branch", "--branch-name", new_branch_name, "--tenant-id", - (tenant_id or self.env.initial_tenant).hex, + str(tenant_id or self.env.initial_tenant), ] if ancestor_branch_name is not None: cmd.extend(["--ancestor-branch-name", ancestor_branch_name]) if ancestor_start_lsn is not None: - cmd.extend(["--ancestor-start-lsn", ancestor_start_lsn]) + cmd.extend(["--ancestor-start-lsn", str(ancestor_start_lsn)]) res = self.raw_cli(cmd) res.check_returncode() @@ -1299,9 +1307,11 @@ class NeonCli(AbstractNeonCli): if created_timeline_id is None: raise Exception("could not find timeline id after `neon timeline create` invocation") else: - return uuid.UUID(created_timeline_id) + return ZTimelineId(str(created_timeline_id)) - def list_timelines(self, tenant_id: Optional[uuid.UUID] = None) -> List[Tuple[str, str]]: + def list_timelines( + self, tenant_id: Optional[ZTenantId] = None + ) -> List[Tuple[str, ZTimelineId]]: """ Returns a list of (branch_name, timeline_id) tuples out of parsed `neon timeline list` CLI output. 
""" @@ -1309,18 +1319,18 @@ class NeonCli(AbstractNeonCli): # (L) main [b49f7954224a0ad25cc0013ea107b54b] # (L) ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] res = self.raw_cli( - ["timeline", "list", "--tenant-id", (tenant_id or self.env.initial_tenant).hex] + ["timeline", "list", "--tenant-id", str(tenant_id or self.env.initial_tenant)] ) timelines_cli = sorted( map( - lambda branch_and_id: (branch_and_id[0], branch_and_id[1]), + lambda branch_and_id: (branch_and_id[0], ZTimelineId(branch_and_id[1])), TIMELINE_DATA_EXTRACTOR.findall(res.stdout), ) ) return timelines_cli def init( - self, config_toml: str, initial_timeline_id: Optional[uuid.UUID] = None + self, config_toml: str, initial_timeline_id: Optional[ZTimelineId] = None ) -> "subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: tmp.write(config_toml) @@ -1328,7 +1338,7 @@ class NeonCli(AbstractNeonCli): cmd = ["init", f"--config={tmp.name}"] if initial_timeline_id: - cmd.extend(["--timeline-id", initial_timeline_id.hex]) + cmd.extend(["--timeline-id", str(initial_timeline_id)]) append_pageserver_param_overrides( params_to_update=cmd, remote_storage=self.env.remote_storage, @@ -1399,20 +1409,20 @@ class NeonCli(AbstractNeonCli): self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - lsn: Optional[str] = None, + tenant_id: Optional[ZTenantId] = None, + lsn: Optional[Lsn] = None, port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "pg", "create", "--tenant-id", - (tenant_id or self.env.initial_tenant).hex, + str(tenant_id or self.env.initial_tenant), "--branch-name", branch_name, ] if lsn is not None: - args.extend(["--lsn", lsn]) + args.extend(["--lsn", str(lsn)]) if port is not None: args.extend(["--port", str(port)]) if node_name is not None: @@ -1425,15 +1435,15 @@ class NeonCli(AbstractNeonCli): def pg_start( self, node_name: str, - tenant_id: Optional[uuid.UUID] = None, - lsn: Optional[str] = None, + tenant_id: Optional[ZTenantId] = None, + lsn: Optional[Lsn] = None, port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "pg", "start", "--tenant-id", - (tenant_id or self.env.initial_tenant).hex, + str(tenant_id or self.env.initial_tenant), ] if lsn is not None: args.append(f"--lsn={lsn}") @@ -1449,7 +1459,7 @@ class NeonCli(AbstractNeonCli): def pg_stop( self, node_name: str, - tenant_id: Optional[uuid.UUID] = None, + tenant_id: Optional[ZTenantId] = None, destroy=False, check_return_code=True, ) -> "subprocess.CompletedProcess[str]": @@ -1457,7 +1467,7 @@ class NeonCli(AbstractNeonCli): "pg", "stop", "--tenant-id", - (tenant_id or self.env.initial_tenant).hex, + str(tenant_id or self.env.initial_tenant), ] if destroy: args.append("--destroy") @@ -1856,7 +1866,7 @@ class Postgres(PgProtocol): """An object representing a running postgres daemon.""" def __init__( - self, env: NeonEnv, tenant_id: uuid.UUID, port: int, check_stop_result: bool = True + self, env: NeonEnv, tenant_id: ZTenantId, port: int, check_stop_result: bool = True ): super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres") self.env = env @@ -1872,7 +1882,7 @@ class Postgres(PgProtocol): self, branch_name: str, node_name: Optional[str] = None, - lsn: Optional[str] = None, + lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> "Postgres": """ @@ -1887,7 +1897,7 @@ class Postgres(PgProtocol): self.env.neon_cli.pg_create( branch_name, node_name=self.node_name, 
tenant_id=self.tenant_id, lsn=lsn, port=self.port ) - path = Path("pgdatadirs") / "tenants" / self.tenant_id.hex / self.node_name + path = Path("pgdatadirs") / "tenants" / str(self.tenant_id) / self.node_name self.pgdata_dir = os.path.join(self.env.repo_dir, path) if config_lines is None: @@ -1918,7 +1928,7 @@ class Postgres(PgProtocol): def pg_data_dir_path(self) -> str: """Path to data directory""" assert self.node_name - path = Path("pgdatadirs") / "tenants" / self.tenant_id.hex / self.node_name + path = Path("pgdatadirs") / "tenants" / str(self.tenant_id) / self.node_name return os.path.join(self.env.repo_dir, path) def pg_xact_dir_path(self) -> str: @@ -2005,7 +2015,7 @@ class Postgres(PgProtocol): self, branch_name: str, node_name: Optional[str] = None, - lsn: Optional[str] = None, + lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> "Postgres": """ @@ -2046,8 +2056,8 @@ class PostgresFactory: self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - lsn: Optional[str] = None, + tenant_id: Optional[ZTenantId] = None, + lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> Postgres: @@ -2070,8 +2080,8 @@ class PostgresFactory: self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - lsn: Optional[str] = None, + tenant_id: Optional[ZTenantId] = None, + lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> Postgres: @@ -2146,7 +2156,7 @@ class Safekeeper: return self def append_logical_message( - self, tenant_id: uuid.UUID, timeline_id: uuid.UUID, request: Dict[str, Any] + self, tenant_id: ZTenantId, timeline_id: ZTimelineId, request: Dict[str, Any] ) -> Dict[str, Any]: """ Send JSON_CTRL query to append LogicalMessage to WAL and modify @@ -2156,7 +2166,7 @@ class Safekeeper: # "replication=0" hacks psycopg not to send additional queries # on startup, see https://github.com/psycopg/psycopg2/pull/482 - connstr = f"host=localhost port={self.port.pg} replication=0 options='-c ztimelineid={timeline_id.hex} ztenantid={tenant_id.hex}'" + connstr = f"host=localhost port={self.port.pg} replication=0 options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'" with closing(psycopg2.connect(connstr)) as conn: # server doesn't support transactions @@ -2181,18 +2191,18 @@ class Safekeeper: @dataclass class SafekeeperTimelineStatus: acceptor_epoch: int - flush_lsn: str - timeline_start_lsn: str - backup_lsn: str - remote_consistent_lsn: str + flush_lsn: Lsn + timeline_start_lsn: Lsn + backup_lsn: Lsn + remote_consistent_lsn: Lsn @dataclass class SafekeeperMetrics: # These are metrics from Prometheus which uses float64 internally. # As a consequence, values may differ from real original int64s. 
- flush_lsn_inexact: Dict[Tuple[str, str], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[str, str], int] = field(default_factory=dict) + flush_lsn_inexact: Dict[Tuple[ZTenantId, ZTimelineId], int] = field(default_factory=dict) + commit_lsn_inexact: Dict[Tuple[ZTenantId, ZTimelineId], int] = field(default_factory=dict) class SafekeeperHttpClient(requests.Session): @@ -2209,26 +2219,30 @@ class SafekeeperHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() - def timeline_status(self, tenant_id: str, timeline_id: str) -> SafekeeperTimelineStatus: + def timeline_status( + self, tenant_id: ZTenantId, timeline_id: ZTimelineId + ) -> SafekeeperTimelineStatus: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() resj = res.json() return SafekeeperTimelineStatus( acceptor_epoch=resj["acceptor_state"]["epoch"], - flush_lsn=resj["flush_lsn"], - timeline_start_lsn=resj["timeline_start_lsn"], - backup_lsn=resj["backup_lsn"], - remote_consistent_lsn=resj["remote_consistent_lsn"], + flush_lsn=Lsn(resj["flush_lsn"]), + timeline_start_lsn=Lsn(resj["timeline_start_lsn"]), + backup_lsn=Lsn(resj["backup_lsn"]), + remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), ) - def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body): + def record_safekeeper_info(self, tenant_id: ZTenantId, timeline_id: ZTimelineId, body): res = self.post( f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", json=body, ) res.raise_for_status() - def timeline_delete_force(self, tenant_id: str, timeline_id: str) -> Dict[Any, Any]: + def timeline_delete_force( + self, tenant_id: ZTenantId, timeline_id: ZTimelineId + ) -> Dict[Any, Any]: res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" ) @@ -2237,7 +2251,7 @@ class SafekeeperHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def tenant_delete_force(self, tenant_id: str) -> Dict[Any, Any]: + def tenant_delete_force(self, tenant_id: ZTenantId) -> Dict[Any, Any]: res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") res.raise_for_status() res_json = res.json() @@ -2258,13 +2272,17 @@ class SafekeeperHttpClient(requests.Session): all_metrics_text, re.MULTILINE, ): - metrics.flush_lsn_inexact[(match.group(1), match.group(2))] = int(match.group(3)) + metrics.flush_lsn_inexact[ + (ZTenantId(match.group(1)), ZTimelineId(match.group(2))) + ] = int(match.group(3)) for match in re.finditer( r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', all_metrics_text, re.MULTILINE, ): - metrics.commit_lsn_inexact[(match.group(1), match.group(2))] = int(match.group(3)) + metrics.commit_lsn_inexact[ + (ZTenantId(match.group(1)), ZTimelineId(match.group(2))) + ] = int(match.group(3)) return metrics @@ -2437,7 +2455,7 @@ def list_files_to_compare(pgdata_dir: Path): # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Postgres): # Get the timeline ID. 
We need it for the 'basebackup' command - timeline = pg.safe_psql("SHOW neon.timeline_id")[0][0] + timeline = ZTimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0]) # stop postgres to ensure that files won't change pg.stop() @@ -2453,7 +2471,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post {psql_path} \ --no-psqlrc \ postgres://localhost:{env.pageserver.service_port.pg} \ - -c 'basebackup {pg.tenant_id.hex} {timeline}' \ + -c 'basebackup {pg.tenant_id} {timeline}' \ | tar -x -C {restored_dir_path} """ @@ -2521,7 +2539,7 @@ def wait_until(number_of_iterations: int, interval: float, func): def assert_timeline_local( - pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID + pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId ): timeline_detail = pageserver_http_client.timeline_detail( tenant, @@ -2535,33 +2553,33 @@ def assert_timeline_local( def assert_no_in_progress_downloads_for_tenant( pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, + tenant: ZTenantId, ): tenant_status = pageserver_http_client.tenant_status(tenant) assert tenant_status["has_in_progress_downloads"] is False, tenant_status def remote_consistent_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID -) -> int: + pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId +) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) if detail["remote"] is None: # No remote information at all. This happens right after creating # a timeline, before any part of it has been uploaded to remote # storage yet. - return 0 + return Lsn(0) else: lsn_str = detail["remote"]["remote_consistent_lsn"] assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) + return Lsn(lsn_str) def wait_for_upload( pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int, + tenant: ZTenantId, + timeline: ZTimelineId, + lsn: Lsn, ): """waits for local timeline upload up to specified lsn""" for i in range(20): @@ -2570,32 +2588,32 @@ def wait_for_upload( return log.info( "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 + lsn, current_lsn, i + 1 ) ) time.sleep(1) raise Exception( "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn) + lsn, current_lsn ) ) def last_record_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID -) -> int: + pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId +) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) lsn_str = detail["local"]["last_record_lsn"] assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) + return Lsn(lsn_str) def wait_for_last_record_lsn( pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int, + tenant: ZTenantId, + timeline: ZTimelineId, + lsn: Lsn, ): """waits for pageserver to catch up to a certain lsn""" for i in range(10): @@ -2604,20 +2622,18 @@ def wait_for_last_record_lsn( return log.info( "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 + lsn, current_lsn, i + 1 ) ) time.sleep(1) raise Exception( - "timed out while waiting for last_record_lsn to reach 
{}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn) - ) + "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn) ) -def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: uuid.UUID, timeline: uuid.UUID): +def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: ZTenantId, timeline: ZTimelineId): """Wait for pageserver to catch up the latest flush LSN""" - last_flush_lsn = lsn_from_hex(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) @@ -2626,8 +2642,8 @@ def fork_at_current_lsn( pg: Postgres, new_branch_name: str, ancestor_branch_name: str, - tenant_id: Optional[uuid.UUID] = None, -) -> uuid.UUID: + tenant_id: Optional[ZTenantId] = None, +) -> ZTimelineId: """ Create new branch at the last LSN of an existing branch. The "last LSN" is taken from the given Postgres instance. The pageserver will wait for all the diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py new file mode 100644 index 0000000000..d5cb200080 --- /dev/null +++ b/test_runner/fixtures/types.py @@ -0,0 +1,89 @@ +import random +from functools import total_ordering +from typing import Union + + +@total_ordering +class Lsn: + """ + Datatype for an LSN. Internally it is a 64-bit integer, but the string + representation is like "1/123abcd". See also pg_lsn datatype in Postgres + """ + + def __init__(self, x: Union[int, str]): + if isinstance(x, int): + self.lsn_int = x + else: + """Convert lsn from hex notation to int.""" + l, r = x.split("/") + self.lsn_int = (int(l, 16) << 32) + int(r, 16) + # FIXME: error if it doesn't look like a valid LSN + + def __str__(self): + """Convert lsn from int to standard hex notation.""" + return "{:X}/{:X}".format(self.lsn_int >> 32, self.lsn_int & 0xFFFFFFFF) + + def __repr__(self): + return 'Lsn("{:X}/{:X}")'.format(self.lsn_int >> 32, self.lsn_int & 0xFFFFFFFF) + + def __int__(self): + return self.lsn_int + + def __lt__(self, other: "Lsn") -> bool: + return self.lsn_int < other.lsn_int + + def __eq__(self, other) -> bool: + if not isinstance(other, Lsn): + return NotImplemented + return self.lsn_int == other.lsn_int + + # Returns the difference between two Lsns, in bytes + def __sub__(self, other: "Lsn") -> int: + return self.lsn_int - other.lsn_int + + def __hash__(self): + return hash(self.lsn_int) + + +@total_ordering +class ZId: + """ + Datatype for a Neon tenant and timeline IDs. Internally it's a 16-byte array, and + the string representation is in hex. This corresponds to the ZId / ZTenantId / + ZTimelineIds in in the Rust code. 
+ """ + + def __init__(self, x: str): + self.id = bytearray.fromhex(x) + assert len(self.id) == 16 + + def __str__(self): + return self.id.hex() + + def __lt__(self, other) -> bool: + if not isinstance(other, type(self)): + return NotImplemented + return self.id < other.id + + def __eq__(self, other) -> bool: + if not isinstance(other, type(self)): + return NotImplemented + return self.id == other.id + + def __hash__(self): + return hash(str(self.id)) + + @classmethod + def generate(cls): + """Generate a random ID""" + return cls(random.randbytes(16).hex()) + + +class ZTenantId(ZId): + def __repr__(self): + return f'ZTenantId("{self.id.hex()}")' + + +class ZTimelineId(ZId): + def __repr__(self): + return f'ZTimelineId("{self.id.hex()}")' diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 88bf6d634d..726116e53c 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -61,17 +61,6 @@ def global_counter() -> int: return _global_counter -def lsn_to_hex(num: int) -> str: - """Convert lsn from int to standard hex notation.""" - return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF) - - -def lsn_from_hex(lsn_hex: str) -> int: - """Convert lsn from hex notation to int.""" - l, r = lsn_hex.split("/") - return (int(l, 16) << 32) + int(r, 16) - - def print_gc_result(row): log.info("GC duration {elapsed} ms".format_map(row)) log.info( diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 03d5ba208a..47e2435052 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -9,7 +9,7 @@ from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin -from fixtures.utils import lsn_from_hex +from fixtures.types import Lsn from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix @@ -198,8 +198,8 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte return lsn_write_lags = [] - last_received_lsn = 0 - last_pg_flush_lsn = 0 + last_received_lsn = Lsn(0) + last_pg_flush_lsn = Lsn(0) with env.pg.connect().cursor() as cur: cur.execute("CREATE EXTENSION neon") @@ -218,11 +218,11 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte res = cur.fetchone() lsn_write_lags.append(res[0]) - curr_received_lsn = lsn_from_hex(res[3]) + curr_received_lsn = Lsn(res[3]) lsn_process_speed = (curr_received_lsn - last_received_lsn) / (1024**2) last_received_lsn = curr_received_lsn - curr_pg_flush_lsn = lsn_from_hex(res[2]) + curr_pg_flush_lsn = Lsn(res[2]) lsn_produce_speed = (curr_pg_flush_lsn - last_pg_flush_lsn) / (1024**2) last_pg_flush_lsn = curr_pg_flush_lsn diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index 96612a8aef..b8e81824b0 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -1,5 +1,6 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.types import ZTimelineId from fixtures.utils import query_scalar @@ -26,7 +27,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): pg_branch0 = env.postgres.create_start("main", tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() - branch0_timeline 
= query_scalar(branch0_cur, "SHOW neon.timeline_id") + branch0_timeline = ZTimelineId(query_scalar(branch0_cur, "SHOW neon.timeline_id")) log.info(f"b0 timeline {branch0_timeline}") # Create table, and insert 100k rows. @@ -50,7 +51,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on 'branch1' branch") branch1_cur = pg_branch1.connect().cursor() - branch1_timeline = query_scalar(branch1_cur, "SHOW neon.timeline_id") + branch1_timeline = ZTimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id")) log.info(f"b1 timeline {branch1_timeline}") branch1_lsn = query_scalar(branch1_cur, "SELECT pg_current_wal_insert_lsn()") @@ -73,7 +74,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() - branch2_timeline = query_scalar(branch2_cur, "SHOW neon.timeline_id") + branch2_timeline = ZTimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id")) log.info(f"b2 timeline {branch2_timeline}") branch2_lsn = query_scalar(branch2_cur, "SELECT pg_current_wal_insert_lsn()") @@ -91,7 +92,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info(f"LSN after 300k rows: {lsn_300}") # Run compaction on branch1. - compact = f"compact {tenant.hex} {branch1_timeline} {lsn_200}" + compact = f"compact {tenant} {branch1_timeline} {lsn_200}" log.info(compact) env.pageserver.safe_psql(compact) diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 16d6ae45c3..08e38e1461 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -1,8 +1,8 @@ from contextlib import closing -from uuid import uuid4 import pytest from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException +from fixtures.types import ZTenantId def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): @@ -11,9 +11,9 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): ps = env.pageserver - tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant.hex) + tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant) tenant_http_client = env.pageserver.http_client(tenant_token) - invalid_tenant_token = env.auth_keys.generate_tenant_token(uuid4().hex) + invalid_tenant_token = env.auth_keys.generate_tenant_token(ZTenantId.generate()) invalid_tenant_http_client = env.pageserver.http_client(invalid_tenant_token) management_token = env.auth_keys.generate_management_token() diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index deb041b5d1..c8c5929066 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -4,7 +4,8 @@ import time import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv -from fixtures.utils import lsn_from_hex, query_scalar +from fixtures.types import Lsn +from fixtures.utils import query_scalar # Test the GC implementation when running with branching. 
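The Lsn wrapper introduced in fixtures/types.py replaces the lsn_from_hex()/lsn_to_hex() helpers that the hunks above delete from fixtures/utils.py. A minimal usage sketch follows; it is illustrative only and not part of the patch, and the literal LSN values are made up:

    from fixtures.types import Lsn

    # Parse the "X/Y" form returned by Postgres, or wrap a 64-bit integer directly.
    lsn_a = Lsn("0/16B9188")
    lsn_b = Lsn(int(lsn_a) + 8192)

    # @total_ordering fills in the full set of comparisons from __lt__/__eq__.
    assert lsn_b > lsn_a

    # Subtraction yields a plain int (a distance in bytes), which is what the
    # GC-horizon computation below relies on.
    assert lsn_b - lsn_a == 8192

    # str() produces the usual hex notation, so lsn_to_hex() is no longer needed.
    assert str(lsn_b) == "0/16BB188"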
@@ -74,18 +75,16 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')" ) main_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)") - lsn1 = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + lsn1 = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"LSN1: {lsn1}") main_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)") - lsn2 = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + lsn2 = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"LSN2: {lsn2}") # Set the GC horizon so that lsn1 is inside the horizon, which means # we can create a new branch starting from lsn1. - env.pageserver.safe_psql( - f"do_gc {tenant.hex} {timeline_main.hex} {lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024}" - ) + env.pageserver.safe_psql(f"do_gc {tenant} {timeline_main} {lsn2 - lsn1 + 1024}") env.neon_cli.create_branch( "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1 @@ -143,7 +142,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): "INSERT INTO t SELECT FROM generate_series(1, 100000)", ] ) - lsn = res[2][0][0] + lsn = Lsn(res[2][0][0]) # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the # branch creation task but the individual timeline GC iteration happens *after* @@ -151,7 +150,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): env.pageserver.safe_psql("failpoints before-timeline-gc=sleep(2000)") def do_gc(): - env.pageserver.safe_psql(f"do_gc {tenant.hex} {b0.hex} 0") + env.pageserver.safe_psql(f"do_gc {tenant} {b0} 0") thread = threading.Thread(target=do_gc, daemon=True) thread.start() diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 51946380d2..5bd6368bfc 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -2,6 +2,7 @@ import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.types import Lsn, ZTimelineId from fixtures.utils import print_gc_result, query_scalar @@ -27,13 +28,13 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): main_cur = pgmain.connect().cursor() - timeline = query_scalar(main_cur, "SHOW neon.timeline_id") + timeline = ZTimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) # Create table, and insert the first 100 rows main_cur.execute("CREATE TABLE foo (t text)") # keep some early lsn to test branch creation on out of date lsn - gced_lsn = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + gced_lsn = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) main_cur.execute( """ @@ -42,7 +43,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): FROM generate_series(1, 100) g """ ) - lsn_a = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + lsn_a = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"LSN after 100 rows: {lsn_a}") # Insert some more rows. (This generates enough WAL to fill a few segments.) 
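ZTenantId and ZTimelineId carry their own string formatting, which is why the `.hex` suffixes disappear from the pageserver command strings in these tests. A short sketch under the class definitions above; the tenant id value is the fixed one used later in test_tenant_relocation.py, everything else is illustrative:

    from fixtures.types import ZTenantId, ZTimelineId

    tenant = ZTenantId("74ee8b079a0e437eb0afea7d26a07209")
    timeline = ZTimelineId.generate()   # random 16-byte id, handy for "does not exist" cases

    # __str__ is the bare hex string, so f-strings need no `.hex` any more.
    assert str(tenant) == "74ee8b079a0e437eb0afea7d26a07209"
    gc_command = f"do_gc {tenant} {timeline} 0"   # e.g. passed to env.pageserver.safe_psql()

    # __repr__ keeps the type visible when an assertion or log prints the id.
    assert repr(tenant) == 'ZTenantId("74ee8b079a0e437eb0afea7d26a07209")'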
@@ -53,7 +54,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): FROM generate_series(1, 200000) g """ ) - lsn_b = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + lsn_b = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"LSN after 200100 rows: {lsn_b}") # Branch at the point where only 100 rows were inserted @@ -69,7 +70,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): FROM generate_series(1, 200000) g """ ) - lsn_c = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + lsn_c = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"LSN after 400100 rows: {lsn_c}") @@ -96,25 +97,25 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): # branch at segment boundary env.neon_cli.create_branch( - "test_branch_segment_boundary", "test_branch_behind", ancestor_start_lsn="0/3000000" + "test_branch_segment_boundary", "test_branch_behind", ancestor_start_lsn=Lsn("0/3000000") ) pg = env.postgres.create_start("test_branch_segment_boundary") assert pg.safe_psql("SELECT 1")[0][0] == 1 # branch at pre-initdb lsn with pytest.raises(Exception, match="invalid branch start lsn"): - env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn="0/42") + env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn=Lsn("0/42")) # branch at pre-ancestor lsn with pytest.raises(Exception, match="less than timeline ancestor lsn"): env.neon_cli.create_branch( - "test_branch_preinitdb", "test_branch_behind", ancestor_start_lsn="0/42" + "test_branch_preinitdb", "test_branch_behind", ancestor_start_lsn=Lsn("0/42") ) # check that we cannot create branch based on garbage collected data with env.pageserver.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: # call gc to advace latest_gc_cutoff_lsn - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") + pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") row = pscur.fetchone() print_gc_result(row) diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index c4b23c24b8..bf44dfd949 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -5,7 +5,7 @@ from typing import List, Tuple import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres -from fixtures.utils import query_scalar +from fixtures.types import ZTenantId, ZTimelineId # Test restarting page server, while safekeeper and compute node keep @@ -15,19 +15,15 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - tenant_timelines: List[Tuple[str, str, Postgres]] = [] + tenant_timelines: List[Tuple[ZTenantId, ZTimelineId, Postgres]] = [] for n in range(4): - tenant_id_uuid, timeline_id_uuid = env.neon_cli.create_tenant() - tenant_id = tenant_id_uuid.hex - timeline_id = timeline_id_uuid.hex + tenant_id, timeline_id = env.neon_cli.create_tenant() - pg = env.postgres.create_start("main", tenant_id=tenant_id_uuid) + pg = env.postgres.create_start("main", tenant_id=tenant_id) with pg.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'") - - timeline_id = query_scalar(cur, "SHOW neon.timeline_id") pg.stop() tenant_timelines.append((tenant_id, timeline_id, pg)) @@ -109,5 +105,5 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): 
env.neon_cli.pageserver_start() # Check that tenant with "broken" timeline is not loaded. - with pytest.raises(Exception, match=f"Failed to get repo for tenant {tenant_id.hex}"): + with pytest.raises(Exception, match=f"Failed to get repo for tenant {tenant_id}"): env.neon_cli.list_timelines(tenant_id) diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index 8155f52060..af94865549 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -8,6 +8,7 @@ from fixtures.neon_fixtures import ( VanillaPostgres, pg_distrib_dir, ) +from fixtures.types import Lsn, ZTimelineId from fixtures.utils import query_scalar, subprocess_capture num_rows = 1000 @@ -26,7 +27,7 @@ def test_fullbackup( log.info("postgres is running on 'test_fullbackup' branch") with pgmain.cursor() as cur: - timeline = query_scalar(cur, "SHOW neon.timeline_id") + timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") @@ -36,7 +37,7 @@ def test_fullbackup( ) cur.execute("CHECKPOINT") - lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"start_backup_lsn = {lsn}") # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. @@ -46,7 +47,7 @@ def test_fullbackup( # Get and unpack fullbackup from pageserver restored_dir_path = env.repo_dir / "restored_datadir" os.mkdir(restored_dir_path, 0o750) - query = f"fullbackup {env.initial_tenant.hex} {timeline} {lsn}" + query = f"fullbackup {env.initial_tenant} {timeline} {lsn}" cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] result_basepath = pg_bin.run_capture(cmd, env=psql_env) tar_output_file = result_basepath + ".stdout" diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 90824f882a..67ce8871cd 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -3,6 +3,7 @@ import random from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres +from fixtures.types import ZTimelineId from fixtures.utils import query_scalar # Test configuration @@ -28,15 +29,15 @@ async def update_table(pg: Postgres): # Perform aggressive GC with 0 horizon -async def gc(env: NeonEnv, timeline: str): +async def gc(env: NeonEnv, timeline: ZTimelineId): psconn = await env.pageserver.connect_async() while updates_performed < updates_to_perform: - await psconn.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") + await psconn.execute(f"do_gc {env.initial_tenant} {timeline} 0") # At the same time, run UPDATEs and GC -async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: str): +async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: ZTimelineId): workers = [] for worker_id in range(num_connections): workers.append(asyncio.create_task(update_table(pg))) @@ -61,7 +62,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on test_gc_aggressive branch") with pg.cursor() as cur: - timeline = query_scalar(cur, "SHOW neon.timeline_id") + timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) # Create table, and insert the first 100 rows cur.execute("CREATE TABLE foo (id int, counter int, t text)") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 
a2671727f7..fc9f41bda0 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -5,7 +5,6 @@ import shutil import tarfile from contextlib import closing from pathlib import Path -from uuid import UUID, uuid4 import pytest from fixtures.log_helper import log @@ -18,7 +17,8 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.utils import lsn_from_hex, subprocess_capture +from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.utils import subprocess_capture @pytest.mark.timeout(600) @@ -69,8 +69,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] node_name = "import_from_vanilla" - tenant = uuid4() - timeline = uuid4() + tenant = ZTenantId.generate() + timeline = ZTimelineId.generate() # Set up pageserver for import neon_env_builder.enable_local_fs_remote_storage() @@ -83,9 +83,9 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build "timeline", "import", "--tenant-id", - tenant.hex, + str(tenant), "--timeline-id", - timeline.hex, + str(timeline), "--node-name", node_name, "--base-lsn", @@ -112,8 +112,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build import_tar(base_tar, wal_tar) # Wait for data to land in s3 - wait_for_last_record_lsn(client, tenant, timeline, lsn_from_hex(end_lsn)) - wait_for_upload(client, tenant, timeline, lsn_from_hex(end_lsn)) + wait_for_last_record_lsn(client, tenant, timeline, Lsn(end_lsn)) + wait_for_upload(client, tenant, timeline, Lsn(end_lsn)) # Check it worked pg = env.postgres.create_start(node_name, tenant_id=tenant) @@ -173,7 +173,7 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne assert cnt_seg_files > 0 -def _generate_data(num_rows: int, pg: Postgres) -> str: +def _generate_data(num_rows: int, pg: Postgres) -> Lsn: """Generate a table with `num_rows` rows. Returns: @@ -191,10 +191,12 @@ def _generate_data(num_rows: int, pg: Postgres) -> str: cur.execute("SELECT pg_current_wal_insert_lsn()") res = cur.fetchone() assert res is not None and isinstance(res[0], str) - return res[0] + return Lsn(res[0]) -def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timeline: UUID) -> str: +def _import( + expected_num_rows: int, lsn: Lsn, env: NeonEnv, pg_bin: PgBin, timeline: ZTimelineId +) -> str: """Test importing backup data to the pageserver. Args: @@ -210,7 +212,7 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel psql_env = {"LD_LIBRARY_PATH": os.path.join(str(pg_distrib_dir), "lib")} # Get a fullbackup from pageserver - query = f"fullbackup { env.initial_tenant.hex} {timeline.hex} {lsn}" + query = f"fullbackup { env.initial_tenant} {timeline} {lsn}" cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] result_basepath = pg_bin.run_capture(cmd, env=psql_env) tar_output_file = result_basepath + ".stdout" @@ -228,7 +230,7 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel # Import using another tenantid, because we use the same pageserver. # TODO Create another pageserver to make test more realistic. 
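The wait helpers retyped earlier in fixtures/neon_fixtures.py now take the typed ids and an Lsn, which is what lets this import test pass Lsn(end_lsn) straight through. A sketch of the call shape, wrapped in a hypothetical wait_for_import() helper purely for illustration:

    from fixtures.neon_fixtures import NeonEnv, wait_for_last_record_lsn, wait_for_upload
    from fixtures.types import Lsn, ZTenantId, ZTimelineId

    def wait_for_import(env: NeonEnv, tenant: ZTenantId, timeline: ZTimelineId, end_lsn: Lsn):
        # Both waits accept the typed ids and an Lsn directly, instead of
        # uuid.UUID plus the int that lsn_from_hex() used to produce.
        client = env.pageserver.http_client()
        wait_for_last_record_lsn(client, tenant, timeline, end_lsn)  # local catch-up
        wait_for_upload(client, tenant, timeline, end_lsn)           # remote upload done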
- tenant = uuid4() + tenant = ZTenantId.generate() # Import to pageserver node_name = "import_from_pageserver" @@ -239,28 +241,28 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel "timeline", "import", "--tenant-id", - tenant.hex, + str(tenant), "--timeline-id", - timeline.hex, + str(timeline), "--node-name", node_name, "--base-lsn", - lsn, + str(lsn), "--base-tarfile", os.path.join(tar_output_file), ] ) # Wait for data to land in s3 - wait_for_last_record_lsn(client, tenant, timeline, lsn_from_hex(lsn)) - wait_for_upload(client, tenant, timeline, lsn_from_hex(lsn)) + wait_for_last_record_lsn(client, tenant, timeline, lsn) + wait_for_upload(client, tenant, timeline, lsn) # Check it worked pg = env.postgres.create_start(node_name, tenant_id=tenant) assert pg.safe_psql("select count(*) from tbl") == [(expected_num_rows,)] # Take another fullbackup - query = f"fullbackup { tenant.hex} {timeline.hex} {lsn}" + query = f"fullbackup { tenant} {timeline} {lsn}" cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] result_basepath = pg_bin.run_capture(cmd, env=psql_env) new_tar_output_file = result_basepath + ".stdout" @@ -272,6 +274,6 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel # Check that gc works psconn = env.pageserver.connect() pscur = psconn.cursor() - pscur.execute(f"do_gc {tenant.hex} {timeline.hex} 0") + pscur.execute(f"do_gc {tenant} {timeline} 0") return tar_output_file diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 0c1d3648f2..f6ca7000dd 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -41,7 +41,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): probe_timestamp = tbl[-1][1] + timedelta(hours=1) result = query_scalar( ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'", + f"get_lsn_by_timestamp {env.initial_tenant} {new_timeline_id} '{probe_timestamp.isoformat()}Z'", ) assert result == "future" @@ -49,7 +49,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): probe_timestamp = tbl[0][1] - timedelta(hours=10) result = query_scalar( ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'", + f"get_lsn_by_timestamp {env.initial_tenant} {new_timeline_id} '{probe_timestamp.isoformat()}Z'", ) assert result == "past" @@ -60,7 +60,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Call get_lsn_by_timestamp to get the LSN lsn = query_scalar( ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'", + f"get_lsn_by_timestamp {env.initial_tenant} {new_timeline_id} '{probe_timestamp.isoformat()}Z'", ) # Launch a new read-only node at that LSN, and check that only the rows diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 1acfa72127..b2342e5ee8 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -1,4 +1,3 @@ -import uuid from typing import cast import requests @@ -8,10 +7,11 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, NeonPageserverHttpClient, ) +from fixtures.types import ZTenantId, ZTimelineId def helper_compare_timeline_list( - pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv, initial_tenant: uuid.UUID + pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv, initial_tenant: 
ZTenantId ): """ Compare timelines list returned by CLI and directly via API. @@ -20,7 +20,7 @@ def helper_compare_timeline_list( timelines_api = sorted( map( - lambda t: cast(str, t["timeline_id"]), + lambda t: ZTimelineId(t["timeline_id"]), pageserver_http_client.timeline_list(initial_tenant), ) ) @@ -52,8 +52,8 @@ def test_cli_timeline_list(neon_simple_env: NeonEnv): # Check that all new branches are visible via CLI timelines_cli = [timeline_id for (_, timeline_id) in env.neon_cli.list_timelines()] - assert main_timeline_id.hex in timelines_cli - assert nested_timeline_id.hex in timelines_cli + assert main_timeline_id in timelines_cli + assert nested_timeline_id in timelines_cli def helper_compare_tenant_list(pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv): @@ -85,11 +85,11 @@ def test_cli_tenant_list(neon_simple_env: NeonEnv): helper_compare_tenant_list(pageserver_http_client, env) res = env.neon_cli.list_tenants() - tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) + tenants = sorted(map(lambda t: ZTenantId(t.split()[0]), res.stdout.splitlines())) - assert env.initial_tenant.hex in tenants - assert tenant1.hex in tenants - assert tenant2.hex in tenants + assert env.initial_tenant in tenants + assert tenant1 in tenants + assert tenant2 in tenants def test_cli_tenant_create(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 257913ef3f..2b5e2edb5f 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,6 +1,7 @@ import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.types import ZTimelineId from fixtures.utils import print_gc_result, query_scalar @@ -26,7 +27,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): cur = pg_conn.cursor() # Get the timeline ID of our branch. We need it for the 'do_gc' command - timeline = query_scalar(cur, "SHOW neon.timeline_id") + timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) psconn = env.pageserver.connect() pscur = psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) @@ -60,9 +61,9 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Make a lot of updates on a single row, generating a lot of WAL. Trigger # garbage collections so that the page server will remove old page versions. 
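The CLI and HTTP API tests above now sort and compare id objects rather than hex strings; this works because ZId defines __eq__, __lt__ and __hash__ over the underlying 16 bytes. A small illustrative sketch, with made-up zero-padded values:

    from fixtures.types import ZTimelineId

    a = ZTimelineId("0" * 31 + "1")
    b = ZTimelineId("0" * 31 + "2")

    # Ordering, equality and hashing follow the raw bytes, so ids can be sorted,
    # deduplicated in sets, and compared against freshly parsed copies.
    assert sorted([b, a]) == [a, b]
    assert a in {a, b}
    assert a == ZTimelineId(str(a)) and a != b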
for i in range(10): - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) + pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") + gcrow = pscur.fetchone() + print_gc_result(gcrow) for j in range(100): cur.execute("UPDATE foo SET val = val + 1 WHERE id = 1;") diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 869f53ac0a..8ee38fcf4f 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -1,7 +1,6 @@ import pathlib import subprocess from typing import Optional -from uuid import UUID, uuid4 from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, @@ -12,7 +11,7 @@ from fixtures.neon_fixtures import ( pg_distrib_dir, wait_until, ) -from fixtures.utils import lsn_from_hex +from fixtures.types import Lsn, ZTenantId, ZTimelineId # test that we cannot override node id after init @@ -61,39 +60,39 @@ def test_pageserver_init_node_id(neon_simple_env: NeonEnv): assert "has node id already, it cannot be overridden" in bad_update.stderr -def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID): +def check_client(client: NeonPageserverHttpClient, initial_tenant: ZTenantId): client.check_status() # check initial tenant is there - assert initial_tenant.hex in {t["id"] for t in client.tenant_list()} + assert initial_tenant in {ZTenantId(t["id"]) for t in client.tenant_list()} # create new tenant and check it is also there - tenant_id = uuid4() + tenant_id = ZTenantId.generate() client.tenant_create(tenant_id) - assert tenant_id.hex in {t["id"] for t in client.tenant_list()} + assert tenant_id in {ZTenantId(t["id"]) for t in client.tenant_list()} timelines = client.timeline_list(tenant_id) assert len(timelines) == 0, "initial tenant should not have any timelines" # create timeline - timeline_id = uuid4() + timeline_id = ZTimelineId.generate() client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id) timelines = client.timeline_list(tenant_id) assert len(timelines) > 0 # check it is there - assert timeline_id.hex in {b["timeline_id"] for b in client.timeline_list(tenant_id)} + assert timeline_id in {ZTimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)} for timeline in timelines: - timeline_id_str = str(timeline["timeline_id"]) + timeline_id = ZTimelineId(timeline["timeline_id"]) timeline_details = client.timeline_detail( tenant_id=tenant_id, - timeline_id=UUID(timeline_id_str), + timeline_id=timeline_id, include_non_incremental_logical_size=True, ) - assert timeline_details["tenant_id"] == tenant_id.hex - assert timeline_details["timeline_id"] == timeline_id_str + assert ZTenantId(timeline_details["tenant_id"]) == tenant_id + assert ZTimelineId(timeline_details["timeline_id"]) == timeline_id local_timeline_details = timeline_details.get("local") assert local_timeline_details is not None @@ -122,10 +121,10 @@ def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): def expect_updated_msg_lsn( client: NeonPageserverHttpClient, - tenant_id: UUID, - timeline_id: UUID, - prev_msg_lsn: Optional[int], -) -> int: + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + prev_msg_lsn: Optional[Lsn], +) -> Lsn: timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) # a successful `timeline_details` response must contain the below fields @@ -138,7 +137,7 @@ def expect_updated_msg_lsn( local_timeline_details["last_received_msg_lsn"] is not None ), "the last 
received message's LSN is empty" - last_msg_lsn = lsn_from_hex(local_timeline_details["last_received_msg_lsn"]) + last_msg_lsn = Lsn(local_timeline_details["last_received_msg_lsn"]) assert ( prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn ), f"the last received message's LSN {last_msg_lsn} hasn't been updated \ diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 1fc18ebbc4..329f4b7d24 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -3,6 +3,7 @@ from contextlib import closing import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.types import ZTimelineId from fixtures.utils import print_gc_result, query_scalar @@ -24,7 +25,7 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - timeline = query_scalar(main_cur, "SHOW neon.timeline_id") + timeline = ZTimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) # Create table main_cur.execute("CREATE TABLE foo (t text)") @@ -57,9 +58,9 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # run GC with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - pscur.execute(f"compact {env.initial_tenant.hex} {timeline}") + pscur.execute(f"compact {env.initial_tenant} {timeline}") # perform aggressive GC. Data still should be kept because of the PITR setting. - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") + pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") row = pscur.fetchone() print_gc_result(row) diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 0bd78c62a3..fac9d97a42 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,6 +1,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv +from fixtures.types import Lsn from fixtures.utils import query_scalar @@ -84,7 +85,9 @@ def test_readonly_node(neon_simple_env: NeonEnv): # Check creating a node at segment boundary pg = env.postgres.create_start( - branch_name="test_readonly_node", node_name="test_branch_segment_boundary", lsn="0/3000000" + branch_name="test_readonly_node", + node_name="test_branch_segment_boundary", + lsn=Lsn("0/3000000"), ) cur = pg.connect().cursor() cur.execute("SELECT 1") @@ -94,5 +97,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): with pytest.raises(Exception, match="invalid basebackup lsn"): # compute node startup with invalid LSN should fail env.postgres.create_start( - branch_name="test_readonly_node", node_name="test_readonly_node_preinitdb", lsn="0/42" + branch_name="test_readonly_node", + node_name="test_readonly_node_preinitdb", + lsn=Lsn("0/42"), ) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 0015c75670..04baef6ba0 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -5,7 +5,6 @@ import os import shutil import time from pathlib import Path -from uuid import UUID import pytest from fixtures.log_helper import log @@ -18,7 +17,8 @@ from fixtures.neon_fixtures import ( wait_for_upload, wait_until, ) -from fixtures.utils import lsn_from_hex, query_scalar +from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.utils import query_scalar # @@ -61,8 +61,8 @@ def 
test_remote_storage_backup_and_restore( client = env.pageserver.http_client() - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) checkpoint_numbers = range(1, 3) @@ -74,17 +74,17 @@ def test_remote_storage_backup_and_restore( INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); """ ) - current_lsn = lsn_from_hex(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) # wait until pageserver receives that data - wait_for_last_record_lsn(client, UUID(tenant_id), UUID(timeline_id), current_lsn) + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to be sure that data landed in remote storage env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") log.info(f"waiting for checkpoint {checkpoint_number} upload") # wait until pageserver successfully uploaded a checkpoint to remote storage - wait_for_upload(client, UUID(tenant_id), UUID(timeline_id), current_lsn) + wait_for_upload(client, tenant_id, timeline_id, current_lsn) log.info(f"upload of checkpoint {checkpoint_number} is done") ##### Stop the first pageserver instance, erase all its data @@ -101,16 +101,16 @@ def test_remote_storage_backup_and_restore( # Introduce failpoint in download env.pageserver.safe_psql("failpoints remote-storage-download-pre-rename=return") - client.tenant_attach(UUID(tenant_id)) + client.tenant_attach(tenant_id) # is there a better way to assert that failpoint triggered? time.sleep(10) # assert cannot attach timeline that is scheduled for download with pytest.raises(Exception, match="Conflict: Tenant download is already in progress"): - client.tenant_attach(UUID(tenant_id)) + client.tenant_attach(tenant_id) - detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) + detail = client.timeline_detail(tenant_id, timeline_id) log.info("Timeline detail with active failpoint: %s", detail) assert detail["local"] is None assert detail["remote"]["awaits_download"] @@ -119,20 +119,20 @@ def test_remote_storage_backup_and_restore( env.pageserver.stop() env.pageserver.start() - client.tenant_attach(UUID(tenant_id)) + client.tenant_attach(tenant_id) log.info("waiting for timeline redownload") wait_until( number_of_iterations=20, interval=1, - func=lambda: assert_timeline_local(client, UUID(tenant_id), UUID(timeline_id)), + func=lambda: assert_timeline_local(client, tenant_id, timeline_id), ) - detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) + detail = client.timeline_detail(tenant_id, timeline_id) assert detail["local"] is not None log.info("Timeline detail after attach completed: %s", detail) assert ( - lsn_from_hex(detail["local"]["last_record_lsn"]) >= current_lsn + Lsn(detail["local"]["last_record_lsn"]) >= current_lsn ), "current db Lsn should should not be less than the one stored on remote storage" assert not detail["remote"]["awaits_download"] diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index d496edd6dc..51a8101b11 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -32,8 +32,8 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" # it should match global configuration with 
closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - log.info(f"show {env.initial_tenant.hex}") - pscur.execute(f"show {env.initial_tenant.hex}") + log.info(f"show {env.initial_tenant}") + pscur.execute(f"show {env.initial_tenant}") res = pscur.fetchone() assert all( i in res.items() @@ -52,7 +52,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" # check the configuration of the new tenant with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant.hex}") + pscur.execute(f"show {tenant}") res = pscur.fetchone() log.info(f"res: {res}") assert all( @@ -80,7 +80,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant.hex}") + pscur.execute(f"show {tenant}") res = pscur.fetchone() log.info(f"after config res: {res}") assert all( @@ -103,7 +103,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant.hex}") + pscur.execute(f"show {tenant}") res = pscur.fetchone() log.info(f"after restart res: {res}") assert all( diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index f1b30429bf..147e22b38f 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -1,17 +1,16 @@ -import uuid from threading import Thread -from uuid import uuid4 import psycopg2 import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException +from fixtures.types import ZTenantId, ZTimelineId -def do_gc_target(env: NeonEnv, tenant_id: uuid.UUID, timeline_id: uuid.UUID): +def do_gc_target(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimelineId): """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: - env.pageserver.safe_psql(f"do_gc {tenant_id.hex} {timeline_id.hex} 0") + env.pageserver.safe_psql(f"do_gc {tenant_id} {timeline_id} 0") except Exception as e: log.error("do_gc failed: %s", e) @@ -21,10 +20,10 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): pageserver_http = env.pageserver.http_client() # first check for non existing tenant - tenant_id = uuid4() + tenant_id = ZTenantId.generate() with pytest.raises( expected_exception=NeonPageserverApiException, - match=f"Tenant not found for id {tenant_id.hex}", + match=f"Tenant not found for id {tenant_id}", ): pageserver_http.tenant_detach(tenant_id) @@ -32,7 +31,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): tenant_id, timeline_id = env.neon_cli.create_tenant() # assert tenant exists on disk - assert (env.repo_dir / "tenants" / tenant_id.hex).exists() + assert (env.repo_dir / "tenants" / str(tenant_id)).exists() pg = env.postgres.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement @@ -47,7 +46,8 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): with pytest.raises( expected_exception=psycopg2.DatabaseError, match="gc target timeline does not exist" ): - env.pageserver.safe_psql(f"do_gc {tenant_id.hex} {uuid4().hex} 0") + 
bogus_timeline_id = ZTimelineId.generate() + env.pageserver.safe_psql(f"do_gc {tenant_id} {bogus_timeline_id} 0") # try to concurrently run gc and detach gc_thread = Thread(target=lambda: do_gc_target(env, tenant_id, timeline_id)) @@ -70,9 +70,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): gc_thread.join(timeout=10) # check that nothing is left on disk for deleted tenant - assert not (env.repo_dir / "tenants" / tenant_id.hex).exists() + assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() with pytest.raises( - expected_exception=psycopg2.DatabaseError, match=f"Tenant {tenant_id.hex} not found" + expected_exception=psycopg2.DatabaseError, match=f"Tenant {tenant_id} not found" ): - env.pageserver.safe_psql(f"do_gc {tenant_id.hex} {timeline_id.hex} 0") + env.pageserver.safe_psql(f"do_gc {tenant_id} {timeline_id} 0") diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 19b0ec05a7..56563ebe87 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -5,7 +5,6 @@ import subprocess import threading from contextlib import closing, contextmanager from typing import Any, Dict, Optional, Tuple -from uuid import UUID import pytest from fixtures.log_helper import log @@ -25,7 +24,8 @@ from fixtures.neon_fixtures import ( wait_for_upload, wait_until, ) -from fixtures.utils import lsn_from_hex, lsn_to_hex, subprocess_capture +from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.utils import query_scalar, subprocess_capture def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @@ -113,19 +113,21 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve def populate_branch( pg: Postgres, - tenant_id: UUID, + tenant_id: ZTenantId, ps_http: NeonPageserverHttpClient, create_table: bool, expected_sum: Optional[int], -) -> Tuple[UUID, int]: +) -> Tuple[ZTimelineId, Lsn]: # insert some data with pg_cur(pg) as cur: cur.execute("SHOW neon.timeline_id") - timeline_id = UUID(cur.fetchone()[0]) - log.info("timeline to relocate %s", timeline_id.hex) + timeline_id = ZTimelineId(cur.fetchone()[0]) + log.info("timeline to relocate %s", timeline_id) - cur.execute("SELECT pg_current_wal_flush_lsn()") - log.info("pg_current_wal_flush_lsn() %s", lsn_from_hex(cur.fetchone()[0])) + log.info( + "pg_current_wal_flush_lsn(): %s", + Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")), + ) log.info( "timeline detail %s", ps_http.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id), @@ -139,21 +141,20 @@ def populate_branch( if expected_sum is not None: cur.execute("SELECT sum(key) FROM t") assert cur.fetchone() == (expected_sum,) - cur.execute("SELECT pg_current_wal_flush_lsn()") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - current_lsn = lsn_from_hex(cur.fetchone()[0]) return timeline_id, current_lsn def ensure_checkpoint( pageserver_cur, pageserver_http: NeonPageserverHttpClient, - tenant_id: UUID, - timeline_id: UUID, - current_lsn: int, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + current_lsn: Lsn, ): # run checkpoint manually to be sure that data landed in remote storage - pageserver_cur.execute(f"checkpoint {tenant_id.hex} {timeline_id.hex}") + pageserver_cur.execute(f"checkpoint {tenant_id} {timeline_id}") # wait until pageserver successfully uploaded a checkpoint to remote storage wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) @@ -161,10 
+162,10 @@ def ensure_checkpoint( def check_timeline_attached( new_pageserver_http_client: NeonPageserverHttpClient, - tenant_id: UUID, - timeline_id: UUID, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, old_timeline_detail: Dict[str, Any], - old_current_lsn: int, + old_current_lsn: Lsn, ): # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint new_timeline_detail = assert_timeline_local(new_pageserver_http_client, tenant_id, timeline_id) @@ -172,18 +173,22 @@ def check_timeline_attached( # when load is active these checks can break because lsns are not static # so let's check with some margin assert_abs_margin_ratio( - lsn_from_hex(new_timeline_detail["local"]["disk_consistent_lsn"]), - lsn_from_hex(old_timeline_detail["local"]["disk_consistent_lsn"]), + int(Lsn(new_timeline_detail["local"]["disk_consistent_lsn"])), + int(Lsn(old_timeline_detail["local"]["disk_consistent_lsn"])), 0.03, ) assert_abs_margin_ratio( - lsn_from_hex(new_timeline_detail["local"]["disk_consistent_lsn"]), old_current_lsn, 0.03 + int(Lsn(new_timeline_detail["local"]["disk_consistent_lsn"])), int(old_current_lsn), 0.03 ) def switch_pg_to_new_pageserver( - env: NeonEnv, pg: Postgres, new_pageserver_port: int, tenant_id: UUID, timeline_id: UUID + env: NeonEnv, + pg: Postgres, + new_pageserver_port: int, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, ) -> pathlib.Path: pg.stop() @@ -195,7 +200,7 @@ def switch_pg_to_new_pageserver( pg.start() timeline_to_detach_local_path = ( - env.repo_dir / "tenants" / tenant_id.hex / "timelines" / timeline_id.hex + env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) ) files_before_detach = os.listdir(timeline_to_detach_local_path) assert ( @@ -260,7 +265,7 @@ def test_tenant_relocation( pageserver_http = env.pageserver.http_client() tenant_id, initial_timeline_id = env.neon_cli.create_tenant( - UUID("74ee8b079a0e437eb0afea7d26a07209") + ZTenantId("74ee8b079a0e437eb0afea7d26a07209") ) log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) @@ -280,7 +285,7 @@ def test_tenant_relocation( env.neon_cli.create_branch( new_branch_name="test_tenant_relocation_second", ancestor_branch_name="test_tenant_relocation_main", - ancestor_start_lsn=lsn_to_hex(current_lsn_main), + ancestor_start_lsn=current_lsn_main, tenant_id=tenant_id, ) pg_second = env.postgres.create_start( @@ -365,7 +370,7 @@ def test_tenant_relocation( "python", os.path.join(base_dir, "scripts/export_import_between_pageservers.py"), "--tenant-id", - tenant_id.hex, + str(tenant_id), "--from-host", "localhost", "--from-http-port", diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 8617bc8ea9..befa4616be 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,6 +1,5 @@ -from uuid import UUID - from fixtures.neon_fixtures import NeonEnvBuilder, wait_until +from fixtures.types import ZTenantId, ZTimelineId def get_only_element(l): # noqa: E741 @@ -23,7 +22,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): def get_state(tenant): all_states = client.tenant_list() - matching = [t for t in all_states if t["id"] == tenant.hex] + matching = [t for t in all_states if ZTenantId(t["id"]) == tenant] return get_only_element(matching)["state"] def get_metric_value(name): @@ -35,8 +34,8 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): value = line.lstrip(name).strip() return 
int(value) - def delete_all_timelines(tenant): - timelines = [UUID(t["timeline_id"]) for t in client.timeline_list(tenant)] + def delete_all_timelines(tenant: ZTenantId): + timelines = [ZTimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)] for t in timelines: client.timeline_delete(tenant, t) @@ -55,7 +54,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): # Detach all tenants and wait for them to go idle # TODO they should be already idle since there are no active computes for tenant_info in client.tenant_list(): - tenant_id = UUID(tenant_info["id"]) + tenant_id = ZTenantId(tenant_info["id"]) delete_all_timelines(tenant_id) wait_until(10, 0.2, lambda: assert_idle(tenant_id)) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 0e0cd44471..8bbf45205a 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -6,7 +6,7 @@ import pytest from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.utils import lsn_to_hex +from fixtures.types import Lsn @pytest.mark.parametrize("with_safekeepers", [False, True]) @@ -84,22 +84,24 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): sk_metrics = all_metrics[1:] ttids = [ - {"tenant_id": tenant_1.hex, "timeline_id": timeline_1.hex}, - {"tenant_id": tenant_2.hex, "timeline_id": timeline_2.hex}, + {"tenant_id": str(tenant_1), "timeline_id": str(timeline_1)}, + {"tenant_id": str(tenant_2), "timeline_id": str(timeline_2)}, ] # Test metrics per timeline for tt in ttids: log.info(f"Checking metrics for {tt}") - ps_lsn = int(ps_metrics.query_one("pageserver_last_record_lsn", filter=tt).value) - sk_lsns = [int(sk.query_one("safekeeper_commit_lsn", filter=tt).value) for sk in sk_metrics] + ps_lsn = Lsn(int(ps_metrics.query_one("pageserver_last_record_lsn", filter=tt).value)) + sk_lsns = [ + Lsn(int(sk.query_one("safekeeper_commit_lsn", filter=tt).value)) for sk in sk_metrics + ] - log.info(f"ps_lsn: {lsn_to_hex(ps_lsn)}") - log.info(f"sk_lsns: {list(map(lsn_to_hex, sk_lsns))}") + log.info(f"ps_lsn: {ps_lsn}") + log.info(f"sk_lsns: {sk_lsns}") assert ps_lsn <= max(sk_lsns) - assert ps_lsn > 0 + assert ps_lsn > Lsn(0) # Test common metrics for metrics in all_metrics: diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 083150e12a..70b474c9a9 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -8,7 +8,6 @@ import asyncio from typing import List, Tuple -from uuid import UUID import pytest from fixtures.neon_fixtures import ( @@ -20,7 +19,7 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.utils import lsn_from_hex +from fixtures.types import Lsn, ZTenantId, ZTimelineId async def tenant_workload(env: NeonEnv, pg: Postgres): @@ -28,9 +27,6 @@ async def tenant_workload(env: NeonEnv, pg: Postgres): pg_conn = await pg.connect_async() - await pg_conn.fetchval("show neon.tenant_id") - await pg_conn.fetchval("show neon.timeline_id") - await pg_conn.execute("CREATE TABLE t(key int primary key, value text)") for i in range(1, 100): await pg_conn.execute( @@ -62,7 +58,7 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem env = neon_env_builder.init_start() - tenants_pgs: List[Tuple[UUID, Postgres]] = [] + tenants_pgs: 
List[Tuple[ZTenantId, Postgres]] = [] for _ in range(1, 5): # Use a tiny checkpoint distance, to create a lot of layers quickly @@ -87,13 +83,13 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem res = pg.safe_psql_many( ["SHOW neon.tenant_id", "SHOW neon.timeline_id", "SELECT pg_current_wal_flush_lsn()"] ) - tenant_id = res[0][0][0] - timeline_id = res[1][0][0] - current_lsn = lsn_from_hex(res[2][0][0]) + tenant_id = ZTenantId(res[0][0][0]) + timeline_id = ZTimelineId(res[1][0][0]) + current_lsn = Lsn(res[2][0][0]) # wait until pageserver receives all the data - wait_for_last_record_lsn(pageserver_http, UUID(tenant_id), UUID(timeline_id), current_lsn) + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) # run final checkpoint manually to flush all the data to remote storage env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") - wait_for_upload(pageserver_http, UUID(tenant_id), UUID(timeline_id), current_lsn) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 7a55ffb769..a5dadc535b 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -1,7 +1,6 @@ -from uuid import uuid4 - import pytest from fixtures.neon_fixtures import NeonEnv, NeonPageserverApiException, wait_until +from fixtures.types import ZTenantId, ZTimelineId def test_timeline_delete(neon_simple_env: NeonEnv): @@ -11,15 +10,15 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # first try to delete non existing timeline # for existing tenant: - invalid_timeline_id = uuid4() + invalid_timeline_id = ZTimelineId.generate() with pytest.raises(NeonPageserverApiException, match="timeline not found"): ps_http.timeline_delete(tenant_id=env.initial_tenant, timeline_id=invalid_timeline_id) # for non existing tenant: - invalid_tenant_id = uuid4() + invalid_tenant_id = ZTenantId.generate() with pytest.raises( NeonPageserverApiException, - match=f"Tenant {invalid_tenant_id.hex} not found in local tenant state", + match=f"Tenant {invalid_tenant_id} not found in local tenant state", ): ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) @@ -37,7 +36,11 @@ def test_timeline_delete(neon_simple_env: NeonEnv): ): timeline_path = ( - env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / parent_timeline_id.hex + env.repo_dir + / "tenants" + / str(env.initial_tenant) + / "timelines" + / str(parent_timeline_id) ) assert timeline_path.exists() @@ -46,7 +49,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): assert not timeline_path.exists() timeline_path = ( - env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / leaf_timeline_id.hex + env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id) ) assert timeline_path.exists() diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index f6b665ec8c..aba8567541 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -3,7 +3,6 @@ import random import re import time from contextlib import closing -from uuid import UUID import psycopg2.errors import psycopg2.extras @@ -15,6 +14,7 @@ from fixtures.neon_fixtures import ( assert_timeline_local, wait_for_last_flush_lsn, ) +from fixtures.types import ZTenantId, ZTimelineId from fixtures.utils import get_timeline_dir_size @@ 
-34,8 +34,6 @@ def test_timeline_size(neon_simple_env: NeonEnv): with closing(pgmain.connect()) as conn: with conn.cursor() as cur: - cur.execute("SHOW neon.timeline_id") - cur.execute("CREATE TABLE foo (t text)") cur.execute( """ @@ -77,8 +75,6 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): with closing(pgmain.connect()) as conn: with conn.cursor() as cur: - cur.execute("SHOW neon.timeline_id") - res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) local_details = res["local"] assert ( @@ -254,7 +250,7 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -281,8 +277,8 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") - env.pageserver.safe_psql(f"compact {env.initial_tenant.hex} {new_timeline_id.hex}") + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + env.pageserver.safe_psql(f"compact {env.initial_tenant} {new_timeline_id}") assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -307,7 +303,7 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") pg.safe_psql( """ @@ -318,9 +314,9 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") - env.pageserver.safe_psql(f"do_gc {env.initial_tenant.hex} {new_timeline_id.hex} 0") + env.pageserver.safe_psql(f"do_gc {env.initial_tenant} {new_timeline_id} 0") assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -343,12 +339,12 @@ def test_timeline_size_metrics(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") # get the metrics and parse the metric for the current timeline's physical size metrics = env.pageserver.http_client().get_metrics() matches = re.search( - f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$', + f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', metrics, re.MULTILINE, ) @@ -361,7 +357,7 @@ def test_timeline_size_metrics(neon_simple_env: NeonEnv): # Check that the logical size metric is sane, and matches matches = re.search( - f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$', + f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} 
(\\S+)$', metrics, re.MULTILINE, ) @@ -389,7 +385,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): tenant, timeline = env.neon_cli.create_tenant() - def get_timeline_physical_size(timeline: UUID): + def get_timeline_physical_size(timeline: ZTimelineId): res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True) return res["local"]["current_physical_size_non_incremental"] @@ -408,7 +404,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, tenant, timeline) - env.pageserver.safe_psql(f"checkpoint {tenant.hex} {timeline.hex}") + env.pageserver.safe_psql(f"checkpoint {tenant} {timeline}") timeline_total_size += get_timeline_physical_size(timeline) @@ -418,7 +414,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): assert tenant_physical_size == timeline_total_size -def assert_physical_size(env: NeonEnv, tenant_id: UUID, timeline_id: UUID): +def assert_physical_size(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimelineId): """Check the current physical size returned from timeline API matches the total physical size of the timeline on disk""" client = env.pageserver.http_client() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 28daeb18ed..cd370e60c0 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -7,12 +7,10 @@ import subprocess import sys import threading import time -import uuid from contextlib import closing from dataclasses import dataclass, field from pathlib import Path from typing import Any, List, Optional -from uuid import uuid4 import pytest from fixtures.log_helper import log @@ -34,14 +32,19 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.utils import get_dir_size, lsn_from_hex, lsn_to_hex, query_scalar +from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.utils import get_dir_size, query_scalar def wait_lsn_force_checkpoint( - tenant_id: str, timeline_id: str, pg: Postgres, ps: NeonPageserver, pageserver_conn_options={} + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + pg: Postgres, + ps: NeonPageserver, + pageserver_conn_options={}, ): - lsn = lsn_from_hex(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - log.info(f"pg_current_wal_flush_lsn is {lsn_to_hex(lsn)}, waiting for it on pageserver") + lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") auth_token = None if "password" in pageserver_conn_options: @@ -50,8 +53,8 @@ def wait_lsn_force_checkpoint( # wait for the pageserver to catch up wait_for_last_record_lsn( ps.http_client(auth_token=auth_token), - uuid.UUID(hex=tenant_id), - uuid.UUID(hex=timeline_id), + tenant_id, + timeline_id, lsn, ) @@ -63,19 +66,19 @@ def wait_lsn_force_checkpoint( # ensure that remote_consistent_lsn is advanced wait_for_upload( ps.http_client(auth_token=auth_token), - uuid.UUID(hex=tenant_id), - uuid.UUID(hex=timeline_id), + tenant_id, + timeline_id, lsn, ) @dataclass class TimelineMetrics: - timeline_id: str - last_record_lsn: int + timeline_id: ZTimelineId + last_record_lsn: Lsn # One entry per each Safekeeper, order is the same - flush_lsns: List[int] = field(default_factory=list) - commit_lsns: List[int] = field(default_factory=list) + flush_lsns: List[Lsn] = field(default_factory=list) + commit_lsns: List[Lsn] = field(default_factory=list) # Run page server and 
multiple acceptors, and multiple compute nodes running @@ -123,7 +126,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): timeline_metrics = [] for timeline_detail in timeline_details: - timeline_id: str = timeline_detail["timeline_id"] + timeline_id = ZTimelineId(timeline_detail["timeline_id"]) local_timeline_detail = timeline_detail.get("local") if local_timeline_detail is None: @@ -132,11 +135,11 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): m = TimelineMetrics( timeline_id=timeline_id, - last_record_lsn=lsn_from_hex(local_timeline_detail["last_record_lsn"]), + last_record_lsn=Lsn(local_timeline_detail["last_record_lsn"]), ) for sk_m in sk_metrics: - m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)]) - m.commit_lsns.append(sk_m.commit_lsn_inexact[(tenant_id.hex, timeline_id)]) + m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)])) + m.commit_lsns.append(Lsn(sk_m.commit_lsn_inexact[(tenant_id, timeline_id)])) for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): # Invariant. May be < when transaction is in progress. @@ -216,7 +219,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): final_m = collect_metrics("after SELECT") # Assume that LSNs (a) behave similarly in all timelines; and (b) INSERT INTO alters LSN significantly. # Also assume that safekeepers will not be significantly out of sync in this test. - middle_lsn = (init_m[0].last_record_lsn + final_m[0].last_record_lsn) // 2 + middle_lsn = Lsn((int(init_m[0].last_record_lsn) + int(final_m[0].last_record_lsn)) // 2) assert max(init_m[0].flush_lsns) < middle_lsn < min(final_m[0].flush_lsns) assert max(init_m[0].commit_lsns) < middle_lsn < min(final_m[0].commit_lsns) assert max(init_m[1].flush_lsns) < middle_lsn < min(final_m[1].flush_lsns) @@ -270,8 +273,8 @@ def test_broker(neon_env_builder: NeonEnvBuilder): pg.safe_psql("CREATE TABLE t(key int primary key, value text)") # learn neon timeline from compute - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) # wait until remote_consistent_lsn gets advanced on all safekeepers clients = [sk.http_client() for sk in env.safekeepers] @@ -288,8 +291,7 @@ def test_broker(neon_env_builder: NeonEnvBuilder): while True: stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] if all( - lsn_from_hex(s_after.remote_consistent_lsn) - > lsn_from_hex(s_before.remote_consistent_lsn) + s_after.remote_consistent_lsn > s_before.remote_consistent_lsn for s_after, s_before in zip(stat_after, stat_before) ): break @@ -323,8 +325,8 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): ] ) - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) # force checkpoint to advance remote_consistent_lsn pageserver_conn_options = {} @@ -334,7 +336,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # We will wait for first segment removal. Make sure they exist for starter. 
first_segments = [ - os.path.join(sk.data_dir(), tenant_id, timeline_id, "000000010000000000000001") + os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id), "000000010000000000000001") for sk in env.safekeepers ] assert all(os.path.exists(p) for p in first_segments) @@ -346,7 +348,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): auth_token=env.auth_keys.generate_tenant_token(tenant_id) ) http_cli_other = env.safekeepers[0].http_client( - auth_token=env.auth_keys.generate_tenant_token(uuid4().hex) + auth_token=env.auth_keys.generate_tenant_token(ZTenantId.generate()) ) http_cli_noauth = env.safekeepers[0].http_client() @@ -367,7 +369,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): ) http_cli.record_safekeeper_info(tenant_id, timeline_id, {"backup_lsn": "FFFFFFFF/FEFFFFFF"}) assert ( - "FFFFFFFF/FEFFFFFF" + Lsn("FFFFFFFF/FEFFFFFF") == http_cli.timeline_status(tenant_id=tenant_id, timeline_id=timeline_id).backup_lsn ) @@ -382,14 +384,14 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): time.sleep(0.5) -def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end): +def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end: Lsn): started_at = time.time() http_cli = live_sk.http_client() while True: tli_status = http_cli.timeline_status(tenant_id, timeline_id) log.info(f"live sk status is {tli_status}") - if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end): + if tli_status.backup_lsn >= seg_end: break elapsed = time.time() - started_at if elapsed > 30: @@ -399,23 +401,22 @@ def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end): time.sleep(0.5) -def wait_wal_trim(tenant_id, timeline_id, sk, target_size): +def wait_wal_trim(tenant_id, timeline_id, sk, target_size_mb): started_at = time.time() http_cli = sk.http_client() while True: tli_status = http_cli.timeline_status(tenant_id, timeline_id) - sk_wal_size = ( - get_dir_size(os.path.join(sk.data_dir(), tenant_id, timeline_id)) / 1024 / 1024 - ) - log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size:.2f}MB status={tli_status}") + sk_wal_size = get_dir_size(os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id))) + sk_wal_size_mb = sk_wal_size / 1024 / 1024 + log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") - if sk_wal_size <= target_size: + if sk_wal_size_mb <= target_size_mb: break elapsed = time.time() - started_at if elapsed > 20: raise RuntimeError( - f"timed out waiting {elapsed:.0f}s for sk_id={sk.id} to trim WAL to {target_size:.2f}MB, current size is {sk_wal_size:.2f}MB" + f"timed out waiting {elapsed:.0f}s for sk_id={sk.id} to trim WAL to {target_size_mb:.2f}MB, current size is {sk_wal_size_mb:.2f}MB" ) time.sleep(0.5) @@ -437,8 +438,8 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot pg = env.postgres.create_start("test_safekeepers_wal_backup") # learn neon timeline from compute - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) pg_conn = pg.connect() cur = pg_conn.cursor() @@ -446,7 +447,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot # Shut down subsequently each of safekeepers and fill a segment while sk is # down; ensure segment gets offloaded by others. 
- offloaded_seg_end = ["0/2000000", "0/3000000", "0/4000000"] + offloaded_seg_end = [Lsn("0/2000000"), Lsn("0/3000000"), Lsn("0/4000000")] for victim, seg_end in zip(env.safekeepers, offloaded_seg_end): victim.stop() # roughly fills one segment @@ -465,7 +466,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute("insert into t select generate_series(1,250000), 'payload'") - wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], "0/5000000") + wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], Lsn("0/5000000")) @pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) @@ -492,8 +493,8 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re pg = env.postgres.create_start("test_s3_wal_replay") # learn neon timeline from compute - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) expected_sum = 0 @@ -503,7 +504,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re cur.execute("insert into t values (1, 'payload')") expected_sum += 1 - offloaded_seg_end = ["0/3000000"] + offloaded_seg_end = [Lsn("0/3000000")] for seg_end in offloaded_seg_end: # roughly fills two segments cur.execute("insert into t select generate_series(1,500000), 'payload'") @@ -517,7 +518,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re # advance remote_consistent_lsn to trigger WAL trimming # this LSN should be less than commit_lsn, so timeline will be active=true in safekeepers, to push etcd updates env.safekeepers[0].http_client().record_safekeeper_info( - tenant_id, timeline_id, {"remote_consistent_lsn": offloaded_seg_end[-1]} + tenant_id, timeline_id, {"remote_consistent_lsn": str(offloaded_seg_end[-1])} ) for sk in env.safekeepers: @@ -526,10 +527,10 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re last_lsn = query_scalar(cur, "SELECT pg_current_wal_flush_lsn()") - pageserver_lsn = env.pageserver.http_client().timeline_detail( - uuid.UUID(tenant_id), uuid.UUID((timeline_id)) - )["local"]["last_record_lsn"] - lag = lsn_from_hex(last_lsn) - lsn_from_hex(pageserver_lsn) + pageserver_lsn = env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["local"][ + "last_record_lsn" + ] + lag = Lsn(last_lsn) - Lsn(pageserver_lsn) log.info( f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb" ) @@ -554,10 +555,10 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re if elapsed > wait_lsn_timeout: raise RuntimeError("Timed out waiting for WAL redo") - pageserver_lsn = env.pageserver.http_client().timeline_detail( - uuid.UUID(tenant_id), uuid.UUID((timeline_id)) - )["local"]["last_record_lsn"] - lag = lsn_from_hex(last_lsn) - lsn_from_hex(pageserver_lsn) + pageserver_lsn = env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[ + "local" + ]["last_record_lsn"] + lag = Lsn(last_lsn) - Lsn(pageserver_lsn) if time.time() > last_debug_print + 10 or lag <= 0: last_debug_print = time.time() @@ -583,8 +584,8 @@ class ProposerPostgres(PgProtocol): self, pgdata_dir: str, pg_bin, - timeline_id: uuid.UUID, - tenant_id: uuid.UUID, + tenant_id: ZTenantId, + 
timeline_id: ZTimelineId, listen_addr: str, port: int, ): @@ -592,8 +593,8 @@ class ProposerPostgres(PgProtocol): self.pgdata_dir: str = pgdata_dir self.pg_bin: PgBin = pg_bin - self.timeline_id: uuid.UUID = timeline_id - self.tenant_id: uuid.UUID = tenant_id + self.tenant_id: ZTenantId = tenant_id + self.timeline_id: ZTimelineId = timeline_id self.listen_addr: str = listen_addr self.port: int = port @@ -613,8 +614,8 @@ class ProposerPostgres(PgProtocol): cfg = [ "synchronous_standby_names = 'walproposer'\n", "shared_preload_libraries = 'neon'\n", - f"neon.timeline_id = '{self.timeline_id.hex}'\n", - f"neon.tenant_id = '{self.tenant_id.hex}'\n", + f"neon.timeline_id = '{self.timeline_id}'\n", + f"neon.tenant_id = '{self.tenant_id}'\n", "neon.pageserver_connstring = ''\n", f"neon.safekeepers = '{safekeepers}'\n", f"listen_addresses = '{self.listen_addr}'\n", @@ -623,7 +624,7 @@ class ProposerPostgres(PgProtocol): f.writelines(cfg) - def sync_safekeepers(self) -> str: + def sync_safekeepers(self) -> Lsn: """ Run 'postgres --sync-safekeepers'. Returns execution result, which is commit_lsn after sync. @@ -639,7 +640,7 @@ class ProposerPostgres(PgProtocol): with open(stdout_filename, "r") as stdout_f: stdout = stdout_f.read() - return stdout.strip("\n ") + return Lsn(stdout.strip("\n ")) def initdb(self): """Run initdb""" @@ -671,18 +672,18 @@ def test_sync_safekeepers( neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - timeline_id = uuid.uuid4() - tenant_id = uuid.uuid4() + tenant_id = ZTenantId.generate() + timeline_id = ZTimelineId.generate() # write config for proposer pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata") pg = ProposerPostgres( - pgdata_dir, pg_bin, timeline_id, tenant_id, "127.0.0.1", port_distributor.get_port() + pgdata_dir, pg_bin, tenant_id, timeline_id, "127.0.0.1", port_distributor.get_port() ) pg.create_dir_config(env.get_safekeeper_connstrs()) # valid lsn, which is not in the segment start, nor in zero segment - epoch_start_lsn = 0x16B9188 # 0/16B9188 + epoch_start_lsn = Lsn("0/16B9188") begin_lsn = epoch_start_lsn # append and commit WAL @@ -697,14 +698,14 @@ def test_sync_safekeepers( "set_commit_lsn": True, "send_proposer_elected": True, "term": 2, - "begin_lsn": begin_lsn, - "epoch_start_lsn": epoch_start_lsn, - "truncate_lsn": epoch_start_lsn, + "begin_lsn": int(begin_lsn), + "epoch_start_lsn": int(epoch_start_lsn), + "truncate_lsn": int(epoch_start_lsn), }, ) - lsn_hex = lsn_to_hex(res["inserted_wal"]["end_lsn"]) - lsn_after_append.append(lsn_hex) - log.info(f"safekeeper[{i}] lsn after append: {lsn_hex}") + lsn = Lsn(res["inserted_wal"]["end_lsn"]) + lsn_after_append.append(lsn) + log.info(f"safekeeper[{i}] lsn after append: {lsn}") # run sync safekeepers lsn_after_sync = pg.sync_safekeepers() @@ -724,8 +725,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): wa = env.safekeepers[0] # learn neon timeline from compute - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) if not auth_enabled: wa_http_cli = wa.http_client() @@ -734,7 +735,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): wa_http_cli = wa.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) wa_http_cli.check_status() wa_http_cli_bad = wa.http_client( - 
auth_token=env.auth_keys.generate_tenant_token(uuid4().hex) + auth_token=env.auth_keys.generate_tenant_token(ZTenantId.generate()) ) wa_http_cli_bad.check_status() wa_http_cli_noauth = wa.http_client() @@ -784,15 +785,15 @@ class SafekeeperEnv: self.bin_safekeeper = os.path.join(str(neon_binpath), "safekeeper") self.safekeepers: Optional[List[subprocess.CompletedProcess[Any]]] = None self.postgres: Optional[ProposerPostgres] = None - self.tenant_id: Optional[uuid.UUID] = None - self.timeline_id: Optional[uuid.UUID] = None + self.tenant_id: Optional[ZTenantId] = None + self.timeline_id: Optional[ZTimelineId] = None def init(self) -> "SafekeeperEnv": assert self.postgres is None, "postgres is already initialized" assert self.safekeepers is None, "safekeepers are already initialized" - self.timeline_id = uuid.uuid4() - self.tenant_id = uuid.uuid4() + self.tenant_id = ZTenantId.generate() + self.timeline_id = ZTimelineId.generate() self.repo_dir.mkdir(exist_ok=True) # Create config and a Safekeeper object for each safekeeper @@ -841,8 +842,8 @@ class SafekeeperEnv: pg = ProposerPostgres( pgdata_dir, self.pg_bin, - self.timeline_id, self.tenant_id, + self.timeline_id, "127.0.0.1", self.port_distributor.get_port(), ) @@ -911,7 +912,9 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): sum_after = query_scalar(cur, "SELECT SUM(key) FROM t") assert sum_after == sum_before + 5000050000 - def show_statuses(safekeepers: List[Safekeeper], tenant_id: str, timeline_id: str): + def show_statuses( + safekeepers: List[Safekeeper], tenant_id: ZTenantId, timeline_id: ZTimelineId + ): for sk in safekeepers: http_cli = sk.http_client() try: @@ -932,8 +935,8 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): pg.start() # learn neon timeline from compute - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) execute_payload(pg) show_statuses(env.safekeepers, tenant_id, timeline_id) @@ -985,20 +988,21 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): # of WAL segments. 
def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): # used to calculate delta in collect_stats - last_lsn = 0.0 + last_lsn = Lsn(0) - # returns LSN and pg_wal size, all in MB + # returns pg_wal size in MB def collect_stats(pg: Postgres, cur, enable_logs=True): nonlocal last_lsn assert pg.pgdata_dir is not None log.info("executing INSERT to generate WAL") - current_lsn = lsn_from_hex(query_scalar(cur, "select pg_current_wal_lsn()")) / 1024 / 1024 - pg_wal_size = get_dir_size(os.path.join(pg.pgdata_dir, "pg_wal")) / 1024 / 1024 + current_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + pg_wal_size_mb = get_dir_size(os.path.join(pg.pgdata_dir, "pg_wal")) / 1024 / 1024 if enable_logs: - log.info(f"LSN delta: {current_lsn - last_lsn} MB, current WAL size: {pg_wal_size} MB") + lsn_delta_mb = (current_lsn - last_lsn) / 1024 / 1024 + log.info(f"LSN delta: {lsn_delta_mb} MB, current WAL size: {pg_wal_size_mb} MB") last_lsn = current_lsn - return current_lsn, pg_wal_size + return pg_wal_size_mb # generates about ~20MB of WAL, to create at least one new segment def generate_wal(cur): @@ -1027,7 +1031,7 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): log.info("executing checkpoint") cur.execute("CHECKPOINT") - wal_size_after_checkpoint = collect_stats(pg, cur)[1] + wal_size_after_checkpoint = collect_stats(pg, cur) # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) assert wal_size_after_checkpoint < 16 * 2.5 @@ -1040,22 +1044,20 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): env = neon_env_builder.init_start() # Create two tenants: one will be deleted, other should be preserved. - tenant_id = env.initial_tenant.hex - timeline_id_1 = env.neon_cli.create_branch("br1").hex # Active, delete explicitly - timeline_id_2 = env.neon_cli.create_branch("br2").hex # Inactive, delete explicitly - timeline_id_3 = env.neon_cli.create_branch("br3").hex # Active, delete with the tenant - timeline_id_4 = env.neon_cli.create_branch("br4").hex # Inactive, delete with the tenant + tenant_id = env.initial_tenant + timeline_id_1 = env.neon_cli.create_branch("br1") # Active, delete explicitly + timeline_id_2 = env.neon_cli.create_branch("br2") # Inactive, delete explicitly + timeline_id_3 = env.neon_cli.create_branch("br3") # Active, delete with the tenant + timeline_id_4 = env.neon_cli.create_branch("br4") # Inactive, delete with the tenant - tenant_id_other_uuid, timeline_id_other_uuid = env.neon_cli.create_tenant() - tenant_id_other = tenant_id_other_uuid.hex - timeline_id_other = timeline_id_other_uuid.hex + tenant_id_other, timeline_id_other = env.neon_cli.create_tenant() # Populate branches pg_1 = env.postgres.create_start("br1") pg_2 = env.postgres.create_start("br2") pg_3 = env.postgres.create_start("br3") pg_4 = env.postgres.create_start("br4") - pg_other = env.postgres.create_start("main", tenant_id=uuid.UUID(hex=tenant_id_other)) + pg_other = env.postgres.create_start("main", tenant_id=tenant_id_other) for pg in [pg_1, pg_2, pg_3, pg_4, pg_other]: with closing(pg.connect()) as conn: with conn.cursor() as cur: @@ -1071,11 +1073,11 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) ) sk_http_noauth = sk.http_client() - assert (sk_data_dir / tenant_id / timeline_id_1).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_3).is_dir() - 
assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. pg_2.stop_and_destroy() @@ -1094,22 +1096,22 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): "dir_existed": True, "was_active": True, } - assert not (sk_data_dir / tenant_id / timeline_id_1).exists() - assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_3).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Ensure repeated deletion succeeds assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == { "dir_existed": False, "was_active": False, } - assert not (sk_data_dir / tenant_id / timeline_id_1).exists() - assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_3).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() if auth_enabled: # Ensure we cannot delete the other tenant @@ -1118,44 +1120,44 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert sk_h.timeline_delete_force(tenant_id_other, timeline_id_other) with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): assert sk_h.tenant_delete_force(tenant_id_other) - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove initial tenant's br2 (inactive) assert sk_http.timeline_delete_force(tenant_id, timeline_id_2) == { "dir_existed": True, "was_active": False, } - assert not (sk_data_dir / tenant_id / timeline_id_1).exists() - assert not (sk_data_dir / tenant_id / timeline_id_2).exists() - assert (sk_data_dir / tenant_id / timeline_id_3).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / 
str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove non-existing branch, should succeed - assert sk_http.timeline_delete_force(tenant_id, "00" * 16) == { + assert sk_http.timeline_delete_force(tenant_id, ZTimelineId("00" * 16)) == { "dir_existed": False, "was_active": False, } - assert not (sk_data_dir / tenant_id / timeline_id_1).exists() - assert not (sk_data_dir / tenant_id / timeline_id_2).exists() - assert (sk_data_dir / tenant_id / timeline_id_3).exists() - assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove initial tenant fully (two branches are active) response = sk_http.tenant_delete_force(tenant_id) - assert response[timeline_id_3] == { + assert response[str(timeline_id_3)] == { "dir_existed": True, "was_active": True, } - assert not (sk_data_dir / tenant_id).exists() - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove initial tenant again. response = sk_http.tenant_delete_force(tenant_id) assert response == {} - assert not (sk_data_dir / tenant_id).exists() - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Ensure the other tenant still works sk_http_other.timeline_status(tenant_id_other, timeline_id_other) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 83285e0cbe..e36d3cf94b 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -1,14 +1,13 @@ import asyncio import random import time -import uuid from dataclasses import dataclass from typing import List, Optional import asyncpg from fixtures.log_helper import getLogger from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper -from fixtures.utils import lsn_from_hex, lsn_to_hex +from fixtures.types import Lsn, ZTenantId, ZTimelineId log = getLogger("root.safekeeper_async") @@ -104,9 +103,9 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou async def wait_for_lsn( safekeeper: Safekeeper, - tenant_id: str, - timeline_id: str, - wait_lsn: str, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + wait_lsn: Lsn, polling_interval=1, timeout=60, ): @@ -124,7 +123,7 @@ async def wait_for_lsn( f"Safekeeper at port {safekeeper.port.pg} has flush_lsn {flush_lsn}, waiting for lsn {wait_lsn}" ) - while lsn_from_hex(wait_lsn) > lsn_from_hex(flush_lsn): + while wait_lsn > flush_lsn: elapsed = time.time() - started_at if elapsed > timeout: raise RuntimeError( @@ -156,8 +155,8 @@ async def run_restarts_under_load( test_timeout_at = time.monotonic() + 5 * 60 pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show neon.tenant_id") - timeline_id = await pg_conn.fetchval("show neon.timeline_id") + tenant_id = ZTenantId(await pg_conn.fetchval("show neon.tenant_id")) + timeline_id 
= ZTimelineId(await pg_conn.fetchval("show neon.timeline_id")) bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount) # create tables and initial balances @@ -176,14 +175,15 @@ async def run_restarts_under_load( victim = acceptors[victim_idx] victim.stop() - flush_lsn = await pg_conn.fetchval("SELECT pg_current_wal_flush_lsn()") - flush_lsn = lsn_to_hex(flush_lsn) + flush_lsn = Lsn(await pg_conn.fetchval("SELECT pg_current_wal_flush_lsn()")) log.info(f"Postgres flush_lsn {flush_lsn}") - pageserver_lsn = env.pageserver.http_client().timeline_detail( - uuid.UUID(tenant_id), uuid.UUID((timeline_id)) - )["local"]["last_record_lsn"] - sk_ps_lag = lsn_from_hex(flush_lsn) - lsn_from_hex(pageserver_lsn) + pageserver_lsn = Lsn( + env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["local"][ + "last_record_lsn" + ] + ) + sk_ps_lag = flush_lsn - pageserver_lsn log.info(f"Pageserver last_record_lsn={pageserver_lsn} lag={sk_ps_lag / 1024}kb") # Wait until alive safekeepers catch up with postgres diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 0847b5a505..6fd509c4d1 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -9,6 +9,7 @@ from fixtures.neon_fixtures import ( base_dir, pg_distrib_dir, ) +from fixtures.types import ZTenantId def test_wal_restore( @@ -21,7 +22,7 @@ def test_wal_restore( env.neon_cli.create_branch("test_wal_restore") pg = env.postgres.create_start("test_wal_restore") pg.safe_psql("create table t as select generate_series(1,300000)") - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) env.neon_cli.pageserver_stop() port = port_distributor.get_port() data_dir = test_output_dir / "pgsql.restored" From 8a7333438a566a32a99f41bc238b5d596eab2cda Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 2 Sep 2022 11:58:28 +0300 Subject: [PATCH 0716/1022] Extract common remote storage operations into GenericRemoteStorage (#2373) --- libs/remote_storage/src/lib.rs | 96 +++++++++++++++++++++++++ pageserver/src/storage_sync/download.rs | 39 ++-------- pageserver/src/storage_sync/upload.rs | 58 ++------------- safekeeper/src/wal_backup.rs | 78 ++++++-------------- 4 files changed, 128 insertions(+), 143 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index d5ad2f8633..8a10e098a1 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -164,6 +164,102 @@ impl GenericRemoteStorage { _ => None, } } + + /// Takes storage object contents and its size and uploads to remote storage, + /// mapping `from_path` to the corresponding remote object id in the storage. + /// + /// The storage object does not have to be present on the `from_path`, + /// this path is used for the remote object id conversion only. 
+ pub async fn upload_storage_object( + &self, + from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, + from_size_bytes: usize, + from_path: &Path, + ) -> anyhow::Result<()> { + async fn do_upload_storage_object( + storage: &S, + from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, + from_size_bytes: usize, + from_path: &Path, + ) -> anyhow::Result<()> + where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, + { + let target_storage_path = storage.remote_object_id(from_path).with_context(|| { + format!( + "Failed to get the storage path for source local path '{}'", + from_path.display() + ) + })?; + + storage + .upload(from, from_size_bytes, &target_storage_path, None) + .await + .with_context(|| { + format!( + "Failed to upload from '{}' to storage path '{:?}'", + from_path.display(), + target_storage_path + ) + }) + } + + match self { + GenericRemoteStorage::Local(storage) => { + do_upload_storage_object(storage, from, from_size_bytes, from_path).await + } + GenericRemoteStorage::S3(storage) => { + do_upload_storage_object(storage, from, from_size_bytes, from_path).await + } + } + } + + /// Downloads the storage object into the `to_path` provided. + /// `byte_range` could be specified to dowload only a part of the file, if needed. + pub async fn download_storage_object( + &self, + byte_range: Option<(u64, Option)>, + to_path: &Path, + ) -> Result { + async fn do_download_storage_object( + storage: &S, + byte_range: Option<(u64, Option)>, + to_path: &Path, + ) -> Result + where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, + { + let remote_object_path = storage + .remote_object_id(to_path) + .with_context(|| { + format!( + "Failed to get the storage path for target local path '{}'", + to_path.display() + ) + }) + .map_err(DownloadError::BadInput)?; + + match byte_range { + Some((start, end)) => { + storage + .download_byte_range(&remote_object_path, start, end) + .await + } + None => storage.download(&remote_object_path).await, + } + } + + match self { + GenericRemoteStorage::Local(storage) => { + do_download_storage_object(storage, byte_range, to_path).await + } + GenericRemoteStorage::S3(storage) => { + do_download_storage_object(storage, byte_range, to_path).await + } + } + } } /// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. 
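The new helpers let call sites stop matching on the Local/S3 variants themselves. As an illustrative sketch only (not part of the patch): assuming an already configured GenericRemoteStorage handle and a purely hypothetical local path, an upload followed by a ranged download now looks roughly like the updated call sites in download.rs, upload.rs and wal_backup.rs below.

use std::path::Path;

use anyhow::Context;
use remote_storage::GenericRemoteStorage;

// Sketch of a caller after this refactoring; `storage` is assumed to be
// initialized elsewhere (Local or S3), and the path below is hypothetical.
async fn roundtrip_example(storage: &GenericRemoteStorage) -> anyhow::Result<()> {
    let local_path = Path::new("/tmp/example/000000010000000000000001");

    // Upload: the helper maps the local path to the remote object id and
    // dispatches to the right backend internally.
    let file = tokio::fs::File::open(local_path).await?;
    let size = file.metadata().await?.len() as usize;
    storage
        .upload_storage_object(tokio::io::BufReader::new(file), size, local_path)
        .await?;

    // Download: the optional byte range fetches only part of the object,
    // e.g. `Some((offset, None))` as in wal_backup::read_object.
    let download = storage
        .download_storage_object(Some((0, None)), local_path)
        .await
        .context("failed to start download")?;
    let _stream = download.download_stream;
    Ok(())
}

The per-backend match now lives in one place inside GenericRemoteStorage, which is what allows the download.rs, upload.rs and wal_backup.rs hunks that follow to delete their duplicated Local/S3 branches.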
diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index ded4c042c4..ebc9a252b7 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -10,7 +10,7 @@ use std::{ use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; use remote_storage::{ - path_with_suffix_extension, Download, DownloadError, GenericRemoteStorage, RemoteStorage, + path_with_suffix_extension, DownloadError, GenericRemoteStorage, RemoteStorage, }; use tokio::{ fs, @@ -143,7 +143,9 @@ async fn download_index_part( let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - let mut index_part_download = download_storage_object(storage, &index_part_path).await?; + let mut index_part_download = storage + .download_storage_object(None, &index_part_path) + .await?; let mut index_part_bytes = Vec::new(); io::copy( @@ -262,7 +264,7 @@ pub(super) async fn download_timeline_layers<'a>( ) })?; - let mut layer_download = download_storage_object(storage, &layer_destination_path) + let mut layer_download = storage.download_storage_object(None, &layer_destination_path) .await .with_context(|| { format!( @@ -365,37 +367,6 @@ pub(super) async fn download_timeline_layers<'a>( } } -async fn download_storage_object( - storage: &GenericRemoteStorage, - to_path: &Path, -) -> Result { - async fn do_download_storage_object( - storage: &S, - to_path: &Path, - ) -> Result - where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, - { - let remote_object_path = storage - .remote_object_id(to_path) - .with_context(|| { - format!( - "Failed to get the storage path for target local path '{}'", - to_path.display() - ) - }) - .map_err(DownloadError::BadInput)?; - - storage.download(&remote_object_path).await - } - - match storage { - GenericRemoteStorage::Local(storage) => do_download_storage_object(storage, to_path).await, - GenericRemoteStorage::S3(storage) => do_download_storage_object(storage, to_path).await, - } -} - async fn get_timeline_sync_ids( storage: &GenericRemoteStorage, tenant_path: &Path, diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index a8c768e0ae..7ef775e690 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -1,14 +1,11 @@ //! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints. -use std::{ - fmt::Debug, - path::{Path, PathBuf}, -}; +use std::{fmt::Debug, path::PathBuf}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; use once_cell::sync::Lazy; -use remote_storage::{GenericRemoteStorage, RemoteStorage}; +use remote_storage::GenericRemoteStorage; use tokio::fs; use tracing::{debug, error, info, warn}; @@ -47,7 +44,8 @@ pub(super) async fn upload_index_part( let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - upload_storage_object(storage, index_part_bytes, index_part_size, &index_part_path) + storage + .upload_storage_object(index_part_bytes, index_part_size, &index_part_path) .await .with_context(|| format!("Failed to upload index part for '{sync_id}'")) } @@ -131,7 +129,8 @@ pub(super) async fn upload_timeline_layers<'a>( .map_err(UploadError::Other)? 
.len() as usize; - match upload_storage_object(storage, source_file, source_size, &source_path) + match storage + .upload_storage_object(source_file, source_size, &source_path) .await .with_context(|| format!("Failed to upload layer file for {sync_id}")) { @@ -193,51 +192,6 @@ pub(super) async fn upload_timeline_layers<'a>( } } -async fn upload_storage_object( - storage: &GenericRemoteStorage, - from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, - from_size_bytes: usize, - from_path: &Path, -) -> anyhow::Result<()> { - async fn do_upload_storage_object( - storage: &S, - from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, - from_size_bytes: usize, - from_path: &Path, - ) -> anyhow::Result<()> - where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, - { - let target_storage_path = storage.remote_object_id(from_path).with_context(|| { - format!( - "Failed to get the storage path for source local path '{}'", - from_path.display() - ) - })?; - - storage - .upload(from, from_size_bytes, &target_storage_path, None) - .await - .with_context(|| { - format!( - "Failed to upload from '{}' to storage path '{:?}'", - from_path.display(), - target_storage_path - ) - }) - } - - match storage { - GenericRemoteStorage::Local(storage) => { - do_upload_storage_object(storage, from, from_size_bytes, from_path).await - } - GenericRemoteStorage::S3(storage) => { - do_upload_storage_object(storage, from, from_size_bytes, from_path).await - } - } -} - enum UploadError { MissingLocalFile(PathBuf, anyhow::Error), Other(anyhow::Error), diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 3552452470..a15ba02863 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -13,7 +13,7 @@ use std::time::Duration; use postgres_ffi::v14::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr}; use postgres_ffi::PG_TLI; -use remote_storage::{GenericRemoteStorage, RemoteStorage}; +use remote_storage::GenericRemoteStorage; use tokio::fs::File; use tokio::runtime::Builder; @@ -419,73 +419,37 @@ static REMOTE_STORAGE: OnceCell> = OnceCell::new(); async fn backup_object(source_file: &Path, size: usize) -> Result<()> { let storage = REMOTE_STORAGE.get().expect("failed to get remote storage"); - let file = File::open(&source_file).await?; + let file = tokio::io::BufReader::new(File::open(&source_file).await.with_context(|| { + format!( + "Failed to open file {} for wal backup", + source_file.display() + ) + })?); - // Storage is initialized by launcher at this point. - match storage.as_ref().unwrap() { - GenericRemoteStorage::Local(local_storage) => { - let destination = local_storage.remote_object_id(source_file)?; - - debug!( - "local upload about to start from {} to {}", - source_file.display(), - destination.display() - ); - local_storage.upload(file, size, &destination, None).await - } - GenericRemoteStorage::S3(s3_storage) => { - let s3key = s3_storage.remote_object_id(source_file)?; - - debug!( - "S3 upload about to start from {} to {:?}", - source_file.display(), - s3key - ); - s3_storage.upload(file, size, &s3key, None).await - } - }?; - - Ok(()) + storage + .as_ref() + .expect("Storage should be initialized by launcher at this point.") + .upload_storage_object(file, size, source_file) + .await } pub async fn read_object( file_path: PathBuf, offset: u64, ) -> anyhow::Result>> { - let download = match REMOTE_STORAGE + let download = REMOTE_STORAGE .get() .context("Failed to get remote storage")? 
.as_ref() .context("No remote storage configured")? - { - GenericRemoteStorage::Local(local_storage) => { - let source = local_storage.remote_object_id(&file_path)?; - - info!( - "local download about to start from {} at offset {}", - source.display(), - offset - ); - local_storage - .download_byte_range(&source, offset, None) - .await - } - GenericRemoteStorage::S3(s3_storage) => { - let s3key = s3_storage.remote_object_id(&file_path)?; - - info!( - "S3 download about to start from {:?} at offset {}", - s3key, offset - ); - s3_storage.download_byte_range(&s3key, offset, None).await - } - } - .with_context(|| { - format!( - "Failed to open WAL segment download stream for local storage path {}", - file_path.display() - ) - })?; + .download_storage_object(Some((offset, None)), &file_path) + .await + .with_context(|| { + format!( + "Failed to open WAL segment download stream for local storage path {}", + file_path.display() + ) + })?; Ok(download.download_stream) } From f78a542cbad53d3cb12b2655ec71abfb51ebc22a Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 23 Aug 2022 23:58:49 +0300 Subject: [PATCH 0717/1022] Calculate timeline initial logical size in the background Start the calculation on the first size request, return partially calculated size during calculation, retry if failed. Remove "fast" size init through the ancestor: the current approach is fast enough for now and there are better ways to optimize the calculation via incremental ancestor size computation --- pageserver/src/http/routes.rs | 10 +- pageserver/src/layered_repository.rs | 48 ++- pageserver/src/layered_repository/timeline.rs | 323 ++++++++++++------ pageserver/src/pgdatadir_mapping.rs | 6 +- pageserver/src/tenant_mgr.rs | 30 +- .../src/walreceiver/walreceiver_connection.rs | 10 +- test_runner/regress/test_broken_timeline.py | 18 +- test_runner/regress/test_timeline_size.py | 54 ++- 8 files changed, 324 insertions(+), 175 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ef18129504..710014de98 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -75,7 +75,7 @@ fn get_config(request: &Request) -> &'static PageServerConf { // Helper functions to construct a LocalTimelineInfo struct for a timeline fn local_timeline_info_from_loaded_timeline( - timeline: &Timeline, + timeline: &Arc, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> anyhow::Result { @@ -106,7 +106,11 @@ fn local_timeline_info_from_loaded_timeline( prev_record_lsn: Some(timeline.get_prev_record_lsn()), latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), timeline_state: LocalTimelineState::Loaded, - current_logical_size: Some(timeline.get_current_logical_size()), + current_logical_size: Some( + timeline + .get_current_logical_size() + .context("Timeline info creation failed to get current logical size")?, + ), current_physical_size: Some(timeline.get_physical_size()), current_logical_size_non_incremental: if include_non_incremental_logical_size { Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?) @@ -212,7 +216,7 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. 
- let local_info = local_timeline_info_from_loaded_timeline(new_timeline.as_ref(), false, false)?; + let local_info = local_timeline_info_from_loaded_timeline(&new_timeline, false, false)?; Ok(Some(TimelineInfo { tenant_id, timeline_id: new_timeline_id, diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 73c30b51b8..9d405b0033 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -136,14 +136,11 @@ impl Repository { } /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded. - pub fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result> { + pub fn get_timeline_load(&self, timeline_id: ZTimelineId) -> Result> { let mut timelines = self.timelines.lock().unwrap(); - match self.get_timeline_load_internal(timelineid, &mut timelines)? { + match self.get_timeline_load_internal(timeline_id, &mut timelines)? { Some(local_loaded_timeline) => Ok(local_loaded_timeline), - None => anyhow::bail!( - "cannot get local timeline: unknown timeline id: {}", - timelineid - ), + None => anyhow::bail!("cannot get local timeline, unknown timeline id: {timeline_id}"), } } @@ -559,33 +556,34 @@ impl Repository { timeline_id: ZTimelineId, timelines: &mut HashMap, ) -> anyhow::Result>> { - match timelines.get(&timeline_id) { + Ok(match timelines.get(&timeline_id) { Some(entry) => match entry { LayeredTimelineEntry::Loaded(local_timeline) => { debug!("timeline {timeline_id} found loaded into memory"); - return Ok(Some(Arc::clone(local_timeline))); + Some(Arc::clone(local_timeline)) + } + LayeredTimelineEntry::Unloaded { .. } => { + debug!( + "timeline {timeline_id} found on a local disk, but not loaded into the memory, loading" + ); + let timeline = self.load_local_timeline(timeline_id, timelines)?; + let was_loaded = timelines.insert( + timeline_id, + LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), + ); + ensure!( + was_loaded.is_none() + || matches!(was_loaded, Some(LayeredTimelineEntry::Unloaded { .. })), + "assertion failure, inserted wrong timeline in an incorrect state" + ); + Some(timeline) } - LayeredTimelineEntry::Unloaded { .. } => {} }, None => { debug!("timeline {timeline_id} not found"); - return Ok(None); + None } - }; - debug!( - "timeline {timeline_id} found on a local disk, but not loaded into the memory, loading" - ); - let timeline = self.load_local_timeline(timeline_id, timelines)?; - let was_loaded = timelines.insert( - timeline_id, - LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), - ); - ensure!( - was_loaded.is_none() - || matches!(was_loaded, Some(LayeredTimelineEntry::Unloaded { .. 
})), - "assertion failure, inserted wrong timeline in an incorrect state" - ); - Ok(Some(timeline)) + }) } fn load_local_timeline( diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 8b90cc4e6b..fd719812a3 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -5,17 +5,17 @@ use bytes::Bytes; use fail::fail_point; use itertools::Itertools; use metrics::core::{AtomicU64, GenericCounter}; -use once_cell::sync::Lazy; +use once_cell::sync::{Lazy, OnceCell}; use tracing::*; use std::cmp::{max, min, Ordering}; use std::collections::{HashMap, HashSet}; -use std::fs; use std::ops::{Deref, Range}; use std::path::PathBuf; use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError}; +use std::sync::{mpsc, Arc, Mutex, MutexGuard, RwLock, TryLockError}; use std::time::{Duration, Instant, SystemTime}; +use std::{fs, thread}; use metrics::{ register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, @@ -137,13 +137,13 @@ static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { - register_int_gauge_vec!( +static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( "pageserver_current_logical_size", "Current logical size grouped by timeline", &["tenant_id", "timeline_id"] ) - .expect("failed to define a metric") + .expect("failed to define current logical size metric") }); // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, @@ -242,7 +242,7 @@ struct TimelineMetrics { pub wait_lsn_time_histo: Histogram, pub current_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size - pub current_logical_size_gauge: IntGauge, + pub current_logical_size_gauge: UIntGauge, } impl TimelineMetrics { @@ -389,6 +389,37 @@ pub struct Timeline { repartition_threshold: u64, /// Current logical size of the "datadir", at the last LSN. + current_logical_size: LogicalSize, + // TODO task management should be done outside timeline, managed along with other tasks. + #[allow(clippy::type_complexity)] + initial_size_computation_task: + Mutex>, mpsc::Receiver<()>)>>, + + /// Information about the last processed message by the WAL receiver, + /// or None if WAL receiver has not received anything for this timeline + /// yet. + pub last_received_wal: Mutex>, + + /// Relation size cache + pub rel_size_cache: RwLock>, +} + +/// Internal structure to hold all data needed for logical size calculation. +/// Calculation consists of two parts: +/// 1. Initial size calculation. That might take a long time, because it requires +/// reading all layers containing relation sizes up to the `initial_part_end`. +/// 2. Collecting an incremental part and adding that to the initial size. +/// Increments are appended on walreceiver writing new timeline data, +/// which result in increase or decrease of the logical size. +struct LogicalSize { + /// Size, potentially slow to compute, derived from all layers located locally on this node's FS. + /// Might require reading multiple layers, and even ancestor's layers, to collect the size. + /// + /// NOTE: initial size is not a constant and will change between restarts. + initial_logical_size: OnceCell, + /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines. 
+ initial_part_end: Option, + /// All other size changes after startup, combined together. /// /// Size shouldn't ever be negative, but this is signed for two reasons: /// @@ -407,22 +438,82 @@ pub struct Timeline { /// /// Note that we also expose a copy of this value as a prometheus metric, /// see `current_logical_size_gauge`. Use the `update_current_logical_size` - /// and `set_current_logical_size` functions to modify this, they will - /// also keep the prometheus metric in sync. - current_logical_size: AtomicI64, - // TODO we don't have a good, API to ensure on a compilation level - // that the timeline passes all initialization. - // Hence we ensure that we init at least once for every timeline - // and keep this flag to avoid potentually long recomputes. - logical_size_initialized: AtomicBool, + /// to modify this, it will also keep the prometheus metric in sync. + size_added_after_initial: AtomicI64, +} - /// Information about the last processed message by the WAL receiver, - /// or None if WAL receiver has not received anything for this timeline - /// yet. - pub last_received_wal: Mutex>, +/// Normalized current size, that the data in pageserver occupies. +#[derive(Debug, Clone, Copy)] +enum CurrentLogicalSize { + /// The size is not yet calculated to the end, this is an intermediate result, + /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative, + /// yet total logical size cannot be below 0. + Approximate(u64), + // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are + // available for observation without any calculations. + Exact(u64), +} - /// Relation size cache - pub rel_size_cache: RwLock>, +impl CurrentLogicalSize { + fn size(&self) -> u64 { + *match self { + Self::Approximate(size) => size, + Self::Exact(size) => size, + } + } +} + +impl LogicalSize { + fn empty_initial() -> Self { + Self { + initial_logical_size: OnceCell::with_value(0), + initial_part_end: None, + size_added_after_initial: AtomicI64::new(0), + } + } + + fn deferred_initial(compute_to: Lsn) -> Self { + Self { + initial_logical_size: OnceCell::new(), + initial_part_end: Some(compute_to), + size_added_after_initial: AtomicI64::new(0), + } + } + + fn current_size(&self) -> anyhow::Result { + let size_increment = self.size_added_after_initial.load(AtomicOrdering::Acquire); + match self.initial_logical_size.get() { + Some(initial_size) => { + let absolute_size_increment = u64::try_from( + size_increment + .checked_abs() + .with_context(|| format!("Size added after initial {size_increment} is not expected to be i64::MIN"))?, + ).with_context(|| format!("Failed to convert size increment {size_increment} to u64"))?; + + if size_increment < 0 { + initial_size.checked_sub(absolute_size_increment) + } else { + initial_size.checked_add(absolute_size_increment) + }.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}")) + .map(CurrentLogicalSize::Exact) + } + None => { + let non_negative_size_increment = size_increment.max(0); + u64::try_from(non_negative_size_increment) + .with_context(|| { + format!( + "Failed to convert size increment {non_negative_size_increment} to u64" + ) + }) + .map(CurrentLogicalSize::Approximate) + } + } + } + + fn increment_size(&self, delta: i64) { + self.size_added_after_initial + .fetch_add(delta, AtomicOrdering::SeqCst); + } } pub struct WalReceiverInfo { @@ -491,7 +582,9 @@ impl 
Timeline { /// the Repository implementation may incorrectly return a value from an ancestor /// branch, for example, or waste a lot of cycles chasing the non-existing key. /// - pub fn get(&self, key: Key, lsn: Lsn) -> Result { + pub fn get(&self, key: Key, lsn: Lsn) -> anyhow::Result { + anyhow::ensure!(lsn.is_valid(), "Invalid LSN"); + // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image // and requested LSN. The cached image can also be used to reduce the amount of WAL needed @@ -694,6 +787,8 @@ impl Timeline { walredo_mgr: Arc, upload_layers: bool, ) -> Timeline { + let disk_consistent_lsn = metadata.disk_consistent_lsn(); + let mut result = Timeline { conf, tenant_conf, @@ -705,12 +800,12 @@ impl Timeline { // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. last_record_lsn: SeqWait::new(RecordLsn { - last: metadata.disk_consistent_lsn(), + last: disk_consistent_lsn, prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), }), - disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), + disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), - last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0), + last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), last_freeze_ts: RwLock::new(Instant::now()), ancestor_timeline: ancestor, @@ -733,8 +828,16 @@ impl Timeline { latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), - current_logical_size: AtomicI64::new(0), - logical_size_initialized: AtomicBool::new(false), + current_logical_size: if disk_consistent_lsn.is_valid() { + // we're creating timeline data with some layer files existing locally, + // need to recalculate timeline's logical size based on data in the layers. + LogicalSize::deferred_initial(disk_consistent_lsn) + } else { + // we're creating timeline data without any layers existing locally, + // initial logical size is 0. + LogicalSize::empty_initial() + }, + initial_size_computation_task: Mutex::new(None), partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, @@ -835,92 +938,114 @@ impl Timeline { Ok(()) } - /// (Re-)calculate the logical size of the database at the latest LSN. + /// Retrieve current logical size of the timeline. /// - /// This can be a slow operation. - pub fn init_logical_size(&self) -> Result<()> { - if self.logical_size_initialized.load(AtomicOrdering::Acquire) { - return Ok(()); - } + /// The size could be lagging behind the actual number, in case + /// the initial size calculation has not been run (gets triggered on the first size access). + pub fn get_current_logical_size(self: &Arc) -> anyhow::Result { + let current_size = self.current_logical_size.current_size()?; + debug!("Current size: {current_size:?}"); - // Try a fast-path first: - // Copy logical size from ancestor timeline if there has been no changes on this - // branch, and no changes on the ancestor branch since the branch point. 
- if self.get_ancestor_lsn() == self.get_last_record_lsn() && self.ancestor_timeline.is_some() + let size = current_size.size(); + if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) = + (current_size, self.current_logical_size.initial_part_end) { - let ancestor = self.get_ancestor_timeline()?; - let ancestor_logical_size = ancestor.get_current_logical_size(); - // Check LSN after getting logical size to exclude race condition - // when ancestor timeline is concurrently updated. - // - // Logical size 0 means that it was not initialized, so don't believe that. - if ancestor_logical_size != 0 && ancestor.get_last_record_lsn() == self.ancestor_lsn { - self.set_current_logical_size(ancestor_logical_size); - debug!( - "logical size copied from ancestor: {}", - ancestor_logical_size - ); - return Ok(()); - } + self.try_spawn_size_init_task(init_lsn); } - let timer = self.metrics.init_logical_size_histo.start_timer(); - - // Have to calculate it the hard way - let last_lsn = self.get_last_record_lsn(); - let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?; - self.set_current_logical_size(logical_size); - debug!("calculated logical size the hard way: {}", logical_size); - - timer.stop_and_record(); - Ok(()) + Ok(size) } - /// Retrieve current logical size of the timeline - /// - /// NOTE: counted incrementally, includes ancestors. - pub fn get_current_logical_size(&self) -> u64 { - let current_logical_size = self.current_logical_size.load(AtomicOrdering::Acquire); - match u64::try_from(current_logical_size) { - Ok(sz) => sz, + fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { + let timeline_id = self.timeline_id; + + let mut task_guard = match self.initial_size_computation_task.try_lock() { + Ok(guard) => guard, Err(_) => { - error!( - "current_logical_size is out of range: {}", - current_logical_size - ); - 0 + debug!("Skipping timeline logical size init: task lock is taken already"); + return; + } + }; + + if let Some((old_task, task_finish_signal)) = task_guard.take() { + // TODO rust 1.61 would allow to remove `task_finish_signal` entirely and call `old_task.is_finished()` instead + match task_finish_signal.try_recv() { + // task has either signaled successfully that it finished or panicked and dropped the sender part without signalling + Ok(()) | Err(mpsc::TryRecvError::Disconnected) => { + match old_task.join() { + // we're here due to OnceCell::get not returning the value + Ok(Ok(())) => { + error!("Timeline {timeline_id} size init task finished, yet the size was not updated, rescheduling the computation") + } + Ok(Err(task_error)) => { + error!("Error during timeline {timeline_id} size init: {task_error:?}") + } + Err(e) => error!("Timeline {timeline_id} size init task panicked: {e:?}"), + } + } + // task had not yet finished: no signal was sent and the sender channel is not dropped + Err(mpsc::TryRecvError::Empty) => { + // let the task finish + *task_guard = Some((old_task, task_finish_signal)); + return; + } } } + + if task_guard.is_none() { + let thread_timeline = Arc::clone(self); + let (finish_sender, finish_receiver) = mpsc::channel(); + + match thread::Builder::new() + .name(format!( + "Timeline {timeline_id} initial logical size calculation" + )) + .spawn(move || { + let _enter = info_span!("initial_logical_size_calculation", timeline = %timeline_id).entered(); + let calculated_size = thread_timeline.calculate_logical_size(init_lsn)?; + match thread_timeline.current_logical_size.initial_logical_size.set(calculated_size) { + Ok(()) => 
info!("Successfully calculated initial logical size"), + Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"), + } + + finish_sender.send(()).ok(); + Ok(()) + }) { + Ok(guard) => *task_guard = Some((guard, finish_receiver)), + Err(e) => error!("Failed to spawn timeline {timeline_id} size init task: {e}"), + } + } + } + + /// Calculate the logical size of the database at the latest LSN. + /// + /// NOTE: counted incrementally, includes ancestors, this can be a slow operation. + fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result { + info!("Calculating logical size for timeline {}", self.timeline_id); + let timer = self.metrics.init_logical_size_histo.start_timer(); + let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?; + debug!("calculated logical size: {logical_size}"); + timer.stop_and_record(); + Ok(logical_size) } /// Update current logical size, adding `delta' to the old value. fn update_current_logical_size(&self, delta: i64) { - let new_size = self - .current_logical_size - .fetch_add(delta, AtomicOrdering::SeqCst); + let logical_size = &self.current_logical_size; + logical_size.increment_size(delta); // Also set the value in the prometheus gauge. Note that // there is a race condition here: if this is is called by two // threads concurrently, the prometheus gauge might be set to // one value while current_logical_size is set to the - // other. Currently, only initialization and the WAL receiver - // updates the logical size, and they don't run concurrently, - // so it cannot happen. And even if it did, it wouldn't be - // very serious, the metrics would just be slightly off until - // the next update. - self.metrics.current_logical_size_gauge.set(new_size); - } - - /// Set current logical size. - fn set_current_logical_size(&self, new_size: u64) { - self.current_logical_size - .store(new_size as i64, AtomicOrdering::SeqCst); - self.logical_size_initialized - .store(true, AtomicOrdering::SeqCst); - - // Also set the value in the prometheus gauge. Same race condition - // here as in `update_current_logical_size`. - self.metrics.current_logical_size_gauge.set(new_size as i64); + // other. 
+ match logical_size.current_size() { + Ok(new_current_size) => self + .metrics + .current_logical_size_gauge + .set(new_current_size.size()), + Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"), + } } /// @@ -1446,7 +1571,15 @@ impl Timeline { Ok(new_delta_path) } - pub fn compact(&self) -> Result<()> { + pub fn compact(&self) -> anyhow::Result<()> { + let last_record_lsn = self.get_last_record_lsn(); + + // Last record Lsn could be zero in case the timelie was just created + if !last_record_lsn.is_valid() { + warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); + return Ok(()); + } + // // High level strategy for compaction / image creation: // diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0f0bb1ed53..24002a36e5 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -936,7 +936,7 @@ impl<'a> DatadirModification<'a> { result?; if pending_nblocks != 0 { - writer.update_current_logical_size(pending_nblocks * BLCKSZ as i64); + writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); self.pending_nblocks = 0; } @@ -948,7 +948,7 @@ impl<'a> DatadirModification<'a> { /// underlying timeline. /// All the modifications in this atomic update are stamped by the specified LSN. /// - pub fn commit(&mut self) -> Result<()> { + pub fn commit(&mut self) -> anyhow::Result<()> { let writer = self.tline.writer(); let lsn = self.lsn; let pending_nblocks = self.pending_nblocks; @@ -964,7 +964,7 @@ impl<'a> DatadirModification<'a> { writer.finish_write(lsn); if pending_nblocks != 0 { - writer.update_current_logical_size(pending_nblocks * BLCKSZ as i64); + writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); } Ok(()) diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 4a907ac0e1..fec8a80b9b 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -4,7 +4,6 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; use crate::layered_repository::{load_metadata, Repository, Timeline}; -use crate::repository::RepositoryTimeline; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; @@ -378,15 +377,7 @@ pub fn get_local_timeline_with_load( tenant_id: ZTenantId, timeline_id: ZTimelineId, ) -> anyhow::Result> { - let repository = get_repository_for_tenant(tenant_id)?; - match repository.get_timeline(timeline_id) { - Some(RepositoryTimeline::Loaded(loaded_timeline)) => { - loaded_timeline.init_logical_size()?; - Ok(loaded_timeline) - } - _ => load_local_timeline(&repository, timeline_id) - .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}")), - } + get_repository_for_tenant(tenant_id)?.get_timeline_load(timeline_id) } pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { @@ -470,17 +461,6 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any Ok(()) } -fn load_local_timeline( - repo: &Repository, - timeline_id: ZTimelineId, -) -> anyhow::Result> { - let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| { - format!("Inmem timeline {timeline_id} not found in tenant's repository") - })?; - inmem_timeline.init_logical_size()?; - Ok(inmem_timeline) -} - /// /// Get list of tenants, for the 
mgmt API /// @@ -489,9 +469,11 @@ pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec { .iter() .map(|(id, tenant)| { let has_in_progress_downloads = remote_index - .tenant_entry(id) - .map(|entry| entry.has_in_progress_downloads()); + .tenant_entry(id) + .map(|entry| entry.has_in_progress_downloads()); + // TODO this is not correct when we might have remote storage sync disabled: + // we keep `RemoteTimelineIndex` in memory anyway for simplicity and this error message is printed still if has_in_progress_downloads.is_none() { error!("timeline is not found in remote index while it is present in the tenants registry") } @@ -581,7 +563,7 @@ fn attach_downloaded_tenant( // and then load its layers in memory for timeline_id in downloaded_timelines { - let _ = load_local_timeline(repo, timeline_id).with_context(|| { + repo.get_timeline_load(timeline_id).with_context(|| { format!( "Failed to register add local timeline for tenant {}", repo.tenant_id(), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index f816198eda..2c29a56ad2 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -315,18 +315,20 @@ pub async fn handle_walreceiver_connection( // Send zenith feedback message. // Regular standby_status_update fields are put into this message. - let zenith_status_update = ReplicationFeedback { - current_timeline_size: timeline.get_current_logical_size() as u64, + let status_update = ReplicationFeedback { + current_timeline_size: timeline + .get_current_logical_size() + .context("Status update creation failed to get current logical size")?, ps_writelsn: write_lsn, ps_flushlsn: flush_lsn, ps_applylsn: apply_lsn, ps_replytime: ts, }; - debug!("zenith_status_update {zenith_status_update:?}"); + debug!("zenith_status_update {status_update:?}"); let mut data = BytesMut::new(); - zenith_status_update.serialize(&mut data)?; + status_update.serialize(&mut data)?; physical_stream .as_mut() .zenith_status_update(data.len() as u64, &data) diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index bf44dfd949..31b54f827b 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -67,11 +67,21 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 # But all others are broken - for n in range(1, 4): - (tenant, timeline, pg) = tenant_timelines[n] - with pytest.raises(Exception, match="Cannot load local timeline") as err: + + # First timeline would fail instantly due to corrupt metadata file + (_tenant, _timeline, pg) = tenant_timelines[1] + with pytest.raises(Exception, match="Cannot load local timeline") as err: + pg.start() + log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") + + # Yet other timelines will fail when their layers will be queried during basebackup: we don't check layer file contents on startup, when loading the timeline + for n in range(2, 4): + (_tenant, _timeline, pg) = tenant_timelines[n] + with pytest.raises(Exception, match="extracting base backup failed") as err: pg.start() - log.info(f"compute startup failed as expected: {err}") + log.info( + f"compute startup failed lazily for timeline with corrupt layers, during basebackup preparation: {err}" + ) def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): diff --git 
a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index aba8567541..6fbc430e80 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -10,6 +10,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserverHttpClient, Postgres, assert_timeline_local, wait_for_last_flush_lsn, @@ -23,11 +24,7 @@ def test_timeline_size(neon_simple_env: NeonEnv): new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") client = env.pageserver.http_client() - timeline_details = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - assert ( - timeline_details["local"]["current_logical_size"] - == timeline_details["local"]["current_logical_size_non_incremental"] - ) + wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) pgmain = env.postgres.create_start("test_timeline_size") log.info("postgres is running on 'test_timeline_size' branch") @@ -61,17 +58,14 @@ def test_timeline_size(neon_simple_env: NeonEnv): def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty") client = env.pageserver.http_client() + wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) timeline_details = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - assert ( - timeline_details["local"]["current_logical_size"] - == timeline_details["local"]["current_logical_size_non_incremental"] - ) - pgmain = env.postgres.create_start("test_timeline_size") - log.info("postgres is running on 'test_timeline_size' branch") + pgmain = env.postgres.create_start("test_timeline_size_createdropdb") + log.info("postgres is running on 'test_timeline_size_createdropdb' branch") with closing(pgmain.connect()) as conn: with conn.cursor() as cur: @@ -81,6 +75,10 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): local_details["current_logical_size"] == local_details["current_logical_size_non_incremental"] ) + assert ( + timeline_details["local"]["current_logical_size_non_incremental"] + == local_details["current_logical_size_non_incremental"] + ), "no writes should not change the incremental logical size" cur.execute("CREATE DATABASE foodb") with closing(pgmain.connect(dbname="foodb")) as conn: @@ -140,13 +138,10 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() + client = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota") - client = env.pageserver.http_client() - res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - assert ( - res["local"]["current_logical_size"] == res["local"]["current_logical_size_non_incremental"] - ) + wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) pgmain = env.postgres.create_start( "test_timeline_size_quota", @@ -211,6 +206,12 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): pg_cluster_size = cur.fetchone() log.info(f"pg_cluster_size = {pg_cluster_size}") + new_res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) + assert ( + new_res["local"]["current_logical_size"] + == 
new_res["local"]["current_logical_size_non_incremental"] + ), "after the WAL is streamed, current_logical_size is expected to be calculated and to be equal its non-incremental value" + def test_timeline_physical_size_init(neon_simple_env: NeonEnv): env = neon_simple_env @@ -425,3 +426,22 @@ def assert_physical_size(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimel == res["local"]["current_physical_size_non_incremental"] ) assert res["local"]["current_physical_size"] == get_timeline_dir_size(timeline_path) + + +# Timeline logical size initialization is an asynchronous background task that runs once, +# try a few times to ensure it's activated properly +def wait_for_timeline_size_init( + client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId +): + for i in range(10): + timeline_details = assert_timeline_local(client, tenant, timeline) + if ( + timeline_details["local"]["current_logical_size"] + == timeline_details["local"]["current_logical_size_non_incremental"] + ): + return + log.info(f"waiting for current_logical_size of a timeline to be calculated, iteration {i}") + time.sleep(1) + raise Exception( + f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}" + ) From 2db20e55871fdbbe38c2ae7a28b0692a67be4838 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 1 Sep 2022 16:22:22 +0300 Subject: [PATCH 0718/1022] Remove [Un]Loaded timeline code (#2359) --- pageserver/src/http/models.rs | 2 - pageserver/src/http/routes.rs | 59 +-- pageserver/src/layered_repository.rs | 392 +++++++++--------- pageserver/src/layered_repository/timeline.rs | 111 +---- pageserver/src/page_service.rs | 69 +-- pageserver/src/repository.rs | 26 -- pageserver/src/storage_sync.rs | 17 +- pageserver/src/tenant_mgr.rs | 60 +-- pageserver/src/timelines.rs | 17 +- .../src/walreceiver/walreceiver_connection.rs | 2 +- test_runner/regress/test_broken_timeline.py | 6 +- test_runner/regress/test_pageserver_api.py | 5 +- 12 files changed, 290 insertions(+), 476 deletions(-) diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 654f45a95d..7c7d7f7b0c 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -8,7 +8,6 @@ use utils::{ }; // These enums are used in the API response fields. 
-use crate::repository::LocalTimelineState; use crate::tenant_mgr::TenantState; #[serde_as] @@ -133,7 +132,6 @@ pub struct LocalTimelineInfo { pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, pub current_physical_size_non_incremental: Option, - pub timeline_state: LocalTimelineState, pub wal_source_connstr: Option, #[serde_as(as = "Option")] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 710014de98..f1033eeb2a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -11,8 +11,7 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, }; -use crate::layered_repository::{metadata::TimelineMetadata, Timeline}; -use crate::repository::{LocalTimelineState, RepositoryTimeline}; +use crate::layered_repository::Timeline; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant_config::TenantConfOpt; @@ -74,7 +73,7 @@ fn get_config(request: &Request) -> &'static PageServerConf { // Helper functions to construct a LocalTimelineInfo struct for a timeline -fn local_timeline_info_from_loaded_timeline( +fn local_timeline_info_from_timeline( timeline: &Arc, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, @@ -105,7 +104,6 @@ fn local_timeline_info_from_loaded_timeline( last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), - timeline_state: LocalTimelineState::Loaded, current_logical_size: Some( timeline .get_current_logical_size() @@ -129,61 +127,20 @@ fn local_timeline_info_from_loaded_timeline( Ok(info) } -fn local_timeline_info_from_unloaded_timeline(metadata: &TimelineMetadata) -> LocalTimelineInfo { - LocalTimelineInfo { - ancestor_timeline_id: metadata.ancestor_timeline(), - ancestor_lsn: { - match metadata.ancestor_lsn() { - Lsn(0) => None, - lsn @ Lsn(_) => Some(lsn), - } - }, - disk_consistent_lsn: metadata.disk_consistent_lsn(), - last_record_lsn: metadata.disk_consistent_lsn(), - prev_record_lsn: metadata.prev_record_lsn(), - latest_gc_cutoff_lsn: metadata.latest_gc_cutoff_lsn(), - timeline_state: LocalTimelineState::Unloaded, - current_logical_size: None, - current_physical_size: None, - current_logical_size_non_incremental: None, - current_physical_size_non_incremental: None, - wal_source_connstr: None, - last_received_msg_lsn: None, - last_received_msg_ts: None, - } -} - -fn local_timeline_info_from_repo_timeline( - repo_timeline: &RepositoryTimeline, - include_non_incremental_logical_size: bool, - include_non_incremental_physical_size: bool, -) -> anyhow::Result { - match repo_timeline { - RepositoryTimeline::Loaded(timeline) => local_timeline_info_from_loaded_timeline( - timeline, - include_non_incremental_logical_size, - include_non_incremental_physical_size, - ), - RepositoryTimeline::Unloaded { metadata } => { - Ok(local_timeline_info_from_unloaded_timeline(metadata)) - } - } -} - fn list_local_timelines( tenant_id: ZTenantId, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> Result> { let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?; + .with_context(|| format!("Failed to get repo for tenant {tenant_id}"))?; let repo_timelines = repo.list_timelines(); let mut local_timeline_info = 
Vec::with_capacity(repo_timelines.len()); for (timeline_id, repository_timeline) in repo_timelines { local_timeline_info.push(( timeline_id, - local_timeline_info_from_repo_timeline( + local_timeline_info_from_timeline( &repository_timeline, include_non_incremental_logical_size, include_non_incremental_physical_size, @@ -214,12 +171,12 @@ async fn timeline_create_handler(mut request: Request) -> Result { + Ok(Some(new_timeline)) => { // Created. Construct a TimelineInfo for it. - let local_info = local_timeline_info_from_loaded_timeline(&new_timeline, false, false)?; + let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)?; Ok(Some(TimelineInfo { tenant_id, - timeline_id: new_timeline_id, + timeline_id: new_timeline.timeline_id, local: Some(local_info), remote: None, })) @@ -311,7 +268,7 @@ async fn timeline_detail_handler(request: Request) -> Result>, tenant_id: ZTenantId, - timelines: Mutex>, + timelines: Mutex>>, // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding // `timelines` mutex during all GC iteration (especially with enforced checkpoint) @@ -126,37 +128,18 @@ pub struct Repository { impl Repository { /// Get Timeline handle for given zenith timeline ID. /// This function is idempotent. It doesn't change internal state in any way. - pub fn get_timeline(&self, timelineid: ZTimelineId) -> Option> { - self.timelines - .lock() - .unwrap() - .get(&timelineid) - .cloned() - .map(RepositoryTimeline::from) - } - - /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded. - pub fn get_timeline_load(&self, timeline_id: ZTimelineId) -> Result> { - let mut timelines = self.timelines.lock().unwrap(); - match self.get_timeline_load_internal(timeline_id, &mut timelines)? { - Some(local_loaded_timeline) => Ok(local_loaded_timeline), - None => anyhow::bail!("cannot get local timeline, unknown timeline id: {timeline_id}"), - } + pub fn get_timeline(&self, timeline_id: ZTimelineId) -> Option> { + self.timelines.lock().unwrap().get(&timeline_id).cloned() } /// Lists timelines the repository contains. /// Up to repository's implementation to omit certain timelines that ar not considered ready for use. - pub fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)> { + pub fn list_timelines(&self) -> Vec<(ZTimelineId, Arc)> { self.timelines .lock() .unwrap() .iter() - .map(|(timeline_id, timeline_entry)| { - ( - *timeline_id, - RepositoryTimeline::from(timeline_entry.clone()), - ) - }) + .map(|(timeline_id, timeline_entry)| (*timeline_id, Arc::clone(timeline_entry))) .collect() } @@ -164,16 +147,18 @@ impl Repository { /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. 
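The creation path that follows holds the `timelines` mutex from the existence check through metadata creation and the final insert, so two concurrent callers cannot both create the same timeline (the in-code XXX comment spells this out). A simplified, hypothetical sketch of that create-if-absent pattern, using placeholder types rather than the real Repository/Timeline API:

```rust
// One lock guards both the existence check and the insert, so concurrent
// creation of the same id cannot race past the check.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

struct Registry {
    timelines: Mutex<HashMap<u32, Arc<String>>>,
}

impl Registry {
    fn create_empty(&self, id: u32) -> anyhow::Result<Arc<String>> {
        let mut map = self.timelines.lock().unwrap();
        anyhow::ensure!(!map.contains_key(&id), "Timeline {id} already exists");
        // ... create the timeline directory and metadata while still holding the lock ...
        let timeline = Arc::new(format!("timeline-{id}"));
        map.insert(id, Arc::clone(&timeline));
        Ok(timeline)
    }
}

fn main() -> anyhow::Result<()> {
    let registry = Registry { timelines: Mutex::new(HashMap::new()) };
    registry.create_empty(1)?;
    assert!(registry.create_empty(1).is_err()); // second creation is rejected
    Ok(())
}
```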
pub fn create_empty_timeline( &self, - timeline_id: ZTimelineId, + new_timeline_id: ZTimelineId, initdb_lsn: Lsn, ) -> Result> { + // XXX: keep the lock to avoid races during timeline creation let mut timelines = self.timelines.lock().unwrap(); - let vacant_timeline_entry = match timelines.entry(timeline_id) { - Entry::Occupied(_) => bail!("Timeline already exists"), - Entry::Vacant(vacant_entry) => vacant_entry, - }; - let timeline_path = self.conf.timeline_path(&timeline_id, &self.tenant_id); + anyhow::ensure!( + timelines.get(&new_timeline_id).is_none(), + "Timeline {new_timeline_id} already exists" + ); + + let timeline_path = self.conf.timeline_path(&new_timeline_id, &self.tenant_id); if timeline_path.exists() { bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.") } @@ -181,31 +166,25 @@ impl Repository { // Create the timeline directory, and write initial metadata to file. crashsafe_dir::create_dir_all(timeline_path)?; - let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; - - let timeline = Timeline::new( + let new_metadata = + TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); + save_metadata( self.conf, - Arc::clone(&self.tenant_conf), - metadata, - None, - timeline_id, + new_timeline_id, self.tenant_id, - Arc::clone(&self.walredo_mgr), - self.upload_layers, - ); - timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); + &new_metadata, + true, + )?; - // Insert if not exists - let timeline = Arc::new(timeline); - vacant_timeline_entry.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline))); + let new_timeline = + self.initialize_new_timeline(new_timeline_id, new_metadata, &mut timelines)?; + new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); - crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { - id: ZTenantTimelineId::new(self.tenant_id(), timeline_id), - timeline: Arc::clone(&timeline), - }); + if let hash_map::Entry::Vacant(v) = timelines.entry(new_timeline_id) { + v.insert(Arc::clone(&new_timeline)); + } - Ok(timeline) + Ok(new_timeline) } /// Branch a timeline @@ -214,7 +193,7 @@ impl Repository { src: ZTimelineId, dst: ZTimelineId, start_lsn: Option, - ) -> Result<()> { + ) -> Result> { // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn // about timelines, so otherwise a race condition is possible, where we create new timeline and GC // concurrently removes data that is needed by the new timeline. @@ -229,12 +208,12 @@ impl Repository { // Step 2 is to avoid initializing the new branch using data removed by past GC iterations // or in-queue GC iterations. + // XXX: keep the lock to avoid races during timeline creation let mut timelines = self.timelines.lock().unwrap(); - let src_timeline = self - .get_timeline_load_internal(src, &mut timelines) + let src_timeline = timelines + .get(&src) // message about timeline being remote is one .context up in the stack - .context("failed to load timeline for branching")? 
- .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {}", &src))?; + .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?; let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); @@ -252,7 +231,7 @@ impl Repository { .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) .context(format!( "invalid branch start lsn: less than latest GC cutoff {}", - *latest_gc_cutoff_lsn + *latest_gc_cutoff_lsn, ))?; { let gc_info = src_timeline.gc_info.read().unwrap(); @@ -293,11 +272,13 @@ impl Repository { ); crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; - timelines.insert(dst, LayeredTimelineEntry::Unloaded { id: dst, metadata }); - info!("branched timeline {} from {} at {}", dst, src, start_lsn); + let new_timeline = self.initialize_new_timeline(dst, metadata, &mut timelines)?; + timelines.insert(dst, Arc::clone(&new_timeline)); - Ok(()) + info!("branched timeline {dst} from {src} at {start_lsn}"); + + Ok(new_timeline) } /// perform one garbage collection iteration, removing old data files from disk. @@ -346,14 +327,7 @@ impl Repository { for (timelineid, timeline) in &timelines_to_compact { let _entered = info_span!("compact", timeline = %timelineid, tenant = %self.tenant_id).entered(); - match timeline { - LayeredTimelineEntry::Loaded(timeline) => { - timeline.compact()?; - } - LayeredTimelineEntry::Unloaded { .. } => { - debug!("Cannot compact remote timeline {}", timelineid) - } - } + timeline.compact()?; } Ok(()) @@ -371,15 +345,7 @@ impl Repository { let timelines = self.timelines.lock().unwrap(); let timelines_to_compact = timelines .iter() - // filter to get only loaded timelines - .filter_map(|(timelineid, entry)| match entry { - LayeredTimelineEntry::Loaded(timeline) => Some((timelineid, timeline)), - LayeredTimelineEntry::Unloaded { .. } => { - debug!("Skipping checkpoint for unloaded timeline {}", timelineid); - None - } - }) - .map(|(timelineid, timeline)| (*timelineid, timeline.clone())) + .map(|(timelineid, timeline)| (*timelineid, Arc::clone(timeline))) .collect::>(); drop(timelines); @@ -403,7 +369,7 @@ impl Repository { // because detach removes files, which will break child branches let children_exist = timelines .iter() - .any(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id)); + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); ensure!( !children_exist, @@ -431,19 +397,36 @@ impl Repository { Ok(()) } - /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization. - /// See [`crate::remote_storage`] for more details about the synchronization. - pub fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> { - debug!("attach timeline_id: {}", timeline_id,); - match self.timelines.lock().unwrap().entry(timeline_id) { - Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. 
This is a bug."), - Entry::Vacant(entry) => { - // we need to get metadata of a timeline, another option is to pass it along with Downloaded status - let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; - // finally we make newly downloaded timeline visible to repository - entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata }) - }, + pub fn init_attach_timelines( + &self, + timelines: Vec<(ZTimelineId, TimelineMetadata)>, + ) -> anyhow::Result<()> { + let sorted_timelines = if timelines.len() == 1 { + timelines + } else if !timelines.is_empty() { + tree_sort_timelines(timelines)? + } else { + warn!("No timelines to attach received"); + return Ok(()); }; + + let mut timelines_accessor = self.timelines.lock().unwrap(); + for (timeline_id, metadata) in sorted_timelines { + let timeline = self + .initialize_new_timeline(timeline_id, metadata, &mut timelines_accessor) + .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; + + match timelines_accessor.entry(timeline.timeline_id) { + hash_map::Entry::Occupied(_) => anyhow::bail!( + "Found freshly initialized timeline {} in the tenant map", + timeline.timeline_id + ), + hash_map::Entry::Vacant(v) => { + v.insert(timeline); + } + } + } + Ok(()) } @@ -453,6 +436,49 @@ impl Repository { } } +/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), +/// perform a topological sort, so that the parent of each timeline comes +/// before the children. +fn tree_sort_timelines( + timelines: Vec<(ZTimelineId, TimelineMetadata)>, +) -> Result> { + let mut result = Vec::with_capacity(timelines.len()); + + let mut now = Vec::with_capacity(timelines.len()); + // (ancestor, children) + let mut later: HashMap> = + HashMap::with_capacity(timelines.len()); + + for (timeline_id, metadata) in timelines { + if let Some(ancestor_id) = metadata.ancestor_timeline() { + let children = later.entry(ancestor_id).or_default(); + children.push((timeline_id, metadata)); + } else { + now.push((timeline_id, metadata)); + } + } + + while let Some((timeline_id, metadata)) = now.pop() { + result.push((timeline_id, metadata)); + // All children of this can be loaded now + if let Some(mut children) = later.remove(&timeline_id) { + now.append(&mut children); + } + } + + // All timelines should be visited now. Unless there were timelines with missing ancestors. + if !later.is_empty() { + for (missing_id, orphan_ids) in later { + for (orphan_id, _) in orphan_ids { + error!("could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded"); + } + } + bail!("could not load tenant because some timelines are missing ancestors"); + } + + Ok(result) +} + /// Private functions impl Repository { pub fn get_checkpoint_distance(&self) -> u64 { @@ -548,87 +574,49 @@ impl Repository { Ok(()) } - // Implementation of the public `get_timeline_load` function. - // Differences from the public: - // * interface in that the caller must already hold the mutex on the 'timelines' hashmap. - fn get_timeline_load_internal( + fn initialize_new_timeline( &self, - timeline_id: ZTimelineId, - timelines: &mut HashMap, - ) -> anyhow::Result>> { - Ok(match timelines.get(&timeline_id) { - Some(entry) => match entry { - LayeredTimelineEntry::Loaded(local_timeline) => { - debug!("timeline {timeline_id} found loaded into memory"); - Some(Arc::clone(local_timeline)) - } - LayeredTimelineEntry::Unloaded { .. 
} => { - debug!( - "timeline {timeline_id} found on a local disk, but not loaded into the memory, loading" - ); - let timeline = self.load_local_timeline(timeline_id, timelines)?; - let was_loaded = timelines.insert( - timeline_id, - LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), - ); - ensure!( - was_loaded.is_none() - || matches!(was_loaded, Some(LayeredTimelineEntry::Unloaded { .. })), - "assertion failure, inserted wrong timeline in an incorrect state" - ); - Some(timeline) - } - }, - None => { - debug!("timeline {timeline_id} not found"); - None - } - }) - } - - fn load_local_timeline( - &self, - timeline_id: ZTimelineId, - timelines: &mut HashMap, + new_timeline_id: ZTimelineId, + new_metadata: TimelineMetadata, + timelines: &mut MutexGuard>>, ) -> anyhow::Result> { - let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) - .context("failed to load metadata")?; - let disk_consistent_lsn = metadata.disk_consistent_lsn(); + let ancestor = match new_metadata.ancestor_timeline() { + Some(ancestor_timeline_id) => Some( + timelines + .get(&ancestor_timeline_id) + .cloned() + .with_context(|| { + format!( + "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found" + ) + })?, + ), + None => None, + }; - let ancestor = metadata - .ancestor_timeline() - .map(|ancestor_timeline_id| { - trace!("loading {timeline_id}'s ancestor {}", &ancestor_timeline_id); - self.get_timeline_load_internal(ancestor_timeline_id, timelines) - }) - .transpose() - .context("cannot load ancestor timeline")? - .flatten() - .map(LayeredTimelineEntry::Loaded); - let _enter = info_span!("loading local timeline").entered(); + let new_disk_consistent_lsn = new_metadata.disk_consistent_lsn(); - let timeline = Timeline::new( + let new_timeline = Arc::new(Timeline::new( self.conf, Arc::clone(&self.tenant_conf), - metadata, + new_metadata, ancestor, - timeline_id, + new_timeline_id, self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, - ); - timeline - .load_layer_map(disk_consistent_lsn) + )); + + new_timeline + .load_layer_map(new_disk_consistent_lsn) .context("failed to load layermap")?; - let timeline = Arc::new(timeline); - crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { - id: ZTenantTimelineId::new(self.tenant_id(), timeline_id), - timeline: Arc::clone(&timeline), + id: ZTenantTimelineId::new(self.tenant_id(), new_timeline_id), + timeline: Arc::clone(&new_timeline), }); - Ok(timeline) + Ok(new_timeline) } pub fn new( @@ -775,18 +763,20 @@ impl Repository { // This is unresolved question for now, how to do gc in presence of remote timelines // especially when this is combined with branching. 
// Somewhat related: https://github.com/zenithdb/zenith/issues/999 - if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() { + if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { // If target_timeline is specified, we only need to know branchpoints of its children if let Some(timelineid) = target_timeline_id { if ancestor_timeline_id == &timelineid { - all_branchpoints - .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); + all_branchpoints.insert(( + *ancestor_timeline_id, + timeline_entry.get_ancestor_lsn(), + )); } } // Collect branchpoints for all timelines else { all_branchpoints - .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); + .insert((*ancestor_timeline_id, timeline_entry.get_ancestor_lsn())); } } @@ -801,7 +791,9 @@ impl Repository { let mut gc_timelines = Vec::with_capacity(timeline_ids.len()); for timeline_id in timeline_ids { // Timeline is known to be local and loaded. - let timeline = self.get_timeline_load(timeline_id)?; + let timeline = self + .get_timeline(timeline_id) + .with_context(|| format!("Timeline {timeline_id} was not found"))?; // If target_timeline is specified, ignore all other timelines if let Some(target_timelineid) = target_timeline_id { @@ -1031,20 +1023,21 @@ pub mod repo_harness { false, ); // populate repo with locally available timelines + let mut timelines_to_load = Vec::new(); for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) .expect("should be able to read timelines dir") { - let timeline_dir_entry = timeline_dir_entry.unwrap(); + let timeline_dir_entry = timeline_dir_entry?; let timeline_id: ZTimelineId = timeline_dir_entry .path() .file_name() .unwrap() .to_string_lossy() - .parse() - .unwrap(); - - repo.attach_timeline(timeline_id)?; + .parse()?; + let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?; + timelines_to_load.push((timeline_id, timeline_metadata)); } + repo.init_attach_timelines(timelines_to_load)?; Ok(repo) } @@ -1127,7 +1120,10 @@ mod tests { match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) { Ok(_) => panic!("duplicate timeline creation should fail"), - Err(e) => assert_eq!(e.to_string(), "Timeline already exists"), + Err(e) => assert_eq!( + e.to_string(), + format!("Timeline {TIMELINE_ID} already exists") + ), } Ok(()) @@ -1170,7 +1166,7 @@ mod tests { // Branch the history, modify relation differently on the new timeline repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) + .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); let new_writer = newtline.writer(); new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?; @@ -1318,7 +1314,7 @@ mod tests { repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) + .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; @@ -1334,7 +1330,7 @@ mod tests { repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) + .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); make_some_layers(newtline.as_ref(), Lsn(0x60))?; @@ -1363,17 +1359,8 @@ mod tests { } let repo = harness.load(); - let tline = 
repo - .get_timeline(TIMELINE_ID) + repo.get_timeline(TIMELINE_ID) .expect("cannot load timeline"); - assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); - - assert!(repo.get_timeline_load(TIMELINE_ID).is_ok()); - - let tline = repo - .get_timeline(TIMELINE_ID) - .expect("cannot load timeline"); - assert!(matches!(tline, RepositoryTimeline::Loaded(_))); Ok(()) } @@ -1393,7 +1380,7 @@ mod tests { repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) + .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); make_some_layers(newtline.as_ref(), Lsn(0x60))?; @@ -1402,28 +1389,15 @@ mod tests { // check that both of them are initially unloaded let repo = harness.load(); - { - let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); - - let tline = repo - .get_timeline(NEW_TIMELINE_ID) - .expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); - } - // load only child timeline - let _ = repo - .get_timeline_load(NEW_TIMELINE_ID) - .expect("cannot load timeline"); // check that both, child and ancestor are loaded - let tline = repo + let _child_tline = repo .get_timeline(NEW_TIMELINE_ID) - .expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Loaded(_))); + .expect("cannot get child timeline loaded"); - let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Loaded(_))); + let _ancestor_tline = repo + .get_timeline(TIMELINE_ID) + .expect("cannot get ancestor timeline loaded"); Ok(()) } @@ -1447,7 +1421,9 @@ mod tests { std::fs::write(metadata_path, metadata_bytes)?; let err = harness.try_load().err().expect("should fail"); - assert_eq!(err.to_string(), "failed to load local metadata"); + assert!(err + .to_string() + .starts_with("Failed to parse metadata bytes from path")); let mut found_error_message = false; let mut err_source = err.source(); @@ -1663,7 +1639,9 @@ mod tests { for _ in 0..50 { let new_tline_id = ZTimelineId::generate(); repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?; - tline = repo.get_timeline_load(new_tline_id)?; + tline = repo + .get_timeline(new_tline_id) + .expect("Should have the branched timeline"); tline_id = new_tline_id; for _ in 0..NUM_KEYS { @@ -1722,7 +1700,9 @@ mod tests { for idx in 0..NUM_TLINES { let new_tline_id = ZTimelineId::generate(); repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?; - tline = repo.get_timeline_load(new_tline_id)?; + tline = repo + .get_timeline(new_tline_id) + .expect("Should have the branched timeline"); tline_id = new_tline_id; for _ in 0..NUM_KEYS { @@ -1749,11 +1729,11 @@ mod tests { if lsn.0 == 0 { continue; } - println!("chekcking [{}][{}] at {}", idx, blknum, lsn); + println!("checking [{idx}][{blknum}] at {lsn}"); test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, *lsn)?, - TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn)) + TEST_IMG(&format!("{idx} {blknum} at {lsn}")) ); } } diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index fd719812a3..821995fad1 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -50,7 +50,7 @@ use utils::{ zid::{ZTenantId, ZTimelineId}, }; -use crate::repository::{GcResult, RepositoryTimeline}; +use crate::repository::GcResult; use 
crate::repository::{Key, Value}; use crate::thread_mgr; use crate::walreceiver::IS_WAL_RECEIVER; @@ -164,72 +164,6 @@ static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -#[derive(Clone)] -pub enum LayeredTimelineEntry { - Loaded(Arc), - Unloaded { - id: ZTimelineId, - metadata: TimelineMetadata, - }, -} - -impl LayeredTimelineEntry { - fn timeline_id(&self) -> ZTimelineId { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline.timeline_id, - LayeredTimelineEntry::Unloaded { id, .. } => *id, - } - } - - pub fn ancestor_timeline_id(&self) -> Option { - match self { - LayeredTimelineEntry::Loaded(timeline) => { - timeline.ancestor_timeline.as_ref().map(|t| t.timeline_id()) - } - LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_timeline(), - } - } - - pub fn ancestor_lsn(&self) -> Lsn { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline.ancestor_lsn, - LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_lsn(), - } - } - - fn ensure_loaded(&self) -> anyhow::Result<&Arc> { - match self { - LayeredTimelineEntry::Loaded(timeline) => Ok(timeline), - LayeredTimelineEntry::Unloaded { .. } => { - anyhow::bail!("timeline is unloaded") - } - } - } - - pub fn layer_removal_guard(&self) -> Result>, anyhow::Error> { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline - .layer_removal_cs - .try_lock() - .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) - .map(Some), - - LayeredTimelineEntry::Unloaded { .. } => Ok(None), - } - } -} - -impl From for RepositoryTimeline { - fn from(entry: LayeredTimelineEntry) -> Self { - match entry { - LayeredTimelineEntry::Loaded(timeline) => RepositoryTimeline::Loaded(timeline as _), - LayeredTimelineEntry::Unloaded { metadata, .. } => { - RepositoryTimeline::Unloaded { metadata } - } - } - } -} - struct TimelineMetrics { pub reconstruct_time_histo: Histogram, pub materialized_page_cache_hit_counter: GenericCounter, @@ -342,7 +276,7 @@ pub struct Timeline { // Parent timeline that this timeline was branched from, and the LSN // of the branch point. - ancestor_timeline: Option, + ancestor_timeline: Option>, ancestor_lsn: Lsn, // Metrics @@ -566,7 +500,7 @@ impl Timeline { pub fn get_ancestor_timeline_id(&self) -> Option { self.ancestor_timeline .as_ref() - .map(LayeredTimelineEntry::timeline_id) + .map(|ancestor| ancestor.timeline_id) } /// Lock and get timeline's GC cuttof @@ -781,7 +715,7 @@ impl Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, metadata: TimelineMetadata, - ancestor: Option, + ancestor: Option>, timeline_id: ZTimelineId, tenant_id: ZTenantId, walredo_mgr: Arc, @@ -938,6 +872,12 @@ impl Timeline { Ok(()) } + pub fn layer_removal_guard(&self) -> Result, anyhow::Error> { + self.layer_removal_cs + .try_lock() + .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) + } + /// Retrieve current logical size of the timeline. /// /// The size could be lagging behind the actual number, in case @@ -1204,24 +1144,13 @@ impl Timeline { } fn get_ancestor_timeline(&self) -> Result> { - let ancestor = self - .ancestor_timeline - .as_ref() - .with_context(|| { - format!( - "Ancestor is missing. Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })? - .ensure_loaded() - .with_context(|| { - format!( - "Ancestor timeline is not loaded. 
Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })?; + let ancestor = self.ancestor_timeline.as_ref().with_context(|| { + format!( + "Ancestor is missing. Timeline id: {} Ancestor id {:?}", + self.timeline_id, + self.get_ancestor_timeline_id(), + ) + })?; Ok(Arc::clone(ancestor)) } @@ -1251,7 +1180,9 @@ impl Timeline { layer = Arc::clone(open_layer); } else { // No writeable layer yet. Create one. - let start_lsn = layers.next_open_layer_at.unwrap(); + let start_lsn = layers + .next_open_layer_at + .context("No next open layer found")?; trace!( "creating layer for write at {}/{} for record at {}", @@ -1496,7 +1427,7 @@ impl Timeline { let ancestor_timelineid = self .ancestor_timeline .as_ref() - .map(LayeredTimelineEntry::timeline_id); + .map(|ancestor| ancestor.timeline_id); let metadata = TimelineMetadata::new( disk_consistent_lsn, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d59a82d488..7f7fa3c22b 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -457,18 +457,18 @@ impl PageServerHandler { fn handle_pagerequests( &self, pgb: &mut PostgresBackend, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: ZTimelineId, + tenant_id: ZTenantId, ) -> anyhow::Result<()> { - let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered(); + let _enter = + info_span!("pagestream", timeline = %timeline_id, tenant = %tenant_id).entered(); // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association - thread_mgr::associate_with(Some(tenantid), Some(timelineid)); + thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Check that the timeline exists - let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Cannot load local timeline")?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; /* switch client to COPYBOTH */ pgb.write_message(&BeMessage::CopyBothResponse)?; @@ -488,8 +488,8 @@ impl PageServerHandler { }; let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; - let tenant_id = tenantid.to_string(); - let timeline_id = timelineid.to_string(); + let tenant_id = tenant_id.to_string(); + let timeline_id = timeline_id.to_string(); let response = match zenith_fe_msg { PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME @@ -599,7 +599,9 @@ impl PageServerHandler { info_span!("import wal", timeline = %timeline_id, tenant = %tenant_id).entered(); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - let timeline = repo.get_timeline_load(timeline_id)?; + let timeline = repo + .get_timeline(timeline_id) + .with_context(|| format!("Timeline {timeline_id} was not found"))?; ensure!(timeline.get_last_record_lsn() == start_lsn); // TODO leave clean state on error. 
For now you can use detach to clean @@ -762,19 +764,18 @@ impl PageServerHandler { fn handle_basebackup_request( &self, pgb: &mut PostgresBackend, - timelineid: ZTimelineId, + timeline_id: ZTimelineId, lsn: Option, prev_lsn: Option, - tenantid: ZTenantId, + tenant_id: ZTenantId, full_backup: bool, ) -> anyhow::Result<()> { - let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty); + let span = info_span!("basebackup", timeline = %timeline_id, tenant = %tenant_id, lsn = field::Empty); let _enter = span.enter(); info!("starting"); // check that the timeline exists - let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Cannot load local timeline")?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { timeline @@ -906,12 +907,11 @@ impl postgres_backend::Handler for PageServerHandler { "invalid param number for get_last_record_rlsn command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; - self.check_permission(Some(tenantid))?; - let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Cannot load local timeline")?; + self.check_permission(Some(tenant_id))?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; let end_of_timeline = timeline.get_last_record_rlsn(); @@ -1134,10 +1134,9 @@ impl postgres_backend::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("Invalid compact: '{}'", query_string))?; - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Couldn't load timeline")?; + let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; timeline.compact()?; pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? @@ -1152,11 +1151,9 @@ impl postgres_backend::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid checkpoint command: '{}'", query_string))?; - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - - let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Cannot load local timeline")?; + let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). 
timeline.checkpoint(CheckpointConfig::Forced)?; @@ -1172,10 +1169,9 @@ impl postgres_backend::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?; - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Cannot load local timeline")?; + let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; let timestamp_pg = to_pg_timestamp(timestamp); @@ -1201,6 +1197,15 @@ impl postgres_backend::Handler for PageServerHandler { } } +fn get_local_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Result> { + tenant_mgr::get_repository_for_tenant(tenant_id) + .and_then(|repo| { + repo.get_timeline(timeline_id) + .context("No timeline in tenant's repository") + }) + .with_context(|| format!("Could not get timeline {timeline_id} in tenant {tenant_id}")) +} + /// /// A std::io::Write implementation that wraps all data written to it in CopyData /// messages. diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index e46a39436d..c3b08c93de 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,4 +1,3 @@ -use crate::layered_repository::metadata::TimelineMetadata; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; @@ -6,7 +5,6 @@ use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::fmt; use std::ops::{AddAssign, Range}; -use std::sync::Arc; use std::time::Duration; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] @@ -175,30 +173,6 @@ impl Value { } } -/// A timeline, that belongs to the current repository. -pub enum RepositoryTimeline { - /// Timeline, with its files present locally in pageserver's working directory. - /// Loaded into pageserver's memory and ready to be used. - Loaded(Arc), - - /// All the data is available locally, but not loaded into memory, so loading have to be done before actually using the timeline - Unloaded { - // It is ok to keep metadata here, because it is not changed when timeline is unloaded. - // FIXME can s3 sync actually change it? It can change it when timeline is in awaiting download state. - // but we currently do not download something for the timeline once it is local (even if there are new checkpoints) is it correct? - // also it is not that good to keep TimelineMetadata here, because it is layered repo implementation detail - metadata: TimelineMetadata, - }, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum LocalTimelineState { - // timeline is loaded into memory (with layer map and all the bits), - Loaded, - // timeline is on disk locally and ready to be loaded into memory. 
- Unloaded, -} - /// /// Result of performing GC /// diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index a52cde7286..0bdc30a73f 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -903,8 +903,10 @@ fn storage_sync_loop( "Sync loop step completed, {} new tenant state update(s)", updated_tenants.len() ); - let mut sync_status_updates: HashMap> = - HashMap::new(); + let mut timelines_to_attach: HashMap< + ZTenantId, + Vec<(ZTimelineId, TimelineMetadata)>, + > = HashMap::new(); let index_accessor = runtime.block_on(index.read()); for tenant_id in updated_tenants { let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { @@ -930,13 +932,18 @@ fn storage_sync_loop( // and register them all at once in a repository for download // to be submitted in a single operation to repository // so it can apply them at once to internal timeline map. - sync_status_updates - .insert(tenant_id, tenant_entry.keys().copied().collect()); + timelines_to_attach.insert( + tenant_id, + tenant_entry + .iter() + .map(|(&id, entry)| (id, entry.metadata.clone())) + .collect(), + ); } } drop(index_accessor); // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. - attach_downloaded_tenants(conf, &index, sync_status_updates); + attach_downloaded_tenants(conf, &index, timelines_to_attach); } } ControlFlow::Break(()) => { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index fec8a80b9b..cbf9f2094a 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,6 +3,7 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; +use crate::layered_repository::metadata::TimelineMetadata; use crate::layered_repository::{load_metadata, Repository, Timeline}; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; @@ -14,7 +15,7 @@ use anyhow::Context; use remote_storage::GenericRemoteStorage; use serde::{Deserialize, Serialize}; use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::fmt; use std::sync::Arc; use tokio::sync::mpsc; @@ -192,7 +193,7 @@ impl std::fmt::Debug for LocalTimelineUpdate { pub fn attach_downloaded_tenants( conf: &'static PageServerConf, remote_index: &RemoteIndex, - sync_status_updates: HashMap>, + sync_status_updates: HashMap>, ) { if sync_status_updates.is_empty() { debug!("No sync status updates to apply"); @@ -212,11 +213,9 @@ pub fn attach_downloaded_tenants( continue; } }; - match attach_downloaded_tenant(&repo, downloaded_timelines) { - Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"), - Err(e) => error!( - "Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}" - ), + match repo.init_attach_timelines(downloaded_timelines) { + Ok(()) => info!("successfully loaded local timelines for tenant {tenant_id}"), + Err(e) => error!("Failed to load local timelines for tenant {tenant_id}: {e:?}"), } } } @@ -371,15 +370,6 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result anyhow::Result> { - get_repository_for_tenant(tenant_id)?.get_timeline_load(timeline_id) -} - pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { // Start with the shutdown of timeline tasks (this shuts down the walreceiver) // It is important that we do not take 
locks here, and do not check whether the timeline exists @@ -499,7 +489,7 @@ fn check_broken_timeline( conf: &'static PageServerConf, tenant_id: ZTenantId, timeline_id: ZTimelineId, -) -> anyhow::Result<()> { +) -> anyhow::Result { let metadata = load_metadata(conf, timeline_id, tenant_id).context("failed to load metadata")?; @@ -509,7 +499,7 @@ fn check_broken_timeline( anyhow::bail!("Timeline {timeline_id} has a zero disk consistent LSN."); } - Ok(()) + Ok(metadata) } /// Note: all timelines are attached at once if and only if all of them are locally complete @@ -519,14 +509,14 @@ fn init_local_repository( local_timeline_init_statuses: HashMap, remote_index: &RemoteIndex, ) -> anyhow::Result<(), anyhow::Error> { - let mut timelines_to_attach = HashSet::new(); + let mut timelines_to_attach = Vec::new(); for (timeline_id, init_status) in local_timeline_init_statuses { match init_status { LocalTimelineInitStatus::LocallyComplete => { debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository"); - check_broken_timeline(conf, tenant_id, timeline_id) + let metadata = check_broken_timeline(conf, tenant_id, timeline_id) .context("found broken timeline")?; - timelines_to_attach.insert(timeline_id); + timelines_to_attach.push((timeline_id, metadata)); } LocalTimelineInitStatus::NeedsSync => { debug!( @@ -545,32 +535,8 @@ fn init_local_repository( // Lets fail here loudly to be on the safe side. // XXX: It may be a better api to actually distinguish between repository startup // and processing of newly downloaded timelines. - attach_downloaded_tenant(&repo, timelines_to_attach) - .with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))?; - Ok(()) -} - -fn attach_downloaded_tenant( - repo: &Repository, - downloaded_timelines: HashSet, -) -> anyhow::Result<()> { - // first, register timeline metadata to ensure ancestors will be found later during layer load - for &timeline_id in &downloaded_timelines { - repo.attach_timeline(timeline_id).with_context(|| { - format!("Failed to load timeline {timeline_id} into in-memory repository") - })?; - } - - // and then load its layers in memory - for timeline_id in downloaded_timelines { - repo.get_timeline_load(timeline_id).with_context(|| { - format!( - "Failed to register add local timeline for tenant {}", - repo.tenant_id(), - ) - })?; - } - + repo.init_attach_timelines(timelines_to_attach) + .with_context(|| format!("Failed to init local timelines for tenant {tenant_id}"))?; Ok(()) } diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 4f760751db..936699c2ec 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -108,7 +108,7 @@ fn bootstrap_timeline( tenantid: ZTenantId, tli: ZTimelineId, repo: &Repository, -) -> Result<()> { +) -> Result> { let initdb_path = conf .tenant_path(&tenantid) .join(format!("tmp-timeline-{}", tli)); @@ -141,7 +141,7 @@ fn bootstrap_timeline( // Remove temp dir. 
We don't need it anymore fs::remove_dir_all(pgdata_path)?; - Ok(()) + Ok(timeline) } /// @@ -159,7 +159,7 @@ pub(crate) fn create_timeline( new_timeline_id: Option, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, -) -> Result)>> { +) -> Result>> { let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; @@ -168,11 +168,11 @@ pub(crate) fn create_timeline( return Ok(None); } - match ancestor_timeline_id { + let loaded_timeline = match ancestor_timeline_id { Some(ancestor_timeline_id) => { let ancestor_timeline = repo - .get_timeline_load(ancestor_timeline_id) - .context("Cannot branch off the timeline that's not present locally")?; + .get_timeline(ancestor_timeline_id) + .context("Cannot branch off the timeline that's not present in pageserver")?; if let Some(lsn) = ancestor_start_lsn.as_mut() { // Wait for the WAL to arrive and be processed on the parent branch up @@ -201,8 +201,5 @@ pub(crate) fn create_timeline( None => bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?, }; - // load the timeline into memory - let loaded_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; - - Ok(Some((new_timeline_id, loaded_timeline))) + Ok(Some(loaded_timeline)) } diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 2c29a56ad2..d441bbb4ab 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -132,7 +132,7 @@ pub async fn handle_walreceiver_connection( let (repo, timeline) = tokio::task::spawn_blocking(move || { let repo = tenant_mgr::get_repository_for_tenant(tenant_id) .with_context(|| format!("no repository found for tenant {tenant_id}"))?; - let timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id) + let timeline = repo.get_timeline(timeline_id) .with_context(|| { format!("local timeline {timeline_id} not found for tenant {tenant_id}") })?; diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 31b54f827b..4aba2494e9 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -68,9 +68,11 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # But all others are broken - # First timeline would fail instantly due to corrupt metadata file + # First timeline would not get loaded into pageserver due to corrupt metadata file (_tenant, _timeline, pg) = tenant_timelines[1] - with pytest.raises(Exception, match="Cannot load local timeline") as err: + with pytest.raises( + Exception, match=f"Could not get timeline {timeline1} in tenant {tenant1}" + ) as err: pg.start() log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 8ee38fcf4f..a7b7189824 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -93,10 +93,7 @@ def check_client(client: NeonPageserverHttpClient, initial_tenant: ZTenantId): assert ZTenantId(timeline_details["tenant_id"]) == tenant_id assert ZTimelineId(timeline_details["timeline_id"]) == timeline_id - - local_timeline_details = timeline_details.get("local") - assert local_timeline_details is not None - assert local_timeline_details["timeline_state"] == "Loaded" + assert 
timeline_details.get("local") is not None def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): From 827c3013bde272660d2577883cbc07c770c5a4e6 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 2 Sep 2022 13:20:31 +0300 Subject: [PATCH 0719/1022] Adjust benchmark code to Ids --- test_runner/fixtures/compare_fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 6bca5be335..ceeeffc785 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -112,10 +112,10 @@ class NeonCompare(PgCompare): return self._pg_bin def flush(self): - self.pscur.execute(f"do_gc {self.env.initial_tenant.hex} {self.timeline} 0") + self.pscur.execute(f"do_gc {self.env.initial_tenant} {self.timeline} 0") def compact(self): - self.pscur.execute(f"compact {self.env.initial_tenant.hex} {self.timeline}") + self.pscur.execute(f"compact {self.env.initial_tenant} {self.timeline}") def report_peak_memory_use(self) -> None: self.zenbenchmark.record( From 8b28adb6a63c493e099f8f9e6a81e1b48b3caa70 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 2 Sep 2022 13:24:00 +0300 Subject: [PATCH 0720/1022] Merge file name and extension for index part files --- pageserver/src/storage_sync/download.rs | 6 ++---- pageserver/src/storage_sync/index.rs | 3 +-- pageserver/src/storage_sync/upload.rs | 12 ++---------- 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index ebc9a252b7..e11a863dcc 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -141,8 +141,7 @@ async fn download_index_part( sync_id: ZTenantTimelineId, ) -> Result { let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) - .with_file_name(IndexPart::FILE_NAME) - .with_extension(IndexPart::FILE_EXTENSION); + .with_file_name(IndexPart::FILE_NAME); let mut index_part_download = storage .download_storage_object(None, &index_part_path) .await?; @@ -663,8 +662,7 @@ mod tests { let local_index_part_path = metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) - .with_file_name(IndexPart::FILE_NAME) - .with_extension(IndexPart::FILE_EXTENSION); + .with_file_name(IndexPart::FILE_NAME); let storage_path = local_storage.remote_object_id(&local_index_part_path)?; fs::create_dir_all(storage_path.parent().unwrap()).await?; fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?; diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 7e644da412..b17bb40da4 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -278,8 +278,7 @@ pub struct IndexPart { } impl IndexPart { - pub const FILE_NAME: &'static str = "index_part"; - pub const FILE_EXTENSION: &'static str = "json"; + pub const FILE_NAME: &'static str = "index_part.json"; #[cfg(test)] pub fn new( diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 7ef775e690..38bad73d3b 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -42,8 +42,7 @@ pub(super) async fn upload_index_part( let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes)); let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) - .with_file_name(IndexPart::FILE_NAME) - 
.with_extension(IndexPart::FILE_EXTENSION); + .with_file_name(IndexPart::FILE_NAME); storage .upload_storage_object(index_part_bytes, index_part_size, &index_part_path) .await @@ -442,17 +441,10 @@ mod tests { let index_part_path = storage_files.first().unwrap(); assert_eq!( - index_part_path.file_stem().and_then(|name| name.to_str()), + index_part_path.file_name().and_then(|name| name.to_str()), Some(IndexPart::FILE_NAME), "Remote index part should have the correct name" ); - assert_eq!( - index_part_path - .extension() - .and_then(|extension| extension.to_str()), - Some(IndexPart::FILE_EXTENSION), - "Remote index part should have the correct extension" - ); let remote_index_part: IndexPart = serde_json::from_slice(&fs::read(&index_part_path).await?)?; From 73f926c39a606ab119e080d542bc103d5c402c56 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 2 Sep 2022 13:13:42 +0300 Subject: [PATCH 0721/1022] Return safekeeper remote storage logging during downloads --- safekeeper/src/wal_backup.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index a15ba02863..5c6991c196 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -437,16 +437,23 @@ pub async fn read_object( file_path: PathBuf, offset: u64, ) -> anyhow::Result>> { - let download = REMOTE_STORAGE + let storage = REMOTE_STORAGE .get() .context("Failed to get remote storage")? .as_ref() - .context("No remote storage configured")? + .context("No remote storage configured")?; + + info!( + "segment download about to start for local path {} at offset {}", + file_path.display(), + offset + ); + let download = storage .download_storage_object(Some((offset, None)), &file_path) .await .with_context(|| { format!( - "Failed to open WAL segment download stream for local storage path {}", + "Failed to open WAL segment download stream for local path {}", file_path.display() ) })?; From a463749f59c3fe020065c4cacc91df8fa11ffb99 Mon Sep 17 00:00:00 2001 From: MMeent Date: Fri, 2 Sep 2022 14:34:40 +0200 Subject: [PATCH 0722/1022] Slim down compute-node images (#2346) Slim down compute-node images: - Optimize compute_ctl build for size, not performance & debug-ability - Don't run unused stages. Saves time in not building the PLV8 extension. - Do not include static libraries in clean postgres - Do the installation and finishing touches in the final layer in one job This allows docker (and kaniko) to only register one change to the files, removing potentially duplicate changed files. 
- The runtime library for libreadline-dev is libreadline8, changing the dependency saves 45 MB - libprotobuf-c-dev -> libprotobuf-c1, saving 100 kB - libossp-uuid-dev -> libossp-uuid16, saving 150 kB - gdal-bin + libgdal-dev -> libgeos-c1v5 + libgdal28 + libproj19, saving 747MB - binutils @ testing -> libc6 @ testing, saving 32 MB --- .github/workflows/build_and_test.yml | 2 +- Cargo.toml | 53 ++++++++++++++++++ Dockerfile.compute-node | 84 ++++++++++++++++++++++------ 3 files changed, 122 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a3314738fa..6fae36c6e4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -459,7 +459,7 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute node with extensions - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID promote-images: runs-on: dev diff --git a/Cargo.toml b/Cargo.toml index f0934853f0..a19f65a14f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,59 @@ members = [ # Besides, debug info should not affect the performance. debug = true +[profile.release-line-debug] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +[profile.release-line-debug-lto] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +lto = true + +[profile.release-line-debug-size] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +opt-level = "s" +[profile.release-line-debug-zize] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +opt-level = "z" +[profile.release-line-debug-size-lto] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +opt-level = "s" +lto = true +[profile.release-line-debug-zize-lto] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +opt-level = "z" +lto = true + +[profile.release-no-debug] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only + +[profile.release-no-debug-size] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only +opt-level = "s" +[profile.release-no-debug-zize] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only +opt-level = "z" + +[profile.release-no-debug-size-lto] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only +opt-level = "s" +lto = true + +[profile.release-no-debug-zize-lto] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only +opt-level = "z" +lto = true + + # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. 
[patch.crates-io] diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 2e031b17da..3298032030 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -3,12 +3,18 @@ ARG TAG=pinned # ARG POSTGIS_VERSION=3.3.0 # ARG PLV8_VERSION=3.1.4 +# +# Layer "build-deps" +# FROM debian:bullseye-slim AS build-deps RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ libcurl4-openssl-dev libossp-uuid-dev +# +# Layer "pg-build" # Build Postgres from the neon postgres repository. +# FROM build-deps AS pg-build COPY vendor/postgres postgres RUN cd postgres && \ @@ -19,9 +25,14 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install -# Build PostGIS from the upstream PostGIS mirror. PostGIS compiles against neon postgres sources without changes. -# Perhaps we could even use the upstream binaries, compiled against vanilla Postgres, but it would require some -# investigation to check that it works, and also keeps working in the future. So for now, we compile our own binaries. +# +# Layer "postgis-build" +# Build PostGIS from the upstream PostGIS mirror. +# +# PostGIS compiles against neon postgres sources without changes. Perhaps we +# could even use the upstream binaries, compiled against vanilla Postgres, but +# it would require some investigation to check that it works, and also keeps +# working in the future. So for now, we compile our own binaries. FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ @@ -42,7 +53,10 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control +# +# Layer "plv8-build" # Build plv8 +# FROM build-deps AS plv8-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ @@ -64,7 +78,10 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +# +# Layer "neon-pg-ext-build" # compile neon extensions +# FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ @@ -79,9 +96,32 @@ FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
-RUN cd compute_tools && cargo build --locked --release +RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto +# +# Clean up postgres folder before inclusion +# +FROM neon-pg-ext-build AS postgres-cleanup-layer +COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql + +# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) +RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp + +# Remove headers that we won't need anymore - we've completed installation of all extensions +RUN rm -r /usr/local/pgsql/include + +# Remove now-useless PGXS src infrastructure +RUN rm -r /usr/local/pgsql/lib/pgxs/src + +# Remove static postgresql libraries - all compilation is finished, so we +# can now remove these files - they must be included in other binaries by now +# if they were to be used by other libraries. +RUN rm /usr/local/pgsql/lib/lib*.a + +# +# Final layer # Put it all together into the final image +# FROM debian:bullseye-slim # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ @@ -93,22 +133,34 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ # TODO: Check if we can make the extension setup more modular versus a linear build # currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc# -COPY --from=neon-pg-ext-build --chown=postgres /usr/local/pgsql /usr/local -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl +COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local +COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl +# Install: +# libreadline8 for psql +# libossp-uuid16 for extension ossp-uuid +# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS +# GLIBC 2.34 for plv8. +# Debian bullseye provides GLIBC 2.31, so we install the library from testing +# +# Lastly, link compute_ctl into zenith_ctl while we're at it, +# so that we don't need to put this in another layer. 
RUN apt update && \ - apt install -y libreadline-dev libossp-uuid-dev gdal-bin libgdal-dev libprotobuf-c-dev && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# Debian bullseye provides GLIBC 2.31 when 2.34 is necessary as we compiled plv8 with that version -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ + apt install --no-install-recommends -y \ + libreadline8 \ + libossp-uuid16 \ + libgeos-c1v5 \ + libgdal28 \ + libproj19 \ + libprotobuf-c1 && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + echo "Installing GLIBC 2.34" && \ + echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ apt update && \ - apt install -y --no-install-recommends -t testing binutils && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# "temporary" symlink for old control-plane -RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl + apt install -y --no-install-recommends -t testing libc6 && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] From a4e79db348b4de57c55ff991f93e89831baec2c2 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 2 Sep 2022 15:46:46 +0300 Subject: [PATCH 0723/1022] Move `neon_local` to `control_plane`. Seems a bit silly to have a separate crate just for the executable. It relies on the control plane for everything it does, and it's the only user of the control plane. --- Cargo.lock | 20 +++---------------- Cargo.toml | 1 - control_plane/Cargo.toml | 3 +++ .../src/bin/neon_local.rs | 7 +++++++ neon_local/Cargo.toml | 19 ------------------ 5 files changed, 13 insertions(+), 37 deletions(-) rename neon_local/src/main.rs => control_plane/src/bin/neon_local.rs (99%) delete mode 100644 neon_local/Cargo.toml diff --git a/Cargo.lock b/Cargo.lock index 2e300e46f5..563a998601 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -495,6 +495,9 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", + "clap 3.2.16", + "comfy-table", + "git-version", "nix", "once_cell", "pageserver", @@ -1648,23 +1651,6 @@ dependencies = [ "tempfile", ] -[[package]] -name = "neon_local" -version = "0.1.0" -dependencies = [ - "anyhow", - "clap 3.2.16", - "comfy-table", - "control_plane", - "git-version", - "pageserver", - "postgres", - "safekeeper", - "serde_json", - "utils", - "workspace_hack", -] - [[package]] name = "nix" version = "0.23.1" diff --git a/Cargo.toml b/Cargo.toml index a19f65a14f..1936b261f7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,6 @@ members = [ "proxy", "safekeeper", "workspace_hack", - "neon_local", "libs/*", ] diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 425eb332c3..8a79a6e566 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" edition = "2021" [dependencies] +clap = "3.0" +comfy-table = "5.0.1" +git-version = "0.3.5" tar = "0.4.38" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } serde = { version = "1.0", features = ["derive"] } diff --git a/neon_local/src/main.rs b/control_plane/src/bin/neon_local.rs similarity index 99% rename from neon_local/src/main.rs rename to control_plane/src/bin/neon_local.rs index 78a465539a..828d6a2e5a 100644 --- a/neon_local/src/main.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1,3 +1,10 @@ +//! +//! 
`neon_local` is an executable that can be used to create a local +//! Neon environment, for testing purposes. The local environment is +//! quite different from the cloud environment with Kubernetes, but it +//! easier to work with locally. The python tests in `test_runner` +//! rely on `neon_local` to set up the environment for each test. +//! use anyhow::{anyhow, bail, Context, Result}; use clap::{App, AppSettings, Arg, ArgMatches}; use control_plane::compute::ComputeControlPlane; diff --git a/neon_local/Cargo.toml b/neon_local/Cargo.toml deleted file mode 100644 index 2fc38cfe02..0000000000 --- a/neon_local/Cargo.toml +++ /dev/null @@ -1,19 +0,0 @@ -[package] -name = "neon_local" -version = "0.1.0" -edition = "2021" - -[dependencies] -clap = "3.0" -anyhow = "1.0" -serde_json = "1" -comfy-table = "5.0.1" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -git-version = "0.3.5" - -# FIXME: 'pageserver' is needed for BranchInfo. Refactor -pageserver = { path = "../pageserver" } -control_plane = { path = "../control_plane" } -safekeeper = { path = "../safekeeper" } -utils = { path = "../libs/utils" } -workspace_hack = { version = "0.1", path = "../workspace_hack" } From 71c965b0e162a5f1431b417b794e64fb5a39832f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 3 Sep 2022 08:48:28 +0300 Subject: [PATCH 0724/1022] Move backpressure throttling implementation to neon extension and function for monitoring throttling time (#2380) * Move backpressure throttling implementation to neon extension and function for monitoring throttling time * Add missing includes * Bump postgres version --- pgxn/neon/neon--1.0.sql | 7 +++++++ pgxn/neon/neon.c | 7 +++++++ pgxn/neon/walproposer.c | 46 +++++++++++++++++++++++++++++++++++++++-- pgxn/neon/walproposer.h | 3 +++ vendor/postgres | 2 +- 5 files changed, 62 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/neon--1.0.sql b/pgxn/neon/neon--1.0.sql index 34f1ba78d4..58b98a5923 100644 --- a/pgxn/neon/neon--1.0.sql +++ b/pgxn/neon/neon--1.0.sql @@ -15,3 +15,10 @@ RETURNS record AS 'MODULE_PATHNAME', 'backpressure_lsns' LANGUAGE C STRICT PARALLEL UNSAFE; + +CREATE FUNCTION backpressure_throttling_time() +RETURNS bigint +AS 'MODULE_PATHNAME', 'backpressure_throttling_time' +LANGUAGE C STRICT +PARALLEL UNSAFE; + diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 595a126f04..62d2624e56 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -40,6 +40,7 @@ void _PG_init(void) PG_FUNCTION_INFO_V1(pg_cluster_size); PG_FUNCTION_INFO_V1(backpressure_lsns); +PG_FUNCTION_INFO_V1(backpressure_throttling_time); Datum pg_cluster_size(PG_FUNCTION_ARGS) @@ -80,3 +81,9 @@ backpressure_lsns(PG_FUNCTION_ARGS) PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); } + +Datum +backpressure_throttling_time(PG_FUNCTION_ARGS) +{ + PG_RETURN_UINT64(BackpressureThrottlingTime()); +} diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 9625325c0a..3baa4802b0 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -36,6 +36,7 @@ #include #include #include +#include "access/xact.h" #include "access/xlogdefs.h" #include "access/xlogutils.h" #include "storage/latch.h" @@ -58,6 +59,7 @@ #include "utils/builtins.h" #include "utils/guc.h" #include "utils/memutils.h" +#include "utils/ps_status.h" #include "utils/timestamp.h" #include "neon.h" @@ -159,8 +161,9 @@ static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); static void 
nwp_prepare_shmem(void); static uint64 backpressure_lag_impl(void); +static bool backpressure_throttling_impl(void); - +static process_interrupts_callback_t PrevProcessInterruptsCallback; static shmem_startup_hook_type prev_shmem_startup_hook_type; @@ -175,9 +178,11 @@ void pg_init_walproposer(void) nwp_prepare_shmem(); delay_backend_us = &backpressure_lag_impl; + PrevProcessInterruptsCallback = ProcessInterruptsCallback; + ProcessInterruptsCallback = backpressure_throttling_impl; WalProposerRegister(); - + WalProposerInit = &WalProposerInitImpl; WalProposerStart = &WalProposerStartImpl; } @@ -1963,6 +1968,7 @@ WalproposerShmemInit(void) { memset(walprop_shared, 0, WalproposerShmemSize()); SpinLockInit(&walprop_shared->mutex); + pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); } LWLockRelease(AddinShmemInitLock); @@ -2401,3 +2407,39 @@ backpressure_lag_impl(void) } return 0; } + +#define BACK_PRESSURE_DELAY 10000L // 0.01 sec + +static bool backpressure_throttling_impl(void) +{ + int64 lag; + TimestampTz start, stop; + bool retry = PrevProcessInterruptsCallback + ? PrevProcessInterruptsCallback() + : false; + + // Don't throttle read only transactions and wal sender. + if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + return retry; + + // Calculate replicas lag + lag = backpressure_lag_impl(); + if (lag == 0) + return retry; + + // Suspend writers until replicas catch up + set_ps_display("backpressure throttling"); + + elog(DEBUG2, "backpressure throttling: lag %lu", lag); + start = GetCurrentTimestamp(); + pg_usleep(BACK_PRESSURE_DELAY); + stop = GetCurrentTimestamp(); + pg_atomic_add_fetch_u64(&walprop_shared->backpressureThrottlingTime, stop - start); + return true; +} + +uint64 +BackpressureThrottlingTime(void) +{ + return pg_atomic_read_u64(&walprop_shared->backpressureThrottlingTime); +} diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index b684d5264f..75167163f3 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -287,6 +287,7 @@ typedef struct WalproposerShmemState slock_t mutex; ReplicationFeedback feedback; term_t mineLastElectedTerm; + pg_atomic_uint64 backpressureThrottlingTime; } WalproposerShmemState; /* @@ -537,4 +538,6 @@ typedef struct WalProposerFunctionsType */ extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions; +extern uint64 BackpressureThrottlingTime(void); + #endif /* __NEON_WALPROPOSER_H__ */ diff --git a/vendor/postgres b/vendor/postgres index 22d9ead36b..bbd2ab1544 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 22d9ead36beeab6b6a99c64f9b0b1576927ad91b +Subproject commit bbd2ab15443935a6871b39f90ed669160d9987ad From eef74754082a0e8e0f8486d9022be98c5f682ce8 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 3 Sep 2022 17:06:19 +0300 Subject: [PATCH 0725/1022] Add tests for measuring effect of lsn caching (#2384) * Add tests for measurif effet of lsn caching * Fix formatting of test_latency.py * Fix test_lsn_mapping test --- test_runner/performance/test_latency.py | 29 +++++++++++++++++++++++++ test_runner/regress/test_lsn_mapping.py | 6 +++-- 2 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 test_runner/performance/test_latency.py diff --git a/test_runner/performance/test_latency.py b/test_runner/performance/test_latency.py new file mode 100644 index 0000000000..9aa618650d --- /dev/null +++ b/test_runner/performance/test_latency.py @@ -0,0 +1,29 @@ +import threading + +import pytest +from 
fixtures.compare_fixtures import PgCompare +from fixtures.neon_fixtures import Postgres +from performance.test_perf_pgbench import get_scales_matrix +from performance.test_wal_backpressure import record_read_latency + + +def start_write_workload(pg: Postgres, scale: int = 10): + with pg.connect().cursor() as cur: + cur.execute(f"create table big as select generate_series(1,{scale*100_000})") + + +# Measure latency of reads on one table, while lots of writes are happening on another table. +# The fine-grained tracking of last-written LSNs helps to keep the latency low. Without it, the reads would +# often need to wait for the WAL records of the unrelated writes to be processed by the pageserver. +@pytest.mark.parametrize("scale", get_scales_matrix(1)) +def test_measure_read_latency_heavy_write_workload(neon_with_baseline: PgCompare, scale: int): + env = neon_with_baseline + pg = env.pg + + with pg.connect().cursor() as cur: + cur.execute(f"create table small as select generate_series(1,{scale*100_000})") + + write_thread = threading.Thread(target=start_write_workload, args=(pg, scale * 100)) + write_thread.start() + + record_read_latency(env, lambda: write_thread.is_alive(), "SELECT count(*) from small") diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index f6ca7000dd..9d1efec2c1 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -1,7 +1,7 @@ from datetime import timedelta from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.utils import query_scalar @@ -34,9 +34,11 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Execute one more transaction with synchronous_commit enabled, to flush # all the previous transactions - cur.execute("SET synchronous_commit=on") cur.execute("INSERT INTO foo VALUES (-1)") + # Wait until WAL is received by pageserver + wait_for_last_flush_lsn(env, pgmain, env.initial_tenant, new_timeline_id) + # Check edge cases: timestamp in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) result = query_scalar( From 2b6c49b2ea07fb95ef3eb571d4081b396bac08f2 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 3 Sep 2022 14:06:00 +0300 Subject: [PATCH 0726/1022] Fix negative usize parsing --- pageserver/src/layered_repository/timeline.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 821995fad1..b050ef4030 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -432,14 +432,12 @@ impl LogicalSize { .map(CurrentLogicalSize::Exact) } None => { - let non_negative_size_increment = size_increment.max(0); - u64::try_from(non_negative_size_increment) - .with_context(|| { - format!( - "Failed to convert size increment {non_negative_size_increment} to u64" - ) - }) - .map(CurrentLogicalSize::Approximate) + let non_negative_size_increment = if size_increment < 0 { + 0 + } else { + u64::try_from(size_increment).expect("not negative, cannot fail") + }; + Ok(CurrentLogicalSize::Approximate(non_negative_size_increment)) } } } From 846d71b948a3a471a13ca9e5d9b813d048b14138 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 4 Sep 2022 22:25:32 +0300 Subject: [PATCH 0727/1022] Add test for last written lsn cache (#1949) * Fix pythin style * Fix iport of 
test_backpressure in test_latency * Apply changed to moved neon extension * Apply changed to moved neon extension * Merge with main * Update pgxn/neon/pagestore_smgr.c Co-authored-by: Heikki Linnakangas * Bump postgres version Co-authored-by: Heikki Linnakangas --- pgxn/neon/pagestore_smgr.c | 29 +++++++++++++++++++---------- vendor/postgres | 2 +- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 3e1b74dba7..21d6dfec52 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -558,7 +558,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * Remember the LSN on this page. When we read the page again, we must * read the same or newer version of it. */ - SetLastWrittenPageLSN(lsn); + SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forknum, blocknum); } @@ -603,7 +603,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(bool *latest) +zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) { XLogRecPtr lsn; @@ -630,9 +630,9 @@ zenith_get_request_lsn(bool *latest) * so our request cannot concern those. */ *latest = true; - lsn = GetLastWrittenPageLSN(); + lsn = GetLastWrittenLSN(rnode, forknum, blkno); Assert(lsn != InvalidXLogRecPtr); - elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", + elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); lsn = zm_adjust_lsn(lsn); @@ -716,7 +716,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO); { ZenithExistsRequest request = { .req.tag = T_ZenithExistsRequest, @@ -791,7 +791,7 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * * FIXME: This is currently not just an optimization, but required for * correctness. Postgres can call smgrnblocks() on the newly-created - * relation. Currently, we don't call SetLastWrittenPageLSN() when a new + * relation. Currently, we don't call SetLastWrittenLSN() when a new * relation created, so if we didn't remember the size in the relsize * cache, we might call smgrnblocks() on the newly-created relation before * the creation WAL record hass been received by the page server. 
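The gist of the change above: the compute no longer keeps a single cluster-wide last-written LSN but tracks it per relation block (SetLastWrittenLSNForBlock / GetLastWrittenLSN with rnode, forknum and blkno), so a page request for one table does not have to wait for WAL produced by writes to unrelated tables. The real implementation is on the PostgreSQL side in C; the sketch below is only a rough Rust illustration of the idea, with assumed names and an assumed chunk granularity.

use std::collections::HashMap;

const CHUNK_SIZE: u32 = 128; // assumed granularity, purely illustrative

// Hypothetical cache key: one entry per (relation, fork, chunk of blocks).
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct ChunkKey {
    rel_node: u32,
    fork: u8,
    chunk: u32, // block_number / CHUNK_SIZE
}

struct LastWrittenLsnCache {
    // Conservative fallback for blocks that were never tracked individually.
    global_max: u64,
    per_chunk: HashMap<ChunkKey, u64>,
}

impl LastWrittenLsnCache {
    fn set_for_block(&mut self, rel_node: u32, fork: u8, blkno: u32, lsn: u64) {
        let key = ChunkKey { rel_node, fork, chunk: blkno / CHUNK_SIZE };
        let entry = self.per_chunk.entry(key).or_insert(0);
        if lsn > *entry {
            *entry = lsn;
        }
        if lsn > self.global_max {
            self.global_max = lsn;
        }
    }

    // A read of this block only needs the pageserver to have caught up to this
    // LSN, not to the newest LSN written anywhere in the cluster.
    fn get_for_block(&self, rel_node: u32, fork: u8, blkno: u32) -> u64 {
        let key = ChunkKey { rel_node, fork, chunk: blkno / CHUNK_SIZE };
        *self.per_chunk.get(&key).unwrap_or(&self.global_max)
    }
}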
@@ -904,6 +904,8 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (IS_LOCAL_REL(reln)) mdextend(reln, forkNum, blkno, buffer, skipFsync); #endif + + SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forkNum); } /* @@ -1079,7 +1081,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno); zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL @@ -1284,7 +1286,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO); { ZenithNblocksRequest request = { .req.tag = T_ZenithNblocksRequest, @@ -1343,8 +1345,9 @@ zenith_dbsize(Oid dbNode) int64 db_size; XLogRecPtr request_lsn; bool latest; + RelFileNode dummy_node = {InvalidOid, InvalidOid, InvalidOid}; - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); { ZenithDbSizeRequest request = { .req.tag = T_ZenithDbSizeRequest, @@ -1431,7 +1434,13 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ XLogFlush(lsn); - SetLastWrittenPageLSN(lsn); + /* + * Truncate may affect several chunks of relations. So we should either update last written LSN for all of them, + * or update LSN for "dummy" metadata block. Second approach seems more efficient. If the relation is extended + * again later, the extension will update the last-written LSN for the extended pages, so there's no harm in + * leaving behind obsolete entries for the truncated chunks. + */ + SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forknum); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) diff --git a/vendor/postgres b/vendor/postgres index bbd2ab1544..a4963aa6df 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit bbd2ab15443935a6871b39f90ed669160d9987ad +Subproject commit a4963aa6df6a44bdee17ef387c01bcf46f6017fd From 7a3e8bb7fb965bea6af0ae4c7393838e21f33d5e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 5 Sep 2022 11:02:13 +0300 Subject: [PATCH 0728/1022] Make tracing span names consistent for mgmt API handlers. --- pageserver/src/http/routes.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f1033eeb2a..52997da5a0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -162,7 +162,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result, let state = get_state(&request); let conf = state.conf; tokio::task::spawn_blocking(move || { - let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered(); + let _enter = info_span!("tenant_detach", tenant = %tenant_id).entered(); tenant_mgr::detach_tenant(conf, tenant_id) }) .await From aeb1cf9c36d3a895828fd8376ea474c5b635c025 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 5 Sep 2022 11:09:32 +0300 Subject: [PATCH 0729/1022] Fix misc typos and grammar in comments. 
--- pageserver/src/tenant_mgr.rs | 8 ++++---- test_runner/fixtures/neon_fixtures.py | 4 ++-- test_runner/fixtures/types.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index cbf9f2094a..7c82745142 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -435,10 +435,10 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any tenants_state::write_tenants().remove(&tenant_id); // If removal fails there will be no way to successfully retry detach, - // because tenant no longer exists in in memory map. And it needs to be removed from it - // before we remove files because it contains references to repository - // which references ephemeral files which are deleted on drop. So if we keep these references - // code will attempt to remove files which no longer exist. This can be fixed by having shutdown + // because the tenant no longer exists in the in-memory map. And it needs to be removed from it + // before we remove files, because it contains references to repository + // which references ephemeral files which are deleted on drop. So if we keep these references, + // we will attempt to remove files which no longer exist. This can be fixed by having shutdown // mechanism for repository that will clean temporary data to avoid any references to ephemeral files let local_tenant_directory = conf.tenant_path(&tenant_id); std::fs::remove_dir_all(&local_tenant_directory).with_context(|| { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9ad9c0cd2f..8ffb2eb829 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2522,8 +2522,8 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post def wait_until(number_of_iterations: int, interval: float, func): """ - Wait until 'func' returns successfully, without exception. Returns the last return value - from the the function. + Wait until 'func' returns successfully, without exception. Returns the + last return value from the function. """ last_exception = None for i in range(number_of_iterations): diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index d5cb200080..bdf675a785 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -50,7 +50,7 @@ class ZId: """ Datatype for a Neon tenant and timeline IDs. Internally it's a 16-byte array, and the string representation is in hex. This corresponds to the ZId / ZTenantId / - ZTimelineIds in in the Rust code. + ZTimelineIds in the Rust code. """ def __init__(self, x: str): From ad057124beecafe188e3402cdd7e7581dc6ec096 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 5 Sep 2022 13:12:02 +0300 Subject: [PATCH 0730/1022] Update relation size cache only when latest LSN is requested (#2310) * Update relation size cache only when latest LSN is requested * Fix tests * Add a test case for timetravel query after pageserver restart. This test is currently failing, the queries return incorrect results. I don't know why, needs to be investigated. FAILED test_runner/batch_others/test_readonly_node.py::test_timetravel - assert 85 == 100000 If you remove the pageserver restart from the test, it passes. 
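The central mechanism in this change is the new `latest` flag on the read path: only a request for the most recent version of a relation may refresh the relation size cache, so a read-only node or a timetravel query pinned to an old LSN can never overwrite the cache with a stale size. Below is a rough, self-contained Rust sketch of that guard (hypothetical types, not the actual pageserver code; the real cache is also kept up to date during WAL ingestion on extend and truncate, which is omitted here).

use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct RelKey(u32);

struct RelSizeCache {
    // rel -> (lsn the size was cached at, nblocks)
    sizes: HashMap<RelKey, (u64, u32)>,
}

impl RelSizeCache {
    fn get_rel_size<F>(&mut self, rel: RelKey, lsn: u64, latest: bool, read_at: F) -> u32
    where
        F: Fn(u64) -> u32,
    {
        if let Some(&(cached_lsn, nblocks)) = self.sizes.get(&rel) {
            if cached_lsn <= lsn {
                return nblocks;
            }
        }
        let nblocks = read_at(lsn);
        if latest {
            // Only a request for the most recent version refreshes the cache;
            // historical reads at old LSNs must leave it alone.
            self.sizes.insert(rel, (lsn, nblocks));
        }
        nblocks
    }
}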
* yapf3 test_readonly_node.py * Add comment about cache correction in case of setting incorrect latest flag * Fix formatting for test_readonly_node.py * Remove unused imports * Fix mypy warning for test_readonly_node.py * Fix formatting of test_readonly_node.py * Bump postgres version Co-authored-by: Heikki Linnakangas --- pageserver/src/basebackup.rs | 6 +- pageserver/src/page_service.rs | 9 +- pageserver/src/pgdatadir_mapping.rs | 58 ++++++----- pageserver/src/walingest.rs | 112 ++++++++++++---------- test_runner/regress/test_readonly_node.py | 51 +++++++++- 5 files changed, 154 insertions(+), 82 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 864c5b8ac8..48b5f1a695 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -186,7 +186,7 @@ where } fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { - let nblocks = self.timeline.get_rel_size(tag, self.lsn)?; + let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?; // Function that adds relation segment data to archive let mut add_file = |segment_index, data: &Vec| -> anyhow::Result<()> { @@ -207,7 +207,9 @@ where for (seg, blocks) in chunks.into_iter().enumerate() { let mut segment_data: Vec = vec![]; for blknum in blocks { - let img = self.timeline.get_rel_page_at_lsn(tag, blknum, self.lsn)?; + let img = self + .timeline + .get_rel_page_at_lsn(tag, blknum, self.lsn, false)?; segment_data.extend_from_slice(&img[..]); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7f7fa3c22b..358618f20c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -696,7 +696,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let exists = timeline.get_rel_exists(req.rel, lsn)?; + let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, @@ -712,7 +712,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let n_blocks = timeline.get_rel_size(req.rel, lsn)?; + let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, @@ -728,7 +728,8 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let total_blocks = timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn)?; + let total_blocks = + timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -754,7 +755,7 @@ impl PageServerHandler { std::thread::sleep(std::time::Duration::from_millis(1000)); } */ - let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn)?; + let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 24002a36e5..7bba64179c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -83,10 +83,16 @@ impl Timeline { //------------------------------------------------------------------------------ /// Look up given page 
version. - pub fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result { + pub fn get_rel_page_at_lsn( + &self, + tag: RelTag, + blknum: BlockNumber, + lsn: Lsn, + latest: bool, + ) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); - let nblocks = self.get_rel_size(tag, lsn)?; + let nblocks = self.get_rel_size(tag, lsn, latest)?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", @@ -100,20 +106,20 @@ impl Timeline { } // Get size of a database in blocks - pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn, latest: bool) -> Result { let mut total_blocks = 0; let rels = self.list_rels(spcnode, dbnode, lsn)?; for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn)?; + let n_blocks = self.get_rel_size(rel, lsn, latest)?; total_blocks += n_blocks as usize; } Ok(total_blocks) } /// Get size of a relation file - pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { + pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { @@ -122,7 +128,7 @@ impl Timeline { if (tag.forknum == pg_constants::FSM_FORKNUM || tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn)? + && !self.get_rel_exists(tag, lsn, latest)? { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -135,13 +141,21 @@ impl Timeline { let mut buf = self.get(key, lsn)?; let nblocks = buf.get_u32_le(); - // Update relation size cache - self.update_cached_rel_size(tag, lsn, nblocks); + if latest { + // Update relation size cache only if "latest" flag is set. + // This flag is set by compute when it is working with most recent version of relation. + // Typically master compute node always set latest=true. + // Please notice, that even if compute node "by mistake" specifies old LSN but set + // latest=true, then it can not cause cache corruption, because with latest=true + // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be + // associated with most recent value of LSN. + self.update_cached_rel_size(tag, lsn, nblocks); + } Ok(nblocks) } /// Does relation exist? - pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { + pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); // first try to lookup relation in cache @@ -660,7 +674,7 @@ impl<'a> DatadirModification<'a> { pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> { let req_lsn = self.tline.get_last_record_lsn(); - let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn)?; + let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?; // Remove entry from dbdir let buf = self.get(DBDIR_KEY)?; @@ -733,7 +747,7 @@ impl<'a> DatadirModification<'a> { pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { ensure!(rel.relnode != 0, "invalid relnode"); let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn)? { + if self.tline.get_rel_exists(rel, last_lsn, true)? { let size_key = rel_size_to_key(rel); // Fetch the old size first let old_size = self.get(size_key)?.get_u32_le(); @@ -1499,19 +1513,19 @@ mod tests { writer.finish()?; // Test read before rel creation. 
Should error out.
- assert!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x10)).is_err());
+ assert!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x10), false).is_err());
// Read block beyond end of relation at different points in time.
// These reads should fall into different delta, image, and in-memory layers.
- assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x20))?, ZERO_PAGE);
- assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x25))?, ZERO_PAGE);
- assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x30))?, ZERO_PAGE);
- assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x35))?, ZERO_PAGE);
- assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, ZERO_PAGE);
- assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x45))?, ZERO_PAGE);
- assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, ZERO_PAGE);
- assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x55))?, ZERO_PAGE);
- assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, ZERO_PAGE);
+ assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x20), false)?, ZERO_PAGE);
+ assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x25), false)?, ZERO_PAGE);
+ assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x30), false)?, ZERO_PAGE);
+ assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x35), false)?, ZERO_PAGE);
+ assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?, ZERO_PAGE);
+ assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x45), false)?, ZERO_PAGE);
+ assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?, ZERO_PAGE);
+ assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x55), false)?, ZERO_PAGE);
+ assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?, ZERO_PAGE);
// Test on an in-memory layer with no preceding layer
let mut writer = tline.begin_record(Lsn(0x70));
@@ -1523,7 +1537,7 @@ mod tests {
)?;
writer.finish()?;
- assert_eq!(tline.get_rel_page_at_lsn(TESTREL_B, 1, Lsn(0x70))?, ZERO_PAGE);
+ assert_eq!(tline.get_rel_page_at_lsn(TESTREL_B, 1, Lsn(0x70), false)?, ZERO_PAGE);
Ok(())
}
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index c0965e7a22..57592a46d3 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -504,7 +504,7 @@ impl<'a> WalIngest<'a> {
assert_eq!(src_rel.spcnode, src_tablespace_id);
assert_eq!(src_rel.dbnode, src_db_id);
- let nblocks = modification.tline.get_rel_size(src_rel, req_lsn)?;
+ let nblocks = modification.tline.get_rel_size(src_rel, req_lsn, true)?;
let dst_rel = RelTag {
spcnode: tablespace_id,
dbnode: db_id,
@@ -521,7 +521,7 @@ impl<'a> WalIngest<'a> {
let content = modification
.tline
- .get_rel_page_at_lsn(src_rel, blknum, req_lsn)?;
+ .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)?;
modification.put_rel_page_image(dst_rel, blknum, content)?;
num_blocks_copied += 1;
}
@@ -680,7 +680,7 @@ impl<'a> WalIngest<'a> {
relnode: xnode.relnode,
};
let last_lsn = self.timeline.get_last_record_lsn();
- if modification.tline.get_rel_exists(rel, last_lsn)? {
+ if modification.tline.get_rel_exists(rel, last_lsn, true)? {
self.put_rel_drop(modification, rel)?;
}
}
@@ -924,10 +924,10 @@ impl<'a> WalIngest<'a> {
}
fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result {
- let nblocks = if !self.timeline.get_rel_exists(rel, lsn)? {
+ let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true)? {
0
} else {
- self.timeline.get_rel_size(rel, lsn)?
+ self.timeline.get_rel_size(rel, lsn, true)?
}; Ok(nblocks) } @@ -943,12 +943,12 @@ impl<'a> WalIngest<'a> { // record. // TODO: would be nice if to be more explicit about it let last_lsn = modification.lsn; - let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? { + let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true)? { // create it with 0 size initially, the logic below will extend it modification.put_rel_creation(rel, 0)?; 0 } else { - self.timeline.get_rel_size(rel, last_lsn)? + self.timeline.get_rel_size(rel, last_lsn, true)? }; if new_nblocks > old_nblocks { @@ -1082,43 +1082,43 @@ mod tests { assert_current_logical_size(&*tline, Lsn(0x50)); // The relation was created at LSN 2, not visible at LSN 1 yet. - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); - assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10)).is_err()); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); + assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50))?, 3); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); // Check page contents at each LSN assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20))?, + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false)?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30))?, + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false)?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40))?, + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false)?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50))?, + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false)?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50))?, + tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, TEST_IMG("foo blk 2 at 5") ); @@ -1129,20 +1129,20 @@ mod tests { assert_current_logical_size(&*tline, Lsn(0x60)); // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 2); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 2); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60))?, + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false)?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?, TEST_IMG("foo blk 1 at 4") ); // should still see the truncated block with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50))?, 3); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50))?, + tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, TEST_IMG("foo blk 2 at 5") ); @@ -1150,19 +1150,19 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x68)); 
walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68))?, 0); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false)?, 0); // Extend from 0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70))?, 2); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false)?, 2); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false)?, ZERO_PAGE ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70))?, + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false)?, TEST_IMG("foo blk 1") ); @@ -1170,15 +1170,15 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x80)); walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, 1501); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, 1501); for blk in 2..1500 { assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80))?, + tline.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false)?, ZERO_PAGE ); } assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80))?, + tline.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false)?, TEST_IMG("foo blk 1500") ); @@ -1198,8 +1198,8 @@ mod tests { m.commit()?; // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); @@ -1207,10 +1207,10 @@ mod tests { m.commit()?; // Check that rel is not visible anymore - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30), false)?, false); // FIXME: should fail - //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30))?.is_none()); + //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30), false)?.is_none()); // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); @@ -1218,8 +1218,8 @@ mod tests { m.commit()?; // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40))?, 1); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40), false)?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false)?, 1); Ok(()) } @@ -1243,18 +1243,18 @@ mod tests { m.commit()?; // The relation was created at LSN 20, not visible at LSN 1 yet. 
- assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); - assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10)).is_err()); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); + assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, relsize); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, relsize); // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, lsn)?, + tline.get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false)?, TEST_IMG(&data) ); } @@ -1266,24 +1266,24 @@ mod tests { m.commit()?; // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 1); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 1); for blkno in 0..1 { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60))?, + tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false)?, TEST_IMG(&data) ); } // should still see all blocks with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50))?, relsize); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, relsize); for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50))?, + tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false)?, TEST_IMG(&data) ); } @@ -1298,14 +1298,14 @@ mod tests { } m.commit()?; - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, relsize); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80), false)?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, relsize); // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x80); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80))?, + tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false)?, TEST_IMG(&data) ); } @@ -1332,14 +1332,17 @@ mod tests { assert_current_logical_size(&*tline, Lsn(lsn)); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE + 1); + assert_eq!( + tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + RELSEG_SIZE + 1 + ); // Truncate one block lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, RELSEG_SIZE); assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate another block @@ -1347,7 +1350,10 @@ mod tests { let mut m = tline.begin_modification(Lsn(lsn)); walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE - 1); + assert_eq!( + tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + RELSEG_SIZE - 1 + ); assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate to 1500, and then truncate all the way down to 0, one block at a time @@ -1359,7 +1365,7 @@ mod tests { walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?; m.commit()?; 
assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn))?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, size as BlockNumber ); diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index fac9d97a42..3be64e077f 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,6 +1,6 @@ import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, wait_for_last_record_lsn from fixtures.types import Lsn from fixtures.utils import query_scalar @@ -101,3 +101,52 @@ def test_readonly_node(neon_simple_env: NeonEnv): node_name="test_readonly_node_preinitdb", lsn=Lsn("0/42"), ) + + +# Similar test, but with more data, and we force checkpoints +def test_timetravel(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_timetravel", "empty") + pg = env.postgres.create_start("test_timetravel") + + client = env.pageserver.http_client() + + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + lsns = [] + + with pg.cursor() as cur: + cur.execute( + """ + CREATE TABLE testtab(id serial primary key, iteration int, data text); + INSERT INTO testtab (iteration, data) SELECT 0, 'data' FROM generate_series(1, 100000); + """ + ) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + lsns.append((0, current_lsn)) + + for i in range(1, 5): + with pg.cursor() as cur: + cur.execute(f"UPDATE testtab SET iteration = {i}") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + lsns.append((i, current_lsn)) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to force a new layer file + env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + + ##### Restart pageserver + env.postgres.stop_all() + env.pageserver.stop() + env.pageserver.start() + + for (i, lsn) in lsns: + pg_old = env.postgres.create_start( + branch_name="test_timetravel", node_name=f"test_old_lsn_{i}", lsn=lsn + ) + with pg_old.cursor() as cur: + assert query_scalar(cur, f"select count(*) from testtab where iteration={i}") == 100000 + assert query_scalar(cur, f"select count(*) from testtab where iteration<>{i}") == 0 From 772078eb5ccdacba2e01a2a09d54d3813c0c3512 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Thu, 1 Sep 2022 01:28:18 +0300 Subject: [PATCH 0731/1022] Reword proxy SNI error message Be more strict with project id/name difference and explain how to get project id out of the domain name. --- proxy/src/auth.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 4e78c576e2..d09470d15e 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -46,9 +46,9 @@ pub enum AuthErrorImpl { MalformedPassword(&'static str), #[error( - "Project name is not specified. \ + "Project ID is not specified. \ Either please upgrade the postgres client library (libpq) for SNI support \ - or pass the project name as a parameter: '&options=project%3D'. \ + or pass the project ID (first part of the domain name) as a parameter: '?options=project%3D'. 
\ See more at https://neon.tech/sni" )] MissingProjectName, From ee0071e90d2084148e0ff931883ac1e3c8c8dc41 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 5 Sep 2022 14:30:37 +0100 Subject: [PATCH 0732/1022] Fix nightly benchmark reports (#2392) --- .github/actions/run-python-test-set/action.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 2344fba13c..01ddced313 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -80,14 +80,14 @@ runs: env: NEON_BIN: /tmp/neon/bin TEST_OUTPUT: /tmp/test_output - # this variable will be embedded in perf test report - # and is needed to distinguish different environments - PLATFORM: github-actions-selfhosted BUILD_TYPE: ${{ inputs.build_type }} AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} shell: bash -euxo pipefail {0} run: | + # PLATFORM will be embedded in the perf test report + # and it is needed to distinguish different environments + export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} if [ "${BUILD_TYPE}" = "remote" ]; then @@ -155,7 +155,7 @@ runs: if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then export REPORT_FROM="$PERF_REPORT_DIR" - export REPORT_TO=local + export REPORT_TO="$PLATFORM" scripts/generate_and_push_perf_report.sh fi fi From 05e263d0d36bb2e49aa83b4488361b78ec3016e9 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 5 Sep 2022 18:30:54 +0300 Subject: [PATCH 0733/1022] Prepare pg 15 support (build system and submodules) (#2337) * Add submodule postgres-15 * Support pg_15 in pgxn/neon * Renamed zenith -> neon in Makefile * fix name of codestyle check * Refactor build system to prepare for building multiple Postgres versions. Rename "vendor/postgres" to "vendor/postgres-v14" Change Postgres build and install directory paths to be version-specific: - tmp_install/build -> pg_install/build/14 - tmp_install/* -> pg_install/14/* And Makefile targets: - "make postgres" -> "make postgres-v14" - "make postgres-headers" -> "make postgres-v14-headers" - etc. Add Makefile aliases: - "make postgres" to build "postgres-v14" and in future, "postgres-v15" - similarly for "make postgres-headers" Fix POSTGRES_DISTRIB_DIR path in pytest scripts * Make postgres version a variable in codestyle workflow * Support vendor/postgres-v15 in codestyle check workflow * Support postgres-v15 building in Makefile * fix pg version in Dockerfile.compute-node * fix kaniko path * Build neon extensions in version-specific directories * fix obsolete mentions of vendor/postgres * use vendor/postgres-v14 in Dockerfile.compute-node.legacy * Use PG_VERSION_NUM to gate dependencies in inmem_smgr.c * Use versioned ECR repositories and image names for compute-node. The image name format is compute-node-vXX, where XX is postgres major version number. For now only v14 is supported. Old format unversioned name (compute-node) is left, because cloud repo depends on it. * update vendor/postgres submodule url (zenith->neondatabase rename) * Fix postgres path in python tests after rebase * fix path in regress test * Use separate dockerfiles to build compute-node: Dockerfile.compute-node-v15 should be identical to Dockerfile.compute-node-v14 except for the version number. 
This is a hack, because Kaniko doesn't support build ARGs properly * bump vendor/postgres-v14 and vendor/postgres-v15 * Don't use Kaniko cache for v14 and v15 compute-node images * Build compute-node images for different versions in different jobs Co-authored-by: Heikki Linnakangas --- .dockerignore | 3 +- .../actions/run-python-test-set/action.yml | 2 +- .github/workflows/build_and_test.yml | 67 +++++-- .github/workflows/codestyle.yml | 18 +- .github/workflows/pg_clients.yml | 2 +- .gitignore | 2 +- .gitmodules | 10 +- Dockerfile | 19 +- ...ompute-node => Dockerfile.compute-node-v14 | 7 +- Dockerfile.compute-node-v15 | 172 +++++++++++++++++ Dockerfile.compute-node.legacy | 3 +- Makefile | 174 ++++++++++++------ NOTICE | 4 +- README.md | 5 +- control_plane/src/local_env.rs | 4 +- docs/settings.md | 2 +- docs/sourcetree.md | 6 +- libs/postgres_ffi/build.rs | 9 +- libs/postgres_ffi/src/xlog_utils.rs | 2 +- .../wal_craft/src/bin/wal_craft.rs | 2 +- pageserver/src/config.rs | 2 +- pgxn/neon/inmem_smgr.c | 4 + pgxn/neon/pagestore_smgr.c | 9 + pgxn/neon/relsize_cache.c | 28 +++ pgxn/neon/walproposer.c | 58 +++++- pgxn/neon/walproposer_utils.c | 97 ++++++++-- test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/regress/test_pg_regress.py | 10 +- vendor/{postgres => postgres-v14} | 0 vendor/postgres-v15 | 1 + 30 files changed, 593 insertions(+), 133 deletions(-) rename Dockerfile.compute-node => Dockerfile.compute-node-v14 (98%) create mode 100644 Dockerfile.compute-node-v15 rename vendor/{postgres => postgres-v14} (100%) create mode 160000 vendor/postgres-v15 diff --git a/.dockerignore b/.dockerignore index 2c78951923..9f8a22d598 100644 --- a/.dockerignore +++ b/.dockerignore @@ -13,6 +13,7 @@ !pgxn/ !proxy/ !safekeeper/ -!vendor/postgres/ +!vendor/postgres-v14/ +!vendor/postgres-v15/ !workspace_hack/ !neon_local/ diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 01ddced313..f04f5d11b8 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -88,7 +88,7 @@ runs: # PLATFORM will be embedded in the perf test report # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} - export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} + export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install/v14} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6fae36c6e4..6eddbc3335 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -78,8 +78,8 @@ jobs: fetch-depth: 1 - name: Set pg revision for caching - id: pg_ver - run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres) + id: pg_v14_rev + run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14) shell: bash -euxo pipefail {0} # Set some environment variables used by all the steps. 
@@ -124,12 +124,12 @@ jobs: v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- - - name: Cache postgres build + - name: Cache postgres v14 build id: cache_pg uses: actions/cache@v3 with: - path: tmp_install/ - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Build postgres if: steps.cache_pg.outputs.cache-hit != 'true' @@ -192,7 +192,7 @@ jobs: shell: bash -euxo pipefail {0} - name: Install postgres binaries - run: cp -a tmp_install /tmp/neon/pg_install + run: cp -a pg_install /tmp/neon/pg_install shell: bash -euxo pipefail {0} - name: Upload Neon artifact @@ -447,7 +447,6 @@ jobs: compute-node-image: runs-on: dev container: gcr.io/kaniko-project/executor:v1.9.0-debug - steps: - name: Checkout uses: actions/checkout@v1 # v3 won't work with kaniko @@ -458,18 +457,57 @@ jobs: - name: Configure ECR login run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Kaniko build compute node with extensions - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID + # compute-node uses postgres 14, which is default now + # cloud repo depends on this image name, thus duplicating it + # remove compute-node when cloud repo is updated + - name: Kaniko build compute node with extensions v14 (compatibility) + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID + + compute-node-image-v14: + runs-on: dev + container: gcr.io/kaniko-project/executor:v1.9.0-debug + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko + with: + submodules: true + fetch-depth: 0 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build compute node with extensions v14 + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID + + + compute-node-image-v15: + runs-on: dev + container: gcr.io/kaniko-project/executor:v1.9.0-debug + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko + with: + submodules: true + fetch-depth: 0 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build compute node with extensions v15 + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID promote-images: runs-on: dev - needs: [ neon-image, compute-node-image, compute-tools-image ] + needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-tools-image ] if: github.event_name != 'workflow_dispatch' container: amazon/aws-cli strategy: fail-fast: false matrix: - name: [ neon, compute-node, compute-tools ] + # compute-node uses postgres 14, which is default now + # cloud repo depends on this image name, thus duplicating it + # remove compute-node when cloud repo is updated + name: [ neon, compute-node, compute-node-v14, compute-tools ] steps: - name: Promote image to latest @@ -501,6 +539,9 @@ jobs: - name: Pull compute node image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node + - name: Pull compute node v14 image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest compute-node-v14 + - name: Pull rust image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust @@ -519,6 +560,9 @@ jobs: - name: Push compute node image to Docker Hub run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}} + - name: Push compute node v14 image to Docker Hub + run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} + - name: Push rust image to Docker Hub run: crane push rust neondatabase/rust:pinned @@ -530,6 +574,7 @@ jobs: crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest calculate-deploy-targets: runs-on: [ self-hosted, Linux, k8s-runner ] diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index b64ea8a01f..a5e31d49ee 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -27,8 +27,10 @@ jobs: # Rust toolchains (e.g. nightly or 1.37.0), add them here. rust_toolchain: [1.58] os: [ubuntu-latest, macos-latest] + # To support several Postgres versions, add them here. 
+ postgres_version: [v14, v15] timeout-minutes: 60 - name: run regression test suite + name: check codestyle rust and postgres runs-on: ${{ matrix.os }} steps: @@ -61,14 +63,14 @@ jobs: - name: Set pg revision for caching id: pg_ver - run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres) + run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-${{matrix.postgres_version}}) - - name: Cache postgres build + - name: Cache postgres ${{matrix.postgres_version}} build id: cache_pg uses: actions/cache@v3 with: path: | - tmp_install/ + pg_install/${{matrix.postgres_version}} key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }} - name: Set extra env for macOS @@ -90,10 +92,10 @@ jobs: if: failure() continue-on-error: true run: | - echo '' && echo '=== config.log ===' && echo '' - cat tmp_install/build/config.log - echo '' && echo '=== configure.log ===' && echo '' - cat tmp_install/build/configure.log + echo '' && echo '=== Postgres ${{matrix.postgres_version}} config.log ===' && echo '' + cat pg_install/build/${{matrix.postgres_version}}/config.log + echo '' && echo '=== Postgres ${{matrix.postgres_version}} configure.log ===' && echo '' + cat pg_install/build/${{matrix.postgres_version}}/configure.log - name: Cache cargo deps id: cache_cargo diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index 95052619cd..bf14865db2 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -52,7 +52,7 @@ jobs: REMOTE_ENV: 1 BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install/v14 shell: bash -euxo pipefail {0} run: | # Test framework expects we have psql binary; diff --git a/.gitignore b/.gitignore index ed718c8c79..618ff2c5b9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ +/pg_install /target /tmp_check -/tmp_install /tmp_check_cli __pycache__/ test_output/ diff --git a/.gitmodules b/.gitmodules index 8975c6e2fa..23765194c1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,8 @@ -[submodule "vendor/postgres"] - path = vendor/postgres - url = https://github.com/zenithdb/postgres +[submodule "vendor/postgres-v14"] + path = vendor/postgres-v14 + url = https://github.com/neondatabase/postgres.git branch = main +[submodule "vendor/postgres-v15"] + path = vendor/postgres-v15 + url = https://github.com/neondatabase/postgres.git + branch = REL_15_STABLE_neon diff --git a/Dockerfile b/Dockerfile index aa31e227da..d379c05051 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,20 +5,24 @@ ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com ARG IMAGE=rust ARG TAG=pinned +# ARGs don't get replaced in RUN commands in Kaniko +# so use hardcoded value below +# ARG PG_VERSION=v14 # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build WORKDIR /home/nonroot -COPY --chown=nonroot vendor/postgres vendor/postgres +ARG PG_VERSION=v14 +COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile ENV BUILD_TYPE release RUN set -e \ - && mold -run make -j $(nproc) -s neon-pg-ext \ - && rm -rf tmp_install/build \ - && tar -C tmp_install -czf /home/nonroot/postgres_install.tar.gz . + && mold -run make -j $(nproc) -s neon-pg-ext-v14 \ + && rm -rf pg_install/v14/build \ + && tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz . 
# Build zenith binaries FROM $REPOSITORY/$IMAGE:$TAG AS build @@ -35,7 +39,8 @@ ARG CACHEPOT_BUCKET=neon-github-dev #ARG AWS_ACCESS_KEY_ID #ARG AWS_SECRET_ACCESS_KEY -COPY --from=pg-build /home/nonroot/tmp_install/include/postgresql/server tmp_install/include/postgresql/server +ARG PG_VERSION=v14 +COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY . . # Show build caching stats to check if it was used in the end. @@ -64,7 +69,9 @@ COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/pageserver COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy /usr/local/bin -COPY --from=pg-build /home/nonroot/tmp_install/ /usr/local/ +# v14 is default for now +ARG PG_VERSION=v14 +COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node-v14 similarity index 98% rename from Dockerfile.compute-node rename to Dockerfile.compute-node-v14 index 3298032030..8ddf752191 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node-v14 @@ -2,6 +2,7 @@ ARG TAG=pinned # apparently, ARGs don't get replaced in RUN commands in kaniko # ARG POSTGIS_VERSION=3.3.0 # ARG PLV8_VERSION=3.1.4 +# ARG PG_VERSION=v14 # # Layer "build-deps" @@ -16,7 +17,7 @@ RUN apt update && \ # Build Postgres from the neon postgres repository. # FROM build-deps AS pg-build -COPY vendor/postgres postgres +COPY vendor/postgres-v14 postgres RUN cd postgres && \ ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ @@ -28,8 +29,8 @@ RUN cd postgres && \ # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. -# -# PostGIS compiles against neon postgres sources without changes. Perhaps we +# +# PostGIS compiles against neon postgres sources without changes. Perhaps we # could even use the upstream binaries, compiled against vanilla Postgres, but # it would require some investigation to check that it works, and also keeps # working in the future. So for now, we compile our own binaries. diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 new file mode 100644 index 0000000000..f949ef7680 --- /dev/null +++ b/Dockerfile.compute-node-v15 @@ -0,0 +1,172 @@ +# +# This file is identical to the Dockerfile.compute-node-v14 file +# except for the version of Postgres that is built. +# + +ARG TAG=pinned +# apparently, ARGs don't get replaced in RUN commands in kaniko +# ARG POSTGIS_VERSION=3.3.0 +# ARG PLV8_VERSION=3.1.4 +# ARG PG_VERSION=v15 + +# +# Layer "build-deps" +# +FROM debian:bullseye-slim AS build-deps +RUN apt update && \ + apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ + libcurl4-openssl-dev libossp-uuid-dev + +# +# Layer "pg-build" +# Build Postgres from the neon postgres repository. 
+# +FROM build-deps AS pg-build +COPY vendor/postgres-v15 postgres +RUN cd postgres && \ + ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ + # Install headers + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install + +# +# Layer "postgis-build" +# Build PostGIS from the upstream PostGIS mirror. +# +# PostGIS compiles against neon postgres sources without changes. Perhaps we +# could even use the upstream binaries, compiled against vanilla Postgres, but +# it would require some investigation to check that it works, and also keeps +# working in the future. So for now, we compile our own binaries. +FROM build-deps AS postgis-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +RUN apt update && \ + apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget + +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ + tar xvzf postgis-3.3.0.tar.gz && \ + cd postgis-3.3.0 && \ + ./autogen.sh && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + ./configure && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + cd extensions/postgis && \ + make clean && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control + +# +# Layer "plv8-build" +# Build plv8 +# +FROM build-deps AS plv8-build +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +RUN apt update && \ + apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev + +# https://github.com/plv8/plv8/issues/475 +# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary +RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ + echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + apt update && \ + apt install -y --no-install-recommends -t testing binutils + +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ + tar xvzf v3.1.4.tar.gz && \ + cd plv8-3.1.4 && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + rm -rf /plv8-* && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control + +# +# Layer "neon-pg-ext-build" +# compile neon extensions +# +FROM build-deps AS neon-pg-ext-build +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY pgxn/ pgxn/ + +RUN make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/neon \ + -s install + +# Compile and run the Neon-specific `compute_ctl` binary +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools +USER nonroot +# Copy entire project to get Cargo.* files with proper dependencies for the whole project +COPY --chown=nonroot . . 
+RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto + +# +# Clean up postgres folder before inclusion +# +FROM neon-pg-ext-build AS postgres-cleanup-layer +COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql + +# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) +RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp + +# Remove headers that we won't need anymore - we've completed installation of all extensions +RUN rm -r /usr/local/pgsql/include + +# Remove now-useless PGXS src infrastructure +RUN rm -r /usr/local/pgsql/lib/pgxs/src + +# Remove static postgresql libraries - all compilation is finished, so we +# can now remove these files - they must be included in other binaries by now +# if they were to be used by other libraries. +RUN rm /usr/local/pgsql/lib/lib*.a + +# +# Final layer +# Put it all together into the final image +# +FROM debian:bullseye-slim +# Add user postgres +RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ + echo "postgres:test_console_pass" | chpasswd && \ + mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ + chown -R postgres:postgres /var/db/postgres && \ + chmod 0750 /var/db/postgres/compute && \ + echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig + +# TODO: Check if we can make the extension setup more modular versus a linear build +# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc# +COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local +COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl + +# Install: +# libreadline8 for psql +# libossp-uuid16 for extension ossp-uuid +# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS +# GLIBC 2.34 for plv8. +# Debian bullseye provides GLIBC 2.31, so we install the library from testing +# +# Lastly, link compute_ctl into zenith_ctl while we're at it, +# so that we don't need to put this in another layer. 
+RUN apt update && \ + apt install --no-install-recommends -y \ + libreadline8 \ + libossp-uuid16 \ + libgeos-c1v5 \ + libgdal28 \ + libproj19 \ + libprotobuf-c1 && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + echo "Installing GLIBC 2.34" && \ + echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ + echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + apt update && \ + apt install -y --no-install-recommends -t testing libc6 && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl + +USER postgres +ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/Dockerfile.compute-node.legacy b/Dockerfile.compute-node.legacy index ba34e2486f..7689167156 100644 --- a/Dockerfile.compute-node.legacy +++ b/Dockerfile.compute-node.legacy @@ -37,7 +37,8 @@ RUN adduser postgres RUN mkdir /pg && chown postgres:postgres /pg # Copy source files -COPY ./vendor/postgres /pg/ +# version 14 is default for now +COPY ./vendor/postgres-v14 /pg/ COPY ./pgxn /pg/ # Build and install Postgres locally diff --git a/Makefile b/Makefile index 9d7e1497e5..0b2b097ebc 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,7 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) -# Where to install Postgres, default is ./tmp_install, maybe useful for package managers -POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/tmp_install - -# Seccomp BPF is only available for Linux -UNAME_S := $(shell uname -s) -ifeq ($(UNAME_S),Linux) - SECCOMP = --with-libseccomp -else - SECCOMP = -endif +# Where to install Postgres, default is ./pg_install, maybe useful for package managers +POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ # # We differentiate between release / debug build types using the BUILD_TYPE @@ -28,6 +20,13 @@ else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif +# Seccomp BPF is only available for Linux +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + PG_CONFIGURE_OPTS += --with-libseccomp +endif + + # macOS with brew-installed openssl requires explicit paths # It can be configured with OPENSSL_PREFIX variable UNAME_S := $(shell uname -s) @@ -48,75 +47,136 @@ CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 # -# Top level Makefile to build Zenith and PostgreSQL +# Top level Makefile to build Neon and PostgreSQL # .PHONY: all -all: zenith postgres neon-pg-ext +all: neon postgres neon-pg-ext -### Zenith Rust bits +### Neon Rust bits # # The 'postgres_ffi' depends on the Postgres headers. -.PHONY: zenith -zenith: postgres-headers - +@echo "Compiling Zenith" +.PHONY: neon +neon: postgres-v14-headers + +@echo "Compiling Neon" $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) ### PostgreSQL parts -$(POSTGRES_INSTALL_DIR)/build/config.status: - +@echo "Configuring postgres build" - mkdir -p $(POSTGRES_INSTALL_DIR)/build - (cd $(POSTGRES_INSTALL_DIR)/build && \ - $(ROOT_PROJECT_DIR)/vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \ +# The rules are duplicated for Postgres v14 and 15. We may want to refactor +# to avoid the duplication in the future, but it's tolerable for now. 
+# +$(POSTGRES_INSTALL_DIR)/build/v14/config.status: + +@echo "Configuring Postgres v14 build" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14 + (cd $(POSTGRES_INSTALL_DIR)/build/v14 && \ + $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure CFLAGS='$(PG_CFLAGS)' \ $(PG_CONFIGURE_OPTS) \ - $(SECCOMP) \ - --prefix=$(abspath $(POSTGRES_INSTALL_DIR)) > configure.log) + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log) -# nicer alias for running 'configure' -.PHONY: postgres-configure -postgres-configure: $(POSTGRES_INSTALL_DIR)/build/config.status +$(POSTGRES_INSTALL_DIR)/build/v15/config.status: + +@echo "Configuring Postgres v15 build" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15 + (cd $(POSTGRES_INSTALL_DIR)/build/v15 && \ + $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure CFLAGS='$(PG_CFLAGS)' \ + $(PG_CONFIGURE_OPTS) \ + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log) -# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/include -.PHONY: postgres-headers -postgres-headers: postgres-configure - +@echo "Installing PostgreSQL headers" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/include MAKELEVEL=0 install +# nicer alias to run 'configure' +.PHONY: postgres-v14-configure +postgres-v14-configure: $(POSTGRES_INSTALL_DIR)/build/v14/config.status -# Compile and install PostgreSQL and contrib/neon -.PHONY: postgres -postgres: postgres-configure \ - postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers` - +@echo "Compiling PostgreSQL" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install - +@echo "Compiling libpq" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq install - +@echo "Compiling pg_buffercache" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install - +@echo "Compiling pageinspect" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install +.PHONY: postgres-v15-configure +postgres-v15-configure: $(POSTGRES_INSTALL_DIR)/build/v15/config.status -.PHONY: postgres-clean -postgres-clean: - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq clean +# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)//include +.PHONY: postgres-v14-headers +postgres-v14-headers: postgres-v14-configure + +@echo "Installing PostgreSQL v14 headers" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/include MAKELEVEL=0 install -neon-pg-ext: postgres - +@echo "Compiling neon" - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \ - -C $(ROOT_PROJECT_DIR)/pgxn/neon install - +@echo "Compiling neon_test_utils" - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \ - -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils install +.PHONY: postgres-v15-headers +postgres-v15-headers: postgres-v15-configure + +@echo "Installing PostgreSQL v15 headers" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/include MAKELEVEL=0 install + +# Compile and install PostgreSQL +.PHONY: postgres-v14 +postgres-v14: postgres-v14-configure \ + postgres-v14-headers # to prevent `make install` conflicts with neon's `postgres-headers` + +@echo "Compiling PostgreSQL v14" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install + +@echo "Compiling libpq v14" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install + +@echo "Compiling pg_buffercache v14" + $(MAKE) 
-C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install + +@echo "Compiling pageinspect v14" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect install + +.PHONY: postgres-v15 +postgres-v15: postgres-v15-configure \ + postgres-v15-headers # to prevent `make install` conflicts with neon's `postgres-headers` + +@echo "Compiling PostgreSQL v15" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install + +@echo "Compiling libpq v15" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install + +@echo "Compiling pg_buffercache v15" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install + +@echo "Compiling pageinspect v15" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect install + +# shorthand to build all Postgres versions +postgres: postgres-v14 postgres-v15 + +.PHONY: postgres-v14-clean +postgres-v14-clean: + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq clean + +.PHONY: postgres-v15-clean +postgres-v15-clean: + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq clean + +neon-pg-ext-v14: postgres-v14 + +@echo "Compiling neon v14" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v14 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) + +@echo "Compiling neon_test_utils" v14 + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) + +neon-pg-ext-v15: postgres-v15 + +@echo "Compiling neon v15" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v15 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) + +@echo "Compiling neon_test_utils" v15 + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) .PHONY: neon-pg-ext-clean $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean +neon-pg-ext: neon-pg-ext-v14 neon-pg-ext-v15 +postgres-headers: postgres-v14-headers postgres-v15-headers +postgres-clean: postgres-v14-clean postgres-v15-clean + # This doesn't remove the effects of 'configure'. .PHONY: clean clean: - cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean + cd $(POSTGRES_INSTALL_DIR)/build/v14 && $(MAKE) clean + cd $(POSTGRES_INSTALL_DIR)/build/v15 && $(MAKE) clean $(CARGO_CMD_PREFIX) cargo clean cd pgxn/neon && $(MAKE) clean cd pgxn/neon_test_utils && $(MAKE) clean diff --git a/NOTICE b/NOTICE index 47cc4e798f..4fbec9763b 100644 --- a/NOTICE +++ b/NOTICE @@ -1,5 +1,5 @@ Neon Copyright 2022 Neon Inc. -The PostgreSQL submodule in vendor/postgres is licensed under the -PostgreSQL license. 
See vendor/postgres/COPYRIGHT. +The PostgreSQL submodules in vendor/postgres-v14 and vendor/postgres-v15 are licensed under the +PostgreSQL license. See vendor/postgres-v14/COPYRIGHT and vendor/postgres-v15/COPYRIGHT. diff --git a/README.md b/README.md index f557b19987..57d0a144cb 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ Pageserver consists of: - WAL receiver - service that receives WAL from WAL service and stores it in the repository. - Page service - service that communicates with compute nodes and responds with pages from the repository. - WAL redo - service that builds pages from base images and WAL records on Page service request + ## Running local installation @@ -101,7 +102,7 @@ make -j`sysctl -n hw.logicalcpu` ``` #### Dependency installation notes -To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively. +To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively. To run the integration tests or Python scripts (not required to use the code), install Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry](https://python-poetry.org/)) in the project directory. @@ -208,7 +209,7 @@ Ensure your dependencies are installed as described [here](https://github.com/ne ```sh git clone --recursive https://github.com/neondatabase/neon.git -make # builds also postgres and installs it to ./tmp_install +make # builds also postgres and installs it to ./pg_install ./scripts/pytest ``` diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 75e552f6cc..c4a61dbd7b 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -289,13 +289,13 @@ impl LocalEnv { let mut env: LocalEnv = toml::from_str(toml)?; // Find postgres binaries. - // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install". + // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install/v14". if env.pg_distrib_dir == Path::new("") { if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { env.pg_distrib_dir = postgres_bin.into(); } else { let cwd = env::current_dir()?; - env.pg_distrib_dir = cwd.join("tmp_install") + env.pg_distrib_dir = cwd.join("pg_install/v14") } } diff --git a/docs/settings.md b/docs/settings.md index 5a0e976b47..30db495dbe 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -157,7 +157,7 @@ for other files and for sockets for incoming connections. A directory with Postgres installation to use during pageserver activities. Inside that dir, a `bin/postgres` binary should be present. -The default distrib dir is `./tmp_install/`. +The default distrib dir is `./pg_install/`. #### workdir (-D) diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 88f4b0e559..f3bc9230e2 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -40,15 +40,15 @@ and create new databases and accounts (control plane API in our case). Integration tests, written in Python using the `pytest` framework. -`/vendor/postgres`: +`/vendor/postgres-v14`: PostgreSQL source tree, with the modifications needed for Neon. -`/vendor/postgres/contrib/neon`: +`/pgxn/neon`: PostgreSQL extension that implements storage manager API and network communications with remote page server. 
-`/vendor/postgres/contrib/neon_test_utils`: +`/pgxn/neon_test_utils`: PostgreSQL extension that contains functions needed for testing and debugging. diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 69b2711c22..19507f0557 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -47,14 +47,17 @@ fn main() { println!("cargo:rerun-if-changed=bindgen_deps.h"); // Finding the location of C headers for the Postgres server: - // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/tmp_install` - // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/tmp_install/include/postgresql/server` + // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/pg_install` + // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/pg_install/v14/include/postgresql/server` let mut pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { postgres_install_dir.into() } else { - PathBuf::from("tmp_install") + PathBuf::from("pg_install") }; + // Currently, we only expect to find PostgreSQL v14 sources, in "pg_install/v14". In the + // future, we will run this for all supported PostgreSQL versions. + pg_install_dir.push("v14"); if pg_install_dir.is_relative() { let cwd = env::current_dir().unwrap(); diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index e7838c3f2c..0d9aaa4708 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -449,7 +449,7 @@ mod tests { .join("..") .join(".."); let cfg = Conf { - pg_distrib_dir: top_path.join("tmp_install"), + pg_distrib_dir: top_path.join("pg_install/v14"), datadir: top_path.join(format!("test_output/{}", test_name)), }; if cfg.datadir.exists() { diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 938f8f421b..2a607db6dc 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -37,7 +37,7 @@ fn main() -> Result<()> { Arg::new("pg-distrib-dir") .long("pg-distrib-dir") .takes_value(true) - .help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)") + .help("Directory with Postgres distribution (bin and lib directories, e.g. 
pg_install/v14)") .default_value("/usr/local") ) ) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c1c4169e14..fb70ea327d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -205,7 +205,7 @@ impl Default for PageServerConfigBuilder { workdir: Set(PathBuf::new()), pg_distrib_dir: Set(env::current_dir() .expect("cannot access current directory") - .join("tmp_install")), + .join("pg_install/v14")), auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), diff --git a/pgxn/neon/inmem_smgr.c b/pgxn/neon/inmem_smgr.c index 7840292b08..13fd4d50b6 100644 --- a/pgxn/neon/inmem_smgr.c +++ b/pgxn/neon/inmem_smgr.c @@ -29,6 +29,10 @@ #include "storage/relfilenode.h" #include "storage/smgr.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogutils.h" +#endif + /* Size of the in-memory smgr */ #define MAX_PAGES 64 diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 21d6dfec52..e3f083fd43 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -64,6 +64,11 @@ #include "catalog/pg_tablespace_d.h" #include "postmaster/autovacuum.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogutils.h" +#include "access/xlogrecovery.h" +#endif + /* * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API * calls to md.c, and *also* do the calls to the Page Server. On every @@ -645,7 +650,11 @@ zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, Bloc * _bt_blwritepage logs the full page without flushing WAL before * smgrextend (files are fsynced before build ends). */ +#if PG_VERSION_NUM >= 150000 + flushlsn = GetFlushRecPtr(NULL); +#else flushlsn = GetFlushRecPtr(); +#endif if (lsn > flushlsn) { elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index 8dfcffe1d1..31021f3e41 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -24,6 +24,9 @@ #include "utils/dynahash.h" #include "utils/guc.h" +#if PG_VERSION_NUM >= 150000 +#include "miscadmin.h" +#endif typedef struct { @@ -41,6 +44,10 @@ static HTAB *relsize_hash; static LWLockId relsize_lock; static int relsize_hash_size; static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +#if PG_VERSION_NUM >= 150000 +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static void relsize_shmem_request(void); +#endif /* * Size of a cache entry is 20 bytes. So this default will take about 1.2 MB, @@ -158,10 +165,31 @@ relsize_hash_init(void) if (relsize_hash_size > 0) { +#if PG_VERSION_NUM >= 150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = relsize_shmem_request; +#else RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); RequestNamedLWLockTranche("neon_relsize", 1); +#endif prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = zenith_smgr_shmem_startup; } } + +#if PG_VERSION_NUM >= 150000 +/* + * shmem_request hook: request additional shared resources. We'll allocate or + * attach to the shared resources in zenith_smgr_shmem_startup(). 
+ */ +static void +relsize_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); + RequestNamedLWLockTranche("neon_relsize", 1); +} +#endif diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 3baa4802b0..a769a5216b 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -39,6 +39,10 @@ #include "access/xact.h" #include "access/xlogdefs.h" #include "access/xlogutils.h" +#include "access/xloginsert.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogrecovery.h" +#endif #include "storage/latch.h" #include "miscadmin.h" #include "pgstat.h" @@ -165,7 +169,10 @@ static bool backpressure_throttling_impl(void); static process_interrupts_callback_t PrevProcessInterruptsCallback; static shmem_startup_hook_type prev_shmem_startup_hook_type; - +#if PG_VERSION_NUM >= 150000 +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static void walproposer_shmem_request(void); +#endif void pg_init_walproposer(void) @@ -221,19 +228,38 @@ static void nwp_register_gucs(void) GUC_UNIT_MS, NULL, NULL, NULL ); - + } /* shmem handling */ static void nwp_prepare_shmem(void) { +#if PG_VERSION_NUM >= 150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = walproposer_shmem_request; +#else RequestAddinShmemSpace(WalproposerShmemSize()); - +#endif prev_shmem_startup_hook_type = shmem_startup_hook; shmem_startup_hook = nwp_shmem_startup_hook; } +#if PG_VERSION_NUM >= 150000 +/* + * shmem_request hook: request additional shared resources. We'll allocate or + * attach to the shared resources in nwp_shmem_startup_hook(). + */ +static void +walproposer_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(WalproposerShmemSize()); +} +#endif + static void nwp_shmem_startup_hook(void) { if (prev_shmem_startup_hook_type) @@ -248,6 +274,10 @@ static void nwp_shmem_startup_hook(void) void WalProposerMain(Datum main_arg) { +#if PG_VERSION_NUM >= 150000 + TimeLineID tli; +#endif + /* Establish signal handlers. */ pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGHUP, SignalHandlerForConfigReload); @@ -255,9 +285,14 @@ WalProposerMain(Datum main_arg) BackgroundWorkerUnblockSignals(); +#if PG_VERSION_NUM >= 150000 + // FIXME pass proper tli to WalProposerInit ? 
+ GetXLogReplayRecPtr(&tli); + WalProposerInit(GetFlushRecPtr(NULL), GetSystemIdentifier()); +#else GetXLogReplayRecPtr(&ThisTimeLineID); - WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier()); +#endif last_reconnect_attempt = GetCurrentTimestamp(); @@ -468,7 +503,12 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) !HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16)) elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer); +#if PG_VERSION_NUM >= 150000 +// FIXME don't use hardcoded timeline id + greetRequest.timeline = 1; +#else greetRequest.timeline = ThisTimeLineID; +#endif greetRequest.walSegSize = wal_segment_size; InitEventSet(); @@ -1702,7 +1742,12 @@ SendAppendRequests(Safekeeper *sk) &sk->outbuf.data[sk->outbuf.len], req->beginLsn, req->endLsn - req->beginLsn, + #if PG_VERSION_NUM >= 150000 + // FIXME don't use hardcoded timelineid here + 1, + #else ThisTimeLineID, + #endif &errinfo)) { WALReadRaiseError(&errinfo); @@ -2373,8 +2418,11 @@ backpressure_lag_impl(void) XLogRecPtr writePtr; XLogRecPtr flushPtr; XLogRecPtr applyPtr; +#if PG_VERSION_NUM >= 150000 + XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL); +#else XLogRecPtr myFlushLsn = GetFlushRecPtr(); - +#endif replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); #define MB ((XLogRecPtr)1024*1024) diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c index 7b96fd580c..417a8c4586 100644 --- a/pgxn/neon/walproposer_utils.c +++ b/pgxn/neon/walproposer_utils.c @@ -21,6 +21,11 @@ #include #include +#if PG_VERSION_NUM >= 150000 +#include "access/xlogutils.h" +#include "access/xlogrecovery.h" +#endif + /* * These variables are used similarly to openLogFile/SegNo, * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID @@ -85,7 +90,11 @@ static volatile sig_atomic_t replication_active = false; typedef void (*WalSndSendDataCallback) (void); static void WalSndLoop(WalSndSendDataCallback send_data); static void XLogSendPhysical(void); +#if PG_VERSION_NUM >= 150000 +static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli); +#else static XLogRecPtr GetStandbyFlushRecPtr(void); +#endif static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID *tli_p); @@ -222,10 +231,10 @@ SafekeeperStateDesiredEvents(SafekeeperState state) result = WL_SOCKET_READABLE; break; - /* + /* * Flush states require write-ready for flushing. * Active state does both reading and writing. - * + * * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should * check sk->flushWrite here to set WL_SOCKET_WRITEABLE. */ @@ -398,12 +407,21 @@ XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) if (walpropFile < 0) { + #if PG_VERSION_NUM >= 150000 + // FIXME Is it ok to use hardcoded value here? 
+ TimeLineID tli = 1; + #else bool use_existent = true; - + #endif /* Create/use new log file */ XLByteToSeg(recptr, walpropSegNo, wal_segment_size); + #if PG_VERSION_NUM >= 150000 + walpropFile = XLogFileInit(walpropSegNo, tli); + walpropFileTLI = tli; + #else walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); walpropFileTLI = ThisTimeLineID; + #endif } /* Calculate the start offset of the received logs */ @@ -488,11 +506,14 @@ void StartProposerReplication(StartReplicationCmd *cmd) { XLogRecPtr FlushPtr; + TimeLineID currTLI; + #if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); + #endif /* create xlogreader for physical replication */ xlogreader = @@ -534,10 +555,19 @@ StartProposerReplication(StartReplicationCmd *cmd) * Select the timeline. If it was given explicitly by the client, use * that. Otherwise use the timeline of the last replayed record, which is * kept in ThisTimeLineID. - * + * * Neon doesn't currently use PG Timelines, but it may in the future, so * we keep this code around to lighten the load for when we need it. */ +#if PG_VERSION_NUM >= 150000 + if (am_cascading_walsender) + { + /* this also updates ThisTimeLineID */ + FlushPtr = GetStandbyFlushRecPtr(&currTLI); + } + else + FlushPtr = GetFlushRecPtr(&currTLI); +#else if (am_cascading_walsender) { /* this also updates ThisTimeLineID */ @@ -546,12 +576,16 @@ StartProposerReplication(StartReplicationCmd *cmd) else FlushPtr = GetFlushRecPtr(); + currTLI = ThisTimeLineID; +#endif + + if (cmd->timeline != 0) { XLogRecPtr switchpoint; sendTimeLine = cmd->timeline; - if (sendTimeLine == ThisTimeLineID) + if (sendTimeLine == currTLI) { sendTimeLineIsHistoric = false; sendTimeLineValidUpto = InvalidXLogRecPtr; @@ -566,7 +600,7 @@ StartProposerReplication(StartReplicationCmd *cmd) * Check that the timeline the client requested exists, and the * requested start location is on that timeline. */ - timeLineHistory = readTimeLineHistory(ThisTimeLineID); + timeLineHistory = readTimeLineHistory(currTLI); switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory, &sendTimeLineNextTLI); list_free_deep(timeLineHistory); @@ -605,7 +639,7 @@ StartProposerReplication(StartReplicationCmd *cmd) } else { - sendTimeLine = ThisTimeLineID; + sendTimeLine = currTLI; sendTimeLineValidUpto = InvalidXLogRecPtr; sendTimeLineIsHistoric = false; } @@ -710,6 +744,34 @@ StartProposerReplication(StartReplicationCmd *cmd) EndReplicationCommand("START_STREAMING"); } +#if PG_VERSION_NUM >= 150000 +static XLogRecPtr +GetStandbyFlushRecPtr(TimeLineID *tli) +{ + XLogRecPtr replayPtr; + TimeLineID replayTLI; + XLogRecPtr receivePtr; + TimeLineID receiveTLI; + XLogRecPtr result; + + /* + * We can safely send what's already been replayed. Also, if walreceiver + * is streaming WAL from the same timeline, we can send anything that it + * has streamed, but hasn't been replayed yet. + */ + + receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI); + replayPtr = GetXLogReplayRecPtr(&replayTLI); + + *tli = replayTLI; + + result = replayPtr; + if (receiveTLI == replayTLI && receivePtr > replayPtr) + result = receivePtr; + + return result; +} +#else /* * Returns the latest point in WAL that has been safely flushed to disk, and * can be sent to the standby. 
This should only be called when in recovery, @@ -744,6 +806,9 @@ GetStandbyFlushRecPtr(void) return result; } +#endif + + /* XLogReaderRoutine->segment_open callback */ static void @@ -878,6 +943,7 @@ XLogSendPhysical(void) XLogRecPtr startptr; XLogRecPtr endptr; Size nbytes PG_USED_FOR_ASSERTS_ONLY; + TimeLineID currTLI; /* If requested switch the WAL sender to the stopping state. */ if (got_STOPPING) @@ -919,9 +985,12 @@ XLogSendPhysical(void) * FlushPtr that was calculated before it became historic. */ bool becameHistoric = false; - +#if PG_VERSION_NUM >= 150000 + SendRqstPtr = GetStandbyFlushRecPtr(&currTLI); +#else SendRqstPtr = GetStandbyFlushRecPtr(); - + currTLI = ThisTimeLineID; +#endif if (!RecoveryInProgress()) { /* @@ -935,10 +1004,10 @@ XLogSendPhysical(void) { /* * Still a cascading standby. But is the timeline we're sending - * still the one recovery is recovering from? ThisTimeLineID was + * still the one recovery is recovering from? currTLI was * updated by the GetStandbyFlushRecPtr() call above. */ - if (sendTimeLine != ThisTimeLineID) + if (sendTimeLine != currTLI) becameHistoric = true; } @@ -951,7 +1020,7 @@ XLogSendPhysical(void) */ List *history; - history = readTimeLineHistory(ThisTimeLineID); + history = readTimeLineHistory(currTLI); sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI); Assert(sendTimeLine < sendTimeLineNextTLI); @@ -974,7 +1043,11 @@ XLogSendPhysical(void) * primary: if the primary subsequently crashes and restarts, standbys * must not have applied any WAL that got lost on the primary. */ + #if PG_VERSION_NUM >= 150000 + SendRqstPtr = GetFlushRecPtr(NULL); + #else SendRqstPtr = GetFlushRecPtr(); + #endif } /* diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8ffb2eb829..b47e560325 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -59,7 +59,7 @@ Env = Dict[str, str] Fn = TypeVar("Fn", bound=Callable[..., Any]) DEFAULT_OUTPUT_DIR = "test_output" -DEFAULT_POSTGRES_DIR = "tmp_install" +DEFAULT_POSTGRES_DIR = "pg_install/v14" DEFAULT_BRANCH_NAME = "main" BASE_PORT = 15000 @@ -188,7 +188,7 @@ def can_bind(host: str, port: int) -> bool: Check whether a host:port is available to bind for listening Inspired by the can_bind() perl function used in Postgres tests, in - vendor/postgres/src/test/perl/PostgresNode.pm + vendor/postgres-v14/src/test/perl/PostgresNode.pm """ with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: # TODO: The pageserver and safekeepers don't use SO_REUSEADDR at the diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 119528b8f9..aa5a65f446 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -26,8 +26,8 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, cap (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. 
- build_path = os.path.join(pg_distrib_dir, "build/src/test/regress") - src_path = os.path.join(base_dir, "vendor/postgres/src/test/regress") + build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/regress") + src_path = os.path.join(base_dir, "vendor/postgres-v14/src/test/regress") bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "parallel_schedule") pg_regress = os.path.join(build_path, "pg_regress") @@ -80,8 +80,8 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, caps (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. - build_path = os.path.join(pg_distrib_dir, "build/src/test/isolation") - src_path = os.path.join(base_dir, "vendor/postgres/src/test/isolation") + build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/isolation") + src_path = os.path.join(base_dir, "vendor/postgres-v14/src/test/isolation") bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "isolation_schedule") pg_isolation_regress = os.path.join(build_path, "pg_isolation_regress") @@ -124,7 +124,7 @@ def test_sql_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, ca # Compute all the file locations that pg_regress will need. # This test runs neon specific tests - build_path = os.path.join(pg_distrib_dir, "build/src/test/regress") + build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/regress") src_path = os.path.join(base_dir, "test_runner/sql_regress") bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "parallel_schedule") diff --git a/vendor/postgres b/vendor/postgres-v14 similarity index 100% rename from vendor/postgres rename to vendor/postgres-v14 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 new file mode 160000 index 0000000000..26c6466873 --- /dev/null +++ b/vendor/postgres-v15 @@ -0,0 +1 @@ +Subproject commit 26c64668736b729a3e4c02c6fc0a84544118df26

From f081419e68a32b1420eb1a1337a1d666955278bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?=
Date: Tue, 6 Sep 2022 11:30:20 +0300
Subject: [PATCH 0734/1022] Cleanup tenant specific metrics once a tenant is detached. (#2328)

* Add test for pageserver metric cleanup once a tenant is detached.
* Remove tenant specific timeline metrics on detach.
* Use definitions from timeline_metrics in page service.
* Move metrics to own file from layered_repository/timeline.rs
* TIMELINE_METRICS: define smgr metrics
* REMOVE SMGR cleanup from timeline_metrics. Doesn't seem to work as expected.
* Virtual file centralized metrics, except for evicted file as there's no tenant id or timeline id.
* Use STORAGE_TIME from timeline_metrics in layered_repository.
* Remove timelineless gc metrics for tenant on detach.
* Rename timeline metrics -> metrics as it's more generic.
* Don't create a TimelineMetrics instance for VirtualFile
* Move the rest of the metric definitions to metrics.rs too.
* UUID -> ZTenantId
* Use consistent style for dict.
* Use Repository's Drop trait for dropping STORAGE_TIME metrics.
* No need for Arc, TimelineMetrics is used in just one place. Due to that, we can fall back to using ZTenantId and ZTimelineId too to avoid additional string allocation.
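The approach described in this commit message boils down to tying per-tenant metric label values to the lifetime of a Rust value and removing them in Drop, as the layered_repository.rs and metrics.rs hunks below do for Repository and TimelineMetrics. The following is a minimal, self-contained sketch of that pattern using the prometheus crate directly; STORAGE_TIME mirrors the real metric name, while TenantHandle is an illustrative stand-in, not an actual pageserver type.

use once_cell::sync::Lazy;
use prometheus::{register_histogram_vec, HistogramVec};

// Label-vector metric shared by all tenants; a series only exists for label
// combinations that have actually been touched.
static STORAGE_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_storage_operations_seconds",
        "Time spent on storage operations",
        &["operation", "tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

// Illustrative stand-in for a per-tenant object such as Repository.
struct TenantHandle {
    tenant_id: String,
}

impl TenantHandle {
    fn record_gc(&self, seconds: f64) {
        STORAGE_TIME
            .with_label_values(&["gc", self.tenant_id.as_str(), "-"])
            .observe(seconds);
    }
}

impl Drop for TenantHandle {
    fn drop(&mut self) {
        // On detach, remove the tenant-labelled series so it no longer shows
        // up in the exporter output; ignore the error if it was never created.
        let _ = STORAGE_TIME.remove_label_values(&["gc", self.tenant_id.as_str(), "-"]);
    }
}

Keeping the removal in Drop rather than in an explicit detach path means the labelled series also disappear when the tenant object is torn down on an error path, which is the same reasoning the patch applies to Repository and TimelineMetrics.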
--- pageserver/src/layered_repository.rs | 9 +- .../src/layered_repository/layer_map.rs | 8 +- pageserver/src/layered_repository/timeline.rs | 190 +------- pageserver/src/lib.rs | 12 +- pageserver/src/metrics.rs | 419 ++++++++++++++++++ pageserver/src/page_service.rs | 23 +- pageserver/src/storage_sync.rs | 35 +- pageserver/src/storage_sync/upload.rs | 12 +- pageserver/src/tenant_tasks.rs | 13 +- pageserver/src/virtual_file.rs | 35 +- .../src/walreceiver/walreceiver_connection.rs | 3 +- pageserver/src/walredo.rs | 69 +-- test_runner/fixtures/metrics.py | 31 +- test_runner/regress/test_tenants.py | 49 +- 14 files changed, 522 insertions(+), 386 deletions(-) create mode 100644 pageserver/src/metrics.rs diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 74abbeba86..200834300b 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -32,9 +32,11 @@ use std::time::{Duration, Instant}; use self::metadata::{metadata_path, TimelineMetadata}; use crate::config::PageServerConf; +use crate::metrics::remove_tenant_metrics; use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::metrics::STORAGE_TIME; use crate::repository::GcResult; use crate::tenant_mgr::LocalTimelineUpdate; use crate::thread_mgr; @@ -301,7 +303,7 @@ impl Repository { .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); - timeline::STORAGE_TIME + STORAGE_TIME .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) .observe_closure_duration(|| { self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc) @@ -858,6 +860,11 @@ impl Repository { } } +impl Drop for Repository { + fn drop(&mut self) { + remove_tenant_metrics(&self.tenant_id); + } +} /// Dump contents of a layer file to stdout. pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { use std::os::unix::fs::FileExt; diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 8363d6314f..88dcf32409 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -13,21 +13,15 @@ use crate::layered_repository::inmemory_layer::InMemoryLayer; use crate::layered_repository::storage_layer::Layer; use crate::layered_repository::storage_layer::{range_eq, range_overlaps}; +use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; use anyhow::Result; -use metrics::{register_int_gauge, IntGauge}; -use once_cell::sync::Lazy; use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; use tracing::*; use utils::lsn::Lsn; -static NUM_ONDISK_LAYERS: Lazy = Lazy::new(|| { - register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk") - .expect("failed to define a metric") -}); - /// /// LayerMap tracks what layers exist on a timeline. 
/// diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index b050ef4030..aa9d636739 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -4,8 +4,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::Bytes; use fail::fail_point; use itertools::Itertools; -use metrics::core::{AtomicU64, GenericCounter}; -use once_cell::sync::{Lazy, OnceCell}; +use once_cell::sync::OnceCell; use tracing::*; use std::cmp::{max, min, Ordering}; @@ -17,12 +16,6 @@ use std::sync::{mpsc, Arc, Mutex, MutexGuard, RwLock, TryLockError}; use std::time::{Duration, Instant, SystemTime}; use std::{fs, thread}; -use metrics::{ - register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, - register_uint_gauge_vec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, - IntGaugeVec, UIntGauge, UIntGaugeVec, -}; - use crate::layered_repository::{ delta_layer::{DeltaLayer, DeltaLayerWriter}, ephemeral_file::is_ephemeral_file, @@ -37,6 +30,7 @@ use crate::layered_repository::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::BlockNumber; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::reltag::RelTag; @@ -58,182 +52,6 @@ use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::{page_cache, storage_sync}; -/// Prometheus histogram buckets (in seconds) that capture the majority of -/// latencies in the microsecond range but also extend far enough up to distinguish -/// "bad" from "really bad". -fn get_buckets_for_critical_operations() -> Vec { - let buckets_per_digit = 5; - let min_exponent = -6; - let max_exponent = 2; - - let mut buckets = vec![]; - // Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp - // because it's more numerically stable and doesn't result in numbers like 9.999999 - for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) { - buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64)) - } - buckets -} - -// Metrics collected on operations on the storage repository. -pub static STORAGE_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_storage_operations_seconds", - "Time spent on storage operations", - &["operation", "tenant_id", "timeline_id"], - get_buckets_for_critical_operations(), - ) - .expect("failed to define a metric") -}); - -// Metrics collected on operations on the storage repository. 
-static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_getpage_reconstruct_seconds", - "Time spent in reconstruct_value", - &["tenant_id", "timeline_id"], - get_buckets_for_critical_operations(), - ) - .expect("failed to define a metric") -}); - -static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_materialized_cache_hits_total", - "Number of cache hits from materialized page cache", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - -static WAIT_LSN_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_wait_lsn_seconds", - "Time spent waiting for WAL to arrive", - &["tenant_id", "timeline_id"], - get_buckets_for_critical_operations(), - ) - .expect("failed to define a metric") -}); - -static LAST_RECORD_LSN: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_last_record_lsn", - "Last record LSN grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - -// Metrics for determining timeline's physical size. -// A layered timeline's physical is defined as the total size of -// (delta/image) layer files on disk. -static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { - register_uint_gauge_vec!( - "pageserver_current_physical_size", - "Current physical size grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - -static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { - register_uint_gauge_vec!( - "pageserver_current_logical_size", - "Current logical size grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define current logical size metric") -}); - -// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, -// or in testing they estimate how much we would upload if we did. 
-static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { - register_int_counter!( - "pageserver_created_persistent_files_total", - "Number of files created that are meant to be uploaded to cloud storage", - ) - .expect("failed to define a metric") -}); - -static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { - register_int_counter!( - "pageserver_written_persistent_bytes_total", - "Total bytes written that are meant to be uploaded to cloud storage", - ) - .expect("failed to define a metric") -}); - -struct TimelineMetrics { - pub reconstruct_time_histo: Histogram, - pub materialized_page_cache_hit_counter: GenericCounter, - pub flush_time_histo: Histogram, - pub compact_time_histo: Histogram, - pub create_images_time_histo: Histogram, - pub init_logical_size_histo: Histogram, - pub load_layer_map_histo: Histogram, - pub last_record_gauge: IntGauge, - pub wait_lsn_time_histo: Histogram, - pub current_physical_size_gauge: UIntGauge, - /// copy of LayeredTimeline.current_logical_size - pub current_logical_size_gauge: UIntGauge, -} - -impl TimelineMetrics { - fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self { - let tenant_id = tenant_id.to_string(); - let timeline_id = timeline_id.to_string(); - - let reconstruct_time_histo = RECONSTRUCT_TIME - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); - let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); - let flush_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id]) - .unwrap(); - let compact_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id]) - .unwrap(); - let create_images_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id]) - .unwrap(); - let init_logical_size_histo = STORAGE_TIME - .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id]) - .unwrap(); - let load_layer_map_histo = STORAGE_TIME - .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id]) - .unwrap(); - let last_record_gauge = LAST_RECORD_LSN - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); - let wait_lsn_time_histo = WAIT_LSN_TIME - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); - let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); - let current_logical_size_gauge = CURRENT_LOGICAL_SIZE - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); - - TimelineMetrics { - reconstruct_time_histo, - materialized_page_cache_hit_counter, - flush_time_histo, - compact_time_histo, - create_images_time_histo, - init_logical_size_histo, - load_layer_map_histo, - last_record_gauge, - wait_lsn_time_histo, - current_physical_size_gauge, - current_logical_size_gauge, - } - } -} - pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -1494,8 +1312,8 @@ impl Timeline { let sz = new_delta_path.metadata()?.len(); self.metrics.current_physical_size_gauge.add(sz); // update metrics - NUM_PERSISTENT_FILES_CREATED.inc_by(1); - PERSISTENT_BYTES_WRITTEN.inc_by(sz); + self.metrics.num_persistent_files_created.inc_by(1); + self.metrics.persistent_bytes_written.inc_by(sz); Ok(new_delta_path) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 06c5f552a4..4731179e22 100644 --- a/pageserver/src/lib.rs +++ 
b/pageserver/src/lib.rs @@ -4,6 +4,7 @@ pub mod http; pub mod import_datadir; pub mod keyspace; pub mod layered_repository; +pub mod metrics; pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; @@ -22,11 +23,9 @@ pub mod walreceiver; pub mod walrecord; pub mod walredo; -use once_cell::sync::Lazy; use tracing::info; use crate::thread_mgr::ThreadKind; -use metrics::{register_int_gauge_vec, IntGaugeVec}; /// Current storage format version /// @@ -39,15 +38,6 @@ pub const STORAGE_FORMAT_VERSION: u16 = 3; pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; pub const DELTA_FILE_MAGIC: u16 = 0x5A61; -static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_live_connections", - "Number of live network connections", - &["pageserver_connection_kind"] - ) - .expect("failed to define a metric") -}); - pub const LOG_FILE_NAME: &str = "pageserver.log"; /// Config for the Repository checkpointer diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs new file mode 100644 index 0000000000..35fdeacce5 --- /dev/null +++ b/pageserver/src/metrics.rs @@ -0,0 +1,419 @@ +use metrics::core::{AtomicU64, GenericCounter}; +use metrics::{ + register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec, + register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec, + IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, +}; +use once_cell::sync::Lazy; +use utils::zid::{ZTenantId, ZTimelineId}; + +/// Prometheus histogram buckets (in seconds) that capture the majority of +/// latencies in the microsecond range but also extend far enough up to distinguish +/// "bad" from "really bad". +fn get_buckets_for_critical_operations() -> Vec { + let buckets_per_digit = 5; + let min_exponent = -6; + let max_exponent = 2; + + let mut buckets = vec![]; + // Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp + // because it's more numerically stable and doesn't result in numbers like 9.999999 + for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) { + buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64)) + } + buckets +} + +// Metrics collected on operations on the storage repository. +const STORAGE_TIME_OPERATIONS: &[&str] = &[ + "layer flush", + "compact", + "create images", + "init logical size", + "load layer map", + "gc", +]; + +pub static STORAGE_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_storage_operations_seconds", + "Time spent on storage operations", + &["operation", "tenant_id", "timeline_id"], + get_buckets_for_critical_operations(), + ) + .expect("failed to define a metric") +}); + +// Metrics collected on operations on the storage repository. 
+static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_getpage_reconstruct_seconds", + "Time spent in reconstruct_value", + &["tenant_id", "timeline_id"], + get_buckets_for_critical_operations(), + ) + .expect("failed to define a metric") +}); + +static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_materialized_cache_hits_total", + "Number of cache hits from materialized page cache", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static WAIT_LSN_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_wait_lsn_seconds", + "Time spent waiting for WAL to arrive", + &["tenant_id", "timeline_id"], + get_buckets_for_critical_operations(), + ) + .expect("failed to define a metric") +}); + +static LAST_RECORD_LSN: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_last_record_lsn", + "Last record LSN grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +// Metrics for determining timeline's physical size. +// A layered timeline's physical is defined as the total size of +// (delta/image) layer files on disk. +static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_current_physical_size", + "Current physical size grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_current_logical_size", + "Current logical size grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define current logical size metric") +}); + +// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, +// or in testing they estimate how much we would upload if we did. 
+static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { + IntCounter::new( + "pageserver_created_persistent_files_total", + "Number of files created that are meant to be uploaded to cloud storage", + ) + .expect("failed to define a metric") +}); + +static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { + IntCounter::new( + "pageserver_written_persistent_bytes_total", + "Total bytes written that are meant to be uploaded to cloud storage", + ) + .expect("failed to define a metric") +}); + +// Metrics collected on disk IO operations +const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ + 0.000001, // 1 usec + 0.00001, // 10 usec + 0.0001, // 100 usec + 0.001, // 1 msec + 0.01, // 10 msec + 0.1, // 100 msec + 1.0, // 1 sec +]; + +const STORAGE_IO_TIME_OPERATIONS: &[&str] = + &["open", "close", "read", "write", "seek", "fsync", "gc"]; + +const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"]; + +pub static STORAGE_IO_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_io_operations_seconds", + "Time spent in IO operations", + &["operation", "tenant_id", "timeline_id"], + STORAGE_IO_TIME_BUCKETS.into() + ) + .expect("failed to define a metric") +}); + +pub static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_io_operations_bytes_total", + "Total amount of bytes read/written in IO operations", + &["operation", "tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[ + "get_rel_exists", + "get_rel_size", + "get_page_at_lsn", + "get_db_size", +]; + +const SMGR_QUERY_TIME_BUCKETS: &[f64] = &[ + 0.00001, // 1/100000 s + 0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s + 0.001, 0.0025, 0.005, 0.0075, // 1/1000 s + 0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s + 0.1, // 1/10 s +]; + +pub static SMGR_QUERY_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_smgr_query_seconds", + "Time spent on smgr query handling", + &["smgr_query_type", "tenant_id", "timeline_id"], + SMGR_QUERY_TIME_BUCKETS.into() + ) + .expect("failed to define a metric") +}); + +pub static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_live_connections", + "Number of live network connections", + &["pageserver_connection_kind"] + ) + .expect("failed to define a metric") +}); + +pub static NUM_ONDISK_LAYERS: Lazy = Lazy::new(|| { + register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk") + .expect("failed to define a metric") +}); + +pub static REMAINING_SYNC_ITEMS: Lazy = Lazy::new(|| { + register_int_gauge!( + "pageserver_remote_storage_remaining_sync_items", + "Number of storage sync items left in the queue" + ) + .expect("failed to register pageserver remote storage remaining sync items int gauge") +}); + +pub static IMAGE_SYNC_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_remote_storage_image_sync_seconds", + "Time took to synchronize (download or upload) a whole pageserver image. 
\ + Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)", + &["tenant_id", "timeline_id", "operation_kind", "status"], + vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0] + ) + .expect("failed to register pageserver image sync time histogram vec") +}); + +pub static REMOTE_INDEX_UPLOAD: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_storage_remote_index_uploads_total", + "Number of remote index uploads", + &["tenant_id", "timeline_id"], + ) + .expect("failed to register pageserver remote index upload vec") +}); + +pub static NO_LAYERS_UPLOAD: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_storage_no_layers_uploads_total", + "Number of skipped uploads due to no layers", + &["tenant_id", "timeline_id"], + ) + .expect("failed to register pageserver no layers upload vec") +}); + +pub static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_task_events", + "Number of task start/stop/fail events.", + &["event"], + ) + .expect("Failed to register tenant_task_events metric") +}); + +// Metrics collected on WAL redo operations +// +// We collect the time spent in actual WAL redo ('redo'), and time waiting +// for access to the postgres process ('wait') since there is only one for +// each tenant. + +/// Time buckets are small because we want to be able to measure the +/// smallest redo processing times. These buckets allow us to measure down +/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec. +/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec. +macro_rules! redo_histogram_time_buckets { + () => { + vec![ + 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000, + 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, + ] + }; +} + +/// While we're at it, also measure the amount of records replayed in each +/// operation. We have a global 'total replayed' counter, but that's not +/// as useful as 'what is the skew for how many records we replay in one +/// operation'. +macro_rules! 
redo_histogram_count_buckets { + () => { + vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0] + }; +} + +pub static WAL_REDO_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_wal_redo_seconds", + "Time spent on WAL redo", + redo_histogram_time_buckets!() + ) + .expect("failed to define a metric") +}); + +pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_wal_redo_wait_seconds", + "Time spent waiting for access to the WAL redo process", + redo_histogram_time_buckets!(), + ) + .expect("failed to define a metric") +}); + +pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_wal_redo_records_histogram", + "Histogram of number of records replayed per redo", + redo_histogram_count_buckets!(), + ) + .expect("failed to define a metric") +}); + +pub static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_replayed_wal_records_total", + "Number of WAL records replayed in WAL redo process" + ) + .unwrap() +}); + +#[derive(Debug)] +pub struct TimelineMetrics { + tenant_id: String, + timeline_id: String, + pub reconstruct_time_histo: Histogram, + pub materialized_page_cache_hit_counter: GenericCounter, + pub flush_time_histo: Histogram, + pub compact_time_histo: Histogram, + pub create_images_time_histo: Histogram, + pub init_logical_size_histo: Histogram, + pub load_layer_map_histo: Histogram, + pub last_record_gauge: IntGauge, + pub wait_lsn_time_histo: Histogram, + pub current_physical_size_gauge: UIntGauge, + /// copy of LayeredTimeline.current_logical_size + pub current_logical_size_gauge: UIntGauge, + pub num_persistent_files_created: IntCounter, + pub persistent_bytes_written: IntCounter, +} + +impl TimelineMetrics { + pub fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self { + let tenant_id = tenant_id.to_string(); + let timeline_id = timeline_id.to_string(); + let reconstruct_time_histo = RECONSTRUCT_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let flush_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id]) + .unwrap(); + let compact_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id]) + .unwrap(); + let create_images_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id]) + .unwrap(); + let init_logical_size_histo = STORAGE_TIME + .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id]) + .unwrap(); + let load_layer_map_histo = STORAGE_TIME + .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id]) + .unwrap(); + let last_record_gauge = LAST_RECORD_LSN + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let wait_lsn_time_histo = WAIT_LSN_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let current_logical_size_gauge = CURRENT_LOGICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED.clone(); + let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN.clone(); + + TimelineMetrics { + tenant_id, + 
timeline_id, + reconstruct_time_histo, + materialized_page_cache_hit_counter, + flush_time_histo, + compact_time_histo, + create_images_time_histo, + init_logical_size_histo, + load_layer_map_histo, + last_record_gauge, + wait_lsn_time_histo, + current_physical_size_gauge, + current_logical_size_gauge, + num_persistent_files_created, + persistent_bytes_written, + } + } +} + +impl Drop for TimelineMetrics { + fn drop(&mut self) { + let tenant_id = &self.tenant_id; + let timeline_id = &self.timeline_id; + let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]); + let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]); + let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); + let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); + let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + + for op in STORAGE_TIME_OPERATIONS { + let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]); + } + for op in STORAGE_IO_TIME_OPERATIONS { + let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]); + } + + for op in STORAGE_IO_SIZE_OPERATIONS { + let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]); + } + + for op in SMGR_QUERY_TIME_OPERATIONS { + let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]); + } + } +} + +pub fn remove_tenant_metrics(tenant_id: &ZTenantId) { + let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]); +} diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 358618f20c..783fcb2412 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -11,7 +11,6 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use once_cell::sync::Lazy; use regex::Regex; use std::io::{self, Read}; use std::net::TcpListener; @@ -32,6 +31,7 @@ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; use crate::layered_repository::Timeline; +use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::profiling::profpoint_start; use crate::reltag::RelTag; @@ -39,7 +39,6 @@ use crate::tenant_mgr; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; use crate::CheckpointConfig; -use metrics::{register_histogram_vec, HistogramVec}; use postgres_ffi::v14::xlog_utils::to_pg_timestamp; use postgres_ffi::v14::pg_constants::DEFAULTTABLESPACE_OID; @@ -374,7 +373,7 @@ fn page_service_conn_main( // Immediately increment the gauge, then create a job to decrement it on thread exit. // One of the pros of `defer!` is that this will *most probably* // get called, even in presence of panics. - let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); + let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); gauge.inc(); scopeguard::defer! 
{ gauge.dec(); @@ -427,24 +426,6 @@ struct PageServerHandler { claims: Option, } -const TIME_BUCKETS: &[f64] = &[ - 0.00001, // 1/100000 s - 0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s - 0.001, 0.0025, 0.005, 0.0075, // 1/1000 s - 0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s - 0.1, // 1/10 s -]; - -static SMGR_QUERY_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_smgr_query_seconds", - "Time spent on smgr query handling", - &["smgr_query_type", "tenant_id", "timeline_id"], - TIME_BUCKETS.into() - ) - .expect("failed to define a metric") -}); - impl PageServerHandler { pub fn new(conf: &'static PageServerConf, auth: Option>) -> Self { PageServerHandler { diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 0bdc30a73f..491f882e0b 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -155,7 +155,7 @@ use std::{ use anyhow::{anyhow, bail, Context}; use futures::stream::{FuturesUnordered, StreamExt}; -use once_cell::sync::{Lazy, OnceCell}; +use once_cell::sync::OnceCell; use remote_storage::GenericRemoteStorage; use tokio::{ fs, @@ -170,6 +170,7 @@ use self::{ index::{IndexPart, RemoteTimeline, RemoteTimelineIndex}, upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, }; +use crate::metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD}; use crate::{ config::PageServerConf, exponential_backoff, @@ -183,44 +184,12 @@ use crate::{ thread_mgr::ThreadKind, }; -use metrics::{ - register_histogram_vec, register_int_counter_vec, register_int_gauge, HistogramVec, - IntCounterVec, IntGauge, -}; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; use self::download::download_index_parts; pub use self::download::gather_tenant_timelines_index_parts; pub use self::download::TEMP_DOWNLOAD_EXTENSION; -static REMAINING_SYNC_ITEMS: Lazy = Lazy::new(|| { - register_int_gauge!( - "pageserver_remote_storage_remaining_sync_items", - "Number of storage sync items left in the queue" - ) - .expect("failed to register pageserver remote storage remaining sync items int gauge") -}); - -static IMAGE_SYNC_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_remote_storage_image_sync_seconds", - "Time took to synchronize (download or upload) a whole pageserver image. 
\ - Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)", - &["tenant_id", "timeline_id", "operation_kind", "status"], - vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0] - ) - .expect("failed to register pageserver image sync time histogram vec") -}); - -static REMOTE_INDEX_UPLOAD: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_remote_storage_remote_index_uploads_total", - "Number of remote index uploads", - &["tenant_id", "timeline_id"], - ) - .expect("failed to register pageserver remote index upload vec") -}); - static SYNC_QUEUE: OnceCell = OnceCell::new(); /// A timeline status to share with pageserver's sync counterpart, diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 38bad73d3b..8dd73d9431 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -4,7 +4,6 @@ use std::{fmt::Debug, path::PathBuf}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use once_cell::sync::Lazy; use remote_storage::GenericRemoteStorage; use tokio::fs; use tracing::{debug, error, info, warn}; @@ -15,19 +14,10 @@ use super::{ index::{IndexPart, RemoteTimeline}, LayersUpload, SyncData, SyncQueue, }; +use crate::metrics::NO_LAYERS_UPLOAD; use crate::{ config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, }; -use metrics::{register_int_counter_vec, IntCounterVec}; - -static NO_LAYERS_UPLOAD: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_remote_storage_no_layers_uploads_total", - "Number of skipped uploads due to no layers", - &["tenant_id", "timeline_id"], - ) - .expect("failed to register pageserver no layers upload vec") -}); /// Serializes and uploads the given index part data to the remote storage. pub(super) async fn upload_index_part( diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index ca239ae254..11be13b80c 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -5,28 +5,19 @@ use std::collections::HashMap; use std::ops::ControlFlow; use std::time::Duration; +use crate::metrics::TENANT_TASK_EVENTS; use crate::tenant_mgr::TenantState; use crate::thread_mgr::ThreadKind; use crate::{tenant_mgr, thread_mgr}; use anyhow::{self, Context}; use futures::stream::FuturesUnordered; use futures::StreamExt; -use metrics::{register_int_counter_vec, IntCounterVec}; -use once_cell::sync::{Lazy, OnceCell}; +use once_cell::sync::OnceCell; use tokio::sync::mpsc; use tokio::sync::watch; use tracing::*; use utils::zid::ZTenantId; -static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_tenant_task_events", - "Number of task start/stop/fail events.", - &["event"], - ) - .expect("Failed to register tenant_task_events metric") -}); - /// /// Compaction task's main loop /// diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 5b24b848ad..7a2c699b44 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -10,7 +10,7 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! 
-use once_cell::sync::Lazy; +use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME}; use once_cell::sync::OnceCell; use std::fs::{File, OpenOptions}; use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write}; @@ -19,38 +19,6 @@ use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{RwLock, RwLockWriteGuard}; -use metrics::{register_histogram_vec, register_int_gauge_vec, HistogramVec, IntGaugeVec}; - -// Metrics collected on disk IO operations -const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ - 0.000001, // 1 usec - 0.00001, // 10 usec - 0.0001, // 100 usec - 0.001, // 1 msec - 0.01, // 10 msec - 0.1, // 100 msec - 1.0, // 1 sec -]; - -static STORAGE_IO_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_io_operations_seconds", - "Time spent in IO operations", - &["operation", "tenant_id", "timeline_id"], - STORAGE_IO_TIME_BUCKETS.into() - ) - .expect("failed to define a metric") -}); - -static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_io_operations_bytes_total", - "Total amount of bytes read/written in IO operations", - &["operation", "tenant_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - /// /// A virtual file descriptor. You can use this just like std::fs::File, but internally /// the underlying file is closed if the system is low on file descriptors, @@ -85,7 +53,6 @@ pub struct VirtualFile { pub path: PathBuf, open_options: OpenOptions, - /// For metrics tenantid: String, timelineid: String, } diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index d441bbb4ab..4c30481e02 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -19,6 +19,7 @@ use tokio_postgres::{replication::ReplicationStream, Client}; use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; +use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::{ layered_repository::WalReceiverInfo, tenant_mgr, walingest::WalIngest, walrecord::DecodedWALRecord, @@ -105,7 +106,7 @@ pub async fn handle_walreceiver_connection( // Immediately increment the gauge, then create a job to decrement it on task exit. // One of the pros of `defer!` is that this will *most probably* // get called, even in presence of panics. - let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); + let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); gauge.inc(); scopeguard::defer! 
{ gauge.dec(); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index bf48bd1759..4e49fd9373 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -21,7 +21,6 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; -use once_cell::sync::Lazy; use serde::Serialize; use std::fs; use std::fs::OpenOptions; @@ -39,11 +38,13 @@ use tracing::*; use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock, zid::ZTenantId}; use crate::config::PageServerConf; +use crate::metrics::{ + WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, WAL_REDO_WAIT_TIME, +}; use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::ZenithWalRecord; -use metrics::{register_histogram, register_int_counter, Histogram, IntCounter}; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, @@ -83,70 +84,6 @@ pub trait WalRedoManager: Send + Sync { ) -> Result; } -// Metrics collected on WAL redo operations -// -// We collect the time spent in actual WAL redo ('redo'), and time waiting -// for access to the postgres process ('wait') since there is only one for -// each tenant. - -/// Time buckets are small because we want to be able to measure the -/// smallest redo processing times. These buckets allow us to measure down -/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec. -/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec. -macro_rules! redo_histogram_time_buckets { - () => { - vec![ - 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000, - 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, - ] - }; -} - -/// While we're at it, also measure the amount of records replayed in each -/// operation. We have a global 'total replayed' counter, but that's not -/// as useful as 'what is the skew for how many records we replay in one -/// operation'. -macro_rules! redo_histogram_count_buckets { - () => { - vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0] - }; -} - -static WAL_REDO_TIME: Lazy = Lazy::new(|| { - register_histogram!( - "pageserver_wal_redo_seconds", - "Time spent on WAL redo", - redo_histogram_time_buckets!() - ) - .expect("failed to define a metric") -}); - -static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { - register_histogram!( - "pageserver_wal_redo_wait_seconds", - "Time spent waiting for access to the WAL redo process", - redo_histogram_time_buckets!(), - ) - .expect("failed to define a metric") -}); - -static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { - register_histogram!( - "pageserver_wal_redo_records_histogram", - "Histogram of number of records replayed per redo", - redo_histogram_count_buckets!(), - ) - .expect("failed to define a metric") -}); - -static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { - register_int_counter!( - "pageserver_replayed_wal_records_total", - "Number of WAL records replayed in WAL redo process" - ) - .unwrap() -}); - /// /// This is the real implementation that uses a Postgres process to /// perform WAL replay. 
Only one thread can use the process at a time, diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index b51c7250e0..4d680aa641 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -16,8 +16,11 @@ class Metrics: def query_all(self, name: str, filter: Dict[str, str]) -> List[Sample]: res = [] for sample in self.metrics[name]: - if all(sample.labels[k] == v for k, v in filter.items()): - res.append(sample) + try: + if all(sample.labels[k] == v for k, v in filter.items()): + res.append(sample) + except KeyError: + pass return res def query_one(self, name: str, filter: Dict[str, str] = {}) -> Sample: @@ -34,3 +37,27 @@ def parse_metrics(text: str, name: str = ""): metrics.metrics[sample.name].append(sample) return metrics + + +PAGESERVER_PER_TENANT_METRICS = [ + "pageserver_current_logical_size", + "pageserver_current_physical_size", + "pageserver_getpage_reconstruct_seconds_bucket", + "pageserver_getpage_reconstruct_seconds_count", + "pageserver_getpage_reconstruct_seconds_sum", + "pageserver_io_operations_bytes_total", + "pageserver_io_operations_seconds_bucket", + "pageserver_io_operations_seconds_count", + "pageserver_io_operations_seconds_sum", + "pageserver_last_record_lsn", + "pageserver_materialized_cache_hits_total", + "pageserver_smgr_query_seconds_bucket", + "pageserver_smgr_query_seconds_count", + "pageserver_smgr_query_seconds_sum", + "pageserver_storage_operations_seconds_bucket", + "pageserver_storage_operations_seconds_count", + "pageserver_storage_operations_seconds_sum", + "pageserver_wait_lsn_seconds_bucket", + "pageserver_wait_lsn_seconds_count", + "pageserver_wait_lsn_seconds_sum", +] diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 8bbf45205a..767f94d167 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,12 +1,14 @@ import os from contextlib import closing from datetime import datetime +from typing import List import pytest from fixtures.log_helper import log -from fixtures.metrics import parse_metrics +from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import Lsn +from fixtures.types import Lsn, ZTenantId +from prometheus_client.samples import Sample @pytest.mark.parametrize("with_safekeepers", [False, True]) @@ -122,3 +124,46 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): log.info( f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}" ) + + +def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder): + """Tests that when a tenant is detached, the tenant specific metrics are not left behind""" + + neon_env_builder.num_safekeepers = 3 + + env = neon_env_builder.init_start() + tenant_1, _ = env.neon_cli.create_tenant() + tenant_2, _ = env.neon_cli.create_tenant() + + env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_1) + env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_2) + + pg_tenant1 = env.postgres.create_start("test_metrics_removed_after_detach", tenant_id=tenant_1) + pg_tenant2 = env.postgres.create_start("test_metrics_removed_after_detach", tenant_id=tenant_2) + + for pg in [pg_tenant1, pg_tenant2]: + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t 
SELECT generate_series(1,100000), 'payload'") + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (5000050000,) + + def get_ps_metric_samples_for_tenant(tenant_id: ZTenantId) -> List[Sample]: + ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") + samples = [] + for metric_name in ps_metrics.metrics: + for sample in ps_metrics.query_all( + name=metric_name, filter={"tenant_id": str(tenant_id)} + ): + samples.append(sample) + return samples + + for tenant in [tenant_1, tenant_2]: + pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) + assert pre_detach_samples == set(PAGESERVER_PER_TENANT_METRICS) + + env.pageserver.http_client().tenant_detach(tenant) + + post_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) + assert post_detach_samples == set() From cf157ad8e4541031bc2acded74a9584d6565ec24 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 6 Sep 2022 13:00:40 +0300 Subject: [PATCH 0735/1022] Add test that repeatedly kills and restarts the pageserver. This caught or reproduced several bugs when I originally wrote this test back in May, including #1731, #1740, #1751, and #707. I believe all the issues have been fixed now, but since this was a very fruitful test, let's add it to the test suite. We didn't commit this earlier, because the test was very slow especially with a debug build. We've since changed the build options so that even the debug builds are not quite so slow anymore. --- .../regress/test_pageserver_restart.py | 72 ++++++++++++++++++- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index e2bd8be9b7..eac5e6e61d 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -1,3 +1,6 @@ +from contextlib import closing + +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -38,9 +41,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) - # Stop and restart pageserver. This is a more or less graceful shutdown, although - # the page server doesn't currently have a shutdown routine so there's no difference - # between stopping and crashing. + # Stop the pageserver gracefully and restart it. env.pageserver.stop() env.pageserver.start() @@ -58,3 +59,68 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # Stop the page server by force, and restart it env.pageserver.stop() env.pageserver.start() + + +# Test that repeatedly kills and restarts the page server, while the +# safekeeper and compute node keep running. +@pytest.mark.timeout(540) +def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + # Use a tiny checkpoint distance, to create a lot of layers quickly. + # That allows us to stress the compaction and layer flushing logic more. + tenant, _ = env.neon_cli.create_tenant( + conf={ + "checkpoint_distance": "5000000", + } + ) + env.neon_cli.create_timeline("test_pageserver_chaos", tenant_id=tenant) + pg = env.postgres.create_start("test_pageserver_chaos", tenant_id=tenant) + + # Create table, and insert some rows. 
Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE foo (id int, t text, updates int)") + cur.execute("CREATE INDEX ON foo (id)") + cur.execute( + """ + INSERT INTO foo + SELECT g, 'long string to consume some space' || g, 0 + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.info(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + # Update the whole table, then immediately kill and restart the pageserver + for i in range(1, 15): + pg.safe_psql("UPDATE foo set updates = updates + 1") + + # This kills the pageserver immediately, to simulate a crash + env.pageserver.stop(immediate=True) + env.pageserver.start() + + # Stopping the pageserver breaks the connection from the postgres backend to + # the page server, and causes the next query on the connection to fail. Start a new + # postgres connection too, to avoid that error. (Ideally, the compute node would + # handle that and retry internally, without propagating the error to the user, but + # currently it doesn't...) + pg_conn = pg.connect() + cur = pg_conn.cursor() + + # Check that all the updates are visible + num_updates = pg.safe_psql("SELECT sum(updates) FROM foo")[0][0] + assert num_updates == i * 100000 From f441fe57d47cade9454bc584f6b820c868d43272 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 6 Sep 2022 17:35:40 +0300 Subject: [PATCH 0736/1022] Register prometheus counters correctly. Commit f081419e68 moved all the prometheus counters to `metrics.rs`, but accidentally replaced a couple of `register_int_counter!(...)` calls with just `IntCounter::new(...)`. Because of that, the counters were not registered in the metrics registry, and were not exposed through the metrics HTTP endpoint. Fixes failures we're seeing in a bunch of 'performance' tests because of the missing metrics. --- pageserver/src/metrics.rs | 4 ++-- test_runner/fixtures/benchmark_fixture.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 35fdeacce5..ada0bbd359 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -107,7 +107,7 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, // or in testing they estimate how much we would upload if we did. 
static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { - IntCounter::new( + register_int_counter!( "pageserver_created_persistent_files_total", "Number of files created that are meant to be uploaded to cloud storage", ) @@ -115,7 +115,7 @@ static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { }); static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { - IntCounter::new( + register_int_counter!( "pageserver_written_persistent_bytes_total", "Total bytes written that are meant to be uploaded to cloud storage", ) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 338cc47ea2..b9cdfdebc4 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -362,7 +362,7 @@ class NeonBenchmarker: # and round to integer. all_metrics = pageserver.http_client().get_metrics() matches = re.search(rf"^{metric_name} (\S+)$", all_metrics, re.MULTILINE) - assert matches + assert matches, f"metric {metric_name} not found" return int(round(float(matches.group(1)))) def get_timeline_size(self, repo_dir: Path, tenantid: ZTenantId, timelineid: ZTimelineId): From 65b592d4bd6d503816fd9f4fbb8e11505623f1cd Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 5 Sep 2022 13:59:04 +0300 Subject: [PATCH 0737/1022] Remove deprecated management API for timeline detach. It is no longer used anywhere. --- pageserver/src/http/openapi_spec.yml | 45 ---------------------------- pageserver/src/http/routes.rs | 5 ---- 2 files changed, 50 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index fc3e80ba19..6beb938d6a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -257,51 +257,6 @@ paths: schema: $ref: "#/components/schemas/Error" - /v1/tenant/{tenant_id}/timeline/{timeline_id}/detach: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - format: hex - - name: timeline_id - in: path - required: true - schema: - type: string - format: hex - post: - description: Deprecated, use DELETE /v1/tenant/{tenant_id}/timeline/{timeline_id} instead - deprecated: true - responses: - "200": - description: Ok - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - /v1/tenant/{tenant_id}/detach: parameters: - name: tenant_id diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 52997da5a0..09c4812067 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -696,10 +696,5 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler, ) - // for backward compatibility - .post( - "/v1/tenant/:tenant_id/timeline/:timeline_id/detach", - timeline_delete_handler, - ) .any(handler_404)) } From 2794cd83c70abfd99a879614de9fd5766b707d52 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 7 Sep 2022 12:40:48 +0300 Subject: [PATCH 0738/1022] Prepare pg 15 support (generate bindings for pg15) (#2396) Another preparatory commit for pg15 support: * 
generate bindings for both pg14 and pg15; * update Makefile and CI scripts: now neon build depends on both PostgreSQL versions; * some code refactoring to decrease version-specific dependencies. --- .github/workflows/build_and_test.yml | 27 ++++- Dockerfile | 11 +- Makefile | 2 +- libs/postgres_ffi/README.md | 8 +- libs/postgres_ffi/build.rs | 162 +++++++++++++------------ libs/postgres_ffi/src/lib.rs | 16 ++- libs/postgres_ffi/src/pg_constants.rs | 9 +- libs/postgres_ffi/src/waldecoder.rs | 6 +- libs/postgres_ffi/src/xlog_utils.rs | 36 +++--- libs/postgres_ffi/wal_craft/src/lib.rs | 6 +- pageserver/src/basebackup.rs | 19 +-- pageserver/src/import_datadir.rs | 16 +-- pageserver/src/pgdatadir_mapping.rs | 3 +- pageserver/src/walrecord.rs | 4 +- safekeeper/src/json_ctrl.rs | 4 +- safekeeper/src/metrics.rs | 2 +- safekeeper/src/safekeeper.rs | 2 +- safekeeper/src/send_wal.rs | 4 +- safekeeper/src/timeline.rs | 2 +- safekeeper/src/wal_backup.rs | 4 +- safekeeper/src/wal_storage.rs | 4 +- 21 files changed, 186 insertions(+), 161 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6eddbc3335..6d966ce0a2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -77,11 +77,16 @@ jobs: submodules: true fetch-depth: 1 - - name: Set pg revision for caching + - name: Set pg 14 revision for caching id: pg_v14_rev run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14) shell: bash -euxo pipefail {0} + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15) + shell: bash -euxo pipefail {0} + # Set some environment variables used by all the steps. # # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. 
@@ -125,15 +130,27 @@ jobs: v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- - name: Cache postgres v14 build - id: cache_pg + id: cache_pg_14 uses: actions/cache@v3 with: path: pg_install/v14 key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - name: Build postgres - if: steps.cache_pg.outputs.cache-hit != 'true' - run: mold -run make postgres -j$(nproc) + - name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v3 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: mold -run make postgres-v14 -j$(nproc) + shell: bash -euxo pipefail {0} + + - name: Build postgres v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: mold -run make postgres-v15 -j$(nproc) shell: bash -euxo pipefail {0} - name: Build neon extensions diff --git a/Dockerfile b/Dockerfile index d379c05051..3e173f4d5b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,23 +5,21 @@ ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com ARG IMAGE=rust ARG TAG=pinned -# ARGs don't get replaced in RUN commands in Kaniko -# so use hardcoded value below -# ARG PG_VERSION=v14 # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build WORKDIR /home/nonroot -ARG PG_VERSION=v14 COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 +COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile ENV BUILD_TYPE release RUN set -e \ - && mold -run make -j $(nproc) -s neon-pg-ext-v14 \ + && mold -run make -j $(nproc) -s neon-pg-ext \ && rm -rf pg_install/v14/build \ + && rm -rf pg_install/v15/build \ && tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz . # Build zenith binaries @@ -39,8 +37,8 @@ ARG CACHEPOT_BUCKET=neon-github-dev #ARG AWS_ACCESS_KEY_ID #ARG AWS_SECRET_ACCESS_KEY -ARG PG_VERSION=v14 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY . . # Show build caching stats to check if it was used in the end. @@ -70,7 +68,6 @@ COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/safekeeper COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy /usr/local/bin # v14 is default for now -ARG PG_VERSION=v14 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ diff --git a/Makefile b/Makefile index 0b2b097ebc..0e7ceec15b 100644 --- a/Makefile +++ b/Makefile @@ -56,7 +56,7 @@ all: neon postgres neon-pg-ext # # The 'postgres_ffi' depends on the Postgres headers. .PHONY: neon -neon: postgres-v14-headers +neon: postgres-v14-headers postgres-v15-headers +@echo "Compiling Neon" $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) diff --git a/libs/postgres_ffi/README.md b/libs/postgres_ffi/README.md index 5656314fd7..de046eb3da 100644 --- a/libs/postgres_ffi/README.md +++ b/libs/postgres_ffi/README.md @@ -9,9 +9,11 @@ should be auto-generated too, but that's a TODO. The PostgreSQL on-disk file format is not portable across different CPU architectures and operating systems. It is also subject to change -in each major PostgreSQL version. 
Currently, this module is based on -PostgreSQL v14, but in the future we will probably need a separate -copy for each PostgreSQL version. +in each major PostgreSQL version. Currently, this module supports +PostgreSQL v14 and v15: bindings and code that depends on them are version-specific. +This code is organized in modules: `postgres_ffi::v14` and `postgres_ffi::v15` +Version independend code is explicitly exported into shared `postgres_ffi`. + TODO: Currently, there is also some code that deals with WAL records in pageserver/src/waldecoder.rs. That should be moved into this diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 19507f0557..8389ac37fe 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -48,90 +48,98 @@ fn main() { // Finding the location of C headers for the Postgres server: // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/pg_install` - // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/pg_install/v14/include/postgresql/server` - let mut pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") - { + // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/pg_install/{PG_MAJORVERSION}/include/postgresql/server` + let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { postgres_install_dir.into() } else { PathBuf::from("pg_install") }; - // Currently, we only expect to find PostgreSQL v14 sources, in "pg_install/v14". In the - // future, we will run this for all supported PostgreSQL versions. - pg_install_dir.push("v14"); - if pg_install_dir.is_relative() { - let cwd = env::current_dir().unwrap(); - pg_install_dir = cwd.join("..").join("..").join(pg_install_dir); - } - - let pg_config_bin = pg_install_dir.join("bin").join("pg_config"); - let inc_server_path: String = if pg_config_bin.exists() { - let output = Command::new(pg_config_bin) - .arg("--includedir-server") - .output() - .expect("failed to execute `pg_config --includedir-server`"); - - if !output.status.success() { - panic!("`pg_config --includedir-server` failed") + for pg_version in &["v14", "v15"] { + let mut pg_install_dir_versioned = pg_install_dir.join(pg_version); + if pg_install_dir_versioned.is_relative() { + let cwd = env::current_dir().unwrap(); + pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned); } - String::from_utf8(output.stdout).unwrap().trim_end().into() - } else { - pg_install_dir - .join("include") - .join("postgresql") - .join("server") - .into_os_string() - .into_string() - .unwrap() - }; + let pg_config_bin = pg_install_dir_versioned + .join(pg_version) + .join("bin") + .join("pg_config"); + let inc_server_path: String = if pg_config_bin.exists() { + let output = Command::new(pg_config_bin) + .arg("--includedir-server") + .output() + .expect("failed to execute `pg_config --includedir-server`"); - // The bindgen::Builder is the main entry point - // to bindgen, and lets you build up options for - // the resulting bindings. - let bindings = bindgen::Builder::default() - // - // All the needed PostgreSQL headers are included from 'bindgen_deps.h' - // - .header("bindgen_deps.h") - // - // Tell cargo to invalidate the built crate whenever any of the - // included header files changed. 
- // - .parse_callbacks(Box::new(PostgresFfiCallbacks)) - // - // These are the types and constants that we want to generate bindings for - // - .allowlist_type("BlockNumber") - .allowlist_type("OffsetNumber") - .allowlist_type("MultiXactId") - .allowlist_type("MultiXactOffset") - .allowlist_type("MultiXactStatus") - .allowlist_type("ControlFileData") - .allowlist_type("CheckPoint") - .allowlist_type("FullTransactionId") - .allowlist_type("XLogRecord") - .allowlist_type("XLogPageHeaderData") - .allowlist_type("XLogLongPageHeaderData") - .allowlist_var("XLOG_PAGE_MAGIC") - .allowlist_var("PG_CONTROL_FILE_SIZE") - .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") - .allowlist_type("PageHeaderData") - .allowlist_type("DBState") - // Because structs are used for serialization, tell bindgen to emit - // explicit padding fields. - .explicit_padding(true) - // - .clang_arg(format!("-I{inc_server_path}")) - // - // Finish the builder and generate the bindings. - // - .generate() - .expect("Unable to generate bindings"); + if !output.status.success() { + panic!("`pg_config --includedir-server` failed") + } - // Write the bindings to the $OUT_DIR/bindings.rs file. - let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - bindings - .write_to_file(out_path.join("bindings.rs")) - .expect("Couldn't write bindings!"); + String::from_utf8(output.stdout).unwrap().trim_end().into() + } else { + pg_install_dir_versioned + .join("include") + .join("postgresql") + .join("server") + .into_os_string() + .into_string() + .unwrap() + }; + + // The bindgen::Builder is the main entry point + // to bindgen, and lets you build up options for + // the resulting bindings. + let bindings = bindgen::Builder::default() + // + // All the needed PostgreSQL headers are included from 'bindgen_deps.h' + // + .header("bindgen_deps.h") + // + // Tell cargo to invalidate the built crate whenever any of the + // included header files changed. + // + .parse_callbacks(Box::new(PostgresFfiCallbacks)) + // + // These are the types and constants that we want to generate bindings for + // + .allowlist_type("BlockNumber") + .allowlist_type("OffsetNumber") + .allowlist_type("XLogRecPtr") + .allowlist_type("XLogSegNo") + .allowlist_type("TimeLineID") + .allowlist_type("TimestampTz") + .allowlist_type("MultiXactId") + .allowlist_type("MultiXactOffset") + .allowlist_type("MultiXactStatus") + .allowlist_type("ControlFileData") + .allowlist_type("CheckPoint") + .allowlist_type("FullTransactionId") + .allowlist_type("XLogRecord") + .allowlist_type("XLogPageHeaderData") + .allowlist_type("XLogLongPageHeaderData") + .allowlist_var("XLOG_PAGE_MAGIC") + .allowlist_var("PG_CONTROL_FILE_SIZE") + .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") + .allowlist_type("PageHeaderData") + .allowlist_type("DBState") + // Because structs are used for serialization, tell bindgen to emit + // explicit padding fields. + .explicit_padding(true) + // + .clang_arg(format!("-I{inc_server_path}")) + // + // Finish the builder and generate the bindings. + // + .generate() + .expect("Unable to generate bindings"); + + // Write the bindings to the $OUT_DIR/bindings_$pg_version.rs file. 
+ let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + let filename = format!("bindings_{pg_version}.rs"); + + bindings + .write_to_file(out_path.join(filename)) + .expect("Couldn't write bindings!"); + } } diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 022355329c..f43232ed0c 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -13,13 +13,17 @@ macro_rules! postgres_ffi { ($version:ident) => { #[path = "."] pub mod $version { - // fixme: does this have to be 'pub'? pub mod bindings { // bindgen generates bindings for a lot of stuff we don't need #![allow(dead_code)] use serde::{Deserialize, Serialize}; - include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + include!(concat!( + env!("OUT_DIR"), + "/bindings_", + stringify!($version), + ".rs" + )); } pub mod controlfile_utils; pub mod nonrelfile_utils; @@ -28,6 +32,8 @@ macro_rules! postgres_ffi { pub mod waldecoder; pub mod xlog_utils; + pub const PG_MAJORVERSION: &str = stringify!($version); + // Re-export some symbols from bindings pub use bindings::DBState_DB_SHUTDOWNED; pub use bindings::{CheckPoint, ControlFileData, XLogRecord}; @@ -36,20 +42,26 @@ macro_rules! postgres_ffi { } postgres_ffi!(v14); +postgres_ffi!(v15); // Export some widely used datatypes that are unlikely to change across Postgres versions pub use v14::bindings::{uint32, uint64, Oid}; pub use v14::bindings::{BlockNumber, OffsetNumber}; pub use v14::bindings::{MultiXactId, TransactionId}; +pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; +pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; // from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and // --with-segsize=SEGSIZE, but assume the defaults for now. pub const BLCKSZ: u16 = 8192; pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32); pub const XLOG_BLCKSZ: usize = 8192; +pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; + +pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; // PG timeline is always 1, changing it doesn't have any useful meaning in Neon. // diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 42b5c5d842..8cc9fa7af6 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -7,7 +7,7 @@ //! comments on them. //! -use super::bindings::PageHeaderData; +use super::bindings::{PageHeaderData, XLogRecord}; use crate::BLCKSZ; // @@ -176,7 +176,7 @@ pub const XLOG_DBASE_DROP: u8 = 0x10; pub const XLOG_TBLSPC_CREATE: u8 = 0x00; pub const XLOG_TBLSPC_DROP: u8 = 0x10; -pub const SIZEOF_XLOGRECORD: u32 = 24; +pub const SIZEOF_XLOGRECORD: u32 = std::mem::size_of::() as u32; // // from xlogrecord.h @@ -206,15 +206,10 @@ pub const INVALID_TRANSACTION_ID: u32 = 0; pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000; pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; -/* FIXME: pageserver should request wal_seg_size from compute node */ -pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; - pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; pub const XLP_LONG_HEADER: u16 = 0x0002; -pub const PG_MAJORVERSION: &str = "14"; - // List of subdirectories inside pgdata. 
// Copied from src/bin/initdb/initdb.c pub const PGDATA_SUBDIRS: [&str; 22] = [ diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index b509fc87a5..4d79e4b1d1 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -9,8 +9,8 @@ //! for that is in pageserver/src/walrecord.rs //! use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; -use super::pg_constants; use super::xlog_utils::*; +use crate::WAL_SEGMENT_SIZE; use bytes::{Buf, BufMut, Bytes, BytesMut}; use crc32c::*; use log::*; @@ -133,7 +133,7 @@ impl WalStreamDecoder { // However, we may have to skip some page headers if we're processing the XLOG_SWITCH record or skipping padding for whatever reason. match self.state { State::WaitingForRecord | State::ReassemblingRecord { .. } => { - if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 { + if self.lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 { // parse long header if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD { @@ -265,7 +265,7 @@ impl WalStreamDecoder { // to the next WAL segment. let next_lsn = if xlogrec.is_xlog_switch_record() { trace!("saw xlog switch record at {}", self.lsn); - self.lsn + self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) + self.lsn + self.lsn.calc_padding(WAL_SEGMENT_SIZE as u64) } else { // Pad to an 8-byte boundary self.lsn.align() diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 0d9aaa4708..f8606b6e47 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -10,14 +10,14 @@ use crc32c::crc32c_append; use super::bindings::{ - CheckPoint, FullTransactionId, XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, - XLOG_PAGE_MAGIC, + CheckPoint, FullTransactionId, TimeLineID, TimestampTz, XLogLongPageHeaderData, + XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, }; use super::pg_constants; -use super::pg_constants::WAL_SEGMENT_SIZE; -use crate::v14::waldecoder::WalStreamDecoder; +use super::waldecoder::WalStreamDecoder; use crate::PG_TLI; use crate::{uint32, uint64, Oid}; +use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; use bytes::BytesMut; use bytes::{Buf, Bytes}; @@ -37,11 +37,9 @@ use utils::bin_ser::SerializeError; use utils::lsn::Lsn; pub const XLOG_FNAME_LEN: usize = 24; -pub const XLOG_BLCKSZ: usize = 8192; pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8; pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2; -pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::(); pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::(); @@ -49,11 +47,6 @@ pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::(); #[allow(clippy::identity_op)] pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; -pub type XLogRecPtr = u64; -pub type TimeLineID = u32; -pub type TimestampTz = i64; -pub type XLogSegNo = u64; - /// Interval of checkpointing metadata file. We should store metadata file to enforce /// predicate that checkpoint.nextXid is larger than any XID in WAL. /// But flushing checkpoint file for each transaction seems to be too expensive, @@ -318,9 +311,9 @@ impl CheckPoint { // We need this segment to start compute node. 
// pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result { - let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize); + let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE as usize); - let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE); + let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); let hdr = XLogLongPageHeaderData { std: { XLogPageHeaderData { @@ -333,7 +326,7 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result Result Vec { #[cfg(test)] mod tests { + use super::super::PG_MAJORVERSION; use super::*; use regex::Regex; use std::cmp::min; @@ -434,23 +428,23 @@ mod tests { use utils::const_assert; fn init_logging() { - let _ = env_logger::Builder::from_env( - env_logger::Env::default() - .default_filter_or("wal_craft=info,postgres_ffi::xlog_utils=trace"), - ) + let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or( + format!("wal_craft=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"), + )) .is_test(true) .try_init(); } fn test_end_of_wal(test_name: &str) { use wal_craft::*; + // Craft some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("..") .join(".."); let cfg = Conf { - pg_distrib_dir: top_path.join("pg_install/v14"), - datadir: top_path.join(format!("test_output/{}", test_name)), + pg_distrib_dir: top_path.join(format!("pg_install/{PG_MAJORVERSION}")), + datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)), }; if cfg.datadir.exists() { fs::remove_dir_all(&cfg.datadir).unwrap(); diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 6ac5afb27f..2ad92d776d 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -4,10 +4,8 @@ use log::*; use once_cell::sync::Lazy; use postgres::types::PgLsn; use postgres::Client; -use postgres_ffi::v14::pg_constants::WAL_SEGMENT_SIZE; -use postgres_ffi::v14::xlog_utils::{ - XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, -}; +use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; +use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; use std::cmp::Ordering; use std::fs; use std::path::{Path, PathBuf}; diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 48b5f1a695..cd99c3c67d 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -30,7 +30,7 @@ use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFil use postgres_ffi::v14::{CheckPoint, ControlFileData}; use postgres_ffi::TransactionId; use postgres_ffi::PG_TLI; -use postgres_ffi::{BLCKSZ, RELSEG_SIZE}; +use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, @@ -268,8 +268,11 @@ where None }; + // TODO pass this as a parameter + let pg_version = "14"; + if spcnode == pg_constants::GLOBALTABLESPACE_OID { - let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes(); + let version_bytes = pg_version.as_bytes(); let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?; self.ar.append(&header, version_bytes)?; @@ -312,7 +315,7 @@ where if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); - let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes(); + let version_bytes = pg_version.as_bytes(); let header = new_tar_header(&dst_path, version_bytes.len() as u64)?; 
self.ar.append(&header, version_bytes)?; @@ -358,7 +361,7 @@ where let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?; // Generate new pg_control needed for bootstrap - checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0; + checkpoint.redo = normalize_lsn(self.lsn, WAL_SEGMENT_SIZE).0; //reset some fields we don't want to preserve //TODO Check this. @@ -392,13 +395,13 @@ where self.ar.append(&header, &pg_control_bytes[..])?; //send wal segment - let segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE); - let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE); + let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); + let wal_file_name = XLogFileName(PG_TLI, segno, WAL_SEGMENT_SIZE); let wal_file_path = format!("pg_wal/{}", wal_file_name); - let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?; + let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?; let wal_seg = generate_wal_segment(segno, pg_control.system_identifier) .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; - ensure!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE); + ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); self.ar.append(&header, &wal_seg[..])?; Ok(()) } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 4cc3aafb0e..f8f614f8f4 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -21,7 +21,7 @@ use postgres_ffi::v14::waldecoder::*; use postgres_ffi::v14::xlog_utils::*; use postgres_ffi::v14::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; use postgres_ffi::Oid; -use postgres_ffi::BLCKSZ; +use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; // Returns checkpoint LSN from controlfile @@ -238,15 +238,15 @@ fn import_slru( fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> { let mut waldecoder = WalStreamDecoder::new(startpoint); - let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE); - let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE); + let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); + let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; let mut walingest = WalIngest::new(tline, startpoint)?; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now - let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE); + let filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE); let mut buf = Vec::new(); // Read local file @@ -265,7 +265,7 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) } let nread = file.read_to_end(&mut buf)?; - if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize { + if nread != WAL_SEGMENT_SIZE - offset as usize { // Maybe allow this for .partial files? 
error!("read only {} bytes from WAL file", nread); } @@ -355,8 +355,8 @@ pub fn import_wal_from_tar( ) -> Result<()> { // Set up walingest mutable state let mut waldecoder = WalStreamDecoder::new(start_lsn); - let mut segno = start_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE); - let mut offset = start_lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE); + let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); + let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; let mut walingest = WalIngest::new(tline, start_lsn)?; @@ -373,7 +373,7 @@ pub fn import_wal_from_tar( match header.entry_type() { tar::EntryType::Regular => { // FIXME: assume postgresql tli 1 for now - let expected_filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE); + let expected_filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE); let file_name = file_path .file_name() .expect("missing wal filename") diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7bba64179c..ba48a77961 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -14,9 +14,8 @@ use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::xlog_utils::TimestampTz; use postgres_ffi::BLCKSZ; -use postgres_ffi::{Oid, TransactionId}; +use postgres_ffi::{Oid, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{hash_map, HashMap, HashSet}; use std::ops::Range; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index c56b1c6c0c..c718a4c30c 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -4,10 +4,10 @@ use anyhow::Result; use bytes::{Buf, Bytes}; use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD}; +use postgres_ffi::v14::xlog_utils::XLOG_SIZE_OF_XLOG_RECORD; use postgres_ffi::v14::XLogRecord; use postgres_ffi::BLCKSZ; -use postgres_ffi::{BlockNumber, OffsetNumber}; +use postgres_ffi::{BlockNumber, OffsetNumber, TimestampTz}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; use serde::{Deserialize, Serialize}; use tracing::*; diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 3f84e7b183..16c1d36131 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -18,8 +18,8 @@ use crate::safekeeper::{ }; use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; use crate::timeline::TimelineTools; -use postgres_ffi::v14::pg_constants; use postgres_ffi::v14::xlog_utils; +use postgres_ffi::WAL_SEGMENT_SIZE; use utils::{ lsn::Lsn, postgres_backend::PostgresBackend, @@ -100,7 +100,7 @@ fn prepare_safekeeper(spg: &mut SafekeeperPostgresHandler) -> Result<()> { ztli: spg.ztimelineid.unwrap(), tenant_id: spg.ztenantid.unwrap(), tli: 0, - wal_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32, // 16MB, default for tests + wal_seg_size: WAL_SEGMENT_SIZE as u32, // 16MB, default for tests }); let response = spg.timeline.get().process_msg(&greeting_request)?; diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 648f0634f8..c693035dd3 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -7,7 +7,7 @@ use metrics::{ proto::MetricFamily, Gauge, IntGaugeVec, }; -use postgres_ffi::v14::xlog_utils::XLogSegNo; +use postgres_ffi::XLogSegNo; use utils::{lsn::Lsn, 
zid::ZTenantTimelineId}; use crate::{ diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index ed34669dde..a2bdcb55e7 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -5,7 +5,7 @@ use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use etcd_broker::subscription_value::SkTimelineInfo; -use postgres_ffi::v14::xlog_utils::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; +use postgres_ffi::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 38523f9f82..293cf67c57 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -6,9 +6,9 @@ use crate::timeline::{ReplicaState, Timeline, TimelineTools}; use crate::wal_storage::WalReader; use anyhow::{bail, Context, Result}; -use postgres_ffi::v14::xlog_utils::{get_current_timestamp, TimestampTz, MAX_SEND_SIZE}; - use bytes::Bytes; +use postgres_ffi::v14::xlog_utils::get_current_timestamp; +use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::min; use std::net::Shutdown; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index f482dbb3aa..8d101e6ff6 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -6,7 +6,7 @@ use anyhow::{bail, Context, Result}; use etcd_broker::subscription_value::SkTimelineInfo; use once_cell::sync::Lazy; -use postgres_ffi::v14::xlog_utils::XLogSegNo; +use postgres_ffi::XLogSegNo; use serde::Serialize; use tokio::sync::watch; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 5c6991c196..6acc70e85a 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -11,8 +11,8 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Duration; -use postgres_ffi::v14::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr}; -use postgres_ffi::PG_TLI; +use postgres_ffi::v14::xlog_utils::{XLogFileName, XLogSegNoOffsetToRecPtr}; +use postgres_ffi::{XLogSegNo, PG_TLI}; use remote_storage::GenericRemoteStorage; use tokio::fs::File; use tokio::runtime::Builder; diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 6a45ae1411..644237a00d 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -14,9 +14,9 @@ use tokio::io::AsyncRead; use once_cell::sync::Lazy; use postgres_ffi::v14::xlog_utils::{ - find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, XLogSegNo, + find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, }; -use postgres_ffi::PG_TLI; +use postgres_ffi::{XLogSegNo, PG_TLI}; use std::cmp::min; use std::fs::{self, remove_file, File, OpenOptions}; From dc2150a90eaeee5f4a297d896f4eeb9ded63a8e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Wed, 7 Sep 2022 15:11:03 +0300 Subject: [PATCH 0739/1022] Add built files to gitignore (#2404) --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 618ff2c5b9..f1afdee599 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,6 @@ test_output/ *.key *.crt +*.o +*.so +*.Po From 83dca73f85ad859d156b2550c5108faddb2cff0d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 7 Sep 2022 14:16:48 +0100 Subject: [PATCH 0740/1022] Store Allure tests statistics in database (#2367) --- .github/actions/allure-report/action.yml | 8 +- .github/workflows/build_and_test.yml | 18 
+++++ scripts/ingest_perf_test_result.py | 7 +- scripts/ingest_regress_test_result.py | 97 ++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 scripts/ingest_regress_test_result.py diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml index 34761f8df1..ec751f51fc 100644 --- a/.github/actions/allure-report/action.yml +++ b/.github/actions/allure-report/action.yml @@ -11,6 +11,10 @@ inputs: test_selection: description: '`test_selector` from run-python-test-set action' required: false +outputs: + report-url: + description: 'Allure report URL' + value: ${{ steps.generate-report.outputs.report-url }} runs: using: "composite" @@ -182,7 +186,7 @@ runs: aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY} - echo "::set-output name=REPORT_URL::${REPORT_URL}" + echo "::set-output name=report-url::${REPORT_URL}" - name: Release Allure lock if: ${{ inputs.action == 'generate' && always() }} @@ -200,7 +204,7 @@ runs: - uses: actions/github-script@v6 if: ${{ inputs.action == 'generate' && always() }} env: - REPORT_URL: ${{ steps.generate-report.outputs.REPORT_URL }} + REPORT_URL: ${{ steps.generate-report.outputs.report-url }} BUILD_TYPE: ${{ inputs.build_type }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6d966ce0a2..1387514cc2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -253,6 +253,7 @@ jobs: real_s3_region: us-west-2 real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}" real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}" + - name: Merge and upload coverage data if: matrix.build_type == 'debug' uses: ./.github/actions/save-coverage-data @@ -308,11 +309,28 @@ jobs: submodules: false - name: Create Allure report + id: create-allure-report uses: ./.github/actions/allure-report with: action: generate build_type: ${{ matrix.build_type }} + - name: Store Allure test stat in the DB + env: + BUILD_TYPE: ${{ matrix.build_type }} + SHA: ${{ github.event.pull_request.head.sha || github.sha }} + REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }} + TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} + shell: bash -euxo pipefail {0} + run: | + curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json + ./scripts/pysync + + # Workaround for https://github.com/neondatabase/cloud/issues/2188 + psql "$TEST_RESULT_CONNSTR" -c "SELECT 1;" || sleep 10 + + DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json + coverage-report: runs-on: dev container: diff --git a/scripts/ingest_perf_test_result.py b/scripts/ingest_perf_test_result.py index 71f7ad3262..7f2af290a2 100644 --- a/scripts/ingest_perf_test_result.py +++ b/scripts/ingest_perf_test_result.py @@ -2,6 +2,7 @@ import argparse import json import os +import sys from contextlib import contextmanager from datetime import datetime from pathlib import Path @@ -26,7 +27,7 @@ CREATE TABLE IF NOT EXISTS perf_test_results ( def err(msg): print(f"error: {msg}") - exit(1) + sys.exit(1) @contextmanager @@ -43,8 +44,8 @@ def create_table(cur): cur.execute(CREATE_TABLE) -def ingest_perf_test_result(cursor, 
data_dile: Path, recorded_at_timestamp: int) -> int: - run_data = json.loads(data_dile.read_text()) +def ingest_perf_test_result(cursor, data_file: Path, recorded_at_timestamp: int) -> int: + run_data = json.loads(data_file.read_text()) revision = run_data["revision"] platform = run_data["platform"] diff --git a/scripts/ingest_regress_test_result.py b/scripts/ingest_regress_test_result.py new file mode 100644 index 0000000000..e07a972c67 --- /dev/null +++ b/scripts/ingest_regress_test_result.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +import argparse +import os +import sys +from contextlib import contextmanager +from pathlib import Path + +import psycopg2 + +CREATE_TABLE = """ +CREATE TABLE IF NOT EXISTS regress_test_results ( + id SERIAL PRIMARY KEY, + reference CHAR(255), + revision CHAR(40), + build_type CHAR(16), + data JSONB +) +""" + + +def err(msg): + print(f"error: {msg}") + sys.exit(1) + + +@contextmanager +def get_connection_cursor(): + connstr = os.getenv("DATABASE_URL") + if not connstr: + err("DATABASE_URL environment variable is not set") + with psycopg2.connect(connstr, connect_timeout=30) as conn: + with conn.cursor() as cur: + yield cur + + +def create_table(cur): + cur.execute(CREATE_TABLE) + + +def ingest_regress_test_result( + cursor, reference: str, revision: str, build_type: str, data_file: Path +): + values = ( + reference, + revision, + build_type, + data_file.read_text(), + ) + cursor.execute( + """ + INSERT INTO regress_test_results ( + reference, + revision, + build_type, + data + ) VALUES (%s, %s, %s, %s) + """, + values, + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Regress test result uploader. \ + Database connection string should be provided via DATABASE_URL environment variable", + ) + parser.add_argument("--initdb", action="store_true", help="Initialuze database") + parser.add_argument( + "--reference", type=str, required=True, help="git reference, for example refs/heads/main" + ) + parser.add_argument("--revision", type=str, required=True, help="git revision") + parser.add_argument( + "--build-type", type=str, required=True, help="build type: release, debug or remote" + ) + parser.add_argument( + "--ingest", type=Path, required=True, help="Path to regress test result file" + ) + + args = parser.parse_args() + with get_connection_cursor() as cur: + if args.initdb: + create_table(cur) + + if not args.ingest.exists(): + err(f"ingest path {args.ingest} does not exist") + + ingest_regress_test_result( + cur, + reference=args.reference, + revision=args.revision, + build_type=args.build_type, + data_file=args.ingest, + ) + + +if __name__ == "__main__": + main() From 9e3136ea378547308abee959e8175224fee79572 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 7 Sep 2022 21:40:08 +0100 Subject: [PATCH 0741/1022] scripts/ingest_regress_test_result.py: fix json data insertion (#2408) --- scripts/ingest_regress_test_result.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scripts/ingest_regress_test_result.py b/scripts/ingest_regress_test_result.py index e07a972c67..974167483a 100644 --- a/scripts/ingest_regress_test_result.py +++ b/scripts/ingest_regress_test_result.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import argparse import os +import re import sys from contextlib import contextmanager from pathlib import Path @@ -40,11 +41,17 @@ def create_table(cur): def ingest_regress_test_result( cursor, reference: str, revision: str, build_type: str, data_file: Path ): + data = data_file.read_text() + # In the 
JSON report we can have lines related to LazyFixture with escaped double-quote + # It's hard to insert them into jsonb field as is, so replace \" with ' to make it easier for us + # + # "" -> "" + data = re.sub(r'("")', r"\g<1>'\g<2>'\g<3>", data) values = ( reference, revision, build_type, - data_file.read_text(), + data, ) cursor.execute( """ From 1351beae19be72b148a2ae6bebec29c5aafa38c0 Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 8 Sep 2022 12:57:30 +0200 Subject: [PATCH 0742/1022] Fix race condition in ginHeapTupleFastInsert (#2412) Because the metadata was not locked, it could be updated concurrently such that we wouldn't actually have the tail block. The current ordering works better, as we still only start XLogBeginInsert() once we have all potentially interesting buffers loaded in memory, but still have correct lock lifetimes. See also: access/transam/README section Write-Ahead Log Coding --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a4963aa6df..e8518d3fc8 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a4963aa6df6a44bdee17ef387c01bcf46f6017fd +Subproject commit e8518d3fc85e3da420d2f5a2742a21386e6585ec diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 26c6466873..313769bb62 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 26c64668736b729a3e4c02c6fc0a84544118df26 +Subproject commit 313769bb6229f46380e24d8f6ff535f9185458af From 171385ac14efa41b8e9cfe73851ff772c9722ce4 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 8 Sep 2022 16:02:11 +0100 Subject: [PATCH 0743/1022] Pass COPT and PG_CFLAGS to Extension's CFLAGS (#2405) * fix incompatible-function-pointer-types warning * Pass COPT and PG_CFLAGS to Extension's CFLAGS --- .github/workflows/codestyle.yml | 1 + Makefile | 9 ++++----- pgxn/neon/pagestore_client.h | 2 +- pgxn/neon/pagestore_smgr.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index a5e31d49ee..bc21054e18 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -17,6 +17,7 @@ concurrency: env: RUST_BACKTRACE: 1 + COPT: '-Werror' jobs: check-codestyle-rust: diff --git a/Makefile b/Makefile index 0e7ceec15b..4d7b1bee07 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,6 @@ ifeq ($(UNAME_S),Linux) PG_CONFIGURE_OPTS += --with-libseccomp endif - # macOS with brew-installed openssl requires explicit paths # It can be configured with OPENSSL_PREFIX variable UNAME_S := $(shell uname -s) @@ -144,24 +143,24 @@ neon-pg-ext-v14: postgres-v14 +@echo "Compiling neon v14" mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v14 (cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) +@echo "Compiling neon_test_utils" v14 mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) neon-pg-ext-v15: postgres-v15 +@echo "Compiling neon v15" mkdir -p 
$(POSTGRES_INSTALL_DIR)/build/neon-v15 (cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) +@echo "Compiling neon_test_utils" v15 mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) .PHONY: neon-pg-ext-clean diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index f79a3c9142..93ea6771eb 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -182,7 +182,7 @@ extern void zenith_write(SMgrRelation reln, ForkNumber forknum, extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); -extern const int64 zenith_dbsize(Oid dbNode); +extern int64 zenith_dbsize(Oid dbNode); extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index e3f083fd43..d49df7af58 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1347,7 +1347,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) /* * zenith_db_size() -- Get the size of the database in bytes. */ -const int64 +int64 zenith_dbsize(Oid dbNode) { ZenithResponse *resp; From 35b4816f09b0697fe2c7e1c7b15d87cdb85cf1b7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 31 Aug 2022 12:17:41 +0300 Subject: [PATCH 0744/1022] Turn GenericRemoteStorage into just a newtype around 'Arc' We had a pattern like this: match remote_storage { GenericRemoteStorage::Local(storage) => { let source = storage.remote_object_id(&file_path)?; ... storage .function(&source, ...) .await }, GenericRemoteStorage::S3(storage) => { ... exact same code as for the Local case ... }, This removes the code duplication, by allowing you to call the functions directly on GenericRemoteStorage. Also change RemoveObjectId to be just a type alias for String. Now that the callers of GenericRemoteStorage functions don't know whether they're dealing with the LocalFs or S3 implementation, RemoveObjectId must be the same type for both. 
--- libs/remote_storage/src/lib.rs | 235 +++++++++++++----------- libs/remote_storage/src/local_fs.rs | 131 ++++++++----- libs/remote_storage/src/s3_bucket.rs | 164 +++++++---------- pageserver/src/bin/pageserver.rs | 12 +- pageserver/src/http/routes.rs | 6 +- pageserver/src/storage_sync.rs | 26 +-- pageserver/src/storage_sync/delete.rs | 31 ++-- pageserver/src/storage_sync/download.rs | 110 ++++------- pageserver/src/storage_sync/upload.rs | 24 ++- pageserver/src/tenant_mgr.rs | 2 +- safekeeper/src/wal_backup.rs | 13 +- 11 files changed, 374 insertions(+), 380 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 8a10e098a1..55db91dc31 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -14,8 +14,10 @@ use std::{ ffi::OsStr, fmt::Debug, num::{NonZeroU32, NonZeroUsize}, + ops::Deref, path::{Path, PathBuf}, pin::Pin, + sync::Arc, }; use anyhow::{bail, Context}; @@ -24,10 +26,7 @@ use tokio::io; use toml_edit::Item; use tracing::info; -pub use self::{ - local_fs::LocalFs, - s3_bucket::{S3Bucket, S3ObjectKey}, -}; +pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket}; /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage. /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency @@ -42,22 +41,62 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; +const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; + +#[derive(Clone, PartialEq, Eq)] +pub struct RemoteObjectId(String); + +impl From for String { + fn from(id: RemoteObjectId) -> Self { + id.0 + } +} + +/// +/// A key that refers to an object in remote storage. It works much like a Path, +/// but it's a separate datatype so that you don't accidentally mix local paths +/// and remote keys. +/// +impl RemoteObjectId { + // Needed to retrieve last component for RemoteObjectId. + // In other words a file name + /// Turn a/b/c or a/b/c/ into c + pub fn object_name(&self) -> Option<&str> { + // corner case, char::to_string is not const, thats why this is more verbose than it needs to be + // see https://github.com/rust-lang/rust/issues/88674 + if self.0.len() == 1 && self.0.chars().next().unwrap() == REMOTE_STORAGE_PREFIX_SEPARATOR { + return None; + } + + if self.0.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + self.0.rsplit(REMOTE_STORAGE_PREFIX_SEPARATOR).nth(1) + } else { + self.0 + .rsplit_once(REMOTE_STORAGE_PREFIX_SEPARATOR) + .map(|(_, last)| last) + } + } +} + +impl Debug for RemoteObjectId { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + self.0.fmt(fmt) + } +} + /// Storage (potentially remote) API to manage its state. /// This storage tries to be unaware of any layered repository context, /// providing basic CRUD operations for storage files. #[async_trait::async_trait] -pub trait RemoteStorage: Send + Sync { - /// A way to uniquely reference a file in the remote storage. - type RemoteObjectId; - +pub trait RemoteStorage: Send + Sync + 'static { /// Attempts to derive the storage path out of the local path, if the latter is correct. - fn remote_object_id(&self, local_path: &Path) -> anyhow::Result; + fn remote_object_id(&self, local_path: &Path) -> anyhow::Result; /// Gets the download path of the given storage file. 
- fn local_path(&self, remote_object_id: &Self::RemoteObjectId) -> anyhow::Result; + fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result; /// Lists all items the storage has right now. - async fn list(&self) -> anyhow::Result>; + async fn list(&self) -> anyhow::Result>; /// Lists all top level subdirectories for a given prefix /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id @@ -65,34 +104,39 @@ pub trait RemoteStorage: Send + Sync { /// so this method doesnt need to. async fn list_prefixes( &self, - prefix: Option<&Self::RemoteObjectId>, - ) -> anyhow::Result>; + prefix: Option<&RemoteObjectId>, + ) -> anyhow::Result>; /// Streams the local file contents into remote into the remote storage entry. async fn upload( &self, - from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, // S3 PUT request requires the content length to be specified, // otherwise it starts to fail with the concurrent connection count increasing. from_size_bytes: usize, - to: &Self::RemoteObjectId, + to: &RemoteObjectId, metadata: Option, ) -> anyhow::Result<()>; /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. /// Returns the metadata, if any was stored with the file previously. - async fn download(&self, from: &Self::RemoteObjectId) -> Result; + async fn download(&self, from: &RemoteObjectId) -> Result; /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. /// Returns the metadata, if any was stored with the file previously. async fn download_byte_range( &self, - from: &Self::RemoteObjectId, + from: &RemoteObjectId, start_inclusive: u64, end_exclusive: Option, ) -> Result; - async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>; + async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()>; + + /// Downcast to LocalFs implementation. For tests. + fn as_local(&self) -> Option<&LocalFs> { + None + } } pub struct Download { @@ -135,34 +179,37 @@ impl std::error::Error for DownloadError {} /// Every storage, currently supported. /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. -pub enum GenericRemoteStorage { - Local(LocalFs), - S3(S3Bucket), +#[derive(Clone)] +pub struct GenericRemoteStorage(Arc); + +impl Deref for GenericRemoteStorage { + type Target = dyn RemoteStorage; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } } impl GenericRemoteStorage { - pub fn new( + pub fn new(storage: impl RemoteStorage) -> Self { + Self(Arc::new(storage)) + } + + pub fn from_config( working_directory: PathBuf, storage_config: &RemoteStorageConfig, - ) -> anyhow::Result { - match &storage_config.storage { + ) -> anyhow::Result { + Ok(match &storage_config.storage { RemoteStorageKind::LocalFs(root) => { info!("Using fs root '{}' as a remote storage", root.display()); - LocalFs::new(root.clone(), working_directory).map(GenericRemoteStorage::Local) + GenericRemoteStorage::new(LocalFs::new(root.clone(), working_directory)?) 
} RemoteStorageKind::AwsS3(s3_config) => { info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", - s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - S3Bucket::new(s3_config, working_directory).map(GenericRemoteStorage::S3) + s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); + GenericRemoteStorage::new(S3Bucket::new(s3_config, working_directory)?) } - } - } - - pub fn as_local(&self) -> Option<&LocalFs> { - match self { - Self::Local(local_fs) => Some(local_fs), - _ => None, - } + }) } /// Takes storage object contents and its size and uploads to remote storage, @@ -172,47 +219,26 @@ impl GenericRemoteStorage { /// this path is used for the remote object id conversion only. pub async fn upload_storage_object( &self, - from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, + from: Box, from_size_bytes: usize, from_path: &Path, ) -> anyhow::Result<()> { - async fn do_upload_storage_object( - storage: &S, - from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, - from_size_bytes: usize, - from_path: &Path, - ) -> anyhow::Result<()> - where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, - { - let target_storage_path = storage.remote_object_id(from_path).with_context(|| { + let target_storage_path = self.remote_object_id(from_path).with_context(|| { + format!( + "Failed to get the storage path for source local path '{}'", + from_path.display() + ) + })?; + + self.upload(from, from_size_bytes, &target_storage_path, None) + .await + .with_context(|| { format!( - "Failed to get the storage path for source local path '{}'", - from_path.display() + "Failed to upload from '{}' to storage path '{:?}'", + from_path.display(), + target_storage_path ) - })?; - - storage - .upload(from, from_size_bytes, &target_storage_path, None) - .await - .with_context(|| { - format!( - "Failed to upload from '{}' to storage path '{:?}'", - from_path.display(), - target_storage_path - ) - }) - } - - match self { - GenericRemoteStorage::Local(storage) => { - do_upload_storage_object(storage, from, from_size_bytes, from_path).await - } - GenericRemoteStorage::S3(storage) => { - do_upload_storage_object(storage, from, from_size_bytes, from_path).await - } - } + }) } /// Downloads the storage object into the `to_path` provided. 
@@ -222,42 +248,22 @@ impl GenericRemoteStorage { byte_range: Option<(u64, Option)>, to_path: &Path, ) -> Result { - async fn do_download_storage_object( - storage: &S, - byte_range: Option<(u64, Option)>, - to_path: &Path, - ) -> Result - where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, - { - let remote_object_path = storage - .remote_object_id(to_path) - .with_context(|| { - format!( - "Failed to get the storage path for target local path '{}'", - to_path.display() - ) - }) - .map_err(DownloadError::BadInput)?; + let remote_object_path = self + .remote_object_id(to_path) + .with_context(|| { + format!( + "Failed to get the storage path for target local path '{}'", + to_path.display() + ) + }) + .map_err(DownloadError::BadInput)?; - match byte_range { - Some((start, end)) => { - storage - .download_byte_range(&remote_object_path, start, end) - .await - } - None => storage.download(&remote_object_path).await, - } - } - - match self { - GenericRemoteStorage::Local(storage) => { - do_download_storage_object(storage, byte_range, to_path).await - } - GenericRemoteStorage::S3(storage) => { - do_download_storage_object(storage, byte_range, to_path).await + match byte_range { + Some((start, end)) => { + self.download_byte_range(&remote_object_path, start, end) + .await } + None => self.download(&remote_object_path).await, } } } @@ -463,4 +469,23 @@ mod tests { "/foo/bar.baz..temp" ); } + + #[test] + fn object_name() { + let k = RemoteObjectId("a/b/c".to_owned()); + assert_eq!(k.object_name(), Some("c")); + + let k = RemoteObjectId("a/b/c/".to_owned()); + assert_eq!(k.object_name(), Some("c")); + + let k = RemoteObjectId("a/".to_owned()); + assert_eq!(k.object_name(), Some("a")); + + // XXX is it impossible to have an empty key? + let k = RemoteObjectId("".to_owned()); + assert_eq!(k.object_name(), None); + + let k = RemoteObjectId("/".to_owned()); + assert_eq!(k.object_name(), None); + } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index ddf6c01759..2561c0ca24 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -17,10 +17,19 @@ use tokio::{ }; use tracing::*; -use crate::{path_with_suffix_extension, Download, DownloadError}; +use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectId}; use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; +/// Convert a Path in the remote storage into a RemoteObjectId +fn remote_object_id_from_path(path: &Path) -> anyhow::Result { + Ok(RemoteObjectId( + path.to_str() + .ok_or_else(|| anyhow::anyhow!("unexpected characters found in path"))? + .to_string(), + )) +} + pub struct LocalFs { working_directory: PathBuf, storage_root: PathBuf, @@ -43,11 +52,17 @@ impl LocalFs { }) } - fn resolve_in_storage(&self, path: &Path) -> anyhow::Result { + /// + /// Get the absolute path in the local filesystem to given remote object. + /// + /// This is public so that it can be used in tests. Should not be used elsewhere. 
+ /// + pub fn resolve_in_storage(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result { + let path = PathBuf::from(&remote_object_id.0); if path.is_relative() { Ok(self.storage_root.join(path)) } else if path.starts_with(&self.storage_root) { - Ok(path.to_path_buf()) + Ok(path) } else { bail!( "Path '{}' does not belong to the current storage", @@ -85,38 +100,42 @@ impl LocalFs { #[async_trait::async_trait] impl RemoteStorage for LocalFs { - type RemoteObjectId = PathBuf; - - fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { - Ok(self.storage_root.join( + /// Convert a "local" path into a "remote path" + fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { + let path = self.storage_root.join( strip_path_prefix(&self.working_directory, local_path) .context("local path does not belong to this storage")?, - )) + ); + remote_object_id_from_path(&path) } - fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result { - let relative_path = strip_path_prefix(&self.storage_root, storage_path) + fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result { + let storage_path = PathBuf::from(&remote_object_id.0); + let relative_path = strip_path_prefix(&self.storage_root, &storage_path) .context("local path does not belong to this storage")?; Ok(self.working_directory.join(relative_path)) } - async fn list(&self) -> anyhow::Result> { + async fn list(&self) -> anyhow::Result> { get_all_files(&self.storage_root, true).await } async fn list_prefixes( &self, - prefix: Option<&Self::RemoteObjectId>, - ) -> anyhow::Result> { - let path = prefix.unwrap_or(&self.storage_root); + prefix: Option<&RemoteObjectId>, + ) -> anyhow::Result> { + let path = match prefix { + Some(prefix) => Path::new(&prefix.0), + None => &self.storage_root, + }; get_all_files(path, false).await } async fn upload( &self, - from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, from_size_bytes: usize, - to: &Self::RemoteObjectId, + to: &RemoteObjectId, metadata: Option, ) -> anyhow::Result<()> { let target_file_path = self.resolve_in_storage(to)?; @@ -197,7 +216,7 @@ impl RemoteStorage for LocalFs { Ok(()) } - async fn download(&self, from: &Self::RemoteObjectId) -> Result { + async fn download(&self, from: &RemoteObjectId) -> Result { let file_path = self .resolve_in_storage(from) .map_err(DownloadError::BadInput)?; @@ -231,7 +250,7 @@ impl RemoteStorage for LocalFs { async fn download_byte_range( &self, - from: &Self::RemoteObjectId, + from: &RemoteObjectId, start_inclusive: u64, end_exclusive: Option, ) -> Result { @@ -285,7 +304,7 @@ impl RemoteStorage for LocalFs { } } - async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> { + async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()> { let file_path = self.resolve_in_storage(path)?; if file_path.exists() && file_path.is_file() { Ok(fs::remove_file(file_path).await?) 
@@ -296,6 +315,10 @@ impl RemoteStorage for LocalFs { ) } } + + fn as_local(&self) -> Option<&LocalFs> { + Some(self) + } } fn storage_metadata_path(original_path: &Path) -> PathBuf { @@ -305,7 +328,7 @@ fn storage_metadata_path(original_path: &Path) -> PathBuf { fn get_all_files<'a, P>( directory_path: P, recursive: bool, -) -> Pin>> + Send + Sync + 'a>> +) -> Pin>> + Send + Sync + 'a>> where P: AsRef + Send + Sync + 'a, { @@ -322,12 +345,12 @@ where debug!("{:?} us a symlink, skipping", entry_path) } else if file_type.is_dir() { if recursive { - paths.extend(get_all_files(entry_path, true).await?.into_iter()) + paths.extend(get_all_files(&entry_path, true).await?.into_iter()) } else { - paths.push(dir_entry.path()) + paths.push(remote_object_id_from_path(&dir_entry.path())?) } } else { - paths.push(dir_entry.path()); + paths.push(remote_object_id_from_path(&dir_entry.path())?); } } Ok(paths) @@ -389,9 +412,15 @@ mod pure_tests { .join("file_name"); let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?); + let actual_path = PathBuf::from( + storage + .remote_object_id(&local_path) + .expect("Matching path should map to storage path normally") + .0, + ); assert_eq!( expected_path, - storage.remote_object_id(&local_path).expect("Matching path should map to storage path normally"), + actual_path, "File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir" ); @@ -452,7 +481,9 @@ mod pure_tests { assert_eq!( local_path, storage - .local_path(&storage_root.join(local_path.strip_prefix(&workdir)?)) + .local_path(&remote_object_id_from_path( + &storage_root.join(local_path.strip_prefix(&workdir)?) + )?) .expect("For a valid input, valid local path should be parsed"), "Should be able to parse metadata out of the correctly named remote delta file" ); @@ -476,8 +507,7 @@ mod pure_tests { #[test] fn local_path_negatives() -> anyhow::Result<()> { #[track_caller] - #[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.local_path` parameter requirements - fn local_path_error(storage: &LocalFs, storage_path: &PathBuf) -> String { + fn local_path_error(storage: &LocalFs, storage_path: &RemoteObjectId) -> String { match storage.local_path(storage_path) { Ok(wrong_path) => panic!( "Expected local path input {:?} to cause an error, but got file path: {:?}", @@ -494,7 +524,8 @@ mod pure_tests { }; let totally_wrong_path = "wrong_wrong_wrong"; - let error_message = local_path_error(&storage, &PathBuf::from(totally_wrong_path)); + let error_message = + local_path_error(&storage, &RemoteObjectId(totally_wrong_path.to_string())); assert!(error_message.contains(totally_wrong_path)); Ok(()) @@ -537,7 +568,7 @@ mod fs_tests { storage: &LocalFs, #[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.local_path` parameter requirements - remote_storage_path: &PathBuf, + remote_storage_path: &RemoteObjectId, expected_metadata: Option<&StorageMetadata>, ) -> anyhow::Result { let mut download = storage @@ -568,12 +599,20 @@ mod fs_tests { "whatever_contents", ) .await?; - let target_path = PathBuf::from("/").join("somewhere").join("else"); - match storage.upload(file, size, &target_path, None).await { + let target_path = "/somewhere/else"; + match storage + .upload( + Box::new(file), + size, + &RemoteObjectId(target_path.to_string()), + None, + ) + .await + { Ok(()) => panic!("Should not allow storing files with wrong target path"), Err(e) => { let message = format!("{:?}", e); - 
assert!(message.contains(&target_path.display().to_string())); + assert!(message.contains(target_path)); assert!(message.contains("does not belong to the current storage")); } } @@ -606,20 +645,20 @@ mod fs_tests { // Check that you get an error if the size parameter doesn't match the actual // size of the stream. storage - .upload(content.clone(), 0, &id, None) + .upload(Box::new(content.clone()), 0, &id, None) .await .expect_err("upload with zero size succeeded"); storage - .upload(content.clone(), 4, &id, None) + .upload(Box::new(content.clone()), 4, &id, None) .await .expect_err("upload with too short size succeeded"); storage - .upload(content.clone(), 6, &id, None) + .upload(Box::new(content.clone()), 6, &id, None) .await .expect_err("upload with too large size succeeded"); // Correct size is 5, this should succeed. - storage.upload(content, 5, &id, None).await?; + storage.upload(Box::new(content), 5, &id, None).await?; Ok(()) } @@ -643,8 +682,8 @@ mod fs_tests { "We should upload and download the same contents" ); - let non_existing_path = PathBuf::from("somewhere").join("else"); - match storage.download(&non_existing_path).await { + let non_existing_path = "somewhere/else"; + match storage.download(&RemoteObjectId(non_existing_path.to_string())).await { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), } @@ -783,7 +822,7 @@ mod fs_tests { Err(e) => { let error_string = e.to_string(); assert!(error_string.contains("does not exist")); - assert!(error_string.contains(&upload_target.display().to_string())); + assert!(error_string.contains(&upload_target.0)); } } Ok(()) @@ -844,15 +883,19 @@ mod fs_tests { storage: &LocalFs, name: &str, metadata: Option, - ) -> anyhow::Result { + ) -> anyhow::Result { let timeline_path = workdir.join("timelines").join("some_timeline"); let relative_timeline_path = timeline_path.strip_prefix(&workdir)?; let storage_path = storage.storage_root.join(relative_timeline_path).join(name); + let remote_object_id = RemoteObjectId(storage_path.to_str().unwrap().to_string()); let from_path = storage.working_directory.join(name); let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?; - storage.upload(file, size, &storage_path, metadata).await?; - Ok(storage_path) + + storage + .upload(Box::new(file), size, &remote_object_id, metadata) + .await?; + remote_object_id_from_path(&storage_path) } async fn create_file_for_upload( @@ -877,9 +920,9 @@ mod fs_tests { format!("contents for {name}") } - async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result> { + async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result> { let mut files = storage.list().await?; - files.sort(); + files.sort_by(|a, b| a.0.cmp(&b.0)); Ok(files) } } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index db31200c36..74632430cd 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -19,7 +19,10 @@ use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; use tracing::debug; -use crate::{strip_path_prefix, Download, DownloadError, RemoteStorage, S3Config}; +use crate::{ + strip_path_prefix, Download, DownloadError, RemoteObjectId, RemoteStorage, S3Config, + REMOTE_STORAGE_PREFIX_SEPARATOR, +}; use super::StorageMetadata; @@ -88,50 +91,26 @@ pub(super) mod metrics { } } -const S3_PREFIX_SEPARATOR: char = '/'; +fn 
download_destination( + id: &RemoteObjectId, + workdir: &Path, + prefix_to_strip: Option<&str>, +) -> PathBuf { + let path_without_prefix = match prefix_to_strip { + Some(prefix) => id.0.strip_prefix(prefix).unwrap_or_else(|| { + panic!( + "Could not strip prefix '{}' from S3 object key '{}'", + prefix, id.0 + ) + }), + None => &id.0, + }; -#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)] -pub struct S3ObjectKey(String); - -impl S3ObjectKey { - /// Turn a/b/c or a/b/c/ into c - pub fn object_name(&self) -> Option<&str> { - // corner case, char::to_string is not const, thats why this is more verbose than it needs to be - // see https://github.com/rust-lang/rust/issues/88674 - if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR { - return None; - } - - if self.0.ends_with(S3_PREFIX_SEPARATOR) { - self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1) - } else { - self.0 - .rsplit_once(S3_PREFIX_SEPARATOR) - .map(|(_, last)| last) - } - } - - fn key(&self) -> &str { - &self.0 - } - - fn download_destination(&self, workdir: &Path, prefix_to_strip: Option<&str>) -> PathBuf { - let path_without_prefix = match prefix_to_strip { - Some(prefix) => self.0.strip_prefix(prefix).unwrap_or_else(|| { - panic!( - "Could not strip prefix '{}' from S3 object key '{}'", - prefix, self.0 - ) - }), - None => &self.0, - }; - - workdir.join( - path_without_prefix - .split(S3_PREFIX_SEPARATOR) - .collect::(), - ) - } + workdir.join( + path_without_prefix + .split(REMOTE_STORAGE_PREFIX_SEPARATOR) + .collect::(), + ) } /// AWS S3 storage. @@ -193,12 +172,12 @@ impl S3Bucket { let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { let mut prefix = prefix; - while prefix.starts_with(S3_PREFIX_SEPARATOR) { + while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { prefix = &prefix[1..] 
} let mut prefix = prefix.to_string(); - while prefix.ends_with(S3_PREFIX_SEPARATOR) { + while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { prefix.pop(); } prefix @@ -249,23 +228,25 @@ impl S3Bucket { #[async_trait::async_trait] impl RemoteStorage for S3Bucket { - type RemoteObjectId = S3ObjectKey; - - fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { + fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { let relative_path = strip_path_prefix(&self.workdir, local_path)?; let mut key = self.prefix_in_bucket.clone().unwrap_or_default(); for segment in relative_path { - key.push(S3_PREFIX_SEPARATOR); + key.push(REMOTE_STORAGE_PREFIX_SEPARATOR); key.push_str(&segment.to_string_lossy()); } - Ok(S3ObjectKey(key)) + Ok(RemoteObjectId(key)) } - fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result { - Ok(storage_path.download_destination(&self.workdir, self.prefix_in_bucket.as_deref())) + fn local_path(&self, storage_path: &RemoteObjectId) -> anyhow::Result { + Ok(download_destination( + storage_path, + &self.workdir, + self.prefix_in_bucket.as_deref(), + )) } - async fn list(&self) -> anyhow::Result> { + async fn list(&self) -> anyhow::Result> { let mut document_keys = Vec::new(); let mut continuation_token = None; @@ -296,7 +277,7 @@ impl RemoteStorage for S3Bucket { .contents .unwrap_or_default() .into_iter() - .filter_map(|o| Some(S3ObjectKey(o.key?))), + .filter_map(|o| Some(RemoteObjectId(o.key?))), ); match fetch_response.continuation_token { @@ -312,8 +293,8 @@ impl RemoteStorage for S3Bucket { /// Note: it wont include empty "directories" async fn list_prefixes( &self, - prefix: Option<&Self::RemoteObjectId>, - ) -> anyhow::Result> { + prefix: Option<&RemoteObjectId>, + ) -> anyhow::Result> { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| p.0.clone()) @@ -321,8 +302,8 @@ impl RemoteStorage for S3Bucket { .map(|mut p| { // required to end with a separator // otherwise request will return only the entry of a prefix - if !p.ends_with(S3_PREFIX_SEPARATOR) { - p.push(S3_PREFIX_SEPARATOR); + if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); } p }); @@ -345,7 +326,7 @@ impl RemoteStorage for S3Bucket { bucket: self.bucket_name.clone(), prefix: list_prefix.clone(), continuation_token, - delimiter: Some(S3_PREFIX_SEPARATOR.to_string()), + delimiter: Some(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()), ..ListObjectsV2Request::default() }) .await @@ -359,7 +340,7 @@ impl RemoteStorage for S3Bucket { .common_prefixes .unwrap_or_default() .into_iter() - .filter_map(|o| Some(S3ObjectKey(o.prefix?))), + .filter_map(|o| Some(RemoteObjectId(o.prefix?))), ); match fetch_response.continuation_token { @@ -373,9 +354,9 @@ impl RemoteStorage for S3Bucket { async fn upload( &self, - from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, from_size_bytes: usize, - to: &Self::RemoteObjectId, + to: &RemoteObjectId, metadata: Option, ) -> anyhow::Result<()> { let _guard = self @@ -392,7 +373,7 @@ impl RemoteStorage for S3Bucket { from_size_bytes, )), bucket: self.bucket_name.clone(), - key: to.key().to_owned(), + key: to.0.to_owned(), metadata: metadata.map(|m| m.0), ..PutObjectRequest::default() }) @@ -404,10 +385,10 @@ impl RemoteStorage for S3Bucket { Ok(()) } - async fn download(&self, from: &Self::RemoteObjectId) -> Result { + async fn download(&self, from: &RemoteObjectId) -> Result { 
self.download_object(GetObjectRequest { bucket: self.bucket_name.clone(), - key: from.key().to_owned(), + key: from.0.to_owned(), ..GetObjectRequest::default() }) .await @@ -415,7 +396,7 @@ impl RemoteStorage for S3Bucket { async fn download_byte_range( &self, - from: &Self::RemoteObjectId, + from: &RemoteObjectId, start_inclusive: u64, end_exclusive: Option, ) -> Result { @@ -429,14 +410,14 @@ impl RemoteStorage for S3Bucket { self.download_object(GetObjectRequest { bucket: self.bucket_name.clone(), - key: from.key().to_owned(), + key: from.0.to_owned(), range, ..GetObjectRequest::default() }) .await } - async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> { + async fn delete(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<()> { let _guard = self .concurrency_limiter .acquire() @@ -448,7 +429,7 @@ impl RemoteStorage for S3Bucket { self.client .delete_object(DeleteObjectRequest { bucket: self.bucket_name.clone(), - key: path.key().to_owned(), + key: remote_object_id.0.to_owned(), ..DeleteObjectRequest::default() }) .await @@ -467,43 +448,24 @@ mod tests { use super::*; #[test] - fn object_name() { - let k = S3ObjectKey("a/b/c".to_owned()); - assert_eq!(k.object_name(), Some("c")); - - let k = S3ObjectKey("a/b/c/".to_owned()); - assert_eq!(k.object_name(), Some("c")); - - let k = S3ObjectKey("a/".to_owned()); - assert_eq!(k.object_name(), Some("a")); - - // XXX is it impossible to have an empty key? - let k = S3ObjectKey("".to_owned()); - assert_eq!(k.object_name(), None); - - let k = S3ObjectKey("/".to_owned()); - assert_eq!(k.object_name(), None); - } - - #[test] - fn download_destination() -> anyhow::Result<()> { + fn test_download_destination() -> anyhow::Result<()> { let workdir = tempdir()?.path().to_owned(); let local_path = workdir.join("one").join("two").join("test_name"); let relative_path = local_path.strip_prefix(&workdir)?; - let key = S3ObjectKey(format!( + let key = RemoteObjectId(format!( "{}{}", - S3_PREFIX_SEPARATOR, + REMOTE_STORAGE_PREFIX_SEPARATOR, relative_path .iter() .map(|segment| segment.to_str().unwrap()) .collect::>() - .join(&S3_PREFIX_SEPARATOR.to_string()), + .join(&REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()), )); assert_eq!( local_path, - key.download_destination(&workdir, None), + download_destination(&key, &workdir, None), "Download destination should consist of s3 path joined with the workdir prefix" ); @@ -520,8 +482,8 @@ mod tests { let storage = dummy_storage(workdir); - let expected_key = S3ObjectKey(format!( - "{}{S3_PREFIX_SEPARATOR}{segment_1}{S3_PREFIX_SEPARATOR}{segment_2}", + let expected_key = RemoteObjectId(format!( + "{}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_1}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_2}", storage.prefix_in_bucket.as_deref().unwrap_or_default(), )); @@ -592,7 +554,7 @@ mod tests { storage.prefix_in_bucket.as_deref(), ); assert_eq!( - s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()), + download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()), storage .local_path(&s3_key) .expect("For a valid input, valid S3 info should be parsed"), @@ -604,7 +566,7 @@ mod tests { storage.prefix_in_bucket.as_deref(), ); assert_eq!( - s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()), + download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()), storage .local_path(&s3_key) .expect("For a valid input, valid S3 info should be parsed"), @@ -645,11 +607,11 @@ mod tests { } } - fn create_s3_key(relative_file_path: &Path, 
prefix: Option<&str>) -> S3ObjectKey { - S3ObjectKey(relative_file_path.iter().fold( + fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> RemoteObjectId { + RemoteObjectId(relative_file_path.iter().fold( prefix.unwrap_or_default().to_string(), |mut path_string, segment| { - path_string.push(S3_PREFIX_SEPARATOR); + path_string.push(REMOTE_STORAGE_PREFIX_SEPARATOR); path_string.push_str(segment.to_str().unwrap()); path_string }, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 7a33a548e7..5a43516728 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,7 +1,7 @@ //! Main entry point for the Page Server executable. use remote_storage::GenericRemoteStorage; -use std::{env, ops::ControlFlow, path::Path, str::FromStr, sync::Arc}; +use std::{env, ops::ControlFlow, path::Path, str::FromStr}; use tracing::*; use anyhow::{bail, Context, Result}; @@ -302,11 +302,13 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() let remote_storage = conf .remote_storage_config .as_ref() - .map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config)) + .map(|storage_config| { + GenericRemoteStorage::from_config(conf.workdir.clone(), storage_config) + }) .transpose() - .context("Failed to init generic remote storage")? - .map(Arc::new); - let remote_index = tenant_mgr::init_tenant_mgr(conf, remote_storage.as_ref().map(Arc::clone))?; + .context("Failed to init generic remote storage")?; + + let remote_index = tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?; // Spawn a new thread for the http endpoint // bind before launching separate thread so the error reported before startup exits diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 09c4812067..a31c2fd2a5 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -34,7 +34,7 @@ struct State { auth: Option>, remote_index: RemoteIndex, allowlist_routes: Vec, - remote_storage: Option>, + remote_storage: Option, } impl State { @@ -42,7 +42,7 @@ impl State { conf: &'static PageServerConf, auth: Option>, remote_index: RemoteIndex, - remote_storage: Option>, + remote_storage: Option, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() @@ -659,7 +659,7 @@ pub fn make_router( conf: &'static PageServerConf, auth: Option>, remote_index: RemoteIndex, - remote_storage: Option>, + remote_storage: Option, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 491f882e0b..42fd6b8ea8 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -150,7 +150,7 @@ use std::{ num::{NonZeroU32, NonZeroUsize}, ops::ControlFlow, path::{Path, PathBuf}, - sync::{Arc, Condvar, Mutex}, + sync::{Condvar, Mutex}, }; use anyhow::{anyhow, bail, Context}; @@ -222,7 +222,7 @@ pub struct SyncStartupData { /// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states. 
pub fn start_local_timeline_sync( config: &'static PageServerConf, - storage: Option>, + storage: Option, ) -> anyhow::Result { let local_timeline_files = local_tenant_timeline_files(config) .context("Failed to collect local tenant timeline files")?; @@ -766,7 +766,7 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { pub(super) fn spawn_storage_sync_thread( conf: &'static PageServerConf, local_timeline_files: HashMap)>, - storage: Arc, + storage: GenericRemoteStorage, max_concurrent_timelines_sync: NonZeroUsize, max_sync_errors: NonZeroU32, ) -> anyhow::Result { @@ -825,12 +825,12 @@ pub(super) fn spawn_storage_sync_thread( fn storage_sync_loop( runtime: Runtime, conf: &'static PageServerConf, - (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), + (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, ) { info!("Starting remote storage sync loop"); loop { - let loop_storage = Arc::clone(&storage); + let loop_storage = storage.clone(); let (batched_tasks, remaining_queue_length) = sync_queue.next_task_batch(); @@ -939,7 +939,7 @@ enum UploadStatus { async fn process_batches( conf: &'static PageServerConf, max_sync_errors: NonZeroU32, - storage: Arc, + storage: GenericRemoteStorage, index: &RemoteIndex, batched_tasks: HashMap, sync_queue: &SyncQueue, @@ -947,7 +947,7 @@ async fn process_batches( let mut sync_results = batched_tasks .into_iter() .map(|(sync_id, batch)| { - let storage = Arc::clone(&storage); + let storage = storage.clone(); let index = index.clone(); async move { let state_update = process_sync_task_batch( @@ -981,7 +981,7 @@ async fn process_batches( async fn process_sync_task_batch( conf: &'static PageServerConf, - (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), + (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, sync_id: ZTenantTimelineId, batch: SyncTaskBatch, @@ -1009,7 +1009,7 @@ async fn process_sync_task_batch( ControlFlow::Continue(()) => { upload_timeline_data( conf, - (storage.as_ref(), &index, sync_queue), + (&storage, &index, sync_queue), current_remote_timeline.as_ref(), sync_id, upload_data, @@ -1020,7 +1020,7 @@ async fn process_sync_task_batch( } ControlFlow::Break(()) => match update_remote_data( conf, - storage.as_ref(), + &storage, &index, sync_id, RemoteDataUpdate::Upload { @@ -1053,7 +1053,7 @@ async fn process_sync_task_batch( ControlFlow::Continue(()) => { return download_timeline_data( conf, - (storage.as_ref(), &index, sync_queue), + (&storage, &index, sync_queue), current_remote_timeline.as_ref(), sync_id, download_data, @@ -1086,7 +1086,7 @@ async fn process_sync_task_batch( ControlFlow::Continue(()) => { delete_timeline_data( conf, - (storage.as_ref(), &index, sync_queue), + (&storage, &index, sync_queue), sync_id, delete_data, sync_start, @@ -1098,7 +1098,7 @@ async fn process_sync_task_batch( ControlFlow::Break(()) => { if let Err(e) = update_remote_data( conf, - storage.as_ref(), + &storage, &index, sync_id, RemoteDataUpdate::Delete(&delete_data.data.deleted_layers), diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index d80a082d0c..794ecbaeb3 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -7,15 +7,15 @@ use futures::stream::{FuturesUnordered, StreamExt}; use tracing::{debug, error, info}; use crate::storage_sync::{SyncQueue, SyncTask}; -use remote_storage::{GenericRemoteStorage, 
RemoteStorage}; +use remote_storage::GenericRemoteStorage; use utils::zid::ZTenantTimelineId; use super::{LayersDeletion, SyncData}; /// Attempts to remove the timleline layers from the remote storage. /// If the task had not adjusted the metadata before, the deletion will fail. -pub(super) async fn delete_timeline_layers<'a>( - storage: &'a GenericRemoteStorage, +pub(super) async fn delete_timeline_layers( + storage: &GenericRemoteStorage, sync_queue: &SyncQueue, sync_id: ZTenantTimelineId, mut delete_data: SyncData, @@ -43,14 +43,7 @@ pub(super) async fn delete_timeline_layers<'a>( let mut delete_tasks = layers_to_delete .into_iter() .map(|local_layer_path| async { - match match storage { - GenericRemoteStorage::Local(storage) => { - remove_storage_object(storage, &local_layer_path).await - } - GenericRemoteStorage::S3(storage) => { - remove_storage_object(storage, &local_layer_path).await - } - } { + match remove_storage_object(storage, &local_layer_path).await { Ok(()) => Ok(local_layer_path), Err(e) => Err((e, local_layer_path)), } @@ -88,11 +81,10 @@ pub(super) async fn delete_timeline_layers<'a>( errored } -async fn remove_storage_object(storage: &S, local_layer_path: &Path) -> anyhow::Result<()> -where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +async fn remove_storage_object( + storage: &GenericRemoteStorage, + local_layer_path: &Path, +) -> anyhow::Result<()> { let storage_path = storage .remote_object_id(local_layer_path) .with_context(|| { @@ -132,7 +124,7 @@ mod tests { let harness = RepoHarness::create("delete_timeline_negative")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = GenericRemoteStorage::Local(LocalFs::new( + let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_path_buf(), harness.conf.workdir.clone(), )?); @@ -167,7 +159,7 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "c", "d"]; - let storage = GenericRemoteStorage::Local(LocalFs::new( + let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_path_buf(), harness.conf.workdir.clone(), )?); @@ -180,7 +172,8 @@ mod tests { let timeline_upload = create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; for local_path in timeline_upload.layers_to_upload { - let remote_path = local_storage.remote_object_id(&local_path)?; + let remote_path = + local_storage.resolve_in_storage(&local_storage.remote_object_id(&local_path)?)?; let remote_parent_dir = remote_path.parent().unwrap(); if !remote_parent_dir.exists() { fs::create_dir_all(&remote_parent_dir).await?; diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index e11a863dcc..372ca0a463 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -9,9 +9,7 @@ use std::{ use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use remote_storage::{ - path_with_suffix_extension, DownloadError, GenericRemoteStorage, RemoteStorage, -}; +use remote_storage::{path_with_suffix_extension, DownloadError, GenericRemoteStorage}; use tokio::{ fs, io::{self, AsyncWriteExt}, @@ -371,68 +369,6 @@ async fn get_timeline_sync_ids( tenant_path: &Path, tenant_id: ZTenantId, ) -> anyhow::Result> { - let timeline_ids: Vec = match storage { - GenericRemoteStorage::Local(storage) => list_prefixes(storage, 
tenant_path) - .await? - .into_iter() - .map(|timeline_directory_path| { - timeline_directory_path - .file_stem() - .with_context(|| { - format!( - "Failed to get timeline id string from file '{}'", - timeline_directory_path.display() - ) - })? - .to_string_lossy() - .as_ref() - .parse() - .with_context(|| { - format!( - "failed to parse directory name '{}' as timeline id", - timeline_directory_path.display() - ) - }) - }) - .collect::>(), - GenericRemoteStorage::S3(storage) => list_prefixes(storage, tenant_path) - .await? - .into_iter() - .map(|s3_path| { - s3_path - .object_name() - .with_context(|| { - format!("Failed to get object name out of S3 path {s3_path:?}") - })? - .parse() - .with_context(|| { - format!("failed to parse object name '{s3_path:?}' as timeline id") - }) - }) - .collect::>(), - } - .with_context(|| { - format!("Tenant {tenant_id} has at least one incorrect timeline subdirectory") - })?; - - if timeline_ids.is_empty() { - anyhow::bail!("no timelines found on the remote storage for tenant {tenant_id}") - } - - Ok(timeline_ids - .into_iter() - .map(|timeline_id| ZTenantTimelineId { - tenant_id, - timeline_id, - }) - .collect()) -} - -async fn list_prefixes(storage: &S, tenant_path: &Path) -> anyhow::Result> -where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ let tenant_storage_path = storage.remote_object_id(tenant_path).with_context(|| { format!( "Failed to get tenant storage path for local path '{}'", @@ -440,14 +376,37 @@ where ) })?; - storage + let timelines = storage .list_prefixes(Some(&tenant_storage_path)) .await .with_context(|| { format!( "Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download" ) - }) + })?; + + if timelines.is_empty() { + anyhow::bail!("no timelines found on the remote storage") + } + + let mut sync_ids = HashSet::new(); + + for timeline_remote_storage_key in timelines { + let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { + anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") + })?; + + let timeline_id: ZTimelineId = object_name.parse().with_context(|| { + format!("failed to parse object name into timeline id '{object_name}'") + })?; + + sync_ids.insert(ZTenantTimelineId { + tenant_id, + timeline_id, + }); + } + + Ok(sync_ids) } async fn fsync_path(path: impl AsRef) -> Result<(), io::Error> { @@ -459,6 +418,7 @@ mod tests { use std::{ collections::{BTreeSet, HashSet}, num::NonZeroUsize, + path::PathBuf, }; use remote_storage::{LocalFs, RemoteStorage}; @@ -482,7 +442,7 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"]; - let storage = GenericRemoteStorage::Local(LocalFs::new( + let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), harness.conf.workdir.clone(), )?); @@ -494,7 +454,8 @@ mod tests { create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; for local_path in timeline_upload.layers_to_upload { - let remote_path = local_storage.remote_object_id(&local_path)?; + let remote_path = + local_storage.resolve_in_storage(&storage.remote_object_id(&local_path)?)?; let remote_parent_dir = remote_path.parent().unwrap(); if !remote_parent_dir.exists() { fs::create_dir_all(&remote_parent_dir).await?; @@ -580,7 +541,7 @@ mod tests { let harness = RepoHarness::create("download_timeline_negatives")?; let sync_queue = 
SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = GenericRemoteStorage::Local(LocalFs::new( + let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), harness.conf.workdir.clone(), )?); @@ -639,7 +600,7 @@ mod tests { let harness = RepoHarness::create("test_download_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = GenericRemoteStorage::Local(LocalFs::new( + let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), harness.conf.workdir.clone(), )?); @@ -663,9 +624,10 @@ mod tests { let local_index_part_path = metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); - let storage_path = local_storage.remote_object_id(&local_index_part_path)?; - fs::create_dir_all(storage_path.parent().unwrap()).await?; - fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?; + let index_part_remote_id = local_storage.remote_object_id(&local_index_part_path)?; + let index_part_local_path = PathBuf::from(String::from(index_part_remote_id)); + fs::create_dir_all(index_part_local_path.parent().unwrap()).await?; + fs::write(&index_part_local_path, serde_json::to_vec(&index_part)?).await?; let downloaded_index_part = download_index_part(harness.conf, &storage, sync_id).await?; diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 8dd73d9431..7070f941f5 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -34,7 +34,11 @@ pub(super) async fn upload_index_part( let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); storage - .upload_storage_object(index_part_bytes, index_part_size, &index_part_path) + .upload_storage_object( + Box::new(index_part_bytes), + index_part_size, + &index_part_path, + ) .await .with_context(|| format!("Failed to upload index part for '{sync_id}'")) } @@ -119,7 +123,7 @@ pub(super) async fn upload_timeline_layers<'a>( .len() as usize; match storage - .upload_storage_object(source_file, source_size, &source_path) + .upload_storage_object(Box::new(source_file), source_size, &source_path) .await .with_context(|| format!("Failed to upload layer file for {sync_id}")) { @@ -214,8 +218,8 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b"]; - let storage = GenericRemoteStorage::Local(LocalFs::new( - tempdir()?.path().to_owned(), + let storage = GenericRemoteStorage::new(LocalFs::new( + tempdir()?.path().to_path_buf(), harness.conf.workdir.clone(), )?); let local_storage = storage.as_local().unwrap(); @@ -302,7 +306,7 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a1", "b1"]; - let storage = GenericRemoteStorage::Local(LocalFs::new( + let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), harness.conf.workdir.clone(), )?); @@ -395,7 +399,7 @@ mod tests { let harness = RepoHarness::create("test_upload_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = GenericRemoteStorage::Local(LocalFs::new( + let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), harness.conf.workdir.clone(), )?); @@ -431,13 +435,13 @@ mod tests { let index_part_path = storage_files.first().unwrap(); 
assert_eq!( - index_part_path.file_name().and_then(|name| name.to_str()), + index_part_path.object_name(), Some(IndexPart::FILE_NAME), "Remote index part should have the correct name" ); - - let remote_index_part: IndexPart = - serde_json::from_slice(&fs::read(&index_part_path).await?)?; + let remote_index_part: IndexPart = serde_json::from_slice( + &fs::read(local_storage.resolve_in_storage(index_part_path)?).await?, + )?; assert_eq!( index_part, remote_index_part, "Remote index part should match the local one" diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 7c82745142..041bd50737 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -134,7 +134,7 @@ impl fmt::Display for TenantState { /// are scheduled for download and added to the repository once download is completed. pub fn init_tenant_mgr( conf: &'static PageServerConf, - remote_storage: Option>, + remote_storage: Option, ) -> anyhow::Result { let (timeline_updates_sender, timeline_updates_receiver) = mpsc::unbounded_channel::(); diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 6acc70e85a..5d946e37a4 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -127,7 +127,8 @@ async fn wal_backup_launcher_main_loop( let conf_ = conf.clone(); REMOTE_STORAGE.get_or_init(|| { conf_.remote_storage.as_ref().map(|c| { - GenericRemoteStorage::new(conf_.workdir, c).expect("failed to create remote storage") + GenericRemoteStorage::from_config(conf_.workdir, c) + .expect("failed to create remote storage") }) }); @@ -417,7 +418,11 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec { static REMOTE_STORAGE: OnceCell> = OnceCell::new(); async fn backup_object(source_file: &Path, size: usize) -> Result<()> { - let storage = REMOTE_STORAGE.get().expect("failed to get remote storage"); + let storage = REMOTE_STORAGE + .get() + .expect("failed to get remote storage") + .as_ref() + .unwrap(); let file = tokio::io::BufReader::new(File::open(&source_file).await.with_context(|| { format!( @@ -427,9 +432,7 @@ async fn backup_object(source_file: &Path, size: usize) -> Result<()> { })?); storage - .as_ref() - .expect("Storage should be initialized by launcher at this point.") - .upload_storage_object(file, size, source_file) + .upload_storage_object(Box::new(file), size, source_file) .await } From 0b76b82e0ebfbbaaf9a6cf07217a5055c52ac196 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 8 Sep 2022 14:08:02 +0300 Subject: [PATCH 0745/1022] review clean up --- libs/remote_storage/src/lib.rs | 16 ++++++++-------- pageserver/src/storage_sync/download.rs | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 55db91dc31..e89f60de7e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -12,7 +12,7 @@ use std::{ borrow::Cow, collections::HashMap, ffi::OsStr, - fmt::Debug, + fmt::{Debug, Display}, num::{NonZeroU32, NonZeroUsize}, ops::Deref, path::{Path, PathBuf}, @@ -46,12 +46,6 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; #[derive(Clone, PartialEq, Eq)] pub struct RemoteObjectId(String); -impl From for String { - fn from(id: RemoteObjectId) -> Self { - id.0 - } -} - /// /// A key that refers to an object in remote storage. 
It works much like a Path, /// but it's a separate datatype so that you don't accidentally mix local paths @@ -80,7 +74,13 @@ impl RemoteObjectId { impl Debug for RemoteObjectId { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - self.0.fmt(fmt) + Debug::fmt(&self.0, fmt) + } +} + +impl Display for RemoteObjectId { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + Display::fmt(&self.0, fmt) } } diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 372ca0a463..b0beb4219a 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -625,7 +625,7 @@ mod tests { metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); let index_part_remote_id = local_storage.remote_object_id(&local_index_part_path)?; - let index_part_local_path = PathBuf::from(String::from(index_part_remote_id)); + let index_part_local_path = PathBuf::from(index_part_remote_id.to_string()); fs::create_dir_all(index_part_local_path.parent().unwrap()).await?; fs::write(&index_part_local_path, serde_json::to_vec(&index_part)?).await?; From d3f83eda52a1f4e372f9149ffc8b824ef3478a25 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 9 Sep 2022 00:07:14 +0300 Subject: [PATCH 0746/1022] Use regular agent for triggering e2e tests --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1387514cc2..bf9de7d857 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -412,7 +412,7 @@ jobs: trigger-e2e-tests: runs-on: dev container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init needs: [ build-neon ] steps: From c9e7c2f014a2d6ce269ccb7943a22d778378e512 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 7 Sep 2022 17:03:20 +0300 Subject: [PATCH 0747/1022] Ensure all temporary and empty directories and files are cleansed on pageserver startup --- libs/remote_storage/src/lib.rs | 7 + libs/remote_storage/src/local_fs.rs | 5 +- pageserver/src/http/routes.rs | 5 +- pageserver/src/layered_repository.rs | 105 ++-- .../src/layered_repository/delta_layer.rs | 7 +- .../src/layered_repository/image_layer.rs | 4 +- pageserver/src/lib.rs | 79 +++ pageserver/src/storage_sync.rs | 314 +++--------- pageserver/src/storage_sync/download.rs | 5 +- pageserver/src/tenant_mgr.rs | 480 +++++++++++++----- pageserver/src/tenant_tasks.rs | 10 - pageserver/src/timelines.rs | 21 +- pageserver/src/walredo.rs | 25 +- test_runner/regress/test_broken_timeline.py | 45 +- 14 files changed, 639 insertions(+), 473 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e89f60de7e..6b3fd29a0e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -344,6 +344,8 @@ impl Debug for S3Config { } } +/// Adds a suffix to the file(directory) name, either appending the suffux to the end of its extension, +/// or if there's no extension, creates one and puts a suffix there. 
pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { let new_extension = match original_path .as_ref() @@ -468,6 +470,11 @@ mod tests { &path_with_suffix_extension(&p, ".temp").to_string_lossy(), "/foo/bar.baz..temp" ); + let p = PathBuf::from("/foo/bar/dir/"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar/dir..temp" + ); } #[test] diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 2561c0ca24..3ffbf3cb39 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -21,6 +21,8 @@ use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectId} use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; +const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; + /// Convert a Path in the remote storage into a RemoteObjectId fn remote_object_id_from_path(path: &Path) -> anyhow::Result { Ok(RemoteObjectId( @@ -143,7 +145,8 @@ impl RemoteStorage for LocalFs { // We need this dance with sort of durable rename (without fsyncs) // to prevent partial uploads. This was really hit when pageserver shutdown // cancelled the upload and partial file was left on the fs - let temp_file_path = path_with_suffix_extension(&target_file_path, "temp"); + let temp_file_path = + path_with_suffix_extension(&target_file_path, LOCAL_FS_TEMP_FILE_SUFFIX); let mut destination = io::BufWriter::new( fs::OpenOptions::new() .write(true) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a31c2fd2a5..59142bd9b2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -470,7 +470,7 @@ async fn tenant_list_handler(request: Request) -> Result, A let response_data = tokio::task::spawn_blocking(move || { let _enter = info_span!("tenant_list").entered(); - crate::tenant_mgr::list_tenants(&remote_index) + crate::tenant_mgr::list_tenant_info(&remote_index) }) .await .map_err(ApiError::from_err)?; @@ -640,7 +640,8 @@ async fn tenant_config_handler(mut request: Request) -> Result Result { + let _guard = match self.file_lock.try_read() { + Ok(g) => g, + Err(_) => { + info!("File lock write acquired, shutting down GC"); + return Ok(GcResult::default()); + } + }; + let timeline_str = target_timeline_id .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); @@ -315,6 +323,14 @@ impl Repository { /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. pub fn compaction_iteration(&self) -> Result<()> { + let _guard = match self.file_lock.try_read() { + Ok(g) => g, + Err(_) => { + info!("File lock write acquired, shutting down compaction"); + return Ok(()); + } + }; + // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // compactions. We don't want to block everything else while the @@ -401,10 +417,10 @@ impl Repository { pub fn init_attach_timelines( &self, - timelines: Vec<(ZTimelineId, TimelineMetadata)>, + timelines: HashMap, ) -> anyhow::Result<()> { let sorted_timelines = if timelines.len() == 1 { - timelines + timelines.into_iter().collect() } else if !timelines.is_empty() { tree_sort_timelines(timelines)? } else { @@ -442,7 +458,7 @@ impl Repository { /// perform a topological sort, so that the parent of each timeline comes /// before the children. 
fn tree_sort_timelines( - timelines: Vec<(ZTimelineId, TimelineMetadata)>, + timelines: HashMap, ) -> Result> { let mut result = Vec::with_capacity(timelines.len()); @@ -567,13 +583,8 @@ impl Repository { .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag) } - pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) -> Result<()> { - let mut tenant_conf = self.tenant_conf.write().unwrap(); - - tenant_conf.update(&new_tenant_conf); - - Repository::persist_tenant_config(self.conf, self.tenant_id, *tenant_conf)?; - Ok(()) + pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) { + self.tenant_conf.write().unwrap().update(&new_tenant_conf); } fn initialize_new_timeline( @@ -648,32 +659,37 @@ impl Repository { tenant_id: ZTenantId, ) -> anyhow::Result { let target_config_path = TenantConf::path(conf, tenant_id); + let target_config_display = target_config_path.display(); - info!("load tenantconf from {}", target_config_path.display()); + info!("loading tenantconf from {target_config_display}"); // FIXME If the config file is not found, assume that we're attaching // a detached tenant and config is passed via attach command. // https://github.com/neondatabase/neon/issues/1555 if !target_config_path.exists() { - info!( - "tenant config not found in {}", - target_config_path.display() - ); - return Ok(Default::default()); + info!("tenant config not found in {target_config_display}"); + return Ok(TenantConfOpt::default()); } // load and parse file - let config = fs::read_to_string(target_config_path)?; + let config = fs::read_to_string(&target_config_path).with_context(|| { + format!("Failed to load config from path '{target_config_display}'") + })?; - let toml = config.parse::()?; + let toml = config.parse::().with_context(|| { + format!("Failed to parse config from file '{target_config_display}' as toml file") + })?; - let mut tenant_conf: TenantConfOpt = Default::default(); + let mut tenant_conf = TenantConfOpt::default(); for (key, item) in toml.iter() { match key { "tenant_config" => { - tenant_conf = PageServerConf::parse_toml_tenant_conf(item)?; + tenant_conf = PageServerConf::parse_toml_tenant_conf(item).with_context(|| { + format!("Failed to parse config from file '{target_config_display}' as pageserver config") + })?; } - _ => bail!("unrecognized pageserver option '{}'", key), + _ => bail!("config file {target_config_display} has unrecognized pageserver option '{key}'"), + } } @@ -888,26 +904,6 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { Ok(()) } -pub fn load_metadata( - conf: &'static PageServerConf, - timeline_id: ZTimelineId, - tenant_id: ZTenantId, -) -> anyhow::Result { - let metadata_path = metadata_path(conf, timeline_id, tenant_id); - let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { - format!( - "Failed to read metadata bytes from path {}", - metadata_path.display() - ) - })?; - TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| { - format!( - "Failed to parse metadata bytes from path {}", - metadata_path.display() - ) - }) -} - #[cfg(test)] pub mod repo_harness { use bytes::{Bytes, BytesMut}; @@ -925,6 +921,7 @@ pub mod repo_harness { walredo::{WalRedoError, WalRedoManager}, }; + use super::metadata::metadata_path; use super::*; use crate::tenant_config::{TenantConf, TenantConfOpt}; use hex_literal::hex; @@ -1030,7 +1027,7 @@ pub mod repo_harness { false, ); // populate repo with locally available timelines - let mut timelines_to_load = Vec::new(); + let mut timelines_to_load = 
HashMap::new(); for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) .expect("should be able to read timelines dir") { @@ -1042,7 +1039,7 @@ pub mod repo_harness { .to_string_lossy() .parse()?; let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?; - timelines_to_load.push((timeline_id, timeline_metadata)); + timelines_to_load.insert(timeline_id, timeline_metadata); } repo.init_attach_timelines(timelines_to_load)?; @@ -1054,6 +1051,26 @@ pub mod repo_harness { } } + fn load_metadata( + conf: &'static PageServerConf, + timeline_id: ZTimelineId, + tenant_id: ZTenantId, + ) -> anyhow::Result { + let metadata_path = metadata_path(conf, timeline_id, tenant_id); + let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { + format!( + "Failed to read metadata bytes from path {}", + metadata_path.display() + ) + })?; + TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| { + format!( + "Failed to parse metadata bytes from path {}", + metadata_path.display() + ) + }) + } + // Mock WAL redo manager that doesn't do much pub struct TestRedoManager; diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index ce5cb57745..af02f84bc0 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -34,7 +34,7 @@ use crate::layered_repository::storage_layer::{ use crate::page_cache::{PageReadGuard, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::virtual_file::VirtualFile; -use crate::walrecord; +use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use rand::{distributions::Alphanumeric, Rng}; @@ -447,11 +447,12 @@ impl DeltaLayer { .collect(); conf.timeline_path(&timelineid, &tenantid).join(format!( - "{}-XXX__{:016X}-{:016X}.{}.temp", + "{}-XXX__{:016X}-{:016X}.{}.{}", key_start, u64::from(lsn_range.start), u64::from(lsn_range.end), - rand_string + rand_string, + TEMP_FILE_SUFFIX, )) } diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index bb24553afd..4fe771bb3f 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -30,7 +30,7 @@ use crate::layered_repository::storage_layer::{ use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value, KEY_SIZE}; use crate::virtual_file::VirtualFile; -use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION}; +use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use hex; @@ -255,7 +255,7 @@ impl ImageLayer { .collect(); conf.timeline_path(&timelineid, &tenantid) - .join(format!("{}.{}.temp", fname, rand_string)) + .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}")) } /// diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 4731179e22..86bbf25b67 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -23,7 +23,10 @@ pub mod walreceiver; pub mod walrecord; pub mod walredo; +use std::collections::HashMap; + use tracing::info; +use utils::zid::{ZTenantId, ZTimelineId}; use crate::thread_mgr::ThreadKind; @@ -100,6 +103,50 @@ fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds } } +/// A newtype to store arbitrary data grouped by tenant and timeline ids. 
+/// One could use [`utils::zid::ZTenantTimelineId`] for grouping, but that would +/// not include the cases where a certain tenant has zero timelines. +/// This is sometimes important: a tenant could be registered during initial load from FS, +/// even if he has no timelines on disk. +#[derive(Debug)] +pub struct TenantTimelineValues(HashMap>); + +impl TenantTimelineValues { + fn new() -> Self { + Self(HashMap::new()) + } + + fn with_capacity(capacity: usize) -> Self { + Self(HashMap::with_capacity(capacity)) + } + + /// A convenience method to map certain values and omit some of them, if needed. + /// Tenants that won't have any timeline entries due to the filtering, will still be preserved + /// in the structure. + fn filter_map(self, map: F) -> TenantTimelineValues + where + F: Fn(T) -> Option, + { + let capacity = self.0.len(); + self.0.into_iter().fold( + TenantTimelineValues::::with_capacity(capacity), + |mut new_values, (tenant_id, old_values)| { + let new_timeline_values = new_values.0.entry(tenant_id).or_default(); + for (timeline_id, old_value) in old_values { + if let Some(new_value) = map(old_value) { + new_timeline_values.insert(timeline_id, new_value); + } + } + new_values + }, + ) + } +} + +/// A suffix to be used during file sync from the remote storage, +/// to ensure that we do not leave corrupted files that pretend to be layers. +const TEMP_FILE_SUFFIX: &str = "___temp"; + #[cfg(test)] mod backoff_defaults_tests { use super::*; @@ -130,3 +177,35 @@ mod backoff_defaults_tests { ); } } + +#[cfg(test)] +mod tests { + use crate::layered_repository::repo_harness::TIMELINE_ID; + + use super::*; + + #[test] + fn tenant_timeline_value_mapping() { + let first_tenant = ZTenantId::generate(); + let second_tenant = ZTenantId::generate(); + assert_ne!(first_tenant, second_tenant); + + let mut initial = TenantTimelineValues::new(); + initial + .0 + .entry(first_tenant) + .or_default() + .insert(TIMELINE_ID, "test_value"); + let _ = initial.0.entry(second_tenant).or_default(); + assert_eq!(initial.0.len(), 2, "Should have entries for both tenants"); + + let filtered = initial.filter_map(|_| None::<&str>).0; + assert_eq!( + filtered.len(), + 2, + "Should have entries for both tenants even after filtering away all entries" + ); + assert!(filtered.contains_key(&first_tenant)); + assert!(filtered.contains_key(&second_tenant)); + } +} diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 42fd6b8ea8..57a964cb67 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -145,7 +145,6 @@ mod upload; use std::{ collections::{hash_map, HashMap, HashSet, VecDeque}, - ffi::OsStr, fmt::Debug, num::{NonZeroU32, NonZeroUsize}, ops::ControlFlow, @@ -170,244 +169,56 @@ use self::{ index::{IndexPart, RemoteTimeline, RemoteTimelineIndex}, upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, }; -use crate::metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD}; use crate::{ config::PageServerConf, exponential_backoff, - layered_repository::{ - ephemeral_file::is_ephemeral_file, - metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, - }, - storage_sync::{self, index::RemoteIndex}, - tenant_mgr::attach_downloaded_tenants, + layered_repository::metadata::{metadata_path, TimelineMetadata}, + storage_sync::index::RemoteIndex, + tenant_mgr::attach_local_tenants, thread_mgr, thread_mgr::ThreadKind, }; +use crate::{ + metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD}, + TenantTimelineValues, +}; 
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; use self::download::download_index_parts; pub use self::download::gather_tenant_timelines_index_parts; -pub use self::download::TEMP_DOWNLOAD_EXTENSION; static SYNC_QUEUE: OnceCell = OnceCell::new(); /// A timeline status to share with pageserver's sync counterpart, /// after comparing local and remote timeline state. -#[derive(Clone, Copy, Debug)] +#[derive(Clone)] pub enum LocalTimelineInitStatus { /// The timeline has every remote layer present locally. /// There could be some layers requiring uploading, /// but this does not block the timeline from any user interaction. - LocallyComplete, + LocallyComplete(TimelineMetadata), /// A timeline has some files remotely, that are not present locally and need downloading. /// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only, /// so the data needs to be downloaded first before the timeline can be used. NeedsSync, } -type LocalTimelineInitStatuses = HashMap>; +impl std::fmt::Debug for LocalTimelineInitStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::LocallyComplete(_) => write!(f, "LocallyComplete"), + Self::NeedsSync => write!(f, "NeedsSync"), + } + } +} /// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization. /// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still, /// to simplify the received code. pub struct SyncStartupData { pub remote_index: RemoteIndex, - pub local_timeline_init_statuses: LocalTimelineInitStatuses, -} - -/// Based on the config, initiates the remote storage connection and starts a separate thread -/// that ensures that pageserver and the remote storage are in sync with each other. -/// If no external configuration connection given, no thread or storage initialization is done. -/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states. 
-pub fn start_local_timeline_sync( - config: &'static PageServerConf, - storage: Option, -) -> anyhow::Result { - let local_timeline_files = local_tenant_timeline_files(config) - .context("Failed to collect local tenant timeline files")?; - - match storage.zip(config.remote_storage_config.as_ref()) { - Some((storage, storage_config)) => storage_sync::spawn_storage_sync_thread( - config, - local_timeline_files, - storage, - storage_config.max_concurrent_syncs, - storage_config.max_sync_errors, - ) - .context("Failed to spawn the storage sync thread"), - None => { - info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); - let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); - for ( - ZTenantTimelineId { - tenant_id, - timeline_id, - }, - _, - ) in local_timeline_files - { - local_timeline_init_statuses - .entry(tenant_id) - .or_default() - .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete); - } - Ok(SyncStartupData { - local_timeline_init_statuses, - remote_index: RemoteIndex::default(), - }) - } - } -} - -fn local_tenant_timeline_files( - config: &'static PageServerConf, -) -> anyhow::Result)>> { - let mut local_tenant_timeline_files = HashMap::new(); - let tenants_dir = config.tenants_path(); - for tenants_dir_entry in std::fs::read_dir(&tenants_dir) - .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? - { - match &tenants_dir_entry { - Ok(tenants_dir_entry) => { - match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) { - Ok(collected_files) => { - local_tenant_timeline_files.extend(collected_files.into_iter()) - } - Err(e) => error!( - "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}", - tenants_dir.display(), - tenants_dir_entry, - e - ), - } - } - Err(e) => error!( - "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}", - tenants_dir_entry, - tenants_dir.display(), - e - ), - } - } - - Ok(local_tenant_timeline_files) -} - -fn collect_timelines_for_tenant( - config: &'static PageServerConf, - tenant_path: &Path, -) -> anyhow::Result)>> { - let mut timelines = HashMap::new(); - let tenant_id = tenant_path - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .context("Could not parse tenant id out of the tenant dir name")?; - let timelines_dir = config.timelines_path(&tenant_id); - - for timelines_dir_entry in std::fs::read_dir(&timelines_dir).with_context(|| { - format!( - "Failed to list timelines dir entry for tenant {}", - tenant_id - ) - })? 
{ - match timelines_dir_entry { - Ok(timelines_dir_entry) => { - let timeline_path = timelines_dir_entry.path(); - match collect_timeline_files(&timeline_path) { - Ok((timeline_id, metadata, timeline_files)) => { - timelines.insert( - ZTenantTimelineId { - tenant_id, - timeline_id, - }, - (metadata, timeline_files), - ); - } - Err(e) => error!( - "Failed to process timeline dir contents at '{}', reason: {:?}", - timeline_path.display(), - e - ), - } - } - Err(e) => error!( - "Failed to list timelines for entry tenant {}, reason: {:?}", - tenant_id, e - ), - } - } - - Ok(timelines) -} - -// discover timeline files and extract timeline metadata -// NOTE: ephemeral files are excluded from the list -fn collect_timeline_files( - timeline_dir: &Path, -) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet)> { - let mut timeline_files = HashSet::new(); - let mut timeline_metadata_path = None; - - let timeline_id = timeline_dir - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .context("Could not parse timeline id out of the timeline dir name")?; - let timeline_dir_entries = - std::fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; - for entry in timeline_dir_entries { - let entry_path = entry.context("Failed to list timeline dir entry")?.path(); - if entry_path.is_file() { - if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) { - timeline_metadata_path = Some(entry_path); - } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) { - debug!("skipping ephemeral file {}", entry_path.display()); - continue; - } else if entry_path.extension().and_then(OsStr::to_str) - == Some(TEMP_DOWNLOAD_EXTENSION) - { - info!("removing temp download file at {}", entry_path.display()); - std::fs::remove_file(&entry_path).with_context(|| { - format!( - "failed to remove temp download file at {}", - entry_path.display() - ) - })?; - } else if entry_path.extension().and_then(OsStr::to_str) == Some("temp") { - info!("removing temp layer file at {}", entry_path.display()); - std::fs::remove_file(&entry_path).with_context(|| { - format!( - "failed to remove temp layer file at {}", - entry_path.display() - ) - })?; - } else { - timeline_files.insert(entry_path); - } - } - } - - // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed - // then attach is lost. There would be no retries for that, - // initial collect will fail because there is no metadata. - // We either need to start download if we see empty dir after restart or attach caller should - // be aware of that and retry attach if awaits_download for timeline switched from true to false - // but timelinne didn't appear locally. - // Check what happens with remote index in that case. - let timeline_metadata_path = match timeline_metadata_path { - Some(path) => path, - None => bail!("No metadata file found in the timeline directory"), - }; - let metadata = TimelineMetadata::from_bytes( - &std::fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, - ) - .context("Failed to parse timeline metadata file bytes")?; - - Ok((timeline_id, metadata, timeline_files)) + pub local_timeline_init_statuses: TenantTimelineValues, } /// Global queue of sync tasks. @@ -763,9 +574,9 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { /// Launch a thread to perform remote storage sync tasks. /// See module docs for loop step description. 
-pub(super) fn spawn_storage_sync_thread( +pub fn spawn_storage_sync_thread( conf: &'static PageServerConf, - local_timeline_files: HashMap)>, + local_timeline_files: TenantTimelineValues<(TimelineMetadata, HashSet)>, storage: GenericRemoteStorage, max_concurrent_timelines_sync: NonZeroUsize, max_sync_errors: NonZeroU32, @@ -784,19 +595,43 @@ pub(super) fn spawn_storage_sync_thread( .build() .context("Failed to create storage sync runtime")?; + // TODO we are able to "attach" empty tenants, but not doing it now since it might require big wait time: + // * we need to list every timeline for tenant on S3, that might be a costly operation + // * we need to download every timeline for the tenant, to activate it in memory + // + // When on-demand download gets merged, we're able to do this fast by storing timeline metadata only. + let mut empty_tenants = TenantTimelineValues::::new(); + let mut keys_for_index_part_downloads = HashSet::new(); + let mut timelines_to_sync = HashMap::new(); + + for (tenant_id, timeline_data) in local_timeline_files.0 { + if timeline_data.is_empty() { + let _ = empty_tenants.0.entry(tenant_id).or_default(); + } else { + for (timeline_id, timeline_data) in timeline_data { + let id = ZTenantTimelineId::new(tenant_id, timeline_id); + keys_for_index_part_downloads.insert(id); + timelines_to_sync.insert(id, timeline_data); + } + } + } + let applicable_index_parts = runtime.block_on(download_index_parts( conf, &storage, - local_timeline_files.keys().copied().collect(), + keys_for_index_part_downloads, )); let remote_index = RemoteIndex::from_parts(conf, applicable_index_parts)?; - let local_timeline_init_statuses = schedule_first_sync_tasks( + let mut local_timeline_init_statuses = schedule_first_sync_tasks( &mut runtime.block_on(remote_index.write()), sync_queue, - local_timeline_files, + timelines_to_sync, ); + local_timeline_init_statuses + .0 + .extend(empty_tenants.0.into_iter()); let remote_index_clone = remote_index.clone(); thread_mgr::spawn( @@ -872,10 +707,7 @@ fn storage_sync_loop( "Sync loop step completed, {} new tenant state update(s)", updated_tenants.len() ); - let mut timelines_to_attach: HashMap< - ZTenantId, - Vec<(ZTimelineId, TimelineMetadata)>, - > = HashMap::new(); + let mut timelines_to_attach = TenantTimelineValues::new(); let index_accessor = runtime.block_on(index.read()); for tenant_id in updated_tenants { let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { @@ -901,7 +733,7 @@ fn storage_sync_loop( // and register them all at once in a repository for download // to be submitted in a single operation to repository // so it can apply them at once to internal timeline map. - timelines_to_attach.insert( + timelines_to_attach.0.insert( tenant_id, tenant_entry .iter() @@ -912,7 +744,9 @@ fn storage_sync_loop( } drop(index_accessor); // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. 
- attach_downloaded_tenants(conf, &index, timelines_to_attach); + if let Err(e) = attach_local_tenants(conf, &index, timelines_to_attach) { + error!("Failed to attach new timelines: {e:?}"); + }; } } ControlFlow::Break(()) => { @@ -1443,11 +1277,10 @@ fn schedule_first_sync_tasks( index: &mut RemoteTimelineIndex, sync_queue: &SyncQueue, local_timeline_files: HashMap)>, -) -> LocalTimelineInitStatuses { - let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); +) -> TenantTimelineValues { + let mut local_timeline_init_statuses = TenantTimelineValues::new(); - let mut new_sync_tasks = - VecDeque::with_capacity(local_timeline_files.len().max(local_timeline_files.len())); + let mut new_sync_tasks = VecDeque::with_capacity(local_timeline_files.len()); for (sync_id, (local_metadata, local_files)) in local_timeline_files { match index.timeline_entry_mut(&sync_id) { @@ -1459,18 +1292,27 @@ fn schedule_first_sync_tasks( local_files, remote_timeline, ); - let was_there = local_timeline_init_statuses + match local_timeline_init_statuses + .0 .entry(sync_id.tenant_id) .or_default() - .insert(sync_id.timeline_id, timeline_status); - - if was_there.is_some() { - // defensive check - warn!( - "Overwriting timeline init sync status. Status {timeline_status:?}, timeline {}", - sync_id.timeline_id - ); + .entry(sync_id.timeline_id) + { + hash_map::Entry::Occupied(mut o) => { + { + // defensive check + warn!( + "Overwriting timeline init sync status. Status {timeline_status:?}, timeline {}", + sync_id.timeline_id + ); + } + o.insert(timeline_status); + } + hash_map::Entry::Vacant(v) => { + v.insert(timeline_status); + } } + remote_timeline.awaits_download = awaits_download; } None => { @@ -1481,15 +1323,16 @@ fn schedule_first_sync_tasks( SyncTask::upload(LayersUpload { layers_to_upload: local_files, uploaded_layers: HashSet::new(), - metadata: Some(local_metadata), + metadata: Some(local_metadata.clone()), }), )); local_timeline_init_statuses + .0 .entry(sync_id.tenant_id) .or_default() .insert( sync_id.timeline_id, - LocalTimelineInitStatus::LocallyComplete, + LocalTimelineInitStatus::LocallyComplete(local_metadata), ); } } @@ -1523,7 +1366,10 @@ fn compare_local_and_remote_timeline( // we do not need to manipulate with remote consistent lsn here // because it will be updated when sync will be completed } else { - (LocalTimelineInitStatus::LocallyComplete, false) + ( + LocalTimelineInitStatus::LocallyComplete(local_metadata.clone()), + false, + ) }; let layers_to_upload = local_files diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index b0beb4219a..91ee557b79 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -18,6 +18,7 @@ use tracing::{debug, error, info, warn}; use crate::{ config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, + TEMP_FILE_SUFFIX, }; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; @@ -26,8 +27,6 @@ use super::{ LayersDownload, SyncData, SyncQueue, }; -pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; - // We collect timelines remotely available for each tenant // in case we failed to gather all index parts (due to an error) // Poisoned variant is returned. @@ -251,7 +250,7 @@ pub(super) async fn download_timeline_layers<'a>( // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com // If pageserver crashes the temp file will be deleted on startup and re-downloaded. 
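// (Illustrative: with the shared TEMP_FILE_SUFFIX, the partial download lands in a `<layer file name>.___temp` file.)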
let temp_file_path = - path_with_suffix_extension(&layer_destination_path, TEMP_DOWNLOAD_EXTENSION); + path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX); let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 041bd50737..baa58f5eb5 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,24 +3,26 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; -use crate::layered_repository::metadata::TimelineMetadata; -use crate::layered_repository::{load_metadata, Repository, Timeline}; +use crate::layered_repository::ephemeral_file::is_ephemeral_file; +use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}; +use crate::layered_repository::{Repository, Timeline}; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; use crate::thread_mgr::ThreadKind; use crate::walredo::PostgresRedoManager; -use crate::{thread_mgr, timelines, walreceiver}; +use crate::{thread_mgr, timelines, walreceiver, TenantTimelineValues, TEMP_FILE_SUFFIX}; use anyhow::Context; use remote_storage::GenericRemoteStorage; use serde::{Deserialize, Serialize}; -use std::collections::hash_map::Entry; -use std::collections::HashMap; +use std::collections::hash_map::{self, Entry}; +use std::collections::{HashMap, HashSet}; +use std::ffi::OsStr; use std::fmt; +use std::path::{Path, PathBuf}; use std::sync::Arc; use tokio::sync::mpsc; use tracing::*; -use utils::lsn::Lsn; pub use tenants_state::try_send_timeline_update; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; @@ -136,34 +138,49 @@ pub fn init_tenant_mgr( conf: &'static PageServerConf, remote_storage: Option, ) -> anyhow::Result { + let _entered = info_span!("init_tenant_mgr").entered(); let (timeline_updates_sender, timeline_updates_receiver) = mpsc::unbounded_channel::(); tenants_state::set_timeline_update_sender(timeline_updates_sender)?; walreceiver::init_wal_receiver_main_thread(conf, timeline_updates_receiver)?; - let SyncStartupData { - remote_index, - local_timeline_init_statuses, - } = storage_sync::start_local_timeline_sync(conf, remote_storage) - .context("Failed to set up local files sync with external storage")?; + let local_tenant_files = local_tenant_timeline_files(conf) + .context("Failed to collect local tenant timeline files")?; - for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses { - if let Err(err) = - init_local_repository(conf, tenant_id, local_timeline_init_statuses, &remote_index) - { - // Report the error, but continue with the startup for other tenants. An error - // loading a tenant is serious, but it's better to complete the startup and - // serve other tenants, than fail completely. 
- error!("Failed to initialize local tenant {tenant_id}: {:?}", err); + let (remote_index, tenants_to_attach) = if let Some(storage) = remote_storage { + let storage_config = conf + .remote_storage_config + .as_ref() + .expect("remote storage without config"); - if let Err(err) = set_tenant_state(tenant_id, TenantState::Broken) { - error!( - "Failed to set tenant state to broken {tenant_id}: {:?}", - err - ); - } - } - } + let SyncStartupData { + remote_index, + local_timeline_init_statuses, + } = storage_sync::spawn_storage_sync_thread( + conf, + local_tenant_files, + storage, + storage_config.max_concurrent_syncs, + storage_config.max_sync_errors, + ) + .context("Failed to spawn the storage sync thread")?; + + ( + remote_index, + local_timeline_init_statuses.filter_map(|init_status| match init_status { + LocalTimelineInitStatus::LocallyComplete(metadata) => Some(metadata), + LocalTimelineInitStatus::NeedsSync => None, + }), + ) + } else { + info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); + ( + RemoteIndex::default(), + local_tenant_files.filter_map(|(metadata, _)| Some(metadata)), + ) + }; + + attach_local_tenants(conf, &remote_index, tenants_to_attach)?; Ok(remote_index) } @@ -189,35 +206,69 @@ impl std::fmt::Debug for LocalTimelineUpdate { } } -/// Updates tenants' repositories, changing their timelines state in memory. -pub fn attach_downloaded_tenants( +/// Reads local files to load tenants and their timelines given into pageserver's memory. +/// Ignores other timelines that might be present for tenant, but were not passed as a parameter. +/// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken", +/// and the load continues. 
+pub fn attach_local_tenants( conf: &'static PageServerConf, remote_index: &RemoteIndex, - sync_status_updates: HashMap>, -) { - if sync_status_updates.is_empty() { - debug!("No sync status updates to apply"); - return; - } - for (tenant_id, downloaded_timelines) in sync_status_updates { - info!( - "Registering downlloaded timelines for {tenant_id} {} timelines", - downloaded_timelines.len() - ); - debug!("Downloaded timelines: {downloaded_timelines:?}"); + tenants_to_attach: TenantTimelineValues, +) -> anyhow::Result<()> { + let _entered = info_span!("attach_local_tenants").entered(); + let number_of_tenants = tenants_to_attach.0.len(); - let repo = match load_local_repo(conf, tenant_id, remote_index) { - Ok(repo) => repo, - Err(e) => { - error!("Failed to load repo for tenant {tenant_id} Error: {e:?}"); - continue; + for (tenant_id, local_timelines) in tenants_to_attach.0 { + info!( + "Attaching {} timelines for {tenant_id}", + local_timelines.len() + ); + debug!("Timelines to attach: {local_timelines:?}"); + + let repository = load_local_repo(conf, tenant_id, remote_index) + .context("Failed to load repository for tenant")?; + + let repo = Arc::clone(&repository); + { + match tenants_state::write_tenants().entry(tenant_id) { + hash_map::Entry::Occupied(_) => { + anyhow::bail!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state"); + } + hash_map::Entry::Vacant(v) => { + v.insert(Tenant { + state: TenantState::Idle, + repo, + }); + } } - }; - match repo.init_attach_timelines(downloaded_timelines) { - Ok(()) => info!("successfully loaded local timelines for tenant {tenant_id}"), - Err(e) => error!("Failed to load local timelines for tenant {tenant_id}: {e:?}"), } + // XXX: current timeline init enables walreceiver that looks for tenant in the state, so insert the tenant entry before + repository + .init_attach_timelines(local_timelines) + .context("Failed to attach timelines for tenant")?; } + + info!("Processed {number_of_tenants} local tenants during attach"); + Ok(()) +} + +fn load_local_repo( + conf: &'static PageServerConf, + tenant_id: ZTenantId, + remote_index: &RemoteIndex, +) -> anyhow::Result> { + let repository = Repository::new( + conf, + TenantConfOpt::default(), + Arc::new(PostgresRedoManager::new(conf, tenant_id)), + tenant_id, + remote_index.clone(), + conf.remote_storage_config.is_some(), + ); + let tenant_conf = Repository::load_tenant_config(conf, tenant_id)?; + repository.update_tenant_config(tenant_conf); + + Ok(Arc::new(repository)) } /// @@ -293,13 +344,14 @@ pub fn create_tenant_repository( } pub fn update_tenant_config( + conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: ZTenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); - let repo = get_repository_for_tenant(tenant_id)?; + get_repository_for_tenant(tenant_id)?.update_tenant_config(tenant_conf); - repo.update_tenant_config(tenant_conf)?; + Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; Ok(()) } @@ -392,7 +444,7 @@ pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow debug!("waiting for threads to shutdown"); thread_mgr::shutdown_threads(None, None, Some(timeline_id)); debug!("thread shutdown completed"); - match tenants_state::write_tenants().get_mut(&tenant_id) { + match tenants_state::read_tenants().get(&tenant_id) { Some(tenant) => tenant.repo.delete_timeline(timeline_id)?, None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"), } @@ -428,12 +480,10 @@ pub fn 
detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any // need to use crossbeam-channel for (timeline_id, join_handle) in walreceiver_join_handles { info!("waiting for wal receiver to shutdown timeline_id {timeline_id}"); - join_handle.recv().context("failed to join walreceiver")?; + join_handle.recv().ok(); info!("wal receiver shutdown confirmed timeline_id {timeline_id}"); } - tenants_state::write_tenants().remove(&tenant_id); - // If removal fails there will be no way to successfully retry detach, // because the tenant no longer exists in the in-memory map. And it needs to be removed from it // before we remove files, because it contains references to repository @@ -443,7 +493,7 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any let local_tenant_directory = conf.tenant_path(&tenant_id); std::fs::remove_dir_all(&local_tenant_directory).with_context(|| { format!( - "Failed to remove local timeline directory '{}'", + "Failed to remove local tenant directory '{}'", local_tenant_directory.display() ) })?; @@ -454,7 +504,7 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any /// /// Get list of tenants, for the mgmt API /// -pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec { +pub fn list_tenant_info(remote_index: &RemoteTimelineIndex) -> Vec { tenants_state::read_tenants() .iter() .map(|(id, tenant)| { @@ -478,98 +528,248 @@ pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec { .collect() } -/// Check if a given timeline is "broken" \[1\]. -/// The function returns an error if the timeline is "broken". -/// -/// \[1\]: it's not clear now how should we classify a timeline as broken. -/// A timeline is categorized as broken when any of following conditions is true: -/// - failed to load the timeline's metadata -/// - the timeline's disk consistent LSN is zero -fn check_broken_timeline( - conf: &'static PageServerConf, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, -) -> anyhow::Result { - let metadata = - load_metadata(conf, timeline_id, tenant_id).context("failed to load metadata")?; +/// Attempts to collect information about all tenant and timelines, existing on the local FS. +/// If finds any, deletes all temporary files and directories, created before. Also removes empty directories, +/// that may appear due to such removals. +/// Does not fail on particular timeline or tenant collection errors, rather logging them and ignoring the entities. +fn local_tenant_timeline_files( + config: &'static PageServerConf, +) -> anyhow::Result)>> { + let _entered = info_span!("local_tenant_timeline_files").entered(); - // A timeline with zero disk consistent LSN can happen when the page server - // failed to checkpoint the timeline import data when creating that timeline. - if metadata.disk_consistent_lsn() == Lsn::INVALID { - anyhow::bail!("Timeline {timeline_id} has a zero disk consistent LSN."); + let mut local_tenant_timeline_files = TenantTimelineValues::new(); + let tenants_dir = config.tenants_path(); + for tenants_dir_entry in std::fs::read_dir(&tenants_dir) + .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? 
+ { + match &tenants_dir_entry { + Ok(tenants_dir_entry) => { + let tenant_dir_path = tenants_dir_entry.path(); + if is_temporary(&tenant_dir_path) { + info!( + "Found temporary tenant directory, removing: {}", + tenant_dir_path.display() + ); + if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) { + error!( + "Failed to remove temporary directory '{}': {:?}", + tenant_dir_path.display(), + e + ); + } + } else { + match collect_timelines_for_tenant(config, &tenant_dir_path) { + Ok((tenant_id, collected_files)) => { + if collected_files.is_empty() { + match remove_if_empty(&tenant_dir_path) { + Ok(true) => info!("Removed empty tenant directory {}", tenant_dir_path.display()), + Ok(false) => { + // insert empty timeline entry: it has some non-temporary files inside that we cannot remove + // so make obvious for HTTP API callers, that something exists there and try to load the tenant + let _ = local_tenant_timeline_files.0.entry(tenant_id).or_default(); + }, + Err(e) => error!("Failed to remove empty tenant directory: {e:?}"), + } + } else { + local_tenant_timeline_files.0.entry(tenant_id).or_default().extend(collected_files.into_iter()) + } + }, + Err(e) => error!( + "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}", + tenants_dir.display(), + tenants_dir_entry, + e + ), + } + } + } + Err(e) => error!( + "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}", + tenants_dir_entry, + tenants_dir.display(), + e + ), + } } - Ok(metadata) + info!( + "Collected files for {} tenants", + local_tenant_timeline_files.0.len() + ); + Ok(local_tenant_timeline_files) } -/// Note: all timelines are attached at once if and only if all of them are locally complete -fn init_local_repository( - conf: &'static PageServerConf, - tenant_id: ZTenantId, - local_timeline_init_statuses: HashMap, - remote_index: &RemoteIndex, -) -> anyhow::Result<(), anyhow::Error> { - let mut timelines_to_attach = Vec::new(); - for (timeline_id, init_status) in local_timeline_init_statuses { - match init_status { - LocalTimelineInitStatus::LocallyComplete => { - debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository"); - let metadata = check_broken_timeline(conf, tenant_id, timeline_id) - .context("found broken timeline")?; - timelines_to_attach.push((timeline_id, metadata)); +fn remove_if_empty(tenant_dir_path: &Path) -> anyhow::Result { + let directory_is_empty = tenant_dir_path + .read_dir() + .with_context(|| { + format!( + "Failed to read directory '{}' contents", + tenant_dir_path.display() + ) + })? 
+ .next() + .is_none(); + + if directory_is_empty { + std::fs::remove_dir_all(&tenant_dir_path).with_context(|| { + format!( + "Failed to remove empty directory '{}'", + tenant_dir_path.display(), + ) + })?; + + Ok(true) + } else { + Ok(false) + } +} + +fn is_temporary(path: &Path) -> bool { + match path.file_name() { + Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX), + None => false, + } +} + +#[allow(clippy::type_complexity)] +fn collect_timelines_for_tenant( + config: &'static PageServerConf, + tenant_path: &Path, +) -> anyhow::Result<( + ZTenantId, + HashMap)>, +)> { + let tenant_id = tenant_path + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .context("Could not parse tenant id out of the tenant dir name")?; + let timelines_dir = config.timelines_path(&tenant_id); + + let mut tenant_timelines = HashMap::new(); + for timelines_dir_entry in std::fs::read_dir(&timelines_dir) + .with_context(|| format!("Failed to list timelines dir entry for tenant {tenant_id}"))? + { + match timelines_dir_entry { + Ok(timelines_dir_entry) => { + let timeline_dir = timelines_dir_entry.path(); + if is_temporary(&timeline_dir) { + info!( + "Found temporary timeline directory, removing: {}", + timeline_dir.display() + ); + if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { + error!( + "Failed to remove temporary directory '{}': {:?}", + timeline_dir.display(), + e + ); + } + } else { + match collect_timeline_files(&timeline_dir) { + Ok((timeline_id, metadata, timeline_files)) => { + tenant_timelines.insert(timeline_id, (metadata, timeline_files)); + } + Err(e) => { + error!( + "Failed to process timeline dir contents at '{}', reason: {:?}", + timeline_dir.display(), + e + ); + match remove_if_empty(&timeline_dir) { + Ok(true) => info!( + "Removed empty timeline directory {}", + timeline_dir.display() + ), + Ok(false) => (), + Err(e) => { + error!("Failed to remove empty timeline directory: {e:?}") + } + } + } + } + } } - LocalTimelineInitStatus::NeedsSync => { - debug!( - "timeline {tenant_id} for tenant {timeline_id} needs sync, \ - so skipped for adding into repository until sync is finished" - ); - return Ok(()); + Err(e) => { + error!("Failed to list timelines for entry tenant {tenant_id}, reason: {e:?}") } } } - // initialize local tenant - let repo = load_local_repo(conf, tenant_id, remote_index) - .with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?; - - // Lets fail here loudly to be on the safe side. - // XXX: It may be a better api to actually distinguish between repository startup - // and processing of newly downloaded timelines. - repo.init_attach_timelines(timelines_to_attach) - .with_context(|| format!("Failed to init local timelines for tenant {tenant_id}"))?; - Ok(()) -} - -// Sets up wal redo manager and repository for tenant. Reduces code duplication. -// Used during pageserver startup, or when new tenant is attached to pageserver. -fn load_local_repo( - conf: &'static PageServerConf, - tenant_id: ZTenantId, - remote_index: &RemoteIndex, -) -> anyhow::Result> { - let mut m = tenants_state::write_tenants(); - let tenant = m.entry(tenant_id).or_insert_with(|| { - // Set up a WAL redo manager, for applying WAL records. - let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); - - // Set up an object repository, for actual data storage. 
- let repo: Arc = Arc::new(Repository::new( - conf, - TenantConfOpt::default(), - Arc::new(walredo_mgr), - tenant_id, - remote_index.clone(), - conf.remote_storage_config.is_some(), - )); - Tenant { - state: TenantState::Idle, - repo, + if tenant_timelines.is_empty() { + match remove_if_empty(&timelines_dir) { + Ok(true) => info!( + "Removed empty tenant timelines directory {}", + timelines_dir.display() + ), + Ok(false) => (), + Err(e) => error!("Failed to remove empty tenant timelines directory: {e:?}"), } - }); + } - // Restore tenant config - let tenant_conf = Repository::load_tenant_config(conf, tenant_id)?; - tenant.repo.update_tenant_config(tenant_conf)?; - - Ok(Arc::clone(&tenant.repo)) + Ok((tenant_id, tenant_timelines)) +} + +// discover timeline files and extract timeline metadata +// NOTE: ephemeral files are excluded from the list +fn collect_timeline_files( + timeline_dir: &Path, +) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet)> { + let mut timeline_files = HashSet::new(); + let mut timeline_metadata_path = None; + + let timeline_id = timeline_dir + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .context("Could not parse timeline id out of the timeline dir name")?; + let timeline_dir_entries = + std::fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; + for entry in timeline_dir_entries { + let entry_path = entry.context("Failed to list timeline dir entry")?.path(); + if entry_path.is_file() { + if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) { + timeline_metadata_path = Some(entry_path); + } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) { + debug!("skipping ephemeral file {}", entry_path.display()); + continue; + } else if is_temporary(&entry_path) { + info!("removing temp timeline file at {}", entry_path.display()); + std::fs::remove_file(&entry_path).with_context(|| { + format!( + "failed to remove temp download file at {}", + entry_path.display() + ) + })?; + } else { + timeline_files.insert(entry_path); + } + } + } + + // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed + // then attach is lost. There would be no retries for that, + // initial collect will fail because there is no metadata. + // We either need to start download if we see empty dir after restart or attach caller should + // be aware of that and retry attach if awaits_download for timeline switched from true to false + // but timelinne didn't appear locally. + // Check what happens with remote index in that case. 
+ let timeline_metadata_path = match timeline_metadata_path { + Some(path) => path, + None => anyhow::bail!("No metadata file found in the timeline directory"), + }; + let metadata = TimelineMetadata::from_bytes( + &std::fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, + ) + .context("Failed to parse timeline metadata file bytes")?; + + anyhow::ensure!( + metadata.ancestor_timeline().is_some() || !timeline_files.is_empty(), + "Timeline has no ancestor and no layer files" + ); + + Ok((timeline_id, metadata, timeline_files)) } diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 11be13b80c..4e9a5fc6ec 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -34,11 +34,6 @@ async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { // Break if we're not allowed to write to disk let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - // TODO do this inside repo.compaction_iteration instead. - let _guard = match repo.file_lock.try_read() { - Ok(g) => g, - Err(_) => return Ok(ControlFlow::Break(())), - }; // Run compaction let compaction_period = repo.get_compaction_period(); @@ -233,11 +228,6 @@ async fn gc_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { // Break if we're not allowed to write to disk let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - // TODO do this inside repo.gc_iteration instead. - let _guard = match repo.file_lock.try_read() { - Ok(g) => g, - Err(_) => return Ok(ControlFlow::Break(())), - }; // Run gc let gc_period = repo.get_gc_period(); diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 936699c2ec..9356893908 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -3,6 +3,7 @@ // use anyhow::{bail, ensure, Context, Result}; +use remote_storage::path_with_suffix_extension; use std::{ fs, @@ -18,12 +19,12 @@ use utils::{ zid::{ZTenantId, ZTimelineId}, }; -use crate::import_datadir; use crate::tenant_mgr; use crate::CheckpointConfig; use crate::{ config::PageServerConf, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, }; +use crate::{import_datadir, TEMP_FILE_SUFFIX}; use crate::{ layered_repository::{Repository, Timeline}, walredo::WalRedoManager, @@ -105,13 +106,17 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { // fn bootstrap_timeline( conf: &'static PageServerConf, - tenantid: ZTenantId, - tli: ZTimelineId, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, repo: &Repository, ) -> Result> { - let initdb_path = conf - .tenant_path(&tenantid) - .join(format!("tmp-timeline-{}", tli)); + // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` + // temporary directory for basebackup files for the given timeline. + let initdb_path = path_with_suffix_extension( + conf.timelines_path(&tenant_id) + .join(format!("basebackup-{timeline_id}")), + TEMP_FILE_SUFFIX, + ); // Init temporarily repo to get bootstrap data run_initdb(conf, &initdb_path)?; @@ -123,7 +128,7 @@ fn bootstrap_timeline( // LSN, and any WAL after that. // Initdb lsn will be equal to last_record_lsn which will be set after import. // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. 
- let timeline = repo.create_empty_timeline(tli, lsn)?; + let timeline = repo.create_empty_timeline(timeline_id, lsn)?; import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; fail::fail_point!("before-checkpoint-new-timeline", |_| { @@ -134,7 +139,7 @@ fn bootstrap_timeline( info!( "created root timeline {} timeline.lsn {}", - tli, + timeline_id, timeline.get_last_record_lsn() ); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 4e49fd9373..dd946659bb 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -21,6 +21,7 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; +use remote_storage::path_with_suffix_extension; use serde::Serialize; use std::fs; use std::fs::OpenOptions; @@ -37,7 +38,6 @@ use std::time::Instant; use tracing::*; use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock, zid::ZTenantId}; -use crate::config::PageServerConf; use crate::metrics::{ WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, WAL_REDO_WAIT_TIME, }; @@ -45,6 +45,7 @@ use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::ZenithWalRecord; +use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, @@ -569,20 +570,24 @@ impl PostgresRedoProcess { // // Start postgres binary in special WAL redo mode. // - fn launch(conf: &PageServerConf, tenantid: &ZTenantId) -> Result { + fn launch(conf: &PageServerConf, tenant_id: &ZTenantId) -> Result { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. - let datadir = conf.tenant_path(tenantid).join("wal-redo-datadir"); + let datadir = path_with_suffix_extension( + conf.tenant_path(tenant_id).join("wal-redo-datadir"), + TEMP_FILE_SUFFIX, + ); // Create empty data directory for wal-redo postgres, deleting old one first. 
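// (Illustrative: with TEMP_FILE_SUFFIX the directory is named e.g. `wal-redo-datadir.___temp`, marking it as temporary.)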
if datadir.exists() { - info!("directory {:?} exists, removing", &datadir); - if let Err(e) = fs::remove_dir_all(&datadir) { - error!("could not remove old wal-redo-datadir: {:#}", e); - } + info!( + "old temporary datadir {} exists, removing", + datadir.display() + ); + fs::remove_dir_all(&datadir)?; } - info!("running initdb in {:?}", datadir.display()); + info!("running initdb in {}", datadir.display()); let initdb = Command::new(conf.pg_bin_dir().join("initdb")) .args(&["-D", &datadir.to_string_lossy()]) .arg("-N") @@ -591,7 +596,7 @@ impl PostgresRedoProcess { .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) .close_fds() .output() - .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {}", e)))?; + .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?; if !initdb.status.success() { return Err(Error::new( @@ -645,7 +650,7 @@ impl PostgresRedoProcess { })?; info!( - "launched WAL redo postgres process on {:?}", + "launched WAL redo postgres process on {}", datadir.display() ); diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 4aba2494e9..1d083b3ef9 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -32,33 +32,34 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # Leave the first timeline alone, but corrupt the others in different ways (tenant0, timeline0, pg0) = tenant_timelines[0] + log.info(f"Timeline {tenant0}/{timeline0} is left intact") - # Corrupt metadata file on timeline 1 (tenant1, timeline1, pg1) = tenant_timelines[1] - metadata_path = "{}/tenants/{}/timelines/{}/metadata".format(env.repo_dir, tenant1, timeline1) - print(f"overwriting metadata file at {metadata_path}") + metadata_path = f"{env.repo_dir}/tenants/{tenant1}/timelines/{timeline1}/metadata" f = open(metadata_path, "w") f.write("overwritten with garbage!") f.close() + log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled") - # Missing layer files file on timeline 2. (This would actually work - # if we had Cloud Storage enabled in this test.) (tenant2, timeline2, pg2) = tenant_timelines[2] - timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant2, timeline2) + timeline_path = f"{env.repo_dir}/tenants/{tenant2}/timelines/{timeline2}/" for filename in os.listdir(timeline_path): if filename.startswith("00000"): # Looks like a layer file. Remove it os.remove(f"{timeline_path}/{filename}") + log.info( + f"Timeline {tenant2}/{timeline2} got its layer files removed (no remote storage enabled)" + ) - # Corrupt layer files file on timeline 3 (tenant3, timeline3, pg3) = tenant_timelines[3] - timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant3, timeline3) + timeline_path = f"{env.repo_dir}/tenants/{tenant3}/timelines/{timeline3}/" for filename in os.listdir(timeline_path): if filename.startswith("00000"): # Looks like a layer file. 
Corrupt it f = open(f"{timeline_path}/{filename}", "w") f.write("overwritten with garbage!") f.close() + log.info(f"Timeline {tenant3}/{timeline3} got its layer files spoiled") env.pageserver.start() @@ -69,20 +70,28 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # But all others are broken # First timeline would not get loaded into pageserver due to corrupt metadata file - (_tenant, _timeline, pg) = tenant_timelines[1] with pytest.raises( Exception, match=f"Could not get timeline {timeline1} in tenant {tenant1}" ) as err: - pg.start() + pg1.start() + log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") + + # Second timeline has no ancestors, only the metadata file and no layer files + # We don't have the remote storage enabled, which means timeline is in an incorrect state, + # it's not loaded at all + with pytest.raises( + Exception, match=f"Could not get timeline {timeline2} in tenant {tenant2}" + ) as err: + pg2.start() log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") # Yet other timelines will fail when their layers will be queried during basebackup: we don't check layer file contents on startup, when loading the timeline - for n in range(2, 4): - (_tenant, _timeline, pg) = tenant_timelines[n] + for n in range(3, 4): + (bad_tenant, bad_timeline, pg) = tenant_timelines[n] with pytest.raises(Exception, match="extracting base backup failed") as err: pg.start() log.info( - f"compute startup failed lazily for timeline with corrupt layers, during basebackup preparation: {err}" + f"compute startup failed lazily for timeline {bad_tenant}/{bad_timeline} with corrupt layers, during basebackup preparation: {err}" ) @@ -107,6 +116,8 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): tenant_id, _ = env.neon_cli.create_tenant() + old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + # Introduce failpoint when creating a new timeline env.pageserver.safe_psql("failpoints before-checkpoint-new-timeline=return") with pytest.raises(Exception, match="before-checkpoint-new-timeline"): @@ -116,6 +127,8 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): env.neon_cli.pageserver_stop(immediate=True) env.neon_cli.pageserver_start() - # Check that tenant with "broken" timeline is not loaded. - with pytest.raises(Exception, match=f"Failed to get repo for tenant {tenant_id}"): - env.neon_cli.list_timelines(tenant_id) + # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. 
+ new_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + assert ( + new_tenant_timelines == old_tenant_timelines + ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" From 31ec3b790686615448ee2d00e5b4b9b5ce143b74 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 5 Sep 2022 10:13:36 +0300 Subject: [PATCH 0748/1022] Use the toolchain file to define current rustc version used --- .dockerignore | 1 + README.md | 19 +++++++++++++++---- rust-toolchain.toml | 7 +++++++ 3 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 rust-toolchain.toml diff --git a/.dockerignore b/.dockerignore index 9f8a22d598..4bc8e5fa13 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,5 +1,6 @@ * +!rust-toolchain.toml !Cargo.toml !Cargo.lock !Makefile diff --git a/README.md b/README.md index 57d0a144cb..eb13b111f5 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,17 @@ brew install libpq brew link --force libpq ``` +#### Rustc version + +The project uses [rust toolchain file](./rust-toolchain.toml) to define the version it's built with in CI for testing and local builds. + +This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file. + +rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. + +non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file. +Never rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates. + #### Building on Linux 1. Build neon and patched postgres @@ -78,9 +89,9 @@ brew link --force libpq git clone --recursive https://github.com/neondatabase/neon.git cd neon -# The preferred and default is to make a debug build. This will create a +# The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. If you want to use a release -# build, utilize "BUILD_TYPE=release make -j`nproc`" +# build, utilize "BUILD_TYPE=release make -j`nproc`" make -j`nproc` ``` @@ -94,9 +105,9 @@ make -j`nproc` git clone --recursive https://github.com/neondatabase/neon.git cd neon -# The preferred and default is to make a debug build. This will create a +# The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. If you want to use a release -# build, utilize "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" +# build, utilize "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" make -j`sysctl -n hw.logicalcpu` ``` diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000000..ee699464c6 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,7 @@ +[toolchain] +channel = "1.60" +profile = "default" +# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
+# https://rust-lang.github.io/rustup/concepts/profiles.html +# but we also need `llvm-tools-preview` for coverage data merges on CI +components = ["llvm-tools-preview", "rustfmt", "clippy"] From 923f642549c9b3b96cb53b959f34f2cb47d799e1 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 5 Sep 2022 11:18:22 +0300 Subject: [PATCH 0749/1022] Collect cargo build timings --- .github/workflows/build_and_test.yml | 27 +++++++++++++++++++++------ .github/workflows/codestyle.yml | 5 ++--- README.md | 10 +++++----- rust-toolchain.toml | 8 +++++++- test_runner/fixtures/utils.py | 5 ++++- 5 files changed, 39 insertions(+), 16 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bf9de7d857..7ee694fa16 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -54,7 +54,11 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - rust_toolchain: [ 1.58 ] + # TODO this version is currently needed to make build statuses more informative + # and to clear cargo caches in a more transparent way. + # We should rather read this value from the file in the root of the repo, `rust-toolchain.toml` since it's + # truly setting what version of compiler the sources are built with + rust_toolchain: [ '1.60' ] env: BUILD_TYPE: ${{ matrix.build_type }} @@ -100,11 +104,11 @@ jobs: if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" CARGO_FEATURES="" - CARGO_FLAGS="--locked" + CARGO_FLAGS="--locked --timings" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" CARGO_FEATURES="--features profiling" - CARGO_FLAGS="--locked --release $CARGO_FEATURES" + CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV @@ -218,6 +222,17 @@ jobs: name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact path: /tmp/neon + - name: Prepare cargo build timing stats for storing + run: | + mkdir -p "/tmp/neon/cargo-timings/$BUILD_TYPE/" + cp -r ./target/cargo-timings/* "/tmp/neon/cargo-timings/$BUILD_TYPE/" + shell: bash -euxo pipefail {0} + - name: Upload cargo build stats + uses: ./.github/actions/upload + with: + name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-build-stats + path: /tmp/neon/cargo-timings/ + # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data if: matrix.build_type == 'debug' @@ -233,7 +248,7 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - rust_toolchain: [ 1.58 ] + rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -269,7 +284,7 @@ jobs: fail-fast: false matrix: build_type: [ release ] - rust_toolchain: [ 1.58 ] + rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -341,7 +356,7 @@ jobs: fail-fast: false matrix: build_type: [ debug ] - rust_toolchain: [ 1.58 ] + rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index bc21054e18..ac6bfe655f 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -24,9 +24,8 @@ jobs: strategy: fail-fast: false matrix: - # If we want to duplicate this job for different - # Rust toolchains (e.g. nightly or 1.37.0), add them here. 
- rust_toolchain: [1.58] + # TODO read from `rust-toolchain.toml` and do the same in the build and test workflow too. + rust_toolchain: ['1.60'] os: [ubuntu-latest, macos-latest] # To support several Postgres versions, add them here. postgres_version: [v14, v15] diff --git a/README.md b/README.md index eb13b111f5..977afc2a2c 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ This file is automatically picked up by [`rustup`](https://rust-lang.github.io/r rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file. -Never rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates. +Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates. #### Building on Linux @@ -90,8 +90,8 @@ git clone --recursive https://github.com/neondatabase/neon.git cd neon # The preferred and default is to make a debug build. This will create a -# demonstrably slower build than a release build. If you want to use a release -# build, utilize "BUILD_TYPE=release make -j`nproc`" +# demonstrably slower build than a release build. For a release build, +# use "BUILD_TYPE=release make -j`nproc`" make -j`nproc` ``` @@ -106,8 +106,8 @@ git clone --recursive https://github.com/neondatabase/neon.git cd neon # The preferred and default is to make a debug build. This will create a -# demonstrably slower build than a release build. If you want to use a release -# build, utilize "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" +# demonstrably slower build than a release build. For a release build, +# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" make -j`sysctl -n hw.logicalcpu` ``` diff --git a/rust-toolchain.toml b/rust-toolchain.toml index ee699464c6..8023348aae 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,11 @@ [toolchain] -channel = "1.60" +# We try to stick to a toolchain version that is widely available on popular distributions, so that most people +# can use the toolchain that comes with their operating system. But if there's a feature we miss badly from a later +# version, we can consider updating. As of this writing, 1.60 is available on Debian 'experimental' but not yet on +# 'testing' or even 'unstable', which is a bit more cutting-edge than we'd like. Hopefully the 1.60 packages reach +# 'testing' soon (and similarly for the other distributions). +# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package. +channel = "1.60" # do update CI matrix values when updating this profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 726116e53c..5fb91344ad 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -155,7 +155,7 @@ def get_scale_for_db(size_mb: int) -> int: ATTACHMENT_NAME_REGEX = re.compile( - r".+\.log|.+\.stderr|.+\.stdout|.+\.filediff|.+\.metrics|flamegraph\.svg|regression\.diffs" + r".+\.log|.+\.stderr|.+\.stdout|.+\.filediff|.+\.metrics|flamegraph\.svg|regression\.diffs|.+\.html" ) @@ -180,6 +180,9 @@ def allure_attach_from_dir(dir: Path): elif source.endswith(".svg"): attachment_type = "image/svg+xml" extension = "svg" + elif source.endswith(".html"): + attachment_type = "text/html" + extension = "html" else: attachment_type = "text/plain" extension = attachment.suffix.removeprefix(".") From 648e86e9df9c06f3a961cdcca6f1c23f88272b6e Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 9 Sep 2022 16:02:29 +0300 Subject: [PATCH 0750/1022] Use Debian images with libc 2.31 to build legacy compute tools --- Dockerfile.compute-node.legacy | 4 ++-- rust-toolchain.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile.compute-node.legacy b/Dockerfile.compute-node.legacy index 7689167156..6653d81019 100644 --- a/Dockerfile.compute-node.legacy +++ b/Dockerfile.compute-node.legacy @@ -22,7 +22,7 @@ FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps # # Image with Postgres build deps # -FROM debian:buster-slim AS build-deps +FROM debian:bullseye-slim AS build-deps RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ libcurl4-openssl-dev libossp-uuid-dev @@ -59,7 +59,7 @@ WORKDIR /pg # # Final compute node image to be exported # -FROM debian:buster-slim +FROM debian:bullseye-slim # libreadline-dev is required to run psql RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 8023348aae..1a27e92fec 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -5,7 +5,7 @@ # 'testing' or even 'unstable', which is a bit more cutting-edge than we'd like. Hopefully the 1.60 packages reach # 'testing' soon (and similarly for the other distributions). # See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package. -channel = "1.60" # do update CI matrix values when updating this +channel = "1.60" # do update GitHub CI cache values for rust builds, when changing this value profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From 18dafbb9ba0f49e65b6382acf009255a13861eab Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 9 Sep 2022 16:47:09 +0300 Subject: [PATCH 0751/1022] Remove deceiving rust version from the CI files --- .../actions/run-python-test-set/action.yml | 5 +---- .github/workflows/build_and_test.yml | 22 +++++-------------- .github/workflows/codestyle.yml | 17 +++++--------- 3 files changed, 13 insertions(+), 31 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index f04f5d11b8..4c18641938 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -5,9 +5,6 @@ inputs: build_type: description: 'Type of Rust (neon) and C (postgres) builds. 
Must be "release" or "debug", or "remote" for the remote cluster' required: true - rust_toolchain: - description: 'Rust toolchain version to fetch the caches' - required: false test_selection: description: 'A python test suite to run' required: true @@ -55,7 +52,7 @@ runs: if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact + name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact path: /tmp/neon - name: Checkout diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7ee694fa16..d586741d68 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -54,11 +54,6 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - # TODO this version is currently needed to make build statuses more informative - # and to clear cargo caches in a more transparent way. - # We should rather read this value from the file in the root of the repo, `rust-toolchain.toml` since it's - # truly setting what version of compiler the sources are built with - rust_toolchain: [ '1.60' ] env: BUILD_TYPE: ${{ matrix.build_type }} @@ -130,8 +125,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + v8-${{ runner.os }}-${{ matrix.build_type }}-cargo- - name: Cache postgres v14 build id: cache_pg_14 @@ -219,7 +214,7 @@ jobs: - name: Upload Neon artifact uses: ./.github/actions/upload with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact + name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact path: /tmp/neon - name: Prepare cargo build timing stats for storing @@ -230,7 +225,7 @@ jobs: - name: Upload cargo build stats uses: ./.github/actions/upload with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-build-stats + name: neon-${{ runner.os }}-${{ matrix.build_type }}-build-stats path: /tmp/neon/cargo-timings/ # XXX: keep this after the binaries.list is formed, so the coverage can properly work later @@ -248,7 +243,6 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -260,7 +254,6 @@ jobs: uses: ./.github/actions/run-python-test-set with: build_type: ${{ matrix.build_type }} - rust_toolchain: ${{ matrix.rust_toolchain }} test_selection: regress needs_postgres_source: true run_with_real_s3: true @@ -284,7 +277,6 @@ jobs: fail-fast: false matrix: build_type: [ release ] - rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -296,7 +288,6 @@ jobs: uses: ./.github/actions/run-python-test-set with: build_type: ${{ matrix.build_type }} - rust_toolchain: ${{ matrix.rust_toolchain }} test_selection: performance run_in_parallel: false save_perf_report: true @@ -356,7 +347,6 @@ jobs: fail-fast: false matrix: build_type: [ debug ] - rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -373,12 +363,12 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') 
}} + key: v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact + name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact path: /tmp/neon - name: Get coverage artifact diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index ac6bfe655f..53d0f9c5d8 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -24,8 +24,11 @@ jobs: strategy: fail-fast: false matrix: - # TODO read from `rust-toolchain.toml` and do the same in the build and test workflow too. - rust_toolchain: ['1.60'] + # XXX: both OSes have rustup + # * https://github.com/actions/runner-images/blob/main/images/macos/macos-12-Readme.md#rust-tools + # * https://github.com/actions/runner-images/blob/main/images/linux/Ubuntu2204-Readme.md#rust-tools + # this is all we need to install our toolchain later via rust-toolchain.toml + # so don't install any toolchain explicitly. os: [ubuntu-latest, macos-latest] # To support several Postgres versions, add them here. postgres_version: [v14, v15] @@ -40,14 +43,6 @@ jobs: submodules: true fetch-depth: 2 - - name: Install rust toolchain ${{ matrix.rust_toolchain }} - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: ${{ matrix.rust_toolchain }} - components: rustfmt, clippy - override: true - - name: Check formatting run: cargo fmt --all -- --check @@ -106,7 +101,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git target - key: v3-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} + key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust - name: Run cargo clippy run: ./run_clippy.sh From a48f9f377df5c076f0f6afa8b1812709ea334d35 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 10 Sep 2022 01:23:19 +0300 Subject: [PATCH 0752/1022] Fix typo in issue template --- .github/ISSUE_TEMPLATE/epic-template.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md index 33ad7b1ef5..7707e0aa67 100644 --- a/.github/ISSUE_TEMPLATE/epic-template.md +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -1,6 +1,6 @@ --- name: Epic Template -about: A set of related tasks contributing towards specific outcome, comprizing of +about: A set of related tasks contributing towards specific outcome, comprising of more than 1 week of work. 
title: 'Epic: ' labels: t/Epic From 698d6d0badad9aa2a12b033a33d28c19ffaec79c Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 12 Sep 2022 00:07:34 +0300 Subject: [PATCH 0753/1022] Use stable coverage API with rustc 1.60 --- scripts/coverage | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/scripts/coverage b/scripts/coverage index af0d067419..1dc92e57cc 100755 --- a/scripts/coverage +++ b/scripts/coverage @@ -75,8 +75,6 @@ class Cargo: def rustlib_dir(self) -> Path: if not self._rustlib_dir: cmd = [ - 'cargo', - '-Zunstable-options', 'rustc', '--print=target-libdir', ] @@ -397,7 +395,7 @@ class State: # Enable LLVM's source-based coverage # see: https://clang.llvm.org/docs/SourceBasedCodeCoverage.html # see: https://blog.rust-lang.org/inside-rust/2020/11/12/source-based-code-coverage.html - '-Zinstrument-coverage', + '-Cinstrument-coverage', # Link every bit of code to prevent "holes" in coverage report # see: https://doc.rust-lang.org/rustc/codegen-options/index.html#link-dead-code '-Clink-dead-code', @@ -410,10 +408,6 @@ class State: f'--remap-path-prefix {self.cwd}=', ]) - # XXX: God, have mercy on our souls... - # see: https://github.com/rust-lang/rust/pull/90132 - os.environ['RUSTC_BOOTSTRAP'] = '1' - def _merge_profraw(self) -> bool: profdata_path = self.profdata_dir / '-'.join([ self.profraw_prefix, From 40c845e57d7060b1946e3a9e9d6bf076a8847e52 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 11 Sep 2022 21:48:01 +0300 Subject: [PATCH 0754/1022] Switch to async for all concurrency in the pageserver. Instead of spawning helper threads, we now use Tokio tasks. There are multiple Tokio runtimes, for different kinds of tasks. One for serving libpq client connections, another for background operations like GC and compaction, and so on. That's not strictly required; we could use just one runtime, but with this you can still get an overview of what's happening with "top -H". There's one subtle behavior change in how TenantState is updated. Before this patch, if you deleted all timelines from a tenant, its GC and compaction loops were stopped, and the tenant went back to Idle state. We no longer do that. The empty tenant stays Active. The changes to test_tenant_tasks.py are related to that. There's still plenty of synchronous code and blocking. For example, we still use blocking std::io functions for all file I/O, and the communication with WAL redo processes still uses low-level unix poll(). We might want to rewrite those later, but this will do for now. The model is that local file I/O is considered to be fast enough that blocking - and preventing other tasks running in the same thread - is acceptable.
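For readers not familiar with this setup, here is a minimal, illustrative sketch of the pattern described above; it is not the code added by this patch. The names BACKGROUND_RUNTIME and shutdown_watcher() mirror the new task_mgr module, but the bodies are simplified, and the once_cell dependency, the watch-channel shutdown flag and spawn_background_task() are assumptions made only for this example (the real task_mgr additionally records which tenant and timeline each task serves, so shutdown_tasks() can target them selectively).

```rust
// Sketch only: a simplified stand-in for the per-purpose runtimes and the
// shutdown-aware task loop introduced by this patch. Assumes tokio with the
// "full" feature set and the `once_cell` crate.
use once_cell::sync::Lazy;
use std::time::Duration;
use tokio::runtime::{Builder, Runtime};
use tokio::sync::watch;

// One named runtime per kind of work; the thread name is what makes the
// per-purpose breakdown visible in `top -H`.
static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    Builder::new_multi_thread()
        .thread_name("background op worker")
        .enable_all()
        .build()
        .expect("failed to create background runtime")
});

// Hypothetical global shutdown flag; flipped to `true` on pageserver shutdown.
static SHUTDOWN: Lazy<(watch::Sender<bool>, watch::Receiver<bool>)> =
    Lazy::new(|| watch::channel(false));

// Resolves once shutdown has been requested (or the sender is gone).
async fn shutdown_watcher() {
    let mut rx = SHUTDOWN.1.clone();
    while !*rx.borrow() {
        if rx.changed().await.is_err() {
            break;
        }
    }
}

// Hypothetical background loop (think GC or compaction).
fn spawn_background_task() {
    let _ = BACKGROUND_RUNTIME.spawn(async {
        loop {
            tokio::select! {
                // Exit promptly when shutdown is requested.
                _ = shutdown_watcher() => break,
                _ = tokio::time::sleep(Duration::from_secs(10)) => {
                    // Periodic work goes here; blocking std::io code can be
                    // pushed onto the blocking thread pool.
                    let _ = tokio::task::spawn_blocking(|| {
                        // blocking file I/O
                    })
                    .await;
                }
            }
        }
    });
}
```

As noted above, keeping separate runtimes (with named worker threads, as in the sketch) is what preserves the per-kind overview in "top -H" even though everything now runs as async tasks.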
--- Cargo.lock | 15 +- docs/pageserver-thread-mgmt.md | 47 +- libs/utils/Cargo.toml | 2 + libs/utils/src/lib.rs | 4 +- libs/utils/src/postgres_backend_async.rs | 485 +++++++++++++++ libs/utils/src/seqwait.rs | 53 +- libs/utils/src/seqwait_async.rs | 224 ------- pageserver/Cargo.toml | 5 +- pageserver/src/basebackup.rs | 5 +- pageserver/src/bin/pageserver.rs | 77 ++- pageserver/src/http/routes.rs | 35 +- pageserver/src/layered_repository.rs | 44 +- pageserver/src/layered_repository/timeline.rs | 180 +++--- pageserver/src/lib.rs | 27 +- pageserver/src/page_service.rs | 551 +++++++++--------- pageserver/src/storage_sync.rs | 71 +-- pageserver/src/storage_sync/upload.rs | 2 +- pageserver/src/task_mgr.rs | 463 +++++++++++++++ pageserver/src/tenant_mgr.rs | 255 +++----- pageserver/src/tenant_tasks.rs | 306 +++------- pageserver/src/thread_mgr.rs | 409 ------------- pageserver/src/timelines.rs | 49 +- pageserver/src/walreceiver.rs | 291 ++------- .../src/walreceiver/connection_manager.rs | 87 ++- .../src/walreceiver/walreceiver_connection.rs | 75 +-- test_runner/regress/test_tenant_tasks.py | 15 +- workspace_hack/Cargo.toml | 4 +- 27 files changed, 1840 insertions(+), 1941 deletions(-) create mode 100644 libs/utils/src/postgres_backend_async.rs delete mode 100644 libs/utils/src/seqwait_async.rs create mode 100644 pageserver/src/task_mgr.rs delete mode 100644 pageserver/src/thread_mgr.rs diff --git a/Cargo.lock b/Cargo.lock index 563a998601..e9ebcdc5ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1831,6 +1831,8 @@ name = "pageserver" version = "0.1.0" dependencies = [ "anyhow", + "async-stream", + "async-trait", "byteorder", "bytes", "chrono", @@ -1871,6 +1873,7 @@ dependencies = [ "thiserror", "tokio", "tokio-postgres", + "tokio-util", "toml_edit", "tracing", "url", @@ -3481,9 +3484,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.34" +version = "0.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d0ecdcb44a79f0fe9844f0c4f33a342cbcbb5117de8001e6ba0dc2351327d09" +checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307" dependencies = [ "cfg-if", "log", @@ -3505,11 +3508,11 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.26" +version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f54c8ca710e81886d498c2fd3331b56c93aa248d49de2222ad2742247c60072f" +checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7" dependencies = [ - "lazy_static", + "once_cell", "valuable", ] @@ -3626,6 +3629,7 @@ name = "utils" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", "bincode", "byteorder", "bytes", @@ -3653,6 +3657,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", + "tokio-rustls", "tracing", "tracing-subscriber", "workspace_hack", diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md index 9ee3e40085..e351c972cb 100644 --- a/docs/pageserver-thread-mgmt.md +++ b/docs/pageserver-thread-mgmt.md @@ -1,26 +1,39 @@ ## Thread management -Each thread in the system is tracked by the `thread_mgr` module. It -maintains a registry of threads, and which tenant or timeline they are -operating on. This is used for safe shutdown of a tenant, or the whole -system. +The pageserver uses Tokio for handling concurrency. Everything runs in +Tokio tasks, although some parts are written in blocking style and use +spawn_blocking(). 
+ +Each Tokio task is tracked by the `task_mgr` module. It maintains a +registry of tasks, and which tenant or timeline they are operating +on. ### Handling shutdown -When a tenant or timeline is deleted, we need to shut down all threads -operating on it, before deleting the data on disk. A thread registered -in the thread registry can check if it has been requested to shut down, -by calling `is_shutdown_requested()`. For async operations, there's also -a `shudown_watcher()` async task that can be used to wake up on shutdown. +When a tenant or timeline is deleted, we need to shut down all tasks +operating on it, before deleting the data on disk. There's a function, +`shutdown_tasks`, to request all tasks of a particular tenant or +timeline to shutdown. It will also wait for them to finish. + +A task registered in the task registry can check if it has been +requested to shut down, by calling `is_shutdown_requested()`. There's +also a `shudown_watcher()` Future that can be used with `tokio::select!` +or similar, to wake up on shutdown. + ### Sync vs async -The primary programming model in the page server is synchronous, -blocking code. However, there are some places where async code is -used. Be very careful when mixing sync and async code. - -Async is primarily used to wait for incoming data on network -connections. For example, all WAL receivers have a shared thread pool, -with one async Task for each connection. Once a piece of WAL has been -received from the network, the thread calls the blocking functions in +We use async to wait for incoming data on network connections, and to +perform other long-running operations. For example, each WAL receiver +connection is handled by a tokio Task. Once a piece of WAL has been +received from the network, the task calls the blocking functions in the Repository to process the WAL. + +The core storage code in `layered_repository/` is synchronous, with +blocking locks and I/O calls. The current model is that we consider +disk I/Os to be short enough that we perform them while running in a +Tokio task. If that becomes a problem, we should use `spawn_blocking` +before entering the synchronous parts of the code, or switch to using +tokio I/O functions. + +Be very careful when mixing sync and async code! diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 28ad658de4..ce55277f29 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] +async-trait = "0.1" anyhow = "1.0" bincode = "1.3" bytes = "1.0.1" @@ -16,6 +17,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1" thiserror = "1.0" tokio = { version = "1.17", features = ["macros"]} +tokio-rustls = "0.23" tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } nix = "0.23.0" diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index fa7a37adf1..caa7ac6c09 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -14,11 +14,9 @@ pub mod simple_rcu; /// append only ordered map implemented with a Vec pub mod vec_map; -// Async version of SeqWait. Currently unused. 
-// pub mod seqwait_async; - pub mod bin_ser; pub mod postgres_backend; +pub mod postgres_backend_async; pub mod pq_proto; // dealing with connstring parsing and handy access to it's parts diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs new file mode 100644 index 0000000000..383ad3742f --- /dev/null +++ b/libs/utils/src/postgres_backend_async.rs @@ -0,0 +1,485 @@ +//! Server-side asynchronous Postgres connection, as limited as we need. +//! To use, create PostgresBackend and run() it, passing the Handler +//! implementation determining how to process the queries. Currently its API +//! is rather narrow, but we can extend it once required. + +use crate::postgres_backend::AuthType; +use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; +use anyhow::{bail, Context, Result}; +use bytes::{Bytes, BytesMut}; +use rand::Rng; +use std::future::Future; +use std::net::SocketAddr; +use std::pin::Pin; +use std::sync::Arc; +use std::task::Poll; +use tracing::{debug, error, trace}; + +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio_rustls::TlsAcceptor; + +#[async_trait::async_trait] +pub trait Handler { + /// Handle single query. + /// postgres_backend will issue ReadyForQuery after calling this (this + /// might be not what we want after CopyData streaming, but currently we don't + /// care). + async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>; + + /// Called on startup packet receival, allows to process params. + /// + /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users + /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow + /// to override whole init logic in implementations. + fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> { + Ok(()) + } + + /// Check auth md5 + fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> { + bail!("MD5 auth failed") + } + + /// Check auth jwt + fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> { + bail!("JWT auth failed") + } +} + +/// PostgresBackend protocol state. +/// XXX: The order of the constructors matters. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] +pub enum ProtoState { + Initialization, + Encrypted, + Authentication, + Established, + Closed, +} + +#[derive(Clone, Copy)] +pub enum ProcessMsgResult { + Continue, + Break, +} + +/// Always-writeable sock_split stream. +/// May not be readable. 
See [`PostgresBackend::take_stream_in`] +pub enum Stream { + Unencrypted(tokio::net::TcpStream), + Tls(Box>), + Broken, +} + +impl AsyncWrite for Stream { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf), + Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf), + Self::Broken => unreachable!(), + } + } + fn poll_flush( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx), + Self::Tls(stream) => Pin::new(stream).poll_flush(cx), + Self::Broken => unreachable!(), + } + } + fn poll_shutdown( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx), + Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx), + Self::Broken => unreachable!(), + } + } +} +impl AsyncRead for Stream { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf), + Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf), + Self::Broken => unreachable!(), + } + } +} + +pub struct PostgresBackend { + stream: Stream, + // Output buffer. c.f. BeMessage::write why we are using BytesMut here. + buf_out: BytesMut, + + pub state: ProtoState, + + md5_salt: [u8; 4], + auth_type: AuthType, + + peer_addr: SocketAddr, + pub tls_config: Option>, +} + +pub fn query_from_cstring(query_string: Bytes) -> Vec { + let mut query_string = query_string.to_vec(); + if let Some(ch) = query_string.last() { + if *ch == 0 { + query_string.pop(); + } + } + query_string +} + +// Cast a byte slice to a string slice, dropping null terminator if there's one. +fn cstr_to_str(bytes: &[u8]) -> Result<&str> { + let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); + std::str::from_utf8(without_null).map_err(|e| e.into()) +} + +impl PostgresBackend { + pub fn new( + socket: tokio::net::TcpStream, + auth_type: AuthType, + tls_config: Option>, + ) -> std::io::Result { + let peer_addr = socket.peer_addr()?; + + Ok(Self { + stream: Stream::Unencrypted(socket), + buf_out: BytesMut::with_capacity(10 * 1024), + state: ProtoState::Initialization, + md5_salt: [0u8; 4], + auth_type, + tls_config, + peer_addr, + }) + } + + pub fn get_peer_addr(&self) -> &SocketAddr { + &self.peer_addr + } + + /// Read full message or return None if connection is closed. + pub async fn read_message(&mut self) -> Result> { + use ProtoState::*; + match self.state { + Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await, + Authentication | Established => FeMessage::read_fut(&mut self.stream).await, + Closed => Ok(None), + } + } + + /// Flush output buffer into the socket. + pub async fn flush(&mut self) -> std::io::Result<&mut Self> { + self.stream.write_all(&self.buf_out).await?; + self.buf_out.clear(); + Ok(self) + } + + /// Write message into internal output buffer. 
+ pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> { + BeMessage::write(&mut self.buf_out, message)?; + Ok(self) + } + + // Wrapper for run_message_loop() that shuts down socket when we are done + pub async fn run(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()> + where + F: Fn() -> S, + S: Future, + { + let ret = self.run_message_loop(handler, shutdown_watcher).await; + let _ = self.stream.shutdown(); + ret + } + + async fn run_message_loop( + &mut self, + handler: &mut impl Handler, + shutdown_watcher: F, + ) -> Result<()> + where + F: Fn() -> S, + S: Future, + { + trace!("postgres backend to {:?} started", self.peer_addr); + + tokio::select!( + biased; + + _ = shutdown_watcher() => { + // We were requested to shut down. + tracing::info!("shutdown request received during handshake"); + return Ok(()) + }, + + result = async { + while self.state < ProtoState::Established { + if let Some(msg) = self.read_message().await? { + trace!("got message {msg:?} during handshake"); + + match self.process_handshake_message(handler, msg).await? { + ProcessMsgResult::Continue => { + self.flush().await?; + continue; + } + ProcessMsgResult::Break => { + trace!("postgres backend to {:?} exited during handshake", self.peer_addr); + return Ok(()); + } + } + } else { + trace!("postgres backend to {:?} exited during handshake", self.peer_addr); + return Ok(()); + } + } + Ok::<(), anyhow::Error>(()) + } => { + // Handshake complete. + result?; + } + ); + + // Authentication completed + let mut query_string = Bytes::new(); + while let Some(msg) = tokio::select!( + biased; + _ = shutdown_watcher() => { + // We were requested to shut down. + tracing::info!("shutdown request received in run_message_loop"); + Ok(None) + }, + msg = self.read_message() => { msg }, + )? { + trace!("got message {:?}", msg); + + let result = self.process_message(handler, msg, &mut query_string).await; + self.flush().await?; + match result? { + ProcessMsgResult::Continue => { + self.flush().await?; + continue; + } + ProcessMsgResult::Break => break, + } + } + + trace!("postgres backend to {:?} exited", self.peer_addr); + Ok(()) + } + + async fn start_tls(&mut self) -> anyhow::Result<()> { + if let Stream::Unencrypted(plain_stream) = + std::mem::replace(&mut self.stream, Stream::Broken) + { + let acceptor = TlsAcceptor::from(self.tls_config.clone().unwrap()); + let tls_stream = acceptor.accept(plain_stream).await?; + + self.stream = Stream::Tls(Box::new(tls_stream)); + return Ok(()); + }; + bail!("TLS already started"); + } + + async fn process_handshake_message( + &mut self, + handler: &mut impl Handler, + msg: FeMessage, + ) -> Result { + assert!(self.state < ProtoState::Established); + let have_tls = self.tls_config.is_some(); + match msg { + FeMessage::StartupPacket(m) => { + trace!("got startup message {m:?}"); + + match m { + FeStartupPacket::SslRequest => { + debug!("SSL requested"); + + self.write_message(&BeMessage::EncryptionResponse(have_tls))?; + if have_tls { + self.start_tls().await?; + self.state = ProtoState::Encrypted; + } + } + FeStartupPacket::GssEncRequest => { + debug!("GSS requested"); + self.write_message(&BeMessage::EncryptionResponse(false))?; + } + FeStartupPacket::StartupMessage { .. 
} => { + if have_tls && !matches!(self.state, ProtoState::Encrypted) { + self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?; + bail!("client did not connect with TLS"); + } + + // NB: startup() may change self.auth_type -- we are using that in proxy code + // to bypass auth for new users. + handler.startup(self, &m)?; + + match self.auth_type { + AuthType::Trust => { + self.write_message(&BeMessage::AuthenticationOk)? + .write_message(&BeParameterStatusMessage::encoding())? + // The async python driver requires a valid server_version + .write_message(&BeMessage::ParameterStatus( + BeParameterStatusMessage::ServerVersion("14.1"), + ))? + .write_message(&BeMessage::ReadyForQuery)?; + self.state = ProtoState::Established; + } + AuthType::MD5 => { + rand::thread_rng().fill(&mut self.md5_salt); + self.write_message(&BeMessage::AuthenticationMD5Password( + self.md5_salt, + ))?; + self.state = ProtoState::Authentication; + } + AuthType::ZenithJWT => { + self.write_message(&BeMessage::AuthenticationCleartextPassword)?; + self.state = ProtoState::Authentication; + } + } + } + FeStartupPacket::CancelRequest { .. } => { + self.state = ProtoState::Closed; + return Ok(ProcessMsgResult::Break); + } + } + } + + FeMessage::PasswordMessage(m) => { + trace!("got password message '{:?}'", m); + + assert!(self.state == ProtoState::Authentication); + + match self.auth_type { + AuthType::Trust => unreachable!(), + AuthType::MD5 => { + let (_, md5_response) = m.split_last().context("protocol violation")?; + + if let Err(e) = handler.check_auth_md5(self, md5_response) { + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + bail!("auth failed: {}", e); + } + } + AuthType::ZenithJWT => { + let (_, jwt_response) = m.split_last().context("protocol violation")?; + + if let Err(e) = handler.check_auth_jwt(self, jwt_response) { + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + bail!("auth failed: {}", e); + } + } + } + self.write_message(&BeMessage::AuthenticationOk)? + .write_message(&BeParameterStatusMessage::encoding())? + .write_message(&BeMessage::ReadyForQuery)?; + self.state = ProtoState::Established; + } + + _ => { + self.state = ProtoState::Closed; + return Ok(ProcessMsgResult::Break); + } + } + Ok(ProcessMsgResult::Continue) + } + + async fn process_message( + &mut self, + handler: &mut impl Handler, + msg: FeMessage, + unnamed_query_string: &mut Bytes, + ) -> Result { + // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth + // TODO: change that to proper top-level match of protocol state with separate message handling for each state + assert!(self.state == ProtoState::Established); + + match msg { + FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => { + bail!("protocol violation"); + } + + FeMessage::Query(body) => { + // remove null terminator + let query_string = cstr_to_str(&body)?; + + trace!("got query {:?}", query_string); + // xxx distinguish fatal and recoverable errors? + if let Err(e) = handler.process_query(self, query_string).await { + // ":?" uses the alternate formatting style, which makes anyhow display the + // full cause of the error, not just the top-level context + its trace. + // We don't want to send that in the ErrorResponse though, + // because it's not relevant to the compute node logs. 
+ error!("query handler for '{}' failed: {:?}", query_string, e); + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + // TODO: untangle convoluted control flow + if e.to_string().contains("failed to run") { + return Ok(ProcessMsgResult::Break); + } + } + self.write_message(&BeMessage::ReadyForQuery)?; + } + + FeMessage::Parse(m) => { + *unnamed_query_string = m.query_string; + self.write_message(&BeMessage::ParseComplete)?; + } + + FeMessage::Describe(_) => { + self.write_message(&BeMessage::ParameterDescription)? + .write_message(&BeMessage::NoData)?; + } + + FeMessage::Bind(_) => { + self.write_message(&BeMessage::BindComplete)?; + } + + FeMessage::Close(_) => { + self.write_message(&BeMessage::CloseComplete)?; + } + + FeMessage::Execute(_) => { + let query_string = cstr_to_str(unnamed_query_string)?; + trace!("got execute {:?}", query_string); + // xxx distinguish fatal and recoverable errors? + if let Err(e) = handler.process_query(self, query_string).await { + error!("query handler for '{}' failed: {:?}", query_string, e); + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + } + // NOTE there is no ReadyForQuery message. This handler is used + // for basebackup and it uses CopyOut which doesn't require + // ReadyForQuery message and backend just switches back to + // processing mode after sending CopyDone or ErrorResponse. + } + + FeMessage::Sync => { + self.write_message(&BeMessage::ReadyForQuery)?; + } + + FeMessage::Terminate => { + return Ok(ProcessMsgResult::Break); + } + + // We prefer explicit pattern matching to wildcards, because + // this helps us spot the places where new variants are missing + FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => { + bail!("unexpected message type: {:?}", msg); + } + } + + Ok(ProcessMsgResult::Continue) + } +} diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index a531975d60..467b900a13 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -4,9 +4,10 @@ use std::cmp::{Eq, Ordering, PartialOrd}; use std::collections::BinaryHeap; use std::fmt::Debug; use std::mem; -use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Mutex; use std::time::Duration; +use tokio::sync::watch::{channel, Receiver, Sender}; +use tokio::time::timeout; /// An error happened while waiting for a number #[derive(Debug, PartialEq, Eq, thiserror::Error)] @@ -141,10 +142,10 @@ where /// /// This call won't complete until someone has called `advance` /// with a number greater than or equal to the one we're waiting for. - pub fn wait_for(&self, num: V) -> Result<(), SeqWaitError> { + pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> { match self.queue_for_wait(num) { Ok(None) => Ok(()), - Ok(Some(rx)) => rx.recv().map_err(|_| SeqWaitError::Shutdown), + Ok(Some(mut rx)) => rx.changed().await.map_err(|_| SeqWaitError::Shutdown), Err(e) => Err(e), } } @@ -156,13 +157,18 @@ where /// /// If that hasn't happened after the specified timeout duration, /// [`SeqWaitError::Timeout`] will be returned. 
- pub fn wait_for_timeout(&self, num: V, timeout_duration: Duration) -> Result<(), SeqWaitError> { + pub async fn wait_for_timeout( + &self, + num: V, + timeout_duration: Duration, + ) -> Result<(), SeqWaitError> { match self.queue_for_wait(num) { Ok(None) => Ok(()), - Ok(Some(rx)) => rx.recv_timeout(timeout_duration).map_err(|e| match e { - std::sync::mpsc::RecvTimeoutError::Timeout => SeqWaitError::Timeout, - std::sync::mpsc::RecvTimeoutError::Disconnected => SeqWaitError::Shutdown, - }), + Ok(Some(mut rx)) => match timeout(timeout_duration, rx.changed()).await { + Ok(Ok(())) => Ok(()), + Ok(Err(_)) => Err(SeqWaitError::Shutdown), + Err(_) => Err(SeqWaitError::Timeout), + }, Err(e) => Err(e), } } @@ -179,7 +185,7 @@ where } // Create a new channel. - let (tx, rx) = channel(); + let (tx, rx) = channel(()); internal.waiters.push(Waiter { wake_num: num, wake_channel: tx, @@ -235,7 +241,6 @@ mod tests { use super::*; use std::sync::Arc; use std::thread::sleep; - use std::thread::spawn; use std::time::Duration; impl MonotonicCounter for i32 { @@ -248,25 +253,25 @@ mod tests { } } - #[test] - fn seqwait() { + #[tokio::test] + async fn seqwait() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); let seq3 = Arc::clone(&seq); - spawn(move || { - seq2.wait_for(42).expect("wait_for 42"); + tokio::task::spawn(async move { + seq2.wait_for(42).await.expect("wait_for 42"); let old = seq2.advance(100); assert_eq!(old, 99); - seq2.wait_for(999).expect_err("no 999"); + seq2.wait_for(999).await.expect_err("no 999"); }); - spawn(move || { - seq3.wait_for(42).expect("wait_for 42"); - seq3.wait_for(0).expect("wait_for 0"); + tokio::task::spawn(async move { + seq3.wait_for(42).await.expect("wait_for 42"); + seq3.wait_for(0).await.expect("wait_for 0"); }); sleep(Duration::from_secs(1)); let old = seq.advance(99); assert_eq!(old, 0); - seq.wait_for(100).expect("wait_for 100"); + seq.wait_for(100).await.expect("wait_for 100"); // Calling advance with a smaller value is a no-op assert_eq!(seq.advance(98), 100); @@ -275,16 +280,16 @@ mod tests { seq.shutdown(); } - #[test] - fn seqwait_timeout() { + #[tokio::test] + async fn seqwait_timeout() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); - spawn(move || { + tokio::task::spawn(async move { let timeout = Duration::from_millis(1); - let res = seq2.wait_for_timeout(42, timeout); + let res = seq2.wait_for_timeout(42, timeout).await; assert_eq!(res, Err(SeqWaitError::Timeout)); }); - sleep(Duration::from_secs(1)); + tokio::time::sleep(Duration::from_secs(1)).await; // This will attempt to wake, but nothing will happen // because the waiter already dropped its Receiver. let old = seq.advance(99); diff --git a/libs/utils/src/seqwait_async.rs b/libs/utils/src/seqwait_async.rs deleted file mode 100644 index f685e2b569..0000000000 --- a/libs/utils/src/seqwait_async.rs +++ /dev/null @@ -1,224 +0,0 @@ -//! -//! Async version of 'seqwait.rs' -//! -//! NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs. -//! 
- -#![warn(missing_docs)] - -use std::collections::BTreeMap; -use std::fmt::Debug; -use std::mem; -use std::sync::Mutex; -use std::time::Duration; -use tokio::sync::watch::{channel, Receiver, Sender}; -use tokio::time::timeout; - -/// An error happened while waiting for a number -#[derive(Debug, PartialEq, thiserror::Error)] -#[error("SeqWaitError")] -pub enum SeqWaitError { - /// The wait timeout was reached - Timeout, - /// [`SeqWait::shutdown`] was called - Shutdown, -} - -/// Internal components of a `SeqWait` -struct SeqWaitInt -where - T: Ord, -{ - waiters: BTreeMap, Receiver<()>)>, - current: T, - shutdown: bool, -} - -/// A tool for waiting on a sequence number -/// -/// This provides a way to await the arrival of a number. -/// As soon as the number arrives by another caller calling -/// [`advance`], then the waiter will be woken up. -/// -/// This implementation takes a blocking Mutex on both [`wait_for`] -/// and [`advance`], meaning there may be unexpected executor blocking -/// due to thread scheduling unfairness. There are probably better -/// implementations, but we can probably live with this for now. -/// -/// [`wait_for`]: SeqWait::wait_for -/// [`advance`]: SeqWait::advance -/// -pub struct SeqWait -where - T: Ord, -{ - internal: Mutex>, -} - -impl SeqWait -where - T: Ord + Debug + Copy, -{ - /// Create a new `SeqWait`, initialized to a particular number - pub fn new(starting_num: T) -> Self { - let internal = SeqWaitInt { - waiters: BTreeMap::new(), - current: starting_num, - shutdown: false, - }; - SeqWait { - internal: Mutex::new(internal), - } - } - - /// Shut down a `SeqWait`, causing all waiters (present and - /// future) to return an error. - pub fn shutdown(&self) { - let waiters = { - // Prevent new waiters; wake all those that exist. - // Wake everyone with an error. - let mut internal = self.internal.lock().unwrap(); - - // This will steal the entire waiters map. - // When we drop it all waiters will be woken. - mem::take(&mut internal.waiters) - - // Drop the lock as we exit this scope. - }; - - // When we drop the waiters list, each Receiver will - // be woken with an error. - // This drop doesn't need to be explicit; it's done - // here to make it easier to read the code and understand - // the order of events. - drop(waiters); - } - - /// Wait for a number to arrive - /// - /// This call won't complete until someone has called `advance` - /// with a number greater than or equal to the one we're waiting for. - pub async fn wait_for(&self, num: T) -> Result<(), SeqWaitError> { - let mut rx = { - let mut internal = self.internal.lock().unwrap(); - if internal.current >= num { - return Ok(()); - } - if internal.shutdown { - return Err(SeqWaitError::Shutdown); - } - - // If we already have a channel for waiting on this number, reuse it. - if let Some((_, rx)) = internal.waiters.get_mut(&num) { - // an Err from changed() means the sender was dropped. - rx.clone() - } else { - // Create a new channel. - let (tx, rx) = channel(()); - internal.waiters.insert(num, (tx, rx.clone())); - rx - } - // Drop the lock as we exit this scope. - }; - rx.changed().await.map_err(|_| SeqWaitError::Shutdown) - } - - /// Wait for a number to arrive - /// - /// This call won't complete until someone has called `advance` - /// with a number greater than or equal to the one we're waiting for. - /// - /// If that hasn't happened after the specified timeout duration, - /// [`SeqWaitError::Timeout`] will be returned. 
- pub async fn wait_for_timeout( - &self, - num: T, - timeout_duration: Duration, - ) -> Result<(), SeqWaitError> { - timeout(timeout_duration, self.wait_for(num)) - .await - .unwrap_or(Err(SeqWaitError::Timeout)) - } - - /// Announce a new number has arrived - /// - /// All waiters at this value or below will be woken. - /// - /// `advance` will panic if you send it a lower number than - /// a previous call. - pub fn advance(&self, num: T) { - let wake_these = { - let mut internal = self.internal.lock().unwrap(); - - if internal.current > num { - panic!( - "tried to advance backwards, from {:?} to {:?}", - internal.current, num - ); - } - internal.current = num; - - // split_off will give me all the high-numbered waiters, - // so split and then swap. Everything at or above `num` - // stays. - let mut split = internal.waiters.split_off(&num); - std::mem::swap(&mut split, &mut internal.waiters); - - // `split_at` didn't get the value at `num`; if it's - // there take that too. - if let Some(sleeper) = internal.waiters.remove(&num) { - split.insert(num, sleeper); - } - - split - }; - - for (_wake_num, (tx, _rx)) in wake_these { - // This can fail if there are no receivers. - // We don't care; discard the error. - let _ = tx.send(()); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::sync::Arc; - use tokio::time::{sleep, Duration}; - - #[tokio::test] - async fn seqwait() { - let seq = Arc::new(SeqWait::new(0)); - let seq2 = Arc::clone(&seq); - let seq3 = Arc::clone(&seq); - tokio::spawn(async move { - seq2.wait_for(42).await.expect("wait_for 42"); - seq2.advance(100); - seq2.wait_for(999).await.expect_err("no 999"); - }); - tokio::spawn(async move { - seq3.wait_for(42).await.expect("wait_for 42"); - seq3.wait_for(0).await.expect("wait_for 0"); - }); - sleep(Duration::from_secs(1)).await; - seq.advance(99); - seq.wait_for(100).await.expect("wait_for 100"); - seq.shutdown(); - } - - #[tokio::test] - async fn seqwait_timeout() { - let seq = Arc::new(SeqWait::new(0)); - let seq2 = Arc::clone(&seq); - tokio::spawn(async move { - let timeout = Duration::from_millis(1); - let res = seq2.wait_for_timeout(42, timeout).await; - assert_eq!(res, Err(SeqWaitError::Timeout)); - }); - sleep(Duration::from_secs(1)).await; - // This will attempt to wake, but nothing will happen - // because the waiter already dropped its Receiver. 
- seq.advance(99); - } -} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 902765f424..e73c73bd9c 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -12,6 +12,8 @@ profiling = ["pprof"] failpoints = ["fail/failpoints"] [dependencies] +async-stream = "0.3" +async-trait = "0.1" chrono = "0.4.19" rand = "0.8.3" regex = "1.4.5" @@ -24,6 +26,7 @@ itertools = "0.10.3" clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } +tokio-util = { version = "0.7.3", features = ["io", "io-util"] } postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } @@ -43,7 +46,7 @@ pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallcl toml_edit = { version = "0.13", features = ["easy"] } scopeguard = "1.1.0" const_format = "0.2.21" -tracing = "0.1.27" +tracing = "0.1.36" signal-hook = "0.3.10" url = "2" nix = "0.23" diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index cd99c3c67d..61facc852d 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -81,9 +81,8 @@ where // an old LSN and it doesn't have any WAL of its own yet. We will set // prev_lsn to Lsn(0) if we cannot provide the correct value. let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { - // Backup was requested at a particular LSN. Wait for it to arrive. - info!("waiting for {}", req_lsn); - timeline.wait_lsn(req_lsn)?; + // Backup was requested at a particular LSN. The caller should've + // already checked that it's a valid LSN. // If the requested point is the end of the timeline, we can // provide prev_lsn. 
(get_last_record_rlsn() might return it as diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5a43516728..ec71e5b320 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -4,7 +4,7 @@ use remote_storage::GenericRemoteStorage; use std::{env, ops::ControlFlow, path::Path, str::FromStr}; use tracing::*; -use anyhow::{bail, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use clap::{App, Arg}; use daemonize::Daemonize; @@ -12,13 +12,15 @@ use daemonize::Daemonize; use fail::FailScenario; use pageserver::{ config::{defaults::*, PageServerConf}, - http, page_cache, page_service, profiling, tenant_mgr, thread_mgr, - thread_mgr::ThreadKind, - virtual_file, LOG_FILE_NAME, + http, page_cache, page_service, profiling, task_mgr, + task_mgr::TaskKind, + task_mgr::{ + BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, + }, + tenant_mgr, virtual_file, LOG_FILE_NAME, }; use utils::{ auth::JwtAuth, - http::endpoint, logging, postgres_backend::AuthType, project_git_version, @@ -286,7 +288,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // start profiler (if enabled) let profiler_guard = profiling::init_profiler(conf); - pageserver::tenant_tasks::init_tenant_task_pool()?; + WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_etcd_client(conf))?; // initialize authentication for incoming connections let auth = match &conf.auth_type { @@ -307,35 +309,54 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() }) .transpose() .context("Failed to init generic remote storage")?; + let remote_index = { + let _rt_guard = BACKGROUND_RUNTIME.enter(); + tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())? + }; - let remote_index = tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?; - - // Spawn a new thread for the http endpoint + // Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME. // bind before launching separate thread so the error reported before startup exits - let auth_cloned = auth.clone(); - thread_mgr::spawn( - ThreadKind::HttpEndpointListener, - None, - None, - "http_endpoint_thread", - true, - move || { - let router = http::make_router(conf, auth_cloned, remote_index, remote_storage)?; - endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher()) - }, - )?; - // Spawn a thread to listen for libpq connections. It will spawn further threads + // Create a Service from the router above to handle incoming requests. + { + let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); + + let router = http::make_router(conf, auth.clone(), remote_index, remote_storage)?; + let service = + utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap(); + let server = hyper::Server::from_tcp(http_listener)? + .serve(service) + .with_graceful_shutdown(task_mgr::shutdown_watcher()); + + task_mgr::spawn( + MGMT_REQUEST_RUNTIME.handle(), + TaskKind::HttpEndpointListener, + None, + None, + "http endpoint listener", + true, + async { + server.await?; + Ok(()) + }, + ); + } + + // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. 
- thread_mgr::spawn( - ThreadKind::LibpqEndpointListener, + task_mgr::spawn( + COMPUTE_REQUEST_RUNTIME.handle(), + TaskKind::LibpqEndpointListener, None, None, - "libpq endpoint thread", + "libpq endpoint listener", true, - move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type), - )?; + async move { + page_service::libpq_listener_main(conf, auth, pageserver_listener, conf.auth_type).await + }, + ); + // All started up! Now just sit and wait for shutdown signal. signals.handle(|signal| match signal { Signal::Quit => { info!( @@ -352,7 +373,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() signal.name() ); profiling::exit_profiler(conf, &profiler_guard); - pageserver::shutdown_pageserver(0); + BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0)); unreachable!() } }) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 59142bd9b2..78f83511cb 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -161,16 +161,14 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)?; @@ -184,9 +182,10 @@ async fn timeline_create_handler(mut request: Request) -> Result Ok(None), // timeline already exists Err(err) => Err(err), } - }) - .await - .map_err(ApiError::from_err)??; + } + .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn)) + .await + .map_err(ApiError::from_err)?; Ok(match new_timeline_info { Some(info) => json_response(StatusCode::CREATED, info)?, @@ -426,12 +425,10 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, let state = get_state(&request); let conf = state.conf; - tokio::task::spawn_blocking(move || { - let _enter = info_span!("tenant_detach", tenant = %tenant_id).entered(); - tenant_mgr::detach_tenant(conf, tenant_id) - }) - .await - .map_err(ApiError::from_err)??; + tenant_mgr::detach_tenant(conf, tenant_id) + .instrument(info_span!("tenant_detach", tenant = %tenant_id)) + .await + .map_err(ApiError::from_err)?; let mut remote_index = state.remote_index.write().await; remote_index.remove_tenant_entry(&tenant_id); @@ -583,7 +578,7 @@ async fn tenant_create_handler(mut request: Request) -> Result, - // Overridden tenant-specific config parameters. // We keep TenantConfOpt sturct here to preserve the information // about parameters that are not set. @@ -284,7 +270,7 @@ impl Repository { } /// perform one garbage collection iteration, removing old data files from disk. - /// this function is periodically called by gc thread. + /// this function is periodically called by gc task. /// also it can be explicitly requested through page server api 'do_gc' command. /// /// 'timelineid' specifies the timeline to GC, or None for all. @@ -299,14 +285,6 @@ impl Repository { pitr: Duration, checkpoint_before_gc: bool, ) -> Result { - let _guard = match self.file_lock.try_read() { - Ok(g) => g, - Err(_) => { - info!("File lock write acquired, shutting down GC"); - return Ok(GcResult::default()); - } - }; - let timeline_str = target_timeline_id .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); @@ -319,18 +297,10 @@ impl Repository { } /// Perform one compaction iteration. - /// This function is periodically called by compactor thread. + /// This function is periodically called by compactor task. 
/// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. pub fn compaction_iteration(&self) -> Result<()> { - let _guard = match self.file_lock.try_read() { - Ok(g) => g, - Err(_) => { - info!("File lock write acquired, shutting down compaction"); - return Ok(()); - } - }; - // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // compactions. We don't want to block everything else while the @@ -624,10 +594,7 @@ impl Repository { .load_layer_map(new_disk_consistent_lsn) .context("failed to load layermap")?; - crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { - id: ZTenantTimelineId::new(self.tenant_id(), new_timeline_id), - timeline: Arc::clone(&new_timeline), - }); + new_timeline.launch_wal_receiver()?; Ok(new_timeline) } @@ -642,7 +609,6 @@ impl Repository { ) -> Repository { Repository { tenant_id, - file_lock: RwLock::new(()), conf, tenant_conf: Arc::new(RwLock::new(tenant_conf)), timelines: Mutex::new(HashMap::new()), @@ -846,7 +812,7 @@ impl Repository { // See comments in [`Repository::branch_timeline`] for more information // about why branch creation task can run concurrently with timeline's GC iteration. for timeline in gc_timelines { - if thread_mgr::is_shutdown_requested() { + if task_mgr::is_shutdown_requested() { // We were requested to shut down. Stop and return with the progress we // made. break; diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index aa9d636739..60abbe33e6 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -5,16 +5,17 @@ use bytes::Bytes; use fail::fail_point; use itertools::Itertools; use once_cell::sync::OnceCell; +use tokio::task::spawn_blocking; use tracing::*; use std::cmp::{max, min, Ordering}; use std::collections::{HashMap, HashSet}; +use std::fs; use std::ops::{Deref, Range}; use std::path::PathBuf; use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering}; -use std::sync::{mpsc, Arc, Mutex, MutexGuard, RwLock, TryLockError}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError}; use std::time::{Duration, Instant, SystemTime}; -use std::{fs, thread}; use crate::layered_repository::{ delta_layer::{DeltaLayer, DeltaLayerWriter}, @@ -46,8 +47,9 @@ use utils::{ use crate::repository::GcResult; use crate::repository::{Key, Value}; -use crate::thread_mgr; -use crate::walreceiver::IS_WAL_RECEIVER; +use crate::task_mgr; +use crate::task_mgr::TaskKind; +use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::{page_cache, storage_sync}; @@ -56,7 +58,7 @@ pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, - tenant_id: ZTenantId, + pub tenant_id: ZTenantId, pub timeline_id: ZTimelineId, pub layers: RwLock, @@ -110,11 +112,11 @@ pub struct Timeline { /// to avoid deadlock. write_lock: Mutex<()>, - /// Used to ensure that there is only one thread + /// Used to ensure that there is only task performing flushing at a time layer_flush_lock: Mutex<()>, /// Layer removal lock. - /// A lock to ensure that no layer of the timeline is removed concurrently by other threads. + /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. 
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], /// and [`Repository::delete_timeline`]. layer_removal_cs: Mutex<()>, @@ -142,10 +144,7 @@ pub struct Timeline { /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, - // TODO task management should be done outside timeline, managed along with other tasks. - #[allow(clippy::type_complexity)] - initial_size_computation_task: - Mutex>, mpsc::Receiver<()>)>>, + initial_size_computation_started: AtomicBool, /// Information about the last processed message by the WAL receiver, /// or None if WAL receiver has not received anything for this timeline @@ -413,23 +412,23 @@ impl Timeline { /// You should call this before any of the other get_* or list_* functions. Calling /// those functions with an LSN that has been processed yet is an error. /// - pub fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { - // This should never be called from the WAL receiver thread, because that could lead + pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + // This should never be called from the WAL receiver, because that could lead // to a deadlock. ensure!( - !IS_WAL_RECEIVER.with(|c| c.get()), - "wait_lsn called by WAL receiver thread" + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection), + "wait_lsn cannot be called in WAL receiver" ); - self.metrics.wait_lsn_time_histo.observe_closure_duration( - || self.last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .with_context(|| { - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - }))?; + let _timer = self.metrics.wait_lsn_time_histo.start_timer(); + + self.last_record_lsn.wait_for_timeout(lsn, self.conf.wait_lsn_timeout).await + .with_context(|| + format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", + lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() + ) + )?; Ok(()) } @@ -587,7 +586,7 @@ impl Timeline { // initial logical size is 0. 
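wait_lsn() above swaps observe_closure_duration() for a HistogramTimer guard, since a closure-based observation cannot span an .await. A minimal sketch of that metrics pattern, assuming the prometheus and once_cell crates; the metric name and the sleep are placeholders:

```rust
use once_cell::sync::Lazy;
use prometheus::{register_histogram, Histogram};
use std::time::Duration;

static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!("wait_lsn_seconds", "Time spent waiting for WAL").unwrap()
});

async fn wait_for_wal() {
    // The guard observes the elapsed time when dropped, including time spent awaiting.
    let _timer = WAIT_LSN_TIME.start_timer();
    tokio::time::sleep(Duration::from_millis(10)).await;
}

#[tokio::main]
async fn main() {
    wait_for_wal().await;
    println!("samples recorded: {}", WAIT_LSN_TIME.get_sample_count());
}
```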
LogicalSize::empty_initial() }, - initial_size_computation_task: Mutex::new(None), + initial_size_computation_started: AtomicBool::new(false), partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, @@ -598,6 +597,43 @@ impl Timeline { result } + pub fn launch_wal_receiver(self: &Arc) -> anyhow::Result<()> { + if !is_etcd_client_initialized() { + if cfg!(test) { + info!("not launching WAL receiver because etcd client hasn't been initialized"); + return Ok(()); + } else { + panic!("etcd client not initialized"); + } + } + + info!( + "launching WAL receiver for timeline {} of tenant {}", + self.timeline_id, self.tenant_id + ); + let tenant_conf_guard = self.tenant_conf.read().unwrap(); + let lagging_wal_timeout = tenant_conf_guard + .lagging_wal_timeout + .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); + let walreceiver_connect_timeout = tenant_conf_guard + .walreceiver_connect_timeout + .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); + let max_lsn_wal_lag = tenant_conf_guard + .max_lsn_wal_lag + .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); + drop(tenant_conf_guard); + let self_clone = Arc::clone(self); + let _ = spawn_connection_manager_task( + self.conf.broker_etcd_prefix.clone(), + self_clone, + walreceiver_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + )?; + + Ok(()) + } + /// /// Scan the timeline directory to populate the layer map. /// Returns all timeline-related files that were found and loaded. @@ -715,61 +751,34 @@ impl Timeline { fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { let timeline_id = self.timeline_id; - let mut task_guard = match self.initial_size_computation_task.try_lock() { - Ok(guard) => guard, - Err(_) => { - debug!("Skipping timeline logical size init: task lock is taken already"); - return; - } - }; - - if let Some((old_task, task_finish_signal)) = task_guard.take() { - // TODO rust 1.61 would allow to remove `task_finish_signal` entirely and call `old_task.is_finished()` instead - match task_finish_signal.try_recv() { - // task has either signaled successfully that it finished or panicked and dropped the sender part without signalling - Ok(()) | Err(mpsc::TryRecvError::Disconnected) => { - match old_task.join() { - // we're here due to OnceCell::get not returning the value - Ok(Ok(())) => { - error!("Timeline {timeline_id} size init task finished, yet the size was not updated, rescheduling the computation") - } - Ok(Err(task_error)) => { - error!("Error during timeline {timeline_id} size init: {task_error:?}") - } - Err(e) => error!("Timeline {timeline_id} size init task panicked: {e:?}"), - } - } - // task had not yet finished: no signal was sent and the sender channel is not dropped - Err(mpsc::TryRecvError::Empty) => { - // let the task finish - *task_guard = Some((old_task, task_finish_signal)); - return; - } - } - } - - if task_guard.is_none() { - let thread_timeline = Arc::clone(self); - let (finish_sender, finish_receiver) = mpsc::channel(); - - match thread::Builder::new() - .name(format!( - "Timeline {timeline_id} initial logical size calculation" - )) - .spawn(move || { - let _enter = info_span!("initial_logical_size_calculation", timeline = %timeline_id).entered(); - let calculated_size = thread_timeline.calculate_logical_size(init_lsn)?; - match thread_timeline.current_logical_size.initial_logical_size.set(calculated_size) { + // Atomically check if the timeline size calculation had already started. 
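The size-init bookkeeping in this hunk replaces a joinable handle guarded by a Mutex with a single atomic flag. A simplified sketch of that one-shot spawn pattern; Timeline here is a stand-in struct, and plain tokio::spawn stands in for task_mgr::spawn:

```rust
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

struct Timeline {
    initial_size_computation_started: AtomicBool,
}

fn try_spawn_size_init(timeline: &Arc<Timeline>) {
    // swap() returns the previous value: only the first caller sees `false`.
    if timeline.initial_size_computation_started.swap(true, Ordering::SeqCst) {
        return; // a computation task was already spawned
    }
    let timeline = Arc::clone(timeline);
    tokio::spawn(async move {
        // ... calculate and publish the logical size using `timeline` ...
        drop(timeline);
    });
}

#[tokio::main]
async fn main() {
    let tl = Arc::new(Timeline {
        initial_size_computation_started: AtomicBool::new(false),
    });
    try_spawn_size_init(&tl);
    try_spawn_size_init(&tl); // no-op: the flag is already set
}
```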
+ // If the flag was not already set, this sets it. + if !self + .initial_size_computation_started + .swap(true, AtomicOrdering::SeqCst) + { + // We need to start the computation task. + let self_clone = Arc::clone(self); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::InitialLogicalSizeCalculation, + Some(self.tenant_id), + Some(self.timeline_id), + "initial size calculation", + false, + async move { + let calculated_size = self_clone.calculate_logical_size(init_lsn)?; + let result = spawn_blocking(move || { + self_clone.current_logical_size.initial_logical_size.set(calculated_size) + }).await?; + match result { Ok(()) => info!("Successfully calculated initial logical size"), Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"), } - - finish_sender.send(()).ok(); Ok(()) - }) { - Ok(guard) => *task_guard = Some((guard, finish_receiver)), - Err(e) => error!("Failed to spawn timeline {timeline_id} size init task: {e}"), - } + } + .instrument(info_span!("initial_logical_size_calculation", timeline = %timeline_id)) + ); } } @@ -1099,22 +1108,23 @@ impl Timeline { self.last_freeze_at.store(last_lsn); *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - // Launch a thread to flush the frozen layer to disk, unless - // a thread was already running. (If the thread was running + // Launch a task to flush the frozen layer to disk, unless + // a task was already running. (If the task was running // at the time that we froze the layer, it must've seen the // the layer we just froze before it exited; see comments // in flush_frozen_layers()) if let Ok(guard) = self.layer_flush_lock.try_lock() { drop(guard); let self_clone = Arc::clone(self); - thread_mgr::spawn( - thread_mgr::ThreadKind::LayerFlushThread, + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::LayerFlushTask, Some(self.tenant_id), Some(self.timeline_id), - "layer flush thread", + "layer flush task", false, - move || self_clone.flush_frozen_layers(false), - )?; + async move { self_clone.flush_frozen_layers(false) }, + ); } } } @@ -1123,8 +1133,8 @@ impl Timeline { /// Flush all frozen layers to disk. /// - /// Only one thread at a time can be doing layer-flushing for a - /// given timeline. If 'wait' is true, and another thread is + /// Only one task at a time can be doing layer-flushing for a + /// given timeline. If 'wait' is true, and another task is /// currently doing the flushing, this function will wait for it /// to finish. If 'wait' is false, this function will return /// immediately instead. diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 86bbf25b67..8b9251229e 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -12,10 +12,10 @@ pub mod profiling; pub mod reltag; pub mod repository; pub mod storage_sync; +pub mod task_mgr; pub mod tenant_config; pub mod tenant_mgr; pub mod tenant_tasks; -pub mod thread_mgr; pub mod timelines; pub mod virtual_file; pub mod walingest; @@ -28,7 +28,7 @@ use std::collections::HashMap; use tracing::info; use utils::zid::{ZTenantId, ZTimelineId}; -use crate::thread_mgr::ThreadKind; +use crate::task_mgr::TaskKind; /// Current storage format version /// @@ -52,30 +52,31 @@ pub enum CheckpointConfig { Forced, } -pub fn shutdown_pageserver(exit_code: i32) { - // Shut down the libpq endpoint thread. 
This prevents new connections from +pub async fn shutdown_pageserver(exit_code: i32) { + // Shut down the libpq endpoint task. This prevents new connections from // being accepted. - thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None); + task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await; - // Shut down any page service threads. - thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None); + // Shut down any page service tasks. + task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await; // Shut down all the tenants. This flushes everything to disk and kills - // the checkpoint and GC threads. - tenant_mgr::shutdown_all_tenants(); + // the checkpoint and GC tasks. + tenant_mgr::shutdown_all_tenants().await; // Stop syncing with remote storage. // - // FIXME: Does this wait for the sync thread to finish syncing what's queued up? + // FIXME: Does this wait for the sync tasks to finish syncing what's queued up? // Should it? - thread_mgr::shutdown_threads(Some(ThreadKind::StorageSync), None, None); + task_mgr::shutdown_tasks(Some(TaskKind::StorageSync), None, None).await; // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. - thread_mgr::shutdown_threads(Some(ThreadKind::HttpEndpointListener), None, None); + // FIXME: We should probably stop accepting commands like attach/detach earlier. + task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await; // There should be nothing left, but let's be sure - thread_mgr::shutdown_threads(None, None, None); + task_mgr::shutdown_tasks(None, None, None).await; info!("Shut down successfully completed"); std::process::exit(exit_code); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 783fcb2412..149144bfe4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -11,17 +11,21 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use futures::{Stream, StreamExt}; use regex::Regex; -use std::io::{self, Read}; +use std::io; use std::net::TcpListener; use std::str; use std::str::FromStr; use std::sync::Arc; +use tokio_util::io::StreamReader; +use tokio_util::io::SyncIoBridge; use tracing::*; use utils::{ auth::{self, Claims, JwtAuth, Scope}, lsn::Lsn, - postgres_backend::{self, is_socket_read_timed_out, AuthType, PostgresBackend}, + postgres_backend::AuthType, + postgres_backend_async::{self, PostgresBackend}, pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}, simple_rcu::RcuReadGuard, zid::{ZTenantId, ZTimelineId}, @@ -35,9 +39,9 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::profiling::profpoint_start; use crate::reltag::RelTag; +use crate::task_mgr; +use crate::task_mgr::TaskKind; use crate::tenant_mgr; -use crate::thread_mgr; -use crate::thread_mgr::ThreadKind; use crate::CheckpointConfig; use postgres_ffi::v14::xlog_utils::to_pg_timestamp; @@ -201,93 +205,49 @@ impl PagestreamBeMessage { } } -/// Implements Read for the server side of CopyIn -struct CopyInReader<'a> { - pgb: &'a mut PostgresBackend, +fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream> + '_ { + async_stream::try_stream! { + loop { + let msg = tokio::select! { + biased; - /// Overflow buffer for bytes sent in CopyData messages - /// that the reader (caller of read) hasn't asked for yet. - /// TODO use BytesMut? 
- buf: Vec, + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. + let msg = format!("pageserver is shutting down"); + let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg)); + Err(anyhow::anyhow!(msg)) + } - /// Bytes before `buf_begin` are considered as dropped. - /// This allows us to implement O(1) pop_front on Vec. - /// The Vec won't grow large because we only add to it - /// when it's empty. - buf_begin: usize, -} + msg = pgb.read_message() => { msg } + }; -impl<'a> CopyInReader<'a> { - // NOTE: pgb should be in copy in state already - fn new(pgb: &'a mut PostgresBackend) -> Self { - Self { - pgb, - buf: Vec::<_>::new(), - buf_begin: 0, - } - } -} - -impl<'a> Drop for CopyInReader<'a> { - fn drop(&mut self) { - // Finalize copy protocol so that self.pgb can be reused - // TODO instead, maybe take ownership of pgb and give it back at the end - let mut buf: Vec = vec![]; - let _ = self.read_to_end(&mut buf); - } -} - -impl<'a> Read for CopyInReader<'a> { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - while !thread_mgr::is_shutdown_requested() { - // Return from buffer if nonempty - if self.buf_begin < self.buf.len() { - let bytes_to_read = std::cmp::min(buf.len(), self.buf.len() - self.buf_begin); - buf[..bytes_to_read].copy_from_slice(&self.buf[self.buf_begin..][..bytes_to_read]); - self.buf_begin += bytes_to_read; - return Ok(bytes_to_read); - } - - // Delete garbage - self.buf.clear(); - self.buf_begin = 0; - - // Wait for client to send CopyData bytes - match self.pgb.read_message() { + match msg { Ok(Some(message)) => { let copy_data_bytes = match message { FeMessage::CopyData(bytes) => bytes, - FeMessage::CopyDone => return Ok(0), + FeMessage::CopyDone => { break }, FeMessage::Sync => continue, m => { let msg = format!("unexpected message {:?}", m); - self.pgb.write_message(&BeMessage::ErrorResponse(&msg))?; - return Err(io::Error::new(io::ErrorKind::Other, msg)); + pgb.write_message(&BeMessage::ErrorResponse(&msg))?; + Err(io::Error::new(io::ErrorKind::Other, msg))?; + break; } }; - // Return as much as we can, saving the rest in self.buf - let mut reader = copy_data_bytes.reader(); - let bytes_read = reader.read(buf)?; - reader.read_to_end(&mut self.buf)?; - return Ok(bytes_read); + yield copy_data_bytes; } Ok(None) => { let msg = "client closed connection"; - self.pgb.write_message(&BeMessage::ErrorResponse(msg))?; - return Err(io::Error::new(io::ErrorKind::Other, msg)); + pgb.write_message(&BeMessage::ErrorResponse(msg))?; + pgb.flush().await?; + Err(io::Error::new(io::ErrorKind::Other, msg))?; } Err(e) => { - if !is_socket_read_timed_out(&e) { - return Err(io::Error::new(io::ErrorKind::Other, e)); - } + Err(io::Error::new(io::ErrorKind::Other, e))?; } - } + }; } - - // Shutting down - let msg = "Importer thread was shut down"; - Err(io::Error::new(io::ErrorKind::Other, msg)) } } @@ -296,61 +256,49 @@ impl<'a> Read for CopyInReader<'a> { /// /// Main loop of the page service. /// -/// Listens for connections, and launches a new handler thread for each. +/// Listens for connections, and launches a new handler task for each. 
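copyin_stream() above is built with async_stream's try_stream!, turning the CopyData read loop into a Stream of Bytes. A tiny sketch of that generator pattern in isolation, assuming the async-stream, bytes and futures crates; chunk_stream and its contents are made up:

```rust
use async_stream::try_stream;
use bytes::Bytes;
use futures::{Stream, StreamExt};
use std::io;

fn chunk_stream(n: usize) -> impl Stream<Item = io::Result<Bytes>> {
    try_stream! {
        for i in 0..n {
            // In the pageserver this would be one CopyData payload from the socket.
            yield Bytes::from(format!("chunk {i}"));
        }
    }
}

#[tokio::main]
async fn main() -> io::Result<()> {
    let mut chunks = Box::pin(chunk_stream(3));
    while let Some(chunk) = chunks.next().await {
        println!("{:?}", chunk?);
    }
    Ok(())
}
```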
/// -pub fn thread_main( +pub async fn libpq_listener_main( conf: &'static PageServerConf, auth: Option>, listener: TcpListener, auth_type: AuthType, ) -> anyhow::Result<()> { listener.set_nonblocking(true)?; - let basic_rt = tokio::runtime::Builder::new_current_thread() - .enable_io() - .build()?; - - let tokio_listener = { - let _guard = basic_rt.enter(); - tokio::net::TcpListener::from_std(listener) - }?; + let tokio_listener = tokio::net::TcpListener::from_std(listener)?; // Wait for a new connection to arrive, or for server shutdown. - while let Some(res) = basic_rt.block_on(async { - let shutdown_watcher = thread_mgr::shutdown_watcher(); - tokio::select! { - biased; + while let Some(res) = tokio::select! { + biased; - _ = shutdown_watcher => { - // We were requested to shut down. - None - } - - res = tokio_listener.accept() => { - Some(res) - } + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. + None } - }) { + + res = tokio_listener.accept() => { + Some(res) + } + } { match res { Ok((socket, peer_addr)) => { - // Connection established. Spawn a new thread to handle it. + // Connection established. Spawn a new task to handle it. debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); - // PageRequestHandler threads are not associated with any particular - // timeline in the thread manager. In practice most connections will + // PageRequestHandler tasks are not associated with any particular + // timeline in the task manager. In practice most connections will // only deal with a particular timeline, but we don't know which one // yet. - if let Err(err) = thread_mgr::spawn( - ThreadKind::PageRequestHandler, + task_mgr::spawn( + &tokio::runtime::Handle::current(), + TaskKind::PageRequestHandler, None, None, - "serving Page Service thread", + "serving compute connection task", false, - move || page_service_conn_main(conf, local_auth, socket, auth_type), - ) { - // Thread creation failed. Log the error and continue. - error!("could not spawn page service thread: {:?}", err); - } + page_service_conn_main(conf, local_auth, socket, auth_type), + ); } Err(err) => { // accept() failed. Log the error, and loop back to retry on next connection. @@ -364,13 +312,13 @@ pub fn thread_main( Ok(()) } -fn page_service_conn_main( +async fn page_service_conn_main( conf: &'static PageServerConf, auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, ) -> anyhow::Result<()> { - // Immediately increment the gauge, then create a job to decrement it on thread exit. + // Immediately increment the gauge, then create a job to decrement it on task exit. // One of the pros of `defer!` is that this will *most probably* // get called, even in presence of panics. let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); @@ -379,22 +327,17 @@ fn page_service_conn_main( gauge.dec(); } - // We use Tokio to accept the connection, but the rest of the code works with a - // regular socket. Convert. 
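libpq_listener_main() above becomes an async accept loop that races the listener against the shutdown watcher and spawns one task per connection. A minimal sketch of that shape; a tokio watch channel stands in for task_mgr::shutdown_watcher(), and plain tokio::spawn for task_mgr::spawn:

```rust
use tokio::net::TcpListener;
use tokio::sync::watch;

async fn listener_main(listener: TcpListener, mut shutdown_rx: watch::Receiver<bool>) {
    loop {
        tokio::select! {
            biased;
            // Shutdown requested: stop accepting and return.
            _ = shutdown_rx.changed() => break,
            res = listener.accept() => {
                match res {
                    Ok((socket, peer)) => {
                        println!("accepted connection from {peer}");
                        // One task per connection; drop stands in for the handler.
                        tokio::spawn(async move { drop(socket) });
                    }
                    Err(e) => eprintln!("accept() failed: {e}"),
                }
            }
        }
    }
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let listener = TcpListener::bind("127.0.0.1:0").await?;
    let (shutdown_tx, shutdown_rx) = watch::channel(false);
    shutdown_tx.send(true).ok(); // make the demo exit immediately
    listener_main(listener, shutdown_rx).await;
    Ok(())
}
```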
- let socket = socket - .into_std() - .context("could not convert tokio::net:TcpStream to std::net::TcpStream")?; - socket - .set_nonblocking(false) - .context("could not put socket to blocking mode")?; - socket .set_nodelay(true) .context("could not set TCP_NODELAY")?; let mut conn_handler = PageServerHandler::new(conf, auth); - let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?; - match pgbackend.run(&mut conn_handler) { + let pgbackend = PostgresBackend::new(socket, auth_type, None)?; + + let result = pgbackend + .run(&mut conn_handler, task_mgr::shutdown_watcher) + .await; + match result { Ok(()) => { // we've been requested to shut down Ok(()) @@ -435,92 +378,95 @@ impl PageServerHandler { } } - fn handle_pagerequests( + #[instrument(skip(self, pgb))] + async fn handle_pagerequests( &self, pgb: &mut PostgresBackend, - timeline_id: ZTimelineId, tenant_id: ZTenantId, + timeline_id: ZTimelineId, ) -> anyhow::Result<()> { - let _enter = - info_span!("pagestream", timeline = %timeline_id, tenant = %tenant_id).entered(); - // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association - thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Check that the timeline exists let timeline = get_local_timeline(tenant_id, timeline_id)?; - /* switch client to COPYBOTH */ + // switch client to COPYBOTH pgb.write_message(&BeMessage::CopyBothResponse)?; + pgb.flush().await?; - while !thread_mgr::is_shutdown_requested() { - let msg = pgb.read_message(); + loop { + let msg = tokio::select! { + biased; - let profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); - match msg { - Ok(message) => { - if let Some(message) = message { - trace!("query: {:?}", message); - - let copy_data_bytes = match message { - FeMessage::CopyData(bytes) => bytes, - _ => continue, - }; - - let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; - let tenant_id = tenant_id.to_string(); - let timeline_id = timeline_id.to_string(); - - let response = match zenith_fe_msg { - PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_get_rel_exists_request(&timeline, &req) - }), - PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_size", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_get_nblocks_request(&timeline, &req) - }), - PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME - .with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_get_page_at_lsn_request(&timeline, &req) - }), - PagestreamFeMessage::DbSize(req) => SMGR_QUERY_TIME - .with_label_values(&["get_db_size", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_db_size_request(&timeline, &req) - }), - }; - - let response = response.unwrap_or_else(|e| { - // print the all details to the log with {:#}, but for the client the - // error message is enough - error!("error reading relation or page version: {:?}", e); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) - }); - - pgb.write_message(&BeMessage::CopyData(&response.serialize()))?; - } else { - break; - } + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. 
+ info!("shutdown request received in page handler"); + break; } - Err(e) => { - if !is_socket_read_timed_out(&e) { - return Err(e); - } + + msg = pgb.read_message() => { msg } + }; + + let copy_data_bytes = match msg? { + Some(FeMessage::CopyData(bytes)) => bytes, + Some(m) => { + bail!("unexpected message: {m:?} during COPY"); } - } - drop(profiling_guard); + None => break, // client disconnected + }; + + trace!("query: {:?}", copy_data_bytes); + + let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; + let tenant_str = tenant_id.to_string(); + let timeline_str = timeline_id.to_string(); + + let response = match zenith_fe_msg { + PagestreamFeMessage::Exists(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_rel_exists", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_get_rel_exists_request(&timeline, &req).await + } + PagestreamFeMessage::Nblocks(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_rel_size", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_get_nblocks_request(&timeline, &req).await + } + PagestreamFeMessage::GetPage(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_page_at_lsn", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_get_page_at_lsn_request(&timeline, &req).await + } + PagestreamFeMessage::DbSize(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_db_size", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_db_size_request(&timeline, &req).await + } + }; + + let response = response.unwrap_or_else(|e| { + // print the all details to the log with {:#}, but for the client the + // error message is enough + error!("error reading relation or page version: {:?}", e); + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }) + }); + + pgb.write_message(&BeMessage::CopyData(&response.serialize()))?; + pgb.flush().await?; } Ok(()) } - fn handle_import_basebackup( + #[instrument(skip(self, pgb))] + async fn handle_import_basebackup( &self, pgb: &mut PostgresBackend, tenant_id: ZTenantId, @@ -528,10 +474,7 @@ impl PageServerHandler { base_lsn: Lsn, _end_lsn: Lsn, ) -> anyhow::Result<()> { - thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let _enter = - info_span!("import basebackup", timeline = %timeline_id, tenant = %tenant_id).entered(); - + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; @@ -550,8 +493,24 @@ impl PageServerHandler { // Import basebackup provided via CopyData info!("importing basebackup"); pgb.write_message(&BeMessage::CopyInResponse)?; - let reader = CopyInReader::new(pgb); - import_basebackup_from_tar(&*timeline, reader, base_lsn)?; + pgb.flush().await?; + + // import_basebackup_from_tar() is not async, mainly because the Tar crate + // it uses is not async. 
So we need to jump through some hoops: + // - convert the input from client connection to a synchronous Read + // - use block_in_place() + let mut copyin_stream = Box::pin(copyin_stream(pgb)); + let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); + tokio::task::block_in_place(|| import_basebackup_from_tar(&timeline, reader, base_lsn))?; + + // Drain the rest of the Copy data + let mut bytes_after_tar = 0; + while let Some(bytes) = copyin_stream.next().await { + bytes_after_tar += bytes?.len(); + } + if bytes_after_tar > 0 { + warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive"); + } // TODO check checksum // Meanwhile you can verify client-side by taking fullbackup @@ -563,11 +522,14 @@ impl PageServerHandler { info!("flushing layers"); timeline.checkpoint(CheckpointConfig::Flush)?; + timeline.launch_wal_receiver()?; + info!("done"); Ok(()) } - fn handle_import_wal( + #[instrument(skip(self, pgb))] + async fn handle_import_wal( &self, pgb: &mut PostgresBackend, tenant_id: ZTenantId, @@ -575,9 +537,7 @@ impl PageServerHandler { start_lsn: Lsn, end_lsn: Lsn, ) -> anyhow::Result<()> { - thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let _enter = - info_span!("import wal", timeline = %timeline_id, tenant = %tenant_id).entered(); + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; let timeline = repo @@ -591,8 +551,22 @@ impl PageServerHandler { // Import wal provided via CopyData info!("importing wal"); pgb.write_message(&BeMessage::CopyInResponse)?; - let reader = CopyInReader::new(pgb); - import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn)?; + pgb.flush().await?; + let mut copyin_stream = Box::pin(copyin_stream(pgb)); + let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); + tokio::task::block_in_place(|| { + import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn) + })?; + info!("wal import complete"); + + // Drain the rest of the Copy data + let mut bytes_after_tar = 0; + while let Some(bytes) = copyin_stream.next().await { + bytes_after_tar += bytes?.len(); + } + if bytes_after_tar > 0 { + warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive"); + } // TODO Does it make sense to overshoot? ensure!(timeline.get_last_record_lsn() >= end_lsn); @@ -619,7 +593,7 @@ impl PageServerHandler { /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. - fn wait_or_get_last_lsn( + async fn wait_or_get_last_lsn( timeline: &Timeline, mut lsn: Lsn, latest: bool, @@ -647,7 +621,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn).await?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. 
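The import paths above bridge the async CopyData stream to the synchronous tar readers with StreamReader + SyncIoBridge inside block_in_place(). A self-contained sketch of that bridge, assuming tokio-util with the io-util feature; the in-memory chunks stand in for the client connection:

```rust
use bytes::Bytes;
use futures::stream;
use std::io::{self, Read};
use tokio_util::io::{StreamReader, SyncIoBridge};

#[tokio::main(flavor = "multi_thread")]
async fn main() -> io::Result<()> {
    // Stand-in for the CopyData stream coming from the client connection.
    let chunks: Vec<io::Result<Bytes>> = vec![
        Ok(Bytes::from_static(b"hello ")),
        Ok(Bytes::from_static(b"world")),
    ];
    let copy_stream = stream::iter(chunks);

    // AsyncRead over the stream, then a blocking std::io::Read adapter over that.
    let mut reader = SyncIoBridge::new(StreamReader::new(copy_stream));

    // block_in_place lets the blocking read run without stalling other tasks.
    let contents = tokio::task::block_in_place(move || -> io::Result<String> {
        let mut buf = String::new();
        reader.read_to_string(&mut buf)?;
        Ok(buf)
    })?;

    println!("{contents}");
    Ok(())
}
```

One likely reason to prefer block_in_place over spawn_blocking here is that the bridged reader borrows the connection state, which a 'static spawn_blocking closure could not.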
(Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -657,7 +631,7 @@ impl PageServerHandler { if lsn == Lsn(0) { bail!("invalid LSN(0) in request"); } - timeline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn).await?; } ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -667,15 +641,15 @@ impl PageServerHandler { Ok(lsn) } - fn handle_get_rel_exists_request( + #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + async fn handle_get_rel_exists_request( &self, timeline: &Timeline, req: &PagestreamExistsRequest, ) -> Result { - let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?; @@ -684,14 +658,15 @@ impl PageServerHandler { })) } - fn handle_get_nblocks_request( + #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + async fn handle_get_nblocks_request( &self, timeline: &Timeline, req: &PagestreamNblocksRequest, ) -> Result { - let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered(); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?; @@ -700,14 +675,15 @@ impl PageServerHandler { })) } - fn handle_db_size_request( + #[instrument(skip(timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] + async fn handle_db_size_request( &self, timeline: &Timeline, req: &PagestreamDbSizeRequest, ) -> Result { - let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered(); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; let total_blocks = timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?; @@ -719,15 +695,15 @@ impl PageServerHandler { })) } - fn handle_get_page_at_lsn_request( + #[instrument(skip(timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] + async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, req: &PagestreamGetPageRequest, ) -> Result { - let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) - .entered(); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; /* // Add a 1s delay to some requests. The delay helps the requests to // hit the race condition from github issue #1047 more easily. @@ -736,6 +712,11 @@ impl PageServerHandler { std::thread::sleep(std::time::Duration::from_millis(1000)); } */ + + // FIXME: this profiling now happens at different place than it used to. 
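The per-request handlers above move from manually entered info_span!s to the #[instrument] attribute with skip(...) and fields(...). A small sketch of that attribute in isolation; handle_request and its fields are illustrative:

```rust
use tracing::instrument;

#[instrument(skip(payload), fields(len = payload.len()))]
async fn handle_request(req_id: u64, payload: Vec<u8>) -> usize {
    // The span stays attached across this await point.
    tokio::task::yield_now().await;
    payload.len()
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt::init();
    let n = handle_request(42, vec![0u8; 8]).await;
    println!("handled {n} bytes");
}
```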
The + // current profiling is based on a thread-local variable, so it doesn't work + // across awaits + let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -743,23 +724,23 @@ impl PageServerHandler { })) } - fn handle_basebackup_request( + #[instrument(skip(self, pgb))] + async fn handle_basebackup_request( &self, pgb: &mut PostgresBackend, + tenant_id: ZTenantId, timeline_id: ZTimelineId, lsn: Option, prev_lsn: Option, - tenant_id: ZTenantId, full_backup: bool, ) -> anyhow::Result<()> { - let span = info_span!("basebackup", timeline = %timeline_id, tenant = %tenant_id, lsn = field::Empty); - let _enter = span.enter(); - info!("starting"); - // check that the timeline exists let timeline = get_local_timeline(tenant_id, timeline_id)?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { + // Backup was requested at a particular LSN. Wait for it to arrive. + info!("waiting for {}", lsn); + timeline.wait_lsn(lsn).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -767,18 +748,22 @@ impl PageServerHandler { // switch client to COPYOUT pgb.write_message(&BeMessage::CopyOutResponse)?; + pgb.flush().await?; /* Send a tarball of the latest layer on the timeline */ - { - let mut writer = CopyDataSink { pgb }; - + let mut writer = CopyDataSink { + pgb, + rt: tokio::runtime::Handle::current(), + }; + tokio::task::block_in_place(|| { let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?; - span.record("lsn", &basebackup.lsn.to_string().as_str()); - basebackup.send_tarball()?; - } + tracing::Span::current().record("lsn", &basebackup.lsn.to_string().as_str()); + basebackup.send_tarball() + })?; pgb.write_message(&BeMessage::CopyDone)?; - info!("done"); + pgb.flush().await?; + info!("basebackup complete"); Ok(()) } @@ -801,7 +786,8 @@ impl PageServerHandler { } } -impl postgres_backend::Handler for PageServerHandler { +#[async_trait::async_trait] +impl postgres_backend_async::Handler for PageServerHandler { fn check_auth_jwt( &mut self, _pgb: &mut PostgresBackend, @@ -831,11 +817,7 @@ impl postgres_backend::Handler for PageServerHandler { Ok(()) } - fn is_shutdown_requested(&self) -> bool { - thread_mgr::is_shutdown_requested() - } - - fn process_query( + async fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, @@ -849,12 +831,13 @@ impl postgres_backend::Handler for PageServerHandler { params.len() == 2, "invalid param number for pagestream command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; - self.check_permission(Some(tenantid))?; + self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, timelineid, tenantid)?; + self.handle_pagerequests(pgb, tenant_id, timeline_id) + .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); @@ -864,10 +847,10 @@ impl postgres_backend::Handler for PageServerHandler { "invalid param number for basebackup command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id 
= ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; - self.check_permission(Some(tenantid))?; + self.check_permission(Some(tenant_id))?; let lsn = if params.len() == 3 { Some(Lsn::from_str(params[2])?) @@ -876,8 +859,9 @@ impl postgres_backend::Handler for PageServerHandler { }; // Check that the timeline exists - self.handle_basebackup_request(pgb, timelineid, lsn, None, tenantid, false)?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false) + .await?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } // return pair of prev_lsn and last_lsn else if query_string.starts_with("get_last_record_rlsn ") { @@ -897,11 +881,11 @@ impl postgres_backend::Handler for PageServerHandler { let end_of_timeline = timeline.get_last_record_rlsn(); - pgb.write_message_noflush(&BeMessage::RowDescription(&[ + pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::text_col(b"prev_lsn"), RowDescriptor::text_col(b"last_lsn"), ]))? - .write_message_noflush(&BeMessage::DataRow(&[ + .write_message(&BeMessage::DataRow(&[ Some(end_of_timeline.prev.to_string().as_bytes()), Some(end_of_timeline.last.to_string().as_bytes()), ]))? @@ -917,8 +901,8 @@ impl postgres_backend::Handler for PageServerHandler { "invalid param number for fullbackup command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; // The caller is responsible for providing correct lsn and prev_lsn. let lsn = if params.len() > 2 { @@ -932,11 +916,12 @@ impl postgres_backend::Handler for PageServerHandler { None }; - self.check_permission(Some(tenantid))?; + self.check_permission(Some(tenant_id))?; // Check that the timeline exists - self.handle_basebackup_request(pgb, timelineid, lsn, prev_lsn, tenantid, true)?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true) + .await?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("import basebackup ") { // Import the `base` section (everything but the wal) of a basebackup. // Assumes the tenant already exists on this pageserver. @@ -952,18 +937,21 @@ impl postgres_backend::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!(params.len() == 4); - let tenant = ZTenantId::from_str(params[0])?; - let timeline = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; let base_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; - self.check_permission(Some(tenant))?; + self.check_permission(Some(tenant_id))?; - match self.handle_import_basebackup(pgb, tenant, timeline, base_lsn, end_lsn) { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + match self + .handle_import_basebackup(pgb, tenant_id, timeline_id, base_lsn, end_lsn) + .await + { + Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))? 
+ pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))? } }; } else if query_string.starts_with("import wal ") { @@ -974,24 +962,27 @@ impl postgres_backend::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("import wal ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!(params.len() == 4); - let tenant = ZTenantId::from_str(params[0])?; - let timeline = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; let start_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; - self.check_permission(Some(tenant))?; + self.check_permission(Some(tenant_id))?; - match self.handle_import_wal(pgb, tenant, timeline, start_lsn, end_lsn) { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + match self + .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn) + .await + { + Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))? + pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))? } }; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("failpoints ") { ensure!(fail::has_failpoints(), "Cannot manage failpoints because pageserver was compiled without failpoints support"); @@ -1016,7 +1007,7 @@ impl postgres_backend::Handler for PageServerHandler { bail!("Invalid failpoints format"); } } - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("show ") { // show let (_, params_raw) = query_string.split_at("show ".len()); @@ -1024,7 +1015,7 @@ impl postgres_backend::Handler for PageServerHandler { ensure!(params.len() == 1, "invalid param number for config command"); let tenantid = ZTenantId::from_str(params[0])?; let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ + pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), RowDescriptor::int8_col(b"compaction_target_size"), @@ -1035,7 +1026,7 @@ impl postgres_backend::Handler for PageServerHandler { RowDescriptor::int8_col(b"image_creation_threshold"), RowDescriptor::int8_col(b"pitr_interval"), ]))? 
- .write_message_noflush(&BeMessage::DataRow(&[ + .write_message(&BeMessage::DataRow(&[ Some(repo.get_checkpoint_distance().to_string().as_bytes()), Some( repo.get_checkpoint_timeout() @@ -1072,10 +1063,10 @@ impl postgres_backend::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid do_gc: '{}'", query_string))?; - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; let gc_horizon: u64 = caps .get(4) @@ -1084,8 +1075,8 @@ impl postgres_backend::Handler for PageServerHandler { // Use tenant's pitr setting let pitr = repo.get_pitr_interval(); - let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ + let result = repo.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; + pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"layers_total"), RowDescriptor::int8_col(b"layers_needed_by_cutoff"), RowDescriptor::int8_col(b"layers_needed_by_pitr"), @@ -1094,7 +1085,7 @@ impl postgres_backend::Handler for PageServerHandler { RowDescriptor::int8_col(b"layers_removed"), RowDescriptor::int8_col(b"elapsed"), ]))? - .write_message_noflush(&BeMessage::DataRow(&[ + .write_message(&BeMessage::DataRow(&[ Some(result.layers_total.to_string().as_bytes()), Some(result.layers_needed_by_cutoff.to_string().as_bytes()), Some(result.layers_needed_by_pitr.to_string().as_bytes()), @@ -1121,8 +1112,8 @@ impl postgres_backend::Handler for PageServerHandler { let timeline = get_local_timeline(tenant_id, timeline_id)?; timeline.compact()?; - pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&SINGLE_COL_ROWDESC)? + .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("checkpoint ") { // Run checkpoint immediately on given timeline. @@ -1140,8 +1131,8 @@ impl postgres_backend::Handler for PageServerHandler { // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). timeline.checkpoint(CheckpointConfig::Forced)?; - pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&SINGLE_COL_ROWDESC)? + .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("get_lsn_by_timestamp ") { // Locate LSN of last transaction with timestamp less or equal than sppecified // TODO lazy static @@ -1158,7 +1149,7 @@ impl postgres_backend::Handler for PageServerHandler { let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; let timestamp_pg = to_pg_timestamp(timestamp); - pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( + pgb.write_message(&BeMessage::RowDescription(&[RowDescriptor::text_col( b"lsn", )]))?; let result = match timeline.find_lsn_for_timestamp(timestamp_pg)? 
{ @@ -1167,14 +1158,12 @@ impl postgres_backend::Handler for PageServerHandler { LsnForTimestamp::Past(_lsn) => "past".into(), LsnForTimestamp::NoData(_lsn) => "nodata".into(), }; - pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?; + pgb.write_message(&BeMessage::DataRow(&[Some(result.as_bytes())]))?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { bail!("unknown command"); } - pgb.flush()?; - Ok(()) } } @@ -1194,6 +1183,7 @@ fn get_local_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Result< /// struct CopyDataSink<'a> { pgb: &'a mut PostgresBackend, + rt: tokio::runtime::Handle, } impl<'a> io::Write for CopyDataSink<'a> { @@ -1205,6 +1195,7 @@ impl<'a> io::Write for CopyDataSink<'a> { // FIXME: flush isn't really required, but makes it easier // to view in wireshark self.pgb.write_message(&BeMessage::CopyData(data))?; + self.rt.block_on(self.pgb.flush())?; trace!("CopyData sent for {} bytes!", data.len()); Ok(data.len()) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 57a964cb67..8ebfa6a935 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -37,7 +37,7 @@ //! | access to this storage | //! +------------------------+ //! -//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so. +//! First, during startup, the pageserver inits the storage sync task with the async loop, or leaves the loop uninitialised, if configured so. //! The loop inits the storage connection and checks the remote files stored. //! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server). //! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can @@ -158,7 +158,6 @@ use once_cell::sync::OnceCell; use remote_storage::GenericRemoteStorage; use tokio::{ fs, - runtime::Runtime, time::{Duration, Instant}, }; use tracing::*; @@ -174,9 +173,10 @@ use crate::{ exponential_backoff, layered_repository::metadata::{metadata_path, TimelineMetadata}, storage_sync::index::RemoteIndex, + task_mgr, + task_mgr::TaskKind, + task_mgr::BACKGROUND_RUNTIME, tenant_mgr::attach_local_tenants, - thread_mgr, - thread_mgr::ThreadKind, }; use crate::{ metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD}, @@ -264,7 +264,7 @@ impl SyncQueue { .unwrap() .0; - if thread_mgr::is_shutdown_requested() { + if task_mgr::is_shutdown_requested() { return (HashMap::new(), q.len()); } } @@ -574,7 +574,7 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { /// Launch a thread to perform remote storage sync tasks. /// See module docs for loop step description. 
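CopyDataSink above keeps its std::io::Write interface but now pushes data out through the async backend by blocking on a runtime Handle. A generic sketch of that sync-to-async write adapter; BlockingSink is a made-up name, and like the basebackup path above it must be driven from block_in_place or spawn_blocking:

```rust
use std::io::{self, Write};
use tokio::io::{AsyncWrite, AsyncWriteExt};
use tokio::runtime::Handle;

struct BlockingSink<W: AsyncWrite + Unpin> {
    inner: W,
    rt: Handle,
}

impl<W: AsyncWrite + Unpin> Write for BlockingSink<W> {
    fn write(&mut self, data: &[u8]) -> io::Result<usize> {
        // Forward the whole chunk to the async writer, blocking until it is accepted.
        self.rt.block_on(self.inner.write_all(data))?;
        Ok(data.len())
    }
    fn flush(&mut self) -> io::Result<()> {
        self.rt.block_on(self.inner.flush())
    }
}

#[tokio::main(flavor = "multi_thread")]
async fn main() -> io::Result<()> {
    let mut sink = BlockingSink {
        inner: tokio::io::stdout(),
        rt: Handle::current(),
    };
    // Handle::block_on must not run on an async worker directly, hence block_in_place.
    tokio::task::block_in_place(move || {
        sink.write_all(b"hello from a blocking writer\n")?;
        sink.flush()
    })
}
```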
-pub fn spawn_storage_sync_thread( +pub fn spawn_storage_sync_task( conf: &'static PageServerConf, local_timeline_files: TenantTimelineValues<(TimelineMetadata, HashSet)>, storage: GenericRemoteStorage, @@ -590,11 +590,6 @@ pub fn spawn_storage_sync_thread( None => bail!("Could not get sync queue during the sync loop step, aborting"), }; - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .context("Failed to create storage sync runtime")?; - // TODO we are able to "attach" empty tenants, but not doing it now since it might require big wait time: // * we need to list every timeline for tenant on S3, that might be a costly operation // * we need to download every timeline for the tenant, to activate it in memory @@ -616,7 +611,7 @@ pub fn spawn_storage_sync_thread( } } - let applicable_index_parts = runtime.block_on(download_index_parts( + let applicable_index_parts = BACKGROUND_RUNTIME.block_on(download_index_parts( conf, &storage, keys_for_index_part_downloads, @@ -625,7 +620,7 @@ pub fn spawn_storage_sync_thread( let remote_index = RemoteIndex::from_parts(conf, applicable_index_parts)?; let mut local_timeline_init_statuses = schedule_first_sync_tasks( - &mut runtime.block_on(remote_index.write()), + &mut BACKGROUND_RUNTIME.block_on(remote_index.write()), sync_queue, timelines_to_sync, ); @@ -634,31 +629,30 @@ pub fn spawn_storage_sync_thread( .extend(empty_tenants.0.into_iter()); let remote_index_clone = remote_index.clone(); - thread_mgr::spawn( - ThreadKind::StorageSync, + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::StorageSync, None, None, - "Remote storage sync thread", + "Remote storage sync task", false, - move || { + async move { storage_sync_loop( - runtime, conf, (storage, remote_index_clone, sync_queue), max_sync_errors, - ); + ) + .await; Ok(()) }, - ) - .context("Failed to spawn remote storage sync thread")?; + ); Ok(SyncStartupData { remote_index, local_timeline_init_statuses, }) } -fn storage_sync_loop( - runtime: Runtime, +async fn storage_sync_loop( conf: &'static PageServerConf, (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, @@ -669,7 +663,7 @@ fn storage_sync_loop( let (batched_tasks, remaining_queue_length) = sync_queue.next_task_batch(); - if thread_mgr::is_shutdown_requested() { + if task_mgr::is_shutdown_requested() { info!("Shutdown requested, stopping"); break; } @@ -683,20 +677,19 @@ fn storage_sync_loop( } // Concurrently perform all the tasks in the batch - let loop_step = runtime.block_on(async { - tokio::select! { - step = process_batches( - conf, - max_sync_errors, - loop_storage, - &index, - batched_tasks, - sync_queue, - ) - .instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step), - _ = thread_mgr::shutdown_watcher() => ControlFlow::Break(()), - } - }); + let loop_step = tokio::select! 
{ + step = process_batches( + conf, + max_sync_errors, + loop_storage, + &index, + batched_tasks, + sync_queue, + ) + .instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step) + , + _ = task_mgr::shutdown_watcher() => ControlFlow::Break(()), + }; match loop_step { ControlFlow::Continue(updated_tenants) => { @@ -708,7 +701,7 @@ fn storage_sync_loop( updated_tenants.len() ); let mut timelines_to_attach = TenantTimelineValues::new(); - let index_accessor = runtime.block_on(index.read()); + let index_accessor = index.read().await; for tenant_id in updated_tenants { let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { Some(tenant_entry) => tenant_entry, diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 7070f941f5..a4285e426b 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -153,7 +153,7 @@ pub(super) async fn upload_timeline_layers<'a>( // We have run the upload sync task, but the file we wanted to upload is gone. // This is "fine" due the asynchronous nature of the sync loop: it only reacts to events and might need to // retry the upload tasks, if S3 or network is down: but during this time, pageserver might still operate and - // run compaction/gc threads, removing redundant files from disk. + // run compaction/gc tasks, removing redundant files from disk. // It's not good to pause GC/compaction because of those and we would rather skip such uploads. // // Yet absence of such files might also mean that the timeline metadata file was updated (GC moves the Lsn forward, for instance). diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs new file mode 100644 index 0000000000..2aa803d119 --- /dev/null +++ b/pageserver/src/task_mgr.rs @@ -0,0 +1,463 @@ +//! +//! This module provides centralized handling of tokio tasks in the Page Server. +//! +//! We provide a few basic facilities: +//! - A global registry of tasks that lists what kind of tasks they are, and +//! which tenant or timeline they are working on +//! +//! - The ability to request a task to shut down. +//! +//! +//! # How it works? +//! +//! There is a global hashmap of all the tasks (`TASKS`). Whenever a new +//! task is spawned, a PageServerTask entry is added there, and when a +//! task dies, it removes itself from the hashmap. If you want to kill a +//! task, you can scan the hashmap to find it. +//! +//! # Task shutdown +//! +//! To kill a task, we rely on co-operation from the victim. Each task is +//! expected to periodically call the `is_shutdown_requested()` function, and +//! if it returns true, exit gracefully. In addition to that, when waiting for +//! the network or other long-running operation, you can use +//! `shutdown_watcher()` function to get a Future that will become ready if +//! the current task has been requested to shut down. You can use that with +//! Tokio select!(). +//! +//! +//! TODO: This would be a good place to also handle panics in a somewhat sane way. +//! Depending on what task panics, we might want to kill the whole server, or +//! only a single tenant or timeline. +//! + +// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro. +// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224. 
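storage_sync_loop() above now selects directly between the batch step and the shutdown watcher, using ControlFlow to decide whether to keep looping. A stripped-down sketch of that loop shape; process_batch and the timings are placeholders:

```rust
use std::ops::ControlFlow;
use std::time::Duration;
use tokio::sync::watch;

async fn process_batch(done: u32) -> u32 {
    tokio::time::sleep(Duration::from_millis(5)).await;
    done + 1
}

#[tokio::main]
async fn main() {
    let (shutdown_tx, mut shutdown_rx) = watch::channel(false);
    tokio::spawn(async move {
        tokio::time::sleep(Duration::from_millis(20)).await;
        shutdown_tx.send(true).ok();
    });

    let mut processed = 0;
    loop {
        let step = tokio::select! {
            n = process_batch(processed) => ControlFlow::Continue(n),
            _ = shutdown_rx.changed() => ControlFlow::Break(()),
        };
        match step {
            ControlFlow::Continue(n) => processed = n,
            ControlFlow::Break(()) => break,
        }
    }
    println!("processed {processed} batches before shutdown");
}
```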
+#![allow(clippy::declare_interior_mutable_const)] + +use std::collections::HashMap; +use std::future::Future; +use std::panic::AssertUnwindSafe; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Mutex}; + +use futures::FutureExt; +use tokio::runtime::Runtime; +use tokio::sync::watch; +use tokio::task::JoinHandle; +use tokio::task_local; + +use tracing::{debug, error, info, warn}; + +use once_cell::sync::Lazy; + +use utils::zid::{ZTenantId, ZTimelineId}; + +use crate::shutdown_pageserver; + +// +// There are four runtimes: +// +// Compute request runtime +// - used to handle connections from compute nodes. Any tasks related to satisfying +// GetPage requests, base backups, import, and other such compute node operations +// are handled by the Compute request runtime +// - page_service.rs +// - this includes layer downloads from remote storage, if a layer is needed to +// satisfy a GetPage request +// +// Management request runtime +// - used to handle HTTP API requests +// +// WAL receiver runtime: +// - used to handle WAL receiver connections. +// - and to receiver updates from etcd +// +// Background runtime +// - layer flushing +// - garbage collection +// - compaction +// - remote storage uploads +// - initial tenant loading +// +// Everything runs in a tokio task. If you spawn new tasks, spawn it using the correct +// runtime. +// +// There might be situations when one task needs to wait for a task running in another +// Runtime to finish. For example, if a background operation needs a layer from remote +// storage, it will start to download it. If a background operation needs a remote layer, +// and the download was already initiated by a GetPage request, the background task +// will wait for the download - running in the Page server runtime - to finish. +// Another example: the initial tenant loading tasks are launched in the background ops +// runtime. If a GetPage request comes in before the load of a tenant has finished, the +// GetPage request will wait for the tenant load to finish. +// +// The core Timeline code is synchronous, and uses a bunch of std Mutexes and RWLocks to +// protect data structures. Let's keep it that way. Synchronous code is easier to debug +// and analyze, and there's a lot of hairy, low-level, performance critical code there. +// +// It's nice to have different runtimes, so that you can quickly eyeball how much CPU +// time each class of operations is taking, with 'top -H' or similar. +// +// It's also good to avoid hogging all threads that would be needed to process +// other operations, if the upload tasks e.g. get blocked on locks. It shouldn't +// happen, but still. 
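
A minimal usage sketch (an editor's illustration, not part of this patch) of how the runtimes described above combine with the `spawn`, `TaskKind` and `shutdown_watcher` machinery defined later in this file: the caller picks the runtime that matches the work, hands a future to `task_mgr::spawn`, and the task body cooperates with shutdown by awaiting `shutdown_watcher()` inside `tokio::select!`. The task kind, sleep interval and helper name are illustrative assumptions only.

    use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
    use utils::zid::ZTenantId;

    // Hypothetical helper: run a periodic per-tenant loop on the background runtime.
    fn spawn_example_loop(tenant_id: ZTenantId) {
        task_mgr::spawn(
            BACKGROUND_RUNTIME.handle(),
            TaskKind::Compaction, // illustrative; use the kind that matches the work
            Some(tenant_id),
            None,
            &format!("example loop for tenant {tenant_id}"),
            false, // a failure here should not shut down the whole pageserver
            async move {
                loop {
                    // ... one iteration of the actual work would go here ...
                    tokio::select! {
                        _ = task_mgr::shutdown_watcher() => break, // co-operative shutdown
                        _ = tokio::time::sleep(std::time::Duration::from_secs(10)) => {}
                    }
                }
                Ok(())
            },
        );
    }
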
+// +pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("compute request worker") + .enable_all() + .build() + .expect("Failed to create compute request runtime") +}); + +pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("mgmt request worker") + .enable_all() + .build() + .expect("Failed to create mgmt request runtime") +}); + +pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("walreceiver worker") + .enable_all() + .build() + .expect("Failed to create walreceiver runtime") +}); + +pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("background op worker") + .enable_all() + .build() + .expect("Failed to create background op runtime") +}); + +pub struct PageserverTaskId(u64); + +/// Each task that we track is associated with a "task ID". It's just an +/// increasing number that we assign. Note that it is different from tokio::task::Id. +static NEXT_TASK_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); + +/// Global registry of tasks +static TASKS: Lazy>>> = + Lazy::new(|| Mutex::new(HashMap::new())); + +task_local! { + // There is a Tokio watch channel for each task, which can be used to signal the + // task that it needs to shut down. This task local variable holds the receiving + // end of the channel. The sender is kept in the global registry, so that anyone + // can send the signal to request task shutdown. + static SHUTDOWN_RX: watch::Receiver; + + // Each task holds reference to its own PageServerTask here. + static CURRENT_TASK: Arc; +} + +/// +/// There are many kinds of tasks in the system. Some are associated with a particular +/// tenant or timeline, while others are global. +/// +/// Note that we don't try to limit how many task of a certain kind can be running +/// at the same time. +/// +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum TaskKind { + // libpq listener task. It just accepts connection and spawns a + // PageRequestHandler task for each connection. + LibpqEndpointListener, + + // HTTP endpoint listener. + HttpEndpointListener, + + // Task that handles a single connection. A PageRequestHandler task + // starts detached from any particular tenant or timeline, but it can be + // associated with one later, after receiving a command from the client. + PageRequestHandler, + + // Manages the WAL receiver connection for one timeline. It subscribes to + // events from etcd, decides which safekeeper to connect to. It spawns a + // separate WalReceiverConnection task to handle each connection. + WalReceiverManager, + + // Handles a connection to a safekeeper, to stream WAL to a timeline. + WalReceiverConnection, + + // Garbage collection worker. One per tenant + GarbageCollector, + + // Compaction. One per tenant. + Compaction, + + // Initial logical size calculation + InitialLogicalSizeCalculation, + + // Task that flushes frozen in-memory layers to disk + LayerFlushTask, + + // Task that manages the remote upload queue + StorageSync, + + // task that handles the initial downloading of all tenants + InitialLoad, + + // task that handles attaching a tenant + Attach, +} + +#[derive(Default)] +struct MutableTaskState { + /// Tenant and timeline that this task is associated with. + tenant_id: Option, + timeline_id: Option, + + /// Handle for waiting for the task to exit. 
It can be None, if the + /// the task has already exited. + join_handle: Option>, +} + +struct PageServerTask { + #[allow(dead_code)] // unused currently + task_id: PageserverTaskId, + + kind: TaskKind, + + name: String, + + // To request task shutdown, send 'true' to the channel to notify the task. + shutdown_tx: watch::Sender, + + mutable: Mutex, +} + +/// Launch a new task +/// Note: if shutdown_process_on_error is set to true failure +/// of the task will lead to shutdown of entire process +pub fn spawn( + runtime: &tokio::runtime::Handle, + kind: TaskKind, + tenant_id: Option, + timeline_id: Option, + name: &str, + shutdown_process_on_error: bool, + future: F, +) -> PageserverTaskId +where + F: Future> + Send + 'static, +{ + let (shutdown_tx, shutdown_rx) = watch::channel(false); + let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed); + let task = Arc::new(PageServerTask { + task_id: PageserverTaskId(task_id), + kind, + name: name.to_string(), + shutdown_tx, + mutable: Mutex::new(MutableTaskState { + tenant_id, + timeline_id, + join_handle: None, + }), + }); + + TASKS.lock().unwrap().insert(task_id, Arc::clone(&task)); + + let mut task_mut = task.mutable.lock().unwrap(); + + let task_name = name.to_string(); + let task_cloned = Arc::clone(&task); + let join_handle = runtime.spawn(task_wrapper( + task_name, + task_id, + task_cloned, + shutdown_rx, + shutdown_process_on_error, + future, + )); + task_mut.join_handle = Some(join_handle); + drop(task_mut); + + // The task is now running. Nothing more to do here + PageserverTaskId(task_id) +} + +/// This wrapper function runs in a newly-spawned task. It initializes the +/// task-local variables and calls the payload function. +async fn task_wrapper( + task_name: String, + task_id: u64, + task: Arc, + shutdown_rx: watch::Receiver, + shutdown_process_on_error: bool, + future: F, +) where + F: Future> + Send + 'static, +{ + debug!("Starting task '{}'", task_name); + + let result = SHUTDOWN_RX + .scope( + shutdown_rx, + CURRENT_TASK.scope(task, { + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. + AssertUnwindSafe(future).catch_unwind() + }), + ) + .await; + task_finish(result, task_name, task_id, shutdown_process_on_error).await; +} + +async fn task_finish( + result: std::result::Result< + anyhow::Result<()>, + std::boxed::Box, + >, + task_name: String, + task_id: u64, + shutdown_process_on_error: bool, +) { + // Remove our entry from the global hashmap. 
+ let task = TASKS + .lock() + .unwrap() + .remove(&task_id) + .expect("no task in registry"); + + let mut shutdown_process = false; + { + let task_mut = task.mutable.lock().unwrap(); + + match result { + Ok(Ok(())) => { + debug!("Task '{}' exited normally", task_name); + } + Ok(Err(err)) => { + if shutdown_process_on_error { + error!( + "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + shutdown_process = true; + } else { + error!( + "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + } + } + Err(err) => { + if shutdown_process_on_error { + error!( + "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + shutdown_process = true; + } else { + error!( + "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + } + } + } + } + + if shutdown_process { + shutdown_pageserver(1).await; + } +} + +// expected to be called from the task of the given id. +pub fn associate_with(tenant_id: Option, timeline_id: Option) { + CURRENT_TASK.with(|ct| { + let mut task_mut = ct.mutable.lock().unwrap(); + task_mut.tenant_id = tenant_id; + task_mut.timeline_id = timeline_id; + }); +} + +/// Is there a task running that matches the criteria + +/// Signal and wait for tasks to shut down. +/// +/// +/// The arguments are used to select the tasks to kill. Any None arguments are +/// ignored. For example, to shut down all WalReceiver tasks: +/// +/// shutdown_tasks(Some(TaskKind::WalReceiver), None, None) +/// +/// Or to shut down all tasks for given timeline: +/// +/// shutdown_tasks(None, Some(tenantid), Some(timelineid)) +/// +pub async fn shutdown_tasks( + kind: Option, + tenant_id: Option, + timeline_id: Option, +) { + let mut victim_tasks = Vec::new(); + + { + let tasks = TASKS.lock().unwrap(); + for task in tasks.values() { + let task_mut = task.mutable.lock().unwrap(); + if (kind.is_none() || Some(task.kind) == kind) + && (tenant_id.is_none() || task_mut.tenant_id == tenant_id) + && (timeline_id.is_none() || task_mut.timeline_id == timeline_id) + { + let _ = task.shutdown_tx.send_replace(true); + victim_tasks.push(Arc::clone(task)); + } + } + } + + for task in victim_tasks { + let join_handle = { + let mut task_mut = task.mutable.lock().unwrap(); + info!("waiting for {} to shut down", task.name); + let join_handle = task_mut.join_handle.take(); + drop(task_mut); + join_handle + }; + if let Some(join_handle) = join_handle { + let _ = join_handle.await; + } else { + // Possibly one of: + // * The task had not even fully started yet. + // * It was shut down concurrently and already exited + } + } +} + +pub fn current_task_kind() -> Option { + CURRENT_TASK.try_with(|ct| ct.kind).ok() +} + +/// A Future that can be used to check if the current task has been requested to +/// shut down. +pub async fn shutdown_watcher() { + let mut shutdown_rx = SHUTDOWN_RX + .try_with(|rx| rx.clone()) + .expect("shutdown_requested() called in an unexpected task or thread"); + + while !*shutdown_rx.borrow() { + if shutdown_rx.changed().await.is_err() { + break; + } + } +} + +/// Has the current task been requested to shut down? 
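///
/// A hypothetical polling pattern (an editor's sketch, not part of the original
/// patch; `do_one_unit_of_work` stands in for whatever the task actually does):
///
///     while !task_mgr::is_shutdown_requested() {
///         do_one_unit_of_work();
///     }
///     // shutdown was requested, exit gracefully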
+pub fn is_shutdown_requested() -> bool { + if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) { + *shutdown_rx.borrow() + } else { + if !cfg!(test) { + warn!("is_shutdown_requested() called in an unexpected task or thread"); + } + false + } +} diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index baa58f5eb5..db256b0f65 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -5,14 +5,14 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; use crate::layered_repository::ephemeral_file::is_ephemeral_file; use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}; -use crate::layered_repository::{Repository, Timeline}; +use crate::layered_repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; +use crate::task_mgr::{self, TaskKind}; use crate::tenant_config::TenantConfOpt; -use crate::thread_mgr::ThreadKind; -use crate::walredo::PostgresRedoManager; -use crate::{thread_mgr, timelines, walreceiver, TenantTimelineValues, TEMP_FILE_SUFFIX}; -use anyhow::Context; +use crate::walredo::{PostgresRedoManager, WalRedoManager}; +use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; +use anyhow::{ensure, Context}; use remote_storage::GenericRemoteStorage; use serde::{Deserialize, Serialize}; use std::collections::hash_map::{self, Entry}; @@ -21,34 +21,24 @@ use std::ffi::OsStr; use std::fmt; use std::path::{Path, PathBuf}; use std::sync::Arc; -use tokio::sync::mpsc; use tracing::*; -pub use tenants_state::try_send_timeline_update; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +use utils::crashsafe_dir; +use utils::zid::{ZTenantId, ZTimelineId}; mod tenants_state { - use anyhow::ensure; use once_cell::sync::Lazy; use std::{ collections::HashMap, sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}, }; - use tokio::sync::mpsc; - use tracing::{debug, error}; use utils::zid::ZTenantId; - use crate::tenant_mgr::{LocalTimelineUpdate, Tenant}; + use crate::tenant_mgr::Tenant; static TENANTS: Lazy>> = Lazy::new(|| RwLock::new(HashMap::new())); - /// Sends updates to the local timelines (creation and deletion) to the WAL receiver, - /// so that it can enable/disable corresponding processes. 
- static TIMELINE_UPDATE_SENDER: Lazy< - RwLock>>, - > = Lazy::new(|| RwLock::new(None)); - pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap> { TENANTS .read() @@ -60,39 +50,6 @@ mod tenants_state { .write() .expect("Failed to write() tenants lock, it got poisoned") } - - pub(super) fn set_timeline_update_sender( - timeline_updates_sender: mpsc::UnboundedSender, - ) -> anyhow::Result<()> { - let mut sender_guard = TIMELINE_UPDATE_SENDER - .write() - .expect("Failed to write() timeline_update_sender lock, it got poisoned"); - ensure!(sender_guard.is_none(), "Timeline update sender already set"); - *sender_guard = Some(timeline_updates_sender); - Ok(()) - } - - pub fn try_send_timeline_update(update: LocalTimelineUpdate) { - match TIMELINE_UPDATE_SENDER - .read() - .expect("Failed to read() timeline_update_sender lock, it got poisoned") - .as_ref() - { - Some(sender) => { - if let Err(e) = sender.send(update) { - error!("Failed to send timeline update: {}", e); - } - } - None => debug!("Timeline update sender is not enabled, cannot send update {update:?}"), - } - } - - pub(super) fn stop_timeline_update_sender() { - TIMELINE_UPDATE_SENDER - .write() - .expect("Failed to write() timeline_update_sender lock, it got poisoned") - .take(); - } } struct Tenant { @@ -103,9 +60,6 @@ struct Tenant { #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] pub enum TenantState { - // All data for this tenant is complete on local disk, but we haven't loaded the Repository, - // Timeline and Layer structs into memory yet, so it cannot be accessed yet. - //Ready, // This tenant exists on local disk, and the layer map has been loaded into memory. // The local disk might have some newer files that don't exist in cloud storage yet. Active, @@ -139,10 +93,6 @@ pub fn init_tenant_mgr( remote_storage: Option, ) -> anyhow::Result { let _entered = info_span!("init_tenant_mgr").entered(); - let (timeline_updates_sender, timeline_updates_receiver) = - mpsc::unbounded_channel::(); - tenants_state::set_timeline_update_sender(timeline_updates_sender)?; - walreceiver::init_wal_receiver_main_thread(conf, timeline_updates_receiver)?; let local_tenant_files = local_tenant_timeline_files(conf) .context("Failed to collect local tenant timeline files")?; @@ -156,7 +106,7 @@ pub fn init_tenant_mgr( let SyncStartupData { remote_index, local_timeline_init_statuses, - } = storage_sync::spawn_storage_sync_thread( + } = storage_sync::spawn_storage_sync_task( conf, local_tenant_files, storage, @@ -185,27 +135,6 @@ pub fn init_tenant_mgr( Ok(remote_index) } -pub enum LocalTimelineUpdate { - Detach { - id: ZTenantTimelineId, - // used to signal to the detach caller that walreceiver successfully terminated for specified id - join_confirmation_sender: std::sync::mpsc::Sender<()>, - }, - Attach { - id: ZTenantTimelineId, - timeline: Arc, - }, -} - -impl std::fmt::Debug for LocalTimelineUpdate { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Detach { id, .. } => f.debug_tuple("Detach").field(id).finish(), - Self::Attach { id, .. } => f.debug_tuple("Attach").field(id).finish(), - } - } -} - /// Reads local files to load tenants and their timelines given into pageserver's memory. /// Ignores other timelines that might be present for tenant, but were not passed as a parameter. /// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken", @@ -274,24 +203,26 @@ fn load_local_repo( /// /// Shut down all tenants. 
This runs as part of pageserver shutdown. /// -pub fn shutdown_all_tenants() { - tenants_state::stop_timeline_update_sender(); - let mut m = tenants_state::write_tenants(); - let mut tenantids = Vec::new(); - for (tenantid, tenant) in m.iter_mut() { - match tenant.state { - TenantState::Active | TenantState::Idle | TenantState::Stopping => { - tenant.state = TenantState::Stopping; - tenantids.push(*tenantid) +pub async fn shutdown_all_tenants() { + let tenantids = { + let mut m = tenants_state::write_tenants(); + let mut tenantids = Vec::new(); + for (tenantid, tenant) in m.iter_mut() { + match tenant.state { + TenantState::Active | TenantState::Idle | TenantState::Stopping => { + tenant.state = TenantState::Stopping; + tenantids.push(*tenantid) + } + TenantState::Broken => {} } - TenantState::Broken => {} } - } - drop(m); + drop(m); + tenantids + }; - thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None); + task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; - // Ok, no background threads running anymore. Flush any remaining data in + // Ok, no background tasks running anymore. Flush any remaining data in // memory to disk. // // We assume that any incoming connections that might request pages from @@ -314,7 +245,40 @@ pub fn shutdown_all_tenants() { } } -pub fn create_tenant_repository( +fn create_repo( + conf: &'static PageServerConf, + tenant_conf: TenantConfOpt, + tenant_id: ZTenantId, + wal_redo_manager: Arc, + remote_index: RemoteIndex, +) -> anyhow::Result> { + let repo_dir = conf.tenant_path(&tenant_id); + ensure!( + !repo_dir.exists(), + "cannot create new tenant repo: '{}' directory already exists", + tenant_id + ); + + // top-level dir may exist if we are creating it through CLI + crashsafe_dir::create_dir_all(&repo_dir) + .with_context(|| format!("could not create directory {}", repo_dir.display()))?; + crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; + info!("created directory structure in {}", repo_dir.display()); + + // Save tenant's config + Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; + + Ok(Arc::new(Repository::new( + conf, + tenant_conf, + wal_redo_manager, + tenant_id, + remote_index, + conf.remote_storage_config.is_some(), + ))) +} + +pub fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: ZTenantId, @@ -327,17 +291,12 @@ pub fn create_tenant_repository( } Entry::Vacant(v) => { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); - let repo = timelines::create_repo( - conf, - tenant_conf, - tenant_id, - wal_redo_manager, - remote_index, - )?; + let repo = create_repo(conf, tenant_conf, tenant_id, wal_redo_manager, remote_index)?; v.insert(Tenant { - state: TenantState::Idle, + state: TenantState::Active, repo, }); + crate::tenant_tasks::start_background_loops(tenant_id); Ok(Some(tenant_id)) } } @@ -360,13 +319,15 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option { } pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { - let mut m = tenants_state::write_tenants(); - let tenant = m - .get_mut(&tenant_id) - .with_context(|| format!("Tenant not found for id {tenant_id}"))?; - let old_state = tenant.state; - tenant.state = new_state; - drop(m); + let old_state = { + let mut m = tenants_state::write_tenants(); + let tenant = m + .get_mut(&tenant_id) + .with_context(|| format!("Tenant not found for id {tenant_id}"))?; + let old_state = tenant.state; + tenant.state = new_state; + 
old_state + }; match (old_state, new_state) { (TenantState::Broken, TenantState::Broken) @@ -389,24 +350,15 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow: // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. - // TODO maybe use tokio::sync::watch instead? - crate::tenant_tasks::start_compaction_loop(tenant_id)?; - crate::tenant_tasks::start_gc_loop(tenant_id)?; + crate::tenant_tasks::start_background_loops(tenant_id); } (TenantState::Idle, TenantState::Stopping) => { info!("stopping idle tenant {tenant_id}"); } (TenantState::Active, TenantState::Stopping | TenantState::Idle) => { - info!("stopping tenant {tenant_id} threads due to new state {new_state}"); - thread_mgr::shutdown_threads( - Some(ThreadKind::WalReceiverManager), - Some(tenant_id), - None, - ); + info!("stopping tenant {tenant_id} tasks due to new state {new_state}"); - // Wait until all gc/compaction tasks finish - let repo = get_repository_for_tenant(tenant_id)?; - let _guard = repo.file_lock.write().unwrap(); + // Note: The caller is responsible for waiting for any tasks to finish. } } @@ -422,28 +374,28 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result anyhow::Result<()> { +pub async fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { // Start with the shutdown of timeline tasks (this shuts down the walreceiver) // It is important that we do not take locks here, and do not check whether the timeline exists - // because if we hold tenants_state::write_tenants() while awaiting for the threads to join + // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join // we cannot create new timelines and tenants, and that can take quite some time, // it can even become stuck due to a bug making whole pageserver unavailable for some operations // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests // will synchronize and either fail with the not found error or succeed - let (sender, receiver) = std::sync::mpsc::channel::<()>(); - tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { - id: ZTenantTimelineId::new(tenant_id, timeline_id), - join_confirmation_sender: sender, - }); - debug!("waiting for wal receiver to shutdown"); - let _ = receiver.recv(); + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(tenant_id), + Some(timeline_id), + ) + .await; debug!("wal receiver shutdown confirmed"); - debug!("waiting for threads to shutdown"); - thread_mgr::shutdown_threads(None, None, Some(timeline_id)); - debug!("thread shutdown completed"); + + info!("waiting for timeline tasks to shutdown"); + task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await; + info!("timeline task shutdown completed"); match tenants_state::read_tenants().get(&tenant_id) { Some(tenant) => tenant.repo.delete_timeline(timeline_id)?, None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"), @@ -452,36 +404,17 @@ pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow Ok(()) } -pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> anyhow::Result<()> { +pub async fn detach_tenant( + conf: &'static PageServerConf, + tenant_id: ZTenantId, +) -> anyhow::Result<()> { set_tenant_state(tenant_id, 
TenantState::Stopping)?; - // shutdown the tenant and timeline threads: gc, compaction, page service threads) - thread_mgr::shutdown_threads(None, Some(tenant_id), None); + // shutdown all tenant and timeline tasks: gc, compaction, page service) + task_mgr::shutdown_tasks(None, Some(tenant_id), None).await; - let mut walreceiver_join_handles = Vec::new(); - let removed_tenant = { + { let mut tenants_accessor = tenants_state::write_tenants(); - tenants_accessor.remove(&tenant_id) - }; - if let Some(tenant) = removed_tenant { - for (timeline_id, _) in tenant.repo.list_timelines() { - let (sender, receiver) = std::sync::mpsc::channel::<()>(); - tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { - id: ZTenantTimelineId::new(tenant_id, timeline_id), - join_confirmation_sender: sender, - }); - walreceiver_join_handles.push((timeline_id, receiver)); - } - } - - // wait for wal receivers to stop without holding the lock, because walreceiver - // will attempt to change tenant state which is protected by the same global tenants lock. - // TODO do we need a timeout here? how to handle it? - // recv_timeout is broken: https://github.com/rust-lang/rust/issues/94518#issuecomment-1057440631 - // need to use crossbeam-channel - for (timeline_id, join_handle) in walreceiver_join_handles { - info!("waiting for wal receiver to shutdown timeline_id {timeline_id}"); - join_handle.recv().ok(); - info!("wal receiver shutdown confirmed timeline_id {timeline_id}"); + tenants_accessor.remove(&tenant_id); } // If removal fails there will be no way to successfully retry detach, diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 4e9a5fc6ec..9aaafe7f92 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -1,270 +1,130 @@ //! This module contains functions to serve per-tenant background processes, //! 
such as compaction and GC -use std::collections::HashMap; -use std::ops::ControlFlow; use std::time::Duration; use crate::metrics::TENANT_TASK_EVENTS; +use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant_mgr; use crate::tenant_mgr::TenantState; -use crate::thread_mgr::ThreadKind; -use crate::{tenant_mgr, thread_mgr}; -use anyhow::{self, Context}; -use futures::stream::FuturesUnordered; -use futures::StreamExt; -use once_cell::sync::OnceCell; -use tokio::sync::mpsc; -use tokio::sync::watch; use tracing::*; use utils::zid::ZTenantId; +pub fn start_background_loops(tenant_id: ZTenantId) { + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::Compaction, + Some(tenant_id), + None, + &format!("compactor for tenant {tenant_id}"), + false, + compaction_loop(tenant_id), + ); + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::GarbageCollector, + Some(tenant_id), + None, + &format!("garbage collector for tenant {tenant_id}"), + false, + gc_loop(tenant_id), + ); +} + /// /// Compaction task's main loop /// -async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { - loop { - trace!("waking up"); +async fn compaction_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { + info!("starting compaction loop for {tenant_id}"); + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + let result = async { + loop { + trace!("waking up"); + + // Run blocking part of the task - // Run blocking part of the task - let period: Result, _> = tokio::task::spawn_blocking(move || { // Break if tenant is not active - if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { - return Ok(ControlFlow::Break(())); + if tenant_mgr::get_tenant_state(tenant_id) != Some(TenantState::Active) { + break Ok(()); } - - // Break if we're not allowed to write to disk - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + // This should not fail. If someone started us, it means that the tenant exists. + // And before you remove a tenant, you have to wait until all the associated tasks + // exit. + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; // Run compaction - let compaction_period = repo.get_compaction_period(); - repo.compaction_iteration()?; - Ok(ControlFlow::Continue(compaction_period)) - }) - .await; - - // Decide whether to sleep or break - let sleep_duration = match period { - Ok(Ok(ControlFlow::Continue(period))) => period, - Ok(Ok(ControlFlow::Break(()))) => break, - Ok(Err(e)) => { + let mut sleep_duration = repo.get_compaction_period(); + if let Err(e) = repo.compaction_iteration() { error!("Compaction failed, retrying: {}", e); - Duration::from_secs(2) + sleep_duration = Duration::from_secs(2) } - Err(e) => { - error!("Compaction join error, retrying: {}", e); - Duration::from_secs(2) - } - }; - // Sleep - tokio::select! { - _ = cancel.changed() => { - trace!("received cancellation request"); - break; - }, - _ = tokio::time::sleep(sleep_duration) => {}, + // Sleep + tokio::select! { + _ = task_mgr::shutdown_watcher() => { + trace!("received cancellation request"); + break Ok(()); + }, + _ = tokio::time::sleep(sleep_duration) => {}, + } } } + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - trace!( + info!( "compaction loop stopped. 
State is {:?}", - tenant_mgr::get_tenant_state(tenantid) + tenant_mgr::get_tenant_state(tenant_id) ); -} - -static START_GC_LOOP: OnceCell> = OnceCell::new(); -static START_COMPACTION_LOOP: OnceCell> = OnceCell::new(); - -/// Spawn a task that will periodically schedule garbage collection until -/// the tenant becomes inactive. This should be called on tenant -/// activation. -pub fn start_gc_loop(tenantid: ZTenantId) -> anyhow::Result<()> { - START_GC_LOOP - .get() - .context("Failed to get START_GC_LOOP")? - .blocking_send(tenantid) - .context("Failed to send to START_GC_LOOP channel")?; - Ok(()) -} - -/// Spawn a task that will periodically schedule compaction until -/// the tenant becomes inactive. This should be called on tenant -/// activation. -pub fn start_compaction_loop(tenantid: ZTenantId) -> anyhow::Result<()> { - START_COMPACTION_LOOP - .get() - .context("failed to get START_COMPACTION_LOOP")? - .blocking_send(tenantid) - .context("failed to send to START_COMPACTION_LOOP")?; - Ok(()) -} - -/// Spawn the TenantTaskManager -/// This needs to be called before start_gc_loop or start_compaction_loop -pub fn init_tenant_task_pool() -> anyhow::Result<()> { - let runtime = tokio::runtime::Builder::new_multi_thread() - .thread_name("tenant-task-worker") - .enable_all() - .on_thread_start(|| { - thread_mgr::register(ThreadKind::TenantTaskWorker, "tenant-task-worker") - }) - .on_thread_stop(thread_mgr::deregister) - .build()?; - - let (gc_send, mut gc_recv) = mpsc::channel::(100); - START_GC_LOOP - .set(gc_send) - .expect("Failed to set START_GC_LOOP"); - - let (compaction_send, mut compaction_recv) = mpsc::channel::(100); - START_COMPACTION_LOOP - .set(compaction_send) - .expect("Failed to set START_COMPACTION_LOOP"); - - // TODO this is getting repetitive - let mut gc_loops = HashMap::>::new(); - let mut compaction_loops = HashMap::>::new(); - - thread_mgr::spawn( - ThreadKind::TenantTaskManager, - None, - None, - "Tenant task manager main thread", - true, - move || { - runtime.block_on(async move { - let mut futures = FuturesUnordered::new(); - loop { - tokio::select! 
{ - _ = thread_mgr::shutdown_watcher() => { - // Send cancellation to all tasks - for (_, cancel) in gc_loops.drain() { - cancel.send(()).ok(); - } - for (_, cancel) in compaction_loops.drain() { - cancel.send(()).ok(); - } - - // Exit after all tasks finish - while let Some(result) = futures.next().await { - match result { - Ok(()) => { - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - }, - Err(e) => { - TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc(); - error!("loop join error {}", e) - }, - } - } - break; - }, - tenantid = gc_recv.recv() => { - let tenantid = tenantid.expect("Gc task channel closed unexpectedly"); - - // Spawn new task, request cancellation of the old one if exists - let (cancel_send, cancel_recv) = watch::channel(()); - let handle = tokio::spawn(gc_loop(tenantid, cancel_recv) - .instrument(info_span!("gc loop", tenant = %tenantid))); - if let Some(old_cancel_send) = gc_loops.insert(tenantid, cancel_send) { - old_cancel_send.send(()).ok(); - } - - // Update metrics, remember handle - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - futures.push(handle); - }, - tenantid = compaction_recv.recv() => { - let tenantid = tenantid.expect("Compaction task channel closed unexpectedly"); - - // Spawn new task, request cancellation of the old one if exists - let (cancel_send, cancel_recv) = watch::channel(()); - let handle = tokio::spawn(compaction_loop(tenantid, cancel_recv) - .instrument(info_span!("compaction loop", tenant = %tenantid))); - if let Some(old_cancel_send) = compaction_loops.insert(tenantid, cancel_send) { - old_cancel_send.send(()).ok(); - } - - // Update metrics, remember handle - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - futures.push(handle); - }, - result = futures.next() => { - // Log and count any unhandled panics - match result { - Some(Ok(())) => { - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - }, - Some(Err(e)) => { - TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc(); - error!("loop join error {}", e) - }, - None => {}, - }; - }, - } - } - }); - Ok(()) - }, - )?; - - Ok(()) + result } /// /// GC task's main loop /// -async fn gc_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { - loop { - trace!("waking up"); +async fn gc_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { + info!("starting gc loop for {tenant_id}"); + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + let result = async { + loop { + trace!("waking up"); - // Run blocking part of the task - let period: Result, _> = tokio::task::spawn_blocking(move || { // Break if tenant is not active - if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { - return Ok(ControlFlow::Break(())); + if tenant_mgr::get_tenant_state(tenant_id) != Some(TenantState::Active) { + break Ok(()); } - - // Break if we're not allowed to write to disk - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + // This should not fail. If someone started us, it means that the tenant exists. + // And before you remove a tenant, you have to wait until all the associated tasks + // exit. 
+ let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; // Run gc let gc_period = repo.get_gc_period(); let gc_horizon = repo.get_gc_horizon(); + let mut sleep_duration = gc_period; if gc_horizon > 0 { - repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?; + if let Err(e) = repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false) + { + error!("Gc failed, retrying: {}", e); + sleep_duration = Duration::from_secs(2) + } } - Ok(ControlFlow::Continue(gc_period)) - }) - .await; - - // Decide whether to sleep or break - let sleep_duration = match period { - Ok(Ok(ControlFlow::Continue(period))) => period, - Ok(Ok(ControlFlow::Break(()))) => break, - Ok(Err(e)) => { - error!("Gc failed, retrying: {}", e); - Duration::from_secs(2) + // Sleep + tokio::select! { + _ = task_mgr::shutdown_watcher() => { + trace!("received cancellation request"); + break Ok(()); + }, + _ = tokio::time::sleep(sleep_duration) => {}, } - Err(e) => { - error!("Gc join error, retrying: {}", e); - Duration::from_secs(2) - } - }; - - // Sleep - tokio::select! { - _ = cancel.changed() => { - trace!("received cancellation request"); - break; - }, - _ = tokio::time::sleep(sleep_duration) => {}, } } - trace!( + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); + info!( "GC loop stopped. State is {:?}", - tenant_mgr::get_tenant_state(tenantid) + tenant_mgr::get_tenant_state(tenant_id) ); + result } diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs deleted file mode 100644 index cdd38febbc..0000000000 --- a/pageserver/src/thread_mgr.rs +++ /dev/null @@ -1,409 +0,0 @@ -//! -//! This module provides centralized handling of threads in the Page Server. -//! -//! We provide a few basic facilities: -//! - A global registry of threads that lists what kind of threads they are, and -//! which tenant or timeline they are working on -//! -//! - The ability to request a thread to shut down. -//! -//! -//! # How it works? -//! -//! There is a global hashmap of all the threads (`THREADS`). Whenever a new -//! thread is spawned, a PageServerThread entry is added there, and when a -//! thread dies, it removes itself from the hashmap. If you want to kill a -//! thread, you can scan the hashmap to find it. -//! -//! # Thread shutdown -//! -//! To kill a thread, we rely on co-operation from the victim. Each thread is -//! expected to periodically call the `is_shutdown_requested()` function, and -//! if it returns true, exit gracefully. In addition to that, when waiting for -//! the network or other long-running operation, you can use -//! `shutdown_watcher()` function to get a Future that will become ready if -//! the current thread has been requested to shut down. You can use that with -//! Tokio select!(), but note that it relies on thread-local storage, so it -//! will only work with the "current-thread" Tokio runtime! -//! -//! -//! TODO: This would be a good place to also handle panics in a somewhat sane way. -//! Depending on what thread panics, we might want to kill the whole server, or -//! only a single tenant or timeline. -//! 
- -use std::cell::RefCell; -use std::collections::HashMap; -use std::panic; -use std::panic::AssertUnwindSafe; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; -use std::sync::{Arc, Mutex}; -use std::thread; -use std::thread::JoinHandle; - -use tokio::sync::watch; - -use tracing::{debug, error, info, warn}; - -use once_cell::sync::Lazy; - -use utils::zid::{ZTenantId, ZTimelineId}; - -use crate::shutdown_pageserver; - -/// Each thread that we track is associated with a "thread ID". It's just -/// an increasing number that we assign, not related to any system thread -/// id. -static NEXT_THREAD_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); - -/// Global registry of threads -static THREADS: Lazy>>> = - Lazy::new(|| Mutex::new(HashMap::new())); - -// There is a Tokio watch channel for each thread, which can be used to signal the -// thread that it needs to shut down. This thread local variable holds the receiving -// end of the channel. The sender is kept in the global registry, so that anyone -// can send the signal to request thread shutdown. -thread_local!(static SHUTDOWN_RX: RefCell>> = RefCell::new(None)); - -// Each thread holds reference to its own PageServerThread here. -thread_local!(static CURRENT_THREAD: RefCell>> = RefCell::new(None)); - -/// -/// There are many kinds of threads in the system. Some are associated with a particular -/// tenant or timeline, while others are global. -/// -/// Note that we don't try to limit how may threads of a certain kind can be running -/// at the same time. -/// -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub enum ThreadKind { - // libpq listener thread. It just accepts connection and spawns a - // PageRequestHandler thread for each connection. - LibpqEndpointListener, - - // HTTP endpoint listener. - HttpEndpointListener, - - // Thread that handles a single connection. A PageRequestHandler thread - // starts detached from any particular tenant or timeline, but it can be - // associated with one later, after receiving a command from the client. - PageRequestHandler, - - // Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL. - WalReceiverManager, - - // Thread that schedules new compaction and gc jobs - TenantTaskManager, - - // Worker thread for tenant tasks thread pool - TenantTaskWorker, - - // Thread that flushes frozen in-memory layers to disk - LayerFlushThread, - - // Thread for synchronizing pageserver layer files with the remote storage. - // Shared by all tenants. - StorageSync, -} - -#[derive(Default)] -struct MutableThreadState { - /// Tenant and timeline that this thread is associated with. - tenant_id: Option, - timeline_id: Option, - - /// Handle for waiting for the thread to exit. It can be None, if the - /// the thread has already exited. OR if this thread is managed externally - /// and was not spawned through thread_mgr.rs::spawn function. - join_handle: Option>, -} - -struct PageServerThread { - thread_id: u64, - - kind: ThreadKind, - - name: String, - - // To request thread shutdown, set the flag, and send a dummy message to the - // channel to notify it. 
- shutdown_requested: AtomicBool, - shutdown_tx: watch::Sender<()>, - - mutable: Mutex, -} - -/// Launch a new thread -/// Note: if shutdown_process_on_error is set to true failure -/// of the thread will lead to shutdown of entire process -pub fn spawn( - kind: ThreadKind, - tenant_id: Option, - timeline_id: Option, - name: &str, - shutdown_process_on_error: bool, - f: F, -) -> std::io::Result -where - F: FnOnce() -> anyhow::Result<()> + Send + 'static, -{ - let (shutdown_tx, shutdown_rx) = watch::channel(()); - let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); - let thread = Arc::new(PageServerThread { - thread_id, - kind, - name: name.to_string(), - shutdown_requested: AtomicBool::new(false), - shutdown_tx, - mutable: Mutex::new(MutableThreadState { - tenant_id, - timeline_id, - join_handle: None, - }), - }); - - THREADS - .lock() - .unwrap() - .insert(thread_id, Arc::clone(&thread)); - - let mut thread_mut = thread.mutable.lock().unwrap(); - - let thread_cloned = Arc::clone(&thread); - let thread_name = name.to_string(); - let join_handle = match thread::Builder::new() - .name(name.to_string()) - .spawn(move || { - thread_wrapper( - thread_name, - thread_id, - thread_cloned, - shutdown_rx, - shutdown_process_on_error, - f, - ) - }) { - Ok(handle) => handle, - Err(err) => { - error!("Failed to spawn thread '{}': {}", name, err); - // Could not spawn the thread. Remove the entry - THREADS.lock().unwrap().remove(&thread_id); - return Err(err); - } - }; - thread_mut.join_handle = Some(join_handle); - drop(thread_mut); - - // The thread is now running. Nothing more to do here - Ok(thread_id) -} - -/// This wrapper function runs in a newly-spawned thread. It initializes the -/// thread-local variables and calls the payload function -fn thread_wrapper( - thread_name: String, - thread_id: u64, - thread: Arc, - shutdown_rx: watch::Receiver<()>, - shutdown_process_on_error: bool, - f: F, -) where - F: FnOnce() -> anyhow::Result<()> + Send + 'static, -{ - SHUTDOWN_RX.with(|rx| { - *rx.borrow_mut() = Some(shutdown_rx); - }); - CURRENT_THREAD.with(|ct| { - *ct.borrow_mut() = Some(thread); - }); - - debug!("Starting thread '{}'", thread_name); - - // We use AssertUnwindSafe here so that the payload function - // doesn't need to be UnwindSafe. We don't do anything after the - // unwinding that would expose us to unwind-unsafe behavior. - let result = panic::catch_unwind(AssertUnwindSafe(f)); - - // Remove our entry from the global hashmap. 
- let thread = THREADS - .lock() - .unwrap() - .remove(&thread_id) - .expect("no thread in registry"); - - let thread_mut = thread.mutable.lock().unwrap(); - match result { - Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name), - Ok(Err(err)) => { - if shutdown_process_on_error { - error!( - "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - shutdown_pageserver(1); - } else { - error!( - "Thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - } - } - Err(err) => { - if shutdown_process_on_error { - error!( - "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - shutdown_pageserver(1); - } else { - error!( - "Thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - } - } - } -} - -// expected to be called from the thread of the given id. -pub fn associate_with(tenant_id: Option, timeline_id: Option) { - CURRENT_THREAD.with(|ct| { - let borrowed = ct.borrow(); - let mut thread_mut = borrowed.as_ref().unwrap().mutable.lock().unwrap(); - thread_mut.tenant_id = tenant_id; - thread_mut.timeline_id = timeline_id; - }); -} - -/// Is there a thread running that matches the criteria - -/// Signal and wait for threads to shut down. -/// -/// -/// The arguments are used to select the threads to kill. Any None arguments are -/// ignored. For example, to shut down all WalReceiver threads: -/// -/// shutdown_threads(Some(ThreadKind::WalReceiver), None, None) -/// -/// Or to shut down all threads for given timeline: -/// -/// shutdown_threads(None, Some(timelineid), None) -/// -pub fn shutdown_threads( - kind: Option, - tenant_id: Option, - timeline_id: Option, -) { - let mut victim_threads = Vec::new(); - - let threads = THREADS.lock().unwrap(); - for thread in threads.values() { - let thread_mut = thread.mutable.lock().unwrap(); - if (kind.is_none() || Some(thread.kind) == kind) - && (tenant_id.is_none() || thread_mut.tenant_id == tenant_id) - && (timeline_id.is_none() || thread_mut.timeline_id == timeline_id) - { - thread.shutdown_requested.store(true, Ordering::Relaxed); - // FIXME: handle error? - let _ = thread.shutdown_tx.send(()); - victim_threads.push(Arc::clone(thread)); - } - } - drop(threads); - - for thread in victim_threads { - let mut thread_mut = thread.mutable.lock().unwrap(); - info!("waiting for {} to shut down", thread.name); - if let Some(join_handle) = thread_mut.join_handle.take() { - drop(thread_mut); - let _ = join_handle.join(); - } else { - // Possibly one of: - // * The thread had not even fully started yet. - // * It was shut down concurrently and already exited - // * Is managed through `register`/`deregister` fns without providing a join handle - } - } -} - -/// A Future that can be used to check if the current thread has been requested to -/// shut down. -pub async fn shutdown_watcher() { - let _ = SHUTDOWN_RX - .with(|rx| { - rx.borrow() - .as_ref() - .expect("shutdown_requested() called in an unexpected thread") - .clone() - }) - .changed() - .await; -} - -/// Has the current thread been requested to shut down? 
-pub fn is_shutdown_requested() -> bool { - CURRENT_THREAD.with(|ct| { - if let Some(ct) = ct.borrow().as_ref() { - ct.shutdown_requested.load(Ordering::Relaxed) - } else { - if !cfg!(test) { - warn!("is_shutdown_requested() called in an unexpected thread"); - } - false - } - }) -} - -/// Needed to register threads that were not spawned through spawn function. -/// For example tokio blocking threads. This function is expected to be used -/// in tandem with `deregister`. -/// NOTE: threads registered through this function cannot be joined -pub fn register(kind: ThreadKind, name: &str) { - CURRENT_THREAD.with(|ct| { - let mut borrowed = ct.borrow_mut(); - if borrowed.is_some() { - panic!("thread already registered") - }; - let (shutdown_tx, shutdown_rx) = watch::channel(()); - let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); - - let thread = Arc::new(PageServerThread { - thread_id, - kind, - name: name.to_owned(), - shutdown_requested: AtomicBool::new(false), - shutdown_tx, - mutable: Mutex::new(MutableThreadState { - tenant_id: None, - timeline_id: None, - join_handle: None, - }), - }); - - *borrowed = Some(Arc::clone(&thread)); - - SHUTDOWN_RX.with(|rx| { - *rx.borrow_mut() = Some(shutdown_rx); - }); - - THREADS.lock().unwrap().insert(thread_id, thread); - }); -} - -// Expected to be used in tandem with `register`. See the doc for `register` for more details -pub fn deregister() { - CURRENT_THREAD.with(|ct| { - let mut borrowed = ct.borrow_mut(); - let thread = match borrowed.take() { - Some(thread) => thread, - None => panic!("calling deregister on unregistered thread"), - }; - - SHUTDOWN_RX.with(|rx| { - *rx.borrow_mut() = None; - }); - - THREADS.lock().unwrap().remove(&thread.thread_id) - }); -} diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 9356893908..35dec54d5c 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -2,7 +2,7 @@ //! 
Timeline management code // -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{bail, Context, Result}; use remote_storage::path_with_suffix_extension; use std::{ @@ -14,21 +14,15 @@ use std::{ use tracing::*; use utils::{ - crashsafe_dir, lsn::Lsn, zid::{ZTenantId, ZTimelineId}, }; +use crate::config::PageServerConf; +use crate::layered_repository::{Repository, Timeline}; use crate::tenant_mgr; use crate::CheckpointConfig; -use crate::{ - config::PageServerConf, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, -}; use crate::{import_datadir, TEMP_FILE_SUFFIX}; -use crate::{ - layered_repository::{Repository, Timeline}, - walredo::WalRedoManager, -}; #[derive(Debug, Clone, Copy)] pub struct PointInTime { @@ -36,39 +30,6 @@ pub struct PointInTime { pub lsn: Lsn, } -pub fn create_repo( - conf: &'static PageServerConf, - tenant_conf: TenantConfOpt, - tenant_id: ZTenantId, - wal_redo_manager: Arc, - remote_index: RemoteIndex, -) -> Result> { - let repo_dir = conf.tenant_path(&tenant_id); - ensure!( - !repo_dir.exists(), - "cannot create new tenant repo: '{}' directory already exists", - tenant_id - ); - - // top-level dir may exist if we are creating it through CLI - crashsafe_dir::create_dir_all(&repo_dir) - .with_context(|| format!("could not create directory {}", repo_dir.display()))?; - crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; - info!("created directory structure in {}", repo_dir.display()); - - // Save tenant's config - Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; - - Ok(Arc::new(Repository::new( - conf, - tenant_conf, - wal_redo_manager, - tenant_id, - remote_index, - conf.remote_storage_config.is_some(), - ))) -} - // Create the cluster temporarily in 'initdbpath' directory inside the repository // to get bootstrap data for timeline initialization. // @@ -158,7 +119,7 @@ fn bootstrap_timeline( /// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, /// a new unique ID is generated. /// -pub(crate) fn create_timeline( +pub(crate) async fn create_timeline( conf: &'static PageServerConf, tenant_id: ZTenantId, new_timeline_id: Option, @@ -187,7 +148,7 @@ pub(crate) fn create_timeline( // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. 
*lsn = lsn.align(); - ancestor_timeline.wait_lsn(*lsn)?; + ancestor_timeline.wait_lsn(*lsn).await?; let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); if ancestor_ancestor_lsn > *lsn { diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index d6420e1d18..deac299747 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -23,131 +23,61 @@ mod connection_manager; mod walreceiver_connection; +use crate::config::PageServerConf; +use crate::task_mgr::WALRECEIVER_RUNTIME; + use anyhow::{ensure, Context}; use etcd_broker::Client; use itertools::Itertools; -use std::cell::Cell; -use std::collections::{hash_map, HashMap, HashSet}; +use once_cell::sync::OnceCell; use std::future::Future; -use std::num::NonZeroU64; use std::sync::Arc; -use std::thread_local; -use std::time::Duration; -use tokio::{ - select, - sync::{mpsc, watch}, - task::JoinHandle, -}; +use tokio::sync::watch; use tracing::*; use url::Url; -use crate::config::PageServerConf; -use crate::tenant_mgr::{self, LocalTimelineUpdate, TenantState}; -use crate::thread_mgr::{self, ThreadKind}; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +pub use connection_manager::spawn_connection_manager_task; -thread_local! { - // Boolean that is true only for WAL receiver threads - // - // This is used in `wait_lsn` to guard against usage that might lead to a deadlock. - pub(crate) static IS_WAL_RECEIVER: Cell = Cell::new(false); -} +static ETCD_CLIENT: OnceCell = OnceCell::new(); -/// Sets up the main WAL receiver thread that manages the rest of the subtasks inside of it, per timeline. -/// See comments in [`wal_receiver_main_thread_loop_step`] for more details on per timeline activities. -pub fn init_wal_receiver_main_thread( - conf: &'static PageServerConf, - mut timeline_updates_receiver: mpsc::UnboundedReceiver, -) -> anyhow::Result<()> { +/// +/// Initialize the etcd client. This must be called once at page server startup. +/// +pub async fn init_etcd_client(conf: &'static PageServerConf) -> anyhow::Result<()> { let etcd_endpoints = conf.broker_endpoints.clone(); ensure!( !etcd_endpoints.is_empty(), "Cannot start wal receiver: etcd endpoints are empty" ); - let broker_prefix = &conf.broker_etcd_prefix; - info!( - "Starting wal receiver main thread, etcd endpoints: {}", - etcd_endpoints.iter().map(Url::to_string).join(", ") - ); - let runtime = tokio::runtime::Builder::new_multi_thread() - .thread_name("wal-receiver-runtime-thread") - .enable_all() - .on_thread_start(|| IS_WAL_RECEIVER.with(|c| c.set(true))) - .build() - .context("Failed to create storage sync runtime")?; - let etcd_client = runtime - .block_on(Client::connect(etcd_endpoints, None)) + let etcd_client = Client::connect(etcd_endpoints.clone(), None) + .await .context("Failed to connect to etcd")?; - thread_mgr::spawn( - ThreadKind::WalReceiverManager, - None, - None, - "WAL receiver manager main thread", - true, - move || { - runtime.block_on(async move { - let mut local_timeline_wal_receivers = HashMap::new(); - loop { - select! { - _ = thread_mgr::shutdown_watcher() => { - info!("Shutdown signal received"); - shutdown_all_wal_connections(&mut local_timeline_wal_receivers).await; - break; - }, - _ = wal_receiver_main_thread_loop_step( - broker_prefix, - &etcd_client, - &mut timeline_updates_receiver, - &mut local_timeline_wal_receivers, - ) => {}, - } - } - }.instrument(info_span!("wal_receiver_main"))); + // FIXME: Should we still allow the pageserver to start, if etcd + // doesn't work? 
It could still serve GetPage requests, with the + // data it has locally and from what it can download from remote + // storage + if ETCD_CLIENT.set(etcd_client).is_err() { + panic!("etcd already initialized"); + } - info!("Wal receiver main thread stopped"); - Ok(()) - }, - ) - .map(|_thread_id| ()) - .context("Failed to spawn wal receiver main thread") + info!( + "Initialized etcd client with endpoints: {}", + etcd_endpoints.iter().map(Url::to_string).join(", ") + ); + Ok(()) } -async fn shutdown_all_wal_connections( - local_timeline_wal_receivers: &mut HashMap>>, -) { - info!("Shutting down all WAL connections"); - let mut broker_join_handles = Vec::new(); - for (tenant_id, timelines) in local_timeline_wal_receivers.drain() { - for (timeline_id, handles) in timelines { - handles.cancellation.send(()).ok(); - broker_join_handles.push(( - ZTenantTimelineId::new(tenant_id, timeline_id), - handles.handle, - )); - } - } +/// +/// Get a handle to the etcd client +/// +pub fn get_etcd_client() -> &'static etcd_broker::Client { + ETCD_CLIENT.get().expect("etcd client not initialized") +} - let mut tenants = HashSet::with_capacity(broker_join_handles.len()); - for (id, broker_join_handle) in broker_join_handles { - tenants.insert(id.tenant_id); - debug!("Waiting for wal broker for timeline {id} to finish"); - if let Err(e) = broker_join_handle.await { - error!("Failed to join on wal broker for timeline {id}: {e}"); - } - } - if let Err(e) = tokio::task::spawn_blocking(move || { - for tenant_id in tenants { - if let Err(e) = tenant_mgr::set_tenant_state(tenant_id, TenantState::Idle) { - error!("Failed to make tenant {tenant_id} idle: {e:?}"); - } - } - }) - .await - { - error!("Failed to await a task to make all tenants idle: {e:?}"); - } +pub fn is_etcd_client_initialized() -> bool { + ETCD_CLIENT.get().is_some() } /// A handle of an asynchronous task. @@ -157,8 +87,7 @@ async fn shutdown_all_wal_connections( /// Note that the communication happens via the `watch` channel, that does not accumulate the events, replacing the old one with the never one on submission. /// That may lead to certain events not being observed by the listener. #[derive(Debug)] -struct TaskHandle { - handle: JoinHandle>, +pub struct TaskHandle { events_receiver: watch::Receiver>, cancellation: watch::Sender<()>, } @@ -167,7 +96,7 @@ struct TaskHandle { pub enum TaskEvent { Started, NewEvent(E), - End(Result<(), String>), + End, } impl TaskHandle { @@ -184,164 +113,28 @@ impl TaskHandle { let events_sender = Arc::new(events_sender); let sender = Arc::clone(&events_sender); - let handle = tokio::task::spawn(async move { + let _ = WALRECEIVER_RUNTIME.spawn(async move { events_sender.send(TaskEvent::Started).ok(); task(sender, cancellation_receiver).await }); TaskHandle { - handle, events_receiver, cancellation, } } async fn next_task_event(&mut self) -> TaskEvent { - select! { - next_task_event = self.events_receiver.changed() => match next_task_event { - Ok(()) => self.events_receiver.borrow().clone(), - Err(_task_channel_part_dropped) => join_on_handle(&mut self.handle).await, - }, - task_completion_result = join_on_handle(&mut self.handle) => task_completion_result, + match self.events_receiver.changed().await { + Ok(()) => self.events_receiver.borrow().clone(), + Err(_task_channel_part_dropped) => TaskEvent::End, } } /// Aborts current task, waiting for it to finish. 
- async fn shutdown(self) { + pub async fn shutdown(mut self) { self.cancellation.send(()).ok(); - if let Err(e) = self.handle.await { - error!("Task failed to shut down: {e}") - } + // wait until the sender is dropped + while self.events_receiver.changed().await.is_ok() {} } } - -async fn join_on_handle(handle: &mut JoinHandle>) -> TaskEvent { - match handle.await { - Ok(task_result) => TaskEvent::End(task_result), - Err(e) => { - if e.is_cancelled() { - TaskEvent::End(Ok(())) - } else { - TaskEvent::End(Err(format!("WAL receiver task panicked: {e}"))) - } - } - } -} - -/// A step to process timeline attach/detach events to enable/disable the corresponding WAL receiver machinery. -/// In addition to WAL streaming management, the step ensures that corresponding tenant has its service threads enabled or disabled. -/// This is done here, since only walreceiver knows when a certain tenant has no streaming enabled. -/// -/// Cannot fail, should always try to process the next timeline event even if the other one was not processed properly. -async fn wal_receiver_main_thread_loop_step<'a>( - broker_prefix: &'a str, - etcd_client: &'a Client, - timeline_updates_receiver: &'a mut mpsc::UnboundedReceiver, - local_timeline_wal_receivers: &'a mut HashMap>>, -) { - // Only react on updates from [`tenant_mgr`] on local timeline attach/detach. - match timeline_updates_receiver.recv().await { - Some(update) => { - info!("Processing timeline update: {update:?}"); - match update { - // Timeline got detached, stop all related tasks and remove public timeline data. - LocalTimelineUpdate::Detach { - id, - join_confirmation_sender, - } => { - match local_timeline_wal_receivers.get_mut(&id.tenant_id) { - Some(wal_receivers) => { - if let hash_map::Entry::Occupied(o) = wal_receivers.entry(id.timeline_id) { - o.remove().shutdown().await - } - if wal_receivers.is_empty() { - if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Idle).await { - error!("Failed to make tenant idle for id {id}: {e:#}"); - } - } - } - None => warn!("Timeline {id} does not have a tenant entry in wal receiver main thread"), - }; - if let Err(e) = join_confirmation_sender.send(()) { - warn!("cannot send wal_receiver shutdown confirmation {e}") - } else { - info!("confirm walreceiver shutdown for {id}"); - } - } - // Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly. 
- LocalTimelineUpdate::Attach { id, timeline } => { - let timeline_connection_managers = local_timeline_wal_receivers - .entry(id.tenant_id) - .or_default(); - - if timeline_connection_managers.is_empty() { - if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Active).await - { - error!("Failed to make tenant active for id {id}: {e:#}"); - return; - } - } - - let vacant_connection_manager_entry = - match timeline_connection_managers.entry(id.timeline_id) { - hash_map::Entry::Occupied(_) => { - debug!("Attepted to readd an existing timeline {id}, ignoring"); - return; - } - hash_map::Entry::Vacant(v) => v, - }; - - let (wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag) = - match fetch_tenant_settings(id.tenant_id).await { - Ok(settings) => settings, - Err(e) => { - error!("Failed to fetch tenant settings for id {id}: {e:#}"); - return; - } - }; - - vacant_connection_manager_entry.insert( - connection_manager::spawn_connection_manager_task( - id, - broker_prefix.to_owned(), - etcd_client.clone(), - timeline, - wal_connect_timeout, - lagging_wal_timeout, - max_lsn_wal_lag, - ), - ); - } - } - } - None => { - info!("Local timeline update channel closed"); - shutdown_all_wal_connections(local_timeline_wal_receivers).await; - } - } -} - -async fn fetch_tenant_settings( - tenant_id: ZTenantId, -) -> anyhow::Result<(Duration, Duration, NonZeroU64)> { - tokio::task::spawn_blocking(move || { - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("no repository found for tenant {tenant_id}"))?; - Ok::<_, anyhow::Error>(( - repo.get_wal_receiver_connect_timeout(), - repo.get_lagging_wal_timeout(), - repo.get_max_lsn_wal_lag(), - )) - }) - .await - .with_context(|| format!("Failed to join on tenant {tenant_id} settings fetch task"))? -} - -async fn change_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { - tokio::task::spawn_blocking(move || { - tenant_mgr::set_tenant_state(tenant_id, new_state) - .with_context(|| format!("Failed to activate tenant {tenant_id}")) - }) - .await - .with_context(|| format!("Failed to spawn activation task for tenant {tenant_id}"))? -} diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 0261203049..1fcb768ddf 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -17,6 +17,9 @@ use std::{ }; use crate::layered_repository::Timeline; +use crate::task_mgr; +use crate::task_mgr::TaskKind; +use crate::task_mgr::WALRECEIVER_RUNTIME; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -26,7 +29,10 @@ use etcd_broker::{ use tokio::select; use tracing::*; -use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; +use crate::{ + exponential_backoff, walreceiver::get_etcd_client, DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, +}; use utils::{ lsn::Lsn, zid::{NodeId, ZTenantTimelineId}, @@ -35,29 +41,38 @@ use utils::{ use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle}; /// Spawns the loop to take care of the timeline's WAL streaming connection. 
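
// A hedged sketch of the loop shape the rewritten connection manager uses below
// (only tokio is assumed; the function and channel names are illustrative, not the
// real task_mgr API): keep doing work until a shutdown signal arrives on a watch
// channel, then fall through so the caller can clean up the current WAL connection.
async fn run_until_shutdown(mut shutdown: tokio::sync::watch::Receiver<()>) {
    loop {
        tokio::select! {
            _ = shutdown.changed() => {
                // shutdown requested (or the sender side was dropped): leave the loop
                break;
            }
            _ = tokio::time::sleep(std::time::Duration::from_millis(100)) => {
                // one iteration of the actual work would run here
            }
        }
    }
}
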
-pub(super) fn spawn_connection_manager_task( - id: ZTenantTimelineId, +pub fn spawn_connection_manager_task( broker_loop_prefix: String, - mut client: Client, - local_timeline: Arc, + timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, -) -> TaskHandle<()> { - TaskHandle::spawn(move |_, mut cancellation| { +) -> anyhow::Result<()> { + let mut etcd_client = get_etcd_client().clone(); + + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + + task_mgr::spawn( + WALRECEIVER_RUNTIME.handle(), + TaskKind::WalReceiverManager, + Some(tenant_id), + Some(timeline_id), + &format!("walreceiver for tenant {} timeline {}", timeline.tenant_id, timeline.timeline_id), + false, async move { info!("WAL receiver broker started, connecting to etcd"); let mut walreceiver_state = WalreceiverState::new( - id, - local_timeline, + timeline, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, ); loop { select! { - _ = cancellation.changed() => { - info!("Broker subscription init cancelled, shutting down"); + _ = task_mgr::shutdown_watcher() => { + info!("WAL receiver shutdown requested, shutting down"); + // Kill current connection, if any if let Some(wal_connection) = walreceiver_state.wal_connection.take() { wal_connection.connection_task.shutdown().await; @@ -67,14 +82,15 @@ pub(super) fn spawn_connection_manager_task( _ = connection_manager_loop_step( &broker_loop_prefix, - &mut client, + &mut etcd_client, &mut walreceiver_state, ) => {}, } } } - .instrument(info_span!("wal_connection_manager", id = %id)) - }) + .instrument(info_span!("wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id)) + ); + Ok(()) } /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. @@ -85,7 +101,10 @@ async fn connection_manager_loop_step( etcd_client: &mut Client, walreceiver_state: &mut WalreceiverState, ) { - let id = walreceiver_state.id; + let id = ZTenantTimelineId { + tenant_id: walreceiver_state.timeline.tenant_id, + timeline_id: walreceiver_state.timeline.timeline_id, + }; // XXX: We never explicitly cancel etcd task, instead establishing one and never letting it go, // running the entire loop step as much as possible to an end. @@ -98,6 +117,14 @@ async fn connection_manager_loop_step( loop { let time_until_next_retry = walreceiver_state.time_until_next_retry(); + // These things are happening concurrently: + // + // - keep receiving WAL on the current connection + // - if the shared state says we need to change connection, disconnect and return + // - this runs in a separate task and we receive updates via a watch channel + // - change connection if the rules decide so, or if the current connection dies + // - receive updates from broker + // - this might change the current desired connection select! { broker_connection_result = &mut broker_subscription.watcher_handle => { cleanup_broker_connection(broker_connection_result, walreceiver_state); @@ -110,7 +137,8 @@ async fn connection_manager_loop_step( None => None, } } => { - let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! guard"); + let wal_connection = walreceiver_state.wal_connection.as_mut() + .expect("Should have a connection, as checked by the corresponding select! 
guard"); match wal_connection_update { TaskEvent::Started => {}, TaskEvent::NewEvent(status) => { @@ -123,16 +151,14 @@ async fn connection_manager_loop_step( } wal_connection.status = status; }, - TaskEvent::End(end_result) => { - match end_result { - Ok(()) => debug!("WAL receiving task finished"), - Err(e) => warn!("WAL receiving task failed: {e}"), - }; + TaskEvent::End => { + debug!("WAL receiving task finished"); walreceiver_state.drop_old_connection(false).await; }, } }, + // Got a new update from etcd broker_update = broker_subscription.value_updates.recv() => { match broker_update { Some(broker_update) => walreceiver_state.register_timeline_update(broker_update), @@ -241,8 +267,9 @@ const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. struct WalreceiverState { id: ZTenantTimelineId, + /// Use pageserver data about the timeline to filter out some of the safekeepers. - local_timeline: Arc, + timeline: Arc, /// The timeout on the connection to safekeeper for WAL streaming. wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. @@ -299,15 +326,18 @@ struct EtcdSkTimeline { impl WalreceiverState { fn new( - id: ZTenantTimelineId, - local_timeline: Arc, + timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, ) -> Self { + let id = ZTenantTimelineId { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + }; Self { id, - local_timeline, + timeline, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, @@ -323,10 +353,11 @@ impl WalreceiverState { let id = self.id; let connect_timeout = self.wal_connect_timeout; + let timeline = Arc::clone(&self.timeline); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { super::walreceiver_connection::handle_walreceiver_connection( - id, + timeline, &new_wal_source_connstr, events_sender.as_ref(), cancellation, @@ -520,7 +551,7 @@ impl WalreceiverState { let current_lsn = match existing_wal_connection.status.streaming_lsn { Some(lsn) => lsn, - None => self.local_timeline.get_last_record_lsn(), + None => self.timeline.get_last_record_lsn(), }; let current_commit_lsn = existing_wal_connection .status @@ -1328,7 +1359,7 @@ mod tests { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, - local_timeline: harness + timeline: harness .load() .create_empty_timeline(TIMELINE_ID, Lsn(0)) .expect("Failed to create an empty timeline for dummy wal connection manager"), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 4c30481e02..e8fa9f9aca 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -21,11 +21,17 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::{ - layered_repository::WalReceiverInfo, tenant_mgr, walingest::WalIngest, + layered_repository::{Timeline, WalReceiverInfo}, + task_mgr, + task_mgr::TaskKind, + task_mgr::WALRECEIVER_RUNTIME, + tenant_mgr, + walingest::WalIngest, walrecord::DecodedWALRecord, }; use postgres_ffi::v14::waldecoder::WalStreamDecoder; -use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; +use utils::zid::ZTenantTimelineId; +use utils::{lsn::Lsn, 
pq_proto::ReplicationFeedback}; /// Status of the connection. #[derive(Debug, Clone)] @@ -48,7 +54,7 @@ pub struct WalConnectionStatus { /// Open a connection to the given safekeeper and receive WAL, sending back progress /// messages as we go. pub async fn handle_walreceiver_connection( - id: ZTenantTimelineId, + timeline: Arc, wal_source_connstr: &str, events_sender: &watch::Sender>, mut cancellation: watch::Receiver<()>, @@ -83,24 +89,31 @@ pub async fn handle_walreceiver_connection( // The connection object performs the actual communication with the database, // so spawn it off to run on its own. let mut connection_cancellation = cancellation.clone(); - tokio::spawn( + task_mgr::spawn( + WALRECEIVER_RUNTIME.handle(), + TaskKind::WalReceiverConnection, + Some(timeline.tenant_id), + Some(timeline.timeline_id), + "walreceiver connection", + false, async move { select! { - connection_result = connection => match connection_result{ - Ok(()) => info!("Walreceiver db connection closed"), - Err(connection_error) => { - if connection_error.is_closed() { - info!("Connection closed regularly: {connection_error}") - } else { - warn!("Connection aborted: {connection_error}") - } - } - }, + connection_result = connection => match connection_result{ + Ok(()) => info!("Walreceiver db connection closed"), + Err(connection_error) => { + if connection_error.is_closed() { + info!("Connection closed regularly: {connection_error}") + } else { + warn!("Connection aborted: {connection_error}") + } + } + }, - _ = connection_cancellation.changed() => info!("Connection cancelled"), + _ = connection_cancellation.changed() => info!("Connection cancelled"), } + Ok(()) } - .instrument(info_span!("safekeeper_handle_db")), + .instrument(info_span!("walreceiver connection")), ); // Immediately increment the gauge, then create a job to decrement it on task exit. @@ -117,10 +130,6 @@ pub async fn handle_walreceiver_connection( let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); let mut caught_up = false; - let ZTenantTimelineId { - tenant_id, - timeline_id, - } = id; connection_status.latest_connection_update = Utc::now().naive_utc(); connection_status.latest_wal_update = Utc::now().naive_utc(); @@ -130,17 +139,10 @@ pub async fn handle_walreceiver_connection( return Ok(()); } - let (repo, timeline) = tokio::task::spawn_blocking(move || { - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("no repository found for tenant {tenant_id}"))?; - let timeline = repo.get_timeline(timeline_id) - .with_context(|| { - format!("local timeline {timeline_id} not found for tenant {tenant_id}") - })?; - Ok::<_, anyhow::Error>((repo, timeline)) - }) - .await - .with_context(|| format!("Failed to spawn blocking task to get repository and timeline for tenant {tenant_id} timeline {timeline_id}"))??; + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + let repo = tenant_mgr::get_repository_for_tenant(tenant_id) + .with_context(|| format!("no repository found for tenant {tenant_id}"))?; // // Start streaming the WAL, from where we left off previously. @@ -273,11 +275,12 @@ pub async fn handle_walreceiver_connection( } } - let timeline_to_check = Arc::clone(&timeline); - tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) - .await - .with_context(|| format!("Spawned checkpoint check task panicked for timeline {id}"))? 
- .with_context(|| format!("Failed to check checkpoint distance for timeline {id}"))?; + timeline.check_checkpoint_distance().with_context(|| { + format!( + "Failed to check checkpoint distance for timeline {}", + timeline.timeline_id + ) + })?; if let Some(last_lsn) = status_update { let remote_index = repo.get_remote_index(); diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index befa4616be..315ec7f306 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,3 +1,4 @@ +from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_until from fixtures.types import ZTenantId, ZTimelineId @@ -39,9 +40,6 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): for t in timelines: client.timeline_delete(tenant, t) - def assert_idle(tenant): - assert get_state(tenant) == "Idle" - # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline(name, tenant_id=tenant) @@ -51,18 +49,21 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): # Stop compute pg.stop() - # Detach all tenants and wait for them to go idle - # TODO they should be already idle since there are no active computes + # Delete all timelines on all tenants for tenant_info in client.tenant_list(): tenant_id = ZTenantId(tenant_info["id"]) delete_all_timelines(tenant_id) - wait_until(10, 0.2, lambda: assert_idle(tenant_id)) - # Assert that all tasks finish quickly after tenants go idle + # Assert that all tasks finish quickly after tenant is detached + assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0 + client.tenant_detach(tenant) + client.tenant_detach(env.initial_tenant) + def assert_tasks_finish(): tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}') tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}') tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}') + log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}") assert tasks_started == tasks_ended assert tasks_panicked == 0 diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index bfe61b9ced..096b3a5d70 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -47,9 +47,9 @@ scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "winapi"] } -tokio-util = { version = "0.7", features = ["codec", "io", "tracing"] } +tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } -tracing-core = { version = "0.1", features = ["lazy_static", "std", "valuable"] } +tracing-core = { version = "0.1", features = ["once_cell", "std", "valuable"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] } From 2a837d7de71a3f8bd74bbaa0d85f056bdac6f861 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Sep 2022 00:04:33 +0300 Subject: [PATCH 0755/1022] Create tenants in temporary directory first 
(#2426) --- pageserver/src/layered_repository.rs | 59 ++++++++++--- pageserver/src/tenant_mgr.rs | 127 ++++++++++++++++++++------- test_runner/regress/test_tenants.py | 41 ++++++++- 3 files changed, 182 insertions(+), 45 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 768bdd396b..ecc0bfe3b5 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -21,6 +21,8 @@ use std::collections::BTreeSet; use std::collections::HashMap; use std::fs; use std::fs::File; +use std::fs::OpenOptions; +use std::io::Write; use std::num::NonZeroU64; use std::ops::Bound::Included; use std::path::Path; @@ -38,6 +40,7 @@ use crate::tenant_config::{TenantConf, TenantConfOpt}; use crate::metrics::STORAGE_TIME; use crate::repository::GcResult; use crate::task_mgr; +use crate::virtual_file::VirtualFile; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; @@ -663,14 +666,14 @@ impl Repository { } pub fn persist_tenant_config( - conf: &'static PageServerConf, - tenant_id: ZTenantId, + target_config_path: &Path, tenant_conf: TenantConfOpt, + first_save: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving tenantconf").entered(); - let target_config_path = TenantConf::path(conf, tenant_id); - info!("save tenantconf to {}", target_config_path.display()); + info!("persisting tenantconf to {}", target_config_path.display()); + // TODO this will prepend comments endlessly let mut conf_content = r#"# This file contains a specific per-tenant's config. # It is read in case of pageserver restart. @@ -681,12 +684,48 @@ impl Repository { // Convert the config to a toml file. conf_content += &toml_edit::easy::to_string(&tenant_conf)?; - fs::write(&target_config_path, conf_content).with_context(|| { - format!( - "Failed to write config file into path '{}'", - target_config_path.display() - ) - }) + let mut target_config_file = VirtualFile::open_with_options( + target_config_path, + OpenOptions::new().write(true).create_new(first_save), + )?; + + target_config_file + .write(conf_content.as_bytes()) + .context("Failed to write toml bytes into file") + .and_then(|_| { + target_config_file + .sync_all() + .context("Faile to fsync config file") + }) + .with_context(|| { + format!( + "Failed to write config file into path '{}'", + target_config_path.display() + ) + })?; + + // fsync the parent directory to ensure the directory entry is durable + if first_save { + target_config_path + .parent() + .context("Config file does not have a parent") + .and_then(|target_config_parent| { + File::open(target_config_parent).context("Failed to open config parent") + }) + .and_then(|tenant_dir| { + tenant_dir + .sync_all() + .context("Failed to fsync config parent") + }) + .with_context(|| { + format!( + "Failed to fsync on firts save for config {}", + target_config_path.display() + ) + })?; + } + + Ok(()) } // diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index db256b0f65..a9f015229f 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -9,16 +9,14 @@ use crate::layered_repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::task_mgr::{self, TaskKind}; -use crate::tenant_config::TenantConfOpt; +use crate::tenant_config::{TenantConf, TenantConfOpt}; use crate::walredo::{PostgresRedoManager, WalRedoManager}; use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; 
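
// A minimal sketch of the crash-safety pattern this patch applies to tenant creation:
// write into a temporary path, fsync the data, rename into place, then fsync the parent
// directory so the new entry itself is durable. Only std is used; the helper name, the
// "___temp" suffix handling and the unwrap on parent() are illustrative simplifications.
use std::fs::{self, File, OpenOptions};
use std::io::Write;
use std::path::Path;

fn create_durably(target: &Path, contents: &[u8]) -> std::io::Result<()> {
    let tmp = target.with_extension("___temp");
    let mut file = OpenOptions::new().write(true).create_new(true).open(&tmp)?;
    file.write_all(contents)?;
    file.sync_all()?; // flush file contents and metadata to disk
    fs::rename(&tmp, target)?; // atomic move within the same filesystem
    File::open(target.parent().unwrap())?.sync_all() // make the directory entry durable
}
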
-use anyhow::{ensure, Context}; -use remote_storage::GenericRemoteStorage; -use serde::{Deserialize, Serialize}; -use std::collections::hash_map::{self, Entry}; -use std::collections::{HashMap, HashSet}; +use anyhow::Context; +use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; +use std::collections::{hash_map, HashMap, HashSet}; use std::ffi::OsStr; -use std::fmt; +use std::fs; use std::path::{Path, PathBuf}; use std::sync::Arc; use tracing::*; @@ -58,7 +56,7 @@ struct Tenant { repo: Arc, } -#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TenantState { // This tenant exists on local disk, and the layer map has been loaded into memory. // The local disk might have some newer files that don't exist in cloud storage yet. @@ -74,8 +72,8 @@ pub enum TenantState { Broken, } -impl fmt::Display for TenantState { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl std::fmt::Display for TenantState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Active => f.write_str("Active"), Self::Idle => f.write_str("Idle"), @@ -252,21 +250,71 @@ fn create_repo( wal_redo_manager: Arc, remote_index: RemoteIndex, ) -> anyhow::Result> { - let repo_dir = conf.tenant_path(&tenant_id); - ensure!( - !repo_dir.exists(), - "cannot create new tenant repo: '{}' directory already exists", - tenant_id + let target_tenant_directory = conf.tenant_path(&tenant_id); + anyhow::ensure!( + !target_tenant_directory.exists(), + "cannot create new tenant repo: '{tenant_id}' directory already exists", ); - // top-level dir may exist if we are creating it through CLI - crashsafe_dir::create_dir_all(&repo_dir) - .with_context(|| format!("could not create directory {}", repo_dir.display()))?; - crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; - info!("created directory structure in {}", repo_dir.display()); + let temporary_tenant_dir = + path_with_suffix_extension(&target_tenant_directory, TEMP_FILE_SUFFIX); + debug!( + "Creating temporary directory structure in {}", + temporary_tenant_dir.display() + ); - // Save tenant's config - Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; + let temporary_tenant_timelines_dir = rebase_directory( + &conf.timelines_path(&tenant_id), + &target_tenant_directory, + &temporary_tenant_dir, + )?; + let temporary_tenant_config_path = rebase_directory( + &TenantConf::path(conf, tenant_id), + &target_tenant_directory, + &temporary_tenant_dir, + )?; + + // top-level dir may exist if we are creating it through CLI + crashsafe_dir::create_dir_all(&temporary_tenant_dir).with_context(|| { + format!( + "could not create temporary tenant directory {}", + temporary_tenant_dir.display() + ) + })?; + // first, create a config in the top-level temp directory, fsync the file + Repository::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?; + // then, create a subdirectory in the top-level temp directory, fsynced + crashsafe_dir::create_dir(&temporary_tenant_timelines_dir).with_context(|| { + format!( + "could not create temporary tenant timelines directory {}", + temporary_tenant_timelines_dir.display() + ) + })?; + + fail::fail_point!("tenant-creation-before-tmp-rename", |_| { + anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); + }); + + // move-rename tmp directory with all files synced into a permanent directory, fsync its parent + fs::rename(&temporary_tenant_dir, 
&target_tenant_directory).with_context(|| { + format!( + "failed to move temporary tenant directory {} into the permanent one {}", + temporary_tenant_dir.display(), + target_tenant_directory.display() + ) + })?; + let target_dir_parent = target_tenant_directory.parent().with_context(|| { + format!( + "Failed to get tenant dir parent for {}", + target_tenant_directory.display() + ) + })?; + fs::File::open(target_dir_parent)?.sync_all()?; + + info!( + "created directory structure in {}", + target_tenant_directory.display() + ); Ok(Arc::new(Repository::new( conf, @@ -278,6 +326,17 @@ fn create_repo( ))) } +fn rebase_directory(original_path: &Path, base: &Path, new_base: &Path) -> anyhow::Result { + let relative_path = original_path.strip_prefix(base).with_context(|| { + format!( + "Failed to strip base prefix '{}' off path '{}'", + base.display(), + original_path.display() + ) + })?; + Ok(new_base.join(relative_path)) +} + pub fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, @@ -285,11 +344,11 @@ pub fn create_tenant( remote_index: RemoteIndex, ) -> anyhow::Result> { match tenants_state::write_tenants().entry(tenant_id) { - Entry::Occupied(_) => { + hash_map::Entry::Occupied(_) => { debug!("tenant {tenant_id} already exists"); Ok(None) } - Entry::Vacant(v) => { + hash_map::Entry::Vacant(v) => { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); let repo = create_repo(conf, tenant_conf, tenant_id, wal_redo_manager, remote_index)?; v.insert(Tenant { @@ -310,7 +369,7 @@ pub fn update_tenant_config( info!("configuring tenant {tenant_id}"); get_repository_for_tenant(tenant_id)?.update_tenant_config(tenant_conf); - Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; + Repository::persist_tenant_config(&TenantConf::path(conf, tenant_id), tenant_conf, false)?; Ok(()) } @@ -424,7 +483,7 @@ pub async fn detach_tenant( // we will attempt to remove files which no longer exist. This can be fixed by having shutdown // mechanism for repository that will clean temporary data to avoid any references to ephemeral files let local_tenant_directory = conf.tenant_path(&tenant_id); - std::fs::remove_dir_all(&local_tenant_directory).with_context(|| { + fs::remove_dir_all(&local_tenant_directory).with_context(|| { format!( "Failed to remove local tenant directory '{}'", local_tenant_directory.display() @@ -472,7 +531,7 @@ fn local_tenant_timeline_files( let mut local_tenant_timeline_files = TenantTimelineValues::new(); let tenants_dir = config.tenants_path(); - for tenants_dir_entry in std::fs::read_dir(&tenants_dir) + for tenants_dir_entry in fs::read_dir(&tenants_dir) .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? 
{ match &tenants_dir_entry { @@ -483,7 +542,7 @@ fn local_tenant_timeline_files( "Found temporary tenant directory, removing: {}", tenant_dir_path.display() ); - if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) { + if let Err(e) = fs::remove_dir_all(&tenant_dir_path) { error!( "Failed to remove temporary directory '{}': {:?}", tenant_dir_path.display(), @@ -545,7 +604,7 @@ fn remove_if_empty(tenant_dir_path: &Path) -> anyhow::Result { .is_none(); if directory_is_empty { - std::fs::remove_dir_all(&tenant_dir_path).with_context(|| { + fs::remove_dir_all(&tenant_dir_path).with_context(|| { format!( "Failed to remove empty directory '{}'", tenant_dir_path.display(), @@ -582,7 +641,7 @@ fn collect_timelines_for_tenant( let timelines_dir = config.timelines_path(&tenant_id); let mut tenant_timelines = HashMap::new(); - for timelines_dir_entry in std::fs::read_dir(&timelines_dir) + for timelines_dir_entry in fs::read_dir(&timelines_dir) .with_context(|| format!("Failed to list timelines dir entry for tenant {tenant_id}"))? { match timelines_dir_entry { @@ -593,7 +652,7 @@ fn collect_timelines_for_tenant( "Found temporary timeline directory, removing: {}", timeline_dir.display() ); - if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { + if let Err(e) = fs::remove_dir_all(&timeline_dir) { error!( "Failed to remove temporary directory '{}': {:?}", timeline_dir.display(), @@ -660,7 +719,7 @@ fn collect_timeline_files( .parse::() .context("Could not parse timeline id out of the timeline dir name")?; let timeline_dir_entries = - std::fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; + fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; for entry in timeline_dir_entries { let entry_path = entry.context("Failed to list timeline dir entry")?.path(); if entry_path.is_file() { @@ -671,7 +730,7 @@ fn collect_timeline_files( continue; } else if is_temporary(&entry_path) { info!("removing temp timeline file at {}", entry_path.display()); - std::fs::remove_file(&entry_path).with_context(|| { + fs::remove_file(&entry_path).with_context(|| { format!( "failed to remove temp download file at {}", entry_path.display() @@ -695,7 +754,7 @@ fn collect_timeline_files( None => anyhow::bail!("No metadata file found in the timeline directory"), }; let metadata = TimelineMetadata::from_bytes( - &std::fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, + &fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, ) .context("Failed to parse timeline metadata file bytes")?; diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 767f94d167..bd53aae25c 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,16 +1,55 @@ import os from contextlib import closing from datetime import datetime +from pathlib import Path from typing import List import pytest from fixtures.log_helper import log from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.types import Lsn, ZTenantId from prometheus_client.samples import Sample +def test_tenant_creation_fails(neon_simple_env: NeonEnv): + tenants_dir = Path(neon_simple_env.repo_dir) / "tenants" + initial_tenants = sorted( + map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) + ) + initial_tenant_dirs = set([d for d 
in tenants_dir.iterdir()]) + + neon_simple_env.pageserver.safe_psql("failpoints tenant-creation-before-tmp-rename=return") + with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"): + _ = neon_simple_env.neon_cli.create_tenant() + + new_tenants = sorted( + map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) + ) + assert initial_tenants == new_tenants, "should not create new tenants" + + new_tenant_dirs = list(set([d for d in tenants_dir.iterdir()]) - initial_tenant_dirs) + assert len(new_tenant_dirs) == 1, "should have new tenant directory created" + tmp_tenant_dir = new_tenant_dirs[0] + assert str(tmp_tenant_dir).endswith( + ".___temp" + ), "new tenant directory created should be a temporary one" + + neon_simple_env.pageserver.stop() + neon_simple_env.pageserver.start() + + tenants_after_restart = sorted( + map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) + ) + dirs_after_restart = set([d for d in tenants_dir.iterdir()]) + assert ( + tenants_after_restart == initial_tenants + ), "should load all non-corrupt tenants after restart" + assert ( + dirs_after_restart == initial_tenant_dirs + ), "pageserver should clean its temp tenant dirs on restart" + + @pytest.mark.parametrize("with_safekeepers", [False, True]) def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): if with_safekeepers: From 4f7557fb58145022450bfb926913b9016c19aab9 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 13 Sep 2022 09:45:45 +0100 Subject: [PATCH 0756/1022] github/workflows: Create projects using API (#2403) * github/actions: add neon projects related actions * workflows/benchmarking: create projects using API * workflows/pg_clients: create projects using API --- .../actions/neon-project-create/action.yml | 81 +++++++++++++ .../actions/neon-project-delete/action.yml | 54 +++++++++ .github/workflows/benchmarking.yml | 113 +++++++++++------- .github/workflows/pg_clients.yml | 18 ++- 4 files changed, 223 insertions(+), 43 deletions(-) create mode 100644 .github/actions/neon-project-create/action.yml create mode 100644 .github/actions/neon-project-delete/action.yml diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml new file mode 100644 index 0000000000..d4fced4196 --- /dev/null +++ b/.github/actions/neon-project-create/action.yml @@ -0,0 +1,81 @@ +name: 'Create Neon Project' +description: 'Create Neon Project using API' + +inputs: + api_key: + desctiption: 'Neon API key' + required: true + environment: + desctiption: 'dev (aka captest) or stage' + required: true + region_id: + desctiption: 'Region ID, if not set the project will be created in the default region' + required: false +outputs: + dsn: + description: 'Created Project DSN (for main database)' + value: ${{ steps.create-neon-project.outputs.dsn }} + project_id: + description: 'Created Project ID' + value: ${{ steps.create-neon-project.outputs.project_id }} + +runs: + using: "composite" + steps: + - name: Parse Input + id: parse-input + shell: bash -euxo pipefail {0} + run: | + case "${ENVIRONMENT}" in + dev) + API_HOST=console.dev.neon.tech + REGION_ID=${REGION_ID:-eu-west-1} + ;; + staging) + API_HOST=console.stage.neon.tech + REGION_ID=${REGION_ID:-us-east-1} + ;; + *) + echo 2>&1 "Unknown environment=${ENVIRONMENT}. 
Allowed 'dev' or 'staging' only" + exit 1 + ;; + esac + + echo "::set-output name=api_host::${API_HOST}" + echo "::set-output name=region_id::${REGION_ID}" + env: + ENVIRONMENT: ${{ inputs.environment }} + REGION_ID: ${{ inputs.region_id }} + + - name: Create Neon Project + id: create-neon-project + # A shell without `set -x` to not to expose password/dsn in logs + shell: bash -euo pipefail {0} + run: | + project=$(curl \ + "https://${API_HOST}/api/v1/projects" \ + --fail \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" \ + --data "{ + \"project\": { + \"platform_id\": \"serverless\", + \"region_id\": \"${REGION_ID}\", + \"settings\": { } + } + }") + + # Mask password + echo "::add-mask::$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .password')" + + dsn=$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .dsn')/main + echo "::add-mask::${dsn}" + echo "::set-output name=dsn::${dsn}" + + project_id=$(echo $project | jq --raw-output '.id') + echo "::set-output name=project_id::${project_id}" + env: + API_KEY: ${{ inputs.api_key }} + API_HOST: ${{ steps.parse-input.outputs.api_host }} + REGION_ID: ${{ steps.parse-input.outputs.region_id }} diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml new file mode 100644 index 0000000000..e7c6f58901 --- /dev/null +++ b/.github/actions/neon-project-delete/action.yml @@ -0,0 +1,54 @@ +name: 'Delete Neon Project' +description: 'Delete Neon Project using API' + +inputs: + api_key: + desctiption: 'Neon API key' + required: true + environment: + desctiption: 'dev (aka captest) or stage' + required: true + project_id: + desctiption: 'ID of the Project to delete' + required: true + +runs: + using: "composite" + steps: + - name: Parse Input + id: parse-input + shell: bash -euxo pipefail {0} + run: | + case "${ENVIRONMENT}" in + dev) + API_HOST=console.dev.neon.tech + ;; + staging) + API_HOST=console.stage.neon.tech + ;; + *) + echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only" + exit 1 + ;; + esac + + echo "::set-output name=api_host::${API_HOST}" + env: + ENVIRONMENT: ${{ inputs.environment }} + + - name: Delete Neon Project + shell: bash -euxo pipefail {0} + run: | + # Allow PROJECT_ID to be empty/null for cases when .github/actions/neon-project-create failed + if [ -n "${PROJECT_ID}" ]; then + curl -X "POST" \ + "https://${API_HOST}/api/v1/projects/${PROJECT_ID}/delete" \ + --fail \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" + fi + env: + API_KEY: ${{ inputs.api_key }} + PROJECT_ID: ${{ inputs.project_id }} + API_HOST: ${{ steps.parse-input.outputs.api_host }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 4c58dda6b6..49fbc74dd6 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -14,6 +14,13 @@ on: - cron: '36 4 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually + inputs: + environment: + description: 'Environment to run remote tests on (dev or staging)' + required: false + region_id: + description: 'Use a particular region. 
If empty the default one will be used' + false: true defaults: run: @@ -62,19 +69,12 @@ jobs: echo Pgbench $POSTGRES_DISTRIB_DIR/bin/pgbench --version - # FIXME cluster setup is skipped due to various changes in console API - # for now pre created cluster is used. When API gain some stability - # after massive changes dynamic cluster setup will be revived. - # So use pre created cluster. It needs to be started manually, but stop is automatic after 5 minutes of inactivity - - name: Setup cluster - env: - BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" - run: | - set -e - - echo "Starting cluster" - # wake up the cluster - $POSTGRES_DISTRIB_DIR/bin/psql $BENCHMARK_CONNSTR -c "SELECT 1" + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + environment: ${{ github.event.inputs.environment || 'staging' }} + api_key: ${{ ( github.event.inputs.environment || 'staging' ) == 'staging' && secrets.NEON_STAGING_API_KEY || secrets.NEON_CAPTEST_API_KEY }} - name: Run benchmark # pgbench is installed system wide from official repo @@ -97,7 +97,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" PLATFORM: "neon-staging" - BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally run: | # just to be sure that no data was cached on self hosted runner @@ -115,6 +115,14 @@ jobs: run: | REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh + - name: Delete Neon Project + if: ${{ always() }} + uses: ./.github/actions/neon-project-delete + with: + environment: staging + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 @@ -131,11 +139,12 @@ jobs: POSTGRES_DISTRIB_DIR: /usr TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote + SAVE_PERF_REPORT: true strategy: fail-fast: false matrix: - connstr: [ BENCHMARK_CAPTEST_CONNSTR, BENCHMARK_RDS_CONNSTR ] + platform: [ neon-captest, rds-aurora ] runs-on: dev container: @@ -147,38 +156,52 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Calculate platform - id: calculate-platform - env: - CONNSTR: ${{ matrix.connstr }} - run: | - if [ "${CONNSTR}" = "BENCHMARK_CAPTEST_CONNSTR" ]; then - PLATFORM=neon-captest - elif [ "${CONNSTR}" = "BENCHMARK_RDS_CONNSTR" ]; then - PLATFORM=rds-aurora - else - echo 2>&1 "Unknown CONNSTR=${CONNSTR}. Allowed are BENCHMARK_CAPTEST_CONNSTR, and BENCHMARK_RDS_CONNSTR only" - exit 1 - fi - - echo "::set-output name=PLATFORM::${PLATFORM}" - - name: Install Deps run: | sudo apt -y update sudo apt install -y postgresql-14 + - name: Create Neon Project + if: matrix.platform == 'neon-captest' + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + environment: ${{ github.event.inputs.environment || 'dev' }} + api_key: ${{ ( github.event.inputs.environment || 'dev' ) == 'staging' && secrets.NEON_STAGING_API_KEY || secrets.NEON_CAPTEST_API_KEY }} + + - name: Set up Connection String + id: set-up-connstr + run: | + case "${PLATFORM}" in + neon-captest) + CONNSTR=${{ steps.create-neon-project.outputs.dsn }} + ;; + rds-aurora) + CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }} + ;; + *) + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest' or 'rds-aurora'" + exit 1 + ;; + esac + + echo "::set-output name=connstr::${CONNSTR}" + + psql ${CONNSTR} -c "SELECT version();" + env: + PLATFORM: ${{ matrix.platform }} + - name: Benchmark init uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false - save_perf_report: true + save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init env: - PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} - BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + PLATFORM: ${{ matrix.platform }} + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -188,25 +211,25 @@ jobs: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false - save_perf_report: true + save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update env: - PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} - BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + PLATFORM: ${{ matrix.platform }} + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - - name: Benchmark simple-update + - name: Benchmark select-only uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false - save_perf_report: true + save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only env: - PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} - BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + PLATFORM: ${{ matrix.platform }} + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -216,6 +239,14 @@ jobs: action: generate build_type: ${{ env.BUILD_TYPE }} + - name: Delete Neon Project + if: ${{ matrix.platform == 'neon-captest' && always() }} + uses: ./.github/actions/neon-project-delete + with: + environment: dev + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_CAPTEST_API_KEY }} + - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index bf14865db2..d04d002811 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -47,11 +47,17 @@ jobs: shell: bash -euxo pipefail {0} run: ./scripts/pysync + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + environment: staging + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + - name: Run pytest env: REMOTE_ENV: 1 - BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" - + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install/v14 shell: bash -euxo pipefail {0} run: | @@ -65,6 +71,14 @@ jobs: -m "remote_cluster" \ -rA "test_runner/pg_clients" + - name: Delete Neon Project + if: ${{ always() }} + uses: ./.github/actions/neon-project-delete + with: + environment: 
staging + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + # We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI. # It will be fixed after switching to gen2 runner - name: Upload python test logs From f44afbaf62efb2910cefb671457fe60ada9163d5 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 13 Sep 2022 12:26:20 +0300 Subject: [PATCH 0757/1022] Changes of neon extension to support local prefetch (#2369) * Changes of neon extension to support local prefetch * Catch exceptions in pageserver_receive * Bump posgres version * Bump posgres version * Bump posgres version * Bump posgres version --- pgxn/neon/libpagestore.c | 158 +++++++++++++++++++++-------------- pgxn/neon/pagestore_client.h | 6 +- pgxn/neon/pagestore_smgr.c | 139 ++++++++++++++++++++++++++++-- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 5 files changed, 233 insertions(+), 74 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 649fc1037e..d0572e66cb 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -43,11 +43,6 @@ PGconn *pageserver_conn = NULL; char *page_server_connstring_raw; -static ZenithResponse *pageserver_call(ZenithRequest *request); -page_server_api api = { - .request = pageserver_call -}; - static void pageserver_connect() { @@ -154,60 +149,86 @@ retry: } -static ZenithResponse * -pageserver_call(ZenithRequest *request) +static void +pageserver_disconnect(void) +{ + /* + * If anything goes wrong while we were sending a request, it's not + * clear what state the connection is in. For example, if we sent the + * request but didn't receive a response yet, we might receive the + * response some time later after we have already sent a new unrelated + * request. Close the connection to avoid getting confused. + */ + if (connected) + { + neon_log(LOG, "dropping connection to page server due to error"); + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } +} + +static void +pageserver_send(ZenithRequest *request) { StringInfoData req_buff; + + /* If the connection was lost for some reason, reconnect */ + if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + + if (!connected) + pageserver_connect(); + + req_buff = zm_pack_request(request); + + /* + * Send request. + * + * In principle, this could block if the output buffer is full, and we + * should use async mode and check for interrupts while waiting. In + * practice, our requests are small enough to always fit in the output + * and TCP buffer. 
+ */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) + { + char* msg = PQerrorMessage(pageserver_conn); + pageserver_disconnect(); + neon_log(ERROR, "failed to send page request: %s", msg); + } + pfree(req_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) request); + neon_log(PageStoreTrace, "sent request: %s", msg); + pfree(msg); + } +} + +static ZenithResponse * +pageserver_receive(void) +{ StringInfoData resp_buff; ZenithResponse *resp; PG_TRY(); { - /* If the connection was lost for some reason, reconnect */ - if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) - { - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; - } - - if (!connected) - pageserver_connect(); - - req_buff = zm_pack_request(request); - - /* - * Send request. - * - * In principle, this could block if the output buffer is full, and we - * should use async mode and check for interrupts while waiting. In - * practice, our requests are small enough to always fit in the output - * and TCP buffer. - */ - if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) - { - neon_log(ERROR, "failed to send page request: %s", - PQerrorMessage(pageserver_conn)); - } - pfree(req_buff.data); - - if (message_level_is_interesting(PageStoreTrace)) - { - char *msg = zm_to_string((ZenithMessage *) request); - - neon_log(PageStoreTrace, "sent request: %s", msg); - pfree(msg); - } - /* read response */ resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data); resp_buff.cursor = 0; - if (resp_buff.len == -1) - neon_log(ERROR, "end of COPY"); - else if (resp_buff.len == -2) - neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); - + if (resp_buff.len < 0) + { + if (resp_buff.len == -1) + neon_log(ERROR, "end of COPY"); + else if (resp_buff.len == -2) + neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + } resp = zm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); @@ -221,20 +242,7 @@ pageserver_call(ZenithRequest *request) } PG_CATCH(); { - /* - * If anything goes wrong while we were sending a request, it's not - * clear what state the connection is in. For example, if we sent the - * request but didn't receive a response yet, we might receive the - * response some time later after we have already sent a new unrelated - * request. Close the connection to avoid getting confused. 
- */ - if (connected) - { - neon_log(LOG, "dropping connection to page server due to error"); - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; - } + pageserver_disconnect(); PG_RE_THROW(); } PG_END_TRY(); @@ -243,6 +251,32 @@ pageserver_call(ZenithRequest *request) } +static void +pageserver_flush(void) +{ + if (PQflush(pageserver_conn)) + { + char* msg = PQerrorMessage(pageserver_conn); + pageserver_disconnect(); + neon_log(ERROR, "failed to flush page requests: %s", msg); + } +} + +static ZenithResponse * +pageserver_call(ZenithRequest* request) +{ + pageserver_send(request); + pageserver_flush(); + return pageserver_receive(); +} + +page_server_api api = { + .request = pageserver_call, + .send = pageserver_send, + .flush = pageserver_flush, + .receive = pageserver_receive +}; + static bool check_zenith_id(char **newval, void **extra, GucSource source) { diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 93ea6771eb..5b21abc1bd 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -142,7 +142,10 @@ extern char *zm_to_string(ZenithMessage *msg); typedef struct { ZenithResponse *(*request) (ZenithRequest *request); -} page_server_api; + void (*send) (ZenithRequest *request); + ZenithResponse *(*receive) (void); + void (*flush) (void); +} page_server_api; extern page_server_api *page_server; @@ -171,6 +174,7 @@ extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +extern void zenith_reset_prefetch(SMgrRelation reln); extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index d49df7af58..ebf899dfdb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -57,6 +57,8 @@ #include "postmaster/interrupt.h" #include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/relfilenode.h" +#include "storage/buf_internals.h" #include "storage/md.h" #include "fmgr.h" #include "miscadmin.h" @@ -110,6 +112,49 @@ typedef enum static SMgrRelation unlogged_build_rel = NULL; static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + +/* + * Prefetch implementation: + * Prefetch is performed locally by each backend. + * There can be up to MAX_PREFETCH_REQUESTS registered using smgr_prefetch + * before smgr_read. All this requests are appended to primary smgr_read request. + * It is assumed that pages will be requested in prefetch order. + * Reading of prefetch responses is delayed until them are actually needed (smgr_read). + * It make it possible to parallelize processing and receiving of prefetched pages. + * In case of prefetch miss or any other SMGR request other than smgr_read, + * all prefetch responses has to be consumed. 
+ */ + +#define MAX_PREFETCH_REQUESTS 128 + +BufferTag prefetch_requests[MAX_PREFETCH_REQUESTS]; +BufferTag prefetch_responses[MAX_PREFETCH_REQUESTS]; +int n_prefetch_requests; +int n_prefetch_responses; +int n_prefetched_buffers; +int n_prefetch_hits; +int n_prefetch_misses; +XLogRecPtr prefetch_lsn; + +static void +consume_prefetch_responses(void) +{ + for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++) { + ZenithResponse* resp = page_server->receive(); + pfree(resp); + } + n_prefetched_buffers = 0; + n_prefetch_responses = 0; +} + +static ZenithResponse* +page_server_request(void const* req) +{ + consume_prefetch_responses(); + return page_server->request((ZenithRequest*)req); +} + + StringInfoData zm_pack_request(ZenithRequest *msg) { @@ -735,7 +780,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) .forknum = forkNum }; - resp = page_server->request((ZenithRequest *) &request); + resp = page_server_request(&request); } switch (resp->tag) @@ -948,6 +993,16 @@ zenith_close(SMgrRelation reln, ForkNumber forknum) mdclose(reln, forknum); } + +/* + * zenith_reset_prefetch() -- reoe all previously rgistered prefeth requests + */ +void +zenith_reset_prefetch(SMgrRelation reln) +{ + n_prefetch_requests = 0; +} + /* * zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation */ @@ -971,9 +1026,15 @@ zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - /* not implemented */ - elog(SmgrTrace, "[ZENITH_SMGR] prefetch noop"); - return true; + if (n_prefetch_requests < MAX_PREFETCH_REQUESTS) + { + prefetch_requests[n_prefetch_requests].rnode = reln->smgr_rnode.node; + prefetch_requests[n_prefetch_requests].forkNum = forknum; + prefetch_requests[n_prefetch_requests].blockNum = blocknum; + n_prefetch_requests += 1; + return true; + } + return false; } /* @@ -1022,7 +1083,47 @@ void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno XLogRecPtr request_lsn, bool request_latest, char *buffer) { ZenithResponse *resp; + int i; + /* + * Try to find prefetched page. + * It is assumed that pages will be requested in the same order as them are prefetched, + * but some other backend may load page in shared buffers, so some prefetch responses should + * be skipped. + */ + for (i = n_prefetched_buffers; i < n_prefetch_responses; i++) + { + resp = page_server->receive(); + if (resp->tag == T_ZenithGetPageResponse && + RelFileNodeEquals(prefetch_responses[i].rnode, rnode) && + prefetch_responses[i].forkNum == forkNum && + prefetch_responses[i].blockNum == blkno) + { + char* page = ((ZenithGetPageResponse *) resp)->page; + /* + * Check if prefetched page is still relevant. + * If it is updated by some other backend, then it should not + * be requested from smgr unless it is evicted from shared buffers. + * In the last case last_evicted_lsn should be updated and + * request_lsn should be greater than prefetch_lsn. + * Maximum with page LSN is used because page returned by page server + * may have LSN either greater either smaller than requested. 
+ */ + if (Max(prefetch_lsn, PageGetLSN(page)) >= request_lsn) + { + n_prefetched_buffers = i+1; + n_prefetch_hits += 1; + n_prefetch_requests = 0; + memcpy(buffer, page, BLCKSZ); + pfree(resp); + return; + } + } + pfree(resp); + } + n_prefetched_buffers = 0; + n_prefetch_responses = 0; + n_prefetch_misses += 1; { ZenithGetPageRequest request = { .req.tag = T_ZenithGetPageRequest, @@ -1032,10 +1133,29 @@ void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno .forknum = forkNum, .blkno = blkno }; - - resp = page_server->request((ZenithRequest *) &request); + if (n_prefetch_requests > 0) + { + /* Combine all prefetch requests with primary request */ + page_server->send((ZenithRequest *) &request); + for (i = 0; i < n_prefetch_requests; i++) + { + request.rnode = prefetch_requests[i].rnode; + request.forknum = prefetch_requests[i].forkNum; + request.blkno = prefetch_requests[i].blockNum; + prefetch_responses[i] = prefetch_requests[i]; + page_server->send((ZenithRequest *) &request); + } + page_server->flush(); + n_prefetch_responses = n_prefetch_requests; + n_prefetch_requests = 0; + prefetch_lsn = request_lsn; + resp = page_server->receive(); + } + else + { + resp = page_server->request((ZenithRequest *) &request); + } } - switch (resp->tag) { case T_ZenithGetPageResponse: @@ -1305,7 +1425,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) .forknum = forknum, }; - resp = page_server->request((ZenithRequest *) &request); + resp = page_server_request(&request); } switch (resp->tag) @@ -1365,7 +1485,7 @@ zenith_dbsize(Oid dbNode) .dbNode = dbNode, }; - resp = page_server->request((ZenithRequest *) &request); + resp = page_server_request(&request); } switch (resp->tag) @@ -1680,6 +1800,7 @@ static const struct f_smgr zenith_smgr = .smgr_unlink = zenith_unlink, .smgr_extend = zenith_extend, .smgr_prefetch = zenith_prefetch, + .smgr_reset_prefetch = zenith_reset_prefetch, .smgr_read = zenith_read, .smgr_write = zenith_write, .smgr_writeback = zenith_writeback, diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index e8518d3fc8..114676d2ed 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit e8518d3fc85e3da420d2f5a2742a21386e6585ec +Subproject commit 114676d2edd5307226d9448ec467821fdb77467d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 313769bb62..b1dbd93e2b 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 313769bb6229f46380e24d8f6ff535f9185458af +Subproject commit b1dbd93e2b1691e93860f7e59b9e1fe5a6e79786 From 1a8c8b04d70bd82a20055e2653c4aa593e3bfc34 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 7 Sep 2022 18:01:49 +0300 Subject: [PATCH 0758/1022] Merge Repository and Tenant entities, rework tenant background jobs --- control_plane/src/bin/neon_local.rs | 8 +- pageserver/src/basebackup.rs | 2 +- pageserver/src/bin/dump_layerfile.rs | 2 +- pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/bin/update_metadata.rs | 2 +- pageserver/src/config.rs | 2 +- pageserver/src/http/models.rs | 5 +- pageserver/src/http/openapi_spec.yml | 4 +- pageserver/src/http/routes.rs | 123 ++++--- pageserver/src/import_datadir.rs | 2 +- pageserver/src/lib.rs | 4 +- pageserver/src/page_cache.rs | 2 +- pageserver/src/page_service.rs | 56 ++-- pageserver/src/pgdatadir_mapping.rs | 10 +- pageserver/src/storage_sync.rs | 34 +- pageserver/src/storage_sync/delete.rs | 6 +- pageserver/src/storage_sync/download.rs | 10 +- pageserver/src/storage_sync/index.rs | 8 +- 
pageserver/src/storage_sync/upload.rs | 12 +- .../src/{layered_repository.rs => tenant.rs} | 255 +++++++++----- .../{layered_repository => tenant}/blob_io.rs | 2 +- .../block_io.rs | 2 +- .../delta_layer.rs | 12 +- .../disk_btree.rs | 2 +- .../disk_btree_test_data.rs | 0 .../ephemeral_file.rs | 14 +- .../filename.rs | 0 .../image_layer.rs | 12 +- .../inmemory_layer.rs | 12 +- .../layer_map.rs | 6 +- .../metadata.rs | 4 +- .../par_fsync.rs | 0 .../storage_layer.rs | 0 .../timeline.rs | 4 +- pageserver/src/tenant_mgr.rs | 312 +++++++----------- pageserver/src/tenant_tasks.rs | 147 ++++++--- pageserver/src/timelines.rs | 31 +- pageserver/src/walingest.rs | 25 +- .../src/walreceiver/connection_manager.rs | 20 +- .../src/walreceiver/walreceiver_connection.rs | 7 +- test_runner/regress/test_broken_timeline.py | 4 +- test_runner/regress/test_tenant_tasks.py | 8 +- test_runner/regress/test_timeline_delete.py | 5 +- 43 files changed, 615 insertions(+), 563 deletions(-) rename pageserver/src/{layered_repository.rs => tenant.rs} (88%) rename pageserver/src/{layered_repository => tenant}/blob_io.rs (98%) rename pageserver/src/{layered_repository => tenant}/block_io.rs (98%) rename pageserver/src/{layered_repository => tenant}/delta_layer.rs (98%) rename pageserver/src/{layered_repository => tenant}/disk_btree.rs (99%) rename pageserver/src/{layered_repository => tenant}/disk_btree_test_data.rs (100%) rename pageserver/src/{layered_repository => tenant}/ephemeral_file.rs (97%) rename pageserver/src/{layered_repository => tenant}/filename.rs (100%) rename pageserver/src/{layered_repository => tenant}/image_layer.rs (97%) rename pageserver/src/{layered_repository => tenant}/inmemory_layer.rs (96%) rename pageserver/src/{layered_repository => tenant}/layer_map.rs (98%) rename pageserver/src/{layered_repository => tenant}/metadata.rs (98%) rename pageserver/src/{layered_repository => tenant}/par_fsync.rs (100%) rename pageserver/src/{layered_repository => tenant}/storage_layer.rs (100%) rename pageserver/src/{layered_repository => tenant}/timeline.rs (99%) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 828d6a2e5a..e3160db53b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -543,13 +543,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an match tenant_match.subcommand() { Some(("list", _)) => { for t in pageserver.tenant_list()? { - println!( - "{} {}", - t.id, - t.state - .map(|s| s.to_string()) - .unwrap_or_else(|| String::from("")) - ); + println!("{} {:?}", t.id, t.state); } } Some(("create", create_match)) => { diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 61facc852d..eca6a3c87f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -22,8 +22,8 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; -use crate::layered_repository::Timeline; use crate::reltag::{RelTag, SlruKind}; +use crate::tenant::Timeline; use postgres_ffi::v14::pg_constants; use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName}; diff --git a/pageserver/src/bin/dump_layerfile.rs b/pageserver/src/bin/dump_layerfile.rs index 87390a1b06..7e766ce859 100644 --- a/pageserver/src/bin/dump_layerfile.rs +++ b/pageserver/src/bin/dump_layerfile.rs @@ -3,8 +3,8 @@ //! A handy tool for debugging, that's all. 
use anyhow::Result; use clap::{App, Arg}; -use pageserver::layered_repository::dump_layerfile_from_path; use pageserver::page_cache; +use pageserver::tenant::dump_layerfile_from_path; use pageserver::virtual_file; use std::path::PathBuf; use utils::project_git_version; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ec71e5b320..679c6f76e7 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -182,7 +182,7 @@ fn initialize_config( cfg_file_path.display() ); } else { - // We're initializing the repo, so there's no config file yet + // We're initializing the tenant, so there's no config file yet ( DEFAULT_CONFIG_FILE .parse::() diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs index 983fdb8647..3339564b0f 100644 --- a/pageserver/src/bin/update_metadata.rs +++ b/pageserver/src/bin/update_metadata.rs @@ -3,7 +3,7 @@ //! A handy tool for debugging, that's all. use anyhow::Result; use clap::{App, Arg}; -use pageserver::layered_repository::metadata::TimelineMetadata; +use pageserver::tenant::metadata::TimelineMetadata; use std::path::PathBuf; use std::str::FromStr; use utils::{lsn::Lsn, project_git_version}; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index fb70ea327d..56171f46e3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -19,7 +19,7 @@ use utils::{ zid::{NodeId, ZTenantId, ZTimelineId}, }; -use crate::layered_repository::TIMELINES_SEGMENT_NAME; +use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant_config::{TenantConf, TenantConfOpt}; pub mod defaults { diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 7c7d7f7b0c..0ccf23776c 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -7,8 +7,7 @@ use utils::{ zid::{NodeId, ZTenantId, ZTimelineId}, }; -// These enums are used in the API response fields. 
-use crate::tenant_mgr::TenantState; +use crate::tenant::TenantState; #[serde_as] #[derive(Serialize, Deserialize)] @@ -108,7 +107,7 @@ impl TenantConfigRequest { pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] pub id: ZTenantId, - pub state: Option, + pub state: TenantState, pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub has_in_progress_downloads: Option, } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 6beb938d6a..b9a62d0f32 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -489,6 +489,7 @@ components: type: object required: - id + - state properties: id: type: string @@ -573,7 +574,6 @@ components: required: - last_record_lsn - disk_consistent_lsn - - timeline_state properties: last_record_lsn: type: string @@ -581,8 +581,6 @@ components: disk_consistent_lsn: type: string format: hex - timeline_state: - type: string ancestor_timeline_id: type: string format: hex diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 78f83511cb..36ba2e9b66 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -11,9 +11,9 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, }; -use crate::layered_repository::Timeline; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; +use crate::tenant::{TenantState, Timeline}; use crate::tenant_config::TenantConfOpt; use crate::{config::PageServerConf, tenant_mgr, timelines}; use utils::{ @@ -132,12 +132,11 @@ fn list_local_timelines( include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> Result> { - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("Failed to get repo for tenant {tenant_id}"))?; - let repo_timelines = repo.list_timelines(); + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; + let timelines = tenant.list_timelines(); - let mut local_timeline_info = Vec::with_capacity(repo_timelines.len()); - for (timeline_id, repository_timeline) in repo_timelines { + let mut local_timeline_info = Vec::with_capacity(timelines.len()); + for (timeline_id, repository_timeline) in timelines { local_timeline_info.push(( timeline_id, local_timeline_info_from_timeline( @@ -201,23 +200,31 @@ async fn timeline_list_handler(request: Request) -> Result, query_param_present(&request, "include-non-incremental-physical-size"); check_permission(&request, Some(tenant_id))?; - let local_timeline_infos = tokio::task::spawn_blocking(move || { + let timelines = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); - list_local_timelines( - tenant_id, - include_non_incremental_logical_size, - include_non_incremental_physical_size, - ) + Ok::<_, anyhow::Error>(tenant_mgr::get_tenant(tenant_id, true)?.list_timelines()) }) .await .map_err(ApiError::from_err)??; - let mut response_data = Vec::with_capacity(local_timeline_infos.len()); - for (timeline_id, local_timeline_info) in local_timeline_infos { + let mut response_data = Vec::with_capacity(timelines.len()); + for (timeline_id, timeline) in timelines { + let local = match local_timeline_info_from_timeline( + &timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) { + Ok(local) => Some(local), + Err(e) => { + error!("Failed to convert 
tenant timeline {timeline_id} into the local one: {e:?}"); + None + } + }; + response_data.push(TimelineInfo { tenant_id, timeline_id, - local: Some(local_timeline_info), + local, remote: get_state(&request) .remote_index .read() @@ -259,28 +266,25 @@ async fn timeline_detail_handler(request: Request) -> Result(local_timeline) + let timeline = tokio::task::spawn_blocking(move || { + tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id) }) .await - .ok() - .and_then(|r| r.ok()) - .flatten(); + .map_err(ApiError::from_err)?; + + let local_timeline_info = match timeline.and_then(|timeline| { + local_timeline_info_from_timeline( + &timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) + }) { + Ok(local_info) => Some(local_info), + Err(e) => { + error!("Failed to get local timeline info: {e:#}"); + None + } + }; let remote_timeline_info = { let remote_index_read = get_state(&request).remote_index.read().await; @@ -294,25 +298,26 @@ async fn timeline_detail_handler(request: Request) -> Result((local_timeline_info, remote_timeline_info)) } .instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id)) - .await; + .await?; if local_timeline_info.is_none() && remote_timeline_info.is_none() { - return Err(ApiError::NotFound(format!( + Err(ApiError::NotFound(format!( "Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely" - ))); + ))) + } else { + json_response( + StatusCode::OK, + TimelineInfo { + tenant_id, + timeline_id, + local: local_timeline_info, + remote: remote_timeline_info, + }, + ) } - - let timeline_info = TimelineInfo { - tenant_id, - timeline_id, - local: local_timeline_info, - remote: remote_timeline_info, - }; - - json_response(StatusCode::OK, timeline_info) } // TODO makes sense to provide tenant config right away the same way as it handled in tenant_create @@ -320,10 +325,10 @@ async fn tenant_attach_handler(request: Request) -> Result, let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - info!("Handling tenant attach {}", tenant_id); + info!("Handling tenant attach {tenant_id}"); tokio::task::spawn_blocking(move || { - if tenant_mgr::get_tenant_state(tenant_id).is_some() { + if tenant_mgr::get_tenant(tenant_id, false).is_ok() { anyhow::bail!("Tenant is already present locally") }; Ok(()) @@ -426,7 +431,7 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, ApiErro check_permission(&request, Some(tenant_id))?; // if tenant is in progress of downloading it can be absent in global tenant map - let tenant_state = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant_state(tenant_id)) + let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false)) .await .map_err(ApiError::from_err)?; @@ -494,13 +499,25 @@ async fn tenant_status(request: Request) -> Result, ApiErro false }); + let tenant_state = match tenant { + Ok(tenant) => tenant.current_state(), + Err(e) => { + error!("Failed to get local tenant state: {e:#}"); + if has_in_progress_downloads { + TenantState::Paused + } else { + TenantState::Broken + } + } + }; + let current_physical_size = match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false)) .await .map_err(ApiError::from_err)? { Err(err) => { - // Getting local timelines can fail when no local repo is on disk (e.g, when tenant data is being downloaded). 
+ // Getting local timelines can fail when no local tenant directory is on disk (e.g, when tenant data is being downloaded). // In that case, put a warning message into log and operate normally. warn!("Failed to get local timelines for tenant {tenant_id}: {err}"); None diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index f8f614f8f4..ee0780f4b2 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -11,9 +11,9 @@ use bytes::Bytes; use tracing::*; use walkdir::WalkDir; -use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; +use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; use postgres_ffi::v14::relfile_utils::*; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 8b9251229e..5742568079 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -3,7 +3,6 @@ pub mod config; pub mod http; pub mod import_datadir; pub mod keyspace; -pub mod layered_repository; pub mod metrics; pub mod page_cache; pub mod page_service; @@ -13,6 +12,7 @@ pub mod reltag; pub mod repository; pub mod storage_sync; pub mod task_mgr; +pub mod tenant; pub mod tenant_config; pub mod tenant_mgr; pub mod tenant_tasks; @@ -181,7 +181,7 @@ mod backoff_defaults_tests { #[cfg(test)] mod tests { - use crate::layered_repository::repo_harness::TIMELINE_ID; + use crate::tenant::harness::TIMELINE_ID; use super::*; diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 27b1400243..15c3c22dd6 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -53,8 +53,8 @@ use utils::{ zid::{ZTenantId, ZTimelineId}, }; -use crate::layered_repository::writeback_ephemeral_file; use crate::repository::Key; +use crate::tenant::writeback_ephemeral_file; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 50; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 149144bfe4..b03dab20e0 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -34,13 +34,13 @@ use utils::{ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; -use crate::layered_repository::Timeline; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::profiling::profpoint_start; use crate::reltag::RelTag; use crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; use crate::tenant_mgr; use crate::CheckpointConfig; use postgres_ffi::v14::xlog_utils::to_pg_timestamp; @@ -477,8 +477,8 @@ impl PageServerHandler { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?; + let timeline = tenant_mgr::get_tenant(tenant_id, true)? + .create_empty_timeline(timeline_id, base_lsn)?; // TODO mark timeline as not ready until it reaches end_lsn. 
// We might have some wal to import as well, and we should prevent compute @@ -539,10 +539,7 @@ impl PageServerHandler { ) -> anyhow::Result<()> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - let timeline = repo - .get_timeline(timeline_id) - .with_context(|| format!("Timeline {timeline_id} was not found"))?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; ensure!(timeline.get_last_record_lsn() == start_lsn); // TODO leave clean state on error. For now you can use detach to clean @@ -770,7 +767,7 @@ impl PageServerHandler { // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenantid: Option) -> Result<()> { + fn check_permission(&self, tenant_id: Option) -> Result<()> { if self.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); @@ -782,7 +779,7 @@ impl PageServerHandler { .claims .as_ref() .expect("claims presence already checked"); - auth::check_permission(claims, tenantid) + auth::check_permission(claims, tenant_id) } } @@ -809,7 +806,7 @@ impl postgres_backend_async::Handler for PageServerHandler { } info!( - "jwt auth succeeded for scope: {:#?} by tenantid: {:?}", + "jwt auth succeeded for scope: {:#?} by tenant id: {:?}", data.claims.scope, data.claims.tenant_id, ); @@ -1013,8 +1010,8 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("show ".len()); let params = params_raw.split(' ').collect::>(); ensure!(params.len() == 1, "invalid param number for config command"); - let tenantid = ZTenantId::from_str(params[0])?; - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + let tenant_id = ZTenantId::from_str(params[0])?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), @@ -1027,25 +1024,27 @@ impl postgres_backend_async::Handler for PageServerHandler { RowDescriptor::int8_col(b"pitr_interval"), ]))? .write_message(&BeMessage::DataRow(&[ - Some(repo.get_checkpoint_distance().to_string().as_bytes()), + Some(tenant.get_checkpoint_distance().to_string().as_bytes()), Some( - repo.get_checkpoint_timeout() + tenant + .get_checkpoint_timeout() .as_secs() .to_string() .as_bytes(), ), - Some(repo.get_compaction_target_size().to_string().as_bytes()), + Some(tenant.get_compaction_target_size().to_string().as_bytes()), Some( - repo.get_compaction_period() + tenant + .get_compaction_period() .as_secs() .to_string() .as_bytes(), ), - Some(repo.get_compaction_threshold().to_string().as_bytes()), - Some(repo.get_gc_horizon().to_string().as_bytes()), - Some(repo.get_gc_period().as_secs().to_string().as_bytes()), - Some(repo.get_image_creation_threshold().to_string().as_bytes()), - Some(repo.get_pitr_interval().as_secs().to_string().as_bytes()), + Some(tenant.get_compaction_threshold().to_string().as_bytes()), + Some(tenant.get_gc_horizon().to_string().as_bytes()), + Some(tenant.get_gc_period().as_secs().to_string().as_bytes()), + Some(tenant.get_image_creation_threshold().to_string().as_bytes()), + Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), ]))? 
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("do_gc ") { @@ -1066,16 +1065,16 @@ impl postgres_backend_async::Handler for PageServerHandler { let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; let gc_horizon: u64 = caps .get(4) .map(|h| h.as_str().parse()) - .unwrap_or_else(|| Ok(repo.get_gc_horizon()))?; + .unwrap_or_else(|| Ok(tenant.get_gc_horizon()))?; // Use tenant's pitr setting - let pitr = repo.get_pitr_interval(); - let result = repo.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; + let pitr = tenant.get_pitr_interval(); + let result = tenant.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"layers_total"), RowDescriptor::int8_col(b"layers_needed_by_cutoff"), @@ -1169,12 +1168,7 @@ impl postgres_backend_async::Handler for PageServerHandler { } fn get_local_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Result> { - tenant_mgr::get_repository_for_tenant(tenant_id) - .and_then(|repo| { - repo.get_timeline(timeline_id) - .context("No timeline in tenant's repository") - }) - .with_context(|| format!("Could not get timeline {timeline_id} in tenant {tenant_id}")) + tenant_mgr::get_tenant(tenant_id, true).and_then(|tenant| tenant.get_timeline(timeline_id)) } /// diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ba48a77961..2454b6f54f 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,9 +7,9 @@ //! Clarify that) //! use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::layered_repository::Timeline; use crate::reltag::{RelTag, SlruKind}; use crate::repository::*; +use crate::tenant::Timeline; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; @@ -1398,16 +1398,12 @@ fn is_slru_block_key(key: Key) -> bool { && key.field6 != 0xffffffff // and not SlruSegSize } -// -//-- Tests that should work the same with any Repository/Timeline implementation. -// - #[cfg(test)] pub fn create_test_timeline( - repo: &crate::layered_repository::Repository, + tenant: &crate::tenant::Tenant, timeline_id: utils::zid::ZTimelineId, ) -> Result> { - let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; + let tline = tenant.create_empty_timeline(timeline_id, Lsn(8))?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 8ebfa6a935..c104dba298 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -46,10 +46,10 @@ //! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata. //! If the storage sync loop was successfully started before, pageserver schedules the layer files and the updated metadata file for upload, every time a layer is flushed to disk. //! The uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either). -//! See [`crate::layered_repository`] for the upload calls and the adjacent logic. +//! See [`crate::tenant`] for the upload calls and the adjacent logic. //! -//! 
Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`], -//! submitted via [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state. +//! Synchronization logic is able to communicate back with updated timeline sync states, submitted via [`crate::tenant_mgr::attach_local_tenants`] function. +//! Tenant manager applies corresponding timeline updates in pageserver's in-memory state. //! Such submissions happen in two cases: //! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future //! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory @@ -171,11 +171,11 @@ use self::{ use crate::{ config::PageServerConf, exponential_backoff, - layered_repository::metadata::{metadata_path, TimelineMetadata}, storage_sync::index::RemoteIndex, task_mgr, task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, + tenant::metadata::{metadata_path, TimelineMetadata}, tenant_mgr::attach_local_tenants, }; use crate::{ @@ -714,17 +714,17 @@ async fn storage_sync_loop( }; if tenant_entry.has_in_progress_downloads() { - info!("Tenant {tenant_id} has pending timeline downloads, skipping repository registration"); + info!("Tenant {tenant_id} has pending timeline downloads, skipping tenant registration"); continue; } else { info!( - "Tenant {tenant_id} download completed. Picking to register in repository" + "Tenant {tenant_id} download completed. Picking to register in tenant" ); // Here we assume that if tenant has no in-progress downloads that // means that it is the last completed timeline download that triggered // sync status update. So we look at the index for available timelines - // and register them all at once in a repository for download - // to be submitted in a single operation to repository + // and register them all at once in a tenant for download + // to be submitted in a single operation to tenant // so it can apply them at once to internal timeline map. timelines_to_attach.0.insert( tenant_id, @@ -737,9 +737,7 @@ async fn storage_sync_loop( } drop(index_accessor); // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. 
- if let Err(e) = attach_local_tenants(conf, &index, timelines_to_attach) { - error!("Failed to attach new timelines: {e:?}"); - }; + attach_local_tenants(conf, &index, timelines_to_attach); } } ControlFlow::Break(()) => { @@ -1038,13 +1036,7 @@ async fn update_local_metadata( timeline_id, } = sync_id; tokio::task::spawn_blocking(move || { - crate::layered_repository::save_metadata( - conf, - timeline_id, - tenant_id, - &cloned_metadata, - true, - ) + crate::tenant::save_metadata(conf, timeline_id, tenant_id, &cloned_metadata, true) }) .await .with_context(|| { @@ -1411,12 +1403,12 @@ fn register_sync_status( mod test_utils { use utils::lsn::Lsn; - use crate::layered_repository::repo_harness::RepoHarness; + use crate::tenant::harness::TenantHarness; use super::*; pub(super) async fn create_local_timeline( - harness: &RepoHarness<'_>, + harness: &TenantHarness<'_>, timeline_id: ZTimelineId, filenames: &[&str], metadata: TimelineMetadata, @@ -1456,7 +1448,7 @@ mod test_utils { #[cfg(test)] mod tests { use super::test_utils::dummy_metadata; - use crate::layered_repository::repo_harness::TIMELINE_ID; + use crate::tenant::harness::TIMELINE_ID; use hex_literal::hex; use utils::lsn::Lsn; diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 794ecbaeb3..945f5fded8 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -112,8 +112,8 @@ mod tests { use utils::lsn::Lsn; use crate::{ - layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::test_utils::{create_local_timeline, dummy_metadata}, + tenant::harness::{TenantHarness, TIMELINE_ID}, }; use remote_storage::{LocalFs, RemoteStorage}; @@ -121,7 +121,7 @@ mod tests { #[tokio::test] async fn delete_timeline_negative() -> anyhow::Result<()> { - let harness = RepoHarness::create("delete_timeline_negative")?; + let harness = TenantHarness::create("delete_timeline_negative")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( @@ -154,7 +154,7 @@ mod tests { #[tokio::test] async fn delete_timeline() -> anyhow::Result<()> { - let harness = RepoHarness::create("delete_timeline")?; + let harness = TenantHarness::create("delete_timeline")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 91ee557b79..32f228b447 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -17,7 +17,7 @@ use tokio::{ use tracing::{debug, error, info, warn}; use crate::{ - config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, + config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path, TEMP_FILE_SUFFIX, }; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; @@ -425,18 +425,18 @@ mod tests { use utils::lsn::Lsn; use crate::{ - layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::{ index::RelativePath, test_utils::{create_local_timeline, dummy_metadata}, }, + tenant::harness::{TenantHarness, TIMELINE_ID}, }; use super::*; #[tokio::test] async fn download_timeline() -> anyhow::Result<()> { - let harness = RepoHarness::create("download_timeline")?; + let harness = TenantHarness::create("download_timeline")?; let sync_queue = 
SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); @@ -537,7 +537,7 @@ mod tests { #[tokio::test] async fn download_timeline_negatives() -> anyhow::Result<()> { - let harness = RepoHarness::create("download_timeline_negatives")?; + let harness = TenantHarness::create("download_timeline_negatives")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( @@ -596,7 +596,7 @@ mod tests { #[tokio::test] async fn test_download_index_part() -> anyhow::Result<()> { - let harness = RepoHarness::create("test_download_index_part")?; + let harness = TenantHarness::create("test_download_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index b17bb40da4..cff14cde49 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -15,7 +15,7 @@ use serde_with::{serde_as, DisplayFromStr}; use tokio::sync::RwLock; use tracing::log::warn; -use crate::{config::PageServerConf, layered_repository::metadata::TimelineMetadata}; +use crate::{config::PageServerConf, tenant::metadata::TimelineMetadata}; use utils::{ lsn::Lsn, zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, @@ -340,11 +340,11 @@ mod tests { use std::collections::BTreeSet; use super::*; - use crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}; + use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; #[test] fn index_part_conversion() { - let harness = RepoHarness::create("index_part_conversion").unwrap(); + let harness = TenantHarness::create("index_part_conversion").unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); let metadata = TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); @@ -462,7 +462,7 @@ mod tests { #[test] fn index_part_conversion_negatives() { - let harness = RepoHarness::create("index_part_conversion_negatives").unwrap(); + let harness = TenantHarness::create("index_part_conversion_negatives").unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); let metadata = TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index a4285e426b..bd09e6b898 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -15,9 +15,7 @@ use super::{ LayersUpload, SyncData, SyncQueue, }; use crate::metrics::NO_LAYERS_UPLOAD; -use crate::{ - config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, -}; +use crate::{config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path}; /// Serializes and uploads the given index part data to the remote storage. 
pub(super) async fn upload_index_part( @@ -202,18 +200,18 @@ mod tests { use utils::lsn::Lsn; use crate::{ - layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::{ index::RelativePath, test_utils::{create_local_timeline, dummy_metadata}, }, + tenant::harness::{TenantHarness, TIMELINE_ID}, }; use super::{upload_index_part, *}; #[tokio::test] async fn regular_layer_upload() -> anyhow::Result<()> { - let harness = RepoHarness::create("regular_layer_upload")?; + let harness = TenantHarness::create("regular_layer_upload")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); @@ -301,7 +299,7 @@ mod tests { // Currently, GC can run between upload retries, removing local layers scheduled for upload. Test this scenario. #[tokio::test] async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> { - let harness = RepoHarness::create("layer_upload_after_local_fs_update")?; + let harness = TenantHarness::create("layer_upload_after_local_fs_update")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); @@ -396,7 +394,7 @@ mod tests { #[tokio::test] async fn test_upload_index_part() -> anyhow::Result<()> { - let harness = RepoHarness::create("test_upload_index_part")?; + let harness = TenantHarness::create("test_upload_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/tenant.rs similarity index 88% rename from pageserver/src/layered_repository.rs rename to pageserver/src/tenant.rs index ecc0bfe3b5..4ef810faba 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/tenant.rs @@ -1,6 +1,6 @@ //! //! Timeline repository implementation that keeps old data in files on disk, and -//! the recent changes in memory. See layered_repository/*_layer.rs files. +//! the recent changes in memory. See tenant/*_layer.rs files. //! The functions here are responsible for locating the correct layer for the //! get/put call, walking back the timeline branching history as needed. //! @@ -12,6 +12,7 @@ //! use anyhow::{bail, ensure, Context, Result}; +use tokio::sync::watch; use tracing::*; use std::cmp::min; @@ -71,24 +72,26 @@ use storage_layer::Layer; pub use timeline::Timeline; // re-export this function so that page_cache.rs can use it. -pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; +pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; // re-export for use in storage_sync.rs -pub use crate::layered_repository::metadata::save_metadata; +pub use crate::tenant::metadata::save_metadata; // re-export for use in walreceiver -pub use crate::layered_repository::timeline::WalReceiverInfo; +pub use crate::tenant::timeline::WalReceiverInfo; /// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; /// -/// Repository consists of multiple timelines. Keep them in a hash table. +/// Tenant consists of multiple timelines. Keep them in a hash table. /// -pub struct Repository { +pub struct Tenant { // Global pageserver config parameters pub conf: &'static PageServerConf, + state: watch::Sender, + // Overridden tenant-specific config parameters. // We keep TenantConfOpt sturct here to preserve the information // about parameters that are not set. 
@@ -114,17 +117,40 @@ pub struct Repository { upload_layers: bool, } +/// A state of a tenant in pageserver's memory. +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum TenantState { + /// Tenant is fully operational, its background jobs might be running or not. + Active { background_jobs_running: bool }, + /// A tenant is recognized by pageserver, but not yet ready to operate: + /// e.g. not present locally and being downloaded or being read into memory from the file system. + Paused, + /// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated. + Broken, +} + /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. -impl Repository { +impl Tenant { /// Get Timeline handle for given zenith timeline ID. /// This function is idempotent. It doesn't change internal state in any way. - pub fn get_timeline(&self, timeline_id: ZTimelineId) -> Option> { - self.timelines.lock().unwrap().get(&timeline_id).cloned() + pub fn get_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result> { + self.timelines + .lock() + .unwrap() + .get(&timeline_id) + .with_context(|| { + format!( + "Timeline {} was not found for tenant {}", + timeline_id, + self.tenant_id() + ) + }) + .map(Arc::clone) } - /// Lists timelines the repository contains. - /// Up to repository's implementation to omit certain timelines that ar not considered ready for use. + /// Lists timelines the tenant contains. + /// Up to tenant's implementation to omit certain timelines that ar not considered ready for use. pub fn list_timelines(&self) -> Vec<(ZTimelineId, Arc)> { self.timelines .lock() @@ -425,6 +451,54 @@ impl Repository { pub fn get_remote_index(&self) -> &RemoteIndex { &self.remote_index } + + pub fn current_state(&self) -> TenantState { + *self.state.borrow() + } + + pub fn is_active(&self) -> bool { + matches!(self.current_state(), TenantState::Active { .. }) + } + + pub fn should_run_tasks(&self) -> bool { + matches!( + self.current_state(), + TenantState::Active { + background_jobs_running: true + } + ) + } + + /// Changes tenant status to active, if it was not broken before. + /// Otherwise, ignores the state change, logging an error. + pub fn activate(&self, enable_background_jobs: bool) { + self.set_state(TenantState::Active { + background_jobs_running: enable_background_jobs, + }); + } + + pub fn set_state(&self, new_state: TenantState) { + match (self.current_state(), new_state) { + (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { + debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + } + (TenantState::Broken, _) => { + error!("Ignoring state update {new_state:?} for broken tenant"); + } + (_, new_state) => { + self.state.send_replace(new_state); + if self.should_run_tasks() { + // Spawn gc and compaction loops. The loops will shut themselves + // down when they notice that the tenant is inactive. 
+ crate::tenant_tasks::start_background_loops(self.tenant_id); + } + } + } + } + + pub fn subscribe_for_state_updates(&self) -> watch::Receiver { + self.state.subscribe() + } } /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), @@ -471,7 +545,7 @@ fn tree_sort_timelines( } /// Private functions -impl Repository { +impl Tenant { pub fn get_checkpoint_distance(&self) -> u64 { let tenant_conf = self.tenant_conf.read().unwrap(); tenant_conf @@ -609,8 +683,9 @@ impl Repository { tenant_id: ZTenantId, remote_index: RemoteIndex, upload_layers: bool, - ) -> Repository { - Repository { + ) -> Tenant { + let (state, _) = watch::channel(TenantState::Paused); + Tenant { tenant_id, conf, tenant_conf: Arc::new(RwLock::new(tenant_conf)), @@ -619,6 +694,7 @@ impl Repository { walredo_mgr, remote_index, upload_layers, + state, } } @@ -848,7 +924,7 @@ impl Repository { // compaction (both require `layer_removal_cs` lock), // but the GC iteration can run concurrently with branch creation. // - // See comments in [`Repository::branch_timeline`] for more information + // See comments in [`Tenant::branch_timeline`] for more information // about why branch creation task can run concurrently with timeline's GC iteration. for timeline in gc_timelines { if task_mgr::is_shutdown_requested() { @@ -881,7 +957,7 @@ impl Repository { } } -impl Drop for Repository { +impl Drop for Tenant { fn drop(&mut self) { remove_tenant_metrics(&self.tenant_id); } @@ -910,7 +986,7 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { } #[cfg(test)] -pub mod repo_harness { +pub mod harness { use bytes::{Bytes, BytesMut}; use once_cell::sync::Lazy; use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; @@ -920,8 +996,8 @@ pub mod repo_harness { use crate::storage_sync::index::RemoteIndex; use crate::{ config::PageServerConf, - layered_repository::Repository, repository::Key, + tenant::Tenant, walrecord::ZenithWalRecord, walredo::{WalRedoError, WalRedoManager}, }; @@ -968,7 +1044,7 @@ pub mod repo_harness { } } - pub struct RepoHarness<'a> { + pub struct TenantHarness<'a> { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, pub tenant_id: ZTenantId, @@ -979,7 +1055,7 @@ pub mod repo_harness { ), } - impl<'a> RepoHarness<'a> { + impl<'a> TenantHarness<'a> { pub fn create(test_name: &'static str) -> Result { Self::create_internal(test_name, false) } @@ -1016,14 +1092,14 @@ pub mod repo_harness { }) } - pub fn load(&self) -> Repository { - self.try_load().expect("failed to load test repo") + pub fn load(&self) -> Tenant { + self.try_load().expect("failed to load test tenant") } - pub fn try_load(&self) -> Result { + pub fn try_load(&self) -> Result { let walredo_mgr = Arc::new(TestRedoManager); - let repo = Repository::new( + let tenant = Tenant::new( self.conf, TenantConfOpt::from(self.tenant_conf), walredo_mgr, @@ -1031,7 +1107,7 @@ pub mod repo_harness { RemoteIndex::default(), false, ); - // populate repo with locally available timelines + // populate tenant with locally available timelines let mut timelines_to_load = HashMap::new(); for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) .expect("should be able to read timelines dir") @@ -1043,12 +1119,13 @@ pub mod repo_harness { .unwrap() .to_string_lossy() .parse()?; + let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?; timelines_to_load.insert(timeline_id, timeline_metadata); } - repo.init_attach_timelines(timelines_to_load)?; + 
tenant.init_attach_timelines(timelines_to_load)?; - Ok(repo) + Ok(tenant) } pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { @@ -1110,8 +1187,8 @@ mod tests { use super::metadata::METADATA_FILE_NAME; use super::*; use crate::keyspace::KeySpaceAccum; - use crate::layered_repository::repo_harness::*; use crate::repository::{Key, Value}; + use crate::tenant::harness::*; use bytes::BytesMut; use hex_literal::hex; use once_cell::sync::Lazy; @@ -1122,8 +1199,8 @@ mod tests { #[test] fn test_basic() -> Result<()> { - let repo = RepoHarness::create("test_basic")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_basic")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1144,10 +1221,10 @@ mod tests { #[test] fn no_duplicate_timelines() -> Result<()> { - let repo = RepoHarness::create("no_duplicate_timelines")?.load(); - let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("no_duplicate_timelines")?.load(); + let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) { + match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0)) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), @@ -1170,8 +1247,8 @@ mod tests { /// #[test] fn test_branch() -> Result<()> { - let repo = RepoHarness::create("test_branch")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_branch")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let writer = tline.writer(); use std::str::from_utf8; @@ -1193,8 +1270,8 @@ mod tests { //assert_current_logical_size(&tline, Lsn(0x40)); // Branch the history, modify relation differently on the new timeline - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; - let newtline = repo + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; + let newtline = tenant .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); let new_writer = newtline.writer(); @@ -1263,19 +1340,20 @@ mod tests { #[test] fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> { - let repo = - RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = + TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? + .load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 // FIXME: this doesn't actually remove any layer currently, given how the checkpointing // and compaction works. But it does set the 'cutoff' point so that the cross check // below should fail. 
- repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; // try to branch at lsn 25, should fail because we already garbage collected the data - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(err.to_string().contains("invalid branch start lsn")); @@ -1292,11 +1370,12 @@ mod tests { #[test] fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> { - let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); + let tenant = + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); - repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(&err.to_string().contains("invalid branch start lsn")); @@ -1336,36 +1415,37 @@ mod tests { #[test] fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { - let repo = - RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - let newtline = repo + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + let newtline = tenant .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); Ok(()) } #[test] fn test_parent_keeps_data_forever_after_branching() -> Result<()> { - let repo = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = + TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - let newtline = repo + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + let newtline = tenant .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); make_some_layers(newtline.as_ref(), Lsn(0x60))?; // run gc on parent - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; // Check that the data is still accessible on the branch. 
assert_eq!( @@ -1379,16 +1459,17 @@ mod tests { #[test] fn timeline_load() -> Result<()> { const TEST_NAME: &str = "timeline_load"; - let harness = RepoHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME)?; { - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; + let tenant = harness.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; make_some_layers(tline.as_ref(), Lsn(0x8000))?; tline.checkpoint(CheckpointConfig::Forced)?; } - let repo = harness.load(); - repo.get_timeline(TIMELINE_ID) + let tenant = harness.load(); + tenant + .get_timeline(TIMELINE_ID) .expect("cannot load timeline"); Ok(()) @@ -1397,18 +1478,18 @@ mod tests { #[test] fn timeline_load_with_ancestor() -> Result<()> { const TEST_NAME: &str = "timeline_load_with_ancestor"; - let harness = RepoHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME)?; // create two timelines { - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = harness.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tline.checkpoint(CheckpointConfig::Forced)?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - let newtline = repo + let newtline = tenant .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); @@ -1417,14 +1498,14 @@ mod tests { } // check that both of them are initially unloaded - let repo = harness.load(); + let tenant = harness.load(); // check that both, child and ancestor are loaded - let _child_tline = repo + let _child_tline = tenant .get_timeline(NEW_TIMELINE_ID) .expect("cannot get child timeline loaded"); - let _ancestor_tline = repo + let _ancestor_tline = tenant .get_timeline(TIMELINE_ID) .expect("cannot get ancestor timeline loaded"); @@ -1434,11 +1515,11 @@ mod tests { #[test] fn corrupt_metadata() -> Result<()> { const TEST_NAME: &str = "corrupt_metadata"; - let harness = RepoHarness::create(TEST_NAME)?; - let repo = harness.load(); + let harness = TenantHarness::create(TEST_NAME)?; + let tenant = harness.load(); - repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - drop(repo); + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -1473,8 +1554,8 @@ mod tests { #[test] fn test_images() -> Result<()> { - let repo = RepoHarness::create("test_images")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_images")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1523,8 +1604,8 @@ mod tests { // #[test] fn test_bulk_insert() -> Result<()> { - let repo = RepoHarness::create("test_bulk_insert")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_bulk_insert")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let mut lsn = Lsn(0x10); @@ -1563,8 +1644,8 @@ mod tests { #[test] fn test_random_updates() -> Result<()> { - let repo = RepoHarness::create("test_random_updates")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = 
TenantHarness::create("test_random_updates")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; const NUM_KEYS: usize = 1000; @@ -1633,8 +1714,8 @@ mod tests { #[test] fn test_traverse_branches() -> Result<()> { - let repo = RepoHarness::create("test_traverse_branches")?.load(); - let mut tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_traverse_branches")?.load(); + let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; const NUM_KEYS: usize = 1000; @@ -1667,8 +1748,8 @@ mod tests { let mut tline_id = TIMELINE_ID; for _ in 0..50 { let new_tline_id = ZTimelineId::generate(); - repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?; - tline = repo + tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; + tline = tenant .get_timeline(new_tline_id) .expect("Should have the branched timeline"); tline_id = new_tline_id; @@ -1712,8 +1793,8 @@ mod tests { #[test] fn test_traverse_ancestors() -> Result<()> { - let repo = RepoHarness::create("test_traverse_ancestors")?.load(); - let mut tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_traverse_ancestors")?.load(); + let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; @@ -1728,8 +1809,8 @@ mod tests { #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { let new_tline_id = ZTimelineId::generate(); - repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?; - tline = repo + tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; + tline = tenant .get_timeline(new_tline_id) .expect("Should have the branched timeline"); tline_id = new_tline_id; diff --git a/pageserver/src/layered_repository/blob_io.rs b/pageserver/src/tenant/blob_io.rs similarity index 98% rename from pageserver/src/layered_repository/blob_io.rs rename to pageserver/src/tenant/blob_io.rs index a4c6186056..78ecbcb9c1 100644 --- a/pageserver/src/layered_repository/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -11,8 +11,8 @@ //! len < 128: 0XXXXXXX //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! -use crate::layered_repository::block_io::{BlockCursor, BlockReader}; use crate::page_cache::PAGE_SZ; +use crate::tenant::block_io::{BlockCursor, BlockReader}; use std::cmp::min; use std::io::{Error, ErrorKind}; diff --git a/pageserver/src/layered_repository/block_io.rs b/pageserver/src/tenant/block_io.rs similarity index 98% rename from pageserver/src/layered_repository/block_io.rs rename to pageserver/src/tenant/block_io.rs index 5e32b8833a..bbcdabe1cd 100644 --- a/pageserver/src/layered_repository/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -60,7 +60,7 @@ where /// the underlying BlockReader. For example: /// /// ```no_run -/// # use pageserver::layered_repository::block_io::{BlockReader, FileBlockReader}; +/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader}; /// # let reader: FileBlockReader = todo!(); /// let cursor = reader.block_cursor(); /// let buf = cursor.read_blk(1); diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs similarity index 98% rename from pageserver/src/layered_repository/delta_layer.rs rename to pageserver/src/tenant/delta_layer.rs index af02f84bc0..ff6d3652f9 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -24,15 +24,13 @@ //! "values" part. //! 
use crate::config::PageServerConf; -use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; -use crate::layered_repository::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader}; -use crate::layered_repository::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::layered_repository::filename::{DeltaFileName, PathOrConf}; -use crate::layered_repository::storage_layer::{ - Layer, ValueReconstructResult, ValueReconstructState, -}; use crate::page_cache::{PageReadGuard, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; +use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; +use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader}; +use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::filename::{DeltaFileName, PathOrConf}; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::virtual_file::VirtualFile; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs similarity index 99% rename from pageserver/src/layered_repository/disk_btree.rs rename to pageserver/src/tenant/disk_btree.rs index c130a42a8e..33255dbd82 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -25,7 +25,7 @@ use std::{cmp::Ordering, io, result}; use thiserror::Error; use tracing::error; -use crate::layered_repository::block_io::{BlockReader, BlockWriter}; +use crate::tenant::block_io::{BlockReader, BlockWriter}; // The maximum size of a value stored in the B-tree. 5 bytes is enough currently. pub const VALUE_SZ: usize = 5; diff --git a/pageserver/src/layered_repository/disk_btree_test_data.rs b/pageserver/src/tenant/disk_btree_test_data.rs similarity index 100% rename from pageserver/src/layered_repository/disk_btree_test_data.rs rename to pageserver/src/tenant/disk_btree_test_data.rs diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs similarity index 97% rename from pageserver/src/layered_repository/ephemeral_file.rs rename to pageserver/src/tenant/ephemeral_file.rs index a1b2d68cd5..c675e4e778 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -2,11 +2,11 @@ //! used to keep in-memory layers spilled on disk. 
use crate::config::PageServerConf; -use crate::layered_repository::blob_io::BlobWriter; -use crate::layered_repository::block_io::BlockReader; use crate::page_cache; use crate::page_cache::PAGE_SZ; use crate::page_cache::{ReadBufResult, WriteBufResult}; +use crate::tenant::blob_io::BlobWriter; +use crate::tenant::block_io::BlockReader; use crate::virtual_file::VirtualFile; use once_cell::sync::Lazy; use std::cmp::min; @@ -330,13 +330,13 @@ fn to_io_error(e: anyhow::Error, context: &str) -> io::Error { #[cfg(test)] mod tests { use super::*; - use crate::layered_repository::blob_io::{BlobCursor, BlobWriter}; - use crate::layered_repository::block_io::BlockCursor; + use crate::tenant::blob_io::{BlobCursor, BlobWriter}; + use crate::tenant::block_io::BlockCursor; use rand::{seq::SliceRandom, thread_rng, RngCore}; use std::fs; use std::str::FromStr; - fn repo_harness( + fn harness( test_name: &str, ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), io::Error> { let repo_dir = PageServerConf::test_repo_dir(test_name); @@ -368,7 +368,7 @@ mod tests { #[test] fn test_ephemeral_files() -> Result<(), io::Error> { - let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?; + let (conf, tenantid, timelineid) = harness("ephemeral_files")?; let file_a = EphemeralFile::create(conf, tenantid, timelineid)?; @@ -399,7 +399,7 @@ mod tests { #[test] fn test_ephemeral_blobs() -> Result<(), io::Error> { - let (conf, tenantid, timelineid) = repo_harness("ephemeral_blobs")?; + let (conf, tenantid, timelineid) = harness("ephemeral_blobs")?; let mut file = EphemeralFile::create(conf, tenantid, timelineid)?; diff --git a/pageserver/src/layered_repository/filename.rs b/pageserver/src/tenant/filename.rs similarity index 100% rename from pageserver/src/layered_repository/filename.rs rename to pageserver/src/tenant/filename.rs diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/tenant/image_layer.rs similarity index 97% rename from pageserver/src/layered_repository/image_layer.rs rename to pageserver/src/tenant/image_layer.rs index 4fe771bb3f..518643241d 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -20,15 +20,13 @@ //! mapping from Key to an offset in the "values" part. The //! actual page images are stored in the "values" part. 
use crate::config::PageServerConf; -use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; -use crate::layered_repository::block_io::{BlockBuf, BlockReader, FileBlockReader}; -use crate::layered_repository::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::layered_repository::filename::{ImageFileName, PathOrConf}; -use crate::layered_repository::storage_layer::{ - Layer, ValueReconstructResult, ValueReconstructState, -}; use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value, KEY_SIZE}; +use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; +use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; +use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::filename::{ImageFileName, PathOrConf}; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::virtual_file::VirtualFile; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{bail, ensure, Context, Result}; diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/tenant/inmemory_layer.rs similarity index 96% rename from pageserver/src/layered_repository/inmemory_layer.rs rename to pageserver/src/tenant/inmemory_layer.rs index 5f269a868f..0e7b215b1e 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/tenant/inmemory_layer.rs @@ -5,14 +5,12 @@ //! its position in the file, is kept in memory, though. //! use crate::config::PageServerConf; -use crate::layered_repository::blob_io::{BlobCursor, BlobWriter}; -use crate::layered_repository::block_io::BlockReader; -use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter}; -use crate::layered_repository::ephemeral_file::EphemeralFile; -use crate::layered_repository::storage_layer::{ - Layer, ValueReconstructResult, ValueReconstructState, -}; use crate::repository::{Key, Value}; +use crate::tenant::blob_io::{BlobCursor, BlobWriter}; +use crate::tenant::block_io::BlockReader; +use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter}; +use crate::tenant::ephemeral_file::EphemeralFile; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::walrecord; use anyhow::{bail, ensure, Result}; use std::cell::RefCell; diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/tenant/layer_map.rs similarity index 98% rename from pageserver/src/layered_repository/layer_map.rs rename to pageserver/src/tenant/layer_map.rs index 88dcf32409..c24e3976fb 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -10,11 +10,11 @@ //! corresponding files are written to disk. //! 
-use crate::layered_repository::inmemory_layer::InMemoryLayer; -use crate::layered_repository::storage_layer::Layer; -use crate::layered_repository::storage_layer::{range_eq, range_overlaps}; use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; +use crate::tenant::inmemory_layer::InMemoryLayer; +use crate::tenant::storage_layer::Layer; +use crate::tenant::storage_layer::{range_eq, range_overlaps}; use anyhow::Result; use std::collections::VecDeque; use std::ops::Range; diff --git a/pageserver/src/layered_repository/metadata.rs b/pageserver/src/tenant/metadata.rs similarity index 98% rename from pageserver/src/layered_repository/metadata.rs rename to pageserver/src/tenant/metadata.rs index 910dba4644..4ea2b7d55b 100644 --- a/pageserver/src/layered_repository/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -1,4 +1,4 @@ -//! Every image of a certain timeline from [`crate::layered_repository::Repository`] +//! Every image of a certain timeline from [`crate::tenant::Tenant`] //! has a metadata that needs to be stored persistently. //! //! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of @@ -216,7 +216,7 @@ pub fn save_metadata( #[cfg(test)] mod tests { use super::*; - use crate::layered_repository::repo_harness::TIMELINE_ID; + use crate::tenant::harness::TIMELINE_ID; #[test] fn metadata_serializes_correctly() { diff --git a/pageserver/src/layered_repository/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs similarity index 100% rename from pageserver/src/layered_repository/par_fsync.rs rename to pageserver/src/tenant/par_fsync.rs diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs similarity index 100% rename from pageserver/src/layered_repository/storage_layer.rs rename to pageserver/src/tenant/storage_layer.rs diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/tenant/timeline.rs similarity index 99% rename from pageserver/src/layered_repository/timeline.rs rename to pageserver/src/tenant/timeline.rs index 60abbe33e6..c96ad99909 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -17,7 +17,7 @@ use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering} use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError}; use std::time::{Duration, Instant, SystemTime}; -use crate::layered_repository::{ +use crate::tenant::{ delta_layer::{DeltaLayer, DeltaLayerWriter}, ephemeral_file::is_ephemeral_file, filename::{DeltaFileName, ImageFileName}, @@ -118,7 +118,7 @@ pub struct Timeline { /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], - /// and [`Repository::delete_timeline`]. + /// and [`Tenant::delete_timeline`]. layer_removal_cs: Mutex<()>, // Needed to ensure that we can't create a branch at a point that was already garbage collected diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index a9f015229f..a8a9926c77 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -1,26 +1,31 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. 
-use crate::config::PageServerConf; -use crate::http::models::TenantInfo; -use crate::layered_repository::ephemeral_file::is_ephemeral_file; -use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}; -use crate::layered_repository::Repository; -use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; -use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; -use crate::task_mgr::{self, TaskKind}; -use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::walredo::{PostgresRedoManager, WalRedoManager}; -use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; -use anyhow::Context; -use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; use std::collections::{hash_map, HashMap, HashSet}; use std::ffi::OsStr; use std::fs; use std::path::{Path, PathBuf}; use std::sync::Arc; + +use anyhow::Context; use tracing::*; +use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; + +use crate::config::PageServerConf; +use crate::http::models::TenantInfo; +use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; +use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; +use crate::task_mgr::{self, TaskKind}; +use crate::tenant::{ + ephemeral_file::is_ephemeral_file, + metadata::{TimelineMetadata, METADATA_FILE_NAME}, + Tenant, TenantState, +}; +use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::walredo::PostgresRedoManager; +use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; + use utils::crashsafe_dir; use utils::zid::{ZTenantId, ZTimelineId}; @@ -28,64 +33,31 @@ mod tenants_state { use once_cell::sync::Lazy; use std::{ collections::HashMap, - sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}, + sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, }; use utils::zid::ZTenantId; - use crate::tenant_mgr::Tenant; + use crate::tenant::Tenant; - static TENANTS: Lazy>> = + static TENANTS: Lazy>>> = Lazy::new(|| RwLock::new(HashMap::new())); - pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap> { + pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap>> { TENANTS .read() .expect("Failed to read() tenants lock, it got poisoned") } - pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap> { + pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap>> { TENANTS .write() .expect("Failed to write() tenants lock, it got poisoned") } } -struct Tenant { - state: TenantState, - /// Contains in-memory state, including the timeline that might not yet flushed on disk or loaded form disk. - repo: Arc, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -pub enum TenantState { - // This tenant exists on local disk, and the layer map has been loaded into memory. - // The local disk might have some newer files that don't exist in cloud storage yet. - Active, - // Tenant is active, but there is no walreceiver connection. - Idle, - // This tenant exists on local disk, and the layer map has been loaded into memory. - // The local disk might have some newer files that don't exist in cloud storage yet. - // The tenant cannot be accessed anymore for any reason, but graceful shutdown. 
- Stopping, - - // Something went wrong loading the tenant state - Broken, -} - -impl std::fmt::Display for TenantState { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Active => f.write_str("Active"), - Self::Idle => f.write_str("Idle"), - Self::Stopping => f.write_str("Stopping"), - Self::Broken => f.write_str("Broken"), - } - } -} - /// Initialize repositories with locally available timelines. /// Timelines that are only partially available locally (remote storage has more data than this pageserver) -/// are scheduled for download and added to the repository once download is completed. +/// are scheduled for download and added to the tenant once download is completed. pub fn init_tenant_mgr( conf: &'static PageServerConf, remote_storage: Option, @@ -128,7 +100,7 @@ pub fn init_tenant_mgr( ) }; - attach_local_tenants(conf, &remote_index, tenants_to_attach)?; + attach_local_tenants(conf, &remote_index, tenants_to_attach); Ok(remote_index) } @@ -141,7 +113,7 @@ pub fn attach_local_tenants( conf: &'static PageServerConf, remote_index: &RemoteIndex, tenants_to_attach: TenantTimelineValues, -) -> anyhow::Result<()> { +) { let _entered = info_span!("attach_local_tenants").entered(); let number_of_tenants = tenants_to_attach.0.len(); @@ -152,104 +124,109 @@ pub fn attach_local_tenants( ); debug!("Timelines to attach: {local_timelines:?}"); - let repository = load_local_repo(conf, tenant_id, remote_index) - .context("Failed to load repository for tenant")?; - - let repo = Arc::clone(&repository); + let tenant = load_local_tenant(conf, tenant_id, remote_index); { match tenants_state::write_tenants().entry(tenant_id) { hash_map::Entry::Occupied(_) => { - anyhow::bail!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state"); + error!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state"); + continue; } hash_map::Entry::Vacant(v) => { - v.insert(Tenant { - state: TenantState::Idle, - repo, - }); + v.insert(Arc::clone(&tenant)); + } + } + } + + if tenant.current_state() == TenantState::Broken { + warn!("Skipping timeline load for broken tenant {tenant_id}") + } else { + let has_timelines = !local_timelines.is_empty(); + match tenant.init_attach_timelines(local_timelines) { + Ok(()) => { + info!("successfully loaded local timelines for tenant {tenant_id}"); + tenant.activate(has_timelines); + } + Err(e) => { + error!("Failed to attach tenant timelines: {e:?}"); + tenant.set_state(TenantState::Broken); } } } - // XXX: current timeline init enables walreceiver that looks for tenant in the state, so insert the tenant entry before - repository - .init_attach_timelines(local_timelines) - .context("Failed to attach timelines for tenant")?; } - info!("Processed {number_of_tenants} local tenants during attach"); - Ok(()) + info!("Processed {number_of_tenants} local tenants during attach") } -fn load_local_repo( +fn load_local_tenant( conf: &'static PageServerConf, tenant_id: ZTenantId, remote_index: &RemoteIndex, -) -> anyhow::Result> { - let repository = Repository::new( +) -> Arc { + let tenant = Arc::new(Tenant::new( conf, TenantConfOpt::default(), Arc::new(PostgresRedoManager::new(conf, tenant_id)), tenant_id, remote_index.clone(), conf.remote_storage_config.is_some(), - ); - let tenant_conf = Repository::load_tenant_config(conf, tenant_id)?; - repository.update_tenant_config(tenant_conf); - - Ok(Arc::new(repository)) + )); + match Tenant::load_tenant_config(conf, tenant_id) { + Ok(tenant_conf) => { + 
tenant.update_tenant_config(tenant_conf); + tenant.activate(false); + } + Err(e) => { + error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}"); + tenant.set_state(TenantState::Broken); + } + } + tenant } /// /// Shut down all tenants. This runs as part of pageserver shutdown. /// pub async fn shutdown_all_tenants() { - let tenantids = { + let tenants_to_shut_down = { let mut m = tenants_state::write_tenants(); - let mut tenantids = Vec::new(); - for (tenantid, tenant) in m.iter_mut() { - match tenant.state { - TenantState::Active | TenantState::Idle | TenantState::Stopping => { - tenant.state = TenantState::Stopping; - tenantids.push(*tenantid) - } - TenantState::Broken => {} + let mut tenants_to_shut_down = Vec::with_capacity(m.len()); + for (_, tenant) in m.drain() { + if tenant.is_active() { + // updates tenant state, forbidding new GC and compaction iterations from starting + tenant.set_state(TenantState::Paused); + tenants_to_shut_down.push(tenant) } } drop(m); - tenantids + tenants_to_shut_down }; + // Shut down all existing walreceiver connections and stop accepting the new ones. task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; // Ok, no background tasks running anymore. Flush any remaining data in // memory to disk. // // We assume that any incoming connections that might request pages from - // the repository have already been terminated by the caller, so there + // the tenant have already been terminated by the caller, so there // should be no more activity in any of the repositories. // // On error, log it but continue with the shutdown for other tenants. - for tenant_id in tenantids { + for tenant in tenants_to_shut_down { + let tenant_id = tenant.tenant_id(); debug!("shutdown tenant {tenant_id}"); - match get_repository_for_tenant(tenant_id) { - Ok(repo) => { - if let Err(err) = repo.checkpoint() { - error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); - } - } - Err(err) => { - error!("Could not get repository for tenant {tenant_id} during shutdown: {err:?}"); - } + + if let Err(err) = tenant.checkpoint() { + error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); } } } -fn create_repo( +fn create_tenant_files( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: ZTenantId, - wal_redo_manager: Arc, - remote_index: RemoteIndex, -) -> anyhow::Result> { +) -> anyhow::Result<()> { let target_tenant_directory = conf.tenant_path(&tenant_id); anyhow::ensure!( !target_tenant_directory.exists(), @@ -282,7 +259,7 @@ fn create_repo( ) })?; // first, create a config in the top-level temp directory, fsync the file - Repository::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?; + Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?; // then, create a subdirectory in the top-level temp directory, fsynced crashsafe_dir::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( @@ -312,18 +289,11 @@ fn create_repo( fs::File::open(target_dir_parent)?.sync_all()?; info!( - "created directory structure in {}", + "created tenant directory structure in {}", target_tenant_directory.display() ); - Ok(Arc::new(Repository::new( - conf, - tenant_conf, - wal_redo_manager, - tenant_id, - remote_index, - conf.remote_storage_config.is_some(), - ))) + Ok(()) } fn rebase_directory(original_path: &Path, base: &Path, new_base: &Path) -> anyhow::Result { @@ -350,12 +320,17 @@ pub fn create_tenant( } hash_map::Entry::Vacant(v) => { let 
wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); - let repo = create_repo(conf, tenant_conf, tenant_id, wal_redo_manager, remote_index)?; - v.insert(Tenant { - state: TenantState::Active, - repo, - }); - crate::tenant_tasks::start_background_loops(tenant_id); + create_tenant_files(conf, tenant_conf, tenant_id)?; + let tenant = Arc::new(Tenant::new( + conf, + tenant_conf, + wal_redo_manager, + tenant_id, + remote_index, + conf.remote_storage_config.is_some(), + )); + tenant.activate(false); + v.insert(tenant); Ok(Some(tenant_id)) } } @@ -367,70 +342,23 @@ pub fn update_tenant_config( tenant_id: ZTenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); - get_repository_for_tenant(tenant_id)?.update_tenant_config(tenant_conf); - - Repository::persist_tenant_config(&TenantConf::path(conf, tenant_id), tenant_conf, false)?; + get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf); + Tenant::persist_tenant_config(&TenantConf::path(conf, tenant_id), tenant_conf, false)?; Ok(()) } -pub fn get_tenant_state(tenantid: ZTenantId) -> Option { - Some(tenants_state::read_tenants().get(&tenantid)?.state) -} - -pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { - let old_state = { - let mut m = tenants_state::write_tenants(); - let tenant = m - .get_mut(&tenant_id) - .with_context(|| format!("Tenant not found for id {tenant_id}"))?; - let old_state = tenant.state; - tenant.state = new_state; - old_state - }; - - match (old_state, new_state) { - (TenantState::Broken, TenantState::Broken) - | (TenantState::Active, TenantState::Active) - | (TenantState::Idle, TenantState::Idle) - | (TenantState::Stopping, TenantState::Stopping) => { - debug!("tenant {tenant_id} already in state {new_state}"); - } - (TenantState::Broken, ignored) => { - debug!("Ignoring {ignored} since tenant {tenant_id} is in broken state"); - } - (_, TenantState::Broken) => { - debug!("Setting tenant {tenant_id} status to broken"); - } - (TenantState::Stopping, ignored) => { - debug!("Ignoring {ignored} since tenant {tenant_id} is in stopping state"); - } - (TenantState::Idle, TenantState::Active) => { - info!("activating tenant {tenant_id}"); - - // Spawn gc and compaction loops. The loops will shut themselves - // down when they notice that the tenant is inactive. - crate::tenant_tasks::start_background_loops(tenant_id); - } - (TenantState::Idle, TenantState::Stopping) => { - info!("stopping idle tenant {tenant_id}"); - } - (TenantState::Active, TenantState::Stopping | TenantState::Idle) => { - info!("stopping tenant {tenant_id} tasks due to new state {new_state}"); - - // Note: The caller is responsible for waiting for any tasks to finish. - } - } - - Ok(()) -} - -pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result> { +/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. +/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. 
+pub fn get_tenant(tenant_id: ZTenantId, active_only: bool) -> anyhow::Result> { let m = tenants_state::read_tenants(); let tenant = m .get(&tenant_id) - .with_context(|| format!("Tenant {tenant_id} not found"))?; - - Ok(Arc::clone(&tenant.repo)) + .with_context(|| format!("Tenant {tenant_id} not found in the local state"))?; + if active_only && !tenant.is_active() { + anyhow::bail!("Tenant {tenant_id} is not active") + } else { + Ok(Arc::clone(tenant)) + } } pub async fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { @@ -455,9 +383,14 @@ pub async fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> info!("waiting for timeline tasks to shutdown"); task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await; info!("timeline task shutdown completed"); - match tenants_state::read_tenants().get(&tenant_id) { - Some(tenant) => tenant.repo.delete_timeline(timeline_id)?, - None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"), + match get_tenant(tenant_id, true) { + Ok(tenant) => { + tenant.delete_timeline(timeline_id)?; + if tenant.list_timelines().is_empty() { + tenant.activate(false); + } + } + Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"), } Ok(()) @@ -467,21 +400,24 @@ pub async fn detach_tenant( conf: &'static PageServerConf, tenant_id: ZTenantId, ) -> anyhow::Result<()> { - set_tenant_state(tenant_id, TenantState::Stopping)?; + let tenant = match { + let mut tenants_accessor = tenants_state::write_tenants(); + tenants_accessor.remove(&tenant_id) + } { + Some(tenant) => tenant, + None => anyhow::bail!("Tenant not found for id {tenant_id}"), + }; + + tenant.set_state(TenantState::Paused); // shutdown all tenant and timeline tasks: gc, compaction, page service) task_mgr::shutdown_tasks(None, Some(tenant_id), None).await; - { - let mut tenants_accessor = tenants_state::write_tenants(); - tenants_accessor.remove(&tenant_id); - } - // If removal fails there will be no way to successfully retry detach, // because the tenant no longer exists in the in-memory map. And it needs to be removed from it - // before we remove files, because it contains references to repository + // before we remove files, because it contains references to tenant // which references ephemeral files which are deleted on drop. So if we keep these references, // we will attempt to remove files which no longer exist. This can be fixed by having shutdown - // mechanism for repository that will clean temporary data to avoid any references to ephemeral files + // mechanism for tenant that will clean temporary data to avoid any references to ephemeral files let local_tenant_directory = conf.tenant_path(&tenant_id); fs::remove_dir_all(&local_tenant_directory).with_context(|| { format!( @@ -512,7 +448,7 @@ pub fn list_tenant_info(remote_index: &RemoteTimelineIndex) -> Vec { TenantInfo { id: *id, - state: Some(tenant.state), + state: tenant.current_state(), current_physical_size: None, has_in_progress_downloads, } diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 9aaafe7f92..3ef54838af 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -1,12 +1,14 @@ //! This module contains functions to serve per-tenant background processes, //! 
such as compaction and GC +use std::ops::ControlFlow; +use std::sync::Arc; use std::time::Duration; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::{Tenant, TenantState}; use crate::tenant_mgr; -use crate::tenant_mgr::TenantState; use tracing::*; use utils::zid::ZTenantId; @@ -18,7 +20,10 @@ pub fn start_background_loops(tenant_id: ZTenantId) { None, &format!("compactor for tenant {tenant_id}"), false, - compaction_loop(tenant_id), + async move { + compaction_loop(tenant_id).await; + Ok(()) + }, ); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), @@ -27,43 +32,50 @@ pub fn start_background_loops(tenant_id: ZTenantId) { None, &format!("garbage collector for tenant {tenant_id}"), false, - gc_loop(tenant_id), + async move { + gc_loop(tenant_id).await; + Ok(()) + }, ); } /// /// Compaction task's main loop /// -async fn compaction_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { +async fn compaction_loop(tenant_id: ZTenantId) { + let wait_duration = Duration::from_secs(2); + info!("starting compaction loop for {tenant_id}"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - let result = async { + async { loop { trace!("waking up"); + let tenant = tokio::select! { + _ = task_mgr::shutdown_watcher() => { + info!("received compaction cancellation request"); + return; + }, + tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { + ControlFlow::Break(()) => return, + ControlFlow::Continue(tenant) => tenant, + }, + }; + // Run blocking part of the task - // Break if tenant is not active - if tenant_mgr::get_tenant_state(tenant_id) != Some(TenantState::Active) { - break Ok(()); - } - // This should not fail. If someone started us, it means that the tenant exists. - // And before you remove a tenant, you have to wait until all the associated tasks - // exit. - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - // Run compaction - let mut sleep_duration = repo.get_compaction_period(); - if let Err(e) = repo.compaction_iteration() { - error!("Compaction failed, retrying: {}", e); - sleep_duration = Duration::from_secs(2) + let mut sleep_duration = tenant.get_compaction_period(); + if let Err(e) = tenant.compaction_iteration() { + error!("Compaction failed, retrying: {e:#}"); + sleep_duration = wait_duration; } // Sleep tokio::select! { _ = task_mgr::shutdown_watcher() => { - trace!("received cancellation request"); - break Ok(()); + info!("received compaction cancellation request during idling"); + break ; }, _ = tokio::time::sleep(sleep_duration) => {}, } @@ -72,49 +84,49 @@ async fn compaction_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { .await; TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - info!( - "compaction loop stopped. State is {:?}", - tenant_mgr::get_tenant_state(tenant_id) - ); - result + trace!("compaction loop stopped."); } /// /// GC task's main loop /// -async fn gc_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { +async fn gc_loop(tenant_id: ZTenantId) { + let wait_duration = Duration::from_secs(2); + info!("starting gc loop for {tenant_id}"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - let result = async { + async { loop { trace!("waking up"); - // Break if tenant is not active - if tenant_mgr::get_tenant_state(tenant_id) != Some(TenantState::Active) { - break Ok(()); - } - // This should not fail. If someone started us, it means that the tenant exists. 
- // And before you remove a tenant, you have to wait until all the associated tasks - // exit. - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + let tenant = tokio::select! { + _ = task_mgr::shutdown_watcher() => { + info!("received GC cancellation request"); + return; + }, + tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { + ControlFlow::Break(()) => return, + ControlFlow::Continue(tenant) => tenant, + }, + }; // Run gc - let gc_period = repo.get_gc_period(); - let gc_horizon = repo.get_gc_horizon(); + let gc_period = tenant.get_gc_period(); + let gc_horizon = tenant.get_gc_horizon(); let mut sleep_duration = gc_period; if gc_horizon > 0 { - if let Err(e) = repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false) + if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false) { - error!("Gc failed, retrying: {}", e); - sleep_duration = Duration::from_secs(2) + error!("Gc failed, retrying: {e:#}"); + sleep_duration = wait_duration; } } // Sleep tokio::select! { _ = task_mgr::shutdown_watcher() => { - trace!("received cancellation request"); - break Ok(()); + info!("received GC cancellation request during idling"); + break; }, _ = tokio::time::sleep(sleep_duration) => {}, } @@ -122,9 +134,50 @@ async fn gc_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { } .await; TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - info!( - "GC loop stopped. State is {:?}", - tenant_mgr::get_tenant_state(tenant_id) - ); - result + trace!("GC loop stopped."); +} + +async fn wait_for_active_tenant( + tenant_id: ZTenantId, + wait: Duration, +) -> ControlFlow<(), Arc> { + let tenant = loop { + match tenant_mgr::get_tenant(tenant_id, false) { + Ok(tenant) => break tenant, + Err(e) => { + error!("Failed to get a tenant {tenant_id}: {e:#}"); + tokio::time::sleep(wait).await; + } + } + }; + + // if the tenant has a proper status already, no need to wait for anything + if tenant.should_run_tasks() { + ControlFlow::Continue(tenant) + } else { + let mut tenant_state_updates = tenant.subscribe_for_state_updates(); + loop { + match tenant_state_updates.changed().await { + Ok(()) => { + let new_state = *tenant_state_updates.borrow(); + match new_state { + TenantState::Active { + background_jobs_running: true, + } => { + debug!("Tenant state changed to active with background jobs enabled, continuing the task loop"); + return ControlFlow::Continue(tenant); + } + state => { + debug!("Not running the task loop, tenant is not active with background jobs enabled: {state:?}"); + tokio::time::sleep(wait).await; + } + } + } + Err(_sender_dropped_error) => { + info!("Tenant dropped the state updates sender, quitting waiting for tenant and the task loop"); + return ControlFlow::Break(()); + } + } + } + } } diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 35dec54d5c..69d14babf0 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -2,34 +2,28 @@ //! 
Timeline management code // -use anyhow::{bail, Context, Result}; -use remote_storage::path_with_suffix_extension; - use std::{ fs, path::Path, process::{Command, Stdio}, sync::Arc, }; + +use anyhow::{bail, Context, Result}; use tracing::*; +use remote_storage::path_with_suffix_extension; use utils::{ lsn::Lsn, zid::{ZTenantId, ZTimelineId}, }; use crate::config::PageServerConf; -use crate::layered_repository::{Repository, Timeline}; +use crate::tenant::{Tenant, Timeline}; use crate::tenant_mgr; use crate::CheckpointConfig; use crate::{import_datadir, TEMP_FILE_SUFFIX}; -#[derive(Debug, Clone, Copy)] -pub struct PointInTime { - pub timeline_id: ZTimelineId, - pub lsn: Lsn, -} - // Create the cluster temporarily in 'initdbpath' directory inside the repository // to get bootstrap data for timeline initialization. // @@ -69,7 +63,7 @@ fn bootstrap_timeline( conf: &'static PageServerConf, tenant_id: ZTenantId, timeline_id: ZTimelineId, - repo: &Repository, + tenant: &Tenant, ) -> Result> { // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` // temporary directory for basebackup files for the given timeline. @@ -89,7 +83,7 @@ fn bootstrap_timeline( // LSN, and any WAL after that. // Initdb lsn will be equal to last_record_lsn which will be set after import. // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. - let timeline = repo.create_empty_timeline(timeline_id, lsn)?; + let timeline = tenant.create_empty_timeline(timeline_id, lsn)?; import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; fail::fail_point!("before-checkpoint-new-timeline", |_| { @@ -127,16 +121,16 @@ pub(crate) async fn create_timeline( mut ancestor_start_lsn: Option, ) -> Result>> { let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { - debug!("timeline {} already exists", new_timeline_id); + debug!("timeline {new_timeline_id} already exists"); return Ok(None); } let loaded_timeline = match ancestor_timeline_id { Some(ancestor_timeline_id) => { - let ancestor_timeline = repo + let ancestor_timeline = tenant .get_timeline(ancestor_timeline_id) .context("Cannot branch off the timeline that's not present in pageserver")?; @@ -162,10 +156,13 @@ pub(crate) async fn create_timeline( } } - repo.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? + tenant.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? } - None => bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?, + None => bootstrap_timeline(conf, tenant_id, new_timeline_id, &tenant)?, }; + // Have added new timeline into the tenant, now its background tasks are needed. 
+ tenant.activate(true); + Ok(Some(loaded_timeline)) } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 57592a46d3..45d0916dec 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -30,9 +30,9 @@ use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; -use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; +use crate::tenant::Timeline; use crate::walrecord::*; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; use postgres_ffi::v14::pg_constants; @@ -1022,16 +1022,13 @@ impl<'a> WalIngest<'a> { } } -/// -/// Tests that should work the same with any Repository/Timeline implementation. -/// #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { use super::*; - use crate::layered_repository::repo_harness::*; - use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::create_test_timeline; + use crate::tenant::harness::*; + use crate::tenant::Timeline; use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; @@ -1061,8 +1058,8 @@ mod tests { #[test] fn test_relsize() -> Result<()> { - let repo = RepoHarness::create("test_relsize")?.load(); - let tline = create_test_timeline(&repo, TIMELINE_ID)?; + let tenant = TenantHarness::create("test_relsize")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1189,8 +1186,8 @@ mod tests { // and then created it again within the same layer. #[test] fn test_drop_extend() -> Result<()> { - let repo = RepoHarness::create("test_drop_extend")?.load(); - let tline = create_test_timeline(&repo, TIMELINE_ID)?; + let tenant = TenantHarness::create("test_drop_extend")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1229,8 +1226,8 @@ mod tests { // and then extended it again within the same layer. #[test] fn test_truncate_extend() -> Result<()> { - let repo = RepoHarness::create("test_truncate_extend")?.load(); - let tline = create_test_timeline(&repo, TIMELINE_ID)?; + let tenant = TenantHarness::create("test_truncate_extend")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; // Create a 20 MB relation (the size is arbitrary) @@ -1317,8 +1314,8 @@ mod tests { /// split into multiple 1 GB segments in Postgres. 
#[test] fn test_large_rel() -> Result<()> { - let repo = RepoHarness::create("test_large_rel")?.load(); - let tline = create_test_timeline(&repo, TIMELINE_ID)?; + let tenant = TenantHarness::create("test_large_rel")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 1fcb768ddf..69e400f291 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -16,10 +16,10 @@ use std::{ time::Duration, }; -use crate::layered_repository::Timeline; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::task_mgr::WALRECEIVER_RUNTIME; +use crate::tenant::Timeline; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -767,11 +767,11 @@ fn wal_stream_connection_string( #[cfg(test)] mod tests { use super::*; - use crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}; + use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; #[test] fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("no_connection_no_candidate")?; + let harness = TenantHarness::create("no_connection_no_candidate")?; let mut state = dummy_state(&harness); let now = Utc::now().naive_utc(); @@ -857,7 +857,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("connection_no_candidate")?; + let harness = TenantHarness::create("connection_no_candidate")?; let mut state = dummy_state(&harness); let now = Utc::now().naive_utc(); @@ -948,7 +948,7 @@ mod tests { #[test] fn no_connection_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("no_connection_candidate")?; + let harness = TenantHarness::create("no_connection_candidate")?; let mut state = dummy_state(&harness); let now = Utc::now().naive_utc(); @@ -1053,7 +1053,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = RepoHarness::create("candidate_with_many_connection_failures")?; + let harness = TenantHarness::create("candidate_with_many_connection_failures")?; let mut state = dummy_state(&harness); let now = Utc::now().naive_utc(); @@ -1117,7 +1117,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; let mut state = dummy_state(&harness); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1204,7 +1204,7 @@ mod tests { #[tokio::test] async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("timeout_connection_threshhold_current_candidate")?; + let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?; let mut state = dummy_state(&harness); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1276,7 +1276,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("timeout_wal_over_threshhold_current_candidate")?; + let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?; let mut state = dummy_state(&harness); let 
current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1353,7 +1353,7 @@ mod tests { const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr"; - fn dummy_state(harness: &RepoHarness) -> WalreceiverState { + fn dummy_state(harness: &TenantHarness) -> WalreceiverState { WalreceiverState { id: ZTenantTimelineId { tenant_id: harness.tenant_id, diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index e8fa9f9aca..6f1fbc2c9d 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -21,10 +21,10 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::{ - layered_repository::{Timeline, WalReceiverInfo}, task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, + tenant::{Timeline, WalReceiverInfo}, tenant_mgr, walingest::WalIngest, walrecord::DecodedWALRecord, @@ -141,8 +141,7 @@ pub async fn handle_walreceiver_connection( let tenant_id = timeline.tenant_id; let timeline_id = timeline.timeline_id; - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("no repository found for tenant {tenant_id}"))?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; // // Start streaming the WAL, from where we left off previously. @@ -283,7 +282,7 @@ pub async fn handle_walreceiver_connection( })?; if let Some(last_lsn) = status_update { - let remote_index = repo.get_remote_index(); + let remote_index = tenant.get_remote_index(); let timeline_remote_consistent_lsn = remote_index .read() .await diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 1d083b3ef9..ce3a74930e 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -71,7 +71,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # First timeline would not get loaded into pageserver due to corrupt metadata file with pytest.raises( - Exception, match=f"Could not get timeline {timeline1} in tenant {tenant1}" + Exception, match=f"Timeline {timeline1} was not found for tenant {tenant1}" ) as err: pg1.start() log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") @@ -80,7 +80,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # We don't have the remote storage enabled, which means timeline is in an incorrect state, # it's not loaded at all with pytest.raises( - Exception, match=f"Could not get timeline {timeline2} in tenant {tenant2}" + Exception, match=f"Timeline {timeline2} was not found for tenant {tenant2}" ) as err: pg2.start() log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 315ec7f306..1214d703d0 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -40,11 +40,16 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): for t in timelines: client.timeline_delete(tenant, t) + def assert_active_without_jobs(tenant): + assert get_state(tenant) == {"Active": {"background_jobs_running": False}} + # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline(name, tenant_id=tenant) pg = env.postgres.create_start(name, tenant_id=tenant) - assert get_state(tenant) == "Active" + 
assert get_state(tenant) == { + "Active": {"background_jobs_running": True} + }, "Pageserver should activate a tenant and start background jobs if timelines are loaded" # Stop compute pg.stop() @@ -53,6 +58,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): for tenant_info in client.tenant_list(): tenant_id = ZTenantId(tenant_info["id"]) delete_all_timelines(tenant_id) + wait_until(10, 0.2, lambda: assert_active_without_jobs(tenant_id)) # Assert that all tasks finish quickly after tenant is detached assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0 diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index a5dadc535b..5a20dbd232 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -18,7 +18,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): invalid_tenant_id = ZTenantId.generate() with pytest.raises( NeonPageserverApiException, - match=f"Tenant {invalid_tenant_id} not found in local tenant state", + match=f"Tenant {invalid_tenant_id} not found in the local state", ): ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) @@ -64,7 +64,8 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # check 404 with pytest.raises( - NeonPageserverApiException, match="is not found neither locally nor remotely" + NeonPageserverApiException, + match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} is not found neither locally nor remotely", ): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) From 59d04ab66aa68be3a7b3cd7997182f9b62636190 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 13 Sep 2022 18:24:11 +0100 Subject: [PATCH 0759/1022] test_runner: redact passwords from log messages (#2434) --- test_runner/fixtures/log_helper.py | 13 +++++++++++++ test_runner/fixtures/neon_fixtures.py | 3 ++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/log_helper.py b/test_runner/fixtures/log_helper.py index 17f2402391..7d112fce89 100644 --- a/test_runner/fixtures/log_helper.py +++ b/test_runner/fixtures/log_helper.py @@ -1,5 +1,6 @@ import logging import logging.config +import re """ This file configures logging to use in python tests. @@ -29,6 +30,17 @@ LOGGING = { } +class PasswordFilter(logging.Filter): + """Filter out password from logs.""" + + # Good enough to filter our passwords produced by PgProtocol.connstr + FILTER = re.compile(r"(\s*)password=[^\s]+(\s*)") + + def filter(self, record: logging.LogRecord) -> bool: + record.msg = self.FILTER.sub(r"\1password=\2", str(record.msg)) + return True + + def getLogger(name="root") -> logging.Logger: """Method to get logger for tests. 
@@ -38,5 +50,6 @@ def getLogger(name="root") -> logging.Logger: # default logger for tests log = getLogger() +log.addFilter(PasswordFilter()) logging.config.dictConfig(LOGGING) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b47e560325..69c6d31315 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -125,7 +125,8 @@ def pytest_configure(config): if env_neon_bin: neon_binpath = env_neon_bin else: - neon_binpath = os.path.join(base_dir, "target/debug") + build_type = os.environ.get("BUILD_TYPE", "debug") + neon_binpath = os.path.join(base_dir, "target", build_type) log.info(f"neon_binpath is {neon_binpath}") if not os.path.exists(os.path.join(neon_binpath, "pageserver")): raise Exception('neon binaries not found at "{}"'.format(neon_binpath)) From db0c49148db3bbc74d314313b601e6f1e7c0be3a Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 13 Sep 2022 20:07:16 +0300 Subject: [PATCH 0760/1022] clean up metrics in handle_pagerequests --- pageserver/src/page_service.rs | 53 +++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b03dab20e0..388f40f916 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -362,6 +362,39 @@ async fn page_service_conn_main( } } +struct PageRequestMetrics { + get_rel_exists: metrics::Histogram, + get_rel_size: metrics::Histogram, + get_page_at_lsn: metrics::Histogram, + get_db_size: metrics::Histogram, +} + +impl PageRequestMetrics { + fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self { + let tenant_id = tenant_id.to_string(); + let timeline_id = timeline_id.to_string(); + + let get_rel_exists = + SMGR_QUERY_TIME.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]); + + let get_rel_size = + SMGR_QUERY_TIME.with_label_values(&["get_rel_size", &tenant_id, &timeline_id]); + + let get_page_at_lsn = + SMGR_QUERY_TIME.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]); + + let get_db_size = + SMGR_QUERY_TIME.with_label_values(&["get_db_size", &tenant_id, &timeline_id]); + + Self { + get_rel_exists, + get_rel_size, + get_page_at_lsn, + get_db_size, + } + } +} + #[derive(Debug)] struct PageServerHandler { conf: &'static PageServerConf, @@ -396,6 +429,8 @@ impl PageServerHandler { pgb.write_message(&BeMessage::CopyBothResponse)?; pgb.flush().await?; + let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id); + loop { let msg = tokio::select! 
{ biased; @@ -420,32 +455,22 @@ impl PageServerHandler { trace!("query: {:?}", copy_data_bytes); let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; - let tenant_str = tenant_id.to_string(); - let timeline_str = timeline_id.to_string(); let response = match zenith_fe_msg { PagestreamFeMessage::Exists(req) => { - let _timer = SMGR_QUERY_TIME - .with_label_values(&["get_rel_exists", &tenant_str, &timeline_str]) - .start_timer(); + let _timer = metrics.get_rel_exists.start_timer(); self.handle_get_rel_exists_request(&timeline, &req).await } PagestreamFeMessage::Nblocks(req) => { - let _timer = SMGR_QUERY_TIME - .with_label_values(&["get_rel_size", &tenant_str, &timeline_str]) - .start_timer(); + let _timer = metrics.get_rel_size.start_timer(); self.handle_get_nblocks_request(&timeline, &req).await } PagestreamFeMessage::GetPage(req) => { - let _timer = SMGR_QUERY_TIME - .with_label_values(&["get_page_at_lsn", &tenant_str, &timeline_str]) - .start_timer(); + let _timer = metrics.get_page_at_lsn.start_timer(); self.handle_get_page_at_lsn_request(&timeline, &req).await } PagestreamFeMessage::DbSize(req) => { - let _timer = SMGR_QUERY_TIME - .with_label_values(&["get_db_size", &tenant_str, &timeline_str]) - .start_timer(); + let _timer = metrics.get_db_size.start_timer(); self.handle_db_size_request(&timeline, &req).await } }; From d4d57ea2ddb49c6d40b90e171188dbeecee8f9fe Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 13 Sep 2022 19:26:26 +0100 Subject: [PATCH 0761/1022] github/workflows: fix project creation via API (#2437) --- .github/actions/neon-project-create/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index d4fced4196..ba81afaaff 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -60,7 +60,7 @@ runs: --header "Authorization: Bearer ${API_KEY}" \ --data "{ \"project\": { - \"platform_id\": \"serverless\", + \"platform_id\": \"aws\", \"region_id\": \"${REGION_ID}\", \"settings\": { } } From 1d53173e62673aecc9e2c73ab6ba6f0488249207 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 13 Sep 2022 20:41:26 +0300 Subject: [PATCH 0762/1022] update openapi spec (tenant state has changed) --- pageserver/src/http/openapi_spec.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b9a62d0f32..1f2eba05ec 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -494,7 +494,13 @@ components: id: type: string state: - type: string + oneOf: + - type: string + - type: object + properties: + background_jobs_running: + type: boolean + current_physical_size: type: integer has_in_progress_downloads: From 32b7259d5e639e3dd16e3758a1534f0f47d9a6f2 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Sep 2022 22:37:20 +0300 Subject: [PATCH 0763/1022] Timeline data management RFC (#2152) --- docs/SUMMARY.md | 1 + docs/rfcs/017-timeline-data-management.md | 413 ++++++++++++++++++ .../lock_legend.svg | 4 + .../proposed_timeline_data_access_sync_1.svg | 4 + .../proposed_timeline_data_access_sync_2.svg | 4 + .../proposed_timeline_tenant_state.svg | 4 + .../timeline_data_access_sync_1.svg | 4 + .../timeline_data_access_sync_2.svg | 4 + .../timeline_tenant_state.svg | 4 + 9 files changed, 442 insertions(+) create mode 100644 
docs/rfcs/017-timeline-data-management.md
 create mode 100644 docs/rfcs/images/017-timeline-data-management/lock_legend.svg
 create mode 100644 docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg
 create mode 100644 docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg
 create mode 100644 docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg
 create mode 100644 docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg
 create mode 100644 docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg
 create mode 100644 docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md
index 95ac512ea8..fb6467ffd5 100644
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -79,4 +79,5 @@
 - [014-storage-lsm](rfcs/014-storage-lsm.md)
 - [015-storage-messaging](rfcs/015-storage-messaging.md)
 - [016-connection-routing](rfcs/016-connection-routing.md)
+- [017-timeline-data-management](rfcs/017-timeline-data-management.md)
 - [cluster-size-limits](rfcs/cluster-size-limits.md)
diff --git a/docs/rfcs/017-timeline-data-management.md b/docs/rfcs/017-timeline-data-management.md
new file mode 100644
index 0000000000..a8ca3c7ca9
--- /dev/null
+++ b/docs/rfcs/017-timeline-data-management.md
@@ -0,0 +1,413 @@
+# Name
+
+Tenant and timeline data management in pageserver
+
+## Summary
+
+This RFC attempts to describe timeline-related data management as it is done now in pageserver, highlight the current complexities it causes, and propose a set of changes to mitigate them.
+
+The main goal is to prepare for future [on-demand layer downloads](https://github.com/neondatabase/neon/issues/2029), yet timeline data is one of the core primitives of pageserver, so a number of other RFCs are affected as well.
+Due to that, this document won't have a single implementation; rather, it requires a set of code changes to achieve the final state.
+
+The RFC considers the repository at the `main` branch, commit [`28243d68e60ffc7e69f158522f589f7d2e09186d`](https://github.com/neondatabase/neon/tree/28243d68e60ffc7e69f158522f589f7d2e09186d) at the time of writing.
+
+## Motivation
+
+In recent discussions, it became clear that timeline-related code is getting harder to change: it consists of multiple disjoint modules, each requiring synchronization to access.
+The lower the code is, the more complex the synchronization gets, since many concurrent processes are involved and require orchestration to keep the data consistent.
+As the number of modules and pieces of isolated data per timeline grows, more questions and corner cases arise:
+
+- https://github.com/neondatabase/neon/issues/1559
+  right now it's not straightened out what to do when the synchronization task fails too many times: every separate module's data has to be treated differently.
+
+- https://github.com/neondatabase/neon/issues/1751
+  GC and compaction file activities are not well known outside their tasks' code, causing race bugs
+
+- https://github.com/neondatabase/neon/issues/2003
+  Even the tenant management gets affected: we have to alter its state based on timeline state, yet the data for making the decision is kept separately and the synchronization logic has bugs
+
+- more issues were brought up in discussions, but they were too specific to the code to mention in the issues.
+ For instance, `tenant_mgr` itself is a static object that we can not mock anyhow, which reduces our capabilities to test the data synchronization logic. + In fact, we have zero Rust tests that cover the case of synchronizing more than one module's data. + +On demand layer downloads would require us to dynamically manage the layer files, which we almost not doing at all on the module level, resulting in the most of their APIs dealing with timelines, rather than the layer files. +The disjoint data that would require data synchronization with possibly a chain of lock acquisitions, some async and some sync, and it would be hard to unit test it with the current code state. + +Neither this helps to easy start the on-demand download epic, nor it's easy to add more timeline-related code on top, whatever the task is. +We have to develop a vision on a number of topics before progressing safely: + +- timeline and tenant data structure and how should we access it +- sync and async worlds and in what way that should evolve +- unit tests for the complex logic + +This RFC aims to provide a general overview of the existing situation and propose ways to improve it. +The changes proposed are quite big and no single PR is expected to do the adjustments, they should gradually be done during the on-demand download work later. + +## What is a timeline and its data + +First, we need to define what data we want to manage per timeline. +Currently, the data every timeline operates is: + +- a set of layer files, on the FS + + Never updated files, created after pageserver's checkpoints and compaction runs, can be removed from the local FS due to compaction, gc or timeline deletion. + +- a set of layer files, on the remote storage + + Identically named and placed in tenant subdirectories files on the remote storage (S3), copied by a special background sync thread + +- a `metadata` file, on the FS + + Updated after every checkpoint with the never `disk_consistent_lsn` and `latest_gc_cutoff_lsn` values. Used to quickly restore timeline's basic metadata on pageserver restart. + Also contains data about the ancestor, if the timeline was branched off another timeline. + +- an `index_part.json` file, on the remote storage + + Contains `metadata` file contents and a list of layer files, available in the current S3 "directory" for the timeline. + Used to avoid potentially slow and expensive `S3 list` command, updated by the remotes storage sync thread after every operation with the remote layer files. + +- LayerMap and PageCache, in memory + + Dynamic, used to store and retrieve the page data to users. + +- timeline info, in memory + + LSNs, walreceiver data, `RemoteTimelineIndex` and other data to share via HTTP API and internal processes. + +- metrics data, in memory + + Data to push or provide to Prometheus, Opentelemetry, etc. + +Besides the data, every timeline currently needs an etcd connection to receive WAL events and connect to safekeepers. + +Timeline could be an ancestor to another one, forming a dependency tree, which is implicit right now: every time relations are looked up in place, based on the corresponding `TimelineMetadata` struct contents. +Yet, there's knowledge on a tenant as a group of timelines, belonging to a single user which is used in GC and compaction tasks, run on every tenant. +`tenant_mgr` manages tenant creation and its task startup, along with the remote storage sync for timeline layers. 
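To make the above more concrete, here is a minimal sketch, using hypothetical type and field names, of how the per-timeline pieces listed in this section hang together under a tenant. It only illustrates how scattered the state is conceptually; these are not the actual pageserver types.

```rust
// Illustration only: hypothetical names for the per-timeline data described above
// (local/remote layer files, `metadata`, `index_part.json`, shared in-memory info).
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::{Arc, RwLock};

type Lsn = u64; // stand-in for the real Lsn type
type TimelineId = [u8; 16];
type TenantId = [u8; 16];

struct TimelineMetadata {
    disk_consistent_lsn: Lsn,
    latest_gc_cutoff_lsn: Lsn,
    ancestor: Option<TimelineId>, // set when the timeline was branched off another one
}

struct TimelineState {
    local_layer_files: Vec<PathBuf>, // layer files on the local FS
    remote_layer_files: Vec<String>, // names listed in `index_part.json` on S3
    metadata: TimelineMetadata,      // contents of the local `metadata` file
    last_record_lsn: Lsn,            // part of the "timeline info" exposed over HTTP
    // the layer map, page cache and metrics live in other modules today
}

struct Tenant {
    timelines: HashMap<TimelineId, Arc<RwLock<TimelineState>>>,
    // the tenant config file contents would conceptually belong here too
}

fn main() {
    let tenant = Tenant { timelines: HashMap::new() };
    assert!(tenant.timelines.is_empty());
}
```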
+ +Last file being managed per-tenant is the tenant config file, created and updated on the local FS to hold tenant-specific configuration between restarts. +It's not yet anyhow synchronized with the remote storage, so only exists on the local FS. + +### How the data is stored + +We have multiple places where timeline data is stored: + +- `tenant_mgr` [holds](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L43) a static `static ref TENANTS: RwLock>` with the `Tenant` having the `local_timelines: HashMap>` inside + +- same `Tenant` above has actually two references to timelines: another via its `repo: Arc` with `pub type RepositoryImpl = LayeredRepository;` that [holds](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/layered_repository.rs#L178) `Mutex>` + +- `RemoteTimelineIndex` [contains](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/storage_sync/index.rs#L84) the metadata about timelines on the remote storage (S3) for sync reasons and possible HTTP API queries + +- `walreceiver` [stores](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/walreceiver.rs#L60) the metadata for possible HTTP API queries and its [internal state](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/walreceiver/connection_manager.rs#L245) with a reference to the timeline, its current connections and etcd subscription (if any) + +- `PageCache` contains timeline-related data, and is created globally for the whole pageserver + +- implicitly, we also have files on local FS, that contain timeline state. We operate on those files and for some operations (GC, compaction) yet we don't anyhow synchronize the access to the files per se: there are more high-level locks, ensuring only one of a group of operations is running at a time. + + On practice though, `LayerMap` and layer files are tightly coupled together: current low-level code requires a timeline to be loaded into the memory to work with it, and the code removes the layer files after removing the entry from the `LayerMap` first. + +Based on this, a high-level pageserver's module diagram with data and entities could be: + +![timeline tenant state diagram](./images/017-timeline-data-management/timeline_tenant_state.svg) + +A few comments on the diagram: + +- the diagram does not show all the data and replaces a few newtypes and type aliases (for example, completely ignores "unloaded" timelines due to reasons described below) + + It aims to show main data and means of synchronizing it. + +- modules tend to isolate their data inside and provide access to it via API + +Due to multitenancy, that results in a common pattern for storing both tenant and timeline data: `RwLock` or `Mutex` around the `HashMap`, gc and compaction tasks also use the same lock pattern to ensure no concurrent runs are happening. + +- part of the modules is asynchronous, while the other is not, that complicates the data access + +Currently, anything that's not related to tasks (walreceiver, storage sync, GC, compaction) is blocking. + +Async tasks that try to access the data in the sync world, have to call `std::sync::Mutex::lock` method, which blocks the thread the callee async task runs on, also blocking other async tasks running in the same thread. 
Methods of `std::sync::RwLock` have the same issues, forcing async tasks either to block or spawn another, "blocking" task on a separate thread. + +Sync tasks that try to access the data in the async world, cannot use `.await` hence have to have some `Runtime` doing those calls for them. [`tokio::sync::Mutex`](https://docs.rs/tokio/1.19.2/tokio/sync/struct.Mutex.html#method.blocking_lock) and [`tokio::sync::RwLock`](https://docs.rs/tokio/1.19.2/tokio/sync/struct.RwLock.html#method.blocking_read) provide an API to simplify such calls. Similarly, both `std::sync` and `tokio::sync` have channels that are able to communicate into one direction without blocking and requiring `.await` calls, hence can be used to connect both worlds without locking. + +Some modules are in transition, started as async "blocking" tasks and being fully synchronous in their entire code below the start. Current idea is to transfer them to the async further, but it's not yet done. + +- locks are used in two different ways: + + - `RwLock>` ones to hold the shared data and ensure its atomic updates + - `Mutex<()>` for synchronizing the tasks, used to implicitly order the data access + + The "shared data" locks of the first kind are mainly accessed briefly to either look up or alter the data, yet there are a few notable exceptions, such as + `latest_gc_cutoff_lsn: RwLock` that is explicitly held in a few places to prevent GC thread from progressing. Those are covered later in the data access diagrams. + +- some synchronizations are not yet implemented + +E.g. asynchronous storage sync module does not synchronize with almost synchronous GC and compaction tasks when the layer files are uploaded to the remote storage. +That occasionally results in the files being deleted before the storage upload task is run for this layer, but due to the incremental nature of the layer files, we can handle such situations without issues. + +- `LayeredRepository` covers lots of responsibilities: GC and compaction task synchronisation, timeline access (`local_timelines` in `Tenant` is not used directly before the timeline from the repository is accessed), layer flushing to FS, layer sync to remote storage scheduling, etc. + +### How is this data accessed? + +There are multiple ways the data is accessed, from different sources: + +1. [HTTP requests](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/http/routes.rs) + +High-level CRUD API for managing tenants, timelines and getting data about them. 
+Current API list (modified for readability): + +```rust +.get("/v1/status", status_handler) // pageserver status +.get("/v1/tenant", tenant_list_handler) +.post("/v1/tenant", tenant_create_handler) // can create "empty" timelines or branch off the existing ones +.get("/v1/tenant/:tenant_id", tenant_status) // the only tenant public metadata +.put("/v1/tenant/config", tenant_config_handler) // tenant config data and local file manager +.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) +.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) +.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler) // download entire tenant from the remote storage and load its timelines memory +.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler) // delete all tenant timelines from memory, remote corresponding storage and local FS files +.get("/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler) +.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler) +.get("/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver", wal_receiver_get_handler) // get walreceiver stats metadata +``` + +Overall, neither HTTP operation goes below `LayeredRepository` level and does not interact with layers: instead, they manage tenant and timeline entities, their configuration and metadata. + +`GET` data is small (relative to layer files contents), updated via brief `.write()/.lock()` calls and read via copying/cloning the data to release the lock soon. +It does not mean that the operations themselves are short, e.g. `tenant_attach_handler` downloads multiple files from the remote storage which might take time, yet the final data is inserted in memory via one brief write under the lock. + +Non-`GET` operations mostly follow the same rule, with two differences: + +- `tenant_detach_handler` has to wait for its background tasks to stop before shutting down, which requires more work with locks +- `timeline_create_handler` currently requires GC to be paused before branching the timeline, which requires orchestrating too. + This is the only HTTP operation, able to load the timeline into memory: rest of the operations are reading the metadata or, as in `tenant_attach_handler`, schedule a deferred task to download timeline and load it into memory. + +"Timeline data synchronization" section below describes both complex cases in more details. + +2. [libpq requests](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/page_service.rs) + +Is the main interface of pageserver, intended to handle libpq (and similar) requests. +Operates on `LayeredTimeline` and, lower, `LayerMap` modules; all timelines accessed during the operation are loaded into memory immediately (if not loaded already), operations bail on timeline load errors. + +- `pagestream` + + Page requests: `get_rel_exists`, `get_rel_size`, `get_page_at_lsn`, `get_db_size` + + Main API points, intended to be used by `compute` to show the data to the user. All require requests to be made at certain Lsn, if this Lsn is not available in the memory, request processing is paused until that happens or bails after a timeout. + +- `basebackup` and `fullbackup` + + Options to generate postgres-compatible backup archives. + +- `import basebackup` + +- `import wal` + + Import the `pg_wal` section of the basebackup archive. + +- `get_last_record_rlsn`, `get_lsn_by_timestamp` + +"Metadata" retrieval methods, that still requires internal knowledge about layers. 
+ +- `set`, `fallpoints`, `show` + +Utility methods to support various edge cases or help with debugging/testing. + +- `do_gc`, `compact`, `checkpoint` + +Manual triggers for corresponding tenant tasks (GC, compaction) and inmemory layer flushing on disk (checkpointing), with upload task scheduling as a follow-up. + +Apart from loading into memory, every timeline layer has to be accessed using specific set of locking primitives, especially if a write operations happens: otherwise, GC or compaction might spoil the data. User API is implicitly affected by this synchronization during branching, when a GC has to be orchestrated properly before the new timeline could be branched off the existing one. +See "Timeline data synchronization" section for the united synchronization diagram on the topic. + +3. internal access + +Entities within pageserver that update files on local FS and remote storage, metadata in memory; has to use internal data for those operations. +Places that access internal, lower data are also required to have the corresponding timeline successfully loaded into memory and accessed with corresponding synchronization. + +If ancestors' data is accessed via its child branch, it means more than one timeline has to be loaded into memory entirely and more locking primitives usage involved. +Right now, all ancestors are resolved in-place: every place that has to check timeline's ancestor has to lock the timelines map, check if one is loaded into the memory, load it there or bail if it's not present, and get the information required and so on. + +- periodic GC and compaction tasks + +Alter metadata (GC info), in-memory data (layer relations, page caches, etc.) and layer files on disk. +Same as its libpq counterparts, needs full synchronization with the low level layer management code. + +- storage sync task + +Alters metadata (`RemoteTimelineIndex`), layer files on remote storage (upload, delete) and local FS (download) and in-memory data (registers downloaded timelines in the repository). +Currently, does not know anything about layer files contents, rather focusing on the file structure and metadata file updates: due to the fact that the layer files cannot be updated (only created or deleted), storage sync is able to back up the files to the remote storage without further low-level synchronizations: only when the timeline is downloaded, a load operation is needed to run, possibly pausing GC and compaction tasks. + +- walreceiver and walingest task + +Per timeline, subscribes for etcd events from safekeeper and eventually spawns a walreceiver connection task to receive WAL from a safekeeper node. +Fills memory with data, eventually triggering a checkpoint task that creates a new layer file in the local FS and schedules a remote storage sync upload task. +During WAL receiving, also updates a separate in-memory data structure with the walreceiver stats, used later via HTTP API. + +Layer updates require low-level set of sync primitives used to preserve the data consistency. + +- checkpoint (layer freeze) task + +Periodic, short-lived tasks to generate a new layer file in the FS. Requires low level synchronization in the end, when the layer is being registered after creating and has additional mode to ensure only one concurrent compaction happens at a time. + +### Timeline data synchronization + +Here's a high-level timeline data access diagram, considering the synchronization locks, based on the state diagram above. 
+ +For brevity, diagrams do not show `RwLock>` data accesses, considering them almost instant to happen. +`RwLock` is close to be an exception to the previous rule, since it's taken in a multiple places to ensure all layers are inserted correctly. +Yet the only long operation in the current code is a `.write()` lock on the map during its creation, while all other lock usages tend to be short in the current code. +Note though, that due to current "working with loaded timeline only", prevailing amount of the locks taken on the struct are `.write()` locks, not the `.read()` ones. +To simplify the diagrams, these accesses are now considered "fast" data access, not the synchronization attempts. + +`write_lock` synchronization diagram: + +![timeline data access synchronization(1)](./images/017-timeline-data-management/timeline_data_access_sync_1.svg) + +Comments: + +- `write_lock: Mutex<()>` ensures that all timeline data being written into **in-memory layers** is done without races, one concurrent write at a time +- `layer_flush_lock: Mutex<()>` and layer flushing seems to be slightly bloated with various ways to create a layer on disk and write it in memory + The lock itself seem to repeat `write_lock` purpose when it touches in-memory layers, and also to limit the on-disk layer creations. + Yet the latter is not really done consistently, since remote storage sync manages to download and register the new layers without touching the locks +- `freeze_inmem_layer(true)` that touches both `write_lock` and `layer_flush_lock` seems not very aligned with the rest of the locks to those primitives; it also now restricts the layer creation concurrency even more, yet there are various `freeze_inmem_layer(false)` that are ignoring those restrictions at the same time + +![timeline data access synchronization(2)](./images/017-timeline-data-management/timeline_data_access_sync_2.svg) + +Comments: + +- `partitioning: Mutex<(KeyPartitioning, Lsn)>` lock is a data sync lock that's not used to synchronize the tasks (all other such kinds were considered "almost instant" and omitted on the diagram), yet is very similar to what `write_lock` and `layer_flush_lock` do: it ensures the timeline in-memory data is up-to-date with the layer files state on disk, which is what `LayerMap` is for. + +- there are multiple locks that do similar task management operations: + - `gc_cs: Mutex<()>` and `latest_gc_cutoff_lsn: RwLock` ensures that branching and gc are not run concurrently + - `layer_removal_cs: Mutex<()>` lock ensure gc, compaction and timeline deletion via HTTP API do not run concurrently + - `file_lock: RwLock<()>` is used as a semaphore, to ensure "all" gc and compaction tasks are shut down and do not start + Yet that lock does take only gc and compaction from internal loops: libpq call is not cancelled and waited upon. + +Those operations do not seem to belong to a timeline. Moreover, some of those could be eliminated entirely due to duplication of their tasks. + +## Proposed implementation + +### How to structure timeline data access better + +- adjust tenant state handling + +Current [`TenantState`](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L108) [changes](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L317) mainly indicates whether GC and compaction tasks are running or not; another state, `Broken` shows only in case any timeline does not load during startup. 
+ +We could start both GC and compaction tasks at the time the tenant is created and adjust the tasks to throttle/sleep on timeline absence and wake up when the first one is added. +The latter becomes more important on download on demand, since we won't have the entire timeline in reach to verify its correctness. Moreover, if any network connection happens, the timeline could fail temporarily and entire tenant should be marked as broken due to that. + +Since nothing verifies the `TenantState` via HTTP API currently, it makes sense to remove the whole state entirely and don't write the code to synchronize its changes. +Instead, we could indicate internal issues for every timeline and have a better API to "stop" timeline processing without deleting its data, making our API less restrictive. + +- remove the "unloaded" status for the timeline + +Current approach to timeline management [assumes](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/layered_repository.rs#L486-L493) + +```rust +#[derive(Clone)] +enum LayeredTimelineEntry { + Loaded(Arc), + Unloaded { + id: ZTimelineId, + metadata: TimelineMetadata, + }, +} +``` + +supposes that timelines have to be in `Unloaded` state. + +The difference between both variants is whether its layer map was loaded from disk and kept in memory (Loaded) or not (Unloaded). +The idea behind such separation was to lazy load timelines in memory with all their layers only after its first access and potentially unload them later. + +Yet now there's no public API methods, that deal with unloaded timelines' layers: all of them either bail when such timeline is worked on, or load it into memory and continue working. +Moreover, every timeline in the local FS is loaded on pageserver startup now, so only two places where `Unloaded` variant is used are branching and timeline attach, with both loading the timeline into memory before the end of the operation. +Even if that loading into memory bails for some reason, next GC or compaction task periodic run would load such timeline into memory. +There are a few timeline methods that return timeline metadata without loading its layers, but such metadata also comes from the `metadata` FS file, not the layer files (so no page info could be retrieved without loading the entire layer map first). + +With the layer on-demand download, it's not feasible anymore to wait for the entire layer map to be loaded into the memory, since it might not even be available on the local FS when requested: `LayerMap` needs to be changed to contain metadata to retrieve the missing layers and handle partially present on the local FS timeline state. + +To accommodate to that and move away from the redundant status, a timeline should always be "loaded" with its metadata read from the disk and its layer map prepared to be downloaded when requested, per layer. + +Layers in the layer map, on the other hand, could be in various state: loaded, unloaded, downloading, downloading failed, etc. and their state has to be handled instead, if we want to support on-demand download in the future. + +This way, tenants and timelines could always try to serve requests and do their internal tasks periodically, trying to recover. + +- scale down the remote storage sync to per layer file, not per timeline as now + +Due to the reasons from the previous bullet, current remote storage model needs its timeline download approach to be changed. 
+Right now, a timeline is marked as "ready" only after all its layers on the remote storage are downloaded on the local storage. +With the on-demand download approach, only remote storage timeline metadata should be downloaded from S3, leaving the rest of the layers ready for download if/when it's requested. + +Note: while the remote storage sync should operate per layer, it should stay global for all tenants, to better manage S3 limits and sync queue priorities. +Yet the only place using remote storage should be the layer map. + +- encapsulate `tenant_mgr` logic into a regular Rust struct, unite with part of the `Repository` and anything else needed to manage the timeline data in a single place and to test it independently + +[`Repository`](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/repository.rs#L187) trait gets closer to `tenant_mgr` in terms of functionality: there are two background task-related functions, that are run on all timelines of a tenant: `gc_iteration` (it does allow running on a single timeline, but GC task runs it on all timelines) and `compaction_iteration` that are related to service tasks, not the data storage; and the metadata management functions, also not really related to the timeline contents. + +`tenant_mgr` proxies some of the `Repository` calls, yet both service tasks use `tenant_mgr` to access the data they need, creating a circular dependency between their APIs. +To avoid excessive synchronization between components, taking multiple locks for that and static state, we can organize the data access and updates in one place. +One potential benefit Rust gets from this is the ability to track and manage timeline resources, if all the related data is located in one place. + +- move `RemoteStorage` usage from `LayeredRepository` into `LayerMap`, as the rest of the layer-based entities (layer files, etc.) + +Layer == file in our model, since pageserver always either tries to load the LayerMap from disk for the timeline not in memory, or assumes the file contents matches its memory. +`LayeredRepository` is one of the most loaded objects currently and not everything from it deserves unification with the `tenant_mgr`. +In particular, layer files need to be better prepared for future download on demand functionality, where every layer could be dynamically loaded and unloaded from memory and local FS. +Current amount of locks and sync-async separation would make it hard to implement truly dynamic (un)loading; moreover, we would need retries with backoffs, since the unloaded layer files are most probably not available on the local FS either and network is not always reliable. + +One of the solutions to the issue is already being developed for the remote storage sync: [SyncQueue](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/storage_sync.rs#L463) +The queue is able to batch CRUD layer operations (both for local and remote FS contexts) and reorder them to increase the sync speed. +Similar approach could be generalized for all layer modifications, including in-memory ones such as GC or compaction: this way, we could manage all layer modifications and reads in one place with lesser locks and tests that are closer to unit tests. + +- change the approach to locking synchronization + +A number of locks in the timeline seem to be used to coordinate gc, compaction tasks and related processes. +It should be done in a task manager or other place, external to the timeline. 
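As a rough sketch of that direction (hypothetical names, not a proposed API): instead of `gc_cs`, `layer_removal_cs` and similar locks living inside the timeline, a single task manager could own one exclusive slot per timeline, so GC, compaction and deletion simply cannot be started concurrently.

```rust
// Sketch only: exclusivity enforced by the component that starts the tasks,
// instead of by locks inside the timeline itself.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

type TimelineId = u128;

#[derive(Clone, Copy, Debug)]
enum ExclusiveTask {
    Gc,
    Compaction,
    Delete,
}

#[derive(Default)]
struct TaskManager {
    // One mutex per timeline: whoever holds it is the only exclusive task running.
    slots: Mutex<HashMap<TimelineId, Arc<Mutex<()>>>>,
}

impl TaskManager {
    /// Run `body` as the only exclusive task for `timeline`, waiting for the
    /// previous exclusive task (if any) to finish first.
    fn run_exclusive<R>(&self, timeline: TimelineId, task: ExclusiveTask, body: impl FnOnce() -> R) -> R {
        let slot = self
            .slots
            .lock()
            .unwrap()
            .entry(timeline)
            .or_default()
            .clone();
        let _guard = slot.lock().unwrap();
        println!("running {task:?} for timeline {timeline}");
        body()
    }
}

fn main() {
    let manager = TaskManager::default();
    manager.run_exclusive(1, ExclusiveTask::Gc, || { /* gc body */ });
    manager.run_exclusive(1, ExclusiveTask::Compaction, || { /* compaction body */ });
}
```

With exclusivity owned by the component that starts the tasks, the timeline itself only needs the locks that protect its own data.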
+ +Timeline contents still needs to be synchronized, considering the task work, so fields like `latest_gc_cutoff_lsn: RwLock` are expected to stay for that purpose, but general amount of locks should be reduced. + +### Putting it all together + +If the proposal bullets applied to the diagrams above, the state could be represented as: + +![timeline timeline tenant state](./images/017-timeline-data-management/proposed_timeline_tenant_state.svg) + +The reorders aim to put all tasks into separated modules, with strictly defined interfaces and as less knowledge about other components, as possible. +This way, all timeline data is now in the `data_storage`, including the GC, walreceiver, `RemoteTimelineIndex`, `LayerMap`, etc. with some API to get the data in the way, +more convenient for the data sync system inside. +So far, it seems that a few maps with `Arc>` with actual data operations added inside each `SeparateData` struct, if needed. + +`page_cache` is proposed to placed into the same `data_storage` since it contains tenant timelines' data: this way, all metadata and data is in the same struct, simplifying things with Rust's borrow checker and allowing us to share internals between data modules and later might simplify timeline in-memory size tracking. + +`task_manager` is related to data storage and manages all tenant and timeline tasks, manages shared resources (runtimes, thread pools, etcd connection, etc.) and synchronizes tasks. +All locks such as `gc_cs` belong to this module tree, as primitives inherently related to the task synchronization. +Tasks have to access timelines and their metadata, but should do that through `data_storage` API and similar. + +`task_manager` should (re)start, stop and track all tasks that are run in it, selecting an appropriate runtime depending on a task kind (we have async/sync task separation, CPU and IO bound tasks separation, ...) +Some locks such as `layer_removal_cs` one are not needed, if the only component that starts the tasks ensures they don't run concurrently. + +`LayeredTimeline` is still split into two parts, more high-level with whatever primitives needed to sync its state, and the actual state storage with `LayerMap` and other low level entities. +Only `LayerMap` knows what storage it's layer files are taken from (inmem, local FS, etc.), and it's responsible for synchronizing the layers when needed, as also reacting to sync events, successful or not. + +Last but not least, `tenant config file` has to be backed into a remote storage, as tenant-specific information for all timelines. +Tenant and timelines have volatile information that's now partially mixed with constant information (e.g. fields in `metadata` file), that model should be better split and handled, in case we want to properly support its backups and synchronization. + +![proposed timeline data access synchronization(1)](./images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg) + +There's still a need to keep inmemory layer buffer synchronized during layer freezing, yet that could happen on a layer level, not on a timeline level, as `write_lock` used to be, so we could lower the sync primitives one layer deeper, preparing us for download on demand feature, where multiple layers could be concurrently streamed and written from various data sources. 
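To illustrate what "one layer deeper" could mean, here is a minimal sketch with hypothetical names: each layer carries its own state and its own lock, so freezing, flushing and on-demand downloads of different layers can proceed concurrently without a timeline-wide `write_lock`. The real `LayerMap` and layer types would look different.

```rust
// Sketch only: per-layer state and a per-layer lock instead of timeline-wide locks.
use std::collections::BTreeMap;
use std::path::PathBuf;
use std::sync::{Arc, RwLock};

type Lsn = u64; // stand-in for the real Lsn type

enum LayerState {
    /// open in-memory layer still receiving WAL
    InMemory { frozen: bool, buf: Vec<u8> },
    /// flushed to a file on the local FS
    OnDisk { path: PathBuf },
    /// listed in index_part.json but not present locally yet
    Remote { download_scheduled: bool },
    /// the last download attempt failed; retried later with a backoff
    DownloadFailed { attempts: u32 },
}

struct Layer {
    start_lsn: Lsn,
    state: RwLock<LayerState>,
}

/// The layer map owns the layers; only the lock of the layer being touched is taken.
struct LayerMap {
    layers: BTreeMap<Lsn, Arc<Layer>>,
}

impl LayerMap {
    /// Freeze the in-memory layer starting at `lsn`; writes to other layers keep going.
    fn freeze(&self, lsn: Lsn) {
        if let Some(layer) = self.layers.get(&lsn) {
            if let LayerState::InMemory { frozen, .. } = &mut *layer.state.write().unwrap() {
                *frozen = true;
            }
        }
    }
}

fn main() {
    let map = LayerMap { layers: BTreeMap::new() };
    map.freeze(0); // no-op on an empty map
}
```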
+ +Flushing the frozen layer requires creating a new layer on disk and further remote storage upload, so `LayerMap` has to get those flushed bytes and queue them later: no need to block in the timeline itself for anything again, rather locking on the layer level, if needed. + +![proposed timeline data access synchronization(2)](./images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg) + +Lock diagrams legend: + +![lock diagrams legend](./images/017-timeline-data-management/lock_legend.svg) + +After the frozen layers are flushed, something has to ensure that the layer structure is intact, so a repartitioning lock is needed still, and could also guard the layer map structure changes, since both are needed either way. +This locking belongs to the `LowLevelLayeredTimeline` from the proposed data structure diagram, as the place with all such data being held. + +Similarly, branching is still required to be done after certain Lsn in our current model, but this needs only one lock to synchronize and that could be the `gc_cs: Mutex<()>` lock. +It raises the question of where this lock has to be placed, it's the only place that requires pausing a GC task during external, HTTP request handling. +The right place for the lock seems to be the `task_manager` that could manage GC in more fine-grained way to accommodate the incoming branching request. + +There's no explicit lock sync between GC, compaction or other mutually exclusive tasks: it is a job of the `task_manager` to ensure those are not run concurrently. diff --git a/docs/rfcs/images/017-timeline-data-management/lock_legend.svg b/docs/rfcs/images/017-timeline-data-management/lock_legend.svg new file mode 100644 index 0000000000..d6d2bc00ae --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/lock_legend.svg @@ -0,0 +1,4 @@ + + + +
[lock_legend.svg (drawio diagram): only its text labels survive here; the SVG markup is not preserved. Lock interaction legend: event-flow arrows vs. lock arrows; a single line marks a lock acquisition (different line styles of the same shape denote different locks), a continuous acquisition has its release shown explicitly later, and an instant acquisition and release is drawn as one step; lock details (RwLock/Mutex) and lock names are written on the corresponding arrows.]
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg new file mode 100644 index 0000000000..d1c97d1738 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg @@ -0,0 +1,4 @@ + + + +
[proposed_timeline_data_access_sync_1.svg (drawio diagram): only its text labels survive here; the SVG markup is not preserved. Proposed write-path synchronization: the walreceiver loop, the HTTP call creating an empty timeline, the libpq basebackup/WAL import calls and zenith.signal processing all reach the timeline writer through DatadirModification::commit/flush and take layer_write_lock.lock(), which freeze_inmem_layer(true) holds through the entire freezing; flush_frozen_layers only schedules the operation into the LayerMap; checkpoint(Flush)/checkpoint(Forced) are triggered by the libpq checkpoint and do_gc calls and by shutdown(); additionally, every timeline access goes through .lock() on the timeline mutex inside the repo.]
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg new file mode 100644 index 0000000000..81918fcd98 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg @@ -0,0 +1,4 @@ + + + +
[proposed_timeline_data_access_sync_2.svg (drawio diagram): only its text labels survive here; the SVG markup is not preserved. Proposed GC/compaction/branching synchronization: periodic GC (and the libpq do_gc call) and periodic compaction take partitioning.lock() and hold it during the entire gc/compact operation; GC takes gc_cs.lock() only when it is ready to collect, and the HTTP branch-timeline call holds the same lock during the entire branching; checkpoint(Forced) and other checkpoint sources go through freeze_inmem_layer(false)/flush_frozen_layers(true); libpq page requests and basebackup wait via wait_or_get_last_lsn at the page request Lsn.]
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg new file mode 100644 index 0000000000..207017fb1b --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg @@ -0,0 +1,4 @@ + + + +sLayer 1Layer 2
[proposed_timeline_tenant_state.svg (drawio diagram): only its text labels survive here; the SVG markup is not preserved. Proposed module/state layout: a tenant storage (HashMap<TenantId, Tenant>) keeps tenant state as a number of maps with the Arc<RwLock<Data>> pattern for tenants, timelines, gc, walreceiver, remote storage, etc.; LayeredTimeline keeps write_lock: Mutex<()>, latest_gc_cutoff_lsn: RwLock<Lsn> and the PostgresRedoManager process, while LowLevelLayeredTimeline holds partitioning: Mutex<(KeyPartitioning, Lsn)> and layers: RwLock<LayerMap>; GC, compaction and walreceiver tasks interact with layers only via the LayerMap, which schedules remote storage sync tasks (sync queue, S3 connections, periodic remote index writes) and manages local and remote files in a queue-based manner; task_manager owns runtimes, thread pools and shared connections (etcd) and manages tenant/timeline tasks; the page cache (materialized/ephemeral/immutable page maps) sits alongside tenant storage; the local FS holds layer files and metadata, the remote storage holds the same layers plus index_part.json.]
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg new file mode 100644 index 0000000000..b968fedd8c --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg @@ -0,0 +1,4 @@ + + + +
[timeline_data_access_sync_1.svg (drawio diagram): only its text labels survive here; the SVG markup is not preserved. Current write-path synchronization: the walreceiver loop, the HTTP call creating an empty timeline, the libpq basebackup/WAL import calls and zenith.signal processing reach the timeline writer through DatadirModification::commit/flush and take the timeline-wide write_lock.lock(), which freeze_inmem_layer(true) holds through the entire freezing; checkpoint(Flush)/checkpoint(Forced) (triggered by check_checkpoint_distance, the libpq checkpoint and do_gc calls, and shutdown()) go through freeze_inmem_layer(false)/flush_frozen_layers and layer_flush_lock.lock(): some callers skip the flush if the lock is taken, while flush_frozen_layers(false) always waits for the lock and runs the frozen-layer flush holding it; additionally, every timeline access goes through .lock() on the timeline mutex inside the repo.]
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg new file mode 100644 index 0000000000..382d834517 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg @@ -0,0 +1,4 @@ + + + +
[timeline_data_access_sync_2.svg (drawio diagram): only its text labels survive here; the SVG markup is not preserved. Current GC/compaction/branching synchronization: periodic GC and compaction (and the libpq do_gc call) take a read lock on RwLock(file_lock), held for almost the entire operation, while tenant idle/detach/shutdown takes it for write; gc, compact and the HTTP delete-timeline call take layer_removal_cs.lock() for the entire operation, and gc/compact also take partitioning.lock(); the HTTP branch-timeline call holds gc_cs.lock() and a read lock on RwLock(latest_gc_cutoff_lsn) during the entire branching, while GC writes the updated cutoff value and releases that lock; checkpoint(Forced) and other checkpoint sources go through freeze_inmem_layer(false)/flush_frozen_layers(true); libpq page requests and basebackup wait via wait_or_get_last_lsn at the page request Lsn.]
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg b/docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg new file mode 100644 index 0000000000..c4bc36f309 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg @@ -0,0 +1,4 @@ + + + +                                             Tasks                                                                                                                                                                   StateLayer 1Layer 2
[timeline_tenant_state.svg (drawio diagram): only its text labels survive here; the SVG markup is not preserved. Current module/state layout: tenant_mgr holds static ref TENANTS: RwLock<HashMap<TenantId, Tenant>>, each Tenant carrying a TenantState, repo: Arc<LayeredRepository> and local_timelines: HashMap<TimelineId, Arc<DatadirTimelineImpl>>; LayeredRepository keeps timelines: Mutex<HashMap<TimelineId, LayeredTimeline>>, gc_cs: Mutex<()>, file_lock: RwLock<()>, tenant_conf and remote_index; DatadirTimelineImpl holds partitioning: Mutex<(KeyPartitioning, Lsn)> and tline: Arc<LayeredTimeline>, backed by the repository for get-page lookups and updates; LayeredTimeline holds write_lock, layer_flush_lock, layer_removal_cs, latest_gc_cutoff_lsn, gc_info, the PostgresRedoManager process and layers: RwLock<LayerMap>, flushing new files to disk and loading existing ones into memory; GC, compaction, walreceiver, layer flush and remote storage sync tasks interact with files on disk (full CRUD), the layer flush task schedules layer sync and the remote storage sync task is the only one interacting with the other storage; the local FS holds layer files and metadata, the remote storage holds the same layers plus index_part.json.]
\ No newline at end of file From 35761ac6b6f4daee78bcaabd083e88ec3b877958 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Tue, 13 Sep 2022 23:55:18 +0200 Subject: [PATCH 0764/1022] docs/sourcetree: add info about IDE config (#2332) --- docs/sourcetree.md | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/docs/sourcetree.md b/docs/sourcetree.md index f3bc9230e2..339a90e0ba 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -134,3 +134,42 @@ Also consider: To add new package or change an existing one you can use `poetry add` or `poetry update` or edit `pyproject.toml` manually. Do not forget to run `poetry lock` in the latter case. More details are available in poetry's [documentation](https://python-poetry.org/docs/). + +## Configuring IDEs +Neon consists of three projects in different languages which use different project models. + +* A bunch of Rust crates, all available from the root `Cargo.toml`. +* Integration tests in Python in the `test_runner` directory. Some stand-alone Python scripts exist as well. +* Postgres and our Postgres extensions in C built with Makefiles under `vendor/postgres` and `pgxn`. + +### CLion +You can use CLion with the [Rust plugin](https://plugins.jetbrains.com/plugin/8182-rust) to develop Neon. It should pick up Rust and Python projects whenever you open Neon's repository as a project. We have not tried setting up a debugger, though. + +C code requires some extra care, as it's built via Make, not CMake. Some of our developers have successfully used [compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_generate) for CLion. It is a JSON file which lists all C source files and corresponding compilation keys. CLion can use it instead of `CMakeLists.txt`. To set up a project with a compilation database: + +1. Clone the Neon repository and install all dependencies, including Python. Do not open it with CLion just yet. +2. Run the following commands in the repository's root: + ```bash + # Install a `compiledb` tool which can parse make's output and generate the compilation database. + poetry add -D compiledb + # Run Make without actually compiling code so we can generate the compilation database. It still may take a few minutes. + make --dry-run --print-directory --keep-going --assume-new=* postgres neon-pg-ext | poetry run compiledb --verbose --no-build + # Uninstall the tool + poetry remove -D compiledb + # Make sure the compile_commands.json file is not committed. + echo /compile_commands.json >>.git/info/exclude + ``` +3. Open CLion, click "Open File or Project" and choose the generated `compile_commands.json` file to be opened "as a project". You cannot add a compilation database into an existing CLion project, you have to create a new one. _Do not_ open the directory as a project, open the file. +4. The newly created project should start indexing Postgres source code in C, as well as the C standard library. You may have to [configure the C compiler for the compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_toolchain). +5. Open the `Cargo.toml` file in an editor in the same project. CLion should pick up the hint and start indexing Rust code. +7. Now you have a CLion project which knows about C files, Rust files. It should pick up Python files automatically as well. + +You can also enable Cargo Clippy diagnostics and enable Rustfmt instead of built-in code formatter. 
+ +Whenever you change layout of C files, you may need to regenerate the compilation database. No need to re-create the CLion project, changes should be picked up automatically. + +Known issues (fixes and suggestions are welcome): + +* Test results may be hard to read in CLion, both for unit tests in Rust and integration tests in Python. Use command line to run them instead. +* CLion does not support non-local Python interpreters, unlike PyCharm. E.g. if you use WSL, CLion does not see `poetry` and installed dependencies. Python support is limited. +* Cargo Clippy diagnostics in CLion may take a lot of resources. From ba8698bbcbc4f3a4d46e0eeaa48cec3191c0d440 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 13 Sep 2022 21:06:10 +0300 Subject: [PATCH 0765/1022] update neon_local output in readme --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 977afc2a2c..03ed57a0fa 100644 --- a/README.md +++ b/README.md @@ -125,16 +125,18 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r # Create repository in .neon with proper paths to binaries and data # Later that would be responsibility of a package install script > ./target/debug/neon_local init -initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c -created initial timeline de200bd42b49cc1814412c7e592dd6e9 timeline.lsn 0/16B5A50 -initial timeline de200bd42b49cc1814412c7e592dd6e9 created -pageserver init succeeded +Starting pageserver at '127.0.0.1:64000' in '.neon' + +Pageserver started +Successfully initialized timeline 7dd0907914ac399ff3be45fb252bfdb7 +Stopping pageserver gracefully...done! # start pageserver and safekeeper > ./target/debug/neon_local start +Starting etcd broker using /usr/bin/etcd Starting pageserver at '127.0.0.1:64000' in '.neon' + Pageserver started -initializing for sk 1 for 7676 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1' Safekeeper started From 260ec20a0218f3da95a2393c9ba377049967dcb2 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Sep 2022 23:58:27 +0300 Subject: [PATCH 0766/1022] Refotmat pgxn code, add typedefs.list that was used --- pgxn/neon/inmem_smgr.c | 28 +- pgxn/neon/libpagestore.c | 25 +- pgxn/neon/libpqwalproposer.c | 237 +- pgxn/neon/neon.c | 9 +- pgxn/neon/neon.h | 2 +- pgxn/neon/pagestore_client.h | 19 +- pgxn/neon/pagestore_smgr.c | 169 +- pgxn/neon/walproposer.c | 682 +++--- pgxn/neon/walproposer.h | 343 +-- pgxn/neon/walproposer_utils.c | 142 +- pgxn/neon/walproposer_utils.h | 26 +- pgxn/neon_test_utils/neontest.c | 30 +- pgxn/typedefs.list | 3776 +++++++++++++++++++++++++++++++ 13 files changed, 4691 insertions(+), 797 deletions(-) create mode 100644 pgxn/typedefs.list diff --git a/pgxn/neon/inmem_smgr.c b/pgxn/neon/inmem_smgr.c index 13fd4d50b6..4926d759e8 100644 --- a/pgxn/neon/inmem_smgr.c +++ b/pgxn/neon/inmem_smgr.c @@ -188,10 +188,10 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { /* * We assume the buffer cache is large enough to hold all the buffers - * needed for most operations. Overflowing to this "in-mem smgr" in rare - * cases is OK. But if we find that we're using more than WARN_PAGES, - * print a warning so that we get alerted and get to investigate why - * we're accessing so many buffers. + * needed for most operations. Overflowing to this "in-mem smgr" in + * rare cases is OK. 
But if we find that we're using more than + * WARN_PAGES, print a warning so that we get alerted and get to + * investigate why we're accessing so many buffers. */ elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", @@ -207,7 +207,9 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, pg = used_pages; used_pages++; INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); - } else { + } + else + { elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, @@ -226,14 +228,14 @@ BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum) { /* - * It's not clear why a WAL redo function would call smgrnblocks(). - * During recovery, at least before reaching consistency, the size of a - * relation could be arbitrarily small, if it was truncated after the - * record being replayed, or arbitrarily large if it was extended - * afterwards. But one place where it's called is in - * XLogReadBufferExtended(): it extends the relation, if it's smaller than - * the requested page. That's a waste of time in the WAL redo - * process. Pretend that all relations are maximally sized to avoid it. + * It's not clear why a WAL redo function would call smgrnblocks(). During + * recovery, at least before reaching consistency, the size of a relation + * could be arbitrarily small, if it was truncated after the record being + * replayed, or arbitrarily large if it was extended afterwards. But one + * place where it's called is in XLogReadBufferExtended(): it extends the + * relation, if it's smaller than the requested page. That's a waste of + * time in the WAL redo process. Pretend that all relations are maximally + * sized to avoid it. */ return MaxBlockNumber; } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index d0572e66cb..55285a6345 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -153,11 +153,11 @@ static void pageserver_disconnect(void) { /* - * If anything goes wrong while we were sending a request, it's not - * clear what state the connection is in. For example, if we sent the - * request but didn't receive a response yet, we might receive the - * response some time later after we have already sent a new unrelated - * request. Close the connection to avoid getting confused. + * If anything goes wrong while we were sending a request, it's not clear + * what state the connection is in. For example, if we sent the request + * but didn't receive a response yet, we might receive the response some + * time later after we have already sent a new unrelated request. Close + * the connection to avoid getting confused. */ if (connected) { @@ -191,12 +191,13 @@ pageserver_send(ZenithRequest *request) * * In principle, this could block if the output buffer is full, and we * should use async mode and check for interrupts while waiting. In - * practice, our requests are small enough to always fit in the output - * and TCP buffer. + * practice, our requests are small enough to always fit in the output and + * TCP buffer. 
*/ if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) { - char* msg = PQerrorMessage(pageserver_conn); + char *msg = PQerrorMessage(pageserver_conn); + pageserver_disconnect(); neon_log(ERROR, "failed to send page request: %s", msg); } @@ -205,6 +206,7 @@ pageserver_send(ZenithRequest *request) if (message_level_is_interesting(PageStoreTrace)) { char *msg = zm_to_string((ZenithMessage *) request); + neon_log(PageStoreTrace, "sent request: %s", msg); pfree(msg); } @@ -255,15 +257,16 @@ static void pageserver_flush(void) { if (PQflush(pageserver_conn)) - { - char* msg = PQerrorMessage(pageserver_conn); + { + char *msg = PQerrorMessage(pageserver_conn); + pageserver_disconnect(); neon_log(ERROR, "failed to flush page requests: %s", msg); } } static ZenithResponse * -pageserver_call(ZenithRequest* request) +pageserver_call(ZenithRequest *request) { pageserver_send(request); pageserver_flush(); diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c index 2b2b7a1a6a..1f739f3722 100644 --- a/pgxn/neon/libpqwalproposer.c +++ b/pgxn/neon/libpqwalproposer.c @@ -7,38 +7,40 @@ /* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ struct WalProposerConn { - PGconn* pg_conn; - bool is_nonblocking; /* whether the connection is non-blocking */ - char *recvbuf; /* last received data from libpqprop_async_read */ + PGconn *pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received data from + * libpqprop_async_read */ }; /* Prototypes for exported functions */ -static char* libpqprop_error_message(WalProposerConn* conn); -static WalProposerConnStatusType libpqprop_status(WalProposerConn* conn); -static WalProposerConn* libpqprop_connect_start(char* conninfo); -static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn); -static bool libpqprop_send_query(WalProposerConn* conn, char* query); -static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); -static pgsocket libpqprop_socket(WalProposerConn* conn); -static int libpqprop_flush(WalProposerConn* conn); -static void libpqprop_finish(WalProposerConn* conn); -static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); -static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); -static bool libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size); +static char *libpqprop_error_message(WalProposerConn * conn); +static WalProposerConnStatusType libpqprop_status(WalProposerConn * conn); +static WalProposerConn * libpqprop_connect_start(char *conninfo); +static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn * conn); +static bool libpqprop_send_query(WalProposerConn * conn, char *query); +static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn * conn); +static pgsocket libpqprop_socket(WalProposerConn * conn); +static int libpqprop_flush(WalProposerConn * conn); +static void libpqprop_finish(WalProposerConn * conn); +static PGAsyncReadResult libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount); +static PGAsyncWriteResult libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size); +static bool libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size); -static WalProposerFunctionsType PQWalProposerFunctions = { +static WalProposerFunctionsType PQWalProposerFunctions = +{ libpqprop_error_message, - 
libpqprop_status, - libpqprop_connect_start, - libpqprop_connect_poll, - libpqprop_send_query, - libpqprop_get_query_result, - libpqprop_socket, - libpqprop_flush, - libpqprop_finish, - libpqprop_async_read, - libpqprop_async_write, - libpqprop_blocking_write, + libpqprop_status, + libpqprop_connect_start, + libpqprop_connect_poll, + libpqprop_send_query, + libpqprop_get_query_result, + libpqprop_socket, + libpqprop_flush, + libpqprop_finish, + libpqprop_async_read, + libpqprop_async_write, + libpqprop_blocking_write, }; /* Module initialization */ @@ -52,7 +54,7 @@ pg_init_libpqwalproposer(void) /* Helper function */ static bool -ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) +ensure_nonblocking_status(WalProposerConn * conn, bool is_nonblocking) { /* If we're already correctly blocking or nonblocking, all good */ if (is_nonblocking == conn->is_nonblocking) @@ -67,14 +69,14 @@ ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) } /* Exported function definitions */ -static char* -libpqprop_error_message(WalProposerConn* conn) +static char * +libpqprop_error_message(WalProposerConn * conn) { return PQerrorMessage(conn->pg_conn); } static WalProposerConnStatusType -libpqprop_status(WalProposerConn* conn) +libpqprop_status(WalProposerConn * conn) { switch (PQstatus(conn->pg_conn)) { @@ -87,35 +89,38 @@ libpqprop_status(WalProposerConn* conn) } } -static WalProposerConn* -libpqprop_connect_start(char* conninfo) +static WalProposerConn * +libpqprop_connect_start(char *conninfo) { - WalProposerConn* conn; - PGconn* pg_conn; + WalProposerConn *conn; + PGconn *pg_conn; pg_conn = PQconnectStart(conninfo); + /* - * Allocation of a PQconn can fail, and will return NULL. We want to fully replicate the - * behavior of PQconnectStart here. + * Allocation of a PQconn can fail, and will return NULL. We want to fully + * replicate the behavior of PQconnectStart here. */ if (!pg_conn) return NULL; /* - * And in theory this allocation can fail as well, but it's incredibly unlikely if we just - * successfully allocated a PGconn. + * And in theory this allocation can fail as well, but it's incredibly + * unlikely if we just successfully allocated a PGconn. * - * palloc will exit on failure though, so there's not much we could do if it *did* fail. + * palloc will exit on failure though, so there's not much we could do if + * it *did* fail. */ conn = palloc(sizeof(WalProposerConn)); conn->pg_conn = pg_conn; - conn->is_nonblocking = false; /* connections always start in blocking mode */ + conn->is_nonblocking = false; /* connections always start in blocking + * mode */ conn->recvbuf = NULL; return conn; } static WalProposerConnectPollStatusType -libpqprop_connect_poll(WalProposerConn* conn) +libpqprop_connect_poll(WalProposerConn * conn) { WalProposerConnectPollStatusType return_val; @@ -134,26 +139,34 @@ libpqprop_connect_poll(WalProposerConn* conn) return_val = WP_CONN_POLLING_OK; break; - /* There's a comment at its source about this constant being unused. We'll expect it's never - * returned. */ + /* + * There's a comment at its source about this constant being + * unused. We'll expect it's never returned. 
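ensure_nonblocking_status() above only calls into libpq when the cached blocking mode differs from the requested one. A minimal sketch of that idea, with an illustrative wrapper struct mirroring the is_nonblocking field shown above:

#include <stdbool.h>
#include <libpq-fe.h>

typedef struct
{
	PGconn	   *pg_conn;
	bool		is_nonblocking;	/* cached mode, as in WalProposerConn above */
} ConnWrapper;

static bool
ensure_mode(ConnWrapper *conn, bool want_nonblocking)
{
	if (conn->is_nonblocking == want_nonblocking)
		return true;			/* already in the right mode, skip the libpq call */

	/* PQsetnonblocking() returns 0 on success and -1 on failure. */
	if (PQsetnonblocking(conn->pg_conn, want_nonblocking ? 1 : 0) == -1)
		return false;

	conn->is_nonblocking = want_nonblocking;
	return true;
}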
+ */ case PGRES_POLLING_ACTIVE: elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); - /* This return is never actually reached, but it's here to make the compiler happy */ + + /* + * This return is never actually reached, but it's here to make + * the compiler happy + */ return WP_CONN_POLLING_FAILED; default: Assert(false); - return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ + return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ } return return_val; } static bool -libpqprop_send_query(WalProposerConn* conn, char* query) +libpqprop_send_query(WalProposerConn * conn, char *query) { - /* We need to be in blocking mode for sending the query to run without - * requiring a call to PQflush */ + /* + * We need to be in blocking mode for sending the query to run without + * requiring a call to PQflush + */ if (!ensure_nonblocking_status(conn, false)) return false; @@ -165,13 +178,13 @@ libpqprop_send_query(WalProposerConn* conn, char* query) } static WalProposerExecStatusType -libpqprop_get_query_result(WalProposerConn* conn) +libpqprop_get_query_result(WalProposerConn * conn) { - PGresult* result; + PGresult *result; WalProposerExecStatusType return_val; /* Marker variable if we need to log an unexpected success result */ - char* unexpected_success = NULL; + char *unexpected_success = NULL; /* Consume any input that we might be missing */ if (!PQconsumeInput(conn->pg_conn)) @@ -182,8 +195,11 @@ libpqprop_get_query_result(WalProposerConn* conn) result = PQgetResult(conn->pg_conn); - /* PQgetResult returns NULL only if getting the result was successful & there's no more of the - * result to get. */ + + /* + * PQgetResult returns NULL only if getting the result was successful & + * there's no more of the result to get. 
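The switch above consumes the states PQconnectPoll() can return while a non-blocking connection is being established. For reference, this is roughly how the same handshake is driven to completion outside a wait-event loop, sketched here with select(2); the extension integrates the polling into its own event set instead, so this is not a claim about its control flow.

#include <stddef.h>
#include <sys/select.h>
#include <libpq-fe.h>

static PGconn *
connect_nonblocking(const char *conninfo)
{
	PGconn	   *conn = PQconnectStart(conninfo);
	PostgresPollingStatusType st = PGRES_POLLING_WRITING;	/* initial direction */

	if (conn == NULL)
		return NULL;			/* PQconnectStart can fail to allocate */
	if (PQstatus(conn) == CONNECTION_BAD)
	{
		PQfinish(conn);
		return NULL;
	}

	while (st != PGRES_POLLING_OK && st != PGRES_POLLING_FAILED)
	{
		int			sock = PQsocket(conn);
		fd_set		fds;

		FD_ZERO(&fds);
		FD_SET(sock, &fds);

		/* Wait for whichever direction PQconnectPoll asked for last. */
		if (st == PGRES_POLLING_READING)
			select(sock + 1, &fds, NULL, NULL, NULL);
		else
			select(sock + 1, NULL, &fds, NULL, NULL);

		st = PQconnectPoll(conn);
	}

	if (st == PGRES_POLLING_FAILED)
	{
		PQfinish(conn);
		return NULL;
	}
	return conn;
}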
+ */ if (!result) { elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); @@ -191,7 +207,7 @@ libpqprop_get_query_result(WalProposerConn* conn) } /* Helper macro to reduce boilerplate */ - #define UNEXPECTED_SUCCESS(msg) \ +#define UNEXPECTED_SUCCESS(msg) \ return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ unexpected_success = msg; \ break; @@ -199,12 +215,12 @@ libpqprop_get_query_result(WalProposerConn* conn) switch (PQresultStatus(result)) { - /* "true" success case */ + /* "true" success case */ case PGRES_COPY_BOTH: return_val = WP_EXEC_SUCCESS_COPYBOTH; break; - /* Unexpected success case */ + /* Unexpected success case */ case PGRES_EMPTY_QUERY: UNEXPECTED_SUCCESS("empty query return"); case PGRES_COMMAND_OK: @@ -220,7 +236,7 @@ libpqprop_get_query_result(WalProposerConn* conn) case PGRES_PIPELINE_SYNC: UNEXPECTED_SUCCESS("pipeline sync point"); - /* Failure cases */ + /* Failure cases */ case PGRES_BAD_RESPONSE: case PGRES_NONFATAL_ERROR: case PGRES_FATAL_ERROR: @@ -230,7 +246,7 @@ libpqprop_get_query_result(WalProposerConn* conn) default: Assert(false); - return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ + return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ } if (unexpected_success) @@ -240,19 +256,19 @@ libpqprop_get_query_result(WalProposerConn* conn) } static pgsocket -libpqprop_socket(WalProposerConn* conn) +libpqprop_socket(WalProposerConn * conn) { return PQsocket(conn->pg_conn); } static int -libpqprop_flush(WalProposerConn* conn) +libpqprop_flush(WalProposerConn * conn) { return (PQflush(conn->pg_conn)); } static void -libpqprop_finish(WalProposerConn* conn) +libpqprop_finish(WalProposerConn * conn) { if (conn->recvbuf != NULL) PQfreemem(conn->recvbuf); @@ -267,9 +283,9 @@ libpqprop_finish(WalProposerConn* conn) * to this function. */ static PGAsyncReadResult -libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) +libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount) { - int result; + int result; if (conn->recvbuf != NULL) { @@ -285,12 +301,11 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) return PG_ASYNC_READ_FAIL; } - /* The docs for PQgetCopyData list the return values as: - * 0 if the copy is still in progress, but no "complete row" is - * available - * -1 if the copy is done - * -2 if an error occured - * (> 0) if it was successful; that value is the amount transferred. + /* + * The docs for PQgetCopyData list the return values as: 0 if the copy is + * still in progress, but no "complete row" is available -1 if the copy is + * done -2 if an error occured (> 0) if it was successful; that value is + * the amount transferred. * * The protocol we use between walproposer and safekeeper means that we * *usually* wouldn't expect to see that the copy is done, but this can @@ -304,25 +319,28 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) *buf = NULL; return PG_ASYNC_READ_TRY_AGAIN; case -1: - { - /* - * If we get -1, it's probably because of a server error; the - * safekeeper won't normally send a CopyDone message. - * - * We can check PQgetResult to make sure that the server failed; - * it'll always result in PGRES_FATAL_ERROR - */ - ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); + { + /* + * If we get -1, it's probably because of a server error; the + * safekeeper won't normally send a CopyDone message. 
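libpqprop_async_read() above decodes the documented return values of PQgetCopyData(). The same contract as a compact sketch, assuming PQconsumeInput() has already been called as in the function above; the enum is illustrative, not the extension's PGAsyncReadResult.

#include <libpq-fe.h>

typedef enum
{
	READ_SUCCESS,				/* *buf holds 'amount' bytes; free with PQfreemem() */
	READ_TRY_AGAIN,				/* no complete message available yet */
	READ_COPY_DONE,				/* copy ended; here that usually means a server error */
	READ_FAIL					/* hard error; consult PQerrorMessage() */
} ReadResult;

static ReadResult
try_read_copy_data(PGconn *conn, char **buf, int *amount)
{
	int			n = PQgetCopyData(conn, buf, 1 /* async: never block */ );

	if (n > 0)
	{
		*amount = n;
		return READ_SUCCESS;
	}
	*amount = 0;
	*buf = NULL;
	if (n == 0)
		return READ_TRY_AGAIN;
	return (n == -1) ? READ_COPY_DONE : READ_FAIL;	/* -2 means error */
}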
+ * + * We can check PQgetResult to make sure that the server + * failed; it'll always result in PGRES_FATAL_ERROR + */ + ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); - if (status != PGRES_FATAL_ERROR) - elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + if (status != PGRES_FATAL_ERROR) + elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); - /* If there was actually an error, it'll be properly reported by - * calls to PQerrorMessage -- we don't have to do anything else */ - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; - } + /* + * If there was actually an error, it'll be properly reported + * by calls to PQerrorMessage -- we don't have to do anything + * else + */ + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } case -2: *amount = 0; *buf = NULL; @@ -336,23 +354,25 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) } static PGAsyncWriteResult -libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) +libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size) { - int result; + int result; /* If we aren't in non-blocking mode, switch to it. */ if (!ensure_nonblocking_status(conn, true)) return PG_ASYNC_WRITE_FAIL; - /* The docs for PQputcopyData list the return values as: - * 1 if the data was queued, - * 0 if it was not queued because of full buffers, or - * -1 if an error occured + /* + * The docs for PQputcopyData list the return values as: 1 if the data was + * queued, 0 if it was not queued because of full buffers, or -1 if an + * error occured */ result = PQputCopyData(conn->pg_conn, buf, size); - /* We won't get a result of zero because walproposer always empties the - * connection's buffers before sending more */ + /* + * We won't get a result of zero because walproposer always empties the + * connection's buffers before sending more + */ Assert(result != 0); switch (result) @@ -366,16 +386,17 @@ libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) elog(FATAL, "invalid return %d from PQputCopyData", result); } - /* After queueing the data, we still need to flush to get it to send. - * This might take multiple tries, but we don't want to wait around - * until it's done. + /* + * After queueing the data, we still need to flush to get it to send. This + * might take multiple tries, but we don't want to wait around until it's + * done. * - * PQflush has the following returns (directly quoting the docs): - * 0 if sucessful, - * 1 if it was unable to send all the data in the send queue yet - * -1 if it failed for some reason + * PQflush has the following returns (directly quoting the docs): 0 if + * sucessful, 1 if it was unable to send all the data in the send queue + * yet -1 if it failed for some reason */ - switch (result = PQflush(conn->pg_conn)) { + switch (result = PQflush(conn->pg_conn)) + { case 0: return PG_ASYNC_WRITE_SUCCESS; case 1: @@ -388,16 +409,18 @@ libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) } static bool -libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size) +libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size) { - int result; + int result; /* If we are in non-blocking mode, switch out of it. */ if (!ensure_nonblocking_status(conn, false)) return false; - /* Ths function is very similar to libpqprop_async_write. 
For more - * information, refer to the comments there */ + /* + * Ths function is very similar to libpqprop_async_write. For more + * information, refer to the comments there + */ if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1) return false; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 62d2624e56..5346680b0b 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -29,7 +29,8 @@ PG_MODULE_MAGIC; void _PG_init(void); -void _PG_init(void) +void +_PG_init(void) { pg_init_libpagestore(); pg_init_libpqwalproposer(); @@ -59,9 +60,9 @@ pg_cluster_size(PG_FUNCTION_ARGS) Datum backpressure_lsns(PG_FUNCTION_ARGS) { - XLogRecPtr writePtr; - XLogRecPtr flushPtr; - XLogRecPtr applyPtr; + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; Datum values[3]; bool nulls[3]; TupleDesc tupdesc; diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 2c66bc7bf0..dad9c1b508 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -16,4 +16,4 @@ extern void pg_init_libpagestore(void); extern void pg_init_libpqwalproposer(void); extern void pg_init_walproposer(void); -#endif /* NEON_H */ +#endif /* NEON_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 5b21abc1bd..7dc38c13fb 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -83,8 +83,8 @@ typedef struct typedef struct { ZenithRequest req; - Oid dbNode; -} ZenithDbSizeRequest; + Oid dbNode; +} ZenithDbSizeRequest; typedef struct @@ -123,12 +123,13 @@ typedef struct { ZenithMessageTag tag; int64 db_size; -} ZenithDbSizeResponse; +} ZenithDbSizeResponse; typedef struct { ZenithMessageTag tag; - char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error message */ + char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error + * message */ } ZenithErrorResponse; extern StringInfoData zm_pack_request(ZenithRequest *msg); @@ -142,12 +143,12 @@ extern char *zm_to_string(ZenithMessage *msg); typedef struct { ZenithResponse *(*request) (ZenithRequest *request); - void (*send) (ZenithRequest *request); + void (*send) (ZenithRequest *request); ZenithResponse *(*receive) (void); - void (*flush) (void); + void (*flush) (void); } page_server_api; -extern page_server_api *page_server; +extern page_server_api * page_server; extern char *page_server_connstring; extern char *zenith_timeline; @@ -179,7 +180,7 @@ extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber block char *buffer); extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + XLogRecPtr request_lsn, bool request_latest, char *buffer); extern void zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); @@ -217,7 +218,7 @@ extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); /* utils for zenith relsize cache */ extern void relsize_hash_init(void); -extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size); +extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size); extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index ebf899dfdb..504ae60d4a 100644 --- a/pgxn/neon/pagestore_smgr.c +++ 
b/pgxn/neon/pagestore_smgr.c @@ -94,7 +94,9 @@ const int SmgrTrace = DEBUG5; page_server_api *page_server; /* GUCs */ -char *page_server_connstring; // with substituted password +char *page_server_connstring; + +//with substituted password char *zenith_timeline; char *zenith_tenant; bool wal_redo = false; @@ -107,7 +109,7 @@ typedef enum UNLOGGED_BUILD_PHASE_1, UNLOGGED_BUILD_PHASE_2, UNLOGGED_BUILD_NOT_PERMANENT -} UnloggedBuildPhase; +} UnloggedBuildPhase; static SMgrRelation unlogged_build_rel = NULL; static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; @@ -127,31 +129,33 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; #define MAX_PREFETCH_REQUESTS 128 -BufferTag prefetch_requests[MAX_PREFETCH_REQUESTS]; -BufferTag prefetch_responses[MAX_PREFETCH_REQUESTS]; -int n_prefetch_requests; -int n_prefetch_responses; -int n_prefetched_buffers; -int n_prefetch_hits; -int n_prefetch_misses; -XLogRecPtr prefetch_lsn; +BufferTag prefetch_requests[MAX_PREFETCH_REQUESTS]; +BufferTag prefetch_responses[MAX_PREFETCH_REQUESTS]; +int n_prefetch_requests; +int n_prefetch_responses; +int n_prefetched_buffers; +int n_prefetch_hits; +int n_prefetch_misses; +XLogRecPtr prefetch_lsn; static void consume_prefetch_responses(void) { - for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++) { - ZenithResponse* resp = page_server->receive(); + for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++) + { + ZenithResponse *resp = page_server->receive(); + pfree(resp); } n_prefetched_buffers = 0; n_prefetch_responses = 0; } -static ZenithResponse* -page_server_request(void const* req) +static ZenithResponse * +page_server_request(void const *req) { consume_prefetch_responses(); - return page_server->request((ZenithRequest*)req); + return page_server->request((ZenithRequest *) req); } @@ -196,11 +200,11 @@ zm_pack_request(ZenithRequest *msg) { ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->dbNode); + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->dbNode); - break; + break; } case T_ZenithGetPageRequest: { @@ -546,21 +550,22 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, else if (lsn == InvalidXLogRecPtr) { /* - * When PostgreSQL extends a relation, it calls smgrextend() with an all-zeros pages, - * and we can just ignore that in Zenith. We do need to remember the new size, - * though, so that smgrnblocks() returns the right answer after the rel has - * been extended. We rely on the relsize cache for that. + * When PostgreSQL extends a relation, it calls smgrextend() with an + * all-zeros pages, and we can just ignore that in Zenith. We do need + * to remember the new size, though, so that smgrnblocks() returns the + * right answer after the rel has been extended. We rely on the + * relsize cache for that. * - * A completely empty heap page doesn't need to be WAL-logged, either. The - * heapam can leave such a page behind, if e.g. an insert errors out after - * initializing the page, but before it has inserted the tuple and WAL-logged - * the change. When we read the page from the page server, it will come back - * as all-zeros. That's OK, the heapam will initialize an all-zeros page on - * first use. + * A completely empty heap page doesn't need to be WAL-logged, either. + * The heapam can leave such a page behind, if e.g. 
an insert errors + * out after initializing the page, but before it has inserted the + * tuple and WAL-logged the change. When we read the page from the + * page server, it will come back as all-zeros. That's OK, the heapam + * will initialize an all-zeros page on first use. * - * In other scenarios, evicting a dirty page with no LSN is a bad sign: it implies - * that the page was not WAL-logged, and its contents will be lost when it's - * evicted. + * In other scenarios, evicting a dirty page with no LSN is a bad + * sign: it implies that the page was not WAL-logged, and its contents + * will be lost when it's evicted. */ if (PageIsNew(buffer)) { @@ -691,9 +696,9 @@ zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, Bloc * Is it possible that the last-written LSN is ahead of last flush * LSN? Generally not, we shouldn't evict a page from the buffer cache * before all its modifications have been safely flushed. That's the - * "WAL before data" rule. However, such case does exist at index building, - * _bt_blwritepage logs the full page without flushing WAL before - * smgrextend (files are fsynced before build ends). + * "WAL before data" rule. However, such case does exist at index + * building, _bt_blwritepage logs the full page without flushing WAL + * before smgrextend (files are fsynced before build ends). */ #if PG_VERSION_NUM >= 150000 flushlsn = GetFlushRecPtr(NULL); @@ -728,10 +733,12 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) switch (reln->smgr_relpersistence) { case 0: + /* - * We don't know if it's an unlogged rel stored locally, or permanent - * rel stored in the page server. First check if it exists locally. - * If it does, great. Otherwise check if it exists in the page server. + * We don't know if it's an unlogged rel stored locally, or + * permanent rel stored in the page server. First check if it + * exists locally. If it does, great. Otherwise check if it exists + * in the page server. */ if (mdexists(reln, forkNum)) return true; @@ -755,11 +762,11 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) /* * \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server - * will error out if you check that, because the whole dbdir for tablespace - * 0, db 0 doesn't exists. We possibly should change the page server to - * accept that and return 'false', to be consistent with mdexists(). But - * we probably also should fix pg_table_size() to not call smgrexists() - * with bogus relfilenode. + * will error out if you check that, because the whole dbdir for + * tablespace 0, db 0 doesn't exists. We possibly should change the page + * server to accept that and return 'false', to be consistent with + * mdexists(). But we probably also should fix pg_table_size() to not call + * smgrexists() with bogus relfilenode. * * For now, handle that special case here. */ @@ -880,13 +887,13 @@ void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) { /* - * Might or might not exist locally, depending on whether it's - * an unlogged or permanent relation (or if DEBUG_COMPARE_LOCAL is - * set). Try to unlink, it won't do any harm if the file doesn't - * exist. + * Might or might not exist locally, depending on whether it's an unlogged + * or permanent relation (or if DEBUG_COMPARE_LOCAL is set). Try to + * unlink, it won't do any harm if the file doesn't exist. 
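The zenith_get_request_lsn() comment above names the one known case where a page's last-written LSN can be ahead of the flushed WAL position (index builds logging full pages before smgrextend). One way to handle that, sketched below, is to flush WAL up to the page's LSN before using it as the request LSN; last_written_lsn is an assumed input and the usual backend headers (postgres.h, access/xlog.h) are taken for granted, so this illustrates the idea rather than the extension's exact code path.

static XLogRecPtr
clamp_request_lsn(XLogRecPtr last_written_lsn)
{
#if PG_VERSION_NUM >= 150000
	XLogRecPtr	flushlsn = GetFlushRecPtr(NULL);
#else
	XLogRecPtr	flushlsn = GetFlushRecPtr();
#endif

	if (last_written_lsn > flushlsn)
	{
		/* e.g. an index build logged a full page without flushing WAL */
		XLogFlush(last_written_lsn);
	}
	return last_written_lsn;
}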
*/ mdunlink(rnode, forkNum, isRedo); - if (!RelFileNodeBackendIsTemp(rnode)) { + if (!RelFileNodeBackendIsTemp(rnode)) + { forget_cached_relsize(rnode.node, forkNum); } } @@ -926,8 +933,9 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, /* * Check that the cluster size limit has not been exceeded. * - * Temporary and unlogged relations are not included in the cluster size measured - * by the page server, so ignore those. Autovacuum processes are also exempt. + * Temporary and unlogged relations are not included in the cluster size + * measured by the page server, so ignore those. Autovacuum processes are + * also exempt. */ if (max_cluster_size > 0 && reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && @@ -937,10 +945,10 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, - (errcode(ERRCODE_DISK_FULL), - errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", - max_cluster_size), - errhint("This limit is defined by neon.max_cluster_size GUC"))); + (errcode(ERRCODE_DISK_FULL), + errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", + max_cluster_size), + errhint("This limit is defined by neon.max_cluster_size GUC"))); } zenith_wallog_page(reln, forkNum, blkno, buffer); @@ -987,8 +995,8 @@ void zenith_close(SMgrRelation reln, ForkNumber forknum) { /* - * Let md.c close it, if it had it open. Doesn't hurt to do this - * even for permanent relations that have no local storage. + * Let md.c close it, if it had it open. Doesn't hurt to do this even for + * permanent relations that have no local storage. */ mdclose(reln, forknum); } @@ -1079,17 +1087,18 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, * While function is defined in the zenith extension it's used within neon_test_utils directly. * To avoid breaking tests in the runtime please keep function signature in sync. */ -void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) +void +zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer) { ZenithResponse *resp; - int i; + int i; /* - * Try to find prefetched page. - * It is assumed that pages will be requested in the same order as them are prefetched, - * but some other backend may load page in shared buffers, so some prefetch responses should - * be skipped. + * Try to find prefetched page. It is assumed that pages will be requested + * in the same order as them are prefetched, but some other backend may + * load page in shared buffers, so some prefetch responses should be + * skipped. */ for (i = n_prefetched_buffers; i < n_prefetch_responses; i++) { @@ -1099,19 +1108,20 @@ void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno prefetch_responses[i].forkNum == forkNum && prefetch_responses[i].blockNum == blkno) { - char* page = ((ZenithGetPageResponse *) resp)->page; + char *page = ((ZenithGetPageResponse *) resp)->page; + /* - * Check if prefetched page is still relevant. - * If it is updated by some other backend, then it should not - * be requested from smgr unless it is evicted from shared buffers. - * In the last case last_evicted_lsn should be updated and - * request_lsn should be greater than prefetch_lsn. 
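zenith_read_at_lsn() above scans the prefetch responses in request order, skipping entries for pages another backend has already dealt with. A toy sketch of that skip-until-match scan with made-up types; the real code keys on BufferTag and additionally checks LSN relevance, as the next hunk shows.

typedef struct
{
	unsigned	rel;
	unsigned	fork;
	unsigned	blkno;
} PrefetchTag;

/* Returns the index of the matching response, or -1 if the page was not
 * prefetched; the caller consumes and discards responses[from..match]. */
static int
find_prefetch_response(PrefetchTag want, const PrefetchTag *responses,
					   int from, int n_responses)
{
	for (int i = from; i < n_responses; i++)
	{
		if (responses[i].rel == want.rel &&
			responses[i].fork == want.fork &&
			responses[i].blkno == want.blkno)
			return i;
	}
	return -1;					/* fall back to a direct page request */
}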
- * Maximum with page LSN is used because page returned by page server - * may have LSN either greater either smaller than requested. + * Check if prefetched page is still relevant. If it is updated by + * some other backend, then it should not be requested from smgr + * unless it is evicted from shared buffers. In the last case + * last_evicted_lsn should be updated and request_lsn should be + * greater than prefetch_lsn. Maximum with page LSN is used + * because page returned by page server may have LSN either + * greater either smaller than requested. */ if (Max(prefetch_lsn, PageGetLSN(page)) >= request_lsn) { - n_prefetched_buffers = i+1; + n_prefetched_buffers = i + 1; n_prefetch_hits += 1; n_prefetch_requests = 0; memcpy(buffer, page, BLCKSZ); @@ -1133,6 +1143,7 @@ void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno .forknum = forkNum, .blkno = blkno }; + if (n_prefetch_requests > 0) { /* Combine all prefetch requests with primary request */ @@ -1471,8 +1482,8 @@ int64 zenith_dbsize(Oid dbNode) { ZenithResponse *resp; - int64 db_size; - XLogRecPtr request_lsn; + int64 db_size; + XLogRecPtr request_lsn; bool latest; RelFileNode dummy_node = {InvalidOid, InvalidOid, InvalidOid}; @@ -1564,10 +1575,12 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) XLogFlush(lsn); /* - * Truncate may affect several chunks of relations. So we should either update last written LSN for all of them, - * or update LSN for "dummy" metadata block. Second approach seems more efficient. If the relation is extended - * again later, the extension will update the last-written LSN for the extended pages, so there's no harm in - * leaving behind obsolete entries for the truncated chunks. + * Truncate may affect several chunks of relations. So we should either + * update last written LSN for all of them, or update LSN for "dummy" + * metadata block. Second approach seems more efficient. If the relation + * is extended again later, the extension will update the last-written LSN + * for the extended pages, so there's no harm in leaving behind obsolete + * entries for the truncated chunks. */ SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forknum); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index a769a5216b..05257ced4c 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -88,8 +88,9 @@ WalProposerFunctionsType *WalProposerFunctions = NULL; static int n_safekeepers = 0; static int quorum = 0; static Safekeeper safekeeper[MAX_SAFEKEEPERS]; -static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ -static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to safekeepers */ +static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to + * safekeepers */ static ProposerGreeting greetRequest; static VoteRequest voteRequest; /* Vote request for safekeeper */ static WaitEventSet *waitEvents; @@ -99,6 +100,7 @@ static AppendResponse quorumFeedback; * record-aligned (first record which might not yet received by someone). */ static XLogRecPtr truncateLsn; + /* * Term of the proposer. We want our term to be highest and unique, * so we collect terms from safekeepers quorum, choose max and +1. 
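The comment above spells out the election rule for the proposer's term: gather terms from a quorum of safekeepers, take the maximum, and add one so the new term is strictly higher than anything seen. A minimal sketch with illustrative names; term_t here is just a stand-in 64-bit integer.

#include <stdint.h>

typedef uint64_t term_t;		/* stand-in for the walproposer's term type */

static term_t
choose_proposer_term(const term_t *acceptor_terms, int n_terms)
{
	term_t		max_term = 0;

	for (int i = 0; i < n_terms; i++)
	{
		if (acceptor_terms[i] > max_term)
			max_term = acceptor_terms[i];
	}
	return max_term + 1;		/* strictly higher than any collected term */
}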
@@ -116,7 +118,7 @@ static int n_votes = 0; static int n_connected = 0; static TimestampTz last_reconnect_attempt; -static WalproposerShmemState *walprop_shared; +static WalproposerShmemState * walprop_shared; /* Prototypes for private functions */ static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId); @@ -138,7 +140,7 @@ static void RecvAcceptorGreeting(Safekeeper *sk); static void SendVoteRequest(Safekeeper *sk); static void RecvVoteResponse(Safekeeper *sk); static void HandleElectedProposer(void); -static term_t GetHighestTerm(TermHistory *th); +static term_t GetHighestTerm(TermHistory * th); static term_t GetEpoch(Safekeeper *sk); static void DetermineEpochStartLsn(void); static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); @@ -155,7 +157,7 @@ static XLogRecPtr CalculateMinFlushLsn(void); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); static void HandleSafekeeperResponse(void); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); -static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); +static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg); static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); @@ -175,7 +177,8 @@ static void walproposer_shmem_request(void); #endif -void pg_init_walproposer(void) +void +pg_init_walproposer(void) { if (!process_shared_preload_libraries_in_progress) return; @@ -194,50 +197,53 @@ void pg_init_walproposer(void) WalProposerStart = &WalProposerStartImpl; } -static void nwp_register_gucs(void) +static void +nwp_register_gucs(void) { DefineCustomStringVariable( - "neon.safekeepers", - "List of Neon WAL acceptors (host:port)", - NULL, /* long_desc */ - &wal_acceptors_list, /* valueAddr */ - "", /* bootValue */ - PGC_POSTMASTER, - GUC_LIST_INPUT, /* extensions can't use GUC_LIST_QUOTE */ - NULL, NULL, NULL - ); + "neon.safekeepers", + "List of Neon WAL acceptors (host:port)", + NULL, /* long_desc */ + &wal_acceptors_list, /* valueAddr */ + "", /* bootValue */ + PGC_POSTMASTER, + GUC_LIST_INPUT, /* extensions can't use + * GUC_LIST_QUOTE */ + NULL, NULL, NULL + ); DefineCustomIntVariable( - "neon.safekeeper_reconnect_timeout", - "Timeout for reconnecting to offline wal acceptor.", - NULL, - &wal_acceptor_reconnect_timeout, - 1000, 0, INT_MAX, /* default, min, max */ - PGC_SIGHUP, /* context */ - GUC_UNIT_MS, /* flags */ - NULL, NULL, NULL - ); + "neon.safekeeper_reconnect_timeout", + "Timeout for reconnecting to offline wal acceptor.", + NULL, + &wal_acceptor_reconnect_timeout, + 1000, 0, INT_MAX, /* default, min, max */ + PGC_SIGHUP, /* context */ + GUC_UNIT_MS, /* flags */ + NULL, NULL, NULL + ); DefineCustomIntVariable( - "neon.safekeeper_connect_timeout", - "Timeout after which give up connection attempt to safekeeper.", - NULL, - &wal_acceptor_connect_timeout, - 5000, 0, INT_MAX, - PGC_SIGHUP, - GUC_UNIT_MS, - NULL, NULL, NULL - ); + "neon.safekeeper_connect_timeout", + "Timeout after which give up connection attempt to safekeeper.", + NULL, + &wal_acceptor_connect_timeout, + 5000, 0, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MS, + NULL, NULL, NULL + ); } /* shmem handling */ -static void nwp_prepare_shmem(void) +static void +nwp_prepare_shmem(void) { #if PG_VERSION_NUM >= 150000 - prev_shmem_request_hook = shmem_request_hook; - 
shmem_request_hook = walproposer_shmem_request; + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = walproposer_shmem_request; #else RequestAddinShmemSpace(WalproposerShmemSize()); #endif @@ -260,7 +266,8 @@ walproposer_shmem_request(void) } #endif -static void nwp_shmem_startup_hook(void) +static void +nwp_shmem_startup_hook(void) { if (prev_shmem_startup_hook_type) prev_shmem_startup_hook_type(); @@ -275,7 +282,7 @@ void WalProposerMain(Datum main_arg) { #if PG_VERSION_NUM >= 150000 - TimeLineID tli; + TimeLineID tli; #endif /* Establish signal handlers. */ @@ -286,7 +293,7 @@ WalProposerMain(Datum main_arg) BackgroundWorkerUnblockSignals(); #if PG_VERSION_NUM >= 150000 - // FIXME pass proper tli to WalProposerInit ? + /* FIXME pass proper tli to WalProposerInit ? */ GetXLogReplayRecPtr(&tli); WalProposerInit(GetFlushRecPtr(NULL), GetSystemIdentifier()); #else @@ -339,7 +346,7 @@ WalProposerPoll(void) { while (true) { - Safekeeper *sk; + Safekeeper *sk; int rc; WaitEvent event; TimestampTz now = GetCurrentTimestamp(); @@ -356,8 +363,8 @@ WalProposerPoll(void) AdvancePollState(sk, event.events); /* - * If the timeout expired, attempt to reconnect to any safekeepers that - * we dropped + * If the timeout expired, attempt to reconnect to any safekeepers + * that we dropped */ ReconnectSafekeepers(); @@ -371,7 +378,7 @@ WalProposerPoll(void) ResetLatch(MyLatch); break; } - if (rc == 0) /* timeout expired: poll state */ + if (rc == 0) /* timeout expired: poll state */ { TimestampTz now; @@ -390,12 +397,12 @@ WalProposerPoll(void) now = GetCurrentTimestamp(); for (int i = 0; i < n_safekeepers; i++) { - Safekeeper *sk = &safekeeper[i]; + Safekeeper *sk = &safekeeper[i]; if ((sk->state == SS_CONNECTING_WRITE || - sk->state == SS_CONNECTING_READ) && + sk->state == SS_CONNECTING_READ) && TimestampDifferenceExceeds(sk->startedConnAt, now, - wal_acceptor_connect_timeout)) + wal_acceptor_connect_timeout)) { elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms", sk->host, sk->port, wal_acceptor_connect_timeout); @@ -472,7 +479,7 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) */ safekeeper[n_safekeepers].conninfo[0] = '\0'; initStringInfo(&safekeeper[n_safekeepers].outbuf); - safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open, .segment_close = wal_segment_close), NULL); + safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL); if (safekeeper[n_safekeepers].xlogreader == NULL) elog(FATAL, "Failed to allocate xlog reader"); safekeeper[n_safekeepers].flushWrite = false; @@ -504,7 +511,7 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer); #if PG_VERSION_NUM >= 150000 -// FIXME don't use hardcoded timeline id +/* FIXME don't use hardcoded timeline id */ greetRequest.timeline = 1; #else greetRequest.timeline = ThisTimeLineID; @@ -589,7 +596,7 @@ HackyRemoveWalProposerEvent(Safekeeper *to_remove) for (int i = 0; i < n_safekeepers; i++) { uint32 desired_events = WL_NO_EVENTS; - Safekeeper *sk = &safekeeper[i]; + Safekeeper *sk = &safekeeper[i]; sk->eventPos = -1; @@ -647,12 +654,21 @@ ResetConnection(Safekeeper *sk) */ if (sk->conninfo[0] == '\0') { - int written = 0; + int written = 0; + written = snprintf((char *) &sk->conninfo, MAXCONNINFO, - "host=%s port=%s 
dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); - // currently connection string is not that long, but once we pass something like jwt we might overflow the buffer, - // so it is better to be defensive and check that everything aligns well + "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); + + /* + * currently connection string is not that long, but once we pass + * something like jwt we might overflow the buffer, + */ + + /* + * so it is better to be defensive and check that everything aligns + * well + */ if (written > MAXCONNINFO || written < 0) elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); } @@ -762,8 +778,8 @@ static void AdvancePollState(Safekeeper *sk, uint32 events) { /* - * Sanity check. We assume further down that the operations don't - * block because the socket is ready. + * Sanity check. We assume further down that the operations don't block + * because the socket is ready. */ AssertEventsOkForState(events, sk); @@ -777,12 +793,12 @@ AdvancePollState(Safekeeper *sk, uint32 events) case SS_OFFLINE: elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", sk->host, sk->port); - break; /* actually unreachable, but prevents - * -Wimplicit-fallthrough */ + break; /* actually unreachable, but prevents + * -Wimplicit-fallthrough */ /* - * Both connecting states run the same logic. The only - * difference is the events they're expecting + * Both connecting states run the same logic. The only difference + * is the events they're expecting */ case SS_CONNECTING_READ: case SS_CONNECTING_WRITE: @@ -797,20 +813,22 @@ AdvancePollState(Safekeeper *sk, uint32 events) break; /* - * Finish handshake comms: receive information about the safekeeper. + * Finish handshake comms: receive information about the + * safekeeper. */ case SS_HANDSHAKE_RECV: RecvAcceptorGreeting(sk); break; /* - * Voting is an idle state - we don't expect any events to trigger. - * Refer to the execution of SS_HANDSHAKE_RECV to see how nodes are - * transferred from SS_VOTING to sending actual vote requests. + * Voting is an idle state - we don't expect any events to + * trigger. Refer to the execution of SS_HANDSHAKE_RECV to see how + * nodes are transferred from SS_VOTING to sending actual vote + * requests. */ case SS_VOTING: elog(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk->state)); ResetConnection(sk); return; @@ -824,8 +842,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) /* * AsyncFlush ensures we only move on to SS_ACTIVE once the flush - * completes. If we still have more to do, we'll wait until the next - * poll comes along. + * completes. If we still have more to do, we'll wait until the + * next poll comes along. */ if (!AsyncFlush(sk)) return; @@ -839,7 +857,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) */ case SS_IDLE: elog(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk->state)); ResetConnection(sk); return; @@ -864,19 +882,17 @@ HandleConnectionEvent(Safekeeper *sk) { case WP_CONN_POLLING_OK: elog(LOG, "connected with node %s:%s", sk->host, - sk->port); + sk->port); /* - * We have to pick some event to update event set. 
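The conninfo hunk above keeps a defensive check on the value snprintf() returns, in case a longer credential (e.g. a JWT) ever overflows the buffer. For reference, snprintf() reports the number of characters it wanted to write, so truncation shows up as a return value greater than or equal to the buffer size; a self-contained sketch of that check:

#include <stdio.h>

/* Returns the length written, or -1 on encoding error or truncation. */
static int
build_conninfo(char *buf, size_t bufsize, const char *host, const char *port)
{
	int			written = snprintf(buf, bufsize,
								   "host=%s port=%s dbname=replication",
								   host, port);

	if (written < 0 || (size_t) written >= bufsize)
		return -1;
	return written;
}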
- * We'll eventually need the socket to be readable, - * so we go with that. + * We have to pick some event to update event set. We'll + * eventually need the socket to be readable, so we go with that. */ new_events = WL_SOCKET_READABLE; break; /* - * If we need to poll to finish connecting, - * continue doing that + * If we need to poll to finish connecting, continue doing that */ case WP_CONN_POLLING_READING: sk->state = SS_CONNECTING_READ; @@ -889,13 +905,12 @@ HandleConnectionEvent(Safekeeper *sk) case WP_CONN_POLLING_FAILED: elog(WARNING, "failed to connect to node '%s:%s': %s", - sk->host, sk->port, walprop_error_message(sk->conn)); + sk->host, sk->port, walprop_error_message(sk->conn)); /* - * If connecting failed, we don't want to restart - * the connection because that might run us into a - * loop. Instead, shut it down -- it'll naturally - * restart at a slower interval on calls to + * If connecting failed, we don't want to restart the connection + * because that might run us into a loop. Instead, shut it down -- + * it'll naturally restart at a slower interval on calls to * ReconnectSafekeepers. */ ShutdownConnection(sk); @@ -903,9 +918,8 @@ HandleConnectionEvent(Safekeeper *sk) } /* - * Because PQconnectPoll can change the socket, we have to - * un-register the old event and re-register an event on - * the new socket. + * Because PQconnectPoll can change the socket, we have to un-register the + * old event and re-register an event on the new socket. */ HackyRemoveWalProposerEvent(sk); sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk); @@ -926,7 +940,7 @@ SendStartWALPush(Safekeeper *sk) if (!walprop_send_query(sk->conn, "START_WAL_PUSH")) { elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, walprop_error_message(sk->conn)); + sk->host, sk->port, walprop_error_message(sk->conn)); ShutdownConnection(sk); return; } @@ -940,8 +954,7 @@ RecvStartWALPushResult(Safekeeper *sk) switch (walprop_get_query_result(sk->conn)) { /* - * Successful result, move on to starting the - * handshake + * Successful result, move on to starting the handshake */ case WP_EXEC_SUCCESS_COPYBOTH: @@ -949,31 +962,31 @@ RecvStartWALPushResult(Safekeeper *sk) break; /* - * Needs repeated calls to finish. Wait until the - * socket is readable + * Needs repeated calls to finish. Wait until the socket is + * readable */ case WP_EXEC_NEEDS_INPUT: /* - * SS_WAIT_EXEC_RESULT is always reached through an - * event, so we don't need to update the event set + * SS_WAIT_EXEC_RESULT is always reached through an event, so we + * don't need to update the event set */ break; case WP_EXEC_FAILED: elog(WARNING, "Failed to send query to safekeeper %s:%s: %s", - sk->host, sk->port, walprop_error_message(sk->conn)); + sk->host, sk->port, walprop_error_message(sk->conn)); ShutdownConnection(sk); return; /* - * Unexpected result -- funamdentally an error, but we - * want to produce a custom message, rather than a - * generic "something went wrong" + * Unexpected result -- funamdentally an error, but we want to + * produce a custom message, rather than a generic "something went + * wrong" */ case WP_EXEC_UNEXPECTED_SUCCESS: elog(WARNING, "Received bad response from safekeeper %s:%s query execution", - sk->host, sk->port); + sk->host, sk->port); ShutdownConnection(sk); return; } @@ -988,8 +1001,8 @@ static void SendProposerGreeting(Safekeeper *sk) { /* - * On failure, logging & resetting the connection is handled. 
- * We just need to handle the control flow. + * On failure, logging & resetting the connection is handled. We just need + * to handle the control flow. */ BlockingWrite(sk, &greetRequest, sizeof(greetRequest), SS_HANDSHAKE_RECV); } @@ -998,12 +1011,12 @@ static void RecvAcceptorGreeting(Safekeeper *sk) { /* - * If our reading doesn't immediately succeed, any necessary - * error handling or state setting is taken care of. We can - * leave any other work until later. + * If our reading doesn't immediately succeed, any necessary error + * handling or state setting is taken care of. We can leave any other work + * until later. */ sk->greetResponse.apm.tag = 'g'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->greetResponse)) return; /* Protocol is all good, move to voting. */ @@ -1033,37 +1046,34 @@ RecvAcceptorGreeting(Safekeeper *sk) { /* Another compute with higher term is running. */ elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->greetResponse.term, propTerm); + sk->host, sk->port, + sk->greetResponse.term, propTerm); } /* - * Check if we have quorum. If there aren't enough safekeepers, - * wait and do nothing. We'll eventually get a task when the - * election starts. + * Check if we have quorum. If there aren't enough safekeepers, wait and + * do nothing. We'll eventually get a task when the election starts. * * If we do have quorum, we can start an election. */ if (n_connected < quorum) { /* - * SS_VOTING is an idle state; read-ready indicates the - * connection closed. + * SS_VOTING is an idle state; read-ready indicates the connection + * closed. */ UpdateEventSet(sk, WL_SOCKET_READABLE); } else { /* - * Now send voting request to the cohort and wait - * responses + * Now send voting request to the cohort and wait responses */ for (int j = 0; j < n_safekeepers; j++) { /* * Remember: SS_VOTING indicates that the safekeeper is - * participating in voting, but hasn't sent anything - * yet. + * participating in voting, but hasn't sent anything yet. */ if (safekeeper[j].state == SS_VOTING) SendVoteRequest(&safekeeper[j]); @@ -1087,28 +1097,27 @@ static void RecvVoteResponse(Safekeeper *sk) { sk->voteResponse.apm.tag = 'v'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->voteResponse)) return; elog(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), - LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); /* - * In case of acceptor rejecting our vote, bail out, but only - * if either it already lives in strictly higher term - * (concurrent compute spotted) or we are not elected yet and - * thus need the vote. 
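The handshake logic above waits until enough safekeepers are connected before starting an election, and the vote handling later counts responses against the same threshold. A sketch of that bookkeeping; the n / 2 + 1 majority formula is an assumption here, since the actual computation of quorum is outside this hunk.

#include <stdbool.h>

static int
majority_quorum(int n_safekeepers)
{
	return n_safekeepers / 2 + 1;	/* assumed: simple majority */
}

static bool
can_start_election(int n_connected, int n_safekeepers)
{
	return n_connected >= majority_quorum(n_safekeepers);
}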
+ * In case of acceptor rejecting our vote, bail out, but only if either it + * already lives in strictly higher term (concurrent compute spotted) or + * we are not elected yet and thus need the vote. */ if ((!sk->voteResponse.voteGiven) && (sk->voteResponse.term > propTerm || n_votes < quorum)) { elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->voteResponse.term, propTerm); + sk->host, sk->port, + sk->voteResponse.term, propTerm); } Assert(sk->voteResponse.term == propTerm); @@ -1116,7 +1125,7 @@ RecvVoteResponse(Safekeeper *sk) n_votes++; if (n_votes < quorum) { - sk->state = SS_IDLE; /* can't do much yet, no quorum */ + sk->state = SS_IDLE; /* can't do much yet, no quorum */ } else if (n_votes > quorum) { @@ -1146,16 +1155,16 @@ HandleElectedProposer(void) DetermineEpochStartLsn(); /* - * Check if not all safekeepers are up-to-date, we need to - * download WAL needed to synchronize them + * Check if not all safekeepers are up-to-date, we need to download WAL + * needed to synchronize them */ if (truncateLsn < propEpochStartLsn) { elog(LOG, - "start recovery because truncateLsn=%X/%X is not " - "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(truncateLsn), - LSN_FORMAT_ARGS(propEpochStartLsn)); + "start recovery because truncateLsn=%X/%X is not " + "equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), + LSN_FORMAT_ARGS(propEpochStartLsn)); /* Perform recovery */ if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); @@ -1175,18 +1184,17 @@ HandleElectedProposer(void) /* * The proposer has been elected, and there will be no quorum waiting - * after this point. There will be no safekeeper with state SS_IDLE - * also, because that state is used only for quorum waiting. + * after this point. There will be no safekeeper with state SS_IDLE also, + * because that state is used only for quorum waiting. */ if (syncSafekeepers) { /* - * Send empty message to enforce receiving feedback - * even from nodes who are fully recovered; this is - * required to learn they switched epoch which finishes - * sync-safeekepers who doesn't generate any real new - * records. Will go away once we switch to async acks. + * Send empty message to enforce receiving feedback even from nodes + * who are fully recovered; this is required to learn they switched + * epoch which finishes sync-safeekepers who doesn't generate any real + * new records. Will go away once we switch to async acks. */ BroadcastAppendRequest(); @@ -1200,7 +1208,7 @@ HandleElectedProposer(void) /* latest term in TermHistory, or 0 is there is no entries */ static term_t -GetHighestTerm(TermHistory *th) +GetHighestTerm(TermHistory * th) { return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0; } @@ -1276,8 +1284,8 @@ DetermineEpochStartLsn(void) } /* - * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing was - * committed yet. Start streaming then from the basebackup LSN. + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing + * was committed yet. Start streaming then from the basebackup LSN. */ if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) { @@ -1322,24 +1330,24 @@ DetermineEpochStartLsn(void) ); /* - * Ensure the basebackup we are running (at RedoStartLsn) matches LSN since - * which we are going to write according to the consensus. 
If not, we must - * bail out, as clog and other non rel data is inconsistent. + * Ensure the basebackup we are running (at RedoStartLsn) matches LSN + * since which we are going to write according to the consensus. If not, + * we must bail out, as clog and other non rel data is inconsistent. */ if (!syncSafekeepers) { /* - * Basebackup LSN always points to the beginning of the record (not the - * page), as StartupXLOG most probably wants it this way. Safekeepers - * don't skip header as they need continious stream of data, so - * correct LSN for comparison. + * Basebackup LSN always points to the beginning of the record (not + * the page), as StartupXLOG most probably wants it this way. + * Safekeepers don't skip header as they need continious stream of + * data, so correct LSN for comparison. */ if (SkipXLogPageHeader(propEpochStartLsn) != GetRedoStartLsn()) { /* - * However, allow to proceed if previously elected leader was me; plain - * restart of walproposer not intervened by concurrent compute (who could - * generate WAL) is ok. + * However, allow to proceed if previously elected leader was me; + * plain restart of walproposer not intervened by concurrent + * compute (who could generate WAL) is ok. */ if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == walprop_shared->mineLastElectedTerm))) @@ -1407,7 +1415,7 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec { Assert(buf[0] == 'w' || buf[0] == 'k'); if (buf[0] == 'k') - continue; /* keepalive */ + continue; /* keepalive */ memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], sizeof rec_start_lsn); rec_start_lsn = pg_ntoh64(rec_start_lsn); @@ -1457,18 +1465,20 @@ SendProposerElected(Safekeeper *sk) { ProposerElected msg; TermHistory *th; - term_t lastCommonTerm; - int i; + term_t lastCommonTerm; + int i; /* - * Determine start LSN by comparing safekeeper's log term switch history and - * proposer's, searching for the divergence point. + * Determine start LSN by comparing safekeeper's log term switch history + * and proposer's, searching for the divergence point. * * Note: there is a vanishingly small chance of no common point even if * there is some WAL on safekeeper, if immediately after bootstrap compute - * wrote some WAL on single sk and died; we stream since the beginning then. + * wrote some WAL on single sk and died; we stream since the beginning + * then. */ th = &sk->voteResponse.termHistory; + /* * If any WAL is present on the sk, it must be authorized by some term. * OTOH, without any WAL there are no term swiches in the log. @@ -1485,7 +1495,7 @@ SendProposerElected(Safekeeper *sk) /* term must begin everywhere at the same point */ Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); } - i--; /* step back to the last common term */ + i--; /* step back to the last common term */ if (i < 0) { /* safekeeper is empty or no common point, start from the beginning */ @@ -1500,17 +1510,17 @@ SendProposerElected(Safekeeper *sk) * to the truncateLsn before, but now current safekeeper tells * otherwise. * - * Also we have a special condition here, which is empty safekeeper - * with no history. In combination with a gap, that can happen when - * we introduce a new safekeeper to the cluster. This is a rare case, - * which is triggered manually for now, and should be treated with - * care. + * Also we have a special condition here, which is empty + * safekeeper with no history. In combination with a gap, that can + * happen when we introduce a new safekeeper to the cluster. 
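SendProposerElected() above compares the proposer's and the safekeeper's term histories to find the divergence point, then steps back to the last common entry (or to "no common point" for an empty safekeeper). A sketch of that search; the struct definitions are illustrative, the real TermHistory lives in walproposer.h, and term_t and XLogRecPtr are the 64-bit types used by the surrounding code.

typedef struct
{
	term_t		term;
	XLogRecPtr	lsn;			/* LSN at which this term begins */
} SketchTermEntry;

typedef struct
{
	int			n_entries;
	SketchTermEntry *entries;
} SketchTermHistory;

/* Returns the index of the last common entry, or -1 when there is none
 * (empty safekeeper history, or divergence at the very first entry). */
static int
last_common_term_index(const SketchTermHistory *prop, const SketchTermHistory *sk)
{
	int			n = prop->n_entries < sk->n_entries ? prop->n_entries : sk->n_entries;
	int			i;

	for (i = 0; i < n; i++)
	{
		if (prop->entries[i].term != sk->entries[i].term)
			break;
	}
	return i - 1;
}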
This + * is a rare case, which is triggered manually for now, and should + * be treated with care. */ /* - * truncateLsn will not change without ack from current safekeeper, - * and it's aligned to the WAL record, so we can safely start - * streaming from this point. + * truncateLsn will not change without ack from current + * safekeeper, and it's aligned to the WAL record, so we can + * safely start streaming from this point. */ sk->startStreamingAt = truncateLsn; @@ -1533,9 +1543,10 @@ SendProposerElected(Safekeeper *sk) } else { - XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; - XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : - sk->voteResponse.flushLsn); + XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : + sk->voteResponse.flushLsn); + sk->startStreamingAt = Min(propEndLsn, skEndLsn); } } @@ -1595,8 +1606,8 @@ static void StartStreaming(Safekeeper *sk) { /* - * This is the only entrypoint to state SS_ACTIVE. It's executed - * exactly once for a connection. + * This is the only entrypoint to state SS_ACTIVE. It's executed exactly + * once for a connection. */ sk->state = SS_ACTIVE; sk->streamingAt = sk->startStreamingAt; @@ -1617,7 +1628,10 @@ SendMessageToNode(Safekeeper *sk) { Assert(sk->state == SS_ACTIVE); - /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ + /* + * Note: we always send everything to the safekeeper until WOULDBLOCK or + * nothing left to send + */ HandleActiveState(sk, WL_SOCKET_WRITEABLE); } @@ -1633,7 +1647,7 @@ BroadcastAppendRequest() } static void -PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) +PrepareAppendRequest(AppendRequestHeader * req, XLogRecPtr beginLsn, XLogRecPtr endLsn) { Assert(endLsn >= beginLsn); req->tag = 'a'; @@ -1652,7 +1666,7 @@ PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr e static void HandleActiveState(Safekeeper *sk, uint32 events) { - uint32 newEvents = WL_SOCKET_READABLE; + uint32 newEvents = WL_SOCKET_READABLE; if (events & WL_SOCKET_WRITEABLE) if (!SendAppendRequests(sk)) @@ -1666,10 +1680,10 @@ HandleActiveState(Safekeeper *sk, uint32 events) * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data * in the buffer. * - * LSN comparison checks if we have pending unsent messages. This check isn't - * necessary now, because we always send append messages immediately after - * arrival. But it's good to have it here in case we change this behavior - * in the future. + * LSN comparison checks if we have pending unsent messages. This check + * isn't necessary now, because we always send append messages immediately + * after arrival. But it's good to have it here in case we change this + * behavior in the future. */ if (sk->streamingAt != availableLsn || sk->flushWrite) newEvents |= WL_SOCKET_WRITEABLE; @@ -1689,15 +1703,16 @@ HandleActiveState(Safekeeper *sk, uint32 events) static bool SendAppendRequests(Safekeeper *sk) { - XLogRecPtr endLsn; + XLogRecPtr endLsn; AppendRequestHeader *req; PGAsyncWriteResult writeResult; WALReadError errinfo; - bool sentAnything = false; + bool sentAnything = false; if (sk->flushWrite) { if (!AsyncFlush(sk)) + /* * AsyncFlush failed, that could happen if the socket is closed or * we have nothing to write and should wait for writeable socket. 
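HandleActiveState() above derives the next wait-event mask for an active safekeeper: always watch for reads (acknowledgements), and also for writes while unsent WAL or an unflushed buffer remains. The same rule as a small helper, assuming the wait-event flags from PostgreSQL's storage/latch.h:

static uint32
active_safekeeper_events(XLogRecPtr streamingAt, XLogRecPtr availableLsn,
						 bool flush_pending)
{
	uint32		events = WL_SOCKET_READABLE;

	if (streamingAt != availableLsn || flush_pending)
		events |= WL_SOCKET_WRITEABLE;
	return events;
}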
@@ -1716,7 +1731,8 @@ SendAppendRequests(Safekeeper *sk) endLsn += MAX_SEND_SIZE; /* if we went beyond available WAL, back off */ - if (endLsn > availableLsn) { + if (endLsn > availableLsn) + { endLsn = availableLsn; } @@ -1734,21 +1750,21 @@ SendAppendRequests(Safekeeper *sk) resetStringInfo(&sk->outbuf); /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char*) req, sizeof(AppendRequestHeader)); + appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); /* write the WAL itself */ enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); if (!WALRead(sk->xlogreader, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req->endLsn - req->beginLsn, - #if PG_VERSION_NUM >= 150000 - // FIXME don't use hardcoded timelineid here - 1, - #else - ThisTimeLineID, - #endif - &errinfo)) + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, +#if PG_VERSION_NUM >= 150000 + /* FIXME don't use hardcoded timelineid here */ + 1, +#else + ThisTimeLineID, +#endif + &errinfo)) { WALReadRaiseError(&errinfo); } @@ -1766,17 +1782,19 @@ SendAppendRequests(Safekeeper *sk) break; case PG_ASYNC_WRITE_TRY_FLUSH: + /* * We still need to call PQflush some more to finish the job. - * Caller function will handle this by setting right event set. + * Caller function will handle this by setting right event + * set. */ sk->flushWrite = true; return true; case PG_ASYNC_WRITE_FAIL: elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); ShutdownConnection(sk); return false; default: @@ -1800,17 +1818,17 @@ static bool RecvAppendResponses(Safekeeper *sk) { XLogRecPtr minQuorumLsn; - bool readAnything = false; + bool readAnything = false; while (true) { /* - * If our reading doesn't immediately succeed, any - * necessary error handling or state setting is taken care - * of. We can leave any other work until later. + * If our reading doesn't immediately succeed, any necessary error + * handling or state setting is taken care of. We can leave any other + * work until later. */ sk->appendResponse.apm.tag = 'a'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->appendResponse)) break; ereport(DEBUG2, @@ -1824,8 +1842,8 @@ RecvAppendResponses(Safekeeper *sk) { /* Another compute with higher term is running. 
*/ elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", - sk->host, sk->port, - sk->appendResponse.term, propTerm); + sk->host, sk->port, + sk->appendResponse.term, propTerm); } readAnything = true; @@ -1851,11 +1869,11 @@ RecvAppendResponses(Safekeeper *sk) /* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ void -ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *rf) +ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * rf) { - uint8 nkeys; - int i; - int32 len; + uint8 nkeys; + int i; + int32 len; /* get number of custom keys */ nkeys = pq_getmsgbyte(reply_message); @@ -1863,54 +1881,65 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *r for (i = 0; i < nkeys; i++) { const char *key = pq_getmsgstring(reply_message); + if (strcmp(key, "current_timeline_size") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length + pq_getmsgint(reply_message, sizeof(int32)); + //read value length rf->currentClusterSize = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); + elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); } else if (strcmp(key, "ps_writelsn") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length + pq_getmsgint(reply_message, sizeof(int32)); + //read value length rf->ps_writelsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_writelsn)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_writelsn)); } else if (strcmp(key, "ps_flushlsn") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length + pq_getmsgint(reply_message, sizeof(int32)); + //read value length rf->ps_flushlsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_flushlsn)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_flushlsn)); } else if (strcmp(key, "ps_applylsn") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length + pq_getmsgint(reply_message, sizeof(int32)); + //read value length rf->ps_applylsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_applylsn)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_applylsn)); } else if (strcmp(key, "ps_replytime") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length - rf->ps_replytime = pq_getmsgint64(reply_message); + pq_getmsgint(reply_message, sizeof(int32)); + //read value length + rf->ps_replytime = pq_getmsgint64(reply_message); { char *replyTimeStr; /* Copy because timestamptz_to_str returns a static buffer */ replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime)); elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", - rf->ps_replytime, replyTimeStr); + rf->ps_replytime, replyTimeStr); pfree(replyTimeStr); } } else { - len = pq_getmsgint(reply_message, sizeof(int32)); // read value length - // Skip unknown keys to support backward compatibile protocol changes - elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + len = 
pq_getmsgint(reply_message, sizeof(int32)); + //read value length + + /* + * Skip unknown keys to support backward compatibile protocol + * changes + */ + elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -1952,9 +1981,10 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) static XLogRecPtr CalculateMinFlushLsn(void) { - XLogRecPtr lsn = n_safekeepers > 0 - ? safekeeper[0].appendResponse.flushLsn - : InvalidXLogRecPtr; + XLogRecPtr lsn = n_safekeepers > 0 + ? safekeeper[0].appendResponse.flushLsn + : InvalidXLogRecPtr; + for (int i = 1; i < n_safekeepers; i++) { lsn = Min(lsn, safekeeper[i].appendResponse.flushLsn); @@ -2006,8 +2036,8 @@ WalproposerShmemInit(void) LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); walprop_shared = ShmemInitStruct("Walproposer shared state", - sizeof(WalproposerShmemState), - &found); + sizeof(WalproposerShmemState), + &found); if (!found) { @@ -2021,7 +2051,7 @@ WalproposerShmemInit(void) } void -replication_feedback_set(ReplicationFeedback *rf) +replication_feedback_set(ReplicationFeedback * rf) { SpinLockAcquire(&walprop_shared->mutex); memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); @@ -2044,10 +2074,11 @@ replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRe * Get ReplicationFeedback fields from the most advanced safekeeper */ static void -GetLatestZentihFeedback(ReplicationFeedback *rf) +GetLatestZentihFeedback(ReplicationFeedback * rf) { - int latest_safekeeper = 0; - XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + int latest_safekeeper = 0; + XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + for (int i = 0; i < n_safekeepers; i++) { if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) @@ -2064,12 +2095,12 @@ GetLatestZentihFeedback(ReplicationFeedback *rf) rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," - " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->ps_writelsn), - LSN_FORMAT_ARGS(rf->ps_flushlsn), - LSN_FORMAT_ARGS(rf->ps_applylsn), - rf->ps_replytime); + " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->ps_writelsn), + LSN_FORMAT_ARGS(rf->ps_flushlsn), + LSN_FORMAT_ARGS(rf->ps_applylsn), + rf->ps_replytime); replication_feedback_set(rf); } @@ -2080,7 +2111,7 @@ HandleSafekeeperResponse(void) HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; XLogRecPtr diskConsistentLsn; - XLogRecPtr minFlushLsn; + XLogRecPtr minFlushLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); @@ -2088,7 +2119,7 @@ HandleSafekeeperResponse(void) if (!syncSafekeepers) { - // Get ReplicationFeedback fields from the most advanced safekeeper + /* Get ReplicationFeedback fields from the most advanced safekeeper */ GetLatestZentihFeedback(&quorumFeedback.rf); SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); } @@ -2102,11 +2133,15 @@ HandleSafekeeperResponse(void) /* advance the replication slot */ if (!syncSafekeepers) ProcessStandbyReply( - // write_lsn - This is what durably stored in WAL service. + /* write_lsn - This is what durably stored in WAL service. */ quorumFeedback.flushLsn, - //flush_lsn - This is what durably stored in WAL service. + /* flush_lsn - This is what durably stored in WAL service. 
*/ quorumFeedback.flushLsn, - //apply_lsn - This is what processed and durably saved at pageserver. + + /* + * apply_lsn - This is what processed and durably saved at + * pageserver. + */ quorumFeedback.rf.ps_flushlsn, GetCurrentTimestamp(), false); } @@ -2128,15 +2163,14 @@ HandleSafekeeperResponse(void) * flushed to all safekeepers. We must always start streaming from the * beginning of the record, which simplifies decoding on the far end. * - * Advanced truncateLsn should be not further than nearest commitLsn. - * This prevents surprising violation of truncateLsn <= commitLsn - * invariant which might occur because 1) truncateLsn can be advanced - * immediately once chunk is broadcast to all safekeepers, and - * commitLsn generally can't be advanced based on feedback from - * safekeeper who is still in the previous epoch (similar to 'leader - * can't commit entries from previous term' in Raft); 2) chunks we - * read from WAL and send are plain sheets of bytes, but safekeepers - * ack only on record boundaries. + * Advanced truncateLsn should be not further than nearest commitLsn. This + * prevents surprising violation of truncateLsn <= commitLsn invariant + * which might occur because 1) truncateLsn can be advanced immediately + * once chunk is broadcast to all safekeepers, and commitLsn generally + * can't be advanced based on feedback from safekeeper who is still in the + * previous epoch (similar to 'leader can't commit entries from previous + * term' in Raft); 2) chunks we read from WAL and send are plain sheets of + * bytes, but safekeepers ack only on record boundaries. */ minFlushLsn = CalculateMinFlushLsn(); if (minFlushLsn > truncateLsn) @@ -2144,8 +2178,8 @@ HandleSafekeeperResponse(void) truncateLsn = minFlushLsn; /* - * Advance the replication slot to free up old WAL files. Note - * that slot doesn't exist if we are in syncSafekeepers mode. + * Advance the replication slot to free up old WAL files. Note that + * slot doesn't exist if we are in syncSafekeepers mode. */ if (MyReplicationSlot) PhysicalConfirmReceivedLocation(truncateLsn); @@ -2170,7 +2204,7 @@ HandleSafekeeperResponse(void) n_synced = 0; for (int i = 0; i < n_safekeepers; i++) { - Safekeeper *sk = &safekeeper[i]; + Safekeeper *sk = &safekeeper[i]; bool synced = sk->appendResponse.commitLsn >= propEpochStartLsn; /* alive safekeeper which is not synced yet; wait for it */ @@ -2225,11 +2259,11 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) * failed, a warning is emitted and the connection is reset. 
*/ static bool -AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) +AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg) { - char *buf; - int buf_size; - uint64 tag; + char *buf; + int buf_size; + uint64 tag; StringInfoData s; if (!(AsyncRead(sk, &buf, &buf_size))) @@ -2252,54 +2286,56 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) switch (tag) { case 'g': - { - AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->nodeId = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } case 'v': - { - VoteResponse *msg = (VoteResponse *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->voteGiven = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->truncateLsn = pq_getmsgint64_le(&s); - msg->termHistory.n_entries = pq_getmsgint32_le(&s); - msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); - for (int i = 0; i < msg->termHistory.n_entries; i++) { - msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); - msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + msg->timelineStartLsn = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; } - msg->timelineStartLsn = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } case 'a': - { - AppendResponse *msg = (AppendResponse *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->commitLsn = pq_getmsgint64_le(&s); - msg->hs.ts = pq_getmsgint64_le(&s); - msg->hs.xmin.value = pq_getmsgint64_le(&s); - msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParseReplicationFeedbackMessage(&s, &msg->rf); - pq_getmsgend(&s); - return true; - } + { + AppendResponse *msg = (AppendResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) + ParseReplicationFeedbackMessage(&s, &msg->rf); + pq_getmsgend(&s); + return true; + } default: - { - Assert(false); - return false; - } + { + Assert(false); + return false; + } } } @@ -2367,7 +2403,7 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta ShutdownConnection(sk); return false; default: - Assert(false); + Assert(false); return false; } } @@ -2409,19 +2445,19 @@ AsyncFlush(Safekeeper *sk) } } -// Check if we need to suspend inserts because of lagging replication. +/* Check if we need to suspend inserts because of lagging replication. 
*/ static uint64 backpressure_lag_impl(void) { if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) { - XLogRecPtr writePtr; - XLogRecPtr flushPtr; - XLogRecPtr applyPtr; + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; #if PG_VERSION_NUM >= 150000 - XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL); + XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL); #else - XLogRecPtr myFlushLsn = GetFlushRecPtr(); + XLogRecPtr myFlushLsn = GetFlushRecPtr(); #endif replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); #define MB ((XLogRecPtr)1024*1024) @@ -2434,23 +2470,23 @@ backpressure_lag_impl(void) if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0 - && myFlushLsn > writePtr + max_replication_write_lag*MB)) + && myFlushLsn > writePtr + max_replication_write_lag * MB)) { - return (myFlushLsn - writePtr - max_replication_write_lag*MB); + return (myFlushLsn - writePtr - max_replication_write_lag * MB); } if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0 - && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) + && myFlushLsn > flushPtr + max_replication_flush_lag * MB)) { - return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); + return (myFlushLsn - flushPtr - max_replication_flush_lag * MB); } if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0 - && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) + && myFlushLsn > applyPtr + max_replication_apply_lag * MB)) { - return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); + return (myFlushLsn - applyPtr - max_replication_apply_lag * MB); } } return 0; @@ -2458,24 +2494,26 @@ backpressure_lag_impl(void) #define BACK_PRESSURE_DELAY 10000L // 0.01 sec -static bool backpressure_throttling_impl(void) +static bool +backpressure_throttling_impl(void) { - int64 lag; - TimestampTz start, stop; - bool retry = PrevProcessInterruptsCallback - ? PrevProcessInterruptsCallback() - : false; + int64 lag; + TimestampTz start, + stop; + bool retry = PrevProcessInterruptsCallback + ? PrevProcessInterruptsCallback() + : false; - // Don't throttle read only transactions and wal sender. + /* Don't throttle read only transactions and wal sender. 
*/ if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny())) return retry; - // Calculate replicas lag + /* Calculate replicas lag */ lag = backpressure_lag_impl(); if (lag == 0) return retry; - // Suspend writers until replicas catch up + /* Suspend writers until replicas catch up */ set_ps_display("backpressure throttling"); elog(DEBUG2, "backpressure throttling: lag %lu", lag); diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 75167163f3..59e70f33bf 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -14,10 +14,13 @@ #define SK_PROTOCOL_VERSION 2 #define MAX_SAFEKEEPERS 32 -#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */ -#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ -#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ -#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single + * WAL message */ +#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender + * message header */ +#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender + * message header */ /* * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, @@ -25,12 +28,12 @@ */ #define WL_NO_EVENTS 0 -extern char* wal_acceptors_list; -extern int wal_acceptor_reconnect_timeout; -extern int wal_acceptor_connect_timeout; -extern bool am_wal_proposer; +extern char *wal_acceptors_list; +extern int wal_acceptor_reconnect_timeout; +extern int wal_acceptor_connect_timeout; +extern bool am_wal_proposer; -struct WalProposerConn; /* Defined in libpqwalproposer */ +struct WalProposerConn; /* Defined in libpqwalproposer */ typedef struct WalProposerConn WalProposerConn; struct WalMessage; @@ -44,21 +47,26 @@ typedef enum { /* The full read was successful. buf now points to the data */ PG_ASYNC_READ_SUCCESS, - /* The read is ongoing. Wait until the connection is read-ready, then try - * again. */ + + /* + * The read is ongoing. Wait until the connection is read-ready, then try + * again. + */ PG_ASYNC_READ_TRY_AGAIN, /* Reading failed. Check PQerrorMessage(conn) */ PG_ASYNC_READ_FAIL, -} PGAsyncReadResult; +} PGAsyncReadResult; /* Possible return values from WritePGAsync */ typedef enum { /* The write fully completed */ PG_ASYNC_WRITE_SUCCESS, - /* The write started, but you'll need to call PQflush some more times - * to finish it off. We just tried, so it's best to wait until the - * connection is read- or write-ready to try again. + + /* + * The write started, but you'll need to call PQflush some more times to + * finish it off. We just tried, so it's best to wait until the connection + * is read- or write-ready to try again. * * If it becomes read-ready, call PQconsumeInput and flush again. If it * becomes write-ready, just call PQflush. @@ -66,7 +74,7 @@ typedef enum PG_ASYNC_WRITE_TRY_FLUSH, /* Writing failed. Check PQerrorMessage(conn) */ PG_ASYNC_WRITE_FAIL, -} PGAsyncWriteResult; +} PGAsyncWriteResult; /* * WAL safekeeper state, which is used to wait for some event. @@ -79,8 +87,8 @@ typedef enum typedef enum { /* - * Does not have an active connection and will stay that way until - * further notice. + * Does not have an active connection and will stay that way until further + * notice. * * Moves to SS_CONNECTING_WRITE by calls to ResetConnection. 
*/ @@ -105,8 +113,8 @@ typedef enum SS_WAIT_EXEC_RESULT, /* - * Executing the receiving half of the handshake. After receiving, moves to - * SS_VOTING. + * Executing the receiving half of the handshake. After receiving, moves + * to SS_VOTING. */ SS_HANDSHAKE_RECV, @@ -120,8 +128,9 @@ typedef enum SS_VOTING, /* - * Already sent voting information, waiting to receive confirmation from the - * node. After receiving, moves to SS_IDLE, if the quorum isn't reached yet. + * Already sent voting information, waiting to receive confirmation from + * the node. After receiving, moves to SS_IDLE, if the quorum isn't + * reached yet. */ SS_WAIT_VERDICT, @@ -141,7 +150,7 @@ typedef enum * to read. */ SS_ACTIVE, -} SafekeeperState; +} SafekeeperState; /* Consensus logical timestamp. */ typedef uint64 term_t; @@ -156,21 +165,21 @@ typedef uint64 NNodeId; /* Initial Proposer -> Acceptor message */ typedef struct ProposerGreeting { - uint64 tag; /* message tag */ - uint32 protocolVersion; /* proposer-safekeeper protocol version */ - uint32 pgVersion; - pg_uuid_t proposerId; - uint64 systemId; /* Postgres system identifier */ - uint8 ztimelineid[16]; /* Zenith timeline id */ - uint8 ztenantid[16]; - TimeLineID timeline; - uint32 walSegSize; -} ProposerGreeting; + uint64 tag; /* message tag */ + uint32 protocolVersion; /* proposer-safekeeper protocol version */ + uint32 pgVersion; + pg_uuid_t proposerId; + uint64 systemId; /* Postgres system identifier */ + uint8 ztimelineid[16]; /* Zenith timeline id */ + uint8 ztenantid[16]; + TimeLineID timeline; + uint32 walSegSize; +} ProposerGreeting; typedef struct AcceptorProposerMessage { - uint64 tag; -} AcceptorProposerMessage; + uint64 tag; +} AcceptorProposerMessage; /* * Acceptor -> Proposer initial response: the highest term acceptor voted for. @@ -180,7 +189,7 @@ typedef struct AcceptorGreeting AcceptorProposerMessage apm; term_t term; NNodeId nodeId; -} AcceptorGreeting; +} AcceptorGreeting; /* * Proposer -> Acceptor vote request. @@ -189,36 +198,39 @@ typedef struct VoteRequest { uint64 tag; term_t term; - pg_uuid_t proposerId; /* for monitoring/debugging */ -} VoteRequest; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} VoteRequest; /* Element of term switching chain. */ typedef struct TermSwitchEntry { - term_t term; - XLogRecPtr lsn; -} TermSwitchEntry; + term_t term; + XLogRecPtr lsn; +} TermSwitchEntry; typedef struct TermHistory { - uint32 n_entries; + uint32 n_entries; TermSwitchEntry *entries; -} TermHistory; +} TermHistory; /* Vote itself, sent from safekeeper to proposer */ -typedef struct VoteResponse { +typedef struct VoteResponse +{ AcceptorProposerMessage apm; - term_t term; - uint64 voteGiven; + term_t term; + uint64 voteGiven; + /* * Safekeeper flush_lsn (end of WAL) + history of term switches allow - * proposer to choose the most advanced one. + * proposer to choose the most advanced one. 
*/ - XLogRecPtr flushLsn; - XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some safekeeper */ + XLogRecPtr flushLsn; + XLogRecPtr truncateLsn; /* minimal LSN which may be needed for + * recovery of some safekeeper */ TermHistory termHistory; - XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ -} VoteResponse; + XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ +} VoteResponse; /* * Proposer -> Acceptor message announcing proposer is elected and communicating @@ -226,60 +238,62 @@ typedef struct VoteResponse { */ typedef struct ProposerElected { - uint64 tag; - term_t term; + uint64 tag; + term_t term; /* proposer will send since this point */ - XLogRecPtr startStreamingAt; + XLogRecPtr startStreamingAt; /* history of term switches up to this proposer */ TermHistory *termHistory; /* timeline globally starts at this LSN */ - XLogRecPtr timelineStartLsn; -} ProposerElected; + XLogRecPtr timelineStartLsn; +} ProposerElected; /* * Header of request with WAL message sent from proposer to safekeeper. */ typedef struct AppendRequestHeader { - uint64 tag; - term_t term; /* term of the proposer */ + uint64 tag; + term_t term; /* term of the proposer */ + /* * LSN since which current proposer appends WAL (begin_lsn of its first * record); determines epoch switch point. */ - XLogRecPtr epochStartLsn; - XLogRecPtr beginLsn; /* start position of message in WAL */ - XLogRecPtr endLsn; /* end position of message in WAL */ - XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + XLogRecPtr epochStartLsn; + XLogRecPtr beginLsn; /* start position of message in WAL */ + XLogRecPtr endLsn; /* end position of message in WAL */ + XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + /* - * minimal LSN which may be needed for recovery of some safekeeper (end lsn - * + 1 of last chunk streamed to everyone) + * minimal LSN which may be needed for recovery of some safekeeper (end + * lsn + 1 of last chunk streamed to everyone) */ - XLogRecPtr truncateLsn; - pg_uuid_t proposerId; /* for monitoring/debugging */ -} AppendRequestHeader; + XLogRecPtr truncateLsn; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} AppendRequestHeader; /* * Hot standby feedback received from replica */ typedef struct HotStandbyFeedback { - TimestampTz ts; + TimestampTz ts; FullTransactionId xmin; FullTransactionId catalog_xmin; -} HotStandbyFeedback; +} HotStandbyFeedback; -typedef struct ReplicationFeedback +typedef struct ReplicationFeedback { - // current size of the timeline on pageserver - uint64 currentClusterSize; - // standby_status_update fields that safekeeper received from pageserver - XLogRecPtr ps_writelsn; - XLogRecPtr ps_flushlsn; - XLogRecPtr ps_applylsn; + /* current size of the timeline on pageserver */ + uint64 currentClusterSize; + /* standby_status_update fields that safekeeper received from pageserver */ + XLogRecPtr ps_writelsn; + XLogRecPtr ps_flushlsn; + XLogRecPtr ps_applylsn; TimestampTz ps_replytime; -} ReplicationFeedback; +} ReplicationFeedback; typedef struct WalproposerShmemState @@ -288,7 +302,7 @@ typedef struct WalproposerShmemState ReplicationFeedback feedback; term_t mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; -} WalproposerShmemState; +} WalproposerShmemState; /* * Report safekeeper state to proposer @@ -296,25 +310,26 @@ typedef struct WalproposerShmemState typedef struct AppendResponse { AcceptorProposerMessage apm; + /* * Current term of the safekeeper; if it is higher 
than proposer's, the * compute is out of date. */ - term_t term; - // TODO: add comment - XLogRecPtr flushLsn; - // Safekeeper reports back his awareness about which WAL is committed, as - // this is a criterion for walproposer --sync mode exit - XLogRecPtr commitLsn; + term_t term; + /* TODO: add comment */ + XLogRecPtr flushLsn; + /* Safekeeper reports back his awareness about which WAL is committed, as */ + /* this is a criterion for walproposer --sync mode exit */ + XLogRecPtr commitLsn; HotStandbyFeedback hs; - // Feedback recieved from pageserver includes standby_status_update fields - // and custom zenith feedback. - // This part of the message is extensible. + /* Feedback recieved from pageserver includes standby_status_update fields */ + /* and custom zenith feedback. */ + /* This part of the message is extensible. */ ReplicationFeedback rf; -} AppendResponse; +} AppendResponse; -// ReplicationFeedback is extensible part of the message that is parsed separately -// Other fields are fixed part +/* ReplicationFeedback is extensible part of the message that is parsed separately */ +/* Other fields are fixed part */ #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) @@ -323,9 +338,10 @@ typedef struct AppendResponse */ typedef struct Safekeeper { - char const* host; - char const* port; - char conninfo[MAXCONNINFO]; /* connection info for connecting/reconnecting */ + char const *host; + char const *port; + char conninfo[MAXCONNINFO]; /* connection info for + * connecting/reconnecting */ /* * postgres protocol connection to the WAL acceptor @@ -333,46 +349,50 @@ typedef struct Safekeeper * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we * reach SS_ACTIVE; not before. */ - WalProposerConn* conn; + WalProposerConn *conn; + /* * Temporary buffer for the message being sent to the safekeeper. */ StringInfoData outbuf; + /* * WAL reader, allocated for each safekeeper. */ - XLogReaderState* xlogreader; + XLogReaderState *xlogreader; /* * Streaming will start here; must be record boundary. */ - XLogRecPtr startStreamingAt; + XLogRecPtr startStreamingAt; - bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ - XLogRecPtr streamingAt; /* current streaming position */ - AppendRequestHeader appendRequest; /* request for sending to safekeeper */ + bool flushWrite; /* set to true if we need to call AsyncFlush, + * to flush pending messages */ + XLogRecPtr streamingAt; /* current streaming position */ + AppendRequestHeader appendRequest; /* request for sending to safekeeper */ - int eventPos; /* position in wait event set. Equal to -1 if no event */ - SafekeeperState state; /* safekeeper state machine state */ - TimestampTz startedConnAt; /* when connection attempt started */ - AcceptorGreeting greetResponse; /* acceptor greeting */ - VoteResponse voteResponse; /* the vote */ - AppendResponse appendResponse; /* feedback for master */ + int eventPos; /* position in wait event set. 
Equal to -1 if + * no event */ + SafekeeperState state; /* safekeeper state machine state */ + TimestampTz startedConnAt; /* when connection attempt started */ + AcceptorGreeting greetResponse; /* acceptor greeting */ + VoteResponse voteResponse; /* the vote */ + AppendResponse appendResponse; /* feedback for master */ } Safekeeper; extern PGDLLIMPORT void WalProposerMain(Datum main_arg); -void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); -void WalProposerPoll(void); -void WalProposerRegister(void); -void ParseReplicationFeedbackMessage(StringInfo reply_message, - ReplicationFeedback *rf); +void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); +void WalProposerPoll(void); +void WalProposerRegister(void); +void ParseReplicationFeedbackMessage(StringInfo reply_message, + ReplicationFeedback * rf); extern void StartProposerReplication(StartReplicationCmd *cmd); -Size WalproposerShmemSize(void); -bool WalproposerShmemInit(void); -void replication_feedback_set(ReplicationFeedback *rf); -void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); +Size WalproposerShmemSize(void); +bool WalproposerShmemInit(void); +void replication_feedback_set(ReplicationFeedback * rf); +void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); /* libpqwalproposer hooks & helper type */ @@ -383,29 +403,37 @@ typedef enum WP_CONN_POLLING_READING, WP_CONN_POLLING_WRITING, WP_CONN_POLLING_OK, + /* * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused. * We've removed it here to avoid clutter. */ -} WalProposerConnectPollStatusType; +} WalProposerConnectPollStatusType; /* Re-exported and modified ExecStatusType */ typedef enum { /* We received a single CopyBoth result */ WP_EXEC_SUCCESS_COPYBOTH, - /* Any success result other than a single CopyBoth was received. The specifics of the result - * were already logged, but it may be useful to provide an error message indicating which - * safekeeper messed up. + + /* + * Any success result other than a single CopyBoth was received. The + * specifics of the result were already logged, but it may be useful to + * provide an error message indicating which safekeeper messed up. * - * Do not expect PQerrorMessage to be appropriately set. */ + * Do not expect PQerrorMessage to be appropriately set. + */ WP_EXEC_UNEXPECTED_SUCCESS, - /* No result available at this time. Wait until read-ready, then call again. Internally, this is - * returned when PQisBusy indicates that PQgetResult would block. */ + + /* + * No result available at this time. Wait until read-ready, then call + * again. Internally, this is returned when PQisBusy indicates that + * PQgetResult would block. + */ WP_EXEC_NEEDS_INPUT, /* Catch-all failure. Check PQerrorMessage. */ WP_EXEC_FAILED, -} WalProposerExecStatusType; +} WalProposerExecStatusType; /* Re-exported ConnStatusType */ typedef enum @@ -414,40 +442,39 @@ typedef enum WP_CONNECTION_BAD, /* - * The original ConnStatusType has many more tags, but requests that - * they not be relied upon (except for displaying to the user). We - * don't need that extra functionality, so we collect them into a - * single tag here. + * The original ConnStatusType has many more tags, but requests that they + * not be relied upon (except for displaying to the user). We don't need + * that extra functionality, so we collect them into a single tag here. 
*/ WP_CONNECTION_IN_PROGRESS, -} WalProposerConnStatusType; +} WalProposerConnStatusType; /* Re-exported PQerrorMessage */ -typedef char* (*walprop_error_message_fn) (WalProposerConn* conn); +typedef char *(*walprop_error_message_fn) (WalProposerConn * conn); /* Re-exported PQstatus */ -typedef WalProposerConnStatusType (*walprop_status_fn) (WalProposerConn* conn); +typedef WalProposerConnStatusType(*walprop_status_fn) (WalProposerConn * conn); /* Re-exported PQconnectStart */ -typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo); +typedef WalProposerConn * (*walprop_connect_start_fn) (char *conninfo); /* Re-exported PQconectPoll */ -typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn); +typedef WalProposerConnectPollStatusType(*walprop_connect_poll_fn) (WalProposerConn * conn); /* Blocking wrapper around PQsendQuery */ -typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query); +typedef bool (*walprop_send_query_fn) (WalProposerConn * conn, char *query); /* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */ -typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn); +typedef WalProposerExecStatusType(*walprop_get_query_result_fn) (WalProposerConn * conn); /* Re-exported PQsocket */ -typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn); +typedef pgsocket (*walprop_socket_fn) (WalProposerConn * conn); /* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */ -typedef int (*walprop_flush_fn) (WalProposerConn* conn); +typedef int (*walprop_flush_fn) (WalProposerConn * conn); /* Re-exported PQfinish */ -typedef void (*walprop_finish_fn) (WalProposerConn* conn); +typedef void (*walprop_finish_fn) (WalProposerConn * conn); /* * Ergonomic wrapper around PGgetCopyData @@ -463,9 +490,9 @@ typedef void (*walprop_finish_fn) (WalProposerConn* conn); * performs a bit of extra checking work that's always required and is normally * somewhat verbose. */ -typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, - char** buf, - int* amount); +typedef PGAsyncReadResult(*walprop_async_read_fn) (WalProposerConn * conn, + char **buf, + int *amount); /* * Ergonomic wrapper around PQputCopyData + PQflush @@ -474,33 +501,33 @@ typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, * * For information on the meaning of return codes, refer to PGAsyncWriteResult. */ -typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn, - void const* buf, - size_t size); +typedef PGAsyncWriteResult(*walprop_async_write_fn) (WalProposerConn * conn, + void const *buf, + size_t size); /* * Blocking equivalent to walprop_async_write_fn * * Returns 'true' if successful, 'false' on failure. */ -typedef bool (*walprop_blocking_write_fn) (WalProposerConn* conn, void const* buf, size_t size); +typedef bool (*walprop_blocking_write_fn) (WalProposerConn * conn, void const *buf, size_t size); /* All libpqwalproposer exported functions collected together. 
*/ typedef struct WalProposerFunctionsType { - walprop_error_message_fn walprop_error_message; - walprop_status_fn walprop_status; - walprop_connect_start_fn walprop_connect_start; - walprop_connect_poll_fn walprop_connect_poll; - walprop_send_query_fn walprop_send_query; - walprop_get_query_result_fn walprop_get_query_result; - walprop_socket_fn walprop_socket; - walprop_flush_fn walprop_flush; - walprop_finish_fn walprop_finish; - walprop_async_read_fn walprop_async_read; - walprop_async_write_fn walprop_async_write; - walprop_blocking_write_fn walprop_blocking_write; -} WalProposerFunctionsType; + walprop_error_message_fn walprop_error_message; + walprop_status_fn walprop_status; + walprop_connect_start_fn walprop_connect_start; + walprop_connect_poll_fn walprop_connect_poll; + walprop_send_query_fn walprop_send_query; + walprop_get_query_result_fn walprop_get_query_result; + walprop_socket_fn walprop_socket; + walprop_flush_fn walprop_flush; + walprop_finish_fn walprop_finish; + walprop_async_read_fn walprop_async_read; + walprop_async_write_fn walprop_async_write; + walprop_blocking_write_fn walprop_blocking_write; +} WalProposerFunctionsType; /* Allow the above functions to be "called" with normal syntax */ #define walprop_error_message(conn) \ @@ -536,8 +563,8 @@ typedef struct WalProposerFunctionsType * This pointer is set by the initializer in libpqwalproposer, so that we * can use it later. */ -extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions; +extern PGDLLIMPORT WalProposerFunctionsType * WalProposerFunctions; extern uint64 BackpressureThrottlingTime(void); -#endif /* __NEON_WALPROPOSER_H__ */ +#endif /* __NEON_WALPROPOSER_H__ */ diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c index 417a8c4586..e1dcaa081d 100644 --- a/pgxn/neon/walproposer_utils.c +++ b/pgxn/neon/walproposer_utils.c @@ -127,10 +127,10 @@ CompareLsn(const void *a, const void *b) * * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); */ -char* +char * FormatSafekeeperState(SafekeeperState state) { - char* return_val = NULL; + char *return_val = NULL; switch (state) { @@ -171,27 +171,30 @@ FormatSafekeeperState(SafekeeperState state) /* Asserts that the provided events are expected for given safekeeper's state */ void -AssertEventsOkForState(uint32 events, Safekeeper* sk) +AssertEventsOkForState(uint32 events, Safekeeper *sk) { - uint32 expected = SafekeeperStateDesiredEvents(sk->state); + uint32 expected = SafekeeperStateDesiredEvents(sk->state); - /* The events are in-line with what we're expecting, under two conditions: - * (a) if we aren't expecting anything, `events` has no read- or - * write-ready component. - * (b) if we are expecting something, there's overlap - * (i.e. `events & expected != 0`) + /* + * The events are in-line with what we're expecting, under two conditions: + * (a) if we aren't expecting anything, `events` has no read- or + * write-ready component. (b) if we are expecting something, there's + * overlap (i.e. 
`events & expected != 0`) */ - bool events_ok_for_state; /* long name so the `Assert` is more clear later */ + bool events_ok_for_state; /* long name so the `Assert` is more + * clear later */ if (expected == WL_NO_EVENTS) - events_ok_for_state = ((events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) == 0); + events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0); else events_ok_for_state = ((events & expected) != 0); if (!events_ok_for_state) { - /* To give a descriptive message in the case of failure, we use elog and - * then an assertion that's guaranteed to fail. */ + /* + * To give a descriptive message in the case of failure, we use elog + * and then an assertion that's guaranteed to fail. + */ elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); Assert(events_ok_for_state); @@ -204,12 +207,12 @@ AssertEventsOkForState(uint32 events, Safekeeper* sk) uint32 SafekeeperStateDesiredEvents(SafekeeperState state) { - uint32 result = WL_NO_EVENTS; + uint32 result = WL_NO_EVENTS; /* If the state doesn't have a modifier, we can check the base state */ switch (state) { - /* Connecting states say what they want in the name */ + /* Connecting states say what they want in the name */ case SS_CONNECTING_READ: result = WL_SOCKET_READABLE; break; @@ -217,33 +220,35 @@ SafekeeperStateDesiredEvents(SafekeeperState state) result = WL_SOCKET_WRITEABLE; break; - /* Reading states need the socket to be read-ready to continue */ + /* Reading states need the socket to be read-ready to continue */ case SS_WAIT_EXEC_RESULT: case SS_HANDSHAKE_RECV: case SS_WAIT_VERDICT: result = WL_SOCKET_READABLE; break; - /* Idle states use read-readiness as a sign that the connection has been - * disconnected. */ + /* + * Idle states use read-readiness as a sign that the connection + * has been disconnected. + */ case SS_VOTING: case SS_IDLE: result = WL_SOCKET_READABLE; break; - /* - * Flush states require write-ready for flushing. - * Active state does both reading and writing. - * - * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should - * check sk->flushWrite here to set WL_SOCKET_WRITEABLE. - */ + /* + * Flush states require write-ready for flushing. Active state + * does both reading and writing. + * + * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We + * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE. + */ case SS_SEND_ELECTED_FLUSH: case SS_ACTIVE: result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; break; - /* The offline state expects no events. */ + /* The offline state expects no events. */ case SS_OFFLINE: result = WL_NO_EVENTS; break; @@ -263,27 +268,30 @@ SafekeeperStateDesiredEvents(SafekeeperState state) * * The string should not be freed. It should also not be expected to remain the same between * function calls. 
*/ -char* +char * FormatEvents(uint32 events) { static char return_str[8]; /* Helper variable to check if there's extra bits */ - uint32 all_flags = WL_LATCH_SET - | WL_SOCKET_READABLE - | WL_SOCKET_WRITEABLE - | WL_TIMEOUT - | WL_POSTMASTER_DEATH - | WL_EXIT_ON_PM_DEATH - | WL_SOCKET_CONNECTED; + uint32 all_flags = WL_LATCH_SET + | WL_SOCKET_READABLE + | WL_SOCKET_WRITEABLE + | WL_TIMEOUT + | WL_POSTMASTER_DEATH + | WL_EXIT_ON_PM_DEATH + | WL_SOCKET_CONNECTED; - /* The formatting here isn't supposed to be *particularly* useful -- it's just to give an - * sense of what events have been triggered without needing to remember your powers of two. */ + /* + * The formatting here isn't supposed to be *particularly* useful -- it's + * just to give an sense of what events have been triggered without + * needing to remember your powers of two. + */ - return_str[0] = (events & WL_LATCH_SET ) ? 'L' : '_'; - return_str[1] = (events & WL_SOCKET_READABLE ) ? 'R' : '_'; + return_str[0] = (events & WL_LATCH_SET) ? 'L' : '_'; + return_str[1] = (events & WL_SOCKET_READABLE) ? 'R' : '_'; return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_'; - return_str[3] = (events & WL_TIMEOUT ) ? 'T' : '_'; + return_str[3] = (events & WL_TIMEOUT) ? 'T' : '_'; return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_'; return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_'; return_str[5] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_'; @@ -291,7 +299,7 @@ FormatEvents(uint32 events) if (events & (~all_flags)) { elog(WARNING, "Event formatting found unexpected component %d", - events & (~all_flags)); + events & (~all_flags)); return_str[6] = '*'; return_str[7] = '\0'; } @@ -407,21 +415,21 @@ XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) if (walpropFile < 0) { - #if PG_VERSION_NUM >= 150000 - // FIXME Is it ok to use hardcoded value here? - TimeLineID tli = 1; - #else +#if PG_VERSION_NUM >= 150000 + /* FIXME Is it ok to use hardcoded value here? 
*/ + TimeLineID tli = 1; +#else bool use_existent = true; - #endif +#endif /* Create/use new log file */ XLByteToSeg(recptr, walpropSegNo, wal_segment_size); - #if PG_VERSION_NUM >= 150000 +#if PG_VERSION_NUM >= 150000 walpropFile = XLogFileInit(walpropSegNo, tli); walpropFileTLI = tli; - #else +#else walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); walpropFileTLI = ThisTimeLineID; - #endif +#endif } /* Calculate the start offset of the received logs */ @@ -483,6 +491,7 @@ XLogWalPropClose(XLogRecPtr recptr) if (close(walpropFile) != 0) { char xlogfname[MAXFNAMELEN]; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); ereport(PANIC, @@ -508,12 +517,12 @@ StartProposerReplication(StartReplicationCmd *cmd) XLogRecPtr FlushPtr; TimeLineID currTLI; - #if PG_VERSION_NUM < 150000 +#if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); - #endif + errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); +#endif /* create xlogreader for physical replication */ xlogreader = @@ -525,7 +534,7 @@ StartProposerReplication(StartReplicationCmd *cmd) if (!xlogreader) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); + errmsg("out of memory"))); /* * We assume here that we're logging enough information in the WAL for @@ -542,7 +551,7 @@ StartProposerReplication(StartReplicationCmd *cmd) if (SlotIsLogical(MyReplicationSlot)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cannot use a logical replication slot for physical replication"))); + errmsg("cannot use a logical replication slot for physical replication"))); /* * We don't need to verify the slot's restart_lsn here; instead we @@ -630,9 +639,9 @@ StartProposerReplication(StartReplicationCmd *cmd) (errmsg("requested starting point %X/%X on timeline %u is not in this server's history", LSN_FORMAT_ARGS(cmd->startpoint), cmd->timeline), - errdetail("This server's history forked from timeline %u at %X/%X.", - cmd->timeline, - LSN_FORMAT_ARGS(switchpoint)))); + errdetail("This server's history forked from timeline %u at %X/%X.", + cmd->timeline, + LSN_FORMAT_ARGS(switchpoint)))); } sendTimeLineValidUpto = switchpoint; } @@ -869,14 +878,14 @@ WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, errno = save_errno; ereport(ERROR, (errcode_for_file_access(), - errmsg("requested WAL segment %s has already been removed", - xlogfname))); + errmsg("requested WAL segment %s has already been removed", + xlogfname))); } else ereport(ERROR, (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - path))); + errmsg("could not open file \"%s\": %m", + path))); } @@ -943,7 +952,7 @@ XLogSendPhysical(void) XLogRecPtr startptr; XLogRecPtr endptr; Size nbytes PG_USED_FOR_ASSERTS_ONLY; - TimeLineID currTLI; + TimeLineID currTLI; /* If requested switch the WAL sender to the stopping state. */ if (got_STOPPING) @@ -1004,8 +1013,8 @@ XLogSendPhysical(void) { /* * Still a cascading standby. But is the timeline we're sending - * still the one recovery is recovering from? currTLI was - * updated by the GetStandbyFlushRecPtr() call above. + * still the one recovery is recovering from? currTLI was updated + * by the GetStandbyFlushRecPtr() call above. 
*/ if (sendTimeLine != currTLI) becameHistoric = true; @@ -1043,11 +1052,11 @@ XLogSendPhysical(void) * primary: if the primary subsequently crashes and restarts, standbys * must not have applied any WAL that got lost on the primary. */ - #if PG_VERSION_NUM >= 150000 +#if PG_VERSION_NUM >= 150000 SendRqstPtr = GetFlushRecPtr(NULL); - #else +#else SendRqstPtr = GetFlushRecPtr(); - #endif +#endif } /* @@ -1180,4 +1189,3 @@ XLogSendPhysical(void) set_ps_display(activitymsg); } } - diff --git a/pgxn/neon/walproposer_utils.h b/pgxn/neon/walproposer_utils.h index 4771d3ff82..aa5df5fa43 100644 --- a/pgxn/neon/walproposer_utils.h +++ b/pgxn/neon/walproposer_utils.h @@ -3,17 +3,17 @@ #include "walproposer.h" -int CompareLsn(const void *a, const void *b); -char* FormatSafekeeperState(SafekeeperState state); -void AssertEventsOkForState(uint32 events, Safekeeper* sk); -uint32 SafekeeperStateDesiredEvents(SafekeeperState state); -char* FormatEvents(uint32 events); -bool HexDecodeString(uint8 *result, char *input, int nbytes); -uint32 pq_getmsgint32_le(StringInfo msg); -uint64 pq_getmsgint64_le(StringInfo msg); -void pq_sendint32_le(StringInfo buf, uint32 i); -void pq_sendint64_le(StringInfo buf, uint64 i); -void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); -void XLogWalPropClose(XLogRecPtr recptr); +int CompareLsn(const void *a, const void *b); +char *FormatSafekeeperState(SafekeeperState state); +void AssertEventsOkForState(uint32 events, Safekeeper *sk); +uint32 SafekeeperStateDesiredEvents(SafekeeperState state); +char *FormatEvents(uint32 events); +bool HexDecodeString(uint8 *result, char *input, int nbytes); +uint32 pq_getmsgint32_le(StringInfo msg); +uint64 pq_getmsgint64_le(StringInfo msg); +void pq_sendint32_le(StringInfo buf, uint32 i); +void pq_sendint64_le(StringInfo buf, uint64 i); +void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); +void XLogWalPropClose(XLogRecPtr recptr); -#endif /* __NEON_WALPROPOSER_UTILS_H__ */ +#endif /* __NEON_WALPROPOSER_UTILS_H__ */ diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 3e30065cd3..07bd7bdd28 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -39,8 +39,8 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); * Linkage to functions in zenith module. * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c */ -typedef void (*zenith_read_at_lsn_type)(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); +typedef void (*zenith_read_at_lsn_type) (RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; @@ -136,8 +136,8 @@ clear_buffer_cache(PG_FUNCTION_ARGS) /* * Pin the buffer, and release it again. Because we have - * zenith_test_evict==true, this will evict the page from - * the buffer cache if no one else is holding a pin on it. + * zenith_test_evict==true, this will evict the page from the + * buffer cache if no one else is holding a pin on it. */ if (isvalid) { @@ -177,8 +177,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) text *forkname; uint32 blkno; - bool request_latest = PG_ARGISNULL(3); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + bool request_latest = PG_ARGISNULL(3); + uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(3); if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) PG_RETURN_NULL(); @@ -262,7 +262,7 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to use raw page functions"))); + errmsg("must be superuser to use raw page functions"))); if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || PG_ARGISNULL(3) || PG_ARGISNULL(4)) @@ -271,19 +271,20 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) { RelFileNode rnode = { .spcNode = PG_GETARG_OID(0), - .dbNode = PG_GETARG_OID(1), + .dbNode = PG_GETARG_OID(1), .relNode = PG_GETARG_OID(2) }; - ForkNumber forknum = PG_GETARG_UINT32(3); + ForkNumber forknum = PG_GETARG_UINT32(3); - uint32 blkno = PG_GETARG_UINT32(4); - bool request_latest = PG_ARGISNULL(5); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + uint32 blkno = PG_GETARG_UINT32(4); + bool request_latest = PG_ARGISNULL(5); + uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5); /* Initialize buffer to copy to */ - bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); @@ -298,7 +299,8 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) Datum neon_xlogflush(PG_FUNCTION_ARGS) { - XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogFlush(lsn); PG_RETURN_VOID(); } diff --git a/pgxn/typedefs.list b/pgxn/typedefs.list new file mode 100644 index 0000000000..760f384212 --- /dev/null +++ b/pgxn/typedefs.list @@ -0,0 +1,3776 @@ +ACCESS_ALLOWED_ACE +ACL +ACL_SIZE_INFORMATION +AFFIX +ASN1_INTEGER +ASN1_OBJECT +ASN1_STRING +AV +A_ArrayExpr +A_Const +A_Expr +A_Expr_Kind +A_Indices +A_Indirection +A_Star +AbsoluteTime +AccessMethodInfo +AccessPriv +Acl +AclItem +AclMaskHow +AclMode +AclResult +AcquireSampleRowsFunc +ActionList +ActiveSnapshotElt +AddForeignUpdateTargets_function +AffixNode +AffixNodeData +AfterTriggerEvent +AfterTriggerEventChunk +AfterTriggerEventData +AfterTriggerEventList +AfterTriggerShared +AfterTriggerSharedData +AfterTriggersData +AfterTriggersQueryData +AfterTriggersTableData +AfterTriggersTransData +Agg +AggClauseCosts +AggInfo +AggPath +AggSplit +AggState +AggStatePerAgg +AggStatePerGroup +AggStatePerHash +AggStatePerPhase +AggStatePerTrans +AggStrategy +AggTransInfo +Aggref +AggregateInstrumentation +AlenState +Alias +AllocBlock +AllocChunk +AllocPointer +AllocSet +AllocSetContext +AllocSetFreeList +AllocateDesc +AllocateDescKind +AlterCollationStmt +AlterDatabaseSetStmt +AlterDatabaseStmt +AlterDefaultPrivilegesStmt +AlterDomainStmt +AlterEnumStmt +AlterEventTrigStmt +AlterExtensionContentsStmt +AlterExtensionStmt +AlterFdwStmt +AlterForeignServerStmt +AlterFunctionStmt +AlterObjectDependsStmt +AlterObjectSchemaStmt +AlterOpFamilyStmt +AlterOperatorStmt +AlterOwnerStmt +AlterPolicyStmt +AlterPublicationStmt +AlterRoleSetStmt +AlterRoleStmt +AlterSeqStmt +AlterStatsStmt +AlterSubscriptionStmt +AlterSubscriptionType +AlterSystemStmt +AlterTSConfigType +AlterTSConfigurationStmt +AlterTSDictionaryStmt +AlterTableCmd +AlterTableMoveAllStmt +AlterTableSpaceOptionsStmt +AlterTableStmt +AlterTableType +AlterTableUtilityContext +AlterTypeRecurseParams +AlterTypeStmt +AlterUserMappingStmt +AlteredTableInfo +AlternativeSubPlan +AmcheckOptions +AnalyzeAttrComputeStatsFunc +AnalyzeAttrFetchFunc +AnalyzeForeignTable_function +AnlExprData 
+AnlIndexData +AnyArrayType +Append +AppendPath +AppendRelInfo +AppendState +ApplyExecutionData +ApplySubXactData +Archive +ArchiveEntryPtrType +ArchiveFormat +ArchiveHandle +ArchiveMode +ArchiveOpts +ArchiverOutput +ArchiverStage +ArrayAnalyzeExtraData +ArrayBuildState +ArrayBuildStateAny +ArrayBuildStateArr +ArrayCoerceExpr +ArrayConstIterState +ArrayExpr +ArrayExprIterState +ArrayIOData +ArrayIterator +ArrayMapState +ArrayMetaState +ArrayParseState +ArraySubWorkspace +ArrayType +AsyncQueueControl +AsyncQueueEntry +AsyncRequest +AttInMetadata +AttStatsSlot +AttoptCacheEntry +AttoptCacheKey +AttrDefInfo +AttrDefault +AttrMap +AttrMissing +AttrNumber +AttributeOpts +AuthRequest +AutoPrewarmSharedState +AutoVacOpts +AutoVacuumShmemStruct +AutoVacuumWorkItem +AutoVacuumWorkItemType +AuxProcType +BF_ctx +BF_key +BF_word +BF_word_signed +BIGNUM +BIO +BIO_METHOD +BITVECP +BMS_Comparison +BMS_Membership +BN_CTX +BOOL +BOOLEAN +BOX +BTArrayKeyInfo +BTBuildState +BTCycleId +BTDedupInterval +BTDedupState +BTDedupStateData +BTDeletedPageData +BTIndexStat +BTInsertState +BTInsertStateData +BTLeader +BTMetaPageData +BTOneVacInfo +BTOptions +BTPS_State +BTPageOpaque +BTPageOpaqueData +BTPageStat +BTPageState +BTParallelScanDesc +BTPendingFSM +BTScanInsert +BTScanInsertData +BTScanOpaque +BTScanOpaqueData +BTScanPos +BTScanPosData +BTScanPosItem +BTShared +BTSortArrayContext +BTSpool +BTStack +BTStackData +BTVacInfo +BTVacState +BTVacuumPosting +BTVacuumPostingData +BTWriteState +BUF_MEM +BYTE +BY_HANDLE_FILE_INFORMATION +Backend +BackendId +BackendParameters +BackendState +BackendType +BackgroundWorker +BackgroundWorkerArray +BackgroundWorkerHandle +BackgroundWorkerSlot +Barrier +BaseBackupCmd +BeginDirectModify_function +BeginForeignInsert_function +BeginForeignModify_function +BeginForeignScan_function +BeginSampleScan_function +BernoulliSamplerData +BgWorkerStartTime +BgwHandleStatus +BinaryArithmFunc +BindParamCbData +BipartiteMatchState +BitmapAnd +BitmapAndPath +BitmapAndState +BitmapHeapPath +BitmapHeapScan +BitmapHeapScanState +BitmapIndexScan +BitmapIndexScanState +BitmapOr +BitmapOrPath +BitmapOrState +Bitmapset +BlobInfo +Block +BlockId +BlockIdData +BlockInfoRecord +BlockNumber +BlockSampler +BlockSamplerData +BlockedProcData +BlockedProcsData +BloomBuildState +BloomFilter +BloomMetaPageData +BloomOpaque +BloomOptions +BloomPageOpaque +BloomPageOpaqueData +BloomScanOpaque +BloomScanOpaqueData +BloomSignatureWord +BloomState +BloomTuple +BlowfishContext +BoolAggState +BoolExpr +BoolExprType +BoolTestType +BooleanTest +BpChar +BrinBuildState +BrinDesc +BrinMemTuple +BrinMetaPageData +BrinOpaque +BrinOpcInfo +BrinOptions +BrinRevmap +BrinSpecialSpace +BrinStatsData +BrinTuple +BrinValues +BtreeCheckState +BtreeLevel +Bucket +BufFile +Buffer +BufferAccessStrategy +BufferAccessStrategyType +BufferCachePagesContext +BufferCachePagesRec +BufferDesc +BufferDescPadded +BufferHeapTupleTableSlot +BufferLookupEnt +BufferStrategyControl +BufferTag +BufferUsage +BuildAccumulator +BuiltinScript +BulkInsertState +BulkInsertStateData +CACHESIGN +CAC_state +CCFastEqualFN +CCHashFN +CEOUC_WAIT_MODE +CFuncHashTabEntry +CHAR +CHECKPOINT +CHKVAL +CIRCLE +CMPDAffix +CONTEXT +COP +CRITICAL_SECTION +CRSSnapshotAction +CState +CTECycleClause +CTEMaterialize +CTESearchClause +CV +CachedExpression +CachedPlan +CachedPlanSource +CallContext +CallStmt +CancelRequestPacket +CaseExpr +CaseTestExpr +CaseWhen +Cash +CastInfo +CatCList +CatCTup +CatCache +CatCacheHeader +CatalogId +CatalogIndexState +ChangeVarNodes_context 
+CheckPoint +CheckPointStmt +CheckpointStatsData +CheckpointerRequest +CheckpointerShmemStruct +Chromosome +CkptSortItem +CkptTsStatus +ClientAuthentication_hook_type +ClientCertMode +ClientCertName +ClientData +ClonePtrType +ClosePortalStmt +ClosePtrType +Clump +ClusterInfo +ClusterParams +ClusterStmt +CmdType +CoalesceExpr +CoerceParamHook +CoerceToDomain +CoerceToDomainValue +CoerceViaIO +CoercionContext +CoercionForm +CoercionPathType +CollAliasData +CollInfo +CollateClause +CollateExpr +CollateStrength +CollectedATSubcmd +CollectedCommand +CollectedCommandType +ColorTrgm +ColorTrgmInfo +ColumnCompareData +ColumnDef +ColumnIOData +ColumnRef +ColumnsHashData +CombinationGenerator +ComboCidEntry +ComboCidEntryData +ComboCidKey +ComboCidKeyData +Command +CommandDest +CommandId +CommandTag +CommandTagBehavior +CommentItem +CommentStmt +CommitTimestampEntry +CommitTimestampShared +CommonEntry +CommonTableExpr +CompareScalarsContext +CompiledExprState +CompositeIOData +CompositeTypeStmt +CompoundAffixFlag +CompressionAlgorithm +CompressorState +ComputeXidHorizonsResult +ConditionVariable +ConditionVariableMinimallyPadded +ConditionalStack +ConfigData +ConfigVariable +ConnCacheEntry +ConnCacheKey +ConnParams +ConnStatusType +ConnType +ConnectionStateEnum +ConnsAllowedState +ConsiderSplitContext +Const +ConstrCheck +ConstrType +Constraint +ConstraintCategory +ConstraintInfo +ConstraintsSetStmt +ControlData +ControlFileData +ConvInfo +ConvProcInfo +ConversionLocation +ConvertRowtypeExpr +CookedConstraint +CopyDest +CopyFormatOptions +CopyFromState +CopyFromStateData +CopyInsertMethod +CopyMultiInsertBuffer +CopyMultiInsertInfo +CopySource +CopyStmt +CopyToState +CopyToStateData +Cost +CostSelector +Counters +CoverExt +CoverPos +CreateAmStmt +CreateCastStmt +CreateConversionStmt +CreateDomainStmt +CreateEnumStmt +CreateEventTrigStmt +CreateExtensionStmt +CreateFdwStmt +CreateForeignServerStmt +CreateForeignTableStmt +CreateFunctionStmt +CreateOpClassItem +CreateOpClassStmt +CreateOpFamilyStmt +CreatePLangStmt +CreatePolicyStmt +CreatePublicationStmt +CreateRangeStmt +CreateReplicationSlotCmd +CreateRoleStmt +CreateSchemaStmt +CreateSchemaStmtContext +CreateSeqStmt +CreateStatsStmt +CreateStmt +CreateStmtContext +CreateSubscriptionStmt +CreateTableAsStmt +CreateTableSpaceStmt +CreateTransformStmt +CreateTrigStmt +CreateUserMappingStmt +CreatedbStmt +CredHandle +CteItem +CteScan +CteScanState +CteState +CtlCommand +CtxtHandle +CurrentOfExpr +CustomExecMethods +CustomOutPtrType +CustomPath +CustomScan +CustomScanMethods +CustomScanState +CycleCtr +DBState +DCHCacheEntry +DEADLOCK_INFO +DECountItem +DH +DIR +DNSServiceErrorType +DNSServiceRef +DR_copy +DR_intorel +DR_printtup +DR_sqlfunction +DR_transientrel +DSA +DWORD +DataDumperPtr +DataPageDeleteStack +DatabaseInfo +DateADT +Datum +DatumTupleFields +DbInfo +DbInfoArr +DeClonePtrType +DeadLockState +DeallocateStmt +DeclareCursorStmt +DecodedBkpBlock +DecodingOutputState +DefElem +DefElemAction +DefaultACLInfo +DefineStmt +DeleteStmt +DependencyGenerator +DependencyGeneratorData +DependencyType +DestReceiver +DictISpell +DictInt +DictSimple +DictSnowball +DictSubState +DictSyn +DictThesaurus +DimensionInfo +DirectoryMethodData +DirectoryMethodFile +DisableTimeoutParams +DiscardMode +DiscardStmt +DistanceValue +DistinctExpr +DoStmt +DocRepresentation +DomainConstraintCache +DomainConstraintRef +DomainConstraintState +DomainConstraintType +DomainIOData +DropBehavior +DropOwnedStmt +DropReplicationSlotCmd +DropRoleStmt +DropStmt +DropSubscriptionStmt 
+DropTableSpaceStmt +DropUserMappingStmt +DropdbStmt +DumpComponents +DumpId +DumpOptions +DumpSignalInformation +DumpableObject +DumpableObjectType +DynamicFileList +DynamicZoneAbbrev +EC_KEY +EDGE +ENGINE +EOM_flatten_into_method +EOM_get_flat_size_method +EPQState +EPlan +EState +EVP_CIPHER +EVP_CIPHER_CTX +EVP_MD +EVP_MD_CTX +EVP_PKEY +EachState +Edge +EditableObjectType +ElementsState +EnableTimeoutParams +EndBlobPtrType +EndBlobsPtrType +EndDataPtrType +EndDirectModify_function +EndForeignInsert_function +EndForeignModify_function +EndForeignScan_function +EndSampleScan_function +EnumItem +EolType +EphemeralNameRelationType +EphemeralNamedRelation +EphemeralNamedRelationData +EphemeralNamedRelationMetadata +EphemeralNamedRelationMetadataData +EquivalenceClass +EquivalenceMember +ErrorContextCallback +ErrorData +EstimateDSMForeignScan_function +EstimationInfo +EventTriggerCacheEntry +EventTriggerCacheItem +EventTriggerCacheStateType +EventTriggerData +EventTriggerEvent +EventTriggerInfo +EventTriggerQueryState +ExceptionLabelMap +ExceptionMap +ExclusiveBackupState +ExecAuxRowMark +ExecEvalBoolSubroutine +ExecEvalSubroutine +ExecForeignBatchInsert_function +ExecForeignDelete_function +ExecForeignInsert_function +ExecForeignTruncate_function +ExecForeignUpdate_function +ExecParallelEstimateContext +ExecParallelInitializeDSMContext +ExecPhraseData +ExecProcNodeMtd +ExecRowMark +ExecScanAccessMtd +ExecScanRecheckMtd +ExecStatus +ExecStatusType +ExecuteStmt +ExecutorCheckPerms_hook_type +ExecutorEnd_hook_type +ExecutorFinish_hook_type +ExecutorRun_hook_type +ExecutorStart_hook_type +ExpandedArrayHeader +ExpandedObjectHeader +ExpandedObjectMethods +ExpandedRange +ExpandedRecordFieldInfo +ExpandedRecordHeader +ExplainDirectModify_function +ExplainForeignModify_function +ExplainForeignScan_function +ExplainFormat +ExplainOneQuery_hook_type +ExplainState +ExplainStmt +ExplainWorkersState +ExportedSnapshot +Expr +ExprContext +ExprContextCallbackFunction +ExprContext_CB +ExprDoneCond +ExprEvalOp +ExprEvalOpLookup +ExprEvalRowtypeCache +ExprEvalStep +ExprState +ExprStateEvalFunc +ExtensibleNode +ExtensibleNodeEntry +ExtensibleNodeMethods +ExtensionControlFile +ExtensionInfo +ExtensionMemberId +ExtensionVersionInfo +FDWCollateState +FD_SET +FILE +FILETIME +FILE_INFORMATION_CLASS +FILE_STANDARD_INFORMATION +FSMAddress +FSMPage +FSMPageData +FakeRelCacheEntry +FakeRelCacheEntryData +FastPathStrongRelationLockData +FdwInfo +FdwRoutine +FetchDirection +FetchStmt +FieldSelect +FieldStore +File +FileFdwExecutionState +FileFdwPlanState +FileNameMap +FileTag +FinalPathExtraData +FindColsContext +FindSplitData +FindSplitStrat +FixedParallelExecutorState +FixedParallelState +FixedParamState +FlagMode +FlushPosition +FmgrBuiltin +FmgrHookEventType +FmgrInfo +ForBothCellState +ForBothState +ForEachState +ForFiveState +ForFourState +ForThreeState +ForeignAsyncConfigureWait_function +ForeignAsyncNotify_function +ForeignAsyncRequest_function +ForeignDataWrapper +ForeignKeyCacheInfo +ForeignKeyOptInfo +ForeignPath +ForeignScan +ForeignScanState +ForeignServer +ForeignServerInfo +ForeignTable +ForeignTruncateInfo +ForkNumber +FormData_pg_aggregate +FormData_pg_am +FormData_pg_amop +FormData_pg_amproc +FormData_pg_attrdef +FormData_pg_attribute +FormData_pg_auth_members +FormData_pg_authid +FormData_pg_cast +FormData_pg_class +FormData_pg_collation +FormData_pg_constraint +FormData_pg_conversion +FormData_pg_database +FormData_pg_default_acl +FormData_pg_depend +FormData_pg_enum +FormData_pg_event_trigger 
+FormData_pg_extension +FormData_pg_foreign_data_wrapper +FormData_pg_foreign_server +FormData_pg_foreign_table +FormData_pg_index +FormData_pg_inherits +FormData_pg_language +FormData_pg_largeobject +FormData_pg_largeobject_metadata +FormData_pg_namespace +FormData_pg_opclass +FormData_pg_operator +FormData_pg_opfamily +FormData_pg_partitioned_table +FormData_pg_policy +FormData_pg_proc +FormData_pg_publication +FormData_pg_publication_rel +FormData_pg_range +FormData_pg_replication_origin +FormData_pg_rewrite +FormData_pg_sequence +FormData_pg_sequence_data +FormData_pg_shdepend +FormData_pg_statistic +FormData_pg_statistic_ext +FormData_pg_subscription +FormData_pg_subscription_rel +FormData_pg_tablespace +FormData_pg_transform +FormData_pg_trigger +FormData_pg_ts_config +FormData_pg_ts_config_map +FormData_pg_ts_dict +FormData_pg_ts_parser +FormData_pg_ts_template +FormData_pg_type +FormData_pg_user_mapping +Form_pg_aggregate +Form_pg_am +Form_pg_amop +Form_pg_amproc +Form_pg_attrdef +Form_pg_attribute +Form_pg_auth_members +Form_pg_authid +Form_pg_cast +Form_pg_class +Form_pg_collation +Form_pg_constraint +Form_pg_conversion +Form_pg_database +Form_pg_default_acl +Form_pg_depend +Form_pg_enum +Form_pg_event_trigger +Form_pg_extension +Form_pg_foreign_data_wrapper +Form_pg_foreign_server +Form_pg_foreign_table +Form_pg_index +Form_pg_inherits +Form_pg_language +Form_pg_largeobject +Form_pg_largeobject_metadata +Form_pg_namespace +Form_pg_opclass +Form_pg_operator +Form_pg_opfamily +Form_pg_partitioned_table +Form_pg_policy +Form_pg_proc +Form_pg_publication +Form_pg_publication_rel +Form_pg_range +Form_pg_replication_origin +Form_pg_rewrite +Form_pg_sequence +Form_pg_sequence_data +Form_pg_shdepend +Form_pg_statistic +Form_pg_statistic_ext +Form_pg_subscription +Form_pg_subscription_rel +Form_pg_tablespace +Form_pg_transform +Form_pg_trigger +Form_pg_ts_config +Form_pg_ts_config_map +Form_pg_ts_dict +Form_pg_ts_parser +Form_pg_ts_template +Form_pg_type +Form_pg_user_mapping +FormatNode +FreeBlockNumberArray +FreeListData +FreePageBtree +FreePageBtreeHeader +FreePageBtreeInternalKey +FreePageBtreeLeafKey +FreePageBtreeSearchResult +FreePageManager +FreePageSpanLeader +FromCharDateMode +FromExpr +FullTransactionId +FuncCall +FuncCallContext +FuncCandidateList +FuncDetailCode +FuncExpr +FuncInfo +FuncLookupError +FunctionCallInfo +FunctionCallInfoBaseData +FunctionParameter +FunctionParameterMode +FunctionScan +FunctionScanPerFuncState +FunctionScanState +FuzzyAttrMatchState +GBT_NUMKEY +GBT_NUMKEY_R +GBT_VARKEY +GBT_VARKEY_R +GENERAL_NAME +GISTBuildBuffers +GISTBuildState +GISTDeletedPageContents +GISTENTRY +GISTInsertStack +GISTInsertState +GISTIntArrayBigOptions +GISTIntArrayOptions +GISTNodeBuffer +GISTNodeBufferPage +GISTPageOpaque +GISTPageOpaqueData +GISTPageSplitInfo +GISTSTATE +GISTScanOpaque +GISTScanOpaqueData +GISTSearchHeapItem +GISTSearchItem +GISTTYPE +GIST_SPLITVEC +GMReaderTupleBuffer +GV +Gather +GatherMerge +GatherMergePath +GatherMergeState +GatherPath +GatherState +Gene +GeneratePruningStepsContext +GenerationBlock +GenerationChunk +GenerationContext +GenerationPointer +GenericCosts +GenericXLogState +GeqoPrivateData +GetForeignJoinPaths_function +GetForeignModifyBatchSize_function +GetForeignPaths_function +GetForeignPlan_function +GetForeignRelSize_function +GetForeignRowMarkType_function +GetForeignUpperPaths_function +GetState +GiSTOptions +GinBtree +GinBtreeData +GinBtreeDataLeafInsertData +GinBtreeEntryInsertData +GinBtreeStack +GinBuildState +GinChkVal 
+GinEntries +GinEntryAccumulator +GinIndexStat +GinMetaPageData +GinNullCategory +GinOptions +GinPageOpaque +GinPageOpaqueData +GinPlaceToPageRC +GinPostingList +GinQualCounts +GinScanEntry +GinScanKey +GinScanOpaque +GinScanOpaqueData +GinState +GinStatsData +GinTernaryValue +GinTupleCollector +GinVacuumState +GistBuildMode +GistEntryVector +GistHstoreOptions +GistInetKey +GistNSN +GistOptBufferingMode +GistSortedBuildPageState +GistSplitUnion +GistSplitVector +GistTsVectorOptions +GistVacState +GlobalTransaction +GlobalVisState +GrantRoleStmt +GrantStmt +GrantTargetType +Group +GroupClause +GroupPath +GroupPathExtraData +GroupResultPath +GroupState +GroupVarInfo +GroupingFunc +GroupingSet +GroupingSetData +GroupingSetKind +GroupingSetsPath +GucAction +GucBoolAssignHook +GucBoolCheckHook +GucContext +GucEnumAssignHook +GucEnumCheckHook +GucIntAssignHook +GucIntCheckHook +GucRealAssignHook +GucRealCheckHook +GucShowHook +GucSource +GucStack +GucStackState +GucStringAssignHook +GucStringCheckHook +HANDLE +HASHACTION +HASHBUCKET +HASHCTL +HASHELEMENT +HASHHDR +HASHSEGMENT +HASH_SEQ_STATUS +HCRYPTPROV +HE +HEntry +HIST_ENTRY +HKEY +HLOCAL +HMAC_CTX +HMODULE +HOldEntry +HRESULT +HSParser +HSpool +HStore +HTAB +HTSV_Result +HV +Hash +HashAggBatch +HashAggSpill +HashAllocFunc +HashBuildState +HashCompareFunc +HashCopyFunc +HashIndexStat +HashInstrumentation +HashJoin +HashJoinState +HashJoinTable +HashJoinTuple +HashMemoryChunk +HashMetaPage +HashMetaPageData +HashOptions +HashPageOpaque +HashPageOpaqueData +HashPageStat +HashPath +HashScanOpaque +HashScanOpaqueData +HashScanPosData +HashScanPosItem +HashSkewBucket +HashState +HashTapeInfo +HashValueFunc +HbaLine +HbaToken +HeadlineJsonState +HeadlineParsedText +HeadlineWordEntry +HeapCheckContext +HeapScanDesc +HeapTuple +HeapTupleData +HeapTupleFields +HeapTupleForceOption +HeapTupleHeader +HeapTupleHeaderData +HeapTupleTableSlot +HistControl +HotStandbyState +I32 +ICU_Convert_Func +ID +INFIX +INT128 +INTERFACE_INFO +IOFuncSelector +IO_STATUS_BLOCK +IPCompareMethod +ITEM +IV +IdentLine +IdentifierLookup +IdentifySystemCmd +IfStackElem +ImportForeignSchemaStmt +ImportForeignSchemaType +ImportForeignSchema_function +ImportQual +InProgressEnt +IncludeWal +InclusionOpaque +IncrementVarSublevelsUp_context +IncrementalSort +IncrementalSortExecutionStatus +IncrementalSortGroupInfo +IncrementalSortInfo +IncrementalSortPath +IncrementalSortState +Index +IndexAMProperty +IndexAmRoutine +IndexArrayKeyInfo +IndexAttachInfo +IndexAttrBitmapKind +IndexBuildCallback +IndexBuildResult +IndexBulkDeleteCallback +IndexBulkDeleteResult +IndexClause +IndexClauseSet +IndexDeleteCounts +IndexDeletePrefetchState +IndexElem +IndexFetchHeapData +IndexFetchTableData +IndexInfo +IndexList +IndexOnlyScan +IndexOnlyScanState +IndexOptInfo +IndexOrderByDistance +IndexPath +IndexRuntimeKeyInfo +IndexScan +IndexScanDesc +IndexScanState +IndexStateFlagsAction +IndexStmt +IndexTuple +IndexTupleData +IndexUniqueCheck +IndexVacuumInfo +IndxInfo +InferClause +InferenceElem +InfoItem +InhInfo +InheritableSocket +InitSampleScan_function +InitializeDSMForeignScan_function +InitializeWorkerForeignScan_function +InlineCodeBlock +InsertStmt +Instrumentation +Int128AggState +Int8TransTypeData +IntRBTreeNode +IntegerSet +InternalDefaultACL +InternalGrant +Interval +IntoClause +InvalidationChunk +InvalidationListHeader +IpcMemoryId +IpcMemoryKey +IpcMemoryState +IpcSemaphoreId +IpcSemaphoreKey +IsForeignPathAsyncCapable_function +IsForeignRelUpdatable_function 
+IsForeignScanParallelSafe_function +IsoConnInfo +IspellDict +Item +ItemId +ItemIdData +ItemPointer +ItemPointerData +IterateDirectModify_function +IterateForeignScan_function +IterateJsonStringValuesState +JEntry +JHashState +JOBOBJECTINFOCLASS +JOBOBJECT_BASIC_LIMIT_INFORMATION +JOBOBJECT_BASIC_UI_RESTRICTIONS +JOBOBJECT_SECURITY_LIMIT_INFORMATION +JitContext +JitInstrumentation +JitProviderCallbacks +JitProviderCompileExprCB +JitProviderInit +JitProviderReleaseContextCB +JitProviderResetAfterErrorCB +Join +JoinCostWorkspace +JoinExpr +JoinHashEntry +JoinPath +JoinPathExtraData +JoinState +JoinType +JsObject +JsValue +JsonAggState +JsonBaseObjectInfo +JsonHashEntry +JsonIterateStringValuesAction +JsonLexContext +JsonLikeRegexContext +JsonManifestFileField +JsonManifestParseContext +JsonManifestParseState +JsonManifestSemanticState +JsonManifestWALRangeField +JsonParseContext +JsonParseErrorType +JsonPath +JsonPathBool +JsonPathExecContext +JsonPathExecResult +JsonPathGinAddPathItemFunc +JsonPathGinContext +JsonPathGinExtractNodesFunc +JsonPathGinNode +JsonPathGinNodeType +JsonPathGinPath +JsonPathGinPathItem +JsonPathItem +JsonPathItemType +JsonPathKeyword +JsonPathParseItem +JsonPathParseResult +JsonPathPredicateCallback +JsonPathString +JsonSemAction +JsonTokenType +JsonTransformStringValuesAction +JsonTypeCategory +JsonValueList +JsonValueListIterator +Jsonb +JsonbAggState +JsonbContainer +JsonbInState +JsonbIterState +JsonbIterator +JsonbIteratorToken +JsonbPair +JsonbParseState +JsonbSubWorkspace +JsonbTypeCategory +JsonbValue +JumbleState +JunkFilter +KeyArray +KeySuffix +KeyWord +LARGE_INTEGER +LDAP +LDAPMessage +LDAPURLDesc +LDAP_TIMEVAL +LINE +LLVMAttributeRef +LLVMBasicBlockRef +LLVMBuilderRef +LLVMIntPredicate +LLVMJitContext +LLVMJitHandle +LLVMMemoryBufferRef +LLVMModuleRef +LLVMOrcJITStackRef +LLVMOrcModuleHandle +LLVMOrcTargetAddress +LLVMPassManagerBuilderRef +LLVMPassManagerRef +LLVMSharedModuleRef +LLVMTargetMachineRef +LLVMTargetRef +LLVMTypeRef +LLVMValueRef +LOCALLOCK +LOCALLOCKOWNER +LOCALLOCKTAG +LOCALPREDICATELOCK +LOCK +LOCKMASK +LOCKMETHODID +LOCKMODE +LOCKTAG +LONG +LONG_PTR +LOOP +LPBYTE +LPCTSTR +LPCWSTR +LPDWORD +LPSECURITY_ATTRIBUTES +LPSERVICE_STATUS +LPSTR +LPTHREAD_START_ROUTINE +LPTSTR +LPVOID +LPWSTR +LSEG +LUID +LVDeadTuples +LVPagePruneState +LVParallelState +LVRelState +LVSavedErrInfo +LVShared +LVSharedIndStats +LWLock +LWLockHandle +LWLockMode +LWLockPadded +LabelProvider +LagTracker +LargeObjectDesc +LastAttnumInfo +Latch +LerpFunc +LexDescr +LexemeEntry +LexemeHashKey +LexemeInfo +LexemeKey +LexizeData +LibraryInfo +Limit +LimitOption +LimitPath +LimitState +LimitStateCond +List +ListCell +ListDictionary +ListParsedLex +ListenAction +ListenActionKind +ListenStmt +LoadStmt +LocalBufferLookupEnt +LocalPgBackendStatus +LocalTransactionId +LocationIndex +LocationLen +LockAcquireResult +LockClauseStrength +LockData +LockInfoData +LockInstanceData +LockMethod +LockMethodData +LockRelId +LockRows +LockRowsPath +LockRowsState +LockStmt +LockTagType +LockTupleMode +LockViewRecurse_context +LockWaitPolicy +LockingClause +LogOpts +LogStmtLevel +LogicalDecodeBeginCB +LogicalDecodeBeginPrepareCB +LogicalDecodeChangeCB +LogicalDecodeCommitCB +LogicalDecodeCommitPreparedCB +LogicalDecodeFilterByOriginCB +LogicalDecodeFilterPrepareCB +LogicalDecodeMessageCB +LogicalDecodePrepareCB +LogicalDecodeRollbackPreparedCB +LogicalDecodeShutdownCB +LogicalDecodeStartupCB +LogicalDecodeStreamAbortCB +LogicalDecodeStreamChangeCB +LogicalDecodeStreamCommitCB 
+LogicalDecodeStreamMessageCB +LogicalDecodeStreamPrepareCB +LogicalDecodeStreamStartCB +LogicalDecodeStreamStopCB +LogicalDecodeStreamTruncateCB +LogicalDecodeTruncateCB +LogicalDecodingContext +LogicalErrorCallbackState +LogicalOutputPluginInit +LogicalOutputPluginWriterPrepareWrite +LogicalOutputPluginWriterUpdateProgress +LogicalOutputPluginWriterWrite +LogicalRepBeginData +LogicalRepCommitData +LogicalRepCtxStruct +LogicalRepMsgType +LogicalRepPartMapEntry +LogicalRepRelId +LogicalRepRelMapEntry +LogicalRepRelation +LogicalRepTupleData +LogicalRepTyp +LogicalRepWorker +LogicalRewriteMappingData +LogicalTape +LogicalTapeSet +LtreeGistOptions +LtreeSignature +MAGIC +MBuf +MCVItem +MCVList +MEMORY_BASIC_INFORMATION +MINIDUMPWRITEDUMP +MINIDUMP_TYPE +MJEvalResult +MTTargetRelLookup +MVDependencies +MVDependency +MVNDistinct +MVNDistinctItem +Material +MaterialPath +MaterialState +MdfdVec +Memoize +MemoizeEntry +MemoizeInstrumentation +MemoizeKey +MemoizePath +MemoizeState +MemoizeTuple +MemoryContext +MemoryContextCallback +MemoryContextCallbackFunction +MemoryContextCounters +MemoryContextData +MemoryContextMethods +MemoryStatsPrintFunc +MergeAppend +MergeAppendPath +MergeAppendState +MergeJoin +MergeJoinClause +MergeJoinState +MergePath +MergeScanSelCache +MetaCommand +MinMaxAggInfo +MinMaxAggPath +MinMaxExpr +MinMaxMultiOptions +MinMaxOp +MinimalTuple +MinimalTupleData +MinimalTupleTableSlot +MinmaxMultiOpaque +MinmaxOpaque +ModifyTable +ModifyTablePath +ModifyTableState +MorphOpaque +MsgType +MultiAssignRef +MultiSortSupport +MultiSortSupportData +MultiXactId +MultiXactMember +MultiXactOffset +MultiXactStateData +MultiXactStatus +MultirangeIOData +MultirangeParseState +MultirangeType +NDBOX +NODE +NTSTATUS +NUMCacheEntry +NUMDesc +NUMProc +NV +Name +NameData +NameHashEntry +NamedArgExpr +NamedLWLockTranche +NamedLWLockTrancheRequest +NamedTuplestoreScan +NamedTuplestoreScanState +NamespaceInfo +NestLoop +NestLoopParam +NestLoopState +NestPath +NewColumnValue +NewConstraint +NextSampleBlock_function +NextSampleTuple_function +NextValueExpr +Node +NodeTag +NonEmptyRange +Notification +NotificationHash +NotificationList +NotifyStmt +Nsrt +NullIfExpr +NullTest +NullTestType +NullableDatum +Numeric +NumericAggState +NumericDigit +NumericSortSupport +NumericSumAccum +NumericVar +OM_uint32 +OP +OSAPerGroupState +OSAPerQueryState +OSInfo +OSSLCipher +OSSLDigest +OVERLAPPED +ObjectAccessDrop +ObjectAccessNamespaceSearch +ObjectAccessPostAlter +ObjectAccessPostCreate +ObjectAccessType +ObjectAddress +ObjectAddressAndFlags +ObjectAddressExtra +ObjectAddressStack +ObjectAddresses +ObjectClass +ObjectPropertyType +ObjectType +ObjectWithArgs +Offset +OffsetNumber +OffsetVarNodes_context +Oid +OidOptions +OkeysState +OldSnapshotControlData +OldSnapshotTimeMapping +OldToNewMapping +OldToNewMappingData +OnCommitAction +OnCommitItem +OnConflictAction +OnConflictClause +OnConflictExpr +OnConflictSetState +OpBtreeInterpretation +OpClassCacheEnt +OpExpr +OpFamilyMember +OpFamilyOpFuncGroup +OpclassInfo +Operator +OperatorElement +OpfamilyInfo +OprCacheEntry +OprCacheKey +OprInfo +OprProofCacheEntry +OprProofCacheKey +OutputContext +OutputPluginCallbacks +OutputPluginOptions +OutputPluginOutputType +OverrideSearchPath +OverrideStackEntry +OverridingKind +PACE_HEADER +PACL +PATH +PBOOL +PCtxtHandle +PFN +PFN_NTQUERYINFORMATIONFILE +PGAlignedBlock +PGAlignedXLogBlock +PGAsyncStatusType +PGCALL2 +PGChecksummablePage +PGContextVisibility +PGEvent +PGEventConnDestroy +PGEventConnReset +PGEventId +PGEventProc 
+PGEventRegister +PGEventResultCopy +PGEventResultCreate +PGEventResultDestroy +PGFInfoFunction +PGFileType +PGFunction +PGLZ_HistEntry +PGLZ_Strategy +PGMessageField +PGModuleMagicFunction +PGNoticeHooks +PGOutputData +PGPROC +PGP_CFB +PGP_Context +PGP_MPI +PGP_PubKey +PGP_S2K +PGPing +PGQueryClass +PGRUsage +PGSemaphore +PGSemaphoreData +PGShmemHeader +PGTargetServerType +PGTernaryBool +PGTransactionStatusType +PGVerbosity +PG_Locale_Strategy +PG_Lock_Status +PG_init_t +PGcancel +PGcmdQueueEntry +PGconn +PGdataValue +PGlobjfuncs +PGnotify +PGpipelineStatus +PGresAttDesc +PGresAttValue +PGresParamDesc +PGresult +PGresult_data +PHANDLE +PIO_STATUS_BLOCK +PLAINTREE +PLAssignStmt +PLUID_AND_ATTRIBUTES +PLcword +PLpgSQL_case_when +PLpgSQL_condition +PLpgSQL_datum +PLpgSQL_datum_type +PLpgSQL_diag_item +PLpgSQL_exception +PLpgSQL_exception_block +PLpgSQL_execstate +PLpgSQL_expr +PLpgSQL_func_hashkey +PLpgSQL_function +PLpgSQL_getdiag_kind +PLpgSQL_if_elsif +PLpgSQL_label_type +PLpgSQL_nsitem +PLpgSQL_nsitem_type +PLpgSQL_plugin +PLpgSQL_promise_type +PLpgSQL_raise_option +PLpgSQL_raise_option_type +PLpgSQL_rec +PLpgSQL_recfield +PLpgSQL_resolve_option +PLpgSQL_row +PLpgSQL_stmt +PLpgSQL_stmt_assert +PLpgSQL_stmt_assign +PLpgSQL_stmt_block +PLpgSQL_stmt_call +PLpgSQL_stmt_case +PLpgSQL_stmt_close +PLpgSQL_stmt_commit +PLpgSQL_stmt_dynexecute +PLpgSQL_stmt_dynfors +PLpgSQL_stmt_execsql +PLpgSQL_stmt_exit +PLpgSQL_stmt_fetch +PLpgSQL_stmt_forc +PLpgSQL_stmt_foreach_a +PLpgSQL_stmt_fori +PLpgSQL_stmt_forq +PLpgSQL_stmt_fors +PLpgSQL_stmt_getdiag +PLpgSQL_stmt_if +PLpgSQL_stmt_loop +PLpgSQL_stmt_open +PLpgSQL_stmt_perform +PLpgSQL_stmt_raise +PLpgSQL_stmt_return +PLpgSQL_stmt_return_next +PLpgSQL_stmt_return_query +PLpgSQL_stmt_rollback +PLpgSQL_stmt_type +PLpgSQL_stmt_while +PLpgSQL_trigtype +PLpgSQL_type +PLpgSQL_type_type +PLpgSQL_var +PLpgSQL_variable +PLwdatum +PLword +PLyArrayToOb +PLyCursorObject +PLyDatumToOb +PLyDatumToObFunc +PLyExceptionEntry +PLyExecutionContext +PLyObToArray +PLyObToDatum +PLyObToDatumFunc +PLyObToDomain +PLyObToScalar +PLyObToTransform +PLyObToTuple +PLyObject_AsString_t +PLyPlanObject +PLyProcedure +PLyProcedureEntry +PLyProcedureKey +PLyResultObject +PLySRFState +PLySavedArgs +PLyScalarToOb +PLySubtransactionData +PLySubtransactionObject +PLyTransformToOb +PLyTupleToOb +PLyUnicode_FromStringAndSize_t +PLy_elog_impl_t +PMINIDUMP_CALLBACK_INFORMATION +PMINIDUMP_EXCEPTION_INFORMATION +PMINIDUMP_USER_STREAM_INFORMATION +PMSignalData +PMSignalReason +PMState +POLYGON +PQArgBlock +PQEnvironmentOption +PQExpBuffer +PQExpBufferData +PQcommMethods +PQconninfoOption +PQnoticeProcessor +PQnoticeReceiver +PQprintOpt +PQsslKeyPassHook_OpenSSL_type +PREDICATELOCK +PREDICATELOCKTAG +PREDICATELOCKTARGET +PREDICATELOCKTARGETTAG +PROCESS_INFORMATION +PROCLOCK +PROCLOCKTAG +PROC_HDR +PROC_QUEUE +PSID +PSID_AND_ATTRIBUTES +PSQL_COMP_CASE +PSQL_ECHO +PSQL_ECHO_HIDDEN +PSQL_ERROR_ROLLBACK +PTEntryArray +PTIterationArray +PTOKEN_PRIVILEGES +PTOKEN_USER +PUTENVPROC +PVOID +PX_Alias +PX_Cipher +PX_Combo +PX_HMAC +PX_MD +Page +PageData +PageGistNSN +PageHeader +PageHeaderData +PageXLogRecPtr +PagetableEntry +Pairs +ParallelAppendState +ParallelBitmapHeapState +ParallelBlockTableScanDesc +ParallelBlockTableScanWorker +ParallelBlockTableScanWorkerData +ParallelCompletionPtr +ParallelContext +ParallelExecutorInfo +ParallelHashGrowth +ParallelHashJoinBatch +ParallelHashJoinBatchAccessor +ParallelHashJoinState +ParallelIndexScanDesc +ParallelReadyList +ParallelSlot +ParallelSlotArray 
+ParallelSlotResultHandler +ParallelState +ParallelTableScanDesc +ParallelTableScanDescData +ParallelWorkerContext +ParallelWorkerInfo +Param +ParamCompileHook +ParamExecData +ParamExternData +ParamFetchHook +ParamKind +ParamListInfo +ParamPathInfo +ParamRef +ParamsErrorCbData +ParentMapEntry +ParseCallbackState +ParseExprKind +ParseNamespaceColumn +ParseNamespaceItem +ParseParamRefHook +ParseState +ParsedLex +ParsedScript +ParsedText +ParsedWord +ParserSetupHook +ParserState +PartClauseInfo +PartClauseMatchStatus +PartClauseTarget +PartitionBoundInfo +PartitionBoundInfoData +PartitionBoundSpec +PartitionCmd +PartitionDesc +PartitionDescData +PartitionDirectory +PartitionDirectoryEntry +PartitionDispatch +PartitionElem +PartitionHashBound +PartitionKey +PartitionListValue +PartitionMap +PartitionPruneCombineOp +PartitionPruneContext +PartitionPruneInfo +PartitionPruneState +PartitionPruneStep +PartitionPruneStepCombine +PartitionPruneStepOp +PartitionPruningData +PartitionRangeBound +PartitionRangeDatum +PartitionRangeDatumKind +PartitionScheme +PartitionSpec +PartitionTupleRouting +PartitionedRelPruneInfo +PartitionedRelPruningData +PartitionwiseAggregateType +PasswordType +Path +PathClauseUsage +PathCostComparison +PathHashStack +PathKey +PathKeysComparison +PathTarget +PatternInfo +PatternInfoArray +Pattern_Prefix_Status +Pattern_Type +PendingFsyncEntry +PendingRelDelete +PendingRelSync +PendingUnlinkEntry +PendingWriteback +PerlInterpreter +Perl_check_t +Perl_ppaddr_t +Permutation +PermutationStep +PermutationStepBlocker +PermutationStepBlockerType +PgArchData +PgBackendGSSStatus +PgBackendSSLStatus +PgBackendStatus +PgBenchExpr +PgBenchExprLink +PgBenchExprList +PgBenchExprType +PgBenchFunction +PgBenchValue +PgBenchValueType +PgChecksumMode +PgFdwAnalyzeState +PgFdwConnState +PgFdwDirectModifyState +PgFdwModifyState +PgFdwOption +PgFdwPathExtraData +PgFdwRelationInfo +PgFdwScanState +PgIfAddrCallback +PgStat_ArchiverStats +PgStat_BackendFunctionEntry +PgStat_Counter +PgStat_FunctionCallUsage +PgStat_FunctionCounts +PgStat_FunctionEntry +PgStat_GlobalStats +PgStat_Msg +PgStat_MsgAnalyze +PgStat_MsgAnlAncestors +PgStat_MsgArchiver +PgStat_MsgAutovacStart +PgStat_MsgBgWriter +PgStat_MsgChecksumFailure +PgStat_MsgConnect +PgStat_MsgDeadlock +PgStat_MsgDisconnect +PgStat_MsgDropdb +PgStat_MsgDummy +PgStat_MsgFuncpurge +PgStat_MsgFuncstat +PgStat_MsgHdr +PgStat_MsgInquiry +PgStat_MsgRecoveryConflict +PgStat_MsgReplSlot +PgStat_MsgResetcounter +PgStat_MsgResetreplslotcounter +PgStat_MsgResetsharedcounter +PgStat_MsgResetsinglecounter +PgStat_MsgResetslrucounter +PgStat_MsgSLRU +PgStat_MsgTabpurge +PgStat_MsgTabstat +PgStat_MsgTempFile +PgStat_MsgVacuum +PgStat_MsgWal +PgStat_SLRUStats +PgStat_Shared_Reset_Target +PgStat_Single_Reset_Type +PgStat_StatDBEntry +PgStat_StatFuncEntry +PgStat_StatReplSlotEntry +PgStat_StatTabEntry +PgStat_SubXactStatus +PgStat_TableCounts +PgStat_TableEntry +PgStat_TableStatus +PgStat_TableXactStatus +PgStat_WalStats +PgXmlErrorContext +PgXmlStrictness +Pg_finfo_record +Pg_magic_struct +PipeProtoChunk +PipeProtoHeader +PlaceHolderInfo +PlaceHolderVar +Plan +PlanDirectModify_function +PlanForeignModify_function +PlanInvalItem +PlanRowMark +PlanState +PlannedStmt +PlannerGlobal +PlannerInfo +PlannerParamItem +Point +Pointer +PolicyInfo +PolyNumAggState +Pool +PopulateArrayContext +PopulateArrayState +PopulateRecordCache +PopulateRecordsetState +Port +Portal +PortalHashEnt +PortalStatus +PortalStrategy +PostParseColumnRefHook +PostgresPollingStatusType +PostingItem 
+PostponedQual +PreParseColumnRefHook +PredClass +PredIterInfo +PredIterInfoData +PredXactList +PredXactListElement +PredicateLockData +PredicateLockTargetType +PrefetchBufferResult +PrepParallelRestorePtrType +PrepareStmt +PreparedStatement +PresortedKeyData +PrewarmType +PrintExtraTocPtrType +PrintTocDataPtrType +PrintfArgType +PrintfArgValue +PrintfTarget +PrinttupAttrInfo +PrivTarget +PrivateRefCountEntry +ProcArrayStruct +ProcLangInfo +ProcSignalBarrierType +ProcSignalHeader +ProcSignalReason +ProcSignalSlot +ProcState +ProcWaitStatus +ProcessUtilityContext +ProcessUtility_hook_type +ProcessingMode +ProgressCommandType +ProjectSet +ProjectSetPath +ProjectSetState +ProjectionInfo +ProjectionPath +ProtocolVersion +PrsStorage +PruneState +PruneStepResult +PsqlScanCallbacks +PsqlScanQuoteType +PsqlScanResult +PsqlScanState +PsqlScanStateData +PsqlSettings +Publication +PublicationActions +PublicationInfo +PublicationPartOpt +PublicationRelInfo +PullFilter +PullFilterOps +PushFilter +PushFilterOps +PushFunction +PyCFunction +PyCodeObject +PyMappingMethods +PyMethodDef +PyModuleDef +PyObject +PySequenceMethods +PyTypeObject +Py_ssize_t +QPRS_STATE +QTN2QTState +QTNode +QUERYTYPE +QUERY_SECURITY_CONTEXT_TOKEN_FN +QualCost +QualItem +Query +QueryCompletion +QueryDesc +QueryEnvironment +QueryInfo +QueryItem +QueryItemType +QueryMode +QueryOperand +QueryOperator +QueryRepresentation +QueryRepresentationOperand +QuerySource +QueueBackendStatus +QueuePosition +QuitSignalReason +RBTNode +RBTOrderControl +RBTree +RBTreeIterator +REPARSE_JUNCTION_DATA_BUFFER +RIX +RI_CompareHashEntry +RI_CompareKey +RI_ConstraintInfo +RI_QueryHashEntry +RI_QueryKey +RTEKind +RWConflict +RWConflictPoolHeader +RandomState +Range +RangeBound +RangeBox +RangeFunction +RangeIOData +RangeQueryClause +RangeSubselect +RangeTableFunc +RangeTableFuncCol +RangeTableSample +RangeTblEntry +RangeTblFunction +RangeTblRef +RangeType +RangeVar +RangeVarGetRelidCallback +Ranges +RawColumnDefault +RawParseMode +RawStmt +ReInitializeDSMForeignScan_function +ReScanForeignScan_function +ReadBufPtrType +ReadBufferMode +ReadBytePtrType +ReadExtraTocPtrType +ReadFunc +ReassignOwnedStmt +RecheckForeignScan_function +RecordCacheEntry +RecordCompareData +RecordIOData +RecoveryLockListsEntry +RecoveryPauseState +RecoveryState +RecoveryTargetTimeLineGoal +RecoveryTargetType +RectBox +RecursionContext +RecursiveUnion +RecursiveUnionPath +RecursiveUnionState +RefetchForeignRow_function +RefreshMatViewStmt +RegProcedure +Regis +RegisNode +RegisteredBgWorker +ReindexErrorInfo +ReindexIndexInfo +ReindexObjectType +ReindexParams +ReindexStmt +ReindexType +RelFileNode +RelFileNodeBackend +RelIdCacheEnt +RelInfo +RelInfoArr +RelMapFile +RelMapping +RelOptInfo +RelOptKind +RelSizeEntry +RelTag +RelToCheck +RelToCluster +RelabelType +Relation +RelationData +RelationInfo +RelationPtr +RelationSyncEntry +RelcacheCallbackFunction +RelfilenodeMapEntry +RelfilenodeMapKey +Relids +RelocationBufferInfo +RelptrFreePageBtree +RelptrFreePageManager +RelptrFreePageSpanLeader +RenameStmt +ReopenPtrType +ReorderBuffer +ReorderBufferApplyChangeCB +ReorderBufferApplyTruncateCB +ReorderBufferBeginCB +ReorderBufferChange +ReorderBufferCommitCB +ReorderBufferCommitPreparedCB +ReorderBufferDiskChange +ReorderBufferIterTXNEntry +ReorderBufferIterTXNState +ReorderBufferMessageCB +ReorderBufferPrepareCB +ReorderBufferRollbackPreparedCB +ReorderBufferStreamAbortCB +ReorderBufferStreamChangeCB +ReorderBufferStreamCommitCB +ReorderBufferStreamMessageCB 
+ReorderBufferStreamPrepareCB +ReorderBufferStreamStartCB +ReorderBufferStreamStopCB +ReorderBufferStreamTruncateCB +ReorderBufferTXN +ReorderBufferTXNByIdEnt +ReorderBufferToastEnt +ReorderBufferTupleBuf +ReorderBufferTupleCidEnt +ReorderBufferTupleCidKey +ReorderTuple +RepOriginId +ReparameterizeForeignPathByChild_function +ReplaceVarsFromTargetList_context +ReplaceVarsNoMatchOption +ReplicaIdentityStmt +ReplicationKind +ReplicationSlot +ReplicationSlotCtlData +ReplicationSlotOnDisk +ReplicationSlotPersistency +ReplicationSlotPersistentData +ReplicationState +ReplicationStateCtl +ReplicationStateOnDisk +ResTarget +ReservoirState +ReservoirStateData +ResourceArray +ResourceOwner +ResourceReleaseCallback +ResourceReleaseCallbackItem +ResourceReleasePhase +RestoreOptions +RestorePass +RestrictInfo +Result +ResultRelInfo +ResultState +ReturnSetInfo +ReturnStmt +RevmapContents +RewriteMappingDataEntry +RewriteMappingFile +RewriteRule +RewriteState +RmgrData +RmgrDescData +RmgrId +RmgrIds +RoleSpec +RoleSpecType +RoleStmtType +RollupData +RowCompareExpr +RowCompareType +RowExpr +RowIdentityVarInfo +RowMarkClause +RowMarkType +RowSecurityDesc +RowSecurityPolicy +RuleInfo +RuleLock +RuleStmt +RunningTransactions +RunningTransactionsData +SC_HANDLE +SECURITY_ATTRIBUTES +SECURITY_STATUS +SEG +SERIALIZABLEXACT +SERIALIZABLEXID +SERIALIZABLEXIDTAG +SERVICE_STATUS +SERVICE_STATUS_HANDLE +SERVICE_TABLE_ENTRY +SHM_QUEUE +SID_AND_ATTRIBUTES +SID_IDENTIFIER_AUTHORITY +SID_NAME_USE +SISeg +SIZE_T +SMgrRelation +SMgrRelationData +SMgrSortArray +SOCKADDR +SOCKET +SPELL +SPICallbackArg +SPIExecuteOptions +SPIParseOpenOptions +SPIPlanPtr +SPIPrepareOptions +SPITupleTable +SPLITCOST +SPNode +SPNodeData +SPPageDesc +SQLCmd +SQLDropObject +SQLFunctionCache +SQLFunctionCachePtr +SQLFunctionParseInfo +SQLFunctionParseInfoPtr +SQLValueFunction +SQLValueFunctionOp +SSL +SSLExtensionInfoContext +SSL_CTX +STARTUPINFO +STRLEN +SV +SYNCHRONIZATION_BARRIER +SampleScan +SampleScanGetSampleSize_function +SampleScanState +SamplerRandomState +ScalarArrayOpExpr +ScalarArrayOpExprHashEntry +ScalarArrayOpExprHashTable +ScalarIOData +ScalarItem +ScalarMCVItem +Scan +ScanDirection +ScanKey +ScanKeyData +ScanKeywordHashFunc +ScanKeywordList +ScanState +ScanTypeControl +ScannerCallbackState +SchemaQuery +SecBuffer +SecBufferDesc +SecLabelItem +SecLabelStmt +SeenRelsEntry +SelectLimit +SelectStmt +Selectivity +SemTPadded +SemiAntiJoinFactors +SeqScan +SeqScanState +SeqTable +SeqTableData +SerCommitSeqNo +SerialControl +SerializableXactHandle +SerializedActiveRelMaps +SerializedRanges +SerializedReindexState +SerializedSnapshotData +SerializedTransactionState +Session +SessionBackupState +SessionEndType +SetConstraintState +SetConstraintStateData +SetConstraintTriggerData +SetExprState +SetFunctionReturnMode +SetOp +SetOpCmd +SetOpPath +SetOpState +SetOpStatePerGroup +SetOpStrategy +SetOperation +SetOperationStmt +SetQuantifier +SetToDefault +SetupWorkerPtrType +ShDependObjectInfo +SharedAggInfo +SharedBitmapState +SharedDependencyObjectType +SharedDependencyType +SharedExecutorInstrumentation +SharedFileSet +SharedHashInfo +SharedIncrementalSortInfo +SharedInvalCatalogMsg +SharedInvalCatcacheMsg +SharedInvalRelcacheMsg +SharedInvalRelmapMsg +SharedInvalSmgrMsg +SharedInvalSnapshotMsg +SharedInvalidationMessage +SharedJitInstrumentation +SharedMemoizeInfo +SharedRecordTableEntry +SharedRecordTableKey +SharedRecordTypmodRegistry +SharedSortInfo +SharedTuplestore +SharedTuplestoreAccessor +SharedTuplestoreChunk 
+SharedTuplestoreParticipant +SharedTypmodTableEntry +Sharedsort +ShellTypeInfo +ShippableCacheEntry +ShippableCacheKey +ShmemIndexEnt +ShutdownForeignScan_function +ShutdownInformation +ShutdownMode +SignTSVector +SimpleActionList +SimpleActionListCell +SimpleEcontextStackEntry +SimpleOidList +SimpleOidListCell +SimplePtrList +SimplePtrListCell +SimpleStats +SimpleStringList +SimpleStringListCell +SingleBoundSortItem +Size +SkipPages +SlabBlock +SlabChunk +SlabContext +SlabSlot +SlotErrCallbackArg +SlotNumber +SlruCtl +SlruCtlData +SlruErrorCause +SlruPageStatus +SlruScanCallback +SlruShared +SlruSharedData +SlruWriteAll +SlruWriteAllData +SnapBuild +SnapBuildOnDisk +SnapBuildState +Snapshot +SnapshotData +SnapshotType +SockAddr +Sort +SortBy +SortByDir +SortByNulls +SortCoordinate +SortGroupClause +SortItem +SortPath +SortShimExtra +SortState +SortSupport +SortSupportData +SortTuple +SortTupleComparator +SortedPoint +SpGistBuildState +SpGistCache +SpGistDeadTuple +SpGistDeadTupleData +SpGistInnerTuple +SpGistInnerTupleData +SpGistLUPCache +SpGistLastUsedPage +SpGistLeafTuple +SpGistLeafTupleData +SpGistMetaPageData +SpGistNodeTuple +SpGistNodeTupleData +SpGistOptions +SpGistPageOpaque +SpGistPageOpaqueData +SpGistScanOpaque +SpGistScanOpaqueData +SpGistSearchItem +SpGistState +SpGistTypeDesc +SpecialJoinInfo +SpinDelayStatus +SplitInterval +SplitLR +SplitPoint +SplitTextOutputData +SplitVar +SplitedPageLayout +StackElem +StartBlobPtrType +StartBlobsPtrType +StartDataPtrType +StartReplicationCmd +StartupStatusEnum +StatEntry +StatExtEntry +StatMsgType +StateFileChunk +StatisticExtInfo +Stats +StatsBuildData +StatsData +StatsElem +StatsExtInfo +StdAnalyzeData +StdRdOptIndexCleanup +StdRdOptions +Step +StopList +StrategyNumber +StreamCtl +StreamXidHash +StringInfo +StringInfoData +StripnullState +SubLink +SubLinkType +SubPlan +SubPlanState +SubRemoveRels +SubTransactionId +SubXactCallback +SubXactCallbackItem +SubXactEvent +SubXactInfo +SubqueryScan +SubqueryScanPath +SubqueryScanState +SubscriptExecSetup +SubscriptExecSteps +SubscriptRoutines +SubscriptTransform +SubscriptingRef +SubscriptingRefState +Subscription +SubscriptionInfo +SubscriptionRelState +SupportRequestCost +SupportRequestIndexCondition +SupportRequestRows +SupportRequestSelectivity +SupportRequestSimplify +Syn +SyncOps +SyncRepConfigData +SyncRepStandbyData +SyncRequestHandler +SyncRequestType +SysFKRelationship +SysScanDesc +SyscacheCallbackFunction +SystemRowsSamplerData +SystemSamplerData +SystemTimeSamplerData +TAR_MEMBER +TBMIterateResult +TBMIteratingState +TBMIterator +TBMSharedIterator +TBMSharedIteratorState +TBMStatus +TBlockState +TIDBitmap +TM_FailureData +TM_IndexDelete +TM_IndexDeleteOp +TM_IndexStatus +TM_Result +TOKEN_DEFAULT_DACL +TOKEN_INFORMATION_CLASS +TOKEN_PRIVILEGES +TOKEN_USER +TParser +TParserCharTest +TParserPosition +TParserSpecial +TParserState +TParserStateAction +TParserStateActionItem +TQueueDestReceiver +TRGM +TSAnyCacheEntry +TSConfigCacheEntry +TSConfigInfo +TSDictInfo +TSDictionaryCacheEntry +TSExecuteCallback +TSLexeme +TSParserCacheEntry +TSParserInfo +TSQuery +TSQueryData +TSQueryParserState +TSQuerySign +TSReadPointer +TSTemplateInfo +TSTernaryValue +TSTokenTypeStorage +TSVector +TSVectorBuildState +TSVectorData +TSVectorParseState +TSVectorStat +TState +TStoreState +TXNEntryFile +TYPCATEGORY +T_Action +T_WorkerStatus +TabStatHashEntry +TabStatusArray +TableAmRoutine +TableAttachInfo +TableDataInfo +TableFunc +TableFuncRoutine +TableFuncScan +TableFuncScanState +TableInfo 
+TableLikeClause +TableSampleClause +TableScanDesc +TableScanDescData +TableSpaceCacheEntry +TableSpaceOpts +TablespaceList +TablespaceListCell +TapeBlockTrailer +TapeShare +TarMethodData +TarMethodFile +TargetEntry +TclExceptionNameMap +Tcl_DString +Tcl_FileProc +Tcl_HashEntry +Tcl_HashTable +Tcl_Interp +Tcl_NotifierProcs +Tcl_Obj +Tcl_Time +TempNamespaceStatus +TestDecodingData +TestDecodingTxnData +TestSpec +TextFreq +TextPositionState +TheLexeme +TheSubstitute +TidExpr +TidExprType +TidHashKey +TidOpExpr +TidPath +TidRangePath +TidRangeScan +TidRangeScanState +TidScan +TidScanState +TimeADT +TimeLineHistoryCmd +TimeLineHistoryEntry +TimeLineID +TimeOffset +TimeStamp +TimeTzADT +TimeZoneAbbrevTable +TimeoutId +TimeoutType +Timestamp +TimestampTz +TmFromChar +TmToChar +ToastAttrInfo +ToastCompressionId +ToastTupleContext +ToastedAttribute +TocEntry +TokenAuxData +TokenizedLine +TrackItem +TransInvalidationInfo +TransState +TransactionId +TransactionState +TransactionStateData +TransactionStmt +TransactionStmtKind +TransformInfo +TransformJsonStringValuesState +TransitionCaptureState +TrgmArc +TrgmArcInfo +TrgmBound +TrgmColor +TrgmColorInfo +TrgmGistOptions +TrgmNFA +TrgmPackArcInfo +TrgmPackedArc +TrgmPackedGraph +TrgmPackedState +TrgmPrefix +TrgmState +TrgmStateKey +TrieChar +Trigger +TriggerData +TriggerDesc +TriggerEvent +TriggerFlags +TriggerInfo +TriggerTransition +TruncateStmt +TsmRoutine +TupOutputState +TupSortStatus +TupStoreStatus +TupleConstr +TupleConversionMap +TupleDesc +TupleHashEntry +TupleHashEntryData +TupleHashIterator +TupleHashTable +TupleQueueReader +TupleTableSlot +TupleTableSlotOps +TuplesortInstrumentation +TuplesortMethod +TuplesortSpaceType +Tuplesortstate +Tuplestorestate +TwoPhaseCallback +TwoPhaseFileHeader +TwoPhaseLockRecord +TwoPhasePgStatRecord +TwoPhasePredicateLockRecord +TwoPhasePredicateRecord +TwoPhasePredicateRecordType +TwoPhasePredicateXactRecord +TwoPhaseRecordOnDisk +TwoPhaseRmgrId +TwoPhaseStateData +Type +TypeCacheEntry +TypeCacheEnumData +TypeCast +TypeCat +TypeFuncClass +TypeInfo +TypeName +U +U32 +U8 +UChar +UCharIterator +UColAttribute +UColAttributeValue +UCollator +UConverter +UErrorCode +UINT +ULARGE_INTEGER +ULONG +ULONG_PTR +UV +UVersionInfo +UnicodeNormalizationForm +UnicodeNormalizationQC +Unique +UniquePath +UniquePathMethod +UniqueState +UnlistenStmt +UnpackTarState +UnresolvedTup +UnresolvedTupData +UpdateStmt +UpperRelationKind +UpperUniquePath +UserAuth +UserMapping +UserOpts +VacAttrStats +VacAttrStatsP +VacErrPhase +VacOptValue +VacuumParams +VacuumRelation +VacuumStmt +ValidateIndexState +Value +ValuesScan +ValuesScanState +Var +VarBit +VarChar +VarParamState +VarString +VarStringSortSupport +Variable +VariableAssignHook +VariableCache +VariableCacheData +VariableSetKind +VariableSetStmt +VariableShowStmt +VariableSpace +VariableStatData +VariableSubstituteHook +VersionedQuery +Vfd +ViewCheckOption +ViewOptCheckOption +ViewOptions +ViewStmt +VirtualTransactionId +VirtualTupleTableSlot +VolatileFunctionStatus +Vsrt +WAIT_ORDER +WALAvailability +WALInsertLock +WALInsertLockPadded +WALOpenSegment +WALReadError +WALSegmentCloseCB +WALSegmentContext +WALSegmentOpenCB +WCHAR +WCOKind +WFW_WaitOption +WIDGET +WORD +WORKSTATE +WSABUF +WSADATA +WSANETWORKEVENTS +WSAPROTOCOL_INFO +WaitEvent +WaitEventActivity +WaitEventClient +WaitEventIO +WaitEventIPC +WaitEventSet +WaitEventTimeout +WaitPMResult +WalCloseMethod +WalLevel +Safekeeper +WalMessage +WalRcvData +WalRcvExecResult +WalRcvExecStatus +WalRcvState +WalRcvStreamOptions 
+WalReceiverConn +WalReceiverFunctionsType +WalSnd +WalSndCtlData +WalSndSendDataCallback +WalSndState +WalTimeSample +WalUsage +WalWriteMethod +Walfile +WindowAgg +WindowAggPath +WindowAggState +WindowClause +WindowClauseSortData +WindowDef +WindowFunc +WindowFuncExprState +WindowFuncLists +WindowObject +WindowObjectData +WindowStatePerAgg +WindowStatePerAggData +WindowStatePerFunc +WithCheckOption +WithClause +WordEntry +WordEntryIN +WordEntryPos +WordEntryPosVector +WordEntryPosVector1 +WorkTableScan +WorkTableScanState +WorkerInfo +WorkerInfoData +WorkerInstrumentation +WorkerJobDumpPtrType +WorkerJobRestorePtrType +Working_State +WriteBufPtrType +WriteBytePtrType +WriteDataCallback +WriteDataPtrType +WriteExtraTocPtrType +WriteFunc +WriteManifestState +WriteTarState +WritebackContext +X509 +X509_EXTENSION +X509_NAME +X509_NAME_ENTRY +X509_STORE +X509_STORE_CTX +XLTW_Oper +XLogCtlData +XLogCtlInsert +XLogDumpConfig +XLogDumpPrivate +XLogDumpStats +XLogLongPageHeader +XLogLongPageHeaderData +XLogPageHeader +XLogPageHeaderData +XLogPageReadCB +XLogPageReadPrivate +XLogReaderRoutine +XLogReaderState +XLogRecData +XLogRecPtr +XLogRecord +XLogRecordBlockCompressHeader +XLogRecordBlockHeader +XLogRecordBlockImageHeader +XLogRecordBuffer +XLogRedoAction +XLogSegNo +XLogSource +XLogwrtResult +XLogwrtRqst +XPVIV +XPVMG +XactCallback +XactCallbackItem +XactEvent +XactLockTableWaitInfo +XidBoundsViolation +XidCacheStatus +XidCommitStatus +XidStatus +XmlExpr +XmlExprOp +XmlOptionType +XmlSerialize +XmlTableBuilderData +YYLTYPE +YYSTYPE +YY_BUFFER_STATE +ZenithErrorResponse +ZenithExistsRequest +ZenithExistsResponse +ZenithGetPageRequest +ZenithGetPageResponse +ZenithMessage +ZenithMessageTag +ZenithNblocksRequest +ZenithNblocksResponse +ZenithRequest +ZenithResponse +_SPI_connection +_SPI_plan +__AssignProcessToJobObject +__CreateJobObject +__CreateRestrictedToken +__IsProcessInJob +__QueryInformationJobObject +__SetInformationJobObject +__time64_t +_dev_t +_ino_t +_resultmap +_stringlist +acquireLocksOnSubLinks_context +adjust_appendrel_attrs_context +aff_regex_struct +allocfunc +amadjustmembers_function +ambeginscan_function +ambuild_function +ambuildempty_function +ambuildphasename_function +ambulkdelete_function +amcanreturn_function +amcostestimate_function +amendscan_function +amestimateparallelscan_function +amgetbitmap_function +amgettuple_function +aminitparallelscan_function +aminsert_function +ammarkpos_function +amoptions_function +amparallelrescan_function +amproperty_function +amrescan_function +amrestrpos_function +amvacuumcleanup_function +amvalidate_function +array_iter +array_unnest_fctx +assign_collations_context +autovac_table +av_relation +avl_dbase +avl_node +avl_tree +avw_dbase +backslashResult +backup_manifest_info +backup_manifest_option +base_yy_extra_type +basebackup_options +bgworker_main_type +binaryheap +binaryheap_comparator +bitmapword +bits16 +bits32 +bits8 +bloom_filter +brin_column_state +brin_serialize_callback_type +bytea +cached_re_str +cashKEY +cfp +check_agg_arguments_context +check_function_callback +check_network_data +check_object_relabel_type +check_password_hook_type +check_ungrouped_columns_context +chr +clock_t +cmpEntriesArg +cmpfunc +codes_t +coercion +collation_cache_entry +color +colormaprange +compare_context +config_var_value +contain_aggs_of_level_context +convert_testexpr_context +copy_data_source_cb +core_YYSTYPE +core_yy_extra_type +core_yyscan_t +corrupt_items +cost_qual_eval_context +cp_hash_func +create_upper_paths_hook_type 
+createdb_failure_params +crosstab_HashEnt +crosstab_cat_desc +datapagemap_iterator_t +datapagemap_t +dateKEY +datetkn +dce_uuid_t +decimal +deparse_columns +deparse_context +deparse_expr_cxt +deparse_namespace +destructor +dev_t +digit +disassembledLeaf +dlist_head +dlist_iter +dlist_mutable_iter +dlist_node +ds_state +dsa_area +dsa_area_control +dsa_area_pool +dsa_area_span +dsa_handle +dsa_pointer +dsa_pointer_atomic +dsa_segment_header +dsa_segment_index +dsa_segment_map +dshash_compare_function +dshash_hash +dshash_hash_function +dshash_parameters +dshash_partition +dshash_table +dshash_table_control +dshash_table_handle +dshash_table_item +dsm_control_header +dsm_control_item +dsm_handle +dsm_op +dsm_segment +dsm_segment_detach_callback +eLogType +ean13 +eary +ec_matches_callback_type +ec_member_foreign_arg +ec_member_matches_arg +emit_log_hook_type +eval_const_expressions_context +exec_thread_arg +execution_state +explain_get_index_name_hook_type +f_smgr +fd_set +fe_scram_state +fe_scram_state_enum +fetch_range_request +file_action_t +file_entry_t +file_type_t +filehash_hash +filehash_iterator +filemap_t +fill_string_relopt +finalize_primnode_context +find_dependent_phvs_context +find_expr_references_context +fix_join_expr_context +fix_scan_expr_context +fix_upper_expr_context +flatten_join_alias_vars_context +float4 +float4KEY +float8 +float8KEY +floating_decimal_32 +floating_decimal_64 +fmAggrefPtr +fmExprContextCallbackFunction +fmNodePtr +fmStringInfo +fmgr_hook_type +foreign_glob_cxt +foreign_loc_cxt +freeaddrinfo_ptr_t +freefunc +fsec_t +gbt_vsrt_arg +gbtree_ninfo +gbtree_vinfo +generate_series_fctx +generate_series_numeric_fctx +generate_series_timestamp_fctx +generate_series_timestamptz_fctx +generate_subscripts_fctx +get_attavgwidth_hook_type +get_index_stats_hook_type +get_relation_info_hook_type +get_relation_stats_hook_type +getaddrinfo_ptr_t +getnameinfo_ptr_t +gid_t +gin_leafpage_items_state +ginxlogCreatePostingTree +ginxlogDeleteListPages +ginxlogDeletePage +ginxlogInsert +ginxlogInsertDataInternal +ginxlogInsertEntry +ginxlogInsertListPage +ginxlogRecompressDataLeaf +ginxlogSplit +ginxlogUpdateMeta +ginxlogVacuumDataLeafPage +gistxlogDelete +gistxlogPage +gistxlogPageDelete +gistxlogPageReuse +gistxlogPageSplit +gistxlogPageUpdate +grouping_sets_data +gseg_picksplit_item +gss_buffer_desc +gss_cred_id_t +gss_ctx_id_t +gss_name_t +gtrgm_consistent_cache +gzFile +hashfunc +hbaPort +heap_page_items_state +help_handler +hlCheck +hstoreCheckKeyLen_t +hstoreCheckValLen_t +hstorePairs_t +hstoreUniquePairs_t +hstoreUpgrade_t +hyperLogLogState +ifState +ilist +import_error_callback_arg +indexed_tlist +inet +inetKEY +inet_struct +init_function +inline_cte_walker_context +inline_error_callback_arg +ino_t +inquiry +instr_time +int128 +int16 +int16KEY +int2vector +int32 +int32KEY +int32_t +int64 +int64KEY +int8 +internalPQconninfoOption +intptr_t +intset_internal_node +intset_leaf_node +intset_node +intvKEY +itemIdCompact +itemIdCompactData +iterator +jmp_buf +join_search_hook_type +json_aelem_action +json_manifest_error_callback +json_manifest_perfile_callback +json_manifest_perwalrange_callback +json_ofield_action +json_scalar_action +json_struct_action +keyEntryData +key_t +lclContext +lclTocEntry +leafSegmentInfo +leaf_item +libpq_source +line_t +lineno_t +list_sort_comparator +local_relopt +local_relopts +local_source +locale_t +locate_agg_of_level_context +locate_var_of_level_context +locate_windowfunc_context +logstreamer_param +lquery +lquery_level +lquery_variant +ltree 
+ltree_gist +ltree_level +ltxtquery +mXactCacheEnt +mac8KEY +macKEY +macaddr +macaddr8 +macaddr_sortsupport_state +manifest_file +manifest_files_hash +manifest_files_iterator +manifest_wal_range +map_variable_attnos_context +max_parallel_hazard_context +mb2wchar_with_len_converter +mbchar_verifier +mbcharacter_incrementer +mbdisplaylen_converter +mblen_converter +mbstr_verifier +memoize_hash +memoize_iterator +metastring +mix_data_t +mixedStruct +mode_t +movedb_failure_params +mp_digit +mp_int +mp_result +mp_sign +mp_size +mp_small +mp_usmall +mp_word +mpz_t +multirange_bsearch_comparison +mxact +mxtruncinfo +needs_fmgr_hook_type +network_sortsupport_state +nodeitem +normal_rand_fctx +ntile_context +numeric +object_access_hook_type +off_t +oidKEY +oidvector +on_dsm_detach_callback +on_exit_nicely_callback +openssl_tls_init_hook_typ +ossl_EVP_cipher_func +other +output_type +pagetable_hash +pagetable_iterator +pairingheap +pairingheap_comparator +pairingheap_node +parallel_worker_main_type +parse_error_callback_arg +parser_context +partition_method_t +pendingPosition +pgParameterStatus +pg_atomic_flag +pg_atomic_uint32 +pg_atomic_uint64 +pg_checksum_context +pg_checksum_raw_context +pg_checksum_type +pg_conn_host +pg_conn_host_type +pg_conv_map +pg_crc32 +pg_crc32c +pg_cryptohash_ctx +pg_cryptohash_type +pg_ctype_cache +pg_enc +pg_enc2gettext +pg_enc2name +pg_encname +pg_funcptr_t +pg_gssinfo +pg_hmac_ctx +pg_int64 +pg_local_to_utf_combined +pg_locale_t +pg_mb_radix_tree +pg_md5_ctx +pg_on_exit_callback +pg_re_flags +pg_saslprep_rc +pg_sha1_ctx +pg_sha224_ctx +pg_sha256_ctx +pg_sha384_ctx +pg_sha512_ctx +pg_snapshot +pg_stack_base_t +pg_time_t +pg_time_usec_t +pg_tz +pg_tz_cache +pg_tzenum +pg_unicode_decompinfo +pg_unicode_decomposition +pg_unicode_norminfo +pg_unicode_normprops +pg_unicode_recompinfo +pg_utf_to_local_combined +pg_uuid_t +pg_wc_probefunc +pg_wchar +pg_wchar_tbl +pgp_armor_headers_state +pgpid_t +pgsocket +pgsql_thing_t +pgssEntry +pgssGlobalStats +pgssHashKey +pgssSharedState +pgssStoreKind +pgssVersion +pgstat_page +pgstattuple_type +pgthreadlock_t +pid_t +pivot_field +planner_hook_type +plperl_array_info +plperl_call_data +plperl_interp_desc +plperl_proc_desc +plperl_proc_key +plperl_proc_ptr +plperl_query_desc +plperl_query_entry +plpgsql_CastHashEntry +plpgsql_CastHashKey +plpgsql_HashEnt +pltcl_call_state +pltcl_interp_desc +pltcl_proc_desc +pltcl_proc_key +pltcl_proc_ptr +pltcl_query_desc +pointer +polymorphic_actuals +pos_trgm +post_parse_analyze_hook_type +postprocess_result_function +pqbool +pqsigfunc +printQueryOpt +printTableContent +printTableFooter +printTableOpt +printTextFormat +printTextLineFormat +printTextLineWrap +printTextRule +printfunc +priv_map +process_file_callback_t +process_sublinks_context +proclist_head +proclist_mutable_iter +proclist_node +promptStatus_t +pthread_barrier_t +pthread_cond_t +pthread_key_t +pthread_mutex_t +pthread_once_t +pthread_t +ptrdiff_t +pull_var_clause_context +pull_varattnos_context +pull_varnos_context +pull_vars_context +pullup_replace_vars_context +pushdown_safety_info +qc_hash_func +qsort_arg_comparator +qsort_comparator +query_pathkeys_callback +radius_attribute +radius_packet +rangeTableEntry_used_context +rank_context +rbt_allocfunc +rbt_combiner +rbt_comparator +rbt_freefunc +reduce_outer_joins_state +reference +regex_arc_t +regex_t +regexp +regexp_matches_ctx +registered_buffer +regmatch_t +regoff_t +regproc +relopt_bool +relopt_enum +relopt_enum_elt_def +relopt_gen +relopt_int +relopt_kind +relopt_parse_elt 
+relopt_real +relopt_string +relopt_type +relopt_value +relopts_validator +remoteConn +remoteConnHashEnt +remoteDep +rendezvousHashEntry +replace_rte_variables_callback +replace_rte_variables_context +ret_type +rewind_source +rewrite_event +rijndael_ctx +rm_detail_t +role_auth_extra +row_security_policy_hook_type +rsv_callback +saophash_hash +save_buffer +scram_state +scram_state_enum +sem_t +sequence_magic +set_join_pathlist_hook_type +set_rel_pathlist_hook_type +shm_mq +shm_mq_handle +shm_mq_iovec +shm_mq_result +shm_toc +shm_toc_entry +shm_toc_estimator +shmem_startup_hook_type +sig_atomic_t +sigjmp_buf +signedbitmapword +sigset_t +size_t +slist_head +slist_iter +slist_mutable_iter +slist_node +slock_t +socket_set +spgBulkDeleteState +spgChooseIn +spgChooseOut +spgChooseResultType +spgConfigIn +spgConfigOut +spgInnerConsistentIn +spgInnerConsistentOut +spgLeafConsistentIn +spgLeafConsistentOut +spgNodePtr +spgPickSplitIn +spgPickSplitOut +spgVacPendingItem +spgxlogAddLeaf +spgxlogAddNode +spgxlogMoveLeafs +spgxlogPickSplit +spgxlogSplitTuple +spgxlogState +spgxlogVacuumLeaf +spgxlogVacuumRedirect +spgxlogVacuumRoot +split_pathtarget_context +split_pathtarget_item +sql_error_callback_arg +sqlparseInfo +sqlparseState +ss_lru_item_t +ss_scan_location_t +ss_scan_locations_t +ssize_t +standard_qp_extra +stemmer_module +stmtCacheEntry +storeInfo +storeRes_func +stream_stop_callback +string +substitute_actual_parameters_context +substitute_actual_srf_parameters_context +substitute_phv_relids_context +svtype +symbol +tablespaceinfo +teSection +temp_tablespaces_extra +test_re_flags +test_regex_ctx +test_shm_mq_header +test_spec +test_start_function +text +timeKEY +time_t +timeout_handler_proc +timeout_params +timerCA +tlist_vinfo +toast_compress_header +transferMode +transfer_thread_arg +trgm +trgm_mb_char +trivalue +tsKEY +ts_parserstate +ts_tokenizer +ts_tokentype +tsearch_readline_state +tuplehash_hash +tuplehash_iterator +type +tzEntry +u1byte +u4byte +u_char +u_int +uchr +uid_t +uint128 +uint16 +uint16_t +uint32 +uint32_t +uint64 +uint64_t +uint8 +uint8_t +uintptr_t +unicodeStyleBorderFormat +unicodeStyleColumnFormat +unicodeStyleFormat +unicodeStyleRowFormat +unicode_linestyle +unit_conversion +unlogged_relation_entry +utf_local_conversion_func +uuidKEY +uuid_rc_t +uuid_sortsupport_state +uuid_t +va_list +vacuumingOptions +validate_string_relopt +varatt_expanded +varattrib_1b +varattrib_1b_e +varattrib_4b +vbits +verifier_context +walrcv_check_conninfo_fn +walrcv_connect_fn +walrcv_create_slot_fn +walrcv_disconnect_fn +walrcv_endstreaming_fn +walrcv_exec_fn +walrcv_get_backend_pid_fn +walrcv_get_conninfo_fn +walrcv_get_senderinfo_fn +walrcv_identify_system_fn +walrcv_readtimelinehistoryfile_fn +walrcv_receive_fn +walrcv_send_fn +walrcv_server_version_fn +walrcv_startstreaming_fn +wchar2mb_with_len_converter +wchar_t +win32_deadchild_waitinfo +wint_t +worker_state +worktable +wrap +xl_brin_createidx +xl_brin_desummarize +xl_brin_insert +xl_brin_revmap_extend +xl_brin_samepage_update +xl_brin_update +xl_btree_dedup +xl_btree_delete +xl_btree_insert +xl_btree_mark_page_halfdead +xl_btree_metadata +xl_btree_newroot +xl_btree_reuse_page +xl_btree_split +xl_btree_unlink_page +xl_btree_update +xl_btree_vacuum +xl_clog_truncate +xl_commit_ts_truncate +xl_dbase_create_rec +xl_dbase_drop_rec +xl_end_of_recovery +xl_hash_add_ovfl_page +xl_hash_delete +xl_hash_init_bitmap_page +xl_hash_init_meta_page +xl_hash_insert +xl_hash_move_page_contents +xl_hash_split_allocate_page +xl_hash_split_complete 
+xl_hash_squeeze_page +xl_hash_update_meta_page +xl_hash_vacuum_one_page +xl_heap_confirm +xl_heap_delete +xl_heap_freeze_page +xl_heap_freeze_tuple +xl_heap_header +xl_heap_inplace +xl_heap_insert +xl_heap_lock +xl_heap_lock_updated +xl_heap_multi_insert +xl_heap_new_cid +xl_heap_prune +xl_heap_rewrite_mapping +xl_heap_truncate +xl_heap_update +xl_heap_vacuum +xl_heap_visible +xl_invalid_page +xl_invalid_page_key +xl_invalidations +xl_logical_message +xl_multi_insert_tuple +xl_multixact_create +xl_multixact_truncate +xl_overwrite_contrecord +xl_parameter_change +xl_relmap_update +xl_replorigin_drop +xl_replorigin_set +xl_restore_point +xl_running_xacts +xl_seq_rec +xl_smgr_create +xl_smgr_truncate +xl_standby_lock +xl_standby_locks +xl_tblspc_create_rec +xl_tblspc_drop_rec +xl_xact_abort +xl_xact_assignment +xl_xact_commit +xl_xact_dbinfo +xl_xact_invals +xl_xact_origin +xl_xact_parsed_abort +xl_xact_parsed_commit +xl_xact_parsed_prepare +xl_xact_prepare +xl_xact_relfilenodes +xl_xact_subxacts +xl_xact_twophase +xl_xact_xinfo +xmlBuffer +xmlBufferPtr +xmlChar +xmlDocPtr +xmlErrorPtr +xmlExternalEntityLoader +xmlGenericErrorFunc +xmlNodePtr +xmlNodeSetPtr +xmlParserCtxtPtr +xmlParserInputPtr +xmlStructuredErrorFunc +xmlTextWriter +xmlTextWriterPtr +xmlXPathCompExprPtr +xmlXPathContextPtr +xmlXPathObjectPtr +xmltype +xpath_workspace +xsltSecurityPrefsPtr +xsltStylesheetPtr +xsltTransformContextPtr +yy_parser +yy_size_t +yyscan_t +z_stream +z_streamp +zic_t From b8eb908a3df34f437b4f123461b14b599be4a8b4 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Sep 2022 15:43:53 +0300 Subject: [PATCH 0767/1022] Rename old project name references --- Cargo.lock | 8 +- Cargo.toml | 2 +- Dockerfile | 16 +- compute_tools/Cargo.toml | 4 +- control_plane/Cargo.toml | 2 +- control_plane/simple.conf | 2 +- control_plane/src/bin/neon_local.rs | 34 +- control_plane/src/compute.rs | 20 +- control_plane/src/local_env.rs | 54 +-- control_plane/src/postgresql_conf.rs | 2 +- control_plane/src/safekeeper.rs | 8 +- control_plane/src/storage.rs | 36 +- docs/authentication.md | 4 +- docs/multitenancy.md | 18 +- docs/pageserver-services.md | 2 +- docs/pageserver-storage.md | 10 +- docs/pageserver-tenant-migration.md | 4 +- docs/rfcs/013-term-history.md | 2 +- docs/rfcs/cluster-size-limits.md | 8 +- docs/sourcetree.md | 11 +- libs/etcd_broker/src/subscription_key.rs | 26 +- libs/postgres_ffi/Cargo.toml | 2 +- libs/postgres_ffi/wal_craft/Cargo.toml | 2 +- libs/utils/Cargo.toml | 4 +- libs/utils/benches/benchmarks.rs | 4 +- libs/utils/src/auth.rs | 14 +- libs/utils/src/http/endpoint.rs | 6 +- libs/utils/src/http/mod.rs | 2 +- libs/utils/src/{zid.rs => id.rs} | 88 ++-- libs/utils/src/lib.rs | 2 +- libs/utils/src/postgres_backend.rs | 12 +- libs/utils/src/postgres_backend_async.rs | 4 +- pageserver/Cargo.toml | 8 +- pageserver/src/bin/dump_layerfile.rs | 2 +- pageserver/src/bin/pageserver.rs | 4 +- pageserver/src/bin/update_metadata.rs | 2 +- pageserver/src/config.rs | 16 +- pageserver/src/http/models.rs | 24 +- pageserver/src/http/routes.rs | 42 +- pageserver/src/import_datadir.rs | 2 +- pageserver/src/lib.rs | 10 +- pageserver/src/metrics.rs | 6 +- pageserver/src/page_cache.rs | 14 +- pageserver/src/page_service.rs | 76 ++-- pageserver/src/pgdatadir_mapping.rs | 8 +- pageserver/src/repository.rs | 4 +- pageserver/src/storage_sync.rs | 72 ++-- pageserver/src/storage_sync/delete.rs | 8 +- pageserver/src/storage_sync/download.rs | 40 +- pageserver/src/storage_sync/index.rs | 42 +- 
pageserver/src/storage_sync/upload.rs | 12 +- pageserver/src/task_mgr.rs | 18 +- pageserver/src/tenant.rs | 100 ++--- pageserver/src/tenant/delta_layer.rs | 84 ++-- pageserver/src/tenant/ephemeral_file.rs | 36 +- pageserver/src/tenant/image_layer.rs | 84 ++-- pageserver/src/tenant/inmemory_layer.rs | 38 +- pageserver/src/tenant/layer_map.rs | 2 +- pageserver/src/tenant/metadata.rs | 20 +- pageserver/src/tenant/storage_layer.rs | 10 +- pageserver/src/tenant/timeline.rs | 16 +- pageserver/src/tenant_config.rs | 6 +- pageserver/src/tenant_mgr.rs | 36 +- pageserver/src/tenant_tasks.rs | 12 +- pageserver/src/timelines.rs | 14 +- pageserver/src/virtual_file.rs | 38 +- pageserver/src/walingest.rs | 26 +- .../src/walreceiver/connection_manager.rs | 18 +- .../src/walreceiver/walreceiver_connection.rs | 8 +- pageserver/src/walrecord.rs | 16 +- pageserver/src/walredo.rs | 80 ++-- pgxn/neon/inmem_smgr.c | 2 +- pgxn/neon/libpagestore.c | 49 ++- pgxn/neon/neon.c | 2 - pgxn/neon/pagestore_client.h | 153 ++++--- pgxn/neon/pagestore_smgr.c | 408 +++++++++--------- pgxn/neon/relsize_cache.c | 6 +- pgxn/neon/walproposer.c | 114 ++--- pgxn/neon/walproposer.h | 38 +- pgxn/neon_test_utils/neontest.c | 32 +- proxy/Cargo.toml | 2 +- pyproject.toml | 2 +- safekeeper/Cargo.toml | 6 +- safekeeper/src/bin/safekeeper.rs | 6 +- safekeeper/src/broker.rs | 10 +- safekeeper/src/control_file.rs | 18 +- safekeeper/src/control_file_upgrade.rs | 25 +- safekeeper/src/handler.rs | 30 +- safekeeper/src/http/models.rs | 4 +- safekeeper/src/http/routes.rs | 14 +- safekeeper/src/json_ctrl.rs | 4 +- safekeeper/src/lib.rs | 6 +- safekeeper/src/metrics.rs | 4 +- safekeeper/src/receive_wal.rs | 2 +- safekeeper/src/safekeeper.rs | 36 +- safekeeper/src/send_wal.rs | 8 +- safekeeper/src/timeline.rs | 49 ++- safekeeper/src/wal_backup.rs | 14 +- safekeeper/src/wal_storage.rs | 8 +- scripts/generate_and_push_perf_report.sh | 8 +- scripts/perf_report_template.html | 4 +- test_runner/README.md | 2 +- test_runner/fixtures/benchmark_fixture.py | 6 +- test_runner/fixtures/neon_fixtures.py | 132 +++--- test_runner/fixtures/types.py | 14 +- test_runner/performance/README.md | 2 +- test_runner/regress/test_ancestor_branch.py | 8 +- test_runner/regress/test_auth.py | 4 +- test_runner/regress/test_branch_behind.py | 4 +- test_runner/regress/test_broken_timeline.py | 4 +- test_runner/regress/test_fullbackup.py | 4 +- test_runner/regress/test_gc_aggressive.py | 8 +- test_runner/regress/test_import.py | 12 +- test_runner/regress/test_neon_cli.py | 8 +- test_runner/regress/test_old_request_lsn.py | 4 +- test_runner/regress/test_pageserver_api.py | 24 +- test_runner/regress/test_pitr_gc.py | 4 +- test_runner/regress/test_remote_storage.py | 8 +- test_runner/regress/test_tenant_detach.py | 8 +- test_runner/regress/test_tenant_relocation.py | 22 +- test_runner/regress/test_tenant_tasks.py | 10 +- test_runner/regress/test_tenants.py | 4 +- .../test_tenants_with_remote_storage.py | 8 +- test_runner/regress/test_timeline_delete.py | 6 +- test_runner/regress/test_timeline_size.py | 8 +- test_runner/regress/test_wal_acceptor.py | 64 ++- .../regress/test_wal_acceptor_async.py | 10 +- test_runner/regress/test_wal_restore.py | 4 +- 128 files changed, 1428 insertions(+), 1495 deletions(-) rename libs/utils/src/{zid.rs => id.rs} (76%) diff --git a/Cargo.lock b/Cargo.lock index e9ebcdc5ac..d4234d2b00 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2048,7 +2048,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.2" -source = 
"git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "bytes", "fallible-iterator", @@ -2061,7 +2061,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "base64", "byteorder", @@ -2079,7 +2079,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.3" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "bytes", "fallible-iterator", @@ -3295,7 +3295,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.6" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "async-trait", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 1936b261f7..bc2a705558 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,4 +70,4 @@ lto = true # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. [patch.crates-io] -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } diff --git a/Dockerfile b/Dockerfile index 3e173f4d5b..eacb88d168 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,7 @@ RUN set -e \ && rm -rf pg_install/v15/build \ && tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz . 
-# Build zenith binaries +# Build neon binaries FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot ARG GIT_VERSION=local @@ -60,12 +60,12 @@ RUN set -e \ openssl \ ca-certificates \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ - && useradd -d /data zenith \ - && chown -R zenith:zenith /data + && useradd -d /data neon \ + && chown -R neon:neon /data -COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/pageserver /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/safekeeper /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin # v14 is default for now COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/ @@ -73,7 +73,7 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values. -RUN mkdir -p /data/.neon/ && chown -R zenith:zenith /data/.neon/ \ +RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ && /usr/local/bin/pageserver -D /data/.neon/ --init \ -c "id=1234" \ -c "broker_endpoints=['http://etcd:2379']" \ @@ -82,7 +82,7 @@ RUN mkdir -p /data/.neon/ && chown -R zenith:zenith /data/.neon/ \ -c "listen_http_addr='0.0.0.0:9898'" VOLUME ["/data"] -USER zenith +USER neon EXPOSE 6400 EXPOSE 9898 CMD ["/bin/bash"] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 78b85d0e79..b13f7f191d 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -10,12 +10,12 @@ clap = "3.0" env_logger = "0.9" hyper = { version = "0.14", features = ["full"] } log = { version = "0.4", features = ["std", "serde"] } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } regex = "1" serde = { version = "1.0", features = ["derive"] } serde_json = "1" tar = "0.4" tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] } -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } url = "2.2.2" workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 8a79a6e566..ab9df8534c 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -8,7 +8,7 @@ clap = "3.0" comfy-table = "5.0.1" git-version = "0.3.5" tar = "0.4.38" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } serde = { version = "1.0", features = ["derive"] } serde_with = "1.12.0" toml = "0.5" diff --git a/control_plane/simple.conf b/control_plane/simple.conf index 925e2f14ee..ae60657400 100644 --- a/control_plane/simple.conf +++ 
b/control_plane/simple.conf @@ -1,4 +1,4 @@ -# Minimal zenith environment with one safekeeper. This is equivalent to the built-in +# Minimal neon environment with one safekeeper. This is equivalent to the built-in # defaults that you get with no --config [pageserver] listen_pg_addr = '127.0.0.1:64000' diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index e3160db53b..e16fd8764a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -27,10 +27,10 @@ use std::process::exit; use std::str::FromStr; use utils::{ auth::{Claims, Scope}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, project_git_version, - zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; // Default id of a safekeeper node, if not specified on the command line. @@ -72,7 +72,7 @@ struct TimelineTreeEl { /// Name, recovered from neon config mappings pub name: Option, /// Holds all direct children of this timeline referenced using `timeline_id`. - pub children: BTreeSet, + pub children: BTreeSet, } // Main entry point for the 'neon_local' CLI utility @@ -321,7 +321,7 @@ fn main() -> Result<()> { /// fn print_timelines_tree( timelines: Vec, - mut timeline_name_mappings: HashMap, + mut timeline_name_mappings: HashMap, ) -> Result<()> { let mut timelines_hash = timelines .iter() @@ -332,7 +332,7 @@ fn print_timelines_tree( info: t.clone(), children: BTreeSet::new(), name: timeline_name_mappings - .remove(&ZTenantTimelineId::new(t.tenant_id, t.timeline_id)), + .remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)), }, ) }) @@ -374,7 +374,7 @@ fn print_timeline( nesting_level: usize, is_last: &[bool], timeline: &TimelineTreeEl, - timelines: &HashMap, + timelines: &HashMap, ) -> Result<()> { let local_remote = match (timeline.info.local.as_ref(), timeline.info.remote.as_ref()) { (None, None) => unreachable!("in this case no info for a timeline is found"), @@ -452,8 +452,8 @@ fn print_timeline( /// Connects to the pageserver to query this information. fn get_timeline_infos( env: &local_env::LocalEnv, - tenant_id: &ZTenantId, -) -> Result> { + tenant_id: &TenantId, +) -> Result> { Ok(PageServerNode::from_env(env) .timeline_list(tenant_id)? 
.into_iter() @@ -462,7 +462,7 @@ fn get_timeline_infos( } // Helper function to parse --tenant_id option, or get the default from config file -fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result { +fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result { if let Some(tenant_id_from_arguments) = parse_tenant_id(sub_match).transpose() { tenant_id_from_arguments } else if let Some(default_id) = env.default_tenant_id { @@ -472,18 +472,18 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R } } -fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { +fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { sub_match .value_of("tenant-id") - .map(ZTenantId::from_str) + .map(TenantId::from_str) .transpose() .context("Failed to parse tenant id from the argument string") } -fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { +fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { sub_match .value_of("timeline-id") - .map(ZTimelineId::from_str) + .map(TimelineId::from_str) .transpose() .context("Failed to parse timeline id from the argument string") } @@ -504,9 +504,9 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { let mut env = LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; env.init().context("Failed to initialize neon repository")?; - - // default_tenantid was generated by the `env.init()` call above - let initial_tenant_id = env.default_tenant_id.unwrap(); + let initial_tenant_id = env + .default_tenant_id + .expect("default_tenant_id should be generated by the `env.init()` call above"); // Initialize pageserver, create initial tenant and timeline. let pageserver = PageServerNode::from_env(&env); @@ -759,7 +759,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { }; let branch_name = timeline_name_mappings - .get(&ZTenantTimelineId::new(tenant_id, node.timeline_id)) + .get(&TenantTimelineId::new(tenant_id, node.timeline_id)) .map(|name| name.as_str()) .unwrap_or("?"); @@ -810,7 +810,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let node = cplane.nodes.get(&(tenant_id, node_name.to_owned())); - let auth_token = if matches!(env.pageserver.auth_type, AuthType::ZenithJWT) { + let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) { let claims = Claims::new(Some(tenant_id), Scope::Tenant); Some(env.generate_auth_token(&claims)?) 
diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 57b5e1e10a..b678d620df 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -13,9 +13,9 @@ use std::time::Duration; use anyhow::{Context, Result}; use utils::{ connstring::connection_host_port, + id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, - zid::{ZTenantId, ZTimelineId}, }; use crate::local_env::LocalEnv; @@ -28,7 +28,7 @@ use crate::storage::PageServerNode; pub struct ComputeControlPlane { base_port: u16, pageserver: Arc, - pub nodes: BTreeMap<(ZTenantId, String), Arc>, + pub nodes: BTreeMap<(TenantId, String), Arc>, env: LocalEnv, } @@ -76,9 +76,9 @@ impl ComputeControlPlane { pub fn new_node( &mut self, - tenant_id: ZTenantId, + tenant_id: TenantId, name: &str, - timeline_id: ZTimelineId, + timeline_id: TimelineId, lsn: Option, port: Option, ) -> Result> { @@ -114,9 +114,9 @@ pub struct PostgresNode { pub env: LocalEnv, pageserver: Arc, is_test: bool, - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, pub lsn: Option, // if it's a read-only node. None for primary - pub tenant_id: ZTenantId, + pub tenant_id: TenantId, uses_wal_proposer: bool, } @@ -148,8 +148,8 @@ impl PostgresNode { // Read a few options from the config file let context = format!("in config file {}", cfg_path_str); let port: u16 = conf.parse_field("port", &context)?; - let timeline_id: ZTimelineId = conf.parse_field("neon.timeline_id", &context)?; - let tenant_id: ZTenantId = conf.parse_field("neon.tenant_id", &context)?; + let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?; + let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; let uses_wal_proposer = conf.get("neon.safekeepers").is_some(); // parse recovery_target_lsn, if any @@ -292,7 +292,7 @@ impl PostgresNode { // variable during compute pg startup. It is done this way because // otherwise user will be able to retrieve the value using SHOW // command or pg_settings - let password = if let AuthType::ZenithJWT = auth_type { + let password = if let AuthType::NeonJWT = auth_type { "$ZENITH_AUTH_TOKEN" } else { "" @@ -301,7 +301,7 @@ impl PostgresNode { // Also note that not all parameters are supported here. Because in compute we substitute $ZENITH_AUTH_TOKEN // We parse this string and build it back with token from env var, and for simplicity rebuild // uses only needed variables namely host, port, user, password. - format!("postgresql://no_user:{}@{}:{}", password, host, port) + format!("postgresql://no_user:{password}@{host}:{port}") }; conf.append("shared_preload_libraries", "neon"); conf.append_line(""); diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index c4a61dbd7b..7afaad26dc 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -14,8 +14,8 @@ use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use utils::{ auth::{encode_from_key_file, Claims, Scope}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, postgres_backend::AuthType, - zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use crate::safekeeper::SafekeeperNode; @@ -48,13 +48,13 @@ pub struct LocalEnv { // Path to pageserver binary. #[serde(default)] - pub zenith_distrib_dir: PathBuf, + pub neon_distrib_dir: PathBuf, - // Default tenant ID to use with the 'zenith' command line utility, when - // --tenantid is not explicitly specified. 
+ // Default tenant ID to use with the 'neon_local' command line utility, when + // --tenant_id is not explicitly specified. #[serde(default)] #[serde_as(as = "Option")] - pub default_tenant_id: Option, + pub default_tenant_id: Option, // used to issue tokens during e.g pg start #[serde(default)] @@ -69,11 +69,11 @@ pub struct LocalEnv { /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. #[serde(default)] - // A `HashMap>` would be more appropriate here, + // A `HashMap>` would be more appropriate here, // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". #[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")] - branch_name_mappings: HashMap>, + branch_name_mappings: HashMap>, } /// Etcd broker config for cluster internal communication. @@ -204,20 +204,20 @@ impl LocalEnv { } pub fn pageserver_bin(&self) -> anyhow::Result { - Ok(self.zenith_distrib_dir.join("pageserver")) + Ok(self.neon_distrib_dir.join("pageserver")) } pub fn safekeeper_bin(&self) -> anyhow::Result { - Ok(self.zenith_distrib_dir.join("safekeeper")) + Ok(self.neon_distrib_dir.join("safekeeper")) } pub fn pg_data_dirs_path(&self) -> PathBuf { self.base_data_dir.join("pgdatadirs").join("tenants") } - pub fn pg_data_dir(&self, tenantid: &ZTenantId, branch_name: &str) -> PathBuf { + pub fn pg_data_dir(&self, tenant_id: &TenantId, branch_name: &str) -> PathBuf { self.pg_data_dirs_path() - .join(tenantid.to_string()) + .join(tenant_id.to_string()) .join(branch_name) } @@ -233,8 +233,8 @@ impl LocalEnv { pub fn register_branch_mapping( &mut self, branch_name: String, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, ) -> anyhow::Result<()> { let existing_values = self .branch_name_mappings @@ -260,22 +260,22 @@ impl LocalEnv { pub fn get_branch_timeline_id( &self, branch_name: &str, - tenant_id: ZTenantId, - ) -> Option { + tenant_id: TenantId, + ) -> Option { self.branch_name_mappings .get(branch_name)? .iter() .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) .map(|&(_, timeline_id)| timeline_id) - .map(ZTimelineId::from) + .map(TimelineId::from) } - pub fn timeline_name_mappings(&self) -> HashMap { + pub fn timeline_name_mappings(&self) -> HashMap { self.branch_name_mappings .iter() .flat_map(|(name, tenant_timelines)| { tenant_timelines.iter().map(|&(tenant_id, timeline_id)| { - (ZTenantTimelineId::new(tenant_id, timeline_id), name.clone()) + (TenantTimelineId::new(tenant_id, timeline_id), name.clone()) }) }) .collect() @@ -299,14 +299,14 @@ impl LocalEnv { } } - // Find zenith binaries. - if env.zenith_distrib_dir == Path::new("") { - env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); + // Find neon binaries. + if env.neon_distrib_dir == Path::new("") { + env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); } // If no initial tenant ID was given, generate it. if env.default_tenant_id.is_none() { - env.default_tenant_id = Some(ZTenantId::generate()); + env.default_tenant_id = Some(TenantId::generate()); } env.base_data_dir = base_path(); @@ -320,12 +320,12 @@ impl LocalEnv { if !repopath.exists() { bail!( - "Zenith config is not found in {}. You need to run 'neon_local init' first", + "Neon config is not found in {}. 
You need to run 'neon_local init' first", repopath.to_str().unwrap() ); } - // TODO: check that it looks like a zenith repository + // TODO: check that it looks like a neon repository // load and parse file let config = fs::read_to_string(repopath.join("config"))?; @@ -404,10 +404,10 @@ impl LocalEnv { ); } for binary in ["pageserver", "safekeeper"] { - if !self.zenith_distrib_dir.join(binary).exists() { + if !self.neon_distrib_dir.join(binary).exists() { bail!( - "Can't find binary '{binary}' in zenith distrib dir '{}'", - self.zenith_distrib_dir.display() + "Can't find binary '{binary}' in neon distrib dir '{}'", + self.neon_distrib_dir.display() ); } } diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs index a71108da01..34dc769e78 100644 --- a/control_plane/src/postgresql_conf.rs +++ b/control_plane/src/postgresql_conf.rs @@ -2,7 +2,7 @@ /// Module for parsing postgresql.conf file. /// /// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just -/// enough to extract a few settings we need in Zenith, assuming you don't do +/// enough to extract a few settings we need in Neon, assuming you don't do /// funny stuff like include-directives or funny escaping. use anyhow::{bail, Context, Result}; use once_cell::sync::Lazy; diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 2cc1ae7853..600a9ffe05 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -17,7 +17,7 @@ use thiserror::Error; use utils::{ connstring::connection_address, http::error::HttpErrorBody, - zid::{NodeId, ZTenantId, ZTimelineId}, + id::{NodeId, TenantId, TimelineId}, }; use crate::local_env::{LocalEnv, SafekeeperConf}; @@ -269,7 +269,7 @@ impl SafekeeperNode { fn http_request(&self, method: Method, url: U) -> RequestBuilder { // TODO: authentication - //if self.env.auth_type == AuthType::ZenithJWT { + //if self.env.auth_type == AuthType::NeonJWT { // builder = builder.bearer_auth(&self.env.safekeeper_auth_token) //} self.http_client.request(method, url) @@ -284,8 +284,8 @@ impl SafekeeperNode { pub fn timeline_create( &self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, peer_ids: Vec, ) -> Result<()> { Ok(self diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 9fdab5f88c..d2cc5e096c 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -21,9 +21,9 @@ use thiserror::Error; use utils::{ connstring::connection_address, http::error::HttpErrorBody, + id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, - zid::{ZTenantId, ZTimelineId}, }; use crate::local_env::LocalEnv; @@ -83,7 +83,7 @@ pub struct PageServerNode { impl PageServerNode { pub fn from_env(env: &LocalEnv) -> PageServerNode { - let password = if env.pageserver.auth_type == AuthType::ZenithJWT { + let password = if env.pageserver.auth_type == AuthType::NeonJWT { &env.pageserver.auth_token } else { "" @@ -109,10 +109,10 @@ impl PageServerNode { pub fn initialize( &self, - create_tenant: Option, - initial_timeline_id: Option, + create_tenant: Option, + initial_timeline_id: Option, config_overrides: &[&str], - ) -> anyhow::Result { + ) -> anyhow::Result { let id = format!("id={}", self.env.pageserver.id); // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. 
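
The control-plane client code being renamed here only attaches credentials when the pageserver is configured with the renamed `AuthType::NeonJWT` variant. As an illustration only (not taken from the diff, and assuming a blocking `reqwest` client like the one `control_plane` already uses), a minimal sketch of that pattern; the helper name and parameters are hypothetical:

```rust
// Illustrative sketch only: mirrors the `http_request` helpers in this patch,
// which add a bearer token only when the pageserver uses `AuthType::NeonJWT`.
use reqwest::blocking::{Client, RequestBuilder};
use reqwest::Method;
use utils::postgres_backend::AuthType;

// Hypothetical helper, not part of the patch.
fn authed_request(
    client: &Client,
    method: Method,
    url: &str,
    auth_type: AuthType,
    jwt: &str,
) -> RequestBuilder {
    let builder = client.request(method, url);
    if auth_type == AuthType::NeonJWT {
        // The token is the JWT generated and stored in the config at init time.
        builder.bearer_auth(jwt)
    } else {
        builder
    }
}
```
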
let pg_distrib_dir_param = @@ -173,9 +173,9 @@ impl PageServerNode { fn try_init_timeline( &self, - new_tenant_id: Option, - new_timeline_id: Option, - ) -> anyhow::Result { + new_tenant_id: Option, + new_timeline_id: Option, + ) -> anyhow::Result { let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?; let initial_timeline_info = self.timeline_create(initial_tenant_id, new_timeline_id, None, None)?; @@ -345,7 +345,7 @@ impl PageServerNode { fn http_request(&self, method: Method, url: U) -> RequestBuilder { let mut builder = self.http_client.request(method, url); - if self.env.pageserver.auth_type == AuthType::ZenithJWT { + if self.env.pageserver.auth_type == AuthType::NeonJWT { builder = builder.bearer_auth(&self.env.pageserver.auth_token) } builder @@ -368,9 +368,9 @@ impl PageServerNode { pub fn tenant_create( &self, - new_tenant_id: Option, + new_tenant_id: Option, settings: HashMap<&str, &str>, - ) -> anyhow::Result { + ) -> anyhow::Result { self.http_request(Method::POST, format!("{}/tenant", self.http_base_url)) .json(&TenantCreateRequest { new_tenant_id, @@ -422,7 +422,7 @@ impl PageServerNode { }) } - pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> { + pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> { self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url)) .json(&TenantConfigRequest { tenant_id, @@ -471,7 +471,7 @@ impl PageServerNode { Ok(()) } - pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result> { + pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result> { let timeline_infos: Vec = self .http_request( Method::GET, @@ -486,10 +486,10 @@ impl PageServerNode { pub fn timeline_create( &self, - tenant_id: ZTenantId, - new_timeline_id: Option, + tenant_id: TenantId, + new_timeline_id: Option, ancestor_start_lsn: Option, - ancestor_timeline_id: Option, + ancestor_timeline_id: Option, ) -> anyhow::Result { self.http_request( Method::POST, @@ -524,8 +524,8 @@ impl PageServerNode { /// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`) pub fn timeline_import( &self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, base: (Lsn, PathBuf), pg_wal: Option<(Lsn, PathBuf)>, ) -> anyhow::Result<()> { diff --git a/docs/authentication.md b/docs/authentication.md index 7200ffc62f..9748a7ab0d 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -2,14 +2,14 @@ ### Overview -Current state of authentication includes usage of JWT tokens in communication between compute and pageserver and between CLI and pageserver. JWT token is signed using RSA keys. CLI generates a key pair during call to `zenith init`. Using following openssl commands: +Current state of authentication includes usage of JWT tokens in communication between compute and pageserver and between CLI and pageserver. JWT token is signed using RSA keys. CLI generates a key pair during call to `neon_local init`. Using following openssl commands: ```bash openssl genrsa -out private_key.pem 2048 openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem ``` -CLI also generates signed token and saves it in the config for later access to pageserver. Now authentication is optional. Pageserver has two variables in config: `auth_validation_public_key_path` and `auth_type`, so when auth type present and set to `ZenithJWT` pageserver will require authentication for connections. 
Actual JWT is passed in password field of connection string. There is a caveat for psql, it silently truncates passwords to 100 symbols, so to correctly pass JWT via psql you have to either use PGPASSWORD environment variable, or store password in psql config file. +CLI also generates signed token and saves it in the config for later access to pageserver. Now authentication is optional. Pageserver has two variables in config: `auth_validation_public_key_path` and `auth_type`, so when auth type present and set to `NeonJWT` pageserver will require authentication for connections. Actual JWT is passed in password field of connection string. There is a caveat for psql, it silently truncates passwords to 100 symbols, so to correctly pass JWT via psql you have to either use PGPASSWORD environment variable, or store password in psql config file. Currently there is no authentication between compute and safekeepers, because this communication layer is under heavy refactoring. After this refactoring support for authentication will be added there too. Now safekeeper supports "hardcoded" token passed via environment variable to be able to use callmemaybe command in pageserver. diff --git a/docs/multitenancy.md b/docs/multitenancy.md index c697ae93cd..35c69e69a1 100644 --- a/docs/multitenancy.md +++ b/docs/multitenancy.md @@ -2,26 +2,26 @@ ### Overview -Zenith supports multitenancy. One pageserver can serve multiple tenants at once. Tenants can be managed via zenith CLI. During page server setup tenant can be created using ```zenith init --create-tenant``` Also tenants can be added into the system on the fly without pageserver restart. This can be done using the following cli command: ```zenith tenant create``` Tenants use random identifiers which can be represented as a 32 symbols hexadecimal string. So zenith tenant create accepts desired tenant id as an optional argument. The concept of timelines/branches is working independently per tenant. +Neon supports multitenancy. One pageserver can serve multiple tenants at once. Tenants can be managed via neon_local CLI. During page server setup tenant can be created using ```neon_local init --create-tenant``` Also tenants can be added into the system on the fly without pageserver restart. This can be done using the following cli command: ```neon_local tenant create``` Tenants use random identifiers which can be represented as a 32 symbols hexadecimal string. So neon_local tenant create accepts desired tenant id as an optional argument. The concept of timelines/branches is working independently per tenant. ### Tenants in other commands -By default during `zenith init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenantid=` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants. +By default during `neon_local init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenant_id=` is provided. So generally tenant_id more frequently appears in internal pageserver interface. Its commands take tenant_id argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants. 
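
For readers following the ID rename, a minimal sketch (illustration only, not taken from the diff) of how a 32-character hex tenant ID is parsed or generated with the renamed `TenantId` type from `utils::id`, mirroring the `parse_tenant_id` helper changed earlier in this patch; the function name here is made up:

```rust
// Illustrative sketch only: parse `--tenant_id` if given, otherwise generate
// a fresh random ID, as `neon_local tenant create` does without an argument.
use std::str::FromStr;
use utils::id::TenantId;

// Hypothetical helper, not part of the patch.
fn tenant_id_from_arg(arg: Option<&str>) -> anyhow::Result<TenantId> {
    match arg {
        // 32 hex characters, e.g. "ee6016ec31116c1b7c33dfdfca38892f".
        Some(s) => Ok(TenantId::from_str(s)?),
        None => Ok(TenantId::generate()),
    }
}
```
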
Examples for cli: ```sh -zenith tenant list +neon_local tenant list -zenith tenant create // generates new id +neon_local tenant create // generates new id -zenith tenant create ee6016ec31116c1b7c33dfdfca38892f +neon_local tenant create ee6016ec31116c1b7c33dfdfca38892f -zenith pg create main // default tenant from zenith init +neon_local pg create main // default tenant from neon init -zenith pg create main --tenantid=ee6016ec31116c1b7c33dfdfca38892f +neon_local pg create main --tenant_id=ee6016ec31116c1b7c33dfdfca38892f -zenith branch --tenantid=ee6016ec31116c1b7c33dfdfca38892f +neon_local branch --tenant_id=ee6016ec31116c1b7c33dfdfca38892f ``` ### Data layout @@ -56,4 +56,4 @@ Tenant id is passed to postgres via GUC the same way as the timeline. Tenant id ### Safety -For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline). +For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenant_id, timeline_id) pair so there can only be one writer for particular (tenant_id, timeline_id). diff --git a/docs/pageserver-services.md b/docs/pageserver-services.md index 07a91f543d..fc259c8a5f 100644 --- a/docs/pageserver-services.md +++ b/docs/pageserver-services.md @@ -109,7 +109,7 @@ Repository The repository stores all the page versions, or WAL records needed to reconstruct them. Each tenant has a separate Repository, which is -stored in the .neon/tenants/ directory. +stored in the .neon/tenants/ directory. Repository is an abstract trait, defined in `repository.rs`. It is implemented by the LayeredRepository object in diff --git a/docs/pageserver-storage.md b/docs/pageserver-storage.md index 8d03e68ac7..77e7ff35bc 100644 --- a/docs/pageserver-storage.md +++ b/docs/pageserver-storage.md @@ -123,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and a range of LSNs (or a single LSN, in case of image layers). You can think of it as a rectangle in the two-dimensional key-LSN space. The layer files for each timeline are stored in the timeline's subdirectory under -`.neon/tenants//timelines`. +`.neon/tenants//timelines`. There are two kind of layer files: images, and delta layers. An image file contains a snapshot of all keys at a particular LSN, whereas a delta file @@ -351,7 +351,7 @@ branch. Note: It doesn't make any difference if the child branch is created when the end of the main branch was at LSN 250, or later when the tip of the main branch had already moved on. The latter case, creating a -branch at a historic LSN, is how we support PITR in Zenith. +branch at a historic LSN, is how we support PITR in Neon. # Garbage collection @@ -396,9 +396,9 @@ table: main/orders_200_300 DELETE main/orders_300 STILL NEEDED BY orders_300_400 main/orders_300_400 KEEP, NEWER THAN GC HORIZON - main/orders_400 .. - main/orders_400_500 .. - main/orders_500 .. + main/orders_400 .. + main/orders_400_500 .. + main/orders_500 .. 
main/customers_100 DELETE main/customers_100_200 DELETE main/customers_200 KEEP, NO NEWER VERSION diff --git a/docs/pageserver-tenant-migration.md b/docs/pageserver-tenant-migration.md index a846213ab2..5fb2097030 100644 --- a/docs/pageserver-tenant-migration.md +++ b/docs/pageserver-tenant-migration.md @@ -9,7 +9,7 @@ This feature allows to migrate a timeline from one pageserver to another by util Pageserver implements two new http handlers: timeline attach and timeline detach. Timeline migration is performed in a following way: 1. Timeline attach is called on a target pageserver. This asks pageserver to download latest checkpoint uploaded to s3. -2. For now it is necessary to manually initialize replication stream via callmemaybe call so target pageserver initializes replication from safekeeper (it is desired to avoid this and initialize replication directly in attach handler, but this requires some refactoring (probably [#997](https://github.com/zenithdb/zenith/issues/997)/[#1049](https://github.com/zenithdb/zenith/issues/1049)) +2. For now it is necessary to manually initialize replication stream via callmemaybe call so target pageserver initializes replication from safekeeper (it is desired to avoid this and initialize replication directly in attach handler, but this requires some refactoring (probably [#997](https://github.com/neondatabase/neon/issues/997)/[#1049](https://github.com/neondatabase/neon/issues/1049)) 3. Replication state can be tracked via timeline detail pageserver call. 4. Compute node should be restarted with new pageserver connection string. Issue with multiple compute nodes for one timeline is handled on the safekeeper consensus level. So this is not a problem here.Currently responsibility for rescheduling the compute with updated config lies on external coordinator (console). 5. Timeline is detached from old pageserver. On disk data is removed. @@ -18,5 +18,5 @@ Timeline migration is performed in a following way: ### Implementation details Now safekeeper needs to track which pageserver it is replicating to. This introduces complications into replication code: -* We need to distinguish different pageservers (now this is done by connection string which is imperfect and is covered here: https://github.com/zenithdb/zenith/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented). +* We need to distinguish different pageservers (now this is done by connection string which is imperfect and is covered here: https://github.com/neondatabase/neon/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented). * We need to track which pageserver is the primary. This is needed to avoid reconnections to non primary pageservers. Because we shouldn't reconnect to them when they decide to stop their walreceiver. I e this can appear when there is a load on the compute and we are trying to detach timeline from old pageserver. In this case callmemaybe will try to reconnect to it because replication termination condition is not met (page server with active compute could never catch up to the latest lsn, so there is always some wal tail) diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md index 59833526c5..7e815abf73 100644 --- a/docs/rfcs/013-term-history.md +++ b/docs/rfcs/013-term-history.md @@ -70,7 +70,7 @@ two options. ...start sending WAL conservatively since the horizon (1.1), and truncate obsolete part of WAL only when recovery is finished, i.e. 
epochStartLsn (4) is -reached, i.e. 2.3 transferred -- that's what https://github.com/zenithdb/zenith/pull/505 proposes. +reached, i.e. 2.3 transferred -- that's what https://github.com/neondatabase/neon/pull/505 proposes. Then the following is possible: diff --git a/docs/rfcs/cluster-size-limits.md b/docs/rfcs/cluster-size-limits.md index bd4cb9ef32..4ef006d9a6 100644 --- a/docs/rfcs/cluster-size-limits.md +++ b/docs/rfcs/cluster-size-limits.md @@ -15,7 +15,7 @@ The stateless compute node that performs validation is separate from the storage Limit the maximum size of a PostgreSQL instance to limit free tier users (and other tiers in the future). First of all, this is needed to control our free tier production costs. -Another reason to limit resources is risk management — we haven't (fully) tested and optimized zenith for big clusters, +Another reason to limit resources is risk management — we haven't (fully) tested and optimized neon for big clusters, so we don't want to give users access to the functionality that we don't think is ready. ## Components @@ -43,20 +43,20 @@ Then this size should be reported to compute node. `current_timeline_size` value is included in the walreceiver's custom feedback message: `ReplicationFeedback.` -(PR about protocol changes https://github.com/zenithdb/zenith/pull/1037). +(PR about protocol changes https://github.com/neondatabase/neon/pull/1037). This message is received by the safekeeper and propagated to compute node as a part of `AppendResponse`. Finally, when compute node receives the `current_timeline_size` from safekeeper (or from pageserver directly), it updates the global variable. -And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > neon.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so. +And then every neon_extend() operation checks if limit is reached `(current_timeline_size > neon.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so. (see Postgres error codes [https://www.postgresql.org/docs/devel/errcodes-appendix.html](https://www.postgresql.org/docs/devel/errcodes-appendix.html)) TODO: We can allow autovacuum processes to bypass this check, simply checking `IsAutoVacuumWorkerProcess()`. It would be nice to allow manual VACUUM and VACUUM FULL to bypass the check, but it's uneasy to distinguish these operations at the low level. See issues https://github.com/neondatabase/neon/issues/1245 -https://github.com/zenithdb/zenith/issues/1445 +https://github.com/neondatabase/neon/issues/1445 TODO: We should warn users if the limit is soon to be reached. diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 339a90e0ba..c1a860f126 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -10,7 +10,7 @@ Intended to be used in integration tests and in CLI tools for local installation `/docs`: -Documentation of the Zenith features and concepts. +Documentation of the Neon features and concepts. Now it is mostly dev documentation. `/monitoring`: @@ -19,7 +19,7 @@ TODO `/pageserver`: -Zenith storage service. +Neon storage service. The pageserver has a few different duties: - Store and manage the data. @@ -54,7 +54,7 @@ PostgreSQL extension that contains functions needed for testing and debugging. `/safekeeper`: -The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver. +The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver. 
It acts as a holding area and redistribution center for recently generated WAL. For more detailed info, see [walservice.md](./walservice.md) @@ -64,11 +64,6 @@ The workspace_hack crate exists only to pin down some dependencies. We use [cargo-hakari](https://crates.io/crates/cargo-hakari) for automation. -`/zenith` - -Main entry point for the 'zenith' CLI utility. -TODO: Doesn't it belong to control_plane? - `/libs`: Unites granular neon helper crates under the hood. diff --git a/libs/etcd_broker/src/subscription_key.rs b/libs/etcd_broker/src/subscription_key.rs index 8f8579f4e5..a11d2ab106 100644 --- a/libs/etcd_broker/src/subscription_key.rs +++ b/libs/etcd_broker/src/subscription_key.rs @@ -11,7 +11,7 @@ use std::{fmt::Display, str::FromStr}; use once_cell::sync::Lazy; use regex::{Captures, Regex}; -use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId}; +use utils::id::{NodeId, TenantId, TenantTimelineId}; /// The subscription kind to the timeline updates from safekeeper. #[derive(Debug, Clone, PartialEq, Eq, Hash)] @@ -30,13 +30,13 @@ pub enum SubscriptionKind { /// Get every update in etcd. All, /// Get etcd updates for any timeiline of a certain tenant, affected by any operation from any node kind. - TenantTimelines(ZTenantId), + TenantTimelines(TenantId), /// Get etcd updates for a certain timeline of a tenant, affected by any operation from any node kind. - Timeline(ZTenantTimelineId), + Timeline(TenantTimelineId), /// Get etcd timeline updates, specific to a certain node kind. - Node(ZTenantTimelineId, NodeKind), + Node(TenantTimelineId, NodeKind), /// Get etcd timeline updates for a certain operation on specific nodes. - Operation(ZTenantTimelineId, NodeKind, OperationKind), + Operation(TenantTimelineId, NodeKind, OperationKind), } /// All kinds of nodes, able to write into etcd. @@ -67,7 +67,7 @@ static SUBSCRIPTION_FULL_KEY_REGEX: Lazy = Lazy::new(|| { /// No other etcd keys are considered during system's work. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct SubscriptionFullKey { - pub id: ZTenantTimelineId, + pub id: TenantTimelineId, pub node_kind: NodeKind, pub operation: OperationKind, pub node_id: NodeId, @@ -83,7 +83,7 @@ impl SubscriptionKey { } /// Subscribes to a given timeline info updates from safekeepers. - pub fn sk_timeline_info(cluster_prefix: String, timeline: ZTenantTimelineId) -> Self { + pub fn sk_timeline_info(cluster_prefix: String, timeline: TenantTimelineId) -> Self { Self { cluster_prefix, kind: SubscriptionKind::Operation( @@ -97,7 +97,7 @@ impl SubscriptionKey { /// Subscribes to all timeine updates during specific operations, running on the corresponding nodes. 
pub fn operation( cluster_prefix: String, - timeline: ZTenantTimelineId, + timeline: TenantTimelineId, node_kind: NodeKind, operation: OperationKind, ) -> Self { @@ -175,7 +175,7 @@ impl FromStr for SubscriptionFullKey { }; Ok(Self { - id: ZTenantTimelineId::new( + id: TenantTimelineId::new( parse_capture(&key_captures, 1)?, parse_capture(&key_captures, 2)?, ), @@ -247,7 +247,7 @@ impl FromStr for SkOperationKind { #[cfg(test)] mod tests { - use utils::zid::ZTimelineId; + use utils::id::TimelineId; use super::*; @@ -256,9 +256,9 @@ mod tests { let prefix = "neon"; let node_kind = NodeKind::Safekeeper; let operation_kind = OperationKind::Safekeeper(SkOperationKind::WalBackup); - let tenant_id = ZTenantId::generate(); - let timeline_id = ZTimelineId::generate(); - let id = ZTenantTimelineId::new(tenant_id, timeline_id); + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + let id = TenantTimelineId::new(tenant_id, timeline_id); let node_id = NodeId(1); let timeline_subscription_keys = [ diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 5b9ecb7394..2b453fa0dc 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -21,7 +21,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" } [dev-dependencies] env_logger = "0.9" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } wal_craft = { path = "wal_craft" } [build-dependencies] diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index 114f08113b..f848ac1273 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -11,6 +11,6 @@ clap = "3.0" env_logger = "0.9" log = "0.4" once_cell = "1.13.0" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres_ffi = { path = "../" } tempfile = "3.2" diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index ce55277f29..ef2aa8b305 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -10,8 +10,8 @@ bincode = "1.3" bytes = "1.0.1" hyper = { version = "0.14.7", features = ["full"] } pin-project-lite = "0.2.7" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } routerify = "3" serde = { version = "1.0", features = ["derive"] } serde_json = "1" diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 0339939934..badcb5774e 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,11 +1,11 @@ #![allow(unused)] use criterion::{criterion_group, criterion_main, Criterion}; -use utils::zid; +use utils::id; pub fn bench_zid_stringify(c: &mut Criterion) { // Can only use public methods. 
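
The etcd broker hunks above switch the subscription keys over to the renamed ID types. Purely as an illustration (not part of the patch, and assuming the module is exported as `etcd_broker::subscription_key`), a minimal sketch of building the safekeeper timeline-info key with `SubscriptionKey::sk_timeline_info`:

```rust
// Illustrative sketch only: subscribe to safekeeper timeline info updates
// for one tenant/timeline pair, using the renamed ID types.
use etcd_broker::subscription_key::SubscriptionKey;
use utils::id::{TenantId, TenantTimelineId, TimelineId};

// Hypothetical helper, not part of the patch.
fn timeline_info_key(cluster_prefix: &str) -> SubscriptionKey {
    let id = TenantTimelineId::new(TenantId::generate(), TimelineId::generate());
    SubscriptionKey::sk_timeline_info(cluster_prefix.to_string(), id)
}
```
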
- let ztl = zid::ZTenantTimelineId::generate(); + let ztl = id::TenantTimelineId::generate(); c.bench_function("zid.to_string", |b| { b.iter(|| { diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 3bdabacad4..b190b0d1c5 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -14,7 +14,7 @@ use jsonwebtoken::{ use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; -use crate::zid::ZTenantId; +use crate::id::TenantId; const JWT_ALGORITHM: Algorithm = Algorithm::RS256; @@ -30,23 +30,23 @@ pub enum Scope { pub struct Claims { #[serde(default)] #[serde_as(as = "Option")] - pub tenant_id: Option, + pub tenant_id: Option, pub scope: Scope, } impl Claims { - pub fn new(tenant_id: Option, scope: Scope) -> Self { + pub fn new(tenant_id: Option, scope: Scope) -> Self { Self { tenant_id, scope } } } -pub fn check_permission(claims: &Claims, tenantid: Option) -> Result<()> { - match (&claims.scope, tenantid) { +pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<()> { + match (&claims.scope, tenant_id) { (Scope::Tenant, None) => { bail!("Attempt to access management api with tenant scope. Permission denied") } - (Scope::Tenant, Some(tenantid)) => { - if claims.tenant_id.unwrap() != tenantid { + (Scope::Tenant, Some(tenant_id)) => { + if claims.tenant_id.unwrap() != tenant_id { bail!("Tenant id mismatch. Permission denied") } Ok(()) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 69bf5ef87a..4066791e2b 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,6 +1,6 @@ use crate::auth::{self, Claims, JwtAuth}; use crate::http::error; -use crate::zid::ZTenantId; +use crate::id::TenantId; use anyhow::anyhow; use hyper::header::AUTHORIZATION; use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server}; @@ -137,9 +137,9 @@ pub fn auth_middleware( }) } -pub fn check_permission(req: &Request, tenantid: Option) -> Result<(), ApiError> { +pub fn check_permission(req: &Request, tenant_id: Option) -> Result<(), ApiError> { match req.context::() { - Some(claims) => Ok(auth::check_permission(&claims, tenantid) + Some(claims) => Ok(auth::check_permission(&claims, tenant_id) .map_err(|err| ApiError::Forbidden(err.to_string()))?), None => Ok(()), // claims is None because auth is disabled } diff --git a/libs/utils/src/http/mod.rs b/libs/utils/src/http/mod.rs index 0bb53ef51d..74ed6bb5b2 100644 --- a/libs/utils/src/http/mod.rs +++ b/libs/utils/src/http/mod.rs @@ -3,6 +3,6 @@ pub mod error; pub mod json; pub mod request; -/// Current fast way to apply simple http routing in various Zenith binaries. +/// Current fast way to apply simple http routing in various Neon binaries. /// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed. pub use routerify::{ext::RequestExt, RouterBuilder, RouterService}; diff --git a/libs/utils/src/zid.rs b/libs/utils/src/id.rs similarity index 76% rename from libs/utils/src/zid.rs rename to libs/utils/src/id.rs index 6da5355f61..059ce69ca4 100644 --- a/libs/utils/src/zid.rs +++ b/libs/utils/src/id.rs @@ -4,7 +4,7 @@ use hex::FromHex; use rand::Rng; use serde::{Deserialize, Serialize}; -/// Zenith ID is a 128-bit random ID. +/// Neon ID is a 128-bit random ID. /// Used to represent various identifiers. Provides handy utility methods and impls. 
/// /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look @@ -13,13 +13,13 @@ use serde::{Deserialize, Serialize}; /// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. /// Check the `serde_with::serde_as` documentation for options for more complex types. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] -struct ZId([u8; 16]); +struct Id([u8; 16]); -impl ZId { - pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZId { +impl Id { + pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> Id { let mut arr = [0u8; 16]; buf.copy_to_slice(&mut arr); - ZId::from(arr) + Id::from(arr) } pub fn as_arr(&self) -> [u8; 16] { @@ -29,7 +29,7 @@ impl ZId { pub fn generate() -> Self { let mut tli_buf = [0u8; 16]; rand::thread_rng().fill(&mut tli_buf); - ZId::from(tli_buf) + Id::from(tli_buf) } fn hex_encode(&self) -> String { @@ -44,54 +44,54 @@ impl ZId { } } -impl FromStr for ZId { +impl FromStr for Id { type Err = hex::FromHexError; - fn from_str(s: &str) -> Result { + fn from_str(s: &str) -> Result { Self::from_hex(s) } } -// this is needed for pretty serialization and deserialization of ZId's using serde integration with hex crate -impl FromHex for ZId { +// this is needed for pretty serialization and deserialization of Id's using serde integration with hex crate +impl FromHex for Id { type Error = hex::FromHexError; fn from_hex>(hex: T) -> Result { let mut buf: [u8; 16] = [0u8; 16]; hex::decode_to_slice(hex, &mut buf)?; - Ok(ZId(buf)) + Ok(Id(buf)) } } -impl AsRef<[u8]> for ZId { +impl AsRef<[u8]> for Id { fn as_ref(&self) -> &[u8] { &self.0 } } -impl From<[u8; 16]> for ZId { +impl From<[u8; 16]> for Id { fn from(b: [u8; 16]) -> Self { - ZId(b) + Id(b) } } -impl fmt::Display for ZId { +impl fmt::Display for Id { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(&self.hex_encode()) } } -impl fmt::Debug for ZId { +impl fmt::Debug for Id { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(&self.hex_encode()) } } -macro_rules! zid_newtype { +macro_rules! id_newtype { ($t:ident) => { impl $t { pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> $t { - $t(ZId::get_from_buf(buf)) + $t(Id::get_from_buf(buf)) } pub fn as_arr(&self) -> [u8; 16] { @@ -99,11 +99,11 @@ macro_rules! zid_newtype { } pub fn generate() -> Self { - $t(ZId::generate()) + $t(Id::generate()) } pub const fn from_array(b: [u8; 16]) -> Self { - $t(ZId(b)) + $t(Id(b)) } } @@ -111,14 +111,14 @@ macro_rules! zid_newtype { type Err = hex::FromHexError; fn from_str(s: &str) -> Result<$t, Self::Err> { - let value = ZId::from_str(s)?; + let value = Id::from_str(s)?; Ok($t(value)) } } impl From<[u8; 16]> for $t { fn from(b: [u8; 16]) -> Self { - $t(ZId::from(b)) + $t(Id::from(b)) } } @@ -126,7 +126,7 @@ macro_rules! zid_newtype { type Error = hex::FromHexError; fn from_hex>(hex: T) -> Result { - Ok($t(ZId::from_hex(hex)?)) + Ok($t(Id::from_hex(hex)?)) } } @@ -150,7 +150,7 @@ macro_rules! zid_newtype { }; } -/// Zenith timeline IDs are different from PostgreSQL timeline +/// Neon timeline IDs are different from PostgreSQL timeline /// IDs. They serve a similar purpose though: they differentiate /// between different "histories" of the same cluster. However, /// PostgreSQL timeline IDs are a bit cumbersome, because they are only @@ -158,7 +158,7 @@ macro_rules! zid_newtype { /// timeline history. 
Those limitations mean that we cannot generate a /// new PostgreSQL timeline ID by just generating a random number. And /// that in turn is problematic for the "pull/push" workflow, where you -/// have a local copy of a zenith repository, and you periodically sync +/// have a local copy of a Neon repository, and you periodically sync /// the local changes with a remote server. When you work "detached" /// from the remote server, you cannot create a PostgreSQL timeline ID /// that's guaranteed to be different from all existing timelines in @@ -168,55 +168,55 @@ macro_rules! zid_newtype { /// branches? If they pick the same one, and later try to push the /// branches to the same remote server, they will get mixed up. /// -/// To avoid those issues, Zenith has its own concept of timelines that +/// To avoid those issues, Neon has its own concept of timelines that /// is separate from PostgreSQL timelines, and doesn't have those -/// limitations. A zenith timeline is identified by a 128-bit ID, which +/// limitations. A Neon timeline is identified by a 128-bit ID, which /// is usually printed out as a hex string. /// /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. -/// See [`ZId`] for alternative ways to serialize it. +/// See [`Id`] for alternative ways to serialize it. #[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] -pub struct ZTimelineId(ZId); +pub struct TimelineId(Id); -zid_newtype!(ZTimelineId); +id_newtype!(TimelineId); -/// Zenith Tenant Id represents identifiar of a particular tenant. +/// Neon Tenant Id represents identifiar of a particular tenant. /// Is used for distinguishing requests and data belonging to different users. /// /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. -/// See [`ZId`] for alternative ways to serialize it. +/// See [`Id`] for alternative ways to serialize it. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] -pub struct ZTenantId(ZId); +pub struct TenantId(Id); -zid_newtype!(ZTenantId); +id_newtype!(TenantId); -// A pair uniquely identifying Zenith instance. +// A pair uniquely identifying Neon instance. 
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] -pub struct ZTenantTimelineId { - pub tenant_id: ZTenantId, - pub timeline_id: ZTimelineId, +pub struct TenantTimelineId { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, } -impl ZTenantTimelineId { - pub fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Self { - ZTenantTimelineId { +impl TenantTimelineId { + pub fn new(tenant_id: TenantId, timeline_id: TimelineId) -> Self { + TenantTimelineId { tenant_id, timeline_id, } } pub fn generate() -> Self { - Self::new(ZTenantId::generate(), ZTimelineId::generate()) + Self::new(TenantId::generate(), TimelineId::generate()) } pub fn empty() -> Self { - Self::new(ZTenantId::from([0u8; 16]), ZTimelineId::from([0u8; 16])) + Self::new(TenantId::from([0u8; 16]), TimelineId::from([0u8; 16])) } } -impl fmt::Display for ZTenantTimelineId { +impl fmt::Display for TenantTimelineId { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}/{}", self.tenant_id, self.timeline_id) } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index caa7ac6c09..2c80556446 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -29,7 +29,7 @@ pub mod crashsafe_dir; pub mod auth; // utility functions and helper traits for unified unique id generation/serialization etc. -pub mod zid; +pub mod id; // http endpoint utils pub mod http; diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 604eb75aaf..0498e0887b 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -63,7 +63,7 @@ pub enum AuthType { Trust, MD5, // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT - ZenithJWT, + NeonJWT, } impl FromStr for AuthType { @@ -73,8 +73,8 @@ impl FromStr for AuthType { match s { "Trust" => Ok(Self::Trust), "MD5" => Ok(Self::MD5), - "ZenithJWT" => Ok(Self::ZenithJWT), - _ => bail!("invalid value \"{}\" for auth type", s), + "NeonJWT" => Ok(Self::NeonJWT), + _ => bail!("invalid value \"{s}\" for auth type"), } } } @@ -84,7 +84,7 @@ impl fmt::Display for AuthType { f.write_str(match self { AuthType::Trust => "Trust", AuthType::MD5 => "MD5", - AuthType::ZenithJWT => "ZenithJWT", + AuthType::NeonJWT => "NeonJWT", }) } } @@ -376,7 +376,7 @@ impl PostgresBackend { ))?; self.state = ProtoState::Authentication; } - AuthType::ZenithJWT => { + AuthType::NeonJWT => { self.write_message(&BeMessage::AuthenticationCleartextPassword)?; self.state = ProtoState::Authentication; } @@ -403,7 +403,7 @@ impl PostgresBackend { bail!("auth failed: {}", e); } } - AuthType::ZenithJWT => { + AuthType::NeonJWT => { let (_, jwt_response) = m.split_last().context("protocol violation")?; if let Err(e) = handler.check_auth_jwt(self, jwt_response) { diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs index 383ad3742f..87e4478a99 100644 --- a/libs/utils/src/postgres_backend_async.rs +++ b/libs/utils/src/postgres_backend_async.rs @@ -346,7 +346,7 @@ impl PostgresBackend { ))?; self.state = ProtoState::Authentication; } - AuthType::ZenithJWT => { + AuthType::NeonJWT => { self.write_message(&BeMessage::AuthenticationCleartextPassword)?; self.state = ProtoState::Authentication; } @@ -374,7 +374,7 @@ impl PostgresBackend { bail!("auth failed: {}", e); } } - AuthType::ZenithJWT => { + AuthType::NeonJWT => { let (_, jwt_response) = m.split_last().context("protocol violation")?; if let Err(e) = handler.check_auth_jwt(self, 
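Editor's note: the hunks above rename the 128-bit `ZId` container to `Id` while keeping its hex round trip: `Display` and `FromStr` produce and consume the 32-character form such as `ad50847381e248feaac9876cc71ae418`, while plain `Serialize` still emits the 16-element byte array. A minimal standalone sketch of that pattern, with a hypothetical `DemoId` standing in for the crate's `Id` (std only, no `hex` or `serde` dependency):

    use std::fmt;
    use std::str::FromStr;

    /// Hypothetical stand-in for the 16-byte id newtype in the patch.
    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    struct DemoId([u8; 16]);

    impl fmt::Display for DemoId {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            // Hex-encode every byte, lowercase, no separators.
            for b in self.0 {
                write!(f, "{:02x}", b)?;
            }
            Ok(())
        }
    }

    impl FromStr for DemoId {
        type Err = String;
        fn from_str(s: &str) -> Result<Self, Self::Err> {
            if s.len() != 32 {
                return Err(format!("expected 32 hex chars, got {}", s.len()));
            }
            let mut out = [0u8; 16];
            for (i, pair) in s.as_bytes().chunks(2).enumerate() {
                let pair = std::str::from_utf8(pair).map_err(|e| e.to_string())?;
                out[i] = u8::from_str_radix(pair, 16).map_err(|e| e.to_string())?;
            }
            Ok(DemoId(out))
        }
    }

    fn main() {
        // The same bytes as the doc comment's example array.
        let id = DemoId([0xad, 0x50, 0x84, 0x73, 0x81, 0xe2, 0x48, 0xfe,
                         0xaa, 0xc9, 0x87, 0x6c, 0xc7, 0x1a, 0xe4, 0x18]);
        let text = id.to_string();
        assert_eq!(text, "ad50847381e248feaac9876cc71ae418");
        assert_eq!(text.parse::<DemoId>().unwrap(), id);
        println!("round-tripped {}", text);
    }

This is why the doc comment keeps pointing readers at `#[serde_as(as = "DisplayFromStr")]` when they want the string form in serialized output.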
jwt_response) { diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index e73c73bd9c..11d2d94906 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -27,10 +27,10 @@ clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } tokio-util = { version = "0.7.3", features = ["io", "io-util"] } -postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } anyhow = { version = "1.0", features = ["backtrace"] } crc32c = "0.6.0" thiserror = "1.0" diff --git a/pageserver/src/bin/dump_layerfile.rs b/pageserver/src/bin/dump_layerfile.rs index 7e766ce859..f5247ee609 100644 --- a/pageserver/src/bin/dump_layerfile.rs +++ b/pageserver/src/bin/dump_layerfile.rs @@ -12,7 +12,7 @@ use utils::project_git_version; project_git_version!(GIT_VERSION); fn main() -> Result<()> { - let arg_matches = App::new("Zenith dump_layerfile utility") + let arg_matches = App::new("Neon dump_layerfile utility") .about("Dump contents of one layer file, for debugging") .version(GIT_VERSION) .arg( diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 679c6f76e7..92d5eab379 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -40,7 +40,7 @@ fn version() -> String { } fn main() -> anyhow::Result<()> { - let arg_matches = App::new("Zenith page server") + let arg_matches = App::new("Neon page server") .about("Materializes WAL stream to pages and serves them to the postgres") .version(&*version()) .arg( @@ -293,7 +293,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // initialize authentication for incoming connections let auth = match &conf.auth_type { AuthType::Trust | AuthType::MD5 => None, - AuthType::ZenithJWT => { + AuthType::NeonJWT => { // unwrap is ok because check is performed when creating config, so path is set and file exists let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); Some(JwtAuth::from_key_path(key_path)?.into()) diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs index 3339564b0f..16359c2532 100644 --- a/pageserver/src/bin/update_metadata.rs +++ b/pageserver/src/bin/update_metadata.rs @@ -11,7 +11,7 @@ use utils::{lsn::Lsn, project_git_version}; project_git_version!(GIT_VERSION); fn main() -> Result<()> { - let arg_matches = App::new("Zenith update metadata utility") + let arg_matches = App::new("Neon update metadata utility") .about("Dump or update metadata file") .version(GIT_VERSION) .arg( diff --git a/pageserver/src/config.rs 
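Editor's note: the `AuthType` rename above changes the accepted string form from `ZenithJWT` to `NeonJWT`. A small sketch of that `FromStr`/`Display` pair with a hypothetical `DemoAuthType` mirroring the renamed variants:

    use std::fmt;
    use std::str::FromStr;

    // Hypothetical stand-in mirroring the renamed AuthType variants.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum DemoAuthType {
        Trust,
        MD5,
        NeonJWT,
    }

    impl FromStr for DemoAuthType {
        type Err = String;
        fn from_str(s: &str) -> Result<Self, Self::Err> {
            match s {
                "Trust" => Ok(Self::Trust),
                "MD5" => Ok(Self::MD5),
                "NeonJWT" => Ok(Self::NeonJWT),
                other => Err(format!("invalid value \"{other}\" for auth type")),
            }
        }
    }

    impl fmt::Display for DemoAuthType {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            f.write_str(match self {
                Self::Trust => "Trust",
                Self::MD5 => "MD5",
                Self::NeonJWT => "NeonJWT",
            })
        }
    }

    fn main() {
        assert_eq!("NeonJWT".parse::<DemoAuthType>().unwrap(), DemoAuthType::NeonJWT);
        // A config that still spells the old name no longer parses.
        assert!("ZenithJWT".parse::<DemoAuthType>().is_err());
        println!("{}", DemoAuthType::NeonJWT);
    }

As the second assert illustrates, configurations that still spell `ZenithJWT` fail to parse after this hunk, so the rename is visible to anything that stores the auth type as a string.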
b/pageserver/src/config.rs index 56171f46e3..75c71b09d2 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -15,8 +15,8 @@ use toml_edit; use toml_edit::{Document, Item}; use url::Url; use utils::{ + id::{NodeId, TenantId, TimelineId}, postgres_backend::AuthType, - zid::{NodeId, ZTenantId, ZTimelineId}, }; use crate::tenant::TIMELINES_SEGMENT_NAME; @@ -342,16 +342,16 @@ impl PageServerConf { self.workdir.join("tenants") } - pub fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf { - self.tenants_path().join(tenantid.to_string()) + pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf { + self.tenants_path().join(tenant_id.to_string()) } - pub fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf { - self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME) + pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf { + self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME) } - pub fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf { - self.timelines_path(tenantid).join(timelineid.to_string()) + pub fn timeline_path(&self, timeline_id: &TimelineId, tenant_id: &TenantId) -> PathBuf { + self.timelines_path(tenant_id).join(timeline_id.to_string()) } // @@ -419,7 +419,7 @@ impl PageServerConf { let mut conf = builder.build().context("invalid config")?; - if conf.auth_type == AuthType::ZenithJWT { + if conf.auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf .auth_validation_public_key_path .get_or_insert_with(|| workdir.join("auth_public_key.pem")); diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 0ccf23776c..c0dc5b9677 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -3,8 +3,8 @@ use std::num::NonZeroU64; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use utils::{ + id::{NodeId, TenantId, TimelineId}, lsn::Lsn, - zid::{NodeId, ZTenantId, ZTimelineId}, }; use crate::tenant::TenantState; @@ -14,10 +14,10 @@ use crate::tenant::TenantState; pub struct TimelineCreateRequest { #[serde(default)] #[serde_as(as = "Option")] - pub new_timeline_id: Option, + pub new_timeline_id: Option, #[serde(default)] #[serde_as(as = "Option")] - pub ancestor_timeline_id: Option, + pub ancestor_timeline_id: Option, #[serde(default)] #[serde_as(as = "Option")] pub ancestor_start_lsn: Option, @@ -28,7 +28,7 @@ pub struct TimelineCreateRequest { pub struct TenantCreateRequest { #[serde(default)] #[serde_as(as = "Option")] - pub new_tenant_id: Option, + pub new_tenant_id: Option, pub checkpoint_distance: Option, pub checkpoint_timeout: Option, pub compaction_target_size: Option, @@ -46,7 +46,7 @@ pub struct TenantCreateRequest { #[serde_as] #[derive(Serialize, Deserialize)] #[serde(transparent)] -pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId); +pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId); #[derive(Serialize)] pub struct StatusResponse { @@ -54,7 +54,7 @@ pub struct StatusResponse { } impl TenantCreateRequest { - pub fn new(new_tenant_id: Option) -> TenantCreateRequest { + pub fn new(new_tenant_id: Option) -> TenantCreateRequest { TenantCreateRequest { new_tenant_id, ..Default::default() @@ -65,7 +65,7 @@ impl TenantCreateRequest { #[serde_as] #[derive(Serialize, Deserialize)] pub struct TenantConfigRequest { - pub tenant_id: ZTenantId, + pub tenant_id: TenantId, #[serde(default)] #[serde_as(as = "Option")] pub checkpoint_distance: Option, @@ -83,7 +83,7 @@ pub 
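Editor's note: the renamed `tenant_path`/`timelines_path`/`timeline_path` helpers above only compose directory names. A sketch of the resulting on-disk layout, assuming the `timelines` value of `TIMELINES_SEGMENT_NAME` and a hypothetical workdir:

    use std::path::{Path, PathBuf};

    // Hypothetical flattened version of the three path helpers above.
    fn timeline_path(workdir: &Path, tenant_id: &str, timeline_id: &str) -> PathBuf {
        workdir
            .join("tenants")          // tenants_path()
            .join(tenant_id)          // tenant_path()
            .join("timelines")        // timelines_path(), TIMELINES_SEGMENT_NAME
            .join(timeline_id)        // timeline_path()
    }

    fn main() {
        let p = timeline_path(
            Path::new("/data/.neon"),
            "11223344556677881122334455667788",
            "aa223344556677881122334455667788",
        );
        assert_eq!(
            p,
            Path::new("/data/.neon/tenants/11223344556677881122334455667788/timelines/aa223344556677881122334455667788")
        );
        println!("{}", p.display());
    }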
struct TenantConfigRequest { } impl TenantConfigRequest { - pub fn new(tenant_id: ZTenantId) -> TenantConfigRequest { + pub fn new(tenant_id: TenantId) -> TenantConfigRequest { TenantConfigRequest { tenant_id, checkpoint_distance: None, @@ -106,7 +106,7 @@ impl TenantConfigRequest { #[derive(Serialize, Deserialize, Clone)] pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] - pub id: ZTenantId, + pub id: TenantId, pub state: TenantState, pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub has_in_progress_downloads: Option, @@ -116,7 +116,7 @@ pub struct TenantInfo { #[derive(Debug, Serialize, Deserialize, Clone)] pub struct LocalTimelineInfo { #[serde_as(as = "Option")] - pub ancestor_timeline_id: Option, + pub ancestor_timeline_id: Option, #[serde_as(as = "Option")] pub ancestor_lsn: Option, #[serde_as(as = "DisplayFromStr")] @@ -154,9 +154,9 @@ pub struct RemoteTimelineInfo { #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { #[serde_as(as = "DisplayFromStr")] - pub tenant_id: ZTenantId, + pub tenant_id: TenantId, #[serde_as(as = "DisplayFromStr")] - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, pub local: Option, pub remote: Option, } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 36ba2e9b66..2e49429f38 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -25,8 +25,8 @@ use utils::{ request::parse_request_param, RequestExt, RouterBuilder, }, + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; struct State { @@ -128,10 +128,10 @@ fn local_timeline_info_from_timeline( } fn list_local_timelines( - tenant_id: ZTenantId, + tenant_id: TenantId, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, -) -> Result> { +) -> Result> { let tenant = tenant_mgr::get_tenant(tenant_id, true)?; let timelines = tenant.list_timelines(); @@ -156,7 +156,7 @@ async fn status_handler(request: Request) -> Result, ApiErr } async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let request_data: TimelineCreateRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_id))?; @@ -164,8 +164,8 @@ async fn timeline_create_handler(mut request: Request) -> Result { @@ -193,7 +193,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); let include_non_incremental_physical_size = @@ -229,7 +229,7 @@ async fn timeline_list_handler(request: Request) -> Result, .remote_index .read() .await - .timeline_entry(&ZTenantTimelineId { + .timeline_entry(&TenantTimelineId { tenant_id, timeline_id, }) @@ -257,8 +257,8 @@ fn query_param_present(request: &Request, param: &str) -> bool { } async fn timeline_detail_handler(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: 
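Editor's note: the request/response structs above wrap ids in `#[serde_as(as = "Option<DisplayFromStr>")]` so JSON carries them as strings instead of 16-element byte arrays. A small sketch of that serde_with pattern, assuming `serde` (with `derive`), `serde_with` and `serde_json` as dependencies and using `u128` as a stand-in for `TimelineId`:

    use serde::Deserialize;
    use serde_with::{serde_as, DisplayFromStr};

    #[serde_as]
    #[derive(Deserialize, Debug)]
    struct TimelineCreateRequestDemo {
        #[serde(default)]
        #[serde_as(as = "Option<DisplayFromStr>")]
        new_timeline_id: Option<u128>, // stand-in for TimelineId
    }

    fn main() {
        // The id arrives as a string and is parsed via FromStr.
        let with_id: TimelineCreateRequestDemo =
            serde_json::from_str(r#"{ "new_timeline_id": "42" }"#).unwrap();
        assert_eq!(with_id.new_timeline_id, Some(42));

        // `#[serde(default)]` lets the field be omitted entirely.
        let without_id: TimelineCreateRequestDemo = serde_json::from_str("{}").unwrap();
        assert!(without_id.new_timeline_id.is_none());
        println!("{:?} {:?}", with_id, without_id);
    }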
TimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); let include_non_incremental_physical_size = @@ -289,7 +289,7 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; info!("Handling tenant attach {tenant_id}"); @@ -402,8 +402,8 @@ async fn tenant_attach_handler(request: Request) -> Result, /// for details see comment to `storage_sync::gather_tenant_timelines_index_parts` async fn gather_tenant_timelines_index_parts( state: &State, - tenant_id: ZTenantId, -) -> anyhow::Result>> { + tenant_id: TenantId, +) -> anyhow::Result>> { let index_parts = match state.remote_storage.as_ref() { Some(storage) => { storage_sync::gather_tenant_timelines_index_parts(state.conf, storage, tenant_id).await @@ -425,8 +425,8 @@ async fn gather_tenant_timelines_index_parts( } async fn timeline_delete_handler(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; let state = get_state(&request); @@ -436,7 +436,7 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let state = get_state(&request); @@ -479,7 +479,7 @@ async fn tenant_list_handler(request: Request) -> Result, A } async fn tenant_status(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; // if tenant is in progress of downloading it can be absent in global tenant map @@ -588,8 +588,8 @@ async fn tenant_create_handler(mut request: Request) -> Result(HashMap>); +pub struct TenantTimelineValues(HashMap>); impl TenantTimelineValues { fn new() -> Self { @@ -187,8 +187,8 @@ mod tests { #[test] fn tenant_timeline_value_mapping() { - let first_tenant = ZTenantId::generate(); - let second_tenant = ZTenantId::generate(); + let first_tenant = TenantId::generate(); + let second_tenant = TenantId::generate(); assert_ne!(first_tenant, second_tenant); let mut initial = TenantTimelineValues::new(); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ada0bbd359..2f03943429 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -5,7 +5,7 @@ use metrics::{ IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::id::{TenantId, TimelineId}; /// Prometheus histogram buckets (in seconds) that capture the majority of /// latencies in the microsecond range but also extend far enough up to distinguish @@ -327,7 +327,7 @@ pub struct TimelineMetrics { } impl TimelineMetrics { - pub fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) 
-> Self { + pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { let tenant_id = tenant_id.to_string(); let timeline_id = timeline_id.to_string(); let reconstruct_time_histo = RECONSTRUCT_TIME @@ -414,6 +414,6 @@ impl Drop for TimelineMetrics { } } -pub fn remove_tenant_metrics(tenant_id: &ZTenantId) { +pub fn remove_tenant_metrics(tenant_id: &TenantId) { let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]); } diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 15c3c22dd6..d2fe06697e 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -49,8 +49,8 @@ use anyhow::Context; use once_cell::sync::OnceCell; use tracing::error; use utils::{ + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; use crate::repository::Key; @@ -109,8 +109,8 @@ enum CacheKey { #[derive(Debug, PartialEq, Eq, Hash, Clone)] struct MaterializedPageHashKey { - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, key: Key, } @@ -308,8 +308,8 @@ impl PageCache { /// returned page. pub fn lookup_materialized_page( &self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, key: &Key, lsn: Lsn, ) -> Option<(Lsn, PageReadGuard)> { @@ -338,8 +338,8 @@ impl PageCache { /// pub fn memorize_materialized_page( &self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, key: Key, lsn: Lsn, img: &[u8], diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 388f40f916..b06814c557 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -23,12 +23,12 @@ use tokio_util::io::SyncIoBridge; use tracing::*; use utils::{ auth::{self, Claims, JwtAuth, Scope}, + id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, postgres_backend_async::{self, PostgresBackend}, pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}, simple_rcu::RcuReadGuard, - zid::{ZTenantId, ZTimelineId}, }; use crate::basebackup; @@ -123,7 +123,7 @@ impl PagestreamFeMessage { fn parse(mut body: Bytes) -> anyhow::Result { // TODO these gets can fail - // these correspond to the ZenithMessageTag enum in pagestore_client.h + // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. 
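Editor's note: `TimelineMetrics::new` and `remove_tenant_metrics` above only swap the label values from the old `Z*` id types to the new ones; the Prometheus labels stay `tenant_id`/`timeline_id` strings. A sketch of that labelled-metric lifecycle, written against the upstream `prometheus` crate as an assumption (the pageserver goes through its own `metrics` wrapper):

    use prometheus::{IntCounterVec, Opts, Registry};

    fn main() -> Result<(), prometheus::Error> {
        let registry = Registry::new();
        let counter = IntCounterVec::new(
            Opts::new("demo_requests_total", "requests per tenant/timeline"),
            &["tenant_id", "timeline_id"],
        )?;
        registry.register(Box::new(counter.clone()))?;

        // Label values are the hex string forms of the ids.
        let tenant_id = "11223344556677881122334455667788";
        let timeline_id = "aa223344556677881122334455667788";
        counter.with_label_values(&[tenant_id, timeline_id]).inc();

        // Dropping a timeline removes its series with the same label strings,
        // which is what the Drop impl and remove_tenant_metrics do above.
        counter.remove_label_values(&[tenant_id, timeline_id])?;
        Ok(())
    }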
@@ -370,7 +370,7 @@ struct PageRequestMetrics { } impl PageRequestMetrics { - fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self { + fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { let tenant_id = tenant_id.to_string(); let timeline_id = timeline_id.to_string(); @@ -415,8 +415,8 @@ impl PageServerHandler { async fn handle_pagerequests( &self, pgb: &mut PostgresBackend, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, ) -> anyhow::Result<()> { // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association @@ -452,11 +452,11 @@ impl PageServerHandler { None => break, // client disconnected }; - trace!("query: {:?}", copy_data_bytes); + trace!("query: {copy_data_bytes:?}"); - let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; + let neon_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; - let response = match zenith_fe_msg { + let response = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { let _timer = metrics.get_rel_exists.start_timer(); self.handle_get_rel_exists_request(&timeline, &req).await @@ -494,8 +494,8 @@ impl PageServerHandler { async fn handle_import_basebackup( &self, pgb: &mut PostgresBackend, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, base_lsn: Lsn, _end_lsn: Lsn, ) -> anyhow::Result<()> { @@ -557,8 +557,8 @@ impl PageServerHandler { async fn handle_import_wal( &self, pgb: &mut PostgresBackend, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, start_lsn: Lsn, end_lsn: Lsn, ) -> anyhow::Result<()> { @@ -750,8 +750,8 @@ impl PageServerHandler { async fn handle_basebackup_request( &self, pgb: &mut PostgresBackend, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, lsn: Option, prev_lsn: Option, full_backup: bool, @@ -792,7 +792,7 @@ impl PageServerHandler { // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenant_id: Option) -> Result<()> { + fn check_permission(&self, tenant_id: Option) -> Result<()> { if self.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); @@ -815,7 +815,7 @@ impl postgres_backend_async::Handler for PageServerHandler { _pgb: &mut PostgresBackend, jwt_response: &[u8], ) -> anyhow::Result<()> { - // this unwrap is never triggered, because check_auth_jwt only called when auth_type is ZenithJWT + // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT // which requires auth to be present let data = self .auth @@ -853,8 +853,8 @@ impl postgres_backend_async::Handler for PageServerHandler { params.len() == 2, "invalid param number for pagestream command" ); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; self.check_permission(Some(tenant_id))?; @@ -869,8 +869,8 @@ impl postgres_backend_async::Handler for PageServerHandler { "invalid param number for basebackup command" ); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; 
self.check_permission(Some(tenant_id))?; @@ -895,8 +895,8 @@ impl postgres_backend_async::Handler for PageServerHandler { "invalid param number for get_last_record_rlsn command" ); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; self.check_permission(Some(tenant_id))?; let timeline = get_local_timeline(tenant_id, timeline_id)?; @@ -923,8 +923,8 @@ impl postgres_backend_async::Handler for PageServerHandler { "invalid param number for fullbackup command" ); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; // The caller is responsible for providing correct lsn and prev_lsn. let lsn = if params.len() > 2 { @@ -959,8 +959,8 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!(params.len() == 4); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; let base_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; @@ -984,8 +984,8 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("import wal ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!(params.len() == 4); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; let start_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; @@ -1035,7 +1035,7 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("show ".len()); let params = params_raw.split(' ').collect::>(); ensure!(params.len() == 1, "invalid param number for config command"); - let tenant_id = ZTenantId::from_str(params[0])?; + let tenant_id = TenantId::from_str(params[0])?; let tenant = tenant_mgr::get_tenant(tenant_id, true)?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), @@ -1087,8 +1087,8 @@ impl postgres_backend_async::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid do_gc: '{}'", query_string))?; - let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; let tenant = tenant_mgr::get_tenant(tenant_id, true)?; @@ -1131,8 +1131,8 @@ impl postgres_backend_async::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("Invalid compact: '{}'", query_string))?; - let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; let timeline = get_local_timeline(tenant_id, timeline_id)?; 
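Editor's note: the page-service handlers above keep parsing `tenant_id` and `timeline_id` out of whitespace-separated libpq query strings such as `basebackup <tenant_id> <timeline_id>`; only the target types changed in this patch. A hedged sketch of that parsing step, with a hypothetical `parse_two_ids` helper and `u128` standing in for the id types:

    // Hypothetical helper mirroring how the handlers pull two ids out of a query string.
    fn parse_two_ids(query: &str, cmd: &str) -> Result<(u128, u128), String> {
        let rest = query
            .strip_prefix(cmd)
            .ok_or_else(|| format!("query does not start with {cmd:?}"))?;
        let params: Vec<&str> = rest.split_whitespace().collect();
        if params.len() != 2 {
            return Err("invalid param number for basebackup command".into());
        }
        let tenant = u128::from_str_radix(params[0], 16).map_err(|e| e.to_string())?;
        let timeline = u128::from_str_radix(params[1], 16).map_err(|e| e.to_string())?;
        Ok((tenant, timeline))
    }

    fn main() {
        let (tenant, timeline) = parse_two_ids(
            "basebackup 11223344556677881122334455667788 aa223344556677881122334455667788",
            "basebackup ",
        )
        .unwrap();
        println!("{tenant:x} {timeline:x}");
    }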
timeline.compact()?; @@ -1148,8 +1148,8 @@ impl postgres_backend_async::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid checkpoint command: '{}'", query_string))?; - let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; let timeline = get_local_timeline(tenant_id, timeline_id)?; // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). @@ -1166,8 +1166,8 @@ impl postgres_backend_async::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?; - let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; let timeline = get_local_timeline(tenant_id, timeline_id)?; let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; @@ -1192,7 +1192,7 @@ impl postgres_backend_async::Handler for PageServerHandler { } } -fn get_local_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Result> { +fn get_local_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> Result> { tenant_mgr::get_tenant(tenant_id, true).and_then(|tenant| tenant.get_timeline(timeline_id)) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 2454b6f54f..9d4b438dc4 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -10,7 +10,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::reltag::{RelTag, SlruKind}; use crate::repository::*; use crate::tenant::Timeline; -use crate::walrecord::ZenithWalRecord; +use crate::walrecord::NeonWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; use postgres_ffi::v14::pg_constants; @@ -570,7 +570,7 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, blknum: BlockNumber, - rec: ZenithWalRecord, + rec: NeonWalRecord, ) -> Result<()> { ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); @@ -583,7 +583,7 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, blknum: BlockNumber, - rec: ZenithWalRecord, + rec: NeonWalRecord, ) -> Result<()> { self.put( slru_block_to_key(kind, segno, blknum), @@ -1401,7 +1401,7 @@ fn is_slru_block_key(key: Key) -> bool { #[cfg(test)] pub fn create_test_timeline( tenant: &crate::tenant::Tenant, - timeline_id: utils::zid::ZTimelineId, + timeline_id: utils::id::TimelineId, ) -> Result> { let tline = tenant.create_empty_timeline(timeline_id, Lsn(8))?; let mut m = tline.begin_modification(Lsn(8)); diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index c3b08c93de..f6ea9d8c5d 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,4 +1,4 @@ -use crate::walrecord::ZenithWalRecord; +use crate::walrecord::NeonWalRecord; use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; use bytes::Bytes; @@ -157,7 +157,7 @@ pub enum Value { /// replayed get the full value. Replaying the WAL record /// might need a previous version of the value (if will_init() /// returns false), or it may be replayed stand-alone (true). 
- WalRecord(ZenithWalRecord), + WalRecord(NeonWalRecord), } impl Value { diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index c104dba298..9d259bf1e2 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -68,7 +68,7 @@ //! Pageserver maintains similar to the local file structure remotely: all layer files are uploaded with the same names under the same directory structure. //! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexPart`], containing the list of remote files. //! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download. -//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`], +//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`TenantId`] and [`TimelineId`], //! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrieve its shard contents, if needed, same as any layer files. //! //! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed. @@ -183,7 +183,7 @@ use crate::{ TenantTimelineValues, }; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; use self::download::download_index_parts; pub use self::download::gather_tenant_timelines_index_parts; @@ -227,7 +227,7 @@ pub struct SyncStartupData { struct SyncQueue { max_timelines_per_batch: NonZeroUsize, - queue: Mutex>, + queue: Mutex>, condvar: Condvar, } @@ -241,7 +241,7 @@ impl SyncQueue { } /// Queue a new task - fn push(&self, sync_id: ZTenantTimelineId, new_task: SyncTask) { + fn push(&self, sync_id: TenantTimelineId, new_task: SyncTask) { let mut q = self.queue.lock().unwrap(); q.push_back((sync_id, new_task)); @@ -254,7 +254,7 @@ impl SyncQueue { /// A timeline has to care to not to delete certain layers from the remote storage before the corresponding uploads happen. /// Other than that, due to "immutable" nature of the layers, the order of their deletion/uploading/downloading does not matter. /// Hence, we merge the layers together into single task per timeline and run those concurrently (with the deletion happening only after successful uploading). - fn next_task_batch(&self) -> (HashMap, usize) { + fn next_task_batch(&self) -> (HashMap, usize) { // Wait for the first task in blocking fashion let mut q = self.queue.lock().unwrap(); while q.is_empty() { @@ -488,8 +488,8 @@ struct LayersDeletion { /// /// Ensure that the loop is started otherwise the task is never processed. pub fn schedule_layer_upload( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, layers_to_upload: HashSet, metadata: Option, ) { @@ -501,7 +501,7 @@ pub fn schedule_layer_upload( } }; sync_queue.push( - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, }, @@ -519,8 +519,8 @@ pub fn schedule_layer_upload( /// /// Ensure that the loop is started otherwise the task is never processed. 
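Editor's note: `SyncQueue` above keys every task by `TenantTimelineId` and drains the queue into per-timeline batches under a `Mutex`/`Condvar`. A simplified sketch of that batching idea, with string keys, no `max_timelines_per_batch` cap, and a hypothetical `DemoQueue`:

    use std::collections::{HashMap, VecDeque};
    use std::sync::{Condvar, Mutex};

    struct DemoQueue {
        queue: Mutex<VecDeque<(String, String)>>, // (sync_id, task payload)
        condvar: Condvar,
    }

    impl DemoQueue {
        fn new() -> Self {
            Self { queue: Mutex::new(VecDeque::new()), condvar: Condvar::new() }
        }

        fn push(&self, sync_id: &str, task: &str) {
            let mut q = self.queue.lock().unwrap();
            q.push_back((sync_id.to_string(), task.to_string()));
            self.condvar.notify_one();
        }

        fn next_batch(&self) -> HashMap<String, Vec<String>> {
            // Block until at least one task is queued.
            let mut q = self.queue.lock().unwrap();
            while q.is_empty() {
                q = self.condvar.wait(q).unwrap();
            }
            // Group everything queued so far by its (tenant, timeline) key.
            let mut batch: HashMap<String, Vec<String>> = HashMap::new();
            while let Some((sync_id, task)) = q.pop_front() {
                batch.entry(sync_id).or_default().push(task);
            }
            batch
        }
    }

    fn main() {
        let queue = DemoQueue::new();
        queue.push("tenant-a/timeline-1", "upload");
        queue.push("tenant-a/timeline-1", "delete");
        queue.push("tenant-b/timeline-2", "download");
        let batch = queue.next_batch();
        assert_eq!(batch["tenant-a/timeline-1"].len(), 2);
        assert_eq!(batch.len(), 2);
    }

Merging the upload and delete tasks for the same timeline into one batch is what lets the loop run deletions only after the corresponding uploads succeed.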
pub fn schedule_layer_delete( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, layers_to_delete: HashSet, ) { let sync_queue = match SYNC_QUEUE.get() { @@ -531,7 +531,7 @@ pub fn schedule_layer_delete( } }; sync_queue.push( - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, }, @@ -551,7 +551,7 @@ pub fn schedule_layer_delete( /// On any failure, the task gets retried, omitting already downloaded layers. /// /// Ensure that the loop is started otherwise the task is never processed. -pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { +pub fn schedule_layer_download(tenant_id: TenantId, timeline_id: TimelineId) { debug!("Scheduling layer download for tenant {tenant_id}, timeline {timeline_id}"); let sync_queue = match SYNC_QUEUE.get() { Some(queue) => queue, @@ -561,7 +561,7 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { } }; sync_queue.push( - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, }, @@ -604,7 +604,7 @@ pub fn spawn_storage_sync_task( let _ = empty_tenants.0.entry(tenant_id).or_default(); } else { for (timeline_id, timeline_data) in timeline_data { - let id = ZTenantTimelineId::new(tenant_id, timeline_id); + let id = TenantTimelineId::new(tenant_id, timeline_id); keys_for_index_part_downloads.insert(id); timelines_to_sync.insert(id, timeline_data); } @@ -766,9 +766,9 @@ async fn process_batches( max_sync_errors: NonZeroU32, storage: GenericRemoteStorage, index: &RemoteIndex, - batched_tasks: HashMap, + batched_tasks: HashMap, sync_queue: &SyncQueue, -) -> HashSet { +) -> HashSet { let mut sync_results = batched_tasks .into_iter() .map(|(sync_id, batch)| { @@ -808,7 +808,7 @@ async fn process_sync_task_batch( conf: &'static PageServerConf, (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, batch: SyncTaskBatch, ) -> DownloadStatus { let sync_start = Instant::now(); @@ -949,7 +949,7 @@ async fn download_timeline_data( conf: &'static PageServerConf, (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), current_remote_timeline: Option<&RemoteTimeline>, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, new_download_data: SyncData, sync_start: Instant, task_name: &str, @@ -999,7 +999,7 @@ async fn download_timeline_data( async fn update_local_metadata( conf: &'static PageServerConf, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, remote_timeline: Option<&RemoteTimeline>, ) -> anyhow::Result<()> { let remote_metadata = match remote_timeline { @@ -1031,7 +1031,7 @@ async fn update_local_metadata( info!("Updating local timeline metadata from remote timeline: local disk_consistent_lsn={local_lsn:?}, remote disk_consistent_lsn={remote_lsn}"); // clone because spawn_blocking requires static lifetime let cloned_metadata = remote_metadata.to_owned(); - let ZTenantTimelineId { + let TenantTimelineId { tenant_id, timeline_id, } = sync_id; @@ -1061,7 +1061,7 @@ async fn update_local_metadata( async fn delete_timeline_data( conf: &'static PageServerConf, (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, mut new_delete_data: SyncData, sync_start: Instant, task_name: &str, @@ -1104,7 +1104,7 @@ async fn upload_timeline_data( conf: &'static PageServerConf, (storage, index, sync_queue): 
(&GenericRemoteStorage, &RemoteIndex, &SyncQueue), current_remote_timeline: Option<&RemoteTimeline>, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, new_upload_data: SyncData, sync_start: Instant, task_name: &str, @@ -1163,7 +1163,7 @@ async fn update_remote_data( conf: &'static PageServerConf, storage: &GenericRemoteStorage, index: &RemoteIndex, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, update: RemoteDataUpdate<'_>, ) -> anyhow::Result<()> { let updated_remote_timeline = { @@ -1261,7 +1261,7 @@ async fn validate_task_retries( fn schedule_first_sync_tasks( index: &mut RemoteTimelineIndex, sync_queue: &SyncQueue, - local_timeline_files: HashMap)>, + local_timeline_files: HashMap)>, ) -> TenantTimelineValues { let mut local_timeline_init_statuses = TenantTimelineValues::new(); @@ -1331,8 +1331,8 @@ fn schedule_first_sync_tasks( /// bool in return value stands for awaits_download fn compare_local_and_remote_timeline( - new_sync_tasks: &mut VecDeque<(ZTenantTimelineId, SyncTask)>, - sync_id: ZTenantTimelineId, + new_sync_tasks: &mut VecDeque<(TenantTimelineId, SyncTask)>, + sync_id: TenantTimelineId, local_metadata: TimelineMetadata, local_files: HashSet, remote_entry: &RemoteTimeline, @@ -1377,7 +1377,7 @@ fn compare_local_and_remote_timeline( } fn register_sync_status( - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, sync_start: Instant, sync_name: &str, sync_status: Option, @@ -1409,7 +1409,7 @@ mod test_utils { pub(super) async fn create_local_timeline( harness: &TenantHarness<'_>, - timeline_id: ZTimelineId, + timeline_id: TimelineId, filenames: &[&str], metadata: TimelineMetadata, ) -> anyhow::Result { @@ -1454,8 +1454,8 @@ mod tests { use super::*; - const TEST_SYNC_ID: ZTenantTimelineId = ZTenantTimelineId { - tenant_id: ZTenantId::from_array(hex!("11223344556677881122334455667788")), + const TEST_SYNC_ID: TenantTimelineId = TenantTimelineId { + tenant_id: TenantId::from_array(hex!("11223344556677881122334455667788")), timeline_id: TIMELINE_ID, }; @@ -1464,12 +1464,12 @@ mod tests { let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); assert_eq!(sync_queue.len(), 0); - let sync_id_2 = ZTenantTimelineId { - tenant_id: ZTenantId::from_array(hex!("22223344556677881122334455667788")), + let sync_id_2 = TenantTimelineId { + tenant_id: TenantId::from_array(hex!("22223344556677881122334455667788")), timeline_id: TIMELINE_ID, }; - let sync_id_3 = ZTenantTimelineId { - tenant_id: ZTenantId::from_array(hex!("33223344556677881122334455667788")), + let sync_id_3 = TenantTimelineId { + tenant_id: TenantId::from_array(hex!("33223344556677881122334455667788")), timeline_id: TIMELINE_ID, }; assert!(sync_id_2 != TEST_SYNC_ID); @@ -1591,8 +1591,8 @@ mod tests { layers_to_skip: HashSet::from([PathBuf::from("sk4")]), }; - let sync_id_2 = ZTenantTimelineId { - tenant_id: ZTenantId::from_array(hex!("22223344556677881122334455667788")), + let sync_id_2 = TenantTimelineId { + tenant_id: TenantId::from_array(hex!("22223344556677881122334455667788")), timeline_id: TIMELINE_ID, }; assert!(sync_id_2 != TEST_SYNC_ID); diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 945f5fded8..21a3372e70 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -8,7 +8,7 @@ use tracing::{debug, error, info}; use crate::storage_sync::{SyncQueue, SyncTask}; use remote_storage::GenericRemoteStorage; -use utils::zid::ZTenantTimelineId; +use utils::id::TenantTimelineId; use super::{LayersDeletion, 
SyncData}; @@ -17,7 +17,7 @@ use super::{LayersDeletion, SyncData}; pub(super) async fn delete_timeline_layers( storage: &GenericRemoteStorage, sync_queue: &SyncQueue, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, mut delete_data: SyncData, ) -> bool { if !delete_data.data.deletion_registered { @@ -123,7 +123,7 @@ mod tests { async fn delete_timeline_negative() -> anyhow::Result<()> { let harness = TenantHarness::create("delete_timeline_negative")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_path_buf(), harness.conf.workdir.clone(), @@ -157,7 +157,7 @@ mod tests { let harness = TenantHarness::create("delete_timeline")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "c", "d"]; let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_path_buf(), diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 32f228b447..80d5ca5994 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -20,7 +20,7 @@ use crate::{ config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path, TEMP_FILE_SUFFIX, }; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; use super::{ index::{IndexPart, RemoteTimeline}, @@ -33,14 +33,14 @@ use super::{ // When data is received succesfully without errors Present variant is used. pub enum TenantIndexParts { Poisoned { - present: HashMap, - missing: HashSet, + present: HashMap, + missing: HashSet, }, - Present(HashMap), + Present(HashMap), } impl TenantIndexParts { - fn add_poisoned(&mut self, timeline_id: ZTimelineId) { + fn add_poisoned(&mut self, timeline_id: TimelineId) { match self { TenantIndexParts::Poisoned { missing, .. 
} => { missing.insert(timeline_id); @@ -64,9 +64,9 @@ impl Default for TenantIndexParts { pub async fn download_index_parts( conf: &'static PageServerConf, storage: &GenericRemoteStorage, - keys: HashSet, -) -> HashMap { - let mut index_parts: HashMap = HashMap::new(); + keys: HashSet, +) -> HashMap { + let mut index_parts: HashMap = HashMap::new(); let mut part_downloads = keys .into_iter() @@ -112,8 +112,8 @@ pub async fn download_index_parts( pub async fn gather_tenant_timelines_index_parts( conf: &'static PageServerConf, storage: &GenericRemoteStorage, - tenant_id: ZTenantId, -) -> anyhow::Result> { + tenant_id: TenantId, +) -> anyhow::Result> { let tenant_path = conf.timelines_path(&tenant_id); let timeline_sync_ids = get_timeline_sync_ids(storage, &tenant_path, tenant_id) .await @@ -135,7 +135,7 @@ pub async fn gather_tenant_timelines_index_parts( async fn download_index_part( conf: &'static PageServerConf, storage: &GenericRemoteStorage, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, ) -> Result { let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); @@ -197,7 +197,7 @@ pub(super) async fn download_timeline_layers<'a>( storage: &'a GenericRemoteStorage, sync_queue: &'a SyncQueue, remote_timeline: Option<&'a RemoteTimeline>, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, mut download_data: SyncData, ) -> DownloadedTimeline { let remote_timeline = match remote_timeline { @@ -335,7 +335,7 @@ pub(super) async fn download_timeline_layers<'a>( } // fsync timeline directory which is a parent directory for downloaded files - let ZTenantTimelineId { + let TenantTimelineId { tenant_id, timeline_id, } = &sync_id; @@ -366,8 +366,8 @@ pub(super) async fn download_timeline_layers<'a>( async fn get_timeline_sync_ids( storage: &GenericRemoteStorage, tenant_path: &Path, - tenant_id: ZTenantId, -) -> anyhow::Result> { + tenant_id: TenantId, +) -> anyhow::Result> { let tenant_storage_path = storage.remote_object_id(tenant_path).with_context(|| { format!( "Failed to get tenant storage path for local path '{}'", @@ -395,11 +395,11 @@ async fn get_timeline_sync_ids( anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") })?; - let timeline_id: ZTimelineId = object_name.parse().with_context(|| { + let timeline_id: TimelineId = object_name.parse().with_context(|| { format!("failed to parse object name into timeline id '{object_name}'") })?; - sync_ids.insert(ZTenantTimelineId { + sync_ids.insert(TenantTimelineId { tenant_id, timeline_id, }); @@ -439,7 +439,7 @@ mod tests { let harness = TenantHarness::create("download_timeline")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"]; let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), @@ -539,7 +539,7 @@ mod tests { async fn download_timeline_negatives() -> anyhow::Result<()> { let harness = TenantHarness::create("download_timeline_negatives")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), harness.conf.workdir.clone(), @@ -597,7 +597,7 @@ mod tests { 
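Editor's note: `TenantIndexParts` above records either a complete set of per-timeline index parts (`Present`) or the subset that failed to download (`Poisoned`). A sketch of that state transition with a hypothetical `DemoIndexParts`; the `Present`-to-`Poisoned` switch is rebuilt with `mem::replace` here, which is an assumption about the branch elided from the hunk:

    use std::collections::{HashMap, HashSet};

    enum DemoIndexParts {
        Present(HashMap<String, String>),
        Poisoned {
            present: HashMap<String, String>,
            missing: HashSet<String>,
        },
    }

    impl DemoIndexParts {
        fn add_poisoned(&mut self, timeline_id: &str) {
            // Take the current value out, then put the updated one back.
            let current = std::mem::replace(self, DemoIndexParts::Present(HashMap::new()));
            *self = match current {
                DemoIndexParts::Present(present) => DemoIndexParts::Poisoned {
                    present,
                    missing: HashSet::from([timeline_id.to_string()]),
                },
                DemoIndexParts::Poisoned { present, mut missing } => {
                    missing.insert(timeline_id.to_string());
                    DemoIndexParts::Poisoned { present, missing }
                }
            };
        }
    }

    fn main() {
        let mut parts = DemoIndexParts::Present(HashMap::from([(
            "timeline-1".to_string(),
            "index_part.json".to_string(),
        )]));
        parts.add_poisoned("timeline-2");
        match &parts {
            DemoIndexParts::Poisoned { present, missing } => {
                assert_eq!(present.len(), 1);
                assert!(missing.contains("timeline-2"));
            }
            DemoIndexParts::Present(_) => panic!("expected the poisoned variant"),
        }
    }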
#[tokio::test] async fn test_download_index_part() -> anyhow::Result<()> { let harness = TenantHarness::create("test_download_index_part")?; - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index cff14cde49..13495ffefe 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -17,8 +17,8 @@ use tracing::log::warn; use crate::{config::PageServerConf, tenant::metadata::TimelineMetadata}; use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use super::download::TenantIndexParts; @@ -49,7 +49,7 @@ impl RelativePath { } #[derive(Debug, Clone, Default)] -pub struct TenantEntry(HashMap); +pub struct TenantEntry(HashMap); impl TenantEntry { pub fn has_in_progress_downloads(&self) -> bool { @@ -59,7 +59,7 @@ impl TenantEntry { } impl Deref for TenantEntry { - type Target = HashMap; + type Target = HashMap; fn deref(&self) -> &Self::Target { &self.0 @@ -72,8 +72,8 @@ impl DerefMut for TenantEntry { } } -impl From> for TenantEntry { - fn from(inner: HashMap) -> Self { +impl From> for TenantEntry { + fn from(inner: HashMap) -> Self { Self(inner) } } @@ -81,7 +81,7 @@ impl From> for TenantEntry { /// An index to track tenant files that exist on the remote storage. #[derive(Debug, Clone, Default)] pub struct RemoteTimelineIndex { - entries: HashMap, + entries: HashMap, } /// A wrapper to synchronize the access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`]. 
@@ -91,9 +91,9 @@ pub struct RemoteIndex(Arc>); impl RemoteIndex { pub fn from_parts( conf: &'static PageServerConf, - index_parts: HashMap, + index_parts: HashMap, ) -> anyhow::Result { - let mut entries: HashMap = HashMap::new(); + let mut entries: HashMap = HashMap::new(); for (tenant_id, index_parts) in index_parts { match index_parts { @@ -136,30 +136,30 @@ impl Clone for RemoteIndex { impl RemoteTimelineIndex { pub fn timeline_entry( &self, - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: &ZTenantTimelineId, + }: &TenantTimelineId, ) -> Option<&RemoteTimeline> { self.entries.get(tenant_id)?.get(timeline_id) } pub fn timeline_entry_mut( &mut self, - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: &ZTenantTimelineId, + }: &TenantTimelineId, ) -> Option<&mut RemoteTimeline> { self.entries.get_mut(tenant_id)?.get_mut(timeline_id) } pub fn add_timeline_entry( &mut self, - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: ZTenantTimelineId, + }: TenantTimelineId, entry: RemoteTimeline, ) { self.entries @@ -170,10 +170,10 @@ impl RemoteTimelineIndex { pub fn remove_timeline_entry( &mut self, - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: ZTenantTimelineId, + }: TenantTimelineId, ) -> Option { self.entries .entry(tenant_id) @@ -181,25 +181,25 @@ impl RemoteTimelineIndex { .remove(&timeline_id) } - pub fn tenant_entry(&self, tenant_id: &ZTenantId) -> Option<&TenantEntry> { + pub fn tenant_entry(&self, tenant_id: &TenantId) -> Option<&TenantEntry> { self.entries.get(tenant_id) } - pub fn tenant_entry_mut(&mut self, tenant_id: &ZTenantId) -> Option<&mut TenantEntry> { + pub fn tenant_entry_mut(&mut self, tenant_id: &TenantId) -> Option<&mut TenantEntry> { self.entries.get_mut(tenant_id) } - pub fn add_tenant_entry(&mut self, tenant_id: ZTenantId) -> &mut TenantEntry { + pub fn add_tenant_entry(&mut self, tenant_id: TenantId) -> &mut TenantEntry { self.entries.entry(tenant_id).or_default() } - pub fn remove_tenant_entry(&mut self, tenant_id: &ZTenantId) -> Option { + pub fn remove_tenant_entry(&mut self, tenant_id: &TenantId) -> Option { self.entries.remove(tenant_id) } pub fn set_awaits_download( &mut self, - id: &ZTenantTimelineId, + id: &TenantTimelineId, awaits_download: bool, ) -> anyhow::Result<()> { self.timeline_entry_mut(id) diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index bd09e6b898..aa5a2232cf 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -8,7 +8,7 @@ use remote_storage::GenericRemoteStorage; use tokio::fs; use tracing::{debug, error, info, warn}; -use utils::zid::ZTenantTimelineId; +use utils::id::TenantTimelineId; use super::{ index::{IndexPart, RemoteTimeline}, @@ -21,7 +21,7 @@ use crate::{config::PageServerConf, storage_sync::SyncTask, tenant::metadata::me pub(super) async fn upload_index_part( conf: &'static PageServerConf, storage: &GenericRemoteStorage, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, index_part: IndexPart, ) -> anyhow::Result<()> { let index_part_bytes = serde_json::to_vec(&index_part) @@ -58,7 +58,7 @@ pub(super) async fn upload_timeline_layers<'a>( storage: &'a GenericRemoteStorage, sync_queue: &SyncQueue, remote_timeline: Option<&'a RemoteTimeline>, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, mut upload_data: SyncData, ) -> UploadedTimeline { let upload = &mut upload_data.data; @@ -213,7 +213,7 @@ mod tests { async fn regular_layer_upload() -> 
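Editor's note: `RemoteTimelineIndex` above is a two-level map, tenant id to `TenantEntry`, which is itself a map from timeline id to `RemoteTimeline`. A sketch of that lookup shape with string keys and a hypothetical `DemoIndex`:

    use std::collections::HashMap;

    // tenant id -> (timeline id -> entry); strings stand in for TenantId/TimelineId.
    #[derive(Default)]
    struct DemoIndex {
        entries: HashMap<String, HashMap<String, String>>,
    }

    impl DemoIndex {
        fn timeline_entry(&self, tenant_id: &str, timeline_id: &str) -> Option<&String> {
            // Missing tenant short-circuits to None, just like the `?` above.
            self.entries.get(tenant_id)?.get(timeline_id)
        }

        fn add_timeline_entry(&mut self, tenant_id: &str, timeline_id: &str, entry: String) {
            self.entries
                .entry(tenant_id.to_string())
                .or_default()
                .insert(timeline_id.to_string(), entry);
        }
    }

    fn main() {
        let mut index = DemoIndex::default();
        index.add_timeline_entry("tenant-a", "timeline-1", "index_part.json".to_string());
        assert!(index.timeline_entry("tenant-a", "timeline-1").is_some());
        assert!(index.timeline_entry("tenant-b", "timeline-1").is_none());
    }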
anyhow::Result<()> { let harness = TenantHarness::create("regular_layer_upload")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b"]; let storage = GenericRemoteStorage::new(LocalFs::new( @@ -301,7 +301,7 @@ mod tests { async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> { let harness = TenantHarness::create("layer_upload_after_local_fs_update")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a1", "b1"]; let storage = GenericRemoteStorage::new(LocalFs::new( @@ -395,7 +395,7 @@ mod tests { #[tokio::test] async fn test_upload_index_part() -> anyhow::Result<()> { let harness = TenantHarness::create("test_upload_index_part")?; - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 2aa803d119..dad6e0039d 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -51,7 +51,7 @@ use tracing::{debug, error, info, warn}; use once_cell::sync::Lazy; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::id::{TenantId, TimelineId}; use crate::shutdown_pageserver; @@ -210,8 +210,8 @@ pub enum TaskKind { #[derive(Default)] struct MutableTaskState { /// Tenant and timeline that this task is associated with. - tenant_id: Option, - timeline_id: Option, + tenant_id: Option, + timeline_id: Option, /// Handle for waiting for the task to exit. It can be None, if the /// the task has already exited. @@ -238,8 +238,8 @@ struct PageServerTask { pub fn spawn( runtime: &tokio::runtime::Handle, kind: TaskKind, - tenant_id: Option, - timeline_id: Option, + tenant_id: Option, + timeline_id: Option, name: &str, shutdown_process_on_error: bool, future: F, @@ -371,7 +371,7 @@ async fn task_finish( } // expected to be called from the task of the given id. -pub fn associate_with(tenant_id: Option, timeline_id: Option) { +pub fn associate_with(tenant_id: Option, timeline_id: Option) { CURRENT_TASK.with(|ct| { let mut task_mut = ct.mutable.lock().unwrap(); task_mut.tenant_id = tenant_id; @@ -391,12 +391,12 @@ pub fn associate_with(tenant_id: Option, timeline_id: Option, - tenant_id: Option, - timeline_id: Option, + tenant_id: Option, + timeline_id: Option, ) { let mut victim_tasks = Vec::new(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4ef810faba..41fd98ec07 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4,7 +4,7 @@ //! The functions here are responsible for locating the correct layer for the //! get/put call, walking back the timeline branching history as needed. //! -//! The files are stored in the .neon/tenants//timelines/ +//! The files are stored in the .neon/tenants//timelines/ //! directory. See docs/pageserver-storage.md for how the files are managed. //! In addition to the layer files, there is a metadata file in the same //! 
directory that contains information about the timeline, in particular its @@ -48,8 +48,8 @@ use crate::CheckpointConfig; use toml_edit; use utils::{ crashsafe_dir, + id::{TenantId, TimelineId}, lsn::{Lsn, RecordLsn}, - zid::{ZTenantId, ZTimelineId}, }; mod blob_io; @@ -80,7 +80,7 @@ pub use crate::tenant::metadata::save_metadata; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; -/// Parts of the `.neon/tenants//timelines/` directory prefix. +/// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; /// @@ -98,8 +98,8 @@ pub struct Tenant { // This is necessary to allow global config updates. tenant_conf: Arc>, - tenant_id: ZTenantId, - timelines: Mutex>>, + tenant_id: TenantId, + timelines: Mutex>>, // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding // `timelines` mutex during all GC iteration (especially with enforced checkpoint) @@ -134,7 +134,7 @@ pub enum TenantState { impl Tenant { /// Get Timeline handle for given zenith timeline ID. /// This function is idempotent. It doesn't change internal state in any way. - pub fn get_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result> { + pub fn get_timeline(&self, timeline_id: TimelineId) -> anyhow::Result> { self.timelines .lock() .unwrap() @@ -151,7 +151,7 @@ impl Tenant { /// Lists timelines the tenant contains. /// Up to tenant's implementation to omit certain timelines that ar not considered ready for use. - pub fn list_timelines(&self) -> Vec<(ZTimelineId, Arc)> { + pub fn list_timelines(&self) -> Vec<(TimelineId, Arc)> { self.timelines .lock() .unwrap() @@ -164,7 +164,7 @@ impl Tenant { /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. pub fn create_empty_timeline( &self, - new_timeline_id: ZTimelineId, + new_timeline_id: TimelineId, initdb_lsn: Lsn, ) -> Result> { // XXX: keep the lock to avoid races during timeline creation @@ -207,8 +207,8 @@ impl Tenant { /// Branch a timeline pub fn branch_timeline( &self, - src: ZTimelineId, - dst: ZTimelineId, + src: TimelineId, + dst: TimelineId, start_lsn: Option, ) -> Result> { // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn @@ -302,14 +302,14 @@ impl Tenant { /// this function is periodically called by gc task. /// also it can be explicitly requested through page server api 'do_gc' command. /// - /// 'timelineid' specifies the timeline to GC, or None for all. + /// 'target_timeline_id' specifies the timeline to GC, or None for all. /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC /// to make tests more deterministic. /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? 
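Editor's note: `list_timelines` above, and the compaction/checkpoint loops that follow, share one locking pattern: clone the `Arc` handles out of the `timelines` map, drop the lock, then do the slow per-timeline work. A sketch of that pattern with a hypothetical `DemoTimeline`:

    use std::collections::HashMap;
    use std::sync::{Arc, Mutex};

    struct DemoTimeline {
        id: String,
    }

    fn compact_all(timelines: &Mutex<HashMap<String, Arc<DemoTimeline>>>) {
        let to_compact: Vec<(String, Arc<DemoTimeline>)> = {
            let guard = timelines.lock().unwrap();
            guard
                .iter()
                .map(|(id, timeline)| (id.clone(), Arc::clone(timeline)))
                .collect()
        }; // the lock is released here, before the expensive part

        for (id, timeline) in &to_compact {
            // stand-in for timeline.compact()? or timeline.checkpoint(...)?
            println!("compacting timeline {} ({})", id, timeline.id);
        }
    }

    fn main() {
        let map = Mutex::new(HashMap::from([(
            "aa223344556677881122334455667788".to_string(),
            Arc::new(DemoTimeline { id: "demo".to_string() }),
        )]));
        compact_all(&map);
    }

Cloning the handles first keeps the map lock from being held across long compaction or checkpoint calls.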
pub fn gc_iteration( &self, - target_timeline_id: Option, + target_timeline_id: Option, horizon: u64, pitr: Duration, checkpoint_before_gc: bool, @@ -337,13 +337,13 @@ impl Tenant { let timelines = self.timelines.lock().unwrap(); let timelines_to_compact = timelines .iter() - .map(|(timelineid, timeline)| (*timelineid, timeline.clone())) + .map(|(timeline_id, timeline)| (*timeline_id, timeline.clone())) .collect::>(); drop(timelines); - for (timelineid, timeline) in &timelines_to_compact { + for (timeline_id, timeline) in &timelines_to_compact { let _entered = - info_span!("compact", timeline = %timelineid, tenant = %self.tenant_id).entered(); + info_span!("compact", timeline = %timeline_id, tenant = %self.tenant_id).entered(); timeline.compact()?; } @@ -362,13 +362,13 @@ impl Tenant { let timelines = self.timelines.lock().unwrap(); let timelines_to_compact = timelines .iter() - .map(|(timelineid, timeline)| (*timelineid, Arc::clone(timeline))) + .map(|(timeline_id, timeline)| (*timeline_id, Arc::clone(timeline))) .collect::>(); drop(timelines); - for (timelineid, timeline) in &timelines_to_compact { + for (timeline_id, timeline) in &timelines_to_compact { let _entered = - info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenant_id) + info_span!("checkpoint", timeline = %timeline_id, tenant = %self.tenant_id) .entered(); timeline.checkpoint(CheckpointConfig::Flush)?; } @@ -377,7 +377,7 @@ impl Tenant { } /// Removes timeline-related in-memory data - pub fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> { + pub fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> { // in order to be retriable detach needs to be idempotent // (or at least to a point that each time the detach is called it can make progress) let mut timelines = self.timelines.lock().unwrap(); @@ -416,7 +416,7 @@ impl Tenant { pub fn init_attach_timelines( &self, - timelines: HashMap, + timelines: HashMap, ) -> anyhow::Result<()> { let sorted_timelines = if timelines.len() == 1 { timelines.into_iter().collect() @@ -505,13 +505,13 @@ impl Tenant { /// perform a topological sort, so that the parent of each timeline comes /// before the children. fn tree_sort_timelines( - timelines: HashMap, -) -> Result> { + timelines: HashMap, +) -> Result> { let mut result = Vec::with_capacity(timelines.len()); let mut now = Vec::with_capacity(timelines.len()); // (ancestor, children) - let mut later: HashMap> = + let mut later: HashMap> = HashMap::with_capacity(timelines.len()); for (timeline_id, metadata) in timelines { @@ -636,9 +636,9 @@ impl Tenant { fn initialize_new_timeline( &self, - new_timeline_id: ZTimelineId, + new_timeline_id: TimelineId, new_metadata: TimelineMetadata, - timelines: &mut MutexGuard>>, + timelines: &mut MutexGuard>>, ) -> anyhow::Result> { let ancestor = match new_metadata.ancestor_timeline() { Some(ancestor_timeline_id) => Some( @@ -680,7 +680,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_conf: TenantConfOpt, walredo_mgr: Arc, - tenant_id: ZTenantId, + tenant_id: TenantId, remote_index: RemoteIndex, upload_layers: bool, ) -> Tenant { @@ -701,7 +701,7 @@ impl Tenant { /// Locate and load config pub fn load_tenant_config( conf: &'static PageServerConf, - tenant_id: ZTenantId, + tenant_id: TenantId, ) -> anyhow::Result { let target_config_path = TenantConf::path(conf, tenant_id); let target_config_display = target_config_path.display(); @@ -830,7 +830,7 @@ impl Tenant { // we do. 
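[Editor's sketch] The `tree_sort_timelines` helper above orders timelines so that every ancestor is handled before its children. A small self-contained sketch of that idea, with plain integers standing in for `TimelineId` and `Option<u64>` for the ancestor recorded in `TimelineMetadata`; the real function returns a `Result` and its error handling is omitted here:

    // Sketch only: emit roots first, then each child once its ancestor is out.
    use std::collections::HashMap;

    fn tree_sort(timelines: HashMap<u64, Option<u64>>) -> Vec<u64> {
        let mut result = Vec::with_capacity(timelines.len());
        // timelines ready to be emitted (no ancestor, or ancestor already emitted)
        let mut now: Vec<u64> = Vec::new();
        // ancestor -> children still waiting for it
        let mut later: HashMap<u64, Vec<u64>> = HashMap::new();

        for (id, ancestor) in timelines {
            match ancestor {
                None => now.push(id),
                Some(a) => later.entry(a).or_default().push(id),
            }
        }
        while let Some(id) = now.pop() {
            result.push(id);
            if let Some(children) = later.remove(&id) {
                now.extend(children);
            }
        }
        result // anything left in `later` would mean a missing ancestor
    }

    fn main() {
        let timelines = HashMap::from([(1, None), (2, Some(1)), (3, Some(2))]);
        assert_eq!(tree_sort(timelines), vec![1, 2, 3]);
    }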
fn gc_iteration_internal( &self, - target_timeline_id: Option, + target_timeline_id: Option, horizon: u64, pitr: Duration, checkpoint_before_gc: bool, @@ -848,7 +848,7 @@ impl Tenant { // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. - let mut all_branchpoints: BTreeSet<(ZTimelineId, Lsn)> = BTreeSet::new(); + let mut all_branchpoints: BTreeSet<(TimelineId, Lsn)> = BTreeSet::new(); let timeline_ids = { if let Some(target_timeline_id) = target_timeline_id.as_ref() { if timelines.get(target_timeline_id).is_none() { @@ -861,11 +861,11 @@ impl Tenant { .map(|(timeline_id, timeline_entry)| { // This is unresolved question for now, how to do gc in presence of remote timelines // especially when this is combined with branching. - // Somewhat related: https://github.com/zenithdb/zenith/issues/999 + // Somewhat related: https://github.com/neondatabase/neon/issues/999 if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timelineid) = target_timeline_id { - if ancestor_timeline_id == &timelineid { + if let Some(timeline_id) = target_timeline_id { + if ancestor_timeline_id == &timeline_id { all_branchpoints.insert(( *ancestor_timeline_id, timeline_entry.get_ancestor_lsn(), @@ -895,8 +895,8 @@ impl Tenant { .with_context(|| format!("Timeline {timeline_id} was not found"))?; // If target_timeline is specified, ignore all other timelines - if let Some(target_timelineid) = target_timeline_id { - if timeline_id != target_timelineid { + if let Some(target_timeline_id) = target_timeline_id { + if timeline_id != target_timeline_id { continue; } } @@ -952,7 +952,7 @@ impl Tenant { Ok(totals) } - pub fn tenant_id(&self) -> ZTenantId { + pub fn tenant_id(&self) -> TenantId { self.tenant_id } } @@ -998,7 +998,7 @@ pub mod harness { config::PageServerConf, repository::Key, tenant::Tenant, - walrecord::ZenithWalRecord, + walrecord::NeonWalRecord, walredo::{WalRedoError, WalRedoManager}, }; @@ -1006,12 +1006,12 @@ pub mod harness { use super::*; use crate::tenant_config::{TenantConf, TenantConfOpt}; use hex_literal::hex; - use utils::zid::{ZTenantId, ZTimelineId}; + use utils::id::{TenantId, TimelineId}; - pub const TIMELINE_ID: ZTimelineId = - ZTimelineId::from_array(hex!("11223344556677881122334455667788")); - pub const NEW_TIMELINE_ID: ZTimelineId = - ZTimelineId::from_array(hex!("AA223344556677881122334455667788")); + pub const TIMELINE_ID: TimelineId = + TimelineId::from_array(hex!("11223344556677881122334455667788")); + pub const NEW_TIMELINE_ID: TimelineId = + TimelineId::from_array(hex!("AA223344556677881122334455667788")); /// Convenience function to create a page image with given string as the only content #[allow(non_snake_case)] @@ -1047,7 +1047,7 @@ pub mod harness { pub struct TenantHarness<'a> { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, - pub tenant_id: ZTenantId, + pub tenant_id: TenantId, pub lock_guard: ( Option>, @@ -1080,7 +1080,7 @@ pub mod harness { let tenant_conf = TenantConf::dummy_conf(); - let tenant_id = ZTenantId::generate(); + let tenant_id = TenantId::generate(); fs::create_dir_all(conf.tenant_path(&tenant_id))?; fs::create_dir_all(conf.timelines_path(&tenant_id))?; @@ -1113,7 +1113,7 @@ pub mod harness { .expect("should be able to read timelines dir") { let timeline_dir_entry = timeline_dir_entry?; - let timeline_id: ZTimelineId = timeline_dir_entry + let timeline_id: 
TimelineId = timeline_dir_entry .path() .file_name() .unwrap() @@ -1128,15 +1128,15 @@ pub mod harness { Ok(tenant) } - pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { + pub fn timeline_path(&self, timeline_id: &TimelineId) -> PathBuf { self.conf.timeline_path(timeline_id, &self.tenant_id) } } fn load_metadata( conf: &'static PageServerConf, - timeline_id: ZTimelineId, - tenant_id: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, ) -> anyhow::Result { let metadata_path = metadata_path(conf, timeline_id, tenant_id); let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { @@ -1162,7 +1162,7 @@ pub mod harness { key: Key, lsn: Lsn, base_img: Option, - records: Vec<(Lsn, ZenithWalRecord)>, + records: Vec<(Lsn, NeonWalRecord)>, ) -> Result { let s = format!( "redo for {} to get to {}, with {} and {} records", @@ -1747,7 +1747,7 @@ mod tests { let mut tline_id = TIMELINE_ID; for _ in 0..50 { - let new_tline_id = ZTimelineId::generate(); + let new_tline_id = TimelineId::generate(); tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; tline = tenant .get_timeline(new_tline_id) @@ -1808,7 +1808,7 @@ mod tests { #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { - let new_tline_id = ZTimelineId::generate(); + let new_tline_id = TimelineId::generate(); tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; tline = tenant .get_timeline(new_tline_id) diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index ff6d3652f9..892000c20b 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -7,7 +7,7 @@ //! must be page images or WAL records with the 'will_init' flag set, so that //! they can be replayed without referring to an older page version. //! -//! The delta files are stored in timelines/ directory. Currently, +//! The delta files are stored in timelines/ directory. Currently, //! there are no subdirectories, and each delta file is named like this: //! //! 
-__-, lsn_range: Range, @@ -81,8 +81,8 @@ impl From<&DeltaLayer> for Summary { magic: DELTA_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenantid: layer.tenantid, - timelineid: layer.timelineid, + tenant_id: layer.tenant_id, + timeline_id: layer.timeline_id, key_range: layer.key_range.clone(), lsn_range: layer.lsn_range.clone(), @@ -173,8 +173,8 @@ impl DeltaKey { pub struct DeltaLayer { path_or_conf: PathOrConf, - pub tenantid: ZTenantId, - pub timelineid: ZTimelineId, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, pub key_range: Range, pub lsn_range: Range, @@ -194,12 +194,12 @@ pub struct DeltaLayerInner { } impl Layer for DeltaLayer { - fn get_tenant_id(&self) -> ZTenantId { - self.tenantid + fn get_tenant_id(&self) -> TenantId { + self.tenant_id } - fn get_timeline_id(&self) -> ZTimelineId { - self.timelineid + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id } fn get_key_range(&self) -> Range { @@ -344,8 +344,8 @@ impl Layer for DeltaLayer { fn dump(&self, verbose: bool) -> Result<()> { println!( "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----", - self.tenantid, - self.timelineid, + self.tenant_id, + self.timeline_id, self.key_range.start, self.key_range.end, self.lsn_range.start, @@ -419,22 +419,22 @@ impl Layer for DeltaLayer { impl DeltaLayer { fn path_for( path_or_conf: &PathOrConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, fname: &DeltaFileName, ) -> PathBuf { match path_or_conf { PathOrConf::Path(path) => path.clone(), PathOrConf::Conf(conf) => conf - .timeline_path(&timelineid, &tenantid) + .timeline_path(&timeline_id, &tenant_id) .join(fname.to_string()), } } fn temp_path_for( conf: &PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_start: Key, lsn_range: &Range, ) -> PathBuf { @@ -444,7 +444,7 @@ impl DeltaLayer { .map(char::from) .collect(); - conf.timeline_path(&timelineid, &tenantid).join(format!( + conf.timeline_path(&timeline_id, &tenant_id).join(format!( "{}-XXX__{:016X}-{:016X}.{}.{}", key_start, u64::from(lsn_range.start), @@ -535,14 +535,14 @@ impl DeltaLayer { /// Create a DeltaLayer struct representing an existing file on disk. 
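[Editor's sketch] `temp_path_for` above assembles the temporary delta-layer file name from the start key, the LSN range as 16-digit hex, a random tag, and the temp suffix; the finished layer is then renamed to the `<key start>-<key end>__<LSN start>-<LSN end>` form described in the module comment. A hedged sketch of that composition, where the key and `TEMP_FILE_SUFFIX` are simplified stand-ins rather than the patch's own types and constants:

    // Sketch of the temp delta-layer file name built in temp_path_for().
    fn temp_delta_file_name(key_start: &str, lsn_start: u64, lsn_end: u64, rand_tag: &str) -> String {
        const TEMP_FILE_SUFFIX: &str = "temp"; // assumption for illustration
        format!(
            "{}-XXX__{:016X}-{:016X}.{}.{}",
            key_start, lsn_start, lsn_end, rand_tag, TEMP_FILE_SUFFIX
        )
    }

    fn main() {
        let name = temp_delta_file_name(
            "000000000000000000000000000000000000",
            0x0000_0169_0000_0000,
            0x0000_0169_0000_2000,
            "ab12cd",
        );
        // 000000000000000000000000000000000000-XXX__0000016900000000-0000016900002000.ab12cd.temp
        println!("{name}");
    }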
pub fn new( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, filename: &DeltaFileName, ) -> DeltaLayer { DeltaLayer { path_or_conf: PathOrConf::Conf(conf), - timelineid, - tenantid, + timeline_id, + tenant_id, key_range: filename.key_range.clone(), lsn_range: filename.lsn_range.clone(), inner: RwLock::new(DeltaLayerInner { @@ -568,8 +568,8 @@ impl DeltaLayer { Ok(DeltaLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), - timelineid: summary.timelineid, - tenantid: summary.tenantid, + timeline_id: summary.timeline_id, + tenant_id: summary.tenant_id, key_range: summary.key_range, lsn_range: summary.lsn_range, inner: RwLock::new(DeltaLayerInner { @@ -592,8 +592,8 @@ impl DeltaLayer { pub fn path(&self) -> PathBuf { Self::path_for( &self.path_or_conf, - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, &self.layer_name(), ) } @@ -613,8 +613,8 @@ impl DeltaLayer { pub struct DeltaLayerWriter { conf: &'static PageServerConf, path: PathBuf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_start: Key, lsn_range: Range, @@ -630,8 +630,8 @@ impl DeltaLayerWriter { /// pub fn new( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_start: Key, lsn_range: Range, ) -> Result { @@ -641,7 +641,7 @@ impl DeltaLayerWriter { // // Note: This overwrites any existing file. There shouldn't be any. // FIXME: throw an error instead? - let path = DeltaLayer::temp_path_for(conf, timelineid, tenantid, key_start, &lsn_range); + let path = DeltaLayer::temp_path_for(conf, timeline_id, tenant_id, key_start, &lsn_range); let mut file = VirtualFile::create(&path)?; // make room for the header block @@ -656,8 +656,8 @@ impl DeltaLayerWriter { Ok(DeltaLayerWriter { conf, path, - timelineid, - tenantid, + timeline_id, + tenant_id, key_start, lsn_range, tree: tree_builder, @@ -718,8 +718,8 @@ impl DeltaLayerWriter { let summary = Summary { magic: DELTA_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenantid: self.tenantid, - timelineid: self.timelineid, + tenant_id: self.tenant_id, + timeline_id: self.timeline_id, key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), index_start_blk, @@ -733,8 +733,8 @@ impl DeltaLayerWriter { // set inner.file here. The first read will have to re-open it. let layer = DeltaLayer { path_or_conf: PathOrConf::Conf(self.conf), - tenantid: self.tenantid, - timelineid: self.timelineid, + tenant_id: self.tenant_id, + timeline_id: self.timeline_id, key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), inner: RwLock::new(DeltaLayerInner { @@ -753,8 +753,8 @@ impl DeltaLayerWriter { // FIXME: throw an error instead? 
let final_path = DeltaLayer::path_for( &PathOrConf::Conf(self.conf), - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, &DeltaFileName { key_range: self.key_start..key_end, lsn_range: self.lsn_range, diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index c675e4e778..0774fa42a6 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -17,7 +17,7 @@ use std::ops::DerefMut; use std::path::PathBuf; use std::sync::{Arc, RwLock}; use tracing::*; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::id::{TenantId, TimelineId}; use std::os::unix::fs::FileExt; @@ -39,8 +39,8 @@ pub struct EphemeralFiles { pub struct EphemeralFile { file_id: u64, - _tenantid: ZTenantId, - _timelineid: ZTimelineId, + _tenant_id: TenantId, + _timeline_id: TimelineId, file: Arc, pub size: u64, @@ -49,15 +49,15 @@ pub struct EphemeralFile { impl EphemeralFile { pub fn create( conf: &PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, ) -> Result { let mut l = EPHEMERAL_FILES.write().unwrap(); let file_id = l.next_file_id; l.next_file_id += 1; let filename = conf - .timeline_path(&timelineid, &tenantid) + .timeline_path(&timeline_id, &tenant_id) .join(PathBuf::from(format!("ephemeral-{}", file_id))); let file = VirtualFile::open_with_options( @@ -69,8 +69,8 @@ impl EphemeralFile { Ok(EphemeralFile { file_id, - _tenantid: tenantid, - _timelineid: timelineid, + _tenant_id: tenant_id, + _timeline_id: timeline_id, file: file_rc, size: 0, }) @@ -338,7 +338,7 @@ mod tests { fn harness( test_name: &str, - ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), io::Error> { + ) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> { let repo_dir = PageServerConf::test_repo_dir(test_name); let _ = fs::remove_dir_all(&repo_dir); let conf = PageServerConf::dummy_conf(repo_dir); @@ -346,11 +346,11 @@ mod tests { // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap(); - let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap(); - fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?; + let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap(); + let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); + fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?; - Ok((conf, tenantid, timelineid)) + Ok((conf, tenant_id, timeline_id)) } // Helper function to slurp contents of a file, starting at the current position, @@ -368,9 +368,9 @@ mod tests { #[test] fn test_ephemeral_files() -> Result<(), io::Error> { - let (conf, tenantid, timelineid) = harness("ephemeral_files")?; + let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?; - let file_a = EphemeralFile::create(conf, tenantid, timelineid)?; + let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?; file_a.write_all_at(b"foo", 0)?; assert_eq!("foo", read_string(&file_a, 0, 20)?); @@ -381,7 +381,7 @@ mod tests { // Open a lot of files, enough to cause some page evictions. 
let mut efiles = Vec::new(); for fileno in 0..100 { - let efile = EphemeralFile::create(conf, tenantid, timelineid)?; + let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?; efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?; assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?); efiles.push((fileno, efile)); @@ -399,9 +399,9 @@ mod tests { #[test] fn test_ephemeral_blobs() -> Result<(), io::Error> { - let (conf, tenantid, timelineid) = harness("ephemeral_blobs")?; + let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(conf, tenantid, timelineid)?; + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?; let pos_foo = file.write_blob(b"foo")?; assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs index 518643241d..92bf022fee 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -4,7 +4,7 @@ //! but does not exist in the layer, does not exist. //! //! An image layer is stored in a file on disk. The file is stored in -//! timelines/ directory. Currently, there are no +//! timelines/ directory. Currently, there are no //! subdirectories, and each image layer file is named like this: //! //! -__ @@ -44,8 +44,8 @@ use tracing::*; use utils::{ bin_ser::BeSer, + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; /// @@ -56,12 +56,12 @@ use utils::{ /// #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] struct Summary { - /// Magic value to identify this as a zenith image file. Always IMAGE_FILE_MAGIC. + /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC. 
magic: u16, format_version: u16, - tenantid: ZTenantId, - timelineid: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, key_range: Range, lsn: Lsn, @@ -77,8 +77,8 @@ impl From<&ImageLayer> for Summary { Self { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenantid: layer.tenantid, - timelineid: layer.timelineid, + tenant_id: layer.tenant_id, + timeline_id: layer.timeline_id, key_range: layer.key_range.clone(), lsn: layer.lsn, @@ -97,8 +97,8 @@ impl From<&ImageLayer> for Summary { /// pub struct ImageLayer { path_or_conf: PathOrConf, - pub tenantid: ZTenantId, - pub timelineid: ZTimelineId, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, pub key_range: Range, // This entry contains an image of all pages as of this LSN @@ -128,12 +128,12 @@ impl Layer for ImageLayer { Some(self.path()) } - fn get_tenant_id(&self) -> ZTenantId { - self.tenantid + fn get_tenant_id(&self) -> TenantId { + self.tenant_id } - fn get_timeline_id(&self) -> ZTimelineId { - self.timelineid + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id } fn get_key_range(&self) -> Range { @@ -202,7 +202,7 @@ impl Layer for ImageLayer { fn dump(&self, verbose: bool) -> Result<()> { println!( "----- image layer for ten {} tli {} key {}-{} at {} ----", - self.tenantid, self.timelineid, self.key_range.start, self.key_range.end, self.lsn + self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn ); if !verbose { @@ -228,22 +228,22 @@ impl Layer for ImageLayer { impl ImageLayer { fn path_for( path_or_conf: &PathOrConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, fname: &ImageFileName, ) -> PathBuf { match path_or_conf { PathOrConf::Path(path) => path.to_path_buf(), PathOrConf::Conf(conf) => conf - .timeline_path(&timelineid, &tenantid) + .timeline_path(&timeline_id, &tenant_id) .join(fname.to_string()), } } fn temp_path_for( conf: &PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, fname: &ImageFileName, ) -> PathBuf { let rand_string: String = rand::thread_rng() @@ -252,7 +252,7 @@ impl ImageLayer { .map(char::from) .collect(); - conf.timeline_path(&timelineid, &tenantid) + conf.timeline_path(&timeline_id, &tenant_id) .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}")) } @@ -336,14 +336,14 @@ impl ImageLayer { /// Create an ImageLayer struct representing an existing file on disk pub fn new( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, filename: &ImageFileName, ) -> ImageLayer { ImageLayer { path_or_conf: PathOrConf::Conf(conf), - timelineid, - tenantid, + timeline_id, + tenant_id, key_range: filename.key_range.clone(), lsn: filename.lsn, inner: RwLock::new(ImageLayerInner { @@ -369,8 +369,8 @@ impl ImageLayer { Ok(ImageLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), - timelineid: summary.timelineid, - tenantid: summary.tenantid, + timeline_id: summary.timeline_id, + tenant_id: summary.tenant_id, key_range: summary.key_range, lsn: summary.lsn, inner: RwLock::new(ImageLayerInner { @@ -393,8 +393,8 @@ impl ImageLayer { pub fn path(&self) -> PathBuf { Self::path_for( &self.path_or_conf, - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, &self.layer_name(), ) } @@ -414,8 +414,8 @@ impl ImageLayer { pub struct ImageLayerWriter { conf: &'static PageServerConf, path: PathBuf, - timelineid: ZTimelineId, - tenantid: 
ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_range: Range, lsn: Lsn, @@ -426,8 +426,8 @@ pub struct ImageLayerWriter { impl ImageLayerWriter { pub fn new( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_range: &Range, lsn: Lsn, ) -> anyhow::Result { @@ -435,8 +435,8 @@ impl ImageLayerWriter { // We'll atomically rename it to the final name when we're done. let path = ImageLayer::temp_path_for( conf, - timelineid, - tenantid, + timeline_id, + tenant_id, &ImageFileName { key_range: key_range.clone(), lsn, @@ -458,8 +458,8 @@ impl ImageLayerWriter { let writer = ImageLayerWriter { conf, path, - timelineid, - tenantid, + timeline_id, + tenant_id, key_range: key_range.clone(), lsn, tree: tree_builder, @@ -502,8 +502,8 @@ impl ImageLayerWriter { let summary = Summary { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenantid: self.tenantid, - timelineid: self.timelineid, + tenant_id: self.tenant_id, + timeline_id: self.timeline_id, key_range: self.key_range.clone(), lsn: self.lsn, index_start_blk, @@ -517,8 +517,8 @@ impl ImageLayerWriter { // set inner.file here. The first read will have to re-open it. let layer = ImageLayer { path_or_conf: PathOrConf::Conf(self.conf), - timelineid: self.timelineid, - tenantid: self.tenantid, + timeline_id: self.timeline_id, + tenant_id: self.tenant_id, key_range: self.key_range.clone(), lsn: self.lsn, inner: RwLock::new(ImageLayerInner { @@ -538,8 +538,8 @@ impl ImageLayerWriter { // FIXME: throw an error instead? let final_path = ImageLayer::path_for( &PathOrConf::Conf(self.conf), - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, &ImageFileName { key_range: self.key_range.clone(), lsn: self.lsn, diff --git a/pageserver/src/tenant/inmemory_layer.rs b/pageserver/src/tenant/inmemory_layer.rs index 0e7b215b1e..9aa33a72ca 100644 --- a/pageserver/src/tenant/inmemory_layer.rs +++ b/pageserver/src/tenant/inmemory_layer.rs @@ -18,9 +18,9 @@ use std::collections::HashMap; use tracing::*; use utils::{ bin_ser::BeSer, + id::{TenantId, TimelineId}, lsn::Lsn, vec_map::VecMap, - zid::{ZTenantId, ZTimelineId}, }; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods @@ -37,8 +37,8 @@ thread_local! { pub struct InMemoryLayer { conf: &'static PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, /// /// This layer contains all the changes from 'start_lsn'. 
The @@ -94,12 +94,12 @@ impl Layer for InMemoryLayer { None } - fn get_tenant_id(&self) -> ZTenantId { - self.tenantid + fn get_tenant_id(&self) -> TenantId { + self.tenant_id } - fn get_timeline_id(&self) -> ZTimelineId { - self.timelineid + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id } fn get_key_range(&self) -> Range { @@ -197,7 +197,7 @@ impl Layer for InMemoryLayer { println!( "----- in-memory layer for tli {} LSNs {}-{} ----", - self.timelineid, self.start_lsn, end_str, + self.timeline_id, self.start_lsn, end_str, ); if !verbose { @@ -251,22 +251,18 @@ impl InMemoryLayer { /// pub fn create( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, start_lsn: Lsn, ) -> Result { - trace!( - "initializing new empty InMemoryLayer for writing on timeline {} at {}", - timelineid, - start_lsn - ); + trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenantid, timelineid)?; + let file = EphemeralFile::create(conf, tenant_id, timeline_id)?; Ok(InMemoryLayer { conf, - timelineid, - tenantid, + timeline_id, + tenant_id, start_lsn, inner: RwLock::new(InMemoryLayerInner { end_lsn: None, @@ -281,7 +277,7 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { - trace!("put_value key {} at {}/{}", key, self.timelineid, lsn); + trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let mut inner = self.inner.write().unwrap(); inner.assert_writeable(); @@ -344,8 +340,8 @@ impl InMemoryLayer { let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, Key::MIN, self.start_lsn..inner.end_lsn.unwrap(), )?; diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index c24e3976fb..8abeebf54c 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -2,7 +2,7 @@ //! The layer map tracks what layers exist in a timeline. //! //! When the timeline is first accessed, the server lists of all layer files -//! in the timelines/ directory, and populates this map with +//! in the timelines/ directory, and populates this map with //! ImageLayer and DeltaLayer structs corresponding to each file. When the first //! new WAL record is received, we create an InMemoryLayer to hold the incoming //! records. 
Now and then, in the checkpoint() function, the in-memory layer is diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 4ea2b7d55b..ace4dc91e9 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -15,8 +15,8 @@ use serde::{Deserialize, Serialize}; use tracing::info_span; use utils::{ bin_ser::BeSer, + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; use crate::config::PageServerConf; @@ -63,7 +63,7 @@ struct TimelineMetadataBody { // doing a clean shutdown, so that there is no more WAL beyond // 'disk_consistent_lsn' prev_record_lsn: Option, - ancestor_timeline: Option, + ancestor_timeline: Option, ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, @@ -73,7 +73,7 @@ impl TimelineMetadata { pub fn new( disk_consistent_lsn: Lsn, prev_record_lsn: Option, - ancestor_timeline: Option, + ancestor_timeline: Option, ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, @@ -149,7 +149,7 @@ impl TimelineMetadata { self.body.prev_record_lsn } - pub fn ancestor_timeline(&self) -> Option { + pub fn ancestor_timeline(&self) -> Option { self.body.ancestor_timeline } @@ -170,23 +170,23 @@ impl TimelineMetadata { /// where certain timeline's metadata file should be located. pub fn metadata_path( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, ) -> PathBuf { - conf.timeline_path(&timelineid, &tenantid) + conf.timeline_path(&timeline_id, &tenant_id) .join(METADATA_FILE_NAME) } /// Save timeline metadata to file pub fn save_metadata( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, data: &TimelineMetadata, first_save: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving metadata").entered(); - let path = metadata_path(conf, timelineid, tenantid); + let path = metadata_path(conf, timeline_id, tenant_id); // use OpenOptions to ensure file presence is consistent with first_save let mut file = VirtualFile::open_with_options( &path, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index e10330bdd3..8dafcab124 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -3,15 +3,15 @@ //! 
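[Editor's sketch] `metadata_path` above joins `METADATA_FILE_NAME` onto the per-timeline directory, so each timeline's metadata sits next to its layer files. A sketch of the resulting layout; the literal segment names are written out for illustration and are assumptions, since the patch derives them from `conf.timeline_path()` and the crate's constants:

    // Sketch: <workdir>/tenants/<tenant_id>/timelines/<timeline_id>/metadata
    use std::path::PathBuf;

    fn metadata_path(workdir: &str, tenant_id: &str, timeline_id: &str) -> PathBuf {
        PathBuf::from(workdir)
            .join("tenants")
            .join(tenant_id)
            .join("timelines")
            .join(timeline_id)
            .join("metadata")
    }

    fn main() {
        let p = metadata_path(
            ".neon",
            "11000000000000000000000000000000",
            "22000000000000000000000000000000",
        );
        println!("{}", p.display());
    }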
use crate::repository::{Key, Value}; -use crate::walrecord::ZenithWalRecord; +use crate::walrecord::NeonWalRecord; use anyhow::Result; use bytes::Bytes; use std::ops::Range; use std::path::PathBuf; use utils::{ + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; pub fn range_overlaps(a: &Range, b: &Range) -> bool @@ -50,7 +50,7 @@ where /// #[derive(Debug)] pub struct ValueReconstructState { - pub records: Vec<(Lsn, ZenithWalRecord)>, + pub records: Vec<(Lsn, NeonWalRecord)>, pub img: Option<(Lsn, Bytes)>, } @@ -84,10 +84,10 @@ pub enum ValueReconstructResult { /// LSN /// pub trait Layer: Send + Sync { - fn get_tenant_id(&self) -> ZTenantId; + fn get_tenant_id(&self) -> TenantId; /// Identify the timeline this layer belongs to - fn get_timeline_id(&self) -> ZTimelineId; + fn get_timeline_id(&self) -> TimelineId; /// Range of keys that this layer covers fn get_key_range(&self) -> Range; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c96ad99909..e821ef1b9a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -39,10 +39,10 @@ use crate::tenant_config::TenantConfOpt; use postgres_ffi::v14::xlog_utils::to_pg_timestamp; use utils::{ + id::{TenantId, TimelineId}, lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, simple_rcu::{Rcu, RcuReadGuard}, - zid::{ZTenantId, ZTimelineId}, }; use crate::repository::GcResult; @@ -58,8 +58,8 @@ pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, - pub tenant_id: ZTenantId, - pub timeline_id: ZTimelineId, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, pub layers: RwLock, @@ -312,7 +312,7 @@ impl Timeline { } /// Get the ancestor's timeline id - pub fn get_ancestor_timeline_id(&self) -> Option { + pub fn get_ancestor_timeline_id(&self) -> Option { self.ancestor_timeline .as_ref() .map(|ancestor| ancestor.timeline_id) @@ -531,8 +531,8 @@ impl Timeline { tenant_conf: Arc>, metadata: TimelineMetadata, ancestor: Option>, - timeline_id: ZTimelineId, - tenant_id: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, walredo_mgr: Arc, upload_layers: bool, ) -> Timeline { @@ -1250,7 +1250,7 @@ impl Timeline { None }; - let ancestor_timelineid = self + let ancestor_timeline_id = self .ancestor_timeline .as_ref() .map(|ancestor| ancestor.timeline_id); @@ -1258,7 +1258,7 @@ impl Timeline { let metadata = TimelineMetadata::new( disk_consistent_lsn, ondisk_prev_record_lsn, - ancestor_timelineid, + ancestor_timeline_id, self.ancestor_lsn, *self.latest_gc_cutoff_lsn.read(), self.initdb_lsn, diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 73bf3636d2..4448ffc456 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -13,7 +13,7 @@ use serde::{Deserialize, Serialize}; use std::num::NonZeroU64; use std::path::PathBuf; use std::time::Duration; -use utils::zid::ZTenantId; +use utils::id::TenantId; pub const TENANT_CONFIG_NAME: &str = "config"; @@ -217,8 +217,8 @@ impl TenantConf { /// Points to a place in pageserver's local directory, /// where certain tenant's tenantconf file should be located. 
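[Editor's sketch] The `Layer` trait hunks above only rename the identity accessors, but this rendering has dropped the generic parameters from the signatures (e.g. `Range` lost its `Key`). A hedged reconstruction of those method shapes, with stand-in types, inferred from the surrounding hunks rather than quoted:

    #![allow(dead_code)]
    // Sketch only: Key, TenantId and TimelineId are stand-ins.
    use std::ops::Range;

    type Key = u128;
    #[derive(Clone, Copy)] struct TenantId(u128);
    #[derive(Clone, Copy)] struct TimelineId(u128);

    trait Layer: Send + Sync {
        /// Identify the tenant this layer belongs to
        fn get_tenant_id(&self) -> TenantId;
        /// Identify the timeline this layer belongs to
        fn get_timeline_id(&self) -> TimelineId;
        /// Range of keys that this layer covers
        fn get_key_range(&self) -> Range<Key>;
    }

    fn main() {}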
- pub fn path(conf: &'static PageServerConf, tenantid: ZTenantId) -> PathBuf { - conf.tenant_path(&tenantid).join(TENANT_CONFIG_NAME) + pub fn path(conf: &'static PageServerConf, tenant_id: TenantId) -> PathBuf { + conf.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME) } #[cfg(test)] diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index a8a9926c77..d6fa843305 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -27,7 +27,7 @@ use crate::walredo::PostgresRedoManager; use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; use utils::crashsafe_dir; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::id::{TenantId, TimelineId}; mod tenants_state { use once_cell::sync::Lazy; @@ -35,20 +35,20 @@ mod tenants_state { collections::HashMap, sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, }; - use utils::zid::ZTenantId; + use utils::id::TenantId; use crate::tenant::Tenant; - static TENANTS: Lazy>>> = + static TENANTS: Lazy>>> = Lazy::new(|| RwLock::new(HashMap::new())); - pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap>> { + pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap>> { TENANTS .read() .expect("Failed to read() tenants lock, it got poisoned") } - pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap>> { + pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap>> { TENANTS .write() .expect("Failed to write() tenants lock, it got poisoned") @@ -159,7 +159,7 @@ pub fn attach_local_tenants( fn load_local_tenant( conf: &'static PageServerConf, - tenant_id: ZTenantId, + tenant_id: TenantId, remote_index: &RemoteIndex, ) -> Arc { let tenant = Arc::new(Tenant::new( @@ -225,7 +225,7 @@ pub async fn shutdown_all_tenants() { fn create_tenant_files( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, - tenant_id: ZTenantId, + tenant_id: TenantId, ) -> anyhow::Result<()> { let target_tenant_directory = conf.tenant_path(&tenant_id); anyhow::ensure!( @@ -310,9 +310,9 @@ fn rebase_directory(original_path: &Path, base: &Path, new_base: &Path) -> anyho pub fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, - tenant_id: ZTenantId, + tenant_id: TenantId, remote_index: RemoteIndex, -) -> anyhow::Result> { +) -> anyhow::Result> { match tenants_state::write_tenants().entry(tenant_id) { hash_map::Entry::Occupied(_) => { debug!("tenant {tenant_id} already exists"); @@ -339,7 +339,7 @@ pub fn create_tenant( pub fn update_tenant_config( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, - tenant_id: ZTenantId, + tenant_id: TenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf); @@ -349,7 +349,7 @@ pub fn update_tenant_config( /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. 
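[Editor's sketch] The `tenants_state` module above keeps a process-wide registry keyed by the renamed `TenantId`; in this rendering the nested generics collapsed to `Lazy>>>`, which in the original reads roughly `Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>>`. A minimal sketch of that pattern with stand-in types (the `expect` messages mirror the patch):

    // Sketch of the global tenant registry behind a poisoning-aware RwLock.
    use once_cell::sync::Lazy;
    use std::collections::HashMap;
    use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};

    type TenantId = u128; // stand-in for utils::id::TenantId
    struct Tenant;        // stand-in for pageserver::tenant::Tenant

    static TENANTS: Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>> =
        Lazy::new(|| RwLock::new(HashMap::new()));

    fn read_tenants() -> RwLockReadGuard<'static, HashMap<TenantId, Arc<Tenant>>> {
        TENANTS.read().expect("Failed to read() tenants lock, it got poisoned")
    }

    fn write_tenants() -> RwLockWriteGuard<'static, HashMap<TenantId, Arc<Tenant>>> {
        TENANTS.write().expect("Failed to write() tenants lock, it got poisoned")
    }

    fn main() {
        write_tenants().insert(0x11, Arc::new(Tenant));
        assert_eq!(read_tenants().len(), 1);
    }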
-pub fn get_tenant(tenant_id: ZTenantId, active_only: bool) -> anyhow::Result> { +pub fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result> { let m = tenants_state::read_tenants(); let tenant = m .get(&tenant_id) @@ -361,7 +361,7 @@ pub fn get_tenant(tenant_id: ZTenantId, active_only: bool) -> anyhow::Result anyhow::Result<()> { +pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> { // Start with the shutdown of timeline tasks (this shuts down the walreceiver) // It is important that we do not take locks here, and do not check whether the timeline exists // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join @@ -398,7 +398,7 @@ pub async fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> pub async fn detach_tenant( conf: &'static PageServerConf, - tenant_id: ZTenantId, + tenant_id: TenantId, ) -> anyhow::Result<()> { let tenant = match { let mut tenants_accessor = tenants_state::write_tenants(); @@ -565,14 +565,14 @@ fn collect_timelines_for_tenant( config: &'static PageServerConf, tenant_path: &Path, ) -> anyhow::Result<( - ZTenantId, - HashMap)>, + TenantId, + HashMap)>, )> { let tenant_id = tenant_path .file_name() .and_then(OsStr::to_str) .unwrap_or_default() - .parse::() + .parse::() .context("Could not parse tenant id out of the tenant dir name")?; let timelines_dir = config.timelines_path(&tenant_id); @@ -644,7 +644,7 @@ fn collect_timelines_for_tenant( // NOTE: ephemeral files are excluded from the list fn collect_timeline_files( timeline_dir: &Path, -) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet)> { +) -> anyhow::Result<(TimelineId, TimelineMetadata, HashSet)> { let mut timeline_files = HashSet::new(); let mut timeline_metadata_path = None; @@ -652,7 +652,7 @@ fn collect_timeline_files( .file_name() .and_then(OsStr::to_str) .unwrap_or_default() - .parse::() + .parse::() .context("Could not parse timeline id out of the timeline dir name")?; let timeline_dir_entries = fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 3ef54838af..c543a0ecb1 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -10,9 +10,9 @@ use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::{Tenant, TenantState}; use crate::tenant_mgr; use tracing::*; -use utils::zid::ZTenantId; +use utils::id::TenantId; -pub fn start_background_loops(tenant_id: ZTenantId) { +pub fn start_background_loops(tenant_id: TenantId) { task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, @@ -42,9 +42,8 @@ pub fn start_background_loops(tenant_id: ZTenantId) { /// /// Compaction task's main loop /// -async fn compaction_loop(tenant_id: ZTenantId) { +async fn compaction_loop(tenant_id: TenantId) { let wait_duration = Duration::from_secs(2); - info!("starting compaction loop for {tenant_id}"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { @@ -90,9 +89,8 @@ async fn compaction_loop(tenant_id: ZTenantId) { /// /// GC task's main loop /// -async fn gc_loop(tenant_id: ZTenantId) { +async fn gc_loop(tenant_id: TenantId) { let wait_duration = Duration::from_secs(2); - info!("starting gc loop for {tenant_id}"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { @@ -138,7 +136,7 @@ async fn gc_loop(tenant_id: ZTenantId) { } async fn wait_for_active_tenant( - tenant_id: ZTenantId, + tenant_id: TenantId, wait: 
Duration, ) -> ControlFlow<(), Arc> { let tenant = loop { diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 69d14babf0..88b26e18f4 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -14,8 +14,8 @@ use tracing::*; use remote_storage::path_with_suffix_extension; use utils::{ + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; use crate::config::PageServerConf; @@ -61,8 +61,8 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { // fn bootstrap_timeline( conf: &'static PageServerConf, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, tenant: &Tenant, ) -> Result> { // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` @@ -115,12 +115,12 @@ fn bootstrap_timeline( /// pub(crate) async fn create_timeline( conf: &'static PageServerConf, - tenant_id: ZTenantId, - new_timeline_id: Option, - ancestor_timeline_id: Option, + tenant_id: TenantId, + new_timeline_id: Option, + ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, ) -> Result>> { - let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); + let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); let tenant = tenant_mgr::get_tenant(tenant_id, true)?; if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 7a2c699b44..896c2603a2 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -53,8 +53,8 @@ pub struct VirtualFile { pub path: PathBuf, open_options: OpenOptions, - tenantid: String, - timelineid: String, + tenant_id: String, + timeline_id: String, } #[derive(Debug, PartialEq, Clone, Copy)] @@ -149,7 +149,7 @@ impl OpenFiles { // old file. // if let Some(old_file) = slot_guard.file.take() { - // We do not have information about tenantid/timelineid of evicted file. + // We do not have information about tenant_id/timeline_id of evicted file. // It is possible to store path together with file or use filepath crate, // but as far as close() is not expected to be fast, it is not so critical to gather // precise per-tenant statistic here. @@ -197,18 +197,18 @@ impl VirtualFile { ) -> Result { let path_str = path.to_string_lossy(); let parts = path_str.split('/').collect::>(); - let tenantid; - let timelineid; + let tenant_id; + let timeline_id; if parts.len() > 5 && parts[parts.len() - 5] == "tenants" { - tenantid = parts[parts.len() - 4].to_string(); - timelineid = parts[parts.len() - 2].to_string(); + tenant_id = parts[parts.len() - 4].to_string(); + timeline_id = parts[parts.len() - 2].to_string(); } else { - tenantid = "*".to_string(); - timelineid = "*".to_string(); + tenant_id = "*".to_string(); + timeline_id = "*".to_string(); } let (handle, mut slot_guard) = get_open_files().find_victim_slot(); let file = STORAGE_IO_TIME - .with_label_values(&["open", &tenantid, &timelineid]) + .with_label_values(&["open", &tenant_id, &timeline_id]) .observe_closure_duration(|| open_options.open(path))?; // Strip all options other than read and write. @@ -226,8 +226,8 @@ impl VirtualFile { pos: 0, path: path.to_path_buf(), open_options: reopen_options, - tenantid, - timelineid, + tenant_id, + timeline_id, }; slot_guard.file.replace(file); @@ -267,7 +267,7 @@ impl VirtualFile { // Found a cached file descriptor. 
slot.recently_used.store(true, Ordering::Relaxed); return Ok(STORAGE_IO_TIME - .with_label_values(&[op, &self.tenantid, &self.timelineid]) + .with_label_values(&[op, &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| func(file))); } } @@ -294,7 +294,7 @@ impl VirtualFile { // Open the physical file let file = STORAGE_IO_TIME - .with_label_values(&["open", &self.tenantid, &self.timelineid]) + .with_label_values(&["open", &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| self.open_options.open(&self.path))?; // Perform the requested operation on it @@ -308,7 +308,7 @@ impl VirtualFile { // may deadlock on subsequent read calls. // Simply replacing all `RwLock` in project causes deadlocks, so use it sparingly. let result = STORAGE_IO_TIME - .with_label_values(&[op, &self.tenantid, &self.timelineid]) + .with_label_values(&[op, &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| func(&file)); // Store the File in the slot and update the handle in the VirtualFile @@ -333,11 +333,11 @@ impl Drop for VirtualFile { if slot_guard.tag == handle.tag { slot.recently_used.store(false, Ordering::Relaxed); // Unlike files evicted by replacement algorithm, here - // we group close time by tenantid/timelineid. + // we group close time by tenant_id/timeline_id. // At allows to compare number/time of "normal" file closes // with file eviction. STORAGE_IO_TIME - .with_label_values(&["close", &self.tenantid, &self.timelineid]) + .with_label_values(&["close", &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| slot_guard.file.take()); } } @@ -399,7 +399,7 @@ impl FileExt for VirtualFile { let result = self.with_file("read", |file| file.read_at(buf, offset))?; if let Ok(size) = result { STORAGE_IO_SIZE - .with_label_values(&["read", &self.tenantid, &self.timelineid]) + .with_label_values(&["read", &self.tenant_id, &self.timeline_id]) .add(size as i64); } result @@ -409,7 +409,7 @@ impl FileExt for VirtualFile { let result = self.with_file("write", |file| file.write_at(buf, offset))?; if let Ok(size) = result { STORAGE_IO_SIZE - .with_label_values(&["write", &self.tenantid, &self.timelineid]) + .with_label_values(&["write", &self.tenant_id, &self.timeline_id]) .add(size as i64); } result diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 45d0916dec..bede4ac13e 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1,5 +1,5 @@ //! -//! Parse PostgreSQL WAL records and store them in a zenith Timeline. +//! Parse PostgreSQL WAL records and store them in a neon Timeline. //! //! The pipeline for ingesting WAL looks like this: //! @@ -9,7 +9,7 @@ //! and decodes it to individual WAL records. It feeds the WAL records //! to WalIngest, which parses them and stores them in the Repository. //! -//! The zenith Repository can store page versions in two formats: as +//! The neon Repository can store page versions in two formats: as //! page images, or a WAL records. WalIngest::ingest_record() extracts //! page images out of some WAL records, but most it stores as WAL //! records. 
If a WAL record modifies multiple pages, WalIngest @@ -315,7 +315,7 @@ impl<'a> WalIngest<'a> { assert_eq!(image.len(), BLCKSZ as usize); self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; } else { - let rec = ZenithWalRecord::Postgres { + let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; @@ -428,7 +428,7 @@ impl<'a> WalIngest<'a> { modification, vm_rel, new_vm_blk.unwrap(), - ZenithWalRecord::ClearVisibilityMapFlags { + NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, @@ -442,7 +442,7 @@ impl<'a> WalIngest<'a> { modification, vm_rel, new_vm_blk, - ZenithWalRecord::ClearVisibilityMapFlags { + NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno: None, flags: pg_constants::VISIBILITYMAP_VALID_BITS, @@ -454,7 +454,7 @@ impl<'a> WalIngest<'a> { modification, vm_rel, old_vm_blk, - ZenithWalRecord::ClearVisibilityMapFlags { + NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno: None, old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, @@ -642,12 +642,12 @@ impl<'a> WalIngest<'a> { segno, rpageno, if is_commit { - ZenithWalRecord::ClogSetCommitted { + NeonWalRecord::ClogSetCommitted { xids: page_xids, timestamp: parsed.xact_time, } } else { - ZenithWalRecord::ClogSetAborted { xids: page_xids } + NeonWalRecord::ClogSetAborted { xids: page_xids } }, )?; page_xids = Vec::new(); @@ -662,12 +662,12 @@ impl<'a> WalIngest<'a> { segno, rpageno, if is_commit { - ZenithWalRecord::ClogSetCommitted { + NeonWalRecord::ClogSetCommitted { xids: page_xids, timestamp: parsed.xact_time, } } else { - ZenithWalRecord::ClogSetAborted { xids: page_xids } + NeonWalRecord::ClogSetAborted { xids: page_xids } }, )?; @@ -760,7 +760,7 @@ impl<'a> WalIngest<'a> { SlruKind::MultiXactOffsets, segno, rpageno, - ZenithWalRecord::MultixactOffsetCreate { + NeonWalRecord::MultixactOffsetCreate { mid: xlrec.mid, moff: xlrec.moff, }, @@ -794,7 +794,7 @@ impl<'a> WalIngest<'a> { SlruKind::MultiXactMembers, pageno / pg_constants::SLRU_PAGES_PER_SEGMENT, pageno % pg_constants::SLRU_PAGES_PER_SEGMENT, - ZenithWalRecord::MultixactMembersCreate { + NeonWalRecord::MultixactMembersCreate { moff: offset, members: this_page_members, }, @@ -901,7 +901,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, - rec: ZenithWalRecord, + rec: NeonWalRecord, ) -> Result<()> { self.handle_rel_extend(modification, rel, blknum)?; modification.put_rel_wal_record(rel, blknum, rec)?; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 69e400f291..1e4b4e7d52 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -34,8 +34,8 @@ use crate::{ DEFAULT_MAX_BACKOFF_SECONDS, }; use utils::{ + id::{NodeId, TenantTimelineId}, lsn::Lsn, - zid::{NodeId, ZTenantTimelineId}, }; use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle}; @@ -101,7 +101,7 @@ async fn connection_manager_loop_step( etcd_client: &mut Client, walreceiver_state: &mut WalreceiverState, ) { - let id = ZTenantTimelineId { + let id = TenantTimelineId { tenant_id: walreceiver_state.timeline.tenant_id, timeline_id: walreceiver_state.timeline.timeline_id, }; @@ -230,7 +230,7 @@ fn cleanup_broker_connection( async fn subscribe_for_timeline_updates( etcd_client: &mut Client, broker_prefix: &str, - id: ZTenantTimelineId, 
+ id: TenantTimelineId, ) -> BrokerSubscription { let mut attempt = 0; loop { @@ -266,7 +266,7 @@ const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. struct WalreceiverState { - id: ZTenantTimelineId, + id: TenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. timeline: Arc, @@ -331,7 +331,7 @@ impl WalreceiverState { lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, ) -> Self { - let id = ZTenantTimelineId { + let id = TenantTimelineId { tenant_id: timeline.tenant_id, timeline_id: timeline.timeline_id, }; @@ -746,10 +746,10 @@ enum ReconnectReason { } fn wal_stream_connection_string( - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: ZTenantTimelineId, + }: TenantTimelineId, listen_pg_addr_str: &str, ) -> anyhow::Result { let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db"); @@ -760,7 +760,7 @@ fn wal_stream_connection_string( })?; let (host, port) = utils::connstring::connection_host_port(&me_conf); Ok(format!( - "host={host} port={port} options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'" + "host={host} port={port} options='-c timeline_id={timeline_id} tenant_id={tenant_id}'" )) } @@ -1355,7 +1355,7 @@ mod tests { fn dummy_state(harness: &TenantHarness) -> WalreceiverState { WalreceiverState { - id: ZTenantTimelineId { + id: TenantTimelineId { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 6f1fbc2c9d..29c4cea882 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -30,7 +30,7 @@ use crate::{ walrecord::DecodedWALRecord, }; use postgres_ffi::v14::waldecoder::WalStreamDecoder; -use utils::zid::ZTenantTimelineId; +use utils::id::TenantTimelineId; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback}; /// Status of the connection. @@ -288,7 +288,7 @@ pub async fn handle_walreceiver_connection( .await // here we either do not have this timeline in remote index // or there were no checkpoints for it yet - .timeline_entry(&ZTenantTimelineId { + .timeline_entry(&TenantTimelineId { tenant_id, timeline_id, }) @@ -316,7 +316,7 @@ pub async fn handle_walreceiver_connection( }; *timeline.last_received_wal.lock().unwrap() = Some(last_received_wal); - // Send zenith feedback message. + // Send the replication feedback message. // Regular standby_status_update fields are put into this message. let status_update = ReplicationFeedback { current_timeline_size: timeline @@ -328,7 +328,7 @@ pub async fn handle_walreceiver_connection( ps_replytime: ts, }; - debug!("zenith_status_update {status_update:?}"); + debug!("neon_status_update {status_update:?}"); let mut data = BytesMut::new(); status_update.serialize(&mut data)?; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index c718a4c30c..dbf9bf9d33 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -13,10 +13,10 @@ use serde::{Deserialize, Serialize}; use tracing::*; use utils::bin_ser::DeserializeError; -/// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper -/// around a PostgreSQL WAL record, or a custom zenith-specific "record". +/// Each update to a page is represented by a NeonWalRecord. 
It can be a wrapper +/// around a PostgreSQL WAL record, or a custom neon-specific "record". #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub enum ZenithWalRecord { +pub enum NeonWalRecord { /// Native PostgreSQL WAL record Postgres { will_init: bool, rec: Bytes }, @@ -45,14 +45,14 @@ pub enum ZenithWalRecord { }, } -impl ZenithWalRecord { +impl NeonWalRecord { /// Does replaying this WAL record initialize the page from scratch, or does /// it need to be applied over the previous image of the page? pub fn will_init(&self) -> bool { match self { - ZenithWalRecord::Postgres { will_init, rec: _ } => *will_init, + NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, - // None of the special zenith record types currently initialize the page + // None of the special neon record types currently initialize the page _ => false, } } @@ -767,9 +767,9 @@ pub fn decode_wal_record( /// Build a human-readable string to describe a WAL record /// /// For debugging purposes -pub fn describe_wal_record(rec: &ZenithWalRecord) -> Result { +pub fn describe_wal_record(rec: &NeonWalRecord) -> Result { match rec { - ZenithWalRecord::Postgres { will_init, rec } => Ok(format!( + NeonWalRecord::Postgres { will_init, rec } => Ok(format!( "will_init: {}, {}", will_init, describe_postgres_wal_record(rec)? diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index dd946659bb..9faabfebda 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -36,7 +36,7 @@ use std::sync::Mutex; use std::time::Duration; use std::time::Instant; use tracing::*; -use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock, zid::ZTenantId}; +use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; use crate::metrics::{ WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, WAL_REDO_WAIT_TIME, @@ -44,7 +44,7 @@ use crate::metrics::{ use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; -use crate::walrecord::ZenithWalRecord; +use crate::walrecord::NeonWalRecord; use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, @@ -81,7 +81,7 @@ pub trait WalRedoManager: Send + Sync { key: Key, lsn: Lsn, base_img: Option, - records: Vec<(Lsn, ZenithWalRecord)>, + records: Vec<(Lsn, NeonWalRecord)>, ) -> Result; } @@ -93,20 +93,20 @@ pub trait WalRedoManager: Send + Sync { /// records. /// pub struct PostgresRedoManager { - tenantid: ZTenantId, + tenant_id: TenantId, conf: &'static PageServerConf, process: Mutex>, } -/// Can this request be served by zenith redo functions +/// Can this request be served by neon redo functions /// or we need to pass it to wal-redo postgres process? -fn can_apply_in_zenith(rec: &ZenithWalRecord) -> bool { +fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { // Currently, we don't have bespoken Rust code to replay any - // Postgres WAL records. But everything else is handled in zenith. + // Postgres WAL records. But everything else is handled in neon. 
#[allow(clippy::match_like_matches_macro)] match rec { - ZenithWalRecord::Postgres { + NeonWalRecord::Postgres { will_init: _, rec: _, } => false, @@ -143,7 +143,7 @@ impl WalRedoManager for PostgresRedoManager { key: Key, lsn: Lsn, base_img: Option, - records: Vec<(Lsn, ZenithWalRecord)>, + records: Vec<(Lsn, NeonWalRecord)>, ) -> Result { if records.is_empty() { error!("invalid WAL redo request with no records"); @@ -151,14 +151,14 @@ impl WalRedoManager for PostgresRedoManager { } let mut img: Option = base_img; - let mut batch_zenith = can_apply_in_zenith(&records[0].1); + let mut batch_neon = can_apply_in_neon(&records[0].1); let mut batch_start = 0; for i in 1..records.len() { - let rec_zenith = can_apply_in_zenith(&records[i].1); + let rec_neon = can_apply_in_neon(&records[i].1); - if rec_zenith != batch_zenith { - let result = if batch_zenith { - self.apply_batch_zenith(key, lsn, img, &records[batch_start..i]) + if rec_neon != batch_neon { + let result = if batch_neon { + self.apply_batch_neon(key, lsn, img, &records[batch_start..i]) } else { self.apply_batch_postgres( key, @@ -170,13 +170,13 @@ impl WalRedoManager for PostgresRedoManager { }; img = Some(result?); - batch_zenith = rec_zenith; + batch_neon = rec_neon; batch_start = i; } } // last batch - if batch_zenith { - self.apply_batch_zenith(key, lsn, img, &records[batch_start..]) + if batch_neon { + self.apply_batch_neon(key, lsn, img, &records[batch_start..]) } else { self.apply_batch_postgres( key, @@ -193,10 +193,10 @@ impl PostgresRedoManager { /// /// Create a new PostgresRedoManager. /// - pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager { + pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager { // The actual process is launched lazily, on first request. PostgresRedoManager { - tenantid, + tenant_id, conf, process: Mutex::new(None), } @@ -210,7 +210,7 @@ impl PostgresRedoManager { key: Key, lsn: Lsn, base_img: Option, - records: &[(Lsn, ZenithWalRecord)], + records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> Result { let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; @@ -222,7 +222,7 @@ impl PostgresRedoManager { // launch the WAL redo process on first use if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, &self.tenantid)?; + let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id)?; *process_guard = Some(p); } let process = process_guard.as_mut().unwrap(); @@ -263,14 +263,14 @@ impl PostgresRedoManager { } /// - /// Process a batch of WAL records using bespoken Zenith code. + /// Process a batch of WAL records using bespoken Neon code. /// - fn apply_batch_zenith( + fn apply_batch_neon( &self, key: Key, lsn: Lsn, base_img: Option, - records: &[(Lsn, ZenithWalRecord)], + records: &[(Lsn, NeonWalRecord)], ) -> Result { let start_time = Instant::now(); @@ -280,13 +280,13 @@ impl PostgresRedoManager { page.extend_from_slice(&fpi[..]); } else { // All the current WAL record types that we can handle require a base image. - error!("invalid zenith WAL redo request with no base image"); + error!("invalid neon WAL redo request with no base image"); return Err(WalRedoError::InvalidRequest); } // Apply all the WAL records in the batch for (record_lsn, record) in records.iter() { - self.apply_record_zenith(key, &mut page, *record_lsn, record)?; + self.apply_record_neon(key, &mut page, *record_lsn, record)?; } // Success! 
let end_time = Instant::now(); @@ -294,7 +294,7 @@ impl PostgresRedoManager { WAL_REDO_TIME.observe(duration.as_secs_f64()); debug!( - "zenith applied {} WAL records in {} ms to reconstruct page image at LSN {}", + "neon applied {} WAL records in {} ms to reconstruct page image at LSN {}", records.len(), duration.as_micros(), lsn @@ -303,22 +303,22 @@ impl PostgresRedoManager { Ok(page.freeze()) } - fn apply_record_zenith( + fn apply_record_neon( &self, key: Key, page: &mut BytesMut, _record_lsn: Lsn, - record: &ZenithWalRecord, + record: &NeonWalRecord, ) -> Result<(), WalRedoError> { match record { - ZenithWalRecord::Postgres { + NeonWalRecord::Postgres { will_init: _, rec: _, } => { - error!("tried to pass postgres wal record to zenith WAL redo"); + error!("tried to pass postgres wal record to neon WAL redo"); return Err(WalRedoError::InvalidRequest); } - ZenithWalRecord::ClearVisibilityMapFlags { + NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno, flags, @@ -360,7 +360,7 @@ impl PostgresRedoManager { } // Non-relational WAL records are handled here, with custom code that has the // same effects as the corresponding Postgres WAL redo function. - ZenithWalRecord::ClogSetCommitted { xids, timestamp } => { + NeonWalRecord::ClogSetCommitted { xids, timestamp } => { let (slru_kind, segno, blknum) = key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; assert_eq!( @@ -410,7 +410,7 @@ impl PostgresRedoManager { ); } } - ZenithWalRecord::ClogSetAborted { xids } => { + NeonWalRecord::ClogSetAborted { xids } => { let (slru_kind, segno, blknum) = key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; assert_eq!( @@ -441,7 +441,7 @@ impl PostgresRedoManager { transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); } } - ZenithWalRecord::MultixactOffsetCreate { mid, moff } => { + NeonWalRecord::MultixactOffsetCreate { mid, moff } => { let (slru_kind, segno, blknum) = key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; assert_eq!( @@ -474,7 +474,7 @@ impl PostgresRedoManager { LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); } - ZenithWalRecord::MultixactMembersCreate { moff, members } => { + NeonWalRecord::MultixactMembersCreate { moff, members } => { let (slru_kind, segno, blknum) = key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; assert_eq!( @@ -570,7 +570,7 @@ impl PostgresRedoProcess { // // Start postgres binary in special WAL redo mode. // - fn launch(conf: &PageServerConf, tenant_id: &ZTenantId) -> Result { + fn launch(conf: &PageServerConf, tenant_id: &TenantId) -> Result { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. @@ -686,7 +686,7 @@ impl PostgresRedoProcess { &mut self, tag: BufferTag, base_img: Option, - records: &[(Lsn, ZenithWalRecord)], + records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> Result { // Serialize all the messages to send the WAL redo process first. 
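
The apply_record_neon() hunks above replay Neon-specific records such as ClogSetCommitted and ClogSetAborted by updating transaction status bits in SLRU (clog) pages, e.g. via transaction_id_set_status. As a reference for reviewers, a minimal illustration of that kind of two-bits-per-transaction update is sketched below; the constants follow the standard PostgreSQL CLOG layout and are assumptions here, not code from this patch:

    // Each transaction gets two status bits, so one byte holds four transactions.
    const CLOG_BITS_PER_XACT: u32 = 2;
    const CLOG_XACTS_PER_BYTE: u32 = 4;
    const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;

    fn set_xact_status(page: &mut [u8], xid_in_page: u32, status: u8) {
        let byte_no = (xid_in_page / CLOG_XACTS_PER_BYTE) as usize;
        let bit_shift = (xid_in_page % CLOG_XACTS_PER_BYTE) * CLOG_BITS_PER_XACT;
        // clear the old two-bit status, then set the new one
        page[byte_no] &= !(0x03 << bit_shift);
        page[byte_no] |= status << bit_shift;
    }

    fn main() {
        let mut page = vec![0u8; 8192];
        set_xact_status(&mut page, 5, TRANSACTION_STATUS_COMMITTED);
        // xact 5 lives in byte 1, bits 2..3
        assert_eq!(page[1], 0x04);
    }
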
@@ -700,7 +700,7 @@ impl PostgresRedoProcess { build_push_page_msg(tag, &img, &mut writebuf); } for (lsn, rec) in records.iter() { - if let ZenithWalRecord::Postgres { + if let NeonWalRecord::Postgres { will_init: _, rec: postgres_rec, } = rec @@ -709,7 +709,7 @@ impl PostgresRedoProcess { } else { return Err(Error::new( ErrorKind::Other, - "tried to pass zenith wal record to postgres WAL redo", + "tried to pass neon wal record to postgres WAL redo", )); } } diff --git a/pgxn/neon/inmem_smgr.c b/pgxn/neon/inmem_smgr.c index 4926d759e8..bc0ee352b8 100644 --- a/pgxn/neon/inmem_smgr.c +++ b/pgxn/neon/inmem_smgr.c @@ -86,7 +86,7 @@ inmem_exists(SMgrRelation reln, ForkNumber forknum) } /* - * inmem_create() -- Create a new relation on zenithd storage + * inmem_create() -- Create a new relation on neon storage * * If isRedo is true, it's okay for the relation to exist already. */ diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 55285a6345..296865838d 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -30,13 +30,12 @@ #include "walproposer.h" #include "walproposer_utils.h" - #define PageStoreTrace DEBUG5 #define NEON_TAG "[NEON_SMGR] " -#define neon_log(tag, fmt, ...) ereport(tag, \ - (errmsg(NEON_TAG fmt, ## __VA_ARGS__), \ - errhidestmt(true), errhidecontext(true))) +#define neon_log(tag, fmt, ...) ereport(tag, \ + (errmsg(NEON_TAG fmt, ##__VA_ARGS__), \ + errhidestmt(true), errhidecontext(true))) bool connected = false; PGconn *pageserver_conn = NULL; @@ -65,7 +64,7 @@ pageserver_connect() errdetail_internal("%s", msg))); } - query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); + query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); ret = PQsendQuery(pageserver_conn, query); if (ret != 1) { @@ -169,7 +168,7 @@ pageserver_disconnect(void) } static void -pageserver_send(ZenithRequest *request) +pageserver_send(NeonRequest * request) { StringInfoData req_buff; @@ -205,18 +204,18 @@ pageserver_send(ZenithRequest *request) if (message_level_is_interesting(PageStoreTrace)) { - char *msg = zm_to_string((ZenithMessage *) request); + char *msg = zm_to_string((NeonMessage *) request); neon_log(PageStoreTrace, "sent request: %s", msg); pfree(msg); } } -static ZenithResponse * +static NeonResponse * pageserver_receive(void) { StringInfoData resp_buff; - ZenithResponse *resp; + NeonResponse *resp; PG_TRY(); { @@ -236,7 +235,7 @@ pageserver_receive(void) if (message_level_is_interesting(PageStoreTrace)) { - char *msg = zm_to_string((ZenithMessage *) resp); + char *msg = zm_to_string((NeonMessage *) resp); neon_log(PageStoreTrace, "got response: %s", msg); pfree(msg); @@ -249,7 +248,7 @@ pageserver_receive(void) } PG_END_TRY(); - return (ZenithResponse *) resp; + return (NeonResponse *) resp; } @@ -265,8 +264,8 @@ pageserver_flush(void) } } -static ZenithResponse * -pageserver_call(ZenithRequest *request) +static NeonResponse * +pageserver_call(NeonRequest * request) { pageserver_send(request); pageserver_flush(); @@ -281,7 +280,7 @@ page_server_api api = { }; static bool -check_zenith_id(char **newval, void **extra, GucSource source) +check_neon_id(char **newval, void **extra, GucSource source) { uint8 zid[16]; @@ -403,22 +402,22 @@ pg_init_libpagestore(void) NULL, NULL, NULL); DefineCustomStringVariable("neon.timeline_id", - "Zenith timelineid the server is running on", + "Neon timeline_id the server is running on", NULL, - &zenith_timeline, + &neon_timeline, "", PGC_POSTMASTER, 0, /* no flags required */ - check_zenith_id, NULL, 
NULL); + check_neon_id, NULL, NULL); DefineCustomStringVariable("neon.tenant_id", - "Neon tenantid the server is running on", + "Neon tenant_id the server is running on", NULL, - &zenith_tenant, + &neon_tenant, "", PGC_POSTMASTER, 0, /* no flags required */ - check_zenith_id, NULL, NULL); + check_neon_id, NULL, NULL); DefineCustomBoolVariable("neon.wal_redo", "start in wal-redo mode", @@ -450,8 +449,8 @@ pg_init_libpagestore(void) page_server_connstring = substitute_pageserver_password(page_server_connstring_raw); /* Is there more correct way to pass CustomGUC to postgres code? */ - zenith_timeline_walproposer = zenith_timeline; - zenith_tenant_walproposer = zenith_tenant; + neon_timeline_walproposer = neon_timeline; + neon_tenant_walproposer = neon_tenant; if (wal_redo) { @@ -462,8 +461,8 @@ pg_init_libpagestore(void) else if (page_server_connstring && page_server_connstring[0]) { neon_log(PageStoreTrace, "set neon_smgr hook"); - smgr_hook = smgr_zenith; - smgr_init_hook = smgr_init_zenith; - dbsize_hook = zenith_dbsize; + smgr_hook = smgr_neon; + smgr_init_hook = smgr_init_neon; + dbsize_hook = neon_dbsize; } } diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 5346680b0b..2a2a163ee8 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -28,7 +28,6 @@ PG_MODULE_MAGIC; void _PG_init(void); - void _PG_init(void) { @@ -56,7 +55,6 @@ pg_cluster_size(PG_FUNCTION_ARGS) PG_RETURN_INT64(size); } - Datum backpressure_lsns(PG_FUNCTION_ARGS) { diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 7dc38c13fb..633c7b465c 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -28,31 +28,29 @@ typedef enum { /* pagestore_client -> pagestore */ - T_ZenithExistsRequest = 0, - T_ZenithNblocksRequest, - T_ZenithGetPageRequest, - T_ZenithDbSizeRequest, + T_NeonExistsRequest = 0, + T_NeonNblocksRequest, + T_NeonGetPageRequest, + T_NeonDbSizeRequest, /* pagestore -> pagestore_client */ - T_ZenithExistsResponse = 100, - T_ZenithNblocksResponse, - T_ZenithGetPageResponse, - T_ZenithErrorResponse, - T_ZenithDbSizeResponse, -} ZenithMessageTag; - - + T_NeonExistsResponse = 100, + T_NeonNblocksResponse, + T_NeonGetPageResponse, + T_NeonErrorResponse, + T_NeonDbSizeResponse, +} NeonMessageTag; /* base struct for c-style inheritance */ typedef struct { - ZenithMessageTag tag; -} ZenithMessage; + NeonMessageTag tag; +} NeonMessage; -#define messageTag(m) (((const ZenithMessage *)(m))->tag) +#define messageTag(m) (((const NeonMessage *)(m))->tag) /* - * supertype of all the Zenith*Request structs below + * supertype of all the Neon*Request structs below * * If 'latest' is true, we are requesting the latest page version, and 'lsn' * is just a hint to the server that we know there are no versions of the page @@ -60,81 +58,79 @@ typedef struct */ typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; bool latest; /* if true, request latest page version */ XLogRecPtr lsn; /* request page version @ this LSN */ -} ZenithRequest; +} NeonRequest; typedef struct { - ZenithRequest req; + NeonRequest req; RelFileNode rnode; ForkNumber forknum; -} ZenithExistsRequest; +} NeonExistsRequest; typedef struct { - ZenithRequest req; + NeonRequest req; RelFileNode rnode; ForkNumber forknum; -} ZenithNblocksRequest; - +} NeonNblocksRequest; typedef struct { - ZenithRequest req; + NeonRequest req; Oid dbNode; -} ZenithDbSizeRequest; - +} NeonDbSizeRequest; typedef struct { - ZenithRequest req; + NeonRequest req; RelFileNode rnode; ForkNumber forknum; BlockNumber blkno; -} 
ZenithGetPageRequest; +} NeonGetPageRequest; -/* supertype of all the Zenith*Response structs below */ +/* supertype of all the Neon*Response structs below */ typedef struct { - ZenithMessageTag tag; -} ZenithResponse; + NeonMessageTag tag; +} NeonResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; bool exists; -} ZenithExistsResponse; +} NeonExistsResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; uint32 n_blocks; -} ZenithNblocksResponse; +} NeonNblocksResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; char page[FLEXIBLE_ARRAY_MEMBER]; -} ZenithGetPageResponse; +} NeonGetPageResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; int64 db_size; -} ZenithDbSizeResponse; +} NeonDbSizeResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error * message */ -} ZenithErrorResponse; +} NeonErrorResponse; -extern StringInfoData zm_pack_request(ZenithRequest *msg); -extern ZenithResponse *zm_unpack_response(StringInfo s); -extern char *zm_to_string(ZenithMessage *msg); +extern StringInfoData zm_pack_request(NeonRequest * msg); +extern NeonResponse * zm_unpack_response(StringInfo s); +extern char *zm_to_string(NeonMessage * msg); /* * API @@ -142,57 +138,57 @@ extern char *zm_to_string(ZenithMessage *msg); typedef struct { - ZenithResponse *(*request) (ZenithRequest *request); - void (*send) (ZenithRequest *request); - ZenithResponse *(*receive) (void); + NeonResponse *(*request) (NeonRequest * request); + void (*send) (NeonRequest * request); + NeonResponse *(*receive) (void); void (*flush) (void); } page_server_api; extern page_server_api * page_server; extern char *page_server_connstring; -extern char *zenith_timeline; -extern char *zenith_tenant; +extern char *neon_timeline; +extern char *neon_tenant; extern bool wal_redo; extern int32 max_cluster_size; -extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); -extern void smgr_init_zenith(void); +extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode); +extern void smgr_init_neon(void); extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); extern void smgr_init_inmem(void); extern void smgr_shutdown_inmem(void); -/* zenith storage manager functionality */ +/* Neon storage manager functionality */ -extern void zenith_init(void); -extern void zenith_open(SMgrRelation reln); -extern void zenith_close(SMgrRelation reln, ForkNumber forknum); -extern void zenith_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); -extern bool zenith_exists(SMgrRelation reln, ForkNumber forknum); -extern void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); -extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); -extern void zenith_reset_prefetch(SMgrRelation reln); -extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); +extern void neon_init(void); +extern void neon_open(SMgrRelation reln); +extern void neon_close(SMgrRelation reln, ForkNumber forknum); +extern void neon_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool neon_exists(SMgrRelation reln, ForkNumber forknum); +extern void neon_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void neon_extend(SMgrRelation reln, ForkNumber 
forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void neon_reset_prefetch(SMgrRelation reln); +extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); -extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); +extern void neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); -extern void zenith_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); -extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); -extern int64 zenith_dbsize(Oid dbNode); -extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); -extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); +extern void neon_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void neon_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); +extern int64 neon_dbsize(Oid dbNode); +extern void neon_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum); -/* zenith wal-redo storage manager functionality */ +/* neon wal-redo storage manager functionality */ extern void inmem_init(void); extern void inmem_open(SMgrRelation reln); @@ -215,8 +211,7 @@ extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); - -/* utils for zenith relsize cache */ +/* utils for neon relsize cache */ extern void relsize_hash_init(void); extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size); extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 504ae60d4a..24adee019f 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -96,9 +96,9 @@ page_server_api *page_server; /* GUCs */ char *page_server_connstring; -//with substituted password -char *zenith_timeline; -char *zenith_tenant; +/*with substituted password*/ +char *neon_timeline; +char *neon_tenant; bool wal_redo = false; int32 max_cluster_size; @@ -143,7 +143,7 @@ consume_prefetch_responses(void) { for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++) { - ZenithResponse *resp = page_server->receive(); + NeonResponse *resp = page_server->receive(); pfree(resp); } @@ -151,16 +151,16 @@ consume_prefetch_responses(void) n_prefetch_responses = 0; } -static ZenithResponse * +static NeonResponse * page_server_request(void const *req) { consume_prefetch_responses(); - return page_server->request((ZenithRequest *) req); + return page_server->request((NeonRequest *) req); } StringInfoData -zm_pack_request(ZenithRequest *msg) +zm_pack_request(NeonRequest * msg) { StringInfoData s; @@ -170,9 +170,9 @@ zm_pack_request(ZenithRequest *msg) switch (messageTag(msg)) { /* pagestore_client -> pagestore */ - case T_ZenithExistsRequest: + case T_NeonExistsRequest: { 
- ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); @@ -183,9 +183,9 @@ zm_pack_request(ZenithRequest *msg) break; } - case T_ZenithNblocksRequest: + case T_NeonNblocksRequest: { - ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); @@ -196,9 +196,9 @@ zm_pack_request(ZenithRequest *msg) break; } - case T_ZenithDbSizeRequest: + case T_NeonDbSizeRequest: { - ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); @@ -206,9 +206,9 @@ zm_pack_request(ZenithRequest *msg) break; } - case T_ZenithGetPageRequest: + case T_NeonGetPageRequest: { - ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); @@ -222,91 +222,91 @@ zm_pack_request(ZenithRequest *msg) } /* pagestore -> pagestore_client. We never need to create these. */ - case T_ZenithExistsResponse: - case T_ZenithNblocksResponse: - case T_ZenithGetPageResponse: - case T_ZenithErrorResponse: - case T_ZenithDbSizeResponse: + case T_NeonExistsResponse: + case T_NeonNblocksResponse: + case T_NeonGetPageResponse: + case T_NeonErrorResponse: + case T_NeonDbSizeResponse: default: - elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); + elog(ERROR, "unexpected neon message tag 0x%02x", msg->tag); break; } return s; } -ZenithResponse * +NeonResponse * zm_unpack_response(StringInfo s) { - ZenithMessageTag tag = pq_getmsgbyte(s); - ZenithResponse *resp = NULL; + NeonMessageTag tag = pq_getmsgbyte(s); + NeonResponse *resp = NULL; switch (tag) { /* pagestore -> pagestore_client */ - case T_ZenithExistsResponse: + case T_NeonExistsResponse: { - ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); + NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse)); msg_resp->tag = tag; msg_resp->exists = pq_getmsgbyte(s); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) msg_resp; break; } - case T_ZenithNblocksResponse: + case T_NeonNblocksResponse: { - ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); + NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse)); msg_resp->tag = tag; msg_resp->n_blocks = pq_getmsgint(s, 4); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) msg_resp; break; } - case T_ZenithGetPageResponse: + case T_NeonGetPageResponse: { - ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); + NeonGetPageResponse *msg_resp = palloc0(offsetof(NeonGetPageResponse, page) + BLCKSZ); msg_resp->tag = tag; /* XXX: should be varlena */ memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) msg_resp; break; } - case T_ZenithDbSizeResponse: + case T_NeonDbSizeResponse: { - ZenithDbSizeResponse *msg_resp = palloc0(sizeof(ZenithDbSizeResponse)); + NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse)); msg_resp->tag = tag; msg_resp->db_size = pq_getmsgint64(s); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) 
msg_resp; break; } - case T_ZenithErrorResponse: + case T_NeonErrorResponse: { - ZenithErrorResponse *msg_resp; + NeonErrorResponse *msg_resp; size_t msglen; const char *msgtext; msgtext = pq_getmsgrawstring(s); msglen = strlen(msgtext); - msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); + msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1); msg_resp->tag = tag; memcpy(msg_resp->message, msgtext, msglen + 1); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) msg_resp; break; } @@ -315,12 +315,12 @@ zm_unpack_response(StringInfo s) * * We create these ourselves, and don't need to decode them. */ - case T_ZenithExistsRequest: - case T_ZenithNblocksRequest: - case T_ZenithGetPageRequest: - case T_ZenithDbSizeRequest: + case T_NeonExistsRequest: + case T_NeonNblocksRequest: + case T_NeonGetPageRequest: + case T_NeonDbSizeRequest: default: - elog(ERROR, "unexpected zenith message tag 0x%02x", tag); + elog(ERROR, "unexpected neon message tag 0x%02x", tag); break; } @@ -329,7 +329,7 @@ zm_unpack_response(StringInfo s) /* dump to json for debugging / error reporting purposes */ char * -zm_to_string(ZenithMessage *msg) +zm_to_string(NeonMessage * msg) { StringInfoData s; @@ -338,11 +338,11 @@ zm_to_string(ZenithMessage *msg) switch (messageTag(msg)) { /* pagestore_client -> pagestore */ - case T_ZenithExistsRequest: + case T_NeonExistsRequest: { - ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); + appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", msg_req->rnode.spcNode, msg_req->rnode.dbNode, @@ -354,11 +354,11 @@ zm_to_string(ZenithMessage *msg) break; } - case T_ZenithNblocksRequest: + case T_NeonNblocksRequest: { - ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); + appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", msg_req->rnode.spcNode, msg_req->rnode.dbNode, @@ -370,11 +370,11 @@ zm_to_string(ZenithMessage *msg) break; } - case T_ZenithGetPageRequest: + case T_NeonGetPageRequest: { - ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); + appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\""); appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", msg_req->rnode.spcNode, msg_req->rnode.dbNode, @@ -386,11 +386,11 @@ zm_to_string(ZenithMessage *msg) appendStringInfoChar(&s, '}'); break; } - case T_ZenithDbSizeRequest: + case T_NeonDbSizeRequest: { - ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeRequest\""); + appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); @@ -398,61 +398,57 @@ zm_to_string(ZenithMessage *msg) break; } - /* pagestore -> pagestore_client */ - case T_ZenithExistsResponse: + case T_NeonExistsResponse: { - ZenithExistsResponse *msg_resp = 
(ZenithExistsResponse *) msg; + NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\""); appendStringInfo(&s, ", \"exists\": %d}", - msg_resp->exists - ); + msg_resp->exists); appendStringInfoChar(&s, '}'); break; } - case T_ZenithNblocksResponse: + case T_NeonNblocksResponse: { - ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; + NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\""); appendStringInfo(&s, ", \"n_blocks\": %u}", - msg_resp->n_blocks - ); + msg_resp->n_blocks); appendStringInfoChar(&s, '}'); break; } - case T_ZenithGetPageResponse: + case T_NeonGetPageResponse: { #if 0 - ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; + NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg; #endif - appendStringInfoString(&s, "{\"type\": \"ZenithGetPageResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\""); appendStringInfo(&s, ", \"page\": \"XXX\"}"); appendStringInfoChar(&s, '}'); break; } - case T_ZenithErrorResponse: + case T_NeonErrorResponse: { - ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg; + NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg; /* FIXME: escape double-quotes in the message */ - appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\""); appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); appendStringInfoChar(&s, '}'); break; } - case T_ZenithDbSizeResponse: + case T_NeonDbSizeResponse: { - ZenithDbSizeResponse *msg_resp = (ZenithDbSizeResponse *) msg; + NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\""); appendStringInfo(&s, ", \"db_size\": %ld}", - msg_resp->db_size - ); + msg_resp->db_size); appendStringInfoChar(&s, '}'); break; @@ -494,7 +490,7 @@ PageIsEmptyHeapPage(char *buffer) } static void -zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) +neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) { XLogRecPtr lsn = PageGetLSN(buffer); @@ -551,8 +547,8 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { /* * When PostgreSQL extends a relation, it calls smgrextend() with an - * all-zeros pages, and we can just ignore that in Zenith. We do need - * to remember the new size, though, so that smgrnblocks() returns the + * all-zeros pages, and we can just ignore that in Neon. We do need to + * remember the new size, though, so that smgrnblocks() returns the * right answer after the rel has been extended. We rely on the * relsize cache for that. 
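
The comment above leans on the relation-size cache: smgrextend() only has to remember the new size so that a later smgrnblocks() can answer without a round trip to the page server. A rough sketch of such a cache, keyed the same way as the get_cached_relsize()/set_cached_relsize()/update_cached_relsize() declarations earlier in this patch, is shown below; the grow-only update rule is an assumption for illustration, not a quote of relsize_cache.c:

    use std::collections::HashMap;

    // Cache key: relation identity plus fork number, as in the C declarations.
    #[derive(Clone, Copy, PartialEq, Eq, Hash)]
    struct RelKey {
        spc_node: u32,
        db_node: u32,
        rel_node: u32,
        forknum: u8,
    }

    struct RelsizeCache {
        sizes: HashMap<RelKey, u32>, // number of blocks
    }

    impl RelsizeCache {
        fn new() -> Self {
            RelsizeCache { sizes: HashMap::new() }
        }
        fn get(&self, key: RelKey) -> Option<u32> {
            self.sizes.get(&key).copied()
        }
        // Grow-only update (assumed semantics): never shrink based on a stale answer.
        fn update(&mut self, key: RelKey, nblocks: u32) {
            let entry = self.sizes.entry(key).or_insert(0);
            if nblocks > *entry {
                *entry = nblocks;
            }
        }
        // Unconditional set, e.g. after a truncate.
        fn set(&mut self, key: RelKey, nblocks: u32) {
            self.sizes.insert(key, nblocks);
        }
    }

    fn main() {
        let key = RelKey { spc_node: 1663, db_node: 13000, rel_node: 16384, forknum: 0 };
        let mut cache = RelsizeCache::new();
        cache.set(key, 8);    // first answer from the page server
        cache.update(key, 9); // smgrextend() added a block
        assert_eq!(cache.get(key), Some(9));
    }
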
* @@ -616,12 +612,11 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forknum, blocknum); } - /* - * zenith_init() -- Initialize private state + * neon_init() -- Initialize private state */ void -zenith_init(void) +neon_init(void) { /* noop */ #ifdef DEBUG_COMPARE_LOCAL @@ -658,7 +653,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) +neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) { XLogRecPtr lsn; @@ -666,14 +661,14 @@ zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, Bloc { *latest = false; lsn = GetXLogReplayRecPtr(NULL); - elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", + elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", (uint32) ((lsn) >> 32), (uint32) (lsn)); } else if (am_walsender) { *latest = true; lsn = InvalidXLogRecPtr; - elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 "); + elog(DEBUG1, "am walsender neon_get_request_lsn lsn 0 "); } else { @@ -687,7 +682,7 @@ zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, Bloc *latest = true; lsn = GetLastWrittenLSN(rnode, forknum, blkno); Assert(lsn != InvalidXLogRecPtr); - elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenLSN lsn %X/%X ", + elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); lsn = zm_adjust_lsn(lsn); @@ -717,15 +712,14 @@ zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, Bloc return lsn; } - /* - * zenith_exists() -- Does the physical file exist? + * neon_exists() -- Does the physical file exist? 
*/ bool -zenith_exists(SMgrRelation reln, ForkNumber forkNum) +neon_exists(SMgrRelation reln, ForkNumber forkNum) { bool exists; - ZenithResponse *resp; + NeonResponse *resp; BlockNumber n_blocks; bool latest; XLogRecPtr request_lsn; @@ -777,26 +771,25 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO); { - ZenithExistsRequest request = { - .req.tag = T_ZenithExistsRequest, + NeonExistsRequest request = { + .req.tag = T_NeonExistsRequest, .req.latest = latest, .req.lsn = request_lsn, .rnode = reln->smgr_rnode.node, - .forknum = forkNum - }; + .forknum = forkNum}; resp = page_server_request(&request); } switch (resp->tag) { - case T_ZenithExistsResponse: - exists = ((ZenithExistsResponse *) resp)->exists; + case T_NeonExistsResponse: + exists = ((NeonExistsResponse *) resp)->exists; break; - case T_ZenithErrorResponse: + case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", @@ -806,7 +799,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); + ((NeonErrorResponse *) resp)->message))); break; default: @@ -817,12 +810,12 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) } /* - * zenith_create() -- Create a new relation on zenithd storage + * neon_create() -- Create a new relation on neond storage * * If isRedo is true, it's okay for the relation to exist already. */ void -zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) +neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) { switch (reln->smgr_relpersistence) { @@ -866,7 +859,7 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) } /* - * zenith_unlink() -- Unlink a relation. + * neon_unlink() -- Unlink a relation. * * Note that we're passed a RelFileNodeBackend --- by the time this is called, * there won't be an SMgrRelation hashtable entry anymore. @@ -884,7 +877,7 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * we are usually not in a transaction anymore when this is called. */ void -zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) +neon_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) { /* * Might or might not exist locally, depending on whether it's an unlogged @@ -899,7 +892,7 @@ zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) } /* - * zenith_extend() -- Add a block to the specified relation. + * neon_extend() -- Add a block to the specified relation. * * The semantics are nearly the same as mdwrite(): write at the * specified position. However, this is to be used for the case of @@ -908,8 +901,8 @@ zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) * causes intervening file space to become filled with zeroes. 
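
neon_exists() above, like the other smgr entry points, first asks neon_get_request_lsn() for an (lsn, latest) pair to attach to the page server request: the branch that logs GetXLogReplayRecPtr uses the replayed LSN with latest = false, a walsender asks for the latest version with an invalid LSN, and otherwise the last-written LSN for the block is sent as a lower-bound hint with latest = true. A compressed sketch of those branches follows; the zm_adjust_lsn() and backpressure details in the hunk are omitted, and the parameter names are placeholders:

    struct RequestLsn {
        lsn: u64,
        latest: bool, // ask for the newest version; lsn is only a lower-bound hint
    }

    fn request_lsn(
        in_recovery: bool,
        is_walsender: bool,
        last_written_lsn: u64,
        replay_lsn: u64,
    ) -> RequestLsn {
        if in_recovery {
            // read at the LSN replayed so far
            RequestLsn { lsn: replay_lsn, latest: false }
        } else if is_walsender {
            // latest version, no useful hint
            RequestLsn { lsn: 0, latest: true }
        } else {
            // latest version, hinting that nothing newer than the
            // last-written LSN exists for this block
            RequestLsn { lsn: last_written_lsn, latest: true }
        }
    }

    fn main() {
        let r = request_lsn(false, false, 0x1234_5678, 0);
        assert!(r.latest && r.lsn == 0x1234_5678);
    }
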
*/ void -zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, - char *buffer, bool skipFsync) +neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer, bool skipFsync) { XLogRecPtr lsn; @@ -951,7 +944,7 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, errhint("This limit is defined by neon.max_cluster_size GUC"))); } - zenith_wallog_page(reln, forkNum, blkno, buffer); + neon_wallog_page(reln, forkNum, blkno, buffer); set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); lsn = PageGetLSN(buffer); @@ -971,10 +964,10 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, } /* - * zenith_open() -- Initialize newly-opened relation. + * neon_open() -- Initialize newly-opened relation. */ void -zenith_open(SMgrRelation reln) +neon_open(SMgrRelation reln) { /* * We don't have anything special to do here. Call mdopen() to let md.c @@ -985,14 +978,14 @@ zenith_open(SMgrRelation reln) mdopen(reln); /* no work */ - elog(SmgrTrace, "[ZENITH_SMGR] open noop"); + elog(SmgrTrace, "[NEON_SMGR] open noop"); } /* - * zenith_close() -- Close the specified relation, if it isn't closed already. + * neon_close() -- Close the specified relation, if it isn't closed already. */ void -zenith_close(SMgrRelation reln, ForkNumber forknum) +neon_close(SMgrRelation reln, ForkNumber forknum) { /* * Let md.c close it, if it had it open. Doesn't hurt to do this even for @@ -1003,19 +996,19 @@ zenith_close(SMgrRelation reln, ForkNumber forknum) /* - * zenith_reset_prefetch() -- reoe all previously rgistered prefeth requests + * neon_reset_prefetch() -- reoe all previously rgistered prefeth requests */ void -zenith_reset_prefetch(SMgrRelation reln) +neon_reset_prefetch(SMgrRelation reln) { n_prefetch_requests = 0; } /* - * zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation + * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation */ bool -zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { switch (reln->smgr_relpersistence) { @@ -1046,14 +1039,14 @@ zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) } /* - * zenith_writeback() -- Tell the kernel to write pages back to storage. + * neon_writeback() -- Tell the kernel to write pages back to storage. * * This accepts a range of blocks because flushing several pages at once is * considerably more efficient than doing so individually. */ void -zenith_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) +neon_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) { switch (reln->smgr_relpersistence) { @@ -1075,7 +1068,7 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, } /* not implemented */ - elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); + elog(SmgrTrace, "[NEON_SMGR] writeback noop"); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1084,14 +1077,14 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, } /* - * While function is defined in the zenith extension it's used within neon_test_utils directly. + * While function is defined in the neon extension it's used within neon_test_utils directly. * To avoid breaking tests in the runtime please keep function signature in sync. 
*/ void -zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) +neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer) { - ZenithResponse *resp; + NeonResponse *resp; int i; /* @@ -1103,12 +1096,12 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, for (i = n_prefetched_buffers; i < n_prefetch_responses; i++) { resp = page_server->receive(); - if (resp->tag == T_ZenithGetPageResponse && + if (resp->tag == T_NeonGetPageResponse && RelFileNodeEquals(prefetch_responses[i].rnode, rnode) && prefetch_responses[i].forkNum == forkNum && prefetch_responses[i].blockNum == blkno) { - char *page = ((ZenithGetPageResponse *) resp)->page; + char *page = ((NeonGetPageResponse *) resp)->page; /* * Check if prefetched page is still relevant. If it is updated by @@ -1135,8 +1128,8 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, n_prefetch_responses = 0; n_prefetch_misses += 1; { - ZenithGetPageRequest request = { - .req.tag = T_ZenithGetPageRequest, + NeonGetPageRequest request = { + .req.tag = T_NeonGetPageRequest, .req.latest = request_latest, .req.lsn = request_lsn, .rnode = rnode, @@ -1147,14 +1140,14 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, if (n_prefetch_requests > 0) { /* Combine all prefetch requests with primary request */ - page_server->send((ZenithRequest *) &request); + page_server->send((NeonRequest *) & request); for (i = 0; i < n_prefetch_requests; i++) { request.rnode = prefetch_requests[i].rnode; request.forknum = prefetch_requests[i].forkNum; request.blkno = prefetch_requests[i].blockNum; prefetch_responses[i] = prefetch_requests[i]; - page_server->send((ZenithRequest *) &request); + page_server->send((NeonRequest *) & request); } page_server->flush(); n_prefetch_responses = n_prefetch_requests; @@ -1164,16 +1157,16 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, } else { - resp = page_server->request((ZenithRequest *) &request); + resp = page_server->request((NeonRequest *) & request); } } switch (resp->tag) { - case T_ZenithGetPageResponse: - memcpy(buffer, ((ZenithGetPageResponse *) resp)->page, BLCKSZ); + case T_NeonGetPageResponse: + memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); break; - case T_ZenithErrorResponse: + case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", @@ -1184,7 +1177,7 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); + ((NeonErrorResponse *) resp)->message))); break; default: @@ -1195,11 +1188,11 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, } /* - * zenith_read() -- Read the specified block from a relation. + * neon_read() -- Read the specified block from a relation. 
*/ void -zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, - char *buffer) +neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer) { bool latest; XLogRecPtr request_lsn; @@ -1221,8 +1214,8 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno); - zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno); + neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -1328,15 +1321,15 @@ hexdump_page(char *page) #endif /* - * zenith_write() -- Write the supplied block at the appropriate location. + * neon_write() -- Write the supplied block at the appropriate location. * * This is to be used only for updating already-existing blocks of a * relation (ie, those before the current EOF). To extend a relation, * use mdextend(). */ void -zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) +neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) { XLogRecPtr lsn; @@ -1372,7 +1365,7 @@ zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - zenith_wallog_page(reln, forknum, blocknum, buffer); + neon_wallog_page(reln, forknum, blocknum, buffer); lsn = PageGetLSN(buffer); elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", @@ -1389,12 +1382,12 @@ zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } /* - * zenith_nblocks() -- Get the number of blocks stored in a relation. + * neon_nblocks() -- Get the number of blocks stored in a relation. 
*/ BlockNumber -zenith_nblocks(SMgrRelation reln, ForkNumber forknum) +neon_nblocks(SMgrRelation reln, ForkNumber forknum) { - ZenithResponse *resp; + NeonResponse *resp; BlockNumber n_blocks; bool latest; XLogRecPtr request_lsn; @@ -1426,10 +1419,10 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO); { - ZenithNblocksRequest request = { - .req.tag = T_ZenithNblocksRequest, + NeonNblocksRequest request = { + .req.tag = T_NeonNblocksRequest, .req.latest = latest, .req.lsn = request_lsn, .rnode = reln->smgr_rnode.node, @@ -1441,11 +1434,11 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) switch (resp->tag) { - case T_ZenithNblocksResponse: - n_blocks = ((ZenithNblocksResponse *) resp)->n_blocks; + case T_NeonNblocksResponse: + n_blocks = ((NeonNblocksResponse *) resp)->n_blocks; break; - case T_ZenithErrorResponse: + case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", @@ -1455,7 +1448,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); + ((NeonErrorResponse *) resp)->message))); break; default: @@ -1463,7 +1456,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) } update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks); - elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, @@ -1476,21 +1469,21 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) } /* - * zenith_db_size() -- Get the size of the database in bytes. + * neon_db_size() -- Get the size of the database in bytes. 
*/ int64 -zenith_dbsize(Oid dbNode) +neon_dbsize(Oid dbNode) { - ZenithResponse *resp; + NeonResponse *resp; int64 db_size; XLogRecPtr request_lsn; bool latest; RelFileNode dummy_node = {InvalidOid, InvalidOid, InvalidOid}; - request_lsn = zenith_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); { - ZenithDbSizeRequest request = { - .req.tag = T_ZenithDbSizeRequest, + NeonDbSizeRequest request = { + .req.tag = T_NeonDbSizeRequest, .req.latest = latest, .req.lsn = request_lsn, .dbNode = dbNode, @@ -1501,25 +1494,25 @@ zenith_dbsize(Oid dbNode) switch (resp->tag) { - case T_ZenithDbSizeResponse: - db_size = ((ZenithDbSizeResponse *) resp)->db_size; + case T_NeonDbSizeResponse: + db_size = ((NeonDbSizeResponse *) resp)->db_size; break; - case T_ZenithErrorResponse: + case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("could not read db size of db %u from page server at lsn %X/%08X", dbNode, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); + ((NeonErrorResponse *) resp)->message))); break; default: elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); } - elog(SmgrTrace, "zenith_dbsize: db %u (request LSN %X/%08X): %ld bytes", + elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", dbNode, (uint32) (request_lsn >> 32), (uint32) request_lsn, db_size); @@ -1529,10 +1522,10 @@ zenith_dbsize(Oid dbNode) } /* - * zenith_truncate() -- Truncate relation to specified number of blocks. + * neon_truncate() -- Truncate relation to specified number of blocks. */ void -zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { XLogRecPtr lsn; @@ -1591,7 +1584,7 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) } /* - * zenith_immedsync() -- Immediately sync a relation to stable storage. + * neon_immedsync() -- Immediately sync a relation to stable storage. * * Note that only writes already issued are synced; this routine knows * nothing of dirty buffers that may exist inside the buffer manager. We @@ -1602,7 +1595,7 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * segment may survive recovery, reintroducing unwanted data into the table. */ void -zenith_immedsync(SMgrRelation reln, ForkNumber forknum) +neon_immedsync(SMgrRelation reln, ForkNumber forknum) { switch (reln->smgr_relpersistence) { @@ -1622,7 +1615,7 @@ zenith_immedsync(SMgrRelation reln, ForkNumber forknum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop"); + elog(SmgrTrace, "[NEON_SMGR] immedsync noop"); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1631,16 +1624,16 @@ zenith_immedsync(SMgrRelation reln, ForkNumber forknum) } /* - * zenith_start_unlogged_build() -- Starting build operation on a rel. + * neon_start_unlogged_build() -- Starting build operation on a rel. * * Some indexes are built in two phases, by first populating the table with * regular inserts, using the shared buffer cache but skipping WAL-logging, - * and WAL-logging the whole relation after it's done. Zenith relies on the + * and WAL-logging the whole relation after it's done. 
Neon relies on the * WAL to reconstruct pages, so we cannot use the page server in the * first phase when the changes are not logged. */ static void -zenith_start_unlogged_build(SMgrRelation reln) +neon_start_unlogged_build(SMgrRelation reln) { /* * Currently, there can be only one unlogged relation build operation in @@ -1692,13 +1685,13 @@ zenith_start_unlogged_build(SMgrRelation reln) } /* - * zenith_finish_unlogged_build_phase_1() + * neon_finish_unlogged_build_phase_1() * * Call this after you have finished populating a relation in unlogged mode, * before you start WAL-logging it. */ static void -zenith_finish_unlogged_build_phase_1(SMgrRelation reln) +neon_finish_unlogged_build_phase_1(SMgrRelation reln) { Assert(unlogged_build_rel == reln); @@ -1718,7 +1711,7 @@ zenith_finish_unlogged_build_phase_1(SMgrRelation reln) } /* - * zenith_end_unlogged_build() -- Finish an unlogged rel build. + * neon_end_unlogged_build() -- Finish an unlogged rel build. * * Call this after you have finished WAL-logging an relation that was * first populated without WAL-logging. @@ -1727,7 +1720,7 @@ zenith_finish_unlogged_build_phase_1(SMgrRelation reln) * WAL-logged and is present in the page server. */ static void -zenith_end_unlogged_build(SMgrRelation reln) +neon_end_unlogged_build(SMgrRelation reln) { Assert(unlogged_build_rel == reln); @@ -1769,7 +1762,7 @@ zenith_end_unlogged_build(SMgrRelation reln) } static void -AtEOXact_zenith(XactEvent event, void *arg) +AtEOXact_neon(XactEvent event, void *arg) { switch (event) { @@ -1802,47 +1795,46 @@ AtEOXact_zenith(XactEvent event, void *arg) } } -static const struct f_smgr zenith_smgr = +static const struct f_smgr neon_smgr = { - .smgr_init = zenith_init, + .smgr_init = neon_init, .smgr_shutdown = NULL, - .smgr_open = zenith_open, - .smgr_close = zenith_close, - .smgr_create = zenith_create, - .smgr_exists = zenith_exists, - .smgr_unlink = zenith_unlink, - .smgr_extend = zenith_extend, - .smgr_prefetch = zenith_prefetch, - .smgr_reset_prefetch = zenith_reset_prefetch, - .smgr_read = zenith_read, - .smgr_write = zenith_write, - .smgr_writeback = zenith_writeback, - .smgr_nblocks = zenith_nblocks, - .smgr_truncate = zenith_truncate, - .smgr_immedsync = zenith_immedsync, + .smgr_open = neon_open, + .smgr_close = neon_close, + .smgr_create = neon_create, + .smgr_exists = neon_exists, + .smgr_unlink = neon_unlink, + .smgr_extend = neon_extend, + .smgr_prefetch = neon_prefetch, + .smgr_reset_prefetch = neon_reset_prefetch, + .smgr_read = neon_read, + .smgr_write = neon_write, + .smgr_writeback = neon_writeback, + .smgr_nblocks = neon_nblocks, + .smgr_truncate = neon_truncate, + .smgr_immedsync = neon_immedsync, - .smgr_start_unlogged_build = zenith_start_unlogged_build, - .smgr_finish_unlogged_build_phase_1 = zenith_finish_unlogged_build_phase_1, - .smgr_end_unlogged_build = zenith_end_unlogged_build, + .smgr_start_unlogged_build = neon_start_unlogged_build, + .smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1, + .smgr_end_unlogged_build = neon_end_unlogged_build, }; - const f_smgr * -smgr_zenith(BackendId backend, RelFileNode rnode) +smgr_neon(BackendId backend, RelFileNode rnode) { /* Don't use page server for temp relations */ if (backend != InvalidBackendId) return smgr_standard(backend, rnode); else - return &zenith_smgr; + return &neon_smgr; } void -smgr_init_zenith(void) +smgr_init_neon(void) { - RegisterXactCallback(AtEOXact_zenith, NULL); + RegisterXactCallback(AtEOXact_neon, NULL); smgr_init_standard(); - zenith_init(); + 
neon_init(); } diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index 31021f3e41..d4262c730a 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -56,7 +56,7 @@ static void relsize_shmem_request(void); #define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) static void -zenith_smgr_shmem_startup(void) +neon_smgr_shmem_startup(void) { static HASHCTL info; @@ -174,14 +174,14 @@ relsize_hash_init(void) #endif prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = zenith_smgr_shmem_startup; + shmem_startup_hook = neon_smgr_shmem_startup; } } #if PG_VERSION_NUM >= 150000 /* * shmem_request hook: request additional shared resources. We'll allocate or - * attach to the shared resources in zenith_smgr_shmem_startup(). + * attach to the shared resources in neon_smgr_shmem_startup(). */ static void relsize_shmem_request(void) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 05257ced4c..fc0b660a64 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -71,14 +71,13 @@ #include "walproposer_utils.h" #include "replication/walpropshim.h" - char *wal_acceptors_list; int wal_acceptor_reconnect_timeout; int wal_acceptor_connect_timeout; bool am_wal_proposer; -char *zenith_timeline_walproposer = NULL; -char *zenith_tenant_walproposer = NULL; +char *neon_timeline_walproposer = NULL; +char *neon_tenant_walproposer = NULL; /* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ WalProposerFunctionsType *WalProposerFunctions = NULL; @@ -89,7 +88,7 @@ static int n_safekeepers = 0; static int quorum = 0; static Safekeeper safekeeper[MAX_SAFEKEEPERS]; static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ -static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to* * safekeepers */ static ProposerGreeting greetRequest; static VoteRequest voteRequest; /* Vote request for safekeeper */ @@ -162,7 +161,6 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, Safekeeper static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); - static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); static void nwp_prepare_shmem(void); @@ -176,7 +174,6 @@ static shmem_request_hook_type prev_shmem_request_hook = NULL; static void walproposer_shmem_request(void); #endif - void pg_init_walproposer(void) { @@ -207,10 +204,9 @@ nwp_register_gucs(void) &wal_acceptors_list, /* valueAddr */ "", /* bootValue */ PGC_POSTMASTER, - GUC_LIST_INPUT, /* extensions can't use + GUC_LIST_INPUT, /* extensions can't use* * GUC_LIST_QUOTE */ - NULL, NULL, NULL - ); + NULL, NULL, NULL); DefineCustomIntVariable( "neon.safekeeper_reconnect_timeout", @@ -220,8 +216,7 @@ nwp_register_gucs(void) 1000, 0, INT_MAX, /* default, min, max */ PGC_SIGHUP, /* context */ GUC_UNIT_MS, /* flags */ - NULL, NULL, NULL - ); + NULL, NULL, NULL); DefineCustomIntVariable( "neon.safekeeper_connect_timeout", @@ -231,9 +226,7 @@ nwp_register_gucs(void) 5000, 0, INT_MAX, PGC_SIGHUP, GUC_UNIT_MS, - NULL, NULL, NULL - ); - + NULL, NULL, NULL); } /* shmem handling */ @@ -499,19 +492,19 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) greetRequest.pgVersion = PG_VERSION_NUM; pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); greetRequest.systemId = systemId; - if (!zenith_timeline_walproposer) + if (!neon_timeline_walproposer) 
elog(FATAL, "neon.timeline_id is not provided"); - if (*zenith_timeline_walproposer != '\0' && - !HexDecodeString(greetRequest.ztimelineid, zenith_timeline_walproposer, 16)) - elog(FATAL, "Could not parse neon.timeline_id, %s", zenith_timeline_walproposer); - if (!zenith_tenant_walproposer) + if (*neon_timeline_walproposer != '\0' && + !HexDecodeString(greetRequest.timeline_id, neon_timeline_walproposer, 16)) + elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline_walproposer); + if (!neon_tenant_walproposer) elog(FATAL, "neon.tenant_id is not provided"); - if (*zenith_tenant_walproposer != '\0' && - !HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16)) - elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer); + if (*neon_tenant_walproposer != '\0' && + !HexDecodeString(greetRequest.tenant_id, neon_tenant_walproposer, 16)) + elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant_walproposer); #if PG_VERSION_NUM >= 150000 -/* FIXME don't use hardcoded timeline id */ + /* FIXME don't use hardcoded timeline id */ greetRequest.timeline = 1; #else greetRequest.timeline = ThisTimeLineID; @@ -657,8 +650,8 @@ ResetConnection(Safekeeper *sk) int written = 0; written = snprintf((char *) &sk->conninfo, MAXCONNINFO, - "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); + "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", + sk->host, sk->port, neon_timeline_walproposer, neon_tenant_walproposer); /* * currently connection string is not that long, but once we pass @@ -1326,8 +1319,7 @@ DetermineEpochStartLsn(void) propTerm, LSN_FORMAT_ARGS(propEpochStartLsn), safekeeper[donor].host, safekeeper[donor].port, - LSN_FORMAT_ARGS(truncateLsn) - ); + LSN_FORMAT_ARGS(truncateLsn)); /* * Ensure the basebackup we are running (at RedoStartLsn) matches LSN @@ -1373,8 +1365,8 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec WalReceiverConn *wrconn; WalRcvStreamOptions options; - sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - safekeeper[donor].host, safekeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); + sprintf(conninfo, "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", + safekeeper[donor].host, safekeeper[donor].port, neon_timeline_walproposer, neon_tenant_walproposer); wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); if (!wrconn) { @@ -1544,8 +1536,7 @@ SendProposerElected(Safekeeper *sk) else { XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; - XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : - sk->voteResponse.flushLsn); + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : sk->voteResponse.flushLsn); sk->startStreamingAt = Min(propEndLsn, skEndLsn); } @@ -1759,7 +1750,7 @@ SendAppendRequests(Safekeeper *sk) req->beginLsn, req->endLsn - req->beginLsn, #if PG_VERSION_NUM >= 150000 - /* FIXME don't use hardcoded timelineid here */ + /* FIXME don't use hardcoded timeline_id here */ 1, #else ThisTimeLineID, @@ -1784,9 +1775,9 @@ SendAppendRequests(Safekeeper *sk) case PG_ASYNC_WRITE_TRY_FLUSH: /* - * We still need to call PQflush some more to finish the job. - * Caller function will handle this by setting right event - * set. + * * We still need to call PQflush some more to finish the + * job. 
Caller function will handle this by setting right + * event* set. */ sk->flushWrite = true; return true; @@ -1885,40 +1876,40 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * if (strcmp(key, "current_timeline_size") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - //read value length - rf->currentClusterSize = pq_getmsgint64(reply_message); + /* read value length */ + rf->currentClusterSize = pq_getmsgint64(reply_message); elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", rf->currentClusterSize); } else if (strcmp(key, "ps_writelsn") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - //read value length - rf->ps_writelsn = pq_getmsgint64(reply_message); + /* read value length */ + rf->ps_writelsn = pq_getmsgint64(reply_message); elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", LSN_FORMAT_ARGS(rf->ps_writelsn)); } else if (strcmp(key, "ps_flushlsn") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - //read value length - rf->ps_flushlsn = pq_getmsgint64(reply_message); + /* read value length */ + rf->ps_flushlsn = pq_getmsgint64(reply_message); elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", LSN_FORMAT_ARGS(rf->ps_flushlsn)); } else if (strcmp(key, "ps_applylsn") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - //read value length - rf->ps_applylsn = pq_getmsgint64(reply_message); + /* read value length */ + rf->ps_applylsn = pq_getmsgint64(reply_message); elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", LSN_FORMAT_ARGS(rf->ps_applylsn)); } else if (strcmp(key, "ps_replytime") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - //read value length - rf->ps_replytime = pq_getmsgint64(reply_message); + /* read value length */ + rf->ps_replytime = pq_getmsgint64(reply_message); { char *replyTimeStr; @@ -1933,13 +1924,13 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * else { len = pq_getmsgint(reply_message, sizeof(int32)); - //read value length + /* read value length */ /* * Skip unknown keys to support backward compatibile protocol * changes */ - elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -1973,7 +1964,6 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) } } - /* * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the * last WAL record that can be safely discarded. @@ -2009,8 +1999,7 @@ GetAcknowledgedByQuorumWALPosition(void) * Like in Raft, we aren't allowed to commit entries from previous * terms, so ignore reported LSN until it gets to epochStartLsn. */ - responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? - safekeeper[i].appendResponse.flushLsn : 0; + responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? 
safekeeper[i].appendResponse.flushLsn : 0; } qsort(responses, n_safekeepers, sizeof(XLogRecPtr), CompareLsn); @@ -2058,7 +2047,6 @@ replication_feedback_set(ReplicationFeedback * rf) SpinLockRelease(&walprop_shared->mutex); } - void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { @@ -2069,12 +2057,11 @@ replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRe SpinLockRelease(&walprop_shared->mutex); } - /* * Get ReplicationFeedback fields from the most advanced safekeeper */ static void -GetLatestZentihFeedback(ReplicationFeedback * rf) +GetLatestNeonFeedback(ReplicationFeedback * rf) { int latest_safekeeper = 0; XLogRecPtr ps_writelsn = InvalidXLogRecPtr; @@ -2094,7 +2081,7 @@ GetLatestZentihFeedback(ReplicationFeedback * rf) rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; - elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," + elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", rf->currentClusterSize, LSN_FORMAT_ARGS(rf->ps_writelsn), @@ -2113,14 +2100,13 @@ HandleSafekeeperResponse(void) XLogRecPtr diskConsistentLsn; XLogRecPtr minFlushLsn; - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; if (!syncSafekeepers) { /* Get ReplicationFeedback fields from the most advanced safekeeper */ - GetLatestZentihFeedback(&quorumFeedback.rf); + GetLatestNeonFeedback(&quorumFeedback.rf); SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); } @@ -2139,7 +2125,7 @@ HandleSafekeeperResponse(void) quorumFeedback.flushLsn, /* - * apply_lsn - This is what processed and durably saved at + * apply_lsn - This is what processed and durably saved at* * pageserver. 
*/ quorumFeedback.rf.ps_flushlsn, @@ -2460,7 +2446,7 @@ backpressure_lag_impl(void) XLogRecPtr myFlushLsn = GetFlushRecPtr(); #endif replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); -#define MB ((XLogRecPtr)1024*1024) +#define MB ((XLogRecPtr)1024 * 1024) elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), @@ -2468,23 +2454,17 @@ backpressure_lag_impl(void) LSN_FORMAT_ARGS(flushPtr), LSN_FORMAT_ARGS(applyPtr)); - if ((writePtr != InvalidXLogRecPtr - && max_replication_write_lag > 0 - && myFlushLsn > writePtr + max_replication_write_lag * MB)) + if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0 && myFlushLsn > writePtr + max_replication_write_lag * MB)) { return (myFlushLsn - writePtr - max_replication_write_lag * MB); } - if ((flushPtr != InvalidXLogRecPtr - && max_replication_flush_lag > 0 - && myFlushLsn > flushPtr + max_replication_flush_lag * MB)) + if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag * MB)) { return (myFlushLsn - flushPtr - max_replication_flush_lag * MB); } - if ((applyPtr != InvalidXLogRecPtr - && max_replication_apply_lag > 0 - && myFlushLsn > applyPtr + max_replication_apply_lag * MB)) + if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0 && myFlushLsn > applyPtr + max_replication_apply_lag * MB)) { return (myFlushLsn - applyPtr - max_replication_apply_lag * MB); } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 59e70f33bf..051c7c02a6 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -10,16 +10,16 @@ #include "utils/uuid.h" #include "replication/walreceiver.h" -#define SK_MAGIC 0xCafeCeefu -#define SK_PROTOCOL_VERSION 2 +#define SK_MAGIC 0xCafeCeefu +#define SK_PROTOCOL_VERSION 2 -#define MAX_SAFEKEEPERS 32 -#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single - * WAL message */ -#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ -#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender +#define MAX_SAFEKEEPERS 32 +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single* WAL + * message */ +#define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender* * message header */ -#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender +#define XLOG_HDR_END_POS (1 + 8) /* offset of end position in wal sender* * message header */ /* @@ -39,8 +39,8 @@ typedef struct WalProposerConn WalProposerConn; struct WalMessage; typedef struct WalMessage WalMessage; -extern char *zenith_timeline_walproposer; -extern char *zenith_tenant_walproposer; +extern char *neon_timeline_walproposer; +extern char *neon_tenant_walproposer; /* Possible return values from ReadPGAsync */ typedef enum @@ -170,8 +170,8 @@ typedef struct ProposerGreeting uint32 pgVersion; pg_uuid_t proposerId; uint64 systemId; /* Postgres system identifier */ - uint8 ztimelineid[16]; /* Zenith timeline id */ - uint8 ztenantid[16]; + uint8 timeline_id[16]; /* Neon timeline id */ + uint8 tenant_id[16]; TimeLineID timeline; uint32 walSegSize; } ProposerGreeting; @@ -226,7 +226,7 @@ typedef struct VoteResponse * proposer to choose the most advanced one. 
*/ XLogRecPtr flushLsn; - XLogRecPtr truncateLsn; /* minimal LSN which may be needed for + XLogRecPtr truncateLsn; /* minimal LSN which may be needed for* * recovery of some safekeeper */ TermHistory termHistory; XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ @@ -283,7 +283,6 @@ typedef struct HotStandbyFeedback FullTransactionId catalog_xmin; } HotStandbyFeedback; - typedef struct ReplicationFeedback { /* current size of the timeline on pageserver */ @@ -295,7 +294,6 @@ typedef struct ReplicationFeedback TimestampTz ps_replytime; } ReplicationFeedback; - typedef struct WalproposerShmemState { slock_t mutex; @@ -323,7 +321,7 @@ typedef struct AppendResponse XLogRecPtr commitLsn; HotStandbyFeedback hs; /* Feedback recieved from pageserver includes standby_status_update fields */ - /* and custom zenith feedback. */ + /* and custom neon feedback. */ /* This part of the message is extensible. */ ReplicationFeedback rf; } AppendResponse; @@ -332,7 +330,6 @@ typedef struct AppendResponse /* Other fields are fixed part */ #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) - /* * Descriptor of safekeeper */ @@ -340,7 +337,7 @@ typedef struct Safekeeper { char const *host; char const *port; - char conninfo[MAXCONNINFO]; /* connection info for + char conninfo[MAXCONNINFO]; /* connection info for* * connecting/reconnecting */ /* @@ -366,12 +363,12 @@ typedef struct Safekeeper */ XLogRecPtr startStreamingAt; - bool flushWrite; /* set to true if we need to call AsyncFlush, + bool flushWrite; /* set to true if we need to call AsyncFlush,* * to flush pending messages */ XLogRecPtr streamingAt; /* current streaming position */ AppendRequestHeader appendRequest; /* request for sending to safekeeper */ - int eventPos; /* position in wait event set. Equal to -1 if + int eventPos; /* position in wait event set. Equal to -1 if* * no event */ SafekeeperState state; /* safekeeper state machine state */ TimestampTz startedConnAt; /* when connection attempt started */ @@ -380,7 +377,6 @@ typedef struct Safekeeper AppendResponse appendResponse; /* feedback for master */ } Safekeeper; - extern PGDLLIMPORT void WalProposerMain(Datum main_arg); void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); void WalProposerPoll(void); diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 07bd7bdd28..e0cea4177b 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -36,13 +36,13 @@ PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); PG_FUNCTION_INFO_V1(neon_xlogflush); /* - * Linkage to functions in zenith module. + * Linkage to functions in neon module. * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c */ -typedef void (*zenith_read_at_lsn_type) (RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); +typedef void (*neon_read_at_lsn_type) (RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); -static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; +static neon_read_at_lsn_type neon_read_at_lsn_ptr; /* * Module initialize function: fetch function pointers for cross-module calls. 
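For context on the backpressure checks reflowed earlier in walproposer.c: backpressure_lag_impl returns how far the compute's flush LSN has run ahead of the write/flush/apply positions reported back by the pageserver, once the configured limit (in MB) is exceeded. A minimal standalone sketch of that rule, written in Rust rather than the C of the actual function; the names and numbers below are illustrative, not part of the patch:

    // One backpressure check: if `my_flush_lsn` is more than `max_lag_mb` megabytes
    // ahead of a remote LSN reported by the pageserver, return the excess, else 0.
    // A remote LSN of 0 (InvalidXLogRecPtr) or a zero limit disables the check.
    const MB: u64 = 1024 * 1024;

    fn lag_over_limit(my_flush_lsn: u64, remote_lsn: u64, max_lag_mb: u64) -> u64 {
        if remote_lsn == 0 || max_lag_mb == 0 {
            return 0;
        }
        let limit = remote_lsn + max_lag_mb * MB;
        my_flush_lsn.saturating_sub(limit)
    }

    fn main() {
        // Flush LSN 11 MB, pageserver write LSN 1 MB, 8 MB limit -> 2 MB of lag.
        assert_eq!(lag_over_limit(11 * MB, MB, 8), 2 * MB);
        // Unknown remote position -> no backpressure.
        assert_eq!(lag_over_limit(11 * MB, 0, 8), 0);
        println!("backpressure examples ok");
    }

In the C function the same check is applied in turn to the write, flush and apply positions, and the first limit exceeded determines the returned lag.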
@@ -51,13 +51,13 @@ void _PG_init(void) { /* Asserts verify that typedefs above match original declarations */ - AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type); - zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type) - load_external_function("$libdir/neon", "zenith_read_at_lsn", + AssertVariableIsOfType(&neon_read_at_lsn, neon_read_at_lsn_type); + neon_read_at_lsn_ptr = (neon_read_at_lsn_type) + load_external_function("$libdir/neon", "neon_read_at_lsn", true, NULL); } -#define zenith_read_at_lsn zenith_read_at_lsn_ptr +#define neon_read_at_lsn neon_read_at_lsn_ptr /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. @@ -96,7 +96,7 @@ test_consume_xids(PG_FUNCTION_ARGS) Datum clear_buffer_cache(PG_FUNCTION_ARGS) { - bool save_zenith_test_evict; + bool save_neon_test_evict; /* * Temporarily set the zenith_test_evict GUC, so that when we pin and @@ -104,7 +104,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS) * buffers, as there is no explicit "evict this buffer" function in the * buffer manager. */ - save_zenith_test_evict = zenith_test_evict; + save_neon_test_evict = zenith_test_evict; zenith_test_evict = true; PG_TRY(); { @@ -149,14 +149,13 @@ clear_buffer_cache(PG_FUNCTION_ARGS) PG_FINALLY(); { /* restore the GUC */ - zenith_test_evict = save_zenith_test_evict; + zenith_test_evict = save_neon_test_evict; } PG_END_TRY(); PG_RETURN_VOID(); } - /* * Reads the page from page server without buffer cache * usage mimics get_raw_page() in pageinspect, but offers reading versions at specific LSN @@ -232,7 +231,6 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary tables of other sessions"))); - forknum = forkname_to_number(text_to_cstring(forkname)); /* Initialize buffer to copy to */ @@ -240,7 +238,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - zenith_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); relation_close(rel, AccessShareLock); @@ -272,8 +270,7 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) RelFileNode rnode = { .spcNode = PG_GETARG_OID(0), .dbNode = PG_GETARG_OID(1), - .relNode = PG_GETARG_OID(2) - }; + .relNode = PG_GETARG_OID(2)}; ForkNumber forknum = PG_GETARG_UINT32(3); @@ -281,14 +278,13 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) bool request_latest = PG_ARGISNULL(5); uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(5); - /* Initialize buffer to copy to */ bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - zenith_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 5a450793f1..5417f4f2b3 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -32,7 +32,7 @@ sha2 = "0.10.2" socket2 = "0.4.4" thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-rustls = "0.23.0" url = "2.2.2" git-version = "0.3.5" diff --git a/pyproject.toml b/pyproject.toml index ec166ea7cd..9c2aa39c7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.poetry] -name = "zenith" +name = "neon" version = "0.1.0" description = "" authors = [] diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 4ed30413e2..cae095c3c2 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -14,8 +14,8 @@ tracing = "0.1.27" clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["macros", "fs"] } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } anyhow = "1.0" crc32c = "0.6.0" humantime = "2.1.0" @@ -25,7 +25,7 @@ serde = { version = "1.0", features = ["derive"] } serde_with = "1.12.0" hex = "0.4.3" const_format = "0.2.21" -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } git-version = "0.3.5" async-trait = "0.1" once_cell = "1.13.0" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 244c793250..d518ac01cc 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -30,8 +30,8 @@ use safekeeper::wal_service; use safekeeper::SafeKeeperConf; use utils::auth::JwtAuth; use utils::{ - http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener, - zid::NodeId, + http::endpoint, id::NodeId, logging, project_git_version, shutdown::exit_now, signals, + tcp_listener, }; const LOCK_FILE_NAME: &str = "safekeeper.lock"; @@ -39,7 +39,7 @@ const ID_FILE_NAME: &str = "safekeeper.id"; project_git_version!(GIT_VERSION); fn main() -> anyhow::Result<()> { - let arg_matches = App::new("Zenith safekeeper") + let arg_matches = App::new("Neon safekeeper") .about("Store WAL stream to local file system and push it to WAL receivers") .version(GIT_VERSION) .arg( diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index ce66131700..f276fad613 100644 --- a/safekeeper/src/broker.rs +++ 
b/safekeeper/src/broker.rs @@ -22,7 +22,7 @@ use etcd_broker::{ subscription_key::{OperationKind, SkOperationKind, SubscriptionKey}, Client, PutOptions, }; -use utils::zid::{NodeId, ZTenantTimelineId}; +use utils::id::{NodeId, TenantTimelineId}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; @@ -45,7 +45,7 @@ pub fn thread_main(conf: SafeKeeperConf) { /// Key to per timeline per safekeeper data. fn timeline_safekeeper_path( broker_etcd_prefix: String, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, sk_id: NodeId, ) -> String { format!( @@ -162,12 +162,12 @@ pub fn get_candiate_name(system_id: NodeId) -> String { } async fn push_sk_info( - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, mut client: Client, key: String, sk_info: SkTimelineInfo, mut lease: Lease, -) -> anyhow::Result<(ZTenantTimelineId, Lease)> { +) -> anyhow::Result<(TenantTimelineId, Lease)> { let put_opts = PutOptions::new().with_lease(lease.id); client .put( @@ -202,7 +202,7 @@ struct Lease { /// Push once in a while data about all active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { let mut client = Client::connect(&conf.broker_endpoints, None).await?; - let mut leases: HashMap = HashMap::new(); + let mut leases: HashMap = HashMap::new(); let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); loop { diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 7fc75246e1..ff23f0360f 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -14,7 +14,7 @@ use tracing::*; use crate::control_file_upgrade::upgrade_control_file; use crate::safekeeper::{SafeKeeperState, SK_FORMAT_VERSION, SK_MAGIC}; use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; -use utils::{bin_ser::LeSer, zid::ZTenantTimelineId}; +use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; @@ -55,7 +55,7 @@ pub struct FileStorage { } impl FileStorage { - pub fn restore_new(zttid: &ZTenantTimelineId, conf: &SafeKeeperConf) -> Result { + pub fn restore_new(zttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { let timeline_dir = conf.timeline_dir(zttid); let tenant_id = zttid.tenant_id.to_string(); let timeline_id = zttid.timeline_id.to_string(); @@ -72,7 +72,7 @@ impl FileStorage { } pub fn create_new( - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, conf: &SafeKeeperConf, state: SafeKeeperState, ) -> Result { @@ -115,7 +115,7 @@ impl FileStorage { // Load control file for given zttid at path specified by conf. 
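The FileStorage changes here only rename the id types; the on-disk layout stays the same: the control file is loaded from conf.timeline_dir(zttid).join(CONTROL_FILE_NAME), where timeline_dir resolves to workdir/<tenant_id>/<timeline_id> (see safekeeper/src/lib.rs further down in this patch). A small Rust sketch of that path construction; the workdir, the hex ids, and the control file name value are placeholders, not taken from the patch:

    // Safekeeper per-timeline paths: workdir/<tenant_id>/<timeline_id>, with the
    // control file inside the timeline directory.
    use std::path::{Path, PathBuf};

    // Assumed value; the real constant is defined in control_file.rs.
    const CONTROL_FILE_NAME: &str = "safekeeper.control";

    fn timeline_dir(workdir: &Path, tenant_id: &str, timeline_id: &str) -> PathBuf {
        workdir.join(tenant_id).join(timeline_id)
    }

    fn control_file_path(workdir: &Path, tenant_id: &str, timeline_id: &str) -> PathBuf {
        timeline_dir(workdir, tenant_id, timeline_id).join(CONTROL_FILE_NAME)
    }

    fn main() {
        let p = control_file_path(
            Path::new("/data/safekeeper"),
            "1f5e4a8c90b34677a3f0c7e2d1b9a864",
            "9c2b7d4e6f1a08835ab0c9d7e6f54321",
        );
        println!("{}", p.display());
    }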
pub fn load_control_file_conf( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, ) -> Result { let path = conf.timeline_dir(zttid).join(CONTROL_FILE_NAME); Self::load_control_file(path) @@ -252,7 +252,7 @@ mod test { use crate::{safekeeper::SafeKeeperState, SafeKeeperConf}; use anyhow::Result; use std::fs; - use utils::{lsn::Lsn, zid::ZTenantTimelineId}; + use utils::{id::TenantTimelineId, lsn::Lsn}; fn stub_conf() -> SafeKeeperConf { let workdir = tempfile::tempdir().unwrap().into_path(); @@ -264,7 +264,7 @@ mod test { fn load_from_control_file( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); Ok(( @@ -275,7 +275,7 @@ mod test { fn create( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); @@ -286,7 +286,7 @@ mod test { #[test] fn test_read_write_safekeeper_state() { let conf = stub_conf(); - let zttid = ZTenantTimelineId::generate(); + let zttid = TenantTimelineId::generate(); { let (mut storage, mut state) = create(&conf, &zttid).expect("failed to create state"); // change something @@ -301,7 +301,7 @@ mod test { #[test] fn test_safekeeper_state_checksum_mismatch() { let conf = stub_conf(); - let zttid = ZTenantTimelineId::generate(); + let zttid = TenantTimelineId::generate(); { let (mut storage, mut state) = create(&conf, &zttid).expect("failed to read state"); diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 91d2f61c10..87204d6b49 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -7,9 +7,9 @@ use serde::{Deserialize, Serialize}; use tracing::*; use utils::{ bin_ser::LeSer, + id::{TenantId, TimelineId}, lsn::Lsn, pq_proto::SystemId, - zid::{ZTenantId, ZTimelineId}, }; /// Persistent consensus state of the acceptor. 
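The control_file_upgrade.rs hunks below only rename fields (tenant_id/timeline_id instead of ztli and friends); the upgrade path itself stays version-gated, with each historical on-disk layout deserialized by its own branch and mapped onto the current SafeKeeperState. A minimal sketch of that shape, using stand-in structs and a toy decoder instead of the real LeSer-based ones:

    // Version-gated upgrade, in the spirit of upgrade_control_file(): decode the
    // old layout for the stored version, then rebuild the current struct.
    #[derive(Debug)]
    struct OldStateV3 {
        timeline_id: [u8; 16],
    }

    #[derive(Debug)]
    struct CurrentState {
        timeline_id: [u8; 16],
    }

    fn decode_v3(buf: &[u8]) -> Result<OldStateV3, String> {
        let bytes: [u8; 16] = buf
            .get(..16)
            .and_then(|s| s.try_into().ok())
            .ok_or_else(|| "control file too short".to_string())?;
        Ok(OldStateV3 { timeline_id: bytes })
    }

    fn upgrade_control_file(buf: &[u8], version: u32) -> Result<CurrentState, String> {
        match version {
            3 => {
                let old = decode_v3(buf)?;
                Ok(CurrentState { timeline_id: old.timeline_id })
            }
            v => Err(format!("unsupported control file version {v}")),
        }
    }

    fn main() {
        let buf = [7u8; 32];
        println!("{:?}", upgrade_control_file(&buf, 3));
        println!("{:?}", upgrade_control_file(&buf, 99));
    }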
@@ -45,9 +45,8 @@ pub struct ServerInfoV2 { /// Postgres server version pub pg_version: u32, pub system_id: SystemId, - pub tenant_id: ZTenantId, - /// Zenith timelineid - pub ztli: ZTimelineId, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, pub wal_seg_size: u32, } @@ -76,10 +75,9 @@ pub struct ServerInfoV3 { pub pg_version: u32, pub system_id: SystemId, #[serde(with = "hex")] - pub tenant_id: ZTenantId, - /// Zenith timelineid + pub tenant_id: TenantId, #[serde(with = "hex")] - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, pub wal_seg_size: u32, } @@ -106,10 +104,9 @@ pub struct SafeKeeperStateV3 { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SafeKeeperStateV4 { #[serde(with = "hex")] - pub tenant_id: ZTenantId, - /// Zenith timelineid + pub tenant_id: TenantId, #[serde(with = "hex")] - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, /// persistent acceptor state pub acceptor_state: AcceptorState, /// information about server @@ -154,7 +151,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result }; return Ok(SafeKeeperState { tenant_id: oldstate.server.tenant_id, - timeline_id: oldstate.server.ztli, + timeline_id: oldstate.server.timeline_id, acceptor_state: ac, server: ServerInfo { pg_version: oldstate.server.pg_version, @@ -181,7 +178,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result }; return Ok(SafeKeeperState { tenant_id: oldstate.server.tenant_id, - timeline_id: oldstate.server.ztli, + timeline_id: oldstate.server.timeline_id, acceptor_state: oldstate.acceptor_state, server, proposer_uuid: oldstate.proposer_uuid, @@ -193,9 +190,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), }); - // migrate to moving ztenantid/ztli to the top and adding some lsns + // migrate to moving tenant_id/timeline_id to the top and adding some lsns } else if version == 3 { - info!("reading safekeeper control file version {}", version); + info!("reading safekeeper control file version {version}"); let oldstate = SafeKeeperStateV3::des(&buf[..buf.len()])?; let server = ServerInfo { pg_version: oldstate.server.pg_version, diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 3e301259ed..41b9ad66e1 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -14,10 +14,10 @@ use regex::Regex; use std::sync::Arc; use tracing::info; use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, postgres_backend::{self, PostgresBackend}, pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}, - zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; /// Safekeeper handler of postgres commands @@ -25,8 +25,8 @@ pub struct SafekeeperPostgresHandler { pub conf: SafeKeeperConf, /// assigned application name pub appname: Option, - pub ztenantid: Option, - pub ztimelineid: Option, + pub tenant_id: Option, + pub timeline_id: Option, pub timeline: Option>, } @@ -63,17 +63,17 @@ fn parse_cmd(cmd: &str) -> Result { } impl postgres_backend::Handler for SafekeeperPostgresHandler { - // ztenant id and ztimeline id are passed in connection string params + // tenant_id and timeline_id are passed in connection string params fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> { if let FeStartupPacket::StartupMessage { params, .. 
} = sm { if let Some(options) = params.options_raw() { for opt in options { match opt.split_once('=') { - Some(("ztenantid", value)) => { - self.ztenantid = Some(value.parse()?); + Some(("tenant_id", value)) => { + self.tenant_id = Some(value.parse()?); } - Some(("ztimelineid", value)) => { - self.ztimelineid = Some(value.parse()?); + Some(("timeline_id", value)) => { + self.timeline_id = Some(value.parse()?); } _ => continue, } @@ -95,18 +95,18 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { info!( "got query {:?} in timeline {:?}", - query_string, self.ztimelineid + query_string, self.timeline_id ); let create = !(matches!(cmd, SafekeeperPostgresCommand::StartReplication { .. }) || matches!(cmd, SafekeeperPostgresCommand::IdentifySystem)); - let tenantid = self.ztenantid.context("tenantid is required")?; - let timelineid = self.ztimelineid.context("timelineid is required")?; + let tenant_id = self.tenant_id.context("tenant_id is required")?; + let timeline_id = self.timeline_id.context("timeline_id is required")?; if self.timeline.is_none() { self.timeline.set( &self.conf, - ZTenantTimelineId::new(tenantid, timelineid), + TenantTimelineId::new(tenant_id, timeline_id), create, )?; } @@ -121,7 +121,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), } - .context(format!("timeline {timelineid}"))?; + .context(format!("timeline {timeline_id}"))?; Ok(()) } @@ -132,8 +132,8 @@ impl SafekeeperPostgresHandler { SafekeeperPostgresHandler { conf, appname: None, - ztenantid: None, - ztimelineid: None, + tenant_id: None, + timeline_id: None, timeline: None, } } diff --git a/safekeeper/src/http/models.rs b/safekeeper/src/http/models.rs index 4b3ae7798e..e13ea50eaf 100644 --- a/safekeeper/src/http/models.rs +++ b/safekeeper/src/http/models.rs @@ -1,8 +1,8 @@ use serde::{Deserialize, Serialize}; -use utils::zid::{NodeId, ZTimelineId}; +use utils::id::{NodeId, TimelineId}; #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, pub peer_ids: Vec, } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 13356c5921..14c9414c09 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -21,8 +21,8 @@ use utils::{ request::{ensure_no_body, parse_request_param}, RequestExt, RouterBuilder, }, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, - zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use super::models::TimelineCreateRequest; @@ -68,9 +68,9 @@ struct AcceptorStateStatus { #[derive(Debug, Serialize)] struct TimelineStatus { #[serde(serialize_with = "display_serialize")] - tenant_id: ZTenantId, + tenant_id: TenantId, #[serde(serialize_with = "display_serialize")] - timeline_id: ZTimelineId, + timeline_id: TimelineId, acceptor_state: AcceptorStateStatus, #[serde(serialize_with = "display_serialize")] flush_lsn: Lsn, @@ -90,7 +90,7 @@ struct TimelineStatus { /// Report info about timeline. 
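With these handler changes, the wal proposer and the safekeeper agree on plain tenant_id/timeline_id option names in the replication connection string (options='-c timeline_id=... tenant_id=...'). A condensed sketch of the option parsing done in startup() above, with string values standing in for the real parsed id types:

    // Pick tenant_id/timeline_id out of the startup options, splitting each
    // option on '=' and ignoring anything unknown, as the handler does.
    #[derive(Default, Debug)]
    struct ConnIds {
        tenant_id: Option<String>,
        timeline_id: Option<String>,
    }

    fn parse_options(options: &[&str]) -> ConnIds {
        let mut ids = ConnIds::default();
        for opt in options {
            match opt.split_once('=') {
                Some(("tenant_id", value)) => ids.tenant_id = Some(value.to_string()),
                Some(("timeline_id", value)) => ids.timeline_id = Some(value.to_string()),
                _ => continue,
            }
        }
        ids
    }

    fn main() {
        let opts = [
            "timeline_id=11223344556677889900aabbccddeeff",
            "tenant_id=ffeeddccbbaa00998877665544332211",
        ];
        println!("{:?}", parse_options(&opts));
    }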
async fn timeline_status_handler(request: Request) -> Result, ApiError> { - let zttid = ZTenantTimelineId::new( + let zttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); @@ -125,7 +125,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result, ApiError> { let request_data: TimelineCreateRequest = json_request(&mut request).await?; - let zttid = ZTenantTimelineId { + let zttid = TenantTimelineId { tenant_id: parse_request_param(&request, "tenant_id")?, timeline_id: request_data.timeline_id, }; @@ -146,7 +146,7 @@ async fn timeline_create_handler(mut request: Request) -> Result, ) -> Result, ApiError> { - let zttid = ZTenantTimelineId::new( + let zttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); @@ -181,7 +181,7 @@ async fn tenant_delete_force_handler( /// Used only in tests to hand craft required data. async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { - let zttid = ZTenantTimelineId::new( + let zttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 16c1d36131..00fc43521b 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -97,8 +97,8 @@ fn prepare_safekeeper(spg: &mut SafekeeperPostgresHandler) -> Result<()> { pg_version: 0, // unknown proposer_id: [0u8; 16], system_id: 0, - ztli: spg.ztimelineid.unwrap(), - tenant_id: spg.ztenantid.unwrap(), + timeline_id: spg.timeline_id.unwrap(), + tenant_id: spg.tenant_id.unwrap(), tli: 0, wal_seg_size: WAL_SEGMENT_SIZE as u32, // 16MB, default for tests }); diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 0335d61d3f..b466d5aab5 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -5,7 +5,7 @@ use std::path::PathBuf; use std::time::Duration; use url::Url; -use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId}; +use utils::id::{NodeId, TenantId, TenantTimelineId}; pub mod broker; pub mod control_file; @@ -61,11 +61,11 @@ pub struct SafeKeeperConf { } impl SafeKeeperConf { - pub fn tenant_dir(&self, tenant_id: &ZTenantId) -> PathBuf { + pub fn tenant_dir(&self, tenant_id: &TenantId) -> PathBuf { self.workdir.join(tenant_id.to_string()) } - pub fn timeline_dir(&self, zttid: &ZTenantTimelineId) -> PathBuf { + pub fn timeline_dir(&self, zttid: &TenantTimelineId) -> PathBuf { self.tenant_dir(&zttid.tenant_id) .join(zttid.timeline_id.to_string()) } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index c693035dd3..3fa3916266 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -8,7 +8,7 @@ use metrics::{ Gauge, IntGaugeVec, }; use postgres_ffi::XLogSegNo; -use utils::{lsn::Lsn, zid::ZTenantTimelineId}; +use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ safekeeper::{SafeKeeperState, SafekeeperMemState}, @@ -16,7 +16,7 @@ use crate::{ }; pub struct FullTimelineInfo { - pub zttid: ZTenantTimelineId, + pub zttid: TenantTimelineId, pub replicas: Vec, pub wal_backup_active: bool, pub timeline_is_active: bool, diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index af4cfb6ba4..b0b6a73621 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -53,7 +53,7 @@ impl<'pg> ReceiveWalConn<'pg> { /// Receive WAL from wal_proposer pub fn run(&mut self, spg: &mut 
SafekeeperPostgresHandler) -> Result<()> { - let _enter = info_span!("WAL acceptor", timeline = %spg.ztimelineid.unwrap()).entered(); + let _enter = info_span!("WAL acceptor", timeline = %spg.timeline_id.unwrap()).entered(); // Notify the libpq client that it's allowed to send `CopyData` messages self.pg_backend diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index a2bdcb55e7..fa045eed90 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -19,9 +19,9 @@ use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; use utils::{ bin_ser::LeSer, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, pq_proto::{ReplicationFeedback, SystemId}, - zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; pub const SK_MAGIC: u32 = 0xcafeceefu32; @@ -166,10 +166,9 @@ pub struct Peers(pub Vec<(NodeId, PeerInfo)>); #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SafeKeeperState { #[serde(with = "hex")] - pub tenant_id: ZTenantId, - /// Zenith timelineid + pub tenant_id: TenantId, #[serde(with = "hex")] - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, /// persistent acceptor state pub acceptor_state: AcceptorState, /// information about server @@ -219,7 +218,7 @@ pub struct SafekeeperMemState { } impl SafeKeeperState { - pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { + pub fn new(zttid: &TenantTimelineId, peers: Vec) -> SafeKeeperState { SafeKeeperState { tenant_id: zttid.tenant_id, timeline_id: zttid.timeline_id, @@ -245,7 +244,7 @@ impl SafeKeeperState { #[cfg(test)] pub fn empty() -> Self { - SafeKeeperState::new(&ZTenantTimelineId::empty(), vec![]) + SafeKeeperState::new(&TenantTimelineId::empty(), vec![]) } } @@ -260,9 +259,8 @@ pub struct ProposerGreeting { pub pg_version: u32, pub proposer_id: PgUuid, pub system_id: SystemId, - /// Zenith timelineid - pub ztli: ZTimelineId, - pub tenant_id: ZTenantId, + pub timeline_id: TimelineId, + pub tenant_id: TenantId, pub tli: TimeLineID, pub wal_seg_size: u32, } @@ -507,13 +505,13 @@ where { // constructor pub fn new( - ztli: ZTimelineId, + timeline_id: TimelineId, state: CTRL, mut wal_store: WAL, node_id: NodeId, ) -> Result> { - if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id { - bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id); + if state.timeline_id != TimelineId::from([0u8; 16]) && timeline_id != state.timeline_id { + bail!("Calling SafeKeeper::new with inconsistent timeline_id ({}) and SafeKeeperState.server.timeline_id ({})", timeline_id, state.timeline_id); } // initialize wal_store, if state is already initialized @@ -600,10 +598,10 @@ where self.state.tenant_id ); } - if msg.ztli != self.state.timeline_id { + if msg.timeline_id != self.state.timeline_id { bail!( "invalid timeline ID, got {}, expected {}", - msg.ztli, + msg.timeline_id, self.state.timeline_id ); } @@ -982,9 +980,9 @@ mod tests { persisted_state: SafeKeeperState::empty(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let ztli = ZTimelineId::from([0u8; 16]); + let timeline_id = TimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(timeline_id, storage, wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -1000,7 +998,7 @@ mod tests { persisted_state: state, }; - sk = 
SafeKeeper::new(ztli, storage, sk.wal_store, NodeId(0)).unwrap(); + sk = SafeKeeper::new(timeline_id, storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -1016,9 +1014,9 @@ mod tests { persisted_state: SafeKeeperState::empty(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let ztli = ZTimelineId::from([0u8; 16]); + let timeline_id = TimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(timeline_id, storage, wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 293cf67c57..375b6eea18 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -30,7 +30,7 @@ use utils::{ // See: https://www.postgresql.org/docs/13/protocol-replication.html const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r'; -// zenith extension of replication protocol +// neon extension of replication protocol const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; type FullTransactionId = u64; @@ -105,7 +105,7 @@ impl ReplicationConn { match &msg { FeMessage::CopyData(m) => { // There's three possible data messages that the client is supposed to send here: - // `HotStandbyFeedback` and `StandbyStatusUpdate` and `ZenithStandbyFeedback`. + // `HotStandbyFeedback` and `StandbyStatusUpdate` and `NeonStandbyFeedback`. match m.first().cloned() { Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { @@ -165,12 +165,12 @@ impl ReplicationConn { pgb: &mut PostgresBackend, mut start_pos: Lsn, ) -> Result<()> { - let _enter = info_span!("WAL sender", timeline = %spg.ztimelineid.unwrap()).entered(); + let _enter = info_span!("WAL sender", timeline = %spg.timeline_id.unwrap()).entered(); // spawn the background thread which receives HotStandbyFeedback messages. let bg_timeline = Arc::clone(spg.timeline.get()); let bg_stream_in = self.stream_in.take().unwrap(); - let bg_timeline_id = spg.ztimelineid.unwrap(); + let bg_timeline_id = spg.timeline_id.unwrap(); let state = ReplicaState::new(); // This replica_id is used below to check if it's time to stop replication. diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 8d101e6ff6..cf317c41c3 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -21,9 +21,9 @@ use tokio::sync::mpsc::Sender; use tracing::*; use utils::{ + id::{NodeId, TenantId, TenantTimelineId}, lsn::Lsn, pq_proto::ReplicationFeedback, - zid::{NodeId, ZTenantId, ZTenantTimelineId}, }; use crate::control_file; @@ -98,7 +98,7 @@ impl SharedState { /// Initialize timeline state, creating control file fn create( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, peer_ids: Vec, ) -> Result { let state = SafeKeeperState::new(zttid, peer_ids); @@ -119,7 +119,7 @@ impl SharedState { /// Restore SharedState from control file. /// If file doesn't exist, bails out. - fn restore(conf: &SafeKeeperConf, zttid: &ZTenantTimelineId) -> Result { + fn restore(conf: &SafeKeeperConf, zttid: &TenantTimelineId) -> Result { let control_store = control_file::FileStorage::restore_new(zttid, conf)?; let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); @@ -143,7 +143,7 @@ impl SharedState { /// Mark timeline active/inactive and return whether s3 offloading requires /// start/stop action. 
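For reference, the three reply types routed in send_wal.rs keep their one-byte tags; only the comment wording changes from zenith to neon. A small sketch of the dispatch on the first byte of a CopyData payload, with the handlers reduced to labels:

    // Tag bytes as defined in send_wal.rs; the real code parses the rest of the
    // message after matching on the first byte.
    const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h';
    const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r';
    const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z';

    fn dispatch_feedback(msg: &[u8]) -> &'static str {
        match msg.first().copied() {
            Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => "HotStandbyFeedback",
            Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => "StandbyStatusUpdate",
            Some(NEON_STATUS_UPDATE_TAG_BYTE) => "NeonStandbyFeedback",
            _ => "unexpected message",
        }
    }

    fn main() {
        assert_eq!(dispatch_feedback(b"z..."), "NeonStandbyFeedback");
        println!("{}", dispatch_feedback(b"r..."));
    }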
- fn update_status(&mut self, ttid: ZTenantTimelineId) -> bool { + fn update_status(&mut self, ttid: TenantTimelineId) -> bool { let is_active = self.is_active(); if self.active != is_active { info!("timeline {} active={} now", ttid, is_active); @@ -213,7 +213,7 @@ impl SharedState { // // To choose what feedback to use and resend to compute node, // we need to know which pageserver compute node considers to be main. - // See https://github.com/zenithdb/zenith/issues/1171 + // See https://github.com/neondatabase/neon/issues/1171 // if let Some(pageserver_feedback) = state.pageserver_feedback { if let Some(acc_feedback) = acc.pageserver_feedback { @@ -227,7 +227,7 @@ impl SharedState { // last lsn received by pageserver // FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver. - // See https://github.com/zenithdb/zenith/issues/1171 + // See https://github.com/neondatabase/neon/issues/1171 acc.last_received_lsn = Lsn::from(pageserver_feedback.ps_writelsn); // When at least one pageserver has preserved data up to remote_consistent_lsn, @@ -256,11 +256,11 @@ impl SharedState { /// Database instance (tenant) pub struct Timeline { - pub zttid: ZTenantTimelineId, + pub zttid: TenantTimelineId, /// Sending here asks for wal backup launcher attention (start/stop /// offloading). Sending zttid instead of concrete command allows to do /// sending without timeline lock. - wal_backup_launcher_tx: Sender, + wal_backup_launcher_tx: Sender, commit_lsn_watch_tx: watch::Sender, /// For breeding receivers. commit_lsn_watch_rx: watch::Receiver, @@ -269,8 +269,8 @@ pub struct Timeline { impl Timeline { fn new( - zttid: ZTenantTimelineId, - wal_backup_launcher_tx: Sender, + zttid: TenantTimelineId, + wal_backup_launcher_tx: Sender, shared_state: SharedState, ) -> Timeline { let (commit_lsn_watch_tx, commit_lsn_watch_rx) = @@ -539,13 +539,13 @@ impl Timeline { // Utilities needed by various Connection-like objects pub trait TimelineTools { - fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()>; + fn set(&mut self, conf: &SafeKeeperConf, zttid: TenantTimelineId, create: bool) -> Result<()>; fn get(&self) -> &Arc; } impl TimelineTools for Option> { - fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()> { + fn set(&mut self, conf: &SafeKeeperConf, zttid: TenantTimelineId, create: bool) -> Result<()> { *self = Some(GlobalTimelines::get(conf, zttid, create)?); Ok(()) } @@ -556,8 +556,8 @@ impl TimelineTools for Option> { } struct GlobalTimelinesState { - timelines: HashMap>, - wal_backup_launcher_tx: Option>, + timelines: HashMap>, + wal_backup_launcher_tx: Option>, } static TIMELINES_STATE: Lazy> = Lazy::new(|| { @@ -577,7 +577,7 @@ pub struct TimelineDeleteForceResult { pub struct GlobalTimelines; impl GlobalTimelines { - pub fn init(wal_backup_launcher_tx: Sender) { + pub fn init(wal_backup_launcher_tx: Sender) { let mut state = TIMELINES_STATE.lock().unwrap(); assert!(state.wal_backup_launcher_tx.is_none()); state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); @@ -586,7 +586,7 @@ impl GlobalTimelines { fn create_internal( mut state: MutexGuard, conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, peer_ids: Vec, ) -> Result> { match state.timelines.get(&zttid) { @@ -612,7 +612,7 @@ impl GlobalTimelines { pub fn create( conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, peer_ids: Vec, ) -> Result> { let state = 
TIMELINES_STATE.lock().unwrap(); @@ -623,7 +623,7 @@ impl GlobalTimelines { /// If control file doesn't exist and create=false, bails out. pub fn get( conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, create: bool, ) -> Result> { let _enter = info_span!("", timeline = %zttid.timeline_id).entered(); @@ -664,13 +664,12 @@ impl GlobalTimelines { } /// Get loaded timeline, if it exists. - pub fn get_loaded(zttid: ZTenantTimelineId) -> Option> { + pub fn get_loaded(zttid: TenantTimelineId) -> Option> { let state = TIMELINES_STATE.lock().unwrap(); state.timelines.get(&zttid).map(Arc::clone) } - /// Get ZTenantTimelineIDs of all active timelines. - pub fn get_active_timelines() -> HashSet { + pub fn get_active_timelines() -> HashSet { let state = TIMELINES_STATE.lock().unwrap(); state .timelines @@ -692,7 +691,7 @@ impl GlobalTimelines { fn delete_force_internal( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, was_active: bool, ) -> Result { match std::fs::remove_dir_all(conf.timeline_dir(zttid)) { @@ -721,7 +720,7 @@ impl GlobalTimelines { /// TODO: ensure all of the above never happens. pub async fn delete_force( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, ) -> Result { info!("deleting timeline {}", zttid); let timeline = TIMELINES_STATE.lock().unwrap().timelines.remove(zttid); @@ -737,8 +736,8 @@ impl GlobalTimelines { /// There may be a race if new timelines are created simultaneously. pub async fn delete_force_all_for_tenant( conf: &SafeKeeperConf, - tenant_id: &ZTenantId, - ) -> Result> { + tenant_id: &TenantId, + ) -> Result> { info!("deleting all timelines for tenant {}", tenant_id); let mut to_delete = HashMap::new(); { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 5d946e37a4..85e967e218 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -23,7 +23,7 @@ use tokio::sync::watch; use tokio::time::sleep; use tracing::*; -use utils::{lsn::Lsn, zid::ZTenantTimelineId}; +use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::broker::{Election, ElectionLeader}; use crate::timeline::{GlobalTimelines, Timeline}; @@ -38,7 +38,7 @@ const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; pub fn wal_backup_launcher_thread_main( conf: SafeKeeperConf, - wal_backup_launcher_rx: Receiver, + wal_backup_launcher_rx: Receiver, ) { let rt = Builder::new_multi_thread() .worker_threads(conf.backup_runtime_threads) @@ -53,7 +53,7 @@ pub fn wal_backup_launcher_thread_main( /// Check whether wal backup is required for timeline. If yes, mark that launcher is /// aware of current status and return the timeline. -fn is_wal_backup_required(zttid: ZTenantTimelineId) -> Option> { +fn is_wal_backup_required(zttid: TenantTimelineId) -> Option> { GlobalTimelines::get_loaded(zttid).filter(|t| t.wal_backup_attend()) } @@ -70,7 +70,7 @@ struct WalBackupTimelineEntry { /// Start per timeline task, if it makes sense for this safekeeper to offload. fn consider_start_task( conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, task: &mut WalBackupTimelineEntry, ) { if !task.timeline.can_wal_backup() { @@ -117,7 +117,7 @@ const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; /// panics and separate elections from offloading itself. 
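The GlobalTimelines functions above all go through one process-wide registry; this patch only changes its key type name to TenantTimelineId. A stripped-down sketch of that registry shape, using stand-ins for Timeline and the id type (the real map lives in TIMELINES_STATE behind once_cell::sync::Lazy, and once_cell is already in the safekeeper's dependencies):

    use once_cell::sync::Lazy;
    use std::collections::HashMap;
    use std::sync::{Arc, Mutex};

    type TenantTimelineId = (u128, u128); // stand-in for utils::id::TenantTimelineId

    #[derive(Debug)]
    struct Timeline; // stand-in

    static TIMELINES: Lazy<Mutex<HashMap<TenantTimelineId, Arc<Timeline>>>> =
        Lazy::new(|| Mutex::new(HashMap::new()));

    fn get_loaded(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
        // Cloning the Arc lets callers use the timeline without holding the lock.
        TIMELINES.lock().unwrap().get(&ttid).map(Arc::clone)
    }

    fn main() {
        let ttid = (1, 2);
        TIMELINES.lock().unwrap().insert(ttid, Arc::new(Timeline));
        println!("loaded: {:?}", get_loaded(ttid));
    }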
async fn wal_backup_launcher_main_loop( conf: SafeKeeperConf, - mut wal_backup_launcher_rx: Receiver, + mut wal_backup_launcher_rx: Receiver, ) { info!( "WAL backup launcher started, remote config {:?}", @@ -135,7 +135,7 @@ async fn wal_backup_launcher_main_loop( // Presense in this map means launcher is aware s3 offloading is needed for // the timeline, but task is started only if it makes sense for to offload // from this safekeeper. - let mut tasks: HashMap = HashMap::new(); + let mut tasks: HashMap = HashMap::new(); let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); loop { @@ -193,7 +193,7 @@ struct WalBackupTask { /// Offload single timeline. async fn backup_task_main( - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, timeline_dir: PathBuf, mut shutdown_rx: Receiver<()>, election: Election, diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 644237a00d..58b69f06e7 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -25,7 +25,7 @@ use std::path::{Path, PathBuf}; use tracing::*; -use utils::{lsn::Lsn, zid::ZTenantTimelineId}; +use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::safekeeper::SafeKeeperState; @@ -86,7 +86,7 @@ struct WalStorageMetrics { } impl WalStorageMetrics { - fn new(zttid: &ZTenantTimelineId) -> Self { + fn new(zttid: &TenantTimelineId) -> Self { let tenant_id = zttid.tenant_id.to_string(); let timeline_id = zttid.timeline_id.to_string(); Self { @@ -130,7 +130,7 @@ pub trait Storage { /// When storage is just created, all LSNs are zeroes and there are no segments on disk. pub struct PhysicalStorage { metrics: WalStorageMetrics, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, timeline_dir: PathBuf, conf: SafeKeeperConf, @@ -161,7 +161,7 @@ pub struct PhysicalStorage { } impl PhysicalStorage { - pub fn new(zttid: &ZTenantTimelineId, conf: &SafeKeeperConf) -> PhysicalStorage { + pub fn new(zttid: &TenantTimelineId, conf: &SafeKeeperConf) -> PhysicalStorage { let timeline_dir = conf.timeline_dir(zttid); PhysicalStorage { metrics: WalStorageMetrics::new(zttid), diff --git a/scripts/generate_and_push_perf_report.sh b/scripts/generate_and_push_perf_report.sh index df84fa0dd8..9e03302b0f 100755 --- a/scripts/generate_and_push_perf_report.sh +++ b/scripts/generate_and_push_perf_report.sh @@ -5,8 +5,8 @@ set -eux -o pipefail SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -echo "Uploading perf report to zenith pg" -# ingest per test results data into zenith backed postgres running in staging to build grafana reports on that data +echo "Uploading perf report to neon pg" +# ingest per test results data into neon backed postgres running in staging to build grafana reports on that data DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_perf_test_result.py --ingest "$REPORT_FROM" # Activate poetry's venv. 
Needed because git upload does not run in a project dir (it uses tmp to store the repository) @@ -16,8 +16,8 @@ DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_ echo "Uploading perf result to zenith-perf-data" scripts/git-upload \ - --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/zenithdb/zenith-perf-data.git \ - --message="add performance test result for $GITHUB_SHA zenith revision" \ + --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/neondatabase/zenith-perf-data.git \ + --message="add performance test result for $GITHUB_SHA neon revision" \ --branch=master \ copy "$REPORT_FROM" "data/$REPORT_TO" `# COPY FROM TO_RELATIVE`\ --merge \ diff --git a/scripts/perf_report_template.html b/scripts/perf_report_template.html index 2847e75a00..c86ab37c2d 100644 --- a/scripts/perf_report_template.html +++ b/scripts/perf_report_template.html @@ -19,7 +19,7 @@ } -

Zenith Performance Tests
+ Neon Performance Tests
{% for suit_name, suit_data in context.items() %}
Runs for {{ suit_name }}

@@ -38,7 +38,7 @@ {% for row in suit_data.rows %} -
{{ row.revision[:6] }} + {{ row.revision[:6] }} {% for column_value in row.values %} {{ column_value.value }}{{column_value.ratio}} {% endfor %} diff --git a/test_runner/README.md b/test_runner/README.md index c7ec361d65..44751944b3 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -60,7 +60,7 @@ Useful environment variables: `TEST_OUTPUT`: Set the directory where test state and test output files should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. -`ZENITH_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as +`NEON_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as `--pageserver-config-override=${value}` parameter values when neon_local cli is invoked `RUST_LOG`: logging configuration to pass into Neon CLI diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index b9cdfdebc4..b5565dab0f 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -16,7 +16,7 @@ from typing import Iterator, Optional import pytest from _pytest.config import Config from _pytest.terminal import TerminalReporter -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId """ This file contains fixtures for micro-benchmarks. @@ -365,11 +365,11 @@ class NeonBenchmarker: assert matches, f"metric {metric_name} not found" return int(round(float(matches.group(1)))) - def get_timeline_size(self, repo_dir: Path, tenantid: ZTenantId, timelineid: ZTimelineId): + def get_timeline_size(self, repo_dir: Path, tenant_id: TenantId, timeline_id: TimelineId): """ Calculate the on-disk size of a timeline """ - path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid) + path = f"{repo_dir}/tenants/{tenant_id}/timelines/{timeline_id}" totalbytes = 0 for root, dirs, files in os.walk(path): diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 69c6d31315..0c03429f95 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -29,7 +29,7 @@ import pytest import requests from cached_property import cached_property from fixtures.log_helper import log -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -754,7 +754,7 @@ class NeonEnv: # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. 
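get_timeline_size above only rewrites the path with an f-string; the measurement itself is a recursive walk of repo_dir/tenants/<tenant_id>/timelines/<timeline_id> summing file sizes. The same idea expressed in Rust, to stay in one language with the other sketches; the directory and ids below are placeholders:

    use std::fs;
    use std::io;
    use std::path::{Path, PathBuf};

    fn timeline_dir(repo_dir: &Path, tenant_id: &str, timeline_id: &str) -> PathBuf {
        repo_dir.join("tenants").join(tenant_id).join("timelines").join(timeline_id)
    }

    // Recursively sum the sizes of all files under `path`.
    fn dir_size(path: &Path) -> io::Result<u64> {
        let mut total = 0;
        for entry in fs::read_dir(path)? {
            let entry = entry?;
            let meta = entry.metadata()?;
            if meta.is_dir() {
                total += dir_size(&entry.path())?;
            } else {
                total += meta.len();
            }
        }
        Ok(total)
    }

    fn main() {
        let dir = timeline_dir(Path::new("/tmp/neon-repo"), "deadbeef", "cafebabe");
        println!("{} -> {} bytes", dir.display(), dir_size(&dir).unwrap_or(0));
    }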
- self.initial_tenant = ZTenantId.generate() + self.initial_tenant = TenantId.generate() # Create a config file corresponding to the options toml = textwrap.dedent( @@ -776,7 +776,7 @@ class NeonEnv: pg=self.port_distributor.get_port(), http=self.port_distributor.get_port(), ) - pageserver_auth_type = "ZenithJWT" if config.auth_enabled else "Trust" + pageserver_auth_type = "NeonJWT" if config.auth_enabled else "Trust" toml += textwrap.dedent( f""" @@ -841,7 +841,7 @@ class NeonEnv: """Get list of safekeeper endpoints suitable for safekeepers GUC""" return ",".join([f"localhost:{wa.port.pg}" for wa in self.safekeepers]) - def timeline_dir(self, tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Path: + def timeline_dir(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) @@ -971,7 +971,7 @@ class NeonPageserverHttpClient(requests.Session): assert isinstance(res_json, list) return res_json - def tenant_create(self, new_tenant_id: Optional[ZTenantId] = None) -> ZTenantId: + def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId: res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ @@ -983,24 +983,24 @@ class NeonPageserverHttpClient(requests.Session): raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") new_tenant_id = res.json() assert isinstance(new_tenant_id, str) - return ZTenantId(new_tenant_id) + return TenantId(new_tenant_id) - def tenant_attach(self, tenant_id: ZTenantId): + def tenant_attach(self, tenant_id: TenantId): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach") self.verbose_error(res) - def tenant_detach(self, tenant_id: ZTenantId): + def tenant_detach(self, tenant_id: TenantId): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach") self.verbose_error(res) - def tenant_status(self, tenant_id: ZTenantId) -> Dict[Any, Any]: + def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) return res_json - def timeline_list(self, tenant_id: ZTenantId) -> List[Dict[str, Any]]: + def timeline_list(self, tenant_id: TenantId) -> List[Dict[str, Any]]: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline") self.verbose_error(res) res_json = res.json() @@ -1009,9 +1009,9 @@ class NeonPageserverHttpClient(requests.Session): def timeline_create( self, - tenant_id: ZTenantId, - new_timeline_id: Optional[ZTimelineId] = None, - ancestor_timeline_id: Optional[ZTimelineId] = None, + tenant_id: TenantId, + new_timeline_id: Optional[TimelineId] = None, + ancestor_timeline_id: Optional[TimelineId] = None, ancestor_start_lsn: Optional[Lsn] = None, ) -> Dict[Any, Any]: res = self.post( @@ -1032,8 +1032,8 @@ class NeonPageserverHttpClient(requests.Session): def timeline_detail( self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, include_non_incremental_physical_size: bool = False, ) -> Dict[Any, Any]: @@ -1052,7 +1052,7 @@ class NeonPageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def timeline_delete(self, tenant_id: ZTenantId, timeline_id: ZTimelineId): + def timeline_delete(self, 
tenant_id: TenantId, timeline_id: TimelineId): res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" ) @@ -1174,17 +1174,17 @@ class NeonCli(AbstractNeonCli): def create_tenant( self, - tenant_id: Optional[ZTenantId] = None, - timeline_id: Optional[ZTimelineId] = None, + tenant_id: Optional[TenantId] = None, + timeline_id: Optional[TimelineId] = None, conf: Optional[Dict[str, str]] = None, - ) -> Tuple[ZTenantId, ZTimelineId]: + ) -> Tuple[TenantId, TimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. """ if tenant_id is None: - tenant_id = ZTenantId.generate() + tenant_id = TenantId.generate() if timeline_id is None: - timeline_id = ZTimelineId.generate() + timeline_id = TimelineId.generate() if conf is None: res = self.raw_cli( [ @@ -1211,7 +1211,7 @@ class NeonCli(AbstractNeonCli): res.check_returncode() return tenant_id, timeline_id - def config_tenant(self, tenant_id: ZTenantId, conf: Dict[str, str]): + def config_tenant(self, tenant_id: TenantId, conf: Dict[str, str]): """ Update tenant config. """ @@ -1230,8 +1230,8 @@ class NeonCli(AbstractNeonCli): return res def create_timeline( - self, new_branch_name: str, tenant_id: Optional[ZTenantId] = None - ) -> ZTimelineId: + self, new_branch_name: str, tenant_id: Optional[TenantId] = None + ) -> TimelineId: cmd = [ "timeline", "create", @@ -1250,9 +1250,9 @@ class NeonCli(AbstractNeonCli): if matches is not None: created_timeline_id = matches.group("timeline_id") - return ZTimelineId(str(created_timeline_id)) + return TimelineId(str(created_timeline_id)) - def create_root_branch(self, branch_name: str, tenant_id: Optional[ZTenantId] = None): + def create_root_branch(self, branch_name: str, tenant_id: Optional[TenantId] = None): cmd = [ "timeline", "create", @@ -1274,15 +1274,15 @@ class NeonCli(AbstractNeonCli): if created_timeline_id is None: raise Exception("could not find timeline id after `neon timeline create` invocation") else: - return ZTimelineId(created_timeline_id) + return TimelineId(created_timeline_id) def create_branch( self, new_branch_name: str = DEFAULT_BRANCH_NAME, ancestor_branch_name: Optional[str] = None, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, ancestor_start_lsn: Optional[Lsn] = None, - ) -> ZTimelineId: + ) -> TimelineId: cmd = [ "timeline", "branch", @@ -1308,11 +1308,9 @@ class NeonCli(AbstractNeonCli): if created_timeline_id is None: raise Exception("could not find timeline id after `neon timeline create` invocation") else: - return ZTimelineId(str(created_timeline_id)) + return TimelineId(str(created_timeline_id)) - def list_timelines( - self, tenant_id: Optional[ZTenantId] = None - ) -> List[Tuple[str, ZTimelineId]]: + def list_timelines(self, tenant_id: Optional[TenantId] = None) -> List[Tuple[str, TimelineId]]: """ Returns a list of (branch_name, timeline_id) tuples out of parsed `neon timeline list` CLI output. 
""" @@ -1324,14 +1322,14 @@ class NeonCli(AbstractNeonCli): ) timelines_cli = sorted( map( - lambda branch_and_id: (branch_and_id[0], ZTimelineId(branch_and_id[1])), + lambda branch_and_id: (branch_and_id[0], TimelineId(branch_and_id[1])), TIMELINE_DATA_EXTRACTOR.findall(res.stdout), ) ) return timelines_cli def init( - self, config_toml: str, initial_timeline_id: Optional[ZTimelineId] = None + self, config_toml: str, initial_timeline_id: Optional[TimelineId] = None ) -> "subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: tmp.write(config_toml) @@ -1410,7 +1408,7 @@ class NeonCli(AbstractNeonCli): self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": @@ -1436,7 +1434,7 @@ class NeonCli(AbstractNeonCli): def pg_start( self, node_name: str, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": @@ -1460,7 +1458,7 @@ class NeonCli(AbstractNeonCli): def pg_stop( self, node_name: str, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, destroy=False, check_return_code=True, ) -> "subprocess.CompletedProcess[str]": @@ -1558,7 +1556,7 @@ def append_pageserver_param_overrides( f"--pageserver-config-override=remote_storage={remote_storage_toml_table}" ) - env_overrides = os.getenv("ZENITH_PAGESERVER_OVERRIDES") + env_overrides = os.getenv("NEON_PAGESERVER_OVERRIDES") if env_overrides is not None: params_to_update += [ f"--pageserver-config-override={o.strip()}" for o in env_overrides.split(";") @@ -1867,7 +1865,7 @@ class Postgres(PgProtocol): """An object representing a running postgres daemon.""" def __init__( - self, env: NeonEnv, tenant_id: ZTenantId, port: int, check_stop_result: bool = True + self, env: NeonEnv, tenant_id: TenantId, port: int, check_stop_result: bool = True ): super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres") self.env = env @@ -2057,7 +2055,7 @@ class PostgresFactory: self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> Postgres: @@ -2081,7 +2079,7 @@ class PostgresFactory: self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> Postgres: @@ -2157,7 +2155,7 @@ class Safekeeper: return self def append_logical_message( - self, tenant_id: ZTenantId, timeline_id: ZTimelineId, request: Dict[str, Any] + self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any] ) -> Dict[str, Any]: """ Send JSON_CTRL query to append LogicalMessage to WAL and modify @@ -2167,7 +2165,7 @@ class Safekeeper: # "replication=0" hacks psycopg not to send additional queries # on startup, see https://github.com/psycopg/psycopg2/pull/482 - connstr = f"host=localhost port={self.port.pg} replication=0 options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'" + connstr = f"host=localhost port={self.port.pg} replication=0 options='-c timeline_id={timeline_id} tenant_id={tenant_id}'" with closing(psycopg2.connect(connstr)) as conn: # server doesn't support transactions @@ -2202,8 +2200,8 
@@ class SafekeeperTimelineStatus: class SafekeeperMetrics: # These are metrics from Prometheus which uses float64 internally. # As a consequence, values may differ from real original int64s. - flush_lsn_inexact: Dict[Tuple[ZTenantId, ZTimelineId], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[ZTenantId, ZTimelineId], int] = field(default_factory=dict) + flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) class SafekeeperHttpClient(requests.Session): @@ -2221,7 +2219,7 @@ class SafekeeperHttpClient(requests.Session): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() def timeline_status( - self, tenant_id: ZTenantId, timeline_id: ZTimelineId + self, tenant_id: TenantId, timeline_id: TimelineId ) -> SafekeeperTimelineStatus: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() @@ -2234,16 +2232,14 @@ class SafekeeperHttpClient(requests.Session): remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), ) - def record_safekeeper_info(self, tenant_id: ZTenantId, timeline_id: ZTimelineId, body): + def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): res = self.post( f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", json=body, ) res.raise_for_status() - def timeline_delete_force( - self, tenant_id: ZTenantId, timeline_id: ZTimelineId - ) -> Dict[Any, Any]: + def timeline_delete_force(self, tenant_id: TenantId, timeline_id: TimelineId) -> Dict[Any, Any]: res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" ) @@ -2252,7 +2248,7 @@ class SafekeeperHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def tenant_delete_force(self, tenant_id: ZTenantId) -> Dict[Any, Any]: + def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") res.raise_for_status() res_json = res.json() @@ -2273,16 +2269,16 @@ class SafekeeperHttpClient(requests.Session): all_metrics_text, re.MULTILINE, ): - metrics.flush_lsn_inexact[ - (ZTenantId(match.group(1)), ZTimelineId(match.group(2))) - ] = int(match.group(3)) + metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( + match.group(3) + ) for match in re.finditer( r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', all_metrics_text, re.MULTILINE, ): metrics.commit_lsn_inexact[ - (ZTenantId(match.group(1)), ZTimelineId(match.group(2))) + (TenantId(match.group(1)), TimelineId(match.group(2))) ] = int(match.group(3)) return metrics @@ -2456,7 +2452,7 @@ def list_files_to_compare(pgdata_dir: Path): # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Postgres): # Get the timeline ID. 
We need it for the 'basebackup' command - timeline = ZTimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0]) + timeline = TimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0]) # stop postgres to ensure that files won't change pg.stop() @@ -2540,7 +2536,7 @@ def wait_until(number_of_iterations: int, interval: float, func): def assert_timeline_local( - pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId + pageserver_http_client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId ): timeline_detail = pageserver_http_client.timeline_detail( tenant, @@ -2554,14 +2550,14 @@ def assert_timeline_local( def assert_no_in_progress_downloads_for_tenant( pageserver_http_client: NeonPageserverHttpClient, - tenant: ZTenantId, + tenant: TenantId, ): tenant_status = pageserver_http_client.tenant_status(tenant) assert tenant_status["has_in_progress_downloads"] is False, tenant_status def remote_consistent_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId + pageserver_http_client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -2578,8 +2574,8 @@ def remote_consistent_lsn( def wait_for_upload( pageserver_http_client: NeonPageserverHttpClient, - tenant: ZTenantId, - timeline: ZTimelineId, + tenant: TenantId, + timeline: TimelineId, lsn: Lsn, ): """waits for local timeline upload up to specified lsn""" @@ -2601,7 +2597,7 @@ def wait_for_upload( def last_record_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId + pageserver_http_client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -2612,8 +2608,8 @@ def last_record_lsn( def wait_for_last_record_lsn( pageserver_http_client: NeonPageserverHttpClient, - tenant: ZTenantId, - timeline: ZTimelineId, + tenant: TenantId, + timeline: TimelineId, lsn: Lsn, ): """waits for pageserver to catch up to a certain lsn""" @@ -2632,7 +2628,7 @@ def wait_for_last_record_lsn( ) -def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: ZTenantId, timeline: ZTimelineId): +def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId): """Wait for pageserver to catch up the latest flush LSN""" last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) @@ -2643,8 +2639,8 @@ def fork_at_current_lsn( pg: Postgres, new_branch_name: str, ancestor_branch_name: str, - tenant_id: Optional[ZTenantId] = None, -) -> ZTimelineId: + tenant_id: Optional[TenantId] = None, +) -> TimelineId: """ Create new branch at the last LSN of an existing branch. The "last LSN" is taken from the given Postgres instance. The pageserver will wait for all the diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index bdf675a785..de2e131b79 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -46,11 +46,11 @@ class Lsn: @total_ordering -class ZId: +class Id: """ Datatype for a Neon tenant and timeline IDs. Internally it's a 16-byte array, and - the string representation is in hex. This corresponds to the ZId / ZTenantId / - ZTimelineIds in the Rust code. + the string representation is in hex. This corresponds to the Id / TenantId / + TimelineIds in the Rust code. 
""" def __init__(self, x: str): @@ -79,11 +79,11 @@ class ZId: return cls(random.randbytes(16).hex()) -class ZTenantId(ZId): +class TenantId(Id): def __repr__(self): - return f'ZTenantId("{self.id.hex()}")' + return f'`TenantId("{self.id.hex()}")' -class ZTimelineId(ZId): +class TimelineId(Id): def __repr__(self): - return f'ZTimelineId("{self.id.hex()}")' + return f'TimelineId("{self.id.hex()}")' diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 8bac8080db..21e48cf899 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -20,4 +20,4 @@ All tests run only once. Usually to obtain more consistent performance numbers, Local test results for main branch, and results of daily performance tests, are stored in a neon project deployed in production environment. There is a Grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commit, though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks. -There is also an inconsistency in test naming. Test name should be the same across platforms, and results can be differentiated by the platform field. But currently, platform is sometimes included in test name because of the way how parametrization works in pytest. I.e. there is a platform switch in the dashboard with zenith-local-ci and zenith-staging variants. I.e. some tests under zenith-local-ci value for a platform switch are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[zenith]` which is highly confusing. +There is also an inconsistency in test naming. Test name should be the same across platforms, and results can be differentiated by the platform field. But currently, platform is sometimes included in test name because of the way how parametrization works in pytest. I.e. there is a platform switch in the dashboard with neon-local-ci and neon-staging variants. I.e. some tests under neon-local-ci value for a platform switch are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]` which is highly confusing. diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index b8e81824b0..cb2621ff02 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -1,6 +1,6 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import ZTimelineId +from fixtures.types import TimelineId from fixtures.utils import query_scalar @@ -27,7 +27,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): pg_branch0 = env.postgres.create_start("main", tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() - branch0_timeline = ZTimelineId(query_scalar(branch0_cur, "SHOW neon.timeline_id")) + branch0_timeline = TimelineId(query_scalar(branch0_cur, "SHOW neon.timeline_id")) log.info(f"b0 timeline {branch0_timeline}") # Create table, and insert 100k rows. 
@@ -51,7 +51,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on 'branch1' branch") branch1_cur = pg_branch1.connect().cursor() - branch1_timeline = ZTimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id")) + branch1_timeline = TimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id")) log.info(f"b1 timeline {branch1_timeline}") branch1_lsn = query_scalar(branch1_cur, "SELECT pg_current_wal_insert_lsn()") @@ -74,7 +74,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() - branch2_timeline = ZTimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id")) + branch2_timeline = TimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id")) log.info(f"b2 timeline {branch2_timeline}") branch2_lsn = query_scalar(branch2_cur, "SELECT pg_current_wal_insert_lsn()") diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 08e38e1461..d9082efada 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -2,7 +2,7 @@ from contextlib import closing import pytest from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException -from fixtures.types import ZTenantId +from fixtures.types import TenantId def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): @@ -13,7 +13,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant) tenant_http_client = env.pageserver.http_client(tenant_token) - invalid_tenant_token = env.auth_keys.generate_tenant_token(ZTenantId.generate()) + invalid_tenant_token = env.auth_keys.generate_tenant_token(TenantId.generate()) invalid_tenant_http_client = env.pageserver.http_client(invalid_tenant_token) management_token = env.auth_keys.generate_management_token() diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 5bd6368bfc..cfb9649867 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -2,7 +2,7 @@ import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import Lsn, ZTimelineId +from fixtures.types import Lsn, TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -28,7 +28,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): main_cur = pgmain.connect().cursor() - timeline = ZTimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) + timeline = TimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) # Create table, and insert the first 100 rows main_cur.execute("CREATE TABLE foo (t text)") diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index ce3a74930e..fd81981b2b 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -5,7 +5,7 @@ from typing import List, Tuple import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId # Test restarting page server, while safekeeper and compute node keep @@ -15,7 +15,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - tenant_timelines: List[Tuple[ZTenantId, 
ZTimelineId, Postgres]] = [] + tenant_timelines: List[Tuple[TenantId, TimelineId, Postgres]] = [] for n in range(4): tenant_id, timeline_id = env.neon_cli.create_tenant() diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index af94865549..8de2687c9b 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -8,7 +8,7 @@ from fixtures.neon_fixtures import ( VanillaPostgres, pg_distrib_dir, ) -from fixtures.types import Lsn, ZTimelineId +from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar, subprocess_capture num_rows = 1000 @@ -27,7 +27,7 @@ def test_fullbackup( log.info("postgres is running on 'test_fullbackup' branch") with pgmain.cursor() as cur: - timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 67ce8871cd..88d4ad8a6e 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -3,7 +3,7 @@ import random from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres -from fixtures.types import ZTimelineId +from fixtures.types import TimelineId from fixtures.utils import query_scalar # Test configuration @@ -29,7 +29,7 @@ async def update_table(pg: Postgres): # Perform aggressive GC with 0 horizon -async def gc(env: NeonEnv, timeline: ZTimelineId): +async def gc(env: NeonEnv, timeline: TimelineId): psconn = await env.pageserver.connect_async() while updates_performed < updates_to_perform: @@ -37,7 +37,7 @@ async def gc(env: NeonEnv, timeline: ZTimelineId): # At the same time, run UPDATEs and GC -async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: ZTimelineId): +async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId): workers = [] for worker_id in range(num_connections): workers.append(asyncio.create_task(update_table(pg))) @@ -62,7 +62,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on test_gc_aggressive branch") with pg.cursor() as cur: - timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) # Create table, and insert the first 100 rows cur.execute("CREATE TABLE foo (id int, counter int, t text)") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index fc9f41bda0..60cc0551ab 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -17,7 +17,7 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import subprocess_capture @@ -69,8 +69,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] node_name = "import_from_vanilla" - tenant = ZTenantId.generate() - timeline = ZTimelineId.generate() + tenant = TenantId.generate() + timeline = TimelineId.generate() # Set up pageserver for import neon_env_builder.enable_local_fs_remote_storage() @@ -195,7 +195,7 @@ def _generate_data(num_rows: int, pg: Postgres) -> Lsn: def _import( - expected_num_rows: 
int, lsn: Lsn, env: NeonEnv, pg_bin: PgBin, timeline: ZTimelineId + expected_num_rows: int, lsn: Lsn, env: NeonEnv, pg_bin: PgBin, timeline: TimelineId ) -> str: """Test importing backup data to the pageserver. @@ -228,9 +228,9 @@ def _import( # start the pageserver again env.pageserver.start() - # Import using another tenantid, because we use the same pageserver. + # Import using another tenant_id, because we use the same pageserver. # TODO Create another pageserver to make test more realistic. - tenant = ZTenantId.generate() + tenant = TenantId.generate() # Import to pageserver node_name = "import_from_pageserver" diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index b2342e5ee8..a9dc63dd50 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -7,11 +7,11 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, NeonPageserverHttpClient, ) -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId def helper_compare_timeline_list( - pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv, initial_tenant: ZTenantId + pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv, initial_tenant: TenantId ): """ Compare timelines list returned by CLI and directly via API. @@ -20,7 +20,7 @@ def helper_compare_timeline_list( timelines_api = sorted( map( - lambda t: ZTimelineId(t["timeline_id"]), + lambda t: TimelineId(t["timeline_id"]), pageserver_http_client.timeline_list(initial_tenant), ) ) @@ -85,7 +85,7 @@ def test_cli_tenant_list(neon_simple_env: NeonEnv): helper_compare_tenant_list(pageserver_http_client, env) res = env.neon_cli.list_tenants() - tenants = sorted(map(lambda t: ZTenantId(t.split()[0]), res.stdout.splitlines())) + tenants = sorted(map(lambda t: TenantId(t.split()[0]), res.stdout.splitlines())) assert env.initial_tenant in tenants assert tenant1 in tenants diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 2b5e2edb5f..c99e13f45f 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,7 +1,7 @@ import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import ZTimelineId +from fixtures.types import TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -27,7 +27,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): cur = pg_conn.cursor() # Get the timeline ID of our branch. 
We need it for the 'do_gc' command - timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) psconn = env.pageserver.connect() pscur = psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index a7b7189824..def6bd5b33 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -11,7 +11,7 @@ from fixtures.neon_fixtures import ( pg_distrib_dir, wait_until, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId # test that we cannot override node id after init @@ -60,39 +60,39 @@ def test_pageserver_init_node_id(neon_simple_env: NeonEnv): assert "has node id already, it cannot be overridden" in bad_update.stderr -def check_client(client: NeonPageserverHttpClient, initial_tenant: ZTenantId): +def check_client(client: NeonPageserverHttpClient, initial_tenant: TenantId): client.check_status() # check initial tenant is there - assert initial_tenant in {ZTenantId(t["id"]) for t in client.tenant_list()} + assert initial_tenant in {TenantId(t["id"]) for t in client.tenant_list()} # create new tenant and check it is also there - tenant_id = ZTenantId.generate() + tenant_id = TenantId.generate() client.tenant_create(tenant_id) - assert tenant_id in {ZTenantId(t["id"]) for t in client.tenant_list()} + assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} timelines = client.timeline_list(tenant_id) assert len(timelines) == 0, "initial tenant should not have any timelines" # create timeline - timeline_id = ZTimelineId.generate() + timeline_id = TimelineId.generate() client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id) timelines = client.timeline_list(tenant_id) assert len(timelines) > 0 # check it is there - assert timeline_id in {ZTimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)} + assert timeline_id in {TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)} for timeline in timelines: - timeline_id = ZTimelineId(timeline["timeline_id"]) + timeline_id = TimelineId(timeline["timeline_id"]) timeline_details = client.timeline_detail( tenant_id=tenant_id, timeline_id=timeline_id, include_non_incremental_logical_size=True, ) - assert ZTenantId(timeline_details["tenant_id"]) == tenant_id - assert ZTimelineId(timeline_details["timeline_id"]) == timeline_id + assert TenantId(timeline_details["tenant_id"]) == tenant_id + assert TimelineId(timeline_details["timeline_id"]) == timeline_id assert timeline_details.get("local") is not None @@ -118,8 +118,8 @@ def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): def expect_updated_msg_lsn( client: NeonPageserverHttpClient, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, prev_msg_lsn: Optional[Lsn], ) -> Lsn: timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 329f4b7d24..786266b70e 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -3,7 +3,7 @@ from contextlib import closing import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import ZTimelineId +from fixtures.types import TimelineId from 
fixtures.utils import print_gc_result, query_scalar @@ -25,7 +25,7 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - timeline = ZTimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) + timeline = TimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) # Create table main_cur.execute("CREATE TABLE foo (t text)") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 04baef6ba0..cbe74cad5c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -1,5 +1,5 @@ # It's possible to run any regular test with the local fs remote storage via -# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... +# env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... import os import shutil @@ -17,7 +17,7 @@ from fixtures.neon_fixtures import ( wait_for_upload, wait_until, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar @@ -61,8 +61,8 @@ def test_remote_storage_backup_and_restore( client = env.pageserver.http_client() - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) checkpoint_numbers = range(1, 3) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 147e22b38f..e3c9a091f9 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -4,10 +4,10 @@ import psycopg2 import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId -def do_gc_target(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimelineId): +def do_gc_target(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: env.pageserver.safe_psql(f"do_gc {tenant_id} {timeline_id} 0") @@ -20,7 +20,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): pageserver_http = env.pageserver.http_client() # first check for non existing tenant - tenant_id = ZTenantId.generate() + tenant_id = TenantId.generate() with pytest.raises( expected_exception=NeonPageserverApiException, match=f"Tenant not found for id {tenant_id}", @@ -46,7 +46,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): with pytest.raises( expected_exception=psycopg2.DatabaseError, match="gc target timeline does not exist" ): - bogus_timeline_id = ZTimelineId.generate() + bogus_timeline_id = TimelineId.generate() env.pageserver.safe_psql(f"do_gc {tenant_id} {bogus_timeline_id} 0") # try to concurrently run gc and detach diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 56563ebe87..aa7d92f1fd 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -24,7 +24,7 @@ from fixtures.neon_fixtures import ( wait_for_upload, wait_until, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, 
TenantId, TimelineId from fixtures.utils import query_scalar, subprocess_capture @@ -113,15 +113,15 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve def populate_branch( pg: Postgres, - tenant_id: ZTenantId, + tenant_id: TenantId, ps_http: NeonPageserverHttpClient, create_table: bool, expected_sum: Optional[int], -) -> Tuple[ZTimelineId, Lsn]: +) -> Tuple[TimelineId, Lsn]: # insert some data with pg_cur(pg) as cur: cur.execute("SHOW neon.timeline_id") - timeline_id = ZTimelineId(cur.fetchone()[0]) + timeline_id = TimelineId(cur.fetchone()[0]) log.info("timeline to relocate %s", timeline_id) log.info( @@ -149,8 +149,8 @@ def populate_branch( def ensure_checkpoint( pageserver_cur, pageserver_http: NeonPageserverHttpClient, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, current_lsn: Lsn, ): # run checkpoint manually to be sure that data landed in remote storage @@ -162,8 +162,8 @@ def ensure_checkpoint( def check_timeline_attached( new_pageserver_http_client: NeonPageserverHttpClient, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, old_timeline_detail: Dict[str, Any], old_current_lsn: Lsn, ): @@ -187,8 +187,8 @@ def switch_pg_to_new_pageserver( env: NeonEnv, pg: Postgres, new_pageserver_port: int, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, ) -> pathlib.Path: pg.stop() @@ -265,7 +265,7 @@ def test_tenant_relocation( pageserver_http = env.pageserver.http_client() tenant_id, initial_timeline_id = env.neon_cli.create_tenant( - ZTenantId("74ee8b079a0e437eb0afea7d26a07209") + TenantId("74ee8b079a0e437eb0afea7d26a07209") ) log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 1214d703d0..97a13bbcb0 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,6 +1,6 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_until -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId def get_only_element(l): # noqa: E741 @@ -23,7 +23,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): def get_state(tenant): all_states = client.tenant_list() - matching = [t for t in all_states if ZTenantId(t["id"]) == tenant] + matching = [t for t in all_states if TenantId(t["id"]) == tenant] return get_only_element(matching)["state"] def get_metric_value(name): @@ -35,8 +35,8 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): value = line.lstrip(name).strip() return int(value) - def delete_all_timelines(tenant: ZTenantId): - timelines = [ZTimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)] + def delete_all_timelines(tenant: TenantId): + timelines = [TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)] for t in timelines: client.timeline_delete(tenant, t) @@ -56,7 +56,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): # Delete all timelines on all tenants for tenant_info in client.tenant_list(): - tenant_id = ZTenantId(tenant_info["id"]) + tenant_id = TenantId(tenant_info["id"]) delete_all_timelines(tenant_id) wait_until(10, 0.2, lambda: assert_active_without_jobs(tenant_id)) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index bd53aae25c..4e7610a96f 100644 
--- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -8,7 +8,7 @@ import pytest from fixtures.log_helper import log from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder -from fixtures.types import Lsn, ZTenantId +from fixtures.types import Lsn, TenantId from prometheus_client.samples import Sample @@ -188,7 +188,7 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde cur.execute("SELECT sum(key) FROM t") assert cur.fetchone() == (5000050000,) - def get_ps_metric_samples_for_tenant(tenant_id: ZTenantId) -> List[Sample]: + def get_ps_metric_samples_for_tenant(tenant_id: TenantId) -> List[Sample]: ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") samples = [] for metric_name in ps_metrics.metrics: diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 70b474c9a9..85f371c845 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -19,7 +19,7 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId async def tenant_workload(env: NeonEnv, pg: Postgres): @@ -58,7 +58,7 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem env = neon_env_builder.init_start() - tenants_pgs: List[Tuple[ZTenantId, Postgres]] = [] + tenants_pgs: List[Tuple[TenantId, Postgres]] = [] for _ in range(1, 5): # Use a tiny checkpoint distance, to create a lot of layers quickly @@ -83,8 +83,8 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem res = pg.safe_psql_many( ["SHOW neon.tenant_id", "SHOW neon.timeline_id", "SELECT pg_current_wal_flush_lsn()"] ) - tenant_id = ZTenantId(res[0][0][0]) - timeline_id = ZTimelineId(res[1][0][0]) + tenant_id = TenantId(res[0][0][0]) + timeline_id = TimelineId(res[1][0][0]) current_lsn = Lsn(res[2][0][0]) # wait until pageserver receives all the data diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 5a20dbd232..2eea8dd3cc 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -1,6 +1,6 @@ import pytest from fixtures.neon_fixtures import NeonEnv, NeonPageserverApiException, wait_until -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId def test_timeline_delete(neon_simple_env: NeonEnv): @@ -10,12 +10,12 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # first try to delete non existing timeline # for existing tenant: - invalid_timeline_id = ZTimelineId.generate() + invalid_timeline_id = TimelineId.generate() with pytest.raises(NeonPageserverApiException, match="timeline not found"): ps_http.timeline_delete(tenant_id=env.initial_tenant, timeline_id=invalid_timeline_id) # for non existing tenant: - invalid_tenant_id = ZTenantId.generate() + invalid_tenant_id = TenantId.generate() with pytest.raises( NeonPageserverApiException, match=f"Tenant {invalid_tenant_id} not found in the local state", diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 6fbc430e80..83018f46f5 100644 --- a/test_runner/regress/test_timeline_size.py +++ 
b/test_runner/regress/test_timeline_size.py @@ -15,7 +15,7 @@ from fixtures.neon_fixtures import ( assert_timeline_local, wait_for_last_flush_lsn, ) -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId from fixtures.utils import get_timeline_dir_size @@ -386,7 +386,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): tenant, timeline = env.neon_cli.create_tenant() - def get_timeline_physical_size(timeline: ZTimelineId): + def get_timeline_physical_size(timeline: TimelineId): res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True) return res["local"]["current_physical_size_non_incremental"] @@ -415,7 +415,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): assert tenant_physical_size == timeline_total_size -def assert_physical_size(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimelineId): +def assert_physical_size(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): """Check the current physical size returned from timeline API matches the total physical size of the timeline on disk""" client = env.pageserver.http_client() @@ -431,7 +431,7 @@ def assert_physical_size(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimel # Timeline logical size initialization is an asynchronous background task that runs once, # try a few times to ensure it's activated properly def wait_for_timeline_size_init( - client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId + client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId ): for i in range(10): timeline_details = assert_timeline_local(client, tenant, timeline) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index cd370e60c0..8c5b4c8c30 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -32,13 +32,13 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import get_dir_size, query_scalar def wait_lsn_force_checkpoint( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, pg: Postgres, ps: NeonPageserver, pageserver_conn_options={}, @@ -74,7 +74,7 @@ def wait_lsn_force_checkpoint( @dataclass class TimelineMetrics: - timeline_id: ZTimelineId + timeline_id: TimelineId last_record_lsn: Lsn # One entry per each Safekeeper, order is the same flush_lsns: List[Lsn] = field(default_factory=list) @@ -126,7 +126,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): timeline_metrics = [] for timeline_detail in timeline_details: - timeline_id = ZTimelineId(timeline_detail["timeline_id"]) + timeline_id = TimelineId(timeline_detail["timeline_id"]) local_timeline_detail = timeline_detail.get("local") if local_timeline_detail is None: @@ -273,8 +273,8 @@ def test_broker(neon_env_builder: NeonEnvBuilder): pg.safe_psql("CREATE TABLE t(key int primary key, value text)") # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) # wait until remote_consistent_lsn gets advanced on all safekeepers clients = [sk.http_client() for sk in env.safekeepers] @@ -325,8 +325,8 
@@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): ] ) - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) # force checkpoint to advance remote_consistent_lsn pageserver_conn_options = {} @@ -348,7 +348,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): auth_token=env.auth_keys.generate_tenant_token(tenant_id) ) http_cli_other = env.safekeepers[0].http_client( - auth_token=env.auth_keys.generate_tenant_token(ZTenantId.generate()) + auth_token=env.auth_keys.generate_tenant_token(TenantId.generate()) ) http_cli_noauth = env.safekeepers[0].http_client() @@ -438,8 +438,8 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot pg = env.postgres.create_start("test_safekeepers_wal_backup") # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) pg_conn = pg.connect() cur = pg_conn.cursor() @@ -493,8 +493,8 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re pg = env.postgres.create_start("test_s3_wal_replay") # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) expected_sum = 0 @@ -584,8 +584,8 @@ class ProposerPostgres(PgProtocol): self, pgdata_dir: str, pg_bin, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, listen_addr: str, port: int, ): @@ -593,8 +593,8 @@ class ProposerPostgres(PgProtocol): self.pgdata_dir: str = pgdata_dir self.pg_bin: PgBin = pg_bin - self.tenant_id: ZTenantId = tenant_id - self.timeline_id: ZTimelineId = timeline_id + self.tenant_id: TenantId = tenant_id + self.timeline_id: TimelineId = timeline_id self.listen_addr: str = listen_addr self.port: int = port @@ -672,8 +672,8 @@ def test_sync_safekeepers( neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - tenant_id = ZTenantId.generate() - timeline_id = ZTimelineId.generate() + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() # write config for proposer pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata") @@ -725,8 +725,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): wa = env.safekeepers[0] # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) if not auth_enabled: wa_http_cli = wa.http_client() @@ -735,7 +735,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): wa_http_cli = wa.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) wa_http_cli.check_status() wa_http_cli_bad = wa.http_client( - 
auth_token=env.auth_keys.generate_tenant_token(ZTenantId.generate()) + auth_token=env.auth_keys.generate_tenant_token(TenantId.generate()) ) wa_http_cli_bad.check_status() wa_http_cli_noauth = wa.http_client() @@ -785,15 +785,15 @@ class SafekeeperEnv: self.bin_safekeeper = os.path.join(str(neon_binpath), "safekeeper") self.safekeepers: Optional[List[subprocess.CompletedProcess[Any]]] = None self.postgres: Optional[ProposerPostgres] = None - self.tenant_id: Optional[ZTenantId] = None - self.timeline_id: Optional[ZTimelineId] = None + self.tenant_id: Optional[TenantId] = None + self.timeline_id: Optional[TimelineId] = None def init(self) -> "SafekeeperEnv": assert self.postgres is None, "postgres is already initialized" assert self.safekeepers is None, "safekeepers are already initialized" - self.tenant_id = ZTenantId.generate() - self.timeline_id = ZTimelineId.generate() + self.tenant_id = TenantId.generate() + self.timeline_id = TimelineId.generate() self.repo_dir.mkdir(exist_ok=True) # Create config and a Safekeeper object for each safekeeper @@ -912,9 +912,7 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): sum_after = query_scalar(cur, "SELECT SUM(key) FROM t") assert sum_after == sum_before + 5000050000 - def show_statuses( - safekeepers: List[Safekeeper], tenant_id: ZTenantId, timeline_id: ZTimelineId - ): + def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): for sk in safekeepers: http_cli = sk.http_client() try: @@ -935,8 +933,8 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): pg.start() # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) execute_payload(pg) show_statuses(env.safekeepers, tenant_id, timeline_id) @@ -1134,7 +1132,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove non-existing branch, should succeed - assert sk_http.timeline_delete_force(tenant_id, ZTimelineId("00" * 16)) == { + assert sk_http.timeline_delete_force(tenant_id, TimelineId("00" * 16)) == { "dir_existed": False, "was_active": False, } diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index e36d3cf94b..9d2008296a 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -7,7 +7,7 @@ from typing import List, Optional import asyncpg from fixtures.log_helper import getLogger from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId log = getLogger("root.safekeeper_async") @@ -103,8 +103,8 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou async def wait_for_lsn( safekeeper: Safekeeper, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, wait_lsn: Lsn, polling_interval=1, timeout=60, @@ -155,8 +155,8 @@ async def run_restarts_under_load( test_timeout_at = time.monotonic() + 5 * 60 pg_conn = await pg.connect_async() - tenant_id = ZTenantId(await pg_conn.fetchval("show neon.tenant_id")) - timeline_id = ZTimelineId(await 
pg_conn.fetchval("show neon.timeline_id")) + tenant_id = TenantId(await pg_conn.fetchval("show neon.tenant_id")) + timeline_id = TimelineId(await pg_conn.fetchval("show neon.timeline_id")) bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount) # create tables and initial balances diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 6fd509c4d1..21921a3bc2 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -9,7 +9,7 @@ from fixtures.neon_fixtures import ( base_dir, pg_distrib_dir, ) -from fixtures.types import ZTenantId +from fixtures.types import TenantId def test_wal_restore( @@ -22,7 +22,7 @@ def test_wal_restore( env.neon_cli.create_branch("test_wal_restore") pg = env.postgres.create_start("test_wal_restore") pg.safe_psql("create table t as select generate_series(1,300000)") - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) env.neon_cli.pageserver_stop() port = port_distributor.get_port() data_dir = test_output_dir / "pgsql.restored" From 6db6e7ddda3c67a3d48387955859452e93f7d751 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Sep 2022 22:51:28 +0300 Subject: [PATCH 0768/1022] Use backward-compatible safekeeper code --- safekeeper/src/handler.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 41b9ad66e1..ad2c0ec8bf 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -68,11 +68,14 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { if let FeStartupPacket::StartupMessage { params, .. } = sm { if let Some(options) = params.options_raw() { for opt in options { + // FIXME `ztenantid` and `ztimelineid` left for compatibility during deploy, + // remove these after the PR gets deployed: + // https://github.com/neondatabase/neon/pull/2433#discussion_r970005064 match opt.split_once('=') { - Some(("tenant_id", value)) => { + Some(("ztenantid", value)) | Some(("tenant_id", value)) => { self.tenant_id = Some(value.parse()?); } - Some(("timeline_id", value)) => { + Some(("ztimelineid", value)) | Some(("timeline_id", value)) => { self.timeline_id = Some(value.parse()?); } _ => continue, From c3096532f9ceee8fad82b4c741b0108bd143cc06 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 14 Sep 2022 09:23:51 +0300 Subject: [PATCH 0769/1022] Fix vendor/postgres-v15 to point to correct v15 branch. Commit f44afbaf62 updated vendor/postgres-v15 to point to a commit that was built on top of PostgreSQL 14 rather than 15. So we accidentally had two copies of PostgreSQL v14 in the repository. Oops. This updates it to point to the correct version. 
--- vendor/postgres-v15 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index b1dbd93e2b..cf4db95b84 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit b1dbd93e2b1691e93860f7e59b9e1fe5a6e79786 +Subproject commit cf4db95b8480e08425e52ef46f78cb5a234baa0e From d87c9e62d64c8a4628096a4ce5c8307fc1daa2e6 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 14 Sep 2022 11:53:34 +0100 Subject: [PATCH 0770/1022] Nightly Benchmarks: perform tests on both pre-created and fresh projects (#2443) --- .github/workflows/benchmarking.yml | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 49fbc74dd6..fab0a9aa04 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -144,7 +144,9 @@ jobs: strategy: fail-fast: false matrix: - platform: [ neon-captest, rds-aurora ] + # neon-captest: Run pgbench, reusing existing project + # neon-captest-new: Same, but on a freshly created project + platform: [ neon-captest, neon-captest-new, rds-aurora ] runs-on: dev container: @@ -162,7 +164,7 @@ jobs: sudo apt install -y postgresql-14 - name: Create Neon Project - if: matrix.platform == 'neon-captest' + if: matrix.platform == 'neon-captest-new' id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -174,13 +176,16 @@ jobs: run: | case "${PLATFORM}" in neon-captest) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} + ;; + neon-captest-new) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest' or 'rds-aurora'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest', 'neon-captest-new' or 'rds-aurora'" exit 1 ;; esac @@ -240,7 +245,7 @@ jobs: build_type: ${{ env.BUILD_TYPE }} - name: Delete Neon Project - if: ${{ matrix.platform == 'neon-captest' && always() }} + if: ${{ matrix.platform == 'neon-captest-new' && always() }} uses: ./.github/actions/neon-project-delete with: environment: dev @@ -252,6 +257,6 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} From f86ea09323ac0d6f2904dcf603652044cea50664 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 14 Sep 2022 09:53:06 +0300 Subject: [PATCH 0771/1022] Avoid recompiling postgres_ffi every time you run "make". Running "make" at the top level calls "make install" to install the PostgreSQL headers into the pg_install/ directory. That always updated the modification time of the headers even if there were no changes, triggering recompilation of the postgres_ffi bindings. To avoid that, use 'install -C', to install the PostgreSQL headers. However, there was an upstream PostgreSQL issue that the src/include/Makefile didn't respect the INSTALL configure option. That was just fixed in upstream PostgreSQL, so cherry-pick that fix to our vendor/postgres repositories. Fixes https://github.com/neondatabase/neon/issues/1873. 
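The 'install -C' invocation described above relies on install's compare mode: when source and destination contents are identical, the destination file is left completely untouched, so its mtime does not move and mtime-based rebuild checks stay quiet. A rough Python sketch of that behaviour (illustrative only; install_if_changed is a hypothetical helper, not part of the build):

import filecmp
import shutil
from pathlib import Path

def install_if_changed(src: Path, dst: Path) -> bool:
    """Copy src to dst only when the contents differ, as `install -C` does.

    Leaving an unchanged destination alone preserves its mtime, so
    mtime-driven rebuilds (here: the postgres_ffi bindings) are not
    triggered by a no-op "make install" of the PostgreSQL headers.
    """
    if dst.exists() and filecmp.cmp(src, dst, shallow=False):
        return False  # identical content: keep the old file and its mtime
    dst.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(src, dst)
    return True

The -C flag is supported by both GNU coreutils and BSD install, so passing it unconditionally should work for the Linux and macOS builds the Makefile already distinguishes.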
--- Makefile | 6 ++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 4d7b1bee07..4ac51ed174 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,12 @@ ifeq ($(UNAME_S),Darwin) PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib endif +# Use -C option so that when PostgreSQL "make install" installs the +# headers, the mtime of the headers are not changed when there have +# been no changes to the files. Changing the mtime triggers an +# unnecessary rebuild of 'postgres_ffi'. +PG_CONFIGURE_OPTS += INSTALL='install -C' + # Choose whether we should be silent or verbose CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose) # Fix for a corner case when make doesn't pass a jobserver diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 114676d2ed..ce723ee499 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 114676d2edd5307226d9448ec467821fdb77467d +Subproject commit ce723ee499450cb108aede464a35a17f3d75cf84 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index cf4db95b84..0858387047 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit cf4db95b8480e08425e52ef46f78cb5a234baa0e +Subproject commit 08583870479e30c64aeb5a97d6fee9cf470f05fb From 87bf7be5370cc2a621cd51d5a4cb3b1ed76e4633 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 14 Sep 2022 21:27:47 +0300 Subject: [PATCH 0772/1022] [proxy] Drop support for legacy cloud API (#2448) Apparently, it no longer exists in the cloud. --- proxy/src/auth.rs | 5 - proxy/src/auth/backend.rs | 22 +-- proxy/src/auth/backend/legacy_console.rs | 208 ----------------------- proxy/src/config.rs | 19 +-- proxy/src/main.rs | 31 +++- 5 files changed, 30 insertions(+), 255 deletions(-) delete mode 100644 proxy/src/auth/backend/legacy_console.rs diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index d09470d15e..a50d23e351 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -22,10 +22,6 @@ pub type Result = std::result::Result; /// Common authentication error. #[derive(Debug, Error)] pub enum AuthErrorImpl { - // This will be dropped in the future. - #[error(transparent)] - Legacy(#[from] backend::LegacyAuthError), - #[error(transparent)] Link(#[from] backend::LinkAuthError), @@ -78,7 +74,6 @@ impl UserFacingError for AuthError { fn to_string_client(&self) -> String { use AuthErrorImpl::*; match self.0.as_ref() { - Legacy(e) => e.to_string_client(), Link(e) => e.to_string_client(), GetAuthInfo(e) => e.to_string_client(), WakeCompute(e) => e.to_string_client(), diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 9c43620ffb..de0719a196 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -6,9 +6,6 @@ pub use link::LinkAuthError; mod console; pub use console::{GetAuthInfoError, WakeComputeError}; -mod legacy_console; -pub use legacy_console::LegacyAuthError; - use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute, config, mgmt, @@ -56,7 +53,7 @@ impl std::fmt::Debug for DatabaseInfo { fmt.debug_struct("DatabaseInfo") .field("host", &self.host) .field("port", &self.port) - .finish() + .finish_non_exhaustive() } } @@ -88,8 +85,6 @@ impl From for tokio_postgres::Config { /// backends which require them for the authentication process. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum BackendType { - /// Legacy Cloud API (V1) + link auth. 
- LegacyConsole(T), /// Current Cloud API (V2). Console(T), /// Local mock of Cloud API (V2). @@ -105,7 +100,6 @@ impl BackendType { pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType { use BackendType::*; match self { - LegacyConsole(x) => LegacyConsole(f(x)), Console(x) => Console(f(x)), Postgres(x) => Postgres(f(x)), Link => Link, @@ -119,7 +113,6 @@ impl BackendType> { pub fn transpose(self) -> Result, E> { use BackendType::*; match self { - LegacyConsole(x) => x.map(LegacyConsole), Console(x) => x.map(Console), Postgres(x) => x.map(Postgres), Link => Ok(Link), @@ -176,15 +169,6 @@ impl BackendType> { } match self { - LegacyConsole(creds) => { - legacy_console::handle_user( - &urls.auth_endpoint, - &urls.auth_link_uri, - &creds, - client, - ) - .await - } Console(creds) => { console::Api::new(&urls.auth_endpoint, &creds) .handle_user(client) @@ -208,7 +192,6 @@ mod tests { #[test] fn test_backend_type_map() { let values = [ - BackendType::LegacyConsole(0), BackendType::Console(0), BackendType::Postgres(0), BackendType::Link, @@ -222,8 +205,7 @@ mod tests { #[test] fn test_backend_type_transpose() { let values = [ - BackendType::LegacyConsole(Ok::<_, ()>(0)), - BackendType::Console(Ok(0)), + BackendType::Console(Ok::<_, ()>(0)), BackendType::Postgres(Ok(0)), BackendType::Link, ]; diff --git a/proxy/src/auth/backend/legacy_console.rs b/proxy/src/auth/backend/legacy_console.rs deleted file mode 100644 index b99a004dcd..0000000000 --- a/proxy/src/auth/backend/legacy_console.rs +++ /dev/null @@ -1,208 +0,0 @@ -//! Cloud API V1. - -use super::DatabaseInfo; -use crate::{ - auth::{self, ClientCredentials}, - compute, - error::UserFacingError, - stream::PqStream, - waiters, -}; -use serde::{Deserialize, Serialize}; -use thiserror::Error; -use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::BeMessage as Be; - -#[derive(Debug, Error)] -pub enum LegacyAuthError { - /// Authentication error reported by the console. - #[error("Authentication failed: {0}")] - AuthFailed(String), - - /// HTTP status (other than 200) returned by the console. - #[error("Console responded with an HTTP status: {0}")] - HttpStatus(reqwest::StatusCode), - - #[error("Console responded with a malformed JSON: {0}")] - BadResponse(#[from] serde_json::Error), - - #[error(transparent)] - Transport(#[from] reqwest::Error), - - #[error(transparent)] - WaiterRegister(#[from] waiters::RegisterError), - - #[error(transparent)] - WaiterWait(#[from] waiters::WaitError), -} - -impl UserFacingError for LegacyAuthError { - fn to_string_client(&self) -> String { - use LegacyAuthError::*; - match self { - AuthFailed(_) | HttpStatus(_) => self.to_string(), - _ => "Internal error".to_string(), - } - } -} - -// NOTE: the order of constructors is important. 
-// https://serde.rs/enum-representations.html#untagged -#[derive(Serialize, Deserialize, Debug)] -#[serde(untagged)] -enum ProxyAuthResponse { - Ready { conn_info: DatabaseInfo }, - Error { error: String }, - NotReady { ready: bool }, // TODO: get rid of `ready` -} - -impl ClientCredentials<'_> { - fn is_existing_user(&self) -> bool { - self.user.ends_with("@zenith") - } -} - -async fn authenticate_proxy_client( - auth_endpoint: &reqwest::Url, - creds: &ClientCredentials<'_>, - md5_response: &str, - salt: &[u8; 4], - psql_session_id: &str, -) -> Result { - let mut url = auth_endpoint.clone(); - url.query_pairs_mut() - .append_pair("login", creds.user) - .append_pair("database", creds.dbname) - .append_pair("md5response", md5_response) - .append_pair("salt", &hex::encode(salt)) - .append_pair("psql_session_id", psql_session_id); - - super::with_waiter(psql_session_id, |waiter| async { - println!("cloud request: {}", url); - // TODO: leverage `reqwest::Client` to reuse connections - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - return Err(LegacyAuthError::HttpStatus(resp.status())); - } - - let auth_info = serde_json::from_str(resp.text().await?.as_str())?; - println!("got auth info: {:?}", auth_info); - - use ProxyAuthResponse::*; - let db_info = match auth_info { - Ready { conn_info } => conn_info, - Error { error } => return Err(LegacyAuthError::AuthFailed(error)), - NotReady { .. } => waiter.await?.map_err(LegacyAuthError::AuthFailed)?, - }; - - Ok(db_info) - }) - .await -} - -async fn handle_existing_user( - auth_endpoint: &reqwest::Url, - client: &mut PqStream, - creds: &ClientCredentials<'_>, -) -> auth::Result { - let psql_session_id = super::link::new_psql_session_id(); - let md5_salt = rand::random(); - - client - .write_message(&Be::AuthenticationMD5Password(md5_salt)) - .await?; - - // Read client's password hash - let msg = client.read_password_message().await?; - let md5_response = parse_password(&msg).ok_or(auth::AuthErrorImpl::MalformedPassword( - "the password should be a valid null-terminated utf-8 string", - ))?; - - let db_info = authenticate_proxy_client( - auth_endpoint, - creds, - md5_response, - &md5_salt, - &psql_session_id, - ) - .await?; - - Ok(compute::NodeInfo { - reported_auth_ok: false, - config: db_info.into(), - }) -} - -pub async fn handle_user( - auth_endpoint: &reqwest::Url, - auth_link_uri: &reqwest::Url, - creds: &ClientCredentials<'_>, - client: &mut PqStream, -) -> auth::Result { - if creds.is_existing_user() { - handle_existing_user(auth_endpoint, client, creds).await - } else { - super::link::handle_user(auth_link_uri, client).await - } -} - -fn parse_password(bytes: &[u8]) -> Option<&str> { - std::str::from_utf8(bytes).ok()?.strip_suffix('\0') -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn test_proxy_auth_response() { - // Ready - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": true, - "conn_info": DatabaseInfo::default(), - })) - .unwrap(); - assert!(matches!( - auth, - ProxyAuthResponse::Ready { - conn_info: DatabaseInfo { .. } - } - )); - - // Error - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": false, - "error": "too bad, so sad", - })) - .unwrap(); - assert!(matches!(auth, ProxyAuthResponse::Error { .. })); - - // NotReady - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": false, - })) - .unwrap(); - assert!(matches!(auth, ProxyAuthResponse::NotReady { .. 
})); - } - - #[test] - fn parse_db_info() -> anyhow::Result<()> { - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "password": "password", - }))?; - - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - }))?; - - Ok(()) - } -} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 1f01c25734..8835d660d5 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,21 +1,6 @@ use crate::{auth, url::ApiUrl}; -use anyhow::{bail, ensure, Context}; -use std::{str::FromStr, sync::Arc}; - -impl FromStr for auth::BackendType<()> { - type Err = anyhow::Error; - - fn from_str(s: &str) -> anyhow::Result { - use auth::BackendType::*; - Ok(match s { - "legacy" => LegacyConsole(()), - "console" => Console(()), - "postgres" => Postgres(()), - "link" => Link, - _ => bail!("Invalid option `{s}` for auth method"), - }) - } -} +use anyhow::{ensure, Context}; +use std::sync::Arc; pub struct ProxyConfig { pub tls_config: Option, diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 2521f2af21..efe45f6386 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -20,7 +20,7 @@ mod url; mod waiters; use anyhow::{bail, Context}; -use clap::{App, Arg}; +use clap::{self, Arg}; use config::ProxyConfig; use futures::FutureExt; use std::{future::Future, net::SocketAddr}; @@ -36,9 +36,26 @@ async fn flatten_err( f.map(|r| r.context("join error").and_then(|x| x)).await } +/// A proper parser for auth backend parameter. +impl clap::ValueEnum for auth::BackendType<()> { + fn value_variants<'a>() -> &'a [Self] { + use auth::BackendType::*; + &[Console(()), Postgres(()), Link] + } + + fn to_possible_value<'a>(&self) -> Option> { + use auth::BackendType::*; + Some(clap::PossibleValue::new(match self { + Console(_) => "console", + Postgres(_) => "postgres", + Link => "link", + })) + } +} + #[tokio::main] async fn main() -> anyhow::Result<()> { - let arg_matches = App::new("Neon proxy/router") + let arg_matches = clap::App::new("Neon proxy/router") .version(GIT_VERSION) .arg( Arg::new("proxy") @@ -52,8 +69,8 @@ async fn main() -> anyhow::Result<()> { Arg::new("auth-backend") .long("auth-backend") .takes_value(true) - .help("Possible values: legacy | console | postgres | link") - .default_value("legacy"), + .value_parser(clap::builder::EnumValueParser::>::new()) + .default_value("link"), ) .arg( Arg::new("mgmt") @@ -118,6 +135,10 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?; let http_address: SocketAddr = arg_matches.value_of("http").unwrap().parse()?; + let auth_backend = *arg_matches + .try_get_one::>("auth-backend")? 
+ .unwrap(); + let auth_urls = config::AuthUrls { auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, @@ -125,7 +146,7 @@ async fn main() -> anyhow::Result<()> { let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { tls_config, - auth_backend: arg_matches.value_of("auth-backend").unwrap().parse()?, + auth_backend, auth_urls, })); From 757e2147c12a4d63cfecf84018b5453cbec474bd Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 15 Sep 2022 14:21:22 +0200 Subject: [PATCH 0773/1022] Follow-up for neondatabase/neon#2448 (#2452) * remove `legacy` mode from the proxy readme * explicitly specify `authBackend` in the link auth proxy helm-values for all envs --- .github/helm-values/neon-stress.proxy.yaml | 1 + .github/helm-values/production.proxy.yaml | 1 + .github/helm-values/staging.proxy.yaml | 1 + proxy/README.md | 17 +++++++---------- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml index 8236f9873a..ce432ca23c 100644 --- a/.github/helm-values/neon-stress.proxy.yaml +++ b/.github/helm-values/neon-stress.proxy.yaml @@ -1,6 +1,7 @@ fullnameOverride: "neon-stress-proxy" settings: + authBackend: "link" authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/" uri: "https://console.dev.neon.tech/psql_session/" diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/production.proxy.yaml index 87c61c90cf..c26a6258be 100644 --- a/.github/helm-values/production.proxy.yaml +++ b/.github/helm-values/production.proxy.yaml @@ -1,4 +1,5 @@ settings: + authBackend: "link" authEndpoint: "https://console.neon.tech/authenticate_proxy_request/" uri: "https://console.neon.tech/psql_session/" diff --git a/.github/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml index 34ba972b64..25842429a5 100644 --- a/.github/helm-values/staging.proxy.yaml +++ b/.github/helm-values/staging.proxy.yaml @@ -5,6 +5,7 @@ image: repository: neondatabase/neon settings: + authBackend: "link" authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" diff --git a/proxy/README.md b/proxy/README.md index 458a7d9bbf..4ead098b73 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -2,10 +2,8 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. Following backends are currently implemented: -* legacy - old method, when username ends with `@zenith` it uses md5 auth dbname as the cluster name; otherwise, it sends a login link and waits for the console to call back * console - new SCRAM-based console API; uses SNI info to select the destination cluster + new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon) * postgres uses postgres to select auth secrets of existing roles. Useful for local testing * link @@ -13,21 +11,20 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme a ## Using SNI-based routing on localhost -Now proxy determines cluster name from the subdomain, request to the `my-cluster-42.somedomain.tld` will be routed to the cluster named `my-cluster-42`. Unfortunately `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. 
Now we can create self-signed certificate and play with proxy: +Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy: -``` +```sh openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" - ``` -now you can start proxy: +start proxy -``` +```sh ./target/debug/proxy -c server.crt -k server.key ``` -and connect to it: +and connect to it -``` +```sh PGSSLROOTCERT=./server.crt psql 'postgres://my-cluster-42.localtest.me:1234?sslmode=verify-full' ``` From a8d97325291b207d3481ed9578246398c6576ec2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 15 Sep 2022 03:28:24 +0000 Subject: [PATCH 0774/1022] Bump axum-core from 0.2.7 to 0.2.8 Bumps [axum-core](https://github.com/tokio-rs/axum) from 0.2.7 to 0.2.8. - [Release notes](https://github.com/tokio-rs/axum/releases) - [Changelog](https://github.com/tokio-rs/axum/blob/main/CHANGELOG.md) - [Commits](https://github.com/tokio-rs/axum/compare/axum-core-v0.2.7...axum-core-v0.2.8) --- updated-dependencies: - dependency-name: axum-core dependency-type: indirect ... Signed-off-by: dependabot[bot] --- Cargo.lock | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d4234d2b00..a258fab5f6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -183,9 +183,9 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4f44a0e6200e9d11a1cdc989e4b358f6e3d354fbf48478f345a17f4e43f8635" +checksum = "d9f0c0a60006f2a293d82d571f635042a72edf927539b7685bd62d361963839b" dependencies = [ "async-trait", "bytes", @@ -193,6 +193,8 @@ dependencies = [ "http", "http-body", "mime", + "tower-layer", + "tower-service", ] [[package]] From 1062e57feeae80fa9771ad42dc66cd10ffcf5e36 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 15 Sep 2022 16:33:42 +0300 Subject: [PATCH 0775/1022] Don't run codestyle checks separately for Postgres v14 and v15. Previously, we compiled neon separately for Postgres v14 and v15, for the codestyle checks. But that was bogus; we actually just ran "make postgres", which always compiled both versions. The version really only affected the caching. Fix that, by copying the build steps from the main build_and_test.yml workflow. --- .github/workflows/codestyle.yml | 53 ++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 53d0f9c5d8..237cf81205 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -30,8 +30,6 @@ jobs: # this is all we need to install our toolchain later via rust-toolchain.toml # so don't install any toolchain explicitly. os: [ubuntu-latest, macos-latest] - # To support several Postgres versions, add them here. 
- postgres_version: [v14, v15] timeout-minutes: 60 name: check codestyle rust and postgres runs-on: ${{ matrix.os }} @@ -56,17 +54,29 @@ jobs: if: matrix.os == 'macos-latest' run: brew install flex bison openssl - - name: Set pg revision for caching - id: pg_ver - run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-${{matrix.postgres_version}}) + - name: Set pg 14 revision for caching + id: pg_v14_rev + run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14) + shell: bash -euxo pipefail {0} - - name: Cache postgres ${{matrix.postgres_version}} build - id: cache_pg + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15) + shell: bash -euxo pipefail {0} + + - name: Cache postgres v14 build + id: cache_pg_14 uses: actions/cache@v3 with: - path: | - pg_install/${{matrix.postgres_version}} - key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }} + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v3 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Set extra env for macOS if: matrix.os == 'macos-latest' @@ -74,24 +84,19 @@ jobs: echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - - name: Build postgres - if: steps.cache_pg.outputs.cache-hit != 'true' - run: make postgres + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: make postgres-v14 + shell: bash -euxo pipefail {0} + + - name: Build postgres v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: make postgres-v15 + shell: bash -euxo pipefail {0} - name: Build neon extensions run: make neon-pg-ext - # Plain configure output can contain weird errors like 'error: C compiler cannot create executables' - # and the real cause will be inside config.log - - name: Print configure logs in case of failure - if: failure() - continue-on-error: true - run: | - echo '' && echo '=== Postgres ${{matrix.postgres_version}} config.log ===' && echo '' - cat pg_install/build/${{matrix.postgres_version}}/config.log - echo '' && echo '=== Postgres ${{matrix.postgres_version}} configure.log ===' && echo '' - cat pg_install/build/${{matrix.postgres_version}}/configure.log - - name: Cache cargo deps id: cache_cargo uses: actions/cache@v3 From 9d9d8e951947b9cbaca4ab11937bda8d681dc24c Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 15 Sep 2022 19:16:07 +0200 Subject: [PATCH 0776/1022] docs/sourcetree: update CLion set up instructions (#2454) After #2325 the old method no longer works as our Makefile does not print compilation commands when run with --dry-run, see https://github.com/neondatabase/neon/issues/2378#issuecomment-1241421325 This method is much slower but is hopefully robust. Add some more notes while we're here. --- docs/sourcetree.md | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/docs/sourcetree.md b/docs/sourcetree.md index c1a860f126..8043450a55 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -147,8 +147,16 @@ C code requires some extra care, as it's built via Make, not CMake. 
Some of our ```bash # Install a `compiledb` tool which can parse make's output and generate the compilation database. poetry add -D compiledb - # Run Make without actually compiling code so we can generate the compilation database. It still may take a few minutes. - make --dry-run --print-directory --keep-going --assume-new=* postgres neon-pg-ext | poetry run compiledb --verbose --no-build + # Clean the build tree so we can rebuild from scratch. + # Unfortunately, our and Postgres Makefiles do not work well with either --dry-run or --assume-new, + # so we don't know a way to generate the compilation database without recompiling everything, + # see https://github.com/neondatabase/neon/issues/2378#issuecomment-1241421325 + make distclean + # Rebuild the Postgres parts from scratch and save the compilation commands to the compilation database. + # You can alter the -j parameter to your liking. + # Note that we only build for a specific version of Postgres. The extension code is shared, but headers are + # different, so we set up CLion to only use a specific version of the headers. + make -j$(nproc) --print-directory postgres-v15 neon-pg-ext-v15 | poetry run compiledb --verbose --no-build # Uninstall the tool poetry remove -D compiledb # Make sure the compile_commands.json file is not committed. @@ -157,7 +165,8 @@ C code requires some extra care, as it's built via Make, not CMake. Some of our 3. Open CLion, click "Open File or Project" and choose the generated `compile_commands.json` file to be opened "as a project". You cannot add a compilation database into an existing CLion project, you have to create a new one. _Do not_ open the directory as a project, open the file. 4. The newly created project should start indexing Postgres source code in C, as well as the C standard library. You may have to [configure the C compiler for the compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_toolchain). 5. Open the `Cargo.toml` file in an editor in the same project. CLion should pick up the hint and start indexing Rust code. -7. Now you have a CLion project which knows about C files, Rust files. It should pick up Python files automatically as well. +6. Now you have a CLion project which knows about C files, Rust files. It should pick up Python files automatically as well. +7. Set up correct code indentation in CLion's settings: Editor > Code Style > C/C++, choose the "Project" scheme on the top, and tick the "Use tab character" on the "Tabs and Indents" tab. Ensure that "Tab size" is 4. You can also enable Cargo Clippy diagnostics and enable Rustfmt instead of built-in code formatter. @@ -168,3 +177,4 @@ Known issues (fixes and suggestions are welcome): * Test results may be hard to read in CLion, both for unit tests in Rust and integration tests in Python. Use command line to run them instead. * CLion does not support non-local Python interpreters, unlike PyCharm. E.g. if you use WSL, CLion does not see `poetry` and installed dependencies. Python support is limited. * Cargo Clippy diagnostics in CLion may take a lot of resources. +* `poetry add -D` updates some packages and changes `poetry.lock` drastically even when followed by `poetry remove -D`. Feel free to `git checkout poetry.lock` and `./scripts/pysync` to revert these changes. 
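A quick optional check of the generated compilation database, assuming `jq` is installed (illustration only, not part of the commit above):

```bash
ls -lh compile_commands.json        # written to the repo root by the make | compiledb pipeline
jq 'length' compile_commands.json   # number of translation units CLion will index
jq '.[0].file' compile_commands.json
```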
From e968b5e5025616f2a7d03cd7307c54a49185925c Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 15 Sep 2022 20:43:51 +0200 Subject: [PATCH 0777/1022] tests: do not set num_safekeepers = 1, it's the default (#2457) Also get rid if `with_safekeepers` parameter in tests. Its meaning has changed: `False` meant "no safekeepers" which is not supported anymore, so we assume it's always `True`. See #1648 --- test_runner/performance/test_perf_pgbench.py | 1 - test_runner/regress/test_auth.py | 8 +++----- test_runner/regress/test_branch_behind.py | 7 ------- test_runner/regress/test_crafted_wal_end.py | 1 - test_runner/regress/test_fullbackup.py | 2 -- test_runner/regress/test_import.py | 2 -- test_runner/regress/test_lsn_mapping.py | 1 - test_runner/regress/test_pitr_gc.py | 2 -- test_runner/regress/test_recovery.py | 1 - test_runner/regress/test_tenants.py | 18 ++++++------------ test_runner/regress/test_wal_acceptor.py | 1 - 11 files changed, 9 insertions(+), 35 deletions(-) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 934642d095..2a2213b783 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -173,7 +173,6 @@ def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int): @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix()) def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, duration: int): - neon_env_builder.num_safekeepers = 1 neon_env_builder.pageserver_config_override = """ profiling="page_requests" """ diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index d9082efada..ce4a8ffa9e 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -56,14 +56,12 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): tenant_http_client.tenant_create() -@pytest.mark.parametrize("with_safekeepers", [False, True]) -def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): +def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True - if with_safekeepers: - neon_env_builder.num_safekeepers = 3 + neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - branch = f"test_compute_auth_to_pageserver{with_safekeepers}" + branch = "test_compute_auth_to_pageserver" env.neon_cli.create_branch(branch) pg = env.postgres.create_start(branch) diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index cfb9649867..b0d0737172 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -10,13 +10,6 @@ from fixtures.utils import print_gc_result, query_scalar # Create a couple of branches off the main branch, at a historical point in time. # def test_branch_behind(neon_env_builder: NeonEnvBuilder): - - # Use safekeeper in this test to avoid a subtle race condition. - # Without safekeeper, walreceiver reconnection can stuck - # because of IO deadlock. 
- # - # See https://github.com/neondatabase/neon/issues/1068 - neon_env_builder.num_safekeepers = 1 # Disable pitr, because here we want to test branch creation after GC neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 32e5366945..e94c9a2bd0 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -17,7 +17,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft ], ) def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): - neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() env.neon_cli.create_branch("test_crafted_wal_end") diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index 8de2687c9b..0048e7b580 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -18,8 +18,6 @@ num_rows = 1000 def test_fullbackup( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor ): - - neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() env.neon_cli.create_branch("test_fullbackup") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 60cc0551ab..7b61b03b97 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -122,7 +122,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build @pytest.mark.timeout(600) def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() @@ -140,7 +139,6 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu # @pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build") @pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2255") def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 9d1efec2c1..ef99954a76 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -9,7 +9,6 @@ from fixtures.utils import query_scalar # Test pageserver get_lsn_by_timestamp API # def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping") diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 786266b70e..57b2ee1c04 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -12,8 +12,6 @@ from fixtures.utils import print_gc_result, query_scalar # Insert some data, run GC and create a branch in the past. 
# def test_pitr_gc(neon_env_builder: NeonEnvBuilder): - - neon_env_builder.num_safekeepers = 1 # Set pitr interval such that we need to keep the data neon_env_builder.pageserver_config_override = ( "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 6aa8b4e9be..08c15d8f09 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -10,7 +10,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder # Test pageserver recovery after crash # def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 # Override default checkpointer settings to run it more often neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 4e7610a96f..4500395c8f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -50,29 +50,23 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ), "pageserver should clean its temp tenant dirs on restart" -@pytest.mark.parametrize("with_safekeepers", [False, True]) -def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): - if with_safekeepers: - neon_env_builder.num_safekeepers = 3 +def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() """Tests tenants with and without wal acceptors""" tenant_1, _ = env.neon_cli.create_tenant() tenant_2, _ = env.neon_cli.create_tenant() - env.neon_cli.create_timeline( - f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", tenant_id=tenant_1 - ) - env.neon_cli.create_timeline( - f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", tenant_id=tenant_2 - ) + env.neon_cli.create_timeline("test_tenants_normal_work", tenant_id=tenant_1) + env.neon_cli.create_timeline("test_tenants_normal_work", tenant_id=tenant_2) pg_tenant1 = env.postgres.create_start( - f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", + "test_tenants_normal_work", tenant_id=tenant_1, ) pg_tenant2 = env.postgres.create_start( - f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", + "test_tenants_normal_work", tenant_id=tenant_2, ) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 8c5b4c8c30..089ed91c98 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1037,7 +1037,6 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("auth_enabled", [False, True]) def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): - neon_env_builder.num_safekeepers = 1 neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() From 96e867642fbe730a3fe13c572383d68b393ca567 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 15 Sep 2022 18:20:23 -0400 Subject: [PATCH 0778/1022] Validate tenant create options (#2450) Co-authored-by: Kirill Bulatov --- control_plane/src/storage.rs | 79 ++++++++++++++----------- test_runner/regress/test_tenant_conf.py | 16 ++++- 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index d2cc5e096c..3bbbdc5865 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -371,43 +371,50 @@ 
impl PageServerNode { new_tenant_id: Option, settings: HashMap<&str, &str>, ) -> anyhow::Result { + let mut settings = settings.clone(); + let request = TenantCreateRequest { + new_tenant_id, + checkpoint_distance: settings + .remove("checkpoint_distance") + .map(|x| x.parse::()) + .transpose()?, + checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), + compaction_target_size: settings + .remove("compaction_target_size") + .map(|x| x.parse::()) + .transpose()?, + compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), + compaction_threshold: settings + .remove("compaction_threshold") + .map(|x| x.parse::()) + .transpose()?, + gc_horizon: settings + .remove("gc_horizon") + .map(|x| x.parse::()) + .transpose()?, + gc_period: settings.remove("gc_period").map(|x| x.to_string()), + image_creation_threshold: settings + .remove("image_creation_threshold") + .map(|x| x.parse::()) + .transpose()?, + pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), + walreceiver_connect_timeout: settings + .remove("walreceiver_connect_timeout") + .map(|x| x.to_string()), + lagging_wal_timeout: settings + .remove("lagging_wal_timeout") + .map(|x| x.to_string()), + max_lsn_wal_lag: settings + .remove("max_lsn_wal_lag") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, + }; + if !settings.is_empty() { + bail!("Unrecognized tenant settings: {settings:?}") + } self.http_request(Method::POST, format!("{}/tenant", self.http_base_url)) - .json(&TenantCreateRequest { - new_tenant_id, - checkpoint_distance: settings - .get("checkpoint_distance") - .map(|x| x.parse::()) - .transpose()?, - checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()), - compaction_target_size: settings - .get("compaction_target_size") - .map(|x| x.parse::()) - .transpose()?, - compaction_period: settings.get("compaction_period").map(|x| x.to_string()), - compaction_threshold: settings - .get("compaction_threshold") - .map(|x| x.parse::()) - .transpose()?, - gc_horizon: settings - .get("gc_horizon") - .map(|x| x.parse::()) - .transpose()?, - gc_period: settings.get("gc_period").map(|x| x.to_string()), - image_creation_threshold: settings - .get("image_creation_threshold") - .map(|x| x.parse::()) - .transpose()?, - pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), - walreceiver_connect_timeout: settings - .get("walreceiver_connect_timeout") - .map(|x| x.to_string()), - lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()), - max_lsn_wal_lag: settings - .get("max_lsn_wal_lag") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, - }) + .json(&request) .send()? .error_from_body()? 
.json::>() diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 51a8101b11..c6cf416d12 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -6,6 +6,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder def test_tenant_config(neon_env_builder: NeonEnvBuilder): + """Test per tenant configuration""" # set some non-default global config neon_env_builder.pageserver_config_override = """ page_cache_size=444; @@ -13,7 +14,20 @@ wait_lsn_timeout='111 s'; tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" env = neon_env_builder.init_start() - """Test per tenant configuration""" + + # Check that we raise on misspelled configs + invalid_conf_key = "some_invalid_setting_name_blah_blah_123" + try: + env.neon_cli.create_tenant( + conf={ + invalid_conf_key: "20000", + } + ) + except Exception as e: + assert invalid_conf_key in str(e) + else: + raise AssertionError("Expected validation error") + tenant, _ = env.neon_cli.create_tenant( conf={ "checkpoint_distance": "20000", From 031e57a973d5be159012a7af44d4b41f7abd61be Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 14 Sep 2022 16:10:52 +0300 Subject: [PATCH 0779/1022] Disable failpoints by default --- .github/workflows/build_and_test.yml | 6 ++++-- pageserver/Cargo.toml | 10 +++++----- test_runner/README.md | 1 + 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d586741d68..7688f9c1bd 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -94,15 +94,17 @@ jobs: # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS, # because "cargo metadata" doesn't accept --release or --debug options # + # We run tests with addtional features, that are turned off by default (e.g. in release builds), see + # corresponding Cargo.toml files for their descriptions. - name: Set env variables run: | if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FEATURES="" + CARGO_FEATURES="--features failpoints" CARGO_FLAGS="--locked --timings" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" - CARGO_FEATURES="--features profiling" + CARGO_FEATURES="--features failpoints,profiling" CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 11d2d94906..ce09e788bd 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -4,12 +4,12 @@ version = "0.1.0" edition = "2021" [features] -# It is simpler infra-wise to have failpoints enabled by default -# It shouldn't affect performance in any way because failpoints -# are not placed in hot code paths -default = ["failpoints"] -profiling = ["pprof"] +default = [] + +# Feature that enables a special API, fail_point! 
macro (adds some runtime cost) +# to run tests on outage conditions failpoints = ["fail/failpoints"] +profiling = ["pprof"] [dependencies] async-stream = "0.3" diff --git a/test_runner/README.md b/test_runner/README.md index 44751944b3..01fe4ff863 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -6,6 +6,7 @@ Prerequisites: - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python) - Neon and Postgres binaries - See the root [README.md](/README.md) for build directions + If you want to test tests with failpoints, you would need to add `--features failpoints` to Rust code build commands. - Tests can be run from the git tree; or see the environment variables below to run from other directories. - The neon git repo, including the postgres submodule From db5ec0dae70aed65d79a23574afb4f2ea8d4fa06 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Thu, 15 Sep 2022 23:50:46 -0700 Subject: [PATCH 0780/1022] Cleanup/simplify logical size calculation (#2459) Should produce identical results; replaces an error case that shouldn't be possible with `expect`. --- pageserver/src/tenant/timeline.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e821ef1b9a..95bdf715b5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -232,14 +232,16 @@ impl LogicalSize { } fn current_size(&self) -> anyhow::Result { - let size_increment = self.size_added_after_initial.load(AtomicOrdering::Acquire); + let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire); + // ^^^ keep this type explicit so that the casts in this function break if + // we change the type. match self.initial_logical_size.get() { Some(initial_size) => { let absolute_size_increment = u64::try_from( size_increment .checked_abs() .with_context(|| format!("Size added after initial {size_increment} is not expected to be i64::MIN"))?, - ).with_context(|| format!("Failed to convert size increment {size_increment} to u64"))?; + ).expect("casting nonnegative i64 to u64 should not fail"); if size_increment < 0 { initial_size.checked_sub(absolute_size_increment) @@ -249,11 +251,7 @@ impl LogicalSize { .map(CurrentLogicalSize::Exact) } None => { - let non_negative_size_increment = if size_increment < 0 { - 0 - } else { - u64::try_from(size_increment).expect("not negative, cannot fail") - }; + let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0); Ok(CurrentLogicalSize::Approximate(non_negative_size_increment)) } } From 74312e268febaff8829b6fa795268231bd985699 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 16 Sep 2022 09:49:33 +0300 Subject: [PATCH 0781/1022] Tidy up storege artifact build flags * Simplify test build features handling * Build only necessary binaries during the release build --- .github/workflows/build_and_test.yml | 4 ++-- Dockerfile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7688f9c1bd..f67d42f2ff 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -101,7 +101,7 @@ jobs: if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" CARGO_FEATURES="--features failpoints" - CARGO_FLAGS="--locked --timings" + CARGO_FLAGS="--locked --timings $CARGO_FEATURES" elif [[ $BUILD_TYPE == "release" ]]; then 
cov_prefix="" CARGO_FEATURES="--features failpoints,profiling" @@ -160,7 +160,7 @@ jobs: - name: Run cargo build run: | - ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests + ${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests shell: bash -euxo pipefail {0} - name: Run cargo test diff --git a/Dockerfile b/Dockerfile index eacb88d168..711a92a90e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,7 @@ COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ -&& mold -run cargo build --locked --release \ +&& mold -run cargo build --bin pageserver --bin safekeeper --bin proxy --locked --release \ && cachepot -s # Build final image From 72b33997c773a963521d8007136c30080292e85e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 16 Sep 2022 10:09:54 +0100 Subject: [PATCH 0782/1022] Nightly Benchmarks: trigger tests earlier (#2463) --- .github/workflows/benchmarking.yml | 3 ++- .github/workflows/build_and_test.yml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index fab0a9aa04..df0e8a4275 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,7 +11,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '36 4 * * *' # run once a day, timezone is utc + - cron: '0 3 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually inputs: @@ -239,6 +239,7 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Create Allure report + if: always() uses: ./.github/actions/allure-report with: action: generate diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f67d42f2ff..5bff469582 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -324,6 +324,7 @@ jobs: build_type: ${{ matrix.build_type }} - name: Store Allure test stat in the DB + if: ${{ steps.create-allure-report.outputs.report-url }} env: BUILD_TYPE: ${{ matrix.build_type }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} From 4db15d3c7cbfbbe17c6f18af7b5eae3198fafadf Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 14 Sep 2022 18:22:00 +0300 Subject: [PATCH 0783/1022] change prefix_in_bucket in pageserver config --- .github/ansible/deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index b47db6a9b5..c06a0ef5b3 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -71,7 +71,7 @@ - "[remote_storage]" - "bucket_name = '{{ bucket_name }}'" - "bucket_region = '{{ bucket_region }}'" - - "prefix_in_bucket = '{{ inventory_hostname }}'" + - "prefix_in_bucket = 'pageserver/v1'" become: true tags: - pageserver From 44fd4e3c9f9b8087dc0871785f87ed7848538839 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 16 Sep 2022 16:59:05 +0300 Subject: [PATCH 0784/1022] add more logs --- pageserver/src/storage_sync.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 9d259bf1e2..64e0f9a9e3 100644 --- a/pageserver/src/storage_sync.rs +++ 
b/pageserver/src/storage_sync.rs @@ -601,6 +601,7 @@ pub fn spawn_storage_sync_task( for (tenant_id, timeline_data) in local_timeline_files.0 { if timeline_data.is_empty() { + info!("got empty tenant {}", tenant_id); let _ = empty_tenants.0.entry(tenant_id).or_default(); } else { for (timeline_id, timeline_data) in timeline_data { @@ -1303,6 +1304,10 @@ fn schedule_first_sync_tasks( None => { // TODO (rodionov) does this mean that we've crashed during tenant creation? // is it safe to upload this checkpoint? could it be half broken? + warn!( + "marking {} as locally complete, while it doesnt exist in remote index", + sync_id + ); new_sync_tasks.push_back(( sync_id, SyncTask::upload(LayersUpload { @@ -1337,6 +1342,8 @@ fn compare_local_and_remote_timeline( local_files: HashSet, remote_entry: &RemoteTimeline, ) -> (LocalTimelineInitStatus, bool) { + let _entered = info_span!("compare_local_and_remote_timeline", sync_id = %sync_id).entered(); + let remote_files = remote_entry.stored_files(); let number_of_layers_to_download = remote_files.difference(&local_files).count(); @@ -1347,10 +1354,12 @@ fn compare_local_and_remote_timeline( layers_to_skip: local_files.clone(), }), )); + info!("NeedsSync"); (LocalTimelineInitStatus::NeedsSync, true) // we do not need to manipulate with remote consistent lsn here // because it will be updated when sync will be completed } else { + info!("LocallyComplete"); ( LocalTimelineInitStatus::LocallyComplete(local_metadata.clone()), false, From 9c35a094527fea58f1f402f99682fe9dc8c23b02 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Fri, 16 Sep 2022 08:37:44 -0700 Subject: [PATCH 0785/1022] Improve build errors when `postgres_ffi` fails (#2460) This commit does two things of note: 1. Bumps the bindgen dependency from `0.59.1` to `0.60.1`. This gets us an actual error type from bindgen, so we can display what's wrong. 2. Adds `anyhow` as a build dependency, so our error message can be prettier. It's already used heavily elsewhere in the crates in this repo, so I figured the fact it's a build dependency doesn't matter much. I ran into this from running `cargo ` without running `make` first. Here's a comparison of the compiler output in those two cases. 
Before this commit: ``` error: failed to run custom build command for `postgres_ffi v0.1.0 ($repo_path/libs/postgres_ffi)` Caused by: process didn't exit successfully: `$repo_path/target/debug/build/postgres_ffi-2f7253b3ad3ca840/build-script-build` (exit status: 101) --- stdout cargo:rerun-if-changed=bindgen_deps.h --- stderr bindgen_deps.h:7:10: fatal error: 'c.h' file not found bindgen_deps.h:7:10: fatal error: 'c.h' file not found, err: true thread 'main' panicked at 'Unable to generate bindings: ()', libs/postgres_ffi/build.rs:135:14 note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace ``` After this commit: ``` error: failed to run custom build command for `postgres_ffi v0.1.0 ($repo_path/libs/postgres_ffi)` Caused by: process didn't exit successfully: `$repo_path/target/debug/build/postgres_ffi-e01fb59602596748/build-script-build` (exit status: 1) --- stdout cargo:rerun-if-changed=bindgen_deps.h --- stderr bindgen_deps.h:7:10: fatal error: 'c.h' file not found Error: Unable to generate bindings Caused by: clang diagnosed error: bindgen_deps.h:7:10: fatal error: 'c.h' file not found ``` --- Cargo.lock | 6 +++--- libs/postgres_ffi/Cargo.toml | 3 ++- libs/postgres_ffi/build.rs | 29 +++++++++++++++++++---------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a258fab5f6..ca169dc0c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -229,14 +229,14 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.59.2" +version = "0.60.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8" +checksum = "062dddbc1ba4aca46de6338e2bf87771414c335f7b2f2036e8f3e9befebf88e6" dependencies = [ "bitflags", "cexpr", "clang-sys", - "clap 2.34.0", + "clap 3.2.16", "env_logger", "lazy_static", "lazycell", diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 2b453fa0dc..60caca76b8 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -25,4 +25,5 @@ postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d05 wal_craft = { path = "wal_craft" } [build-dependencies] -bindgen = "0.59.1" +anyhow = "1.0" +bindgen = "0.60.1" diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 8389ac37fe..25ff398bbd 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -4,6 +4,7 @@ use std::env; use std::path::PathBuf; use std::process::Command; +use anyhow::{anyhow, Context}; use bindgen::callbacks::ParseCallbacks; #[derive(Debug)] @@ -42,7 +43,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { } } -fn main() { +fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=bindgen_deps.h"); @@ -58,7 +59,7 @@ fn main() { for pg_version in &["v14", "v15"] { let mut pg_install_dir_versioned = pg_install_dir.join(pg_version); if pg_install_dir_versioned.is_relative() { - let cwd = env::current_dir().unwrap(); + let cwd = env::current_dir().context("Failed to get current_dir")?; pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned); } @@ -70,21 +71,25 @@ fn main() { let output = Command::new(pg_config_bin) .arg("--includedir-server") .output() - .expect("failed to execute `pg_config --includedir-server`"); + .context("failed to execute `pg_config --includedir-server`")?; if !output.status.success() { panic!("`pg_config --includedir-server` failed") } - 
String::from_utf8(output.stdout).unwrap().trim_end().into() + String::from_utf8(output.stdout) + .context("pg_config output is not UTF-8")? + .trim_end() + .into() } else { - pg_install_dir_versioned + let server_path = pg_install_dir_versioned .join("include") .join("postgresql") .join("server") - .into_os_string() + .into_os_string(); + server_path .into_string() - .unwrap() + .map_err(|s| anyhow!("Bad postgres server path {s:?}"))? }; // The bindgen::Builder is the main entry point @@ -132,14 +137,18 @@ fn main() { // Finish the builder and generate the bindings. // .generate() - .expect("Unable to generate bindings"); + .context("Unable to generate bindings")?; // Write the bindings to the $OUT_DIR/bindings_$pg_version.rs file. - let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + let out_path: PathBuf = env::var("OUT_DIR") + .context("Couldn't read OUT_DIR environment variable var")? + .into(); let filename = format!("bindings_{pg_version}.rs"); bindings .write_to_file(out_path.join(filename)) - .expect("Couldn't write bindings!"); + .context("Couldn't write bindings")?; } + + Ok(()) } From 65a5010e256da28cbf9a9410ecd7953d8f57cd00 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Fri, 16 Sep 2022 17:44:02 +0200 Subject: [PATCH 0786/1022] Use custom `install` command in Makefile to speed up incremental builds (#2458) Fixes #1873: previously any run of `make` caused the `postgres-v15-headers` target to build. It copied a bunch of headers via `install -C`. Unfortunately, some origins were symlinks in the `./pg_install/build` directory pointing inside `./vendor/postgres-v15` (e.g. `pg_config_os.h` pointing to `linux.h`). GNU coreutils' `install` ignores the `-C` key for non-regular files and always overwrites the destination if the origin is a symlink. That in turn made Cargo rebuild the `postgres_ffi` crate and all its dependencies because it thinks that Postgres headers changed, even if they did not. That was slow. Now we use a custom script that wraps the `install` program. It handles one specific case and makes sure individual headers are never copied if their content did not change. Hence, `postgres_ffi` is not rebuilt unless there were some changes to the C code. One may still have slow incremental single-threaded builds because Postgres Makefiles spawn about 2800 sub-makes even if no files have been changed. A no-op build takes "only" 3-4 seconds on my machine now when run with `-j30`, and 20 seconds when run with `-j1`. --- Makefile | 2 +- scripts/ninstall.sh | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100755 scripts/ninstall.sh diff --git a/Makefile b/Makefile index 4ac51ed174..738a45fd5e 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ endif # headers, the mtime of the headers are not changed when there have # been no changes to the files. Changing the mtime triggers an # unnecessary rebuild of 'postgres_ffi'. 
-PG_CONFIGURE_OPTS += INSTALL='install -C' +PG_CONFIGURE_OPTS += INSTALL='$(ROOT_PROJECT_DIR)/scripts/ninstall.sh -C' # Choose whether we should be silent or verbose CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose) diff --git a/scripts/ninstall.sh b/scripts/ninstall.sh new file mode 100755 index 0000000000..3554e3e4df --- /dev/null +++ b/scripts/ninstall.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -euo pipefail +# GNU coreutil's `install -C` always overrides the destination if the source +# is not a regular file, which is the case with lots of headers symlinked into +# the build directory by `./configure`. That causes Rust's Cargo to think that +# Postgres headers have been updated after `make` call even if no files have been +# touched. That causes long recompilation of `postgres_ffi` and all dependent +# packages. To counter that, we handle a special case here: do not copy the file +# if its content did not change. We only handle a single case where `install` +# installs a single file with a specific set of arguments, the rest does not +# matter in our configuration. +# +# Such behavior may be incorrect if e.g. permissions have changed, but it should +# not happen during normal Neon development that often, and rebuild should help. +# +# See https://github.com/neondatabase/neon/issues/1873 +if [ "$#" == "5" ]; then + if [ "$1" == "-C" ] && [ "$2" == "-m" ] && [ "$3" == "644" ]; then + if [ -e "$5" ] && diff -q "$4" "$5" >/dev/null 2>&1; then + exit 0 + fi + fi +fi +install "$@" From b46c8b4ae008f88a0693837752d0ca8007a54dd5 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 16 Sep 2022 11:35:51 +0300 Subject: [PATCH 0787/1022] Add an alias to build test images simply --- .cargo/config.toml | 3 +++ test_runner/README.md | 2 ++ 2 files changed, 5 insertions(+) diff --git a/.cargo/config.toml b/.cargo/config.toml index 76a2ff549e..d70d57a817 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -11,3 +11,6 @@ opt-level = 3 [profile.dev] # Turn on a small amount of optimization in Development mode. opt-level = 1 + +[alias] +build_testing = ["build", "--features", "failpoints"] diff --git a/test_runner/README.md b/test_runner/README.md index 01fe4ff863..f17a4a5a5d 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -7,6 +7,8 @@ Prerequisites: - Neon and Postgres binaries - See the root [README.md](/README.md) for build directions If you want to test tests with failpoints, you would need to add `--features failpoints` to Rust code build commands. + For convenience, repository cargo config contains `build_testing` alias, that serves as a subcommand, adding the required feature flags. + Usage example: `cargo build_testing --release` is equivalent to `cargo build --features failpoints --release` - Tests can be run from the git tree; or see the environment variables below to run from other directories. 
- The neon git repo, including the postgres submodule From c9c3c77c31e45cf59c02dbe142d0c99432fc4f18 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Fri, 16 Sep 2022 19:51:35 +0200 Subject: [PATCH 0788/1022] Fix Docker image builds (follow-up for #2458) (#2469) Put ninstall.sh inside Docker images for building --- .dockerignore | 1 + Dockerfile | 1 + 2 files changed, 2 insertions(+) diff --git a/.dockerignore b/.dockerignore index 4bc8e5fa13..92eb4f24de 100644 --- a/.dockerignore +++ b/.dockerignore @@ -18,3 +18,4 @@ !vendor/postgres-v15/ !workspace_hack/ !neon_local/ +!scripts/ninstall.sh diff --git a/Dockerfile b/Dockerfile index 711a92a90e..213934a844 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,7 @@ COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile +COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh ENV BUILD_TYPE release RUN set -e \ From 846d126579bd34f0b57b11a4e5477d8d239feea2 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 19 Sep 2022 12:56:08 +0300 Subject: [PATCH 0789/1022] Set last written lsn for created relation (#2398) * Set last written lsn for created relation * use current LSN for updating last written LSN of relation metadata * Update LSN for the extended blocks even for pges without LSN (zeroed) * Update pgxn/neon/pagestore_smgr.c Co-authored-by: Heikki Linnakangas Co-authored-by: Heikki Linnakangas --- pgxn/neon/pagestore_smgr.c | 12 +++++++++++- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 24adee019f..8e6dd373b0 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -959,7 +959,17 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (IS_LOCAL_REL(reln)) mdextend(reln, forkNum, blkno, buffer, skipFsync); #endif - + /* + * smgr_extend is often called with an all-zeroes page, so lsn==InvalidXLogRecPtr. + * An smgr_write() call will come for the buffer later, after it has been initialized + * with the real page contents, and it is eventually evicted from the buffer cache. + * But we need a valid LSN to the relation metadata update now. 
+ */ + if (lsn == InvalidXLogRecPtr) + { + lsn = GetXLogInsertRecPtr(); + SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forkNum, blkno); + } SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forkNum); } diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index ce723ee499..796770565f 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit ce723ee499450cb108aede464a35a17f3d75cf84 +Subproject commit 796770565ff668b585e80733b8d679961ad50e93 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 0858387047..7d144ae2f3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 08583870479e30c64aeb5a97d6fee9cf470f05fb +Subproject commit 7d144ae2f3649570f60a0477993b8c8ad9dd8c4b From 90ed12630e698441a66fce7c095cc5a02487a26d Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Mon, 19 Sep 2022 12:57:44 +0200 Subject: [PATCH 0790/1022] Add zenith-us-stage-ps-4 and undo changes in prefix_in_bucket in pageserver config (#2473) * Add zenith-us-stage-ps-4 * Undo changes in prefix_in_bucket in pageserver config (Rollback #2449) --- .github/ansible/deploy.yaml | 2 +- .github/ansible/staging.hosts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index c06a0ef5b3..b47db6a9b5 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -71,7 +71,7 @@ - "[remote_storage]" - "bucket_name = '{{ bucket_name }}'" - "bucket_region = '{{ bucket_region }}'" - - "prefix_in_bucket = 'pageserver/v1'" + - "prefix_in_bucket = '{{ inventory_hostname }}'" become: true tags: - pageserver diff --git a/.github/ansible/staging.hosts b/.github/ansible/staging.hosts index 2bb28f1972..c470f8a814 100644 --- a/.github/ansible/staging.hosts +++ b/.github/ansible/staging.hosts @@ -2,6 +2,7 @@ #zenith-us-stage-ps-1 console_region_id=27 zenith-us-stage-ps-2 console_region_id=27 zenith-us-stage-ps-3 console_region_id=27 +zenith-us-stage-ps-4 console_region_id=27 [safekeepers] zenith-us-stage-sk-4 console_region_id=27 From d11cb4b2f115eb3be48f31926b952bbbbd21e6f7 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 19 Sep 2022 14:23:44 +0300 Subject: [PATCH 0791/1022] Bump vendor/postgres-v15 to the latest state of REL_15_STABLE_neon branch --- vendor/postgres-v15 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 7d144ae2f3..34c47d6c99 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 7d144ae2f3649570f60a0477993b8c8ad9dd8c4b +Subproject commit 34c47d6c99415c94296d5e599ec5590d0001d6c2 From 4b5e7f2f82aaa0c1427b42976a555d7c236ee5ad Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 19 Sep 2022 11:14:34 +0300 Subject: [PATCH 0792/1022] Temporarily disable storage deployments Do not update configs Do not restart servieces Still update binaries --- .github/ansible/deploy.yaml | 42 ++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index b47db6a9b5..6982445558 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -63,18 +63,18 @@ tags: - pageserver - - name: update remote storage (s3) config - lineinfile: - path: /storage/pageserver/data/pageserver.toml - line: "{{ item }}" - loop: - - "[remote_storage]" - - "bucket_name = '{{ bucket_name }}'" - - "bucket_region = '{{ bucket_region }}'" - - "prefix_in_bucket = '{{ 
inventory_hostname }}'" - become: true - tags: - - pageserver + # - name: update remote storage (s3) config + # lineinfile: + # path: /storage/pageserver/data/pageserver.toml + # line: "{{ item }}" + # loop: + # - "[remote_storage]" + # - "bucket_name = '{{ bucket_name }}'" + # - "bucket_region = '{{ bucket_region }}'" + # - "prefix_in_bucket = '{{ inventory_hostname }}'" + # become: true + # tags: + # - pageserver - name: upload systemd service definition ansible.builtin.template: @@ -87,15 +87,15 @@ tags: - pageserver - - name: start systemd service - ansible.builtin.systemd: - daemon_reload: yes - name: pageserver - enabled: yes - state: restarted - become: true - tags: - - pageserver + # - name: start systemd service + # ansible.builtin.systemd: + # daemon_reload: yes + # name: pageserver + # enabled: yes + # state: restarted + # become: true + # tags: + # - pageserver - name: post version to console when: console_mgmt_base_url is defined From fcb4a61a120ab29de19f8a0bbe64aa29bed5f194 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 19 Sep 2022 18:41:18 +0300 Subject: [PATCH 0793/1022] Adjust spans around gc and compaction So compaction and gc loops have their own span to always show tenant id in log messages. --- pageserver/src/page_service.rs | 3 +++ pageserver/src/tenant.rs | 6 +----- pageserver/src/tenant/timeline.rs | 9 +++++---- pageserver/src/tenant_tasks.rs | 20 +++++++++++-------- .../src/walreceiver/connection_manager.rs | 9 +++++++-- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b06814c557..1461a6d117 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1090,6 +1090,9 @@ impl postgres_backend_async::Handler for PageServerHandler { let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; + let _span_guard = + info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered(); + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; let gc_horizon: u64 = caps diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 41fd98ec07..f56f10d7ea 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -342,8 +342,7 @@ impl Tenant { drop(timelines); for (timeline_id, timeline) in &timelines_to_compact { - let _entered = - info_span!("compact", timeline = %timeline_id, tenant = %self.tenant_id).entered(); + let _entered = info_span!("compact_timeline", timeline = %timeline_id).entered(); timeline.compact()?; } @@ -835,9 +834,6 @@ impl Tenant { pitr: Duration, checkpoint_before_gc: bool, ) -> Result { - let _span_guard = - info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timeline_id) - .entered(); let mut totals: GcResult = Default::default(); let now = Instant::now(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 95bdf715b5..8670e979ee 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1916,18 +1916,19 @@ impl Timeline { let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let _enter = + info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff) + .entered(); + // Nothing to GC. Return early. 
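The span adjustments in this patch lean on two tracing idioms: an entered guard for synchronous sections and `Instrument` for futures. A small self-contained sketch of the difference; the function and field names here are illustrative, not the crate's actual code:

    use tracing::{info, info_span, Instrument};

    // Synchronous path: the span stays entered until the guard is dropped,
    // so every log line in this scope carries the tenant field.
    fn gc_once(tenant_id: &str) {
        let _entered = info_span!("gc_timeline", tenant = %tenant_id).entered();
        info!("collecting garbage");
    }

    // Async path: `instrument` attaches the span to the future itself, so the
    // fields are present on every poll, regardless of which task runs it.
    async fn gc_loop(tenant_id: String) {
        async move {
            info!("starting");
        }
        .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
        .await;
    }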
let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); if latest_gc_cutoff >= new_gc_cutoff { info!( - "Nothing to GC for timeline {}: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", - self.timeline_id + "Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", ); return Ok(result); } - let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered(); - // We need to ensure that no one tries to read page versions or create // branches at a point before latest_gc_cutoff_lsn. See branch_timeline() // for details. This will block until the old value is no longer in use. diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index c543a0ecb1..8329b15c08 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -21,7 +21,9 @@ pub fn start_background_loops(tenant_id: TenantId) { &format!("compactor for tenant {tenant_id}"), false, async move { - compaction_loop(tenant_id).await; + compaction_loop(tenant_id) + .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) + .await; Ok(()) }, ); @@ -33,7 +35,9 @@ pub fn start_background_loops(tenant_id: TenantId) { &format!("garbage collector for tenant {tenant_id}"), false, async move { - gc_loop(tenant_id).await; + gc_loop(tenant_id) + .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) + .await; Ok(()) }, ); @@ -44,7 +48,7 @@ pub fn start_background_loops(tenant_id: TenantId) { /// async fn compaction_loop(tenant_id: TenantId) { let wait_duration = Duration::from_secs(2); - info!("starting compaction loop for {tenant_id}"); + info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { loop { @@ -52,7 +56,7 @@ async fn compaction_loop(tenant_id: TenantId) { let tenant = tokio::select! { _ = task_mgr::shutdown_watcher() => { - info!("received compaction cancellation request"); + info!("received cancellation request"); return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { @@ -73,7 +77,7 @@ async fn compaction_loop(tenant_id: TenantId) { // Sleep tokio::select! { _ = task_mgr::shutdown_watcher() => { - info!("received compaction cancellation request during idling"); + info!("received cancellation request during idling"); break ; }, _ = tokio::time::sleep(sleep_duration) => {}, @@ -91,7 +95,7 @@ async fn compaction_loop(tenant_id: TenantId) { /// async fn gc_loop(tenant_id: TenantId) { let wait_duration = Duration::from_secs(2); - info!("starting gc loop for {tenant_id}"); + info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { loop { @@ -99,7 +103,7 @@ async fn gc_loop(tenant_id: TenantId) { let tenant = tokio::select! { _ = task_mgr::shutdown_watcher() => { - info!("received GC cancellation request"); + info!("received cancellation request"); return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { @@ -123,7 +127,7 @@ async fn gc_loop(tenant_id: TenantId) { // Sleep tokio::select! 
{ _ = task_mgr::shutdown_watcher() => { - info!("received GC cancellation request during idling"); + info!("received cancellation request during idling"); break; }, _ = tokio::time::sleep(sleep_duration) => {}, diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 1e4b4e7d52..799062e935 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -58,7 +58,10 @@ pub fn spawn_connection_manager_task( TaskKind::WalReceiverManager, Some(tenant_id), Some(timeline_id), - &format!("walreceiver for tenant {} timeline {}", timeline.tenant_id, timeline.timeline_id), + &format!( + "walreceiver for tenant {} timeline {}", + timeline.tenant_id, timeline.timeline_id + ), false, async move { info!("WAL receiver broker started, connecting to etcd"); @@ -88,7 +91,9 @@ pub fn spawn_connection_manager_task( } } } - .instrument(info_span!("wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id)) + .instrument( + info_span!("wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id), + ), ); Ok(()) } From 6985f6cd6c53ae96ad4afaaaf546f5d94c869d50 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 19 Sep 2022 20:56:11 +0300 Subject: [PATCH 0794/1022] Add a new benchmark data series for prefetching. Also run benchmarks with the seqscan prefetching (commit f44afbaf62) enabled. Renames the 'neon-captest' test to 'neon-captest-reuse', for clarity --- .github/workflows/benchmarking.yml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index df0e8a4275..9a9021ac37 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -144,9 +144,10 @@ jobs: strategy: fail-fast: false matrix: - # neon-captest: Run pgbench, reusing existing project - # neon-captest-new: Same, but on a freshly created project - platform: [ neon-captest, neon-captest-new, rds-aurora ] + # neon-captest-new: Run pgbench in a freshly created project + # neon-captest-reuse: Same, but reusing existing project + # neon-captest-prefetch: Same, with prefetching enabled (new project) + platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-aurora ] runs-on: dev container: @@ -164,7 +165,7 @@ jobs: sudo apt install -y postgresql-14 - name: Create Neon Project - if: matrix.platform == 'neon-captest-new' + if: matrix.platform != 'neon-captest-reuse' id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -175,17 +176,20 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest) + neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; neon-captest-new) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; + neon-captest-prefetch) + CONNSTR=${{ steps.create-neon-project.outputs.dsn }}?options=-cenable_seqscan_prefetch%3Don%20-cseqscan_prefetch_buffers%3D10 + ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest', 'neon-captest-new' or 'rds-aurora'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch' or 'rds-aurora'" exit 1 ;; esac @@ -246,7 +250,7 @@ jobs: build_type: ${{ env.BUILD_TYPE }} - name: Delete Neon Project - if: ${{ matrix.platform == 'neon-captest-new' && always() }} + if: ${{ matrix.platform != 'neon-captest-reuse' && always() }} uses: ./.github/actions/neon-project-delete with: environment: dev From bb3c66d86f6c91e05d72d52baedcb4ff32617c2e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 19 Sep 2022 23:28:51 +0100 Subject: [PATCH 0795/1022] github/workflows: Make publishing perf reports more configurable (#2440) --- .github/actions/neon-project-create/action.yml | 1 + .github/actions/run-python-test-set/action.yml | 14 +++++--------- .github/workflows/benchmarking.yml | 10 +++++++--- .github/workflows/build_and_test.yml | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index ba81afaaff..2f58ae77ad 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -60,6 +60,7 @@ runs: --header "Authorization: Bearer ${API_KEY}" \ --data "{ \"project\": { + \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", \"platform_id\": \"aws\", \"region_id\": \"${REGION_ID}\", \"settings\": { } diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 4c18641938..e69cb28df1 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -112,10 +112,8 @@ runs: fi if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then - if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then - mkdir -p "$PERF_REPORT_DIR" - EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" - fi + mkdir -p "$PERF_REPORT_DIR" + EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" fi if [[ "${{ inputs.build_type }}" == "debug" ]]; then @@ -150,11 +148,9 @@ runs: -rA $TEST_SELECTION $EXTRA_PARAMS if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then - if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then - export REPORT_FROM="$PERF_REPORT_DIR" - export REPORT_TO="$PLATFORM" - scripts/generate_and_push_perf_report.sh - fi + export REPORT_FROM="$PERF_REPORT_DIR" + export REPORT_TO="$PLATFORM" + scripts/generate_and_push_perf_report.sh fi - name: Create Allure report diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 9a9021ac37..0430f0b9c0 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -19,8 +19,12 @@ on: description: 'Environment to run remote tests on (dev or staging)' required: false region_id: - description: 'Use a particular region. If empty the default one will be used' - false: true + description: 'Use a particular region. If not set the default region will be used' + required: false + save_perf_report: + type: boolean + description: 'Publish perf report or not. 
If not set, the report is published only for the main branch' + required: false defaults: run: @@ -139,7 +143,7 @@ jobs: POSTGRES_DISTRIB_DIR: /usr TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: true + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} strategy: fail-fast: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5bff469582..0b6cb21120 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -292,7 +292,7 @@ jobs: build_type: ${{ matrix.build_type }} test_selection: performance run_in_parallel: false - save_perf_report: true + save_perf_report: ${{ github.ref == 'refs/heads/main' }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" From e4f775436f534e8de49d0cb5a2c955e73ac6f03e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 19 Sep 2022 23:52:21 +0300 Subject: [PATCH 0796/1022] Don't override other options than statement_timeout in test conn string. In commit 6985f6cd6c, I tried passing extra GUCs in the 'options' part of the connection string, but it didn't work because the pgbench test overrode it with the statement_timeout. Change it so that it adds the statement_timeout to any other options, instead of replacing them. --- test_runner/performance/test_perf_pgbench.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 2a2213b783..d9bf237e49 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -84,9 +84,8 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P if workload_type == PgBenchLoadType.INIT: # Run initialize - init_pgbench( - env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options="-cstatement_timeout=1h")] - ) + options = "-cstatement_timeout=1h " + env.pg.default_options["options"] + init_pgbench(env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options=options)]) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: # Run simple-update workload From 566e816298a201c9150f0c42846949296997d74d Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 20 Sep 2022 09:42:39 +0200 Subject: [PATCH 0797/1022] Refactor safekeeper timelines handling (#2329) See https://github.com/neondatabase/neon/pull/2329 for details --- Cargo.lock | 2 + libs/utils/src/postgres_backend.rs | 18 +- pageserver/src/lib.rs | 2 +- safekeeper/Cargo.toml | 2 + safekeeper/src/bin/safekeeper.rs | 6 +- safekeeper/src/broker.rs | 42 +- safekeeper/src/control_file.rs | 73 ++- safekeeper/src/handler.rs | 67 +-- safekeeper/src/http/routes.rs | 62 ++- safekeeper/src/json_ctrl.rs | 61 +-- safekeeper/src/lib.rs | 9 +- safekeeper/src/metrics.rs | 19 +- safekeeper/src/receive_wal.rs | 27 +- safekeeper/src/remove_wal.rs | 21 +- safekeeper/src/safekeeper.rs | 141 +++--- safekeeper/src/send_wal.rs | 20 +- safekeeper/src/timeline.rs | 665 ++++++++++++------------- safekeeper/src/timelines_global_map.rs | 348 +++++++++++++ safekeeper/src/wal_backup.rs | 72 +-- safekeeper/src/wal_storage.rs | 249 ++++----- 20 files changed, 1097 insertions(+), 809 deletions(-) create mode 100644 safekeeper/src/timelines_global_map.rs diff --git a/Cargo.lock b/Cargo.lock index ca169dc0c8..2f4a57b698 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2723,6 +2723,7 @@ dependencies = [ 
"hyper", "metrics", "once_cell", + "parking_lot 0.12.1", "postgres", "postgres-protocol", "postgres_ffi", @@ -2733,6 +2734,7 @@ dependencies = [ "serde_with", "signal-hook", "tempfile", + "thiserror", "tokio", "tokio-postgres", "toml_edit", diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 0498e0887b..adee46c2dd 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -429,8 +429,22 @@ impl PostgresBackend { // full cause of the error, not just the top-level context + its trace. // We don't want to send that in the ErrorResponse though, // because it's not relevant to the compute node logs. - error!("query handler for '{}' failed: {:?}", query_string, e); - self.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?; + // + // We also don't want to log full stacktrace when the error is primitive, + // such as usual connection closed. + let short_error = format!("{:#}", e); + let root_cause = e.root_cause().to_string(); + if root_cause.contains("connection closed unexpectedly") + || root_cause.contains("Broken pipe (os error 32)") + { + error!( + "query handler for '{}' failed: {}", + query_string, short_error + ); + } else { + error!("query handler for '{}' failed: {:?}", query_string, e); + } + self.write_message_noflush(&BeMessage::ErrorResponse(&short_error))?; // TODO: untangle convoluted control flow if e.to_string().contains("failed to run") { return Ok(ProcessMsgResult::Break); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 959942aa12..acd37161a0 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -105,7 +105,7 @@ fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds } /// A newtype to store arbitrary data grouped by tenant and timeline ids. -/// One could use [`utils::zid::TenantTimelineId`] for grouping, but that would +/// One could use [`utils::id::TenantTimelineId`] for grouping, but that would /// not include the cases where a certain tenant has zero timelines. /// This is sometimes important: a tenant could be registered during initial load from FS, /// even if he has no timelines on disk. diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index cae095c3c2..87ee63d1df 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -30,6 +30,8 @@ git-version = "0.3.5" async-trait = "0.1" once_cell = "1.13.0" toml_edit = { version = "0.13", features = ["easy"] } +thiserror = "1" +parking_lot = "0.12.1" postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index d518ac01cc..7726f25a2d 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -24,9 +24,9 @@ use safekeeper::defaults::{ }; use safekeeper::http; use safekeeper::remove_wal; -use safekeeper::timeline::GlobalTimelines; use safekeeper::wal_backup; use safekeeper::wal_service; +use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; use utils::auth::JwtAuth; use utils::{ @@ -298,7 +298,9 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); - GlobalTimelines::init(wal_backup_launcher_tx); + + // Load all timelines from disk to memory. 
+ GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?; let conf_ = conf.clone(); threads.push( diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index f276fad613..6a2456ecda 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -10,6 +10,7 @@ use etcd_broker::LeaseKeeper; use std::collections::hash_map::Entry; use std::collections::HashMap; +use std::collections::HashSet; use std::time::Duration; use tokio::spawn; use tokio::task::JoinHandle; @@ -17,7 +18,8 @@ use tokio::{runtime, time::sleep}; use tracing::*; use url::Url; -use crate::{timeline::GlobalTimelines, SafeKeeperConf}; +use crate::GlobalTimelines; +use crate::SafeKeeperConf; use etcd_broker::{ subscription_key::{OperationKind, SkOperationKind, SubscriptionKey}, Client, PutOptions, @@ -45,12 +47,12 @@ pub fn thread_main(conf: SafeKeeperConf) { /// Key to per timeline per safekeeper data. fn timeline_safekeeper_path( broker_etcd_prefix: String, - zttid: TenantTimelineId, + ttid: TenantTimelineId, sk_id: NodeId, ) -> String { format!( "{}/{sk_id}", - SubscriptionKey::sk_timeline_info(broker_etcd_prefix, zttid).watch_key() + SubscriptionKey::sk_timeline_info(broker_etcd_prefix, ttid).watch_key() ) } @@ -162,7 +164,7 @@ pub fn get_candiate_name(system_id: NodeId) -> String { } async fn push_sk_info( - zttid: TenantTimelineId, + ttid: TenantTimelineId, mut client: Client, key: String, sk_info: SkTimelineInfo, @@ -190,7 +192,7 @@ async fn push_sk_info( .await .context("failed to receive LeaseKeepAliveResponse")?; - Ok((zttid, lease)) + Ok((ttid, lease)) } struct Lease { @@ -210,11 +212,15 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // is under plain mutex. That's ok, all this code is not performance // sensitive and there is no risk of deadlock as we don't await while // lock is held. - let active_tlis = GlobalTimelines::get_active_timelines(); + let mut active_tlis = GlobalTimelines::get_all(); + active_tlis.retain(|tli| tli.is_active()); + + let active_tlis_set: HashSet = + active_tlis.iter().map(|tli| tli.ttid).collect(); // // Get and maintain (if not yet) per timeline lease to automatically delete obsolete data. - for zttid in active_tlis.iter() { - if let Entry::Vacant(v) = leases.entry(*zttid) { + for tli in &active_tlis { + if let Entry::Vacant(v) = leases.entry(tli.ttid) { let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?; v.insert(Lease { @@ -224,30 +230,26 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { }); } } - leases.retain(|zttid, _| active_tlis.contains(zttid)); + leases.retain(|ttid, _| active_tlis_set.contains(ttid)); // Push data concurrently to not suffer from latency, with many timelines it can be slow. 
let handles = active_tlis .iter() - .filter_map(|zttid| GlobalTimelines::get_loaded(*zttid)) .map(|tli| { let sk_info = tli.get_public_info(&conf); - let key = timeline_safekeeper_path( - conf.broker_etcd_prefix.clone(), - tli.zttid, - conf.my_id, - ); - let lease = leases.remove(&tli.zttid).unwrap(); - tokio::spawn(push_sk_info(tli.zttid, client.clone(), key, sk_info, lease)) + let key = + timeline_safekeeper_path(conf.broker_etcd_prefix.clone(), tli.ttid, conf.my_id); + let lease = leases.remove(&tli.ttid).unwrap(); + tokio::spawn(push_sk_info(tli.ttid, client.clone(), key, sk_info, lease)) }) .collect::>(); for h in handles { - let (zttid, lease) = h.await??; + let (ttid, lease) = h.await??; // It is ugly to pull leases from hash and then put it back, but // otherwise we have to resort to long living per tli tasks (which // would generate a lot of errors when etcd is down) as task wants to // have 'static objects, we can't borrow to it. - leases.insert(zttid, lease); + leases.insert(ttid, lease); } sleep(push_interval).await; @@ -279,7 +281,7 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { match subscription.value_updates.recv().await { Some(new_info) => { // note: there are blocking operations below, but it's considered fine for now - if let Ok(tli) = GlobalTimelines::get(&conf, new_info.key.id, false) { + if let Ok(tli) = GlobalTimelines::get(new_info.key.id) { tli.record_safekeeper_info(&new_info.value, new_info.key.node_id) .await? } diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index ff23f0360f..22ed34cc00 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -9,8 +9,6 @@ use std::io::{Read, Write}; use std::ops::Deref; use std::path::{Path, PathBuf}; -use tracing::*; - use crate::control_file_upgrade::upgrade_control_file; use crate::safekeeper::{SafeKeeperState, SK_FORMAT_VERSION, SK_MAGIC}; use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; @@ -55,12 +53,13 @@ pub struct FileStorage { } impl FileStorage { - pub fn restore_new(zttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { - let timeline_dir = conf.timeline_dir(zttid); - let tenant_id = zttid.tenant_id.to_string(); - let timeline_id = zttid.timeline_id.to_string(); + /// Initialize storage by loading state from disk. + pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { + let timeline_dir = conf.timeline_dir(ttid); + let tenant_id = ttid.tenant_id.to_string(); + let timeline_id = ttid.timeline_id.to_string(); - let state = Self::load_control_file_conf(conf, zttid)?; + let state = Self::load_control_file_conf(conf, ttid)?; Ok(FileStorage { timeline_dir, @@ -71,28 +70,28 @@ impl FileStorage { }) } + /// Create file storage for a new timeline, but don't persist it yet. 
pub fn create_new( - zttid: &TenantTimelineId, + ttid: &TenantTimelineId, conf: &SafeKeeperConf, state: SafeKeeperState, ) -> Result { - let timeline_dir = conf.timeline_dir(zttid); - let tenant_id = zttid.tenant_id.to_string(); - let timeline_id = zttid.timeline_id.to_string(); + let timeline_dir = conf.timeline_dir(ttid); + let tenant_id = ttid.tenant_id.to_string(); + let timeline_id = ttid.timeline_id.to_string(); - let mut store = FileStorage { + let store = FileStorage { timeline_dir, conf: conf.clone(), persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS .with_label_values(&[&tenant_id, &timeline_id]), - state: state.clone(), + state, }; - store.persist(&state)?; Ok(store) } - // Check the magic/version in the on-disk data and deserialize it, if possible. + /// Check the magic/version in the on-disk data and deserialize it, if possible. fn deser_sk_state(buf: &mut &[u8]) -> Result { // Read the version independent part let magic = buf.read_u32::()?; @@ -112,23 +111,17 @@ impl FileStorage { upgrade_control_file(buf, version) } - // Load control file for given zttid at path specified by conf. + /// Load control file for given ttid at path specified by conf. pub fn load_control_file_conf( conf: &SafeKeeperConf, - zttid: &TenantTimelineId, + ttid: &TenantTimelineId, ) -> Result { - let path = conf.timeline_dir(zttid).join(CONTROL_FILE_NAME); + let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME); Self::load_control_file(path) } /// Read in the control file. - /// If create=false and file doesn't exist, bails out. pub fn load_control_file>(control_file_path: P) -> Result { - info!( - "loading control file {}", - control_file_path.as_ref().display(), - ); - let mut control_file = OpenOptions::new() .read(true) .write(true) @@ -179,8 +172,8 @@ impl Deref for FileStorage { } impl Storage for FileStorage { - // persists state durably to underlying storage - // for description see https://lwn.net/Articles/457667/ + /// persists state durably to underlying storage + /// for description see https://lwn.net/Articles/457667/ fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { let _timer = &self.persist_control_file_seconds.start_timer(); @@ -264,57 +257,57 @@ mod test { fn load_from_control_file( conf: &SafeKeeperConf, - zttid: &TenantTimelineId, + ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); + fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir"); Ok(( - FileStorage::restore_new(zttid, conf)?, - FileStorage::load_control_file_conf(conf, zttid)?, + FileStorage::restore_new(ttid, conf)?, + FileStorage::load_control_file_conf(conf, ttid)?, )) } fn create( conf: &SafeKeeperConf, - zttid: &TenantTimelineId, + ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); + fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); - let storage = FileStorage::create_new(zttid, conf, state.clone())?; + let storage = FileStorage::create_new(ttid, conf, state.clone())?; Ok((storage, state)) } #[test] fn test_read_write_safekeeper_state() { let conf = stub_conf(); - let zttid = TenantTimelineId::generate(); + let ttid = TenantTimelineId::generate(); { - let (mut storage, mut state) = create(&conf, &zttid).expect("failed to create state"); + let (mut storage, mut state) = 
create(&conf, &ttid).expect("failed to create state"); // change something state.commit_lsn = Lsn(42); storage.persist(&state).expect("failed to persist state"); } - let (_, state) = load_from_control_file(&conf, &zttid).expect("failed to read state"); + let (_, state) = load_from_control_file(&conf, &ttid).expect("failed to read state"); assert_eq!(state.commit_lsn, Lsn(42)); } #[test] fn test_safekeeper_state_checksum_mismatch() { let conf = stub_conf(); - let zttid = TenantTimelineId::generate(); + let ttid = TenantTimelineId::generate(); { - let (mut storage, mut state) = create(&conf, &zttid).expect("failed to read state"); + let (mut storage, mut state) = create(&conf, &ttid).expect("failed to read state"); // change something state.commit_lsn = Lsn(42); storage.persist(&state).expect("failed to persist state"); } - let control_path = conf.timeline_dir(&zttid).join(CONTROL_FILE_NAME); + let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME); let mut data = fs::read(&control_path).unwrap(); data[0] += 1; // change the first byte of the file to fail checksum validation fs::write(&control_path, &data).expect("failed to write control file"); - match load_from_control_file(&conf, &zttid) { + match load_from_control_file(&conf, &ttid) { Err(err) => assert!(err .to_string() .contains("safekeeper control file checksum mismatch")), diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index ad2c0ec8bf..ca887399e1 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -3,15 +3,15 @@ use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; use crate::receive_wal::ReceiveWalConn; -use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage}; + use crate::send_wal::ReplicationConn; -use crate::timeline::{Timeline, TimelineTools}; -use crate::SafeKeeperConf; + +use crate::{GlobalTimelines, SafeKeeperConf}; use anyhow::{bail, Context, Result}; use postgres_ffi::PG_TLI; use regex::Regex; -use std::sync::Arc; + use tracing::info; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, @@ -27,7 +27,7 @@ pub struct SafekeeperPostgresHandler { pub appname: Option, pub tenant_id: Option, pub timeline_id: Option, - pub timeline: Option>, + pub ttid: TenantTimelineId, } /// Parsed Postgres command. @@ -101,30 +101,21 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { query_string, self.timeline_id ); - let create = !(matches!(cmd, SafekeeperPostgresCommand::StartReplication { .. 
}) - || matches!(cmd, SafekeeperPostgresCommand::IdentifySystem)); - - let tenant_id = self.tenant_id.context("tenant_id is required")?; - let timeline_id = self.timeline_id.context("timeline_id is required")?; - if self.timeline.is_none() { - self.timeline.set( - &self.conf, - TenantTimelineId::new(tenant_id, timeline_id), - create, - )?; - } + let tenant_id = self.tenant_id.context("tenantid is required")?; + let timeline_id = self.timeline_id.context("timelineid is required")?; + self.ttid = TenantTimelineId::new(tenant_id, timeline_id); match cmd { - SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb) - .run(self) - .context("failed to run ReceiveWalConn"), - SafekeeperPostgresCommand::StartReplication { start_lsn } => ReplicationConn::new(pgb) - .run(self, pgb, start_lsn) - .context("failed to run ReplicationConn"), + SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self), + SafekeeperPostgresCommand::StartReplication { start_lsn } => { + ReplicationConn::new(pgb).run(self, pgb, start_lsn) + } SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), } - .context(format!("timeline {timeline_id}"))?; + .context(format!( + "Failed to process query for timeline {timeline_id}" + ))?; Ok(()) } @@ -137,42 +128,26 @@ impl SafekeeperPostgresHandler { appname: None, tenant_id: None, timeline_id: None, - timeline: None, + ttid: TenantTimelineId::empty(), } } - /// Shortcut for calling `process_msg` in the timeline. - pub fn process_safekeeper_msg( - &self, - msg: &ProposerAcceptorMessage, - ) -> Result> { - self.timeline - .get() - .process_msg(msg) - .context("failed to process ProposerAcceptorMessage") - } - /// /// Handle IDENTIFY_SYSTEM replication command /// fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> { + let tli = GlobalTimelines::get(self.ttid)?; + let lsn = if self.is_walproposer_recovery() { // walproposer should get all local WAL until flush_lsn - self.timeline.get().get_end_of_wal() + tli.get_flush_lsn() } else { // other clients shouldn't get any uncommitted WAL - self.timeline.get().get_state().0.commit_lsn + tli.get_state().0.commit_lsn } .to_string(); - let sysid = self - .timeline - .get() - .get_state() - .1 - .server - .system_id - .to_string(); + let sysid = tli.get_state().1.server.system_id.to_string(); let lsn_bytes = lsn.as_bytes(); let tli = PG_TLI.to_string(); let tli_bytes = tli.as_bytes(); diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 14c9414c09..244325368b 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,3 +1,4 @@ +use anyhow::anyhow; use hyper::{Body, Request, Response, StatusCode, Uri}; use once_cell::sync::Lazy; @@ -9,7 +10,9 @@ use std::sync::Arc; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; -use crate::timeline::{GlobalTimelines, TimelineDeleteForceResult}; + +use crate::timelines_global_map::TimelineDeleteForceResult; +use crate::GlobalTimelines; use crate::SafeKeeperConf; use etcd_broker::subscription_value::SkTimelineInfo; use utils::{ @@ -90,15 +93,15 @@ struct TimelineStatus { /// Report info about timeline. 
async fn timeline_status_handler(request: Request) -> Result, ApiError> { - let zttid = TenantTimelineId::new( + let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); - check_permission(&request, Some(zttid.tenant_id))?; + check_permission(&request, Some(ttid.tenant_id))?; - let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?; + let tli = GlobalTimelines::get(ttid)?; let (inmem, state) = tli.get_state(); - let flush_lsn = tli.get_end_of_wal(); + let flush_lsn = tli.get_flush_lsn(); let acc_state = AcceptorStateStatus { term: state.acceptor_state.term, @@ -108,8 +111,8 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { let request_data: TimelineCreateRequest = json_request(&mut request).await?; - let zttid = TenantTimelineId { + let ttid = TenantTimelineId { tenant_id: parse_request_param(&request, "tenant_id")?, timeline_id: request_data.timeline_id, }; - check_permission(&request, Some(zttid.tenant_id))?; - GlobalTimelines::create(get_conf(&request), zttid, request_data.peer_ids) - .map_err(ApiError::from_err)?; + check_permission(&request, Some(ttid.tenant_id))?; - json_response(StatusCode::CREATED, ()) + Err(ApiError::from_err(anyhow!("not implemented"))) } /// Deactivates the timeline and removes its data directory. -/// -/// It does not try to stop any processing of the timeline; there is no such code at the time of writing. -/// However, it tries to check whether the timeline was active and report it to caller just in case. -/// Note that this information is inaccurate: -/// 1. There is a race condition between checking the timeline for activity and actual directory deletion. -/// 2. At the time of writing Safekeeper rarely marks a timeline inactive. E.g. disconnecting the compute node does nothing. async fn timeline_delete_force_handler( mut request: Request, ) -> Result, ApiError> { - let zttid = TenantTimelineId::new( + let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); - check_permission(&request, Some(zttid.tenant_id))?; + check_permission(&request, Some(ttid.tenant_id))?; ensure_no_body(&mut request).await?; - json_response( - StatusCode::OK, - GlobalTimelines::delete_force(get_conf(&request), &zttid) - .await - .map_err(ApiError::from_err)?, - ) + let resp = tokio::task::spawn_blocking(move || GlobalTimelines::delete_force(&ttid)) + .await + .map_err(ApiError::from_err)??; + json_response(StatusCode::OK, resp) } /// Deactivates all timelines for the tenant and removes its data directory. @@ -168,27 +161,30 @@ async fn tenant_delete_force_handler( let tenant_id = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; ensure_no_body(&mut request).await?; + let delete_info = tokio::task::spawn_blocking(move || { + GlobalTimelines::delete_force_all_for_tenant(&tenant_id) + }) + .await + .map_err(ApiError::from_err)??; json_response( StatusCode::OK, - GlobalTimelines::delete_force_all_for_tenant(get_conf(&request), &tenant_id) - .await - .map_err(ApiError::from_err)? + delete_info .iter() - .map(|(zttid, resp)| (format!("{}", zttid.timeline_id), *resp)) + .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp)) .collect::>(), ) } /// Used only in tests to hand craft required data. 
async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { - let zttid = TenantTimelineId::new( + let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); - check_permission(&request, Some(zttid.tenant_id))?; + check_permission(&request, Some(ttid.tenant_id))?; let safekeeper_info: SkTimelineInfo = json_request(&mut request).await?; - let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?; + let tli = GlobalTimelines::get(ttid)?; tli.record_safekeeper_info(&safekeeper_info, NodeId(1)) .await?; diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 00fc43521b..2456eb0752 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -6,18 +6,22 @@ //! modifications in tests. //! +use std::sync::Arc; + use anyhow::Result; use bytes::Bytes; use serde::{Deserialize, Serialize}; use tracing::*; +use utils::id::TenantTimelineId; use crate::handler::SafekeeperPostgresHandler; -use crate::safekeeper::{AcceptorProposerMessage, AppendResponse}; +use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo}; use crate::safekeeper::{ - AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, ProposerGreeting, + AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, }; use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; -use crate::timeline::TimelineTools; +use crate::timeline::Timeline; +use crate::GlobalTimelines; use postgres_ffi::v14::xlog_utils; use postgres_ffi::WAL_SEGMENT_SIZE; use utils::{ @@ -57,23 +61,23 @@ struct AppendResult { /// content, and then append it with specified term and lsn. This /// function is used to test safekeepers in different scenarios. pub fn handle_json_ctrl( - spg: &mut SafekeeperPostgresHandler, + spg: &SafekeeperPostgresHandler, pgb: &mut PostgresBackend, append_request: &AppendLogicalMessage, ) -> Result<()> { info!("JSON_CTRL request: {:?}", append_request); // need to init safekeeper state before AppendRequest - prepare_safekeeper(spg)?; + let tli = prepare_safekeeper(spg.ttid)?; // if send_proposer_elected is true, we need to update local history if append_request.send_proposer_elected { - send_proposer_elected(spg, append_request.term, append_request.epoch_start_lsn)?; + send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn)?; } - let inserted_wal = append_logical_message(spg, append_request)?; + let inserted_wal = append_logical_message(&tli, append_request)?; let response = AppendResult { - state: spg.timeline.get().get_state().1, + state: tli.get_state().1, inserted_wal, }; let response_data = serde_json::to_vec(&response)?; @@ -91,28 +95,20 @@ pub fn handle_json_ctrl( /// Prepare safekeeper to process append requests without crashes, /// by sending ProposerGreeting with default server.wal_seg_size. 
-fn prepare_safekeeper(spg: &mut SafekeeperPostgresHandler) -> Result<()> { - let greeting_request = ProposerAcceptorMessage::Greeting(ProposerGreeting { - protocol_version: 2, // current protocol - pg_version: 0, // unknown - proposer_id: [0u8; 16], - system_id: 0, - timeline_id: spg.timeline_id.unwrap(), - tenant_id: spg.tenant_id.unwrap(), - tli: 0, - wal_seg_size: WAL_SEGMENT_SIZE as u32, // 16MB, default for tests - }); - - let response = spg.timeline.get().process_msg(&greeting_request)?; - match response { - Some(AcceptorProposerMessage::Greeting(_)) => Ok(()), - _ => anyhow::bail!("not GreetingResponse"), - } +fn prepare_safekeeper(ttid: TenantTimelineId) -> Result> { + GlobalTimelines::create( + ttid, + ServerInfo { + pg_version: 0, // unknown + wal_seg_size: WAL_SEGMENT_SIZE as u32, + system_id: 0, + }, + ) } -fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: Lsn) -> Result<()> { +fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> Result<()> { // add new term to existing history - let history = spg.timeline.get().get_state().1.acceptor_state.term_history; + let history = tli.get_state().1.acceptor_state.term_history; let history = history.up_to(lsn.checked_sub(1u64).unwrap()); let mut history_entries = history.0; history_entries.push(TermSwitchEntry { term, lsn }); @@ -125,7 +121,7 @@ fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: L timeline_start_lsn: lsn, }); - spg.timeline.get().process_msg(&proposer_elected_request)?; + tli.process_msg(&proposer_elected_request)?; Ok(()) } @@ -138,12 +134,9 @@ struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. -fn append_logical_message( - spg: &mut SafekeeperPostgresHandler, - msg: &AppendLogicalMessage, -) -> Result { +fn append_logical_message(tli: &Arc, msg: &AppendLogicalMessage) -> Result { let wal_data = xlog_utils::encode_logical_message(&msg.lm_prefix, &msg.lm_message); - let sk_state = spg.timeline.get().get_state().1; + let sk_state = tli.get_state().1; let begin_lsn = msg.begin_lsn; let end_lsn = begin_lsn + wal_data.len() as u64; @@ -167,7 +160,7 @@ fn append_logical_message( wal_data: Bytes::from(wal_data), }); - let response = spg.timeline.get().process_msg(&append_request)?; + let response = tli.process_msg(&append_request)?; let append_response = match response { Some(AcceptorProposerMessage::AppendResponse(resp)) => resp, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index b466d5aab5..58a237a5d3 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -23,6 +23,9 @@ pub mod wal_backup; pub mod wal_service; pub mod wal_storage; +mod timelines_global_map; +pub use timelines_global_map::GlobalTimelines; + pub mod defaults { use const_format::formatcp; use std::time::Duration; @@ -65,9 +68,9 @@ impl SafeKeeperConf { self.workdir.join(tenant_id.to_string()) } - pub fn timeline_dir(&self, zttid: &TenantTimelineId) -> PathBuf { - self.tenant_dir(&zttid.tenant_id) - .join(zttid.timeline_id.to_string()) + pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> PathBuf { + self.tenant_dir(&ttid.tenant_id) + .join(ttid.timeline_id.to_string()) } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 3fa3916266..851a568aec 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -12,11 +12,12 @@ use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ safekeeper::{SafeKeeperState, SafekeeperMemState}, - 
timeline::{GlobalTimelines, ReplicaState}, + timeline::ReplicaState, + GlobalTimelines, }; pub struct FullTimelineInfo { - pub zttid: TenantTimelineId, + pub ttid: TenantTimelineId, pub replicas: Vec, pub wal_backup_active: bool, pub timeline_is_active: bool, @@ -235,11 +236,17 @@ impl Collector for TimelineCollector { self.disk_usage.reset(); self.acceptor_term.reset(); - let timelines = GlobalTimelines::active_timelines_metrics(); + let timelines = GlobalTimelines::get_all(); - for tli in timelines { - let tenant_id = tli.zttid.tenant_id.to_string(); - let timeline_id = tli.zttid.timeline_id.to_string(); + for arc_tli in timelines { + let tli = arc_tli.info_for_metrics(); + if tli.is_none() { + continue; + } + let tli = tli.unwrap(); + + let tenant_id = tli.ttid.tenant_id.to_string(); + let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; let mut most_advanced: Option = None; diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index b0b6a73621..e28caa2f19 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -7,7 +7,9 @@ use anyhow::{anyhow, bail, Result}; use bytes::BytesMut; use tracing::*; +use crate::safekeeper::ServerInfo; use crate::timeline::Timeline; +use crate::GlobalTimelines; use std::net::SocketAddr; use std::sync::mpsc::channel; @@ -20,7 +22,6 @@ use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; use crate::handler::SafekeeperPostgresHandler; -use crate::timeline::TimelineTools; use utils::{ postgres_backend::PostgresBackend, pq_proto::{BeMessage, FeMessage}, @@ -67,15 +68,21 @@ impl<'pg> ReceiveWalConn<'pg> { // Receive information about server let next_msg = poll_reader.recv_msg()?; - match next_msg { + let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( "start handshake with wal proposer {} sysid {} timeline {}", self.peer_addr, greeting.system_id, greeting.tli, ); + let server_info = ServerInfo { + pg_version: greeting.pg_version, + system_id: greeting.system_id, + wal_seg_size: greeting.wal_seg_size, + }; + GlobalTimelines::create(spg.ttid, server_info)? } _ => bail!("unexpected message {:?} instead of greeting", next_msg), - } + }; let mut next_msg = Some(next_msg); @@ -88,7 +95,7 @@ impl<'pg> ReceiveWalConn<'pg> { while let Some(ProposerAcceptorMessage::AppendRequest(append_request)) = next_msg { let msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); - let reply = spg.process_safekeeper_msg(&msg)?; + let reply = tli.process_msg(&msg)?; if let Some(reply) = reply { self.write_msg(&reply)?; } @@ -97,13 +104,13 @@ impl<'pg> ReceiveWalConn<'pg> { } // flush all written WAL to the disk - let reply = spg.process_safekeeper_msg(&ProposerAcceptorMessage::FlushWAL)?; + let reply = tli.process_msg(&ProposerAcceptorMessage::FlushWAL)?; if let Some(reply) = reply { self.write_msg(&reply)?; } } else if let Some(msg) = next_msg.take() { // process other message - let reply = spg.process_safekeeper_msg(&msg)?; + let reply = tli.process_msg(&msg)?; if let Some(reply) = reply { self.write_msg(&reply)?; } @@ -112,9 +119,9 @@ impl<'pg> ReceiveWalConn<'pg> { // Register the connection and defer unregister. Do that only // after processing first message, as it sets wal_seg_size, // wanted by many. 
- spg.timeline.get().on_compute_connect()?; + tli.on_compute_connect()?; _guard = Some(ComputeConnectionGuard { - timeline: Arc::clone(spg.timeline.get()), + timeline: Arc::clone(&tli), }); first_time_through = false; } @@ -190,6 +197,8 @@ struct ComputeConnectionGuard { impl Drop for ComputeConnectionGuard { fn drop(&mut self) { - self.timeline.on_compute_disconnect().unwrap(); + if let Err(e) = self.timeline.on_compute_disconnect() { + error!("failed to unregister compute connection: {}", e); + } } } diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 004c0243f9..b6d497f34e 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -4,20 +4,21 @@ use std::{thread, time::Duration}; use tracing::*; -use crate::{timeline::GlobalTimelines, SafeKeeperConf}; +use crate::{GlobalTimelines, SafeKeeperConf}; pub fn thread_main(conf: SafeKeeperConf) { let wal_removal_interval = Duration::from_millis(5000); loop { - let active_tlis = GlobalTimelines::get_active_timelines(); - for zttid in &active_tlis { - if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) { - if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { - warn!( - "failed to remove WAL for tenant {} timeline {}: {}", - tli.zttid.tenant_id, tli.zttid.timeline_id, e - ); - } + let tlis = GlobalTimelines::get_all(); + for tli in &tlis { + if !tli.is_active() { + continue; + } + let ttid = tli.ttid; + let _enter = + info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id).entered(); + if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { + warn!("failed to remove WAL: {}", e); } } thread::sleep(wal_removal_interval) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index fa045eed90..d34a77e02b 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -218,19 +218,19 @@ pub struct SafekeeperMemState { } impl SafeKeeperState { - pub fn new(zttid: &TenantTimelineId, peers: Vec) -> SafeKeeperState { + pub fn new( + ttid: &TenantTimelineId, + server_info: ServerInfo, + peers: Vec, + ) -> SafeKeeperState { SafeKeeperState { - tenant_id: zttid.tenant_id, - timeline_id: zttid.timeline_id, + tenant_id: ttid.tenant_id, + timeline_id: ttid.timeline_id, acceptor_state: AcceptorState { term: 0, term_history: TermHistory::empty(), }, - server: ServerInfo { - pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ - system_id: 0, /* Postgres system identifier */ - wal_seg_size: 0, - }, + server: server_info, proposer_uuid: [0; 16], timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), @@ -244,7 +244,15 @@ impl SafeKeeperState { #[cfg(test)] pub fn empty() -> Self { - SafeKeeperState::new(&TenantTimelineId::empty(), vec![]) + SafeKeeperState::new( + &TenantTimelineId::empty(), + ServerInfo { + pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ + system_id: 0, /* Postgres system identifier */ + wal_seg_size: 0, + }, + vec![], + ) } } @@ -479,8 +487,12 @@ impl AcceptorProposerMessage { } } -/// SafeKeeper which consumes events (messages from compute) and provides -/// replies. +/// Safekeeper implements consensus to reliably persist WAL across nodes. +/// It controls all WAL disk writes and updates of control file. +/// +/// Currently safekeeper processes: +/// - messages from compute (proposers) and provides replies +/// - messages from broker peers pub struct SafeKeeper { /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn. 
/// Note: be careful to set only if we are sure our WAL (term history) matches @@ -503,20 +515,20 @@ where CTRL: control_file::Storage, WAL: wal_storage::Storage, { - // constructor - pub fn new( - timeline_id: TimelineId, - state: CTRL, - mut wal_store: WAL, - node_id: NodeId, - ) -> Result> { - if state.timeline_id != TimelineId::from([0u8; 16]) && timeline_id != state.timeline_id { - bail!("Calling SafeKeeper::new with inconsistent timeline_id ({}) and SafeKeeperState.server.timeline_id ({})", timeline_id, state.timeline_id); + /// Accepts a control file storage containing the safekeeper state. + /// State must be initialized, i.e. contain filled `tenant_id`, `timeline_id` + /// and `server` (`wal_seg_size` inside it) fields. + pub fn new(state: CTRL, wal_store: WAL, node_id: NodeId) -> Result> { + if state.tenant_id == TenantId::from([0u8; 16]) + || state.timeline_id == TimelineId::from([0u8; 16]) + { + bail!( + "Calling SafeKeeper::new with empty tenant_id ({}) or timeline_id ({})", + state.tenant_id, + state.timeline_id + ); } - // initialize wal_store, if state is already initialized - wal_store.init_storage(&state)?; - Ok(SafeKeeper { global_commit_lsn: state.commit_lsn, epoch_start_lsn: Lsn(0), @@ -574,7 +586,7 @@ where &mut self, msg: &ProposerGreeting, ) -> Result> { - /* Check protocol compatibility */ + // Check protocol compatibility if msg.protocol_version != SK_PROTOCOL_VERSION { bail!( "incompatible protocol version {}, expected {}", @@ -582,11 +594,11 @@ where SK_PROTOCOL_VERSION ); } - /* Postgres upgrade is not treated as fatal error */ + // Postgres upgrade is not treated as fatal error if msg.pg_version != self.state.server.pg_version && self.state.server.pg_version != UNKNOWN_SERVER_VERSION { - info!( + warn!( "incompatible server version {}, expected {}", msg.pg_version, self.state.server.pg_version ); @@ -605,17 +617,25 @@ where self.state.timeline_id ); } - - // set basic info about server, if not yet - // TODO: verify that is doesn't change after - { - let mut state = self.state.clone(); - state.server.system_id = msg.system_id; - state.server.wal_seg_size = msg.wal_seg_size; - self.state.persist(&state)?; + if self.state.server.wal_seg_size != msg.wal_seg_size { + bail!( + "invalid wal_seg_size, got {}, expected {}", + msg.wal_seg_size, + self.state.server.wal_seg_size + ); } - self.wal_store.init_storage(&self.state)?; + // system_id will be updated on mismatch + if self.state.server.system_id != msg.system_id { + warn!( + "unexpected system ID arrived, got {}, expected {}", + msg.system_id, self.state.server.system_id + ); + + let mut state = self.state.clone(); + state.server.system_id = msg.system_id; + self.state.persist(&state)?; + } info!( "processed greeting from proposer {:?}, sending term {:?}", @@ -665,16 +685,6 @@ where Ok(Some(AcceptorProposerMessage::VoteResponse(resp))) } - /// Bump our term if received a note from elected proposer with higher one - fn bump_if_higher(&mut self, term: Term) -> Result<()> { - if self.state.acceptor_state.term < term { - let mut state = self.state.clone(); - state.acceptor_state.term = term; - self.state.persist(&state)?; - } - Ok(()) - } - /// Form AppendResponse from current state. 
fn append_response(&self) -> AppendResponse { let ar = AppendResponse { @@ -691,7 +701,12 @@ where fn handle_elected(&mut self, msg: &ProposerElected) -> Result> { info!("received ProposerElected {:?}", msg); - self.bump_if_higher(msg.term)?; + if self.state.acceptor_state.term < msg.term { + let mut state = self.state.clone(); + state.acceptor_state.term = msg.term; + self.state.persist(&state)?; + } + // If our term is higher, ignore the message (next feedback will inform the compute) if self.state.acceptor_state.term > msg.term { return Ok(None); @@ -748,7 +763,7 @@ where } /// Advance commit_lsn taking into account what we have locally - pub fn update_commit_lsn(&mut self) -> Result<()> { + fn update_commit_lsn(&mut self) -> Result<()> { let commit_lsn = min(self.global_commit_lsn, self.flush_lsn()); assert!(commit_lsn >= self.inmem.commit_lsn); @@ -768,6 +783,11 @@ where Ok(()) } + /// Persist control file to disk, called only after timeline creation (bootstrap). + pub fn persist(&mut self) -> Result<()> { + self.persist_control_file(self.state.clone()) + } + /// Persist in-memory state to the disk, taking other data from state. fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> { state.commit_lsn = self.inmem.commit_lsn; @@ -918,6 +938,8 @@ where #[cfg(test)] mod tests { + use postgres_ffi::WAL_SEGMENT_SIZE; + use super::*; use crate::wal_storage::Storage; use std::ops::Deref; @@ -942,6 +964,14 @@ mod tests { } } + fn test_sk_state() -> SafeKeeperState { + let mut state = SafeKeeperState::empty(); + state.server.wal_seg_size = WAL_SEGMENT_SIZE as u32; + state.tenant_id = TenantId::from([1u8; 16]); + state.timeline_id = TimelineId::from([1u8; 16]); + state + } + struct DummyWalStore { lsn: Lsn, } @@ -951,10 +981,6 @@ mod tests { self.lsn } - fn init_storage(&mut self, _state: &SafeKeeperState) -> Result<()> { - Ok(()) - } - fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { self.lsn = startpos + buf.len() as u64; Ok(()) @@ -977,12 +1003,10 @@ mod tests { #[test] fn test_voting() { let storage = InMemoryState { - persisted_state: SafeKeeperState::empty(), + persisted_state: test_sk_state(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let timeline_id = TimelineId::from([0u8; 16]); - - let mut sk = SafeKeeper::new(timeline_id, storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -998,7 +1022,7 @@ mod tests { persisted_state: state, }; - sk = SafeKeeper::new(timeline_id, storage, sk.wal_store, NodeId(0)).unwrap(); + sk = SafeKeeper::new(storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -1011,12 +1035,11 @@ mod tests { #[test] fn test_epoch_switch() { let storage = InMemoryState { - persisted_state: SafeKeeperState::empty(), + persisted_state: test_sk_state(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let timeline_id = TimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(timeline_id, storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 375b6eea18..5a38558e9c 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -2,8 +2,9 @@ //! with the "START_REPLICATION" message. 
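An illustrative sketch, not part of the patch itself: update_commit_lsn above encodes two invariants, namely that the advertised commit_lsn can never exceed what this safekeeper has durably flushed, and that it can never move backwards. A minimal restatement of just that rule, using the same names as the fields and methods referenced above:

use std::cmp::min;
use utils::lsn::Lsn;

// commit_lsn advancement rule: clamp the globally known commit_lsn to the
// locally flushed position and require monotonicity, as asserted in the patch.
fn next_commit_lsn(global_commit_lsn: Lsn, flush_lsn: Lsn, inmem_commit_lsn: Lsn) -> Lsn {
    let candidate = min(global_commit_lsn, flush_lsn);
    assert!(candidate >= inmem_commit_lsn);
    candidate
}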
use crate::handler::SafekeeperPostgresHandler; -use crate::timeline::{ReplicaState, Timeline, TimelineTools}; +use crate::timeline::{ReplicaState, Timeline}; use crate::wal_storage::WalReader; +use crate::GlobalTimelines; use anyhow::{bail, Context, Result}; use bytes::Bytes; @@ -167,8 +168,10 @@ impl ReplicationConn { ) -> Result<()> { let _enter = info_span!("WAL sender", timeline = %spg.timeline_id.unwrap()).entered(); + let tli = GlobalTimelines::get(spg.ttid)?; + // spawn the background thread which receives HotStandbyFeedback messages. - let bg_timeline = Arc::clone(spg.timeline.get()); + let bg_timeline = Arc::clone(&tli); let bg_stream_in = self.stream_in.take().unwrap(); let bg_timeline_id = spg.timeline_id.unwrap(); @@ -201,11 +204,8 @@ impl ReplicationConn { .build()?; runtime.block_on(async move { - let (inmem_state, persisted_state) = spg.timeline.get().get_state(); + let (inmem_state, persisted_state) = tli.get_state(); // add persisted_state.timeline_start_lsn == Lsn(0) check - if persisted_state.server.wal_seg_size == 0 { - bail!("Cannot start replication before connecting to walproposer"); - } // Walproposer gets special handling: safekeeper must give proposer all // local WAL till the end, whether committed or not (walproposer will @@ -217,7 +217,7 @@ impl ReplicationConn { // on this safekeeper itself. That's ok as (old) proposer will never be // able to commit such WAL. let stop_pos: Option = if spg.is_walproposer_recovery() { - let wal_end = spg.timeline.get().get_end_of_wal(); + let wal_end = tli.get_flush_lsn(); Some(wal_end) } else { None @@ -231,7 +231,7 @@ impl ReplicationConn { let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn); let mut wal_reader = WalReader::new( - spg.conf.timeline_dir(&spg.timeline.get().zttid), + spg.conf.timeline_dir(&tli.ttid), &persisted_state, start_pos, spg.conf.wal_backup_enabled, @@ -241,7 +241,7 @@ impl ReplicationConn { let mut send_buf = vec![0u8; MAX_SEND_SIZE]; // watcher for commit_lsn updates - let mut commit_lsn_watch_rx = spg.timeline.get().get_commit_lsn_watch_rx(); + let mut commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx(); loop { if let Some(stop_pos) = stop_pos { @@ -258,7 +258,7 @@ impl ReplicationConn { } else { // TODO: also check once in a while whether we are walsender // to right pageserver. - if spg.timeline.get().stop_walsender(replica_id)? { + if tli.should_walsender_stop(replica_id) { // Shut down, timeline is suspended. // TODO create proper error type for this bail!("end streaming to {:?}", spg.appname); diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index cf317c41c3..4000815857 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -1,27 +1,25 @@ -//! This module contains timeline id -> safekeeper state map with file-backed -//! persistence and support for interaction between sending and receiving wal. +//! This module implements Timeline lifecycle management and has all neccessary code +//! to glue together SafeKeeper and all other background services. 
-use anyhow::{bail, Context, Result}; +use anyhow::{bail, Result}; use etcd_broker::subscription_value::SkTimelineInfo; -use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; -use serde::Serialize; use tokio::sync::watch; use std::cmp::{max, min}; -use std::collections::{HashMap, HashSet}; -use std::fs::{self}; -use std::sync::{Arc, Mutex, MutexGuard}; +use parking_lot::{Mutex, MutexGuard}; + +use std::path::PathBuf; use tokio::sync::mpsc::Sender; use tracing::*; use utils::{ - id::{NodeId, TenantId, TenantTimelineId}, + id::{NodeId, TenantTimelineId}, lsn::Lsn, pq_proto::ReplicationFeedback, }; @@ -29,7 +27,7 @@ use utils::{ use crate::control_file; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, - SafekeeperMemState, + SafekeeperMemState, ServerInfo, }; use crate::send_wal::HotStandbyFeedback; @@ -73,7 +71,7 @@ impl ReplicaState { } /// Shared state associated with database instance -struct SharedState { +pub struct SharedState { /// Safekeeper object sk: SafeKeeper, /// State of replicas @@ -95,17 +93,21 @@ struct SharedState { } impl SharedState { - /// Initialize timeline state, creating control file - fn create( + /// Initialize fresh timeline state without persisting anything to disk. + fn create_new( conf: &SafeKeeperConf, - zttid: &TenantTimelineId, - peer_ids: Vec, + ttid: &TenantTimelineId, + state: SafeKeeperState, ) -> Result { - let state = SafeKeeperState::new(zttid, peer_ids); - let control_store = control_file::FileStorage::create_new(zttid, conf, state)?; + if state.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(*ttid)); + } - let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); - let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?; + // We don't want to write anything to disk, because we may have existing timeline there. + // These functions should not change anything on disk. + let control_store = control_file::FileStorage::create_new(ttid, conf, state)?; + let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?; + let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; Ok(Self { sk, @@ -117,16 +119,17 @@ impl SharedState { }) } - /// Restore SharedState from control file. - /// If file doesn't exist, bails out. - fn restore(conf: &SafeKeeperConf, zttid: &TenantTimelineId) -> Result { - let control_store = control_file::FileStorage::restore_new(zttid, conf)?; - let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); + /// Restore SharedState from control file. If file doesn't exist, bails out. 
+ fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { + let control_store = control_file::FileStorage::restore_new(ttid, conf)?; + if control_store.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(*ttid)); + } - info!("timeline {} restored", zttid.timeline_id); + let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?; Ok(Self { - sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?, + sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, replicas: Vec::new(), wal_backup_active: false, active: false, @@ -134,6 +137,7 @@ impl SharedState { last_removed_segno: 0, }) } + fn is_active(&self) -> bool { self.is_wal_backup_required() // FIXME: add tracking of relevant pageservers and check them here individually, @@ -254,148 +258,289 @@ impl SharedState { } } -/// Database instance (tenant) +#[derive(Debug, thiserror::Error)] +pub enum TimelineError { + #[error("Timeline {0} was cancelled and cannot be used anymore")] + Cancelled(TenantTimelineId), + #[error("Timeline {0} was not found in global map")] + NotFound(TenantTimelineId), + #[error("Timeline {0} exists on disk, but wasn't loaded on startup")] + Invalid(TenantTimelineId), + #[error("Timeline {0} is already exists")] + AlreadyExists(TenantTimelineId), + #[error("Timeline {0} is not initialized, wal_seg_size is zero")] + UninitializedWalSegSize(TenantTimelineId), +} + +/// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline. +/// It also holds SharedState and provides mutually exclusive access to it. pub struct Timeline { - pub zttid: TenantTimelineId, + pub ttid: TenantTimelineId, + /// Sending here asks for wal backup launcher attention (start/stop - /// offloading). Sending zttid instead of concrete command allows to do + /// offloading). Sending ttid instead of concrete command allows to do /// sending without timeline lock. wal_backup_launcher_tx: Sender, + + /// Used to broadcast commit_lsn updates to all background jobs. commit_lsn_watch_tx: watch::Sender, - /// For breeding receivers. commit_lsn_watch_rx: watch::Receiver, + + /// Safekeeper and other state, that should remain consistent and synchronized + /// with the disk. mutex: Mutex, + + /// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal. + cancellation_tx: watch::Sender, + + /// Timeline should not be used after cancellation. Background tasks should + /// monitor this channel and stop eventually after receiving `true` from this channel. + cancellation_rx: watch::Receiver, + + /// Directory where timeline state is stored. + timeline_dir: PathBuf, } impl Timeline { - fn new( - zttid: TenantTimelineId, + /// Load existing timeline from disk. + pub fn load_timeline( + conf: SafeKeeperConf, + ttid: TenantTimelineId, wal_backup_launcher_tx: Sender, - shared_state: SharedState, - ) -> Timeline { + ) -> Result { + let shared_state = SharedState::restore(&conf, &ttid)?; let (commit_lsn_watch_tx, commit_lsn_watch_rx) = - watch::channel(shared_state.sk.inmem.commit_lsn); - Timeline { - zttid, + watch::channel(shared_state.sk.state.commit_lsn); + let (cancellation_tx, cancellation_rx) = watch::channel(false); + + Ok(Timeline { + ttid, wal_backup_launcher_tx, commit_lsn_watch_tx, commit_lsn_watch_rx, mutex: Mutex::new(shared_state), + cancellation_rx, + cancellation_tx, + timeline_dir: conf.timeline_dir(&ttid), + }) + } + + /// Create a new timeline, which is not yet persisted to disk. 
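An illustrative sketch, not part of the patch itself: because the new TimelineError derives thiserror::Error and is propagated through anyhow (see GlobalTimelines::get further down), a caller such as an HTTP handler can recover the typed variant to distinguish "not found" from "already exists" or "cancelled". The surrounding handler is hypothetical; only the TimelineError variants come from this patch.

use anyhow::Error;
use crate::timeline::TimelineError;

// Map a propagated timeline error back to a coarse category.
fn classify(err: &Error) -> &'static str {
    match err.downcast_ref::<TimelineError>() {
        Some(TimelineError::NotFound(_)) => "timeline does not exist",
        Some(TimelineError::AlreadyExists(_)) => "timeline already exists",
        Some(TimelineError::Cancelled(_)) => "timeline is being deleted",
        _ => "internal error",
    }
}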
+ pub fn create_empty( + conf: SafeKeeperConf, + ttid: TenantTimelineId, + wal_backup_launcher_tx: Sender, + server_info: ServerInfo, + ) -> Result { + let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID); + let (cancellation_tx, cancellation_rx) = watch::channel(false); + let state = SafeKeeperState::new(&ttid, server_info, vec![]); + + Ok(Timeline { + ttid, + wal_backup_launcher_tx, + commit_lsn_watch_tx, + commit_lsn_watch_rx, + mutex: Mutex::new(SharedState::create_new(&conf, &ttid, state)?), + cancellation_rx, + cancellation_tx, + timeline_dir: conf.timeline_dir(&ttid), + }) + } + + /// Initialize fresh timeline on disk and start background tasks. If bootstrap + /// fails, timeline is cancelled and cannot be used anymore. + /// + /// Bootstrap is transactional, so if it fails, created files will be deleted, + /// and state on disk should remain unchanged. + pub fn bootstrap(&self, shared_state: &mut MutexGuard) -> Result<()> { + match std::fs::metadata(&self.timeline_dir) { + Ok(_) => { + // Timeline directory exists on disk, we should leave state unchanged + // and return error. + bail!(TimelineError::Invalid(self.ttid)); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => { + return Err(e.into()); + } } + + // Create timeline directory. + std::fs::create_dir_all(&self.timeline_dir)?; + + // Write timeline to disk and TODO: start background tasks. + match || -> Result<()> { + shared_state.sk.persist()?; + // TODO: add more initialization steps here + Ok(()) + }() { + Ok(_) => Ok(()), + Err(e) => { + // Bootstrap failed, cancel timeline and remove timeline directory. + self.cancel(); + + if let Err(fs_err) = std::fs::remove_dir_all(&self.timeline_dir) { + warn!( + "failed to remove timeline {} directory after bootstrap failure: {}", + self.ttid, fs_err + ); + } + + Err(e) + } + } + } + + /// Delete timeline from disk completely, by removing timeline directory. Background + /// timeline activities will stop eventually. + pub fn delete_from_disk( + &self, + shared_state: &mut MutexGuard, + ) -> Result<(bool, bool)> { + let was_active = shared_state.active; + self.cancel(); + let dir_existed = delete_dir(&self.timeline_dir)?; + Ok((dir_existed, was_active)) + } + + /// Cancel timeline to prevent further usage. Background tasks will stop + /// eventually after receiving cancellation signal. + fn cancel(&self) { + info!("Timeline {} is cancelled", self.ttid); + let _ = self.cancellation_tx.send(true); + let res = self.wal_backup_launcher_tx.blocking_send(self.ttid); + if let Err(e) = res { + error!("Failed to send stop signal to wal_backup_launcher: {}", e); + } + } + + /// Returns if timeline is cancelled. + pub fn is_cancelled(&self) -> bool { + *self.cancellation_rx.borrow() + } + + /// Take a writing mutual exclusive lock on timeline shared_state. + pub fn write_shared_state(&self) -> MutexGuard { + self.mutex.lock() } /// Register compute connection, starting timeline-related activity if it is /// not running yet. - /// Can fail only if channel to a static thread got closed, which is not normal at all. 
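An illustrative sketch, not part of the patch itself: bootstrap above keeps timeline creation transactional by running every fallible step inside an immediately-invoked closure and removing the freshly created directory if any step fails. A condensed, std-only version of that idiom; the directory path and the steps are placeholders.

use anyhow::Result;

fn create_transactionally(dir: &std::path::Path) -> Result<()> {
    std::fs::create_dir_all(dir)?;
    match (|| -> Result<()> {
        // step 1: persist control file
        // step 2: start background tasks
        Ok(())
    })() {
        Ok(()) => Ok(()),
        Err(e) => {
            // best-effort cleanup; the caller still sees the original error
            let _ = std::fs::remove_dir_all(dir);
            Err(e)
        }
    }
}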
pub fn on_compute_connect(&self) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let is_wal_backup_action_pending: bool; { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); shared_state.num_computes += 1; - is_wal_backup_action_pending = shared_state.update_status(self.zttid); + is_wal_backup_action_pending = shared_state.update_status(self.ttid); } // Wake up wal backup launcher, if offloading not started yet. if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.blocking_send(self.zttid)?; + // Can fail only if channel to a static thread got closed, which is not normal at all. + self.wal_backup_launcher_tx.blocking_send(self.ttid)?; } Ok(()) } /// De-register compute connection, shutting down timeline activity if /// pageserver doesn't need catchup. - /// Can fail only if channel to a static thread got closed, which is not normal at all. pub fn on_compute_disconnect(&self) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let is_wal_backup_action_pending: bool; { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); shared_state.num_computes -= 1; - is_wal_backup_action_pending = shared_state.update_status(self.zttid); + is_wal_backup_action_pending = shared_state.update_status(self.ttid); } // Wake up wal backup launcher, if it is time to stop the offloading. if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.blocking_send(self.zttid)?; + // Can fail only if channel to a static thread got closed, which is not normal at all. + self.wal_backup_launcher_tx.blocking_send(self.ttid)?; } Ok(()) } - /// Whether we still need this walsender running? + /// Returns true if walsender should stop sending WAL to pageserver. /// TODO: check this pageserver is actually interested in this timeline. - pub fn stop_walsender(&self, replica_id: usize) -> Result { - let mut shared_state = self.mutex.lock().unwrap(); + pub fn should_walsender_stop(&self, replica_id: usize) -> bool { + if self.is_cancelled() { + return true; + } + + let mut shared_state = self.write_shared_state(); if shared_state.num_computes == 0 { let replica_state = shared_state.replicas[replica_id].unwrap(); let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet (replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); if stop { - shared_state.update_status(self.zttid); - return Ok(true); + shared_state.update_status(self.ttid); + return true; } } - Ok(false) + false } /// Returns whether s3 offloading is required and sets current status as /// matching it. pub fn wal_backup_attend(&self) -> bool { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.wal_backup_attend() - } - - // Can this safekeeper offload to s3? Recently joined safekeepers might not - // have necessary WAL. - pub fn can_wal_backup(&self) -> bool { - self.mutex.lock().unwrap().can_wal_backup() - } - - /// Deactivates the timeline, assuming it is being deleted. - /// Returns whether the timeline was already active. - /// - /// We assume all threads will stop by themselves eventually (possibly with errors, but no panics). - /// There should be no compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but - /// we're deleting the timeline anyway. 
- pub async fn deactivate_for_delete(&self) -> Result { - let was_active: bool; - { - let shared_state = self.mutex.lock().unwrap(); - was_active = shared_state.active; + if self.is_cancelled() { + return false; } - self.wal_backup_launcher_tx.send(self.zttid).await?; - Ok(was_active) + + self.write_shared_state().wal_backup_attend() } - fn is_active(&self) -> bool { - let shared_state = self.mutex.lock().unwrap(); - shared_state.active + /// Can this safekeeper offload to s3? Recently joined safekeepers might not + /// have necessary WAL. + pub fn can_wal_backup(&self) -> bool { + if self.is_cancelled() { + return false; + } + + let shared_state = self.write_shared_state(); + shared_state.can_wal_backup() } - /// Returns full timeline info, required for the metrics. - /// If the timeline is not active, returns None instead. + /// Returns full timeline info, required for the metrics. If the timeline is + /// not active, returns None instead. pub fn info_for_metrics(&self) -> Option { - let shared_state = self.mutex.lock().unwrap(); - if !shared_state.active { + if self.is_cancelled() { return None; } - Some(FullTimelineInfo { - zttid: self.zttid, - replicas: shared_state - .replicas - .iter() - .filter_map(|r| r.as_ref()) - .copied() - .collect(), - wal_backup_active: shared_state.wal_backup_active, - timeline_is_active: shared_state.active, - num_computes: shared_state.num_computes, - last_removed_segno: shared_state.last_removed_segno, - epoch_start_lsn: shared_state.sk.epoch_start_lsn, - mem_state: shared_state.sk.inmem.clone(), - persisted_state: shared_state.sk.state.clone(), - flush_lsn: shared_state.sk.wal_store.flush_lsn(), - }) + let state = self.write_shared_state(); + if state.active { + Some(FullTimelineInfo { + ttid: self.ttid, + replicas: state + .replicas + .iter() + .filter_map(|r| r.as_ref()) + .copied() + .collect(), + wal_backup_active: state.wal_backup_active, + timeline_is_active: state.active, + num_computes: state.num_computes, + last_removed_segno: state.last_removed_segno, + epoch_start_lsn: state.sk.epoch_start_lsn, + mem_state: state.sk.inmem.clone(), + persisted_state: state.sk.state.clone(), + flush_lsn: state.sk.wal_store.flush_lsn(), + }) + } else { + None + } } + /// Returns commit_lsn watch channel. pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { self.commit_lsn_watch_rx.clone() } @@ -405,10 +550,14 @@ impl Timeline { &self, msg: &ProposerAcceptorMessage, ) -> Result> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let mut rmsg: Option; let commit_lsn: Lsn; { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); rmsg = shared_state.sk.process_msg(msg)?; // if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn @@ -426,28 +575,46 @@ impl Timeline { Ok(rmsg) } + /// Returns wal_seg_size. pub fn get_wal_seg_size(&self) -> usize { - self.mutex.lock().unwrap().get_wal_seg_size() + self.write_shared_state().get_wal_seg_size() } + /// Returns true only if the timeline is loaded and active. + pub fn is_active(&self) -> bool { + if self.is_cancelled() { + return false; + } + + self.write_shared_state().active + } + + /// Returns state of the timeline. 
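An illustrative sketch, not part of the patch itself: process_msg above broadcasts the new commit_lsn over the watch channel after dropping the shared-state lock, and get_commit_lsn_watch_rx hands out receivers for it. A hypothetical background consumer of that channel, using only the Timeline APIs introduced in this patch; a real task would also select on a shutdown signal instead of checking cancellation between wake-ups.

use std::sync::Arc;
use utils::lsn::Lsn;
use crate::timeline::Timeline;

async fn follow_commit_lsn(tli: Arc<Timeline>) -> anyhow::Result<()> {
    let mut rx = tli.get_commit_lsn_watch_rx();
    loop {
        if tli.is_cancelled() {
            // timeline is being deleted, stop quietly
            return Ok(());
        }
        // wakes up whenever process_msg / record_safekeeper_info broadcasts
        rx.changed().await?;
        let commit_lsn: Lsn = *rx.borrow();
        tracing::info!("commit_lsn advanced to {}", commit_lsn);
    }
}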
pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { - let shared_state = self.mutex.lock().unwrap(); - (shared_state.sk.inmem.clone(), shared_state.sk.state.clone()) + let state = self.write_shared_state(); + (state.sk.inmem.clone(), state.sk.state.clone()) } + /// Returns latest backup_lsn. pub fn get_wal_backup_lsn(&self) -> Lsn { - self.mutex.lock().unwrap().sk.inmem.backup_lsn + self.write_shared_state().sk.inmem.backup_lsn } - pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) { - self.mutex.lock().unwrap().sk.inmem.backup_lsn = backup_lsn; + /// Sets backup_lsn to the given value. + pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + self.write_shared_state().sk.inmem.backup_lsn = backup_lsn; // we should check whether to shut down offloader, but this will be done // soon by peer communication anyway. + Ok(()) } - /// Prepare public safekeeper info for reporting. + /// Return public safekeeper info for broadcasting to broker and other peers. pub fn get_public_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { - let shared_state = self.mutex.lock().unwrap(); + let shared_state = self.write_shared_state(); SkTimelineInfo { last_log_term: Some(shared_state.sk.get_epoch()), flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()), @@ -473,54 +640,53 @@ impl Timeline { let is_wal_backup_action_pending: bool; let commit_lsn: Lsn; { - let mut shared_state = self.mutex.lock().unwrap(); - // WAL seg size not initialized yet (no message from compute ever - // received), can't do much without it. - if shared_state.get_wal_seg_size() == 0 { - return Ok(()); - } + let mut shared_state = self.write_shared_state(); shared_state.sk.record_safekeeper_info(sk_info)?; - is_wal_backup_action_pending = shared_state.update_status(self.zttid); + is_wal_backup_action_pending = shared_state.update_status(self.ttid); commit_lsn = shared_state.sk.inmem.commit_lsn; } self.commit_lsn_watch_tx.send(commit_lsn)?; // Wake up wal backup launcher, if it is time to stop the offloading. if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.send(self.zttid).await?; + self.wal_backup_launcher_tx.send(self.ttid).await?; } Ok(()) } + /// Add send_wal replica to the in-memory vector of replicas. pub fn add_replica(&self, state: ReplicaState) -> usize { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.add_replica(state) + self.write_shared_state().add_replica(state) } + /// Update replication replica state. pub fn update_replica_state(&self, id: usize, state: ReplicaState) { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); shared_state.replicas[id] = Some(state); } + /// Remove send_wal replica from the in-memory vector of replicas. pub fn remove_replica(&self, id: usize) { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); assert!(shared_state.replicas[id].is_some()); shared_state.replicas[id] = None; } - pub fn get_end_of_wal(&self) -> Lsn { - let shared_state = self.mutex.lock().unwrap(); - shared_state.sk.wal_store.flush_lsn() + /// Returns flush_lsn. + pub fn get_flush_lsn(&self) -> Lsn { + self.write_shared_state().sk.wal_store.flush_lsn() } + /// Delete WAL segments from disk that are no longer needed. This is determined + /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. 
pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let horizon_segno: XLogSegNo; let remover: Box Result<(), anyhow::Error>>; { - let shared_state = self.mutex.lock().unwrap(); - // WAL seg size not initialized yet, no WAL exists. - if shared_state.get_wal_seg_size() == 0 { - return Ok(()); - } + let shared_state = self.write_shared_state(); horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); remover = shared_state.sk.wal_store.remove_up_to(); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { @@ -528,243 +694,22 @@ impl Timeline { } // release the lock before removing } - let _enter = - info_span!("", tenant = %self.zttid.tenant_id, timeline = %self.zttid.timeline_id) - .entered(); + + // delete old WAL files remover(horizon_segno - 1)?; - self.mutex.lock().unwrap().last_removed_segno = horizon_segno; + + // update last_removed_segno + let mut shared_state = self.write_shared_state(); + shared_state.last_removed_segno = horizon_segno; Ok(()) } } -// Utilities needed by various Connection-like objects -pub trait TimelineTools { - fn set(&mut self, conf: &SafeKeeperConf, zttid: TenantTimelineId, create: bool) -> Result<()>; - - fn get(&self) -> &Arc; -} - -impl TimelineTools for Option> { - fn set(&mut self, conf: &SafeKeeperConf, zttid: TenantTimelineId, create: bool) -> Result<()> { - *self = Some(GlobalTimelines::get(conf, zttid, create)?); - Ok(()) - } - - fn get(&self) -> &Arc { - self.as_ref().unwrap() - } -} - -struct GlobalTimelinesState { - timelines: HashMap>, - wal_backup_launcher_tx: Option>, -} - -static TIMELINES_STATE: Lazy> = Lazy::new(|| { - Mutex::new(GlobalTimelinesState { - timelines: HashMap::new(), - wal_backup_launcher_tx: None, - }) -}); - -#[derive(Clone, Copy, Serialize)] -pub struct TimelineDeleteForceResult { - pub dir_existed: bool, - pub was_active: bool, -} - -/// A zero-sized struct used to manage access to the global timelines map. -pub struct GlobalTimelines; - -impl GlobalTimelines { - pub fn init(wal_backup_launcher_tx: Sender) { - let mut state = TIMELINES_STATE.lock().unwrap(); - assert!(state.wal_backup_launcher_tx.is_none()); - state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); - } - - fn create_internal( - mut state: MutexGuard, - conf: &SafeKeeperConf, - zttid: TenantTimelineId, - peer_ids: Vec, - ) -> Result> { - match state.timelines.get(&zttid) { - Some(_) => bail!("timeline {} already exists", zttid), - None => { - // TODO: check directory existence - let dir = conf.timeline_dir(&zttid); - fs::create_dir_all(dir)?; - - let shared_state = SharedState::create(conf, &zttid, peer_ids) - .context("failed to create shared state")?; - - let new_tli = Arc::new(Timeline::new( - zttid, - state.wal_backup_launcher_tx.as_ref().unwrap().clone(), - shared_state, - )); - state.timelines.insert(zttid, Arc::clone(&new_tli)); - Ok(new_tli) - } - } - } - - pub fn create( - conf: &SafeKeeperConf, - zttid: TenantTimelineId, - peer_ids: Vec, - ) -> Result> { - let state = TIMELINES_STATE.lock().unwrap(); - GlobalTimelines::create_internal(state, conf, zttid, peer_ids) - } - - /// Get a timeline with control file loaded from the global TIMELINES_STATE.timelines map. - /// If control file doesn't exist and create=false, bails out. 
- pub fn get( - conf: &SafeKeeperConf, - zttid: TenantTimelineId, - create: bool, - ) -> Result> { - let _enter = info_span!("", timeline = %zttid.timeline_id).entered(); - - let mut state = TIMELINES_STATE.lock().unwrap(); - - match state.timelines.get(&zttid) { - Some(result) => Ok(Arc::clone(result)), - None => { - let shared_state = SharedState::restore(conf, &zttid); - - let shared_state = match shared_state { - Ok(shared_state) => shared_state, - Err(error) => { - // TODO: always create timeline explicitly - if error - .root_cause() - .to_string() - .contains("No such file or directory") - && create - { - return GlobalTimelines::create_internal(state, conf, zttid, vec![]); - } else { - return Err(error); - } - } - }; - - let new_tli = Arc::new(Timeline::new( - zttid, - state.wal_backup_launcher_tx.as_ref().unwrap().clone(), - shared_state, - )); - state.timelines.insert(zttid, Arc::clone(&new_tli)); - Ok(new_tli) - } - } - } - - /// Get loaded timeline, if it exists. - pub fn get_loaded(zttid: TenantTimelineId) -> Option> { - let state = TIMELINES_STATE.lock().unwrap(); - state.timelines.get(&zttid).map(Arc::clone) - } - - pub fn get_active_timelines() -> HashSet { - let state = TIMELINES_STATE.lock().unwrap(); - state - .timelines - .iter() - .filter(|&(_, tli)| tli.is_active()) - .map(|(zttid, _)| *zttid) - .collect() - } - - /// Return FullTimelineInfo for all active timelines. - pub fn active_timelines_metrics() -> Vec { - let state = TIMELINES_STATE.lock().unwrap(); - state - .timelines - .iter() - .filter_map(|(_, tli)| tli.info_for_metrics()) - .collect() - } - - fn delete_force_internal( - conf: &SafeKeeperConf, - zttid: &TenantTimelineId, - was_active: bool, - ) -> Result { - match std::fs::remove_dir_all(conf.timeline_dir(zttid)) { - Ok(_) => Ok(TimelineDeleteForceResult { - dir_existed: true, - was_active, - }), - Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(TimelineDeleteForceResult { - dir_existed: false, - was_active, - }), - Err(e) => Err(e.into()), - } - } - - /// Deactivates and deletes the timeline, see `Timeline::deactivate_for_delete()`, the deletes - /// the corresponding data directory. - /// We assume all timeline threads do not care about `GlobalTimelines` not containing the timeline - /// anymore, and they will eventually terminate without panics. - /// - /// There are multiple ways the timeline may be accidentally "re-created" (so we end up with two - /// `Timeline` objects in memory): - /// a) a compute node connects after this method is called, or - /// b) an HTTP GET request about the timeline is made and it's able to restore the current state, or - /// c) an HTTP POST request for timeline creation is made after the timeline is already deleted. - /// TODO: ensure all of the above never happens. - pub async fn delete_force( - conf: &SafeKeeperConf, - zttid: &TenantTimelineId, - ) -> Result { - info!("deleting timeline {}", zttid); - let timeline = TIMELINES_STATE.lock().unwrap().timelines.remove(zttid); - let mut was_active = false; - if let Some(tli) = timeline { - was_active = tli.deactivate_for_delete().await?; - } - GlobalTimelines::delete_force_internal(conf, zttid, was_active) - } - - /// Deactivates and deletes all timelines for the tenant, see `delete()`. - /// Returns map of all timelines which the tenant had, `true` if a timeline was active. - /// There may be a race if new timelines are created simultaneously. 
- pub async fn delete_force_all_for_tenant( - conf: &SafeKeeperConf, - tenant_id: &TenantId, - ) -> Result> { - info!("deleting all timelines for tenant {}", tenant_id); - let mut to_delete = HashMap::new(); - { - // Keep mutex in this scope. - let timelines = &mut TIMELINES_STATE.lock().unwrap().timelines; - for (&zttid, tli) in timelines.iter() { - if zttid.tenant_id == *tenant_id { - to_delete.insert(zttid, tli.clone()); - } - } - // TODO: test that the correct subset of timelines is removed. It's complicated because they are implicitly created currently. - timelines.retain(|zttid, _| !to_delete.contains_key(zttid)); - } - let mut deleted = HashMap::new(); - for (zttid, timeline) in to_delete { - let was_active = timeline.deactivate_for_delete().await?; - deleted.insert( - zttid, - GlobalTimelines::delete_force_internal(conf, &zttid, was_active)?, - ); - } - // There may be inactive timelines, so delete the whole tenant dir as well. - match std::fs::remove_dir_all(conf.tenant_dir(tenant_id)) { - Ok(_) => (), - Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), - e => e?, - }; - Ok(deleted) +/// Deletes directory and it's contents. Returns false if directory does not exist. +fn delete_dir(path: &PathBuf) -> Result { + match std::fs::remove_dir_all(path) { + Ok(_) => Ok(true), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), + Err(e) => Err(e.into()), } } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs new file mode 100644 index 0000000000..cf99a243d7 --- /dev/null +++ b/safekeeper/src/timelines_global_map.rs @@ -0,0 +1,348 @@ +//! This module contains global (tenant_id, timeline_id) -> Arc mapping. +//! All timelines should always be present in this map, this is done by loading them +//! all from the disk on startup and keeping them in memory. + +use crate::safekeeper::ServerInfo; +use crate::timeline::{Timeline, TimelineError}; +use crate::SafeKeeperConf; +use anyhow::{anyhow, bail, Context, Result}; +use once_cell::sync::Lazy; +use serde::Serialize; +use std::collections::HashMap; +use std::path::PathBuf; +use std::str::FromStr; +use std::sync::{Arc, Mutex, MutexGuard}; +use tokio::sync::mpsc::Sender; +use tracing::*; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; + +struct GlobalTimelinesState { + timelines: HashMap>, + wal_backup_launcher_tx: Option>, + conf: SafeKeeperConf, +} + +impl GlobalTimelinesState { + /// Get dependencies for a timeline constructor. + fn get_dependencies(&self) -> (SafeKeeperConf, Sender) { + ( + self.conf.clone(), + self.wal_backup_launcher_tx.as_ref().unwrap().clone(), + ) + } + + /// Insert timeline into the map. Returns error if timeline with the same id already exists. + fn try_insert(&mut self, timeline: Arc) -> Result<()> { + let ttid = timeline.ttid; + if self.timelines.contains_key(&ttid) { + bail!(TimelineError::AlreadyExists(ttid)); + } + self.timelines.insert(ttid, timeline); + Ok(()) + } + + /// Get timeline from the map. Returns error if timeline doesn't exist. + fn get(&self, ttid: &TenantTimelineId) -> Result> { + self.timelines + .get(ttid) + .cloned() + .ok_or_else(|| anyhow!(TimelineError::NotFound(*ttid))) + } +} + +static TIMELINES_STATE: Lazy> = Lazy::new(|| { + Mutex::new(GlobalTimelinesState { + timelines: HashMap::new(), + wal_backup_launcher_tx: None, + conf: SafeKeeperConf::default(), + }) +}); + +/// A zero-sized struct used to manage access to the global timelines map. 
+pub struct GlobalTimelines; + +impl GlobalTimelines { + /// Inject dependencies needed for the timeline constructors and load all timelines to memory. + pub fn init( + conf: SafeKeeperConf, + wal_backup_launcher_tx: Sender, + ) -> Result<()> { + let mut state = TIMELINES_STATE.lock().unwrap(); + assert!(state.wal_backup_launcher_tx.is_none()); + state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); + state.conf = conf; + + // Iterate through all directories and load tenants for all directories + // named as a valid tenant_id. + let mut tenant_count = 0; + let tenants_dir = state.conf.workdir.clone(); + for tenants_dir_entry in std::fs::read_dir(&tenants_dir) + .with_context(|| format!("failed to list tenants dir {}", tenants_dir.display()))? + { + match &tenants_dir_entry { + Ok(tenants_dir_entry) => { + if let Ok(tenant_id) = + TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or("")) + { + tenant_count += 1; + GlobalTimelines::load_tenant_timelines(&mut state, tenant_id)?; + } + } + Err(e) => error!( + "failed to list tenants dir entry {:?} in directory {}, reason: {:?}", + tenants_dir_entry, + tenants_dir.display(), + e + ), + } + } + + info!( + "found {} tenants directories, successfully loaded {} timelines", + tenant_count, + state.timelines.len() + ); + Ok(()) + } + + /// Loads all timelines for the given tenant to memory. Returns fs::read_dir errors if any. + fn load_tenant_timelines( + state: &mut MutexGuard, + tenant_id: TenantId, + ) -> Result<()> { + let timelines_dir = state.conf.tenant_dir(&tenant_id); + for timelines_dir_entry in std::fs::read_dir(&timelines_dir) + .with_context(|| format!("failed to list timelines dir {}", timelines_dir.display()))? + { + match &timelines_dir_entry { + Ok(timeline_dir_entry) => { + if let Ok(timeline_id) = + TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or("")) + { + let ttid = TenantTimelineId::new(tenant_id, timeline_id); + match Timeline::load_timeline( + state.conf.clone(), + ttid, + state.wal_backup_launcher_tx.as_ref().unwrap().clone(), + ) { + Ok(timeline) => { + state.timelines.insert(ttid, Arc::new(timeline)); + } + // If we can't load a timeline, it's most likely because of a corrupted + // directory. We will log an error and won't allow to delete/recreate + // this timeline. The only way to fix this timeline is to repair manually + // and restart the safekeeper. + Err(e) => error!( + "failed to load timeline {} for tenant {}, reason: {:?}", + timeline_id, tenant_id, e + ), + } + } + } + Err(e) => error!( + "failed to list timelines dir entry {:?} in directory {}, reason: {:?}", + timelines_dir_entry, + timelines_dir.display(), + e + ), + } + } + + Ok(()) + } + + /// Create a new timeline with the given id. If the timeline already exists, returns + /// an existing timeline. + pub fn create(ttid: TenantTimelineId, server_info: ServerInfo) -> Result> { + let (conf, wal_backup_launcher_tx) = { + let state = TIMELINES_STATE.lock().unwrap(); + if let Ok(timeline) = state.get(&ttid) { + // Timeline already exists, return it. + return Ok(timeline); + } + state.get_dependencies() + }; + + info!("creating new timeline {}", ttid); + + let timeline = Arc::new(Timeline::create_empty( + conf, + ttid, + wal_backup_launcher_tx, + server_info, + )?); + + // Take a lock and finish the initialization holding this mutex. No other threads + // can interfere with creation after we will insert timeline into the map. 
+ let mut shared_state = timeline.write_shared_state(); + + // We can get a race condition here in case of concurrent create calls, but only + // in theory. create() will return valid timeline on the next try. + TIMELINES_STATE + .lock() + .unwrap() + .try_insert(timeline.clone())?; + + // Write the new timeline to the disk and start background workers. + // Bootstrap is transactional, so if it fails, the timeline will be deleted, + // and the state on disk should remain unchanged. + match timeline.bootstrap(&mut shared_state) { + Ok(_) => { + // We are done with bootstrap, release the lock, return the timeline. + drop(shared_state); + Ok(timeline) + } + Err(e) => { + // Note: the most likely reason for bootstrap failure is that the timeline + // directory already exists on disk. This happens when timeline is corrupted + // and wasn't loaded from disk on startup because of that. We want to preserve + // the timeline directory in this case, for further inspection. + + // TODO: this is an unusual error, perhaps we should send it to sentry + // TODO: compute will try to create timeline every second, we should add backoff + error!("failed to bootstrap timeline {}: {}", ttid, e); + + // Timeline failed to bootstrap, it cannot be used. Remove it from the map. + TIMELINES_STATE.lock().unwrap().timelines.remove(&ttid); + Err(e) + } + } + } + + /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, + /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid, + /// i.e. loaded in memory and not cancelled. + pub fn get(ttid: TenantTimelineId) -> Result> { + let res = TIMELINES_STATE.lock().unwrap().get(&ttid); + + match res { + Ok(tli) => { + if tli.is_cancelled() { + anyhow::bail!(TimelineError::Cancelled(ttid)); + } + Ok(tli) + } + Err(e) => Err(e), + } + } + + /// Returns all timelines. This is used for background timeline proccesses. + pub fn get_all() -> Vec> { + let global_lock = TIMELINES_STATE.lock().unwrap(); + global_lock + .timelines + .values() + .cloned() + .filter(|t| !t.is_cancelled()) + .collect() + } + + /// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant, + /// and that's why it can return cancelled timelines, to retry deleting them. + fn get_all_for_tenant(tenant_id: TenantId) -> Vec> { + let global_lock = TIMELINES_STATE.lock().unwrap(); + global_lock + .timelines + .values() + .filter(|t| t.ttid.tenant_id == tenant_id) + .cloned() + .collect() + } + + /// Cancels timeline, then deletes the corresponding data directory. + pub fn delete_force(ttid: &TenantTimelineId) -> Result { + let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); + match tli_res { + Ok(timeline) => { + // Take a lock and finish the deletion holding this mutex. + let mut shared_state = timeline.write_shared_state(); + + info!("deleting timeline {}", ttid); + let (dir_existed, was_active) = timeline.delete_from_disk(&mut shared_state)?; + + // Remove timeline from the map. + TIMELINES_STATE.lock().unwrap().timelines.remove(ttid); + + Ok(TimelineDeleteForceResult { + dir_existed, + was_active, + }) + } + Err(_) => { + // Timeline is not memory, but it may still exist on disk in broken state. + let dir_path = TIMELINES_STATE.lock().unwrap().conf.timeline_dir(ttid); + let dir_existed = delete_dir(dir_path)?; + + Ok(TimelineDeleteForceResult { + dir_existed, + was_active: false, + }) + } + } + } + + /// Deactivates and deletes all timelines for the tenant. 
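An illustrative sketch, not part of the patch itself: the create/get pair above gives callers two distinct entry points, mirroring the receive_wal change earlier in this patch where the walproposer greeting creates the timeline on demand while WAL senders only look it up. Only the GlobalTimelines::create and GlobalTimelines::get signatures from this patch are assumed.

use std::sync::Arc;
use utils::id::TenantTimelineId;
use crate::safekeeper::ServerInfo;
use crate::timeline::Timeline;
use crate::GlobalTimelines;

fn timeline_for_walproposer(
    ttid: TenantTimelineId,
    server_info: ServerInfo,
) -> anyhow::Result<Arc<Timeline>> {
    // Idempotent: returns the already-loaded timeline if it exists,
    // otherwise bootstraps a new one from the greeting's ServerInfo.
    GlobalTimelines::create(ttid, server_info)
}

fn timeline_for_walsender(ttid: TenantTimelineId) -> anyhow::Result<Arc<Timeline>> {
    // WAL senders must not create timelines implicitly.
    GlobalTimelines::get(ttid)
}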
Returns map of all timelines which + /// the tenant had, `true` if a timeline was active. There may be a race if new timelines are + /// created simultaneously. In that case the function will return error and the caller should + /// retry tenant deletion again later. + pub fn delete_force_all_for_tenant( + tenant_id: &TenantId, + ) -> Result> { + info!("deleting all timelines for tenant {}", tenant_id); + let to_delete = Self::get_all_for_tenant(*tenant_id); + + let mut err = None; + + let mut deleted = HashMap::new(); + for tli in &to_delete { + match Self::delete_force(&tli.ttid) { + Ok(result) => { + deleted.insert(tli.ttid, result); + } + Err(e) => { + error!("failed to delete timeline {}: {}", tli.ttid, e); + // Save error to return later. + err = Some(e); + } + } + } + + // If there was an error, return it. + if let Some(e) = err { + return Err(e); + } + + // There may be broken timelines on disk, so delete the whole tenant dir as well. + // Note that we could concurrently create new timelines while we were deleting them, + // so the directory may be not empty. In this case timelines will have bad state + // and timeline background jobs can panic. + delete_dir(TIMELINES_STATE.lock().unwrap().conf.tenant_dir(tenant_id))?; + + let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); + if !tlis_after_delete.is_empty() { + // Some timelines were created while we were deleting them, returning error + // to the caller, so it can retry later. + bail!( + "failed to delete all timelines for tenant {}: some timelines were created while we were deleting them", + tenant_id + ); + } + + Ok(deleted) + } +} + +#[derive(Clone, Copy, Serialize)] +pub struct TimelineDeleteForceResult { + pub dir_existed: bool, + pub was_active: bool, +} + +/// Deletes directory and it's contents. Returns false if directory does not exist. +fn delete_dir(path: PathBuf) -> Result { + match std::fs::remove_dir_all(path) { + Ok(_) => Ok(true), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), + Err(e) => Err(e.into()), + } +} diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 85e967e218..0d5321fb3a 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -26,8 +26,8 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::broker::{Election, ElectionLeader}; -use crate::timeline::{GlobalTimelines, Timeline}; -use crate::{broker, SafeKeeperConf}; +use crate::timeline::Timeline; +use crate::{broker, GlobalTimelines, SafeKeeperConf}; use once_cell::sync::OnceCell; @@ -53,8 +53,10 @@ pub fn wal_backup_launcher_thread_main( /// Check whether wal backup is required for timeline. If yes, mark that launcher is /// aware of current status and return the timeline. -fn is_wal_backup_required(zttid: TenantTimelineId) -> Option> { - GlobalTimelines::get_loaded(zttid).filter(|t| t.wal_backup_attend()) +fn is_wal_backup_required(ttid: TenantTimelineId) -> Option> { + GlobalTimelines::get(ttid) + .ok() + .filter(|tli| tli.wal_backup_attend()) } struct WalBackupTaskHandle { @@ -70,20 +72,20 @@ struct WalBackupTimelineEntry { /// Start per timeline task, if it makes sense for this safekeeper to offload. 
fn consider_start_task( conf: &SafeKeeperConf, - zttid: TenantTimelineId, + ttid: TenantTimelineId, task: &mut WalBackupTimelineEntry, ) { if !task.timeline.can_wal_backup() { return; } - info!("starting WAL backup task for {}", zttid); + info!("starting WAL backup task for {}", ttid); // TODO: decide who should offload right here by simply checking current // state instead of running elections in offloading task. let election_name = SubscriptionKey { cluster_prefix: conf.broker_etcd_prefix.clone(), kind: SubscriptionKind::Operation( - zttid, + ttid, NodeKind::Safekeeper, OperationKind::Safekeeper(SkOperationKind::WalBackup), ), @@ -97,11 +99,11 @@ fn consider_start_task( ); let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let timeline_dir = conf.timeline_dir(&zttid); + let timeline_dir = conf.timeline_dir(&ttid); let handle = tokio::spawn( - backup_task_main(zttid, timeline_dir, shutdown_rx, election) - .instrument(info_span!("WAL backup task", zttid = %zttid)), + backup_task_main(ttid, timeline_dir, shutdown_rx, election) + .instrument(info_span!("WAL backup task", ttid = %ttid)), ); task.handle = Some(WalBackupTaskHandle { @@ -140,33 +142,33 @@ async fn wal_backup_launcher_main_loop( let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); loop { tokio::select! { - zttid = wal_backup_launcher_rx.recv() => { + ttid = wal_backup_launcher_rx.recv() => { // channel is never expected to get closed - let zttid = zttid.unwrap(); + let ttid = ttid.unwrap(); if conf.remote_storage.is_none() || !conf.wal_backup_enabled { continue; /* just drain the channel and do nothing */ } - let timeline = is_wal_backup_required(zttid); + let timeline = is_wal_backup_required(ttid); // do we need to do anything at all? - if timeline.is_some() != tasks.contains_key(&zttid) { + if timeline.is_some() != tasks.contains_key(&ttid) { if let Some(timeline) = timeline { // need to start the task - let entry = tasks.entry(zttid).or_insert(WalBackupTimelineEntry { + let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry { timeline, handle: None, }); - consider_start_task(&conf, zttid, entry); + consider_start_task(&conf, ttid, entry); } else { // need to stop the task - info!("stopping WAL backup task for {}", zttid); + info!("stopping WAL backup task for {}", ttid); - let entry = tasks.remove(&zttid).unwrap(); + let entry = tasks.remove(&ttid).unwrap(); if let Some(wb_handle) = entry.handle { // Tell the task to shutdown. Error means task exited earlier, that's ok. let _ = wb_handle.shutdown_tx.send(()).await; // Await the task itself. TODO: restart panicked tasks earlier. if let Err(e) = wb_handle.handle.await { - warn!("WAL backup task for {} panicked: {}", zttid, e); + warn!("WAL backup task for {} panicked: {}", ttid, e); } } } @@ -174,8 +176,8 @@ async fn wal_backup_launcher_main_loop( } // Start known tasks, if needed and possible. _ = ticker.tick() => { - for (zttid, entry) in tasks.iter_mut().filter(|(_, entry)| entry.handle.is_none()) { - consider_start_task(&conf, *zttid, entry); + for (ttid, entry) in tasks.iter_mut().filter(|(_, entry)| entry.handle.is_none()) { + consider_start_task(&conf, *ttid, entry); } } } @@ -191,26 +193,26 @@ struct WalBackupTask { election: Election, } -/// Offload single timeline. +/// Offload single timeline. Called only after we checked that backup +/// is required (wal_backup_attend) and possible (can_wal_backup). 
async fn backup_task_main( - zttid: TenantTimelineId, + ttid: TenantTimelineId, timeline_dir: PathBuf, mut shutdown_rx: Receiver<()>, election: Election, ) { info!("started"); - let timeline: Arc = if let Some(tli) = GlobalTimelines::get_loaded(zttid) { - tli - } else { - /* Timeline could get deleted while task was starting, just exit then. */ - info!("no timeline, exiting"); + let res = GlobalTimelines::get(ttid); + if let Err(e) = res { + error!("backup error for timeline {}: {}", ttid, e); return; - }; + } + let tli = res.unwrap(); let mut wb = WalBackupTask { - wal_seg_size: timeline.get_wal_seg_size(), - commit_lsn_watch_rx: timeline.get_commit_lsn_watch_rx(), - timeline, + wal_seg_size: tli.get_wal_seg_size(), + commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), + timeline: tli, timeline_dir, leader: None, election, @@ -322,7 +324,11 @@ impl WalBackupTask { { Ok(backup_lsn_result) => { backup_lsn = backup_lsn_result; - self.timeline.set_wal_backup_lsn(backup_lsn_result); + let res = self.timeline.set_wal_backup_lsn(backup_lsn_result); + if let Err(e) = res { + error!("backup error: {}", e); + return; + } retry_attempt = 0; } Err(e) => { diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 58b69f06e7..ea613dd0f1 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -7,7 +7,7 @@ //! //! Note that last file has `.partial` suffix, that's different from postgres. -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{bail, Context, Result}; use std::io::{self, Seek, SeekFrom}; use std::pin::Pin; use tokio::io::AsyncRead; @@ -17,7 +17,7 @@ use postgres_ffi::v14::xlog_utils::{ find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, }; use postgres_ffi::{XLogSegNo, PG_TLI}; -use std::cmp::min; +use std::cmp::{max, min}; use std::fs::{self, remove_file, File, OpenOptions}; use std::io::Write; @@ -86,9 +86,9 @@ struct WalStorageMetrics { } impl WalStorageMetrics { - fn new(zttid: &TenantTimelineId) -> Self { - let tenant_id = zttid.tenant_id.to_string(); - let timeline_id = zttid.timeline_id.to_string(); + fn new(ttid: &TenantTimelineId) -> Self { + let tenant_id = ttid.tenant_id.to_string(); + let timeline_id = ttid.timeline_id.to_string(); Self { write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&tenant_id, &timeline_id]), write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), @@ -101,9 +101,6 @@ pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; - /// Init storage with wal_seg_size and read WAL from disk to get latest LSN. - fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()>; - /// Write piece of WAL from buf to disk, but not necessarily sync it. fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; @@ -119,7 +116,7 @@ pub trait Storage { } /// PhysicalStorage is a storage that stores WAL on disk. Writes are separated from flushes -/// for better performance. Storage must be initialized before use. +/// for better performance. Storage is initialized in the constructor. /// /// WAL is stored in segments, each segment is a file. Last segment has ".partial" suffix in /// its filename and may be not fully flushed. @@ -127,16 +124,14 @@ pub trait Storage { /// Relationship of LSNs: /// `write_lsn` >= `write_record_lsn` >= `flush_record_lsn` /// -/// When storage is just created, all LSNs are zeroes and there are no segments on disk. 
+/// When storage is created first time, all LSNs are zeroes and there are no segments on disk. pub struct PhysicalStorage { metrics: WalStorageMetrics, - zttid: TenantTimelineId, timeline_dir: PathBuf, conf: SafeKeeperConf, - // fields below are filled upon initialization - /// None if uninitialized, Some(usize) if storage is initialized. - wal_seg_size: Option, + /// Size of WAL segment in bytes. + wal_seg_size: usize, /// Written to disk, but possibly still in the cache and not fully persisted. /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. @@ -161,25 +156,47 @@ pub struct PhysicalStorage { } impl PhysicalStorage { - pub fn new(zttid: &TenantTimelineId, conf: &SafeKeeperConf) -> PhysicalStorage { - let timeline_dir = conf.timeline_dir(zttid); - PhysicalStorage { - metrics: WalStorageMetrics::new(zttid), - zttid: *zttid, + /// Create new storage. If commit_lsn is not zero, flush_lsn is tried to be restored from + /// the disk. Otherwise, all LSNs are set to zero. + pub fn new( + ttid: &TenantTimelineId, + conf: &SafeKeeperConf, + state: &SafeKeeperState, + ) -> Result { + let timeline_dir = conf.timeline_dir(ttid); + let wal_seg_size = state.server.wal_seg_size as usize; + + // Find out where stored WAL ends, starting at commit_lsn which is a + // known recent record boundary (unless we don't have WAL at all). + let write_lsn = if state.commit_lsn == Lsn(0) { + Lsn(0) + } else { + find_end_of_wal(&timeline_dir, wal_seg_size, state.commit_lsn)? + }; + + // TODO: do we really know that write_lsn is fully flushed to disk? + // If not, maybe it's better to call fsync() here to be sure? + let flush_lsn = write_lsn; + + info!( + "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", + ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, + ); + if flush_lsn < state.commit_lsn || flush_lsn < state.peer_horizon_lsn { + warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", ttid.timeline_id); + } + + Ok(PhysicalStorage { + metrics: WalStorageMetrics::new(ttid), timeline_dir, conf: conf.clone(), - wal_seg_size: None, - write_lsn: Lsn(0), - write_record_lsn: Lsn(0), - flush_record_lsn: Lsn(0), - decoder: WalStreamDecoder::new(Lsn(0)), + wal_seg_size, + write_lsn, + write_record_lsn: write_lsn, + flush_record_lsn: flush_lsn, + decoder: WalStreamDecoder::new(write_lsn), file: None, - } - } - - /// Wrapper for flush_lsn updates that also updates metrics. - fn update_flush_lsn(&mut self) { - self.flush_record_lsn = self.write_record_lsn; + }) } /// Call fdatasync if config requires so. @@ -204,9 +221,9 @@ impl PhysicalStorage { /// Open or create WAL segment file. Caller must call seek to the wanted position. /// Returns `file` and `is_partial`. 
- fn open_or_create(&self, segno: XLogSegNo, wal_seg_size: usize) -> Result<(File, bool)> { + fn open_or_create(&self, segno: XLogSegNo) -> Result<(File, bool)> { let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; // Try to open already completed segment if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { @@ -222,24 +239,18 @@ impl PhysicalStorage { .open(&wal_file_partial_path) .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?; - write_zeroes(&mut file, wal_seg_size)?; + write_zeroes(&mut file, self.wal_seg_size)?; self.fsync_file(&mut file)?; Ok((file, true)) } } /// Write WAL bytes, which are known to be located in a single WAL segment. - fn write_in_segment( - &mut self, - segno: u64, - xlogoff: usize, - buf: &[u8], - wal_seg_size: usize, - ) -> Result<()> { + fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> { let mut file = if let Some(file) = self.file.take() { file } else { - let (mut file, is_partial) = self.open_or_create(segno, wal_seg_size)?; + let (mut file, is_partial) = self.open_or_create(segno)?; assert!(is_partial, "unexpected write into non-partial segment file"); file.seek(SeekFrom::Start(xlogoff as u64))?; file @@ -247,13 +258,13 @@ impl PhysicalStorage { file.write_all(buf)?; - if xlogoff + buf.len() == wal_seg_size { + if xlogoff + buf.len() == self.wal_seg_size { // If we reached the end of a WAL segment, flush and close it. self.fdatasync_file(&mut file)?; // Rename partial file to completed file let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; fs::rename(&wal_file_partial_path, &wal_file_path)?; } else { // otherwise, file can be reused later @@ -269,10 +280,6 @@ impl PhysicalStorage { /// /// Updates `write_lsn`. fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> { - let wal_seg_size = self - .wal_seg_size - .ok_or_else(|| anyhow!("wal_seg_size is not initialized"))?; - if self.write_lsn != pos { // need to flush the file before discarding it if let Some(mut file) = self.file.take() { @@ -284,17 +291,17 @@ impl PhysicalStorage { while !buf.is_empty() { // Extract WAL location for this block - let xlogoff = self.write_lsn.segment_offset(wal_seg_size) as usize; - let segno = self.write_lsn.segment_number(wal_seg_size); + let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size) as usize; + let segno = self.write_lsn.segment_number(self.wal_seg_size); // If crossing a WAL boundary, only write up until we reach wal segment size. - let bytes_write = if xlogoff + buf.len() > wal_seg_size { - wal_seg_size - xlogoff + let bytes_write = if xlogoff + buf.len() > self.wal_seg_size { + self.wal_seg_size - xlogoff } else { buf.len() }; - self.write_in_segment(segno, xlogoff, &buf[..bytes_write], wal_seg_size)?; + self.write_in_segment(segno, xlogoff, &buf[..bytes_write])?; self.write_lsn += bytes_write as u64; buf = &buf[bytes_write..]; } @@ -309,53 +316,6 @@ impl Storage for PhysicalStorage { self.flush_record_lsn } - /// Storage needs to know wal_seg_size to know which segment to read/write, but - /// wal_seg_size is not always known at the moment of storage creation. This method - /// allows to postpone its initialization. 
- fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()> { - if state.server.wal_seg_size == 0 { - // wal_seg_size is still unknown. This is dead path normally, should - // be used only in tests. - return Ok(()); - } - - if let Some(wal_seg_size) = self.wal_seg_size { - // physical storage is already initialized - assert_eq!(wal_seg_size, state.server.wal_seg_size as usize); - return Ok(()); - } - - // initialize physical storage - let wal_seg_size = state.server.wal_seg_size as usize; - self.wal_seg_size = Some(wal_seg_size); - - // Find out where stored WAL ends, starting at commit_lsn which is a - // known recent record boundary (unless we don't have WAL at all). - self.write_lsn = if state.commit_lsn == Lsn(0) { - Lsn(0) - } else { - find_end_of_wal(&self.timeline_dir, wal_seg_size, state.commit_lsn)? - }; - - self.write_record_lsn = self.write_lsn; - - // TODO: do we really know that write_lsn is fully flushed to disk? - // If not, maybe it's better to call fsync() here to be sure? - self.update_flush_lsn(); - - info!( - "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", - self.zttid.timeline_id, self.flush_record_lsn, state.commit_lsn, state.peer_horizon_lsn, - ); - if self.flush_record_lsn < state.commit_lsn - || self.flush_record_lsn < state.peer_horizon_lsn - { - warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", self.zttid.timeline_id); - } - - Ok(()) - } - /// Write WAL to disk. fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { // Disallow any non-sequential writes, which can result in gaps or overwrites. @@ -419,80 +379,83 @@ impl Storage for PhysicalStorage { // We have unflushed data (write_lsn != flush_lsn), but no file. // This should only happen if last file was fully written and flushed, // but haven't updated flush_lsn yet. - assert!(self.write_lsn.segment_offset(self.wal_seg_size.unwrap()) == 0); + if self.write_lsn.segment_offset(self.wal_seg_size) != 0 { + bail!( + "unexpected unflushed data with no open file, write_lsn={}, flush_lsn={}", + self.write_lsn, + self.flush_record_lsn + ); + } } // everything is flushed now, let's update flush_lsn - self.update_flush_lsn(); + self.flush_record_lsn = self.write_record_lsn; Ok(()) } /// Truncate written WAL by removing all WAL segments after the given LSN. /// end_pos must point to the end of the WAL record. fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { - let wal_seg_size = self - .wal_seg_size - .ok_or_else(|| anyhow!("wal_seg_size is not initialized"))?; - // Streaming must not create a hole, so truncate cannot be called on non-written lsn - assert!(self.write_lsn == Lsn(0) || self.write_lsn >= end_pos); + if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { + bail!( + "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}", + self.write_lsn, + end_pos + ); + } // Close previously opened file, if any if let Some(mut unflushed_file) = self.file.take() { self.fdatasync_file(&mut unflushed_file)?; } - let xlogoff = end_pos.segment_offset(wal_seg_size) as usize; - let segno = end_pos.segment_number(wal_seg_size); - let (mut file, is_partial) = self.open_or_create(segno, wal_seg_size)?; + let xlogoff = end_pos.segment_offset(self.wal_seg_size) as usize; + let segno = end_pos.segment_number(self.wal_seg_size); + + // Remove all segments after the given LSN. 
+ remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno)?; + + let (mut file, is_partial) = self.open_or_create(segno)?; // Fill end with zeroes file.seek(SeekFrom::Start(xlogoff as u64))?; - write_zeroes(&mut file, wal_seg_size - xlogoff)?; + write_zeroes(&mut file, self.wal_seg_size - xlogoff)?; self.fdatasync_file(&mut file)?; if !is_partial { // Make segment partial once again let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; fs::rename(&wal_file_path, &wal_file_partial_path)?; } - // Remove all subsequent segments - let mut segno = segno; - loop { - segno += 1; - let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; - // TODO: better use fs::try_exists which is currently available only in nightly build - if wal_file_path.exists() { - fs::remove_file(&wal_file_path)?; - } else if wal_file_partial_path.exists() { - fs::remove_file(&wal_file_partial_path)?; - } else { - break; - } - } - // Update LSNs self.write_lsn = end_pos; self.write_record_lsn = end_pos; - self.update_flush_lsn(); + self.flush_record_lsn = end_pos; Ok(()) } fn remove_up_to(&self) -> Box Result<()>> { let timeline_dir = self.timeline_dir.clone(); - let wal_seg_size = self.wal_seg_size.unwrap(); + let wal_seg_size = self.wal_seg_size; Box::new(move |segno_up_to: XLogSegNo| { - remove_up_to(&timeline_dir, wal_seg_size, segno_up_to) + remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to) }) } } -/// Remove all WAL segments in timeline_dir <= given segno. -fn remove_up_to(timeline_dir: &Path, wal_seg_size: usize, segno_up_to: XLogSegNo) -> Result<()> { +/// Remove all WAL segments in timeline_dir that match the given predicate. +fn remove_segments_from_disk( + timeline_dir: &Path, + wal_seg_size: usize, + remove_predicate: impl Fn(XLogSegNo) -> bool, +) -> Result<()> { let mut n_removed = 0; + let mut min_removed = u64::MAX; + let mut max_removed = u64::MIN; + for entry in fs::read_dir(&timeline_dir)? 
{ let entry = entry?; let entry_path = entry.path(); @@ -504,19 +467,21 @@ fn remove_up_to(timeline_dir: &Path, wal_seg_size: usize, segno_up_to: XLogSegNo continue; } let (segno, _) = XLogFromFileName(fname_str, wal_seg_size); - if segno <= segno_up_to { + if remove_predicate(segno) { remove_file(entry_path)?; n_removed += 1; + min_removed = min(min_removed, segno); + max_removed = max(max_removed, segno); } } } - let segno_from = segno_up_to - n_removed + 1; - info!( - "removed {} WAL segments [{}; {}]", - n_removed, - XLogFileName(PG_TLI, segno_from, wal_seg_size), - XLogFileName(PG_TLI, segno_up_to, wal_seg_size) - ); + + if n_removed > 0 { + info!( + "removed {} WAL segments [{}; {}]", + n_removed, min_removed, max_removed + ); + } Ok(()) } @@ -526,8 +491,10 @@ pub struct WalReader { pos: Lsn, wal_segment: Option>>, - enable_remote_read: bool, // S3 will be used to read WAL if LSN is not available locally + enable_remote_read: bool, + + // We don't have WAL locally if LSN is less than local_start_lsn local_start_lsn: Lsn, } From 7863c4a702617b2af5917d6a273a675395455e69 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 2022 08:37:06 +0300 Subject: [PATCH 0798/1022] Regenerate Hakari files, add a CI check for that --- .github/workflows/codestyle.yml | 10 +++++++++- Cargo.lock | 22 +++------------------- libs/postgres_ffi/wal_craft/Cargo.toml | 1 + workspace_hack/Cargo.toml | 6 ++---- 4 files changed, 15 insertions(+), 24 deletions(-) diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 237cf81205..5220258ef0 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -30,7 +30,7 @@ jobs: # this is all we need to install our toolchain later via rust-toolchain.toml # so don't install any toolchain explicitly. 
os: [ubuntu-latest, macos-latest] - timeout-minutes: 60 + timeout-minutes: 90 name: check codestyle rust and postgres runs-on: ${{ matrix.os }} @@ -108,6 +108,14 @@ jobs: target key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust + # https://github.com/facebookincubator/cargo-guppy/tree/main/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check every project module is covered by Hakari + run: | + cargo install cargo-hakari + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + shell: bash -euxo pipefail {0} + - name: Run cargo clippy run: ./run_clippy.sh diff --git a/Cargo.lock b/Cargo.lock index 2f4a57b698..3ce0ce465f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -377,13 +377,9 @@ version = "2.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" dependencies = [ - "ansi_term", - "atty", "bitflags", - "strsim 0.8.0", "textwrap 0.11.0", "unicode-width", - "vec_map", ] [[package]] @@ -396,7 +392,7 @@ dependencies = [ "bitflags", "clap_lex", "indexmap", - "strsim 0.10.0", + "strsim", "termcolor", "textwrap 0.15.0", ] @@ -746,7 +742,7 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim 0.10.0", + "strsim", "syn", ] @@ -3023,12 +3019,6 @@ dependencies = [ "unicode-normalization", ] -[[package]] -name = "strsim" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" - [[package]] name = "strsim" version = "0.10.0" @@ -3685,12 +3675,6 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" -[[package]] -name = "vec_map" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" - [[package]] name = "version_check" version = "0.9.4" @@ -3709,6 +3693,7 @@ dependencies = [ "postgres", "postgres_ffi", "tempfile", + "workspace_hack", ] [[package]] @@ -3942,7 +3927,6 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 2.34.0", "either", "fail", "futures-channel", diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index f848ac1273..88466737ed 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -14,3 +14,4 @@ once_cell = "1.13.0" postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres_ffi = { path = "../" } tempfile = "3.2" +workspace_hack = { version = "0.1", path = "../../../workspace_hack" } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 096b3a5d70..96594bbf96 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -19,7 +19,6 @@ anyhow = { version = "1", features = ["backtrace", "std"] } bstr = { version = "0.2", features = ["lazy_static", "regex-automata", "serde", "serde1", "serde1-nostd", "std", "unicode"] } bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } -clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } fail 
= { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } @@ -46,16 +45,15 @@ regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } -tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "winapi"] } +tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } -tracing-core = { version = "0.1", features = ["once_cell", "std", "valuable"] } +tracing-core = { version = "0.1", features = ["once_cell", "std"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } -clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } From a5019bf771e878b8e3f02563d7803580450ff39f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Sep 2022 12:38:47 +0300 Subject: [PATCH 0799/1022] Use a simpler way to set extra options for benchmark test. Commit 43a4f7173e fixed the case that there are extra options in the connection string, but broke it in the case when there are not. Fix that. But on second thoughts, it's more straightforward set the options with ALTER DATABASE, so change the workflow yaml file to do that instead. 
--- .github/workflows/benchmarking.yml | 13 +++++++++---- test_runner/performance/test_perf_pgbench.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 0430f0b9c0..4e28223c18 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -183,12 +183,9 @@ jobs: neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; - neon-captest-new) + neon-captest-new | neon-captest-prefetch) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; - neon-captest-prefetch) - CONNSTR=${{ steps.create-neon-project.outputs.dsn }}?options=-cenable_seqscan_prefetch%3Don%20-cseqscan_prefetch_buffers%3D10 - ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }} ;; @@ -204,6 +201,14 @@ jobs: env: PLATFORM: ${{ matrix.platform }} + - name: Set database options + if: matrix.platform == 'neon-captest-prefetch' + run: | + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET enable_seqscan_prefetch=on" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET seqscan_prefetch_buffers=10" + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + - name: Benchmark init uses: ./.github/actions/run-python-test-set with: diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index d9bf237e49..e167ddaafa 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -84,7 +84,7 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P if workload_type == PgBenchLoadType.INIT: # Run initialize - options = "-cstatement_timeout=1h " + env.pg.default_options["options"] + options = "-cstatement_timeout=1h " + env.pg.default_options.get("options", "") init_pgbench(env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options=options)]) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: From 4b25b9652a024dd876259088ef8fad56e708ba4d Mon Sep 17 00:00:00 2001 From: sharnoff Date: Tue, 20 Sep 2022 11:06:31 -0700 Subject: [PATCH 0800/1022] Rename more zid-like idents (#2480) Follow-up to PR #2433 (b8eb908a). There's still a few more unresolved locations that have been left as-is for the same compatibility reasons in the original PR. --- libs/utils/benches/benchmarks.rs | 12 +++---- libs/utils/src/pq_proto.rs | 46 +++++++++++++------------- pageserver/src/tenant.rs | 2 +- pgxn/neon/libpagestore.c | 12 +++---- pgxn/neon/pagestore_client.h | 6 ++-- pgxn/neon/pagestore_smgr.c | 12 +++---- safekeeper/src/control_file_upgrade.rs | 2 +- 7 files changed, 46 insertions(+), 46 deletions(-) diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index badcb5774e..98d839ca55 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -3,20 +3,20 @@ use criterion::{criterion_group, criterion_main, Criterion}; use utils::id; -pub fn bench_zid_stringify(c: &mut Criterion) { +pub fn bench_id_stringify(c: &mut Criterion) { // Can only use public methods. - let ztl = id::TenantTimelineId::generate(); + let ttid = id::TenantTimelineId::generate(); - c.bench_function("zid.to_string", |b| { + c.bench_function("id.to_string", |b| { b.iter(|| { // FIXME measurement overhead? 
//for _ in 0..1000 { - // ztl.tenant_id.to_string(); + // ttid.tenant_id.to_string(); //} - ztl.tenant_id.to_string(); + ttid.tenant_id.to_string(); }) }); } -criterion_group!(benches, bench_zid_stringify); +criterion_group!(benches, bench_id_stringify); criterion_main!(benches); diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index dde76039d7..21952ab87e 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -931,7 +931,7 @@ impl ReplicationFeedback { // Deserialize ReplicationFeedback message pub fn parse(mut buf: Bytes) -> ReplicationFeedback { - let mut zf = ReplicationFeedback::empty(); + let mut rf = ReplicationFeedback::empty(); let nfields = buf.get_u8(); for _ in 0..nfields { let key = read_cstr(&mut buf).unwrap(); @@ -939,31 +939,31 @@ impl ReplicationFeedback { b"current_timeline_size" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.current_timeline_size = buf.get_u64(); + rf.current_timeline_size = buf.get_u64(); } b"ps_writelsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_writelsn = buf.get_u64(); + rf.ps_writelsn = buf.get_u64(); } b"ps_flushlsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_flushlsn = buf.get_u64(); + rf.ps_flushlsn = buf.get_u64(); } b"ps_applylsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_applylsn = buf.get_u64(); + rf.ps_applylsn = buf.get_u64(); } b"ps_replytime" => { let len = buf.get_i32(); assert_eq!(len, 8); let raw_time = buf.get_i64(); if raw_time > 0 { - zf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); + rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); } else { - zf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); + rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); } } _ => { @@ -976,8 +976,8 @@ impl ReplicationFeedback { } } } - trace!("ReplicationFeedback parsed is {:?}", zf); - zf + trace!("ReplicationFeedback parsed is {:?}", rf); + rf } } @@ -987,29 +987,29 @@ mod tests { #[test] fn test_replication_feedback_serialization() { - let mut zf = ReplicationFeedback::empty(); - // Fill zf with some values - zf.current_timeline_size = 12345678; + let mut rf = ReplicationFeedback::empty(); + // Fill rf with some values + rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. - zf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); - zf.serialize(&mut data).unwrap(); + rf.serialize(&mut data).unwrap(); - let zf_parsed = ReplicationFeedback::parse(data.freeze()); - assert_eq!(zf, zf_parsed); + let rf_parsed = ReplicationFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); } #[test] fn test_replication_feedback_unknown_key() { - let mut zf = ReplicationFeedback::empty(); - // Fill zf with some values - zf.current_timeline_size = 12345678; + let mut rf = ReplicationFeedback::empty(); + // Fill rf with some values + rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. 
- zf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); - zf.serialize(&mut data).unwrap(); + rf.serialize(&mut data).unwrap(); // Add an extra field to the buffer and adjust number of keys if let Some(first) = data.first_mut() { @@ -1021,8 +1021,8 @@ mod tests { data.put_u64(42); // Parse serialized data and check that new field is not parsed - let zf_parsed = ReplicationFeedback::parse(data.freeze()); - assert_eq!(zf, zf_parsed); + let rf_parsed = ReplicationFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); } #[test] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f56f10d7ea..204caf6dfa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -132,7 +132,7 @@ pub enum TenantState { /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. impl Tenant { - /// Get Timeline handle for given zenith timeline ID. + /// Get Timeline handle for given Neon timeline ID. /// This function is idempotent. It doesn't change internal state in any way. pub fn get_timeline(&self, timeline_id: TimelineId) -> anyhow::Result> { self.timelines diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 296865838d..9cd2a86941 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -183,7 +183,7 @@ pageserver_send(NeonRequest * request) if (!connected) pageserver_connect(); - req_buff = zm_pack_request(request); + req_buff = nm_pack_request(request); /* * Send request. @@ -204,7 +204,7 @@ pageserver_send(NeonRequest * request) if (message_level_is_interesting(PageStoreTrace)) { - char *msg = zm_to_string((NeonMessage *) request); + char *msg = nm_to_string((NeonMessage *) request); neon_log(PageStoreTrace, "sent request: %s", msg); pfree(msg); @@ -230,12 +230,12 @@ pageserver_receive(void) else if (resp_buff.len == -2) neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); } - resp = zm_unpack_response(&resp_buff); + resp = nm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); if (message_level_is_interesting(PageStoreTrace)) { - char *msg = zm_to_string((NeonMessage *) resp); + char *msg = nm_to_string((NeonMessage *) resp); neon_log(PageStoreTrace, "got response: %s", msg); pfree(msg); @@ -282,9 +282,9 @@ page_server_api api = { static bool check_neon_id(char **newval, void **extra, GucSource source) { - uint8 zid[16]; + uint8 id[16]; - return **newval == '\0' || HexDecodeString(zid, *newval, 16); + return **newval == '\0' || HexDecodeString(id, *newval, 16); } static char * diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 633c7b465c..e0cda11b63 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -128,9 +128,9 @@ typedef struct * message */ } NeonErrorResponse; -extern StringInfoData zm_pack_request(NeonRequest * msg); -extern NeonResponse * zm_unpack_response(StringInfo s); -extern char *zm_to_string(NeonMessage * msg); +extern StringInfoData nm_pack_request(NeonRequest * msg); +extern NeonResponse * nm_unpack_response(StringInfo s); +extern char *nm_to_string(NeonMessage * msg); /* * API diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 8e6dd373b0..1187550f2a 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -160,7 +160,7 @@ page_server_request(void const *req) StringInfoData 
-zm_pack_request(NeonRequest * msg) +nm_pack_request(NeonRequest * msg) { StringInfoData s; @@ -235,7 +235,7 @@ zm_pack_request(NeonRequest * msg) } NeonResponse * -zm_unpack_response(StringInfo s) +nm_unpack_response(StringInfo s) { NeonMessageTag tag = pq_getmsgbyte(s); NeonResponse *resp = NULL; @@ -329,7 +329,7 @@ zm_unpack_response(StringInfo s) /* dump to json for debugging / error reporting purposes */ char * -zm_to_string(NeonMessage * msg) +nm_to_string(NeonMessage * msg) { StringInfoData s; @@ -632,7 +632,7 @@ neon_init(void) * It may cause problems with XLogFlush. So return pointer backward to the origin of the page. */ static XLogRecPtr -zm_adjust_lsn(XLogRecPtr lsn) +nm_adjust_lsn(XLogRecPtr lsn) { /* * If lsn points to the beging of first record on page or segment, then @@ -685,7 +685,7 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); - lsn = zm_adjust_lsn(lsn); + lsn = nm_adjust_lsn(lsn); /* * Is it possible that the last-written LSN is ahead of last flush @@ -1569,7 +1569,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ lsn = GetXLogInsertRecPtr(); - lsn = zm_adjust_lsn(lsn); + lsn = nm_adjust_lsn(lsn); /* * Flush it, too. We don't actually care about it here, but let's uphold diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 87204d6b49..d8434efb20 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -167,7 +167,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), }); - // migrate to hexing some zids + // migrate to hexing some ids } else if version == 2 { info!("reading safekeeper control file version {}", version); let oldstate = SafeKeeperStateV2::des(&buf[..buf.len()])?; From 4a3b3ff11d89d02300041e32f43847110637f2e0 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Tue, 20 Sep 2022 11:28:12 -0700 Subject: [PATCH 0801/1022] Move testing pageserver libpq cmds to HTTP api (#2429) Closes #2422. The APIs have been feature gated with the `testing_api!` macro so that they return 400s when support hasn't been compiled in. 
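
As a quick orientation before the diffs, here is a hedged sketch of what the new testing endpoints look like from a plain HTTP client. The paths and JSON bodies mirror the routes and models added below; the pageserver address, tenant id, and timeline id are placeholders, not values from this patch.

```python
# Minimal sketch using `requests`; host/port and ids are placeholders.
import requests

base = "http://localhost:9898/v1"  # placeholder pageserver HTTP address
tenant_id = "0" * 32               # placeholder hex ids
timeline_id = "1" * 32

# Each call returns 400 unless the pageserver was built with `--features testing`.
requests.put(
    f"{base}/failpoints",
    json=[{"name": "before-timeline-gc", "actions": "sleep(2000)"}],
).raise_for_status()

requests.put(
    f"{base}/tenant/{tenant_id}/timeline/{timeline_id}/do_gc",
    json={"gc_horizon": 0},
).raise_for_status()

requests.put(f"{base}/tenant/{tenant_id}/timeline/{timeline_id}/compact").raise_for_status()
requests.put(f"{base}/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint").raise_for_status()
```

The Python test fixtures in this patch wrap exactly these PUTs (`configure_failpoints`, `timeline_gc`, `timeline_compact`, `timeline_checkpoint`), replacing the old `do_gc`, `compact`, `checkpoint`, and `failpoints` libpq commands removed from page_service.rs.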
--- .cargo/config.toml | 2 +- .github/workflows/build_and_test.yml | 4 +- README.md | 7 +- pageserver/Cargo.toml | 6 +- pageserver/src/bin/pageserver.rs | 4 +- pageserver/src/http/models.rs | 18 +++ pageserver/src/http/routes.rs | 141 ++++++++++++++++++ pageserver/src/page_service.rs | 115 +------------- pageserver/src/repository.rs | 11 +- test_runner/README.md | 4 +- test_runner/fixtures/neon_fixtures.py | 57 +++++++ test_runner/regress/test_ancestor_branch.py | 7 +- test_runner/regress/test_basebackup_error.py | 3 +- test_runner/regress/test_branch_and_gc.py | 11 +- test_runner/regress/test_branch_behind.py | 13 +- test_runner/regress/test_broken_timeline.py | 3 +- test_runner/regress/test_gc_aggressive.py | 12 +- test_runner/regress/test_import.py | 5 +- test_runner/regress/test_old_request_lsn.py | 9 +- test_runner/regress/test_pitr_gc.py | 15 +- test_runner/regress/test_readonly_node.py | 3 +- test_runner/regress/test_recovery.py | 41 ++--- test_runner/regress/test_remote_storage.py | 5 +- test_runner/regress/test_tenant_detach.py | 23 +-- test_runner/regress/test_tenant_relocation.py | 30 ++-- test_runner/regress/test_tenants.py | 3 +- .../test_tenants_with_remote_storage.py | 2 +- test_runner/regress/test_timeline_size.py | 21 ++- test_runner/regress/test_wal_acceptor.py | 4 +- 29 files changed, 352 insertions(+), 227 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index d70d57a817..c40783bc1b 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -13,4 +13,4 @@ opt-level = 3 opt-level = 1 [alias] -build_testing = ["build", "--features", "failpoints"] +build_testing = ["build", "--features", "testing"] diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0b6cb21120..44db968753 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -100,11 +100,11 @@ jobs: run: | if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FEATURES="--features failpoints" + CARGO_FEATURES="--features testing" CARGO_FLAGS="--locked --timings $CARGO_FEATURES" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" - CARGO_FEATURES="--features failpoints,profiling" + CARGO_FEATURES="--features testing,profiling" CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV diff --git a/README.md b/README.md index 03ed57a0fa..dc469c36b1 100644 --- a/README.md +++ b/README.md @@ -222,7 +222,12 @@ Ensure your dependencies are installed as described [here](https://github.com/ne ```sh git clone --recursive https://github.com/neondatabase/neon.git -make # builds also postgres and installs it to ./pg_install + +# either: +CARGO_BUILD_FLAGS="--features=testing" make +# or: +make debug + ./scripts/pytest ``` diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index ce09e788bd..85ece97d9b 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -5,10 +5,10 @@ edition = "2021" [features] default = [] +# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, +# which adds some runtime cost to run tests on outage conditions +testing = ["fail/failpoints"] -# Feature that enables a special API, fail_point! 
macro (adds some runtime cost) -# to run tests on outage conditions -failpoints = ["fail/failpoints"] profiling = ["pprof"] [dependencies] diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 92d5eab379..fb79ad3945 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -87,8 +87,8 @@ fn main() -> anyhow::Result<()> { if arg_matches.is_present("enabled-features") { let features: &[&str] = &[ - #[cfg(feature = "failpoints")] - "failpoints", + #[cfg(feature = "testing")] + "testing", #[cfg(feature = "profiling")] "profiling", ]; diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index c0dc5b9677..2d7d560d2a 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -160,3 +160,21 @@ pub struct TimelineInfo { pub local: Option, pub remote: Option, } + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. + pub actions: String, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct TimelineGcRequest { + pub gc_horizon: Option, +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2e49429f38..bfc9e4462b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -29,6 +29,12 @@ use utils::{ lsn::Lsn, }; +// Imports only used for testing APIs +#[cfg(feature = "testing")] +use super::models::{ConfigureFailpointsRequest, TimelineGcRequest}; +#[cfg(feature = "testing")] +use crate::CheckpointConfig; + struct State { conf: &'static PageServerConf, auth: Option>, @@ -661,6 +667,103 @@ async fn tenant_config_handler(mut request: Request) -> Result) -> Result, ApiError> { + if !fail::has_failpoints() { + return Err(ApiError::BadRequest( + "Cannot manage failpoints because pageserver was compiled without failpoints support" + .to_owned(), + )); + } + + let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; + for fp in failpoints { + info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = if fp.actions == "exit" { + fail::cfg_callback(fp.name, || { + info!("Exit requested by failpoint"); + std::process::exit(1); + }) + } else { + fail::cfg(fp.name, &fp.actions) + }; + + if let Err(err_msg) = cfg_result { + return Err(ApiError::BadRequest(format!( + "Failed to configure failpoints: {err_msg}" + ))); + } + } + + json_response(StatusCode::OK, ()) +} + +// Run GC immediately on given timeline. +// FIXME: This is just for tests. See test_runner/regress/test_gc.py. +// This probably should require special authentication or a global flag to +// enable, I don't think we want to or need to allow regular clients to invoke +// GC. 
+// @hllinnaka in commits ec44f4b29, 3aca717f3 +#[cfg(feature = "testing")] +async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + // FIXME: currently this will return a 500 error on bad tenant id; it should be 4XX + let repo = tenant_mgr::get_tenant(tenant_id, false)?; + let gc_req: TimelineGcRequest = json_request(&mut request).await?; + + let _span_guard = + info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered(); + let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| repo.get_gc_horizon()); + + // Use tenant's pitr setting + let pitr = repo.get_pitr_interval(); + let result = repo.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; + json_response(StatusCode::OK, result) +} + +// Run compaction immediately on given timeline. +// FIXME This is just for tests. Don't expect this to be exposed to +// the users or the api. +// @dhammika in commit a0781f229 +#[cfg(feature = "testing")] +async fn timeline_compact_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let repo = tenant_mgr::get_tenant(tenant_id, true)?; + // FIXME: currently this will return a 500 error on bad timeline id; it should be 4XX + let timeline = repo.get_timeline(timeline_id).with_context(|| { + format!("No timeline {timeline_id} in repository for tenant {tenant_id}") + })?; + timeline.compact()?; + + json_response(StatusCode::OK, ()) +} + +// Run checkpoint immediately on given timeline. +#[cfg(feature = "testing")] +async fn timeline_checkpoint_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let repo = tenant_mgr::get_tenant(tenant_id, true)?; + // FIXME: currently this will return a 500 error on bad timeline id; it should be 4XX + let timeline = repo.get_timeline(timeline_id).with_context(|| { + format!("No timeline {timeline_id} in repository for tenant {tenant_id}") + })?; + timeline.checkpoint(CheckpointConfig::Forced)?; + + json_response(StatusCode::OK, ()) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -687,12 +790,38 @@ pub fn make_router( })) } + macro_rules! testing_api { + ($handler_desc:literal, $handler:path $(,)?) 
=> {{ + #[cfg(not(feature = "testing"))] + async fn cfg_disabled(_req: Request) -> Result, ApiError> { + Err(ApiError::BadRequest( + concat!( + "Cannot ", + $handler_desc, + " because pageserver was compiled without testing APIs", + ) + .to_owned(), + )) + } + + #[cfg(feature = "testing")] + let handler = $handler; + #[cfg(not(feature = "testing"))] + let handler = cfg_disabled; + handler + }}; + } + Ok(router .data(Arc::new( State::new(conf, auth, remote_index, remote_storage) .context("Failed to initialize router state")?, )) .get("/v1/status", status_handler) + .put( + "/v1/failpoints", + testing_api!("manage failpoints", failpoints_handler), + ) .get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) .get("/v1/tenant/:tenant_id", tenant_status) @@ -705,6 +834,18 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler, ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", + testing_api!("run timeline GC", timeline_gc_handler), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/compact", + testing_api!("run timeline compaction", timeline_compact_handler), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", + testing_api!("run timeline checkpoint", timeline_checkpoint_handler), + ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1461a6d117..9e159f7391 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -27,7 +27,7 @@ use utils::{ lsn::Lsn, postgres_backend::AuthType, postgres_backend_async::{self, PostgresBackend}, - pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}, + pq_proto::{BeMessage, FeMessage, RowDescriptor}, simple_rcu::RcuReadGuard, }; @@ -1005,31 +1005,6 @@ impl postgres_backend_async::Handler for PageServerHandler { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("failpoints ") { - ensure!(fail::has_failpoints(), "Cannot manage failpoints because pageserver was compiled without failpoints support"); - - let (_, failpoints) = query_string.split_at("failpoints ".len()); - - for failpoint in failpoints.split(';') { - if let Some((name, actions)) = failpoint.split_once('=') { - info!("cfg failpoint: {} {}", name, actions); - - // We recognize one extra "action" that's not natively recognized - // by the failpoints crate: exit, to immediately kill the process - if actions == "exit" { - fail::cfg_callback(name, || { - info!("Exit requested by failpoint"); - std::process::exit(1); - }) - .unwrap(); - } else { - fail::cfg(name, actions).unwrap(); - } - } else { - bail!("Invalid failpoints format"); - } - } - pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("show ") { // show let (_, params_raw) = query_string.split_at("show ".len()); @@ -1072,94 +1047,6 @@ impl postgres_backend_async::Handler for PageServerHandler { Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), ]))? .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("do_gc ") { - // Run GC immediately on given timeline. - // FIXME: This is just for tests. See test_runner/regress/test_gc.py. 
- // This probably should require special authentication or a global flag to - // enable, I don't think we want to or need to allow regular clients to invoke - // GC. - - // do_gc - let re = Regex::new(r"^do_gc ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)([[:digit:]]+)?") - .unwrap(); - - let caps = re - .captures(query_string) - .with_context(|| format!("invalid do_gc: '{}'", query_string))?; - - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - - let _span_guard = - info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered(); - - let tenant = tenant_mgr::get_tenant(tenant_id, true)?; - - let gc_horizon: u64 = caps - .get(4) - .map(|h| h.as_str().parse()) - .unwrap_or_else(|| Ok(tenant.get_gc_horizon()))?; - - // Use tenant's pitr setting - let pitr = tenant.get_pitr_interval(); - let result = tenant.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; - pgb.write_message(&BeMessage::RowDescription(&[ - RowDescriptor::int8_col(b"layers_total"), - RowDescriptor::int8_col(b"layers_needed_by_cutoff"), - RowDescriptor::int8_col(b"layers_needed_by_pitr"), - RowDescriptor::int8_col(b"layers_needed_by_branches"), - RowDescriptor::int8_col(b"layers_not_updated"), - RowDescriptor::int8_col(b"layers_removed"), - RowDescriptor::int8_col(b"elapsed"), - ]))? - .write_message(&BeMessage::DataRow(&[ - Some(result.layers_total.to_string().as_bytes()), - Some(result.layers_needed_by_cutoff.to_string().as_bytes()), - Some(result.layers_needed_by_pitr.to_string().as_bytes()), - Some(result.layers_needed_by_branches.to_string().as_bytes()), - Some(result.layers_not_updated.to_string().as_bytes()), - Some(result.layers_removed.to_string().as_bytes()), - Some(result.elapsed.as_millis().to_string().as_bytes()), - ]))? - .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("compact ") { - // Run compaction immediately on given timeline. - // FIXME This is just for tests. Don't expect this to be exposed to - // the users or the api. - - // compact - let re = Regex::new(r"^compact ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap(); - - let caps = re - .captures(query_string) - .with_context(|| format!("Invalid compact: '{}'", query_string))?; - - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = get_local_timeline(tenant_id, timeline_id)?; - timeline.compact()?; - - pgb.write_message(&SINGLE_COL_ROWDESC)? - .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("checkpoint ") { - // Run checkpoint immediately on given timeline. - - // checkpoint - let re = Regex::new(r"^checkpoint ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap(); - - let caps = re - .captures(query_string) - .with_context(|| format!("invalid checkpoint command: '{}'", query_string))?; - - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = get_local_timeline(tenant_id, timeline_id)?; - - // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). - timeline.checkpoint(CheckpointConfig::Forced)?; - - pgb.write_message(&SINGLE_COL_ROWDESC)? 
- .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("get_lsn_by_timestamp ") { // Locate LSN of last transaction with timestamp less or equal than sppecified // TODO lazy static diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index f6ea9d8c5d..cfcc87a2ed 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -176,7 +176,7 @@ impl Value { /// /// Result of performing GC /// -#[derive(Default)] +#[derive(Default, Serialize)] pub struct GcResult { pub layers_total: u64, pub layers_needed_by_cutoff: u64, @@ -185,9 +185,18 @@ pub struct GcResult { pub layers_not_updated: u64, pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. + #[serde(serialize_with = "serialize_duration_as_millis")] pub elapsed: Duration, } +// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds +fn serialize_duration_as_millis(d: &Duration, serializer: S) -> Result +where + S: serde::Serializer, +{ + d.as_millis().serialize(serializer) +} + impl AddAssign for GcResult { fn add_assign(&mut self, other: Self) { self.layers_total += other.layers_total; diff --git a/test_runner/README.md b/test_runner/README.md index f17a4a5a5d..79b2418af6 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -6,9 +6,9 @@ Prerequisites: - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python) - Neon and Postgres binaries - See the root [README.md](/README.md) for build directions - If you want to test tests with failpoints, you would need to add `--features failpoints` to Rust code build commands. + If you want to test tests with test-only APIs, you would need to add `--features testing` to Rust code build commands. For convenience, repository cargo config contains `build_testing` alias, that serves as a subcommand, adding the required feature flags. - Usage example: `cargo build_testing --release` is equivalent to `cargo build --features failpoints --release` + Usage example: `cargo build_testing --release` is equivalent to `cargo build --features testing --release` - Tests can be run from the git tree; or see the environment variables below to run from other directories. 
- The neon git repo, including the postgres submodule diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0c03429f95..1e83ee3839 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -964,6 +964,24 @@ class NeonPageserverHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]) -> None: + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.put( + f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + def tenant_list(self) -> List[Dict[Any, Any]]: res = self.get(f"http://localhost:{self.port}/v1/tenant") self.verbose_error(res) @@ -1061,6 +1079,45 @@ class NeonPageserverHttpClient(requests.Session): assert res_json is None return res_json + def timeline_gc( + self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int] + ) -> dict[str, Any]: + log.info( + f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}" + ) + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc", + json={"gc_horizon": gc_horizon}, + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + return res_json + + def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId): + log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact" + ) + log.info(f"Got compact request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + + def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): + log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint" + ) + log.info(f"Got checkpoint request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index cb2621ff02..d7aebfb938 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -9,6 +9,7 @@ from fixtures.utils import query_scalar # def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() # Override defaults, 1M gc_horizon and 4M checkpoint_distance. # Extend compaction_period and gc_period to disable background compaction and gc. 
@@ -23,7 +24,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): } ) - env.pageserver.safe_psql("failpoints flush-frozen-before-sync=sleep(10000)") + pageserver_http.configure_failpoints(("flush-frozen-before-sync", "sleep(10000)")) pg_branch0 = env.postgres.create_start("main", tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() @@ -92,9 +93,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info(f"LSN after 300k rows: {lsn_300}") # Run compaction on branch1. - compact = f"compact {tenant} {branch1_timeline} {lsn_200}" + compact = f"compact {tenant} {branch1_timeline}" log.info(compact) - env.pageserver.safe_psql(compact) + pageserver_http.timeline_compact(tenant, branch1_timeline) assert query_scalar(branch0_cur, "SELECT count(*) FROM foo") == 100000 diff --git a/test_runner/regress/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py index 81a46ee2f0..94d3999d17 100644 --- a/test_runner/regress/test_basebackup_error.py +++ b/test_runner/regress/test_basebackup_error.py @@ -9,9 +9,10 @@ from fixtures.neon_fixtures import NeonEnv def test_basebackup_error(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_basebackup_error", "empty") + pageserver_http = env.pageserver.http_client() # Introduce failpoint - env.pageserver.safe_psql("failpoints basebackup-before-control-file=return") + pageserver_http.configure_failpoints(("basebackup-before-control-file", "return")) with pytest.raises(Exception, match="basebackup-before-control-file"): env.postgres.create_start("test_basebackup_error") diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index c8c5929066..12debe50eb 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -47,6 +47,7 @@ from fixtures.utils import query_scalar # could not find data for key ... at LSN ..., for request at LSN ... def test_branch_and_gc(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() tenant, _ = env.neon_cli.create_tenant( conf={ @@ -84,7 +85,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # Set the GC horizon so that lsn1 is inside the horizon, which means # we can create a new branch starting from lsn1. - env.pageserver.safe_psql(f"do_gc {tenant} {timeline_main} {lsn2 - lsn1 + 1024}") + pageserver_http_client.timeline_gc(tenant, timeline_main, lsn2 - lsn1 + 1024) env.neon_cli.create_branch( "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1 @@ -113,6 +114,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # For more details, see discussion in https://github.com/neondatabase/neon/pull/2101#issuecomment-1185273447. def test_branch_creation_before_gc(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() + # Disable background GC but set the `pitr_interval` to be small, so GC can delete something tenant, _ = env.neon_cli.create_tenant( conf={ @@ -147,10 +150,10 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the # branch creation task but the individual timeline GC iteration happens *after* # the branch creation task. 
- env.pageserver.safe_psql("failpoints before-timeline-gc=sleep(2000)") + pageserver_http_client.configure_failpoints(("before-timeline-gc", "sleep(2000)")) def do_gc(): - env.pageserver.safe_psql(f"do_gc {tenant} {b0} 0") + pageserver_http_client.timeline_gc(tenant, b0, 0) thread = threading.Thread(target=do_gc, daemon=True) thread.start() @@ -161,7 +164,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): time.sleep(1.0) # The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC. - with pytest.raises(Exception, match="invalid branch start lsn"): + with pytest.raises(Exception, match="invalid branch start lsn: .*"): env.neon_cli.create_branch("b1", "b0", tenant_id=tenant, ancestor_start_lsn=lsn) thread.join() diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index b0d0737172..0e2a8b346b 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -1,4 +1,3 @@ -import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -96,7 +95,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): assert pg.safe_psql("SELECT 1")[0][0] == 1 # branch at pre-initdb lsn - with pytest.raises(Exception, match="invalid branch start lsn"): + with pytest.raises(Exception, match="invalid branch start lsn: .*"): env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn=Lsn("0/42")) # branch at pre-ancestor lsn @@ -106,13 +105,11 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): ) # check that we cannot create branch based on garbage collected data - with env.pageserver.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - # call gc to advace latest_gc_cutoff_lsn - pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) + with env.pageserver.http_client() as pageserver_http: + gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) - with pytest.raises(Exception, match="invalid branch start lsn"): + with pytest.raises(Exception, match="invalid branch start lsn: .*"): # this gced_lsn is pretty random, so if gc is disabled this woudln't fail env.neon_cli.create_branch( "test_branch_create_fail", "test_branch_behind", ancestor_start_lsn=gced_lsn diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index fd81981b2b..7baa67935d 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -113,13 +113,14 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http = env.pageserver.http_client() tenant_id, _ = env.neon_cli.create_tenant() old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) # Introduce failpoint when creating a new timeline - env.pageserver.safe_psql("failpoints before-checkpoint-new-timeline=return") + pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return")) with pytest.raises(Exception, match="before-checkpoint-new-timeline"): _ = env.neon_cli.create_timeline("test_fix_broken_timelines", tenant_id) diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 88d4ad8a6e..332bef225f 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ 
b/test_runner/regress/test_gc_aggressive.py @@ -1,4 +1,5 @@ import asyncio +import concurrent.futures import random from fixtures.log_helper import log @@ -30,10 +31,15 @@ async def update_table(pg: Postgres): # Perform aggressive GC with 0 horizon async def gc(env: NeonEnv, timeline: TimelineId): - psconn = await env.pageserver.connect_async() + pageserver_http = env.pageserver.http_client() - while updates_performed < updates_to_perform: - await psconn.execute(f"do_gc {env.initial_tenant} {timeline} 0") + loop = asyncio.get_running_loop() + + with concurrent.futures.ThreadPoolExecutor() as pool: + while updates_performed < updates_to_perform: + await loop.run_in_executor( + pool, lambda: pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + ) # At the same time, run UPDATEs and GC diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 7b61b03b97..885a0dc26f 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -270,8 +270,7 @@ def _import( assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file) # Check that gc works - psconn = env.pageserver.connect() - pscur = psconn.cursor() - pscur.execute(f"do_gc {tenant} {timeline} 0") + pageserver_http = env.pageserver.http_client() + pageserver_http.timeline_gc(tenant, timeline, 0) return tar_output_file diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index c99e13f45f..3e387bb6cc 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,4 +1,3 @@ -import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.types import TimelineId @@ -29,8 +28,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Get the timeline ID of our branch. We need it for the 'do_gc' command timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) - psconn = env.pageserver.connect() - pscur = psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) + pageserver_http = env.pageserver.http_client() # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers. @@ -61,9 +59,8 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Make a lot of updates on a single row, generating a lot of WAL. Trigger # garbage collections so that the page server will remove old page versions. for i in range(10): - pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") - gcrow = pscur.fetchone() - print_gc_result(gcrow) + gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) for j in range(100): cur.execute("UPDATE foo SET val = val + 1 WHERE id = 1;") diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 57b2ee1c04..d8b7256577 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -1,6 +1,3 @@ -from contextlib import closing - -import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.types import TimelineId @@ -54,13 +51,11 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): log.info(f"LSN after 10000 rows: {debug_lsn} xid {debug_xid}") # run GC - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - pscur.execute(f"compact {env.initial_tenant} {timeline}") - # perform aggressive GC. 
Data still should be kept because of the PITR setting. - pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) + with env.pageserver.http_client() as pageserver_http: + pageserver_http.timeline_compact(env.initial_tenant, timeline) + # perform aggressive GC. Data still should be kept because of the PITR setting. + gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) # Branch at the point where only 100 rows were inserted # It must have been preserved by PITR setting diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 3be64e077f..dfa57aec25 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -106,6 +106,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): # Similar test, but with more data, and we force checkpoints def test_timetravel(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() env.neon_cli.create_branch("test_timetravel", "empty") pg = env.postgres.create_start("test_timetravel") @@ -136,7 +137,7 @@ def test_timetravel(neon_simple_env: NeonEnv): wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to force a new layer file - env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http_client.timeline_checkpoint(tenant_id, timeline_id) ##### Restart pageserver env.postgres.stop_all() diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 08c15d8f09..d0ba96e8e0 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -1,7 +1,6 @@ import time from contextlib import closing -import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -19,8 +18,8 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): f = env.neon_cli.pageserver_enabled_features() assert ( - "failpoints" in f["features"] - ), "Build pageserver with --features=failpoints option to run this test" + "testing" in f["features"] + ), "Build pageserver with --features=testing option to run this test" neon_env_builder.start() # Create a branch for us @@ -31,26 +30,28 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): with closing(pg.connect()) as conn: with conn.cursor() as cur: - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - # Create and initialize test table - cur.execute("CREATE TABLE foo(x bigint)") - cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))") + with env.pageserver.http_client() as pageserver_http: + # Create and initialize test table + cur.execute("CREATE TABLE foo(x bigint)") + cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))") - # Sleep for some time to let checkpoint create image layers - time.sleep(2) + # Sleep for some time to let checkpoint create image layers + time.sleep(2) - # Configure failpoints - pscur.execute( - "failpoints flush-frozen-before-sync=sleep(2000);checkpoint-after-sync=exit" - ) + # Configure failpoints + pageserver_http.configure_failpoints( + [ + ("flush-frozen-before-sync", "sleep(2000)"), + ("checkpoint-after-sync", "exit"), + ] + ) - # Do some updates until pageserver is crashed - try: - while True: - cur.execute("update foo set x=x+1") - except Exception as err: - log.info(f"Expected server crash 
{err}") + # Do some updates until pageserver is crashed + try: + while True: + cur.execute("update foo set x=x+1") + except Exception as err: + log.info(f"Expected server crash {err}") log.info("Wait before server restart") env.pageserver.stop() diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index cbe74cad5c..3e775b10b0 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -57,6 +57,7 @@ def test_remote_storage_backup_and_restore( ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") client = env.pageserver.http_client() @@ -80,7 +81,7 @@ def test_remote_storage_backup_and_restore( wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to be sure that data landed in remote storage - env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) log.info(f"waiting for checkpoint {checkpoint_number} upload") # wait until pageserver successfully uploaded a checkpoint to remote storage @@ -99,7 +100,7 @@ def test_remote_storage_backup_and_restore( env.pageserver.start() # Introduce failpoint in download - env.pageserver.safe_psql("failpoints remote-storage-download-pre-rename=return") + pageserver_http.configure_failpoints(("remote-storage-download-pre-rename", "return")) client.tenant_attach(tenant_id) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index e3c9a091f9..f18e6867a9 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -1,16 +1,21 @@ from threading import Thread -import psycopg2 import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + NeonPageserverApiException, + NeonPageserverHttpClient, +) from fixtures.types import TenantId, TimelineId -def do_gc_target(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): +def do_gc_target( + pageserver_http: NeonPageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId +): """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: - env.pageserver.safe_psql(f"do_gc {tenant_id} {timeline_id} 0") + pageserver_http.timeline_gc(tenant_id, timeline_id, 0) except Exception as e: log.error("do_gc failed: %s", e) @@ -44,13 +49,13 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # gc should not try to even start with pytest.raises( - expected_exception=psycopg2.DatabaseError, match="gc target timeline does not exist" + expected_exception=NeonPageserverApiException, match="gc target timeline does not exist" ): bogus_timeline_id = TimelineId.generate() - env.pageserver.safe_psql(f"do_gc {tenant_id} {bogus_timeline_id} 0") + pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) # try to concurrently run gc and detach - gc_thread = Thread(target=lambda: do_gc_target(env, tenant_id, timeline_id)) + gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id)) gc_thread.start() last_error = None @@ -73,6 +78,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() with pytest.raises( - 
expected_exception=psycopg2.DatabaseError, match=f"Tenant {tenant_id} not found" + expected_exception=NeonPageserverApiException, match=f"Tenant {tenant_id} not found" ): - env.pageserver.safe_psql(f"do_gc {tenant_id} {timeline_id} 0") + pageserver_http.timeline_gc(tenant_id, timeline_id, 0) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index aa7d92f1fd..2b01546198 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -147,14 +147,13 @@ def populate_branch( def ensure_checkpoint( - pageserver_cur, pageserver_http: NeonPageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, current_lsn: Lsn, ): # run checkpoint manually to be sure that data landed in remote storage - pageserver_cur.execute(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) # wait until pageserver successfully uploaded a checkpoint to remote storage wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) @@ -324,22 +323,19 @@ def test_tenant_relocation( # this requirement introduces a problem # if user creates a branch during migration # it wont appear on the new pageserver - with pg_cur(env.pageserver) as cur: - ensure_checkpoint( - cur, - pageserver_http=pageserver_http, - tenant_id=tenant_id, - timeline_id=timeline_id_main, - current_lsn=current_lsn_main, - ) + ensure_checkpoint( + pageserver_http=pageserver_http, + tenant_id=tenant_id, + timeline_id=timeline_id_main, + current_lsn=current_lsn_main, + ) - ensure_checkpoint( - cur, - pageserver_http=pageserver_http, - tenant_id=tenant_id, - timeline_id=timeline_id_second, - current_lsn=current_lsn_second, - ) + ensure_checkpoint( + pageserver_http=pageserver_http, + tenant_id=tenant_id, + timeline_id=timeline_id_second, + current_lsn=current_lsn_second, + ) log.info("inititalizing new pageserver") # bootstrap second pageserver diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 4500395c8f..52b9e6369c 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -19,7 +19,8 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ) initial_tenant_dirs = set([d for d in tenants_dir.iterdir()]) - neon_simple_env.pageserver.safe_psql("failpoints tenant-creation-before-tmp-rename=return") + pageserver_http = neon_simple_env.pageserver.http_client() + pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return")) with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"): _ = neon_simple_env.neon_cli.create_tenant() diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 85f371c845..83affac062 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -91,5 +91,5 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) # run final checkpoint manually to flush all the data to remote storage - env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 83018f46f5..979d1a107f 100644 
--- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -238,6 +238,7 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv): def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_checkpoint") pg = env.postgres.create_start("test_timeline_physical_size_post_checkpoint") @@ -251,7 +252,7 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -264,6 +265,7 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder ) env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction") pg = env.postgres.create_start("test_timeline_physical_size_post_compaction") @@ -278,8 +280,8 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") - env.pageserver.safe_psql(f"compact {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) + pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id) assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -290,6 +292,7 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='10m', gc_period='10m', pitr_interval='1s'}" env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc") pg = env.postgres.create_start("test_timeline_physical_size_post_gc") @@ -304,7 +307,7 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pg.safe_psql( """ @@ -315,9 +318,9 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"do_gc {env.initial_tenant} {new_timeline_id} 0") + pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -326,6 +329,7 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): # Test the metrics. 
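The metrics test just below scrapes the pageserver's Prometheus endpoint through the same HTTP client it now uses for checkpoints. As a rough sketch of what "parse the metric" amounts to, assuming `get_metrics()` returns the raw Prometheus text exposition (an assumption; the real test also filters for the timeline it created) and using a purely illustrative metric name:

from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv


def test_scrape_physical_size_metric(neon_simple_env: NeonEnv):
    env = neon_simple_env
    metrics = env.pageserver.http_client().get_metrics()
    for line in metrics.splitlines():
        # "pageserver_current_physical_size" is a hypothetical metric name.
        if line.startswith("pageserver_current_physical_size"):
            name_and_labels, value = line.rsplit(" ", 1)
            log.info(f"{name_and_labels} = {float(value)}")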
def test_timeline_size_metrics(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_size_metrics") pg = env.postgres.create_start("test_timeline_size_metrics") @@ -340,7 +344,7 @@ def test_timeline_size_metrics(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) # get the metrics and parse the metric for the current timeline's physical size metrics = env.pageserver.http_client().get_metrics() @@ -382,6 +386,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): random.seed(100) env = neon_simple_env + pageserver_http = env.pageserver.http_client() client = env.pageserver.http_client() tenant, timeline = env.neon_cli.create_tenant() @@ -405,7 +410,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, tenant, timeline) - env.pageserver.safe_psql(f"checkpoint {tenant} {timeline}") + pageserver_http.timeline_checkpoint(tenant, timeline) timeline_total_size += get_timeline_physical_size(timeline) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 089ed91c98..931de0f1e3 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -59,9 +59,7 @@ def wait_lsn_force_checkpoint( ) # force checkpoint to advance remote_consistent_lsn - with closing(ps.connect(**pageserver_conn_options)) as psconn: - with psconn.cursor() as pscur: - pscur.execute(f"checkpoint {tenant_id} {timeline_id}") + ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) # ensure that remote_consistent_lsn is advanced wait_for_upload( From 6fc719db13a1feec1fef4bd227147ea19e56cf0f Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 2022 07:52:39 +0300 Subject: [PATCH 0802/1022] Merge timelines.rs with tenant.rs --- pageserver/src/http/routes.rs | 7 +- pageserver/src/lib.rs | 1 - pageserver/src/tenant.rs | 324 ++++++++++++++++++++++++---------- pageserver/src/timelines.rs | 168 ------------------ 4 files changed, 233 insertions(+), 267 deletions(-) delete mode 100644 pageserver/src/timelines.rs diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bfc9e4462b..0c6f7927fa 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -15,7 +15,7 @@ use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant::{TenantState, Timeline}; use crate::tenant_config::TenantConfOpt; -use crate::{config::PageServerConf, tenant_mgr, timelines}; +use crate::{config::PageServerConf, tenant_mgr}; use utils::{ auth::JwtAuth, http::{ @@ -166,10 +166,9 @@ async fn timeline_create_handler(mut request: Request) -> Result TenantId { + self.tenant_id + } + /// Get Timeline handle for given Neon timeline ID. /// This function is idempotent. It doesn't change internal state in any way. pub fn get_timeline(&self, timeline_id: TimelineId) -> anyhow::Result> { @@ -142,8 +148,7 @@ impl Tenant { .with_context(|| { format!( "Timeline {} was not found for tenant {}", - timeline_id, - self.tenant_id() + timeline_id, self.tenant_id ) }) .map(Arc::clone) @@ -204,98 +209,67 @@ impl Tenant { Ok(new_timeline) } - /// Branch a timeline - pub fn branch_timeline( + /// Create a new timeline. 
+ /// + /// Returns the new timeline ID and reference to its Timeline object. + /// + /// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with + /// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, + /// a new unique ID is generated. + pub async fn create_timeline( &self, - src: TimelineId, - dst: TimelineId, - start_lsn: Option, - ) -> Result> { - // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn - // about timelines, so otherwise a race condition is possible, where we create new timeline and GC - // concurrently removes data that is needed by the new timeline. - let _gc_cs = self.gc_cs.lock().unwrap(); + new_timeline_id: Option, + ancestor_timeline_id: Option, + mut ancestor_start_lsn: Option, + ) -> Result>> { + let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); - // In order for the branch creation task to not wait for GC/compaction, - // we need to make sure that the starting LSN of the child branch is not out of scope midway by - // - // 1. holding the GC lock to prevent overwritting timeline's GC data - // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline - // - // Step 2 is to avoid initializing the new branch using data removed by past GC iterations - // or in-queue GC iterations. - - // XXX: keep the lock to avoid races during timeline creation - let mut timelines = self.timelines.lock().unwrap(); - let src_timeline = timelines - .get(&src) - // message about timeline being remote is one .context up in the stack - .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?; - - let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); - - // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN - let start_lsn = start_lsn.unwrap_or_else(|| { - let lsn = src_timeline.get_last_record_lsn(); - info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}"); - lsn - }); - - // Check if the starting LSN is out of scope because it is less than - // 1. the latest GC cutoff LSN or - // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. - src_timeline - .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) - .context(format!( - "invalid branch start lsn: less than latest GC cutoff {}", - *latest_gc_cutoff_lsn, - ))?; + if self + .conf + .timeline_path(&new_timeline_id, &self.tenant_id) + .exists() { - let gc_info = src_timeline.gc_info.read().unwrap(); - let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); - if start_lsn < cutoff { - bail!(format!( - "invalid branch start lsn: less than planned GC cutoff {cutoff}" - )); - } + debug!("timeline {new_timeline_id} already exists"); + return Ok(None); } - // Determine prev-LSN for the new timeline. We can only determine it if - // the timeline was branched at the current end of the source timeline. 
- let RecordLsn { - last: src_last, - prev: src_prev, - } = src_timeline.get_last_record_rlsn(); - let dst_prev = if src_last == start_lsn { - Some(src_prev) - } else { - None + let loaded_timeline = match ancestor_timeline_id { + Some(ancestor_timeline_id) => { + let ancestor_timeline = self + .get_timeline(ancestor_timeline_id) + .context("Cannot branch off the timeline that's not present in pageserver")?; + + if let Some(lsn) = ancestor_start_lsn.as_mut() { + // Wait for the WAL to arrive and be processed on the parent branch up + // to the requested branch point. The repository code itself doesn't + // require it, but if we start to receive WAL on the new timeline, + // decoding the new WAL might need to look up previous pages, relation + // sizes etc. and that would get confused if the previous page versions + // are not in the repository yet. + *lsn = lsn.align(); + ancestor_timeline.wait_lsn(*lsn).await?; + + let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); + if ancestor_ancestor_lsn > *lsn { + // can we safely just branch from the ancestor instead? + anyhow::bail!( + "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", + lsn, + ancestor_timeline_id, + ancestor_ancestor_lsn, + ); + } + } + + self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? + } + None => self.bootstrap_timeline(new_timeline_id)?, }; - // create a new timeline directory - let timelinedir = self.conf.timeline_path(&dst, &self.tenant_id); - crashsafe_dir::create_dir(&timelinedir)?; + // Have added new timeline into the tenant, now its background tasks are needed. + self.activate(true); - // Create the metadata file, noting the ancestor of the new timeline. - // There is initially no data in it, but all the read-calls know to look - // into the ancestor. - let metadata = TimelineMetadata::new( - start_lsn, - dst_prev, - Some(src), - start_lsn, - *src_timeline.latest_gc_cutoff_lsn.read(), - src_timeline.initdb_lsn, - ); - crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; - save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; - - let new_timeline = self.initialize_new_timeline(dst, metadata, &mut timelines)?; - timelines.insert(dst, Arc::clone(&new_timeline)); - - info!("branched timeline {dst} from {src} at {start_lsn}"); - - Ok(new_timeline) + Ok(Some(loaded_timeline)) } /// perform one garbage collection iteration, removing old data files from disk. @@ -948,9 +922,171 @@ impl Tenant { Ok(totals) } - pub fn tenant_id(&self) -> TenantId { - self.tenant_id + fn branch_timeline( + &self, + src: TimelineId, + dst: TimelineId, + start_lsn: Option, + ) -> Result> { + // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn + // about timelines, so otherwise a race condition is possible, where we create new timeline and GC + // concurrently removes data that is needed by the new timeline. + let _gc_cs = self.gc_cs.lock().unwrap(); + + // In order for the branch creation task to not wait for GC/compaction, + // we need to make sure that the starting LSN of the child branch is not out of scope midway by + // + // 1. holding the GC lock to prevent overwritting timeline's GC data + // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline + // + // Step 2 is to avoid initializing the new branch using data removed by past GC iterations + // or in-queue GC iterations. 
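Concretely, the two checks a little further down in this function reduce to one rule: the requested branch point must be at or above both the GC cutoff that has already been applied and the cutoff that a queued GC iteration is about to apply. A simplified sketch of that rule, with plain integers standing in for LSNs (the names here are illustrative, not the pageserver API):

def validate_branch_start_lsn(
    start_lsn: int,
    latest_gc_cutoff: int,
    planned_horizon_cutoff: int,
    planned_pitr_cutoff: int,
) -> None:
    # Check 1: data below the already-applied GC cutoff may already be gone from disk.
    if start_lsn < latest_gc_cutoff:
        raise ValueError(f"invalid branch start lsn: less than latest GC cutoff {latest_gc_cutoff}")
    # Check 2: an in-queue GC iteration has already planned to cut at min(pitr, horizon).
    planned_cutoff = min(planned_pitr_cutoff, planned_horizon_cutoff)
    if start_lsn < planned_cutoff:
        raise ValueError(f"invalid branch start lsn: less than planned GC cutoff {planned_cutoff}")

Holding gc_cs for the whole branch creation is what keeps both cutoffs stable until the new timeline's metadata is on disk.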
+ + // XXX: keep the lock to avoid races during timeline creation + let mut timelines = self.timelines.lock().unwrap(); + let src_timeline = timelines + .get(&src) + // message about timeline being remote is one .context up in the stack + .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?; + + let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); + + // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN + let start_lsn = start_lsn.unwrap_or_else(|| { + let lsn = src_timeline.get_last_record_lsn(); + info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}"); + lsn + }); + + // Check if the starting LSN is out of scope because it is less than + // 1. the latest GC cutoff LSN or + // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. + src_timeline + .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) + .context(format!( + "invalid branch start lsn: less than latest GC cutoff {}", + *latest_gc_cutoff_lsn, + ))?; + { + let gc_info = src_timeline.gc_info.read().unwrap(); + let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); + if start_lsn < cutoff { + bail!(format!( + "invalid branch start lsn: less than planned GC cutoff {cutoff}" + )); + } + } + + // Determine prev-LSN for the new timeline. We can only determine it if + // the timeline was branched at the current end of the source timeline. + let RecordLsn { + last: src_last, + prev: src_prev, + } = src_timeline.get_last_record_rlsn(); + let dst_prev = if src_last == start_lsn { + Some(src_prev) + } else { + None + }; + + // create a new timeline directory + let timelinedir = self.conf.timeline_path(&dst, &self.tenant_id); + crashsafe_dir::create_dir(&timelinedir)?; + + // Create the metadata file, noting the ancestor of the new timeline. + // There is initially no data in it, but all the read-calls know to look + // into the ancestor. + let metadata = TimelineMetadata::new( + start_lsn, + dst_prev, + Some(src), + start_lsn, + *src_timeline.latest_gc_cutoff_lsn.read(), + src_timeline.initdb_lsn, + ); + crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; + save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; + + let new_timeline = self.initialize_new_timeline(dst, metadata, &mut timelines)?; + timelines.insert(dst, Arc::clone(&new_timeline)); + + info!("branched timeline {dst} from {src} at {start_lsn}"); + + Ok(new_timeline) } + + /// - run initdb to init temporary instance and get bootstrap data + /// - after initialization complete, remove the temp dir. + fn bootstrap_timeline(&self, timeline_id: TimelineId) -> Result> { + // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` + // temporary directory for basebackup files for the given timeline. + let initdb_path = path_with_suffix_extension( + self.conf + .timelines_path(&self.tenant_id) + .join(format!("basebackup-{timeline_id}")), + TEMP_FILE_SUFFIX, + ); + + // Init temporarily repo to get bootstrap data + run_initdb(self.conf, &initdb_path)?; + let pgdata_path = initdb_path; + + let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); + + // Import the contents of the data directory at the initial checkpoint + // LSN, and any WAL after that. + // Initdb lsn will be equal to last_record_lsn which will be set after import. + // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. 
+ let timeline = self.create_empty_timeline(timeline_id, lsn)?; + import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + bail!("failpoint before-checkpoint-new-timeline"); + }); + + timeline.checkpoint(CheckpointConfig::Forced)?; + + info!( + "created root timeline {} timeline.lsn {}", + timeline_id, + timeline.get_last_record_lsn() + ); + + // Remove temp dir. We don't need it anymore + fs::remove_dir_all(pgdata_path)?; + + Ok(timeline) + } +} + +/// Create the cluster temporarily in 'initdbpath' directory inside the repository +/// to get bootstrap data for timeline initialization. +fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { + info!("running initdb in {}... ", initdbpath.display()); + + let initdb_path = conf.pg_bin_dir().join("initdb"); + let initdb_output = Command::new(initdb_path) + .args(&["-D", &initdbpath.to_string_lossy()]) + .args(&["-U", &conf.superuser]) + .args(&["-E", "utf8"]) + .arg("--no-instructions") + // This is only used for a temporary installation that is deleted shortly after, + // so no need to fsync it + .arg("--no-sync") + .env_clear() + .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .stdout(Stdio::null()) + .output() + .context("failed to execute initdb")?; + if !initdb_output.status.success() { + bail!( + "initdb failed: '{}'", + String::from_utf8_lossy(&initdb_output.stderr) + ); + } + + Ok(()) } impl Drop for Tenant { diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs deleted file mode 100644 index 88b26e18f4..0000000000 --- a/pageserver/src/timelines.rs +++ /dev/null @@ -1,168 +0,0 @@ -//! -//! Timeline management code -// - -use std::{ - fs, - path::Path, - process::{Command, Stdio}, - sync::Arc, -}; - -use anyhow::{bail, Context, Result}; -use tracing::*; - -use remote_storage::path_with_suffix_extension; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -use crate::config::PageServerConf; -use crate::tenant::{Tenant, Timeline}; -use crate::tenant_mgr; -use crate::CheckpointConfig; -use crate::{import_datadir, TEMP_FILE_SUFFIX}; - -// Create the cluster temporarily in 'initdbpath' directory inside the repository -// to get bootstrap data for timeline initialization. -// -fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { - info!("running initdb in {}... ", initdbpath.display()); - - let initdb_path = conf.pg_bin_dir().join("initdb"); - let initdb_output = Command::new(initdb_path) - .args(&["-D", &initdbpath.to_string_lossy()]) - .args(&["-U", &conf.superuser]) - .args(&["-E", "utf8"]) - .arg("--no-instructions") - // This is only used for a temporary installation that is deleted shortly after, - // so no need to fsync it - .arg("--no-sync") - .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) - .stdout(Stdio::null()) - .output() - .context("failed to execute initdb")?; - if !initdb_output.status.success() { - bail!( - "initdb failed: '{}'", - String::from_utf8_lossy(&initdb_output.stderr) - ); - } - - Ok(()) -} - -// -// - run initdb to init temporary instance and get bootstrap data -// - after initialization complete, remove the temp dir. 
-// -fn bootstrap_timeline( - conf: &'static PageServerConf, - tenant_id: TenantId, - timeline_id: TimelineId, - tenant: &Tenant, -) -> Result> { - // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` - // temporary directory for basebackup files for the given timeline. - let initdb_path = path_with_suffix_extension( - conf.timelines_path(&tenant_id) - .join(format!("basebackup-{timeline_id}")), - TEMP_FILE_SUFFIX, - ); - - // Init temporarily repo to get bootstrap data - run_initdb(conf, &initdb_path)?; - let pgdata_path = initdb_path; - - let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); - - // Import the contents of the data directory at the initial checkpoint - // LSN, and any WAL after that. - // Initdb lsn will be equal to last_record_lsn which will be set after import. - // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. - let timeline = tenant.create_empty_timeline(timeline_id, lsn)?; - import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; - - fail::fail_point!("before-checkpoint-new-timeline", |_| { - bail!("failpoint before-checkpoint-new-timeline"); - }); - - timeline.checkpoint(CheckpointConfig::Forced)?; - - info!( - "created root timeline {} timeline.lsn {}", - timeline_id, - timeline.get_last_record_lsn() - ); - - // Remove temp dir. We don't need it anymore - fs::remove_dir_all(pgdata_path)?; - - Ok(timeline) -} - -/// -/// Create a new timeline. -/// -/// Returns the new timeline ID and reference to its Timeline object. -/// -/// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with -/// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, -/// a new unique ID is generated. -/// -pub(crate) async fn create_timeline( - conf: &'static PageServerConf, - tenant_id: TenantId, - new_timeline_id: Option, - ancestor_timeline_id: Option, - mut ancestor_start_lsn: Option, -) -> Result>> { - let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); - let tenant = tenant_mgr::get_tenant(tenant_id, true)?; - - if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { - debug!("timeline {new_timeline_id} already exists"); - return Ok(None); - } - - let loaded_timeline = match ancestor_timeline_id { - Some(ancestor_timeline_id) => { - let ancestor_timeline = tenant - .get_timeline(ancestor_timeline_id) - .context("Cannot branch off the timeline that's not present in pageserver")?; - - if let Some(lsn) = ancestor_start_lsn.as_mut() { - // Wait for the WAL to arrive and be processed on the parent branch up - // to the requested branch point. The repository code itself doesn't - // require it, but if we start to receive WAL on the new timeline, - // decoding the new WAL might need to look up previous pages, relation - // sizes etc. and that would get confused if the previous page versions - // are not in the repository yet. - *lsn = lsn.align(); - ancestor_timeline.wait_lsn(*lsn).await?; - - let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); - if ancestor_ancestor_lsn > *lsn { - // can we safely just branch from the ancestor instead? - anyhow::bail!( - "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", - lsn, - ancestor_timeline_id, - ancestor_ancestor_lsn, - ); - } - } - - tenant.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? 
- } - None => bootstrap_timeline(conf, tenant_id, new_timeline_id, &tenant)?, - }; - - // Have added new timeline into the tenant, now its background tasks are needed. - tenant.activate(true); - - Ok(Some(loaded_timeline)) -} From 310c507303d642c97a778f9850b57e1593ba5717 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 2022 07:58:06 +0300 Subject: [PATCH 0803/1022] Merge path retrieval methods in config.rs --- pageserver/src/config.rs | 17 +++++++++++++++++ pageserver/src/storage_sync.rs | 13 ++++--------- pageserver/src/storage_sync/download.rs | 15 +++++++-------- pageserver/src/storage_sync/upload.rs | 5 +++-- pageserver/src/tenant.rs | 9 ++++----- pageserver/src/tenant/metadata.rs | 17 +---------------- pageserver/src/tenant/timeline.rs | 4 ++-- pageserver/src/tenant_config.rs | 11 ----------- pageserver/src/tenant_mgr.rs | 12 +++++------- 9 files changed, 43 insertions(+), 60 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 75c71b09d2..945ee098ea 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -22,6 +22,10 @@ use utils::{ use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant_config::{TenantConf, TenantConfOpt}; +/// The name of the metadata file pageserver creates per timeline. +pub const METADATA_FILE_NAME: &str = "metadata"; +const TENANT_CONFIG_NAME: &str = "config"; + pub mod defaults { use crate::tenant_config::defaults::*; use const_format::formatcp; @@ -346,6 +350,12 @@ impl PageServerConf { self.tenants_path().join(tenant_id.to_string()) } + /// Points to a place in pageserver's local directory, + /// where certain tenant's tenantconf file should be located. + pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf { + self.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME) + } + pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf { self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME) } @@ -354,6 +364,13 @@ impl PageServerConf { self.timelines_path(tenant_id).join(timeline_id.to_string()) } + /// Points to a place in pageserver's local directory, + /// where certain timeline's metadata file should be located. 
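The two helpers gathered into config.rs here (tenant_config_path above and metadata_path just below) simply join well-known names onto the per-tenant directory tree, so all path construction lives in one place. For orientation, the resulting on-disk layout is roughly the following; this is a pathlib sketch in which the workdir and the IDs are placeholders, while the "tenants", "timelines", "config" and "metadata" names come from the constants in this patch and from the test suite:

from pathlib import Path

workdir = Path("/data/pageserver")  # placeholder for the pageserver workdir
tenant_id = "0123456789abcdef0123456789abcdef"  # placeholder hex IDs
timeline_id = "fedcba9876543210fedcba9876543210"

tenant_path = workdir / "tenants" / tenant_id
tenant_config_path = tenant_path / "config"  # TENANT_CONFIG_NAME
timeline_path = tenant_path / "timelines" / timeline_id
metadata_path = timeline_path / "metadata"  # METADATA_FILE_NAME
print(metadata_path)  # /data/pageserver/tenants/0123.../timelines/fedc.../metadata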
+ pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf { + self.timeline_path(&timeline_id, &tenant_id) + .join(METADATA_FILE_NAME) + } + // // Postgres distribution paths // diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 64e0f9a9e3..489d0ad4ed 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -169,13 +169,8 @@ use self::{ upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, }; use crate::{ - config::PageServerConf, - exponential_backoff, - storage_sync::index::RemoteIndex, - task_mgr, - task_mgr::TaskKind, - task_mgr::BACKGROUND_RUNTIME, - tenant::metadata::{metadata_path, TimelineMetadata}, + config::PageServerConf, exponential_backoff, storage_sync::index::RemoteIndex, task_mgr, + task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, tenant::metadata::TimelineMetadata, tenant_mgr::attach_local_tenants, }; use crate::{ @@ -1012,7 +1007,7 @@ async fn update_local_metadata( }; let remote_lsn = remote_metadata.disk_consistent_lsn(); - let local_metadata_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id); + let local_metadata_path = conf.metadata_path(sync_id.timeline_id, sync_id.tenant_id); let local_lsn = if local_metadata_path.exists() { let local_metadata = read_metadata_file(&local_metadata_path) .await @@ -1433,7 +1428,7 @@ mod test_utils { } fs::write( - metadata_path(harness.conf, timeline_id, harness.tenant_id), + harness.conf.metadata_path(timeline_id, harness.tenant_id), metadata.to_bytes()?, ) .await?; diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 80d5ca5994..980001f95d 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -16,10 +16,7 @@ use tokio::{ }; use tracing::{debug, error, info, warn}; -use crate::{ - config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path, - TEMP_FILE_SUFFIX, -}; +use crate::{config::PageServerConf, storage_sync::SyncTask, TEMP_FILE_SUFFIX}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use super::{ @@ -137,7 +134,8 @@ async fn download_index_part( storage: &GenericRemoteStorage, sync_id: TenantTimelineId, ) -> Result { - let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) + let index_part_path = conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); let mut index_part_download = storage .download_storage_object(None, &index_part_path) @@ -620,9 +618,10 @@ mod tests { metadata.to_bytes()?, ); - let local_index_part_path = - metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) - .with_file_name(IndexPart::FILE_NAME); + let local_index_part_path = harness + .conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) + .with_file_name(IndexPart::FILE_NAME); let index_part_remote_id = local_storage.remote_object_id(&local_index_part_path)?; let index_part_local_path = PathBuf::from(index_part_remote_id.to_string()); fs::create_dir_all(index_part_local_path.parent().unwrap()).await?; diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index aa5a2232cf..75657915c0 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -15,7 +15,7 @@ use super::{ LayersUpload, SyncData, SyncQueue, }; use crate::metrics::NO_LAYERS_UPLOAD; -use crate::{config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path}; +use 
crate::{config::PageServerConf, storage_sync::SyncTask}; /// Serializes and uploads the given index part data to the remote storage. pub(super) async fn upload_index_part( @@ -29,7 +29,8 @@ pub(super) async fn upload_index_part( let index_part_size = index_part_bytes.len(); let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes)); - let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) + let index_part_path = conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); storage .upload_storage_object( diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cf236a0a9c..b753c1979c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -41,7 +41,7 @@ use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; use crate::repository::GcResult; use crate::storage_sync::index::RemoteIndex; use crate::task_mgr; -use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::tenant_config::TenantConfOpt; use crate::virtual_file::VirtualFile; use crate::walredo::WalRedoManager; use crate::{CheckpointConfig, TEMP_FILE_SUFFIX}; @@ -676,7 +676,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_id: TenantId, ) -> anyhow::Result { - let target_config_path = TenantConf::path(conf, tenant_id); + let target_config_path = conf.tenant_config_path(tenant_id); let target_config_display = target_config_path.display(); info!("loading tenantconf from {target_config_display}"); @@ -1134,7 +1134,6 @@ pub mod harness { walredo::{WalRedoError, WalRedoManager}, }; - use super::metadata::metadata_path; use super::*; use crate::tenant_config::{TenantConf, TenantConfOpt}; use hex_literal::hex; @@ -1270,7 +1269,7 @@ pub mod harness { timeline_id: TimelineId, tenant_id: TenantId, ) -> anyhow::Result { - let metadata_path = metadata_path(conf, timeline_id, tenant_id); + let metadata_path = conf.metadata_path(timeline_id, tenant_id); let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { format!( "Failed to read metadata bytes from path {}", @@ -1316,8 +1315,8 @@ pub mod harness { #[cfg(test)] mod tests { - use super::metadata::METADATA_FILE_NAME; use super::*; + use crate::config::METADATA_FILE_NAME; use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; use crate::tenant::harness::*; diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index ace4dc91e9..606acbf2f1 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -8,7 +8,6 @@ use std::fs::{File, OpenOptions}; use std::io::Write; -use std::path::PathBuf; use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; @@ -29,9 +28,6 @@ use crate::STORAGE_FORMAT_VERSION; /// see PG_CONTROL_MAX_SAFE_SIZE const METADATA_MAX_SIZE: usize = 512; -/// The name of the metadata file pageserver creates per timeline. -pub const METADATA_FILE_NAME: &str = "metadata"; - /// Metadata stored on disk for each timeline /// /// The fields correspond to the values we hold in memory, in Timeline. @@ -166,17 +162,6 @@ impl TimelineMetadata { } } -/// Points to a place in pageserver's local directory, -/// where certain timeline's metadata file should be located. 
-pub fn metadata_path( - conf: &'static PageServerConf, - timeline_id: TimelineId, - tenant_id: TenantId, -) -> PathBuf { - conf.timeline_path(&timeline_id, &tenant_id) - .join(METADATA_FILE_NAME) -} - /// Save timeline metadata to file pub fn save_metadata( conf: &'static PageServerConf, @@ -186,7 +171,7 @@ pub fn save_metadata( first_save: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving metadata").entered(); - let path = metadata_path(conf, timeline_id, tenant_id); + let path = conf.metadata_path(timeline_id, tenant_id); // use OpenOptions to ensure file presence is consistent with first_save let mut file = VirtualFile::open_with_options( &path, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8670e979ee..b80d023c7f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -24,12 +24,12 @@ use crate::tenant::{ image_layer::{ImageLayer, ImageLayerWriter}, inmemory_layer::InMemoryLayer, layer_map::{LayerMap, SearchResult}, - metadata::{save_metadata, TimelineMetadata, METADATA_FILE_NAME}, + metadata::{save_metadata, TimelineMetadata}, par_fsync, storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}, }; -use crate::config::PageServerConf; +use crate::config::{PageServerConf, METADATA_FILE_NAME}; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::BlockNumber; diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 4448ffc456..4c5d5cc3f3 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -8,14 +8,9 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! -use crate::config::PageServerConf; use serde::{Deserialize, Serialize}; use std::num::NonZeroU64; -use std::path::PathBuf; use std::time::Duration; -use utils::id::TenantId; - -pub const TENANT_CONFIG_NAME: &str = "config"; pub mod defaults { // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB @@ -215,12 +210,6 @@ impl TenantConf { } } - /// Points to a place in pageserver's local directory, - /// where certain tenant's tenantconf file should be located. 
- pub fn path(conf: &'static PageServerConf, tenant_id: TenantId) -> PathBuf { - conf.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME) - } - #[cfg(test)] pub fn dummy_conf() -> Self { TenantConf { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index d6fa843305..2c6f5fa863 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -12,17 +12,15 @@ use tracing::*; use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; -use crate::config::PageServerConf; +use crate::config::{PageServerConf, METADATA_FILE_NAME}; use crate::http::models::TenantInfo; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::{ - ephemeral_file::is_ephemeral_file, - metadata::{TimelineMetadata, METADATA_FILE_NAME}, - Tenant, TenantState, + ephemeral_file::is_ephemeral_file, metadata::TimelineMetadata, Tenant, TenantState, }; -use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::tenant_config::TenantConfOpt; use crate::walredo::PostgresRedoManager; use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; @@ -246,7 +244,7 @@ fn create_tenant_files( &temporary_tenant_dir, )?; let temporary_tenant_config_path = rebase_directory( - &TenantConf::path(conf, tenant_id), + &conf.tenant_config_path(tenant_id), &target_tenant_directory, &temporary_tenant_dir, )?; @@ -343,7 +341,7 @@ pub fn update_tenant_config( ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf); - Tenant::persist_tenant_config(&TenantConf::path(conf, tenant_id), tenant_conf, false)?; + Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?; Ok(()) } From 6b8dcad1bbc02b0f045c0ee192629ef129dd5755 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 2022 08:13:25 +0300 Subject: [PATCH 0804/1022] Unify timeline creation steps --- pageserver/src/tenant.rs | 73 ++++++++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b753c1979c..40c9f1e9ad 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -185,27 +185,12 @@ impl Tenant { bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.") } - // Create the timeline directory, and write initial metadata to file. 
- crashsafe_dir::create_dir_all(timeline_path)?; - let new_metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - save_metadata( - self.conf, - new_timeline_id, - self.tenant_id, - &new_metadata, - true, - )?; - let new_timeline = - self.initialize_new_timeline(new_timeline_id, new_metadata, &mut timelines)?; + self.create_initialized_timeline(new_timeline_id, new_metadata, &mut timelines)?; new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); - if let hash_map::Entry::Vacant(v) = timelines.entry(new_timeline_id) { - v.insert(Arc::clone(&new_timeline)); - } - Ok(new_timeline) } @@ -1004,12 +989,7 @@ impl Tenant { *src_timeline.latest_gc_cutoff_lsn.read(), src_timeline.initdb_lsn, ); - crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; - save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; - - let new_timeline = self.initialize_new_timeline(dst, metadata, &mut timelines)?; - timelines.insert(dst, Arc::clone(&new_timeline)); - + let new_timeline = self.create_initialized_timeline(dst, metadata, &mut timelines)?; info!("branched timeline {dst} from {src} at {start_lsn}"); Ok(new_timeline) @@ -1057,6 +1037,55 @@ impl Tenant { Ok(timeline) } + + fn create_initialized_timeline( + &self, + new_timeline_id: TimelineId, + new_metadata: TimelineMetadata, + timelines: &mut MutexGuard>>, + ) -> Result> { + crashsafe_dir::create_dir_all(self.conf.timeline_path(&new_timeline_id, &self.tenant_id)) + .with_context(|| { + format!( + "Failed to create timeline {}/{} directory", + new_timeline_id, self.tenant_id + ) + })?; + save_metadata( + self.conf, + new_timeline_id, + self.tenant_id, + &new_metadata, + true, + ) + .with_context(|| { + format!( + "Failed to create timeline {}/{} metadata", + new_timeline_id, self.tenant_id + ) + })?; + + let new_timeline = self + .initialize_new_timeline(new_timeline_id, new_metadata, timelines) + .with_context(|| { + format!( + "Failed to initialize timeline {}/{}", + new_timeline_id, self.tenant_id + ) + })?; + + match timelines.entry(new_timeline_id) { + hash_map::Entry::Occupied(_) => anyhow::bail!( + "Found freshly initialized timeline {} in the tenant map", + new_timeline_id + ), + hash_map::Entry::Vacant(v) => { + v.insert(Arc::clone(&new_timeline)); + } + } + + Ok(new_timeline) + } } /// Create the cluster temporarily in 'initdbpath' directory inside the repository From 8d7024a8c26d9f143202d28665ec2ae8a8e32ea1 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 2022 08:24:18 +0300 Subject: [PATCH 0805/1022] Move path manipulation function to utils --- Cargo.lock | 7 +-- libs/remote_storage/Cargo.toml | 1 + libs/remote_storage/src/lib.rs | 47 ------------------ libs/remote_storage/src/local_fs.rs | 3 +- libs/utils/src/crashsafe_dir.rs | 49 ++++++++++++++++++- pageserver/src/storage_sync/download.rs | 7 ++- pageserver/src/tenant.rs | 6 +-- pageserver/src/tenant_mgr.rs | 4 +- .../src/walreceiver/connection_manager.rs | 2 +- pageserver/src/walredo.rs | 2 +- workspace_hack/Cargo.toml | 6 --- 11 files changed, 62 insertions(+), 72 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3ce0ce465f..fc4ef90b8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2444,6 +2444,7 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "utils", "workspace_hack", ] @@ -3929,13 +3930,7 @@ dependencies = [ "chrono", "either", "fail", - "futures-channel", - "futures-task", - "futures-util", - "generic-array", "hashbrown", - "hex", - "hyper", "indexmap", "itoa 0.4.8", "libc", 
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index b3485f274a..cec344a4ad 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" anyhow = { version = "1.0", features = ["backtrace"] } async-trait = "0.1" metrics = { version = "0.1", path = "../metrics" } +utils = { version = "0.1", path = "../utils" } once_cell = "1.13.0" rusoto_core = "0.48" rusoto_s3 = "0.48" diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 6b3fd29a0e..4bdd2b9608 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -9,9 +9,7 @@ mod local_fs; mod s3_bucket; use std::{ - borrow::Cow, collections::HashMap, - ffi::OsStr, fmt::{Debug, Display}, num::{NonZeroU32, NonZeroUsize}, ops::Deref, @@ -344,22 +342,6 @@ impl Debug for S3Config { } } -/// Adds a suffix to the file(directory) name, either appending the suffux to the end of its extension, -/// or if there's no extension, creates one and puts a suffix there. -pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { - let new_extension = match original_path - .as_ref() - .extension() - .map(OsStr::to_string_lossy) - { - Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), - None => Cow::Borrowed(suffix), - }; - original_path - .as_ref() - .with_extension(new_extension.as_ref()) -} - impl RemoteStorageConfig { pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { let local_path = toml.get("local_path"); @@ -448,35 +430,6 @@ fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result { mod tests { use super::*; - #[test] - fn test_path_with_suffix_extension() { - let p = PathBuf::from("/foo/bar"); - assert_eq!( - &path_with_suffix_extension(&p, "temp").to_string_lossy(), - "/foo/bar.temp" - ); - let p = PathBuf::from("/foo/bar"); - assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), - "/foo/bar.temp.temp" - ); - let p = PathBuf::from("/foo/bar.baz"); - assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), - "/foo/bar.baz.temp.temp" - ); - let p = PathBuf::from("/foo/bar.baz"); - assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), - "/foo/bar.baz..temp" - ); - let p = PathBuf::from("/foo/bar/dir/"); - assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), - "/foo/bar/dir..temp" - ); - } - #[test] fn object_name() { let k = RemoteObjectId("a/b/c".to_owned()); diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 3ffbf3cb39..5723a512f6 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -16,8 +16,9 @@ use tokio::{ io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; use tracing::*; +use utils::crashsafe_dir::path_with_suffix_extension; -use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectId}; +use crate::{Download, DownloadError, RemoteObjectId}; use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; diff --git a/libs/utils/src/crashsafe_dir.rs b/libs/utils/src/crashsafe_dir.rs index a7eab73a43..032ab0a916 100644 --- a/libs/utils/src/crashsafe_dir.rs +++ b/libs/utils/src/crashsafe_dir.rs @@ -1,7 +1,9 @@ use std::{ + borrow::Cow, + ffi::OsStr, fs::{self, File}, io, - path::Path, + path::{Path, PathBuf}, }; /// Similar to [`std::fs::create_dir`], except we fsync the @@ -74,6 +76,22 @@ pub fn create_dir_all(path: impl AsRef) -> io::Result<()> { Ok(()) } +/// Adds a 
suffix to the file(directory) name, either appending the suffix to the end of its extension, +/// or if there's no extension, creates one and puts a suffix there. +pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { + let new_extension = match original_path + .as_ref() + .extension() + .map(OsStr::to_string_lossy) + { + Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), + None => Cow::Borrowed(suffix), + }; + original_path + .as_ref() + .with_extension(new_extension.as_ref()) +} + #[cfg(test)] mod tests { use tempfile::tempdir; @@ -122,4 +140,33 @@ mod tests { let invalid_dir_path = file_path.join("folder"); create_dir_all(&invalid_dir_path).unwrap_err(); } + + #[test] + fn test_path_with_suffix_extension() { + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp").to_string_lossy(), + "/foo/bar.temp" + ); + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.baz.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar.baz..temp" + ); + let p = PathBuf::from("/foo/bar/dir/"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar/dir..temp" + ); + } } diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 980001f95d..3e850443d8 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -9,7 +9,7 @@ use std::{ use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use remote_storage::{path_with_suffix_extension, DownloadError, GenericRemoteStorage}; +use remote_storage::{DownloadError, GenericRemoteStorage}; use tokio::{ fs, io::{self, AsyncWriteExt}, @@ -17,7 +17,10 @@ use tokio::{ use tracing::{debug, error, info, warn}; use crate::{config::PageServerConf, storage_sync::SyncTask, TEMP_FILE_SUFFIX}; -use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::{ + crashsafe_dir::path_with_suffix_extension, + id::{TenantId, TenantTimelineId, TimelineId}, +}; use super::{ index::{IndexPart, RemoteTimeline}, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 40c9f1e9ad..ca97796870 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -14,6 +14,7 @@ use anyhow::{bail, ensure, Context, Result}; use tokio::sync::watch; use tracing::*; +use utils::crashsafe_dir::path_with_suffix_extension; use std::cmp::min; use std::collections::hash_map; @@ -45,7 +46,6 @@ use crate::tenant_config::TenantConfOpt; use crate::virtual_file::VirtualFile; use crate::walredo::WalRedoManager; use crate::{CheckpointConfig, TEMP_FILE_SUFFIX}; -use remote_storage::path_with_suffix_extension; use toml_edit; use utils::{ @@ -974,10 +974,6 @@ impl Tenant { None }; - // create a new timeline directory - let timelinedir = self.conf.timeline_path(&dst, &self.tenant_id); - crashsafe_dir::create_dir(&timelinedir)?; - // Create the metadata file, noting the ancestor of the new timeline. // There is initially no data in it, but all the read-calls know to look // into the ancestor. 
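[Editor's aside on the helper relocated in this patch: the sketch below shows the shape of a typical pageserver call site after the move. Only `path_with_suffix_extension` and its new `utils::crashsafe_dir` home come from the diff; the wrapper function and the suffix constant are illustrative stand-ins for the real `TEMP_FILE_SUFFIX` plumbing.]

```rust
use std::path::{Path, PathBuf};
use utils::crashsafe_dir::path_with_suffix_extension;

// Illustrative stand-in for the pageserver's TEMP_FILE_SUFFIX constant.
const TEMP_SUFFIX: &str = "temp";

/// Build the temporary path a file is written to before being renamed into
/// place, e.g. ".../metadata" -> ".../metadata.temp" (matches the unit tests
/// moved into crashsafe_dir.rs above).
fn temp_download_path(final_path: &Path) -> PathBuf {
    path_with_suffix_extension(final_path, TEMP_SUFFIX)
}
```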
diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 2c6f5fa863..fcb2c18b79 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -10,7 +10,7 @@ use std::sync::Arc; use anyhow::Context; use tracing::*; -use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; +use remote_storage::GenericRemoteStorage; use crate::config::{PageServerConf, METADATA_FILE_NAME}; use crate::http::models::TenantInfo; @@ -24,7 +24,7 @@ use crate::tenant_config::TenantConfOpt; use crate::walredo::PostgresRedoManager; use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; -use utils::crashsafe_dir; +use utils::crashsafe_dir::{self, path_with_suffix_extension}; use utils::id::{TenantId, TimelineId}; mod tenants_state { diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 799062e935..148372c9d0 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -1358,7 +1358,7 @@ mod tests { const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr"; - fn dummy_state(harness: &TenantHarness) -> WalreceiverState { + fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 9faabfebda..79c2edc96e 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -21,7 +21,6 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; -use remote_storage::path_with_suffix_extension; use serde::Serialize; use std::fs; use std::fs::OpenOptions; @@ -36,6 +35,7 @@ use std::sync::Mutex; use std::time::Duration; use std::time::Instant; use tracing::*; +use utils::crashsafe_dir::path_with_suffix_extension; use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; use crate::metrics::{ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 96594bbf96..dc4cbb5284 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -21,13 +21,7 @@ bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } either = { version = "1", features = ["use_std"] } fail = { version = "0.5", default-features = false, features = ["failpoints"] } -futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } -futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] } -futures-util = { version = "0.3", default-features = false, features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", "futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] } -generic-array = { version = "0.14", default-features = false, features = ["more_lengths"] } hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } -hex = { version = "0.4", features = ["alloc", "serde", "std"] } -hyper = { version = "0.14", features = ["client", "full", "h2", "http1", "http2", "runtime", "server", "socket2", "stream", "tcp"] } indexmap = { version = "1", default-features = false, features = ["std"] } itoa = { version = "0.4", features = ["i128", "std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } From 6f949e15563280cc791b02940c711a5641813891 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Tue, 20 Sep 2022 17:02:10 
-0700 Subject: [PATCH 0806/1022] Improve pageserver/safekeepeer HTTP API errors (#2461) Part of the general work on improving pageserver logs. Brief summary of changes: * Remove `ApiError::from_err` * Remove `impl From for ApiError` * Convert `ApiError::{BadRequest, NotFound}` to use `anyhow::Error` * Note: `NotFound` has more verbose formatting because it's more likely to have useful information for the receiving "user" * Explicitly convert from `tokio::task::JoinError`s into `InternalServerError`s where appropriate Also note: many of the places where errors were implicitly converted to 500s have now been updated to return a more appropriate error. Some places where it's not yet possible to distinguish the error types have been left as 500s. --- libs/utils/src/http/error.rs | 17 +-- libs/utils/src/http/json.rs | 13 +- libs/utils/src/http/request.rs | 13 +- pageserver/src/http/routes.rs | 220 +++++++++++++++++++++------------ safekeeper/src/http/routes.rs | 39 ++++-- 5 files changed, 195 insertions(+), 107 deletions(-) diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index b3bbec0f1c..b0ecb746d9 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -1,12 +1,11 @@ -use anyhow::anyhow; use hyper::{header, Body, Response, StatusCode}; use serde::{Deserialize, Serialize}; use thiserror::Error; #[derive(Debug, Error)] pub enum ApiError { - #[error("Bad request: {0}")] - BadRequest(String), + #[error("Bad request: {0:#?}")] + BadRequest(anyhow::Error), #[error("Forbidden: {0}")] Forbidden(String), @@ -15,24 +14,20 @@ pub enum ApiError { Unauthorized(String), #[error("NotFound: {0}")] - NotFound(String), + NotFound(anyhow::Error), #[error("Conflict: {0}")] Conflict(String), #[error(transparent)] - InternalServerError(#[from] anyhow::Error), + InternalServerError(anyhow::Error), } impl ApiError { - pub fn from_err>(err: E) -> Self { - Self::InternalServerError(anyhow!(err)) - } - pub fn into_response(self) -> Response { match self { - ApiError::BadRequest(_) => HttpErrorBody::response_from_msg_and_status( - self.to_string(), + ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( + format!("{err:#?}"), // use debug printing so that we give the cause StatusCode::BAD_REQUEST, ), ApiError::Forbidden(_) => { diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs index 08f2ac4205..8981fdd1dd 100644 --- a/libs/utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -1,3 +1,4 @@ +use anyhow::Context; use bytes::Buf; use hyper::{header, Body, Request, Response, StatusCode}; use serde::{Deserialize, Serialize}; @@ -9,20 +10,24 @@ pub async fn json_request Deserialize<'de>>( ) -> Result { let whole_body = hyper::body::aggregate(request.body_mut()) .await - .map_err(ApiError::from_err)?; + .context("Failed to read request body") + .map_err(ApiError::BadRequest)?; serde_json::from_reader(whole_body.reader()) - .map_err(|err| ApiError::BadRequest(format!("Failed to parse json request {}", err))) + .context("Failed to parse json request") + .map_err(ApiError::BadRequest) } pub fn json_response( status: StatusCode, data: T, ) -> Result, ApiError> { - let json = serde_json::to_string(&data).map_err(ApiError::from_err)?; + let json = serde_json::to_string(&data) + .context("Failed to serialize JSON response") + .map_err(ApiError::InternalServerError)?; let response = Response::builder() .status(status) .header(header::CONTENT_TYPE, "application/json") .body(Body::from(json)) - .map_err(ApiError::from_err)?; + .map_err(|e| 
ApiError::InternalServerError(e.into()))?; Ok(response) } diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 4984d695fd..7b96ccd584 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -1,6 +1,7 @@ use std::str::FromStr; use super::error::ApiError; +use anyhow::anyhow; use hyper::{body::HttpBody, Body, Request}; use routerify::ext::RequestExt; @@ -10,9 +11,8 @@ pub fn get_request_param<'a>( ) -> Result<&'a str, ApiError> { match request.param(param_name) { Some(arg) => Ok(arg), - None => Err(ApiError::BadRequest(format!( - "no {} specified in path param", - param_name + None => Err(ApiError::BadRequest(anyhow!( + "no {param_name} specified in path param", ))), } } @@ -23,16 +23,15 @@ pub fn parse_request_param( ) -> Result { match get_request_param(request, param_name)?.parse() { Ok(v) => Ok(v), - Err(_) => Err(ApiError::BadRequest(format!( - "failed to parse {}", - param_name + Err(_) => Err(ApiError::BadRequest(anyhow!( + "failed to parse {param_name}", ))), } } pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> { match request.body_mut().data().await { - Some(_) => Err(ApiError::BadRequest("Unexpected request body".into())), + Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))), None => Ok(()), } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0c6f7927fa..c676dfacd2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,9 +1,10 @@ use std::sync::Arc; -use anyhow::{Context, Result}; +use anyhow::{anyhow, Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use remote_storage::GenericRemoteStorage; +use tokio::task::JoinError; use tracing::*; use super::models::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; @@ -166,7 +167,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result { // Created. Construct a TimelineInfo for it. 
- let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)?; + let local_info = local_timeline_info_from_timeline(&new_timeline, false, false) + .map_err(ApiError::InternalServerError)?; Ok(Some(TimelineInfo { tenant_id, timeline_id: new_timeline.timeline_id, @@ -184,12 +186,11 @@ async fn timeline_create_handler(mut request: Request) -> Result Ok(None), // timeline already exists - Err(err) => Err(err), + Err(err) => Err(ApiError::InternalServerError(err)), } } .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn)) - .await - .map_err(ApiError::from_err)?; + .await?; Ok(match new_timeline_info { Some(info) => json_response(StatusCode::CREATED, info)?, @@ -207,10 +208,11 @@ async fn timeline_list_handler(request: Request) -> Result, let timelines = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); - Ok::<_, anyhow::Error>(tenant_mgr::get_tenant(tenant_id, true)?.list_timelines()) + let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?; + Ok(tenant.list_timelines()) }) .await - .map_err(ApiError::from_err)??; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; let mut response_data = Vec::with_capacity(timelines.len()); for (timeline_id, timeline) in timelines { @@ -275,7 +277,7 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result((local_timeline_info, remote_timeline_info)) + Ok::<_, ApiError>((local_timeline_info, remote_timeline_info)) } .instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id)) .await?; if local_timeline_info.is_none() && remote_timeline_info.is_none() { - Err(ApiError::NotFound(format!( + Err(ApiError::NotFound(anyhow!( "Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely" ))) } else { @@ -332,14 +334,14 @@ async fn tenant_attach_handler(request: Request) -> Result, info!("Handling tenant attach {tenant_id}"); - tokio::task::spawn_blocking(move || { - if tenant_mgr::get_tenant(tenant_id, false).is_ok() { - anyhow::bail!("Tenant is already present locally") - }; - Ok(()) + tokio::task::spawn_blocking(move || match tenant_mgr::get_tenant(tenant_id, false) { + Ok(_) => Err(ApiError::Conflict( + "Tenant is already present locally".to_owned(), + )), + Err(_) => Ok(()), }) .await - .map_err(ApiError::from_err)??; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; let state = get_state(&request); let remote_index = &state.remote_index; @@ -364,12 +366,12 @@ async fn tenant_attach_handler(request: Request) -> Result, // download index parts for every tenant timeline let remote_timelines = match gather_tenant_timelines_index_parts(state, tenant_id).await { Ok(Some(remote_timelines)) => remote_timelines, - Ok(None) => return Err(ApiError::NotFound("Unknown remote tenant".to_string())), + Ok(None) => return Err(ApiError::NotFound(anyhow!("Unknown remote tenant"))), Err(e) => { error!("Failed to retrieve remote tenant data: {:?}", e); - return Err(ApiError::NotFound( - "Failed to retrieve remote tenant".to_string(), - )); + return Err(ApiError::NotFound(anyhow!( + "Failed to retrieve remote tenant" + ))); } }; @@ -392,7 +394,8 @@ async fn tenant_attach_handler(request: Request) -> Result, for (timeline_id, mut remote_timeline) in remote_timelines { tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id)) .await - .context("Failed to create new 
timeline directory")?; + .context("Failed to create new timeline directory") + .map_err(ApiError::InternalServerError)?; remote_timeline.awaits_download = true; tenant_entry.insert(timeline_id, remote_timeline); @@ -438,7 +441,10 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, tenant_mgr::detach_tenant(conf, tenant_id) .instrument(info_span!("tenant_detach", tenant = %tenant_id)) .await - .map_err(ApiError::from_err)?; + // FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors. + // Replace this with better handling once the error type permits it. + .map_err(ApiError::InternalServerError)?; let mut remote_index = state.remote_index.write().await; remote_index.remove_tenant_entry(&tenant_id); @@ -478,7 +486,7 @@ async fn tenant_list_handler(request: Request) -> Result, A crate::tenant_mgr::list_tenant_info(&remote_index) }) .await - .map_err(ApiError::from_err)?; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?; json_response(StatusCode::OK, response_data) } @@ -490,7 +498,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro // if tenant is in progress of downloading it can be absent in global tenant map let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false)) .await - .map_err(ApiError::from_err)?; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?; let state = get_state(&request); let remote_index = &state.remote_index; @@ -519,7 +527,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro let current_physical_size = match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false)) .await - .map_err(ApiError::from_err)? + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))? { Err(err) => { // Getting local timelines can fail when no local tenant directory is on disk (e.g, when tenant data is being downloaded). 
@@ -545,6 +553,16 @@ async fn tenant_status(request: Request) -> Result, ApiErro ) } +// Helper function to standardize the error messages we produce on bad durations +// +// Intended to be used with anyhow's `with_context`, e.g.: +// +// let value = result.with_context(bad_duration("name", &value))?; +// +fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String { + move || format!("Cannot parse `{field_name}` duration {value:?}") +} + async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -553,25 +571,39 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result) -> Result json_response(StatusCode::CREATED, TenantCreateResponse(id))?, @@ -618,24 +659,38 @@ async fn tenant_config_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result) -> Result, ApiError> { if !fail::has_failpoints() { - return Err(ApiError::BadRequest( + return Err(ApiError::BadRequest(anyhow!( "Cannot manage failpoints because pageserver was compiled without failpoints support" - .to_owned(), - )); + ))); } let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; @@ -691,7 +754,7 @@ async fn failpoints_handler(mut request: Request) -> Result }; if let Err(err_msg) = cfg_result { - return Err(ApiError::BadRequest(format!( + return Err(ApiError::BadRequest(anyhow!( "Failed to configure failpoints: {err_msg}" ))); } @@ -713,7 +776,7 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result {{ #[cfg(not(feature = "testing"))] async fn cfg_disabled(_req: Request) -> Result, ApiError> { - Err(ApiError::BadRequest( - concat!( - "Cannot ", - $handler_desc, - " because pageserver was compiled without testing APIs", - ) - .to_owned(), - )) + Err(ApiError::BadRequest(anyhow!(concat!( + "Cannot ", + $handler_desc, + " because pageserver was compiled without testing APIs", + )))) } #[cfg(feature = "testing")] diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 244325368b..43c0a17f84 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,12 +1,14 @@ use anyhow::anyhow; use hyper::{Body, Request, Response, StatusCode, Uri}; +use anyhow::Context; use once_cell::sync::Lazy; use serde::Serialize; use serde::Serializer; use std::collections::{HashMap, HashSet}; use std::fmt::Display; use std::sync::Arc; +use tokio::task::JoinError; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; @@ -99,7 +101,12 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result Date: Wed, 21 Sep 2022 13:13:11 +0300 Subject: [PATCH 0807/1022] Use prebuilt image with Hakari for CI style checks (#2488) --- .github/workflows/codestyle.yml | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 5220258ef0..641943199e 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -108,20 +108,32 @@ jobs: target key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust - # https://github.com/facebookincubator/cargo-guppy/tree/main/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check every project module is covered by Hakari - run: | - cargo install cargo-hakari - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack 
- shell: bash -euxo pipefail {0} - - name: Run cargo clippy run: ./run_clippy.sh - name: Ensure all project builds run: cargo build --locked --all --all-targets + check-rust-dependencies: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + fetch-depth: 1 + + # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check every project module is covered by Hakari + run: | + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + shell: bash -euxo pipefail {0} + check-codestyle-python: runs-on: [ self-hosted, Linux, k8s-runner ] steps: From b82e2e3f18cbeb08c45074015cbe4606d36c51c5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 21 Sep 2022 11:08:12 +0300 Subject: [PATCH 0808/1022] Bump postgres submodules and update docs/core_changes.md. The old change to downgrade a WARNING in postgres vacuumlazy.c was reverted. --- docs/core_changes.md | 25 ------------------------- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 3 files changed, 2 insertions(+), 27 deletions(-) diff --git a/docs/core_changes.md b/docs/core_changes.md index 8f29dd9121..ea219adae9 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -148,31 +148,6 @@ relcache? (I think we do cache nblocks in relcache already, check why that's not Neon) -## Misc change in vacuumlazy.c - -``` -index 8aab6e324e..c684c4fbee 100644 ---- a/src/backend/access/heap/vacuumlazy.c -+++ b/src/backend/access/heap/vacuumlazy.c -@@ -1487,7 +1487,10 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) - else if (all_visible_according_to_vm && !PageIsAllVisible(page) - && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) - { -- elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", -+ /* ZENITH-XXX: all visible hint is not wal-logged -+ * FIXME: Replay visibilitymap changes in pageserver -+ */ -+ elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - vacrel->relname, blkno); - visibilitymap_clear(vacrel->rel, blkno, vmbuffer, - VISIBILITYMAP_VALID_BITS); -``` - - -Is this still needed? If that WARNING happens, it looks like potential corruption that we should -fix! 
- - ## Use buffer manager when extending VM or FSM ``` diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 796770565f..19d948fd47 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 796770565ff668b585e80733b8d679961ad50e93 +Subproject commit 19d948fd47f45d83367062d9a54709cf2d9c8921 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 34c47d6c99..5b8b3eeef5 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 34c47d6c99415c94296d5e599ec5590d0001d6c2 +Subproject commit 5b8b3eeef5ec34c0cad9377833906a1387841d04 From 19fa410ff84ad41ce39fcbdedf1e8e7c158ef1b4 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 21 Sep 2022 12:50:37 +0100 Subject: [PATCH 0809/1022] NeonCompare: switch to new pageserver HTTP API --- test_runner/fixtures/compare_fixtures.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index ceeeffc785..78a12c6c45 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -89,16 +89,13 @@ class NeonCompare(PgCompare): self.env = neon_simple_env self._zenbenchmark = zenbenchmark self._pg_bin = pg_bin + self.pageserver_http_client = self.env.pageserver.http_client() # We only use one branch and one timeline self.env.neon_cli.create_branch(branch_name, "empty") self._pg = self.env.postgres.create_start(branch_name) self.timeline = self.pg.safe_psql("SHOW neon.timeline_id")[0][0] - # Long-lived cursor, useful for flushing - self.psconn = self.env.pageserver.connect() - self.pscur = self.psconn.cursor() - @property def pg(self): return self._pg @@ -112,10 +109,10 @@ class NeonCompare(PgCompare): return self._pg_bin def flush(self): - self.pscur.execute(f"do_gc {self.env.initial_tenant} {self.timeline} 0") + self.pageserver_http_client.timeline_gc(self.env.initial_tenant, self.timeline, 0) def compact(self): - self.pscur.execute(f"compact {self.env.initial_tenant} {self.timeline}") + self.pageserver_http_client.timeline_compact(self.env.initial_tenant, self.timeline) def report_peak_memory_use(self) -> None: self.zenbenchmark.record( From 7eebb45ea6635404d494563af3d58790a44a68eb Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 21 Sep 2022 18:13:30 +0200 Subject: [PATCH 0810/1022] Reduce metrics footprint in safekeeper (#2491) Fixes bugs with metrics in control_file and wal_storage, where we haven't deleted metrics for inactive timelines. 
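[Editor's aside: a condensed view of the layout this commit moves to, lifted from the `metrics.rs` and `wal_storage.rs` hunks below. Only `FLUSH_WAL_SECONDS` and one observer method are shown; the other histograms and the per-timeline collector wiring are elided. Process-global histograms keep the bucketed metrics label-free, while per-timeline totals live in plain struct fields that disappear together with the timeline.]

```rust
use metrics::{register_histogram, Histogram, DISK_WRITE_SECONDS_BUCKETS};
use once_cell::sync::Lazy;

// One global histogram for the whole process: no per-timeline label values
// that would otherwise linger after a timeline becomes inactive.
pub static FLUSH_WAL_SECONDS: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "safekeeper_flush_wal_seconds",
        "Seconds spent syncing WAL to a disk",
        DISK_WRITE_SECONDS_BUCKETS.to_vec()
    )
    .expect("Failed to register safekeeper_flush_wal_seconds histogram")
});

// Per-timeline totals are plain fields; TimelineCollector re-exports them as
// labelled gauges on each scrape and resets the gauge vecs first, so metrics
// for dropped timelines vanish instead of leaking.
#[derive(Clone, Default)]
pub struct WalStorageMetrics {
    flush_wal_seconds: f64,
}

impl WalStorageMetrics {
    pub fn observe_flush_seconds(&mut self, seconds: f64) {
        self.flush_wal_seconds += seconds;
        FLUSH_WAL_SECONDS.observe(seconds);
    }
}
```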
--- safekeeper/src/control_file.rs | 24 +----- safekeeper/src/metrics.rs | 138 ++++++++++++++++++++++++++++++++- safekeeper/src/safekeeper.rs | 4 + safekeeper/src/timeline.rs | 1 + safekeeper/src/wal_storage.rs | 92 +++++----------------- 5 files changed, 162 insertions(+), 97 deletions(-) diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 22ed34cc00..6be3f9abb2 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -2,7 +2,6 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use once_cell::sync::Lazy; use std::fs::{self, File, OpenOptions}; use std::io::{Read, Write}; @@ -10,8 +9,8 @@ use std::ops::Deref; use std::path::{Path, PathBuf}; use crate::control_file_upgrade::upgrade_control_file; +use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::safekeeper::{SafeKeeperState, SK_FORMAT_VERSION, SK_MAGIC}; -use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; @@ -24,16 +23,6 @@ const CONTROL_FILE_NAME: &str = "safekeeper.control"; const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); -static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_persist_control_file_seconds", - "Seconds to persist and sync control file, grouped by timeline", - &["tenant_id", "timeline_id"], - DISK_WRITE_SECONDS_BUCKETS.to_vec() - ) - .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") -}); - /// Storage should keep actual state inside of it. It should implement Deref /// trait to access state fields and have persist method for updating that state. pub trait Storage: Deref { @@ -46,7 +35,6 @@ pub struct FileStorage { // save timeline dir to avoid reconstructing it every time timeline_dir: PathBuf, conf: SafeKeeperConf, - persist_control_file_seconds: Histogram, /// Last state persisted to disk. state: SafeKeeperState, @@ -56,16 +44,12 @@ impl FileStorage { /// Initialize storage by loading state from disk. 
pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { let timeline_dir = conf.timeline_dir(ttid); - let tenant_id = ttid.tenant_id.to_string(); - let timeline_id = ttid.timeline_id.to_string(); let state = Self::load_control_file_conf(conf, ttid)?; Ok(FileStorage { timeline_dir, conf: conf.clone(), - persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS - .with_label_values(&[&tenant_id, &timeline_id]), state, }) } @@ -77,14 +61,10 @@ impl FileStorage { state: SafeKeeperState, ) -> Result { let timeline_dir = conf.timeline_dir(ttid); - let tenant_id = ttid.tenant_id.to_string(); - let timeline_id = ttid.timeline_id.to_string(); let store = FileStorage { timeline_dir, conf: conf.clone(), - persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS - .with_label_values(&[&tenant_id, &timeline_id]), state, }; @@ -175,7 +155,7 @@ impl Storage for FileStorage { /// persists state durably to underlying storage /// for description see https://lwn.net/Articles/457667/ fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { - let _timer = &self.persist_control_file_seconds.start_timer(); + let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer(); // write data to safekeeper.control.partial let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL); diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 851a568aec..51138df776 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -1,12 +1,15 @@ -//! This module exports metrics for all active timelines. +//! Global safekeeper mertics and per-timeline safekeeper metrics. use std::time::{Instant, SystemTime}; +use ::metrics::{register_histogram, GaugeVec, Histogram, DISK_WRITE_SECONDS_BUCKETS}; +use anyhow::Result; use metrics::{ core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts}, proto::MetricFamily, Gauge, IntGaugeVec, }; +use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; use utils::{id::TenantTimelineId, lsn::Lsn}; @@ -16,6 +19,85 @@ use crate::{ GlobalTimelines, }; +// Global metrics across all timelines. +pub static WRITE_WAL_BYTES: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_write_wal_bytes", + "Bytes written to WAL in a single request", + vec![ + 1.0, + 10.0, + 100.0, + 1024.0, + 8192.0, + 128.0 * 1024.0, + 1024.0 * 1024.0, + 10.0 * 1024.0 * 1024.0 + ] + ) + .expect("Failed to register safekeeper_write_wal_bytes histogram") +}); +pub static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_write_wal_seconds", + "Seconds spent writing and syncing WAL to a disk in a single request", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_write_wal_seconds histogram") +}); +pub static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_flush_wal_seconds", + "Seconds spent syncing WAL to a disk", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_flush_wal_seconds histogram") +}); +pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_persist_control_file_seconds", + "Seconds to persist and sync control file", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") +}); + +/// Metrics for WalStorage in a single timeline. +#[derive(Clone, Default)] +pub struct WalStorageMetrics { + /// How much bytes were written in total. 
+ write_wal_bytes: u64, + /// How much time spent writing WAL to disk, waiting for write(2). + write_wal_seconds: f64, + /// How much time spent syncing WAL to disk, waiting for fsync(2). + flush_wal_seconds: f64, +} + +impl WalStorageMetrics { + pub fn observe_write_bytes(&mut self, bytes: usize) { + self.write_wal_bytes += bytes as u64; + WRITE_WAL_BYTES.observe(bytes as f64); + } + + pub fn observe_write_seconds(&mut self, seconds: f64) { + self.write_wal_seconds += seconds; + WRITE_WAL_SECONDS.observe(seconds); + } + + pub fn observe_flush_seconds(&mut self, seconds: f64) { + self.flush_wal_seconds += seconds; + FLUSH_WAL_SECONDS.observe(seconds); + } +} + +/// Accepts a closure that returns a result, and returns the duration of the closure. +pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result { + let start = std::time::Instant::now(); + closure()?; + Ok(start.elapsed().as_secs_f64()) +} + +/// Metrics for a single timeline. pub struct FullTimelineInfo { pub ttid: TenantTimelineId, pub replicas: Vec, @@ -29,8 +111,11 @@ pub struct FullTimelineInfo { pub persisted_state: SafeKeeperState, pub flush_lsn: Lsn, + + pub wal_storage: WalStorageMetrics, } +/// Collects metrics for all active timelines. pub struct TimelineCollector { descs: Vec, commit_lsn: GenericGaugeVec, @@ -46,6 +131,9 @@ pub struct TimelineCollector { connected_computes: IntGaugeVec, disk_usage: GenericGaugeVec, acceptor_term: GenericGaugeVec, + written_wal_bytes: GenericGaugeVec, + written_wal_seconds: GaugeVec, + flushed_wal_seconds: GaugeVec, collect_timeline_metrics: Gauge, } @@ -186,6 +274,36 @@ impl TimelineCollector { .unwrap(); descs.extend(acceptor_term.desc().into_iter().cloned()); + let written_wal_bytes = GenericGaugeVec::new( + Opts::new( + "safekeeper_written_wal_bytes_total", + "Number of WAL bytes written to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(written_wal_bytes.desc().into_iter().cloned()); + + let written_wal_seconds = GaugeVec::new( + Opts::new( + "safekeeper_written_wal_seconds_total", + "Total time spent in write(2) writing WAL to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(written_wal_seconds.desc().into_iter().cloned()); + + let flushed_wal_seconds = GaugeVec::new( + Opts::new( + "safekeeper_flushed_wal_seconds_total", + "Total time spent in fsync(2) flushing WAL to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(flushed_wal_seconds.desc().into_iter().cloned()); + let collect_timeline_metrics = Gauge::new( "safekeeper_collect_timeline_metrics_seconds", "Time spent collecting timeline metrics, including obtaining mutex lock for all timelines", @@ -208,6 +326,9 @@ impl TimelineCollector { connected_computes, disk_usage, acceptor_term, + written_wal_bytes, + written_wal_seconds, + flushed_wal_seconds, collect_timeline_metrics, } } @@ -235,6 +356,9 @@ impl Collector for TimelineCollector { self.connected_computes.reset(); self.disk_usage.reset(); self.acceptor_term.reset(); + self.written_wal_bytes.reset(); + self.written_wal_seconds.reset(); + self.flushed_wal_seconds.reset(); let timelines = GlobalTimelines::get_all(); @@ -292,6 +416,15 @@ impl Collector for TimelineCollector { self.acceptor_term .with_label_values(labels) .set(tli.persisted_state.acceptor_state.term as u64); + self.written_wal_bytes + .with_label_values(labels) + .set(tli.wal_storage.write_wal_bytes); + self.written_wal_seconds + 
.with_label_values(labels) + .set(tli.wal_storage.write_wal_seconds); + self.flushed_wal_seconds + .with_label_values(labels) + .set(tli.wal_storage.flush_wal_seconds); if let Some(feedback) = most_advanced { self.feedback_ps_write_lsn @@ -332,6 +465,9 @@ impl Collector for TimelineCollector { mfs.extend(self.connected_computes.collect()); mfs.extend(self.disk_usage.collect()); mfs.extend(self.acceptor_term.collect()); + mfs.extend(self.written_wal_bytes.collect()); + mfs.extend(self.written_wal_seconds.collect()); + mfs.extend(self.flushed_wal_seconds.collect()); // report time it took to collect all info let elapsed = start_collecting.elapsed().as_secs_f64(); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index d34a77e02b..65340ac0ed 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -998,6 +998,10 @@ mod tests { fn remove_up_to(&self) -> Box Result<()>> { Box::new(move |_segno_up_to: XLogSegNo| Ok(())) } + + fn get_metrics(&self) -> crate::metrics::WalStorageMetrics { + crate::metrics::WalStorageMetrics::default() + } } #[test] diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 4000815857..ec29e13931 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -534,6 +534,7 @@ impl Timeline { mem_state: state.sk.inmem.clone(), persisted_state: state.sk.state.clone(), flush_lsn: state.sk.wal_store.flush_lsn(), + wal_storage: state.sk.wal_store.get_metrics(), }) } else { None diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index ea613dd0f1..692bd18342 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -8,11 +8,11 @@ //! Note that last file has `.partial` suffix, that's different from postgres. use anyhow::{bail, Context, Result}; + use std::io::{self, Seek, SeekFrom}; use std::pin::Pin; use tokio::io::AsyncRead; -use once_cell::sync::Lazy; use postgres_ffi::v14::xlog_utils::{ find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, }; @@ -27,6 +27,7 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; +use crate::metrics::{time_io_closure, WalStorageMetrics}; use crate::safekeeper::SafeKeeperState; use crate::wal_backup::read_object; @@ -36,67 +37,8 @@ use postgres_ffi::XLOG_BLCKSZ; use postgres_ffi::v14::waldecoder::WalStreamDecoder; -use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; - use tokio::io::{AsyncReadExt, AsyncSeekExt}; -// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). -// i64 is faster than f64, so update to u64 when available. 
-static WRITE_WAL_BYTES: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_write_wal_bytes", - "Bytes written to WAL in a single request, grouped by timeline", - &["tenant_id", "timeline_id"], - vec![ - 1.0, - 10.0, - 100.0, - 1024.0, - 8192.0, - 128.0 * 1024.0, - 1024.0 * 1024.0, - 10.0 * 1024.0 * 1024.0 - ] - ) - .expect("Failed to register safekeeper_write_wal_bytes histogram vec") -}); -static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_write_wal_seconds", - "Seconds spent writing and syncing WAL to a disk in a single request, grouped by timeline", - &["tenant_id", "timeline_id"], - DISK_WRITE_SECONDS_BUCKETS.to_vec() - ) - .expect("Failed to register safekeeper_write_wal_seconds histogram vec") -}); -static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_flush_wal_seconds", - "Seconds spent syncing WAL to a disk, grouped by timeline", - &["tenant_id", "timeline_id"], - DISK_WRITE_SECONDS_BUCKETS.to_vec() - ) - .expect("Failed to register safekeeper_flush_wal_seconds histogram vec") -}); - -struct WalStorageMetrics { - write_wal_bytes: Histogram, - write_wal_seconds: Histogram, - flush_wal_seconds: Histogram, -} - -impl WalStorageMetrics { - fn new(ttid: &TenantTimelineId) -> Self { - let tenant_id = ttid.tenant_id.to_string(); - let timeline_id = ttid.timeline_id.to_string(); - Self { - write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&tenant_id, &timeline_id]), - write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), - flush_wal_seconds: FLUSH_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), - } - } -} - pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; @@ -113,6 +55,9 @@ pub trait Storage { /// Remove all segments <= given segno. Returns closure as we want to do /// that without timeline lock. fn remove_up_to(&self) -> Box Result<()>>; + + /// Get metrics for this timeline. + fn get_metrics(&self) -> WalStorageMetrics; } /// PhysicalStorage is a storage that stores WAL on disk. Writes are separated from flushes @@ -187,7 +132,7 @@ impl PhysicalStorage { } Ok(PhysicalStorage { - metrics: WalStorageMetrics::new(ttid), + metrics: WalStorageMetrics::default(), timeline_dir, conf: conf.clone(), wal_seg_size, @@ -200,28 +145,26 @@ impl PhysicalStorage { } /// Call fdatasync if config requires so. - fn fdatasync_file(&self, file: &mut File) -> Result<()> { + fn fdatasync_file(&mut self, file: &mut File) -> Result<()> { if !self.conf.no_sync { self.metrics - .flush_wal_seconds - .observe_closure_duration(|| file.sync_data())?; + .observe_flush_seconds(time_io_closure(|| Ok(file.sync_data()?))?); } Ok(()) } /// Call fsync if config requires so. - fn fsync_file(&self, file: &mut File) -> Result<()> { + fn fsync_file(&mut self, file: &mut File) -> Result<()> { if !self.conf.no_sync { self.metrics - .flush_wal_seconds - .observe_closure_duration(|| file.sync_all())?; + .observe_flush_seconds(time_io_closure(|| Ok(file.sync_all()?))?); } Ok(()) } /// Open or create WAL segment file. Caller must call seek to the wanted position. /// Returns `file` and `is_partial`. 
- fn open_or_create(&self, segno: XLogSegNo) -> Result<(File, bool)> { + fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> { let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; @@ -335,13 +278,10 @@ impl Storage for PhysicalStorage { ); } - { - let _timer = self.metrics.write_wal_seconds.start_timer(); - self.write_exact(startpos, buf)?; - } - + let write_seconds = time_io_closure(|| self.write_exact(startpos, buf))?; // WAL is written, updating write metrics - self.metrics.write_wal_bytes.observe(buf.len() as f64); + self.metrics.observe_write_seconds(write_seconds); + self.metrics.observe_write_bytes(buf.len()); // figure out last record's end lsn for reporting (if we got the // whole record) @@ -444,6 +384,10 @@ impl Storage for PhysicalStorage { remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to) }) } + + fn get_metrics(&self) -> WalStorageMetrics { + self.metrics.clone() + } } /// Remove all WAL segments in timeline_dir that match the given predicate. From e9a103c09f4e24a70697a3187419b4a51b024209 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 21 Sep 2022 21:42:47 +0300 Subject: [PATCH 0811/1022] [proxy] Pass extra parameters to the console (#2467) With this change we now pass additional params to the console's auth methods. --- Cargo.lock | 6 ++ proxy/Cargo.toml | 8 +- proxy/src/auth.rs | 2 +- proxy/src/auth/backend.rs | 132 +++++++++++++++--------------- proxy/src/auth/backend/console.rs | 57 +++++++++---- proxy/src/auth/backend/link.rs | 6 +- proxy/src/config.rs | 10 +-- proxy/src/http.rs | 92 ++++++++++++++++----- proxy/src/http/server.rs | 27 ++++++ proxy/src/main.rs | 48 +++++------ proxy/src/proxy.rs | 24 +++--- proxy/src/url.rs | 12 +-- workspace_hack/Cargo.toml | 1 + 13 files changed, 259 insertions(+), 166 deletions(-) create mode 100644 proxy/src/http/server.rs diff --git a/Cargo.lock b/Cargo.lock index fc4ef90b8b..0579d381cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2283,6 +2283,7 @@ dependencies = [ "tokio-rustls", "url", "utils", + "uuid", "workspace_hack", "x509-parser", ] @@ -3663,6 +3664,10 @@ name = "uuid" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" +dependencies = [ + "getrandom", + "serde", +] [[package]] name = "valuable" @@ -3953,6 +3958,7 @@ dependencies = [ "tokio-util", "tracing", "tracing-core", + "uuid", ] [[package]] diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 5417f4f2b3..7d0449cd1a 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -11,13 +11,14 @@ bstr = "0.2.17" bytes = { version = "1.0.1", features = ['serde'] } clap = "3.0" futures = "0.3.13" +git-version = "0.3.5" hashbrown = "0.12" hex = "0.4.3" hmac = "0.12.1" hyper = "0.14" itertools = "0.10.3" -once_cell = "1.13.0" md5 = "0.7.0" +once_cell = "1.13.0" parking_lot = "0.12" pin-project-lite = "0.2.7" rand = "0.8.3" @@ -35,14 +36,13 @@ tokio = { version = "1.17", features = ["macros"] } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-rustls = "0.23.0" url = "2.2.2" -git-version = "0.3.5" +uuid = { version = "0.8.2", features = ["v4", "serde"]} +x509-parser = "0.13.2" utils = { path = "../libs/utils" } metrics = { path = "../libs/metrics" } workspace_hack = { version = "0.1", path = "../workspace_hack" } -x509-parser = "0.13.2" - [dev-dependencies] rcgen = "0.8.14" rstest = 
"0.12" diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index a50d23e351..2df4f9d920 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,7 +1,7 @@ //! Client authentication mechanisms. pub mod backend; -pub use backend::{BackendType, DatabaseInfo}; +pub use backend::{BackendType, ConsoleReqExtra, DatabaseInfo}; mod credentials; pub use credentials::ClientCredentials; diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index de0719a196..7e93a32950 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -8,13 +8,12 @@ pub use console::{GetAuthInfoError, WakeComputeError}; use crate::{ auth::{self, AuthFlow, ClientCredentials}, - compute, config, mgmt, - stream::PqStream, + compute, http, mgmt, stream, url, waiters::{self, Waiter, Waiters}, }; - use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; +use std::borrow::Cow; use tokio::io::{AsyncRead, AsyncWrite}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); @@ -75,6 +74,14 @@ impl From for tokio_postgres::Config { } } +/// Extra query params we'd like to pass to the console. +pub struct ConsoleReqExtra<'a> { + /// A unique identifier for a connection. + pub session_id: uuid::Uuid, + /// Name of client application, if set. + pub application_name: Option<&'a str>, +} + /// This type serves two purposes: /// /// * When `T` is `()`, it's just a regular auth backend selector @@ -83,53 +90,83 @@ impl From for tokio_postgres::Config { /// * However, when we substitute `T` with [`ClientCredentials`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum BackendType { +#[derive(Debug)] +pub enum BackendType<'a, T> { /// Current Cloud API (V2). - Console(T), + Console(Cow<'a, http::Endpoint>, T), /// Local mock of Cloud API (V2). - Postgres(T), + Postgres(Cow<'a, url::ApiUrl>, T), /// Authentication via a web browser. - Link, + Link(Cow<'a, url::ApiUrl>), } -impl BackendType { +impl std::fmt::Display for BackendType<'_, ()> { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use BackendType::*; + match self { + Console(endpoint, _) => fmt + .debug_tuple("Console") + .field(&endpoint.url().as_str()) + .finish(), + Postgres(endpoint, _) => fmt + .debug_tuple("Postgres") + .field(&endpoint.as_str()) + .finish(), + Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + } + } +} + +impl BackendType<'_, T> { + /// Very similar to [`std::option::Option::as_ref`]. + /// This helps us pass structured config to async tasks. + pub fn as_ref(&self) -> BackendType<'_, &T> { + use BackendType::*; + match self { + Console(c, x) => Console(Cow::Borrowed(c), x), + Postgres(c, x) => Postgres(Cow::Borrowed(c), x), + Link(c) => Link(Cow::Borrowed(c)), + } + } +} + +impl<'a, T> BackendType<'a, T> { /// Very similar to [`std::option::Option::map`]. /// Maps [`BackendType`] to [`BackendType`] by applying /// a function to a contained value. - pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType { + pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R> { use BackendType::*; match self { - Console(x) => Console(f(x)), - Postgres(x) => Postgres(f(x)), - Link => Link, + Console(c, x) => Console(c, f(x)), + Postgres(c, x) => Postgres(c, f(x)), + Link(c) => Link(c), } } } -impl BackendType> { +impl<'a, T, E> BackendType<'a, Result> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. 
- pub fn transpose(self) -> Result, E> { + pub fn transpose(self) -> Result, E> { use BackendType::*; match self { - Console(x) => x.map(Console), - Postgres(x) => x.map(Postgres), - Link => Ok(Link), + Console(c, x) => x.map(|x| Console(c, x)), + Postgres(c, x) => x.map(|x| Postgres(c, x)), + Link(c) => Ok(Link(c)), } } } -impl BackendType> { +impl BackendType<'_, ClientCredentials<'_>> { /// Authenticate the client via the requested backend, possibly using credentials. pub async fn authenticate( mut self, - urls: &config::AuthUrls, - client: &mut PqStream, + extra: &ConsoleReqExtra<'_>, + client: &mut stream::PqStream, ) -> super::Result { use BackendType::*; - if let Console(creds) | Postgres(creds) = &mut self { + if let Console(_, creds) | Postgres(_, creds) = &mut self { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the project name. // We now expect to see a very specific payload in the place of password. @@ -145,15 +182,13 @@ impl BackendType> { creds.project = Some(payload.project.into()); let mut config = match &self { - Console(creds) => { - console::Api::new(&urls.auth_endpoint, creds) + Console(endpoint, creds) => { + console::Api::new(endpoint, extra, creds) .wake_compute() .await? } - Postgres(creds) => { - postgres::Api::new(&urls.auth_endpoint, creds) - .wake_compute() - .await? + Postgres(endpoint, creds) => { + postgres::Api::new(endpoint, creds).wake_compute().await? } _ => unreachable!("see the patterns above"), }; @@ -169,49 +204,18 @@ impl BackendType> { } match self { - Console(creds) => { - console::Api::new(&urls.auth_endpoint, &creds) + Console(endpoint, creds) => { + console::Api::new(&endpoint, extra, &creds) .handle_user(client) .await } - Postgres(creds) => { - postgres::Api::new(&urls.auth_endpoint, &creds) + Postgres(endpoint, creds) => { + postgres::Api::new(&endpoint, &creds) .handle_user(client) .await } // NOTE: this auth backend doesn't use client credentials. - Link => link::handle_user(&urls.auth_link_uri, client).await, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_backend_type_map() { - let values = [ - BackendType::Console(0), - BackendType::Postgres(0), - BackendType::Link, - ]; - - for value in values { - assert_eq!(value.map(|x| x), value); - } - } - - #[test] - fn test_backend_type_transpose() { - let values = [ - BackendType::Console(Ok::<_, ()>(0)), - BackendType::Postgres(Ok(0)), - BackendType::Link, - ]; - - for value in values { - assert_eq!(value.map(Result::unwrap), value.transpose().unwrap()); + Link(url) => link::handle_user(&url, client).await, } } } diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index e239320e9b..e5ee07813c 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -1,12 +1,12 @@ //! Cloud API V2. +use super::ConsoleReqExtra; use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute::{self, ComputeConnCfg}, error::{io_error, UserFacingError}, - scram, + http, scram, stream::PqStream, - url::ApiUrl, }; use serde::{Deserialize, Serialize}; use std::future::Future; @@ -120,14 +120,23 @@ pub enum AuthInfo { #[must_use] pub(super) struct Api<'a> { - endpoint: &'a ApiUrl, + endpoint: &'a http::Endpoint, + extra: &'a ConsoleReqExtra<'a>, creds: &'a ClientCredentials<'a>, } impl<'a> Api<'a> { /// Construct an API object containing the auth parameters. 
- pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self { - Self { endpoint, creds } + pub(super) fn new( + endpoint: &'a http::Endpoint, + extra: &'a ConsoleReqExtra<'a>, + creds: &'a ClientCredentials, + ) -> Self { + Self { + endpoint, + extra, + creds, + } } /// Authenticate the existing user or throw an error. @@ -139,16 +148,22 @@ impl<'a> Api<'a> { } async fn get_auth_info(&self) -> Result { - let mut url = self.endpoint.clone(); - url.path_segments_mut().push("proxy_get_role_secret"); - url.query_pairs_mut() - .append_pair("project", self.creds.project().expect("impossible")) - .append_pair("role", self.creds.user); + let req = self + .endpoint + .get("proxy_get_role_secret") + .header("X-Request-ID", uuid::Uuid::new_v4().to_string()) + .query(&[("session_id", self.extra.session_id)]) + .query(&[ + ("application_name", self.extra.application_name), + ("project", Some(self.creds.project().expect("impossible"))), + ("role", Some(self.creds.user)), + ]) + .build()?; // TODO: use a proper logger - println!("cplane request: {url}"); + println!("cplane request: {}", req.url()); - let resp = reqwest::get(url.into_inner()).await?; + let resp = self.endpoint.execute(req).await?; if !resp.status().is_success() { return Err(TransportError::HttpStatus(resp.status()).into()); } @@ -162,15 +177,21 @@ impl<'a> Api<'a> { /// Wake up the compute node and return the corresponding connection info. pub(super) async fn wake_compute(&self) -> Result { - let mut url = self.endpoint.clone(); - url.path_segments_mut().push("proxy_wake_compute"); - url.query_pairs_mut() - .append_pair("project", self.creds.project().expect("impossible")); + let req = self + .endpoint + .get("proxy_wake_compute") + .header("X-Request-ID", uuid::Uuid::new_v4().to_string()) + .query(&[("session_id", self.extra.session_id)]) + .query(&[ + ("application_name", self.extra.application_name), + ("project", Some(self.creds.project().expect("impossible"))), + ]) + .build()?; // TODO: use a proper logger - println!("cplane request: {url}"); + println!("cplane request: {}", req.url()); - let resp = reqwest::get(url.into_inner()).await?; + let resp = self.endpoint.execute(req).await?; if !resp.status().is_success() { return Err(TransportError::HttpStatus(resp.status()).into()); } diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index d740a4c5c4..eefa246eba 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -29,7 +29,7 @@ impl UserFacingError for LinkAuthError { } } -fn hello_message(redirect_uri: &str, session_id: &str) -> String { +fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String { format!( concat![ "Welcome to Neon!\n", @@ -46,11 +46,11 @@ pub fn new_psql_session_id() -> String { } pub async fn handle_user( - redirect_uri: &reqwest::Url, + link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { let psql_session_id = new_psql_session_id(); - let greeting = hello_message(redirect_uri.as_str(), &psql_session_id); + let greeting = hello_message(link_uri, &psql_session_id); let db_info = super::with_waiter(psql_session_id, |waiter| async { // Give user a URL to spawn a new database diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 8835d660d5..031fa84509 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,16 +1,10 @@ -use crate::{auth, url::ApiUrl}; +use crate::auth; use anyhow::{ensure, Context}; use std::sync::Arc; pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: 
auth::BackendType<()>, - pub auth_urls: AuthUrls, -} - -pub struct AuthUrls { - pub auth_endpoint: ApiUrl, - pub auth_link_uri: ApiUrl, + pub auth_backend: auth::BackendType<'static, ()>, } pub struct TlsConfig { diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 5a75718742..dbeb3dc784 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -1,27 +1,81 @@ -use anyhow::anyhow; -use hyper::{Body, Request, Response, StatusCode}; -use std::net::TcpListener; -use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; +pub mod server; -async fn status_handler(_: Request) -> Result, ApiError> { - json_response(StatusCode::OK, "") +use crate::url::ApiUrl; + +/// Thin convenience wrapper for an API provided by an http endpoint. +#[derive(Debug, Clone)] +pub struct Endpoint { + /// API's base URL. + endpoint: ApiUrl, + /// Connection manager with built-in pooling. + client: reqwest::Client, } -fn make_router() -> RouterBuilder { - let router = endpoint::make_router(); - router.get("/v1/status", status_handler) -} - -pub async fn thread_main(http_listener: TcpListener) -> anyhow::Result<()> { - scopeguard::defer! { - println!("http has shut down"); +impl Endpoint { + /// Construct a new HTTP endpoint wrapper. + pub fn new(endpoint: ApiUrl, client: reqwest::Client) -> Self { + Self { endpoint, client } } - let service = || RouterService::new(make_router().build()?); + pub fn url(&self) -> &ApiUrl { + &self.endpoint + } - hyper::Server::from_tcp(http_listener)? - .serve(service().map_err(|e| anyhow!(e))?) - .await?; + /// Return a [builder](reqwest::RequestBuilder) for a `GET` request, + /// appending a single `path` segment to the base endpoint URL. + pub fn get(&self, path: &str) -> reqwest::RequestBuilder { + let mut url = self.endpoint.clone(); + url.path_segments_mut().push(path); + self.client.get(url.into_inner()) + } - Ok(()) + /// Execute a [request](reqwest::Request). + pub async fn execute( + &self, + request: reqwest::Request, + ) -> Result { + self.client.execute(request).await + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn optional_query_params() -> anyhow::Result<()> { + let url = "http://example.com".parse()?; + let endpoint = Endpoint::new(url, reqwest::Client::new()); + + // Validate that this pattern makes sense. 
+ let req = endpoint + .get("frobnicate") + .query(&[ + ("foo", Some("10")), // should be just `foo=10` + ("bar", None), // shouldn't be passed at all + ]) + .build()?; + + assert_eq!(req.url().as_str(), "http://example.com/frobnicate?foo=10"); + + Ok(()) + } + + #[test] + fn uuid_params() -> anyhow::Result<()> { + let url = "http://example.com".parse()?; + let endpoint = Endpoint::new(url, reqwest::Client::new()); + + let req = endpoint + .get("frobnicate") + .query(&[("session_id", uuid::Uuid::nil())]) + .build()?; + + assert_eq!( + req.url().as_str(), + "http://example.com/frobnicate?session_id=00000000-0000-0000-0000-000000000000" + ); + + Ok(()) + } } diff --git a/proxy/src/http/server.rs b/proxy/src/http/server.rs new file mode 100644 index 0000000000..5a75718742 --- /dev/null +++ b/proxy/src/http/server.rs @@ -0,0 +1,27 @@ +use anyhow::anyhow; +use hyper::{Body, Request, Response, StatusCode}; +use std::net::TcpListener; +use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; + +async fn status_handler(_: Request) -> Result, ApiError> { + json_response(StatusCode::OK, "") +} + +fn make_router() -> RouterBuilder { + let router = endpoint::make_router(); + router.get("/v1/status", status_handler) +} + +pub async fn thread_main(http_listener: TcpListener) -> anyhow::Result<()> { + scopeguard::defer! { + println!("http has shut down"); + } + + let service = || RouterService::new(make_router().build()?); + + hyper::Server::from_tcp(http_listener)? + .serve(service().map_err(|e| anyhow!(e))?) + .await?; + + Ok(()) +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index efe45f6386..f2dc7425ba 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -23,7 +23,7 @@ use anyhow::{bail, Context}; use clap::{self, Arg}; use config::ProxyConfig; use futures::FutureExt; -use std::{future::Future, net::SocketAddr}; +use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; use utils::project_git_version; @@ -36,23 +36,6 @@ async fn flatten_err( f.map(|r| r.context("join error").and_then(|x| x)).await } -/// A proper parser for auth backend parameter. -impl clap::ValueEnum for auth::BackendType<()> { - fn value_variants<'a>() -> &'a [Self] { - use auth::BackendType::*; - &[Console(()), Postgres(()), Link] - } - - fn to_possible_value<'a>(&self) -> Option> { - use auth::BackendType::*; - Some(clap::PossibleValue::new(match self { - Console(_) => "console", - Postgres(_) => "postgres", - Link => "link", - })) - } -} - #[tokio::main] async fn main() -> anyhow::Result<()> { let arg_matches = clap::App::new("Neon proxy/router") @@ -69,7 +52,7 @@ async fn main() -> anyhow::Result<()> { Arg::new("auth-backend") .long("auth-backend") .takes_value(true) - .value_parser(clap::builder::EnumValueParser::>::new()) + .possible_values(["console", "postgres", "link"]) .default_value("link"), ) .arg( @@ -135,23 +118,30 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?; let http_address: SocketAddr = arg_matches.value_of("http").unwrap().parse()?; - let auth_backend = *arg_matches - .try_get_one::>("auth-backend")? 
- .unwrap(); - - let auth_urls = config::AuthUrls { - auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, - auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, + let auth_backend = match arg_matches.value_of("auth-backend").unwrap() { + "console" => { + let url = arg_matches.value_of("auth-endpoint").unwrap().parse()?; + let endpoint = http::Endpoint::new(url, reqwest::Client::new()); + auth::BackendType::Console(Cow::Owned(endpoint), ()) + } + "postgres" => { + let url = arg_matches.value_of("auth-endpoint").unwrap().parse()?; + auth::BackendType::Postgres(Cow::Owned(url), ()) + } + "link" => { + let url = arg_matches.value_of("uri").unwrap().parse()?; + auth::BackendType::Link(Cow::Owned(url)) + } + other => bail!("unsupported auth backend: {other}"), }; let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend, - auth_urls, })); println!("Version: {GIT_VERSION}"); - println!("Authentication backend: {:?}", config.auth_backend); + println!("Authentication backend: {}", config.auth_backend); // Check that we can bind to address before further initialization println!("Starting http on {}", http_address); @@ -164,7 +154,7 @@ async fn main() -> anyhow::Result<()> { let proxy_listener = TcpListener::bind(proxy_address).await?; let tasks = [ - tokio::spawn(http::thread_main(http_listener)), + tokio::spawn(http::server::thread_main(http_listener)), tokio::spawn(proxy::thread_main(config, proxy_listener)), tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)), ] diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 72cb822910..efb1b6f358 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,6 +1,6 @@ use crate::auth; use crate::cancellation::{self, CancelMap}; -use crate::config::{AuthUrls, ProxyConfig, TlsConfig}; +use crate::config::{ProxyConfig, TlsConfig}; use crate::stream::{MetricsStream, PqStream, Stream}; use anyhow::{bail, Context}; use futures::TryFutureExt; @@ -99,6 +99,7 @@ async fn handle_client( let common_name = tls.and_then(|tls| tls.common_name.as_deref()); let result = config .auth_backend + .as_ref() .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name)) .transpose(); @@ -107,7 +108,7 @@ async fn handle_client( let client = Client::new(stream, creds, ¶ms); cancel_map - .with_session(|session| client.connect_to_db(&config.auth_urls, session)) + .with_session(|session| client.connect_to_db(session)) .await } @@ -179,7 +180,7 @@ struct Client<'a, S> { /// The underlying libpq protocol stream. stream: PqStream, /// Client credentials that we care about. - creds: auth::BackendType>, + creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, /// KV-dictionary with PostgreSQL connection params. params: &'a StartupMessageParams, } @@ -188,7 +189,7 @@ impl<'a, S> Client<'a, S> { /// Construct a new connection context. fn new( stream: PqStream, - creds: auth::BackendType>, + creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, params: &'a StartupMessageParams, ) -> Self { Self { @@ -201,19 +202,22 @@ impl<'a, S> Client<'a, S> { impl Client<'_, S> { /// Let the client authenticate and connect to the designated compute node. 
- async fn connect_to_db( - self, - urls: &AuthUrls, - session: cancellation::Session<'_>, - ) -> anyhow::Result<()> { + async fn connect_to_db(self, session: cancellation::Session<'_>) -> anyhow::Result<()> { let Self { mut stream, creds, params, } = self; + let extra = auth::ConsoleReqExtra { + // Currently it's OK to generate a new UUID **here**, but + // it might be better to move this to `cancellation::Session`. + session_id: uuid::Uuid::new_v4(), + application_name: params.get("application_name"), + }; + // Authenticate and connect to a compute node. - let auth = creds.authenticate(urls, &mut stream).await; + let auth = creds.authenticate(&extra, &mut stream).await; let node = async { auth }.or_else(|e| stream.throw_error(e)).await?; let reported_auth_ok = node.reported_auth_ok; diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 76d6ad0e66..92c64bb8ad 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -1,8 +1,8 @@ use anyhow::bail; -use url::form_urlencoded::Serializer; /// A [url](url::Url) type with additional guarantees. -#[derive(Debug, Clone)] +#[repr(transparent)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct ApiUrl(url::Url); impl ApiUrl { @@ -11,11 +11,6 @@ impl ApiUrl { self.0 } - /// See [`url::Url::query_pairs_mut`]. - pub fn query_pairs_mut(&mut self) -> Serializer<'_, url::UrlQuery<'_>> { - self.0.query_pairs_mut() - } - /// See [`url::Url::path_segments_mut`]. pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut { // We've already verified that it works during construction. @@ -72,10 +67,7 @@ mod tests { let mut b = url.parse::().expect("unexpected parsing failure"); a.path_segments_mut().unwrap().push("method"); - a.query_pairs_mut().append_pair("key", "value"); - b.path_segments_mut().push("method"); - b.query_pairs_mut().append_pair("key", "value"); assert_eq!(a, b.into_inner()); } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index dc4cbb5284..3670ca5fea 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -43,6 +43,7 @@ tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } tracing-core = { version = "0.1", features = ["once_cell", "std"] } +uuid = { version = "0.8", features = ["getrandom", "serde", "std", "v4"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] } From f3073a4db93e2d4e39e2bbef03ed6b742ef3afa0 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 22 Sep 2022 08:35:06 +0300 Subject: [PATCH 0812/1022] R-Tree layer map (#2317) Replace the layer array and linear search with R-tree So far, the in-memory layer map that holds information about layer files that exist, has used a simple Vec, in no particular order, to hold information about all the layers. That obviously doesn't scale very well; with thousands of layer files the linear search was consuming a lot of CPU. Replace it with a two-dimensional R-tree, with Key and LSN ranges as the dimensions. For the R-tree, use the 'rstar' crate. To be able to use that, we convert the Keys and LSNs into 256-bit integers. 64 bits would be enough to represent LSNs, and 128 bits would be enough to represent Keys. 
However, we use 256 bits, because rstar internally performs multiplication to calculate the area of rectangles, and the result of multiplying two 128 bit integers doesn't necessarily fit in 128 bits, causing integer overflow and, if overflow-checks are enabled, panic. To avoid that, we use 256 bit integers. Add a performance test that creates a lot of layer files, to demonstrate the benefit. --- Cargo.lock | 222 +++++++++++++- pageserver/Cargo.toml | 3 + pageserver/src/repository.rs | 13 + pageserver/src/tenant/delta_layer.rs | 2 +- pageserver/src/tenant/layer_map.rs | 347 +++++++++++++++++----- pageserver/src/tenant/timeline.rs | 2 +- test_runner/performance/test_layer_map.py | 39 +++ workspace_hack/Cargo.toml | 3 +- 8 files changed, 548 insertions(+), 83 deletions(-) create mode 100644 test_runner/performance/test_layer_map.py diff --git a/Cargo.lock b/Cargo.lock index 0579d381cc..ddb10352b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -37,6 +37,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "amplify_num" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27d3d00d3d115395a7a8a4dc045feb7aa82b641e485f7e15f4e67ac16f4f56d" + [[package]] name = "ansi_term" version = "0.12.1" @@ -135,6 +141,15 @@ dependencies = [ "syn", ] +[[package]] +name = "atomic-polyfill" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c041a8d9751a520ee19656232a18971f18946a7900f1520ee4400002244dd89" +dependencies = [ + "critical-section", +] + [[package]] name = "atty" version = "0.2.14" @@ -212,6 +227,21 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "bare-metal" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5deb64efa5bd81e31fcd1938615a6d98c82eafcbcd787162b6f63b91d6bac5b3" +dependencies = [ + "rustc_version 0.2.3", +] + +[[package]] +name = "bare-metal" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fe8f5a8a398345e52358e18ff07cc17a568fbca5c6f73873d3a62056309603" + [[package]] name = "base64" version = "0.13.0" @@ -250,6 +280,18 @@ dependencies = [ "which", ] +[[package]] +name = "bit_field" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb6dd1c2376d2e096796e234a70e17e94cc2d5d54ff8ce42b28cef1d0d359a4" + +[[package]] +name = "bitfield" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719" + [[package]] name = "bitflags" version = "1.3.2" @@ -528,6 +570,18 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +[[package]] +name = "cortex-m" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70858629a458fdfd39f9675c4dc309411f2a3f83bede76988d81bf1a0ecee9e0" +dependencies = [ + "bare-metal 0.2.5", + "bitfield", + "embedded-hal", + "volatile-register", +] + [[package]] name = "cpp_demangle" version = "0.3.5" @@ -552,7 +606,7 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" dependencies = [ - "rustc_version", + "rustc_version 0.4.0", ] [[package]] @@ -600,6 +654,18 @@ dependencies = [ "itertools", ] +[[package]] +name = "critical-section" +version = "0.2.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "95da181745b56d4bd339530ec393508910c909c784e8962d15d722bacf0bcbcd" +dependencies = [ + "bare-metal 1.0.0", + "cfg-if", + "cortex-m", + "riscv", +] + [[package]] name = "crossbeam-channel" version = "0.5.6" @@ -844,6 +910,16 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f107b87b6afc2a64fd13cac55fe06d6c8859f12d4b14cbcdd2c67d0976781be" +[[package]] +name = "embedded-hal" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35949884794ad573cf46071e41c9b60efb0cb311e3ca01f7af807af1debc66ff" +dependencies = [ + "nb 0.1.3", + "void", +] + [[package]] name = "encoding_rs" version = "0.8.31" @@ -1165,6 +1241,15 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "hash32" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" +dependencies = [ + "byteorder", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -1174,6 +1259,19 @@ dependencies = [ "ahash", ] +[[package]] +name = "heapless" +version = "0.7.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743" +dependencies = [ + "atomic-polyfill", + "hash32", + "rustc_version 0.4.0", + "spin 0.9.4", + "stable_deref_trait", +] + [[package]] name = "heck" version = "0.3.3" @@ -1491,6 +1589,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "libm" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" + [[package]] name = "lock_api" version = "0.4.7" @@ -1649,6 +1753,21 @@ dependencies = [ "tempfile", ] +[[package]] +name = "nb" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "801d31da0513b6ec5214e9bf433a77966320625a37860f910be265be6e18d06f" +dependencies = [ + "nb 1.0.0", +] + +[[package]] +name = "nb" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "546c37ac5d9e56f55e73b677106873d9d9f5190605e41a856503623648488cae" + [[package]] name = "nix" version = "0.23.1" @@ -1716,6 +1835,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -1828,6 +1948,7 @@ checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4" name = "pageserver" version = "0.1.0" dependencies = [ + "amplify_num", "anyhow", "async-stream", "async-trait", @@ -1852,6 +1973,7 @@ dependencies = [ "itertools", "metrics", "nix", + "num-traits", "once_cell", "postgres", "postgres-protocol", @@ -1861,6 +1983,7 @@ dependencies = [ "rand", "regex", "remote_storage", + "rstar", "scopeguard", "serde", "serde_json", @@ -2515,12 +2638,33 @@ dependencies = [ "cc", "libc", "once_cell", - "spin", + "spin 0.5.2", "untrusted", "web-sys", "winapi", ] +[[package]] +name = "riscv" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6907ccdd7a31012b70faf2af85cd9e5ba97657cc3987c4f13f8e4d2c2a088aba" +dependencies = [ + "bare-metal 1.0.0", + "bit_field", + "riscv-target", +] + +[[package]] +name = 
"riscv-target" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88aa938cda42a0cf62a20cfe8d139ff1af20c2e681212b5b34adb5a58333f222" +dependencies = [ + "lazy_static", + "regex", +] + [[package]] name = "routerify" version = "3.0.0" @@ -2534,6 +2678,17 @@ dependencies = [ "regex", ] +[[package]] +name = "rstar" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa" +dependencies = [ + "heapless", + "num-traits", + "smallvec", +] + [[package]] name = "rstest" version = "0.12.0" @@ -2543,7 +2698,7 @@ dependencies = [ "cfg-if", "proc-macro2", "quote", - "rustc_version", + "rustc_version 0.4.0", "syn", ] @@ -2565,7 +2720,7 @@ dependencies = [ "log", "rusoto_credential", "rusoto_signature", - "rustc_version", + "rustc_version 0.4.0", "serde", "serde_json", "tokio", @@ -2623,7 +2778,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rusoto_credential", - "rustc_version", + "rustc_version 0.4.0", "serde", "sha2 0.9.9", "tokio", @@ -2641,13 +2796,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver 0.9.0", +] + [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "semver", + "semver 1.0.13", ] [[package]] @@ -2800,12 +2964,27 @@ dependencies = [ "libc", ] +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + [[package]] name = "semver" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93f6841e709003d68bb2deee8c343572bf446003ec20a583e76f7b15cebf3711" +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + [[package]] name = "serde" version = "1.0.142" @@ -2999,6 +3178,15 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +[[package]] +name = "spin" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" +dependencies = [ + "lock_api", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -3675,6 +3863,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcell" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77439c1b53d2303b20d9459b1ade71a83c716e3f9c34f3228c00e6f185d6c002" + [[package]] name = "vcpkg" version = "0.2.15" @@ -3687,6 +3881,21 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" + +[[package]] +name = "volatile-register" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ee8f19f9d74293faf70901bc20ad067dc1ad390d2cbf1e3f75f721ffee908b6" +dependencies = [ + "vcell", +] + [[package]] name = "wal_craft" version = "0.1.0" @@ -3952,6 +4161,7 @@ dependencies = [ "regex-syntax", "scopeguard", "serde", + "stable_deref_trait", "syn", "time 0.3.12", "tokio", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 85ece97d9b..1ec7ec4f98 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -54,6 +54,9 @@ once_cell = "1.13.0" crossbeam-utils = "0.8.5" fail = "0.5.0" git-version = "0.3.5" +rstar = "0.9.3" +num-traits = "0.2.15" +amplify_num = "0.4.1" postgres_ffi = { path = "../libs/postgres_ffi" } etcd_broker = { path = "../libs/etcd_broker" } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index cfcc87a2ed..0c2fedd7d5 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -24,6 +24,19 @@ pub struct Key { pub const KEY_SIZE: usize = 18; impl Key { + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. + /// As long as Neon does not support tablespace (because of lack of access to local file system), + /// we can assume that only some predefined namespace OIDs are used which can fit in u16 + pub fn to_i128(&self) -> i128 { + assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + (((self.field1 & 0xf) as i128) << 120) + | (((self.field2 & 0xFFFF) as i128) << 104) + | ((self.field3 as i128) << 72) + | ((self.field4 as i128) << 40) + | ((self.field5 as i128) << 32) + | self.field6 as i128 + } + pub fn next(&self) -> Key { self.add(1) } diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index 892000c20b..57c5be91a4 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -713,7 +713,7 @@ impl DeltaLayerWriter { for buf in block_buf.blocks { file.write_all(buf.as_ref())?; } - + assert!(self.lsn_range.start < self.lsn_range.end); // Fill in the summary on blk 0 let summary = Summary { magic: DELTA_FILE_MAGIC, diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 8abeebf54c..495833e3ae 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -15,9 +15,15 @@ use crate::repository::Key; use crate::tenant::inmemory_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; use crate::tenant::storage_layer::{range_eq, range_overlaps}; +use amplify_num::i256; use anyhow::Result; +use num_traits::identities::{One, Zero}; +use num_traits::{Bounded, Num, Signed}; +use rstar::{RTree, RTreeObject, AABB}; +use std::cmp::Ordering; use std::collections::VecDeque; use std::ops::Range; +use std::ops::{Add, Div, Mul, Neg, Rem, Sub}; use std::sync::Arc; use tracing::*; use utils::lsn::Lsn; @@ -47,14 +53,163 @@ pub struct LayerMap { pub frozen_layers: VecDeque>, /// All the historic layers are kept here + historic_layers: RTree, - /// TODO: This is a placeholder implementation of a data structure - /// to hold information about all the layer files on disk and in - /// S3. 
Currently, it's just a vector and all operations perform a - /// linear scan over it. That obviously becomes slow as the - /// number of layers grows. I'm imagining that an R-tree or some - /// other 2D data structure would be the long-term solution here. - historic_layers: Vec>, + /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. + /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. + l0_delta_layers: Vec>, +} + +struct LayerRTreeObject { + layer: Arc, +} + +// Representation of Key as numeric type. +// We can not use native implementation of i128, because rstar::RTree +// doesn't handle properly integer overflow during area calculation: sum(Xi*Yi). +// Overflow will cause panic in debug mode and incorrect area calculation in release mode, +// which leads to non-optimally balanced R-Tree (but doesn't fit correctness of R-Tree work). +// By using i256 as the type, even though all the actual values would fit in i128, we can be +// sure that multiplication doesn't overflow. +// + +#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)] +struct IntKey(i256); + +impl Copy for IntKey {} + +impl IntKey { + fn from(i: i128) -> Self { + IntKey(i256::from(i)) + } +} + +impl Bounded for IntKey { + fn min_value() -> Self { + IntKey(i256::MIN) + } + fn max_value() -> Self { + IntKey(i256::MAX) + } +} + +impl Signed for IntKey { + fn is_positive(&self) -> bool { + self.0 > i256::ZERO + } + fn is_negative(&self) -> bool { + self.0 < i256::ZERO + } + fn signum(&self) -> Self { + match self.0.cmp(&i256::ZERO) { + Ordering::Greater => IntKey(i256::ONE), + Ordering::Less => IntKey(-i256::ONE), + Ordering::Equal => IntKey(i256::ZERO), + } + } + fn abs(&self) -> Self { + IntKey(self.0.abs()) + } + fn abs_sub(&self, other: &Self) -> Self { + if self.0 <= other.0 { + IntKey(i256::ZERO) + } else { + IntKey(self.0 - other.0) + } + } +} + +impl Neg for IntKey { + type Output = Self; + fn neg(self) -> Self::Output { + IntKey(-self.0) + } +} + +impl Rem for IntKey { + type Output = Self; + fn rem(self, rhs: Self) -> Self::Output { + IntKey(self.0 % rhs.0) + } +} + +impl Div for IntKey { + type Output = Self; + fn div(self, rhs: Self) -> Self::Output { + IntKey(self.0 / rhs.0) + } +} + +impl Add for IntKey { + type Output = Self; + fn add(self, rhs: Self) -> Self::Output { + IntKey(self.0 + rhs.0) + } +} + +impl Sub for IntKey { + type Output = Self; + fn sub(self, rhs: Self) -> Self::Output { + IntKey(self.0 - rhs.0) + } +} + +impl Mul for IntKey { + type Output = Self; + fn mul(self, rhs: Self) -> Self::Output { + IntKey(self.0 * rhs.0) + } +} + +impl One for IntKey { + fn one() -> Self { + IntKey(i256::ONE) + } +} + +impl Zero for IntKey { + fn zero() -> Self { + IntKey(i256::ZERO) + } + fn is_zero(&self) -> bool { + self.0 == i256::ZERO + } +} + +impl Num for IntKey { + type FromStrRadixErr = ::FromStrRadixErr; + fn from_str_radix(str: &str, radix: u32) -> Result { + Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?))) + } +} + +impl PartialEq for LayerRTreeObject { + fn eq(&self, other: &Self) -> bool { + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. 
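The comment above is the crux of the IntKey wrapper: rstar multiplies side lengths when it balances the tree, keys need up to 124 bits once packed and LSNs up to 64, and that product does not fit in a signed 128-bit integer, while a 256-bit integer has room to spare. A minimal standalone sketch of the failure mode, using only the standard library and made-up extents:

    // i128 cannot hold the area of a rectangle whose key side spans most of
    // the 124-bit key space: checked_mul reports the overflow instead of
    // panicking the way unchecked arithmetic would with overflow checks on.
    fn main() {
        let key_extent: i128 = 1_i128 << 124; // widest possible key side
        let lsn_extent: i128 = 1_i128 << 40;  // an illustrative LSN side
        assert_eq!(key_extent.checked_mul(lsn_extent), None);
        // 2^124 * 2^40 = 2^164, far below the 2^255 limit of a signed
        // 256-bit integer, hence IntKey wrapping amplify_num::i256.
        println!("area overflows i128, fits easily in i256");
    }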
+ #[allow(clippy::vtable_address_comparisons)] + Arc::ptr_eq(&self.layer, &other.layer) + } +} + +impl RTreeObject for LayerRTreeObject { + type Envelope = AABB<[IntKey; 2]>; + fn envelope(&self) -> Self::Envelope { + let key_range = self.layer.get_key_range(); + let lsn_range = self.layer.get_lsn_range(); + AABB::from_corners( + [ + IntKey::from(key_range.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(key_range.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive + ) + } } /// Return value of LayerMap::search @@ -80,19 +235,24 @@ impl LayerMap { // Find the latest image layer that covers the given key let mut latest_img: Option> = None; let mut latest_img_lsn: Option = None; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [IntKey::from(key.to_i128()), IntKey::from(0i128)], + [ + IntKey::from(key.to_i128()), + IntKey::from(end_lsn.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } + assert!(l.get_key_range().contains(&key)); let img_lsn = l.get_lsn_range().start; - - if img_lsn >= end_lsn { - // too new - continue; - } + assert!(img_lsn < end_lsn); if Lsn(img_lsn.0 + 1) == end_lsn { // found exact match return Ok(Some(SearchResult { @@ -108,19 +268,24 @@ impl LayerMap { // Search the delta layers let mut latest_delta: Option> = None; - for l in self.historic_layers.iter() { + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if !l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } - + assert!(l.get_key_range().contains(&key)); if l.get_lsn_range().start >= end_lsn { - // too new - continue; + info!( + "Candidate delta layer {}..{} is too new for lsn {}", + l.get_lsn_range().start, + l.get_lsn_range().end, + end_lsn + ); } - + assert!(l.get_lsn_range().start < end_lsn); if l.get_lsn_range().end >= end_lsn { // this layer contains the requested point in the key/lsn space. // No need to search any further @@ -170,7 +335,10 @@ impl LayerMap { /// Insert an on-disk layer /// pub fn insert_historic(&mut self, layer: Arc) { - self.historic_layers.push(layer); + if layer.get_key_range() == (Key::MIN..Key::MAX) { + self.l0_delta_layers.push(layer.clone()); + } + self.historic_layers.insert(LayerRTreeObject { layer }); NUM_ONDISK_LAYERS.inc(); } @@ -180,17 +348,22 @@ impl LayerMap { /// This should be called when the corresponding file on disk has been deleted. /// pub fn remove_historic(&mut self, layer: Arc) { - let len_before = self.historic_layers.len(); + if layer.get_key_range() == (Key::MIN..Key::MAX) { + let len_before = self.l0_delta_layers.len(); - // FIXME: ptr_eq might fail to return true for 'dyn' - // references. Clippy complains about this. In practice it - // seems to work, the assertion below would be triggered - // otherwise but this ought to be fixed. - #[allow(clippy::vtable_address_comparisons)] - self.historic_layers - .retain(|other| !Arc::ptr_eq(other, &layer)); - - assert_eq!(self.historic_layers.len(), len_before - 1); + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. 
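One detail worth calling out in envelope() and in the search query above: layer key and LSN ranges are half-open (the end is exclusive), while rstar's AABB corners are inclusive on both sides, which is what the subtraction of 1 on every upper corner compensates for. A small sketch of that conversion on plain integers, with invented ranges:

    // Converting a half-open [start, end) range to the inclusive corner pair
    // rstar expects: end - 1 is the largest value still inside the range, so
    // adjacent layers touch without overlapping.
    fn to_inclusive(range: std::ops::Range<i64>) -> (i64, i64) {
        (range.start, range.end - 1)
    }

    fn main() {
        let layer_a = to_inclusive(10..20); // keys 10..=19
        let layer_b = to_inclusive(20..30); // keys 20..=29
        assert_eq!(layer_a, (10, 19));
        assert_eq!(layer_b, (20, 29));
        // A lookup at key 20 falls inside layer_b only; without the -1,
        // layer_a's envelope would also claim key 20.
        let probe = 20;
        assert!(!(layer_a.0 <= probe && probe <= layer_a.1));
        assert!(layer_b.0 <= probe && probe <= layer_b.1);
    }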
+ #[allow(clippy::vtable_address_comparisons)] + self.l0_delta_layers + .retain(|other| !Arc::ptr_eq(other, &layer)); + assert_eq!(self.l0_delta_layers.len(), len_before - 1); + } + assert!(self + .historic_layers + .remove(&LayerRTreeObject { layer }) + .is_some()); NUM_ONDISK_LAYERS.dec(); } @@ -207,15 +380,26 @@ impl LayerMap { loop { let mut made_progress = false; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [ + IntKey::from(range_remain.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(range_remain.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } let img_lsn = l.get_lsn_range().start; - if !l.is_incremental() - && l.get_key_range().contains(&range_remain.start) - && lsn_range.contains(&img_lsn) - { + if l.get_key_range().contains(&range_remain.start) && lsn_range.contains(&img_lsn) { made_progress = true; let img_key_end = l.get_key_range().end; @@ -232,8 +416,8 @@ impl LayerMap { } } - pub fn iter_historic_layers(&self) -> impl Iterator> { - self.historic_layers.iter() + pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { + self.historic_layers.iter().map(|e| e.layer.clone()) } /// Find the last image layer that covers 'key', ignoring any image layers @@ -241,19 +425,22 @@ impl LayerMap { fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { let mut candidate_lsn = Lsn(0); let mut candidate = None; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [IntKey::from(key.to_i128()), IntKey::from(0)], + [IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } - + assert!(l.get_key_range().contains(&key)); let this_lsn = l.get_lsn_range().start; - if this_lsn > lsn { - continue; - } + assert!(this_lsn <= lsn); if this_lsn < candidate_lsn { // our previous candidate was better continue; @@ -279,10 +466,19 @@ impl LayerMap { lsn: Lsn, ) -> Result, Option>)>> { let mut points = vec![key_range.start]; - for l in self.historic_layers.iter() { - if l.get_lsn_range().start > lsn { - continue; - } + let envelope = AABB::from_corners( + [IntKey::from(key_range.start.to_i128()), IntKey::from(0)], + [ + IntKey::from(key_range.end.to_i128()), + IntKey::from(lsn.0 as i128), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; + assert!(l.get_lsn_range().start <= lsn); let range = l.get_key_range(); if key_range.contains(&range.start) { points.push(l.get_key_range().start); @@ -315,16 +511,29 @@ impl LayerMap { /// given key and LSN range. 
pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { let mut result = 0; - for l in self.historic_layers.iter() { + if lsn_range.start >= lsn_range.end { + return Ok(0); + } + let envelope = AABB::from_corners( + [ + IntKey::from(key_range.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(key_range.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if !l.is_incremental() { continue; } - if !range_overlaps(&l.get_lsn_range(), lsn_range) { - continue; - } - if !range_overlaps(&l.get_key_range(), key_range) { - continue; - } + assert!(range_overlaps(&l.get_lsn_range(), lsn_range)); + assert!(range_overlaps(&l.get_key_range(), key_range)); // We ignore level0 delta layers. Unless the whole keyspace fits // into one partition @@ -341,17 +550,7 @@ impl LayerMap { /// Return all L0 delta layers pub fn get_level0_deltas(&self) -> Result>> { - let mut deltas = Vec::new(); - for l in self.historic_layers.iter() { - if !l.is_incremental() { - continue; - } - if l.get_key_range() != (Key::MIN..Key::MAX) { - continue; - } - deltas.push(Arc::clone(l)); - } - Ok(deltas) + Ok(self.l0_delta_layers.clone()) } /// debugging function to print out the contents of the layer map @@ -370,8 +569,8 @@ impl LayerMap { } println!("historic_layers:"); - for layer in self.historic_layers.iter() { - layer.dump(verbose)?; + for e in self.historic_layers.iter() { + e.layer.dump(verbose)?; } println!("End dump LayerMap"); Ok(()) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b80d023c7f..6de1d44876 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2050,7 +2050,7 @@ impl Timeline { l.filename().display(), l.is_incremental(), ); - layers_to_remove.push(Arc::clone(l)); + layers_to_remove.push(Arc::clone(&l)); } // Actually delete the layers from disk and remove them from the map. diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py new file mode 100644 index 0000000000..d71fb6d12c --- /dev/null +++ b/test_runner/performance/test_layer_map.py @@ -0,0 +1,39 @@ +import time + +from fixtures.neon_fixtures import NeonEnvBuilder + + +# +# Benchmark searching the layer map, when there are a lot of small layer files. +# +def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): + + env = neon_env_builder.init_start() + n_iters = 10 + n_records = 100000 + + # We want to have a lot of lot of layer files to exercise the layer map. Make + # gc_horizon and checkpoint_distance very small, so that we get a lot of small layer files. 
+ tenant, _ = env.neon_cli.create_tenant( + conf={ + "gc_period": "100 m", + "gc_horizon": "1048576", + "checkpoint_distance": "8192", + "compaction_period": "1 s", + "compaction_threshold": "1", + "compaction_target_size": "8192", + } + ) + + env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant) + pg = env.postgres.create_start("test_layer_map", tenant_id=tenant) + cur = pg.connect().cursor() + cur.execute("create table t(x integer)") + for i in range(n_iters): + cur.execute(f"insert into t values (generate_series(1,{n_records}))") + time.sleep(1) + + cur.execute("vacuum t") + with zenbenchmark.record_duration("test_query"): + cur.execute("SELECT count(*) from t") + assert cur.fetchone() == (n_iters * n_records,) diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 3670ca5fea..f37a42945e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -30,7 +30,7 @@ memchr = { version = "2", features = ["std", "use_std"] } nom = { version = "7", features = ["alloc", "std"] } num-bigint = { version = "0.4", features = ["std"] } num-integer = { version = "0.1", default-features = false, features = ["i128", "std"] } -num-traits = { version = "0.2", features = ["i128", "std"] } +num-traits = { version = "0.2", features = ["i128", "libm", "std"] } prost = { version = "0.10", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } @@ -38,6 +38,7 @@ regex-automata = { version = "0.1", features = ["regex-syntax", "std"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } +stable_deref_trait = { version = "1", features = ["alloc", "std"] } time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } From e764c1e60fd8e7afaf346bc70f0b9269097e8a1a Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 22 Sep 2022 01:02:53 +0300 Subject: [PATCH 0813/1022] remove self argument from several spans --- pageserver/src/page_service.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 9e159f7391..7de6403b83 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -663,7 +663,7 @@ impl PageServerHandler { Ok(lsn) } - #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_rel_exists_request( &self, timeline: &Timeline, @@ -680,7 +680,7 @@ impl PageServerHandler { })) } - #[instrument(skip(timeline, req), fields(rel = 
%req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_nblocks_request( &self, timeline: &Timeline, @@ -697,7 +697,7 @@ impl PageServerHandler { })) } - #[instrument(skip(timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] async fn handle_db_size_request( &self, timeline: &Timeline, @@ -717,7 +717,7 @@ impl PageServerHandler { })) } - #[instrument(skip(timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, From 86bf4919817d34a2e56590596eb5f8270ce8b79e Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 14 Sep 2022 17:09:28 +0300 Subject: [PATCH 0814/1022] Support pg 15 - Split postgres_ffi into two version specific files. - Preserve pg_version in timeline metadata. - Use pg_version in safekeeper code. Check for postgres major version mismatch. - Clean up the code to use DEFAULT_PG_VERSION constant everywhere, instead of hardcoding. - Parameterize python tests: use DEFAULT_PG_VERSION env and pg_version fixture. To run tests using a specific PostgreSQL version, pass the DEFAULT_PG_VERSION environment variable: 'DEFAULT_PG_VERSION='15' ./scripts/pytest test_runner/regress' Currently don't all tests pass, because rust code relies on the default version of PostgreSQL in a few places. --- control_plane/src/bin/neon_local.rs | 95 +++++++++++-- control_plane/src/compute.rs | 49 +++++-- control_plane/src/local_env.rs | 48 +++++-- control_plane/src/storage.rs | 22 ++- libs/postgres_ffi/src/lib.rs | 129 +++++++++++++++++- libs/postgres_ffi/src/nonrelfile_utils.rs | 2 +- libs/postgres_ffi/src/pg_constants.rs | 19 +-- libs/postgres_ffi/src/pg_constants_v14.rs | 5 + libs/postgres_ffi/src/pg_constants_v15.rs | 10 ++ libs/postgres_ffi/src/relfile_utils.rs | 25 ++-- libs/postgres_ffi/src/waldecoder.rs | 49 +------ libs/postgres_ffi/src/xlog_utils.rs | 38 +++++- pageserver/src/basebackup.rs | 82 +++++------ pageserver/src/bin/update_metadata.rs | 2 + pageserver/src/config.rs | 45 ++++-- pageserver/src/http/models.rs | 1 + pageserver/src/http/routes.rs | 1 + pageserver/src/import_datadir.rs | 20 +-- pageserver/src/lib.rs | 2 + pageserver/src/page_service.rs | 31 ++++- pageserver/src/pgdatadir_mapping.rs | 10 +- pageserver/src/reltag.rs | 6 +- pageserver/src/storage_sync.rs | 12 +- pageserver/src/storage_sync/index.rs | 23 +++- pageserver/src/tenant.rs | 49 ++++--- pageserver/src/tenant/metadata.rs | 9 ++ pageserver/src/tenant/timeline.rs | 17 ++- pageserver/src/walingest.rs | 83 +++++++---- .../src/walreceiver/connection_manager.rs | 2 +- .../src/walreceiver/walreceiver_connection.rs | 4 +- pageserver/src/walrecord.rs | 38 ++++-- pageserver/src/walredo.rs | 30 ++-- safekeeper/src/json_ctrl.rs | 11 +- safekeeper/src/safekeeper.rs | 19 ++- safekeeper/src/send_wal.rs | 2 +- safekeeper/src/wal_backup.rs | 3 +- safekeeper/src/wal_storage.rs | 10 +- test_runner/fixtures/neon_fixtures.py | 30 +++- test_runner/regress/test_import.py | 5 + test_runner/regress/test_pg_regress.py | 18 ++- test_runner/regress/test_wal_acceptor.py | 9 +- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 43 files changed, 777 insertions(+), 292 deletions(-) create mode 100644 libs/postgres_ffi/src/pg_constants_v14.rs create mode 100644 
libs/postgres_ffi/src/pg_constants_v15.rs diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index e16fd8764a..92782ea235 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -39,6 +39,8 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); +const DEFAULT_PG_VERSION: &str = "14"; + fn default_conf(etcd_binary_path: &Path) -> String { format!( r#" @@ -105,6 +107,13 @@ fn main() -> Result<()> { .takes_value(true) .required(false); + let pg_version_arg = Arg::new("pg-version") + .long("pg-version") + .help("Postgres version to use for the initial tenant") + .required(false) + .takes_value(true) + .default_value(DEFAULT_PG_VERSION); + let port_arg = Arg::new("port") .long("port") .required(false) @@ -146,6 +155,7 @@ fn main() -> Result<()> { .required(false) .value_name("config"), ) + .arg(pg_version_arg.clone()) ) .subcommand( App::new("timeline") @@ -164,7 +174,9 @@ fn main() -> Result<()> { .subcommand(App::new("create") .about("Create a new blank timeline") .arg(tenant_id_arg.clone()) - .arg(branch_name_arg.clone())) + .arg(branch_name_arg.clone()) + .arg(pg_version_arg.clone()) + ) .subcommand(App::new("import") .about("Import timeline from basebackup directory") .arg(tenant_id_arg.clone()) @@ -178,7 +190,9 @@ fn main() -> Result<()> { .arg(Arg::new("wal-tarfile").long("wal-tarfile").takes_value(true) .help("Wal to add after base")) .arg(Arg::new("end-lsn").long("end-lsn").takes_value(true) - .help("Lsn the basebackup ends at"))) + .help("Lsn the basebackup ends at")) + .arg(pg_version_arg.clone()) + ) ).subcommand( App::new("tenant") .setting(AppSettings::ArgRequiredElseHelp) @@ -188,6 +202,7 @@ fn main() -> Result<()> { .arg(tenant_id_arg.clone()) .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) + .arg(pg_version_arg.clone()) ) .subcommand(App::new("config") .arg(tenant_id_arg.clone()) @@ -239,8 +254,9 @@ fn main() -> Result<()> { Arg::new("config-only") .help("Don't do basebackup, create compute node with only config files") .long("config-only") - .required(false) - )) + .required(false)) + .arg(pg_version_arg.clone()) + ) .subcommand(App::new("start") .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") .arg(pg_node_arg.clone()) @@ -248,7 +264,9 @@ fn main() -> Result<()> { .arg(branch_name_arg.clone()) .arg(timeline_id_arg.clone()) .arg(lsn_arg.clone()) - .arg(port_arg.clone())) + .arg(port_arg.clone()) + .arg(pg_version_arg.clone()) + ) .subcommand( App::new("stop") .arg(pg_node_arg.clone()) @@ -501,9 +519,16 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { default_conf(&EtcdBroker::locate_etcd()?) 
}; + let pg_version = init_match + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + let mut env = LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; - env.init().context("Failed to initialize neon repository")?; + env.init(pg_version) + .context("Failed to initialize neon repository")?; let initial_tenant_id = env .default_tenant_id .expect("default_tenant_id should be generated by the `env.init()` call above"); @@ -515,6 +540,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { Some(initial_tenant_id), initial_timeline_id_arg, &pageserver_config_overrides(init_match), + pg_version, ) .unwrap_or_else(|e| { eprintln!("pageserver init failed: {e}"); @@ -557,8 +583,19 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an // Create an initial timeline for the new tenant let new_timeline_id = parse_timeline_id(create_match)?; - let timeline_info = - pageserver.timeline_create(new_tenant_id, new_timeline_id, None, None)?; + let pg_version = create_match + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + + let timeline_info = pageserver.timeline_create( + new_tenant_id, + new_timeline_id, + None, + None, + Some(pg_version), + )?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info .local @@ -607,7 +644,15 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let new_branch_name = create_match .value_of("branch-name") .ok_or_else(|| anyhow!("No branch name provided"))?; - let timeline_info = pageserver.timeline_create(tenant_id, None, None, None)?; + + let pg_version = create_match + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + + let timeline_info = + pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info @@ -655,7 +700,14 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal)?; println!("Creating node for imported timeline ..."); env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; - cplane.new_node(tenant_id, name, timeline_id, None, None)?; + + let pg_version = import_match + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + + cplane.new_node(tenant_id, name, timeline_id, None, None, pg_version)?; println!("Done"); } Some(("branch", branch_match)) => { @@ -682,6 +734,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - None, start_lsn, Some(ancestor_timeline_id), + None, )?; let new_timeline_id = timeline_info.timeline_id; @@ -797,7 +850,14 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { Some(p) => Some(p.parse()?), None => None, }; - cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port)?; + + let pg_version = sub_args + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + + cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?; } "start" => { let port: Option = match sub_args.value_of("port") { @@ -835,16 +895,23 @@ fn handle_pg(pg_match: &ArgMatches, env: 
&local_env::LocalEnv) -> Result<()> { .map(Lsn::from_str) .transpose() .context("Failed to parse Lsn from the request")?; + let pg_version = sub_args + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; // when used with custom port this results in non obvious behaviour // port is remembered from first start command, i e // start --port X // stop // start <-- will also use port X even without explicit port argument println!( - "Starting new postgres {} on timeline {} ...", - node_name, timeline_id + "Starting new postgres (v{}) {} on timeline {} ...", + pg_version, node_name, timeline_id ); - let node = cplane.new_node(tenant_id, node_name, timeline_id, lsn, port)?; + + let node = + cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?; node.start(&auth_token)?; } } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index b678d620df..89994c5647 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -18,7 +18,7 @@ use utils::{ postgres_backend::AuthType, }; -use crate::local_env::LocalEnv; +use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION}; use crate::postgresql_conf::PostgresConf; use crate::storage::PageServerNode; @@ -81,6 +81,7 @@ impl ComputeControlPlane { timeline_id: TimelineId, lsn: Option, port: Option, + pg_version: u32, ) -> Result> { let port = port.unwrap_or_else(|| self.get_port()); let node = Arc::new(PostgresNode { @@ -93,6 +94,7 @@ impl ComputeControlPlane { lsn, tenant_id, uses_wal_proposer: false, + pg_version, }); node.create_pgdata()?; @@ -118,6 +120,7 @@ pub struct PostgresNode { pub lsn: Option, // if it's a read-only node. None for primary pub tenant_id: TenantId, uses_wal_proposer: bool, + pg_version: u32, } impl PostgresNode { @@ -152,6 +155,14 @@ impl PostgresNode { let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; let uses_wal_proposer = conf.get("neon.safekeepers").is_some(); + // Read postgres version from PG_VERSION file to determine which postgres version binary to use. + // If it doesn't exist, assume broken data directory and use default pg version. 
+ let pg_version_path = entry.path().join("PG_VERSION"); + + let pg_version_str = + fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string()); + let pg_version = u32::from_str(&pg_version_str)?; + // parse recovery_target_lsn, if any let recovery_target_lsn: Option = conf.parse_field_optional("recovery_target_lsn", &context)?; @@ -167,17 +178,24 @@ impl PostgresNode { lsn: recovery_target_lsn, tenant_id, uses_wal_proposer, + pg_version, }) } - fn sync_safekeepers(&self, auth_token: &Option) -> Result { - let pg_path = self.env.pg_bin_dir().join("postgres"); + fn sync_safekeepers(&self, auth_token: &Option, pg_version: u32) -> Result { + let pg_path = self.env.pg_bin_dir(pg_version).join("postgres"); let mut cmd = Command::new(&pg_path); cmd.arg("--sync-safekeepers") .env_clear() - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) + .env( + "LD_LIBRARY_PATH", + self.env.pg_lib_dir(pg_version).to_str().unwrap(), + ) + .env( + "DYLD_LIBRARY_PATH", + self.env.pg_lib_dir(pg_version).to_str().unwrap(), + ) .env("PGDATA", self.pgdata().to_str().unwrap()) .stdout(Stdio::piped()) // Comment this to avoid capturing stderr (useful if command hangs) @@ -259,8 +277,8 @@ impl PostgresNode { }) } - // Connect to a page server, get base backup, and untar it to initialize a - // new data directory + // Write postgresql.conf with default configuration + // and PG_VERSION file to the data directory of a new node. fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> { let mut conf = PostgresConf::new(); conf.append("max_wal_senders", "10"); @@ -357,6 +375,9 @@ impl PostgresNode { let mut file = File::create(self.pgdata().join("postgresql.conf"))?; file.write_all(conf.to_string().as_bytes())?; + let mut file = File::create(self.pgdata().join("PG_VERSION"))?; + file.write_all(self.pg_version.to_string().as_bytes())?; + Ok(()) } @@ -368,7 +389,7 @@ impl PostgresNode { // latest data from the pageserver. That is a bit clumsy but whole bootstrap // procedure evolves quite actively right now, so let's think about it again // when things would be more stable (TODO). 
- let lsn = self.sync_safekeepers(auth_token)?; + let lsn = self.sync_safekeepers(auth_token, self.pg_version)?; if lsn == Lsn(0) { None } else { @@ -401,7 +422,7 @@ impl PostgresNode { } fn pg_ctl(&self, args: &[&str], auth_token: &Option) -> Result<()> { - let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl"); + let pg_ctl_path = self.env.pg_bin_dir(self.pg_version).join("pg_ctl"); let mut cmd = Command::new(pg_ctl_path); cmd.args( [ @@ -417,8 +438,14 @@ impl PostgresNode { .concat(), ) .env_clear() - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()); + .env( + "LD_LIBRARY_PATH", + self.env.pg_lib_dir(self.pg_version).to_str().unwrap(), + ) + .env( + "DYLD_LIBRARY_PATH", + self.env.pg_lib_dir(self.pg_version).to_str().unwrap(), + ); if let Some(token) = auth_token { cmd.env("ZENITH_AUTH_TOKEN", token); } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 7afaad26dc..14bb4cf346 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -20,6 +20,8 @@ use utils::{ use crate::safekeeper::SafekeeperNode; +pub const DEFAULT_PG_VERSION: u32 = 14; + // // This data structures represents neon_local CLI config // @@ -195,12 +197,40 @@ impl Default for SafekeeperConf { } impl LocalEnv { - // postgres installation paths - pub fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("bin") + pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { + let mut path = self.pg_distrib_dir.clone(); + + if pg_version != DEFAULT_PG_VERSION { + // step up to the parent directory + // We assume that the pg_distrib subdirs + // for different pg versions + // are located in the same directory + // and follow the naming convention: v14, v15, etc. + path.pop(); + + match pg_version { + 14 => return path.join(format!("v{pg_version}")), + 15 => return path.join(format!("v{pg_version}")), + _ => panic!("Unsupported postgres version: {}", pg_version), + }; + } + + path } - pub fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("lib") + + pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("bin"), + 15 => self.pg_distrib_dir(pg_version).join("bin"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } + } + pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("lib"), + 15 => self.pg_distrib_dir(pg_version).join("lib"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } } pub fn pageserver_bin(&self) -> anyhow::Result { @@ -290,6 +320,8 @@ impl LocalEnv { // Find postgres binaries. // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install/v14". + // Note that later in the code we assume, that distrib dirs follow the same pattern + // for all postgres versions. if env.pg_distrib_dir == Path::new("") { if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { env.pg_distrib_dir = postgres_bin.into(); @@ -384,7 +416,7 @@ impl LocalEnv { // // Initialize a new Neon repository // - pub fn init(&mut self) -> anyhow::Result<()> { + pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> { // check if config already exists let base_path = &self.base_data_dir; ensure!( @@ -397,10 +429,10 @@ impl LocalEnv { "directory '{}' already exists. 
Perhaps already initialized?", base_path.display() ); - if !self.pg_distrib_dir.join("bin/postgres").exists() { + if !self.pg_bin_dir(pg_version).join("postgres").exists() { bail!( "Can't find postgres binary at {}", - self.pg_distrib_dir.display() + self.pg_bin_dir(pg_version).display() ); } for binary in ["pageserver", "safekeeper"] { diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 3bbbdc5865..95ade14fbf 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -112,11 +112,15 @@ impl PageServerNode { create_tenant: Option, initial_timeline_id: Option, config_overrides: &[&str], + pg_version: u32, ) -> anyhow::Result { let id = format!("id={}", self.env.pageserver.id); // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. - let pg_distrib_dir_param = - format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display()); + let pg_distrib_dir_param = format!( + "pg_distrib_dir='{}'", + self.env.pg_distrib_dir(pg_version).display() + ); + let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); let listen_http_addr_param = format!( "listen_http_addr='{}'", @@ -159,7 +163,7 @@ impl PageServerNode { self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?; let init_result = self - .try_init_timeline(create_tenant, initial_timeline_id) + .try_init_timeline(create_tenant, initial_timeline_id, pg_version) .context("Failed to create initial tenant and timeline for pageserver"); match &init_result { Ok(initial_timeline_id) => { @@ -175,10 +179,16 @@ impl PageServerNode { &self, new_tenant_id: Option, new_timeline_id: Option, + pg_version: u32, ) -> anyhow::Result { let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?; - let initial_timeline_info = - self.timeline_create(initial_tenant_id, new_timeline_id, None, None)?; + let initial_timeline_info = self.timeline_create( + initial_tenant_id, + new_timeline_id, + None, + None, + Some(pg_version), + )?; Ok(initial_timeline_info.timeline_id) } @@ -497,6 +507,7 @@ impl PageServerNode { new_timeline_id: Option, ancestor_start_lsn: Option, ancestor_timeline_id: Option, + pg_version: Option, ) -> anyhow::Result { self.http_request( Method::POST, @@ -506,6 +517,7 @@ impl PageServerNode { new_timeline_id, ancestor_start_lsn, ancestor_timeline_id, + pg_version, }) .send()? .error_from_body()? diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index f43232ed0c..25e1f6029c 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -7,6 +7,8 @@ // https://github.com/rust-lang/rust-bindgen/issues/1651 #![allow(deref_nullptr)] +use bytes::Bytes; +use utils::bin_ser::SerializeError; use utils::lsn::Lsn; macro_rules! postgres_ffi { @@ -24,11 +26,11 @@ macro_rules! postgres_ffi { stringify!($version), ".rs" )); + + include!(concat!("pg_constants_", stringify!($version), ".rs")); } pub mod controlfile_utils; pub mod nonrelfile_utils; - pub mod pg_constants; - pub mod relfile_utils; pub mod waldecoder; pub mod xlog_utils; @@ -44,6 +46,9 @@ macro_rules! 
postgres_ffi { postgres_ffi!(v14); postgres_ffi!(v15); +pub mod pg_constants; +pub mod relfile_utils; + // Export some widely used datatypes that are unlikely to change across Postgres versions pub use v14::bindings::{uint32, uint64, Oid}; pub use v14::bindings::{BlockNumber, OffsetNumber}; @@ -52,8 +57,11 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; +pub use v14::bindings::{PageHeaderData, XLogRecord}; pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +pub use v14::bindings::{CheckPoint, ControlFileData}; + // from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and // --with-segsize=SEGSIZE, but assume the defaults for now. pub const BLCKSZ: u16 = 8192; @@ -63,6 +71,50 @@ pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; +// Export some version independent functions that are used outside of this mod +pub use v14::xlog_utils::encode_logical_message; +pub use v14::xlog_utils::get_current_timestamp; +pub use v14::xlog_utils::to_pg_timestamp; +pub use v14::xlog_utils::XLogFileName; + +pub use v14::bindings::DBState_DB_SHUTDOWNED; + +pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool { + if version == 14 { + bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0 + } else { + assert_eq!(version, 15); + bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0 + || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0 + || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0 + } +} + +pub fn generate_wal_segment( + segno: u64, + system_id: u64, + pg_version: u32, +) -> Result { + match pg_version { + 14 => v14::xlog_utils::generate_wal_segment(segno, system_id), + 15 => v15::xlog_utils::generate_wal_segment(segno, system_id), + _ => Err(SerializeError::BadInput), + } +} + +pub fn generate_pg_control( + pg_control_bytes: &[u8], + checkpoint_bytes: &[u8], + lsn: Lsn, + pg_version: u32, +) -> anyhow::Result<(Bytes, u64)> { + match pg_version { + 14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), + 15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), + _ => anyhow::bail!("Unknown version {}", pg_version), + } +} + // PG timeline is always 1, changing it doesn't have any useful meaning in Neon. // // NOTE: this is not to be confused with Neon timelines; different concept! 
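
postgres_ffi now fronts the per-version modules with small dispatch helpers, so callers pass a pg_version instead of naming v14::/v15:: directly. A sketch of their intended use; the flag values come from the pg_constants_v14.rs/pg_constants_v15.rs files added later in this patch, and the segment number and system id below are placeholders:

    use postgres_ffi::{bkpimage_is_compressed, generate_wal_segment, WAL_SEGMENT_SIZE};

    fn example() {
        // v14 has a single "compressed" bit (0x02); v15 has one bit per codec,
        // so the check needs to know which version wrote the record.
        assert!(bkpimage_is_compressed(0x02, 14)); // BKPIMAGE_IS_COMPRESSED
        assert!(bkpimage_is_compressed(0x08, 15)); // BKPIMAGE_COMPRESS_LZ4
        assert!(!bkpimage_is_compressed(0x02, 15)); // 0x02 is BKPIMAGE_APPLY in v15

        // The WAL segment generator dispatches on pg_version for the caller.
        let seg = generate_wal_segment(1, 0x1234_5678_9abc_def0, 14)
            .expect("failed to generate segment");
        assert_eq!(seg.len(), WAL_SEGMENT_SIZE);
    }
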
@@ -74,7 +126,7 @@ pub const PG_TLI: u32 = 1; // See TransactionIdIsNormal in transam.h pub const fn transaction_id_is_normal(id: TransactionId) -> bool { - id > v14::pg_constants::FIRST_NORMAL_TRANSACTION_ID + id > pg_constants::FIRST_NORMAL_TRANSACTION_ID } // See TransactionIdPrecedes in transam.c @@ -109,3 +161,74 @@ pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) { pg[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); } + +pub mod waldecoder { + + use crate::{v14, v15}; + use bytes::{Buf, Bytes, BytesMut}; + use std::num::NonZeroU32; + use thiserror::Error; + use utils::lsn::Lsn; + + pub enum State { + WaitingForRecord, + ReassemblingRecord { + recordbuf: BytesMut, + contlen: NonZeroU32, + }, + SkippingEverything { + skip_until_lsn: Lsn, + }, + } + + pub struct WalStreamDecoder { + pub lsn: Lsn, + pub pg_version: u32, + pub inputbuf: BytesMut, + pub state: State, + } + + #[derive(Error, Debug, Clone)] + #[error("{msg} at {lsn}")] + pub struct WalDecodeError { + pub msg: String, + pub lsn: Lsn, + } + + impl WalStreamDecoder { + pub fn new(lsn: Lsn, pg_version: u32) -> WalStreamDecoder { + WalStreamDecoder { + lsn, + pg_version, + inputbuf: BytesMut::new(), + state: State::WaitingForRecord, + } + } + + // The latest LSN position fed to the decoder. + pub fn available(&self) -> Lsn { + self.lsn + self.inputbuf.remaining() as u64 + } + + pub fn feed_bytes(&mut self, buf: &[u8]) { + self.inputbuf.extend_from_slice(buf); + } + + pub fn poll_decode(&mut self) -> Result, WalDecodeError> { + match self.pg_version { + 14 => { + use self::v14::waldecoder::WalStreamDecoderHandler; + self.poll_decode_internal() + } + 15 => { + use self::v15::waldecoder::WalStreamDecoderHandler; + self.poll_decode_internal() + } + _ => Err(WalDecodeError { + msg: format!("Unknown version {}", self.pg_version), + lsn: self.lsn, + }), + } + } + } +} diff --git a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs index 1de1d367e0..01e5554b8a 100644 --- a/libs/postgres_ffi/src/nonrelfile_utils.rs +++ b/libs/postgres_ffi/src/nonrelfile_utils.rs @@ -1,7 +1,7 @@ //! //! Common utilities for dealing with PostgreSQL non-relation files. //! -use super::pg_constants; +use crate::pg_constants; use crate::transaction_id_precedes; use bytes::BytesMut; use log::*; diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 8cc9fa7af6..6aaa739a69 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -1,14 +1,16 @@ //! //! Misc constants, copied from PostgreSQL headers. //! +//! Only place version-independent constants here. +//! //! TODO: These probably should be auto-generated using bindgen, //! rather than copied by hand. Although on the other hand, it's nice //! to have them all here in one place, and have the ability to add //! comments on them. //! 
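
The decoder state now lives in one version-agnostic struct at the crate root, and poll_decode picks the per-version handler at runtime. A sketch of a caller, assuming only the bytes and utils dependencies postgres_ffi already uses:

    use bytes::Bytes;
    use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder};
    use utils::lsn::Lsn;

    // Feed a chunk of WAL into the shared decoder and collect complete records.
    // poll_decode() selects the v14 or v15 WalStreamDecoderHandler implementation
    // based on the pg_version passed to `new`.
    fn decode_chunk(
        start: Lsn,
        pg_version: u32,
        buf: &[u8],
    ) -> Result<Vec<(Lsn, Bytes)>, WalDecodeError> {
        let mut decoder = WalStreamDecoder::new(start, pg_version);
        decoder.feed_bytes(buf);
        let mut records = Vec::new();
        while let Some((end_lsn, rec)) = decoder.poll_decode()? {
            records.push((end_lsn, rec));
        }
        Ok(records)
    }
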
-use super::bindings::{PageHeaderData, XLogRecord}; use crate::BLCKSZ; +use crate::{PageHeaderData, XLogRecord}; // // From pg_tablespace_d.h @@ -16,14 +18,6 @@ use crate::BLCKSZ; pub const DEFAULTTABLESPACE_OID: u32 = 1663; pub const GLOBALTABLESPACE_OID: u32 = 1664; -// -// Fork numbers, from relpath.h -// -pub const MAIN_FORKNUM: u8 = 0; -pub const FSM_FORKNUM: u8 = 1; -pub const VISIBILITYMAP_FORKNUM: u8 = 2; -pub const INIT_FORKNUM: u8 = 3; - // From storage_xlog.h pub const XLOG_SMGR_CREATE: u8 = 0x10; pub const XLOG_SMGR_TRUNCATE: u8 = 0x20; @@ -114,7 +108,6 @@ pub const XLOG_NEXTOID: u8 = 0x30; pub const XLOG_SWITCH: u8 = 0x40; pub const XLOG_FPI_FOR_HINT: u8 = 0xA0; pub const XLOG_FPI: u8 = 0xB0; -pub const DB_SHUTDOWNED: u32 = 1; // From multixact.h pub const FIRST_MULTIXACT_ID: u32 = 1; @@ -169,10 +162,6 @@ pub const RM_HEAP_ID: u8 = 10; pub const XLR_INFO_MASK: u8 = 0x0F; pub const XLR_RMGR_INFO_MASK: u8 = 0xF0; -// from dbcommands_xlog.h -pub const XLOG_DBASE_CREATE: u8 = 0x00; -pub const XLOG_DBASE_DROP: u8 = 0x10; - pub const XLOG_TBLSPC_CREATE: u8 = 0x00; pub const XLOG_TBLSPC_DROP: u8 = 0x10; @@ -197,8 +186,6 @@ pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous /* Information stored in bimg_info */ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */ -pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ -pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ /* From transam.h */ pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3; diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs new file mode 100644 index 0000000000..810898ee80 --- /dev/null +++ b/libs/postgres_ffi/src/pg_constants_v14.rs @@ -0,0 +1,5 @@ +pub const XLOG_DBASE_CREATE: u8 = 0x00; +pub const XLOG_DBASE_DROP: u8 = 0x10; + +pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ +pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs new file mode 100644 index 0000000000..6fa5eb008c --- /dev/null +++ b/libs/postgres_ffi/src/pg_constants_v15.rs @@ -0,0 +1,10 @@ +pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; + +pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; +pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x00; +pub const XLOG_DBASE_DROP: u8 = 0x20; + +pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */ +pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ diff --git a/libs/postgres_ffi/src/relfile_utils.rs b/libs/postgres_ffi/src/relfile_utils.rs index f3476acc9c..1dc9f367ff 100644 --- a/libs/postgres_ffi/src/relfile_utils.rs +++ b/libs/postgres_ffi/src/relfile_utils.rs @@ -1,10 +1,17 @@ //! //! Common utilities for dealing with PostgreSQL relation files. //! 
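
Splitting these constants per version matters because the same info code or flag bit means different things in v14 and v15, so every lookup has to be keyed by the timeline's pg_version. A sketch of how they are consumed, mirroring the walingest and walrecord changes later in this patch (the helper names are illustrative):

    use postgres_ffi::pg_constants::XLR_RMGR_INFO_MASK;

    // True if an RM_DBASE_ID record is a database drop: the code is 0x10 in v14
    // but 0x20 in v15.
    fn is_dbase_drop(xl_info: u8, pg_version: u32) -> bool {
        let info = xl_info & XLR_RMGR_INFO_MASK;
        match pg_version {
            14 => info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP,
            15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP,
            _ => false,
        }
    }

    // Likewise BKPIMAGE_APPLY moved from 0x04 (v14) to 0x02 (v15).
    fn apply_image(bimg_info: u8, pg_version: u32) -> bool {
        match pg_version {
            14 => bimg_info & postgres_ffi::v14::bindings::BKPIMAGE_APPLY != 0,
            15 => bimg_info & postgres_ffi::v15::bindings::BKPIMAGE_APPLY != 0,
            _ => false,
        }
    }
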
-use super::pg_constants; use once_cell::sync::OnceCell; use regex::Regex; +// +// Fork numbers, from relpath.h +// +pub const MAIN_FORKNUM: u8 = 0; +pub const FSM_FORKNUM: u8 = 1; +pub const VISIBILITYMAP_FORKNUM: u8 = 2; +pub const INIT_FORKNUM: u8 = 3; + #[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] pub enum FilePathError { #[error("invalid relation fork name")] @@ -23,10 +30,10 @@ impl From for FilePathError { pub fn forkname_to_number(forkname: Option<&str>) -> Result { match forkname { // "main" is not in filenames, it's implicit if the fork name is not present - None => Ok(pg_constants::MAIN_FORKNUM), - Some("fsm") => Ok(pg_constants::FSM_FORKNUM), - Some("vm") => Ok(pg_constants::VISIBILITYMAP_FORKNUM), - Some("init") => Ok(pg_constants::INIT_FORKNUM), + None => Ok(MAIN_FORKNUM), + Some("fsm") => Ok(FSM_FORKNUM), + Some("vm") => Ok(VISIBILITYMAP_FORKNUM), + Some("init") => Ok(INIT_FORKNUM), Some(_) => Err(FilePathError::InvalidForkName), } } @@ -34,10 +41,10 @@ pub fn forkname_to_number(forkname: Option<&str>) -> Result { /// Convert Postgres fork number to the right suffix of the relation data file. pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> { match forknum { - pg_constants::MAIN_FORKNUM => None, - pg_constants::FSM_FORKNUM => Some("fsm"), - pg_constants::VISIBILITYMAP_FORKNUM => Some("vm"), - pg_constants::INIT_FORKNUM => Some("init"), + MAIN_FORKNUM => None, + FSM_FORKNUM => Some("fsm"), + VISIBILITYMAP_FORKNUM => Some("vm"), + INIT_FORKNUM => Some("init"), _ => Some("UNKNOWN FORKNUM"), } } diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index 4d79e4b1d1..5b46d52321 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -8,6 +8,7 @@ //! to look deeper into the WAL records to also understand which blocks they modify, the code //! for that is in pageserver/src/walrecord.rs //! +use super::super::waldecoder::{State, WalDecodeError, WalStreamDecoder}; use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; use super::xlog_utils::*; use crate::WAL_SEGMENT_SIZE; @@ -16,55 +17,19 @@ use crc32c::*; use log::*; use std::cmp::min; use std::num::NonZeroU32; -use thiserror::Error; use utils::lsn::Lsn; -enum State { - WaitingForRecord, - ReassemblingRecord { - recordbuf: BytesMut, - contlen: NonZeroU32, - }, - SkippingEverything { - skip_until_lsn: Lsn, - }, -} - -pub struct WalStreamDecoder { - lsn: Lsn, - inputbuf: BytesMut, - state: State, -} - -#[derive(Error, Debug, Clone)] -#[error("{msg} at {lsn}")] -pub struct WalDecodeError { - msg: String, - lsn: Lsn, +pub trait WalStreamDecoderHandler { + fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError>; + fn poll_decode_internal(&mut self) -> Result, WalDecodeError>; + fn complete_record(&mut self, recordbuf: Bytes) -> Result<(Lsn, Bytes), WalDecodeError>; } // // WalRecordStream is a Stream that returns a stream of WAL records // FIXME: This isn't a proper rust stream // -impl WalStreamDecoder { - pub fn new(lsn: Lsn) -> WalStreamDecoder { - WalStreamDecoder { - lsn, - inputbuf: BytesMut::new(), - state: State::WaitingForRecord, - } - } - - // The latest LSN position fed to the decoder. 
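
The fork-number constants and helpers are now self-contained in relfile_utils. A quick usage check:

    use postgres_ffi::relfile_utils::{
        forkname_to_number, forknumber_to_name, FSM_FORKNUM, MAIN_FORKNUM,
    };

    fn example() {
        // "main" has no suffix in relation file names, so it maps from/to None.
        assert_eq!(forkname_to_number(None).unwrap(), MAIN_FORKNUM);
        assert_eq!(forknumber_to_name(MAIN_FORKNUM), None);

        assert_eq!(forkname_to_number(Some("fsm")).unwrap(), FSM_FORKNUM);
        assert_eq!(forknumber_to_name(FSM_FORKNUM), Some("fsm"));
    }
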
- pub fn available(&self) -> Lsn { - self.lsn + self.inputbuf.remaining() as u64 - } - - pub fn feed_bytes(&mut self, buf: &[u8]) { - self.inputbuf.extend_from_slice(buf); - } - +impl WalStreamDecoderHandler for WalStreamDecoder { fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> { let validate_impl = || { if hdr.xlp_magic != XLOG_PAGE_MAGIC as u16 { @@ -125,7 +90,7 @@ impl WalStreamDecoder { /// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function /// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid. /// - pub fn poll_decode(&mut self) -> Result, WalDecodeError> { + fn poll_decode_internal(&mut self) -> Result, WalDecodeError> { // Run state machine that validates page headers, and reassembles records // that cross page boundaries. loop { diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index f8606b6e47..8389a6e971 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -9,12 +9,13 @@ use crc32c::crc32c_append; +use super::super::waldecoder::WalStreamDecoder; use super::bindings::{ - CheckPoint, FullTransactionId, TimeLineID, TimestampTz, XLogLongPageHeaderData, - XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, + CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz, + XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, }; -use super::pg_constants; -use super::waldecoder::WalStreamDecoder; +use super::PG_MAJORVERSION; +use crate::pg_constants; use crate::PG_TLI; use crate::{uint32, uint64, Oid}; use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; @@ -113,6 +114,30 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { } } +pub fn generate_pg_control( + pg_control_bytes: &[u8], + checkpoint_bytes: &[u8], + lsn: Lsn, +) -> anyhow::Result<(Bytes, u64)> { + let mut pg_control = ControlFileData::decode(pg_control_bytes)?; + let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?; + + // Generate new pg_control needed for bootstrap + checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; + + //reset some fields we don't want to preserve + //TODO Check this. + //We may need to determine the value from twophase data. 
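
This is the dispatch pattern the rest of the postgres_ffi changes follow: the data lives in one shared struct, the behaviour is a handler trait implemented once inside each postgres_ffi!(vNN) expansion, and the version-agnostic entry point brings the right impl into scope per call. In the real code both traits are named WalStreamDecoderHandler and live in v14::waldecoder and v15::waldecoder; the stripped-down sketch below renames everything for clarity and is not the real API:

    struct Decoder {
        pg_version: u32,
    }

    trait V14Rules {
        fn decode_one(&mut self) -> &'static str;
    }
    trait V15Rules {
        fn decode_one(&mut self) -> &'static str;
    }

    impl V14Rules for Decoder {
        fn decode_one(&mut self) -> &'static str {
            "decoded with v14 page/record rules"
        }
    }
    impl V15Rules for Decoder {
        fn decode_one(&mut self) -> &'static str {
            "decoded with v15 page/record rules"
        }
    }

    impl Decoder {
        // The public entry point dispatches on the stored version, like
        // WalStreamDecoder::poll_decode above.
        fn decode_one(&mut self) -> &'static str {
            match self.pg_version {
                14 => V14Rules::decode_one(self),
                15 => V15Rules::decode_one(self),
                _ => "unsupported version",
            }
        }
    }
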
+ checkpoint.oldestActiveXid = 0; + + //save new values in pg_control + pg_control.checkPoint = 0; + pg_control.checkPointCopy = checkpoint; + pg_control.state = DBState_DB_SHUTDOWNED; + + Ok((pg_control.encode(), pg_control.system_identifier)) +} + pub fn get_current_timestamp() -> TimestampTz { to_pg_timestamp(SystemTime::now()) } @@ -144,7 +169,10 @@ pub fn find_end_of_wal( let mut result = start_lsn; let mut curr_lsn = start_lsn; let mut buf = [0u8; XLOG_BLCKSZ]; - let mut decoder = WalStreamDecoder::new(start_lsn); + let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + info!("find_end_of_wal PG_VERSION: {}", pg_version); + + let mut decoder = WalStreamDecoder::new(start_lsn, pg_version); // loop over segments loop { diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index eca6a3c87f..d0a57a473b 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -25,10 +25,10 @@ use tracing::*; use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; -use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName}; -use postgres_ffi::v14::{CheckPoint, ControlFileData}; +use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; +use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; use postgres_ffi::TransactionId; +use postgres_ffi::XLogFileName; use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; @@ -129,15 +129,15 @@ where // TODO include checksum // Create pgdata subdirs structure - for dir in pg_constants::PGDATA_SUBDIRS.iter() { + for dir in PGDATA_SUBDIRS.iter() { let header = new_tar_header_dir(*dir)?; self.ar.append(&header, &mut io::empty())?; } // Send empty config files. 
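
generate_pg_control rewrites the control file so a compute started from the basebackup comes up cleanly at the requested LSN (redo pointer normalized, checkpoint pointer cleared, state set to DB_SHUTDOWNED) and returns the system identifier, which the caller then feeds to generate_wal_segment. A sketch of the call sequence basebackup uses later in this patch; deriving the segment number from the LSN here is illustrative:

    use bytes::Bytes;
    use postgres_ffi::WAL_SEGMENT_SIZE;
    use utils::lsn::Lsn;

    fn make_bootstrap_artifacts(
        pg_control_bytes: &[u8],
        checkpoint_bytes: &[u8],
        lsn: Lsn,
        pg_version: u32,
    ) -> anyhow::Result<(Bytes, Bytes)> {
        // Rebuild pg_control for the requested LSN and version.
        let (pg_control, system_id) =
            postgres_ffi::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn, pg_version)?;

        // Synthesize the WAL segment the compute will start recovery from.
        let segno = lsn.segment_number(WAL_SEGMENT_SIZE);
        let wal_seg = postgres_ffi::generate_wal_segment(segno, system_id, pg_version)
            .map_err(|e| anyhow::anyhow!(e).context("failed generating wal segment"))?;

        Ok((pg_control, wal_seg))
    }
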
- for filepath in pg_constants::PGDATA_SPECIAL_FILES.iter() { + for filepath in PGDATA_SPECIAL_FILES.iter() { if *filepath == "pg_hba.conf" { - let data = pg_constants::PG_HBA.as_bytes(); + let data = PG_HBA.as_bytes(); let header = new_tar_header(filepath, data.len() as u64)?; self.ar.append(&header, data)?; } else { @@ -267,16 +267,12 @@ where None }; - // TODO pass this as a parameter - let pg_version = "14"; + if spcnode == GLOBALTABLESPACE_OID { + let pg_version_str = self.timeline.pg_version.to_string(); + let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; + self.ar.append(&header, pg_version_str.as_bytes())?; - if spcnode == pg_constants::GLOBALTABLESPACE_OID { - let version_bytes = pg_version.as_bytes(); - let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; - - let header = new_tar_header("global/PG_VERSION", version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; + info!("timeline.pg_version {}", self.timeline.pg_version); if let Some(img) = relmap_img { // filenode map for global tablespace @@ -305,7 +301,7 @@ where return Ok(()); } // User defined tablespaces are not supported - ensure!(spcnode == pg_constants::DEFAULTTABLESPACE_OID); + ensure!(spcnode == DEFAULTTABLESPACE_OID); // Append dir path for each database let path = format!("base/{}", dbnode); @@ -314,9 +310,10 @@ where if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); - let version_bytes = pg_version.as_bytes(); - let header = new_tar_header(&dst_path, version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; + + let pg_version_str = self.timeline.pg_version.to_string(); + let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; + self.ar.append(&header, pg_version_str.as_bytes())?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; @@ -348,30 +345,6 @@ where // Also send zenith.signal file with extra bootstrap data. // fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn) - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn) - .context("failed get control bytes")?; - let mut pg_control = ControlFileData::decode(&pg_control_bytes)?; - let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - - // Generate new pg_control needed for bootstrap - checkpoint.redo = normalize_lsn(self.lsn, WAL_SEGMENT_SIZE).0; - - //reset some fields we don't want to preserve - //TODO Check this. - //We may need to determine the value from twophase data. 
- checkpoint.oldestActiveXid = 0; - - //save new values in pg_control - pg_control.checkPoint = 0; - pg_control.checkPointCopy = checkpoint; - pg_control.state = pg_constants::DB_SHUTDOWNED; - // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { @@ -388,8 +361,23 @@ where zenith_signal.as_bytes(), )?; + let checkpoint_bytes = self + .timeline + .get_checkpoint(self.lsn) + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = self + .timeline + .get_control_file(self.lsn) + .context("failed get control bytes")?; + + let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( + &pg_control_bytes, + &checkpoint_bytes, + self.lsn, + self.timeline.pg_version, + )?; + //send pg_control - let pg_control_bytes = pg_control.encode(); let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; self.ar.append(&header, &pg_control_bytes[..])?; @@ -398,8 +386,10 @@ where let wal_file_name = XLogFileName(PG_TLI, segno, WAL_SEGMENT_SIZE); let wal_file_path = format!("pg_wal/{}", wal_file_name); let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?; - let wal_seg = generate_wal_segment(segno, pg_control.system_identifier) - .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; + + let wal_seg = + postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version) + .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); self.ar.append(&header, &wal_seg[..])?; Ok(()) diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs index 16359c2532..e66049c457 100644 --- a/pageserver/src/bin/update_metadata.rs +++ b/pageserver/src/bin/update_metadata.rs @@ -50,6 +50,7 @@ fn main() -> Result<()> { meta.ancestor_lsn(), meta.latest_gc_cutoff_lsn(), meta.initdb_lsn(), + meta.pg_version(), ); update_meta = true; } @@ -62,6 +63,7 @@ fn main() -> Result<()> { meta.ancestor_lsn(), meta.latest_gc_cutoff_lsn(), meta.initdb_lsn(), + meta.pg_version(), ); update_meta = true; } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 945ee098ea..a4346c0190 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -21,6 +21,7 @@ use utils::{ use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::DEFAULT_PG_VERSION; /// The name of the metadata file pageserver creates per timeline. pub const METADATA_FILE_NAME: &str = "metadata"; @@ -209,7 +210,7 @@ impl Default for PageServerConfigBuilder { workdir: Set(PathBuf::new()), pg_distrib_dir: Set(env::current_dir() .expect("cannot access current directory") - .join("pg_install/v14")), + .join(format!("pg_install/v{}", DEFAULT_PG_VERSION))), auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), @@ -374,13 +375,40 @@ impl PageServerConf { // // Postgres distribution paths // + pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { + let mut path = self.pg_distrib_dir.clone(); - pub fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("bin") + if pg_version != DEFAULT_PG_VERSION { + // step up to the parent directory + // We assume that the pg_distrib subdirs + // for different pg versions + // are located in the same directory + // and follow the naming convention: v14, v15, etc. 
+ path.pop(); + + match pg_version { + 14 => return path.join(format!("v{pg_version}")), + 15 => return path.join(format!("v{pg_version}")), + _ => panic!("Unsupported postgres version: {}", pg_version), + }; + } + + path } - pub fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("lib") + pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("bin"), + 15 => self.pg_distrib_dir(pg_version).join("bin"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } + } + pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("lib"), + 15 => self.pg_distrib_dir(pg_version).join("lib"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } } /// Parse a configuration file (pageserver.toml) into a PageServerConf struct, @@ -449,10 +477,11 @@ impl PageServerConf { ); } - if !conf.pg_distrib_dir.join("bin/postgres").exists() { + let pg_version = DEFAULT_PG_VERSION; + if !conf.pg_bin_dir(pg_version).join("postgres").exists() { bail!( "Can't find postgres binary at {}", - conf.pg_distrib_dir.display() + conf.pg_bin_dir(pg_version).display() ); } @@ -863,7 +892,7 @@ broker_endpoints = ['{broker_endpoint}'] let workdir = tempdir_path.join("workdir"); fs::create_dir_all(&workdir)?; - let pg_distrib_dir = tempdir_path.join("pg_distrib"); + let pg_distrib_dir = tempdir_path.join(format!("pg_distrib/v{DEFAULT_PG_VERSION}")); fs::create_dir_all(&pg_distrib_dir)?; let postgres_bin_dir = pg_distrib_dir.join("bin"); fs::create_dir_all(&postgres_bin_dir)?; diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 2d7d560d2a..851fa881a0 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -21,6 +21,7 @@ pub struct TimelineCreateRequest { #[serde(default)] #[serde_as(as = "Option")] pub ancestor_start_lsn: Option, + pub pg_version: Option, } #[serde_as] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c676dfacd2..6892c0b391 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -173,6 +173,7 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index c1e736d552..23c4351b4e 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -16,11 +16,13 @@ use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; -use postgres_ffi::v14::relfile_utils::*; -use postgres_ffi::v14::waldecoder::*; -use postgres_ffi::v14::xlog_utils::*; -use postgres_ffi::v14::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::*; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::ControlFileData; +use postgres_ffi::DBState_DB_SHUTDOWNED; use postgres_ffi::Oid; +use postgres_ffi::XLogFileName; use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; @@ -236,7 +238,7 @@ fn import_slru( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. 
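
LocalEnv and PageServerConf now resolve per-version install directories the same way: the configured pg_distrib_dir points at the default version's directory, other versions are expected as sibling vNN directories, and anything other than 14 or 15 panics. A standalone sketch of that resolution with a made-up helper name:

    use std::path::{Path, PathBuf};

    const DEFAULT_PG_VERSION: u32 = 14;

    // Resolve the install dir for `pg_version`, assuming the configured path
    // points at the default version (e.g. ".../pg_install/v14") and the other
    // versions live next to it as v15, v16, ...
    fn versioned_distrib_dir(configured: &Path, pg_version: u32) -> PathBuf {
        match pg_version {
            14 | 15 => {}
            _ => panic!("Unsupported postgres version: {}", pg_version),
        }
        if pg_version == DEFAULT_PG_VERSION {
            return configured.to_path_buf();
        }
        let mut path = configured.to_path_buf();
        path.pop(); // step up from ".../v14" to ".../pg_install"
        path.join(format!("v{pg_version}"))
    }

    // versioned_distrib_dir(Path::new("pg_install/v14"), 15) ends in "pg_install/v15";
    // pg_bin_dir/pg_lib_dir then just append "bin" or "lib" to this.
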
fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> { - let mut waldecoder = WalStreamDecoder::new(startpoint); + let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version); let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); @@ -354,7 +356,7 @@ pub fn import_wal_from_tar( end_lsn: Lsn, ) -> Result<()> { // Set up walingest mutable state - let mut waldecoder = WalStreamDecoder::new(start_lsn); + let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version); let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; @@ -439,7 +441,7 @@ fn import_file( len: usize, ) -> Result> { if file_path.starts_with("global") { - let spcnode = pg_constants::GLOBALTABLESPACE_OID; + let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; let dbnode = 0; match file_path @@ -467,7 +469,7 @@ fn import_file( debug!("imported relmap file") } "PG_VERSION" => { - debug!("ignored"); + debug!("ignored PG_VERSION file"); } _ => { import_rel(modification, file_path, spcnode, dbnode, reader, len)?; @@ -495,7 +497,7 @@ fn import_file( debug!("imported relmap file") } "PG_VERSION" => { - debug!("ignored"); + debug!("ignored PG_VERSION file"); } _ => { import_rel(modification, file_path, spcnode, dbnode, reader, len)?; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index e918a39457..0bd5e242d3 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -36,6 +36,8 @@ use crate::task_mgr::TaskKind; /// format, bump this! pub const STORAGE_FORMAT_VERSION: u16 = 3; +pub const DEFAULT_PG_VERSION: u32 = 14; + // Magic constants used to identify different kinds of files pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; pub const DELTA_FILE_MAGIC: u16 = 0x5A61; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7de6403b83..fed5d0dcc4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -43,9 +43,9 @@ use crate::task_mgr::TaskKind; use crate::tenant::Timeline; use crate::tenant_mgr; use crate::CheckpointConfig; -use postgres_ffi::v14::xlog_utils::to_pg_timestamp; -use postgres_ffi::v14::pg_constants::DEFAULTTABLESPACE_OID; +use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; +use postgres_ffi::to_pg_timestamp; use postgres_ffi::BLCKSZ; // Wrapped in libpq CopyData @@ -498,12 +498,16 @@ impl PageServerHandler { timeline_id: TimelineId, base_lsn: Lsn, _end_lsn: Lsn, + pg_version: u32, ) -> anyhow::Result<()> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let timeline = tenant_mgr::get_tenant(tenant_id, true)? - .create_empty_timeline(timeline_id, base_lsn)?; + let timeline = tenant_mgr::get_tenant(tenant_id, true)?.create_empty_timeline( + timeline_id, + base_lsn, + pg_version, + )?; // TODO mark timeline as not ready until it reaches end_lsn. 
// We might have some wal to import as well, and we should prevent compute @@ -958,16 +962,31 @@ impl postgres_backend_async::Handler for PageServerHandler { // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN" let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!(params.len() == 4); + ensure!(params.len() >= 4); let tenant_id = TenantId::from_str(params[0])?; let timeline_id = TimelineId::from_str(params[1])?; let base_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; + let pg_version = if params.len() == 5 { + u32::from_str(params[4])? + } else { + // If version is not provided, assume default. + // TODO: this may lead to weird errors if the version is wrong. + crate::DEFAULT_PG_VERSION + }; + self.check_permission(Some(tenant_id))?; match self - .handle_import_basebackup(pgb, tenant_id, timeline_id, base_lsn, end_lsn) + .handle_import_basebackup( + pgb, + tenant_id, + timeline_id, + base_lsn, + end_lsn, + pg_version, + ) .await { Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 9d4b438dc4..fc9867dc05 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -13,7 +13,7 @@ use crate::tenant::Timeline; use crate::walrecord::NeonWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; -use postgres_ffi::v14::pg_constants; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; use postgres_ffi::{Oid, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; @@ -125,8 +125,7 @@ impl Timeline { return Ok(nblocks); } - if (tag.forknum == pg_constants::FSM_FORKNUM - || tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM) + if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) && !self.get_rel_exists(tag, lsn, latest)? 
{ // FIXME: Postgres sometimes calls smgrcreate() to create @@ -1090,6 +1089,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); // 03 misc // controlfile // checkpoint +// pg_version // // Below is a full list of the keyspace allocation: // @@ -1128,7 +1128,6 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); // // Checkpoint: // 03 00000000 00000000 00000000 00 00000001 - //-- Section 01: relation data and metadata const DBDIR_KEY: Key = Key { @@ -1402,8 +1401,9 @@ fn is_slru_block_key(key: Key) -> bool { pub fn create_test_timeline( tenant: &crate::tenant::Tenant, timeline_id: utils::id::TimelineId, + pg_version: u32, ) -> Result> { - let tline = tenant.create_empty_timeline(timeline_id, Lsn(8))?; + let tline = tenant.create_empty_timeline(timeline_id, Lsn(8), pg_version)?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; diff --git a/pageserver/src/reltag.rs b/pageserver/src/reltag.rs index e3d08f8b3d..43d38bd986 100644 --- a/pageserver/src/reltag.rs +++ b/pageserver/src/reltag.rs @@ -2,8 +2,8 @@ use serde::{Deserialize, Serialize}; use std::cmp::Ordering; use std::fmt; -use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::relfile_utils::forknumber_to_name; +use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; +use postgres_ffi::relfile_utils::forknumber_to_name; use postgres_ffi::Oid; /// @@ -78,7 +78,7 @@ impl fmt::Display for RelTag { impl RelTag { pub fn to_segfile_name(&self, segno: u32) -> String { - let mut name = if self.spcnode == pg_constants::GLOBALTABLESPACE_OID { + let mut name = if self.spcnode == GLOBALTABLESPACE_OID { "global/".to_string() } else { format!("base/{}/", self.dbnode) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 489d0ad4ed..892a34a76f 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -1445,7 +1445,17 @@ mod test_utils { } pub(super) fn dummy_metadata(disk_consistent_lsn: Lsn) -> TimelineMetadata { - TimelineMetadata::new(disk_consistent_lsn, None, None, Lsn(0), Lsn(0), Lsn(0)) + TimelineMetadata::new( + disk_consistent_lsn, + None, + None, + Lsn(0), + Lsn(0), + Lsn(0), + // Any version will do + // but it should be consistent with the one in the tests + crate::DEFAULT_PG_VERSION, + ) } } diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 13495ffefe..db37c7b411 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -341,13 +341,21 @@ mod tests { use super::*; use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use crate::DEFAULT_PG_VERSION; #[test] fn index_part_conversion() { let harness = TenantHarness::create("index_part_conversion").unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); - let metadata = - TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); + let metadata = TimelineMetadata::new( + Lsn(5).align(), + Some(Lsn(4)), + None, + Lsn(3), + Lsn(2), + Lsn(1), + DEFAULT_PG_VERSION, + ); let remote_timeline = RemoteTimeline { timeline_layers: HashSet::from([ timeline_path.join("layer_1"), @@ -464,8 +472,15 @@ mod tests { fn index_part_conversion_negatives() { let harness = TenantHarness::create("index_part_conversion_negatives").unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); - let metadata = - TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); + let metadata = TimelineMetadata::new( + Lsn(5).align(), + 
Some(Lsn(4)), + None, + Lsn(3), + Lsn(2), + Lsn(1), + DEFAULT_PG_VERSION, + ); let conversion_result = IndexPart::from_remote_timeline( &timeline_path, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ca97796870..5860e13534 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -171,6 +171,7 @@ impl Tenant { &self, new_timeline_id: TimelineId, initdb_lsn: Lsn, + pg_version: u32, ) -> Result> { // XXX: keep the lock to avoid races during timeline creation let mut timelines = self.timelines.lock().unwrap(); @@ -186,7 +187,7 @@ impl Tenant { } let new_metadata = - TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); + TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn, pg_version,); let new_timeline = self.create_initialized_timeline(new_timeline_id, new_metadata, &mut timelines)?; new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); @@ -387,6 +388,11 @@ impl Tenant { let mut timelines_accessor = self.timelines.lock().unwrap(); for (timeline_id, metadata) in sorted_timelines { + info!( + "Attaching timeline {} pg_version {}", + timeline_id, + metadata.pg_version() + ); let timeline = self .initialize_new_timeline(timeline_id, metadata, &mut timelines_accessor) .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; @@ -613,7 +619,7 @@ impl Tenant { }; let new_disk_consistent_lsn = new_metadata.disk_consistent_lsn(); - + let pg_version = new_metadata.pg_version(); let new_timeline = Arc::new(Timeline::new( self.conf, Arc::clone(&self.tenant_conf), @@ -623,6 +629,7 @@ impl Tenant { self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, + pg_version, )); new_timeline @@ -984,6 +991,7 @@ impl Tenant { start_lsn, *src_timeline.latest_gc_cutoff_lsn.read(), src_timeline.initdb_lsn, + src_timeline.pg_version, ); let new_timeline = self.create_initialized_timeline(dst, metadata, &mut timelines)?; info!("branched timeline {dst} from {src} at {start_lsn}"); @@ -1319,6 +1327,7 @@ pub mod harness { lsn: Lsn, base_img: Option, records: Vec<(Lsn, NeonWalRecord)>, + _pg_version: u32, ) -> Result { let s = format!( "redo for {} to get to {}, with {} and {} records", @@ -1345,6 +1354,7 @@ mod tests { use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; use crate::tenant::harness::*; + use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; use once_cell::sync::Lazy; @@ -1356,7 +1366,7 @@ mod tests { #[test] fn test_basic() -> Result<()> { let tenant = TenantHarness::create("test_basic")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1378,9 +1388,9 @@ mod tests { #[test] fn no_duplicate_timelines() -> Result<()> { let tenant = TenantHarness::create("no_duplicate_timelines")?.load(); - let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; - match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0)) { + match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), @@ -1404,7 +1414,7 @@ mod tests { #[test] fn test_branch() -> Result<()> { let tenant = TenantHarness::create("test_branch")?.load(); - let tline = 
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let writer = tline.writer(); use std::str::from_utf8; @@ -1499,7 +1509,7 @@ mod tests { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? .load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 @@ -1529,7 +1539,7 @@ mod tests { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), @@ -1555,7 +1565,7 @@ mod tests { RepoHarness::create("test_prohibit_get_for_garbage_collected_data")? .load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; @@ -1573,7 +1583,7 @@ mod tests { fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -1590,7 +1600,7 @@ mod tests { fn test_parent_keeps_data_forever_after_branching() -> Result<()> { let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -1618,7 +1628,8 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; { let tenant = harness.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x8000))?; tline.checkpoint(CheckpointConfig::Forced)?; } @@ -1638,7 +1649,7 @@ mod tests { // create two timelines { let tenant = harness.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tline.checkpoint(CheckpointConfig::Forced)?; @@ -1674,7 +1685,7 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; let tenant = harness.load(); - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -1711,7 +1722,7 @@ mod tests { #[test] fn test_images() -> Result<()> { let tenant = 
TenantHarness::create("test_images")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1761,7 +1772,7 @@ mod tests { #[test] fn test_bulk_insert() -> Result<()> { let tenant = TenantHarness::create("test_bulk_insert")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let mut lsn = Lsn(0x10); @@ -1801,7 +1812,7 @@ mod tests { #[test] fn test_random_updates() -> Result<()> { let tenant = TenantHarness::create("test_random_updates")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; const NUM_KEYS: usize = 1000; @@ -1871,7 +1882,7 @@ mod tests { #[test] fn test_traverse_branches() -> Result<()> { let tenant = TenantHarness::create("test_traverse_branches")?.load(); - let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; const NUM_KEYS: usize = 1000; @@ -1950,7 +1961,7 @@ mod tests { #[test] fn test_traverse_ancestors() -> Result<()> { let tenant = TenantHarness::create("test_traverse_ancestors")?.load(); - let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 606acbf2f1..41790b4d11 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -63,6 +63,7 @@ struct TimelineMetadataBody { ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, + pg_version: u32, } impl TimelineMetadata { @@ -73,6 +74,7 @@ impl TimelineMetadata { ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, + pg_version: u32, ) -> Self { Self { hdr: TimelineMetadataHeader { @@ -87,6 +89,7 @@ impl TimelineMetadata { ancestor_lsn, latest_gc_cutoff_lsn, initdb_lsn, + pg_version, }, } } @@ -160,6 +163,10 @@ impl TimelineMetadata { pub fn initdb_lsn(&self) -> Lsn { self.body.initdb_lsn } + + pub fn pg_version(&self) -> u32 { + self.body.pg_version + } } /// Save timeline metadata to file @@ -212,6 +219,8 @@ mod tests { Lsn(0), Lsn(0), Lsn(0), + // Any version will do here, so use the default + crate::DEFAULT_PG_VERSION, ); let metadata_bytes = original_metadata diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6de1d44876..019de81d64 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -37,7 +37,7 @@ use crate::pgdatadir_mapping::LsnForTimestamp; use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; -use postgres_ffi::v14::xlog_utils::to_pg_timestamp; +use postgres_ffi::to_pg_timestamp; use utils::{ id::{TenantId, TimelineId}, lsn::{AtomicLsn, Lsn, RecordLsn}, @@ -61,6 +61,8 @@ pub struct Timeline { pub tenant_id: TenantId, pub timeline_id: TimelineId, + pub pg_version: u32, + pub layers: RwLock, last_freeze_at: AtomicLsn, @@ -533,6 +535,7 @@ impl Timeline { tenant_id: TenantId, walredo_mgr: Arc, upload_layers: bool, + pg_version: u32, ) -> Timeline { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ 
-541,6 +544,7 @@ impl Timeline { tenant_conf, timeline_id, tenant_id, + pg_version, layers: RwLock::new(LayerMap::default()), walredo_mgr, @@ -1260,6 +1264,7 @@ impl Timeline { self.ancestor_lsn, *self.latest_gc_cutoff_lsn.read(), self.initdb_lsn, + self.pg_version, ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -2133,9 +2138,13 @@ impl Timeline { let last_rec_lsn = data.records.last().unwrap().0; - let img = - self.walredo_mgr - .request_redo(key, request_lsn, base_img, data.records)?; + let img = self.walredo_mgr.request_redo( + key, + request_lsn, + base_img, + data.records, + self.pg_version, + )?; if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index bede4ac13e..1d5cab38b9 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -34,8 +34,9 @@ use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walrecord::*; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; -use postgres_ffi::v14::pg_constants; use postgres_ffi::v14::xlog_utils::*; use postgres_ffi::v14::CheckPoint; use postgres_ffi::TransactionId; @@ -82,7 +83,8 @@ impl<'a> WalIngest<'a> { decoded: &mut DecodedWALRecord, ) -> Result<()> { modification.lsn = lsn; - decode_wal_record(recdata, decoded).context("failed decoding wal record")?; + decode_wal_record(recdata, decoded, self.timeline.pg_version) + .context("failed decoding wal record")?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -113,18 +115,49 @@ impl<'a> WalIngest<'a> { let truncate = XlSmgrTruncate::decode(&mut buf); self.ingest_xlog_smgr_truncate(modification, &truncate)?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { - if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == pg_constants::XLOG_DBASE_CREATE - { - let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb)?; - } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == pg_constants::XLOG_DBASE_DROP - { - let dropdb = XlDropDatabase::decode(&mut buf); - for tablespace_id in dropdb.tablespace_ids { - trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + debug!( + "handle RM_DBASE_ID for Postgres version {:?}", + self.timeline.pg_version + ); + if self.timeline.pg_version == 14 { + if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE + { + let createdb = XlCreateDatabase::decode(&mut buf); + debug!("XLOG_DBASE_CREATE v14"); + + self.ingest_xlog_dbase_create(modification, &createdb)?; + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v14::bindings::XLOG_DBASE_DROP + { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + } + } + } else if self.timeline.pg_version == 15 { + if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG + { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY + { + // The XLOG 
record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + debug!("XLOG_DBASE_CREATE_FILE_COPY"); + let createdb = XlCreateDatabase::decode(&mut buf); + self.ingest_xlog_dbase_create(modification, &createdb)?; + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_DROP + { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + } } } } else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID { @@ -291,7 +324,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED) == 0 + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version) { // Extract page image from FPI record let img_len = blk.bimg_len as usize; @@ -392,7 +425,7 @@ impl<'a> WalIngest<'a> { // Clear the VM bits if required. if new_heap_blkno.is_some() || old_heap_blkno.is_some() { let vm_rel = RelTag { - forknum: pg_constants::VISIBILITYMAP_FORKNUM, + forknum: VISIBILITYMAP_FORKNUM, spcnode: decoded.blocks[0].rnode_spcnode, dbnode: decoded.blocks[0].rnode_dbnode, relnode: decoded.blocks[0].rnode_relnode, @@ -568,7 +601,7 @@ impl<'a> WalIngest<'a> { spcnode, dbnode, relnode, - forknum: pg_constants::MAIN_FORKNUM, + forknum: MAIN_FORKNUM, }; self.put_rel_truncation(modification, rel, rec.blkno)?; } @@ -577,7 +610,7 @@ impl<'a> WalIngest<'a> { spcnode, dbnode, relnode, - forknum: pg_constants::FSM_FORKNUM, + forknum: FSM_FORKNUM, }; // FIXME: 'blkno' stored in the WAL record is the new size of the @@ -600,7 +633,7 @@ impl<'a> WalIngest<'a> { spcnode, dbnode, relnode, - forknum: pg_constants::VISIBILITYMAP_FORKNUM, + forknum: VISIBILITYMAP_FORKNUM, }; // FIXME: Like with the FSM above, the logic to truncate the VM @@ -672,7 +705,7 @@ impl<'a> WalIngest<'a> { )?; for xnode in &parsed.xnodes { - for forknum in pg_constants::MAIN_FORKNUM..=pg_constants::VISIBILITYMAP_FORKNUM { + for forknum in MAIN_FORKNUM..=VISIBILITYMAP_FORKNUM { let rel = RelTag { forknum, spcnode: xnode.spcnode, @@ -1032,6 +1065,8 @@ mod tests { use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; + use crate::DEFAULT_PG_VERSION; + /// Arbitrary relation tag, for testing. 
const TESTREL_A: RelTag = RelTag { spcnode: 0, @@ -1059,7 +1094,7 @@ mod tests { #[test] fn test_relsize() -> Result<()> { let tenant = TenantHarness::create("test_relsize")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1187,7 +1222,7 @@ mod tests { #[test] fn test_drop_extend() -> Result<()> { let tenant = TenantHarness::create("test_drop_extend")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1227,7 +1262,7 @@ mod tests { #[test] fn test_truncate_extend() -> Result<()> { let tenant = TenantHarness::create("test_truncate_extend")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; // Create a 20 MB relation (the size is arbitrary) @@ -1315,7 +1350,7 @@ mod tests { #[test] fn test_large_rel() -> Result<()> { let tenant = TenantHarness::create("test_large_rel")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 148372c9d0..a82e69e5ba 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -1366,7 +1366,7 @@ mod tests { }, timeline: harness .load() - .create_empty_timeline(TIMELINE_ID, Lsn(0)) + .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION) .expect("Failed to create an empty timeline for dummy wal connection manager"), wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 29c4cea882..5ac9a3ef7a 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -29,7 +29,7 @@ use crate::{ walingest::WalIngest, walrecord::DecodedWALRecord, }; -use postgres_ffi::v14::waldecoder::WalStreamDecoder; +use postgres_ffi::waldecoder::WalStreamDecoder; use utils::id::TenantTimelineId; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback}; @@ -166,7 +166,7 @@ pub async fn handle_walreceiver_connection( let physical_stream = ReplicationStream::new(copy_stream); pin!(physical_stream); - let mut waldecoder = WalStreamDecoder::new(startpoint); + let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index dbf9bf9d33..258e1a445f 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -3,12 +3,11 @@ //! 
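
TimelineMetadata, and therefore the on-disk metadata file, now records the timeline's Postgres version, and Timeline carries it as a field that gets threaded into the WAL decoder, walingest and walredo. A small round-trip sketch as seen from inside the pageserver crate; the LSN values are placeholders and the argument comments follow the constructor above:

    use crate::tenant::metadata::TimelineMetadata;
    use utils::lsn::Lsn;

    fn example() {
        let meta = TimelineMetadata::new(
            Lsn(0x0169_0408), // disk_consistent_lsn
            None,             // previous record LSN
            None,             // ancestor timeline
            Lsn(0),           // ancestor LSN
            Lsn(0),           // latest GC cutoff LSN
            Lsn(0x0169_0408), // initdb LSN
            15,               // pg_version
        );
        assert_eq!(meta.pg_version(), 15);
    }
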
use anyhow::Result; use bytes::{Buf, Bytes}; -use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::xlog_utils::XLOG_SIZE_OF_XLOG_RECORD; -use postgres_ffi::v14::XLogRecord; +use postgres_ffi::pg_constants; use postgres_ffi::BLCKSZ; use postgres_ffi::{BlockNumber, OffsetNumber, TimestampTz}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; +use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; use serde::{Deserialize, Serialize}; use tracing::*; use utils::bin_ser::DeserializeError; @@ -390,6 +389,16 @@ impl XlXactParsedRecord { xid = buf.get_u32_le(); trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE"); } + + if xinfo & postgres_ffi::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 { + let nitems = buf.get_i32_le(); + debug!( + "XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}", + nitems + ); + //FIXME: do we need to handle dropped stats here? + } + XlXactParsedRecord { xid, info, @@ -517,6 +526,7 @@ impl XlMultiXactTruncate { pub fn decode_wal_record( record: Bytes, decoded: &mut DecodedWALRecord, + pg_version: u32, ) -> Result<(), DeserializeError> { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; @@ -610,9 +620,21 @@ pub fn decode_wal_record( blk.hole_offset = buf.get_u16_le(); blk.bimg_info = buf.get_u8(); - blk.apply_image = (blk.bimg_info & pg_constants::BKPIMAGE_APPLY) != 0; + blk.apply_image = if pg_version == 14 { + (blk.bimg_info & postgres_ffi::v14::bindings::BKPIMAGE_APPLY) != 0 + } else { + assert_eq!(pg_version, 15); + (blk.bimg_info & postgres_ffi::v15::bindings::BKPIMAGE_APPLY) != 0 + }; - if blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED != 0 { + let blk_img_is_compressed = + postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version); + + if blk_img_is_compressed { + debug!("compressed block image , pg_version = {}", pg_version); + } + + if blk_img_is_compressed { if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 { blk.hole_length = buf.get_u16_le(); } else { @@ -665,9 +687,7 @@ pub fn decode_wal_record( * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED * flag is set. */ - if (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0) - && blk.bimg_len == BLCKSZ - { + if !blk_img_is_compressed && blk.bimg_len == BLCKSZ { // TODO /* report_invalid_record(state, @@ -683,7 +703,7 @@ pub fn decode_wal_record( * IS_COMPRESSED flag is set. 
*/ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 - && blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0 + && !blk_img_is_compressed && blk.bimg_len != BLCKSZ { // TODO diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 79c2edc96e..15a9408dc9 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -46,11 +46,12 @@ use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::NeonWalRecord; use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, }; -use postgres_ffi::v14::pg_constants; use postgres_ffi::BLCKSZ; /// @@ -82,6 +83,7 @@ pub trait WalRedoManager: Send + Sync { lsn: Lsn, base_img: Option, records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, ) -> Result; } @@ -144,6 +146,7 @@ impl WalRedoManager for PostgresRedoManager { lsn: Lsn, base_img: Option, records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, ) -> Result { if records.is_empty() { error!("invalid WAL redo request with no records"); @@ -166,6 +169,7 @@ impl WalRedoManager for PostgresRedoManager { img, &records[batch_start..i], self.conf.wal_redo_timeout, + pg_version, ) }; img = Some(result?); @@ -184,6 +188,7 @@ impl WalRedoManager for PostgresRedoManager { img, &records[batch_start..], self.conf.wal_redo_timeout, + pg_version, ) } } @@ -212,6 +217,7 @@ impl PostgresRedoManager { base_img: Option, records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, + pg_version: u32, ) -> Result { let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; @@ -222,7 +228,7 @@ impl PostgresRedoManager { // launch the WAL redo process on first use if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id)?; + let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?; *process_guard = Some(p); } let process = process_guard.as_mut().unwrap(); @@ -326,7 +332,7 @@ impl PostgresRedoManager { // sanity check that this is modifying the correct relation let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; assert!( - rel.forknum == pg_constants::VISIBILITYMAP_FORKNUM, + rel.forknum == VISIBILITYMAP_FORKNUM, "ClearVisibilityMapFlags record on unexpected rel {}", rel ); @@ -570,7 +576,11 @@ impl PostgresRedoProcess { // // Start postgres binary in special WAL redo mode. // - fn launch(conf: &PageServerConf, tenant_id: &TenantId) -> Result { + fn launch( + conf: &PageServerConf, + tenant_id: &TenantId, + pg_version: u32, + ) -> Result { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. 
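For readers following the version plumbing in this redo-process change, here is a minimal, self-contained sketch (not taken from the patch; `versioned_dir`, `wal_redo_command` and the `/usr/local` install root are illustrative assumptions, not the pageserver's real config API) of how a per-version bin/lib layout such as `pg_install/v14`, `pg_install/v15` can be resolved when building the command that launches `postgres --wal-redo`:

    use std::path::{Path, PathBuf};
    use std::process::{Command, Stdio};

    /// Illustrative only: resolve the per-version subdirectory (v14, v15, ...)
    /// under a Postgres install root, mirroring the pg_bin_dir(pg_version) /
    /// pg_lib_dir(pg_version) pattern used in this patch.
    fn versioned_dir(install_root: &Path, pg_version: u32, sub: &str) -> PathBuf {
        install_root.join(format!("v{pg_version}")).join(sub)
    }

    /// Build (but do not spawn) the redo-process command for a given major version.
    fn wal_redo_command(install_root: &Path, pg_version: u32, datadir: &Path) -> Command {
        let bin = versioned_dir(install_root, pg_version, "bin").join("postgres");
        let lib = versioned_dir(install_root, pg_version, "lib");
        let mut cmd = Command::new(bin);
        cmd.arg("--wal-redo")
            .stdin(Stdio::piped())
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .env_clear()
            .env("LD_LIBRARY_PATH", &lib)
            .env("DYLD_LIBRARY_PATH", &lib)
            .env("PGDATA", datadir);
        cmd
    }

    fn main() {
        let cmd = wal_redo_command(Path::new("/usr/local"), 15, Path::new("/tmp/wal-redo-datadir"));
        println!("{cmd:?}");
    }

Each supported major version then gets its own bin/ and lib/ tree, which is why both v14 and v15 are copied into the image in the Dockerfile change later in this series.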
@@ -588,12 +598,12 @@ impl PostgresRedoProcess { fs::remove_dir_all(&datadir)?; } info!("running initdb in {}", datadir.display()); - let initdb = Command::new(conf.pg_bin_dir().join("initdb")) + let initdb = Command::new(conf.pg_bin_dir(pg_version).join("initdb")) .args(&["-D", &datadir.to_string_lossy()]) .arg("-N") .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) .close_fds() .output() .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?; @@ -619,14 +629,14 @@ impl PostgresRedoProcess { } // Start postgres itself - let mut child = Command::new(conf.pg_bin_dir().join("postgres")) + let mut child = Command::new(conf.pg_bin_dir(pg_version).join("postgres")) .arg("--wal-redo") .stdin(Stdio::piped()) .stderr(Stdio::piped()) .stdout(Stdio::piped()) .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) .env("PGDATA", &datadir) // The redo process is not trusted, so it runs in seccomp mode // (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 2456eb0752..3de410d117 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -22,7 +22,7 @@ use crate::safekeeper::{ use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; use crate::timeline::Timeline; use crate::GlobalTimelines; -use postgres_ffi::v14::xlog_utils; +use postgres_ffi::encode_logical_message; use postgres_ffi::WAL_SEGMENT_SIZE; use utils::{ lsn::Lsn, @@ -47,6 +47,7 @@ pub struct AppendLogicalMessage { epoch_start_lsn: Lsn, begin_lsn: Lsn, truncate_lsn: Lsn, + pg_version: u32, } #[derive(Serialize, Deserialize)] @@ -68,7 +69,7 @@ pub fn handle_json_ctrl( info!("JSON_CTRL request: {:?}", append_request); // need to init safekeeper state before AppendRequest - let tli = prepare_safekeeper(spg.ttid)?; + let tli = prepare_safekeeper(spg.ttid, append_request.pg_version)?; // if send_proposer_elected is true, we need to update local history if append_request.send_proposer_elected { @@ -95,11 +96,11 @@ pub fn handle_json_ctrl( /// Prepare safekeeper to process append requests without crashes, /// by sending ProposerGreeting with default server.wal_seg_size. -fn prepare_safekeeper(ttid: TenantTimelineId) -> Result> { +fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result> { GlobalTimelines::create( ttid, ServerInfo { - pg_version: 0, // unknown + pg_version, wal_seg_size: WAL_SEGMENT_SIZE as u32, system_id: 0, }, @@ -135,7 +136,7 @@ struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. 
fn append_logical_message(tli: &Arc, msg: &AppendLogicalMessage) -> Result { - let wal_data = xlog_utils::encode_logical_message(&msg.lm_prefix, &msg.lm_message); + let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); let sk_state = tli.get_state().1; let begin_lsn = msg.begin_lsn; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 65340ac0ed..eec24faf2f 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -27,7 +27,7 @@ use utils::{ pub const SK_MAGIC: u32 = 0xcafeceefu32; pub const SK_FORMAT_VERSION: u32 = 6; const SK_PROTOCOL_VERSION: u32 = 2; -const UNKNOWN_SERVER_VERSION: u32 = 0; +pub const UNKNOWN_SERVER_VERSION: u32 = 0; /// Consensus logical timestamp. pub type Term = u64; @@ -594,15 +594,20 @@ where SK_PROTOCOL_VERSION ); } - // Postgres upgrade is not treated as fatal error - if msg.pg_version != self.state.server.pg_version + /* Postgres major version mismatch is treated as fatal error + * because safekeepers parse WAL headers and the format + * may change between versions. + */ + if msg.pg_version / 10000 != self.state.server.pg_version / 10000 && self.state.server.pg_version != UNKNOWN_SERVER_VERSION { - warn!( + bail!( "incompatible server version {}, expected {}", - msg.pg_version, self.state.server.pg_version + msg.pg_version, + self.state.server.pg_version ); } + if msg.tenant_id != self.state.tenant_id { bail!( "invalid tenant ID, got {}, expected {}", @@ -634,6 +639,10 @@ where let mut state = self.state.clone(); state.server.system_id = msg.system_id; + state.server.wal_seg_size = msg.wal_seg_size; + if msg.pg_version != UNKNOWN_SERVER_VERSION { + state.server.pg_version = msg.pg_version; + } self.state.persist(&state)?; } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 5a38558e9c..2829c875ed 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -8,7 +8,7 @@ use crate::GlobalTimelines; use anyhow::{bail, Context, Result}; use bytes::Bytes; -use postgres_ffi::v14::xlog_utils::get_current_timestamp; +use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::min; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 0d5321fb3a..c82a003161 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -11,7 +11,8 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Duration; -use postgres_ffi::v14::xlog_utils::{XLogFileName, XLogSegNoOffsetToRecPtr}; +use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; +use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; use remote_storage::GenericRemoteStorage; use tokio::fs::File; diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 692bd18342..44dc313ef6 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -29,13 +29,14 @@ use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::metrics::{time_io_closure, WalStorageMetrics}; use crate::safekeeper::SafeKeeperState; +use crate::safekeeper::UNKNOWN_SERVER_VERSION; use crate::wal_backup::read_object; use crate::SafeKeeperConf; -use postgres_ffi::v14::xlog_utils::XLogFileName; +use postgres_ffi::XLogFileName; use postgres_ffi::XLOG_BLCKSZ; -use postgres_ffi::v14::waldecoder::WalStreamDecoder; +use postgres_ffi::waldecoder::WalStreamDecoder; use tokio::io::{AsyncReadExt, AsyncSeekExt}; @@ -139,7 +140,7 @@ impl PhysicalStorage { write_lsn, write_record_lsn: 
write_lsn, flush_record_lsn: flush_lsn, - decoder: WalStreamDecoder::new(write_lsn), + decoder: WalStreamDecoder::new(write_lsn, UNKNOWN_SERVER_VERSION), file: None, }) } @@ -291,7 +292,8 @@ impl Storage for PhysicalStorage { self.decoder.available(), startpos, ); - self.decoder = WalStreamDecoder::new(startpos); + let pg_version = self.decoder.pg_version; + self.decoder = WalStreamDecoder::new(startpos, pg_version); } self.decoder.feed_bytes(buf); loop { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1e83ee3839..c1ebc6aa7d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -59,7 +59,7 @@ Env = Dict[str, str] Fn = TypeVar("Fn", bound=Callable[..., Any]) DEFAULT_OUTPUT_DIR = "test_output" -DEFAULT_POSTGRES_DIR = "pg_install/v14" +DEFAULT_PG_VERSION_DEFAULT = "14" DEFAULT_BRANCH_NAME = "main" BASE_PORT = 15000 @@ -71,6 +71,7 @@ base_dir = "" neon_binpath = "" pg_distrib_dir = "" top_output_dir = "" +pg_version = "" def pytest_configure(config): @@ -100,12 +101,21 @@ def pytest_configure(config): Path(top_output_dir).mkdir(exist_ok=True) # Find the postgres installation. + global pg_version + pg_version = os.environ.get("DEFAULT_PG_VERSION", DEFAULT_PG_VERSION_DEFAULT) + global pg_distrib_dir + + # TODO get rid of the POSTGRES_DISTRIB_DIR env var ? + # use DEFAULT_PG_VERSION instead to generate the path env_postgres_bin = os.environ.get("POSTGRES_DISTRIB_DIR") if env_postgres_bin: pg_distrib_dir = env_postgres_bin else: - pg_distrib_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR)) + pg_distrib_dir = os.path.normpath( + os.path.join(base_dir, "pg_install/v{}".format(pg_version)) + ) + log.info(f"pg_distrib_dir is {pg_distrib_dir}") if os.getenv("REMOTE_ENV"): # When testing against a remote server, we only need the client binary. 
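As a side note on the compatibility check introduced in safekeeper.rs above: this is a minimal sketch (the standalone `major` and `is_compatible` helpers are illustrative, not the crate's API) of the major-version rule, assuming the usual PostgreSQL numeric versioning in which the major version is the numeric version divided by 10000 (for example 150002 is 15.2):

    /// Illustrative helper: extract the major version from a numeric Postgres
    /// version (e.g. 140004 -> 14, 150002 -> 15), matching the `/ 10000`
    /// comparison in the safekeeper greeting handler.
    fn major(pg_version_num: u32) -> u32 {
        pg_version_num / 10000
    }

    /// Sketch of the rule: a mismatch of the *major* version is fatal,
    /// unless the stored version is still unknown (0).
    fn is_compatible(proposer: u32, stored: u32) -> bool {
        const UNKNOWN_SERVER_VERSION: u32 = 0;
        stored == UNKNOWN_SERVER_VERSION || major(proposer) == major(stored)
    }

    fn main() {
        assert_eq!(major(150002), 15);
        assert!(is_compatible(140004, 140000));  // same major, different minor
        assert!(is_compatible(150002, 0));       // stored version not yet known
        assert!(!is_compatible(150002, 140004)); // major mismatch is rejected
        println!("version checks behave as described");
    }

The JSON_CTRL test later in this patch builds its numeric version as `int(pg_version) * 10000` for the same reason.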
@@ -1185,6 +1195,7 @@ class AbstractNeonCli(abc.ABC): env_vars = os.environ.copy() env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir) env_vars["POSTGRES_DISTRIB_DIR"] = str(pg_distrib_dir) + env_vars["DEFAULT_PG_VERSION"] = str(pg_version) if self.env.rust_log_override is not None: env_vars["RUST_LOG"] = self.env.rust_log_override for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): @@ -1251,6 +1262,8 @@ class NeonCli(AbstractNeonCli): str(tenant_id), "--timeline-id", str(timeline_id), + "--pg-version", + pg_version, ] ) else: @@ -1262,6 +1275,8 @@ class NeonCli(AbstractNeonCli): str(tenant_id), "--timeline-id", str(timeline_id), + "--pg-version", + pg_version, ] + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) ) @@ -1296,6 +1311,8 @@ class NeonCli(AbstractNeonCli): new_branch_name, "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--pg-version", + pg_version, ] res = self.raw_cli(cmd) @@ -1317,6 +1334,8 @@ class NeonCli(AbstractNeonCli): branch_name, "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--pg-version", + pg_version, ] res = self.raw_cli(cmd) @@ -1395,6 +1414,9 @@ class NeonCli(AbstractNeonCli): cmd = ["init", f"--config={tmp.name}"] if initial_timeline_id: cmd.extend(["--timeline-id", str(initial_timeline_id)]) + + cmd.extend(["--pg-version", pg_version]) + append_pageserver_param_overrides( params_to_update=cmd, remote_storage=self.env.remote_storage, @@ -1476,6 +1498,8 @@ class NeonCli(AbstractNeonCli): str(tenant_id or self.env.initial_tenant), "--branch-name", branch_name, + "--pg-version", + pg_version, ] if lsn is not None: args.extend(["--lsn", str(lsn)]) @@ -1500,6 +1524,8 @@ class NeonCli(AbstractNeonCli): "start", "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--pg-version", + pg_version, ] if lsn is not None: args.append(f"--lsn={lsn}") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 885a0dc26f..417595ae4d 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -14,6 +14,7 @@ from fixtures.neon_fixtures import ( PgBin, Postgres, pg_distrib_dir, + pg_version, wait_for_last_record_lsn, wait_for_upload, ) @@ -96,6 +97,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build end_lsn, "--wal-tarfile", wal, + "--pg-version", + pg_version, ] ) @@ -248,6 +251,8 @@ def _import( str(lsn), "--base-tarfile", os.path.join(tar_output_file), + "--pg-version", + pg_version, ] ) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index aa5a65f446..4934fb9354 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -5,7 +5,13 @@ import os from pathlib import Path import pytest -from fixtures.neon_fixtures import NeonEnv, base_dir, check_restored_datadir_content, pg_distrib_dir +from fixtures.neon_fixtures import ( + NeonEnv, + base_dir, + check_restored_datadir_content, + pg_distrib_dir, + pg_version, +) # Run the main PostgreSQL regression tests, in src/test/regress. @@ -26,8 +32,8 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, cap (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. 
- build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/regress") - src_path = os.path.join(base_dir, "vendor/postgres-v14/src/test/regress") + build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/regress").format(pg_version) + src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/regress").format(pg_version) bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "parallel_schedule") pg_regress = os.path.join(build_path, "pg_regress") @@ -80,8 +86,8 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, caps (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. - build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/isolation") - src_path = os.path.join(base_dir, "vendor/postgres-v14/src/test/isolation") + build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/isolation".format(pg_version)) + src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/isolation".format(pg_version)) bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "isolation_schedule") pg_isolation_regress = os.path.join(build_path, "pg_isolation_regress") @@ -124,7 +130,7 @@ def test_sql_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, ca # Compute all the file locations that pg_regress will need. # This test runs neon specific tests - build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/regress") + build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/regress").format(pg_version) src_path = os.path.join(base_dir, "test_runner/sql_regress") bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "parallel_schedule") diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 931de0f1e3..73e26bd207 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -29,6 +29,7 @@ from fixtures.neon_fixtures import ( SafekeeperPort, available_remote_storages, neon_binpath, + pg_version, wait_for_last_record_lsn, wait_for_upload, ) @@ -634,6 +635,9 @@ class ProposerPostgres(PgProtocol): } basepath = self.pg_bin.run_capture(command, env) + + log.info(f"postgres --sync-safekeepers output: {basepath}") + stdout_filename = basepath + ".stdout" with open(stdout_filename, "r") as stdout_f: @@ -662,7 +666,9 @@ class ProposerPostgres(PgProtocol): # insert wal in all safekeepers and run sync on proposer def test_sync_safekeepers( - neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + port_distributor: PortDistributor, ): # We don't really need the full environment for this test, just the @@ -699,6 +705,7 @@ def test_sync_safekeepers( "begin_lsn": int(begin_lsn), "epoch_start_lsn": int(epoch_start_lsn), "truncate_lsn": int(epoch_start_lsn), + "pg_version": int(pg_version) * 10000, }, ) lsn = Lsn(res["inserted_wal"]["end_lsn"]) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 19d948fd47..796770565f 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 19d948fd47f45d83367062d9a54709cf2d9c8921 +Subproject commit 796770565ff668b585e80733b8d679961ad50e93 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 5b8b3eeef5..9383aaa9c2 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 5b8b3eeef5ec34c0cad9377833906a1387841d04 
+Subproject commit 9383aaa9c2616fd81cfafb058fe0d692f5e43ac3 From 9dfede81467aaaabf21518a949ce870d735155e5 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 14 Sep 2022 18:34:30 +0300 Subject: [PATCH 0815/1022] Handle backwards-compatibility of TimelineMetadata. This commit bumps TimelineMetadata format version and makes it independent from STORAGE_FORMAT_VERSION. --- pageserver/src/lib.rs | 6 +- pageserver/src/tenant/metadata.rs | 161 ++++++++++++++++++++++++++---- 2 files changed, 148 insertions(+), 19 deletions(-) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 0bd5e242d3..7937f72de7 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -31,9 +31,11 @@ use crate::task_mgr::TaskKind; /// Current storage format version /// -/// This is embedded in the metadata file, and also in the header of all the -/// layer files. If you make any backwards-incompatible changes to the storage +/// This is embedded in the header of all the layer files. +/// If you make any backwards-incompatible changes to the storage /// format, bump this! +/// Note that TimelineMetadata uses its own version number to track +/// backwards-compatible changes to the metadata format. pub const STORAGE_FORMAT_VERSION: u16 = 3; pub const DEFAULT_PG_VERSION: u32 = 14; diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 41790b4d11..6d18153b4c 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -20,7 +20,12 @@ use utils::{ use crate::config::PageServerConf; use crate::virtual_file::VirtualFile; -use crate::STORAGE_FORMAT_VERSION; + +/// Use special format number to enable backward compatibility. +const METADATA_FORMAT_VERSION: u16 = 4; + +/// Previous supported format versions. +const METADATA_OLD_FORMAT_VERSION: u16 = 3; /// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic. /// @@ -34,19 +39,19 @@ const METADATA_MAX_SIZE: usize = 512; #[derive(Debug, Clone, PartialEq, Eq)] pub struct TimelineMetadata { hdr: TimelineMetadataHeader, - body: TimelineMetadataBody, + body: TimelineMetadataBodyV2, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] struct TimelineMetadataHeader { checksum: u32, // CRC of serialized metadata body size: u16, // size of serialized metadata - format_version: u16, // storage format version (used for compatibility checks) + format_version: u16, // metadata format version (used for compatibility checks) } const METADATA_HDR_SIZE: usize = std::mem::size_of::(); #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -struct TimelineMetadataBody { +struct TimelineMetadataBodyV2 { disk_consistent_lsn: Lsn, // This is only set if we know it. We track it in memory when the page // server is running, but we only track the value corresponding to @@ -66,6 +71,26 @@ struct TimelineMetadataBody { pg_version: u32, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +struct TimelineMetadataBodyV1 { + disk_consistent_lsn: Lsn, + // This is only set if we know it. We track it in memory when the page + // server is running, but we only track the value corresponding to + // 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a + // lot. We only store it in the metadata file when we flush *all* the + // in-memory data so that 'last_record_lsn' is the same as + // 'disk_consistent_lsn'. 
That's OK, because after page server restart, as + // soon as we reprocess at least one record, we will have a valid + // 'prev_record_lsn' value in memory again. This is only really needed when + // doing a clean shutdown, so that there is no more WAL beyond + // 'disk_consistent_lsn' + prev_record_lsn: Option, + ancestor_timeline: Option, + ancestor_lsn: Lsn, + latest_gc_cutoff_lsn: Lsn, + initdb_lsn: Lsn, +} + impl TimelineMetadata { pub fn new( disk_consistent_lsn: Lsn, @@ -80,9 +105,9 @@ impl TimelineMetadata { hdr: TimelineMetadataHeader { checksum: 0, size: 0, - format_version: STORAGE_FORMAT_VERSION, + format_version: METADATA_FORMAT_VERSION, }, - body: TimelineMetadataBody { + body: TimelineMetadataBodyV2 { disk_consistent_lsn, prev_record_lsn, ancestor_timeline, @@ -94,16 +119,43 @@ impl TimelineMetadata { } } + fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result { + let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; + + // backward compatible only up to this version + ensure!( + hdr.format_version == METADATA_OLD_FORMAT_VERSION, + "unsupported metadata format version {}", + hdr.format_version + ); + + let metadata_size = hdr.size as usize; + + let body: TimelineMetadataBodyV1 = + TimelineMetadataBodyV1::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; + + let body = TimelineMetadataBodyV2 { + disk_consistent_lsn: body.disk_consistent_lsn, + prev_record_lsn: body.prev_record_lsn, + ancestor_timeline: body.ancestor_timeline, + ancestor_lsn: body.ancestor_lsn, + latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn, + initdb_lsn: body.initdb_lsn, + pg_version: 14, // All timelines created before this version had pg_version 14 + }; + + hdr.format_version = METADATA_FORMAT_VERSION; + + Ok(Self { hdr, body }) + } + pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result { ensure!( metadata_bytes.len() == METADATA_MAX_SIZE, "metadata bytes size is wrong" ); let hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; - ensure!( - hdr.format_version == STORAGE_FORMAT_VERSION, - "format version mismatch" - ); + let metadata_size = hdr.size as usize; ensure!( metadata_size <= METADATA_MAX_SIZE, @@ -114,13 +166,20 @@ impl TimelineMetadata { hdr.checksum == calculated_checksum, "metadata checksum mismatch" ); - let body = TimelineMetadataBody::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; - ensure!( - body.disk_consistent_lsn.is_aligned(), - "disk_consistent_lsn is not aligned" - ); - Ok(TimelineMetadata { hdr, body }) + if hdr.format_version != METADATA_FORMAT_VERSION { + // If metadata has the old format, + // upgrade it and return the result + TimelineMetadata::upgrade_timeline_metadata(&metadata_bytes) + } else { + let body = + TimelineMetadataBodyV2::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; + ensure!( + body.disk_consistent_lsn.is_aligned(), + "disk_consistent_lsn is not aligned" + ); + Ok(TimelineMetadata { hdr, body }) + } } pub fn to_bytes(&self) -> anyhow::Result> { @@ -128,7 +187,7 @@ impl TimelineMetadata { let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); let hdr = TimelineMetadataHeader { size: metadata_size as u16, - format_version: STORAGE_FORMAT_VERSION, + format_version: METADATA_FORMAT_VERSION, checksum: crc32c::crc32c(&body_bytes), }; let hdr_bytes = hdr.ser()?; @@ -235,4 +294,72 @@ mod tests { "Metadata that was serialized to bytes and deserialized back should not change" ); } + + // Generate old version metadata and read it with current code. 
+ // Ensure that it is upgraded correctly + #[test] + fn test_metadata_upgrade() { + #[derive(Debug, Clone, PartialEq, Eq)] + struct TimelineMetadataV1 { + hdr: TimelineMetadataHeader, + body: TimelineMetadataBodyV1, + } + + let metadata_v1 = TimelineMetadataV1 { + hdr: TimelineMetadataHeader { + checksum: 0, + size: 0, + format_version: METADATA_OLD_FORMAT_VERSION, + }, + body: TimelineMetadataBodyV1 { + disk_consistent_lsn: Lsn(0x200), + prev_record_lsn: Some(Lsn(0x100)), + ancestor_timeline: Some(TIMELINE_ID), + ancestor_lsn: Lsn(0), + latest_gc_cutoff_lsn: Lsn(0), + initdb_lsn: Lsn(0), + }, + }; + + impl TimelineMetadataV1 { + pub fn to_bytes(&self) -> anyhow::Result> { + let body_bytes = self.body.ser()?; + let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); + let hdr = TimelineMetadataHeader { + size: metadata_size as u16, + format_version: METADATA_OLD_FORMAT_VERSION, + checksum: crc32c::crc32c(&body_bytes), + }; + let hdr_bytes = hdr.ser()?; + let mut metadata_bytes = vec![0u8; METADATA_MAX_SIZE]; + metadata_bytes[0..METADATA_HDR_SIZE].copy_from_slice(&hdr_bytes); + metadata_bytes[METADATA_HDR_SIZE..metadata_size].copy_from_slice(&body_bytes); + Ok(metadata_bytes) + } + } + + let metadata_bytes = metadata_v1 + .to_bytes() + .expect("Should serialize correct metadata to bytes"); + + // This should deserialize to the latest version format + let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes) + .expect("Should deserialize its own bytes"); + + let expected_metadata = TimelineMetadata::new( + Lsn(0x200), + Some(Lsn(0x100)), + Some(TIMELINE_ID), + Lsn(0), + Lsn(0), + Lsn(0), + 14, // All timelines created before this version had pg_version 14 + ); + + assert_eq!( + deserialized_metadata.body, expected_metadata.body, + "Metadata of the old version {} should be upgraded to the latest version {}", + METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION + ); + } } From 03c606f7c5fbb1bcd2ba79ea0d21849d298c1400 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 14 Sep 2022 19:40:04 +0300 Subject: [PATCH 0816/1022] Pass pg_version parameter to timeline import command. Add pg_version field to LocalTimelineInfo. 
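As context for the command-string change in page_service.rs further down, here is a small self-contained sketch (the `ImportCmd` struct and string-typed IDs/LSNs are simplifications, not the pageserver's real types) of the five-parameter form the command takes once `pg_version` becomes required:

    use std::str::FromStr;

    /// Illustrative stand-ins for the real ID/LSN types; the actual crate
    /// types parse from hex strings, which is omitted here.
    #[derive(Debug)]
    struct ImportCmd {
        tenant_id: String,
        timeline_id: String,
        base_lsn: String,
        end_lsn: String,
        pg_version: u32,
    }

    /// Sketch of parsing the five-field command used by this patch:
    /// "import basebackup <tenant> <timeline> <start_lsn> <end_lsn> <pg_version>".
    fn parse_import_basebackup(query: &str) -> Result<ImportCmd, String> {
        let rest = query
            .strip_prefix("import basebackup ")
            .ok_or_else(|| "not an import basebackup command".to_string())?;
        let params: Vec<&str> = rest.split_whitespace().collect();
        if params.len() != 5 {
            return Err(format!("expected 5 parameters, got {}", params.len()));
        }
        Ok(ImportCmd {
            tenant_id: params[0].to_string(),
            timeline_id: params[1].to_string(),
            base_lsn: params[2].to_string(),
            end_lsn: params[3].to_string(),
            pg_version: u32::from_str(params[4]).map_err(|e| e.to_string())?,
        })
    }

    fn main() {
        let cmd = parse_import_basebackup("import basebackup TENANT TIMELINE 0/16B3740 0/16B3828 15")
            .expect("well-formed command");
        println!("{cmd:?}");
    }

Making the fifth parameter mandatory, instead of falling back to a default version, avoids silently importing a basebackup under the wrong Postgres version, which is the risk the removed fallback noted in its TODO.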
Use pg_version in the export_import_between_pageservers script --- control_plane/src/bin/neon_local.rs | 12 ++++++------ control_plane/src/storage.rs | 6 ++++-- pageserver/src/http/models.rs | 1 + pageserver/src/http/routes.rs | 1 + pageserver/src/page_service.rs | 13 +++---------- scripts/export_import_between_pageservers.py | 4 +++- 6 files changed, 18 insertions(+), 19 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 92782ea235..93947d5326 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -695,18 +695,18 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - // TODO validate both or none are provided let pg_wal = end_lsn.zip(wal_tarfile); - let mut cplane = ComputeControlPlane::load(env.clone())?; - println!("Importing timeline into pageserver ..."); - pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal)?; - println!("Creating node for imported timeline ..."); - env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; - let pg_version = import_match .value_of("pg-version") .unwrap() .parse::() .context("Failed to parse postgres version from the argument string")?; + let mut cplane = ComputeControlPlane::load(env.clone())?; + println!("Importing timeline into pageserver ..."); + pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?; + println!("Creating node for imported timeline ..."); + env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; + cplane.new_node(tenant_id, name, timeline_id, None, None, pg_version)?; println!("Done"); } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 95ade14fbf..9032f99971 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -547,6 +547,7 @@ impl PageServerNode { timeline_id: TimelineId, base: (Lsn, PathBuf), pg_wal: Option<(Lsn, PathBuf)>, + pg_version: u32, ) -> anyhow::Result<()> { let mut client = self.pg_connection_config.connect(NoTls).unwrap(); @@ -565,8 +566,9 @@ impl PageServerNode { }; // Import base - let import_cmd = - format!("import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn}"); + let import_cmd = format!( + "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" + ); let mut writer = client.copy_in(&import_cmd)?; io::copy(&mut base_reader, &mut writer)?; writer.finish()?; diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 851fa881a0..d5559653b2 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -138,6 +138,7 @@ pub struct LocalTimelineInfo { pub last_received_msg_lsn: Option, /// the timestamp (in microseconds) of the last received message pub last_received_msg_ts: Option, + pub pg_version: u32, } #[serde_as] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6892c0b391..a55c6c973e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -130,6 +130,7 @@ fn local_timeline_info_from_timeline( wal_source_connstr, last_received_msg_lsn, last_received_msg_ts, + pg_version: timeline.pg_version, }; Ok(info) } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index fed5d0dcc4..368b4c8bee 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -959,22 +959,15 @@ impl postgres_backend_async::Handler for PageServerHandler { // 1. Get start/end LSN from backup_manifest file // 2. 
Run: // cat my_backup/base.tar | psql -h $PAGESERVER \ - // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN" + // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!(params.len() >= 4); + ensure!(params.len() == 5); let tenant_id = TenantId::from_str(params[0])?; let timeline_id = TimelineId::from_str(params[1])?; let base_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; - - let pg_version = if params.len() == 5 { - u32::from_str(params[4])? - } else { - // If version is not provided, assume default. - // TODO: this may lead to weird errors if the version is wrong. - crate::DEFAULT_PG_VERSION - }; + let pg_version = u32::from_str(params[4])?; self.check_permission(Some(tenant_id))?; diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index af847be49e..0fccf5199d 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -470,9 +470,10 @@ def import_timeline( last_lsn, prev_lsn, tar_filename, + pg_version, ): # Import timelines to new pageserver - import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn}" + import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}" full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") @@ -594,6 +595,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, tar_filename, + timeline["local"]["pg_version"], ) # Re-export and compare From a4397d43e997247f703b28baa81d5ffa727a65bd Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 14 Sep 2022 20:16:22 +0300 Subject: [PATCH 0817/1022] Rename waldecoder -> waldecoder_handler.rs. Add comments --- libs/postgres_ffi/src/lib.rs | 8 +++++--- .../src/{waldecoder.rs => waldecoder_handler.rs} | 11 +++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) rename libs/postgres_ffi/src/{waldecoder.rs => waldecoder_handler.rs} (95%) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 25e1f6029c..1a6620a180 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -31,7 +31,7 @@ macro_rules! postgres_ffi { } pub mod controlfile_utils; pub mod nonrelfile_utils; - pub mod waldecoder; + pub mod waldecoder_handler; pub mod xlog_utils; pub const PG_MAJORVERSION: &str = stringify!($version); @@ -216,12 +216,14 @@ pub mod waldecoder { pub fn poll_decode(&mut self) -> Result, WalDecodeError> { match self.pg_version { + // This is a trick to support both versions simultaneously. + // See WalStreamDecoderHandler comments. 
14 => { - use self::v14::waldecoder::WalStreamDecoderHandler; + use self::v14::waldecoder_handler::WalStreamDecoderHandler; self.poll_decode_internal() } 15 => { - use self::v15::waldecoder::WalStreamDecoderHandler; + use self::v15::waldecoder_handler::WalStreamDecoderHandler; self.poll_decode_internal() } _ => Err(WalDecodeError { diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder_handler.rs similarity index 95% rename from libs/postgres_ffi/src/waldecoder.rs rename to libs/postgres_ffi/src/waldecoder_handler.rs index 5b46d52321..b4d50375bd 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder_handler.rs @@ -26,8 +26,15 @@ pub trait WalStreamDecoderHandler { } // -// WalRecordStream is a Stream that returns a stream of WAL records -// FIXME: This isn't a proper rust stream +// This is a trick to support several postgres versions simultaneously. +// +// Page decoding code depends on postgres bindings, so it is compiled for each version. +// Thus WalStreamDecoder implements several WalStreamDecoderHandler traits. +// WalStreamDecoder poll_decode() method dispatches to the right handler based on the postgres version. +// Other methods are internal and are not dispatched. +// +// It is similar to having several impl blocks for the same struct, +// but the impls here are in different modules, so need to use a trait. // impl WalStreamDecoderHandler for WalStreamDecoder { fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> { From a69e060f0f0683683c33fa39128173aadc35a04b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 14 Sep 2022 20:38:59 +0300 Subject: [PATCH 0818/1022] fix clippy warning --- pageserver/src/tenant/metadata.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 6d18153b4c..3fb9ccd936 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -170,7 +170,7 @@ impl TimelineMetadata { if hdr.format_version != METADATA_FORMAT_VERSION { // If metadata has the old format, // upgrade it and return the result - TimelineMetadata::upgrade_timeline_metadata(&metadata_bytes) + TimelineMetadata::upgrade_timeline_metadata(metadata_bytes) } else { let body = TimelineMetadataBodyV2::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; From d45de3d58f12fb143963faf61cd874831e3cc6a9 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 15 Sep 2022 17:27:10 +0300 Subject: [PATCH 0819/1022] update build scripts to match pg_distrib_dir versioning schema --- .github/actions/run-python-test-set/action.yml | 4 ++-- .github/workflows/pg_clients.yml | 4 ++-- Dockerfile | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index e69cb28df1..fc3b1c9c37 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -85,7 +85,7 @@ runs: # PLATFORM will be embedded in the perf test report # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} - export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install/v14} + export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 @@ -126,7 +126,7 @@ runs: # Wake up the cluster if we use remote neon instance if [ "${{ 
inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + ${POSTGRES_DISTRIB_DIR}/v14/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" fi # Run the tests. diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index d04d002811..0600f9234f 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -58,12 +58,12 @@ jobs: env: REMOTE_ENV: 1 BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install/v14 + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install shell: bash -euxo pipefail {0} run: | # Test framework expects we have psql binary; # but since we don't really need it in this test, let's mock it - mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql"; + mkdir -p "$POSTGRES_DISTRIB_DIR/v14/bin" && touch "$POSTGRES_DISTRIB_DIR/v14/bin/psql"; ./scripts/pytest \ --junitxml=$TEST_OUTPUT/junit.xml \ --tb=short \ diff --git a/Dockerfile b/Dockerfile index 213934a844..876a20cc1a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -68,8 +68,8 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin -# v14 is default for now -COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/ +COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ +COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. @@ -78,7 +78,7 @@ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ && /usr/local/bin/pageserver -D /data/.neon/ --init \ -c "id=1234" \ -c "broker_endpoints=['http://etcd:2379']" \ - -c "pg_distrib_dir='/usr/local'" \ + -c "pg_distrib_dir='/usr/local/'" \ -c "listen_pg_addr='0.0.0.0:6400'" \ -c "listen_http_addr='0.0.0.0:9898'" From 5dddeb8d88354621a6b1e690057b16ce1a5c6a79 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 15 Sep 2022 17:40:29 +0300 Subject: [PATCH 0820/1022] Use non-versioned pg_distrib dir --- control_plane/src/local_env.rs | 29 ++++++++++-------------- control_plane/src/storage.rs | 2 +- docs/settings.md | 2 ++ pageserver/src/config.rs | 40 ++++++++++------------------------ pageserver/src/http/routes.rs | 2 +- 5 files changed, 26 insertions(+), 49 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 14bb4cf346..f4fbc99420 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -197,25 +197,18 @@ impl Default for SafekeeperConf { } impl LocalEnv { + pub fn pg_distrib_dir_raw(&self) -> PathBuf { + self.pg_distrib_dir.clone() + } + pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { - let mut path = self.pg_distrib_dir.clone(); + let path = self.pg_distrib_dir.clone(); - if pg_version != DEFAULT_PG_VERSION { - // step up to the parent directory - // We assume that the pg_distrib subdirs - // for different pg versions - // are located in the same directory - // and follow the naming convention: v14, v15, etc. 
- path.pop(); - - match pg_version { - 14 => return path.join(format!("v{pg_version}")), - 15 => return path.join(format!("v{pg_version}")), - _ => panic!("Unsupported postgres version: {}", pg_version), - }; + match pg_version { + 14 => path.join(format!("v{pg_version}")), + 15 => path.join(format!("v{pg_version}")), + _ => panic!("Unsupported postgres version: {}", pg_version), } - - path } pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { @@ -319,7 +312,7 @@ impl LocalEnv { let mut env: LocalEnv = toml::from_str(toml)?; // Find postgres binaries. - // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install/v14". + // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". // Note that later in the code we assume, that distrib dirs follow the same pattern // for all postgres versions. if env.pg_distrib_dir == Path::new("") { @@ -327,7 +320,7 @@ impl LocalEnv { env.pg_distrib_dir = postgres_bin.into(); } else { let cwd = env::current_dir()?; - env.pg_distrib_dir = cwd.join("pg_install/v14") + env.pg_distrib_dir = cwd.join("pg_install") } } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 9032f99971..bfbd6e91c3 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -118,7 +118,7 @@ impl PageServerNode { // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. let pg_distrib_dir_param = format!( "pg_distrib_dir='{}'", - self.env.pg_distrib_dir(pg_version).display() + self.env.pg_distrib_dir_raw().display() ); let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); diff --git a/docs/settings.md b/docs/settings.md index 30db495dbe..878681fce1 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -155,6 +155,8 @@ for other files and for sockets for incoming connections. #### pg_distrib_dir A directory with Postgres installation to use during pageserver activities. +Since pageserver supports several postgres versions, `pg_distrib_dir` contains +a subdirectory for each version with naming convention `v{PG_MAJOR_VERSION}/`. Inside that dir, a `bin/postgres` binary should be present. The default distrib dir is `./pg_install/`. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index a4346c0190..b75f8f8265 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -21,7 +21,6 @@ use utils::{ use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::DEFAULT_PG_VERSION; /// The name of the metadata file pageserver creates per timeline. pub const METADATA_FILE_NAME: &str = "metadata"; @@ -210,7 +209,7 @@ impl Default for PageServerConfigBuilder { workdir: Set(PathBuf::new()), pg_distrib_dir: Set(env::current_dir() .expect("cannot access current directory") - .join(format!("pg_install/v{}", DEFAULT_PG_VERSION))), + .join(format!("pg_install",))), auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), @@ -376,24 +375,13 @@ impl PageServerConf { // Postgres distribution paths // pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { - let mut path = self.pg_distrib_dir.clone(); + let path = self.pg_distrib_dir.clone(); - if pg_version != DEFAULT_PG_VERSION { - // step up to the parent directory - // We assume that the pg_distrib subdirs - // for different pg versions - // are located in the same directory - // and follow the naming convention: v14, v15, etc. 
- path.pop(); - - match pg_version { - 14 => return path.join(format!("v{pg_version}")), - 15 => return path.join(format!("v{pg_version}")), - _ => panic!("Unsupported postgres version: {}", pg_version), - }; + match pg_version { + 14 => path.join(format!("v{pg_version}")), + 15 => path.join(format!("v{pg_version}")), + _ => panic!("Unsupported postgres version: {}", pg_version), } - - path } pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { @@ -477,14 +465,6 @@ impl PageServerConf { ); } - let pg_version = DEFAULT_PG_VERSION; - if !conf.pg_bin_dir(pg_version).join("postgres").exists() { - bail!( - "Can't find postgres binary at {}", - conf.pg_bin_dir(pg_version).display() - ); - } - conf.default_tenant_conf = t_conf.merge(TenantConf::default()); Ok(conf) @@ -654,6 +634,7 @@ mod tests { use tempfile::{tempdir, TempDir}; use super::*; + use crate::DEFAULT_PG_VERSION; const ALL_BASE_VALUES_TOML: &str = r#" # Initial configuration file created by 'pageserver --init' @@ -892,9 +873,10 @@ broker_endpoints = ['{broker_endpoint}'] let workdir = tempdir_path.join("workdir"); fs::create_dir_all(&workdir)?; - let pg_distrib_dir = tempdir_path.join(format!("pg_distrib/v{DEFAULT_PG_VERSION}")); - fs::create_dir_all(&pg_distrib_dir)?; - let postgres_bin_dir = pg_distrib_dir.join("bin"); + let pg_distrib_dir = tempdir_path.join("pg_distrib"); + let pg_distrib_dir_versioned = pg_distrib_dir.join(format!("v{DEFAULT_PG_VERSION}")); + fs::create_dir_all(&pg_distrib_dir_versioned)?; + let postgres_bin_dir = pg_distrib_dir_versioned.join("bin"); fs::create_dir_all(&postgres_bin_dir)?; fs::write(postgres_bin_dir.join("postgres"), "I'm postgres, trust me")?; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a55c6c973e..72cbb0e819 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -174,7 +174,7 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. From 1255ef806feea438c45dc3ee808ab53deefca6c6 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Sun, 18 Sep 2022 21:10:00 +0300 Subject: [PATCH 0821/1022] pass version to wal_craft.rs --- libs/postgres_ffi/src/xlog_utils.rs | 3 ++- libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs | 15 ++++++++++++++- libs/postgres_ffi/wal_craft/src/lib.rs | 15 +++++++++++++-- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 8389a6e971..038e0491a0 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -471,7 +471,8 @@ mod tests { .join("..") .join(".."); let cfg = Conf { - pg_distrib_dir: top_path.join(format!("pg_install/{PG_MAJORVERSION}")), + pg_version: PG_MAJORVERSION, + pg_distrib_dir: top_path.join(format!("pg_install")), datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)), }; if cfg.datadir.exists() { diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 2a607db6dc..9b9f76de7c 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -37,9 +37,16 @@ fn main() -> Result<()> { Arg::new("pg-distrib-dir") .long("pg-distrib-dir") .takes_value(true) - .help("Directory with Postgres distribution (bin and lib directories, e.g. pg_install/v14)") + .help("Directory with Postgres distribution (bin and lib directories, e.g. 
pg_install)") .default_value("/usr/local") ) + .arg( + Arg::new("pg-version") + .long("pg-version") + .help("Postgres version to use for the initial tenant") + .required(true) + .takes_value(true) + ) ) .subcommand( App::new("in-existing") @@ -82,8 +89,14 @@ fn main() -> Result<()> { } Ok(()) } + Some(("with-initdb", arg_matches)) => { let cfg = Conf { + pg_version: arg_matches + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?, pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(), datadir: arg_matches.value_of("datadir").unwrap().into(), }; diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 2ad92d776d..7ffe19e209 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -15,6 +15,7 @@ use tempfile::{tempdir, TempDir}; #[derive(Debug, Clone, PartialEq, Eq)] pub struct Conf { + pub pg_version: u32, pub pg_distrib_dir: PathBuf, pub datadir: PathBuf, } @@ -36,12 +37,22 @@ pub static REQUIRED_POSTGRES_CONFIG: Lazy> = Lazy::new(|| { }); impl Conf { + pub fn pg_distrib_dir(&self) -> PathBuf { + let path = self.pg_distrib_dir.clone(); + + match self.pg_version { + 14 => path.join(format!("v{}", self.pg_version)), + 15 => path.join(format!("v{}", self.pg_version)), + _ => panic!("Unsupported postgres version: {}", self.pg_version), + } + } + fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("bin") + self.pg_distrib_dir().join("bin") } fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("lib") + self.pg_distrib_dir().join("lib") } pub fn wal_dir(&self) -> PathBuf { From 0fde59aa4628c3e25048e014e1519e3c83462092 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Sun, 18 Sep 2022 22:45:29 +0300 Subject: [PATCH 0822/1022] use pg_version in python tests --- scripts/export_import_between_pageservers.py | 33 ++++-- test_runner/fixtures/neon_fixtures.py | 106 ++++++++++++------- test_runner/regress/test_import.py | 5 +- test_runner/regress/test_pg_regress.py | 26 ++--- test_runner/regress/test_wal_acceptor.py | 3 +- test_runner/regress/test_wal_restore.py | 4 +- 6 files changed, 109 insertions(+), 68 deletions(-) diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 0fccf5199d..1285d0476b 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -80,11 +80,13 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: class PgBin: """A helper class for executing postgres binaries""" - def __init__(self, log_dir: Path, pg_distrib_dir): + def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "bin") + self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), "lib") + self.env["LD_LIBRARY_PATH"] = os.path.join( + str(pg_distrib_dir), "v{}".format(pg_version), "lib" + ) def _fixpath(self, command: List[str]): if "/" not in command[0]: @@ -484,7 +486,7 @@ def import_timeline( with open(stdout_filename, "w") as stdout_f: with open(stderr_filename2, "w") as stderr_f: print(f"(capturing output to {stdout_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) subprocess.run( full_cmd, 
stdout=stdout_f, @@ -503,7 +505,15 @@ def import_timeline( def export_timeline( - args, psql_path, pageserver_connstr, tenant_id, timeline_id, last_lsn, prev_lsn, tar_filename + args, + psql_path, + pageserver_connstr, + tenant_id, + timeline_id, + last_lsn, + prev_lsn, + tar_filename, + pg_version, ): # Choose filenames incomplete_filename = tar_filename + ".incomplete" @@ -518,13 +528,13 @@ def export_timeline( with open(incomplete_filename, "w") as stdout_f: with open(stderr_filename, "w") as stderr_f: print(f"(capturing output to {incomplete_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) subprocess.run( cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True ) # Add missing rels - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin) # Log more info @@ -533,7 +543,8 @@ def export_timeline( def main(args: argparse.Namespace): - psql_path = str(Path(args.pg_distrib_dir) / "bin" / "psql") + # any psql version will do here. use current DEFAULT_PG_VERSION = 14 + psql_path = str(Path(args.pg_distrib_dir) / "v14" / "bin" / "psql") old_pageserver_host = args.old_pageserver_host new_pageserver_host = args.new_pageserver_host @@ -566,6 +577,8 @@ def main(args: argparse.Namespace): args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" ) + pg_version = timeline["local"]["pg_version"] + # Export timeline from old pageserver if args.only_import is False: last_lsn, prev_lsn = get_rlsn( @@ -582,6 +595,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, tar_filename, + pg_version, ) # Import into new pageserver @@ -595,7 +609,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, tar_filename, - timeline["local"]["pg_version"], + pg_version, ) # Re-export and compare @@ -609,6 +623,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, re_export_filename, + pg_version, ) # Check the size is the same diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c1ebc6aa7d..3c60437426 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -59,8 +59,8 @@ Env = Dict[str, str] Fn = TypeVar("Fn", bound=Callable[..., Any]) DEFAULT_OUTPUT_DIR = "test_output" -DEFAULT_PG_VERSION_DEFAULT = "14" DEFAULT_BRANCH_NAME = "main" +DEFAULT_PG_VERSION_DEFAULT = "14" BASE_PORT = 15000 WORKER_PORT_NUM = 1000 @@ -71,7 +71,7 @@ base_dir = "" neon_binpath = "" pg_distrib_dir = "" top_output_dir = "" -pg_version = "" +default_pg_version = "" def pytest_configure(config): @@ -101,29 +101,36 @@ def pytest_configure(config): Path(top_output_dir).mkdir(exist_ok=True) # Find the postgres installation. - global pg_version - pg_version = os.environ.get("DEFAULT_PG_VERSION", DEFAULT_PG_VERSION_DEFAULT) + global default_pg_version + log.info(f"default_pg_version is {default_pg_version}") + env_default_pg_version = os.environ.get("DEFAULT_PG_VERSION") + if env_default_pg_version: + default_pg_version = env_default_pg_version + log.info(f"default_pg_version is set to {default_pg_version}") + else: + default_pg_version = DEFAULT_PG_VERSION_DEFAULT global pg_distrib_dir - # TODO get rid of the POSTGRES_DISTRIB_DIR env var ? 
- # use DEFAULT_PG_VERSION instead to generate the path env_postgres_bin = os.environ.get("POSTGRES_DISTRIB_DIR") if env_postgres_bin: pg_distrib_dir = env_postgres_bin else: - pg_distrib_dir = os.path.normpath( - os.path.join(base_dir, "pg_install/v{}".format(pg_version)) - ) + pg_distrib_dir = os.path.normpath(os.path.join(base_dir, "pg_install")) log.info(f"pg_distrib_dir is {pg_distrib_dir}") + psql_bin_path = os.path.join(pg_distrib_dir, "v{}".format(default_pg_version), "bin/psql") + postgres_bin_path = os.path.join( + pg_distrib_dir, "v{}".format(default_pg_version), "bin/postgres" + ) + if os.getenv("REMOTE_ENV"): # When testing against a remote server, we only need the client binary. - if not os.path.exists(os.path.join(pg_distrib_dir, "bin/psql")): - raise Exception('psql not found at "{}"'.format(pg_distrib_dir)) + if not os.path.exists(psql_bin_path): + raise Exception('psql not found at "{}"'.format(psql_bin_path)) else: - if not os.path.exists(os.path.join(pg_distrib_dir, "bin/postgres")): - raise Exception('postgres not found at "{}"'.format(pg_distrib_dir)) + if not os.path.exists(postgres_bin_path): + raise Exception('postgres not found at "{}"'.format(postgres_bin_path)) if os.getenv("REMOTE_ENV"): # we are in remote env and do not have neon binaries locally @@ -549,6 +556,7 @@ class NeonEnvBuilder: self.env: Optional[NeonEnv] = None self.remote_storage_prefix: Optional[str] = None self.keep_remote_storage_contents: bool = True + self.pg_version = default_pg_version def init(self) -> NeonEnv: # Cannot create more than one environment from one builder @@ -761,6 +769,7 @@ class NeonEnv: self.broker = config.broker self.remote_storage = config.remote_storage self.remote_storage_users = config.remote_storage_users + self.pg_version = config.pg_version # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. 
@@ -1195,7 +1204,6 @@ class AbstractNeonCli(abc.ABC): env_vars = os.environ.copy() env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir) env_vars["POSTGRES_DISTRIB_DIR"] = str(pg_distrib_dir) - env_vars["DEFAULT_PG_VERSION"] = str(pg_version) if self.env.rust_log_override is not None: env_vars["RUST_LOG"] = self.env.rust_log_override for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): @@ -1263,7 +1271,7 @@ class NeonCli(AbstractNeonCli): "--timeline-id", str(timeline_id), "--pg-version", - pg_version, + self.env.pg_version, ] ) else: @@ -1276,7 +1284,7 @@ class NeonCli(AbstractNeonCli): "--timeline-id", str(timeline_id), "--pg-version", - pg_version, + self.env.pg_version, ] + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) ) @@ -1302,7 +1310,9 @@ class NeonCli(AbstractNeonCli): return res def create_timeline( - self, new_branch_name: str, tenant_id: Optional[TenantId] = None + self, + new_branch_name: str, + tenant_id: Optional[TenantId] = None, ) -> TimelineId: cmd = [ "timeline", @@ -1312,7 +1322,7 @@ class NeonCli(AbstractNeonCli): "--tenant-id", str(tenant_id or self.env.initial_tenant), "--pg-version", - pg_version, + self.env.pg_version, ] res = self.raw_cli(cmd) @@ -1326,7 +1336,11 @@ class NeonCli(AbstractNeonCli): return TimelineId(str(created_timeline_id)) - def create_root_branch(self, branch_name: str, tenant_id: Optional[TenantId] = None): + def create_root_branch( + self, + branch_name: str, + tenant_id: Optional[TenantId] = None, + ): cmd = [ "timeline", "create", @@ -1335,7 +1349,7 @@ class NeonCli(AbstractNeonCli): "--tenant-id", str(tenant_id or self.env.initial_tenant), "--pg-version", - pg_version, + self.env.pg_version, ] res = self.raw_cli(cmd) @@ -1405,7 +1419,9 @@ class NeonCli(AbstractNeonCli): return timelines_cli def init( - self, config_toml: str, initial_timeline_id: Optional[TimelineId] = None + self, + config_toml: str, + initial_timeline_id: Optional[TimelineId] = None, ) -> "subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: tmp.write(config_toml) @@ -1415,7 +1431,7 @@ class NeonCli(AbstractNeonCli): if initial_timeline_id: cmd.extend(["--timeline-id", str(initial_timeline_id)]) - cmd.extend(["--pg-version", pg_version]) + cmd.extend(["--pg-version", self.env.pg_version]) append_pageserver_param_overrides( params_to_update=cmd, @@ -1443,7 +1459,10 @@ class NeonCli(AbstractNeonCli): log.info(f"pageserver_enabled_features success: {res.stdout}") return json.loads(res.stdout) - def pageserver_start(self, overrides=()) -> "subprocess.CompletedProcess[str]": + def pageserver_start( + self, + overrides=(), + ) -> "subprocess.CompletedProcess[str]": start_args = ["pageserver", "start", *overrides] append_pageserver_param_overrides( params_to_update=start_args, @@ -1499,7 +1518,7 @@ class NeonCli(AbstractNeonCli): "--branch-name", branch_name, "--pg-version", - pg_version, + self.env.pg_version, ] if lsn is not None: args.extend(["--lsn", str(lsn)]) @@ -1525,7 +1544,7 @@ class NeonCli(AbstractNeonCli): "--tenant-id", str(tenant_id or self.env.initial_tenant), "--pg-version", - pg_version, + self.env.pg_version, ] if lsn is not None: args.append(f"--lsn={lsn}") @@ -1655,11 +1674,13 @@ def append_pageserver_param_overrides( class PgBin: """A helper class for executing postgres binaries""" - def __init__(self, log_dir: Path): + def __init__(self, log_dir: Path, pg_version: str): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "bin") + self.pg_version = 
pg_version + self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") + self.pg_lib_dir = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "lib") self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), "lib") + self.env["LD_LIBRARY_PATH"] = self.pg_lib_dir def _fixpath(self, command: List[str]): if "/" not in command[0]: @@ -1714,8 +1735,8 @@ class PgBin: @pytest.fixture(scope="function") -def pg_bin(test_output_dir: Path) -> PgBin: - return PgBin(test_output_dir) +def pg_bin(test_output_dir: Path, pg_version: str) -> PgBin: + return PgBin(test_output_dir, pg_version) class VanillaPostgres(PgProtocol): @@ -1762,12 +1783,19 @@ class VanillaPostgres(PgProtocol): self.stop() +@pytest.fixture(scope="session") +def pg_version() -> str: + return default_pg_version + + @pytest.fixture(scope="function") def vanilla_pg( - test_output_dir: Path, port_distributor: PortDistributor + test_output_dir: Path, + port_distributor: PortDistributor, + pg_version: str, ) -> Iterator[VanillaPostgres]: pgdatadir = test_output_dir / "pgdata-vanilla" - pg_bin = PgBin(test_output_dir) + pg_bin = PgBin(test_output_dir, pg_version) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: yield vanilla_pg @@ -1803,8 +1831,8 @@ class RemotePostgres(PgProtocol): @pytest.fixture(scope="function") -def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]: - pg_bin = PgBin(test_output_dir) +def remote_pg(test_output_dir: Path, pg_version: str) -> Iterator[RemotePostgres]: + pg_bin = PgBin(test_output_dir, pg_version) connstr = os.getenv("BENCHMARK_CONNSTR") if connstr is None: @@ -2533,7 +2561,11 @@ def list_files_to_compare(pgdata_dir: Path): # pg is the existing and running compute node, that we want to compare with a basebackup -def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Postgres): +def check_restored_datadir_content( + test_output_dir: Path, + env: NeonEnv, + pg: Postgres, +): # Get the timeline ID. We need it for the 'basebackup' command timeline = TimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0]) @@ -2544,7 +2576,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post restored_dir_path = env.repo_dir / f"{pg.node_name}_restored_datadir" restored_dir_path.mkdir(exist_ok=True) - pg_bin = PgBin(test_output_dir) + pg_bin = PgBin(test_output_dir, env.pg_version) psql_path = os.path.join(pg_bin.pg_bin_path, "psql") cmd = rf""" @@ -2557,7 +2589,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {"LD_LIBRARY_PATH": os.path.join(str(pg_distrib_dir), "lib")} + psql_env = {"LD_LIBRARY_PATH": pg_bin.pg_lib_dir} result = subprocess.run(cmd, env=psql_env, capture_output=True, text=True, shell=True) # Print captured stdout/stderr if basebackup cmd failed. 
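As a rough, standalone illustration of what the PgBin fixture changes above amount to (the helper name run_psql and its arguments are invented for this sketch and are not part of the patch, and it assumes the v{PG_VERSION}/ layout exists on disk): pick the version-specific bin/ directory under the Postgres distribution and export a matching LD_LIBRARY_PATH so psql loads the right libpq.

import os
import subprocess
from pathlib import Path

def run_psql(pg_distrib_dir: str, pg_version: str, connstr: str, sql: str) -> str:
    # e.g. pg_distrib_dir="pg_install", pg_version="14" -> pg_install/v14/bin/psql
    prefix = Path(pg_distrib_dir) / f"v{pg_version}"
    env = os.environ.copy()
    # point LD_LIBRARY_PATH at the matching lib/ so the right libpq is picked up
    env["LD_LIBRARY_PATH"] = str(prefix / "lib")
    result = subprocess.run(
        [str(prefix / "bin" / "psql"), connstr, "--no-psqlrc", "-At", "-c", sql],
        env=env,
        capture_output=True,
        text=True,
        check=True,
    )
    return result.stdout.strip()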
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 417595ae4d..c84d282a4d 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -14,7 +14,6 @@ from fixtures.neon_fixtures import ( PgBin, Postgres, pg_distrib_dir, - pg_version, wait_for_last_record_lsn, wait_for_upload, ) @@ -98,7 +97,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build "--wal-tarfile", wal, "--pg-version", - pg_version, + env.pg_version, ] ) @@ -252,7 +251,7 @@ def _import( "--base-tarfile", os.path.join(tar_output_file), "--pg-version", - pg_version, + env.pg_version, ] ) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 4934fb9354..f23811b671 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -5,13 +5,7 @@ import os from pathlib import Path import pytest -from fixtures.neon_fixtures import ( - NeonEnv, - base_dir, - check_restored_datadir_content, - pg_distrib_dir, - pg_version, -) +from fixtures.neon_fixtures import NeonEnv, base_dir, check_restored_datadir_content, pg_distrib_dir # Run the main PostgreSQL regression tests, in src/test/regress. @@ -32,9 +26,9 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, cap (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. - build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/regress").format(pg_version) - src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/regress").format(pg_version) - bindir = os.path.join(pg_distrib_dir, "bin") + build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/regress").format(env.pg_version) + src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/regress").format(env.pg_version) + bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") schedule = os.path.join(src_path, "parallel_schedule") pg_regress = os.path.join(build_path, "pg_regress") @@ -86,9 +80,11 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, caps (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. - build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/isolation".format(pg_version)) - src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/isolation".format(pg_version)) - bindir = os.path.join(pg_distrib_dir, "bin") + build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/isolation".format(env.pg_version)) + src_path = os.path.join( + base_dir, "vendor/postgres-v{}/src/test/isolation".format(env.pg_version) + ) + bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") schedule = os.path.join(src_path, "isolation_schedule") pg_isolation_regress = os.path.join(build_path, "pg_isolation_regress") @@ -130,9 +126,9 @@ def test_sql_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, ca # Compute all the file locations that pg_regress will need. 
# This test runs neon specific tests - build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/regress").format(pg_version) + build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/regress").format(env.pg_version) src_path = os.path.join(base_dir, "test_runner/sql_regress") - bindir = os.path.join(pg_distrib_dir, "bin") + bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") schedule = os.path.join(src_path, "parallel_schedule") pg_regress = os.path.join(build_path, "pg_regress") diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 73e26bd207..d5a5ec2f36 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -29,7 +29,6 @@ from fixtures.neon_fixtures import ( SafekeeperPort, available_remote_storages, neon_binpath, - pg_version, wait_for_last_record_lsn, wait_for_upload, ) @@ -705,7 +704,7 @@ def test_sync_safekeepers( "begin_lsn": int(begin_lsn), "epoch_start_lsn": int(epoch_start_lsn), "truncate_lsn": int(epoch_start_lsn), - "pg_version": int(pg_version) * 10000, + "pg_version": int(env.pg_version) * 10000, }, ) lsn = Lsn(res["inserted_wal"]["end_lsn"]) diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 21921a3bc2..db6f1e5137 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -26,11 +26,11 @@ def test_wal_restore( env.neon_cli.pageserver_stop() port = port_distributor.get_port() data_dir = test_output_dir / "pgsql.restored" - with VanillaPostgres(data_dir, PgBin(test_output_dir), port) as restored: + with VanillaPostgres(data_dir, PgBin(test_output_dir, env.pg_version), port) as restored: pg_bin.run_capture( [ os.path.join(base_dir, "libs/utils/scripts/restore_from_wal.sh"), - os.path.join(pg_distrib_dir, "bin"), + os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin"), str(test_output_dir / "repo" / "safekeepers" / "sk1" / str(tenant_id) / "*"), str(data_dir), str(port), From 8d890b3cbb150136dd6a7eab9556bd006fe18823 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 19 Sep 2022 08:38:30 +0300 Subject: [PATCH 0823/1022] fix clippy warnings --- libs/postgres_ffi/src/xlog_utils.rs | 6 ++++-- pageserver/src/config.rs | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 038e0491a0..2c16cc9cd9 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -466,13 +466,15 @@ mod tests { fn test_end_of_wal(test_name: &str) { use wal_craft::*; + let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + // Craft some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("..") .join(".."); let cfg = Conf { - pg_version: PG_MAJORVERSION, - pg_distrib_dir: top_path.join(format!("pg_install")), + pg_version, + pg_distrib_dir: top_path.join("pg_install"), datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)), }; if cfg.datadir.exists() { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b75f8f8265..a52a3e8262 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -209,7 +209,7 @@ impl Default for PageServerConfigBuilder { workdir: Set(PathBuf::new()), pg_distrib_dir: Set(env::current_dir() .expect("cannot access current directory") - .join(format!("pg_install",))), + .join("pg_install")), auth_type: Set(AuthType::Trust), 
auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), From 862902f9e5846b7edef14b296557a926efec5264 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 19 Sep 2022 14:38:51 +0300 Subject: [PATCH 0824/1022] Update readme and openapi spec --- pageserver/src/http/openapi_spec.yml | 3 +++ test_runner/README.md | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 1f2eba05ec..4e748207c8 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -307,6 +307,7 @@ paths: description: | Create a timeline. Returns new timeline id on success.\ If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline. + If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver. requestBody: content: application/json: @@ -322,6 +323,8 @@ paths: ancestor_start_lsn: type: string format: hex + pg_version: + type: integer responses: "201": description: TimelineInfo diff --git a/test_runner/README.md b/test_runner/README.md index 79b2418af6..d6ee5730ac 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -60,6 +60,12 @@ Useful environment variables: `NEON_BIN`: The directory where neon binaries can be found. `POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found. +Since pageserver supports several postgres versions, `POSTGRES_DISTRIB_DIR` must contain +a subdirectory for each version with naming convention `v{PG_VERSION}/`. +Inside that dir, a `bin/postgres` binary should be present. +`DEFAULT_PG_VERSION`: The version of Postgres to use. +This is used to construct the full path to the postgres binaries. +Format is 2-digit major version number, i.e. `DEFAULT_PG_VERSION="14"` `TEST_OUTPUT`: Set the directory where test state and test output files should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
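To make the schema addition above concrete, a hedged sketch of a timeline-create request that passes pg_version; the endpoint path below is an assumption for illustration and is not taken from the spec excerpt, and omitting pg_version lets the pageserver fall back to its hardcoded DEFAULT_PG_VERSION.

import json
import urllib.request

def create_timeline(pageserver_http: str, tenant_id: str, pg_version: int = 14) -> dict:
    # pg_version is a plain integer, matching the `type: integer` schema entry above
    body = {"pg_version": pg_version}
    req = urllib.request.Request(
        f"{pageserver_http}/v1/tenant/{tenant_id}/timeline",  # assumed URL layout
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)  # a 201 response carries TimelineInfo for the new timeline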
From ed6b75e3018922f1110cb451de94e634d860e2ad Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 19 Sep 2022 15:03:11 +0300 Subject: [PATCH 0825/1022] show pg_version in create_timeline info span --- pageserver/src/http/routes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 72cbb0e819..55429420a8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -191,7 +191,7 @@ async fn timeline_create_handler(mut request: Request) -> Result Err(ApiError::InternalServerError(err)), } } - .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn)) + .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) .await?; Ok(match new_timeline_info { From 3618c242b9ffbf678f7e68472a5d256ad51cc538 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 19 Sep 2022 15:14:01 +0300 Subject: [PATCH 0826/1022] use version specific find_end_of_wal function --- safekeeper/src/wal_storage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 44dc313ef6..9e198fc148 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -14,7 +14,7 @@ use std::pin::Pin; use tokio::io::AsyncRead; use postgres_ffi::v14::xlog_utils::{ - find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, + IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, }; use postgres_ffi::{XLogSegNo, PG_TLI}; use std::cmp::{max, min}; From d8d3cd49f4ad753443364574a31d84fc56557b46 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 15:31:05 +0300 Subject: [PATCH 0827/1022] Update libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs Co-authored-by: MMeent --- libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 9b9f76de7c..9563298cd8 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -37,7 +37,7 @@ fn main() -> Result<()> { Arg::new("pg-distrib-dir") .long("pg-distrib-dir") .takes_value(true) - .help("Directory with Postgres distribution (bin and lib directories, e.g. pg_install)") + .help("Directory with Postgres distributions (bin and lib directories, e.g. 
pg_install containing subpath `v14/bin/postgresql`)") .default_value("/usr/local") ) .arg( From eba419fda360bdc4a2025474b2afcd92d0ff369b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 18:15:34 +0300 Subject: [PATCH 0828/1022] Clean up the pg_version choice code --- libs/postgres_ffi/src/lib.rs | 13 ++++++------- pageserver/src/walingest.rs | 2 +- pageserver/src/walrecord.rs | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 1a6620a180..95ecc7b061 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -79,14 +79,13 @@ pub use v14::xlog_utils::XLogFileName; pub use v14::bindings::DBState_DB_SHUTDOWNED; -pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool { - if version == 14 { - bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0 - } else { - assert_eq!(version, 15); - bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0 +pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result { + match version { + 14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0), + 15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0 || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0 - || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0 + || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0), + _ => anyhow::bail!("Unknown version {}", version), } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 1d5cab38b9..d3d2c6d9b2 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -324,7 +324,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version) + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? 
{ // Extract page image from FPI record let img_len = blk.bimg_len as usize; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 258e1a445f..38fb9a4247 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -527,7 +527,7 @@ pub fn decode_wal_record( record: Bytes, decoded: &mut DecodedWALRecord, pg_version: u32, -) -> Result<(), DeserializeError> { +) -> Result<()> { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; @@ -628,7 +628,7 @@ pub fn decode_wal_record( }; let blk_img_is_compressed = - postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version); + postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?; if blk_img_is_compressed { debug!("compressed block image , pg_version = {}", pg_version); From d098542ddeb1b01b5e05e299f4979ad1677f127a Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 18:46:20 +0300 Subject: [PATCH 0829/1022] Make test_timeline_size_metrics more stable: Compare size with Vanilla postgres size instead of hardcoded value --- test_runner/regress/test_timeline_size.py | 38 +++++++++++++++++++---- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 979d1a107f..3a482be5db 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -3,6 +3,7 @@ import random import re import time from contextlib import closing +from pathlib import Path import psycopg2.errors import psycopg2.extras @@ -11,7 +12,10 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient, + PgBin, + PortDistributor, Postgres, + VanillaPostgres, assert_timeline_local, wait_for_last_flush_lsn, ) @@ -327,7 +331,12 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): # The timeline logical and physical sizes are also exposed as prometheus metrics. # Test the metrics. -def test_timeline_size_metrics(neon_simple_env: NeonEnv): +def test_timeline_size_metrics( + neon_simple_env: NeonEnv, + test_output_dir: Path, + port_distributor: PortDistributor, + pg_version: str, +): env = neon_simple_env pageserver_http = env.pageserver.http_client() @@ -369,11 +378,28 @@ def test_timeline_size_metrics(neon_simple_env: NeonEnv): assert matches tl_logical_size_metric = int(matches.group(1)) - # An empty database is around 8 MB. There at least 3 databases, 'postgres', - # 'template0', 'template1'. So the total size should be about 32 MB. This isn't - # very accurate and can change with different PostgreSQL versions, so allow a - # couple of MB of slack. - assert math.isclose(tl_logical_size_metric, 32 * 1024 * 1024, abs_tol=2 * 1024 * 1024) + pgdatadir = test_output_dir / "pgdata-vanilla" + pg_bin = PgBin(test_output_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: + vanilla_pg.configure([f"port={port}"]) + vanilla_pg.start() + + # Create database based on template0 because we can't connect to template0 + vanilla_pg.safe_psql("CREATE TABLE foo (t text)") + vanilla_pg.safe_psql( + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""" + ) + vanilla_size_sum = vanilla_pg.safe_psql( + "select sum(pg_database_size(oid)) from pg_database" + )[0][0] + + # Compare the size with Vanilla postgres. 
+ # Allow some slack, because the logical size metric includes some things like + # the SLRUs that are not included in pg_database_size(). + assert math.isclose(tl_logical_size_metric, vanilla_size_sum, abs_tol=2 * 1024 * 1024) # The sum of the sizes of all databases, as seen by pg_database_size(), should also # be close. Again allow some slack, the logical size metric includes some things like From 1fa7d6aebf4df5e55f4f4c98e9cdba507a7d2345 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 18:48:58 +0300 Subject: [PATCH 0830/1022] Use DEFAULT_PG_VERSION env in CI pytest --- .github/actions/run-python-test-set/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index fc3b1c9c37..bed0bc69dc 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -86,6 +86,7 @@ runs: # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} + export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 From 64f64d563777cf311624f7bbc23e06ab9a9b7b3d Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 10:02:43 +0300 Subject: [PATCH 0831/1022] Fix after rebase: bump vendor/postgres-v14 to match main --- vendor/postgres-v14 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 796770565f..19d948fd47 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 796770565ff668b585e80733b8d679961ad50e93 +Subproject commit 19d948fd47f45d83367062d9a54709cf2d9c8921 From 2d012f0d324a0c764e3956171c981fa2e0455464 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 12:37:13 +0300 Subject: [PATCH 0832/1022] Fix rebase conflicts in pageserver code --- pageserver/src/tenant.rs | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5860e13534..ed41641277 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -186,8 +186,15 @@ impl Tenant { bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.") } - let new_metadata = - TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn, pg_version,); + let new_metadata = TimelineMetadata::new( + Lsn(0), + None, + None, + Lsn(0), + initdb_lsn, + initdb_lsn, + pg_version, + ); let new_timeline = self.create_initialized_timeline(new_timeline_id, new_metadata, &mut timelines)?; new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); @@ -207,6 +214,7 @@ impl Tenant { new_timeline_id: Option, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, + pg_version: u32, ) -> Result>> { let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); @@ -249,7 +257,7 @@ impl Tenant { self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? } - None => self.bootstrap_timeline(new_timeline_id)?, + None => self.bootstrap_timeline(new_timeline_id, pg_version)?, }; // Have added new timeline into the tenant, now its background tasks are needed. 
@@ -1001,7 +1009,11 @@ impl Tenant { /// - run initdb to init temporary instance and get bootstrap data /// - after initialization complete, remove the temp dir. - fn bootstrap_timeline(&self, timeline_id: TimelineId) -> Result> { + fn bootstrap_timeline( + &self, + timeline_id: TimelineId, + pg_version: u32, + ) -> Result> { // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` // temporary directory for basebackup files for the given timeline. let initdb_path = path_with_suffix_extension( @@ -1012,7 +1024,7 @@ impl Tenant { ); // Init temporarily repo to get bootstrap data - run_initdb(self.conf, &initdb_path)?; + run_initdb(self.conf, &initdb_path, pg_version)?; let pgdata_path = initdb_path; let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); @@ -1021,7 +1033,7 @@ impl Tenant { // LSN, and any WAL after that. // Initdb lsn will be equal to last_record_lsn which will be set after import. // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. - let timeline = self.create_empty_timeline(timeline_id, lsn)?; + let timeline = self.create_empty_timeline(timeline_id, lsn, pg_version)?; import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; fail::fail_point!("before-checkpoint-new-timeline", |_| { @@ -1094,10 +1106,10 @@ impl Tenant { /// Create the cluster temporarily in 'initdbpath' directory inside the repository /// to get bootstrap data for timeline initialization. -fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { +fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path, pg_version: u32) -> Result<()> { info!("running initdb in {}... ", initdbpath.display()); - let initdb_path = conf.pg_bin_dir().join("initdb"); + let initdb_path = conf.pg_bin_dir(pg_version).join("initdb"); let initdb_output = Command::new(initdb_path) .args(&["-D", &initdbpath.to_string_lossy()]) .args(&["-U", &conf.superuser]) @@ -1107,8 +1119,8 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { // so no need to fsync it .arg("--no-sync") .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) .stdout(Stdio::null()) .output() .context("failed to execute initdb")?; From 5e151192f5b4bc1df4162914426fd026193fae0c Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 12:43:11 +0300 Subject: [PATCH 0833/1022] Fix rebase conflicts in safekeeper code --- safekeeper/src/timeline.rs | 8 +++++++- safekeeper/src/wal_storage.rs | 21 +++++++++++++++------ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index ec29e13931..c16fc9f40c 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -24,12 +24,12 @@ use utils::{ pq_proto::ReplicationFeedback, }; -use crate::control_file; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, SafekeeperMemState, ServerInfo, }; use crate::send_wal::HotStandbyFeedback; +use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; use crate::wal_storage; @@ -103,6 +103,10 @@ impl SharedState { bail!(TimelineError::UninitializedWalSegSize(*ttid)); } + if state.server.pg_version == UNKNOWN_SERVER_VERSION { + 
bail!(TimelineError::UninitialinzedPgVersion(*ttid)); + } + // We don't want to write anything to disk, because we may have existing timeline there. // These functions should not change anything on disk. let control_store = control_file::FileStorage::create_new(ttid, conf, state)?; @@ -270,6 +274,8 @@ pub enum TimelineError { AlreadyExists(TenantTimelineId), #[error("Timeline {0} is not initialized, wal_seg_size is zero")] UninitializedWalSegSize(TenantTimelineId), + #[error("Timeline {0} is not initialized, pg_version is unknown")] + UninitialinzedPgVersion(TenantTimelineId), } /// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline. diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 9e198fc148..95ad71bbbd 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -13,9 +13,7 @@ use std::io::{self, Seek, SeekFrom}; use std::pin::Pin; use tokio::io::AsyncRead; -use postgres_ffi::v14::xlog_utils::{ - IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, -}; +use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; use postgres_ffi::{XLogSegNo, PG_TLI}; use std::cmp::{max, min}; @@ -29,7 +27,6 @@ use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::metrics::{time_io_closure, WalStorageMetrics}; use crate::safekeeper::SafeKeeperState; -use crate::safekeeper::UNKNOWN_SERVER_VERSION; use crate::wal_backup::read_object; use crate::SafeKeeperConf; @@ -117,7 +114,19 @@ impl PhysicalStorage { let write_lsn = if state.commit_lsn == Lsn(0) { Lsn(0) } else { - find_end_of_wal(&timeline_dir, wal_seg_size, state.commit_lsn)? + match state.server.pg_version / 10000 { + 14 => postgres_ffi::v14::xlog_utils::find_end_of_wal( + &timeline_dir, + wal_seg_size, + state.commit_lsn, + )?, + 15 => postgres_ffi::v15::xlog_utils::find_end_of_wal( + &timeline_dir, + wal_seg_size, + state.commit_lsn, + )?, + _ => bail!("unsupported postgres version"), + } }; // TODO: do we really know that write_lsn is fully flushed to disk? 
@@ -140,7 +149,7 @@ impl PhysicalStorage { write_lsn, write_record_lsn: write_lsn, flush_record_lsn: flush_lsn, - decoder: WalStreamDecoder::new(write_lsn, UNKNOWN_SERVER_VERSION), + decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000), file: None, }) } From 262fa3be0911a5e8ed7c310012cb064e5e39f470 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 22 Sep 2022 17:07:08 +0300 Subject: [PATCH 0834/1022] pageserver pg proto: add missing auth checks (#2494) Fixes #1858 --- pageserver/src/page_service.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 368b4c8bee..758faa4d9a 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1023,6 +1023,9 @@ impl postgres_backend_async::Handler for PageServerHandler { let params = params_raw.split(' ').collect::>(); ensure!(params.len() == 1, "invalid param number for config command"); let tenant_id = TenantId::from_str(params[0])?; + + self.check_permission(Some(tenant_id))?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), @@ -1067,14 +1070,14 @@ impl postgres_backend_async::Handler for PageServerHandler { let caps = re .captures(query_string) .with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?; - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = get_local_timeline(tenant_id, timeline_id)?; - let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; let timestamp_pg = to_pg_timestamp(timestamp); + self.check_permission(Some(tenant_id))?; + + let timeline = get_local_timeline(tenant_id, timeline_id)?; pgb.write_message(&BeMessage::RowDescription(&[RowDescriptor::text_col( b"lsn", )]))?; From 7138db927947515aae31cca0132a16e9d98469d4 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 15:48:35 +0300 Subject: [PATCH 0835/1022] Fix paths to postgres binaries in the deploy script --- .github/ansible/get_binaries.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index f44a1ca50a..f96cff247f 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -24,7 +24,8 @@ tar -xzf postgres_install.tar.gz -C neon_install docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ -docker cp ${ID}:/usr/local/bin/postgres neon_install/bin/ +docker cp ${ID}:/usr/local/v14/bin/postgres neon_install/bin/v14 +docker cp ${ID}:/usr/local/v15/bin/postgres neon_install/bin/v15 docker rm -vf ${ID} # store version to file (for ansible playbooks) and create binaries tarball From 8b42c184e77f1284902e60fe29c353b7d8322eb1 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 16:06:32 +0300 Subject: [PATCH 0836/1022] Update LD_LIBRARY_PATH in deploy scripts --- .github/ansible/deploy.yaml | 4 ++-- .github/ansible/systemd/pageserver.service | 2 +- .github/ansible/systemd/safekeeper.service | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index 6982445558..7409051574 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ 
-58,7 +58,7 @@ creates: "/storage/pageserver/data/tenants" environment: NEON_REPO_DIR: "/storage/pageserver/data" - LD_LIBRARY_PATH: "/usr/local/lib" + LD_LIBRARY_PATH: "/usr/local/v14/lib" become: true tags: - pageserver @@ -132,7 +132,7 @@ creates: "/storage/safekeeper/data/safekeeper.id" environment: NEON_REPO_DIR: "/storage/safekeeper/data" - LD_LIBRARY_PATH: "/usr/local/lib" + LD_LIBRARY_PATH: "/usr/local/v14/lib" become: true tags: - safekeeper diff --git a/.github/ansible/systemd/pageserver.service b/.github/ansible/systemd/pageserver.service index bb78054fa3..688c7e7b87 100644 --- a/.github/ansible/systemd/pageserver.service +++ b/.github/ansible/systemd/pageserver.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=pageserver -Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed diff --git a/.github/ansible/systemd/safekeeper.service b/.github/ansible/systemd/safekeeper.service index d5c6d00017..36af414761 100644 --- a/.github/ansible/systemd/safekeeper.service +++ b/.github/ansible/systemd/safekeeper.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=safekeeper -Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed From 7c1695e87d91f3ebac6c64ca699304c15568559d Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 16:11:46 +0300 Subject: [PATCH 0837/1022] fix psql path in export_import_between_pageservers script --- scripts/export_import_between_pageservers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 1285d0476b..6f6c3864dd 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -710,8 +710,8 @@ if __name__ == "__main__": "--psql-path", dest="psql_path", required=False, - default="/usr/local/bin/psql", - help="Path to the psql binary. Default: /usr/local/bin/psql", + default="/usr/local/v14/bin/psql", + help="Path to the psql binary. 
Default: /usr/local/v14/bin/psql", ) parser.add_argument( "--only-import", From eb9200abc82ba9634b9fdf229415df7dffb7a38b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 17:11:52 +0300 Subject: [PATCH 0838/1022] Use version-specific path in pytest CI script --- .github/actions/run-python-test-set/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index bed0bc69dc..f3531004a1 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -127,7 +127,7 @@ runs: # Wake up the cluster if we use remote neon instance if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/v14/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + ${POSTGRES_DISTRIB_DIR}/v{$DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" fi # Run the tests. From c81ede8644ea8cdd71b102235f7cd2fffa2a53d2 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 20:51:31 +0300 Subject: [PATCH 0839/1022] Hotfix for safekeeper timelines with unknown pg_version. Assume DEFAULT_PG_VERSION = 14 --- safekeeper/src/wal_storage.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 95ad71bbbd..eee7c703f9 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -125,7 +125,17 @@ impl PhysicalStorage { wal_seg_size, state.commit_lsn, )?, - _ => bail!("unsupported postgres version"), + pg_majorversion => { + // This is a quik hack to work with old timelines that don't have + // pg_version in the control file. We can remove it after this is fixed properly. + const DEFAULT_PG_MAJOR_VERSION: u32 = 14; + warn!("unknown postgres version {pg_majorversion} assume {DEFAULT_PG_MAJOR_VERSION}"); + postgres_ffi::v14::xlog_utils::find_end_of_wal( + &timeline_dir, + wal_seg_size, + state.commit_lsn, + )? + } } }; From 43560506c070ae1c557c9bdd847ea0497dde1923 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 22 Sep 2022 17:23:02 +0300 Subject: [PATCH 0840/1022] remove duplicate walreceiver connection span --- pageserver/src/walreceiver/walreceiver_connection.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 5ac9a3ef7a..15cfad1dcd 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -16,7 +16,7 @@ use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; -use tracing::{debug, error, info, info_span, trace, warn, Instrument}; +use tracing::{debug, error, info, trace, warn}; use super::TaskEvent; use crate::metrics::LIVE_CONNECTIONS_COUNT; @@ -112,8 +112,7 @@ pub async fn handle_walreceiver_connection( _ = connection_cancellation.changed() => info!("Connection cancelled"), } Ok(()) - } - .instrument(info_span!("walreceiver connection")), + }, ); // Immediately increment the gauge, then create a job to decrement it on task exit. 
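The version arithmetic used in the safekeeper and test patches above follows one convention: the control file and the python test exchange a PG_VERSION_NUM-style number (major * 10000 + minor), the Rust side recovers the major with pg_version / 10000, and the hotfix falls back to 14 when the stored version is unknown. A small sketch with invented helper names, not code from the patches:

DEFAULT_PG_MAJOR_VERSION = 14  # fallback used above for old timelines without a known version
SUPPORTED_MAJORS = (14, 15)

def major_from_version_num(pg_version_num: int) -> int:
    """e.g. 140004 -> 14, 150002 -> 15; mirrors the `pg_version / 10000` division above."""
    major = pg_version_num // 10000
    if major not in SUPPORTED_MAJORS:
        # unknown or zero version: behave like the safekeeper hotfix and assume v14
        return DEFAULT_PG_MAJOR_VERSION
    return major

def version_num_from_major(major: int) -> int:
    """Inverse of the above, as in the python test's `int(env.pg_version) * 10000`."""
    return major * 10000

assert major_from_version_num(version_num_from_major(15)) == 15
assert major_from_version_num(0) == DEFAULT_PG_MAJOR_VERSION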
From b0377f750a798f99e71b640d3a07ae76d480435f Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Fri, 23 Sep 2022 10:25:26 +0200 Subject: [PATCH 0841/1022] Add staging-test region to normal staging rollouts (#2500) --- .github/ansible/staging.hosts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/ansible/staging.hosts b/.github/ansible/staging.hosts index c470f8a814..f5accc188a 100644 --- a/.github/ansible/staging.hosts +++ b/.github/ansible/staging.hosts @@ -3,11 +3,15 @@ zenith-us-stage-ps-2 console_region_id=27 zenith-us-stage-ps-3 console_region_id=27 zenith-us-stage-ps-4 console_region_id=27 +zenith-us-stage-test-ps-1 console_region_id=28 [safekeepers] zenith-us-stage-sk-4 console_region_id=27 zenith-us-stage-sk-5 console_region_id=27 zenith-us-stage-sk-6 console_region_id=27 +zenith-us-stage-test-sk-1 console_region_id=28 +zenith-us-stage-test-sk-2 console_region_id=28 +zenith-us-stage-test-sk-3 console_region_id=28 [storage:children] pageservers From 52819898e4c65bcc79206d4ff20af9f1f5f08396 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Fri, 23 Sep 2022 11:25:29 +0200 Subject: [PATCH 0842/1022] Extend image push step with production ECR (#2465) * Extend image push step with production ECR * Put copy step before auth change * Use correct name * Only push on main * Fix typo --- .github/workflows/build_and_test.yml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 44db968753..5f84e20452 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -588,7 +588,16 @@ jobs: - name: Pull rust image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust - - name: Configure docker login + - name: Push images to production ECR + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + run: | + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest + + - name: Configure Docker Hub login run: | # ECR Credential Helper & Docker Hub don't work together in config, hence reset echo "" > /github/home/.docker/config.json @@ -609,7 +618,7 @@ jobs: - name: Push rust image to Docker Hub run: crane push rust neondatabase/rust:pinned - - name: Add latest tag to images + - name: Add latest tag to images in Docker Hub if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' From eb0c6bcf1a1b4eed35ba2bb439b5e30905e753f9 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 22 Sep 2022 17:31:16 +0300 Subject: [PATCH 0843/1022] reenable storage deployments --- .github/ansible/deploy.yaml | 42 ++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index 7409051574..e206f9d5ba 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -63,18 +63,18 @@ tags: - pageserver - # - name: update remote storage (s3) config - # lineinfile: - # path: 
/storage/pageserver/data/pageserver.toml - # line: "{{ item }}" - # loop: - # - "[remote_storage]" - # - "bucket_name = '{{ bucket_name }}'" - # - "bucket_region = '{{ bucket_region }}'" - # - "prefix_in_bucket = '{{ inventory_hostname }}'" - # become: true - # tags: - # - pageserver + - name: update remote storage (s3) config + lineinfile: + path: /storage/pageserver/data/pageserver.toml + line: "{{ item }}" + loop: + - "[remote_storage]" + - "bucket_name = '{{ bucket_name }}'" + - "bucket_region = '{{ bucket_region }}'" + - "prefix_in_bucket = '{{ inventory_hostname }}'" + become: true + tags: + - pageserver - name: upload systemd service definition ansible.builtin.template: @@ -87,15 +87,15 @@ tags: - pageserver - # - name: start systemd service - # ansible.builtin.systemd: - # daemon_reload: yes - # name: pageserver - # enabled: yes - # state: restarted - # become: true - # tags: - # - pageserver + - name: start systemd service + ansible.builtin.systemd: + daemon_reload: yes + name: pageserver + enabled: yes + state: restarted + become: true + tags: + - pageserver - name: post version to console when: console_mgmt_base_url is defined From 3e65209a067d7243162d9bd84841425e088a0d9b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 23 Sep 2022 12:50:36 +0100 Subject: [PATCH 0844/1022] Nightly Benchmarks: use Postgres binaries from artifacts (#2501) --- .github/actions/download/action.yml | 9 ++++-- .../actions/run-python-test-set/action.yml | 2 +- .github/actions/upload/action.yml | 9 ++++-- .github/workflows/benchmarking.yml | 21 ++++++++++---- .github/workflows/build_and_test.yml | 29 +++++++++++++++++-- 5 files changed, 54 insertions(+), 16 deletions(-) diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index 5aa45164e7..731ef6639d 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -12,6 +12,9 @@ inputs: description: "Allow to skip if file doesn't exist, fail otherwise" default: false required: false + prefix: + description: "S3 prefix. 
Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" + required: false runs: using: "composite" @@ -23,18 +26,18 @@ runs: TARGET: ${{ inputs.path }} ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }} + PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }} run: | BUCKET=neon-github-public-dev - PREFIX=artifacts/${GITHUB_RUN_ID} FILENAME=$(basename $ARCHIVE) - S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) + S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) if [ -z "${S3_KEY}" ]; then if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then echo '::set-output name=SKIPPED::true' exit 0 else - echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} nor its version from previous attempts exist" + echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist" exit 1 fi fi diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index f3531004a1..cc6ab65b76 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -127,7 +127,7 @@ runs: # Wake up the cluster if we use remote neon instance if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/v{$DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" fi # Run the tests. diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index de8df3230f..291a2cf3b0 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -7,6 +7,9 @@ inputs: path: description: "A directory or file to upload" required: true + prefix: + description: "S3 prefix. 
Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" + required: false runs: using: "composite" @@ -42,14 +45,14 @@ runs: env: SOURCE: ${{ inputs.path }} ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst + PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }} run: | BUCKET=neon-github-public-dev - PREFIX=artifacts/${GITHUB_RUN_ID} FILENAME=$(basename $ARCHIVE) FILESIZE=$(du -sh ${ARCHIVE} | cut -f1) - time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} + time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${FILENAME} # Ref https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#adding-a-job-summary - echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY} + echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 4e28223c18..4d91e9fa74 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -46,7 +46,8 @@ jobs: runs-on: [self-hosted, zenith-benchmarker] env: - POSTGRES_DISTRIB_DIR: "/usr/pgsql-14" + POSTGRES_DISTRIB_DIR: /tmp/pg_install + DEFAULT_PG_VERSION: 14 steps: - name: Checkout zenith repo @@ -71,7 +72,7 @@ jobs: echo Poetry poetry --version echo Pgbench - $POSTGRES_DISTRIB_DIR/bin/pgbench --version + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - name: Create Neon Project id: create-neon-project @@ -140,7 +141,8 @@ jobs: env: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: "10gb" - POSTGRES_DISTRIB_DIR: /usr + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} @@ -163,10 +165,17 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Install Deps + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Add Postgres binaries to PATH run: | - sudo apt -y update - sudo apt install -y postgresql-14 + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version + echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - name: Create Neon Project if: matrix.platform != 'neon-captest-reuse' diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5f84e20452..8a7cdec89c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -268,6 +268,32 @@ jobs: if: matrix.build_type == 'debug' uses: ./.github/actions/save-coverage-data + upload-latest-artifacts: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ regress-tests ] + if: github.ref_name == 'main' + steps: + - name: Copy Neon artifact to the latest directory + shell: bash -euxo pipefail {0} + env: + BUCKET: neon-github-public-dev + PREFIX: artifacts/${{ github.run_id }} + run: | + for build_type in debug release; do + FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst + + S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) + if [ 
-z "${S3_KEY}" ]; then + echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist" + exit 1 + fi + + time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME} + done + benchmarks: runs-on: dev container: @@ -335,9 +361,6 @@ jobs: curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json ./scripts/pysync - # Workaround for https://github.com/neondatabase/cloud/issues/2188 - psql "$TEST_RESULT_CONNSTR" -c "SELECT 1;" || sleep 10 - DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json coverage-report: From bc3ba23e0a485e3fc5434ea093062bc4347915f1 Mon Sep 17 00:00:00 2001 From: MMeent Date: Fri, 23 Sep 2022 14:35:36 +0200 Subject: [PATCH 0845/1022] Fix extreme metrics bloat in storage sync (#2506) * Fix extreme metrics bloat in storage sync From 78 metrics per (timeline, tenant) pair down to (max) 10 metrics per (timeline, tenant) pair, plus another 117 metrics in a global histogram that replaces the previous per-timeline histogram. * Drop image sync operation metric series when dropping TimelineMetrics. --- pageserver/src/metrics.rs | 45 ++++++++++++++++++++++----- pageserver/src/storage_sync.rs | 56 +++++++++++++++++++--------------- 2 files changed, 69 insertions(+), 32 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 2f03943429..5c2f81d731 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,8 +1,9 @@ use metrics::core::{AtomicU64, GenericCounter}; use metrics::{ - register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec, - register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec, - IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, + GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, + UIntGaugeVec, }; use once_cell::sync::Lazy; use utils::id::{TenantId, TimelineId}; @@ -204,12 +205,34 @@ pub static REMAINING_SYNC_ITEMS: Lazy = Lazy::new(|| { .expect("failed to register pageserver remote storage remaining sync items int gauge") }); -pub static IMAGE_SYNC_TIME: Lazy = Lazy::new(|| { +pub static IMAGE_SYNC_TIME: Lazy = Lazy::new(|| { + register_gauge_vec!( + "pageserver_remote_storage_image_sync_duration", + "Time spent to synchronize (up/download) a whole pageserver image", + &["tenant_id", "timeline_id"], + ) + .expect("failed to register per-timeline pageserver image sync time vec") +}); + +pub static IMAGE_SYNC_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"]; +pub static IMAGE_SYNC_STATUS: &[&str] = &["success", "failure", "abort"]; + +pub static IMAGE_SYNC_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_storage_image_sync_count", + "Number of synchronization operations executed for pageserver images. 
\ + Grouped by tenant, timeline, operation_kind and status", + &["tenant_id", "timeline_id", "operation_kind", "status"] + ) + .expect("failed to register pageserver image sync count vec") +}); + +pub static IMAGE_SYNC_TIME_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_remote_storage_image_sync_seconds", "Time took to synchronize (download or upload) a whole pageserver image. \ - Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)", - &["tenant_id", "timeline_id", "operation_kind", "status"], + Grouped by operation_kind and status", + &["operation_kind", "status"], vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0] ) .expect("failed to register pageserver image sync time histogram vec") @@ -256,7 +279,7 @@ macro_rules! redo_histogram_time_buckets { () => { vec![ 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000, - 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, + 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, ] }; } @@ -411,6 +434,14 @@ impl Drop for TimelineMetrics { for op in SMGR_QUERY_TIME_OPERATIONS { let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]); } + + for op in IMAGE_SYNC_OPERATION_KINDS { + for status in IMAGE_SYNC_STATUS { + let _ = IMAGE_SYNC_COUNT.remove_label_values(&[tenant_id, timeline_id, op, status]); + } + } + + let _ = IMAGE_SYNC_TIME.remove_label_values(&[tenant_id, timeline_id]); } } diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 892a34a76f..776d9214d4 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -178,6 +178,7 @@ use crate::{ TenantTimelineValues, }; +use crate::metrics::{IMAGE_SYNC_COUNT, IMAGE_SYNC_TIME_HISTOGRAM}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use self::download::download_index_parts; @@ -835,7 +836,6 @@ async fn process_sync_task_batch( sync_id, upload_data, sync_start, - "upload", ) .await } @@ -879,7 +879,6 @@ async fn process_sync_task_batch( sync_id, download_data, sync_start, - "download", ) .await; } @@ -911,7 +910,6 @@ async fn process_sync_task_batch( sync_id, delete_data, sync_start, - "delete", ) .instrument(info_span!("delete_timeline_data")) .await; @@ -948,8 +946,9 @@ async fn download_timeline_data( sync_id: TenantTimelineId, new_download_data: SyncData, sync_start: Instant, - task_name: &str, ) -> DownloadStatus { + static TASK_NAME: &str = "download"; + match download_timeline_layers( conf, storage, @@ -961,19 +960,19 @@ async fn download_timeline_data( .await { DownloadedTimeline::Abort => { - register_sync_status(sync_id, sync_start, task_name, None); + register_sync_status(sync_id, sync_start, TASK_NAME, None); if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) { error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}"); } } DownloadedTimeline::FailedAndRescheduled => { - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); } DownloadedTimeline::Successful(mut download_data) => { match update_local_metadata(conf, sync_id, current_remote_timeline).await { Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) { Ok(()) => { - register_sync_status(sync_id, sync_start, task_name, Some(true)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(true)); return 
DownloadStatus::Downloaded; } Err(e) => { @@ -984,7 +983,7 @@ async fn download_timeline_data( error!("Failed to update local timeline metadata: {e:?}"); download_data.retries += 1; sync_queue.push(sync_id, SyncTask::Download(download_data)); - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); } } } @@ -1060,8 +1059,9 @@ async fn delete_timeline_data( sync_id: TenantTimelineId, mut new_delete_data: SyncData, sync_start: Instant, - task_name: &str, ) { + static TASK_NAME: &str = "delete"; + let timeline_delete = &mut new_delete_data.data; if !timeline_delete.deletion_registered { @@ -1077,14 +1077,14 @@ async fn delete_timeline_data( error!("Failed to update remote timeline {sync_id}: {e:?}"); new_delete_data.retries += 1; sync_queue.push(sync_id, SyncTask::Delete(new_delete_data)); - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); return; } } timeline_delete.deletion_registered = true; let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await; - register_sync_status(sync_id, sync_start, task_name, Some(sync_status)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(sync_status)); } async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result { @@ -1103,8 +1103,8 @@ async fn upload_timeline_data( sync_id: TenantTimelineId, new_upload_data: SyncData, sync_start: Instant, - task_name: &str, ) -> UploadStatus { + static TASK_NAME: &str = "upload"; let mut uploaded_data = match upload_timeline_layers( storage, sync_queue, @@ -1115,7 +1115,7 @@ async fn upload_timeline_data( .await { UploadedTimeline::FailedAndRescheduled(e) => { - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); return UploadStatus::Failed(e); } UploadedTimeline::Successful(upload_data) => upload_data, @@ -1134,14 +1134,14 @@ async fn upload_timeline_data( .await { Ok(()) => { - register_sync_status(sync_id, sync_start, task_name, Some(true)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(true)); UploadStatus::Uploaded } Err(e) => { error!("Failed to update remote timeline {sync_id}: {e:?}"); uploaded_data.retries += 1; sync_queue.push(sync_id, SyncTask::Upload(uploaded_data)); - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); UploadStatus::Failed(e) } } @@ -1391,16 +1391,22 @@ fn register_sync_status( let tenant_id = sync_id.tenant_id.to_string(); let timeline_id = sync_id.timeline_id.to_string(); - match sync_status { - Some(true) => { - IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "success"]) - } - Some(false) => { - IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "failure"]) - } - None => return, - } - .observe(secs_elapsed) + + let sync_status = match sync_status { + Some(true) => "success", + Some(false) => "failure", + None => "abort", + }; + + IMAGE_SYNC_TIME_HISTOGRAM + .with_label_values(&[sync_name, sync_status]) + .observe(secs_elapsed); + IMAGE_SYNC_TIME + .with_label_values(&[&tenant_id, &timeline_id]) + .add(secs_elapsed); + IMAGE_SYNC_COUNT + .with_label_values(&[&tenant_id, &timeline_id, sync_name, sync_status]) + .inc(); } #[cfg(test)] From ebab89ebd22fa77ff0cf6821ff22716642fe8a03 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 23 Sep 
2022 13:51:33 +0100 Subject: [PATCH 0846/1022] test_runner: pass password to pgbench via PGPASSWORD (#2468) --- test_runner/fixtures/log_helper.py | 13 -------- test_runner/fixtures/neon_fixtures.py | 5 +++ test_runner/performance/test_perf_pgbench.py | 34 +++++++++++++------- 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/test_runner/fixtures/log_helper.py b/test_runner/fixtures/log_helper.py index 7d112fce89..17f2402391 100644 --- a/test_runner/fixtures/log_helper.py +++ b/test_runner/fixtures/log_helper.py @@ -1,6 +1,5 @@ import logging import logging.config -import re """ This file configures logging to use in python tests. @@ -30,17 +29,6 @@ LOGGING = { } -class PasswordFilter(logging.Filter): - """Filter out password from logs.""" - - # Good enough to filter our passwords produced by PgProtocol.connstr - FILTER = re.compile(r"(\s*)password=[^\s]+(\s*)") - - def filter(self, record: logging.LogRecord) -> bool: - record.msg = self.FILTER.sub(r"\1password=\2", str(record.msg)) - return True - - def getLogger(name="root") -> logging.Logger: """Method to get logger for tests. @@ -50,6 +38,5 @@ def getLogger(name="root") -> logging.Logger: # default logger for tests log = getLogger() -log.addFilter(PasswordFilter()) logging.config.dictConfig(LOGGING) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3c60437426..aa9fd68df5 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -283,10 +283,15 @@ class PgProtocol: return str(make_dsn(**self.conn_options(**kwargs))) def conn_options(self, **kwargs): + """ + Construct a dictionary of connection options from default values and extra parameters. + An option can be dropped from the returning dictionary by None-valued extra parameter. + """ result = self.default_options.copy() if "dsn" in kwargs: result.update(parse_dsn(kwargs["dsn"])) result.update(kwargs) + result = {k: v for k, v in result.items() if v is not None} # Individual statement timeout in seconds. 
2 minutes should be # enough for our tests, but if you need a longer, you can diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index e167ddaafa..656826d6a3 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -4,7 +4,7 @@ import os import timeit from datetime import datetime from pathlib import Path -from typing import List +from typing import Dict, List import pytest from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult @@ -24,14 +24,18 @@ def utc_now_timestamp() -> int: return calendar.timegm(datetime.utcnow().utctimetuple()) -def init_pgbench(env: PgCompare, cmdline): +def init_pgbench(env: PgCompare, cmdline, password: None): + environ: Dict[str, str] = {} + if password is not None: + environ["PGPASSWORD"] = password + # calculate timestamps and durations separately # timestamp is intended to be used for linking to grafana and logs # duration is actually a metric and uses float instead of int for timestamp start_timestamp = utc_now_timestamp() t0 = timeit.default_timer() with env.record_pageserver_writes("init.pageserver_writes"): - out = env.pg_bin.run_capture(cmdline) + out = env.pg_bin.run_capture(cmdline, env=environ) env.flush() duration = timeit.default_timer() - t0 @@ -48,13 +52,15 @@ def init_pgbench(env: PgCompare, cmdline): env.zenbenchmark.record_pg_bench_init_result("init", res) -def run_pgbench(env: PgCompare, prefix: str, cmdline): +def run_pgbench(env: PgCompare, prefix: str, cmdline, password: None): + environ: Dict[str, str] = {} + if password is not None: + environ["PGPASSWORD"] = password + with env.record_pageserver_writes(f"{prefix}.pageserver_writes"): run_start_timestamp = utc_now_timestamp() t0 = timeit.default_timer() - out = env.pg_bin.run_capture( - cmdline, - ) + out = env.pg_bin.run_capture(cmdline, env=environ) run_duration = timeit.default_timer() - t0 run_end_timestamp = utc_now_timestamp() env.flush() @@ -82,10 +88,14 @@ def run_pgbench(env: PgCompare, prefix: str, cmdline): def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: PgBenchLoadType): env.zenbenchmark.record("scale", scale, "", MetricReport.TEST_PARAM) + password = env.pg.default_options.get("password", None) + options = "-cstatement_timeout=1h " + env.pg.default_options.get("options", "") + # drop password from the connection string by passing password=None and set password separately + connstr = env.pg.connstr(password=None, options=options) + if workload_type == PgBenchLoadType.INIT: # Run initialize - options = "-cstatement_timeout=1h " + env.pg.default_options.get("options", "") - init_pgbench(env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options=options)]) + init_pgbench(env, ["pgbench", f"-s{scale}", "-i", connstr], password=password) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: # Run simple-update workload @@ -99,8 +109,9 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P f"-T{duration}", "-P2", "--progress-timestamp", - env.pg.connstr(), + connstr, ], + password=password, ) if workload_type == PgBenchLoadType.SELECT_ONLY: @@ -115,8 +126,9 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P f"-T{duration}", "-P2", "--progress-timestamp", - env.pg.connstr(), + connstr, ], + password=password, ) env.report_size() From 1dffba9de6a0e30e1cb63c9462c88c2f6587d2f0 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 23 Sep 2022 18:30:44 +0300 
Subject: [PATCH 0847/1022] Write more tests for the proxy... (#1918) And change a few more things in the process. --- proxy/src/auth/backend/console.rs | 12 ++++++++ proxy/src/auth/credentials.rs | 11 +++----- proxy/src/cancellation.rs | 46 +++++++++++++++++++++++++++++++ proxy/src/parse.rs | 28 ++++++++++++++++++- 4 files changed, 89 insertions(+), 8 deletions(-) diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index e5ee07813c..a351b82c6a 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -259,3 +259,15 @@ fn parse_host_port(input: &str) -> Option<(&str, u16)> { let (host, port) = input.split_once(':')?; Some((host, port.parse().ok()?)) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_host_port() { + let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); + assert_eq!(host, "127.0.0.1"); + assert_eq!(port, 5432); + } +} diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index ea71eba010..e43bcf8791 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -54,13 +54,10 @@ impl<'a> ClientCredentials<'a> { let dbname = get_param("database")?; // Project name might be passed via PG's command-line options. - let project_a = params.options_raw().and_then(|options| { - for opt in options { - if let Some(value) = opt.strip_prefix("project=") { - return Some(Cow::Borrowed(value)); - } - } - None + let project_a = params.options_raw().and_then(|mut options| { + options + .find_map(|opt| opt.strip_prefix("project=")) + .map(Cow::Borrowed) }); // Alternative project name is in fact a subdomain from SNI. diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index b7412b6f5b..92f8e35dab 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -52,6 +52,16 @@ impl CancelMap { let session = Session::new(key, self); f(session).await } + + #[cfg(test)] + fn contains(&self, session: &Session) -> bool { + self.0.lock().contains_key(&session.key) + } + + #[cfg(test)] + fn is_empty(&self) -> bool { + self.0.lock().is_empty() + } } /// This should've been a [`std::future::Future`], but @@ -104,3 +114,39 @@ impl<'a> Session<'a> { self.key } } + +#[cfg(test)] +mod tests { + use super::*; + use once_cell::sync::Lazy; + + #[tokio::test] + async fn check_session_drop() -> anyhow::Result<()> { + static CANCEL_MAP: Lazy = Lazy::new(Default::default); + + let (tx, rx) = tokio::sync::oneshot::channel(); + let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move { + assert!(CANCEL_MAP.contains(&session)); + + tx.send(()).expect("failed to send"); + let () = futures::future::pending().await; // sleep forever + + Ok(()) + })); + + // Wait until the task has been spawned. + let () = rx.await.context("failed to hear from the task")?; + + // Drop the session's entry by cancelling the task. + task.abort(); + let error = task.await.expect_err("task should have failed"); + if !error.is_cancelled() { + anyhow::bail!(error); + } + + // Check that the session has been dropped. + assert!(CANCEL_MAP.is_empty()); + + Ok(()) + } +} diff --git a/proxy/src/parse.rs b/proxy/src/parse.rs index 8a05ff9c82..cbd48d91e9 100644 --- a/proxy/src/parse.rs +++ b/proxy/src/parse.rs @@ -1,6 +1,5 @@ //! Small parsing helpers. 
-use std::convert::TryInto; use std::ffi::CStr; pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { @@ -10,9 +9,36 @@ pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { Some((unsafe { CStr::from_bytes_with_nul_unchecked(cstr) }, other)) } +/// See . pub fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { (bytes.len() >= N).then(|| { let (head, tail) = bytes.split_at(N); (head.try_into().unwrap(), tail) }) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_split_cstr() { + assert!(split_cstr(b"").is_none()); + assert!(split_cstr(b"foo").is_none()); + + let (cstr, rest) = split_cstr(b"\0").expect("uh-oh"); + assert_eq!(cstr.to_bytes(), b""); + assert_eq!(rest, b""); + + let (cstr, rest) = split_cstr(b"foo\0bar").expect("uh-oh"); + assert_eq!(cstr.to_bytes(), b"foo"); + assert_eq!(rest, b"bar"); + } + + #[test] + fn test_split_at_const() { + assert!(split_at_const::<0>(b"").is_some()); + assert!(split_at_const::<1>(b"").is_none()); + assert!(matches!(split_at_const::<1>(b"ok"), Some((b"o", b"k")))); + } +} From 5ccd54c699a3953486ce200c6f8ad3a9e39b8eb0 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Fri, 23 Sep 2022 13:08:05 +0300 Subject: [PATCH 0848/1022] Add support for h3-pg and re-enable plv8 --- Dockerfile.compute-node-v14 | 50 ++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index 8ddf752191..f3773868d0 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -8,9 +8,12 @@ ARG TAG=pinned # Layer "build-deps" # FROM debian:bullseye-slim AS build-deps +RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ + echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + apt update RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev + libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev # # Layer "pg-build" @@ -37,7 +40,7 @@ RUN cd postgres && \ FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget + apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ tar xvzf postgis-3.3.0.tar.gz && \ @@ -59,15 +62,13 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ # Build plv8 # FROM build-deps AS plv8-build -COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 # https://github.com/plv8/plv8/issues/475 # Debian bullseye provides binutils 2.35 when >= 2.38 is necessary -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ - apt update && \ +RUN apt update && \ apt install -y --no-install-recommends -t testing binutils RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ @@ -79,12 +80,45 @@ RUN wget 
https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +# +# Layer "h3-pg-build" +# Build h3_pg +# +FROM build-deps AS h3-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +# packaged cmake is too old +RUN apt update && \ + apt install -y --no-install-recommends -t testing cmake + +RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ + tar xvzf h3.tgz && \ + cd h3-4.0.1 && \ + mkdir build && \ + cd build && \ + cmake .. -DCMAKE_BUILD_TYPE=Release && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/h3 make install && \ + cp -R /h3/usr / && \ + rm -rf build + +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ + tar xvzf h3-pg.tgz && \ + cd h3-pg-4.0.1 && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control + # # Layer "neon-pg-ext-build" # compile neon extensions # FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /h3/usr / COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -132,8 +166,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ chmod 0750 /var/db/postgres/compute && \ echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig -# TODO: Check if we can make the extension setup more modular versus a linear build -# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc# COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl From 805bb198c287a5a1ac3e28627165313335c69cc9 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Fri, 23 Sep 2022 11:49:28 -0700 Subject: [PATCH 0849/1022] Miscellaneous small fixups (#2503) Changes are: * Correct typo "firts" -> "first" * Change to * Fix weird indentation that rustfmt was failing to handle * Use existing `anyhow::{anyhow,bail}!` as `{anyhow,bail}!` if it's already in scope * Spell `Result` as `anyhow::Result` * In general, closer to matching the rest of the codebase * Change usages of `hash_map::Entry` to `Entry` when it's already in scope * A quick search shows our style on this one varies across the files it's used in --- pageserver/src/tenant.rs | 23 +++++++++++------------ pageserver/src/tenant/timeline.rs | 8 +++++--- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ed41641277..c9ad3bf232 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -17,7 +17,6 @@ use tracing::*; use utils::crashsafe_dir::path_with_suffix_extension; use std::cmp::min; -use std::collections::hash_map; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; @@ -246,12 +245,12 @@ impl Tenant { let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); if ancestor_ancestor_lsn > *lsn { // can we safely just branch from the ancestor instead? 
- anyhow::bail!( - "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", - lsn, - ancestor_timeline_id, - ancestor_ancestor_lsn, - ); + bail!( + "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", + lsn, + ancestor_timeline_id, + ancestor_ancestor_lsn, + ); } } @@ -406,11 +405,11 @@ impl Tenant { .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; match timelines_accessor.entry(timeline.timeline_id) { - hash_map::Entry::Occupied(_) => anyhow::bail!( + Entry::Occupied(_) => bail!( "Found freshly initialized timeline {} in the tenant map", timeline.timeline_id ), - hash_map::Entry::Vacant(v) => { + Entry::Vacant(v) => { v.insert(timeline); } } @@ -768,7 +767,7 @@ impl Tenant { }) .with_context(|| { format!( - "Failed to fsync on firts save for config {}", + "Failed to fsync on first save for config {}", target_config_path.display() ) })?; @@ -1091,11 +1090,11 @@ impl Tenant { })?; match timelines.entry(new_timeline_id) { - hash_map::Entry::Occupied(_) => anyhow::bail!( + Entry::Occupied(_) => bail!( "Found freshly initialized timeline {} in the tenant map", new_timeline_id ), - hash_map::Entry::Vacant(v) => { + Entry::Vacant(v) => { v.insert(Arc::clone(&new_timeline)); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 019de81d64..74e873e632 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -343,7 +343,9 @@ impl Timeline { match cached_lsn.cmp(&lsn) { Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image - Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn + Ordering::Greater => { + unreachable!("the returned lsn should never be after the requested lsn") + } } Some((cached_lsn, cached_img)) } @@ -726,10 +728,10 @@ impl Timeline { Ok(()) } - pub fn layer_removal_guard(&self) -> Result, anyhow::Error> { + pub fn layer_removal_guard(&self) -> anyhow::Result> { self.layer_removal_cs .try_lock() - .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) + .map_err(|e| anyhow!("cannot lock compaction critical section {e}")) } /// Retrieve current logical size of the timeline. 
From 093264a69523c5f8f007b35cf26be4e0b11c1de9 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 23 Sep 2022 19:59:27 +0300 Subject: [PATCH 0850/1022] Fix deploy bin and lib paths for postgres --- .github/ansible/get_binaries.sh | 4 ++-- Dockerfile | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index f96cff247f..dbbd5b454a 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -24,8 +24,8 @@ tar -xzf postgres_install.tar.gz -C neon_install docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ -docker cp ${ID}:/usr/local/v14/bin/postgres neon_install/bin/v14 -docker cp ${ID}:/usr/local/v15/bin/postgres neon_install/bin/v15 +docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/ +docker cp ${ID}:/usr/local/v15/bin/ neon_install/v15/bin/ docker rm -vf ${ID} # store version to file (for ansible playbooks) and create binaries tarball diff --git a/Dockerfile b/Dockerfile index 876a20cc1a..69402919ec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,9 +19,8 @@ COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh ENV BUILD_TYPE release RUN set -e \ && mold -run make -j $(nproc) -s neon-pg-ext \ - && rm -rf pg_install/v14/build \ - && rm -rf pg_install/v15/build \ - && tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz . + && rm -rf pg_install/build \ + && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz . # Build neon binaries FROM $REPOSITORY/$IMAGE:$TAG AS build From 1165686201db64f5c58dbfcb791462f85a513352 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 23 Sep 2022 20:13:58 +0300 Subject: [PATCH 0851/1022] fix deploy lib paths for postgres --- .github/ansible/get_binaries.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index dbbd5b454a..b2f1fb38e6 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -26,6 +26,8 @@ docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/ docker cp ${ID}:/usr/local/v15/bin/ neon_install/v15/bin/ +docker cp ${ID}:/usr/local/v14/lib/ neon_install/v14/lib/ +docker cp ${ID}:/usr/local/v15/lib/ neon_install/v15/lib/ docker rm -vf ${ID} # store version to file (for ansible playbooks) and create binaries tarball From 367cc012903a7dc60d061a17ab61227f97598120 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 26 Sep 2022 10:07:18 +0300 Subject: [PATCH 0852/1022] Fix deploy paths --- .github/ansible/get_binaries.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index b2f1fb38e6..a484bfb0a0 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -21,6 +21,7 @@ docker pull --quiet neondatabase/neon:${DOCKER_TAG} ID=$(docker create neondatabase/neon:${DOCKER_TAG}) docker cp ${ID}:/data/postgres_install.tar.gz . 
tar -xzf postgres_install.tar.gz -C neon_install +mkdir neon_install/bin/ docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ From df45c0d0e57477768097c13c2c3299e634f963b8 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 26 Sep 2022 12:16:52 +0300 Subject: [PATCH 0853/1022] Disable plv8 again --- Dockerfile.compute-node-v14 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index f3773868d0..ed57b29009 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -116,7 +116,8 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 # FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +# plv8 still sometimes crashes during the creation +# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / COPY pgxn/ pgxn/ From d15116f2cc4b26ad36f9cf28c5cf9f9343269cc3 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 23 Sep 2022 14:36:08 +0000 Subject: [PATCH 0854/1022] Update pg_version for old timelines --- safekeeper/src/control_file_upgrade.rs | 12 ++++++++++++ safekeeper/src/safekeeper.rs | 3 +-- safekeeper/src/timeline.rs | 2 ++ safekeeper/src/wal_storage.rs | 16 +++++----------- 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index d8434efb20..1ce9186085 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -248,6 +248,18 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result oldstate.timeline_start_lsn = Lsn(1); oldstate.local_start_lsn = Lsn(1); + return Ok(oldstate); + } else if version == 6 { + info!("reading safekeeper control file version {}", version); + let mut oldstate = SafeKeeperState::des(&buf[..buf.len()])?; + if oldstate.server.pg_version != 0 { + return Ok(oldstate); + } + + // set pg_version to the default v14 + info!("setting pg_version to 140005"); + oldstate.server.pg_version = 140005; + return Ok(oldstate); } bail!("unsupported safekeeper control file version {}", version) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index eec24faf2f..7869aa8b3a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -25,7 +25,7 @@ use utils::{ }; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 6; +pub const SK_FORMAT_VERSION: u32 = 7; const SK_PROTOCOL_VERSION: u32 = 2; pub const UNKNOWN_SERVER_VERSION: u32 = 0; @@ -639,7 +639,6 @@ where let mut state = self.state.clone(); state.server.system_id = msg.system_id; - state.server.wal_seg_size = msg.wal_seg_size; if msg.pg_version != UNKNOWN_SERVER_VERSION { state.server.pg_version = msg.pg_version; } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index c16fc9f40c..dc7503af65 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -314,6 +314,8 @@ impl Timeline { ttid: TenantTimelineId, wal_backup_launcher_tx: Sender, ) -> Result { + let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); + let shared_state = SharedState::restore(&conf, &ttid)?; let (commit_lsn_watch_tx, commit_lsn_watch_rx) = 
watch::channel(shared_state.sk.state.commit_lsn); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index eee7c703f9..8fbd479d95 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -111,6 +111,10 @@ impl PhysicalStorage { // Find out where stored WAL ends, starting at commit_lsn which is a // known recent record boundary (unless we don't have WAL at all). + // + // NB: find_end_of_wal MUST be backwards compatible with the previously + // written WAL. If find_end_of_wal fails to read any WAL written by an + // older version of the code, we could lose data forever. let write_lsn = if state.commit_lsn == Lsn(0) { Lsn(0) } else { @@ -125,17 +129,7 @@ impl PhysicalStorage { wal_seg_size, state.commit_lsn, )?, - pg_majorversion => { - // This is a quik hack to work with old timelines that don't have - // pg_version in the control file. We can remove it after this is fixed properly. - const DEFAULT_PG_MAJOR_VERSION: u32 = 14; - warn!("unknown postgres version {pg_majorversion} assume {DEFAULT_PG_MAJOR_VERSION}"); - postgres_ffi::v14::xlog_utils::find_end_of_wal( - &timeline_dir, - wal_seg_size, - state.commit_lsn, - )? - } + _ => bail!("unsupported postgres version: {}", state.server.pg_version), } }; From fb68d01449edb4be9a0d064d69a442dd3688783e Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 26 Sep 2022 23:57:02 +0300 Subject: [PATCH 0855/1022] Preserve task result in TaskHandle by keeping join handle around (#2521) * Preserve task result in TaskHandle by keeping join handle around The solution is not great, but it should help to debug the staging issue. I tried to do it in the least destructive way. TaskHandle is used only in one place, so it is ok to use something less generic unless we want to extend its usage across the codebase. In its current form, for its single usage place, it looks too abstract. Some problems around this code: 1. Task can drop the event sender and continue running 2. Task cannot be joined several times (probably not needed, but still, can be surprising) 3. Had to split the task event into two types because anyhow::Error does not implement Clone. So TaskContinueEvent derives Clone but the usual task event does not. The Clone requirement appears because we clone the current value in next_task_event. Taking it by reference is complicated. 4. The split between Init and Started is artificial and comes from the watch::channel requirement to have some initial value. To summarize 3 and 4: it may be a better idea to use an RwLock or a bounded channel instead --- pageserver/src/walreceiver.rs | 76 ++++++++++++++----- .../src/walreceiver/connection_manager.rs | 43 ++++++----- .../src/walreceiver/walreceiver_connection.rs | 16 ++-- 3 files changed, 89 insertions(+), 46 deletions(-) diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index deac299747..c7de24080a 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -31,7 +31,6 @@ use etcd_broker::Client; use itertools::Itertools; use once_cell::sync::OnceCell; use std::future::Future; -use std::sync::Arc; use tokio::sync::watch; use tracing::*; use url::Url; @@ -88,37 +87,44 @@ pub fn is_etcd_client_initialized() -> bool { /// That may lead to certain events not being observed by the listener.
#[derive(Debug)] pub struct TaskHandle { - events_receiver: watch::Receiver>, + join_handle: Option>>, + events_receiver: watch::Receiver>, cancellation: watch::Sender<()>, } -#[derive(Debug, Clone)] pub enum TaskEvent { + Update(TaskStateUpdate), + End(anyhow::Result<()>), +} + +#[derive(Debug, Clone)] +pub enum TaskStateUpdate { + Init, Started, - NewEvent(E), - End, + Progress(E), } impl TaskHandle { /// Initializes the task, starting it immediately after the creation. pub fn spawn( - task: impl FnOnce(Arc>>, watch::Receiver<()>) -> Fut + Send + 'static, + task: impl FnOnce(watch::Sender>, watch::Receiver<()>) -> Fut + + Send + + 'static, ) -> Self where - Fut: Future> + Send, - E: Sync + Send + 'static, + Fut: Future> + Send, + E: Send + Sync + 'static, { let (cancellation, cancellation_receiver) = watch::channel(()); - let (events_sender, events_receiver) = watch::channel(TaskEvent::Started); - let events_sender = Arc::new(events_sender); + let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started); - let sender = Arc::clone(&events_sender); - let _ = WALRECEIVER_RUNTIME.spawn(async move { - events_sender.send(TaskEvent::Started).ok(); - task(sender, cancellation_receiver).await + let join_handle = WALRECEIVER_RUNTIME.spawn(async move { + events_sender.send(TaskStateUpdate::Started).ok(); + task(events_sender, cancellation_receiver).await }); TaskHandle { + join_handle: Some(join_handle), events_receiver, cancellation, } @@ -126,15 +132,45 @@ impl TaskHandle { async fn next_task_event(&mut self) -> TaskEvent { match self.events_receiver.changed().await { - Ok(()) => self.events_receiver.borrow().clone(), - Err(_task_channel_part_dropped) => TaskEvent::End, + Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()), + Err(_task_channel_part_dropped) => { + TaskEvent::End(match self.join_handle.take() { + Some(jh) => { + if !jh.is_finished() { + warn!("sender is dropped while join handle is still alive"); + } + + jh.await + .map_err(|e| anyhow::anyhow!("Failed to join task: {e}")) + .and_then(|x| x) + } + None => { + // Another option is to have an enum, join handle or result and give away the reference to it + Err(anyhow::anyhow!("Task was joined more than once")) + } + }) + } } } /// Aborts current task, waiting for it to finish. 
- pub async fn shutdown(mut self) { - self.cancellation.send(()).ok(); - // wait until the sender is dropped - while self.events_receiver.changed().await.is_ok() {} + pub async fn shutdown(self) { + match self.join_handle { + Some(jh) => { + self.cancellation.send(()).ok(); + match jh.await { + Ok(Ok(())) => debug!("Shutdown success"), + Ok(Err(e)) => error!("Shutdown task error: {e:?}"), + Err(join_error) => { + if join_error.is_cancelled() { + error!("Shutdown task was cancelled"); + } else { + error!("Shutdown task join error: {join_error}") + } + } + } + } + None => {} + } } } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index a82e69e5ba..29179e9871 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -16,10 +16,10 @@ use std::{ time::Duration, }; -use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::task_mgr::WALRECEIVER_RUNTIME; use crate::tenant::Timeline; +use crate::{task_mgr, walreceiver::TaskStateUpdate}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -145,19 +145,26 @@ async fn connection_manager_loop_step( let wal_connection = walreceiver_state.wal_connection.as_mut() .expect("Should have a connection, as checked by the corresponding select! guard"); match wal_connection_update { - TaskEvent::Started => {}, - TaskEvent::NewEvent(status) => { - if status.has_processed_wal { - // We have advanced last_record_lsn by processing the WAL received - // from this safekeeper. This is good enough to clean unsuccessful - // retries history and allow reconnecting to this safekeeper without - // sleeping for a long time. - walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); + TaskEvent::Update(c) => { + match c { + TaskStateUpdate::Init | TaskStateUpdate::Started => {}, + TaskStateUpdate::Progress(status) => { + if status.has_processed_wal { + // We have advanced last_record_lsn by processing the WAL received + // from this safekeeper. This is good enough to clean unsuccessful + // retries history and allow reconnecting to this safekeeper without + // sleeping for a long time. 
+ walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); + } + wal_connection.status = status.to_owned(); + } } - wal_connection.status = status; }, - TaskEvent::End => { - debug!("WAL receiving task finished"); + TaskEvent::End(walreceiver_task_result) => { + match walreceiver_task_result { + Ok(()) => debug!("WAL receiving task finished"), + Err(e) => error!("wal receiver task finished with an error: {e:?}"), + } walreceiver_state.drop_old_connection(false).await; }, } @@ -363,13 +370,13 @@ impl WalreceiverState { async move { super::walreceiver_connection::handle_walreceiver_connection( timeline, - &new_wal_source_connstr, - events_sender.as_ref(), + new_wal_source_connstr, + events_sender, cancellation, connect_timeout, ) .await - .map_err(|e| format!("walreceiver connection handling failure: {e:#}")) + .context("walreceiver connection handling failure") } .instrument(info_span!("walreceiver_connection", id = %id)) }); @@ -885,7 +892,7 @@ mod tests { status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status.clone())) .ok(); Ok(()) }), @@ -1145,7 +1152,7 @@ mod tests { status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status.clone())) .ok(); Ok(()) }), @@ -1233,7 +1240,7 @@ mod tests { status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status.clone())) .ok(); Ok(()) }), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 15cfad1dcd..ef5baeb570 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -18,8 +18,7 @@ use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tracing::{debug, error, info, trace, warn}; -use super::TaskEvent; -use crate::metrics::LIVE_CONNECTIONS_COUNT; +use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; use crate::{ task_mgr, task_mgr::TaskKind, @@ -55,8 +54,8 @@ pub struct WalConnectionStatus { /// messages as we go. 
pub async fn handle_walreceiver_connection( timeline: Arc, - wal_source_connstr: &str, - events_sender: &watch::Sender>, + wal_source_connstr: String, + events_sender: watch::Sender>, mut cancellation: watch::Receiver<()>, connect_timeout: Duration, ) -> anyhow::Result<()> { @@ -81,7 +80,7 @@ pub async fn handle_walreceiver_connection( streaming_lsn: None, commit_lsn: None, }; - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); return Ok(()); } @@ -133,7 +132,7 @@ pub async fn handle_walreceiver_connection( connection_status.latest_connection_update = Utc::now().naive_utc(); connection_status.latest_wal_update = Utc::now().naive_utc(); connection_status.commit_lsn = Some(end_of_wal); - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); return Ok(()); } @@ -201,7 +200,7 @@ pub async fn handle_walreceiver_connection( } &_ => {} }; - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } @@ -267,7 +266,8 @@ pub async fn handle_walreceiver_connection( if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg { // We have successfully processed at least one WAL record. 
connection_status.has_processed_wal = true; - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) + { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } From 2233ca2a391e25699b459c76669b7cb5a1396b5f Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Thu, 22 Sep 2022 12:46:20 +0200 Subject: [PATCH 0856/1022] seqwait.rs unit tests don't check return value --- libs/utils/src/seqwait.rs | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index 467b900a13..bf330a482c 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -240,7 +240,6 @@ where mod tests { use super::*; use std::sync::Arc; - use std::thread::sleep; use std::time::Duration; impl MonotonicCounter for i32 { @@ -258,17 +257,19 @@ mod tests { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); let seq3 = Arc::clone(&seq); - tokio::task::spawn(async move { + let jh1 = tokio::task::spawn(async move { seq2.wait_for(42).await.expect("wait_for 42"); let old = seq2.advance(100); assert_eq!(old, 99); - seq2.wait_for(999).await.expect_err("no 999"); + seq2.wait_for_timeout(999, Duration::from_millis(100)) + .await + .expect_err("no 999"); }); - tokio::task::spawn(async move { + let jh2 = tokio::task::spawn(async move { seq3.wait_for(42).await.expect("wait_for 42"); seq3.wait_for(0).await.expect("wait_for 0"); }); - sleep(Duration::from_secs(1)); + tokio::time::sleep(Duration::from_millis(200)).await; let old = seq.advance(99); assert_eq!(old, 0); seq.wait_for(100).await.expect("wait_for 100"); @@ -277,6 +278,9 @@ mod tests { assert_eq!(seq.advance(98), 100); assert_eq!(seq.load(), 100); + jh1.await.unwrap(); + jh2.await.unwrap(); + seq.shutdown(); } @@ -284,15 +288,18 @@ mod tests { async fn seqwait_timeout() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); - tokio::task::spawn(async move { + let jh = tokio::task::spawn(async move { let timeout = Duration::from_millis(1); let res = seq2.wait_for_timeout(42, timeout).await; assert_eq!(res, Err(SeqWaitError::Timeout)); }); - tokio::time::sleep(Duration::from_secs(1)).await; + tokio::time::sleep(Duration::from_millis(200)).await; // This will attempt to wake, but nothing will happen // because the waiter already dropped its Receiver. 
let old = seq.advance(99); - assert_eq!(old, 0) + assert_eq!(old, 0); + jh.await.unwrap(); + + seq.shutdown(); } } From fc7087b16f79a3c0c04f8ea8c6fdc2cd74472f81 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 27 Sep 2022 10:57:59 +0200 Subject: [PATCH 0857/1022] Add metric for loaded safekeeper timelines (#2509) --- safekeeper/src/metrics.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 51138df776..095d80623a 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -2,7 +2,7 @@ use std::time::{Instant, SystemTime}; -use ::metrics::{register_histogram, GaugeVec, Histogram, DISK_WRITE_SECONDS_BUCKETS}; +use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS}; use anyhow::Result; use metrics::{ core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts}, @@ -135,6 +135,7 @@ pub struct TimelineCollector { written_wal_seconds: GaugeVec, flushed_wal_seconds: GaugeVec, collect_timeline_metrics: Gauge, + timelines_count: IntGauge, } impl Default for TimelineCollector { @@ -311,6 +312,13 @@ impl TimelineCollector { .unwrap(); descs.extend(collect_timeline_metrics.desc().into_iter().cloned()); + let timelines_count = IntGauge::new( + "safekeeper_timelines", + "Total number of timelines loaded in-memory", + ) + .unwrap(); + descs.extend(timelines_count.desc().into_iter().cloned()); + TimelineCollector { descs, commit_lsn, @@ -330,6 +338,7 @@ impl TimelineCollector { written_wal_seconds, flushed_wal_seconds, collect_timeline_metrics, + timelines_count, } } } @@ -361,6 +370,7 @@ impl Collector for TimelineCollector { self.flushed_wal_seconds.reset(); let timelines = GlobalTimelines::get_all(); + let timelines_count = timelines.len(); for arc_tli in timelines { let tli = arc_tli.info_for_metrics(); @@ -474,6 +484,10 @@ impl Collector for TimelineCollector { self.collect_timeline_metrics.set(elapsed); mfs.extend(self.collect_timeline_metrics.collect()); + // report total number of timelines + self.timelines_count.set(timelines_count as i64); + mfs.extend(self.timelines_count.collect()); + mfs } } From dabb6d2675717dad380805434e1984a7d0a73f96 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 27 Sep 2022 12:36:17 +0200 Subject: [PATCH 0858/1022] Fix log level for sk startup logs (#2526) --- libs/postgres_ffi/src/xlog_utils.rs | 6 +++--- safekeeper/src/wal_storage.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 2c16cc9cd9..fbd8468a93 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -170,7 +170,7 @@ pub fn find_end_of_wal( let mut curr_lsn = start_lsn; let mut buf = [0u8; XLOG_BLCKSZ]; let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); - info!("find_end_of_wal PG_VERSION: {}", pg_version); + debug!("find_end_of_wal PG_VERSION: {}", pg_version); let mut decoder = WalStreamDecoder::new(start_lsn, pg_version); @@ -182,7 +182,7 @@ pub fn find_end_of_wal( match open_wal_segment(&seg_file_path)? 
{ None => { // no more segments - info!( + debug!( "find_end_of_wal reached end at {:?}, segment {:?} doesn't exist", result, seg_file_path ); @@ -205,7 +205,7 @@ pub fn find_end_of_wal( match decoder.poll_decode() { Ok(Some(record)) => result = record.0, Err(e) => { - info!( + debug!( "find_end_of_wal reached end at {:?}, decode error: {:?}", result, e ); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 8fbd479d95..bc5e2d7b24 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -137,7 +137,7 @@ impl PhysicalStorage { // If not, maybe it's better to call fsync() here to be sure? let flush_lsn = write_lsn; - info!( + debug!( "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, ); From 7b2f9dc9080821985525fd81fd33e10967062fb1 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 3 Oct 2022 13:33:55 +0300 Subject: [PATCH 0859/1022] Reuse existing tenants during attach (#2540) --- pageserver/src/storage_sync.rs | 1 + pageserver/src/tenant.rs | 46 ++++----- pageserver/src/tenant_mgr.rs | 27 +++--- .../test_tenants_with_remote_storage.py | 96 +++++++++++++++++++ 4 files changed, 136 insertions(+), 34 deletions(-) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 776d9214d4..bee460d173 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -639,6 +639,7 @@ pub fn spawn_storage_sync_task( (storage, remote_index_clone, sync_queue), max_sync_errors, ) + .instrument(info_span!("storage_sync_loop")) .await; Ok(()) }, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c9ad3bf232..672ee3a488 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -400,16 +400,19 @@ impl Tenant { timeline_id, metadata.pg_version() ); - let timeline = self - .initialize_new_timeline(timeline_id, metadata, &mut timelines_accessor) - .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; - - match timelines_accessor.entry(timeline.timeline_id) { - Entry::Occupied(_) => bail!( - "Found freshly initialized timeline {} in the tenant map", - timeline.timeline_id + let ancestor = metadata + .ancestor_timeline() + .and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id)) + .cloned(); + match timelines_accessor.entry(timeline_id) { + Entry::Occupied(_) => warn!( + "Timeline {}/{} already exists in the tenant map, skipping its initialization", + self.tenant_id, timeline_id ), Entry::Vacant(v) => { + let timeline = self + .initialize_new_timeline(timeline_id, metadata, ancestor) + .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; v.insert(timeline); } } @@ -609,21 +612,14 @@ impl Tenant { &self, new_timeline_id: TimelineId, new_metadata: TimelineMetadata, - timelines: &mut MutexGuard>>, + ancestor: Option>, ) -> anyhow::Result> { - let ancestor = match new_metadata.ancestor_timeline() { - Some(ancestor_timeline_id) => Some( - timelines - .get(&ancestor_timeline_id) - .cloned() - .with_context(|| { - format!( - "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found" - ) - })?, - ), - None => None, - }; + if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { + anyhow::ensure!( + ancestor.is_some(), + "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found" + ) + } let new_disk_consistent_lsn = new_metadata.disk_consistent_lsn(); let pg_version = 
new_metadata.pg_version(); @@ -1080,8 +1076,12 @@ impl Tenant { ) })?; + let ancestor = new_metadata + .ancestor_timeline() + .and_then(|ancestor_timeline_id| timelines.get(&ancestor_timeline_id)) + .cloned(); let new_timeline = self - .initialize_new_timeline(new_timeline_id, new_metadata, timelines) + .initialize_new_timeline(new_timeline_id, new_metadata, ancestor) .with_context(|| { format!( "Failed to initialize timeline {}/{}", diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index fcb2c18b79..1efd3d4af4 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -107,6 +107,9 @@ pub fn init_tenant_mgr( /// Ignores other timelines that might be present for tenant, but were not passed as a parameter. /// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken", /// and the load continues. +/// +/// Attach happens on startup and sucessful timeline downloads +/// (some subset of timeline files, always including its metadata, after which the new one needs to be registered). pub fn attach_local_tenants( conf: &'static PageServerConf, remote_index: &RemoteIndex, @@ -122,18 +125,20 @@ pub fn attach_local_tenants( ); debug!("Timelines to attach: {local_timelines:?}"); - let tenant = load_local_tenant(conf, tenant_id, remote_index); - { - match tenants_state::write_tenants().entry(tenant_id) { - hash_map::Entry::Occupied(_) => { - error!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state"); - continue; - } - hash_map::Entry::Vacant(v) => { - v.insert(Arc::clone(&tenant)); - } + let mut tenants_accessor = tenants_state::write_tenants(); + let tenant = match tenants_accessor.entry(tenant_id) { + hash_map::Entry::Occupied(o) => { + info!("Tenant {tenant_id} was found in pageserver's memory"); + Arc::clone(o.get()) } - } + hash_map::Entry::Vacant(v) => { + info!("Tenant {tenant_id} was not found in pageserver's memory, loading it"); + let tenant = load_local_tenant(conf, tenant_id, remote_index); + v.insert(Arc::clone(&tenant)); + tenant + } + }; + drop(tenants_accessor); if tenant.current_state() == TenantState::Broken { warn!("Skipping timeline load for broken tenant {tenant_id}") diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 83affac062..d8424e22c8 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -7,19 +7,25 @@ # import asyncio +import os +from pathlib import Path from typing import List, Tuple import pytest +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserverHttpClient, Postgres, RemoteStorageKind, available_remote_storages, wait_for_last_record_lsn, wait_for_upload, + wait_until, ) from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import query_scalar async def tenant_workload(env: NeonEnv, pg: Postgres): @@ -93,3 +99,93 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem # run final checkpoint manually to flush all the data to remote storage pageserver_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_tenants_attached_after_download( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): 
+ neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="remote_storage_kind", + ) + + data_id = 1 + data_secret = "very secret secret" + + ##### First start, insert secret data and upload it to the remote storage + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + client = env.pageserver.http_client() + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + for checkpoint_number in range(1, 3): + with pg.cursor() as cur: + cur.execute( + f""" + CREATE TABLE t{checkpoint_number}(id int primary key, secret text); + INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); + """ + ) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + log.info(f"waiting for checkpoint {checkpoint_number} upload") + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(client, tenant_id, timeline_id, current_lsn) + log.info(f"upload of checkpoint {checkpoint_number} is done") + + ##### Stop the pageserver, erase its layer file to force it being downloaded from S3 + env.postgres.stop_all() + env.pageserver.stop() + + timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + local_layer_deleted = False + for path in Path.iterdir(timeline_dir): + if path.name.startswith("00000"): + # Looks like a layer file. 
Remove it + os.remove(path) + local_layer_deleted = True + break + assert local_layer_deleted, f"Found no local layer files to delete in directory {timeline_dir}" + + ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory + env.pageserver.start() + client = env.pageserver.http_client() + + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: expect_tenant_to_download_timeline(client, tenant_id), + ) + + restored_timelines = client.timeline_list(tenant_id) + assert ( + len(restored_timelines) == 1 + ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" + retored_timeline = restored_timelines[0] + assert retored_timeline["timeline_id"] == str( + timeline_id + ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" + + +def expect_tenant_to_download_timeline( + client: NeonPageserverHttpClient, + tenant_id: TenantId, +): + for tenant in client.tenant_list(): + if tenant["id"] == str(tenant_id): + assert not tenant.get( + "has_in_progress_downloads", True + ), f"Tenant {tenant_id} should have no downloads in progress" + return + assert False, f"Tenant {tenant_id} is missing on pageserver" From 4f2ac51bdd21ada43efc2b30ad2b3724ed9331cf Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 30 Sep 2022 12:21:56 +0300 Subject: [PATCH 0860/1022] Bump rustc to 1.61 --- .github/workflows/build_and_test.yml | 6 +++--- .github/workflows/codestyle.yml | 2 +- rust-toolchain.toml | 9 ++++----- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8a7cdec89c..22042489a8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -127,8 +127,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} - v8-${{ runner.os }}-${{ matrix.build_type }}-cargo- + v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + v9-${{ runner.os }}-${{ matrix.build_type }}-cargo- - name: Cache postgres v14 build id: cache_pg_14 @@ -389,7 +389,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 641943199e..6d39958bab 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -106,7 +106,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git target - key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust + key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust - name: Run cargo clippy run: ./run_clippy.sh diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 1a27e92fec..5aa0f8d4e5 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,11 +1,10 @@ [toolchain] # We try to stick to a toolchain version that is widely available on popular distributions, so that most people # can use the toolchain that comes with their operating system. But if there's a feature we miss badly from a later -# version, we can consider updating. 
As of this writing, 1.60 is available on Debian 'experimental' but not yet on -# 'testing' or even 'unstable', which is a bit more cutting-edge than we'd like. Hopefully the 1.60 packages reach -# 'testing' soon (and similarly for the other distributions). -# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package. -channel = "1.60" # do update GitHub CI cache values for rust builds, when changing this value +# version, we can consider updating. +# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package, +# we use "unstable" version number as the highest version used in the project by default. +channel = "1.61" # do update GitHub CI cache values for rust builds, when changing this value profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From 31123d1fa89f445581826559e8ed440455f01cff Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 3 Oct 2022 17:44:17 +0300 Subject: [PATCH 0861/1022] Silence clippies, minor doc fix (#2543) * doc: remove stray backtick * chore: clippy::let_unit_value * chore: silence useless_transmute, duplicate_mod * chore: remove allowing deref_nullptr not needed since bindgen 0.60.0. * chore: remove repeated allowed lints they are already allowed from the crate root. --- docs/sourcetree.md | 2 +- libs/postgres_ffi/src/lib.rs | 8 +++++--- libs/postgres_ffi/src/xlog_utils.rs | 6 ------ pageserver/src/tenant/timeline.rs | 2 +- proxy/src/cancellation.rs | 4 ++-- 5 files changed, 9 insertions(+), 13 deletions(-) diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 8043450a55..c468134b81 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -96,7 +96,7 @@ A single virtual environment with all dependencies is described in the single `P sudo apt install python3.9 ``` - Install `poetry` - - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation)`. + - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation). - Install dependencies via `./scripts/pysync`. - Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile)) so if you have different version some linting tools can yield different result locally vs in the CI. diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 95ecc7b061..f3dad159be 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -3,9 +3,11 @@ #![allow(non_snake_case)] // bindgen creates some unsafe code with no doc comments. #![allow(clippy::missing_safety_doc)] -// suppress warnings on rust 1.53 due to bindgen unit tests. -// https://github.com/rust-lang/rust-bindgen/issues/1651 -#![allow(deref_nullptr)] +// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code. +#![allow(clippy::useless_transmute)] +// modules included with the postgres_ffi macro depend on the types of the specific version's +// types, and trigger a too eager lint. 
+#![allow(clippy::duplicate_mod)] use bytes::Bytes; use utils::bin_ser::SerializeError; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index fbd8468a93..953723a8f0 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -57,12 +57,10 @@ pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; /// in order to let CLOG_TRUNCATE mechanism correctly extend CLOG. const XID_CHECKPOINT_INTERVAL: u32 = 1024; -#[allow(non_snake_case)] pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo { (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo } -#[allow(non_snake_case)] pub fn XLogSegNoOffsetToRecPtr( segno: XLogSegNo, offset: u32, @@ -71,7 +69,6 @@ pub fn XLogSegNoOffsetToRecPtr( segno * (wal_segsz_bytes as u64) + (offset as u64) } -#[allow(non_snake_case)] pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { format!( "{:>08X}{:>08X}{:>08X}", @@ -81,7 +78,6 @@ pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize ) } -#[allow(non_snake_case)] pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) { let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; @@ -89,12 +85,10 @@ pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLin (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli) } -#[allow(non_snake_case)] pub fn IsXLogFileName(fname: &str) -> bool { return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()); } -#[allow(non_snake_case)] pub fn IsPartialXLogFileName(fname: &str) -> bool { fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 74e873e632..247e076230 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -627,7 +627,7 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); drop(tenant_conf_guard); let self_clone = Arc::clone(self); - let _ = spawn_connection_manager_task( + spawn_connection_manager_task( self.conf.broker_etcd_prefix.clone(), self_clone, walreceiver_connect_timeout, diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 92f8e35dab..eb9312e6bb 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -129,13 +129,13 @@ mod tests { assert!(CANCEL_MAP.contains(&session)); tx.send(()).expect("failed to send"); - let () = futures::future::pending().await; // sleep forever + futures::future::pending::<()>().await; // sleep forever Ok(()) })); // Wait until the task has been spawned. - let () = rx.await.context("failed to hear from the task")?; + rx.await.context("failed to hear from the task")?; // Drop the session's entry by cancelling the task. task.abort(); From 537b2c1ae6d9c61ae7ed4a02c04a370354b3bcdb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 4 Oct 2022 10:49:39 +0300 Subject: [PATCH 0862/1022] Remove unnecessary check for open PostgreSQL TCP port. The loop checked if the TCP port is open for connections, by trying to connect to it. That seems unnecessary. By the time the postmaster.pid file says that it's ready, the port should be open. Remove that check. 
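For illustration, the readiness test that remains after this change boils down to reading the status word that PostgreSQL appends to postmaster.pid. A minimal sketch (the helper name `postgres_is_ready` is invented here; the real `wait_for_postgres` in the diff below additionally enforces a timeout and notices if the postgres child process exits):

    use std::fs;
    use std::path::Path;

    // PostgreSQL appends a status line to postmaster.pid; once the last line
    // reads "ready", the server accepts connections, so probing the TCP port
    // on top of that is redundant.
    fn postgres_is_ready(pgdata: &Path) -> bool {
        fs::read_to_string(pgdata.join("postmaster.pid"))
            .ok()
            .and_then(|pid_file| pid_file.lines().last().map(|line| line.trim() == "ready"))
            .unwrap_or(false)
    }
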
--- compute_tools/src/compute.rs | 9 +-------- compute_tools/src/pg_helpers.rs | 22 ++++++++-------------- 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 58469b1c97..1e848627e3 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -258,14 +258,7 @@ impl ComputeNode { .spawn() .expect("cannot start postgres process"); - // Try default Postgres port if it is not provided - let port = self - .spec - .cluster - .settings - .find("port") - .unwrap_or_else(|| "5432".to_string()); - wait_for_postgres(&mut pg, &port, pgdata_path)?; + wait_for_postgres(&mut pg, pgdata_path)?; // If connection fails, // it may be the old node with `zenith_admin` superuser. diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index ac065fa60c..8802dae639 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -1,11 +1,9 @@ use std::fmt::Write; use std::fs::File; use std::io::{BufRead, BufReader}; -use std::net::{SocketAddr, TcpStream}; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; -use std::str::FromStr; use std::{fs, thread, time}; use anyhow::{bail, Result}; @@ -230,21 +228,16 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { Ok(postgres_dbs) } -/// Wait for Postgres to become ready to accept connections: -/// - state should be `ready` in the `pgdata/postmaster.pid` -/// - and we should be able to connect to 127.0.0.1:5432 -pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> { +/// Wait for Postgres to become ready to accept connections. It's ready to +/// accept connections when the state-field in `pgdata/postmaster.pid` says +/// 'ready'. +pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { let pid_path = pgdata.join("postmaster.pid"); let mut slept: u64 = 0; // ms let pause = time::Duration::from_millis(100); - let timeout = time::Duration::from_millis(10); - let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap(); - loop { - // Sleep POSTGRES_WAIT_TIMEOUT at max (a bit longer actually if consider a TCP timeout, - // but postgres starts listening almost immediately, even if it is not really - // ready to accept connections). + // Sleep POSTGRES_WAIT_TIMEOUT at max if slept >= POSTGRES_WAIT_TIMEOUT { bail!("timed out while waiting for Postgres to start"); } @@ -263,10 +256,9 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<() // Pid file could be there and we could read it, but it could be empty, for example. if let Some(Ok(line)) = last_line { let status = line.trim(); - let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok(); // Now Postgres is ready to accept connections - if status == "ready" && can_connect { + if status == "ready" { break; } } @@ -276,6 +268,8 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<() slept += 100; } + log::info!("PostgreSQL is now running, continuing to configure it"); + Ok(()) } From 9b9bbad462160bf75df7ee69bc83a4da9eee2b38 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 4 Oct 2022 13:00:15 +0300 Subject: [PATCH 0863/1022] Use 'notify' crate to wait for PostgreSQL startup. Compute node startup time is very important. After launching PostgreSQL, use 'notify' to be notified immediately when it has updated the PID file, instead of polling. 
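Schematically, the event-driven wait looks like the following sketch (illustrative only: `wait_for_ready_marker` and the `is_ready` callback are invented names, and the real implementation in the diff below also re-targets the watch at postmaster.pid once that file appears and bails out if the postgres process dies):

    use std::path::Path;
    use std::sync::mpsc::channel;
    use std::time::Duration;

    use notify::{RecursiveMode, Watcher};

    // Block until `is_ready()` reports true, waking up on filesystem events in
    // the data directory instead of sleeping on a fixed interval; the 100 ms
    // recv timeout is only a safety net against missed events.
    fn wait_for_ready_marker(pgdata: &Path, is_ready: impl Fn() -> bool) -> notify::Result<()> {
        let (tx, rx) = channel();
        let mut watcher = notify::recommended_watcher(move |res| {
            let _ = tx.send(res);
        })?;
        watcher.watch(pgdata, RecursiveMode::NonRecursive)?;
        while !is_ready() {
            let _ = rx.recv_timeout(Duration::from_millis(100));
        }
        Ok(())
    }
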
The polling loop had 100 ms interval so this shaves up to 100 ms from the startup time. --- Cargo.lock | 70 +++++++++++++++++++++++++++++++++ compute_tools/Cargo.toml | 2 + compute_tools/src/pg_helpers.rs | 62 +++++++++++++++++++++++------ workspace_hack/Cargo.toml | 1 + 4 files changed, 124 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ddb10352b8..69a8fa19ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -497,8 +497,10 @@ dependencies = [ "chrono", "clap 3.2.16", "env_logger", + "futures", "hyper", "log", + "notify", "postgres", "regex", "serde", @@ -1072,6 +1074,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "fsevent-sys" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2" +dependencies = [ + "libc", +] + [[package]] name = "futures" version = "0.3.21" @@ -1493,6 +1504,26 @@ dependencies = [ "str_stack", ] +[[package]] +name = "inotify" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +dependencies = [ + "bitflags", + "inotify-sys", + "libc", +] + +[[package]] +name = "inotify-sys" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" +dependencies = [ + "libc", +] + [[package]] name = "instant" version = "0.1.12" @@ -1552,6 +1583,26 @@ dependencies = [ "simple_asn1", ] +[[package]] +name = "kqueue" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6112e8f37b59803ac47a42d14f1f3a59bbf72fc6857ffc5be455e28a691f8e" +dependencies = [ + "kqueue-sys", + "libc", +] + +[[package]] +name = "kqueue-sys" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587" +dependencies = [ + "bitflags", + "libc", +] + [[package]] name = "kstring" version = "1.0.6" @@ -1797,6 +1848,24 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "notify" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2c66da08abae1c024c01d635253e402341b4060a12e99b31c7594063bf490a" +dependencies = [ + "bitflags", + "crossbeam-channel", + "filetime", + "fsevent-sys", + "inotify", + "kqueue", + "libc", + "mio", + "walkdir", + "winapi", +] + [[package]] name = "num-bigint" version = "0.4.3" @@ -4142,6 +4211,7 @@ dependencies = [ "bstr", "bytes", "chrono", + "crossbeam-utils", "either", "fail", "hashbrown", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index b13f7f191d..43cf7ae2dd 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -8,8 +8,10 @@ anyhow = "1.0" chrono = "0.4" clap = "3.0" env_logger = "0.9" +futures = "0.3.13" hyper = { version = "0.14", features = ["full"] } log = { version = "0.4", features = ["std", "serde"] } +notify = "5.0.0" postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } regex = "1" serde = { version = "1.0", features = ["derive"] } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 8802dae639..769dbfac73 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -1,16 +1,19 @@ use std::fmt::Write; +use std::fs; use std::fs::File; use std::io::{BufRead, BufReader}; use 
std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; -use std::{fs, thread, time}; +use std::time::{Duration, Instant}; use anyhow::{bail, Result}; use postgres::{Client, Transaction}; use serde::Deserialize; -const POSTGRES_WAIT_TIMEOUT: u64 = 60 * 1000; // milliseconds +use notify::{RecursiveMode, Watcher}; + +const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds /// Rust representation of Postgres role info with only those fields /// that matter for us. @@ -233,29 +236,63 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { /// 'ready'. pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { let pid_path = pgdata.join("postmaster.pid"); - let mut slept: u64 = 0; // ms - let pause = time::Duration::from_millis(100); + // PostgreSQL writes line "ready" to the postmaster.pid file, when it has + // completed initialization and is ready to accept connections. We want to + // react quickly and perform the rest of our initialization as soon as + // PostgreSQL starts accepting connections. Use 'notify' to be notified + // whenever the PID file is changed, and whenever it changes, read it to + // check if it's now "ready". + // + // You cannot actually watch a file before it exists, so we first watch the + // data directory, and once the postmaster.pid file appears, we switch to + // watch the file instead. We also wake up every 100 ms to poll, just in + // case we miss some events for some reason. Not strictly necessary, but + // better safe than sorry. + let (tx, rx) = std::sync::mpsc::channel(); + let mut watcher = notify::recommended_watcher(move |res| { + let _ = tx.send(res); + })?; + watcher.watch(pgdata, RecursiveMode::NonRecursive)?; + + let started_at = Instant::now(); + let mut postmaster_pid_seen = false; loop { - // Sleep POSTGRES_WAIT_TIMEOUT at max - if slept >= POSTGRES_WAIT_TIMEOUT { - bail!("timed out while waiting for Postgres to start"); - } - if let Ok(Some(status)) = pg.try_wait() { // Postgres exited, that is not what we expected, bail out earlier. let code = status.code().unwrap_or(-1); bail!("Postgres exited unexpectedly with code {}", code); } + let res = rx.recv_timeout(Duration::from_millis(100)); + log::debug!("woken up by notify: {res:?}"); + // If there are multiple events in the channel already, we only need to be + // check once. Swallow the extra events before we go ahead to check the + // pid file. + while let Ok(res) = rx.try_recv() { + log::debug!("swallowing extra event: {res:?}"); + } + // Check that we can open pid file first. if let Ok(file) = File::open(&pid_path) { + if !postmaster_pid_seen { + log::debug!("postmaster.pid appeared"); + watcher + .unwatch(pgdata) + .expect("Failed to remove pgdata dir watch"); + watcher + .watch(&pid_path, RecursiveMode::NonRecursive) + .expect("Failed to add postmaster.pid file watch"); + postmaster_pid_seen = true; + } + let file = BufReader::new(file); let last_line = file.lines().last(); // Pid file could be there and we could read it, but it could be empty, for example. if let Some(Ok(line)) = last_line { let status = line.trim(); + log::debug!("last line of postmaster.pid: {status:?}"); // Now Postgres is ready to accept connections if status == "ready" { @@ -264,8 +301,11 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { } } - thread::sleep(pause); - slept += 100; + // Give up after POSTGRES_WAIT_TIMEOUT. 
+ let duration = started_at.elapsed(); + if duration >= POSTGRES_WAIT_TIMEOUT { + bail!("timed out while waiting for Postgres to start"); + } } log::info!("PostgreSQL is now running, continuing to configure it"); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f37a42945e..6977665c7d 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -19,6 +19,7 @@ anyhow = { version = "1", features = ["backtrace", "std"] } bstr = { version = "0.2", features = ["lazy_static", "regex-automata", "serde", "serde1", "serde1-nostd", "std", "unicode"] } bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } +crossbeam-utils = { version = "0.8", features = ["once_cell", "std"] } either = { version = "1", features = ["use_std"] } fail = { version = "0.5", default-features = false, features = ["failpoints"] } hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } From 5cf53786f9196c9461119ed5a0653707b7804e96 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 26 Sep 2022 21:47:08 +0300 Subject: [PATCH 0864/1022] Improve pytest ergonomics 1. Disable perf tests by default 2. Add instruction to run tests in parallel --- pytest.ini | 1 + test_runner/README.md | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/pytest.ini b/pytest.ini index bfa07e520b..7197b078c6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -5,6 +5,7 @@ filterwarnings = ignore:record_property is incompatible with junit_family:pytest.PytestWarning addopts = -m 'not remote_cluster' + --ignore=test_runner/performance markers = remote_cluster testpaths = diff --git a/test_runner/README.md b/test_runner/README.md index d6ee5730ac..e066ac3235 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -56,6 +56,14 @@ If you want to run all tests that have the string "bench" in their names: `./scripts/pytest -k bench` +To run tests in parellel we utilize `pytest-xdist` plugin. By default everything runs single threaded. Number of workers can be specified with `-n` argument: + +`./scripts/pytest -n4` + +By default performance tests are excluded. To run them explicitly pass performance tests selection to the script: + +`./scripts/pytest test_runner/performance` + Useful environment variables: `NEON_BIN`: The directory where neon binaries can be found. From 231dfbaed630963e709166677908fff0b558e35e Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 3 Oct 2022 22:13:26 +0300 Subject: [PATCH 0865/1022] Do not remove empty timelines/ directory for tenants --- pageserver/src/tenant_mgr.rs | 44 ++++++++++++++++++----------- test_runner/regress/test_tenants.py | 37 ++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 1efd3d4af4..0e8ee8c067 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -108,6 +108,10 @@ pub fn init_tenant_mgr( /// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken", /// and the load continues. /// +/// For successful tenant attach, it first has to have a `timelines/` subdirectory and a tenant config file that's loaded into memory successfully. +/// If either of the conditions fails, the tenant will be added to memory with [`TenantState::Broken`] state, otherwise we start to load its timelines. 
+/// Alternatively, tenant is considered loaded successfully, if it's already in pageserver's memory (i.e. was loaded already before). +/// /// Attach happens on startup and sucessful timeline downloads /// (some subset of timeline files, always including its metadata, after which the new one needs to be registered). pub fn attach_local_tenants( @@ -173,16 +177,28 @@ fn load_local_tenant( remote_index.clone(), conf.remote_storage_config.is_some(), )); - match Tenant::load_tenant_config(conf, tenant_id) { - Ok(tenant_conf) => { - tenant.update_tenant_config(tenant_conf); - tenant.activate(false); - } - Err(e) => { - error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}"); - tenant.set_state(TenantState::Broken); + + let tenant_timelines_dir = conf.timelines_path(&tenant_id); + if !tenant_timelines_dir.is_dir() { + error!( + "Tenant {} has no timelines directory at {}", + tenant_id, + tenant_timelines_dir.display() + ); + tenant.set_state(TenantState::Broken); + } else { + match Tenant::load_tenant_config(conf, tenant_id) { + Ok(tenant_conf) => { + tenant.update_tenant_config(tenant_conf); + tenant.activate(false); + } + Err(e) => { + error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}"); + tenant.set_state(TenantState::Broken); + } } } + tenant } @@ -630,14 +646,10 @@ fn collect_timelines_for_tenant( } if tenant_timelines.is_empty() { - match remove_if_empty(&timelines_dir) { - Ok(true) => info!( - "Removed empty tenant timelines directory {}", - timelines_dir.display() - ), - Ok(false) => (), - Err(e) => error!("Failed to remove empty tenant timelines directory: {e:?}"), - } + // this is normal, we've removed all broken, empty and temporary timeline dirs + // but should allow the tenant to stay functional and allow creating new timelines + // on a restart, we require tenants to have the timelines dir, so leave it on disk + debug!("Tenant {tenant_id} has no timelines loaded"); } Ok((tenant_id, tenant_timelines)) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 52b9e6369c..ba5109a16f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,4 +1,5 @@ import os +import shutil from contextlib import closing from datetime import datetime from pathlib import Path @@ -201,3 +202,39 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde post_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) assert post_detach_samples == set() + + +def test_pageserver_with_empty_tenants(neon_simple_env: NeonEnv): + env = neon_simple_env + client = env.pageserver.http_client() + + tenant_without_timelines_dir = env.initial_tenant + shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_without_timelines_dir) / "timelines") + + tenant_with_empty_timelines_dir = client.tenant_create() + for timeline_dir_entry in Path.iterdir( + Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines" + ): + if timeline_dir_entry.is_dir(): + shutil.rmtree(timeline_dir_entry) + else: + timeline_dir_entry.unlink() + + env.postgres.stop_all() + for _ in range(0, 3): + env.pageserver.stop() + env.pageserver.start() + + client = env.pageserver.http_client() + tenants = client.tenant_list() + + assert ( + len(tenants) == 1 + ), "Pageserver should attach only tenants with empty timelines/ dir on restart" + loaded_tenant = tenants[0] + assert loaded_tenant["id"] == str( + tenant_with_empty_timelines_dir + ), f"Tenant 
{tenant_with_empty_timelines_dir} should be loaded as the only one with tenants/ directory" + assert loaded_tenant["state"] == { + "Active": {"background_jobs_running": False} + }, "Empty tenant should be loaded and ready for timeline creation" From d823e84ed5497c61ff04b9a4f689470c62ec2e9a Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 3 Oct 2022 23:14:39 +0300 Subject: [PATCH 0866/1022] Allow attaching tenants with zero timelines --- pageserver/src/http/routes.rs | 13 ++++-- test_runner/fixtures/neon_fixtures.py | 7 +++- test_runner/regress/test_tenants.py | 57 ++++++++++++++++++++------- 3 files changed, 59 insertions(+), 18 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 55429420a8..a1bd65c308 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -337,9 +337,16 @@ async fn tenant_attach_handler(request: Request) -> Result, info!("Handling tenant attach {tenant_id}"); tokio::task::spawn_blocking(move || match tenant_mgr::get_tenant(tenant_id, false) { - Ok(_) => Err(ApiError::Conflict( - "Tenant is already present locally".to_owned(), - )), + Ok(tenant) => { + if tenant.list_timelines().is_empty() { + info!("Attaching to tenant {tenant_id} with zero timelines"); + Ok(()) + } else { + Err(ApiError::Conflict( + "Tenant is already present locally".to_owned(), + )) + } + } Err(_) => Ok(()), }) .await diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index aa9fd68df5..5c2c3edbd8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -455,6 +455,9 @@ class RemoteStorageKind(enum.Enum): LOCAL_FS = "local_fs" MOCK_S3 = "mock_s3" REAL_S3 = "real_s3" + # Pass to tests that are generic to remote storage + # to ensure the test pass with or without the remote storage + NOOP = "noop" def available_remote_storages() -> List[RemoteStorageKind]: @@ -583,7 +586,9 @@ class NeonEnvBuilder: test_name: str, force_enable: bool = True, ): - if remote_storage_kind == RemoteStorageKind.LOCAL_FS: + if remote_storage_kind == RemoteStorageKind.NOOP: + return + elif remote_storage_kind == RemoteStorageKind.LOCAL_FS: self.enable_local_fs_remote_storage(force_enable=force_enable) elif remote_storage_kind == RemoteStorageKind.MOCK_S3: self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index ba5109a16f..f49b6fccb9 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -8,8 +8,13 @@ from typing import List import pytest from fixtures.log_helper import log from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder -from fixtures.types import Lsn, TenantId +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + RemoteStorageKind, + available_remote_storages, +) +from fixtures.types import Lsn, TenantId, TimelineId from prometheus_client.samples import Sample @@ -204,26 +209,50 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde assert post_detach_samples == set() -def test_pageserver_with_empty_tenants(neon_simple_env: NeonEnv): - env = neon_simple_env +# Check that empty tenants work with or without the remote storage +@pytest.mark.parametrize( + "remote_storage_kind", available_remote_storages() + [RemoteStorageKind.NOOP] +) +def test_pageserver_with_empty_tenants( + 
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_pageserver_with_empty_tenants", + ) + + env = neon_env_builder.init_start() client = env.pageserver.http_client() tenant_without_timelines_dir = env.initial_tenant + log.info( + f"Tenant {tenant_without_timelines_dir} becomes broken: it abnormally looses tenants/ directory and is expected to be completely ignored when pageserver restarts" + ) shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_without_timelines_dir) / "timelines") tenant_with_empty_timelines_dir = client.tenant_create() - for timeline_dir_entry in Path.iterdir( - Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines" - ): - if timeline_dir_entry.is_dir(): - shutil.rmtree(timeline_dir_entry) - else: - timeline_dir_entry.unlink() + log.info( + f"Tenant {tenant_with_empty_timelines_dir} gets all of its timelines deleted: still should be functional" + ) + temp_timelines = client.timeline_list(tenant_with_empty_timelines_dir) + for temp_timeline in temp_timelines: + client.timeline_delete( + tenant_with_empty_timelines_dir, TimelineId(temp_timeline["timeline_id"]) + ) + files_in_timelines_dir = sum( + 1 + for _p in Path.iterdir( + Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines" + ) + ) + assert ( + files_in_timelines_dir == 0 + ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory" + # Trigger timeline reinitialization after pageserver restart env.postgres.stop_all() - for _ in range(0, 3): - env.pageserver.stop() - env.pageserver.start() + env.pageserver.stop() + env.pageserver.start() client = env.pageserver.http_client() tenants = client.tenant_list() From 580584c8fce303da90d898d81703ab54e81e39b9 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Tue, 4 Oct 2022 19:14:45 +0100 Subject: [PATCH 0867/1022] Remove control_plane deps on pageserver/safekeeper (#2513) Creates new `pageserver_api` and `safekeeper_api` crates to serve as the shared dependencies. Should reduce both recompile times and cold compile times. Decreases the size of the optimized `neon_local` binary: 380M -> 179M. No significant changes for anything else (mostly as expected). 
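As a rough sketch of the new dependency direction (assuming the `pageserver_api` crate added below; the `describe` helper is invented for illustration), a consumer such as `control_plane` can now work purely with the serializable API types without linking the whole pageserver:

    use pageserver_api::models::TenantState;

    // Match on the shared API enum that used to live inside the pageserver crate.
    fn describe(state: &TenantState) -> &'static str {
        match state {
            TenantState::Active { .. } => "active",
            TenantState::Paused => "paused",
            TenantState::Broken => "broken",
        }
    }

The same pattern applies on the safekeeper side, where `safekeeper_api` only exposes the models and the default listen addresses.
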
--- Cargo.lock | 28 +++++++++++++++++-- control_plane/Cargo.toml | 6 ++-- control_plane/src/bin/neon_local.rs | 6 ++-- control_plane/src/safekeeper.rs | 2 +- control_plane/src/storage.rs | 2 +- libs/pageserver_api/Cargo.toml | 12 ++++++++ libs/pageserver_api/src/lib.rs | 9 ++++++ .../pageserver_api/src}/models.rs | 12 +++++++- libs/safekeeper_api/Cargo.toml | 12 ++++++++ libs/safekeeper_api/src/lib.rs | 10 +++++++ .../safekeeper_api/src}/models.rs | 0 pageserver/Cargo.toml | 1 + pageserver/src/config.rs | 8 +++--- pageserver/src/http/mod.rs | 3 +- pageserver/src/tenant.rs | 13 +-------- safekeeper/Cargo.toml | 1 + safekeeper/src/http/mod.rs | 3 +- safekeeper/src/lib.rs | 9 +++--- 18 files changed, 104 insertions(+), 33 deletions(-) create mode 100644 libs/pageserver_api/Cargo.toml create mode 100644 libs/pageserver_api/src/lib.rs rename {pageserver/src/http => libs/pageserver_api/src}/models.rs (90%) create mode 100644 libs/safekeeper_api/Cargo.toml create mode 100644 libs/safekeeper_api/src/lib.rs rename {safekeeper/src/http => libs/safekeeper_api/src}/models.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 69a8fa19ab..ab508c7109 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -542,11 +542,11 @@ dependencies = [ "git-version", "nix", "once_cell", - "pageserver", + "pageserver_api", "postgres", "regex", "reqwest", - "safekeeper", + "safekeeper_api", "serde", "serde_with", "tar", @@ -2044,6 +2044,7 @@ dependencies = [ "nix", "num-traits", "once_cell", + "pageserver_api", "postgres", "postgres-protocol", "postgres-types", @@ -2072,6 +2073,17 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_api" +version = "0.1.0" +dependencies = [ + "const_format", + "serde", + "serde_with", + "utils", + "workspace_hack", +] + [[package]] name = "parking_lot" version = "0.11.2" @@ -2960,6 +2972,7 @@ dependencies = [ "postgres_ffi", "regex", "remote_storage", + "safekeeper_api", "serde", "serde_json", "serde_with", @@ -2975,6 +2988,17 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "safekeeper_api" +version = "0.1.0" +dependencies = [ + "const_format", + "serde", + "serde_with", + "utils", + "workspace_hack", +] + [[package]] name = "same-file" version = "1.0.6" diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index ab9df8534c..ee8481e141 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -19,7 +19,9 @@ thiserror = "1" nix = "0.23" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } -pageserver = { path = "../pageserver" } -safekeeper = { path = "../safekeeper" } +# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api +# instead, so that recompile times are better. 
+pageserver_api = { path = "../libs/pageserver_api" } +safekeeper_api = { path = "../libs/safekeeper_api" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 93947d5326..0c26842b34 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -12,12 +12,12 @@ use control_plane::local_env::{EtcdBroker, LocalEnv}; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage::PageServerNode; use control_plane::{etcd, local_env}; -use pageserver::config::defaults::{ +use pageserver_api::models::TimelineInfo; +use pageserver_api::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, }; -use pageserver::http::models::TimelineInfo; -use safekeeper::defaults::{ +use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 600a9ffe05..34b2f3000a 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -12,7 +12,7 @@ use nix::unistd::Pid; use postgres::Config; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; -use safekeeper::http::models::TimelineCreateRequest; +use safekeeper_api::models::TimelineCreateRequest; use thiserror::Error; use utils::{ connstring::connection_address, diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index bfbd6e91c3..59cb3d7efb 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -11,7 +11,7 @@ use anyhow::{bail, Context}; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; -use pageserver::http::models::{ +use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo, }; use postgres::{Config, NoTls}; diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml new file mode 100644 index 0000000000..be8762100c --- /dev/null +++ b/libs/pageserver_api/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "pageserver_api" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +serde_with = "1.12.0" +const_format = "0.2.21" + +utils = { path = "../utils" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs new file mode 100644 index 0000000000..a36c1692a9 --- /dev/null +++ b/libs/pageserver_api/src/lib.rs @@ -0,0 +1,9 @@ +use const_format::formatcp; + +/// Public API types +pub mod models; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); diff --git a/pageserver/src/http/models.rs b/libs/pageserver_api/src/models.rs similarity index 90% rename from pageserver/src/http/models.rs rename to libs/pageserver_api/src/models.rs index d5559653b2..43059ead84 100644 --- a/pageserver/src/http/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -7,7 +7,17 @@ use utils::{ lsn::Lsn, }; -use crate::tenant::TenantState; +/// A state of a tenant in pageserver's memory. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum TenantState { + /// Tenant is fully operational, its background jobs might be running or not. + Active { background_jobs_running: bool }, + /// A tenant is recognized by pageserver, but not yet ready to operate: + /// e.g. not present locally and being downloaded or being read into memory from the file system. + Paused, + /// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated. + Broken, +} #[serde_as] #[derive(Serialize, Deserialize)] diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml new file mode 100644 index 0000000000..852d643f30 --- /dev/null +++ b/libs/safekeeper_api/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "safekeeper_api" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +serde_with = "1.12.0" +const_format = "0.2.21" + +utils = { path = "../utils" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs new file mode 100644 index 0000000000..0a391478da --- /dev/null +++ b/libs/safekeeper_api/src/lib.rs @@ -0,0 +1,10 @@ +use const_format::formatcp; + +/// Public API types +pub mod models; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); + +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); diff --git a/safekeeper/src/http/models.rs b/libs/safekeeper_api/src/models.rs similarity index 100% rename from safekeeper/src/http/models.rs rename to libs/safekeeper_api/src/models.rs diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 1ec7ec4f98..88430f3a86 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -58,6 +58,7 @@ rstar = "0.9.3" num-traits = "0.2.15" amplify_num = "0.4.1" +pageserver_api = { path = "../libs/pageserver_api" } postgres_ffi = { path = "../libs/postgres_ffi" } etcd_broker = { path = "../libs/etcd_broker" } metrics = { path = "../libs/metrics" } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index a52a3e8262..6e3c7baad8 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -30,10 +30,10 @@ pub mod defaults { use crate::tenant_config::defaults::*; use const_format::formatcp; - pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; - pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); - pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; - pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); + pub use pageserver_api::{ + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PG_LISTEN_PORT, + }; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; diff --git a/pageserver/src/http/mod.rs b/pageserver/src/http/mod.rs index 4c0be17ecd..1c083bd382 100644 --- a/pageserver/src/http/mod.rs +++ b/pageserver/src/http/mod.rs @@ -1,3 +1,4 @@ -pub mod models; pub mod routes; pub use routes::make_router; + +pub use pageserver_api::models; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 672ee3a488..c2fb9ef242 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -45,6 +45,7 @@ use crate::tenant_config::TenantConfOpt; use 
crate::virtual_file::VirtualFile; use crate::walredo::WalRedoManager; use crate::{CheckpointConfig, TEMP_FILE_SUFFIX}; +pub use pageserver_api::models::TenantState; use toml_edit; use utils::{ @@ -118,18 +119,6 @@ pub struct Tenant { upload_layers: bool, } -/// A state of a tenant in pageserver's memory. -#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -pub enum TenantState { - /// Tenant is fully operational, its background jobs might be running or not. - Active { background_jobs_running: bool }, - /// A tenant is recognized by pageserver, but not yet ready to operate: - /// e.g. not present locally and being downloaded or being read into memory from the file system. - Paused, - /// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated. - Broken, -} - /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. impl Tenant { diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 87ee63d1df..cb1cecade9 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -33,6 +33,7 @@ toml_edit = { version = "0.13", features = ["easy"] } thiserror = "1" parking_lot = "0.12.1" +safekeeper_api = { path = "../libs/safekeeper_api" } postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } utils = { path = "../libs/utils" } diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 4c0be17ecd..1831470007 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,3 +1,4 @@ -pub mod models; pub mod routes; pub use routes::make_router; + +pub use safekeeper_api::models; diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 58a237a5d3..e38a5a4633 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -27,14 +27,13 @@ mod timelines_global_map; pub use timelines_global_map::GlobalTimelines; pub mod defaults { - use const_format::formatcp; use std::time::Duration; - pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; - pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); + pub use safekeeper_api::{ + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PG_LISTEN_PORT, + }; - pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676; - pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(10); pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8; } From b99bed510d742babc061097a528f2dc09284c681 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 5 Oct 2022 16:14:09 +0300 Subject: [PATCH 0868/1022] Move proxies to neon-proxy namespace (#2555) --- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 22042489a8..4f2f8f0833 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -768,5 +768,5 @@ jobs: - name: Re-deploy proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ 
matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s From f25dd75be9539e44b4d3d8c5864f73cea910f897 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 6 Oct 2022 01:07:02 +0300 Subject: [PATCH 0869/1022] Fix deadlock in safekeeper metrics (#2566) We had a problem where almost all of the threads were waiting on a futex syscall. More specifically: - `/metrics` handler was inside `TimelineCollector::collect()`, waiting on a mutex for a single Timeline - This exact timeline was inside `control_file::FileStorage::persist()`, waiting on a mutex for Lazy initialization of `PERSIST_CONTROL_FILE_SECONDS` - `PERSIST_CONTROL_FILE_SECONDS: Lazy` was blocked on `prometheus::register` - `prometheus::register` calls `DEFAULT_REGISTRY.write().register()` to take a write lock on Registry and add a new metric - `DEFAULT_REGISTRY` lock was already taken inside `DEFAULT_REGISTRY.gather()`, which was called by `/metrics` handler to collect all metrics This commit creates another Registry with a separate lock, to avoid deadlock in a case where `TimelineCollector` triggers registration of new metrics inside default registry. --- libs/metrics/src/lib.rs | 19 +++++++++++++++++-- libs/utils/src/http/endpoint.rs | 9 ++++++++- safekeeper/src/bin/safekeeper.rs | 3 +-- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 920d3fd17e..e290828d37 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -3,7 +3,7 @@ //! Otherwise, we might not see all metrics registered via //! a default registry. use once_cell::sync::Lazy; -use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec}; +use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec}; pub use prometheus::opts; pub use prometheus::register; pub use prometheus::{core, default_registry, proto}; @@ -17,6 +17,7 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec}; pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; +use prometheus::{Registry, Result}; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; @@ -32,13 +33,27 @@ macro_rules! register_uint_gauge_vec { }}; } +/// Special internal registry, to collect metrics independently from the default registry. +/// Was introduced to fix deadlock with lazy registration of metrics in the default registry. +static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); + +/// Register a collector in the internal registry. MUST be called before the first call to `gather()`. +/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector +/// while holding the lock. +pub fn register_internal(c: Box) -> Result<()> { + INTERNAL_REGISTRY.register(c) +} + /// Gathers all Prometheus metrics and records the I/O stats just before that. /// /// Metrics gathering is a relatively simple and standalone operation, so /// it might be fine to do it this way to keep things simple. 
pub fn gather() -> Vec { update_rusage_metrics(); - prometheus::gather() + let mut mfs = prometheus::gather(); + let mut internal_mfs = INTERNAL_REGISTRY.gather(); + mfs.append(&mut internal_mfs); + mfs } static DISK_IO_BYTES: Lazy = Lazy::new(|| { diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 4066791e2b..7a519929cf 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -9,6 +9,7 @@ use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::RequestInfo; use routerify::{Middleware, Router, RouterBuilder, RouterService}; +use tokio::task::JoinError; use tracing::info; use std::future::Future; @@ -35,7 +36,13 @@ async fn prometheus_metrics_handler(_req: Request) -> Result, init: bo // Register metrics collector for active timelines. It's important to do this // after daemonizing, otherwise process collector will be upset. - let registry = metrics::default_registry(); let timeline_collector = safekeeper::metrics::TimelineCollector::new(); - registry.register(Box::new(timeline_collector))?; + metrics::register_internal(Box::new(timeline_collector))?; let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; From ff8c481777ecb82c6553a9235f79199154a5a8b3 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 6 Oct 2022 09:01:56 +0300 Subject: [PATCH 0870/1022] Normalize last_record LSN in wal receiver (#2529) * Add test for branching on page boundary * Normalize start recovery point Co-authored-by: Heikki Linnakangas Co-authored-by: Thang Pham --- .../src/walreceiver/walreceiver_connection.rs | 10 +++++ test_runner/regress/test_branching.py | 38 +++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index ef5baeb570..a4a6af455c 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -12,6 +12,8 @@ use chrono::{NaiveDateTime, Utc}; use fail::fail_point; use futures::StreamExt; use postgres::{SimpleQueryMessage, SimpleQueryRow}; +use postgres_ffi::v14::xlog_utils::normalize_lsn; +use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; use tokio::{pin, select, sync::watch, time}; @@ -156,6 +158,14 @@ pub async fn handle_walreceiver_connection( // There might be some padding after the last full record, skip it. startpoint += startpoint.calc_padding(8u32); + // If the starting point is at a WAL page boundary, skip past the page header. We don't need the page headers + // for anything, and in some corner cases, the compute node might have never generated the WAL for page headers + //. That happens if you create a branch at page boundary: the start point of the branch is at the page boundary, + // but when the compute node first starts on the branch, we normalize the first REDO position to just after the page + // header (see generate_pg_control()), so the WAL for the page header is never streamed from the compute node + // to the safekeepers. 
+ startpoint = normalize_lsn(startpoint, WAL_SEGMENT_SIZE); + info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..."); let query = format!("START_REPLICATION PHYSICAL {startpoint}"); diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 0c1490294d..3b78700e9f 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -6,6 +6,8 @@ from typing import List import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres +from fixtures.types import Lsn +from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix @@ -88,3 +90,39 @@ def test_branching_with_pgbench( for pg in pgs: res = pg.safe_psql("SELECT count(*) from pgbench_accounts") assert res[0] == (100000 * scale,) + + +# Test branching from an "unnormalized" LSN. +# +# Context: +# When doing basebackup for a newly created branch, pageserver generates +# 'pg_control' file to bootstrap WAL segment by specifying the redo position +# a "normalized" LSN based on the timeline's starting LSN: +# +# checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0; +# +# This test checks if the pageserver is able to handle a "unnormalized" starting LSN. +# +# Related: see discussion in https://github.com/neondatabase/neon/pull/2143#issuecomment-1209092186 +def test_branching_unnormalized_start_lsn(neon_simple_env: NeonEnv, pg_bin: PgBin): + XLOG_BLCKSZ = 8192 + + env = neon_simple_env + + env.neon_cli.create_branch("b0") + pg0 = env.postgres.create_start("b0") + + pg_bin.run_capture(["pgbench", "-i", pg0.connstr()]) + + with pg0.cursor() as cur: + curr_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # Specify the `start_lsn` as a number that is divided by `XLOG_BLCKSZ` + # and is smaller than `curr_lsn`. + start_lsn = Lsn((int(curr_lsn) - XLOG_BLCKSZ) // XLOG_BLCKSZ * XLOG_BLCKSZ) + + log.info(f"Branching b1 from b0 starting at lsn {start_lsn}...") + env.neon_cli.create_branch("b1", "b0", ancestor_start_lsn=start_lsn) + pg1 = env.postgres.create_start("b1") + + pg_bin.run_capture(["pgbench", "-i", pg1.connstr()]) From c5a428a61a7d60b7f75b062a18b9257d2fe6896d Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 4 Oct 2022 21:27:18 +0300 Subject: [PATCH 0871/1022] Update Dockerfile.compute-node-v15 to match v14 version. 
Fix build script to promote the image for v15 to neon dockerhub --- .github/workflows/build_and_test.yml | 4 +-- Dockerfile.compute-node-v15 | 51 +++++++++++++++++++++++----- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4f2f8f0833..72018a12a8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -564,7 +564,7 @@ jobs: promote-images: runs-on: dev - needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-tools-image ] + needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ] if: github.event_name != 'workflow_dispatch' container: amazon/aws-cli strategy: @@ -573,7 +573,7 @@ jobs: # compute-node uses postgres 14, which is default now # cloud repo depends on this image name, thus duplicating it # remove compute-node when cloud repo is updated - name: [ neon, compute-node, compute-node-v14, compute-tools ] + name: [ neon, compute-node, compute-node-v14, compute-node-v15, compute-tools ] steps: - name: Promote image to latest diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index f949ef7680..7e33a0d7c8 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -13,9 +13,12 @@ ARG TAG=pinned # Layer "build-deps" # FROM debian:bullseye-slim AS build-deps +RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ + echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + apt update RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev + libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev # # Layer "pg-build" @@ -42,7 +45,7 @@ RUN cd postgres && \ FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget + apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ tar xvzf postgis-3.3.0.tar.gz && \ @@ -64,15 +67,13 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ # Build plv8 # FROM build-deps AS plv8-build -COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 # https://github.com/plv8/plv8/issues/475 # Debian bullseye provides binutils 2.35 when >= 2.38 is necessary -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ - apt update && \ +RUN apt update && \ apt install -y --no-install-recommends -t testing binutils RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ @@ -84,12 +85,46 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +# +# Layer "h3-pg-build" +# Build h3_pg +# +FROM build-deps AS h3-pg-build +COPY --from=pg-build 
/usr/local/pgsql/ /usr/local/pgsql/ + +# packaged cmake is too old +RUN apt update && \ + apt install -y --no-install-recommends -t testing cmake + +RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ + tar xvzf h3.tgz && \ + cd h3-4.0.1 && \ + mkdir build && \ + cd build && \ + cmake .. -DCMAKE_BUILD_TYPE=Release && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/h3 make install && \ + cp -R /h3/usr / && \ + rm -rf build + +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ + tar xvzf h3-pg.tgz && \ + cd h3-pg-4.0.1 && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control + # # Layer "neon-pg-ext-build" # compile neon extensions # FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +# plv8 still sometimes crashes during the creation +# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /h3/usr / COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -137,8 +172,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ chmod 0750 /var/db/postgres/compute && \ echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig -# TODO: Check if we can make the extension setup more modular versus a linear build -# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc# COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl From 4a216c5f7f3735c34bae9810501a662559e666c8 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 5 Oct 2022 11:06:13 +0300 Subject: [PATCH 0872/1022] Use PostGIS 3.3.1 that is compatible with pg 15 --- Dockerfile.compute-node-v15 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index 7e33a0d7c8..bdb4330c4f 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -5,7 +5,7 @@ ARG TAG=pinned # apparently, ARGs don't get replaced in RUN commands in kaniko -# ARG POSTGIS_VERSION=3.3.0 +# ARG POSTGIS_VERSION=3.3.1 # ARG PLV8_VERSION=3.1.4 # ARG PG_VERSION=v15 @@ -47,9 +47,9 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ - tar xvzf postgis-3.3.0.tar.gz && \ - cd postgis-3.3.0 && \ +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ + tar xvzf postgis-3.3.1.tar.gz && \ + cd postgis-3.3.1 && \ ./autogen.sh && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ ./configure && \ From ed85d97f1754c8ce64958c5c73d02bf017a8f81c Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 5 Oct 2022 14:22:41 +0300 Subject: [PATCH 0873/1022] bump vendor/postgres-v15. 
Rebase it to Stamp 15rc2 --- vendor/postgres-v15 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 9383aaa9c2..ff18cec1ee 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 9383aaa9c2616fd81cfafb058fe0d692f5e43ac3 +Subproject commit ff18cec1ee9b80055accd9c76b040875329b11ed From 254cb7dc4f8968373a154c020d1c843559000551 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 5 Oct 2022 19:02:11 +0300 Subject: [PATCH 0874/1022] Update CI script to push compute-node-v15 to dockerhub --- .github/workflows/build_and_test.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 72018a12a8..6556fb6c9b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -608,6 +608,9 @@ jobs: - name: Pull compute node v14 image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest compute-node-v14 + - name: Pull compute node v15 image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest compute-node-v15 + - name: Pull rust image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust @@ -638,6 +641,9 @@ jobs: - name: Push compute node v14 image to Docker Hub run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} + - name: Push compute node v15 image to Docker Hub + run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} + - name: Push rust image to Docker Hub run: crane push rust neondatabase/rust:pinned @@ -650,6 +656,7 @@ jobs: crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest calculate-deploy-targets: runs-on: [ self-hosted, Linux, k8s-runner ] From e8b195acb7bccb564c014dcbdc887ebeb52f1a51 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Oct 2022 11:13:40 +0300 Subject: [PATCH 0875/1022] fix: apply notify workaround on m1 mac docker (#2564) workaround as discussed in the notify repository. --- compute_tools/src/pg_helpers.rs | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 769dbfac73..ad7ea0abc8 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -250,9 +250,36 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { // case we miss some events for some reason. Not strictly necessary, but // better safe than sorry. 
let (tx, rx) = std::sync::mpsc::channel(); - let mut watcher = notify::recommended_watcher(move |res| { + let (mut watcher, rx): (Box, _) = match notify::recommended_watcher(move |res| { let _ = tx.send(res); - })?; + }) { + Ok(watcher) => (Box::new(watcher), rx), + Err(e) => { + match e.kind { + notify::ErrorKind::Io(os) if os.raw_os_error() == Some(38) => { + // docker on m1 macs does not support recommended_watcher + // but return "Function not implemented (os error 38)" + // see https://github.com/notify-rs/notify/issues/423 + let (tx, rx) = std::sync::mpsc::channel(); + + // let's poll it faster than what we check the results for (100ms) + let config = + notify::Config::default().with_poll_interval(Duration::from_millis(50)); + + let watcher = notify::PollWatcher::new( + move |res| { + let _ = tx.send(res); + }, + config, + )?; + + (Box::new(watcher), rx) + } + _ => return Err(e.into()), + } + } + }; + watcher.watch(pgdata, RecursiveMode::NonRecursive)?; let started_at = Instant::now(); From 47bae68a2eb889375b332b726e31597d2a06f0a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s?= Date: Thu, 6 Oct 2022 11:42:50 +0200 Subject: [PATCH 0876/1022] Make get_lsn_by_timestamp available in mgmt API (#2536) (#2560) Co-authored-by: andres --- pageserver/src/http/openapi_spec.yml | 56 ++++++++++++++++++++++++ pageserver/src/http/routes.rs | 49 +++++++++++++++++++++ pageserver/src/page_service.rs | 30 ------------- test_runner/fixtures/neon_fixtures.py | 13 ++++++ test_runner/regress/test_lsn_mapping.py | 57 +++++++++++-------------- 5 files changed, 144 insertions(+), 61 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 4e748207c8..97fdcd7bbd 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -207,6 +207,62 @@ paths: schema: $ref: "#/components/schemas/Error" + + /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + get: + description: Get LSN by a timestamp + parameters: + - name: timestamp + in: query + required: true + schema: + type: string + format: date-time + description: A timestamp to get the LSN + responses: + "200": + description: OK + content: + application/json: + schema: + type: string + "400": + description: Error when no tenant id found in path, no timeline id or invalid timestamp + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/attach: parameters: - name: tenant_id diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a1bd65c308..e743f27aff 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -12,6 +12,7 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, }; +use crate::pgdatadir_mapping::LsnForTimestamp; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use 
crate::tenant::{TenantState, Timeline}; @@ -265,6 +266,23 @@ fn query_param_present(request: &Request, param: &str) -> bool { .unwrap_or(false) } +fn get_query_param(request: &Request, param_name: &str) -> Result { + request.uri().query().map_or( + Err(ApiError::BadRequest(anyhow!("empty query in request"))), + |v| { + url::form_urlencoded::parse(v.as_bytes()) + .into_owned() + .find(|(k, _)| k == param_name) + .map_or( + Err(ApiError::BadRequest(anyhow!( + "no {param_name} specified in query parameters" + ))), + |(_, v)| Ok(v), + ) + }, + ) +} + async fn timeline_detail_handler(request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -329,6 +347,33 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let timestamp_raw = get_query_param(&request, "timestamp")?; + let timestamp = humantime::parse_rfc3339(timestamp_raw.as_str()) + .with_context(|| format!("Invalid time: {:?}", timestamp_raw)) + .map_err(ApiError::BadRequest)?; + let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); + + let timeline = tenant_mgr::get_tenant(tenant_id, true) + .and_then(|tenant| tenant.get_timeline(timeline_id)) + .with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}")) + .map_err(ApiError::NotFound)?; + let result = match timeline + .find_lsn_for_timestamp(timestamp_pg) + .map_err(ApiError::InternalServerError)? + { + LsnForTimestamp::Present(lsn) => format!("{}", lsn), + LsnForTimestamp::Future(_lsn) => "future".into(), + LsnForTimestamp::Past(_lsn) => "past".into(), + LsnForTimestamp::NoData(_lsn) => "nodata".into(), + }; + json_response(StatusCode::OK, result) +} + // TODO makes sense to provide tenant config right away the same way as it handled in tenant_create async fn tenant_attach_handler(request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; @@ -908,6 +953,10 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler, ) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp", + get_lsn_by_timestamp_handler, + ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", testing_api!("run timeline GC", timeline_gc_handler), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 758faa4d9a..795a99058d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -12,7 +12,6 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use futures::{Stream, StreamExt}; -use regex::Regex; use std::io; use std::net::TcpListener; use std::str; @@ -35,7 +34,6 @@ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; -use crate::pgdatadir_mapping::LsnForTimestamp; use crate::profiling::profpoint_start; use crate::reltag::RelTag; use crate::task_mgr; @@ -45,7 +43,6 @@ use crate::tenant_mgr; use crate::CheckpointConfig; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; -use postgres_ffi::to_pg_timestamp; use postgres_ffi::BLCKSZ; // Wrapped in libpq 
CopyData @@ -1062,33 +1059,6 @@ impl postgres_backend_async::Handler for PageServerHandler { Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), ]))? .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("get_lsn_by_timestamp ") { - // Locate LSN of last transaction with timestamp less or equal than sppecified - // TODO lazy static - let re = Regex::new(r"^get_lsn_by_timestamp ([[:xdigit:]]+) ([[:xdigit:]]+) '(.*)'$") - .unwrap(); - let caps = re - .captures(query_string) - .with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?; - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; - let timestamp_pg = to_pg_timestamp(timestamp); - - self.check_permission(Some(tenant_id))?; - - let timeline = get_local_timeline(tenant_id, timeline_id)?; - pgb.write_message(&BeMessage::RowDescription(&[RowDescriptor::text_col( - b"lsn", - )]))?; - let result = match timeline.find_lsn_for_timestamp(timestamp_pg)? { - LsnForTimestamp::Present(lsn) => format!("{}", lsn), - LsnForTimestamp::Future(_lsn) => "future".into(), - LsnForTimestamp::Past(_lsn) => "past".into(), - LsnForTimestamp::NoData(_lsn) => "nodata".into(), - }; - pgb.write_message(&BeMessage::DataRow(&[Some(result.as_bytes())]))?; - pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { bail!("unknown command"); } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5c2c3edbd8..38d818b3d8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1136,6 +1136,19 @@ class NeonPageserverHttpClient(requests.Session): assert res_json is None return res_json + def timeline_get_lsn_by_timestamp( + self, tenant_id: TenantId, timeline_id: TimelineId, timestamp + ): + log.info( + f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" + ) + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}", + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index ef99954a76..c5a49a6704 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -15,7 +15,6 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): pgmain = env.postgres.create_start("test_lsn_mapping") log.info("postgres is running on 'test_lsn_mapping' branch") - ps_cur = env.pageserver.connect().cursor() cur = pgmain.connect().cursor() # Create table, and insert rows, each in a separate transaction # Disable synchronous_commit to make this initialization go faster. 
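Outside the Python fixture above, the new `get_lsn_by_timestamp` endpoint is plain HTTP and can be called from any client. A hedged sketch in Rust, assuming the `reqwest` crate (with its blocking and json features); the host, port, tenant id and timeline id are placeholders:

```rust
// Hypothetical client call against the pageserver mgmt API; the address and
// ids below are placeholders, and `reqwest` is an assumed dependency.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let url = format!(
        "http://127.0.0.1:9898/v1/tenant/{tenant}/timeline/{timeline}/get_lsn_by_timestamp?timestamp={ts}",
        tenant = "de200bd42b49cc1814412c7e592dd6e9",
        timeline = "df4f9a4f5a1247b59f5ff7ea232ccf16",
        ts = "2022-10-06T12:00:00.000Z",
    );
    // Per the handler above, the body is a JSON-encoded string: an LSN on
    // success, or "future", "past", "nodata" for out-of-range timestamps.
    let lsn: String = reqwest::blocking::get(url)?.json()?;
    println!("lsn for timestamp: {lsn}");
    Ok(())
}
```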
@@ -38,37 +37,33 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Wait until WAL is received by pageserver wait_for_last_flush_lsn(env, pgmain, env.initial_tenant, new_timeline_id) - # Check edge cases: timestamp in the future - probe_timestamp = tbl[-1][1] + timedelta(hours=1) - result = query_scalar( - ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant} {new_timeline_id} '{probe_timestamp.isoformat()}Z'", - ) - assert result == "future" - - # timestamp too the far history - probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = query_scalar( - ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant} {new_timeline_id} '{probe_timestamp.isoformat()}Z'", - ) - assert result == "past" - - # Probe a bunch of timestamps in the valid range - for i in range(1, len(tbl), 100): - probe_timestamp = tbl[i][1] - - # Call get_lsn_by_timestamp to get the LSN - lsn = query_scalar( - ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant} {new_timeline_id} '{probe_timestamp.isoformat()}Z'", + with env.pageserver.http_client() as client: + # Check edge cases: timestamp in the future + probe_timestamp = tbl[-1][1] + timedelta(hours=1) + result = client.timeline_get_lsn_by_timestamp( + env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z" ) + assert result == "future" - # Launch a new read-only node at that LSN, and check that only the rows - # that were supposed to be committed at that point in time are visible. - pg_here = env.postgres.create_start( - branch_name="test_lsn_mapping", node_name="test_lsn_mapping_read", lsn=lsn + # timestamp too the far history + probe_timestamp = tbl[0][1] - timedelta(hours=10) + result = client.timeline_get_lsn_by_timestamp( + env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z" ) - assert pg_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i + assert result == "past" - pg_here.stop_and_destroy() + # Probe a bunch of timestamps in the valid range + for i in range(1, len(tbl), 100): + probe_timestamp = tbl[i][1] + lsn = client.timeline_get_lsn_by_timestamp( + env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z" + ) + # Call get_lsn_by_timestamp to get the LSN + # Launch a new read-only node at that LSN, and check that only the rows + # that were supposed to be committed at that point in time are visible. + pg_here = env.postgres.create_start( + branch_name="test_lsn_mapping", node_name="test_lsn_mapping_read", lsn=lsn + ) + assert pg_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i + + pg_here.stop_and_destroy() From 687ba81366491c84250e4df9c20e31e595b34476 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 6 Oct 2022 16:53:52 +0300 Subject: [PATCH 0877/1022] Display sync safekeepers output in compute_ctl (#2571) Pipe postgres output to compute_ctl stdout and create a test to check that compute_ctl works and prints postgres logs. 
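The `compute.rs` hunk below amounts to piping only stdout and no longer overriding stderr, so the child's log lines flow straight into compute_ctl's own output. A minimal std-only sketch of that spawning pattern; the binary name and arguments are placeholders:

```rust
// Capture stdout (it carries the reported LSN) but leave stderr inherited,
// so the child's logs show up directly in the parent's output.
use std::process::{Command, Stdio};

fn main() -> std::io::Result<()> {
    let child = Command::new("postgres")
        .args(["--sync-safekeepers"])
        .stdout(Stdio::piped()) // read the result from here
        // note: no .stderr(...) override -- stderr is inherited by default
        .spawn()?;

    let output = child.wait_with_output()?;
    println!(
        "exited with {}, stdout: {}",
        output.status,
        String::from_utf8_lossy(&output.stdout)
    );
    Ok(())
}
```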
--- compute_tools/src/compute.rs | 7 +- test_runner/fixtures/neon_fixtures.py | 10 ++ test_runner/regress/test_compute_ctl.py | 203 ++++++++++++++++++++++++ 3 files changed, 216 insertions(+), 4 deletions(-) create mode 100644 test_runner/regress/test_compute_ctl.py diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1e848627e3..bfdd2340ec 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -178,7 +178,6 @@ impl ComputeNode { .args(&["--sync-safekeepers"]) .env("PGDATA", &self.pgdata) // we cannot use -D in this mode .stdout(Stdio::piped()) - .stderr(Stdio::piped()) .spawn() .expect("postgres --sync-safekeepers failed to start"); @@ -191,10 +190,10 @@ impl ComputeNode { if !sync_output.status.success() { anyhow::bail!( - "postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}, stderr: {}", + "postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}", sync_output.status, - String::from_utf8(sync_output.stdout).expect("postgres --sync-safekeepers exited, and stdout is not utf-8"), - String::from_utf8(sync_output.stderr).expect("postgres --sync-safekeepers exited, and stderr is not utf-8"), + String::from_utf8(sync_output.stdout) + .expect("postgres --sync-safekeepers exited, and stdout is not utf-8"), ); } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 38d818b3d8..28c65223ba 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1200,6 +1200,7 @@ class AbstractNeonCli(abc.ABC): arguments: List[str], extra_env_vars: Optional[Dict[str, str]] = None, check_return_code=True, + timeout=None, ) -> "subprocess.CompletedProcess[str]": """ Run the command with the specified arguments. @@ -1246,6 +1247,7 @@ class AbstractNeonCli(abc.ABC): universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + timeout=timeout, ) if not res.returncode: log.info(f"Run success: {res.stdout}") @@ -1619,6 +1621,14 @@ class WalCraft(AbstractNeonCli): res.check_returncode() +class ComputeCtl(AbstractNeonCli): + """ + A typed wrapper around the `compute_ctl` CLI tool. + """ + + COMMAND = "compute_ctl" + + class NeonPageserver(PgProtocol): """ An object representing a running pageserver. diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py new file mode 100644 index 0000000000..01b64b8b17 --- /dev/null +++ b/test_runner/regress/test_compute_ctl.py @@ -0,0 +1,203 @@ +import os +from subprocess import TimeoutExpired + +from fixtures.log_helper import log +from fixtures.neon_fixtures import ComputeCtl, NeonEnvBuilder, PgBin + + +# Test that compute_ctl works and prints "--sync-safekeepers" logs. 
+def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + ctl = ComputeCtl(env) + + env.neon_cli.create_branch("test_compute_ctl", "main") + pg = env.postgres.create_start("test_compute_ctl") + pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + + with open(pg.config_file_path(), "r") as f: + cfg_lines = f.readlines() + cfg_map = {} + for line in cfg_lines: + if "=" in line: + k, v = line.split("=") + cfg_map[k] = v.strip("\n '\"") + log.info(f"postgres config: {cfg_map}") + pgdata = pg.pg_data_dir_path() + pg_bin_path = os.path.join(pg_bin.pg_bin_path, "postgres") + + pg.stop_and_destroy() + + spec = ( + """ +{ + "format_version": 1.0, + + "timestamp": "2021-05-23T18:25:43.511Z", + "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", + + "cluster": { + "cluster_id": "test-cluster-42", + "name": "Neon Test", + "state": "restarted", + "roles": [ + ], + "databases": [ + ], + "settings": [ + { + "name": "fsync", + "value": "off", + "vartype": "bool" + }, + { + "name": "wal_level", + "value": "replica", + "vartype": "enum" + }, + { + "name": "hot_standby", + "value": "on", + "vartype": "bool" + }, + { + "name": "neon.safekeepers", + "value": """ + + f'"{cfg_map["neon.safekeepers"]}"' + + """, + "vartype": "string" + }, + { + "name": "wal_log_hints", + "value": "on", + "vartype": "bool" + }, + { + "name": "log_connections", + "value": "on", + "vartype": "bool" + }, + { + "name": "shared_buffers", + "value": "32768", + "vartype": "integer" + }, + { + "name": "port", + "value": """ + + f'"{cfg_map["port"]}"' + + """, + "vartype": "integer" + }, + { + "name": "max_connections", + "value": "100", + "vartype": "integer" + }, + { + "name": "max_wal_senders", + "value": "10", + "vartype": "integer" + }, + { + "name": "listen_addresses", + "value": "0.0.0.0", + "vartype": "string" + }, + { + "name": "wal_sender_timeout", + "value": "0", + "vartype": "integer" + }, + { + "name": "password_encryption", + "value": "md5", + "vartype": "enum" + }, + { + "name": "maintenance_work_mem", + "value": "65536", + "vartype": "integer" + }, + { + "name": "max_parallel_workers", + "value": "8", + "vartype": "integer" + }, + { + "name": "max_worker_processes", + "value": "8", + "vartype": "integer" + }, + { + "name": "neon.tenant_id", + "value": """ + + f'"{cfg_map["neon.tenant_id"]}"' + + """, + "vartype": "string" + }, + { + "name": "max_replication_slots", + "value": "10", + "vartype": "integer" + }, + { + "name": "neon.timeline_id", + "value": """ + + f'"{cfg_map["neon.timeline_id"]}"' + + """, + "vartype": "string" + }, + { + "name": "shared_preload_libraries", + "value": "neon", + "vartype": "string" + }, + { + "name": "synchronous_standby_names", + "value": "walproposer", + "vartype": "string" + }, + { + "name": "neon.pageserver_connstring", + "value": """ + + f'"{cfg_map["neon.pageserver_connstring"]}"' + + """, + "vartype": "string" + } + ] + }, + "delta_operations": [ + ] +} +""" + ) + + ps_connstr = cfg_map["neon.pageserver_connstring"] + log.info(f"ps_connstr: {ps_connstr}, pgdata: {pgdata}") + + # run compute_ctl and wait for 10s + try: + ctl.raw_cli( + ["--connstr", ps_connstr, "--pgdata", pgdata, "--spec", spec, "--pgbin", pg_bin_path], + timeout=10, + ) + except TimeoutExpired as exc: + ctl_logs = exc.stderr.decode("utf-8") + log.info("compute_ctl output:\n" + ctl_logs) + + start = "starting safekeepers syncing" + end = "safekeepers synced at LSN" + start_pos = ctl_logs.index(start) 
+ assert start_pos != -1 + end_pos = ctl_logs.index(end, start_pos) + assert end_pos != -1 + sync_safekeepers_logs = ctl_logs[start_pos : end_pos + len(end)] + log.info("sync_safekeepers_logs:\n" + sync_safekeepers_logs) + + # assert that --sync-safekeepers logs are present in the output + assert "connecting with node" in sync_safekeepers_logs + assert "connected with node" in sync_safekeepers_logs + assert "proposer connected to quorum (2)" in sync_safekeepers_logs + assert "got votes from majority (2)" in sync_safekeepers_logs + assert "sending elected msg to node" in sync_safekeepers_logs From 9e1eb69d5543aef874152c6af32889e3b806850a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 6 Oct 2022 18:42:05 +0300 Subject: [PATCH 0878/1022] Increase default compaction_period setting to 20 s. The previous default of 1 s caused excessive CPU usage when there were a lot of projects. Polling every timeline once a second was too aggressive so let's reduce it. Fixes https://github.com/neondatabase/neon/issues/2542, but we probably also want do to something so that we don't poll timelines that have received no new WAL or layers since last check. --- pageserver/src/tenant_config.rs | 2 +- test_runner/regress/test_tenant_conf.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 4c5d5cc3f3..dc1b9353a6 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -24,7 +24,7 @@ pub mod defaults { // This parameter determines L1 layer file size. pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; - pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s"; + pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index c6cf416d12..46a945a58b 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -54,7 +54,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" for i in { "checkpoint_distance": 10000, "compaction_target_size": 1048576, - "compaction_period": 1, + "compaction_period": 20, "compaction_threshold": 10, "gc_horizon": 67108864, "gc_period": 100, @@ -74,7 +74,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" for i in { "checkpoint_distance": 20000, "compaction_target_size": 1048576, - "compaction_period": 1, + "compaction_period": 20, "compaction_threshold": 10, "gc_horizon": 67108864, "gc_period": 30, @@ -102,7 +102,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" for i in { "checkpoint_distance": 15000, "compaction_target_size": 1048576, - "compaction_period": 1, + "compaction_period": 20, "compaction_threshold": 10, "gc_horizon": 67108864, "gc_period": 80, @@ -125,7 +125,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" for i in { "checkpoint_distance": 15000, "compaction_target_size": 1048576, - "compaction_period": 1, + "compaction_period": 20, "compaction_threshold": 10, "gc_horizon": 67108864, "gc_period": 80, From 8e51c27e1ad522aab5c7ab64208fdaf7bb0d181d Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 7 Oct 2022 13:58:31 +0300 Subject: [PATCH 0879/1022] Restore artifact versions (#2578) Context: https://github.com/neondatabase/neon/pull/2128/files#r989489965 
Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6556fb6c9b..7cc8715526 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -494,7 +494,7 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build neon - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID compute-tools-image: runs-on: dev @@ -508,7 +508,7 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute tools - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID compute-node-image: runs-on: dev @@ -527,7 +527,7 @@ jobs: # cloud repo depends on this image name, thus duplicating it # remove compute-node when cloud repo is updated - name: Kaniko build compute node with extensions v14 (compatibility) - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID compute-node-image-v14: runs-on: dev @@ -543,7 +543,7 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute node with extensions v14 - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID compute-node-image-v15: @@ -560,7 +560,7 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute node with extensions v15 - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID promote-images: runs-on: dev From e516c376d66aa8d5363a35f32a4b0267c454ad4b Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 7 Oct 2022 14:34:57 +0300 Subject: [PATCH 0880/1022] [proxy] Improve logging (#2554) * [proxy] Use `tracing::*` instead of `println!` for logging * Fix a minor misnomer * Log more stuff --- Cargo.lock | 3 ++ libs/utils/src/pq_proto.rs | 14 ++++++++ proxy/Cargo.toml | 5 ++- proxy/src/auth/backend.rs | 23 ++++++++++++-- proxy/src/auth/backend/console.rs | 18 ++++++----- proxy/src/auth/backend/link.rs | 7 ++-- proxy/src/auth/credentials.rs | 8 +++++ proxy/src/cancellation.rs | 11 +++++-- proxy/src/compute.rs | 12 +++++-- proxy/src/http/server.rs | 5 +-- proxy/src/main.rs | 20 ++++++++---- proxy/src/mgmt.rs | 11 ++++--- proxy/src/proxy.rs | 53 +++++++++++++++++++++---------- 13 files changed, 140 insertions(+), 50 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ab508c7109..8488fc4f9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2452,6 +2452,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "atty", "base64", "bstr", "bytes", @@ -2485,6 +2486,8 @@ dependencies = [ "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls", + "tracing", + "tracing-subscriber", "url", "utils", "uuid", diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index 21952ab87e..8c4e297f82 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -10,6 +10,7 @@ use serde::{Deserialize, Serialize}; use std::{ borrow::Cow, collections::HashMap, + fmt, future::Future, io::{self, Cursor}, str, @@ -124,6 +125,19 @@ pub struct CancelKeyData { pub cancel_key: i32, } +impl fmt::Display for CancelKeyData { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let hi = (self.backend_pid as u64) << 32; + let lo = self.cancel_key as u64; + let id = hi | lo; + + // This format is more compact and might work better for logs. 
+ f.debug_tuple("CancelKeyData") + .field(&format_args!("{:x}", id)) + .finish() + } +} + use rand::distributions::{Distribution, Standard}; impl Distribution for Standard { fn sample(&self, rng: &mut R) -> CancelKeyData { diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 7d0449cd1a..8049737989 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [dependencies] anyhow = "1.0" -async-trait = "0.1" +atty = "0.2.14" base64 = "0.13.0" bstr = "0.2.17" bytes = { version = "1.0.1", features = ['serde'] } @@ -35,6 +35,8 @@ thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-rustls = "0.23.0" +tracing = "0.1.36" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } url = "2.2.2" uuid = { version = "0.8.2", features = ["v4", "serde"]} x509-parser = "0.13.2" @@ -44,6 +46,7 @@ metrics = { path = "../libs/metrics" } workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] +async-trait = "0.1" rcgen = "0.8.14" rstest = "0.12" tokio-postgres-rustls = "0.9.0" diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 7e93a32950..bb919770c1 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -15,6 +15,7 @@ use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; use std::borrow::Cow; use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::{info, warn}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); @@ -171,6 +172,8 @@ impl BackendType<'_, ClientCredentials<'_>> { // support SNI or other means of passing the project name. // We now expect to see a very specific payload in the place of password. if creds.project().is_none() { + warn!("project name not specified, resorting to the password hack auth flow"); + let payload = AuthFlow::new(client) .begin(auth::PasswordHack) .await? @@ -179,6 +182,7 @@ impl BackendType<'_, ClientCredentials<'_>> { // Finally we may finish the initialization of `creds`. // TODO: add missing type safety to ClientCredentials. + info!(project = &payload.project, "received missing parameter"); creds.project = Some(payload.project.into()); let mut config = match &self { @@ -196,6 +200,7 @@ impl BackendType<'_, ClientCredentials<'_>> { // We should use a password from payload as well. config.password(payload.password); + info!("user successfully authenticated (using the password hack)"); return Ok(compute::NodeInfo { reported_auth_ok: false, config, @@ -203,19 +208,31 @@ impl BackendType<'_, ClientCredentials<'_>> { } } - match self { + let res = match self { Console(endpoint, creds) => { + info!( + user = creds.user, + project = creds.project(), + "performing authentication using the console" + ); console::Api::new(&endpoint, extra, &creds) .handle_user(client) .await } Postgres(endpoint, creds) => { + info!("performing mock authentication using a local postgres instance"); postgres::Api::new(&endpoint, &creds) .handle_user(client) .await } // NOTE: this auth backend doesn't use client credentials. 
- Link(url) => link::handle_user(&url, client).await, - } + Link(url) => { + info!("performing link authentication"); + link::handle_user(&url, client).await + } + }?; + + info!("user successfully authenticated"); + Ok(res) } } diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index a351b82c6a..7dbb173b88 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -12,6 +12,7 @@ use serde::{Deserialize, Serialize}; use std::future::Future; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::info; const REQUEST_FAILED: &str = "Console request failed"; @@ -148,10 +149,11 @@ impl<'a> Api<'a> { } async fn get_auth_info(&self) -> Result { + let request_id = uuid::Uuid::new_v4().to_string(); let req = self .endpoint .get("proxy_get_role_secret") - .header("X-Request-ID", uuid::Uuid::new_v4().to_string()) + .header("X-Request-ID", &request_id) .query(&[("session_id", self.extra.session_id)]) .query(&[ ("application_name", self.extra.application_name), @@ -160,9 +162,7 @@ impl<'a> Api<'a> { ]) .build()?; - // TODO: use a proper logger - println!("cplane request: {}", req.url()); - + info!(id = request_id, url = req.url().as_str(), "request"); let resp = self.endpoint.execute(req).await?; if !resp.status().is_success() { return Err(TransportError::HttpStatus(resp.status()).into()); @@ -177,10 +177,11 @@ impl<'a> Api<'a> { /// Wake up the compute node and return the corresponding connection info. pub(super) async fn wake_compute(&self) -> Result { + let request_id = uuid::Uuid::new_v4().to_string(); let req = self .endpoint .get("proxy_wake_compute") - .header("X-Request-ID", uuid::Uuid::new_v4().to_string()) + .header("X-Request-ID", &request_id) .query(&[("session_id", self.extra.session_id)]) .query(&[ ("application_name", self.extra.application_name), @@ -188,9 +189,7 @@ impl<'a> Api<'a> { ]) .build()?; - // TODO: use a proper logger - println!("cplane request: {}", req.url()); - + info!(id = request_id, url = req.url().as_str(), "request"); let resp = self.endpoint.execute(req).await?; if !resp.status().is_success() { return Err(TransportError::HttpStatus(resp.status()).into()); @@ -227,15 +226,18 @@ where GetAuthInfo: Future>, WakeCompute: Future>, { + info!("fetching user's authentication info"); let auth_info = get_auth_info(endpoint).await?; let flow = AuthFlow::new(client); let scram_keys = match auth_info { AuthInfo::Md5(_) => { // TODO: decide if we should support MD5 in api v2 + info!("auth endpoint chooses MD5"); return Err(auth::AuthError::bad_auth_method("MD5")); } AuthInfo::Scram(secret) => { + info!("auth endpoint chooses SCRAM"); let scram = auth::Scram(&secret); Some(compute::ScramKeys { client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index eefa246eba..863ed53645 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -1,6 +1,7 @@ use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::info; use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; #[derive(Debug, Error)] @@ -53,14 +54,16 @@ pub async fn handle_user( let greeting = hello_message(link_uri, &psql_session_id); let db_info = super::with_waiter(psql_session_id, |waiter| async { - // Give user a URL to spawn a new database + // Give user a URL to spawn a new database. 
+ info!("sending the auth URL to the user"); client .write_message_noflush(&Be::AuthenticationOk)? .write_message_noflush(&BeParameterStatusMessage::encoding())? .write_message(&Be::NoticeResponse(&greeting)) .await?; - // Wait for web console response (see `mgmt`) + // Wait for web console response (see `mgmt`). + info!("waiting for console's reply..."); waiter.await?.map_err(LinkAuthError::AuthFailed) }) .await?; diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index e43bcf8791..57128a61f5 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -3,6 +3,7 @@ use crate::error::UserFacingError; use std::borrow::Cow; use thiserror::Error; +use tracing::info; use utils::pq_proto::StartupMessageParams; #[derive(Debug, Error, PartialEq, Eq, Clone)] @@ -82,6 +83,13 @@ impl<'a> ClientCredentials<'a> { } .transpose()?; + info!( + user = user, + dbname = dbname, + project = project.as_deref(), + "credentials" + ); + Ok(Self { user, dbname, diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index eb9312e6bb..404533ad42 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -4,6 +4,7 @@ use parking_lot::Mutex; use std::net::SocketAddr; use tokio::net::TcpStream; use tokio_postgres::{CancelToken, NoTls}; +use tracing::info; use utils::pq_proto::CancelKeyData; /// Enables serving `CancelRequest`s. @@ -18,8 +19,9 @@ impl CancelMap { .lock() .get(&key) .and_then(|x| x.clone()) - .with_context(|| format!("unknown session: {:?}", key))?; + .with_context(|| format!("query cancellation key not found: {key}"))?; + info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query().await } @@ -41,14 +43,16 @@ impl CancelMap { self.0 .lock() .try_insert(key, None) - .map_err(|_| anyhow!("session already exists: {:?}", key))?; + .map_err(|_| anyhow!("query cancellation key already exists: {key}"))?; // This will guarantee that the session gets dropped // as soon as the future is finished. scopeguard::defer! { self.0.lock().remove(&key); + info!("dropped query cancellation key {key}"); } + info!("registered new query cancellation key {key}"); let session = Session::new(key, self); f(session).await } @@ -102,10 +106,13 @@ impl<'a> Session<'a> { fn new(key: CancelKeyData, cancel_map: &'a CancelMap) -> Self { Self { key, cancel_map } } +} +impl Session<'_> { /// Store the cancel token for the given session. /// This enables query cancellation in [`crate::proxy::handshake`]. pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData { + info!("enabling query cancellation for this session"); self.cancel_map .0 .lock() diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 4ae44ded57..8e4caf6eeb 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -5,6 +5,7 @@ use std::{io, net::SocketAddr}; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::NoTls; +use tracing::{error, info}; use utils::pq_proto::StartupMessageParams; #[derive(Debug, Error)] @@ -54,6 +55,7 @@ impl NodeInfo { use tokio_postgres::config::Host; let connect_once = |host, port| { + info!("trying to connect to a compute node at {host}:{port}"); TcpStream::connect((host, port)).and_then(|socket| async { let socket_addr = socket.peer_addr()?; // This prevents load balancer from severing the connection. 
@@ -72,7 +74,11 @@ impl NodeInfo { if ports.len() > 1 && ports.len() != hosts.len() { return Err(io::Error::new( io::ErrorKind::Other, - format!("couldn't connect: bad compute config, ports and hosts entries' count does not match: {:?}", self.config), + format!( + "couldn't connect: bad compute config, \ + ports and hosts entries' count does not match: {:?}", + self.config + ), )); } @@ -88,7 +94,7 @@ impl NodeInfo { Ok(socket) => return Ok(socket), Err(err) => { // We can't throw an error here, as there might be more hosts to try. - println!("failed to connect to compute `{host}:{port}`: {err}"); + error!("failed to connect to a compute node at {host}:{port}: {err}"); connection_error = Some(err); } } @@ -160,8 +166,8 @@ impl NodeInfo { .ok_or(ConnectionError::FailedToFetchPgVersion)? .into(); + info!("connected to user's compute node at {socket_addr}"); let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); - let db = PostgresConnection { stream, version }; Ok((db, cancel_closure)) diff --git a/proxy/src/http/server.rs b/proxy/src/http/server.rs index 5a75718742..05f6feb307 100644 --- a/proxy/src/http/server.rs +++ b/proxy/src/http/server.rs @@ -1,6 +1,7 @@ use anyhow::anyhow; use hyper::{Body, Request, Response, StatusCode}; use std::net::TcpListener; +use tracing::info; use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; async fn status_handler(_: Request) -> Result, ApiError> { @@ -12,9 +13,9 @@ fn make_router() -> RouterBuilder { router.get("/v1/status", status_handler) } -pub async fn thread_main(http_listener: TcpListener) -> anyhow::Result<()> { +pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<()> { scopeguard::defer! { - println!("http has shut down"); + info!("http has shut down"); } let service = || RouterService::new(make_router().build()?); diff --git a/proxy/src/main.rs b/proxy/src/main.rs index f2dc7425ba..2e6c365d32 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -25,6 +25,7 @@ use config::ProxyConfig; use futures::FutureExt; use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; +use tracing::info; use utils::project_git_version; project_git_version!(GIT_VERSION); @@ -38,6 +39,11 @@ async fn flatten_err( #[tokio::main] async fn main() -> anyhow::Result<()> { + tracing_subscriber::fmt() + .with_ansi(atty::is(atty::Stream::Stdout)) + .with_target(false) + .init(); + let arg_matches = clap::App::new("Neon proxy/router") .version(GIT_VERSION) .arg( @@ -140,22 +146,22 @@ async fn main() -> anyhow::Result<()> { auth_backend, })); - println!("Version: {GIT_VERSION}"); - println!("Authentication backend: {}", config.auth_backend); + info!("Version: {GIT_VERSION}"); + info!("Authentication backend: {}", config.auth_backend); // Check that we can bind to address before further initialization - println!("Starting http on {}", http_address); + info!("Starting http on {http_address}"); let http_listener = TcpListener::bind(http_address).await?.into_std()?; - println!("Starting mgmt on {}", mgmt_address); + info!("Starting mgmt on {mgmt_address}"); let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?; - println!("Starting proxy on {}", proxy_address); + info!("Starting proxy on {proxy_address}"); let proxy_listener = TcpListener::bind(proxy_address).await?; let tasks = [ - tokio::spawn(http::server::thread_main(http_listener)), - tokio::spawn(proxy::thread_main(config, proxy_listener)), + 
tokio::spawn(http::server::task_main(http_listener)), + tokio::spawn(proxy::task_main(config, proxy_listener)), tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)), ] .map(flatten_err); diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 8737d170b1..67693b1fb0 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -5,6 +5,7 @@ use std::{ net::{TcpListener, TcpStream}, thread, }; +use tracing::{error, info}; use utils::{ postgres_backend::{self, AuthType, PostgresBackend}, pq_proto::{BeMessage, SINGLE_COL_ROWDESC}, @@ -19,7 +20,7 @@ use utils::{ /// pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> { scopeguard::defer! { - println!("mgmt has shut down"); + info!("mgmt has shut down"); } listener @@ -27,14 +28,14 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> { .context("failed to set listener to blocking")?; loop { let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?; - println!("accepted connection from {}", peer_addr); + info!("accepted connection from {peer_addr}"); socket .set_nodelay(true) .context("failed to set client socket option")?; thread::spawn(move || { if let Err(err) = handle_connection(socket) { - println!("error: {}", err); + error!("{err}"); } }); } @@ -102,14 +103,14 @@ impl postgres_backend::Handler for MgmtHandler { let res = try_process_query(pgb, query_string); // intercept and log error message if res.is_err() { - println!("Mgmt query failed: #{:?}", res); + error!("mgmt query failed: {res:?}"); } res } } fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::Result<()> { - println!("Got mgmt query [redacted]"); // Content contains password, don't print it + info!("got mgmt query [redacted]"); // Content contains password, don't print it let resp: PsqlSessionResponse = serde_json::from_str(query_string)?; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index efb1b6f358..5dcaa000cf 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -8,6 +8,7 @@ use metrics::{register_int_counter, IntCounter}; use once_cell::sync::Lazy; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::{error, info, info_span, Instrument}; use utils::pq_proto::{BeMessage as Be, *}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; @@ -43,17 +44,17 @@ where F: std::future::Future>, { future.await.map_err(|err| { - println!("error: {}", err); + error!("{err}"); err }) } -pub async fn thread_main( +pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, ) -> anyhow::Result<()> { scopeguard::defer! 
{ - println!("proxy has shut down"); + info!("proxy has shut down"); } // When set for the server socket, the keepalive setting @@ -63,22 +64,29 @@ pub async fn thread_main( let cancel_map = Arc::new(CancelMap::default()); loop { let (socket, peer_addr) = listener.accept().await?; - println!("accepted connection from {}", peer_addr); + info!("accepted connection from {peer_addr}"); + let session_id = uuid::Uuid::new_v4(); let cancel_map = Arc::clone(&cancel_map); - tokio::spawn(log_error(async move { - socket - .set_nodelay(true) - .context("failed to set socket option")?; + tokio::spawn( + log_error(async move { + info!("spawned a task for {peer_addr}"); - handle_client(config, &cancel_map, socket).await - })); + socket + .set_nodelay(true) + .context("failed to set socket option")?; + + handle_client(config, &cancel_map, session_id, socket).await + }) + .instrument(info_span!("client", session = format_args!("{session_id}"))), + ); } } async fn handle_client( config: &ProxyConfig, cancel_map: &CancelMap, + session_id: uuid::Uuid, stream: impl AsyncRead + AsyncWrite + Unpin + Send, ) -> anyhow::Result<()> { // The `closed` counter will increase when this future is destroyed. @@ -88,7 +96,8 @@ async fn handle_client( } let tls = config.tls_config.as_ref(); - let (mut stream, params) = match handshake(stream, tls, cancel_map).await? { + let do_handshake = handshake(stream, tls, cancel_map).instrument(info_span!("handshake")); + let (mut stream, params) = match do_handshake.await? { Some(x) => x, None => return Ok(()), // it's a cancellation request }; @@ -106,7 +115,7 @@ async fn handle_client( async { result }.or_else(|e| stream.throw_error(e)).await? }; - let client = Client::new(stream, creds, ¶ms); + let client = Client::new(stream, creds, ¶ms, session_id); cancel_map .with_session(|session| client.connect_to_db(session)) .await @@ -127,7 +136,7 @@ async fn handshake( let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; - println!("got message: {:?}", msg); + info!("received {msg:?}"); use FeStartupPacket::*; match msg { @@ -164,11 +173,13 @@ async fn handshake( stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; } + info!(session_type = "normal", "successful handshake"); break Ok(Some((stream, params))); } CancelRequest(cancel_key_data) => { cancel_map.cancel_session(cancel_key_data).await?; + info!(session_type = "cancellation", "successful handshake"); break Ok(None); } } @@ -183,6 +194,8 @@ struct Client<'a, S> { creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, /// KV-dictionary with PostgreSQL connection params. params: &'a StartupMessageParams, + /// Unique connection ID. + session_id: uuid::Uuid, } impl<'a, S> Client<'a, S> { @@ -191,11 +204,13 @@ impl<'a, S> Client<'a, S> { stream: PqStream, creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, params: &'a StartupMessageParams, + session_id: uuid::Uuid, ) -> Self { Self { stream, creds, params, + session_id, } } } @@ -207,17 +222,20 @@ impl Client<'_, S> { mut stream, creds, params, + session_id, } = self; let extra = auth::ConsoleReqExtra { - // Currently it's OK to generate a new UUID **here**, but - // it might be better to move this to `cancellation::Session`. - session_id: uuid::Uuid::new_v4(), + session_id, // aka this connection's id application_name: params.get("application_name"), }; // Authenticate and connect to a compute node. 
- let auth = creds.authenticate(&extra, &mut stream).await; + let auth = creds + .authenticate(&extra, &mut stream) + .instrument(info_span!("auth")) + .await; + let node = async { auth }.or_else(|e| stream.throw_error(e)).await?; let reported_auth_ok = node.reported_auth_ok; @@ -251,6 +269,7 @@ impl Client<'_, S> { } // Starting from here we only proxy the client's traffic. + info!("performing the proxy pass..."); let mut db = MetricsStream::new(db.stream, inc_proxied); let mut client = MetricsStream::new(stream.into_inner(), inc_proxied); let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; From 725be60bb7a15e63bb6c65de3730dc6f9dae4bf1 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 16 Sep 2022 16:21:07 +0300 Subject: [PATCH 0881/1022] Storage messaging rfc 2. --- docs/rfcs/018-storage-messaging-2.md | 163 +++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 docs/rfcs/018-storage-messaging-2.md diff --git a/docs/rfcs/018-storage-messaging-2.md b/docs/rfcs/018-storage-messaging-2.md new file mode 100644 index 0000000000..364f62dd2e --- /dev/null +++ b/docs/rfcs/018-storage-messaging-2.md @@ -0,0 +1,163 @@ +# Storage messaging + +Safekeepers need to communicate to each other to +* Trim WAL on safekeepers; +* Decide on which SK should push WAL to the S3; +* Decide on when to shut down SK<->pageserver connection; +* Understand state of each other to perform peer recovery; + +Pageservers need to communicate to safekeepers to decide which SK should provide +WAL to the pageserver. + +This is an iteration on [015-storage-messaging](https://github.com/neondatabase/neon/blob/main/docs/rfcs/015-storage-messaging.md) describing current situation, +potential performance issue and ways to address it. + +## Background + +What we have currently is very close to etcd variant described in +015-storage-messaging. Basically, we have single `SkTimelineInfo` message +periodically sent by all safekeepers to etcd for each timeline. +* Safekeepers subscribe to it to learn status of peers (currently they subscribe to + 'everything', but they can and should fetch data only for timelines they hold). +* Pageserver subscribes to it (separate watch per timeline) to learn safekeepers + positions; based on that, it decides from which safekeepers to pull WAL. + +Also, safekeepers use etcd elections API to make sure only single safekeeper +offloads WAL. + +It works, and callmemaybe is gone. However, this has a performance +hazard. Currently deployed etcd can do about 6k puts per second (using its own +`benchmark` tool); on my 6 core laptop, while running on tmpfs, this gets to +35k. Making benchmark closer to our usage [etcd watch bench](https://github.com/arssher/etcd-client/blob/watch-bench/examples/watch_bench.rs), +I get ~10k received messages per second with various number of publisher-subscribers +(laptop, tmpfs). Diving this by 12 (3 sks generate msg, 1 ps + 3 sk consume them) we +get about 800 active timelines, if message is sent each second. Not extremely +low, but quite reachable. + +A lot of idle watches seem to be ok though -- which is good, as pageserver +subscribes to all its timelines regardless of their activity. + +Also, running etcd with fsyncs disabled is messy -- data dir must be wiped on +each restart or there is a risk of corruption errors. 
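+
+To spell out the arithmetic above (assuming one status message per timeline per
+second, 3 safekeepers publishing, and 1 pageserver + 3 safekeepers consuming
+each message):
+
+```text
+3 published msgs/s per timeline × 4 receivers = 12 received msgs/s per timeline
+10,000 received msgs/s ÷ 12 ≈ 800 active timelines
+```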
+
+The reason is that etcd does much more than what we need; it is a fault-tolerant
+store with strong consistency, but I claim all we need here is just the simplest
+pub/sub with best-effort delivery, because
+* We already have a centralized source of truth for long-running data, like which
+  timelines are on which nodes -- the console.
+* Momentary data (safekeeper/pageserver progress) doesn't make sense to persist.
+  Instead of putting each change into the broker and expecting it to reliably
+  deliver it, it is better to just have a constant flow of data for active
+  timelines: 1) the messages serve as natural heartbeats -- if a node can't send,
+  we shouldn't pull WAL from it; 2) it is simpler -- no need to track delivery
+  to/from the broker. Moreover, latency is important here: the faster we obtain
+  fresh data, the faster we can switch to a proper safekeeper after a failure.
+* As for WAL offloading leader election, it is trivial to achieve through these
+  heartbeats -- just pick a suitable node through a deterministic rule (min node
+  id). Once the network is stable, this is a converging process (well, except for
+  complicated failure topologies, but even then making it converge is not hard).
+  Such elections bear some risk of several offloaders running concurrently for a
+  short period of time, but that's harmless.
+
+  Generally, if one needs strong consistency, electing a leader per se is not
+  enough; it must be accompanied by a number (a logical clock timestamp) checked
+  at every action to track causality. S3 doesn't provide CAS, so it can't
+  differentiate an old leader from a new one; this must be solved differently.
+
+  We could use etcd CAS (its most powerful/useful primitive, actually) to issue
+  these leader numbers (and e.g. prefix files in S3), but currently I don't see
+  a need for that.
+
+Obviously, a best-effort pub/sub is much simpler and more performant; the one
+proposed is
+
+## gRPC broker
+
+I took tonic and [prototyped](https://github.com/neondatabase/neon/blob/asher/neon-broker/broker/src/broker.rs)
+a replacement for the functionality we currently use, built on gRPC streams and
+tokio mpsc channels. The implementation description is in the file header; a
+rough sketch of the fan-out core is also given below.
+
+It is just 500 lines of code and the core functionality is complete. 1-1 pub/sub
+gives about 120k received messages per second; having multiple subscribers in
+different connections quickly scales to 1 million received messages per second.
+I had concerns about many concurrent streams in a single connection, but 2^20
+subscribers still work (though they eat memory: with 10 publishers, 20GB are
+consumed, since in this implementation each publisher holds a full copy of all
+subscribers). There is `bench.rs` nearby which I used for testing.
+
+`SkTimelineInfo` is wired in here, but another message type can be added (e.g. if
+pageservers want to communicate with each other) with templating.
+
+### Fault tolerance
+
+Since such a broker is stateless, we can run it under k8s. Or add proxying to
+other members; with best effort this is simple.
+
+### Security implications
+
+Communication happens in a private network that is not exposed to users;
+additionally, we can add auth to the broker.
+
+## Alternative: get existing pub-sub
+
+We could take some existing pub/sub solution, e.g. RabbitMQ or Redis. But in this
+case, IMV, the simplicity of our own outweighs the external dependency costs
+(RabbitMQ is much more complicated and needs a VM; Redis Rust client maintenance
+is not ideal...). Also note that projects like CockroachDB and TiDB are based on
+gRPC as well.
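+
+To make the in-house gRPC broker proposal above a bit more concrete, here is a
+rough sketch of its fan-out core (type and method names are illustrative
+assumptions, not the actual prototype interface):
+
+```rust
+use std::sync::Mutex;
+
+use tokio::sync::mpsc;
+
+/// Stand-in for the real `SkTimelineInfo` (fields elided/illustrative).
+#[derive(Clone, Debug, Default)]
+pub struct SkTimelineInfo {
+    pub commit_lsn: u64,
+    pub flush_lsn: u64,
+}
+
+/// Best-effort in-memory pub/sub: publishers push updates, every live
+/// subscriber gets a copy, nothing is persisted or retried.
+#[derive(Default)]
+pub struct Broker {
+    subscribers: Mutex<Vec<mpsc::UnboundedSender<SkTimelineInfo>>>,
+}
+
+impl Broker {
+    /// Register a subscriber; in the gRPC version the receiver would back a
+    /// server-streaming response.
+    pub fn subscribe(&self) -> mpsc::UnboundedReceiver<SkTimelineInfo> {
+        let (tx, rx) = mpsc::unbounded_channel();
+        self.subscribers.lock().unwrap().push(tx);
+        rx
+    }
+
+    /// Fan out one update; disconnected subscribers are dropped on the spot.
+    pub fn publish(&self, msg: &SkTimelineInfo) {
+        self.subscribers
+            .lock()
+            .unwrap()
+            .retain(|tx| tx.send(msg.clone()).is_ok());
+    }
+}
+```
+
+In the tonic version, `publish` would be fed by client-streaming RPCs from
+safekeepers, and each `subscribe` call would back a server-streaming response to
+a peer safekeeper or a pageserver.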
+ +## Alternative: direct communication + +Apart from being transport, broker solves one more task: discovery, i.e. letting +safekeepers and pageservers find each other. We can let safekeepers know, for +each timeline, both other safekeepers for this timeline and pageservers serving +it. In this case direct communication is possible: + - each safekeeper pushes to each other safekeeper status of timelines residing + on both of them, letting remove WAL, decide who offloads, decide on peer + recovery; + - each safekeeper pushes to each pageserver status of timelines residing on + both of them, letting pageserver choose from which sk to pull WAL; + +It was mostly described in [014-safekeeper-gossip](https://github.com/neondatabase/neon/blob/main/docs/rfcs/014-safekeepers-gossip.md), but I want to recap on that. + +The main pro is less one dependency: less moving parts, easier to run Neon +locally/manually, less places to monitor. Fault tolerance for broker disappears, +no kuber or something. To me this is a big thing. + +Also (though not a big thing) idle watches for inactive timelines disappear: +naturally safekeepers learn about compute connection first and start pushing +status to pageserver(s), notifying it should pull. + +Importantly, I think that eventually knowing and persisting peers and +pageservers on safekeepers is inevitable: +- Knowing peer safekeepers for the timeline is required for correct + automatic membership change -- new member set must be hardened on old + majority before proceeding. It is required to get rid of sync-safekeepers + as well (peer recovery up to flush_lsn). +- Knowing pageservers where the timeline is attached is needed to + 1. Understand when to shut down activity on the timeline, i.e. push data to + the broker. We can have a lot of timelines sleeping quietly which + shouldn't occupy resources. + 2. Preserve WAL for these (currently we offload to s3 and take it from there, + but serving locally is better, and we get one less condition on which WAL + can be removed from s3). + +I suppose this membership data should be passed to safekeepers directly from the +console because +1. Console is the original source of this data, conceptually this is the + simplest way (rather than passing it through compute or something). +2. We already have similar code for deleting timeline on safekeepers + (and attaching/detaching timeline on pageserver), this is a typical + action -- queue operation against storage node and execute it until it + completes (or timeline is dropped). + +Cons of direct communication are +- It is more complicated: each safekeeper should maintain set of peers it talks + to, and set of timelines for each such peer -- they ought to be multiplexed + into single connection. +- Totally, we have O(n^2) connections instead of O(n) with broker schema + (still O(n) on each node). However, these are relatively stable, async and + thus not very expensive, I don't think this is a big problem. Up to 10k + storage nodes I doubt connection overhead would be noticeable. + +I'd use gRPC for direct communication, and in this sense gRPC based broker is a +step towards it. From a22165d41ea4046d393e131e9ae47fe22e09dec5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 8 Oct 2022 10:07:33 +0300 Subject: [PATCH 0882/1022] Add tests for comparing root and child branch performance. 
Author: Thang Pham --- .../performance/test_branch_creation.py | 43 +++++++++ test_runner/performance/test_branching.py | 94 +++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 test_runner/performance/test_branching.py diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 9cb346de47..4b109c150f 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -3,12 +3,15 @@ import statistics import threading import time import timeit +from contextlib import closing from typing import List import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log +from fixtures.neon_fixtures import wait_for_last_record_lsn +from fixtures.types import Lsn def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]): @@ -107,3 +110,43 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): branch_creation_durations.append(dur) _record_branch_creation_durations(neon_compare, branch_creation_durations) + + +# Test measures the branch creation time when branching from a timeline with a lot of relations. +# +# This test measures the latency of branch creation under two scenarios +# 1. The ancestor branch is not under any workloads +# 2. The ancestor branch is under a workload (busy) +# +# To simulate the workload, the test runs a concurrent insertion on the ancestor branch right before branching. +def test_branch_creation_many_relations(neon_compare: NeonCompare): + env = neon_compare.env + + timeline_id = env.neon_cli.create_branch("root") + + pg = env.postgres.create_start("root") + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + for i in range(10000): + cur.execute(f"CREATE TABLE t{i} as SELECT g FROM generate_series(1, 1000) g") + + # Wait for the pageserver to finish processing all the pending WALs, + # as we don't want the LSN wait time to be included during the branch creation + flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + wait_for_last_record_lsn( + env.pageserver.http_client(), env.initial_tenant, timeline_id, flush_lsn + ) + + with neon_compare.record_duration("create_branch_time_not_busy_root"): + env.neon_cli.create_branch("child_not_busy", "root") + + # run a concurrent insertion to make the ancestor "busy" during the branch creation + thread = threading.Thread( + target=pg.safe_psql, args=("INSERT INTO t0 VALUES (generate_series(1, 100000))",) + ) + thread.start() + + with neon_compare.record_duration("create_branch_time_busy_root"): + env.neon_cli.create_branch("child_busy", "root") + + thread.join() diff --git a/test_runner/performance/test_branching.py b/test_runner/performance/test_branching.py new file mode 100644 index 0000000000..562e751458 --- /dev/null +++ b/test_runner/performance/test_branching.py @@ -0,0 +1,94 @@ +import timeit +from pathlib import Path +from typing import List + +from fixtures.benchmark_fixture import PgBenchRunResult +from fixtures.compare_fixtures import NeonCompare +from performance.test_perf_pgbench import utc_now_timestamp + +# ----------------------------------------------------------------------- +# Start of `test_compare_child_and_root_*` tests +# ----------------------------------------------------------------------- + +# `test_compare_child_and_root_*` tests compare the performance of a branch and its child branch(s). 
+# A common pattern in those tests is initializing a root branch then creating a child branch(s) from the root. +# Each test then runs a similar workload for both child branch and root branch. Each measures and reports +# some latencies/metrics during the workload for performance comparison between a branch and its ancestor. + + +def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare): + env = neon_compare.env + pg_bin = neon_compare.pg_bin + + def run_pgbench_on_branch(branch: str, cmd: List[str]): + run_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + out = pg_bin.run_capture( + cmd, + ) + run_duration = timeit.default_timer() - t0 + run_end_timestamp = utc_now_timestamp() + + stdout = Path(f"{out}.stdout").read_text() + + res = PgBenchRunResult.parse_from_stdout( + stdout=stdout, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + run_end_timestamp=run_end_timestamp, + ) + neon_compare.zenbenchmark.record_pg_bench_result(branch, res) + + env.neon_cli.create_branch("root") + pg_root = env.postgres.create_start("root") + pg_bin.run_capture(["pgbench", "-i", pg_root.connstr(), "-s10"]) + + env.neon_cli.create_branch("child", "root") + pg_child = env.postgres.create_start("child") + + run_pgbench_on_branch("root", ["pgbench", "-c10", "-T10", pg_root.connstr()]) + run_pgbench_on_branch("child", ["pgbench", "-c10", "-T10", pg_child.connstr()]) + + +def test_compare_child_and_root_write_perf(neon_compare: NeonCompare): + env = neon_compare.env + env.neon_cli.create_branch("root") + pg_root = env.postgres.create_start("root") + + pg_root.safe_psql( + "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')", + ) + + env.neon_cli.create_branch("child", "root") + pg_child = env.postgres.create_start("child") + + with neon_compare.record_duration("root_run_duration"): + pg_root.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)") + with neon_compare.record_duration("child_run_duration"): + pg_child.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)") + + +def test_compare_child_and_root_read_perf(neon_compare: NeonCompare): + env = neon_compare.env + env.neon_cli.create_branch("root") + pg_root = env.postgres.create_start("root") + + pg_root.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')", + "INSERT INTO foo SELECT FROM generate_series(1,1000000)", + ] + ) + + env.neon_cli.create_branch("child", "root") + pg_child = env.postgres.create_start("child") + + with neon_compare.record_duration("root_run_duration"): + pg_root.safe_psql("SELECT count(*) from foo") + with neon_compare.record_duration("child_run_duration"): + pg_child.safe_psql("SELECT count(*) from foo") + + +# ----------------------------------------------------------------------- +# End of `test_compare_child_and_root_*` tests +# ----------------------------------------------------------------------- From 9f79e7edea01e381dfc781383d0d65c6d18b5d1b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 8 Oct 2022 15:42:17 +0300 Subject: [PATCH 0883/1022] Merge pageserver helper binaries and provide it for deployment (#2590) --- Dockerfile | 10 +- pageserver/src/bin/dump_layerfile.rs | 35 ------ pageserver/src/bin/pageserver_binutils.rs | 144 ++++++++++++++++++++++ pageserver/src/bin/update_metadata.rs | 75 ----------- pageserver/src/tenant/delta_layer.rs | 2 +- pageserver/src/tenant/filename.rs | 2 +- 
pageserver/src/tenant/image_layer.rs | 2 +- 7 files changed, 152 insertions(+), 118 deletions(-) delete mode 100644 pageserver/src/bin/dump_layerfile.rs create mode 100644 pageserver/src/bin/pageserver_binutils.rs delete mode 100644 pageserver/src/bin/update_metadata.rs diff --git a/Dockerfile b/Dockerfile index 69402919ec..cb4e213687 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,7 @@ COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ -&& mold -run cargo build --bin pageserver --bin safekeeper --bin proxy --locked --release \ +&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin safekeeper --bin proxy --locked --release \ && cachepot -s # Build final image @@ -63,9 +63,10 @@ RUN set -e \ && useradd -d /data neon \ && chown -R neon:neon /data -COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ @@ -85,4 +86,3 @@ VOLUME ["/data"] USER neon EXPOSE 6400 EXPOSE 9898 -CMD ["/bin/bash"] diff --git a/pageserver/src/bin/dump_layerfile.rs b/pageserver/src/bin/dump_layerfile.rs deleted file mode 100644 index f5247ee609..0000000000 --- a/pageserver/src/bin/dump_layerfile.rs +++ /dev/null @@ -1,35 +0,0 @@ -//! Main entry point for the dump_layerfile executable -//! -//! A handy tool for debugging, that's all. -use anyhow::Result; -use clap::{App, Arg}; -use pageserver::page_cache; -use pageserver::tenant::dump_layerfile_from_path; -use pageserver::virtual_file; -use std::path::PathBuf; -use utils::project_git_version; - -project_git_version!(GIT_VERSION); - -fn main() -> Result<()> { - let arg_matches = App::new("Neon dump_layerfile utility") - .about("Dump contents of one layer file, for debugging") - .version(GIT_VERSION) - .arg( - Arg::new("path") - .help("Path to file to dump") - .required(true) - .index(1), - ) - .get_matches(); - - let path = PathBuf::from(arg_matches.value_of("path").unwrap()); - - // Basic initialization of things that don't change after startup - virtual_file::init(10); - page_cache::init(100); - - dump_layerfile_from_path(&path, true)?; - - Ok(()) -} diff --git a/pageserver/src/bin/pageserver_binutils.rs b/pageserver/src/bin/pageserver_binutils.rs new file mode 100644 index 0000000000..ec7699f194 --- /dev/null +++ b/pageserver/src/bin/pageserver_binutils.rs @@ -0,0 +1,144 @@ +//! A helper tool to manage pageserver binary files. +//! Accepts a file as an argument, attempts to parse it with all ways possible +//! and prints its interpreted context. +//! +//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file. 
+use std::{ + path::{Path, PathBuf}, + str::FromStr, +}; + +use anyhow::Context; +use clap::{App, Arg}; + +use pageserver::{ + page_cache, + tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, + virtual_file, +}; +use postgres_ffi::ControlFileData; +use utils::{lsn::Lsn, project_git_version}; + +project_git_version!(GIT_VERSION); + +const METADATA_SUBCOMMAND: &str = "metadata"; + +fn main() -> anyhow::Result<()> { + let arg_matches = App::new("Neon Pageserver binutils") + .about("Reads pageserver (and related) binary files management utility") + .version(GIT_VERSION) + .arg(Arg::new("path").help("Input file path").required(false)) + .subcommand( + App::new(METADATA_SUBCOMMAND) + .about("Read and update pageserver metadata file") + .arg( + Arg::new("metadata_path") + .help("Input metadata file path") + .required(false), + ) + .arg( + Arg::new("disk_consistent_lsn") + .long("disk_consistent_lsn") + .takes_value(true) + .help("Replace disk consistent Lsn"), + ) + .arg( + Arg::new("prev_record_lsn") + .long("prev_record_lsn") + .takes_value(true) + .help("Replace previous record Lsn"), + ), + ) + .get_matches(); + + match arg_matches.subcommand() { + Some((subcommand_name, subcommand_matches)) => { + let path = PathBuf::from( + subcommand_matches + .value_of("metadata_path") + .context("'metadata_path' argument is missing")?, + ); + anyhow::ensure!( + subcommand_name == METADATA_SUBCOMMAND, + "Unknown subcommand {subcommand_name}" + ); + handle_metadata(&path, subcommand_matches)?; + } + None => { + let path = PathBuf::from( + arg_matches + .value_of("path") + .context("'path' argument is missing")?, + ); + println!( + "No subcommand specified, attempting to guess the format for file {}", + path.display() + ); + if let Err(e) = read_pg_control_file(&path) { + println!( + "Failed to read input file as a pg control one: {e:#}\n\ + Attempting to read it as layer file" + ); + print_layerfile(&path)?; + } + } + }; + Ok(()) +} + +fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> { + let control_file = ControlFileData::decode(&std::fs::read(&control_file_path)?)?; + println!("{control_file:?}"); + let control_file_initdb = Lsn(control_file.checkPoint); + println!( + "pg_initdb_lsn: {}, aligned: {}", + control_file_initdb, + control_file_initdb.align() + ); + Ok(()) +} + +fn print_layerfile(path: &Path) -> anyhow::Result<()> { + // Basic initialization of things that don't change after startup + virtual_file::init(10); + page_cache::init(100); + dump_layerfile_from_path(path, true) +} + +fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> { + let metadata_bytes = std::fs::read(&path)?; + let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; + println!("Current metadata:\n{meta:?}"); + let mut update_meta = false; + if let Some(disk_consistent_lsn) = arg_matches.value_of("disk_consistent_lsn") { + meta = TimelineMetadata::new( + Lsn::from_str(disk_consistent_lsn)?, + meta.prev_record_lsn(), + meta.ancestor_timeline(), + meta.ancestor_lsn(), + meta.latest_gc_cutoff_lsn(), + meta.initdb_lsn(), + meta.pg_version(), + ); + update_meta = true; + } + if let Some(prev_record_lsn) = arg_matches.value_of("prev_record_lsn") { + meta = TimelineMetadata::new( + meta.disk_consistent_lsn(), + Some(Lsn::from_str(prev_record_lsn)?), + meta.ancestor_timeline(), + meta.ancestor_lsn(), + meta.latest_gc_cutoff_lsn(), + meta.initdb_lsn(), + meta.pg_version(), + ); + update_meta = true; + } + + if update_meta { + let metadata_bytes = 
meta.to_bytes()?; + std::fs::write(&path, &metadata_bytes)?; + } + + Ok(()) +} diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs deleted file mode 100644 index e66049c457..0000000000 --- a/pageserver/src/bin/update_metadata.rs +++ /dev/null @@ -1,75 +0,0 @@ -//! Main entry point for the edit_metadata executable -//! -//! A handy tool for debugging, that's all. -use anyhow::Result; -use clap::{App, Arg}; -use pageserver::tenant::metadata::TimelineMetadata; -use std::path::PathBuf; -use std::str::FromStr; -use utils::{lsn::Lsn, project_git_version}; - -project_git_version!(GIT_VERSION); - -fn main() -> Result<()> { - let arg_matches = App::new("Neon update metadata utility") - .about("Dump or update metadata file") - .version(GIT_VERSION) - .arg( - Arg::new("path") - .help("Path to metadata file") - .required(true), - ) - .arg( - Arg::new("disk_lsn") - .short('d') - .long("disk_lsn") - .takes_value(true) - .help("Replace disk constistent lsn"), - ) - .arg( - Arg::new("prev_lsn") - .short('p') - .long("prev_lsn") - .takes_value(true) - .help("Previous record LSN"), - ) - .get_matches(); - - let path = PathBuf::from(arg_matches.value_of("path").unwrap()); - let metadata_bytes = std::fs::read(&path)?; - let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; - println!("Current metadata:\n{:?}", &meta); - - let mut update_meta = false; - - if let Some(disk_lsn) = arg_matches.value_of("disk_lsn") { - meta = TimelineMetadata::new( - Lsn::from_str(disk_lsn)?, - meta.prev_record_lsn(), - meta.ancestor_timeline(), - meta.ancestor_lsn(), - meta.latest_gc_cutoff_lsn(), - meta.initdb_lsn(), - meta.pg_version(), - ); - update_meta = true; - } - - if let Some(prev_lsn) = arg_matches.value_of("prev_lsn") { - meta = TimelineMetadata::new( - meta.disk_consistent_lsn(), - Some(Lsn::from_str(prev_lsn)?), - meta.ancestor_timeline(), - meta.ancestor_lsn(), - meta.latest_gc_cutoff_lsn(), - meta.initdb_lsn(), - meta.pg_version(), - ); - update_meta = true; - } - if update_meta { - let metadata_bytes = meta.to_bytes()?; - std::fs::write(&path, &metadata_bytes)?; - } - Ok(()) -} diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index 57c5be91a4..41715ab0a4 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -556,7 +556,7 @@ impl DeltaLayer { /// Create a DeltaLayer struct representing an existing file on disk. /// - /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary. + /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. pub fn new_for_path(path: &Path, file: F) -> Result where F: FileExt, diff --git a/pageserver/src/tenant/filename.rs b/pageserver/src/tenant/filename.rs index 5ebac2332d..0ebf2d479b 100644 --- a/pageserver/src/tenant/filename.rs +++ b/pageserver/src/tenant/filename.rs @@ -177,7 +177,7 @@ impl fmt::Display for ImageFileName { /// /// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the /// global config, and paths to layer files are constructed using the tenant/timeline -/// path from the config. But in the 'dump_layerfile' binary, we need to construct a Layer +/// path from the config. But in the 'pageserver_binutils' binary, we need to construct a Layer /// struct for a file on disk, without having a page server running, so that we have no /// config. In that case, we use the Path variant to hold the full path to the file on /// disk. 
diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs index 92bf022fee..cbfa0134b0 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -357,7 +357,7 @@ impl ImageLayer { /// Create an ImageLayer struct representing an existing file on disk. /// - /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary. + /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. pub fn new_for_path(path: &Path, file: F) -> Result where F: std::os::unix::prelude::FileExt, From 01d2c52c8245fdd4f8ed562b210a07e41bf45bdb Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 8 Oct 2022 09:57:05 +0300 Subject: [PATCH 0884/1022] Tidy up feature reporting --- pageserver/src/bin/pageserver.rs | 23 +++++++++++++---------- pageserver/src/http/routes.rs | 2 +- run_clippy.sh | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index fb79ad3945..46a36c6118 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -31,11 +31,20 @@ use utils::{ project_git_version!(GIT_VERSION); +const FEATURES: &[&str] = &[ + #[cfg(feature = "testing")] + "testing", + #[cfg(feature = "fail/failpoints")] + "fail/failpoints", + #[cfg(feature = "profiling")] + "profiling", +]; + fn version() -> String { format!( - "{GIT_VERSION} profiling:{} failpoints:{}", - cfg!(feature = "profiling"), - fail::has_failpoints() + "{GIT_VERSION} failpoints: {}, features: {:?}", + fail::has_failpoints(), + FEATURES, ) } @@ -86,13 +95,7 @@ fn main() -> anyhow::Result<()> { .get_matches(); if arg_matches.is_present("enabled-features") { - let features: &[&str] = &[ - #[cfg(feature = "testing")] - "testing", - #[cfg(feature = "profiling")] - "profiling", - ]; - println!("{{\"features\": {features:?} }}"); + println!("{{\"features\": {FEATURES:?} }}"); return Ok(()); } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e743f27aff..f1318cb325 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -784,7 +784,7 @@ async fn tenant_config_handler(mut request: Request) -> Result) -> Result, ApiError> { if !fail::has_failpoints() { return Err(ApiError::BadRequest(anyhow!( diff --git a/run_clippy.sh b/run_clippy.sh index 9feb8de4ea..bf770432d0 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -13,7 +13,7 @@ # avoid running regular linting script that checks every feature. 
if [[ "$OSTYPE" == "darwin"* ]]; then # no extra features to test currently, add more here when needed - cargo clippy --locked --all --all-targets -- -A unknown_lints -D warnings + cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings else # * `-A unknown_lints` – do not warn about unknown lint suppressions # that people with newer toolchains might use From 3be3bb77302c371d1c58fda7d17dbf56dd9ad061 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 8 Oct 2022 10:03:52 +0300 Subject: [PATCH 0885/1022] Be more verbose with initdb for pageserver timeline creation --- pageserver/src/tenant.rs | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c2fb9ef242..4da5790f51 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1094,12 +1094,22 @@ impl Tenant { /// Create the cluster temporarily in 'initdbpath' directory inside the repository /// to get bootstrap data for timeline initialization. -fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path, pg_version: u32) -> Result<()> { - info!("running initdb in {}... ", initdbpath.display()); +fn run_initdb( + conf: &'static PageServerConf, + initdb_target_dir: &Path, + pg_version: u32, +) -> Result<()> { + let initdb_bin_path = conf.pg_bin_dir(pg_version).join("initdb"); + let initdb_lib_dir = conf.pg_lib_dir(pg_version); + info!( + "running {} in {}, libdir: {}", + initdb_bin_path.display(), + initdb_target_dir.display(), + initdb_lib_dir.display(), + ); - let initdb_path = conf.pg_bin_dir(pg_version).join("initdb"); - let initdb_output = Command::new(initdb_path) - .args(&["-D", &initdbpath.to_string_lossy()]) + let initdb_output = Command::new(initdb_bin_path) + .args(&["-D", &initdb_target_dir.to_string_lossy()]) .args(&["-U", &conf.superuser]) .args(&["-E", "utf8"]) .arg("--no-instructions") @@ -1107,8 +1117,8 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path, pg_version: u32) // so no need to fsync it .arg("--no-sync") .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("LD_LIBRARY_PATH", &initdb_lib_dir) + .env("DYLD_LIBRARY_PATH", &initdb_lib_dir) .stdout(Stdio::null()) .output() .context("failed to execute initdb")?; From 3e35f10adca053afd6c786237cc06bc6f8db5050 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 8 Oct 2022 12:21:17 +0300 Subject: [PATCH 0886/1022] Add a script to reformat the project --- scripts/reformat | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100755 scripts/reformat diff --git a/scripts/reformat b/scripts/reformat new file mode 100755 index 0000000000..67140a705a --- /dev/null +++ b/scripts/reformat @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +set -euox pipefail + +# Runs all formatting tools to ensure the project is up to date +echo 'Reformatting Rust code' +cargo fmt +echo 'Reformatting Python code' +poetry run isort test_runner +poetry run flake8 test_runner +poetry run black test_runner From 13f0e7a5b4a2ea1187955926e036d4ac57ed094c Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 8 Oct 2022 23:53:12 +0300 Subject: [PATCH 0887/1022] Deploy pageserver_binutils to the envs --- .github/ansible/get_binaries.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index a484bfb0a0..9d2d0926f5 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh 
@@ -23,6 +23,7 @@ docker cp ${ID}:/data/postgres_install.tar.gz . tar -xzf postgres_install.tar.gz -C neon_install mkdir neon_install/bin/ docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ +docker cp ${ID}:/usr/local/bin/pageserver_binutils neon_install/bin/ docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/ From 34bea270f0e320c1f77f76c0f0b0000774935b68 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Mon, 10 Oct 2022 12:12:50 +0300 Subject: [PATCH 0888/1022] Fix POSTGRES_DISTRIB_DIR for benchmarks on ec2 runner (#2594) --- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 4d91e9fa74..6c7dce9beb 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -46,7 +46,7 @@ jobs: runs-on: [self-hosted, zenith-benchmarker] env: - POSTGRES_DISTRIB_DIR: /tmp/pg_install + POSTGRES_DISTRIB_DIR: /usr/pgsql DEFAULT_PG_VERSION: 14 steps: From 241e5497574146f7c29775ffdd311b801a5cc78c Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 11 Oct 2022 01:07:19 +0300 Subject: [PATCH 0889/1022] Switch neon-stress etcd to dedicatd instance (#2602) --- .github/ansible/neon-stress.hosts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/neon-stress.hosts b/.github/ansible/neon-stress.hosts index 750fd8106a..c1bc8243f8 100644 --- a/.github/ansible/neon-stress.hosts +++ b/.github/ansible/neon-stress.hosts @@ -16,5 +16,5 @@ env_name = neon-stress console_mgmt_base_url = http://neon-stress-console.local bucket_name = neon-storage-ireland bucket_region = eu-west-1 -etcd_endpoints = etcd-stress.local:2379 +etcd_endpoints = neon-stress-etcd.local:2379 safekeeper_enable_s3_offload = false From e52029309059ef2197aaff1a725deb4d42976fd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Tue, 11 Oct 2022 09:54:32 +0300 Subject: [PATCH 0890/1022] Add build info metric to pageserver, safekeeper and proxy (#2596) * Test that we emit build info metric for pageserver, safekeeper and proxy with some non-zero length revision label * Emit libmetrics_build_info on startup of pageserver, safekeeper and proxy with label "revision" which tells the git revision. --- libs/metrics/src/lib.rs | 10 ++++++++++ pageserver/src/bin/pageserver.rs | 4 ++++ proxy/src/main.rs | 2 ++ safekeeper/src/bin/safekeeper.rs | 2 ++ test_runner/fixtures/neon_fixtures.py | 5 +++++ test_runner/regress/test_build_info_metric.py | 19 +++++++++++++++++++ 6 files changed, 42 insertions(+) create mode 100644 test_runner/regress/test_build_info_metric.py diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index e290828d37..880ab0e83c 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -77,6 +77,16 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, ]; +pub fn set_build_info_metric(revision: &str) { + let metric = register_int_gauge_vec!( + "libmetrics_build_info", + "Build/version information", + &["revision"] + ) + .expect("Failed to register build info metric"); + metric.with_label_values(&[revision]).set(1); +} + // Records I/O stats in a "cross-platform" way. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. 
// An alternative is to read procfs (`/proc/[pid]/io`) which does not work under macOS at all, hence abandoned. diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 46a36c6118..4cd82e37b1 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -10,6 +10,8 @@ use clap::{App, Arg}; use daemonize::Daemonize; use fail::FailScenario; +use metrics::set_build_info_metric; + use pageserver::{ config::{defaults::*, PageServerConf}, http, page_cache, page_service, profiling, task_mgr, @@ -359,6 +361,8 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() }, ); + set_build_info_metric(GIT_VERSION); + // All started up! Now just sit and wait for shutdown signal. signals.handle(|signal| match signal { Signal::Quit => { diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 2e6c365d32..91ef26a37f 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -23,6 +23,7 @@ use anyhow::{bail, Context}; use clap::{self, Arg}; use config::ProxyConfig; use futures::FutureExt; +use metrics::set_build_info_metric; use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; use tracing::info; @@ -166,6 +167,7 @@ async fn main() -> anyhow::Result<()> { ] .map(flatten_err); + set_build_info_metric(GIT_VERSION); // This will block until all tasks have completed. // Furthermore, the first one to fail will cancel the rest. let _: Vec<()> = futures::future::try_join_all(tasks).await?; diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e4545dad87..3f55d823cc 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -17,6 +17,7 @@ use toml_edit::Document; use tracing::*; use url::{ParseError, Url}; +use metrics::set_build_info_metric; use safekeeper::broker; use safekeeper::control_file; use safekeeper::defaults::{ @@ -363,6 +364,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo })?, ); + set_build_info_metric(GIT_VERSION); // TODO: put more thoughts into handling of failed threads // We probably should restart them. 
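With these calls in place, each service's /metrics endpoint is expected to expose
the new gauge roughly as

    libmetrics_build_info{revision="<git sha>"} 1

where the label value shown is a placeholder for the actual git revision. The new
test below only asserts that the metric is present and that its "revision" label
is non-empty.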
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 28c65223ba..5df0f5cc50 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1961,6 +1961,11 @@ class NeonProxy(PgProtocol): def _wait_until_ready(self): requests.get(f"http://{self.host}:{self.http_port}/v1/status") + def get_metrics(self) -> str: + request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics") + request_result.raise_for_status() + return request_result.text + def __enter__(self): return self diff --git a/test_runner/regress/test_build_info_metric.py b/test_runner/regress/test_build_info_metric.py new file mode 100644 index 0000000000..b75b5bd775 --- /dev/null +++ b/test_runner/regress/test_build_info_metric.py @@ -0,0 +1,19 @@ +from fixtures.metrics import parse_metrics +from fixtures.neon_fixtures import NeonEnvBuilder, NeonProxy + + +def test_build_info_metric(neon_env_builder: NeonEnvBuilder, link_proxy: NeonProxy): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + parsed_metrics = {} + + parsed_metrics["pageserver"] = parse_metrics(env.pageserver.http_client().get_metrics()) + parsed_metrics["safekeeper"] = parse_metrics(env.safekeepers[0].http_client().get_metrics_str()) + parsed_metrics["proxy"] = parse_metrics(link_proxy.get_metrics()) + + for component, metrics in parsed_metrics.items(): + sample = metrics.query_one("libmetrics_build_info") + + assert "revision" in sample.labels + assert len(sample.labels["revision"]) > 0 From db26bc49cca14c31005ae9e1dd3bf63fdaaeff50 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 11 Oct 2022 14:47:55 +0300 Subject: [PATCH 0891/1022] Remove obsolete FIXME comment. Commit c634cb1d36 removed the trait and changed the function to return a &TimelineWriter, as the FIXME said we should do, but forgot to remove the FIXME. --- pageserver/src/tenant/timeline.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 247e076230..3639e30fee 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -475,10 +475,6 @@ impl Timeline { } /// Mutate the timeline with a [`TimelineWriter`]. - /// - /// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter - /// is a generic type in this trait. But that doesn't currently work in - /// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html pub fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { tl: self, From 47366522a88f7fe06503de9639b89338e82c2f1a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 11 Oct 2022 14:47:59 +0300 Subject: [PATCH 0892/1022] Make the return type of 'list_timelines' simpler. It's enough to return just the Timeline references. You can get the timeline's ID easily from Timeline. 
--- pageserver/src/http/routes.rs | 7 ++++--- pageserver/src/tenant.rs | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f1318cb325..2b4ad86310 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -145,9 +145,9 @@ fn list_local_timelines( let timelines = tenant.list_timelines(); let mut local_timeline_info = Vec::with_capacity(timelines.len()); - for (timeline_id, repository_timeline) in timelines { + for repository_timeline in timelines { local_timeline_info.push(( - timeline_id, + repository_timeline.timeline_id, local_timeline_info_from_timeline( &repository_timeline, include_non_incremental_logical_size, @@ -218,7 +218,8 @@ async fn timeline_list_handler(request: Request) -> Result, .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; let mut response_data = Vec::with_capacity(timelines.len()); - for (timeline_id, timeline) in timelines { + for timeline in timelines { + let timeline_id = timeline.timeline_id; let local = match local_timeline_info_from_timeline( &timeline, include_non_incremental_logical_size, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4da5790f51..994e1db47b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -144,12 +144,12 @@ impl Tenant { /// Lists timelines the tenant contains. /// Up to tenant's implementation to omit certain timelines that ar not considered ready for use. - pub fn list_timelines(&self) -> Vec<(TimelineId, Arc)> { + pub fn list_timelines(&self) -> Vec> { self.timelines .lock() .unwrap() - .iter() - .map(|(timeline_id, timeline_entry)| (*timeline_id, Arc::clone(timeline_entry))) + .values() + .map(Arc::clone) .collect() } From 676c63c32961d2582d8140adda2ec659a37482a8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 11 Oct 2022 14:48:02 +0300 Subject: [PATCH 0893/1022] Improve comments. --- pageserver/src/tenant.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 994e1db47b..31d4696a31 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -153,8 +153,9 @@ impl Tenant { .collect() } - /// Create a new, empty timeline. The caller is responsible for loading data into it - /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. + /// This is used to create the initial 'main' timeline during bootstrapping, + /// or when importing a new base backup. The caller is expected to load an + /// initial image of the datadir to the new timeline after this. pub fn create_empty_timeline( &self, new_timeline_id: TimelineId, @@ -906,6 +907,7 @@ impl Tenant { Ok(totals) } + /// Branch an existing timeline fn branch_timeline( &self, src: TimelineId, @@ -981,7 +983,7 @@ impl Tenant { dst_prev, Some(src), start_lsn, - *src_timeline.latest_gc_cutoff_lsn.read(), + *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer? src_timeline.initdb_lsn, src_timeline.pg_version, ); From e5e40a31f4ce84a37f88aad2fe612d144fc55e6f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 11 Oct 2022 14:48:04 +0300 Subject: [PATCH 0894/1022] Clean up terms "delete timeline" and "detach tenant". You cannot attach/detach an individual timeline, attach/detach always applies to the whole tenant. However, you can *delete* a single timeline from a tenant. 
Fix some comments and error messages that confused these two operations. --- pageserver/src/tenant.rs | 2 +- test_runner/regress/test_normal_work.py | 2 +- test_runner/regress/test_tenant_detach.py | 2 +- test_runner/regress/test_timeline_delete.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 31d4696a31..3dd2f92b5e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -346,7 +346,7 @@ impl Tenant { ensure!( !children_exist, - "Cannot detach timeline which has child timelines" + "Cannot delete timeline which has child timelines" ); let timeline_entry = match timelines.entry(timeline_id) { Entry::Occupied(e) => e, diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py index 002d697288..73918ee476 100644 --- a/test_runner/regress/test_normal_work.py +++ b/test_runner/regress/test_normal_work.py @@ -39,7 +39,7 @@ def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_s * restart compute * check that the data is there * stop compute - * detach timeline + * detach tenant Repeat check for several tenants/timelines. """ diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index f18e6867a9..a310eac1f7 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -70,7 +70,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): break # else is called if the loop finished without reaching "break" else: - pytest.fail(f"could not detach timeline: {last_error}") + pytest.fail(f"could not detach tenant: {last_error}") gc_thread.join(timeout=10) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 2eea8dd3cc..ac248c1b4b 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -32,7 +32,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): ps_http = env.pageserver.http_client() with pytest.raises( - NeonPageserverApiException, match="Cannot detach timeline which has child timelines" + NeonPageserverApiException, match="Cannot delete timeline which has child timelines" ): timeline_path = ( From 6d0dacc4ce57d89ef2f9691e6a79d9998d331ac8 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 31 Aug 2022 17:40:40 +0300 Subject: [PATCH 0895/1022] Recreate timeline on pageserver in s3_wal_replay test. That's closer to real usage than switching to brand new pageserver. 
--- test_runner/regress/test_wal_acceptor.py | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index d5a5ec2f36..1f9a0157fc 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1,7 +1,6 @@ import os import pathlib import random -import shutil import signal import subprocess import sys @@ -481,13 +480,6 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re env = neon_env_builder.init_start() env.neon_cli.create_branch("test_s3_wal_replay") - env.pageserver.stop() - pageserver_tenants_dir = os.path.join(env.repo_dir, "tenants") - pageserver_fresh_copy = os.path.join(env.repo_dir, "tenants_fresh") - log.info(f"Creating a copy of pageserver in a fresh state at {pageserver_fresh_copy}") - shutil.copytree(pageserver_tenants_dir, pageserver_fresh_copy) - env.pageserver.start() - pg = env.postgres.create_start("test_s3_wal_replay") # learn neon timeline from compute @@ -525,25 +517,19 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re last_lsn = query_scalar(cur, "SELECT pg_current_wal_flush_lsn()") - pageserver_lsn = env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["local"][ - "last_record_lsn" - ] + ps_cli = env.pageserver.http_client() + pageserver_lsn = ps_cli.timeline_detail(tenant_id, timeline_id)["local"]["last_record_lsn"] lag = Lsn(last_lsn) - Lsn(pageserver_lsn) log.info( f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb" ) - # replace pageserver with a fresh copy pg.stop_and_destroy() - env.pageserver.stop() - log.info(f"Removing current pageserver state at {pageserver_tenants_dir}") - shutil.rmtree(pageserver_tenants_dir) - log.info(f"Copying fresh pageserver state from {pageserver_fresh_copy}") - shutil.move(pageserver_fresh_copy, pageserver_tenants_dir) + # recreate timeline on pageserver from scratch + ps_cli.timeline_delete(tenant_id, timeline_id) + ps_cli.timeline_create(tenant_id, timeline_id) - # start pageserver and wait for replay - env.pageserver.start() wait_lsn_timeout = 60 * 3 started_at = time.time() last_debug_print = 0.0 From 93775f6ca75060f86395427e993b5c393ec16a60 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 12 Oct 2022 10:22:24 +0100 Subject: [PATCH 0896/1022] GitHub Actions: replace deprecated set-output with GITHUB_OUTPUT (#2608) --- .github/actions/allure-report/action.yml | 4 ++-- .github/actions/download/action.yml | 4 ++-- .github/actions/neon-project-create/action.yml | 8 ++++---- .github/actions/neon-project-delete/action.yml | 2 +- .github/workflows/benchmarking.yml | 2 +- .github/workflows/build_and_test.yml | 14 +++++++------- .github/workflows/codestyle.yml | 4 ++-- 7 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml index ec751f51fc..dfb314571b 100644 --- a/.github/actions/allure-report/action.yml +++ b/.github/actions/allure-report/action.yml @@ -47,7 +47,7 @@ runs: else key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -c "[:alnum:]._-" "-") fi - echo "::set-output name=KEY::${key}" + echo "KEY=${key}" >> $GITHUB_OUTPUT - uses: actions/setup-java@v3 if: ${{ inputs.action == 'generate' }} @@ -186,7 +186,7 @@ runs: aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" echo "[Allure 
Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY} - echo "::set-output name=report-url::${REPORT_URL}" + echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT - name: Release Allure lock if: ${{ inputs.action == 'generate' && always() }} diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index 731ef6639d..eb34d4206a 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -34,7 +34,7 @@ runs: S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) if [ -z "${S3_KEY}" ]; then if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then - echo '::set-output name=SKIPPED::true' + echo 'SKIPPED=true' >> $GITHUB_OUTPUT exit 0 else echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist" @@ -42,7 +42,7 @@ runs: fi fi - echo '::set-output name=SKIPPED::false' + echo 'SKIPPED=false' >> $GITHUB_OUTPUT mkdir -p $(dirname $ARCHIVE) time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} ${ARCHIVE} diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 2f58ae77ad..b4fd151582 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -41,8 +41,8 @@ runs: ;; esac - echo "::set-output name=api_host::${API_HOST}" - echo "::set-output name=region_id::${REGION_ID}" + echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT + echo "region_id=${REGION_ID}" >> $GITHUB_OUTPUT env: ENVIRONMENT: ${{ inputs.environment }} REGION_ID: ${{ inputs.region_id }} @@ -72,10 +72,10 @@ runs: dsn=$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .dsn')/main echo "::add-mask::${dsn}" - echo "::set-output name=dsn::${dsn}" + echo "dsn=${dsn}" >> $GITHUB_OUTPUT project_id=$(echo $project | jq --raw-output '.id') - echo "::set-output name=project_id::${project_id}" + echo "project_id=${project_id}" >> $GITHUB_OUTPUT env: API_KEY: ${{ inputs.api_key }} API_HOST: ${{ steps.parse-input.outputs.api_host }} diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index e7c6f58901..d417c489ef 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -32,7 +32,7 @@ runs: ;; esac - echo "::set-output name=api_host::${API_HOST}" + echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT env: ENVIRONMENT: ${{ inputs.environment }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 6c7dce9beb..dee5968ef3 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -204,7 +204,7 @@ jobs: ;; esac - echo "::set-output name=connstr::${CONNSTR}" + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT psql ${CONNSTR} -c "SELECT version();" env: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7cc8715526..2c6aa02b22 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -35,12 +35,12 @@ jobs: echo ref:$GITHUB_REF_NAME echo rev:$(git rev-list --count HEAD) if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::$(git rev-list --count HEAD)" + echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release-$(git rev-list --count HEAD)" + echo 
"tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - echo "::set-output name=tag::$GITHUB_RUN_ID" + echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT fi shell: bash id: build-tag @@ -78,12 +78,12 @@ jobs: - name: Set pg 14 revision for caching id: pg_v14_rev - run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14) + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT shell: bash -euxo pipefail {0} - name: Set pg 15 revision for caching id: pg_v15_rev - run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15) + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT shell: bash -euxo pipefail {0} # Set some environment variables used by all the steps. @@ -671,10 +671,10 @@ jobs: if [[ "$GITHUB_REF_NAME" == "main" ]]; then STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}' NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}' - echo "::set-output name=include::[$STAGING, $NEON_STRESS]" + echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}' - echo "::set-output name=include::[$PRODUCTION]" + echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" exit 1 diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 6d39958bab..961d811a51 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -56,12 +56,12 @@ jobs: - name: Set pg 14 revision for caching id: pg_v14_rev - run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14) + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT shell: bash -euxo pipefail {0} - name: Set pg 15 revision for caching id: pg_v15_rev - run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15) + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT shell: bash -euxo pipefail {0} - name: Cache postgres v14 build From 771e61425ec0a0521f3c93c10783c41fbd3867d2 Mon Sep 17 00:00:00 2001 From: danieltprice <10074684+danieltprice@users.noreply.github.com> Date: Wed, 12 Oct 2022 08:38:28 -0300 Subject: [PATCH 0897/1022] Update release-pr.md (#2600) Update the Release Notes PR example that is referenced from the checklist. The Release Notes file structure changed recently. 
--- .github/PULL_REQUEST_TEMPLATE/release-pr.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md index 6f86114060..8fcc3bd4af 100644 --- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md +++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md @@ -10,7 +10,7 @@ ### Checklist after release -- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/120/files)) +- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/219/files)) - [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel - [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true) - [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some) From 6ace79345de41e4569856c7ad6978cb1bb1e1765 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 12 Oct 2022 21:00:44 +0300 Subject: [PATCH 0898/1022] [proxy] Add more context to console requests logging (#2583) --- proxy/src/auth/backend/console.rs | 68 ++++++++++++++----------------- proxy/src/auth/backend/link.rs | 7 ++-- proxy/src/http.rs | 11 +++++ proxy/src/proxy.rs | 8 ++-- proxy/src/stream.rs | 8 ++-- 5 files changed, 53 insertions(+), 49 deletions(-) diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index 7dbb173b88..cf99aa08ef 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -8,36 +8,20 @@ use crate::{ http, scram, stream::PqStream, }; +use futures::TryFutureExt; use serde::{Deserialize, Serialize}; use std::future::Future; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::info; +use tracing::{error, info, info_span}; const REQUEST_FAILED: &str = "Console request failed"; #[derive(Debug, Error)] -pub enum TransportError { - #[error("Console responded with a malformed JSON: {0}")] - BadResponse(#[from] serde_json::Error), +#[error("{}", REQUEST_FAILED)] +pub struct TransportError(#[from] std::io::Error); - /// HTTP status (other than 200) returned by the console. - #[error("Console responded with an HTTP status: {0}")] - HttpStatus(reqwest::StatusCode), - - #[error(transparent)] - Io(#[from] std::io::Error), -} - -impl UserFacingError for TransportError { - fn to_string_client(&self) -> String { - use TransportError::*; - match self { - HttpStatus(_) => self.to_string(), - _ => REQUEST_FAILED.to_owned(), - } - } -} +impl UserFacingError for TransportError {} // Helps eliminate graceless `.map_err` calls without introducing another ctor. 
impl From for TransportError { @@ -162,15 +146,19 @@ impl<'a> Api<'a> { ]) .build()?; - info!(id = request_id, url = req.url().as_str(), "request"); - let resp = self.endpoint.execute(req).await?; - if !resp.status().is_success() { - return Err(TransportError::HttpStatus(resp.status()).into()); - } + let span = info_span!("http", id = request_id, url = req.url().as_str()); + info!(parent: &span, "request auth info"); + let msg = self + .endpoint + .checked_execute(req) + .and_then(|r| r.json::()) + .await + .map_err(|e| { + error!(parent: &span, "{e}"); + e + })?; - let response: GetRoleSecretResponse = serde_json::from_str(&resp.text().await?)?; - - scram::ServerSecret::parse(&response.role_secret) + scram::ServerSecret::parse(&msg.role_secret) .map(AuthInfo::Scram) .ok_or(GetAuthInfoError::BadSecret) } @@ -189,17 +177,21 @@ impl<'a> Api<'a> { ]) .build()?; - info!(id = request_id, url = req.url().as_str(), "request"); - let resp = self.endpoint.execute(req).await?; - if !resp.status().is_success() { - return Err(TransportError::HttpStatus(resp.status()).into()); - } - - let response: GetWakeComputeResponse = serde_json::from_str(&resp.text().await?)?; + let span = info_span!("http", id = request_id, url = req.url().as_str()); + info!(parent: &span, "request wake-up"); + let msg = self + .endpoint + .checked_execute(req) + .and_then(|r| r.json::()) + .await + .map_err(|e| { + error!(parent: &span, "{e}"); + e + })?; // Unfortunately, ownership won't let us use `Option::ok_or` here. - let (host, port) = match parse_host_port(&response.address) { - None => return Err(WakeComputeError::BadComputeAddress(response.address)), + let (host, port) = match parse_host_port(&msg.address) { + None => return Err(WakeComputeError::BadComputeAddress(msg.address)), Some(x) => x, }; diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 863ed53645..c8ca418144 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -1,7 +1,7 @@ use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::info; +use tracing::{info, info_span}; use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; #[derive(Debug, Error)] @@ -51,11 +51,12 @@ pub async fn handle_user( client: &mut PqStream, ) -> auth::Result { let psql_session_id = new_psql_session_id(); + let span = info_span!("link", psql_session_id = &psql_session_id); let greeting = hello_message(link_uri, &psql_session_id); let db_info = super::with_waiter(psql_session_id, |waiter| async { // Give user a URL to spawn a new database. - info!("sending the auth URL to the user"); + info!(parent: &span, "sending the auth URL to the user"); client .write_message_noflush(&Be::AuthenticationOk)? .write_message_noflush(&BeParameterStatusMessage::encoding())? @@ -63,7 +64,7 @@ pub async fn handle_user( .await?; // Wait for web console response (see `mgmt`). 
- info!("waiting for console's reply..."); + info!(parent: &span, "waiting for console's reply..."); waiter.await?.map_err(LinkAuthError::AuthFailed) }) .await?; diff --git a/proxy/src/http.rs b/proxy/src/http.rs index dbeb3dc784..6f9145678b 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -17,6 +17,7 @@ impl Endpoint { Self { endpoint, client } } + #[inline(always)] pub fn url(&self) -> &ApiUrl { &self.endpoint } @@ -36,6 +37,16 @@ impl Endpoint { ) -> Result { self.client.execute(request).await } + + /// Execute a [request](reqwest::Request) and raise an error if status != 200. + pub async fn checked_execute( + &self, + request: reqwest::Request, + ) -> Result { + self.execute(request) + .await + .and_then(|r| r.error_for_status()) + } } #[cfg(test)] diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 5dcaa000cf..889445239a 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,7 +1,7 @@ use crate::auth; use crate::cancellation::{self, CancelMap}; use crate::config::{ProxyConfig, TlsConfig}; -use crate::stream::{MetricsStream, PqStream, Stream}; +use crate::stream::{MeasuredStream, PqStream, Stream}; use anyhow::{bail, Context}; use futures::TryFutureExt; use metrics::{register_int_counter, IntCounter}; @@ -64,7 +64,7 @@ pub async fn task_main( let cancel_map = Arc::new(CancelMap::default()); loop { let (socket, peer_addr) = listener.accept().await?; - info!("accepted connection from {peer_addr}"); + info!("accepted postgres client connection from {peer_addr}"); let session_id = uuid::Uuid::new_v4(); let cancel_map = Arc::clone(&cancel_map); @@ -270,8 +270,8 @@ impl Client<'_, S> { // Starting from here we only proxy the client's traffic. info!("performing the proxy pass..."); - let mut db = MetricsStream::new(db.stream, inc_proxied); - let mut client = MetricsStream::new(stream.into_inner(), inc_proxied); + let mut db = MeasuredStream::new(db.stream, inc_proxied); + let mut client = MeasuredStream::new(stream.into_inner(), inc_proxied); let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; Ok(()) diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 54ff8bcc07..2a224944e2 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -231,7 +231,7 @@ impl AsyncWrite for Stream { pin_project! { /// This stream tracks all writes and calls user provided /// callback when the underlying stream is flushed. - pub struct MetricsStream { + pub struct MeasuredStream { #[pin] stream: S, write_count: usize, @@ -239,7 +239,7 @@ pin_project! 
{ } } -impl MetricsStream { +impl MeasuredStream { pub fn new(stream: S, inc_write_count: W) -> Self { Self { stream, @@ -249,7 +249,7 @@ impl MetricsStream { } } -impl AsyncRead for MetricsStream { +impl AsyncRead for MeasuredStream { fn poll_read( self: Pin<&mut Self>, context: &mut task::Context<'_>, @@ -259,7 +259,7 @@ impl AsyncRead for MetricsStream { } } -impl AsyncWrite for MetricsStream { +impl AsyncWrite for MeasuredStream { fn poll_write( self: Pin<&mut Self>, context: &mut task::Context<'_>, From 09dda35dacba832f49f13d481033e6d3b1857951 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s?= Date: Wed, 12 Oct 2022 21:28:39 +0200 Subject: [PATCH 0899/1022] Return broken tenants due to non existing timelines dir (#2552) (#2575) Co-authored-by: andres --- pageserver/src/lib.rs | 58 -------- pageserver/src/storage_sync.rs | 34 +++-- pageserver/src/tenant_mgr.rs | 205 ++++++++++++++++------------ test_runner/regress/test_tenants.py | 19 ++- 4 files changed, 155 insertions(+), 161 deletions(-) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 7937f72de7..fe5114a247 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -119,32 +119,6 @@ impl TenantTimelineValues { fn new() -> Self { Self(HashMap::new()) } - - fn with_capacity(capacity: usize) -> Self { - Self(HashMap::with_capacity(capacity)) - } - - /// A convenience method to map certain values and omit some of them, if needed. - /// Tenants that won't have any timeline entries due to the filtering, will still be preserved - /// in the structure. - fn filter_map(self, map: F) -> TenantTimelineValues - where - F: Fn(T) -> Option, - { - let capacity = self.0.len(); - self.0.into_iter().fold( - TenantTimelineValues::::with_capacity(capacity), - |mut new_values, (tenant_id, old_values)| { - let new_timeline_values = new_values.0.entry(tenant_id).or_default(); - for (timeline_id, old_value) in old_values { - if let Some(new_value) = map(old_value) { - new_timeline_values.insert(timeline_id, new_value); - } - } - new_values - }, - ) - } } /// A suffix to be used during file sync from the remote storage, @@ -181,35 +155,3 @@ mod backoff_defaults_tests { ); } } - -#[cfg(test)] -mod tests { - use crate::tenant::harness::TIMELINE_ID; - - use super::*; - - #[test] - fn tenant_timeline_value_mapping() { - let first_tenant = TenantId::generate(); - let second_tenant = TenantId::generate(); - assert_ne!(first_tenant, second_tenant); - - let mut initial = TenantTimelineValues::new(); - initial - .0 - .entry(first_tenant) - .or_default() - .insert(TIMELINE_ID, "test_value"); - let _ = initial.0.entry(second_tenant).or_default(); - assert_eq!(initial.0.len(), 2, "Should have entries for both tenants"); - - let filtered = initial.filter_map(|_| None::<&str>).0; - assert_eq!( - filtered.len(), - 2, - "Should have entries for both tenants even after filtering away all entries" - ); - assert!(filtered.contains_key(&first_tenant)); - assert!(filtered.contains_key(&second_tenant)); - } -} diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index bee460d173..e8844baf5d 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -169,9 +169,14 @@ use self::{ upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, }; use crate::{ - config::PageServerConf, exponential_backoff, storage_sync::index::RemoteIndex, task_mgr, - task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, tenant::metadata::TimelineMetadata, - tenant_mgr::attach_local_tenants, + config::PageServerConf, + 
exponential_backoff, + storage_sync::index::RemoteIndex, + task_mgr, + task_mgr::TaskKind, + task_mgr::BACKGROUND_RUNTIME, + tenant::metadata::TimelineMetadata, + tenant_mgr::{attach_local_tenants, TenantAttachData}, }; use crate::{ metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD}, @@ -572,7 +577,10 @@ pub fn schedule_layer_download(tenant_id: TenantId, timeline_id: TimelineId) { /// See module docs for loop step description. pub fn spawn_storage_sync_task( conf: &'static PageServerConf, - local_timeline_files: TenantTimelineValues<(TimelineMetadata, HashSet)>, + local_timeline_files: HashMap< + TenantId, + HashMap)>, + >, storage: GenericRemoteStorage, max_concurrent_timelines_sync: NonZeroUsize, max_sync_errors: NonZeroU32, @@ -595,7 +603,7 @@ pub fn spawn_storage_sync_task( let mut keys_for_index_part_downloads = HashSet::new(); let mut timelines_to_sync = HashMap::new(); - for (tenant_id, timeline_data) in local_timeline_files.0 { + for (tenant_id, timeline_data) in local_timeline_files { if timeline_data.is_empty() { info!("got empty tenant {}", tenant_id); let _ = empty_tenants.0.entry(tenant_id).or_default(); @@ -698,7 +706,7 @@ async fn storage_sync_loop( "Sync loop step completed, {} new tenant state update(s)", updated_tenants.len() ); - let mut timelines_to_attach = TenantTimelineValues::new(); + let mut timelines_to_attach = HashMap::new(); let index_accessor = index.read().await; for tenant_id in updated_tenants { let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { @@ -724,12 +732,16 @@ async fn storage_sync_loop( // and register them all at once in a tenant for download // to be submitted in a single operation to tenant // so it can apply them at once to internal timeline map. - timelines_to_attach.0.insert( + timelines_to_attach.insert( tenant_id, - tenant_entry - .iter() - .map(|(&id, entry)| (id, entry.metadata.clone())) - .collect(), + TenantAttachData::Ready( + tenant_entry + .iter() + .map(|(&id, entry)| { + (id, (entry.metadata.clone(), HashSet::new())) + }) + .collect(), + ), ); } } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 0e8ee8c067..c6698ee22f 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -22,7 +22,7 @@ use crate::tenant::{ }; use crate::tenant_config::TenantConfOpt; use crate::walredo::PostgresRedoManager; -use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; +use crate::TEMP_FILE_SUFFIX; use utils::crashsafe_dir::{self, path_with_suffix_extension}; use utils::id::{TenantId, TimelineId}; @@ -70,34 +70,54 @@ pub fn init_tenant_mgr( .remote_storage_config .as_ref() .expect("remote storage without config"); - + let mut broken_tenants = HashMap::new(); + let mut ready_tenants = HashMap::new(); + for (tenant_id, tenant_attach_data) in local_tenant_files.into_iter() { + match tenant_attach_data { + TenantAttachData::Ready(t) => { + ready_tenants.insert(tenant_id, t); + } + TenantAttachData::Broken(e) => { + broken_tenants.insert(tenant_id, TenantAttachData::Broken(e)); + } + } + } let SyncStartupData { remote_index, local_timeline_init_statuses, } = storage_sync::spawn_storage_sync_task( conf, - local_tenant_files, + ready_tenants, storage, storage_config.max_concurrent_syncs, storage_config.max_sync_errors, ) .context("Failed to spawn the storage sync thread")?; - ( - remote_index, - local_timeline_init_statuses.filter_map(|init_status| match init_status { - LocalTimelineInitStatus::LocallyComplete(metadata) => Some(metadata), - LocalTimelineInitStatus::NeedsSync 
=> None, - }), - ) + let n = local_timeline_init_statuses.0.len(); + let mut synced_timelines = local_timeline_init_statuses.0.into_iter().fold( + HashMap::::with_capacity(n), + |mut new_values, (tenant_id, old_values)| { + let new_timeline_values = new_values + .entry(tenant_id) + .or_insert_with(|| TenantAttachData::Ready(HashMap::new())); + if let TenantAttachData::Ready(t) = new_timeline_values { + for (timeline_id, old_value) in old_values { + if let LocalTimelineInitStatus::LocallyComplete(metadata) = old_value { + t.insert(timeline_id, (metadata, HashSet::new())); + } + } + } + new_values + }, + ); + synced_timelines.extend(broken_tenants); + + (remote_index, synced_timelines) } else { info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); - ( - RemoteIndex::default(), - local_tenant_files.filter_map(|(metadata, _)| Some(metadata)), - ) + (RemoteIndex::default(), local_tenant_files) }; - attach_local_tenants(conf, &remote_index, tenants_to_attach); Ok(remote_index) @@ -117,18 +137,12 @@ pub fn init_tenant_mgr( pub fn attach_local_tenants( conf: &'static PageServerConf, remote_index: &RemoteIndex, - tenants_to_attach: TenantTimelineValues, + tenants_to_attach: HashMap, ) { let _entered = info_span!("attach_local_tenants").entered(); - let number_of_tenants = tenants_to_attach.0.len(); - - for (tenant_id, local_timelines) in tenants_to_attach.0 { - info!( - "Attaching {} timelines for {tenant_id}", - local_timelines.len() - ); - debug!("Timelines to attach: {local_timelines:?}"); + let number_of_tenants = tenants_to_attach.len(); + for (tenant_id, local_timelines) in tenants_to_attach { let mut tenants_accessor = tenants_state::write_tenants(); let tenant = match tenants_accessor.entry(tenant_id) { hash_map::Entry::Occupied(o) => { @@ -137,25 +151,55 @@ pub fn attach_local_tenants( } hash_map::Entry::Vacant(v) => { info!("Tenant {tenant_id} was not found in pageserver's memory, loading it"); - let tenant = load_local_tenant(conf, tenant_id, remote_index); + let tenant = Arc::new(Tenant::new( + conf, + TenantConfOpt::default(), + Arc::new(PostgresRedoManager::new(conf, tenant_id)), + tenant_id, + remote_index.clone(), + conf.remote_storage_config.is_some(), + )); + match local_timelines { + TenantAttachData::Broken(_) => { + tenant.set_state(TenantState::Broken); + } + TenantAttachData::Ready(_) => { + match Tenant::load_tenant_config(conf, tenant_id) { + Ok(tenant_conf) => { + tenant.update_tenant_config(tenant_conf); + tenant.activate(false); + } + Err(e) => { + error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}"); + tenant.set_state(TenantState::Broken); + } + }; + } + } v.insert(Arc::clone(&tenant)); tenant } }; drop(tenants_accessor); - - if tenant.current_state() == TenantState::Broken { - warn!("Skipping timeline load for broken tenant {tenant_id}") - } else { - let has_timelines = !local_timelines.is_empty(); - match tenant.init_attach_timelines(local_timelines) { - Ok(()) => { - info!("successfully loaded local timelines for tenant {tenant_id}"); - tenant.activate(has_timelines); - } - Err(e) => { - error!("Failed to attach tenant timelines: {e:?}"); - tenant.set_state(TenantState::Broken); + match local_timelines { + TenantAttachData::Broken(e) => warn!("{}", e), + TenantAttachData::Ready(ref timelines) => { + info!("Attaching {} timelines for {tenant_id}", timelines.len()); + debug!("Timelines to attach: {local_timelines:?}"); + let has_timelines = !timelines.is_empty(); + 
let timelines_to_attach = timelines + .iter() + .map(|(&k, (v, _))| (k, v.clone())) + .collect(); + match tenant.init_attach_timelines(timelines_to_attach) { + Ok(()) => { + info!("successfully loaded local timelines for tenant {tenant_id}"); + tenant.activate(has_timelines); + } + Err(e) => { + error!("Failed to attach tenant timelines: {e:?}"); + tenant.set_state(TenantState::Broken); + } } } } @@ -164,44 +208,6 @@ pub fn attach_local_tenants( info!("Processed {number_of_tenants} local tenants during attach") } -fn load_local_tenant( - conf: &'static PageServerConf, - tenant_id: TenantId, - remote_index: &RemoteIndex, -) -> Arc { - let tenant = Arc::new(Tenant::new( - conf, - TenantConfOpt::default(), - Arc::new(PostgresRedoManager::new(conf, tenant_id)), - tenant_id, - remote_index.clone(), - conf.remote_storage_config.is_some(), - )); - - let tenant_timelines_dir = conf.timelines_path(&tenant_id); - if !tenant_timelines_dir.is_dir() { - error!( - "Tenant {} has no timelines directory at {}", - tenant_id, - tenant_timelines_dir.display() - ); - tenant.set_state(TenantState::Broken); - } else { - match Tenant::load_tenant_config(conf, tenant_id) { - Ok(tenant_conf) => { - tenant.update_tenant_config(tenant_conf); - tenant.activate(false); - } - Err(e) => { - error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}"); - tenant.set_state(TenantState::Broken); - } - } - } - - tenant -} - /// /// Shut down all tenants. This runs as part of pageserver shutdown. /// @@ -475,16 +481,21 @@ pub fn list_tenant_info(remote_index: &RemoteTimelineIndex) -> Vec { .collect() } +#[derive(Debug)] +pub enum TenantAttachData { + Ready(HashMap)>), + Broken(anyhow::Error), +} /// Attempts to collect information about all tenant and timelines, existing on the local FS. /// If finds any, deletes all temporary files and directories, created before. Also removes empty directories, /// that may appear due to such removals. /// Does not fail on particular timeline or tenant collection errors, rather logging them and ignoring the entities. fn local_tenant_timeline_files( config: &'static PageServerConf, -) -> anyhow::Result)>> { +) -> anyhow::Result> { let _entered = info_span!("local_tenant_timeline_files").entered(); - let mut local_tenant_timeline_files = TenantTimelineValues::new(); + let mut local_tenant_timeline_files = HashMap::new(); let tenants_dir = config.tenants_path(); for tenants_dir_entry in fs::read_dir(&tenants_dir) .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? 
@@ -506,19 +517,31 @@ fn local_tenant_timeline_files( } } else { match collect_timelines_for_tenant(config, &tenant_dir_path) { - Ok((tenant_id, collected_files)) => { + Ok((tenant_id, TenantAttachData::Broken(e))) => { + local_tenant_timeline_files.entry(tenant_id).or_insert(TenantAttachData::Broken(e)); + }, + Ok((tenant_id, TenantAttachData::Ready(collected_files))) => { if collected_files.is_empty() { match remove_if_empty(&tenant_dir_path) { Ok(true) => info!("Removed empty tenant directory {}", tenant_dir_path.display()), Ok(false) => { // insert empty timeline entry: it has some non-temporary files inside that we cannot remove // so make obvious for HTTP API callers, that something exists there and try to load the tenant - let _ = local_tenant_timeline_files.0.entry(tenant_id).or_default(); + let _ = local_tenant_timeline_files.entry(tenant_id).or_insert_with(|| TenantAttachData::Ready(HashMap::new())); }, Err(e) => error!("Failed to remove empty tenant directory: {e:?}"), } } else { - local_tenant_timeline_files.0.entry(tenant_id).or_default().extend(collected_files.into_iter()) + match local_tenant_timeline_files.entry(tenant_id) { + hash_map::Entry::Vacant(entry) => { + entry.insert(TenantAttachData::Ready(collected_files)); + } + hash_map::Entry::Occupied(entry) =>{ + if let TenantAttachData::Ready(old_timelines) = entry.into_mut() { + old_timelines.extend(collected_files); + } + }, + } } }, Err(e) => error!( @@ -541,7 +564,7 @@ fn local_tenant_timeline_files( info!( "Collected files for {} tenants", - local_tenant_timeline_files.0.len() + local_tenant_timeline_files.len(), ); Ok(local_tenant_timeline_files) } @@ -583,10 +606,7 @@ fn is_temporary(path: &Path) -> bool { fn collect_timelines_for_tenant( config: &'static PageServerConf, tenant_path: &Path, -) -> anyhow::Result<( - TenantId, - HashMap)>, -)> { +) -> anyhow::Result<(TenantId, TenantAttachData)> { let tenant_id = tenant_path .file_name() .and_then(OsStr::to_str) @@ -595,6 +615,17 @@ fn collect_timelines_for_tenant( .context("Could not parse tenant id out of the tenant dir name")?; let timelines_dir = config.timelines_path(&tenant_id); + if !timelines_dir.as_path().is_dir() { + return Ok(( + tenant_id, + TenantAttachData::Broken(anyhow::anyhow!( + "Tenant {} has no timelines directory at {}", + tenant_id, + timelines_dir.display() + )), + )); + } + let mut tenant_timelines = HashMap::new(); for timelines_dir_entry in fs::read_dir(&timelines_dir) .with_context(|| format!("Failed to list timelines dir entry for tenant {tenant_id}"))? 
@@ -652,7 +683,7 @@ fn collect_timelines_for_tenant( debug!("Tenant {tenant_id} has no timelines loaded"); } - Ok((tenant_id, tenant_timelines)) + Ok((tenant_id, TenantAttachData::Ready(tenant_timelines))) } // discover timeline files and extract timeline metadata diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index f49b6fccb9..37c5a130e2 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -258,11 +258,20 @@ def test_pageserver_with_empty_tenants( tenants = client.tenant_list() assert ( - len(tenants) == 1 - ), "Pageserver should attach only tenants with empty timelines/ dir on restart" - loaded_tenant = tenants[0] - assert loaded_tenant["id"] == str( - tenant_with_empty_timelines_dir + len(tenants) == 2 + ), "Pageserver should attach only tenants with empty or not existing timelines/ dir on restart" + + [broken_tenant] = [t for t in tenants if t["id"] == str(tenant_without_timelines_dir)] + assert ( + broken_tenant + ), f"A broken tenant {tenant_without_timelines_dir} should exists in the tenant list" + assert ( + broken_tenant["state"] == "Broken" + ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" + + [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)] + assert ( + loaded_tenant ), f"Tenant {tenant_with_empty_timelines_dir} should be loaded as the only one with tenants/ directory" assert loaded_tenant["state"] == { "Active": {"background_jobs_running": False} From ebf54b0de0b42d3b5e1460b1241025297cc8d0ee Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 13 Oct 2022 10:00:29 +0100 Subject: [PATCH 0900/1022] Nightly Benchmarks: Add 50 GB projects (#2612) --- .github/workflows/benchmarking.yml | 38 +++++++++++++++++------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index dee5968ef3..6091c8d7ff 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -138,22 +138,31 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-compare: - env: - TEST_PG_BENCH_DURATIONS_MATRIX: "60m" - TEST_PG_BENCH_SCALES_MATRIX: "10gb" - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 - TEST_OUTPUT: /tmp/test_output - BUILD_TYPE: remote - SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} - strategy: fail-fast: false matrix: # neon-captest-new: Run pgbench in a freshly created project # neon-captest-reuse: Same, but reusing existing project # neon-captest-prefetch: Same, with prefetching enabled (new project) - platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-aurora ] + platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch ] + db_size: [ 10gb ] + include: + - platform: neon-captest-new + db_size: 50gb + - platform: neon-captest-prefetch + db_size: 50gb + - platform: rds-aurora + db_size: 50gb + + env: + TEST_PG_BENCH_DURATIONS_MATRIX: "60m" + TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }} + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 14 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} + PLATFORM: ${{ matrix.platform }} runs-on: dev container: @@ -178,7 +187,7 @@ jobs: echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - name: Create Neon Project - 
if: matrix.platform != 'neon-captest-reuse' + if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -207,8 +216,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT psql ${CONNSTR} -c "SELECT version();" - env: - PLATFORM: ${{ matrix.platform }} - name: Set database options if: matrix.platform == 'neon-captest-prefetch' @@ -227,7 +234,6 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init env: - PLATFORM: ${{ matrix.platform }} BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -241,7 +247,6 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update env: - PLATFORM: ${{ matrix.platform }} BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -255,7 +260,6 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only env: - PLATFORM: ${{ matrix.platform }} BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -268,7 +272,7 @@ jobs: build_type: ${{ env.BUILD_TYPE }} - name: Delete Neon Project - if: ${{ matrix.platform != 'neon-captest-reuse' && always() }} + if: ${{ steps.create-neon-project.outputs.project_id && always() }} uses: ./.github/actions/neon-project-delete with: environment: dev From 14c623b254457f1dd4e1b0a64782cf5f568176bf Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 13 Oct 2022 13:54:15 +0300 Subject: [PATCH 0901/1022] Make it possible to build with old cargo version. I'm using the Rust compiler and cargo versions from Debian packages, but the latest available cargo Debian package is quite old, version 1.57. The 'named-profiles' features was not stabilized at that version yet, so ever since commit a463749f5, I've had to manually add this line to the Cargo.toml file to compile. I've been wishing that someone would update the cargo Debian package, but it doesn't seem to be happening any time soon. This doesn't seem to bother anyone else but me, but it shouldn't hurt anyone else either. If there was a good reason, I could install a newer cargo version with 'rustup', but if all we need is this one line in Cargo.toml, I'd prefer to continue using the Debian packages. --- Cargo.toml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index bc2a705558..32c243bf44 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,14 @@ +# 'named-profiles' feature was stabilized in cargo 1.57. This line makes the +# build work with older cargo versions. +# +# We have this because as of this writing, the latest cargo Debian package +# that's available is 1.56. (Confusingly, the Debian package version number +# is 0.57, whereas 'cargo --version' says 1.56.) +# +# See https://tracker.debian.org/pkg/cargo for the current status of the +# package. When that gets updated, we can remove this. 
+cargo-features = ["named-profiles"] + [workspace] members = [ "compute_tools", From 9fe4548e13774d7f1e5f9b5d23e57da971419442 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 12 Oct 2022 18:14:02 +0400 Subject: [PATCH 0902/1022] Reimplement explicit timeline creation on safekeepers. With the ability to pass commit_lsn. This allows to perform project WAL recovery through different (from the original) set of safekeepers (or under different ttid) by 1) moving WAL files to s3 under proper ttid; 2) explicitly creating timeline on safekeepers, setting commit_lsn to the latest point; 3) putting the lastest .parital file to the timeline directory on safekeepers, if desired. Extend test_s3_wal_replay to exersise this behaviour. Also extends timeline_status endpoint to return postgres information. --- control_plane/src/safekeeper.rs | 27 +--- libs/safekeeper_api/src/models.rs | 17 ++- libs/utils/src/lsn.rs | 5 + pgxn/neon/walproposer.c | 6 - safekeeper/src/http/routes.rs | 28 ++++- safekeeper/src/json_ctrl.rs | 2 + safekeeper/src/receive_wal.rs | 3 +- safekeeper/src/safekeeper.rs | 15 ++- safekeeper/src/timeline.rs | 15 ++- safekeeper/src/timelines_global_map.rs | 13 +- test_runner/fixtures/neon_fixtures.py | 14 +++ test_runner/regress/test_wal_acceptor.py | 152 +++++++++++++++-------- 12 files changed, 195 insertions(+), 102 deletions(-) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 34b2f3000a..64a89124d2 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -12,13 +12,8 @@ use nix::unistd::Pid; use postgres::Config; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; -use safekeeper_api::models::TimelineCreateRequest; use thiserror::Error; -use utils::{ - connstring::connection_address, - http::error::HttpErrorBody, - id::{NodeId, TenantId, TimelineId}, -}; +use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId}; use crate::local_env::{LocalEnv, SafekeeperConf}; use crate::storage::PageServerNode; @@ -281,24 +276,4 @@ impl SafekeeperNode { .error_from_body()?; Ok(()) } - - pub fn timeline_create( - &self, - tenant_id: TenantId, - timeline_id: TimelineId, - peer_ids: Vec, - ) -> Result<()> { - Ok(self - .http_request( - Method::POST, - format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), - ) - .json(&TimelineCreateRequest { - timeline_id, - peer_ids, - }) - .send()? - .error_from_body()? - .json()?) - } } diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index e13ea50eaf..4119650b99 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -1,8 +1,21 @@ use serde::{Deserialize, Serialize}; -use utils::id::{NodeId, TimelineId}; +use utils::{ + id::{NodeId, TenantId, TimelineId}, + lsn::Lsn, +}; #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { + #[serde(with = "serde_with::rust::display_fromstr")] + pub tenant_id: TenantId, + #[serde(with = "serde_with::rust::display_fromstr")] pub timeline_id: TimelineId, - pub peer_ids: Vec, + pub peer_ids: Option>, + pub pg_version: u32, + pub system_id: Option, + pub wal_seg_size: Option, + #[serde(with = "serde_with::rust::display_fromstr")] + pub commit_lsn: Lsn, + // If not passed, it is assigned to the beginning of commit_lsn segment. 
+ pub local_start_lsn: Option, } diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 1090f4c679..289cec12a8 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -66,6 +66,11 @@ impl Lsn { (self.0 % seg_sz as u64) as usize } + /// Compute LSN of the segment start. + pub fn segment_lsn(self, seg_sz: usize) -> Lsn { + Lsn(self.0 - (self.0 % seg_sz as u64)) + } + /// Compute the segment number pub fn segment_number(self, seg_sz: usize) -> u64 { self.0 / seg_sz as u64 diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index fc0b660a64..ff37be2de1 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1471,12 +1471,6 @@ SendProposerElected(Safekeeper *sk) */ th = &sk->voteResponse.termHistory; - /* - * If any WAL is present on the sk, it must be authorized by some term. - * OTOH, without any WAL there are no term swiches in the log. - */ - Assert((th->n_entries == 0) == - (sk->voteResponse.flushLsn == InvalidXLogRecPtr)); /* We must start somewhere. */ Assert(propTermHistory.n_entries >= 1); diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 43c0a17f84..6efd09c7e2 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,8 +1,8 @@ -use anyhow::anyhow; use hyper::{Body, Request, Response, StatusCode, Uri}; use anyhow::Context; use once_cell::sync::Lazy; +use postgres_ffi::WAL_SEGMENT_SIZE; use serde::Serialize; use serde::Serializer; use std::collections::{HashMap, HashSet}; @@ -10,6 +10,7 @@ use std::fmt::Display; use std::sync::Arc; use tokio::task::JoinError; +use crate::safekeeper::ServerInfo; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; @@ -77,6 +78,7 @@ struct TimelineStatus { #[serde(serialize_with = "display_serialize")] timeline_id: TimelineId, acceptor_state: AcceptorStateStatus, + pg_info: ServerInfo, #[serde(serialize_with = "display_serialize")] flush_lsn: Lsn, #[serde(serialize_with = "display_serialize")] @@ -121,6 +123,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result Result ReceiveWalConn<'pg> { system_id: greeting.system_id, wal_seg_size: greeting.wal_seg_size, }; - GlobalTimelines::create(spg.ttid, server_info)? + GlobalTimelines::create(spg.ttid, server_info, Lsn::INVALID, Lsn::INVALID)? 
} _ => bail!("unexpected message {:?} instead of greeting", next_msg), }; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 7869aa8b3a..7b11aaf92a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -222,6 +222,8 @@ impl SafeKeeperState { ttid: &TenantTimelineId, server_info: ServerInfo, peers: Vec, + commit_lsn: Lsn, + local_start_lsn: Lsn, ) -> SafeKeeperState { SafeKeeperState { tenant_id: ttid.tenant_id, @@ -233,10 +235,10 @@ impl SafeKeeperState { server: server_info, proposer_uuid: [0; 16], timeline_start_lsn: Lsn(0), - local_start_lsn: Lsn(0), - commit_lsn: Lsn(0), - backup_lsn: Lsn::INVALID, - peer_horizon_lsn: Lsn(0), + local_start_lsn, + commit_lsn, + backup_lsn: local_start_lsn, + peer_horizon_lsn: local_start_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(peers.iter().map(|p| (*p, PeerInfo::new())).collect()), } @@ -252,6 +254,8 @@ impl SafeKeeperState { wal_seg_size: 0, }, vec![], + Lsn::INVALID, + Lsn::INVALID, ) } } @@ -740,7 +744,8 @@ where "setting timeline_start_lsn to {:?}", state.timeline_start_lsn ); - + } + if state.local_start_lsn == Lsn(0) { state.local_start_lsn = msg.start_streaming_at; info!("setting local_start_lsn to {:?}", state.local_start_lsn); } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index dc7503af65..3fb77bf582 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -107,6 +107,14 @@ impl SharedState { bail!(TimelineError::UninitialinzedPgVersion(*ttid)); } + if state.commit_lsn < state.local_start_lsn { + bail!( + "commit_lsn {} is higher than local_start_lsn {}", + state.commit_lsn, + state.local_start_lsn + ); + } + // We don't want to write anything to disk, because we may have existing timeline there. // These functions should not change anything on disk. let control_store = control_file::FileStorage::create_new(ttid, conf, state)?; @@ -286,7 +294,7 @@ pub struct Timeline { /// Sending here asks for wal backup launcher attention (start/stop /// offloading). Sending ttid instead of concrete command allows to do /// sending without timeline lock. - wal_backup_launcher_tx: Sender, + pub wal_backup_launcher_tx: Sender, /// Used to broadcast commit_lsn updates to all background jobs. commit_lsn_watch_tx: watch::Sender, @@ -339,10 +347,12 @@ impl Timeline { ttid: TenantTimelineId, wal_backup_launcher_tx: Sender, server_info: ServerInfo, + commit_lsn: Lsn, + local_start_lsn: Lsn, ) -> Result { let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID); let (cancellation_tx, cancellation_rx) = watch::channel(false); - let state = SafeKeeperState::new(&ttid, server_info, vec![]); + let state = SafeKeeperState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); Ok(Timeline { ttid, @@ -381,6 +391,7 @@ impl Timeline { match || -> Result<()> { shared_state.sk.persist()?; // TODO: add more initialization steps here + shared_state.update_status(self.ttid); Ok(()) }() { Ok(_) => Ok(()), diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index cf99a243d7..a5d373a1da 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -15,6 +15,7 @@ use std::sync::{Arc, Mutex, MutexGuard}; use tokio::sync::mpsc::Sender; use tracing::*; use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; struct GlobalTimelinesState { timelines: HashMap>, @@ -153,7 +154,12 @@ impl GlobalTimelines { /// Create a new timeline with the given id. 
If the timeline already exists, returns /// an existing timeline. - pub fn create(ttid: TenantTimelineId, server_info: ServerInfo) -> Result> { + pub fn create( + ttid: TenantTimelineId, + server_info: ServerInfo, + commit_lsn: Lsn, + local_start_lsn: Lsn, + ) -> Result> { let (conf, wal_backup_launcher_tx) = { let state = TIMELINES_STATE.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { @@ -170,6 +176,8 @@ impl GlobalTimelines { ttid, wal_backup_launcher_tx, server_info, + commit_lsn, + local_start_lsn, )?); // Take a lock and finish the initialization holding this mutex. No other threads @@ -190,6 +198,9 @@ impl GlobalTimelines { Ok(_) => { // We are done with bootstrap, release the lock, return the timeline. drop(shared_state); + timeline + .wal_backup_launcher_tx + .blocking_send(timeline.ttid)?; Ok(timeline) } Err(e) => { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5df0f5cc50..0d6b6f4cd7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2339,6 +2339,7 @@ class Safekeeper: @dataclass class SafekeeperTimelineStatus: acceptor_epoch: int + pg_version: int flush_lsn: Lsn timeline_start_lsn: Lsn backup_lsn: Lsn @@ -2367,6 +2368,18 @@ class SafekeeperHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + def timeline_create( + self, tenant_id: TenantId, timeline_id: TimelineId, pg_version: int, commit_lsn: Lsn + ): + body = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "pg_version": pg_version, + "commit_lsn": str(commit_lsn), + } + res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body) + res.raise_for_status() + def timeline_status( self, tenant_id: TenantId, timeline_id: TimelineId ) -> SafekeeperTimelineStatus: @@ -2375,6 +2388,7 @@ class SafekeeperHttpClient(requests.Session): resj = res.json() return SafekeeperTimelineStatus( acceptor_epoch=resj["acceptor_state"]["epoch"], + pg_version=resj["pg_info"]["pg_version"], flush_lsn=Lsn(resj["flush_lsn"]), timeline_start_lsn=Lsn(resj["timeline_start_lsn"]), backup_lsn=Lsn(resj["backup_lsn"]), diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 1f9a0157fc..9c8e66e0e2 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1,6 +1,7 @@ import os import pathlib import random +import shutil import signal import subprocess import sys @@ -8,6 +9,7 @@ import threading import time from contextlib import closing from dataclasses import dataclass, field +from functools import partial from pathlib import Path from typing import Any, List, Optional @@ -371,51 +373,48 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): ) # wait till first segment is removed on all safekeepers + wait( + lambda first_segments=first_segments: all(not os.path.exists(p) for p in first_segments), + "first segment get removed", + ) + + +# Wait for something, defined as f() returning True, raising error if this +# doesn't happen without timeout seconds. 
+def wait(f, desc, timeout=30): started_at = time.time() while True: - if all(not os.path.exists(p) for p in first_segments): + if f(): break elapsed = time.time() - started_at - if elapsed > 20: - raise RuntimeError(f"timed out waiting {elapsed:.0f}s for first segment get removed") + if elapsed > timeout: + raise RuntimeError(f"timed out waiting {elapsed:.0f}s for {desc}") time.sleep(0.5) -def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end: Lsn): - started_at = time.time() - http_cli = live_sk.http_client() - while True: - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"live sk status is {tli_status}") - - if tli_status.backup_lsn >= seg_end: - break - elapsed = time.time() - started_at - if elapsed > 30: - raise RuntimeError( - f"timed out waiting {elapsed:.0f}s for segment ending at {seg_end} get offloaded" - ) - time.sleep(0.5) - - -def wait_wal_trim(tenant_id, timeline_id, sk, target_size_mb): - started_at = time.time() +def is_segment_offloaded( + sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, seg_end: Lsn +): http_cli = sk.http_client() - while True: - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - sk_wal_size = get_dir_size(os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id))) - sk_wal_size_mb = sk_wal_size / 1024 / 1024 - log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"sk status is {tli_status}") + return tli_status.backup_lsn >= seg_end - if sk_wal_size_mb <= target_size_mb: - break - elapsed = time.time() - started_at - if elapsed > 20: - raise RuntimeError( - f"timed out waiting {elapsed:.0f}s for sk_id={sk.id} to trim WAL to {target_size_mb:.2f}MB, current size is {sk_wal_size_mb:.2f}MB" - ) - time.sleep(0.5) +def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"sk status is {tli_status}") + return tli_status.flush_lsn >= lsn + + +def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + sk_wal_size = get_dir_size(os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id))) + sk_wal_size_mb = sk_wal_size / 1024 / 1024 + log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") + return sk_wal_size_mb <= target_size_mb @pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) @@ -451,7 +450,10 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot cur.execute("insert into t select generate_series(1,250000), 'payload'") live_sk = [sk for sk in env.safekeepers if sk != victim][0] - wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end) + wait( + partial(is_segment_offloaded, live_sk, tenant_id, timeline_id, seg_end), + f"segment ending at {seg_end} get offloaded", + ) victim.start() @@ -463,7 +465,11 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute("insert into t select generate_series(1,250000), 'payload'") - wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], Lsn("0/5000000")) + seg_end = Lsn("0/5000000") + wait( + partial(is_segment_offloaded, env.safekeepers[1], tenant_id, timeline_id, 
seg_end), + f"segment ending at {seg_end} get offloaded", + ) @pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) @@ -494,38 +500,72 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re cur.execute("insert into t values (1, 'payload')") expected_sum += 1 - offloaded_seg_end = [Lsn("0/3000000")] - for seg_end in offloaded_seg_end: - # roughly fills two segments - cur.execute("insert into t select generate_series(1,500000), 'payload'") - expected_sum += 500000 * 500001 // 2 + offloaded_seg_end = Lsn("0/3000000") + # roughly fills two segments + cur.execute("insert into t select generate_series(1,500000), 'payload'") + expected_sum += 500000 * 500001 // 2 - assert query_scalar(cur, "select sum(key) from t") == expected_sum + assert query_scalar(cur, "select sum(key) from t") == expected_sum - for sk in env.safekeepers: - wait_segment_offload(tenant_id, timeline_id, sk, seg_end) + for sk in env.safekeepers: + wait( + partial(is_segment_offloaded, sk, tenant_id, timeline_id, offloaded_seg_end), + f"segment ending at {offloaded_seg_end} get offloaded", + ) # advance remote_consistent_lsn to trigger WAL trimming # this LSN should be less than commit_lsn, so timeline will be active=true in safekeepers, to push etcd updates env.safekeepers[0].http_client().record_safekeeper_info( - tenant_id, timeline_id, {"remote_consistent_lsn": str(offloaded_seg_end[-1])} + tenant_id, timeline_id, {"remote_consistent_lsn": str(offloaded_seg_end)} ) + last_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + for sk in env.safekeepers: # require WAL to be trimmed, so no more than one segment is left on disk - wait_wal_trim(tenant_id, timeline_id, sk, 16 * 1.5) - - last_lsn = query_scalar(cur, "SELECT pg_current_wal_flush_lsn()") + target_size_mb = 16 * 1.5 + wait( + partial(is_wal_trimmed, sk, tenant_id, timeline_id, target_size_mb), + f"sk_id={sk.id} to trim WAL to {target_size_mb:.2f}MB", + ) + # wait till everyone puts data up to last_lsn on disk, we are + # going to recreate state on safekeepers claiming they have data till last_lsn. + wait( + partial(is_flush_lsn_caught_up, sk, tenant_id, timeline_id, last_lsn), + f"sk_id={sk.id} to flush {last_lsn}", + ) ps_cli = env.pageserver.http_client() - pageserver_lsn = ps_cli.timeline_detail(tenant_id, timeline_id)["local"]["last_record_lsn"] - lag = Lsn(last_lsn) - Lsn(pageserver_lsn) + pageserver_lsn = Lsn(ps_cli.timeline_detail(tenant_id, timeline_id)["local"]["last_record_lsn"]) + lag = last_lsn - pageserver_lsn log.info( f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb" ) pg.stop_and_destroy() + # Also delete and manually create timeline on safekeepers -- this tests + # scenario of manual recovery on different set of safekeepers. 
+ + # save the last (partial) file to put it back after recreation; others will be fetched from s3 + sk = env.safekeepers[0] + tli_dir = Path(sk.data_dir()) / str(tenant_id) / str(timeline_id) + f_partial = Path([f for f in os.listdir(tli_dir) if f.endswith(".partial")][0]) + f_partial_path = tli_dir / f_partial + f_partial_saved = Path(sk.data_dir()) / f_partial.name + f_partial_path.rename(f_partial_saved) + + pg_version = sk.http_client().timeline_status(tenant_id, timeline_id).pg_version + + for sk in env.safekeepers: + cli = sk.http_client() + cli.timeline_delete_force(tenant_id, timeline_id) + cli.timeline_create(tenant_id, timeline_id, pg_version, last_lsn) + f_partial_path = ( + Path(sk.data_dir()) / str(tenant_id) / str(timeline_id) / f_partial_saved.name + ) + shutil.copy(f_partial_saved, f_partial_path) + # recreate timeline on pageserver from scratch ps_cli.timeline_delete(tenant_id, timeline_id) ps_cli.timeline_create(tenant_id, timeline_id) @@ -539,10 +579,12 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re if elapsed > wait_lsn_timeout: raise RuntimeError("Timed out waiting for WAL redo") - pageserver_lsn = env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[ - "local" - ]["last_record_lsn"] - lag = Lsn(last_lsn) - Lsn(pageserver_lsn) + pageserver_lsn = Lsn( + env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["local"][ + "last_record_lsn" + ] + ) + lag = last_lsn - pageserver_lsn if time.time() > last_debug_print + 10 or lag <= 0: last_debug_print = time.time() From a13b48694362df10546434c94b843165139a786f Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 14 Oct 2022 10:37:30 +0300 Subject: [PATCH 0903/1022] Bump vendor/postgres-v15. Rebase to 15.0 --- vendor/postgres-v15 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index ff18cec1ee..339f2d642d 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit ff18cec1ee9b80055accd9c76b040875329b11ed +Subproject commit 339f2d642d7d430c44839f8293ae271f90e3cb81 From ee64a6b80b1413612630161f0500bdfe44a52db5 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 14 Oct 2022 17:11:14 +0300 Subject: [PATCH 0904/1022] Fix CI: push versioned compute images to production ECR --- .github/workflows/build_and_test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2c6aa02b22..69b17113ed 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -622,6 +622,8 @@ jobs: crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest - name: Configure Docker Hub login run: | From 500239176ccf515253d896ad00bfe3f202e84134 Mon Sep 17 00:00:00 2001 From: 
Heikki Linnakangas Date: Fri, 14 Oct 2022 17:31:36 +0300 Subject: [PATCH 0905/1022] Make TimelineInfo.local field mandatory. It was only None when you queried the status of a timeline with 'timeline_detail' mgmt API call, and it was still being downloaded. You can check for that status with the 'tenant_status' API call instead, checking for has_in_progress_downloads field. Anothere case was if an error happened while trying to get the current logical size, in a 'timeline_detail' request. It might make sense to tolerate such errors, and leave the fields we cannot fill in as empty, None, 0 or similar, but it doesn't make sense to me to leave the whole 'local' struct empty in tht case. --- control_plane/src/bin/neon_local.rs | 41 +++++---------- libs/pageserver_api/src/models.rs | 2 +- pageserver/src/http/routes.rs | 60 +++++++++------------- test_runner/regress/test_remote_storage.py | 7 ++- 4 files changed, 39 insertions(+), 71 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 0c26842b34..244ae5cfd0 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -358,9 +358,7 @@ fn print_timelines_tree( // Memorize all direct children of each timeline. for timeline in timelines.iter() { - if let Some(ancestor_timeline_id) = - timeline.local.as_ref().and_then(|l| l.ancestor_timeline_id) - { + if let Some(ancestor_timeline_id) = timeline.local.ancestor_timeline_id { timelines_hash .get_mut(&ancestor_timeline_id) .context("missing timeline info in the HashMap")? @@ -371,13 +369,7 @@ fn print_timelines_tree( for timeline in timelines_hash.values() { // Start with root local timelines (no ancestors) first. - if timeline - .info - .local - .as_ref() - .and_then(|l| l.ancestor_timeline_id) - .is_none() - { + if timeline.info.local.ancestor_timeline_id.is_none() { print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?; } } @@ -394,17 +386,17 @@ fn print_timeline( timeline: &TimelineTreeEl, timelines: &HashMap, ) -> Result<()> { - let local_remote = match (timeline.info.local.as_ref(), timeline.info.remote.as_ref()) { - (None, None) => unreachable!("in this case no info for a timeline is found"), - (None, Some(_)) => "(R)", - (Some(_), None) => "(L)", - (Some(_), Some(_)) => "(L+R)", + let local_remote = if timeline.info.remote.is_some() { + "(L)" + } else { + "(L+R)" }; + // Draw main padding print!("{} ", local_remote); if nesting_level > 0 { - let ancestor_lsn = match timeline.info.local.as_ref().and_then(|i| i.ancestor_lsn) { + let ancestor_lsn = match timeline.info.local.ancestor_lsn { Some(lsn) => lsn.to_string(), None => "Unknown Lsn".to_string(), }; @@ -597,10 +589,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an Some(pg_version), )?; let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline_info - .local - .context(format!("Failed to get last record LSN: no local timeline info for timeline {new_timeline_id}"))? 
- .last_record_lsn; + let last_record_lsn = timeline_info.local.last_record_lsn; env.register_branch_mapping( DEFAULT_BRANCH_NAME.to_string(), @@ -655,10 +644,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?; let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline_info - .local - .expect("no local timeline info") - .last_record_lsn; + let last_record_lsn = timeline_info.local.last_record_lsn; env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( @@ -738,10 +724,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - )?; let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline_info - .local - .expect("no local timeline info") - .last_record_lsn; + let last_record_lsn = timeline_info.local.last_record_lsn; env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; @@ -801,7 +784,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { // Use the LSN at the end of the timeline. timeline_infos .get(&node.timeline_id) - .and_then(|bi| bi.local.as_ref().map(|l| l.last_record_lsn.to_string())) + .map(|bi| bi.local.last_record_lsn.to_string()) .unwrap_or_else(|| "?".to_string()) } Some(lsn) => { diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 43059ead84..8f38a02189 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -169,7 +169,7 @@ pub struct TimelineInfo { pub tenant_id: TenantId, #[serde_as(as = "DisplayFromStr")] pub timeline_id: TimelineId, - pub local: Option, + pub local: LocalTimelineInfo, pub remote: Option, } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2b4ad86310..21cc87631f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -184,7 +184,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, let mut response_data = Vec::with_capacity(timelines.len()); for timeline in timelines { let timeline_id = timeline.timeline_id; - let local = match local_timeline_info_from_timeline( + let local = local_timeline_info_from_timeline( &timeline, include_non_incremental_logical_size, include_non_incremental_physical_size, - ) { - Ok(local) => Some(local), - Err(e) => { - error!("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}"); - None - } - }; + ) + .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") + .map_err(ApiError::InternalServerError)?; response_data.push(TimelineInfo { tenant_id, @@ -300,19 +296,15 @@ async fn timeline_detail_handler(request: Request) -> Result Some(local_info), - Err(e) => { - error!("Failed to get local timeline info: {e:#}"); - None - } - }; + let timeline = timeline.map_err(ApiError::NotFound)?; + + let local_timeline_info = local_timeline_info_from_timeline( + &timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) + .context("Failed to get local timeline info: {e:#}") + .map_err(ApiError::InternalServerError)?; let remote_timeline_info = { let remote_index_read = get_state(&request).remote_index.read().await; @@ -331,21 +323,15 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result, ApiError> { diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 
3e775b10b0..9cf8a1e940 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -111,10 +111,9 @@ def test_remote_storage_backup_and_restore( with pytest.raises(Exception, match="Conflict: Tenant download is already in progress"): client.tenant_attach(tenant_id) - detail = client.timeline_detail(tenant_id, timeline_id) - log.info("Timeline detail with active failpoint: %s", detail) - assert detail["local"] is None - assert detail["remote"]["awaits_download"] + tenant_status = client.tenant_status(tenant_id) + log.info("Tenant status with active failpoint: %s", tenant_status) + assert tenant_status["has_in_progress_downloads"] is True # trigger temporary download files removal env.pageserver.stop() From 538876650a0c303aeae4fac71336a3d62aa6da28 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 14 Oct 2022 17:31:43 +0300 Subject: [PATCH 0906/1022] Merge 'local' and 'remote' parts of TimelineInfo into one struct. The 'local' part was always filled in, so that was easy to merge into into the TimelineInfo itself. 'remote' only contained two fields, 'remote_consistent_lsn' and 'awaits_download'. I made 'remote_consistent_lsn' an optional field, and 'awaits_download' is now false if the timeline is not present remotely. However, I kept stub versions of the 'local' and 'remote' structs for backwards-compatibility, with a few fields that are actively used by the control plane. They just duplicate the fields from TimelineInfo now. They can be removed later, once the control plane has been updated to use the new fields. --- control_plane/src/bin/neon_local.rs | 23 +- libs/pageserver_api/src/models.rs | 47 ++-- pageserver/src/http/openapi_spec.yml | 60 +++-- pageserver/src/http/routes.rs | 208 ++++++++---------- scripts/export_import_between_pageservers.py | 6 +- test_runner/fixtures/neon_fixtures.py | 30 +-- test_runner/regress/test_import.py | 4 +- test_runner/regress/test_pageserver_api.py | 12 +- test_runner/regress/test_remote_storage.py | 22 +- test_runner/regress/test_tenant_relocation.py | 13 +- test_runner/regress/test_timeline_delete.py | 2 +- test_runner/regress/test_timeline_size.py | 84 ++++--- test_runner/regress/test_wal_acceptor.py | 13 +- .../regress/test_wal_acceptor_async.py | 4 +- 14 files changed, 252 insertions(+), 276 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 244ae5cfd0..08797fe907 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -358,7 +358,7 @@ fn print_timelines_tree( // Memorize all direct children of each timeline. for timeline in timelines.iter() { - if let Some(ancestor_timeline_id) = timeline.local.ancestor_timeline_id { + if let Some(ancestor_timeline_id) = timeline.ancestor_timeline_id { timelines_hash .get_mut(&ancestor_timeline_id) .context("missing timeline info in the HashMap")? @@ -369,7 +369,7 @@ fn print_timelines_tree( for timeline in timelines_hash.values() { // Start with root local timelines (no ancestors) first. 
- if timeline.info.local.ancestor_timeline_id.is_none() { + if timeline.info.ancestor_timeline_id.is_none() { print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?; } } @@ -386,17 +386,8 @@ fn print_timeline( timeline: &TimelineTreeEl, timelines: &HashMap, ) -> Result<()> { - let local_remote = if timeline.info.remote.is_some() { - "(L)" - } else { - "(L+R)" - }; - - // Draw main padding - print!("{} ", local_remote); - if nesting_level > 0 { - let ancestor_lsn = match timeline.info.local.ancestor_lsn { + let ancestor_lsn = match timeline.info.ancestor_lsn { Some(lsn) => lsn.to_string(), None => "Unknown Lsn".to_string(), }; @@ -589,7 +580,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an Some(pg_version), )?; let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline_info.local.last_record_lsn; + let last_record_lsn = timeline_info.last_record_lsn; env.register_branch_mapping( DEFAULT_BRANCH_NAME.to_string(), @@ -644,7 +635,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?; let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline_info.local.last_record_lsn; + let last_record_lsn = timeline_info.last_record_lsn; env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( @@ -724,7 +715,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - )?; let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline_info.local.last_record_lsn; + let last_record_lsn = timeline_info.last_record_lsn; env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; @@ -784,7 +775,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { // Use the LSN at the end of the timeline. timeline_infos .get(&node.timeline_id) - .map(|bi| bi.local.last_record_lsn.to_string()) + .map(|bi| bi.last_record_lsn.to_string()) .unwrap_or_else(|| "?".to_string()) } Some(lsn) => { diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 8f38a02189..a153f1a01e 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -123,9 +123,15 @@ pub struct TenantInfo { pub has_in_progress_downloads: Option, } +/// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct LocalTimelineInfo { +pub struct TimelineInfo { + #[serde_as(as = "DisplayFromStr")] + pub tenant_id: TenantId, + #[serde_as(as = "DisplayFromStr")] + pub timeline_id: TimelineId, + #[serde_as(as = "Option")] pub ancestor_timeline_id: Option, #[serde_as(as = "Option")] @@ -149,28 +155,33 @@ pub struct LocalTimelineInfo { /// the timestamp (in microseconds) of the last received message pub last_received_msg_ts: Option, pub pg_version: u32, + + #[serde_as(as = "Option")] + pub remote_consistent_lsn: Option, + pub awaits_download: bool, + + // Some of the above fields are duplicated in 'local' and 'remote', for backwards- + // compatility with older clients. 
+ pub local: LocalTimelineInfo, + pub remote: RemoteTimelineInfo, +} + +#[serde_as] +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct LocalTimelineInfo { + #[serde_as(as = "Option")] + pub ancestor_timeline_id: Option, + #[serde_as(as = "Option")] + pub ancestor_lsn: Option, + pub current_logical_size: Option, // is None when timeline is Unloaded + pub current_physical_size: Option, // is None when timeline is Unloaded } #[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] pub struct RemoteTimelineInfo { - #[serde_as(as = "DisplayFromStr")] - pub remote_consistent_lsn: Lsn, - pub awaits_download: bool, -} - -/// -/// This represents the output of the "timeline_detail" API call. -/// -#[serde_as] -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct TimelineInfo { - #[serde_as(as = "DisplayFromStr")] - pub tenant_id: TenantId, - #[serde_as(as = "DisplayFromStr")] - pub timeline_id: TimelineId, - pub local: LocalTimelineInfo, - pub remote: Option, + #[serde_as(as = "Option")] + pub remote_consistent_lsn: Option, } pub type ConfigureFailpointsRequest = Vec; diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 97fdcd7bbd..05809a92da 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -207,7 +207,6 @@ paths: schema: $ref: "#/components/schemas/Error" - /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: parameters: - name: tenant_id @@ -612,6 +611,9 @@ components: required: - timeline_id - tenant_id + - last_record_lsn + - disk_consistent_lsn + - awaits_download properties: timeline_id: type: string @@ -619,33 +621,15 @@ components: tenant_id: type: string format: hex - local: - $ref: "#/components/schemas/LocalTimelineInfo" - remote: - $ref: "#/components/schemas/RemoteTimelineInfo" - RemoteTimelineInfo: - type: object - required: - - awaits_download - - remote_consistent_lsn - properties: - awaits_download: - type: boolean - remote_consistent_lsn: - type: string - format: hex - LocalTimelineInfo: - type: object - required: - - last_record_lsn - - disk_consistent_lsn - properties: last_record_lsn: type: string format: hex disk_consistent_lsn: type: string format: hex + remote_consistent_lsn: + type: string + format: hex ancestor_timeline_id: type: string format: hex @@ -670,7 +654,39 @@ components: format: hex last_received_msg_ts: type: integer + awaits_download: + type: boolean + # These 'local' and 'remote' fields just duplicate some of the fields + # above. They are kept for backwards-compatibility. They can be removed, + # when the control plane has been updated to look at the above fields + # directly. 
+ local: + $ref: "#/components/schemas/LocalTimelineInfo" + remote: + $ref: "#/components/schemas/RemoteTimelineInfo" + + LocalTimelineInfo: + type: object + properties: + ancestor_timeline_id: + type: string + format: hex + ancestor_lsn: + type: string + format: hex + current_logical_size: + type: integer + current_physical_size: + type: integer + RemoteTimelineInfo: + type: object + required: + - remote_consistent_lsn + properties: + remote_consistent_lsn: + type: string + format: hex Error: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 21cc87631f..4d7339ec13 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -79,13 +79,13 @@ fn get_config(request: &Request) -> &'static PageServerConf { get_state(request).conf } -// Helper functions to construct a LocalTimelineInfo struct for a timeline - -fn local_timeline_info_from_timeline( +// Helper function to construct a TimelineInfo struct for a timeline +async fn build_timeline_info( + state: &State, timeline: &Arc, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, -) -> anyhow::Result { +) -> anyhow::Result { let last_record_lsn = timeline.get_last_record_lsn(); let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { let guard = timeline.last_received_wal.lock().unwrap(); @@ -100,24 +100,47 @@ fn local_timeline_info_from_timeline( } }; - let info = LocalTimelineInfo { - ancestor_timeline_id: timeline.get_ancestor_timeline_id(), - ancestor_lsn: { - match timeline.get_ancestor_lsn() { - Lsn(0) => None, - lsn @ Lsn(_) => Some(lsn), - } - }, + let (remote_consistent_lsn, awaits_download) = if let Some(remote_entry) = state + .remote_index + .read() + .await + .timeline_entry(&TenantTimelineId { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + }) { + ( + Some(remote_entry.metadata.disk_consistent_lsn()), + remote_entry.awaits_download, + ) + } else { + (None, false) + }; + + let ancestor_timeline_id = timeline.get_ancestor_timeline_id(); + let ancestor_lsn = match timeline.get_ancestor_lsn() { + Lsn(0) => None, + lsn @ Lsn(_) => Some(lsn), + }; + let current_logical_size = match timeline.get_current_logical_size() { + Ok(size) => Some(size), + Err(err) => { + error!("Timeline info creation failed to get current logical size: {err:?}"); + None + } + }; + let current_physical_size = Some(timeline.get_physical_size()); + + let info = TimelineInfo { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + ancestor_timeline_id, + ancestor_lsn, disk_consistent_lsn: timeline.get_disk_consistent_lsn(), last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), - current_logical_size: Some( - timeline - .get_current_logical_size() - .context("Timeline info creation failed to get current logical size")?, - ), - current_physical_size: Some(timeline.get_physical_size()), + current_logical_size, + current_physical_size, current_logical_size_non_incremental: if include_non_incremental_logical_size { Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?) } else { @@ -132,32 +155,25 @@ fn local_timeline_info_from_timeline( last_received_msg_lsn, last_received_msg_ts, pg_version: timeline.pg_version, + + remote_consistent_lsn, + awaits_download, + + // Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility + // with the control plane. 
+ local: LocalTimelineInfo { + ancestor_timeline_id, + ancestor_lsn, + current_logical_size, + current_physical_size, + }, + remote: RemoteTimelineInfo { + remote_consistent_lsn, + }, }; Ok(info) } -fn list_local_timelines( - tenant_id: TenantId, - include_non_incremental_logical_size: bool, - include_non_incremental_physical_size: bool, -) -> Result> { - let tenant = tenant_mgr::get_tenant(tenant_id, true)?; - let timelines = tenant.list_timelines(); - - let mut local_timeline_info = Vec::with_capacity(timelines.len()); - for repository_timeline in timelines { - local_timeline_info.push(( - repository_timeline.timeline_id, - local_timeline_info_from_timeline( - &repository_timeline, - include_non_incremental_logical_size, - include_non_incremental_physical_size, - )?, - )) - } - Ok(local_timeline_info) -} - // healthcheck handler async fn status_handler(request: Request) -> Result, ApiError> { let config = get_config(&request); @@ -169,6 +185,8 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result { // Created. Construct a TimelineInfo for it. - let local_info = local_timeline_info_from_timeline(&new_timeline, false, false) + let timeline_info = build_timeline_info(state, &new_timeline, false, false) + .await .map_err(ApiError::InternalServerError)?; - Ok(Some(TimelineInfo { - tenant_id, - timeline_id: new_timeline.timeline_id, - local: local_info, - remote: None, - })) + Ok(Some(timeline_info)) } Ok(None) => Ok(None), // timeline already exists Err(err) => Err(ApiError::InternalServerError(err)), @@ -209,6 +223,8 @@ async fn timeline_list_handler(request: Request) -> Result, query_param_present(&request, "include-non-incremental-physical-size"); check_permission(&request, Some(tenant_id))?; + let state = get_state(&request); + let timelines = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?; @@ -219,32 +235,17 @@ async fn timeline_list_handler(request: Request) -> Result, let mut response_data = Vec::with_capacity(timelines.len()); for timeline in timelines { - let timeline_id = timeline.timeline_id; - let local = local_timeline_info_from_timeline( + let timeline_info = build_timeline_info( + state, &timeline, include_non_incremental_logical_size, include_non_incremental_physical_size, ) - .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") - .map_err(ApiError::InternalServerError)?; + .await + .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") + .map_err(ApiError::InternalServerError)?; - response_data.push(TimelineInfo { - tenant_id, - timeline_id, - local, - remote: get_state(&request) - .remote_index - .read() - .await - .timeline_entry(&TenantTimelineId { - tenant_id, - timeline_id, - }) - .map(|remote_entry| RemoteTimelineInfo { - remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(), - awaits_download: remote_entry.awaits_download, - }), - }) + response_data.push(timeline_info); } json_response(StatusCode::OK, response_data) @@ -289,7 +290,9 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result((local_timeline_info, remote_timeline_info)) + Ok::<_, ApiError>(timeline_info) } .instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id)) .await?; - json_response( - StatusCode::OK, - TimelineInfo { - tenant_id, - timeline_id, - local: local_timeline_info, - remote: remote_timeline_info, - 
}, - ) + json_response(StatusCode::OK, timeline_info) } async fn get_lsn_by_timestamp_handler(request: Request) -> Result, ApiError> { @@ -553,36 +538,27 @@ async fn tenant_status(request: Request) -> Result, ApiErro false }); - let tenant_state = match tenant { - Ok(tenant) => tenant.current_state(), + let (tenant_state, current_physical_size) = match tenant { + Ok(tenant) => { + let timelines = tenant.list_timelines(); + // Calculate total physical size of all timelines + let mut current_physical_size = 0; + for timeline in timelines { + current_physical_size += timeline.get_physical_size(); + } + + (tenant.current_state(), Some(current_physical_size)) + } Err(e) => { error!("Failed to get local tenant state: {e:#}"); if has_in_progress_downloads { - TenantState::Paused + (TenantState::Paused, None) } else { - TenantState::Broken + (TenantState::Broken, None) } } }; - let current_physical_size = - match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false)) - .await - .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))? - { - Err(err) => { - // Getting local timelines can fail when no local tenant directory is on disk (e.g, when tenant data is being downloaded). - // In that case, put a warning message into log and operate normally. - warn!("Failed to get local timelines for tenant {tenant_id}: {err}"); - None - } - Ok(local_timeline_infos) => Some( - local_timeline_infos - .into_iter() - .fold(0, |acc, x| acc + x.1.current_physical_size.unwrap()), - ), - }; - json_response( StatusCode::OK, TenantInfo { diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 6f6c3864dd..152ce40cea 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -317,13 +317,13 @@ def remote_consistent_lsn( ) -> int: detail = pageserver_http_client.timeline_detail(tenant, timeline) - if detail["remote"] is None: + lsn_str = detail["remote_consistent_lsn"] + if lsn_str is None: # No remote information at all. This happens right after creating # a timeline, before any part of it has been uploaded to remote # storage yet. return 0 else: - lsn_str = detail["remote"]["remote_consistent_lsn"] assert isinstance(lsn_str, str) return lsn_from_hex(lsn_str) @@ -577,7 +577,7 @@ def main(args: argparse.Namespace): args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" ) - pg_version = timeline["local"]["pg_version"] + pg_version = timeline["pg_version"] # Export timeline from old pageserver if args.only_import is False: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0d6b6f4cd7..88910d2bdf 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1179,7 +1179,7 @@ CREATE_TIMELINE_ID_EXTRACTOR = re.compile( r"^Created timeline '(?P[^']+)'", re.MULTILINE ) TIMELINE_DATA_EXTRACTOR = re.compile( - r"\s(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE + r"\s?(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE ) @@ -1430,8 +1430,8 @@ class NeonCli(AbstractNeonCli): Returns a list of (branch_name, timeline_id) tuples out of parsed `neon timeline list` CLI output. 
""" - # (L) main [b49f7954224a0ad25cc0013ea107b54b] - # (L) ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] + # main [b49f7954224a0ad25cc0013ea107b54b] + # ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] res = self.raw_cli( ["timeline", "list", "--tenant-id", str(tenant_id or self.env.initial_tenant)] ) @@ -2702,19 +2702,6 @@ def wait_until(number_of_iterations: int, interval: float, func): raise Exception("timed out while waiting for %s" % func) from last_exception -def assert_timeline_local( - pageserver_http_client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId -): - timeline_detail = pageserver_http_client.timeline_detail( - tenant, - timeline, - include_non_incremental_logical_size=True, - include_non_incremental_physical_size=True, - ) - assert timeline_detail.get("local", {}).get("disk_consistent_lsn"), timeline_detail - return timeline_detail - - def assert_no_in_progress_downloads_for_tenant( pageserver_http_client: NeonPageserverHttpClient, tenant: TenantId, @@ -2728,15 +2715,14 @@ def remote_consistent_lsn( ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) - if detail["remote"] is None: + lsn_str = detail["remote_consistent_lsn"] + if lsn_str is None: # No remote information at all. This happens right after creating # a timeline, before any part of it has been uploaded to remote # storage yet. return Lsn(0) - else: - lsn_str = detail["remote"]["remote_consistent_lsn"] - assert isinstance(lsn_str, str) - return Lsn(lsn_str) + assert isinstance(lsn_str, str) + return Lsn(lsn_str) def wait_for_upload( @@ -2768,7 +2754,7 @@ def last_record_lsn( ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) - lsn_str = detail["local"]["last_record_lsn"] + lsn_str = detail["last_record_lsn"] assert isinstance(lsn_str, str) return Lsn(lsn_str) diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index c84d282a4d..5910b4f74f 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -155,8 +155,8 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne lsn = _generate_data(num_rows, pg) logical_size = env.pageserver.http_client().timeline_detail(env.initial_tenant, timeline)[ - "local" - ]["current_logical_size"] + "current_logical_size" + ] log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB") assert logical_size > 1024**3 # = 1GB diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index def6bd5b33..bab96cff4f 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -93,7 +93,6 @@ def check_client(client: NeonPageserverHttpClient, initial_tenant: TenantId): assert TenantId(timeline_details["tenant_id"]) == tenant_id assert TimelineId(timeline_details["timeline_id"]) == timeline_id - assert timeline_details.get("local") is not None def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): @@ -125,16 +124,15 @@ def expect_updated_msg_lsn( timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) # a successful `timeline_details` response must contain the below fields - local_timeline_details = timeline_details["local"] - assert "wal_source_connstr" in local_timeline_details.keys() - assert "last_received_msg_lsn" in local_timeline_details.keys() - assert "last_received_msg_ts" in local_timeline_details.keys() + assert 
"wal_source_connstr" in timeline_details.keys() + assert "last_received_msg_lsn" in timeline_details.keys() + assert "last_received_msg_ts" in timeline_details.keys() assert ( - local_timeline_details["last_received_msg_lsn"] is not None + timeline_details["last_received_msg_lsn"] is not None ), "the last received message's LSN is empty" - last_msg_lsn = Lsn(local_timeline_details["last_received_msg_lsn"]) + last_msg_lsn = Lsn(timeline_details["last_received_msg_lsn"]) assert ( prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn ), f"the last received message's LSN {last_msg_lsn} hasn't been updated \ diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 9cf8a1e940..0a02a80de5 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -10,8 +10,8 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + NeonPageserverHttpClient, RemoteStorageKind, - assert_timeline_local, available_remote_storages, wait_for_last_record_lsn, wait_for_upload, @@ -125,16 +125,15 @@ def test_remote_storage_backup_and_restore( wait_until( number_of_iterations=20, interval=1, - func=lambda: assert_timeline_local(client, tenant_id, timeline_id), + func=lambda: expect_tenant_to_download_timeline(client, tenant_id), ) detail = client.timeline_detail(tenant_id, timeline_id) - assert detail["local"] is not None log.info("Timeline detail after attach completed: %s", detail) assert ( - Lsn(detail["local"]["last_record_lsn"]) >= current_lsn + Lsn(detail["last_record_lsn"]) >= current_lsn ), "current db Lsn should should not be less than the one stored on remote storage" - assert not detail["remote"]["awaits_download"] + assert not detail["awaits_download"] pg = env.postgres.create_start("main") with pg.cursor() as cur: @@ -143,3 +142,16 @@ def test_remote_storage_backup_and_restore( query_scalar(cur, f"SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};") == f"{data_secret}|{checkpoint_number}" ) + + +def expect_tenant_to_download_timeline( + client: NeonPageserverHttpClient, + tenant_id: TenantId, +): + for tenant in client.tenant_list(): + if tenant["id"] == str(tenant_id): + assert not tenant.get( + "has_in_progress_downloads", True + ), f"Tenant {tenant_id} should have no downloads in progress" + return + assert False, f"Tenant {tenant_id} is missing on pageserver" diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 2b01546198..a3245d65e4 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -16,7 +16,6 @@ from fixtures.neon_fixtures import ( PortDistributor, Postgres, assert_no_in_progress_downloads_for_tenant, - assert_timeline_local, base_dir, neon_binpath, pg_distrib_dir, @@ -167,18 +166,18 @@ def check_timeline_attached( old_current_lsn: Lsn, ): # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint - new_timeline_detail = assert_timeline_local(new_pageserver_http_client, tenant_id, timeline_id) + new_timeline_detail = new_pageserver_http_client.timeline_detail(tenant_id, timeline_id) # when load is active these checks can break because lsns are not static # so let's check with some margin assert_abs_margin_ratio( - int(Lsn(new_timeline_detail["local"]["disk_consistent_lsn"])), - int(Lsn(old_timeline_detail["local"]["disk_consistent_lsn"])), + 
int(Lsn(new_timeline_detail["disk_consistent_lsn"])), + int(Lsn(old_timeline_detail["disk_consistent_lsn"])), 0.03, ) assert_abs_margin_ratio( - int(Lsn(new_timeline_detail["local"]["disk_consistent_lsn"])), int(old_current_lsn), 0.03 + int(Lsn(new_timeline_detail["disk_consistent_lsn"])), int(old_current_lsn), 0.03 ) @@ -301,10 +300,10 @@ def test_tenant_relocation( # wait until pageserver receives that data wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_main, current_lsn_main) - timeline_detail_main = assert_timeline_local(pageserver_http, tenant_id, timeline_id_main) + timeline_detail_main = pageserver_http.timeline_detail(tenant_id, timeline_id_main) wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_second, current_lsn_second) - timeline_detail_second = assert_timeline_local(pageserver_http, tenant_id, timeline_id_second) + timeline_detail_second = pageserver_http.timeline_detail(tenant_id, timeline_id_second) if with_load == "with_load": # create load table diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index ac248c1b4b..de05d445ed 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -65,7 +65,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # check 404 with pytest.raises( NeonPageserverApiException, - match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} is not found neither locally nor remotely", + match=f"Timeline {leaf_timeline_id} was not found for tenant {env.initial_tenant}", ): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 3a482be5db..d26d5f3afa 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -16,7 +16,6 @@ from fixtures.neon_fixtures import ( PortDistributor, Postgres, VanillaPostgres, - assert_timeline_local, wait_for_last_flush_lsn, ) from fixtures.types import TenantId, TimelineId @@ -44,20 +43,16 @@ def test_timeline_size(neon_simple_env: NeonEnv): """ ) - res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - local_details = res["local"] - assert ( - local_details["current_logical_size"] - == local_details["current_logical_size_non_incremental"] + res = client.timeline_detail( + env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True ) + assert res["current_logical_size"] == res["current_logical_size_non_incremental"] cur.execute("TRUNCATE foo") - res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - local_details = res["local"] - assert ( - local_details["current_logical_size"] - == local_details["current_logical_size_non_incremental"] + res = client.timeline_detail( + env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True ) + assert res["current_logical_size"] == res["current_logical_size_non_incremental"] def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): @@ -66,22 +61,22 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): client = env.pageserver.http_client() wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) - timeline_details = assert_timeline_local(client, env.initial_tenant, new_timeline_id) + timeline_details = client.timeline_detail( + env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True + ) pgmain = 
env.postgres.create_start("test_timeline_size_createdropdb") log.info("postgres is running on 'test_timeline_size_createdropdb' branch") with closing(pgmain.connect()) as conn: with conn.cursor() as cur: - res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - local_details = res["local"] - assert ( - local_details["current_logical_size"] - == local_details["current_logical_size_non_incremental"] + res = client.timeline_detail( + env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True ) + assert res["current_logical_size"] == res["current_logical_size_non_incremental"] assert ( - timeline_details["local"]["current_logical_size_non_incremental"] - == local_details["current_logical_size_non_incremental"] + timeline_details["current_logical_size_non_incremental"] + == res["current_logical_size_non_incremental"] ), "no writes should not change the incremental logical size" cur.execute("CREATE DATABASE foodb") @@ -97,21 +92,21 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): """ ) - res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - local_details = res["local"] + res = client.timeline_detail( + env.initial_tenant, + new_timeline_id, + include_non_incremental_logical_size=True, + ) assert ( - local_details["current_logical_size"] - == local_details["current_logical_size_non_incremental"] + res["current_logical_size"] == res["current_logical_size_non_incremental"] ) cur.execute("DROP DATABASE foodb") - res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - local_details = res["local"] - assert ( - local_details["current_logical_size"] - == local_details["current_logical_size_non_incremental"] + res = client.timeline_detail( + env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True ) + assert res["current_logical_size"] == res["current_logical_size_non_incremental"] # wait until received_lsn_lag is 0 @@ -210,10 +205,11 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): pg_cluster_size = cur.fetchone() log.info(f"pg_cluster_size = {pg_cluster_size}") - new_res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) + new_res = client.timeline_detail( + env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True + ) assert ( - new_res["local"]["current_logical_size"] - == new_res["local"]["current_logical_size_non_incremental"] + new_res["current_logical_size"] == new_res["current_logical_size_non_incremental"] ), "after the WAL is streamed, current_logical_size is expected to be calculated and to be equal its non-incremental value" @@ -419,7 +415,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): def get_timeline_physical_size(timeline: TimelineId): res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True) - return res["local"]["current_physical_size_non_incremental"] + return res["current_physical_size_non_incremental"] timeline_total_size = get_timeline_physical_size(timeline) for i in range(10): @@ -450,13 +446,10 @@ def assert_physical_size(env: NeonEnv, tenant_id: TenantId, timeline_id: Timelin """Check the current physical size returned from timeline API matches the total physical size of the timeline on disk""" client = env.pageserver.http_client() - res = assert_timeline_local(client, tenant_id, timeline_id) + res = client.timeline_detail(tenant_id, timeline_id, include_non_incremental_physical_size=True) timeline_path = env.timeline_dir(tenant_id, timeline_id) - assert ( - 
res["local"]["current_physical_size"] - == res["local"]["current_physical_size_non_incremental"] - ) - assert res["local"]["current_physical_size"] == get_timeline_dir_size(timeline_path) + assert res["current_physical_size"] == res["current_physical_size_non_incremental"] + assert res["current_physical_size"] == get_timeline_dir_size(timeline_path) # Timeline logical size initialization is an asynchronous background task that runs once, @@ -465,13 +458,16 @@ def wait_for_timeline_size_init( client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId ): for i in range(10): - timeline_details = assert_timeline_local(client, tenant, timeline) - if ( - timeline_details["local"]["current_logical_size"] - == timeline_details["local"]["current_logical_size_non_incremental"] - ): + timeline_details = client.timeline_detail( + tenant, timeline, include_non_incremental_logical_size=True + ) + current_logical_size = timeline_details["current_logical_size"] + non_incremental = timeline_details["current_logical_size_non_incremental"] + if current_logical_size == non_incremental: return - log.info(f"waiting for current_logical_size of a timeline to be calculated, iteration {i}") + log.info( + f"waiting for current_logical_size of a timeline to be calculated, iteration {i}: {current_logical_size} vs {non_incremental}" + ) time.sleep(1) raise Exception( f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}" diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 9c8e66e0e2..4451ba9d57 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -127,14 +127,9 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): for timeline_detail in timeline_details: timeline_id = TimelineId(timeline_detail["timeline_id"]) - local_timeline_detail = timeline_detail.get("local") - if local_timeline_detail is None: - log.debug(f"Timeline {timeline_id} is not present locally, skipping") - continue - m = TimelineMetrics( timeline_id=timeline_id, - last_record_lsn=Lsn(local_timeline_detail["last_record_lsn"]), + last_record_lsn=Lsn(timeline_detail["last_record_lsn"]), ) for sk_m in sk_metrics: m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)])) @@ -536,7 +531,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re ) ps_cli = env.pageserver.http_client() - pageserver_lsn = Lsn(ps_cli.timeline_detail(tenant_id, timeline_id)["local"]["last_record_lsn"]) + pageserver_lsn = Lsn(ps_cli.timeline_detail(tenant_id, timeline_id)["last_record_lsn"]) lag = last_lsn - pageserver_lsn log.info( f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb" @@ -580,9 +575,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re raise RuntimeError("Timed out waiting for WAL redo") pageserver_lsn = Lsn( - env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["local"][ - "last_record_lsn" - ] + env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["last_record_lsn"] ) lag = last_lsn - pageserver_lsn diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 9d2008296a..70ae6bae18 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -179,9 +179,7 @@ async def run_restarts_under_load( 
log.info(f"Postgres flush_lsn {flush_lsn}") pageserver_lsn = Lsn( - env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["local"][ - "last_record_lsn" - ] + env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["last_record_lsn"] ) sk_ps_lag = flush_lsn - pageserver_lsn log.info(f"Pageserver last_record_lsn={pageserver_lsn} lag={sk_ps_lag / 1024}kb") From 9c24de254fd93d5b002665da6ea2b1426cafa55c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 14 Oct 2022 16:41:35 +0300 Subject: [PATCH 0907/1022] Add description and license fields to OpenAPI spec. These were added earlier to the control plane's copy of this file. This is the master version of this file, so let's keep it in sync. --- pageserver/src/http/openapi_spec.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 05809a92da..626cc07429 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1,7 +1,11 @@ openapi: "3.0.2" info: title: Page Server API + description: Neon Pageserver API version: "1.0" + license: + name: "Apache" + url: https://github.com/neondatabase/neon/blob/main/LICENSE servers: - url: "" paths: From f03b7c345821083d1e6c91397b7eca06fb54a63b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 15 Oct 2022 01:55:31 +0300 Subject: [PATCH 0908/1022] Bump regular dependencies (#2618) * etcd-client is not updated, since we plan to replace it with another client and the new version errors with some missing prost library error * clap had released another major update that requires changing every CLI declaration again, deserves a separate PR --- Cargo.lock | 763 ++++++++++++++++-------------- control_plane/Cargo.toml | 8 +- libs/etcd_broker/Cargo.toml | 2 +- libs/pageserver_api/Cargo.toml | 2 +- libs/remote_storage/Cargo.toml | 2 +- libs/safekeeper_api/Cargo.toml | 2 +- libs/safekeeper_api/src/models.rs | 9 +- libs/utils/Cargo.toml | 6 +- pageserver/Cargo.toml | 6 +- safekeeper/Cargo.toml | 4 +- workspace_hack/Cargo.toml | 9 +- 11 files changed, 436 insertions(+), 377 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8488fc4f9d..7659be6c92 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,9 +30,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "0.7.18" +version = "0.7.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +checksum = "b4f55bd91a0978cbfd91c457a164bab8b4001c833b7f323132c0a4e1922dd44e" dependencies = [ "memchr", ] @@ -44,31 +44,34 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f27d3d00d3d115395a7a8a4dc045feb7aa82b641e485f7e15f4e67ac16f4f56d" [[package]] -name = "ansi_term" -version = "0.12.1" +name = "android_system_properties" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" dependencies = [ - "winapi", + "libc", ] [[package]] -name = "anyhow" -version = "1.0.59" +name = "anes" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91f1f46651137be86f3a2b9a8359f9ab421d04d941c62b5982e1ca21113adf9" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anyhow" +version = "1.0.65" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "98161a4e3e2184da77bb14f02184cdd111e83bbbcc9979dfee3c44b9a85f5602" dependencies = [ "backtrace", ] [[package]] name = "arrayvec" -version = "0.4.12" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd9fd44efafa8690358b7408d253adf110036b88f55672a933f01d616ad9b1b9" -dependencies = [ - "nodrop", -] +checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" [[package]] name = "asn1-rs" @@ -83,7 +86,7 @@ dependencies = [ "num-traits", "rusticata-macros", "thiserror", - "time 0.3.12", + "time 0.3.15", ] [[package]] @@ -169,9 +172,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "axum" -version = "0.5.13" +version = "0.5.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b9496f0c1d1afb7a2af4338bbe1d969cddfead41d87a9fb3aaa6d0bbc7af648" +checksum = "c9e3356844c4d6a6d6467b8da2cffb4a2820be256f50a3a386c9d152bab31043" dependencies = [ "async-trait", "axum-core", @@ -181,7 +184,7 @@ dependencies = [ "http", "http-body", "hyper", - "itoa 1.0.3", + "itoa", "matchit", "memchr", "mime", @@ -266,7 +269,7 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", - "clap 3.2.16", + "clap", "env_logger", "lazy_static", "lazycell", @@ -309,9 +312,9 @@ dependencies = [ [[package]] name = "block-buffer" -version = "0.10.2" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bf7fe51849ea569fd452f37822f606a5cabb684dc918707a0193fd4664ff324" +checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e" dependencies = [ "generic-array", ] @@ -331,20 +334,19 @@ dependencies = [ "lazy_static", "memchr", "regex-automata", - "serde", ] [[package]] name = "bumpalo" -version = "3.10.0" +version = "3.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37ccbd214614c6783386c1af30caf03192f17891059cecc394b4fb119e363de3" +checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d" [[package]] name = "bytemuck" -version = "1.11.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5377c8865e74a160d21f29c2d40669f53286db6eab59b88540cbb12ffc8b835" +checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da" [[package]] name = "byteorder" @@ -390,23 +392,52 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.19" +version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +checksum = "bfd4d1b31faaa3a89d7934dbded3111da0d2ef28e3ebccdb4f0179f5929d1ef1" dependencies = [ - "libc", + "iana-time-zone", + "js-sys", "num-integer", "num-traits", "serde", "time 0.1.44", + "wasm-bindgen", "winapi", ] [[package]] -name = "clang-sys" -version = "1.3.3" +name = "ciborium" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a050e2153c5be08febd6734e29298e844fdb0fa21aeddd63b4eb7baa106c69b" +checksum = "b0c137568cc60b904a7724001b35ce2630fd00d5d84805fbb608ab89509d788f" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "346de753af073cc87b52b2083a506b38ac176a44cfb05497b622e27be899b369" + +[[package]] 
+name = "ciborium-ll" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "213030a2b5a4e0c0892b6652260cf6ccac84827b83a85a534e178e3906c4cf1b" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clang-sys" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa2e27ae6ab525c3d369ded447057bca5438d86dc3a68f6faafb8269ba82ebf3" dependencies = [ "glob", "libc", @@ -415,20 +446,9 @@ dependencies = [ [[package]] name = "clap" -version = "2.34.0" +version = "3.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" -dependencies = [ - "bitflags", - "textwrap 0.11.0", - "unicode-width", -] - -[[package]] -name = "clap" -version = "3.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3dbbb6653e7c55cc8595ad3e1f7be8f32aba4eb7ff7f0fd1163d4f3d137c0a9" +checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750" dependencies = [ "atty", "bitflags", @@ -436,7 +456,7 @@ dependencies = [ "indexmap", "strsim", "termcolor", - "textwrap 0.15.0", + "textwrap", ] [[package]] @@ -468,10 +488,20 @@ dependencies = [ ] [[package]] -name = "combine" -version = "4.6.4" +name = "codespan-reporting" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a604e93b79d1808327a6fca85a6f2d69de66461e7620f5a4cbf5fb4d1d7c948" +checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" +dependencies = [ + "termcolor", + "unicode-width", +] + +[[package]] +name = "combine" +version = "4.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4" dependencies = [ "bytes", "memchr", @@ -479,9 +509,9 @@ dependencies = [ [[package]] name = "comfy-table" -version = "5.0.1" +version = "6.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b103d85ca6e209388771bfb7aa6b68a7aeec4afbf6f0a0264bfbf50360e5212e" +checksum = "85914173c2f558d61613bfbbf1911f14e630895087a7ed2fafc0f5319e1536e7" dependencies = [ "crossterm", "strum", @@ -495,7 +525,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 3.2.16", + "clap", "env_logger", "futures", "hyper", @@ -514,18 +544,18 @@ dependencies = [ [[package]] name = "const_format" -version = "0.2.26" +version = "0.2.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "939dc9e2eb9077e0679d2ce32de1ded8531779360b003b4a972a7a39ec263495" +checksum = "7309d9b4d3d2c0641e018d449232f2e28f1b22933c137f157d3dbc14228b8c0e" dependencies = [ "const_format_proc_macros", ] [[package]] name = "const_format_proc_macros" -version = "0.2.22" +version = "0.2.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef196d5d972878a48da7decb7686eded338b4858fbabeed513d63a7c98b2b82d" +checksum = "d897f47bf7270cf70d370f8f98c1abb6d2d4cf60a6845d30e05bfb90c6568650" dependencies = [ "proc-macro2", "quote", @@ -537,10 +567,10 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 3.2.16", + "clap", "comfy-table", "git-version", - "nix", + "nix 0.25.0", "once_cell", "pageserver_api", "postgres", @@ -595,9 +625,9 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.2" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"59a6001667ab124aebae2a495118e11d30984c3a653e99d86d58971708cf5e4b" +checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" dependencies = [ "libc", ] @@ -622,15 +652,16 @@ dependencies = [ [[package]] name = "criterion" -version = "0.3.6" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f" +checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" dependencies = [ + "anes", "atty", "cast", - "clap 2.34.0", + "ciborium", + "clap", "criterion-plot", - "csv", "itertools", "lazy_static", "num-traits", @@ -639,7 +670,6 @@ dependencies = [ "rayon", "regex", "serde", - "serde_cbor", "serde_derive", "serde_json", "tinytemplate", @@ -648,9 +678,9 @@ dependencies = [ [[package]] name = "criterion-plot" -version = "0.4.5" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", "itertools", @@ -691,15 +721,14 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.10" +version = "0.9.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "045ebe27666471bb549370b4b0b3e51b07f56325befa4284db65fc89c02511b1" +checksum = "f916dfc5d356b0ed9dae65f1db9fc9770aa2851d2662b988ccf4fe3516e86348" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", "memoffset", - "once_cell", "scopeguard", ] @@ -715,9 +744,9 @@ dependencies = [ [[package]] name = "crossterm" -version = "0.23.2" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2102ea4f781910f8a5b98dd061f4c2023f479ce7bb1236330099ceb5a93cf17" +checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67" dependencies = [ "bitflags", "crossterm_winapi", @@ -759,25 +788,47 @@ dependencies = [ ] [[package]] -name = "csv" -version = "1.1.6" +name = "cxx" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +checksum = "19f39818dcfc97d45b03953c1292efc4e80954e1583c4aa770bac1383e2310a4" dependencies = [ - "bstr", - "csv-core", - "itoa 0.4.8", - "ryu", - "serde", + "cc", + "cxxbridge-flags", + "cxxbridge-macro", + "link-cplusplus", ] [[package]] -name = "csv-core" -version = "0.1.10" +name = "cxx-build" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +checksum = "3e580d70777c116df50c390d1211993f62d40302881e54d4b79727acb83d0199" dependencies = [ - "memchr", + "cc", + "codespan-reporting", + "once_cell", + "proc-macro2", + "quote", + "scratch", + "syn", +] + +[[package]] +name = "cxxbridge-flags" +version = "1.0.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56a46460b88d1cec95112c8c363f0e2c39afdb237f60583b0b36343bf627ea9c" + +[[package]] +name = "cxxbridge-macro" +version = "1.0.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "747b608fecf06b0d72d440f27acc99288207324b793be2c17991839f3d4995ea" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -792,9 +843,9 @@ dependencies = [ [[package]] name = "darling" -version = "0.13.4" +version = "0.14.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a01d95850c592940db9b8194bc39f4bc0e89dee5c4265e4b1807c34a9aba453c" +checksum = "4529658bdda7fd6769b8614be250cdcfc3aeb0ee72fe66f9e41e5e5eb73eac02" dependencies = [ "darling_core", "darling_macro", @@ -802,9 +853,9 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.13.4" +version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "859d65a907b6852c9361e3185c862aae7fafd2887876799fa55f5f99dc40d610" +checksum = "649c91bc01e8b1eac09fb91e8dbc7d517684ca6be8ebc75bb9cafc894f9fdb6f" dependencies = [ "fnv", "ident_case", @@ -816,9 +867,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.13.4" +version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c972679f83bdf9c42bd905396b6c3588a843a17f0f16dfcfa3e2c5d57441835" +checksum = "ddfc69c5bfcbd2fc09a0f38451d2daf0e372e367986a83906d1b0dbc88134fb5" dependencies = [ "darling_core", "quote", @@ -865,11 +916,11 @@ dependencies = [ [[package]] name = "digest" -version = "0.10.3" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2fb860ca6fafa5552fb6d0e816a69c8e49f0908bf524e30a90d97c85892d506" +checksum = "adfbc57365a37acbd2ebf2b64d7e69bb766e2fea813521ed536f5d0520dcf86c" dependencies = [ - "block-buffer 0.10.2", + "block-buffer 0.10.3", "crypto-common", "subtle", ] @@ -908,9 +959,9 @@ dependencies = [ [[package]] name = "either" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f107b87b6afc2a64fd13cac55fe06d6c8859f12d4b14cbcdd2c67d0976781be" +checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" [[package]] name = "embedded-hal" @@ -933,9 +984,9 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b2cf0344971ee6c64c31be0d530793fba457d322dfec2810c453d0ef228f9c3" +checksum = "c90bf5f19754d10198ccb95b70664fc925bd1fc090a0fd9a6ebc54acc8cd6272" dependencies = [ "atty", "humantime", @@ -979,12 +1030,12 @@ dependencies = [ [[package]] name = "fail" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3245a0ca564e7f3c797d20d833a6870f57a728ac967d5225b3ffdef4465011" +checksum = "fe5e43d0f78a42ad591453aedb1d7ae631ce7ee445c7643691055a9ed8d3b01c" dependencies = [ - "lazy_static", "log", + "once_cell", "rand", ] @@ -1056,11 +1107,10 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "form_urlencoded" -version = "1.0.1" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" +checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" dependencies = [ - "matches", "percent-encoding", ] @@ -1085,9 +1135,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f73fe65f54d1e12b726f517d3e2135ca3125a437b6d998caf1962961f7172d9e" +checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c" dependencies = [ "futures-channel", "futures-core", @@ -1100,9 +1150,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.21" +version = "0.3.24" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3083ce4b914124575708913bca19bfe887522d6e2e6d0952943f5eac4a74010" +checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050" dependencies = [ "futures-core", "futures-sink", @@ -1110,15 +1160,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" +checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf" [[package]] name = "futures-executor" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9420b90cfa29e327d0429f19be13e7ddb68fa1cccb09d65e5706b8c7a749b8a6" +checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab" dependencies = [ "futures-core", "futures-task", @@ -1127,15 +1177,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b" +checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68" [[package]] name = "futures-macro" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512" +checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17" dependencies = [ "proc-macro2", "quote", @@ -1144,21 +1194,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21163e139fa306126e6eedaf49ecdb4588f939600f0b1e770f4205ee4b7fa868" +checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56" [[package]] name = "futures-task" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c66a976bf5909d801bbef33416c41372779507e7a6b3a5e25e4749c58f776a" +checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1" [[package]] name = "futures-util" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" +checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90" dependencies = [ "futures-channel", "futures-core", @@ -1229,9 +1279,9 @@ checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" [[package]] name = "h2" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37a82c6d637fc9515a4694bbf1cb2457b79d81ce52b3108bdeea58b07dd34a57" +checksum = "5ca32592cf21ac7ccab1825cd87f6c9b3d9022c44d086172ed0966bec8af30be" dependencies = [ "bytes", "fnv", @@ -1283,15 +1333,6 @@ dependencies = [ "stable_deref_trait", ] -[[package]] -name = "heck" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" -dependencies = [ - "unicode-segmentation", -] - [[package]] name = "heck" version = "0.4.0" @@ -1338,7 +1379,7 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" dependencies = [ - "digest 0.10.3", + "digest 0.10.5", ] [[package]] @@ -1349,7 +1390,7 @@ checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" dependencies = [ "bytes", "fnv", - "itoa 1.0.3", + "itoa", ] [[package]] @@ -1371,9 +1412,9 @@ checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" [[package]] name = "httparse" -version = "1.7.1" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "496ce29bb5a52785b44e0f7ca2847ae0bb839c9bd28f69acac9b99d461c0c04c" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" [[package]] name = "httpdate" @@ -1412,7 +1453,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 1.0.3", + "itoa", "pin-project-lite", "socket2", "tokio", @@ -1459,6 +1500,30 @@ dependencies = [ "tokio-native-tls", ] +[[package]] +name = "iana-time-zone" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5a6ef98976b22b3b7f2f3a806f858cb862044cfa66805aa3ad84cb3d3b785ed" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "winapi", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fde6edd6cef363e9359ed3c98ba64590ba9eecba2293eb5a723ab32aee8926aa" +dependencies = [ + "cxx", + "cxx-build", +] + [[package]] name = "ident_case" version = "1.0.1" @@ -1467,11 +1532,10 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "0.2.3" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" +checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" dependencies = [ - "matches", "unicode-bidi", "unicode-normalization", ] @@ -1484,6 +1548,7 @@ checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" dependencies = [ "autocfg", "hashbrown", + "serde", ] [[package]] @@ -1495,7 +1560,7 @@ dependencies = [ "ahash", "atty", "indexmap", - "itoa 1.0.3", + "itoa", "lazy_static", "log", "num-format", @@ -1541,30 +1606,24 @@ checksum = "879d54834c8c76457ef4293a689b2a8c59b076067ad77b15efafbb05f92a592b" [[package]] name = "itertools" -version = "0.10.3" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" dependencies = [ "either", ] [[package]] name = "itoa" -version = "0.4.8" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" - -[[package]] -name = "itoa" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754" +checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc" [[package]] name = "js-sys" -version = "0.3.59" +version = "0.3.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "258451ab10b34f8af53416d1fdab72c22e805f0c92a1136d59470ec0b11138b2" +checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47" 
dependencies = [ "wasm-bindgen", ] @@ -1603,15 +1662,6 @@ dependencies = [ "libc", ] -[[package]] -name = "kstring" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b310ccceade8121d7d77fee406160e457c2f4e7c7982d589da3499bc7ea4526" -dependencies = [ - "serde", -] - [[package]] name = "lazy_static" version = "1.4.0" @@ -1626,9 +1676,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.127" +version = "0.2.135" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "505e71a4706fa491e9b1b55f51b95d4037d0821ee40131190475f692b35b009b" +checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c" [[package]] name = "libloading" @@ -1647,10 +1697,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" [[package]] -name = "lock_api" -version = "0.4.7" +name = "link-cplusplus" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327fa5b6a6940e4699ec49a9beae1ea4845c6bab9314e4f84ac68742139d8c53" +checksum = "9272ab7b96c9046fbc5bc56c06c117cb639fe2d509df0c421cad82d2915cf369" +dependencies = [ + "cc", +] + +[[package]] +name = "lock_api" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" dependencies = [ "autocfg", "scopeguard", @@ -1675,12 +1734,6 @@ dependencies = [ "regex-automata", ] -[[package]] -name = "matches" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" - [[package]] name = "matchit" version = "0.5.0" @@ -1700,11 +1753,11 @@ dependencies = [ [[package]] name = "md-5" -version = "0.10.1" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "658646b21e0b72f7866c7038ab086d3d5e1cd6271f060fd37defb241949d0582" +checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca" dependencies = [ - "digest 0.10.3", + "digest 0.10.5", ] [[package]] @@ -1721,9 +1774,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memmap2" -version = "0.5.5" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a79b39c93a7a5a27eeaf9a23b5ff43f1b9e0ad6b1cdd441140ae53c35613fc7" +checksum = "95af15f345b17af2efc8ead6080fb8bc376f8cec1b35277b935637595fe77498" dependencies = [ "libc", ] @@ -1761,9 +1814,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.5.3" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f5c75688da582b8ffc1f1799e9db273f32133c49e048f614d22ec3256773ccc" +checksum = "96590ba8f175222643a85693f33d26e9c8a015f599c216509b1a6894af675d34" dependencies = [ "adler", ] @@ -1833,10 +1886,18 @@ dependencies = [ ] [[package]] -name = "nodrop" -version = "0.1.14" +name = "nix" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" +checksum = "e322c04a9e3440c327fca7b6c8a63e6890a32fa2ad689db972425f07e0d22abb" +dependencies = [ + "autocfg", + "bitflags", + "cfg-if", + "libc", + "memoffset", + "pin-utils", +] [[package]] name = 
"nom" @@ -1866,6 +1927,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num-bigint" version = "0.4.3" @@ -1879,12 +1950,12 @@ dependencies = [ [[package]] name = "num-format" -version = "0.4.0" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bafe4179722c2894288ee77a9f044f02811c86af699344c498b0840c698a2465" +checksum = "54b862ff8df690cf089058c98b183676a7ed0f974cc08b426800093227cbff3b" dependencies = [ "arrayvec", - "itoa 0.4.8", + "itoa", ] [[package]] @@ -1946,9 +2017,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.13.0" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18a6dbe30758c9f83eb00cbea4ac95966305f5a7772f3f42ebfc7fc7eddbd8e1" +checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" [[package]] name = "oorandom" @@ -1964,9 +2035,9 @@ checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" [[package]] name = "openssl" -version = "0.10.41" +version = "0.10.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "618febf65336490dfcf20b73f885f5651a0c89c64c2d4a8c3662585a70bf5bd0" +checksum = "12fc0523e3bd51a692c8850d075d74dc062ccf251c0110668cbd921917118a13" dependencies = [ "bitflags", "cfg-if", @@ -1996,9 +2067,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.75" +version = "0.9.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5f9bd0c2710541a3cda73d6f9ac4f1b240de4ae261065d309dbe73d9dceb42f" +checksum = "5230151e44c0f05157effb743e8d517472843121cf9243e8b81393edb5acd9ce" dependencies = [ "autocfg", "cc", @@ -2009,9 +2080,15 @@ dependencies = [ [[package]] name = "os_str_bytes" -version = "6.2.0" +version = "6.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4" +checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff" + +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "pageserver" @@ -2024,7 +2101,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 3.2.16", + "clap", "close_fds", "const_format", "crc32c", @@ -2041,7 +2118,7 @@ dependencies = [ "hyper", "itertools", "metrics", - "nix", + "nix 0.25.0", "num-traits", "once_cell", "pageserver_api", @@ -2149,9 +2226,9 @@ dependencies = [ [[package]] name = "percent-encoding" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" +checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" [[package]] name = "petgraph" @@ -2183,18 +2260,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78203e83c48cffbe01e4a2d35d566ca4de445d79a85372fc64e378bfc812a260" +checksum = "ad29a609b6bcd67fee905812e544992d216af9d755757c05ed2d0e15a74c6ecc" dependencies = [ 
"pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "710faf75e1b33345361201d36d04e98ac1ed8909151a017ed384700836104c74" +checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55" dependencies = [ "proc-macro2", "quote", @@ -2221,9 +2298,9 @@ checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" [[package]] name = "plotters" -version = "0.3.2" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9428003b84df1496fb9d6eeee9c5f8145cb41ca375eb0dad204328888832811f" +checksum = "2538b639e642295546c50fcd545198c9d64ee2a38620a628724a3b266d5fbf97" dependencies = [ "num-traits", "plotters-backend", @@ -2240,9 +2317,9 @@ checksum = "193228616381fecdc1224c62e96946dfbc73ff4384fba576e052ff8c1bea8142" [[package]] name = "plotters-svg" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0918736323d1baff32ee0eade54984f6f201ad7e97d5cfb5d6ab4a358529615" +checksum = "f9a81d2759aae1dae668f783c308bc5c8ebd191ff4184aaa1b37f65a6ae5a56f" dependencies = [ "plotters-backend", ] @@ -2271,10 +2348,10 @@ dependencies = [ "fallible-iterator", "hmac 0.12.1", "lazy_static", - "md-5 0.10.1", + "md-5 0.10.5", "memchr", "rand", - "sha2 0.10.2", + "sha2 0.10.6", "stringprep", ] @@ -2324,7 +2401,7 @@ dependencies = [ "lazy_static", "libc", "log", - "nix", + "nix 0.23.1", "parking_lot 0.11.2", "symbolic-demangle", "tempfile", @@ -2339,9 +2416,9 @@ checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" [[package]] name = "prettyplease" -version = "0.1.18" +version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "697ae720ee02011f439e0701db107ffe2916d83f718342d65d7f8bf7b8a5fee9" +checksum = "83fead41e178796ef8274dc612a7d8ce4c7e10ca35cd2c5b5ad24cac63aeb6c0" dependencies = [ "proc-macro2", "syn", @@ -2355,9 +2432,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.43" +version = "1.0.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab" +checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b" dependencies = [ "unicode-ident", ] @@ -2377,9 +2454,9 @@ dependencies = [ [[package]] name = "prometheus" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cface98dfa6d645ea4c789839f176e4b072265d085bfcc48eaa8d137f58d3c39" +checksum = "45c8babc29389186697fe5a2a4859d697825496b83db5d0b65271cdc0488e88c" dependencies = [ "cfg-if", "fnv", @@ -2410,7 +2487,7 @@ dependencies = [ "bytes", "cfg-if", "cmake", - "heck 0.4.0", + "heck", "itertools", "lazy_static", "log", @@ -2456,7 +2533,7 @@ dependencies = [ "base64", "bstr", "bytes", - "clap 3.2.16", + "clap", "futures", "git-version", "hashbrown", @@ -2479,7 +2556,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", - "sha2 0.10.2", + "sha2 0.10.6", "socket2", "thiserror", "tokio", @@ -2537,9 +2614,9 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" +checksum = 
"ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ "getrandom", ] @@ -2667,9 +2744,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.11" +version = "0.11.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b75aa69a3f06bbcc66ede33af2af253c6f7a86b1ca0033f60c580a27074fbf92" +checksum = "431949c384f4e2ae07605ccaa56d1d9d2ecdb5cadd4f9577ccfab29f2e5149fc" dependencies = [ "base64", "bytes", @@ -2683,9 +2760,9 @@ dependencies = [ "hyper-rustls", "ipnet", "js-sys", - "lazy_static", "log", "mime", + "once_cell", "percent-encoding", "pin-project-lite", "rustls", @@ -2706,9 +2783,9 @@ dependencies = [ [[package]] name = "rgb" -version = "0.8.33" +version = "0.8.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3b221de559e4a29df3b957eec92bc0de6bc8eaf6ca9cfed43e5e1d67ff65a34" +checksum = "3603b7d71ca82644f79b5a06d1220e9a58ede60bd32255f698cb1af8838b8db3" dependencies = [ "bytemuck", ] @@ -2895,7 +2972,7 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "semver 1.0.13", + "semver 1.0.14", ] [[package]] @@ -2957,7 +3034,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "clap 3.2.16", + "clap", "const_format", "crc32c", "daemonize", @@ -3027,6 +3104,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "scratch" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8132065adcfd6e02db789d9285a0deb2f3fcb04002865ab67d5fb103533898" + [[package]] name = "sct" version = "0.7.0" @@ -3039,9 +3122,9 @@ dependencies = [ [[package]] name = "security-framework" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dc14f172faf8a0194a3aded622712b0de276821addc574fa54fc0a1167e10dc" +checksum = "2bc1bb97804af6631813c55739f771071e0f2ed33ee20b68c86ec505d906356c" dependencies = [ "bitflags", "core-foundation", @@ -3071,9 +3154,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f6841e709003d68bb2deee8c343572bf446003ec20a583e76f7b15cebf3711" +checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" [[package]] name = "semver-parser" @@ -3083,28 +3166,18 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" [[package]] name = "serde" -version = "1.0.142" +version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e590c437916fb6b221e1d00df6e3294f3fccd70ca7e92541c475d6ed6ef5fee2" +checksum = "728eb6351430bccb993660dfffc5a72f91ccc1295abaa8ce19b27ebe4f75568b" dependencies = [ "serde_derive", ] -[[package]] -name = "serde_cbor" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" -dependencies = [ - "half", - "serde", -] - [[package]] name = "serde_derive" -version = "1.0.142" +version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34b5b8d809babe02f538c2cfec6f2c1ed10804c0e5a6a041a049a4f5588ccc2e" +checksum = "81fa1584d3d1bcacd84c277a0dfe21f5b0f6accf4a23d04d4c6d61f1af522b4c" dependencies = 
[ "proc-macro2", "quote", @@ -3113,11 +3186,11 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.83" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38dd04e3c8279e75b31ef29dbdceebfe5ad89f4d0937213c53f7d49d01b3d5a7" +checksum = "41feea4228a6f1cd09ec7a3593a682276702cd67b5273544757dae23c096f074" dependencies = [ - "itoa 1.0.3", + "itoa", "ryu", "serde", ] @@ -3129,26 +3202,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" dependencies = [ "form_urlencoded", - "itoa 1.0.3", + "itoa", "ryu", "serde", ] [[package]] name = "serde_with" -version = "1.14.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678b5a069e50bf00ecd22d0cd8ddf7c236f68581b03db652061ed5eb13a312ff" +checksum = "368f2d60d049ea019a84dcd6687b0d1e0030fe663ae105039bdf967ed5e6a9a7" dependencies = [ + "base64", + "chrono", + "hex", + "indexmap", "serde", + "serde_json", "serde_with_macros", + "time 0.3.15", ] [[package]] name = "serde_with_macros" -version = "1.5.2" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e182d6ec6f05393cc0e5ed1bf81ad6db3a8feedf8ee515ecdd369809bcce8082" +checksum = "1ccadfacf6cf10faad22bbadf55986bdd0856edfb5d9210aa1dcf1f516e84e93" dependencies = [ "darling", "proc-macro2", @@ -3171,13 +3250,13 @@ dependencies = [ [[package]] name = "sha2" -version = "0.10.2" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55deaec60f81eefe3cce0dc50bda92d6d8e88f2a27df7c5033b42afeb1ed2676" +checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" dependencies = [ "cfg-if", "cpufeatures", - "digest 0.10.3", + "digest 0.10.5", ] [[package]] @@ -3234,7 +3313,7 @@ dependencies = [ "num-bigint", "num-traits", "thiserror", - "time 0.3.12", + "time 0.3.15", ] [[package]] @@ -3254,15 +3333,15 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "socket2" -version = "0.4.4" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66d72b759436ae32898a2af0a14218dbf55efde3feeb170eb623637db85ee1e0" +checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" dependencies = [ "libc", "winapi", @@ -3313,17 +3392,17 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "strum" -version = "0.23.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb" +checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" [[package]] name = "strum_macros" -version = "0.23.1" +version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38" +checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" dependencies = [ - "heck 0.3.3", + "heck", "proc-macro2", "quote", "rustversion", @@ -3361,9 +3440,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.99" +version = "1.0.102" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13" +checksum = "3fcd952facd492f9be3ef0d0b7032a6e442ee9b361d4acc2b1d0c4aaa5f613a1" dependencies = [ "proc-macro2", "quote", @@ -3424,33 +3503,24 @@ dependencies = [ [[package]] name = "textwrap" -version = "0.11.0" +version = "0.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "unicode-width", -] - -[[package]] -name = "textwrap" -version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" +checksum = "949517c0cf1bf4ee812e2e07e08ab448e3ae0d23472aee8a06c985f0c8815b16" [[package]] name = "thiserror" -version = "1.0.32" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5f6586b7f764adc0231f4c79be7b920e766bb2f3e51b3661cdb263828f19994" +checksum = "10deb33631e3c9018b9baf9dcbbc4f737320d2b576bac10f6aefa048fa407e3e" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.32" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12bafc5b54507e0149cdf1b145a5d80ab80a90bcd9275df43d4fff68460f6c21" +checksum = "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" dependencies = [ "proc-macro2", "quote", @@ -3479,14 +3549,14 @@ dependencies = [ [[package]] name = "time" -version = "0.3.12" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74b7cc93fc23ba97fde84f7eea56c55d1ba183f495c6715defdfc7b9cb8c870f" +checksum = "d634a985c4d4238ec39cacaed2e7ae552fbd3c476b552c1deac3021b7d7eaf0c" dependencies = [ - "itoa 1.0.3", - "js-sys", + "itoa", "libc", "num_threads", + "serde", "time-macros", ] @@ -3523,9 +3593,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.20.1" +version = "1.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a8325f63a7d4774dd041e363b2409ed1c5cbbd0f867795e661df066b2b0a581" +checksum = "0020c875007ad96677dcc890298f4b942882c5d4eb7cc8f439fc3bf813dc9c95" dependencies = [ "autocfg", "bytes", @@ -3621,9 +3691,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.9" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df54d54117d6fdc4e4fea40fe1e4e566b3505700e148a6827e59b34b0d2600d9" +checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" dependencies = [ "futures-core", "pin-project-lite", @@ -3632,9 +3702,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc463cd8deddc3770d20f9852143d50bf6094e640b485cb2e189a2099085ff45" +checksum = "0bb2e075f03b3d66d8d8785356224ba688d2906a371015e225beeb65ca92c740" dependencies = [ "bytes", "futures-core", @@ -3655,14 +3725,13 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.13.4" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744e9ed5b352340aa47ce033716991b5589e23781acb97cad37d4ea70560f55b" +checksum = "5376256e44f2443f8896ac012507c19a012df0fe8758b55246ae51a2279db51f" dependencies = [ "combine", "indexmap", "itertools", - "kstring", "serde", ] @@ -3752,9 +3821,9 
@@ dependencies = [ [[package]] name = "tower-layer" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343bc9466d3fe6b0f960ef45960509f84480bf4fd96f92901afe7ff3df9d3a62" +checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" [[package]] name = "tower-service" @@ -3764,9 +3833,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.36" +version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ "cfg-if", "log", @@ -3777,9 +3846,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11c75893af559bc8e10716548bdef5cb2b983f8e637db9d0e15126b61b484ee2" +checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", @@ -3788,9 +3857,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.29" +version = "0.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7" +checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" dependencies = [ "once_cell", "valuable", @@ -3819,13 +3888,13 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.11" +version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bc28f93baff38037f64e6f43d34cfa1605f27a49c34e8a04c5e78b0babf2596" +checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70" dependencies = [ - "ansi_term", - "lazy_static", "matchers", + "nu-ansi-term", + "once_cell", "regex", "sharded-slab", "smallvec", @@ -3855,36 +3924,30 @@ checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" [[package]] name = "unicode-ident" -version = "1.0.3" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4f5b37a154999a8f3f98cc23a628d850e154479cd94decf3414696e12e31aaf" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" [[package]] name = "unicode-normalization" -version = "0.1.21" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854cbdc4f7bc6ae19c820d44abdc3277ac3e1b2b93db20a636825d9322fb60e6" +checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" dependencies = [ "tinyvec", ] -[[package]] -name = "unicode-segmentation" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" - [[package]] name = "unicode-width" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" [[package]] name = "unicode-xid" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "957e51f3646910546462e67d5f7599b9e4fb8acdd304b087a6494730f9eebf04" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" 
[[package]] name = "untrusted" @@ -3894,13 +3957,12 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "url" -version = "2.2.2" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" +checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" dependencies = [ "form_urlencoded", "idna", - "matches", "percent-encoding", ] @@ -3920,7 +3982,7 @@ dependencies = [ "hyper", "jsonwebtoken", "metrics", - "nix", + "nix 0.25.0", "once_cell", "pin-project-lite", "postgres", @@ -3997,7 +4059,7 @@ name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 3.2.16", + "clap", "env_logger", "log", "once_cell", @@ -4042,9 +4104,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.82" +version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc7652e3f6c4706c8d9cd54832c4a4ccb9b5336e2c3bd154d5cccfbf1c1f5f7d" +checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -4052,9 +4114,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.82" +version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "662cd44805586bd52971b9586b1df85cdbbd9112e4ef4d8f41559c334dc6ac3f" +checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142" dependencies = [ "bumpalo", "log", @@ -4067,9 +4129,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.32" +version = "0.4.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa76fb221a1f8acddf5b54ace85912606980ad661ac7a503b4570ffd3a624dad" +checksum = "23639446165ca5a5de86ae1d8896b737ae80319560fbaa4c2887b7da6e7ebd7d" dependencies = [ "cfg-if", "js-sys", @@ -4079,9 +4141,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.82" +version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b260f13d3012071dfb1512849c033b1925038373aea48ced3012c09df952c602" +checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -4089,9 +4151,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.82" +version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5be8e654bdd9b79216c2929ab90721aa82faf65c48cdf08bdc4e7f51357b80da" +checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" dependencies = [ "proc-macro2", "quote", @@ -4102,15 +4164,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.82" +version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6598dd0bd3c7d51095ff6531a5b23e02acdc81804e30d8f07afb77b7215a140a" +checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f" [[package]] name = "web-sys" -version = "0.3.59" +version = "0.3.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed055ab27f941423197eb86b2035720b1a3ce40504df082cac2ecc6ed73335a1" +checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f" dependencies = [ "js-sys", "wasm-bindgen", @@ -4128,22 +4190,22 @@ dependencies = [ [[package]] name = "webpki-roots" -version = 
"0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1c760f0d366a6c24a02ed7816e23e691f5d92291f94d15e836006fd11b04daf" +checksum = "368bfe657969fb01238bb756d351dcade285e0f6fcbd36dcb23359a5169975be" dependencies = [ "webpki", ] [[package]] name = "which" -version = "4.2.5" +version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c4fb54e6113b6a8772ee41c3404fb0301ac79604489467e0a9ce1f3e97c24ae" +checksum = "1c831fbbee9e129a8cf93e7747a82da9d95ba8e16621cae60ec2cdc849bacb7b" dependencies = [ "either", - "lazy_static", "libc", + "once_cell", ] [[package]] @@ -4235,18 +4297,16 @@ version = "0.1.0" dependencies = [ "ahash", "anyhow", - "bstr", "bytes", "chrono", + "clap", "crossbeam-utils", "either", "fail", "hashbrown", "indexmap", - "itoa 0.4.8", "libc", "log", - "memchr", "nom", "num-bigint", "num-integer", @@ -4254,13 +4314,12 @@ dependencies = [ "prost", "rand", "regex", - "regex-automata", "regex-syntax", "scopeguard", "serde", "stable_deref_trait", "syn", - "time 0.3.12", + "time 0.3.15", "tokio", "tokio-util", "tracing", @@ -4283,7 +4342,7 @@ dependencies = [ "oid-registry", "rusticata-macros", "thiserror", - "time 0.3.12", + "time 0.3.15", ] [[package]] diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index ee8481e141..690b63613a 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -5,18 +5,18 @@ edition = "2021" [dependencies] clap = "3.0" -comfy-table = "5.0.1" +comfy-table = "6.1" git-version = "0.3.5" tar = "0.4.38" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } serde = { version = "1.0", features = ["derive"] } -serde_with = "1.12.0" +serde_with = "2.0" toml = "0.5" once_cell = "1.13.0" regex = "1" anyhow = "1.0" thiserror = "1" -nix = "0.23" +nix = "0.25" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } # Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api diff --git a/libs/etcd_broker/Cargo.toml b/libs/etcd_broker/Cargo.toml index f7bfbad4ba..b18dcbe5a3 100644 --- a/libs/etcd_broker/Cargo.toml +++ b/libs/etcd_broker/Cargo.toml @@ -8,7 +8,7 @@ regex = "1.4.5" serde = { version = "1.0", features = ["derive"] } serde_json = "1" - serde_with = "1.12.0" + serde_with = "2.0" once_cell = "1.13.0" utils = { path = "../utils" } diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index be8762100c..5995325a2f 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [dependencies] serde = { version = "1.0", features = ["derive"] } -serde_with = "1.12.0" +serde_with = "2.0" const_format = "0.2.21" utils = { path = "../utils" } diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index cec344a4ad..f54d91905c 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -15,7 +15,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1" tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] } tokio-util = { version = "0.7", features = ["io"] } -toml_edit = { version = "0.13", features = ["easy"] } +toml_edit = { version = "0.14", features = ["easy"] } tracing = "0.1.27" workspace_hack = { version = "0.1", path = 
"../../workspace_hack" } diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 852d643f30..15bdecd71d 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [dependencies] serde = { version = "1.0", features = ["derive"] } -serde_with = "1.12.0" +serde_with = "2.0" const_format = "0.2.21" utils = { path = "../utils" } diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 4119650b99..85c6439367 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -1,20 +1,23 @@ use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; + use utils::{ id::{NodeId, TenantId, TimelineId}, lsn::Lsn, }; +#[serde_as] #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { - #[serde(with = "serde_with::rust::display_fromstr")] + #[serde_as(as = "DisplayFromStr")] pub tenant_id: TenantId, - #[serde(with = "serde_with::rust::display_fromstr")] + #[serde_as(as = "DisplayFromStr")] pub timeline_id: TimelineId, pub peer_ids: Option>, pub pg_version: u32, pub system_id: Option, pub wal_seg_size: Option, - #[serde(with = "serde_with::rust::display_fromstr")] + #[serde_as(as = "DisplayFromStr")] pub commit_lsn: Lsn, // If not passed, it is assigned to the beginning of commit_lsn segment. pub local_start_lsn: Option, diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index ef2aa8b305..a7baddada4 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -20,7 +20,7 @@ tokio = { version = "1.17", features = ["macros"]} tokio-rustls = "0.23" tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } -nix = "0.23.0" +nix = "0.25" signal-hook = "0.3.10" rand = "0.8.3" jsonwebtoken = "8" @@ -28,7 +28,7 @@ hex = { version = "0.4.3", features = ["serde"] } rustls = "0.20.2" rustls-split = "0.3.0" git-version = "0.3.5" -serde_with = "1.12.0" +serde_with = "2.0" once_cell = "1.13.0" @@ -40,7 +40,7 @@ byteorder = "1.4.3" bytes = "1.0.1" hex-literal = "0.3" tempfile = "3.2" -criterion = "0.3" +criterion = "0.4" rustls-pemfile = "1" [[bench]] diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 88430f3a86..ea0cf3f18a 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -38,18 +38,18 @@ tar = "0.4.33" humantime = "2.1.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1" -serde_with = "1.12.0" +serde_with = "2.0" humantime-serde = "1.1.1" pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true } -toml_edit = { version = "0.13", features = ["easy"] } +toml_edit = { version = "0.14", features = ["easy"] } scopeguard = "1.1.0" const_format = "0.2.21" tracing = "0.1.36" signal-hook = "0.3.10" url = "2" -nix = "0.23" +nix = "0.25" once_cell = "1.13.0" crossbeam-utils = "0.8.5" fail = "0.5.0" diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index cb1cecade9..ddc3956d74 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -22,14 +22,14 @@ humantime = "2.1.0" url = "2.2.2" signal-hook = "0.3.10" serde = { version = "1.0", features = ["derive"] } -serde_with = "1.12.0" +serde_with = "2.0" hex = "0.4.3" const_format = "0.2.21" tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } git-version = "0.3.5" async-trait = "0.1" once_cell = "1.13.0" -toml_edit = { version = "0.13", features = ["easy"] } 
+toml_edit = { version = "0.14", features = ["easy"] } thiserror = "1" parking_lot = "0.12.1" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 6977665c7d..af055ed9a4 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -16,18 +16,16 @@ publish = false [dependencies] ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } -bstr = { version = "0.2", features = ["lazy_static", "regex-automata", "serde", "serde1", "serde1-nostd", "std", "unicode"] } bytes = { version = "1", features = ["serde", "std"] } -chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } +chrono = { version = "0.4", features = ["clock", "iana-time-zone", "js-sys", "oldtime", "serde", "std", "time", "wasm-bindgen", "wasmbind", "winapi"] } +clap = { version = "3", features = ["atty", "color", "std", "strsim", "suggestions", "termcolor"] } crossbeam-utils = { version = "0.8", features = ["once_cell", "std"] } either = { version = "1", features = ["use_std"] } fail = { version = "0.5", default-features = false, features = ["failpoints"] } hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } -itoa = { version = "0.4", features = ["i128", "std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } -memchr = { version = "2", features = ["std", "use_std"] } nom = { version = "7", features = ["alloc", "std"] } num-bigint = { version = "0.4", features = ["std"] } num-integer = { version = "0.1", default-features = false, features = ["i128", "std"] } @@ -35,7 +33,6 @@ num-traits = { version = "0.2", features = ["i128", "libm", "std"] } prost = { version = "0.10", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } -regex-automata = { version = "0.1", features = ["regex-syntax", "std"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } @@ -51,12 +48,12 @@ uuid = { version = "0.8", features = ["getrandom", "serde", "std", "v4"] } ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } +clap = { version = "3", features = ["atty", "color", "std", "strsim", "suggestions", "termcolor"] } either = { version = "1", features = ["use_std"] } hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } -memchr = { version = "2", features = ["std", "use_std"] } nom = { version = "7", features = ["alloc", "std"] } prost = { version = "0.10", features 
= ["prost-derive", "std"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } From 5d6553d41d383459ef5fb9ebfc1199faed978ebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Sun, 16 Oct 2022 14:37:10 +0300 Subject: [PATCH 0909/1022] Fix pageserver configuration generation bug (#2584) * We had an issue with `lineinfile` usage for pageserver configuration file: if the S3 bucket related values were changed, it would have resulted in duplicate keys, resulting in invalid toml. So to fix the issue, we should keep the configuration in structured format (yaml in this case) so we can always generate syntactically correct toml. Inventories are converted to yaml just so that it's easier to maintain the configuration there. Another alternative would have been a separate variable files. * Keep the ansible collections dir, but locally installed collections should not be tracked. --- .github/ansible/.gitignore | 3 ++ .github/ansible/ansible.cfg | 1 + .github/ansible/collections/.keep | 0 .github/ansible/deploy.yaml | 35 ++++++++++++----- .github/ansible/neon-stress.hosts | 20 ---------- .github/ansible/neon-stress.hosts.yaml | 30 +++++++++++++++ .github/ansible/production.hosts | 20 ---------- .github/ansible/production.hosts.yaml | 31 +++++++++++++++ .github/ansible/staging.hosts | 25 ------------ .github/ansible/staging.hosts.yaml | 40 ++++++++++++++++++++ .github/ansible/templates/pageserver.toml.j2 | 1 + .github/workflows/build_and_test.yml | 6 +-- 12 files changed, 134 insertions(+), 78 deletions(-) create mode 100644 .github/ansible/collections/.keep delete mode 100644 .github/ansible/neon-stress.hosts create mode 100644 .github/ansible/neon-stress.hosts.yaml delete mode 100644 .github/ansible/production.hosts create mode 100644 .github/ansible/production.hosts.yaml delete mode 100644 .github/ansible/staging.hosts create mode 100644 .github/ansible/staging.hosts.yaml create mode 100644 .github/ansible/templates/pageserver.toml.j2 diff --git a/.github/ansible/.gitignore b/.github/ansible/.gitignore index 441d9a8b82..e3454fd43c 100644 --- a/.github/ansible/.gitignore +++ b/.github/ansible/.gitignore @@ -2,3 +2,6 @@ zenith_install.tar.gz .zenith_current_version neon_install.tar.gz .neon_current_version + +collections/* +!collections/.keep diff --git a/.github/ansible/ansible.cfg b/.github/ansible/ansible.cfg index 5818a64455..0497ee401d 100644 --- a/.github/ansible/ansible.cfg +++ b/.github/ansible/ansible.cfg @@ -3,6 +3,7 @@ localhost_warning = False host_key_checking = False timeout = 30 +collections_paths = ./collections [ssh_connection] ssh_args = -F ./ansible.ssh.cfg diff --git a/.github/ansible/collections/.keep b/.github/ansible/collections/.keep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index e206f9d5ba..bfd3fd123d 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -14,7 +14,8 @@ - safekeeper - name: inform about versions - debug: msg="Version to deploy - {{ current_version }}" + debug: + msg: "Version to deploy - {{ current_version }}" tags: - pageserver - safekeeper @@ -63,15 +64,29 @@ tags: - pageserver - - name: update remote storage (s3) config - lineinfile: - path: /storage/pageserver/data/pageserver.toml - line: "{{ item }}" - loop: - - 
"[remote_storage]" - - "bucket_name = '{{ bucket_name }}'" - - "bucket_region = '{{ bucket_region }}'" - - "prefix_in_bucket = '{{ inventory_hostname }}'" + - name: read the existing remote pageserver config + ansible.builtin.slurp: + src: /storage/pageserver/data/pageserver.toml + register: _remote_ps_config + tags: + - pageserver + + - name: parse the existing pageserver configuration + ansible.builtin.set_fact: + _existing_ps_config: "{{ _remote_ps_config['content'] | b64decode | sivel.toiletwater.from_toml }}" + tags: + - pageserver + + - name: construct the final pageserver configuration dict + ansible.builtin.set_fact: + pageserver_config: "{{ pageserver_config_stub | combine({'id': _existing_ps_config.id }) }}" + tags: + - pageserver + + - name: template the pageserver config + template: + src: templates/pageserver.toml.j2 + dest: /storage/pageserver/data/pageserver.toml become: true tags: - pageserver diff --git a/.github/ansible/neon-stress.hosts b/.github/ansible/neon-stress.hosts deleted file mode 100644 index c1bc8243f8..0000000000 --- a/.github/ansible/neon-stress.hosts +++ /dev/null @@ -1,20 +0,0 @@ -[pageservers] -neon-stress-ps-1 console_region_id=1 -neon-stress-ps-2 console_region_id=1 - -[safekeepers] -neon-stress-sk-1 console_region_id=1 -neon-stress-sk-2 console_region_id=1 -neon-stress-sk-3 console_region_id=1 - -[storage:children] -pageservers -safekeepers - -[storage:vars] -env_name = neon-stress -console_mgmt_base_url = http://neon-stress-console.local -bucket_name = neon-storage-ireland -bucket_region = eu-west-1 -etcd_endpoints = neon-stress-etcd.local:2379 -safekeeper_enable_s3_offload = false diff --git a/.github/ansible/neon-stress.hosts.yaml b/.github/ansible/neon-stress.hosts.yaml new file mode 100644 index 0000000000..d4c77e7ada --- /dev/null +++ b/.github/ansible/neon-stress.hosts.yaml @@ -0,0 +1,30 @@ +storage: + vars: + bucket_name: neon-storage-ireland + bucket_region: eu-west-1 + console_mgmt_base_url: http://neon-stress-console.local + env_name: neon-stress + etcd_endpoints: neon-stress-etcd.local:2379 + safekeeper_enable_s3_offload: 'false' + pageserver_config_stub: + pg_distrib_dir: /usr/local + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "{{ inventory_hostname }}" + + children: + pageservers: + hosts: + neon-stress-ps-1: + console_region_id: 1 + neon-stress-ps-2: + console_region_id: 1 + safekeepers: + hosts: + neon-stress-sk-1: + console_region_id: 1 + neon-stress-sk-2: + console_region_id: 1 + neon-stress-sk-3: + console_region_id: 1 diff --git a/.github/ansible/production.hosts b/.github/ansible/production.hosts deleted file mode 100644 index 364e8ed50e..0000000000 --- a/.github/ansible/production.hosts +++ /dev/null @@ -1,20 +0,0 @@ -[pageservers] -#zenith-1-ps-1 console_region_id=1 -zenith-1-ps-2 console_region_id=1 -zenith-1-ps-3 console_region_id=1 - -[safekeepers] -zenith-1-sk-1 console_region_id=1 -zenith-1-sk-2 console_region_id=1 -zenith-1-sk-3 console_region_id=1 - -[storage:children] -pageservers -safekeepers - -[storage:vars] -env_name = prod-1 -console_mgmt_base_url = http://console-release.local -bucket_name = zenith-storage-oregon -bucket_region = us-west-2 -etcd_endpoints = zenith-1-etcd.local:2379 diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml new file mode 100644 index 0000000000..c276ca3805 --- /dev/null +++ b/.github/ansible/production.hosts.yaml @@ -0,0 +1,31 @@ +--- +storage: + vars: + env_name: prod-1 + 
console_mgmt_base_url: http://console-release.local + bucket_name: zenith-storage-oregon + bucket_region: us-west-2 + etcd_endpoints: zenith-1-etcd.local:2379 + pageserver_config_stub: + pg_distrib_dir: /usr/local + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "{{ inventory_hostname }}" + + children: + pageservers: + hosts: + zenith-1-ps-2: + console_region_id: 1 + zenith-1-ps-3: + console_region_id: 1 + + safekeepers: + hosts: + zenith-1-sk-1: + console_region_id: 1 + zenith-1-sk-2: + console_region_id: 1 + zenith-1-sk-3: + console_region_id: 1 diff --git a/.github/ansible/staging.hosts b/.github/ansible/staging.hosts deleted file mode 100644 index f5accc188a..0000000000 --- a/.github/ansible/staging.hosts +++ /dev/null @@ -1,25 +0,0 @@ -[pageservers] -#zenith-us-stage-ps-1 console_region_id=27 -zenith-us-stage-ps-2 console_region_id=27 -zenith-us-stage-ps-3 console_region_id=27 -zenith-us-stage-ps-4 console_region_id=27 -zenith-us-stage-test-ps-1 console_region_id=28 - -[safekeepers] -zenith-us-stage-sk-4 console_region_id=27 -zenith-us-stage-sk-5 console_region_id=27 -zenith-us-stage-sk-6 console_region_id=27 -zenith-us-stage-test-sk-1 console_region_id=28 -zenith-us-stage-test-sk-2 console_region_id=28 -zenith-us-stage-test-sk-3 console_region_id=28 - -[storage:children] -pageservers -safekeepers - -[storage:vars] -env_name = us-stage -console_mgmt_base_url = http://console-staging.local -bucket_name = zenith-staging-storage-us-east-1 -bucket_region = us-east-1 -etcd_endpoints = zenith-us-stage-etcd.local:2379 diff --git a/.github/ansible/staging.hosts.yaml b/.github/ansible/staging.hosts.yaml new file mode 100644 index 0000000000..a3534ed5ce --- /dev/null +++ b/.github/ansible/staging.hosts.yaml @@ -0,0 +1,40 @@ +storage: + vars: + bucket_name: zenith-staging-storage-us-east-1 + bucket_region: us-east-1 + console_mgmt_base_url: http://console-staging.local + env_name: us-stage + etcd_endpoints: zenith-us-stage-etcd.local:2379 + pageserver_config_stub: + pg_distrib_dir: /usr/local + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "{{ inventory_hostname }}" + + children: + pageservers: + hosts: + zenith-us-stage-ps-2: + console_region_id: 27 + zenith-us-stage-ps-3: + console_region_id: 27 + zenith-us-stage-ps-4: + console_region_id: 27 + zenith-us-stage-test-ps-1: + console_region_id: 28 + + safekeepers: + hosts: + zenith-us-stage-sk-4: + console_region_id: 27 + zenith-us-stage-sk-5: + console_region_id: 27 + zenith-us-stage-sk-6: + console_region_id: 27 + zenith-us-stage-test-sk-1: + console_region_id: 28 + zenith-us-stage-test-sk-2: + console_region_id: 28 + zenith-us-stage-test-sk-3: + console_region_id: 28 diff --git a/.github/ansible/templates/pageserver.toml.j2 b/.github/ansible/templates/pageserver.toml.j2 new file mode 100644 index 0000000000..7b0857d5e0 --- /dev/null +++ b/.github/ansible/templates/pageserver.toml.j2 @@ -0,0 +1 @@ +{{ pageserver_config | sivel.toiletwater.to_toml }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 69b17113ed..e8d724581d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -712,7 +712,7 @@ jobs: - name: Setup ansible run: | export PATH="/root/.local/bin:$PATH" - pip install --progress-bar off --user ansible boto3 + pip install --progress-bar off --user ansible boto3 toml - name: Redeploy run: | @@ -734,8 +734,8 @@ jobs: chmod 0600 
ssh-key ssh-add ssh-key rm -f ssh-key ssh-key-cert.pub - - ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml rm -f neon_install.tar.gz .neon_current_version deploy-proxy: From c7093545794e5a1072fa358d1aa38c87031bce7e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 17 Oct 2022 12:21:04 +0300 Subject: [PATCH 0910/1022] Add layer sizes to index_part.json (#2582) This is the first step in verifying layer files. Next up on the road is hashing the files and verifying the hashes. The metadata additions do not require any migration. The idea is that the change is backward and forward-compatible with regard to `index_part.json` due to the softness of JSON schema and the deserialization options in use. New types added: - LayerFileMetadata for tracking the file metadata - starting with only the file size - in future hopefully a sha256 as well - IndexLayerMetadata, the serialized counterpart of LayerFileMetadata LayerFileMetadata needing to have all fields Option is a problem but that is not possible to handle without conflicting a lot more with other ongoing work. Co-authored-by: Kirill Bulatov --- pageserver/src/http/routes.rs | 2 +- pageserver/src/storage_sync.rs | 384 ++++++++++++++---- pageserver/src/storage_sync/delete.rs | 2 +- pageserver/src/storage_sync/download.rs | 267 +++++++----- pageserver/src/storage_sync/index.rs | 293 ++++++++++--- pageserver/src/storage_sync/upload.rs | 52 ++- pageserver/src/tenant/timeline.rs | 57 ++- pageserver/src/tenant_mgr.rs | 33 +- .../test_tenants_with_remote_storage.py | 246 +++++++++++ 9 files changed, 1057 insertions(+), 279 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 4d7339ec13..91a385bf77 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -386,7 +386,7 @@ async fn tenant_attach_handler(request: Request) -> Result, } return json_response(StatusCode::ACCEPTED, ()); } - // no tenant in the index, release the lock to make the potentially lengthy download opetation + // no tenant in the index, release the lock to make the potentially lengthy download operation drop(index_accessor); // download index parts for every tenant timeline diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index e8844baf5d..037fe76d7f 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -171,7 +171,7 @@ use self::{ use crate::{ config::PageServerConf, exponential_backoff, - storage_sync::index::RemoteIndex, + storage_sync::index::{LayerFileMetadata, RemoteIndex}, task_mgr, task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, @@ -193,7 +193,7 @@ static SYNC_QUEUE: OnceCell = OnceCell::new(); /// A timeline status to share with pageserver's sync counterpart, /// after comparing local and remote timeline state. -#[derive(Clone)] +#[derive(Clone, PartialEq, Eq)] pub enum LocalTimelineInitStatus { /// The timeline has every remote layer present locally. /// There could be some layers requiring uploading, @@ -316,7 +316,7 @@ impl SyncQueue { /// A task to run in the async download/upload loop. /// Limited by the number of retries, after certain threshold the failing task gets evicted and the timeline disabled. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq)] enum SyncTask { /// A checkpoint outcome with possible local file updates that need actualization in the remote storage. 
/// Not necessary more fresh than the one already uploaded. @@ -427,7 +427,7 @@ impl SyncTaskBatch { .extend(new_delete.data.deleted_layers.iter().cloned()); } if let Some(batch_upload) = &mut self.upload { - let not_deleted = |layer: &PathBuf| { + let not_deleted = |layer: &PathBuf, _: &mut LayerFileMetadata| { !new_delete.data.layers_to_delete.contains(layer) && !new_delete.data.deleted_layers.contains(layer) }; @@ -455,21 +455,35 @@ impl SyncTaskBatch { #[derive(Debug, Clone, PartialEq, Eq)] struct LayersUpload { /// Layer file path in the pageserver workdir, that were added for the corresponding checkpoint. - layers_to_upload: HashSet, + layers_to_upload: HashMap, /// Already uploaded layers. Used to store the data about the uploads between task retries /// and to record the data into the remote index after the task got completed or evicted. - uploaded_layers: HashSet, + uploaded_layers: HashMap, metadata: Option, } /// A timeline download task. /// Does not contain the file list to download, to allow other /// parts of the pageserer code to schedule the task -/// without using the remote index or any other ways to list the remote timleine files. +/// without using the remote index or any other ways to list the remote timeline files. /// Skips the files that are already downloaded. #[derive(Debug, Clone, PartialEq, Eq)] struct LayersDownload { layers_to_skip: HashSet, + + /// Paths which have been downloaded, and had their metadata verified or generated. + /// + /// Metadata generation happens when upgrading from past version of `IndexPart`. + gathered_metadata: HashMap, +} + +impl LayersDownload { + fn from_skipped_layers(layers_to_skip: HashSet) -> Self { + LayersDownload { + layers_to_skip, + gathered_metadata: HashMap::default(), + } + } } #[derive(Debug, Clone, PartialEq, Eq)] @@ -491,7 +505,7 @@ struct LayersDeletion { pub fn schedule_layer_upload( tenant_id: TenantId, timeline_id: TimelineId, - layers_to_upload: HashSet, + layers_to_upload: HashMap, metadata: Option, ) { let sync_queue = match SYNC_QUEUE.get() { @@ -508,7 +522,7 @@ pub fn schedule_layer_upload( }, SyncTask::upload(LayersUpload { layers_to_upload, - uploaded_layers: HashSet::new(), + uploaded_layers: HashMap::new(), metadata, }), ); @@ -566,21 +580,44 @@ pub fn schedule_layer_download(tenant_id: TenantId, timeline_id: TimelineId) { tenant_id, timeline_id, }, - SyncTask::download(LayersDownload { - layers_to_skip: HashSet::new(), - }), + SyncTask::download(LayersDownload::from_skipped_layers(HashSet::new())), ); debug!("Download task for tenant {tenant_id}, timeline {timeline_id} sent") } +/// Local existing timeline files +/// +/// Values of this type serve different meanings in different contexts. On startup, collected +/// timelines come with the full collected information and when signalling readyness to attach +/// after completed download. After the download the file information is no longer carried, because +/// it is already merged into [`RemoteTimeline`]. +#[derive(Debug)] +pub struct TimelineLocalFiles(TimelineMetadata, HashMap); + +impl TimelineLocalFiles { + pub fn metadata(&self) -> &TimelineMetadata { + &self.0 + } + + /// Called during startup, for all of the local files with full metadata. + pub(crate) fn collected( + metadata: TimelineMetadata, + timeline_files: HashMap, + ) -> TimelineLocalFiles { + TimelineLocalFiles(metadata, timeline_files) + } + + /// Called near the end of tenant initialization, to signal readyness to attach tenants. 
+ pub(crate) fn ready(metadata: TimelineMetadata) -> Self { + TimelineLocalFiles(metadata, HashMap::new()) + } +} + /// Launch a thread to perform remote storage sync tasks. /// See module docs for loop step description. pub fn spawn_storage_sync_task( conf: &'static PageServerConf, - local_timeline_files: HashMap< - TenantId, - HashMap)>, - >, + local_timeline_files: HashMap>, storage: GenericRemoteStorage, max_concurrent_timelines_sync: NonZeroUsize, max_sync_errors: NonZeroU32, @@ -738,7 +775,7 @@ async fn storage_sync_loop( tenant_entry .iter() .map(|(&id, entry)| { - (id, (entry.metadata.clone(), HashSet::new())) + (id, TimelineLocalFiles::ready(entry.metadata.clone())) }) .collect(), ), @@ -983,15 +1020,27 @@ async fn download_timeline_data( } DownloadedTimeline::Successful(mut download_data) => { match update_local_metadata(conf, sync_id, current_remote_timeline).await { - Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) { - Ok(()) => { - register_sync_status(sync_id, sync_start, TASK_NAME, Some(true)); - return DownloadStatus::Downloaded; - } - Err(e) => { - error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}"); - } - }, + Ok(()) => { + let mut g = index.write().await; + + match g.set_awaits_download(&sync_id, false) { + Ok(()) => { + let timeline = g + .timeline_entry_mut(&sync_id) + .expect("set_awaits_download verified existence"); + + timeline.merge_metadata_from_downloaded( + &download_data.data.gathered_metadata, + ); + + register_sync_status(sync_id, sync_start, TASK_NAME, Some(true)); + return DownloadStatus::Downloaded; + } + Err(e) => { + error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}"); + } + }; + } Err(e) => { error!("Failed to update local timeline metadata: {e:?}"); download_data.retries += 1; @@ -1194,11 +1243,18 @@ async fn update_remote_data( } if upload_failed { existing_entry.add_upload_failures( - uploaded_data.layers_to_upload.iter().cloned(), + uploaded_data + .layers_to_upload + .iter() + .map(|(k, v)| (k.to_owned(), v.to_owned())), ); } else { - existing_entry - .add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned()); + existing_entry.add_timeline_layers( + uploaded_data + .uploaded_layers + .iter() + .map(|(k, v)| (k.to_owned(), v.to_owned())), + ); } } RemoteDataUpdate::Delete(layers_to_remove) => { @@ -1218,11 +1274,19 @@ async fn update_remote_data( }; let mut new_remote_timeline = RemoteTimeline::new(new_metadata.clone()); if upload_failed { - new_remote_timeline - .add_upload_failures(uploaded_data.layers_to_upload.iter().cloned()); + new_remote_timeline.add_upload_failures( + uploaded_data + .layers_to_upload + .iter() + .map(|(k, v)| (k.to_owned(), v.to_owned())), + ); } else { - new_remote_timeline - .add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned()); + new_remote_timeline.add_timeline_layers( + uploaded_data + .uploaded_layers + .iter() + .map(|(k, v)| (k.to_owned(), v.to_owned())), + ); } index_accessor.add_timeline_entry(sync_id, new_remote_timeline.clone()); @@ -1270,13 +1334,14 @@ async fn validate_task_retries( fn schedule_first_sync_tasks( index: &mut RemoteTimelineIndex, sync_queue: &SyncQueue, - local_timeline_files: HashMap)>, + local_timeline_files: HashMap, ) -> TenantTimelineValues { let mut local_timeline_init_statuses = TenantTimelineValues::new(); let mut new_sync_tasks = VecDeque::with_capacity(local_timeline_files.len()); - for (sync_id, 
(local_metadata, local_files)) in local_timeline_files { + for (sync_id, local_timeline) in local_timeline_files { + let TimelineLocalFiles(local_metadata, local_files) = local_timeline; match index.timeline_entry_mut(&sync_id) { Some(remote_timeline) => { let (timeline_status, awaits_download) = compare_local_and_remote_timeline( @@ -1320,7 +1385,7 @@ fn schedule_first_sync_tasks( sync_id, SyncTask::upload(LayersUpload { layers_to_upload: local_files, - uploaded_layers: HashSet::new(), + uploaded_layers: HashMap::new(), metadata: Some(local_metadata.clone()), }), )); @@ -1347,20 +1412,46 @@ fn compare_local_and_remote_timeline( new_sync_tasks: &mut VecDeque<(TenantTimelineId, SyncTask)>, sync_id: TenantTimelineId, local_metadata: TimelineMetadata, - local_files: HashSet, + local_files: HashMap, remote_entry: &RemoteTimeline, ) -> (LocalTimelineInitStatus, bool) { let _entered = info_span!("compare_local_and_remote_timeline", sync_id = %sync_id).entered(); - let remote_files = remote_entry.stored_files(); + let needed_to_download_files = remote_entry + .stored_files() + .iter() + .filter_map(|(layer_file, remote_metadata)| { + if let Some(local_metadata) = local_files.get(layer_file) { + match (remote_metadata.file_size(), local_metadata.file_size()) { + (Some(x), Some(y)) if x == y => { None }, + (None, Some(_)) => { + // upgrading from an earlier IndexPart without metadata + None + }, + _ => { + // having to deal with other than (Some(x), Some(y)) where x != y here is a + // bummer, but see #2582 and #2610 for attempts and discussion. + warn!("Redownloading locally existing {layer_file:?} due to size mismatch, size on index: {:?}, on disk: {:?}", remote_metadata.file_size(), local_metadata.file_size()); + Some(layer_file) + }, + } + } else { + // doesn't exist locally + Some(layer_file) + } + }) + .collect::>(); - let number_of_layers_to_download = remote_files.difference(&local_files).count(); - let (initial_timeline_status, awaits_download) = if number_of_layers_to_download > 0 { + let (initial_timeline_status, awaits_download) = if !needed_to_download_files.is_empty() { new_sync_tasks.push_back(( sync_id, - SyncTask::download(LayersDownload { - layers_to_skip: local_files.clone(), - }), + SyncTask::download(LayersDownload::from_skipped_layers( + local_files + .keys() + .filter(|path| !needed_to_download_files.contains(path)) + .cloned() + .collect(), + )), )); info!("NeedsSync"); (LocalTimelineInitStatus::NeedsSync, true) @@ -1375,15 +1466,22 @@ fn compare_local_and_remote_timeline( }; let layers_to_upload = local_files - .difference(remote_files) - .cloned() - .collect::>(); + .iter() + .filter_map(|(local_file, metadata)| { + if !remote_entry.stored_files().contains_key(local_file) { + Some((local_file.to_owned(), metadata.to_owned())) + } else { + None + } + }) + .collect::>(); + if !layers_to_upload.is_empty() { new_sync_tasks.push_back(( sync_id, SyncTask::upload(LayersUpload { layers_to_upload, - uploaded_layers: HashSet::new(), + uploaded_layers: HashMap::new(), metadata: Some(local_metadata), }), )); @@ -1439,11 +1537,12 @@ mod test_utils { let timeline_path = harness.timeline_path(&timeline_id); fs::create_dir_all(&timeline_path).await?; - let mut layers_to_upload = HashSet::with_capacity(filenames.len()); + let mut layers_to_upload = HashMap::with_capacity(filenames.len()); for &file in filenames { let file_path = timeline_path.join(file); fs::write(&file_path, dummy_contents(file).into_bytes()).await?; - layers_to_upload.insert(file_path); + let metadata = 
LayerFileMetadata::new(file_path.metadata()?.len()); + layers_to_upload.insert(file_path, metadata); } fs::write( @@ -1454,7 +1553,7 @@ mod test_utils { Ok(LayersUpload { layers_to_upload, - uploaded_layers: HashSet::new(), + uploaded_layers: HashMap::new(), metadata: Some(metadata), }) } @@ -1509,12 +1608,13 @@ mod tests { assert!(sync_id_2 != sync_id_3); assert!(sync_id_3 != TEST_SYNC_ID); - let download_task = SyncTask::download(LayersDownload { - layers_to_skip: HashSet::from([PathBuf::from("sk")]), - }); + let download_task = + SyncTask::download(LayersDownload::from_skipped_layers(HashSet::from([ + PathBuf::from("sk"), + ]))); let upload_task = SyncTask::upload(LayersUpload { - layers_to_upload: HashSet::from([PathBuf::from("up")]), - uploaded_layers: HashSet::from([PathBuf::from("upl")]), + layers_to_upload: HashMap::from([(PathBuf::from("up"), LayerFileMetadata::new(123))]), + uploaded_layers: HashMap::from([(PathBuf::from("upl"), LayerFileMetadata::new(123))]), metadata: Some(dummy_metadata(Lsn(2))), }); let delete_task = SyncTask::delete(LayersDeletion { @@ -1558,12 +1658,10 @@ mod tests { let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); assert_eq!(sync_queue.len(), 0); - let download = LayersDownload { - layers_to_skip: HashSet::from([PathBuf::from("sk")]), - }; + let download = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk")])); let upload = LayersUpload { - layers_to_upload: HashSet::from([PathBuf::from("up")]), - uploaded_layers: HashSet::from([PathBuf::from("upl")]), + layers_to_upload: HashMap::from([(PathBuf::from("up"), LayerFileMetadata::new(123))]), + uploaded_layers: HashMap::from([(PathBuf::from("upl"), LayerFileMetadata::new(123))]), metadata: Some(dummy_metadata(Lsn(2))), }; let delete = LayersDeletion { @@ -1611,18 +1709,10 @@ mod tests { #[tokio::test] async fn same_task_id_same_tasks_batch() { let sync_queue = SyncQueue::new(NonZeroUsize::new(1).unwrap()); - let download_1 = LayersDownload { - layers_to_skip: HashSet::from([PathBuf::from("sk1")]), - }; - let download_2 = LayersDownload { - layers_to_skip: HashSet::from([PathBuf::from("sk2")]), - }; - let download_3 = LayersDownload { - layers_to_skip: HashSet::from([PathBuf::from("sk3")]), - }; - let download_4 = LayersDownload { - layers_to_skip: HashSet::from([PathBuf::from("sk4")]), - }; + let download_1 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk1")])); + let download_2 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk2")])); + let download_3 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk3")])); + let download_4 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk4")])); let sync_id_2 = TenantTimelineId { tenant_id: TenantId::from_array(hex!("22223344556677881122334455667788")), @@ -1646,15 +1736,15 @@ mod tests { Some(SyncTaskBatch { download: Some(SyncData { retries: 0, - data: LayersDownload { - layers_to_skip: { + data: LayersDownload::from_skipped_layers( + { let mut set = HashSet::new(); set.extend(download_1.layers_to_skip.into_iter()); set.extend(download_2.layers_to_skip.into_iter()); set.extend(download_4.layers_to_skip.into_iter()); set }, - } + ) }), upload: None, delete: None, @@ -1670,4 +1760,148 @@ mod tests { "Should have one task left out of the batch" ); } + + mod local_and_remote_comparisons { + use super::*; + + #[test] + fn ready() { + let mut new_sync_tasks = VecDeque::default(); + let sync_id = TenantTimelineId::generate(); + let local_metadata = 
dummy_metadata(0x02.into()); + let local_files = + HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]); + let mut remote_entry = RemoteTimeline::new(local_metadata.clone()); + remote_entry + .add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]); + + let (status, sync_needed) = compare_local_and_remote_timeline( + &mut new_sync_tasks, + sync_id, + local_metadata.clone(), + local_files, + &remote_entry, + ); + + assert_eq!( + status, + LocalTimelineInitStatus::LocallyComplete(local_metadata) + ); + assert!(!sync_needed); + + assert!(new_sync_tasks.is_empty(), "{:?}", new_sync_tasks); + } + + #[test] + fn needs_download() { + let mut new_sync_tasks = VecDeque::default(); + let sync_id = TenantTimelineId::generate(); + let local_metadata = dummy_metadata(0x02.into()); + let local_files = HashMap::default(); + let mut remote_entry = RemoteTimeline::new(local_metadata.clone()); + remote_entry + .add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]); + + let (status, sync_needed) = compare_local_and_remote_timeline( + &mut new_sync_tasks, + sync_id, + local_metadata, + local_files.clone(), + &remote_entry, + ); + + assert_eq!(status, LocalTimelineInitStatus::NeedsSync); + assert!(sync_needed); + + let new_sync_tasks = new_sync_tasks.into_iter().collect::>(); + + assert_eq!( + &new_sync_tasks, + &[( + sync_id, + SyncTask::download(LayersDownload::from_skipped_layers( + local_files.keys().cloned().collect() + )) + )] + ); + } + + #[test] + fn redownload_is_not_needed_on_upgrade() { + // originally the implementation missed the `(None, Some(_))` case in the match, and + // proceeded to always redownload if the remote metadata was not available. + + let mut new_sync_tasks = VecDeque::default(); + let sync_id = TenantTimelineId::generate(); + + let local_metadata = dummy_metadata(0x02.into()); + + // type system would in general allow that LayerFileMetadata would be created with + // file_size: None, however `LayerFileMetadata::default` is only allowed from tests, + // and so everywhere within the system valid LayerFileMetadata is being created, it is + // created through `::new`. + let local_files = + HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]); + + let mut remote_entry = RemoteTimeline::new(local_metadata.clone()); + + // RemoteTimeline is constructed out of an older version IndexPart, which didn't carry + // any metadata. 
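The rule these tests pin down can be stated as a tiny standalone function; `needs_redownload` is a hypothetical name, the real decision is made inline in `compare_local_and_remote_timeline`, which also emits the resulting sync tasks:

```rust
/// Hypothetical helper restating the per-layer comparison: `remote_size` is
/// `None` when index_part.json predates layer metadata, `local_size` is `None`
/// when the layer file is not present on local disk.
fn needs_redownload(remote_size: Option<u64>, local_size: Option<u64>) -> bool {
    match (remote_size, local_size) {
        // Both sizes are known and agree: keep the local file.
        (Some(remote), Some(local)) if remote == local => false,
        // Remote index predates layer metadata: trust the existing local file
        // instead of redownloading everything right after an upgrade.
        (None, Some(_)) => false,
        // Size mismatch, or the file is missing locally: fetch it again.
        _ => true,
    }
}

fn main() {
    assert!(!needs_redownload(Some(123), Some(123)));
    assert!(!needs_redownload(None, Some(123))); // the upgrade case tested above
    assert!(needs_redownload(Some(123), Some(42))); // truncated or corrupt local copy
    assert!(needs_redownload(Some(123), None)); // not present locally at all
}
```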
+ remote_entry + .add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::default())]); + + let (status, sync_needed) = compare_local_and_remote_timeline( + &mut new_sync_tasks, + sync_id, + local_metadata.clone(), + local_files, + &remote_entry, + ); + + assert_eq!( + status, + LocalTimelineInitStatus::LocallyComplete(local_metadata) + ); + assert!(!sync_needed); + } + + #[test] + fn needs_upload() { + let mut new_sync_tasks = VecDeque::default(); + let sync_id = TenantTimelineId::generate(); + let local_metadata = dummy_metadata(0x02.into()); + let local_files = + HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]); + let mut remote_entry = RemoteTimeline::new(local_metadata.clone()); + remote_entry.add_timeline_layers([]); + + let (status, sync_needed) = compare_local_and_remote_timeline( + &mut new_sync_tasks, + sync_id, + local_metadata.clone(), + local_files.clone(), + &remote_entry, + ); + + assert_eq!( + status, + LocalTimelineInitStatus::LocallyComplete(local_metadata.clone()) + ); + assert!(!sync_needed); + + let new_sync_tasks = new_sync_tasks.into_iter().collect::>(); + + assert_eq!( + &new_sync_tasks, + &[( + sync_id, + SyncTask::upload(LayersUpload { + layers_to_upload: local_files, + uploaded_layers: HashMap::default(), + metadata: Some(local_metadata), + }) + )] + ); + } + } } diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 21a3372e70..39846f0da3 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -171,7 +171,7 @@ mod tests { let local_timeline_path = harness.timeline_path(&TIMELINE_ID); let timeline_upload = create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; - for local_path in timeline_upload.layers_to_upload { + for (local_path, _metadata) in timeline_upload.layers_to_upload { let remote_path = local_storage.resolve_in_storage(&local_storage.remote_object_id(&local_path)?)?; let remote_parent_dir = remote_path.parent().unwrap(); diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 3e850443d8..61ef164f14 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -16,7 +16,11 @@ use tokio::{ }; use tracing::{debug, error, info, warn}; -use crate::{config::PageServerConf, storage_sync::SyncTask, TEMP_FILE_SUFFIX}; +use crate::{ + config::PageServerConf, + storage_sync::{index::LayerFileMetadata, SyncTask}, + TEMP_FILE_SUFFIX, +}; use utils::{ crashsafe_dir::path_with_suffix_extension, id::{TenantId, TenantTimelineId, TimelineId}, @@ -219,8 +223,14 @@ pub(super) async fn download_timeline_layers<'a>( let layers_to_download = remote_timeline .stored_files() - .difference(&download.layers_to_skip) - .cloned() + .iter() + .filter_map(|(layer_path, metadata)| { + if !download.layers_to_skip.contains(layer_path) { + Some((layer_path.to_owned(), metadata.to_owned())) + } else { + None + } + }) .collect::>(); debug!("Layers to download: {layers_to_download:?}"); @@ -233,89 +243,129 @@ pub(super) async fn download_timeline_layers<'a>( let mut download_tasks = layers_to_download .into_iter() - .map(|layer_destination_path| async move { - if layer_destination_path.exists() { - debug!( - "Layer already exists locally, skipping download: {}", - layer_destination_path.display() - ); - } else { - // Perform a rename inspired by durable_rename from file_utils.c. 
- // The sequence: - // write(tmp) - // fsync(tmp) - // rename(tmp, new) - // fsync(new) - // fsync(parent) - // For more context about durable_rename check this email from postgres mailing list: - // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com - // If pageserver crashes the temp file will be deleted on startup and re-downloaded. - let temp_file_path = - path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX); + .map(|(layer_destination_path, metadata)| async move { - let mut destination_file = - fs::File::create(&temp_file_path).await.with_context(|| { - format!( - "Failed to create a destination file for layer '{}'", - temp_file_path.display() - ) - })?; + match layer_destination_path.metadata() { + Ok(m) if m.is_file() => { + // the file exists from earlier round when we failed after renaming it as + // layer_destination_path + let verified = if let Some(expected) = metadata.file_size() { + m.len() == expected + } else { + // behaviour before recording metadata was to accept any existing + true + }; - let mut layer_download = storage.download_storage_object(None, &layer_destination_path) - .await - .with_context(|| { - format!( - "Failed to initiate the download the layer for {sync_id} into file '{}'", - temp_file_path.display() - ) - })?; - io::copy(&mut layer_download.download_stream, &mut destination_file) - .await - .with_context(|| { - format!( - "Failed to download the layer for {sync_id} into file '{}'", - temp_file_path.display() - ) - })?; - - // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: - // A file will not be closed immediately when it goes out of scope if there are any IO operations - // that have not yet completed. To ensure that a file is closed immediately when it is dropped, - // you should call flush before dropping it. - // - // From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because - // we assume that `destination_file` file is fully written. I.e there is no pending .write(...).await operations. - // But for additional safety let's check/wait for any pending operations. 
- destination_file.flush().await.with_context(|| { - format!( - "failed to flush source file at {}", - temp_file_path.display() - ) - })?; - - // not using sync_data because it can lose file size update - destination_file.sync_all().await.with_context(|| { - format!( - "failed to fsync source file at {}", - temp_file_path.display() - ) - })?; - drop(destination_file); - - fail::fail_point!("remote-storage-download-pre-rename", |_| { - anyhow::bail!("remote-storage-download-pre-rename failpoint triggered") - }); - - fs::rename(&temp_file_path, &layer_destination_path).await?; - - fsync_path(&layer_destination_path).await.with_context(|| { - format!( - "Cannot fsync layer destination path {}", - layer_destination_path.display(), - ) - })?; + if verified { + debug!( + "Layer already exists locally, skipping download: {}", + layer_destination_path.display() + ); + return Ok((layer_destination_path, LayerFileMetadata::new(m.len()))) + } else { + // no need to remove it, it will be overwritten by fs::rename + // after successful download + warn!("Downloaded layer exists already but layer file metadata mismatches: {}, metadata {:?}", layer_destination_path.display(), metadata); + } + } + Ok(m) => { + return Err(anyhow::anyhow!("Downloaded layer destination exists but is not a file: {m:?}, target needs to be removed/archived manually: {layer_destination_path:?}")); + } + Err(_) => { + // behave as the file didn't exist + } } - Ok::<_, anyhow::Error>(layer_destination_path) + + // Perform a rename inspired by durable_rename from file_utils.c. + // The sequence: + // write(tmp) + // fsync(tmp) + // rename(tmp, new) + // fsync(new) + // fsync(parent) + // For more context about durable_rename check this email from postgres mailing list: + // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com + // If pageserver crashes the temp file will be deleted on startup and re-downloaded. + let temp_file_path = + path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX); + + // TODO: this doesn't use the cached fd for some reason? + let mut destination_file = + fs::File::create(&temp_file_path).await.with_context(|| { + format!( + "Failed to create a destination file for layer '{}'", + temp_file_path.display() + ) + })?; + + let mut layer_download = storage.download_storage_object(None, &layer_destination_path) + .await + .with_context(|| { + format!( + "Failed to initiate the download the layer for {sync_id} into file '{}'", + temp_file_path.display() + ) + })?; + + let bytes_amount = io::copy(&mut layer_download.download_stream, &mut destination_file) + .await + .with_context(|| { + format!( + "Failed to download the layer for {sync_id} into file '{}'", + temp_file_path.display() + ) + })?; + + // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: + // A file will not be closed immediately when it goes out of scope if there are any IO operations + // that have not yet completed. To ensure that a file is closed immediately when it is dropped, + // you should call flush before dropping it. + // + // From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because + // we assume that `destination_file` file is fully written. I.e there is no pending .write(...).await operations. + // But for additional safety let's check/wait for any pending operations. 
+ destination_file.flush().await.with_context(|| { + format!( + "failed to flush source file at {}", + temp_file_path.display() + ) + })?; + + match metadata.file_size() { + Some(expected) if expected != bytes_amount => { + anyhow::bail!( + "According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'", + temp_file_path.display() + ); + }, + Some(_) | None => { + // matches, or upgrading from an earlier IndexPart version + } + } + + // not using sync_data because it can lose file size update + destination_file.sync_all().await.with_context(|| { + format!( + "failed to fsync source file at {}", + temp_file_path.display() + ) + })?; + drop(destination_file); + + fail::fail_point!("remote-storage-download-pre-rename", |_| { + anyhow::bail!("remote-storage-download-pre-rename failpoint triggered") + }); + + fs::rename(&temp_file_path, &layer_destination_path).await?; + + fsync_path(&layer_destination_path).await.with_context(|| { + format!( + "Cannot fsync layer destination path {}", + layer_destination_path.display(), + ) + })?; + + Ok::<_, anyhow::Error>((layer_destination_path, LayerFileMetadata::new(bytes_amount))) }) .collect::>(); @@ -324,9 +374,12 @@ pub(super) async fn download_timeline_layers<'a>( let mut undo = HashSet::new(); while let Some(download_result) = download_tasks.next().await { match download_result { - Ok(downloaded_path) => { + Ok((downloaded_path, metadata)) => { undo.insert(downloaded_path.clone()); - download.layers_to_skip.insert(downloaded_path); + download.layers_to_skip.insert(downloaded_path.clone()); + // what if the key existed already? ignore, because then we would had + // downloaded a partial file, and had to retry + download.gathered_metadata.insert(downloaded_path, metadata); } Err(e) => { errors_happened = true; @@ -349,6 +402,8 @@ pub(super) async fn download_timeline_layers<'a>( ); for item in undo { download.layers_to_skip.remove(&item); + // intentionally don't clear the gathered_metadata because it exists for fsync_path + // failure on parent directory } errors_happened = true; } @@ -453,9 +508,9 @@ mod tests { let timeline_upload = create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; - for local_path in timeline_upload.layers_to_upload { + for local_path in timeline_upload.layers_to_upload.keys() { let remote_path = - local_storage.resolve_in_storage(&storage.remote_object_id(&local_path)?)?; + local_storage.resolve_in_storage(&storage.remote_object_id(local_path)?)?; let remote_parent_dir = remote_path.parent().unwrap(); if !remote_parent_dir.exists() { fs::create_dir_all(&remote_parent_dir).await?; @@ -473,11 +528,19 @@ mod tests { let mut remote_timeline = RemoteTimeline::new(metadata.clone()); remote_timeline.awaits_download = true; - remote_timeline.add_timeline_layers( - layer_files - .iter() - .map(|layer| local_timeline_path.join(layer)), - ); + remote_timeline.add_timeline_layers(layer_files.iter().map(|layer| { + let layer_path = local_timeline_path.join(layer); + + // this could had also been LayerFileMetadata::default(), but since in this test we + // don't do the merge operation done by storage_sync::download_timeline_data, it would + // not be merged back to timeline. 
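Condensed, the download path added above boils down to the following verify-then-durable-rename sequence. This is a blocking `std::fs` sketch rather than the real tokio code, and the remote stream is faked with an in-memory slice:

```rust
use std::fs::{self, File};
use std::io::{self, Read};
use std::path::Path;

/// Condensed stand-in for the download path: write into a temp file, verify
/// the byte count against the recorded layer metadata, then perform the
/// durable_rename sequence (fsync tmp, rename, fsync new, fsync parent).
/// The directory fsync works on Unix, matching `fsync_path` above.
fn download_verified(
    mut remote: impl Read,
    destination: &Path,
    expected_size: Option<u64>,
) -> io::Result<u64> {
    let temp_path = destination.with_extension("temp_download");

    let mut temp_file = File::create(&temp_path)?;
    let bytes_written = io::copy(&mut remote, &mut temp_file)?;

    if let Some(expected) = expected_size {
        if expected != bytes_written {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("expected {expected} bytes, downloaded {bytes_written}"),
            ));
        }
    }

    temp_file.sync_all()?; // fsync(tmp)
    drop(temp_file);

    fs::rename(&temp_path, destination)?; // rename(tmp, new)
    File::open(destination)?.sync_all()?; // fsync(new)
    if let Some(parent) = destination.parent() {
        File::open(parent)?.sync_all()?; // fsync(parent), so the rename survives a crash
    }

    Ok(bytes_written)
}

fn main() -> io::Result<()> {
    let dir = std::env::temp_dir().join("layer_download_sketch");
    fs::create_dir_all(&dir)?;

    let payload = b"layer file contents";
    let written =
        download_verified(&payload[..], &dir.join("layer_1"), Some(payload.len() as u64))?;
    assert_eq!(written, payload.len() as u64);
    Ok(())
}
```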
+ let metadata_from_upload = timeline_upload + .layers_to_upload + .get(&layer_path) + .expect("layer must exist in previously uploaded paths") + .to_owned(); + (layer_path, metadata_from_upload) + })); let download_data = match download_timeline_layers( harness.conf, @@ -487,9 +550,9 @@ mod tests { sync_id, SyncData::new( current_retries, - LayersDownload { - layers_to_skip: HashSet::from([local_timeline_path.join("layer_to_skip")]), - }, + LayersDownload::from_skipped_layers(HashSet::from([ + local_timeline_path.join("layer_to_skip") + ])), ), ) .await @@ -552,12 +615,7 @@ mod tests { &sync_queue, None, sync_id, - SyncData::new( - 0, - LayersDownload { - layers_to_skip: HashSet::new(), - }, - ), + SyncData::new(0, LayersDownload::from_skipped_layers(HashSet::new())), ) .await; assert!( @@ -576,12 +634,7 @@ mod tests { &sync_queue, Some(¬_expecting_download_remote_timeline), sync_id, - SyncData::new( - 0, - LayersDownload { - layers_to_skip: HashSet::new(), - }, - ), + SyncData::new(0, LayersDownload::from_skipped_layers(HashSet::new())), ) .await; assert!( diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index db37c7b411..0779d95e8e 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -212,8 +212,8 @@ impl RemoteTimelineIndex { /// Restored index part data about the timeline, stored in the remote index. #[derive(Debug, Clone)] pub struct RemoteTimeline { - timeline_layers: HashSet, - missing_layers: HashSet, + timeline_layers: HashMap, + missing_layers: HashMap, pub metadata: TimelineMetadata, pub awaits_download: bool, @@ -222,62 +222,161 @@ pub struct RemoteTimeline { impl RemoteTimeline { pub fn new(metadata: TimelineMetadata) -> Self { Self { - timeline_layers: HashSet::new(), - missing_layers: HashSet::new(), + timeline_layers: HashMap::default(), + missing_layers: HashMap::default(), metadata, awaits_download: false, } } - pub fn add_timeline_layers(&mut self, new_layers: impl IntoIterator) { - self.timeline_layers.extend(new_layers.into_iter()); + pub fn add_timeline_layers( + &mut self, + new_layers: impl IntoIterator, + ) { + self.timeline_layers.extend(new_layers); } - pub fn add_upload_failures(&mut self, upload_failures: impl IntoIterator) { - self.missing_layers.extend(upload_failures.into_iter()); + pub fn add_upload_failures( + &mut self, + upload_failures: impl IntoIterator, + ) { + self.missing_layers.extend(upload_failures); } pub fn remove_layers(&mut self, layers_to_remove: &HashSet) { self.timeline_layers - .retain(|layer| !layers_to_remove.contains(layer)); + .retain(|layer, _| !layers_to_remove.contains(layer)); self.missing_layers - .retain(|layer| !layers_to_remove.contains(layer)); + .retain(|layer, _| !layers_to_remove.contains(layer)); } /// Lists all layer files in the given remote timeline. Omits the metadata file. - pub fn stored_files(&self) -> &HashSet { + pub fn stored_files(&self) -> &HashMap { &self.timeline_layers } + /// Combines metadata gathered or verified during downloading needed layer files to metadata on + /// the [`RemoteIndex`], so it can be uploaded later. 
+ pub fn merge_metadata_from_downloaded( + &mut self, + downloaded: &HashMap, + ) { + downloaded.iter().for_each(|(path, metadata)| { + if let Some(upgraded) = self.timeline_layers.get_mut(path) { + upgraded.merge(metadata); + } + }); + } + pub fn from_index_part(timeline_path: &Path, index_part: IndexPart) -> anyhow::Result { let metadata = TimelineMetadata::from_bytes(&index_part.metadata_bytes)?; + let default_metadata = &IndexLayerMetadata::default(); + + let find_metadata = |key: &RelativePath| -> LayerFileMetadata { + index_part + .layer_metadata + .get(key) + .unwrap_or(default_metadata) + .into() + }; + Ok(Self { - timeline_layers: to_local_paths(timeline_path, index_part.timeline_layers), - missing_layers: to_local_paths(timeline_path, index_part.missing_layers), + timeline_layers: index_part + .timeline_layers + .iter() + .map(|layer_path| (layer_path.as_path(timeline_path), find_metadata(layer_path))) + .collect(), + missing_layers: index_part + .missing_layers + .iter() + .map(|layer_path| (layer_path.as_path(timeline_path), find_metadata(layer_path))) + .collect(), metadata, awaits_download: false, }) } } +/// Metadata gathered for each of the layer files. +/// +/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which +/// might have less or more metadata depending if upgrading or rolling back an upgrade. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +#[cfg_attr(test, derive(Default))] +pub struct LayerFileMetadata { + file_size: Option, +} + +impl From<&'_ IndexLayerMetadata> for LayerFileMetadata { + fn from(other: &IndexLayerMetadata) -> Self { + LayerFileMetadata { + file_size: other.file_size, + } + } +} + +impl LayerFileMetadata { + pub fn new(file_size: u64) -> Self { + LayerFileMetadata { + file_size: Some(file_size), + } + } + + pub fn file_size(&self) -> Option { + self.file_size + } + + /// Metadata has holes due to version upgrades. This method is called to upgrade self with the + /// other value. + /// + /// This is called on the possibly outdated version. + pub fn merge(&mut self, other: &Self) { + self.file_size = other.file_size.or(self.file_size); + } +} + /// Part of the remote index, corresponding to a certain timeline. /// Contains the data about all files in the timeline, present remotely and its metadata. +/// +/// This type needs to be backwards and forwards compatible. When changing the fields, +/// remember to add a test case for the changed version. #[serde_as] #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub struct IndexPart { + /// Debugging aid describing the version of this type. + #[serde(default)] + version: usize, + + /// Each of the layers present on remote storage. + /// + /// Additional metadata can might exist in `layer_metadata`. timeline_layers: HashSet, + /// Currently is not really used in pageserver, /// present to manually keep track of the layer files that pageserver might never retrieve. /// /// Such "holes" might appear if any upload task was evicted on an error threshold: /// the this layer will only be rescheduled for upload on pageserver restart. missing_layers: HashSet, + + /// Per layer file metadata, which can be present for a present or missing layer file. + /// + /// Older versions of `IndexPart` will not have this property or have only a part of metadata + /// that latest version stores. 
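The `merge` rule above is just an `Option::or` biased towards the incoming value, so a size learned during download fills a gap left by an old `index_part.json`, but a `None` can never erase a size that is already known. A toy illustration:

```rust
/// Toy copy of the merge rule: prefer the incoming (freshly verified) value,
/// but never erase an already known size with a `None` from an old index.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ToyLayerMetadata {
    file_size: Option<u64>,
}

impl ToyLayerMetadata {
    fn merge(&mut self, other: &Self) {
        self.file_size = other.file_size.or(self.file_size);
    }
}

fn main() {
    // Upgrade path: the remote index knew nothing, the download measured 8192 bytes.
    let mut from_old_index = ToyLayerMetadata { file_size: None };
    from_old_index.merge(&ToyLayerMetadata { file_size: Some(8192) });
    assert_eq!(from_old_index.file_size, Some(8192));

    // A `None` on the incoming side (possible after a rollback) does not erase data.
    let mut already_known = ToyLayerMetadata { file_size: Some(8192) };
    already_known.merge(&ToyLayerMetadata { file_size: None });
    assert_eq!(already_known.file_size, Some(8192));
}
```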
+ #[serde(default)] + layer_metadata: HashMap, + #[serde_as(as = "DisplayFromStr")] disk_consistent_lsn: Lsn, metadata_bytes: Vec, } impl IndexPart { + /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be + /// used to understand later versions. + /// + /// Version is currently informative only. + const LATEST_VERSION: usize = 1; pub const FILE_NAME: &'static str = "index_part.json"; #[cfg(test)] @@ -288,8 +387,10 @@ impl IndexPart { metadata_bytes: Vec, ) -> Self { Self { + version: Self::LATEST_VERSION, timeline_layers, missing_layers, + layer_metadata: HashMap::default(), disk_consistent_lsn, metadata_bytes, } @@ -304,35 +405,68 @@ impl IndexPart { remote_timeline: RemoteTimeline, ) -> anyhow::Result { let metadata_bytes = remote_timeline.metadata.to_bytes()?; + + let mut layer_metadata = HashMap::new(); + + let mut missing_layers = HashSet::new(); + + separate_paths_and_metadata( + timeline_path, + &remote_timeline.missing_layers, + &mut missing_layers, + &mut layer_metadata, + ) + .context("Failed to convert missing layers' paths to relative ones")?; + + let mut timeline_layers = HashSet::new(); + + separate_paths_and_metadata( + timeline_path, + &remote_timeline.timeline_layers, + &mut timeline_layers, + &mut layer_metadata, + ) + .context("Failed to convert timeline layers' paths to relative ones")?; + Ok(Self { - timeline_layers: to_relative_paths(timeline_path, remote_timeline.timeline_layers) - .context("Failed to convert timeline layers' paths to relative ones")?, - missing_layers: to_relative_paths(timeline_path, remote_timeline.missing_layers) - .context("Failed to convert missing layers' paths to relative ones")?, + version: Self::LATEST_VERSION, + timeline_layers, + missing_layers, + layer_metadata, disk_consistent_lsn: remote_timeline.metadata.disk_consistent_lsn(), metadata_bytes, }) } } -fn to_local_paths( - timeline_path: &Path, - paths: impl IntoIterator, -) -> HashSet { - paths - .into_iter() - .map(|path| path.as_path(timeline_path)) - .collect() +/// Serialized form of [`LayerFileMetadata`]. 
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] +pub struct IndexLayerMetadata { + file_size: Option, } -fn to_relative_paths( +impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { + fn from(other: &'_ LayerFileMetadata) -> Self { + IndexLayerMetadata { + file_size: other.file_size, + } + } +} + +fn separate_paths_and_metadata( timeline_path: &Path, - paths: impl IntoIterator, -) -> anyhow::Result> { - paths - .into_iter() - .map(|path| RelativePath::new(timeline_path, path)) - .collect() + input: &HashMap, + output: &mut HashSet, + layer_metadata: &mut HashMap, +) -> anyhow::Result<()> { + for (path, metadata) in input { + let rel_path = RelativePath::new(timeline_path, path)?; + let metadata = IndexLayerMetadata::from(metadata); + + layer_metadata.insert(rel_path.clone(), metadata); + output.insert(rel_path); + } + Ok(()) } #[cfg(test)] @@ -357,13 +491,13 @@ mod tests { DEFAULT_PG_VERSION, ); let remote_timeline = RemoteTimeline { - timeline_layers: HashSet::from([ - timeline_path.join("layer_1"), - timeline_path.join("layer_2"), + timeline_layers: HashMap::from([ + (timeline_path.join("layer_1"), LayerFileMetadata::new(1)), + (timeline_path.join("layer_2"), LayerFileMetadata::new(2)), ]), - missing_layers: HashSet::from([ - timeline_path.join("missing_1"), - timeline_path.join("missing_2"), + missing_layers: HashMap::from([ + (timeline_path.join("missing_1"), LayerFileMetadata::new(3)), + (timeline_path.join("missing_2"), LayerFileMetadata::new(4)), ]), metadata: metadata.clone(), awaits_download: false, @@ -485,13 +619,13 @@ mod tests { let conversion_result = IndexPart::from_remote_timeline( &timeline_path, RemoteTimeline { - timeline_layers: HashSet::from([ - PathBuf::from("bad_path"), - timeline_path.join("layer_2"), + timeline_layers: HashMap::from([ + (PathBuf::from("bad_path"), LayerFileMetadata::new(1)), + (timeline_path.join("layer_2"), LayerFileMetadata::new(2)), ]), - missing_layers: HashSet::from([ - timeline_path.join("missing_1"), - timeline_path.join("missing_2"), + missing_layers: HashMap::from([ + (timeline_path.join("missing_1"), LayerFileMetadata::new(3)), + (timeline_path.join("missing_2"), LayerFileMetadata::new(4)), ]), metadata: metadata.clone(), awaits_download: false, @@ -502,13 +636,13 @@ mod tests { let conversion_result = IndexPart::from_remote_timeline( &timeline_path, RemoteTimeline { - timeline_layers: HashSet::from([ - timeline_path.join("layer_1"), - timeline_path.join("layer_2"), + timeline_layers: HashMap::from([ + (timeline_path.join("layer_1"), LayerFileMetadata::new(1)), + (timeline_path.join("layer_2"), LayerFileMetadata::new(2)), ]), - missing_layers: HashSet::from([ - PathBuf::from("bad_path"), - timeline_path.join("missing_2"), + missing_layers: HashMap::from([ + (PathBuf::from("bad_path"), LayerFileMetadata::new(3)), + (timeline_path.join("missing_2"), LayerFileMetadata::new(4)), ]), metadata, awaits_download: false, @@ -516,4 +650,63 @@ mod tests { ); assert!(conversion_result.is_err(), "Should not be able to convert metadata with missing layer paths that are not in the timeline directory"); } + + #[test] + fn v0_indexpart_is_parsed() { + let example = r#"{ + "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], + "missing_layers":["not_a_real_layer_but_adding_coverage"], + "disk_consistent_lsn":"0/16960E8", + 
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] + }"#; + + let expected = IndexPart { + version: 0, + timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(), + missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(), + layer_metadata: HashMap::default(), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), + }; + + let part = serde_json::from_str::(example).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v1_indexpart_is_parsed() { + let example = r#"{ + "version":1, + "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], + "missing_layers":["not_a_real_layer_but_adding_coverage"], + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + 
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] + }"#; + + let expected = IndexPart { + // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? + version: 1, + timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(), + missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(), + layer_metadata: HashMap::from([ + (RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata { + file_size: Some(25600000), + }), + (RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata { + // serde_json should always parse this but this might be a double with jq for + // example. 
+ file_size: Some(9007199254741001), + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), + }; + + let part = serde_json::from_str::(example).unwrap(); + assert_eq!(part, expected); + } } diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 75657915c0..f91105052b 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -69,14 +69,25 @@ pub(super) async fn upload_timeline_layers<'a>( .map(|meta| meta.disk_consistent_lsn()); let already_uploaded_layers = remote_timeline - .map(|timeline| timeline.stored_files()) - .cloned() + .map(|timeline| { + timeline + .stored_files() + .keys() + .cloned() + .collect::>() + }) .unwrap_or_default(); let layers_to_upload = upload .layers_to_upload - .difference(&already_uploaded_layers) - .cloned() + .iter() + .filter_map(|(k, v)| { + if !already_uploaded_layers.contains(k) { + Some((k.to_owned(), v.to_owned())) + } else { + None + } + }) .collect::>(); if layers_to_upload.is_empty() { @@ -98,7 +109,7 @@ pub(super) async fn upload_timeline_layers<'a>( let mut upload_tasks = layers_to_upload .into_iter() - .map(|source_path| async move { + .map(|(source_path, known_metadata)| async move { let source_file = match fs::File::open(&source_path).await.with_context(|| { format!( "Failed to upen a source file for layer '{}'", @@ -109,7 +120,7 @@ pub(super) async fn upload_timeline_layers<'a>( Err(e) => return Err(UploadError::MissingLocalFile(source_path, e)), }; - let source_size = source_file + let fs_size = source_file .metadata() .await .with_context(|| { @@ -119,10 +130,24 @@ pub(super) async fn upload_timeline_layers<'a>( ) }) .map_err(UploadError::Other)? 
- .len() as usize; + .len(); + + // FIXME: this looks bad + if let Some(metadata_size) = known_metadata.file_size() { + if metadata_size != fs_size { + return Err(UploadError::Other(anyhow::anyhow!( + "File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}" + ))); + } + } else { + // this is a silly state we would like to avoid + } + + let fs_size = usize::try_from(fs_size).with_context(|| format!("File {source_path:?} size {fs_size} could not be converted to usize")) + .map_err(UploadError::Other)?; match storage - .upload_storage_object(Box::new(source_file), source_size, &source_path) + .upload_storage_object(Box::new(source_file), fs_size, &source_path) .await .with_context(|| format!("Failed to upload layer file for {sync_id}")) { @@ -136,8 +161,11 @@ pub(super) async fn upload_timeline_layers<'a>( while let Some(upload_result) = upload_tasks.next().await { match upload_result { Ok(uploaded_path) => { - upload.layers_to_upload.remove(&uploaded_path); - upload.uploaded_layers.insert(uploaded_path); + let metadata = upload + .layers_to_upload + .remove(&uploaded_path) + .expect("metadata should always exist, assuming no double uploads"); + upload.uploaded_layers.insert(uploaded_path, metadata); } Err(e) => match e { UploadError::Other(e) => { @@ -262,7 +290,7 @@ mod tests { assert_eq!( upload .uploaded_layers - .iter() + .keys() .cloned() .collect::>(), layer_files @@ -357,7 +385,7 @@ mod tests { assert_eq!( upload .uploaded_layers - .iter() + .keys() .cloned() .collect::>(), layer_files diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3639e30fee..0f8e60f8d3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -52,7 +52,10 @@ use crate::task_mgr::TaskKind; use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; -use crate::{page_cache, storage_sync}; +use crate::{ + page_cache, + storage_sync::{self, index::LayerFileMetadata}, +}; pub struct Timeline { conf: &'static PageServerConf, @@ -1190,8 +1193,8 @@ impl Timeline { self.create_image_layers(&partitioning, self.initdb_lsn, true)? } else { // normal case, write out a L0 delta layer file. - let delta_path = self.create_delta_layer(&frozen_layer)?; - HashSet::from([delta_path]) + let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?; + HashMap::from([(delta_path, metadata)]) }; fail_point!("flush-frozen-before-sync"); @@ -1226,7 +1229,7 @@ impl Timeline { fn update_disk_consistent_lsn( &self, disk_consistent_lsn: Lsn, - layer_paths_to_upload: HashSet, + layer_paths_to_upload: HashMap, ) -> Result<()> { // If we were able to advance 'disk_consistent_lsn', save it the metadata file. // After crash, we will restart WAL streaming and processing from that point. 
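The upload-side guard added above can be read as the following hypothetical helper: the size recorded when the layer was produced must still match the file on disk at upload time, while a `None` (metadata inherited from a pre-versioning index) lets the upload proceed with the freshly measured size:

```rust
use std::fs;
use std::io;
use std::path::Path;

/// Hypothetical guard mirroring the check in `upload_timeline_layers`.
fn verify_size_before_upload(path: &Path, recorded_size: Option<u64>) -> io::Result<u64> {
    let fs_size = fs::metadata(path)?.len();
    if let Some(expected) = recorded_size {
        if expected != fs_size {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!(
                    "{} changed on disk: recorded {expected} bytes, found {fs_size}",
                    path.display()
                ),
            ));
        }
    }
    // `None`: metadata came from an old index_part.json; use the measured size.
    Ok(fs_size)
}

fn main() -> io::Result<()> {
    let path = std::env::temp_dir().join("layer_upload_sketch");
    fs::write(&path, b"delta layer bytes")?; // 17 bytes
    assert_eq!(verify_size_before_upload(&path, Some(17))?, 17);
    assert!(verify_size_before_upload(&path, Some(16)).is_err());
    Ok(())
}
```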
@@ -1295,7 +1298,10 @@ impl Timeline { } // Write out the given frozen in-memory layer as a new L0 delta file - fn create_delta_layer(&self, frozen_layer: &InMemoryLayer) -> Result { + fn create_delta_layer( + &self, + frozen_layer: &InMemoryLayer, + ) -> Result<(PathBuf, LayerFileMetadata)> { // Write it out let new_delta = frozen_layer.write_to_disk()?; let new_delta_path = new_delta.path(); @@ -1321,12 +1327,13 @@ impl Timeline { // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); + self.metrics.current_physical_size_gauge.add(sz); // update metrics self.metrics.num_persistent_files_created.inc_by(1); self.metrics.persistent_bytes_written.inc_by(sz); - Ok(new_delta_path) + Ok((new_delta_path, LayerFileMetadata::new(sz))) } pub fn compact(&self) -> anyhow::Result<()> { @@ -1392,7 +1399,7 @@ impl Timeline { storage_sync::schedule_layer_upload( self.tenant_id, self.timeline_id, - HashSet::from_iter(layer_paths_to_upload), + layer_paths_to_upload, None, ); } @@ -1473,10 +1480,9 @@ impl Timeline { partitioning: &KeyPartitioning, lsn: Lsn, force: bool, - ) -> Result> { + ) -> Result> { let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); - let mut layer_paths_to_upload = HashSet::new(); for partition in partitioning.parts.iter() { if force || self.time_for_new_image_layer(partition, lsn)? { let img_range = @@ -1498,7 +1504,6 @@ impl Timeline { } } let image_layer = image_layer_writer.finish()?; - layer_paths_to_upload.insert(image_layer.path()); image_layers.push(image_layer); } } @@ -1512,15 +1517,25 @@ impl Timeline { // // Compaction creates multiple image layers. It would be better to create them all // and fsync them all in parallel. - let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone()); - all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); + let all_paths = image_layers + .iter() + .map(|layer| layer.path()) + .chain(std::iter::once( + self.conf.timeline_path(&self.timeline_id, &self.tenant_id), + )) + .collect::>(); par_fsync::par_fsync(&all_paths)?; + let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); + let mut layers = self.layers.write().unwrap(); for l in image_layers { - self.metrics - .current_physical_size_gauge - .add(l.path().metadata()?.len()); + let path = l.path(); + let metadata = path.metadata()?; + + layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len())); + + self.metrics.current_physical_size_gauge.add(metadata.len()); layers.insert_historic(Arc::new(l)); } drop(layers); @@ -1771,16 +1786,16 @@ impl Timeline { } let mut layers = self.layers.write().unwrap(); - let mut new_layer_paths = HashSet::with_capacity(new_layers.len()); + let mut new_layer_paths = HashMap::with_capacity(new_layers.len()); for l in new_layers { let new_delta_path = l.path(); - // update the timeline's physical size - self.metrics - .current_physical_size_gauge - .add(new_delta_path.metadata()?.len()); + let metadata = new_delta_path.metadata()?; - new_layer_paths.insert(new_delta_path); + // update the timeline's physical size + self.metrics.current_physical_size_gauge.add(metadata.len()); + + new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); layers.insert_historic(Arc::new(l)); } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index c6698ee22f..b2c927d4fc 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -1,7 +1,7 @@ //! 
This module acts as a switchboard to access different repositories managed by this //! page server. -use std::collections::{hash_map, HashMap, HashSet}; +use std::collections::{hash_map, HashMap}; use std::ffi::OsStr; use std::fs; use std::path::{Path, PathBuf}; @@ -14,8 +14,8 @@ use remote_storage::GenericRemoteStorage; use crate::config::{PageServerConf, METADATA_FILE_NAME}; use crate::http::models::TenantInfo; -use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; -use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; +use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex}; +use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::{ ephemeral_file::is_ephemeral_file, metadata::TimelineMetadata, Tenant, TenantState, @@ -104,7 +104,7 @@ pub fn init_tenant_mgr( if let TenantAttachData::Ready(t) = new_timeline_values { for (timeline_id, old_value) in old_values { if let LocalTimelineInitStatus::LocallyComplete(metadata) = old_value { - t.insert(timeline_id, (metadata, HashSet::new())); + t.insert(timeline_id, TimelineLocalFiles::ready(metadata)); } } } @@ -189,7 +189,7 @@ pub fn attach_local_tenants( let has_timelines = !timelines.is_empty(); let timelines_to_attach = timelines .iter() - .map(|(&k, (v, _))| (k, v.clone())) + .map(|(&k, v)| (k, v.metadata().to_owned())) .collect(); match tenant.init_attach_timelines(timelines_to_attach) { Ok(()) => { @@ -483,7 +483,7 @@ pub fn list_tenant_info(remote_index: &RemoteTimelineIndex) -> Vec { #[derive(Debug)] pub enum TenantAttachData { - Ready(HashMap)>), + Ready(HashMap), Broken(anyhow::Error), } /// Attempts to collect information about all tenant and timelines, existing on the local FS. 
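The bookkeeping pattern now used after writing new image/delta layers — stat the file once, feed the length both into the physical-size gauge and into the metadata scheduled for upload — looks roughly like this sketch (a plain `u64` counter and map stand in for the Prometheus gauge and `LayerFileMetadata`):

```rust
use std::collections::HashMap;
use std::fs;
use std::io;
use std::path::PathBuf;

/// Sketch of the post-write bookkeeping: stat each new layer file once and
/// reuse the length for the size counter (standing in for
/// `current_physical_size_gauge`) and for the map scheduled for upload
/// (standing in for `HashMap<PathBuf, LayerFileMetadata>`).
fn register_new_layers(
    layer_paths: &[PathBuf],
    physical_size_bytes: &mut u64,
) -> io::Result<HashMap<PathBuf, u64>> {
    let mut to_upload = HashMap::with_capacity(layer_paths.len());
    for path in layer_paths {
        let len = fs::metadata(path)?.len();
        *physical_size_bytes += len;
        to_upload.insert(path.clone(), len);
    }
    Ok(to_upload)
}

fn main() -> io::Result<()> {
    let dir = std::env::temp_dir().join("new_layers_sketch");
    fs::create_dir_all(&dir)?;
    let layer = dir.join("image_layer_1");
    fs::write(&layer, vec![0u8; 4096])?;

    let mut physical_size = 0u64;
    let to_upload = register_new_layers(&[layer], &mut physical_size)?;
    assert_eq!(physical_size, 4096);
    assert_eq!(to_upload.len(), 1);
    Ok(())
}
```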
@@ -602,7 +602,6 @@ fn is_temporary(path: &Path) -> bool { } } -#[allow(clippy::type_complexity)] fn collect_timelines_for_tenant( config: &'static PageServerConf, tenant_path: &Path, @@ -648,7 +647,10 @@ fn collect_timelines_for_tenant( } else { match collect_timeline_files(&timeline_dir) { Ok((timeline_id, metadata, timeline_files)) => { - tenant_timelines.insert(timeline_id, (metadata, timeline_files)); + tenant_timelines.insert( + timeline_id, + TimelineLocalFiles::collected(metadata, timeline_files), + ); } Err(e) => { error!( @@ -690,8 +692,12 @@ fn collect_timelines_for_tenant( // NOTE: ephemeral files are excluded from the list fn collect_timeline_files( timeline_dir: &Path, -) -> anyhow::Result<(TimelineId, TimelineMetadata, HashSet)> { - let mut timeline_files = HashSet::new(); +) -> anyhow::Result<( + TimelineId, + TimelineMetadata, + HashMap, +)> { + let mut timeline_files = HashMap::new(); let mut timeline_metadata_path = None; let timeline_id = timeline_dir @@ -704,7 +710,9 @@ fn collect_timeline_files( fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; for entry in timeline_dir_entries { let entry_path = entry.context("Failed to list timeline dir entry")?.path(); - if entry_path.is_file() { + let metadata = entry_path.metadata()?; + + if metadata.is_file() { if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) { timeline_metadata_path = Some(entry_path); } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) { @@ -719,7 +727,8 @@ fn collect_timeline_files( ) })?; } else { - timeline_files.insert(entry_path); + let layer_metadata = LayerFileMetadata::new(metadata.len()); + timeline_files.insert(entry_path, layer_metadata); } } } diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index d8424e22c8..a7c2e7ace0 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -7,13 +7,16 @@ # import asyncio +import json import os +import shutil from pathlib import Path from typing import List, Tuple import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + LocalFsStorage, NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient, @@ -189,3 +192,246 @@ def expect_tenant_to_download_timeline( ), f"Tenant {tenant_id} should have no downloads in progress" return assert False, f"Tenant {tenant_id} is missing on pageserver" + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_tenant_upgrades_index_json_from_v0( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + # the "image" for the v0 index_part.json. the fields themselves are + # replaced with values read from the later version because of #2592 (initdb + # lsn not reproducible). + v0_skeleton = json.loads( + """{ + "timeline_layers":[ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9" + ], + "missing_layers":[], + "disk_consistent_lsn":"0/16960E8", + "metadata_bytes":[] + }""" + ) + + # getting a too eager compaction happening for this test would not play + # well with the strict assertions. 
+ neon_env_builder.pageserver_config_override = "tenant_config.compaction_period='1h'" + + neon_env_builder.enable_remote_storage( + remote_storage_kind, "test_tenant_upgrades_index_json_from_v0" + ) + + # launch pageserver, populate the default tenants timeline, wait for it to be uploaded, + # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + with pg.cursor() as cur: + cur.execute("CREATE TABLE t0 AS VALUES (123, 'second column as text');") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # flush, wait until in remote storage + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + env.postgres.stop_all() + env.pageserver.stop() + + # remove all local data for the tenant to force redownloading and subsequent upgrade + shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id)) + + # downgrade the remote file + timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id) + with open(timeline_path, "r+") as timeline_file: + # keep the deserialized for later inspection + orig_index_part = json.load(timeline_file) + + v0_index_part = {key: orig_index_part[key] for key in v0_skeleton} + + timeline_file.seek(0) + json.dump(v0_index_part, timeline_file) + + env.pageserver.start() + pageserver_http = env.pageserver.http_client() + pageserver_http.tenant_attach(tenant_id) + + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: expect_tenant_to_download_timeline(pageserver_http, tenant_id), + ) + + pg = env.postgres.create_start("main") + + with pg.cursor() as cur: + cur.execute("INSERT INTO t0 VALUES (234, 'test data');") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + # not needed anymore + env.postgres.stop_all() + env.pageserver.stop() + + # make sure the file has been upgraded back to how it started + index_part = local_fs_index_part(env, tenant_id, timeline_id) + assert index_part["version"] == orig_index_part["version"] + assert index_part["missing_layers"] == orig_index_part["missing_layers"] + + # expect one more layer because of the forced checkpoint + assert len(index_part["timeline_layers"]) == len(orig_index_part["timeline_layers"]) + 1 + + # all of the same layer files are there, but they might be shuffled around + orig_layers = set(orig_index_part["timeline_layers"]) + later_layers = set(index_part["timeline_layers"]) + assert later_layers.issuperset(orig_layers) + + added_layers = later_layers - orig_layers + assert len(added_layers) == 1 + + # all of metadata has been regenerated (currently just layer file size) + all_metadata_keys = set() + for layer in orig_layers: + orig_metadata = orig_index_part["layer_metadata"][layer] + new_metadata = index_part["layer_metadata"][layer] + assert ( + orig_metadata == new_metadata + ), f"metadata for layer {layer} should not have changed {orig_metadata} vs. 
{new_metadata}" + all_metadata_keys |= set(orig_metadata.keys()) + + one_new_layer = next(iter(added_layers)) + assert one_new_layer in index_part["layer_metadata"], "new layer should have metadata" + + only_new_metadata = index_part["layer_metadata"][one_new_layer] + + assert ( + set(only_new_metadata.keys()).symmetric_difference(all_metadata_keys) == set() + ), "new layer metadata has same metadata as others" + + +# FIXME: test index_part.json getting downgraded from imaginary new version + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_tenant_redownloads_truncated_file_on_startup( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + # since we now store the layer file length metadata, we notice on startup that a layer file is of wrong size, and proceed to redownload it. + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_tenant_redownloads_truncated_file_on_startup", + ) + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + with pg.cursor() as cur: + cur.execute("CREATE TABLE t1 AS VALUES (123, 'foobar');") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + env.postgres.stop_all() + env.pageserver.stop() + + timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + local_layer_truncated = None + for path in Path.iterdir(timeline_dir): + if path.name.startswith("00000"): + correct_size = os.stat(path).st_size + os.truncate(path, 0) + local_layer_truncated = (path, correct_size) + break + assert ( + local_layer_truncated is not None + ), f"Found no local layer files to delete in directory {timeline_dir}" + + (path, expected_size) = local_layer_truncated + + # ensure the same size is found from the index_part.json + index_part = local_fs_index_part(env, tenant_id, timeline_id) + assert index_part["layer_metadata"][path.name]["file_size"] == expected_size + + ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory + env.pageserver.start() + client = env.pageserver.http_client() + + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: expect_tenant_to_download_timeline(client, tenant_id), + ) + + restored_timelines = client.timeline_list(tenant_id) + assert ( + len(restored_timelines) == 1 + ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" + retored_timeline = restored_timelines[0] + assert retored_timeline["timeline_id"] == str( + timeline_id + ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" + + assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded" + + # the remote side of local_layer_truncated + remote_layer_path = local_fs_index_part_path(env, tenant_id, timeline_id).parent / path.name + + # if the upload ever was ongoing, this check would be racy, but at least one + # extra http request has been made in between so assume it's enough delay + assert ( + 
os.stat(remote_layer_path).st_size == expected_size + ), "truncated file should not had been uploaded around re-download" + + pg = env.postgres.create_start("main") + + with pg.cursor() as cur: + cur.execute("INSERT INTO t1 VALUES (234, 'test data');") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + # now that the upload is complete, make sure the file hasn't been + # re-uploaded truncated. this is a rather bogus check given the current + # implementation, but it's critical it doesn't happen so wasting a few + # lines of python to do this. + assert ( + os.stat(remote_layer_path).st_size == expected_size + ), "truncated file should not had been uploaded after next checkpoint" + + +def local_fs_index_part(env, tenant_id, timeline_id): + """ + Return json.load parsed index_part.json of tenant and timeline from LOCAL_FS + """ + timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id) + with open(timeline_path, "r") as timeline_file: + return json.load(timeline_file) + + +def local_fs_index_part_path(env, tenant_id, timeline_id): + """ + Return path to the LOCAL_FS index_part.json of the tenant and timeline. + """ + assert isinstance(env.remote_storage, LocalFsStorage) + return ( + env.remote_storage.root + / "tenants" + / str(tenant_id) + / "timelines" + / str(timeline_id) + / "index_part.json" + ) From c4ee62d427622d2d576db0e3b2ac3035ea37cd04 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 17 Oct 2022 12:58:40 +0300 Subject: [PATCH 0911/1022] Bump clap and other minor dependencies (#2623) --- Cargo.lock | 172 ++++-- compute_tools/Cargo.toml | 2 +- compute_tools/src/bin/compute_ctl.rs | 95 +-- control_plane/Cargo.toml | 2 +- control_plane/src/bin/neon_local.rs | 580 +++++++++--------- libs/postgres_ffi/Cargo.toml | 4 +- libs/postgres_ffi/wal_craft/Cargo.toml | 2 +- .../wal_craft/src/bin/wal_craft.rs | 163 ++--- pageserver/Cargo.toml | 2 +- pageserver/src/bin/pageserver.rs | 121 ++-- pageserver/src/bin/pageserver_binutils.rs | 88 +-- proxy/Cargo.toml | 20 +- proxy/src/main.rs | 160 ++--- safekeeper/Cargo.toml | 2 +- safekeeper/src/bin/safekeeper.rs | 242 ++++---- workspace_hack/Cargo.toml | 7 +- 16 files changed, 893 insertions(+), 769 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7659be6c92..131d9a8aa2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -75,9 +75,9 @@ checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" [[package]] name = "asn1-rs" -version = "0.3.1" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30ff05a702273012438132f449575dbc804e27b2f3cbe3069aa237d26c98fa33" +checksum = "cf6690c370453db30743b373a60ba498fc0d6d83b11f4abfd87a84a075db5dd4" dependencies = [ "asn1-rs-derive", "asn1-rs-impl", @@ -91,9 +91,9 @@ dependencies = [ [[package]] name = "asn1-rs-derive" -version = "0.1.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db8b7511298d5b7784b40b092d9e9dcd3a627a5707e4b5e507931ab0d44eeebf" +checksum = "726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c" dependencies = [ "proc-macro2", "quote", @@ -262,15 +262,13 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.60.1" +version = "0.61.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"062dddbc1ba4aca46de6338e2bf87771414c335f7b2f2036e8f3e9befebf88e6" +checksum = "8a022e58a142a46fea340d68012b9201c094e93ec3d033a944a24f8fd4a4f09a" dependencies = [ "bitflags", "cexpr", "clang-sys", - "clap", - "env_logger", "lazy_static", "lazycell", "log", @@ -280,6 +278,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", + "syn", "which", ] @@ -327,13 +326,14 @@ checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426" [[package]] name = "bstr" -version = "0.2.17" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +checksum = "fca0852af221f458706eb0725c03e4ed6c46af9ac98e6a689d5e634215d594dd" dependencies = [ - "lazy_static", "memchr", + "once_cell", "regex-automata", + "serde", ] [[package]] @@ -449,14 +449,24 @@ name = "clap" version = "3.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750" +dependencies = [ + "bitflags", + "clap_lex 0.2.4", + "indexmap", + "textwrap", +] + +[[package]] +name = "clap" +version = "4.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bf8832993da70a4c6d13c581f4463c2bdda27b9bf1c5498dc4365543abe6d6f" dependencies = [ "atty", "bitflags", - "clap_lex", - "indexmap", + "clap_lex 0.3.0", "strsim", "termcolor", - "textwrap", ] [[package]] @@ -468,6 +478,15 @@ dependencies = [ "os_str_bytes", ] +[[package]] +name = "clap_lex" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8" +dependencies = [ + "os_str_bytes", +] + [[package]] name = "close_fds" version = "0.3.2" @@ -525,7 +544,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap", + "clap 4.0.15", "env_logger", "futures", "hyper", @@ -567,7 +586,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap", + "clap 4.0.15", "comfy-table", "git-version", "nix 0.25.0", @@ -660,7 +679,7 @@ dependencies = [ "atty", "cast", "ciborium", - "clap", + "clap 3.2.22", "criterion-plot", "itertools", "lazy_static", @@ -728,7 +747,7 @@ dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "memoffset", + "memoffset 0.6.5", "scopeguard", ] @@ -789,9 +808,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.78" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19f39818dcfc97d45b03953c1292efc4e80954e1583c4aa770bac1383e2310a4" +checksum = "3f83d0ebf42c6eafb8d7c52f7e5f2d3003b89c7aa4fd2b79229209459a849af8" dependencies = [ "cc", "cxxbridge-flags", @@ -801,9 +820,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.78" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e580d70777c116df50c390d1211993f62d40302881e54d4b79727acb83d0199" +checksum = "07d050484b55975889284352b0ffc2ecbda25c0c55978017c132b29ba0818a86" dependencies = [ "cc", "codespan-reporting", @@ -816,15 +835,15 @@ dependencies = [ [[package]] name = "cxxbridge-flags" -version = "1.0.78" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56a46460b88d1cec95112c8c363f0e2c39afdb237f60583b0b36343bf627ea9c" +checksum = "99d2199b00553eda8012dfec8d3b1c75fce747cf27c169a270b3b99e3448ab78" [[package]] name = "cxxbridge-macro" -version = "1.0.78" +version = "1.0.79" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "747b608fecf06b0d72d440f27acc99288207324b793be2c17991839f3d4995ea" +checksum = "dcb67a6de1f602736dd7eaead0080cf3435df806c61b24b13328db128c58868f" dependencies = [ "proc-macro2", "quote", @@ -888,14 +907,14 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6ee87af31d84ef885378aebca32be3d682b0e0dc119d5b4860a2c5bb5046730" dependencies = [ - "uuid", + "uuid 0.8.2", ] [[package]] name = "der-parser" -version = "7.0.0" +version = "8.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe398ac75057914d7d07307bf67dc7f3f574a26783b4fc7805a20ffa9f506e82" +checksum = "42d4bc9b0db0a0df9ae64634ac5bdefb7afcb534e182275ca0beadbe486701c1" dependencies = [ "asn1-rs", "displaydoc", @@ -1204,6 +1223,12 @@ version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1" +[[package]] +name = "futures-timer" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" + [[package]] name = "futures-util" version = "0.3.24" @@ -1516,9 +1541,9 @@ dependencies = [ [[package]] name = "iana-time-zone-haiku" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fde6edd6cef363e9359ed3c98ba64590ba9eecba2293eb5a723ab32aee8926aa" +checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" dependencies = [ "cxx", "cxx-build", @@ -1644,9 +1669,9 @@ dependencies = [ [[package]] name = "kqueue" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d6112e8f37b59803ac47a42d14f1f3a59bbf72fc6857ffc5be455e28a691f8e" +checksum = "2c8fc60ba15bf51257aa9807a48a61013db043fcf3a78cb0d916e8e396dcad98" dependencies = [ "kqueue-sys", "libc", @@ -1790,6 +1815,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + [[package]] name = "metrics" version = "0.1.0" @@ -1882,7 +1916,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "memoffset", + "memoffset 0.6.5", ] [[package]] @@ -1895,7 +1929,7 @@ dependencies = [ "bitflags", "cfg-if", "libc", - "memoffset", + "memoffset 0.6.5", "pin-utils", ] @@ -2008,9 +2042,9 @@ dependencies = [ [[package]] name = "oid-registry" -version = "0.4.0" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38e20717fa0541f39bd146692035c37bedfa532b3e5071b35761082407546b2a" +checksum = "7d4bda43fd1b844cbc6e6e54b5444e2b1bc7838bce59ad205902cccbb26d6761" dependencies = [ "asn1-rs", ] @@ -2101,7 +2135,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap", + "clap 4.0.15", "close_fds", "const_format", "crc32c", @@ -2377,7 +2411,7 @@ dependencies = [ "env_logger", "hex", "log", - "memoffset", + "memoffset 0.7.1", "once_cell", "postgres", "rand", @@ -2416,9 +2450,9 @@ checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" [[package]] name = "prettyplease" -version = "0.1.20" +version = "0.1.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83fead41e178796ef8274dc612a7d8ce4c7e10ca35cd2c5b5ad24cac63aeb6c0" +checksum = 
"c142c0e46b57171fe0c528bee8c5b7569e80f0c17e377cd0e30ea57dbc11bb51" dependencies = [ "proc-macro2", "syn", @@ -2432,9 +2466,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.46" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b" +checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" dependencies = [ "unicode-ident", ] @@ -2533,7 +2567,7 @@ dependencies = [ "base64", "bstr", "bytes", - "clap", + "clap 4.0.15", "futures", "git-version", "hashbrown", @@ -2567,7 +2601,7 @@ dependencies = [ "tracing-subscriber", "url", "utils", - "uuid", + "uuid 1.2.1", "workspace_hack", "x509-parser", ] @@ -2656,13 +2690,13 @@ dependencies = [ [[package]] name = "rcgen" -version = "0.8.14" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5911d1403f4143c9d56a702069d593e8d0f3fab880a85e103604d0893ea31ba7" +checksum = "ffbe84efe2f38dea12e9bfc1f65377fdf03e53a18cb3b995faedf7934c7e785b" dependencies = [ - "chrono", "pem", "ring", + "time 0.3.15", "yasna", ] @@ -2852,9 +2886,21 @@ dependencies = [ [[package]] name = "rstest" -version = "0.12.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d912f35156a3f99a66ee3e11ac2e0b3f34ac85a07e05263d05a7e2c8810d616f" +checksum = "e9c9dc66cc29792b663ffb5269be669f1613664e69ad56441fdb895c2347b930" +dependencies = [ + "futures", + "futures-timer", + "rstest_macros", + "rustc_version 0.4.0", +] + +[[package]] +name = "rstest_macros" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5015e68a0685a95ade3eee617ff7101ab6a3fc689203101ca16ebc16f2b89c66" dependencies = [ "cfg-if", "proc-macro2", @@ -3034,7 +3080,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "clap", + "clap 4.0.15", "const_format", "crc32c", "daemonize", @@ -3424,7 +3470,7 @@ dependencies = [ "debugid", "memmap2", "stable_deref_trait", - "uuid", + "uuid 0.8.2", ] [[package]] @@ -4010,6 +4056,12 @@ name = "uuid" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" + +[[package]] +name = "uuid" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "feb41e78f93363bb2df8b0e86a2ca30eed7806ea16ea0c790d757cf93f79be83" dependencies = [ "getrandom", "serde", @@ -4059,7 +4111,7 @@ name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap", + "clap 4.0.15", "env_logger", "log", "once_cell", @@ -4299,7 +4351,7 @@ dependencies = [ "anyhow", "bytes", "chrono", - "clap", + "clap 4.0.15", "crossbeam-utils", "either", "fail", @@ -4307,6 +4359,7 @@ dependencies = [ "indexmap", "libc", "log", + "memchr", "nom", "num-bigint", "num-integer", @@ -4324,14 +4377,13 @@ dependencies = [ "tokio-util", "tracing", "tracing-core", - "uuid", ] [[package]] name = "x509-parser" -version = "0.13.2" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb9bace5b5589ffead1afb76e43e34cff39cd0f3ce7e170ae0c29e53b88eb1c" +checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8" dependencies = [ "asn1-rs", "base64", @@ -4362,11 +4414,11 @@ checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3" [[package]] name = "yasna" -version = "0.4.0" 
+version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e262a29d0e61ccf2b6190d7050d4b237535fc76ce4c1210d9caa316f71dffa75" +checksum = "346d34a236c9d3e5f3b9b74563f238f955bbd05fa0b8b4efa53c130c43982f4c" dependencies = [ - "chrono", + "time 0.3.15", ] [[package]] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 43cf7ae2dd..d6f8fae34c 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] anyhow = "1.0" chrono = "0.4" -clap = "3.0" +clap = "4.0" env_logger = "0.9" futures = "0.3.13" hyper = { version = "0.14", features = ["full"] } diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index fc5bbc5fd2..7786d7af9c 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -51,53 +51,19 @@ fn main() -> Result<()> { // TODO: re-use `utils::logging` later init_logger(DEFAULT_LOG_LEVEL)?; - // Env variable is set by `cargo` - let version: Option<&str> = option_env!("CARGO_PKG_VERSION"); - let matches = clap::App::new("compute_ctl") - .version(version.unwrap_or("unknown")) - .arg( - Arg::new("connstr") - .short('C') - .long("connstr") - .value_name("DATABASE_URL") - .required(true), - ) - .arg( - Arg::new("pgdata") - .short('D') - .long("pgdata") - .value_name("DATADIR") - .required(true), - ) - .arg( - Arg::new("pgbin") - .short('b') - .long("pgbin") - .value_name("POSTGRES_PATH"), - ) - .arg( - Arg::new("spec") - .short('s') - .long("spec") - .value_name("SPEC_JSON"), - ) - .arg( - Arg::new("spec-path") - .short('S') - .long("spec-path") - .value_name("SPEC_PATH"), - ) - .get_matches(); + let matches = cli().get_matches(); - let pgdata = matches.value_of("pgdata").expect("PGDATA path is required"); + let pgdata = matches + .get_one::("pgdata") + .expect("PGDATA path is required"); let connstr = matches - .value_of("connstr") + .get_one::("connstr") .expect("Postgres connection string is required"); - let spec = matches.value_of("spec"); - let spec_path = matches.value_of("spec-path"); + let spec = matches.get_one::("spec"); + let spec_path = matches.get_one::("spec-path"); // Try to use just 'postgres' if no path is provided - let pgbin = matches.value_of("pgbin").unwrap_or("postgres"); + let pgbin = matches.get_one::("pgbin").unwrap(); let spec: ComputeSpec = match spec { // First, try to get cluster spec from the cli argument @@ -173,3 +139,48 @@ fn main() -> Result<()> { } } } + +fn cli() -> clap::Command { + // Env variable is set by `cargo` + let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown"); + clap::Command::new("compute_ctl") + .version(version) + .arg( + Arg::new("connstr") + .short('C') + .long("connstr") + .value_name("DATABASE_URL") + .required(true), + ) + .arg( + Arg::new("pgdata") + .short('D') + .long("pgdata") + .value_name("DATADIR") + .required(true), + ) + .arg( + Arg::new("pgbin") + .short('b') + .long("pgbin") + .default_value("postgres") + .value_name("POSTGRES_PATH"), + ) + .arg( + Arg::new("spec") + .short('s') + .long("spec") + .value_name("SPEC_JSON"), + ) + .arg( + Arg::new("spec-path") + .short('S') + .long("spec-path") + .value_name("SPEC_PATH"), + ) +} + +#[test] +fn verify_cli() { + cli().debug_assert() +} diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 690b63613a..287385c709 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -clap = "3.0" +clap = 
"4.0" comfy-table = "6.1" git-version = "0.3.5" tar = "0.4.38" diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 08797fe907..70a2c97a9e 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -6,7 +6,7 @@ //! rely on `neon_local` to set up the environment for each test. //! use anyhow::{anyhow, bail, Context, Result}; -use clap::{App, AppSettings, Arg, ArgMatches}; +use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; use control_plane::compute::ComputeControlPlane; use control_plane::local_env::{EtcdBroker, LocalEnv}; use control_plane::safekeeper::SafekeeperNode; @@ -85,212 +85,7 @@ struct TimelineTreeEl { // * Providing CLI api to the pageserver // * TODO: export/import to/from usual postgres fn main() -> Result<()> { - let branch_name_arg = Arg::new("branch-name") - .long("branch-name") - .takes_value(true) - .help("Name of the branch to be created or used as an alias for other services") - .required(false); - - let pg_node_arg = Arg::new("node").help("Postgres node name").required(false); - - let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false); - - let tenant_id_arg = Arg::new("tenant-id") - .long("tenant-id") - .help("Tenant id. Represented as a hexadecimal string 32 symbols length") - .takes_value(true) - .required(false); - - let timeline_id_arg = Arg::new("timeline-id") - .long("timeline-id") - .help("Timeline id. Represented as a hexadecimal string 32 symbols length") - .takes_value(true) - .required(false); - - let pg_version_arg = Arg::new("pg-version") - .long("pg-version") - .help("Postgres version to use for the initial tenant") - .required(false) - .takes_value(true) - .default_value(DEFAULT_PG_VERSION); - - let port_arg = Arg::new("port") - .long("port") - .required(false) - .value_name("port"); - - let stop_mode_arg = Arg::new("stop-mode") - .short('m') - .takes_value(true) - .possible_values(&["fast", "immediate"]) - .help("If 'immediate', don't flush repository data at shutdown") - .required(false) - .value_name("stop-mode"); - - let pageserver_config_args = Arg::new("pageserver-config-override") - .long("pageserver-config-override") - .takes_value(true) - .number_of_values(1) - .multiple_occurrences(true) - .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more") - .required(false); - - let lsn_arg = Arg::new("lsn") - .long("lsn") - .help("Specify Lsn on the timeline to start from. 
By default, end of the timeline would be used.") - .takes_value(true) - .required(false); - - let matches = App::new("Neon CLI") - .setting(AppSettings::ArgRequiredElseHelp) - .version(GIT_VERSION) - .subcommand( - App::new("init") - .about("Initialize a new Neon repository") - .arg(pageserver_config_args.clone()) - .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) - .arg( - Arg::new("config") - .long("config") - .required(false) - .value_name("config"), - ) - .arg(pg_version_arg.clone()) - ) - .subcommand( - App::new("timeline") - .about("Manage timelines") - .subcommand(App::new("list") - .about("List all timelines, available to this pageserver") - .arg(tenant_id_arg.clone())) - .subcommand(App::new("branch") - .about("Create a new timeline, using another timeline as a base, copying its data") - .arg(tenant_id_arg.clone()) - .arg(branch_name_arg.clone()) - .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name").takes_value(true) - .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false)) - .arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn").takes_value(true) - .help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false))) - .subcommand(App::new("create") - .about("Create a new blank timeline") - .arg(tenant_id_arg.clone()) - .arg(branch_name_arg.clone()) - .arg(pg_version_arg.clone()) - ) - .subcommand(App::new("import") - .about("Import timeline from basebackup directory") - .arg(tenant_id_arg.clone()) - .arg(timeline_id_arg.clone()) - .arg(Arg::new("node-name").long("node-name").takes_value(true) - .help("Name to assign to the imported timeline")) - .arg(Arg::new("base-tarfile").long("base-tarfile").takes_value(true) - .help("Basebackup tarfile to import")) - .arg(Arg::new("base-lsn").long("base-lsn").takes_value(true) - .help("Lsn the basebackup starts at")) - .arg(Arg::new("wal-tarfile").long("wal-tarfile").takes_value(true) - .help("Wal to add after base")) - .arg(Arg::new("end-lsn").long("end-lsn").takes_value(true) - .help("Lsn the basebackup ends at")) - .arg(pg_version_arg.clone()) - ) - ).subcommand( - App::new("tenant") - .setting(AppSettings::ArgRequiredElseHelp) - .about("Manage tenants") - .subcommand(App::new("list")) - .subcommand(App::new("create") - .arg(tenant_id_arg.clone()) - .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) - .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) - .arg(pg_version_arg.clone()) - ) - .subcommand(App::new("config") - .arg(tenant_id_arg.clone()) - .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) - ) - ) - .subcommand( - App::new("pageserver") - .setting(AppSettings::ArgRequiredElseHelp) - .about("Manage pageserver") - .subcommand(App::new("status")) - .subcommand(App::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) - .subcommand(App::new("stop").about("Stop local pageserver") - .arg(stop_mode_arg.clone())) - .subcommand(App::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone())) - ) - .subcommand( - App::new("safekeeper") - .setting(AppSettings::ArgRequiredElseHelp) - .about("Manage safekeepers") - .subcommand(App::new("start") - .about("Start local safekeeper") - 
.arg(safekeeper_id_arg.clone()) - ) - .subcommand(App::new("stop") - .about("Stop local safekeeper") - .arg(safekeeper_id_arg.clone()) - .arg(stop_mode_arg.clone()) - ) - .subcommand(App::new("restart") - .about("Restart local safekeeper") - .arg(safekeeper_id_arg.clone()) - .arg(stop_mode_arg.clone()) - ) - ) - .subcommand( - App::new("pg") - .setting(AppSettings::ArgRequiredElseHelp) - .about("Manage postgres instances") - .subcommand(App::new("list").arg(tenant_id_arg.clone())) - .subcommand(App::new("create") - .about("Create a postgres compute node") - .arg(pg_node_arg.clone()) - .arg(branch_name_arg.clone()) - .arg(tenant_id_arg.clone()) - .arg(lsn_arg.clone()) - .arg(port_arg.clone()) - .arg( - Arg::new("config-only") - .help("Don't do basebackup, create compute node with only config files") - .long("config-only") - .required(false)) - .arg(pg_version_arg.clone()) - ) - .subcommand(App::new("start") - .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") - .arg(pg_node_arg.clone()) - .arg(tenant_id_arg.clone()) - .arg(branch_name_arg.clone()) - .arg(timeline_id_arg.clone()) - .arg(lsn_arg.clone()) - .arg(port_arg.clone()) - .arg(pg_version_arg.clone()) - ) - .subcommand( - App::new("stop") - .arg(pg_node_arg.clone()) - .arg(tenant_id_arg.clone()) - .arg( - Arg::new("destroy") - .help("Also delete data directory (now optional, should be default in future)") - .long("destroy") - .required(false) - ) - ) - - ) - .subcommand( - App::new("start") - .about("Start page server and safekeepers") - .arg(pageserver_config_args) - ) - .subcommand( - App::new("stop") - .about("Stop page server and safekeepers") - .arg(stop_mode_arg.clone()) - ) - .get_matches(); + let matches = cli().get_matches(); let (sub_name, sub_args) = match matches.subcommand() { Some(subcommand_data) => subcommand_data, @@ -475,16 +270,16 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { sub_match - .value_of("tenant-id") - .map(TenantId::from_str) + .get_one::("tenant-id") + .map(|tenant_id| TenantId::from_str(tenant_id)) .transpose() .context("Failed to parse tenant id from the argument string") } fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { sub_match - .value_of("timeline-id") - .map(TimelineId::from_str) + .get_one::("timeline-id") + .map(|timeline_id| TimelineId::from_str(timeline_id)) .transpose() .context("Failed to parse timeline id from the argument string") } @@ -493,19 +288,22 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { let initial_timeline_id_arg = parse_timeline_id(init_match)?; // Create config file - let toml_file: String = if let Some(config_path) = init_match.value_of("config") { + let toml_file: String = if let Some(config_path) = init_match.get_one::("config") { // load and parse the file - std::fs::read_to_string(std::path::Path::new(config_path)) - .with_context(|| format!("Could not read configuration file '{config_path}'"))? + std::fs::read_to_string(config_path).with_context(|| { + format!( + "Could not read configuration file '{}'", + config_path.display() + ) + })? } else { // Built-in default config default_conf(&EtcdBroker::locate_etcd()?) 
}; let pg_version = init_match - .value_of("pg-version") - .unwrap() - .parse::() + .get_one::("pg-version") + .copied() .context("Failed to parse postgres version from the argument string")?; let mut env = @@ -541,9 +339,10 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { init_match - .values_of("pageserver-config-override") + .get_many::("pageserver-config-override") .into_iter() .flatten() + .map(|s| s.as_str()) .collect() } @@ -558,7 +357,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an Some(("create", create_match)) => { let initial_tenant_id = parse_tenant_id(create_match)?; let tenant_conf: HashMap<_, _> = create_match - .values_of("config") + .get_many::("config") .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) .unwrap_or_default(); let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?; @@ -567,9 +366,8 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an // Create an initial timeline for the new tenant let new_timeline_id = parse_timeline_id(create_match)?; let pg_version = create_match - .value_of("pg-version") - .unwrap() - .parse::() + .get_one::("pg-version") + .copied() .context("Failed to parse postgres version from the argument string")?; let timeline_info = pageserver.timeline_create( @@ -595,7 +393,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an Some(("config", create_match)) => { let tenant_id = get_tenant_id(create_match, env)?; let tenant_conf: HashMap<_, _> = create_match - .values_of("config") + .get_many::("config") .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) .unwrap_or_default(); @@ -622,13 +420,12 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - Some(("create", create_match)) => { let tenant_id = get_tenant_id(create_match, env)?; let new_branch_name = create_match - .value_of("branch-name") + .get_one::("branch-name") .ok_or_else(|| anyhow!("No branch name provided"))?; let pg_version = create_match - .value_of("pg-version") - .unwrap() - .parse::() + .get_one::("pg-version") + .copied() .context("Failed to parse postgres version from the argument string")?; let timeline_info = @@ -647,35 +444,32 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let tenant_id = get_tenant_id(import_match, env)?; let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided"); let name = import_match - .value_of("node-name") + .get_one::("node-name") .ok_or_else(|| anyhow!("No node name provided"))?; // Parse base inputs let base_tarfile = import_match - .value_of("base-tarfile") - .map(|s| PathBuf::from_str(s).unwrap()) - .ok_or_else(|| anyhow!("No base-tarfile provided"))?; + .get_one::("base-tarfile") + .ok_or_else(|| anyhow!("No base-tarfile provided"))? 
+ .to_owned(); let base_lsn = Lsn::from_str( import_match - .value_of("base-lsn") + .get_one::("base-lsn") .ok_or_else(|| anyhow!("No base-lsn provided"))?, )?; let base = (base_lsn, base_tarfile); // Parse pg_wal inputs - let wal_tarfile = import_match - .value_of("wal-tarfile") - .map(|s| PathBuf::from_str(s).unwrap()); + let wal_tarfile = import_match.get_one::("wal-tarfile").cloned(); let end_lsn = import_match - .value_of("end-lsn") + .get_one::("end-lsn") .map(|s| Lsn::from_str(s).unwrap()); // TODO validate both or none are provided let pg_wal = end_lsn.zip(wal_tarfile); let pg_version = import_match - .value_of("pg-version") - .unwrap() - .parse::() + .get_one::("pg-version") + .copied() .context("Failed to parse postgres version from the argument string")?; let mut cplane = ComputeControlPlane::load(env.clone())?; @@ -690,10 +484,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - Some(("branch", branch_match)) => { let tenant_id = get_tenant_id(branch_match, env)?; let new_branch_name = branch_match - .value_of("branch-name") + .get_one::("branch-name") .ok_or_else(|| anyhow!("No branch name provided"))?; let ancestor_branch_name = branch_match - .value_of("ancestor-branch-name") + .get_one::("ancestor-branch-name") + .map(|s| s.as_str()) .unwrap_or(DEFAULT_BRANCH_NAME); let ancestor_timeline_id = env .get_branch_timeline_id(ancestor_branch_name, tenant_id) @@ -702,8 +497,8 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - })?; let start_lsn = branch_match - .value_of("ancestor-start-lsn") - .map(Lsn::from_str) + .get_one::("ancestor-start-lsn") + .map(|lsn_str| Lsn::from_str(lsn_str)) .transpose() .context("Failed to parse ancestor start Lsn from the request")?; let timeline_info = pageserver.timeline_create( @@ -804,45 +599,39 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { } "create" => { let branch_name = sub_args - .value_of("branch-name") + .get_one::("branch-name") + .map(|s| s.as_str()) .unwrap_or(DEFAULT_BRANCH_NAME); let node_name = sub_args - .value_of("node") - .map(ToString::to_string) - .unwrap_or_else(|| format!("{}_node", branch_name)); + .get_one::("node") + .map(|node_name| node_name.to_string()) + .unwrap_or_else(|| format!("{branch_name}_node")); let lsn = sub_args - .value_of("lsn") - .map(Lsn::from_str) + .get_one::("lsn") + .map(|lsn_str| Lsn::from_str(lsn_str)) .transpose() .context("Failed to parse Lsn from the request")?; let timeline_id = env .get_branch_timeline_id(branch_name, tenant_id) - .ok_or_else(|| anyhow!("Found no timeline id for branch name '{}'", branch_name))?; + .ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?; - let port: Option = match sub_args.value_of("port") { - Some(p) => Some(p.parse()?), - None => None, - }; + let port: Option = sub_args.get_one::("port").copied(); let pg_version = sub_args - .value_of("pg-version") - .unwrap() - .parse::() + .get_one::("pg-version") + .copied() .context("Failed to parse postgres version from the argument string")?; cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?; } "start" => { - let port: Option = match sub_args.value_of("port") { - Some(p) => Some(p.parse()?), - None => None, - }; + let port: Option = sub_args.get_one::("port").copied(); let node_name = sub_args - .value_of("node") + .get_one::("node") .ok_or_else(|| anyhow!("No node name was provided to start"))?; - let node = cplane.nodes.get(&(tenant_id, node_name.to_owned())); + 
let node = cplane.nodes.get(&(tenant_id, node_name.to_string())); let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) { let claims = Claims::new(Some(tenant_id), Scope::Tenant); @@ -853,36 +642,33 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { }; if let Some(node) = node { - println!("Starting existing postgres {}...", node_name); + println!("Starting existing postgres {node_name}..."); node.start(&auth_token)?; } else { let branch_name = sub_args - .value_of("branch-name") + .get_one::("branch-name") + .map(|s| s.as_str()) .unwrap_or(DEFAULT_BRANCH_NAME); let timeline_id = env .get_branch_timeline_id(branch_name, tenant_id) .ok_or_else(|| { - anyhow!("Found no timeline id for branch name '{}'", branch_name) + anyhow!("Found no timeline id for branch name '{branch_name}'") })?; let lsn = sub_args - .value_of("lsn") - .map(Lsn::from_str) + .get_one::("lsn") + .map(|lsn_str| Lsn::from_str(lsn_str)) .transpose() .context("Failed to parse Lsn from the request")?; let pg_version = sub_args - .value_of("pg-version") - .unwrap() - .parse::() - .context("Failed to parse postgres version from the argument string")?; + .get_one::("pg-version") + .copied() + .context("Failed to `pg-version` from the argument string")?; // when used with custom port this results in non obvious behaviour // port is remembered from first start command, i e // start --port X // stop // start <-- will also use port X even without explicit port argument - println!( - "Starting new postgres (v{}) {} on timeline {} ...", - pg_version, node_name, timeline_id - ); + println!("Starting new postgres (v{pg_version}) {node_name} on timeline {timeline_id} ..."); let node = cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?; @@ -891,18 +677,18 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { } "stop" => { let node_name = sub_args - .value_of("node") + .get_one::("node") .ok_or_else(|| anyhow!("No node name was provided to stop"))?; - let destroy = sub_args.is_present("destroy"); + let destroy = sub_args.get_flag("destroy"); let node = cplane .nodes - .get(&(tenant_id, node_name.to_owned())) - .with_context(|| format!("postgres {} is not found", node_name))?; + .get(&(tenant_id, node_name.to_string())) + .with_context(|| format!("postgres {node_name} is not found"))?; node.stop(destroy)?; } - _ => bail!("Unexpected pg subcommand '{}'", sub_name), + _ => bail!("Unexpected pg subcommand '{sub_name}'"), } Ok(()) @@ -920,7 +706,10 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul } Some(("stop", stop_match)) => { - let immediate = stop_match.value_of("stop-mode") == Some("immediate"); + let immediate = stop_match + .get_one::("stop-mode") + .map(|s| s.as_str()) + == Some("immediate"); if let Err(e) = pageserver.stop(immediate) { eprintln!("pageserver stop failed: {}", e); @@ -970,7 +759,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul }; // All the commands take an optional safekeeper name argument - let sk_id = if let Some(id_str) = sub_args.value_of("id") { + let sk_id = if let Some(id_str) = sub_args.get_one::("id") { NodeId(id_str.parse().context("while parsing safekeeper id")?) 
} else { DEFAULT_SAFEKEEPER_ID @@ -986,7 +775,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul } "stop" => { - let immediate = sub_args.value_of("stop-mode") == Some("immediate"); + let immediate = + sub_args.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); if let Err(e) = safekeeper.stop(immediate) { eprintln!("safekeeper stop failed: {}", e); @@ -995,7 +785,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul } "restart" => { - let immediate = sub_args.value_of("stop-mode") == Some("immediate"); + let immediate = + sub_args.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); if let Err(e) = safekeeper.stop(immediate) { eprintln!("safekeeper stop failed: {}", e); @@ -1039,7 +830,8 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow } fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let immediate = sub_match.value_of("stop-mode") == Some("immediate"); + let immediate = + sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); let pageserver = PageServerNode::from_env(env); @@ -1072,3 +864,219 @@ fn try_stop_etcd_process(env: &local_env::LocalEnv) { eprintln!("etcd stop failed: {e}"); } } + +fn cli() -> Command { + let branch_name_arg = Arg::new("branch-name") + .long("branch-name") + .help("Name of the branch to be created or used as an alias for other services") + .required(false); + + let pg_node_arg = Arg::new("node").help("Postgres node name").required(false); + + let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false); + + let tenant_id_arg = Arg::new("tenant-id") + .long("tenant-id") + .help("Tenant id. Represented as a hexadecimal string 32 symbols length") + .required(false); + + let timeline_id_arg = Arg::new("timeline-id") + .long("timeline-id") + .help("Timeline id. Represented as a hexadecimal string 32 symbols length") + .required(false); + + let pg_version_arg = Arg::new("pg-version") + .long("pg-version") + .help("Postgres version to use for the initial tenant") + .required(false) + .value_parser(value_parser!(u32)) + .default_value(DEFAULT_PG_VERSION); + + let port_arg = Arg::new("port") + .long("port") + .required(false) + .value_parser(value_parser!(u16)) + .value_name("port"); + + let stop_mode_arg = Arg::new("stop-mode") + .short('m') + .value_parser(["fast", "immediate"]) + .help("If 'immediate', don't flush repository data at shutdown") + .required(false) + .value_name("stop-mode"); + + let pageserver_config_args = Arg::new("pageserver-config-override") + .long("pageserver-config-override") + .num_args(1) + .action(ArgAction::Append) + .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more") + .required(false); + + let lsn_arg = Arg::new("lsn") + .long("lsn") + .help("Specify Lsn on the timeline to start from. 
By default, end of the timeline would be used.") + .required(false); + + Command::new("Neon CLI") + .arg_required_else_help(true) + .version(GIT_VERSION) + .subcommand( + Command::new("init") + .about("Initialize a new Neon repository") + .arg(pageserver_config_args.clone()) + .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) + .arg( + Arg::new("config") + .long("config") + .required(false) + .value_parser(value_parser!(PathBuf)) + .value_name("config"), + ) + .arg(pg_version_arg.clone()) + ) + .subcommand( + Command::new("timeline") + .about("Manage timelines") + .subcommand(Command::new("list") + .about("List all timelines, available to this pageserver") + .arg(tenant_id_arg.clone())) + .subcommand(Command::new("branch") + .about("Create a new timeline, using another timeline as a base, copying its data") + .arg(tenant_id_arg.clone()) + .arg(branch_name_arg.clone()) + .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name") + .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false)) + .arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn") + .help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false))) + .subcommand(Command::new("create") + .about("Create a new blank timeline") + .arg(tenant_id_arg.clone()) + .arg(branch_name_arg.clone()) + .arg(pg_version_arg.clone()) + ) + .subcommand(Command::new("import") + .about("Import timeline from basebackup directory") + .arg(tenant_id_arg.clone()) + .arg(timeline_id_arg.clone()) + .arg(Arg::new("node-name").long("node-name") + .help("Name to assign to the imported timeline")) + .arg(Arg::new("base-tarfile") + .long("base-tarfile") + .value_parser(value_parser!(PathBuf)) + .help("Basebackup tarfile to import") + ) + .arg(Arg::new("base-lsn").long("base-lsn") + .help("Lsn the basebackup starts at")) + .arg(Arg::new("wal-tarfile") + .long("wal-tarfile") + .value_parser(value_parser!(PathBuf)) + .help("Wal to add after base") + ) + .arg(Arg::new("end-lsn").long("end-lsn") + .help("Lsn the basebackup ends at")) + .arg(pg_version_arg.clone()) + ) + ).subcommand( + Command::new("tenant") + .arg_required_else_help(true) + .about("Manage tenants") + .subcommand(Command::new("list")) + .subcommand(Command::new("create") + .arg(tenant_id_arg.clone()) + .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) + .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)) + .arg(pg_version_arg.clone()) + ) + .subcommand(Command::new("config") + .arg(tenant_id_arg.clone()) + .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)) + ) + ) + .subcommand( + Command::new("pageserver") + .arg_required_else_help(true) + .about("Manage pageserver") + .subcommand(Command::new("status")) + .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) + .subcommand(Command::new("stop").about("Stop local pageserver") + .arg(stop_mode_arg.clone())) + .subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone())) + ) + .subcommand( + Command::new("safekeeper") + .arg_required_else_help(true) + .about("Manage safekeepers") + .subcommand(Command::new("start") + .about("Start local safekeeper") + .arg(safekeeper_id_arg.clone()) + ) + 
.subcommand(Command::new("stop") + .about("Stop local safekeeper") + .arg(safekeeper_id_arg.clone()) + .arg(stop_mode_arg.clone()) + ) + .subcommand(Command::new("restart") + .about("Restart local safekeeper") + .arg(safekeeper_id_arg) + .arg(stop_mode_arg.clone()) + ) + ) + .subcommand( + Command::new("pg") + .arg_required_else_help(true) + .about("Manage postgres instances") + .subcommand(Command::new("list").arg(tenant_id_arg.clone())) + .subcommand(Command::new("create") + .about("Create a postgres compute node") + .arg(pg_node_arg.clone()) + .arg(branch_name_arg.clone()) + .arg(tenant_id_arg.clone()) + .arg(lsn_arg.clone()) + .arg(port_arg.clone()) + .arg( + Arg::new("config-only") + .help("Don't do basebackup, create compute node with only config files") + .long("config-only") + .required(false)) + .arg(pg_version_arg.clone()) + ) + .subcommand(Command::new("start") + .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") + .arg(pg_node_arg.clone()) + .arg(tenant_id_arg.clone()) + .arg(branch_name_arg) + .arg(timeline_id_arg) + .arg(lsn_arg) + .arg(port_arg) + .arg(pg_version_arg) + ) + .subcommand( + Command::new("stop") + .arg(pg_node_arg) + .arg(tenant_id_arg) + .arg( + Arg::new("destroy") + .help("Also delete data directory (now optional, should be default in future)") + .long("destroy") + .action(ArgAction::SetTrue) + .required(false) + ) + ) + + ) + .subcommand( + Command::new("start") + .about("Start page server and safekeepers") + .arg(pageserver_config_args) + ) + .subcommand( + Command::new("stop") + .about("Stop page server and safekeepers") + .arg(stop_mode_arg) + ) +} + +#[test] +fn verify_cli() { + cli().debug_assert(); +} diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 60caca76b8..01ff6ab60e 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -13,7 +13,7 @@ crc32c = "0.6.0" hex = "0.4.3" once_cell = "1.13.0" log = "0.4.14" -memoffset = "0.6.2" +memoffset = "0.7" thiserror = "1.0" serde = { version = "1.0", features = ["derive"] } utils = { path = "../utils" } @@ -26,4 +26,4 @@ wal_craft = { path = "wal_craft" } [build-dependencies] anyhow = "1.0" -bindgen = "0.60.1" +bindgen = "0.61" diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index 88466737ed..4c35c5a650 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -7,7 +7,7 @@ edition = "2021" [dependencies] anyhow = "1.0" -clap = "3.0" +clap = "4.0" env_logger = "0.9" log = "0.4" once_cell = "1.13.0" diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 9563298cd8..e87ca27e90 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -1,68 +1,19 @@ use anyhow::*; -use clap::{App, Arg, ArgMatches}; -use std::str::FromStr; +use clap::{value_parser, Arg, ArgMatches, Command}; +use std::{path::PathBuf, str::FromStr}; use wal_craft::*; fn main() -> Result<()> { env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("wal_craft=info")) .init(); - let type_arg = &Arg::new("type") - .takes_value(true) - .help("Type of WAL to craft") - .possible_values([ - Simple::NAME, - LastWalRecordXlogSwitch::NAME, - LastWalRecordXlogSwitchEndsOnPageBoundary::NAME, - WalRecordCrossingSegmentFollowedBySmallOne::NAME, - LastWalRecordCrossingSegment::NAME, - ]) - .required(true); 
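Every binary touched by this commit applies the same mechanical clap 3 to clap 4 migration: App becomes Command, takes_value/multiple_occurrences give way to num_args/ArgAction, possible_values becomes value_parser, and the untyped value_of/is_present lookups are replaced by typed get_one/get_flag. The sketch below is a minimal self-contained illustration of that pattern, assuming a clap = "4" dependency; the argument names are made up and it is not code from this patch.

use clap::{value_parser, Arg, ArgAction, Command};

fn cli() -> Command {
    Command::new("example")
        // clap 3: .takes_value(true).possible_values(&["fast", "immediate"])
        .arg(
            Arg::new("stop-mode")
                .long("stop-mode")
                .value_parser(["fast", "immediate"])
                .required(false),
        )
        // clap 3: .takes_value(true) plus a manual str::parse::<u32>() at the call site
        .arg(
            Arg::new("pg-version")
                .long("pg-version")
                .value_parser(value_parser!(u32))
                .default_value("14"),
        )
        // clap 3: a bare flag later checked with matches.is_present("daemonize")
        .arg(
            Arg::new("daemonize")
                .long("daemonize")
                .action(ArgAction::SetTrue),
        )
}

fn main() {
    let matches = cli().get_matches();

    // clap 3: matches.value_of("stop-mode") == Some("immediate")
    let immediate =
        matches.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");

    // clap 3: matches.value_of("pg-version").unwrap().parse::<u32>()
    let pg_version: u32 = *matches.get_one::<u32>("pg-version").expect("has a default");

    // clap 3: matches.is_present("daemonize")
    let daemonize = matches.get_flag("daemonize");

    println!("immediate={immediate} pg_version={pg_version} daemonize={daemonize}");
}

Keeping the argument definitions in a separate cli() function also enables the cli().debug_assert() test that the migrated binaries add, so invalid argument definitions are caught at test time rather than on first invocation.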
- let arg_matches = App::new("Postgres WAL crafter") - .about("Crafts Postgres databases with specific WAL properties") - .subcommand( - App::new("print-postgres-config") - .about("Print the configuration required for PostgreSQL server before running this script") - ) - .subcommand( - App::new("with-initdb") - .about("Craft WAL in a new data directory first initialized with initdb") - .arg(type_arg) - .arg( - Arg::new("datadir") - .takes_value(true) - .help("Data directory for the Postgres server") - .required(true) - ) - .arg( - Arg::new("pg-distrib-dir") - .long("pg-distrib-dir") - .takes_value(true) - .help("Directory with Postgres distributions (bin and lib directories, e.g. pg_install containing subpath `v14/bin/postgresql`)") - .default_value("/usr/local") - ) - .arg( - Arg::new("pg-version") - .long("pg-version") - .help("Postgres version to use for the initial tenant") - .required(true) - .takes_value(true) - ) - ) - .subcommand( - App::new("in-existing") - .about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.") - .arg(type_arg) - .arg( - Arg::new("connection") - .takes_value(true) - .help("Connection string to the Postgres database to populate") - .required(true) - ) - ) - .get_matches(); + let arg_matches = cli().get_matches(); let wal_craft = |arg_matches: &ArgMatches, client| { - let (intermediate_lsns, end_of_wal_lsn) = match arg_matches.value_of("type").unwrap() { + let (intermediate_lsns, end_of_wal_lsn) = match arg_matches + .get_one::("type") + .map(|s| s.as_str()) + .context("'type' is required")? + { Simple::NAME => Simple::craft(client)?, LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?, LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => { @@ -72,12 +23,12 @@ fn main() -> Result<()> { WalRecordCrossingSegmentFollowedBySmallOne::craft(client)? } LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?, - a => panic!("Unknown --type argument: {}", a), + a => panic!("Unknown --type argument: {a}"), }; for lsn in intermediate_lsns { - println!("intermediate_lsn = {}", lsn); + println!("intermediate_lsn = {lsn}"); } - println!("end_of_wal = {}", end_of_wal_lsn); + println!("end_of_wal = {end_of_wal_lsn}"); Ok(()) }; @@ -85,20 +36,24 @@ fn main() -> Result<()> { None => panic!("No subcommand provided"), Some(("print-postgres-config", _)) => { for cfg in REQUIRED_POSTGRES_CONFIG.iter() { - println!("{}", cfg); + println!("{cfg}"); } Ok(()) } Some(("with-initdb", arg_matches)) => { let cfg = Conf { - pg_version: arg_matches - .value_of("pg-version") - .unwrap() - .parse::() - .context("Failed to parse postgres version from the argument string")?, - pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(), - datadir: arg_matches.value_of("datadir").unwrap().into(), + pg_version: *arg_matches + .get_one::("pg-version") + .context("'pg-version' is required")?, + pg_distrib_dir: arg_matches + .get_one::("pg-distrib-dir") + .context("'pg-distrib-dir' is required")? + .to_owned(), + datadir: arg_matches + .get_one::("datadir") + .context("'datadir' is required")? + .to_owned(), }; cfg.initdb()?; let srv = cfg.start_server()?; @@ -108,9 +63,77 @@ fn main() -> Result<()> { } Some(("in-existing", arg_matches)) => wal_craft( arg_matches, - &mut postgres::Config::from_str(arg_matches.value_of("connection").unwrap())? 
- .connect(postgres::NoTls)?, + &mut postgres::Config::from_str( + arg_matches + .get_one::("connection") + .context("'connection' is required")?, + ) + .context( + "'connection' argument value could not be parsed as a postgres connection string", + )? + .connect(postgres::NoTls)?, ), Some(_) => panic!("Unknown subcommand"), } } + +fn cli() -> Command { + let type_arg = &Arg::new("type") + .help("Type of WAL to craft") + .value_parser([ + Simple::NAME, + LastWalRecordXlogSwitch::NAME, + LastWalRecordXlogSwitchEndsOnPageBoundary::NAME, + WalRecordCrossingSegmentFollowedBySmallOne::NAME, + LastWalRecordCrossingSegment::NAME, + ]) + .required(true); + + Command::new("Postgres WAL crafter") + .about("Crafts Postgres databases with specific WAL properties") + .subcommand( + Command::new("print-postgres-config") + .about("Print the configuration required for PostgreSQL server before running this script") + ) + .subcommand( + Command::new("with-initdb") + .about("Craft WAL in a new data directory first initialized with initdb") + .arg(type_arg) + .arg( + Arg::new("datadir") + .help("Data directory for the Postgres server") + .value_parser(value_parser!(PathBuf)) + .required(true) + ) + .arg( + Arg::new("pg-distrib-dir") + .long("pg-distrib-dir") + .value_parser(value_parser!(PathBuf)) + .help("Directory with Postgres distributions (bin and lib directories, e.g. pg_install containing subpath `v14/bin/postgresql`)") + .default_value("/usr/local") + ) + .arg( + Arg::new("pg-version") + .long("pg-version") + .help("Postgres version to use for the initial tenant") + .value_parser(value_parser!(u32)) + .required(true) + + ) + ) + .subcommand( + Command::new("in-existing") + .about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.") + .arg(type_arg) + .arg( + Arg::new("connection") + .help("Connection string to the Postgres database to populate") + .required(true) + ) + ) +} + +#[test] +fn verify_cli() { + cli().debug_assert(); +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index ea0cf3f18a..ebdda5c6da 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -23,7 +23,7 @@ futures = "0.3.13" hex = "0.4.3" hyper = "0.14" itertools = "0.10.3" -clap = "3.0" +clap = { version = "4.0", features = ["string"] } daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } tokio-util = { version = "0.7.3", features = ["io", "io-util"] } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 4cd82e37b1..12f594077e 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -6,7 +6,7 @@ use tracing::*; use anyhow::{anyhow, bail, Context, Result}; -use clap::{App, Arg}; +use clap::{Arg, ArgAction, Command}; use daemonize::Daemonize; use fail::FailScenario; @@ -51,57 +51,17 @@ fn version() -> String { } fn main() -> anyhow::Result<()> { - let arg_matches = App::new("Neon page server") - .about("Materializes WAL stream to pages and serves them to the postgres") - .version(&*version()) - .arg( + let arg_matches = cli().get_matches(); - Arg::new("daemonize") - .short('d') - .long("daemonize") - .takes_value(false) - .help("Run in the background"), - ) - .arg( - Arg::new("init") - .long("init") - .takes_value(false) - .help("Initialize pageserver with all given config overrides"), - ) - .arg( - Arg::new("workdir") - .short('D') - .long("workdir") - .takes_value(true) - .help("Working directory for the 
pageserver"), - ) - // See `settings.md` for more details on the extra configuration patameters pageserver can process - .arg( - Arg::new("config-override") - .short('c') - .takes_value(true) - .number_of_values(1) - .multiple_occurrences(true) - .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). - Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), - ) - .arg(Arg::new("update-config").long("update-config").takes_value(false).help( - "Update the config file when started", - )) - .arg( - Arg::new("enabled-features") - .long("enabled-features") - .takes_value(false) - .help("Show enabled compile time features"), - ) - .get_matches(); - - if arg_matches.is_present("enabled-features") { + if arg_matches.get_flag("enabled-features") { println!("{{\"features\": {FEATURES:?} }}"); return Ok(()); } - let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".neon")); + let workdir = arg_matches + .get_one::("workdir") + .map(Path::new) + .unwrap_or_else(|| Path::new(".neon")); let workdir = workdir .canonicalize() .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?; @@ -115,7 +75,7 @@ fn main() -> anyhow::Result<()> { ) })?; - let daemonize = arg_matches.is_present("daemonize"); + let daemonize = arg_matches.get_flag("daemonize"); let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? { ControlFlow::Continue(conf) => conf, @@ -153,8 +113,8 @@ fn initialize_config( arg_matches: clap::ArgMatches, workdir: &Path, ) -> anyhow::Result> { - let init = arg_matches.is_present("init"); - let update_config = init || arg_matches.is_present("update-config"); + let init = arg_matches.get_flag("init"); + let update_config = init || arg_matches.get_flag("update-config"); let (mut toml, config_file_exists) = if cfg_file_path.is_file() { if init { @@ -196,13 +156,10 @@ fn initialize_config( ) }; - if let Some(values) = arg_matches.values_of("config-override") { + if let Some(values) = arg_matches.get_many::("config-override") { for option_line in values { let doc = toml_edit::Document::from_str(option_line).with_context(|| { - format!( - "Option '{}' could not be parsed as a toml document", - option_line - ) + format!("Option '{option_line}' could not be parsed as a toml document") })?; for (key, item) in doc.iter() { @@ -244,7 +201,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // Initialize logger let log_file = logging::init(LOG_FILE_NAME, daemonize)?; - info!("version: {GIT_VERSION}"); + info!("version: {}", version()); // TODO: Check that it looks like a valid repository before going further @@ -385,3 +342,55 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() } }) } + +fn cli() -> Command { + Command::new("Neon page server") + .about("Materializes WAL stream to pages and serves them to the postgres") + .version(version()) + .arg( + + Arg::new("daemonize") + .short('d') + .long("daemonize") + .action(ArgAction::SetTrue) + .help("Run in the background"), + ) + .arg( + Arg::new("init") + .long("init") + .action(ArgAction::SetTrue) + .help("Initialize pageserver with all given config overrides"), + ) + .arg( + Arg::new("workdir") + .short('D') + .long("workdir") + .help("Working directory for the pageserver"), + ) + // See `settings.md` for more details on the extra configuration patameters pageserver can process + .arg( + Arg::new("config-override") + .short('c') + .num_args(1) + 
.action(ArgAction::Append) + .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \ + Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), + ) + .arg( + Arg::new("update-config") + .long("update-config") + .action(ArgAction::SetTrue) + .help("Update the config file when started"), + ) + .arg( + Arg::new("enabled-features") + .long("enabled-features") + .action(ArgAction::SetTrue) + .help("Show enabled compile time features"), + ) +} + +#[test] +fn verify_cli() { + cli().debug_assert(); +} diff --git a/pageserver/src/bin/pageserver_binutils.rs b/pageserver/src/bin/pageserver_binutils.rs index ec7699f194..b1484ac45a 100644 --- a/pageserver/src/bin/pageserver_binutils.rs +++ b/pageserver/src/bin/pageserver_binutils.rs @@ -9,7 +9,7 @@ use std::{ }; use anyhow::Context; -use clap::{App, Arg}; +use clap::{value_parser, Arg, Command}; use pageserver::{ page_cache, @@ -24,40 +24,14 @@ project_git_version!(GIT_VERSION); const METADATA_SUBCOMMAND: &str = "metadata"; fn main() -> anyhow::Result<()> { - let arg_matches = App::new("Neon Pageserver binutils") - .about("Reads pageserver (and related) binary files management utility") - .version(GIT_VERSION) - .arg(Arg::new("path").help("Input file path").required(false)) - .subcommand( - App::new(METADATA_SUBCOMMAND) - .about("Read and update pageserver metadata file") - .arg( - Arg::new("metadata_path") - .help("Input metadata file path") - .required(false), - ) - .arg( - Arg::new("disk_consistent_lsn") - .long("disk_consistent_lsn") - .takes_value(true) - .help("Replace disk consistent Lsn"), - ) - .arg( - Arg::new("prev_record_lsn") - .long("prev_record_lsn") - .takes_value(true) - .help("Replace previous record Lsn"), - ), - ) - .get_matches(); + let arg_matches = cli().get_matches(); match arg_matches.subcommand() { Some((subcommand_name, subcommand_matches)) => { - let path = PathBuf::from( - subcommand_matches - .value_of("metadata_path") - .context("'metadata_path' argument is missing")?, - ); + let path = subcommand_matches + .get_one::("metadata_path") + .context("'metadata_path' argument is missing")? + .to_path_buf(); anyhow::ensure!( subcommand_name == METADATA_SUBCOMMAND, "Unknown subcommand {subcommand_name}" @@ -65,11 +39,10 @@ fn main() -> anyhow::Result<()> { handle_metadata(&path, subcommand_matches)?; } None => { - let path = PathBuf::from( - arg_matches - .value_of("path") - .context("'path' argument is missing")?, - ); + let path = arg_matches + .get_one::("path") + .context("'path' argument is missing")? 
+ .to_path_buf(); println!( "No subcommand specified, attempting to guess the format for file {}", path.display() @@ -110,7 +83,7 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; println!("Current metadata:\n{meta:?}"); let mut update_meta = false; - if let Some(disk_consistent_lsn) = arg_matches.value_of("disk_consistent_lsn") { + if let Some(disk_consistent_lsn) = arg_matches.get_one::("disk_consistent_lsn") { meta = TimelineMetadata::new( Lsn::from_str(disk_consistent_lsn)?, meta.prev_record_lsn(), @@ -122,7 +95,7 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an ); update_meta = true; } - if let Some(prev_record_lsn) = arg_matches.value_of("prev_record_lsn") { + if let Some(prev_record_lsn) = arg_matches.get_one::("prev_record_lsn") { meta = TimelineMetadata::new( meta.disk_consistent_lsn(), Some(Lsn::from_str(prev_record_lsn)?), @@ -142,3 +115,40 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an Ok(()) } + +fn cli() -> Command { + Command::new("Neon Pageserver binutils") + .about("Reads pageserver (and related) binary files management utility") + .version(GIT_VERSION) + .arg( + Arg::new("path") + .help("Input file path") + .value_parser(value_parser!(PathBuf)) + .required(false), + ) + .subcommand( + Command::new(METADATA_SUBCOMMAND) + .about("Read and update pageserver metadata file") + .arg( + Arg::new("metadata_path") + .help("Input metadata file path") + .value_parser(value_parser!(PathBuf)) + .required(false), + ) + .arg( + Arg::new("disk_consistent_lsn") + .long("disk_consistent_lsn") + .help("Replace disk consistent Lsn"), + ) + .arg( + Arg::new("prev_record_lsn") + .long("prev_record_lsn") + .help("Replace previous record Lsn"), + ), + ) +} + +#[test] +fn verify_cli() { + cli().debug_assert(); +} diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 8049737989..395c22b8bc 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -7,9 +7,9 @@ edition = "2021" anyhow = "1.0" atty = "0.2.14" base64 = "0.13.0" -bstr = "0.2.17" +bstr = "1.0" bytes = { version = "1.0.1", features = ['serde'] } -clap = "3.0" +clap = "4.0" futures = "0.3.13" git-version = "0.3.5" hashbrown = "0.12" @@ -22,7 +22,11 @@ once_cell = "1.13.0" parking_lot = "0.12" pin-project-lite = "0.2.7" rand = "0.8.3" -reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } +reqwest = { version = "0.11", default-features = false, features = [ + "blocking", + "json", + "rustls-tls", +] } routerify = "3" rustls = "0.20.0" rustls-pemfile = "1" @@ -33,13 +37,13 @@ sha2 = "0.10.2" socket2 = "0.4.4" thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-rustls = "0.23.0" tracing = "0.1.36" tracing-subscriber = { version = "0.3", features = ["env-filter"] } url = "2.2.2" -uuid = { version = "0.8.2", features = ["v4", "serde"]} -x509-parser = "0.13.2" +uuid = { version = "1.2", features = ["v4", "serde"] } +x509-parser = "0.14" utils = { path = "../libs/utils" } metrics = { path = "../libs/metrics" } @@ -47,6 +51,6 @@ workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] async-trait = "0.1" -rcgen = "0.8.14" 
-rstest = "0.12" +rcgen = "0.10" +rstest = "0.15" tokio-postgres-rustls = "0.9.0" diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 91ef26a37f..2055616a6e 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -45,98 +45,43 @@ async fn main() -> anyhow::Result<()> { .with_target(false) .init(); - let arg_matches = clap::App::new("Neon proxy/router") - .version(GIT_VERSION) - .arg( - Arg::new("proxy") - .short('p') - .long("proxy") - .takes_value(true) - .help("listen for incoming client connections on ip:port") - .default_value("127.0.0.1:4432"), - ) - .arg( - Arg::new("auth-backend") - .long("auth-backend") - .takes_value(true) - .possible_values(["console", "postgres", "link"]) - .default_value("link"), - ) - .arg( - Arg::new("mgmt") - .short('m') - .long("mgmt") - .takes_value(true) - .help("listen for management callback connection on ip:port") - .default_value("127.0.0.1:7000"), - ) - .arg( - Arg::new("http") - .short('h') - .long("http") - .takes_value(true) - .help("listen for incoming http connections (metrics, etc) on ip:port") - .default_value("127.0.0.1:7001"), - ) - .arg( - Arg::new("uri") - .short('u') - .long("uri") - .takes_value(true) - .help("redirect unauthenticated users to the given uri in case of link auth") - .default_value("http://localhost:3000/psql_session/"), - ) - .arg( - Arg::new("auth-endpoint") - .short('a') - .long("auth-endpoint") - .takes_value(true) - .help("cloud API endpoint for authenticating users") - .default_value("http://localhost:3000/authenticate_proxy_request/"), - ) - .arg( - Arg::new("tls-key") - .short('k') - .long("tls-key") - .alias("ssl-key") // backwards compatibility - .takes_value(true) - .help("path to TLS key for client postgres connections"), - ) - .arg( - Arg::new("tls-cert") - .short('c') - .long("tls-cert") - .alias("ssl-cert") // backwards compatibility - .takes_value(true) - .help("path to TLS cert for client postgres connections"), - ) - .get_matches(); + let arg_matches = cli().get_matches(); let tls_config = match ( - arg_matches.value_of("tls-key"), - arg_matches.value_of("tls-cert"), + arg_matches.get_one::("tls-key"), + arg_matches.get_one::("tls-cert"), ) { (Some(key_path), Some(cert_path)) => Some(config::configure_tls(key_path, cert_path)?), (None, None) => None, _ => bail!("either both or neither tls-key and tls-cert must be specified"), }; - let proxy_address: SocketAddr = arg_matches.value_of("proxy").unwrap().parse()?; - let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?; - let http_address: SocketAddr = arg_matches.value_of("http").unwrap().parse()?; + let proxy_address: SocketAddr = arg_matches.get_one::("proxy").unwrap().parse()?; + let mgmt_address: SocketAddr = arg_matches.get_one::("mgmt").unwrap().parse()?; + let http_address: SocketAddr = arg_matches.get_one::("http").unwrap().parse()?; - let auth_backend = match arg_matches.value_of("auth-backend").unwrap() { + let auth_backend = match arg_matches + .get_one::("auth-backend") + .unwrap() + .as_str() + { "console" => { - let url = arg_matches.value_of("auth-endpoint").unwrap().parse()?; + let url = arg_matches + .get_one::("auth-endpoint") + .unwrap() + .parse()?; let endpoint = http::Endpoint::new(url, reqwest::Client::new()); auth::BackendType::Console(Cow::Owned(endpoint), ()) } "postgres" => { - let url = arg_matches.value_of("auth-endpoint").unwrap().parse()?; + let url = arg_matches + .get_one::("auth-endpoint") + .unwrap() + .parse()?; auth::BackendType::Postgres(Cow::Owned(url), ()) } "link" => { - let url = 
arg_matches.value_of("uri").unwrap().parse()?; + let url = arg_matches.get_one::("uri").unwrap().parse()?; auth::BackendType::Link(Cow::Owned(url)) } other => bail!("unsupported auth backend: {other}"), @@ -174,3 +119,68 @@ async fn main() -> anyhow::Result<()> { Ok(()) } + +fn cli() -> clap::Command { + clap::Command::new("Neon proxy/router") + .disable_help_flag(true) + .version(GIT_VERSION) + .arg( + Arg::new("proxy") + .short('p') + .long("proxy") + .help("listen for incoming client connections on ip:port") + .default_value("127.0.0.1:4432"), + ) + .arg( + Arg::new("auth-backend") + .long("auth-backend") + .value_parser(["console", "postgres", "link"]) + .default_value("link"), + ) + .arg( + Arg::new("mgmt") + .short('m') + .long("mgmt") + .help("listen for management callback connection on ip:port") + .default_value("127.0.0.1:7000"), + ) + .arg( + Arg::new("http") + .long("http") + .help("listen for incoming http connections (metrics, etc) on ip:port") + .default_value("127.0.0.1:7001"), + ) + .arg( + Arg::new("uri") + .short('u') + .long("uri") + .help("redirect unauthenticated users to the given uri in case of link auth") + .default_value("http://localhost:3000/psql_session/"), + ) + .arg( + Arg::new("auth-endpoint") + .short('a') + .long("auth-endpoint") + .help("cloud API endpoint for authenticating users") + .default_value("http://localhost:3000/authenticate_proxy_request/"), + ) + .arg( + Arg::new("tls-key") + .short('k') + .long("tls-key") + .alias("ssl-key") // backwards compatibility + .help("path to TLS key for client postgres connections"), + ) + .arg( + Arg::new("tls-cert") + .short('c') + .long("tls-cert") + .alias("ssl-cert") // backwards compatibility + .help("path to TLS cert for client postgres connections"), + ) +} + +#[test] +fn verify_cli() { + cli().debug_assert(); +} diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index ddc3956d74..64c541ddef 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -11,7 +11,7 @@ hyper = "0.14" fs2 = "0.4.3" serde_json = "1" tracing = "0.1.27" -clap = "3.0" +clap = "4.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["macros", "fs"] } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 3f55d823cc..9422b55d60 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -2,7 +2,7 @@ // Main entry point for the safekeeper executable // use anyhow::{bail, Context, Result}; -use clap::{App, Arg}; +use clap::{value_parser, Arg, ArgAction, Command}; use const_format::formatcp; use daemonize::Daemonize; use fs2::FileExt; @@ -40,145 +40,44 @@ const ID_FILE_NAME: &str = "safekeeper.id"; project_git_version!(GIT_VERSION); fn main() -> anyhow::Result<()> { - let arg_matches = App::new("Neon safekeeper") - .about("Store WAL stream to local file system and push it to WAL receivers") - .version(GIT_VERSION) - .arg( - Arg::new("datadir") - .short('D') - .long("dir") - .takes_value(true) - .help("Path to the safekeeper data directory"), - ) - .arg( - Arg::new("init") - .long("init") - .takes_value(false) - .help("Initialize safekeeper with ID"), - ) - .arg( - Arg::new("listen-pg") - .short('l') - .long("listen-pg") - .alias("listen") // for compatibility - .takes_value(true) - .help(formatcp!("listen for incoming WAL data connections on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")), - ) - .arg( - Arg::new("listen-http") - 
.long("listen-http") - .takes_value(true) - .help(formatcp!("http endpoint address for metrics on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")), - ) - // FIXME this argument is no longer needed since pageserver address is forwarded from compute. - // However because this argument is in use by console's e2e tests let's keep it for now and remove separately. - // So currently it is a noop. - .arg( - Arg::new("pageserver") - .short('p') - .long("pageserver") - .takes_value(true), - ) - .arg( - Arg::new("recall") - .long("recall") - .takes_value(true) - .help("Period for requestion pageserver to call for replication"), - ) - .arg( - Arg::new("daemonize") - .short('d') - .long("daemonize") - .takes_value(false) - .help("Run in the background"), - ) - .arg( - Arg::new("no-sync") - .short('n') - .long("no-sync") - .takes_value(false) - .help("Do not wait for changes to be written safely to disk"), - ) - .arg( - Arg::new("dump-control-file") - .long("dump-control-file") - .takes_value(true) - .help("Dump control file at path specified by this argument and exit"), - ) - .arg( - Arg::new("id").long("id").takes_value(true).help("safekeeper node id: integer") - ).arg( - Arg::new("broker-endpoints") - .long("broker-endpoints") - .takes_value(true) - .help("a comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'"), - ) - .arg( - Arg::new("broker-etcd-prefix") - .long("broker-etcd-prefix") - .takes_value(true) - .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"), - ) - .arg( - Arg::new("wal-backup-threads").long("backup-threads").takes_value(true).help(formatcp!("number of threads for wal backup (default {DEFAULT_WAL_BACKUP_RUNTIME_THREADS}")), - ).arg( - Arg::new("remote-storage") - .long("remote-storage") - .takes_value(true) - .help("Remote storage configuration for WAL backup (offloading to s3) as TOML inline table, e.g. {\"max_concurrent_syncs\" = 17, \"max_sync_errors\": 13, \"bucket_name\": \"\", \"bucket_region\":\"\", \"concurrency_limit\": 119}.\nSafekeeper offloads WAL to [prefix_in_bucket/]//, mirroring structure on the file system.") - ) - .arg( - Arg::new("enable-wal-backup") - .long("enable-wal-backup") - .takes_value(true) - .default_value("true") - .default_missing_value("true") - .help("Enable/disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring WAL backup horizon."), - ) - .arg( - Arg::new("auth-validation-public-key-path") - .long("auth-validation-public-key-path") - .takes_value(true) - .help("Path to an RSA .pem public key which is used to check JWT tokens") - ) - .get_matches(); + let arg_matches = cli().get_matches(); - if let Some(addr) = arg_matches.value_of("dump-control-file") { + if let Some(addr) = arg_matches.get_one::("dump-control-file") { let state = control_file::FileStorage::load_control_file(Path::new(addr))?; let json = serde_json::to_string(&state)?; - print!("{}", json); + print!("{json}"); return Ok(()); } let mut conf = SafeKeeperConf::default(); - if let Some(dir) = arg_matches.value_of("datadir") { + if let Some(dir) = arg_matches.get_one::("datadir") { // change into the data directory. 
- std::env::set_current_dir(PathBuf::from(dir))?; + std::env::set_current_dir(dir)?; } - if arg_matches.is_present("no-sync") { + if arg_matches.get_flag("no-sync") { conf.no_sync = true; } - if arg_matches.is_present("daemonize") { + if arg_matches.get_flag("daemonize") { conf.daemonize = true; } - if let Some(addr) = arg_matches.value_of("listen-pg") { - conf.listen_pg_addr = addr.to_owned(); + if let Some(addr) = arg_matches.get_one::("listen-pg") { + conf.listen_pg_addr = addr.to_string(); } - if let Some(addr) = arg_matches.value_of("listen-http") { - conf.listen_http_addr = addr.to_owned(); + if let Some(addr) = arg_matches.get_one::("listen-http") { + conf.listen_http_addr = addr.to_string(); } - if let Some(recall) = arg_matches.value_of("recall") { + if let Some(recall) = arg_matches.get_one::("recall") { conf.recall_period = humantime::parse_duration(recall)?; } let mut given_id = None; - if let Some(given_id_str) = arg_matches.value_of("id") { + if let Some(given_id_str) = arg_matches.get_one::("id") { given_id = Some(NodeId( given_id_str .parse() @@ -186,20 +85,20 @@ fn main() -> anyhow::Result<()> { )); } - if let Some(addr) = arg_matches.value_of("broker-endpoints") { + if let Some(addr) = arg_matches.get_one::("broker-endpoints") { let collected_ep: Result, ParseError> = addr.split(',').map(Url::parse).collect(); conf.broker_endpoints = collected_ep.context("Failed to parse broker endpoint urls")?; } - if let Some(prefix) = arg_matches.value_of("broker-etcd-prefix") { + if let Some(prefix) = arg_matches.get_one::("broker-etcd-prefix") { conf.broker_etcd_prefix = prefix.to_string(); } - if let Some(backup_threads) = arg_matches.value_of("wal-backup-threads") { + if let Some(backup_threads) = arg_matches.get_one::("wal-backup-threads") { conf.backup_runtime_threads = backup_threads .parse() .with_context(|| format!("Failed to parse backup threads {}", backup_threads))?; } - if let Some(storage_conf) = arg_matches.value_of("remote-storage") { + if let Some(storage_conf) = arg_matches.get_one::("remote-storage") { // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse let storage_conf_toml = format!("remote_storage = {}", storage_conf); let parsed_toml = storage_conf_toml.parse::()?; // parse @@ -208,16 +107,16 @@ fn main() -> anyhow::Result<()> { } // Seems like there is no better way to accept bool values explicitly in clap. 
conf.wal_backup_enabled = arg_matches - .value_of("enable-wal-backup") + .get_one::("enable-wal-backup") .unwrap() .parse() .context("failed to parse bool enable-s3-offload bool")?; conf.auth_validation_public_key_path = arg_matches - .value_of("auth-validation-public-key-path") + .get_one::("auth-validation-public-key-path") .map(PathBuf::from); - start_safekeeper(conf, given_id, arg_matches.is_present("init")) + start_safekeeper(conf, given_id, arg_matches.get_flag("init")) } fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { @@ -424,3 +323,102 @@ fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { conf.my_id = my_id; Ok(()) } + +fn cli() -> Command { + Command::new("Neon safekeeper") + .about("Store WAL stream to local file system and push it to WAL receivers") + .version(GIT_VERSION) + .arg( + Arg::new("datadir") + .short('D') + .long("dir") + .value_parser(value_parser!(PathBuf)) + .help("Path to the safekeeper data directory"), + ) + .arg( + Arg::new("init") + .long("init") + .action(ArgAction::SetTrue) + .help("Initialize safekeeper with ID"), + ) + .arg( + Arg::new("listen-pg") + .short('l') + .long("listen-pg") + .alias("listen") // for compatibility + .help(formatcp!("listen for incoming WAL data connections on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")), + ) + .arg( + Arg::new("listen-http") + .long("listen-http") + .help(formatcp!("http endpoint address for metrics on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")), + ) + // FIXME this argument is no longer needed since pageserver address is forwarded from compute. + // However because this argument is in use by console's e2e tests let's keep it for now and remove separately. + // So currently it is a noop. + .arg( + Arg::new("pageserver") + .short('p') + .long("pageserver"), + ) + .arg( + Arg::new("recall") + .long("recall") + .help("Period for requestion pageserver to call for replication"), + ) + .arg( + Arg::new("daemonize") + .short('d') + .long("daemonize") + .action(ArgAction::SetTrue) + .help("Run in the background"), + ) + .arg( + Arg::new("no-sync") + .short('n') + .long("no-sync") + .action(ArgAction::SetTrue) + .help("Do not wait for changes to be written safely to disk"), + ) + .arg( + Arg::new("dump-control-file") + .long("dump-control-file") + .help("Dump control file at path specified by this argument and exit"), + ) + .arg( + Arg::new("id").long("id").help("safekeeper node id: integer") + ).arg( + Arg::new("broker-endpoints") + .long("broker-endpoints") + .help("a comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'"), + ) + .arg( + Arg::new("broker-etcd-prefix") + .long("broker-etcd-prefix") + .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"), + ) + .arg( + Arg::new("wal-backup-threads").long("backup-threads").help(formatcp!("number of threads for wal backup (default {DEFAULT_WAL_BACKUP_RUNTIME_THREADS}")), + ).arg( + Arg::new("remote-storage") + .long("remote-storage") + .help("Remote storage configuration for WAL backup (offloading to s3) as TOML inline table, e.g. {\"max_concurrent_syncs\" = 17, \"max_sync_errors\": 13, \"bucket_name\": \"\", \"bucket_region\":\"\", \"concurrency_limit\": 119}.\nSafekeeper offloads WAL to [prefix_in_bucket/]//, mirroring structure on the file system.") + ) + .arg( + Arg::new("enable-wal-backup") + .long("enable-wal-backup") + .default_value("true") + .default_missing_value("true") + .help("Enable/disable WAL backup to s3. 
When disabled, safekeeper removes WAL ignoring WAL backup horizon."), + ) + .arg( + Arg::new("auth-validation-public-key-path") + .long("auth-validation-public-key-path") + .help("Path to an RSA .pem public key which is used to check JWT tokens") + ) +} + +#[test] +fn verify_cli() { + cli().debug_assert(); +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index af055ed9a4..f4468d85f0 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -8,7 +8,6 @@ version = "0.1.0" description = "workspace-hack package, managed by hakari" # You can choose to publish this crate: see https://docs.rs/cargo-hakari/latest/cargo_hakari/publishing. publish = false - # The parts of the file between the BEGIN HAKARI SECTION and END HAKARI SECTION comments # are managed by hakari. @@ -18,7 +17,7 @@ ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", features = ["clock", "iana-time-zone", "js-sys", "oldtime", "serde", "std", "time", "wasm-bindgen", "wasmbind", "winapi"] } -clap = { version = "3", features = ["atty", "color", "std", "strsim", "suggestions", "termcolor"] } +clap = { version = "4", features = ["color", "error-context", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8", features = ["once_cell", "std"] } either = { version = "1", features = ["use_std"] } fail = { version = "0.5", default-features = false, features = ["failpoints"] } @@ -26,6 +25,7 @@ hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } +memchr = { version = "2", features = ["std"] } nom = { version = "7", features = ["alloc", "std"] } num-bigint = { version = "0.4", features = ["std"] } num-integer = { version = "0.1", default-features = false, features = ["i128", "std"] } @@ -42,18 +42,17 @@ tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } tracing-core = { version = "0.1", features = ["once_cell", "std"] } -uuid = { version = "0.8", features = ["getrandom", "serde", "std", "v4"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } -clap = { version = "3", features = ["atty", "color", "std", "strsim", "suggestions", "termcolor"] } either = { version = "1", features = ["use_std"] } hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } +memchr = { version = "2", features = ["std"] } nom = { version = "7", features = ["alloc", "std"] } prost = { version = "0.10", features = ["prost-derive", "std"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", 
"unicode-perl", "unicode-script", "unicode-segment"] } From 0ec5ddea0b855908ca3bda60cc0778d906e4bb9e Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 17 Oct 2022 19:21:36 +0300 Subject: [PATCH 0912/1022] GRANT CREATE ON SCHEMA public TO web_access --- compute_tools/src/spec.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index bd47614386..89a6a93510 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -380,6 +380,10 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { info!("grant query {}", &query); client.execute(query.as_str(), &[])?; + + // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. + // This is needed since postgres 15, where this privilege is removed by default. + client.execute("GRANT CREATE ON SCHEMA public TO web_access", &[])?; } // Do some per-database access adjustments. We'd better do this at db creation time, From 129f7c82b7d29faa985b9f507cfe7c5758b7f7c5 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 17 Oct 2022 21:36:30 +0300 Subject: [PATCH 0913/1022] remove redundant expect_tenant_to_download_timeline --- test_runner/regress/test_remote_storage.py | 17 ++------------- .../test_tenants_with_remote_storage.py | 21 ++++--------------- 2 files changed, 6 insertions(+), 32 deletions(-) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 0a02a80de5..56b14dc42b 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -10,8 +10,8 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, - NeonPageserverHttpClient, RemoteStorageKind, + assert_no_in_progress_downloads_for_tenant, available_remote_storages, wait_for_last_record_lsn, wait_for_upload, @@ -125,7 +125,7 @@ def test_remote_storage_backup_and_restore( wait_until( number_of_iterations=20, interval=1, - func=lambda: expect_tenant_to_download_timeline(client, tenant_id), + func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), ) detail = client.timeline_detail(tenant_id, timeline_id) @@ -142,16 +142,3 @@ def test_remote_storage_backup_and_restore( query_scalar(cur, f"SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};") == f"{data_secret}|{checkpoint_number}" ) - - -def expect_tenant_to_download_timeline( - client: NeonPageserverHttpClient, - tenant_id: TenantId, -): - for tenant in client.tenant_list(): - if tenant["id"] == str(tenant_id): - assert not tenant.get( - "has_in_progress_downloads", True - ), f"Tenant {tenant_id} should have no downloads in progress" - return - assert False, f"Tenant {tenant_id} is missing on pageserver" diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index a7c2e7ace0..96c1fc25db 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -19,9 +19,9 @@ from fixtures.neon_fixtures import ( LocalFsStorage, NeonEnv, NeonEnvBuilder, - NeonPageserverHttpClient, Postgres, RemoteStorageKind, + assert_no_in_progress_downloads_for_tenant, available_remote_storages, wait_for_last_record_lsn, wait_for_upload, @@ -168,7 +168,7 @@ def test_tenants_attached_after_download( wait_until( number_of_iterations=5, interval=1, - func=lambda: expect_tenant_to_download_timeline(client, tenant_id), + func=lambda: 
assert_no_in_progress_downloads_for_tenant(client, tenant_id), ) restored_timelines = client.timeline_list(tenant_id) @@ -181,19 +181,6 @@ def test_tenants_attached_after_download( ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" -def expect_tenant_to_download_timeline( - client: NeonPageserverHttpClient, - tenant_id: TenantId, -): - for tenant in client.tenant_list(): - if tenant["id"] == str(tenant_id): - assert not tenant.get( - "has_in_progress_downloads", True - ), f"Tenant {tenant_id} should have no downloads in progress" - return - assert False, f"Tenant {tenant_id} is missing on pageserver" - - @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) def test_tenant_upgrades_index_json_from_v0( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind @@ -262,7 +249,7 @@ def test_tenant_upgrades_index_json_from_v0( wait_until( number_of_iterations=5, interval=1, - func=lambda: expect_tenant_to_download_timeline(pageserver_http, tenant_id), + func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id), ) pg = env.postgres.create_start("main") @@ -371,7 +358,7 @@ def test_tenant_redownloads_truncated_file_on_startup( wait_until( number_of_iterations=5, interval=1, - func=lambda: expect_tenant_to_download_timeline(client, tenant_id), + func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), ) restored_timelines = client.timeline_list(tenant_id) From 80746b1c7a0627ab36ad850832d49bdd22a1755f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 17 Oct 2022 16:44:32 +0300 Subject: [PATCH 0914/1022] Add micro-benchmark for layer map search function The test data was extracted from our pgbench benchmark project on the captest environment, the one we use for the 'neon-captest-reuse' test. 
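
To try the benchmark locally, something along the lines of

    cargo bench -p pageserver --bench bench_layer_map

should run both the "search" and "search_rel_dir" cases and print criterion's
timing report (the exact invocation may vary with your cargo setup; the bench
target name comes from the [[bench]] section added below).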
--- Cargo.lock | 1 + pageserver/Cargo.toml | 5 + pageserver/benches/bench_layer_map.rs | 5866 +++++++++++++++++++++++++ pageserver/src/tenant.rs | 7 +- 4 files changed, 5876 insertions(+), 3 deletions(-) create mode 100644 pageserver/benches/bench_layer_map.rs diff --git a/Cargo.lock b/Cargo.lock index 131d9a8aa2..d02ec1f5a1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2139,6 +2139,7 @@ dependencies = [ "close_fds", "const_format", "crc32c", + "criterion", "crossbeam-utils", "daemonize", "etcd_broker", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index ebdda5c6da..75aa6e93eb 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -69,5 +69,10 @@ close_fds = "0.3.2" walkdir = "2.3.2" [dev-dependencies] +criterion = "0.4" hex-literal = "0.3" tempfile = "3.2" + +[[bench]] +name = "bench_layer_map" +harness = false diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs new file mode 100644 index 0000000000..25d5ecd643 --- /dev/null +++ b/pageserver/benches/bench_layer_map.rs @@ -0,0 +1,5866 @@ +use anyhow::Result; +use pageserver::repository::{Key, Value}; +use pageserver::tenant::filename::{DeltaFileName, ImageFileName}; +use pageserver::tenant::layer_map::LayerMap; +use pageserver::tenant::storage_layer::Layer; +use pageserver::tenant::storage_layer::ValueReconstructResult; +use pageserver::tenant::storage_layer::ValueReconstructState; +use std::cmp::{max, min}; +use std::ops::Range; +use std::path::PathBuf; +use std::str::FromStr; +use std::sync::Arc; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use criterion::{criterion_group, criterion_main, Criterion}; + +struct DummyDelta { + key_range: Range, + lsn_range: Range, +} + +impl Layer for DummyDelta { + fn get_tenant_id(&self) -> TenantId { + TenantId::from_str("00000000000000000000000000000000").unwrap() + } + + fn get_timeline_id(&self) -> TimelineId { + TimelineId::from_str("00000000000000000000000000000000").unwrap() + } + + fn get_key_range(&self) -> Range { + self.key_range.clone() + } + + fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() + } + + fn filename(&self) -> PathBuf { + todo!() + } + + fn local_path(&self) -> Option { + todo!() + } + + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_data: &mut ValueReconstructState, + ) -> Result { + panic!() + } + + fn is_incremental(&self) -> bool { + true + } + + fn is_in_memory(&self) -> bool { + false + } + + fn iter(&self) -> Box> + '_> { + panic!() + } + + fn key_iter(&self) -> Box + '_> { + panic!("Not implemented") + } + + fn delete(&self) -> Result<()> { + panic!() + } + + fn dump(&self, _verbose: bool) -> Result<()> { + todo!() + } +} + +struct DummyImage { + key_range: Range, + lsn: Lsn, +} + +impl Layer for DummyImage { + fn get_tenant_id(&self) -> TenantId { + TenantId::from_str("00000000000000000000000000000000").unwrap() + } + + fn get_timeline_id(&self) -> TimelineId { + TimelineId::from_str("00000000000000000000000000000000").unwrap() + } + + fn get_key_range(&self) -> Range { + self.key_range.clone() + } + + fn get_lsn_range(&self) -> Range { + // End-bound is exclusive + self.lsn..(self.lsn + 1) + } + + fn filename(&self) -> PathBuf { + todo!() + } + + fn local_path(&self) -> Option { + todo!() + } + + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_data: &mut ValueReconstructState, + ) -> Result { + panic!() + } + + fn is_incremental(&self) -> bool { + false + } + + fn is_in_memory(&self) -> bool { + 
false + } + + fn iter(&self) -> Box> + '_> { + panic!() + } + + fn key_iter(&self) -> Box + '_> { + panic!("Not implemented") + } + + fn delete(&self) -> Result<()> { + panic!() + } + + fn dump(&self, _verbose: bool) -> Result<()> { + todo!() + } +} + +fn build_layer_map() -> LayerMap { + let mut layer_map = LayerMap::default(); + + let mut min_lsn = Lsn(u64::MAX); + let mut max_lsn = Lsn(0); + + for fname in TEST_LAYER_FILENAMES { + if let Some(imgfilename) = ImageFileName::parse_str(fname) { + let layer = DummyImage { + key_range: imgfilename.key_range, + lsn: imgfilename.lsn, + }; + layer_map.insert_historic(Arc::new(layer)); + min_lsn = min(min_lsn, imgfilename.lsn); + max_lsn = max(max_lsn, imgfilename.lsn); + } else if let Some(deltafilename) = DeltaFileName::parse_str(fname) { + let layer = DummyDelta { + key_range: deltafilename.key_range, + lsn_range: deltafilename.lsn_range.clone(), + }; + layer_map.insert_historic(Arc::new(layer)); + min_lsn = min(min_lsn, deltafilename.lsn_range.start); + max_lsn = max(max_lsn, deltafilename.lsn_range.end); + } else { + panic!("unexpected filename {fname}"); + } + } + + println!("min: {min_lsn}, max: {max_lsn}"); + + layer_map +} + +fn large_layer_map(c: &mut Criterion) { + let layer_map = build_layer_map(); + + c.bench_function("search", |b| { + b.iter(|| { + let result = layer_map.search( + // Just an arbitrary point + Key::from_hex("000000067F000080000009E014000001B011").unwrap(), + // This LSN is higher than any of the LSNs in the tree + Lsn::from_str("D0/80208AE1").unwrap(), + ); + result.unwrap(); + }); + }); + + // test with a key that corresponds to the RelDir entry. See pgdatadir_mapping.rs. + c.bench_function("search_rel_dir", |b| { + b.iter(|| { + let result = layer_map.search( + Key::from_hex("000000067F00008000000000000000000001").unwrap(), + // This LSN is higher than any of the LSNs in the tree + Lsn::from_str("D0/80208AE1").unwrap(), + ); + result.unwrap(); + }); + }); +} + +criterion_group!(benches, large_layer_map); +criterion_main!(benches); + +// A list of layer filenames, extracted from our performance test environment, from +// a project where we have run pgbench many timmes. The pgbench database was initialized +// between each test run. 
+const TEST_LAYER_FILENAMES: &[&str] = &[ +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000006CF69CD8B0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000006F949B7C08", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__00000071F15CF6B0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__00000072AEE2BFE0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000756884A510", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__00000077B1836CA0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000007D41715570", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000007F12B83FE8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__00000083D5DE3FD0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000873B520940", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000890CF51FE0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000008C71903720", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000008E43487FF0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000009445A06DC8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__00000096187D1FC8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__00000096E85806C0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000009921F3B4A8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000009B5229DFE8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000009EBB11FFC0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000A93DDE5FE0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000AD3698E000", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000B3AC039FE8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000B8606C92A0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000BC59629F98", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000BD25E66810", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000BEF683BFD0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000C14270A078", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000C3687EDFE8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000C6C7BD8140", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000C896B8DFD8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000CB82C2FF68", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000CD51009FE8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000CF7E08BFD0", +"000000000000000000000000000000000000-000000067F00008000000540090100000000__0000006AEF261AF8", +"000000000000000000000000000000000000-000000067F00008000000560090100000000__0000006DA30DA180", +"000000000000000000000000000000000000-000000067F00008000000580090100000000__0000006FAFE25518", 
+"000000000000000000000000000000000000-000000067F000080000005E0090100000000__00000073AF75E930", +"000000000000000000000000000000000000-000000067F00008000000620090100000000__00000078B2CB1C68", +"000000000000000000000000000000000000-000000067F00008000000640090100000000__0000007B9877EF40", +"000000000000000000000000000000000000-000000067F00008000000680090100000000__00000080E477E868", +"000000000000000000000000000000000000-000000067F000080000006C0090100000000__00000085BE169568", +"000000000000000000000000000000000000-000000067F00008000000700090100000000__0000008AF15FEF50", +"000000000000000000000000000000000000-000000067F00008000000740090100000000__000000902186B1D0", +"000000000000000000000000000000000000-000000067F00008000000760090100000000__00000092CA5E4EA8", +"000000000000000000000000000000000000-000000067F000080000007E0090100000000__0000009D34F8D4D8", +"000000000000000000000000000000000000-000000067F00008000000820090100000000__000000A29F1D8950", +"000000000000000000000000000000000000-000000067F00008000000860090100000000__000000A434813A68", +"000000000000000000000000000000000000-000000067F000080000008C0090100000000__000000AAEBE534F8", +"000000000000000000000000000000000000-000000067F00008000000960090100000000__000000B6C2E92A88", +"000000000000000000000000000000000000-000000067F00008000000A20090100000000__000000C5745579F0", +"000000000000000000000000000000000000-000000067F00008000000A60090100000000__000000CA2C877DC8", +"000000000000000000000000000000000000-030000000000000000000000000000000002__000000AFB4666000", +"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF7DC97FD1-000000CF801FC221", +"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF801FC221-000000CF801FDB61", +"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF801FDB61-000000CF80201FA1", +"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF80201FA1-000000CF80203CC1", +"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF80203CC1-000000CF802067C1", +"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF802067C1-000000CF80208AE1", +"000000067F000032AC000040040000000000-000000067F000080000005400C0000007DD8__0000006A5C770149-0000006ACEF98449", +"000000067F000032AC000040040000000000-000000067F000080000005600C0000008077__0000006CF7781D19-0000006D69B48989", +"000000067F000032AC000040040000000000-000000067F000080000005800C0000007A49__0000006F95E72491-0000006FA8EDF3B9", +"000000067F000032AC000040040000000000-000000067F000080000005A00C0000007614__000000723877FF21-00000072A0D7CEA1", +"000000067F000032AC000040040000000000-000000067F000080000005C00C0000016516__00000072A0D7CEA1-0000007318DDE691", +"000000067F000032AC000040040000000000-000000067F000080000006000C0000008FB7__00000075687C3009-00000075E915EBC9", +"000000067F000032AC000040040000000000-000000067F000080000006200C0000009441__0000007805801C41-00000078859FEA11", +"000000067F000032AC000040040000000000-000000067F000080000006400C0000007987__0000007AA1DF6639-0000007B14D5C521", +"000000067F000032AC000040040000000000-000000067F000080000006600C0000009381__0000007D41EA8D51-0000007DC21DE569", +"000000067F000032AC000040040000000000-000000067F000080000006800C0000007D6A__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000032AC000040040000000000-000000067F000080000006801400000044E4__00000081AFAF5FD1-0000008215AFE5A9", 
+"000000067F000032AC000040040000000000-000000067F000080000006C00C00000090F5__00000084A325AA01-00000085239DFB81", +"000000067F000032AC000040040000000000-000000067F000080000006E00C00000096C8__000000873C9A2551-00000087BC75E5B1", +"000000067F000032AC000040040000000000-000000067F000080000007000C000000955C__00000089D6B8EE99-0000008A56BBF739", +"000000067F000032AC000040040000000000-000000067F000080000007200C000000933D__0000008C72843D41-0000008CF2BFFC89", +"000000067F000032AC000040040000000000-000000067F000080000007400C00000090E9__0000008F10E3E189-0000008F915DE591", +"000000067F000032AC000040040000000000-000000067F000080000007600C0000008180__00000091A6DD7A79-0000009228F7FA79", +"000000067F000032AC000040040000000000-000000067F000080000007800C000000974C__0000009446B52FD1-00000094D67DF4F9", +"000000067F000032AC000040040000000000-000000067F000080000007A00C000000974B__00000096E85829C9-00000098A7ADFC91", +"000000067F000032AC000040040000000000-000000067F000080000007C00C0000007EA5__000000997F5D23C9-00000099F1C9FC71", +"000000067F000032AC000040040000000000-000000067F000080000007E00C00000092CD__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000032AC000040040000000000-000000067F000080000008000C00000081F6__0000009EBBC72771-000000A154401909", +"000000067F000032AC000040040000000000-000000067F000080000008200C000000974D__000000A154401909-000000A1E407F839", +"000000067F000032AC000040040000000000-000000067F0000800000082014000000393C__000000A323C9E001-000000A37A60B1A9", +"000000067F000032AC000040040000000000-000000067F000080000008600C0000009747__000000A37A60B1A9-000000A3CA47ECA9", +"000000067F000032AC000040040000000000-000000067F000080000008801C0000009703__000000A5A081B661-000000A6503DE919", +"000000067F000032AC000040040000000000-000000067F000080000008801C00000CF6B0__000000A6F001F909-000000A91D97FD49", +"000000067F000032AC000040040000000000-000000067F000080000008C00C0000002330__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000032AC000040040000000000-000000067F000080000008E00C00000077B3__000000AB6533BFD9-000000ABF63DF511", +"000000067F000032AC000040040000000000-000000067F000080000008E02A000000529F__000000AF5D587FE1-000000AFB4666001", +"000000067F000032AC000040040000000000-000000067F000080000009004000000047E0__000000B18495C001-000000B1FA75F501", +"000000067F000032AC000040040000000000-000000067F00008000000920140000005289__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F000032AC000040040000000000-000000067F000080000009400C000008DEA4__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000032AC000040040000000000-000000067F000080000009600C000000974F__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000032AC000040040000000000-000000067F000080000009600C0000055A74__000000B808718889-000000B8606C92A1", +"000000067F000032AC000040040000000000-000000067F000080000009800C0000009748__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000032AC000040040000000000-000000067F000080000009800C000010EC71__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F000032AC000040040000000000-000000067F000080000009A00C0000071F6F__000000BCEF79BE91-000000BD263A5849", +"000000067F000032AC000040040000000000-000000067F000080000009C00C0000009749__000000BD263A5849-000000BDA607F261", +"000000067F000032AC000040040000000000-000000067F000080000009E00C0000004916__000000BEF5F47FD1-000000BF48FFEB11", +"000000067F000032AC000040040000000000-000000067F00008000000A000C0000008EF9__000000C19744E959-000000C217F3F379", +"000000067F000032AC000040040000000000-000000067F00008000000A200C0000009748__000000C430961E71-000000C4C05DDB29", 
+"000000067F000032AC000040040000000000-000000067F00008000000A400C0000009743__000000C6C87B6329-000000C74849FAE1", +"000000067F000032AC000040040000000000-000000067F00008000000A600C0000009746__000000C90726D0D9-000000C986F5F0D9", +"000000067F000032AC000040040000000000-000000067F00008000000A600C000007A149__000000CB40C16489-000000CB82C37859", +"000000067F000032AC000040040000000000-000000067F00008000000A800C0000009748__000000CB82C37859-000000CC11F5EDC9", +"000000067F000032AC000040040000000000-000000067F00008000000A800F0100000003__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000000000000000001-000000067F000080000005400C000004B479__0000006C98B77D29-0000006CF7781D19", +"000000067F00008000000000000000000001-000000067F000080000005400C0000104BE4__0000006C1E7C73C1-0000006C98B77D29", +"000000067F00008000000000000000000001-000000067F000080000005600C0000048643__0000006F3370DD59-0000006F95E72491", +"000000067F00008000000000000000000001-000000067F000080000005600C0000100001__0000006EB935F989-0000006F3370DD59", +"000000067F00008000000000000000000001-000000067F000080000005800C000005CF06__00000071F21624D1-000000723877FF21", +"000000067F00008000000000000000000001-000000067F000080000005800C000009D78D__000000716A103FC9-00000071F21624D1", +"000000067F00008000000000000000000001-000000067F000080000005800C00000CDE2D__00000070E8761431-000000716A103FC9", +"000000067F00008000000000000000000001-000000067F000080000005E00C00000385D9__0000007318DDE691-0000007497B01FF9", +"000000067F00008000000000000000000001-000000067F000080000005E00C0000050175__000000751253A4C1-00000075687C3009", +"000000067F00008000000000000000000001-000000067F000080000005E00C00000AF576__0000007497B01FF9-000000751253A4C1", +"000000067F00008000000000000000000001-000000067F000080000006000C0000051A02__00000077B2AD0F91-0000007805801C41", +"000000067F00008000000000000000000001-000000067F000080000006000C00000C3C38__00000077391A8001-00000077B2AD0F91", +"000000067F00008000000000000000000001-000000067F000080000006000C00000C56C1__00000076A8CDE8F9-00000077391A8001", +"000000067F00008000000000000000000001-000000067F000080000006200C000004811C__0000007A3F679FA1-0000007AA1DF6639", +"000000067F00008000000000000000000001-000000067F000080000006200C0000107883__00000079C527F0D9-0000007A3F679FA1", +"000000067F00008000000000000000000001-000000067F000080000006400C000004B4C9__0000007B14D5C521-0000007C73B53FC9", +"000000067F00008000000000000000000001-000000067F000080000006400C000005258F__0000007CEE5A0B91-0000007D41EA8D51", +"000000067F00008000000000000000000001-000000067F000080000006400C00000A887C__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F00008000000000000000000001-000000067F000080000006600C0000049742__0000007F7BE4E6F1-0000007FDCDCE659", +"000000067F00008000000000000000000001-000000067F000080000006600C00000BC29F__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F00008000000000000000000001-000000067F000080000006600C0000111C82__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F00008000000000000000000001-000000067F000080000006800C00000A8D4C__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F00008000000000000000000001-000000067F000080000006A00C0000051984__000000844F1A6789-00000084A325AA01", +"000000067F00008000000000000000000001-000000067F000080000006A00C00000703EC__00000082B573F579-00000083D5901FD9", +"000000067F00008000000000000000000001-000000067F000080000006A00C00000C4CC8__00000083D5901FD9-000000844F1A6789", +"000000067F00008000000000000000000001-000000067F000080000006C00C0000055EA3__00000086ED29E361-000000873C9A2551", 
+"000000067F00008000000000000000000001-000000067F000080000006C00C00000BC102__00000085D35BF439-0000008673817FC9", +"000000067F00008000000000000000000001-000000067F000080000006C00C00000BFB6E__0000008673817FC9-00000086ED29E361", +"000000067F00008000000000000000000001-000000067F000080000006E00C0000054244__0000008985FD3611-00000089D6B8EE99", +"000000067F00008000000000000000000001-000000067F000080000006E00C00000B6F42__000000890C5B6001-0000008985FD3611", +"000000067F00008000000000000000000001-000000067F000080000006E00C00000C5883__000000887C2DFE59-000000890C5B6001", +"000000067F00008000000000000000000001-000000067F000080000007000C0000053C20__0000008C2045B721-0000008C72843D41", +"000000067F00008000000000000000000001-000000067F000080000007000C00000B2B06__0000008AF67FEC19-0000008BA6803FC9", +"000000067F00008000000000000000000001-000000067F000080000007000C00000BF157__0000008BA6803FC9-0000008C2045B721", +"000000067F00008000000000000000000001-000000067F000080000007200C0000051312__0000008EBC4827C1-0000008F10E3E189", +"000000067F00008000000000000000000001-000000067F000080000007200C00000BA086__0000008E42A19FD1-0000008EBC4827C1", +"000000067F00008000000000000000000001-000000067F000080000007200C00000C58B0__0000008DB277FA49-0000008E42A19FD1", +"000000067F00008000000000000000000001-000000067F000080000007400C000004DF08__000000914B2393B1-00000091A6DD7A79", +"000000067F00008000000000000000000001-000000067F000080000007400C00000FCCA8__00000090D0E5EA29-000000914B2393B1", +"000000067F00008000000000000000000001-000000067F000080000007600C00000544BA__0000009228F7FA79-00000093786F8001", +"000000067F00008000000000000000000001-000000067F000080000007600C0000061028__0000009402435A49-0000009446B52FD1", +"000000067F00008000000000000000000001-000000067F000080000007600C000008C52F__00000093786F8001-0000009402435A49", +"000000067F00008000000000000000000001-000000067F000080000007800C000006D445__00000096AEF27399-00000096E85829C9", +"000000067F00008000000000000000000001-000000067F000080000007800C000007B8BC__00000096193A8001-00000096AEF27399", +"000000067F00008000000000000000000001-000000067F000080000007800C00000CD6B6__000000959635F2A9-00000096193A8001", +"000000067F00008000000000000000000001-000000067F000080000007A00C000004B9A5__0000009921E47AA1-000000997F5D23C9", +"000000067F00008000000000000000000001-000000067F000080000007A00C00000F720F__00000098A7ADFC91-0000009921E47AA1", +"000000067F00008000000000000000000001-000000067F000080000007C00C0000052A9D__0000009BCB4E4461-0000009C1E8CC879", +"000000067F00008000000000000000000001-000000067F000080000007C00C00000A9244__0000009A918DF181-0000009B51A8BBB9", +"000000067F00008000000000000000000001-000000067F000080000007C00C00000BA258__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F00008000000000000000000001-000000067F000080000007E00C0000061ADC__0000009E781A9731-0000009EBBC72771", +"000000067F00008000000000000000000001-000000067F000080000007E00C0000093E3A__0000009DEEE6BFF9-0000009E781A9731", +"000000067F00008000000000000000000001-000000067F000080000007E00C00000B2704__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F00008000000000000000000001-000000067F000080000008200C000005D8FE__000000A1E407F839-000000A323C9E001", +"000000067F00008000000000000000000001-000000067F000080000008600C000010ECC4__000000A539BDE561-000000A5A081B661", +"000000067F00008000000000000000000001-000000067F000080000008A00C0000104A0C__000000A91D97FD49-000000A98AB7EE49", +"000000067F00008000000000000000000001-000000067F000080000008C00C000005DA8C__000000AA2597E9A1-000000AB6533BFD9", 
+"000000067F00008000000000000000000001-000000067F000080000008E00C00000BC018__000000AC9601EA19-000000AD36393FE9", +"000000067F00008000000000000000000001-000000067F000080000008E0140000003E33__000000AD36393FE9-000000ADB047EAB9", +"000000067F00008000000000000000000001-000000067F000080000008E022000008E3D1__000000AE6FFFE799-000000AF5D587FE1", +"000000067F00008000000000000000000001-000000067F000080000009003800000C5213__000000B0F3EDEAC9-000000B18495C001", +"000000067F00008000000000000000000001-000000067F000080000009200C000009567A__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F00008000000000000000000001-000000067F000080000009600C00000A93FD__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F00008000000000000000000001-000000067F000080000009600C020000000B__000000B79E68FFF9-000000B808718889", +"000000067F00008000000000000000000001-000000067F000080000009A00C00000794DC__000000BC596B5D59-000000BCEF79BE91", +"000000067F00008000000000000000000001-000000067F000080000009A00C00000D6C06__000000BBE607E8F1-000000BC596B5D59", +"000000067F00008000000000000000000001-000000067F000080000009C00C00000B2921__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F00008000000000000000000001-000000067F000080000009E00C0000050E55__000000C1426D92E1-000000C19744E959", +"000000067F00008000000000000000000001-000000067F000080000009E00C000009FB21__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F00008000000000000000000001-000000067F000080000009E00C00000C0C74__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F00008000000000000000000001-000000067F00008000000A000C000005635B__000000C3E17E01A1-000000C430961E71", +"000000067F00008000000000000000000001-000000067F00008000000A000C00000B8B52__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000000000000000001-000000067F00008000000A000C00000BC072__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000000000000000001-000000067F00008000000A200C00000677D8__000000C689AF4AC1-000000C6C87B6329", +"000000067F00008000000000000000000001-000000067F00008000000A200C00000933F0__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000000000000000001-000000067F00008000000A200C00000BBC1F__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000000000000000001-000000067F00008000000A400C00000C4AE6__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000000000000000001-000000067F00008000000A400C0000107F8F__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000000000000000001-000000067F00008000000A600C0000054BFB__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000000000000000001-000000067F00008000000A600C00001117CB__000000CAD5D7FFF1-000000CB40C16489", +"000000067F00008000000000000000000001-000000067F00008000000A800C00000BCB46__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000000000000000001-000000067F00008000000AA00C0000078E97__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000004E10100000002-000000067F000080000005400C000004BA9C__0000006ACEF98449-0000006C1E7C73C1", +"000000067F00008000000004E10100000002-000000067F000080000005800C0000071854__0000007048B1EC09-00000070E8761431", +"000000067F00008000000004E10200000000-000000067F000080000005600C000004BA9D__0000006D69B48989-0000006EB935F989", +"000000067F00008000000004EB0100000002-000000067F00008000000A400C00000551FC__000000C74849FAE1-000000C80801E859", +"000000067F000080000005200C000006C000-030000000000000000000000000000000002__000000687B67FC58", +"000000067F00008000000520140000028A69-030000000000000000000000000000000002__0000006981B5FDC9-00000069FBEEB099", 
+"000000067F0000800000052014000002C260-030000000000000000000000000000000002__00000069FBEEB099-0000006A5C770149", +"000000067F000080000005400C0000000000-000000067F000080000005400C0000004000__0000006CF69CD8B0", +"000000067F000080000005400C0000004000-000000067F000080000005400C0000008000__0000006CF69CD8B0", +"000000067F000080000005400C0000008000-000000067F000080000005400C000000C000__0000006CF69CD8B0", +"000000067F000080000005400C000000C000-000000067F000080000005400C0000010000__0000006CF69CD8B0", +"000000067F000080000005400C0000010000-000000067F000080000005400C0000014000__0000006CF69CD8B0", +"000000067F000080000005400C0000014000-000000067F000080000005400C0000018000__0000006CF69CD8B0", +"000000067F000080000005400C0000018000-000000067F000080000005400C000001C000__0000006CF69CD8B0", +"000000067F000080000005400C000001C000-000000067F000080000005400C0000020000__0000006CF69CD8B0", +"000000067F000080000005400C0000020000-000000067F000080000005400C0000024000__0000006CF69CD8B0", +"000000067F000080000005400C0000024000-000000067F000080000005400C0000028000__0000006CF69CD8B0", +"000000067F000080000005400C0000028000-000000067F000080000005400C000002C000__0000006CF69CD8B0", +"000000067F000080000005400C000002C000-000000067F000080000005400C0000030000__0000006CF69CD8B0", +"000000067F000080000005400C0000030000-000000067F000080000005400C0000034000__0000006CF69CD8B0", +"000000067F000080000005400C0000034000-000000067F000080000005400C0000038000__0000006CF69CD8B0", +"000000067F000080000005400C0000038000-000000067F000080000005400C000003C000__0000006CF69CD8B0", +"000000067F000080000005400C000003C000-000000067F000080000005400C0000040000__0000006CF69CD8B0", +"000000067F000080000005400C0000040000-000000067F000080000005400C0000044000__0000006CF69CD8B0", +"000000067F000080000005400C0000044000-000000067F000080000005400C0000048000__0000006CF69CD8B0", +"000000067F000080000005400C0000048000-000000067F000080000005400C000004C000__0000006CF69CD8B0", +"000000067F000080000005400C000004B483-000000067F000080000005400C00000967AD__0000006C98B77D29-0000006CF7781D19", +"000000067F000080000005400C000004C000-000000067F000080000005400C0000050000__0000006CF69CD8B0", +"000000067F000080000005400C0000050000-000000067F000080000005400C0000054000__0000006CF69CD8B0", +"000000067F000080000005400C0000054000-000000067F000080000005400C0000058000__0000006CF69CD8B0", +"000000067F000080000005400C0000054000-030000000000000000000000000000000002__0000006AEF261AF8", +"000000067F000080000005400C0000058000-000000067F000080000005400C000005C000__0000006CF69CD8B0", +"000000067F000080000005400C000005C000-000000067F000080000005400C0000060000__0000006CF69CD8B0", +"000000067F000080000005400C0000060000-000000067F000080000005400C0000064000__0000006CF69CD8B0", +"000000067F000080000005400C0000064000-000000067F000080000005400C0000068000__0000006CF69CD8B0", +"000000067F000080000005400C0000068000-000000067F000080000005400C000006C000__0000006CF69CD8B0", +"000000067F000080000005400C000006C000-000000067F000080000005400C0000070000__0000006CF69CD8B0", +"000000067F000080000005400C0000070000-000000067F000080000005400C0000074000__0000006CF69CD8B0", +"000000067F000080000005400C0000074000-000000067F000080000005400C0000078000__0000006CF69CD8B0", +"000000067F000080000005400C0000078000-000000067F000080000005400C000007C000__0000006CF69CD8B0", +"000000067F000080000005400C000007C000-000000067F000080000005400C0000080000__0000006CF69CD8B0", +"000000067F000080000005400C0000080000-000000067F000080000005400C0000084000__0000006CF69CD8B0", 
+"000000067F000080000005400C0000084000-000000067F000080000005400C0000088000__0000006CF69CD8B0", +"000000067F000080000005400C0000088000-000000067F000080000005400C000008C000__0000006CF69CD8B0", +"000000067F000080000005400C000008C000-000000067F000080000005400C0000090000__0000006CF69CD8B0", +"000000067F000080000005400C0000090000-000000067F000080000005400C0000094000__0000006CF69CD8B0", +"000000067F000080000005400C0000094000-000000067F000080000005400C0000098000__0000006CF69CD8B0", +"000000067F000080000005400C00000967BA-000000067F000080000005400C00000E2771__0000006C98B77D29-0000006CF7781D19", +"000000067F000080000005400C0000098000-000000067F000080000005400C000009C000__0000006CF69CD8B0", +"000000067F000080000005400C000009C000-000000067F000080000005400C00000A0000__0000006CF69CD8B0", +"000000067F000080000005400C00000A0000-000000067F000080000005400C00000A4000__0000006CF69CD8B0", +"000000067F000080000005400C00000A4000-000000067F000080000005400C00000A8000__0000006CF69CD8B0", +"000000067F000080000005400C00000A8000-000000067F000080000005400C00000AC000__0000006CF69CD8B0", +"000000067F000080000005400C00000AC000-000000067F000080000005400C00000B0000__0000006CF69CD8B0", +"000000067F000080000005400C00000B0000-000000067F000080000005400C00000B4000__0000006CF69CD8B0", +"000000067F000080000005400C00000B4000-000000067F000080000005400C00000B8000__0000006CF69CD8B0", +"000000067F000080000005400C00000B8000-000000067F000080000005400C00000BC000__0000006CF69CD8B0", +"000000067F000080000005400C00000BC000-000000067F000080000005400C00000C0000__0000006CF69CD8B0", +"000000067F000080000005400C00000C0000-000000067F000080000005400C00000C4000__0000006CF69CD8B0", +"000000067F000080000005400C00000C4000-000000067F000080000005400C00000C8000__0000006CF69CD8B0", +"000000067F000080000005400C00000C8000-000000067F000080000005400C00000CC000__0000006CF69CD8B0", +"000000067F000080000005400C00000CC000-000000067F000080000005400C00000D0000__0000006CF69CD8B0", +"000000067F000080000005400C00000D0000-000000067F000080000005400C00000D4000__0000006CF69CD8B0", +"000000067F000080000005400C00000D4000-000000067F000080000005400C00000D8000__0000006CF69CD8B0", +"000000067F000080000005400C00000D8000-000000067F000080000005400C00000DC000__0000006CF69CD8B0", +"000000067F000080000005400C00000DC000-000000067F000080000005400C00000E0000__0000006CF69CD8B0", +"000000067F000080000005400C00000E0000-000000067F000080000005400C00000E4000__0000006CF69CD8B0", +"000000067F000080000005400C00000E277B-000000067F00008000000540140000005B2E__0000006C98B77D29-0000006CF7781D19", +"000000067F000080000005400C00000E4000-000000067F000080000005400C00000E8000__0000006CF69CD8B0", +"000000067F000080000005400C00000E8000-000000067F000080000005400C00000EC000__0000006CF69CD8B0", +"000000067F000080000005400C00000EC000-000000067F000080000005400C00000F0000__0000006CF69CD8B0", +"000000067F000080000005400C00000F0000-000000067F000080000005400C00000F4000__0000006CF69CD8B0", +"000000067F000080000005400C00000F4000-000000067F000080000005400C00000F8000__0000006CF69CD8B0", +"000000067F000080000005400C00000F8000-000000067F000080000005400C00000FC000__0000006CF69CD8B0", +"000000067F000080000005400C00000FC000-000000067F000080000005400C0000100000__0000006CF69CD8B0", +"000000067F000080000005400C0000100000-000000067F000080000005400C0000104000__0000006CF69CD8B0", +"000000067F000080000005400C0000104000-000000067F000080000005400C0000108000__0000006CF69CD8B0", +"000000067F000080000005400C0000108000-000000067F000080000005400C000010C000__0000006CF69CD8B0", 
+"000000067F000080000005400C000010C000-000000067F000080000005400C0000110000__0000006CF69CD8B0", +"000000067F000080000005400C0000110000-000000067F00008000000540120100000000__0000006CF69CD8B0", +"000000067F000080000005400C0100000000-000000067F00008000000540140000004760__0000006C1E7C73C1-0000006C98B77D29", +"000000067F00008000000540140000004760-000000067F0000800000054014000000BB51__0000006C1E7C73C1-0000006C98B77D29", +"000000067F00008000000540140000005B2F-000000067F0000800000054014000001A04C__0000006C98B77D29-0000006CF7781D19", +"000000067F0000800000054014000000BB51-000000067F00008000000540140000012EFA__0000006C1E7C73C1-0000006C98B77D29", +"000000067F00008000000540140000012EFA-000000067F0000800000054014000001A2E5__0000006C1E7C73C1-0000006C98B77D29", +"000000067F0000800000054014000001A04E-000000067F0000800000054016000000022B__0000006C98B77D29-0000006CF7781D19", +"000000067F0000800000054014000001A2E5-000000067F000080000005401400000216D5__0000006C1E7C73C1-0000006C98B77D29", +"000000067F000080000005401400000216D5-000000067F00008000000540140000028AD9__0000006C1E7C73C1-0000006C98B77D29", +"000000067F00008000000540140000028AD9-030000000000000000000000000000000002__0000006C1E7C73C1-0000006C98B77D29", +"000000067F0000800000054016000000022B-030000000000000000000000000000000002__0000006C98B77D29-0000006CF7781D19", +"000000067F000080000005600C0000000000-000000067F000080000005600C0000004000__0000006DA30DA180", +"000000067F000080000005600C0000000000-000000067F000080000005600C0000004000__0000006F949B7C08", +"000000067F000080000005600C0000004000-000000067F000080000005600C0000008000__0000006DA30DA180", +"000000067F000080000005600C0000004000-000000067F000080000005600C0000008000__0000006F949B7C08", +"000000067F000080000005600C0000008000-000000067F000080000005600C000000C000__0000006DA30DA180", +"000000067F000080000005600C0000008000-000000067F000080000005600C000000C000__0000006F949B7C08", +"000000067F000080000005600C0000008077-000000067F000080000005600C00000117CE__0000006CF7781D19-0000006D69B48989", +"000000067F000080000005600C000000C000-000000067F000080000005600C0000010000__0000006DA30DA180", +"000000067F000080000005600C000000C000-000000067F000080000005600C0000010000__0000006F949B7C08", +"000000067F000080000005600C0000010000-000000067F000080000005600C0000014000__0000006DA30DA180", +"000000067F000080000005600C0000010000-000000067F000080000005600C0000014000__0000006F949B7C08", +"000000067F000080000005600C00000117CE-000000067F000080000005600C000001AF0A__0000006CF7781D19-0000006D69B48989", +"000000067F000080000005600C0000014000-000000067F000080000005600C0000018000__0000006DA30DA180", +"000000067F000080000005600C0000014000-000000067F000080000005600C0000018000__0000006F949B7C08", +"000000067F000080000005600C0000018000-000000067F000080000005600C000001C000__0000006DA30DA180", +"000000067F000080000005600C0000018000-000000067F000080000005600C000001C000__0000006F949B7C08", +"000000067F000080000005600C000001AF0A-000000067F000080000005600C0000024670__0000006CF7781D19-0000006D69B48989", +"000000067F000080000005600C000001C000-000000067F000080000005600C0000020000__0000006DA30DA180", +"000000067F000080000005600C000001C000-000000067F000080000005600C0000020000__0000006F949B7C08", +"000000067F000080000005600C0000020000-000000067F000080000005600C0000024000__0000006DA30DA180", +"000000067F000080000005600C0000020000-000000067F000080000005600C0000024000__0000006F949B7C08", +"000000067F000080000005600C0000024000-000000067F000080000005600C0000028000__0000006DA30DA180", 
+"000000067F000080000005600C0000024000-000000067F000080000005600C0000028000__0000006F949B7C08", +"000000067F000080000005600C0000024670-000000067F000080000005600C000002DDD6__0000006CF7781D19-0000006D69B48989", +"000000067F000080000005600C0000028000-000000067F000080000005600C000002C000__0000006DA30DA180", +"000000067F000080000005600C0000028000-000000067F000080000005600C000002C000__0000006F949B7C08", +"000000067F000080000005600C000002C000-000000067F000080000005600C0000030000__0000006DA30DA180", +"000000067F000080000005600C000002C000-000000067F000080000005600C0000030000__0000006F949B7C08", +"000000067F000080000005600C000002DDD6-000000067F000080000005600C000003752A__0000006CF7781D19-0000006D69B48989", +"000000067F000080000005600C0000030000-000000067F000080000005600C0000034000__0000006DA30DA180", +"000000067F000080000005600C0000030000-000000067F000080000005600C0000034000__0000006F949B7C08", +"000000067F000080000005600C0000034000-000000067F000080000005600C0000038000__0000006DA30DA180", +"000000067F000080000005600C0000034000-000000067F000080000005600C0000038000__0000006F949B7C08", +"000000067F000080000005600C000003752A-000000067F000080000005600C0000040C90__0000006CF7781D19-0000006D69B48989", +"000000067F000080000005600C0000038000-000000067F000080000005600C000003C000__0000006DA30DA180", +"000000067F000080000005600C0000038000-000000067F000080000005600C000003C000__0000006F949B7C08", +"000000067F000080000005600C000003C000-000000067F000080000005600C0000040000__0000006DA30DA180", +"000000067F000080000005600C000003C000-000000067F000080000005600C0000040000__0000006F949B7C08", +"000000067F000080000005600C0000040000-000000067F000080000005600C0000044000__0000006DA30DA180", +"000000067F000080000005600C0000040000-000000067F000080000005600C0000044000__0000006F949B7C08", +"000000067F000080000005600C0000040C90-030000000000000000000000000000000002__0000006CF7781D19-0000006D69B48989", +"000000067F000080000005600C0000044000-000000067F000080000005600C0000048000__0000006DA30DA180", +"000000067F000080000005600C0000044000-000000067F000080000005600C0000048000__0000006F949B7C08", +"000000067F000080000005600C0000048000-000000067F000080000005600C000004C000__0000006DA30DA180", +"000000067F000080000005600C0000048000-000000067F000080000005600C000004C000__0000006F949B7C08", +"000000067F000080000005600C0000048643-000000067F000080000005600C00000907F3__0000006F3370DD59-0000006F95E72491", +"000000067F000080000005600C000004BA9D-000000067F000080000005600C00000551D2__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C000004C000-000000067F000080000005600C0000050000__0000006DA30DA180", +"000000067F000080000005600C000004C000-000000067F000080000005600C0000050000__0000006F949B7C08", +"000000067F000080000005600C0000050000-000000067F000080000005600C0000054000__0000006DA30DA180", +"000000067F000080000005600C0000050000-000000067F000080000005600C0000054000__0000006F949B7C08", +"000000067F000080000005600C0000054000-000000067F000080000005600C0000058000__0000006DA30DA180", +"000000067F000080000005600C0000054000-000000067F000080000005600C0000058000__0000006F949B7C08", +"000000067F000080000005600C00000551D2-000000067F000080000005600C000005E90B__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C0000058000-000000067F000080000005600C000005C000__0000006DA30DA180", +"000000067F000080000005600C0000058000-000000067F000080000005600C000005C000__0000006F949B7C08", +"000000067F000080000005600C000005C000-000000067F000080000005600C0000060000__0000006DA30DA180", 
+"000000067F000080000005600C000005C000-000000067F000080000005600C0000060000__0000006F949B7C08", +"000000067F000080000005600C000005E90B-000000067F000080000005600C000006802B__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C0000060000-000000067F000080000005600C0000064000__0000006DA30DA180", +"000000067F000080000005600C0000060000-000000067F000080000005600C0000064000__0000006F949B7C08", +"000000067F000080000005600C0000064000-000000067F000080000005600C0000068000__0000006F949B7C08", +"000000067F000080000005600C0000064000-030000000000000000000000000000000002__0000006DA30DA180", +"000000067F000080000005600C0000068000-000000067F000080000005600C000006C000__0000006F949B7C08", +"000000067F000080000005600C000006802B-000000067F000080000005600C0000071782__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C000006C000-000000067F000080000005600C0000070000__0000006F949B7C08", +"000000067F000080000005600C0000070000-000000067F000080000005600C0000074000__0000006F949B7C08", +"000000067F000080000005600C0000071782-000000067F000080000005600C000007AEE8__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C0000074000-000000067F000080000005600C0000078000__0000006F949B7C08", +"000000067F000080000005600C0000078000-000000067F000080000005600C000007C000__0000006F949B7C08", +"000000067F000080000005600C000007AEE8-000000067F000080000005600C000008460B__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C000007C000-000000067F000080000005600C0000080000__0000006F949B7C08", +"000000067F000080000005600C0000080000-000000067F000080000005600C0000084000__0000006F949B7C08", +"000000067F000080000005600C0000084000-000000067F000080000005600C0000088000__0000006F949B7C08", +"000000067F000080000005600C000008460B-000000067F000080000005600C000008DD71__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C0000088000-000000067F000080000005600C000008C000__0000006F949B7C08", +"000000067F000080000005600C000008C000-000000067F000080000005600C0000090000__0000006F949B7C08", +"000000067F000080000005600C000008DD71-000000067F000080000005600C00000974D7__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C0000090000-000000067F000080000005600C0000094000__0000006F949B7C08", +"000000067F000080000005600C00000907F5-000000067F000080000005600C00000D90E0__0000006F3370DD59-0000006F95E72491", +"000000067F000080000005600C0000094000-000000067F000080000005600C0000098000__0000006F949B7C08", +"000000067F000080000005600C00000974D7-000000067F000080000005600C00000A0C0B__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C0000098000-000000067F000080000005600C000009C000__0000006F949B7C08", +"000000067F000080000005600C000009C000-000000067F000080000005600C00000A0000__0000006F949B7C08", +"000000067F000080000005600C00000A0000-000000067F000080000005600C00000A4000__0000006F949B7C08", +"000000067F000080000005600C00000A0C0B-000000067F000080000005600C00000AA371__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000A4000-000000067F000080000005600C00000A8000__0000006F949B7C08", +"000000067F000080000005600C00000A8000-000000067F000080000005600C00000AC000__0000006F949B7C08", +"000000067F000080000005600C00000AA371-000000067F000080000005600C00000B3AD7__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000AC000-000000067F000080000005600C00000B0000__0000006F949B7C08", +"000000067F000080000005600C00000B0000-000000067F000080000005600C00000B4000__0000006F949B7C08", +"000000067F000080000005600C00000B3AD7-000000067F000080000005600C00000BD20B__0000006D69B48989-0000006EB935F989", 
+"000000067F000080000005600C00000B4000-000000067F000080000005600C00000B8000__0000006F949B7C08", +"000000067F000080000005600C00000B8000-000000067F000080000005600C00000BC000__0000006F949B7C08", +"000000067F000080000005600C00000BC000-000000067F000080000005600C00000C0000__0000006F949B7C08", +"000000067F000080000005600C00000BD20B-000000067F000080000005600C00000C6932__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000C0000-000000067F000080000005600C00000C4000__0000006F949B7C08", +"000000067F000080000005600C00000C4000-000000067F000080000005600C00000C8000__0000006F949B7C08", +"000000067F000080000005600C00000C6932-000000067F000080000005600C00000D0098__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000C8000-000000067F000080000005600C00000CC000__0000006F949B7C08", +"000000067F000080000005600C00000CC000-000000067F000080000005600C00000D0000__0000006F949B7C08", +"000000067F000080000005600C00000D0000-000000067F000080000005600C00000D4000__0000006F949B7C08", +"000000067F000080000005600C00000D0098-000000067F000080000005600C00000D97FE__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000D4000-000000067F000080000005600C00000D8000__0000006F949B7C08", +"000000067F000080000005600C00000D8000-000000067F000080000005600C00000DC000__0000006F949B7C08", +"000000067F000080000005600C00000D90F8-000000067F00008000000560140000002A9A__0000006F3370DD59-0000006F95E72491", +"000000067F000080000005600C00000D97FE-000000067F000080000005600C00000E2F0B__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000DC000-000000067F000080000005600C00000E0000__0000006F949B7C08", +"000000067F000080000005600C00000E0000-000000067F000080000005600C00000E4000__0000006F949B7C08", +"000000067F000080000005600C00000E2F0B-000000067F000080000005600C00000EC671__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000E4000-000000067F000080000005600C00000E8000__0000006F949B7C08", +"000000067F000080000005600C00000E8000-000000067F000080000005600C00000EC000__0000006F949B7C08", +"000000067F000080000005600C00000EC000-000000067F000080000005600C00000F0000__0000006F949B7C08", +"000000067F000080000005600C00000EC671-000000067F000080000005600C00000F5D9F__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000F0000-000000067F000080000005600C00000F4000__0000006F949B7C08", +"000000067F000080000005600C00000F4000-000000067F000080000005600C00000F8000__0000006F949B7C08", +"000000067F000080000005600C00000F5D9F-000000067F000080000005600C00000FF505__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000F8000-000000067F000080000005600C00000FC000__0000006F949B7C08", +"000000067F000080000005600C00000FC000-000000067F000080000005600C0000100000__0000006F949B7C08", +"000000067F000080000005600C00000FF505-000000067F000080000005600C0000108C10__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C0000100000-000000067F000080000005600C0000104000__0000006F949B7C08", +"000000067F000080000005600C0000100001-000000067F000080000005600C0000111BF7__0000006EB935F989-0000006F3370DD59", +"000000067F000080000005600C0000104000-000000067F000080000005600C0000108000__0000006F949B7C08", +"000000067F000080000005600C0000108000-000000067F000080000005600C000010C000__0000006F949B7C08", +"000000067F000080000005600C0000108C10-000000067F000080000005600C0100000000__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C000010C000-000000067F000080000005600C0000110000__0000006F949B7C08", +"000000067F000080000005600C0000110000-000000067F00008000000560120100000000__0000006F949B7C08", 
+"000000067F000080000005600C0000111BF7-000000067F0000800000056014000000451D__0000006EB935F989-0000006F3370DD59", +"000000067F00008000000560140000002A9A-000000067F00008000000560140000016143__0000006F3370DD59-0000006F95E72491", +"000000067F0000800000056014000000451D-000000067F0000800000056014000000B9A7__0000006EB935F989-0000006F3370DD59", +"000000067F0000800000056014000000B9A7-000000067F00008000000560140000012DE3__0000006EB935F989-0000006F3370DD59", +"000000067F00008000000560140000012DE3-000000067F0000800000056014000001A213__0000006EB935F989-0000006F3370DD59", +"000000067F00008000000560140000016143-000000067F00008000000560140000029CE0__0000006F3370DD59-0000006F95E72491", +"000000067F0000800000056014000001A213-000000067F00008000000560140000021666__0000006EB935F989-0000006F3370DD59", +"000000067F00008000000560140000021666-000000067F00008000000560140000028A7C__0000006EB935F989-0000006F3370DD59", +"000000067F00008000000560140000028A7C-030000000000000000000000000000000002__0000006EB935F989-0000006F3370DD59", +"000000067F00008000000560140000029CE2-030000000000000000000000000000000002__0000006F3370DD59-0000006F95E72491", +"000000067F000080000005800C0000000000-000000067F000080000005800C0000004000__0000006FAFE25518", +"000000067F000080000005800C0000000000-000000067F000080000005800C0000004000__00000071F15CF6B0", +"000000067F000080000005800C0000004000-000000067F000080000005800C0000008000__0000006FAFE25518", +"000000067F000080000005800C0000004000-000000067F000080000005800C0000008000__00000071F15CF6B0", +"000000067F000080000005800C0000007A49-030000000000000000000000000000000002__0000006F95E72491-0000006FA8EDF3B9", +"000000067F000080000005800C0000008000-000000067F000080000005800C000000C000__0000006FAFE25518", +"000000067F000080000005800C0000008000-000000067F000080000005800C000000C000__0000007168C9DFF8", +"000000067F000080000005800C0000008000-000000067F000080000005800C000000C000__00000072377CDB60", +"000000067F000080000005800C00000096DE-000000067F000080000005800C0000012E0C__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C000000C000-000000067F000080000005800C0000010000__0000007168C9DFF8", +"000000067F000080000005800C000000C000-000000067F000080000005800C0000010000__00000072377CDB60", +"000000067F000080000005800C000000C000-030000000000000000000000000000000002__0000006FAFE25518", +"000000067F000080000005800C0000010000-000000067F000080000005800C0000014000__0000007168C9DFF8", +"000000067F000080000005800C0000010000-000000067F000080000005800C0000014000__00000072377CDB60", +"000000067F000080000005800C0000012E0C-000000067F000080000005800C000001C572__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000014000-000000067F000080000005800C0000018000__0000007168C9DFF8", +"000000067F000080000005800C0000014000-000000067F000080000005800C0000018000__00000072377CDB60", +"000000067F000080000005800C0000018000-000000067F000080000005800C000001C000__0000007168C9DFF8", +"000000067F000080000005800C0000018000-000000067F000080000005800C000001C000__00000072377CDB60", +"000000067F000080000005800C000001C000-000000067F000080000005800C0000020000__0000007168C9DFF8", +"000000067F000080000005800C000001C000-000000067F000080000005800C0000020000__00000072377CDB60", +"000000067F000080000005800C000001C572-000000067F000080000005800C0000025CD8__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000020000-000000067F000080000005800C0000024000__0000007168C9DFF8", +"000000067F000080000005800C0000020000-000000067F000080000005800C0000024000__00000072377CDB60", 
+"000000067F000080000005800C0000024000-000000067F000080000005800C0000028000__0000007168C9DFF8", +"000000067F000080000005800C0000024000-000000067F000080000005800C0000028000__00000072377CDB60", +"000000067F000080000005800C0000025CD8-000000067F000080000005800C000002F40B__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000028000-000000067F000080000005800C000002C000__0000007168C9DFF8", +"000000067F000080000005800C0000028000-000000067F000080000005800C000002C000__00000072377CDB60", +"000000067F000080000005800C000002C000-000000067F000080000005800C0000030000__0000007168C9DFF8", +"000000067F000080000005800C000002C000-000000067F000080000005800C0000030000__00000072377CDB60", +"000000067F000080000005800C000002F40B-000000067F000080000005800C0000038B1E__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000030000-000000067F000080000005800C0000034000__0000007168C9DFF8", +"000000067F000080000005800C0000030000-000000067F000080000005800C0000034000__00000072377CDB60", +"000000067F000080000005800C0000034000-000000067F000080000005800C0000038000__0000007168C9DFF8", +"000000067F000080000005800C0000034000-000000067F000080000005800C0000038000__00000072377CDB60", +"000000067F000080000005800C0000038000-000000067F000080000005800C000003C000__0000007168C9DFF8", +"000000067F000080000005800C0000038000-000000067F000080000005800C000003C000__00000072377CDB60", +"000000067F000080000005800C0000038B1E-000000067F000080000005800C0000042284__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C000003C000-000000067F000080000005800C0000040000__0000007168C9DFF8", +"000000067F000080000005800C000003C000-000000067F000080000005800C0000040000__00000072377CDB60", +"000000067F000080000005800C0000040000-000000067F000080000005800C0000044000__0000007168C9DFF8", +"000000067F000080000005800C0000040000-000000067F000080000005800C0000044000__00000072377CDB60", +"000000067F000080000005800C0000042284-000000067F000080000005800C000004B9EA__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000044000-000000067F000080000005800C0000048000__0000007168C9DFF8", +"000000067F000080000005800C0000044000-000000067F000080000005800C0000048000__00000072377CDB60", +"000000067F000080000005800C0000048000-000000067F000080000005800C000004C000__0000007168C9DFF8", +"000000067F000080000005800C0000048000-000000067F000080000005800C000004C000__00000072377CDB60", +"000000067F000080000005800C000004B9EA-000000067F000080000005800C000005510B__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C000004C000-000000067F000080000005800C0000050000__0000007168C9DFF8", +"000000067F000080000005800C000004C000-000000067F000080000005800C0000050000__00000072377CDB60", +"000000067F000080000005800C0000050000-000000067F000080000005800C0000054000__0000007168C9DFF8", +"000000067F000080000005800C0000050000-000000067F000080000005800C0000054000__00000072377CDB60", +"000000067F000080000005800C0000054000-000000067F000080000005800C0000058000__0000007168C9DFF8", +"000000067F000080000005800C0000054000-000000067F000080000005800C0000058000__00000072377CDB60", +"000000067F000080000005800C000005510B-000000067F000080000005800C000005E871__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000058000-000000067F000080000005800C000005C000__0000007168C9DFF8", +"000000067F000080000005800C0000058000-000000067F000080000005800C000005C000__00000072377CDB60", +"000000067F000080000005800C000005C000-000000067F000080000005800C0000060000__0000007168C9DFF8", 
+"000000067F000080000005800C000005C000-000000067F000080000005800C0000060000__00000072377CDB60", +"000000067F000080000005800C000005CF08-000000067F000080000005800C00000BAF56__00000071F21624D1-000000723877FF21", +"000000067F000080000005800C000005E871-000000067F000080000005800C0000067F8B__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000060000-000000067F000080000005800C0000064000__0000007168C9DFF8", +"000000067F000080000005800C0000060000-000000067F000080000005800C0000064000__00000072377CDB60", +"000000067F000080000005800C0000064000-000000067F000080000005800C0000068000__0000007168C9DFF8", +"000000067F000080000005800C0000064000-000000067F000080000005800C0000068000__00000072377CDB60", +"000000067F000080000005800C0000067F8B-000000067F000080000005800C0100000000__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000068000-000000067F000080000005800C000006C000__0000007168C9DFF8", +"000000067F000080000005800C0000068000-000000067F000080000005800C000006C000__00000072377CDB60", +"000000067F000080000005800C000006C000-000000067F000080000005800C0000070000__0000007168C9DFF8", +"000000067F000080000005800C000006C000-000000067F000080000005800C0000070000__00000072377CDB60", +"000000067F000080000005800C0000070000-000000067F000080000005800C0000074000__0000007168C9DFF8", +"000000067F000080000005800C0000070000-000000067F000080000005800C0000074000__00000072377CDB60", +"000000067F000080000005800C0000071854-000000067F000080000005800C000007AFBA__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C0000074000-000000067F000080000005800C0000078000__0000007168C9DFF8", +"000000067F000080000005800C0000074000-000000067F000080000005800C0000078000__00000072377CDB60", +"000000067F000080000005800C0000078000-000000067F000080000005800C000007C000__0000007168C9DFF8", +"000000067F000080000005800C0000078000-000000067F000080000005800C000007C000__00000072377CDB60", +"000000067F000080000005800C000007AFBA-000000067F000080000005800C0000084720__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C000007C000-000000067F000080000005800C0000080000__0000007168C9DFF8", +"000000067F000080000005800C000007C000-000000067F000080000005800C0000080000__00000072377CDB60", +"000000067F000080000005800C0000080000-000000067F000080000005800C0000084000__0000007168C9DFF8", +"000000067F000080000005800C0000080000-000000067F000080000005800C0000084000__00000072377CDB60", +"000000067F000080000005800C0000084000-000000067F000080000005800C0000088000__0000007168C9DFF8", +"000000067F000080000005800C0000084000-000000067F000080000005800C0000088000__00000072377CDB60", +"000000067F000080000005800C0000084720-000000067F000080000005800C000008DE86__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C0000088000-000000067F000080000005800C000008C000__0000007168C9DFF8", +"000000067F000080000005800C0000088000-000000067F000080000005800C000008C000__00000072377CDB60", +"000000067F000080000005800C000008C000-000000067F000080000005800C0000090000__0000007168C9DFF8", +"000000067F000080000005800C000008C000-000000067F000080000005800C0000090000__00000072377CDB60", +"000000067F000080000005800C000008DE86-000000067F000080000005800C00000975A6__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C0000090000-000000067F000080000005800C0000094000__0000007168C9DFF8", +"000000067F000080000005800C0000090000-000000067F000080000005800C0000094000__00000072377CDB60", +"000000067F000080000005800C0000094000-000000067F000080000005800C0000098000__0000007168C9DFF8", 
+"000000067F000080000005800C0000094000-000000067F000080000005800C0000098000__00000072377CDB60", +"000000067F000080000005800C00000975A6-000000067F000080000005800C00000A0D0C__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C0000098000-000000067F000080000005800C000009C000__0000007168C9DFF8", +"000000067F000080000005800C0000098000-000000067F000080000005800C000009C000__00000072377CDB60", +"000000067F000080000005800C000009C000-000000067F000080000005800C00000A0000__0000007168C9DFF8", +"000000067F000080000005800C000009C000-000000067F000080000005800C00000A0000__00000072377CDB60", +"000000067F000080000005800C000009D78D-000000067F000080000005800C0200000018__000000716A103FC9-00000071F21624D1", +"000000067F000080000005800C00000A0000-000000067F000080000005800C00000A4000__0000007168C9DFF8", +"000000067F000080000005800C00000A0000-000000067F000080000005800C00000A4000__00000072377CDB60", +"000000067F000080000005800C00000A0D0C-000000067F000080000005800C00000AA472__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C00000A4000-000000067F000080000005800C00000A8000__0000007168C9DFF8", +"000000067F000080000005800C00000A4000-000000067F000080000005800C00000A8000__00000072377CDB60", +"000000067F000080000005800C00000A8000-000000067F000080000005800C00000AC000__0000007168C9DFF8", +"000000067F000080000005800C00000A8000-000000067F000080000005800C00000AC000__00000072377CDB60", +"000000067F000080000005800C00000AA472-000000067F000080000005800C00000B3BB4__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C00000AC000-000000067F000080000005800C00000B0000__0000007168C9DFF8", +"000000067F000080000005800C00000AC000-000000067F000080000005800C00000B0000__00000072377CDB60", +"000000067F000080000005800C00000B0000-000000067F000080000005800C00000B4000__0000007168C9DFF8", +"000000067F000080000005800C00000B0000-000000067F000080000005800C00000B4000__00000072377CDB60", +"000000067F000080000005800C00000B3BB4-000000067F000080000005800C00000BD30B__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C00000B4000-000000067F000080000005800C00000B8000__0000007168C9DFF8", +"000000067F000080000005800C00000B4000-000000067F000080000005800C00000B8000__00000072377CDB60", +"000000067F000080000005800C00000B8000-000000067F000080000005800C00000BC000__0000007168C9DFF8", +"000000067F000080000005800C00000B8000-000000067F000080000005800C00000BC000__00000072377CDB60", +"000000067F000080000005800C00000BAF5F-000000067F000080000005801400000007C1__00000071F21624D1-000000723877FF21", +"000000067F000080000005800C00000BC000-000000067F000080000005800C00000C0000__0000007168C9DFF8", +"000000067F000080000005800C00000BC000-000000067F000080000005800C00000C0000__00000072377CDB60", +"000000067F000080000005800C00000BD30B-000000067F000080000005800C00000C6A32__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C00000C0000-000000067F000080000005800C00000C4000__0000007168C9DFF8", +"000000067F000080000005800C00000C0000-000000067F000080000005800C00000C4000__00000072377CDB60", +"000000067F000080000005800C00000C4000-000000067F000080000005800C00000C8000__0000007168C9DFF8", +"000000067F000080000005800C00000C4000-000000067F000080000005800C00000C8000__00000072377CDB60", +"000000067F000080000005800C00000C6A32-000000067F000080000005800C0100000000__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C00000C8000-000000067F000080000005800C00000CC000__0000007168C9DFF8", +"000000067F000080000005800C00000C8000-000000067F000080000005800C00000CC000__00000072377CDB60", 
+"000000067F000080000005800C00000CC000-000000067F000080000005800C00000D0000__0000007168C9DFF8", +"000000067F000080000005800C00000CC000-000000067F000080000005800C00000D0000__00000072377CDB60", +"000000067F000080000005800C00000CDE2D-000000067F000080000005800C00000D754D__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C00000D0000-000000067F000080000005800C00000D4000__0000007168C9DFF8", +"000000067F000080000005800C00000D0000-000000067F000080000005800C00000D4000__00000072377CDB60", +"000000067F000080000005800C00000D4000-000000067F000080000005800C00000D8000__0000007168C9DFF8", +"000000067F000080000005800C00000D4000-000000067F000080000005800C00000D8000__00000072377CDB60", +"000000067F000080000005800C00000D754D-000000067F000080000005800C00000E0CB3__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C00000D8000-000000067F000080000005800C00000DC000__0000007168C9DFF8", +"000000067F000080000005800C00000D8000-000000067F000080000005800C00000DC000__00000072377CDB60", +"000000067F000080000005800C00000DC000-000000067F000080000005800C00000E0000__0000007168C9DFF8", +"000000067F000080000005800C00000DC000-000000067F000080000005800C00000E0000__00000072377CDB60", +"000000067F000080000005800C00000E0000-000000067F000080000005800C00000E4000__0000007168C9DFF8", +"000000067F000080000005800C00000E0000-000000067F000080000005800C00000E4000__00000072377CDB60", +"000000067F000080000005800C00000E0CB3-000000067F000080000005800C00000EA409__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C00000E4000-000000067F000080000005800C00000E8000__0000007168C9DFF8", +"000000067F000080000005800C00000E4000-000000067F000080000005800C00000E8000__00000072377CDB60", +"000000067F000080000005800C00000E8000-000000067F000080000005800C00000EC000__0000007168C9DFF8", +"000000067F000080000005800C00000E8000-000000067F000080000005800C00000EC000__00000072377CDB60", +"000000067F000080000005800C00000EA409-000000067F000080000005800C00000F3B4B__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C00000EC000-000000067F000080000005800C00000F0000__0000007168C9DFF8", +"000000067F000080000005800C00000EC000-000000067F000080000005800C00000F0000__00000072377CDB60", +"000000067F000080000005800C00000F0000-000000067F000080000005800C00000F4000__0000007168C9DFF8", +"000000067F000080000005800C00000F0000-000000067F000080000005800C00000F4000__00000072377CDB60", +"000000067F000080000005800C00000F3B4B-000000067F000080000005800C00000FD2B1__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C00000F4000-000000067F000080000005800C00000F8000__0000007168C9DFF8", +"000000067F000080000005800C00000F4000-000000067F000080000005800C00000F8000__00000072377CDB60", +"000000067F000080000005800C00000F8000-000000067F000080000005800C00000FC000__0000007168C9DFF8", +"000000067F000080000005800C00000F8000-000000067F000080000005800C00000FC000__00000072377CDB60", +"000000067F000080000005800C00000FC000-000000067F000080000005800C0000100000__0000007168C9DFF8", +"000000067F000080000005800C00000FC000-000000067F000080000005800C0000100000__00000072377CDB60", +"000000067F000080000005800C00000FD2B1-000000067F000080000005800C00001069D8__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C0000100000-000000067F000080000005800C0000104000__0000007168C9DFF8", +"000000067F000080000005800C0000100000-000000067F000080000005800C0000104000__00000072377CDB60", +"000000067F000080000005800C0000104000-000000067F000080000005800C0000108000__0000007168C9DFF8", 
+"000000067F000080000005800C0000104000-000000067F000080000005800C0000108000__00000072377CDB60", +"000000067F000080000005800C00001069D8-000000067F000080000005800C000011010C__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C0000108000-000000067F000080000005800C000010C000__0000007168C9DFF8", +"000000067F000080000005800C0000108000-000000067F000080000005800C000010C000__00000072377CDB60", +"000000067F000080000005800C000010C000-000000067F000080000005800C0000110000__0000007168C9DFF8", +"000000067F000080000005800C000010C000-000000067F000080000005800C0000110000__00000072377CDB60", +"000000067F000080000005800C0000110000-000000067F00008000000580120100000000__00000072377CDB60", +"000000067F000080000005800C0000110000-030000000000000000000000000000000002__0000007168C9DFF8", +"000000067F000080000005800C000011010C-01000000000000000100000002000000001E__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C0200000018-000000067F000080000005801400000059BE__000000716A103FC9-00000071F21624D1", +"000000067F00008000000580140000000000-000000067F00008000000580140000004000__00000072377CDB60", +"000000067F000080000005801400000007C3-000000067F00008000000580140000020462__00000071F21624D1-000000723877FF21", +"000000067F00008000000580140000004000-000000067F00008000000580140000008000__00000072377CDB60", +"000000067F000080000005801400000059BE-000000067F0000800000058014000000BF38__000000716A103FC9-00000071F21624D1", +"000000067F00008000000580140000008000-000000067F0000800000058014000000C000__00000072377CDB60", +"000000067F0000800000058014000000BF38-000000067F00008000000580140000012530__000000716A103FC9-00000071F21624D1", +"000000067F0000800000058014000000C000-000000067F00008000000580140000010000__00000072377CDB60", +"000000067F00008000000580140000010000-000000067F00008000000580140000014000__00000072377CDB60", +"000000067F00008000000580140000012530-000000067F00008000000580140000018B50__000000716A103FC9-00000071F21624D1", +"000000067F00008000000580140000014000-000000067F00008000000580140000018000__00000072377CDB60", +"000000067F00008000000580140000018000-000000067F0000800000058014000001C000__00000072377CDB60", +"000000067F00008000000580140000018B50-000000067F0000800000058014000001F0D3__000000716A103FC9-00000071F21624D1", +"000000067F0000800000058014000001C000-000000067F00008000000580140000020000__00000072377CDB60", +"000000067F0000800000058014000001F0D3-000000067F0000800000058014000002562B__000000716A103FC9-00000071F21624D1", +"000000067F00008000000580140000020000-000000067F00008000000580140000024000__00000072377CDB60", +"000000067F00008000000580140000020464-030000000000000000000000000000000002__00000071F21624D1-000000723877FF21", +"000000067F00008000000580140000024000-000000067F00008000000580140000028000__00000072377CDB60", +"000000067F0000800000058014000002562B-000000067F0000800000058014000002BC37__000000716A103FC9-00000071F21624D1", +"000000067F00008000000580140000028000-000000067F0000800000058014000002C000__00000072377CDB60", +"000000067F0000800000058014000002BC37-030000000000000000000000000000000002__000000716A103FC9-00000071F21624D1", +"000000067F0000800000058014000002C000-030000000000000000000000000000000002__00000072377CDB60", +"000000067F000080000005A00C0000007614-000000067F000080000005A00C000000ED44__000000723877FF21-00000072A0D7CEA1", +"000000067F000080000005A00C000000ED44-000000067F000080000005A00C0000016337__000000723877FF21-00000072A0D7CEA1", +"000000067F000080000005A00C0000016337-000000067F000080000005A014000000148C__000000723877FF21-00000072A0D7CEA1", 
+"000000067F000080000005A014000000148C-000000067F000080000005C00C0000003207__000000723877FF21-00000072A0D7CEA1", +"000000067F000080000005C00C0000003207-000000067F000080000005C00C000000C96D__000000723877FF21-00000072A0D7CEA1", +"000000067F000080000005C00C000000C96D-030000000000000000000000000000000002__000000723877FF21-00000072A0D7CEA1", +"000000067F000080000005C00C0000016516-000000067F000080000005C0140000001694__00000072A0D7CEA1-0000007318DDE691", +"000000067F000080000005C0140000001694-000000067F000080000005E00C000000360C__00000072A0D7CEA1-0000007318DDE691", +"000000067F000080000005E00C0000000000-000000067F000080000005E00C0000004000__00000073AF75E930", +"000000067F000080000005E00C0000000000-000000067F000080000005E00C0000004000__000000756884A510", +"000000067F000080000005E00C000000360C-000000067F000080000005E00C000000CD72__00000072A0D7CEA1-0000007318DDE691", +"000000067F000080000005E00C0000004000-000000067F000080000005E00C0000008000__00000073AF75E930", +"000000067F000080000005E00C0000004000-000000067F000080000005E00C0000008000__000000756884A510", +"000000067F000080000005E00C0000008000-000000067F000080000005E00C000000C000__00000073AF75E930", +"000000067F000080000005E00C0000008000-000000067F000080000005E00C000000C000__000000756884A510", +"000000067F000080000005E00C000000C000-000000067F000080000005E00C0000010000__00000073AF75E930", +"000000067F000080000005E00C000000C000-000000067F000080000005E00C0000010000__000000756884A510", +"000000067F000080000005E00C000000CD72-000000067F000080000005E00C00000164D8__00000072A0D7CEA1-0000007318DDE691", +"000000067F000080000005E00C0000010000-000000067F000080000005E00C0000014000__00000073AF75E930", +"000000067F000080000005E00C0000010000-000000067F000080000005E00C0000014000__000000756884A510", +"000000067F000080000005E00C0000014000-000000067F000080000005E00C0000018000__00000073AF75E930", +"000000067F000080000005E00C0000014000-000000067F000080000005E00C0000018000__000000756884A510", +"000000067F000080000005E00C00000164D8-000000067F000080000005E00C000001FC0B__00000072A0D7CEA1-0000007318DDE691", +"000000067F000080000005E00C0000018000-000000067F000080000005E00C000001C000__00000073AF75E930", +"000000067F000080000005E00C0000018000-000000067F000080000005E00C000001C000__000000756884A510", +"000000067F000080000005E00C000001C000-000000067F000080000005E00C0000020000__00000073AF75E930", +"000000067F000080000005E00C000001C000-000000067F000080000005E00C0000020000__000000756884A510", +"000000067F000080000005E00C000001FC0B-000000067F000080000005E00C0000029319__00000072A0D7CEA1-0000007318DDE691", +"000000067F000080000005E00C0000020000-000000067F000080000005E00C0000024000__00000073AF75E930", +"000000067F000080000005E00C0000020000-000000067F000080000005E00C0000024000__000000756884A510", +"000000067F000080000005E00C0000024000-000000067F000080000005E00C0000028000__00000073AF75E930", +"000000067F000080000005E00C0000024000-000000067F000080000005E00C0000028000__000000756884A510", +"000000067F000080000005E00C0000028000-000000067F000080000005E00C000002C000__00000073AF75E930", +"000000067F000080000005E00C0000028000-000000067F000080000005E00C000002C000__000000756884A510", +"000000067F000080000005E00C0000029319-030000000000000000000000000000000002__00000072A0D7CEA1-0000007318DDE691", +"000000067F000080000005E00C000002C000-000000067F000080000005E00C0000030000__00000073AF75E930", +"000000067F000080000005E00C000002C000-000000067F000080000005E00C0000030000__000000756884A510", +"000000067F000080000005E00C0000030000-000000067F000080000005E00C0000034000__00000073AF75E930", 
+"000000067F000080000005E00C0000030000-000000067F000080000005E00C0000034000__000000756884A510", +"000000067F000080000005E00C0000034000-000000067F000080000005E00C0000038000__00000073AF75E930", +"000000067F000080000005E00C0000034000-000000067F000080000005E00C0000038000__000000756884A510", +"000000067F000080000005E00C0000038000-000000067F000080000005E00C000003C000__00000073AF75E930", +"000000067F000080000005E00C0000038000-000000067F000080000005E00C000003C000__000000756884A510", +"000000067F000080000005E00C00000385D9-000000067F000080000005E00C0000041D0A__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C000003C000-000000067F000080000005E00C0000040000__00000073AF75E930", +"000000067F000080000005E00C000003C000-000000067F000080000005E00C0000040000__000000756884A510", +"000000067F000080000005E00C0000040000-000000067F000080000005E00C0000044000__00000073AF75E930", +"000000067F000080000005E00C0000040000-000000067F000080000005E00C0000044000__000000756884A510", +"000000067F000080000005E00C0000041D0A-000000067F000080000005E00C000004B470__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000044000-000000067F000080000005E00C0000048000__00000073AF75E930", +"000000067F000080000005E00C0000044000-000000067F000080000005E00C0000048000__000000756884A510", +"000000067F000080000005E00C0000048000-000000067F000080000005E00C000004C000__00000073AF75E930", +"000000067F000080000005E00C0000048000-000000067F000080000005E00C000004C000__000000756884A510", +"000000067F000080000005E00C000004B470-000000067F000080000005E00C0000054BA9__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C000004C000-000000067F000080000005E00C0000050000__00000073AF75E930", +"000000067F000080000005E00C000004C000-000000067F000080000005E00C0000050000__000000756884A510", +"000000067F000080000005E00C0000050000-000000067F000080000005E00C0000054000__00000073AF75E930", +"000000067F000080000005E00C0000050000-000000067F000080000005E00C0000054000__000000756884A510", +"000000067F000080000005E00C000005017A-000000067F000080000005E00C000009FEAD__000000751253A4C1-00000075687C3009", +"000000067F000080000005E00C0000054000-000000067F000080000005E00C0000058000__00000073AF75E930", +"000000067F000080000005E00C0000054000-000000067F000080000005E00C0000058000__000000756884A510", +"000000067F000080000005E00C0000054BA9-000000067F000080000005E00C000005E30B__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000058000-000000067F000080000005E00C000005C000__00000073AF75E930", +"000000067F000080000005E00C0000058000-000000067F000080000005E00C000005C000__000000756884A510", +"000000067F000080000005E00C000005C000-000000067F000080000005E00C0000060000__00000073AF75E930", +"000000067F000080000005E00C000005C000-000000067F000080000005E00C0000060000__000000756884A510", +"000000067F000080000005E00C000005E30B-000000067F000080000005E00C0000067A2C__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000060000-000000067F000080000005E00C0000064000__00000073AF75E930", +"000000067F000080000005E00C0000060000-000000067F000080000005E00C0000064000__000000756884A510", +"000000067F000080000005E00C0000064000-000000067F000080000005E00C0000068000__00000073AF75E930", +"000000067F000080000005E00C0000064000-000000067F000080000005E00C0000068000__000000756884A510", +"000000067F000080000005E00C0000067A2C-000000067F000080000005E00C0000071187__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000068000-000000067F000080000005E00C000006C000__00000073AF75E930", 
+"000000067F000080000005E00C0000068000-000000067F000080000005E00C000006C000__000000756884A510", +"000000067F000080000005E00C000006C000-000000067F000080000005E00C0000070000__00000073AF75E930", +"000000067F000080000005E00C000006C000-000000067F000080000005E00C0000070000__000000756884A510", +"000000067F000080000005E00C0000070000-000000067F000080000005E00C0000074000__00000073AF75E930", +"000000067F000080000005E00C0000070000-000000067F000080000005E00C0000074000__000000756884A510", +"000000067F000080000005E00C0000071187-000000067F000080000005E00C000007A8ED__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000074000-000000067F000080000005E00C0000078000__00000073AF75E930", +"000000067F000080000005E00C0000074000-000000067F000080000005E00C0000078000__000000756884A510", +"000000067F000080000005E00C0000078000-000000067F000080000005E00C000007C000__00000073AF75E930", +"000000067F000080000005E00C0000078000-000000067F000080000005E00C000007C000__000000756884A510", +"000000067F000080000005E00C000007A8ED-000000067F000080000005E00C000008400B__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C000007C000-000000067F000080000005E00C0000080000__00000073AF75E930", +"000000067F000080000005E00C000007C000-000000067F000080000005E00C0000080000__000000756884A510", +"000000067F000080000005E00C0000080000-000000067F000080000005E00C0000084000__00000073AF75E930", +"000000067F000080000005E00C0000080000-000000067F000080000005E00C0000084000__000000756884A510", +"000000067F000080000005E00C0000084000-000000067F000080000005E00C0000088000__00000073AF75E930", +"000000067F000080000005E00C0000084000-000000067F000080000005E00C0000088000__000000756884A510", +"000000067F000080000005E00C000008400B-000000067F000080000005E00C000008D771__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000088000-000000067F000080000005E00C000008C000__000000756884A510", +"000000067F000080000005E00C0000088000-030000000000000000000000000000000002__00000073AF75E930", +"000000067F000080000005E00C000008C000-000000067F000080000005E00C0000090000__000000756884A510", +"000000067F000080000005E00C000008D771-000000067F000080000005E00C0000096ED7__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000090000-000000067F000080000005E00C0000094000__000000756884A510", +"000000067F000080000005E00C0000094000-000000067F000080000005E00C0000098000__000000756884A510", +"000000067F000080000005E00C0000096ED7-000000067F000080000005E00C00000A060B__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000098000-000000067F000080000005E00C000009C000__000000756884A510", +"000000067F000080000005E00C000009C000-000000067F000080000005E00C00000A0000__000000756884A510", +"000000067F000080000005E00C000009FEB2-000000067F000080000005E00C00000EF4ED__000000751253A4C1-00000075687C3009", +"000000067F000080000005E00C00000A0000-000000067F000080000005E00C00000A4000__000000756884A510", +"000000067F000080000005E00C00000A060B-000000067F000080000005E00C00000A9D71__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000A4000-000000067F000080000005E00C00000A8000__000000756884A510", +"000000067F000080000005E00C00000A8000-000000067F000080000005E00C00000AC000__000000756884A510", +"000000067F000080000005E00C00000A9D71-000000067F000080000005E00C00000B34D7__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000AC000-000000067F000080000005E00C00000B0000__000000756884A510", +"000000067F000080000005E00C00000AF576-000000067F000080000005E00C0200000023__0000007497B01FF9-000000751253A4C1", 
+"000000067F000080000005E00C00000B0000-000000067F000080000005E00C00000B4000__000000756884A510", +"000000067F000080000005E00C00000B34D7-000000067F000080000005E00C00000BCC0C__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000B4000-000000067F000080000005E00C00000B8000__000000756884A510", +"000000067F000080000005E00C00000B8000-000000067F000080000005E00C00000BC000__000000756884A510", +"000000067F000080000005E00C00000BC000-000000067F000080000005E00C00000C0000__000000756884A510", +"000000067F000080000005E00C00000BCC0C-000000067F000080000005E00C00000C6336__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000C0000-000000067F000080000005E00C00000C4000__000000756884A510", +"000000067F000080000005E00C00000C4000-000000067F000080000005E00C00000C8000__000000756884A510", +"000000067F000080000005E00C00000C6336-000000067F000080000005E00C00000CFA9C__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000C8000-000000067F000080000005E00C00000CC000__000000756884A510", +"000000067F000080000005E00C00000CC000-000000067F000080000005E00C00000D0000__000000756884A510", +"000000067F000080000005E00C00000CFA9C-000000067F000080000005E00C00000D91AB__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000D0000-000000067F000080000005E00C00000D4000__000000756884A510", +"000000067F000080000005E00C00000D4000-000000067F000080000005E00C00000D8000__000000756884A510", +"000000067F000080000005E00C00000D8000-000000067F000080000005E00C00000DC000__000000756884A510", +"000000067F000080000005E00C00000D91AB-000000067F000080000005E00C00000E2911__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000DC000-000000067F000080000005E00C00000E0000__000000756884A510", +"000000067F000080000005E00C00000E0000-000000067F000080000005E00C00000E4000__000000756884A510", +"000000067F000080000005E00C00000E2911-000000067F000080000005E00C00000EC077__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000E4000-000000067F000080000005E00C00000E8000__000000756884A510", +"000000067F000080000005E00C00000E8000-000000067F000080000005E00C00000EC000__000000756884A510", +"000000067F000080000005E00C00000EC000-000000067F000080000005E00C00000F0000__000000756884A510", +"000000067F000080000005E00C00000EC077-000000067F000080000005E00C00000F57A8__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000EF4F1-000000067F000080000005E014000000BDDE__000000751253A4C1-00000075687C3009", +"000000067F000080000005E00C00000F0000-000000067F000080000005E00C00000F4000__000000756884A510", +"000000067F000080000005E00C00000F4000-000000067F000080000005E00C00000F8000__000000756884A510", +"000000067F000080000005E00C00000F57A8-000000067F000080000005E00C00000FEF0A__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000F8000-000000067F000080000005E00C00000FC000__000000756884A510", +"000000067F000080000005E00C00000FC000-000000067F000080000005E00C0000100000__000000756884A510", +"000000067F000080000005E00C00000FEF0A-000000067F000080000005E00C000010862B__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000100000-000000067F000080000005E00C0000104000__000000756884A510", +"000000067F000080000005E00C0000104000-000000067F000080000005E00C0000108000__000000756884A510", +"000000067F000080000005E00C0000108000-000000067F000080000005E00C000010C000__000000756884A510", +"000000067F000080000005E00C000010862B-000000067F000080000005E00C0000111C20__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C000010C000-000000067F000080000005E00C0000110000__000000756884A510", 
+"000000067F000080000005E00C0000110000-000000067F000080000005E0120100000000__000000756884A510", +"000000067F000080000005E00C00FFFFFFFF-010000000000000001000000030000000002__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C02FFFFFFFF-000000067F000080000005E0140000006C41__0000007497B01FF9-000000751253A4C1", +"000000067F000080000005E0140000000000-000000067F000080000005E0140000004000__000000756884A510", +"000000067F000080000005E0140000004000-000000067F000080000005E0140000008000__000000756884A510", +"000000067F000080000005E0140000006C41-000000067F000080000005E014000000D890__0000007497B01FF9-000000751253A4C1", +"000000067F000080000005E0140000008000-000000067F000080000005E014000000C000__000000756884A510", +"000000067F000080000005E014000000BDDE-000000067F000080000005E0140000023A18__000000751253A4C1-00000075687C3009", +"000000067F000080000005E014000000C000-000000067F000080000005E0140000010000__000000756884A510", +"000000067F000080000005E014000000D890-000000067F000080000005E01400000144C8__0000007497B01FF9-000000751253A4C1", +"000000067F000080000005E0140000010000-000000067F000080000005E0140000014000__000000756884A510", +"000000067F000080000005E0140000014000-000000067F000080000005E0140000018000__000000756884A510", +"000000067F000080000005E01400000144C8-000000067F000080000005E014000001B1AC__0000007497B01FF9-000000751253A4C1", +"000000067F000080000005E0140000018000-000000067F000080000005E014000001C000__000000756884A510", +"000000067F000080000005E014000001B1AC-000000067F000080000005E0140000021E03__0000007497B01FF9-000000751253A4C1", +"000000067F000080000005E014000001C000-000000067F000080000005E0140000020000__000000756884A510", +"000000067F000080000005E0140000020000-000000067F000080000005E0140000024000__000000756884A510", +"000000067F000080000005E0140000021E03-000000067F000080000005E0140000028A36__0000007497B01FF9-000000751253A4C1", +"000000067F000080000005E0140000023A18-030000000000000000000000000000000002__000000751253A4C1-00000075687C3009", +"000000067F000080000005E0140000024000-000000067F000080000005E0140000028000__000000756884A510", +"000000067F000080000005E0140000028000-000000067F000080000005E014000002C000__000000756884A510", +"000000067F000080000005E0140000028A36-030000000000000000000000000000000002__0000007497B01FF9-000000751253A4C1", +"000000067F000080000005E014000002C000-030000000000000000000000000000000002__000000756884A510", +"000000067F000080000006000C0000000000-000000067F000080000006000C0000004000__00000077B1836CA0", +"000000067F000080000006000C0000004000-000000067F000080000006000C0000008000__00000077B1836CA0", +"000000067F000080000006000C0000008000-000000067F000080000006000C000000C000__00000077B1836CA0", +"000000067F000080000006000C0000008FB7-000000067F000080000006000C000001271D__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C000000C000-000000067F000080000006000C0000010000__00000077B1836CA0", +"000000067F000080000006000C0000010000-000000067F000080000006000C0000014000__00000077B1836CA0", +"000000067F000080000006000C000001271D-000000067F000080000006000C000001BE83__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C0000014000-000000067F000080000006000C0000018000__00000077B1836CA0", +"000000067F000080000006000C0000018000-000000067F000080000006000C000001C000__00000077B1836CA0", +"000000067F000080000006000C000001BE83-000000067F000080000006000C00000255B6__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C000001C000-000000067F000080000006000C0000020000__00000077B1836CA0", 
+"000000067F000080000006000C0000020000-000000067F000080000006000C0000024000__00000077B1836CA0", +"000000067F000080000006000C0000024000-000000067F000080000006000C0000028000__00000077B1836CA0", +"000000067F000080000006000C00000255B6-000000067F000080000006000C000002ED0B__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C0000028000-000000067F000080000006000C000002C000__00000077B1836CA0", +"000000067F000080000006000C000002C000-000000067F000080000006000C0000030000__00000077B1836CA0", +"000000067F000080000006000C000002ED0B-000000067F000080000006000C000003842B__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C0000030000-000000067F000080000006000C0000034000__00000077B1836CA0", +"000000067F000080000006000C0000034000-000000067F000080000006000C0000038000__00000077B1836CA0", +"000000067F000080000006000C0000038000-000000067F000080000006000C000003C000__00000077B1836CA0", +"000000067F000080000006000C000003842B-000000067F000080000006000C0000041B80__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C000003C000-000000067F000080000006000C0000040000__00000077B1836CA0", +"000000067F000080000006000C0000040000-000000067F000080000006000C0000044000__00000077B1836CA0", +"000000067F000080000006000C0000041B80-000000067F000080000006000C000004B2E6__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C0000044000-000000067F000080000006000C0000048000__00000077B1836CA0", +"000000067F000080000006000C0000048000-000000067F000080000006000C000004C000__0000007739203FF0", +"000000067F000080000006000C000004B2E6-030000000000000000000000000000000002__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C000004BAC2-000000067F000080000006000C00000551F7__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C000004C000-000000067F000080000006000C0000050000__0000007739203FF0", +"000000067F000080000006000C0000050000-000000067F000080000006000C0000054000__0000007739203FF0", +"000000067F000080000006000C0000051A05-000000067F000080000006000C00000A4D93__00000077B2AD0F91-0000007805801C41", +"000000067F000080000006000C0000054000-000000067F000080000006000C0000058000__0000007739203FF0", +"000000067F000080000006000C00000551F7-000000067F000080000006000C000005E90B__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C0000058000-000000067F000080000006000C000005C000__0000007739203FF0", +"000000067F000080000006000C000005C000-000000067F000080000006000C0000060000__0000007739203FF0", +"000000067F000080000006000C000005E90B-000000067F000080000006000C000006802B__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C0000060000-000000067F000080000006000C0000064000__0000007739203FF0", +"000000067F000080000006000C0000064000-000000067F000080000006000C0000068000__0000007739203FF0", +"000000067F000080000006000C0000068000-000000067F000080000006000C000006C000__0000007739203FF0", +"000000067F000080000006000C000006802B-000000067F000080000006000C0000071782__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C000006C000-000000067F000080000006000C0000070000__0000007739203FF0", +"000000067F000080000006000C0000070000-000000067F000080000006000C0000074000__0000007739203FF0", +"000000067F000080000006000C0000071782-000000067F000080000006000C000007AEE8__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C0000074000-000000067F000080000006000C0000078000__0000007739203FF0", +"000000067F000080000006000C0000078000-000000067F000080000006000C000007C000__0000007739203FF0", 
+"000000067F000080000006000C000007AEE8-000000067F000080000006000C000008460B__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C000007C000-000000067F000080000006000C0000080000__0000007739203FF0", +"000000067F000080000006000C0000080000-000000067F000080000006000C0000084000__0000007739203FF0", +"000000067F000080000006000C0000084000-000000067F000080000006000C0000088000__0000007739203FF0", +"000000067F000080000006000C000008460B-000000067F000080000006000C000008DD71__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C0000088000-000000067F000080000006000C000008C000__0000007739203FF0", +"000000067F000080000006000C000008C000-000000067F000080000006000C0000090000__0000007739203FF0", +"000000067F000080000006000C000008DD71-000000067F000080000006000C00000974D7__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C0000090000-000000067F000080000006000C0000094000__0000007739203FF0", +"000000067F000080000006000C0000094000-000000067F000080000006000C0000098000__0000007739203FF0", +"000000067F000080000006000C00000974D7-000000067F000080000006000C00000A0C0B__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C0000098000-000000067F000080000006000C000009C000__0000007739203FF0", +"000000067F000080000006000C000009C000-000000067F000080000006000C00000A0000__0000007739203FF0", +"000000067F000080000006000C00000A0000-000000067F000080000006000C00000A4000__0000007739203FF0", +"000000067F000080000006000C00000A0C0B-000000067F000080000006000C00000AA371__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C00000A4000-000000067F000080000006000C00000A8000__0000007739203FF0", +"000000067F000080000006000C00000A4D95-000000067F000080000006000C00000F7C7B__00000077B2AD0F91-0000007805801C41", +"000000067F000080000006000C00000A8000-000000067F000080000006000C00000AC000__0000007739203FF0", +"000000067F000080000006000C00000AA371-000000067F000080000006000C00000B3AD7__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C00000AC000-000000067F000080000006000C00000B0000__0000007739203FF0", +"000000067F000080000006000C00000B0000-000000067F000080000006000C00000B4000__0000007739203FF0", +"000000067F000080000006000C00000B3AD7-000000067F000080000006000C00000BD20B__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C00000B4000-000000067F000080000006000C00000B8000__0000007739203FF0", +"000000067F000080000006000C00000B8000-000000067F000080000006000C00000BC000__0000007739203FF0", +"000000067F000080000006000C00000BC000-000000067F000080000006000C00000C0000__0000007739203FF0", +"000000067F000080000006000C00000BD20B-000000067F000080000006000C0100000000__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C00000C0000-000000067F000080000006000C00000C4000__0000007739203FF0", +"000000067F000080000006000C00000C3C38-000000067F00008000000600140000001B38__00000077391A8001-00000077B2AD0F91", +"000000067F000080000006000C00000C4000-000000067F000080000006000C00000C8000__0000007739203FF0", +"000000067F000080000006000C00000C56C1-000000067F000080000006000C00000CEE0A__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C00000C8000-000000067F000080000006000C00000CC000__0000007739203FF0", +"000000067F000080000006000C00000CC000-000000067F000080000006000C00000D0000__0000007739203FF0", +"000000067F000080000006000C00000CEE0A-000000067F000080000006000C00000D8520__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C00000D0000-000000067F000080000006000C00000D4000__0000007739203FF0", 
+"000000067F000080000006000C00000D4000-000000067F000080000006000C00000D8000__0000007739203FF0", +"000000067F000080000006000C00000D8000-000000067F000080000006000C00000DC000__0000007739203FF0", +"000000067F000080000006000C00000D8520-000000067F000080000006000C00000E1C86__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C00000DC000-000000067F000080000006000C00000E0000__0000007739203FF0", +"000000067F000080000006000C00000E0000-000000067F000080000006000C00000E4000__0000007739203FF0", +"000000067F000080000006000C00000E1C86-000000067F000080000006000C00000EB3EC__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C00000E4000-000000067F000080000006000C00000E8000__0000007739203FF0", +"000000067F000080000006000C00000E8000-000000067F000080000006000C00000EC000__0000007739203FF0", +"000000067F000080000006000C00000EB3EC-000000067F000080000006000C00000F4B0C__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C00000EC000-000000067F000080000006000C00000F0000__0000007739203FF0", +"000000067F000080000006000C00000F0000-000000067F000080000006000C00000F4000__0000007739203FF0", +"000000067F000080000006000C00000F4000-000000067F000080000006000C00000F8000__0000007739203FF0", +"000000067F000080000006000C00000F4B0C-000000067F000080000006000C00000FE272__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C00000F7C96-000000067F0000800000060014000000F3A9__00000077B2AD0F91-0000007805801C41", +"000000067F000080000006000C00000F8000-000000067F000080000006000C00000FC000__0000007739203FF0", +"000000067F000080000006000C00000FC000-000000067F000080000006000C0000100000__0000007739203FF0", +"000000067F000080000006000C00000FE272-000000067F000080000006000C000010798F__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C0000100000-000000067F000080000006000C0000104000__0000007739203FF0", +"000000067F000080000006000C0000104000-000000067F000080000006000C0000108000__0000007739203FF0", +"000000067F000080000006000C000010798F-000000067F000080000006000C00001110F5__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C0000108000-000000067F000080000006000C000010C000__0000007739203FF0", +"000000067F000080000006000C000010C000-000000067F000080000006000C0000110000__0000007739203FF0", +"000000067F000080000006000C0000110000-030000000000000000000000000000000002__0000007739203FF0", +"000000067F000080000006000C00001110F5-010000000000000001000000030000000006__00000076A8CDE8F9-00000077391A8001", +"000000067F00008000000600140000001B38-000000067F00008000000600140000008758__00000077391A8001-00000077B2AD0F91", +"000000067F00008000000600140000008758-000000067F0000800000060014000000F32F__00000077391A8001-00000077B2AD0F91", +"000000067F0000800000060014000000F32F-000000067F00008000000600140000015EDC__00000077391A8001-00000077B2AD0F91", +"000000067F0000800000060014000000F3A9-000000067F00008000000600140000028656__00000077B2AD0F91-0000007805801C41", +"000000067F00008000000600140000015EDC-000000067F0000800000060014000001CB12__00000077391A8001-00000077B2AD0F91", +"000000067F0000800000060014000001CB12-000000067F000080000006001400000236BC__00000077391A8001-00000077B2AD0F91", +"000000067F000080000006001400000236BC-000000067F0000800000060014000002A294__00000077391A8001-00000077B2AD0F91", +"000000067F00008000000600140000028657-030000000000000000000000000000000002__00000077B2AD0F91-0000007805801C41", +"000000067F0000800000060014000002A294-030000000000000000000000000000000002__00000077391A8001-00000077B2AD0F91", +"000000067F000080000006200C0000000000-000000067F000080000006200C0000004000__00000078B2CB1C68", 
+"000000067F000080000006200C0000004000-000000067F000080000006200C0000008000__00000078B2CB1C68", +"000000067F000080000006200C0000008000-000000067F000080000006200C000000C000__00000078B2CB1C68", +"000000067F000080000006200C0000009441-000000067F000080000006200C0000012B8D__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C000000C000-000000067F000080000006200C0000010000__00000078B2CB1C68", +"000000067F000080000006200C0000010000-000000067F000080000006200C0000014000__00000078B2CB1C68", +"000000067F000080000006200C0000012B8D-000000067F000080000006200C000001C2F3__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C0000014000-000000067F000080000006200C0000018000__00000078B2CB1C68", +"000000067F000080000006200C0000018000-000000067F000080000006200C000001C000__00000078B2CB1C68", +"000000067F000080000006200C000001C000-000000067F000080000006200C0000020000__00000078B2CB1C68", +"000000067F000080000006200C000001C2F3-000000067F000080000006200C0000025A0C__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C0000020000-000000067F000080000006200C0000024000__00000078B2CB1C68", +"000000067F000080000006200C0000024000-000000067F000080000006200C0000028000__00000078B2CB1C68", +"000000067F000080000006200C0000025A0C-000000067F000080000006200C000002F172__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C0000028000-000000067F000080000006200C000002C000__00000078B2CB1C68", +"000000067F000080000006200C000002C000-000000067F000080000006200C0000030000__00000078B2CB1C68", +"000000067F000080000006200C000002F172-000000067F000080000006200C00000388D8__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C0000030000-000000067F000080000006200C0000034000__00000078B2CB1C68", +"000000067F000080000006200C0000034000-000000067F000080000006200C0000038000__00000078B2CB1C68", +"000000067F000080000006200C0000038000-000000067F000080000006200C000003C000__00000078B2CB1C68", +"000000067F000080000006200C00000388D8-000000067F000080000006200C0000042009__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C000003C000-000000067F000080000006200C0000040000__00000078B2CB1C68", +"000000067F000080000006200C0000040000-000000067F000080000006200C0000044000__00000078B2CB1C68", +"000000067F000080000006200C0000042009-000000067F000080000006200C000004B76F__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C0000044000-000000067F000080000006200C0000048000__00000078B2CB1C68", +"000000067F000080000006200C0000048000-000000067F000080000006200C000004C000__00000078B2CB1C68", +"000000067F000080000006200C0000048000-000000067F000080000006200C000004C000__0000007AA0A6FB48", +"000000067F000080000006200C0000048121-000000067F000080000006200C0000090C08__0000007A3F679FA1-0000007AA1DF6639", +"000000067F000080000006200C000004B76F-030000000000000000000000000000000002__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C000004BAC9-000000067F000080000006200C00000551FE__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C000004C000-000000067F000080000006200C0000050000__00000078B2CB1C68", +"000000067F000080000006200C000004C000-000000067F000080000006200C0000050000__0000007AA0A6FB48", +"000000067F000080000006200C0000050000-000000067F000080000006200C0000054000__00000078B2CB1C68", +"000000067F000080000006200C0000050000-000000067F000080000006200C0000054000__0000007AA0A6FB48", +"000000067F000080000006200C0000054000-000000067F000080000006200C0000058000__00000078B2CB1C68", +"000000067F000080000006200C0000054000-000000067F000080000006200C0000058000__0000007AA0A6FB48", 
+"000000067F000080000006200C00000551FE-000000067F000080000006200C000005E90C__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C0000058000-000000067F000080000006200C000005C000__00000078B2CB1C68", +"000000067F000080000006200C0000058000-000000067F000080000006200C000005C000__0000007AA0A6FB48", +"000000067F000080000006200C000005C000-000000067F000080000006200C0000060000__00000078B2CB1C68", +"000000067F000080000006200C000005C000-000000067F000080000006200C0000060000__0000007AA0A6FB48", +"000000067F000080000006200C000005E90C-000000067F000080000006200C000006802C__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C0000060000-000000067F000080000006200C0000064000__00000078B2CB1C68", +"000000067F000080000006200C0000060000-000000067F000080000006200C0000064000__0000007AA0A6FB48", +"000000067F000080000006200C0000064000-000000067F000080000006200C0000068000__0000007AA0A6FB48", +"000000067F000080000006200C0000064000-030000000000000000000000000000000002__00000078B2CB1C68", +"000000067F000080000006200C0000068000-000000067F000080000006200C000006C000__0000007AA0A6FB48", +"000000067F000080000006200C000006802C-000000067F000080000006200C0000071783__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C000006C000-000000067F000080000006200C0000070000__0000007AA0A6FB48", +"000000067F000080000006200C0000070000-000000067F000080000006200C0000074000__0000007AA0A6FB48", +"000000067F000080000006200C0000071783-000000067F000080000006200C000007AEE9__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C0000074000-000000067F000080000006200C0000078000__0000007AA0A6FB48", +"000000067F000080000006200C0000078000-000000067F000080000006200C000007C000__0000007AA0A6FB48", +"000000067F000080000006200C000007AEE9-000000067F000080000006200C000008460B__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C000007C000-000000067F000080000006200C0000080000__0000007AA0A6FB48", +"000000067F000080000006200C0000080000-000000067F000080000006200C0000084000__0000007AA0A6FB48", +"000000067F000080000006200C0000084000-000000067F000080000006200C0000088000__0000007AA0A6FB48", +"000000067F000080000006200C000008460B-000000067F000080000006200C000008DD71__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C0000088000-000000067F000080000006200C000008C000__0000007AA0A6FB48", +"000000067F000080000006200C000008C000-000000067F000080000006200C0000090000__0000007AA0A6FB48", +"000000067F000080000006200C000008DD71-000000067F000080000006200C00000974D7__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C0000090000-000000067F000080000006200C0000094000__0000007AA0A6FB48", +"000000067F000080000006200C0000090C11-000000067F000080000006200C00000DA35B__0000007A3F679FA1-0000007AA1DF6639", +"000000067F000080000006200C0000094000-000000067F000080000006200C0000098000__0000007AA0A6FB48", +"000000067F000080000006200C00000974D7-000000067F000080000006200C00000A0C0B__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C0000098000-000000067F000080000006200C000009C000__0000007AA0A6FB48", +"000000067F000080000006200C000009C000-000000067F000080000006200C00000A0000__0000007AA0A6FB48", +"000000067F000080000006200C00000A0000-000000067F000080000006200C00000A4000__0000007AA0A6FB48", +"000000067F000080000006200C00000A0C0B-000000067F000080000006200C00000AA371__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000A4000-000000067F000080000006200C00000A8000__0000007AA0A6FB48", +"000000067F000080000006200C00000A8000-000000067F000080000006200C00000AC000__0000007AA0A6FB48", 
+"000000067F000080000006200C00000AA371-000000067F000080000006200C00000B3AD7__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000AC000-000000067F000080000006200C00000B0000__0000007AA0A6FB48", +"000000067F000080000006200C00000B0000-000000067F000080000006200C00000B4000__0000007AA0A6FB48", +"000000067F000080000006200C00000B3AD7-000000067F000080000006200C00000BD20B__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000B4000-000000067F000080000006200C00000B8000__0000007AA0A6FB48", +"000000067F000080000006200C00000B8000-000000067F000080000006200C00000BC000__0000007AA0A6FB48", +"000000067F000080000006200C00000BC000-000000067F000080000006200C00000C0000__0000007AA0A6FB48", +"000000067F000080000006200C00000BD20B-000000067F000080000006200C00000C6932__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000C0000-000000067F000080000006200C00000C4000__0000007AA0A6FB48", +"000000067F000080000006200C00000C4000-000000067F000080000006200C00000C8000__0000007AA0A6FB48", +"000000067F000080000006200C00000C6932-000000067F000080000006200C00000D0098__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000C8000-000000067F000080000006200C00000CC000__0000007AA0A6FB48", +"000000067F000080000006200C00000CC000-000000067F000080000006200C00000D0000__0000007AA0A6FB48", +"000000067F000080000006200C00000D0000-000000067F000080000006200C00000D4000__0000007AA0A6FB48", +"000000067F000080000006200C00000D0098-000000067F000080000006200C00000D97FE__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000D4000-000000067F000080000006200C00000D8000__0000007AA0A6FB48", +"000000067F000080000006200C00000D8000-000000067F000080000006200C00000DC000__0000007AA0A6FB48", +"000000067F000080000006200C00000D97FE-000000067F000080000006200C00000E2F0B__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000DA36C-000000067F00008000000620140000002D07__0000007A3F679FA1-0000007AA1DF6639", +"000000067F000080000006200C00000DC000-000000067F000080000006200C00000E0000__0000007AA0A6FB48", +"000000067F000080000006200C00000E0000-000000067F000080000006200C00000E4000__0000007AA0A6FB48", +"000000067F000080000006200C00000E2F0B-000000067F000080000006200C00000EC671__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000E4000-000000067F000080000006200C00000E8000__0000007AA0A6FB48", +"000000067F000080000006200C00000E8000-000000067F000080000006200C00000EC000__0000007AA0A6FB48", +"000000067F000080000006200C00000EC000-000000067F000080000006200C00000F0000__0000007AA0A6FB48", +"000000067F000080000006200C00000EC671-000000067F000080000006200C00000F5D9F__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000F0000-000000067F000080000006200C00000F4000__0000007AA0A6FB48", +"000000067F000080000006200C00000F4000-000000067F000080000006200C00000F8000__0000007AA0A6FB48", +"000000067F000080000006200C00000F5D9F-000000067F000080000006200C00000FF505__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000F8000-000000067F000080000006200C00000FC000__0000007AA0A6FB48", +"000000067F000080000006200C00000FC000-000000067F000080000006200C0000100000__0000007AA0A6FB48", +"000000067F000080000006200C00000FF505-000000067F000080000006200C0000108C10__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C0000100000-000000067F000080000006200C0000104000__0000007AA0A6FB48", +"000000067F000080000006200C0000104000-000000067F000080000006200C0000108000__0000007AA0A6FB48", 
+"000000067F000080000006200C0000107883-000000067F000080000006200C01000000AF__00000079C527F0D9-0000007A3F679FA1", +"000000067F000080000006200C0000108000-000000067F000080000006200C000010C000__0000007AA0A6FB48", +"000000067F000080000006200C0000108C10-000000067F000080000006200C0100000000__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C000010C000-000000067F000080000006200C0000110000__0000007AA0A6FB48", +"000000067F000080000006200C0000110000-000000067F00008000000620120100000000__0000007AA0A6FB48", +"000000067F000080000006200C01000000AF-000000067F00008000000620140000004888__00000079C527F0D9-0000007A3F679FA1", +"000000067F00008000000620140000002D0A-000000067F00008000000620140000016355__0000007A3F679FA1-0000007AA1DF6639", +"000000067F00008000000620140000004888-000000067F0000800000062014000000BC11__00000079C527F0D9-0000007A3F679FA1", +"000000067F0000800000062014000000BC11-000000067F00008000000620140000012FA7__00000079C527F0D9-0000007A3F679FA1", +"000000067F00008000000620140000012FA7-000000067F0000800000062014000001A33D__00000079C527F0D9-0000007A3F679FA1", +"000000067F00008000000620140000016357-000000067F00008000000620140000029C35__0000007A3F679FA1-0000007AA1DF6639", +"000000067F0000800000062014000001A33D-000000067F000080000006201400000216B4__00000079C527F0D9-0000007A3F679FA1", +"000000067F000080000006201400000216B4-000000067F00008000000620140000028A65__00000079C527F0D9-0000007A3F679FA1", +"000000067F00008000000620140000028A65-030000000000000000000000000000000002__00000079C527F0D9-0000007A3F679FA1", +"000000067F00008000000620140000029C38-030000000000000000000000000000000002__0000007A3F679FA1-0000007AA1DF6639", +"000000067F000080000006400C0000000000-000000067F000080000006400C0000004000__0000007B9877EF40", +"000000067F000080000006400C0000000000-000000067F000080000006400C0000004000__0000007D41715570", +"000000067F000080000006400C0000004000-000000067F000080000006400C0000008000__0000007B9877EF40", +"000000067F000080000006400C0000004000-000000067F000080000006400C0000008000__0000007D41715570", +"000000067F000080000006400C0000007987-000000067F000080000006400C00000110ED__0000007AA1DF6639-0000007B14D5C521", +"000000067F000080000006400C0000008000-000000067F000080000006400C000000C000__0000007B9877EF40", +"000000067F000080000006400C0000008000-000000067F000080000006400C000000C000__0000007D41715570", +"000000067F000080000006400C000000C000-000000067F000080000006400C0000010000__0000007B9877EF40", +"000000067F000080000006400C000000C000-000000067F000080000006400C0000010000__0000007D41715570", +"000000067F000080000006400C0000010000-000000067F000080000006400C0000014000__0000007B9877EF40", +"000000067F000080000006400C0000010000-000000067F000080000006400C0000014000__0000007D41715570", +"000000067F000080000006400C00000110ED-000000067F000080000006400C000001A80A__0000007AA1DF6639-0000007B14D5C521", +"000000067F000080000006400C0000014000-000000067F000080000006400C0000018000__0000007B9877EF40", +"000000067F000080000006400C0000014000-000000067F000080000006400C0000018000__0000007D41715570", +"000000067F000080000006400C0000018000-000000067F000080000006400C000001C000__0000007B9877EF40", +"000000067F000080000006400C0000018000-000000067F000080000006400C000001C000__0000007D41715570", +"000000067F000080000006400C000001A80A-000000067F000080000006400C0000023F4A__0000007AA1DF6639-0000007B14D5C521", +"000000067F000080000006400C000001C000-000000067F000080000006400C0000020000__0000007B9877EF40", +"000000067F000080000006400C000001C000-000000067F000080000006400C0000020000__0000007D41715570", 
+"000000067F000080000006400C0000020000-000000067F000080000006400C0000024000__0000007B9877EF40", +"000000067F000080000006400C0000020000-000000067F000080000006400C0000024000__0000007D41715570", +"000000067F000080000006400C0000023F4A-000000067F000080000006400C000002D6B0__0000007AA1DF6639-0000007B14D5C521", +"000000067F000080000006400C0000024000-000000067F000080000006400C0000028000__0000007B9877EF40", +"000000067F000080000006400C0000024000-000000067F000080000006400C0000028000__0000007D41715570", +"000000067F000080000006400C0000028000-000000067F000080000006400C000002C000__0000007B9877EF40", +"000000067F000080000006400C0000028000-000000067F000080000006400C000002C000__0000007D41715570", +"000000067F000080000006400C000002C000-000000067F000080000006400C0000030000__0000007B9877EF40", +"000000067F000080000006400C000002C000-000000067F000080000006400C0000030000__0000007D41715570", +"000000067F000080000006400C000002D6B0-000000067F000080000006400C0000036DD4__0000007AA1DF6639-0000007B14D5C521", +"000000067F000080000006400C0000030000-000000067F000080000006400C0000034000__0000007B9877EF40", +"000000067F000080000006400C0000030000-000000067F000080000006400C0000034000__0000007D41715570", +"000000067F000080000006400C0000034000-000000067F000080000006400C0000038000__0000007B9877EF40", +"000000067F000080000006400C0000034000-000000067F000080000006400C0000038000__0000007D41715570", +"000000067F000080000006400C0000036DD4-000000067F000080000006400C000004050A__0000007AA1DF6639-0000007B14D5C521", +"000000067F000080000006400C0000038000-000000067F000080000006400C000003C000__0000007B9877EF40", +"000000067F000080000006400C0000038000-000000067F000080000006400C000003C000__0000007D41715570", +"000000067F000080000006400C000003C000-000000067F000080000006400C0000040000__0000007B9877EF40", +"000000067F000080000006400C000003C000-000000067F000080000006400C0000040000__0000007D41715570", +"000000067F000080000006400C0000040000-000000067F000080000006400C0000044000__0000007B9877EF40", +"000000067F000080000006400C0000040000-000000067F000080000006400C0000044000__0000007D41715570", +"000000067F000080000006400C000004050A-030000000000000000000000000000000002__0000007AA1DF6639-0000007B14D5C521", +"000000067F000080000006400C0000044000-000000067F000080000006400C0000048000__0000007B9877EF40", +"000000067F000080000006400C0000044000-000000067F000080000006400C0000048000__0000007D41715570", +"000000067F000080000006400C0000048000-000000067F000080000006400C000004C000__0000007B9877EF40", +"000000067F000080000006400C0000048000-000000067F000080000006400C000004C000__0000007D41715570", +"000000067F000080000006400C000004B4C9-000000067F000080000006400C0000054C01__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C000004C000-000000067F000080000006400C0000050000__0000007B9877EF40", +"000000067F000080000006400C000004C000-000000067F000080000006400C0000050000__0000007D41715570", +"000000067F000080000006400C0000050000-000000067F000080000006400C0000054000__0000007B9877EF40", +"000000067F000080000006400C0000050000-000000067F000080000006400C0000054000__0000007D41715570", +"000000067F000080000006400C00000525C4-000000067F000080000006400C00000A47A7__0000007CEE5A0B91-0000007D41EA8D51", +"000000067F000080000006400C0000054000-000000067F000080000006400C0000058000__0000007B9877EF40", +"000000067F000080000006400C0000054000-000000067F000080000006400C0000058000__0000007D41715570", +"000000067F000080000006400C0000054C01-000000067F000080000006400C000005E30C__0000007B14D5C521-0000007C73B53FC9", 
+"000000067F000080000006400C0000058000-000000067F000080000006400C000005C000__0000007B9877EF40", +"000000067F000080000006400C0000058000-000000067F000080000006400C000005C000__0000007D41715570", +"000000067F000080000006400C000005C000-000000067F000080000006400C0000060000__0000007B9877EF40", +"000000067F000080000006400C000005C000-000000067F000080000006400C0000060000__0000007D41715570", +"000000067F000080000006400C000005E30C-000000067F000080000006400C0000067A2C__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C0000060000-000000067F000080000006400C0000064000__0000007B9877EF40", +"000000067F000080000006400C0000060000-000000067F000080000006400C0000064000__0000007D41715570", +"000000067F000080000006400C0000064000-000000067F000080000006400C0000068000__0000007B9877EF40", +"000000067F000080000006400C0000064000-000000067F000080000006400C0000068000__0000007D41715570", +"000000067F000080000006400C0000067A2C-000000067F000080000006400C0000071187__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C0000068000-000000067F000080000006400C000006C000__0000007B9877EF40", +"000000067F000080000006400C0000068000-000000067F000080000006400C000006C000__0000007D41715570", +"000000067F000080000006400C000006C000-000000067F000080000006400C0000070000__0000007B9877EF40", +"000000067F000080000006400C000006C000-000000067F000080000006400C0000070000__0000007D41715570", +"000000067F000080000006400C0000070000-000000067F000080000006400C0000074000__0000007B9877EF40", +"000000067F000080000006400C0000070000-000000067F000080000006400C0000074000__0000007D41715570", +"000000067F000080000006400C0000071187-000000067F000080000006400C000007A8ED__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C0000074000-000000067F000080000006400C0000078000__0000007B9877EF40", +"000000067F000080000006400C0000074000-000000067F000080000006400C0000078000__0000007D41715570", +"000000067F000080000006400C0000078000-000000067F000080000006400C000007C000__0000007B9877EF40", +"000000067F000080000006400C0000078000-000000067F000080000006400C000007C000__0000007D41715570", +"000000067F000080000006400C000007A8ED-000000067F000080000006400C000008400B__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C000007C000-000000067F000080000006400C0000080000__0000007B9877EF40", +"000000067F000080000006400C000007C000-000000067F000080000006400C0000080000__0000007D41715570", +"000000067F000080000006400C0000080000-000000067F000080000006400C0000084000__0000007B9877EF40", +"000000067F000080000006400C0000080000-000000067F000080000006400C0000084000__0000007D41715570", +"000000067F000080000006400C0000084000-000000067F000080000006400C0000088000__0000007B9877EF40", +"000000067F000080000006400C0000084000-000000067F000080000006400C0000088000__0000007D41715570", +"000000067F000080000006400C000008400B-000000067F000080000006400C000008D771__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C0000088000-000000067F000080000006400C000008C000__0000007B9877EF40", +"000000067F000080000006400C0000088000-000000067F000080000006400C000008C000__0000007D41715570", +"000000067F000080000006400C000008C000-000000067F000080000006400C0000090000__0000007B9877EF40", +"000000067F000080000006400C000008C000-000000067F000080000006400C0000090000__0000007D41715570", +"000000067F000080000006400C000008D771-000000067F000080000006400C0000096ED7__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C0000090000-000000067F000080000006400C0000094000__0000007D41715570", 
+"000000067F000080000006400C0000090000-030000000000000000000000000000000002__0000007B9877EF40", +"000000067F000080000006400C0000094000-000000067F000080000006400C0000098000__0000007D41715570", +"000000067F000080000006400C0000096ED7-000000067F000080000006400C00000A060B__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C0000098000-000000067F000080000006400C000009C000__0000007D41715570", +"000000067F000080000006400C000009C000-000000067F000080000006400C00000A0000__0000007D41715570", +"000000067F000080000006400C00000A0000-000000067F000080000006400C00000A4000__0000007D41715570", +"000000067F000080000006400C00000A060B-000000067F000080000006400C00000A9D71__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000A4000-000000067F000080000006400C00000A8000__0000007D41715570", +"000000067F000080000006400C00000A47B1-000000067F000080000006400C00000F593E__0000007CEE5A0B91-0000007D41EA8D51", +"000000067F000080000006400C00000A8000-000000067F000080000006400C00000AC000__0000007D41715570", +"000000067F000080000006400C00000A887C-000000067F000080000006400C020000001F__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F000080000006400C00000A9D71-000000067F000080000006400C00000B34D7__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000AC000-000000067F000080000006400C00000B0000__0000007D41715570", +"000000067F000080000006400C00000B0000-000000067F000080000006400C00000B4000__0000007D41715570", +"000000067F000080000006400C00000B34D7-000000067F000080000006400C00000BCC0C__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000B4000-000000067F000080000006400C00000B8000__0000007D41715570", +"000000067F000080000006400C00000B8000-000000067F000080000006400C00000BC000__0000007D41715570", +"000000067F000080000006400C00000BC000-000000067F000080000006400C00000C0000__0000007D41715570", +"000000067F000080000006400C00000BCC0C-000000067F000080000006400C00000C6336__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000C0000-000000067F000080000006400C00000C4000__0000007D41715570", +"000000067F000080000006400C00000C4000-000000067F000080000006400C00000C8000__0000007D41715570", +"000000067F000080000006400C00000C6336-000000067F000080000006400C00000CFA9C__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000C8000-000000067F000080000006400C00000CC000__0000007D41715570", +"000000067F000080000006400C00000CC000-000000067F000080000006400C00000D0000__0000007D41715570", +"000000067F000080000006400C00000CFA9C-000000067F000080000006400C00000D91AB__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000D0000-000000067F000080000006400C00000D4000__0000007D41715570", +"000000067F000080000006400C00000D4000-000000067F000080000006400C00000D8000__0000007D41715570", +"000000067F000080000006400C00000D8000-000000067F000080000006400C00000DC000__0000007D41715570", +"000000067F000080000006400C00000D91AB-000000067F000080000006400C00000E2911__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000DC000-000000067F000080000006400C00000E0000__0000007D41715570", +"000000067F000080000006400C00000E0000-000000067F000080000006400C00000E4000__0000007D41715570", +"000000067F000080000006400C00000E2911-000000067F000080000006400C00000EC077__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000E4000-000000067F000080000006400C00000E8000__0000007D41715570", +"000000067F000080000006400C00000E8000-000000067F000080000006400C00000EC000__0000007D41715570", +"000000067F000080000006400C00000EC000-000000067F000080000006400C00000F0000__0000007D41715570", 
+"000000067F000080000006400C00000EC077-000000067F000080000006400C00000F57A8__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000F0000-000000067F000080000006400C00000F4000__0000007D41715570", +"000000067F000080000006400C00000F4000-000000067F000080000006400C00000F8000__0000007D41715570", +"000000067F000080000006400C00000F57A8-000000067F000080000006400C00000FEF0A__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000F5940-000000067F0000800000064014000000E7FF__0000007CEE5A0B91-0000007D41EA8D51", +"000000067F000080000006400C00000F8000-000000067F000080000006400C00000FC000__0000007D41715570", +"000000067F000080000006400C00000FC000-000000067F000080000006400C0000100000__0000007D41715570", +"000000067F000080000006400C00000FEF0A-000000067F000080000006400C000010862B__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C0000100000-000000067F000080000006400C0000104000__0000007D41715570", +"000000067F000080000006400C0000104000-000000067F000080000006400C0000108000__0000007D41715570", +"000000067F000080000006400C0000108000-000000067F000080000006400C000010C000__0000007D41715570", +"000000067F000080000006400C000010862B-000000067F000080000006400C0000111C20__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C000010C000-000000067F000080000006400C0000110000__0000007D41715570", +"000000067F000080000006400C0000110000-000000067F00008000000640120100000000__0000007D41715570", +"000000067F000080000006400C00FFFFFFFF-01000000000000000100000003000000000D__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C020000001F-000000067F0000800000064014000000691F__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F00008000000640140000000000-000000067F00008000000640140000004000__0000007D41715570", +"000000067F00008000000640140000004000-000000067F00008000000640140000008000__0000007D41715570", +"000000067F0000800000064014000000691F-000000067F0000800000064014000000D68F__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F00008000000640140000008000-000000067F0000800000064014000000C000__0000007D41715570", +"000000067F0000800000064014000000C000-000000067F00008000000640140000010000__0000007D41715570", +"000000067F0000800000064014000000D68F-000000067F00008000000640140000014406__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F0000800000064014000000E803-000000067F000080000006401400000274BB__0000007CEE5A0B91-0000007D41EA8D51", +"000000067F00008000000640140000010000-000000067F00008000000640140000014000__0000007D41715570", +"000000067F00008000000640140000014000-000000067F00008000000640140000018000__0000007D41715570", +"000000067F00008000000640140000014406-000000067F0000800000064014000001B192__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F00008000000640140000018000-000000067F0000800000064014000001C000__0000007D41715570", +"000000067F0000800000064014000001B192-000000067F00008000000640140000021F03__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F0000800000064014000001C000-000000067F00008000000640140000020000__0000007D41715570", +"000000067F00008000000640140000020000-000000067F00008000000640140000024000__0000007D41715570", +"000000067F00008000000640140000021F03-000000067F00008000000640140000028C6A__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F00008000000640140000024000-000000067F00008000000640140000028000__0000007D41715570", +"000000067F000080000006401400000274BF-030000000000000000000000000000000002__0000007CEE5A0B91-0000007D41EA8D51", +"000000067F00008000000640140000028000-000000067F0000800000064014000002C000__0000007D41715570", 
+"000000067F00008000000640140000028C6A-030000000000000000000000000000000002__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F0000800000064014000002C000-030000000000000000000000000000000002__0000007D41715570", +"000000067F000080000006600C0000000000-000000067F000080000006600C0000004000__0000007F12B83FE8", +"000000067F000080000006600C0000004000-000000067F000080000006600C0000008000__0000007F12B83FE8", +"000000067F000080000006600C0000008000-000000067F000080000006600C000000C000__0000007F12B83FE8", +"000000067F000080000006600C0000009381-000000067F000080000006600C0000012AE7__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C000000C000-000000067F000080000006600C0000010000__0000007F12B83FE8", +"000000067F000080000006600C0000010000-000000067F000080000006600C0000014000__0000007F12B83FE8", +"000000067F000080000006600C0000012AE7-000000067F000080000006600C000001C20B__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C0000014000-000000067F000080000006600C0000018000__0000007F12B83FE8", +"000000067F000080000006600C0000018000-000000067F000080000006600C000001C000__0000007F12B83FE8", +"000000067F000080000006600C000001C000-000000067F000080000006600C0000020000__0000007F12B83FE8", +"000000067F000080000006600C000001C20B-000000067F000080000006600C000002593B__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C0000020000-000000067F000080000006600C0000024000__0000007F12B83FE8", +"000000067F000080000006600C0000024000-000000067F000080000006600C0000028000__0000007F12B83FE8", +"000000067F000080000006600C000002593B-000000067F000080000006600C000002F0A1__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C0000028000-000000067F000080000006600C000002C000__0000007F12B83FE8", +"000000067F000080000006600C000002C000-000000067F000080000006600C0000030000__0000007F12B83FE8", +"000000067F000080000006600C000002F0A1-000000067F000080000006600C00000387B6__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C0000030000-000000067F000080000006600C0000034000__0000007F12B83FE8", +"000000067F000080000006600C0000034000-000000067F000080000006600C0000038000__0000007F12B83FE8", +"000000067F000080000006600C0000038000-000000067F000080000006600C000003C000__0000007F12B83FE8", +"000000067F000080000006600C00000387B6-000000067F000080000006600C0000041F1C__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C000003C000-000000067F000080000006600C0000040000__0000007F12B83FE8", +"000000067F000080000006600C0000040000-000000067F000080000006600C0000044000__0000007F12B83FE8", +"000000067F000080000006600C0000041F1C-000000067F000080000006600C000004B682__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C0000044000-000000067F000080000006600C0000048000__0000007F12B83FE8", +"000000067F000080000006600C0000048000-000000067F000080000006600C000004C000__0000007F108C1FD8", +"000000067F000080000006600C0000048000-000000067F000080000006600C000004C000__0000007FDCA75700", +"000000067F000080000006600C0000049743-000000067F000080000006600C0000093532__0000007F7BE4E6F1-0000007FDCDCE659", +"000000067F000080000006600C000004B682-030000000000000000000000000000000002__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C000004BAC3-000000067F000080000006600C00000551F8__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C000004C000-000000067F000080000006600C0000050000__0000007F108C1FD8", +"000000067F000080000006600C000004C000-000000067F000080000006600C0000050000__0000007FDCA75700", +"000000067F000080000006600C0000050000-000000067F000080000006600C0000054000__0000007F108C1FD8", 
+"000000067F000080000006600C0000050000-000000067F000080000006600C0000054000__0000007FDCA75700", +"000000067F000080000006600C0000054000-000000067F000080000006600C0000058000__0000007F108C1FD8", +"000000067F000080000006600C0000054000-000000067F000080000006600C0000058000__0000007FDCA75700", +"000000067F000080000006600C00000551F8-000000067F000080000006600C000005E90C__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C0000058000-000000067F000080000006600C000005C000__0000007F108C1FD8", +"000000067F000080000006600C0000058000-000000067F000080000006600C000005C000__0000007FDCA75700", +"000000067F000080000006600C000005C000-000000067F000080000006600C0000060000__0000007F108C1FD8", +"000000067F000080000006600C000005C000-000000067F000080000006600C0000060000__0000007FDCA75700", +"000000067F000080000006600C000005E90C-000000067F000080000006600C000006802C__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C0000060000-000000067F000080000006600C0000064000__0000007F108C1FD8", +"000000067F000080000006600C0000060000-000000067F000080000006600C0000064000__0000007FDCA75700", +"000000067F000080000006600C0000064000-000000067F000080000006600C0000068000__0000007F108C1FD8", +"000000067F000080000006600C0000064000-000000067F000080000006600C0000068000__0000007FDCA75700", +"000000067F000080000006600C0000068000-000000067F000080000006600C000006C000__0000007F108C1FD8", +"000000067F000080000006600C0000068000-000000067F000080000006600C000006C000__0000007FDCA75700", +"000000067F000080000006600C000006802C-000000067F000080000006600C0000071783__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C000006C000-000000067F000080000006600C0000070000__0000007F108C1FD8", +"000000067F000080000006600C000006C000-000000067F000080000006600C0000070000__0000007FDCA75700", +"000000067F000080000006600C0000070000-000000067F000080000006600C0000074000__0000007F108C1FD8", +"000000067F000080000006600C0000070000-000000067F000080000006600C0000074000__0000007FDCA75700", +"000000067F000080000006600C0000071783-000000067F000080000006600C000007AEE9__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C0000074000-000000067F000080000006600C0000078000__0000007F108C1FD8", +"000000067F000080000006600C0000074000-000000067F000080000006600C0000078000__0000007FDCA75700", +"000000067F000080000006600C0000078000-000000067F000080000006600C000007C000__0000007F108C1FD8", +"000000067F000080000006600C0000078000-000000067F000080000006600C000007C000__0000007FDCA75700", +"000000067F000080000006600C000007AEE9-000000067F000080000006600C000008460B__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C000007C000-000000067F000080000006600C0000080000__0000007F108C1FD8", +"000000067F000080000006600C000007C000-000000067F000080000006600C0000080000__0000007FDCA75700", +"000000067F000080000006600C0000080000-000000067F000080000006600C0000084000__0000007F108C1FD8", +"000000067F000080000006600C0000080000-000000067F000080000006600C0000084000__0000007FDCA75700", +"000000067F000080000006600C0000084000-000000067F000080000006600C0000088000__0000007F108C1FD8", +"000000067F000080000006600C0000084000-000000067F000080000006600C0000088000__0000007FDCA75700", +"000000067F000080000006600C000008460B-000000067F000080000006600C000008DD71__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C0000088000-000000067F000080000006600C000008C000__0000007F108C1FD8", +"000000067F000080000006600C0000088000-000000067F000080000006600C000008C000__0000007FDCA75700", 
+"000000067F000080000006600C000008C000-000000067F000080000006600C0000090000__0000007F108C1FD8", +"000000067F000080000006600C000008C000-000000067F000080000006600C0000090000__0000007FDCA75700", +"000000067F000080000006600C000008DD71-000000067F000080000006600C00000974D7__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C0000090000-000000067F000080000006600C0000094000__0000007F108C1FD8", +"000000067F000080000006600C0000090000-000000067F000080000006600C0000094000__0000007FDCA75700", +"000000067F000080000006600C0000093532-000000067F000080000006600C00000DD150__0000007F7BE4E6F1-0000007FDCDCE659", +"000000067F000080000006600C0000094000-000000067F000080000006600C0000098000__0000007F108C1FD8", +"000000067F000080000006600C0000094000-000000067F000080000006600C0000098000__0000007FDCA75700", +"000000067F000080000006600C00000974D7-000000067F000080000006600C00000A0C0B__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C0000098000-000000067F000080000006600C000009C000__0000007F108C1FD8", +"000000067F000080000006600C0000098000-000000067F000080000006600C000009C000__0000007FDCA75700", +"000000067F000080000006600C000009C000-000000067F000080000006600C00000A0000__0000007F108C1FD8", +"000000067F000080000006600C000009C000-000000067F000080000006600C00000A0000__0000007FDCA75700", +"000000067F000080000006600C00000A0000-000000067F000080000006600C00000A4000__0000007F108C1FD8", +"000000067F000080000006600C00000A0000-000000067F000080000006600C00000A4000__0000007FDCA75700", +"000000067F000080000006600C00000A0C0B-000000067F000080000006600C00000AA371__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C00000A4000-000000067F000080000006600C00000A8000__0000007F108C1FD8", +"000000067F000080000006600C00000A4000-000000067F000080000006600C00000A8000__0000007FDCA75700", +"000000067F000080000006600C00000A8000-000000067F000080000006600C00000AC000__0000007F108C1FD8", +"000000067F000080000006600C00000A8000-000000067F000080000006600C00000AC000__0000007FDCA75700", +"000000067F000080000006600C00000AA371-000000067F000080000006600C00000B3AD7__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C00000AC000-000000067F000080000006600C00000B0000__0000007F108C1FD8", +"000000067F000080000006600C00000AC000-000000067F000080000006600C00000B0000__0000007FDCA75700", +"000000067F000080000006600C00000B0000-000000067F000080000006600C00000B4000__0000007F108C1FD8", +"000000067F000080000006600C00000B0000-000000067F000080000006600C00000B4000__0000007FDCA75700", +"000000067F000080000006600C00000B3AD7-000000067F000080000006600C0100000000__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C00000B4000-000000067F000080000006600C00000B8000__0000007F108C1FD8", +"000000067F000080000006600C00000B4000-000000067F000080000006600C00000B8000__0000007FDCA75700", +"000000067F000080000006600C00000B8000-000000067F000080000006600C00000BC000__0000007F108C1FD8", +"000000067F000080000006600C00000B8000-000000067F000080000006600C00000BC000__0000007FDCA75700", +"000000067F000080000006600C00000BC000-000000067F000080000006600C00000C0000__0000007F108C1FD8", +"000000067F000080000006600C00000BC000-000000067F000080000006600C00000C0000__0000007FDCA75700", +"000000067F000080000006600C00000BC29F-000000067F000080000006600C00000C59CF__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C00000C0000-000000067F000080000006600C00000C4000__0000007F108C1FD8", +"000000067F000080000006600C00000C0000-000000067F000080000006600C00000C4000__0000007FDCA75700", 
+"000000067F000080000006600C00000C4000-000000067F000080000006600C00000C8000__0000007F108C1FD8", +"000000067F000080000006600C00000C4000-000000067F000080000006600C00000C8000__0000007FDCA75700", +"000000067F000080000006600C00000C59CF-000000067F000080000006600C00000CF10B__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C00000C8000-000000067F000080000006600C00000CC000__0000007F108C1FD8", +"000000067F000080000006600C00000C8000-000000067F000080000006600C00000CC000__0000007FDCA75700", +"000000067F000080000006600C00000CC000-000000067F000080000006600C00000D0000__0000007F108C1FD8", +"000000067F000080000006600C00000CC000-000000067F000080000006600C00000D0000__0000007FDCA75700", +"000000067F000080000006600C00000CF10B-000000067F000080000006600C00000D882C__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C00000D0000-000000067F000080000006600C00000D4000__0000007F108C1FD8", +"000000067F000080000006600C00000D0000-000000067F000080000006600C00000D4000__0000007FDCA75700", +"000000067F000080000006600C00000D4000-000000067F000080000006600C00000D8000__0000007F108C1FD8", +"000000067F000080000006600C00000D4000-000000067F000080000006600C00000D8000__0000007FDCA75700", +"000000067F000080000006600C00000D8000-000000067F000080000006600C00000DC000__0000007F108C1FD8", +"000000067F000080000006600C00000D8000-000000067F000080000006600C00000DC000__0000007FDCA75700", +"000000067F000080000006600C00000D882C-000000067F000080000006600C00000E1F7F__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C00000DC000-000000067F000080000006600C00000E0000__0000007F108C1FD8", +"000000067F000080000006600C00000DC000-000000067F000080000006600C00000E0000__0000007FDCA75700", +"000000067F000080000006600C00000DD152-000000067F00008000000660140000003DA8__0000007F7BE4E6F1-0000007FDCDCE659", +"000000067F000080000006600C00000E0000-000000067F000080000006600C00000E4000__0000007F108C1FD8", +"000000067F000080000006600C00000E0000-000000067F000080000006600C00000E4000__0000007FDCA75700", +"000000067F000080000006600C00000E1F7F-000000067F000080000006600C00000EB6E5__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C00000E4000-000000067F000080000006600C00000E8000__0000007F108C1FD8", +"000000067F000080000006600C00000E4000-000000067F000080000006600C00000E8000__0000007FDCA75700", +"000000067F000080000006600C00000E8000-000000067F000080000006600C00000EC000__0000007F108C1FD8", +"000000067F000080000006600C00000E8000-000000067F000080000006600C00000EC000__0000007FDCA75700", +"000000067F000080000006600C00000EB6E5-000000067F000080000006600C00000F4E0C__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C00000EC000-000000067F000080000006600C00000F0000__0000007F108C1FD8", +"000000067F000080000006600C00000EC000-000000067F000080000006600C00000F0000__0000007FDCA75700", +"000000067F000080000006600C00000F0000-000000067F000080000006600C00000F4000__0000007F108C1FD8", +"000000067F000080000006600C00000F0000-000000067F000080000006600C00000F4000__0000007FDCA75700", +"000000067F000080000006600C00000F4000-000000067F000080000006600C00000F8000__0000007F108C1FD8", +"000000067F000080000006600C00000F4000-000000067F000080000006600C00000F8000__0000007FDCA75700", +"000000067F000080000006600C00000F4E0C-000000067F000080000006600C00000FE572__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C00000F8000-000000067F000080000006600C00000FC000__0000007F108C1FD8", +"000000067F000080000006600C00000F8000-000000067F000080000006600C00000FC000__0000007FDCA75700", 
+"000000067F000080000006600C00000FC000-000000067F000080000006600C0000100000__0000007F108C1FD8", +"000000067F000080000006600C00000FC000-000000067F000080000006600C0000100000__0000007FDCA75700", +"000000067F000080000006600C00000FE572-000000067F000080000006600C0000107CD8__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C0000100000-000000067F000080000006600C0000104000__0000007F108C1FD8", +"000000067F000080000006600C0000100000-000000067F000080000006600C0000104000__0000007FDCA75700", +"000000067F000080000006600C0000104000-000000067F000080000006600C0000108000__0000007F108C1FD8", +"000000067F000080000006600C0000104000-000000067F000080000006600C0000108000__0000007FDCA75700", +"000000067F000080000006600C0000107CD8-000000067F000080000006600C000011140B__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C0000108000-000000067F000080000006600C000010C000__0000007F108C1FD8", +"000000067F000080000006600C0000108000-000000067F000080000006600C000010C000__0000007FDCA75700", +"000000067F000080000006600C000010C000-000000067F000080000006600C0000110000__0000007F108C1FD8", +"000000067F000080000006600C000010C000-000000067F000080000006600C0000110000__0000007FDCA75700", +"000000067F000080000006600C0000110000-000000067F00008000000660120100000000__0000007FDCA75700", +"000000067F000080000006600C0000110000-030000000000000000000000000000000002__0000007F108C1FD8", +"000000067F000080000006600C000011140B-010000000000000001000000030000000010__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C0000111C82-000000067F0000800000066014000000535B__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F00008000000660140000000000-000000067F00008000000660140000004000__0000007FDCA75700", +"000000067F00008000000660140000003DAA-000000067F00008000000660140000017C4D__0000007F7BE4E6F1-0000007FDCDCE659", +"000000067F00008000000660140000004000-000000067F00008000000660140000008000__0000007FDCA75700", +"000000067F0000800000066014000000535B-000000067F0000800000066014000000C839__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F00008000000660140000008000-000000067F0000800000066014000000C000__0000007FDCA75700", +"000000067F0000800000066014000000C000-000000067F00008000000660140000010000__0000007FDCA75700", +"000000067F0000800000066014000000C839-000000067F00008000000660140000013D42__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F00008000000660140000010000-000000067F00008000000660140000014000__0000007FDCA75700", +"000000067F00008000000660140000013D42-000000067F0000800000066014000001B222__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F00008000000660140000014000-000000067F00008000000660140000018000__0000007FDCA75700", +"000000067F00008000000660140000017C51-000000067F0000800000066014000002B9D0__0000007F7BE4E6F1-0000007FDCDCE659", +"000000067F00008000000660140000018000-000000067F0000800000066014000001C000__0000007FDCA75700", +"000000067F0000800000066014000001B222-000000067F00008000000660140000022704__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F0000800000066014000001C000-000000067F00008000000660140000020000__0000007FDCA75700", +"000000067F00008000000660140000020000-000000067F00008000000660140000024000__0000007FDCA75700", +"000000067F00008000000660140000022704-000000067F00008000000660140000029C2D__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F00008000000660140000024000-000000067F00008000000660140000028000__0000007FDCA75700", +"000000067F00008000000660140000028000-000000067F0000800000066014000002C000__0000007FDCA75700", 
+"000000067F00008000000660140000029C2D-030000000000000000000000000000000002__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F0000800000066014000002B9D1-030000000000000000000000000000000002__0000007F7BE4E6F1-0000007FDCDCE659", +"000000067F0000800000066014000002C000-030000000000000000000000000000000002__0000007FDCA75700", +"000000067F000080000006800C0000000000-000000067F000080000006800C0000004000__00000081AFEDBFE0", +"000000067F000080000006800C0000004000-000000067F000080000006800C0000008000__00000081AFEDBFE0", +"000000067F000080000006800C0000007D6A-000000067F000080000006800C00000114D0__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000080000006800C0000008000-000000067F000080000006800C000000C000__00000081AFEDBFE0", +"000000067F000080000006800C000000C000-000000067F000080000006800C0000010000__00000081AFEDBFE0", +"000000067F000080000006800C0000010000-000000067F000080000006800C0000014000__00000081AFEDBFE0", +"000000067F000080000006800C00000114D0-000000067F000080000006800C000001AC0B__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000080000006800C0000014000-000000067F000080000006800C0000018000__00000081AFEDBFE0", +"000000067F000080000006800C0000018000-000000067F000080000006800C000001C000__00000081AFEDBFE0", +"000000067F000080000006800C000001AC0B-000000067F000080000006800C0000024348__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000080000006800C000001C000-000000067F000080000006800C0000020000__00000081AFEDBFE0", +"000000067F000080000006800C0000020000-000000067F000080000006800C0000024000__00000081AFEDBFE0", +"000000067F000080000006800C0000024000-000000067F000080000006800C0000028000__00000081AFEDBFE0", +"000000067F000080000006800C0000024348-000000067F000080000006800C000002DAAE__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000080000006800C0000028000-000000067F000080000006800C000002C000__00000081AFEDBFE0", +"000000067F000080000006800C000002C000-000000067F000080000006800C0000030000__00000081AFEDBFE0", +"000000067F000080000006800C000002DAAE-000000067F000080000006800C00000371D0__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000080000006800C0000030000-000000067F000080000006800C0000034000__00000081AFEDBFE0", +"000000067F000080000006800C0000034000-000000067F000080000006800C0000038000__00000081AFEDBFE0", +"000000067F000080000006800C00000371D0-000000067F000080000006800C000004090B__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000080000006800C0000038000-000000067F000080000006800C000003C000__00000081AFEDBFE0", +"000000067F000080000006800C000003C000-000000067F000080000006800C0000040000__00000081AFEDBFE0", +"000000067F000080000006800C0000040000-000000067F000080000006800C0000044000__00000081A164D628", +"000000067F000080000006800C000004090B-030000000000000000000000000000000002__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000080000006800C0000042368-000000067F000080000006800C000004BACE__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C0000044000-000000067F000080000006800C0000048000__00000081A164D628", +"000000067F000080000006800C0000048000-000000067F000080000006800C000004C000__00000081A164D628", +"000000067F000080000006800C000004BACE-000000067F000080000006800C0000055202__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C000004C000-000000067F000080000006800C0000050000__00000081A164D628", +"000000067F000080000006800C0000050000-000000067F000080000006800C0000054000__00000081A164D628", +"000000067F000080000006800C0000054000-000000067F000080000006800C0000058000__00000081A164D628", 
+"000000067F000080000006800C0000055202-000000067F000080000006800C000005E90D__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C0000058000-000000067F000080000006800C000005C000__00000081A164D628", +"000000067F000080000006800C000005C000-000000067F000080000006800C0000060000__00000081A164D628", +"000000067F000080000006800C000005E90D-000000067F000080000006800C000006802B__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C0000060000-000000067F000080000006800C0000064000__00000081A164D628", +"000000067F000080000006800C0000064000-000000067F000080000006800C0000068000__00000081A164D628", +"000000067F000080000006800C0000068000-000000067F000080000006800C000006C000__00000081A164D628", +"000000067F000080000006800C000006802B-000000067F000080000006800C0000071782__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C000006C000-000000067F000080000006800C0000070000__00000081A164D628", +"000000067F000080000006800C0000070000-000000067F000080000006800C0000074000__00000081A164D628", +"000000067F000080000006800C0000071782-000000067F000080000006800C000007AEE8__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C0000074000-000000067F000080000006800C0000078000__00000081A164D628", +"000000067F000080000006800C0000078000-000000067F000080000006800C000007C000__00000081A164D628", +"000000067F000080000006800C000007AEE8-000000067F000080000006800C000008460B__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C000007C000-000000067F000080000006800C0000080000__00000081A164D628", +"000000067F000080000006800C0000080000-000000067F000080000006800C0000084000__00000081A164D628", +"000000067F000080000006800C0000084000-000000067F000080000006800C0000088000__00000081A164D628", +"000000067F000080000006800C000008460B-000000067F000080000006800C000008DD71__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C0000088000-000000067F000080000006800C000008C000__00000081A164D628", +"000000067F000080000006800C000008C000-000000067F000080000006800C0000090000__00000081A164D628", +"000000067F000080000006800C000008DD71-000000067F000080000006800C00000974D7__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C0000090000-000000067F000080000006800C0000094000__00000081A164D628", +"000000067F000080000006800C0000094000-000000067F000080000006800C0000098000__00000081A164D628", +"000000067F000080000006800C00000974D7-000000067F000080000006800C00000A0C0B__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C0000098000-000000067F000080000006800C000009C000__00000081A164D628", +"000000067F000080000006800C000009C000-000000067F000080000006800C00000A0000__00000081A164D628", +"000000067F000080000006800C00000A0000-000000067F000080000006800C00000A4000__00000081A164D628", +"000000067F000080000006800C00000A0C0B-000000067F000080000006800C0100000000__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C00000A4000-000000067F000080000006800C00000A8000__00000081A164D628", +"000000067F000080000006800C00000A8000-000000067F000080000006800C00000AC000__00000081A164D628", +"000000067F000080000006800C00000A8D4C-000000067F000080000006800C00000B24B2__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000AC000-000000067F000080000006800C00000B0000__00000081A164D628", +"000000067F000080000006800C00000B0000-000000067F000080000006800C00000B4000__00000081A164D628", +"000000067F000080000006800C00000B24B2-000000067F000080000006800C00000BBC0B__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000B4000-000000067F000080000006800C00000B8000__00000081A164D628", 
+"000000067F000080000006800C00000B8000-000000067F000080000006800C00000BC000__00000081A164D628", +"000000067F000080000006800C00000BBC0B-000000067F000080000006800C00000C533F__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000BC000-000000067F000080000006800C00000C0000__00000081A164D628", +"000000067F000080000006800C00000C0000-000000067F000080000006800C00000C4000__00000081A164D628", +"000000067F000080000006800C00000C4000-000000067F000080000006800C00000C8000__00000081A164D628", +"000000067F000080000006800C00000C533F-000000067F000080000006800C00000CEAA5__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000C8000-000000067F000080000006800C00000CC000__00000081A164D628", +"000000067F000080000006800C00000CC000-000000067F000080000006800C00000D0000__00000081A164D628", +"000000067F000080000006800C00000CEAA5-000000067F000080000006800C00000D81BE__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000D0000-000000067F000080000006800C00000D4000__00000081A164D628", +"000000067F000080000006800C00000D4000-000000067F000080000006800C00000D8000__00000081A164D628", +"000000067F000080000006800C00000D8000-000000067F000080000006800C00000DC000__00000081A164D628", +"000000067F000080000006800C00000D81BE-000000067F000080000006800C00000E190B__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000DC000-000000067F000080000006800C00000E0000__00000081A164D628", +"000000067F000080000006800C00000E0000-000000067F000080000006800C00000E4000__00000081A164D628", +"000000067F000080000006800C00000E190B-000000067F000080000006800C00000EB071__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000E4000-000000067F000080000006800C00000E8000__00000081A164D628", +"000000067F000080000006800C00000E8000-000000067F000080000006800C00000EC000__00000081A164D628", +"000000067F000080000006800C00000EB071-000000067F000080000006800C00000F47AC__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000EC000-000000067F000080000006800C00000F0000__00000081A164D628", +"000000067F000080000006800C00000F0000-000000067F000080000006800C00000F4000__00000081A164D628", +"000000067F000080000006800C00000F4000-000000067F000080000006800C00000F8000__00000081A164D628", +"000000067F000080000006800C00000F47AC-000000067F000080000006800C00000FDF0A__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000F8000-000000067F000080000006800C00000FC000__00000081A164D628", +"000000067F000080000006800C00000FC000-000000067F000080000006800C0000100000__00000081A164D628", +"000000067F000080000006800C00000FDF0A-000000067F000080000006800C000010762B__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C0000100000-000000067F000080000006800C0000104000__00000081A164D628", +"000000067F000080000006800C0000104000-000000067F000080000006800C0000108000__00000081A164D628", +"000000067F000080000006800C000010762B-000000067F000080000006800C0000110D88__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C0000108000-030000000000000000000000000000000002__00000081A164D628", +"000000067F000080000006800C0000110D88-010000000000000001000000030000000014__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006801400000044E4-000000067F0000800000068014000000C3F5__00000081AFAF5FD1-0000008215AFE5A9", +"000000067F0000800000068014000000C3F5-000000067F00008000000680140000014303__00000081AFAF5FD1-0000008215AFE5A9", +"000000067F00008000000680140000014303-000000067F0000800000068014000001C214__00000081AFAF5FD1-0000008215AFE5A9", 
+"000000067F0000800000068014000001C214-000000067F00008000000680140000024125__00000081AFAF5FD1-0000008215AFE5A9", +"000000067F00008000000680140000024125-000000067F0000800000068014000002C035__00000081AFAF5FD1-0000008215AFE5A9", +"000000067F0000800000068014000002C035-000000067F000080000006A00C00000072CA__00000081AFAF5FD1-0000008215AFE5A9", +"000000067F000080000006A00C0000000000-000000067F000080000006A00C0000004000__00000083D5DE3FD0", +"000000067F000080000006A00C0000004000-000000067F000080000006A00C0000008000__00000083D5DE3FD0", +"000000067F000080000006A00C00000072CA-030000000000000000000000000000000002__00000081AFAF5FD1-0000008215AFE5A9", +"000000067F000080000006A00C0000008000-000000067F000080000006A00C000000C000__00000083865C64B8", +"000000067F000080000006A00C0000008000-000000067F000080000006A00C000000C000__00000084A1F03030", +"000000067F000080000006A00C00000096E3-000000067F000080000006A00C0000012E0B__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C000000C000-000000067F000080000006A00C0000010000__00000083865C64B8", +"000000067F000080000006A00C000000C000-000000067F000080000006A00C0000010000__00000084A1F03030", +"000000067F000080000006A00C0000010000-000000067F000080000006A00C0000014000__00000083865C64B8", +"000000067F000080000006A00C0000010000-000000067F000080000006A00C0000014000__00000084A1F03030", +"000000067F000080000006A00C0000012E0B-000000067F000080000006A00C000001C571__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000014000-000000067F000080000006A00C0000018000__00000083865C64B8", +"000000067F000080000006A00C0000014000-000000067F000080000006A00C0000018000__00000084A1F03030", +"000000067F000080000006A00C0000018000-000000067F000080000006A00C000001C000__00000083865C64B8", +"000000067F000080000006A00C0000018000-000000067F000080000006A00C000001C000__00000084A1F03030", +"000000067F000080000006A00C000001C000-000000067F000080000006A00C0000020000__00000083865C64B8", +"000000067F000080000006A00C000001C000-000000067F000080000006A00C0000020000__00000084A1F03030", +"000000067F000080000006A00C000001C571-000000067F000080000006A00C0000025CD7__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000020000-000000067F000080000006A00C0000024000__00000083865C64B8", +"000000067F000080000006A00C0000020000-000000067F000080000006A00C0000024000__00000084A1F03030", +"000000067F000080000006A00C0000024000-000000067F000080000006A00C0000028000__00000083865C64B8", +"000000067F000080000006A00C0000024000-000000067F000080000006A00C0000028000__00000084A1F03030", +"000000067F000080000006A00C0000025CD7-000000067F000080000006A00C000002F40B__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000028000-000000067F000080000006A00C000002C000__00000083865C64B8", +"000000067F000080000006A00C0000028000-000000067F000080000006A00C000002C000__00000084A1F03030", +"000000067F000080000006A00C000002C000-000000067F000080000006A00C0000030000__00000083865C64B8", +"000000067F000080000006A00C000002C000-000000067F000080000006A00C0000030000__00000084A1F03030", +"000000067F000080000006A00C000002F40B-000000067F000080000006A00C0000038B1E__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000030000-000000067F000080000006A00C0000034000__00000083865C64B8", +"000000067F000080000006A00C0000030000-000000067F000080000006A00C0000034000__00000084A1F03030", +"000000067F000080000006A00C0000034000-000000067F000080000006A00C0000038000__00000083865C64B8", +"000000067F000080000006A00C0000034000-000000067F000080000006A00C0000038000__00000084A1F03030", 
+"000000067F000080000006A00C0000038000-000000067F000080000006A00C000003C000__00000083865C64B8", +"000000067F000080000006A00C0000038000-000000067F000080000006A00C000003C000__00000084A1F03030", +"000000067F000080000006A00C0000038B1E-000000067F000080000006A00C0000042284__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C000003C000-000000067F000080000006A00C0000040000__00000083865C64B8", +"000000067F000080000006A00C000003C000-000000067F000080000006A00C0000040000__00000084A1F03030", +"000000067F000080000006A00C0000040000-000000067F000080000006A00C0000044000__00000083865C64B8", +"000000067F000080000006A00C0000040000-000000067F000080000006A00C0000044000__00000084A1F03030", +"000000067F000080000006A00C0000042284-000000067F000080000006A00C000004B9EA__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000044000-000000067F000080000006A00C0000048000__00000083865C64B8", +"000000067F000080000006A00C0000044000-000000067F000080000006A00C0000048000__00000084A1F03030", +"000000067F000080000006A00C0000048000-000000067F000080000006A00C000004C000__00000083865C64B8", +"000000067F000080000006A00C0000048000-000000067F000080000006A00C000004C000__00000084A1F03030", +"000000067F000080000006A00C000004B9EA-000000067F000080000006A00C000005510B__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C000004C000-000000067F000080000006A00C0000050000__00000083865C64B8", +"000000067F000080000006A00C000004C000-000000067F000080000006A00C0000050000__00000084A1F03030", +"000000067F000080000006A00C0000050000-000000067F000080000006A00C0000054000__00000083865C64B8", +"000000067F000080000006A00C0000050000-000000067F000080000006A00C0000054000__00000084A1F03030", +"000000067F000080000006A00C000005198B-000000067F000080000006A00C00000A31A6__000000844F1A6789-00000084A325AA01", +"000000067F000080000006A00C0000054000-000000067F000080000006A00C0000058000__00000083865C64B8", +"000000067F000080000006A00C0000054000-000000067F000080000006A00C0000058000__00000084A1F03030", +"000000067F000080000006A00C000005510B-000000067F000080000006A00C000005E871__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000058000-000000067F000080000006A00C000005C000__00000083865C64B8", +"000000067F000080000006A00C0000058000-000000067F000080000006A00C000005C000__00000084A1F03030", +"000000067F000080000006A00C000005C000-000000067F000080000006A00C0000060000__00000083865C64B8", +"000000067F000080000006A00C000005C000-000000067F000080000006A00C0000060000__00000084A1F03030", +"000000067F000080000006A00C000005E871-000000067F000080000006A00C0000067F8B__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000060000-000000067F000080000006A00C0000064000__00000083865C64B8", +"000000067F000080000006A00C0000060000-000000067F000080000006A00C0000064000__00000084A1F03030", +"000000067F000080000006A00C0000064000-000000067F000080000006A00C0000068000__00000083865C64B8", +"000000067F000080000006A00C0000064000-000000067F000080000006A00C0000068000__00000084A1F03030", +"000000067F000080000006A00C0000067F8B-000000067F000080000006A00C0100000000__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000068000-000000067F000080000006A00C000006C000__00000083865C64B8", +"000000067F000080000006A00C0000068000-000000067F000080000006A00C000006C000__00000084A1F03030", +"000000067F000080000006A00C000006C000-000000067F000080000006A00C0000070000__00000083865C64B8", +"000000067F000080000006A00C000006C000-000000067F000080000006A00C0000070000__00000084A1F03030", 
+"000000067F000080000006A00C0000070000-000000067F000080000006A00C0000074000__00000083865C64B8", +"000000067F000080000006A00C0000070000-000000067F000080000006A00C0000074000__00000084A1F03030", +"000000067F000080000006A00C00000703EC-000000067F000080000006A00C0000079B0C__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C0000074000-000000067F000080000006A00C0000078000__00000083865C64B8", +"000000067F000080000006A00C0000074000-000000067F000080000006A00C0000078000__00000084A1F03030", +"000000067F000080000006A00C0000078000-000000067F000080000006A00C000007C000__00000083865C64B8", +"000000067F000080000006A00C0000078000-000000067F000080000006A00C000007C000__00000084A1F03030", +"000000067F000080000006A00C0000079B0C-000000067F000080000006A00C0000083272__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C000007C000-000000067F000080000006A00C0000080000__00000083865C64B8", +"000000067F000080000006A00C000007C000-000000067F000080000006A00C0000080000__00000084A1F03030", +"000000067F000080000006A00C0000080000-000000067F000080000006A00C0000084000__00000083865C64B8", +"000000067F000080000006A00C0000080000-000000067F000080000006A00C0000084000__00000084A1F03030", +"000000067F000080000006A00C0000083272-000000067F000080000006A00C000008C9D8__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C0000084000-000000067F000080000006A00C0000088000__00000083865C64B8", +"000000067F000080000006A00C0000084000-000000067F000080000006A00C0000088000__00000084A1F03030", +"000000067F000080000006A00C0000088000-000000067F000080000006A00C000008C000__00000083865C64B8", +"000000067F000080000006A00C0000088000-000000067F000080000006A00C000008C000__00000084A1F03030", +"000000067F000080000006A00C000008C000-000000067F000080000006A00C0000090000__00000083865C64B8", +"000000067F000080000006A00C000008C000-000000067F000080000006A00C0000090000__00000084A1F03030", +"000000067F000080000006A00C000008C9D8-000000067F000080000006A00C0000096129__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C0000090000-000000067F000080000006A00C0000094000__00000083865C64B8", +"000000067F000080000006A00C0000090000-000000067F000080000006A00C0000094000__00000084A1F03030", +"000000067F000080000006A00C0000094000-000000067F000080000006A00C0000098000__00000083865C64B8", +"000000067F000080000006A00C0000094000-000000067F000080000006A00C0000098000__00000084A1F03030", +"000000067F000080000006A00C0000096129-000000067F000080000006A00C000009F88F__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C0000098000-000000067F000080000006A00C000009C000__00000083865C64B8", +"000000067F000080000006A00C0000098000-000000067F000080000006A00C000009C000__00000084A1F03030", +"000000067F000080000006A00C000009C000-000000067F000080000006A00C00000A0000__00000083865C64B8", +"000000067F000080000006A00C000009C000-000000067F000080000006A00C00000A0000__00000084A1F03030", +"000000067F000080000006A00C000009F88F-000000067F000080000006A00C00000A8F9F__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000A0000-000000067F000080000006A00C00000A4000__00000083865C64B8", +"000000067F000080000006A00C00000A0000-000000067F000080000006A00C00000A4000__00000084A1F03030", +"000000067F000080000006A00C00000A31B0-000000067F000080000006A00C00000F4C19__000000844F1A6789-00000084A325AA01", +"000000067F000080000006A00C00000A4000-000000067F000080000006A00C00000A8000__00000083865C64B8", +"000000067F000080000006A00C00000A4000-000000067F000080000006A00C00000A8000__00000084A1F03030", 
+"000000067F000080000006A00C00000A8000-000000067F000080000006A00C00000AC000__00000083865C64B8", +"000000067F000080000006A00C00000A8000-000000067F000080000006A00C00000AC000__00000084A1F03030", +"000000067F000080000006A00C00000A8F9F-000000067F000080000006A00C00000B2705__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000AC000-000000067F000080000006A00C00000B0000__00000083865C64B8", +"000000067F000080000006A00C00000AC000-000000067F000080000006A00C00000B0000__00000084A1F03030", +"000000067F000080000006A00C00000B0000-000000067F000080000006A00C00000B4000__00000083865C64B8", +"000000067F000080000006A00C00000B0000-000000067F000080000006A00C00000B4000__00000084A1F03030", +"000000067F000080000006A00C00000B2705-000000067F000080000006A00C00000BBE10__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000B4000-000000067F000080000006A00C00000B8000__00000083865C64B8", +"000000067F000080000006A00C00000B4000-000000067F000080000006A00C00000B8000__00000084A1F03030", +"000000067F000080000006A00C00000B8000-000000067F000080000006A00C00000BC000__00000083865C64B8", +"000000067F000080000006A00C00000B8000-000000067F000080000006A00C00000BC000__00000084A1F03030", +"000000067F000080000006A00C00000BBE10-000000067F000080000006A00C00000C5543__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000BC000-000000067F000080000006A00C00000C0000__00000083865C64B8", +"000000067F000080000006A00C00000BC000-000000067F000080000006A00C00000C0000__00000084A1F03030", +"000000067F000080000006A00C00000C0000-000000067F000080000006A00C00000C4000__00000083865C64B8", +"000000067F000080000006A00C00000C0000-000000067F000080000006A00C00000C4000__00000084A1F03030", +"000000067F000080000006A00C00000C4000-000000067F000080000006A00C00000C8000__00000083865C64B8", +"000000067F000080000006A00C00000C4000-000000067F000080000006A00C00000C8000__00000084A1F03030", +"000000067F000080000006A00C00000C4CC8-000000067F000080000006A0140000001CBC__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A00C00000C5543-000000067F000080000006A00C00000CECA9__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000C8000-000000067F000080000006A00C00000CC000__00000083865C64B8", +"000000067F000080000006A00C00000C8000-000000067F000080000006A00C00000CC000__00000084A1F03030", +"000000067F000080000006A00C00000CC000-000000067F000080000006A00C00000D0000__00000083865C64B8", +"000000067F000080000006A00C00000CC000-000000067F000080000006A00C00000D0000__00000084A1F03030", +"000000067F000080000006A00C00000CECA9-000000067F000080000006A00C00000D83C0__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000D0000-000000067F000080000006A00C00000D4000__00000083865C64B8", +"000000067F000080000006A00C00000D0000-000000067F000080000006A00C00000D4000__00000084A1F03030", +"000000067F000080000006A00C00000D4000-000000067F000080000006A00C00000D8000__00000083865C64B8", +"000000067F000080000006A00C00000D4000-000000067F000080000006A00C00000D8000__00000084A1F03030", +"000000067F000080000006A00C00000D8000-000000067F000080000006A00C00000DC000__00000083865C64B8", +"000000067F000080000006A00C00000D8000-000000067F000080000006A00C00000DC000__00000084A1F03030", +"000000067F000080000006A00C00000D83C0-000000067F000080000006A00C00000E1B0A__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000DC000-000000067F000080000006A00C00000E0000__00000083865C64B8", +"000000067F000080000006A00C00000DC000-000000067F000080000006A00C00000E0000__00000084A1F03030", 
+"000000067F000080000006A00C00000E0000-000000067F000080000006A00C00000E4000__00000084A1F03030", +"000000067F000080000006A00C00000E0000-030000000000000000000000000000000002__00000083865C64B8", +"000000067F000080000006A00C00000E1B0A-000000067F000080000006A00C00000EB270__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000E4000-000000067F000080000006A00C00000E8000__00000084A1F03030", +"000000067F000080000006A00C00000E8000-000000067F000080000006A00C00000EC000__00000084A1F03030", +"000000067F000080000006A00C00000EB270-000000067F000080000006A00C00000F49AA__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000EC000-000000067F000080000006A00C00000F0000__00000084A1F03030", +"000000067F000080000006A00C00000F0000-000000067F000080000006A00C00000F4000__00000084A1F03030", +"000000067F000080000006A00C00000F4000-000000067F000080000006A00C00000F8000__00000084A1F03030", +"000000067F000080000006A00C00000F49AA-000000067F000080000006A00C00000FE10A__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000F4C23-000000067F000080000006A014000000E1C2__000000844F1A6789-00000084A325AA01", +"000000067F000080000006A00C00000F8000-000000067F000080000006A00C00000FC000__00000084A1F03030", +"000000067F000080000006A00C00000FC000-000000067F000080000006A00C0000100000__00000084A1F03030", +"000000067F000080000006A00C00000FE10A-000000067F000080000006A00C000010782C__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C0000100000-000000067F000080000006A00C0000104000__00000084A1F03030", +"000000067F000080000006A00C0000104000-000000067F000080000006A00C0000108000__00000084A1F03030", +"000000067F000080000006A00C000010782C-000000067F000080000006A00C0000110F88__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C0000108000-000000067F000080000006A00C000010C000__00000084A1F03030", +"000000067F000080000006A00C000010C000-000000067F000080000006A00C0000110000__00000084A1F03030", +"000000067F000080000006A00C0000110000-000000067F000080000006A0120100000000__00000084A1F03030", +"000000067F000080000006A00C0000110F88-010000000000000001000000030000000014__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A0140000000000-000000067F000080000006A0140000004000__00000084A1F03030", +"000000067F000080000006A0140000001CBC-000000067F000080000006A01400000088E1__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A0140000004000-000000067F000080000006A0140000008000__00000084A1F03030", +"000000067F000080000006A0140000008000-000000067F000080000006A014000000C000__00000084A1F03030", +"000000067F000080000006A01400000088E1-000000067F000080000006A014000000F459__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A014000000C000-000000067F000080000006A0140000010000__00000084A1F03030", +"000000067F000080000006A014000000E1C2-000000067F000080000006A014000002682C__000000844F1A6789-00000084A325AA01", +"000000067F000080000006A014000000F459-000000067F000080000006A0140000016068__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A0140000010000-000000067F000080000006A0140000014000__00000084A1F03030", +"000000067F000080000006A0140000014000-000000067F000080000006A0140000018000__00000084A1F03030", +"000000067F000080000006A0140000016068-000000067F000080000006A014000001CC14__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A0140000018000-000000067F000080000006A014000001C000__00000084A1F03030", +"000000067F000080000006A014000001C000-000000067F000080000006A0140000020000__00000084A1F03030", 
+"000000067F000080000006A014000001CC14-000000067F000080000006A014000002384E__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A0140000020000-000000067F000080000006A0140000024000__00000084A1F03030", +"000000067F000080000006A014000002384E-000000067F000080000006A014000002A467__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A0140000024000-000000067F000080000006A0140000028000__00000084A1F03030", +"000000067F000080000006A0140000026831-030000000000000000000000000000000002__000000844F1A6789-00000084A325AA01", +"000000067F000080000006A0140000028000-000000067F000080000006A014000002C000__00000084A1F03030", +"000000067F000080000006A014000002A467-030000000000000000000000000000000002__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A014000002C000-030000000000000000000000000000000002__00000084A1F03030", +"000000067F000080000006C00C0000000000-000000067F000080000006C00C0000004000__00000086746BDFE0", +"000000067F000080000006C00C0000004000-000000067F000080000006C00C0000008000__00000086746BDFE0", +"000000067F000080000006C00C0000008000-000000067F000080000006C00C000000C000__00000086746BDFE0", +"000000067F000080000006C00C00000090F5-000000067F000080000006C00C000001280C__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C000000C000-000000067F000080000006C00C0000010000__00000086746BDFE0", +"000000067F000080000006C00C0000010000-000000067F000080000006C00C0000014000__00000086746BDFE0", +"000000067F000080000006C00C000001280C-000000067F000080000006C00C000001BF72__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C0000014000-000000067F000080000006C00C0000018000__00000086746BDFE0", +"000000067F000080000006C00C0000018000-000000067F000080000006C00C000001C000__00000086746BDFE0", +"000000067F000080000006C00C000001BF72-000000067F000080000006C00C00000256D8__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C000001C000-000000067F000080000006C00C0000020000__00000086746BDFE0", +"000000067F000080000006C00C0000020000-000000067F000080000006C00C0000024000__00000086746BDFE0", +"000000067F000080000006C00C0000024000-000000067F000080000006C00C0000028000__00000086746BDFE0", +"000000067F000080000006C00C00000256D8-000000067F000080000006C00C000002EE0B__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C0000028000-000000067F000080000006C00C000002C000__00000086746BDFE0", +"000000067F000080000006C00C000002C000-000000067F000080000006C00C0000030000__00000086746BDFE0", +"000000067F000080000006C00C000002EE0B-000000067F000080000006C00C0000038521__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C0000030000-000000067F000080000006C00C0000034000__00000086746BDFE0", +"000000067F000080000006C00C0000034000-000000067F000080000006C00C0000038000__00000086746BDFE0", +"000000067F000080000006C00C0000038000-000000067F000080000006C00C000003C000__00000086746BDFE0", +"000000067F000080000006C00C0000038521-000000067F000080000006C00C0000041C87__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C000003C000-000000067F000080000006C00C0000040000__00000086746BDFE0", +"000000067F000080000006C00C0000040000-000000067F000080000006C00C0000044000__00000086746BDFE0", +"000000067F000080000006C00C0000041C87-000000067F000080000006C00C000004B3ED__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C0000044000-000000067F000080000006C00C0000048000__00000086746BDFE0", +"000000067F000080000006C00C0000048000-000000067F000080000006C00C000004C000__00000086720CFFF0", +"000000067F000080000006C00C0000048000-000000067F000080000006C00C000004C000__000000873B520940", 
+"000000067F000080000006C00C000004B3ED-030000000000000000000000000000000002__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C000004BAC4-000000067F000080000006C00C00000551F9__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C000004C000-000000067F000080000006C00C0000050000__00000086720CFFF0", +"000000067F000080000006C00C000004C000-000000067F000080000006C00C0000050000__000000873B520940", +"000000067F000080000006C00C0000050000-000000067F000080000006C00C0000054000__00000086720CFFF0", +"000000067F000080000006C00C0000050000-000000067F000080000006C00C0000054000__000000873B520940", +"000000067F000080000006C00C0000054000-000000067F000080000006C00C0000058000__00000086720CFFF0", +"000000067F000080000006C00C0000054000-000000067F000080000006C00C0000058000__000000873B520940", +"000000067F000080000006C00C00000551F9-000000067F000080000006C00C000005E90C__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C0000055EB3-000000067F000080000006C00C00000AB316__00000086ED29E361-000000873C9A2551", +"000000067F000080000006C00C0000058000-000000067F000080000006C00C000005C000__00000086720CFFF0", +"000000067F000080000006C00C0000058000-000000067F000080000006C00C000005C000__000000873B520940", +"000000067F000080000006C00C000005C000-000000067F000080000006C00C0000060000__00000086720CFFF0", +"000000067F000080000006C00C000005C000-000000067F000080000006C00C0000060000__000000873B520940", +"000000067F000080000006C00C000005E90C-000000067F000080000006C00C000006802C__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C0000060000-000000067F000080000006C00C0000064000__00000086720CFFF0", +"000000067F000080000006C00C0000060000-000000067F000080000006C00C0000064000__000000873B520940", +"000000067F000080000006C00C0000064000-000000067F000080000006C00C0000068000__00000086720CFFF0", +"000000067F000080000006C00C0000064000-000000067F000080000006C00C0000068000__000000873B520940", +"000000067F000080000006C00C0000068000-000000067F000080000006C00C000006C000__00000086720CFFF0", +"000000067F000080000006C00C0000068000-000000067F000080000006C00C000006C000__000000873B520940", +"000000067F000080000006C00C000006802C-000000067F000080000006C00C0000071783__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C000006C000-000000067F000080000006C00C0000070000__00000086720CFFF0", +"000000067F000080000006C00C000006C000-000000067F000080000006C00C0000070000__000000873B520940", +"000000067F000080000006C00C0000070000-000000067F000080000006C00C0000074000__00000086720CFFF0", +"000000067F000080000006C00C0000070000-000000067F000080000006C00C0000074000__000000873B520940", +"000000067F000080000006C00C0000071783-000000067F000080000006C00C000007AEE9__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C0000074000-000000067F000080000006C00C0000078000__00000086720CFFF0", +"000000067F000080000006C00C0000074000-000000067F000080000006C00C0000078000__000000873B520940", +"000000067F000080000006C00C0000078000-000000067F000080000006C00C000007C000__00000086720CFFF0", +"000000067F000080000006C00C0000078000-000000067F000080000006C00C000007C000__000000873B520940", +"000000067F000080000006C00C000007AEE9-000000067F000080000006C00C000008460B__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C000007C000-000000067F000080000006C00C0000080000__00000086720CFFF0", +"000000067F000080000006C00C000007C000-000000067F000080000006C00C0000080000__000000873B520940", +"000000067F000080000006C00C0000080000-000000067F000080000006C00C0000084000__00000086720CFFF0", 
+"000000067F000080000006C00C0000080000-000000067F000080000006C00C0000084000__000000873B520940", +"000000067F000080000006C00C0000084000-000000067F000080000006C00C0000088000__00000086720CFFF0", +"000000067F000080000006C00C0000084000-000000067F000080000006C00C0000088000__000000873B520940", +"000000067F000080000006C00C000008460B-000000067F000080000006C00C000008DD71__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C0000088000-000000067F000080000006C00C000008C000__00000086720CFFF0", +"000000067F000080000006C00C0000088000-000000067F000080000006C00C000008C000__000000873B520940", +"000000067F000080000006C00C000008C000-000000067F000080000006C00C0000090000__00000086720CFFF0", +"000000067F000080000006C00C000008C000-000000067F000080000006C00C0000090000__000000873B520940", +"000000067F000080000006C00C000008DD71-000000067F000080000006C00C00000974D7__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C0000090000-000000067F000080000006C00C0000094000__00000086720CFFF0", +"000000067F000080000006C00C0000090000-000000067F000080000006C00C0000094000__000000873B520940", +"000000067F000080000006C00C0000094000-000000067F000080000006C00C0000098000__00000086720CFFF0", +"000000067F000080000006C00C0000094000-000000067F000080000006C00C0000098000__000000873B520940", +"000000067F000080000006C00C00000974D7-000000067F000080000006C00C00000A0C0B__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C0000098000-000000067F000080000006C00C000009C000__00000086720CFFF0", +"000000067F000080000006C00C0000098000-000000067F000080000006C00C000009C000__000000873B520940", +"000000067F000080000006C00C000009C000-000000067F000080000006C00C00000A0000__00000086720CFFF0", +"000000067F000080000006C00C000009C000-000000067F000080000006C00C00000A0000__000000873B520940", +"000000067F000080000006C00C00000A0000-000000067F000080000006C00C00000A4000__00000086720CFFF0", +"000000067F000080000006C00C00000A0000-000000067F000080000006C00C00000A4000__000000873B520940", +"000000067F000080000006C00C00000A0C0B-000000067F000080000006C00C00000AA371__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C00000A4000-000000067F000080000006C00C00000A8000__00000086720CFFF0", +"000000067F000080000006C00C00000A4000-000000067F000080000006C00C00000A8000__000000873B520940", +"000000067F000080000006C00C00000A8000-000000067F000080000006C00C00000AC000__00000086720CFFF0", +"000000067F000080000006C00C00000A8000-000000067F000080000006C00C00000AC000__000000873B520940", +"000000067F000080000006C00C00000AA371-000000067F000080000006C00C00000B3AD7__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C00000AB316-000000067F000080000006C00C00001015F1__00000086ED29E361-000000873C9A2551", +"000000067F000080000006C00C00000AC000-000000067F000080000006C00C00000B0000__00000086720CFFF0", +"000000067F000080000006C00C00000AC000-000000067F000080000006C00C00000B0000__000000873B520940", +"000000067F000080000006C00C00000B0000-000000067F000080000006C00C00000B4000__00000086720CFFF0", +"000000067F000080000006C00C00000B0000-000000067F000080000006C00C00000B4000__000000873B520940", +"000000067F000080000006C00C00000B3AD7-000000067F000080000006C00C0100000000__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C00000B4000-000000067F000080000006C00C00000B8000__00000086720CFFF0", +"000000067F000080000006C00C00000B4000-000000067F000080000006C00C00000B8000__000000873B520940", +"000000067F000080000006C00C00000B8000-000000067F000080000006C00C00000BC000__00000086720CFFF0", 
+"000000067F000080000006C00C00000B8000-000000067F000080000006C00C00000BC000__000000873B520940", +"000000067F000080000006C00C00000BC000-000000067F000080000006C00C00000C0000__00000086720CFFF0", +"000000067F000080000006C00C00000BC000-000000067F000080000006C00C00000C0000__000000873B520940", +"000000067F000080000006C00C00000BC102-000000067F000080000006C00C00000C580D__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C00000BFB6E-000000067F000080000006C01400000016BC__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C00C00000C0000-000000067F000080000006C00C00000C4000__00000086720CFFF0", +"000000067F000080000006C00C00000C0000-000000067F000080000006C00C00000C4000__000000873B520940", +"000000067F000080000006C00C00000C4000-000000067F000080000006C00C00000C8000__00000086720CFFF0", +"000000067F000080000006C00C00000C4000-000000067F000080000006C00C00000C8000__000000873B520940", +"000000067F000080000006C00C00000C580D-000000067F000080000006C00C00000CEF73__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C00000C8000-000000067F000080000006C00C00000CC000__00000086720CFFF0", +"000000067F000080000006C00C00000C8000-000000067F000080000006C00C00000CC000__000000873B520940", +"000000067F000080000006C00C00000CC000-000000067F000080000006C00C00000D0000__00000086720CFFF0", +"000000067F000080000006C00C00000CC000-000000067F000080000006C00C00000D0000__000000873B520940", +"000000067F000080000006C00C00000CEF73-000000067F000080000006C00C00000D86D9__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C00000D0000-000000067F000080000006C00C00000D4000__00000086720CFFF0", +"000000067F000080000006C00C00000D0000-000000067F000080000006C00C00000D4000__000000873B520940", +"000000067F000080000006C00C00000D4000-000000067F000080000006C00C00000D8000__00000086720CFFF0", +"000000067F000080000006C00C00000D4000-000000067F000080000006C00C00000D8000__000000873B520940", +"000000067F000080000006C00C00000D8000-000000067F000080000006C00C00000DC000__00000086720CFFF0", +"000000067F000080000006C00C00000D8000-000000067F000080000006C00C00000DC000__000000873B520940", +"000000067F000080000006C00C00000D86D9-000000067F000080000006C00C00000E1E0C__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C00000DC000-000000067F000080000006C00C00000E0000__00000086720CFFF0", +"000000067F000080000006C00C00000DC000-000000067F000080000006C00C00000E0000__000000873B520940", +"000000067F000080000006C00C00000E0000-000000067F000080000006C00C00000E4000__00000086720CFFF0", +"000000067F000080000006C00C00000E0000-000000067F000080000006C00C00000E4000__000000873B520940", +"000000067F000080000006C00C00000E1E0C-000000067F000080000006C00C00000EB572__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C00000E4000-000000067F000080000006C00C00000E8000__00000086720CFFF0", +"000000067F000080000006C00C00000E4000-000000067F000080000006C00C00000E8000__000000873B520940", +"000000067F000080000006C00C00000E8000-000000067F000080000006C00C00000EC000__00000086720CFFF0", +"000000067F000080000006C00C00000E8000-000000067F000080000006C00C00000EC000__000000873B520940", +"000000067F000080000006C00C00000EB572-000000067F000080000006C00C00000F4CD8__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C00000EC000-000000067F000080000006C00C00000F0000__00000086720CFFF0", +"000000067F000080000006C00C00000EC000-000000067F000080000006C00C00000F0000__000000873B520940", +"000000067F000080000006C00C00000F0000-000000067F000080000006C00C00000F4000__00000086720CFFF0", 
+"000000067F000080000006C00C00000F0000-000000067F000080000006C00C00000F4000__000000873B520940", +"000000067F000080000006C00C00000F4000-000000067F000080000006C00C00000F8000__00000086720CFFF0", +"000000067F000080000006C00C00000F4000-000000067F000080000006C00C00000F8000__000000873B520940", +"000000067F000080000006C00C00000F4CD8-000000067F000080000006C00C00000FE40B__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C00000F8000-000000067F000080000006C00C00000FC000__00000086720CFFF0", +"000000067F000080000006C00C00000F8000-000000067F000080000006C00C00000FC000__000000873B520940", +"000000067F000080000006C00C00000FC000-000000067F000080000006C00C0000100000__00000086720CFFF0", +"000000067F000080000006C00C00000FC000-000000067F000080000006C00C0000100000__000000873B520940", +"000000067F000080000006C00C00000FE40B-000000067F000080000006C00C0000107B27__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C0000100000-000000067F000080000006C00C0000104000__00000086720CFFF0", +"000000067F000080000006C00C0000100000-000000067F000080000006C00C0000104000__000000873B520940", +"000000067F000080000006C00C00001015F3-000000067F000080000006C0140000013635__00000086ED29E361-000000873C9A2551", +"000000067F000080000006C00C0000104000-000000067F000080000006C00C0000108000__00000086720CFFF0", +"000000067F000080000006C00C0000104000-000000067F000080000006C00C0000108000__000000873B520940", +"000000067F000080000006C00C0000107B27-000000067F000080000006C00C000011128D__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C0000108000-000000067F000080000006C00C000010C000__00000086720CFFF0", +"000000067F000080000006C00C0000108000-000000067F000080000006C00C000010C000__000000873B520940", +"000000067F000080000006C00C000010C000-000000067F000080000006C00C0000110000__00000086720CFFF0", +"000000067F000080000006C00C000010C000-000000067F000080000006C00C0000110000__000000873B520940", +"000000067F000080000006C00C0000110000-000000067F000080000006C0120100000000__000000873B520940", +"000000067F000080000006C00C0000110000-030000000000000000000000000000000002__00000086720CFFF0", +"000000067F000080000006C00C000011128D-010000000000000001000000030000000017__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C0140000000000-000000067F000080000006C0140000004000__000000873B520940", +"000000067F000080000006C01400000016BC-000000067F000080000006C014000000830F__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C0140000004000-000000067F000080000006C0140000008000__000000873B520940", +"000000067F000080000006C0140000008000-000000067F000080000006C014000000C000__000000873B520940", +"000000067F000080000006C014000000830F-000000067F000080000006C014000000EF5B__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C014000000C000-000000067F000080000006C0140000010000__000000873B520940", +"000000067F000080000006C014000000EF5B-000000067F000080000006C0140000015BA7__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C0140000010000-000000067F000080000006C0140000014000__000000873B520940", +"000000067F000080000006C0140000013636-000000067F000080000006C014000002DB5F__00000086ED29E361-000000873C9A2551", +"000000067F000080000006C0140000014000-000000067F000080000006C0140000018000__000000873B520940", +"000000067F000080000006C0140000015BA7-000000067F000080000006C014000001C7F0__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C0140000018000-000000067F000080000006C014000001C000__000000873B520940", +"000000067F000080000006C014000001C000-000000067F000080000006C0140000020000__000000873B520940", 
+"000000067F000080000006C014000001C7F0-000000067F000080000006C0140000023430__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C0140000020000-000000067F000080000006C0140000024000__000000873B520940", +"000000067F000080000006C0140000023430-000000067F000080000006C014000002A049__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C0140000024000-000000067F000080000006C0140000028000__000000873B520940", +"000000067F000080000006C0140000028000-000000067F000080000006C014000002C000__000000873B520940", +"000000067F000080000006C014000002A049-030000000000000000000000000000000002__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C014000002C000-030000000000000000000000000000000002__000000873B520940", +"000000067F000080000006C014000002DB60-030000000000000000000000000000000002__00000086ED29E361-000000873C9A2551", +"000000067F000080000006E00C0000000000-000000067F000080000006E00C0000004000__000000890CF51FE0", +"000000067F000080000006E00C0000004000-000000067F000080000006E00C0000008000__000000890CF51FE0", +"000000067F000080000006E00C0000008000-000000067F000080000006E00C000000C000__000000890CF51FE0", +"000000067F000080000006E00C00000096C8-000000067F000080000006E00C0000012E0A__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C000000C000-000000067F000080000006E00C0000010000__000000890CF51FE0", +"000000067F000080000006E00C0000010000-000000067F000080000006E00C0000014000__000000890CF51FE0", +"000000067F000080000006E00C0000012E0A-000000067F000080000006E00C000001C570__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C0000014000-000000067F000080000006E00C0000018000__000000890CF51FE0", +"000000067F000080000006E00C0000018000-000000067F000080000006E00C000001C000__000000890CF51FE0", +"000000067F000080000006E00C000001C000-000000067F000080000006E00C0000020000__000000890CF51FE0", +"000000067F000080000006E00C000001C570-000000067F000080000006E00C0000025CD6__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C0000020000-000000067F000080000006E00C0000024000__000000890CF51FE0", +"000000067F000080000006E00C0000024000-000000067F000080000006E00C0000028000__000000890CF51FE0", +"000000067F000080000006E00C0000025CD6-000000067F000080000006E00C000002F40A__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C0000028000-000000067F000080000006E00C000002C000__000000890CF51FE0", +"000000067F000080000006E00C000002C000-000000067F000080000006E00C0000030000__000000890CF51FE0", +"000000067F000080000006E00C000002F40A-000000067F000080000006E00C0000038B1D__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C0000030000-000000067F000080000006E00C0000034000__000000890CF51FE0", +"000000067F000080000006E00C0000034000-000000067F000080000006E00C0000038000__000000890CF51FE0", +"000000067F000080000006E00C0000038000-000000067F000080000006E00C000003C000__000000890CF51FE0", +"000000067F000080000006E00C0000038B1D-000000067F000080000006E00C0000042283__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C000003C000-000000067F000080000006E00C0000040000__000000890CF51FE0", +"000000067F000080000006E00C0000040000-000000067F000080000006E00C0000044000__000000890CF51FE0", +"000000067F000080000006E00C0000042283-000000067F000080000006E00C000004B9E9__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C0000044000-000000067F000080000006E00C0000048000__000000890CF51FE0", +"000000067F000080000006E00C0000048000-000000067F000080000006E00C000004C000__000000890AE2DFC8", +"000000067F000080000006E00C0000048000-000000067F000080000006E00C000004C000__00000089D5AEF6E8", 
+"000000067F000080000006E00C000004B9E9-030000000000000000000000000000000002__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C000004BACB-000000067F000080000006E00C0000055200__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C000004C000-000000067F000080000006E00C0000050000__000000890AE2DFC8", +"000000067F000080000006E00C000004C000-000000067F000080000006E00C0000050000__00000089D5AEF6E8", +"000000067F000080000006E00C0000050000-000000067F000080000006E00C0000054000__000000890AE2DFC8", +"000000067F000080000006E00C0000050000-000000067F000080000006E00C0000054000__00000089D5AEF6E8", +"000000067F000080000006E00C0000054000-000000067F000080000006E00C0000058000__000000890AE2DFC8", +"000000067F000080000006E00C0000054000-000000067F000080000006E00C0000058000__00000089D5AEF6E8", +"000000067F000080000006E00C0000054246-000000067F000080000006E00C00000A83ED__0000008985FD3611-00000089D6B8EE99", +"000000067F000080000006E00C0000055200-000000067F000080000006E00C000005E90B__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C0000058000-000000067F000080000006E00C000005C000__000000890AE2DFC8", +"000000067F000080000006E00C0000058000-000000067F000080000006E00C000005C000__00000089D5AEF6E8", +"000000067F000080000006E00C000005C000-000000067F000080000006E00C0000060000__000000890AE2DFC8", +"000000067F000080000006E00C000005C000-000000067F000080000006E00C0000060000__00000089D5AEF6E8", +"000000067F000080000006E00C000005E90B-000000067F000080000006E00C000006802B__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C0000060000-000000067F000080000006E00C0000064000__000000890AE2DFC8", +"000000067F000080000006E00C0000060000-000000067F000080000006E00C0000064000__00000089D5AEF6E8", +"000000067F000080000006E00C0000064000-000000067F000080000006E00C0000068000__000000890AE2DFC8", +"000000067F000080000006E00C0000064000-000000067F000080000006E00C0000068000__00000089D5AEF6E8", +"000000067F000080000006E00C0000068000-000000067F000080000006E00C000006C000__000000890AE2DFC8", +"000000067F000080000006E00C0000068000-000000067F000080000006E00C000006C000__00000089D5AEF6E8", +"000000067F000080000006E00C000006802B-000000067F000080000006E00C0000071782__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C000006C000-000000067F000080000006E00C0000070000__000000890AE2DFC8", +"000000067F000080000006E00C000006C000-000000067F000080000006E00C0000070000__00000089D5AEF6E8", +"000000067F000080000006E00C0000070000-000000067F000080000006E00C0000074000__000000890AE2DFC8", +"000000067F000080000006E00C0000070000-000000067F000080000006E00C0000074000__00000089D5AEF6E8", +"000000067F000080000006E00C0000071782-000000067F000080000006E00C000007AEE8__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C0000074000-000000067F000080000006E00C0000078000__000000890AE2DFC8", +"000000067F000080000006E00C0000074000-000000067F000080000006E00C0000078000__00000089D5AEF6E8", +"000000067F000080000006E00C0000078000-000000067F000080000006E00C000007C000__000000890AE2DFC8", +"000000067F000080000006E00C0000078000-000000067F000080000006E00C000007C000__00000089D5AEF6E8", +"000000067F000080000006E00C000007AEE8-000000067F000080000006E00C000008460B__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C000007C000-000000067F000080000006E00C0000080000__000000890AE2DFC8", +"000000067F000080000006E00C000007C000-000000067F000080000006E00C0000080000__00000089D5AEF6E8", +"000000067F000080000006E00C0000080000-000000067F000080000006E00C0000084000__000000890AE2DFC8", 
+"000000067F000080000006E00C0000080000-000000067F000080000006E00C0000084000__00000089D5AEF6E8", +"000000067F000080000006E00C0000084000-000000067F000080000006E00C0000088000__000000890AE2DFC8", +"000000067F000080000006E00C0000084000-000000067F000080000006E00C0000088000__00000089D5AEF6E8", +"000000067F000080000006E00C000008460B-000000067F000080000006E00C000008DD71__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C0000088000-000000067F000080000006E00C000008C000__000000890AE2DFC8", +"000000067F000080000006E00C0000088000-000000067F000080000006E00C000008C000__00000089D5AEF6E8", +"000000067F000080000006E00C000008C000-000000067F000080000006E00C0000090000__000000890AE2DFC8", +"000000067F000080000006E00C000008C000-000000067F000080000006E00C0000090000__00000089D5AEF6E8", +"000000067F000080000006E00C000008DD71-000000067F000080000006E00C00000974D7__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C0000090000-000000067F000080000006E00C0000094000__000000890AE2DFC8", +"000000067F000080000006E00C0000090000-000000067F000080000006E00C0000094000__00000089D5AEF6E8", +"000000067F000080000006E00C0000094000-000000067F000080000006E00C0000098000__000000890AE2DFC8", +"000000067F000080000006E00C0000094000-000000067F000080000006E00C0000098000__00000089D5AEF6E8", +"000000067F000080000006E00C00000974D7-000000067F000080000006E00C00000A0C0B__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C0000098000-000000067F000080000006E00C000009C000__000000890AE2DFC8", +"000000067F000080000006E00C0000098000-000000067F000080000006E00C000009C000__00000089D5AEF6E8", +"000000067F000080000006E00C000009C000-000000067F000080000006E00C00000A0000__000000890AE2DFC8", +"000000067F000080000006E00C000009C000-000000067F000080000006E00C00000A0000__00000089D5AEF6E8", +"000000067F000080000006E00C00000A0000-000000067F000080000006E00C00000A4000__000000890AE2DFC8", +"000000067F000080000006E00C00000A0000-000000067F000080000006E00C00000A4000__00000089D5AEF6E8", +"000000067F000080000006E00C00000A0C0B-000000067F000080000006E00C00000AA371__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C00000A4000-000000067F000080000006E00C00000A8000__000000890AE2DFC8", +"000000067F000080000006E00C00000A4000-000000067F000080000006E00C00000A8000__00000089D5AEF6E8", +"000000067F000080000006E00C00000A8000-000000067F000080000006E00C00000AC000__000000890AE2DFC8", +"000000067F000080000006E00C00000A8000-000000067F000080000006E00C00000AC000__00000089D5AEF6E8", +"000000067F000080000006E00C00000A8407-000000067F000080000006E00C00000FD787__0000008985FD3611-00000089D6B8EE99", +"000000067F000080000006E00C00000AA371-000000067F000080000006E00C00000B3AD7__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C00000AC000-000000067F000080000006E00C00000B0000__000000890AE2DFC8", +"000000067F000080000006E00C00000AC000-000000067F000080000006E00C00000B0000__00000089D5AEF6E8", +"000000067F000080000006E00C00000B0000-000000067F000080000006E00C00000B4000__000000890AE2DFC8", +"000000067F000080000006E00C00000B0000-000000067F000080000006E00C00000B4000__00000089D5AEF6E8", +"000000067F000080000006E00C00000B3AD7-000000067F000080000006E00C00000BD20B__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C00000B4000-000000067F000080000006E00C00000B8000__000000890AE2DFC8", +"000000067F000080000006E00C00000B4000-000000067F000080000006E00C00000B8000__00000089D5AEF6E8", +"000000067F000080000006E00C00000B6F42-000000067F000080000006E0140000000EEF__000000890C5B6001-0000008985FD3611", 
+"000000067F000080000006E00C00000B8000-000000067F000080000006E00C00000BC000__000000890AE2DFC8", +"000000067F000080000006E00C00000B8000-000000067F000080000006E00C00000BC000__00000089D5AEF6E8", +"000000067F000080000006E00C00000BC000-000000067F000080000006E00C00000C0000__000000890AE2DFC8", +"000000067F000080000006E00C00000BC000-000000067F000080000006E00C00000C0000__00000089D5AEF6E8", +"000000067F000080000006E00C00000BD20B-000000067F000080000006E00C0100000000__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C00000C0000-000000067F000080000006E00C00000C4000__000000890AE2DFC8", +"000000067F000080000006E00C00000C0000-000000067F000080000006E00C00000C4000__00000089D5AEF6E8", +"000000067F000080000006E00C00000C4000-000000067F000080000006E00C00000C8000__000000890AE2DFC8", +"000000067F000080000006E00C00000C4000-000000067F000080000006E00C00000C8000__00000089D5AEF6E8", +"000000067F000080000006E00C00000C5883-000000067F000080000006E00C00000CEFE9__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C00000C8000-000000067F000080000006E00C00000CC000__000000890AE2DFC8", +"000000067F000080000006E00C00000C8000-000000067F000080000006E00C00000CC000__00000089D5AEF6E8", +"000000067F000080000006E00C00000CC000-000000067F000080000006E00C00000D0000__000000890AE2DFC8", +"000000067F000080000006E00C00000CC000-000000067F000080000006E00C00000D0000__00000089D5AEF6E8", +"000000067F000080000006E00C00000CEFE9-000000067F000080000006E00C00000D872B__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C00000D0000-000000067F000080000006E00C00000D4000__000000890AE2DFC8", +"000000067F000080000006E00C00000D0000-000000067F000080000006E00C00000D4000__00000089D5AEF6E8", +"000000067F000080000006E00C00000D4000-000000067F000080000006E00C00000D8000__000000890AE2DFC8", +"000000067F000080000006E00C00000D4000-000000067F000080000006E00C00000D8000__00000089D5AEF6E8", +"000000067F000080000006E00C00000D8000-000000067F000080000006E00C00000DC000__000000890AE2DFC8", +"000000067F000080000006E00C00000D8000-000000067F000080000006E00C00000DC000__00000089D5AEF6E8", +"000000067F000080000006E00C00000D872B-000000067F000080000006E00C00000E1E91__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C00000DC000-000000067F000080000006E00C00000E0000__000000890AE2DFC8", +"000000067F000080000006E00C00000DC000-000000067F000080000006E00C00000E0000__00000089D5AEF6E8", +"000000067F000080000006E00C00000E0000-000000067F000080000006E00C00000E4000__000000890AE2DFC8", +"000000067F000080000006E00C00000E0000-000000067F000080000006E00C00000E4000__00000089D5AEF6E8", +"000000067F000080000006E00C00000E1E91-000000067F000080000006E00C00000EB5F7__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C00000E4000-000000067F000080000006E00C00000E8000__000000890AE2DFC8", +"000000067F000080000006E00C00000E4000-000000067F000080000006E00C00000E8000__00000089D5AEF6E8", +"000000067F000080000006E00C00000E8000-000000067F000080000006E00C00000EC000__000000890AE2DFC8", +"000000067F000080000006E00C00000E8000-000000067F000080000006E00C00000EC000__00000089D5AEF6E8", +"000000067F000080000006E00C00000EB5F7-000000067F000080000006E00C00000F4D0C__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C00000EC000-000000067F000080000006E00C00000F0000__000000890AE2DFC8", +"000000067F000080000006E00C00000EC000-000000067F000080000006E00C00000F0000__00000089D5AEF6E8", +"000000067F000080000006E00C00000F0000-000000067F000080000006E00C00000F4000__000000890AE2DFC8", 
+"000000067F000080000006E00C00000F0000-000000067F000080000006E00C00000F4000__00000089D5AEF6E8", +"000000067F000080000006E00C00000F4000-000000067F000080000006E00C00000F8000__000000890AE2DFC8", +"000000067F000080000006E00C00000F4000-000000067F000080000006E00C00000F8000__00000089D5AEF6E8", +"000000067F000080000006E00C00000F4D0C-000000067F000080000006E00C00000FE472__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C00000F8000-000000067F000080000006E00C00000FC000__000000890AE2DFC8", +"000000067F000080000006E00C00000F8000-000000067F000080000006E00C00000FC000__00000089D5AEF6E8", +"000000067F000080000006E00C00000FC000-000000067F000080000006E00C0000100000__000000890AE2DFC8", +"000000067F000080000006E00C00000FC000-000000067F000080000006E00C0000100000__00000089D5AEF6E8", +"000000067F000080000006E00C00000FD78D-000000067F000080000006E0140000011DB5__0000008985FD3611-00000089D6B8EE99", +"000000067F000080000006E00C00000FE472-000000067F000080000006E00C0000107B8E__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C0000100000-000000067F000080000006E00C0000104000__000000890AE2DFC8", +"000000067F000080000006E00C0000100000-000000067F000080000006E00C0000104000__00000089D5AEF6E8", +"000000067F000080000006E00C0000104000-000000067F000080000006E00C0000108000__000000890AE2DFC8", +"000000067F000080000006E00C0000104000-000000067F000080000006E00C0000108000__00000089D5AEF6E8", +"000000067F000080000006E00C0000107B8E-000000067F000080000006E00C00001112F4__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C0000108000-000000067F000080000006E00C000010C000__000000890AE2DFC8", +"000000067F000080000006E00C0000108000-000000067F000080000006E00C000010C000__00000089D5AEF6E8", +"000000067F000080000006E00C000010C000-000000067F000080000006E00C0000110000__000000890AE2DFC8", +"000000067F000080000006E00C000010C000-000000067F000080000006E00C0000110000__00000089D5AEF6E8", +"000000067F000080000006E00C0000110000-000000067F000080000006E0120100000000__00000089D5AEF6E8", +"000000067F000080000006E00C0000110000-030000000000000000000000000000000002__000000890AE2DFC8", +"000000067F000080000006E00C00001112F4-01000000000000000100000003000000001A__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E0140000000000-000000067F000080000006E0140000004000__00000089D5AEF6E8", +"000000067F000080000006E0140000000EEF-000000067F000080000006E0140000007C4F__000000890C5B6001-0000008985FD3611", +"000000067F000080000006E0140000004000-000000067F000080000006E0140000008000__00000089D5AEF6E8", +"000000067F000080000006E0140000007C4F-000000067F000080000006E014000000E97E__000000890C5B6001-0000008985FD3611", +"000000067F000080000006E0140000008000-000000067F000080000006E014000000C000__00000089D5AEF6E8", +"000000067F000080000006E014000000C000-000000067F000080000006E0140000010000__00000089D5AEF6E8", +"000000067F000080000006E014000000E97E-000000067F000080000006E01400000156DC__000000890C5B6001-0000008985FD3611", +"000000067F000080000006E0140000010000-000000067F000080000006E0140000014000__00000089D5AEF6E8", +"000000067F000080000006E0140000011DB5-000000067F000080000006E014000002B9CE__0000008985FD3611-00000089D6B8EE99", +"000000067F000080000006E0140000014000-000000067F000080000006E0140000018000__00000089D5AEF6E8", +"000000067F000080000006E01400000156DC-000000067F000080000006E014000001C468__000000890C5B6001-0000008985FD3611", +"000000067F000080000006E0140000018000-000000067F000080000006E014000001C000__00000089D5AEF6E8", +"000000067F000080000006E014000001C000-000000067F000080000006E0140000020000__00000089D5AEF6E8", 
+"000000067F000080000006E014000001C468-000000067F000080000006E01400000231D5__000000890C5B6001-0000008985FD3611", +"000000067F000080000006E0140000020000-000000067F000080000006E0140000024000__00000089D5AEF6E8", +"000000067F000080000006E01400000231D5-000000067F000080000006E0140000029F96__000000890C5B6001-0000008985FD3611", +"000000067F000080000006E0140000024000-000000067F000080000006E0140000028000__00000089D5AEF6E8", +"000000067F000080000006E0140000028000-000000067F000080000006E014000002C000__00000089D5AEF6E8", +"000000067F000080000006E0140000029F96-030000000000000000000000000000000002__000000890C5B6001-0000008985FD3611", +"000000067F000080000006E014000002B9D0-030000000000000000000000000000000002__0000008985FD3611-00000089D6B8EE99", +"000000067F000080000006E014000002C000-030000000000000000000000000000000002__00000089D5AEF6E8", +"000000067F000080000007000C0000000000-000000067F000080000007000C0000004000__0000008BA730BFE8", +"000000067F000080000007000C0000004000-000000067F000080000007000C0000008000__0000008BA730BFE8", +"000000067F000080000007000C0000008000-000000067F000080000007000C000000C000__0000008BA730BFE8", +"000000067F000080000007000C000000955C-000000067F000080000007000C0000012CC2__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C000000C000-000000067F000080000007000C0000010000__0000008BA730BFE8", +"000000067F000080000007000C0000010000-000000067F000080000007000C0000014000__0000008BA730BFE8", +"000000067F000080000007000C0000012CC2-000000067F000080000007000C000001C40A__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C0000014000-000000067F000080000007000C0000018000__0000008BA730BFE8", +"000000067F000080000007000C0000018000-000000067F000080000007000C000001C000__0000008BA730BFE8", +"000000067F000080000007000C000001C000-000000067F000080000007000C0000020000__0000008BA730BFE8", +"000000067F000080000007000C000001C40A-000000067F000080000007000C0000025B39__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C0000020000-000000067F000080000007000C0000024000__0000008BA730BFE8", +"000000067F000080000007000C0000024000-000000067F000080000007000C0000028000__0000008BA730BFE8", +"000000067F000080000007000C0000025B39-000000067F000080000007000C000002F29F__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C0000028000-000000067F000080000007000C000002C000__0000008BA730BFE8", +"000000067F000080000007000C000002C000-000000067F000080000007000C0000030000__0000008BA730BFE8", +"000000067F000080000007000C000002F29F-000000067F000080000007000C00000389B3__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C0000030000-000000067F000080000007000C0000034000__0000008BA730BFE8", +"000000067F000080000007000C0000034000-000000067F000080000007000C0000038000__0000008BA730BFE8", +"000000067F000080000007000C0000038000-000000067F000080000007000C000003C000__0000008BA730BFE8", +"000000067F000080000007000C00000389B3-000000067F000080000007000C0000042119__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C000003C000-000000067F000080000007000C0000040000__0000008BA730BFE8", +"000000067F000080000007000C0000040000-000000067F000080000007000C0000044000__0000008BA730BFE8", +"000000067F000080000007000C0000042119-000000067F000080000007000C000004B87F__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C0000044000-000000067F000080000007000C0000048000__0000008BA730BFE8", +"000000067F000080000007000C0000048000-000000067F000080000007000C000004C000__0000008B9669EDB0", +"000000067F000080000007000C0000048000-000000067F000080000007000C000004C000__0000008C71903720", 
+"000000067F000080000007000C000004B87F-030000000000000000000000000000000002__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C000004BAD3-000000067F000080000007000C0000055207__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C000004C000-000000067F000080000007000C0000050000__0000008B9669EDB0", +"000000067F000080000007000C000004C000-000000067F000080000007000C0000050000__0000008C71903720", +"000000067F000080000007000C0000050000-000000067F000080000007000C0000054000__0000008B9669EDB0", +"000000067F000080000007000C0000050000-000000067F000080000007000C0000054000__0000008C71903720", +"000000067F000080000007000C0000053C23-000000067F000080000007000C00000A6F76__0000008C2045B721-0000008C72843D41", +"000000067F000080000007000C0000054000-000000067F000080000007000C0000058000__0000008B9669EDB0", +"000000067F000080000007000C0000054000-000000067F000080000007000C0000058000__0000008C71903720", +"000000067F000080000007000C0000055207-000000067F000080000007000C000005E912__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C0000058000-000000067F000080000007000C000005C000__0000008B9669EDB0", +"000000067F000080000007000C0000058000-000000067F000080000007000C000005C000__0000008C71903720", +"000000067F000080000007000C000005C000-000000067F000080000007000C0000060000__0000008B9669EDB0", +"000000067F000080000007000C000005C000-000000067F000080000007000C0000060000__0000008C71903720", +"000000067F000080000007000C000005E912-000000067F000080000007000C000006802C__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C0000060000-000000067F000080000007000C0000064000__0000008B9669EDB0", +"000000067F000080000007000C0000060000-000000067F000080000007000C0000064000__0000008C71903720", +"000000067F000080000007000C0000064000-000000067F000080000007000C0000068000__0000008B9669EDB0", +"000000067F000080000007000C0000064000-000000067F000080000007000C0000068000__0000008C71903720", +"000000067F000080000007000C0000068000-000000067F000080000007000C000006C000__0000008B9669EDB0", +"000000067F000080000007000C0000068000-000000067F000080000007000C000006C000__0000008C71903720", +"000000067F000080000007000C000006802C-000000067F000080000007000C0000071783__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C000006C000-000000067F000080000007000C0000070000__0000008B9669EDB0", +"000000067F000080000007000C000006C000-000000067F000080000007000C0000070000__0000008C71903720", +"000000067F000080000007000C0000070000-000000067F000080000007000C0000074000__0000008B9669EDB0", +"000000067F000080000007000C0000070000-000000067F000080000007000C0000074000__0000008C71903720", +"000000067F000080000007000C0000071783-000000067F000080000007000C000007AEE9__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C0000074000-000000067F000080000007000C0000078000__0000008B9669EDB0", +"000000067F000080000007000C0000074000-000000067F000080000007000C0000078000__0000008C71903720", +"000000067F000080000007000C0000078000-000000067F000080000007000C000007C000__0000008B9669EDB0", +"000000067F000080000007000C0000078000-000000067F000080000007000C000007C000__0000008C71903720", +"000000067F000080000007000C000007AEE9-000000067F000080000007000C000008460B__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C000007C000-000000067F000080000007000C0000080000__0000008B9669EDB0", +"000000067F000080000007000C000007C000-000000067F000080000007000C0000080000__0000008C71903720", +"000000067F000080000007000C0000080000-000000067F000080000007000C0000084000__0000008B9669EDB0", 
+"000000067F000080000007000C0000080000-000000067F000080000007000C0000084000__0000008C71903720", +"000000067F000080000007000C0000084000-000000067F000080000007000C0000088000__0000008B9669EDB0", +"000000067F000080000007000C0000084000-000000067F000080000007000C0000088000__0000008C71903720", +"000000067F000080000007000C000008460B-000000067F000080000007000C000008DD71__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C0000088000-000000067F000080000007000C000008C000__0000008B9669EDB0", +"000000067F000080000007000C0000088000-000000067F000080000007000C000008C000__0000008C71903720", +"000000067F000080000007000C000008C000-000000067F000080000007000C0000090000__0000008B9669EDB0", +"000000067F000080000007000C000008C000-000000067F000080000007000C0000090000__0000008C71903720", +"000000067F000080000007000C000008DD71-000000067F000080000007000C00000974D7__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C0000090000-000000067F000080000007000C0000094000__0000008B9669EDB0", +"000000067F000080000007000C0000090000-000000067F000080000007000C0000094000__0000008C71903720", +"000000067F000080000007000C0000094000-000000067F000080000007000C0000098000__0000008B9669EDB0", +"000000067F000080000007000C0000094000-000000067F000080000007000C0000098000__0000008C71903720", +"000000067F000080000007000C00000974D7-000000067F000080000007000C00000A0C0B__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C0000098000-000000067F000080000007000C000009C000__0000008B9669EDB0", +"000000067F000080000007000C0000098000-000000067F000080000007000C000009C000__0000008C71903720", +"000000067F000080000007000C000009C000-000000067F000080000007000C00000A0000__0000008B9669EDB0", +"000000067F000080000007000C000009C000-000000067F000080000007000C00000A0000__0000008C71903720", +"000000067F000080000007000C00000A0000-000000067F000080000007000C00000A4000__0000008B9669EDB0", +"000000067F000080000007000C00000A0000-000000067F000080000007000C00000A4000__0000008C71903720", +"000000067F000080000007000C00000A0C0B-000000067F000080000007000C00000AA371__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C00000A4000-000000067F000080000007000C00000A8000__0000008B9669EDB0", +"000000067F000080000007000C00000A4000-000000067F000080000007000C00000A8000__0000008C71903720", +"000000067F000080000007000C00000A6F77-000000067F000080000007000C00000FA170__0000008C2045B721-0000008C72843D41", +"000000067F000080000007000C00000A8000-000000067F000080000007000C00000AC000__0000008B9669EDB0", +"000000067F000080000007000C00000A8000-000000067F000080000007000C00000AC000__0000008C71903720", +"000000067F000080000007000C00000AA371-000000067F000080000007000C0100000000__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C00000AC000-000000067F000080000007000C00000B0000__0000008B9669EDB0", +"000000067F000080000007000C00000AC000-000000067F000080000007000C00000B0000__0000008C71903720", +"000000067F000080000007000C00000B0000-000000067F000080000007000C00000B4000__0000008B9669EDB0", +"000000067F000080000007000C00000B0000-000000067F000080000007000C00000B4000__0000008C71903720", +"000000067F000080000007000C00000B2B06-000000067F000080000007000C00000BC211__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000B4000-000000067F000080000007000C00000B8000__0000008B9669EDB0", +"000000067F000080000007000C00000B4000-000000067F000080000007000C00000B8000__0000008C71903720", +"000000067F000080000007000C00000B8000-000000067F000080000007000C00000BC000__0000008B9669EDB0", 
+"000000067F000080000007000C00000B8000-000000067F000080000007000C00000BC000__0000008C71903720", +"000000067F000080000007000C00000BC000-000000067F000080000007000C00000C0000__0000008B9669EDB0", +"000000067F000080000007000C00000BC000-000000067F000080000007000C00000C0000__0000008C71903720", +"000000067F000080000007000C00000BC211-000000067F000080000007000C00000C5941__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000BF157-000000067F000080000007001400000016B2__0000008BA6803FC9-0000008C2045B721", +"000000067F000080000007000C00000C0000-000000067F000080000007000C00000C4000__0000008B9669EDB0", +"000000067F000080000007000C00000C0000-000000067F000080000007000C00000C4000__0000008C71903720", +"000000067F000080000007000C00000C4000-000000067F000080000007000C00000C8000__0000008B9669EDB0", +"000000067F000080000007000C00000C4000-000000067F000080000007000C00000C8000__0000008C71903720", +"000000067F000080000007000C00000C5941-000000067F000080000007000C00000CF0A7__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000C8000-000000067F000080000007000C00000CC000__0000008B9669EDB0", +"000000067F000080000007000C00000C8000-000000067F000080000007000C00000CC000__0000008C71903720", +"000000067F000080000007000C00000CC000-000000067F000080000007000C00000D0000__0000008B9669EDB0", +"000000067F000080000007000C00000CC000-000000067F000080000007000C00000D0000__0000008C71903720", +"000000067F000080000007000C00000CF0A7-000000067F000080000007000C00000D87BC__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000D0000-000000067F000080000007000C00000D4000__0000008B9669EDB0", +"000000067F000080000007000C00000D0000-000000067F000080000007000C00000D4000__0000008C71903720", +"000000067F000080000007000C00000D4000-000000067F000080000007000C00000D8000__0000008B9669EDB0", +"000000067F000080000007000C00000D4000-000000067F000080000007000C00000D8000__0000008C71903720", +"000000067F000080000007000C00000D8000-000000067F000080000007000C00000DC000__0000008B9669EDB0", +"000000067F000080000007000C00000D8000-000000067F000080000007000C00000DC000__0000008C71903720", +"000000067F000080000007000C00000D87BC-000000067F000080000007000C00000E1F0A__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000DC000-000000067F000080000007000C00000E0000__0000008B9669EDB0", +"000000067F000080000007000C00000DC000-000000067F000080000007000C00000E0000__0000008C71903720", +"000000067F000080000007000C00000E0000-000000067F000080000007000C00000E4000__0000008B9669EDB0", +"000000067F000080000007000C00000E0000-000000067F000080000007000C00000E4000__0000008C71903720", +"000000067F000080000007000C00000E1F0A-000000067F000080000007000C00000EB670__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000E4000-000000067F000080000007000C00000E8000__0000008B9669EDB0", +"000000067F000080000007000C00000E4000-000000067F000080000007000C00000E8000__0000008C71903720", +"000000067F000080000007000C00000E8000-000000067F000080000007000C00000EC000__0000008B9669EDB0", +"000000067F000080000007000C00000E8000-000000067F000080000007000C00000EC000__0000008C71903720", +"000000067F000080000007000C00000EB670-000000067F000080000007000C00000F4DA7__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000EC000-000000067F000080000007000C00000F0000__0000008B9669EDB0", +"000000067F000080000007000C00000EC000-000000067F000080000007000C00000F0000__0000008C71903720", +"000000067F000080000007000C00000F0000-000000067F000080000007000C00000F4000__0000008B9669EDB0", 
+"000000067F000080000007000C00000F0000-000000067F000080000007000C00000F4000__0000008C71903720", +"000000067F000080000007000C00000F4000-000000067F000080000007000C00000F8000__0000008B9669EDB0", +"000000067F000080000007000C00000F4000-000000067F000080000007000C00000F8000__0000008C71903720", +"000000067F000080000007000C00000F4DA7-000000067F000080000007000C00000FE509__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000F8000-000000067F000080000007000C00000FC000__0000008B9669EDB0", +"000000067F000080000007000C00000F8000-000000067F000080000007000C00000FC000__0000008C71903720", +"000000067F000080000007000C00000FA175-000000067F00008000000700140000010412__0000008C2045B721-0000008C72843D41", +"000000067F000080000007000C00000FC000-000000067F000080000007000C0000100000__0000008B9669EDB0", +"000000067F000080000007000C00000FC000-000000067F000080000007000C0000100000__0000008C71903720", +"000000067F000080000007000C00000FE509-000000067F000080000007000C0000107C2B__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C0000100000-000000067F000080000007000C0000104000__0000008B9669EDB0", +"000000067F000080000007000C0000100000-000000067F000080000007000C0000104000__0000008C71903720", +"000000067F000080000007000C0000104000-000000067F000080000007000C0000108000__0000008B9669EDB0", +"000000067F000080000007000C0000104000-000000067F000080000007000C0000108000__0000008C71903720", +"000000067F000080000007000C0000107C2B-000000067F000080000007000C0000111385__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C0000108000-000000067F000080000007000C000010C000__0000008C71903720", +"000000067F000080000007000C0000108000-030000000000000000000000000000000002__0000008B9669EDB0", +"000000067F000080000007000C000010C000-000000067F000080000007000C0000110000__0000008C71903720", +"000000067F000080000007000C0000110000-000000067F00008000000700120100000000__0000008C71903720", +"000000067F000080000007000C0000111385-01000000000000000100000003000000001E__0000008AF67FEC19-0000008BA6803FC9", +"000000067F00008000000700140000000000-000000067F00008000000700140000004000__0000008C71903720", +"000000067F000080000007001400000016B2-000000067F000080000007001400000082A6__0000008BA6803FC9-0000008C2045B721", +"000000067F00008000000700140000004000-000000067F00008000000700140000008000__0000008C71903720", +"000000067F00008000000700140000008000-000000067F0000800000070014000000C000__0000008C71903720", +"000000067F000080000007001400000082A6-000000067F0000800000070014000000EED0__0000008BA6803FC9-0000008C2045B721", +"000000067F0000800000070014000000C000-000000067F00008000000700140000010000__0000008C71903720", +"000000067F0000800000070014000000EED0-000000067F00008000000700140000015ADC__0000008BA6803FC9-0000008C2045B721", +"000000067F00008000000700140000010000-000000067F00008000000700140000014000__0000008C71903720", +"000000067F0000800000070014000001041E-000000067F000080000007001400000294B8__0000008C2045B721-0000008C72843D41", +"000000067F00008000000700140000014000-000000067F00008000000700140000018000__0000008C71903720", +"000000067F00008000000700140000015ADC-000000067F0000800000070014000001C6D6__0000008BA6803FC9-0000008C2045B721", +"000000067F00008000000700140000018000-000000067F0000800000070014000001C000__0000008C71903720", +"000000067F0000800000070014000001C000-000000067F00008000000700140000020000__0000008C71903720", +"000000067F0000800000070014000001C6D6-000000067F000080000007001400000232FD__0000008BA6803FC9-0000008C2045B721", +"000000067F00008000000700140000020000-000000067F00008000000700140000024000__0000008C71903720", 
+"000000067F000080000007001400000232FD-000000067F00008000000700140000029F07__0000008BA6803FC9-0000008C2045B721", +"000000067F00008000000700140000024000-000000067F00008000000700140000028000__0000008C71903720", +"000000067F00008000000700140000028000-000000067F0000800000070014000002C000__0000008C71903720", +"000000067F000080000007001400000294BA-030000000000000000000000000000000002__0000008C2045B721-0000008C72843D41", +"000000067F00008000000700140000029F07-030000000000000000000000000000000002__0000008BA6803FC9-0000008C2045B721", +"000000067F0000800000070014000002C000-030000000000000000000000000000000002__0000008C71903720", +"000000067F000080000007200C0000000000-000000067F000080000007200C0000004000__0000008E43487FF0", +"000000067F000080000007200C0000004000-000000067F000080000007200C0000008000__0000008E43487FF0", +"000000067F000080000007200C0000008000-000000067F000080000007200C000000C000__0000008E43487FF0", +"000000067F000080000007200C000000933D-000000067F000080000007200C0000012AA3__0000008C72843D41-0000008CF2BFFC89", +"000000067F000080000007200C000000C000-000000067F000080000007200C0000010000__0000008E43487FF0", +"000000067F000080000007200C0000010000-000000067F000080000007200C0000014000__0000008E43487FF0", +"000000067F000080000007200C0000012AA3-000000067F000080000007200C000001C209__0000008C72843D41-0000008CF2BFFC89", +"000000067F000080000007200C0000014000-000000067F000080000007200C0000018000__0000008E43487FF0", +"000000067F000080000007200C0000018000-000000067F000080000007200C000001C000__0000008E43487FF0", +"000000067F000080000007200C000001C000-000000067F000080000007200C0000020000__0000008E43487FF0", +"000000067F000080000007200C000001C209-000000067F000080000007200C0000025939__0000008C72843D41-0000008CF2BFFC89", +"000000067F000080000007200C0000020000-000000067F000080000007200C0000024000__0000008E43487FF0", +"000000067F000080000007200C0000024000-000000067F000080000007200C0000028000__0000008E43487FF0", +"000000067F000080000007200C0000025939-000000067F000080000007200C000002F09F__0000008C72843D41-0000008CF2BFFC89", +"000000067F000080000007200C0000028000-000000067F000080000007200C000002C000__0000008E43487FF0", +"000000067F000080000007200C000002C000-000000067F000080000007200C0000030000__0000008E43487FF0", +"000000067F000080000007200C000002F09F-000000067F000080000007200C00000387B4__0000008C72843D41-0000008CF2BFFC89", +"000000067F000080000007200C0000030000-000000067F000080000007200C0000034000__0000008E43487FF0", +"000000067F000080000007200C0000034000-000000067F000080000007200C0000038000__0000008E43487FF0", +"000000067F000080000007200C0000038000-000000067F000080000007200C000003C000__0000008E43487FF0", +"000000067F000080000007200C00000387B4-000000067F000080000007200C0000041F1A__0000008C72843D41-0000008CF2BFFC89", +"000000067F000080000007200C000003C000-000000067F000080000007200C0000040000__0000008E43487FF0", +"000000067F000080000007200C0000040000-000000067F000080000007200C0000044000__0000008E43487FF0", +"000000067F000080000007200C0000041F1A-000000067F000080000007200C000004B680__0000008C72843D41-0000008CF2BFFC89", +"000000067F000080000007200C0000044000-000000067F000080000007200C0000048000__0000008E43487FF0", +"000000067F000080000007200C0000048000-000000067F000080000007200C000004C000__0000008E3CDF59C0", +"000000067F000080000007200C0000048000-000000067F000080000007200C000004C000__0000008F10EA21C8", +"000000067F000080000007200C000004B680-030000000000000000000000000000000002__0000008C72843D41-0000008CF2BFFC89", 
+"000000067F000080000007200C000004BACE-000000067F000080000007200C0000055202__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C000004C000-000000067F000080000007200C0000050000__0000008E3CDF59C0", +"000000067F000080000007200C000004C000-000000067F000080000007200C0000050000__0000008F10EA21C8", +"000000067F000080000007200C0000050000-000000067F000080000007200C0000054000__0000008E3CDF59C0", +"000000067F000080000007200C0000050000-000000067F000080000007200C0000054000__0000008F10EA21C8", +"000000067F000080000007200C000005131D-000000067F000080000007200C00000A2138__0000008EBC4827C1-0000008F10E3E189", +"000000067F000080000007200C0000054000-000000067F000080000007200C0000058000__0000008E3CDF59C0", +"000000067F000080000007200C0000054000-000000067F000080000007200C0000058000__0000008F10EA21C8", +"000000067F000080000007200C0000055202-000000067F000080000007200C000005E90D__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C0000058000-000000067F000080000007200C000005C000__0000008E3CDF59C0", +"000000067F000080000007200C0000058000-000000067F000080000007200C000005C000__0000008F10EA21C8", +"000000067F000080000007200C000005C000-000000067F000080000007200C0000060000__0000008E3CDF59C0", +"000000067F000080000007200C000005C000-000000067F000080000007200C0000060000__0000008F10EA21C8", +"000000067F000080000007200C000005E90D-000000067F000080000007200C000006802B__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C0000060000-000000067F000080000007200C0000064000__0000008E3CDF59C0", +"000000067F000080000007200C0000060000-000000067F000080000007200C0000064000__0000008F10EA21C8", +"000000067F000080000007200C0000064000-000000067F000080000007200C0000068000__0000008E3CDF59C0", +"000000067F000080000007200C0000064000-000000067F000080000007200C0000068000__0000008F10EA21C8", +"000000067F000080000007200C0000068000-000000067F000080000007200C000006C000__0000008E3CDF59C0", +"000000067F000080000007200C0000068000-000000067F000080000007200C000006C000__0000008F10EA21C8", +"000000067F000080000007200C000006802B-000000067F000080000007200C0000071782__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C000006C000-000000067F000080000007200C0000070000__0000008E3CDF59C0", +"000000067F000080000007200C000006C000-000000067F000080000007200C0000070000__0000008F10EA21C8", +"000000067F000080000007200C0000070000-000000067F000080000007200C0000074000__0000008E3CDF59C0", +"000000067F000080000007200C0000070000-000000067F000080000007200C0000074000__0000008F10EA21C8", +"000000067F000080000007200C0000071782-000000067F000080000007200C000007AEE8__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C0000074000-000000067F000080000007200C0000078000__0000008E3CDF59C0", +"000000067F000080000007200C0000074000-000000067F000080000007200C0000078000__0000008F10EA21C8", +"000000067F000080000007200C0000078000-000000067F000080000007200C000007C000__0000008E3CDF59C0", +"000000067F000080000007200C0000078000-000000067F000080000007200C000007C000__0000008F10EA21C8", +"000000067F000080000007200C000007AEE8-000000067F000080000007200C000008460B__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C000007C000-000000067F000080000007200C0000080000__0000008E3CDF59C0", +"000000067F000080000007200C000007C000-000000067F000080000007200C0000080000__0000008F10EA21C8", +"000000067F000080000007200C0000080000-000000067F000080000007200C0000084000__0000008E3CDF59C0", +"000000067F000080000007200C0000080000-000000067F000080000007200C0000084000__0000008F10EA21C8", 
+"000000067F000080000007200C0000084000-000000067F000080000007200C0000088000__0000008E3CDF59C0", +"000000067F000080000007200C0000084000-000000067F000080000007200C0000088000__0000008F10EA21C8", +"000000067F000080000007200C000008460B-000000067F000080000007200C000008DD71__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C0000088000-000000067F000080000007200C000008C000__0000008E3CDF59C0", +"000000067F000080000007200C0000088000-000000067F000080000007200C000008C000__0000008F10EA21C8", +"000000067F000080000007200C000008C000-000000067F000080000007200C0000090000__0000008E3CDF59C0", +"000000067F000080000007200C000008C000-000000067F000080000007200C0000090000__0000008F10EA21C8", +"000000067F000080000007200C000008DD71-000000067F000080000007200C00000974D7__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C0000090000-000000067F000080000007200C0000094000__0000008E3CDF59C0", +"000000067F000080000007200C0000090000-000000067F000080000007200C0000094000__0000008F10EA21C8", +"000000067F000080000007200C0000094000-000000067F000080000007200C0000098000__0000008E3CDF59C0", +"000000067F000080000007200C0000094000-000000067F000080000007200C0000098000__0000008F10EA21C8", +"000000067F000080000007200C00000974D7-000000067F000080000007200C00000A0C0B__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C0000098000-000000067F000080000007200C000009C000__0000008E3CDF59C0", +"000000067F000080000007200C0000098000-000000067F000080000007200C000009C000__0000008F10EA21C8", +"000000067F000080000007200C000009C000-000000067F000080000007200C00000A0000__0000008E3CDF59C0", +"000000067F000080000007200C000009C000-000000067F000080000007200C00000A0000__0000008F10EA21C8", +"000000067F000080000007200C00000A0000-000000067F000080000007200C00000A4000__0000008E3CDF59C0", +"000000067F000080000007200C00000A0000-000000067F000080000007200C00000A4000__0000008F10EA21C8", +"000000067F000080000007200C00000A0C0B-000000067F000080000007200C00000AA371__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C00000A2138-000000067F000080000007200C00000F342E__0000008EBC4827C1-0000008F10E3E189", +"000000067F000080000007200C00000A4000-000000067F000080000007200C00000A8000__0000008E3CDF59C0", +"000000067F000080000007200C00000A4000-000000067F000080000007200C00000A8000__0000008F10EA21C8", +"000000067F000080000007200C00000A8000-000000067F000080000007200C00000AC000__0000008E3CDF59C0", +"000000067F000080000007200C00000A8000-000000067F000080000007200C00000AC000__0000008F10EA21C8", +"000000067F000080000007200C00000AA371-000000067F000080000007200C00000B3AD7__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C00000AC000-000000067F000080000007200C00000B0000__0000008E3CDF59C0", +"000000067F000080000007200C00000AC000-000000067F000080000007200C00000B0000__0000008F10EA21C8", +"000000067F000080000007200C00000B0000-000000067F000080000007200C00000B4000__0000008E3CDF59C0", +"000000067F000080000007200C00000B0000-000000067F000080000007200C00000B4000__0000008F10EA21C8", +"000000067F000080000007200C00000B3AD7-000000067F000080000007200C00000BD20B__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C00000B4000-000000067F000080000007200C00000B8000__0000008E3CDF59C0", +"000000067F000080000007200C00000B4000-000000067F000080000007200C00000B8000__0000008F10EA21C8", +"000000067F000080000007200C00000B8000-000000067F000080000007200C00000BC000__0000008E3CDF59C0", +"000000067F000080000007200C00000B8000-000000067F000080000007200C00000BC000__0000008F10EA21C8", 
+"000000067F000080000007200C00000BA086-000000067F00008000000720140000001101__0000008E42A19FD1-0000008EBC4827C1", +"000000067F000080000007200C00000BC000-000000067F000080000007200C00000C0000__0000008E3CDF59C0", +"000000067F000080000007200C00000BC000-000000067F000080000007200C00000C0000__0000008F10EA21C8", +"000000067F000080000007200C00000BD20B-000000067F000080000007200C0100000000__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C00000C0000-000000067F000080000007200C00000C4000__0000008E3CDF59C0", +"000000067F000080000007200C00000C0000-000000067F000080000007200C00000C4000__0000008F10EA21C8", +"000000067F000080000007200C00000C4000-000000067F000080000007200C00000C8000__0000008E3CDF59C0", +"000000067F000080000007200C00000C4000-000000067F000080000007200C00000C8000__0000008F10EA21C8", +"000000067F000080000007200C00000C58B0-000000067F000080000007200C00000CF00A__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C00000C8000-000000067F000080000007200C00000CC000__0000008E3CDF59C0", +"000000067F000080000007200C00000C8000-000000067F000080000007200C00000CC000__0000008F10EA21C8", +"000000067F000080000007200C00000CC000-000000067F000080000007200C00000D0000__0000008E3CDF59C0", +"000000067F000080000007200C00000CC000-000000067F000080000007200C00000D0000__0000008F10EA21C8", +"000000067F000080000007200C00000CF00A-000000067F000080000007200C00000D871F__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C00000D0000-000000067F000080000007200C00000D4000__0000008E3CDF59C0", +"000000067F000080000007200C00000D0000-000000067F000080000007200C00000D4000__0000008F10EA21C8", +"000000067F000080000007200C00000D4000-000000067F000080000007200C00000D8000__0000008E3CDF59C0", +"000000067F000080000007200C00000D4000-000000067F000080000007200C00000D8000__0000008F10EA21C8", +"000000067F000080000007200C00000D8000-000000067F000080000007200C00000DC000__0000008E3CDF59C0", +"000000067F000080000007200C00000D8000-000000067F000080000007200C00000DC000__0000008F10EA21C8", +"000000067F000080000007200C00000D871F-000000067F000080000007200C00000E1E85__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C00000DC000-000000067F000080000007200C00000E0000__0000008E3CDF59C0", +"000000067F000080000007200C00000DC000-000000067F000080000007200C00000E0000__0000008F10EA21C8", +"000000067F000080000007200C00000E0000-000000067F000080000007200C00000E4000__0000008E3CDF59C0", +"000000067F000080000007200C00000E0000-000000067F000080000007200C00000E4000__0000008F10EA21C8", +"000000067F000080000007200C00000E1E85-000000067F000080000007200C00000EB5EB__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C00000E4000-000000067F000080000007200C00000E8000__0000008E3CDF59C0", +"000000067F000080000007200C00000E4000-000000067F000080000007200C00000E8000__0000008F10EA21C8", +"000000067F000080000007200C00000E8000-000000067F000080000007200C00000EC000__0000008E3CDF59C0", +"000000067F000080000007200C00000E8000-000000067F000080000007200C00000EC000__0000008F10EA21C8", +"000000067F000080000007200C00000EB5EB-000000067F000080000007200C00000F4D0C__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C00000EC000-000000067F000080000007200C00000F0000__0000008E3CDF59C0", +"000000067F000080000007200C00000EC000-000000067F000080000007200C00000F0000__0000008F10EA21C8", +"000000067F000080000007200C00000F0000-000000067F000080000007200C00000F4000__0000008E3CDF59C0", +"000000067F000080000007200C00000F0000-000000067F000080000007200C00000F4000__0000008F10EA21C8", 
+"000000067F000080000007200C00000F342F-000000067F0000800000072014000000D54C__0000008EBC4827C1-0000008F10E3E189", +"000000067F000080000007200C00000F4000-000000067F000080000007200C00000F8000__0000008E3CDF59C0", +"000000067F000080000007200C00000F4000-000000067F000080000007200C00000F8000__0000008F10EA21C8", +"000000067F000080000007200C00000F4D0C-000000067F000080000007200C00000FE472__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C00000F8000-000000067F000080000007200C00000FC000__0000008E3CDF59C0", +"000000067F000080000007200C00000F8000-000000067F000080000007200C00000FC000__0000008F10EA21C8", +"000000067F000080000007200C00000FC000-000000067F000080000007200C0000100000__0000008E3CDF59C0", +"000000067F000080000007200C00000FC000-000000067F000080000007200C0000100000__0000008F10EA21C8", +"000000067F000080000007200C00000FE472-000000067F000080000007200C0000107B8E__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C0000100000-000000067F000080000007200C0000104000__0000008E3CDF59C0", +"000000067F000080000007200C0000100000-000000067F000080000007200C0000104000__0000008F10EA21C8", +"000000067F000080000007200C0000104000-000000067F000080000007200C0000108000__0000008E3CDF59C0", +"000000067F000080000007200C0000104000-000000067F000080000007200C0000108000__0000008F10EA21C8", +"000000067F000080000007200C0000107B8E-000000067F000080000007200C00001112F4__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C0000108000-000000067F000080000007200C000010C000__0000008E3CDF59C0", +"000000067F000080000007200C0000108000-000000067F000080000007200C000010C000__0000008F10EA21C8", +"000000067F000080000007200C000010C000-000000067F000080000007200C0000110000__0000008F10EA21C8", +"000000067F000080000007200C000010C000-030000000000000000000000000000000002__0000008E3CDF59C0", +"000000067F000080000007200C0000110000-000000067F00008000000720120100000000__0000008F10EA21C8", +"000000067F000080000007200C00001112F4-010000000000000001000000040000000001__0000008DB277FA49-0000008E42A19FD1", +"000000067F00008000000720140000000000-000000067F00008000000720140000004000__0000008F10EA21C8", +"000000067F00008000000720140000001101-000000067F00008000000720140000007E82__0000008E42A19FD1-0000008EBC4827C1", +"000000067F00008000000720140000004000-000000067F00008000000720140000008000__0000008F10EA21C8", +"000000067F00008000000720140000007E82-000000067F0000800000072014000000EB9D__0000008E42A19FD1-0000008EBC4827C1", +"000000067F00008000000720140000008000-000000067F0000800000072014000000C000__0000008F10EA21C8", +"000000067F0000800000072014000000C000-000000067F00008000000720140000010000__0000008F10EA21C8", +"000000067F0000800000072014000000D54D-000000067F00008000000720140000025E6D__0000008EBC4827C1-0000008F10E3E189", +"000000067F0000800000072014000000EB9D-000000067F00008000000720140000015866__0000008E42A19FD1-0000008EBC4827C1", +"000000067F00008000000720140000010000-000000067F00008000000720140000014000__0000008F10EA21C8", +"000000067F00008000000720140000014000-000000067F00008000000720140000018000__0000008F10EA21C8", +"000000067F00008000000720140000015866-000000067F0000800000072014000001C591__0000008E42A19FD1-0000008EBC4827C1", +"000000067F00008000000720140000018000-000000067F0000800000072014000001C000__0000008F10EA21C8", +"000000067F0000800000072014000001C000-000000067F00008000000720140000020000__0000008F10EA21C8", +"000000067F0000800000072014000001C591-000000067F0000800000072014000002326E__0000008E42A19FD1-0000008EBC4827C1", +"000000067F00008000000720140000020000-000000067F00008000000720140000024000__0000008F10EA21C8", 
+"000000067F0000800000072014000002326E-000000067F00008000000720140000029F59__0000008E42A19FD1-0000008EBC4827C1", +"000000067F00008000000720140000024000-000000067F00008000000720140000028000__0000008F10EA21C8", +"000000067F00008000000720140000025E75-030000000000000000000000000000000002__0000008EBC4827C1-0000008F10E3E189", +"000000067F00008000000720140000028000-000000067F0000800000072014000002C000__0000008F10EA21C8", +"000000067F00008000000720140000029F59-030000000000000000000000000000000002__0000008E42A19FD1-0000008EBC4827C1", +"000000067F0000800000072014000002C000-030000000000000000000000000000000002__0000008F10EA21C8", +"000000067F000080000007400C0000000000-000000067F000080000007400C0000004000__00000091A67E3E18", +"000000067F000080000007400C0000004000-000000067F000080000007400C0000008000__00000091A67E3E18", +"000000067F000080000007400C0000008000-000000067F000080000007400C000000C000__00000091A67E3E18", +"000000067F000080000007400C00000090E9-000000067F000080000007400C000001280C__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C000000C000-000000067F000080000007400C0000010000__00000091A67E3E18", +"000000067F000080000007400C0000010000-000000067F000080000007400C0000014000__00000091A67E3E18", +"000000067F000080000007400C000001280C-000000067F000080000007400C000001BF72__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C0000014000-000000067F000080000007400C0000018000__00000091A67E3E18", +"000000067F000080000007400C0000018000-000000067F000080000007400C000001C000__00000091A67E3E18", +"000000067F000080000007400C000001BF72-000000067F000080000007400C00000256D8__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C000001C000-000000067F000080000007400C0000020000__00000091A67E3E18", +"000000067F000080000007400C0000020000-000000067F000080000007400C0000024000__00000091A67E3E18", +"000000067F000080000007400C0000024000-000000067F000080000007400C0000028000__00000091A67E3E18", +"000000067F000080000007400C00000256D8-000000067F000080000007400C000002EE0B__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C0000028000-000000067F000080000007400C000002C000__00000091A67E3E18", +"000000067F000080000007400C000002C000-000000067F000080000007400C0000030000__00000091A67E3E18", +"000000067F000080000007400C000002EE0B-000000067F000080000007400C0000038521__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C0000030000-000000067F000080000007400C0000034000__00000091A67E3E18", +"000000067F000080000007400C0000034000-000000067F000080000007400C0000038000__00000091A67E3E18", +"000000067F000080000007400C0000038000-000000067F000080000007400C000003C000__00000091A67E3E18", +"000000067F000080000007400C0000038521-000000067F000080000007400C0000041C87__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C000003C000-000000067F000080000007400C0000040000__00000091A67E3E18", +"000000067F000080000007400C0000040000-000000067F000080000007400C0000044000__00000091A67E3E18", +"000000067F000080000007400C0000041C87-000000067F000080000007400C000004B3ED__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C0000044000-000000067F000080000007400C0000048000__00000091A67E3E18", +"000000067F000080000007400C0000048000-000000067F000080000007400C000004C000__000000914B20A810", +"000000067F000080000007400C000004B3ED-030000000000000000000000000000000002__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C000004BAC9-000000067F000080000007400C00000551FE__0000008F915DE591-000000903121F569", 
+"000000067F000080000007400C000004C000-000000067F000080000007400C0000050000__000000914B20A810", +"000000067F000080000007400C000004DF0B-000000067F000080000007400C000009B41F__000000914B2393B1-00000091A6DD7A79", +"000000067F000080000007400C0000050000-000000067F000080000007400C0000054000__000000914B20A810", +"000000067F000080000007400C0000054000-000000067F000080000007400C0000058000__000000914B20A810", +"000000067F000080000007400C00000551FE-000000067F000080000007400C000005E90C__0000008F915DE591-000000903121F569", +"000000067F000080000007400C0000058000-000000067F000080000007400C000005C000__000000914B20A810", +"000000067F000080000007400C000005C000-000000067F000080000007400C0000060000__000000914B20A810", +"000000067F000080000007400C000005E90C-000000067F000080000007400C000006802C__0000008F915DE591-000000903121F569", +"000000067F000080000007400C0000060000-000000067F000080000007400C0000064000__000000914B20A810", +"000000067F000080000007400C0000064000-000000067F000080000007400C0000068000__000000914B20A810", +"000000067F000080000007400C0000068000-000000067F000080000007400C000006C000__000000914B20A810", +"000000067F000080000007400C000006802C-000000067F000080000007400C0000071783__0000008F915DE591-000000903121F569", +"000000067F000080000007400C000006C000-000000067F000080000007400C0000070000__000000914B20A810", +"000000067F000080000007400C0000070000-000000067F000080000007400C0000074000__000000914B20A810", +"000000067F000080000007400C0000071783-000000067F000080000007400C000007AEE9__0000008F915DE591-000000903121F569", +"000000067F000080000007400C0000074000-000000067F000080000007400C0000078000__000000914B20A810", +"000000067F000080000007400C0000078000-000000067F000080000007400C000007C000__000000914B20A810", +"000000067F000080000007400C000007AEE9-000000067F000080000007400C000008460B__0000008F915DE591-000000903121F569", +"000000067F000080000007400C000007C000-000000067F000080000007400C0000080000__000000914B20A810", +"000000067F000080000007400C0000080000-000000067F000080000007400C0000084000__000000914B20A810", +"000000067F000080000007400C0000084000-000000067F000080000007400C0000088000__000000914B20A810", +"000000067F000080000007400C000008460B-000000067F000080000007400C000008DD71__0000008F915DE591-000000903121F569", +"000000067F000080000007400C0000088000-000000067F000080000007400C000008C000__000000914B20A810", +"000000067F000080000007400C000008C000-000000067F000080000007400C0000090000__000000914B20A810", +"000000067F000080000007400C000008DD71-000000067F000080000007400C00000974D7__0000008F915DE591-000000903121F569", +"000000067F000080000007400C0000090000-000000067F000080000007400C0000094000__000000914B20A810", +"000000067F000080000007400C0000094000-000000067F000080000007400C0000098000__000000914B20A810", +"000000067F000080000007400C00000974D7-000000067F000080000007400C00000A0C0B__0000008F915DE591-000000903121F569", +"000000067F000080000007400C0000098000-000000067F000080000007400C000009C000__000000914B20A810", +"000000067F000080000007400C000009B420-000000067F000080000007400C00000E830A__000000914B2393B1-00000091A6DD7A79", +"000000067F000080000007400C000009C000-000000067F000080000007400C00000A0000__000000914B20A810", +"000000067F000080000007400C00000A0000-000000067F000080000007400C00000A4000__000000914B20A810", +"000000067F000080000007400C00000A0C0B-000000067F000080000007400C00000AA371__0000008F915DE591-000000903121F569", +"000000067F000080000007400C00000A4000-000000067F000080000007400C00000A8000__000000914B20A810", +"000000067F000080000007400C00000A8000-000000067F000080000007400C00000AC000__00000090DFD64240", 
+"000000067F000080000007400C00000AA371-000000067F000080000007400C0100000000__0000008F915DE591-000000903121F569", +"000000067F000080000007400C00000AA4EC-000000067F000080000007400C00000B3C0C__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000AC000-000000067F000080000007400C00000B0000__00000090DFD64240", +"000000067F000080000007400C00000B0000-000000067F000080000007400C00000B4000__00000090DFD64240", +"000000067F000080000007400C00000B3C0C-000000067F000080000007400C00000BD372__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000B4000-000000067F000080000007400C00000B8000__00000090DFD64240", +"000000067F000080000007400C00000B8000-000000067F000080000007400C00000BC000__00000090DFD64240", +"000000067F000080000007400C00000BC000-000000067F000080000007400C00000C0000__00000090DFD64240", +"000000067F000080000007400C00000BD372-000000067F000080000007400C00000C6AD8__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000C0000-000000067F000080000007400C00000C4000__00000090DFD64240", +"000000067F000080000007400C00000C4000-000000067F000080000007400C00000C8000__00000090DFD64240", +"000000067F000080000007400C00000C6AD8-000000067F000080000007400C00000D020B__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000C8000-000000067F000080000007400C00000CC000__00000090DFD64240", +"000000067F000080000007400C00000CC000-000000067F000080000007400C00000D0000__00000090DFD64240", +"000000067F000080000007400C00000D0000-000000067F000080000007400C00000D4000__00000090DFD64240", +"000000067F000080000007400C00000D020B-000000067F000080000007400C00000D9971__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000D4000-000000067F000080000007400C00000D8000__00000090DFD64240", +"000000067F000080000007400C00000D8000-000000067F000080000007400C00000DC000__00000090DFD64240", +"000000067F000080000007400C00000D9971-000000067F000080000007400C00000E30D7__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000DC000-000000067F000080000007400C00000E0000__00000090DFD64240", +"000000067F000080000007400C00000E0000-000000067F000080000007400C00000E4000__00000090DFD64240", +"000000067F000080000007400C00000E30D7-000000067F000080000007400C00000EC80B__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000E4000-000000067F000080000007400C00000E8000__00000090DFD64240", +"000000067F000080000007400C00000E8000-000000067F000080000007400C00000EC000__00000090DFD64240", +"000000067F000080000007400C00000E8314-000000067F00008000000740140000008178__000000914B2393B1-00000091A6DD7A79", +"000000067F000080000007400C00000EC000-000000067F000080000007400C00000F0000__00000090DFD64240", +"000000067F000080000007400C00000EC80B-000000067F000080000007400C00000F5F38__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000F0000-000000067F000080000007400C00000F4000__00000090DFD64240", +"000000067F000080000007400C00000F4000-000000067F000080000007400C00000F8000__00000090DFD64240", +"000000067F000080000007400C00000F5F38-000000067F000080000007400C00000FF69E__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000F8000-000000067F000080000007400C00000FC000__00000090DFD64240", +"000000067F000080000007400C00000FC000-000000067F000080000007400C0000100000__00000090DFD64240", +"000000067F000080000007400C00000FCCA8-000000067F000080000007400C00001119BA__00000090D0E5EA29-000000914B2393B1", +"000000067F000080000007400C00000FF69E-000000067F000080000007400C0000108DAF__000000903121F569-00000090D0E5EA29", 
+"000000067F000080000007400C0000100000-000000067F000080000007400C0000104000__00000090DFD64240", +"000000067F000080000007400C0000104000-000000067F000080000007400C0000108000__00000090DFD64240", +"000000067F000080000007400C0000108000-000000067F000080000007400C000010C000__00000090DFD64240", +"000000067F000080000007400C0000108DAF-000000067F000080000007400C0100000000__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C000010C000-000000067F000080000007400C0000110000__00000090DFD64240", +"000000067F000080000007400C0000110000-030000000000000000000000000000000002__00000090DFD64240", +"000000067F000080000007400C00001119BA-000000067F00008000000740140000004326__00000090D0E5EA29-000000914B2393B1", +"000000067F00008000000740140000004326-000000067F0000800000074014000000B7EE__00000090D0E5EA29-000000914B2393B1", +"000000067F00008000000740140000008179-000000067F0000800000074014000001D4B7__000000914B2393B1-00000091A6DD7A79", +"000000067F0000800000074014000000B7EE-000000067F00008000000740140000012CCD__00000090D0E5EA29-000000914B2393B1", +"000000067F00008000000740140000012CCD-000000067F0000800000074014000001A16B__00000090D0E5EA29-000000914B2393B1", +"000000067F0000800000074014000001A16B-000000067F000080000007401400000215C9__00000090D0E5EA29-000000914B2393B1", +"000000067F0000800000074014000001D4BA-030000000000000000000000000000000002__000000914B2393B1-00000091A6DD7A79", +"000000067F000080000007401400000215C9-000000067F00008000000740140000028A4A__00000090D0E5EA29-000000914B2393B1", +"000000067F00008000000740140000028A4A-030000000000000000000000000000000002__00000090D0E5EA29-000000914B2393B1", +"000000067F000080000007600C0000000000-000000067F000080000007600C0000004000__00000092CA5E4EA8", +"000000067F000080000007600C0000000000-000000067F000080000007600C0000004000__0000009445A06DC8", +"000000067F000080000007600C0000004000-000000067F000080000007600C0000008000__00000092CA5E4EA8", +"000000067F000080000007600C0000004000-000000067F000080000007600C0000008000__0000009445A06DC8", +"000000067F000080000007600C0000008000-000000067F000080000007600C000000C000__00000092CA5E4EA8", +"000000067F000080000007600C0000008000-000000067F000080000007600C000000C000__0000009445A06DC8", +"000000067F000080000007600C0000008180-000000067F000080000007600C00000118E6__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C000000C000-000000067F000080000007600C0000010000__00000092CA5E4EA8", +"000000067F000080000007600C000000C000-000000067F000080000007600C0000010000__0000009445A06DC8", +"000000067F000080000007600C0000010000-000000067F000080000007600C0000014000__00000092CA5E4EA8", +"000000067F000080000007600C0000010000-000000067F000080000007600C0000014000__0000009445A06DC8", +"000000067F000080000007600C00000118E6-000000067F000080000007600C000001B00A__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C0000014000-000000067F000080000007600C0000018000__00000092CA5E4EA8", +"000000067F000080000007600C0000014000-000000067F000080000007600C0000018000__0000009445A06DC8", +"000000067F000080000007600C0000018000-000000067F000080000007600C000001C000__00000092CA5E4EA8", +"000000067F000080000007600C0000018000-000000067F000080000007600C000001C000__0000009445A06DC8", +"000000067F000080000007600C000001B00A-000000067F000080000007600C0000024745__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C000001C000-000000067F000080000007600C0000020000__00000092CA5E4EA8", +"000000067F000080000007600C000001C000-000000067F000080000007600C0000020000__0000009445A06DC8", 
+"000000067F000080000007600C0000020000-000000067F000080000007600C0000024000__00000092CA5E4EA8", +"000000067F000080000007600C0000020000-000000067F000080000007600C0000024000__0000009445A06DC8", +"000000067F000080000007600C0000024000-000000067F000080000007600C0000028000__00000092CA5E4EA8", +"000000067F000080000007600C0000024000-000000067F000080000007600C0000028000__0000009445A06DC8", +"000000067F000080000007600C0000024745-000000067F000080000007600C000002DEAB__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C0000028000-000000067F000080000007600C000002C000__00000092CA5E4EA8", +"000000067F000080000007600C0000028000-000000067F000080000007600C000002C000__0000009445A06DC8", +"000000067F000080000007600C000002C000-000000067F000080000007600C0000030000__00000092CA5E4EA8", +"000000067F000080000007600C000002C000-000000067F000080000007600C0000030000__0000009445A06DC8", +"000000067F000080000007600C000002DEAB-000000067F000080000007600C00000375CB__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C0000030000-000000067F000080000007600C0000034000__00000092CA5E4EA8", +"000000067F000080000007600C0000030000-000000067F000080000007600C0000034000__0000009445A06DC8", +"000000067F000080000007600C0000034000-000000067F000080000007600C0000038000__00000092CA5E4EA8", +"000000067F000080000007600C0000034000-000000067F000080000007600C0000038000__0000009445A06DC8", +"000000067F000080000007600C00000375CB-000000067F000080000007600C0000040D0B__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C0000038000-000000067F000080000007600C000003C000__00000092CA5E4EA8", +"000000067F000080000007600C0000038000-000000067F000080000007600C000003C000__0000009445A06DC8", +"000000067F000080000007600C000003C000-000000067F000080000007600C0000040000__00000092CA5E4EA8", +"000000067F000080000007600C000003C000-000000067F000080000007600C0000040000__0000009445A06DC8", +"000000067F000080000007600C0000040000-000000067F000080000007600C0000044000__00000092CA5E4EA8", +"000000067F000080000007600C0000040000-000000067F000080000007600C0000044000__0000009445A06DC8", +"000000067F000080000007600C0000040D0B-000000067F000080000007600C000004A471__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C0000044000-000000067F000080000007600C0000048000__00000092CA5E4EA8", +"000000067F000080000007600C0000044000-000000067F000080000007600C0000048000__0000009445A06DC8", +"000000067F000080000007600C0000048000-000000067F000080000007600C000004C000__00000092CA5E4EA8", +"000000067F000080000007600C0000048000-000000067F000080000007600C000004C000__0000009445A06DC8", +"000000067F000080000007600C000004A471-030000000000000000000000000000000002__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C000004C000-000000067F000080000007600C0000050000__00000092CA5E4EA8", +"000000067F000080000007600C000004C000-000000067F000080000007600C0000050000__0000009445A06DC8", +"000000067F000080000007600C0000050000-000000067F000080000007600C0000054000__00000092CA5E4EA8", +"000000067F000080000007600C0000050000-000000067F000080000007600C0000054000__0000009445A06DC8", +"000000067F000080000007600C0000054000-000000067F000080000007600C0000058000__00000092CA5E4EA8", +"000000067F000080000007600C0000054000-000000067F000080000007600C0000058000__0000009445A06DC8", +"000000067F000080000007600C00000544BA-000000067F000080000007600C000005DC0A__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000058000-000000067F000080000007600C000005C000__00000092CA5E4EA8", 
+"000000067F000080000007600C0000058000-000000067F000080000007600C000005C000__0000009445A06DC8", +"000000067F000080000007600C000005C000-000000067F000080000007600C0000060000__00000092CA5E4EA8", +"000000067F000080000007600C000005C000-000000067F000080000007600C0000060000__0000009445A06DC8", +"000000067F000080000007600C000005DC0A-000000067F000080000007600C000006732B__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000060000-000000067F000080000007600C0000064000__00000092CA5E4EA8", +"000000067F000080000007600C0000060000-000000067F000080000007600C0000064000__0000009445A06DC8", +"000000067F000080000007600C0000061031-000000067F000080000007600C00000C1159__0000009402435A49-0000009446B52FD1", +"000000067F000080000007600C0000064000-000000067F000080000007600C0000068000__00000092CA5E4EA8", +"000000067F000080000007600C0000064000-000000067F000080000007600C0000068000__0000009445A06DC8", +"000000067F000080000007600C000006732B-000000067F000080000007600C0000070A91__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000068000-000000067F000080000007600C000006C000__00000092CA5E4EA8", +"000000067F000080000007600C0000068000-000000067F000080000007600C000006C000__0000009445A06DC8", +"000000067F000080000007600C000006C000-000000067F000080000007600C0000070000__00000092CA5E4EA8", +"000000067F000080000007600C000006C000-000000067F000080000007600C0000070000__0000009445A06DC8", +"000000067F000080000007600C0000070000-000000067F000080000007600C0000074000__00000092CA5E4EA8", +"000000067F000080000007600C0000070000-000000067F000080000007600C0000074000__0000009445A06DC8", +"000000067F000080000007600C0000070A91-000000067F000080000007600C000007A1F7__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000074000-000000067F000080000007600C0000078000__00000092CA5E4EA8", +"000000067F000080000007600C0000074000-000000067F000080000007600C0000078000__0000009445A06DC8", +"000000067F000080000007600C0000078000-000000067F000080000007600C000007C000__00000092CA5E4EA8", +"000000067F000080000007600C0000078000-000000067F000080000007600C000007C000__0000009445A06DC8", +"000000067F000080000007600C000007A1F7-000000067F000080000007600C000008390C__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C000007C000-000000067F000080000007600C0000080000__00000092CA5E4EA8", +"000000067F000080000007600C000007C000-000000067F000080000007600C0000080000__0000009445A06DC8", +"000000067F000080000007600C0000080000-000000067F000080000007600C0000084000__00000092CA5E4EA8", +"000000067F000080000007600C0000080000-000000067F000080000007600C0000084000__0000009445A06DC8", +"000000067F000080000007600C000008390C-000000067F000080000007600C000008D072__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000084000-000000067F000080000007600C0000088000__00000092CA5E4EA8", +"000000067F000080000007600C0000084000-000000067F000080000007600C0000088000__0000009445A06DC8", +"000000067F000080000007600C0000088000-000000067F000080000007600C000008C000__00000092CA5E4EA8", +"000000067F000080000007600C0000088000-000000067F000080000007600C000008C000__0000009445A06DC8", +"000000067F000080000007600C000008C000-000000067F000080000007600C0000090000__00000092CA5E4EA8", +"000000067F000080000007600C000008C000-000000067F000080000007600C0000090000__0000009445A06DC8", +"000000067F000080000007600C000008C52F-000000067F000080000007600C000010B57A__00000093786F8001-0000009402435A49", +"000000067F000080000007600C000008D072-000000067F000080000007600C000009679A__0000009228F7FA79-00000093786F8001", 
+"000000067F000080000007600C0000090000-000000067F000080000007600C0000094000__00000092CA5E4EA8", +"000000067F000080000007600C0000090000-000000067F000080000007600C0000094000__0000009445A06DC8", +"000000067F000080000007600C0000094000-000000067F000080000007600C0000098000__00000092CA5E4EA8", +"000000067F000080000007600C0000094000-000000067F000080000007600C0000098000__0000009445A06DC8", +"000000067F000080000007600C000009679A-000000067F000080000007600C000009FF00__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000098000-000000067F000080000007600C000009C000__00000092CA5E4EA8", +"000000067F000080000007600C0000098000-000000067F000080000007600C000009C000__0000009445A06DC8", +"000000067F000080000007600C000009C000-000000067F000080000007600C00000A0000__00000092CA5E4EA8", +"000000067F000080000007600C000009C000-000000067F000080000007600C00000A0000__0000009445A06DC8", +"000000067F000080000007600C000009FF00-000000067F000080000007600C00000A960B__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000A0000-000000067F000080000007600C00000A4000__00000092CA5E4EA8", +"000000067F000080000007600C00000A0000-000000067F000080000007600C00000A4000__0000009445A06DC8", +"000000067F000080000007600C00000A4000-000000067F000080000007600C00000A8000__00000092CA5E4EA8", +"000000067F000080000007600C00000A4000-000000067F000080000007600C00000A8000__0000009445A06DC8", +"000000067F000080000007600C00000A8000-000000067F000080000007600C00000AC000__0000009445A06DC8", +"000000067F000080000007600C00000A8000-030000000000000000000000000000000002__00000092CA5E4EA8", +"000000067F000080000007600C00000A960B-000000067F000080000007600C00000B2D55__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000AC000-000000067F000080000007600C00000B0000__0000009445A06DC8", +"000000067F000080000007600C00000B0000-000000067F000080000007600C00000B4000__0000009445A06DC8", +"000000067F000080000007600C00000B2D55-000000067F000080000007600C00000BC4BB__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000B4000-000000067F000080000007600C00000B8000__0000009445A06DC8", +"000000067F000080000007600C00000B8000-000000067F000080000007600C00000BC000__0000009445A06DC8", +"000000067F000080000007600C00000BC000-000000067F000080000007600C00000C0000__0000009445A06DC8", +"000000067F000080000007600C00000BC4BB-000000067F000080000007600C00000C5BEA__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000C0000-000000067F000080000007600C00000C4000__0000009445A06DC8", +"000000067F000080000007600C00000C115D-000000067F0000800000076014000000333A__0000009402435A49-0000009446B52FD1", +"000000067F000080000007600C00000C4000-000000067F000080000007600C00000C8000__0000009445A06DC8", +"000000067F000080000007600C00000C5BEA-000000067F000080000007600C00000CF30B__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000C8000-000000067F000080000007600C00000CC000__0000009445A06DC8", +"000000067F000080000007600C00000CC000-000000067F000080000007600C00000D0000__0000009445A06DC8", +"000000067F000080000007600C00000CF30B-000000067F000080000007600C00000D8A2B__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000D0000-000000067F000080000007600C00000D4000__0000009445A06DC8", +"000000067F000080000007600C00000D4000-000000067F000080000007600C00000D8000__0000009445A06DC8", +"000000067F000080000007600C00000D8000-000000067F000080000007600C00000DC000__0000009445A06DC8", +"000000067F000080000007600C00000D8A2B-000000067F000080000007600C00000E217C__0000009228F7FA79-00000093786F8001", 
+"000000067F000080000007600C00000DC000-000000067F000080000007600C00000E0000__0000009445A06DC8", +"000000067F000080000007600C00000E0000-000000067F000080000007600C00000E4000__0000009445A06DC8", +"000000067F000080000007600C00000E217C-000000067F000080000007600C00000EB8E2__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000E4000-000000067F000080000007600C00000E8000__0000009445A06DC8", +"000000067F000080000007600C00000E8000-000000067F000080000007600C00000EC000__0000009445A06DC8", +"000000067F000080000007600C00000EB8E2-000000067F000080000007600C00000F500B__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000EC000-000000067F000080000007600C00000F0000__0000009445A06DC8", +"000000067F000080000007600C00000F0000-000000067F000080000007600C00000F4000__0000009445A06DC8", +"000000067F000080000007600C00000F4000-000000067F000080000007600C00000F8000__0000009445A06DC8", +"000000067F000080000007600C00000F500B-000000067F000080000007600C00000FE771__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000F8000-000000067F000080000007600C00000FC000__0000009445A06DC8", +"000000067F000080000007600C00000FC000-000000067F000080000007600C0000100000__0000009445A06DC8", +"000000067F000080000007600C00000FE771-000000067F000080000007600C0000107ED7__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000100000-000000067F000080000007600C0000104000__0000009445A06DC8", +"000000067F000080000007600C0000104000-000000067F000080000007600C0000108000__0000009445A06DC8", +"000000067F000080000007600C0000107ED7-000000067F000080000007600C000011160C__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000108000-000000067F000080000007600C000010C000__0000009445A06DC8", +"000000067F000080000007600C000010B57A-000000067F00008000000760140000003D14__00000093786F8001-0000009402435A49", +"000000067F000080000007600C000010C000-000000067F000080000007600C0000110000__0000009445A06DC8", +"000000067F000080000007600C0000110000-000000067F00008000000760120100000000__0000009445A06DC8", +"000000067F000080000007600C000011160C-010000000000000001000000040000000008__0000009228F7FA79-00000093786F8001", +"000000067F00008000000760140000000000-000000067F00008000000760140000004000__0000009445A06DC8", +"000000067F00008000000760140000003354-000000067F00008000000760140000023CAB__0000009402435A49-0000009446B52FD1", +"000000067F00008000000760140000003D14-000000067F0000800000076014000000A251__00000093786F8001-0000009402435A49", +"000000067F00008000000760140000004000-000000067F00008000000760140000008000__0000009445A06DC8", +"000000067F00008000000760140000008000-000000067F0000800000076014000000C000__0000009445A06DC8", +"000000067F0000800000076014000000A251-000000067F000080000007601400000107AC__00000093786F8001-0000009402435A49", +"000000067F0000800000076014000000C000-000000067F00008000000760140000010000__0000009445A06DC8", +"000000067F00008000000760140000010000-000000067F00008000000760140000014000__0000009445A06DC8", +"000000067F000080000007601400000107AC-000000067F00008000000760140000016CC4__00000093786F8001-0000009402435A49", +"000000067F00008000000760140000014000-000000067F00008000000760140000018000__0000009445A06DC8", +"000000067F00008000000760140000016CC4-000000067F0000800000076014000001D272__00000093786F8001-0000009402435A49", +"000000067F00008000000760140000018000-000000067F0000800000076014000001C000__0000009445A06DC8", +"000000067F0000800000076014000001C000-000000067F00008000000760140000020000__0000009445A06DC8", 
+"000000067F0000800000076014000001D272-000000067F000080000007601400000237C3__00000093786F8001-0000009402435A49", +"000000067F00008000000760140000020000-000000067F00008000000760140000024000__0000009445A06DC8", +"000000067F000080000007601400000237C3-000000067F00008000000760140000029CC5__00000093786F8001-0000009402435A49", +"000000067F00008000000760140000023CB3-030000000000000000000000000000000002__0000009402435A49-0000009446B52FD1", +"000000067F00008000000760140000024000-000000067F00008000000760140000028000__0000009445A06DC8", +"000000067F00008000000760140000028000-000000067F0000800000076014000002C000__0000009445A06DC8", +"000000067F00008000000760140000029CC5-030000000000000000000000000000000002__00000093786F8001-0000009402435A49", +"000000067F0000800000076014000002C000-030000000000000000000000000000000002__0000009445A06DC8", +"000000067F000080000007800C0000000000-000000067F000080000007800C0000004000__00000096187D1FC8", +"000000067F000080000007800C0000000000-000000067F000080000007800C0000004000__00000096E85806C0", +"000000067F000080000007800C0000004000-000000067F000080000007800C0000008000__00000096187D1FC8", +"000000067F000080000007800C0000004000-000000067F000080000007800C0000008000__00000096E85806C0", +"000000067F000080000007800C0000008000-000000067F000080000007800C000000C000__00000096187D1FC8", +"000000067F000080000007800C0000008000-000000067F000080000007800C000000C000__00000096E85806C0", +"000000067F000080000007800C000000974C-000000067F000080000007800C0000012EB2__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C000000C000-000000067F000080000007800C0000010000__00000096187D1FC8", +"000000067F000080000007800C000000C000-000000067F000080000007800C0000010000__00000096E85806C0", +"000000067F000080000007800C0000010000-000000067F000080000007800C0000014000__00000096187D1FC8", +"000000067F000080000007800C0000010000-000000067F000080000007800C0000014000__00000096E85806C0", +"000000067F000080000007800C0000012EB2-000000067F000080000007800C000001C60B__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C0000014000-000000067F000080000007800C0000018000__00000096187D1FC8", +"000000067F000080000007800C0000014000-000000067F000080000007800C0000018000__00000096E85806C0", +"000000067F000080000007800C0000018000-000000067F000080000007800C000001C000__00000096187D1FC8", +"000000067F000080000007800C0000018000-000000067F000080000007800C000001C000__00000096E85806C0", +"000000067F000080000007800C000001C000-000000067F000080000007800C0000020000__00000096187D1FC8", +"000000067F000080000007800C000001C000-000000067F000080000007800C0000020000__00000096E85806C0", +"000000067F000080000007800C000001C60B-000000067F000080000007800C0000025D39__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C0000020000-000000067F000080000007800C0000024000__00000096187D1FC8", +"000000067F000080000007800C0000020000-000000067F000080000007800C0000024000__00000096E85806C0", +"000000067F000080000007800C0000024000-000000067F000080000007800C0000028000__00000096187D1FC8", +"000000067F000080000007800C0000024000-000000067F000080000007800C0000028000__00000096E85806C0", +"000000067F000080000007800C0000025D39-000000067F000080000007800C000002F49F__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C0000028000-000000067F000080000007800C000002C000__00000096187D1FC8", +"000000067F000080000007800C0000028000-000000067F000080000007800C000002C000__00000096E85806C0", +"000000067F000080000007800C000002C000-000000067F000080000007800C0000030000__00000096187D1FC8", 
+"000000067F000080000007800C000002C000-000000067F000080000007800C0000030000__00000096E85806C0", +"000000067F000080000007800C000002F49F-000000067F000080000007800C0000038BB2__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C0000030000-000000067F000080000007800C0000034000__00000096187D1FC8", +"000000067F000080000007800C0000030000-000000067F000080000007800C0000034000__00000096E85806C0", +"000000067F000080000007800C0000034000-000000067F000080000007800C0000038000__00000096187D1FC8", +"000000067F000080000007800C0000034000-000000067F000080000007800C0000038000__00000096E85806C0", +"000000067F000080000007800C0000038000-000000067F000080000007800C000003C000__00000096187D1FC8", +"000000067F000080000007800C0000038000-000000067F000080000007800C000003C000__00000096E85806C0", +"000000067F000080000007800C0000038BB2-000000067F000080000007800C0000042318__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C000003C000-000000067F000080000007800C0000040000__00000096187D1FC8", +"000000067F000080000007800C000003C000-000000067F000080000007800C0000040000__00000096E85806C0", +"000000067F000080000007800C0000040000-000000067F000080000007800C0000044000__00000096187D1FC8", +"000000067F000080000007800C0000040000-000000067F000080000007800C0000044000__00000096E85806C0", +"000000067F000080000007800C0000042318-000000067F000080000007800C000004BA7E__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C0000044000-000000067F000080000007800C0000048000__00000096187D1FC8", +"000000067F000080000007800C0000044000-000000067F000080000007800C0000048000__00000096E85806C0", +"000000067F000080000007800C0000048000-000000067F000080000007800C000004C000__00000096187D1FC8", +"000000067F000080000007800C0000048000-000000067F000080000007800C000004C000__00000096E85806C0", +"000000067F000080000007800C000004BA7E-000000067F000080000007800C00000551B3__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C000004C000-000000067F000080000007800C0000050000__00000096187D1FC8", +"000000067F000080000007800C000004C000-000000067F000080000007800C0000050000__00000096E85806C0", +"000000067F000080000007800C0000050000-000000067F000080000007800C0000054000__00000096187D1FC8", +"000000067F000080000007800C0000050000-000000067F000080000007800C0000054000__00000096E85806C0", +"000000067F000080000007800C0000054000-000000067F000080000007800C0000058000__0000009614F1FFE8", +"000000067F000080000007800C0000054000-000000067F000080000007800C0000058000__00000096E85806C0", +"000000067F000080000007800C00000551B3-030000000000000000000000000000000002__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C000005523E-000000067F000080000007800C000005E9A4__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C0000058000-000000067F000080000007800C000005C000__0000009614F1FFE8", +"000000067F000080000007800C0000058000-000000067F000080000007800C000005C000__00000096E85806C0", +"000000067F000080000007800C000005C000-000000067F000080000007800C0000060000__0000009614F1FFE8", +"000000067F000080000007800C000005C000-000000067F000080000007800C0000060000__00000096E85806C0", +"000000067F000080000007800C000005E9A4-000000067F000080000007800C000006810A__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C0000060000-000000067F000080000007800C0000064000__0000009614F1FFE8", +"000000067F000080000007800C0000060000-000000067F000080000007800C0000064000__00000096E85806C0", +"000000067F000080000007800C0000064000-000000067F000080000007800C0000068000__0000009614F1FFE8", 
+"000000067F000080000007800C0000064000-000000067F000080000007800C0000068000__00000096E85806C0", +"000000067F000080000007800C0000068000-000000067F000080000007800C000006C000__0000009614F1FFE8", +"000000067F000080000007800C0000068000-000000067F000080000007800C000006C000__00000096E85806C0", +"000000067F000080000007800C000006810A-000000067F000080000007800C0000071870__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C000006C000-000000067F000080000007800C0000070000__0000009614F1FFE8", +"000000067F000080000007800C000006C000-000000067F000080000007800C0000070000__00000096E85806C0", +"000000067F000080000007800C000006D446-000000067F000080000007800C00000D9B82__00000096AEF27399-00000096E85829C9", +"000000067F000080000007800C0000070000-000000067F000080000007800C0000074000__0000009614F1FFE8", +"000000067F000080000007800C0000070000-000000067F000080000007800C0000074000__00000096E85806C0", +"000000067F000080000007800C0000071870-000000067F000080000007800C000007AFD6__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C0000074000-000000067F000080000007800C0000078000__0000009614F1FFE8", +"000000067F000080000007800C0000074000-000000067F000080000007800C0000078000__00000096E85806C0", +"000000067F000080000007800C0000078000-000000067F000080000007800C000007C000__0000009614F1FFE8", +"000000067F000080000007800C0000078000-000000067F000080000007800C000007C000__00000096E85806C0", +"000000067F000080000007800C000007AFD6-000000067F000080000007800C000008470B__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C000007B8DE-000000067F000080000007800C00000F73DA__00000096193A8001-00000096AEF27399", +"000000067F000080000007800C000007C000-000000067F000080000007800C0000080000__0000009614F1FFE8", +"000000067F000080000007800C000007C000-000000067F000080000007800C0000080000__00000096E85806C0", +"000000067F000080000007800C0000080000-000000067F000080000007800C0000084000__0000009614F1FFE8", +"000000067F000080000007800C0000080000-000000067F000080000007800C0000084000__00000096E85806C0", +"000000067F000080000007800C0000084000-000000067F000080000007800C0000088000__0000009614F1FFE8", +"000000067F000080000007800C0000084000-000000067F000080000007800C0000088000__00000096E85806C0", +"000000067F000080000007800C000008470B-000000067F000080000007800C000008DE71__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C0000088000-000000067F000080000007800C000008C000__0000009614F1FFE8", +"000000067F000080000007800C0000088000-000000067F000080000007800C000008C000__00000096E85806C0", +"000000067F000080000007800C000008C000-000000067F000080000007800C0000090000__0000009614F1FFE8", +"000000067F000080000007800C000008C000-000000067F000080000007800C0000090000__00000096E85806C0", +"000000067F000080000007800C000008DE71-000000067F000080000007800C0000097591__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C0000090000-000000067F000080000007800C0000094000__0000009614F1FFE8", +"000000067F000080000007800C0000090000-000000067F000080000007800C0000094000__00000096E85806C0", +"000000067F000080000007800C0000094000-000000067F000080000007800C0000098000__0000009614F1FFE8", +"000000067F000080000007800C0000094000-000000067F000080000007800C0000098000__00000096E85806C0", +"000000067F000080000007800C0000097591-000000067F000080000007800C00000A0CF7__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C0000098000-000000067F000080000007800C000009C000__0000009614F1FFE8", +"000000067F000080000007800C0000098000-000000067F000080000007800C000009C000__00000096E85806C0", 
+"000000067F000080000007800C000009C000-000000067F000080000007800C00000A0000__0000009614F1FFE8", +"000000067F000080000007800C000009C000-000000067F000080000007800C00000A0000__00000096E85806C0", +"000000067F000080000007800C00000A0000-000000067F000080000007800C00000A4000__0000009614F1FFE8", +"000000067F000080000007800C00000A0000-000000067F000080000007800C00000A4000__00000096E85806C0", +"000000067F000080000007800C00000A0CF7-000000067F000080000007800C00000AA40B__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C00000A4000-000000067F000080000007800C00000A8000__0000009614F1FFE8", +"000000067F000080000007800C00000A4000-000000067F000080000007800C00000A8000__00000096E85806C0", +"000000067F000080000007800C00000A8000-000000067F000080000007800C00000AC000__0000009614F1FFE8", +"000000067F000080000007800C00000A8000-000000067F000080000007800C00000AC000__00000096E85806C0", +"000000067F000080000007800C00000AA40B-000000067F000080000007800C00000B3B4D__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C00000AC000-000000067F000080000007800C00000B0000__0000009614F1FFE8", +"000000067F000080000007800C00000AC000-000000067F000080000007800C00000B0000__00000096E85806C0", +"000000067F000080000007800C00000B0000-000000067F000080000007800C00000B4000__0000009614F1FFE8", +"000000067F000080000007800C00000B0000-000000067F000080000007800C00000B4000__00000096E85806C0", +"000000067F000080000007800C00000B3B4D-000000067F000080000007800C00000BD2B3__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C00000B4000-000000067F000080000007800C00000B8000__0000009614F1FFE8", +"000000067F000080000007800C00000B4000-000000067F000080000007800C00000B8000__00000096E85806C0", +"000000067F000080000007800C00000B8000-000000067F000080000007800C00000BC000__0000009614F1FFE8", +"000000067F000080000007800C00000B8000-000000067F000080000007800C00000BC000__00000096E85806C0", +"000000067F000080000007800C00000BC000-000000067F000080000007800C00000C0000__0000009614F1FFE8", +"000000067F000080000007800C00000BC000-000000067F000080000007800C00000C0000__00000096E85806C0", +"000000067F000080000007800C00000BD2B3-000000067F000080000007800C00000C69DA__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C00000C0000-000000067F000080000007800C00000C4000__0000009614F1FFE8", +"000000067F000080000007800C00000C0000-000000067F000080000007800C00000C4000__00000096E85806C0", +"000000067F000080000007800C00000C4000-000000067F000080000007800C00000C8000__0000009614F1FFE8", +"000000067F000080000007800C00000C4000-000000067F000080000007800C00000C8000__00000096E85806C0", +"000000067F000080000007800C00000C69DA-000000067F000080000007800C0100000000__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C00000C8000-000000067F000080000007800C00000CC000__0000009614F1FFE8", +"000000067F000080000007800C00000C8000-000000067F000080000007800C00000CC000__00000096E85806C0", +"000000067F000080000007800C00000CC000-000000067F000080000007800C00000D0000__0000009614F1FFE8", +"000000067F000080000007800C00000CC000-000000067F000080000007800C00000D0000__00000096E85806C0", +"000000067F000080000007800C00000CD6B6-000000067F000080000007800C00000D6C18__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C00000D0000-000000067F000080000007800C00000D4000__0000009614F1FFE8", +"000000067F000080000007800C00000D0000-000000067F000080000007800C00000D4000__00000096E85806C0", +"000000067F000080000007800C00000D4000-000000067F000080000007800C00000D8000__0000009614F1FFE8", 
+"000000067F000080000007800C00000D4000-000000067F000080000007800C00000D8000__00000096E85806C0", +"000000067F000080000007800C00000D6C18-000000067F000080000007800C00000E0179__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C00000D8000-000000067F000080000007800C00000DC000__0000009614F1FFE8", +"000000067F000080000007800C00000D8000-000000067F000080000007800C00000DC000__00000096E85806C0", +"000000067F000080000007800C00000D9BA3-000000067F00008000000780140000013481__00000096AEF27399-00000096E85829C9", +"000000067F000080000007800C00000DC000-000000067F000080000007800C00000E0000__0000009614F1FFE8", +"000000067F000080000007800C00000DC000-000000067F000080000007800C00000E0000__00000096E85806C0", +"000000067F000080000007800C00000E0000-000000067F000080000007800C00000E4000__0000009614F1FFE8", +"000000067F000080000007800C00000E0000-000000067F000080000007800C00000E4000__00000096E85806C0", +"000000067F000080000007800C00000E0179-000000067F000080000007800C00000E96DC__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C00000E4000-000000067F000080000007800C00000E8000__0000009614F1FFE8", +"000000067F000080000007800C00000E4000-000000067F000080000007800C00000E8000__00000096E85806C0", +"000000067F000080000007800C00000E8000-000000067F000080000007800C00000EC000__0000009614F1FFE8", +"000000067F000080000007800C00000E8000-000000067F000080000007800C00000EC000__00000096E85806C0", +"000000067F000080000007800C00000E96DC-000000067F000080000007800C00000F2C3E__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C00000EC000-000000067F000080000007800C00000F0000__0000009614F1FFE8", +"000000067F000080000007800C00000EC000-000000067F000080000007800C00000F0000__00000096E85806C0", +"000000067F000080000007800C00000F0000-000000067F000080000007800C00000F4000__0000009614F1FFE8", +"000000067F000080000007800C00000F0000-000000067F000080000007800C00000F4000__00000096E85806C0", +"000000067F000080000007800C00000F2C3E-000000067F000080000007800C00000FC1A0__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C00000F4000-000000067F000080000007800C00000F8000__0000009614F1FFE8", +"000000067F000080000007800C00000F4000-000000067F000080000007800C00000F8000__00000096E85806C0", +"000000067F000080000007800C00000F73E3-000000067F00008000000780140000003F18__00000096193A8001-00000096AEF27399", +"000000067F000080000007800C00000F8000-000000067F000080000007800C00000FC000__0000009614F1FFE8", +"000000067F000080000007800C00000F8000-000000067F000080000007800C00000FC000__00000096E85806C0", +"000000067F000080000007800C00000FC000-000000067F000080000007800C0000100000__0000009614F1FFE8", +"000000067F000080000007800C00000FC000-000000067F000080000007800C0000100000__00000096E85806C0", +"000000067F000080000007800C00000FC1A0-000000067F000080000007800C00001057C1__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C0000100000-000000067F000080000007800C0000104000__0000009614F1FFE8", +"000000067F000080000007800C0000100000-000000067F000080000007800C0000104000__00000096E85806C0", +"000000067F000080000007800C0000104000-000000067F000080000007800C0000108000__0000009614F1FFE8", +"000000067F000080000007800C0000104000-000000067F000080000007800C0000108000__00000096E85806C0", +"000000067F000080000007800C00001057C1-000000067F000080000007800C000010EF0B__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C0000108000-000000067F000080000007800C000010C000__0000009614F1FFE8", +"000000067F000080000007800C0000108000-000000067F000080000007800C000010C000__00000096E85806C0", 
+"000000067F000080000007800C000010C000-000000067F000080000007800C0000110000__0000009614F1FFE8", +"000000067F000080000007800C000010C000-000000067F000080000007800C0000110000__00000096E85806C0", +"000000067F000080000007800C000010EF0B-01000000000000000100000004000000000B__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C0000110000-000000067F00008000000780120100000000__00000096E85806C0", +"000000067F000080000007800C0000110000-030000000000000000000000000000000002__0000009614F1FFE8", +"000000067F00008000000780140000000000-000000067F00008000000780140000004000__00000096E85806C0", +"000000067F00008000000780140000003F18-000000067F00008000000780140000009ED4__00000096193A8001-00000096AEF27399", +"000000067F00008000000780140000004000-000000067F00008000000780140000008000__00000096E85806C0", +"000000067F00008000000780140000008000-000000067F0000800000078014000000C000__00000096E85806C0", +"000000067F00008000000780140000009ED4-000000067F0000800000078014000000FE9A__00000096193A8001-00000096AEF27399", +"000000067F0000800000078014000000C000-000000067F00008000000780140000010000__00000096E85806C0", +"000000067F0000800000078014000000FE9A-000000067F00008000000780140000015DD1__00000096193A8001-00000096AEF27399", +"000000067F00008000000780140000010000-000000067F00008000000780140000014000__00000096E85806C0", +"000000067F00008000000780140000013481-030000000000000000000000000000000002__00000096AEF27399-00000096E85829C9", +"000000067F00008000000780140000014000-000000067F00008000000780140000018000__00000096E85806C0", +"000000067F00008000000780140000015DD1-000000067F0000800000078014000001BD7E__00000096193A8001-00000096AEF27399", +"000000067F00008000000780140000018000-000000067F0000800000078014000001C000__00000096E85806C0", +"000000067F0000800000078014000001BD7E-000000067F00008000000780140000021CF0__00000096193A8001-00000096AEF27399", +"000000067F0000800000078014000001C000-000000067F00008000000780140000020000__00000096E85806C0", +"000000067F00008000000780140000020000-000000067F00008000000780140000024000__00000096E85806C0", +"000000067F00008000000780140000021CF0-000000067F00008000000780140000027CF8__00000096193A8001-00000096AEF27399", +"000000067F00008000000780140000024000-000000067F00008000000780140000028000__00000096E85806C0", +"000000067F00008000000780140000027CF8-000000067F0000800000078014000002DC88__00000096193A8001-00000096AEF27399", +"000000067F00008000000780140000028000-000000067F0000800000078014000002C000__00000096E85806C0", +"000000067F0000800000078014000002C000-030000000000000000000000000000000002__00000096E85806C0", +"000000067F0000800000078014000002DC88-030000000000000000000000000000000002__00000096193A8001-00000096AEF27399", +"000000067F000080000007A00C0000000000-000000067F000080000007A00C0000004000__0000009921F3B4A8", +"000000067F000080000007A00C0000004000-000000067F000080000007A00C0000008000__0000009921F3B4A8", +"000000067F000080000007A00C0000008000-000000067F000080000007A00C000000C000__0000009921F3B4A8", +"000000067F000080000007A00C000000974B-000000067F000080000007A00C0000012EB1__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C000000C000-000000067F000080000007A00C0000010000__0000009921F3B4A8", +"000000067F000080000007A00C0000010000-000000067F000080000007A00C0000014000__0000009921F3B4A8", +"000000067F000080000007A00C0000012EB1-000000067F000080000007A00C000001C60B__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000014000-000000067F000080000007A00C0000018000__0000009921F3B4A8", 
+"000000067F000080000007A00C0000018000-000000067F000080000007A00C000001C000__0000009921F3B4A8", +"000000067F000080000007A00C000001C000-000000067F000080000007A00C0000020000__0000009921F3B4A8", +"000000067F000080000007A00C000001C60B-000000067F000080000007A00C0000025D39__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000020000-000000067F000080000007A00C0000024000__0000009921F3B4A8", +"000000067F000080000007A00C0000024000-000000067F000080000007A00C0000028000__0000009921F3B4A8", +"000000067F000080000007A00C0000025D39-000000067F000080000007A00C000002F49F__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000028000-000000067F000080000007A00C000002C000__0000009921F3B4A8", +"000000067F000080000007A00C000002C000-000000067F000080000007A00C0000030000__0000009921F3B4A8", +"000000067F000080000007A00C000002F49F-000000067F000080000007A00C0000038BB2__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000030000-000000067F000080000007A00C0000034000__0000009921F3B4A8", +"000000067F000080000007A00C0000034000-000000067F000080000007A00C0000038000__0000009921F3B4A8", +"000000067F000080000007A00C0000038000-000000067F000080000007A00C000003C000__0000009921F3B4A8", +"000000067F000080000007A00C0000038BB2-000000067F000080000007A00C0000042318__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C000003C000-000000067F000080000007A00C0000040000__0000009921F3B4A8", +"000000067F000080000007A00C0000040000-000000067F000080000007A00C0000044000__0000009921F3B4A8", +"000000067F000080000007A00C0000042318-000000067F000080000007A00C000004BA7E__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000044000-000000067F000080000007A00C0000048000__0000009921F3B4A8", +"000000067F000080000007A00C0000048000-000000067F000080000007A00C000004C000__0000009921F3B4A8", +"000000067F000080000007A00C000004B9B2-000000067F000080000007A00C0000097B6D__0000009921E47AA1-000000997F5D23C9", +"000000067F000080000007A00C000004BA7E-000000067F000080000007A00C00000551B3__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C000004C000-000000067F000080000007A00C0000050000__0000009921F3B4A8", +"000000067F000080000007A00C0000050000-000000067F000080000007A00C0000054000__0000009921F3B4A8", +"000000067F000080000007A00C0000054000-000000067F000080000007A00C0000058000__0000009921F3B4A8", +"000000067F000080000007A00C00000551B3-000000067F000080000007A00C000005E90A__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000058000-000000067F000080000007A00C000005C000__0000009921F3B4A8", +"000000067F000080000007A00C000005C000-000000067F000080000007A00C0000060000__0000009921F3B4A8", +"000000067F000080000007A00C000005E90A-000000067F000080000007A00C000006802C__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000060000-000000067F000080000007A00C0000064000__0000009921F3B4A8", +"000000067F000080000007A00C0000064000-000000067F000080000007A00C0000068000__0000009921F3B4A8", +"000000067F000080000007A00C0000068000-000000067F000080000007A00C000006C000__0000009921F3B4A8", +"000000067F000080000007A00C000006802C-000000067F000080000007A00C0000071783__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C000006C000-000000067F000080000007A00C0000070000__0000009921F3B4A8", +"000000067F000080000007A00C0000070000-000000067F000080000007A00C0000074000__0000009921F3B4A8", +"000000067F000080000007A00C0000071783-000000067F000080000007A00C000007AEE8__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000074000-000000067F000080000007A00C0000078000__0000009921F3B4A8", 
+"000000067F000080000007A00C0000078000-000000067F000080000007A00C000007C000__0000009921F3B4A8", +"000000067F000080000007A00C000007AEE8-000000067F000080000007A00C000008460B__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C000007C000-000000067F000080000007A00C0000080000__0000009921F3B4A8", +"000000067F000080000007A00C0000080000-000000067F000080000007A00C0000084000__0000009921F3B4A8", +"000000067F000080000007A00C0000084000-000000067F000080000007A00C0000088000__0000009921F3B4A8", +"000000067F000080000007A00C000008460B-000000067F000080000007A00C000008DD71__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000088000-000000067F000080000007A00C000008C000__0000009921F3B4A8", +"000000067F000080000007A00C000008C000-000000067F000080000007A00C0000090000__0000009921F3B4A8", +"000000067F000080000007A00C000008DD71-000000067F000080000007A00C00000974D7__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000090000-000000067F000080000007A00C0000094000__0000009921F3B4A8", +"000000067F000080000007A00C0000094000-000000067F000080000007A00C0000098000__0000009921F3B4A8", +"000000067F000080000007A00C00000974D7-000000067F000080000007A00C00000A0C0B__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000097B7A-000000067F000080000007A00C00000E3627__0000009921E47AA1-000000997F5D23C9", +"000000067F000080000007A00C0000098000-000000067F000080000007A00C000009C000__0000009921F3B4A8", +"000000067F000080000007A00C000009C000-000000067F000080000007A00C00000A0000__0000009921F3B4A8", +"000000067F000080000007A00C00000A0000-000000067F000080000007A00C00000A4000__0000009921F3B4A8", +"000000067F000080000007A00C00000A0C0B-000000067F000080000007A00C00000AA371__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000A4000-000000067F000080000007A00C00000A8000__0000009921F3B4A8", +"000000067F000080000007A00C00000A8000-000000067F000080000007A00C00000AC000__0000009921F3B4A8", +"000000067F000080000007A00C00000AA371-000000067F000080000007A00C00000B3AD7__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000AC000-000000067F000080000007A00C00000B0000__0000009921F3B4A8", +"000000067F000080000007A00C00000B0000-000000067F000080000007A00C00000B4000__0000009921F3B4A8", +"000000067F000080000007A00C00000B3AD7-000000067F000080000007A00C00000BD20B__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000B4000-000000067F000080000007A00C00000B8000__0000009921F3B4A8", +"000000067F000080000007A00C00000B8000-000000067F000080000007A00C00000BC000__0000009921F3B4A8", +"000000067F000080000007A00C00000BC000-000000067F000080000007A00C00000C0000__0000009921F3B4A8", +"000000067F000080000007A00C00000BD20B-000000067F000080000007A00C00000C6932__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000C0000-000000067F000080000007A00C00000C4000__0000009921F3B4A8", +"000000067F000080000007A00C00000C4000-000000067F000080000007A00C00000C8000__0000009921F3B4A8", +"000000067F000080000007A00C00000C6932-000000067F000080000007A00C00000D0098__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000C8000-000000067F000080000007A00C00000CC000__0000009921F3B4A8", +"000000067F000080000007A00C00000CC000-000000067F000080000007A00C00000D0000__0000009921F3B4A8", +"000000067F000080000007A00C00000D0000-000000067F000080000007A00C00000D4000__0000009921F3B4A8", +"000000067F000080000007A00C00000D0098-000000067F000080000007A00C00000D97FE__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000D4000-000000067F000080000007A00C00000D8000__0000009921F3B4A8", 
+"000000067F000080000007A00C00000D8000-000000067F000080000007A00C00000DC000__0000009921F3B4A8", +"000000067F000080000007A00C00000D97FE-000000067F000080000007A00C00000E2F0B__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000DC000-000000067F000080000007A00C00000E0000__0000009921F3B4A8", +"000000067F000080000007A00C00000E0000-000000067F000080000007A00C00000E4000__0000009921F3B4A8", +"000000067F000080000007A00C00000E2F0B-000000067F000080000007A00C00000EC671__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000E364A-000000067F000080000007A01400000065FE__0000009921E47AA1-000000997F5D23C9", +"000000067F000080000007A00C00000E4000-000000067F000080000007A00C00000E8000__0000009921F3B4A8", +"000000067F000080000007A00C00000E8000-000000067F000080000007A00C00000EC000__0000009921F3B4A8", +"000000067F000080000007A00C00000EC000-000000067F000080000007A00C00000F0000__0000009921F3B4A8", +"000000067F000080000007A00C00000EC671-000000067F000080000007A00C00000F5D9F__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000F0000-000000067F000080000007A00C00000F4000__0000009921F3B4A8", +"000000067F000080000007A00C00000F4000-000000067F000080000007A00C00000F8000__0000009921F3B4A8", +"000000067F000080000007A00C00000F5D9F-000000067F000080000007A00C00000FF505__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000F720F-000000067F000080000007A00C0000111692__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A00C00000F8000-000000067F000080000007A00C00000FC000__0000009921F3B4A8", +"000000067F000080000007A00C00000FC000-000000067F000080000007A00C0000100000__0000009921F3B4A8", +"000000067F000080000007A00C00000FF505-000000067F000080000007A00C0000108C10__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000100000-000000067F000080000007A00C0000104000__0000009921F3B4A8", +"000000067F000080000007A00C0000104000-000000067F000080000007A00C0000108000__0000009921F3B4A8", +"000000067F000080000007A00C0000108000-000000067F000080000007A00C000010C000__0000009921F3B4A8", +"000000067F000080000007A00C0000108C10-030000000000000000000000000000000002__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C000010C000-000000067F000080000007A00C0000110000__0000009921F3B4A8", +"000000067F000080000007A00C0000110000-000000067F000080000007A0120100000000__0000009921F3B4A8", +"000000067F000080000007A00C0000111692-000000067F000080000007A01400000040E7__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A0140000000000-000000067F000080000007A0140000004000__0000009921F3B4A8", +"000000067F000080000007A0140000004000-000000067F000080000007A0140000008000__0000009921F3B4A8", +"000000067F000080000007A01400000040E7-000000067F000080000007A014000000B5F6__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A0140000006601-000000067F000080000007A014000001B4CB__0000009921E47AA1-000000997F5D23C9", +"000000067F000080000007A0140000008000-000000067F000080000007A014000000C000__0000009921F3B4A8", +"000000067F000080000007A014000000B5F6-000000067F000080000007A0140000012AFC__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A014000000C000-000000067F000080000007A0140000010000__0000009921F3B4A8", +"000000067F000080000007A0140000010000-000000067F000080000007A0140000014000__0000009921F3B4A8", +"000000067F000080000007A0140000012AFC-000000067F000080000007A0140000019F9B__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A0140000014000-000000067F000080000007A0140000018000__0000009921F3B4A8", 
+"000000067F000080000007A0140000018000-000000067F000080000007A014000001C000__0000009921F3B4A8", +"000000067F000080000007A0140000019F9B-000000067F000080000007A01400000214BE__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A014000001B4CB-030000000000000000000000000000000002__0000009921E47AA1-000000997F5D23C9", +"000000067F000080000007A014000001C000-000000067F000080000007A0140000020000__0000009921F3B4A8", +"000000067F000080000007A0140000020000-000000067F000080000007A0140000024000__0000009921F3B4A8", +"000000067F000080000007A01400000214BE-000000067F000080000007A01400000289C9__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A0140000024000-000000067F000080000007A0140000028000__0000009921F3B4A8", +"000000067F000080000007A0140000028000-000000067F000080000007A014000002C000__0000009921F3B4A8", +"000000067F000080000007A01400000289C9-030000000000000000000000000000000002__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A014000002C000-030000000000000000000000000000000002__0000009921F3B4A8", +"000000067F000080000007C00C0000000000-000000067F000080000007C00C0000004000__0000009B5229DFE8", +"000000067F000080000007C00C0000004000-000000067F000080000007C00C0000008000__0000009B5229DFE8", +"000000067F000080000007C00C0000007EA5-000000067F000080000007C00C00000115FE__000000997F5D23C9-00000099F1C9FC71", +"000000067F000080000007C00C0000008000-000000067F000080000007C00C000000C000__0000009B5229DFE8", +"000000067F000080000007C00C000000C000-000000067F000080000007C00C0000010000__0000009B5229DFE8", +"000000067F000080000007C00C0000010000-000000067F000080000007C00C0000014000__0000009B5229DFE8", +"000000067F000080000007C00C00000115FE-000000067F000080000007C00C000001AD0C__000000997F5D23C9-00000099F1C9FC71", +"000000067F000080000007C00C0000014000-000000067F000080000007C00C0000018000__0000009B5229DFE8", +"000000067F000080000007C00C0000018000-000000067F000080000007C00C000001C000__0000009B5229DFE8", +"000000067F000080000007C00C000001AD0C-000000067F000080000007C00C0000024472__000000997F5D23C9-00000099F1C9FC71", +"000000067F000080000007C00C000001C000-000000067F000080000007C00C0000020000__0000009B5229DFE8", +"000000067F000080000007C00C0000020000-000000067F000080000007C00C0000024000__0000009B5229DFE8", +"000000067F000080000007C00C0000024000-000000067F000080000007C00C0000028000__0000009B5229DFE8", +"000000067F000080000007C00C0000024472-000000067F000080000007C00C000002DBD8__000000997F5D23C9-00000099F1C9FC71", +"000000067F000080000007C00C0000028000-000000067F000080000007C00C000002C000__0000009B5229DFE8", +"000000067F000080000007C00C000002C000-000000067F000080000007C00C0000030000__0000009B5229DFE8", +"000000067F000080000007C00C000002DBD8-000000067F000080000007C00C000003732B__000000997F5D23C9-00000099F1C9FC71", +"000000067F000080000007C00C0000030000-000000067F000080000007C00C0000034000__0000009B5229DFE8", +"000000067F000080000007C00C0000034000-000000067F000080000007C00C0000038000__0000009B5229DFE8", +"000000067F000080000007C00C000003732B-000000067F000080000007C00C0000040A91__000000997F5D23C9-00000099F1C9FC71", +"000000067F000080000007C00C0000038000-000000067F000080000007C00C000003C000__0000009B5229DFE8", +"000000067F000080000007C00C000003C000-000000067F000080000007C00C0000040000__0000009B5229DFE8", +"000000067F000080000007C00C0000040000-000000067F000080000007C00C0000044000__0000009B40525F80", +"000000067F000080000007C00C0000040000-000000067F000080000007C00C0000044000__0000009C1E3799F0", +"000000067F000080000007C00C0000040A91-030000000000000000000000000000000002__000000997F5D23C9-00000099F1C9FC71", 
+"000000067F000080000007C00C0000042360-000000067F000080000007C00C000004BAC6__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C0000044000-000000067F000080000007C00C0000048000__0000009B40525F80", +"000000067F000080000007C00C0000044000-000000067F000080000007C00C0000048000__0000009C1E3799F0", +"000000067F000080000007C00C0000048000-000000067F000080000007C00C000004C000__0000009B40525F80", +"000000067F000080000007C00C0000048000-000000067F000080000007C00C000004C000__0000009C1E3799F0", +"000000067F000080000007C00C000004BAC6-000000067F000080000007C00C00000551FB__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C000004C000-000000067F000080000007C00C0000050000__0000009B40525F80", +"000000067F000080000007C00C000004C000-000000067F000080000007C00C0000050000__0000009C1E3799F0", +"000000067F000080000007C00C0000050000-000000067F000080000007C00C0000054000__0000009B40525F80", +"000000067F000080000007C00C0000050000-000000067F000080000007C00C0000054000__0000009C1E3799F0", +"000000067F000080000007C00C0000052AA4-000000067F000080000007C00C00000A4244__0000009BCB4E4461-0000009C1E8CC879", +"000000067F000080000007C00C0000054000-000000067F000080000007C00C0000058000__0000009B40525F80", +"000000067F000080000007C00C0000054000-000000067F000080000007C00C0000058000__0000009C1E3799F0", +"000000067F000080000007C00C00000551FB-000000067F000080000007C00C000005E90B__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C0000058000-000000067F000080000007C00C000005C000__0000009B40525F80", +"000000067F000080000007C00C0000058000-000000067F000080000007C00C000005C000__0000009C1E3799F0", +"000000067F000080000007C00C000005C000-000000067F000080000007C00C0000060000__0000009B40525F80", +"000000067F000080000007C00C000005C000-000000067F000080000007C00C0000060000__0000009C1E3799F0", +"000000067F000080000007C00C000005E90B-000000067F000080000007C00C000006802B__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C0000060000-000000067F000080000007C00C0000064000__0000009B40525F80", +"000000067F000080000007C00C0000060000-000000067F000080000007C00C0000064000__0000009C1E3799F0", +"000000067F000080000007C00C0000064000-000000067F000080000007C00C0000068000__0000009B40525F80", +"000000067F000080000007C00C0000064000-000000067F000080000007C00C0000068000__0000009C1E3799F0", +"000000067F000080000007C00C0000068000-000000067F000080000007C00C000006C000__0000009B40525F80", +"000000067F000080000007C00C0000068000-000000067F000080000007C00C000006C000__0000009C1E3799F0", +"000000067F000080000007C00C000006802B-000000067F000080000007C00C0000071782__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C000006C000-000000067F000080000007C00C0000070000__0000009B40525F80", +"000000067F000080000007C00C000006C000-000000067F000080000007C00C0000070000__0000009C1E3799F0", +"000000067F000080000007C00C0000070000-000000067F000080000007C00C0000074000__0000009B40525F80", +"000000067F000080000007C00C0000070000-000000067F000080000007C00C0000074000__0000009C1E3799F0", +"000000067F000080000007C00C0000071782-000000067F000080000007C00C000007AEE8__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C0000074000-000000067F000080000007C00C0000078000__0000009B40525F80", +"000000067F000080000007C00C0000074000-000000067F000080000007C00C0000078000__0000009C1E3799F0", +"000000067F000080000007C00C0000078000-000000067F000080000007C00C000007C000__0000009B40525F80", +"000000067F000080000007C00C0000078000-000000067F000080000007C00C000007C000__0000009C1E3799F0", 
+"000000067F000080000007C00C000007AEE8-000000067F000080000007C00C000008460B__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C000007C000-000000067F000080000007C00C0000080000__0000009B40525F80", +"000000067F000080000007C00C000007C000-000000067F000080000007C00C0000080000__0000009C1E3799F0", +"000000067F000080000007C00C0000080000-000000067F000080000007C00C0000084000__0000009B40525F80", +"000000067F000080000007C00C0000080000-000000067F000080000007C00C0000084000__0000009C1E3799F0", +"000000067F000080000007C00C0000084000-000000067F000080000007C00C0000088000__0000009B40525F80", +"000000067F000080000007C00C0000084000-000000067F000080000007C00C0000088000__0000009C1E3799F0", +"000000067F000080000007C00C000008460B-000000067F000080000007C00C000008DD71__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C0000088000-000000067F000080000007C00C000008C000__0000009B40525F80", +"000000067F000080000007C00C0000088000-000000067F000080000007C00C000008C000__0000009C1E3799F0", +"000000067F000080000007C00C000008C000-000000067F000080000007C00C0000090000__0000009B40525F80", +"000000067F000080000007C00C000008C000-000000067F000080000007C00C0000090000__0000009C1E3799F0", +"000000067F000080000007C00C000008DD71-000000067F000080000007C00C00000974D7__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C0000090000-000000067F000080000007C00C0000094000__0000009B40525F80", +"000000067F000080000007C00C0000090000-000000067F000080000007C00C0000094000__0000009C1E3799F0", +"000000067F000080000007C00C0000094000-000000067F000080000007C00C0000098000__0000009B40525F80", +"000000067F000080000007C00C0000094000-000000067F000080000007C00C0000098000__0000009C1E3799F0", +"000000067F000080000007C00C00000974D7-000000067F000080000007C00C00000A0C0B__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C0000098000-000000067F000080000007C00C000009C000__0000009B40525F80", +"000000067F000080000007C00C0000098000-000000067F000080000007C00C000009C000__0000009C1E3799F0", +"000000067F000080000007C00C000009C000-000000067F000080000007C00C00000A0000__0000009B40525F80", +"000000067F000080000007C00C000009C000-000000067F000080000007C00C00000A0000__0000009C1E3799F0", +"000000067F000080000007C00C00000A0000-000000067F000080000007C00C00000A4000__0000009B40525F80", +"000000067F000080000007C00C00000A0000-000000067F000080000007C00C00000A4000__0000009C1E3799F0", +"000000067F000080000007C00C00000A0C0B-000000067F000080000007C00C0100000000__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C00000A4000-000000067F000080000007C00C00000A8000__0000009B40525F80", +"000000067F000080000007C00C00000A4000-000000067F000080000007C00C00000A8000__0000009C1E3799F0", +"000000067F000080000007C00C00000A424C-000000067F000080000007C00C00000F5B43__0000009BCB4E4461-0000009C1E8CC879", +"000000067F000080000007C00C00000A8000-000000067F000080000007C00C00000AC000__0000009B40525F80", +"000000067F000080000007C00C00000A8000-000000067F000080000007C00C00000AC000__0000009C1E3799F0", +"000000067F000080000007C00C00000A9244-000000067F000080000007C00C00000B2991__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000AC000-000000067F000080000007C00C00000B0000__0000009B40525F80", +"000000067F000080000007C00C00000AC000-000000067F000080000007C00C00000B0000__0000009C1E3799F0", +"000000067F000080000007C00C00000B0000-000000067F000080000007C00C00000B4000__0000009B40525F80", +"000000067F000080000007C00C00000B0000-000000067F000080000007C00C00000B4000__0000009C1E3799F0", 
+"000000067F000080000007C00C00000B2991-000000067F000080000007C00C00000BC0F7__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000B4000-000000067F000080000007C00C00000B8000__0000009B40525F80", +"000000067F000080000007C00C00000B4000-000000067F000080000007C00C00000B8000__0000009C1E3799F0", +"000000067F000080000007C00C00000B8000-000000067F000080000007C00C00000BC000__0000009B40525F80", +"000000067F000080000007C00C00000B8000-000000067F000080000007C00C00000BC000__0000009C1E3799F0", +"000000067F000080000007C00C00000BA258-000000067F000080000007C01400000011E2__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F000080000007C00C00000BC000-000000067F000080000007C00C00000C0000__0000009B40525F80", +"000000067F000080000007C00C00000BC000-000000067F000080000007C00C00000C0000__0000009C1E3799F0", +"000000067F000080000007C00C00000BC0F7-000000067F000080000007C00C00000C580C__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000C0000-000000067F000080000007C00C00000C4000__0000009B40525F80", +"000000067F000080000007C00C00000C0000-000000067F000080000007C00C00000C4000__0000009C1E3799F0", +"000000067F000080000007C00C00000C4000-000000067F000080000007C00C00000C8000__0000009B40525F80", +"000000067F000080000007C00C00000C4000-000000067F000080000007C00C00000C8000__0000009C1E3799F0", +"000000067F000080000007C00C00000C580C-000000067F000080000007C00C00000CEF72__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000C8000-000000067F000080000007C00C00000CC000__0000009B40525F80", +"000000067F000080000007C00C00000C8000-000000067F000080000007C00C00000CC000__0000009C1E3799F0", +"000000067F000080000007C00C00000CC000-000000067F000080000007C00C00000D0000__0000009B40525F80", +"000000067F000080000007C00C00000CC000-000000067F000080000007C00C00000D0000__0000009C1E3799F0", +"000000067F000080000007C00C00000CEF72-000000067F000080000007C00C00000D86D8__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000D0000-000000067F000080000007C00C00000D4000__0000009B40525F80", +"000000067F000080000007C00C00000D0000-000000067F000080000007C00C00000D4000__0000009C1E3799F0", +"000000067F000080000007C00C00000D4000-000000067F000080000007C00C00000D8000__0000009B40525F80", +"000000067F000080000007C00C00000D4000-000000067F000080000007C00C00000D8000__0000009C1E3799F0", +"000000067F000080000007C00C00000D8000-000000067F000080000007C00C00000DC000__0000009B40525F80", +"000000067F000080000007C00C00000D8000-000000067F000080000007C00C00000DC000__0000009C1E3799F0", +"000000067F000080000007C00C00000D86D8-000000067F000080000007C00C00000E1E0B__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000DC000-000000067F000080000007C00C00000E0000__0000009B40525F80", +"000000067F000080000007C00C00000DC000-000000067F000080000007C00C00000E0000__0000009C1E3799F0", +"000000067F000080000007C00C00000E0000-000000067F000080000007C00C00000E4000__0000009B40525F80", +"000000067F000080000007C00C00000E0000-000000067F000080000007C00C00000E4000__0000009C1E3799F0", +"000000067F000080000007C00C00000E1E0B-000000067F000080000007C00C00000EB571__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000E4000-000000067F000080000007C00C00000E8000__0000009B40525F80", +"000000067F000080000007C00C00000E4000-000000067F000080000007C00C00000E8000__0000009C1E3799F0", +"000000067F000080000007C00C00000E8000-000000067F000080000007C00C00000EC000__0000009B40525F80", +"000000067F000080000007C00C00000E8000-000000067F000080000007C00C00000EC000__0000009C1E3799F0", 
+"000000067F000080000007C00C00000EB571-000000067F000080000007C00C00000F4CD7__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000EC000-000000067F000080000007C00C00000F0000__0000009B40525F80", +"000000067F000080000007C00C00000EC000-000000067F000080000007C00C00000F0000__0000009C1E3799F0", +"000000067F000080000007C00C00000F0000-000000067F000080000007C00C00000F4000__0000009B40525F80", +"000000067F000080000007C00C00000F0000-000000067F000080000007C00C00000F4000__0000009C1E3799F0", +"000000067F000080000007C00C00000F4000-000000067F000080000007C00C00000F8000__0000009B40525F80", +"000000067F000080000007C00C00000F4000-000000067F000080000007C00C00000F8000__0000009C1E3799F0", +"000000067F000080000007C00C00000F4CD7-000000067F000080000007C00C00000FE40B__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000F5B56-000000067F000080000007C014000000EB5A__0000009BCB4E4461-0000009C1E8CC879", +"000000067F000080000007C00C00000F8000-000000067F000080000007C00C00000FC000__0000009B40525F80", +"000000067F000080000007C00C00000F8000-000000067F000080000007C00C00000FC000__0000009C1E3799F0", +"000000067F000080000007C00C00000FC000-000000067F000080000007C00C0000100000__0000009B40525F80", +"000000067F000080000007C00C00000FC000-000000067F000080000007C00C0000100000__0000009C1E3799F0", +"000000067F000080000007C00C00000FE40B-000000067F000080000007C00C0000107B27__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C0000100000-000000067F000080000007C00C0000104000__0000009B40525F80", +"000000067F000080000007C00C0000100000-000000067F000080000007C00C0000104000__0000009C1E3799F0", +"000000067F000080000007C00C0000104000-000000067F000080000007C00C0000108000__0000009B40525F80", +"000000067F000080000007C00C0000104000-000000067F000080000007C00C0000108000__0000009C1E3799F0", +"000000067F000080000007C00C0000107B27-000000067F000080000007C00C000011128D__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C0000108000-000000067F000080000007C00C000010C000__0000009C1E3799F0", +"000000067F000080000007C00C0000108000-030000000000000000000000000000000002__0000009B40525F80", +"000000067F000080000007C00C000010C000-000000067F000080000007C00C0000110000__0000009C1E3799F0", +"000000067F000080000007C00C0000110000-000000067F000080000007C0120100000000__0000009C1E3799F0", +"000000067F000080000007C00C000011128D-010000000000000001000000040000000012__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C0140000000000-000000067F000080000007C0140000004000__0000009C1E3799F0", +"000000067F000080000007C01400000011E2-000000067F000080000007C0140000007F04__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F000080000007C0140000004000-000000067F000080000007C0140000008000__0000009C1E3799F0", +"000000067F000080000007C0140000007F04-000000067F000080000007C014000000EC12__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F000080000007C0140000008000-000000067F000080000007C014000000C000__0000009C1E3799F0", +"000000067F000080000007C014000000C000-000000067F000080000007C0140000010000__0000009C1E3799F0", +"000000067F000080000007C014000000EB5A-000000067F000080000007C0140000027B5C__0000009BCB4E4461-0000009C1E8CC879", +"000000067F000080000007C014000000EC12-000000067F000080000007C0140000015910__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F000080000007C0140000010000-000000067F000080000007C0140000014000__0000009C1E3799F0", +"000000067F000080000007C0140000014000-000000067F000080000007C0140000018000__0000009C1E3799F0", +"000000067F000080000007C0140000015910-000000067F000080000007C014000001C5BB__0000009B51A8BBB9-0000009BCB4E4461", 
+"000000067F000080000007C0140000018000-000000067F000080000007C014000001C000__0000009C1E3799F0", +"000000067F000080000007C014000001C000-000000067F000080000007C0140000020000__0000009C1E3799F0", +"000000067F000080000007C014000001C5BB-000000067F000080000007C0140000023298__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F000080000007C0140000020000-000000067F000080000007C0140000024000__0000009C1E3799F0", +"000000067F000080000007C0140000023298-000000067F000080000007C0140000029F9A__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F000080000007C0140000024000-000000067F000080000007C0140000028000__0000009C1E3799F0", +"000000067F000080000007C0140000027B5E-030000000000000000000000000000000002__0000009BCB4E4461-0000009C1E8CC879", +"000000067F000080000007C0140000028000-000000067F000080000007C014000002C000__0000009C1E3799F0", +"000000067F000080000007C0140000029F9A-030000000000000000000000000000000002__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F000080000007C014000002C000-030000000000000000000000000000000002__0000009C1E3799F0", +"000000067F000080000007E00C0000000000-000000067F000080000007E00C0000004000__0000009DEF760000", +"000000067F000080000007E00C0000004000-000000067F000080000007E00C0000008000__0000009DEF760000", +"000000067F000080000007E00C0000008000-000000067F000080000007E00C000000C000__0000009DEF760000", +"000000067F000080000007E00C00000092CD-000000067F000080000007E00C0000012A0A__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C000000C000-000000067F000080000007E00C0000010000__0000009DEF760000", +"000000067F000080000007E00C0000010000-000000067F000080000007E00C0000014000__0000009DEF760000", +"000000067F000080000007E00C0000012A0A-000000067F000080000007E00C000001C170__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C0000014000-000000067F000080000007E00C0000018000__0000009DEF760000", +"000000067F000080000007E00C0000018000-000000067F000080000007E00C000001C000__0000009DEF760000", +"000000067F000080000007E00C000001C000-000000067F000080000007E00C0000020000__0000009DEF760000", +"000000067F000080000007E00C000001C170-000000067F000080000007E00C00000258D6__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C0000020000-000000067F000080000007E00C0000024000__0000009DEF760000", +"000000067F000080000007E00C0000024000-000000067F000080000007E00C0000028000__0000009DEF760000", +"000000067F000080000007E00C00000258D6-000000067F000080000007E00C000002F00B__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C0000028000-000000067F000080000007E00C000002C000__0000009DEF760000", +"000000067F000080000007E00C000002C000-000000067F000080000007E00C0000030000__0000009DEF760000", +"000000067F000080000007E00C000002F00B-000000067F000080000007E00C0000038720__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C0000030000-000000067F000080000007E00C0000034000__0000009DEF760000", +"000000067F000080000007E00C0000034000-000000067F000080000007E00C0000038000__0000009DEF760000", +"000000067F000080000007E00C0000038000-000000067F000080000007E00C000003C000__0000009DEF760000", +"000000067F000080000007E00C0000038720-000000067F000080000007E00C0000041E86__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C000003C000-000000067F000080000007E00C0000040000__0000009DEF760000", +"000000067F000080000007E00C0000040000-000000067F000080000007E00C0000044000__0000009DEF760000", +"000000067F000080000007E00C0000041E86-000000067F000080000007E00C000004B5EC__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C0000044000-000000067F000080000007E00C0000048000__0000009DEF760000", 
+"000000067F000080000007E00C0000048000-000000067F000080000007E00C000004C000__0000009DDBE10620", +"000000067F000080000007E00C0000048000-000000067F000080000007E00C000004C000__0000009EBB11FFC0", +"000000067F000080000007E00C000004B5EC-030000000000000000000000000000000002__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C000004BACA-000000067F000080000007E00C00000551FF__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C000004C000-000000067F000080000007E00C0000050000__0000009DDBE10620", +"000000067F000080000007E00C000004C000-000000067F000080000007E00C0000050000__0000009EBB11FFC0", +"000000067F000080000007E00C0000050000-000000067F000080000007E00C0000054000__0000009DDBE10620", +"000000067F000080000007E00C0000050000-000000067F000080000007E00C0000054000__0000009EBB11FFC0", +"000000067F000080000007E00C0000054000-000000067F000080000007E00C0000058000__0000009DDBE10620", +"000000067F000080000007E00C0000054000-000000067F000080000007E00C0000058000__0000009EBB11FFC0", +"000000067F000080000007E00C00000551FF-000000067F000080000007E00C000005E90C__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C0000058000-000000067F000080000007E00C000005C000__0000009DDBE10620", +"000000067F000080000007E00C0000058000-000000067F000080000007E00C000005C000__0000009EBB11FFC0", +"000000067F000080000007E00C000005C000-000000067F000080000007E00C0000060000__0000009DDBE10620", +"000000067F000080000007E00C000005C000-000000067F000080000007E00C0000060000__0000009EBB11FFC0", +"000000067F000080000007E00C000005E90C-000000067F000080000007E00C000006802C__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C0000060000-000000067F000080000007E00C0000064000__0000009DDBE10620", +"000000067F000080000007E00C0000060000-000000067F000080000007E00C0000064000__0000009EBB11FFC0", +"000000067F000080000007E00C0000061AE1-000000067F000080000007E00C00000C2A6C__0000009E781A9731-0000009EBBC72771", +"000000067F000080000007E00C0000064000-000000067F000080000007E00C0000068000__0000009DDBE10620", +"000000067F000080000007E00C0000064000-000000067F000080000007E00C0000068000__0000009EBB11FFC0", +"000000067F000080000007E00C0000068000-000000067F000080000007E00C000006C000__0000009DDBE10620", +"000000067F000080000007E00C0000068000-000000067F000080000007E00C000006C000__0000009EBB11FFC0", +"000000067F000080000007E00C000006802C-000000067F000080000007E00C0000071783__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C000006C000-000000067F000080000007E00C0000070000__0000009DDBE10620", +"000000067F000080000007E00C000006C000-000000067F000080000007E00C0000070000__0000009EBB11FFC0", +"000000067F000080000007E00C0000070000-000000067F000080000007E00C0000074000__0000009DDBE10620", +"000000067F000080000007E00C0000070000-000000067F000080000007E00C0000074000__0000009EBB11FFC0", +"000000067F000080000007E00C0000071783-000000067F000080000007E00C000007AEE9__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C0000074000-000000067F000080000007E00C0000078000__0000009DDBE10620", +"000000067F000080000007E00C0000074000-000000067F000080000007E00C0000078000__0000009EBB11FFC0", +"000000067F000080000007E00C0000078000-000000067F000080000007E00C000007C000__0000009DDBE10620", +"000000067F000080000007E00C0000078000-000000067F000080000007E00C000007C000__0000009EBB11FFC0", +"000000067F000080000007E00C000007AEE9-000000067F000080000007E00C000008460B__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C000007C000-000000067F000080000007E00C0000080000__0000009DDBE10620", 
+"000000067F000080000007E00C000007C000-000000067F000080000007E00C0000080000__0000009EBB11FFC0", +"000000067F000080000007E00C0000080000-000000067F000080000007E00C0000084000__0000009DDBE10620", +"000000067F000080000007E00C0000080000-000000067F000080000007E00C0000084000__0000009EBB11FFC0", +"000000067F000080000007E00C0000084000-000000067F000080000007E00C0000088000__0000009DDBE10620", +"000000067F000080000007E00C0000084000-000000067F000080000007E00C0000088000__0000009EBB11FFC0", +"000000067F000080000007E00C000008460B-000000067F000080000007E00C000008DD71__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C0000088000-000000067F000080000007E00C000008C000__0000009DDBE10620", +"000000067F000080000007E00C0000088000-000000067F000080000007E00C000008C000__0000009EBB11FFC0", +"000000067F000080000007E00C000008C000-000000067F000080000007E00C0000090000__0000009DDBE10620", +"000000067F000080000007E00C000008C000-000000067F000080000007E00C0000090000__0000009EBB11FFC0", +"000000067F000080000007E00C000008DD71-000000067F000080000007E00C00000974D7__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C0000090000-000000067F000080000007E00C0000094000__0000009DDBE10620", +"000000067F000080000007E00C0000090000-000000067F000080000007E00C0000094000__0000009EBB11FFC0", +"000000067F000080000007E00C0000093E3A-000000067F000080000007E00C0000111CED__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E00C0000094000-000000067F000080000007E00C0000098000__0000009DDBE10620", +"000000067F000080000007E00C0000094000-000000067F000080000007E00C0000098000__0000009EBB11FFC0", +"000000067F000080000007E00C00000974D7-000000067F000080000007E00C00000A0C0B__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C0000098000-000000067F000080000007E00C000009C000__0000009DDBE10620", +"000000067F000080000007E00C0000098000-000000067F000080000007E00C000009C000__0000009EBB11FFC0", +"000000067F000080000007E00C000009C000-000000067F000080000007E00C00000A0000__0000009DDBE10620", +"000000067F000080000007E00C000009C000-000000067F000080000007E00C00000A0000__0000009EBB11FFC0", +"000000067F000080000007E00C00000A0000-000000067F000080000007E00C00000A4000__0000009DDBE10620", +"000000067F000080000007E00C00000A0000-000000067F000080000007E00C00000A4000__0000009EBB11FFC0", +"000000067F000080000007E00C00000A0C0B-000000067F000080000007E00C00000AA371__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C00000A4000-000000067F000080000007E00C00000A8000__0000009DDBE10620", +"000000067F000080000007E00C00000A4000-000000067F000080000007E00C00000A8000__0000009EBB11FFC0", +"000000067F000080000007E00C00000A8000-000000067F000080000007E00C00000AC000__0000009DDBE10620", +"000000067F000080000007E00C00000A8000-000000067F000080000007E00C00000AC000__0000009EBB11FFC0", +"000000067F000080000007E00C00000AA371-000000067F000080000007E00C0100000000__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C00000AC000-000000067F000080000007E00C00000B0000__0000009DDBE10620", +"000000067F000080000007E00C00000AC000-000000067F000080000007E00C00000B0000__0000009EBB11FFC0", +"000000067F000080000007E00C00000B0000-000000067F000080000007E00C00000B4000__0000009DDBE10620", +"000000067F000080000007E00C00000B0000-000000067F000080000007E00C00000B4000__0000009EBB11FFC0", +"000000067F000080000007E00C00000B2704-000000067F000080000007E00C00000BBE0F__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000B4000-000000067F000080000007E00C00000B8000__0000009DDBE10620", 
+"000000067F000080000007E00C00000B4000-000000067F000080000007E00C00000B8000__0000009EBB11FFC0", +"000000067F000080000007E00C00000B8000-000000067F000080000007E00C00000BC000__0000009DDBE10620", +"000000067F000080000007E00C00000B8000-000000067F000080000007E00C00000BC000__0000009EBB11FFC0", +"000000067F000080000007E00C00000BBE0F-000000067F000080000007E00C00000C5542__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000BC000-000000067F000080000007E00C00000C0000__0000009DDBE10620", +"000000067F000080000007E00C00000BC000-000000067F000080000007E00C00000C0000__0000009EBB11FFC0", +"000000067F000080000007E00C00000C0000-000000067F000080000007E00C00000C4000__0000009DDBE10620", +"000000067F000080000007E00C00000C0000-000000067F000080000007E00C00000C4000__0000009EBB11FFC0", +"000000067F000080000007E00C00000C2A75-000000067F000080000007E0140000004415__0000009E781A9731-0000009EBBC72771", +"000000067F000080000007E00C00000C4000-000000067F000080000007E00C00000C8000__0000009DDBE10620", +"000000067F000080000007E00C00000C4000-000000067F000080000007E00C00000C8000__0000009EBB11FFC0", +"000000067F000080000007E00C00000C5542-000000067F000080000007E00C00000CECA8__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000C8000-000000067F000080000007E00C00000CC000__0000009DDBE10620", +"000000067F000080000007E00C00000C8000-000000067F000080000007E00C00000CC000__0000009EBB11FFC0", +"000000067F000080000007E00C00000CC000-000000067F000080000007E00C00000D0000__0000009DDBE10620", +"000000067F000080000007E00C00000CC000-000000067F000080000007E00C00000D0000__0000009EBB11FFC0", +"000000067F000080000007E00C00000CECA8-000000067F000080000007E00C00000D83BF__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000D0000-000000067F000080000007E00C00000D4000__0000009DDBE10620", +"000000067F000080000007E00C00000D0000-000000067F000080000007E00C00000D4000__0000009EBB11FFC0", +"000000067F000080000007E00C00000D4000-000000067F000080000007E00C00000D8000__0000009DDBE10620", +"000000067F000080000007E00C00000D4000-000000067F000080000007E00C00000D8000__0000009EBB11FFC0", +"000000067F000080000007E00C00000D8000-000000067F000080000007E00C00000DC000__0000009DDBE10620", +"000000067F000080000007E00C00000D8000-000000067F000080000007E00C00000DC000__0000009EBB11FFC0", +"000000067F000080000007E00C00000D83BF-000000067F000080000007E00C00000E1B0A__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000DC000-000000067F000080000007E00C00000E0000__0000009DDBE10620", +"000000067F000080000007E00C00000DC000-000000067F000080000007E00C00000E0000__0000009EBB11FFC0", +"000000067F000080000007E00C00000E0000-000000067F000080000007E00C00000E4000__0000009DDBE10620", +"000000067F000080000007E00C00000E0000-000000067F000080000007E00C00000E4000__0000009EBB11FFC0", +"000000067F000080000007E00C00000E1B0A-000000067F000080000007E00C00000EB270__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000E4000-000000067F000080000007E00C00000E8000__0000009DDBE10620", +"000000067F000080000007E00C00000E4000-000000067F000080000007E00C00000E8000__0000009EBB11FFC0", +"000000067F000080000007E00C00000E8000-000000067F000080000007E00C00000EC000__0000009DDBE10620", +"000000067F000080000007E00C00000E8000-000000067F000080000007E00C00000EC000__0000009EBB11FFC0", +"000000067F000080000007E00C00000EB270-000000067F000080000007E00C00000F49AA__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000EC000-000000067F000080000007E00C00000F0000__0000009DDBE10620", 
+"000000067F000080000007E00C00000EC000-000000067F000080000007E00C00000F0000__0000009EBB11FFC0", +"000000067F000080000007E00C00000F0000-000000067F000080000007E00C00000F4000__0000009DDBE10620", +"000000067F000080000007E00C00000F0000-000000067F000080000007E00C00000F4000__0000009EBB11FFC0", +"000000067F000080000007E00C00000F4000-000000067F000080000007E00C00000F8000__0000009DDBE10620", +"000000067F000080000007E00C00000F4000-000000067F000080000007E00C00000F8000__0000009EBB11FFC0", +"000000067F000080000007E00C00000F49AA-000000067F000080000007E00C00000FE10A__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000F8000-000000067F000080000007E00C00000FC000__0000009DDBE10620", +"000000067F000080000007E00C00000F8000-000000067F000080000007E00C00000FC000__0000009EBB11FFC0", +"000000067F000080000007E00C00000FC000-000000067F000080000007E00C0000100000__0000009DDBE10620", +"000000067F000080000007E00C00000FC000-000000067F000080000007E00C0000100000__0000009EBB11FFC0", +"000000067F000080000007E00C00000FE10A-000000067F000080000007E00C000010782C__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C0000100000-000000067F000080000007E00C0000104000__0000009DDBE10620", +"000000067F000080000007E00C0000100000-000000067F000080000007E00C0000104000__0000009EBB11FFC0", +"000000067F000080000007E00C0000104000-000000067F000080000007E00C0000108000__0000009EBB11FFC0", +"000000067F000080000007E00C0000104000-030000000000000000000000000000000002__0000009DDBE10620", +"000000067F000080000007E00C000010782C-000000067F000080000007E00C0000110F88__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C0000108000-000000067F000080000007E00C000010C000__0000009EBB11FFC0", +"000000067F000080000007E00C000010C000-000000067F000080000007E00C0000110000__0000009EBB11FFC0", +"000000067F000080000007E00C0000110000-000000067F000080000007E0120100000000__0000009EBB11FFC0", +"000000067F000080000007E00C0000110F88-010000000000000001000000040000000015__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C0000111CED-000000067F000080000007E0140000004818__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E0140000000000-000000067F000080000007E0140000004000__0000009EBB11FFC0", +"000000067F000080000007E0140000004000-000000067F000080000007E0140000008000__0000009EBB11FFC0", +"000000067F000080000007E0140000004418-000000067F000080000007E0140000025351__0000009E781A9731-0000009EBBC72771", +"000000067F000080000007E0140000004818-000000067F000080000007E014000000AD57__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E0140000008000-000000067F000080000007E014000000C000__0000009EBB11FFC0", +"000000067F000080000007E014000000AD57-000000067F000080000007E0140000011291__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E014000000C000-000000067F000080000007E0140000010000__0000009EBB11FFC0", +"000000067F000080000007E0140000010000-000000067F000080000007E0140000014000__0000009EBB11FFC0", +"000000067F000080000007E0140000011291-000000067F000080000007E0140000017809__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E0140000014000-000000067F000080000007E0140000018000__0000009EBB11FFC0", +"000000067F000080000007E0140000017809-000000067F000080000007E014000001DD22__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E0140000018000-000000067F000080000007E014000001C000__0000009EBB11FFC0", +"000000067F000080000007E014000001C000-000000067F000080000007E0140000020000__0000009EBB11FFC0", +"000000067F000080000007E014000001DD22-000000067F000080000007E0140000024244__0000009DEEE6BFF9-0000009E781A9731", 
+"000000067F000080000007E0140000020000-000000067F000080000007E0140000024000__0000009EBB11FFC0", +"000000067F000080000007E0140000024000-000000067F000080000007E0140000028000__0000009EBB11FFC0", +"000000067F000080000007E0140000024244-000000067F000080000007E014000002A798__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E0140000025355-030000000000000000000000000000000002__0000009E781A9731-0000009EBBC72771", +"000000067F000080000007E0140000028000-000000067F000080000007E014000002C000__0000009EBB11FFC0", +"000000067F000080000007E014000002A798-030000000000000000000000000000000002__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E014000002C000-030000000000000000000000000000000002__0000009EBB11FFC0", +"000000067F000080000008000C00000081F6-000000067F000080000008000C0000010448__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000010448-000000067F000080000008000C000001870A__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C000001870A-000000067F000080000008000C0000020905__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000020905-000000067F000080000008000C0000028AF3__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000028AF3-000000067F000080000008000C0000030CEA__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000030CEA-000000067F000080000008000C0000038EB6__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000038EB6-000000067F000080000008000C00000410B5__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000410B5-000000067F000080000008000C00000492CB__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000492CB-000000067F000080000008000C00000514F8__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000514F8-000000067F000080000008000C000005977B__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C000005977B-000000067F000080000008000C00000619C6__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000619C6-000000067F000080000008000C0000069B6B__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000069B6B-000000067F000080000008000C0000071DBE__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000071DBE-000000067F000080000008000C0000079F8E__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000079F8E-000000067F000080000008000C00000821D7__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000821D7-000000067F000080000008000C000008A3AB__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C000008A3AB-000000067F000080000008000C0000092556__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000092556-000000067F000080000008000C000009A744__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C000009A744-000000067F000080000008000C00000A29B0__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000A29B0-000000067F000080000008000C00000AAC4B__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000AAC4B-000000067F000080000008000C00000B2E21__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000B2E21-000000067F000080000008000C00000BB0DB__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000BB0DB-000000067F000080000008000C00000C331B__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000C331B-000000067F000080000008000C00000CB4D2__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000CB4D2-000000067F000080000008000C00000D3754__0000009EBBC72771-000000A154401909", 
+"000000067F000080000008000C00000D3754-000000067F000080000008000C00000DB9C6__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000DB9C6-000000067F000080000008000C00000E3BC1__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000E3BC1-000000067F000080000008000C00000EBE00__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000EBE00-000000067F000080000008000C00000F3F63__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000F3F63-000000067F000080000008000C00000FC160__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000FC160-000000067F000080000008000C0000104448__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000104448-000000067F000080000008000C000010C675__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C000010C675-000000067F000080000008000C020000000B__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C020000000B-000000067F00008000000800140000003ED1__0000009EBBC72771-000000A154401909", +"000000067F00008000000800140000003ED1-000000067F00008000000800140000009486__0000009EBBC72771-000000A154401909", +"000000067F00008000000800140000009486-000000067F0000800000080014000000EA73__0000009EBBC72771-000000A154401909", +"000000067F0000800000080014000000EA73-000000067F0000800000080014000001404D__0000009EBBC72771-000000A154401909", +"000000067F0000800000080014000001404D-000000067F000080000008001400000195A4__0000009EBBC72771-000000A154401909", +"000000067F000080000008001400000195A4-000000067F0000800000080014000001EBB4__0000009EBBC72771-000000A154401909", +"000000067F0000800000080014000001EBB4-000000067F000080000008001400000241E2__0000009EBBC72771-000000A154401909", +"000000067F000080000008001400000241E2-000000067F00008000000800140000029762__0000009EBBC72771-000000A154401909", +"000000067F00008000000800140000029762-030000000000000000000000000000000002__0000009EBBC72771-000000A154401909", +"000000067F000080000008200C0000000000-000000067F000080000008200C0000004000__000000A29F1D8950", +"000000067F000080000008200C0000004000-000000067F000080000008200C0000008000__000000A29F1D8950", +"000000067F000080000008200C0000008000-000000067F000080000008200C000000C000__000000A29F1D8950", +"000000067F000080000008200C000000974D-000000067F000080000008200C0000012EB3__000000A154401909-000000A1E407F839", +"000000067F000080000008200C000000C000-000000067F000080000008200C0000010000__000000A29F1D8950", +"000000067F000080000008200C0000010000-000000067F000080000008200C0000014000__000000A29F1D8950", +"000000067F000080000008200C0000012EB3-000000067F000080000008200C000001C60A__000000A154401909-000000A1E407F839", +"000000067F000080000008200C0000014000-000000067F000080000008200C0000018000__000000A29F1D8950", +"000000067F000080000008200C0000018000-000000067F000080000008200C000001C000__000000A29F1D8950", +"000000067F000080000008200C000001C000-000000067F000080000008200C0000020000__000000A29F1D8950", +"000000067F000080000008200C000001C60A-000000067F000080000008200C0000025D38__000000A154401909-000000A1E407F839", +"000000067F000080000008200C0000020000-000000067F000080000008200C0000024000__000000A29F1D8950", +"000000067F000080000008200C0000024000-000000067F000080000008200C0000028000__000000A29F1D8950", +"000000067F000080000008200C0000025D38-000000067F000080000008200C000002F49E__000000A154401909-000000A1E407F839", +"000000067F000080000008200C0000028000-000000067F000080000008200C000002C000__000000A29F1D8950", +"000000067F000080000008200C000002C000-000000067F000080000008200C0000030000__000000A29F1D8950", 
+"000000067F000080000008200C000002F49E-000000067F000080000008200C0000038BB1__000000A154401909-000000A1E407F839", +"000000067F000080000008200C0000030000-000000067F000080000008200C0000034000__000000A29F1D8950", +"000000067F000080000008200C0000034000-000000067F000080000008200C0000038000__000000A29F1D8950", +"000000067F000080000008200C0000038000-000000067F000080000008200C000003C000__000000A29F1D8950", +"000000067F000080000008200C0000038BB1-000000067F000080000008200C0000042317__000000A154401909-000000A1E407F839", +"000000067F000080000008200C000003C000-000000067F000080000008200C0000040000__000000A29F1D8950", +"000000067F000080000008200C0000040000-000000067F000080000008200C0000044000__000000A29F1D8950", +"000000067F000080000008200C0000042317-000000067F000080000008200C000004BA7D__000000A154401909-000000A1E407F839", +"000000067F000080000008200C0000044000-000000067F000080000008200C0000048000__000000A29F1D8950", +"000000067F000080000008200C0000048000-000000067F000080000008200C000004C000__000000A29F1D8950", +"000000067F000080000008200C000004BA7D-000000067F000080000008200C00000551B2__000000A154401909-000000A1E407F839", +"000000067F000080000008200C000004C000-000000067F000080000008200C0000050000__000000A29F1D8950", +"000000067F000080000008200C0000050000-000000067F000080000008200C0000054000__000000A29F1D8950", +"000000067F000080000008200C0000054000-000000067F000080000008200C0000058000__000000A29F1D8950", +"000000067F000080000008200C00000551B2-030000000000000000000000000000000002__000000A154401909-000000A1E407F839", +"000000067F000080000008200C0000058000-000000067F000080000008200C000005C000__000000A29F1D8950", +"000000067F000080000008200C000005C000-000000067F000080000008200C0000060000__000000A29F1D8950", +"000000067F000080000008200C000005D8FE-000000067F000080000008200C000006700C__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C0000060000-000000067F000080000008200C0000064000__000000A29F1D8950", +"000000067F000080000008200C0000064000-000000067F000080000008200C0000068000__000000A29F1D8950", +"000000067F000080000008200C000006700C-000000067F000080000008200C000007076D__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C0000068000-000000067F000080000008200C000006C000__000000A29F1D8950", +"000000067F000080000008200C000006C000-000000067F000080000008200C0000070000__000000A29F1D8950", +"000000067F000080000008200C0000070000-000000067F000080000008200C0000074000__000000A29F1D8950", +"000000067F000080000008200C000007076D-000000067F000080000008200C0000079ED3__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C0000074000-000000067F000080000008200C0000078000__000000A29F1D8950", +"000000067F000080000008200C0000078000-000000067F000080000008200C000007C000__000000A29F1D8950", +"000000067F000080000008200C0000079ED3-000000067F000080000008200C000008360A__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C000007C000-000000067F000080000008200C0000080000__000000A29F1D8950", +"000000067F000080000008200C0000080000-000000067F000080000008200C0000084000__000000A29F1D8950", +"000000067F000080000008200C000008360A-000000067F000080000008200C000008CD70__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C0000084000-000000067F000080000008200C0000088000__000000A29F1D8950", +"000000067F000080000008200C0000088000-000000067F000080000008200C000008C000__000000A29F1D8950", +"000000067F000080000008200C000008C000-000000067F000080000008200C0000090000__000000A29F1D8950", +"000000067F000080000008200C000008CD70-000000067F000080000008200C00000964D6__000000A1E407F839-000000A323C9E001", 
+"000000067F000080000008200C0000090000-000000067F000080000008200C0000094000__000000A29F1D8950", +"000000067F000080000008200C0000094000-000000067F000080000008200C0000098000__000000A29F1D8950", +"000000067F000080000008200C00000964D6-000000067F000080000008200C000009FC0B__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C0000098000-000000067F000080000008200C000009C000__000000A29F1D8950", +"000000067F000080000008200C000009C000-000000067F000080000008200C00000A0000__000000A29F1D8950", +"000000067F000080000008200C000009FC0B-000000067F000080000008200C00000A9319__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000A0000-000000067F000080000008200C00000A4000__000000A29F1D8950", +"000000067F000080000008200C00000A4000-000000067F000080000008200C00000A8000__000000A29F1D8950", +"000000067F000080000008200C00000A8000-000000067F000080000008200C00000AC000__000000A29F1D8950", +"000000067F000080000008200C00000A9319-000000067F000080000008200C00000B2A7F__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000AC000-000000067F000080000008200C00000B0000__000000A29F1D8950", +"000000067F000080000008200C00000B0000-000000067F000080000008200C00000B4000__000000A29F1D8950", +"000000067F000080000008200C00000B2A7F-000000067F000080000008200C00000BC1E5__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000B4000-000000067F000080000008200C00000B8000__000000A29F1D8950", +"000000067F000080000008200C00000B8000-000000067F000080000008200C00000BC000__000000A29F1D8950", +"000000067F000080000008200C00000BC000-000000067F000080000008200C00000C0000__000000A29F1D8950", +"000000067F000080000008200C00000BC1E5-000000067F000080000008200C00000C590C__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000C0000-010000000000000000000000000000000001__000000A29F1D8950", +"000000067F000080000008200C00000C590C-000000067F000080000008200C00000CF071__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000CF071-000000067F000080000008200C00000D8786__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000D8786-000000067F000080000008200C00000E1EEC__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000E1EEC-000000067F000080000008200C00000EB60C__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000EB60C-000000067F000080000008200C00000F4D43__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000F4D43-000000067F000080000008200C00000FE4A9__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000FE4A9-000000067F000080000008200C0000107BC5__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C0000107BC5-000000067F000080000008200C000011130B__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C000011130B-01000000000000000100000004000000001C__000000A1E407F839-000000A323C9E001", +"000000067F0000800000082014000000393C-000000067F0000800000082014000000B84D__000000A323C9E001-000000A37A60B1A9", +"000000067F0000800000082014000000B84D-000000067F0000800000082014000001375E__000000A323C9E001-000000A37A60B1A9", +"000000067F0000800000082014000001375E-000000067F0000800000082014000001B66D__000000A323C9E001-000000A37A60B1A9", +"000000067F0000800000082014000001B66D-000000067F0000800000082014000002357E__000000A323C9E001-000000A37A60B1A9", +"000000067F0000800000082014000002357E-000000067F0000800000082014000002B48D__000000A323C9E001-000000A37A60B1A9", +"000000067F0000800000082014000002B48D-030000000000000000000000000000000002__000000A323C9E001-000000A37A60B1A9", 
+"000000067F000080000008600C0000000000-000000067F000080000008600C0000004000__000000A434813A68", +"000000067F000080000008600C0000004000-000000067F000080000008600C0000008000__000000A434813A68", +"000000067F000080000008600C0000008000-000000067F000080000008600C000000C000__000000A434813A68", +"000000067F000080000008600C0000009747-000000067F000080000008600C0000012EAD__000000A37A60B1A9-000000A3CA47ECA9", +"000000067F000080000008600C000000C000-000000067F000080000008600C0000010000__000000A434813A68", +"000000067F000080000008600C0000010000-000000067F000080000008600C0000014000__000000A434813A68", +"000000067F000080000008600C0000012EAD-000000067F000080000008600C000001C60A__000000A37A60B1A9-000000A3CA47ECA9", +"000000067F000080000008600C0000014000-000000067F000080000008600C0000018000__000000A434813A68", +"000000067F000080000008600C0000018000-000000067F000080000008600C000001C000__000000A434813A68", +"000000067F000080000008600C000001C000-000000067F000080000008600C0000020000__000000A434813A68", +"000000067F000080000008600C000001C60A-000000067F000080000008600C0000025D38__000000A37A60B1A9-000000A3CA47ECA9", +"000000067F000080000008600C0000020000-000000067F000080000008600C0000024000__000000A434813A68", +"000000067F000080000008600C0000024000-000000067F000080000008600C0000028000__000000A434813A68", +"000000067F000080000008600C0000025D38-000000067F000080000008600C000002F49E__000000A37A60B1A9-000000A3CA47ECA9", +"000000067F000080000008600C0000028000-000000067F000080000008600C000002C000__000000A434813A68", +"000000067F000080000008600C000002C000-000000067F000080000008600C0000030000__000000A434813A68", +"000000067F000080000008600C000002F49E-030000000000000000000000000000000002__000000A37A60B1A9-000000A3CA47ECA9", +"000000067F000080000008600C000002F4CA-000000067F000080000008600C0000038BDD__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C0000030000-000000067F000080000008600C0000034000__000000A434813A68", +"000000067F000080000008600C0000034000-000000067F000080000008600C0000038000__000000A434813A68", +"000000067F000080000008600C0000038000-000000067F000080000008600C000003C000__000000A434813A68", +"000000067F000080000008600C0000038BDD-000000067F000080000008600C000004230B__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C000003C000-000000067F000080000008600C0000040000__000000A434813A68", +"000000067F000080000008600C0000040000-000000067F000080000008600C0000044000__000000A434813A68", +"000000067F000080000008600C000004230B-000000067F000080000008600C000004BA71__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C0000044000-000000067F000080000008600C0000048000__000000A434813A68", +"000000067F000080000008600C0000048000-000000067F000080000008600C000004C000__000000A434813A68", +"000000067F000080000008600C000004BA71-000000067F000080000008600C00000551A6__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C000004C000-000000067F000080000008600C0000050000__000000A434813A68", +"000000067F000080000008600C0000050000-000000067F000080000008600C0000054000__000000A434813A68", +"000000067F000080000008600C0000054000-000000067F000080000008600C0000058000__000000A434813A68", +"000000067F000080000008600C00000551A6-000000067F000080000008600C000005E90A__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C0000058000-000000067F000080000008600C000005C000__000000A434813A68", +"000000067F000080000008600C000005C000-000000067F000080000008600C0000060000__000000A434813A68", +"000000067F000080000008600C000005E90A-000000067F000080000008600C000006802C__000000A3CA47ECA9-000000A539BDE561", 
+"000000067F000080000008600C0000060000-000000067F000080000008600C0000064000__000000A434813A68", +"000000067F000080000008600C0000064000-000000067F000080000008600C0000068000__000000A434813A68", +"000000067F000080000008600C0000068000-000000067F000080000008600C000006C000__000000A434813A68", +"000000067F000080000008600C000006802C-000000067F000080000008600C0000071783__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C000006C000-030000000000000000000000000000000002__000000A434813A68", +"000000067F000080000008600C0000071783-000000067F000080000008600C000007AEE9__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C000007AEE9-000000067F000080000008600C000008460B__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C000008460B-000000067F000080000008600C000008DD71__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C000008DD71-000000067F000080000008600C00000974D7__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000974D7-000000067F000080000008600C00000A0C0B__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000A0C0B-000000067F000080000008600C00000AA371__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000AA371-000000067F000080000008600C00000B3AD7__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000B3AD7-000000067F000080000008600C00000BD20B__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000BD20B-000000067F000080000008600C00000C6932__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000C6932-000000067F000080000008600C00000D0098__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000D0098-000000067F000080000008600C00000D97FE__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000D97FE-000000067F000080000008600C00000E2F0B__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000E2F0B-000000067F000080000008600C00000EC671__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000EC671-000000067F000080000008600C00000F5D9F__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000F5D9F-000000067F000080000008600C00000FF505__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000FF505-000000067F000080000008600C0000108C10__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C0000108C10-000000067F000080000008600C0100000000__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C000010ECC4-000000067F00008000000860140000002607__000000A539BDE561-000000A5A081B661", +"000000067F00008000000860140000002607-000000067F0000800000086014000000A518__000000A539BDE561-000000A5A081B661", +"000000067F0000800000086014000000A518-000000067F00008000000860140000012429__000000A539BDE561-000000A5A081B661", +"000000067F00008000000860140000012429-000000067F0000800000086014000001A338__000000A539BDE561-000000A5A081B661", +"000000067F0000800000086014000001A338-000000067F00008000000860140000022249__000000A539BDE561-000000A5A081B661", +"000000067F00008000000860140000022249-000000067F0000800000086014000002A159__000000A539BDE561-000000A5A081B661", +"000000067F0000800000086014000002A159-030000000000000000000000000000000002__000000A539BDE561-000000A5A081B661", +"000000067F000080000008801C0000009703-000000067F000080000008801C0000012E0E__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C0000012E0E-000000067F000080000008801C000001C574__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C000001C574-000000067F000080000008801C0000025CDA__000000A5A081B661-000000A6503DE919", 
+"000000067F000080000008801C0000025CDA-000000067F000080000008801C000002F40A__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C000002F40A-000000067F000080000008801C0000038B1D__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C0000038B1D-000000067F000080000008801C0000042283__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C0000042283-000000067F000080000008801C000004B9E9__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C000004B9E9-000000067F000080000008801C000005510B__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C000005510B-000000067F000080000008801C000005E871__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C000005E871-000000067F000080000008801C0000067F8B__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C0000067F8B-030000000000000000000000000000000002__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C0000068000-000000067F000080000008801C000006C000__000000A76EC5DFE8", +"000000067F000080000008801C00000680F7-000000067F000080000008801C000007180C__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C000006C000-000000067F000080000008801C0000070000__000000A76EC5DFE8", +"000000067F000080000008801C0000070000-000000067F000080000008801C0000074000__000000A76EC5DFE8", +"000000067F000080000008801C000007180C-000000067F000080000008801C000007AF72__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C0000074000-000000067F000080000008801C0000078000__000000A76EC5DFE8", +"000000067F000080000008801C0000078000-000000067F000080000008801C000007C000__000000A76F097A80", +"000000067F000080000008801C000007AF72-000000067F000080000008801C00000846D8__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C000007C000-000000067F000080000008801C0000080000__000000A76F097A80", +"000000067F000080000008801C0000080000-000000067F000080000008801C0000084000__000000A76F097A80", +"000000067F000080000008801C0000084000-000000067F000080000008801C0000088000__000000A76F097A80", +"000000067F000080000008801C00000846D8-000000067F000080000008801C000008DE0B__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C0000088000-000000067F000080000008801C000008C000__000000A76F097A80", +"000000067F000080000008801C000008C000-000000067F000080000008801C0000090000__000000A76F097A80", +"000000067F000080000008801C000008DE0B-000000067F000080000008801C000009752B__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C0000090000-000000067F000080000008801C0000094000__000000A76F097A80", +"000000067F000080000008801C0000094000-000000067F000080000008801C0000098000__000000A76F097A80", +"000000067F000080000008801C000009752B-000000067F000080000008801C00000A0C91__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C0000098000-000000067F000080000008801C000009C000__000000A76F097A80", +"000000067F000080000008801C000009C000-000000067F000080000008801C00000A0000__000000A76F097A80", +"000000067F000080000008801C00000A0000-000000067F000080000008801C00000A4000__000000A76F097A80", +"000000067F000080000008801C00000A0C91-000000067F000080000008801C00000AA3F7__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C00000A4000-000000067F000080000008801C00000A8000__000000A76F097A80", +"000000067F000080000008801C00000A8000-000000067F000080000008801C00000AC000__000000A76F097A80", +"000000067F000080000008801C00000AA3F7-000000067F000080000008801C00000B3B0C__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C00000AC000-000000067F000080000008801C00000B0000__000000A76F097A80", 
+"000000067F000080000008801C00000B0000-000000067F000080000008801C00000B4000__000000A76F097A80", +"000000067F000080000008801C00000B3B0C-000000067F000080000008801C00000BD272__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C00000B4000-000000067F000080000008801C00000B8000__000000A76F097A80", +"000000067F000080000008801C00000B8000-000000067F000080000008801C00000BC000__000000A76F097A80", +"000000067F000080000008801C00000BC000-000000067F000080000008801C00000C0000__000000A76F097A80", +"000000067F000080000008801C00000BD272-000000067F000080000008801C00000C6999__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C00000C0000-000000067F000080000008801C00000C4000__000000A76F097A80", +"000000067F000080000008801C00000C4000-000000067F000080000008801C00000C8000__000000A76F097A80", +"000000067F000080000008801C00000C6999-000000067F000080000008801C0100000000__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C00000C8000-000000067F000080000008801C00000CC000__000000A76F097A80", +"000000067F000080000008801C00000CC000-000000067F000080000008801C00000D0000__000000A76F097A80", +"000000067F000080000008801C00000CF6B0-000000067F000080000008801C00000D8DC1__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008801C00000D0000-000000067F000080000008801C00000D4000__000000A76F097A80", +"000000067F000080000008801C00000D4000-000000067F000080000008801C00000D8000__000000A76F097A80", +"000000067F000080000008801C00000D8000-000000067F000080000008801C00000DC000__000000A76F097A80", +"000000067F000080000008801C00000D8DC1-000000067F000080000008801C00000E250B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008801C00000DC000-000000067F000080000008801C00000E0000__000000A76F097A80", +"000000067F000080000008801C00000E0000-000000067F000080000008801C00000E4000__000000A76F097A80", +"000000067F000080000008801C00000E250B-000000067F000080000008801C00000EBC71__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008801C00000E4000-000000067F000080000008801C00000E8000__000000A76F097A80", +"000000067F000080000008801C00000E8000-000000067F000080000008801C00000EC000__000000A76F097A80", +"000000067F000080000008801C00000EBC71-000000067F000080000008801C00000F53A5__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008801C00000EC000-000000067F000080000008801C00000F0000__000000A76F097A80", +"000000067F000080000008801C00000F0000-000000067F000080000008801C00000F4000__000000A76F097A80", +"000000067F000080000008801C00000F4000-000000067F000080000008801C00000F8000__000000A76F097A80", +"000000067F000080000008801C00000F53A5-000000067F000080000008801C00000FEB0B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008801C00000F8000-000000067F000080000008801C00000FC000__000000A76F097A80", +"000000067F000080000008801C00000FC000-000000067F000080000008801C0000100000__000000A76F097A80", +"000000067F000080000008801C00000FEB0B-000000067F000080000008801C000010822C__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008801C0000100000-000000067F000080000008801C0000104000__000000A76F097A80", +"000000067F000080000008801C0000104000-000000067F000080000008801C0000108000__000000A76F097A80", +"000000067F000080000008801C0000108000-000000067F000080000008801C000010C000__000000A76F097A80", +"000000067F000080000008801C000010822C-000000067F000080000008801C0000111982__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008801C000010C000-000000067F000080000008801C0000110000__000000A76F097A80", +"000000067F000080000008801C0000110000-030000000000000000000000000000000002__000000A76F097A80", 
+"000000067F000080000008801C0000111982-000000067F000080000008A00C00000084EA__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000084EA-000000067F000080000008A00C0000011C0C__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000011C0C-000000067F000080000008A00C000001B372__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C000001B372-000000067F000080000008A00C0000024AD8__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000024AD8-000000067F000080000008A00C000002E20B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C000002E20B-000000067F000080000008A00C0000037928__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000037928-000000067F000080000008A00C000004108E__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C000004108E-000000067F000080000008A00C000004A7F4__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C000004A7F4-000000067F000080000008A00C0000053F0B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000053F0B-000000067F000080000008A00C000005D671__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C000005D671-000000067F000080000008A00C0000066D95__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000066D95-000000067F000080000008A00C00000704FB__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000704FB-000000067F000080000008A00C0000079C0B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000079C0B-000000067F000080000008A00C0000083351__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000083351-000000067F000080000008A00C000008CAB7__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C000008CAB7-000000067F000080000008A00C00000961E2__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000961E2-000000067F000080000008A00C000009F90B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C000009F90B-000000067F000080000008A00C00000A902B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000A902B-000000067F000080000008A00C00000B2779__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000B2779-000000067F000080000008A00C00000BBEDF__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000BBEDF-000000067F000080000008A00C00000C560A__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000C560A-000000067F000080000008A00C00000CED70__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000CED70-000000067F000080000008A00C00000D84D6__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000D84D6-000000067F000080000008A00C00000E1C0A__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000E1C0A-000000067F000080000008A00C00000EB370__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000EB370-000000067F000080000008A00C00000F4AD6__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000F4AD6-000000067F000080000008A00C00000FE20B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000FE20B-030000000000000000000000000000000002__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000104A0C-000000067F000080000008A00C000010DF6E__000000A91D97FD49-000000A98AB7EE49", +"000000067F000080000008A00C000010DF6E-000000067F000080000008A0140000001A21__000000A91D97FD49-000000A98AB7EE49", +"000000067F000080000008A0140000001A21-000000067F000080000008A0140000009932__000000A91D97FD49-000000A98AB7EE49", 
+"000000067F000080000008A0140000009932-000000067F000080000008A0140000011843__000000A91D97FD49-000000A98AB7EE49", +"000000067F000080000008A0140000011843-000000067F000080000008A0140000019753__000000A91D97FD49-000000A98AB7EE49", +"000000067F000080000008A0140000019753-000000067F000080000008A0140000021664__000000A91D97FD49-000000A98AB7EE49", +"000000067F000080000008A0140000021664-01000000000000000100000004000000001C__000000A91D97FD49-000000A98AB7EE49", +"000000067F000080000008C00C0000000000-000000067F000080000008C00C0000004000__000000AAEBE534F8", +"000000067F000080000008C00C0000002330-000000067F000080000008C00C000000BA96__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C0000004000-000000067F000080000008C00C0000008000__000000AAEBE534F8", +"000000067F000080000008C00C0000008000-000000067F000080000008C00C000000C000__000000AAEBE534F8", +"000000067F000080000008C00C000000BA96-000000067F000080000008C00C00000151CB__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C000000C000-000000067F000080000008C00C0000010000__000000AAEBE534F8", +"000000067F000080000008C00C0000010000-000000067F000080000008C00C0000014000__000000AAEBE534F8", +"000000067F000080000008C00C0000014000-000000067F000080000008C00C0000018000__000000AAEBE534F8", +"000000067F000080000008C00C00000151CB-000000067F000080000008C00C000001E90B__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C0000018000-000000067F000080000008C00C000001C000__000000AAEBE534F8", +"000000067F000080000008C00C000001C000-000000067F000080000008C00C0000020000__000000AAEBE534F8", +"000000067F000080000008C00C000001E90B-000000067F000080000008C00C000002802C__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C0000020000-000000067F000080000008C00C0000024000__000000AAEBE534F8", +"000000067F000080000008C00C0000024000-000000067F000080000008C00C0000028000__000000AAEBE534F8", +"000000067F000080000008C00C0000028000-000000067F000080000008C00C000002C000__000000AAEBE534F8", +"000000067F000080000008C00C000002802C-000000067F000080000008C00C0000031783__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C000002C000-000000067F000080000008C00C0000030000__000000AAEBE534F8", +"000000067F000080000008C00C0000030000-000000067F000080000008C00C0000034000__000000AAEBE534F8", +"000000067F000080000008C00C0000031783-000000067F000080000008C00C000003AEE9__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C0000034000-000000067F000080000008C00C0000038000__000000AAEBE534F8", +"000000067F000080000008C00C0000038000-000000067F000080000008C00C000003C000__000000AAEBE534F8", +"000000067F000080000008C00C000003AEE9-000000067F000080000008C00C000004460B__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C000003C000-000000067F000080000008C00C0000040000__000000AAEBE534F8", +"000000067F000080000008C00C0000040000-000000067F000080000008C00C0000044000__000000AAEBE534F8", +"000000067F000080000008C00C0000044000-000000067F000080000008C00C0000048000__000000AAEBE534F8", +"000000067F000080000008C00C000004460B-000000067F000080000008C00C000004DD71__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C0000048000-000000067F000080000008C00C000004C000__000000AAEBE534F8", +"000000067F000080000008C00C000004C000-000000067F000080000008C00C0000050000__000000AAEBE534F8", +"000000067F000080000008C00C000004DD71-030000000000000000000000000000000002__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C0000050000-000000067F000080000008C00C0000054000__000000AAEBE534F8", 
+"000000067F000080000008C00C0000054000-000000067F000080000008C00C0000058000__000000AAEBE534F8", +"000000067F000080000008C00C0000058000-000000067F000080000008C00C000005C000__000000AAEBE534F8", +"000000067F000080000008C00C000005C000-000000067F000080000008C00C0000060000__000000AAEBE534F8", +"000000067F000080000008C00C000005DA8C-000000067F000080000008C00C00000671AE__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C0000060000-000000067F000080000008C00C0000064000__000000AAEBE534F8", +"000000067F000080000008C00C0000064000-000000067F000080000008C00C0000068000__000000AAEBE534F8", +"000000067F000080000008C00C00000671AE-000000067F000080000008C00C000007090A__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C0000068000-000000067F000080000008C00C000006C000__000000AAEBE534F8", +"000000067F000080000008C00C000006C000-000000067F000080000008C00C0000070000__000000AAEBE534F8", +"000000067F000080000008C00C0000070000-000000067F000080000008C00C0000074000__000000AAEBE534F8", +"000000067F000080000008C00C000007090A-000000067F000080000008C00C000007A070__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C0000074000-000000067F000080000008C00C0000078000__000000AAEBE534F8", +"000000067F000080000008C00C0000078000-000000067F000080000008C00C000007C000__000000AAEBE534F8", +"000000067F000080000008C00C000007A070-000000067F000080000008C00C00000837B4__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C000007C000-000000067F000080000008C00C0000080000__000000AAEBE534F8", +"000000067F000080000008C00C0000080000-000000067F000080000008C00C0000084000__000000AAEBE534F8", +"000000067F000080000008C00C00000837B4-000000067F000080000008C00C000008CF0A__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C0000084000-000000067F000080000008C00C0000088000__000000AAEBE534F8", +"000000067F000080000008C00C0000088000-000000067F000080000008C00C000008C000__000000AAEBE534F8", +"000000067F000080000008C00C000008C000-000000067F000080000008C00C0000090000__000000AAEBE534F8", +"000000067F000080000008C00C000008CF0A-000000067F000080000008C00C0000096670__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C0000090000-000000067F000080000008C00C0000094000__000000AAEBE534F8", +"000000067F000080000008C00C0000094000-000000067F000080000008C00C0000098000__000000AAEBE534F8", +"000000067F000080000008C00C0000096670-000000067F000080000008C00C000009FDD6__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C0000098000-000000067F000080000008C00C000009C000__000000AAEBE534F8", +"000000067F000080000008C00C000009C000-000000067F000080000008C00C00000A0000__000000AAEBE534F8", +"000000067F000080000008C00C000009FDD6-000000067F000080000008C00C00000A952A__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000A0000-000000067F000080000008C00C00000A4000__000000AAEBE534F8", +"000000067F000080000008C00C00000A4000-000000067F000080000008C00C00000A8000__000000AAEBE534F8", +"000000067F000080000008C00C00000A8000-000000067F000080000008C00C00000AC000__000000AAEBE534F8", +"000000067F000080000008C00C00000A952A-000000067F000080000008C00C00000B2C90__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000AC000-000000067F000080000008C00C00000B0000__000000AAEBE534F8", +"000000067F000080000008C00C00000B0000-000000067F000080000008C00C00000B4000__000000AAEBE534F8", +"000000067F000080000008C00C00000B2C90-000000067F000080000008C00C00000BC3F6__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000B4000-000000067F000080000008C00C00000B8000__000000AAEBE534F8", 
+"000000067F000080000008C00C00000B8000-000000067F000080000008C00C00000BC000__000000AAEBE534F8", +"000000067F000080000008C00C00000BC000-000000067F000080000008C00C00000C0000__000000AAEBE534F8", +"000000067F000080000008C00C00000BC3F6-000000067F000080000008C00C00000C5B0C__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000C0000-000000067F000080000008C00C00000C4000__000000AAEBE534F8", +"000000067F000080000008C00C00000C4000-000000067F000080000008C00C00000C8000__000000AAEBE534F8", +"000000067F000080000008C00C00000C5B0C-000000067F000080000008C00C00000CF272__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000C8000-030000000000000000000000000000000002__000000AAEBE534F8", +"000000067F000080000008C00C00000CF272-000000067F000080000008C00C00000D8986__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000D8986-000000067F000080000008C00C00000E20EC__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000E20EC-000000067F000080000008C00C00000EB80A__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000EB80A-000000067F000080000008C00C00000F4F40__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000F4F40-000000067F000080000008C00C00000FE6A6__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000FE6A6-000000067F000080000008C00C0000107DC1__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C0000107DC1-000000067F000080000008C00C000011150A__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C000011150A-01000000000000000100000004000000001C__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008E00C0000000000-000000067F000080000008E00C0000004000__000000AD3698E000", +"000000067F000080000008E00C0000004000-000000067F000080000008E00C0000008000__000000AD3698E000", +"000000067F000080000008E00C00000077B3-000000067F000080000008E00C0000010F0A__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C0000008000-000000067F000080000008E00C000000C000__000000AD3698E000", +"000000067F000080000008E00C000000C000-000000067F000080000008E00C0000010000__000000AD3698E000", +"000000067F000080000008E00C0000010000-000000067F000080000008E00C0000014000__000000AD3698E000", +"000000067F000080000008E00C0000010F0A-000000067F000080000008E00C000001A670__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C0000014000-000000067F000080000008E00C0000018000__000000AD3698E000", +"000000067F000080000008E00C0000018000-000000067F000080000008E00C000001C000__000000AD3698E000", +"000000067F000080000008E00C000001A670-000000067F000080000008E00C0000023DB1__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C000001C000-000000067F000080000008E00C0000020000__000000AD3698E000", +"000000067F000080000008E00C0000020000-000000067F000080000008E00C0000024000__000000AD3698E000", +"000000067F000080000008E00C0000023DB1-000000067F000080000008E00C000002D50A__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C0000024000-000000067F000080000008E00C0000028000__000000AD3698E000", +"000000067F000080000008E00C0000028000-000000067F000080000008E00C000002C000__000000AD3698E000", +"000000067F000080000008E00C000002C000-000000067F000080000008E00C0000030000__000000AD3698E000", +"000000067F000080000008E00C000002D50A-000000067F000080000008E00C0000036C30__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C0000030000-000000067F000080000008E00C0000034000__000000AD3698E000", +"000000067F000080000008E00C0000034000-000000067F000080000008E00C0000038000__000000AD3698E000", 
+"000000067F000080000008E00C0000036C30-000000067F000080000008E00C0000040393__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C0000038000-000000067F000080000008E00C000003C000__000000AD3698E000", +"000000067F000080000008E00C000003C000-000000067F000080000008E00C0000040000__000000AD3698E000", +"000000067F000080000008E00C0000040000-000000067F000080000008E00C0000044000__000000AD3698E000", +"000000067F000080000008E00C0000040393-000000067F000080000008E00C0000049AF9__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C0000044000-000000067F000080000008E00C0000048000__000000AD3698E000", +"000000067F000080000008E00C0000048000-000000067F000080000008E00C000004C000__000000AD3698E000", +"000000067F000080000008E00C0000049AF9-000000067F000080000008E00C000005320C__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C000004C000-000000067F000080000008E00C0000050000__000000AD3698E000", +"000000067F000080000008E00C0000050000-000000067F000080000008E00C0000054000__000000AD3698E000", +"000000067F000080000008E00C000005320C-030000000000000000000000000000000002__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C0000054000-000000067F000080000008E00C0000058000__000000AD34AF7FD8", +"000000067F000080000008E00C000005523E-000000067F000080000008E00C000005E9A4__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C0000058000-000000067F000080000008E00C000005C000__000000AD34AF7FD8", +"000000067F000080000008E00C000005C000-000000067F000080000008E00C0000060000__000000AD34AF7FD8", +"000000067F000080000008E00C000005E9A4-000000067F000080000008E00C000006810A__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C0000060000-000000067F000080000008E00C0000064000__000000AD34AF7FD8", +"000000067F000080000008E00C0000064000-000000067F000080000008E00C0000068000__000000AD34AF7FD8", +"000000067F000080000008E00C0000068000-000000067F000080000008E00C000006C000__000000AD34AF7FD8", +"000000067F000080000008E00C000006810A-000000067F000080000008E00C0000071870__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C000006C000-000000067F000080000008E00C0000070000__000000AD34AF7FD8", +"000000067F000080000008E00C0000070000-000000067F000080000008E00C0000074000__000000AD34AF7FD8", +"000000067F000080000008E00C0000071870-000000067F000080000008E00C000007AFD6__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C0000074000-000000067F000080000008E00C0000078000__000000AD34AF7FD8", +"000000067F000080000008E00C0000078000-000000067F000080000008E00C000007C000__000000AD34AF7FD8", +"000000067F000080000008E00C000007AFD6-000000067F000080000008E00C000008470B__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C000007C000-000000067F000080000008E00C0000080000__000000AD34AF7FD8", +"000000067F000080000008E00C0000080000-000000067F000080000008E00C0000084000__000000AD34AF7FD8", +"000000067F000080000008E00C0000084000-000000067F000080000008E00C0000088000__000000AD34AF7FD8", +"000000067F000080000008E00C000008470B-000000067F000080000008E00C000008DE71__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C0000088000-000000067F000080000008E00C000008C000__000000AD34AF7FD8", +"000000067F000080000008E00C000008C000-000000067F000080000008E00C0000090000__000000AD34AF7FD8", +"000000067F000080000008E00C000008DE71-000000067F000080000008E00C0000097591__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C0000090000-000000067F000080000008E00C0000094000__000000AD34AF7FD8", +"000000067F000080000008E00C0000094000-000000067F000080000008E00C0000098000__000000AD34AF7FD8", 
+"000000067F000080000008E00C0000097591-000000067F000080000008E00C00000A0CF7__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C0000098000-000000067F000080000008E00C000009C000__000000AD34AF7FD8", +"000000067F000080000008E00C000009C000-000000067F000080000008E00C00000A0000__000000AD34AF7FD8", +"000000067F000080000008E00C00000A0000-000000067F000080000008E00C00000A4000__000000AD34AF7FD8", +"000000067F000080000008E00C00000A0CF7-000000067F000080000008E00C00000AA40B__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C00000A4000-000000067F000080000008E00C00000A8000__000000AD34AF7FD8", +"000000067F000080000008E00C00000A8000-000000067F000080000008E00C00000AC000__000000AD34AF7FD8", +"000000067F000080000008E00C00000AA40B-000000067F000080000008E00C00000B3B4D__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C00000AC000-000000067F000080000008E00C00000B0000__000000AD34AF7FD8", +"000000067F000080000008E00C00000B0000-000000067F000080000008E00C00000B4000__000000AD34AF7FD8", +"000000067F000080000008E00C00000B3B4D-000000067F000080000008E00C0100000000__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C00000B4000-000000067F000080000008E00C00000B8000__000000AD34AF7FD8", +"000000067F000080000008E00C00000B8000-000000067F000080000008E00C00000BC000__000000AD34AF7FD8", +"000000067F000080000008E00C00000BC000-000000067F000080000008E00C00000C0000__000000AD34AF7FD8", +"000000067F000080000008E00C00000BC018-000000067F000080000008E00C00000C5749__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C00000C0000-000000067F000080000008E00C00000C4000__000000AD34AF7FD8", +"000000067F000080000008E00C00000C4000-000000067F000080000008E00C00000C8000__000000AD34AF7FD8", +"000000067F000080000008E00C00000C5749-000000067F000080000008E00C00000CEEAF__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C00000C8000-000000067F000080000008E00C00000CC000__000000AD34AF7FD8", +"000000067F000080000008E00C00000CC000-000000067F000080000008E00C00000D0000__000000AD34AF7FD8", +"000000067F000080000008E00C00000CEEAF-000000067F000080000008E00C00000D85C5__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C00000D0000-000000067F000080000008E00C00000D4000__000000AD34AF7FD8", +"000000067F000080000008E00C00000D4000-000000067F000080000008E00C00000D8000__000000AD34AF7FD8", +"000000067F000080000008E00C00000D8000-000000067F000080000008E00C00000DC000__000000AD34AF7FD8", +"000000067F000080000008E00C00000D85C5-000000067F000080000008E00C00000E1D0B__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C00000DC000-000000067F000080000008E00C00000E0000__000000AD34AF7FD8", +"000000067F000080000008E00C00000E0000-000000067F000080000008E00C00000E4000__000000AD34AF7FD8", +"000000067F000080000008E00C00000E1D0B-000000067F000080000008E00C00000EB471__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C00000E4000-000000067F000080000008E00C00000E8000__000000AD34AF7FD8", +"000000067F000080000008E00C00000E8000-000000067F000080000008E00C00000EC000__000000AD34AF7FD8", +"000000067F000080000008E00C00000EB471-000000067F000080000008E00C00000F4BAA__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C00000EC000-000000067F000080000008E00C00000F0000__000000AD34AF7FD8", +"000000067F000080000008E00C00000F0000-000000067F000080000008E00C00000F4000__000000AD34AF7FD8", +"000000067F000080000008E00C00000F4000-000000067F000080000008E00C00000F8000__000000AD34AF7FD8", +"000000067F000080000008E00C00000F4BAA-000000067F000080000008E00C00000FE30A__000000AC9601EA19-000000AD36393FE9", 
+"000000067F000080000008E00C00000F8000-000000067F000080000008E00C00000FC000__000000AD34AF7FD8", +"000000067F000080000008E00C00000FC000-000000067F000080000008E00C0000100000__000000AD34AF7FD8", +"000000067F000080000008E00C00000FE30A-000000067F000080000008E00C0000107A2C__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C0000100000-000000067F000080000008E00C0000104000__000000AD34AF7FD8", +"000000067F000080000008E00C0000104000-000000067F000080000008E00C0000108000__000000AD34AF7FD8", +"000000067F000080000008E00C0000107A2C-000000067F000080000008E00C0000111187__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C0000108000-000000067F000080000008E00C000010C000__000000AD34AF7FD8", +"000000067F000080000008E00C000010C000-000000067F000080000008E00C0000110000__000000AD34AF7FD8", +"000000067F000080000008E00C0000110000-030000000000000000000000000000000002__000000AD34AF7FD8", +"000000067F000080000008E00C0000111187-01000000000000000100000004000000001C__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E0140000003E33-000000067F000080000008E014000000BD44__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E014000000BD44-000000067F000080000008E0140000013C54__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E0140000013C54-000000067F000080000008E014000001BB63__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E014000001BB63-000000067F000080000008E0140000023A74__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E0140000023A74-000000067F000080000008E014000002B984__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E014000002B984-000000067F000080000008E0220000006AD0__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E0220000000000-000000067F000080000008E0220000004000__000000AF5D7D4000", +"000000067F000080000008E0220000004000-000000067F000080000008E0220000008000__000000AF5D7D4000", +"000000067F000080000008E0220000006AD0-000000067F000080000008E022000001020C__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E0220000008000-000000067F000080000008E022000000C000__000000AF5D7D4000", +"000000067F000080000008E022000000C000-000000067F000080000008E0220000010000__000000AF5D7D4000", +"000000067F000080000008E0220000010000-000000067F000080000008E0220000014000__000000AF5D7D4000", +"000000067F000080000008E022000001020C-01000000000000000100000004000000001C__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E0220000014000-000000067F000080000008E0220000018000__000000AF56604248", +"000000067F000080000008E02200000151DD-000000067F000080000008E022000001E90B__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000018000-000000067F000080000008E022000001C000__000000AF56604248", +"000000067F000080000008E022000001C000-000000067F000080000008E0220000020000__000000AF56604248", +"000000067F000080000008E022000001E90B-000000067F000080000008E022000002802C__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000020000-000000067F000080000008E0220000024000__000000AF56604248", +"000000067F000080000008E0220000024000-000000067F000080000008E0220000028000__000000AF56604248", +"000000067F000080000008E0220000028000-000000067F000080000008E022000002C000__000000AF56604248", +"000000067F000080000008E022000002802C-000000067F000080000008E0220000031783__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E022000002C000-000000067F000080000008E0220000030000__000000AF56604248", +"000000067F000080000008E0220000030000-000000067F000080000008E0220000034000__000000AF56604248", 
+"000000067F000080000008E0220000031783-000000067F000080000008E022000003AEE9__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000034000-000000067F000080000008E0220000038000__000000AF56604248", +"000000067F000080000008E0220000038000-000000067F000080000008E022000003C000__000000AF56604248", +"000000067F000080000008E022000003AEE9-000000067F000080000008E022000004460B__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E022000003C000-000000067F000080000008E0220000040000__000000AF56604248", +"000000067F000080000008E0220000040000-000000067F000080000008E0220000044000__000000AF56604248", +"000000067F000080000008E0220000044000-000000067F000080000008E0220000048000__000000AF56604248", +"000000067F000080000008E022000004460B-000000067F000080000008E022000004DD71__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000048000-000000067F000080000008E022000004C000__000000AF56604248", +"000000067F000080000008E022000004C000-000000067F000080000008E0220000050000__000000AF56604248", +"000000067F000080000008E022000004DD71-000000067F000080000008E02200000574D7__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000050000-000000067F000080000008E0220000054000__000000AF56604248", +"000000067F000080000008E0220000054000-000000067F000080000008E0220000058000__000000AF56604248", +"000000067F000080000008E02200000574D7-000000067F000080000008E0220000060C0B__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000058000-000000067F000080000008E022000005C000__000000AF56604248", +"000000067F000080000008E022000005C000-000000067F000080000008E0220000060000__000000AF56604248", +"000000067F000080000008E0220000060000-000000067F000080000008E0220000064000__000000AF56604248", +"000000067F000080000008E0220000060C0B-000000067F000080000008E022000006A371__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000064000-000000067F000080000008E0220000068000__000000AF56604248", +"000000067F000080000008E0220000068000-000000067F000080000008E022000006C000__000000AF56604248", +"000000067F000080000008E022000006A371-000000067F000080000008E0220000073AD7__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E022000006C000-000000067F000080000008E0220000070000__000000AF56604248", +"000000067F000080000008E0220000070000-000000067F000080000008E0220000074000__000000AF56604248", +"000000067F000080000008E0220000073AD7-000000067F000080000008E022000007D20B__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000074000-000000067F000080000008E0220000078000__000000AF56604248", +"000000067F000080000008E0220000078000-000000067F000080000008E022000007C000__000000AF56604248", +"000000067F000080000008E022000007C000-000000067F000080000008E0220000080000__000000AF56604248", +"000000067F000080000008E022000007D20B-000000067F000080000008E0220000086932__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000080000-000000067F000080000008E0220000084000__000000AF56604248", +"000000067F000080000008E0220000084000-000000067F000080000008E0220000088000__000000AF56604248", +"000000067F000080000008E0220000086932-000000067F000080000008E0220100000000__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000088000-000000067F000080000008E022000008C000__000000AF56604248", +"000000067F000080000008E022000008C000-000000067F000080000008E0220000090000__000000AF56604248", +"000000067F000080000008E022000008E3D1-000000067F000080000008E022000009797E__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E0220000090000-000000067F000080000008E0220000094000__000000AF56604248", 
+"000000067F000080000008E0220000094000-000000067F000080000008E0220000098000__000000AF56604248", +"000000067F000080000008E022000009797E-000000067F000080000008E02200000A10E4__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E0220000098000-000000067F000080000008E022000009C000__000000AF56604248", +"000000067F000080000008E022000009C000-000000067F000080000008E02200000A0000__000000AF56604248", +"000000067F000080000008E02200000A0000-000000067F000080000008E02200000A4000__000000AF56604248", +"000000067F000080000008E02200000A10E4-000000067F000080000008E02200000AA80B__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000A4000-000000067F000080000008E02200000A8000__000000AF56604248", +"000000067F000080000008E02200000A8000-000000067F000080000008E02200000AC000__000000AF56604248", +"000000067F000080000008E02200000AA80B-000000067F000080000008E02200000B3F4B__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000AC000-000000067F000080000008E02200000B0000__000000AF56604248", +"000000067F000080000008E02200000B0000-000000067F000080000008E02200000B4000__000000AF56604248", +"000000067F000080000008E02200000B3F4B-000000067F000080000008E02200000BD6B1__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000B4000-000000067F000080000008E02200000B8000__000000AF56604248", +"000000067F000080000008E02200000B8000-000000067F000080000008E02200000BC000__000000AF56604248", +"000000067F000080000008E02200000BC000-000000067F000080000008E02200000C0000__000000AF56604248", +"000000067F000080000008E02200000BD6B1-000000067F000080000008E02200000C6DD5__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000C0000-000000067F000080000008E02200000C4000__000000AF56604248", +"000000067F000080000008E02200000C4000-000000067F000080000008E02200000C8000__000000AF56604248", +"000000067F000080000008E02200000C6DD5-000000067F000080000008E02200000D050B__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000C8000-000000067F000080000008E02200000CC000__000000AF56604248", +"000000067F000080000008E02200000CC000-000000067F000080000008E02200000D0000__000000AF56604248", +"000000067F000080000008E02200000D0000-000000067F000080000008E02200000D4000__000000AF56604248", +"000000067F000080000008E02200000D050B-000000067F000080000008E02200000D9C71__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000D4000-000000067F000080000008E02200000D8000__000000AF56604248", +"000000067F000080000008E02200000D8000-000000067F000080000008E02200000DC000__000000AF56604248", +"000000067F000080000008E02200000D9C71-000000067F000080000008E02200000E33B8__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000DC000-000000067F000080000008E02200000E0000__000000AF56604248", +"000000067F000080000008E02200000E0000-000000067F000080000008E02200000E4000__000000AF56604248", +"000000067F000080000008E02200000E33B8-000000067F000080000008E02200000ECB09__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000E4000-000000067F000080000008E02200000E8000__000000AF56604248", +"000000067F000080000008E02200000E8000-000000067F000080000008E02200000EC000__000000AF56604248", +"000000067F000080000008E02200000EC000-000000067F000080000008E02200000F0000__000000AF56604248", +"000000067F000080000008E02200000ECB09-000000067F000080000008E02200000F626F__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000F0000-000000067F000080000008E02200000F4000__000000AF56604248", +"000000067F000080000008E02200000F4000-000000067F000080000008E02200000F8000__000000AF56604248", 
+"000000067F000080000008E02200000F626F-000000067F000080000008E02200000FF9D5__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000F8000-000000067F000080000008E02200000FC000__000000AF56604248", +"000000067F000080000008E02200000FC000-000000067F000080000008E0220000100000__000000AF56604248", +"000000067F000080000008E02200000FF9D5-000000067F000080000008E022000010912A__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E0220000100000-000000067F000080000008E0220000104000__000000AF56604248", +"000000067F000080000008E0220000104000-000000067F000080000008E0220000108000__000000AF56604248", +"000000067F000080000008E0220000108000-000000067F000080000008E022000010C000__000000AF56604248", +"000000067F000080000008E022000010912A-000000067F000080000008E0220000111C20__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E022000010C000-030000000000000000000000000000000002__000000AF56604248", +"000000067F000080000008E02200FFFFFFFF-01000000000000000100000004000000001C__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02A000000529F-000000067F000080000008E02A000000D1B0__000000AF5D587FE1-000000AFB4666001", +"000000067F000080000008E02A000000D1B0-000000067F000080000008E02A00000150BF__000000AF5D587FE1-000000AFB4666001", +"000000067F000080000008E02A00000150BF-000000067F000080000008E02A000001CFD0__000000AF5D587FE1-000000AFB4666001", +"000000067F000080000008E02A000001CFD0-000000067F000080000008E02A0000024EE1__000000AF5D587FE1-000000AFB4666001", +"000000067F000080000008E02A0000024EE1-000000067F000080000008E02A000002CDF1__000000AF5D587FE1-000000AFB4666001", +"000000067F000080000008E02A000002CDF1-030000000000000000000000000000000002__000000AF5D587FE1-000000AFB4666001", +"000000067F00008000000900380000000000-000000067F0000800000090038000000970B__000000AFB4666001-000000B05429F579", +"000000067F0000800000090038000000970B-000000067F00008000000900380000012E71__000000AFB4666001-000000B05429F579", +"000000067F00008000000900380000012E71-000000067F0000800000090038000001C5D7__000000AFB4666001-000000B05429F579", +"000000067F0000800000090038000001C5D7-000000067F00008000000900380000025D2B__000000AFB4666001-000000B05429F579", +"000000067F00008000000900380000025D2B-000000067F0000800000090038000002F491__000000AFB4666001-000000B05429F579", +"000000067F0000800000090038000002F491-000000067F00008000000900380000038BA4__000000AFB4666001-000000B05429F579", +"000000067F00008000000900380000038BA4-000000067F0000800000090038000004230A__000000AFB4666001-000000B05429F579", +"000000067F0000800000090038000004230A-000000067F0000800000090038000004BA70__000000AFB4666001-000000B05429F579", +"000000067F0000800000090038000004BA70-000000067F000080000009003800000551A5__000000AFB4666001-000000B05429F579", +"000000067F000080000009003800000551A5-000000067F0000800000090038000005E909__000000AFB4666001-000000B05429F579", +"000000067F0000800000090038000005C000-000000067F00008000000900380000060000__000000B18434BFD0", +"000000067F0000800000090038000005E909-000000067F000080000009003B0100000000__000000AFB4666001-000000B05429F579", +"000000067F0000800000090038000005EA0C-000000067F00008000000900380000068125__000000B05429F579-000000B0F3EDEAC9", +"000000067F00008000000900380000060000-000000067F00008000000900380000064000__000000B18434BFD0", +"000000067F00008000000900380000064000-000000067F00008000000900380000068000__000000B18434BFD0", +"000000067F00008000000900380000068000-000000067F0000800000090038000006C000__000000B18434BFD0", 
+"000000067F00008000000900380000068125-000000067F0000800000090038000007188B__000000B05429F579-000000B0F3EDEAC9", +"000000067F0000800000090038000006C000-000000067F00008000000900380000070000__000000B18434BFD0", +"000000067F00008000000900380000070000-000000067F00008000000900380000074000__000000B18434BFD0", +"000000067F0000800000090038000007188B-000000067F0000800000090038000007AFF1__000000B05429F579-000000B0F3EDEAC9", +"000000067F00008000000900380000074000-000000067F00008000000900380000078000__000000B18434BFD0", +"000000067F00008000000900380000078000-000000067F0000800000090038000007C000__000000B18434BFD0", +"000000067F0000800000090038000007AFF1-000000067F0000800000090038000008470C__000000B05429F579-000000B0F3EDEAC9", +"000000067F0000800000090038000007C000-000000067F00008000000900380000080000__000000B18434BFD0", +"000000067F00008000000900380000080000-000000067F00008000000900380000084000__000000B18434BFD0", +"000000067F00008000000900380000084000-000000067F00008000000900380000088000__000000B18434BFD0", +"000000067F0000800000090038000008470C-000000067F0000800000090038000008DE72__000000B05429F579-000000B0F3EDEAC9", +"000000067F00008000000900380000088000-000000067F0000800000090038000008C000__000000B18434BFD0", +"000000067F0000800000090038000008C000-000000067F00008000000900380000090000__000000B18434BFD0", +"000000067F0000800000090038000008DE72-000000067F00008000000900380000097592__000000B05429F579-000000B0F3EDEAC9", +"000000067F00008000000900380000090000-000000067F00008000000900380000094000__000000B18434BFD0", +"000000067F00008000000900380000094000-000000067F00008000000900380000098000__000000B18434BFD0", +"000000067F00008000000900380000097592-000000067F000080000009003800000A0CF8__000000B05429F579-000000B0F3EDEAC9", +"000000067F00008000000900380000098000-000000067F0000800000090038000009C000__000000B18434BFD0", +"000000067F0000800000090038000009C000-000000067F000080000009003800000A0000__000000B18434BFD0", +"000000067F000080000009003800000A0000-000000067F000080000009003800000A4000__000000B18434BFD0", +"000000067F000080000009003800000A0CF8-000000067F000080000009003800000AA40C__000000B05429F579-000000B0F3EDEAC9", +"000000067F000080000009003800000A4000-000000067F000080000009003800000A8000__000000B18434BFD0", +"000000067F000080000009003800000A8000-000000067F000080000009003800000AC000__000000B18434BFD0", +"000000067F000080000009003800000AA40C-000000067F000080000009003800000B3B4E__000000B05429F579-000000B0F3EDEAC9", +"000000067F000080000009003800000AC000-000000067F000080000009003800000B0000__000000B18434BFD0", +"000000067F000080000009003800000B0000-000000067F000080000009003800000B4000__000000B18434BFD0", +"000000067F000080000009003800000B3B4E-000000067F000080000009003800000BD2B4__000000B05429F579-000000B0F3EDEAC9", +"000000067F000080000009003800000B4000-000000067F000080000009003800000B8000__000000B18434BFD0", +"000000067F000080000009003800000B8000-000000067F000080000009003800000BC000__000000B18434BFD0", +"000000067F000080000009003800000BC000-000000067F000080000009003800000C0000__000000B18434BFD0", +"000000067F000080000009003800000BD2B4-000000067F00008000000900380100000000__000000B05429F579-000000B0F3EDEAC9", +"000000067F000080000009003800000C0000-000000067F000080000009003800000C4000__000000B18434BFD0", +"000000067F000080000009003800000C4000-000000067F000080000009003800000C8000__000000B18434BFD0", +"000000067F000080000009003800000C5213-000000067F000080000009003800000CE979__000000B0F3EDEAC9-000000B18495C001", +"000000067F000080000009003800000C8000-000000067F000080000009003800000CC000__000000B18434BFD0", 
+"000000067F000080000009003800000CC000-000000067F000080000009003800000D0000__000000B18434BFD0", +"000000067F000080000009003800000CE979-000000067F000080000009003800000D80DF__000000B0F3EDEAC9-000000B18495C001", +"000000067F000080000009003800000D0000-000000067F000080000009003800000D4000__000000B18434BFD0", +"000000067F000080000009003800000D4000-000000067F000080000009003800000D8000__000000B18434BFD0", +"000000067F000080000009003800000D8000-000000067F000080000009003800000DC000__000000B18434BFD0", +"000000067F000080000009003800000D80DF-000000067F000080000009003800000E180A__000000B0F3EDEAC9-000000B18495C001", +"000000067F000080000009003800000DC000-000000067F000080000009003800000E0000__000000B18434BFD0", +"000000067F000080000009003800000E0000-000000067F000080000009003800000E4000__000000B18434BFD0", +"000000067F000080000009003800000E180A-000000067F000080000009003800000EAF70__000000B0F3EDEAC9-000000B18495C001", +"000000067F000080000009003800000E4000-000000067F000080000009003800000E8000__000000B18434BFD0", +"000000067F000080000009003800000E8000-000000067F000080000009003800000EC000__000000B18434BFD0", +"000000067F000080000009003800000EAF70-000000067F000080000009003800000F46D6__000000B0F3EDEAC9-000000B18495C001", +"000000067F000080000009003800000EC000-000000067F000080000009003800000F0000__000000B18434BFD0", +"000000067F000080000009003800000F0000-000000067F000080000009003800000F4000__000000B18434BFD0", +"000000067F000080000009003800000F4000-000000067F000080000009003800000F8000__000000B18434BFD0", +"000000067F000080000009003800000F46D6-000000067F000080000009003800000FDE0B__000000B0F3EDEAC9-000000B18495C001", +"000000067F000080000009003800000F8000-000000067F000080000009003800000FC000__000000B18434BFD0", +"000000067F000080000009003800000FC000-000000067F00008000000900380000100000__000000B18434BFD0", +"000000067F000080000009003800000FDE0B-000000067F0000800000090038000010752B__000000B0F3EDEAC9-000000B18495C001", +"000000067F00008000000900380000100000-000000067F00008000000900380000104000__000000B18434BFD0", +"000000067F00008000000900380000104000-000000067F00008000000900380000108000__000000B18434BFD0", +"000000067F0000800000090038000010752B-000000067F00008000000900380000110C91__000000B0F3EDEAC9-000000B18495C001", +"000000067F00008000000900380000108000-000000067F0000800000090038000010C000__000000B18434BFD0", +"000000067F0000800000090038000010C000-000000067F00008000000900380000110000__000000B18434BFD0", +"000000067F00008000000900380000110000-030000000000000000000000000000000002__000000B18434BFD0", +"000000067F00008000000900380000110C91-01000000000000000100000004000000001C__000000B0F3EDEAC9-000000B18495C001", +"000000067F000080000009004000000047E0-000000067F0000800000090040000000C6F1__000000B18495C001-000000B1FA75F501", +"000000067F0000800000090040000000C6F1-000000067F00008000000900400000014600__000000B18495C001-000000B1FA75F501", +"000000067F00008000000900400000014600-000000067F0000800000090040000001C511__000000B18495C001-000000B1FA75F501", +"000000067F0000800000090040000001C511-000000067F00008000000900400000024421__000000B18495C001-000000B1FA75F501", +"000000067F00008000000900400000024421-000000067F0000800000090040000002C331__000000B18495C001-000000B1FA75F501", +"000000067F0000800000090040000002C331-000000067F000080000009200C0000007658__000000B18495C001-000000B1FA75F501", +"000000067F000080000009200C0000000000-000000067F000080000009200C0000004000__000000B3AC039FE8", +"000000067F000080000009200C0000004000-000000067F000080000009200C0000008000__000000B3AC039FE8", 
+"000000067F000080000009200C0000007658-000000067F000080000009200C0000010DB5__000000B18495C001-000000B1FA75F501", +"000000067F000080000009200C0000008000-000000067F000080000009200C000000C000__000000B3AC039FE8", +"000000067F000080000009200C000000C000-000000067F000080000009200C0000010000__000000B3AC039FE8", +"000000067F000080000009200C0000010000-000000067F000080000009200C0000014000__000000B3A3EC82C8", +"000000067F000080000009200C0000010DB5-030000000000000000000000000000000002__000000B18495C001-000000B1FA75F501", +"000000067F000080000009200C0000012E97-000000067F000080000009200C000001C5FD__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000014000-000000067F000080000009200C0000018000__000000B3A3EC82C8", +"000000067F000080000009200C0000018000-000000067F000080000009200C000001C000__000000B3A3EC82C8", +"000000067F000080000009200C000001C000-000000067F000080000009200C0000020000__000000B3A3EC82C8", +"000000067F000080000009200C000001C5FD-000000067F000080000009200C0000025D0C__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000020000-000000067F000080000009200C0000024000__000000B3A3EC82C8", +"000000067F000080000009200C0000024000-000000067F000080000009200C0000028000__000000B3A3EC82C8", +"000000067F000080000009200C0000025D0C-000000067F000080000009200C000002F472__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000028000-000000067F000080000009200C000002C000__000000B3A3EC82C8", +"000000067F000080000009200C000002C000-000000067F000080000009200C0000030000__000000B3A3EC82C8", +"000000067F000080000009200C000002F472-000000067F000080000009200C0000038B85__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000030000-000000067F000080000009200C0000034000__000000B3A3EC82C8", +"000000067F000080000009200C0000034000-000000067F000080000009200C0000038000__000000B3A3EC82C8", +"000000067F000080000009200C0000038000-000000067F000080000009200C000003C000__000000B3A3EC82C8", +"000000067F000080000009200C0000038B85-000000067F000080000009200C00000422EB__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C000003C000-000000067F000080000009200C0000040000__000000B3A3EC82C8", +"000000067F000080000009200C0000040000-000000067F000080000009200C0000044000__000000B3A3EC82C8", +"000000067F000080000009200C00000422EB-000000067F000080000009200C000004BA0C__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000044000-000000067F000080000009200C0000048000__000000B3A3EC82C8", +"000000067F000080000009200C0000048000-000000067F000080000009200C000004C000__000000B3A3EC82C8", +"000000067F000080000009200C000004BA0C-000000067F000080000009200C0000055141__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C000004C000-000000067F000080000009200C0000050000__000000B3A3EC82C8", +"000000067F000080000009200C0000050000-000000067F000080000009200C0000054000__000000B3A3EC82C8", +"000000067F000080000009200C0000054000-000000067F000080000009200C0000058000__000000B3A3EC82C8", +"000000067F000080000009200C0000055141-000000067F000080000009200C000005E8A7__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000058000-000000067F000080000009200C000005C000__000000B3A3EC82C8", +"000000067F000080000009200C000005C000-000000067F000080000009200C0000060000__000000B3A3EC82C8", +"000000067F000080000009200C000005E8A7-000000067F000080000009200C0000067FC1__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000060000-000000067F000080000009200C0000064000__000000B3A3EC82C8", +"000000067F000080000009200C0000064000-000000067F000080000009200C0000068000__000000B3A3EC82C8", 
+"000000067F000080000009200C0000067FC1-000000067F000080000009200C0000071709__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000068000-000000067F000080000009200C000006C000__000000B3A3EC82C8", +"000000067F000080000009200C000006C000-000000067F000080000009200C0000070000__000000B3A3EC82C8", +"000000067F000080000009200C0000070000-000000067F000080000009200C0000074000__000000B3A3EC82C8", +"000000067F000080000009200C0000071709-000000067F000080000009200C000007AE6F__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000074000-000000067F000080000009200C0000078000__000000B3A3EC82C8", +"000000067F000080000009200C0000078000-000000067F000080000009200C000007C000__000000B3A3EC82C8", +"000000067F000080000009200C000007AE6F-000000067F000080000009200C00000845AB__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C000007C000-000000067F000080000009200C0000080000__000000B3A3EC82C8", +"000000067F000080000009200C0000080000-000000067F000080000009200C0000084000__000000B3A3EC82C8", +"000000067F000080000009200C0000084000-000000067F000080000009200C0000088000__000000B3A3EC82C8", +"000000067F000080000009200C00000845AB-000000067F000080000009200C000008DD09__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000088000-000000067F000080000009200C000008C000__000000B3A3EC82C8", +"000000067F000080000009200C000008C000-000000067F000080000009200C0000090000__000000B3A3EC82C8", +"000000067F000080000009200C000008DD09-000000067F000080000009200C0100000000__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000090000-000000067F000080000009200C0000094000__000000B3A3EC82C8", +"000000067F000080000009200C0000094000-000000067F000080000009200C0000098000__000000B3A3EC82C8", +"000000067F000080000009200C000009567A-000000067F000080000009200C000009EDE0__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C0000098000-000000067F000080000009200C000009C000__000000B3A3EC82C8", +"000000067F000080000009200C000009C000-000000067F000080000009200C00000A0000__000000B3A3EC82C8", +"000000067F000080000009200C000009EDE0-000000067F000080000009200C00000A852B__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000A0000-000000067F000080000009200C00000A4000__000000B3A3EC82C8", +"000000067F000080000009200C00000A4000-000000067F000080000009200C00000A8000__000000B3A3EC82C8", +"000000067F000080000009200C00000A8000-000000067F000080000009200C00000AC000__000000B3A3EC82C8", +"000000067F000080000009200C00000A852B-000000067F000080000009200C00000B1C91__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000AC000-000000067F000080000009200C00000B0000__000000B3A3EC82C8", +"000000067F000080000009200C00000B0000-000000067F000080000009200C00000B4000__000000B3A3EC82C8", +"000000067F000080000009200C00000B1C91-000000067F000080000009200C00000BB3F7__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000B4000-000000067F000080000009200C00000B8000__000000B3A3EC82C8", +"000000067F000080000009200C00000B8000-000000067F000080000009200C00000BC000__000000B3A3EC82C8", +"000000067F000080000009200C00000BB3F7-000000067F000080000009200C00000C4B0C__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000BC000-000000067F000080000009200C00000C0000__000000B3A3EC82C8", +"000000067F000080000009200C00000C0000-000000067F000080000009200C00000C4000__000000B3A3EC82C8", +"000000067F000080000009200C00000C4000-000000067F000080000009200C00000C8000__000000B3A3EC82C8", +"000000067F000080000009200C00000C4B0C-000000067F000080000009200C00000CE272__000000B2CA27F641-000000B3AB3B7FC9", 
+"000000067F000080000009200C00000C8000-000000067F000080000009200C00000CC000__000000B3A3EC82C8", +"000000067F000080000009200C00000CC000-000000067F000080000009200C00000D0000__000000B3A3EC82C8", +"000000067F000080000009200C00000CE272-000000067F000080000009200C00000D798F__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000D0000-000000067F000080000009200C00000D4000__000000B3A3EC82C8", +"000000067F000080000009200C00000D4000-000000067F000080000009200C00000D8000__000000B3A3EC82C8", +"000000067F000080000009200C00000D798F-000000067F000080000009200C00000E10F5__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000D8000-000000067F000080000009200C00000DC000__000000B3A3EC82C8", +"000000067F000080000009200C00000DC000-000000067F000080000009200C00000E0000__000000B3A3EC82C8", +"000000067F000080000009200C00000E0000-000000067F000080000009200C00000E4000__000000B3A3EC82C8", +"000000067F000080000009200C00000E10F5-000000067F000080000009200C00000EA80B__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000E4000-000000067F000080000009200C00000E8000__000000B3A3EC82C8", +"000000067F000080000009200C00000E8000-000000067F000080000009200C00000EC000__000000B3A3EC82C8", +"000000067F000080000009200C00000EA80B-000000067F000080000009200C00000F3F4B__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000EC000-000000067F000080000009200C00000F0000__000000B3A3EC82C8", +"000000067F000080000009200C00000F0000-000000067F000080000009200C00000F4000__000000B3A3EC82C8", +"000000067F000080000009200C00000F3F4B-000000067F000080000009200C00000FD6B1__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000F4000-000000067F000080000009200C00000F8000__000000B3A3EC82C8", +"000000067F000080000009200C00000F8000-000000067F000080000009200C00000FC000__000000B3A3EC82C8", +"000000067F000080000009200C00000FC000-000000067F000080000009200C0000100000__000000B3A3EC82C8", +"000000067F000080000009200C00000FD6B1-000000067F000080000009200C0000106DD5__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C0000100000-000000067F000080000009200C0000104000__000000B3A3EC82C8", +"000000067F000080000009200C0000104000-000000067F000080000009200C0000108000__000000B3A3EC82C8", +"000000067F000080000009200C0000106DD5-000000067F000080000009200C000011050B__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C0000108000-000000067F000080000009200C000010C000__000000B3A3EC82C8", +"000000067F000080000009200C000010C000-030000000000000000000000000000000002__000000B3A3EC82C8", +"000000067F000080000009200C000011050B-01000000000000000100000004000000001C__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F00008000000920140000005289-000000067F0000800000092014000000D19A__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F0000800000092014000000D19A-000000067F000080000009201400000150A9__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F000080000009201400000150A9-000000067F0000800000092014000001CFBA__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F0000800000092014000001CFBA-000000067F00008000000920140000024ECB__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F00008000000920140000024ECB-000000067F0000800000092014000002CDDB__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F0000800000092014000002CDDB-000000067F000080000009400C000000830C__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F000080000009400C0000000000-000000067F000080000009400C0000004000__000000B5CED8CF78", +"000000067F000080000009400C0000004000-000000067F000080000009400C0000008000__000000B5CED8CF78", 
+"000000067F000080000009400C0000008000-000000067F000080000009400C000000C000__000000B5CED8CF78", +"000000067F000080000009400C000000830C-000000067F000080000009400C0000011A72__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F000080000009400C000000C000-000000067F000080000009400C0000010000__000000B5CED8CF78", +"000000067F000080000009400C0000010000-000000067F000080000009400C0000014000__000000B568835548", +"000000067F000080000009400C0000011A72-030000000000000000000000000000000002__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F000080000009400C0000012E51-000000067F000080000009400C000001C5B7__000000B4208FF3D1-000000B43089EC11", +"000000067F000080000009400C0000012E51-000000067F000080000009400C000001C5B7__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000014000-000000067F000080000009400C0000018000__000000B568835548", +"000000067F000080000009400C0000018000-000000067F000080000009400C000001C000__000000B568835548", +"000000067F000080000009400C000001C000-000000067F000080000009400C0000020000__000000B568835548", +"000000067F000080000009400C000001C5B7-000000067F000080000009400C0000025D1D__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C000001C5B7-000000067F000080000009400C0100000000__000000B4208FF3D1-000000B43089EC11", +"000000067F000080000009400C0000020000-000000067F000080000009400C0000024000__000000B568835548", +"000000067F000080000009400C0000024000-000000067F000080000009400C0000028000__000000B568835548", +"000000067F000080000009400C0000025D1D-000000067F000080000009400C000002F483__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000028000-000000067F000080000009400C000002C000__000000B568835548", +"000000067F000080000009400C000002C000-000000067F000080000009400C0000030000__000000B568835548", +"000000067F000080000009400C000002F483-000000067F000080000009400C0000038B96__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000030000-000000067F000080000009400C0000034000__000000B568835548", +"000000067F000080000009400C0000034000-000000067F000080000009400C0000038000__000000B568835548", +"000000067F000080000009400C0000038000-000000067F000080000009400C000003C000__000000B568835548", +"000000067F000080000009400C0000038B96-000000067F000080000009400C00000422FC__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C000003C000-000000067F000080000009400C0000040000__000000B568835548", +"000000067F000080000009400C0000040000-000000067F000080000009400C0000044000__000000B568835548", +"000000067F000080000009400C00000422FC-000000067F000080000009400C000004BA0C__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000044000-000000067F000080000009400C0000048000__000000B568835548", +"000000067F000080000009400C0000048000-000000067F000080000009400C000004C000__000000B568835548", +"000000067F000080000009400C000004BA0C-000000067F000080000009400C0000055141__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C000004C000-000000067F000080000009400C0000050000__000000B568835548", +"000000067F000080000009400C0000050000-000000067F000080000009400C0000054000__000000B568835548", +"000000067F000080000009400C0000054000-000000067F000080000009400C0000058000__000000B568835548", +"000000067F000080000009400C0000055141-000000067F000080000009400C000005E8A7__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000058000-000000067F000080000009400C000005C000__000000B568835548", +"000000067F000080000009400C000005C000-000000067F000080000009400C0000060000__000000B568835548", 
+"000000067F000080000009400C000005E8A7-000000067F000080000009400C0000067FC1__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000060000-000000067F000080000009400C0000064000__000000B568835548", +"000000067F000080000009400C0000064000-000000067F000080000009400C0000068000__000000B568835548", +"000000067F000080000009400C0000067FC1-000000067F000080000009400C0000071709__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000068000-000000067F000080000009400C000006C000__000000B568835548", +"000000067F000080000009400C000006C000-000000067F000080000009400C0000070000__000000B568835548", +"000000067F000080000009400C0000070000-000000067F000080000009400C0000074000__000000B568835548", +"000000067F000080000009400C0000071709-000000067F000080000009400C000007AE6F__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000074000-000000067F000080000009400C0000078000__000000B568835548", +"000000067F000080000009400C0000078000-000000067F000080000009400C000007C000__000000B568835548", +"000000067F000080000009400C000007AE6F-000000067F000080000009400C00000845AB__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C000007C000-000000067F000080000009400C0000080000__000000B568835548", +"000000067F000080000009400C0000080000-000000067F000080000009400C0000084000__000000B568835548", +"000000067F000080000009400C0000084000-000000067F000080000009400C0000088000__000000B568835548", +"000000067F000080000009400C00000845AB-000000067F000080000009400C0100000000__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000088000-000000067F000080000009400C000008C000__000000B568835548", +"000000067F000080000009400C000008C000-000000067F000080000009400C0000090000__000000B568835548", +"000000067F000080000009400C000008DEA4-000000067F000080000009400C00000975C4__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C0000090000-000000067F000080000009400C0000094000__000000B568835548", +"000000067F000080000009400C0000094000-000000067F000080000009400C0000098000__000000B568835548", +"000000067F000080000009400C00000975C4-000000067F000080000009400C00000A0D0A__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C0000098000-000000067F000080000009400C000009C000__000000B568835548", +"000000067F000080000009400C000009C000-000000067F000080000009400C00000A0000__000000B568835548", +"000000067F000080000009400C00000A0000-000000067F000080000009400C00000A4000__000000B568835548", +"000000067F000080000009400C00000A0D0A-000000067F000080000009400C00000AA470__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000A4000-000000067F000080000009400C00000A8000__000000B568835548", +"000000067F000080000009400C00000A8000-000000067F000080000009400C00000AC000__000000B568835548", +"000000067F000080000009400C00000AA470-000000067F000080000009400C00000B3BB2__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000AC000-000000067F000080000009400C00000B0000__000000B568835548", +"000000067F000080000009400C00000B0000-000000067F000080000009400C00000B4000__000000B568835548", +"000000067F000080000009400C00000B3BB2-000000067F000080000009400C00000BD30A__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000B4000-000000067F000080000009400C00000B8000__000000B568835548", +"000000067F000080000009400C00000B8000-000000067F000080000009400C00000BC000__000000B568835548", +"000000067F000080000009400C00000BC000-000000067F000080000009400C00000C0000__000000B568835548", +"000000067F000080000009400C00000BD30A-000000067F000080000009400C00000C6A30__000000B4E047E5A9-000000B5CED8CF79", 
+"000000067F000080000009400C00000C0000-000000067F000080000009400C00000C4000__000000B568835548", +"000000067F000080000009400C00000C4000-000000067F000080000009400C00000C8000__000000B568835548", +"000000067F000080000009400C00000C6A30-000000067F000080000009400C00000D0194__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000C8000-000000067F000080000009400C00000CC000__000000B568835548", +"000000067F000080000009400C00000CC000-000000067F000080000009400C00000D0000__000000B568835548", +"000000067F000080000009400C00000D0000-000000067F000080000009400C00000D4000__000000B568835548", +"000000067F000080000009400C00000D0194-000000067F000080000009400C00000D98FA__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000D4000-030000000000000000000000000000000002__000000B568835548", +"000000067F000080000009400C00000D98FA-000000067F000080000009400C00000E300D__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000E300D-000000067F000080000009400C00000EC773__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000EC773-000000067F000080000009400C00000F5ED9__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000F5ED9-000000067F000080000009400C00000FF60C__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000FF60C-000000067F000080000009400C0000108D1D__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C0000108D1D-000000067F000080000009400C0000111C20__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00FFFFFFFF-030000000000000000000000000000000002__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009600C0000000000-000000067F000080000009600C0000004000__000000B79F439FE0", +"000000067F000080000009600C0000004000-000000067F000080000009600C0000008000__000000B79F439FE0", +"000000067F000080000009600C0000008000-000000067F000080000009600C000000C000__000000B79F439FE0", +"000000067F000080000009600C000000974F-000000067F000080000009600C0000012EB5__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000080000009600C000000C000-000000067F000080000009600C0000010000__000000B79F439FE0", +"000000067F000080000009600C0000010000-000000067F000080000009600C0000014000__000000B79F439FE0", +"000000067F000080000009600C0000012EB5-000000067F000080000009600C000001C60A__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000080000009600C0000014000-000000067F000080000009600C0000018000__000000B79F439FE0", +"000000067F000080000009600C0000018000-000000067F000080000009600C000001C000__000000B79F439FE0", +"000000067F000080000009600C000001C000-000000067F000080000009600C0000020000__000000B79F439FE0", +"000000067F000080000009600C000001C60A-000000067F000080000009600C0000025D38__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000080000009600C0000020000-000000067F000080000009600C0000024000__000000B79F439FE0", +"000000067F000080000009600C0000024000-000000067F000080000009600C0000028000__000000B79F439FE0", +"000000067F000080000009600C0000025D38-000000067F000080000009600C000002F49E__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000080000009600C0000028000-000000067F000080000009600C000002C000__000000B79F439FE0", +"000000067F000080000009600C000002C000-000000067F000080000009600C0000030000__000000B79F439FE0", +"000000067F000080000009600C000002F49E-000000067F000080000009600C0000038BB1__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000080000009600C0000030000-000000067F000080000009600C0000034000__000000B79F439FE0", +"000000067F000080000009600C0000034000-000000067F000080000009600C0000038000__000000B79F439FE0", 
+"000000067F000080000009600C0000038000-000000067F000080000009600C000003C000__000000B79F439FE0", +"000000067F000080000009600C0000038BB1-000000067F000080000009600C0000042317__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000080000009600C000003C000-000000067F000080000009600C0000040000__000000B79F439FE0", +"000000067F000080000009600C0000040000-000000067F000080000009600C0000044000__000000B79D17BFD0", +"000000067F000080000009600C0000040000-000000067F000080000009600C0000044000__000000B8606C92A0", +"000000067F000080000009600C0000042317-030000000000000000000000000000000002__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000080000009600C000004236E-000000067F000080000009600C000004BAD4__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C0000044000-000000067F000080000009600C0000048000__000000B79D17BFD0", +"000000067F000080000009600C0000044000-000000067F000080000009600C0000048000__000000B8606C92A0", +"000000067F000080000009600C0000048000-000000067F000080000009600C000004C000__000000B79D17BFD0", +"000000067F000080000009600C0000048000-000000067F000080000009600C000004C000__000000B8606C92A0", +"000000067F000080000009600C000004BAD4-000000067F000080000009600C0000055208__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C000004C000-000000067F000080000009600C0000050000__000000B79D17BFD0", +"000000067F000080000009600C000004C000-000000067F000080000009600C0000050000__000000B8606C92A0", +"000000067F000080000009600C0000050000-000000067F000080000009600C0000054000__000000B79D17BFD0", +"000000067F000080000009600C0000050000-000000067F000080000009600C0000054000__000000B8606C92A0", +"000000067F000080000009600C0000054000-000000067F000080000009600C0000058000__000000B79D17BFD0", +"000000067F000080000009600C0000054000-000000067F000080000009600C0000058000__000000B8606C92A0", +"000000067F000080000009600C0000055208-000000067F000080000009600C000005E96E__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C0000055A77-000000067F000080000009600C00000AAEA5__000000B808718889-000000B8606C92A1", +"000000067F000080000009600C0000058000-000000067F000080000009600C000005C000__000000B79D17BFD0", +"000000067F000080000009600C0000058000-000000067F000080000009600C000005C000__000000B8606C92A0", +"000000067F000080000009600C000005C000-000000067F000080000009600C0000060000__000000B79D17BFD0", +"000000067F000080000009600C000005C000-000000067F000080000009600C0000060000__000000B8606C92A0", +"000000067F000080000009600C000005E96E-000000067F000080000009600C00000680D4__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C0000060000-000000067F000080000009600C0000064000__000000B79D17BFD0", +"000000067F000080000009600C0000060000-000000067F000080000009600C0000064000__000000B8606C92A0", +"000000067F000080000009600C0000064000-000000067F000080000009600C0000068000__000000B79D17BFD0", +"000000067F000080000009600C0000064000-000000067F000080000009600C0000068000__000000B8606C92A0", +"000000067F000080000009600C0000068000-000000067F000080000009600C000006C000__000000B79D17BFD0", +"000000067F000080000009600C0000068000-000000067F000080000009600C000006C000__000000B8606C92A0", +"000000067F000080000009600C00000680D4-000000067F000080000009600C000007180B__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C000006C000-000000067F000080000009600C0000070000__000000B79D17BFD0", +"000000067F000080000009600C000006C000-000000067F000080000009600C0000070000__000000B8606C92A0", +"000000067F000080000009600C0000070000-000000067F000080000009600C0000074000__000000B79D17BFD0", 
+"000000067F000080000009600C0000070000-000000067F000080000009600C0000074000__000000B8606C92A0", +"000000067F000080000009600C000007180B-000000067F000080000009600C000007AF71__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C0000074000-000000067F000080000009600C0000078000__000000B79D17BFD0", +"000000067F000080000009600C0000074000-000000067F000080000009600C0000078000__000000B8606C92A0", +"000000067F000080000009600C0000078000-000000067F000080000009600C000007C000__000000B79D17BFD0", +"000000067F000080000009600C0000078000-000000067F000080000009600C000007C000__000000B8606C92A0", +"000000067F000080000009600C000007AF71-000000067F000080000009600C00000846D7__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C000007C000-000000067F000080000009600C0000080000__000000B79D17BFD0", +"000000067F000080000009600C000007C000-000000067F000080000009600C0000080000__000000B8606C92A0", +"000000067F000080000009600C0000080000-000000067F000080000009600C0000084000__000000B79D17BFD0", +"000000067F000080000009600C0000080000-000000067F000080000009600C0000084000__000000B8606C92A0", +"000000067F000080000009600C0000084000-000000067F000080000009600C0000088000__000000B79D17BFD0", +"000000067F000080000009600C0000084000-000000067F000080000009600C0000088000__000000B8606C92A0", +"000000067F000080000009600C00000846D7-000000067F000080000009600C000008DE0C__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C0000088000-000000067F000080000009600C000008C000__000000B79D17BFD0", +"000000067F000080000009600C0000088000-000000067F000080000009600C000008C000__000000B8606C92A0", +"000000067F000080000009600C000008C000-000000067F000080000009600C0000090000__000000B79D17BFD0", +"000000067F000080000009600C000008C000-000000067F000080000009600C0000090000__000000B8606C92A0", +"000000067F000080000009600C000008DE0C-000000067F000080000009600C000009752C__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C0000090000-000000067F000080000009600C0000094000__000000B79D17BFD0", +"000000067F000080000009600C0000090000-000000067F000080000009600C0000094000__000000B8606C92A0", +"000000067F000080000009600C0000094000-000000067F000080000009600C0000098000__000000B79D17BFD0", +"000000067F000080000009600C0000094000-000000067F000080000009600C0000098000__000000B8606C92A0", +"000000067F000080000009600C000009752C-000000067F000080000009600C00000A0C92__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C0000098000-000000067F000080000009600C000009C000__000000B79D17BFD0", +"000000067F000080000009600C0000098000-000000067F000080000009600C000009C000__000000B8606C92A0", +"000000067F000080000009600C000009C000-000000067F000080000009600C00000A0000__000000B79D17BFD0", +"000000067F000080000009600C000009C000-000000067F000080000009600C00000A0000__000000B8606C92A0", +"000000067F000080000009600C00000A0000-000000067F000080000009600C00000A4000__000000B79D17BFD0", +"000000067F000080000009600C00000A0000-000000067F000080000009600C00000A4000__000000B8606C92A0", +"000000067F000080000009600C00000A0C92-000000067F000080000009600C0100000000__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C00000A4000-000000067F000080000009600C00000A8000__000000B79D17BFD0", +"000000067F000080000009600C00000A4000-000000067F000080000009600C00000A8000__000000B8606C92A0", +"000000067F000080000009600C00000A8000-000000067F000080000009600C00000AC000__000000B79D17BFD0", +"000000067F000080000009600C00000A8000-000000067F000080000009600C00000AC000__000000B8606C92A0", 
+"000000067F000080000009600C00000A93FD-000000067F000080000009600C00000B2B0C__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000AAEA5-000000067F000080000009600C0000101445__000000B808718889-000000B8606C92A1", +"000000067F000080000009600C00000AC000-000000067F000080000009600C00000B0000__000000B79D17BFD0", +"000000067F000080000009600C00000AC000-000000067F000080000009600C00000B0000__000000B8606C92A0", +"000000067F000080000009600C00000B0000-000000067F000080000009600C00000B4000__000000B79D17BFD0", +"000000067F000080000009600C00000B0000-000000067F000080000009600C00000B4000__000000B8606C92A0", +"000000067F000080000009600C00000B2B0C-000000067F000080000009600C00000BC272__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000B4000-000000067F000080000009600C00000B8000__000000B79D17BFD0", +"000000067F000080000009600C00000B4000-000000067F000080000009600C00000B8000__000000B8606C92A0", +"000000067F000080000009600C00000B8000-000000067F000080000009600C00000BC000__000000B79D17BFD0", +"000000067F000080000009600C00000B8000-000000067F000080000009600C00000BC000__000000B8606C92A0", +"000000067F000080000009600C00000BC000-000000067F000080000009600C00000C0000__000000B79D17BFD0", +"000000067F000080000009600C00000BC000-000000067F000080000009600C00000C0000__000000B8606C92A0", +"000000067F000080000009600C00000BC272-000000067F000080000009600C00000C59A2__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000C0000-000000067F000080000009600C00000C4000__000000B79D17BFD0", +"000000067F000080000009600C00000C0000-000000067F000080000009600C00000C4000__000000B8606C92A0", +"000000067F000080000009600C00000C4000-000000067F000080000009600C00000C8000__000000B79D17BFD0", +"000000067F000080000009600C00000C4000-000000067F000080000009600C00000C8000__000000B8606C92A0", +"000000067F000080000009600C00000C59A2-000000067F000080000009600C00000CF108__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000C8000-000000067F000080000009600C00000CC000__000000B79D17BFD0", +"000000067F000080000009600C00000C8000-000000067F000080000009600C00000CC000__000000B8606C92A0", +"000000067F000080000009600C00000CC000-000000067F000080000009600C00000D0000__000000B79D17BFD0", +"000000067F000080000009600C00000CC000-000000067F000080000009600C00000D0000__000000B8606C92A0", +"000000067F000080000009600C00000CF108-000000067F000080000009600C00000D882B__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000D0000-000000067F000080000009600C00000D4000__000000B79D17BFD0", +"000000067F000080000009600C00000D0000-000000067F000080000009600C00000D4000__000000B8606C92A0", +"000000067F000080000009600C00000D4000-000000067F000080000009600C00000D8000__000000B79D17BFD0", +"000000067F000080000009600C00000D4000-000000067F000080000009600C00000D8000__000000B8606C92A0", +"000000067F000080000009600C00000D8000-000000067F000080000009600C00000DC000__000000B79D17BFD0", +"000000067F000080000009600C00000D8000-000000067F000080000009600C00000DC000__000000B8606C92A0", +"000000067F000080000009600C00000D882B-000000067F000080000009600C00000E1F7E__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000DC000-000000067F000080000009600C00000E0000__000000B79D17BFD0", +"000000067F000080000009600C00000DC000-000000067F000080000009600C00000E0000__000000B8606C92A0", +"000000067F000080000009600C00000E0000-000000067F000080000009600C00000E4000__000000B79D17BFD0", +"000000067F000080000009600C00000E0000-000000067F000080000009600C00000E4000__000000B8606C92A0", 
+"000000067F000080000009600C00000E1F7E-000000067F000080000009600C00000EB6E4__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000E4000-000000067F000080000009600C00000E8000__000000B79D17BFD0", +"000000067F000080000009600C00000E4000-000000067F000080000009600C00000E8000__000000B8606C92A0", +"000000067F000080000009600C00000E8000-000000067F000080000009600C00000EC000__000000B79D17BFD0", +"000000067F000080000009600C00000E8000-000000067F000080000009600C00000EC000__000000B8606C92A0", +"000000067F000080000009600C00000EB6E4-000000067F000080000009600C00000F4E0B__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000EC000-000000067F000080000009600C00000F0000__000000B79D17BFD0", +"000000067F000080000009600C00000EC000-000000067F000080000009600C00000F0000__000000B8606C92A0", +"000000067F000080000009600C00000F0000-000000067F000080000009600C00000F4000__000000B79D17BFD0", +"000000067F000080000009600C00000F0000-000000067F000080000009600C00000F4000__000000B8606C92A0", +"000000067F000080000009600C00000F4000-000000067F000080000009600C00000F8000__000000B79D17BFD0", +"000000067F000080000009600C00000F4000-000000067F000080000009600C00000F8000__000000B8606C92A0", +"000000067F000080000009600C00000F4E0B-000000067F000080000009600C00000FE571__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000F8000-000000067F000080000009600C00000FC000__000000B79D17BFD0", +"000000067F000080000009600C00000F8000-000000067F000080000009600C00000FC000__000000B8606C92A0", +"000000067F000080000009600C00000FC000-000000067F000080000009600C0000100000__000000B79D17BFD0", +"000000067F000080000009600C00000FC000-000000067F000080000009600C0000100000__000000B8606C92A0", +"000000067F000080000009600C00000FE571-000000067F000080000009600C0000107CD7__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C0000100000-000000067F000080000009600C0000104000__000000B79D17BFD0", +"000000067F000080000009600C0000100000-000000067F000080000009600C0000104000__000000B8606C92A0", +"000000067F000080000009600C000010144D-000000067F0000800000096014000000E7D9__000000B808718889-000000B8606C92A1", +"000000067F000080000009600C0000104000-000000067F000080000009600C0000108000__000000B79D17BFD0", +"000000067F000080000009600C0000104000-000000067F000080000009600C0000108000__000000B8606C92A0", +"000000067F000080000009600C0000107CD7-000000067F000080000009600C000011140C__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C0000108000-000000067F000080000009600C000010C000__000000B79D17BFD0", +"000000067F000080000009600C0000108000-000000067F000080000009600C000010C000__000000B8606C92A0", +"000000067F000080000009600C000010C000-000000067F000080000009600C0000110000__000000B79D17BFD0", +"000000067F000080000009600C000010C000-000000067F000080000009600C0000110000__000000B8606C92A0", +"000000067F000080000009600C0000110000-000000067F00008000000960120100000000__000000B8606C92A0", +"000000067F000080000009600C0000110000-030000000000000000000000000000000002__000000B79D17BFD0", +"000000067F000080000009600C000011140C-01000000000000000100000004000000001C__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C020000000B-000000067F0000800000096014000000571F__000000B79E68FFF9-000000B808718889", +"000000067F00008000000960140000000000-000000067F00008000000960140000004000__000000B8606C92A0", +"000000067F00008000000960140000004000-000000067F00008000000960140000008000__000000B8606C92A0", +"000000067F0000800000096014000000571F-000000067F0000800000096014000000CB61__000000B79E68FFF9-000000B808718889", 
+"000000067F00008000000960140000008000-000000067F0000800000096014000000C000__000000B8606C92A0", +"000000067F0000800000096014000000C000-000000067F00008000000960140000010000__000000B8606C92A0", +"000000067F0000800000096014000000CB61-000000067F00008000000960140000013F98__000000B79E68FFF9-000000B808718889", +"000000067F0000800000096014000000E7DB-000000067F00008000000960140000022A8D__000000B808718889-000000B8606C92A1", +"000000067F00008000000960140000010000-000000067F00008000000960140000014000__000000B8606C92A0", +"000000067F00008000000960140000013F98-000000067F0000800000096014000001B3C2__000000B79E68FFF9-000000B808718889", +"000000067F00008000000960140000014000-000000067F00008000000960140000018000__000000B8606C92A0", +"000000067F00008000000960140000018000-000000067F0000800000096014000001C000__000000B8606C92A0", +"000000067F0000800000096014000001B3C2-000000067F000080000009601400000227FC__000000B79E68FFF9-000000B808718889", +"000000067F0000800000096014000001C000-000000067F00008000000960140000020000__000000B8606C92A0", +"000000067F00008000000960140000020000-000000067F00008000000960140000024000__000000B8606C92A0", +"000000067F000080000009601400000227FC-000000067F00008000000960140000029BD8__000000B79E68FFF9-000000B808718889", +"000000067F00008000000960140000022A8D-030000000000000000000000000000000002__000000B808718889-000000B8606C92A1", +"000000067F00008000000960140000024000-000000067F00008000000960140000028000__000000B8606C92A0", +"000000067F00008000000960140000028000-000000067F0000800000096014000002C000__000000B8606C92A0", +"000000067F00008000000960140000029BD8-030000000000000000000000000000000002__000000B79E68FFF9-000000B808718889", +"000000067F0000800000096014000002C000-030000000000000000000000000000000002__000000B8606C92A0", +"000000067F000080000009800C0000009748-000000067F000080000009800C0000012EAE__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C0000012EAE-000000067F000080000009800C000001C60A__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C000001C60A-000000067F000080000009800C0000025D38__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C0000025D38-000000067F000080000009800C000002F49E__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C000002F49E-000000067F000080000009800C0000038BB1__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C0000038BB1-000000067F000080000009800C0000042317__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C0000042317-000000067F000080000009800C000004BA7D__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C000004BA7D-030000000000000000000000000000000002__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C000004BAD2-000000067F000080000009800C0000055206__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C0000055206-000000067F000080000009800C000005E911__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C000005E911-000000067F000080000009800C000006802B__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C000006802B-000000067F000080000009800C0000071782__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C0000071782-000000067F000080000009800C000007AEE8__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C000007AEE8-000000067F000080000009800C000008460B__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C000008460B-000000067F000080000009800C000008DD71__000000B8E03BF0B9-000000B97FFFFFE9", 
+"000000067F000080000009800C000008DD71-000000067F000080000009800C00000974D7__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C00000974D7-000000067F000080000009800C00000A0C0B__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C00000A0C0B-000000067F000080000009800C00000AA371__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C00000A8000-000000067F000080000009800C00000AC000__000000BA2E67EA20", +"000000067F000080000009800C00000AA371-000000067F000080000009800C0100000000__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C00000AA4F5-000000067F000080000009800C00000B3C0B__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000AC000-000000067F000080000009800C00000B0000__000000BA2E67EA20", +"000000067F000080000009800C00000B0000-000000067F000080000009800C00000B4000__000000BA2E67EA20", +"000000067F000080000009800C00000B3C0B-000000067F000080000009800C00000BD371__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000B4000-000000067F000080000009800C00000B8000__000000BA2E67EA20", +"000000067F000080000009800C00000B8000-000000067F000080000009800C00000BC000__000000BA2E67EA20", +"000000067F000080000009800C00000BC000-000000067F000080000009800C00000C0000__000000BA2E67EA20", +"000000067F000080000009800C00000BD371-000000067F000080000009800C00000C6AD7__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000C0000-000000067F000080000009800C00000C4000__000000BA2E67EA20", +"000000067F000080000009800C00000C4000-000000067F000080000009800C00000C8000__000000BA2E67EA20", +"000000067F000080000009800C00000C6AD7-000000067F000080000009800C00000D020B__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000C8000-000000067F000080000009800C00000CC000__000000BA2E67EA20", +"000000067F000080000009800C00000CC000-000000067F000080000009800C00000D0000__000000BA2E67EA20", +"000000067F000080000009800C00000D0000-000000067F000080000009800C00000D4000__000000BA2E67EA20", +"000000067F000080000009800C00000D020B-000000067F000080000009800C00000D9971__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000D4000-000000067F000080000009800C00000D8000__000000BA2E67EA20", +"000000067F000080000009800C00000D8000-000000067F000080000009800C00000DC000__000000BA2E67EA20", +"000000067F000080000009800C00000D9971-000000067F000080000009800C00000E30D7__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000DC000-000000067F000080000009800C00000E0000__000000BA2E67EA20", +"000000067F000080000009800C00000E0000-000000067F000080000009800C00000E4000__000000BA2E67EA20", +"000000067F000080000009800C00000E30D7-000000067F000080000009800C00000EC80B__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000E4000-000000067F000080000009800C00000E8000__000000BA2E67EA20", +"000000067F000080000009800C00000E8000-000000067F000080000009800C00000EC000__000000BA2E67EA20", +"000000067F000080000009800C00000EC000-000000067F000080000009800C00000F0000__000000BA2E67EA20", +"000000067F000080000009800C00000EC80B-000000067F000080000009800C00000F5F38__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000F0000-000000067F000080000009800C00000F4000__000000BA2E67EA20", +"000000067F000080000009800C00000F4000-000000067F000080000009800C00000F8000__000000BA2E67EA20", +"000000067F000080000009800C00000F5F38-000000067F000080000009800C00000FF69E__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000F8000-000000067F000080000009800C00000FC000__000000BA2E67EA20", 
+"000000067F000080000009800C00000FC000-000000067F000080000009800C0000100000__000000BA2E67EA20", +"000000067F000080000009800C00000FF69E-000000067F000080000009800C0000108DAF__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C0000100000-000000067F000080000009800C0000104000__000000BA2E67EA20", +"000000067F000080000009800C0000104000-000000067F000080000009800C0000108000__000000BA2E67EA20", +"000000067F000080000009800C0000108000-000000067F000080000009800C000010C000__000000BA2E67EA20", +"000000067F000080000009800C0000108DAF-000000067F000080000009800F0100000003__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C000010C000-000000067F000080000009800C0000110000__000000BA2E67EA20", +"000000067F000080000009800C000010EC71-000000067F000080000009801400000025C3__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F000080000009800C0000110000-030000000000000000000000000000000002__000000BA2E67EA20", +"000000067F000080000009801400000025C3-000000067F0000800000098014000000A4D3__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F0000800000098014000000A4D3-000000067F000080000009801400000123E4__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F000080000009801400000123E4-000000067F0000800000098014000001A2F3__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F0000800000098014000001A2F3-000000067F00008000000980140000022204__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F00008000000980140000022204-000000067F0000800000098014000002A114__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F0000800000098014000002A114-000000067F000080000009A00C0000004DB3__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F000080000009A00C0000000000-000000067F000080000009A00C0000004000__000000BCEF79BE90", +"000000067F000080000009A00C0000004000-000000067F000080000009A00C0000008000__000000BCEF79BE90", +"000000067F000080000009A00C0000004DB3-030000000000000000000000000000000002__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F000080000009A00C0000008000-000000067F000080000009A00C000000C000__000000BC59629F98", +"000000067F000080000009A00C0000008000-000000067F000080000009A00C000000C000__000000BD25E66810", +"000000067F000080000009A00C00000096E8-000000067F000080000009A00C0000012E0B__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C000000C000-000000067F000080000009A00C0000010000__000000BC59629F98", +"000000067F000080000009A00C000000C000-000000067F000080000009A00C0000010000__000000BD25E66810", +"000000067F000080000009A00C0000010000-000000067F000080000009A00C0000014000__000000BC59629F98", +"000000067F000080000009A00C0000010000-000000067F000080000009A00C0000014000__000000BD25E66810", +"000000067F000080000009A00C0000012E0B-000000067F000080000009A00C000001C571__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000014000-000000067F000080000009A00C0000018000__000000BC59629F98", +"000000067F000080000009A00C0000014000-000000067F000080000009A00C0000018000__000000BD25E66810", +"000000067F000080000009A00C0000018000-000000067F000080000009A00C000001C000__000000BC59629F98", +"000000067F000080000009A00C0000018000-000000067F000080000009A00C000001C000__000000BD25E66810", +"000000067F000080000009A00C000001C000-000000067F000080000009A00C0000020000__000000BC59629F98", +"000000067F000080000009A00C000001C000-000000067F000080000009A00C0000020000__000000BD25E66810", +"000000067F000080000009A00C000001C571-000000067F000080000009A00C0000025CD7__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000020000-000000067F000080000009A00C0000024000__000000BC59629F98", 
+"000000067F000080000009A00C0000020000-000000067F000080000009A00C0000024000__000000BD25E66810", +"000000067F000080000009A00C0000024000-000000067F000080000009A00C0000028000__000000BC59629F98", +"000000067F000080000009A00C0000024000-000000067F000080000009A00C0000028000__000000BD25E66810", +"000000067F000080000009A00C0000025CD7-000000067F000080000009A00C000002F40B__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000028000-000000067F000080000009A00C000002C000__000000BC59629F98", +"000000067F000080000009A00C0000028000-000000067F000080000009A00C000002C000__000000BD25E66810", +"000000067F000080000009A00C000002C000-000000067F000080000009A00C0000030000__000000BC59629F98", +"000000067F000080000009A00C000002C000-000000067F000080000009A00C0000030000__000000BD25E66810", +"000000067F000080000009A00C000002F40B-000000067F000080000009A00C0000038B1E__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000030000-000000067F000080000009A00C0000034000__000000BC59629F98", +"000000067F000080000009A00C0000030000-000000067F000080000009A00C0000034000__000000BD25E66810", +"000000067F000080000009A00C0000034000-000000067F000080000009A00C0000038000__000000BC59629F98", +"000000067F000080000009A00C0000034000-000000067F000080000009A00C0000038000__000000BD25E66810", +"000000067F000080000009A00C0000038000-000000067F000080000009A00C000003C000__000000BC59629F98", +"000000067F000080000009A00C0000038000-000000067F000080000009A00C000003C000__000000BD25E66810", +"000000067F000080000009A00C0000038B1E-000000067F000080000009A00C0000042284__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C000003C000-000000067F000080000009A00C0000040000__000000BC59629F98", +"000000067F000080000009A00C000003C000-000000067F000080000009A00C0000040000__000000BD25E66810", +"000000067F000080000009A00C0000040000-000000067F000080000009A00C0000044000__000000BC59629F98", +"000000067F000080000009A00C0000040000-000000067F000080000009A00C0000044000__000000BD25E66810", +"000000067F000080000009A00C0000042284-000000067F000080000009A00C000004B9EA__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000044000-000000067F000080000009A00C0000048000__000000BC59629F98", +"000000067F000080000009A00C0000044000-000000067F000080000009A00C0000048000__000000BD25E66810", +"000000067F000080000009A00C0000048000-000000067F000080000009A00C000004C000__000000BC59629F98", +"000000067F000080000009A00C0000048000-000000067F000080000009A00C000004C000__000000BD25E66810", +"000000067F000080000009A00C000004B9EA-000000067F000080000009A00C000005510B__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C000004C000-000000067F000080000009A00C0000050000__000000BC59629F98", +"000000067F000080000009A00C000004C000-000000067F000080000009A00C0000050000__000000BD25E66810", +"000000067F000080000009A00C0000050000-000000067F000080000009A00C0000054000__000000BC59629F98", +"000000067F000080000009A00C0000050000-000000067F000080000009A00C0000054000__000000BD25E66810", +"000000067F000080000009A00C0000054000-000000067F000080000009A00C0000058000__000000BC59629F98", +"000000067F000080000009A00C0000054000-000000067F000080000009A00C0000058000__000000BD25E66810", +"000000067F000080000009A00C000005510B-000000067F000080000009A00C000005E871__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000058000-000000067F000080000009A00C000005C000__000000BC59629F98", +"000000067F000080000009A00C0000058000-000000067F000080000009A00C000005C000__000000BD25E66810", 
+"000000067F000080000009A00C000005C000-000000067F000080000009A00C0000060000__000000BC59629F98", +"000000067F000080000009A00C000005C000-000000067F000080000009A00C0000060000__000000BD25E66810", +"000000067F000080000009A00C000005E871-000000067F000080000009A00C0000067F8B__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000060000-000000067F000080000009A00C0000064000__000000BC59629F98", +"000000067F000080000009A00C0000060000-000000067F000080000009A00C0000064000__000000BD25E66810", +"000000067F000080000009A00C0000064000-000000067F000080000009A00C0000068000__000000BC59629F98", +"000000067F000080000009A00C0000064000-000000067F000080000009A00C0000068000__000000BD25E66810", +"000000067F000080000009A00C0000067F8B-000000067F000080000009A00C00000716F1__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000068000-000000067F000080000009A00C000006C000__000000BC59629F98", +"000000067F000080000009A00C0000068000-000000067F000080000009A00C000006C000__000000BD25E66810", +"000000067F000080000009A00C000006C000-000000067F000080000009A00C0000070000__000000BC59629F98", +"000000067F000080000009A00C000006C000-000000067F000080000009A00C0000070000__000000BD25E66810", +"000000067F000080000009A00C0000070000-000000067F000080000009A00C0000074000__000000BC53F74828", +"000000067F000080000009A00C0000070000-000000067F000080000009A00C0000074000__000000BD25E66810", +"000000067F000080000009A00C00000716F1-000000067F000080000009A00C0100000000__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000071875-000000067F000080000009A00C000007AFDB__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C0000071F8D-000000067F000080000009A00C00000E4F8F__000000BCEF79BE91-000000BD263A5849", +"000000067F000080000009A00C0000074000-000000067F000080000009A00C0000078000__000000BC53F74828", +"000000067F000080000009A00C0000074000-000000067F000080000009A00C0000078000__000000BD25E66810", +"000000067F000080000009A00C0000078000-000000067F000080000009A00C000007C000__000000BC53F74828", +"000000067F000080000009A00C0000078000-000000067F000080000009A00C000007C000__000000BD25E66810", +"000000067F000080000009A00C00000794E0-000000067F000080000009A00C00000F2480__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A00C000007AFDB-000000067F000080000009A00C000008470A__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C000007C000-000000067F000080000009A00C0000080000__000000BC53F74828", +"000000067F000080000009A00C000007C000-000000067F000080000009A00C0000080000__000000BD25E66810", +"000000067F000080000009A00C0000080000-000000067F000080000009A00C0000084000__000000BC53F74828", +"000000067F000080000009A00C0000080000-000000067F000080000009A00C0000084000__000000BD25E66810", +"000000067F000080000009A00C0000084000-000000067F000080000009A00C0000088000__000000BC53F74828", +"000000067F000080000009A00C0000084000-000000067F000080000009A00C0000088000__000000BD25E66810", +"000000067F000080000009A00C000008470A-000000067F000080000009A00C000008DE70__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C0000088000-000000067F000080000009A00C000008C000__000000BC53F74828", +"000000067F000080000009A00C0000088000-000000067F000080000009A00C000008C000__000000BD25E66810", +"000000067F000080000009A00C000008C000-000000067F000080000009A00C0000090000__000000BC53F74828", +"000000067F000080000009A00C000008C000-000000067F000080000009A00C0000090000__000000BD25E66810", +"000000067F000080000009A00C000008DE70-000000067F000080000009A00C0000097590__000000BB4643FBD1-000000BBE607E8F1", 
+"000000067F000080000009A00C0000090000-000000067F000080000009A00C0000094000__000000BC53F74828", +"000000067F000080000009A00C0000090000-000000067F000080000009A00C0000094000__000000BD25E66810", +"000000067F000080000009A00C0000094000-000000067F000080000009A00C0000098000__000000BC53F74828", +"000000067F000080000009A00C0000094000-000000067F000080000009A00C0000098000__000000BD25E66810", +"000000067F000080000009A00C0000097590-000000067F000080000009A00C00000A0CF6__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C0000098000-000000067F000080000009A00C000009C000__000000BC53F74828", +"000000067F000080000009A00C0000098000-000000067F000080000009A00C000009C000__000000BD25E66810", +"000000067F000080000009A00C000009C000-000000067F000080000009A00C00000A0000__000000BC53F74828", +"000000067F000080000009A00C000009C000-000000067F000080000009A00C00000A0000__000000BD25E66810", +"000000067F000080000009A00C00000A0000-000000067F000080000009A00C00000A4000__000000BC53F74828", +"000000067F000080000009A00C00000A0000-000000067F000080000009A00C00000A4000__000000BD25E66810", +"000000067F000080000009A00C00000A0CF6-000000067F000080000009A00C00000AA40B__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C00000A4000-000000067F000080000009A00C00000A8000__000000BC53F74828", +"000000067F000080000009A00C00000A4000-000000067F000080000009A00C00000A8000__000000BD25E66810", +"000000067F000080000009A00C00000A8000-000000067F000080000009A00C00000AC000__000000BC53F74828", +"000000067F000080000009A00C00000A8000-000000067F000080000009A00C00000AC000__000000BD25E66810", +"000000067F000080000009A00C00000AA40B-000000067F000080000009A00C00000B3B4D__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C00000AC000-000000067F000080000009A00C00000B0000__000000BC53F74828", +"000000067F000080000009A00C00000AC000-000000067F000080000009A00C00000B0000__000000BD25E66810", +"000000067F000080000009A00C00000B0000-000000067F000080000009A00C00000B4000__000000BC53F74828", +"000000067F000080000009A00C00000B0000-000000067F000080000009A00C00000B4000__000000BD25E66810", +"000000067F000080000009A00C00000B3B4D-000000067F000080000009A00C00000BD2B3__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C00000B4000-000000067F000080000009A00C00000B8000__000000BC53F74828", +"000000067F000080000009A00C00000B4000-000000067F000080000009A00C00000B8000__000000BD25E66810", +"000000067F000080000009A00C00000B8000-000000067F000080000009A00C00000BC000__000000BC53F74828", +"000000067F000080000009A00C00000B8000-000000067F000080000009A00C00000BC000__000000BD25E66810", +"000000067F000080000009A00C00000BC000-000000067F000080000009A00C00000C0000__000000BC53F74828", +"000000067F000080000009A00C00000BC000-000000067F000080000009A00C00000C0000__000000BD25E66810", +"000000067F000080000009A00C00000BD2B3-000000067F000080000009A00C00000C69D9__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C00000C0000-000000067F000080000009A00C00000C4000__000000BC53F74828", +"000000067F000080000009A00C00000C0000-000000067F000080000009A00C00000C4000__000000BD25E66810", +"000000067F000080000009A00C00000C4000-000000067F000080000009A00C00000C8000__000000BC53F74828", +"000000067F000080000009A00C00000C4000-000000067F000080000009A00C00000C8000__000000BD25E66810", +"000000067F000080000009A00C00000C69D9-000000067F000080000009A00C00000D010C__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C00000C8000-000000067F000080000009A00C00000CC000__000000BC53F74828", 
+"000000067F000080000009A00C00000C8000-000000067F000080000009A00C00000CC000__000000BD25E66810", +"000000067F000080000009A00C00000CC000-000000067F000080000009A00C00000D0000__000000BC53F74828", +"000000067F000080000009A00C00000CC000-000000067F000080000009A00C00000D0000__000000BD25E66810", +"000000067F000080000009A00C00000D0000-000000067F000080000009A00C00000D4000__000000BC53F74828", +"000000067F000080000009A00C00000D0000-000000067F000080000009A00C00000D4000__000000BD25E66810", +"000000067F000080000009A00C00000D010C-000000067F000080000009A00C0100000000__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C00000D4000-000000067F000080000009A00C00000D8000__000000BC53F74828", +"000000067F000080000009A00C00000D4000-000000067F000080000009A00C00000D8000__000000BD25E66810", +"000000067F000080000009A00C00000D6C06-000000067F000080000009A00C00000E0166__000000BBE607E8F1-000000BC596B5D59", +"000000067F000080000009A00C00000D8000-000000067F000080000009A00C00000DC000__000000BC53F74828", +"000000067F000080000009A00C00000D8000-000000067F000080000009A00C00000DC000__000000BD25E66810", +"000000067F000080000009A00C00000DC000-000000067F000080000009A00C00000E0000__000000BC53F74828", +"000000067F000080000009A00C00000DC000-000000067F000080000009A00C00000E0000__000000BD25E66810", +"000000067F000080000009A00C00000E0000-000000067F000080000009A00C00000E4000__000000BC53F74828", +"000000067F000080000009A00C00000E0000-000000067F000080000009A00C00000E4000__000000BD25E66810", +"000000067F000080000009A00C00000E0166-000000067F000080000009A00C00000E96C9__000000BBE607E8F1-000000BC596B5D59", +"000000067F000080000009A00C00000E4000-000000067F000080000009A00C00000E8000__000000BC53F74828", +"000000067F000080000009A00C00000E4000-000000067F000080000009A00C00000E8000__000000BD25E66810", +"000000067F000080000009A00C00000E4F97-000000067F000080000009A0140000019842__000000BCEF79BE91-000000BD263A5849", +"000000067F000080000009A00C00000E8000-000000067F000080000009A00C00000EC000__000000BC53F74828", +"000000067F000080000009A00C00000E8000-000000067F000080000009A00C00000EC000__000000BD25E66810", +"000000067F000080000009A00C00000E96C9-000000067F000080000009A00C00000F2C2B__000000BBE607E8F1-000000BC596B5D59", +"000000067F000080000009A00C00000EC000-000000067F000080000009A00C00000F0000__000000BC53F74828", +"000000067F000080000009A00C00000EC000-000000067F000080000009A00C00000F0000__000000BD25E66810", +"000000067F000080000009A00C00000F0000-000000067F000080000009A00C00000F4000__000000BC53F74828", +"000000067F000080000009A00C00000F0000-000000067F000080000009A00C00000F4000__000000BD25E66810", +"000000067F000080000009A00C00000F248B-000000067F000080000009A0140000004031__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A00C00000F2C2B-000000067F000080000009A00C00000FC18E__000000BBE607E8F1-000000BC596B5D59", +"000000067F000080000009A00C00000F4000-000000067F000080000009A00C00000F8000__000000BC53F74828", +"000000067F000080000009A00C00000F4000-000000067F000080000009A00C00000F8000__000000BD25E66810", +"000000067F000080000009A00C00000F8000-000000067F000080000009A00C00000FC000__000000BC53F74828", +"000000067F000080000009A00C00000F8000-000000067F000080000009A00C00000FC000__000000BD25E66810", +"000000067F000080000009A00C00000FC000-000000067F000080000009A00C0000100000__000000BC53F74828", +"000000067F000080000009A00C00000FC000-000000067F000080000009A00C0000100000__000000BD25E66810", +"000000067F000080000009A00C00000FC18E-000000067F000080000009A00C00001056F2__000000BBE607E8F1-000000BC596B5D59", 
+"000000067F000080000009A00C0000100000-000000067F000080000009A00C0000104000__000000BC53F74828", +"000000067F000080000009A00C0000100000-000000067F000080000009A00C0000104000__000000BD25E66810", +"000000067F000080000009A00C0000104000-000000067F000080000009A00C0000108000__000000BC53F74828", +"000000067F000080000009A00C0000104000-000000067F000080000009A00C0000108000__000000BD25E66810", +"000000067F000080000009A00C00001056F2-000000067F000080000009A00C000010EC54__000000BBE607E8F1-000000BC596B5D59", +"000000067F000080000009A00C0000108000-000000067F000080000009A00C000010C000__000000BC53F74828", +"000000067F000080000009A00C0000108000-000000067F000080000009A00C000010C000__000000BD25E66810", +"000000067F000080000009A00C000010C000-000000067F000080000009A00C0000110000__000000BC53F74828", +"000000067F000080000009A00C000010C000-000000067F000080000009A00C0000110000__000000BD25E66810", +"000000067F000080000009A00C000010EC54-010000000000000001000000040000000020__000000BBE607E8F1-000000BC596B5D59", +"000000067F000080000009A00C0000110000-000000067F000080000009A0120100000000__000000BD25E66810", +"000000067F000080000009A00C0000110000-030000000000000000000000000000000002__000000BC53F74828", +"000000067F000080000009A0140000000000-000000067F000080000009A0140000004000__000000BD25E66810", +"000000067F000080000009A0140000004000-000000067F000080000009A0140000008000__000000BD25E66810", +"000000067F000080000009A0140000004031-000000067F000080000009A0140000009FC7__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A0140000008000-000000067F000080000009A014000000C000__000000BD25E66810", +"000000067F000080000009A0140000009FC7-000000067F000080000009A014000000FF53__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A014000000C000-000000067F000080000009A0140000010000__000000BD25E66810", +"000000067F000080000009A014000000FF53-000000067F000080000009A0140000015F1C__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A0140000010000-000000067F000080000009A0140000014000__000000BD25E66810", +"000000067F000080000009A0140000014000-000000067F000080000009A0140000018000__000000BD25E66810", +"000000067F000080000009A0140000015F1C-000000067F000080000009A014000001BED0__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A0140000018000-000000067F000080000009A014000001C000__000000BD25E66810", +"000000067F000080000009A0140000019844-030000000000000000000000000000000002__000000BCEF79BE91-000000BD263A5849", +"000000067F000080000009A014000001BED0-000000067F000080000009A0140000021E6C__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A014000001C000-000000067F000080000009A0140000020000__000000BD25E66810", +"000000067F000080000009A0140000020000-000000067F000080000009A0140000024000__000000BD25E66810", +"000000067F000080000009A0140000021E6C-000000067F000080000009A0140000027DB1__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A0140000024000-000000067F000080000009A0140000028000__000000BD25E66810", +"000000067F000080000009A0140000027DB1-000000067F000080000009A014000002DC9E__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A0140000028000-000000067F000080000009A014000002C000__000000BD25E66810", +"000000067F000080000009A014000002C000-030000000000000000000000000000000002__000000BD25E66810", +"000000067F000080000009A01400FFFFFFFF-030000000000000000000000000000000002__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009C00C0000000000-000000067F000080000009C00C0000004000__000000BEF683BFD0", +"000000067F000080000009C00C0000004000-000000067F000080000009C00C0000008000__000000BEF683BFD0", 
+"000000067F000080000009C00C0000008000-000000067F000080000009C00C000000C000__000000BEF683BFD0", +"000000067F000080000009C00C0000009749-000000067F000080000009C00C0000012EAF__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C000000C000-000000067F000080000009C00C0000010000__000000BEF683BFD0", +"000000067F000080000009C00C0000010000-000000067F000080000009C00C0000014000__000000BEF683BFD0", +"000000067F000080000009C00C0000012EAF-000000067F000080000009C00C000001C60B__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C0000014000-000000067F000080000009C00C0000018000__000000BEF683BFD0", +"000000067F000080000009C00C0000018000-000000067F000080000009C00C000001C000__000000BEF683BFD0", +"000000067F000080000009C00C000001C000-000000067F000080000009C00C0000020000__000000BEF683BFD0", +"000000067F000080000009C00C000001C60B-000000067F000080000009C00C0000025D39__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C0000020000-000000067F000080000009C00C0000024000__000000BEF683BFD0", +"000000067F000080000009C00C0000024000-000000067F000080000009C00C0000028000__000000BEF683BFD0", +"000000067F000080000009C00C0000025D39-000000067F000080000009C00C000002F49F__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C0000028000-000000067F000080000009C00C000002C000__000000BEF683BFD0", +"000000067F000080000009C00C000002C000-000000067F000080000009C00C0000030000__000000BEF683BFD0", +"000000067F000080000009C00C000002F49F-000000067F000080000009C00C0000038BB2__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C0000030000-000000067F000080000009C00C0000034000__000000BEF683BFD0", +"000000067F000080000009C00C0000034000-000000067F000080000009C00C0000038000__000000BEF683BFD0", +"000000067F000080000009C00C0000038000-000000067F000080000009C00C000003C000__000000BEF683BFD0", +"000000067F000080000009C00C0000038BB2-000000067F000080000009C00C0000042318__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C000003C000-000000067F000080000009C00C0000040000__000000BEF683BFD0", +"000000067F000080000009C00C0000040000-000000067F000080000009C00C0000044000__000000BEF683BFD0", +"000000067F000080000009C00C0000042318-000000067F000080000009C00C000004BA7E__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C0000044000-000000067F000080000009C00C0000048000__000000BEF683BFD0", +"000000067F000080000009C00C0000048000-000000067F000080000009C00C000004C000__000000BEF06884C8", +"000000067F000080000009C00C000004BA7E-030000000000000000000000000000000002__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C000004BAC3-000000067F000080000009C00C00000551F8__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C000004C000-000000067F000080000009C00C0000050000__000000BEF06884C8", +"000000067F000080000009C00C0000050000-000000067F000080000009C00C0000054000__000000BEF06884C8", +"000000067F000080000009C00C0000054000-000000067F000080000009C00C0000058000__000000BEF06884C8", +"000000067F000080000009C00C00000551F8-000000067F000080000009C00C000005E90C__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C0000058000-000000067F000080000009C00C000005C000__000000BEF06884C8", +"000000067F000080000009C00C000005C000-000000067F000080000009C00C0000060000__000000BEF06884C8", +"000000067F000080000009C00C000005E90C-000000067F000080000009C00C000006802C__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C0000060000-000000067F000080000009C00C0000064000__000000BEF06884C8", +"000000067F000080000009C00C0000064000-000000067F000080000009C00C0000068000__000000BEF06884C8", 
+"000000067F000080000009C00C0000068000-000000067F000080000009C00C000006C000__000000BEF06884C8", +"000000067F000080000009C00C000006802C-000000067F000080000009C00C0000071783__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C000006C000-000000067F000080000009C00C0000070000__000000BEF06884C8", +"000000067F000080000009C00C0000070000-000000067F000080000009C00C0000074000__000000BEF06884C8", +"000000067F000080000009C00C0000071783-000000067F000080000009C00C000007AEE9__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C0000074000-000000067F000080000009C00C0000078000__000000BEF06884C8", +"000000067F000080000009C00C0000078000-000000067F000080000009C00C000007C000__000000BEF06884C8", +"000000067F000080000009C00C000007AEE9-000000067F000080000009C00C000008460B__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C000007C000-000000067F000080000009C00C0000080000__000000BEF06884C8", +"000000067F000080000009C00C0000080000-000000067F000080000009C00C0000084000__000000BEF06884C8", +"000000067F000080000009C00C0000084000-000000067F000080000009C00C0000088000__000000BEF06884C8", +"000000067F000080000009C00C000008460B-000000067F000080000009C00C000008DD71__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C0000088000-000000067F000080000009C00C000008C000__000000BEF06884C8", +"000000067F000080000009C00C000008C000-000000067F000080000009C00C0000090000__000000BEF06884C8", +"000000067F000080000009C00C000008DD71-000000067F000080000009C00C00000974D7__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C0000090000-000000067F000080000009C00C0000094000__000000BEF06884C8", +"000000067F000080000009C00C0000094000-000000067F000080000009C00C0000098000__000000BEF06884C8", +"000000067F000080000009C00C00000974D7-000000067F000080000009C00C00000A0C0B__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C0000098000-000000067F000080000009C00C000009C000__000000BEF06884C8", +"000000067F000080000009C00C000009C000-000000067F000080000009C00C00000A0000__000000BEF06884C8", +"000000067F000080000009C00C00000A0000-000000067F000080000009C00C00000A4000__000000BEF06884C8", +"000000067F000080000009C00C00000A0C0B-000000067F000080000009C00C00000AA371__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C00000A4000-000000067F000080000009C00C00000A8000__000000BEF06884C8", +"000000067F000080000009C00C00000A8000-000000067F000080000009C00C00000AC000__000000BEF06884C8", +"000000067F000080000009C00C00000AA371-000000067F000080000009C00C0100000000__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C00000AC000-000000067F000080000009C00C00000B0000__000000BEF06884C8", +"000000067F000080000009C00C00000B0000-000000067F000080000009C00C00000B4000__000000BEF06884C8", +"000000067F000080000009C00C00000B2921-000000067F000080000009C00C00000BC087__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C00000B4000-000000067F000080000009C00C00000B8000__000000BEF06884C8", +"000000067F000080000009C00C00000B8000-000000067F000080000009C00C00000BC000__000000BEF06884C8", +"000000067F000080000009C00C00000BC000-000000067F000080000009C00C00000C0000__000000BEF06884C8", +"000000067F000080000009C00C00000BC087-000000067F000080000009C00C00000C57B8__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C00000C0000-000000067F000080000009C00C00000C4000__000000BEF06884C8", +"000000067F000080000009C00C00000C4000-000000067F000080000009C00C00000C8000__000000BEF06884C8", +"000000067F000080000009C00C00000C57B8-000000067F000080000009C00C00000CEF09__000000BE45CBFBB9-000000BEF5F47FD1", 
+"000000067F000080000009C00C00000C8000-000000067F000080000009C00C00000CC000__000000BEF06884C8", +"000000067F000080000009C00C00000CC000-000000067F000080000009C00C00000D0000__000000BEF06884C8", +"000000067F000080000009C00C00000CEF09-000000067F000080000009C00C00000D862B__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C00000D0000-000000067F000080000009C00C00000D4000__000000BEF06884C8", +"000000067F000080000009C00C00000D4000-000000067F000080000009C00C00000D8000__000000BEF06884C8", +"000000067F000080000009C00C00000D8000-000000067F000080000009C00C00000DC000__000000BEF06884C8", +"000000067F000080000009C00C00000D862B-000000067F000080000009C00C00000E1D7F__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C00000DC000-000000067F000080000009C00C00000E0000__000000BEF06884C8", +"000000067F000080000009C00C00000E0000-000000067F000080000009C00C00000E4000__000000BEF06884C8", +"000000067F000080000009C00C00000E1D7F-000000067F000080000009C00C00000EB4E5__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C00000E4000-000000067F000080000009C00C00000E8000__000000BEF06884C8", +"000000067F000080000009C00C00000E8000-000000067F000080000009C00C00000EC000__000000BEF06884C8", +"000000067F000080000009C00C00000EB4E5-000000067F000080000009C00C00000F4C0B__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C00000EC000-000000067F000080000009C00C00000F0000__000000BEF06884C8", +"000000067F000080000009C00C00000F0000-000000067F000080000009C00C00000F4000__000000BEF06884C8", +"000000067F000080000009C00C00000F4000-000000067F000080000009C00C00000F8000__000000BEF06884C8", +"000000067F000080000009C00C00000F4C0B-000000067F000080000009C00C00000FE371__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C00000F8000-000000067F000080000009C00C00000FC000__000000BEF06884C8", +"000000067F000080000009C00C00000FC000-000000067F000080000009C00C0000100000__000000BEF06884C8", +"000000067F000080000009C00C00000FE371-000000067F000080000009C00C0000107AD7__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C0000100000-000000067F000080000009C00C0000104000__000000BEF06884C8", +"000000067F000080000009C00C0000104000-000000067F000080000009C00C0000108000__000000BEF06884C8", +"000000067F000080000009C00C0000107AD7-000000067F000080000009C00C000011120B__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C0000108000-000000067F000080000009C00C000010C000__000000BEF06884C8", +"000000067F000080000009C00C000010C000-030000000000000000000000000000000002__000000BEF06884C8", +"000000067F000080000009C00C000011120B-010000000000000001000000050000000003__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009E00C0000000000-000000067F000080000009E00C0000004000__000000C0C9769FD8", +"000000067F000080000009E00C0000004000-000000067F000080000009E00C0000008000__000000C0C9769FD8", +"000000067F000080000009E00C0000004916-000000067F000080000009E00C000000E07C__000000BEF5F47FD1-000000BF48FFEB11", +"000000067F000080000009E00C0000008000-000000067F000080000009E00C000000C000__000000C0C9769FD8", +"000000067F000080000009E00C000000C000-000000067F000080000009E00C0000010000__000000C0C9769FD8", +"000000067F000080000009E00C000000E07C-000000067F000080000009E00C000001779A__000000BEF5F47FD1-000000BF48FFEB11", +"000000067F000080000009E00C0000010000-000000067F000080000009E00C0000014000__000000C0C9769FD8", +"000000067F000080000009E00C0000014000-000000067F000080000009E00C0000018000__000000C0C9769FD8", +"000000067F000080000009E00C000001779A-000000067F000080000009E00C0000020F00__000000BEF5F47FD1-000000BF48FFEB11", 
+"000000067F000080000009E00C0000018000-000000067F000080000009E00C000001C000__000000C0C9769FD8", +"000000067F000080000009E00C000001C000-000000067F000080000009E00C0000020000__000000C0C9769FD8", +"000000067F000080000009E00C0000020000-000000067F000080000009E00C0000024000__000000C0C9769FD8", +"000000067F000080000009E00C0000020F00-000000067F000080000009E00C000002A60B__000000BEF5F47FD1-000000BF48FFEB11", +"000000067F000080000009E00C0000024000-000000067F000080000009E00C0000028000__000000C0C9769FD8", +"000000067F000080000009E00C0000028000-000000067F000080000009E00C000002C000__000000C0C9769FD8", +"000000067F000080000009E00C000002A60B-030000000000000000000000000000000002__000000BEF5F47FD1-000000BF48FFEB11", +"000000067F000080000009E00C000002C000-000000067F000080000009E00C0000030000__000000C0B597E900", +"000000067F000080000009E00C000002C000-000000067F000080000009E00C0000030000__000000C1972392A8", +"000000067F000080000009E00C000002F506-000000067F000080000009E00C0000038C11__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C0000030000-000000067F000080000009E00C0000034000__000000C0B597E900", +"000000067F000080000009E00C0000030000-000000067F000080000009E00C0000034000__000000C1972392A8", +"000000067F000080000009E00C0000034000-000000067F000080000009E00C0000038000__000000C0B597E900", +"000000067F000080000009E00C0000034000-000000067F000080000009E00C0000038000__000000C1972392A8", +"000000067F000080000009E00C0000038000-000000067F000080000009E00C000003C000__000000C0B597E900", +"000000067F000080000009E00C0000038000-000000067F000080000009E00C000003C000__000000C1972392A8", +"000000067F000080000009E00C0000038C11-000000067F000080000009E00C0000042361__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C000003C000-000000067F000080000009E00C0000040000__000000C0B597E900", +"000000067F000080000009E00C000003C000-000000067F000080000009E00C0000040000__000000C1972392A8", +"000000067F000080000009E00C0000040000-000000067F000080000009E00C0000044000__000000C0B597E900", +"000000067F000080000009E00C0000040000-000000067F000080000009E00C0000044000__000000C1972392A8", +"000000067F000080000009E00C0000042361-000000067F000080000009E00C000004BAC7__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C0000044000-000000067F000080000009E00C0000048000__000000C0B597E900", +"000000067F000080000009E00C0000044000-000000067F000080000009E00C0000048000__000000C1972392A8", +"000000067F000080000009E00C0000048000-000000067F000080000009E00C000004C000__000000C0B597E900", +"000000067F000080000009E00C0000048000-000000067F000080000009E00C000004C000__000000C1972392A8", +"000000067F000080000009E00C000004BAC7-000000067F000080000009E00C00000551FC__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C000004C000-000000067F000080000009E00C0000050000__000000C0B597E900", +"000000067F000080000009E00C000004C000-000000067F000080000009E00C0000050000__000000C1972392A8", +"000000067F000080000009E00C0000050000-000000067F000080000009E00C0000054000__000000C0B597E900", +"000000067F000080000009E00C0000050000-000000067F000080000009E00C0000054000__000000C1972392A8", +"000000067F000080000009E00C0000050E89-000000067F000080000009E00C00000A18A0__000000C1426D92E1-000000C19744E959", +"000000067F000080000009E00C0000054000-000000067F000080000009E00C0000058000__000000C0B597E900", +"000000067F000080000009E00C0000054000-000000067F000080000009E00C0000058000__000000C1972392A8", +"000000067F000080000009E00C00000551FC-000000067F000080000009E00C000005E90B__000000BF48FFEB11-000000BFF8BDFEE9", 
+"000000067F000080000009E00C0000058000-000000067F000080000009E00C000005C000__000000C0B597E900", +"000000067F000080000009E00C0000058000-000000067F000080000009E00C000005C000__000000C1972392A8", +"000000067F000080000009E00C000005C000-000000067F000080000009E00C0000060000__000000C0B597E900", +"000000067F000080000009E00C000005C000-000000067F000080000009E00C0000060000__000000C1972392A8", +"000000067F000080000009E00C000005E90B-000000067F000080000009E00C000006802B__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C0000060000-000000067F000080000009E00C0000064000__000000C0B597E900", +"000000067F000080000009E00C0000060000-000000067F000080000009E00C0000064000__000000C1972392A8", +"000000067F000080000009E00C0000064000-000000067F000080000009E00C0000068000__000000C0B597E900", +"000000067F000080000009E00C0000064000-000000067F000080000009E00C0000068000__000000C1972392A8", +"000000067F000080000009E00C0000068000-000000067F000080000009E00C000006C000__000000C0B597E900", +"000000067F000080000009E00C0000068000-000000067F000080000009E00C000006C000__000000C1972392A8", +"000000067F000080000009E00C000006802B-000000067F000080000009E00C0000071782__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C000006C000-000000067F000080000009E00C0000070000__000000C0B597E900", +"000000067F000080000009E00C000006C000-000000067F000080000009E00C0000070000__000000C1972392A8", +"000000067F000080000009E00C0000070000-000000067F000080000009E00C0000074000__000000C0B597E900", +"000000067F000080000009E00C0000070000-000000067F000080000009E00C0000074000__000000C1972392A8", +"000000067F000080000009E00C0000071782-000000067F000080000009E00C000007AEE8__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C0000074000-000000067F000080000009E00C0000078000__000000C0B597E900", +"000000067F000080000009E00C0000074000-000000067F000080000009E00C0000078000__000000C1972392A8", +"000000067F000080000009E00C0000078000-000000067F000080000009E00C000007C000__000000C0B597E900", +"000000067F000080000009E00C0000078000-000000067F000080000009E00C000007C000__000000C1972392A8", +"000000067F000080000009E00C000007AEE8-000000067F000080000009E00C000008460B__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C000007C000-000000067F000080000009E00C0000080000__000000C0B597E900", +"000000067F000080000009E00C000007C000-000000067F000080000009E00C0000080000__000000C1972392A8", +"000000067F000080000009E00C0000080000-000000067F000080000009E00C0000084000__000000C0B597E900", +"000000067F000080000009E00C0000080000-000000067F000080000009E00C0000084000__000000C1972392A8", +"000000067F000080000009E00C0000084000-000000067F000080000009E00C0000088000__000000C0B597E900", +"000000067F000080000009E00C0000084000-000000067F000080000009E00C0000088000__000000C1972392A8", +"000000067F000080000009E00C000008460B-000000067F000080000009E00C000008DD71__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C0000088000-000000067F000080000009E00C000008C000__000000C0B597E900", +"000000067F000080000009E00C0000088000-000000067F000080000009E00C000008C000__000000C1972392A8", +"000000067F000080000009E00C000008C000-000000067F000080000009E00C0000090000__000000C0B597E900", +"000000067F000080000009E00C000008C000-000000067F000080000009E00C0000090000__000000C1972392A8", +"000000067F000080000009E00C000008DD71-000000067F000080000009E00C00000974D7__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C0000090000-000000067F000080000009E00C0000094000__000000C0B597E900", 
+"000000067F000080000009E00C0000090000-000000067F000080000009E00C0000094000__000000C1972392A8", +"000000067F000080000009E00C0000094000-000000067F000080000009E00C0000098000__000000C0B597E900", +"000000067F000080000009E00C0000094000-000000067F000080000009E00C0000098000__000000C1972392A8", +"000000067F000080000009E00C00000974D7-000000067F000080000009E00C0100000000__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C0000098000-000000067F000080000009E00C000009C000__000000C0B597E900", +"000000067F000080000009E00C0000098000-000000067F000080000009E00C000009C000__000000C1972392A8", +"000000067F000080000009E00C000009C000-000000067F000080000009E00C00000A0000__000000C0B597E900", +"000000067F000080000009E00C000009C000-000000067F000080000009E00C00000A0000__000000C1972392A8", +"000000067F000080000009E00C000009FB21-000000067F000080000009E00C00000A9230__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000A0000-000000067F000080000009E00C00000A4000__000000C0B597E900", +"000000067F000080000009E00C00000A0000-000000067F000080000009E00C00000A4000__000000C1972392A8", +"000000067F000080000009E00C00000A18A4-000000067F000080000009E00C00000F2B76__000000C1426D92E1-000000C19744E959", +"000000067F000080000009E00C00000A4000-000000067F000080000009E00C00000A8000__000000C0B597E900", +"000000067F000080000009E00C00000A4000-000000067F000080000009E00C00000A8000__000000C1972392A8", +"000000067F000080000009E00C00000A8000-000000067F000080000009E00C00000AC000__000000C0B597E900", +"000000067F000080000009E00C00000A8000-000000067F000080000009E00C00000AC000__000000C1972392A8", +"000000067F000080000009E00C00000A9230-000000067F000080000009E00C00000B297D__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000AC000-000000067F000080000009E00C00000B0000__000000C0B597E900", +"000000067F000080000009E00C00000AC000-000000067F000080000009E00C00000B0000__000000C1972392A8", +"000000067F000080000009E00C00000B0000-000000067F000080000009E00C00000B4000__000000C0B597E900", +"000000067F000080000009E00C00000B0000-000000067F000080000009E00C00000B4000__000000C1972392A8", +"000000067F000080000009E00C00000B297D-000000067F000080000009E00C00000BC0E3__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000B4000-000000067F000080000009E00C00000B8000__000000C0B597E900", +"000000067F000080000009E00C00000B4000-000000067F000080000009E00C00000B8000__000000C1972392A8", +"000000067F000080000009E00C00000B8000-000000067F000080000009E00C00000BC000__000000C0B597E900", +"000000067F000080000009E00C00000B8000-000000067F000080000009E00C00000BC000__000000C1972392A8", +"000000067F000080000009E00C00000BC000-000000067F000080000009E00C00000C0000__000000C0B597E900", +"000000067F000080000009E00C00000BC000-000000067F000080000009E00C00000C0000__000000C1972392A8", +"000000067F000080000009E00C00000BC0E3-000000067F000080000009E00C00000C580C__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000C0000-000000067F000080000009E00C00000C4000__000000C0B597E900", +"000000067F000080000009E00C00000C0000-000000067F000080000009E00C00000C4000__000000C1972392A8", +"000000067F000080000009E00C00000C0C74-000000067F000080000009E0140000001880__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E00C00000C4000-000000067F000080000009E00C00000C8000__000000C0B597E900", +"000000067F000080000009E00C00000C4000-000000067F000080000009E00C00000C8000__000000C1972392A8", +"000000067F000080000009E00C00000C580C-000000067F000080000009E00C00000CEF71__000000BFF8BDFEE9-000000C0C8CA5FF1", 
+"000000067F000080000009E00C00000C8000-000000067F000080000009E00C00000CC000__000000C0B597E900", +"000000067F000080000009E00C00000C8000-000000067F000080000009E00C00000CC000__000000C1972392A8", +"000000067F000080000009E00C00000CC000-000000067F000080000009E00C00000D0000__000000C0B597E900", +"000000067F000080000009E00C00000CC000-000000067F000080000009E00C00000D0000__000000C1972392A8", +"000000067F000080000009E00C00000CEF71-000000067F000080000009E00C00000D86D7__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000D0000-000000067F000080000009E00C00000D4000__000000C0B597E900", +"000000067F000080000009E00C00000D0000-000000067F000080000009E00C00000D4000__000000C1972392A8", +"000000067F000080000009E00C00000D4000-000000067F000080000009E00C00000D8000__000000C0B597E900", +"000000067F000080000009E00C00000D4000-000000067F000080000009E00C00000D8000__000000C1972392A8", +"000000067F000080000009E00C00000D8000-000000067F000080000009E00C00000DC000__000000C0B597E900", +"000000067F000080000009E00C00000D8000-000000067F000080000009E00C00000DC000__000000C1972392A8", +"000000067F000080000009E00C00000D86D7-000000067F000080000009E00C00000E1E0C__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000DC000-000000067F000080000009E00C00000E0000__000000C0B597E900", +"000000067F000080000009E00C00000DC000-000000067F000080000009E00C00000E0000__000000C1972392A8", +"000000067F000080000009E00C00000E0000-000000067F000080000009E00C00000E4000__000000C0B597E900", +"000000067F000080000009E00C00000E0000-000000067F000080000009E00C00000E4000__000000C1972392A8", +"000000067F000080000009E00C00000E1E0C-000000067F000080000009E00C00000EB572__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000E4000-000000067F000080000009E00C00000E8000__000000C0B597E900", +"000000067F000080000009E00C00000E4000-000000067F000080000009E00C00000E8000__000000C1972392A8", +"000000067F000080000009E00C00000E8000-000000067F000080000009E00C00000EC000__000000C0B597E900", +"000000067F000080000009E00C00000E8000-000000067F000080000009E00C00000EC000__000000C1972392A8", +"000000067F000080000009E00C00000EB572-000000067F000080000009E00C00000F4CD8__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000EC000-000000067F000080000009E00C00000F0000__000000C0B597E900", +"000000067F000080000009E00C00000EC000-000000067F000080000009E00C00000F0000__000000C1972392A8", +"000000067F000080000009E00C00000F0000-000000067F000080000009E00C00000F4000__000000C0B597E900", +"000000067F000080000009E00C00000F0000-000000067F000080000009E00C00000F4000__000000C1972392A8", +"000000067F000080000009E00C00000F2B77-000000067F000080000009E014000000D3EB__000000C1426D92E1-000000C19744E959", +"000000067F000080000009E00C00000F4000-000000067F000080000009E00C00000F8000__000000C0B597E900", +"000000067F000080000009E00C00000F4000-000000067F000080000009E00C00000F8000__000000C1972392A8", +"000000067F000080000009E00C00000F4CD8-000000067F000080000009E00C00000FE40B__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000F8000-000000067F000080000009E00C00000FC000__000000C0B597E900", +"000000067F000080000009E00C00000F8000-000000067F000080000009E00C00000FC000__000000C1972392A8", +"000000067F000080000009E00C00000FC000-000000067F000080000009E00C0000100000__000000C0B597E900", +"000000067F000080000009E00C00000FC000-000000067F000080000009E00C0000100000__000000C1972392A8", +"000000067F000080000009E00C00000FE40B-000000067F000080000009E00C0000107B27__000000BFF8BDFEE9-000000C0C8CA5FF1", 
+"000000067F000080000009E00C0000100000-000000067F000080000009E00C0000104000__000000C0B597E900", +"000000067F000080000009E00C0000100000-000000067F000080000009E00C0000104000__000000C1972392A8", +"000000067F000080000009E00C0000104000-000000067F000080000009E00C0000108000__000000C1972392A8", +"000000067F000080000009E00C0000104000-030000000000000000000000000000000002__000000C0B597E900", +"000000067F000080000009E00C0000107B27-000000067F000080000009E00C000011128D__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C0000108000-000000067F000080000009E00C000010C000__000000C1972392A8", +"000000067F000080000009E00C000010C000-000000067F000080000009E00C0000110000__000000C1972392A8", +"000000067F000080000009E00C0000110000-000000067F000080000009E0120100000000__000000C1972392A8", +"000000067F000080000009E00C000011128D-010000000000000001000000050000000003__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E0140000000000-000000067F000080000009E0140000004000__000000C1972392A8", +"000000067F000080000009E0140000001880-000000067F000080000009E014000000842E__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E0140000004000-000000067F000080000009E0140000008000__000000C1972392A8", +"000000067F000080000009E0140000008000-000000067F000080000009E014000000C000__000000C1972392A8", +"000000067F000080000009E014000000842E-000000067F000080000009E014000000F011__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E014000000C000-000000067F000080000009E0140000010000__000000C1972392A8", +"000000067F000080000009E014000000D3EB-000000067F000080000009E014000002578F__000000C1426D92E1-000000C19744E959", +"000000067F000080000009E014000000F011-000000067F000080000009E0140000015BD8__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E0140000010000-000000067F000080000009E0140000014000__000000C1972392A8", +"000000067F000080000009E0140000014000-000000067F000080000009E0140000018000__000000C1972392A8", +"000000067F000080000009E0140000015BD8-000000067F000080000009E014000001C7C5__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E0140000018000-000000067F000080000009E014000001C000__000000C1972392A8", +"000000067F000080000009E014000001C000-000000067F000080000009E0140000020000__000000C1972392A8", +"000000067F000080000009E014000001C7C5-000000067F000080000009E014000002337F__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E0140000020000-000000067F000080000009E0140000024000__000000C1972392A8", +"000000067F000080000009E014000002337F-000000067F000080000009E0140000029F4A__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E0140000024000-000000067F000080000009E0140000028000__000000C1972392A8", +"000000067F000080000009E0140000025790-030000000000000000000000000000000002__000000C1426D92E1-000000C19744E959", +"000000067F000080000009E0140000028000-000000067F000080000009E014000002C000__000000C1972392A8", +"000000067F000080000009E0140000029F4A-030000000000000000000000000000000002__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E014000002C000-030000000000000000000000000000000002__000000C1972392A8", +"000000067F00008000000A000C0000000000-000000067F00008000000A000C0000004000__000000C3687EDFE8", +"000000067F00008000000A000C0000004000-000000067F00008000000A000C0000008000__000000C3687EDFE8", +"000000067F00008000000A000C0000008000-000000067F00008000000A000C000000C000__000000C3687EDFE8", +"000000067F00008000000A000C0000008EF9-000000067F00008000000A000C000001260C__000000C19744E959-000000C217F3F379", 
+"000000067F00008000000A000C000000C000-000000067F00008000000A000C0000010000__000000C3687EDFE8", +"000000067F00008000000A000C0000010000-000000067F00008000000A000C0000014000__000000C3687EDFE8", +"000000067F00008000000A000C000001260C-000000067F00008000000A000C000001BD72__000000C19744E959-000000C217F3F379", +"000000067F00008000000A000C0000014000-000000067F00008000000A000C0000018000__000000C3687EDFE8", +"000000067F00008000000A000C0000018000-000000067F00008000000A000C000001C000__000000C3687EDFE8", +"000000067F00008000000A000C000001BD72-000000067F00008000000A000C00000254D8__000000C19744E959-000000C217F3F379", +"000000067F00008000000A000C000001C000-000000067F00008000000A000C0000020000__000000C3687EDFE8", +"000000067F00008000000A000C0000020000-000000067F00008000000A000C0000024000__000000C3687EDFE8", +"000000067F00008000000A000C0000024000-000000067F00008000000A000C0000028000__000000C3687EDFE8", +"000000067F00008000000A000C00000254D8-000000067F00008000000A000C000002EC0B__000000C19744E959-000000C217F3F379", +"000000067F00008000000A000C0000028000-000000067F00008000000A000C000002C000__000000C3687EDFE8", +"000000067F00008000000A000C000002C000-000000067F00008000000A000C0000030000__000000C3687EDFE8", +"000000067F00008000000A000C000002EC0B-000000067F00008000000A000C0000038322__000000C19744E959-000000C217F3F379", +"000000067F00008000000A000C0000030000-000000067F00008000000A000C0000034000__000000C3687EDFE8", +"000000067F00008000000A000C0000034000-000000067F00008000000A000C0000038000__000000C3687EDFE8", +"000000067F00008000000A000C0000038000-000000067F00008000000A000C000003C000__000000C3687EDFE8", +"000000067F00008000000A000C0000038322-000000067F00008000000A000C0000041A88__000000C19744E959-000000C217F3F379", +"000000067F00008000000A000C000003C000-000000067F00008000000A000C0000040000__000000C3687EDFE8", +"000000067F00008000000A000C0000040000-000000067F00008000000A000C0000044000__000000C3687EDFE8", +"000000067F00008000000A000C0000041A88-000000067F00008000000A000C000004B1EE__000000C19744E959-000000C217F3F379", +"000000067F00008000000A000C0000044000-000000067F00008000000A000C0000048000__000000C3687EDFE8", +"000000067F00008000000A000C0000048000-000000067F00008000000A000C000004C000__000000C366619FD8", +"000000067F00008000000A000C0000048000-000000067F00008000000A000C000004C000__000000C42FE73810", +"000000067F00008000000A000C000004B1EE-030000000000000000000000000000000002__000000C19744E959-000000C217F3F379", +"000000067F00008000000A000C000004BACE-000000067F00008000000A000C0000055202__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C000004C000-000000067F00008000000A000C0000050000__000000C366619FD8", +"000000067F00008000000A000C000004C000-000000067F00008000000A000C0000050000__000000C42FE73810", +"000000067F00008000000A000C0000050000-000000067F00008000000A000C0000054000__000000C366619FD8", +"000000067F00008000000A000C0000050000-000000067F00008000000A000C0000054000__000000C42FE73810", +"000000067F00008000000A000C0000054000-000000067F00008000000A000C0000058000__000000C366619FD8", +"000000067F00008000000A000C0000054000-000000067F00008000000A000C0000058000__000000C42FE73810", +"000000067F00008000000A000C0000055202-000000067F00008000000A000C000005E90D__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C0000056365-000000067F00008000000A000C00000ACA1A__000000C3E17E01A1-000000C430961E71", +"000000067F00008000000A000C0000058000-000000067F00008000000A000C000005C000__000000C366619FD8", +"000000067F00008000000A000C0000058000-000000067F00008000000A000C000005C000__000000C42FE73810", 
+"000000067F00008000000A000C000005C000-000000067F00008000000A000C0000060000__000000C366619FD8", +"000000067F00008000000A000C000005C000-000000067F00008000000A000C0000060000__000000C42FE73810", +"000000067F00008000000A000C000005E90D-000000067F00008000000A000C000006802B__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C0000060000-000000067F00008000000A000C0000064000__000000C366619FD8", +"000000067F00008000000A000C0000060000-000000067F00008000000A000C0000064000__000000C42FE73810", +"000000067F00008000000A000C0000064000-000000067F00008000000A000C0000068000__000000C366619FD8", +"000000067F00008000000A000C0000064000-000000067F00008000000A000C0000068000__000000C42FE73810", +"000000067F00008000000A000C0000068000-000000067F00008000000A000C000006C000__000000C366619FD8", +"000000067F00008000000A000C0000068000-000000067F00008000000A000C000006C000__000000C42FE73810", +"000000067F00008000000A000C000006802B-000000067F00008000000A000C0000071782__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C000006C000-000000067F00008000000A000C0000070000__000000C366619FD8", +"000000067F00008000000A000C000006C000-000000067F00008000000A000C0000070000__000000C42FE73810", +"000000067F00008000000A000C0000070000-000000067F00008000000A000C0000074000__000000C366619FD8", +"000000067F00008000000A000C0000070000-000000067F00008000000A000C0000074000__000000C42FE73810", +"000000067F00008000000A000C0000071782-000000067F00008000000A000C000007AEE8__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C0000074000-000000067F00008000000A000C0000078000__000000C366619FD8", +"000000067F00008000000A000C0000074000-000000067F00008000000A000C0000078000__000000C42FE73810", +"000000067F00008000000A000C0000078000-000000067F00008000000A000C000007C000__000000C366619FD8", +"000000067F00008000000A000C0000078000-000000067F00008000000A000C000007C000__000000C42FE73810", +"000000067F00008000000A000C000007AEE8-000000067F00008000000A000C000008460B__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C000007C000-000000067F00008000000A000C0000080000__000000C366619FD8", +"000000067F00008000000A000C000007C000-000000067F00008000000A000C0000080000__000000C42FE73810", +"000000067F00008000000A000C0000080000-000000067F00008000000A000C0000084000__000000C366619FD8", +"000000067F00008000000A000C0000080000-000000067F00008000000A000C0000084000__000000C42FE73810", +"000000067F00008000000A000C0000084000-000000067F00008000000A000C0000088000__000000C366619FD8", +"000000067F00008000000A000C0000084000-000000067F00008000000A000C0000088000__000000C42FE73810", +"000000067F00008000000A000C000008460B-000000067F00008000000A000C000008DD71__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C0000088000-000000067F00008000000A000C000008C000__000000C366619FD8", +"000000067F00008000000A000C0000088000-000000067F00008000000A000C000008C000__000000C42FE73810", +"000000067F00008000000A000C000008C000-000000067F00008000000A000C0000090000__000000C366619FD8", +"000000067F00008000000A000C000008C000-000000067F00008000000A000C0000090000__000000C42FE73810", +"000000067F00008000000A000C000008DD71-000000067F00008000000A000C00000974D7__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C0000090000-000000067F00008000000A000C0000094000__000000C366619FD8", +"000000067F00008000000A000C0000090000-000000067F00008000000A000C0000094000__000000C42FE73810", +"000000067F00008000000A000C0000094000-000000067F00008000000A000C0000098000__000000C366619FD8", 
+"000000067F00008000000A000C0000094000-000000067F00008000000A000C0000098000__000000C42FE73810", +"000000067F00008000000A000C00000974D7-000000067F00008000000A000C00000A0C0B__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C0000098000-000000067F00008000000A000C000009C000__000000C366619FD8", +"000000067F00008000000A000C0000098000-000000067F00008000000A000C000009C000__000000C42FE73810", +"000000067F00008000000A000C000009C000-000000067F00008000000A000C00000A0000__000000C366619FD8", +"000000067F00008000000A000C000009C000-000000067F00008000000A000C00000A0000__000000C42FE73810", +"000000067F00008000000A000C00000A0000-000000067F00008000000A000C00000A4000__000000C366619FD8", +"000000067F00008000000A000C00000A0000-000000067F00008000000A000C00000A4000__000000C42FE73810", +"000000067F00008000000A000C00000A0C0B-000000067F00008000000A000C00000AA371__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C00000A4000-000000067F00008000000A000C00000A8000__000000C366619FD8", +"000000067F00008000000A000C00000A4000-000000067F00008000000A000C00000A8000__000000C42FE73810", +"000000067F00008000000A000C00000A8000-000000067F00008000000A000C00000AC000__000000C366619FD8", +"000000067F00008000000A000C00000A8000-000000067F00008000000A000C00000AC000__000000C42FE73810", +"000000067F00008000000A000C00000AA371-000000067F00008000000A000C00000B3AD7__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C00000AC000-000000067F00008000000A000C00000B0000__000000C366619FD8", +"000000067F00008000000A000C00000AC000-000000067F00008000000A000C00000B0000__000000C42FE73810", +"000000067F00008000000A000C00000ACA25-000000067F00008000000A000C0000102D7C__000000C3E17E01A1-000000C430961E71", +"000000067F00008000000A000C00000B0000-000000067F00008000000A000C00000B4000__000000C366619FD8", +"000000067F00008000000A000C00000B0000-000000067F00008000000A000C00000B4000__000000C42FE73810", +"000000067F00008000000A000C00000B3AD7-000000067F00008000000A000C0100000000__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C00000B4000-000000067F00008000000A000C00000B8000__000000C366619FD8", +"000000067F00008000000A000C00000B4000-000000067F00008000000A000C00000B8000__000000C42FE73810", +"000000067F00008000000A000C00000B8000-000000067F00008000000A000C00000BC000__000000C366619FD8", +"000000067F00008000000A000C00000B8000-000000067F00008000000A000C00000BC000__000000C42FE73810", +"000000067F00008000000A000C00000B8B52-000000067F00008000000A00140000001132__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A000C00000BC000-000000067F00008000000A000C00000C0000__000000C366619FD8", +"000000067F00008000000A000C00000BC000-000000067F00008000000A000C00000C0000__000000C42FE73810", +"000000067F00008000000A000C00000BC072-000000067F00008000000A000C00000C57A3__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C00000C0000-000000067F00008000000A000C00000C4000__000000C366619FD8", +"000000067F00008000000A000C00000C0000-000000067F00008000000A000C00000C4000__000000C42FE73810", +"000000067F00008000000A000C00000C4000-000000067F00008000000A000C00000C8000__000000C366619FD8", +"000000067F00008000000A000C00000C4000-000000067F00008000000A000C00000C8000__000000C42FE73810", +"000000067F00008000000A000C00000C57A3-000000067F00008000000A000C00000CEF09__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C00000C8000-000000067F00008000000A000C00000CC000__000000C366619FD8", +"000000067F00008000000A000C00000C8000-000000067F00008000000A000C00000CC000__000000C42FE73810", 
+"000000067F00008000000A000C00000CC000-000000067F00008000000A000C00000D0000__000000C366619FD8", +"000000067F00008000000A000C00000CC000-000000067F00008000000A000C00000D0000__000000C42FE73810", +"000000067F00008000000A000C00000CEF09-000000067F00008000000A000C00000D862B__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C00000D0000-000000067F00008000000A000C00000D4000__000000C366619FD8", +"000000067F00008000000A000C00000D0000-000000067F00008000000A000C00000D4000__000000C42FE73810", +"000000067F00008000000A000C00000D4000-000000067F00008000000A000C00000D8000__000000C366619FD8", +"000000067F00008000000A000C00000D4000-000000067F00008000000A000C00000D8000__000000C42FE73810", +"000000067F00008000000A000C00000D8000-000000067F00008000000A000C00000DC000__000000C366619FD8", +"000000067F00008000000A000C00000D8000-000000067F00008000000A000C00000DC000__000000C42FE73810", +"000000067F00008000000A000C00000D862B-000000067F00008000000A000C00000E1D7F__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C00000DC000-000000067F00008000000A000C00000E0000__000000C366619FD8", +"000000067F00008000000A000C00000DC000-000000067F00008000000A000C00000E0000__000000C42FE73810", +"000000067F00008000000A000C00000E0000-000000067F00008000000A000C00000E4000__000000C366619FD8", +"000000067F00008000000A000C00000E0000-000000067F00008000000A000C00000E4000__000000C42FE73810", +"000000067F00008000000A000C00000E1D7F-000000067F00008000000A000C00000EB4E5__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C00000E4000-000000067F00008000000A000C00000E8000__000000C366619FD8", +"000000067F00008000000A000C00000E4000-000000067F00008000000A000C00000E8000__000000C42FE73810", +"000000067F00008000000A000C00000E8000-000000067F00008000000A000C00000EC000__000000C366619FD8", +"000000067F00008000000A000C00000E8000-000000067F00008000000A000C00000EC000__000000C42FE73810", +"000000067F00008000000A000C00000EB4E5-000000067F00008000000A000C00000F4C0B__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C00000EC000-000000067F00008000000A000C00000F0000__000000C366619FD8", +"000000067F00008000000A000C00000EC000-000000067F00008000000A000C00000F0000__000000C42FE73810", +"000000067F00008000000A000C00000F0000-000000067F00008000000A000C00000F4000__000000C366619FD8", +"000000067F00008000000A000C00000F0000-000000067F00008000000A000C00000F4000__000000C42FE73810", +"000000067F00008000000A000C00000F4000-000000067F00008000000A000C00000F8000__000000C366619FD8", +"000000067F00008000000A000C00000F4000-000000067F00008000000A000C00000F8000__000000C42FE73810", +"000000067F00008000000A000C00000F4C0B-000000067F00008000000A000C00000FE371__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C00000F8000-000000067F00008000000A000C00000FC000__000000C366619FD8", +"000000067F00008000000A000C00000F8000-000000067F00008000000A000C00000FC000__000000C42FE73810", +"000000067F00008000000A000C00000FC000-000000067F00008000000A000C0000100000__000000C366619FD8", +"000000067F00008000000A000C00000FC000-000000067F00008000000A000C0000100000__000000C42FE73810", +"000000067F00008000000A000C00000FE371-000000067F00008000000A000C0000107AD7__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C0000100000-000000067F00008000000A000C0000104000__000000C366619FD8", +"000000067F00008000000A000C0000100000-000000067F00008000000A000C0000104000__000000C42FE73810", +"000000067F00008000000A000C0000102D7F-000000067F00008000000A0014000001409C__000000C3E17E01A1-000000C430961E71", 
+"000000067F00008000000A000C0000104000-000000067F00008000000A000C0000108000__000000C366619FD8", +"000000067F00008000000A000C0000104000-000000067F00008000000A000C0000108000__000000C42FE73810", +"000000067F00008000000A000C0000107AD7-000000067F00008000000A000C000011120B__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C0000108000-000000067F00008000000A000C000010C000__000000C366619FD8", +"000000067F00008000000A000C0000108000-000000067F00008000000A000C000010C000__000000C42FE73810", +"000000067F00008000000A000C000010C000-000000067F00008000000A000C0000110000__000000C366619FD8", +"000000067F00008000000A000C000010C000-000000067F00008000000A000C0000110000__000000C42FE73810", +"000000067F00008000000A000C0000110000-000000067F00008000000A00120100000000__000000C42FE73810", +"000000067F00008000000A000C0000110000-030000000000000000000000000000000002__000000C366619FD8", +"000000067F00008000000A000C000011120B-010000000000000001000000050000000007__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A00140000000000-000000067F00008000000A00140000004000__000000C42FE73810", +"000000067F00008000000A00140000001132-000000067F00008000000A00140000007E49__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A00140000004000-000000067F00008000000A00140000008000__000000C42FE73810", +"000000067F00008000000A00140000007E49-000000067F00008000000A0014000000EBBC__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A00140000008000-000000067F00008000000A0014000000C000__000000C42FE73810", +"000000067F00008000000A0014000000C000-000000067F00008000000A00140000010000__000000C42FE73810", +"000000067F00008000000A0014000000EBBC-000000067F00008000000A00140000015925__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A00140000010000-000000067F00008000000A00140000014000__000000C42FE73810", +"000000067F00008000000A00140000014000-000000067F00008000000A00140000018000__000000C42FE73810", +"000000067F00008000000A0014000001409F-000000067F00008000000A0016000000020E__000000C3E17E01A1-000000C430961E71", +"000000067F00008000000A00140000015925-000000067F00008000000A0014000001C612__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A00140000018000-000000067F00008000000A0014000001C000__000000C42FE73810", +"000000067F00008000000A0014000001C000-000000067F00008000000A00140000020000__000000C42FE73810", +"000000067F00008000000A0014000001C612-000000067F00008000000A00140000023364__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A00140000020000-000000067F00008000000A00140000024000__000000C42FE73810", +"000000067F00008000000A00140000023364-000000067F00008000000A0014000002A070__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A00140000024000-000000067F00008000000A00140000028000__000000C42FE73810", +"000000067F00008000000A00140000028000-000000067F00008000000A0014000002C000__000000C42FE73810", +"000000067F00008000000A0014000002A070-030000000000000000000000000000000002__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A0014000002C000-030000000000000000000000000000000002__000000C42FE73810", +"000000067F00008000000A0016000000020E-030000000000000000000000000000000002__000000C3E17E01A1-000000C430961E71", +"000000067F00008000000A200C0000000000-000000067F00008000000A200C0000004000__000000C601294000", +"000000067F00008000000A200C0000004000-000000067F00008000000A200C0000008000__000000C601294000", +"000000067F00008000000A200C0000008000-000000067F00008000000A200C000000C000__000000C601294000", 
+"000000067F00008000000A200C0000009748-000000067F00008000000A200C0000012EAE__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C000000C000-000000067F00008000000A200C0000010000__000000C601294000", +"000000067F00008000000A200C0000010000-000000067F00008000000A200C0000014000__000000C601294000", +"000000067F00008000000A200C0000012EAE-000000067F00008000000A200C000001C60A__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C0000014000-000000067F00008000000A200C0000018000__000000C601294000", +"000000067F00008000000A200C0000018000-000000067F00008000000A200C000001C000__000000C601294000", +"000000067F00008000000A200C000001C000-000000067F00008000000A200C0000020000__000000C601294000", +"000000067F00008000000A200C000001C60A-000000067F00008000000A200C0000025D38__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C0000020000-000000067F00008000000A200C0000024000__000000C601294000", +"000000067F00008000000A200C0000024000-000000067F00008000000A200C0000028000__000000C601294000", +"000000067F00008000000A200C0000025D38-000000067F00008000000A200C000002F49E__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C0000028000-000000067F00008000000A200C000002C000__000000C601294000", +"000000067F00008000000A200C000002C000-000000067F00008000000A200C0000030000__000000C601294000", +"000000067F00008000000A200C000002F49E-000000067F00008000000A200C0000038BB1__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C0000030000-000000067F00008000000A200C0000034000__000000C601294000", +"000000067F00008000000A200C0000034000-000000067F00008000000A200C0000038000__000000C601294000", +"000000067F00008000000A200C0000038000-000000067F00008000000A200C000003C000__000000C601294000", +"000000067F00008000000A200C0000038BB1-000000067F00008000000A200C0000042317__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C000003C000-000000067F00008000000A200C0000040000__000000C601294000", +"000000067F00008000000A200C0000040000-000000067F00008000000A200C0000044000__000000C601294000", +"000000067F00008000000A200C0000042317-000000067F00008000000A200C000004BA7D__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C0000044000-000000067F00008000000A200C0000048000__000000C601294000", +"000000067F00008000000A200C0000048000-000000067F00008000000A200C000004C000__000000C601294000", +"000000067F00008000000A200C000004BA7D-000000067F00008000000A200C00000551B2__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C000004C000-000000067F00008000000A200C0000050000__000000C601294000", +"000000067F00008000000A200C0000050000-000000067F00008000000A200C0000054000__000000C601294000", +"000000067F00008000000A200C0000054000-000000067F00008000000A200C0000058000__000000C5FED35FC8", +"000000067F00008000000A200C0000054000-000000067F00008000000A200C0000058000__000000C6C7BD8140", +"000000067F00008000000A200C00000551B2-030000000000000000000000000000000002__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C0000055230-000000067F00008000000A200C000005E996__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C0000058000-000000067F00008000000A200C000005C000__000000C5FED35FC8", +"000000067F00008000000A200C0000058000-000000067F00008000000A200C000005C000__000000C6C7BD8140", +"000000067F00008000000A200C000005C000-000000067F00008000000A200C0000060000__000000C5FED35FC8", +"000000067F00008000000A200C000005C000-000000067F00008000000A200C0000060000__000000C6C7BD8140", +"000000067F00008000000A200C000005E996-000000067F00008000000A200C00000680FC__000000C4C05DDB29-000000C56021EB29", 
+"000000067F00008000000A200C0000060000-000000067F00008000000A200C0000064000__000000C5FED35FC8", +"000000067F00008000000A200C0000060000-000000067F00008000000A200C0000064000__000000C6C7BD8140", +"000000067F00008000000A200C0000064000-000000067F00008000000A200C0000068000__000000C5FED35FC8", +"000000067F00008000000A200C0000064000-000000067F00008000000A200C0000068000__000000C6C7BD8140", +"000000067F00008000000A200C00000677DB-000000067F00008000000A200C00000CF739__000000C689AF4AC1-000000C6C87B6329", +"000000067F00008000000A200C0000068000-000000067F00008000000A200C000006C000__000000C5FED35FC8", +"000000067F00008000000A200C0000068000-000000067F00008000000A200C000006C000__000000C6C7BD8140", +"000000067F00008000000A200C00000680FC-000000067F00008000000A200C000007180C__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C000006C000-000000067F00008000000A200C0000070000__000000C5FED35FC8", +"000000067F00008000000A200C000006C000-000000067F00008000000A200C0000070000__000000C6C7BD8140", +"000000067F00008000000A200C0000070000-000000067F00008000000A200C0000074000__000000C5FED35FC8", +"000000067F00008000000A200C0000070000-000000067F00008000000A200C0000074000__000000C6C7BD8140", +"000000067F00008000000A200C000007180C-000000067F00008000000A200C000007AF72__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C0000074000-000000067F00008000000A200C0000078000__000000C5FED35FC8", +"000000067F00008000000A200C0000074000-000000067F00008000000A200C0000078000__000000C6C7BD8140", +"000000067F00008000000A200C0000078000-000000067F00008000000A200C000007C000__000000C5FED35FC8", +"000000067F00008000000A200C0000078000-000000067F00008000000A200C000007C000__000000C6C7BD8140", +"000000067F00008000000A200C000007AF72-000000067F00008000000A200C00000846D8__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C000007C000-000000067F00008000000A200C0000080000__000000C5FED35FC8", +"000000067F00008000000A200C000007C000-000000067F00008000000A200C0000080000__000000C6C7BD8140", +"000000067F00008000000A200C0000080000-000000067F00008000000A200C0000084000__000000C5FED35FC8", +"000000067F00008000000A200C0000080000-000000067F00008000000A200C0000084000__000000C6C7BD8140", +"000000067F00008000000A200C0000084000-000000067F00008000000A200C0000088000__000000C5FED35FC8", +"000000067F00008000000A200C0000084000-000000067F00008000000A200C0000088000__000000C6C7BD8140", +"000000067F00008000000A200C00000846D8-000000067F00008000000A200C000008DE0B__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C0000088000-000000067F00008000000A200C000008C000__000000C5FED35FC8", +"000000067F00008000000A200C0000088000-000000067F00008000000A200C000008C000__000000C6C7BD8140", +"000000067F00008000000A200C000008C000-000000067F00008000000A200C0000090000__000000C5FED35FC8", +"000000067F00008000000A200C000008C000-000000067F00008000000A200C0000090000__000000C6C7BD8140", +"000000067F00008000000A200C000008DE0B-000000067F00008000000A200C000009752B__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C0000090000-000000067F00008000000A200C0000094000__000000C5FED35FC8", +"000000067F00008000000A200C0000090000-000000067F00008000000A200C0000094000__000000C6C7BD8140", +"000000067F00008000000A200C00000933F0-000000067F00008000000A200C0000110901__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A200C0000094000-000000067F00008000000A200C0000098000__000000C5FED35FC8", +"000000067F00008000000A200C0000094000-000000067F00008000000A200C0000098000__000000C6C7BD8140", 
+"000000067F00008000000A200C000009752B-000000067F00008000000A200C00000A0C91__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C0000098000-000000067F00008000000A200C000009C000__000000C5FED35FC8", +"000000067F00008000000A200C0000098000-000000067F00008000000A200C000009C000__000000C6C7BD8140", +"000000067F00008000000A200C000009C000-000000067F00008000000A200C00000A0000__000000C5FED35FC8", +"000000067F00008000000A200C000009C000-000000067F00008000000A200C00000A0000__000000C6C7BD8140", +"000000067F00008000000A200C00000A0000-000000067F00008000000A200C00000A4000__000000C5FED35FC8", +"000000067F00008000000A200C00000A0000-000000067F00008000000A200C00000A4000__000000C6C7BD8140", +"000000067F00008000000A200C00000A0C91-000000067F00008000000A200C00000AA3F7__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C00000A4000-000000067F00008000000A200C00000A8000__000000C5FED35FC8", +"000000067F00008000000A200C00000A4000-000000067F00008000000A200C00000A8000__000000C6C7BD8140", +"000000067F00008000000A200C00000A8000-000000067F00008000000A200C00000AC000__000000C5FED35FC8", +"000000067F00008000000A200C00000A8000-000000067F00008000000A200C00000AC000__000000C6C7BD8140", +"000000067F00008000000A200C00000AA3F7-000000067F00008000000A200C00000B3B0C__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C00000AC000-000000067F00008000000A200C00000B0000__000000C5FED35FC8", +"000000067F00008000000A200C00000AC000-000000067F00008000000A200C00000B0000__000000C6C7BD8140", +"000000067F00008000000A200C00000B0000-000000067F00008000000A200C00000B4000__000000C5FED35FC8", +"000000067F00008000000A200C00000B0000-000000067F00008000000A200C00000B4000__000000C6C7BD8140", +"000000067F00008000000A200C00000B3B0C-000000067F00008000000A200C0100000000__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C00000B4000-000000067F00008000000A200C00000B8000__000000C5FED35FC8", +"000000067F00008000000A200C00000B4000-000000067F00008000000A200C00000B8000__000000C6C7BD8140", +"000000067F00008000000A200C00000B8000-000000067F00008000000A200C00000BC000__000000C5FED35FC8", +"000000067F00008000000A200C00000B8000-000000067F00008000000A200C00000BC000__000000C6C7BD8140", +"000000067F00008000000A200C00000BBC1F-000000067F00008000000A200C00000C5353__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A200C00000BC000-000000067F00008000000A200C00000C0000__000000C5FED35FC8", +"000000067F00008000000A200C00000BC000-000000067F00008000000A200C00000C0000__000000C6C7BD8140", +"000000067F00008000000A200C00000C0000-000000067F00008000000A200C00000C4000__000000C5FED35FC8", +"000000067F00008000000A200C00000C0000-000000067F00008000000A200C00000C4000__000000C6C7BD8140", +"000000067F00008000000A200C00000C4000-000000067F00008000000A200C00000C8000__000000C5FED35FC8", +"000000067F00008000000A200C00000C4000-000000067F00008000000A200C00000C8000__000000C6C7BD8140", +"000000067F00008000000A200C00000C5353-000000067F00008000000A200C00000CEAB9__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A200C00000C8000-000000067F00008000000A200C00000CC000__000000C5FED35FC8", +"000000067F00008000000A200C00000C8000-000000067F00008000000A200C00000CC000__000000C6C7BD8140", +"000000067F00008000000A200C00000CC000-000000067F00008000000A200C00000D0000__000000C5FED35FC8", +"000000067F00008000000A200C00000CC000-000000067F00008000000A200C00000D0000__000000C6C7BD8140", +"000000067F00008000000A200C00000CEAB9-000000067F00008000000A200C00000D81D2__000000C56021EB29-000000C600A8FFF9", 
+"000000067F00008000000A200C00000CF742-000000067F00008000000A2014000000B47B__000000C689AF4AC1-000000C6C87B6329", +"000000067F00008000000A200C00000D0000-000000067F00008000000A200C00000D4000__000000C5FED35FC8", +"000000067F00008000000A200C00000D0000-000000067F00008000000A200C00000D4000__000000C6C7BD8140", +"000000067F00008000000A200C00000D4000-000000067F00008000000A200C00000D8000__000000C5FED35FC8", +"000000067F00008000000A200C00000D4000-000000067F00008000000A200C00000D8000__000000C6C7BD8140", +"000000067F00008000000A200C00000D8000-000000067F00008000000A200C00000DC000__000000C5FED35FC8", +"000000067F00008000000A200C00000D8000-000000067F00008000000A200C00000DC000__000000C6C7BD8140", +"000000067F00008000000A200C00000D81D2-000000067F00008000000A200C00000E190B__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A200C00000DC000-000000067F00008000000A200C00000E0000__000000C5FED35FC8", +"000000067F00008000000A200C00000DC000-000000067F00008000000A200C00000E0000__000000C6C7BD8140", +"000000067F00008000000A200C00000E0000-000000067F00008000000A200C00000E4000__000000C5FED35FC8", +"000000067F00008000000A200C00000E0000-000000067F00008000000A200C00000E4000__000000C6C7BD8140", +"000000067F00008000000A200C00000E190B-000000067F00008000000A200C00000EB071__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A200C00000E4000-000000067F00008000000A200C00000E8000__000000C5FED35FC8", +"000000067F00008000000A200C00000E4000-000000067F00008000000A200C00000E8000__000000C6C7BD8140", +"000000067F00008000000A200C00000E8000-000000067F00008000000A200C00000EC000__000000C5FED35FC8", +"000000067F00008000000A200C00000E8000-000000067F00008000000A200C00000EC000__000000C6C7BD8140", +"000000067F00008000000A200C00000EB071-000000067F00008000000A200C00000F47AC__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A200C00000EC000-000000067F00008000000A200C00000F0000__000000C5FED35FC8", +"000000067F00008000000A200C00000EC000-000000067F00008000000A200C00000F0000__000000C6C7BD8140", +"000000067F00008000000A200C00000F0000-000000067F00008000000A200C00000F4000__000000C5FED35FC8", +"000000067F00008000000A200C00000F0000-000000067F00008000000A200C00000F4000__000000C6C7BD8140", +"000000067F00008000000A200C00000F4000-000000067F00008000000A200C00000F8000__000000C5FED35FC8", +"000000067F00008000000A200C00000F4000-000000067F00008000000A200C00000F8000__000000C6C7BD8140", +"000000067F00008000000A200C00000F47AC-000000067F00008000000A200C00000FDF0A__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A200C00000F8000-000000067F00008000000A200C00000FC000__000000C5FED35FC8", +"000000067F00008000000A200C00000F8000-000000067F00008000000A200C00000FC000__000000C6C7BD8140", +"000000067F00008000000A200C00000FC000-000000067F00008000000A200C0000100000__000000C5FED35FC8", +"000000067F00008000000A200C00000FC000-000000067F00008000000A200C0000100000__000000C6C7BD8140", +"000000067F00008000000A200C00000FDF0A-000000067F00008000000A200C000010762B__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A200C0000100000-000000067F00008000000A200C0000104000__000000C5FED35FC8", +"000000067F00008000000A200C0000100000-000000067F00008000000A200C0000104000__000000C6C7BD8140", +"000000067F00008000000A200C0000104000-000000067F00008000000A200C0000108000__000000C5FED35FC8", +"000000067F00008000000A200C0000104000-000000067F00008000000A200C0000108000__000000C6C7BD8140", +"000000067F00008000000A200C000010762B-000000067F00008000000A200C0000110D88__000000C56021EB29-000000C600A8FFF9", 
+"000000067F00008000000A200C0000108000-000000067F00008000000A200C000010C000__000000C5FED35FC8", +"000000067F00008000000A200C0000108000-000000067F00008000000A200C000010C000__000000C6C7BD8140", +"000000067F00008000000A200C000010C000-000000067F00008000000A200C0000110000__000000C5FED35FC8", +"000000067F00008000000A200C000010C000-000000067F00008000000A200C0000110000__000000C6C7BD8140", +"000000067F00008000000A200C0000110000-000000067F00008000000A20120100000000__000000C6C7BD8140", +"000000067F00008000000A200C0000110000-030000000000000000000000000000000002__000000C5FED35FC8", +"000000067F00008000000A200C0000110901-000000067F00008000000A201400000047CD__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A200C0000110D88-01000000000000000100000005000000000A__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A20140000000000-000000067F00008000000A20140000004000__000000C6C7BD8140", +"000000067F00008000000A20140000004000-000000067F00008000000A20140000008000__000000C6C7BD8140", +"000000067F00008000000A201400000047CD-000000067F00008000000A2014000000ADA8__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A20140000008000-000000067F00008000000A2014000000C000__000000C6C7BD8140", +"000000067F00008000000A2014000000ADA8-000000067F00008000000A201400000113B8__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A2014000000B47C-010000000000000001000000050100000000__000000C689AF4AC1-000000C6C87B6329", +"000000067F00008000000A2014000000C000-000000067F00008000000A20140000010000__000000C6C7BD8140", +"000000067F00008000000A20140000010000-000000067F00008000000A20140000014000__000000C6C7BD8140", +"000000067F00008000000A201400000113B8-000000067F00008000000A20140000017969__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A20140000014000-000000067F00008000000A20140000018000__000000C6C7BD8140", +"000000067F00008000000A20140000017969-000000067F00008000000A2014000001DF7E__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A20140000018000-000000067F00008000000A2014000001C000__000000C6C7BD8140", +"000000067F00008000000A2014000001C000-000000067F00008000000A20140000020000__000000C6C7BD8140", +"000000067F00008000000A2014000001DF7E-000000067F00008000000A2014000002457D__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A20140000020000-000000067F00008000000A20140000024000__000000C6C7BD8140", +"000000067F00008000000A20140000024000-000000067F00008000000A20140000028000__000000C6C7BD8140", +"000000067F00008000000A2014000002457D-000000067F00008000000A2014000002AB1D__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A20140000028000-000000067F00008000000A2014000002C000__000000C6C7BD8140", +"000000067F00008000000A2014000002AB1D-030000000000000000000000000000000002__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A2014000002C000-030000000000000000000000000000000002__000000C6C7BD8140", +"000000067F00008000000A400C0000000000-000000067F00008000000A400C0000004000__000000C896B8DFD8", +"000000067F00008000000A400C0000004000-000000067F00008000000A400C0000008000__000000C896B8DFD8", +"000000067F00008000000A400C0000008000-000000067F00008000000A400C000000C000__000000C896B8DFD8", +"000000067F00008000000A400C0000009743-000000067F00008000000A400C0000012EA9__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C000000C000-000000067F00008000000A400C0000010000__000000C896B8DFD8", +"000000067F00008000000A400C0000010000-000000067F00008000000A400C0000014000__000000C896B8DFD8", 
+"000000067F00008000000A400C0000012EA9-000000067F00008000000A400C000001C60A__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C0000014000-000000067F00008000000A400C0000018000__000000C896B8DFD8", +"000000067F00008000000A400C0000018000-000000067F00008000000A400C000001C000__000000C896B8DFD8", +"000000067F00008000000A400C000001C000-000000067F00008000000A400C0000020000__000000C896B8DFD8", +"000000067F00008000000A400C000001C60A-000000067F00008000000A400C0000025D38__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C0000020000-000000067F00008000000A400C0000024000__000000C896B8DFD8", +"000000067F00008000000A400C0000024000-000000067F00008000000A400C0000028000__000000C896B8DFD8", +"000000067F00008000000A400C0000025D38-000000067F00008000000A400C000002F49E__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C0000028000-000000067F00008000000A400C000002C000__000000C896B8DFD8", +"000000067F00008000000A400C000002C000-000000067F00008000000A400C0000030000__000000C896B8DFD8", +"000000067F00008000000A400C000002F49E-000000067F00008000000A400C0000038BB1__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C0000030000-000000067F00008000000A400C0000034000__000000C896B8DFD8", +"000000067F00008000000A400C0000034000-000000067F00008000000A400C0000038000__000000C896B8DFD8", +"000000067F00008000000A400C0000038000-000000067F00008000000A400C000003C000__000000C896B8DFD8", +"000000067F00008000000A400C0000038BB1-000000067F00008000000A400C0000042317__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C000003C000-000000067F00008000000A400C0000040000__000000C896B8DFD8", +"000000067F00008000000A400C0000040000-000000067F00008000000A400C0000044000__000000C896B8DFD8", +"000000067F00008000000A400C0000042317-000000067F00008000000A400C000004BA7D__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C0000044000-000000067F00008000000A400C0000048000__000000C896B8DFD8", +"000000067F00008000000A400C0000048000-000000067F00008000000A400C000004C000__000000C896B8DFD8", +"000000067F00008000000A400C000004BA7D-030000000000000000000000000000000002__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C000004C000-000000067F00008000000A400C0000050000__000000C896B8DFD8", +"000000067F00008000000A400C0000050000-000000067F00008000000A400C0000054000__000000C896B8DFD8", +"000000067F00008000000A400C0000054000-000000067F00008000000A400C0000058000__000000C896B8DFD8", +"000000067F00008000000A400C00000551FC-000000067F00008000000A400C000005E90B__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C0000058000-000000067F00008000000A400C000005C000__000000C896B8DFD8", +"000000067F00008000000A400C000005C000-000000067F00008000000A400C0000060000__000000C896B8DFD8", +"000000067F00008000000A400C000005E90B-000000067F00008000000A400C000006802B__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C0000060000-000000067F00008000000A400C0000064000__000000C896B8DFD8", +"000000067F00008000000A400C0000064000-000000067F00008000000A400C0000068000__000000C896B8DFD8", +"000000067F00008000000A400C0000068000-000000067F00008000000A400C000006C000__000000C896B8DFD8", +"000000067F00008000000A400C000006802B-000000067F00008000000A400C0000071782__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C000006C000-000000067F00008000000A400C0000070000__000000C896B8DFD8", +"000000067F00008000000A400C0000070000-000000067F00008000000A400C0000074000__000000C896B8DFD8", +"000000067F00008000000A400C0000071782-000000067F00008000000A400C000007AEE8__000000C74849FAE1-000000C80801E859", 
+"000000067F00008000000A400C0000074000-000000067F00008000000A400C0000078000__000000C896B8DFD8", +"000000067F00008000000A400C0000078000-000000067F00008000000A400C000007C000__000000C896B8DFD8", +"000000067F00008000000A400C000007AEE8-000000067F00008000000A400C000008460B__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C000007C000-000000067F00008000000A400C0000080000__000000C896B8DFD8", +"000000067F00008000000A400C0000080000-000000067F00008000000A400C0000084000__000000C896B8DFD8", +"000000067F00008000000A400C0000084000-000000067F00008000000A400C0000088000__000000C896B8DFD8", +"000000067F00008000000A400C000008460B-000000067F00008000000A400C000008DD71__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C0000088000-000000067F00008000000A400C000008C000__000000C896B8DFD8", +"000000067F00008000000A400C000008C000-000000067F00008000000A400C0000090000__000000C896B8DFD8", +"000000067F00008000000A400C000008DD71-000000067F00008000000A400C00000974D7__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C0000090000-000000067F00008000000A400C0000094000__000000C896B8DFD8", +"000000067F00008000000A400C0000094000-000000067F00008000000A400C0000098000__000000C896B8DFD8", +"000000067F00008000000A400C00000974D7-000000067F00008000000A400C00000A0C0B__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C0000098000-000000067F00008000000A400C000009C000__000000C896B8DFD8", +"000000067F00008000000A400C000009C000-000000067F00008000000A400C00000A0000__000000C896B8DFD8", +"000000067F00008000000A400C00000A0000-000000067F00008000000A400C00000A4000__000000C896B8DFD8", +"000000067F00008000000A400C00000A0C0B-000000067F00008000000A400C00000AA371__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C00000A4000-000000067F00008000000A400C00000A8000__000000C896B8DFD8", +"000000067F00008000000A400C00000A8000-000000067F00008000000A400C00000AC000__000000C896B8DFD8", +"000000067F00008000000A400C00000AA371-000000067F00008000000A400C00000B3AD7__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C00000AC000-000000067F00008000000A400C00000B0000__000000C896B8DFD8", +"000000067F00008000000A400C00000B0000-000000067F00008000000A400C00000B4000__000000C896B8DFD8", +"000000067F00008000000A400C00000B3AD7-000000067F00008000000A400C00000BD20B__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C00000B4000-000000067F00008000000A400C00000B8000__000000C896B8DFD8", +"000000067F00008000000A400C00000B8000-000000067F00008000000A400C00000BC000__000000C896B8DFD8", +"000000067F00008000000A400C00000BC000-000000067F00008000000A400C00000C0000__000000C896B8DFD8", +"000000067F00008000000A400C00000BD20B-000000067F00008000000A400C0100000000__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C00000C0000-000000067F00008000000A400C00000C4000__000000C896B8DFD8", +"000000067F00008000000A400C00000C4000-000000067F00008000000A400C00000C8000__000000C896B8DFD8", +"000000067F00008000000A400C00000C4AE6-000000067F00008000000A400C00000CE20C__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C00000C8000-000000067F00008000000A400C00000CC000__000000C896B8DFD8", +"000000067F00008000000A400C00000CC000-000000067F00008000000A400C00000D0000__000000C896B8DFD8", +"000000067F00008000000A400C00000CE20C-000000067F00008000000A400C00000D7929__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C00000D0000-000000067F00008000000A400C00000D4000__000000C896B8DFD8", +"000000067F00008000000A400C00000D4000-000000067F00008000000A400C00000D8000__000000C896B8DFD8", 
+"000000067F00008000000A400C00000D7929-000000067F00008000000A400C00000E108F__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C00000D8000-000000067F00008000000A400C00000DC000__000000C896B8DFD8", +"000000067F00008000000A400C00000DC000-000000067F00008000000A400C00000E0000__000000C896B8DFD8", +"000000067F00008000000A400C00000E0000-000000067F00008000000A400C00000E4000__000000C896B8DFD8", +"000000067F00008000000A400C00000E108F-000000067F00008000000A400C00000EA7F5__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C00000E4000-000000067F00008000000A400C00000E8000__000000C896B8DFD8", +"000000067F00008000000A400C00000E8000-000000067F00008000000A400C00000EC000__000000C896B8DFD8", +"000000067F00008000000A400C00000EA7F5-000000067F00008000000A400C00000F3F0B__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C00000EC000-000000067F00008000000A400C00000F0000__000000C896B8DFD8", +"000000067F00008000000A400C00000F0000-000000067F00008000000A400C00000F4000__000000C896B8DFD8", +"000000067F00008000000A400C00000F3F0B-000000067F00008000000A400C00000FD671__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C00000F4000-000000067F00008000000A400C00000F8000__000000C896B8DFD8", +"000000067F00008000000A400C00000F8000-000000067F00008000000A400C00000FC000__000000C896B8DFD8", +"000000067F00008000000A400C00000FC000-000000067F00008000000A400C0000100000__000000C896B8DFD8", +"000000067F00008000000A400C00000FD671-000000067F00008000000A400C0000106D95__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C0000100000-000000067F00008000000A400C0000104000__000000C896B8DFD8", +"000000067F00008000000A400C0000104000-000000067F00008000000A400C0000108000__000000C896B8DFD8", +"000000067F00008000000A400C0000106D95-000000067F00008000000A400C00001104FB__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C0000107F8F-000000067F00008000000A40140000005626__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000A400C0000108000-000000067F00008000000A400C000010C000__000000C896B8DFD8", +"000000067F00008000000A400C000010C000-000000067F00008000000A400C0000110000__000000C896B8DFD8", +"000000067F00008000000A400C0000110000-030000000000000000000000000000000002__000000C896B8DFD8", +"000000067F00008000000A400C00001104FB-01000000000000000100000005000000000D__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A40140000005626-000000067F00008000000A4014000000C7F9__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000A4014000000C7F9-000000067F00008000000A401400000139F8__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000A401400000139F8-000000067F00008000000A4014000001ABE9__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000A4014000001ABE9-000000067F00008000000A40140000021DF4__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000A40140000021DF4-000000067F00008000000A40140000028FA9__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000A40140000028FA9-030000000000000000000000000000000002__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000A600C0000000000-000000067F00008000000A600C0000004000__000000CA2C877DC8", +"000000067F00008000000A600C0000000000-000000067F00008000000A600C0000004000__000000CB82C2FF68", +"000000067F00008000000A600C0000004000-000000067F00008000000A600C0000008000__000000CA2C877DC8", +"000000067F00008000000A600C0000004000-000000067F00008000000A600C0000008000__000000CB82C2FF68", +"000000067F00008000000A600C0000008000-000000067F00008000000A600C000000C000__000000CA2C877DC8", 
+"000000067F00008000000A600C0000008000-000000067F00008000000A600C000000C000__000000CB82C2FF68", +"000000067F00008000000A600C0000009746-000000067F00008000000A600C0000012EAC__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C000000C000-000000067F00008000000A600C0000010000__000000CA2C877DC8", +"000000067F00008000000A600C000000C000-000000067F00008000000A600C0000010000__000000CB82C2FF68", +"000000067F00008000000A600C0000010000-000000067F00008000000A600C0000014000__000000CA2C877DC8", +"000000067F00008000000A600C0000010000-000000067F00008000000A600C0000014000__000000CB82C2FF68", +"000000067F00008000000A600C0000012EAC-000000067F00008000000A600C000001C60A__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C0000014000-000000067F00008000000A600C0000018000__000000CA2C877DC8", +"000000067F00008000000A600C0000014000-000000067F00008000000A600C0000018000__000000CB82C2FF68", +"000000067F00008000000A600C0000018000-000000067F00008000000A600C000001C000__000000CA2C877DC8", +"000000067F00008000000A600C0000018000-000000067F00008000000A600C000001C000__000000CB82C2FF68", +"000000067F00008000000A600C000001C000-000000067F00008000000A600C0000020000__000000CA2C877DC8", +"000000067F00008000000A600C000001C000-000000067F00008000000A600C0000020000__000000CB82C2FF68", +"000000067F00008000000A600C000001C60A-000000067F00008000000A600C0000025D38__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C0000020000-000000067F00008000000A600C0000024000__000000CA2C877DC8", +"000000067F00008000000A600C0000020000-000000067F00008000000A600C0000024000__000000CB82C2FF68", +"000000067F00008000000A600C0000024000-000000067F00008000000A600C0000028000__000000CA2C877DC8", +"000000067F00008000000A600C0000024000-000000067F00008000000A600C0000028000__000000CB82C2FF68", +"000000067F00008000000A600C0000025D38-000000067F00008000000A600C000002F49E__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C0000028000-000000067F00008000000A600C000002C000__000000CA2C877DC8", +"000000067F00008000000A600C0000028000-000000067F00008000000A600C000002C000__000000CB82C2FF68", +"000000067F00008000000A600C000002C000-000000067F00008000000A600C0000030000__000000CA2C877DC8", +"000000067F00008000000A600C000002C000-000000067F00008000000A600C0000030000__000000CB82C2FF68", +"000000067F00008000000A600C000002F49E-000000067F00008000000A600C0000038BB1__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C0000030000-000000067F00008000000A600C0000034000__000000CA2C877DC8", +"000000067F00008000000A600C0000030000-000000067F00008000000A600C0000034000__000000CB82C2FF68", +"000000067F00008000000A600C0000034000-000000067F00008000000A600C0000038000__000000CA2C877DC8", +"000000067F00008000000A600C0000034000-000000067F00008000000A600C0000038000__000000CB82C2FF68", +"000000067F00008000000A600C0000038000-000000067F00008000000A600C000003C000__000000CA2C877DC8", +"000000067F00008000000A600C0000038000-000000067F00008000000A600C000003C000__000000CB82C2FF68", +"000000067F00008000000A600C0000038BB1-000000067F00008000000A600C0000042317__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C000003C000-000000067F00008000000A600C0000040000__000000CA2C877DC8", +"000000067F00008000000A600C000003C000-000000067F00008000000A600C0000040000__000000CB82C2FF68", +"000000067F00008000000A600C0000040000-000000067F00008000000A600C0000044000__000000CA2C877DC8", +"000000067F00008000000A600C0000040000-000000067F00008000000A600C0000044000__000000CB82C2FF68", 
+"000000067F00008000000A600C0000042317-000000067F00008000000A600C000004BA7D__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C0000044000-000000067F00008000000A600C0000048000__000000CA2C877DC8", +"000000067F00008000000A600C0000044000-000000067F00008000000A600C0000048000__000000CB82C2FF68", +"000000067F00008000000A600C0000048000-000000067F00008000000A600C000004C000__000000CA2C877DC8", +"000000067F00008000000A600C0000048000-000000067F00008000000A600C000004C000__000000CB82C2FF68", +"000000067F00008000000A600C000004BA7D-030000000000000000000000000000000002__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C000004C000-000000067F00008000000A600C0000050000__000000CA2C877DC8", +"000000067F00008000000A600C000004C000-000000067F00008000000A600C0000050000__000000CB82C2FF68", +"000000067F00008000000A600C0000050000-000000067F00008000000A600C0000054000__000000CA2C877DC8", +"000000067F00008000000A600C0000050000-000000067F00008000000A600C0000054000__000000CB82C2FF68", +"000000067F00008000000A600C0000054000-000000067F00008000000A600C0000058000__000000CA2C877DC8", +"000000067F00008000000A600C0000054000-000000067F00008000000A600C0000058000__000000CB82C2FF68", +"000000067F00008000000A600C0000054BFB-000000067F00008000000A600C000005E30C__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000058000-000000067F00008000000A600C000005C000__000000CA2C877DC8", +"000000067F00008000000A600C0000058000-000000067F00008000000A600C000005C000__000000CB82C2FF68", +"000000067F00008000000A600C000005C000-000000067F00008000000A600C0000060000__000000CA2C877DC8", +"000000067F00008000000A600C000005C000-000000067F00008000000A600C0000060000__000000CB82C2FF68", +"000000067F00008000000A600C000005E30C-000000067F00008000000A600C0000067A2B__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000060000-000000067F00008000000A600C0000064000__000000CA2C877DC8", +"000000067F00008000000A600C0000060000-000000067F00008000000A600C0000064000__000000CB82C2FF68", +"000000067F00008000000A600C0000064000-000000067F00008000000A600C0000068000__000000CA2C877DC8", +"000000067F00008000000A600C0000064000-000000067F00008000000A600C0000068000__000000CB82C2FF68", +"000000067F00008000000A600C0000067A2B-000000067F00008000000A600C0000071186__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000068000-000000067F00008000000A600C000006C000__000000CA2C877DC8", +"000000067F00008000000A600C0000068000-000000067F00008000000A600C000006C000__000000CB82C2FF68", +"000000067F00008000000A600C000006C000-000000067F00008000000A600C0000070000__000000CA2C877DC8", +"000000067F00008000000A600C000006C000-000000067F00008000000A600C0000070000__000000CB82C2FF68", +"000000067F00008000000A600C0000070000-000000067F00008000000A600C0000074000__000000CA2C877DC8", +"000000067F00008000000A600C0000070000-000000067F00008000000A600C0000074000__000000CB82C2FF68", +"000000067F00008000000A600C0000071186-000000067F00008000000A600C000007A8EC__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000074000-000000067F00008000000A600C0000078000__000000CA2C877DC8", +"000000067F00008000000A600C0000074000-000000067F00008000000A600C0000078000__000000CB82C2FF68", +"000000067F00008000000A600C0000078000-000000067F00008000000A600C000007C000__000000CA2C877DC8", +"000000067F00008000000A600C0000078000-000000067F00008000000A600C000007C000__000000CB82C2FF68", +"000000067F00008000000A600C000007A149-000000067F00008000000A600C00000F5F42__000000CB40C16489-000000CB82C37859", 
+"000000067F00008000000A600C000007A8EC-000000067F00008000000A600C000008400A__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C000007C000-000000067F00008000000A600C0000080000__000000CA2C877DC8", +"000000067F00008000000A600C000007C000-000000067F00008000000A600C0000080000__000000CB82C2FF68", +"000000067F00008000000A600C0000080000-000000067F00008000000A600C0000084000__000000CA2C877DC8", +"000000067F00008000000A600C0000080000-000000067F00008000000A600C0000084000__000000CB82C2FF68", +"000000067F00008000000A600C0000084000-000000067F00008000000A600C0000088000__000000CA2C877DC8", +"000000067F00008000000A600C0000084000-000000067F00008000000A600C0000088000__000000CB82C2FF68", +"000000067F00008000000A600C000008400A-000000067F00008000000A600C000008D770__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000088000-000000067F00008000000A600C000008C000__000000CA2C877DC8", +"000000067F00008000000A600C0000088000-000000067F00008000000A600C000008C000__000000CB82C2FF68", +"000000067F00008000000A600C000008C000-000000067F00008000000A600C0000090000__000000CA2C877DC8", +"000000067F00008000000A600C000008C000-000000067F00008000000A600C0000090000__000000CB82C2FF68", +"000000067F00008000000A600C000008D770-000000067F00008000000A600C0000096ED6__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000090000-000000067F00008000000A600C0000094000__000000CA2C877DC8", +"000000067F00008000000A600C0000090000-000000067F00008000000A600C0000094000__000000CB82C2FF68", +"000000067F00008000000A600C0000094000-000000067F00008000000A600C0000098000__000000CA2C877DC8", +"000000067F00008000000A600C0000094000-000000067F00008000000A600C0000098000__000000CB82C2FF68", +"000000067F00008000000A600C0000096ED6-000000067F00008000000A600C00000A060B__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000098000-000000067F00008000000A600C000009C000__000000CA2C877DC8", +"000000067F00008000000A600C0000098000-000000067F00008000000A600C000009C000__000000CB82C2FF68", +"000000067F00008000000A600C000009C000-000000067F00008000000A600C00000A0000__000000CA2C877DC8", +"000000067F00008000000A600C000009C000-000000067F00008000000A600C00000A0000__000000CB82C2FF68", +"000000067F00008000000A600C00000A0000-000000067F00008000000A600C00000A4000__000000CA2C877DC8", +"000000067F00008000000A600C00000A0000-000000067F00008000000A600C00000A4000__000000CB82C2FF68", +"000000067F00008000000A600C00000A060B-000000067F00008000000A600C00000A9D71__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000A4000-000000067F00008000000A600C00000A8000__000000CA2C877DC8", +"000000067F00008000000A600C00000A4000-000000067F00008000000A600C00000A8000__000000CB82C2FF68", +"000000067F00008000000A600C00000A8000-000000067F00008000000A600C00000AC000__000000CA2C877DC8", +"000000067F00008000000A600C00000A8000-000000067F00008000000A600C00000AC000__000000CB82C2FF68", +"000000067F00008000000A600C00000A9D71-000000067F00008000000A600C00000B34D7__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000AC000-000000067F00008000000A600C00000B0000__000000CB82C2FF68", +"000000067F00008000000A600C00000AC000-030000000000000000000000000000000002__000000CA2C877DC8", +"000000067F00008000000A600C00000B0000-000000067F00008000000A600C00000B4000__000000CB82C2FF68", +"000000067F00008000000A600C00000B34D7-000000067F00008000000A600C00000BCC0C__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000B4000-000000067F00008000000A600C00000B8000__000000CB82C2FF68", 
+"000000067F00008000000A600C00000B8000-000000067F00008000000A600C00000BC000__000000CB82C2FF68", +"000000067F00008000000A600C00000BC000-000000067F00008000000A600C00000C0000__000000CB82C2FF68", +"000000067F00008000000A600C00000BCC0C-000000067F00008000000A600C00000C6336__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000C0000-000000067F00008000000A600C00000C4000__000000CB82C2FF68", +"000000067F00008000000A600C00000C4000-000000067F00008000000A600C00000C8000__000000CB82C2FF68", +"000000067F00008000000A600C00000C6336-000000067F00008000000A600C00000CFA9C__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000C8000-000000067F00008000000A600C00000CC000__000000CB82C2FF68", +"000000067F00008000000A600C00000CC000-000000067F00008000000A600C00000D0000__000000CB82C2FF68", +"000000067F00008000000A600C00000CFA9C-000000067F00008000000A600C00000D91AB__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000D0000-000000067F00008000000A600C00000D4000__000000CB82C2FF68", +"000000067F00008000000A600C00000D4000-000000067F00008000000A600C00000D8000__000000CB82C2FF68", +"000000067F00008000000A600C00000D8000-000000067F00008000000A600C00000DC000__000000CB82C2FF68", +"000000067F00008000000A600C00000D91AB-000000067F00008000000A600C00000E2911__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000DC000-000000067F00008000000A600C00000E0000__000000CB82C2FF68", +"000000067F00008000000A600C00000E0000-000000067F00008000000A600C00000E4000__000000CB82C2FF68", +"000000067F00008000000A600C00000E2911-000000067F00008000000A600C00000EC077__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000E4000-000000067F00008000000A600C00000E8000__000000CB82C2FF68", +"000000067F00008000000A600C00000E8000-000000067F00008000000A600C00000EC000__000000CB82C2FF68", +"000000067F00008000000A600C00000EC000-000000067F00008000000A600C00000F0000__000000CB82C2FF68", +"000000067F00008000000A600C00000EC077-000000067F00008000000A600C00000F57A8__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000F0000-000000067F00008000000A600C00000F4000__000000CB82C2FF68", +"000000067F00008000000A600C00000F4000-000000067F00008000000A600C00000F8000__000000CB82C2FF68", +"000000067F00008000000A600C00000F57A8-000000067F00008000000A600C00000FEF0A__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000F5F4F-000000067F00008000000A60140000011158__000000CB40C16489-000000CB82C37859", +"000000067F00008000000A600C00000F8000-000000067F00008000000A600C00000FC000__000000CB82C2FF68", +"000000067F00008000000A600C00000FC000-000000067F00008000000A600C0000100000__000000CB82C2FF68", +"000000067F00008000000A600C00000FEF0A-000000067F00008000000A600C000010862B__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000100000-000000067F00008000000A600C0000104000__000000CB82C2FF68", +"000000067F00008000000A600C0000104000-000000067F00008000000A600C0000108000__000000CB82C2FF68", +"000000067F00008000000A600C0000108000-000000067F00008000000A600C000010C000__000000CB82C2FF68", +"000000067F00008000000A600C000010862B-000000067F00008000000A600C0000111C20__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C000010C000-000000067F00008000000A600C0000110000__000000CB82C2FF68", +"000000067F00008000000A600C0000110000-000000067F00008000000A60120100000000__000000CB82C2FF68", +"000000067F00008000000A600C00001117CB-000000067F00008000000A6014000000499B__000000CAD5D7FFF1-000000CB40C16489", 
+"000000067F00008000000A600C00FFFFFFFF-01000000000000000100000005000000000E__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A60140000000000-000000067F00008000000A60140000004000__000000CB82C2FF68", +"000000067F00008000000A60140000004000-000000067F00008000000A60140000008000__000000CB82C2FF68", +"000000067F00008000000A6014000000499B-000000067F00008000000A6014000000BD4E__000000CAD5D7FFF1-000000CB40C16489", +"000000067F00008000000A60140000008000-000000067F00008000000A6014000000C000__000000CB82C2FF68", +"000000067F00008000000A6014000000BD4E-000000067F00008000000A601400000130ED__000000CAD5D7FFF1-000000CB40C16489", +"000000067F00008000000A6014000000C000-000000067F00008000000A60140000010000__000000CB82C2FF68", +"000000067F00008000000A60140000010000-000000067F00008000000A60140000014000__000000CB82C2FF68", +"000000067F00008000000A60140000011159-000000067F00008000000A60140000029BB2__000000CB40C16489-000000CB82C37859", +"000000067F00008000000A601400000130ED-000000067F00008000000A6014000001A4BD__000000CAD5D7FFF1-000000CB40C16489", +"000000067F00008000000A60140000014000-000000067F00008000000A60140000018000__000000CB82C2FF68", +"000000067F00008000000A60140000018000-000000067F00008000000A6014000001C000__000000CB82C2FF68", +"000000067F00008000000A6014000001A4BD-000000067F00008000000A60140000021886__000000CAD5D7FFF1-000000CB40C16489", +"000000067F00008000000A6014000001C000-000000067F00008000000A60140000020000__000000CB82C2FF68", +"000000067F00008000000A60140000020000-000000067F00008000000A60140000024000__000000CB82C2FF68", +"000000067F00008000000A60140000021886-000000067F00008000000A60140000028C0A__000000CAD5D7FFF1-000000CB40C16489", +"000000067F00008000000A60140000024000-000000067F00008000000A60140000028000__000000CB82C2FF68", +"000000067F00008000000A60140000028000-000000067F00008000000A6014000002C000__000000CB82C2FF68", +"000000067F00008000000A60140000028C0A-030000000000000000000000000000000002__000000CAD5D7FFF1-000000CB40C16489", +"000000067F00008000000A60140000029BB2-030000000000000000000000000000000002__000000CB40C16489-000000CB82C37859", +"000000067F00008000000A6014000002C000-030000000000000000000000000000000002__000000CB82C2FF68", +"000000067F00008000000A800C0000000000-000000067F00008000000A800C0000004000__000000CD51009FE8", +"000000067F00008000000A800C0000004000-000000067F00008000000A800C0000008000__000000CD51009FE8", +"000000067F00008000000A800C0000008000-000000067F00008000000A800C000000C000__000000CD51009FE8", +"000000067F00008000000A800C0000009748-000000067F00008000000A800C0000012EAE__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C000000C000-000000067F00008000000A800C0000010000__000000CD51009FE8", +"000000067F00008000000A800C0000010000-000000067F00008000000A800C0000014000__000000CD51009FE8", +"000000067F00008000000A800C0000012EAE-000000067F00008000000A800C000001C60A__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C0000014000-000000067F00008000000A800C0000018000__000000CD51009FE8", +"000000067F00008000000A800C0000018000-000000067F00008000000A800C000001C000__000000CD51009FE8", +"000000067F00008000000A800C000001C000-000000067F00008000000A800C0000020000__000000CD51009FE8", +"000000067F00008000000A800C000001C60A-000000067F00008000000A800C0000025D38__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C0000020000-000000067F00008000000A800C0000024000__000000CD51009FE8", +"000000067F00008000000A800C0000024000-000000067F00008000000A800C0000028000__000000CD51009FE8", 
+"000000067F00008000000A800C0000025D38-000000067F00008000000A800C000002F49E__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C0000028000-000000067F00008000000A800C000002C000__000000CD51009FE8", +"000000067F00008000000A800C000002C000-000000067F00008000000A800C0000030000__000000CD51009FE8", +"000000067F00008000000A800C000002F49E-000000067F00008000000A800C0000038BB1__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C0000030000-000000067F00008000000A800C0000034000__000000CD51009FE8", +"000000067F00008000000A800C0000034000-000000067F00008000000A800C0000038000__000000CD51009FE8", +"000000067F00008000000A800C0000038000-000000067F00008000000A800C000003C000__000000CD51009FE8", +"000000067F00008000000A800C0000038BB1-000000067F00008000000A800C0000042317__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C000003C000-000000067F00008000000A800C0000040000__000000CD51009FE8", +"000000067F00008000000A800C0000040000-000000067F00008000000A800C0000044000__000000CD51009FE8", +"000000067F00008000000A800C0000042317-000000067F00008000000A800C000004BA7D__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C0000044000-000000067F00008000000A800C0000048000__000000CD51009FE8", +"000000067F00008000000A800C0000048000-000000067F00008000000A800C000004C000__000000CD51009FE8", +"000000067F00008000000A800C000004BA7D-000000067F00008000000A800C0000054CA0__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C000004C000-000000067F00008000000A800C0000050000__000000CD51009FE8", +"000000067F00008000000A800C0000050000-000000067F00008000000A800C0000054000__000000CD51009FE8", +"000000067F00008000000A800C0000054000-000000067F00008000000A800C0000058000__000000CD51009FE8", +"000000067F00008000000A800C0000054C9F-000000067F00008000000A800C000005E405__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C0000058000-000000067F00008000000A800C000005C000__000000CD51009FE8", +"000000067F00008000000A800C000005C000-000000067F00008000000A800C0000060000__000000CD51009FE8", +"000000067F00008000000A800C000005E405-000000067F00008000000A800C0000067B10__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C0000060000-000000067F00008000000A800C0000064000__000000CD51009FE8", +"000000067F00008000000A800C0000064000-000000067F00008000000A800C0000068000__000000CD51009FE8", +"000000067F00008000000A800C0000067B10-000000067F00008000000A800C0000071276__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C0000068000-000000067F00008000000A800C000006C000__000000CD51009FE8", +"000000067F00008000000A800C000006C000-000000067F00008000000A800C0000070000__000000CD51009FE8", +"000000067F00008000000A800C0000070000-000000067F00008000000A800C0000074000__000000CD51009FE8", +"000000067F00008000000A800C0000071276-000000067F00008000000A800C000007A9DC__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C0000074000-000000067F00008000000A800C0000078000__000000CD51009FE8", +"000000067F00008000000A800C0000078000-000000067F00008000000A800C000007C000__000000CD51009FE8", +"000000067F00008000000A800C000007A9DC-000000067F00008000000A800C000008410B__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C000007C000-000000067F00008000000A800C0000080000__000000CD51009FE8", +"000000067F00008000000A800C0000080000-000000067F00008000000A800C0000084000__000000CD51009FE8", +"000000067F00008000000A800C0000084000-000000067F00008000000A800C0000088000__000000CD51009FE8", +"000000067F00008000000A800C000008410B-000000067F00008000000A800C000008D871__000000CC11F5EDC9-000000CCB1B9E181", 
+"000000067F00008000000A800C0000088000-000000067F00008000000A800C000008C000__000000CD51009FE8", +"000000067F00008000000A800C000008C000-000000067F00008000000A800C0000090000__000000CD51009FE8", +"000000067F00008000000A800C000008D871-000000067F00008000000A800C0000096F94__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C0000090000-000000067F00008000000A800C0000094000__000000CD51009FE8", +"000000067F00008000000A800C0000094000-000000067F00008000000A800C0000098000__000000CD51009FE8", +"000000067F00008000000A800C0000096F94-000000067F00008000000A800C00000A06FA__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C0000098000-000000067F00008000000A800C000009C000__000000CD51009FE8", +"000000067F00008000000A800C000009C000-000000067F00008000000A800C00000A0000__000000CD51009FE8", +"000000067F00008000000A800C00000A0000-000000067F00008000000A800C00000A4000__000000CD51009FE8", +"000000067F00008000000A800C00000A06FA-000000067F00008000000A800C00000A9E0D__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C00000A4000-000000067F00008000000A800C00000A8000__000000CD51009FE8", +"000000067F00008000000A800C00000A8000-000000067F00008000000A800C00000AC000__000000CD51009FE8", +"000000067F00008000000A800C00000A9E0D-000000067F00008000000A800C00000B3553__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C00000AC000-000000067F00008000000A800C00000B0000__000000CD51009FE8", +"000000067F00008000000A800C00000B0000-000000067F00008000000A800C00000B4000__000000CD51009FE8", +"000000067F00008000000A800C00000B3553-000000067F00008000000A800C0100000000__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C00000B4000-000000067F00008000000A800C00000B8000__000000CD51009FE8", +"000000067F00008000000A800C00000B8000-000000067F00008000000A800C00000BC000__000000CD51009FE8", +"000000067F00008000000A800C00000BC000-000000067F00008000000A800C00000C0000__000000CD51009FE8", +"000000067F00008000000A800C00000BCB46-000000067F00008000000A800C00000C62AC__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00000C0000-000000067F00008000000A800C00000C4000__000000CD51009FE8", +"000000067F00008000000A800C00000C4000-000000067F00008000000A800C00000C8000__000000CD51009FE8", +"000000067F00008000000A800C00000C62AC-000000067F00008000000A800C00000CFA09__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00000C8000-000000067F00008000000A800C00000CC000__000000CD51009FE8", +"000000067F00008000000A800C00000CC000-000000067F00008000000A800C00000D0000__000000CD51009FE8", +"000000067F00008000000A800C00000CFA09-000000067F00008000000A800C00000D9118__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00000D0000-000000067F00008000000A800C00000D4000__000000CD51009FE8", +"000000067F00008000000A800C00000D4000-000000067F00008000000A800C00000D8000__000000CD51009FE8", +"000000067F00008000000A800C00000D8000-000000067F00008000000A800C00000DC000__000000CD51009FE8", +"000000067F00008000000A800C00000D9118-000000067F00008000000A800C00000E287E__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00000DC000-000000067F00008000000A800C00000E0000__000000CD51009FE8", +"000000067F00008000000A800C00000E0000-000000067F00008000000A800C00000E4000__000000CD51009FE8", +"000000067F00008000000A800C00000E287E-000000067F00008000000A800C00000EBFE4__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00000E4000-000000067F00008000000A800C00000E8000__000000CD51009FE8", +"000000067F00008000000A800C00000E8000-000000067F00008000000A800C00000EC000__000000CD51009FE8", 
+"000000067F00008000000A800C00000EBFE4-000000067F00008000000A800C00000F570B__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00000EC000-000000067F00008000000A800C00000F0000__000000CD51009FE8", +"000000067F00008000000A800C00000F0000-000000067F00008000000A800C00000F4000__000000CD51009FE8", +"000000067F00008000000A800C00000F4000-000000067F00008000000A800C00000F8000__000000CD51009FE8", +"000000067F00008000000A800C00000F570B-000000067F00008000000A800C00000FEE71__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00000F8000-000000067F00008000000A800C00000FC000__000000CD51009FE8", +"000000067F00008000000A800C00000FC000-000000067F00008000000A800C0000100000__000000CD51009FE8", +"000000067F00008000000A800C00000FEE71-000000067F00008000000A800C0000108587__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C0000100000-000000067F00008000000A800C0000104000__000000CD51009FE8", +"000000067F00008000000A800C0000104000-000000067F00008000000A800C0000108000__000000CD51009FE8", +"000000067F00008000000A800C0000108000-000000067F00008000000A800C000010C000__000000CD51009FE8", +"000000067F00008000000A800C0000108587-000000067F00008000000A800C0000111C20__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C000010C000-000000067F00008000000A800C0000110000__000000CD51009FE8", +"000000067F00008000000A800C0000110000-030000000000000000000000000000000002__000000CD51009FE8", +"000000067F00008000000A800C00FFFFFFFF-010000000000000001000000050000000011__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00FFFFFFFF-030000000000000000000000000000000002__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800F0200000000-000000067F00008000000A80140000007ADF__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000A80140000007ADF-000000067F00008000000A8014000000F7D0__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000A8014000000F7D0-000000067F00008000000A801400000176D0__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000A801400000176D0-000000067F00008000000A8014000001F5D2__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000A8014000001F5D2-000000067F00008000000A801400000274D5__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000A801400000274D5-000000067F00008000000AA00C0000001863__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000AA00C0000000000-000000067F00008000000AA00C0000004000__000000CF7E08BFD0", +"000000067F00008000000AA00C0000001863-000000067F00008000000AA00C000000AFC9__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000AA00C0000004000-000000067F00008000000AA00C0000008000__000000CF7E08BFD0", +"000000067F00008000000AA00C0000008000-000000067F00008000000AA00C000000C000__000000CF7E08BFD0", +"000000067F00008000000AA00C000000AFC9-030000000000000000000000000000000002__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000AA00C000000C000-000000067F00008000000AA00C0000010000__000000CF7E08BFD0", +"000000067F00008000000AA00C0000010000-000000067F00008000000AA00C0000014000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000126EC-000000067F00008000000AA00C000001BE0C__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000014000-000000067F00008000000AA00C0000018000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000018000-000000067F00008000000AA00C000001C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000001BE0C-000000067F00008000000AA00C000002553F__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C000001C000-000000067F00008000000AA00C0000020000__000000CF7B8D3FD0", 
+"000000067F00008000000AA00C0000020000-000000067F00008000000AA00C0000024000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000024000-000000067F00008000000AA00C0000028000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000002553F-000000067F00008000000AA00C000002ECA5__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000028000-000000067F00008000000AA00C000002C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000002C000-000000067F00008000000AA00C0000030000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000002ECA5-000000067F00008000000AA00C00000383BC__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000030000-000000067F00008000000AA00C0000034000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000034000-000000067F00008000000AA00C0000038000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000038000-000000067F00008000000AA00C000003C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000383BC-000000067F00008000000AA00C0000041B0A__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C000003C000-000000067F00008000000AA00C0000040000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000040000-000000067F00008000000AA00C0000044000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000041B0A-000000067F00008000000AA00C000004B270__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000044000-000000067F00008000000AA00C0000048000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000048000-000000067F00008000000AA00C000004C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000004B270-000000067F00008000000AA00C00000549AA__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C000004C000-000000067F00008000000AA00C0000050000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000050000-000000067F00008000000AA00C0000054000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000054000-000000067F00008000000AA00C0000058000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000549AA-000000067F00008000000AA00C000005E10B__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000058000-000000067F00008000000AA00C000005C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000005C000-000000067F00008000000AA00C0000060000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000005E10B-000000067F00008000000AA00C000006782C__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000060000-000000067F00008000000AA00C0000064000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000064000-000000067F00008000000AA00C0000068000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000006782C-000000067F00008000000AA00C0000070F88__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000068000-000000067F00008000000AA00C000006C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000006C000-000000067F00008000000AA00C0000070000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000070000-000000067F00008000000AA00C0000074000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000070F88-000000067F00008000000AA00C0100000000__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000074000-000000067F00008000000AA00C0000078000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000078000-000000067F00008000000AA00C000007C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000078E97-000000067F00008000000AA00C00000823F9__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C000007C000-000000067F00008000000AA00C0000080000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000080000-000000067F00008000000AA00C0000084000__000000CF7B8D3FD0", 
+"000000067F00008000000AA00C00000823F9-000000067F00008000000AA00C000008BA8A__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C0000084000-000000067F00008000000AA00C0000088000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000088000-000000067F00008000000AA00C000008C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000008BA8A-000000067F00008000000AA00C00000951BF__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C000008C000-000000067F00008000000AA00C0000090000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000090000-000000067F00008000000AA00C0000094000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000094000-000000067F00008000000AA00C0000098000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000951BF-000000067F00008000000AA00C000009E90A__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C0000098000-000000067F00008000000AA00C000009C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000009C000-000000067F00008000000AA00C00000A0000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000009E90A-000000067F00008000000AA00C00000A802B__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000A0000-000000067F00008000000AA00C00000A4000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000A4000-000000067F00008000000AA00C00000A8000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000A8000-000000067F00008000000AA00C00000AC000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000A802B-000000067F00008000000AA00C00000B1782__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000AC000-000000067F00008000000AA00C00000B0000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000B0000-000000067F00008000000AA00C00000B4000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000B1782-000000067F00008000000AA00C00000BAEE8__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000B4000-000000067F00008000000AA00C00000B8000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000B8000-000000067F00008000000AA00C00000BC000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000BAEE8-000000067F00008000000AA00C00000C460C__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000BC000-000000067F00008000000AA00C00000C0000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000C0000-000000067F00008000000AA00C00000C4000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000C4000-000000067F00008000000AA00C00000C8000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000C460C-000000067F00008000000AA00C00000CDD72__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000C8000-000000067F00008000000AA00C00000CC000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000CC000-000000067F00008000000AA00C00000D0000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000CDD72-000000067F00008000000AA00C00000D74D8__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000D0000-000000067F00008000000AA00C00000D4000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000D4000-000000067F00008000000AA00C00000D8000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000D74D8-000000067F00008000000AA00C00000E0C0B__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000D8000-000000067F00008000000AA00C00000DC000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000DC000-000000067F00008000000AA00C00000E0000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000E0000-000000067F00008000000AA00C00000E4000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000E0C0B-000000067F00008000000AA00C00000EA371__000000CE6C3FED31-000000CF7DC97FD1", 
+"000000067F00008000000AA00C00000E4000-000000067F00008000000AA00C00000E8000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000E8000-000000067F00008000000AA00C00000EC000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000EA371-000000067F00008000000AA00C00000F3AD7__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000EC000-000000067F00008000000AA00C00000F0000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000F0000-000000067F00008000000AA00C00000F4000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000F3AD7-000000067F00008000000AA00C00000FD20B__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000F4000-000000067F00008000000AA00C00000F8000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000F8000-000000067F00008000000AA00C00000FC000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000FC000-000000067F00008000000AA00C0000100000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000FD20B-000000067F00008000000AA00C0000106932__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C0000100000-000000067F00008000000AA00C0000104000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000104000-000000067F00008000000AA00C0000108000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000106932-000000067F00008000000AA00C0000110098__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C0000108000-000000067F00008000000AA00C000010C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000010C000-000000067F00008000000AA00C0000110000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000110000-030000000000000000000000000000000002__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000110098-010000000000000001000000050000000012__000000CE6C3FED31-000000CF7DC97FD1", +"010000000000000001000000000000000000-030000000000000000000000000000000002__000000A29F1D8950", +"030000000000000000000000000000000001-030000000000000000000000000000000002__000000C689AF4AC1-000000C6C87B6329", +]; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 3dd2f92b5e..47ef9284b8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -59,13 +59,14 @@ pub mod block_io; mod delta_layer; mod disk_btree; pub(crate) mod ephemeral_file; -mod filename; +pub mod filename; mod image_layer; mod inmemory_layer; -mod layer_map; +pub mod layer_map; + pub mod metadata; mod par_fsync; -mod storage_layer; +pub mod storage_layer; mod timeline; From 2418e726491991370f0fbefabad237109e7cc500 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 17 Oct 2022 16:51:01 +0300 Subject: [PATCH 0915/1022] Speed up layer_map::search, by remembering the "envelope" for each layer. Lookups in the R-tree call the "envelope" function for every comparison, and our envelope function isn't very cheap, so that overhead adds up. Create the envelope once, when the layer is inserted into the tree, and store it along with the layer. That uses some more memory per layer, but that's not very significant. Speeds up the search operation 2x --- pageserver/src/tenant/layer_map.rs | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 495833e3ae..9d914c1839 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -62,6 +62,8 @@ pub struct LayerMap { struct LayerRTreeObject { layer: Arc, + + envelope: AABB<[IntKey; 2]>, } // Representation of Key as numeric type. 
@@ -197,9 +199,16 @@ impl PartialEq for LayerRTreeObject { impl RTreeObject for LayerRTreeObject { type Envelope = AABB<[IntKey; 2]>; fn envelope(&self) -> Self::Envelope { - let key_range = self.layer.get_key_range(); - let lsn_range = self.layer.get_lsn_range(); - AABB::from_corners( + self.envelope + } +} + +impl LayerRTreeObject { + fn new(layer: Arc) -> Self { + let key_range = layer.get_key_range(); + let lsn_range = layer.get_lsn_range(); + + let envelope = AABB::from_corners( [ IntKey::from(key_range.start.to_i128()), IntKey::from(lsn_range.start.0 as i128), @@ -208,7 +217,8 @@ impl RTreeObject for LayerRTreeObject { IntKey::from(key_range.end.to_i128() - 1), IntKey::from(lsn_range.end.0 as i128 - 1), ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive - ) + ); + LayerRTreeObject { layer, envelope } } } @@ -338,7 +348,7 @@ impl LayerMap { if layer.get_key_range() == (Key::MIN..Key::MAX) { self.l0_delta_layers.push(layer.clone()); } - self.historic_layers.insert(LayerRTreeObject { layer }); + self.historic_layers.insert(LayerRTreeObject::new(layer)); NUM_ONDISK_LAYERS.inc(); } @@ -362,7 +372,7 @@ impl LayerMap { } assert!(self .historic_layers - .remove(&LayerRTreeObject { layer }) + .remove(&LayerRTreeObject::new(layer)) .is_some()); NUM_ONDISK_LAYERS.dec(); } From 59bc7e67e0e406576c0d1e670815253892a891bc Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 17 Oct 2022 16:54:04 +0300 Subject: [PATCH 0916/1022] Use an optimized version of amplify_num. Speeds up layer_map::search somewhat. I also opened a PR in the upstream rust-amplify repository with these changes, see https://github.com/rust-amplify/rust-amplify/pull/148. We can switch back to upstream version when that's merged. --- Cargo.lock | 3 +-- pageserver/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d02ec1f5a1..657baf5d80 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -40,8 +40,7 @@ dependencies = [ [[package]] name = "amplify_num" version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f27d3d00d3d115395a7a8a4dc045feb7aa82b641e485f7e15f4e67ac16f4f56d" +source = "git+https://github.com/hlinnaka/rust-amplify.git?branch=unsigned-int-perf#bd49b737c2e6e623ab8e9ba5ceaed5712d3a3940" [[package]] name = "android_system_properties" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 75aa6e93eb..2139e24ee2 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -56,7 +56,7 @@ fail = "0.5.0" git-version = "0.3.5" rstar = "0.9.3" num-traits = "0.2.15" -amplify_num = "0.4.1" +amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" } pageserver_api = { path = "../libs/pageserver_api" } postgres_ffi = { path = "../libs/postgres_ffi" } From 546e9bdbec4ebdabdf39ea5e70c625fb5a53ca4b Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 18 Oct 2022 15:52:15 +0300 Subject: [PATCH 0917/1022] Deploy storage into new account and migrate to management API v2 (#2619) Deploy storage into new account Migrate safekeeper and pageserver initialisation to management api v2 --- .github/ansible/deploy.yaml | 6 +-- .github/ansible/neon-stress.hosts.yaml | 13 +++--- .github/ansible/production.hosts.yaml | 12 ++--- .github/ansible/scripts/init_pageserver.sh | 9 ++-- .github/ansible/scripts/init_safekeeper.sh | 10 ++--- .github/ansible/ssm_config | 3 ++ .github/ansible/staging.hosts.yaml | 22 ++++----- .github/ansible/staging.us-east-2.hosts.yaml 
| 32 +++++++++++++ .github/ansible/systemd/pageserver.service | 2 +- .github/ansible/systemd/safekeeper.service | 4 +- .github/workflows/build_and_test.yml | 47 ++++++++++++++++++-- 11 files changed, 116 insertions(+), 44 deletions(-) create mode 100644 .github/ansible/ssm_config create mode 100644 .github/ansible/staging.us-east-2.hosts.yaml diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index bfd3fd123d..4adc685684 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -1,7 +1,7 @@ - name: Upload Neon binaries hosts: storage gather_facts: False - remote_user: admin + remote_user: "{{ remote_user }}" tasks: @@ -36,7 +36,7 @@ - name: Deploy pageserver hosts: pageservers gather_facts: False - remote_user: admin + remote_user: "{{ remote_user }}" tasks: @@ -124,7 +124,7 @@ - name: Deploy safekeeper hosts: safekeepers gather_facts: False - remote_user: admin + remote_user: "{{ remote_user }}" tasks: diff --git a/.github/ansible/neon-stress.hosts.yaml b/.github/ansible/neon-stress.hosts.yaml index d4c77e7ada..8afc9a5be8 100644 --- a/.github/ansible/neon-stress.hosts.yaml +++ b/.github/ansible/neon-stress.hosts.yaml @@ -12,19 +12,20 @@ storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" prefix_in_bucket: "{{ inventory_hostname }}" - + hostname_suffix: ".local" + remote_user: admin children: pageservers: hosts: neon-stress-ps-1: - console_region_id: 1 + console_region_id: aws-eu-west-1 neon-stress-ps-2: - console_region_id: 1 + console_region_id: aws-eu-west-1 safekeepers: hosts: neon-stress-sk-1: - console_region_id: 1 + console_region_id: aws-eu-west-1 neon-stress-sk-2: - console_region_id: 1 + console_region_id: aws-eu-west-1 neon-stress-sk-3: - console_region_id: 1 + console_region_id: aws-eu-west-1 diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml index c276ca3805..9f9b12d25d 100644 --- a/.github/ansible/production.hosts.yaml +++ b/.github/ansible/production.hosts.yaml @@ -12,20 +12,22 @@ storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" prefix_in_bucket: "{{ inventory_hostname }}" + hostname_suffix: ".local" + remote_user: admin children: pageservers: hosts: zenith-1-ps-2: - console_region_id: 1 + console_region_id: aws-us-west-2 zenith-1-ps-3: - console_region_id: 1 + console_region_id: aws-us-west-2 safekeepers: hosts: zenith-1-sk-1: - console_region_id: 1 + console_region_id: aws-us-west-2 zenith-1-sk-2: - console_region_id: 1 + console_region_id: aws-us-west-2 zenith-1-sk-3: - console_region_id: 1 + console_region_id: aws-us-west-2 diff --git a/.github/ansible/scripts/init_pageserver.sh b/.github/ansible/scripts/init_pageserver.sh index 1cbdd0db94..426925a837 100644 --- a/.github/ansible/scripts/init_pageserver.sh +++ b/.github/ansible/scripts/init_pageserver.sh @@ -12,18 +12,19 @@ cat <> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}' + PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" @@ -735,7 +735,46 @@ jobs: ssh-add ssh-key rm -f ssh-key ssh-key-cert.pub ansible-galaxy 
collection install sivel.toiletwater - ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml + ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} + rm -f neon_install.tar.gz .neon_current_version + + deploy-new: + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. + # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + if: | + (github.ref_name == 'main') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Redeploy + run: | + export DOCKER_TAG=${{needs.tag.outputs.build-tag}} + cd "$(pwd)/.github/ansible" + + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + ./get_binaries.sh + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + RELEASE=true ./get_binaries.sh + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}} rm -f neon_install.tar.gz .neon_current_version deploy-proxy: From 0cd2d91b9d114010b41a41baf736182160d3be04 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 18 Oct 2022 17:44:19 +0300 Subject: [PATCH 0918/1022] Fix deploy-new job by installing sivel.toiletwater (#2641) --- .github/workflows/build_and_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ad68b09832..d90455ccca 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -774,6 +774,7 @@ jobs: exit 1 fi + ansible-galaxy collection install sivel.toiletwater ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}} rm -f neon_install.tar.gz .neon_current_version From 41550ec8bfcb0fa558662012a4c0e7e44dfbc919 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 18 Oct 2022 17:00:06 +0300 Subject: [PATCH 0919/1022] Remove unnecessary indirections of libpqwalproposer functions In the Postgres backend, we cannot link directly with libpq (check the pgsql-hackers arhive for all kinds of fun that ensued when we tried to do that). Therefore, the libpq functions are used through the thin wrapper functions in libpqwalreceiver.so, and libpqwalreceiver.so is loaded dynamically. To hide the dynamic loading and make the calls look like regular functions, we use macros to hide the function pointers. We had inherited the same indirections in libpqwalproposer, but it's not needed since the neon extension is already a shared library that's loaded dynamically. There's no problem calling the functions directly there. Remove the indirections. 
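For illustration, the shape of this change (a table of function pointers that
is filled in at load time versus plain direct calls) can be sketched in Rust;
this is an analogue of the C code in pgxn/neon, not the code itself, and the
names are borrowed from the diff below only for readability:

struct Conn {
    last_error: String,
}

// Before: callers reach every libpq wrapper through a pointer that the
// dynamic-load hook filled in.
struct WalProposerFunctions {
    error_message: fn(&Conn) -> &str,
    flush: fn(&Conn) -> i32,
}

fn walprop_error_message(conn: &Conn) -> &str {
    &conn.last_error
}

fn walprop_flush(_conn: &Conn) -> i32 {
    0
}

static FUNCS: WalProposerFunctions = WalProposerFunctions {
    error_message: walprop_error_message,
    flush: walprop_flush,
};

fn main() {
    let conn = Conn { last_error: "none".to_string() };

    // Indirect style: an extra pointer load, hidden behind a macro in the C code.
    let _ = (FUNCS.error_message)(&conn);
    let _ = (FUNCS.flush)(&conn);

    // After the change, call sites simply name the function.
    let _ = walprop_error_message(&conn);
    let _ = walprop_flush(&conn);
}

The indirection only buys late binding, which matters for libpqwalreceiver
loaded through load_file(), but not for functions that already live in the
same shared library as their callers.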
--- pgxn/neon/libpqwalproposer.c | 99 +++++++++++------------------------- pgxn/neon/neon.c | 1 - pgxn/neon/neon.h | 1 - pgxn/neon/walproposer.c | 7 --- pgxn/neon/walproposer.h | 81 +++++------------------------ 5 files changed, 42 insertions(+), 147 deletions(-) diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c index 1f739f3722..6b1e6a8bcc 100644 --- a/pgxn/neon/libpqwalproposer.c +++ b/pgxn/neon/libpqwalproposer.c @@ -10,51 +10,12 @@ struct WalProposerConn PGconn *pg_conn; bool is_nonblocking; /* whether the connection is non-blocking */ char *recvbuf; /* last received data from - * libpqprop_async_read */ + * walprop_async_read */ }; -/* Prototypes for exported functions */ -static char *libpqprop_error_message(WalProposerConn * conn); -static WalProposerConnStatusType libpqprop_status(WalProposerConn * conn); -static WalProposerConn * libpqprop_connect_start(char *conninfo); -static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn * conn); -static bool libpqprop_send_query(WalProposerConn * conn, char *query); -static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn * conn); -static pgsocket libpqprop_socket(WalProposerConn * conn); -static int libpqprop_flush(WalProposerConn * conn); -static void libpqprop_finish(WalProposerConn * conn); -static PGAsyncReadResult libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount); -static PGAsyncWriteResult libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size); -static bool libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size); - -static WalProposerFunctionsType PQWalProposerFunctions = -{ - libpqprop_error_message, - libpqprop_status, - libpqprop_connect_start, - libpqprop_connect_poll, - libpqprop_send_query, - libpqprop_get_query_result, - libpqprop_socket, - libpqprop_flush, - libpqprop_finish, - libpqprop_async_read, - libpqprop_async_write, - libpqprop_blocking_write, -}; - -/* Module initialization */ -void -pg_init_libpqwalproposer(void) -{ - if (WalProposerFunctions != NULL) - elog(ERROR, "libpqwalproposer already loaded"); - WalProposerFunctions = &PQWalProposerFunctions; -} - /* Helper function */ static bool -ensure_nonblocking_status(WalProposerConn * conn, bool is_nonblocking) +ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking) { /* If we're already correctly blocking or nonblocking, all good */ if (is_nonblocking == conn->is_nonblocking) @@ -69,14 +30,14 @@ ensure_nonblocking_status(WalProposerConn * conn, bool is_nonblocking) } /* Exported function definitions */ -static char * -libpqprop_error_message(WalProposerConn * conn) +char * +walprop_error_message(WalProposerConn *conn) { return PQerrorMessage(conn->pg_conn); } -static WalProposerConnStatusType -libpqprop_status(WalProposerConn * conn) +WalProposerConnStatusType +walprop_status(WalProposerConn *conn) { switch (PQstatus(conn->pg_conn)) { @@ -89,8 +50,8 @@ libpqprop_status(WalProposerConn * conn) } } -static WalProposerConn * -libpqprop_connect_start(char *conninfo) +WalProposerConn * +walprop_connect_start(char *conninfo) { WalProposerConn *conn; PGconn *pg_conn; @@ -119,8 +80,8 @@ libpqprop_connect_start(char *conninfo) return conn; } -static WalProposerConnectPollStatusType -libpqprop_connect_poll(WalProposerConn * conn) +WalProposerConnectPollStatusType +walprop_connect_poll(WalProposerConn *conn) { WalProposerConnectPollStatusType return_val; @@ -160,8 +121,8 @@ libpqprop_connect_poll(WalProposerConn * conn) return 
return_val; } -static bool -libpqprop_send_query(WalProposerConn * conn, char *query) +bool +walprop_send_query(WalProposerConn *conn, char *query) { /* * We need to be in blocking mode for sending the query to run without @@ -177,8 +138,8 @@ libpqprop_send_query(WalProposerConn * conn, char *query) return true; } -static WalProposerExecStatusType -libpqprop_get_query_result(WalProposerConn * conn) +WalProposerExecStatusType +walprop_get_query_result(WalProposerConn *conn) { PGresult *result; WalProposerExecStatusType return_val; @@ -255,20 +216,20 @@ libpqprop_get_query_result(WalProposerConn * conn) return return_val; } -static pgsocket -libpqprop_socket(WalProposerConn * conn) +pgsocket +walprop_socket(WalProposerConn *conn) { return PQsocket(conn->pg_conn); } -static int -libpqprop_flush(WalProposerConn * conn) +int +walprop_flush(WalProposerConn *conn) { return (PQflush(conn->pg_conn)); } -static void -libpqprop_finish(WalProposerConn * conn) +void +walprop_finish(WalProposerConn *conn) { if (conn->recvbuf != NULL) PQfreemem(conn->recvbuf); @@ -282,8 +243,8 @@ libpqprop_finish(WalProposerConn * conn) * On success, the data is placed in *buf. It is valid until the next call * to this function. */ -static PGAsyncReadResult -libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount) +PGAsyncReadResult +walprop_async_read(WalProposerConn *conn, char **buf, int *amount) { int result; @@ -353,8 +314,8 @@ libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount) } } -static PGAsyncWriteResult -libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size) +PGAsyncWriteResult +walprop_async_write(WalProposerConn *conn, void const *buf, size_t size) { int result; @@ -408,8 +369,12 @@ libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size) } } -static bool -libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size) +/* + * This function is very similar to walprop_async_write. For more + * information, refer to the comments there. + */ +bool +walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size) { int result; @@ -417,10 +382,6 @@ libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size) if (!ensure_nonblocking_status(conn, false)) return false; - /* - * Ths function is very similar to libpqprop_async_write. 
For more - * information, refer to the comments there - */ if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1) return false; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 2a2a163ee8..5c98902554 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -32,7 +32,6 @@ void _PG_init(void) { pg_init_libpagestore(); - pg_init_libpqwalproposer(); pg_init_walproposer(); EmitWarningsOnPlaceholders("neon"); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index dad9c1b508..6b9ba372fb 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -13,7 +13,6 @@ #define NEON_H extern void pg_init_libpagestore(void); -extern void pg_init_libpqwalproposer(void); extern void pg_init_walproposer(void); #endif /* NEON_H */ diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index ff37be2de1..29290fa736 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -79,9 +79,6 @@ bool am_wal_proposer; char *neon_timeline_walproposer = NULL; char *neon_tenant_walproposer = NULL; -/* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ -WalProposerFunctionsType *WalProposerFunctions = NULL; - #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" static int n_safekeepers = 0; @@ -438,10 +435,6 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) char *sep; char *port; - /* Load the libpq-specific functions */ - if (WalProposerFunctions == NULL) - elog(ERROR, "libpqwalproposer didn't initialize correctly"); - load_file("libpqwalreceiver", false); if (WalReceiverFunctions == NULL) elog(ERROR, "libpqwalreceiver didn't initialize correctly"); diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 051c7c02a6..e237947441 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -446,31 +446,31 @@ typedef enum } WalProposerConnStatusType; /* Re-exported PQerrorMessage */ -typedef char *(*walprop_error_message_fn) (WalProposerConn * conn); +extern char *walprop_error_message(WalProposerConn *conn); /* Re-exported PQstatus */ -typedef WalProposerConnStatusType(*walprop_status_fn) (WalProposerConn * conn); +extern WalProposerConnStatusType walprop_status(WalProposerConn *conn); /* Re-exported PQconnectStart */ -typedef WalProposerConn * (*walprop_connect_start_fn) (char *conninfo); +extern WalProposerConn * walprop_connect_start(char *conninfo); /* Re-exported PQconectPoll */ -typedef WalProposerConnectPollStatusType(*walprop_connect_poll_fn) (WalProposerConn * conn); +extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn); /* Blocking wrapper around PQsendQuery */ -typedef bool (*walprop_send_query_fn) (WalProposerConn * conn, char *query); +extern bool walprop_send_query(WalProposerConn *conn, char *query); /* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */ -typedef WalProposerExecStatusType(*walprop_get_query_result_fn) (WalProposerConn * conn); +extern WalProposerExecStatusType walprop_get_query_result(WalProposerConn *conn); /* Re-exported PQsocket */ -typedef pgsocket (*walprop_socket_fn) (WalProposerConn * conn); +extern pgsocket walprop_socket(WalProposerConn *conn); /* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */ -typedef int (*walprop_flush_fn) (WalProposerConn * conn); +extern int walprop_flush(WalProposerConn *conn); /* Re-exported PQfinish */ -typedef void (*walprop_finish_fn) (WalProposerConn * conn); +extern void walprop_finish(WalProposerConn *conn); /* * Ergonomic wrapper around PGgetCopyData @@ -486,9 +486,7 @@ typedef void (*walprop_finish_fn) 
(WalProposerConn * conn); * performs a bit of extra checking work that's always required and is normally * somewhat verbose. */ -typedef PGAsyncReadResult(*walprop_async_read_fn) (WalProposerConn * conn, - char **buf, - int *amount); +extern PGAsyncReadResult walprop_async_read(WalProposerConn *conn, char **buf, int *amount); /* * Ergonomic wrapper around PQputCopyData + PQflush @@ -497,69 +495,14 @@ typedef PGAsyncReadResult(*walprop_async_read_fn) (WalProposerConn * conn, * * For information on the meaning of return codes, refer to PGAsyncWriteResult. */ -typedef PGAsyncWriteResult(*walprop_async_write_fn) (WalProposerConn * conn, - void const *buf, - size_t size); +extern PGAsyncWriteResult walprop_async_write(WalProposerConn *conn, void const *buf, size_t size); /* * Blocking equivalent to walprop_async_write_fn * * Returns 'true' if successful, 'false' on failure. */ -typedef bool (*walprop_blocking_write_fn) (WalProposerConn * conn, void const *buf, size_t size); - -/* All libpqwalproposer exported functions collected together. */ -typedef struct WalProposerFunctionsType -{ - walprop_error_message_fn walprop_error_message; - walprop_status_fn walprop_status; - walprop_connect_start_fn walprop_connect_start; - walprop_connect_poll_fn walprop_connect_poll; - walprop_send_query_fn walprop_send_query; - walprop_get_query_result_fn walprop_get_query_result; - walprop_socket_fn walprop_socket; - walprop_flush_fn walprop_flush; - walprop_finish_fn walprop_finish; - walprop_async_read_fn walprop_async_read; - walprop_async_write_fn walprop_async_write; - walprop_blocking_write_fn walprop_blocking_write; -} WalProposerFunctionsType; - -/* Allow the above functions to be "called" with normal syntax */ -#define walprop_error_message(conn) \ - WalProposerFunctions->walprop_error_message(conn) -#define walprop_status(conn) \ - WalProposerFunctions->walprop_status(conn) -#define walprop_connect_start(conninfo) \ - WalProposerFunctions->walprop_connect_start(conninfo) -#define walprop_connect_poll(conn) \ - WalProposerFunctions->walprop_connect_poll(conn) -#define walprop_send_query(conn, query) \ - WalProposerFunctions->walprop_send_query(conn, query) -#define walprop_get_query_result(conn) \ - WalProposerFunctions->walprop_get_query_result(conn) -#define walprop_set_nonblocking(conn, arg) \ - WalProposerFunctions->walprop_set_nonblocking(conn, arg) -#define walprop_socket(conn) \ - WalProposerFunctions->walprop_socket(conn) -#define walprop_flush(conn) \ - WalProposerFunctions->walprop_flush(conn) -#define walprop_finish(conn) \ - WalProposerFunctions->walprop_finish(conn) -#define walprop_async_read(conn, buf, amount) \ - WalProposerFunctions->walprop_async_read(conn, buf, amount) -#define walprop_async_write(conn, buf, size) \ - WalProposerFunctions->walprop_async_write(conn, buf, size) -#define walprop_blocking_write(conn, buf, size) \ - WalProposerFunctions->walprop_blocking_write(conn, buf, size) - -/* - * The runtime location of the libpqwalproposer functions. - * - * This pointer is set by the initializer in libpqwalproposer, so that we - * can use it later. 
- */ -extern PGDLLIMPORT WalProposerFunctionsType * WalProposerFunctions; +extern bool walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size); extern uint64 BackpressureThrottlingTime(void); From 7ca72578f9ad5a675701c04e31d36722bce2ad9a Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 18 Oct 2022 13:40:57 +0300 Subject: [PATCH 0920/1022] Enable plv8 again Now with quickfix for https://github.com/plv8/plv8/issues/503 --- Dockerfile.compute-node-v14 | 5 +++-- Dockerfile.compute-node-v15 | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index ed57b29009..f5ccdf7e99 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -71,10 +71,12 @@ RUN apt update && \ RUN apt update && \ apt install -y --no-install-recommends -t testing binutils +# Sed is used to patch for https://github.com/plv8/plv8/issues/503 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ tar xvzf v3.1.4.tar.gz && \ cd plv8-3.1.4 && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ + sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ @@ -116,8 +118,7 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 # FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ -# plv8 still sometimes crashes during the creation -# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / COPY pgxn/ pgxn/ diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index bdb4330c4f..ec555ad932 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -76,10 +76,12 @@ RUN apt update && \ RUN apt update && \ apt install -y --no-install-recommends -t testing binutils +# Sed is used to patch for https://github.com/plv8/plv8/issues/503 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ tar xvzf v3.1.4.tar.gz && \ cd plv8-3.1.4 && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ + sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ @@ -121,8 +123,7 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 # FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ -# plv8 still sometimes crashes during the creation -# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / COPY pgxn/ pgxn/ From 989d78aac837e57c1ef7aaf35af5ded10aa8b010 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 18 Oct 2022 02:11:11 +0300 Subject: [PATCH 0921/1022] Buffer the TCP incoming stream on libpq connections. Reduces the number of syscalls needed to read the commands from the compute. 
Here's a snippet of strace output from the pageserver, when performing a sequential scan on a table, with prefetch: 3084934 recvfrom(47, "d", 1, 0, NULL, NULL) = 1 3084934 recvfrom(47, "\0\0\0\37", 4, 0, NULL, NULL) = 4 3084934 recvfrom(47, "\2\1\0\0\0\0\362\302\360\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\0\3", 27, 0, NULL, NULL) = 27 3084934 pread64(28, "\0\0\0\1\0\0\0\0\0\0\0\253 "..., 8192, 25190400) = 8192 3084934 write(45, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\3A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010 3084934 poll([{fd=46, events=POLLIN}, {fd=48, events=POLLIN}], 2, 60000) = 1 ([{fd=46, revents=POLLIN}]) 3084934 read(46, "\0\0\0\0p\311q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192 3084934 sendto(47, "d\0\0 \5f\0\0\0\0p\311q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198 3084934 recvfrom(47, "d", 1, 0, NULL, NULL) = 1 3084934 recvfrom(47, "\0\0\0\37", 4, 0, NULL, NULL) = 4 3084934 recvfrom(47, "\2\1\0\0\0\0\362\302\360\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\0\4", 27, 0, NULL, NULL) = 27 3084934 pread64(28, " \0=\0L\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0;;\0\0\0\4\4\0"..., 8192, 25198592) = 8192 3084934 write(45, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\4A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010 3084934 poll([{fd=46, events=POLLIN}, {fd=48, events=POLLIN}], 2, 60000) = 1 ([{fd=46, revents=POLLIN}]) 3084934 read(46, "\0\0\0\0\260\344q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192 3084934 sendto(47, "d\0\0 \5f\0\0\0\0\260\344q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198 3084934 recvfrom(47, "d", 1, 0, NULL, NULL) = 1 3084934 recvfrom(47, "\0\0\0\37", 4, 0, NULL, NULL) = 4 3084934 recvfrom(47, "\2\1\0\0\0\0\362\302\360\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\0\5", 27, 0, NULL, NULL) = 27 3084934 write(45, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\5A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010 3084934 poll([{fd=46, events=POLLIN}, {fd=48, events=POLLIN}], 2, 60000) = 1 ([{fd=46, revents=POLLIN}]) 3084934 read(46, "\0\0\0\0\330\377q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192 3084934 sendto(47, "d\0\0 \5f\0\0\0\0\330\377q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198 This shows the interaction for three get_page_at_lsn requests. For each request, the pageserver performs three recvfrom syscalls to read the incoming request from the socket. 
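A minimal, self-contained sketch of the buffering effect, using std's blocking
TcpStream and BufReader rather than the tokio types the patch touches, and a
made-up 5-byte message format (illustration only):

use std::io::{BufReader, Read, Write};
use std::net::{TcpListener, TcpStream};
use std::thread;

fn main() -> std::io::Result<()> {
    let listener = TcpListener::bind("127.0.0.1:0")?;
    let addr = listener.local_addr()?;

    // Writer: a burst of tiny messages, like pipelined get_page requests.
    let writer = thread::spawn(move || -> std::io::Result<()> {
        let mut s = TcpStream::connect(addr)?;
        for i in 0u8..100 {
            s.write_all(&[b'd', 0, 0, 0, i])?;
        }
        Ok(())
    });

    let (socket, _) = listener.accept()?;
    // Reading straight from `socket` would issue one recv per read_exact.
    // With BufReader, one larger recv typically satisfies many small reads.
    let mut reader = BufReader::new(socket);
    let mut msg = [0u8; 5];
    for _ in 0..100 {
        reader.read_exact(&mut msg)?;
    }
    writer.join().unwrap()?;
    Ok(())
}

The real change wraps the tokio TcpStream in tokio::io::BufReader inside the
Stream enum, so the per-message header and body reads are served from one
buffered recv, which is the effect the before/after strace output here shows.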
After this patch, those recvfrom calls are gone: 3086123 read(47, "\0\0\0\0\360\222q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192 3086123 sendto(45, "d\0\0 \5f\0\0\0\0\360\222q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198 3086123 pread64(29, " "..., 8192, 25182208) = 8192 3086123 write(46, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\2A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010 3086123 poll([{fd=47, events=POLLIN}, {fd=49, events=POLLIN}], 2, 60000) = 1 ([{fd=47, revents=POLLIN}]) 3086123 read(47, "\0\0\0\0000\256q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192 3086123 sendto(45, "d\0\0 \5f\0\0\0\0000\256q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198 3086123 pread64(29, "\0\0\0\1\0\0\0\0\0\0\0\253 "..., 8192, 25190400) = 8192 3086123 write(46, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\3A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010 3086123 poll([{fd=47, events=POLLIN}, {fd=49, events=POLLIN}], 2, 60000) = 1 ([{fd=47, revents=POLLIN}]) 3086123 read(47, "\0\0\0\0p\311q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192 3086123 sendto(45, "d\0\0 \5f\0\0\0\0p\311q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198 3086123 pread64(29, " \0=\0L\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0;;\0\0\0\4\4\0"..., 8192, 25198592) = 8192 3086123 write(46, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\4A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010 3086123 poll([{fd=47, events=POLLIN}, {fd=49, events=POLLIN}], 2, 60000) = 1 ([{fd=47, revents=POLLIN}]) In this test, the compute sends a batch of prefetch requests, and they are read from the socket in one syscall. That syscall was not captured by the strace snippet above, but there are much fewer of them than before. --- libs/utils/src/postgres_backend_async.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs index 87e4478a99..53f6759d62 100644 --- a/libs/utils/src/postgres_backend_async.rs +++ b/libs/utils/src/postgres_backend_async.rs @@ -15,7 +15,7 @@ use std::sync::Arc; use std::task::Poll; use tracing::{debug, error, trace}; -use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader}; use tokio_rustls::TlsAcceptor; #[async_trait::async_trait] @@ -66,8 +66,8 @@ pub enum ProcessMsgResult { /// Always-writeable sock_split stream. /// May not be readable. 
See [`PostgresBackend::take_stream_in`] pub enum Stream { - Unencrypted(tokio::net::TcpStream), - Tls(Box>), + Unencrypted(BufReader), + Tls(Box>>), Broken, } @@ -157,7 +157,7 @@ impl PostgresBackend { let peer_addr = socket.peer_addr()?; Ok(Self { - stream: Stream::Unencrypted(socket), + stream: Stream::Unencrypted(BufReader::new(socket)), buf_out: BytesMut::with_capacity(10 * 1024), state: ProtoState::Initialization, md5_salt: [0u8; 4], From 8fbe437768e2545855a0f928794abc679b7f9d0b Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 18 Oct 2022 11:53:28 -0400 Subject: [PATCH 0922/1022] Improve pageserver IO metrics (#2629) --- pageserver/src/metrics.rs | 20 ++++++++++++++------ test_runner/fixtures/compare_fixtures.py | 5 +++-- test_runner/fixtures/metrics.py | 2 ++ 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 5c2f81d731..b654be031c 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -107,18 +107,20 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, // or in testing they estimate how much we would upload if we did. -static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { - register_int_counter!( +static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { + register_int_counter_vec!( "pageserver_created_persistent_files_total", "Number of files created that are meant to be uploaded to cloud storage", + &["tenant_id", "timeline_id"] ) .expect("failed to define a metric") }); -static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { - register_int_counter!( +static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { + register_int_counter_vec!( "pageserver_written_persistent_bytes_total", "Total bytes written that are meant to be uploaded to cloud storage", + &["tenant_id", "timeline_id"] ) .expect("failed to define a metric") }); @@ -386,8 +388,12 @@ impl TimelineMetrics { let current_logical_size_gauge = CURRENT_LOGICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED.clone(); - let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN.clone(); + let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); TimelineMetrics { tenant_id, @@ -419,6 +425,8 @@ impl Drop for TimelineMetrics { let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); + let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); for op in STORAGE_TIME_OPERATIONS { let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]); diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 78a12c6c45..2d36d90bd6 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -130,11 +130,12 @@ class NeonCompare(PgCompare): "size", timeline_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER ) + params = 
f'{{tenant_id="{self.env.initial_tenant}",timeline_id="{self.timeline}"}}' total_files = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_created_persistent_files_total" + self.env.pageserver, "pageserver_created_persistent_files_total" + params ) total_bytes = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_written_persistent_bytes_total" + self.env.pageserver, "pageserver_written_persistent_bytes_total" + params ) self.zenbenchmark.record( "data_uploaded", total_bytes / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 4d680aa641..62e3cbbe99 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -60,4 +60,6 @@ PAGESERVER_PER_TENANT_METRICS = [ "pageserver_wait_lsn_seconds_bucket", "pageserver_wait_lsn_seconds_count", "pageserver_wait_lsn_seconds_sum", + "pageserver_created_persistent_files_total", + "pageserver_written_persistent_bytes_total", ] From c67cf340400b03df95587104cdfd2af861ad38a3 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 19 Oct 2022 11:16:36 +0300 Subject: [PATCH 0923/1022] Update GH Action version (#2646) --- .github/workflows/codestyle.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 961d811a51..66f9f33256 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -36,7 +36,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: submodules: true fetch-depth: 2 From 91411c415ad9a48e9da2278fe21be73bd1eb7ace Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 19 Oct 2022 12:32:03 +0300 Subject: [PATCH 0924/1022] Persists latest_gc_cutoff_lsn before performing GC (#2558) * Persists latest_gc_cutoff_lsn before performing GC * Peform some refactoring and code deduplication refer #2539 * Add test for persisting GC cutoff * Fix python test style warnings * Bump postgres version * Reduce number of iterations in test_gc_cutoff test * Bump postgres version * Undo bumping postgres version --- pageserver/src/tenant/timeline.rs | 127 ++++++++++++++------------ pageserver/src/tenant_tasks.rs | 8 +- test_runner/regress/test_gc_cutoff.py | 38 ++++++++ 3 files changed, 114 insertions(+), 59 deletions(-) create mode 100644 test_runner/regress/test_gc_cutoff.py diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0f8e60f8d3..1728c7be32 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1220,78 +1220,76 @@ impl Timeline { // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing // *all* the layers, to avoid fsyncing the file multiple times. let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); - self.update_disk_consistent_lsn(disk_consistent_lsn, layer_paths_to_upload)?; + let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); + // If we were able to advance 'disk_consistent_lsn', save it the metadata file. + // After crash, we will restart WAL streaming and processing from that point. 
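+        // 'disk_consistent_lsn' only moves forward: an equal value means there is
+        // nothing new to persist, and going backwards would be a bug (see the assert below).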
+ if disk_consistent_lsn != old_disk_consistent_lsn { + assert!(disk_consistent_lsn > old_disk_consistent_lsn); + self.update_metadata_file(disk_consistent_lsn, layer_paths_to_upload)?; + // Also update the in-memory copy + self.disk_consistent_lsn.store(disk_consistent_lsn); + } Ok(()) } /// Update metadata file - fn update_disk_consistent_lsn( + fn update_metadata_file( &self, disk_consistent_lsn: Lsn, layer_paths_to_upload: HashMap, ) -> Result<()> { - // If we were able to advance 'disk_consistent_lsn', save it the metadata file. - // After crash, we will restart WAL streaming and processing from that point. - let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); - if disk_consistent_lsn != old_disk_consistent_lsn { - assert!(disk_consistent_lsn > old_disk_consistent_lsn); + // We can only save a valid 'prev_record_lsn' value on disk if we + // flushed *all* in-memory changes to disk. We only track + // 'prev_record_lsn' in memory for the latest processed record, so we + // don't remember what the correct value that corresponds to some old + // LSN is. But if we flush everything, then the value corresponding + // current 'last_record_lsn' is correct and we can store it on disk. + let RecordLsn { + last: last_record_lsn, + prev: prev_record_lsn, + } = self.last_record_lsn.load(); + let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn { + Some(prev_record_lsn) + } else { + None + }; - // We can only save a valid 'prev_record_lsn' value on disk if we - // flushed *all* in-memory changes to disk. We only track - // 'prev_record_lsn' in memory for the latest processed record, so we - // don't remember what the correct value that corresponds to some old - // LSN is. But if we flush everything, then the value corresponding - // current 'last_record_lsn' is correct and we can store it on disk. 
- let RecordLsn { - last: last_record_lsn, - prev: prev_record_lsn, - } = self.last_record_lsn.load(); - let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn { - Some(prev_record_lsn) - } else { - None - }; + let ancestor_timeline_id = self + .ancestor_timeline + .as_ref() + .map(|ancestor| ancestor.timeline_id); - let ancestor_timeline_id = self - .ancestor_timeline - .as_ref() - .map(|ancestor| ancestor.timeline_id); + let metadata = TimelineMetadata::new( + disk_consistent_lsn, + ondisk_prev_record_lsn, + ancestor_timeline_id, + self.ancestor_lsn, + *self.latest_gc_cutoff_lsn.read(), + self.initdb_lsn, + self.pg_version, + ); - let metadata = TimelineMetadata::new( - disk_consistent_lsn, - ondisk_prev_record_lsn, - ancestor_timeline_id, - self.ancestor_lsn, - *self.latest_gc_cutoff_lsn.read(), - self.initdb_lsn, - self.pg_version, - ); + fail_point!("checkpoint-before-saving-metadata", |x| bail!( + "{}", + x.unwrap() + )); - fail_point!("checkpoint-before-saving-metadata", |x| bail!( - "{}", - x.unwrap() - )); + save_metadata( + self.conf, + self.timeline_id, + self.tenant_id, + &metadata, + false, + )?; - save_metadata( - self.conf, - self.timeline_id, + if self.upload_layers.load(atomic::Ordering::Relaxed) { + storage_sync::schedule_layer_upload( self.tenant_id, - &metadata, - false, - )?; - - if self.upload_layers.load(atomic::Ordering::Relaxed) { - storage_sync::schedule_layer_upload( - self.tenant_id, - self.timeline_id, - layer_paths_to_upload, - Some(metadata), - ); - } - - // Also update the in-memory copy - self.disk_consistent_lsn.store(disk_consistent_lsn); + self.timeline_id, + layer_paths_to_upload, + Some(metadata), + ); } Ok(()) @@ -1961,6 +1959,9 @@ impl Timeline { new_gc_cutoff ); write_guard.store_and_unlock(new_gc_cutoff).wait(); + + // Persist metadata file + self.update_metadata_file(self.disk_consistent_lsn.load(), HashSet::new())?; } info!("GC starting"); @@ -2087,6 +2088,18 @@ impl Timeline { result.layers_removed += 1; } + info!( + "GC completed removing {} layers, cuttof {}", + result.layers_removed, new_gc_cutoff + ); + if result.layers_removed != 0 { + fail_point!("gc-before-save-metadata", |_| { + info!("Abnormaly terinate pageserver at gc-before-save-metadata fail point"); + std::process::abort(); + }); + return Ok(result); + } + if self.upload_layers.load(atomic::Ordering::Relaxed) { storage_sync::schedule_layer_delete( self.tenant_id, diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 8329b15c08..030055df6d 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -70,8 +70,10 @@ async fn compaction_loop(tenant_id: TenantId) { // Run compaction let mut sleep_duration = tenant.get_compaction_period(); if let Err(e) = tenant.compaction_iteration() { - error!("Compaction failed, retrying: {e:#}"); sleep_duration = wait_duration; + error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration); + #[cfg(feature = "testing")] + std::process::abort(); } // Sleep @@ -119,8 +121,10 @@ async fn gc_loop(tenant_id: TenantId) { if gc_horizon > 0 { if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false) { - error!("Gc failed, retrying: {e:#}"); sleep_duration = wait_duration; + error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration); + #[cfg(feature = "testing")] + std::process::abort(); } } diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py new file mode 100644 index 0000000000..946c689a30 --- /dev/null +++ 
b/test_runner/regress/test_gc_cutoff.py @@ -0,0 +1,38 @@ +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin +from performance.test_perf_pgbench import get_scales_matrix + + +# Test gc_cuttoff +# +# This test set fail point after at the end of GC and checks +# that pageserver normally restarts after it +@pytest.mark.parametrize("scale", get_scales_matrix(10)) +def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int): + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test + tenant_id, _ = env.neon_cli.create_tenant( + conf={ + "gc_period": "10 s", + "gc_horizon": f"{1024 ** 2}", + "checkpoint_distance": f"{1024 ** 2}", + "compaction_target_size": f"{1024 ** 2}", + # set PITR interval to be small, so we can do GC + "pitr_interval": "1 s", + } + ) + pg = env.postgres.create_start("main", tenant_id=tenant_id) + connstr = pg.connstr() + pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + + pageserver_http.configure_failpoints(("gc-before-save-metadata", "return")) + + for i in range(5): + try: + pg_bin.run_capture(["pgbench", "-T100", connstr]) + except Exception: + env.pageserver.stop() + env.pageserver.start() + pageserver_http.configure_failpoints(("gc-before-save-metadata", "return")) From 6b49b370fc1bcd4f8c6b6ed81f79dc8f7fcc076b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 19 Oct 2022 13:16:55 +0300 Subject: [PATCH 0925/1022] Fix build after applying PR #2558 --- pageserver/src/tenant/timeline.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1728c7be32..a771f82caf 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1961,7 +1961,7 @@ impl Timeline { write_guard.store_and_unlock(new_gc_cutoff).wait(); // Persist metadata file - self.update_metadata_file(self.disk_consistent_lsn.load(), HashSet::new())?; + self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; } info!("GC starting"); From 7576b18b14720708f2873083c50874da913eaf27 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 18 Oct 2022 14:45:51 +0300 Subject: [PATCH 0926/1022] [compute_tools] fix GRANT CREATE ON SCHEMA public - run the grant query in each database --- compute_tools/src/spec.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 89a6a93510..84d72714db 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -380,10 +380,6 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { info!("grant query {}", &query); client.execute(query.as_str(), &[])?; - - // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. - // This is needed since postgres 15, where this privilege is removed by default. - client.execute("GRANT CREATE ON SCHEMA public TO web_access", &[])?; } // Do some per-database access adjustments. We'd better do this at db creation time, @@ -426,6 +422,12 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { db.owner.quote() ); db_client.simple_query(&alter_query)?; + + // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. + // This is needed since postgres 15, where this privilege is removed by default. 
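+        // Schema privileges are stored per database, so this grant has to be executed
+        // in every database rather than once per cluster.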
+ let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string(); + info!("grant query for db {} : {}", &db.name, &grant_query); + db_client.simple_query(&grant_query)?; } Ok(()) From 4d1e48f3b9a4b7064787513fd2c455f0001f6e18 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 19 Oct 2022 19:20:06 +0200 Subject: [PATCH 0927/1022] [compute_ctl] Use postgres::config to properly escape database names (#2652) We've got at least one user in production that cannot create a database with a trailing space in the name. This happens because we use `url` crate for manipulating the DATABASE_URL, but it follows a standard that doesn't fit really well with Postgres. For example, it trims all trailing spaces from the path: > Remove any leading and trailing C0 control or space from input. > See: https://url.spec.whatwg.org/#url-parsing But we used `set_path()` to set database name and it's totally valid to have trailing spaces in the database name in Postgres. Thus, use `postgres::config::Config` to modify database name in the connection details. --- compute_tools/src/pg_helpers.rs | 22 +++++----- compute_tools/src/spec.rs | 54 ++++++++++++------------- compute_tools/tests/pg_helpers_tests.rs | 4 +- 3 files changed, 39 insertions(+), 41 deletions(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index ad7ea0abc8..42aa00af01 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -8,11 +8,10 @@ use std::process::Child; use std::time::{Duration, Instant}; use anyhow::{bail, Result}; +use notify::{RecursiveMode, Watcher}; use postgres::{Client, Transaction}; use serde::Deserialize; -use notify::{RecursiveMode, Watcher}; - const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds /// Rust representation of Postgres role info with only those fields @@ -169,7 +168,7 @@ impl Database { /// it may require a proper quoting too. pub fn to_pg_options(&self) -> String { let mut params: String = self.options.as_pg_options(); - write!(params, " OWNER {}", &self.owner.quote()) + write!(params, " OWNER {}", &self.owner.pg_quote()) .expect("String is documented to not to error during write operations"); params @@ -180,18 +179,17 @@ impl Database { /// intended to be used for DB / role names. pub type PgIdent = String; -/// Generic trait used to provide quoting for strings used in the -/// Postgres SQL queries. Currently used only to implement quoting -/// of identifiers, but could be used for literals in the future. -pub trait PgQuote { - fn quote(&self) -> String; +/// Generic trait used to provide quoting / encoding for strings used in the +/// Postgres SQL queries and DATABASE_URL. +pub trait Escaping { + fn pg_quote(&self) -> String; } -impl PgQuote for PgIdent { +impl Escaping for PgIdent { /// This is intended to mimic Postgres quote_ident(), but for simplicity it - /// always quotes provided string with `""` and escapes every `"`. Not idempotent, - /// i.e. if string is already escaped it will be escaped again. - fn quote(&self) -> String { + /// always quotes provided string with `""` and escapes every `"`. + /// **Not idempotent**, i.e. if string is already escaped it will be escaped again. 
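+    /// For example, `foo"bar` becomes `"foo""bar"`.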
+ fn pg_quote(&self) -> String { let result = format!("\"{}\"", self.replace('"', "\"\"")); result } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 84d72714db..e0c0e9404b 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,7 +1,9 @@ use std::path::Path; +use std::str::FromStr; use anyhow::Result; use log::{info, log_enabled, warn, Level}; +use postgres::config::Config; use postgres::{Client, NoTls}; use serde::Deserialize; @@ -115,8 +117,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { if existing_roles.iter().any(|r| r.name == op.name) { let query: String = format!( "ALTER ROLE {} RENAME TO {}", - op.name.quote(), - new_name.quote() + op.name.pg_quote(), + new_name.pg_quote() ); warn!("renaming role '{}' to '{}'", op.name, new_name); @@ -162,7 +164,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { } if update_role { - let mut query: String = format!("ALTER ROLE {} ", name.quote()); + let mut query: String = format!("ALTER ROLE {} ", name.pg_quote()); info_print!(" -> update"); query.push_str(&role.to_pg_options()); @@ -170,7 +172,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { } } else { info!("role name: '{}'", &name); - let mut query: String = format!("CREATE ROLE {} ", name.quote()); + let mut query: String = format!("CREATE ROLE {} ", name.pg_quote()); info!("role create query: '{}'", &query); info_print!(" -> create"); @@ -179,7 +181,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let grant_query = format!( "GRANT pg_read_all_data, pg_write_all_data TO {}", - name.quote() + name.pg_quote() ); xact.execute(grant_query.as_str(), &[])?; info!("role grant query: '{}'", &grant_query); @@ -215,7 +217,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result< // We do not check either role exists or not, // Postgres will take care of it for us if op.action == "delete_role" { - let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote()); + let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.pg_quote()); warn!("deleting role '{}'", &op.name); xact.execute(query.as_str(), &[])?; @@ -230,17 +232,16 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result< fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> { for db in &node.spec.cluster.databases { if db.owner != *role_name { - let mut connstr = node.connstr.clone(); - // database name is always the last and the only component of the path - connstr.set_path(&db.name); + let mut conf = Config::from_str(node.connstr.as_str())?; + conf.dbname(&db.name); - let mut client = Client::connect(connstr.as_str(), NoTls)?; + let mut client = conf.connect(NoTls)?; // This will reassign all dependent objects to the db owner let reassign_query = format!( "REASSIGN OWNED BY {} TO {}", - role_name.quote(), - db.owner.quote() + role_name.pg_quote(), + db.owner.pg_quote() ); info!( "reassigning objects owned by '{}' in db '{}' to '{}'", @@ -249,7 +250,7 @@ fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> client.simple_query(&reassign_query)?; // This now will only drop privileges of the role - let drop_query = format!("DROP OWNED BY {}", role_name.quote()); + let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote()); client.simple_query(&drop_query)?; } } @@ -279,7 +280,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) 
-> Result<()> { // We do not check either DB exists or not, // Postgres will take care of it for us "delete_db" => { - let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.quote()); + let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.pg_quote()); warn!("deleting database '{}'", &op.name); client.execute(query.as_str(), &[])?; @@ -291,8 +292,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { if existing_dbs.iter().any(|r| r.name == op.name) { let query: String = format!( "ALTER DATABASE {} RENAME TO {}", - op.name.quote(), - new_name.quote() + op.name.pg_quote(), + new_name.pg_quote() ); warn!("renaming database '{}' to '{}'", op.name, new_name); @@ -320,7 +321,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { // XXX: db owner name is returned as quoted string from Postgres, // when quoting is needed. let new_owner = if r.owner.starts_with('"') { - db.owner.quote() + db.owner.pg_quote() } else { db.owner.clone() }; @@ -328,15 +329,15 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { if new_owner != r.owner { let query: String = format!( "ALTER DATABASE {} OWNER TO {}", - name.quote(), - db.owner.quote() + name.pg_quote(), + db.owner.pg_quote() ); info_print!(" -> update"); client.execute(query.as_str(), &[])?; } } else { - let mut query: String = format!("CREATE DATABASE {} ", name.quote()); + let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); info_print!(" -> create"); query.push_str(&db.to_pg_options()); @@ -366,7 +367,7 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { .cluster .roles .iter() - .map(|r| r.name.quote()) + .map(|r| r.name.pg_quote()) .collect::>(); for db in &spec.cluster.databases { @@ -374,7 +375,7 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { let query: String = format!( "GRANT CREATE ON DATABASE {} TO {}", - dbname.quote(), + dbname.pg_quote(), roles.join(", ") ); info!("grant query {}", &query); @@ -385,12 +386,11 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { // Do some per-database access adjustments. We'd better do this at db creation time, // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants // atomically. - let mut db_connstr = node.connstr.clone(); for db in &node.spec.cluster.databases { - // database name is always the last and the only component of the path - db_connstr.set_path(&db.name); + let mut conf = Config::from_str(node.connstr.as_str())?; + conf.dbname(&db.name); - let mut db_client = Client::connect(db_connstr.as_str(), NoTls)?; + let mut db_client = conf.connect(NoTls)?; // This will only change ownership on the schema itself, not the objects // inside it. 
Without it owner of the `public` schema will be `cloud_admin` @@ -419,7 +419,7 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { END IF;\n\ END\n\ $$;", - db.owner.quote() + db.owner.pg_quote() ); db_client.simple_query(&alter_query)?; diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index bae944440e..24cad4663a 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -33,9 +33,9 @@ mod pg_helpers_tests { } #[test] - fn quote_ident() { + fn ident_pg_quote() { let ident: PgIdent = PgIdent::from("\"name\";\\n select 1;"); - assert_eq!(ident.quote(), "\"\"\"name\"\";\\n select 1;\""); + assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\""); } } From b237feedab9d6ecec6a16379a9a7c045c7255aa4 Mon Sep 17 00:00:00 2001 From: MMeent Date: Wed, 19 Oct 2022 22:47:11 +0200 Subject: [PATCH 0928/1022] Add more redo metrics: (#2645) - Measure size of redo WAL (new histogram), with bounds between 24B-32kB - Add 2 more buckets at the upper end of the redo time histogram We often (>0.1% of several hours each day) take more than 250ms to do the redo round-trip to the postgres process. We need to measure these redo times more precisely. --- pageserver/src/metrics.rs | 26 +++++++++++++++++++++++++- pageserver/src/walredo.rs | 23 ++++++++++++++++++----- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b654be031c..7ae2d0f14c 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -277,11 +277,15 @@ pub static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { /// smallest redo processing times. These buckets allow us to measure down /// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec. /// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec. +/// +/// Values up to 1s are recorded because metrics show that we have redo +/// durations and lock times larger than 0.250s. macro_rules! redo_histogram_time_buckets { () => { vec![ 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000, - 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, + 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, 0.500_000, + 1.000_000, ] }; } @@ -296,6 +300,17 @@ macro_rules! redo_histogram_count_buckets { }; } +macro_rules! redo_bytes_histogram_count_buckets { + () => { + // powers of (2^.5), from 2^4.5 to 2^15 (22 buckets) + // rounded up to the next multiple of 8 to capture any MAXALIGNed record of that size, too. 
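+        // For example, 2^4.5 ≈ 22.6 rounds up to 24 and 2^6.5 ≈ 90.5 rounds up to 96;
+        // the largest bucket is 2^15 = 32768.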
+ vec![ + 24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 184.0, 256.0, 368.0, 512.0, 728.0, 1024.0, 1456.0, + 2048.0, 2904.0, 4096.0, 5800.0, 8192.0, 11592.0, 16384.0, 23176.0, 32768.0, + ] + }; +} + pub static WAL_REDO_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_seconds", @@ -323,6 +338,15 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub static WAL_REDO_BYTES_HISTOGRAM: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_wal_redo_bytes_histogram", + "Histogram of number of records replayed per redo", + redo_bytes_histogram_count_buckets!(), + ) + .expect("failed to define a metric") +}); + pub static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_replayed_wal_records_total", diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 15a9408dc9..1dd27caba6 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -39,7 +39,8 @@ use utils::crashsafe_dir::path_with_suffix_extension; use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; use crate::metrics::{ - WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, WAL_REDO_WAIT_TIME, + WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, + WAL_REDO_WAIT_TIME, }; use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::reltag::{RelTag, SlruKind}; @@ -244,12 +245,23 @@ impl PostgresRedoManager { let end_time = Instant::now(); let duration = end_time.duration_since(lock_time); + let len = records.len(); + let nbytes = records.iter().fold(0, |acumulator, record| { + acumulator + + match &record.1 { + NeonWalRecord::Postgres { rec, .. } => rec.len(), + _ => unreachable!("Only PostgreSQL records are accepted in this batch"), + } + }); + WAL_REDO_TIME.observe(duration.as_secs_f64()); - WAL_REDO_RECORDS_HISTOGRAM.observe(records.len() as f64); + WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64); + WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64); debug!( - "postgres applied {} WAL records in {} us to reconstruct page image at LSN {}", - records.len(), + "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}", + len, + nbytes, duration.as_micros(), lsn ); @@ -258,8 +270,9 @@ impl PostgresRedoManager { // next request will launch a new one. if result.is_err() { error!( - "error applying {} WAL records to reconstruct page image at LSN {}", + "error applying {} WAL records ({} bytes) to reconstruct page image at LSN {}", records.len(), + nbytes, lsn ); let process = process_guard.take().unwrap(); From bc5ec43056773f4a6742fb64dbff681392b02dd3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 18 Oct 2022 18:23:27 +0300 Subject: [PATCH 0929/1022] Fix flaky physical-size tests in test_timeline_size.py. These two tests, test_timeline_physical_size_post_compaction and test_timeline_physical_size_post_gc, assumed that after you have waited for the WAL from a bulk insertion to arrive, and you run a cycle of checkpoint and compaction, no new layer files are created. Because if a new layer file is created while we are calculating the incremental and non-incremental physical sizes, they might differ. However, the tests used a very small checkpoint_distance, so even a small amount of WAL generated in PostgreSQL could cause a new layer file to be created. Autovacuum can kick in at any time, and do that. That caused occasional failues in the test. 
I was able to reproduce it reliably by adding a long delay between the incremental and non-incremental size calculations: ``` --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -129,6 +129,9 @@ async fn build_timeline_info( } }; let current_physical_size = Some(timeline.get_physical_size()); + if include_non_incremental_physical_size { + std::thread::sleep(std::time::Duration::from_millis(60000)); + } let info = TimelineInfo { tenant_id: timeline.tenant_id, ``` To fix, disable autovacuum for the table. Autovacuum could still kick in for other tables, e.g. catalog tables, but that seems less likely to generate enough WAL to causea new layer file to be flushed. If this continues to be a problem in the future, we could simply retry the physical size call a few times, if there's a mismatch. A mismatch could happen every once in a while, but it's very unlikely to happen more than once or twice in a row. Fixes https://github.com/neondatabase/neon/issues/2212 --- test_runner/regress/test_timeline_size.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index d26d5f3afa..d783f897f9 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -270,9 +270,15 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction") pg = env.postgres.create_start("test_timeline_physical_size_post_compaction") + # We don't want autovacuum to run on the table, while we are calculating the + # physical size, because that could cause a new layer to be created and a + # mismatch between the incremental and non-incremental size. (If that still + # happens, because of some other background activity or autovacuum on other + # tables, we could simply retry the size calculations. It's unlikely that + # that would happen more than once.) pg.safe_psql_many( [ - "CREATE TABLE foo (t text)", + "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)", """INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g""", @@ -297,9 +303,10 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc") pg = env.postgres.create_start("test_timeline_physical_size_post_gc") + # Like in test_timeline_physical_size_post_compaction, disable autovacuum pg.safe_psql_many( [ - "CREATE TABLE foo (t text)", + "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)", """INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g""", From 7734929a8202c8cc41596a861ffbe0b51b5f3cb9 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 19 Oct 2022 18:59:22 -0400 Subject: [PATCH 0930/1022] Remove stale todos (#2630) --- pageserver/src/http/routes.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 91a385bf77..489adbb2cf 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -781,11 +781,6 @@ async fn failpoints_handler(mut request: Request) -> Result } // Run GC immediately on given timeline. -// FIXME: This is just for tests. See test_runner/regress/test_gc.py. 
-// This probably should require special authentication or a global flag to -// enable, I don't think we want to or need to allow regular clients to invoke -// GC. -// @hllinnaka in commits ec44f4b29, 3aca717f3 #[cfg(feature = "testing")] async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; @@ -811,9 +806,6 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; From 9211923bef11da53421ca5949c558cfec13c7f61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s?= Date: Thu, 20 Oct 2022 09:46:57 +0200 Subject: [PATCH 0931/1022] Pageserver Python tests should not fail if the server is built with no testing feature (#2636) Co-authored-by: andres --- test_runner/fixtures/neon_fixtures.py | 60 ++++++++++--------- test_runner/performance/test_perf_pgbench.py | 5 +- test_runner/regress/test_recovery.py | 7 +-- test_runner/regress/test_tenant_relocation.py | 6 +- 4 files changed, 38 insertions(+), 40 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 88910d2bdf..c6bfa7f69e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -149,19 +149,6 @@ def pytest_configure(config): raise Exception('neon binaries not found at "{}"'.format(neon_binpath)) -def profiling_supported(): - """Return True if the pageserver was compiled with the 'profiling' feature""" - bin_pageserver = os.path.join(str(neon_binpath), "pageserver") - res = subprocess.run( - [bin_pageserver, "--version"], - check=True, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - return "profiling:true" in res.stdout - - def shareable_scope(fixture_name, config) -> Literal["session", "function"]: """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar. 
@@ -874,6 +861,17 @@ class NeonEnv: """Get a timeline directory's path based on the repo directory of the test environment""" return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + def get_pageserver_version(self) -> str: + bin_pageserver = os.path.join(str(neon_binpath), "pageserver") + res = subprocess.run( + [bin_pageserver, "--version"], + check=True, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + return res.stdout + @cached_property def auth_keys(self) -> AuthKeys: pub = (Path(self.repo_dir) / "auth_public_key.pem").read_bytes() @@ -972,10 +970,11 @@ class NeonPageserverApiException(Exception): class NeonPageserverHttpClient(requests.Session): - def __init__(self, port: int, auth_token: Optional[str] = None): + def __init__(self, port: int, is_testing_enabled_or_skip, auth_token: Optional[str] = None): super().__init__() self.port = port self.auth_token = auth_token + self.is_testing_enabled_or_skip = is_testing_enabled_or_skip if auth_token is not None: self.headers["Authorization"] = f"Bearer {auth_token}" @@ -994,6 +993,8 @@ class NeonPageserverHttpClient(requests.Session): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]) -> None: + self.is_testing_enabled_or_skip() + if isinstance(config_strings, tuple): pairs = [config_strings] else: @@ -1111,6 +1112,8 @@ class NeonPageserverHttpClient(requests.Session): def timeline_gc( self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int] ) -> dict[str, Any]: + self.is_testing_enabled_or_skip() + log.info( f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}" ) @@ -1126,6 +1129,8 @@ class NeonPageserverHttpClient(requests.Session): return res_json def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId): + self.is_testing_enabled_or_skip() + log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact" @@ -1150,6 +1155,8 @@ class NeonPageserverHttpClient(requests.Session): return res_json def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): + self.is_testing_enabled_or_skip() + log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint" @@ -1469,21 +1476,6 @@ class NeonCli(AbstractNeonCli): res.check_returncode() return res - def pageserver_enabled_features(self) -> Any: - bin_pageserver = os.path.join(str(neon_binpath), "pageserver") - args = [bin_pageserver, "--enabled-features"] - log.info('Running command "{}"'.format(" ".join(args))) - - res = subprocess.run( - args, - check=True, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - log.info(f"pageserver_enabled_features success: {res.stdout}") - return json.loads(res.stdout) - def pageserver_start( self, overrides=(), @@ -1642,6 +1634,7 @@ class NeonPageserver(PgProtocol): self.running = False self.service_port = port self.config_override = config_override + self.version = env.get_pageserver_version() def start(self, overrides=()) -> "NeonPageserver": """ @@ -1671,10 +1664,19 @@ class NeonPageserver(PgProtocol): def __exit__(self, exc_type, exc, tb): self.stop(immediate=True) + def is_testing_enabled_or_skip(self): + if "testing:true" not in 
self.version: + pytest.skip("pageserver was built without 'testing' feature") + + def is_profiling_enabled_or_skip(self): + if "profiling:true" not in self.version: + pytest.skip("pageserver was built without 'profiling' feature") + def http_client(self, auth_token: Optional[str] = None) -> NeonPageserverHttpClient: return NeonPageserverHttpClient( port=self.service_port.http, auth_token=auth_token, + is_testing_enabled_or_skip=self.is_testing_enabled_or_skip, ) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 656826d6a3..0ed3e45971 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -9,7 +9,6 @@ from typing import Dict, List import pytest from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult from fixtures.compare_fixtures import NeonCompare, PgCompare -from fixtures.neon_fixtures import profiling_supported from fixtures.utils import get_scale_for_db @@ -187,10 +186,8 @@ def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, neon_env_builder.pageserver_config_override = """ profiling="page_requests" """ - if not profiling_supported(): - pytest.skip("pageserver was built without 'profiling' feature") - env = neon_env_builder.init_start() + env.pageserver.is_profiling_enabled_or_skip() env.neon_cli.create_branch("empty", "main") neon_compare = NeonCompare(zenbenchmark, env, pg_bin, "pgbench") diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index d0ba96e8e0..e70b1351ba 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -13,13 +13,8 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" env = neon_env_builder.init() + env.pageserver.is_testing_enabled_or_skip() - # Check if failpoints enables. 
Otherwise the test doesn't make sense - f = env.neon_cli.pageserver_enabled_features() - - assert ( - "testing" in f["features"] - ), "Build pageserver with --features=testing option to run this test" neon_env_builder.start() # Create a branch for us diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index a3245d65e4..e14434ffdc 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -346,7 +346,11 @@ def test_tenant_relocation( log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port) pageserver_bin = pathlib.Path(neon_binpath) / "pageserver" - new_pageserver_http = NeonPageserverHttpClient(port=new_pageserver_http_port, auth_token=None) + new_pageserver_http = NeonPageserverHttpClient( + port=new_pageserver_http_port, + auth_token=None, + is_testing_enabled_or_skip=env.pageserver.is_testing_enabled_or_skip, + ) with new_pageserver_helper( new_pageserver_dir, From 50297bef9fb5163e067a76001062d68ab5498b84 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 20 Oct 2022 12:49:54 +0300 Subject: [PATCH 0932/1022] RFC about Tenant / Timeline guard objects (#2660) Co-authored-by: Heikki Linnakangas --- docs/SUMMARY.md | 2 + docs/rfcs/019-tenant-timeline-lifecycles.md | 91 +++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 docs/rfcs/019-tenant-timeline-lifecycles.md diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index fb6467ffd5..faf2b2336f 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -80,4 +80,6 @@ - [015-storage-messaging](rfcs/015-storage-messaging.md) - [016-connection-routing](rfcs/016-connection-routing.md) - [017-timeline-data-management](rfcs/017-timeline-data-management.md) +- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md) +- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md) - [cluster-size-limits](rfcs/cluster-size-limits.md) diff --git a/docs/rfcs/019-tenant-timeline-lifecycles.md b/docs/rfcs/019-tenant-timeline-lifecycles.md new file mode 100644 index 0000000000..2734bf17b9 --- /dev/null +++ b/docs/rfcs/019-tenant-timeline-lifecycles.md @@ -0,0 +1,91 @@ +# Managing Tenant and Timeline lifecycles + +## Summary + +The pageserver has a Tenant object in memory for each tenant it manages, and a +Timeline for each timeline. There are a lot of tasks that operate on the tenants +and timelines with references to those objects. We have some mechanisms to track +which tasks are operating on each Tenant and Timeline, and to request them to +shutdown when a tenant or timeline is deleted, but it does not cover all uses, +and as a result we have many race conditions around tenant/timeline shutdown. + +## Motivation + +We have a bunch of race conditions that can produce weird errors and can be hard +to track down. + +## Non Goals + +This RFC only covers the problem of ensuring that a task/thread isn't operating +on a Tenant or Timeline. It does not cover what states, aside from Active and +non-Active, each Tenant and Timeline should have, or when exactly the transitions +should happen. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +Pageserver. Although I wonder if the safekeeper should have a similar mechanism. 
+
+## Current situation
+
+Most pageserver tasks are managed by task_mgr.rs:
+
+- LibpqEndpointListener
+- HttpEndPointListener
+- WalReceiverManager and -Connection
+- GarbageCollector and Compaction
+- InitialLogicalSizeCalculation
+
+In addition to those tasks, the walreceiver performs some direct tokio::spawn
+calls to spawn tasks that are not registered with 'task_mgr'. And all of these
+tasks can spawn extra operations with tokio spawn_blocking.
+
+Whenever a tenant or timeline is removed from the system, by pageserver
+shutdown, delete_timeline or tenant-detach operation, we rely on the task
+registry in 'task_mgr.rs' to wait until there are no tasks operating on the
+tenant or timeline, before its Tenant/Timeline object is removed. That relies on
+each task to register itself with the tenant/timeline ID in
+'task_mgr.rs'. However, there are many gaps in that. For example,
+GarbageCollection and Compaction tasks are registered with the tenant, but when
+they proceed to operate on a particular timeline of the tenant, they don't
+register with the timeline ID. Because of that, the timeline can be deleted while GC
+or compaction is running on it, causing failures in the GC or compaction (see
+https://github.com/neondatabase/neon/issues/2442).
+
+Another problem is that the task registry only works for tokio Tasks. There is
+no way to register a piece of code that runs inside spawn_blocking(), for
+example.
+
+## Proposed implementation
+
+This "voluntary" registration of tasks is fragile. Let's use Rust language features
+to enforce that a tenant/timeline cannot be removed from the system when there is
+still some code operating on it.
+
+Let's introduce new Guard objects for Tenant and Timeline, and do all actions through
+the Guard object. Something like:
+
+TenantActiveGuard: Guard object over Arc<Tenant>. When you acquire the guard,
+the code checks that the tenant is in Active state. If it's not, you get an
+error. You can change the state of the tenant to Stopping while there are
+TenantActiveGuard objects still on it, to prevent new TenantActiveGuards from
+being acquired, but the Tenant cannot be removed until all the guards are gone.
+
+TenantMaintenanceGuard: Like TenantActiveGuard, but can be held even when the
+tenant is not in Active state. Used for operations like attach/detach. Perhaps
+allow only one such guard on a Tenant at a time.
+
+Similarly for Timelines. We don't currently have a "state" on Timeline, but I think
+we need at least two states: Active and Stopping. The Stopping state is used at
+deletion, to prevent new TimelineActiveGuards from appearing, while you wait for
+existing TimelineActiveGuards to die out.
+
+The shutdown signaling, using shutdown_watcher() and is_shutdown_requested(),
+probably also needs changes to deal with the new Guards. The rule is that if you
+have a TenantActiveGuard, and the tenant's state changes from Active to
+Stopping, the is_shutdown_requested() function should return true, and
+the shutdown_watcher() future should return.
+
+This signaling doesn't necessarily need to cover all cases. For example, if you
+have a block of code in spawn_blocking(), it might be acceptable if
+is_shutdown_requested() doesn't return true even though the tenant is in
+Stopping state, as long as the code finishes reasonably fast.
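
To make the guard proposal above concrete, here is a minimal sketch of how a `TenantActiveGuard` could be shaped. It is an illustration only, not part of the RFC or of any patch in this series; the `TenantState` enum, the `active_guards` counter, and the error type are assumptions chosen for brevity.

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, RwLock};

// Assumed, simplified tenant state for illustration only.
#[derive(PartialEq)]
enum TenantState {
    Active,
    Stopping,
}

struct Tenant {
    state: RwLock<TenantState>,
    active_guards: AtomicUsize,
}

/// Proof that some code is currently operating on an Active tenant.
/// The tenant may move to Stopping while guards exist (blocking new guards),
/// but it must not be removed until the last guard is dropped.
struct TenantActiveGuard {
    tenant: Arc<Tenant>,
}

impl TenantActiveGuard {
    fn try_new(tenant: Arc<Tenant>) -> Result<Self, &'static str> {
        let state = tenant.state.read().unwrap();
        if *state != TenantState::Active {
            return Err("tenant is not active");
        }
        // Register the guard while still holding the state lock, so a
        // concurrent Active -> Stopping transition cannot miss it.
        tenant.active_guards.fetch_add(1, Ordering::SeqCst);
        drop(state);
        Ok(TenantActiveGuard { tenant })
    }
}

impl Drop for TenantActiveGuard {
    fn drop(&mut self) {
        self.tenant.active_guards.fetch_sub(1, Ordering::SeqCst);
    }
}
```

A real implementation would additionally need a way for the removal path to wait until the guard count drops to zero (for example a `tokio::sync::watch` channel or a condvar), which is omitted here.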
From 84c5f681b07479b4d128d2e0e31d0a2d01f6b9fa Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 20 Oct 2022 13:44:03 +0300 Subject: [PATCH 0933/1022] Fix test feature detection (#2659) Follow-up of #2636 and #2654 , fixing the test detection feature. Pageserver currently outputs features as ``` /target/debug/pageserver --version Neon page server git:7734929a8202c8cc41596a861ffbe0b51b5f3cb9 failpoints: true, features: ["testing", "profiling"] ``` --- test_runner/fixtures/neon_fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c6bfa7f69e..a77b3958c9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1665,11 +1665,11 @@ class NeonPageserver(PgProtocol): self.stop(immediate=True) def is_testing_enabled_or_skip(self): - if "testing:true" not in self.version: + if '"testing"' not in self.version: pytest.skip("pageserver was built without 'testing' feature") def is_profiling_enabled_or_skip(self): - if "profiling:true" not in self.version: + if '"profiling"' not in self.version: pytest.skip("pageserver was built without 'profiling' feature") def http_client(self, auth_token: Optional[str] = None) -> NeonPageserverHttpClient: From 306a47c4fab6f9c0c2cf83af64848750ff772335 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 20 Oct 2022 14:19:17 +0300 Subject: [PATCH 0934/1022] Use uninit mark files during timeline init for atomic creation (#2489) Part of https://github.com/neondatabase/neon/pull/2239 Regular, from scratch, timeline creation involves initdb to be run in a separate directory, data from this directory to be imported into pageserver and, finally, timeline-related background tasks to start. This PR ensures we don't leave behind any directories that are not marked as temporary and that pageserver removes such directories on restart, allowing timeline creation to be retried with the same IDs, if needed. It would be good to later rewrite the logic to use a temporary directory, similar what tenant creation does. Yet currently it's harder than this change, so not done. 
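
The crash-safety sequence described in this commit message can be summarized with a small, hypothetical sketch; only the `___uninit` suffix below comes from the patch, the function names and structure are illustrative.

```rust
use std::fs;
use std::io;
use std::path::Path;

/// Create and fill a timeline directory so that a crash at any point either
/// leaves nothing behind or leaves the `___uninit` mark, which triggers
/// cleanup on the next start.
fn create_timeline_dir(timeline_dir: &Path, uninit_mark: &Path) -> io::Result<()> {
    // 1. Write the mark first: from now on the directory is "not yet valid".
    fs::File::create(uninit_mark)?;
    // 2. Create the directory and fill it (import data, write metadata, fsync).
    fs::create_dir_all(timeline_dir)?;
    // 3. Only after everything is durable, drop the mark.
    fs::remove_file(uninit_mark)?;
    Ok(())
}

/// On startup, a surviving mark means the directory never finished
/// initializing, so it is removed and creation can be retried with the same IDs.
fn cleanup_on_startup(timeline_dir: &Path, uninit_mark: &Path) -> io::Result<()> {
    if uninit_mark.exists() {
        if timeline_dir.exists() {
            fs::remove_dir_all(timeline_dir)?;
        }
        fs::remove_file(uninit_mark)?;
    }
    Ok(())
}
```

Creating the mark before the directory is what makes the scheme safe: any crash in between leaves either nothing or a marked directory, never an unmarked, half-initialized one.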
--- libs/remote_storage/src/local_fs.rs | 2 +- .../src/{crashsafe_dir.rs => crashsafe.rs} | 43 +- libs/utils/src/lib.rs | 4 +- pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/config.rs | 13 + pageserver/src/import_datadir.rs | 16 +- pageserver/src/page_service.rs | 18 +- pageserver/src/pgdatadir_mapping.rs | 4 +- pageserver/src/storage_sync/download.rs | 2 +- pageserver/src/tenant.rs | 619 ++++++++++++++---- pageserver/src/tenant_mgr.rs | 236 +++++-- .../src/walreceiver/connection_manager.rs | 4 +- pageserver/src/walredo.rs | 2 +- test_runner/regress/test_broken_timeline.py | 41 +- test_runner/regress/test_import.py | 6 +- test_runner/regress/test_tenants.py | 24 +- 16 files changed, 777 insertions(+), 259 deletions(-) rename libs/utils/src/{crashsafe_dir.rs => crashsafe.rs} (84%) diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 5723a512f6..2f824cc453 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -16,7 +16,7 @@ use tokio::{ io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; use tracing::*; -use utils::crashsafe_dir::path_with_suffix_extension; +use utils::crashsafe::path_with_suffix_extension; use crate::{Download, DownloadError, RemoteObjectId}; diff --git a/libs/utils/src/crashsafe_dir.rs b/libs/utils/src/crashsafe.rs similarity index 84% rename from libs/utils/src/crashsafe_dir.rs rename to libs/utils/src/crashsafe.rs index 032ab0a916..3726779cb2 100644 --- a/libs/utils/src/crashsafe_dir.rs +++ b/libs/utils/src/crashsafe.rs @@ -12,16 +12,8 @@ pub fn create_dir(path: impl AsRef) -> io::Result<()> { let path = path.as_ref(); fs::create_dir(path)?; - File::open(path)?.sync_all()?; - - if let Some(parent) = path.parent() { - File::open(parent)?.sync_all() - } else { - Err(io::Error::new( - io::ErrorKind::InvalidInput, - "can't find parent", - )) - } + fsync_file_and_parent(path)?; + Ok(()) } /// Similar to [`std::fs::create_dir_all`], except we fsync all @@ -65,12 +57,12 @@ pub fn create_dir_all(path: impl AsRef) -> io::Result<()> { // Fsync the created directories from child to parent. for &path in dirs_to_create.iter() { - File::open(path)?.sync_all()?; + fsync(path)?; } // If we created any new directories, fsync the parent. 
if !dirs_to_create.is_empty() { - File::open(path)?.sync_all()?; + fsync(path)?; } Ok(()) @@ -92,6 +84,33 @@ pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) .with_extension(new_extension.as_ref()) } +pub fn fsync_file_and_parent(file_path: &Path) -> io::Result<()> { + let parent = file_path.parent().ok_or_else(|| { + io::Error::new( + io::ErrorKind::Other, + format!("File {file_path:?} has no parent"), + ) + })?; + + fsync(file_path)?; + fsync(parent)?; + Ok(()) +} + +pub fn fsync(path: &Path) -> io::Result<()> { + File::open(path) + .map_err(|e| io::Error::new(e.kind(), format!("Failed to open the file {path:?}: {e}"))) + .and_then(|file| { + file.sync_all().map_err(|e| { + io::Error::new( + e.kind(), + format!("Failed to sync file {path:?} data and metadata: {e}"), + ) + }) + }) + .map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}"))) +} + #[cfg(test)] mod tests { use tempfile::tempdir; diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2c80556446..f1f48f5a90 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -22,8 +22,8 @@ pub mod pq_proto; // dealing with connstring parsing and handy access to it's parts pub mod connstring; -// helper functions for creating and fsyncing directories/trees -pub mod crashsafe_dir; +// helper functions for creating and fsyncing +pub mod crashsafe; // common authentication routines pub mod auth; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 12f594077e..9317dd5dd7 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -87,7 +87,7 @@ fn main() -> anyhow::Result<()> { let tenants_path = conf.tenants_path(); if !tenants_path.exists() { - utils::crashsafe_dir::create_dir_all(conf.tenants_path()).with_context(|| { + utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| { format!( "Failed to create tenants root dir at '{}'", tenants_path.display() diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 6e3c7baad8..b797866e43 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -7,6 +7,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use remote_storage::RemoteStorageConfig; use std::env; +use utils::crashsafe::path_with_suffix_extension; use std::path::{Path, PathBuf}; use std::str::FromStr; @@ -24,6 +25,7 @@ use crate::tenant_config::{TenantConf, TenantConfOpt}; /// The name of the metadata file pageserver creates per timeline. pub const METADATA_FILE_NAME: &str = "metadata"; +pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; const TENANT_CONFIG_NAME: &str = "config"; pub mod defaults { @@ -364,6 +366,17 @@ impl PageServerConf { self.timelines_path(tenant_id).join(timeline_id.to_string()) } + pub fn timeline_uninit_mark_file_path( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> PathBuf { + path_with_suffix_extension( + self.timeline_path(&timeline_id, &tenant_id), + TIMELINE_UNINIT_MARK_SUFFIX, + ) + } + /// Points to a place in pageserver's local directory, /// where certain timeline's metadata file should be located. 
pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf { diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 23c4351b4e..ee3dc684e3 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -43,19 +43,19 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result { /// The code that deals with the checkpoint would not work right if the /// cluster was not shut down cleanly. pub fn import_timeline_from_postgres_datadir( - path: &Path, tline: &Timeline, - lsn: Lsn, + pgdata_path: &Path, + pgdata_lsn: Lsn, ) -> Result<()> { let mut pg_control: Option = None; // TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn) // Then fishing out pg_control would be unnecessary - let mut modification = tline.begin_modification(lsn); + let mut modification = tline.begin_modification(pgdata_lsn); modification.init_empty()?; // Import all but pg_wal - let all_but_wal = WalkDir::new(path) + let all_but_wal = WalkDir::new(pgdata_path) .into_iter() .filter_entry(|entry| !entry.path().ends_with("pg_wal")); for entry in all_but_wal { @@ -63,7 +63,7 @@ pub fn import_timeline_from_postgres_datadir( let metadata = entry.metadata().expect("error getting dir entry metadata"); if metadata.is_file() { let absolute_path = entry.path(); - let relative_path = absolute_path.strip_prefix(path)?; + let relative_path = absolute_path.strip_prefix(pgdata_path)?; let file = File::open(absolute_path)?; let len = metadata.len() as usize; @@ -84,7 +84,7 @@ pub fn import_timeline_from_postgres_datadir( "Postgres cluster was not shut down cleanly" ); ensure!( - pg_control.checkPointCopy.redo == lsn.0, + pg_control.checkPointCopy.redo == pgdata_lsn.0, "unexpected checkpoint REDO pointer" ); @@ -92,10 +92,10 @@ pub fn import_timeline_from_postgres_datadir( // this reads the checkpoint record itself, advancing the tip of the timeline to // *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'. import_wal( - &path.join("pg_wal"), + &pgdata_path.join("pg_wal"), tline, Lsn(pg_control.checkPointCopy.redo), - lsn, + pgdata_lsn, )?; Ok(()) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 795a99058d..9b2bb3114d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -32,7 +32,7 @@ use utils::{ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; -use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; +use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::profiling::profpoint_start; use crate::reltag::RelTag; @@ -500,11 +500,8 @@ impl PageServerHandler { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let timeline = tenant_mgr::get_tenant(tenant_id, true)?.create_empty_timeline( - timeline_id, - base_lsn, - pg_version, - )?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; + let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?; // TODO mark timeline as not ready until it reaches end_lsn. 
// We might have some wal to import as well, and we should prevent compute @@ -527,7 +524,8 @@ impl PageServerHandler { // - use block_in_place() let mut copyin_stream = Box::pin(copyin_stream(pgb)); let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); - tokio::task::block_in_place(|| import_basebackup_from_tar(&timeline, reader, base_lsn))?; + tokio::task::block_in_place(|| timeline.import_basebackup_from_tar(reader, base_lsn))?; + timeline.initialize()?; // Drain the rest of the Copy data let mut bytes_after_tar = 0; @@ -544,12 +542,6 @@ impl PageServerHandler { // It wouldn't work if base came from vanilla postgres though, // since we discard some log files. - // Flush data to disk, then upload to s3 - info!("flushing layers"); - timeline.checkpoint(CheckpointConfig::Flush)?; - - timeline.launch_wal_receiver()?; - info!("done"); Ok(()) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index fc9867dc05..424ce4769a 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1403,7 +1403,9 @@ pub fn create_test_timeline( timeline_id: utils::id::TimelineId, pg_version: u32, ) -> Result> { - let tline = tenant.create_empty_timeline(timeline_id, Lsn(8), pg_version)?; + let tline = tenant + .create_empty_timeline(timeline_id, Lsn(8), pg_version)? + .initialize()?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 61ef164f14..6f9b2e2071 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -22,7 +22,7 @@ use crate::{ TEMP_FILE_SUFFIX, }; use utils::{ - crashsafe_dir::path_with_suffix_extension, + crashsafe::path_with_suffix_extension, id::{TenantId, TenantTimelineId, TimelineId}, }; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 47ef9284b8..93c473f0fe 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -14,7 +14,7 @@ use anyhow::{bail, ensure, Context, Result}; use tokio::sync::watch; use tracing::*; -use utils::crashsafe_dir::path_with_suffix_extension; +use utils::crashsafe::path_with_suffix_extension; use std::cmp::min; use std::collections::hash_map::Entry; @@ -23,10 +23,12 @@ use std::collections::HashMap; use std::fs; use std::fs::File; use std::fs::OpenOptions; +use std::io; use std::io::Write; use std::num::NonZeroU64; use std::ops::Bound::Included; use std::path::Path; +use std::path::PathBuf; use std::process::Command; use std::process::Stdio; use std::sync::Arc; @@ -49,7 +51,7 @@ pub use pageserver_api::models::TenantState; use toml_edit; use utils::{ - crashsafe_dir, + crashsafe, id::{TenantId, TimelineId}, lsn::{Lsn, RecordLsn}, }; @@ -120,6 +122,216 @@ pub struct Tenant { upload_layers: bool, } +/// A timeline with some of its files on disk, being initialized. +/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or +/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory +/// to be removed on next restart. +/// +/// The caller is responsible for proper timeline data filling before the final init. 
+#[must_use] +pub struct UninitializedTimeline<'t> { + owning_tenant: &'t Tenant, + timeline_id: TimelineId, + raw_timeline: Option<(Timeline, TimelineUninitMark)>, +} + +/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory, +/// or gets removed eventually. +/// +/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first. +#[must_use] +struct TimelineUninitMark { + uninit_mark_deleted: bool, + uninit_mark_path: PathBuf, + timeline_path: PathBuf, +} + +impl UninitializedTimeline<'_> { + /// Ensures timeline data is valid, loads it into pageserver's memory and removes uninit mark file on success. + pub fn initialize(self) -> anyhow::Result> { + let mut timelines = self.owning_tenant.timelines.lock().unwrap(); + self.initialize_with_lock(&mut timelines, true) + } + + fn initialize_with_lock( + mut self, + timelines: &mut HashMap>, + load_layer_map: bool, + ) -> anyhow::Result> { + let timeline_id = self.timeline_id; + let tenant_id = self.owning_tenant.tenant_id; + + let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| { + format!("No timeline for initalization found for {tenant_id}/{timeline_id}") + })?; + let new_timeline = Arc::new(new_timeline); + + let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn(); + // TODO it would be good to ensure that, but apparently a lot of our testing is dependend on that at least + // ensure!(new_disk_consistent_lsn.is_valid(), + // "Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn and cannot be initialized"); + + match timelines.entry(timeline_id) { + Entry::Occupied(_) => anyhow::bail!( + "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map" + ), + Entry::Vacant(v) => { + if load_layer_map { + new_timeline + .load_layer_map(new_disk_consistent_lsn) + .with_context(|| { + format!( + "Failed to load layermap for timeline {tenant_id}/{timeline_id}" + ) + })?; + } + uninit_mark.remove_uninit_mark().with_context(|| { + format!( + "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" + ) + })?; + v.insert(Arc::clone(&new_timeline)); + new_timeline.launch_wal_receiver().with_context(|| { + format!("Failed to launch walreceiver for timeline {tenant_id}/{timeline_id}") + })?; + } + } + + Ok(new_timeline) + } + + /// Prepares timeline data by loading it from the basebackup archive. + pub fn import_basebackup_from_tar( + &self, + reader: impl std::io::Read, + base_lsn: Lsn, + ) -> anyhow::Result<()> { + let raw_timeline = self.raw_timeline()?; + import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn).with_context( + || { + format!( + "Failed to import basebackup for timeline {}/{}", + self.owning_tenant.tenant_id, self.timeline_id + ) + }, + )?; + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + bail!("failpoint before-checkpoint-new-timeline"); + }); + + raw_timeline + .checkpoint(CheckpointConfig::Flush) + .with_context(|| { + format!( + "Failed to checkpoint after basebackup import for timeline {}/{}", + self.owning_tenant.tenant_id, self.timeline_id + ) + })?; + Ok(()) + } + + fn raw_timeline(&self) -> anyhow::Result<&Timeline> { + Ok(&self + .raw_timeline + .as_ref() + .with_context(|| { + format!( + "No raw timeline {}/{} found", + self.owning_tenant.tenant_id, self.timeline_id + ) + })? 
+ .0) + } +} + +impl Drop for UninitializedTimeline<'_> { + fn drop(&mut self) { + if let Some((_, uninit_mark)) = self.raw_timeline.take() { + let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered(); + error!("Timeline got dropped without initializing, cleaning its files"); + cleanup_timeline_directory(uninit_mark); + } + } +} + +fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { + let timeline_path = &uninit_mark.timeline_path; + match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { + Ok(()) => { + info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark") + } + Err(e) => { + error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}") + } + } + drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists +} + +impl TimelineUninitMark { + /// Useful for initializing timelines, existing on disk after the restart. + pub fn dummy() -> Self { + Self { + uninit_mark_deleted: true, + uninit_mark_path: PathBuf::new(), + timeline_path: PathBuf::new(), + } + } + + fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self { + Self { + uninit_mark_deleted: false, + uninit_mark_path, + timeline_path, + } + } + + fn remove_uninit_mark(mut self) -> anyhow::Result<()> { + if !self.uninit_mark_deleted { + self.delete_mark_file_if_present()?; + } + + Ok(()) + } + + fn delete_mark_file_if_present(&mut self) -> Result<(), anyhow::Error> { + let uninit_mark_file = &self.uninit_mark_path; + let uninit_mark_parent = uninit_mark_file + .parent() + .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; + ignore_absent_files(|| fs::remove_file(&uninit_mark_file)).with_context(|| { + format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") + })?; + crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; + self.uninit_mark_deleted = true; + + Ok(()) + } +} + +impl Drop for TimelineUninitMark { + fn drop(&mut self) { + if !self.uninit_mark_deleted { + if self.timeline_path.exists() { + error!( + "Uninit mark {} is not removed, timeline {} stays uninitialized", + self.uninit_mark_path.display(), + self.timeline_path.display() + ) + } else { + // unblock later timeline creation attempts + warn!( + "Removing intermediate uninit mark file {}", + self.uninit_mark_path.display() + ); + if let Err(e) = self.delete_mark_file_if_present() { + error!("Failed to remove the uninit mark file: {e}") + } + } + } + } +} + /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. impl Tenant { @@ -162,19 +374,10 @@ impl Tenant { new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, - ) -> Result> { - // XXX: keep the lock to avoid races during timeline creation - let mut timelines = self.timelines.lock().unwrap(); - - anyhow::ensure!( - timelines.get(&new_timeline_id).is_none(), - "Timeline {new_timeline_id} already exists" - ); - - let timeline_path = self.conf.timeline_path(&new_timeline_id, &self.tenant_id); - if timeline_path.exists() { - bail!("Timeline directory already exists, but timeline is missing in repository map. 
This is a bug.") - } + ) -> anyhow::Result { + let timelines = self.timelines.lock().unwrap(); + let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id, &timelines)?; + drop(timelines); let new_metadata = TimelineMetadata::new( Lsn(0), @@ -185,11 +388,13 @@ impl Tenant { initdb_lsn, pg_version, ); - let new_timeline = - self.create_initialized_timeline(new_timeline_id, new_metadata, &mut timelines)?; - new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); - - Ok(new_timeline) + self.prepare_timeline( + new_timeline_id, + new_metadata, + timeline_uninit_mark, + true, + None, + ) } /// Create a new timeline. @@ -205,14 +410,10 @@ impl Tenant { ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, pg_version: u32, - ) -> Result>> { + ) -> anyhow::Result>> { let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); - if self - .conf - .timeline_path(&new_timeline_id, &self.tenant_id) - .exists() - { + if self.get_timeline(new_timeline_id).is_ok() { debug!("timeline {new_timeline_id} already exists"); return Ok(None); } @@ -391,21 +592,32 @@ impl Tenant { timeline_id, metadata.pg_version() ); - let ancestor = metadata - .ancestor_timeline() - .and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id)) - .cloned(); - match timelines_accessor.entry(timeline_id) { - Entry::Occupied(_) => warn!( + + if timelines_accessor.contains_key(&timeline_id) { + warn!( "Timeline {}/{} already exists in the tenant map, skipping its initialization", self.tenant_id, timeline_id - ), - Entry::Vacant(v) => { - let timeline = self - .initialize_new_timeline(timeline_id, metadata, ancestor) - .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; - v.insert(timeline); - } + ); + continue; + } else { + let ancestor = metadata + .ancestor_timeline() + .and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id)) + .cloned(); + let timeline = UninitializedTimeline { + owning_tenant: self, + timeline_id, + raw_timeline: Some(( + self.create_timeline_data(timeline_id, metadata, ancestor) + .with_context(|| { + format!("Failed to initialize timeline {timeline_id}") + })?, + TimelineUninitMark::dummy(), + )), + }; + let initialized_timeline = + timeline.initialize_with_lock(&mut timelines_accessor, true)?; + timelines_accessor.insert(timeline_id, initialized_timeline); } } @@ -599,12 +811,12 @@ impl Tenant { self.tenant_conf.write().unwrap().update(&new_tenant_conf); } - fn initialize_new_timeline( + fn create_timeline_data( &self, new_timeline_id: TimelineId, new_metadata: TimelineMetadata, ancestor: Option>, - ) -> anyhow::Result> { + ) -> anyhow::Result { if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { anyhow::ensure!( ancestor.is_some(), @@ -612,9 +824,8 @@ impl Tenant { ) } - let new_disk_consistent_lsn = new_metadata.disk_consistent_lsn(); let pg_version = new_metadata.pg_version(); - let new_timeline = Arc::new(Timeline::new( + Ok(Timeline::new( self.conf, Arc::clone(&self.tenant_conf), new_metadata, @@ -624,15 +835,7 @@ impl Tenant { Arc::clone(&self.walredo_mgr), self.upload_layers, pg_version, - )); - - new_timeline - .load_layer_map(new_disk_consistent_lsn) - .context("failed to load layermap")?; - - new_timeline.launch_wal_receiver()?; - - Ok(new_timeline) + )) } pub fn new( @@ -914,11 +1117,14 @@ impl Tenant { src: TimelineId, dst: TimelineId, start_lsn: Option, - ) -> Result> { + ) -> anyhow::Result> { // We need to hold this lock to prevent GC from starting at 
the same time. GC scans the directory to learn // about timelines, so otherwise a race condition is possible, where we create new timeline and GC // concurrently removes data that is needed by the new timeline. let _gc_cs = self.gc_cs.lock().unwrap(); + let timelines = self.timelines.lock().unwrap(); + let timeline_uninit_mark = self.create_timeline_uninit_mark(dst, &timelines)?; + drop(timelines); // In order for the branch creation task to not wait for GC/compaction, // we need to make sure that the starting LSN of the child branch is not out of scope midway by @@ -929,12 +1135,12 @@ impl Tenant { // Step 2 is to avoid initializing the new branch using data removed by past GC iterations // or in-queue GC iterations. - // XXX: keep the lock to avoid races during timeline creation - let mut timelines = self.timelines.lock().unwrap(); - let src_timeline = timelines - .get(&src) - // message about timeline being remote is one .context up in the stack - .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?; + let src_timeline = self.get_timeline(src).with_context(|| { + format!( + "No ancestor {} found for timeline {}/{}", + src, self.tenant_id, dst + ) + })?; let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); @@ -988,7 +1194,17 @@ impl Tenant { src_timeline.initdb_lsn, src_timeline.pg_version, ); - let new_timeline = self.create_initialized_timeline(dst, metadata, &mut timelines)?; + let mut timelines = self.timelines.lock().unwrap(); + let new_timeline = self + .prepare_timeline( + dst, + metadata, + timeline_uninit_mark, + false, + Some(src_timeline), + )? + .initialize_with_lock(&mut timelines, true)?; + drop(timelines); info!("branched timeline {dst} from {src} at {start_lsn}"); Ok(new_timeline) @@ -1000,7 +1216,10 @@ impl Tenant { &self, timeline_id: TimelineId, pg_version: u32, - ) -> Result> { + ) -> anyhow::Result> { + let timelines = self.timelines.lock().unwrap(); + let timeline_uninit_mark = self.create_timeline_uninit_mark(timeline_id, &timelines)?; + drop(timelines); // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` // temporary directory for basebackup files for the given timeline. let initdb_path = path_with_suffix_extension( @@ -1010,24 +1229,65 @@ impl Tenant { TEMP_FILE_SUFFIX, ); - // Init temporarily repo to get bootstrap data + // an uninit mark was placed before, nothing else can access this timeline files + // current initdb was not run yet, so remove whatever was left from the previous runs + if initdb_path.exists() { + fs::remove_dir_all(&initdb_path).with_context(|| { + format!( + "Failed to remove already existing initdb directory: {}", + initdb_path.display() + ) + })?; + } + // Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path run_initdb(self.conf, &initdb_path, pg_version)?; - let pgdata_path = initdb_path; - - let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); + // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it + scopeguard::defer! 
{ + if let Err(e) = fs::remove_dir_all(&initdb_path) { + // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call + error!("Failed to remove temporary initdb directory '{}': {}", initdb_path.display(), e); + } + } + let pgdata_path = &initdb_path; + let pgdata_lsn = import_datadir::get_lsn_from_controlfile(pgdata_path)?.align(); // Import the contents of the data directory at the initial checkpoint // LSN, and any WAL after that. // Initdb lsn will be equal to last_record_lsn which will be set after import. - // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. - let timeline = self.create_empty_timeline(timeline_id, lsn, pg_version)?; - import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; + // Because we know it upfront avoid having an option or dummy zero value by passing it to the metadata. + let new_metadata = TimelineMetadata::new( + Lsn(0), + None, + None, + Lsn(0), + pgdata_lsn, + pgdata_lsn, + pg_version, + ); + let raw_timeline = + self.prepare_timeline(timeline_id, new_metadata, timeline_uninit_mark, true, None)?; + + let tenant_id = raw_timeline.owning_tenant.tenant_id; + let unfinished_timeline = raw_timeline.raw_timeline()?; + import_datadir::import_timeline_from_postgres_datadir( + unfinished_timeline, + pgdata_path, + pgdata_lsn, + ) + .with_context(|| { + format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}") + })?; fail::fail_point!("before-checkpoint-new-timeline", |_| { - bail!("failpoint before-checkpoint-new-timeline"); + anyhow::bail!("failpoint before-checkpoint-new-timeline"); }); + unfinished_timeline + .checkpoint(CheckpointConfig::Forced) + .with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?; - timeline.checkpoint(CheckpointConfig::Forced)?; + let mut timelines = self.timelines.lock().unwrap(); + let timeline = raw_timeline.initialize_with_lock(&mut timelines, false)?; + drop(timelines); info!( "created root timeline {} timeline.lsn {}", @@ -1035,25 +1295,65 @@ impl Tenant { timeline.get_last_record_lsn() ); - // Remove temp dir. We don't need it anymore - fs::remove_dir_all(pgdata_path)?; - Ok(timeline) } - fn create_initialized_timeline( + /// Creates intermediate timeline structure and its files, without loading it into memory. + /// It's up to the caller to import the necesary data and import the timeline into memory. 
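// The ordering that makes the uninit mark work as a crash guard, as implemented in
// the functions around here: the mark file is created and fsynced before any timeline
// files exist, and it is removed (with another fsync) only from `initialize_with_lock`,
// after the data has been imported and loaded. After a crash, "mark present" therefore
// means "creation never finished". A rough sketch of the resulting startup rule
// (the real scan lives in tenant_mgr; this standalone helper is illustrative only):
fn cleanup_if_unfinished(timeline_dir: &Path, uninit_mark: &Path) -> std::io::Result<()> {
    if uninit_mark.exists() {
        // We crashed between mark creation and mark removal: whatever is inside
        // the timeline directory is incomplete, so drop the directory first and
        // only then the mark itself.
        if timeline_dir.exists() {
            fs::remove_dir_all(timeline_dir)?;
        }
        fs::remove_file(uninit_mark)?;
    }
    Ok(())
}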
+ fn prepare_timeline( &self, new_timeline_id: TimelineId, new_metadata: TimelineMetadata, - timelines: &mut MutexGuard>>, - ) -> Result> { - crashsafe_dir::create_dir_all(self.conf.timeline_path(&new_timeline_id, &self.tenant_id)) - .with_context(|| { - format!( - "Failed to create timeline {}/{} directory", - new_timeline_id, self.tenant_id - ) - })?; + uninit_mark: TimelineUninitMark, + init_layers: bool, + ancestor: Option>, + ) -> anyhow::Result { + let tenant_id = self.tenant_id; + + match self.create_timeline_files( + &uninit_mark.timeline_path, + new_timeline_id, + new_metadata, + ancestor, + ) { + Ok(new_timeline) => { + if init_layers { + new_timeline.layers.write().unwrap().next_open_layer_at = + Some(new_timeline.initdb_lsn); + } + debug!( + "Successfully created initial files for timeline {tenant_id}/{new_timeline_id}" + ); + Ok(UninitializedTimeline { + owning_tenant: self, + timeline_id: new_timeline_id, + raw_timeline: Some((new_timeline, uninit_mark)), + }) + } + Err(e) => { + error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}"); + cleanup_timeline_directory(uninit_mark); + Err(e) + } + } + } + + fn create_timeline_files( + &self, + timeline_path: &Path, + new_timeline_id: TimelineId, + new_metadata: TimelineMetadata, + ancestor: Option>, + ) -> anyhow::Result { + let timeline_data = self + .create_timeline_data(new_timeline_id, new_metadata.clone(), ancestor) + .context("Failed to create timeline data structure")?; + crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?; + + fail::fail_point!("after-timeline-uninit-mark-creation", |_| { + anyhow::bail!("failpoint after-timeline-uninit-mark-creation"); + }); + save_metadata( self.conf, new_timeline_id, @@ -1061,37 +1361,49 @@ impl Tenant { &new_metadata, true, ) - .with_context(|| { - format!( - "Failed to create timeline {}/{} metadata", - new_timeline_id, self.tenant_id - ) - })?; + .context("Failed to create timeline metadata")?; - let ancestor = new_metadata - .ancestor_timeline() - .and_then(|ancestor_timeline_id| timelines.get(&ancestor_timeline_id)) - .cloned(); - let new_timeline = self - .initialize_new_timeline(new_timeline_id, new_metadata, ancestor) + Ok(timeline_data) + } + + /// Attempts to create an uninit mark file for the timeline initialization. + /// Bails, if the timeline is already loaded into the memory (i.e. initialized before), or the uninit mark file already exists. + /// + /// This way, we need to hold the timelines lock only for small amount of time during the mark check/creation per timeline init. 
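// A sketch of the locking shape the callers above follow (create_empty_timeline,
// branch_timeline, bootstrap_timeline); the function name and the elided slow middle
// part are illustrative, the two calls shown are the real ones from the hunks above:
fn lock_only_at_the_edges(tenant: &Tenant, timeline_id: TimelineId) -> anyhow::Result<()> {
    // Short critical section: check for an in-memory duplicate and reserve the
    // timeline id on disk via the uninit mark.
    let timelines = tenant.timelines.lock().unwrap();
    let _uninit_mark = tenant.create_timeline_uninit_mark(timeline_id, &timelines)?;
    drop(timelines);

    // Long, lock-free section: initdb, basebackup import or layer copying runs here
    // while the other timelines of the tenant stay fully usable.

    // Short critical section again: `initialize_with_lock` re-takes the map lock,
    // publishes the timeline and removes the uninit mark.
    Ok(())
}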
+ fn create_timeline_uninit_mark( + &self, + timeline_id: TimelineId, + timelines: &MutexGuard>>, + ) -> anyhow::Result { + let tenant_id = self.tenant_id; + + anyhow::ensure!( + timelines.get(&timeline_id).is_none(), + "Timeline {tenant_id}/{timeline_id} already exists in pageserver's memory" + ); + let timeline_path = self.conf.timeline_path(&timeline_id, &tenant_id); + anyhow::ensure!( + !timeline_path.exists(), + "Timeline {} already exists, cannot create its uninit mark file", + timeline_path.display() + ); + + let uninit_mark_path = self + .conf + .timeline_uninit_mark_file_path(tenant_id, timeline_id); + fs::File::create(&uninit_mark_path) + .context("Failed to create uninit mark file") + .and_then(|_| { + crashsafe::fsync_file_and_parent(&uninit_mark_path) + .context("Failed to fsync uninit mark file") + }) .with_context(|| { - format!( - "Failed to initialize timeline {}/{}", - new_timeline_id, self.tenant_id - ) + format!("Failed to crate uninit mark for timeline {tenant_id}/{timeline_id}") })?; - match timelines.entry(new_timeline_id) { - Entry::Occupied(_) => bail!( - "Found freshly initialized timeline {} in the tenant map", - new_timeline_id - ), - Entry::Vacant(v) => { - v.insert(Arc::clone(&new_timeline)); - } - } + let uninit_mark = TimelineUninitMark::new(uninit_mark_path, timeline_path); - Ok(new_timeline) + Ok(uninit_mark) } } @@ -1111,7 +1423,7 @@ fn run_initdb( initdb_lib_dir.display(), ); - let initdb_output = Command::new(initdb_bin_path) + let initdb_output = Command::new(&initdb_bin_path) .args(&["-D", &initdb_target_dir.to_string_lossy()]) .args(&["-U", &conf.superuser]) .args(&["-E", "utf8"]) @@ -1124,7 +1436,13 @@ fn run_initdb( .env("DYLD_LIBRARY_PATH", &initdb_lib_dir) .stdout(Stdio::null()) .output() - .context("failed to execute initdb")?; + .with_context(|| { + format!( + "failed to execute {} at target dir {}", + initdb_bin_path.display(), + initdb_target_dir.display() + ) + })?; if !initdb_output.status.success() { bail!( "initdb failed: '{}'", @@ -1163,6 +1481,19 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { Ok(()) } +fn ignore_absent_files(fs_operation: F) -> io::Result<()> +where + F: Fn() -> io::Result<()>, +{ + fs_operation().or_else(|e| { + if e.kind() == io::ErrorKind::NotFound { + Ok(()) + } else { + Err(e) + } + }) +} + #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; @@ -1379,7 +1710,9 @@ mod tests { #[test] fn test_basic() -> Result<()> { let tenant = TenantHarness::create("test_basic")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1401,13 +1734,18 @@ mod tests { #[test] fn no_duplicate_timelines() -> Result<()> { let tenant = TenantHarness::create("no_duplicate_timelines")?.load(); - let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let _ = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
+ .initialize()?; match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), - format!("Timeline {TIMELINE_ID} already exists") + format!( + "Timeline {}/{} already exists in pageserver's memory", + tenant.tenant_id, TIMELINE_ID + ) ), } @@ -1427,7 +1765,9 @@ mod tests { #[test] fn test_branch() -> Result<()> { let tenant = TenantHarness::create("test_branch")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; let writer = tline.writer(); use std::str::from_utf8; @@ -1522,7 +1862,9 @@ mod tests { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? .load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20))?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 @@ -1552,7 +1894,9 @@ mod tests { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)?; + tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? + .initialize()?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), @@ -1596,7 +1940,9 @@ mod tests { fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -1613,7 +1959,9 @@ mod tests { fn test_parent_keeps_data_forever_after_branching() -> Result<()> { let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -1641,8 +1989,9 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; { let tenant = harness.load(); - let tline = - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)? + .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x8000))?; tline.checkpoint(CheckpointConfig::Forced)?; } @@ -1662,7 +2011,9 @@ mod tests { // create two timelines { let tenant = harness.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
+ .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tline.checkpoint(CheckpointConfig::Forced)?; @@ -1698,7 +2049,9 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; let tenant = harness.load(); - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -1735,7 +2088,9 @@ mod tests { #[test] fn test_images() -> Result<()> { let tenant = TenantHarness::create("test_images")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1785,7 +2140,9 @@ mod tests { #[test] fn test_bulk_insert() -> Result<()> { let tenant = TenantHarness::create("test_bulk_insert")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; let mut lsn = Lsn(0x10); @@ -1825,7 +2182,9 @@ mod tests { #[test] fn test_random_updates() -> Result<()> { let tenant = TenantHarness::create("test_random_updates")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; const NUM_KEYS: usize = 1000; @@ -1895,7 +2254,9 @@ mod tests { #[test] fn test_traverse_branches() -> Result<()> { let tenant = TenantHarness::create("test_traverse_branches")?.load(); - let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let mut tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; const NUM_KEYS: usize = 1000; @@ -1974,7 +2335,9 @@ mod tests { #[test] fn test_traverse_ancestors() -> Result<()> { let tenant = TenantHarness::create("test_traverse_ancestors")?.load(); - let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let mut tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
+ .initialize()?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index b2c927d4fc..f1db50bf7f 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -12,7 +12,7 @@ use tracing::*; use remote_storage::GenericRemoteStorage; -use crate::config::{PageServerConf, METADATA_FILE_NAME}; +use crate::config::{PageServerConf, METADATA_FILE_NAME, TIMELINE_UNINIT_MARK_SUFFIX}; use crate::http::models::TenantInfo; use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles}; @@ -24,7 +24,7 @@ use crate::tenant_config::TenantConfOpt; use crate::walredo::PostgresRedoManager; use crate::TEMP_FILE_SUFFIX; -use utils::crashsafe_dir::{self, path_with_suffix_extension}; +use utils::crashsafe::{self, path_with_suffix_extension}; use utils::id::{TenantId, TimelineId}; mod tenants_state { @@ -265,58 +265,98 @@ fn create_tenant_files( temporary_tenant_dir.display() ); - let temporary_tenant_timelines_dir = rebase_directory( - &conf.timelines_path(&tenant_id), - &target_tenant_directory, - &temporary_tenant_dir, - )?; - let temporary_tenant_config_path = rebase_directory( - &conf.tenant_config_path(tenant_id), - &target_tenant_directory, - &temporary_tenant_dir, - )?; - // top-level dir may exist if we are creating it through CLI - crashsafe_dir::create_dir_all(&temporary_tenant_dir).with_context(|| { + crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| { format!( "could not create temporary tenant directory {}", temporary_tenant_dir.display() ) })?; - // first, create a config in the top-level temp directory, fsync the file - Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?; - // then, create a subdirectory in the top-level temp directory, fsynced - crashsafe_dir::create_dir(&temporary_tenant_timelines_dir).with_context(|| { + + let creation_result = try_create_target_tenant_dir( + conf, + tenant_conf, + tenant_id, + &temporary_tenant_dir, + &target_tenant_directory, + ); + + if creation_result.is_err() { + error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data"); + if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) { + error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}") + } else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) { + error!( + "Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}" + ) + } + } + + creation_result +} + +fn try_create_target_tenant_dir( + conf: &'static PageServerConf, + tenant_conf: TenantConfOpt, + tenant_id: TenantId, + temporary_tenant_dir: &Path, + target_tenant_directory: &Path, +) -> Result<(), anyhow::Error> { + let temporary_tenant_timelines_dir = rebase_directory( + &conf.timelines_path(&tenant_id), + target_tenant_directory, + temporary_tenant_dir, + ) + .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?; + let temporary_tenant_config_path = rebase_directory( + &conf.tenant_config_path(tenant_id), + target_tenant_directory, + temporary_tenant_dir, + ) + .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?; + + Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context( + || { + format!( + "Failed to write tenant {} config to {}", + tenant_id, + temporary_tenant_config_path.display() + ) + }, 
+ )?; + crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( - "could not create temporary tenant timelines directory {}", + "could not create tenant {} temporary timelines directory {}", + tenant_id, temporary_tenant_timelines_dir.display() ) })?; - fail::fail_point!("tenant-creation-before-tmp-rename", |_| { anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); }); - // move-rename tmp directory with all files synced into a permanent directory, fsync its parent - fs::rename(&temporary_tenant_dir, &target_tenant_directory).with_context(|| { + fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( - "failed to move temporary tenant directory {} into the permanent one {}", + "failed to move tenant {} temporary directory {} into the permanent one {}", + tenant_id, temporary_tenant_dir.display(), target_tenant_directory.display() ) })?; let target_dir_parent = target_tenant_directory.parent().with_context(|| { format!( - "Failed to get tenant dir parent for {}", + "Failed to get tenant {} dir parent for {}", + tenant_id, target_tenant_directory.display() ) })?; - fs::File::open(target_dir_parent)?.sync_all()?; - - info!( - "created tenant directory structure in {}", - target_tenant_directory.display() - ); + crashsafe::fsync(target_dir_parent).with_context(|| { + format!( + "Failed to fsync renamed directory's parent {} for tenant {}", + target_dir_parent.display(), + tenant_id, + ) + })?; Ok(()) } @@ -602,6 +642,15 @@ fn is_temporary(path: &Path) -> bool { } } +fn is_uninit_mark(path: &Path) -> bool { + match path.file_name() { + Some(name) => name + .to_string_lossy() + .ends_with(TIMELINE_UNINIT_MARK_SUFFIX), + None => false, + } +} + fn collect_timelines_for_tenant( config: &'static PageServerConf, tenant_path: &Path, @@ -644,28 +693,74 @@ fn collect_timelines_for_tenant( e ); } + } else if is_uninit_mark(&timeline_dir) { + let timeline_uninit_mark_file = &timeline_dir; + info!( + "Found an uninit mark file {}, removing the timeline and its uninit mark", + timeline_uninit_mark_file.display() + ); + let timeline_id = timeline_uninit_mark_file + .file_stem() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline uninit mark name {}", + timeline_uninit_mark_file.display() + ) + })?; + let timeline_dir = config.timeline_path(&timeline_id, &tenant_id); + if let Err(e) = + remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) + { + error!("Failed to clean up uninit marked timeline: {e:?}"); + } } else { - match collect_timeline_files(&timeline_dir) { - Ok((timeline_id, metadata, timeline_files)) => { - tenant_timelines.insert( - timeline_id, - TimelineLocalFiles::collected(metadata, timeline_files), - ); + let timeline_id = timeline_dir + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline dir name {}", + timeline_dir.display() + ) + })?; + let timeline_uninit_mark_file = + config.timeline_uninit_mark_file_path(tenant_id, timeline_id); + if timeline_uninit_mark_file.exists() { + info!("Found an uninit mark file for timeline {tenant_id}/{timeline_id}, removing the timeline and its uninit mark"); + if let Err(e) = remove_timeline_and_uninit_mark( + &timeline_dir, + &timeline_uninit_mark_file, + ) { + error!("Failed to clean up uninit marked timeline: {e:?}"); } - Err(e) => { - error!( - "Failed to process timeline 
dir contents at '{}', reason: {:?}", - timeline_dir.display(), - e - ); - match remove_if_empty(&timeline_dir) { - Ok(true) => info!( - "Removed empty timeline directory {}", - timeline_dir.display() - ), - Ok(false) => (), - Err(e) => { - error!("Failed to remove empty timeline directory: {e:?}") + } else { + match collect_timeline_files(&timeline_dir) { + Ok((metadata, timeline_files)) => { + tenant_timelines.insert( + timeline_id, + TimelineLocalFiles::collected(metadata, timeline_files), + ); + } + Err(e) => { + error!( + "Failed to process timeline dir contents at '{}', reason: {:?}", + timeline_dir.display(), + e + ); + match remove_if_empty(&timeline_dir) { + Ok(true) => info!( + "Removed empty timeline directory {}", + timeline_dir.display() + ), + Ok(false) => (), + Err(e) => { + error!("Failed to remove empty timeline directory: {e:?}") + } } } } @@ -688,24 +783,41 @@ fn collect_timelines_for_tenant( Ok((tenant_id, TenantAttachData::Ready(tenant_timelines))) } +fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> { + fs::remove_dir_all(&timeline_dir) + .or_else(|e| { + if e.kind() == std::io::ErrorKind::NotFound { + // we can leave the uninit mark without a timeline dir, + // just remove the mark then + Ok(()) + } else { + Err(e) + } + }) + .with_context(|| { + format!( + "Failed to remove unit marked timeline directory {}", + timeline_dir.display() + ) + })?; + fs::remove_file(&uninit_mark).with_context(|| { + format!( + "Failed to remove timeline uninit mark file {}", + uninit_mark.display() + ) + })?; + + Ok(()) +} + // discover timeline files and extract timeline metadata // NOTE: ephemeral files are excluded from the list fn collect_timeline_files( timeline_dir: &Path, -) -> anyhow::Result<( - TimelineId, - TimelineMetadata, - HashMap, -)> { +) -> anyhow::Result<(TimelineMetadata, HashMap)> { let mut timeline_files = HashMap::new(); let mut timeline_metadata_path = None; - let timeline_id = timeline_dir - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .context("Could not parse timeline id out of the timeline dir name")?; let timeline_dir_entries = fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; for entry in timeline_dir_entries { @@ -754,5 +866,5 @@ fn collect_timeline_files( "Timeline has no ancestor and no layer files" ); - Ok((timeline_id, metadata, timeline_files)) + Ok((metadata, timeline_files)) } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 29179e9871..01389e52f4 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -1374,7 +1374,9 @@ mod tests { timeline: harness .load() .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION) - .expect("Failed to create an empty timeline for dummy wal connection manager"), + .expect("Failed to create an empty timeline for dummy wal connection manager") + .initialize() + .unwrap(), wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 1dd27caba6..b8874a0223 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -35,7 +35,7 @@ use std::sync::Mutex; use std::time::Duration; use std::time::Instant; use tracing::*; -use utils::crashsafe_dir::path_with_suffix_extension; +use 
utils::crashsafe::path_with_suffix_extension; use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; use crate::metrics::{ diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 7baa67935d..101cce9ffc 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -111,18 +111,20 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): future.result() -def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): +def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http = env.pageserver.http_client() tenant_id, _ = env.neon_cli.create_tenant() + timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + initial_timeline_dirs = [d for d in timelines_dir.iterdir()] - # Introduce failpoint when creating a new timeline + # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed. pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return")) with pytest.raises(Exception, match="before-checkpoint-new-timeline"): - _ = env.neon_cli.create_timeline("test_fix_broken_timelines", tenant_id) + _ = env.neon_cli.create_timeline("test_timeline_init_break_before_checkpoint", tenant_id) # Restart the page server env.neon_cli.pageserver_stop(immediate=True) @@ -133,3 +135,36 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): assert ( new_tenant_timelines == old_tenant_timelines ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" + + timeline_dirs = [d for d in timelines_dir.iterdir()] + assert ( + timeline_dirs == initial_timeline_dirs + ), "pageserver should clean its temp timeline files on timeline creation failure" + + +def test_timeline_create_break_after_uninit_mark(neon_simple_env: NeonEnv): + env = neon_simple_env + pageserver_http = env.pageserver.http_client() + + tenant_id, _ = env.neon_cli.create_tenant() + + timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" + old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + initial_timeline_dirs = [d for d in timelines_dir.iterdir()] + + # Introduce failpoint when creating a new timeline uninit mark, before any other files were created + pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return")) + with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"): + _ = env.neon_cli.create_timeline("test_timeline_create_break_after_uninit_mark", tenant_id) + + # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. 
+ # "New" timeline is not present in the list, allowing pageserver to retry the same request + new_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + assert ( + new_tenant_timelines == old_tenant_timelines + ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" + + timeline_dirs = [d for d in timelines_dir.iterdir()] + assert ( + timeline_dirs == initial_timeline_dirs + ), "pageserver should clean its temp timeline files on timeline creation failure" diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 5910b4f74f..c888c6f7ee 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -105,15 +105,11 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build with pytest.raises(Exception): import_tar(corrupt_base_tar, wal_tar) - # Clean up - # TODO it should clean itself - client = env.pageserver.http_client() - client.timeline_delete(tenant, timeline) - # Importing correct backup works import_tar(base_tar, wal_tar) # Wait for data to land in s3 + client = env.pageserver.http_client() wait_for_last_record_lsn(client, tenant, timeline, Lsn(end_lsn)) wait_for_upload(client, tenant, timeline, Lsn(end_lsn)) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 37c5a130e2..4ffea60950 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -23,7 +23,7 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): initial_tenants = sorted( map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) ) - initial_tenant_dirs = set([d for d in tenants_dir.iterdir()]) + initial_tenant_dirs = [d for d in tenants_dir.iterdir()] pageserver_http = neon_simple_env.pageserver.http_client() pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return")) @@ -35,26 +35,10 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ) assert initial_tenants == new_tenants, "should not create new tenants" - new_tenant_dirs = list(set([d for d in tenants_dir.iterdir()]) - initial_tenant_dirs) - assert len(new_tenant_dirs) == 1, "should have new tenant directory created" - tmp_tenant_dir = new_tenant_dirs[0] - assert str(tmp_tenant_dir).endswith( - ".___temp" - ), "new tenant directory created should be a temporary one" - - neon_simple_env.pageserver.stop() - neon_simple_env.pageserver.start() - - tenants_after_restart = sorted( - map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) - ) - dirs_after_restart = set([d for d in tenants_dir.iterdir()]) + new_tenant_dirs = [d for d in tenants_dir.iterdir()] assert ( - tenants_after_restart == initial_tenants - ), "should load all non-corrupt tenants after restart" - assert ( - dirs_after_restart == initial_tenant_dirs - ), "pageserver should clean its temp tenant dirs on restart" + new_tenant_dirs == initial_tenant_dirs + ), "pageserver should clean its temp tenant dirs on tenant creation failure" def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder): From f5ab9f761bfb3bcc8cfff942520517cba7aa462c Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 20 Oct 2022 16:14:32 +0300 Subject: [PATCH 0935/1022] Remove flaky checks in test_delete_force (#2567) --- test_runner/regress/test_wal_acceptor.py | 25 +++++------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/test_runner/regress/test_wal_acceptor.py 
b/test_runner/regress/test_wal_acceptor.py index 4451ba9d57..79adfb7b68 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1114,10 +1114,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): cur.execute("INSERT INTO t (key) VALUES (1)") # Remove initial tenant's br1 (active) - assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == { - "dir_existed": True, - "was_active": True, - } + assert sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"] assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() @@ -1125,10 +1122,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Ensure repeated deletion succeeds - assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == { - "dir_existed": False, - "was_active": False, - } + assert not sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"] assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() @@ -1145,10 +1139,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove initial tenant's br2 (inactive) - assert sk_http.timeline_delete_force(tenant_id, timeline_id_2) == { - "dir_existed": True, - "was_active": False, - } + assert sk_http.timeline_delete_force(tenant_id, timeline_id_2)["dir_existed"] assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() @@ -1156,10 +1147,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove non-existing branch, should succeed - assert sk_http.timeline_delete_force(tenant_id, TimelineId("00" * 16)) == { - "dir_existed": False, - "was_active": False, - } + assert not sk_http.timeline_delete_force(tenant_id, TimelineId("00" * 16))["dir_existed"] assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() @@ -1168,10 +1156,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # Remove initial tenant fully (two branches are active) response = sk_http.tenant_delete_force(tenant_id) - assert response[str(timeline_id_3)] == { - "dir_existed": True, - "was_active": True, - } + assert response[str(timeline_id_3)]["dir_existed"] assert not (sk_data_dir / str(tenant_id)).exists() assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() From eb1bdcc6cfb72293019186021f40297affc6dc33 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 20 Oct 2022 13:21:36 +0300 Subject: [PATCH 0936/1022] If an FSM or VM page cannot be reconstructed, fill it with zeros. If we cannot reconstruct an FSM or VM page, while creating image layers, fill it with zeros instead. 
That should always be safe, for the FSM and VM, in the sense that you won't lose actual user data. It will get cleaned up by VACUUM later. We had a bug with FSM/VM truncation, where we truncated the FSM and VM at WAL replay to a smaller size than PostgreSQL originally did. We thought was harmless, as the FSM and VM are not critical for correctness and can be zeroed out or truncated without affecting user data. However, it lead to a situation where PostgreSQL created incremental WAL records for pages that we had already truncated away in the pageserver, and when we tried to replay those WAL records, that failed. That lead to a permanent error in image layer creation, and prevented it from ever finishing. See https://github.com/neondatabase/neon/issues/2601. With this patch, those pages will be filled with zeros in the image layer, which allows the image layer creation to finish. --- pageserver/src/lib.rs | 2 ++ pageserver/src/pgdatadir_mapping.rs | 11 +++++++++++ pageserver/src/tenant/timeline.rs | 29 ++++++++++++++++++++++++++++- pageserver/src/walingest.rs | 3 +-- 4 files changed, 42 insertions(+), 3 deletions(-) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index fe5114a247..c75f940386 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -46,6 +46,8 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61; pub const LOG_FILE_NAME: &str = "pageserver.log"; +static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); + /// Config for the Repository checkpointer #[derive(Debug, Clone, Copy)] pub enum CheckpointConfig { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 424ce4769a..ca931ed37d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1373,6 +1373,17 @@ fn is_rel_block_key(key: Key) -> bool { key.field1 == 0x00 && key.field4 != 0 } +pub fn is_rel_fsm_block_key(key: Key) -> bool { + key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff +} + +pub fn is_rel_vm_block_key(key: Key) -> bool { + key.field1 == 0x00 + && key.field4 != 0 + && key.field5 == VISIBILITYMAP_FORKNUM + && key.field6 != 0xffffffff +} + pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { Ok(match key.field1 { 0x01 => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a771f82caf..d6ce644bb5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -34,6 +34,7 @@ use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::BlockNumber; use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; @@ -52,6 +53,7 @@ use crate::task_mgr::TaskKind; use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; +use crate::ZERO_PAGE; use crate::{ page_cache, storage_sync::{self, index::LayerFileMetadata}, @@ -1496,7 +1498,32 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { - let img = self.get(key, lsn)?; + let img = match self.get(key, lsn) { + Ok(img) => img, + Err(err) => { + // If we fail to reconstruct a VM or FSM page, we can zero the + // page without losing any actual user data. That seems better + // than failing repeatedly and getting stuck. 
+ // + // We had a bug at one point, where we truncated the FSM and VM + // in the pageserver, but the Postgres didn't know about that + // and continued to generate incremental WAL records for pages + // that didn't exist in the pageserver. Trying to replay those + // WAL records failed to find the previous image of the page. + // This special case allows us to recover from that situation. + // See https://github.com/neondatabase/neon/issues/2601. + // + // Unfortunately we cannot do this for the main fork, or for + // any metadata keys, keys, as that would lead to actual data + // loss. + if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) { + warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}"); + ZERO_PAGE.clone() + } else { + return Err(err); + } + } + }; image_layer_writer.put_image(key, &img)?; key = key.next(); } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index d3d2c6d9b2..9a6b99d991 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -34,6 +34,7 @@ use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walrecord::*; +use crate::ZERO_PAGE; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; @@ -43,8 +44,6 @@ use postgres_ffi::TransactionId; use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; -static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); - pub struct WalIngest<'a> { timeline: &'a Timeline, From 7404777efc9cddd0c98cb3ee5de769a6225da4af Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 20 Oct 2022 20:06:05 +0300 Subject: [PATCH 0937/1022] Pin pages with speculative insert tuples to prevent their reconstruction because spec_token is not wal logged (#2657) * Pin pages with speculative insert tuples to prevent their reconstruction because spec_token is not wal logged refer ##2587 * Bump postgres versions --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 19d948fd47..bdd502a8da 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 19d948fd47f45d83367062d9a54709cf2d9c8921 +Subproject commit bdd502a8da5de9e0ac709caabc0401455c97d235 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 339f2d642d..f7c5269e9c 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 339f2d642d7d430c44839f8293ae271f90e3cb81 +Subproject commit f7c5269e9c7e818653ad6fe95ba072d1901c4497 From 30984c163ca28818a7c0d35f42817d015fc7645e Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Thu, 20 Oct 2022 23:01:01 +0300 Subject: [PATCH 0938/1022] Fix race between pushing image to ECR and copying to dockerhub (#2662) --- .github/workflows/build_and_test.yml | 42 ++++++++++++++++------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d90455ccca..460d73a552 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -481,6 +481,7 @@ jobs: neon-image: runs-on: dev + needs: [ tag ] container: gcr.io/kaniko-project/executor:v1.9.0-debug steps: @@ -494,10 +495,11 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build neon - run: /kaniko/executor 
--snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} compute-tools-image: runs-on: dev + needs: [ tag ] container: gcr.io/kaniko-project/executor:v1.9.0-debug steps: @@ -508,11 +510,12 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute tools - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-node-image: runs-on: dev container: gcr.io/kaniko-project/executor:v1.9.0-debug + needs: [ tag ] steps: - name: Checkout uses: actions/checkout@v1 # v3 won't work with kaniko @@ -527,11 +530,12 @@ jobs: # cloud repo depends on this image name, thus duplicating it # remove compute-node when cloud repo is updated - name: Kaniko build compute node with extensions v14 (compatibility) - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} compute-node-image-v14: runs-on: dev container: gcr.io/kaniko-project/executor:v1.9.0-debug + needs: [ tag ] steps: - name: Checkout uses: actions/checkout@v1 # v3 won't work with kaniko @@ -543,12 +547,13 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute node with extensions v14 - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-image-v15: runs-on: dev container: gcr.io/kaniko-project/executor:v1.9.0-debug + needs: [ tag ] steps: - name: Checkout uses: actions/checkout@v1 # v3 won't work with kaniko @@ -560,11 +565,11 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute node with extensions v15 - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} promote-images: runs-on: dev - needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ] + needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ] if: github.event_name != 'workflow_dispatch' container: amazon/aws-cli strategy: @@ -577,8 +582,9 @@ jobs: steps: - name: Promote image to latest - run: - MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=$GITHUB_RUN_ID --query 'images[].imageManifest' --output text) && aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST" + run: | + export MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=${{needs.tag.outputs.build-tag}} --query 'images[].imageManifest' --output text) + aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST" push-docker-hub: runs-on: dev @@ -597,19 +603,19 @@ jobs: echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - name: Pull neon image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:latest neon + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} neon - name: Pull compute tools image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest compute-tools + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools - name: Pull compute node image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} compute-node - name: Pull compute node v14 image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest compute-node-v14 + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14 - name: Pull compute node v15 image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest compute-node-v15 + run: crane pull 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15 - name: Pull rust image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust @@ -619,11 +625,11 @@ jobs: (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' run: | - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest - name: Configure Docker Hub login run: | From cca1ace651a3bcc177503c1bbcbf3b290bbad918 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 20 Oct 2022 23:20:28 +0300 Subject: [PATCH 0939/1022] make launch_wal_receiver infallible --- pageserver/src/tenant.rs | 4 +--- pageserver/src/tenant/timeline.rs | 8 +++----- pageserver/src/walreceiver/connection_manager.rs | 3 +-- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 93c473f0fe..6aae740a78 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -191,9 +191,7 @@ impl UninitializedTimeline<'_> { ) })?; v.insert(Arc::clone(&new_timeline)); - new_timeline.launch_wal_receiver().with_context(|| { - format!("Failed to launch walreceiver for timeline {tenant_id}/{timeline_id}") - })?; + new_timeline.launch_wal_receiver(); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d6ce644bb5..8f325d31ec 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -602,11 +602,11 @@ impl Timeline { result } - pub fn launch_wal_receiver(self: &Arc) -> anyhow::Result<()> { + pub fn launch_wal_receiver(self: &Arc) { if !is_etcd_client_initialized() { if cfg!(test) { info!("not launching WAL receiver because etcd client hasn't been initialized"); - return Ok(()); + return; } else { panic!("etcd client not initialized"); } @@ -634,9 +634,7 @@ impl Timeline { walreceiver_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, - )?; - - 
Ok(())
+        );
     }
 
     ///
diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs
index 01389e52f4..3a5d1c7ad6 100644
--- a/pageserver/src/walreceiver/connection_manager.rs
+++ b/pageserver/src/walreceiver/connection_manager.rs
@@ -47,7 +47,7 @@ pub fn spawn_connection_manager_task(
     wal_connect_timeout: Duration,
     lagging_wal_timeout: Duration,
     max_lsn_wal_lag: NonZeroU64,
-) -> anyhow::Result<()> {
+) {
     let mut etcd_client = get_etcd_client().clone();
 
     let tenant_id = timeline.tenant_id;
@@ -95,7 +95,6 @@ pub fn spawn_connection_manager_task(
             info_span!("wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
         ),
     );
-    Ok(())
 }
 
 /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.

From fc4ea3553ebed97a11503e970f77196f7d39a4ca Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Fri, 21 Oct 2022 02:39:55 +0300
Subject: [PATCH 0940/1022] test_gc_cutoff.py fixes (#2655)

* Fix bogus early exit from GC.

Commit 91411c415a added this failpoint, but the early exit was not
intentional.

* Cleanup test_gc_cutoff.py test.

- Remove the 'scale' parameter; this isn't a benchmark

- Tweak pgbench and pageserver options to create garbage faster than
  the GC can collect away. The test used to take just under 5
  minutes, which was uncomfortably close to the default 5 minute test
  timeout, and annoyingly long even without the hard limit. These
  changes bring it down to about 1-2 minutes.

- Improve comments, fix typos

- Rename the failpoint. The old name, 'gc-before-save-metadata',
  implied that the failpoint was before the metadata update, but it
  was in fact much later in the function.

- Move the call to persist the metadata outside the lock, to avoid
  holding it for too long.

To verify that this test still covers the original bug,
https://github.com/neondatabase/neon/issues/2539, I commented out
updating the metadata file like this:

```
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 1e857a9a..f8a9f34a 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1962,7 +1962,7 @@ impl Timeline {
         }
         // Persist the new GC cutoff value in the metadata file, before
         // we actually remove anything.
-        self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
+        //self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
 
         info!("GC starting");
 
```

It doesn't fail every time with that, but it did fail after about 5
runs.
---
 pageserver/src/tenant/timeline.rs     | 15 ++++++---------
 test_runner/regress/test_gc_cutoff.py | 27 ++++++++++++++-------------
 2 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 8f325d31ec..a0ac0adea2 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1984,10 +1984,10 @@ impl Timeline {
             new_gc_cutoff
         );
         write_guard.store_and_unlock(new_gc_cutoff).wait();
-
-        // Persist metadata file
-        self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
         }
+        // Persist the new GC cutoff value in the metadata file, before
+        // we actually remove anything. 
+ self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; info!("GC starting"); @@ -2114,15 +2114,12 @@ impl Timeline { } info!( - "GC completed removing {} layers, cuttof {}", + "GC completed removing {} layers, cutoff {}", result.layers_removed, new_gc_cutoff ); + if result.layers_removed != 0 { - fail_point!("gc-before-save-metadata", |_| { - info!("Abnormaly terinate pageserver at gc-before-save-metadata fail point"); - std::process::abort(); - }); - return Ok(result); + fail_point!("after-timeline-gc-removed-layers"); } if self.upload_layers.load(atomic::Ordering::Relaxed) { diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py index 946c689a30..22b77d2cf1 100644 --- a/test_runner/regress/test_gc_cutoff.py +++ b/test_runner/regress/test_gc_cutoff.py @@ -1,14 +1,13 @@ -import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgBin -from performance.test_perf_pgbench import get_scales_matrix -# Test gc_cuttoff +# Test gc_cutoff # -# This test set fail point after at the end of GC and checks -# that pageserver normally restarts after it -@pytest.mark.parametrize("scale", get_scales_matrix(10)) -def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int): +# This test sets fail point at the end of GC, and checks that pageserver +# normally restarts after it. Also, there should be GC ERRORs in the log, +# but the fixture checks the log for any unexpected ERRORs after every +# test anyway, so it doesn't need any special attention here. +def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -18,21 +17,23 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int): "gc_period": "10 s", "gc_horizon": f"{1024 ** 2}", "checkpoint_distance": f"{1024 ** 2}", - "compaction_target_size": f"{1024 ** 2}", + "compaction_period": "5 s", # set PITR interval to be small, so we can do GC "pitr_interval": "1 s", + "compaction_threshold": "3", + "image_creation_threshold": "2", } ) pg = env.postgres.create_start("main", tenant_id=tenant_id) - connstr = pg.connstr() - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + connstr = pg.connstr(options="-csynchronous_commit=off") + pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) - pageserver_http.configure_failpoints(("gc-before-save-metadata", "return")) + pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit")) for i in range(5): try: - pg_bin.run_capture(["pgbench", "-T100", connstr]) + pg_bin.run_capture(["pgbench", "-N", "-c5", "-T100", "-Mprepared", connstr]) except Exception: env.pageserver.stop() env.pageserver.start() - pageserver_http.configure_failpoints(("gc-before-save-metadata", "return")) + pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit")) From a347d2b6ac22f3ab30297b1237f3d16ef682c118 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 20 Oct 2022 20:27:26 +0300 Subject: [PATCH 0941/1022] #2616 handle 'Unsupported pg_version' error properly --- control_plane/src/compute.rs | 12 +++++------ control_plane/src/local_env.rs | 30 +++++++++++++------------- libs/postgres_ffi/wal_craft/src/lib.rs | 22 +++++++++---------- pageserver/src/config.rs | 24 ++++++++++----------- pageserver/src/tenant.rs | 4 ++-- pageserver/src/walredo.rs | 25 +++++++++++++++------ 6 files changed, 65 insertions(+), 52 deletions(-) diff --git a/control_plane/src/compute.rs 
b/control_plane/src/compute.rs index 89994c5647..9f32ad31c1 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -183,18 +183,18 @@ impl PostgresNode { } fn sync_safekeepers(&self, auth_token: &Option, pg_version: u32) -> Result { - let pg_path = self.env.pg_bin_dir(pg_version).join("postgres"); + let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres"); let mut cmd = Command::new(&pg_path); cmd.arg("--sync-safekeepers") .env_clear() .env( "LD_LIBRARY_PATH", - self.env.pg_lib_dir(pg_version).to_str().unwrap(), + self.env.pg_lib_dir(pg_version)?.to_str().unwrap(), ) .env( "DYLD_LIBRARY_PATH", - self.env.pg_lib_dir(pg_version).to_str().unwrap(), + self.env.pg_lib_dir(pg_version)?.to_str().unwrap(), ) .env("PGDATA", self.pgdata().to_str().unwrap()) .stdout(Stdio::piped()) @@ -422,7 +422,7 @@ impl PostgresNode { } fn pg_ctl(&self, args: &[&str], auth_token: &Option) -> Result<()> { - let pg_ctl_path = self.env.pg_bin_dir(self.pg_version).join("pg_ctl"); + let pg_ctl_path = self.env.pg_bin_dir(self.pg_version)?.join("pg_ctl"); let mut cmd = Command::new(pg_ctl_path); cmd.args( [ @@ -440,11 +440,11 @@ impl PostgresNode { .env_clear() .env( "LD_LIBRARY_PATH", - self.env.pg_lib_dir(self.pg_version).to_str().unwrap(), + self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(), ) .env( "DYLD_LIBRARY_PATH", - self.env.pg_lib_dir(self.pg_version).to_str().unwrap(), + self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(), ); if let Some(token) = auth_token { cmd.env("ZENITH_AUTH_TOKEN", token); diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index f4fbc99420..34ddb41f32 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,7 +3,7 @@ //! Now it also provides init method which acts like a stub for proper installation //! script which will use local paths. 
-use anyhow::{bail, ensure, Context}; +use anyhow::{bail, ensure, Context, Result}; use reqwest::Url; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; @@ -201,28 +201,28 @@ impl LocalEnv { self.pg_distrib_dir.clone() } - pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { + pub fn pg_distrib_dir(&self, pg_version: u32) -> Result { let path = self.pg_distrib_dir.clone(); match pg_version { - 14 => path.join(format!("v{pg_version}")), - 15 => path.join(format!("v{pg_version}")), - _ => panic!("Unsupported postgres version: {}", pg_version), + 14 => Ok(path.join(format!("v{pg_version}"))), + 15 => Ok(path.join(format!("v{pg_version}"))), + _ => bail!("Unsupported postgres version: {}", pg_version), } } - pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { + pub fn pg_bin_dir(&self, pg_version: u32) -> Result { match pg_version { - 14 => self.pg_distrib_dir(pg_version).join("bin"), - 15 => self.pg_distrib_dir(pg_version).join("bin"), - _ => panic!("Unsupported postgres version: {}", pg_version), + 14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), + 15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), + _ => bail!("Unsupported postgres version: {}", pg_version), } } - pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf { + pub fn pg_lib_dir(&self, pg_version: u32) -> Result { match pg_version { - 14 => self.pg_distrib_dir(pg_version).join("lib"), - 15 => self.pg_distrib_dir(pg_version).join("lib"), - _ => panic!("Unsupported postgres version: {}", pg_version), + 14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), + 15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), + _ => bail!("Unsupported postgres version: {}", pg_version), } } @@ -422,10 +422,10 @@ impl LocalEnv { "directory '{}' already exists. 
Perhaps already initialized?", base_path.display() ); - if !self.pg_bin_dir(pg_version).join("postgres").exists() { + if !self.pg_bin_dir(pg_version)?.join("postgres").exists() { bail!( "Can't find postgres binary at {}", - self.pg_bin_dir(pg_version).display() + self.pg_bin_dir(pg_version)?.display() ); } for binary in ["pageserver", "safekeeper"] { diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 7ffe19e209..f0203ce322 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -37,22 +37,22 @@ pub static REQUIRED_POSTGRES_CONFIG: Lazy> = Lazy::new(|| { }); impl Conf { - pub fn pg_distrib_dir(&self) -> PathBuf { + pub fn pg_distrib_dir(&self) -> Result { let path = self.pg_distrib_dir.clone(); match self.pg_version { - 14 => path.join(format!("v{}", self.pg_version)), - 15 => path.join(format!("v{}", self.pg_version)), - _ => panic!("Unsupported postgres version: {}", self.pg_version), + 14 => Ok(path.join(format!("v{}", self.pg_version))), + 15 => Ok(path.join(format!("v{}", self.pg_version))), + _ => bail!("Unsupported postgres version: {}", self.pg_version), } } - fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir().join("bin") + fn pg_bin_dir(&self) -> Result { + Ok(self.pg_distrib_dir()?.join("bin")) } - fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir().join("lib") + fn pg_lib_dir(&self) -> Result { + Ok(self.pg_distrib_dir()?.join("lib")) } pub fn wal_dir(&self) -> PathBuf { @@ -60,12 +60,12 @@ impl Conf { } fn new_pg_command(&self, command: impl AsRef) -> Result { - let path = self.pg_bin_dir().join(command); + let path = self.pg_bin_dir()?.join(command); ensure!(path.exists(), "Command {:?} does not exist", path); let mut cmd = Command::new(path); cmd.env_clear() - .env("LD_LIBRARY_PATH", self.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()); + .env("LD_LIBRARY_PATH", self.pg_lib_dir()?) 
+ .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?); Ok(cmd) } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b797866e43..2872fc6255 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -387,28 +387,28 @@ impl PageServerConf { // // Postgres distribution paths // - pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { + pub fn pg_distrib_dir(&self, pg_version: u32) -> Result { let path = self.pg_distrib_dir.clone(); match pg_version { - 14 => path.join(format!("v{pg_version}")), - 15 => path.join(format!("v{pg_version}")), - _ => panic!("Unsupported postgres version: {}", pg_version), + 14 => Ok(path.join(format!("v{pg_version}"))), + 15 => Ok(path.join(format!("v{pg_version}"))), + _ => bail!("Unsupported postgres version: {}", pg_version), } } - pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { + pub fn pg_bin_dir(&self, pg_version: u32) -> Result { match pg_version { - 14 => self.pg_distrib_dir(pg_version).join("bin"), - 15 => self.pg_distrib_dir(pg_version).join("bin"), - _ => panic!("Unsupported postgres version: {}", pg_version), + 14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), + 15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), + _ => bail!("Unsupported postgres version: {}", pg_version), } } - pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf { + pub fn pg_lib_dir(&self, pg_version: u32) -> Result { match pg_version { - 14 => self.pg_distrib_dir(pg_version).join("lib"), - 15 => self.pg_distrib_dir(pg_version).join("lib"), - _ => panic!("Unsupported postgres version: {}", pg_version), + 14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), + 15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), + _ => bail!("Unsupported postgres version: {}", pg_version), } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6aae740a78..0e9a6ce4ea 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1412,8 +1412,8 @@ fn run_initdb( initdb_target_dir: &Path, pg_version: u32, ) -> Result<()> { - let initdb_bin_path = conf.pg_bin_dir(pg_version).join("initdb"); - let initdb_lib_dir = conf.pg_lib_dir(pg_version); + let initdb_bin_path = conf.pg_bin_dir(pg_version)?.join("initdb"); + let initdb_lib_dir = conf.pg_lib_dir(pg_version)?; info!( "running {} in {}, libdir: {}", initdb_bin_path.display(), diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index b8874a0223..e683c301d8 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -610,13 +610,26 @@ impl PostgresRedoProcess { ); fs::remove_dir_all(&datadir)?; } + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).map_err(|e| { + Error::new( + ErrorKind::Other, + format!("incorrect pg_bin_dir path: {}", e), + ) + })?; + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).map_err(|e| { + Error::new( + ErrorKind::Other, + format!("incorrect pg_lib_dir path: {}", e), + ) + })?; + info!("running initdb in {}", datadir.display()); - let initdb = Command::new(conf.pg_bin_dir(pg_version).join("initdb")) + let initdb = Command::new(pg_bin_dir_path.join("initdb")) .args(&["-D", &datadir.to_string_lossy()]) .arg("-N") .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) // macOS .close_fds() .output() .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?; @@ -642,14 +655,14 @@ impl PostgresRedoProcess { } // Start postgres itself - let mut 
child = Command::new(conf.pg_bin_dir(pg_version).join("postgres")) + let mut child = Command::new(pg_bin_dir_path.join("postgres")) .arg("--wal-redo") .stdin(Stdio::piped()) .stderr(Stdio::piped()) .stdout(Stdio::piped()) .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) .env("PGDATA", &datadir) // The redo process is not trusted, so it runs in seccomp mode // (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't From 52e75fead996b66631e4d7354accb89d4860527b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 21 Oct 2022 11:50:29 +0300 Subject: [PATCH 0942/1022] Use anyhow::Result explicitly --- control_plane/src/local_env.rs | 8 ++++---- libs/postgres_ffi/wal_craft/src/lib.rs | 6 +++--- pageserver/src/config.rs | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 34ddb41f32..10b2db6396 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,7 +3,7 @@ //! Now it also provides init method which acts like a stub for proper installation //! script which will use local paths. -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{bail, ensure, Context}; use reqwest::Url; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; @@ -201,7 +201,7 @@ impl LocalEnv { self.pg_distrib_dir.clone() } - pub fn pg_distrib_dir(&self, pg_version: u32) -> Result { + pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); match pg_version { @@ -211,14 +211,14 @@ impl LocalEnv { } } - pub fn pg_bin_dir(&self, pg_version: u32) -> Result { + pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { match pg_version { 14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), 15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), _ => bail!("Unsupported postgres version: {}", pg_version), } } - pub fn pg_lib_dir(&self, pg_version: u32) -> Result { + pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { match pg_version { 14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), 15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index f0203ce322..c4404b37ba 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -37,7 +37,7 @@ pub static REQUIRED_POSTGRES_CONFIG: Lazy> = Lazy::new(|| { }); impl Conf { - pub fn pg_distrib_dir(&self) -> Result { + pub fn pg_distrib_dir(&self) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); match self.pg_version { @@ -47,11 +47,11 @@ impl Conf { } } - fn pg_bin_dir(&self) -> Result { + fn pg_bin_dir(&self) -> anyhow::Result { Ok(self.pg_distrib_dir()?.join("bin")) } - fn pg_lib_dir(&self) -> Result { + fn pg_lib_dir(&self) -> anyhow::Result { Ok(self.pg_distrib_dir()?.join("lib")) } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 2872fc6255..4f80fc96b5 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -387,7 +387,7 @@ impl PageServerConf { // // Postgres distribution paths // - pub fn pg_distrib_dir(&self, pg_version: u32) -> Result { + pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); match pg_version { @@ -397,14 +397,14 @@ impl 
PageServerConf { } } - pub fn pg_bin_dir(&self, pg_version: u32) -> Result { + pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { match pg_version { 14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), 15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), _ => bail!("Unsupported postgres version: {}", pg_version), } } - pub fn pg_lib_dir(&self, pg_version: u32) -> Result { + pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { match pg_version { 14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), 15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), From 39e4bdb99e6bcdb39462237e053314c717e82b3b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 21 Oct 2022 13:58:43 +0300 Subject: [PATCH 0943/1022] Actualize tenant and timeline API modifiers (#2661) * Actualize tenant and timeline API modifiers * Use anyhow::Result explicitly --- pageserver/src/tenant.rs | 90 +++---- pageserver/src/tenant/timeline.rs | 374 +++++++++++++++--------------- 2 files changed, 215 insertions(+), 249 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0e9a6ce4ea..69c89a80b4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -11,7 +11,7 @@ //! parent timeline, and the last LSN that has been written to disk. //! -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{bail, ensure, Context}; use tokio::sync::watch; use tracing::*; use utils::crashsafe::path_with_suffix_extension; @@ -25,7 +25,6 @@ use std::fs::File; use std::fs::OpenOptions; use std::io; use std::io::Write; -use std::num::NonZeroU64; use std::ops::Bound::Included; use std::path::Path; use std::path::PathBuf; @@ -292,7 +291,7 @@ impl TimelineUninitMark { Ok(()) } - fn delete_mark_file_if_present(&mut self) -> Result<(), anyhow::Error> { + fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> { let uninit_mark_file = &self.uninit_mark_path; let uninit_mark_parent = uninit_mark_file .parent() @@ -470,7 +469,7 @@ impl Tenant { horizon: u64, pitr: Duration, checkpoint_before_gc: bool, - ) -> Result { + ) -> anyhow::Result { let timeline_str = target_timeline_id .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); @@ -486,7 +485,7 @@ impl Tenant { /// This function is periodically called by compactor task. /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. - pub fn compaction_iteration(&self) -> Result<()> { + pub fn compaction_iteration(&self) -> anyhow::Result<()> { // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // compactions. We don't want to block everything else while the @@ -510,7 +509,7 @@ impl Tenant { /// /// Used at graceful shutdown. /// - pub fn checkpoint(&self) -> Result<()> { + pub fn checkpoint(&self) -> anyhow::Result<()> { // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // checkpoints. We don't want to block everything else while the @@ -681,7 +680,7 @@ impl Tenant { /// before the children. 
fn tree_sort_timelines( timelines: HashMap, -) -> Result> { +) -> anyhow::Result> { let mut result = Vec::with_capacity(timelines.len()); let mut now = Vec::with_capacity(timelines.len()); @@ -784,27 +783,6 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } - pub fn get_wal_receiver_connect_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .walreceiver_connect_timeout - .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout) - } - - pub fn get_lagging_wal_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .lagging_wal_timeout - .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout) - } - - pub fn get_max_lsn_wal_lag(&self) -> NonZeroU64 { - let tenant_conf = self.tenant_conf.read().unwrap(); - tenant_conf - .max_lsn_wal_lag - .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag) - } - pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) { self.tenant_conf.write().unwrap().update(&new_tenant_conf); } @@ -836,7 +814,7 @@ impl Tenant { )) } - pub fn new( + pub(super) fn new( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, walredo_mgr: Arc, @@ -859,7 +837,7 @@ impl Tenant { } /// Locate and load config - pub fn load_tenant_config( + pub(super) fn load_tenant_config( conf: &'static PageServerConf, tenant_id: TenantId, ) -> anyhow::Result { @@ -901,7 +879,7 @@ impl Tenant { Ok(tenant_conf) } - pub fn persist_tenant_config( + pub(super) fn persist_tenant_config( target_config_path: &Path, tenant_conf: TenantConfOpt, first_save: bool, @@ -994,7 +972,7 @@ impl Tenant { horizon: u64, pitr: Duration, checkpoint_before_gc: bool, - ) -> Result { + ) -> anyhow::Result { let mut totals: GcResult = Default::default(); let now = Instant::now(); @@ -1411,7 +1389,7 @@ fn run_initdb( conf: &'static PageServerConf, initdb_target_dir: &Path, pg_version: u32, -) -> Result<()> { +) -> anyhow::Result<()> { let initdb_bin_path = conf.pg_bin_dir(pg_version)?.join("initdb"); let initdb_lib_dir = conf.pg_lib_dir(pg_version)?; info!( @@ -1457,7 +1435,7 @@ impl Drop for Tenant { } } /// Dump contents of a layer file to stdout. 
-pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { +pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> anyhow::Result<()> { use std::os::unix::fs::FileExt; // All layer files start with a two-byte "magic" value, to identify the kind of @@ -1562,13 +1540,13 @@ pub mod harness { } impl<'a> TenantHarness<'a> { - pub fn create(test_name: &'static str) -> Result { + pub fn create(test_name: &'static str) -> anyhow::Result { Self::create_internal(test_name, false) } - pub fn create_exclusive(test_name: &'static str) -> Result { + pub fn create_exclusive(test_name: &'static str) -> anyhow::Result { Self::create_internal(test_name, true) } - fn create_internal(test_name: &'static str, exclusive: bool) -> Result { + fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result { let lock_guard = if exclusive { (None, Some(LOCK.write().unwrap())) } else { @@ -1602,7 +1580,7 @@ pub mod harness { self.try_load().expect("failed to load test tenant") } - pub fn try_load(&self) -> Result { + pub fn try_load(&self) -> anyhow::Result { let walredo_mgr = Arc::new(TestRedoManager); let tenant = Tenant::new( @@ -1682,7 +1660,7 @@ pub mod harness { }, records.len() ); - println!("{}", s); + println!("{s}"); Ok(TEST_IMG(&s)) } @@ -1706,7 +1684,7 @@ mod tests { Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001"))); #[test] - fn test_basic() -> Result<()> { + fn test_basic() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_basic")?.load(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? @@ -1730,7 +1708,7 @@ mod tests { } #[test] - fn no_duplicate_timelines() -> Result<()> { + fn no_duplicate_timelines() -> anyhow::Result<()> { let tenant = TenantHarness::create("no_duplicate_timelines")?.load(); let _ = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? @@ -1761,7 +1739,7 @@ mod tests { /// Test branch creation /// #[test] - fn test_branch() -> Result<()> { + fn test_branch() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_branch")?.load(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? @@ -1814,7 +1792,7 @@ mod tests { Ok(()) } - fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> Result<()> { + fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> { let mut lsn = start_lsn; #[allow(non_snake_case)] { @@ -1856,7 +1834,7 @@ mod tests { } #[test] - fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> { + fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? .load(); @@ -1888,7 +1866,7 @@ mod tests { } #[test] - fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> { + fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); @@ -1915,7 +1893,7 @@ mod tests { // FIXME: This currently fails to error out. Calling GC doesn't currently // remove the old value, we'd need to work a little harder #[test] - fn test_prohibit_get_for_garbage_collected_data() -> Result<()> { + fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> { let repo = RepoHarness::create("test_prohibit_get_for_garbage_collected_data")? 
.load(); @@ -1935,7 +1913,7 @@ mod tests { */ #[test] - fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { + fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); let tline = tenant @@ -1954,7 +1932,7 @@ mod tests { Ok(()) } #[test] - fn test_parent_keeps_data_forever_after_branching() -> Result<()> { + fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); let tline = tenant @@ -1982,7 +1960,7 @@ mod tests { } #[test] - fn timeline_load() -> Result<()> { + fn timeline_load() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load"; let harness = TenantHarness::create(TEST_NAME)?; { @@ -2003,7 +1981,7 @@ mod tests { } #[test] - fn timeline_load_with_ancestor() -> Result<()> { + fn timeline_load_with_ancestor() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load_with_ancestor"; let harness = TenantHarness::create(TEST_NAME)?; // create two timelines @@ -2042,7 +2020,7 @@ mod tests { } #[test] - fn corrupt_metadata() -> Result<()> { + fn corrupt_metadata() -> anyhow::Result<()> { const TEST_NAME: &str = "corrupt_metadata"; let harness = TenantHarness::create(TEST_NAME)?; let tenant = harness.load(); @@ -2084,7 +2062,7 @@ mod tests { } #[test] - fn test_images() -> Result<()> { + fn test_images() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_images")?.load(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? @@ -2136,7 +2114,7 @@ mod tests { // repeat 50 times. // #[test] - fn test_bulk_insert() -> Result<()> { + fn test_bulk_insert() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_bulk_insert")?.load(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? @@ -2178,7 +2156,7 @@ mod tests { } #[test] - fn test_random_updates() -> Result<()> { + fn test_random_updates() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_random_updates")?.load(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? @@ -2250,7 +2228,7 @@ mod tests { } #[test] - fn test_traverse_branches() -> Result<()> { + fn test_traverse_branches() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_traverse_branches")?.load(); let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? @@ -2331,7 +2309,7 @@ mod tests { } #[test] - fn test_traverse_ancestors() -> Result<()> { + fn test_traverse_ancestors() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_traverse_ancestors")?.load(); let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a0ac0adea2..ccd094b65a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,6 +1,6 @@ //! 
-use anyhow::{anyhow, bail, ensure, Context, Result}; +use anyhow::{anyhow, bail, ensure, Context}; use bytes::Bytes; use fail::fail_point; use itertools::Itertools; @@ -307,10 +307,6 @@ pub struct GcInfo { /// Public interface functions impl Timeline { - //------------------------------------------------------------------------------ - // Public GET functions - //------------------------------------------------------------------------------ - /// Get the LSN where this branch was created pub fn get_ancestor_lsn(&self) -> Lsn { self.ancestor_lsn @@ -445,7 +441,7 @@ impl Timeline { &self, lsn: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, - ) -> Result<()> { + ) -> anyhow::Result<()> { ensure!( lsn >= **latest_gc_cutoff_lsn, "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", @@ -455,12 +451,6 @@ impl Timeline { Ok(()) } - //------------------------------------------------------------------------------ - // Public PUT functions, to update the repository with new page versions. - // - // These are called by the WAL receiver to digest WAL records. - //------------------------------------------------------------------------------ - /// Flush to disk all data that was written with the put_* functions /// /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't @@ -479,6 +469,91 @@ impl Timeline { } } + pub fn compact(&self) -> anyhow::Result<()> { + let last_record_lsn = self.get_last_record_lsn(); + + // Last record Lsn could be zero in case the timelie was just created + if !last_record_lsn.is_valid() { + warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); + return Ok(()); + } + + // + // High level strategy for compaction / image creation: + // + // 1. First, calculate the desired "partitioning" of the + // currently in-use key space. The goal is to partition the + // key space into roughly fixed-size chunks, but also take into + // account any existing image layers, and try to align the + // chunk boundaries with the existing image layers to avoid + // too much churn. Also try to align chunk boundaries with + // relation boundaries. In principle, we don't know about + // relation boundaries here, we just deal with key-value + // pairs, and the code in pgdatadir_mapping.rs knows how to + // map relations into key-value pairs. But in practice we know + // that 'field6' is the block number, and the fields 1-5 + // identify a relation. This is just an optimization, + // though. + // + // 2. Once we know the partitioning, for each partition, + // decide if it's time to create a new image layer. The + // criteria is: there has been too much "churn" since the last + // image layer? The "churn" is fuzzy concept, it's a + // combination of too many delta files, or too much WAL in + // total in the delta file. Or perhaps: if creating an image + // file would allow to delete some older files. + // + // 3. After that, we compact all level0 delta files if there + // are too many of them. While compacting, we also garbage + // collect any page versions that are no longer needed because + // of the new image layers we created in step 2. + // + // TODO: This high level strategy hasn't been implemented yet. + // Below are functions compact_level0() and create_image_layers() + // but they are a bit ad hoc and don't quite work like it's explained + // above. Rewrite it. 
+ let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); + + let target_file_size = self.get_checkpoint_distance(); + + // Define partitioning schema if needed + + match self.repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + ) { + Ok((partitioning, lsn)) => { + // 2. Create new image layers for partitions that have been modified + // "enough". + let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; + if !layer_paths_to_upload.is_empty() + && self.upload_layers.load(atomic::Ordering::Relaxed) + { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + layer_paths_to_upload, + None, + ); + } + + // 3. Compact + let timer = self.metrics.compact_time_histo.start_timer(); + self.compact_level0(target_file_size)?; + timer.stop_and_record(); + } + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. + error!("could not compact, repartitioning keyspace failed: {err:?}"); + } + }; + + Ok(()) + } + /// Mutate the timeline with a [`TimelineWriter`]. pub fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { @@ -486,6 +561,80 @@ impl Timeline { _write_guard: self.write_lock.lock().unwrap(), } } + + /// Retrieve current logical size of the timeline. + /// + /// The size could be lagging behind the actual number, in case + /// the initial size calculation has not been run (gets triggered on the first size access). + pub fn get_current_logical_size(self: &Arc) -> anyhow::Result { + let current_size = self.current_logical_size.current_size()?; + debug!("Current size: {current_size:?}"); + + let size = current_size.size(); + if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) = + (current_size, self.current_logical_size.initial_part_end) + { + self.try_spawn_size_init_task(init_lsn); + } + + Ok(size) + } + + /// Check if more than 'checkpoint_distance' of WAL has been accumulated in + /// the in-memory layer, and initiate flushing it if so. + /// + /// Also flush after a period of time without new data -- it helps + /// safekeepers to regard pageserver as caught up and suspend activity. + pub fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { + let last_lsn = self.get_last_record_lsn(); + let layers = self.layers.read().unwrap(); + if let Some(open_layer) = &layers.open_layer { + let open_layer_size = open_layer.size()?; + drop(layers); + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); + let distance = last_lsn.widening_sub(last_freeze_at); + // Checkpointing the open layer can be triggered by layer size or LSN range. + // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and + // we want to stay below that with a big margin. The LSN distance determines how + // much WAL the safekeepers need to store. 
+ if distance >= self.get_checkpoint_distance().into() + || open_layer_size > self.get_checkpoint_distance() + || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) + { + info!( + "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", + distance, + open_layer_size, + last_freeze_ts.elapsed() + ); + + self.freeze_inmem_layer(true); + self.last_freeze_at.store(last_lsn); + *(self.last_freeze_ts.write().unwrap()) = Instant::now(); + + // Launch a task to flush the frozen layer to disk, unless + // a task was already running. (If the task was running + // at the time that we froze the layer, it must've seen the + // the layer we just froze before it exited; see comments + // in flush_frozen_layers()) + if let Ok(guard) = self.layer_flush_lock.try_lock() { + drop(guard); + let self_clone = Arc::clone(self); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::LayerFlushTask, + Some(self.tenant_id), + Some(self.timeline_id), + "layer flush task", + false, + async move { self_clone.flush_frozen_layers(false) }, + ); + } + } + } + Ok(()) + } } // Private functions @@ -529,7 +678,7 @@ impl Timeline { /// /// Loads the metadata for the timeline into memory, but not the layer map. #[allow(clippy::too_many_arguments)] - pub fn new( + pub(super) fn new( conf: &'static PageServerConf, tenant_conf: Arc>, metadata: TimelineMetadata, @@ -602,7 +751,7 @@ impl Timeline { result } - pub fn launch_wal_receiver(self: &Arc) { + pub(super) fn launch_wal_receiver(self: &Arc) { if !is_etcd_client_initialized() { if cfg!(test) { info!("not launching WAL receiver because etcd client hasn't been initialized"); @@ -641,7 +790,7 @@ impl Timeline { /// Scan the timeline directory to populate the layer map. /// Returns all timeline-related files that were found and loaded. /// - pub fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { + pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { let mut layers = self.layers.write().unwrap(); let mut num_layers = 0; @@ -727,30 +876,12 @@ impl Timeline { Ok(()) } - pub fn layer_removal_guard(&self) -> anyhow::Result> { + pub(super) fn layer_removal_guard(&self) -> anyhow::Result> { self.layer_removal_cs .try_lock() .map_err(|e| anyhow!("cannot lock compaction critical section {e}")) } - /// Retrieve current logical size of the timeline. - /// - /// The size could be lagging behind the actual number, in case - /// the initial size calculation has not been run (gets triggered on the first size access). - pub fn get_current_logical_size(self: &Arc) -> anyhow::Result { - let current_size = self.current_logical_size.current_size()?; - debug!("Current size: {current_size:?}"); - - let size = current_size.size(); - if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) = - (current_size, self.current_logical_size.initial_part_end) - { - self.try_spawn_size_init_task(init_lsn); - } - - Ok(size) - } - fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { let timeline_id = self.timeline_id; @@ -971,7 +1102,7 @@ impl Timeline { Some((lsn, img)) } - fn get_ancestor_timeline(&self) -> Result> { + fn get_ancestor_timeline(&self) -> anyhow::Result> { let ancestor = self.ancestor_timeline.as_ref().with_context(|| { format!( "Ancestor is missing. 
Timeline id: {} Ancestor id {:?}", @@ -1030,14 +1161,14 @@ impl Timeline { Ok(layer) } - fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { + fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> { //info!("PUT: key {} at {}", key, lsn); let layer = self.get_layer_for_write(lsn)?; layer.put_value(key, lsn, val)?; Ok(()) } - fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> Result<()> { + fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { let layer = self.get_layer_for_write(lsn)?; layer.put_tombstone(key_range, lsn)?; @@ -1076,64 +1207,6 @@ impl Timeline { drop(layers); } - /// - /// Check if more than 'checkpoint_distance' of WAL has been accumulated in - /// the in-memory layer, and initiate flushing it if so. - /// - /// Also flush after a period of time without new data -- it helps - /// safekeepers to regard pageserver as caught up and suspend activity. - /// - pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { - let last_lsn = self.get_last_record_lsn(); - let layers = self.layers.read().unwrap(); - if let Some(open_layer) = &layers.open_layer { - let open_layer_size = open_layer.size()?; - drop(layers); - let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); - let distance = last_lsn.widening_sub(last_freeze_at); - // Checkpointing the open layer can be triggered by layer size or LSN range. - // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and - // we want to stay below that with a big margin. The LSN distance determines how - // much WAL the safekeepers need to store. - if distance >= self.get_checkpoint_distance().into() - || open_layer_size > self.get_checkpoint_distance() - || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) - { - info!( - "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", - distance, - open_layer_size, - last_freeze_ts.elapsed() - ); - - self.freeze_inmem_layer(true); - self.last_freeze_at.store(last_lsn); - *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - - // Launch a task to flush the frozen layer to disk, unless - // a task was already running. (If the task was running - // at the time that we froze the layer, it must've seen the - // the layer we just froze before it exited; see comments - // in flush_frozen_layers()) - if let Ok(guard) = self.layer_flush_lock.try_lock() { - drop(guard); - let self_clone = Arc::clone(self); - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_id), - Some(self.timeline_id), - "layer flush task", - false, - async move { self_clone.flush_frozen_layers(false) }, - ); - } - } - } - Ok(()) - } - /// Flush all frozen layers to disk. /// /// Only one task at a time can be doing layer-flushing for a @@ -1141,7 +1214,7 @@ impl Timeline { /// currently doing the flushing, this function will wait for it /// to finish. If 'wait' is false, this function will return /// immediately instead. - fn flush_frozen_layers(&self, wait: bool) -> Result<()> { + fn flush_frozen_layers(&self, wait: bool) -> anyhow::Result<()> { let flush_lock_guard = if wait { self.layer_flush_lock.lock().unwrap() } else { @@ -1180,7 +1253,7 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. 
- fn flush_frozen_layer(&self, frozen_layer: Arc) -> Result<()> { + fn flush_frozen_layer(&self, frozen_layer: Arc) -> anyhow::Result<()> { // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the @@ -1238,7 +1311,7 @@ impl Timeline { &self, disk_consistent_lsn: Lsn, layer_paths_to_upload: HashMap, - ) -> Result<()> { + ) -> anyhow::Result<()> { // We can only save a valid 'prev_record_lsn' value on disk if we // flushed *all* in-memory changes to disk. We only track // 'prev_record_lsn' in memory for the latest processed record, so we @@ -1299,7 +1372,7 @@ impl Timeline { fn create_delta_layer( &self, frozen_layer: &InMemoryLayer, - ) -> Result<(PathBuf, LayerFileMetadata)> { + ) -> anyhow::Result<(PathBuf, LayerFileMetadata)> { // Write it out let new_delta = frozen_layer.write_to_disk()?; let new_delta_path = new_delta.path(); @@ -1334,92 +1407,7 @@ impl Timeline { Ok((new_delta_path, LayerFileMetadata::new(sz))) } - pub fn compact(&self) -> anyhow::Result<()> { - let last_record_lsn = self.get_last_record_lsn(); - - // Last record Lsn could be zero in case the timelie was just created - if !last_record_lsn.is_valid() { - warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); - return Ok(()); - } - - // - // High level strategy for compaction / image creation: - // - // 1. First, calculate the desired "partitioning" of the - // currently in-use key space. The goal is to partition the - // key space into roughly fixed-size chunks, but also take into - // account any existing image layers, and try to align the - // chunk boundaries with the existing image layers to avoid - // too much churn. Also try to align chunk boundaries with - // relation boundaries. In principle, we don't know about - // relation boundaries here, we just deal with key-value - // pairs, and the code in pgdatadir_mapping.rs knows how to - // map relations into key-value pairs. But in practice we know - // that 'field6' is the block number, and the fields 1-5 - // identify a relation. This is just an optimization, - // though. - // - // 2. Once we know the partitioning, for each partition, - // decide if it's time to create a new image layer. The - // criteria is: there has been too much "churn" since the last - // image layer? The "churn" is fuzzy concept, it's a - // combination of too many delta files, or too much WAL in - // total in the delta file. Or perhaps: if creating an image - // file would allow to delete some older files. - // - // 3. After that, we compact all level0 delta files if there - // are too many of them. While compacting, we also garbage - // collect any page versions that are no longer needed because - // of the new image layers we created in step 2. - // - // TODO: This high level strategy hasn't been implemented yet. - // Below are functions compact_level0() and create_image_layers() - // but they are a bit ad hoc and don't quite work like it's explained - // above. Rewrite it. - let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); - - let target_file_size = self.get_checkpoint_distance(); - - // Define partitioning schema if needed - - match self.repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - ) { - Ok((partitioning, lsn)) => { - // 2. 
Create new image layers for partitions that have been modified - // "enough". - let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; - if !layer_paths_to_upload.is_empty() - && self.upload_layers.load(atomic::Ordering::Relaxed) - { - storage_sync::schedule_layer_upload( - self.tenant_id, - self.timeline_id, - layer_paths_to_upload, - None, - ); - } - - // 3. Compact - let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(target_file_size)?; - timer.stop_and_record(); - } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - error!("could not compact, repartitioning keyspace failed: {err:?}"); - } - }; - - Ok(()) - } - - fn repartition(&self, lsn: Lsn, partition_size: u64) -> Result<(KeyPartitioning, Lsn)> { + fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> { let mut partitioning_guard = self.partitioning.lock().unwrap(); if partitioning_guard.1 == Lsn(0) || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold @@ -1433,7 +1421,7 @@ impl Timeline { } // Is it time to create a new image layer for the given partition? - fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result { + fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result { let layers = self.layers.read().unwrap(); for part_range in &partition.ranges { @@ -1478,7 +1466,7 @@ impl Timeline { partitioning: &KeyPartitioning, lsn: Lsn, force: bool, - ) -> Result> { + ) -> anyhow::Result> { let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); for partition in partitioning.parts.iter() { @@ -1571,7 +1559,7 @@ impl Timeline { /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as /// as Level 1 files. /// - fn compact_level0(&self, target_file_size: u64) -> Result<()> { + fn compact_level0(&self, target_file_size: u64) -> anyhow::Result<()> { let layers = self.layers.read().unwrap(); let mut level0_deltas = layers.get_level0_deltas()?; drop(layers); @@ -1881,12 +1869,12 @@ impl Timeline { /// /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine /// whether a record is needed for PITR. - pub fn update_gc_info( + pub(super) fn update_gc_info( &self, retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, - ) -> Result<()> { + ) -> anyhow::Result<()> { let mut gc_info = self.gc_info.write().unwrap(); gc_info.horizon_cutoff = cutoff_horizon; @@ -1941,7 +1929,7 @@ impl Timeline { /// within a layer file. We can only remove the whole file if it's fully /// obsolete. /// - pub fn gc(&self) -> Result { + pub(super) fn gc(&self) -> anyhow::Result { let mut result: GcResult = Default::default(); let now = SystemTime::now(); @@ -2261,11 +2249,11 @@ impl<'a> TimelineWriter<'a> { /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. 
- pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> { + pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> { self.tl.put_value(key, lsn, value) } - pub fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()> { + pub fn delete(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { self.tl.put_tombstone(key_range, lsn) } From 2709878b8be8f34602bc5273d31fe3ad07fd8aac Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Fri, 21 Oct 2022 14:21:22 +0300 Subject: [PATCH 0944/1022] Deploy scram proxies into new account (#2643) --- .../dev-us-east-2-beta.neon-proxy-scram.yaml | 31 +++++++++++++++++++ .github/workflows/build_and_test.yml | 28 +++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml new file mode 100644 index 0000000000..f2247fa4c1 --- /dev/null +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -0,0 +1,31 @@ +# Helm chart values for neon-proxy-scram. +# This is a YAML-formatted file. + +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "http://console-staging.local/management/api/v2" + domain: "*.us-east-2.aws.neon.build" + +# -- Additional labels for neon-proxy pods +podLabels: + zenith_service: proxy-scram + zenith_env: dev + zenith_region: us-east-2 + zenith_region_slug: us-east-2 + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build + +#metrics: +# enabled: true +# serviceMonitor: +# enabled: true +# selector: +# release: kube-prometheus-stack diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 460d73a552..14ee61c5b9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -825,3 +825,31 @@ jobs: DOCKER_TAG=${{needs.tag.outputs.build-tag}} helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + + deploy-proxy-new: + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
+ needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region us-east-2 eks update-kubeconfig --name dev-us-east-2-beta --role-arn arn:aws:iam::369495373322:role/github-runner + + - name: Re-deploy proxy + run: | + DOCKER_TAG=${{needs.tag.outputs.build-tag}} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s From 7480a0338a3aed73b531625f17df5089f0ab0830 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sun, 16 Oct 2022 15:51:21 +0400 Subject: [PATCH 0945/1022] Determine safekeeper for offloading WAL without etcd election API. This API is rather pointless, as sane choice anyway requires knowledge of peers status and leaders lifetime in any case can intersect, which is fine for us -- so manual elections are straightforward. Here, we deterministically choose among the reasonably caught up safekeepers, shifting by timeline id to spread the load. A step towards custom broker https://github.com/neondatabase/neon/issues/2394 --- control_plane/src/safekeeper.rs | 1 - libs/etcd_broker/src/subscription_value.rs | 3 + libs/utils/src/id.rs | 12 + .../src/walreceiver/connection_manager.rs | 31 ++ safekeeper/src/bin/safekeeper.rs | 40 ++- safekeeper/src/broker.rs | 115 +------ safekeeper/src/control_file_upgrade.rs | 13 +- safekeeper/src/lib.rs | 13 +- safekeeper/src/safekeeper.rs | 19 +- safekeeper/src/timeline.rs | 131 +++++-- safekeeper/src/wal_backup.rs | 322 +++++++++--------- 11 files changed, 363 insertions(+), 337 deletions(-) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 64a89124d2..17f5d0c109 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -123,7 +123,6 @@ impl SafekeeperNode { .args(&["--id", self.id.to_string().as_ref()]) .args(&["--listen-pg", &listen_pg]) .args(&["--listen-http", &listen_http]) - .args(&["--recall", "1 second"]) .arg("--daemonize"), ); if !self.conf.sync { diff --git a/libs/etcd_broker/src/subscription_value.rs b/libs/etcd_broker/src/subscription_value.rs index d3e2011761..60a5411926 100644 --- a/libs/etcd_broker/src/subscription_value.rs +++ b/libs/etcd_broker/src/subscription_value.rs @@ -29,6 +29,9 @@ pub struct SkTimelineInfo { #[serde_as(as = "Option")] #[serde(default)] pub peer_horizon_lsn: Option, + #[serde_as(as = "Option")] + #[serde(default)] + pub local_start_lsn: Option, /// A connection string to use for WAL receiving. #[serde(default)] pub safekeeper_connstr: Option, diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 059ce69ca4..f245f7c3d4 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -75,6 +75,12 @@ impl From<[u8; 16]> for Id { } } +impl From for u128 { + fn from(id: Id) -> Self { + u128::from_le_bytes(id.0) + } +} + impl fmt::Display for Id { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(&self.hex_encode()) @@ -136,6 +142,12 @@ macro_rules! 
id_newtype { } } + impl From<$t> for u128 { + fn from(id: $t) -> Self { + u128::from(id.0) + } + } + impl fmt::Display for $t { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 3a5d1c7ad6..2380caaff1 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -801,6 +801,7 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, safekeeper_connstr: None, }, etcd_version: 0, @@ -817,6 +818,8 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: Some("no commit_lsn".to_string()), }, etcd_version: 0, @@ -833,6 +836,7 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, safekeeper_connstr: Some("no commit_lsn".to_string()), }, etcd_version: 0, @@ -849,6 +853,7 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, safekeeper_connstr: None, }, etcd_version: 0, @@ -908,6 +913,8 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), }, etcd_version: 0, @@ -924,6 +931,8 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: Some("not advanced Lsn".to_string()), }, etcd_version: 0, @@ -940,6 +949,8 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: Some("not enough advanced Lsn".to_string()), }, etcd_version: 0, @@ -974,6 +985,8 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), }, etcd_version: 0, @@ -1006,6 +1019,8 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: Some("smaller commit_lsn".to_string()), }, etcd_version: 0, @@ -1022,6 +1037,8 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), }, etcd_version: 0, @@ -1038,6 +1055,8 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: None, }, etcd_version: 0, @@ -1083,6 +1102,8 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), }, etcd_version: 0, @@ -1099,6 +1120,8 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), }, etcd_version: 0, @@ -1168,6 +1191,8 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), }, etcd_version: 0, @@ -1184,6 +1209,8 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()), }, etcd_version: 0, @@ -1255,6 
+1282,8 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), }, etcd_version: 0, @@ -1326,6 +1355,8 @@ mod tests { backup_lsn: None, remote_consistent_lsn: None, peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), }, etcd_version: 0, diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 9422b55d60..a867aea5af 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -21,7 +21,8 @@ use metrics::set_build_info_metric; use safekeeper::broker; use safekeeper::control_file; use safekeeper::defaults::{ - DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, + DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, + DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, }; use safekeeper::http; use safekeeper::remove_wal; @@ -72,10 +73,6 @@ fn main() -> anyhow::Result<()> { conf.listen_http_addr = addr.to_string(); } - if let Some(recall) = arg_matches.get_one::("recall") { - conf.recall_period = humantime::parse_duration(recall)?; - } - let mut given_id = None; if let Some(given_id_str) = arg_matches.get_one::("id") { given_id = Some(NodeId( @@ -93,6 +90,16 @@ fn main() -> anyhow::Result<()> { conf.broker_etcd_prefix = prefix.to_string(); } + if let Some(heartbeat_timeout_str) = arg_matches.get_one::("heartbeat-timeout") { + conf.heartbeat_timeout = + humantime::parse_duration(heartbeat_timeout_str).with_context(|| { + format!( + "failed to parse heartbeat-timeout {}", + heartbeat_timeout_str + ) + })?; + } + if let Some(backup_threads) = arg_matches.get_one::("wal-backup-threads") { conf.backup_runtime_threads = backup_threads .parse() @@ -105,6 +112,14 @@ fn main() -> anyhow::Result<()> { let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again conf.remote_storage = Some(RemoteStorageConfig::from_toml(storage_conf_parsed_toml)?); } + if let Some(max_offloader_lag_str) = arg_matches.get_one::("max-offloader-lag") { + conf.max_offloader_lag_bytes = max_offloader_lag_str.parse().with_context(|| { + format!( + "failed to parse max offloader lag {}", + max_offloader_lag_str + ) + })?; + } // Seems like there is no better way to accept bool values explicitly in clap. conf.wal_backup_enabled = arg_matches .get_one::("enable-wal-backup") @@ -361,11 +376,6 @@ fn cli() -> Command { .short('p') .long("pageserver"), ) - .arg( - Arg::new("recall") - .long("recall") - .help("Period for requestion pageserver to call for replication"), - ) .arg( Arg::new("daemonize") .short('d') @@ -397,6 +407,11 @@ fn cli() -> Command { .long("broker-etcd-prefix") .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"), ) + .arg( + Arg::new("heartbeat-timeout") + .long("heartbeat-timeout") + .help(formatcp!("Peer is considered dead after not receiving heartbeats from it during this period (default {}s), passed as a human readable duration.", DEFAULT_HEARTBEAT_TIMEOUT.as_secs())) + ) .arg( Arg::new("wal-backup-threads").long("backup-threads").help(formatcp!("number of threads for wal backup (default {DEFAULT_WAL_BACKUP_RUNTIME_THREADS}")), ).arg( @@ -404,6 +419,11 @@ fn cli() -> Command { .long("remote-storage") .help("Remote storage configuration for WAL backup (offloading to s3) as TOML inline table, e.g. 
{\"max_concurrent_syncs\" = 17, \"max_sync_errors\": 13, \"bucket_name\": \"\", \"bucket_region\":\"\", \"concurrency_limit\": 119}.\nSafekeeper offloads WAL to [prefix_in_bucket/]//, mirroring structure on the file system.") ) + .arg( + Arg::new("max-offloader-lag") + .long("max-offloader-lag") + .help(formatcp!("Safekeeper won't be elected for WAL offloading if it is lagging for more than this value (default {}MB) in bytes", DEFAULT_MAX_OFFLOADER_LAG_BYTES / (1 << 20))) + ) .arg( Arg::new("enable-wal-backup") .long("enable-wal-backup") diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 6a2456ecda..76135241b9 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -1,6 +1,5 @@ //! Communication with etcd, providing safekeeper peers and pageserver coordination. -use anyhow::anyhow; use anyhow::Context; use anyhow::Error; use anyhow::Result; @@ -12,11 +11,9 @@ use std::collections::hash_map::Entry; use std::collections::HashMap; use std::collections::HashSet; use std::time::Duration; -use tokio::spawn; use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; -use url::Url; use crate::GlobalTimelines; use crate::SafeKeeperConf; @@ -56,113 +53,6 @@ fn timeline_safekeeper_path( ) } -pub struct Election { - pub election_name: String, - pub candidate_name: String, - pub broker_endpoints: Vec, -} - -impl Election { - pub fn new(election_name: String, candidate_name: String, broker_endpoints: Vec) -> Self { - Self { - election_name, - candidate_name, - broker_endpoints, - } - } -} - -pub struct ElectionLeader { - client: Client, - keep_alive: JoinHandle>, -} - -impl ElectionLeader { - pub async fn check_am_i( - &mut self, - election_name: String, - candidate_name: String, - ) -> Result { - let resp = self.client.leader(election_name).await?; - - let kv = resp - .kv() - .ok_or_else(|| anyhow!("failed to get leader response"))?; - let leader = kv.value_str()?; - - Ok(leader == candidate_name) - } - - pub async fn give_up(self) { - self.keep_alive.abort(); - // TODO: it'll be wise to resign here but it'll happen after lease expiration anyway - // should we await for keep alive termination? 
- let _ = self.keep_alive.await; - } -} - -pub async fn get_leader(req: &Election, leader: &mut Option) -> Result<()> { - let mut client = Client::connect(req.broker_endpoints.clone(), None) - .await - .context("Could not connect to etcd")?; - - let lease = client - .lease_grant(LEASE_TTL_SEC, None) - .await - .context("Could not acquire a lease"); - - let lease_id = lease.map(|l| l.id()).unwrap(); - - // kill previous keepalive, if any - if let Some(l) = leader.take() { - l.give_up().await; - } - - let keep_alive = spawn::<_>(lease_keep_alive(client.clone(), lease_id)); - // immediately save handle to kill task if we get canceled below - *leader = Some(ElectionLeader { - client: client.clone(), - keep_alive, - }); - - client - .campaign( - req.election_name.clone(), - req.candidate_name.clone(), - lease_id, - ) - .await?; - - Ok(()) -} - -async fn lease_keep_alive(mut client: Client, lease_id: i64) -> Result<()> { - let (mut keeper, mut ka_stream) = client - .lease_keep_alive(lease_id) - .await - .context("failed to create keepalive stream")?; - - loop { - let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); - - keeper - .keep_alive() - .await - .context("failed to send LeaseKeepAliveRequest")?; - - ka_stream - .message() - .await - .context("failed to receive LeaseKeepAliveResponse")?; - - sleep(push_interval).await; - } -} - -pub fn get_candiate_name(system_id: NodeId) -> String { - format!("id_{system_id}") -} - async fn push_sk_info( ttid: TenantTimelineId, mut client: Client, @@ -236,7 +126,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { let handles = active_tlis .iter() .map(|tli| { - let sk_info = tli.get_public_info(&conf); + let sk_info = tli.get_safekeeper_info(&conf); let key = timeline_safekeeper_path(conf.broker_etcd_prefix.clone(), tli.ttid, conf.my_id); let lease = leases.remove(&tli.ttid).unwrap(); @@ -282,6 +172,9 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { Some(new_info) => { // note: there are blocking operations below, but it's considered fine for now if let Ok(tli) = GlobalTimelines::get(new_info.key.id) { + // Note that we also receive *our own* info. That's + // important, as it is used as an indication of live + // connection to the broker. tli.record_safekeeper_info(&new_info.value, new_info.key.node_id) .await? } diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 1ce9186085..856c164be8 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,6 +1,7 @@ //! Code to deal with safekeeper control file upgrades use crate::safekeeper::{ - AcceptorState, Peers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry, + AcceptorState, PersistedPeers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, + TermSwitchEntry, }; use anyhow::{bail, Result}; use serde::{Deserialize, Serialize}; @@ -134,7 +135,7 @@ pub struct SafeKeeperStateV4 { // fundamental; but state is saved here only for informational purposes and // obviously can be stale. (Currently not saved at all, but let's provision // place to have less file version upgrades). 
- pub peers: Peers, + pub peers: PersistedPeers, } pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { @@ -165,7 +166,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), - peers: Peers(vec![]), + peers: PersistedPeers(vec![]), }); // migrate to hexing some ids } else if version == 2 { @@ -188,7 +189,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), - peers: Peers(vec![]), + peers: PersistedPeers(vec![]), }); // migrate to moving tenant_id/timeline_id to the top and adding some lsns } else if version == 3 { @@ -211,7 +212,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), - peers: Peers(vec![]), + peers: PersistedPeers(vec![]), }); // migrate to having timeline_start_lsn } else if version == 4 { @@ -234,7 +235,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result backup_lsn: Lsn::INVALID, peer_horizon_lsn: oldstate.peer_horizon_lsn, remote_consistent_lsn: Lsn(0), - peers: Peers(vec![]), + peers: PersistedPeers(vec![]), }); } else if version == 5 { info!("reading safekeeper control file version {}", version); diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index e38a5a4633..19dff79b88 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -1,4 +1,6 @@ -use defaults::DEFAULT_WAL_BACKUP_RUNTIME_THREADS; +use defaults::{ + DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, +}; // use remote_storage::RemoteStorageConfig; use std::path::PathBuf; @@ -34,8 +36,9 @@ pub mod defaults { DEFAULT_PG_LISTEN_PORT, }; - pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(10); pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8; + pub const DEFAULT_HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(5); + pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20); } #[derive(Debug, Clone)] @@ -52,7 +55,6 @@ pub struct SafeKeeperConf { pub no_sync: bool, pub listen_pg_addr: String, pub listen_http_addr: String, - pub recall_period: Duration, pub remote_storage: Option, pub backup_runtime_threads: usize, pub wal_backup_enabled: bool, @@ -60,6 +62,8 @@ pub struct SafeKeeperConf { pub broker_endpoints: Vec, pub broker_etcd_prefix: String, pub auth_validation_public_key_path: Option, + pub heartbeat_timeout: Duration, + pub max_offloader_lag_bytes: u64, } impl SafeKeeperConf { @@ -85,13 +89,14 @@ impl Default for SafeKeeperConf { listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), remote_storage: None, - recall_period: defaults::DEFAULT_RECALL_PERIOD, my_id: NodeId(0), broker_endpoints: Vec::new(), broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS, wal_backup_enabled: true, auth_validation_public_key_path: None, + heartbeat_timeout: DEFAULT_HEARTBEAT_TIMEOUT, + max_offloader_lag_bytes: DEFAULT_MAX_OFFLOADER_LAG_BYTES, } } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 7b11aaf92a..3f9b70f282 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -11,6 +11,7 @@ use std::cmp::max; use std::cmp::min; use std::fmt; use std::io::Read; + use tracing::*; use 
crate::control_file; @@ -132,9 +133,8 @@ pub struct ServerInfo { pub wal_seg_size: u32, } -/// Data published by safekeeper to the peers #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct PeerInfo { +pub struct PersistedPeerInfo { /// LSN up to which safekeeper offloaded WAL to s3. backup_lsn: Lsn, /// Term of the last entry. @@ -145,7 +145,7 @@ pub struct PeerInfo { commit_lsn: Lsn, } -impl PeerInfo { +impl PersistedPeerInfo { fn new() -> Self { Self { backup_lsn: Lsn::INVALID, @@ -156,10 +156,8 @@ impl PeerInfo { } } -// vector-based node id -> peer state map with very limited functionality we -// need/ #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Peers(pub Vec<(NodeId, PeerInfo)>); +pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); /// Persistent information stored on safekeeper node /// On disk data is prefixed by magic and format version and followed by checksum. @@ -203,7 +201,7 @@ pub struct SafeKeeperState { // fundamental; but state is saved here only for informational purposes and // obviously can be stale. (Currently not saved at all, but let's provision // place to have less file version upgrades). - pub peers: Peers, + pub peers: PersistedPeers, } #[derive(Debug, Clone)] @@ -240,7 +238,12 @@ impl SafeKeeperState { backup_lsn: local_start_lsn, peer_horizon_lsn: local_start_lsn, remote_consistent_lsn: Lsn(0), - peers: Peers(peers.iter().map(|p| (*p, PeerInfo::new())).collect()), + peers: PersistedPeers( + peers + .iter() + .map(|p| (*p, PersistedPeerInfo::new())) + .collect(), + ), } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 3fb77bf582..1930b3574a 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -7,7 +7,7 @@ use etcd_broker::subscription_value::SkTimelineInfo; use postgres_ffi::XLogSegNo; -use tokio::sync::watch; +use tokio::{sync::watch, time::Instant}; use std::cmp::{max, min}; @@ -26,7 +26,7 @@ use utils::{ use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, - SafekeeperMemState, ServerInfo, + SafekeeperMemState, ServerInfo, Term, }; use crate::send_wal::HotStandbyFeedback; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; @@ -36,6 +36,53 @@ use crate::wal_storage; use crate::wal_storage::Storage as wal_storage_iface; use crate::SafeKeeperConf; +/// Things safekeeper should know about timeline state on peers. +#[derive(Debug, Clone)] +pub struct PeerInfo { + pub sk_id: NodeId, + /// Term of the last entry. + _last_log_term: Term, + /// LSN of the last record. + _flush_lsn: Lsn, + pub commit_lsn: Lsn, + /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new + /// sk since backup_lsn. + pub local_start_lsn: Lsn, + /// When info was received. + ts: Instant, +} + +impl PeerInfo { + fn from_sk_info(sk_id: NodeId, sk_info: &SkTimelineInfo, ts: Instant) -> PeerInfo { + PeerInfo { + sk_id, + _last_log_term: sk_info.last_log_term.unwrap_or(0), + _flush_lsn: sk_info.flush_lsn.unwrap_or(Lsn::INVALID), + commit_lsn: sk_info.commit_lsn.unwrap_or(Lsn::INVALID), + local_start_lsn: sk_info.local_start_lsn.unwrap_or(Lsn::INVALID), + ts, + } + } +} + +// vector-based node id -> peer state map with very limited functionality we +// need. 
+#[derive(Debug, Clone, Default)] +pub struct PeersInfo(pub Vec); + +impl PeersInfo { + fn get(&mut self, id: NodeId) -> Option<&mut PeerInfo> { + self.0.iter_mut().find(|p| p.sk_id == id) + } + + fn upsert(&mut self, p: &PeerInfo) { + match self.get(p.sk_id) { + Some(rp) => *rp = p.clone(), + None => self.0.push(p.clone()), + } + } +} + /// Replica status update + hot standby feedback #[derive(Debug, Clone, Copy)] pub struct ReplicaState { @@ -74,6 +121,8 @@ impl ReplicaState { pub struct SharedState { /// Safekeeper object sk: SafeKeeper, + /// In memory list containing state of peers sent in latest messages from them. + peers_info: PeersInfo, /// State of replicas replicas: Vec>, /// True when WAL backup launcher oversees the timeline, making sure WAL is @@ -123,7 +172,8 @@ impl SharedState { Ok(Self { sk, - replicas: Vec::new(), + peers_info: PeersInfo(vec![]), + replicas: vec![], wal_backup_active: false, active: false, num_computes: 0, @@ -142,6 +192,7 @@ impl SharedState { Ok(Self { sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, + peers_info: PeersInfo(vec![]), replicas: Vec::new(), wal_backup_active: false, active: false, @@ -201,12 +252,6 @@ impl SharedState { self.wal_backup_active } - // Can this safekeeper offload to s3? Recently joined safekeepers might not - // have necessary WAL. - fn can_wal_backup(&self) -> bool { - self.sk.state.local_start_lsn <= self.sk.inmem.backup_lsn - } - fn get_wal_seg_size(&self) -> usize { self.sk.state.server.wal_seg_size as usize } @@ -268,6 +313,24 @@ impl SharedState { self.replicas.push(Some(state)); pos } + + fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { + SkTimelineInfo { + last_log_term: Some(self.sk.get_epoch()), + flush_lsn: Some(self.sk.wal_store.flush_lsn()), + // note: this value is not flushed to control file yet and can be lost + commit_lsn: Some(self.sk.inmem.commit_lsn), + // TODO: rework feedbacks to avoid max here + remote_consistent_lsn: Some(max( + self.get_replicas_state().remote_consistent_lsn, + self.sk.inmem.remote_consistent_lsn, + )), + peer_horizon_lsn: Some(self.sk.inmem.peer_horizon_lsn), + safekeeper_connstr: Some(conf.listen_pg_addr.clone()), + backup_lsn: Some(self.sk.inmem.backup_lsn), + local_start_lsn: Some(self.sk.state.local_start_lsn), + } + } } #[derive(Debug, thiserror::Error)] @@ -517,17 +580,6 @@ impl Timeline { self.write_shared_state().wal_backup_attend() } - /// Can this safekeeper offload to s3? Recently joined safekeepers might not - /// have necessary WAL. - pub fn can_wal_backup(&self) -> bool { - if self.is_cancelled() { - return false; - } - - let shared_state = self.write_shared_state(); - shared_state.can_wal_backup() - } - /// Returns full timeline info, required for the metrics. If the timeline is /// not active, returns None instead. pub fn info_for_metrics(&self) -> Option { @@ -632,36 +684,25 @@ impl Timeline { Ok(()) } - /// Return public safekeeper info for broadcasting to broker and other peers. - pub fn get_public_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { + /// Get safekeeper info for broadcasting to broker and other peers. 
+ pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { let shared_state = self.write_shared_state(); - SkTimelineInfo { - last_log_term: Some(shared_state.sk.get_epoch()), - flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()), - // note: this value is not flushed to control file yet and can be lost - commit_lsn: Some(shared_state.sk.inmem.commit_lsn), - // TODO: rework feedbacks to avoid max here - remote_consistent_lsn: Some(max( - shared_state.get_replicas_state().remote_consistent_lsn, - shared_state.sk.inmem.remote_consistent_lsn, - )), - peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn), - safekeeper_connstr: Some(conf.listen_pg_addr.clone()), - backup_lsn: Some(shared_state.sk.inmem.backup_lsn), - } + shared_state.get_safekeeper_info(conf) } /// Update timeline state with peer safekeeper data. pub async fn record_safekeeper_info( &self, sk_info: &SkTimelineInfo, - _sk_id: NodeId, + sk_id: NodeId, ) -> Result<()> { let is_wal_backup_action_pending: bool; let commit_lsn: Lsn; { let mut shared_state = self.write_shared_state(); shared_state.sk.record_safekeeper_info(sk_info)?; + let peer_info = PeerInfo::from_sk_info(sk_id, sk_info, Instant::now()); + shared_state.peers_info.upsert(&peer_info); is_wal_backup_action_pending = shared_state.update_status(self.ttid); commit_lsn = shared_state.sk.inmem.commit_lsn; } @@ -673,6 +714,22 @@ impl Timeline { Ok(()) } + /// Get our latest view of alive peers status on the timeline. + /// We pass our own info through the broker as well, so when we don't have connection + /// to the broker returned vec is empty. + pub fn get_peers(&self, conf: &SafeKeeperConf) -> Vec { + let shared_state = self.write_shared_state(); + let now = Instant::now(); + shared_state + .peers_info + .0 + .iter() + // Regard peer as absent if we haven't heard from it within heartbeat_timeout. + .filter(|p| now.duration_since(p.ts) <= conf.heartbeat_timeout) + .cloned() + .collect() + } + /// Add send_wal replica to the in-memory vector of replicas. pub fn add_replica(&self, state: ReplicaState) -> usize { self.write_shared_state().add_replica(state) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index c82a003161..13287bd036 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -1,8 +1,7 @@ use anyhow::{Context, Result}; -use etcd_broker::subscription_key::{ - NodeKind, OperationKind, SkOperationKind, SubscriptionKey, SubscriptionKind, -}; + use tokio::task::JoinHandle; +use utils::id::NodeId; use std::cmp::min; use std::collections::HashMap; @@ -26,14 +25,11 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; -use crate::broker::{Election, ElectionLeader}; -use crate::timeline::Timeline; -use crate::{broker, GlobalTimelines, SafeKeeperConf}; +use crate::timeline::{PeerInfo, Timeline}; +use crate::{GlobalTimelines, SafeKeeperConf}; use once_cell::sync::OnceCell; -const BROKER_CONNECTION_RETRY_DELAY_MS: u64 = 1000; - const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; @@ -70,47 +66,104 @@ struct WalBackupTimelineEntry { handle: Option, } -/// Start per timeline task, if it makes sense for this safekeeper to offload. -fn consider_start_task( +async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry) { + if let Some(wb_handle) = entry.handle.take() { + // Tell the task to shutdown. Error means task exited earlier, that's ok. + let _ = wb_handle.shutdown_tx.send(()).await; + // Await the task itself. 
TODO: restart panicked tasks earlier. + if let Err(e) = wb_handle.handle.await { + warn!("WAL backup task for {} panicked: {}", ttid, e); + } + } +} + +/// The goal is to ensure that normally only one safekeepers offloads. However, +/// it is fine (and inevitable, as s3 doesn't provide CAS) that for some short +/// time we have several ones as they PUT the same files. Also, +/// - frequently changing the offloader would be bad; +/// - electing seriously lagging safekeeper is undesirable; +/// So we deterministically choose among the reasonably caught up candidates. +/// TODO: take into account failed attempts to deal with hypothetical situation +/// where s3 is unreachable only for some sks. +fn determine_offloader( + alive_peers: &[PeerInfo], + wal_backup_lsn: Lsn, + ttid: TenantTimelineId, + conf: &SafeKeeperConf, +) -> (Option, String) { + // TODO: remove this once we fill newly joined safekeepers since backup_lsn. + let capable_peers = alive_peers + .iter() + .filter(|p| p.local_start_lsn <= wal_backup_lsn); + match alive_peers.iter().map(|p| p.commit_lsn).max() { + None => (None, "no connected peers to elect from".to_string()), + Some(max_commit_lsn) => { + let threshold = max_commit_lsn + .checked_sub(conf.max_offloader_lag_bytes) + .unwrap_or(Lsn(0)); + let mut caughtup_peers = capable_peers + .clone() + .filter(|p| p.commit_lsn >= threshold) + .collect::>(); + caughtup_peers.sort_by(|p1, p2| p1.sk_id.cmp(&p2.sk_id)); + + // To distribute the load, shift by timeline_id. + let offloader = caughtup_peers + [(u128::from(ttid.timeline_id) % caughtup_peers.len() as u128) as usize] + .sk_id; + + let mut capable_peers_dbg = capable_peers + .map(|p| (p.sk_id, p.commit_lsn)) + .collect::>(); + capable_peers_dbg.sort_by(|p1, p2| p1.0.cmp(&p2.0)); + ( + Some(offloader), + format!( + "elected {} among {:?} peers, with {} of them being caughtup", + offloader, + capable_peers_dbg, + caughtup_peers.len() + ), + ) + } + } +} + +/// Based on peer information determine which safekeeper should offload; if it +/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task +/// is running, kill it. +async fn update_task( conf: &SafeKeeperConf, ttid: TenantTimelineId, - task: &mut WalBackupTimelineEntry, + entry: &mut WalBackupTimelineEntry, ) { - if !task.timeline.can_wal_backup() { - return; + let alive_peers = entry.timeline.get_peers(conf); + let wal_backup_lsn = entry.timeline.get_wal_backup_lsn(); + let (offloader, election_dbg_str) = + determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf); + let elected_me = Some(conf.my_id) == offloader; + + if elected_me != (entry.handle.is_some()) { + if elected_me { + info!("elected for backup {}: {}", ttid, election_dbg_str); + + let (shutdown_tx, shutdown_rx) = mpsc::channel(1); + let timeline_dir = conf.timeline_dir(&ttid); + + let handle = tokio::spawn( + backup_task_main(ttid, timeline_dir, shutdown_rx) + .instrument(info_span!("WAL backup task", ttid = %ttid)), + ); + + entry.handle = Some(WalBackupTaskHandle { + shutdown_tx, + handle, + }); + } else { + info!("stepping down from backup {}: {}", ttid, election_dbg_str); + shut_down_task(ttid, entry).await; + } } - info!("starting WAL backup task for {}", ttid); - - // TODO: decide who should offload right here by simply checking current - // state instead of running elections in offloading task. 
- let election_name = SubscriptionKey { - cluster_prefix: conf.broker_etcd_prefix.clone(), - kind: SubscriptionKind::Operation( - ttid, - NodeKind::Safekeeper, - OperationKind::Safekeeper(SkOperationKind::WalBackup), - ), - } - .watch_key(); - let my_candidate_name = broker::get_candiate_name(conf.my_id); - let election = broker::Election::new( - election_name, - my_candidate_name, - conf.broker_endpoints.clone(), - ); - - let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let timeline_dir = conf.timeline_dir(&ttid); - - let handle = tokio::spawn( - backup_task_main(ttid, timeline_dir, shutdown_rx, election) - .instrument(info_span!("WAL backup task", ttid = %ttid)), - ); - - task.handle = Some(WalBackupTaskHandle { - shutdown_tx, - handle, - }); } const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; @@ -158,27 +211,20 @@ async fn wal_backup_launcher_main_loop( timeline, handle: None, }); - consider_start_task(&conf, ttid, entry); + update_task(&conf, ttid, entry).await; } else { // need to stop the task info!("stopping WAL backup task for {}", ttid); - - let entry = tasks.remove(&ttid).unwrap(); - if let Some(wb_handle) = entry.handle { - // Tell the task to shutdown. Error means task exited earlier, that's ok. - let _ = wb_handle.shutdown_tx.send(()).await; - // Await the task itself. TODO: restart panicked tasks earlier. - if let Err(e) = wb_handle.handle.await { - warn!("WAL backup task for {} panicked: {}", ttid, e); - } - } + let mut entry = tasks.remove(&ttid).unwrap(); + shut_down_task(ttid, &mut entry).await; } } } - // Start known tasks, if needed and possible. + // For each timeline needing offloading, check if this safekeeper + // should do the job and start/stop the task accordingly. _ = ticker.tick() => { - for (ttid, entry) in tasks.iter_mut().filter(|(_, entry)| entry.handle.is_none()) { - consider_start_task(&conf, *ttid, entry); + for (ttid, entry) in tasks.iter_mut() { + update_task(&conf, *ttid, entry).await; } } } @@ -190,17 +236,13 @@ struct WalBackupTask { timeline_dir: PathBuf, wal_seg_size: usize, commit_lsn_watch_rx: watch::Receiver, - leader: Option, - election: Election, } -/// Offload single timeline. Called only after we checked that backup -/// is required (wal_backup_attend) and possible (can_wal_backup). +/// Offload single timeline. async fn backup_task_main( ttid: TenantTimelineId, timeline_dir: PathBuf, mut shutdown_rx: Receiver<()>, - election: Election, ) { info!("started"); let res = GlobalTimelines::get(ttid); @@ -215,8 +257,6 @@ async fn backup_task_main( commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), timeline: tli, timeline_dir, - leader: None, - election, }; // task is spinned up only when wal_seg_size already initialized @@ -229,9 +269,6 @@ async fn backup_task_main( canceled = true; } } - if let Some(l) = wb.leader { - l.give_up().await; - } info!("task {}", if canceled { "canceled" } else { "terminated" }); } @@ -239,106 +276,71 @@ impl WalBackupTask { async fn run(&mut self) { let mut backup_lsn = Lsn(0); - // election loop + let mut retry_attempt = 0u32; + // offload loop loop { - let mut retry_attempt = 0u32; + if retry_attempt == 0 { + // wait for new WAL to arrive + if let Err(e) = self.commit_lsn_watch_rx.changed().await { + // should never happen, as we hold Arc to timeline. 
+ error!("commit_lsn watch shut down: {:?}", e); + return; + } + } else { + // or just sleep if we errored previously + let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS; + if let Some(backoff_delay) = UPLOAD_FAILURE_RETRY_MIN_MS.checked_shl(retry_attempt) + { + retry_delay = min(retry_delay, backoff_delay); + } + sleep(Duration::from_millis(retry_delay)).await; + } - info!("acquiring leadership"); - if let Err(e) = broker::get_leader(&self.election, &mut self.leader).await { - error!("error during leader election {:?}", e); - sleep(Duration::from_millis(BROKER_CONNECTION_RETRY_DELAY_MS)).await; + let commit_lsn = *self.commit_lsn_watch_rx.borrow(); + + // Note that backup_lsn can be higher than commit_lsn if we + // don't have much local WAL and others already uploaded + // segments we don't even have. + if backup_lsn.segment_number(self.wal_seg_size) + >= commit_lsn.segment_number(self.wal_seg_size) + { + retry_attempt = 0; + continue; /* nothing to do, common case as we wake up on every commit_lsn bump */ + } + // Perhaps peers advanced the position, check shmem value. + backup_lsn = self.timeline.get_wal_backup_lsn(); + if backup_lsn.segment_number(self.wal_seg_size) + >= commit_lsn.segment_number(self.wal_seg_size) + { + retry_attempt = 0; continue; } - info!("acquired leadership"); - // offload loop - loop { - if retry_attempt == 0 { - // wait for new WAL to arrive - if let Err(e) = self.commit_lsn_watch_rx.changed().await { - // should never happen, as we hold Arc to timeline. - error!("commit_lsn watch shut down: {:?}", e); + match backup_lsn_range( + backup_lsn, + commit_lsn, + self.wal_seg_size, + &self.timeline_dir, + ) + .await + { + Ok(backup_lsn_result) => { + backup_lsn = backup_lsn_result; + let res = self.timeline.set_wal_backup_lsn(backup_lsn_result); + if let Err(e) = res { + error!("failed to set wal_backup_lsn: {}", e); return; } - } else { - // or just sleep if we errored previously - let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS; - if let Some(backoff_delay) = - UPLOAD_FAILURE_RETRY_MIN_MS.checked_shl(retry_attempt) - { - retry_delay = min(retry_delay, backoff_delay); - } - sleep(Duration::from_millis(retry_delay)).await; + retry_attempt = 0; } + Err(e) => { + error!( + "failed while offloading range {}-{}: {:?}", + backup_lsn, commit_lsn, e + ); - let commit_lsn = *self.commit_lsn_watch_rx.borrow(); - - // Note that backup_lsn can be higher than commit_lsn if we - // don't have much local WAL and others already uploaded - // segments we don't even have. - if backup_lsn.segment_number(self.wal_seg_size) - >= commit_lsn.segment_number(self.wal_seg_size) - { - continue; /* nothing to do, common case as we wake up on every commit_lsn bump */ - } - // Perhaps peers advanced the position, check shmem value. 
- backup_lsn = self.timeline.get_wal_backup_lsn(); - if backup_lsn.segment_number(self.wal_seg_size) - >= commit_lsn.segment_number(self.wal_seg_size) - { - continue; - } - - if let Some(l) = self.leader.as_mut() { - // Optimization idea for later: - // Avoid checking election leader every time by returning current lease grant expiration time - // Re-check leadership only after expiration time, - // such approach would reduce overhead on write-intensive workloads - - match l - .check_am_i( - self.election.election_name.clone(), - self.election.candidate_name.clone(), - ) - .await - { - Ok(leader) => { - if !leader { - info!("lost leadership"); - break; - } - } - Err(e) => { - warn!("error validating leader, {:?}", e); - break; - } - } - } - - match backup_lsn_range( - backup_lsn, - commit_lsn, - self.wal_seg_size, - &self.timeline_dir, - ) - .await - { - Ok(backup_lsn_result) => { - backup_lsn = backup_lsn_result; - let res = self.timeline.set_wal_backup_lsn(backup_lsn_result); - if let Err(e) = res { - error!("backup error: {}", e); - return; - } - retry_attempt = 0; - } - Err(e) => { - error!( - "failed while offloading range {}-{}: {:?}", - backup_lsn, commit_lsn, e - ); - - retry_attempt = min(retry_attempt + 1, u32::MAX); + if retry_attempt < u32::MAX { + retry_attempt += 1; } } } From 6ff2c61ae0b80a5d53421a32d8c31c6a742b0072 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Fri, 21 Oct 2022 16:44:08 +0300 Subject: [PATCH 0946/1022] Refactor safekeeper s3 config and change it for new account (#2672) --- .github/ansible/neon-stress.hosts.yaml | 2 +- .github/ansible/production.hosts.yaml | 2 +- .github/ansible/staging.hosts.yaml | 2 +- .github/ansible/staging.us-east-2.hosts.yaml | 2 +- .github/ansible/systemd/safekeeper.service | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ansible/neon-stress.hosts.yaml b/.github/ansible/neon-stress.hosts.yaml index 8afc9a5be8..dd61ac5a5e 100644 --- a/.github/ansible/neon-stress.hosts.yaml +++ b/.github/ansible/neon-stress.hosts.yaml @@ -3,7 +3,6 @@ storage: bucket_name: neon-storage-ireland bucket_region: eu-west-1 console_mgmt_base_url: http://neon-stress-console.local - env_name: neon-stress etcd_endpoints: neon-stress-etcd.local:2379 safekeeper_enable_s3_offload: 'false' pageserver_config_stub: @@ -12,6 +11,7 @@ storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" prefix_in_bucket: "{{ inventory_hostname }}" + safekeeper_s3_prefix: neon-stress/wal hostname_suffix: ".local" remote_user: admin children: diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml index 9f9b12d25d..bca2614399 100644 --- a/.github/ansible/production.hosts.yaml +++ b/.github/ansible/production.hosts.yaml @@ -1,7 +1,6 @@ --- storage: vars: - env_name: prod-1 console_mgmt_base_url: http://console-release.local bucket_name: zenith-storage-oregon bucket_region: us-west-2 @@ -12,6 +11,7 @@ storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" prefix_in_bucket: "{{ inventory_hostname }}" + safekeeper_s3_prefix: prod-1/wal hostname_suffix: ".local" remote_user: admin diff --git a/.github/ansible/staging.hosts.yaml b/.github/ansible/staging.hosts.yaml index 7e91e8e728..44d971455d 100644 --- a/.github/ansible/staging.hosts.yaml +++ b/.github/ansible/staging.hosts.yaml @@ -3,7 +3,6 @@ storage: bucket_name: zenith-staging-storage-us-east-1 bucket_region: us-east-1 console_mgmt_base_url: http://console-staging.local - env_name: us-stage etcd_endpoints: 
zenith-us-stage-etcd.local:2379 pageserver_config_stub: pg_distrib_dir: /usr/local @@ -11,6 +10,7 @@ storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" prefix_in_bucket: "{{ inventory_hostname }}" + safekeeper_s3_prefix: us-stage/wal hostname_suffix: ".local" remote_user: admin diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 5da0cce973..db3ed87c45 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -3,7 +3,6 @@ storage: bucket_name: neon-staging-storage-us-east-2 bucket_region: us-east-2 console_mgmt_base_url: http://console-staging.local - env_name: us-stage etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379 pageserver_config_stub: pg_distrib_dir: /usr/local @@ -11,6 +10,7 @@ storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" prefix_in_bucket: "pageserver/v1" + safekeeper_s3_prefix: safekeeper/v1/wal hostname_suffix: "" remote_user: ssm-user ansible_aws_ssm_region: us-east-2 diff --git a/.github/ansible/systemd/safekeeper.service b/.github/ansible/systemd/safekeeper.service index 877579fbfa..69827e36ac 100644 --- a/.github/ansible/systemd/safekeeper.service +++ b/.github/ansible/systemd/safekeeper.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=safekeeper Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib -ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}' +ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT From 5928cb33c553913c28c7857f126fbab9d3537ff6 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 21 Oct 2022 18:51:48 +0300 Subject: [PATCH 0947/1022] Introduce timeline state (#2651) Similar to https://github.com/neondatabase/neon/pull/2395, introduces a state field in Timeline, that's possible to subscribe to. Adjusts * walreceiver to not to have any connections if timeline is not Active * remote storage sync to not to schedule uploads if timeline is Broken * not to create timelines if a tenant/timeline is broken * automatically switches timelines' states based on tenant state Does not adjust timeline's gc, checkpointing and layer flush behaviour much, since it's not safe to cancel these processes abruptly and there's task_mgr::shutdown_tasks that does similar thing. 
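The mechanics behind "possible to subscribe to" are a tokio watch channel per timeline. The snippet below is only an illustrative sketch, not the code from this patch (the real field and method names in timeline.rs may differ); the enum simply mirrors the TimelineState added to pageserver_api::models below. It shows how a set_state()/current_state() pair plus a subscribe handle let background tasks await a transition instead of polling:

    use tokio::sync::watch;

    // Mirrors the enum added to pageserver_api::models in this patch
    // (serde derives omitted in this sketch).
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum TimelineState {
        Active,
        Suspended,
        Paused,
        Broken,
    }

    // Hypothetical holder for the state field; the real Timeline struct
    // obviously carries much more than this.
    struct TimelineStateCell {
        // The sender side also stores the current value.
        tx: watch::Sender<TimelineState>,
    }

    impl TimelineStateCell {
        fn new(initial: TimelineState) -> Self {
            let (tx, _rx) = watch::channel(initial);
            Self { tx }
        }

        fn set_state(&self, new_state: TimelineState) {
            // Overwrites the stored value and wakes every subscriber.
            self.tx.send_replace(new_state);
        }

        fn current_state(&self) -> TimelineState {
            *self.tx.borrow()
        }

        fn subscribe(&self) -> watch::Receiver<TimelineState> {
            self.tx.subscribe()
        }
    }

    // Example subscriber: wait until the timeline becomes Active (or is
    // dropped), e.g. before letting a task open any connections.
    async fn wait_until_active(mut rx: watch::Receiver<TimelineState>) {
        loop {
            let state = *rx.borrow();
            if state == TimelineState::Active {
                return;
            }
            if rx.changed().await.is_err() {
                // The sender (timeline) is gone; stop waiting.
                return;
            }
        }
    }

A watch channel keeps only the latest value, which fits state transitions well: late subscribers immediately observe the current state, and every send_replace wakes all of them without any queueing.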
--- libs/pageserver_api/src/models.rs | 18 ++ pageserver/src/http/openapi_spec.yml | 3 + pageserver/src/http/routes.rs | 31 ++- pageserver/src/page_service.rs | 3 +- pageserver/src/tenant.rs | 240 +++++++++++------- pageserver/src/tenant/timeline.rs | 101 ++++++-- pageserver/src/tenant_tasks.rs | 2 +- .../src/walreceiver/connection_manager.rs | 104 ++++++-- test_runner/regress/test_broken_timeline.py | 8 +- test_runner/regress/test_timeline_delete.py | 2 +- 10 files changed, 367 insertions(+), 145 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index a153f1a01e..dd40ba9e1c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -19,6 +19,22 @@ pub enum TenantState { Broken, } +/// A state of a timeline in pageserver's memory. +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum TimelineState { + /// Timeline is fully operational, its background jobs are running. + Active, + /// A timeline is recognized by pageserver, but not yet ready to operate. + /// The status indicates, that the timeline could eventually go back to Active automatically: + /// for example, if the owning tenant goes back to Active again. + Suspended, + /// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to + /// automatically become Active after certain events: only a management call can change this status. + Paused, + /// A timeline is recognized by the pageserver, but no longer used for any operations, as failed to get activated. + Broken, +} + #[serde_as] #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { @@ -160,6 +176,8 @@ pub struct TimelineInfo { pub remote_consistent_lsn: Option, pub awaits_download: bool, + pub state: TimelineState, + // Some of the above fields are duplicated in 'local' and 'remote', for backwards- // compatility with older clients. pub local: LocalTimelineInfo, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 626cc07429..89609f5674 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -618,6 +618,7 @@ components: - last_record_lsn - disk_consistent_lsn - awaits_download + - state properties: timeline_id: type: string @@ -660,6 +661,8 @@ components: type: integer awaits_download: type: boolean + state: + type: string # These 'local' and 'remote' fields just duplicate some of the fields # above. They are kept for backwards-compatibility. They can be removed, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 489adbb2cf..8ec7604b8a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -129,6 +129,7 @@ async fn build_timeline_info( } }; let current_physical_size = Some(timeline.get_physical_size()); + let state = timeline.current_state(); let info = TimelineInfo { tenant_id: timeline.tenant_id, @@ -158,6 +159,7 @@ async fn build_timeline_info( remote_consistent_lsn, awaits_download, + state, // Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility // with the control plane. 
@@ -294,7 +296,7 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result format!("{}", lsn), + LsnForTimestamp::Present(lsn) => format!("{lsn}"), LsnForTimestamp::Future(_lsn) => "future".into(), LsnForTimestamp::Past(_lsn) => "past".into(), LsnForTimestamp::NoData(_lsn) => "nodata".into(), @@ -788,16 +789,16 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result) -> Result Result> { - tenant_mgr::get_tenant(tenant_id, true).and_then(|tenant| tenant.get_timeline(timeline_id)) + tenant_mgr::get_tenant(tenant_id, true) + .and_then(|tenant| tenant.get_timeline(timeline_id, true)) } /// diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 69c89a80b4..84833e9c40 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -11,7 +11,8 @@ //! parent timeline, and the last LSN that has been written to disk. //! -use anyhow::{bail, ensure, Context}; +use anyhow::{bail, Context}; +use pageserver_api::models::TimelineState; use tokio::sync::watch; use tracing::*; use utils::crashsafe::path_with_suffix_extension; @@ -189,6 +190,7 @@ impl UninitializedTimeline<'_> { "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" ) })?; + new_timeline.set_state(TimelineState::Active); v.insert(Arc::clone(&new_timeline)); new_timeline.launch_wal_receiver(); } @@ -338,18 +340,26 @@ impl Tenant { /// Get Timeline handle for given Neon timeline ID. /// This function is idempotent. It doesn't change internal state in any way. - pub fn get_timeline(&self, timeline_id: TimelineId) -> anyhow::Result> { - self.timelines - .lock() - .unwrap() - .get(&timeline_id) - .with_context(|| { - format!( - "Timeline {} was not found for tenant {}", - timeline_id, self.tenant_id - ) - }) - .map(Arc::clone) + pub fn get_timeline( + &self, + timeline_id: TimelineId, + active_only: bool, + ) -> anyhow::Result> { + let timelines_accessor = self.timelines.lock().unwrap(); + let timeline = timelines_accessor.get(&timeline_id).with_context(|| { + format!("Timeline {}/{} was not found", self.tenant_id, timeline_id) + })?; + + if active_only && !timeline.is_active() { + anyhow::bail!( + "Timeline {}/{} is not active, state: {:?}", + self.tenant_id, + timeline_id, + timeline.current_state() + ) + } else { + Ok(Arc::clone(timeline)) + } } /// Lists timelines the tenant contains. 
@@ -372,6 +382,11 @@ impl Tenant { initdb_lsn: Lsn, pg_version: u32, ) -> anyhow::Result { + anyhow::ensure!( + self.is_active(), + "Cannot create empty timelines on inactive tenant" + ); + let timelines = self.timelines.lock().unwrap(); let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id, &timelines)?; drop(timelines); @@ -408,9 +423,14 @@ impl Tenant { mut ancestor_start_lsn: Option, pg_version: u32, ) -> anyhow::Result>> { + anyhow::ensure!( + self.is_active(), + "Cannot create timelines on inactive tenant" + ); + let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); - if self.get_timeline(new_timeline_id).is_ok() { + if self.get_timeline(new_timeline_id, false).is_ok() { debug!("timeline {new_timeline_id} already exists"); return Ok(None); } @@ -418,7 +438,7 @@ impl Tenant { let loaded_timeline = match ancestor_timeline_id { Some(ancestor_timeline_id) => { let ancestor_timeline = self - .get_timeline(ancestor_timeline_id) + .get_timeline(ancestor_timeline_id, false) .context("Cannot branch off the timeline that's not present in pageserver")?; if let Some(lsn) = ancestor_start_lsn.as_mut() { @@ -470,6 +490,11 @@ impl Tenant { pitr: Duration, checkpoint_before_gc: bool, ) -> anyhow::Result { + anyhow::ensure!( + self.is_active(), + "Cannot run GC iteration on inactive tenant" + ); + let timeline_str = target_timeline_id .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); @@ -486,6 +511,11 @@ impl Tenant { /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. pub fn compaction_iteration(&self) -> anyhow::Result<()> { + anyhow::ensure!( + self.is_active(), + "Cannot run compaction iteration on inactive tenant" + ); + // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // compactions. We don't want to block everything else while the @@ -493,6 +523,7 @@ impl Tenant { let timelines = self.timelines.lock().unwrap(); let timelines_to_compact = timelines .iter() + .filter(|(_, timeline)| timeline.is_active()) .map(|(timeline_id, timeline)| (*timeline_id, timeline.clone())) .collect::>(); drop(timelines); @@ -515,13 +546,13 @@ impl Tenant { // checkpoints. We don't want to block everything else while the // checkpoint runs. 
let timelines = self.timelines.lock().unwrap(); - let timelines_to_compact = timelines + let timelines_to_checkpoint = timelines .iter() .map(|(timeline_id, timeline)| (*timeline_id, Arc::clone(timeline))) .collect::>(); drop(timelines); - for (timeline_id, timeline) in &timelines_to_compact { + for (timeline_id, timeline) in &timelines_to_checkpoint { let _entered = info_span!("checkpoint", timeline = %timeline_id, tenant = %self.tenant_id) .entered(); @@ -543,7 +574,7 @@ impl Tenant { .iter() .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); - ensure!( + anyhow::ensure!( !children_exist, "Cannot delete timeline which has child timelines" ); @@ -552,7 +583,10 @@ impl Tenant { Entry::Vacant(_) => bail!("timeline not found"), }; - let layer_removal_guard = timeline_entry.get().layer_removal_guard()?; + let timeline = timeline_entry.get(); + timeline.set_state(TimelineState::Paused); + + let layer_removal_guard = timeline.layer_removal_guard()?; let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { @@ -569,58 +603,6 @@ impl Tenant { Ok(()) } - pub fn init_attach_timelines( - &self, - timelines: HashMap, - ) -> anyhow::Result<()> { - let sorted_timelines = if timelines.len() == 1 { - timelines.into_iter().collect() - } else if !timelines.is_empty() { - tree_sort_timelines(timelines)? - } else { - warn!("No timelines to attach received"); - return Ok(()); - }; - - let mut timelines_accessor = self.timelines.lock().unwrap(); - for (timeline_id, metadata) in sorted_timelines { - info!( - "Attaching timeline {} pg_version {}", - timeline_id, - metadata.pg_version() - ); - - if timelines_accessor.contains_key(&timeline_id) { - warn!( - "Timeline {}/{} already exists in the tenant map, skipping its initialization", - self.tenant_id, timeline_id - ); - continue; - } else { - let ancestor = metadata - .ancestor_timeline() - .and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id)) - .cloned(); - let timeline = UninitializedTimeline { - owning_tenant: self, - timeline_id, - raw_timeline: Some(( - self.create_timeline_data(timeline_id, metadata, ancestor) - .with_context(|| { - format!("Failed to initialize timeline {timeline_id}") - })?, - TimelineUninitMark::dummy(), - )), - }; - let initialized_timeline = - timeline.initialize_with_lock(&mut timelines_accessor, true)?; - timelines_accessor.insert(timeline_id, initialized_timeline); - } - } - - Ok(()) - } - /// Allows to retrieve remote timeline index from the tenant. Used in walreceiver to grab remote consistent lsn. pub fn get_remote_index(&self) -> &RemoteIndex { &self.remote_index @@ -661,10 +643,30 @@ impl Tenant { } (_, new_state) => { self.state.send_replace(new_state); - if self.should_run_tasks() { - // Spawn gc and compaction loops. The loops will shut themselves - // down when they notice that the tenant is inactive. - crate::tenant_tasks::start_background_loops(self.tenant_id); + + let timelines_accessor = self.timelines.lock().unwrap(); + let not_broken_timelines = timelines_accessor + .values() + .filter(|timeline| timeline.current_state() != TimelineState::Broken); + match new_state { + TenantState::Active { + background_jobs_running, + } => { + if background_jobs_running { + // Spawn gc and compaction loops. The loops will shut themselves + // down when they notice that the tenant is inactive. 
+ crate::tenant_tasks::start_background_loops(self.tenant_id); + } + + for timeline in not_broken_timelines { + timeline.set_state(TimelineState::Active); + } + } + TenantState::Paused | TenantState::Broken => { + for timeline in not_broken_timelines { + timeline.set_state(TimelineState::Suspended); + } + } } } } @@ -993,6 +995,7 @@ impl Tenant { timelines .iter() + .filter(|(_, timeline)| timeline.is_active()) .map(|(timeline_id, timeline_entry)| { // This is unresolved question for now, how to do gc in presence of remote timelines // especially when this is combined with branching. @@ -1026,7 +1029,7 @@ impl Tenant { for timeline_id in timeline_ids { // Timeline is known to be local and loaded. let timeline = self - .get_timeline(timeline_id) + .get_timeline(timeline_id, false) .with_context(|| format!("Timeline {timeline_id} was not found"))?; // If target_timeline is specified, ignore all other timelines @@ -1111,7 +1114,7 @@ impl Tenant { // Step 2 is to avoid initializing the new branch using data removed by past GC iterations // or in-queue GC iterations. - let src_timeline = self.get_timeline(src).with_context(|| { + let src_timeline = self.get_timeline(src, false).with_context(|| { format!( "No ancestor {} found for timeline {}/{}", src, self.tenant_id, dst @@ -1381,6 +1384,68 @@ impl Tenant { Ok(uninit_mark) } + + pub(super) fn init_attach_timelines( + &self, + timelines: HashMap, + ) -> anyhow::Result<()> { + let sorted_timelines = if timelines.len() == 1 { + timelines.into_iter().collect() + } else if !timelines.is_empty() { + tree_sort_timelines(timelines)? + } else { + warn!("No timelines to attach received"); + return Ok(()); + }; + + let tenant_id = self.tenant_id; + let mut timelines_accessor = self.timelines.lock().unwrap(); + for (timeline_id, metadata) in sorted_timelines { + info!( + "Attaching timeline {}/{} pg_version {}", + tenant_id, + timeline_id, + metadata.pg_version() + ); + + if timelines_accessor.contains_key(&timeline_id) { + warn!("Timeline {tenant_id}/{timeline_id} already exists in the tenant map, skipping its initialization"); + continue; + } + + let ancestor = metadata + .ancestor_timeline() + .and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id)) + .cloned(); + let dummy_timeline = self + .create_timeline_data(timeline_id, metadata.clone(), ancestor.clone()) + .with_context(|| { + format!("Failed to crate dummy timeline data for {tenant_id}/{timeline_id}") + })?; + let timeline = UninitializedTimeline { + owning_tenant: self, + timeline_id, + raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())), + }; + match timeline.initialize_with_lock(&mut timelines_accessor, true) { + Ok(initialized_timeline) => { + timelines_accessor.insert(timeline_id, initialized_timeline); + } + Err(e) => { + error!("Failed to initialize timeline {tenant_id}/{timeline_id}: {e:?}"); + let broken_timeline = self + .create_timeline_data(timeline_id, metadata, ancestor) + .with_context(|| { + format!("Failed to crate broken timeline data for {tenant_id}/{timeline_id}") + })?; + broken_timeline.set_state(TimelineState::Broken); + timelines_accessor.insert(timeline_id, Arc::new(broken_timeline)); + } + } + } + + Ok(()) + } } /// Create the cluster temporarily in 'initdbpath' directory inside the repository @@ -1608,6 +1673,9 @@ pub mod harness { timelines_to_load.insert(timeline_id, timeline_metadata); } tenant.init_attach_timelines(timelines_to_load)?; + tenant.set_state(TenantState::Active { + background_jobs_running: false, + }); Ok(tenant) } 
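
The state plumbing threaded through the hunks above boils down to one `tokio::sync::watch` channel per timeline: the owner publishes a new state with `send_replace`, and long-running tasks hold a subscribed receiver and react when the value changes. A minimal, self-contained sketch of that pattern, using illustrative `DemoState`/`DemoTimeline` names rather than the real pageserver types:

```rust
use tokio::sync::watch;

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum DemoState {
    Suspended,
    Active,
    Broken,
}

struct DemoTimeline {
    // The sender doubles as the source of truth: `borrow()` reads the
    // current value, `subscribe()` hands out receivers to watchers.
    state: watch::Sender<DemoState>,
}

impl DemoTimeline {
    fn new() -> Self {
        let (state, _) = watch::channel(DemoState::Suspended);
        Self { state }
    }

    fn set_state(&self, new_state: DemoState) {
        // Treat Broken as terminal, mirroring the "ignore updates for a
        // broken timeline" branch in the hunks above.
        if *self.state.borrow() != DemoState::Broken {
            self.state.send_replace(new_state);
        }
    }

    fn subscribe(&self) -> watch::Receiver<DemoState> {
        self.state.subscribe()
    }
}

#[tokio::main]
async fn main() {
    let timeline = DemoTimeline::new();
    let mut updates = timeline.subscribe();

    // A background task reacts to state changes, the way the gc/compaction
    // and walreceiver loops do in the patch above.
    let watcher = tokio::spawn(async move {
        while updates.changed().await.is_ok() {
            println!("observed state: {:?}", *updates.borrow());
        }
        println!("sender dropped, watcher exits");
    });

    timeline.set_state(DemoState::Active);
    timeline.set_state(DemoState::Broken);
    drop(timeline); // closing the channel ends the watcher loop
    watcher.await.unwrap();
}
```

A watch channel keeps only the latest value, so a subscriber that wakes up late observes the current state instead of replaying every intermediate transition, which is a reasonable fit for loops that only care about whether they should keep running.
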
@@ -1767,7 +1835,7 @@ mod tests { // Branch the history, modify relation differently on the new timeline tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; let newtline = tenant - .get_timeline(NEW_TIMELINE_ID) + .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); let new_writer = newtline.writer(); new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?; @@ -1923,7 +1991,7 @@ mod tests { tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = tenant - .get_timeline(NEW_TIMELINE_ID) + .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; @@ -1942,7 +2010,7 @@ mod tests { tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = tenant - .get_timeline(NEW_TIMELINE_ID) + .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); make_some_layers(newtline.as_ref(), Lsn(0x60))?; @@ -1974,7 +2042,7 @@ mod tests { let tenant = harness.load(); tenant - .get_timeline(TIMELINE_ID) + .get_timeline(TIMELINE_ID, true) .expect("cannot load timeline"); Ok(()) @@ -1997,7 +2065,7 @@ mod tests { tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = tenant - .get_timeline(NEW_TIMELINE_ID) + .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); make_some_layers(newtline.as_ref(), Lsn(0x60))?; @@ -2009,11 +2077,11 @@ mod tests { // check that both, child and ancestor are loaded let _child_tline = tenant - .get_timeline(NEW_TIMELINE_ID) + .get_timeline(NEW_TIMELINE_ID, true) .expect("cannot get child timeline loaded"); let _ancestor_tline = tenant - .get_timeline(TIMELINE_ID) + .get_timeline(TIMELINE_ID, true) .expect("cannot get ancestor timeline loaded"); Ok(()) @@ -2267,7 +2335,7 @@ mod tests { let new_tline_id = TimelineId::generate(); tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; tline = tenant - .get_timeline(new_tline_id) + .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); tline_id = new_tline_id; @@ -2330,7 +2398,7 @@ mod tests { let new_tline_id = TimelineId::generate(); tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; tline = tenant - .get_timeline(new_tline_id) + .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); tline_id = new_tline_id; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ccd094b65a..194ca0d857 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5,6 +5,8 @@ use bytes::Bytes; use fail::fail_point; use itertools::Itertools; use once_cell::sync::OnceCell; +use pageserver_api::models::TimelineState; +use tokio::sync::watch; use tokio::task::spawn_blocking; use tracing::*; @@ -160,6 +162,8 @@ pub struct Timeline { /// Relation size cache pub rel_size_cache: RwLock>, + + state: watch::Sender, } /// Internal structure to hold all data needed for logical size calculation. @@ -416,9 +420,11 @@ impl Timeline { /// those functions with an LSN that has been processed yet is an error. /// pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline"); + // This should never be called from the WAL receiver, because that could lead // to a deadlock. 
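        // (Presumably the deadlock scenario is: wait_lsn() parks until
        // last_record_lsn reaches the requested LSN, and it is the WAL
        // receiver's ingest path that advances last_record_lsn, so a wait
        // issued from inside that same task could never be satisfied.)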
- ensure!( + anyhow::ensure!( task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection), "wait_lsn cannot be called in WAL receiver" ); @@ -635,6 +641,35 @@ impl Timeline { } Ok(()) } + + pub fn set_state(&self, new_state: TimelineState) { + match (self.current_state(), new_state) { + (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { + debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + } + (TimelineState::Broken, _) => { + error!("Ignoring state update {new_state:?} for broken tenant"); + } + (TimelineState::Paused, TimelineState::Active) => { + debug!("Not activating a paused timeline"); + } + (_, new_state) => { + self.state.send_replace(new_state); + } + } + } + + pub fn current_state(&self) -> TimelineState { + *self.state.borrow() + } + + pub fn is_active(&self) -> bool { + self.current_state() == TimelineState::Active + } + + pub fn subscribe_for_state_updates(&self) -> watch::Receiver { + self.state.subscribe() + } } // Private functions @@ -688,8 +723,9 @@ impl Timeline { walredo_mgr: Arc, upload_layers: bool, pg_version: u32, - ) -> Timeline { + ) -> Self { let disk_consistent_lsn = metadata.disk_consistent_lsn(); + let (state, _) = watch::channel(TimelineState::Suspended); let mut result = Timeline { conf, @@ -746,6 +782,7 @@ impl Timeline { last_received_wal: Mutex::new(None), rel_size_cache: RwLock::new(HashMap::new()), + state, }; result.repartition_threshold = result.get_checkpoint_distance() / 10; result @@ -883,8 +920,6 @@ impl Timeline { } fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { - let timeline_id = self.timeline_id; - // Atomically check if the timeline size calculation had already started. // If the flag was not already set, this sets it. if !self @@ -901,17 +936,42 @@ impl Timeline { "initial size calculation", false, async move { - let calculated_size = self_clone.calculate_logical_size(init_lsn)?; - let result = spawn_blocking(move || { - self_clone.current_logical_size.initial_logical_size.set(calculated_size) - }).await?; - match result { - Ok(()) => info!("Successfully calculated initial logical size"), - Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"), + let mut timeline_state_updates = self_clone.subscribe_for_state_updates(); + let self_calculation = Arc::clone(&self_clone); + tokio::select! { + calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => { + let calculated_size = calculation_result + .context("Failed to spawn calculation result task")? 
+ .context("Failed to calculate logical size")?; + match self_clone.current_logical_size.initial_logical_size.set(calculated_size) { + Ok(()) => info!("Successfully calculated initial logical size"), + Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"), + } + Ok(()) + }, + new_event = async { + loop { + match timeline_state_updates.changed().await { + Ok(()) => { + let new_state = *timeline_state_updates.borrow(); + match new_state { + // we're running this job for active timelines only + TimelineState::Active => continue, + TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return Some(new_state), + } + } + Err(_sender_dropped_error) => return None, + } + } + } => { + match new_event { + Some(new_state) => info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"), + None => info!("Timeline dropped state updates sender, stopping init size calculation"), + } + Ok(()) + }, } - Ok(()) - } - .instrument(info_span!("initial_logical_size_calculation", timeline = %timeline_id)) + }.instrument(info_span!("initial_logical_size_calculation", tenant = %self.tenant_id, timeline = %self.timeline_id)), ); } } @@ -1356,7 +1416,7 @@ impl Timeline { false, )?; - if self.upload_layers.load(atomic::Ordering::Relaxed) { + if self.can_upload_layers() { storage_sync::schedule_layer_upload( self.tenant_id, self.timeline_id, @@ -1826,7 +1886,7 @@ impl Timeline { } drop(layers); - if self.upload_layers.load(atomic::Ordering::Relaxed) { + if self.can_upload_layers() { storage_sync::schedule_layer_upload( self.tenant_id, self.timeline_id, @@ -1930,7 +1990,7 @@ impl Timeline { /// obsolete. /// pub(super) fn gc(&self) -> anyhow::Result { - let mut result: GcResult = Default::default(); + let mut result: GcResult = GcResult::default(); let now = SystemTime::now(); fail_point!("before-timeline-gc"); @@ -2110,7 +2170,7 @@ impl Timeline { fail_point!("after-timeline-gc-removed-layers"); } - if self.upload_layers.load(atomic::Ordering::Relaxed) { + if self.can_upload_layers() { storage_sync::schedule_layer_delete( self.tenant_id, self.timeline_id, @@ -2199,6 +2259,11 @@ impl Timeline { } } } + + fn can_upload_layers(&self) -> bool { + self.upload_layers.load(atomic::Ordering::Relaxed) + && self.current_state() != TimelineState::Broken + } } /// Helper function for get_reconstruct_data() to add the path of layers traversed diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 030055df6d..23ce9dc699 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -175,7 +175,7 @@ async fn wait_for_active_tenant( } state => { debug!("Not running the task loop, tenant is not active with background jobs enabled: {state:?}"); - tokio::time::sleep(wait).await; + continue; } } } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 2380caaff1..53dd2d8eac 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -12,6 +12,7 @@ use std::{ collections::{hash_map, HashMap}, num::NonZeroU64, + ops::ControlFlow, sync::Arc, time::Duration, }; @@ -26,7 +27,8 @@ use etcd_broker::{ subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription, BrokerUpdate, Client, }; -use tokio::select; +use pageserver_api::models::TimelineState; +use tokio::{select, 
sync::watch}; use tracing::*; use crate::{ @@ -58,10 +60,7 @@ pub fn spawn_connection_manager_task( TaskKind::WalReceiverManager, Some(tenant_id), Some(timeline_id), - &format!( - "walreceiver for tenant {} timeline {}", - timeline.tenant_id, timeline.timeline_id - ), + &format!("walreceiver for timeline {tenant_id}/{timeline_id}"), false, async move { info!("WAL receiver broker started, connecting to etcd"); @@ -75,19 +74,21 @@ pub fn spawn_connection_manager_task( select! { _ = task_mgr::shutdown_watcher() => { info!("WAL receiver shutdown requested, shutting down"); - // Kill current connection, if any - if let Some(wal_connection) = walreceiver_state.wal_connection.take() - { - wal_connection.connection_task.shutdown().await; - } + walreceiver_state.shutdown().await; return Ok(()); }, - - _ = connection_manager_loop_step( + loop_step_result = connection_manager_loop_step( &broker_loop_prefix, &mut etcd_client, &mut walreceiver_state, - ) => {}, + ) => match loop_step_result { + ControlFlow::Continue(()) => continue, + ControlFlow::Break(()) => { + info!("Connection manager loop ended, shutting down"); + walreceiver_state.shutdown().await; + return Ok(()); + } + }, } } } @@ -104,7 +105,17 @@ async fn connection_manager_loop_step( broker_prefix: &str, etcd_client: &mut Client, walreceiver_state: &mut WalreceiverState, -) { +) -> ControlFlow<(), ()> { + let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates(); + + match wait_for_active_timeline(&mut timeline_state_updates).await { + ControlFlow::Continue(()) => {} + ControlFlow::Break(()) => { + info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop"); + return ControlFlow::Break(()); + } + } + let id = TenantTimelineId { tenant_id: walreceiver_state.timeline.tenant_id, timeline_id: walreceiver_state.timeline.timeline_id, @@ -129,10 +140,12 @@ async fn connection_manager_loop_step( // - change connection if the rules decide so, or if the current connection dies // - receive updates from broker // - this might change the current desired connection + // - timeline state changes to something that does not allow walreceiver to run concurrently select! 
{ broker_connection_result = &mut broker_subscription.watcher_handle => { + info!("Broker connection was closed from the other side, ending current broker loop step"); cleanup_broker_connection(broker_connection_result, walreceiver_state); - return; + return ControlFlow::Continue(()); }, Some(wal_connection_update) = async { @@ -185,11 +198,36 @@ async fn connection_manager_loop_step( (&mut broker_subscription.watcher_handle).await, walreceiver_state, ); - return; + return ControlFlow::Continue(()); } } }, + new_event = async { + loop { + match timeline_state_updates.changed().await { + Ok(()) => { + let new_state = walreceiver_state.timeline.current_state(); + match new_state { + // we're already active as walreceiver, no need to reactivate + TimelineState::Active => continue, + TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return ControlFlow::Continue(new_state), + } + } + Err(_sender_dropped_error) => return ControlFlow::Break(()), + } + } + } => match new_event { + ControlFlow::Continue(new_state) => { + info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"); + return ControlFlow::Continue(()); + } + ControlFlow::Break(()) => { + info!("Timeline dropped state updates sender, stopping wal connection manager loop"); + return ControlFlow::Break(()); + } + }, + _ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {} } @@ -216,6 +254,34 @@ async fn connection_manager_loop_step( } } +async fn wait_for_active_timeline( + timeline_state_updates: &mut watch::Receiver, +) -> ControlFlow<(), ()> { + let current_state = *timeline_state_updates.borrow(); + if current_state == TimelineState::Active { + return ControlFlow::Continue(()); + } + + loop { + match timeline_state_updates.changed().await { + Ok(()) => { + let new_state = *timeline_state_updates.borrow(); + match new_state { + TimelineState::Active => { + debug!("Timeline state changed to active, continuing the walreceiver connection manager"); + return ControlFlow::Continue(()); + } + state => { + debug!("Not running the walreceiver connection manager, timeline is not active: {state:?}"); + continue; + } + } + } + Err(_sender_dropped_error) => return ControlFlow::Break(()), + } + } +} + fn cleanup_broker_connection( broker_connection_result: Result, tokio::task::JoinError>, walreceiver_state: &mut WalreceiverState, @@ -723,6 +789,12 @@ impl WalreceiverState { self.wal_connection_retries.remove(&node_id); } } + + async fn shutdown(mut self) { + if let Some(wal_connection) = self.wal_connection.take() { + wal_connection.connection_task.shutdown().await; + } + } } #[derive(Debug, PartialEq, Eq)] diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 101cce9ffc..b747af4d09 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -70,18 +70,14 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # But all others are broken # First timeline would not get loaded into pageserver due to corrupt metadata file - with pytest.raises( - Exception, match=f"Timeline {timeline1} was not found for tenant {tenant1}" - ) as err: + with pytest.raises(Exception, match=f"Timeline {tenant1}/{timeline1} was not found") as err: pg1.start() log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") # Second timeline has no ancestors, only the metadata file and no layer files # We 
don't have the remote storage enabled, which means timeline is in an incorrect state, # it's not loaded at all - with pytest.raises( - Exception, match=f"Timeline {timeline2} was not found for tenant {tenant2}" - ) as err: + with pytest.raises(Exception, match=f"Timeline {tenant2}/{timeline2} was not found") as err: pg2.start() log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index de05d445ed..4a78a2746e 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -65,7 +65,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # check 404 with pytest.raises( NeonPageserverApiException, - match=f"Timeline {leaf_timeline_id} was not found for tenant {env.initial_tenant}", + match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found", ): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) From 71ef7b666305350d59298cc9c53d721be509a093 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s?= Date: Fri, 21 Oct 2022 19:02:31 +0200 Subject: [PATCH 0948/1022] Remove cached_property package (#2673) Co-authored-by: andres --- poetry.lock | 12 ------------ pyproject.toml | 2 -- test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/regress/test_close_fds.py | 2 +- 4 files changed, 2 insertions(+), 16 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2af0d97511..27de8508ce 100644 --- a/poetry.lock +++ b/poetry.lock @@ -514,14 +514,6 @@ python-versions = ">=3.7" [package.dependencies] typing-extensions = ">=4.1.0" -[[package]] -name = "cached-property" -version = "1.5.2" -description = "A decorator for caching properties in classes." -category = "main" -optional = false -python-versions = "*" - [[package]] name = "certifi" version = "2022.6.15" @@ -1647,10 +1639,6 @@ botocore-stubs = [ {file = "botocore-stubs-1.27.38.tar.gz", hash = "sha256:408e8b86b5d171b58f81c74ca9d3b5317a5a8e2d3bc2073aa841ac13b8939e56"}, {file = "botocore_stubs-1.27.38-py3-none-any.whl", hash = "sha256:7add7641e9a479a9c8366893bb522fd9ca3d58714201e43662a200a148a1bc38"}, ] -cached-property = [ - {file = "cached-property-1.5.2.tar.gz", hash = "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130"}, - {file = "cached_property-1.5.2-py2.py3-none-any.whl", hash = "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"}, -] certifi = [ {file = "certifi-2022.6.15-py3-none-any.whl", hash = "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"}, {file = "certifi-2022.6.15.tar.gz", hash = "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d"}, diff --git a/pyproject.toml b/pyproject.toml index 9c2aa39c7c..1ee6fbe6f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,6 @@ requests = "^2.26.0" pytest-xdist = "^2.3.0" asyncpg = "^0.24.0" aiopg = "^1.3.1" -cached-property = "^1.5.2" Jinja2 = "^3.0.2" types-requests = "^2.28.5" types-psycopg2 = "^2.9.18" @@ -74,7 +73,6 @@ strict = true [[tool.mypy.overrides]] module = [ "asyncpg.*", - "cached_property.*", "pg8000.*", ] ignore_missing_imports = true diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a77b3958c9..4b2638bb2a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -17,6 +17,7 @@ import uuid from contextlib import closing, contextmanager from dataclasses import dataclass, field from enum import 
Flag, auto +from functools import cached_property from pathlib import Path from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, TypeVar, Union, cast @@ -27,7 +28,6 @@ import jwt import psycopg2 import pytest import requests -from cached_property import cached_property from fixtures.log_helper import log from fixtures.types import Lsn, TenantId, TimelineId diff --git a/test_runner/regress/test_close_fds.py b/test_runner/regress/test_close_fds.py index c7ea37f9c8..22f245f79b 100644 --- a/test_runner/regress/test_close_fds.py +++ b/test_runner/regress/test_close_fds.py @@ -1,10 +1,10 @@ import os.path import shutil import subprocess +import threading import time from contextlib import closing -from cached_property import threading from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv From 321aeac3d4f15e84ca615c499caf35033bc891e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Fri, 21 Oct 2022 20:30:20 +0300 Subject: [PATCH 0949/1022] Json logging capability (#2624) * Support configuring the log format as json or plain. Separately test json and plain logger. They would be competing on the same global subscriber otherwise. * Implement log_format for pageserver config * Implement configurable log format for safekeeper. --- Cargo.lock | 15 ++++++ libs/utils/Cargo.toml | 4 +- libs/utils/src/logging.rs | 78 ++++++++++++++++++++++----- libs/utils/tests/logger_json_test.rs | 36 +++++++++++++ libs/utils/tests/logger_plain_test.rs | 36 +++++++++++++ pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/config.rs | 22 ++++++++ safekeeper/src/bin/safekeeper.rs | 19 +++++-- safekeeper/src/lib.rs | 7 ++- 9 files changed, 200 insertions(+), 19 deletions(-) create mode 100644 libs/utils/tests/logger_json_test.rs create mode 100644 libs/utils/tests/logger_plain_test.rs diff --git a/Cargo.lock b/Cargo.lock index 657baf5d80..13774f7fe6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3932,6 +3932,16 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-serde" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +dependencies = [ + "serde", + "tracing-core", +] + [[package]] name = "tracing-subscriber" version = "0.3.16" @@ -3942,12 +3952,15 @@ dependencies = [ "nu-ansi-term", "once_cell", "regex", + "serde", + "serde_json", "sharded-slab", "smallvec", "thread_local", "tracing", "tracing-core", "tracing-log", + "tracing-serde", ] [[package]] @@ -4042,6 +4055,8 @@ dependencies = [ "serde_json", "serde_with", "signal-hook", + "strum", + "strum_macros", "tempfile", "thiserror", "tokio", diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index a7baddada4..1753ee81b9 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -19,7 +19,7 @@ thiserror = "1.0" tokio = { version = "1.17", features = ["macros"]} tokio-rustls = "0.23" tracing = "0.1" -tracing-subscriber = { version = "0.3", features = ["env-filter"] } +tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } nix = "0.25" signal-hook = "0.3.10" rand = "0.8.3" @@ -30,6 +30,8 @@ rustls-split = "0.3.0" git-version = "0.3.5" serde_with = "2.0" once_cell = "1.13.0" +strum = "0.24" +strum_macros = "0.24" metrics = { path = "../metrics" } diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 1576a54c8e..31c0e02f98 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,11 +1,35 @@ use std::{ 
fs::{File, OpenOptions}, path::Path, + str::FromStr, }; use anyhow::{Context, Result}; +use strum_macros::{EnumString, EnumVariantNames}; -pub fn init(log_filename: impl AsRef, daemonize: bool) -> Result { +#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)] +#[strum(serialize_all = "snake_case")] +pub enum LogFormat { + Plain, + Json, +} + +impl LogFormat { + pub fn from_config(s: &str) -> anyhow::Result { + use strum::VariantNames; + LogFormat::from_str(s).with_context(|| { + format!( + "Unrecognized log format. Please specify one of: {:?}", + LogFormat::VARIANTS + ) + }) + } +} +pub fn init( + log_filename: impl AsRef, + daemonize: bool, + log_format: LogFormat, +) -> Result { // Don't open the same file for output multiple times; // the different fds could overwrite each other's output. let log_file = OpenOptions::new() @@ -21,22 +45,50 @@ pub fn init(log_filename: impl AsRef, daemonize: bool) -> Result { let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str)); + let x: File = log_file.try_clone().unwrap(); let base_logger = tracing_subscriber::fmt() .with_env_filter(env_filter) - .with_target(false) // don't include event targets - .with_ansi(false); // don't use colors in log file; + .with_target(false) + .with_ansi(false) + .with_writer(move || -> Box { + // we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it + // if we do not use daemonization (e.g. in docker) it is better to log to stdout directly + // for example to be in line with docker log command which expects logs comimg from stdout + if daemonize { + Box::new(x.try_clone().unwrap()) + } else { + Box::new(std::io::stdout()) + } + }); - // we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it - // if we do not use daemonization (e.g. in docker) it is better to log to stdout directly - // for example to be in line with docker log command which expects logs comimg from stdout - if daemonize { - let x = log_file.try_clone().unwrap(); - base_logger - .with_writer(move || x.try_clone().unwrap()) - .init(); - } else { - base_logger.init(); + match log_format { + LogFormat::Json => base_logger.json().init(), + LogFormat::Plain => base_logger.init(), } Ok(log_file) } + +// #[cfg(test)] +// Due to global logger, can't run tests in same process. +// So until there's a non-global one, the tests are in ../tests/ as separate files. +#[macro_export(local_inner_macros)] +macro_rules! 
test_init_file_logger { + ($log_level:expr, $log_format:expr) => {{ + use std::str::FromStr; + std::env::set_var("RUST_LOG", $log_level); + + let tmp_dir = tempfile::TempDir::new().unwrap(); + let log_file_path = tmp_dir.path().join("logfile"); + + let log_format = $crate::logging::LogFormat::from_str($log_format).unwrap(); + let _log_file = $crate::logging::init(&log_file_path, true, log_format).unwrap(); + + let log_file = std::fs::OpenOptions::new() + .read(true) + .open(&log_file_path) + .unwrap(); + + log_file + }}; +} diff --git a/libs/utils/tests/logger_json_test.rs b/libs/utils/tests/logger_json_test.rs new file mode 100644 index 0000000000..5d63b9b004 --- /dev/null +++ b/libs/utils/tests/logger_json_test.rs @@ -0,0 +1,36 @@ +// This could be in ../src/logging.rs but since the logger is global, these +// can't be run in threads of the same process +use std::fs::File; +use std::io::{BufRead, BufReader, Lines}; +use tracing::*; +use utils::test_init_file_logger; + +fn read_lines(file: File) -> Lines> { + BufReader::new(file).lines() +} + +#[test] +fn test_json_format_has_message_and_custom_field() { + std::env::set_var("RUST_LOG", "info"); + + let log_file = test_init_file_logger!("info", "json"); + + let custom_field: &str = "hi"; + trace!(custom = %custom_field, "test log message"); + debug!(custom = %custom_field, "test log message"); + info!(custom = %custom_field, "test log message"); + warn!(custom = %custom_field, "test log message"); + error!(custom = %custom_field, "test log message"); + + let lines = read_lines(log_file); + for line in lines { + let content = line.unwrap(); + let json_object = serde_json::from_str::(&content).unwrap(); + + assert_eq!(json_object["fields"]["custom"], "hi"); + assert_eq!(json_object["fields"]["message"], "test log message"); + + assert_ne!(json_object["level"], "TRACE"); + assert_ne!(json_object["level"], "DEBUG"); + } +} diff --git a/libs/utils/tests/logger_plain_test.rs b/libs/utils/tests/logger_plain_test.rs new file mode 100644 index 0000000000..bc5abf45dd --- /dev/null +++ b/libs/utils/tests/logger_plain_test.rs @@ -0,0 +1,36 @@ +// This could be in ../src/logging.rs but since the logger is global, these +// can't be run in threads of the same process +use std::fs::File; +use std::io::{BufRead, BufReader, Lines}; +use tracing::*; +use utils::test_init_file_logger; + +fn read_lines(file: File) -> Lines> { + BufReader::new(file).lines() +} + +#[test] +fn test_plain_format_has_message_and_custom_field() { + std::env::set_var("RUST_LOG", "warn"); + + let log_file = test_init_file_logger!("warn", "plain"); + + let custom_field: &str = "hi"; + trace!(custom = %custom_field, "test log message"); + debug!(custom = %custom_field, "test log message"); + info!(custom = %custom_field, "test log message"); + warn!(custom = %custom_field, "test log message"); + error!(custom = %custom_field, "test log message"); + + let lines = read_lines(log_file); + for line in lines { + let content = line.unwrap(); + serde_json::from_str::(&content).unwrap_err(); + assert!(content.contains("custom=hi")); + assert!(content.contains("test log message")); + + assert!(!content.contains("TRACE")); + assert!(!content.contains("DEBUG")); + assert!(!content.contains("INFO")); + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 9317dd5dd7..802352be90 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -199,7 +199,7 @@ fn initialize_config( fn start_pageserver(conf: &'static PageServerConf, 
daemonize: bool) -> Result<()> { // Initialize logger - let log_file = logging::init(LOG_FILE_NAME, daemonize)?; + let log_file = logging::init(LOG_FILE_NAME, daemonize, conf.log_format)?; info!("version: {}", version()); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 4f80fc96b5..6a372fb081 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -17,6 +17,7 @@ use toml_edit::{Document, Item}; use url::Url; use utils::{ id::{NodeId, TenantId, TimelineId}, + logging::LogFormat, postgres_backend::AuthType, }; @@ -45,6 +46,8 @@ pub mod defaults { pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; + pub const DEFAULT_LOG_FORMAT: &str = "plain"; + /// /// Default built-in configuration file. /// @@ -63,6 +66,7 @@ pub mod defaults { # initial superuser role name to use when creating a new tenant #initial_superuser_name = '{DEFAULT_SUPERUSER}' +#log_format = '{DEFAULT_LOG_FORMAT}' # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -126,6 +130,8 @@ pub struct PageServerConf { /// Etcd broker endpoints to connect to. pub broker_endpoints: Vec, + + pub log_format: LogFormat, } #[derive(Debug, Clone, PartialEq, Eq)] @@ -192,6 +198,8 @@ struct PageServerConfigBuilder { profiling: BuilderValue, broker_etcd_prefix: BuilderValue, broker_endpoints: BuilderValue>, + + log_format: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -219,6 +227,7 @@ impl Default for PageServerConfigBuilder { profiling: Set(ProfilingConfig::Disabled), broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()), broker_endpoints: Set(Vec::new()), + log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), } } } @@ -291,6 +300,10 @@ impl PageServerConfigBuilder { self.profiling = BuilderValue::Set(profiling) } + pub fn log_format(&mut self, log_format: LogFormat) { + self.log_format = BuilderValue::Set(log_format) + } + pub fn build(self) -> anyhow::Result { let broker_endpoints = self .broker_endpoints @@ -335,6 +348,7 @@ impl PageServerConfigBuilder { broker_etcd_prefix: self .broker_etcd_prefix .ok_or(anyhow!("missing broker_etcd_prefix"))?, + log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, }) } } @@ -459,6 +473,9 @@ impl PageServerConf { }) .collect::>()?, ), + "log_format" => builder.log_format( + LogFormat::from_config(&parse_toml_string(key, item)?)? 
+ ), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -571,6 +588,7 @@ impl PageServerConf { default_tenant_conf: TenantConf::dummy_conf(), broker_endpoints: Vec::new(), broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), } } } @@ -665,6 +683,8 @@ max_file_descriptors = 333 initial_superuser_name = 'zzzz' id = 10 +log_format = 'json' + "#; #[test] @@ -704,6 +724,7 @@ id = 10 .parse() .expect("Failed to parse a valid broker endpoint URL")], broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), }, "Correct defaults should be used when no config values are provided" ); @@ -748,6 +769,7 @@ id = 10 .parse() .expect("Failed to parse a valid broker endpoint URL")], broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + log_format: LogFormat::Json, }, "Should be able to parse all basic config values correctly" ); diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index a867aea5af..67c2c62f73 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -32,8 +32,12 @@ use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; use utils::auth::JwtAuth; use utils::{ - http::endpoint, id::NodeId, logging, project_git_version, shutdown::exit_now, signals, - tcp_listener, + http::endpoint, + id::NodeId, + logging::{self, LogFormat}, + project_git_version, + shutdown::exit_now, + signals, tcp_listener, }; const LOCK_FILE_NAME: &str = "safekeeper.lock"; @@ -131,11 +135,15 @@ fn main() -> anyhow::Result<()> { .get_one::("auth-validation-public-key-path") .map(PathBuf::from); + if let Some(log_format) = arg_matches.get_one::("log-format") { + conf.log_format = LogFormat::from_config(log_format)?; + } + start_safekeeper(conf, given_id, arg_matches.get_flag("init")) } fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { - let log_file = logging::init("safekeeper.log", conf.daemonize)?; + let log_file = logging::init("safekeeper.log", conf.daemonize, conf.log_format)?; info!("version: {GIT_VERSION}"); @@ -436,6 +444,11 @@ fn cli() -> Command { .long("auth-validation-public-key-path") .help("Path to an RSA .pem public key which is used to check JWT tokens") ) + .arg( + Arg::new("log-format") + .long("log-format") + .help("Format for logging, either 'plain' or 'json'") + ) } #[test] diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 19dff79b88..c3b8227e17 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -7,7 +7,10 @@ use std::path::PathBuf; use std::time::Duration; use url::Url; -use utils::id::{NodeId, TenantId, TenantTimelineId}; +use utils::{ + id::{NodeId, TenantId, TenantTimelineId}, + logging::LogFormat, +}; pub mod broker; pub mod control_file; @@ -64,6 +67,7 @@ pub struct SafeKeeperConf { pub auth_validation_public_key_path: Option, pub heartbeat_timeout: Duration, pub max_offloader_lag_bytes: u64, + pub log_format: LogFormat, } impl SafeKeeperConf { @@ -97,6 +101,7 @@ impl Default for SafeKeeperConf { auth_validation_public_key_path: None, heartbeat_timeout: DEFAULT_HEARTBEAT_TIMEOUT, max_offloader_lag_bytes: DEFAULT_MAX_OFFLOADER_LAG_BYTES, + log_format: LogFormat::Plain, } } } From 7b6431cbd7ca1cf7fe7fcf7001cecfd1102d7879 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 22 Oct 2022 14:59:18 +0300 Subject: [PATCH 0950/1022] 
Disable wal_log_hints by default (#2598) * Disable wal_log_hints by default * Remove obsolete comment anbout wal_log_hints --- control_plane/src/compute.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 9f32ad31c1..b3f90b5922 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -282,9 +282,7 @@ impl PostgresNode { fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> { let mut conf = PostgresConf::new(); conf.append("max_wal_senders", "10"); - // wal_log_hints is mandatory when running against pageserver (see gh issue#192) - // TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE? - conf.append("wal_log_hints", "on"); + conf.append("wal_log_hints", "off"); conf.append("max_replication_slots", "10"); conf.append("hot_standby", "on"); conf.append("shared_buffers", "1MB"); From 9f49605041cc4954eff96690c7874cc16ac3f8f8 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sat, 22 Oct 2022 15:11:43 +0400 Subject: [PATCH 0951/1022] Fix division by zero panic in determine_offloader. --- safekeeper/src/wal_backup.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 13287bd036..0a43d6085c 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -95,7 +95,7 @@ fn determine_offloader( let capable_peers = alive_peers .iter() .filter(|p| p.local_start_lsn <= wal_backup_lsn); - match alive_peers.iter().map(|p| p.commit_lsn).max() { + match capable_peers.clone().map(|p| p.commit_lsn).max() { None => (None, "no connected peers to elect from".to_string()), Some(max_commit_lsn) => { let threshold = max_commit_lsn From 2f399f08b2f61ea3507d108768d6d9a29113d0fc Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Sat, 22 Oct 2022 02:26:28 +0300 Subject: [PATCH 0952/1022] Hotfix to disable grant create on public schema `GRANT CREATE ON SCHEMA public` fails if there is no schema `public`. Disable it in release for now and make a better fix later (it is needed for v15 support). --- compute_tools/src/spec.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index e0c0e9404b..1e7cd51b6e 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -423,11 +423,11 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { ); db_client.simple_query(&alter_query)?; - // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. - // This is needed since postgres 15, where this privilege is removed by default. - let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string(); - info!("grant query for db {} : {}", &db.name, &grant_query); - db_client.simple_query(&grant_query)?; + // // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. + // // This is needed since postgres 15, where this privilege is removed by default. 
+ // let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string(); + // info!("grant query for db {} : {}", &db.name, &grant_query); + // db_client.simple_query(&grant_query)?; } Ok(()) From 39897105b27fc133802b0edc676bc3c8297d00b0 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 24 Oct 2022 11:49:36 +0300 Subject: [PATCH 0953/1022] Check postgres version and ensure that public schema exists before running GRANT query on it --- compute_tools/src/spec.rs | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 1e7cd51b6e..58c94d74ae 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -423,11 +423,32 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { ); db_client.simple_query(&alter_query)?; - // // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. - // // This is needed since postgres 15, where this privilege is removed by default. - // let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string(); - // info!("grant query for db {} : {}", &db.name, &grant_query); - // db_client.simple_query(&grant_query)?; + // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. + // This is needed because since postgres 15 this privilege is removed by default. + let grant_query = "DO $$\n\ + BEGIN\n\ + IF EXISTS(\n\ + SELECT nspname\n\ + FROM pg_catalog.pg_namespace\n\ + WHERE nspname = 'public'\n\ + ) AND\n\ + current_setting('server_version_num')::int/10000 >= 15\n\ + THEN\n\ + IF EXISTS(\n\ + SELECT rolname\n\ + FROM pg_catalog.pg_roles\n\ + WHERE rolname = 'web_access'\n\ + )\n\ + THEN\n\ + GRANT CREATE ON SCHEMA public TO web_access;\n\ + END IF;\n\ + END IF;\n\ + END\n\ + $$;" + .to_string(); + + info!("grant query for db {} : {}", &db.name, &grant_query); + db_client.simple_query(&grant_query)?; } Ok(()) From df18b041c0889dc034ee59a7091f99442bc07e20 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 25 Oct 2022 13:09:41 +0300 Subject: [PATCH 0954/1022] Use apt version pinning instead of repo priorities Higher `bullseye` priority doesn't works for packages installed via `bullseye-updates`, e.g.: ``` libc-bin: Installed: 2.31-13+deb11u5 Candidate: 2.35-3 Version table: 2.35-3 500 500 http://ftp.debian.org/debian testing/main amd64 Packages *** 2.31-13+deb11u5 500 500 http://deb.debian.org/debian bullseye-updates/main amd64 Packages 100 /var/lib/dpkg/status 2.31-13+deb11u4 990 990 http://deb.debian.org/debian bullseye/main amd64 Packages ``` Try version pinning instead --- Dockerfile.compute-node-v14 | 4 ++-- Dockerfile.compute-node-v15 | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index f5ccdf7e99..6d2b285fa3 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -9,7 +9,7 @@ ARG TAG=pinned # FROM debian:bullseye-slim AS build-deps RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ apt update RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ @@ -191,7 +191,7 @@ RUN apt update && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ echo "Installing GLIBC 2.34" && \ 
echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ apt update && \ apt install -y --no-install-recommends -t testing libc6 && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index ec555ad932..b7b1f25103 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -14,7 +14,7 @@ ARG TAG=pinned # FROM debian:bullseye-slim AS build-deps RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ apt update RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ @@ -196,7 +196,7 @@ RUN apt update && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ echo "Installing GLIBC 2.34" && \ echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ apt update && \ apt install -y --no-install-recommends -t testing libc6 && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ From 834ffe1bac4febfc5442459efdd861a283cabed3 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 25 Oct 2022 16:41:50 +0200 Subject: [PATCH 0955/1022] Add data format backward compatibility tests (#2626) --- .../actions/run-python-test-set/action.yml | 18 ++ .github/workflows/build_and_test.yml | 23 +- poetry.lock | 52 +++- pyproject.toml | 2 + test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/regress/test_compatibility.py | 267 ++++++++++++++++++ 6 files changed, 352 insertions(+), 12 deletions(-) create mode 100644 test_runner/regress/test_compatibility.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index cc6ab65b76..07cb7edbe7 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -73,6 +73,13 @@ runs: shell: bash -euxo pipefail {0} run: ./scripts/pysync + - name: Download compatibility snapshot for Postgres 14 + uses: ./.github/actions/download + with: + name: compatibility-snapshot-${{ inputs.build_type }}-pg14 + path: /tmp/compatibility_snapshot_pg14 + prefix: latest + - name: Run pytest env: NEON_BIN: /tmp/neon/bin @@ -80,6 +87,8 @@ runs: BUILD_TYPE: ${{ inputs.build_type }} AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} + COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14 + ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes') shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report @@ -154,6 +163,15 @@ runs: scripts/generate_and_push_perf_report.sh fi + - name: Upload compatibility snapshot for Postgres 14 + if: github.ref_name == 'release' + uses: ./.github/actions/upload + with: + name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }} + # The path includes a test name (test_prepare_snapshot) and directory that the test creates 
(compatibility_snapshot_pg14), keep the path in sync with the test + path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/ + prefix: latest + - name: Create Allure report if: always() uses: ./.github/actions/allure-report diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 14ee61c5b9..660f93b025 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -844,7 +844,7 @@ jobs: submodules: true fetch-depth: 0 - - name: Configure environment + - name: Configure environment run: | helm repo add neondatabase https://neondatabase.github.io/helm-charts aws --region us-east-2 eks update-kubeconfig --name dev-us-east-2-beta --role-arn arn:aws:iam::369495373322:role/github-runner @@ -853,3 +853,24 @@ jobs: run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + + promote-compatibility-test-snapshot: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ deploy, deploy-proxy ] + if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' + steps: + - name: Promote compatibility snapshot for the release + shell: bash -euxo pipefail {0} + env: + BUCKET: neon-github-public-dev + PREFIX: artifacts/latest + run: | + for build_type in debug release; do + OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst + NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst + + time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME} + done diff --git a/poetry.lock b/poetry.lock index 27de8508ce..dfcb16107f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -11,7 +11,7 @@ async-timeout = ">=3.0,<5.0" psycopg2-binary = ">=2.8.4" [package.extras] -sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"] +sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"] [[package]] name = "allure-pytest" @@ -80,7 +80,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] -tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] +tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "aws-sam-translator" @@ -560,7 +560,7 @@ optional = false python-versions = ">=3.6.0" [package.extras] -unicode_backport = ["unicodedata2"] +unicode-backport = ["unicodedata2"] [[package]] name = "click" @@ -593,7 +593,7 @@ python-versions = ">=3.6" cffi = ">=1.12" [package.extras] -docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] +docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx_rtd_theme"] docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] pep8test = 
["black", "flake8", "flake8-import-order", "pep8-naming"] sdist = ["setuptools_rust (>=0.11.4)"] @@ -738,9 +738,9 @@ python-versions = ">=3.6.1,<4.0" [package.extras] colors = ["colorama (>=0.4.3,<0.5.0)"] -pipfile_deprecated_finder = ["pipreqs", "requirementslib"] +pipfile-deprecated-finder = ["pipreqs", "requirementslib"] plugins = ["setuptools"] -requirements_deprecated_finder = ["pip-api", "pipreqs"] +requirements-deprecated-finder = ["pip-api", "pipreqs"] [[package]] name = "itsdangerous" @@ -815,7 +815,7 @@ python-versions = ">=2.7" [package.extras] docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] -"testing.libs" = ["simplejson", "ujson", "yajl"] +testing-libs = ["simplejson", "ujson", "yajl"] [[package]] name = "jsonpointer" @@ -836,11 +836,12 @@ python-versions = "*" [package.dependencies] attrs = ">=17.4.0" pyrsistent = ">=0.14.0" +setuptools = "*" six = ">=1.11.0" [package.extras] format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format_nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] +format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] [[package]] name = "junit-xml" @@ -900,6 +901,7 @@ pytz = "*" PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" responses = ">=0.9.0" +setuptools = {version = "*", optional = true, markers = "extra == \"server\""} sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} werkzeug = ">=0.5,<2.2.0" xmltodict = "*" @@ -1008,6 +1010,7 @@ python-versions = ">=3.7.0,<4.0.0" jsonschema = ">=3.2.0,<5.0.0" openapi-schema-validator = ">=0.2.0,<0.3.0" PyYAML = ">=5.1" +setuptools = "*" [package.extras] requests = ["requests"] @@ -1340,7 +1343,7 @@ urllib3 = ">=1.21.1,<1.27" [package.extras] socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "responses" @@ -1394,6 +1397,19 @@ python-versions = ">= 2.7" attrs = "*" pbr = "*" +[[package]] +name = "setuptools" +version = "65.5.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] 
+ [[package]] name = "six" version = "1.16.0" @@ -1460,6 +1476,14 @@ category = "main" optional = false python-versions = ">=3.7,<4.0" +[[package]] +name = "types-toml" +version = "0.10.8" +description = "Typing stubs for toml" +category = "dev" +optional = false +python-versions = "*" + [[package]] name = "types-urllib3" version = "1.26.17" @@ -1544,7 +1568,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "ead1495454ee6d880bb240447025db93a25ebe263c2709de5f144cc2d85dc975" +content-hash = "17cdbfe90f1b06dffaf24c3e076384ec08dd4a2dce5a05e50565f7364932eb2d" [metadata.files] aiopg = [ @@ -2182,6 +2206,10 @@ sarif-om = [ {file = "sarif_om-1.0.4-py3-none-any.whl", hash = "sha256:539ef47a662329b1c8502388ad92457425e95dc0aaaf995fe46f4984c4771911"}, {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"}, ] +setuptools = [ + {file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"}, + {file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"}, +] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -2210,6 +2238,10 @@ types-s3transfer = [ {file = "types-s3transfer-0.6.0.post3.tar.gz", hash = "sha256:92c3704e5d041202bfb5ddb79d083fd1a02de2c5dfec6a91576823e6b5c93993"}, {file = "types_s3transfer-0.6.0.post3-py3-none-any.whl", hash = "sha256:eedc5117275565b3c83662c0ccc81662a34da5dda8bd502b89d296b6d5cb091d"}, ] +types-toml = [ + {file = "types-toml-0.10.8.tar.gz", hash = "sha256:b7e7ea572308b1030dc86c3ba825c5210814c2825612ec679eb7814f8dd9295a"}, + {file = "types_toml-0.10.8-py3-none-any.whl", hash = "sha256:8300fd093e5829eb9c1fba69cee38130347d4b74ddf32d0a7df650ae55c2b599"}, +] types-urllib3 = [ {file = "types-urllib3-1.26.17.tar.gz", hash = "sha256:73fd274524c3fc7cd8cd9ceb0cb67ed99b45f9cb2831013e46d50c1451044800"}, {file = "types_urllib3-1.26.17-py3-none-any.whl", hash = "sha256:0d027fcd27dbb3cb532453b4d977e05bc1e13aefd70519866af211b3003d895d"}, diff --git a/pyproject.toml b/pyproject.toml index 1ee6fbe6f4..765e0b97eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,12 +28,14 @@ Werkzeug = "2.1.2" pytest-order = "^1.0.1" allure-pytest = "^2.10.0" pytest-asyncio = "^0.19.0" +toml = "^0.10.2" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" mypy = "==0.971" black = "^22.6.0" isort = "^5.10.1" +types-toml = "^0.10.8" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4b2638bb2a..38a0db7cf7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -970,7 +970,7 @@ class NeonPageserverApiException(Exception): class NeonPageserverHttpClient(requests.Session): - def __init__(self, port: int, is_testing_enabled_or_skip, auth_token: Optional[str] = None): + def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None): super().__init__() self.port = port self.auth_token = auth_token diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py new file mode 100644 index 0000000000..944ff64390 --- /dev/null +++ 
b/test_runner/regress/test_compatibility.py
@@ -0,0 +1,267 @@
+import os
+import re
+import shutil
+import subprocess
+from pathlib import Path
+from typing import Any, Dict, Union
+
+import pytest
+import toml
+from fixtures.neon_fixtures import (
+    NeonCli,
+    NeonEnvBuilder,
+    NeonPageserverHttpClient,
+    PgBin,
+    PortDistributor,
+    wait_for_last_record_lsn,
+    wait_for_upload,
+)
+from fixtures.types import Lsn
+from pytest import FixtureRequest
+
+
+def dump_differs(first: Path, second: Path, output: Path) -> bool:
+    """
+    Runs the diff(1) command on two SQL dumps and writes the output to the given output file.
+    Returns True if the dumps differ, False otherwise.
+    """
+
+    with output.open("w") as stdout:
+        rv = subprocess.run(
+            [
+                "diff",
+                "--unified",  # Make diff output more readable
+                "--ignore-matching-lines=^--",  # Ignore changes in comments
+                "--ignore-blank-lines",
+                str(first),
+                str(second),
+            ],
+            stdout=stdout,
+        )
+
+    return rv.returncode != 0
+
+
+class PortReplacer(object):
+    """
+    Helper class for replacing ports in config files.
+    """
+
+    def __init__(self, port_distributor: PortDistributor):
+        self.port_distributor = port_distributor
+        self.port_map: Dict[int, int] = {}
+
+    def replace_port(self, value: Union[int, str]) -> Union[int, str]:
+        if isinstance(value, int):
+            if (known_port := self.port_map.get(value)) is not None:
+                return known_port
+
+            self.port_map[value] = self.port_distributor.get_port()
+            return self.port_map[value]
+
+        if isinstance(value, str):
+            # Use regex to find the port in a string
+            # urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432"
+            # See https://bugs.python.org/issue27657
+            ports = re.findall(r":(\d+)(?:/|$)", value)
+            assert len(ports) == 1, f"can't find port in {value}"
+            port_int = int(ports[0])
+
+            if (known_port := self.port_map.get(port_int)) is not None:
+                return value.replace(f":{port_int}", f":{known_port}")
+
+            self.port_map[port_int] = self.port_distributor.get_port()
+            return value.replace(f":{port_int}", f":{self.port_map[port_int]}")
+
+        raise TypeError(f"unsupported type {type(value)} of {value=}")
+
+
+def test_backward_compatibility(
+    pg_bin: PgBin, port_distributor: PortDistributor, test_output_dir: Path, request: FixtureRequest
+):
+    compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR")
+    assert (
+        compatibility_snapshot_dir_env is not None
+    ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to the `compatibility_snapshot_pg14` path generated by test_prepare_snapshot"
+    compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve()
+
+    # Make the compatibility snapshot artifacts available to Allure
+    # by copying the snapshot directory to the current test output directory.
+ repo_dir = test_output_dir / "compatibility_snapshot" / "repo" + + shutil.copytree(compatibility_snapshot_dir / "repo", repo_dir) + + # Remove old logs to avoid confusion in test artifacts + for logfile in repo_dir.glob("**/*.log"): + logfile.unlink() + + # Remove tenants data for computes + for tenant in (repo_dir / "pgdatadirs" / "tenants").glob("*"): + shutil.rmtree(tenant) + + # Remove wal-redo temp directory + for tenant in (repo_dir / "tenants").glob("*"): + shutil.rmtree(tenant / "wal-redo-datadir.___temp") + + # Update paths and ports in config files + pr = PortReplacer(port_distributor) + + pageserver_toml = repo_dir / "pageserver.toml" + pageserver_config = toml.load(pageserver_toml) + new_local_path = pageserver_config["remote_storage"]["local_path"].replace( + "/test_prepare_snapshot/", + "/test_backward_compatibility/compatibility_snapshot/", + ) + + pageserver_config["remote_storage"]["local_path"] = new_local_path + pageserver_config["listen_http_addr"] = pr.replace_port(pageserver_config["listen_http_addr"]) + pageserver_config["listen_pg_addr"] = pr.replace_port(pageserver_config["listen_pg_addr"]) + pageserver_config["broker_endpoints"] = [ + pr.replace_port(ep) for ep in pageserver_config["broker_endpoints"] + ] + + with pageserver_toml.open("w") as f: + toml.dump(pageserver_config, f) + + snapshot_config_toml = repo_dir / "config" + snapshot_config = toml.load(snapshot_config_toml) + snapshot_config["etcd_broker"]["broker_endpoints"] = [ + pr.replace_port(ep) for ep in snapshot_config["etcd_broker"]["broker_endpoints"] + ] + snapshot_config["pageserver"]["listen_http_addr"] = pr.replace_port( + snapshot_config["pageserver"]["listen_http_addr"] + ) + snapshot_config["pageserver"]["listen_pg_addr"] = pr.replace_port( + snapshot_config["pageserver"]["listen_pg_addr"] + ) + for sk in snapshot_config["safekeepers"]: + sk["http_port"] = pr.replace_port(sk["http_port"]) + sk["pg_port"] = pr.replace_port(sk["pg_port"]) + + with (snapshot_config_toml).open("w") as f: + toml.dump(snapshot_config, f) + + # Ensure that snapshot doesn't contain references to the original path + rv = subprocess.run( + [ + "grep", + "--recursive", + "--binary-file=without-match", + "--files-with-matches", + "test_prepare_snapshot/repo", + str(repo_dir), + ], + capture_output=True, + text=True, + ) + assert ( + rv.returncode != 0 + ), f"there're files referencing `test_prepare_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" + + # NeonEnv stub to make NeonCli happy + config: Any = type("NeonEnvStub", (object,), {}) + config.rust_log_override = None + config.repo_dir = repo_dir + config.pg_version = "14" # Note: `pg_dumpall` (from pg_bin) version is set by DEFAULT_PG_VERSION_DEFAULT and can be overriden by DEFAULT_PG_VERSION env var + config.initial_tenant = snapshot_config["default_tenant_id"] + + # Check that we can start the project + cli = NeonCli(config) + try: + cli.raw_cli(["start"]) + request.addfinalizer(lambda: cli.raw_cli(["stop"])) + + result = cli.pg_start("main") + request.addfinalizer(lambda: cli.pg_stop("main")) + except Exception: + breaking_changes_allowed = ( + os.environ.get("ALLOW_BREAKING_CHANGES", "false").lower() == "true" + ) + if breaking_changes_allowed: + pytest.xfail("Breaking changes are allowed by ALLOW_BREAKING_CHANGES env var") + else: + raise + + connstr_all = re.findall(r"Starting postgres node at '([^']+)'", result.stdout) + assert len(connstr_all) == 1, f"can't parse connstr from {result.stdout}" + connstr = connstr_all[0] + + # Check 
that the project produces the same dump as the previous version. + # The assert itself deferred to the end of the test + # to allow us to perform checks that change data before failing + pg_bin.run(["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"]) + initial_dump_differs = dump_differs( + compatibility_snapshot_dir / "dump.sql", + test_output_dir / "dump.sql", + test_output_dir / "dump.filediff", + ) + + # Check that project can be recovered from WAL + # loosely based on https://github.com/neondatabase/cloud/wiki/Recovery-from-WAL + tenant_id = snapshot_config["default_tenant_id"] + timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] + pageserver_port = snapshot_config["pageserver"]["listen_http_addr"].split(":")[-1] + auth_token = snapshot_config["pageserver"]["auth_token"] + pageserver_http = NeonPageserverHttpClient( + port=pageserver_port, + is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled + auth_token=auth_token, + ) + + shutil.rmtree(repo_dir / "local_fs_remote_storage") + pageserver_http.timeline_delete(tenant_id, timeline_id) + pageserver_http.timeline_create(tenant_id, timeline_id) + pg_bin.run( + ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] + ) + # The assert itself deferred to the end of the test + # to allow us to perform checks that change data before failing + dump_from_wal_differs = dump_differs( + test_output_dir / "dump.sql", + test_output_dir / "dump-from-wal.sql", + test_output_dir / "dump-from-wal.filediff", + ) + + # Check that we can interract with the data + pg_bin.run(["pgbench", "--time=10", "--progress=2", connstr]) + + assert not dump_from_wal_differs, "dump from WAL differs" + assert not initial_dump_differs, "initial dump differs" + + +@pytest.mark.order(after="test_backward_compatibility") +# Note: if renaming this test, don't forget to update a reference to it in a workflow file: +# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml +def test_prepare_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path): + # The test doesn't really test anything + # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`. + # + # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it. 
+ neon_env_builder.pg_version = "14" + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_local_fs_remote_storage() + + env = neon_env_builder.init_start() + pg = env.postgres.create_start("main") + pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()]) + pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()]) + pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]) + + snapshot_config = toml.load(test_output_dir / "repo" / "config") + tenant_id = snapshot_config["default_tenant_id"] + timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] + + pageserver_http = env.pageserver.http_client() + lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) + wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) + + env.postgres.stop_all() + for sk in env.safekeepers: + sk.stop() + env.pageserver.stop() + + shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14") + # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it From 9fb2287f87a933e9b89b59c7e286560de641dbc6 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 25 Oct 2022 11:25:22 -0400 Subject: [PATCH 0956/1022] Add draw_timeline binary (#2688) --- Cargo.lock | 7 ++ Dockerfile | 3 +- pageserver/Cargo.toml | 1 + pageserver/src/bin/draw_timeline_dir.rs | 150 ++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 pageserver/src/bin/draw_timeline_dir.rs diff --git a/Cargo.lock b/Cargo.lock index 13774f7fe6..b39ca6e5a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2170,6 +2170,7 @@ dependencies = [ "serde_json", "serde_with", "signal-hook", + "svg_fmt", "tar", "tempfile", "thiserror", @@ -3461,6 +3462,12 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +[[package]] +name = "svg_fmt" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" + [[package]] name = "symbolic-common" version = "8.8.0" diff --git a/Dockerfile b/Dockerfile index cb4e213687..b0d934d480 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,7 @@ COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. 
RUN set -e \ -&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin safekeeper --bin proxy --locked --release \ +&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin proxy --locked --release \ && cachepot -s # Build final image @@ -65,6 +65,7 @@ RUN set -e \ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 2139e24ee2..b075b86aa1 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -67,6 +67,7 @@ remote_storage = { path = "../libs/remote_storage" } workspace_hack = { version = "0.1", path = "../workspace_hack" } close_fds = "0.3.2" walkdir = "2.3.2" +svg_fmt = "0.4.1" [dev-dependencies] criterion = "0.4" diff --git a/pageserver/src/bin/draw_timeline_dir.rs b/pageserver/src/bin/draw_timeline_dir.rs new file mode 100644 index 0000000000..ea1ff7f3c7 --- /dev/null +++ b/pageserver/src/bin/draw_timeline_dir.rs @@ -0,0 +1,150 @@ +//! A tool for visualizing the arrangement of layerfiles within a timeline. +//! +//! It reads filenames from stdin and prints a svg on stdout. The image is a plot in +//! page-lsn space, where every delta layer is a rectangle and every image layer is a +//! thick line. Legend: +//! - The x axis (left to right) represents page index. +//! - The y axis represents LSN, growing upwards. +//! +//! Coordinates in both axis are compressed for better readability. +//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb) +//! +//! Example use: +//! ``` +//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE +//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg +//! $ firefox out.svg +//! ``` +//! +//! This API was chosen so that we can easily work with filenames extracted from ssh, +//! or from pageserver log files. +//! +//! TODO Consider shipping this as a grafana panel plugin: +//! https://grafana.com/tutorials/build-a-panel-plugin/ +use anyhow::Result; +use pageserver::repository::Key; +use std::cmp::Ordering; +use std::io::{self, BufRead}; +use std::{ + collections::{BTreeMap, BTreeSet}, + ops::Range, +}; +use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke}; +use utils::{lsn::Lsn, project_git_version}; + +project_git_version!(GIT_VERSION); + +// Map values to their compressed coordinate - the index the value +// would have in a sorted and deduplicated list of all values. 
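+// For example (illustrative values, not from a real timeline): the input
+// [30, 10, 30, 20] deduplicates and sorts to [10, 20, 30], so the resulting
+// map is {10: 0, 20: 1, 30: 2}.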
+fn build_coordinate_compression_map(coords: Vec) -> BTreeMap { + let set: BTreeSet = coords.into_iter().collect(); + + let mut map: BTreeMap = BTreeMap::new(); + for (i, e) in set.iter().enumerate() { + map.insert(*e, i); + } + + map +} + +fn parse_filename(name: &str) -> (Range, Range) { + let split: Vec<&str> = name.split("__").collect(); + let keys: Vec<&str> = split[0].split('-').collect(); + let mut lsns: Vec<&str> = split[1].split('-').collect(); + if lsns.len() == 1 { + lsns.push(lsns[0]); + } + + let keys = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap(); + let lsns = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap(); + (keys, lsns) +} + +fn main() -> Result<()> { + // Parse layer filenames from stdin + let mut ranges: Vec<(Range, Range)> = vec![]; + let stdin = io::stdin(); + for line in stdin.lock().lines() { + let range = parse_filename(&line.unwrap()); + ranges.push(range); + } + + // Collect all coordinates + let mut keys: Vec = vec![]; + let mut lsns: Vec = vec![]; + for (keyr, lsnr) in &ranges { + keys.push(keyr.start); + keys.push(keyr.end); + lsns.push(lsnr.start); + lsns.push(lsnr.end); + } + + // Analyze + let key_map = build_coordinate_compression_map(keys); + let lsn_map = build_coordinate_compression_map(lsns); + + // Initialize stats + let mut num_deltas = 0; + let mut num_images = 0; + + // Draw + let stretch = 3.0; // Stretch out vertically for better visibility + println!( + "{}", + BeginSvg { + w: key_map.len() as f32, + h: stretch * lsn_map.len() as f32 + } + ); + for (keyr, lsnr) in &ranges { + let key_start = *key_map.get(&keyr.start).unwrap(); + let key_end = *key_map.get(&keyr.end).unwrap(); + let key_diff = key_end - key_start; + let lsn_max = lsn_map.len(); + + if key_start >= key_end { + panic!("Invalid key range {}-{}", key_start, key_end); + } + + let lsn_start = *lsn_map.get(&lsnr.start).unwrap(); + let lsn_end = *lsn_map.get(&lsnr.end).unwrap(); + + let mut lsn_diff = (lsn_end - lsn_start) as f32; + let mut fill = Fill::None; + let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas + let mut lsn_offset = 0.0; + + // Fill in and thicken rectangle if it's an + // image layer so that we can see it. 
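+        // An image layer covers a single LSN (its filename has lsn_start == lsn_end),
+        // so its rectangle would otherwise collapse to zero height; delta layers span
+        // an LSN range and keep their natural height.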
+ match lsn_start.cmp(&lsn_end) { + Ordering::Less => num_deltas += 1, + Ordering::Equal => { + num_images += 1; + lsn_diff = 0.3; + lsn_offset = -lsn_diff / 2.0; + margin = 0.05; + fill = Fill::Color(rgb(0, 0, 0)); + } + Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), + } + + println!( + " {}", + rectangle( + key_start as f32 + stretch * margin, + stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)), + key_diff as f32 - stretch * 2.0 * margin, + stretch * (lsn_diff - 2.0 * margin) + ) + .fill(fill) + .stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) + .border_radius(0.4) + ); + } + println!("{}", EndSvg); + + eprintln!("num_images: {}", num_images); + eprintln!("num_deltas: {}", num_deltas); + + Ok(()) +} From a3cb8c11e067aac0efe637f4095863eba0361822 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 26 Oct 2022 02:51:23 +0300 Subject: [PATCH 0957/1022] Do not release to new staging proxies on release (#2685) --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 660f93b025..1b8b380179 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -832,7 +832,7 @@ jobs: # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + (github.ref_name == 'main') && github.event_name != 'workflow_dispatch' defaults: run: From 259a5f356e036b5cf274cd2222884434dd8c55f0 Mon Sep 17 00:00:00 2001 From: mikecaat <35882227+mikecaat@users.noreply.github.com> Date: Wed, 26 Oct 2022 19:59:25 +0900 Subject: [PATCH 0958/1022] Add a docker-compose example file (#1943) (#2666) Co-authored-by: Masahiro Ikeda --- docker-compose/compute/shell/compute.sh | 48 +++++ .../compute/var/db/postgres/specs/spec.json | 141 ++++++++++++ docker-compose/docker-compose.yml | 200 ++++++++++++++++++ docker-compose/image/compute/Dockerfile | 10 + docs/docker.md | 64 ++++++ scripts/docker-compose_test.sh | 51 +++++ 6 files changed, 514 insertions(+) create mode 100755 docker-compose/compute/shell/compute.sh create mode 100644 docker-compose/compute/var/db/postgres/specs/spec.json create mode 100644 docker-compose/docker-compose.yml create mode 100644 docker-compose/image/compute/Dockerfile create mode 100755 scripts/docker-compose_test.sh diff --git a/docker-compose/compute/shell/compute.sh b/docker-compose/compute/shell/compute.sh new file mode 100755 index 0000000000..cef2b485f3 --- /dev/null +++ b/docker-compose/compute/shell/compute.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -eux + +PG_VERSION=${PG_VERSION:-14} + +SPEC_FILE_ORG=/var/db/postgres/specs/spec.json +SPEC_FILE=/tmp/spec.json + +echo "Waiting pageserver become ready." +while ! nc -z pageserver 6400; do + sleep 1; +done +echo "Page server is ready." + +echo "Create a tenant and timeline" +PARAMS=( + -sb + -X POST + -H "Content-Type: application/json" + -d "{}" + http://pageserver:9898/v1/tenant/ +) +tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g') + +PARAMS=( + -sb + -X POST + -H "Content-Type: application/json" + -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" +) +result=$(curl "${PARAMS[@]}") +echo $result | jq . 
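+# The timeline creation response above carries the generated tenant_id and
+# timeline_id; they are extracted below and substituted into the compute spec file.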
+ +echo "Overwrite tenant id and timeline id in spec file" +tenant_id=$(echo ${result} | jq -r .tenant_id) +timeline_id=$(echo ${result} | jq -r .timeline_id) + +sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE} +sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} + +cat ${SPEC_FILE} + +echo "Start compute node" +/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ + -C "postgresql://cloud_admin@localhost:55433/postgres" \ + -b /usr/local/bin/postgres \ + -S ${SPEC_FILE} diff --git a/docker-compose/compute/var/db/postgres/specs/spec.json b/docker-compose/compute/var/db/postgres/specs/spec.json new file mode 100644 index 0000000000..10ae0b0ecf --- /dev/null +++ b/docker-compose/compute/var/db/postgres/specs/spec.json @@ -0,0 +1,141 @@ +{ + "format_version": 1.0, + + "timestamp": "2022-10-12T18:00:00.000Z", + "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", + + "cluster": { + "cluster_id": "docker_compose", + "name": "docker_compose_test", + "state": "restarted", + "roles": [ + { + "name": "cloud_admin", + "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8", + "options": null + } + ], + "databases": [ + ], + "settings": [ + { + "name": "fsync", + "value": "off", + "vartype": "bool" + }, + { + "name": "wal_level", + "value": "replica", + "vartype": "enum" + }, + { + "name": "hot_standby", + "value": "on", + "vartype": "bool" + }, + { + "name": "wal_log_hints", + "value": "on", + "vartype": "bool" + }, + { + "name": "log_connections", + "value": "on", + "vartype": "bool" + }, + { + "name": "port", + "value": "55433", + "vartype": "integer" + }, + { + "name": "shared_buffers", + "value": "1MB", + "vartype": "string" + }, + { + "name": "max_connections", + "value": "100", + "vartype": "integer" + }, + { + "name": "listen_addresses", + "value": "0.0.0.0", + "vartype": "string" + }, + { + "name": "max_wal_senders", + "value": "10", + "vartype": "integer" + }, + { + "name": "max_replication_slots", + "value": "10", + "vartype": "integer" + }, + { + "name": "wal_sender_timeout", + "value": "5s", + "vartype": "string" + }, + { + "name": "wal_keep_size", + "value": "0", + "vartype": "integer" + }, + { + "name": "password_encryption", + "value": "md5", + "vartype": "enum" + }, + { + "name": "restart_after_crash", + "value": "off", + "vartype": "bool" + }, + { + "name": "synchronous_standby_names", + "value": "walproposer", + "vartype": "string" + }, + { + "name": "shared_preload_libraries", + "value": "neon", + "vartype": "string" + }, + { + "name": "neon.safekeepers", + "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454", + "vartype": "string" + }, + { + "name": "neon.timeline_id", + "value": "TIMELINE_ID", + "vartype": "string" + }, + { + "name": "neon.tenant_id", + "value": "TENANT_ID", + "vartype": "string" + }, + { + "name": "neon.pageserver_connstring", + "value": "host=pageserver port=6400", + "vartype": "string" + }, + { + "name": "max_replication_write_lag", + "value": "500MB", + "vartype": "string" + }, + { + "name": "max_replication_flush_lag", + "value": "10GB", + "vartype": "string" + } + ] + }, + + "delta_operations": [ + ] +} diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml new file mode 100644 index 0000000000..9ab775c3f9 --- /dev/null +++ b/docker-compose/docker-compose.yml @@ -0,0 +1,200 @@ +version: '3' + +services: + etcd: + image: quay.io/coreos/etcd:v3.5.4 + ports: + - 2379:2379 + - 2380:2380 + environment: + # This signifficantly speeds up etcd and we anyway don't data persistency there. 
+ ETCD_UNSAFE_NO_FSYNC: "1" + command: + - "etcd" + - "--auto-compaction-mode=revision" + - "--auto-compaction-retention=1" + - "--name=etcd-cluster" + - "--initial-cluster-state=new" + - "--initial-cluster-token=etcd-cluster-1" + - "--initial-cluster=etcd-cluster=http://etcd:2380" + - "--initial-advertise-peer-urls=http://etcd:2380" + - "--advertise-client-urls=http://etcd:2379" + - "--listen-client-urls=http://0.0.0.0:2379" + - "--listen-peer-urls=http://0.0.0.0:2380" + - "--quota-backend-bytes=134217728" # 128 MB + + minio: + image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z + ports: + - 9000:9000 + - 9001:9001 + environment: + - MINIO_ROOT_USER=minio + - MINIO_ROOT_PASSWORD=password + command: server /data --address :9000 --console-address ":9001" + + minio_create_buckets: + image: minio/mc + environment: + - MINIO_ROOT_USER=minio + - MINIO_ROOT_PASSWORD=password + entrypoint: + - "/bin/sh" + - "-c" + command: + - "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do + echo 'Waiting to start minio...' && sleep 1; + done; + /usr/bin/mc mb minio/neon --region=eu-north-1; + exit 0;" + depends_on: + - minio + + pageserver: + image: neondatabase/neon:${TAG:-latest} + environment: + - BROKER_ENDPOINT='http://etcd:2379' + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 6400:6400 # pg protocol handler + - 9898:9898 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "/usr/local/bin/pageserver -D /data/.neon/ + -c \"broker_endpoints=[$$BROKER_ENDPOINT]\" + -c \"listen_pg_addr='0.0.0.0:6400'\" + -c \"listen_http_addr='0.0.0.0:9898'\" + -c \"remote_storage={endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/pageserver/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper1: + image: neondatabase/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454 + - SAFEKEEPER_ID=1 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 5454:5454 # pg protocol handler + - 7676:7676 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL + --listen-http='0.0.0.0:7676' + --id=$$SAFEKEEPER_ID + --broker-endpoints=$$BROKER_ENDPOINT + -D /data + --remote-storage=\"{endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/safekeeper/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper2: + image: neondatabase/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454 + - SAFEKEEPER_ID=2 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 5454:5454 # pg protocol handler + - 7677:7676 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL + --listen-http='0.0.0.0:7676' + --id=$$SAFEKEEPER_ID + --broker-endpoints=$$BROKER_ENDPOINT + -D /data + --remote-storage=\"{endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/safekeeper/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper3: + image: neondatabase/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454 + - SAFEKEEPER_ID=3 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - 
AWS_SECRET_ACCESS_KEY=password
+      #- RUST_BACKTRACE=1
+    ports:
+      #- 5454:5454 # pg protocol handler
+      - 7678:7676 # http endpoints
+    entrypoint:
+      - "/bin/sh"
+      - "-c"
+    command:
+      - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
+        --listen-http='0.0.0.0:7676'
+        --id=$$SAFEKEEPER_ID
+        --broker-endpoints=$$BROKER_ENDPOINT
+        -D /data
+        --remote-storage=\"{endpoint='http://minio:9000',
+        bucket_name='neon',
+        bucket_region='eu-north-1',
+        prefix_in_bucket='/safekeeper/'}\""
+    depends_on:
+      - etcd
+      - minio_create_buckets
+
+  compute:
+    build:
+      context: ./image/compute
+      args:
+        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest}
+        - http_proxy=$http_proxy
+        - https_proxy=$https_proxy
+    environment:
+      - PG_VERSION=${PG_VERSION:-14}
+      #- RUST_BACKTRACE=1
+    volumes:
+      - ./compute/var/db/postgres/specs/:/var/db/postgres/specs/
+      - ./compute/shell/:/shell/
+    ports:
+      - 55433:55433 # pg protocol handler
+      - 3080:3080 # http endpoints
+    entrypoint:
+      - "/shell/compute.sh"
+    depends_on:
+      - safekeeper1
+      - safekeeper2
+      - safekeeper3
+      - pageserver
+
+  compute_is_ready:
+    image: postgres:latest
+    entrypoint:
+      - "/bin/bash"
+      - "-c"
+    command:
+      - "until pg_isready -h compute -p 55433 ; do
+         echo 'Waiting to start compute...' && sleep 1;
+         done"
+    depends_on:
+      - compute
diff --git a/docker-compose/image/compute/Dockerfile b/docker-compose/image/compute/Dockerfile
new file mode 100644
index 0000000000..1b9d8c4900
--- /dev/null
+++ b/docker-compose/image/compute/Dockerfile
@@ -0,0 +1,10 @@
+ARG COMPUTE_IMAGE=compute-node-v14:latest
+FROM neondatabase/${COMPUTE_IMAGE}
+
+USER root
+RUN apt-get update && \
+    apt-get install -y curl \
+    jq \
+    netcat
+
+USER postgres
diff --git a/docs/docker.md b/docs/docker.md
index 100cdd248b..42f0048e6f 100644
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -18,3 +18,67 @@ We build all images after a successful `release` tests run and push automaticall
 1. `neondatabase/compute-tools` and `neondatabase/compute-node`
 2. `neondatabase/neon`
+
+## Docker Compose example
+
+You can see a [docker compose](https://docs.docker.com/compose/) example that creates a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following containers.
+
+- etcd x 1
+- pageserver x 1
+- safekeeper x 3
+- compute x 1
+- MinIO x 1        # Amazon S3-compatible object storage
+
+### How to use
+
+1. Create the containers
+
+You can specify the version of the neon cluster using the following environment variables.
+- PG_VERSION: Postgres version for the compute (default is 14)
+- TAG: the tag of the [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in the [CI test](/.github/workflows/build_and_test.yml)
+```
+$ cd docker-compose/
+$ docker-compose down   # remove the containers if they exist
+$ PG_VERSION=15 TAG=2221 docker-compose up --build -d  # You can specify the postgres and image version
+Creating network "dockercompose_default" with the default driver
+Creating dockercompose_etcd3_1 ...
+(...omit...)
+```
+
+2. Connect to the compute node
+```
+$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
+$ psql -h localhost -p 55433 -U cloud_admin
+postgres=# CREATE TABLE t(key int primary key, value text);
+CREATE TABLE
+postgres=# insert into t values(1,1);
+INSERT 0 1
+postgres=# select * from t;
+ key | value
+-----+-------
+   1 | 1
+(1 row)
+```
+
+3. If you want to see the logs, use the `docker-compose logs` command.
+``` +# check the container name you want to see +$ docker ps +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1 +(...omit...) + +$ docker logs -f dockercompose_compute_1 +2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql +2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400' +(...omit...) +``` + +4. If you want to see durable data in MinIO which is s3 compatible storage + +Access http://localhost:9001 and sign in. + +- Username: `minio` +- Password: `password` + +You can see durable pages and WAL data in `neon` bucket. \ No newline at end of file diff --git a/scripts/docker-compose_test.sh b/scripts/docker-compose_test.sh new file mode 100755 index 0000000000..b4551365f8 --- /dev/null +++ b/scripts/docker-compose_test.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# this is a shortcut script to avoid duplication in CI +set -eux -o pipefail + +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +COMPOSE_FILE=$SCRIPT_DIR/../docker-compose/docker-compose.yml + +COMPUTE_CONTAINER_NAME=dockercompose_compute_1 +SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;" +PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres" + +cleanup() { + echo "show container information" + docker ps + docker-compose -f $COMPOSE_FILE logs + echo "stop containers..." + docker-compose -f $COMPOSE_FILE down +} + +echo "clean up containers if exists" +cleanup + +for pg_version in 14 15; do + echo "start containers (pg_version=$pg_version)." + PG_VERSION=$pg_version TAG=latest docker-compose -f $COMPOSE_FILE up --build -d + + echo "wait until the compute is ready. timeout after 60s. " + cnt=0 + while sleep 1; do + # check timeout + cnt=`expr $cnt + 1` + if [ $cnt -gt 60 ]; then + echo "timeout before the compute is ready." + cleanup + exit 1 + fi + + # check if the compute is ready + set +o pipefail + result=`docker-compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l` + set -o pipefail + if [ $result -eq 1 ]; then + echo "OK. The compute is ready to connect." + echo "execute simple queries." 
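+            # Run the smoke-test SQL defined in $SQL above through psql inside the compute container.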
+ docker exec -it $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION" + cleanup + break + fi + done +done From 0c54eb65fbde98aae61b7d8a167c451ab5d62285 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 26 Oct 2022 17:32:31 -0400 Subject: [PATCH 0959/1022] Move pagestream api to libs/pageserver_api (#2698) --- Cargo.lock | 3 + libs/pageserver_api/Cargo.toml | 3 + libs/pageserver_api/src/lib.rs | 1 + libs/pageserver_api/src/models.rs | 161 +++++++++++++++++ .../pageserver_api}/src/reltag.rs | 0 pageserver/src/basebackup.rs | 2 +- pageserver/src/import_datadir.rs | 2 +- pageserver/src/lib.rs | 1 - pageserver/src/page_service.rs | 166 +----------------- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/walingest.rs | 2 +- pageserver/src/walredo.rs | 2 +- 13 files changed, 181 insertions(+), 166 deletions(-) rename {pageserver => libs/pageserver_api}/src/reltag.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index b39ca6e5a7..3e67126add 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2189,7 +2189,10 @@ dependencies = [ name = "pageserver_api" version = "0.1.0" dependencies = [ + "anyhow", + "bytes", "const_format", + "postgres_ffi", "serde", "serde_with", "utils", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 5995325a2f..9121cd4989 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -7,6 +7,9 @@ edition = "2021" serde = { version = "1.0", features = ["derive"] } serde_with = "2.0" const_format = "0.2.21" +anyhow = { version = "1.0", features = ["backtrace"] } +bytes = "1.0.1" utils = { path = "../utils" } +postgres_ffi = { path = "../postgres_ffi" } workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index a36c1692a9..4890d54f36 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -2,6 +2,7 @@ use const_format::formatcp; /// Public API types pub mod models; +pub mod reltag; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index dd40ba9e1c..4360f76fd1 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -7,6 +7,10 @@ use utils::{ lsn::Lsn, }; +use crate::reltag::RelTag; +use anyhow::bail; +use bytes::{Buf, BufMut, Bytes, BytesMut}; + /// A state of a tenant in pageserver's memory. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TenantState { @@ -219,3 +223,160 @@ pub struct FailpointConfig { pub struct TimelineGcRequest { pub gc_horizon: Option, } + +// Wrapped in libpq CopyData +pub enum PagestreamFeMessage { + Exists(PagestreamExistsRequest), + Nblocks(PagestreamNblocksRequest), + GetPage(PagestreamGetPageRequest), + DbSize(PagestreamDbSizeRequest), +} + +// Wrapped in libpq CopyData +pub enum PagestreamBeMessage { + Exists(PagestreamExistsResponse), + Nblocks(PagestreamNblocksResponse), + GetPage(PagestreamGetPageResponse), + Error(PagestreamErrorResponse), + DbSize(PagestreamDbSizeResponse), +} + +#[derive(Debug)] +pub struct PagestreamExistsRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, +} + +#[derive(Debug)] +pub struct PagestreamNblocksRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, +} + +#[derive(Debug)] +pub struct PagestreamGetPageRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, + pub blkno: u32, +} + +#[derive(Debug)] +pub struct PagestreamDbSizeRequest { + pub latest: bool, + pub lsn: Lsn, + pub dbnode: u32, +} + +#[derive(Debug)] +pub struct PagestreamExistsResponse { + pub exists: bool, +} + +#[derive(Debug)] +pub struct PagestreamNblocksResponse { + pub n_blocks: u32, +} + +#[derive(Debug)] +pub struct PagestreamGetPageResponse { + pub page: Bytes, +} + +#[derive(Debug)] +pub struct PagestreamErrorResponse { + pub message: String, +} + +#[derive(Debug)] +pub struct PagestreamDbSizeResponse { + pub db_size: i64, +} + +impl PagestreamFeMessage { + pub fn parse(mut body: Bytes) -> anyhow::Result { + // TODO these gets can fail + + // these correspond to the NeonMessageTag enum in pagestore_client.h + // + // TODO: consider using protobuf or serde bincode for less error prone + // serialization. 
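+        // Wire format: a single tag byte followed by fixed-width big-endian fields
+        // (bytes::Buf::get_u32/get_u64 read big-endian), matching the encoding in
+        // pagestore_client.h on the compute side.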
+ let msg_tag = body.get_u8(); + match msg_tag { + 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + rel: RelTag { + spcnode: body.get_u32(), + dbnode: body.get_u32(), + relnode: body.get_u32(), + forknum: body.get_u8(), + }, + })), + 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + rel: RelTag { + spcnode: body.get_u32(), + dbnode: body.get_u32(), + relnode: body.get_u32(), + forknum: body.get_u8(), + }, + })), + 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + rel: RelTag { + spcnode: body.get_u32(), + dbnode: body.get_u32(), + relnode: body.get_u32(), + forknum: body.get_u8(), + }, + blkno: body.get_u32(), + })), + 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + dbnode: body.get_u32(), + })), + _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body), + } + } +} + +impl PagestreamBeMessage { + pub fn serialize(&self) -> Bytes { + let mut bytes = BytesMut::new(); + + match self { + Self::Exists(resp) => { + bytes.put_u8(100); /* tag from pagestore_client.h */ + bytes.put_u8(resp.exists as u8); + } + + Self::Nblocks(resp) => { + bytes.put_u8(101); /* tag from pagestore_client.h */ + bytes.put_u32(resp.n_blocks); + } + + Self::GetPage(resp) => { + bytes.put_u8(102); /* tag from pagestore_client.h */ + bytes.put(&resp.page[..]); + } + + Self::Error(resp) => { + bytes.put_u8(103); /* tag from pagestore_client.h */ + bytes.put(resp.message.as_bytes()); + bytes.put_u8(0); // null terminator + } + Self::DbSize(resp) => { + bytes.put_u8(104); /* tag from pagestore_client.h */ + bytes.put_i64(resp.db_size); + } + } + + bytes.into() + } +} diff --git a/pageserver/src/reltag.rs b/libs/pageserver_api/src/reltag.rs similarity index 100% rename from pageserver/src/reltag.rs rename to libs/pageserver_api/src/reltag.rs diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index d0a57a473b..973c3cd3a6 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -22,8 +22,8 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; -use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index ee3dc684e3..642e41765b 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -12,10 +12,10 @@ use tracing::*; use walkdir::WalkDir; use crate::pgdatadir_mapping::*; -use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::*; use postgres_ffi::waldecoder::WalStreamDecoder; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c75f940386..52a4cb0381 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -8,7 +8,6 @@ pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; pub mod profiling; -pub mod reltag; pub mod repository; pub mod storage_sync; pub mod task_mgr; diff --git 
a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d61885314e..aec91bc7f1 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -10,8 +10,14 @@ // use anyhow::{bail, ensure, Context, Result}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; +use bytes::Bytes; use futures::{Stream, StreamExt}; +use pageserver_api::models::{ + PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, + PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, + PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, + PagestreamNblocksRequest, PagestreamNblocksResponse, +}; use std::io; use std::net::TcpListener; use std::str; @@ -35,7 +41,6 @@ use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::profiling::profpoint_start; -use crate::reltag::RelTag; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::Timeline; @@ -45,163 +50,6 @@ use crate::CheckpointConfig; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; -// Wrapped in libpq CopyData -enum PagestreamFeMessage { - Exists(PagestreamExistsRequest), - Nblocks(PagestreamNblocksRequest), - GetPage(PagestreamGetPageRequest), - DbSize(PagestreamDbSizeRequest), -} - -// Wrapped in libpq CopyData -enum PagestreamBeMessage { - Exists(PagestreamExistsResponse), - Nblocks(PagestreamNblocksResponse), - GetPage(PagestreamGetPageResponse), - Error(PagestreamErrorResponse), - DbSize(PagestreamDbSizeResponse), -} - -#[derive(Debug)] -struct PagestreamExistsRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, -} - -#[derive(Debug)] -struct PagestreamNblocksRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, -} - -#[derive(Debug)] -struct PagestreamGetPageRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, - blkno: u32, -} - -#[derive(Debug)] -struct PagestreamDbSizeRequest { - latest: bool, - lsn: Lsn, - dbnode: u32, -} - -#[derive(Debug)] -struct PagestreamExistsResponse { - exists: bool, -} - -#[derive(Debug)] -struct PagestreamNblocksResponse { - n_blocks: u32, -} - -#[derive(Debug)] -struct PagestreamGetPageResponse { - page: Bytes, -} - -#[derive(Debug)] -struct PagestreamErrorResponse { - message: String, -} - -#[derive(Debug)] -struct PagestreamDbSizeResponse { - db_size: i64, -} - -impl PagestreamFeMessage { - fn parse(mut body: Bytes) -> anyhow::Result { - // TODO these gets can fail - - // these correspond to the NeonMessageTag enum in pagestore_client.h - // - // TODO: consider using protobuf or serde bincode for less error prone - // serialization. 
- let msg_tag = body.get_u8(); - match msg_tag { - 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - })), - 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - })), - 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - blkno: body.get_u32(), - })), - 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - dbnode: body.get_u32(), - })), - _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body), - } - } -} - -impl PagestreamBeMessage { - fn serialize(&self) -> Bytes { - let mut bytes = BytesMut::new(); - - match self { - Self::Exists(resp) => { - bytes.put_u8(100); /* tag from pagestore_client.h */ - bytes.put_u8(resp.exists as u8); - } - - Self::Nblocks(resp) => { - bytes.put_u8(101); /* tag from pagestore_client.h */ - bytes.put_u32(resp.n_blocks); - } - - Self::GetPage(resp) => { - bytes.put_u8(102); /* tag from pagestore_client.h */ - bytes.put(&resp.page[..]); - } - - Self::Error(resp) => { - bytes.put_u8(103); /* tag from pagestore_client.h */ - bytes.put(resp.message.as_bytes()); - bytes.put_u8(0); // null terminator - } - Self::DbSize(resp) => { - bytes.put_u8(104); /* tag from pagestore_client.h */ - bytes.put_i64(resp.db_size); - } - } - - bytes.into() - } -} - fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream> + '_ { async_stream::try_stream! { loop { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ca931ed37d..0e334a63df 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,12 +7,12 @@ //! Clarify that) //! 
use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::reltag::{RelTag, SlruKind}; use crate::repository::*; use crate::tenant::Timeline; use crate::walrecord::NeonWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; use postgres_ffi::{Oid, TimestampTz, TransactionId}; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 194ca0d857..6a96254df4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -37,8 +37,8 @@ use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::BlockNumber; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; -use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; +use pageserver_api::reltag::RelTag; use postgres_ffi::to_pg_timestamp; use utils::{ diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9a6b99d991..8c81ed824b 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -31,10 +31,10 @@ use bytes::{Buf, Bytes, BytesMut}; use tracing::*; use crate::pgdatadir_mapping::*; -use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walrecord::*; use crate::ZERO_PAGE; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index e683c301d8..1cde11082e 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -43,10 +43,10 @@ use crate::metrics::{ WAL_REDO_WAIT_TIME, }; use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; -use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::NeonWalRecord; use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::v14::nonrelfile_utils::{ From 1f08ba5790dc19293434d1aca779125002ebe8bc Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Thu, 27 Oct 2022 03:50:46 +0300 Subject: [PATCH 0960/1022] Avoid debian-testing packages in compute Dockerfiles plv8 can only be built with a fairly new gold linker version. We used to install it via binutils packages from testing, but it also updates libc and that causes troubles in the resulting image as different extensions were built against different libc versions. We could either use libc from debian-testing everywhere or restrain from using testing packages and install necessary programs manually. This patch uses the latter approach: gold for plv8 and cmake for h3 are installed manually. In a passing declare h3_postgis as a safe extension (previous omission). 
--- Dockerfile.compute-node-v14 | 87 ++++++++++++++++++++++--------------- Dockerfile.compute-node-v15 | 74 ++++++++++++++++++------------- 2 files changed, 95 insertions(+), 66 deletions(-) diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index 6d2b285fa3..035dfc0d08 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -1,24 +1,26 @@ -ARG TAG=pinned -# apparently, ARGs don't get replaced in RUN commands in kaniko -# ARG POSTGIS_VERSION=3.3.0 -# ARG PLV8_VERSION=3.1.4 -# ARG PG_VERSION=v14 +# +# This file is identical to the Dockerfile.compute-node-v15 file +# except for the version of Postgres that is built. +# +ARG TAG=pinned + +######################################################################################### # # Layer "build-deps" # +######################################################################################### FROM debian:bullseye-slim AS build-deps -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update RUN apt update && \ - apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev + apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config +######################################################################################### # # Layer "pg-build" # Build Postgres from the neon postgres repository. # +######################################################################################### FROM build-deps AS pg-build COPY vendor/postgres-v14 postgres RUN cd postgres && \ @@ -29,22 +31,20 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install +######################################################################################### # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. # -# PostGIS compiles against neon postgres sources without changes. Perhaps we -# could even use the upstream binaries, compiled against vanilla Postgres, but -# it would require some investigation to check that it works, and also keeps -# working in the future. So for now, we compile our own binaries. 
+######################################################################################### FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ - tar xvzf postgis-3.3.0.tar.gz && \ - cd postgis-3.3.0 && \ +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ + tar xvzf postgis-3.3.1.tar.gz && \ + cd postgis-3.3.1 && \ ./autogen.sh && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ ./configure && \ @@ -57,19 +57,29 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control +######################################################################################### # # Layer "plv8-build" # Build plv8 # +######################################################################################### FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils -# https://github.com/plv8/plv8/issues/475 -# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary -RUN apt update && \ - apt install -y --no-install-recommends -t testing binutils +# https://github.com/plv8/plv8/issues/475: +# v8 uses gold for linking and sets `--thread-count=4` which breaks +# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) +# Install newer gold version manually as debian-testing binutils version updates +# libc version, which in turn breaks other extension built against non-testing libc. 
+RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ + tar xvzf binutils-2.38.tar.gz && \ + cd binutils-2.38 && \ + cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ + cd ../bfd && ./configure && make bfdver.h && \ + cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ + cp /usr/local/bin/ld.gold /usr/bin/gold # Sed is used to patch for https://github.com/plv8/plv8/issues/503 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ @@ -77,21 +87,25 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ cd plv8-3.1.4 && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +######################################################################################### # # Layer "h3-pg-build" # Build h3_pg # +######################################################################################### FROM build-deps AS h3-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # packaged cmake is too old -RUN apt update && \ - apt install -y --no-install-recommends -t testing cmake +RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ + -q -O /tmp/cmake-install.sh \ + && chmod u+x /tmp/cmake-install.sh \ + && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ + && rm /tmp/cmake-install.sh RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ tar xvzf h3.tgz && \ @@ -110,12 +124,15 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control +######################################################################################### # # Layer "neon-pg-ext-build" # compile neon extensions # +######################################################################################### FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -128,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ -C pgxn/neon \ -s install +######################################################################################### +# # Compile and run the Neon-specific `compute_ctl` binary +# +######################################################################################### FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto +######################################################################################### # # Clean up postgres folder before inclusion # +######################################################################################### FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql @@ -155,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src # if they were to be used by other libraries. RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### # # Final layer # Put it all together into the final image # +######################################################################################### FROM debian:bullseye-slim # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ @@ -175,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libreadline8 for psql # libossp-uuid16 for extension ossp-uuid # libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS -# GLIBC 2.34 for plv8. -# Debian bullseye provides GLIBC 2.31, so we install the library from testing # # Lastly, link compute_ctl into zenith_ctl while we're at it, # so that we don't need to put this in another layer. @@ -189,12 +212,6 @@ RUN apt update && \ libproj19 \ libprotobuf-c1 && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ - echo "Installing GLIBC 2.34" && \ - echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update && \ - apt install -y --no-install-recommends -t testing libc6 && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl USER postgres diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index b7b1f25103..0b6e570b44 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -4,26 +4,23 @@ # ARG TAG=pinned -# apparently, ARGs don't get replaced in RUN commands in kaniko -# ARG POSTGIS_VERSION=3.3.1 -# ARG PLV8_VERSION=3.1.4 -# ARG PG_VERSION=v15 +######################################################################################### # # Layer "build-deps" # +######################################################################################### FROM debian:bullseye-slim AS build-deps -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update RUN apt update && \ - apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev + apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config +######################################################################################### # # Layer "pg-build" # Build Postgres from the neon postgres repository. 
# +######################################################################################### FROM build-deps AS pg-build COPY vendor/postgres-v15 postgres RUN cd postgres && \ @@ -34,14 +31,12 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install +######################################################################################### # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. # -# PostGIS compiles against neon postgres sources without changes. Perhaps we -# could even use the upstream binaries, compiled against vanilla Postgres, but -# it would require some investigation to check that it works, and also keeps -# working in the future. So for now, we compile our own binaries. +######################################################################################### FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ @@ -62,19 +57,29 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control +######################################################################################### # # Layer "plv8-build" # Build plv8 # +######################################################################################### FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils -# https://github.com/plv8/plv8/issues/475 -# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary -RUN apt update && \ - apt install -y --no-install-recommends -t testing binutils +# https://github.com/plv8/plv8/issues/475: +# v8 uses gold for linking and sets `--thread-count=4` which breaks +# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) +# Install newer gold version manually as debian-testing binutils version updates +# libc version, which in turn breaks other extension built against non-testing libc. 
+RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ + tar xvzf binutils-2.38.tar.gz && \ + cd binutils-2.38 && \ + cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ + cd ../bfd && ./configure && make bfdver.h && \ + cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ + cp /usr/local/bin/ld.gold /usr/bin/gold # Sed is used to patch for https://github.com/plv8/plv8/issues/503 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ @@ -82,21 +87,25 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ cd plv8-3.1.4 && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +######################################################################################### # # Layer "h3-pg-build" # Build h3_pg # +######################################################################################### FROM build-deps AS h3-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # packaged cmake is too old -RUN apt update && \ - apt install -y --no-install-recommends -t testing cmake +RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ + -q -O /tmp/cmake-install.sh \ + && chmod u+x /tmp/cmake-install.sh \ + && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ + && rm /tmp/cmake-install.sh RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ tar xvzf h3.tgz && \ @@ -115,12 +124,15 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control +######################################################################################### # # Layer "neon-pg-ext-build" # compile neon extensions # +######################################################################################### FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -133,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ -C pgxn/neon \ -s install +######################################################################################### +# # Compile and run the Neon-specific `compute_ctl` binary +# +######################################################################################### FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto +######################################################################################### # # Clean up postgres folder before inclusion # +######################################################################################### FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql @@ -160,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src # if they were to be used by other libraries. RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### # # Final layer # Put it all together into the final image # +######################################################################################### FROM debian:bullseye-slim # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ @@ -180,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libreadline8 for psql # libossp-uuid16 for extension ossp-uuid # libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS -# GLIBC 2.34 for plv8. -# Debian bullseye provides GLIBC 2.31, so we install the library from testing # # Lastly, link compute_ctl into zenith_ctl while we're at it, # so that we don't need to put this in another layer. @@ -194,12 +212,6 @@ RUN apt update && \ libproj19 \ libprotobuf-c1 && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ - echo "Installing GLIBC 2.34" && \ - echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update && \ - apt install -y --no-install-recommends -t testing libc6 && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl USER postgres From b42bf9265ad7b28c0aa3186f9afc2c22e79533f6 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 27 Oct 2022 11:09:09 +0400 Subject: [PATCH 0961/1022] Enable etcd compaction in neon_local. --- control_plane/src/etcd.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs index ccadfa8ce7..ca2df8a50b 100644 --- a/control_plane/src/etcd.rs +++ b/control_plane/src/etcd.rs @@ -52,6 +52,10 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { // size smaller. Our test etcd clusters are very small. // See https://github.com/etcd-io/etcd/issues/7910 "--quota-backend-bytes=100000000".to_string(), + // etcd doesn't compact (vacuum) with default settings, + // enable it to prevent space exhaustion. 
+ "--auto-compaction-mode=revision".to_string(), + "--auto-compaction-retention=1".to_string(), ]) .stdout(Stdio::from(etcd_stdout_file)) .stderr(Stdio::from(etcd_stderr_file)) From 6dbf202e0df73ab033c458d815245e3dbea77f46 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Thu, 27 Oct 2022 16:00:40 +0200 Subject: [PATCH 0962/1022] Update crane copy target (#2704) Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1b8b380179..8d16e406ce 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -625,11 +625,11 @@ jobs: (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' run: | - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest - name: Configure Docker Hub login run: | From 78e412b84b71ff34e71c84e0150d9e23973dee4a Mon Sep 17 00:00:00 2001 From: Alexander Stanovoy <38102252+alexstanovoy@users.noreply.github.com> Date: Thu, 27 Oct 2022 17:02:55 +0300 Subject: [PATCH 0963/1022] The fix of #2650. (#2686) * Wrappers and drop implementations for image and delta layer writers. * Two regression tests for the image and delta layer files. 
--- pageserver/src/tenant/delta_layer.rs | 118 ++++++++++++++++-- pageserver/src/tenant/image_layer.rs | 93 ++++++++++++-- pageserver/src/tenant/timeline.rs | 9 ++ pageserver/src/virtual_file.rs | 6 + test_runner/fixtures/neon_fixtures.py | 2 + .../regress/test_layer_writers_fail.py | 92 ++++++++++++++ 6 files changed, 302 insertions(+), 18 deletions(-) create mode 100644 test_runner/regress/test_layer_writers_fail.py diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index 41715ab0a4..a908d66200 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -610,9 +610,9 @@ impl DeltaLayer { /// /// 3. Call `finish`. /// -pub struct DeltaLayerWriter { +struct DeltaLayerWriterInner { conf: &'static PageServerConf, - path: PathBuf, + pub path: PathBuf, timeline_id: TimelineId, tenant_id: TenantId, @@ -624,17 +624,17 @@ pub struct DeltaLayerWriter { blob_writer: WriteBlobWriter>, } -impl DeltaLayerWriter { +impl DeltaLayerWriterInner { /// /// Start building a new delta layer. /// - pub fn new( + fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_id: TenantId, key_start: Key, lsn_range: Range, - ) -> Result { + ) -> anyhow::Result { // Create the file initially with a temporary filename. We don't know // the end key yet, so we cannot form the final filename yet. We will // rename it when we're done. @@ -653,7 +653,7 @@ impl DeltaLayerWriter { let block_buf = BlockBuf::new(); let tree_builder = DiskBtreeBuilder::new(block_buf); - Ok(DeltaLayerWriter { + Ok(Self { conf, path, timeline_id, @@ -670,17 +670,17 @@ impl DeltaLayerWriter { /// /// The values must be appended in key, lsn order. /// - pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> { + fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init()) } - pub fn put_value_bytes( + fn put_value_bytes( &mut self, key: Key, lsn: Lsn, val: &[u8], will_init: bool, - ) -> Result<()> { + ) -> anyhow::Result<()> { assert!(self.lsn_range.start <= lsn); let off = self.blob_writer.write_blob(val)?; @@ -693,14 +693,14 @@ impl DeltaLayerWriter { Ok(()) } - pub fn size(&self) -> u64 { + fn size(&self) -> u64 { self.blob_writer.size() + self.tree.borrow_writer().size() } /// /// Finish writing the delta layer. /// - pub fn finish(self, key_end: Key) -> anyhow::Result { + fn finish(self, key_end: Key) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -768,6 +768,102 @@ impl DeltaLayerWriter { } } +/// A builder object for constructing a new delta layer. +/// +/// Usage: +/// +/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...) +/// +/// 2. Write the contents by calling `put_value` for every page +/// version to store in the layer. +/// +/// 3. Call `finish`. +/// +/// # Note +/// +/// As described in https://github.com/neondatabase/neon/issues/2650, it's +/// possible for the writer to drop before `finish` is actually called. So this +/// could lead to odd temporary files in the directory, exhausting file system. +/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop` +/// implementation that cleans up the temporary file in failure. It's not +/// possible to do this directly in `DeltaLayerWriterInner` since `finish` moves +/// out some fields, making it impossible to implement `Drop`. 
+/// +#[must_use] +pub struct DeltaLayerWriter { + inner: Option, +} + +impl DeltaLayerWriter { + /// + /// Start building a new delta layer. + /// + pub fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_id: TenantId, + key_start: Key, + lsn_range: Range, + ) -> anyhow::Result { + Ok(Self { + inner: Some(DeltaLayerWriterInner::new( + conf, + timeline_id, + tenant_id, + key_start, + lsn_range, + )?), + }) + } + + /// + /// Append a key-value pair to the file. + /// + /// The values must be appended in key, lsn order. + /// + pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { + self.inner.as_mut().unwrap().put_value(key, lsn, val) + } + + pub fn put_value_bytes( + &mut self, + key: Key, + lsn: Lsn, + val: &[u8], + will_init: bool, + ) -> anyhow::Result<()> { + self.inner + .as_mut() + .unwrap() + .put_value_bytes(key, lsn, val, will_init) + } + + pub fn size(&self) -> u64 { + self.inner.as_ref().unwrap().size() + } + + /// + /// Finish writing the delta layer. + /// + pub fn finish(mut self, key_end: Key) -> anyhow::Result { + self.inner.take().unwrap().finish(key_end) + } +} + +impl Drop for DeltaLayerWriter { + fn drop(&mut self) { + if let Some(inner) = self.inner.take() { + match inner.blob_writer.into_inner().into_inner() { + Ok(vfile) => vfile.remove(), + Err(err) => warn!( + "error while flushing buffer of image layer temporary file: {}", + err + ), + } + } + } +} + /// /// Iterator over all key-value pairse stored in a delta layer /// diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs index cbfa0134b0..8409d34bc9 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -411,7 +411,7 @@ impl ImageLayer { /// /// 3. Call `finish`. /// -pub struct ImageLayerWriter { +struct ImageLayerWriterInner { conf: &'static PageServerConf, path: PathBuf, timeline_id: TimelineId, @@ -423,14 +423,17 @@ pub struct ImageLayerWriter { tree: DiskBtreeBuilder, } -impl ImageLayerWriter { - pub fn new( +impl ImageLayerWriterInner { + /// + /// Start building a new image layer. + /// + fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_id: TenantId, key_range: &Range, lsn: Lsn, - ) -> anyhow::Result { + ) -> anyhow::Result { // Create the file initially with a temporary filename. // We'll atomically rename it to the final name when we're done. let path = ImageLayer::temp_path_for( @@ -455,7 +458,7 @@ impl ImageLayerWriter { let block_buf = BlockBuf::new(); let tree_builder = DiskBtreeBuilder::new(block_buf); - let writer = ImageLayerWriter { + let writer = Self { conf, path, timeline_id, @@ -474,7 +477,7 @@ impl ImageLayerWriter { /// /// The page versions must be appended in blknum order. /// - pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> { + fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); let off = self.blob_writer.write_blob(img)?; @@ -485,7 +488,10 @@ impl ImageLayerWriter { Ok(()) } - pub fn finish(self) -> anyhow::Result { + /// + /// Finish writing the image layer. + /// + fn finish(self) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -552,3 +558,76 @@ impl ImageLayerWriter { Ok(layer) } } + +/// A builder object for constructing a new image layer. +/// +/// Usage: +/// +/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...) +/// +/// 2. 
Write the contents by calling `put_page_image` for every key-value +/// pair in the key range. +/// +/// 3. Call `finish`. +/// +/// # Note +/// +/// As described in https://github.com/neondatabase/neon/issues/2650, it's +/// possible for the writer to drop before `finish` is actually called. So this +/// could lead to odd temporary files in the directory, exhausting file system. +/// This structure wraps `ImageLayerWriterInner` and also contains `Drop` +/// implementation that cleans up the temporary file in failure. It's not +/// possible to do this directly in `ImageLayerWriterInner` since `finish` moves +/// out some fields, making it impossible to implement `Drop`. +/// +#[must_use] +pub struct ImageLayerWriter { + inner: Option, +} + +impl ImageLayerWriter { + /// + /// Start building a new image layer. + /// + pub fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_id: TenantId, + key_range: &Range, + lsn: Lsn, + ) -> anyhow::Result { + Ok(Self { + inner: Some(ImageLayerWriterInner::new( + conf, + timeline_id, + tenant_id, + key_range, + lsn, + )?), + }) + } + + /// + /// Write next value to the file. + /// + /// The page versions must be appended in blknum order. + /// + pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { + self.inner.as_mut().unwrap().put_image(key, img) + } + + /// + /// Finish writing the image layer. + /// + pub fn finish(mut self) -> anyhow::Result { + self.inner.take().unwrap().finish() + } +} + +impl Drop for ImageLayerWriter { + fn drop(&mut self) { + if let Some(inner) = self.inner.take() { + inner.blob_writer.into_inner().remove(); + } + } +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6a96254df4..d63429ea6a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1541,6 +1541,10 @@ impl Timeline { lsn, )?; + fail_point!("image-layer-writer-fail-before-finish", |_| { + anyhow::bail!("failpoint image-layer-writer-fail-before-finish"); + }); + for range in &partition.ranges { let mut key = range.start; while key < range.end { @@ -1835,6 +1839,11 @@ impl Timeline { }, )?); } + + fail_point!("delta-layer-writer-fail-before-finish", |_| { + anyhow::bail!("failpoint delta-layer-writer-fail-before-finish"); + }); + writer.as_mut().unwrap().put_value(key, lsn, value)?; prev_key = Some(key); } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 896c2603a2..46e4acd50c 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -319,6 +319,12 @@ impl VirtualFile { Ok(result) } + + pub fn remove(self) { + let path = self.path.clone(); + drop(self); + std::fs::remove_file(path).expect("failed to remove the virtual file"); + } } impl Drop for VirtualFile { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 38a0db7cf7..e7e0e4ce56 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1628,6 +1628,8 @@ class NeonPageserver(PgProtocol): Initializes the repository via `neon init`. 
""" + TEMP_FILE_SUFFIX = "___temp" + def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional[str] = None): super().__init__(host="localhost", port=port.pg, user="cloud_admin") self.env = env diff --git a/test_runner/regress/test_layer_writers_fail.py b/test_runner/regress/test_layer_writers_fail.py new file mode 100644 index 0000000000..e8ba0e7d91 --- /dev/null +++ b/test_runner/regress/test_layer_writers_fail.py @@ -0,0 +1,92 @@ +import pytest +from fixtures.neon_fixtures import NeonEnv, NeonPageserver + + +@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2703") +def test_image_layer_writer_fail_before_finish(neon_simple_env: NeonEnv): + env = neon_simple_env + pageserver_http = env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + # small checkpoint distance to create more delta layer files + "checkpoint_distance": f"{1024 ** 2}", + # set the target size to be large to allow the image layer to cover the whole key space + "compaction_target_size": f"{1024 ** 3}", + # tweak the default settings to allow quickly create image layers and L1 layers + "compaction_period": "1 s", + "compaction_threshold": "2", + "image_creation_threshold": "1", + } + ) + + pg = env.postgres.create_start("main", tenant_id=tenant_id) + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""", + ] + ) + + pageserver_http.configure_failpoints(("image-layer-writer-fail-before-finish", "return")) + with pytest.raises(Exception, match="image-layer-writer-fail-before-finish"): + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + new_temp_layer_files = list( + filter( + lambda file: str(file).endswith(NeonPageserver.TEMP_FILE_SUFFIX), + [path for path in env.timeline_dir(tenant_id, timeline_id).iterdir()], + ) + ) + + assert ( + len(new_temp_layer_files) == 0 + ), "pageserver should clean its temporary new image layer files on failure" + + +@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2703") +def test_delta_layer_writer_fail_before_finish(neon_simple_env: NeonEnv): + env = neon_simple_env + pageserver_http = env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + # small checkpoint distance to create more delta layer files + "checkpoint_distance": f"{1024 ** 2}", + # set the target size to be large to allow the image layer to cover the whole key space + "compaction_target_size": f"{1024 ** 3}", + # tweak the default settings to allow quickly create image layers and L1 layers + "compaction_period": "1 s", + "compaction_threshold": "2", + "image_creation_threshold": "1", + } + ) + + pg = env.postgres.create_start("main", tenant_id=tenant_id) + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""", + ] + ) + + pageserver_http.configure_failpoints(("delta-layer-writer-fail-before-finish", "return")) + # Note: we cannot test whether the exception is exactly 'delta-layer-writer-fail-before-finish' + # since our code does it in loop, we cannot get this exact error for our request. 
+ with pytest.raises(Exception): + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + new_temp_layer_files = list( + filter( + lambda file: str(file).endswith(NeonPageserver.TEMP_FILE_SUFFIX), + [path for path in env.timeline_dir(tenant_id, timeline_id).iterdir()], + ) + ) + + assert ( + len(new_temp_layer_files) == 0 + ), "pageserver should clean its temporary new delta layer files on failure" From 0cbae6e8f32cfa177a812464e0b6121f84fd9740 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 27 Oct 2022 17:54:49 +0200 Subject: [PATCH 0964/1022] test_backward_compatibility: friendlier error message (#2707) --- test_runner/regress/test_compatibility.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 944ff64390..20a17e449d 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -19,6 +19,8 @@ from fixtures.neon_fixtures import ( from fixtures.types import Lsn from pytest import FixtureRequest +DEFAILT_LOCAL_SNAPSHOT_DIR = "test_output/test_prepare_snapshot/compatibility_snapshot_pg14" + def dump_differs(first: Path, second: Path, output: Path) -> bool: """ @@ -76,14 +78,18 @@ class PortReplacer(object): raise TypeError(f"unsupported type {type(value)} of {value=}") +@pytest.mark.order(after="test_prepare_snapshot") def test_backward_compatibility( pg_bin: PgBin, port_distributor: PortDistributor, test_output_dir: Path, request: FixtureRequest ): - compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR") - assert ( - compatibility_snapshot_dir_env is not None - ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_prepare_snapshot" - compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() + compatibility_snapshot_dir = Path( + os.environ.get("COMPATIBILITY_SNAPSHOT_DIR", DEFAILT_LOCAL_SNAPSHOT_DIR) + ) + assert compatibility_snapshot_dir.exists(), ( + f"{compatibility_snapshot_dir} doesn't exist. Please run `test_prepare_snapshot` test first " + "to create the snapshot or set COMPATIBILITY_SNAPSHOT_DIR env variable to the existing snapshot" + ) + compatibility_snapshot_dir = compatibility_snapshot_dir.resolve() # Make compatibility snapshot artifacts pickupable by Allure # by copying the snapshot directory to the curent test output directory. 
@@ -229,7 +235,6 @@ def test_backward_compatibility( assert not initial_dump_differs, "initial dump differs" -@pytest.mark.order(after="test_backward_compatibility") # Note: if renaming this test, don't forget to update a reference to it in a workflow file: # "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml def test_prepare_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path): From 128dc8d405783b160029af7b413fd24239c0de9f Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 27 Oct 2022 18:26:10 +0200 Subject: [PATCH 0965/1022] Nightly Benchmarks: fix workflow (#2708) --- .github/actions/run-python-test-set/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 07cb7edbe7..3459449e15 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -74,6 +74,7 @@ runs: run: ./scripts/pysync - name: Download compatibility snapshot for Postgres 14 + if: inputs.build_type != 'remote' uses: ./.github/actions/download with: name: compatibility-snapshot-${{ inputs.build_type }}-pg14 From d3c8749da5a72085612f5f05d5b2159bd16d9b49 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Thu, 27 Oct 2022 23:19:44 +0300 Subject: [PATCH 0966/1022] Build compute postgres with openssl support The main reason for that change is that Postgres 15 requires OpenSSL for `pgcrypto` to work. Also not a bad idea to have SSL-enabled Postgres in general. --- Dockerfile.compute-node-v14 | 4 ++-- Dockerfile.compute-node-v15 | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index 035dfc0d08..27e15593ad 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -13,7 +13,7 @@ ARG TAG=pinned FROM debian:bullseye-slim AS build-deps RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ - zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config + zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev ######################################################################################### # @@ -24,7 +24,7 @@ RUN apt update && \ FROM build-deps AS pg-build COPY vendor/postgres-v14 postgres RUN cd postgres && \ - ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \ + ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ # Install headers diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index 0b6e570b44..567848ffd7 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -13,7 +13,7 @@ ARG TAG=pinned FROM debian:bullseye-slim AS build-deps RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ - zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config + zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev ######################################################################################### # @@ -24,7 +24,7 @@ RUN apt update && \ FROM build-deps AS pg-build COPY vendor/postgres-v15 postgres RUN cd postgres && \ - ./configure CFLAGS='-O2 -g3' --enable-debug 
--with-uuid=ossp && \ + ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ # Install headers From e86a9105a4dce87128ff260c993086e1770f1d46 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Fri, 28 Oct 2022 13:17:27 +0300 Subject: [PATCH 0967/1022] Deploy storage to new prod regions (#2709) --- .../ansible/prod.ap-southeast-1.hosts.yaml | 35 ++++++++++++++ .github/ansible/prod.eu-central-1.hosts.yaml | 35 ++++++++++++++ .github/ansible/prod.us-east-2.hosts.yaml | 36 ++++++++++++++ .github/ansible/ssm_config | 1 - .github/ansible/staging.us-east-2.hosts.yaml | 1 + .github/workflows/build_and_test.yml | 48 +++++++++++++++++-- 6 files changed, 151 insertions(+), 5 deletions(-) create mode 100644 .github/ansible/prod.ap-southeast-1.hosts.yaml create mode 100644 .github/ansible/prod.eu-central-1.hosts.yaml create mode 100644 .github/ansible/prod.us-east-2.hosts.yaml diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml new file mode 100644 index 0000000000..bb4af91f71 --- /dev/null +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -0,0 +1,35 @@ +storage: + vars: + bucket_name: neon-prod-storage-ap-southeast-1 + bucket_region: ap-southeast-1 + console_mgmt_base_url: http://console-release.local + etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379 + pageserver_config_stub: + pg_distrib_dir: /usr/local + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "pageserver/v1" + safekeeper_s3_prefix: safekeeper/v1/wal + hostname_suffix: "" + remote_user: ssm-user + ansible_aws_ssm_region: ap-southeast-1 + ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1 + console_region_id: aws-ap-southeast-1 + + children: + pageservers: + hosts: + pageserver-0.ap-southeast-1.aws.neon.tech: + ansible_host: i-064de8ea28bdb495b + pageserver-1.ap-southeast-1.aws.neon.tech: + ansible_host: i-0b180defcaeeb6b93 + + safekeepers: + hosts: + safekeeper-0.ap-southeast-1.aws.neon.tech: + ansible_host: i-0d6f1dc5161eef894 + safekeeper-1.ap-southeast-1.aws.neon.tech: + ansible_host: i-0e338adda8eb2d19f + safekeeper-2.ap-southeast-1.aws.neon.tech: + ansible_host: i-04fb63634e4679eb9 diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml new file mode 100644 index 0000000000..68b1579746 --- /dev/null +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -0,0 +1,35 @@ +storage: + vars: + bucket_name: neon-prod-storage-eu-central-1 + bucket_region: eu-central-1 + console_mgmt_base_url: http://console-release.local + etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379 + pageserver_config_stub: + pg_distrib_dir: /usr/local + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "pageserver/v1" + safekeeper_s3_prefix: safekeeper/v1/wal + hostname_suffix: "" + remote_user: ssm-user + ansible_aws_ssm_region: eu-central-1 + ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1 + console_region_id: aws-eu-central-1 + + children: + pageservers: + hosts: + pageserver-0.eu-central-1.aws.neon.tech: + ansible_host: i-0cd8d316ecbb715be + pageserver-1.eu-central-1.aws.neon.tech: + ansible_host: i-090044ed3d383fef0 + + safekeepers: + hosts: + safekeeper-0.eu-central-1.aws.neon.tech: + ansible_host: i-0b238612d2318a050 + 
safekeeper-1.eu-central-1.aws.neon.tech: + ansible_host: i-07b9c45e5c2637cd4 + safekeeper-2.eu-central-1.aws.neon.tech: + ansible_host: i-020257302c3c93d88 diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml new file mode 100644 index 0000000000..1d54e2ef0a --- /dev/null +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -0,0 +1,36 @@ +storage: + vars: + bucket_name: neon-prod-storage-us-east-2 + bucket_region: us-east-2 + console_mgmt_base_url: http://console-release.local + etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379 + pageserver_config_stub: + pg_distrib_dir: /usr/local + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "pageserver/v1" + safekeeper_s3_prefix: safekeeper/v1/wal + hostname_suffix: "" + remote_user: ssm-user + ansible_aws_ssm_region: us-east-2 + ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2 + console_region_id: aws-us-east-2 + + children: + pageservers: + hosts: + pageserver-0.us-east-2.aws.neon.tech: + ansible_host: i-062227ba7f119eb8c + pageserver-1.us-east-2.aws.neon.tech: + ansible_host: i-0b3ec0afab5968938 + + safekeepers: + hosts: + safekeeper-0.us-east-2.aws.neon.tech: + ansible_host: i-0e94224750c57d346 + safekeeper-1.us-east-2.aws.neon.tech: + ansible_host: i-06d113fb73bfddeb0 + safekeeper-2.us-east-2.aws.neon.tech: + ansible_host: i-09f66c8e04afff2e8 + diff --git a/.github/ansible/ssm_config b/.github/ansible/ssm_config index 94958b4490..0dc67507f2 100644 --- a/.github/ansible/ssm_config +++ b/.github/ansible/ssm_config @@ -1,3 +1,2 @@ ansible_connection: aws_ssm -ansible_aws_ssm_bucket_name: neon-dev-bucket ansible_python_interpreter: /usr/bin/python3 diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index db3ed87c45..3bbf5fe8cb 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -14,6 +14,7 @@ storage: hostname_suffix: "" remote_user: ssm-user ansible_aws_ssm_region: us-east-2 + ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2 console_region_id: aws-us-east-2 children: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8d16e406ce..7133574a0f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -756,9 +756,9 @@ jobs: defaults: run: shell: bash - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + strategy: + matrix: + target_region: [ us-east-2 ] steps: - name: Checkout uses: actions/checkout@v3 @@ -781,7 +781,47 @@ jobs: fi ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}} + ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}} + rm -f neon_install.tar.gz .neon_current_version + + deploy-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. + # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + if: | + (github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + strategy: + matrix: + target_region: [ us-east-2, eu-central-1, ap-southeast-1 ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Redeploy + run: | + export DOCKER_TAG=${{needs.tag.outputs.build-tag}} + cd "$(pwd)/.github/ansible" + + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + ./get_binaries.sh + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + RELEASE=true ./get_binaries.sh + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}} rm -f neon_install.tar.gz .neon_current_version deploy-proxy: From 59a3ca4ec60ee4dbfbe31dd589ca84055b65fc7c Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Fri, 28 Oct 2022 16:25:28 +0300 Subject: [PATCH 0968/1022] Deploy proxy to new prod regions (#2713) * Refactor proxy deploy * Test new prod deploy * Remove assume role * Add new values * Add all regions --- ...-southeast-1-epsilon.neon-proxy-scram.yaml | 31 +++++++++++ ...d-eu-central-1-gamma.neon-proxy-scram.yaml | 31 +++++++++++ ...prod-us-east-2-delta.neon-proxy-scram.yaml | 31 +++++++++++ .github/workflows/build_and_test.yml | 55 +++++++++++++++++-- 4 files changed, 142 insertions(+), 6 deletions(-) create mode 100644 .github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml create mode 100644 .github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml create mode 100644 .github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml new file mode 100644 index 0000000000..f90f89a516 --- /dev/null +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -0,0 +1,31 @@ +# Helm chart values for neon-proxy-scram. +# This is a YAML-formatted file. + +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "http://console-release.local/management/api/v2" + domain: "*.ap-southeast-1.aws.neon.tech" + +# -- Additional labels for neon-proxy pods +podLabels: + zenith_service: proxy-scram + zenith_env: prod + zenith_region: ap-southeast-1 + zenith_region_slug: ap-southeast-1 + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech + +#metrics: +# enabled: true +# serviceMonitor: +# enabled: true +# selector: +# release: kube-prometheus-stack diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml new file mode 100644 index 0000000000..33a1154099 --- /dev/null +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -0,0 +1,31 @@ +# Helm chart values for neon-proxy-scram. +# This is a YAML-formatted file. 
+ +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "http://console-release.local/management/api/v2" + domain: "*.eu-central-1.aws.neon.tech" + +# -- Additional labels for neon-proxy pods +podLabels: + zenith_service: proxy-scram + zenith_env: prod + zenith_region: eu-central-1 + zenith_region_slug: eu-central-1 + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech + +#metrics: +# enabled: true +# serviceMonitor: +# enabled: true +# selector: +# release: kube-prometheus-stack diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml new file mode 100644 index 0000000000..5f9f2d2e66 --- /dev/null +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -0,0 +1,31 @@ +# Helm chart values for neon-proxy-scram. +# This is a YAML-formatted file. + +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "http://console-release.local/management/api/v2" + domain: "*.us-east-2.aws.neon.tech" + +# -- Additional labels for neon-proxy pods +podLabels: + zenith_service: proxy-scram + zenith_env: prod + zenith_region: us-east-2 + zenith_region_slug: us-east-2 + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech + +#metrics: +# enabled: true +# serviceMonitor: +# enabled: true +# selector: +# release: kube-prometheus-stack diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7133574a0f..abca7f7701 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -870,13 +870,18 @@ jobs: runs-on: dev container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' +# needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] +# if: | +# (github.ref_name == 'main') && +# github.event_name != 'workflow_dispatch' defaults: run: shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta steps: - name: Checkout uses: actions/checkout@v3 @@ -887,12 +892,50 @@ jobs: - name: Configure environment run: | helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region us-east-2 eks update-kubeconfig --name dev-us-east-2-beta --role-arn arn:aws:iam::369495373322:role/github-runner + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy proxy + run: | + # DOCKER_TAG=${{needs.tag.outputs.build-tag}} + DOCKER_TAG=2257 + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + + deploy-proxy-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + if: | + (github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - name: Re-deploy proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s promote-compatibility-test-snapshot: runs-on: dev From 1eb9bd052ad4217e59f08d4c1f9c8f08189ac9a4 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 28 Oct 2022 16:39:11 +0400 Subject: [PATCH 0969/1022] Bump vendor/postgres-v15 to fix XLP_FIRST_IS_CONTRECORD issue. 
ref https://github.com/neondatabase/cloud/issues/2688 --- vendor/postgres-v15 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index f7c5269e9c..64558b386b 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit f7c5269e9c7e818653ad6fe95ba072d1901c4497 +Subproject commit 64558b386bcd5a3300163ec7ea5d7f31cef8593c From 7481fb082c9fa7b0007e5db755496abbaab98f6c Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Fri, 28 Oct 2022 17:12:49 +0300 Subject: [PATCH 0970/1022] Fix bugs in #2713 (#2716) --- .github/workflows/build_and_test.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index abca7f7701..91d9561e7d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -870,10 +870,10 @@ jobs: runs-on: dev container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. -# needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] -# if: | -# (github.ref_name == 'main') && -# github.event_name != 'workflow_dispatch' + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + if: | + (github.ref_name == 'main') && + github.event_name != 'workflow_dispatch' defaults: run: shell: bash @@ -896,8 +896,7 @@ jobs: - name: Re-deploy proxy run: | - # DOCKER_TAG=${{needs.tag.outputs.build-tag}} - DOCKER_TAG=2257 + DOCKER_TAG=${{needs.tag.outputs.build-tag}} helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s deploy-proxy-prod-new: From 596d622a82c3b118873ebc2fc511407c58b56676 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 28 Oct 2022 21:44:47 +0400 Subject: [PATCH 0971/1022] Fix test_prepare_snapshot. It should checkpoint pageserver after waiting for all data arrival, not before. --- test_runner/regress/test_compatibility.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 20a17e449d..0487cd8f2c 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -259,8 +259,8 @@ def test_prepare_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_ pageserver_http = env.pageserver.http_client() lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) env.postgres.stop_all() From 22cc8760b9a129dac40d210f2b4b970641e28c36 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 31 Oct 2022 01:11:50 +0100 Subject: [PATCH 0972/1022] Move walredo process code under pgxn in the main 'neon' repository. - Refactor the way the WalProposerMain function is called when started with --sync-safekeepers. The postgres binary now explicitly loads the 'neon.so' library and calls the WalProposerMain in it. This is simpler than the global function callback "hook" we previously used. 
- Move the WAL redo process code to a new library, neon_walredo.so, and use the same mechanism as for --sync-safekeepers to call the WalRedoMain function, when launched with --walredo argument. - Also move the seccomp code to neon_walredo.so library. I kept the configure check in the postgres side for now, though. --- Makefile | 10 + docs/sourcetree.md | 4 + pageserver/src/walredo.rs | 17 +- pgxn/neon/Makefile | 1 - pgxn/neon/libpagestore.c | 17 +- pgxn/neon/pagestore_client.h | 27 - pgxn/neon/pagestore_smgr.c | 1 - pgxn/neon/walproposer.c | 63 +- pgxn/neon_walredo/Makefile | 22 + pgxn/{neon => neon_walredo}/inmem_smgr.c | 63 +- pgxn/neon_walredo/inmem_smgr.h | 17 + pgxn/neon_walredo/neon_seccomp.h | 22 + pgxn/neon_walredo/seccomp.c | 257 +++++++ pgxn/neon_walredo/walredoproc.c | 847 +++++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 16 files changed, 1288 insertions(+), 84 deletions(-) create mode 100644 pgxn/neon_walredo/Makefile rename pgxn/{neon => neon_walredo}/inmem_smgr.c (81%) create mode 100644 pgxn/neon_walredo/inmem_smgr.h create mode 100644 pgxn/neon_walredo/neon_seccomp.h create mode 100644 pgxn/neon_walredo/seccomp.c create mode 100644 pgxn/neon_walredo/walredoproc.c diff --git a/Makefile b/Makefile index 738a45fd5e..6e8b659171 100644 --- a/Makefile +++ b/Makefile @@ -151,6 +151,11 @@ neon-pg-ext-v14: postgres-v14 (cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \ $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) + +@echo "Compiling neon_walredo v14" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install) +@echo "Compiling neon_test_utils" v14 mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \ @@ -163,6 +168,11 @@ neon-pg-ext-v15: postgres-v15 (cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \ $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) + +@echo "Compiling neon_walredo v15" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install) +@echo "Compiling neon_test_utils" v15 mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \ diff --git a/docs/sourcetree.md b/docs/sourcetree.md index c468134b81..4b4efcecd7 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -52,6 +52,10 @@ PostgreSQL extension that implements storage manager API and network communicati PostgreSQL extension that contains functions needed for testing and debugging. +`/pgxn/neon_walredo`: + +Library to run Postgres as a "WAL redo process" in the pageserver. + `/safekeeper`: The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver. diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 1cde11082e..39dccf2eba 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -10,7 +10,7 @@ //! process. Then we get the page image back. Communication with the //! 
postgres process happens via stdin/stdout //! -//! See src/backend/tcop/zenith_wal_redo.c for the other side of +//! See pgxn/neon_walredo/walredoproc.c for the other side of //! this communication. //! //! The Postgres process is assumed to be secure against malicious WAL @@ -644,14 +644,12 @@ impl PostgresRedoProcess { ), )); } else { - // Limit shared cache for wal-redo-postres + // Limit shared cache for wal-redo-postgres let mut config = OpenOptions::new() .append(true) .open(PathBuf::from(&datadir).join("postgresql.conf"))?; config.write_all(b"shared_buffers=128kB\n")?; config.write_all(b"fsync=off\n")?; - config.write_all(b"shared_preload_libraries=neon\n")?; - config.write_all(b"neon.wal_redo=on\n")?; } // Start postgres itself @@ -664,10 +662,11 @@ impl PostgresRedoProcess { .env("LD_LIBRARY_PATH", &pg_lib_dir_path) .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) .env("PGDATA", &datadir) - // The redo process is not trusted, so it runs in seccomp mode - // (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't - // inherit any file descriptors from the pageserver that would allow - // an attacker to do bad things. + // The redo process is not trusted, and runs in seccomp mode that + // doesn't allow it to open any files. We have to also make sure it + // doesn't inherit any file descriptors from the pageserver, that + // would allow an attacker to read any files that happen to be open + // in the pageserver. // // The Rust standard library makes sure to mark any file descriptors with // as close-on-exec by default, but that's not enough, since we use @@ -844,7 +843,7 @@ impl PostgresRedoProcess { } // Functions for constructing messages to send to the postgres WAL redo -// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for +// process. See pgxn/neon_walredo/walredoproc.c for // explanation of the protocol. 
fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec) { diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index a6ce611974..7f4e30a12e 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -4,7 +4,6 @@ MODULE_big = neon OBJS = \ $(WIN32RES) \ - inmem_smgr.o \ libpagestore.o \ libpqwalproposer.o \ pagestore_smgr.o \ diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 9cd2a86941..d3c2bc063f 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -419,15 +419,6 @@ pg_init_libpagestore(void) 0, /* no flags required */ check_neon_id, NULL, NULL); - DefineCustomBoolVariable("neon.wal_redo", - "start in wal-redo mode", - NULL, - &wal_redo, - false, - PGC_POSTMASTER, - 0, - NULL, NULL, NULL); - DefineCustomIntVariable("neon.max_cluster_size", "cluster size limit", NULL, @@ -452,13 +443,7 @@ pg_init_libpagestore(void) neon_timeline_walproposer = neon_timeline; neon_tenant_walproposer = neon_tenant; - if (wal_redo) - { - neon_log(PageStoreTrace, "set inmem_smgr hook"); - smgr_hook = smgr_inmem; - smgr_init_hook = smgr_init_inmem; - } - else if (page_server_connstring && page_server_connstring[0]) + if (page_server_connstring && page_server_connstring[0]) { neon_log(PageStoreTrace, "set neon_smgr hook"); smgr_hook = smgr_neon; diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index e0cda11b63..4a4e60b707 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -155,10 +155,6 @@ extern int32 max_cluster_size; extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode); extern void smgr_init_neon(void); -extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); -extern void smgr_init_inmem(void); -extern void smgr_shutdown_inmem(void); - /* Neon storage manager functionality */ extern void neon_init(void); @@ -188,29 +184,6 @@ extern void neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum); -/* neon wal-redo storage manager functionality */ - -extern void inmem_init(void); -extern void inmem_open(SMgrRelation reln); -extern void inmem_close(SMgrRelation reln, ForkNumber forknum); -extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); -extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum); -extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); -extern void inmem_extend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); -extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); -extern void inmem_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); -extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); -extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); -extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); - /* utils for neon relsize cache */ extern void relsize_hash_init(void); extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 1187550f2a..927c8f1fc1 100644 --- a/pgxn/neon/pagestore_smgr.c +++ 
b/pgxn/neon/pagestore_smgr.c @@ -99,7 +99,6 @@ char *page_server_connstring; /*with substituted password*/ char *neon_timeline; char *neon_tenant; -bool wal_redo = false; int32 max_cluster_size; /* unlogged relation build states */ diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 29290fa736..c78c79a9bb 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -43,6 +43,7 @@ #if PG_VERSION_NUM >= 150000 #include "access/xlogrecovery.h" #endif +#include "storage/fd.h" #include "storage/latch.h" #include "miscadmin.h" #include "pgstat.h" @@ -69,7 +70,8 @@ #include "neon.h" #include "walproposer.h" #include "walproposer_utils.h" -#include "replication/walpropshim.h" + +static bool syncSafekeepers = false; char *wal_acceptors_list; int wal_acceptor_reconnect_timeout; @@ -117,8 +119,8 @@ static TimestampTz last_reconnect_attempt; static WalproposerShmemState * walprop_shared; /* Prototypes for private functions */ -static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId); -static void WalProposerStartImpl(void); +static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId); +static void WalProposerStart(void); static void WalProposerLoop(void); static void InitEventSet(void); static void UpdateEventSet(Safekeeper *sk, uint32 events); @@ -186,9 +188,56 @@ pg_init_walproposer(void) ProcessInterruptsCallback = backpressure_throttling_impl; WalProposerRegister(); +} - WalProposerInit = &WalProposerInitImpl; - WalProposerStart = &WalProposerStartImpl; +/* + * Entry point for `postgres --sync-safekeepers`. + */ +void +WalProposerSync(int argc, char *argv[]) +{ + struct stat stat_buf; + + syncSafekeepers = true; +#if PG_VERSION_NUM < 150000 + ThisTimeLineID = 1; +#endif + + /* + * Initialize postmaster_alive_fds as WaitEventSet checks them. + * + * Copied from InitPostmasterDeathWatchHandle() + */ + if (pipe(postmaster_alive_fds) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not create pipe to monitor postmaster death: %m"))); + if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); + + ChangeToDataDir(); + + /* Create pg_wal directory, if it doesn't exist */ + if (stat(XLOGDIR, &stat_buf) != 0) + { + ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR))); + if (MakePGDirectory(XLOGDIR) < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + XLOGDIR))); + exit(1); + } + } + + WalProposerInit(0, 0); + + BackgroundWorkerUnblockSignals(); + + WalProposerStart(); } static void @@ -429,7 +478,7 @@ WalProposerRegister(void) } static void -WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) +WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) { char *host; char *sep; @@ -508,7 +557,7 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) } static void -WalProposerStartImpl(void) +WalProposerStart(void) { /* Initiate connections to all safekeeper nodes */ diff --git a/pgxn/neon_walredo/Makefile b/pgxn/neon_walredo/Makefile new file mode 100644 index 0000000000..495527c89b --- /dev/null +++ b/pgxn/neon_walredo/Makefile @@ -0,0 +1,22 @@ +# pgxs/neon_walredo/Makefile + +MODULE_big = neon_walredo +OBJS = \ + $(WIN32RES) \ + inmem_smgr.o \ + walredoproc.o \ + +# This really should be guarded by $(with_libseccomp), but I couldn't +# make that work with pgxs. 
So we always compile it, but its contents +# are wrapped in #ifdef HAVE_LIBSECCOMP instead. +OBJS += seccomp.o + +PGFILEDESC = "neon_walredo - helper process that runs in Neon pageserver" + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) + +ifeq ($(with_libseccomp),yes) +SHLIB_LINK += -lseccomp +endif diff --git a/pgxn/neon/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c similarity index 81% rename from pgxn/neon/inmem_smgr.c rename to pgxn/neon_walredo/inmem_smgr.c index bc0ee352b8..2219543628 100644 --- a/pgxn/neon/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -3,9 +3,8 @@ * inmem_smgr.c * * This is an implementation of the SMGR interface, used in the WAL redo - * process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent - * storage, the pages that are written out are kept in a small number of - * in-memory buffers. + * process. It has no persistent storage, the pages that are written out + * are kept in a small number of in-memory buffers. * * Normally, replaying a WAL record only needs to access a handful of * buffers, which fit in the normal buffer cache, so this is just for @@ -15,15 +14,11 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * IDENTIFICATION - * contrib/neon/inmem_smgr.c - * *------------------------------------------------------------------------- */ #include "postgres.h" #include "access/xlog.h" -#include "pagestore_client.h" #include "storage/block.h" #include "storage/buf_internals.h" #include "storage/relfilenode.h" @@ -33,6 +28,8 @@ #include "access/xlogutils.h" #endif +#include "inmem_smgr.h" + /* Size of the in-memory smgr */ #define MAX_PAGES 64 @@ -59,10 +56,34 @@ locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno) return -1; } + +/* neon wal-redo storage manager functionality */ +static void inmem_init(void); +static void inmem_open(SMgrRelation reln); +static void inmem_close(SMgrRelation reln, ForkNumber forknum); +static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +static bool inmem_exists(SMgrRelation reln, ForkNumber forknum); +static void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +static void inmem_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); +static void inmem_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +static void inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); +static void inmem_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); + + /* * inmem_init() -- Initialize private state */ -void +static void inmem_init(void) { used_pages = 0; @@ -71,7 +92,7 @@ inmem_init(void) /* * inmem_exists() -- Does the physical file exist? */ -bool +static bool inmem_exists(SMgrRelation reln, ForkNumber forknum) { for (int i = 0; i < used_pages; i++) @@ -90,7 +111,7 @@ inmem_exists(SMgrRelation reln, ForkNumber forknum) * * If isRedo is true, it's okay for the relation to exist already. 
*/ -void +static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) { } @@ -98,7 +119,7 @@ inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) /* * inmem_unlink() -- Unlink a relation. */ -void +static void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) { } @@ -112,7 +133,7 @@ inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) * EOF). Note that we assume writing a block beyond current EOF * causes intervening file space to become filled with zeroes. */ -void +static void inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, char *buffer, bool skipFsync) { @@ -123,7 +144,7 @@ inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, /* * inmem_open() -- Initialize newly-opened relation. */ -void +static void inmem_open(SMgrRelation reln) { } @@ -131,7 +152,7 @@ inmem_open(SMgrRelation reln) /* * inmem_close() -- Close the specified relation, if it isn't closed already. */ -void +static void inmem_close(SMgrRelation reln, ForkNumber forknum) { } @@ -139,7 +160,7 @@ inmem_close(SMgrRelation reln, ForkNumber forknum) /* * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation */ -bool +static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { return true; @@ -148,7 +169,7 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) /* * inmem_writeback() -- Tell the kernel to write pages back to storage. */ -void +static void inmem_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { @@ -157,7 +178,7 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum, /* * inmem_read() -- Read the specified block from a relation. */ -void +static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, char *buffer) { @@ -177,7 +198,7 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, * relation (ie, those before the current EOF). To extend a relation, * use mdextend(). */ -void +static void inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { @@ -224,7 +245,7 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, /* * inmem_nblocks() -- Get the number of blocks stored in a relation. */ -BlockNumber +static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum) { /* @@ -243,7 +264,7 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum) /* * inmem_truncate() -- Truncate relation to specified number of blocks. */ -void +static void inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { } @@ -251,7 +272,7 @@ inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) /* * inmem_immedsync() -- Immediately sync a relation to stable storage. 
*/ -void +static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum) { } diff --git a/pgxn/neon_walredo/inmem_smgr.h b/pgxn/neon_walredo/inmem_smgr.h new file mode 100644 index 0000000000..af7c3fe6cc --- /dev/null +++ b/pgxn/neon_walredo/inmem_smgr.h @@ -0,0 +1,17 @@ +/*------------------------------------------------------------------------- + * + * inmem_smgr.h + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#ifndef INMEM_SMGR_H +#define INMEM_SMGR_H + +extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); +extern void smgr_init_inmem(void); + +#endif /* INMEM_SMGR_H */ diff --git a/pgxn/neon_walredo/neon_seccomp.h b/pgxn/neon_walredo/neon_seccomp.h new file mode 100644 index 0000000000..ea92d38a77 --- /dev/null +++ b/pgxn/neon_walredo/neon_seccomp.h @@ -0,0 +1,22 @@ +#ifndef NEON_SECCOMP_H +#define NEON_SECCOMP_H + +#include + +typedef struct { + int psr_syscall; /* syscall number */ + uint32 psr_action; /* libseccomp action, e.g. SCMP_ACT_ALLOW */ +} PgSeccompRule; + +#define PG_SCMP(syscall, action) \ + (PgSeccompRule) { \ + .psr_syscall = SCMP_SYS(syscall), \ + .psr_action = (action), \ + } + +#define PG_SCMP_ALLOW(syscall) \ + PG_SCMP(syscall, SCMP_ACT_ALLOW) + +extern void seccomp_load_rules(PgSeccompRule *syscalls, int count); + +#endif /* NEON_SECCOMP_H */ diff --git a/pgxn/neon_walredo/seccomp.c b/pgxn/neon_walredo/seccomp.c new file mode 100644 index 0000000000..5d5ba549ef --- /dev/null +++ b/pgxn/neon_walredo/seccomp.c @@ -0,0 +1,257 @@ +/*------------------------------------------------------------------------- + * + * seccomp.c + * Secure Computing BPF API wrapper. + * + * Pageserver delegates complex WAL decoding duties to postgres, + * which means that the latter might fall victim to carefully designed + * malicious WAL records and start doing harmful things to the system. + * To prevent this, it has been decided to limit possible interactions + * with the outside world using the Secure Computing BPF mode. + * + * We use this mode to disable all syscalls not in the allowlist. This + * approach has its pros & cons: + * + * - We have to carefully handpick and maintain the set of syscalls + * required for the WAL redo process. Core dumps help with that. + * The method of trial and error seems to work reasonably well, + * but it would be nice to find a proper way to "prove" that + * the set in question is both necessary and sufficient. + * + * - Once we enter the seccomp bpf mode, it's impossible to lift those + * restrictions (otherwise, what kind of "protection" would that be?). + * Thus, we have to either enable extra syscalls for the clean shutdown, + * or exit the process immediately via _exit() instead of proc_exit(). + * + * - Should we simply use SCMP_ACT_KILL_PROCESS, or implement a custom + * facility to deal with the forbidden syscalls? If we'd like to embed + * a startup security test, we should go with the latter; In that + * case, which one of the following options is preferable? + * + * * Catch the denied syscalls with a signal handler using SCMP_ACT_TRAP. + * Provide a common signal handler with a static switch to override + * its behavior for the test case. This would undermine the whole + * purpose of such protection, so we'd have to go further and remap + * the memory backing the switch as readonly, then ban mprotect(). 
+ * Ugly and fragile, to say the least. + * + * * Yet again, catch the denied syscalls using SCMP_ACT_TRAP. + * Provide 2 different signal handlers: one for a test case, + * another for the main processing loop. Install the first one, + * enable seccomp, perform the test, switch to the second one, + * finally ban sigaction(), presto! + * + * * Spoof the result of a syscall using SECCOMP_RET_ERRNO for the + * test, then ban it altogether with another filter. The downside + * of this solution is that we don't actually check that + * SCMP_ACT_KILL_PROCESS/SCMP_ACT_TRAP works. + * + * Either approach seems to require two eBPF filter programs, + * which is unfortunate: the man page tells this is uncommon. + * Maybe I (@funbringer) am missing something, though; I encourage + * any reader to get familiar with it and scrutinize my conclusions. + * + * TODOs and ideas in no particular order: + * + * - Do something about mmap() in musl's malloc(). + * Definitely not a priority if we don't care about musl. + * + * - See if we can untangle PG's shutdown sequence (involving unlink()): + * + * * Simplify (or rather get rid of) shmem setup in PG's WAL redo mode. + * * Investigate chroot() or mount namespaces for better FS isolation. + * * (Per Heikki) Simply call _exit(), no big deal. + * * Come up with a better idea? + * + * - Make use of seccomp's argument inspection (for what?). + * Unfortunately, it views all syscall arguments as scalars, + * so it won't work for e.g. string comparison in unlink(). + * + * - Benchmark with bpf jit on/off, try seccomp_syscall_priority(). + * + * - Test against various linux distros & glibc versions. + * I suspect that certain libc functions might involve slightly + * different syscalls, e.g. select/pselect6/pselect6_time64/whatever. + * + * - Test on any arch other than amd64 to see if it works there. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +/* + * I couldn't find a good way to do a conditional OBJS += seccomp.o in + * the Makefile, so this file is compiled even when seccomp is disabled, + * it's just empty in that case. + */ +#ifdef HAVE_LIBSECCOMP + +#include +#include + +#include "miscadmin.h" + +#include "neon_seccomp.h" + +static void die(int code, const char *str); + +static bool seccomp_test_sighandler_done = false; +static void seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt); +static void seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt); + +static int do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action); + +void +seccomp_load_rules(PgSeccompRule *rules, int count) +{ + struct sigaction action = { .sa_flags = SA_SIGINFO }; + PgSeccompRule rule; + long fd; + + /* + * Install a test signal handler. + * XXX: pqsignal() is too restrictive for our purposes, + * since we'd like to examine the contents of siginfo_t. + */ + action.sa_sigaction = seccomp_test_sighandler; + if (sigaction(SIGSYS, &action, NULL) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not install test SIGSYS handler"))); + + /* + * First, check that open of a well-known file works. + * XXX: We use raw syscall() to call the very open(). 
+ */ + fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + if (seccomp_test_sighandler_done) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: signal handler test flag was set unexpectedly"))); + if (fd < 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not open /dev/null for seccomp testing: %m"))); + close((int) fd); + + /* Set a trap on open() to test seccomp bpf */ + rule = PG_SCMP(open, SCMP_ACT_TRAP); + if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not load test trap"))); + + /* Finally, check that open() now raises SIGSYS */ + (void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + if (!seccomp_test_sighandler_done) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: SIGSYS handler doesn't seem to work"))); + + /* Now that everything seems to work, install a proper handler */ + action.sa_sigaction = seccomp_deny_sighandler; + if (sigaction(SIGSYS, &action, NULL) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not install SIGSYS handler"))); + + /* If this succeeds, any syscall not in the list will crash the process */ + if (do_seccomp_load_rules(rules, count, SCMP_ACT_TRAP) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not enter seccomp mode"))); +} + +/* + * Enter seccomp mode with a BPF filter that will only allow + * certain syscalls to proceed. + */ +static int +do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action) +{ + scmp_filter_ctx ctx; + int rc = -1; + + /* Create a context with a default action for syscalls not in the list */ + if ((ctx = seccomp_init(def_action)) == NULL) + goto cleanup; + + for (int i = 0; i < count; i++) + { + PgSeccompRule *rule = &rules[i]; + if ((rc = seccomp_rule_add(ctx, rule->psr_action, rule->psr_syscall, 0)) != 0) + goto cleanup; + } + + /* Try building & loading the program into the kernel */ + if ((rc = seccomp_load(ctx)) != 0) + goto cleanup; + +cleanup: + /* + * We don't need the context anymore regardless of the result, + * since either we failed or the eBPF program has already been + * loaded into the linux kernel. + */ + seccomp_release(ctx); + return rc; +} + +static void +die(int code, const char *str) +{ + /* work around gcc ignoring that it shouldn't warn on (void) result being unused */ + ssize_t _unused pg_attribute_unused(); + /* Best effort write to stderr */ + _unused = write(fileno(stderr), str, strlen(str)); + + /* XXX: we don't want to run any atexit callbacks */ + _exit(code); +} + +static void +seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) +{ +#define DIE_PREFIX "seccomp test signal handler: " + + /* Check that this signal handler is used only for a single test case */ + if (seccomp_test_sighandler_done) + die(1, DIE_PREFIX "test handler should only be used for 1 test\n"); + seccomp_test_sighandler_done = true; + + if (signum != SIGSYS) + die(1, DIE_PREFIX "bad signal number\n"); + + /* TODO: maybe somehow extract the hardcoded syscall number */ + if (info->si_syscall != SCMP_SYS(open)) + die(1, DIE_PREFIX "bad syscall number\n"); + +#undef DIE_PREFIX +} + +static void +seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) +{ + /* + * Unfortunately, we can't use seccomp_syscall_resolve_num_arch() + * to resolve the syscall's name, since it calls strdup() + * under the hood (wtf!). 
+ */ + char buffer[128]; + (void)snprintf(buffer, lengthof(buffer), + "---------------------------------------\n" + "seccomp: bad syscall %d\n" + "---------------------------------------\n", + info->si_syscall); + + /* + * Instead of silently crashing the process with + * a fake SIGSYS caused by SCMP_ACT_KILL_PROCESS, + * we'd like to receive a real SIGSYS to print the + * message and *then* immediately exit. + */ + die(1, buffer); +} + +#endif /* HAVE_LIBSECCOMP */ diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c new file mode 100644 index 0000000000..ffbfca5a40 --- /dev/null +++ b/pgxn/neon_walredo/walredoproc.c @@ -0,0 +1,847 @@ +/*------------------------------------------------------------------------- + * + * walredoproc.c + * Entry point for WAL redo helper + * + * + * This file contains an alternative main() function for the 'postgres' + * binary. In the special mode, we go into a special mode that's similar + * to the single user mode. We don't launch postmaster or any auxiliary + * processes. Instead, we wait for command from 'stdin', and respond to + * 'stdout'. + * + * The protocol through stdin/stdout is loosely based on the libpq protocol. + * The process accepts messages through stdin, and each message has the format: + * + * char msgtype; + * int32 length; // length of message including 'length' but excluding + * // 'msgtype', in network byte order + * + * + * There are three message types: + * + * BeginRedoForBlock ('B'): Prepare for WAL replay for given block + * PushPage ('P'): Copy a page image (in the payload) to buffer cache + * ApplyRecord ('A'): Apply a WAL record (in the payload) + * GetPage ('G'): Return a page image from buffer cache. + * + * Currently, you only get a response to GetPage requests; the response is + * simply a 8k page, without any headers. Errors are logged to stderr. + * + * FIXME: + * - this currently requires a valid PGDATA, and creates a lock file there + * like a normal postmaster. There's no fundamental reason for that, though. + * - should have EndRedoForBlock, and flush page cache, to allow using this + * mechanism for more than one block without restarting the process. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include +#include +#include +#include +#ifdef HAVE_SYS_SELECT_H +#include +#endif +#ifdef HAVE_SYS_RESOURCE_H +#include +#include +#endif + +#if defined(HAVE_LIBSECCOMP) && defined(__GLIBC__) +#define MALLOC_NO_MMAP +#include +#endif + +#ifndef HAVE_GETRUSAGE +#include "rusagestub.h" +#endif + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogrecovery.h" +#endif +#include "access/xlogutils.h" +#include "catalog/pg_class.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "postmaster/postmaster.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" + +#include "inmem_smgr.h" + +#ifdef HAVE_LIBSECCOMP +#include "neon_seccomp.h" +#endif + +PG_MODULE_MAGIC; + +static int ReadRedoCommand(StringInfo inBuf); +static void BeginRedoForBlock(StringInfo input_message); +static void PushPage(StringInfo input_message); +static void ApplyRecord(StringInfo input_message); +static void apply_error_callback(void *arg); +static bool redo_block_filter(XLogReaderState *record, uint8 block_id); +static void GetPage(StringInfo input_message); +static ssize_t buffered_read(void *buf, size_t count); + +static BufferTag target_redo_tag; + +static XLogReaderState *reader_state; + +#define TRACE DEBUG5 + +#ifdef HAVE_LIBSECCOMP +static void +enter_seccomp_mode(void) +{ + PgSeccompRule syscalls[] = + { + /* Hard requirements */ + PG_SCMP_ALLOW(exit_group), + PG_SCMP_ALLOW(pselect6), + PG_SCMP_ALLOW(read), + PG_SCMP_ALLOW(select), + PG_SCMP_ALLOW(write), + + /* Memory allocation */ + PG_SCMP_ALLOW(brk), +#ifndef MALLOC_NO_MMAP + /* TODO: musl doesn't have mallopt */ + PG_SCMP_ALLOW(mmap), + PG_SCMP_ALLOW(munmap), +#endif + /* + * getpid() is called on assertion failure, in ExceptionalCondition. + * It's not really needed, but seems pointless to hide it either. The + * system call unlikely to expose a kernel vulnerability, and the PID + * is stored in MyProcPid anyway. + */ + PG_SCMP_ALLOW(getpid), + + /* Enable those for a proper shutdown. + PG_SCMP_ALLOW(munmap), + PG_SCMP_ALLOW(shmctl), + PG_SCMP_ALLOW(shmdt), + PG_SCMP_ALLOW(unlink), // shm_unlink + */ + }; + +#ifdef MALLOC_NO_MMAP + /* Ask glibc not to use mmap() */ + mallopt(M_MMAP_MAX, 0); +#endif + + seccomp_load_rules(syscalls, lengthof(syscalls)); +} +#endif /* HAVE_LIBSECCOMP */ + +/* + * Entry point for the WAL redo process. + * + * Performs similar initialization as PostgresMain does for normal + * backend processes. Some initialization was done in CallExtMain + * already. + */ +void +WalRedoMain(int argc, char *argv[]) +{ + int firstchar; + StringInfoData input_message; +#ifdef HAVE_LIBSECCOMP + bool enable_seccomp; +#endif + + am_wal_redo_postgres = true; + + /* + * WAL redo does not need a large number of buffers. And speed of + * DropRelFileNodeAllLocalBuffers() is proportional to the number of + * buffers. 
So let's keep it small (default value is 1024) + */ + num_temp_buffers = 4; + + /* + * install the simple in-memory smgr + */ + smgr_hook = smgr_inmem; + smgr_init_hook = smgr_init_inmem; + + /* + * Validate we have been given a reasonable-looking DataDir and change into it. + */ + checkDataDir(); + ChangeToDataDir(); + + /* + * Create lockfile for data directory. + */ + CreateDataDirLockFile(false); + + /* read control file (error checking and contains config ) */ + LocalProcessControlFile(false); + + /* + * process any libraries that should be preloaded at postmaster start + */ + process_shared_preload_libraries(); + + /* Initialize MaxBackends (if under postmaster, was done already) */ + InitializeMaxBackends(); + +#if PG_VERSION_NUM >= 150000 + /* + * Give preloaded libraries a chance to request additional shared memory. + */ + process_shmem_requests(); + + /* + * Now that loadable modules have had their chance to request additional + * shared memory, determine the value of any runtime-computed GUCs that + * depend on the amount of shared memory required. + */ + InitializeShmemGUCs(); + + /* + * Now that modules have been loaded, we can process any custom resource + * managers specified in the wal_consistency_checking GUC. + */ + InitializeWalConsistencyChecking(); +#endif + + CreateSharedMemoryAndSemaphores(); + + /* + * Remember stand-alone backend startup time,roughly at the same point + * during startup that postmaster does so. + */ + PgStartTime = GetCurrentTimestamp(); + + /* + * Create a per-backend PGPROC struct in shared memory. We must do + * this before we can use LWLocks. + */ + InitAuxiliaryProcess(); + + SetProcessingMode(NormalProcessing); + + /* Redo routines won't work if we're not "in recovery" */ + InRecovery = true; + + /* + * Create the memory context we will use in the main loop. + * + * MessageContext is reset once per iteration of the main loop, ie, upon + * completion of processing of each command message from the client. + */ + MessageContext = AllocSetContextCreate(TopMemoryContext, + "MessageContext", + ALLOCSET_DEFAULT_SIZES); + + /* we need a ResourceOwner to hold buffer pins */ + Assert(CurrentResourceOwner == NULL); + CurrentResourceOwner = ResourceOwnerCreate(NULL, "wal redo"); + + /* Initialize resource managers */ + for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (RmgrTable[rmid].rm_startup != NULL) + RmgrTable[rmid].rm_startup(); + } + reader_state = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(), NULL); + +#ifdef HAVE_LIBSECCOMP + /* We prefer opt-out to opt-in for greater security */ + enable_seccomp = true; + for (int i = 1; i < argc; i++) + if (strcmp(argv[i], "--disable-seccomp") == 0) + enable_seccomp = false; + + /* + * We deliberately delay the transition to the seccomp mode + * until it's time to enter the main processing loop; + * else we'd have to add a lot more syscalls to the allowlist. + */ + if (enable_seccomp) + enter_seccomp_mode(); +#endif /* HAVE_LIBSECCOMP */ + + /* + * Main processing loop + */ + MemoryContextSwitchTo(MessageContext); + initStringInfo(&input_message); + + for (;;) + { + /* Release memory left over from prior query cycle. 
*/ + resetStringInfo(&input_message); + + set_ps_display("idle"); + + /* + * (3) read a command (loop blocks here) + */ + firstchar = ReadRedoCommand(&input_message); + switch (firstchar) + { + case 'B': /* BeginRedoForBlock */ + BeginRedoForBlock(&input_message); + break; + + case 'P': /* PushPage */ + PushPage(&input_message); + break; + + case 'A': /* ApplyRecord */ + ApplyRecord(&input_message); + break; + + case 'G': /* GetPage */ + GetPage(&input_message); + break; + + /* + * EOF means we're done. Perform normal shutdown. + */ + case EOF: + ereport(LOG, + (errmsg("received EOF on stdin, shutting down"))); + +#ifdef HAVE_LIBSECCOMP + /* + * Skip the shutdown sequence, leaving some garbage behind. + * Hopefully, postgres will clean it up in the next run. + * This way we don't have to enable extra syscalls, which is nice. + * See enter_seccomp_mode() above. + */ + if (enable_seccomp) + _exit(0); +#endif /* HAVE_LIBSECCOMP */ + /* + * NOTE: if you are tempted to add more code here, DON'T! + * Whatever you had in mind to do should be set up as an + * on_proc_exit or on_shmem_exit callback, instead. Otherwise + * it will fail to be called during other backend-shutdown + * scenarios. + */ + proc_exit(0); + + default: + ereport(FATAL, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid frontend message type %d", + firstchar))); + } + } /* end of input-reading loop */ +} + + +/* Version compatility wrapper for ReadBufferWithoutRelcache */ +static inline Buffer +NeonRedoReadBuffer(RelFileNode rnode, + ForkNumber forkNum, BlockNumber blockNum, + ReadBufferMode mode) +{ +#if PG_VERSION_NUM >= 150000 + return ReadBufferWithoutRelcache(rnode, forkNum, blockNum, mode, + NULL, /* no strategy */ + true); /* WAL redo is only performed on permanent rels */ +#else + return ReadBufferWithoutRelcache(rnode, forkNum, blockNum, mode, + NULL); /* no strategy */ +#endif +} + + +/* + * Some debug function that may be handy for now. + */ +pg_attribute_unused() +static char * +pprint_buffer(char *data, int len) +{ + StringInfoData s; + + initStringInfo(&s); + appendStringInfo(&s, "\n"); + for (int i = 0; i < len; i++) { + + appendStringInfo(&s, "%02x ", (*(((char *) data) + i) & 0xff) ); + if (i % 32 == 31) { + appendStringInfo(&s, "\n"); + } + } + appendStringInfo(&s, "\n"); + + return s.data; +} + +/* ---------------------------------------------------------------- + * routines to obtain user input + * ---------------------------------------------------------------- + */ + +/* + * Read next command from the client. + * + * the string entered by the user is placed in its parameter inBuf, + * and we act like a Q message was received. + * + * EOF is returned if end-of-file input is seen; time to shut down. 
+ * ---------------- + */ +static int +ReadRedoCommand(StringInfo inBuf) +{ + ssize_t ret; + char hdr[1 + sizeof(int32)]; + int qtype; + int32 len; + + /* Read message type and message length */ + ret = buffered_read(hdr, sizeof(hdr)); + if (ret != sizeof(hdr)) + { + if (ret == 0) + return EOF; + else if (ret < 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not read message header: %m"))); + else + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected EOF"))); + } + + qtype = hdr[0]; + memcpy(&len, &hdr[1], sizeof(int32)); + len = pg_ntoh32(len); + + if (len < 4) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid message length"))); + + len -= 4; /* discount length itself */ + + /* Read the message payload */ + enlargeStringInfo(inBuf, len); + ret = buffered_read(inBuf->data, len); + if (ret != len) + { + if (ret < 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not read message: %m"))); + else + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected EOF"))); + } + inBuf->len = len; + inBuf->data[len] = '\0'; + + return qtype; +} + +/* + * Prepare for WAL replay on given block + */ +static void +BeginRedoForBlock(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + SMgrRelation reln; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + wal_redo_buffer = InvalidBuffer; + + INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum); + + elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u", + target_redo_tag.rnode.spcNode, + target_redo_tag.rnode.dbNode, + target_redo_tag.rnode.relNode, + target_redo_tag.forkNum, + target_redo_tag.blockNum); + + reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); + if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || + reln->smgr_cached_nblocks[forknum] < blknum + 1) + { + reln->smgr_cached_nblocks[forknum] = blknum + 1; + } +} + +/* + * Receive a page given by the client, and put it into buffer cache. + */ +static void +PushPage(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + const char *content; + Buffer buf; + Page page; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + * 8k page content + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + content = pq_getmsgbytes(input_message, BLCKSZ); + + buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_ZERO_AND_LOCK); + wal_redo_buffer = buf; + page = BufferGetPage(buf); + memcpy(page, content, BLCKSZ); + MarkBufferDirty(buf); /* pro forma */ + UnlockReleaseBuffer(buf); +} + +/* + * Receive a WAL record, and apply it. + * + * All the pages should be loaded into the buffer cache by PushPage calls already. 
+ */ +static void +ApplyRecord(StringInfo input_message) +{ + char *errormsg; + XLogRecPtr lsn; + XLogRecord *record; + int nleft; + ErrorContextCallback errcallback; +#if PG_VERSION_NUM >= 150000 + DecodedXLogRecord *decoded; +#endif + + /* + * message format: + * + * LSN (the *end* of the record) + * record + */ + lsn = pq_getmsgint64(input_message); + + smgrinit(); /* reset inmem smgr state */ + + /* note: the input must be aligned here */ + record = (XLogRecord *) pq_getmsgbytes(input_message, sizeof(XLogRecord)); + + nleft = input_message->len - input_message->cursor; + if (record->xl_tot_len != sizeof(XLogRecord) + nleft) + elog(ERROR, "mismatch between record (%d) and message size (%d)", + record->xl_tot_len, (int) sizeof(XLogRecord) + nleft); + + /* Setup error traceback support for ereport() */ + errcallback.callback = apply_error_callback; + errcallback.arg = (void *) reader_state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + XLogBeginRead(reader_state, lsn); + +#if PG_VERSION_NUM >= 150000 + decoded = (DecodedXLogRecord *) XLogReadRecordAlloc(reader_state, record->xl_tot_len, true); + + if (!DecodeXLogRecord(reader_state, decoded, record, lsn, &errormsg)) + elog(ERROR, "failed to decode WAL record: %s", errormsg); + else + { + /* Record the location of the next record. */ + decoded->next_lsn = reader_state->NextRecPtr; + + /* + * If it's in the decode buffer, mark the decode buffer space as + * occupied. + */ + if (!decoded->oversized) + { + /* The new decode buffer head must be MAXALIGNed. */ + Assert(decoded->size == MAXALIGN(decoded->size)); + if ((char *) decoded == reader_state->decode_buffer) + reader_state->decode_buffer_tail = reader_state->decode_buffer + decoded->size; + else + reader_state->decode_buffer_tail += decoded->size; + } + + /* Insert it into the queue of decoded records. */ + Assert(reader_state->decode_queue_tail != decoded); + if (reader_state->decode_queue_tail) + reader_state->decode_queue_tail->next = decoded; + reader_state->decode_queue_tail = decoded; + if (!reader_state->decode_queue_head) + reader_state->decode_queue_head = decoded; + + /* + * Update the pointers to the beginning and one-past-the-end of this + * record, again for the benefit of historical code that expected the + * decoder to track this rather than accessing these fields of the record + * itself. + */ + reader_state->record = reader_state->decode_queue_head; + reader_state->ReadRecPtr = reader_state->record->lsn; + reader_state->EndRecPtr = reader_state->record->next_lsn; + } +#else + /* + * In lieu of calling XLogReadRecord, store the record 'decoded_record' + * buffer directly. + */ + reader_state->ReadRecPtr = lsn; + reader_state->decoded_record = record; + if (!DecodeXLogRecord(reader_state, record, &errormsg)) + elog(ERROR, "failed to decode WAL record: %s", errormsg); +#endif + + /* Ignore any other blocks than the ones the caller is interested in */ + redo_read_buffer_filter = redo_block_filter; + + RmgrTable[record->xl_rmid].rm_redo(reader_state); + + /* + * If no base image of the page was provided by PushPage, initialize + * wal_redo_buffer here. The first WAL record must initialize the page + * in that case. 
+ */ + if (BufferIsInvalid(wal_redo_buffer)) + { + wal_redo_buffer = NeonRedoReadBuffer(target_redo_tag.rnode, + target_redo_tag.forkNum, + target_redo_tag.blockNum, + RBM_NORMAL); + Assert(!BufferIsInvalid(wal_redo_buffer)); + ReleaseBuffer(wal_redo_buffer); + } + + redo_read_buffer_filter = NULL; + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + elog(TRACE, "applied WAL record with LSN %X/%X", + (uint32) (lsn >> 32), (uint32) lsn); +#if PG_VERSION_NUM >= 150000 + if (decoded && decoded->oversized) + pfree(decoded); +#endif +} + +/* + * Error context callback for errors occurring during ApplyRecord + */ +static void +apply_error_callback(void *arg) +{ + XLogReaderState *record = (XLogReaderState *) arg; + StringInfoData buf; + + initStringInfo(&buf); + xlog_outdesc(&buf, record); + + /* translator: %s is a WAL record description */ + errcontext("WAL redo at %X/%X for %s", + LSN_FORMAT_ARGS(record->ReadRecPtr), + buf.data); + + pfree(buf.data); +} + + + +static bool +redo_block_filter(XLogReaderState *record, uint8 block_id) +{ + BufferTag target_tag; + +#if PG_VERSION_NUM >= 150000 + XLogRecGetBlockTag(record, block_id, + &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum); +#else + if (!XLogRecGetBlockTag(record, block_id, + &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum)) + { + /* Caller specified a bogus block_id */ + elog(PANIC, "failed to locate backup block with ID %d", block_id); + } +#endif + + /* + * Can a WAL redo function ever access a relation other than the one that + * it modifies? I don't see why it would. + */ + if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode)) + elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u", + target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum); + + /* + * If this block isn't one we are currently restoring, then return 'true' + * so that this gets ignored + */ + return !BUFFERTAGS_EQUAL(target_tag, target_redo_tag); +} + +/* + * Get a page image back from buffer cache. + * + * After applying some records. 
+ */ +static void +GetPage(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + Buffer buf; + Page page; + int tot_written; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + + /* FIXME: check that we got a BeginRedoForBlock message or this earlier */ + + buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_NORMAL); + Assert(buf == wal_redo_buffer); + page = BufferGetPage(buf); + /* single thread, so don't bother locking the page */ + + /* Response: Page content */ + tot_written = 0; + do { + ssize_t rc; + + rc = write(STDOUT_FILENO, &page[tot_written], BLCKSZ - tot_written); + if (rc < 0) { + /* If interrupted by signal, just retry */ + if (errno == EINTR) + continue; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to stdout: %m"))); + } + tot_written += rc; + } while (tot_written < BLCKSZ); + + ReleaseBuffer(buf); + DropRelFileNodeAllLocalBuffers(rnode); + wal_redo_buffer = InvalidBuffer; + + elog(TRACE, "Page sent back for block %u", blknum); +} + + +/* Buffer used by buffered_read() */ +static char stdin_buf[16 * 1024]; +static size_t stdin_len = 0; /* # of bytes in buffer */ +static size_t stdin_ptr = 0; /* # of bytes already consumed */ + +/* + * Like read() on stdin, but buffered. + * + * We cannot use libc's buffered fread(), because it uses syscalls that we + * have disabled with seccomp(). Depending on the platform, it can call + * 'fstat' or 'newfstatat'. 'fstat' is probably harmless, but 'newfstatat' + * seems problematic because it allows interrogating files by path name. + * + * The return value is the number of bytes read. On error, -1 is returned, and + * errno is set appropriately. Unlike read(), this fills the buffer completely + * unless an error happens or EOF is reached. 
+ */ +static ssize_t +buffered_read(void *buf, size_t count) +{ + char *dst = buf; + + while (count > 0) + { + size_t nthis; + + if (stdin_ptr == stdin_len) + { + ssize_t ret; + + ret = read(STDIN_FILENO, stdin_buf, sizeof(stdin_buf)); + if (ret < 0) + { + /* don't do anything here that could set 'errno' */ + return ret; + } + if (ret == 0) + { + /* EOF */ + break; + } + stdin_len = (size_t) ret; + stdin_ptr = 0; + } + nthis = Min(stdin_len - stdin_ptr, count); + + memcpy(dst, &stdin_buf[stdin_ptr], nthis); + + stdin_ptr += nthis; + count -= nthis; + dst += nthis; + } + + return (dst - (char *) buf); +} diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index bdd502a8da..e9b0010b45 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit bdd502a8da5de9e0ac709caabc0401455c97d235 +Subproject commit e9b0010b45b287eea2213427ebac53a3fb7bdce9 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 64558b386b..5cd7e44799 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 64558b386bcd5a3300163ec7ea5d7f31cef8593c +Subproject commit 5cd7e44799567c52f13dc8c42e0bcab913022438 From c64a121aa8632d5838fee865fcd5e59229864cb7 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 31 Oct 2022 18:03:25 +0200 Subject: [PATCH 0973/1022] do not nest wal_connection_manager span inside parent one --- pageserver/src/walreceiver/connection_manager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 53dd2d8eac..1d53df788d 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -93,7 +93,7 @@ pub fn spawn_connection_manager_task( } } .instrument( - info_span!("wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id), + info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id), ), ); } From 0df3467146f9368d67a145c59949e7f0ee6c01c9 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Mon, 31 Oct 2022 18:39:29 +0300 Subject: [PATCH 0974/1022] Refactoring: replace `utils::connstring` with `Url`-based APIs --- Cargo.lock | 1 + control_plane/Cargo.toml | 15 ++--- control_plane/src/compute.rs | 4 +- control_plane/src/connection.rs | 57 +++++++++++++++++++ control_plane/src/lib.rs | 1 + control_plane/src/safekeeper.rs | 12 ++-- control_plane/src/storage.rs | 15 +++-- libs/utils/src/connstring.rs | 52 ----------------- libs/utils/src/lib.rs | 3 - .../src/walreceiver/connection_manager.rs | 37 ++++++------ 10 files changed, 103 insertions(+), 94 deletions(-) create mode 100644 control_plane/src/connection.rs delete mode 100644 libs/utils/src/connstring.rs diff --git a/Cargo.lock b/Cargo.lock index 3e67126add..326cccaecb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -600,6 +600,7 @@ dependencies = [ "tar", "thiserror", "toml", + "url", "utils", "workspace_hack", ] diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 287385c709..a9d30b4a86 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -4,20 +4,21 @@ version = "0.1.0" edition = "2021" [dependencies] +anyhow = "1.0" clap = "4.0" comfy-table = "6.1" git-version = "0.3.5" -tar = "0.4.38" +nix = "0.25" +once_cell = "1.13.0" postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +regex = "1" +reqwest = { version = "0.11", default-features = false, features = 
["blocking", "json", "rustls-tls"] } serde = { version = "1.0", features = ["derive"] } serde_with = "2.0" -toml = "0.5" -once_cell = "1.13.0" -regex = "1" -anyhow = "1.0" +tar = "0.4.38" thiserror = "1" -nix = "0.25" -reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } +toml = "0.5" +url = "2.2.2" # Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api # instead, so that recompile times are better. diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index b3f90b5922..89e4e85eb0 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -12,7 +12,6 @@ use std::time::Duration; use anyhow::{Context, Result}; use utils::{ - connstring::connection_host_port, id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, @@ -300,7 +299,8 @@ impl PostgresNode { // Configure the node to fetch pages from pageserver let pageserver_connstr = { - let (host, port) = connection_host_port(&self.pageserver.pg_connection_config); + let config = &self.pageserver.pg_connection_config; + let (host, port) = (config.host(), config.port()); // Set up authentication // diff --git a/control_plane/src/connection.rs b/control_plane/src/connection.rs new file mode 100644 index 0000000000..cca837de6e --- /dev/null +++ b/control_plane/src/connection.rs @@ -0,0 +1,57 @@ +use url::Url; + +#[derive(Debug)] +pub struct PgConnectionConfig { + url: Url, +} + +impl PgConnectionConfig { + pub fn host(&self) -> &str { + self.url.host_str().expect("BUG: no host") + } + + pub fn port(&self) -> u16 { + self.url.port().expect("BUG: no port") + } + + /// Return a `:` string. + pub fn raw_address(&self) -> String { + format!("{}:{}", self.host(), self.port()) + } + + /// Connect using postgres protocol with TLS disabled. + pub fn connect_no_tls(&self) -> Result { + postgres::Client::connect(self.url.as_str(), postgres::NoTls) + } +} + +impl std::str::FromStr for PgConnectionConfig { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let mut url: Url = s.parse()?; + + match url.scheme() { + "postgres" | "postgresql" => {} + other => anyhow::bail!("invalid scheme: {other}"), + } + + // It's not a valid connection url if host is unavailable. + if url.host().is_none() { + anyhow::bail!(url::ParseError::EmptyHost); + } + + // E.g. `postgres:bar`. + if url.cannot_be_a_base() { + anyhow::bail!("URL cannot be a base"); + } + + // Set the default PG port if it's missing. 
+ if url.port().is_none() { + url.set_port(Some(5432)) + .expect("BUG: couldn't set the default port"); + } + + Ok(Self { url }) + } +} diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index 17232ccf45..f22dce1810 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -12,6 +12,7 @@ use std::path::Path; use std::process::Command; pub mod compute; +pub mod connection; pub mod etcd; pub mod local_env; pub mod postgresql_conf; diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 17f5d0c109..91cedeca23 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -9,12 +9,12 @@ use anyhow::bail; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; -use postgres::Config; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; -use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId}; +use utils::{http::error::HttpErrorBody, id::NodeId}; +use crate::connection::PgConnectionConfig; use crate::local_env::{LocalEnv, SafekeeperConf}; use crate::storage::PageServerNode; use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile}; @@ -63,7 +63,7 @@ pub struct SafekeeperNode { pub conf: SafekeeperConf, - pub pg_connection_config: Config, + pub pg_connection_config: PgConnectionConfig, pub env: LocalEnv, pub http_client: Client, pub http_base_url: String, @@ -87,9 +87,9 @@ impl SafekeeperNode { } /// Construct libpq connection string for connecting to this safekeeper. - fn safekeeper_connection_config(port: u16) -> Config { + fn safekeeper_connection_config(port: u16) -> PgConnectionConfig { // TODO safekeeper authentication not implemented yet - format!("postgresql://no_user@127.0.0.1:{}/no_db", port) + format!("postgresql://no_user@127.0.0.1:{port}/no_db") .parse() .unwrap() } @@ -109,7 +109,7 @@ impl SafekeeperNode { pub fn start(&self) -> anyhow::Result<()> { print!( "Starting safekeeper at '{}' in '{}'", - connection_address(&self.pg_connection_config), + self.pg_connection_config.raw_address(), self.datadir_path().display() ); io::stdout().flush().unwrap(); diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 59cb3d7efb..4b705690f0 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -7,6 +7,7 @@ use std::process::Command; use std::time::Duration; use std::{io, result, thread}; +use crate::connection::PgConnectionConfig; use anyhow::{bail, Context}; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; @@ -14,12 +15,10 @@ use nix::unistd::Pid; use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo, }; -use postgres::{Config, NoTls}; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; use utils::{ - connstring::connection_address, http::error::HttpErrorBody, id::{TenantId, TimelineId}, lsn::Lsn, @@ -75,7 +74,7 @@ impl ResponseErrorMessageExt for Response { // #[derive(Debug)] pub struct PageServerNode { - pub pg_connection_config: Config, + pub pg_connection_config: PgConnectionConfig, pub env: LocalEnv, pub http_client: Client, pub http_base_url: String, @@ -101,7 +100,7 @@ impl PageServerNode { } /// Construct libpq connection string for connecting to the pageserver. 
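// Editor's sketch, not part of the patch: how the PgConnectionConfig type added
// in control_plane/src/connection.rs behaves for a pageserver-style URL like the
// one built below. The function name and values are illustrative; assumes
// `use control_plane::connection::PgConnectionConfig;` and the anyhow crate.
fn _pg_connection_config_example() -> anyhow::Result<()> {
    let config: PgConnectionConfig = "postgresql://no_user:pass@127.0.0.1:64000/no_db".parse()?;
    assert_eq!(config.host(), "127.0.0.1");
    assert_eq!(config.port(), 64000);
    assert_eq!(config.raw_address(), "127.0.0.1:64000");
    Ok(())
}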
- fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config { + fn pageserver_connection_config(password: &str, listen_addr: &str) -> PgConnectionConfig { format!("postgresql://no_user:{password}@{listen_addr}/no_db") .parse() .unwrap() @@ -212,7 +211,7 @@ impl PageServerNode { ) -> anyhow::Result<()> { println!( "Starting pageserver at '{}' in '{}'", - connection_address(&self.pg_connection_config), + self.pg_connection_config.raw_address(), datadir.display() ); io::stdout().flush()?; @@ -343,14 +342,14 @@ impl PageServerNode { } pub fn page_server_psql(&self, sql: &str) -> Vec { - let mut client = self.pg_connection_config.connect(NoTls).unwrap(); + let mut client = self.pg_connection_config.connect_no_tls().unwrap(); println!("Pageserver query: '{sql}'"); client.simple_query(sql).unwrap() } pub fn page_server_psql_client(&self) -> result::Result { - self.pg_connection_config.connect(NoTls) + self.pg_connection_config.connect_no_tls() } fn http_request(&self, method: Method, url: U) -> RequestBuilder { @@ -549,7 +548,7 @@ impl PageServerNode { pg_wal: Option<(Lsn, PathBuf)>, pg_version: u32, ) -> anyhow::Result<()> { - let mut client = self.pg_connection_config.connect(NoTls).unwrap(); + let mut client = self.pg_connection_config.connect_no_tls().unwrap(); // Init base reader let (start_lsn, base_tarfile_path) = base; diff --git a/libs/utils/src/connstring.rs b/libs/utils/src/connstring.rs deleted file mode 100644 index cda8eeac86..0000000000 --- a/libs/utils/src/connstring.rs +++ /dev/null @@ -1,52 +0,0 @@ -use postgres::Config; - -pub fn connection_host_port(config: &Config) -> (String, u16) { - assert_eq!( - config.get_hosts().len(), - 1, - "only one pair of host and port is supported in connection string" - ); - assert_eq!( - config.get_ports().len(), - 1, - "only one pair of host and port is supported in connection string" - ); - let host = match &config.get_hosts()[0] { - postgres::config::Host::Tcp(host) => host.as_ref(), - postgres::config::Host::Unix(host) => host.to_str().unwrap(), - }; - (host.to_owned(), config.get_ports()[0]) -} - -pub fn connection_address(config: &Config) -> String { - let (host, port) = connection_host_port(config); - format!("{}:{}", host, port) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_connection_host_port() { - let config: Config = "postgresql://no_user@localhost:64000/no_db" - .parse() - .unwrap(); - assert_eq!( - connection_host_port(&config), - ("localhost".to_owned(), 64000) - ); - } - - #[test] - #[should_panic(expected = "only one pair of host and port is supported in connection string")] - fn test_connection_host_port_multiple_ports() { - let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db" - .parse() - .unwrap(); - assert_eq!( - connection_host_port(&config), - ("localhost".to_owned(), 64000) - ); - } -} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index f1f48f5a90..aff86c8076 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -19,9 +19,6 @@ pub mod postgres_backend; pub mod postgres_backend_async; pub mod pq_proto; -// dealing with connstring parsing and handy access to it's parts -pub mod connstring; - // helper functions for creating and fsyncing pub mod crashsafe; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 1d53df788d..d527e521e0 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -836,15 +836,20 
@@ fn wal_stream_connection_string( listen_pg_addr_str: &str, ) -> anyhow::Result<String> { let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db"); - let me_conf = sk_connstr - .parse::() - .with_context(|| { - format!("Failed to parse pageserver connection string '{sk_connstr}' as a postgres one") - })?; - let (host, port) = utils::connstring::connection_host_port(&me_conf); - Ok(format!( - "host={host} port={port} options='-c timeline_id={timeline_id} tenant_id={tenant_id}'" - )) + sk_connstr + .parse() + .context("bad url") + .and_then(|url: url::Url| { + let host = url.host_str().context("host is missing")?; + let port = url.port().unwrap_or(5432); // default PG port + + Ok(format!( + "host={host} \ + port={port} \ + options='-c timeline_id={timeline_id} tenant_id={tenant_id}'" + )) + }) + .with_context(|| format!("Failed to parse pageserver connection URL '{sk_connstr}'")) } #[cfg(test)] @@ -892,7 +897,7 @@ mod tests { peer_horizon_lsn: None, local_start_lsn: None, - safekeeper_connstr: Some("no commit_lsn".to_string()), + safekeeper_connstr: Some("no_commit_lsn".to_string()), }, etcd_version: 0, latest_update: now, @@ -909,7 +914,7 @@ mod tests { remote_consistent_lsn: None, peer_horizon_lsn: None, local_start_lsn: None, - safekeeper_connstr: Some("no commit_lsn".to_string()), + safekeeper_connstr: Some("no_commit_lsn".to_string()), }, etcd_version: 0, latest_update: now, @@ -1005,7 +1010,7 @@ mod tests { peer_horizon_lsn: None, local_start_lsn: None, - safekeeper_connstr: Some("not advanced Lsn".to_string()), + safekeeper_connstr: Some("not_advanced_lsn".to_string()), }, etcd_version: 0, latest_update: now, @@ -1023,7 +1028,7 @@ mod tests { peer_horizon_lsn: None, local_start_lsn: None, - safekeeper_connstr: Some("not enough advanced Lsn".to_string()), + safekeeper_connstr: Some("not_enough_advanced_lsn".to_string()), }, etcd_version: 0, latest_update: now, @@ -1093,7 +1098,7 @@ mod tests { peer_horizon_lsn: None, local_start_lsn: None, - safekeeper_connstr: Some("smaller commit_lsn".to_string()), + safekeeper_connstr: Some("smaller_commit_lsn".to_string()), }, etcd_version: 0, latest_update: now, @@ -1283,7 +1288,7 @@ mod tests { peer_horizon_lsn: None, local_start_lsn: None, - safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()), + safekeeper_connstr: Some("advanced_by_lsn_safekeeper".to_string()), }, etcd_version: 0, latest_update: now, @@ -1307,7 +1312,7 @@ mod tests { ); assert!(over_threshcurrent_candidate .wal_source_connstr - .contains("advanced by Lsn safekeeper")); + .contains("advanced_by_lsn_safekeeper")); Ok(()) } From 32d14403bd6ab4f4520a94cbfd81a6acef7a526c Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 31 Oct 2022 18:17:38 +0200 Subject: [PATCH 0975/1022] remove wrong is_active filter for timelines in compaction/gc GC needs to know about all branch points, not only the ones for timelines that are active at the moment of GC. If a timeline is inactive, we won't know about its branch point, and GC can then delete data that is still needed by a child timeline. For compaction it is less severe: delaying compaction only affects performance, so it is still better to run it.
There is a logic to exit it quickly if there is nothing to compact --- pageserver/src/tenant.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 84833e9c40..ed9d2e8c7a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -523,7 +523,6 @@ impl Tenant { let timelines = self.timelines.lock().unwrap(); let timelines_to_compact = timelines .iter() - .filter(|(_, timeline)| timeline.is_active()) .map(|(timeline_id, timeline)| (*timeline_id, timeline.clone())) .collect::>(); drop(timelines); @@ -995,7 +994,6 @@ impl Tenant { timelines .iter() - .filter(|(_, timeline)| timeline.is_active()) .map(|(timeline_id, timeline_entry)| { // This is unresolved question for now, how to do gc in presence of remote timelines // especially when this is combined with branching. From 6df4d5c91122294558793e7cb760ae80b9ba9fbc Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 2 Nov 2022 01:21:33 +0200 Subject: [PATCH 0976/1022] Bump rustc to 1.62.1 (#2728) Changelog: https://github.com/rust-lang/rust/blob/master/RELEASES.md#version-1621-2022-07-19 --- .github/workflows/build_and_test.yml | 6 +++--- .github/workflows/codestyle.yml | 2 +- rust-toolchain.toml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 91d9561e7d..435265270a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -127,8 +127,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} - v9-${{ runner.os }}-${{ matrix.build_type }}-cargo- + v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + v10-${{ runner.os }}-${{ matrix.build_type }}-cargo- - name: Cache postgres v14 build id: cache_pg_14 @@ -389,7 +389,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 66f9f33256..1e77963760 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -106,7 +106,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git target - key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust + key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust - name: Run cargo clippy run: ./run_clippy.sh diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 5aa0f8d4e5..928a10e555 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -4,7 +4,7 @@ # version, we can consider updating. # See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package, # we use "unstable" version number as the highest version used in the project by default. -channel = "1.61" # do update GitHub CI cache values for rust builds, when changing this value +channel = "1.62.1" # do update GitHub CI cache values for rust builds, when changing this value profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
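To make the garbage-collection argument above concrete (the removal of the is_active filter): every child timeline pins a branch point on its parent whether or not the child is currently active, so the set of LSNs that GC must retain has to be computed over all timelines. A minimal sketch with hypothetical, simplified types, not the real pageserver structures:

    use std::collections::HashMap;

    // Hypothetical, simplified types for illustration only.
    #[derive(Clone, PartialEq, Eq, Hash)]
    struct TimelineId(u64);

    #[derive(Clone, Copy)]
    struct Lsn(u64);

    struct Timeline {
        // (parent timeline, LSN at which this timeline branched off the parent)
        ancestor: Option<(TimelineId, Lsn)>,
    }

    // Branch-point LSNs that GC must retain, keyed by parent timeline.
    // Deliberately no is_active filter: an inactive child still pins its branch point,
    // and skipping it would let GC remove data that child needs.
    fn branch_points(timelines: &HashMap<TimelineId, Timeline>) -> HashMap<TimelineId, Vec<Lsn>> {
        let mut retain: HashMap<TimelineId, Vec<Lsn>> = HashMap::new();
        for timeline in timelines.values() {
            if let Some((parent, branch_lsn)) = &timeline.ancestor {
                retain.entry(parent.clone()).or_default().push(*branch_lsn);
            }
        }
        retain
    }

With an is_active filter in place, retain would silently miss entries for inactive children, which is exactly the data-loss scenario the commit message describes.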
# https://rust-lang.github.io/rustup/concepts/profiles.html From d42700280fffab16327949125d4c09c89de51c84 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 2 Nov 2022 02:26:37 +0200 Subject: [PATCH 0977/1022] Remove daemonize from storage components (#2677) Move daemonization logic into `control_plane`. Storage binaries now only crate a lockfile to avoid concurrent services running in the same directory. --- Cargo.lock | 19 +- control_plane/src/background_process.rs | 264 ++++++++++++++++++ control_plane/src/bin/neon_local.rs | 2 +- control_plane/src/compute.rs | 2 +- control_plane/src/etcd.rs | 130 ++++----- control_plane/src/lib.rs | 52 +--- control_plane/src/local_env.rs | 8 +- .../src/{storage.rs => pageserver.rs} | 168 ++++------- control_plane/src/safekeeper.rs | 189 ++++--------- libs/utils/src/lib.rs | 2 + libs/utils/src/lock_file.rs | 81 ++++++ libs/utils/src/logging.rs | 59 +--- libs/utils/tests/logger_json_test.rs | 36 --- libs/utils/tests/logger_plain_test.rs | 36 --- pageserver/Cargo.toml | 1 - pageserver/src/bin/pageserver.rs | 91 +++--- pageserver/src/lib.rs | 3 - pageserver/src/walredo.rs | 4 - poetry.lock | 23 +- safekeeper/Cargo.toml | 2 +- safekeeper/src/bin/safekeeper.rs | 80 ++---- safekeeper/src/lib.rs | 2 - test_runner/fixtures/neon_fixtures.py | 50 +--- test_runner/fixtures/utils.py | 59 +++- test_runner/regress/test_auth.py | 6 +- test_runner/regress/test_compatibility.py | 4 +- test_runner/regress/test_neon_cli.py | 6 +- test_runner/regress/test_normal_work.py | 4 +- test_runner/regress/test_pageserver_api.py | 11 +- test_runner/regress/test_remote_storage.py | 3 +- test_runner/regress/test_tenant_detach.py | 14 +- test_runner/regress/test_tenant_relocation.py | 44 +-- test_runner/regress/test_tenant_tasks.py | 3 +- .../test_tenants_with_remote_storage.py | 3 +- test_runner/regress/test_timeline_delete.py | 11 +- test_runner/regress/test_timeline_size.py | 4 +- test_runner/regress/test_wal_acceptor.py | 22 +- 37 files changed, 754 insertions(+), 744 deletions(-) create mode 100644 control_plane/src/background_process.rs rename control_plane/src/{storage.rs => pageserver.rs} (80%) create mode 100644 libs/utils/src/lock_file.rs delete mode 100644 libs/utils/tests/logger_json_test.rs delete mode 100644 libs/utils/tests/logger_plain_test.rs diff --git a/Cargo.lock b/Cargo.lock index 326cccaecb..01b8abda9a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -317,12 +317,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "boxfnonce" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426" - [[package]] name = "bstr" version = "1.0.1" @@ -850,16 +844,6 @@ dependencies = [ "syn", ] -[[package]] -name = "daemonize" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70c24513e34f53b640819f0ac9f705b673fcf4006d7aab8778bee72ebfc89815" -dependencies = [ - "boxfnonce", - "libc", -] - [[package]] name = "darling" version = "0.14.1" @@ -2141,7 +2125,6 @@ dependencies = [ "crc32c", "criterion", "crossbeam-utils", - "daemonize", "etcd_broker", "fail", "futures", @@ -3088,7 +3071,6 @@ dependencies = [ "clap 4.0.15", "const_format", "crc32c", - "daemonize", "etcd_broker", "fs2", "git-version", @@ -3096,6 +3078,7 @@ dependencies = [ "humantime", "hyper", "metrics", + "nix 0.25.0", "once_cell", "parking_lot 0.12.1", "postgres", diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs new 
file mode 100644 index 0000000000..2f8098b7c9 --- /dev/null +++ b/control_plane/src/background_process.rs @@ -0,0 +1,264 @@ +//! Spawns and kills background processes that are needed by Neon CLI. +//! Applies common set-up such as log and pid files (if needed) to every process. +//! +//! Neon CLI does not run in background, so it needs to store the information about +//! spawned processes, which it does in this module. +//! We do that by storing the pid of the process in the "${process_name}.pid" file. +//! The pid file can be created by the process itself +//! (Neon storage binaries do that and also ensure that a lock is taken onto that file) +//! or we create such file after starting the process +//! (non-Neon binaries don't necessarily follow our pidfile conventions). +//! The pid stored in the file is later used to stop the service. +//! +//! See [`lock_file`] module for more info. + +use std::ffi::OsStr; +use std::io::Write; +use std::path::Path; +use std::process::{Child, Command}; +use std::time::Duration; +use std::{fs, io, thread}; + +use anyhow::{anyhow, bail, Context, Result}; +use nix::errno::Errno; +use nix::sys::signal::{kill, Signal}; +use nix::unistd::Pid; + +use utils::lock_file; + +const RETRIES: u32 = 15; +const RETRY_TIMEOUT_MILLIS: u64 = 500; + +/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates +/// it itself. +pub enum InitialPidFile<'t> { + /// Create a pidfile, to allow future CLI invocations to manipulate the process. + Create(&'t Path), + /// The process will create the pidfile itself, need to wait for that event. + Expect(&'t Path), +} + +/// Start a background child process using the parameters given. +pub fn start_process>( + process_name: &str, + datadir: &Path, + command: &Path, + args: &[S], + initial_pid_file: InitialPidFile, + process_status_check: F, +) -> anyhow::Result +where + F: Fn() -> anyhow::Result, +{ + let log_path = datadir.join(format!("{process_name}.log")); + let process_log_file = fs::OpenOptions::new() + .create(true) + .write(true) + .append(true) + .open(&log_path) + .with_context(|| { + format!("Could not open {process_name} log file {log_path:?} for writing") + })?; + let same_file_for_stderr = process_log_file.try_clone().with_context(|| { + format!("Could not reuse {process_name} log file {log_path:?} for writing stderr") + })?; + + let mut command = Command::new(command); + let background_command = command + .stdout(process_log_file) + .stderr(same_file_for_stderr) + .args(args); + let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command)); + + let mut spawned_process = filled_cmd.spawn().with_context(|| { + format!("Could not spawn {process_name}, see console output and log files for details.") + })?; + let pid = spawned_process.id(); + let pid = Pid::from_raw( + i32::try_from(pid) + .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?, + ); + + let pid_file_to_check = match initial_pid_file { + InitialPidFile::Create(target_pid_file_path) => { + match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) { + lock_file::LockCreationResult::Created { .. } => { + // We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon + // as this CLI invocation exits, so it's a bit useless, but doesn't any harm either. + } + lock_file::LockCreationResult::AlreadyLocked { .. 
} => { + anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process") + } + lock_file::LockCreationResult::CreationFailed(e) => { + return Err(e.context(format!( + "Failed to create pid file for {process_name} at path {target_pid_file_path:?}" + ))) + } + } + None + } + InitialPidFile::Expect(pid_file_path) => Some(pid_file_path), + }; + + for retries in 0..RETRIES { + match process_started(pid, pid_file_to_check, &process_status_check) { + Ok(true) => { + println!("\n{process_name} started, pid: {pid}"); + return Ok(spawned_process); + } + Ok(false) => { + if retries < 5 { + print!("."); + io::stdout().flush().unwrap(); + } else { + if retries == 5 { + println!() // put a line break after dots for second message + } + println!("{process_name} has not started yet, retrying ({retries})..."); + } + thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS)); + } + Err(e) => { + println!("{process_name} failed to start: {e:#}"); + if let Err(e) = spawned_process.kill() { + println!("Could not stop {process_name} subprocess: {e:#}") + }; + return Err(e); + } + } + } + anyhow::bail!("{process_name} could not start in {RETRIES} attempts"); +} + +/// Stops the process, using the pid file given. Returns Ok also if the process is already not running. +pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> { + if !pid_file.exists() { + println!("{process_name} is already stopped: no pid file {pid_file:?} is present"); + return Ok(()); + } + let pid = read_pidfile(pid_file)?; + + let sig = if immediate { + print!("Stopping {process_name} with pid {pid} immediately.."); + Signal::SIGQUIT + } else { + print!("Stopping {process_name} with pid {pid} gracefully.."); + Signal::SIGTERM + }; + io::stdout().flush().unwrap(); + match kill(pid, sig) { + Ok(()) => (), + Err(Errno::ESRCH) => { + println!( + "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found" + ); + return Ok(()); + } + Err(e) => anyhow::bail!("Failed to send signal to {process_name} with pid {pid}: {e}"), + } + + // Wait until process is gone + for _ in 0..RETRIES { + match process_has_stopped(pid) { + Ok(true) => { + println!("\n{process_name} stopped"); + if let Err(e) = fs::remove_file(pid_file) { + if e.kind() != io::ErrorKind::NotFound { + eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}"); + } + } + return Ok(()); + } + Ok(false) => { + print!("."); + io::stdout().flush().unwrap(); + thread::sleep(Duration::from_secs(1)) + } + Err(e) => { + println!("{process_name} with pid {pid} failed to stop: {e:#}"); + return Err(e); + } + } + } + + anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts"); +} + +fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { + let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1"); + + let var = "LLVM_PROFILE_FILE"; + if let Some(val) = std::env::var_os(var) { + filled_cmd = filled_cmd.env(var, val); + } + + const RUST_LOG_KEY: &str = "RUST_LOG"; + if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) { + filled_cmd.env(RUST_LOG_KEY, rust_log_value) + } else { + filled_cmd + } +} + +fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { + for env_key in [ + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "AWS_SESSION_TOKEN", + ] { + if let Ok(value) = std::env::var(env_key) { + cmd = cmd.env(env_key, value); + } + } + cmd +} + +fn process_started( + pid: Pid, + pid_file_to_check: 
Option<&Path>, + status_check: &F, +) -> anyhow::Result +where + F: Fn() -> anyhow::Result, +{ + match status_check() { + Ok(true) => match pid_file_to_check { + Some(pid_file_path) => { + if pid_file_path.exists() { + let pid_in_file = read_pidfile(pid_file_path)?; + Ok(pid_in_file == pid) + } else { + Ok(false) + } + } + None => Ok(true), + }, + Ok(false) => Ok(false), + Err(e) => anyhow::bail!("process failed to start: {e}"), + } +} + +/// Read a PID file +/// +/// We expect a file that contains a single integer. +fn read_pidfile(pidfile: &Path) -> Result { + let pid_str = fs::read_to_string(pidfile) + .with_context(|| format!("failed to read pidfile {pidfile:?}"))?; + let pid: i32 = pid_str + .parse() + .map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?; + if pid < 1 { + bail!("pidfile {pidfile:?} contained bad value '{pid}'"); + } + Ok(Pid::from_raw(pid)) +} + +fn process_has_stopped(pid: Pid) -> anyhow::Result { + match kill(pid, None) { + // Process exists, keep waiting + Ok(_) => Ok(false), + // Process not found, we're done + Err(Errno::ESRCH) => Ok(true), + Err(err) => anyhow::bail!("Failed to send signal to process with pid {pid}: {err}"), + } +} diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 70a2c97a9e..42a9199037 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -9,8 +9,8 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; use control_plane::compute::ComputeControlPlane; use control_plane::local_env::{EtcdBroker, LocalEnv}; +use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; -use control_plane::storage::PageServerNode; use control_plane::{etcd, local_env}; use pageserver_api::models::TimelineInfo; use pageserver_api::{ diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 89e4e85eb0..359948a8c9 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -18,8 +18,8 @@ use utils::{ }; use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION}; +use crate::pageserver::PageServerNode; use crate::postgresql_conf::PostgresConf; -use crate::storage::PageServerNode; // // ComputeControlPlane diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs index ca2df8a50b..4c15914e24 100644 --- a/control_plane/src/etcd.rs +++ b/control_plane/src/etcd.rs @@ -1,99 +1,75 @@ -use std::{ - fs, - path::PathBuf, - process::{Command, Stdio}, -}; +use std::{fs, path::PathBuf}; use anyhow::Context; -use nix::{ - sys::signal::{kill, Signal}, - unistd::Pid, -}; -use crate::{local_env, read_pidfile}; +use crate::{background_process, local_env}; pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { let etcd_broker = &env.etcd_broker; println!( - "Starting etcd broker using {}", - etcd_broker.etcd_binary_path.display() + "Starting etcd broker using {:?}", + etcd_broker.etcd_binary_path ); let etcd_data_dir = env.base_data_dir.join("etcd"); - fs::create_dir_all(&etcd_data_dir).with_context(|| { - format!( - "Failed to create etcd data dir: {}", - etcd_data_dir.display() - ) - })?; + fs::create_dir_all(&etcd_data_dir) + .with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?; - let etcd_stdout_file = - fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| { - format!( - "Failed to create etcd stout file in directory {}", - etcd_data_dir.display() - ) - })?; - let etcd_stderr_file = - 
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| { - format!( - "Failed to create etcd stderr file in directory {}", - etcd_data_dir.display() - ) - })?; let client_urls = etcd_broker.comma_separated_endpoints(); + let args = [ + format!("--data-dir={}", etcd_data_dir.display()), + format!("--listen-client-urls={client_urls}"), + format!("--advertise-client-urls={client_urls}"), + // Set --quota-backend-bytes to keep the etcd virtual memory + // size smaller. Our test etcd clusters are very small. + // See https://github.com/etcd-io/etcd/issues/7910 + "--quota-backend-bytes=100000000".to_string(), + // etcd doesn't compact (vacuum) with default settings, + // enable it to prevent space exhaustion. + "--auto-compaction-mode=revision".to_string(), + "--auto-compaction-retention=1".to_string(), + ]; - let etcd_process = Command::new(&etcd_broker.etcd_binary_path) - .args(&[ - format!("--data-dir={}", etcd_data_dir.display()), - format!("--listen-client-urls={client_urls}"), - format!("--advertise-client-urls={client_urls}"), - // Set --quota-backend-bytes to keep the etcd virtual memory - // size smaller. Our test etcd clusters are very small. - // See https://github.com/etcd-io/etcd/issues/7910 - "--quota-backend-bytes=100000000".to_string(), - // etcd doesn't compact (vacuum) with default settings, - // enable it to prevent space exhaustion. - "--auto-compaction-mode=revision".to_string(), - "--auto-compaction-retention=1".to_string(), - ]) - .stdout(Stdio::from(etcd_stdout_file)) - .stderr(Stdio::from(etcd_stderr_file)) - .spawn() - .context("Failed to spawn etcd subprocess")?; - let pid = etcd_process.id(); + let pid_file_path = etcd_pid_file_path(env); - let etcd_pid_file_path = etcd_pid_file_path(env); - fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| { - format!( - "Failed to create etcd pid file at {}", - etcd_pid_file_path.display() - ) - })?; + let client = reqwest::blocking::Client::new(); + + background_process::start_process( + "etcd", + &etcd_data_dir, + &etcd_broker.etcd_binary_path, + &args, + background_process::InitialPidFile::Create(&pid_file_path), + || { + for broker_endpoint in &etcd_broker.broker_endpoints { + let request = broker_endpoint + .join("health") + .with_context(|| { + format!( + "Failed to append /health path to broker endopint {}", + broker_endpoint + ) + }) + .and_then(|url| { + client.get(&url.to_string()).build().with_context(|| { + format!("Failed to construct request to etcd endpoint {url}") + }) + })?; + if client.execute(request).is_ok() { + return Ok(true); + } + } + + Ok(false) + }, + ) + .context("Failed to spawn etcd subprocess")?; Ok(()) } pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { - let etcd_path = &env.etcd_broker.etcd_binary_path; - println!("Stopping etcd broker at {}", etcd_path.display()); - - let etcd_pid_file_path = etcd_pid_file_path(env); - let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| { - format!( - "Failed to read etcd pid file at {}", - etcd_pid_file_path.display() - ) - })?); - - kill(pid, Signal::SIGTERM).with_context(|| { - format!( - "Failed to stop etcd with pid {pid} at {}", - etcd_pid_file_path.display() - ) - })?; - - Ok(()) + background_process::stop_process(true, "etcd", &etcd_pid_file_path(env)) } fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf { diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index f22dce1810..c3b47fe81b 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ 
-6,60 +6,12 @@ // Intended to be used in integration tests and in CLI tools for // local installations. // -use anyhow::{anyhow, bail, Context, Result}; -use std::fs; -use std::path::Path; -use std::process::Command; +mod background_process; pub mod compute; pub mod connection; pub mod etcd; pub mod local_env; +pub mod pageserver; pub mod postgresql_conf; pub mod safekeeper; -pub mod storage; - -/// Read a PID file -/// -/// We expect a file that contains a single integer. -/// We return an i32 for compatibility with libc and nix. -pub fn read_pidfile(pidfile: &Path) -> Result { - let pid_str = fs::read_to_string(pidfile) - .with_context(|| format!("failed to read pidfile {:?}", pidfile))?; - let pid: i32 = pid_str - .parse() - .map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?; - if pid < 1 { - bail!("pidfile {:?} contained bad value '{}'", pidfile, pid); - } - Ok(pid) -} - -fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { - let cmd = cmd.env_clear().env("RUST_BACKTRACE", "1"); - - let var = "LLVM_PROFILE_FILE"; - if let Some(val) = std::env::var_os(var) { - cmd.env(var, val); - } - - const RUST_LOG_KEY: &str = "RUST_LOG"; - if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) { - cmd.env(RUST_LOG_KEY, rust_log_value) - } else { - cmd - } -} - -fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { - for env_key in [ - "AWS_ACCESS_KEY_ID", - "AWS_SECRET_ACCESS_KEY", - "AWS_SESSION_TOKEN", - ] { - if let Ok(value) = std::env::var(env_key) { - cmd = cmd.env(env_key, value); - } - } - cmd -} diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 10b2db6396..ac4ebd0d1e 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -226,12 +226,12 @@ impl LocalEnv { } } - pub fn pageserver_bin(&self) -> anyhow::Result { - Ok(self.neon_distrib_dir.join("pageserver")) + pub fn pageserver_bin(&self) -> PathBuf { + self.neon_distrib_dir.join("pageserver") } - pub fn safekeeper_bin(&self) -> anyhow::Result { - Ok(self.neon_distrib_dir.join("safekeeper")) + pub fn safekeeper_bin(&self) -> PathBuf { + self.neon_distrib_dir.join("safekeeper") } pub fn pg_data_dirs_path(&self) -> PathBuf { diff --git a/control_plane/src/storage.rs b/control_plane/src/pageserver.rs similarity index 80% rename from control_plane/src/storage.rs rename to control_plane/src/pageserver.rs index 4b705690f0..fa6d1e496a 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/pageserver.rs @@ -1,17 +1,13 @@ use std::collections::HashMap; -use std::fs::File; +use std::fs::{self, File}; use std::io::{BufReader, Write}; use std::num::NonZeroU64; use std::path::{Path, PathBuf}; -use std::process::Command; -use std::time::Duration; -use std::{io, result, thread}; +use std::process::Child; +use std::{io, result}; use crate::connection::PgConnectionConfig; use anyhow::{bail, Context}; -use nix::errno::Errno; -use nix::sys::signal::{kill, Signal}; -use nix::unistd::Pid; use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo, }; @@ -25,8 +21,7 @@ use utils::{ postgres_backend::AuthType, }; -use crate::local_env::LocalEnv; -use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile}; +use crate::{background_process, local_env::LocalEnv}; #[derive(Error, Debug)] pub enum PageserverHttpError { @@ -160,7 +155,15 @@ impl PageServerNode { init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'"); } - self.start_node(&init_config_overrides, 
&self.env.base_data_dir, true)?; + let mut pageserver_process = self + .start_node(&init_config_overrides, &self.env.base_data_dir, true) + .with_context(|| { + format!( + "Failed to start a process for pageserver {}", + self.env.pageserver.id, + ) + })?; + let init_result = self .try_init_timeline(create_tenant, initial_timeline_id, pg_version) .context("Failed to create initial tenant and timeline for pageserver"); @@ -170,7 +173,29 @@ impl PageServerNode { } Err(e) => eprintln!("{e:#}"), } - self.stop(false)?; + match pageserver_process.kill() { + Err(e) => { + eprintln!( + "Failed to stop pageserver {} process with pid {}: {e:#}", + self.env.pageserver.id, + pageserver_process.id(), + ) + } + Ok(()) => { + println!( + "Stopped pageserver {} process with pid {}", + self.env.pageserver.id, + pageserver_process.id(), + ); + // cleanup after pageserver startup, since we do not call regular `stop_process` during init + let pid_file = self.pid_file(); + if let Err(e) = fs::remove_file(&pid_file) { + if e.kind() != io::ErrorKind::NotFound { + eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}"); + } + } + } + } init_result } @@ -195,11 +220,14 @@ impl PageServerNode { self.env.pageserver_data_dir() } - pub fn pid_file(&self) -> PathBuf { + /// The pid file is created by the pageserver process, with its pid stored inside. + /// Other pageservers cannot lock the same file and overwrite it for as long as the current + /// pageserver runs. (Unless someone removes the file manually; never do that!) + fn pid_file(&self) -> PathBuf { self.repo_path().join("pageserver.pid") } - pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> { + pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result { self.start_node(config_overrides, &self.repo_path(), false) } @@ -208,7 +236,7 @@ impl PageServerNode { config_overrides: &[&str], datadir: &Path, update_config: bool, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { println!( "Starting pageserver at '{}' in '{}'", self.pg_connection_config.raw_address(), @@ -219,10 +247,7 @@ impl PageServerNode { let mut args = vec![ "-D", datadir.to_str().with_context(|| { - format!( - "Datadir path '{}' cannot be represented as a unicode string", - datadir.display() - ) + format!("Datadir path {datadir:?} cannot be represented as a unicode string") })?, ]; @@ -234,48 +259,18 @@ impl PageServerNode { args.extend(["-c", config_override]); } - let mut cmd = Command::new(self.env.pageserver_bin()?); - let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize")); - filled_cmd = fill_aws_secrets_vars(filled_cmd); - - if !filled_cmd.status()?.success() { - bail!( - "Pageserver failed to start. See console output and '{}' for details.", - datadir.join("pageserver.log").display() - ); - } - - // It takes a while for the page server to start up. Wait until it is - // open for business. 
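The wait loop being deleted here (and its twin in safekeeper.rs below) is the poll-until-ready pattern that background_process::start_process now implements in one place. A generic sketch of that pattern, illustrative only and not taken from the patch:

    use std::{thread, time::Duration};

    // Poll a readiness check with bounded retries; give up if it never succeeds.
    fn wait_until_ready<F>(mut ready: F, retries: u32, pause: Duration) -> anyhow::Result<()>
    where
        F: FnMut() -> anyhow::Result<bool>,
    {
        for attempt in 0..retries {
            if ready()? {
                return Ok(());
            }
            println!("not ready yet, retrying ({attempt})...");
            thread::sleep(pause);
        }
        anyhow::bail!("service did not become ready in {retries} attempts")
    }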
- const RETRIES: i8 = 15; - for retries in 1..RETRIES { - match self.check_status() { - Ok(()) => { - println!("\nPageserver started"); - return Ok(()); - } - Err(err) => { - match err { - PageserverHttpError::Transport(err) => { - if err.is_connect() && retries < 5 { - print!("."); - io::stdout().flush().unwrap(); - } else { - if retries == 5 { - println!() // put a line break after dots for second message - } - println!("Pageserver not responding yet, err {err} retrying ({retries})..."); - } - } - PageserverHttpError::Response(msg) => { - bail!("pageserver failed to start: {msg} ") - } - } - thread::sleep(Duration::from_secs(1)); - } - } - } - bail!("pageserver failed to start in {RETRIES} seconds"); + background_process::start_process( + "pageserver", + datadir, + &self.env.pageserver_bin(), + &args, + background_process::InitialPidFile::Expect(&self.pid_file()), + || match self.check_status() { + Ok(()) => Ok(true), + Err(PageserverHttpError::Transport(_)) => Ok(false), + Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + }, + ) } /// @@ -287,58 +282,7 @@ impl PageServerNode { /// If the server is not running, returns success /// pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { - let pid_file = self.pid_file(); - if !pid_file.exists() { - println!("Pageserver is already stopped"); - return Ok(()); - } - let pid = Pid::from_raw(read_pidfile(&pid_file)?); - - let sig = if immediate { - print!("Stopping pageserver immediately.."); - Signal::SIGQUIT - } else { - print!("Stopping pageserver gracefully.."); - Signal::SIGTERM - }; - io::stdout().flush().unwrap(); - match kill(pid, sig) { - Ok(_) => (), - Err(Errno::ESRCH) => { - println!("Pageserver with pid {pid} does not exist, but a PID file was found"); - return Ok(()); - } - Err(err) => bail!( - "Failed to send signal to pageserver with pid {pid}: {}", - err.desc() - ), - } - - // Wait until process is gone - for i in 0..600 { - let signal = None; // Send no signal, just get the error code - match kill(pid, signal) { - Ok(_) => (), // Process exists, keep waiting - Err(Errno::ESRCH) => { - // Process not found, we're done - println!("done!"); - return Ok(()); - } - Err(err) => bail!( - "Failed to send signal to pageserver with pid {}: {}", - pid, - err.desc() - ), - }; - - if i % 10 == 0 { - print!("."); - io::stdout().flush().unwrap(); - } - thread::sleep(Duration::from_millis(100)); - } - - bail!("Failed to stop pageserver with pid {pid}"); + background_process::stop_process(immediate, "pageserver", &self.pid_file()) } pub fn page_server_psql(&self, sql: &str) -> Vec { diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 91cedeca23..0bc35b3680 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -1,23 +1,21 @@ use std::io::Write; use std::path::PathBuf; -use std::process::Command; +use std::process::Child; use std::sync::Arc; -use std::time::Duration; -use std::{io, result, thread}; +use std::{io, result}; -use anyhow::bail; -use nix::errno::Errno; -use nix::sys::signal::{kill, Signal}; -use nix::unistd::Pid; +use anyhow::Context; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; use utils::{http::error::HttpErrorBody, id::NodeId}; use crate::connection::PgConnectionConfig; -use crate::local_env::{LocalEnv, SafekeeperConf}; -use crate::storage::PageServerNode; -use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile}; +use crate::pageserver::PageServerNode; +use 
crate::{ + background_process, + local_env::{LocalEnv, SafekeeperConf}, +}; #[derive(Error, Debug)] pub enum SafekeeperHttpError { @@ -95,7 +93,7 @@ impl SafekeeperNode { } pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { - env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref()) + env.safekeeper_data_dir(&format!("sk{sk_id}")) } pub fn datadir_path(&self) -> PathBuf { @@ -106,7 +104,7 @@ impl SafekeeperNode { self.datadir_path().join("safekeeper.pid") } - pub fn start(&self) -> anyhow::Result<()> { + pub fn start(&self) -> anyhow::Result { print!( "Starting safekeeper at '{}' in '{}'", self.pg_connection_config.raw_address(), @@ -116,81 +114,68 @@ impl SafekeeperNode { let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port); let listen_http = format!("127.0.0.1:{}", self.conf.http_port); + let id = self.id; + let datadir = self.datadir_path(); - let mut cmd = Command::new(self.env.safekeeper_bin()?); - fill_rust_env_vars( - cmd.args(&["-D", self.datadir_path().to_str().unwrap()]) - .args(&["--id", self.id.to_string().as_ref()]) - .args(&["--listen-pg", &listen_pg]) - .args(&["--listen-http", &listen_http]) - .arg("--daemonize"), - ); + let id_string = id.to_string(); + let mut args = vec![ + "-D", + datadir.to_str().with_context(|| { + format!("Datadir path {datadir:?} cannot be represented as a unicode string") + })?, + "--id", + &id_string, + "--listen-pg", + &listen_pg, + "--listen-http", + &listen_http, + ]; if !self.conf.sync { - cmd.arg("--no-sync"); + args.push("--no-sync"); } let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints(); if !comma_separated_endpoints.is_empty() { - cmd.args(&["--broker-endpoints", &comma_separated_endpoints]); + args.extend(["--broker-endpoints", &comma_separated_endpoints]); } if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() { - cmd.args(&["--broker-etcd-prefix", prefix]); + args.extend(["--broker-etcd-prefix", prefix]); } + + let mut backup_threads = String::new(); if let Some(threads) = self.conf.backup_threads { - cmd.args(&["--backup-threads", threads.to_string().as_ref()]); + backup_threads = threads.to_string(); + args.extend(["--backup-threads", &backup_threads]); + } else { + drop(backup_threads); } + if let Some(ref remote_storage) = self.conf.remote_storage { - cmd.args(&["--remote-storage", remote_storage]); + args.extend(["--remote-storage", remote_storage]); } + + let key_path = self.env.base_data_dir.join("auth_public_key.pem"); if self.conf.auth_enabled { - cmd.arg("--auth-validation-public-key-path"); - // PathBuf is better be passed as is, not via `String`. - cmd.arg(self.env.base_data_dir.join("auth_public_key.pem")); + args.extend([ + "--auth-validation-public-key-path", + key_path.to_str().with_context(|| { + format!("Key path {key_path:?} cannot be represented as a unicode string") + })?, + ]); } - fill_aws_secrets_vars(&mut cmd); - - if !cmd.status()?.success() { - bail!( - "Safekeeper failed to start. See '{}' for details.", - self.datadir_path().join("safekeeper.log").display() - ); - } - - // It takes a while for the safekeeper to start up. Wait until it is - // open for business. 
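For the shape of a call to the new helper from inside control_plane, here is a hedged sketch with an invented service name and binary path; the real pageserver call site appears just above and the safekeeper one just below. A process that writes and locks its own pid file gets InitialPidFile::Expect, one that does not gets InitialPidFile::Create, and the closure is polled until it reports the service healthy:

    use std::path::Path;
    use std::process::Child;

    use crate::background_process::{self, InitialPidFile};

    // Hypothetical call site; "my-service" and its binary path are invented.
    fn start_my_service(datadir: &Path) -> anyhow::Result<Child> {
        let pid_file = datadir.join("my-service.pid");
        background_process::start_process(
            "my-service",
            datadir,
            Path::new("/usr/local/bin/my-service"),
            &["-D", datadir.to_str().unwrap()],
            // The service is expected to create (and lock) the pid file itself.
            InitialPidFile::Expect(&pid_file),
            // Replace with a real health check, e.g. an HTTP status request.
            || Ok(true),
        )
    }

InitialPidFile::Create is the choice for third-party binaries such as etcd, which do not follow the pid-file convention.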
- const RETRIES: i8 = 15; - for retries in 1..RETRIES { - match self.check_status() { - Ok(_) => { - println!("\nSafekeeper started"); - return Ok(()); - } - Err(err) => { - match err { - SafekeeperHttpError::Transport(err) => { - if err.is_connect() && retries < 5 { - print!("."); - io::stdout().flush().unwrap(); - } else { - if retries == 5 { - println!() // put a line break after dots for second message - } - println!( - "Safekeeper not responding yet, err {} retrying ({})...", - err, retries - ); - } - } - SafekeeperHttpError::Response(msg) => { - bail!("safekeeper failed to start: {} ", msg) - } - } - thread::sleep(Duration::from_secs(1)); - } - } - } - bail!("safekeeper failed to start in {} seconds", RETRIES); + background_process::start_process( + &format!("safekeeper {id}"), + &datadir, + &self.env.safekeeper_bin(), + &args, + background_process::InitialPidFile::Expect(&self.pid_file()), + || match self.check_status() { + Ok(()) => Ok(true), + Err(SafekeeperHttpError::Transport(_)) => Ok(false), + Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + }, + ) } /// @@ -202,63 +187,11 @@ impl SafekeeperNode { /// If the server is not running, returns success /// pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { - let pid_file = self.pid_file(); - if !pid_file.exists() { - println!("Safekeeper {} is already stopped", self.id); - return Ok(()); - } - let pid = read_pidfile(&pid_file)?; - let pid = Pid::from_raw(pid); - - let sig = if immediate { - print!("Stopping safekeeper {} immediately..", self.id); - Signal::SIGQUIT - } else { - print!("Stopping safekeeper {} gracefully..", self.id); - Signal::SIGTERM - }; - io::stdout().flush().unwrap(); - match kill(pid, sig) { - Ok(_) => (), - Err(Errno::ESRCH) => { - println!( - "Safekeeper with pid {} does not exist, but a PID file was found", - pid - ); - return Ok(()); - } - Err(err) => bail!( - "Failed to send signal to safekeeper with pid {}: {}", - pid, - err.desc() - ), - } - - // Wait until process is gone - for i in 0..600 { - let signal = None; // Send no signal, just get the error code - match kill(pid, signal) { - Ok(_) => (), // Process exists, keep waiting - Err(Errno::ESRCH) => { - // Process not found, we're done - println!("done!"); - return Ok(()); - } - Err(err) => bail!( - "Failed to send signal to pageserver with pid {}: {}", - pid, - err.desc() - ), - }; - - if i % 10 == 0 { - print!("."); - io::stdout().flush().unwrap(); - } - thread::sleep(Duration::from_millis(100)); - } - - bail!("Failed to stop safekeeper with pid {}", pid); + background_process::stop_process( + immediate, + &format!("safekeeper {}", self.id), + &self.pid_file(), + ) } fn http_request(&self, method: Method, url: U) -> RequestBuilder { diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index aff86c8076..6f51465609 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -36,6 +36,8 @@ pub mod sock_split; // common log initialisation routine pub mod logging; +pub mod lock_file; + // Misc pub mod accum; pub mod shutdown; diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs new file mode 100644 index 0000000000..4fef65852b --- /dev/null +++ b/libs/utils/src/lock_file.rs @@ -0,0 +1,81 @@ +//! A module to create and read lock files. A lock file ensures that only one +//! process is running at a time, in a particular directory. +//! +//! File locking is done using [`fcntl::flock`], which means that holding the +//! lock on file only prevents acquiring another lock on it; all other +//! 
operations are still possible on files. Other process can still open, read, +//! write, or remove the file, for example. +//! If the file is removed while a process is holding a lock on it, +//! the process that holds the lock does not get any error or notification. +//! Furthermore, you can create a new file with the same name and lock the new file, +//! while the old process is still running. +//! Deleting the lock file while the locking process is still running is a bad idea! + +use std::{fs, os::unix::prelude::AsRawFd, path::Path}; + +use anyhow::Context; +use nix::fcntl; + +use crate::crashsafe; + +pub enum LockCreationResult { + Created { + new_lock_contents: String, + file: fs::File, + }, + AlreadyLocked { + existing_lock_contents: String, + }, + CreationFailed(anyhow::Error), +} + +/// Creates a lock file in the path given and writes the given contents into the file. +/// Note: The lock is automatically released when the file closed. You might want to use Box::leak to make sure it lives until the end of the program. +pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult { + let lock_file = match fs::OpenOptions::new() + .create(true) // O_CREAT + .write(true) + .open(lock_file_path) + .context("Failed to open lock file") + { + Ok(file) => file, + Err(e) => return LockCreationResult::CreationFailed(e), + }; + + match fcntl::flock( + lock_file.as_raw_fd(), + fcntl::FlockArg::LockExclusiveNonblock, + ) { + Ok(()) => { + match lock_file + .set_len(0) + .context("Failed to truncate lockfile") + .and_then(|()| { + fs::write(lock_file_path, &contents).with_context(|| { + format!("Failed to write '{contents}' contents into lockfile") + }) + }) + .and_then(|()| { + crashsafe::fsync_file_and_parent(lock_file_path) + .context("Failed to fsync lockfile") + }) { + Ok(()) => LockCreationResult::Created { + new_lock_contents: contents, + file: lock_file, + }, + Err(e) => LockCreationResult::CreationFailed(e), + } + } + Err(nix::errno::Errno::EAGAIN) => { + match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") { + Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked { + existing_lock_contents, + }, + Err(e) => LockCreationResult::CreationFailed(e), + } + } + Err(e) => { + LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}")) + } + } +} diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 31c0e02f98..3b1a1f5aff 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,10 +1,6 @@ -use std::{ - fs::{File, OpenOptions}, - path::Path, - str::FromStr, -}; +use std::str::FromStr; -use anyhow::{Context, Result}; +use anyhow::Context; use strum_macros::{EnumString, EnumVariantNames}; #[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)] @@ -25,19 +21,8 @@ impl LogFormat { }) } } -pub fn init( - log_filename: impl AsRef, - daemonize: bool, - log_format: LogFormat, -) -> Result { - // Don't open the same file for output multiple times; - // the different fds could overwrite each other's output. 
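The storage binaries below use the new lock-file helper at startup roughly as follows (a sketch; my_daemon_main and the file name are placeholders): write the current pid as the lock contents, refuse to start if another instance already holds the lock, and leak the file handle so the flock is held until the process exits:

    use nix::unistd::Pid;
    use utils::lock_file;

    fn my_daemon_main(workdir: &std::path::Path) -> anyhow::Result<()> {
        let lock_path = workdir.join("my-daemon.pid");
        match lock_file::create_lock_file(&lock_path, Pid::this().to_string()) {
            lock_file::LockCreationResult::Created { file, .. } => {
                // Keep the file (and therefore the flock) alive for the whole process lifetime.
                Box::leak(Box::new(file));
            }
            lock_file::LockCreationResult::AlreadyLocked { existing_lock_contents } => {
                anyhow::bail!("already running with PID {existing_lock_contents}");
            }
            lock_file::LockCreationResult::CreationFailed(e) => {
                return Err(e.context(format!("failed to create lock file at {lock_path:?}")));
            }
        }
        // ... run the service ...
        Ok(())
    }

This mirrors what start_pageserver and start_safekeeper do further down in this patch.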
- let log_file = OpenOptions::new() - .create(true) - .append(true) - .open(&log_filename) - .with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?; +pub fn init(log_format: LogFormat) -> anyhow::Result<()> { let default_filter_str = "info"; // We fall back to printing all spans at info-level or above if @@ -45,50 +30,16 @@ pub fn init( let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str)); - let x: File = log_file.try_clone().unwrap(); let base_logger = tracing_subscriber::fmt() .with_env_filter(env_filter) .with_target(false) .with_ansi(false) - .with_writer(move || -> Box { - // we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it - // if we do not use daemonization (e.g. in docker) it is better to log to stdout directly - // for example to be in line with docker log command which expects logs comimg from stdout - if daemonize { - Box::new(x.try_clone().unwrap()) - } else { - Box::new(std::io::stdout()) - } - }); + .with_writer(std::io::stdout); match log_format { LogFormat::Json => base_logger.json().init(), LogFormat::Plain => base_logger.init(), } - Ok(log_file) -} - -// #[cfg(test)] -// Due to global logger, can't run tests in same process. -// So until there's a non-global one, the tests are in ../tests/ as separate files. -#[macro_export(local_inner_macros)] -macro_rules! test_init_file_logger { - ($log_level:expr, $log_format:expr) => {{ - use std::str::FromStr; - std::env::set_var("RUST_LOG", $log_level); - - let tmp_dir = tempfile::TempDir::new().unwrap(); - let log_file_path = tmp_dir.path().join("logfile"); - - let log_format = $crate::logging::LogFormat::from_str($log_format).unwrap(); - let _log_file = $crate::logging::init(&log_file_path, true, log_format).unwrap(); - - let log_file = std::fs::OpenOptions::new() - .read(true) - .open(&log_file_path) - .unwrap(); - - log_file - }}; + Ok(()) } diff --git a/libs/utils/tests/logger_json_test.rs b/libs/utils/tests/logger_json_test.rs deleted file mode 100644 index 5d63b9b004..0000000000 --- a/libs/utils/tests/logger_json_test.rs +++ /dev/null @@ -1,36 +0,0 @@ -// This could be in ../src/logging.rs but since the logger is global, these -// can't be run in threads of the same process -use std::fs::File; -use std::io::{BufRead, BufReader, Lines}; -use tracing::*; -use utils::test_init_file_logger; - -fn read_lines(file: File) -> Lines> { - BufReader::new(file).lines() -} - -#[test] -fn test_json_format_has_message_and_custom_field() { - std::env::set_var("RUST_LOG", "info"); - - let log_file = test_init_file_logger!("info", "json"); - - let custom_field: &str = "hi"; - trace!(custom = %custom_field, "test log message"); - debug!(custom = %custom_field, "test log message"); - info!(custom = %custom_field, "test log message"); - warn!(custom = %custom_field, "test log message"); - error!(custom = %custom_field, "test log message"); - - let lines = read_lines(log_file); - for line in lines { - let content = line.unwrap(); - let json_object = serde_json::from_str::(&content).unwrap(); - - assert_eq!(json_object["fields"]["custom"], "hi"); - assert_eq!(json_object["fields"]["message"], "test log message"); - - assert_ne!(json_object["level"], "TRACE"); - assert_ne!(json_object["level"], "DEBUG"); - } -} diff --git a/libs/utils/tests/logger_plain_test.rs b/libs/utils/tests/logger_plain_test.rs deleted file mode 100644 index bc5abf45dd..0000000000 --- 
a/libs/utils/tests/logger_plain_test.rs +++ /dev/null @@ -1,36 +0,0 @@ -// This could be in ../src/logging.rs but since the logger is global, these -// can't be run in threads of the same process -use std::fs::File; -use std::io::{BufRead, BufReader, Lines}; -use tracing::*; -use utils::test_init_file_logger; - -fn read_lines(file: File) -> Lines> { - BufReader::new(file).lines() -} - -#[test] -fn test_plain_format_has_message_and_custom_field() { - std::env::set_var("RUST_LOG", "warn"); - - let log_file = test_init_file_logger!("warn", "plain"); - - let custom_field: &str = "hi"; - trace!(custom = %custom_field, "test log message"); - debug!(custom = %custom_field, "test log message"); - info!(custom = %custom_field, "test log message"); - warn!(custom = %custom_field, "test log message"); - error!(custom = %custom_field, "test log message"); - - let lines = read_lines(log_file); - for line in lines { - let content = line.unwrap(); - serde_json::from_str::(&content).unwrap_err(); - assert!(content.contains("custom=hi")); - assert!(content.contains("test log message")); - - assert!(!content.contains("TRACE")); - assert!(!content.contains("DEBUG")); - assert!(!content.contains("INFO")); - } -} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index b075b86aa1..4262ca9820 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -24,7 +24,6 @@ hex = "0.4.3" hyper = "0.14" itertools = "0.10.3" clap = { version = "4.0", features = ["string"] } -daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } tokio-util = { version = "0.7.3", features = ["io", "io-util"] } postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 802352be90..62119b51c6 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,17 +1,14 @@ //! Main entry point for the Page Server executable. 
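With daemonization gone, logging setup reduces to a single call that writes to stdout and honors RUST_LOG (falling back to info-level). A minimal caller sketch, assuming the utils crate is a dependency:

    use utils::logging::{self, LogFormat};

    fn main() -> anyhow::Result<()> {
        // Honors RUST_LOG if set; otherwise logs at info level and above, to stdout.
        logging::init(LogFormat::Plain)?;
        tracing::info!("service starting");
        Ok(())
    }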
-use remote_storage::GenericRemoteStorage; use std::{env, ops::ControlFlow, path::Path, str::FromStr}; + +use anyhow::{anyhow, Context}; +use clap::{Arg, ArgAction, Command}; +use fail::FailScenario; +use nix::unistd::Pid; use tracing::*; -use anyhow::{anyhow, bail, Context, Result}; - -use clap::{Arg, ArgAction, Command}; -use daemonize::Daemonize; - -use fail::FailScenario; use metrics::set_build_info_metric; - use pageserver::{ config::{defaults::*, PageServerConf}, http, page_cache, page_service, profiling, task_mgr, @@ -19,20 +16,22 @@ use pageserver::{ task_mgr::{ BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, }, - tenant_mgr, virtual_file, LOG_FILE_NAME, + tenant_mgr, virtual_file, }; +use remote_storage::GenericRemoteStorage; use utils::{ auth::JwtAuth, - logging, + lock_file, logging, postgres_backend::AuthType, project_git_version, - shutdown::exit_now, signals::{self, Signal}, tcp_listener, }; project_git_version!(GIT_VERSION); +const PID_FILE_NAME: &str = "pageserver.pid"; + const FEATURES: &[&str] = &[ #[cfg(feature = "testing")] "testing", @@ -65,6 +64,7 @@ fn main() -> anyhow::Result<()> { let workdir = workdir .canonicalize() .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?; + let cfg_file_path = workdir.join("pageserver.toml"); // Set CWD to workdir for non-daemon modes @@ -75,8 +75,6 @@ fn main() -> anyhow::Result<()> { ) })?; - let daemonize = arg_matches.get_flag("daemonize"); - let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? { ControlFlow::Continue(conf) => conf, ControlFlow::Break(()) => { @@ -102,7 +100,7 @@ fn main() -> anyhow::Result<()> { virtual_file::init(conf.max_file_descriptors); page_cache::init(conf.page_cache_size); - start_pageserver(conf, daemonize).context("Failed to start pageserver")?; + start_pageserver(conf).context("Failed to start pageserver")?; scenario.teardown(); Ok(()) @@ -197,12 +195,34 @@ fn initialize_config( }) } -fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> { - // Initialize logger - let log_file = logging::init(LOG_FILE_NAME, daemonize, conf.log_format)?; - +fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { + logging::init(conf.log_format)?; info!("version: {}", version()); + let lock_file_path = conf.workdir.join(PID_FILE_NAME); + let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) { + lock_file::LockCreationResult::Created { + new_lock_contents, + file, + } => { + info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}"); + file + } + lock_file::LockCreationResult::AlreadyLocked { + existing_lock_contents, + } => anyhow::bail!( + "Could not lock pid file; pageserver is already running in {:?} with PID {}", + conf.workdir, + existing_lock_contents + ), + lock_file::LockCreationResult::CreationFailed(e) => { + return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}"))) + } + }; + // ensure that the lock file is held even if the main thread of the process is panics + // we need to release the lock file only when the current process is gone + let _ = Box::leak(Box::new(lock_file)); + // TODO: Check that it looks like a valid repository before going further // bind sockets before daemonizing so we report errors early and do not return until we are listening @@ -218,33 +238,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() ); let pageserver_listener = 
tcp_listener::bind(conf.listen_pg_addr.clone())?; - // NB: Don't spawn any threads before daemonizing! - if daemonize { - info!("daemonizing..."); - - // There shouldn't be any logging to stdin/stdout. Redirect it to the main log so - // that we will see any accidental manual fprintf's or backtraces. - let stdout = log_file - .try_clone() - .with_context(|| format!("Failed to clone log file '{:?}'", log_file))?; - let stderr = log_file; - - let daemonize = Daemonize::new() - .pid_file("pageserver.pid") - .working_directory(".") - .stdout(stdout) - .stderr(stderr); - - // XXX: The parent process should exit abruptly right after - // it has spawned a child to prevent coverage machinery from - // dumping stats into a `profraw` file now owned by the child. - // Otherwise, the coverage data will be damaged. - match daemonize.exit_action(|| exit_now(0)).start() { - Ok(_) => info!("Success, daemonized"), - Err(err) => bail!("{err}. could not daemonize. bailing."), - } - } - let signals = signals::install_shutdown_handlers()?; // start profiler (if enabled) @@ -347,14 +340,6 @@ fn cli() -> Command { Command::new("Neon page server") .about("Materializes WAL stream to pages and serves them to the postgres") .version(version()) - .arg( - - Arg::new("daemonize") - .short('d') - .long("daemonize") - .action(ArgAction::SetTrue) - .help("Run in the background"), - ) .arg( Arg::new("init") .long("init") diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 52a4cb0381..11be649e9f 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -43,8 +43,6 @@ pub const DEFAULT_PG_VERSION: u32 = 14; pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; pub const DELTA_FILE_MAGIC: u16 = 0x5A61; -pub const LOG_FILE_NAME: &str = "pageserver.log"; - static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); /// Config for the Repository checkpointer @@ -81,7 +79,6 @@ pub async fn shutdown_pageserver(exit_code: i32) { // There should be nothing left, but let's be sure task_mgr::shutdown_tasks(None, None, None).await; - info!("Shut down successfully completed"); std::process::exit(exit_code); } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 39dccf2eba..e21ec4d742 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -671,10 +671,6 @@ impl PostgresRedoProcess { // The Rust standard library makes sure to mark any file descriptors with // as close-on-exec by default, but that's not enough, since we use // libraries that directly call libc open without setting that flag. - // - // One example is the pidfile of the daemonize library, which doesn't - // currently mark file descriptors as close-on-exec. Either way, we - // want to be on the safe side and prevent accidental regression. 
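On the Box::leak pattern used in start_pageserver above (and in the safekeeper below): leaking the lock file promotes it to a 'static borrow, so the descriptor, and with it the flock, is released only at process exit, never by an early drop or an unwinding panic. A minimal illustration:

    use std::fs::File;

    // Leak a guard so it is never dropped; the OS reclaims the descriptor at process exit.
    fn hold_for_process_lifetime(lock_file: File) -> &'static File {
        Box::leak(Box::new(lock_file))
    }

The cost is a deliberate, bounded leak of a single file handle per process.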
.close_fds() .spawn() .map_err(|e| { diff --git a/poetry.lock b/poetry.lock index dfcb16107f..fdfe88acf1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1568,7 +1568,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "17cdbfe90f1b06dffaf24c3e076384ec08dd4a2dce5a05e50565f7364932eb2d" +content-hash = "9352a89d49d34807f6a58f6c3f898acbd8cf3570e0f45ede973673644bde4d0e" [metadata.files] aiopg = [ @@ -1978,6 +1978,7 @@ prometheus-client = [ psycopg2-binary = [ {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, @@ -2011,6 +2012,7 @@ psycopg2-binary = [ {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, @@ -2022,6 +2024,7 @@ psycopg2-binary = [ {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, @@ -2038,18 +2041,7 @@ py = [ {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] pyasn1 = [ - {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, - {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, - {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, - {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, - {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, - {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, - {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, - {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, - {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, - {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] pycodestyle = [ @@ -2159,6 +2151,13 @@ pyyaml = [ {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, + {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, + {file = 
"PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 64c541ddef..0c0ca2ff9f 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -12,7 +12,7 @@ fs2 = "0.4.3" serde_json = "1" tracing = "0.1.27" clap = "4.0" -daemonize = "0.4.1" +nix = "0.25" tokio = { version = "1.17", features = ["macros", "fs"] } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 67c2c62f73..42f8188d6a 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -4,8 +4,7 @@ use anyhow::{bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, Command}; use const_format::formatcp; -use daemonize::Daemonize; -use fs2::FileExt; +use nix::unistd::Pid; use remote_storage::RemoteStorageConfig; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; @@ -16,6 +15,7 @@ use tokio::sync::mpsc; use toml_edit::Document; use tracing::*; use url::{ParseError, Url}; +use utils::lock_file; use metrics::set_build_info_metric; use safekeeper::broker; @@ -35,12 +35,10 @@ use utils::{ http::endpoint, id::NodeId, logging::{self, LogFormat}, - project_git_version, - shutdown::exit_now, - signals, tcp_listener, + project_git_version, signals, tcp_listener, }; -const LOCK_FILE_NAME: &str = "safekeeper.lock"; +const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; project_git_version!(GIT_VERSION); @@ -65,10 +63,6 @@ fn main() -> anyhow::Result<()> { conf.no_sync = true; } - if arg_matches.get_flag("daemonize") { - conf.daemonize = true; - } - if let Some(addr) = arg_matches.get_one::("listen-pg") { conf.listen_pg_addr = addr.to_string(); } @@ -143,19 +137,33 @@ fn main() -> anyhow::Result<()> { } fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { - let log_file = logging::init("safekeeper.log", conf.daemonize, conf.log_format)?; - + logging::init(conf.log_format)?; info!("version: {GIT_VERSION}"); // Prevent running multiple safekeepers on the same directory - let lock_file_path = conf.workdir.join(LOCK_FILE_NAME); - let lock_file = File::create(&lock_file_path).context("failed to open lockfile")?; - lock_file.try_lock_exclusive().with_context(|| { - format!( - "control file {} is locked by some other process", - lock_file_path.display() - ) - })?; + let lock_file_path = conf.workdir.join(PID_FILE_NAME); + let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) { + lock_file::LockCreationResult::Created { + new_lock_contents, + file, + } => { + info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}"); + file + } + lock_file::LockCreationResult::AlreadyLocked { + existing_lock_contents, + 
} => anyhow::bail!( + "Could not lock pid file; safekeeper is already running in {:?} with PID {}", + conf.workdir, + existing_lock_contents + ), + lock_file::LockCreationResult::CreationFailed(e) => { + return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}"))) + } + }; + // ensure that the lock file is held even if the main thread of the process is panics + // we need to release the lock file only when the current process is gone + let _ = Box::leak(Box::new(lock_file)); // Set or read our ID. set_id(&mut conf, given_id)?; @@ -187,31 +195,6 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo } }; - // XXX: Don't spawn any threads before daemonizing! - if conf.daemonize { - info!("daemonizing..."); - - // There should'n be any logging to stdin/stdout. Redirect it to the main log so - // that we will see any accidental manual fprintf's or backtraces. - let stdout = log_file.try_clone().unwrap(); - let stderr = log_file; - - let daemonize = Daemonize::new() - .pid_file("safekeeper.pid") - .working_directory(Path::new(".")) - .stdout(stdout) - .stderr(stderr); - - // XXX: The parent process should exit abruptly right after - // it has spawned a child to prevent coverage machinery from - // dumping stats into a `profraw` file now owned by the child. - // Otherwise, the coverage data will be damaged. - match daemonize.exit_action(|| exit_now(0)).start() { - Ok(_) => info!("Success, daemonized"), - Err(err) => bail!("Error: {err}. could not daemonize. bailing."), - } - } - // Register metrics collector for active timelines. It's important to do this // after daemonizing, otherwise process collector will be upset. let timeline_collector = safekeeper::metrics::TimelineCollector::new(); @@ -384,13 +367,6 @@ fn cli() -> Command { .short('p') .long("pageserver"), ) - .arg( - Arg::new("daemonize") - .short('d') - .long("daemonize") - .action(ArgAction::SetTrue) - .help("Run in the background"), - ) .arg( Arg::new("no-sync") .short('n') diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index c3b8227e17..395a29c9ed 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -54,7 +54,6 @@ pub struct SafeKeeperConf { // data directories to avoid clashing with each other. pub workdir: PathBuf, - pub daemonize: bool, pub no_sync: bool, pub listen_pg_addr: String, pub listen_http_addr: String, @@ -88,7 +87,6 @@ impl Default for SafeKeeperConf { // command line, so that when the server is running, all paths are relative // to that. 
workdir: PathBuf::from("./"), - daemonize: false, no_sync: false, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e7e0e4ce56..b62c80824a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -19,7 +19,7 @@ from dataclasses import dataclass, field from enum import Flag, auto from functools import cached_property from pathlib import Path -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, TypeVar, Union, cast +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast import asyncpg import backoff # type: ignore @@ -36,7 +36,7 @@ from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import make_dsn, parse_dsn from typing_extensions import Literal -from .utils import allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture +from .utils import Fn, allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture """ This file contains pytest fixtures. A fixture is a test resource that can be @@ -56,7 +56,6 @@ put directly-importable functions into utils.py or another separate file. """ Env = Dict[str, str] -Fn = TypeVar("Fn", bound=Callable[..., Any]) DEFAULT_OUTPUT_DIR = "test_output" DEFAULT_BRANCH_NAME = "main" @@ -965,11 +964,11 @@ def neon_env_builder( yield builder -class NeonPageserverApiException(Exception): +class PageserverApiException(Exception): pass -class NeonPageserverHttpClient(requests.Session): +class PageserverHttpClient(requests.Session): def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None): super().__init__() self.port = port @@ -987,7 +986,7 @@ class NeonPageserverHttpClient(requests.Session): msg = res.json()["msg"] except: # noqa: E722 msg = "" - raise NeonPageserverApiException(msg) from e + raise PageserverApiException(msg) from e def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() @@ -1624,8 +1623,6 @@ class ComputeCtl(AbstractNeonCli): class NeonPageserver(PgProtocol): """ An object representing a running pageserver. - - Initializes the repository via `neon init`. """ TEMP_FILE_SUFFIX = "___temp" @@ -1674,8 +1671,8 @@ class NeonPageserver(PgProtocol): if '"profiling"' not in self.version: pytest.skip("pageserver was built without 'profiling' feature") - def http_client(self, auth_token: Optional[str] = None) -> NeonPageserverHttpClient: - return NeonPageserverHttpClient( + def http_client(self, auth_token: Optional[str] = None) -> PageserverHttpClient: + return PageserverHttpClient( port=self.service_port.http, auth_token=auth_token, is_testing_enabled_or_skip=self.is_testing_enabled_or_skip, @@ -2260,11 +2257,6 @@ class PostgresFactory: return self -def read_pid(path: Path) -> int: - """Read content of file into number""" - return int(path.read_text()) - - @dataclass class SafekeeperPort: pg: int @@ -2688,26 +2680,8 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) -def wait_until(number_of_iterations: int, interval: float, func): - """ - Wait until 'func' returns successfully, without exception. Returns the - last return value from the function. 
- """ - last_exception = None - for i in range(number_of_iterations): - try: - res = func() - except Exception as e: - log.info("waiting for %s iteration %s failed", func, i + 1) - last_exception = e - time.sleep(interval) - continue - return res - raise Exception("timed out while waiting for %s" % func) from last_exception - - def assert_no_in_progress_downloads_for_tenant( - pageserver_http_client: NeonPageserverHttpClient, + pageserver_http_client: PageserverHttpClient, tenant: TenantId, ): tenant_status = pageserver_http_client.tenant_status(tenant) @@ -2715,7 +2689,7 @@ def assert_no_in_progress_downloads_for_tenant( def remote_consistent_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId + pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -2730,7 +2704,7 @@ def remote_consistent_lsn( def wait_for_upload( - pageserver_http_client: NeonPageserverHttpClient, + pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId, lsn: Lsn, @@ -2754,7 +2728,7 @@ def wait_for_upload( def last_record_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId + pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -2764,7 +2738,7 @@ def last_record_lsn( def wait_for_last_record_lsn( - pageserver_http_client: NeonPageserverHttpClient, + pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId, lsn: Lsn, diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 5fb91344ad..1242305ec3 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -4,13 +4,16 @@ import re import shutil import subprocess import tarfile +import time from pathlib import Path -from typing import Any, List, Tuple +from typing import Any, Callable, List, Tuple, TypeVar import allure # type: ignore from fixtures.log_helper import log from psycopg2.extensions import cursor +Fn = TypeVar("Fn", bound=Callable[..., Any]) + def get_self_dir() -> str: """Get the path to the directory where this script lives.""" @@ -188,3 +191,57 @@ def allure_attach_from_dir(dir: Path): extension = attachment.suffix.removeprefix(".") allure.attach.file(source, name, attachment_type, extension) + + +def start_in_background( + command: list[str], cwd: Path, log_file_name: str, is_started: Fn +) -> subprocess.Popen[bytes]: + """Starts a process, creates the logfile and redirects stderr and stdout there. 
Runs the start checks before the process is started, or errors.""" + + log.info(f'Running command "{" ".join(command)}"') + + with open(cwd / log_file_name, "wb") as log_file: + spawned_process = subprocess.Popen(command, stdout=log_file, stderr=log_file, cwd=cwd) + error = None + try: + return_code = spawned_process.poll() + if return_code is not None: + error = f"expected subprocess to run but it exited with code {return_code}" + else: + attempts = 10 + try: + wait_until( + number_of_iterations=attempts, + interval=1, + func=is_started, + ) + except Exception: + error = f"Failed to get correct status from subprocess in {attempts} attempts" + except Exception as e: + error = f"expected subprocess to start but it failed with exception: {e}" + + if error is not None: + log.error(error) + spawned_process.kill() + raise Exception(f"Failed to run subprocess as {command}, reason: {error}") + + log.info("subprocess spawned") + return spawned_process + + +def wait_until(number_of_iterations: int, interval: float, func: Fn): + """ + Wait until 'func' returns successfully, without exception. Returns the + last return value from the function. + """ + last_exception = None + for i in range(number_of_iterations): + try: + res = func() + except Exception as e: + log.info("waiting for %s iteration %s failed", func, i + 1) + last_exception = e + time.sleep(interval) + continue + return res + raise Exception("timed out while waiting for %s" % func) from last_exception diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index ce4a8ffa9e..8443aa029f 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -1,7 +1,7 @@ from contextlib import closing import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException +from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException from fixtures.types import TenantId @@ -39,7 +39,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): # fail to create branch using token with different tenant_id with pytest.raises( - NeonPageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied" + PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied" ): invalid_tenant_http_client.timeline_create( tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id @@ -50,7 +50,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): # fail to create tenant using tenant token with pytest.raises( - NeonPageserverApiException, + PageserverApiException, match="Forbidden: Attempt to access management api with tenant scope. 
Permission denied", ): tenant_http_client.tenant_create() diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 0487cd8f2c..1f0940cab7 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -10,7 +10,7 @@ import toml from fixtures.neon_fixtures import ( NeonCli, NeonEnvBuilder, - NeonPageserverHttpClient, + PageserverHttpClient, PgBin, PortDistributor, wait_for_last_record_lsn, @@ -208,7 +208,7 @@ def test_backward_compatibility( timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] pageserver_port = snapshot_config["pageserver"]["listen_http_addr"].split(":")[-1] auth_token = snapshot_config["pageserver"]["auth_token"] - pageserver_http = NeonPageserverHttpClient( + pageserver_http = PageserverHttpClient( port=pageserver_port, is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled auth_token=auth_token, diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index a9dc63dd50..d146f78c3a 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -5,13 +5,13 @@ from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, - NeonPageserverHttpClient, + PageserverHttpClient, ) from fixtures.types import TenantId, TimelineId def helper_compare_timeline_list( - pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv, initial_tenant: TenantId + pageserver_http_client: PageserverHttpClient, env: NeonEnv, initial_tenant: TenantId ): """ Compare timelines list returned by CLI and directly via API. @@ -56,7 +56,7 @@ def test_cli_timeline_list(neon_simple_env: NeonEnv): assert nested_timeline_id in timelines_cli -def helper_compare_tenant_list(pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv): +def helper_compare_tenant_list(pageserver_http_client: PageserverHttpClient, env: NeonEnv): tenants = pageserver_http_client.tenant_list() tenants_api = sorted(map(lambda t: cast(str, t["id"]), tenants)) diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py index 73918ee476..73933021a4 100644 --- a/test_runner/regress/test_normal_work.py +++ b/test_runner/regress/test_normal_work.py @@ -1,9 +1,9 @@ import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PageserverHttpClient -def check_tenant(env: NeonEnv, pageserver_http: NeonPageserverHttpClient): +def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): tenant_id, timeline_id = env.neon_cli.create_tenant() pg = env.postgres.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index bab96cff4f..f5e02af8dd 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -6,12 +6,12 @@ from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, - NeonPageserverHttpClient, + PageserverHttpClient, neon_binpath, pg_distrib_dir, - wait_until, ) from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until # test that we cannot override node id after init @@ -29,8 +29,9 @@ def test_pageserver_init_node_id(neon_simple_env: NeonEnv): stderr=subprocess.PIPE, ) - # remove initial config + 
# remove initial config and stop existing pageserver pageserver_config.unlink() + neon_simple_env.pageserver.stop() bad_init = run_pageserver(["--init", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']) assert ( @@ -60,7 +61,7 @@ def test_pageserver_init_node_id(neon_simple_env: NeonEnv): assert "has node id already, it cannot be overridden" in bad_update.stderr -def check_client(client: NeonPageserverHttpClient, initial_tenant: TenantId): +def check_client(client: PageserverHttpClient, initial_tenant: TenantId): client.check_status() # check initial tenant is there @@ -116,7 +117,7 @@ def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): def expect_updated_msg_lsn( - client: NeonPageserverHttpClient, + client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, prev_msg_lsn: Optional[Lsn], diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 56b14dc42b..4fb5a5406d 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -15,10 +15,9 @@ from fixtures.neon_fixtures import ( available_remote_storages, wait_for_last_record_lsn, wait_for_upload, - wait_until, ) from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, wait_until # diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index a310eac1f7..dc4cd2e37e 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -2,16 +2,12 @@ from threading import Thread import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnvBuilder, - NeonPageserverApiException, - NeonPageserverHttpClient, -) +from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException, PageserverHttpClient from fixtures.types import TenantId, TimelineId def do_gc_target( - pageserver_http: NeonPageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId + pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: @@ -27,7 +23,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # first check for non existing tenant tenant_id = TenantId.generate() with pytest.raises( - expected_exception=NeonPageserverApiException, + expected_exception=PageserverApiException, match=f"Tenant not found for id {tenant_id}", ): pageserver_http.tenant_detach(tenant_id) @@ -49,7 +45,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # gc should not try to even start with pytest.raises( - expected_exception=NeonPageserverApiException, match="gc target timeline does not exist" + expected_exception=PageserverApiException, match="gc target timeline does not exist" ): bogus_timeline_id = TimelineId.generate() pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) @@ -78,6 +74,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() with pytest.raises( - expected_exception=NeonPageserverApiException, match=f"Tenant {tenant_id} not found" + expected_exception=PageserverApiException, match=f"Tenant {tenant_id} not found" ): pageserver_http.timeline_gc(tenant_id, timeline_id, 0) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index e14434ffdc..2c11812a7b 100644 --- 
a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -1,7 +1,5 @@ import os import pathlib -import signal -import subprocess import threading from contextlib import closing, contextmanager from typing import Any, Dict, Optional, Tuple @@ -12,7 +10,7 @@ from fixtures.neon_fixtures import ( Etcd, NeonEnv, NeonEnvBuilder, - NeonPageserverHttpClient, + PageserverHttpClient, PortDistributor, Postgres, assert_no_in_progress_downloads_for_tenant, @@ -21,10 +19,9 @@ from fixtures.neon_fixtures import ( pg_distrib_dir, wait_for_last_record_lsn, wait_for_upload, - wait_until, ) from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import query_scalar, subprocess_capture +from fixtures.utils import query_scalar, start_in_background, subprocess_capture, wait_until def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @@ -32,7 +29,7 @@ def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @contextmanager -def new_pageserver_helper( +def new_pageserver_service( new_pageserver_dir: pathlib.Path, pageserver_bin: pathlib.Path, remote_storage_mock_path: pathlib.Path, @@ -49,7 +46,6 @@ def new_pageserver_helper( str(pageserver_bin), "--workdir", str(new_pageserver_dir), - "--daemonize", "--update-config", f"-c listen_pg_addr='localhost:{pg_port}'", f"-c listen_http_addr='localhost:{http_port}'", @@ -61,16 +57,26 @@ def new_pageserver_helper( cmd.append( f"-c broker_endpoints=['{broker.client_url()}']", ) - - log.info("starting new pageserver %s", cmd) - out = subprocess.check_output(cmd, text=True) - log.info("started new pageserver %s", out) + pageserver_client = PageserverHttpClient( + port=http_port, + auth_token=None, + is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled + ) try: - yield + pageserver_process = start_in_background( + cmd, new_pageserver_dir, "pageserver.log", pageserver_client.check_status + ) + except Exception as e: + log.error(e) + pageserver_process.kill() + raise Exception(f"Failed to start pageserver as {cmd}, reason: {e}") + + log.info("new pageserver started") + try: + yield pageserver_process finally: log.info("stopping new pageserver") - pid = int((new_pageserver_dir / "pageserver.pid").read_text()) - os.kill(pid, signal.SIGQUIT) + pageserver_process.kill() @contextmanager @@ -113,7 +119,7 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve def populate_branch( pg: Postgres, tenant_id: TenantId, - ps_http: NeonPageserverHttpClient, + ps_http: PageserverHttpClient, create_table: bool, expected_sum: Optional[int], ) -> Tuple[TimelineId, Lsn]: @@ -146,7 +152,7 @@ def populate_branch( def ensure_checkpoint( - pageserver_http: NeonPageserverHttpClient, + pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, current_lsn: Lsn, @@ -159,7 +165,7 @@ def ensure_checkpoint( def check_timeline_attached( - new_pageserver_http_client: NeonPageserverHttpClient, + new_pageserver_http_client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, old_timeline_detail: Dict[str, Any], @@ -346,13 +352,13 @@ def test_tenant_relocation( log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port) pageserver_bin = pathlib.Path(neon_binpath) / "pageserver" - new_pageserver_http = NeonPageserverHttpClient( + new_pageserver_http = PageserverHttpClient( port=new_pageserver_http_port, auth_token=None, 
is_testing_enabled_or_skip=env.pageserver.is_testing_enabled_or_skip, ) - with new_pageserver_helper( + with new_pageserver_service( new_pageserver_dir, pageserver_bin, remote_storage_mock_path, diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 97a13bbcb0..a6e935035c 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,6 +1,7 @@ from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, wait_until +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.types import TenantId, TimelineId +from fixtures.utils import wait_until def get_only_element(l): # noqa: E741 diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 96c1fc25db..9a4cbe135b 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -25,10 +25,9 @@ from fixtures.neon_fixtures import ( available_remote_storages, wait_for_last_record_lsn, wait_for_upload, - wait_until, ) from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, wait_until async def tenant_workload(env: NeonEnv, pg: Postgres): diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 4a78a2746e..450f7f2381 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -1,6 +1,7 @@ import pytest -from fixtures.neon_fixtures import NeonEnv, NeonPageserverApiException, wait_until +from fixtures.neon_fixtures import NeonEnv, PageserverApiException from fixtures.types import TenantId, TimelineId +from fixtures.utils import wait_until def test_timeline_delete(neon_simple_env: NeonEnv): @@ -11,13 +12,13 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # first try to delete non existing timeline # for existing tenant: invalid_timeline_id = TimelineId.generate() - with pytest.raises(NeonPageserverApiException, match="timeline not found"): + with pytest.raises(PageserverApiException, match="timeline not found"): ps_http.timeline_delete(tenant_id=env.initial_tenant, timeline_id=invalid_timeline_id) # for non existing tenant: invalid_tenant_id = TenantId.generate() with pytest.raises( - NeonPageserverApiException, + PageserverApiException, match=f"Tenant {invalid_tenant_id} not found in the local state", ): ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) @@ -32,7 +33,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): ps_http = env.pageserver.http_client() with pytest.raises( - NeonPageserverApiException, match="Cannot delete timeline which has child timelines" + PageserverApiException, match="Cannot delete timeline which has child timelines" ): timeline_path = ( @@ -64,7 +65,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # check 404 with pytest.raises( - NeonPageserverApiException, + PageserverApiException, match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found", ): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index d783f897f9..c87e9a6720 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -11,7 +11,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures 
import ( NeonEnv, NeonEnvBuilder, - NeonPageserverHttpClient, + PageserverHttpClient, PgBin, PortDistributor, Postgres, @@ -462,7 +462,7 @@ def assert_physical_size(env: NeonEnv, tenant_id: TenantId, timeline_id: Timelin # Timeline logical size initialization is an asynchronous background task that runs once, # try a few times to ensure it's activated properly def wait_for_timeline_size_init( - client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId + client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId ): for i in range(10): timeline_details = client.timeline_detail( diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 79adfb7b68..09f6f4b9f9 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -27,6 +27,7 @@ from fixtures.neon_fixtures import ( RemoteStorageKind, RemoteStorageUsers, Safekeeper, + SafekeeperHttpClient, SafekeeperPort, available_remote_storages, neon_binpath, @@ -34,7 +35,7 @@ from fixtures.neon_fixtures import ( wait_for_upload, ) from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import get_dir_size, query_scalar +from fixtures.utils import get_dir_size, query_scalar, start_in_background def wait_lsn_force_checkpoint( @@ -841,7 +842,7 @@ class SafekeeperEnv: safekeeper_dir = self.repo_dir / f"sk{i}" safekeeper_dir.mkdir(exist_ok=True) - args = [ + cmd = [ self.bin_safekeeper, "-l", f"127.0.0.1:{port.pg}", @@ -853,11 +854,22 @@ class SafekeeperEnv: str(i), "--broker-endpoints", self.broker.client_url(), - "--daemonize", ] + log.info(f'Running command "{" ".join(cmd)}"') - log.info(f'Running command "{" ".join(args)}"') - return subprocess.run(args, check=True) + safekeeper_client = SafekeeperHttpClient( + port=port.http, + auth_token=None, + ) + try: + safekeeper_process = start_in_background( + cmd, safekeeper_dir, "safekeeper.log", safekeeper_client.check_status + ) + return safekeeper_process + except Exception as e: + log.error(e) + safekeeper_process.kill() + raise Exception(f"Failed to start safekepeer as {cmd}, reason: {e}") def get_safekeeper_connstrs(self): return ",".join([sk_proc.args[2] for sk_proc in self.safekeepers]) From 67401cbdb86c85e8ee107b393e42282bee3a14c7 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 21 Oct 2022 23:41:32 +0300 Subject: [PATCH 0978/1022] pageserver s3 coordination --- docs/rfcs/020-pageserver-s3-coordination.md | 75 +++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 docs/rfcs/020-pageserver-s3-coordination.md diff --git a/docs/rfcs/020-pageserver-s3-coordination.md b/docs/rfcs/020-pageserver-s3-coordination.md new file mode 100644 index 0000000000..ff618da66c --- /dev/null +++ b/docs/rfcs/020-pageserver-s3-coordination.md @@ -0,0 +1,75 @@ +# Coordinating access of multiple pageservers to the same s3 data + +## Motivation + +There are some blind spots around coordinating access of multiple pageservers to the same s3 data. +Currently this is applicable only to tenant relocation case, but in the future we'll need to solve similar problems for replica/standby pageservers. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +Pageserver + +## The problem: + +### Relocation + +During relocation both pageservers can write to s3. This should be ok for all data except the `index_part.json`. For index part it causes problems during compaction/gc because they remove files from index/s3. 
+ +Imagine this case: + +```mermaid +sequenceDiagram + autonumber + participant PS1 + participant S3 + participant PS2 + + PS1->>S3: Uploads L1, L2
Index contains L1 L2 + PS2->>S3: Attach called, sees L1, L2 + PS1->>S3: Compaction comes
Removes L1, adds L3 + note over S3: Index now L2, L3 + PS2->>S3: Uploads new layer L4
(added to previous view of the index) + note over S3: Index now L1, L2, L4 +``` + +At this point it is not possible to restore from index, it contains L2 which is no longer available in s3 and doesnt contain L3 added by compaction by the first pageserver. So if any of the pageservers restart initial sync will fail (or in on-demand world it will fail a bit later during page request from missing layer) + +### Standby pageserver + +Another related case is standby pageserver. In this case second pageserver can be used as a replica to scale reads and serve as a failover target in case first one fails. + +In this mode second pageserver needs to have the same picture of s3 files to be able to load layers on-demand. To accomplish that second pageserver cannot run gc/compaction jobs. Instead it needs to receive updates for index contents. (There is no need to run walreceiver on the second pageserver then). + +## Observations + +- If both pageservers ingest wal then their layer set diverges, because layer file generation is not deterministic +- If one of the pageservers does not ingest wal (and just picks up layer updates) then it lags behind and cannot really answer queries in the same pace as the primary one +- Can compaction help make layers deterministic? E g we do not upload level zero layers and construction of higher levels should be deterministic. This way we can guarantee that layer creation by timeout wont mess things up. This way one pageserver uploads data and second one can just ingest it. But we still need some form of election + +## Solutions + +### Manual orchestration + +One possible solution for relocation case is to orchestrate background jobs from outside. The oracle who runs migration can turn off background jobs on PS1 before migration and then run migration -> enable them on PS2. The problem comes if migration fails. In this case in order to resume background jobs oracle needs to guarantee that PS2 doesnt run background jobs and if it doesnt respond then PS1 is stuck unable to run compaction/gc. This cannot be solved without human ensuring that no upload from PS2 can happen. In order to be able to resolve this automatically CAS is required on S3 side so pageserver can avoid overwriting index part if it is no longer the leading one + +Note that flag that disables background jobs needs to be persistent, because otherwise pageserver restart will clean it + +### Avoid index_part.json + +Index part consists of two parts, list of layers and metadata. List of layers can be easily obtained by \`ListObjects\` S3 API method. But what to do with metadata? Create metadata instance for each checkpoint and add some counter to the file name? + +Back to potentially long s3 ls. + +### Coordination based approach + +Do it like safekeepers chose leader for WAL upload. Ping each other and decide based on some heuristics e g smallest node id. During relocation PS1 sends "resign" ping message so others can start election without waiting for a timeout. + +This still leaves metadata question open and non deterministic layers are a problem as well + +### Avoid metadata file + +One way to eliminate metadata file is to store it in layer files under some special key. This may resonate with intention to keep all relation sizes in some special segment to avoid initial download during size calculation. Maybe with that we can even store pre calculated value. + +As a downside each checkpoint gets 512 bytes larger. 
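A minimal Rust sketch of the special-key idea, for illustration only: the struct fields, the reserved key name and the byte layout below are assumptions, not the pageserver's real metadata format. It only shows the shape of the trade-off, a fixed-size blob (512 bytes here) carried by every checkpoint in exchange for not needing a separate metadata file.

```rust
// Illustration only: a simplified stand-in for timeline metadata, packed into
// the fixed 512-byte envelope mentioned above and stored under a reserved key.

const METADATA_BLOB_SIZE: usize = 512;
const METADATA_KEY: &str = "__timeline_metadata"; // hypothetical reserved key

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct MetadataSketch {
    disk_consistent_lsn: u64,
    prev_record_lsn: u64,
}

impl MetadataSketch {
    /// Pack the metadata into a fixed-size blob written under the reserved key
    /// as part of every checkpoint.
    fn to_blob(&self) -> [u8; METADATA_BLOB_SIZE] {
        let mut blob = [0u8; METADATA_BLOB_SIZE];
        blob[0..8].copy_from_slice(&self.disk_consistent_lsn.to_be_bytes());
        blob[8..16].copy_from_slice(&self.prev_record_lsn.to_be_bytes());
        blob
    }

    /// Recover the metadata from the blob found in the newest layer instead of
    /// reading a standalone metadata file.
    fn from_blob(blob: &[u8; METADATA_BLOB_SIZE]) -> Self {
        MetadataSketch {
            disk_consistent_lsn: u64::from_be_bytes(blob[0..8].try_into().unwrap()),
            prev_record_lsn: u64::from_be_bytes(blob[8..16].try_into().unwrap()),
        }
    }
}

fn main() {
    let meta = MetadataSketch {
        disk_consistent_lsn: 0x1_6960_E8E8,
        prev_record_lsn: 0x1_6960_E8B0,
    };
    let blob = meta.to_blob();
    assert_eq!(blob.len(), METADATA_BLOB_SIZE); // the per-checkpoint overhead
    assert_eq!(MetadataSketch::from_blob(&blob), meta);
    println!("round-tripped {meta:?} under reserved key {METADATA_KEY}");
}
```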
+ +If we entirely avoid metadata file this opens up many approaches From ccdc3188ed4723e154a1f8fb7edaffba46b570d5 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 26 Oct 2022 16:15:57 +0300 Subject: [PATCH 0979/1022] update according to discussion and comments --- docs/rfcs/020-pageserver-s3-coordination.md | 103 +++++++++++++++++++- 1 file changed, 102 insertions(+), 1 deletion(-) diff --git a/docs/rfcs/020-pageserver-s3-coordination.md b/docs/rfcs/020-pageserver-s3-coordination.md index ff618da66c..e2041ad26a 100644 --- a/docs/rfcs/020-pageserver-s3-coordination.md +++ b/docs/rfcs/020-pageserver-s3-coordination.md @@ -56,7 +56,7 @@ Note that flag that disables background jobs needs to be persistent, because oth ### Avoid index_part.json -Index part consists of two parts, list of layers and metadata. List of layers can be easily obtained by \`ListObjects\` S3 API method. But what to do with metadata? Create metadata instance for each checkpoint and add some counter to the file name? +Index part consists of two parts, list of layers and metadata. List of layers can be easily obtained by `ListObjects` S3 API method. But what to do with metadata? Create metadata instance for each checkpoint and add some counter to the file name? Back to potentially long s3 ls. @@ -73,3 +73,104 @@ One way to eliminate metadata file is to store it in layer files under some spec As a downside each checkpoint gets 512 bytes larger. If we entirely avoid metadata file this opens up many approaches + +* * * + +During discussion it seems that we converged on the approach consisting of: +* index files stored per pageserver in the same timeline directory. With that index file name starts to look like: `_index_part.json`. In such set up there are no concurrent overwrites of index file by different pageservers. Index +* For replica pageservers the solution would be for primary to broadcast index changes to any followers with an ability to check index files in s3 and restore the full state. To properly merge changes with index files we can use a counter that is persisted in an index file, is incremented on every change to it and passed along with broadcasted change. This way we can determine whether we need to apply change to the index state or not. +* Responsibility for running background jobs is assigned externally. Pageserver keeps locally persistent flag for each tenant that indicates whether this pageserver is considered as primary one or not. TODO what happends if we crash and cannot start for some extended period of time? Control plane can assign ownership to some other pageserver. Pageserver needs some way to check if its still the blessed one. Maybe by explicit request to control plane on start. + +Requirement for deterministic layer generation was considered overly strict because of two reasons: +* It can limit possible optimizations e g when pageserver wants to reshuffle some data locally and doesnt want to coordinate this +* The deterministic algorithm itself can change so during deployments for some tim there will be two different version running at the same time which can cause non determinism + +### External elections + +The above case with lost state in this schema with externally managed leadership is represented like this: + +Note that here we keep objects list in the index file. 
+```mermaid
+sequenceDiagram
+    autonumber
+    participant PS1
+    participant CP as Control Plane
+    participant S3
+    participant PS2
+
+    note over PS1,PS2: PS1 starts up and is still a leader
+    PS1->>CP: Am I still the leader for Tenant X?
+    activate CP
+    CP->>PS1: Yes
+    deactivate CP
+    PS1->>S3: Fetch PS1 index.
+    note over PS1: Continue operations, start background jobs
+    note over PS1,PS2: PS1 starts up and is not a leader anymore
+    PS1->>CP: Am I still the leader for Tenant X?
+    CP->>PS1: No
+    PS1->>PS2: Subscribe to index changes
+    PS1->>S3: Fetch PS1 and PS2 indexes
+    note over PS1: Combine index file to include layers<br/>
from both indexes to be able
to see newer files from leader (PS2) + note over PS1: Continue operations, do not start background jobs +``` + +### Internal elections + +To manage leadership internally we can use broker to exchange pings so nodes can decide on the leader roles. In case multiple pageservers are active leader is the one with lowest node id. + +Operations with internally managed elections: + +```mermaid +sequenceDiagram + autonumber + participant PS1 + participant S3 + participant PS2 + + note over PS1: Starts up + note over PS1: Subscribes to changes, waits for two ping
timeouts to see if there is a leader + PS1->>S3: Fetch indexes from s3 + alt there is a leader + note over PS1: do not start background jobs,
continue applying index updates + else there is no leader + note over PS1: start background jobs,
broadcast index changes + end + + note over PS1,PS2: Then the picture is similar to external elections
the difference is that follower can become a leader
if there are no pings after some timeout new leader gets elected +``` + +### Eviction + +When two pageservers operate on a tenant for extended period of time follower doesnt perform write operations in s3. +When layer is evicted follower relies on updates from primary to get info about layers it needs to cover range for evicted layer. + +Note that it wont match evicted layer exactly, so layers will overlap and lookup code needs to correctly handle that. + +### Relocation flow + +Actions become: +* Attach tenant to new pageserver +* New pageserver becomes follower since previous one is still leading +* New pageserver starts replicating from safekeepers but does not upload layers +* Detach is called on the old one +* New pageserver becomes leader after it realizes that old one disappeared + +### Index File + +Using `s3 ls` on startup simplifies things, but we still need metadata, so we need to fetch index files anyway. If they contain list of files we can combine them and avoid costly `s3 ls` + +### Remaining issues + +* More than one remote consistent lsn for safekeepers to know + +Anything else? + +### Proposed solution + +To recap. On meeting we converged on approach with external elections but I think it will be overall harder to manage and will introduce a dependency on control plane for pageserver. Using separate index files for each pageserver consisting of log of operations and a metadata snapshot should be enough. + +### What we need to get there? + +* Change index file structure to contain log of changes instead of just the file list +* Implement pinging/elections for pageservers From e56d11c8e18d353395556e5f18f5eec4a924b321 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 26 Oct 2022 16:41:03 +0300 Subject: [PATCH 0980/1022] fix style if possible (cannot really split long lines in mermaid) --- docs/rfcs/020-pageserver-s3-coordination.md | 148 ++++++++++++++------ 1 file changed, 109 insertions(+), 39 deletions(-) diff --git a/docs/rfcs/020-pageserver-s3-coordination.md b/docs/rfcs/020-pageserver-s3-coordination.md index e2041ad26a..5e2912ba99 100644 --- a/docs/rfcs/020-pageserver-s3-coordination.md +++ b/docs/rfcs/020-pageserver-s3-coordination.md @@ -2,18 +2,22 @@ ## Motivation -There are some blind spots around coordinating access of multiple pageservers to the same s3 data. -Currently this is applicable only to tenant relocation case, but in the future we'll need to solve similar problems for replica/standby pageservers. +There are some blind spots around coordinating access of multiple pageservers +to the same s3 data. Currently this is applicable only to tenant relocation +case, but in the future we'll need to solve similar problems for +replica/standby pageservers. ## Impacted components (e.g. pageserver, safekeeper, console, etc) Pageserver -## The problem: +## The problem ### Relocation -During relocation both pageservers can write to s3. This should be ok for all data except the `index_part.json`. For index part it causes problems during compaction/gc because they remove files from index/s3. +During relocation both pageservers can write to s3. This should be ok for all +data except the `index_part.json`. For index part it causes problems during +compaction/gc because they remove files from index/s3. Imagine this case: @@ -32,43 +36,77 @@ sequenceDiagram note over S3: Index now L1, L2, L4 ``` -At this point it is not possible to restore from index, it contains L2 which is no longer available in s3 and doesnt contain L3 added by compaction by the first pageserver. 
So if any of the pageservers restart initial sync will fail (or in on-demand world it will fail a bit later during page request from missing layer) +At this point it is not possible to restore from index, it contains L2 which +is no longer available in s3 and doesnt contain L3 added by compaction by the +first pageserver. So if any of the pageservers restart initial sync will fail +(or in on-demand world it will fail a bit later during page request from +missing layer) ### Standby pageserver -Another related case is standby pageserver. In this case second pageserver can be used as a replica to scale reads and serve as a failover target in case first one fails. +Another related case is standby pageserver. In this case second pageserver can +be used as a replica to scale reads and serve as a failover target in case +first one fails. -In this mode second pageserver needs to have the same picture of s3 files to be able to load layers on-demand. To accomplish that second pageserver cannot run gc/compaction jobs. Instead it needs to receive updates for index contents. (There is no need to run walreceiver on the second pageserver then). +In this mode second pageserver needs to have the same picture of s3 files to +be able to load layers on-demand. To accomplish that second pageserver +cannot run gc/compaction jobs. Instead it needs to receive updates for index +contents. (There is no need to run walreceiver on the second pageserver then). ## Observations -- If both pageservers ingest wal then their layer set diverges, because layer file generation is not deterministic -- If one of the pageservers does not ingest wal (and just picks up layer updates) then it lags behind and cannot really answer queries in the same pace as the primary one -- Can compaction help make layers deterministic? E g we do not upload level zero layers and construction of higher levels should be deterministic. This way we can guarantee that layer creation by timeout wont mess things up. This way one pageserver uploads data and second one can just ingest it. But we still need some form of election +- If both pageservers ingest wal then their layer set diverges, because layer + file generation is not deterministic +- If one of the pageservers does not ingest wal (and just picks up layer + updates) then it lags behind and cannot really answer queries in the same + pace as the primary one +- Can compaction help make layers deterministic? E g we do not upload level + zero layers and construction of higher levels should be deterministic. + This way we can guarantee that layer creation by timeout wont mess things up. + This way one pageserver uploads data and second one can just ingest it. + But we still need some form of election ## Solutions ### Manual orchestration -One possible solution for relocation case is to orchestrate background jobs from outside. The oracle who runs migration can turn off background jobs on PS1 before migration and then run migration -> enable them on PS2. The problem comes if migration fails. In this case in order to resume background jobs oracle needs to guarantee that PS2 doesnt run background jobs and if it doesnt respond then PS1 is stuck unable to run compaction/gc. This cannot be solved without human ensuring that no upload from PS2 can happen. In order to be able to resolve this automatically CAS is required on S3 side so pageserver can avoid overwriting index part if it is no longer the leading one +One possible solution for relocation case is to orchestrate background jobs +from outside. 
The oracle who runs migration can turn off background jobs on +PS1 before migration and then run migration -> enable them on PS2. The problem +comes if migration fails. In this case in order to resume background jobs +oracle needs to guarantee that PS2 doesnt run background jobs and if it doesnt +respond then PS1 is stuck unable to run compaction/gc. This cannot be solved +without human ensuring that no upload from PS2 can happen. In order to be able +to resolve this automatically CAS is required on S3 side so pageserver can +avoid overwriting index part if it is no longer the leading one -Note that flag that disables background jobs needs to be persistent, because otherwise pageserver restart will clean it +Note that flag that disables background jobs needs to be persistent, because +otherwise pageserver restart will clean it ### Avoid index_part.json -Index part consists of two parts, list of layers and metadata. List of layers can be easily obtained by `ListObjects` S3 API method. But what to do with metadata? Create metadata instance for each checkpoint and add some counter to the file name? +Index part consists of two parts, list of layers and metadata. List of layers +can be easily obtained by `ListObjects` S3 API method. But what to do with +metadata? Create metadata instance for each checkpoint and add some counter +to the file name? Back to potentially long s3 ls. ### Coordination based approach -Do it like safekeepers chose leader for WAL upload. Ping each other and decide based on some heuristics e g smallest node id. During relocation PS1 sends "resign" ping message so others can start election without waiting for a timeout. +Do it like safekeepers chose leader for WAL upload. Ping each other and decide +based on some heuristics e g smallest node id. During relocation PS1 sends +"resign" ping message so others can start election without waiting for a timeout. -This still leaves metadata question open and non deterministic layers are a problem as well +This still leaves metadata question open and non deterministic layers are a +problem as well ### Avoid metadata file -One way to eliminate metadata file is to store it in layer files under some special key. This may resonate with intention to keep all relation sizes in some special segment to avoid initial download during size calculation. Maybe with that we can even store pre calculated value. +One way to eliminate metadata file is to store it in layer files under some +special key. This may resonate with intention to keep all relation sizes in +some special segment to avoid initial download during size calculation. +Maybe with that we can even store pre calculated value. As a downside each checkpoint gets 512 bytes larger. @@ -77,17 +115,38 @@ If we entirely avoid metadata file this opens up many approaches * * * During discussion it seems that we converged on the approach consisting of: -* index files stored per pageserver in the same timeline directory. With that index file name starts to look like: `_index_part.json`. In such set up there are no concurrent overwrites of index file by different pageservers. Index -* For replica pageservers the solution would be for primary to broadcast index changes to any followers with an ability to check index files in s3 and restore the full state. To properly merge changes with index files we can use a counter that is persisted in an index file, is incremented on every change to it and passed along with broadcasted change. 
This way we can determine whether we need to apply change to the index state or not. -* Responsibility for running background jobs is assigned externally. Pageserver keeps locally persistent flag for each tenant that indicates whether this pageserver is considered as primary one or not. TODO what happends if we crash and cannot start for some extended period of time? Control plane can assign ownership to some other pageserver. Pageserver needs some way to check if its still the blessed one. Maybe by explicit request to control plane on start. -Requirement for deterministic layer generation was considered overly strict because of two reasons: -* It can limit possible optimizations e g when pageserver wants to reshuffle some data locally and doesnt want to coordinate this -* The deterministic algorithm itself can change so during deployments for some tim there will be two different version running at the same time which can cause non determinism +- index files stored per pageserver in the same timeline directory. With that + index file name starts to look like: `_index_part.json`. + In such set up there are no concurrent overwrites of index file by different + pageservers. +- For replica pageservers the solution would be for primary to broadcast index + changes to any followers with an ability to check index files in s3 and + restore the full state. To properly merge changes with index files we can use + a counter that is persisted in an index file, is incremented on every change + to it and passed along with broadcasted change. This way we can determine + whether we need to apply change to the index state or not. +- Responsibility for running background jobs is assigned externally. Pageserver + keeps locally persistent flag for each tenant that indicates whether this + pageserver is considered as primary one or not. TODO what happends if we + crash and cannot start for some extended period of time? Control plane can + assign ownership to some other pageserver. Pageserver needs some way to check + if its still the blessed one. Maybe by explicit request to control plane on + start. + +Requirement for deterministic layer generation was considered overly strict +because of two reasons: + +- It can limit possible optimizations e g when pageserver wants to reshuffle + some data locally and doesnt want to coordinate this +- The deterministic algorithm itself can change so during deployments for some + time there will be two different version running at the same time which can + cause non determinism ### External elections -The above case with lost state in this schema with externally managed leadership is represented like this: +The above case with lost state in this schema with externally managed +leadership is represented like this: Note that here we keep objects list in the index file. @@ -117,7 +176,9 @@ sequenceDiagram ### Internal elections -To manage leadership internally we can use broker to exchange pings so nodes can decide on the leader roles. In case multiple pageservers are active leader is the one with lowest node id. +To manage leadership internally we can use broker to exchange pings so nodes +can decide on the leader roles. In case multiple pageservers are active leader +is the one with lowest node id. Operations with internally managed elections: @@ -126,7 +187,6 @@ sequenceDiagram autonumber participant PS1 participant S3 - participant PS2 note over PS1: Starts up note over PS1: Subscribes to changes, waits for two ping
timeouts to see if there is a leader @@ -137,40 +197,50 @@ sequenceDiagram note over PS1: start background jobs,
broadcast index changes end - note over PS1,PS2: Then the picture is similar to external elections
the difference is that follower can become a leader
if there are no pings after some timeout new leader gets elected + note over PS1,S3: Then the picture is similar to external elections
the difference is that follower can become a leader
if there are no pings after some timeout new leader gets elected ``` ### Eviction -When two pageservers operate on a tenant for extended period of time follower doesnt perform write operations in s3. -When layer is evicted follower relies on updates from primary to get info about layers it needs to cover range for evicted layer. +When two pageservers operate on a tenant for extended period of time follower +doesnt perform write operations in s3. When layer is evicted follower relies +on updates from primary to get info about layers it needs to cover range for +evicted layer. -Note that it wont match evicted layer exactly, so layers will overlap and lookup code needs to correctly handle that. +Note that it wont match evicted layer exactly, so layers will overlap and +lookup code needs to correctly handle that. ### Relocation flow Actions become: -* Attach tenant to new pageserver -* New pageserver becomes follower since previous one is still leading -* New pageserver starts replicating from safekeepers but does not upload layers -* Detach is called on the old one -* New pageserver becomes leader after it realizes that old one disappeared + +- Attach tenant to new pageserver +- New pageserver becomes follower since previous one is still leading +- New pageserver starts replicating from safekeepers but does not upload layers +- Detach is called on the old one +- New pageserver becomes leader after it realizes that old one disappeared ### Index File -Using `s3 ls` on startup simplifies things, but we still need metadata, so we need to fetch index files anyway. If they contain list of files we can combine them and avoid costly `s3 ls` +Using `s3 ls` on startup simplifies things, but we still need metadata, so we +need to fetch index files anyway. If they contain list of files we can combine +them and avoid costly `s3 ls` ### Remaining issues -* More than one remote consistent lsn for safekeepers to know +- More than one remote consistent lsn for safekeepers to know Anything else? ### Proposed solution -To recap. On meeting we converged on approach with external elections but I think it will be overall harder to manage and will introduce a dependency on control plane for pageserver. Using separate index files for each pageserver consisting of log of operations and a metadata snapshot should be enough. +To recap. On meeting we converged on approach with external elections but I +think it will be overall harder to manage and will introduce a dependency on +control plane for pageserver. Using separate index files for each pageserver +consisting of log of operations and a metadata snapshot should be enough. ### What we need to get there? 
-* Change index file structure to contain log of changes instead of just the file list -* Implement pinging/elections for pageservers +- Change index file structure to contain log of changes instead of just the + file list +- Implement pinging/elections for pageservers From 0a0595b98dc60b3db56783b6c4bd84b3ee5493b9 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 2 Nov 2022 15:22:38 +0000 Subject: [PATCH 0981/1022] test_backward_compatibility: assign random port to compute (#2738) --- test_runner/regress/test_compatibility.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 1f0940cab7..100027048f 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -177,7 +177,7 @@ def test_backward_compatibility( cli.raw_cli(["start"]) request.addfinalizer(lambda: cli.raw_cli(["stop"])) - result = cli.pg_start("main") + result = cli.pg_start("main", port=port_distributor.get_port()) request.addfinalizer(lambda: cli.pg_stop("main")) except Exception: breaking_changes_allowed = ( From 590f894db8bb48d7e7d2298853e514d516c5fd55 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 2 Nov 2022 11:33:46 +0100 Subject: [PATCH 0982/1022] tenant_status: remove unnecessary spawn_blocking The spawn_blocking is pointless in this cases: get_tenant is not expected to block for any meaningful amount of time. There are get_tenant calls in most other functions in the file too, and they don't bother with spawn_blocking. Let's remove the spawn_blocking from tenant_status, too, to be consistent. fixes https://github.com/neondatabase/neon/issues/2731 --- pageserver/src/http/routes.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8ec7604b8a..e8a160e395 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -523,9 +523,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro check_permission(&request, Some(tenant_id))?; // if tenant is in progress of downloading it can be absent in global tenant map - let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false)) - .await - .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?; + let tenant = tenant_mgr::get_tenant(tenant_id, false); let state = get_state(&request); let remote_index = &state.remote_index; From a86a38c96e7ca1a2b22350cf466c70283b9f82a2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 2 Nov 2022 11:56:40 +0100 Subject: [PATCH 0983/1022] README: fix instructions on how to run tests The `make debug` target doesn't exist, and I can't find it in the Git history. 
--- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index dc469c36b1..e9c30668e0 100644 --- a/README.md +++ b/README.md @@ -223,10 +223,7 @@ Ensure your dependencies are installed as described [here](https://github.com/ne ```sh git clone --recursive https://github.com/neondatabase/neon.git -# either: CARGO_BUILD_FLAGS="--features=testing" make -# or: -make debug ./scripts/pytest ``` From b15499251086183b073ef893c3852a138ee3f919 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 2 Nov 2022 12:06:10 +0100 Subject: [PATCH 0984/1022] timeline_list_handler: avoid spawn_blocking As per https://github.com/neondatabase/neon/issues/2731#issuecomment-1299335813 refs https://github.com/neondatabase/neon/issues/2731 --- pageserver/src/http/routes.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e8a160e395..d88cf6e075 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -227,13 +227,10 @@ async fn timeline_list_handler(request: Request) -> Result, let state = get_state(&request); - let timelines = tokio::task::spawn_blocking(move || { - let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); + let timelines = info_span!("timeline_list", tenant = %tenant_id).in_scope(|| { let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?; Ok(tenant.list_timelines()) - }) - .await - .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; + })?; let mut response_data = Vec::with_capacity(timelines.len()); for timeline in timelines { From a0a74868a4e21cb2127f827c2f7598527ffc26c2 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 2 Nov 2022 12:30:09 -0400 Subject: [PATCH 0985/1022] Fix clippy (#2742) --- pageserver/src/walreceiver.rs | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index c7de24080a..1fad91c836 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -155,22 +155,19 @@ impl TaskHandle { /// Aborts current task, waiting for it to finish. pub async fn shutdown(self) { - match self.join_handle { - Some(jh) => { - self.cancellation.send(()).ok(); - match jh.await { - Ok(Ok(())) => debug!("Shutdown success"), - Ok(Err(e)) => error!("Shutdown task error: {e:?}"), - Err(join_error) => { - if join_error.is_cancelled() { - error!("Shutdown task was cancelled"); - } else { - error!("Shutdown task join error: {join_error}") - } + if let Some(jh) = self.join_handle { + self.cancellation.send(()).ok(); + match jh.await { + Ok(Ok(())) => debug!("Shutdown success"), + Ok(Err(e)) => error!("Shutdown task error: {e:?}"), + Err(join_error) => { + if join_error.is_cancelled() { + error!("Shutdown task was cancelled"); + } else { + error!("Shutdown task join error: {join_error}") } } } - None => {} } } } From 51121429976da628f3118db9130d3171a8e29dab Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 2 Nov 2022 20:37:48 +0200 Subject: [PATCH 0986/1022] fix: use different port for temporary postgres (#2743) `test_tenant_relocation` ends up starting a temporary postgres instance with a fixed port. the change makes the port configurable at scripts/export_import_between_pageservers.py and uses that in test_tenant_relocation. 
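For illustration only, a minimal sketch of the idea (the `--tmp-pg-port` flag and its 55439 default come from the diff below; the helper function and the way a free port is obtained are assumptions for this sketch, not code from the patch):

```python
# Hedged sketch: pass a caller-chosen free port to the export/import helper
# instead of relying on its hard-coded 55439 default.
import subprocess

def run_export_import(base_cmd: list[str], free_port: int) -> None:
    # `--tmp-pg-port` is the flag added in this patch; `base_cmd` stands in for
    # whatever arguments the caller already passes today.
    subprocess.run([*base_cmd, "--tmp-pg-port", str(free_port)], check=True)
```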
--- scripts/export_import_between_pageservers.py | 26 ++++++++++++++----- scripts/reformat | 6 ++--- test_runner/regress/test_tenant_relocation.py | 2 ++ 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 152ce40cea..1734038661 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -370,7 +370,7 @@ def pack_base(log_dir, restored_dir, output_tar): shutil.move(tmp_tar_path, output_tar) -def reconstruct_paths(log_dir, pg_bin, base_tar): +def reconstruct_paths(log_dir, pg_bin, base_tar, port: int): """Reconstruct what relation files should exist in the datadir by querying postgres.""" with tempfile.TemporaryDirectory() as restored_dir: # Unpack the base tar @@ -378,8 +378,7 @@ def reconstruct_paths(log_dir, pg_bin, base_tar): # Start a vanilla postgres from the given datadir and query it to find # what relfiles should exist, but possibly don't. - port = "55439" # Probably free - with VanillaPostgres(restored_dir, pg_bin, port, init=False) as vanilla_pg: + with VanillaPostgres(Path(restored_dir), pg_bin, port, init=False) as vanilla_pg: vanilla_pg.configure([f"port={port}"]) vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log")) @@ -443,8 +442,8 @@ def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths): # a vanilla postgres from the exported datadir, and query it # to see what empty relations are missing, and then create # those empty files before importing. -def add_missing_rels(base_tar, output_tar, log_dir, pg_bin): - reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar)) +def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int): + reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar, tmp_pg_port)) touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths) @@ -535,7 +534,7 @@ def export_timeline( # Add missing rels pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin) + add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin, args.tmp_pg_port) # Log more info file_size = os.path.getsize(tar_filename) @@ -633,6 +632,13 @@ def main(args: argparse.Namespace): raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}") +def non_zero_tcp_port(arg: Any): + port = int(arg) + if port < 1 or port > 65535: + raise argparse.ArgumentTypeError(f"invalid tcp port: {arg}") + return port + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -728,5 +734,13 @@ if __name__ == "__main__": default=False, help="directory where temporary tar files are stored", ) + parser.add_argument( + "--tmp-pg-port", + dest="tmp_pg_port", + required=False, + default=55439, + type=non_zero_tcp_port, + help="localhost port to use for temporary postgres instance", + ) args = parser.parse_args() main(args) diff --git a/scripts/reformat b/scripts/reformat index 67140a705a..5346c78ead 100755 --- a/scripts/reformat +++ b/scripts/reformat @@ -6,6 +6,6 @@ set -euox pipefail echo 'Reformatting Rust code' cargo fmt echo 'Reformatting Python code' -poetry run isort test_runner -poetry run flake8 test_runner -poetry run black test_runner +poetry run isort test_runner scripts +poetry run flake8 test_runner scripts +poetry run black test_runner scripts diff --git a/test_runner/regress/test_tenant_relocation.py 
b/test_runner/regress/test_tenant_relocation.py index 2c11812a7b..fa00a4da82 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -392,6 +392,8 @@ def test_tenant_relocation( pg_distrib_dir, "--work-dir", os.path.join(test_output_dir), + "--tmp-pg-port", + str(port_distributor.get_port()), ] subprocess_capture(test_output_dir, cmd, check=True) elif method == "minor": From d7eeb73f6fa8ae96c8311737a7b5c04b4b9c1e82 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 2 Nov 2022 23:44:07 -0400 Subject: [PATCH 0987/1022] Impl serialize for pagestream FeMessage (#2741) --- Cargo.lock | 1 + libs/pageserver_api/Cargo.toml | 1 + libs/pageserver_api/src/models.rs | 99 +++++++++++++++++++++++-------- pageserver/src/page_service.rs | 3 +- 4 files changed, 77 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 01b8abda9a..98daddbd96 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2174,6 +2174,7 @@ name = "pageserver_api" version = "0.1.0" dependencies = [ "anyhow", + "byteorder", "bytes", "const_format", "postgres_ffi", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 9121cd4989..2102ae5373 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -9,6 +9,7 @@ serde_with = "2.0" const_format = "0.2.21" anyhow = { version = "1.0", features = ["backtrace"] } bytes = "1.0.1" +byteorder = "1.4.3" utils = { path = "../utils" } postgres_ffi = { path = "../postgres_ffi" } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 4360f76fd1..3453f9672a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1,5 +1,6 @@ use std::num::NonZeroU64; +use byteorder::{BigEndian, ReadBytesExt}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use utils::{ @@ -9,7 +10,7 @@ use utils::{ use crate::reltag::RelTag; use anyhow::bail; -use bytes::{Buf, BufMut, Bytes, BytesMut}; +use bytes::{BufMut, Bytes, BytesMut}; /// A state of a tenant in pageserver's memory. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -296,52 +297,98 @@ pub struct PagestreamDbSizeResponse { } impl PagestreamFeMessage { - pub fn parse(mut body: Bytes) -> anyhow::Result { + pub fn serialize(&self) -> Bytes { + let mut bytes = BytesMut::new(); + + match self { + Self::Exists(req) => { + bytes.put_u8(0); + bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u64(req.lsn.0); + bytes.put_u32(req.rel.spcnode); + bytes.put_u32(req.rel.dbnode); + bytes.put_u32(req.rel.relnode); + bytes.put_u8(req.rel.forknum); + } + + Self::Nblocks(req) => { + bytes.put_u8(1); + bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u64(req.lsn.0); + bytes.put_u32(req.rel.spcnode); + bytes.put_u32(req.rel.dbnode); + bytes.put_u32(req.rel.relnode); + bytes.put_u8(req.rel.forknum); + } + + Self::GetPage(req) => { + bytes.put_u8(2); + bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u64(req.lsn.0); + bytes.put_u32(req.rel.spcnode); + bytes.put_u32(req.rel.dbnode); + bytes.put_u32(req.rel.relnode); + bytes.put_u8(req.rel.forknum); + bytes.put_u32(req.blkno); + } + + Self::DbSize(req) => { + bytes.put_u8(3); + bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u64(req.lsn.0); + bytes.put_u32(req.dbnode); + } + } + + bytes.into() + } + + pub fn parse(body: &mut R) -> anyhow::Result { // TODO these gets can fail // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. - let msg_tag = body.get_u8(); + let msg_tag = body.read_u8()?; match msg_tag { 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), + latest: body.read_u8()? != 0, + lsn: Lsn::from(body.read_u64::()?), rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, }, })), 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), + latest: body.read_u8()? != 0, + lsn: Lsn::from(body.read_u64::()?), rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, }, })), 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), + latest: body.read_u8()? != 0, + lsn: Lsn::from(body.read_u64::()?), rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, }, - blkno: body.get_u32(), + blkno: body.read_u32::()?, })), 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - dbnode: body.get_u32(), + latest: body.read_u8()? 
!= 0, + lsn: Lsn::from(body.read_u64::()?), + dbnode: body.read_u32::()?, })), - _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body), + _ => bail!("unknown smgr message tag: {:?}", msg_tag), } } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index aec91bc7f1..f83ab1929a 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -10,6 +10,7 @@ // use anyhow::{bail, ensure, Context, Result}; +use bytes::Buf; use bytes::Bytes; use futures::{Stream, StreamExt}; use pageserver_api::models::{ @@ -299,7 +300,7 @@ impl PageServerHandler { trace!("query: {copy_data_bytes:?}"); - let neon_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; + let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; let response = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { From 63221e4b4268be1e49c133fa9eac592b870025de Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 3 Nov 2022 08:30:07 +0400 Subject: [PATCH 0988/1022] Fix sk->ps walsender shutdown on sk side on caughtup. This will fix many threads issue, but code around awfully still wants improvement. https://github.com/neondatabase/neon/issues/2722 --- safekeeper/src/timeline.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 1930b3574a..d8d1fb98ad 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -555,13 +555,20 @@ impl Timeline { if self.is_cancelled() { return true; } - let mut shared_state = self.write_shared_state(); if shared_state.num_computes == 0 { let replica_state = shared_state.replicas[replica_id].unwrap(); + let reported_remote_consistent_lsn = replica_state + .pageserver_feedback + .map(|f| Lsn(f.ps_applylsn)) + .unwrap_or(Lsn::INVALID); + info!( + "checking should ws stop ttid {} lsn {} rcl {}", + self.ttid, reported_remote_consistent_lsn, shared_state.sk.inmem.commit_lsn + ); let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet - (replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. - replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); + (reported_remote_consistent_lsn!= Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. + reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); if stop { shared_state.update_status(self.ttid); return true; From cf68963b188c90716efc5516c46b4385065f7dbe Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 3 Nov 2022 14:39:19 +0200 Subject: [PATCH 0989/1022] Add initial tenant sizing model and a http route to query it (#2714) Tenant size information is gathered by using existing parts of `Tenant::gc_iteration` which are now separated as `Tenant::refresh_gc_info`. `Tenant::refresh_gc_info` collects branch points, and invokes `Timeline::update_gc_info`; nothing was supposed to be changed there. The gathered branch points (through Timeline's `GcInfo::retain_lsns`), `GcInfo::horizon_cutoff`, and `GcInfo::pitr_cutoff` are used to build up a Vec of updates fed into the `libs/tenant_size_model` to calculate the history size. The gathered information is now exposed using `GET /v1/tenant/{tenant_id}/size`, which which will respond with the actual calculated size. 
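As a rough, non-authoritative sketch of how the new endpoint can be exercised (the listen address and tenant id below are placeholders/assumptions, not values from this patch):

```python
# Hedged sketch: query the new tenant size endpoint of a locally running
# pageserver and print the reported size in bytes.
import requests

PAGESERVER_HTTP = "http://127.0.0.1:9898"  # assumption: local pageserver HTTP address
TENANT_ID = "0123456789abcdef0123456789abcdef"  # placeholder tenant id (32 hex chars)

resp = requests.get(f"{PAGESERVER_HTTP}/v1/tenant/{TENANT_ID}/size")
resp.raise_for_status()
body = resp.json()
# Per the OpenAPI change in this patch, `size` is in bytes; the additional
# `inputs` field is an unstable debugging aid left out of the spec on purpose.
print(body["id"], body["size"])
```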
Initially the idea was to have this delivered as tenant background task and exported via metric, but it might be too computationally expensive to run it periodically as we don't yet know if the returned values are any good. Adds one new metric: - pageserver_storage_operations_seconds with label `logical_size` - separating from original `init_logical_size` Adds a pageserver wide configuration variable: - `concurrent_tenant_size_logical_size_queries` with default 1 This leaves a lot of TODO's, tracked on issue #2748. --- Cargo.lock | 8 + libs/tenant_size_model/.gitignore | 3 + libs/tenant_size_model/Cargo.toml | 8 + libs/tenant_size_model/Makefile | 13 + libs/tenant_size_model/README.md | 7 + libs/tenant_size_model/src/lib.rs | 349 ++++++++++++++++++ libs/tenant_size_model/src/main.rs | 268 ++++++++++++++ libs/utils/src/lsn.rs | 2 +- pageserver/Cargo.toml | 1 + pageserver/src/config.rs | 84 +++++ pageserver/src/http/openapi_spec.yml | 48 +++ pageserver/src/http/routes.rs | 39 ++ pageserver/src/metrics.rs | 6 + pageserver/src/tenant.rs | 130 +++++-- pageserver/src/tenant/size.rs | 454 ++++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 28 +- test_runner/fixtures/neon_fixtures.py | 29 +- test_runner/regress/test_tenant_size.py | 276 ++++++++++++++ 18 files changed, 1704 insertions(+), 49 deletions(-) create mode 100644 libs/tenant_size_model/.gitignore create mode 100644 libs/tenant_size_model/Cargo.toml create mode 100644 libs/tenant_size_model/Makefile create mode 100644 libs/tenant_size_model/README.md create mode 100644 libs/tenant_size_model/src/lib.rs create mode 100644 libs/tenant_size_model/src/main.rs create mode 100644 pageserver/src/tenant/size.rs create mode 100644 test_runner/regress/test_tenant_size.py diff --git a/Cargo.lock b/Cargo.lock index 98daddbd96..9a5ac0b1d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2157,6 +2157,7 @@ dependencies = [ "svg_fmt", "tar", "tempfile", + "tenant_size_model", "thiserror", "tokio", "tokio-postgres", @@ -3533,6 +3534,13 @@ dependencies = [ "winapi", ] +[[package]] +name = "tenant_size_model" +version = "0.1.0" +dependencies = [ + "workspace_hack", +] + [[package]] name = "termcolor" version = "1.1.3" diff --git a/libs/tenant_size_model/.gitignore b/libs/tenant_size_model/.gitignore new file mode 100644 index 0000000000..15a65bec1e --- /dev/null +++ b/libs/tenant_size_model/.gitignore @@ -0,0 +1,3 @@ +*.dot +*.png +*.svg diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml new file mode 100644 index 0000000000..1aabf5a4f9 --- /dev/null +++ b/libs/tenant_size_model/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "tenant_size_model" +version = "0.1.0" +edition = "2021" +publish = false + +[dependencies] +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/tenant_size_model/Makefile b/libs/tenant_size_model/Makefile new file mode 100644 index 0000000000..1cffe81c10 --- /dev/null +++ b/libs/tenant_size_model/Makefile @@ -0,0 +1,13 @@ +all: 1.svg 2.svg 3.svg 4.svg 1.png 2.png 3.png 4.png + +../../target/debug/tenant_size_model: Cargo.toml src/main.rs src/lib.rs + cargo build --bin tenant_size_model + +%.svg: %.dot + dot -Tsvg $< > $@ + +%.png: %.dot + dot -Tpng $< > $@ + +%.dot: ../../target/debug/tenant_size_model + ../../target/debug/tenant_size_model $* > $@ diff --git a/libs/tenant_size_model/README.md b/libs/tenant_size_model/README.md new file mode 100644 index 0000000000..b850130d67 --- /dev/null +++ b/libs/tenant_size_model/README.md @@ -0,0 +1,7 @@ +# Logical size + WAL 
pricing + +This is a simulator to calculate the tenant size in different scenarios, +using the "Logical size + WAL" method. Makefile produces diagrams used in a +private presentation: + +https://docs.google.com/presentation/d/1OapE4k11xmcwMh7I7YvNWGC63yCRLh6udO9bXZ-fZmo/edit?usp=sharing diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs new file mode 100644 index 0000000000..c7ec1e8870 --- /dev/null +++ b/libs/tenant_size_model/src/lib.rs @@ -0,0 +1,349 @@ +use std::borrow::Cow; +use std::collections::HashMap; + +/// Pricing model or history size builder. +/// +/// Maintains knowledge of the branches and their modifications. Generic over the branch name key +/// type. +pub struct Storage { + segments: Vec, + + /// Mapping from the branch name to the index of a segment describing it's latest state. + branches: HashMap, +} + +/// Snapshot of a branch. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Segment { + /// Previous segment index into ['Storage::segments`], if any. + parent: Option, + + /// Description of how did we get to this state. + /// + /// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when + /// modifying a branch directly. + pub op: Cow<'static, str>, + + /// LSN before this state + start_lsn: u64, + + /// LSN at this state + pub end_lsn: u64, + + /// Logical size before this state + start_size: u64, + + /// Logical size at this state + pub end_size: u64, + + /// Indices to [`Storage::segments`] + /// + /// FIXME: this could be an Option + children_after: Vec, + + /// Determined by `retention_period` given to [`Storage::calculate`] + pub needed: bool, +} + +// +// +// +// +// *-g--*---D---> +// / +// / +// / *---b----*-B---> +// / / +// / / +// -----*--e---*-----f----* C +// E \ +// \ +// *--a---*---A--> +// +// If A and B need to be retained, is it cheaper to store +// snapshot at C+a+b, or snapshots at A and B ? +// +// If D also needs to be retained, which is cheaper: +// +// 1. E+g+e+f+a+b +// 2. D+C+a+b +// 3. D+A+B + +/// [`Segment`] which has had it's size calculated. +pub struct SegmentSize { + pub seg_id: usize, + + pub method: SegmentMethod, + + this_size: u64, + + pub children: Vec, +} + +impl SegmentSize { + fn total(&self) -> u64 { + self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total()) + } + + pub fn total_children(&self) -> u64 { + if self.method == SnapshotAfter { + self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total()) + } else { + self.children.iter().fold(0, |acc, x| acc + x.total()) + } + } +} + +/// Different methods to retain history from a particular state +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum SegmentMethod { + SnapshotAfter, + Wal, + WalNeeded, + Skipped, +} + +use SegmentMethod::*; + +impl Storage { + /// Creates a new storage with the given default branch name. + pub fn new(initial_branch: K) -> Storage { + let init_segment = Segment { + op: "".into(), + needed: false, + parent: None, + start_lsn: 0, + end_lsn: 0, + start_size: 0, + end_size: 0, + children_after: Vec::new(), + }; + + Storage { + segments: vec![init_segment], + branches: HashMap::from([(initial_branch, 0)]), + } + } + + /// Advances the branch with the named operation, by the relative LSN and logical size bytes. 
+ pub fn modify_branch( + &mut self, + branch: &Q, + op: Cow<'static, str>, + lsn_bytes: u64, + size_bytes: i64, + ) where + K: std::borrow::Borrow, + Q: std::hash::Hash + Eq, + { + let lastseg_id = *self.branches.get(branch).unwrap(); + let newseg_id = self.segments.len(); + let lastseg = &mut self.segments[lastseg_id]; + + let newseg = Segment { + op, + parent: Some(lastseg_id), + start_lsn: lastseg.end_lsn, + end_lsn: lastseg.end_lsn + lsn_bytes, + start_size: lastseg.end_size, + end_size: (lastseg.end_size as i64 + size_bytes) as u64, + children_after: Vec::new(), + needed: false, + }; + lastseg.children_after.push(newseg_id); + + self.segments.push(newseg); + *self.branches.get_mut(branch).expect("read already") = newseg_id; + } + + pub fn insert(&mut self, branch: &Q, bytes: u64) + where + K: std::borrow::Borrow, + Q: std::hash::Hash + Eq, + { + self.modify_branch(branch, "insert".into(), bytes, bytes as i64); + } + + pub fn update(&mut self, branch: &Q, bytes: u64) + where + K: std::borrow::Borrow, + Q: std::hash::Hash + Eq, + { + self.modify_branch(branch, "update".into(), bytes, 0i64); + } + + pub fn delete(&mut self, branch: &Q, bytes: u64) + where + K: std::borrow::Borrow, + Q: std::hash::Hash + Eq, + { + self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64)); + } + + /// Panics if the parent branch cannot be found. + pub fn branch(&mut self, parent: &Q, name: K) + where + K: std::borrow::Borrow, + Q: std::hash::Hash + Eq, + { + // Find the right segment + let branchseg_id = *self + .branches + .get(parent) + .expect("should had found the parent by key"); + let _branchseg = &mut self.segments[branchseg_id]; + + // Create branch name for it + self.branches.insert(name, branchseg_id); + } + + pub fn calculate(&mut self, retention_period: u64) -> SegmentSize { + // Phase 1: Mark all the segments that need to be retained + for (_branch, &last_seg_id) in self.branches.iter() { + let last_seg = &self.segments[last_seg_id]; + let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period); + let mut seg_id = last_seg_id; + loop { + let seg = &mut self.segments[seg_id]; + if seg.end_lsn < cutoff_lsn { + break; + } + seg.needed = true; + if let Some(prev_seg_id) = seg.parent { + seg_id = prev_seg_id; + } else { + break; + } + } + } + + // Phase 2: For each oldest segment in a chain that needs to be retained, + // calculate if we should store snapshot or WAL + self.size_from_snapshot_later(0) + } + + fn size_from_wal(&self, seg_id: usize) -> SegmentSize { + let seg = &self.segments[seg_id]; + + let this_size = seg.end_lsn - seg.start_lsn; + + let mut children = Vec::new(); + + // try both ways + for &child_id in seg.children_after.iter() { + // try each child both ways + let child = &self.segments[child_id]; + let p1 = self.size_from_wal(child_id); + + let p = if !child.needed { + let p2 = self.size_from_snapshot_later(child_id); + if p1.total() < p2.total() { + p1 + } else { + p2 + } + } else { + p1 + }; + children.push(p); + } + SegmentSize { + seg_id, + method: if seg.needed { WalNeeded } else { Wal }, + this_size, + children, + } + } + + fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize { + // If this is needed, then it's time to do the snapshot and continue + // with wal method. 
+ let seg = &self.segments[seg_id]; + //eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed); + if seg.needed { + let mut children = Vec::new(); + + for &child_id in seg.children_after.iter() { + // try each child both ways + let child = &self.segments[child_id]; + let p1 = self.size_from_wal(child_id); + + let p = if !child.needed { + let p2 = self.size_from_snapshot_later(child_id); + if p1.total() < p2.total() { + p1 + } else { + p2 + } + } else { + p1 + }; + children.push(p); + } + SegmentSize { + seg_id, + method: WalNeeded, + this_size: seg.start_size, + children, + } + } else { + // If any of the direct children are "needed", need to be able to reconstruct here + let mut children_needed = false; + for &child in seg.children_after.iter() { + let seg = &self.segments[child]; + if seg.needed { + children_needed = true; + break; + } + } + + let method1 = if !children_needed { + let mut children = Vec::new(); + for child in seg.children_after.iter() { + children.push(self.size_from_snapshot_later(*child)); + } + Some(SegmentSize { + seg_id, + method: Skipped, + this_size: 0, + children, + }) + } else { + None + }; + + // If this a junction, consider snapshotting here + let method2 = if children_needed || seg.children_after.len() >= 2 { + let mut children = Vec::new(); + for child in seg.children_after.iter() { + children.push(self.size_from_wal(*child)); + } + Some(SegmentSize { + seg_id, + method: SnapshotAfter, + this_size: seg.end_size, + children, + }) + } else { + None + }; + + match (method1, method2) { + (None, None) => panic!(), + (Some(method), None) => method, + (None, Some(method)) => method, + (Some(method1), Some(method2)) => { + if method1.total() < method2.total() { + method1 + } else { + method2 + } + } + } + } + } + + pub fn into_segments(self) -> Vec { + self.segments + } +} diff --git a/libs/tenant_size_model/src/main.rs b/libs/tenant_size_model/src/main.rs new file mode 100644 index 0000000000..47c0e8122f --- /dev/null +++ b/libs/tenant_size_model/src/main.rs @@ -0,0 +1,268 @@ +//! Tenant size model testing ground. +//! +//! Has a number of scenarios and a `main` for invoking these by number, calculating the history +//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios +//! into pngs. + +use tenant_size_model::{Segment, SegmentSize, Storage}; + +// Main branch only. Some updates on it. +fn scenario_1() -> (Vec, SegmentSize) { + // Create main branch + let mut storage = Storage::new("main"); + + // Bulk load 5 GB of data to it + storage.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + storage.update("main", 1_000); + } + + let size = storage.calculate(1000); + + (storage.into_segments(), size) +} + +// Main branch only. Some updates on it. 
+fn scenario_2() -> (Vec, SegmentSize) { + // Create main branch + let mut storage = Storage::new("main"); + + // Bulk load 5 GB of data to it + storage.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + storage.update("main", 1_000); + } + + // Branch + storage.branch("main", "child"); + storage.update("child", 1_000); + + // More updates on parent + storage.update("main", 1_000); + + let size = storage.calculate(1000); + + (storage.into_segments(), size) +} + +// Like 2, but more updates on main +fn scenario_3() -> (Vec, SegmentSize) { + // Create main branch + let mut storage = Storage::new("main"); + + // Bulk load 5 GB of data to it + storage.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + storage.update("main", 1_000); + } + + // Branch + storage.branch("main", "child"); + storage.update("child", 1_000); + + // More updates on parent + for _ in 0..5 { + storage.update("main", 1_000); + } + + let size = storage.calculate(1000); + + (storage.into_segments(), size) +} + +// Diverged branches +fn scenario_4() -> (Vec, SegmentSize) { + // Create main branch + let mut storage = Storage::new("main"); + + // Bulk load 5 GB of data to it + storage.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + storage.update("main", 1_000); + } + + // Branch + storage.branch("main", "child"); + storage.update("child", 1_000); + + // More updates on parent + for _ in 0..8 { + storage.update("main", 1_000); + } + + let size = storage.calculate(1000); + + (storage.into_segments(), size) +} + +fn scenario_5() -> (Vec, SegmentSize) { + let mut storage = Storage::new("a"); + storage.insert("a", 5000); + storage.branch("a", "b"); + storage.update("b", 4000); + storage.update("a", 2000); + storage.branch("a", "c"); + storage.insert("c", 4000); + storage.insert("a", 2000); + + let size = storage.calculate(5000); + + (storage.into_segments(), size) +} + +fn scenario_6() -> (Vec, SegmentSize) { + use std::borrow::Cow; + + const NO_OP: Cow<'static, str> = Cow::Borrowed(""); + + let branches = [ + Some(0x7ff1edab8182025f15ae33482edb590a_u128), + Some(0xb1719e044db05401a05a2ed588a3ad3f), + Some(0xb68d6691c895ad0a70809470020929ef), + ]; + + // compared to other scenarios, this one uses bytes instead of kB + + let mut storage = Storage::new(None); + + storage.branch(&None, branches[0]); // at 0 + storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064 + storage.branch(&branches[0], branches[1]); // at 108951064 + storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472 + storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424 + storage.branch(&branches[0], branches[2]); // at 283415424 + storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616 + storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400 + + let size = storage.calculate(100_000); + + (storage.into_segments(), size) +} + +fn main() { + let args: Vec = std::env::args().collect(); + + let scenario = if args.len() < 2 { "1" } else { &args[1] }; + + let (segments, size) = match scenario { + "1" => scenario_1(), + "2" => scenario_2(), + "3" => scenario_3(), + "4" => scenario_4(), + "5" => scenario_5(), + "6" => scenario_6(), + other => { + eprintln!("invalid scenario {}", other); + std::process::exit(1); + } + }; + + graphviz_tree(&segments, &size); +} + +fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) { + use tenant_size_model::SegmentMethod::*; + + let seg_id = node.seg_id; + let seg 
= segments.get(seg_id).unwrap(); + let lsn = seg.end_lsn; + let size = seg.end_size; + let method = node.method; + + println!(" {{"); + println!(" node [width=0.1 height=0.1 shape=oval]"); + + let tenant_size = node.total_children(); + + let penwidth = if seg.needed { 6 } else { 3 }; + let x = match method { + SnapshotAfter => + format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"), + Wal => + format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"), + WalNeeded => + format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"), + Skipped => + format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"), + }; + + println!(" \"seg{seg_id}\" [{x}]"); + println!(" }}"); + + // Recurse. Much of the data is actually on the edge + for child in node.children.iter() { + let child_id = child.seg_id; + graphviz_recurse(segments, child); + + let edge_color = match child.method { + SnapshotAfter => "gray", + Wal => "black", + WalNeeded => "black", + Skipped => "gray", + }; + + println!(" {{"); + println!(" edge [] "); + print!(" \"seg{seg_id}\" -> \"seg{child_id}\" ["); + print!("color={edge_color}"); + if child.method == WalNeeded { + print!(" penwidth=6"); + } + if child.method == Wal { + print!(" penwidth=3"); + } + + let next = segments.get(child_id).unwrap(); + + if next.op.is_empty() { + print!( + " label=\"{} / {}\"", + next.end_lsn - seg.end_lsn, + (next.end_size as i128 - seg.end_size as i128) + ); + } else { + print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn); + } + println!("]"); + println!(" }}"); + } +} + +fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) { + println!("digraph G {{"); + println!(" fontname=\"Helvetica,Arial,sans-serif\""); + println!(" node [fontname=\"Helvetica,Arial,sans-serif\"]"); + println!(" edge [fontname=\"Helvetica,Arial,sans-serif\"]"); + println!(" graph [center=1 rankdir=LR]"); + println!(" edge [dir=none]"); + + graphviz_recurse(segments, tree); + + println!("}}"); +} + +#[test] +fn scenarios_return_same_size() { + type ScenarioFn = fn() -> (Vec, SegmentSize); + let truths: &[(u32, ScenarioFn, _)] = &[ + (line!(), scenario_1, 8000), + (line!(), scenario_2, 9000), + (line!(), scenario_3, 13000), + (line!(), scenario_4, 16000), + (line!(), scenario_5, 17000), + (line!(), scenario_6, 333_792_000), + ]; + + for (line, scenario, expected) in truths { + let (_, size) = scenario(); + assert_eq!(*expected, size.total_children(), "scenario on line {line}"); + } +} diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 289cec12a8..39fed8670d 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -13,7 +13,7 @@ use crate::seqwait::MonotonicCounter; pub const XLOG_BLCKSZ: u32 = 8192; /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr -#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Serialize, Deserialize)] +#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)] #[serde(transparent)] pub struct Lsn(pub u64); diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 4262ca9820..7ce936ca27 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -63,6 +63,7 @@ etcd_broker = { path = "../libs/etcd_broker" } metrics = { path = "../libs/metrics" } utils = { path = "../libs/utils" } remote_storage = { path = "../libs/remote_storage" } +tenant_size_model = { 
path = "../libs/tenant_size_model" } workspace_hack = { version = "0.1", path = "../workspace_hack" } close_fds = "0.3.2" walkdir = "2.3.2" diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 6a372fb081..747e63af2b 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -9,6 +9,7 @@ use remote_storage::RemoteStorageConfig; use std::env; use utils::crashsafe::path_with_suffix_extension; +use std::num::NonZeroUsize; use std::path::{Path, PathBuf}; use std::str::FromStr; use std::time::Duration; @@ -48,6 +49,9 @@ pub mod defaults { pub const DEFAULT_LOG_FORMAT: &str = "plain"; + pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = + super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); + /// /// Default built-in configuration file. /// @@ -67,6 +71,9 @@ pub mod defaults { #initial_superuser_name = '{DEFAULT_SUPERUSER}' #log_format = '{DEFAULT_LOG_FORMAT}' + +#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' + # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -132,6 +139,9 @@ pub struct PageServerConf { pub broker_endpoints: Vec, pub log_format: LogFormat, + + /// Number of concurrent [`Tenant::gather_size_inputs`] allowed. + pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, } #[derive(Debug, Clone, PartialEq, Eq)] @@ -200,6 +210,8 @@ struct PageServerConfigBuilder { broker_endpoints: BuilderValue>, log_format: BuilderValue, + + concurrent_tenant_size_logical_size_queries: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -228,6 +240,8 @@ impl Default for PageServerConfigBuilder { broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()), broker_endpoints: Set(Vec::new()), log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), + + concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()), } } } @@ -304,6 +318,10 @@ impl PageServerConfigBuilder { self.log_format = BuilderValue::Set(log_format) } + pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) { + self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); + } + pub fn build(self) -> anyhow::Result { let broker_endpoints = self .broker_endpoints @@ -349,6 +367,11 @@ impl PageServerConfigBuilder { .broker_etcd_prefix .ok_or(anyhow!("missing broker_etcd_prefix"))?, log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, + concurrent_tenant_size_logical_size_queries: self + .concurrent_tenant_size_logical_size_queries + .ok_or(anyhow!( + "missing concurrent_tenant_size_logical_size_queries" + ))?, }) } } @@ -476,6 +499,12 @@ impl PageServerConf { "log_format" => builder.log_format( LogFormat::from_config(&parse_toml_string(key, item)?)? 
), + "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({ + let input = parse_toml_string(key, item)?; + let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; + let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?; + ConfigurableSemaphore::new(permits) + }), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -589,6 +618,7 @@ impl PageServerConf { broker_endpoints: Vec::new(), broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), + concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), } } } @@ -654,6 +684,58 @@ fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result> { .collect() } +/// Configurable semaphore permits setting. +/// +/// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty +/// semaphore cannot be distinguished, leading any feature using these to await forever (or until +/// new permits are added). +#[derive(Debug, Clone)] +pub struct ConfigurableSemaphore { + initial_permits: NonZeroUsize, + inner: std::sync::Arc, +} + +impl ConfigurableSemaphore { + pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) { + Some(x) => x, + None => panic!("const unwrap is not yet stable"), + }; + + /// Initializse using a non-zero amount of permits. + /// + /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a + /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will + /// behave like [`futures::future::pending`], just waiting until new permits are added. 
+ pub fn new(initial_permits: NonZeroUsize) -> Self { + ConfigurableSemaphore { + initial_permits, + inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())), + } + } +} + +impl Default for ConfigurableSemaphore { + fn default() -> Self { + Self::new(Self::DEFAULT_INITIAL) + } +} + +impl PartialEq for ConfigurableSemaphore { + fn eq(&self, other: &Self) -> bool { + // the number of permits can be increased at runtime, so we cannot really fulfill the + // PartialEq value equality otherwise + self.initial_permits == other.initial_permits + } +} + +impl Eq for ConfigurableSemaphore {} + +impl ConfigurableSemaphore { + pub fn inner(&self) -> &std::sync::Arc { + &self.inner + } +} + #[cfg(test)] mod tests { use std::{ @@ -725,6 +807,7 @@ log_format = 'json' .expect("Failed to parse a valid broker endpoint URL")], broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), + concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -770,6 +853,7 @@ log_format = 'json' .expect("Failed to parse a valid broker endpoint URL")], broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), log_format: LogFormat::Json, + concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 89609f5674..1bb5f94f4e 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -354,6 +354,54 @@ paths: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/size: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + get: + description: | + Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes). + responses: + "200": + description: OK, + content: + application/json: + schema: + type: object + required: + - id + - size + properties: + id: + type: string + format: hex + size: + type: integer + description: | + Size metric in bytes. 
+ "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/timeline/: parameters: - name: tenant_id diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d88cf6e075..7087c68dbd 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -566,6 +566,44 @@ async fn tenant_status(request: Request) -> Result, ApiErro ) } +async fn tenant_size_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::InternalServerError)?; + + // this can be long operation, it currently is not backed by any request coalescing or similar + let inputs = tenant + .gather_size_inputs() + .await + .map_err(ApiError::InternalServerError)?; + + let size = inputs.calculate().map_err(ApiError::InternalServerError)?; + + /// Private response type with the additional "unstable" `inputs` field. + /// + /// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is + /// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`. + #[serde_with::serde_as] + #[derive(serde::Serialize)] + struct TenantHistorySize { + #[serde_as(as = "serde_with::DisplayFromStr")] + id: TenantId, + /// Size is a mixture of WAL and logical size, so the unit is bytes. + size: u64, + inputs: crate::tenant::size::ModelInputs, + } + + json_response( + StatusCode::OK, + TenantHistorySize { + id: tenant_id, + size, + inputs, + }, + ) +} + // Helper function to standardize the error messages we produce on bad durations // // Intended to be used with anyhow's `with_context`, e.g.: @@ -893,6 +931,7 @@ pub fn make_router( .get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) .get("/v1/tenant/:tenant_id", tenant_status) + .get("/v1/tenant/:tenant_id/size", tenant_size_handler) .put("/v1/tenant/config", tenant_config_handler) .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 7ae2d0f14c..43586b926d 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -31,6 +31,7 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[ "compact", "create images", "init logical size", + "logical size", "load layer map", "gc", ]; @@ -365,6 +366,7 @@ pub struct TimelineMetrics { pub compact_time_histo: Histogram, pub create_images_time_histo: Histogram, pub init_logical_size_histo: Histogram, + pub logical_size_histo: Histogram, pub load_layer_map_histo: Histogram, pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, @@ -397,6 +399,9 @@ impl TimelineMetrics { let init_logical_size_histo = STORAGE_TIME .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id]) .unwrap(); + let logical_size_histo = STORAGE_TIME + .get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id]) + .unwrap(); let load_layer_map_histo = STORAGE_TIME .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id]) 
.unwrap(); @@ -428,6 +433,7 @@ impl TimelineMetrics { compact_time_histo, create_images_time_histo, init_logical_size_histo, + logical_size_histo, load_layer_map_histo, last_record_gauge, wait_lsn_time_histo, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ed9d2e8c7a..4621482065 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -72,6 +72,8 @@ pub mod storage_layer; mod timeline; +pub mod size; + use storage_layer::Layer; pub use timeline::Timeline; @@ -120,6 +122,9 @@ pub struct Tenant { /// Makes every timeline to backup their files to remote storage. upload_layers: bool, + + /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`]. + cached_logical_sizes: tokio::sync::Mutex>, } /// A timeline with some of its files on disk, being initialized. @@ -834,6 +839,7 @@ impl Tenant { remote_index, upload_layers, state, + cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), } } @@ -955,8 +961,9 @@ impl Tenant { // +-----baz--------> // // - // 1. Grab 'gc_cs' mutex to prevent new timelines from being created - // 2. Scan all timelines, and on each timeline, make note of the + // 1. Grab 'gc_cs' mutex to prevent new timelines from being created while Timeline's + // `gc_infos` are being refreshed + // 2. Scan collected timelines, and on each timeline, make note of the // all the points where other timelines have been branched off. // We will refrain from removing page versions at those LSNs. // 3. For each timeline, scan all layer files on the timeline. @@ -977,6 +984,68 @@ impl Tenant { let mut totals: GcResult = Default::default(); let now = Instant::now(); + let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?; + + // Perform GC for each timeline. + // + // Note that we don't hold the GC lock here because we don't want + // to delay the branch creation task, which requires the GC lock. + // A timeline GC iteration can be slow because it may need to wait for + // compaction (both require `layer_removal_cs` lock), + // but the GC iteration can run concurrently with branch creation. + // + // See comments in [`Tenant::branch_timeline`] for more information + // about why branch creation task can run concurrently with timeline's GC iteration. + for timeline in gc_timelines { + if task_mgr::is_shutdown_requested() { + // We were requested to shut down. Stop and return with the progress we + // made. + break; + } + + // If requested, force flush all in-memory layers to disk first, + // so that they too can be garbage collected. That's + // used in tests, so we want as deterministic results as possible. + if checkpoint_before_gc { + timeline.checkpoint(CheckpointConfig::Forced)?; + info!( + "timeline {} checkpoint_before_gc done", + timeline.timeline_id + ); + } + + let result = timeline.gc()?; + totals += result; + } + + totals.elapsed = now.elapsed(); + Ok(totals) + } + + /// Refreshes the Timeline::gc_info for all timelines, returning the + /// vector of timelines which have [`Timeline::get_last_record_lsn`] past + /// [`Tenant::get_gc_horizon`]. + /// + /// This is usually executed as part of periodic gc, but can now be triggered more often. + pub fn refresh_gc_info(&self) -> anyhow::Result>> { + // since this method can now be called at different rates than the configured gc loop, it + // might be that these configuration values get applied faster than what it was previously, + // since these were only read from the gc task. 
+ let horizon = self.get_gc_horizon(); + let pitr = self.get_pitr_interval(); + + // refresh all timelines + let target_timeline_id = None; + + self.refresh_gc_info_internal(target_timeline_id, horizon, pitr) + } + + fn refresh_gc_info_internal( + &self, + target_timeline_id: Option, + horizon: u64, + pitr: Duration, + ) -> anyhow::Result>> { // grab mutex to prevent new timelines from being created here. let gc_cs = self.gc_cs.lock().unwrap(); @@ -995,9 +1064,6 @@ impl Tenant { timelines .iter() .map(|(timeline_id, timeline_entry)| { - // This is unresolved question for now, how to do gc in presence of remote timelines - // especially when this is combined with branching. - // Somewhat related: https://github.com/neondatabase/neon/issues/999 if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { // If target_timeline is specified, we only need to know branchpoints of its children if let Some(timeline_id) = target_timeline_id { @@ -1052,40 +1118,7 @@ impl Tenant { } drop(gc_cs); - // Perform GC for each timeline. - // - // Note that we don't hold the GC lock here because we don't want - // to delay the branch creation task, which requires the GC lock. - // A timeline GC iteration can be slow because it may need to wait for - // compaction (both require `layer_removal_cs` lock), - // but the GC iteration can run concurrently with branch creation. - // - // See comments in [`Tenant::branch_timeline`] for more information - // about why branch creation task can run concurrently with timeline's GC iteration. - for timeline in gc_timelines { - if task_mgr::is_shutdown_requested() { - // We were requested to shut down. Stop and return with the progress we - // made. - break; - } - - // If requested, force flush all in-memory layers to disk first, - // so that they too can be garbage collected. That's - // used in tests, so we want as deterministic results as possible. - if checkpoint_before_gc { - timeline.checkpoint(CheckpointConfig::Forced)?; - info!( - "timeline {} checkpoint_before_gc done", - timeline.timeline_id - ); - } - - let result = timeline.gc()?; - totals += result; - } - - totals.elapsed = now.elapsed(); - Ok(totals) + Ok(gc_timelines) } /// Branch an existing timeline @@ -1444,6 +1477,25 @@ impl Tenant { Ok(()) } + + /// Gathers inputs from all of the timelines to produce a sizing model input. + /// + /// Future is cancellation safe. Only one calculation can be running at once per tenant. + #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] + pub async fn gather_size_inputs(&self) -> anyhow::Result { + let logical_sizes_at_once = self + .conf + .concurrent_tenant_size_logical_size_queries + .inner(); + + // TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries + // are for testing/experimenting, we tolerate this. + // + // See more for on the issue #2748 condenced out of the initial PR review. 
+ let mut shared_cache = self.cached_logical_sizes.lock().await; + + size::gather_inputs(self, logical_sizes_at_once, &mut *shared_cache).await + } } /// Create the cluster temporarily in 'initdbpath' directory inside the repository diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs new file mode 100644 index 0000000000..f0c611ae39 --- /dev/null +++ b/pageserver/src/tenant/size.rs @@ -0,0 +1,454 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use anyhow::Context; +use tokio::sync::Semaphore; + +use super::Tenant; +use utils::id::TimelineId; +use utils::lsn::Lsn; + +use tracing::*; + +/// Inputs to the actual tenant sizing model +/// +/// Implements [`serde::Serialize`] but is not meant to be part of the public API, instead meant to +/// be a transferrable format between execution environments and developer. +#[serde_with::serde_as] +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct ModelInputs { + updates: Vec, + retention_period: u64, + #[serde_as(as = "HashMap")] + timeline_inputs: HashMap, +} + +/// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as +/// part of [`ModelInputs`] from the HTTP api, explaining the inputs. +#[serde_with::serde_as] +#[derive(Debug, serde::Serialize, serde::Deserialize)] +struct TimelineInputs { + #[serde_as(as = "serde_with::DisplayFromStr")] + last_record: Lsn, + #[serde_as(as = "serde_with::DisplayFromStr")] + latest_gc_cutoff: Lsn, + #[serde_as(as = "serde_with::DisplayFromStr")] + horizon_cutoff: Lsn, + #[serde_as(as = "serde_with::DisplayFromStr")] + pitr_cutoff: Lsn, + #[serde_as(as = "serde_with::DisplayFromStr")] + next_gc_cutoff: Lsn, +} + +pub(super) async fn gather_inputs( + tenant: &Tenant, + limit: &Arc, + logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, +) -> anyhow::Result { + // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to + // our advantage with `?` error handling. + let mut joinset = tokio::task::JoinSet::new(); + + let timelines = tenant + .refresh_gc_info() + .context("Failed to refresh gc_info before gathering inputs")?; + + if timelines.is_empty() { + // All timelines are below tenant's gc_horizon; alternative would be to use + // Tenant::list_timelines but then those gc_info's would not be updated yet, possibly + // missing GcInfo::retain_lsns or having obsolete values for cutoff's. + return Ok(ModelInputs { + updates: vec![], + retention_period: 0, + timeline_inputs: HashMap::new(), + }); + } + + // record the used/inserted cache keys here, to remove extras not to start leaking + // after initial run the cache should be quite stable, but live timelines will eventually + // require new lsns to be inspected. 
+ let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new(); + + let mut updates = Vec::new(); + + // record the per timline values used to determine `retention_period` + let mut timeline_inputs = HashMap::with_capacity(timelines.len()); + + // used to determine the `retention_period` for the size model + let mut max_cutoff_distance = None; + + // this will probably conflict with on-demand downloaded layers, or at least force them all + // to be downloaded + for timeline in timelines { + let last_record_lsn = timeline.get_last_record_lsn(); + + let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = { + // there's a race between the update (holding tenant.gc_lock) and this read but it + // might not be an issue, because it's not for Timeline::gc + let gc_info = timeline.gc_info.read().unwrap(); + + // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a + // new gc run, which we have no control over. + // maybe this should be moved to gc_info.next_gc_cutoff()? + let next_gc_cutoff = std::cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); + + let maybe_cutoff = if next_gc_cutoff > timeline.get_ancestor_lsn() { + // only include these if they are after branching point; otherwise we would end up + // with duplicate updates before the actual branching. + Some((next_gc_cutoff, LsnKind::GcCutOff)) + } else { + None + }; + + // this assumes there are no other lsns than the branchpoints + let lsns = gc_info + .retain_lsns + .iter() + .inspect(|&&lsn| { + trace!( + timeline_id=%timeline.timeline_id, + "retained lsn: {lsn:?}, is_before_ancestor_lsn={}", + lsn < timeline.get_ancestor_lsn() + ) + }) + .filter(|&&lsn| lsn > timeline.get_ancestor_lsn()) + .copied() + .map(|lsn| (lsn, LsnKind::BranchPoint)) + .chain(maybe_cutoff) + .collect::>(); + + ( + lsns, + gc_info.horizon_cutoff, + gc_info.pitr_cutoff, + next_gc_cutoff, + ) + }; + + // update this to have a retention_period later for the tenant_size_model + // tenant_size_model compares this to the last segments start_lsn + if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) { + match max_cutoff_distance.as_mut() { + Some(max) => { + *max = std::cmp::max(*max, cutoff_distance); + } + _ => { + max_cutoff_distance = Some(cutoff_distance); + } + } + } + + // all timelines branch from something, because it might be impossible to pinpoint + // which is the tenant_size_model's "default" branch. 
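
The `retention_period` bookkeeping above reduces to: per timeline, take min(horizon_cutoff, pitr_cutoff) as the next GC cutoff, measure how far last_record is ahead of it, and keep the maximum of those distances. A simplified sketch with plain `u64`s standing in for `Lsn` (illustrative only, not the pageserver code):

// (last_record, horizon_cutoff, pitr_cutoff) per timeline, as plain u64s.
fn retention_period(timelines: &[(u64, u64, u64)]) -> Option<u64> {
    let mut max_cutoff_distance: Option<u64> = None;
    for &(last_record, horizon_cutoff, pitr_cutoff) in timelines {
        // the next GC run will cut at whichever cutoff is lower
        let next_gc_cutoff = horizon_cutoff.min(pitr_cutoff);
        // checked_sub: a cutoff at or ahead of last_record contributes nothing
        if let Some(distance) = last_record.checked_sub(next_gc_cutoff) {
            max_cutoff_distance = Some(max_cutoff_distance.map_or(distance, |m| m.max(distance)));
        }
    }
    max_cutoff_distance
}

fn main() {
    let timelines = [(1000, 900, 950), (2000, 1700, 1800)];
    // distances are 100 and 300, so the model keeps 300 "bytes" of history
    assert_eq!(retention_period(&timelines), Some(300));
}
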
+ updates.push(Update { + lsn: timeline.get_ancestor_lsn(), + command: Command::BranchFrom(timeline.get_ancestor_timeline_id()), + timeline_id: timeline.timeline_id, + }); + + for (lsn, _kind) in &interesting_lsns { + if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) { + updates.push(Update { + lsn: *lsn, + timeline_id: timeline.timeline_id, + command: Command::Update(*size), + }); + + needed_cache.insert((timeline.timeline_id, *lsn)); + } else { + let timeline = Arc::clone(&timeline); + let parallel_size_calcs = Arc::clone(limit); + joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn)); + } + } + + timeline_inputs.insert( + timeline.timeline_id, + TimelineInputs { + last_record: last_record_lsn, + // this is not used above, because it might not have updated recently enough + latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), + horizon_cutoff, + pitr_cutoff, + next_gc_cutoff, + }, + ); + } + + let mut have_any_error = false; + + while let Some(res) = joinset.join_next().await { + // each of these come with Result, JoinError> + // because of spawn + spawn_blocking + let res = res.and_then(|inner| inner); + match res { + Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => { + debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated"); + + logical_size_cache.insert((timeline.timeline_id, lsn), size); + needed_cache.insert((timeline.timeline_id, lsn)); + + updates.push(Update { + lsn, + timeline_id: timeline.timeline_id, + command: Command::Update(size), + }); + } + Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => { + warn!( + timeline_id=%timeline.timeline_id, + "failed to calculate logical size at {lsn}: {error:#}" + ); + have_any_error = true; + } + Err(join_error) if join_error.is_cancelled() => { + unreachable!("we are not cancelling any of the futures, nor should be"); + } + Err(join_error) => { + // cannot really do anything, as this panic is likely a bug + error!("logical size query panicked: {join_error:#}"); + have_any_error = true; + } + } + } + + // prune any keys not needed anymore; we record every used key and added key. + logical_size_cache.retain(|key, _| needed_cache.contains(key)); + + if have_any_error { + // we cannot complete this round, because we are missing data. + // we have however cached all we were able to request calculation on. + anyhow::bail!("failed to calculate some logical_sizes"); + } + + // the data gathered to updates is per lsn, regardless of the branch, so we can use it to + // our advantage, not requiring a sorted container or graph walk. + // + // for branch points, which come as multiple updates at the same LSN, the Command::Update + // is needed before a branch is made out of that branch Command::BranchFrom. this is + // handled by the variant order in `Command`. + updates.sort_unstable(); + + let retention_period = match max_cutoff_distance { + Some(max) => max.0, + None => { + anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0") + } + }; + + Ok(ModelInputs { + updates, + retention_period, + timeline_inputs, + }) +} + +impl ModelInputs { + pub fn calculate(&self) -> anyhow::Result { + // Option is used for "naming" the branches because it is assumed to be + // impossible to always determine the a one main branch. + let mut storage = tenant_size_model::Storage::>::new(None); + + // tracking these not to require modifying the current implementation of the size model, + // which works in relative LSNs and sizes. 
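
The sort relies on the trick mentioned above: `#[derive(Ord)]` on an enum orders variants by declaration order, so at an equal LSN an `Update` sorts before a `BranchFrom`. A tiny sketch with a simplified `Command` (plain `u64` for the LSN and `u32` for the timeline id, both invented for the example):

#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
enum Command {
    Update(u64),
    BranchFrom(Option<u32>),
}

fn main() {
    // At the same LSN, the derived ordering puts Update before BranchFrom,
    // purely because of variant declaration order.
    let mut updates = vec![
        (100u64, Command::BranchFrom(Some(1))),
        (100u64, Command::Update(42)),
    ];
    updates.sort_unstable();
    assert_eq!(updates[0].1, Command::Update(42));
    assert_eq!(updates[1].1, Command::BranchFrom(Some(1)));
}

Because of this, a branch sees its parent's size update applied before the branch itself is created, with no explicit tie-breaking logic.
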
+        let mut last_state: HashMap<TimelineId, (Lsn, u64)> = HashMap::new();
+
+        for update in &self.updates {
+            let Update {
+                lsn,
+                command: op,
+                timeline_id,
+            } = update;
+            match op {
+                Command::Update(sz) => {
+                    let latest = last_state.get_mut(timeline_id).ok_or_else(|| {
+                        anyhow::anyhow!(
+                            "ordering-mismatch: there must have been a previous state for {timeline_id}"
+                        )
+                    })?;
+
+                    let lsn_bytes = {
+                        let Lsn(now) = lsn;
+                        let Lsn(prev) = latest.0;
+                        debug_assert!(prev <= *now, "self.updates should have been sorted");
+                        now - prev
+                    };
+
+                    let size_diff =
+                        i64::try_from(*sz as i128 - latest.1 as i128).with_context(|| {
+                            format!("size difference i64 overflow for {timeline_id}")
+                        })?;
+
+                    storage.modify_branch(&Some(*timeline_id), "".into(), lsn_bytes, size_diff);
+                    *latest = (*lsn, *sz);
+                }
+                Command::BranchFrom(parent) => {
+                    storage.branch(parent, Some(*timeline_id));
+
+                    let size = parent
+                        .as_ref()
+                        .and_then(|id| last_state.get(id))
+                        .map(|x| x.1)
+                        .unwrap_or(0);
+                    last_state.insert(*timeline_id, (*lsn, size));
+                }
+            }
+        }
+
+        Ok(storage.calculate(self.retention_period).total_children())
+    }
+}
+
+/// Single size model update.
+///
+/// Sizing model works with relative increments over latest branch state.
+/// Updates are absolute, so additional state needs to be tracked when applying.
+#[serde_with::serde_as]
+#[derive(
+    Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize,
+)]
+struct Update {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    lsn: utils::lsn::Lsn,
+    command: Command,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    timeline_id: TimelineId,
+}
+
+#[serde_with::serde_as]
+#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "snake_case")]
+enum Command {
+    Update(u64),
+    BranchFrom(#[serde_as(as = "Option<serde_with::DisplayFromStr>")] Option<TimelineId>),
+}
+
+impl std::fmt::Debug for Command {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // a custom one-line implementation makes {:#?} output more readable by avoiding 3
+        // line breaks per command
+        match self {
+            Self::Update(arg0) => write!(f, "Update({arg0})"),
+            Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"),
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+enum LsnKind {
+    BranchPoint,
+    GcCutOff,
+}
+
+/// Newtype around the tuple that carries the timeline at lsn logical size calculation. 
+struct TimelineAtLsnSizeResult( + Arc, + utils::lsn::Lsn, + anyhow::Result, +); + +#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))] +async fn calculate_logical_size( + limit: Arc, + timeline: Arc, + lsn: utils::lsn::Lsn, +) -> Result { + let permit = tokio::sync::Semaphore::acquire_owned(limit) + .await + .expect("global semaphore should not had been closed"); + + tokio::task::spawn_blocking(move || { + let _permit = permit; + let size_res = timeline.calculate_logical_size(lsn); + TimelineAtLsnSizeResult(timeline, lsn, size_res) + }) + .await +} + +#[test] +fn updates_sort() { + use std::str::FromStr; + use utils::id::TimelineId; + use utils::lsn::Lsn; + + let ids = [ + TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(), + TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(), + TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(), + ]; + + // try through all permutations + let ids = [ + [&ids[0], &ids[1], &ids[2]], + [&ids[0], &ids[2], &ids[1]], + [&ids[1], &ids[0], &ids[2]], + [&ids[1], &ids[2], &ids[0]], + [&ids[2], &ids[0], &ids[1]], + [&ids[2], &ids[1], &ids[0]], + ]; + + for ids in ids { + // apply a fixture which uses a permutation of ids + let commands = [ + Update { + lsn: Lsn(0), + command: Command::BranchFrom(None), + timeline_id: *ids[0], + }, + Update { + lsn: Lsn::from_str("0/67E7618").unwrap(), + command: Command::Update(43696128), + timeline_id: *ids[0], + }, + Update { + lsn: Lsn::from_str("0/67E7618").unwrap(), + command: Command::BranchFrom(Some(*ids[0])), + timeline_id: *ids[1], + }, + Update { + lsn: Lsn::from_str("0/76BE4F0").unwrap(), + command: Command::Update(41844736), + timeline_id: *ids[1], + }, + Update { + lsn: Lsn::from_str("0/10E49380").unwrap(), + command: Command::Update(42164224), + timeline_id: *ids[0], + }, + Update { + lsn: Lsn::from_str("0/10E49380").unwrap(), + command: Command::BranchFrom(Some(*ids[0])), + timeline_id: *ids[2], + }, + Update { + lsn: Lsn::from_str("0/11D74910").unwrap(), + command: Command::Update(42172416), + timeline_id: *ids[2], + }, + Update { + lsn: Lsn::from_str("0/12051E98").unwrap(), + command: Command::Update(42196992), + timeline_id: *ids[0], + }, + ]; + + let mut sorted = commands; + + // these must sort in the same order, regardless of how the ids sort + // which is why the timeline_id is the last field + sorted.sort_unstable(); + + assert_eq!(commands, sorted, "{:#?} vs. 
{:#?}", commands, sorted); + } +} + +#[test] +fn verify_size_for_multiple_branches() { + // this is generated from integration test test_tenant_size_with_multiple_branches, but this way + // it has the stable lsn's + let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#; + + let inputs: ModelInputs = serde_json::from_str(doc).unwrap(); + + assert_eq!(inputs.calculate().unwrap(), 36_409_872); +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d63429ea6a..279da70128 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -272,6 +272,11 @@ impl LogicalSize { self.size_added_after_initial .fetch_add(delta, AtomicOrdering::SeqCst); } + + /// Returns the initialized (already calculated) value, if any. + fn initialized_size(&self) -> Option { + self.initial_logical_size.get().copied() + } } pub struct WalReceiverInfo { @@ -979,9 +984,26 @@ impl Timeline { /// Calculate the logical size of the database at the latest LSN. /// /// NOTE: counted incrementally, includes ancestors, this can be a slow operation. - fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result { - info!("Calculating logical size for timeline {}", self.timeline_id); - let timer = self.metrics.init_logical_size_histo.start_timer(); + pub fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result { + info!( + "Calculating logical size for timeline {} at {}", + self.timeline_id, up_to_lsn + ); + let timer = if up_to_lsn == self.initdb_lsn { + if let Some(size) = self.current_logical_size.initialized_size() { + if size != 0 { + // non-zero size means that the size has already been calculated by this method + // after startup. if the logical size is for a new timeline without layers the + // size will be zero, and we cannot use that, or this caching strategy until + // pageserver restart. 
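
The caching rule spelled out in this comment (reuse the startup-calculated size only when it is non-zero, because zero may just mean a brand-new timeline with no layers yet) can be sketched in isolation. Illustrative only, assuming the `once_cell` crate; `LogicalSize` below is a stripped-down stand-in, not the pageserver struct:

use once_cell::sync::OnceCell;

struct LogicalSize {
    // set once by the startup size calculation, if it ran
    initial: OnceCell<u64>,
}

impl LogicalSize {
    fn initialized_size(&self) -> Option<u64> {
        self.initial.get().copied()
    }
}

fn calculate(ls: &LogicalSize, recompute: impl Fn() -> u64) -> u64 {
    if let Some(size) = ls.initialized_size() {
        if size != 0 {
            // a non-zero cached value was produced by a real calculation: reuse it
            return size;
        }
        // zero might just mean "new timeline without layers yet": fall through
    }
    recompute()
}

fn main() {
    let ls = LogicalSize { initial: OnceCell::new() };
    assert_eq!(calculate(&ls, || 4096), 4096); // nothing cached yet: recompute

    ls.initial.set(8192).unwrap();
    assert_eq!(calculate(&ls, || 4096), 8192); // non-zero cache wins

    let empty = LogicalSize { initial: OnceCell::new() };
    empty.initial.set(0).unwrap();
    assert_eq!(calculate(&empty, || 4096), 4096); // zero is not trusted
}
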
+ return Ok(size); + } + } + + self.metrics.init_logical_size_histo.start_timer() + } else { + self.metrics.logical_size_histo.start_timer() + }; let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?; debug!("calculated logical size: {logical_size}"); timer.stop_and_record(); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b62c80824a..63b809a786 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1047,6 +1047,21 @@ class PageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json + def tenant_size(self, tenant_id: TenantId) -> int: + """ + Returns the tenant size, together with the model inputs as the second tuple item. + """ + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/size") + self.verbose_error(res) + res = res.json() + assert isinstance(res, dict) + assert TenantId(res["id"]) == tenant_id + size = res["size"] + assert type(size) == int + # there are additional inputs, which are the collected raw information before being fed to the tenant_size_model + # there are no tests for those right now. + return size + def timeline_list(self, tenant_id: TenantId) -> List[Dict[str, Any]]: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline") self.verbose_error(res) @@ -2742,12 +2757,12 @@ def wait_for_last_record_lsn( tenant: TenantId, timeline: TimelineId, lsn: Lsn, -): - """waits for pageserver to catch up to a certain lsn""" +) -> Lsn: + """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" for i in range(10): current_lsn = last_record_lsn(pageserver_http_client, tenant, timeline) if current_lsn >= lsn: - return + return current_lsn log.info( "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( lsn, current_lsn, i + 1 @@ -2759,10 +2774,12 @@ def wait_for_last_record_lsn( ) -def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId): - """Wait for pageserver to catch up the latest flush LSN""" +def wait_for_last_flush_lsn( + env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId +) -> Lsn: + """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) + return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) def fork_at_current_lsn( diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py new file mode 100644 index 0000000000..ecf78499bb --- /dev/null +++ b/test_runner/regress/test_tenant_size.py @@ -0,0 +1,276 @@ +import time +from typing import List, Tuple + +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PageserverApiException, + wait_for_last_flush_lsn, +) +from fixtures.types import Lsn + + +def test_empty_tenant_size(neon_simple_env: NeonEnv): + env = neon_simple_env + (tenant_id, _) = env.neon_cli.create_tenant() + http_client = env.pageserver.http_client() + size = http_client.tenant_size(tenant_id) + + # we should never have zero, because there should be the initdb however + # this is questionable if we should have anything in this case, as the + # gc_cutoff is negative + assert ( + size == 0 + ), "initial implementation returns zero tenant_size before 
last_record_lsn is past gc_horizon" + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute("SELECT 1") + row = cur.fetchone() + assert row is not None + assert row[0] == 1 + size = http_client.tenant_size(tenant_id) + assert size == 0, "starting idle compute should not change the tenant size" + + # the size should be the same, until we increase the size over the + # gc_horizon + size = http_client.tenant_size(tenant_id) + assert size == 0, "tenant_size should not be affected by shutdown of compute" + + +def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): + """ + Operate on single branch reading the tenants size after each transaction. + """ + + # gc and compaction is not wanted automatically + # the pitr_interval here is quite problematic, so we cannot really use it. + # it'd have to be calibrated per test executing env. + neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='1h', gc_period='1h', pitr_interval='0sec', gc_horizon={128 * 1024}}}" + # in this test we don't run gc or compaction, but the tenant size is + # expected to use the "next gc" cutoff, so the small amounts should work + + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + branch_name, timeline_id = env.neon_cli.list_timelines(tenant_id)[0] + + http_client = env.pageserver.http_client() + + collected_responses: List[Tuple[Lsn, int]] = [] + + with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL)") + + batch_size = 100 + + i = 0 + while True: + with pg.cursor() as cur: + cur.execute( + f"INSERT INTO t0(i) SELECT i FROM generate_series({batch_size} * %s, ({batch_size} * (%s + 1)) - 1) s(i)", + (i, i), + ) + + i += 1 + + current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + size = http_client.tenant_size(tenant_id) + + if len(collected_responses) > 0: + prev = collected_responses[-1][1] + if size == 0: + assert prev == 0 + else: + assert size > prev + + collected_responses.append((current_lsn, size)) + + if len(collected_responses) > 2: + break + + while True: + with pg.cursor() as cur: + cur.execute( + f"UPDATE t0 SET i = -i WHERE i IN (SELECT i FROM t0 WHERE i > 0 LIMIT {batch_size})" + ) + updated = cur.rowcount + + if updated == 0: + break + + current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + size = http_client.tenant_size(tenant_id) + prev = collected_responses[-1][1] + assert size > prev, "tenant_size should grow with updates" + collected_responses.append((current_lsn, size)) + + while True: + with pg.cursor() as cur: + cur.execute(f"DELETE FROM t0 WHERE i IN (SELECT i FROM t0 LIMIT {batch_size})") + deleted = cur.rowcount + + if deleted == 0: + break + + current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + size = http_client.tenant_size(tenant_id) + prev = collected_responses[-1][1] + assert ( + size > prev + ), "even though rows have been deleted, the tenant_size should increase" + collected_responses.append((current_lsn, size)) + + with pg.cursor() as cur: + cur.execute("DROP TABLE t0") + + current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + size = http_client.tenant_size(tenant_id) + prev = collected_responses[-1][1] + assert size > prev, "dropping table grows tenant_size" + collected_responses.append((current_lsn, size)) + + # this isn't too many lines to forget for a while. 
observed while + # developing these tests that locally the value is a bit more than what we + # get in the ci. + for lsn, size in collected_responses: + log.info(f"collected: {lsn}, {size}") + + env.pageserver.stop() + env.pageserver.start() + + size_after = http_client.tenant_size(tenant_id) + prev = collected_responses[-1][1] + + assert size_after == prev, "size after restarting pageserver should not have changed" + + +def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder): + """ + Reported size goes up while branches or rows are being added, goes down after removing branches. + """ + + gc_horizon = 128 * 1024 + + neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='1h', gc_period='1h', pitr_interval='0sec', gc_horizon={gc_horizon}}}" + + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + main_branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] + + http_client = env.pageserver.http_client() + + main_pg = env.postgres.create_start(main_branch_name, tenant_id=tenant_id) + + batch_size = 10000 + + with main_pg.cursor() as cur: + cur.execute( + f"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, {batch_size}) s(i)" + ) + + wait_for_last_flush_lsn(env, main_pg, tenant_id, main_timeline_id) + size_at_branch = http_client.tenant_size(tenant_id) + assert size_at_branch > 0 + + first_branch_timeline_id = env.neon_cli.create_branch( + "first-branch", main_branch_name, tenant_id + ) + + # unsure why this happens, the size difference is more than a page alignment + size_after_first_branch = http_client.tenant_size(tenant_id) + assert size_after_first_branch > size_at_branch + assert size_after_first_branch - size_at_branch == gc_horizon + + first_branch_pg = env.postgres.create_start("first-branch", tenant_id=tenant_id) + + with first_branch_pg.cursor() as cur: + cur.execute( + f"CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, {batch_size}) s(i)" + ) + + wait_for_last_flush_lsn(env, first_branch_pg, tenant_id, first_branch_timeline_id) + size_after_growing_first_branch = http_client.tenant_size(tenant_id) + assert size_after_growing_first_branch > size_after_first_branch + + with main_pg.cursor() as cur: + cur.execute( + f"CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 2*{batch_size}) s(i)" + ) + + wait_for_last_flush_lsn(env, main_pg, tenant_id, main_timeline_id) + size_after_continuing_on_main = http_client.tenant_size(tenant_id) + assert size_after_continuing_on_main > size_after_growing_first_branch + + second_branch_timeline_id = env.neon_cli.create_branch( + "second-branch", main_branch_name, tenant_id + ) + size_after_second_branch = http_client.tenant_size(tenant_id) + assert size_after_second_branch > size_after_continuing_on_main + + second_branch_pg = env.postgres.create_start("second-branch", tenant_id=tenant_id) + + with second_branch_pg.cursor() as cur: + cur.execute( + f"CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 3*{batch_size}) s(i)" + ) + + wait_for_last_flush_lsn(env, second_branch_pg, tenant_id, second_branch_timeline_id) + size_after_growing_second_branch = http_client.tenant_size(tenant_id) + assert size_after_growing_second_branch > size_after_second_branch + + with second_branch_pg.cursor() as cur: + cur.execute("DROP TABLE t0") + cur.execute("DROP TABLE t1") + cur.execute("VACUUM FULL") + + wait_for_last_flush_lsn(env, second_branch_pg, tenant_id, second_branch_timeline_id) + size_after_thinning_branch = 
http_client.tenant_size(tenant_id) + assert ( + size_after_thinning_branch > size_after_growing_second_branch + ), "tenant_size should grow with dropped tables and full vacuum" + + first_branch_pg.stop_and_destroy() + second_branch_pg.stop_and_destroy() + main_pg.stop() + env.pageserver.stop() + env.pageserver.start() + + # chance of compaction and gc on startup might have an effect on the + # tenant_size but so far this has been reliable, even though at least gc + # and tenant_size race for the same locks + size_after = http_client.tenant_size(tenant_id) + assert size_after == size_after_thinning_branch + + # teardown, delete branches, and the size should be going down + deleted = False + for _ in range(10): + try: + http_client.timeline_delete(tenant_id, first_branch_timeline_id) + deleted = True + break + except PageserverApiException as e: + # compaction is ok but just retry if this fails; related to #2442 + if "cannot lock compaction critical section" in str(e): + time.sleep(1) + continue + raise + + assert deleted + + size_after_deleting_first = http_client.tenant_size(tenant_id) + assert size_after_deleting_first < size_after_thinning_branch + + http_client.timeline_delete(tenant_id, second_branch_timeline_id) + size_after_deleting_second = http_client.tenant_size(tenant_id) + assert size_after_deleting_second < size_after_deleting_first + + assert size_after_deleting_second < size_after_continuing_on_main + assert size_after_deleting_second > size_after_first_branch From 71d268c7c4cd187ae62476c75c0bd5f84fb81462 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 3 Nov 2022 10:24:15 -0400 Subject: [PATCH 0990/1022] Write message serialization test (#2746) --- libs/pageserver_api/src/models.rs | 64 +++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 3453f9672a..e5bd46f260 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -226,6 +226,7 @@ pub struct TimelineGcRequest { } // Wrapped in libpq CopyData +#[derive(PartialEq, Eq)] pub enum PagestreamFeMessage { Exists(PagestreamExistsRequest), Nblocks(PagestreamNblocksRequest), @@ -242,21 +243,21 @@ pub enum PagestreamBeMessage { DbSize(PagestreamDbSizeResponse), } -#[derive(Debug)] +#[derive(Debug, PartialEq, Eq)] pub struct PagestreamExistsRequest { pub latest: bool, pub lsn: Lsn, pub rel: RelTag, } -#[derive(Debug)] +#[derive(Debug, PartialEq, Eq)] pub struct PagestreamNblocksRequest { pub latest: bool, pub lsn: Lsn, pub rel: RelTag, } -#[derive(Debug)] +#[derive(Debug, PartialEq, Eq)] pub struct PagestreamGetPageRequest { pub latest: bool, pub lsn: Lsn, @@ -264,7 +265,7 @@ pub struct PagestreamGetPageRequest { pub blkno: u32, } -#[derive(Debug)] +#[derive(Debug, PartialEq, Eq)] pub struct PagestreamDbSizeRequest { pub latest: bool, pub lsn: Lsn, @@ -427,3 +428,58 @@ impl PagestreamBeMessage { bytes.into() } } + +#[cfg(test)] +mod tests { + use bytes::Buf; + + use super::*; + + #[test] + fn test_pagestream() { + // Test serialization/deserialization of PagestreamFeMessage + let messages = vec![ + PagestreamFeMessage::Exists(PagestreamExistsRequest { + latest: true, + lsn: Lsn(4), + rel: RelTag { + forknum: 1, + spcnode: 2, + dbnode: 3, + relnode: 4, + }, + }), + PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + latest: false, + lsn: Lsn(4), + rel: RelTag { + forknum: 1, + spcnode: 2, + dbnode: 3, + relnode: 4, + }, + }), + 
PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + latest: true, + lsn: Lsn(4), + rel: RelTag { + forknum: 1, + spcnode: 2, + dbnode: 3, + relnode: 4, + }, + blkno: 7, + }), + PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + latest: true, + lsn: Lsn(4), + dbnode: 7, + }), + ]; + for msg in messages { + let bytes = msg.serialize(); + let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap(); + assert!(msg == reconstructed); + } + } +} From c38f38dab746ef10b32b350ecd38f5adc4d98a1a Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Mon, 31 Oct 2022 16:49:40 +0300 Subject: [PATCH 0991/1022] Move pq_proto to its own crate --- Cargo.lock | 26 ++++++- libs/pq_proto/Cargo.toml | 16 ++++ .../src/pq_proto.rs => pq_proto/src/lib.rs} | 8 +- libs/{utils => pq_proto}/src/sync.rs | 2 +- libs/utils/Cargo.toml | 5 +- libs/utils/src/lib.rs | 6 -- libs/utils/src/postgres_backend.rs | 2 +- libs/utils/src/postgres_backend_async.rs | 2 +- pageserver/Cargo.toml | 73 +++++++++---------- pageserver/src/page_service.rs | 2 +- .../src/walreceiver/walreceiver_connection.rs | 4 +- proxy/Cargo.toml | 9 +-- proxy/src/auth/backend/link.rs | 2 +- proxy/src/auth/credentials.rs | 2 +- proxy/src/auth/flow.rs | 2 +- proxy/src/cancellation.rs | 2 +- proxy/src/compute.rs | 4 +- proxy/src/mgmt.rs | 6 +- proxy/src/proxy.rs | 2 +- proxy/src/sasl/messages.rs | 6 +- proxy/src/stream.rs | 2 +- safekeeper/Cargo.toml | 57 ++++++++------- safekeeper/src/control_file_upgrade.rs | 2 +- safekeeper/src/handler.rs | 2 +- safekeeper/src/json_ctrl.rs | 7 +- safekeeper/src/metrics.rs | 2 +- safekeeper/src/receive_wal.rs | 7 +- safekeeper/src/safekeeper.rs | 2 +- safekeeper/src/send_wal.rs | 9 +-- safekeeper/src/timeline.rs | 20 ++--- workspace_hack/Cargo.toml | 4 + 31 files changed, 154 insertions(+), 141 deletions(-) create mode 100644 libs/pq_proto/Cargo.toml rename libs/{utils/src/pq_proto.rs => pq_proto/src/lib.rs} (99%) rename libs/{utils => pq_proto}/src/sync.rs (99%) diff --git a/Cargo.lock b/Cargo.lock index 9a5ac0b1d5..c112c05188 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2145,6 +2145,7 @@ dependencies = [ "postgres-types", "postgres_ffi", "pprof", + "pq_proto", "rand", "regex", "remote_storage", @@ -2438,6 +2439,21 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" +[[package]] +name = "pq_proto" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "pin-project-lite", + "postgres-protocol", + "rand", + "serde", + "tokio", + "tracing", + "workspace_hack", +] + [[package]] name = "prettyplease" version = "0.1.21" @@ -2570,6 +2586,7 @@ dependencies = [ "once_cell", "parking_lot 0.12.1", "pin-project-lite", + "pq_proto", "rand", "rcgen", "reqwest", @@ -3086,6 +3103,7 @@ dependencies = [ "postgres", "postgres-protocol", "postgres_ffi", + "pq_proto", "regex", "remote_storage", "safekeeper_api", @@ -4046,9 +4064,7 @@ dependencies = [ "metrics", "nix 0.25.0", "once_cell", - "pin-project-lite", - "postgres", - "postgres-protocol", + "pq_proto", "rand", "routerify", "rustls", @@ -4373,6 +4389,9 @@ dependencies = [ "crossbeam-utils", "either", "fail", + "futures-channel", + "futures-task", + "futures-util", "hashbrown", "indexmap", "libc", @@ -4386,6 +4405,7 @@ dependencies = [ "rand", "regex", "regex-syntax", + "reqwest", "scopeguard", "serde", "stable_deref_trait", diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml new file mode 100644 index 0000000000..4d48e431b4 --- 
/dev/null +++ b/libs/pq_proto/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "pq_proto" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = "1.0" +bytes = "1.0.1" +pin-project-lite = "0.2.7" +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +rand = "0.8.3" +serde = { version = "1.0", features = ["derive"] } +tokio = { version = "1.17", features = ["macros"] } +tracing = "0.1" + +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/utils/src/pq_proto.rs b/libs/pq_proto/src/lib.rs similarity index 99% rename from libs/utils/src/pq_proto.rs rename to libs/pq_proto/src/lib.rs index 8c4e297f82..2e311dd6e3 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/pq_proto/src/lib.rs @@ -2,7 +2,9 @@ //! //! on message formats. -use crate::sync::{AsyncishRead, SyncFuture}; +// Tools for calling certain async methods in sync contexts. +pub mod sync; + use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_protocol::PG_EPOCH; @@ -16,6 +18,7 @@ use std::{ str, time::{Duration, SystemTime}, }; +use sync::{AsyncishRead, SyncFuture}; use tokio::io::AsyncReadExt; use tracing::{trace, warn}; @@ -198,7 +201,7 @@ impl FeMessage { /// /// ``` /// # use std::io; - /// # use utils::pq_proto::FeMessage; + /// # use pq_proto::FeMessage; /// # /// # fn process_message(msg: FeMessage) -> anyhow::Result<()> { /// # Ok(()) @@ -302,6 +305,7 @@ impl FeStartupPacket { Err(e) => return Err(e.into()), }; + #[allow(clippy::manual_range_contains)] if len < 4 || len > MAX_STARTUP_PACKET_LENGTH { bail!("invalid message length"); } diff --git a/libs/utils/src/sync.rs b/libs/pq_proto/src/sync.rs similarity index 99% rename from libs/utils/src/sync.rs rename to libs/pq_proto/src/sync.rs index 48f0ff6384..b7ff1fb70b 100644 --- a/libs/utils/src/sync.rs +++ b/libs/pq_proto/src/sync.rs @@ -29,7 +29,7 @@ impl SyncFuture { /// Example: /// /// ``` - /// # use utils::sync::SyncFuture; + /// # use pq_proto::sync::SyncFuture; /// # use std::future::Future; /// # use tokio::io::AsyncReadExt; /// # diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 1753ee81b9..36a379b47a 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -9,9 +9,6 @@ anyhow = "1.0" bincode = "1.3" bytes = "1.0.1" hyper = { version = "0.14.7", features = ["full"] } -pin-project-lite = "0.2.7" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } routerify = "3" serde = { version = "1.0", features = ["derive"] } serde_json = "1" @@ -33,8 +30,8 @@ once_cell = "1.13.0" strum = "0.24" strum_macros = "0.24" - metrics = { path = "../metrics" } +pq_proto = { path = "../pq_proto" } workspace_hack = { version = "0.1", path = "../../workspace_hack" } [dev-dependencies] diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 6f51465609..11ee7ac7eb 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -1,8 +1,6 @@ //! `utils` is intended to be a place to put code that is shared //! between other crates in this repository. 
-#![allow(clippy::manual_range_contains)] - /// `Lsn` type implements common tasks on Log Sequence Numbers pub mod lsn; /// SeqWait allows waiting for a future sequence number to arrive @@ -17,7 +15,6 @@ pub mod vec_map; pub mod bin_ser; pub mod postgres_backend; pub mod postgres_backend_async; -pub mod pq_proto; // helper functions for creating and fsyncing pub mod crashsafe; @@ -42,9 +39,6 @@ pub mod lock_file; pub mod accum; pub mod shutdown; -// Tools for calling certain async methods in sync contexts -pub mod sync; - // Utility for binding TcpListeners with proper socket options. pub mod tcp_listener; diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index adee46c2dd..89f7197718 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -3,10 +3,10 @@ //! implementation determining how to process the queries. Currently its API //! is rather narrow, but we can extend it once required. -use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; use crate::sock_split::{BidiStream, ReadStream, WriteStream}; use anyhow::{bail, ensure, Context, Result}; use bytes::{Bytes, BytesMut}; +use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; use rand::Rng; use serde::{Deserialize, Serialize}; use std::fmt; diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs index 53f6759d62..376819027b 100644 --- a/libs/utils/src/postgres_backend_async.rs +++ b/libs/utils/src/postgres_backend_async.rs @@ -4,9 +4,9 @@ //! is rather narrow, but we can extend it once required. use crate::postgres_backend::AuthType; -use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; use anyhow::{bail, Context, Result}; use bytes::{Bytes, BytesMut}; +use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; use rand::Rng; use std::future::Future; use std::net::SocketAddr; diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 7ce936ca27..a38978512d 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -12,62 +12,61 @@ testing = ["fail/failpoints"] profiling = ["pprof"] [dependencies] +amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" } +anyhow = { version = "1.0", features = ["backtrace"] } async-stream = "0.3" async-trait = "0.1" -chrono = "0.4.19" -rand = "0.8.3" -regex = "1.4.5" -bytes = "1.0.1" byteorder = "1.4.3" +bytes = "1.0.1" +chrono = "0.4.19" +clap = { version = "4.0", features = ["string"] } +close_fds = "0.3.2" +const_format = "0.2.21" +crc32c = "0.6.0" +crossbeam-utils = "0.8.5" +fail = "0.5.0" futures = "0.3.13" +git-version = "0.3.5" hex = "0.4.3" +humantime = "2.1.0" +humantime-serde = "1.1.1" hyper = "0.14" itertools = "0.10.3" -clap = { version = "4.0", features = ["string"] } -tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } -tokio-util = { version = "0.7.3", features = ["io", "io-util"] } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +nix = "0.25" +num-traits = "0.2.15" +once_cell = "1.13.0" postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -tokio-postgres = { git = 
"https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -anyhow = { version = "1.0", features = ["backtrace"] } -crc32c = "0.6.0" -thiserror = "1.0" -tar = "0.4.33" -humantime = "2.1.0" +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true } +rand = "0.8.3" +regex = "1.4.5" +rstar = "0.9.3" +scopeguard = "1.1.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_with = "2.0" -humantime-serde = "1.1.1" - -pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true } - -toml_edit = { version = "0.14", features = ["easy"] } -scopeguard = "1.1.0" -const_format = "0.2.21" -tracing = "0.1.36" signal-hook = "0.3.10" +svg_fmt = "0.4.1" +tar = "0.4.33" +thiserror = "1.0" +tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-util = { version = "0.7.3", features = ["io", "io-util"] } +toml_edit = { version = "0.14", features = ["easy"] } +tracing = "0.1.36" url = "2" -nix = "0.25" -once_cell = "1.13.0" -crossbeam-utils = "0.8.5" -fail = "0.5.0" -git-version = "0.3.5" -rstar = "0.9.3" -num-traits = "0.2.15" -amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" } +walkdir = "2.3.2" -pageserver_api = { path = "../libs/pageserver_api" } -postgres_ffi = { path = "../libs/postgres_ffi" } etcd_broker = { path = "../libs/etcd_broker" } metrics = { path = "../libs/metrics" } -utils = { path = "../libs/utils" } +pageserver_api = { path = "../libs/pageserver_api" } +postgres_ffi = { path = "../libs/postgres_ffi" } +pq_proto = { path = "../libs/pq_proto" } remote_storage = { path = "../libs/remote_storage" } tenant_size_model = { path = "../libs/tenant_size_model" } +utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } -close_fds = "0.3.2" -walkdir = "2.3.2" -svg_fmt = "0.4.1" [dev-dependencies] criterion = "0.4" diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f83ab1929a..fcc7a5476b 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -19,6 +19,7 @@ use pageserver_api::models::{ PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, }; +use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::io; use std::net::TcpListener; use std::str; @@ -33,7 +34,6 @@ use utils::{ lsn::Lsn, postgres_backend::AuthType, postgres_backend_async::{self, PostgresBackend}, - pq_proto::{BeMessage, FeMessage, RowDescriptor}, simple_rcu::RcuReadGuard, }; diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index a4a6af455c..0070834288 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -31,8 +31,8 @@ use crate::{ walrecord::DecodedWALRecord, }; use postgres_ffi::waldecoder::WalStreamDecoder; -use 
utils::id::TenantTimelineId; -use utils::{lsn::Lsn, pq_proto::ReplicationFeedback}; +use pq_proto::ReplicationFeedback; +use utils::{id::TenantTimelineId, lsn::Lsn}; /// Status of the connection. #[derive(Debug, Clone)] diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 395c22b8bc..14a5450d5e 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -22,11 +22,7 @@ once_cell = "1.13.0" parking_lot = "0.12" pin-project-lite = "0.2.7" rand = "0.8.3" -reqwest = { version = "0.11", default-features = false, features = [ - "blocking", - "json", - "rustls-tls", -] } +reqwest = { version = "0.11", default-features = false, features = [ "json", "rustls-tls" ] } routerify = "3" rustls = "0.20.0" rustls-pemfile = "1" @@ -45,8 +41,9 @@ url = "2.2.2" uuid = { version = "1.2", features = ["v4", "serde"] } x509-parser = "0.14" -utils = { path = "../libs/utils" } metrics = { path = "../libs/metrics" } +pq_proto = { path = "../libs/pq_proto" } +utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index c8ca418144..96c6f0ba18 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -1,8 +1,8 @@ use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters}; +use pq_proto::{BeMessage as Be, BeParameterStatusMessage}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; #[derive(Debug, Error)] pub enum LinkAuthError { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 57128a61f5..907f99b8e0 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,10 +1,10 @@ //! User credentials used in authentication. use crate::error::UserFacingError; +use pq_proto::StartupMessageParams; use std::borrow::Cow; use thiserror::Error; use tracing::info; -use utils::pq_proto::StartupMessageParams; #[derive(Debug, Error, PartialEq, Eq, Clone)] pub enum ClientCredsParseError { diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 5a516fdc30..865af4d2e5 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -2,9 +2,9 @@ use super::{AuthErrorImpl, PasswordHackPayload}; use crate::{sasl, scram, stream::PqStream}; +use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; use std::io; use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; /// Every authentication selector is supposed to implement this trait. pub trait AuthMethod { diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 404533ad42..b219cd0fa2 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,11 +1,11 @@ use anyhow::{anyhow, Context}; use hashbrown::HashMap; use parking_lot::Mutex; +use pq_proto::CancelKeyData; use std::net::SocketAddr; use tokio::net::TcpStream; use tokio_postgres::{CancelToken, NoTls}; use tracing::info; -use utils::pq_proto::CancelKeyData; /// Enables serving `CancelRequest`s. 
#[derive(Default)] diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 8e4caf6eeb..4771c774a1 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,12 +1,12 @@ use crate::{cancellation::CancelClosure, error::UserFacingError}; use futures::TryFutureExt; use itertools::Itertools; +use pq_proto::StartupMessageParams; use std::{io, net::SocketAddr}; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::NoTls; use tracing::{error, info}; -use utils::pq_proto::StartupMessageParams; #[derive(Debug, Error)] pub enum ConnectionError { @@ -44,7 +44,7 @@ pub type ComputeConnCfg = tokio_postgres::Config; /// Various compute node info for establishing connection etc. pub struct NodeInfo { - /// Did we send [`utils::pq_proto::BeMessage::AuthenticationOk`]? + /// Did we send [`pq_proto::BeMessage::AuthenticationOk`]? pub reported_auth_ok: bool, /// Compute node connection params. pub config: tokio_postgres::Config, diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 67693b1fb0..06d1a4f106 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -1,15 +1,13 @@ use crate::auth; use anyhow::Context; +use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; use serde::Deserialize; use std::{ net::{TcpListener, TcpStream}, thread, }; use tracing::{error, info}; -use utils::{ - postgres_backend::{self, AuthType, PostgresBackend}, - pq_proto::{BeMessage, SINGLE_COL_ROWDESC}, -}; +use utils::postgres_backend::{self, AuthType, PostgresBackend}; /// TODO: move all of that to auth-backend/link.rs when we ditch legacy-console backend diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 889445239a..9257fcd650 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -6,10 +6,10 @@ use anyhow::{bail, Context}; use futures::TryFutureExt; use metrics::{register_int_counter, IntCounter}; use once_cell::sync::Lazy; +use pq_proto::{BeMessage as Be, *}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{error, info, info_span, Instrument}; -use utils::pq_proto::{BeMessage as Be, *}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index f48aee4f26..fb3833c8b6 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -1,9 +1,9 @@ //! Definitions for SASL messages. use crate::parse::{split_at_const, split_cstr}; -use utils::pq_proto::{BeAuthenticationSaslMessage, BeMessage}; +use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; -/// SASL-specific payload of [`PasswordMessage`](utils::pq_proto::FeMessage::PasswordMessage). +/// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). #[derive(Debug)] pub struct FirstMessage<'a> { /// Authentication method, e.g. `"SCRAM-SHA-256"`. @@ -31,7 +31,7 @@ impl<'a> FirstMessage<'a> { /// A single SASL message. /// This struct is deliberately decoupled from lower-level -/// [`BeAuthenticationSaslMessage`](utils::pq_proto::BeAuthenticationSaslMessage). +/// [`BeAuthenticationSaslMessage`](pq_proto::BeAuthenticationSaslMessage). #[derive(Debug)] pub(super) enum ServerMessage { /// We expect to see more steps. 
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 2a224944e2..8e4084775c 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -2,6 +2,7 @@ use crate::error::UserFacingError; use anyhow::bail; use bytes::BytesMut; use pin_project_lite::pin_project; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket}; use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; @@ -9,7 +10,6 @@ use std::{io, task}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, ReadBuf}; use tokio_rustls::server::TlsStream; -use utils::pq_proto::{BeMessage, FeMessage, FeStartupPacket}; pin_project! { /// Stream wrapper which implements libpq's protocol. diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 0c0ca2ff9f..658bdfe42c 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -4,41 +4,42 @@ version = "0.1.0" edition = "2021" [dependencies] -regex = "1.4.5" -bytes = "1.0.1" -byteorder = "1.4.3" -hyper = "0.14" -fs2 = "0.4.3" -serde_json = "1" -tracing = "0.1.27" -clap = "4.0" -nix = "0.25" -tokio = { version = "1.17", features = ["macros", "fs"] } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } anyhow = "1.0" -crc32c = "0.6.0" -humantime = "2.1.0" -url = "2.2.2" -signal-hook = "0.3.10" -serde = { version = "1.0", features = ["derive"] } -serde_with = "2.0" -hex = "0.4.3" -const_format = "0.2.21" -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -git-version = "0.3.5" async-trait = "0.1" +byteorder = "1.4.3" +bytes = "1.0.1" +clap = "4.0" +const_format = "0.2.21" +crc32c = "0.6.0" +fs2 = "0.4.3" +git-version = "0.3.5" +hex = "0.4.3" +humantime = "2.1.0" +hyper = "0.14" +nix = "0.25" once_cell = "1.13.0" -toml_edit = { version = "0.14", features = ["easy"] } -thiserror = "1" parking_lot = "0.12.1" +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +regex = "1.4.5" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1" +serde_with = "2.0" +signal-hook = "0.3.10" +thiserror = "1" +tokio = { version = "1.17", features = ["macros", "fs"] } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +toml_edit = { version = "0.14", features = ["easy"] } +tracing = "0.1.27" +url = "2.2.2" -safekeeper_api = { path = "../libs/safekeeper_api" } -postgres_ffi = { path = "../libs/postgres_ffi" } -metrics = { path = "../libs/metrics" } -utils = { path = "../libs/utils" } etcd_broker = { path = "../libs/etcd_broker" } +metrics = { path = "../libs/metrics" } +postgres_ffi = { path = "../libs/postgres_ffi" } +pq_proto = { path = "../libs/pq_proto" } remote_storage = { path = "../libs/remote_storage" } +safekeeper_api = { path = "../libs/safekeeper_api" } +utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 856c164be8..95cb96fae9 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -4,13 +4,13 
@@ use crate::safekeeper::{ TermSwitchEntry, }; use anyhow::{bail, Result}; +use pq_proto::SystemId; use serde::{Deserialize, Serialize}; use tracing::*; use utils::{ bin_ser::LeSer, id::{TenantId, TimelineId}, lsn::Lsn, - pq_proto::SystemId, }; /// Persistent consensus state of the acceptor. diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index ca887399e1..a1e0bcbec0 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -12,12 +12,12 @@ use anyhow::{bail, Context, Result}; use postgres_ffi::PG_TLI; use regex::Regex; +use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; use tracing::info; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, postgres_backend::{self, PostgresBackend}, - pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}, }; /// Safekeeper handler of postgres commands diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 8cb2ced238..746b4461b7 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -24,11 +24,8 @@ use crate::timeline::Timeline; use crate::GlobalTimelines; use postgres_ffi::encode_logical_message; use postgres_ffi::WAL_SEGMENT_SIZE; -use utils::{ - lsn::Lsn, - postgres_backend::PostgresBackend, - pq_proto::{BeMessage, RowDescriptor, TEXT_OID}, -}; +use pq_proto::{BeMessage, RowDescriptor, TEXT_OID}; +use utils::{lsn::Lsn, postgres_backend::PostgresBackend}; #[derive(Serialize, Deserialize, Debug)] pub struct AppendLogicalMessage { diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 095d80623a..d4d3d37737 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -383,7 +383,7 @@ impl Collector for TimelineCollector { let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; - let mut most_advanced: Option = None; + let mut most_advanced: Option = None; for replica in tli.replicas.iter() { if let Some(replica_feedback) = replica.pageserver_feedback { if let Some(current) = most_advanced { diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 09ccfe7758..6577e8c4d6 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -23,11 +23,8 @@ use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; use crate::handler::SafekeeperPostgresHandler; -use utils::{ - postgres_backend::PostgresBackend, - pq_proto::{BeMessage, FeMessage}, - sock_split::ReadStream, -}; +use pq_proto::{BeMessage, FeMessage}; +use utils::{postgres_backend::PostgresBackend, sock_split::ReadStream}; pub struct ReceiveWalConn<'pg> { /// Postgres connection diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 3f9b70f282..7dfa6f636e 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -18,11 +18,11 @@ use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; +use pq_proto::{ReplicationFeedback, SystemId}; use utils::{ bin_ser::LeSer, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, - pq_proto::{ReplicationFeedback, SystemId}, }; pub const SK_MAGIC: u32 = 0xcafeceefu32; diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 2829c875ed..576a02c686 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -17,16 +17,11 @@ use std::sync::Arc; use std::time::Duration; use std::{str, thread}; +use pq_proto::{BeMessage, FeMessage, ReplicationFeedback, 
WalSndKeepAlive, XLogDataBody}; use tokio::sync::watch::Receiver; use tokio::time::timeout; use tracing::*; -use utils::{ - bin_ser::BeSer, - lsn::Lsn, - postgres_backend::PostgresBackend, - pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody}, - sock_split::ReadStream, -}; +use utils::{bin_ser::BeSer, lsn::Lsn, postgres_backend::PostgresBackend, sock_split::ReadStream}; // See: https://www.postgresql.org/docs/13/protocol-replication.html const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index d8d1fb98ad..a3f0ff94ee 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -2,26 +2,20 @@ //! to glue together SafeKeeper and all other background services. use anyhow::{bail, Result}; - use etcd_broker::subscription_value::SkTimelineInfo; - -use postgres_ffi::XLogSegNo; - -use tokio::{sync::watch, time::Instant}; - -use std::cmp::{max, min}; - use parking_lot::{Mutex, MutexGuard}; - +use postgres_ffi::XLogSegNo; +use pq_proto::ReplicationFeedback; +use std::cmp::{max, min}; use std::path::PathBuf; - -use tokio::sync::mpsc::Sender; +use tokio::{ + sync::{mpsc::Sender, watch}, + time::Instant, +}; use tracing::*; - use utils::{ id::{NodeId, TenantTimelineId}, lsn::Lsn, - pq_proto::ReplicationFeedback, }; use crate::safekeeper::{ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f4468d85f0..2daa08c9b6 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -21,6 +21,9 @@ clap = { version = "4", features = ["color", "error-context", "help", "std", "st crossbeam-utils = { version = "0.8", features = ["once_cell", "std"] } either = { version = "1", features = ["use_std"] } fail = { version = "0.5", default-features = false, features = ["failpoints"] } +futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } +futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] } +futures-util = { version = "0.3", default-features = false, features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", "futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] } hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } @@ -34,6 +37,7 @@ prost = { version = "0.10", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } +reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "hyper-rustls", "json", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-rustls", "webpki-roots"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } stable_deref_trait = { version = "1", features = ["alloc", 
"std"] } From 07b3ba5ce327a2f6d656c499d67f8231eb7c3362 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 4 Nov 2022 20:30:13 +0200 Subject: [PATCH 0992/1022] Bump postgres submodules, for some cosmetic cleanups. --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index e9b0010b45..c0284ce58e 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit e9b0010b45b287eea2213427ebac53a3fb7bdce9 +Subproject commit c0284ce58e7ee64e2307dd6fbffe1eaf4c23e5d1 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 5cd7e44799..e5cc262697 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 5cd7e44799567c52f13dc8c42e0bcab913022438 +Subproject commit e5cc2626970c9a4c9b7f1df3a457584c6bd071ad From bc40a5595fa4ed669243a50accad632e1a02d262 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 4 Nov 2022 21:29:51 +0000 Subject: [PATCH 0993/1022] test_runner: update cryptography (#2753) Bump `cryptography` package from 37.0.4 to 38.0.3 to fix https://github.com/neondatabase/neon/security/dependabot/6 (rather just in case). Ref https://www.openssl.org/news/secadv/20221101.txt --- poetry.lock | 75 ++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/poetry.lock b/poetry.lock index fdfe88acf1..01265aaea1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -583,7 +583,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "cryptography" -version = "37.0.4" +version = "38.0.3" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
category = "main" optional = false @@ -593,10 +593,10 @@ python-versions = ">=3.6" cffi = ">=1.12" [package.extras] -docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx_rtd_theme"] +docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] -sdist = ["setuptools_rust (>=0.11.4)"] +sdist = ["setuptools-rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pytz"] @@ -1750,28 +1750,32 @@ colorama = [ {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, ] cryptography = [ - {file = "cryptography-37.0.4-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:549153378611c0cca1042f20fd9c5030d37a72f634c9326e225c9f666d472884"}, - {file = "cryptography-37.0.4-cp36-abi3-macosx_10_10_x86_64.whl", hash = "sha256:a958c52505c8adf0d3822703078580d2c0456dd1d27fabfb6f76fe63d2971cd6"}, - {file = "cryptography-37.0.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f721d1885ecae9078c3f6bbe8a88bc0786b6e749bf32ccec1ef2b18929a05046"}, - {file = "cryptography-37.0.4-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:3d41b965b3380f10e4611dbae366f6dc3cefc7c9ac4e8842a806b9672ae9add5"}, - {file = "cryptography-37.0.4-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:80f49023dd13ba35f7c34072fa17f604d2f19bf0989f292cedf7ab5770b87a0b"}, - {file = "cryptography-37.0.4-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2dcb0b3b63afb6df7fd94ec6fbddac81b5492513f7b0436210d390c14d46ee8"}, - {file = "cryptography-37.0.4-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:b7f8dd0d4c1f21759695c05a5ec8536c12f31611541f8904083f3dc582604280"}, - {file = "cryptography-37.0.4-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:30788e070800fec9bbcf9faa71ea6d8068f5136f60029759fd8c3efec3c9dcb3"}, - {file = "cryptography-37.0.4-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:190f82f3e87033821828f60787cfa42bff98404483577b591429ed99bed39d59"}, - {file = "cryptography-37.0.4-cp36-abi3-win32.whl", hash = "sha256:b62439d7cd1222f3da897e9a9fe53bbf5c104fff4d60893ad1355d4c14a24157"}, - {file = "cryptography-37.0.4-cp36-abi3-win_amd64.whl", hash = "sha256:f7a6de3e98771e183645181b3627e2563dcde3ce94a9e42a3f427d2255190327"}, - {file = "cryptography-37.0.4-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bc95ed67b6741b2607298f9ea4932ff157e570ef456ef7ff0ef4884a134cc4b"}, - {file = "cryptography-37.0.4-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:f8c0a6e9e1dd3eb0414ba320f85da6b0dcbd543126e30fcc546e7372a7fbf3b9"}, - {file = "cryptography-37.0.4-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:e007f052ed10cc316df59bc90fbb7ff7950d7e2919c9757fd42a2b8ecf8a5f67"}, - {file = "cryptography-37.0.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bc997818309f56c0038a33b8da5c0bfbb3f1f067f315f9abd6fc07ad359398d"}, - {file = "cryptography-37.0.4-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:d204833f3c8a33bbe11eda63a54b1aad7aa7456ed769a982f21ec599ba5fa282"}, - {file = "cryptography-37.0.4-pp38-pypy38_pp73-win_amd64.whl", hash = 
"sha256:75976c217f10d48a8b5a8de3d70c454c249e4b91851f6838a4e48b8f41eb71aa"}, - {file = "cryptography-37.0.4-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:7099a8d55cd49b737ffc99c17de504f2257e3787e02abe6d1a6d136574873441"}, - {file = "cryptography-37.0.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2be53f9f5505673eeda5f2736bea736c40f051a739bfae2f92d18aed1eb54596"}, - {file = "cryptography-37.0.4-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:91ce48d35f4e3d3f1d83e29ef4a9267246e6a3be51864a5b7d2247d5086fa99a"}, - {file = "cryptography-37.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:4c590ec31550a724ef893c50f9a97a0c14e9c851c85621c5650d699a7b88f7ab"}, - {file = "cryptography-37.0.4.tar.gz", hash = "sha256:63f9c17c0e2474ccbebc9302ce2f07b55b3b3fcb211ded18a42d5764f5c10a82"}, + {file = "cryptography-38.0.3-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:984fe150f350a3c91e84de405fe49e688aa6092b3525f407a18b9646f6612320"}, + {file = "cryptography-38.0.3-cp36-abi3-macosx_10_10_x86_64.whl", hash = "sha256:ed7b00096790213e09eb11c97cc6e2b757f15f3d2f85833cd2d3ec3fe37c1722"}, + {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:bbf203f1a814007ce24bd4d51362991d5cb90ba0c177a9c08825f2cc304d871f"}, + {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:554bec92ee7d1e9d10ded2f7e92a5d70c1f74ba9524947c0ba0c850c7b011828"}, + {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1b52c9e5f8aa2b802d48bd693190341fae201ea51c7a167d69fc48b60e8a959"}, + {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:728f2694fa743a996d7784a6194da430f197d5c58e2f4e278612b359f455e4a2"}, + {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:dfb4f4dd568de1b6af9f4cda334adf7d72cf5bc052516e1b2608b683375dd95c"}, + {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5419a127426084933076132d317911e3c6eb77568a1ce23c3ac1e12d111e61e0"}, + {file = "cryptography-38.0.3-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:9b24bcff7853ed18a63cfb0c2b008936a9554af24af2fb146e16d8e1aed75748"}, + {file = "cryptography-38.0.3-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:25c1d1f19729fb09d42e06b4bf9895212292cb27bb50229f5aa64d039ab29146"}, + {file = "cryptography-38.0.3-cp36-abi3-win32.whl", hash = "sha256:7f836217000342d448e1c9a342e9163149e45d5b5eca76a30e84503a5a96cab0"}, + {file = "cryptography-38.0.3-cp36-abi3-win_amd64.whl", hash = "sha256:c46837ea467ed1efea562bbeb543994c2d1f6e800785bd5a2c98bc096f5cb220"}, + {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06fc3cc7b6f6cca87bd56ec80a580c88f1da5306f505876a71c8cfa7050257dd"}, + {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:65535bc550b70bd6271984d9863a37741352b4aad6fb1b3344a54e6950249b55"}, + {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5e89468fbd2fcd733b5899333bc54d0d06c80e04cd23d8c6f3e0542358c6060b"}, + {file = "cryptography-38.0.3-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:6ab9516b85bebe7aa83f309bacc5f44a61eeb90d0b4ec125d2d003ce41932d36"}, + {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:068147f32fa662c81aebab95c74679b401b12b57494872886eb5c1139250ec5d"}, + {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:402852a0aea73833d982cabb6d0c3bb582c15483d29fb7085ef2c42bfa7e38d7"}, + {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b1b35d9d3a65542ed2e9d90115dfd16bbc027b3f07ee3304fc83580f26e43249"}, + {file = "cryptography-38.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:6addc3b6d593cd980989261dc1cce38263c76954d758c3c94de51f1e010c9a50"}, + {file = "cryptography-38.0.3-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:be243c7e2bfcf6cc4cb350c0d5cdf15ca6383bbcb2a8ef51d3c9411a9d4386f0"}, + {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78cf5eefac2b52c10398a42765bfa981ce2372cbc0457e6bf9658f41ec3c41d8"}, + {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:4e269dcd9b102c5a3d72be3c45d8ce20377b8076a43cbed6f660a1afe365e436"}, + {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8d41a46251bf0634e21fac50ffd643216ccecfaf3701a063257fe0b2be1b6548"}, + {file = "cryptography-38.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:785e4056b5a8b28f05a533fab69febf5004458e20dad7e2e13a3120d8ecec75a"}, + {file = "cryptography-38.0.3.tar.gz", hash = "sha256:bfbe6ee19615b07a98b1d2287d6a6073f734735b49ee45b11324d85efc4d5cbd"}, ] docker = [ {file = "docker-4.2.2-py2.py3-none-any.whl", hash = "sha256:03a46400c4080cb6f7aa997f881ddd84fef855499ece219d75fbdb53289c17ab"}, @@ -1978,7 +1982,6 @@ prometheus-client = [ psycopg2-binary = [ {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, @@ -2012,7 +2015,6 @@ psycopg2-binary = [ {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, {file = 
"psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, @@ -2024,7 +2026,6 @@ psycopg2-binary = [ {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, @@ -2041,7 +2042,18 @@ py = [ {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] pyasn1 = [ + {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, + {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, + {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, + {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, + {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, + {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, + {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, + {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, + {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, + {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, + {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] pycodestyle = [ @@ -2151,13 +2163,6 @@ pyyaml = [ {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = 
"sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, - {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, - {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, From 7b7f84f1b4643f0417b6ccc1793cc0c38aeb55db Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 15 Oct 2022 02:08:18 +0300 Subject: [PATCH 0994/1022] Refactor layer flushing task Extracted from https://github.com/neondatabase/neon/pull/2595 --- pageserver/src/http/routes.rs | 5 +- pageserver/src/page_service.rs | 17 +- pageserver/src/tenant.rs | 239 +++++++++++++++----------- pageserver/src/tenant/timeline.rs | 275 ++++++++++++++++++------------ pageserver/src/tenant_mgr.rs | 2 +- pageserver/src/tenant_tasks.rs | 2 +- pageserver/src/walredo.rs | 2 +- 7 files changed, 317 insertions(+), 225 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 7087c68dbd..14ea054577 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -825,14 +825,14 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result { owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Timeline, TimelineUninitMark)>, + raw_timeline: Option<(Arc, TimelineUninitMark)>, } /// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory, @@ -169,7 +174,6 @@ impl UninitializedTimeline<'_> { let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| { format!("No timeline for initalization found for {tenant_id}/{timeline_id}") })?; - let new_timeline = Arc::new(new_timeline); let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn(); // TODO it would be good to ensure that, but apparently a lot of our testing is dependend on that at least @@ -197,6 +201,9 @@ impl UninitializedTimeline<'_> { })?; new_timeline.set_state(TimelineState::Active); v.insert(Arc::clone(&new_timeline)); + + new_timeline.maybe_spawn_flush_loop(); + 
new_timeline.launch_wal_receiver(); } } @@ -205,20 +212,28 @@ impl UninitializedTimeline<'_> { } /// Prepares timeline data by loading it from the basebackup archive. - pub fn import_basebackup_from_tar( - &self, - reader: impl std::io::Read, + pub async fn import_basebackup_from_tar( + self, + mut copyin_stream: &mut Pin<&mut impl Stream>>, base_lsn: Lsn, - ) -> anyhow::Result<()> { + ) -> anyhow::Result> { let raw_timeline = self.raw_timeline()?; - import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn).with_context( - || { - format!( - "Failed to import basebackup for timeline {}/{}", - self.owning_tenant.tenant_id, self.timeline_id - ) - }, - )?; + + // import_basebackup_from_tar() is not async, mainly because the Tar crate + // it uses is not async. So we need to jump through some hoops: + // - convert the input from client connection to a synchronous Read + // - use block_in_place() + let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); + + tokio::task::block_in_place(|| { + import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn) + .context("Failed to import basebackup") + })?; + + // Flush loop needs to be spawned in order for checkpoint to be able to flush. + // We want to run proper checkpoint before we mark timeline as available to outside world + // Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock + raw_timeline.maybe_spawn_flush_loop(); fail::fail_point!("before-checkpoint-new-timeline", |_| { bail!("failpoint before-checkpoint-new-timeline"); @@ -226,16 +241,15 @@ impl UninitializedTimeline<'_> { raw_timeline .checkpoint(CheckpointConfig::Flush) - .with_context(|| { - format!( - "Failed to checkpoint after basebackup import for timeline {}/{}", - self.owning_tenant.tenant_id, self.timeline_id - ) - })?; - Ok(()) + .await + .context("Failed to checkpoint after basebackup import")?; + + let timeline = self.initialize()?; + + Ok(timeline) } - fn raw_timeline(&self) -> anyhow::Result<&Timeline> { + fn raw_timeline(&self) -> anyhow::Result<&Arc> { Ok(&self .raw_timeline .as_ref() @@ -470,7 +484,7 @@ impl Tenant { self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? } - None => self.bootstrap_timeline(new_timeline_id, pg_version)?, + None => self.bootstrap_timeline(new_timeline_id, pg_version).await?, }; // Have added new timeline into the tenant, now its background tasks are needed. @@ -488,7 +502,7 @@ impl Tenant { /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC /// to make tests more deterministic. /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? - pub fn gc_iteration( + pub async fn gc_iteration( &self, target_timeline_id: Option, horizon: u64, @@ -504,11 +518,13 @@ impl Tenant { .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); - STORAGE_TIME - .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) - .observe_closure_duration(|| { - self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc) - }) + { + let _timer = STORAGE_TIME + .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) + .start_timer(); + self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc) + .await + } } /// Perform one compaction iteration. @@ -544,23 +560,24 @@ impl Tenant { /// /// Used at graceful shutdown. 
/// - pub fn checkpoint(&self) -> anyhow::Result<()> { + pub async fn checkpoint(&self) -> anyhow::Result<()> { // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // checkpoints. We don't want to block everything else while the // checkpoint runs. - let timelines = self.timelines.lock().unwrap(); - let timelines_to_checkpoint = timelines - .iter() - .map(|(timeline_id, timeline)| (*timeline_id, Arc::clone(timeline))) - .collect::>(); - drop(timelines); + let timelines_to_checkpoint = { + let timelines = self.timelines.lock().unwrap(); + timelines + .iter() + .map(|(id, timeline)| (*id, Arc::clone(timeline))) + .collect::>() + }; - for (timeline_id, timeline) in &timelines_to_checkpoint { - let _entered = - info_span!("checkpoint", timeline = %timeline_id, tenant = %self.tenant_id) - .entered(); - timeline.checkpoint(CheckpointConfig::Flush)?; + for (id, timeline) in &timelines_to_checkpoint { + timeline + .checkpoint(CheckpointConfig::Flush) + .instrument(info_span!("checkpoint", timeline = %id, tenant = %self.tenant_id)) + .await?; } Ok(()) @@ -974,7 +991,7 @@ impl Tenant { // - if a relation has a non-incremental persistent layer on a child branch, then we // don't need to keep that in the parent anymore. But currently // we do. - fn gc_iteration_internal( + async fn gc_iteration_internal( &self, target_timeline_id: Option, horizon: u64, @@ -1007,7 +1024,7 @@ impl Tenant { // so that they too can be garbage collected. That's // used in tests, so we want as deterministic results as possible. if checkpoint_before_gc { - timeline.checkpoint(CheckpointConfig::Forced)?; + timeline.checkpoint(CheckpointConfig::Forced).await?; info!( "timeline {} checkpoint_before_gc done", timeline.timeline_id @@ -1117,7 +1134,6 @@ impl Tenant { } } drop(gc_cs); - Ok(gc_timelines) } @@ -1222,14 +1238,15 @@ impl Tenant { /// - run initdb to init temporary instance and get bootstrap data /// - after initialization complete, remove the temp dir. - fn bootstrap_timeline( + async fn bootstrap_timeline( &self, timeline_id: TimelineId, pg_version: u32, ) -> anyhow::Result> { - let timelines = self.timelines.lock().unwrap(); - let timeline_uninit_mark = self.create_timeline_uninit_mark(timeline_id, &timelines)?; - drop(timelines); + let timeline_uninit_mark = { + let timelines = self.timelines.lock().unwrap(); + self.create_timeline_uninit_mark(timeline_id, &timelines)? + }; // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` // temporary directory for basebackup files for the given timeline. let initdb_path = path_with_suffix_extension( @@ -1279,25 +1296,35 @@ impl Tenant { let tenant_id = raw_timeline.owning_tenant.tenant_id; let unfinished_timeline = raw_timeline.raw_timeline()?; - import_datadir::import_timeline_from_postgres_datadir( - unfinished_timeline, - pgdata_path, - pgdata_lsn, - ) + + tokio::task::block_in_place(|| { + import_datadir::import_timeline_from_postgres_datadir( + unfinished_timeline, + pgdata_path, + pgdata_lsn, + ) + }) .with_context(|| { format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}") })?; + // Flush loop needs to be spawned in order for checkpoint to be able to flush. 
+ // We want to run proper checkpoint before we mark timeline as available to outside world + // Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock + unfinished_timeline.maybe_spawn_flush_loop(); + fail::fail_point!("before-checkpoint-new-timeline", |_| { anyhow::bail!("failpoint before-checkpoint-new-timeline"); }); + unfinished_timeline - .checkpoint(CheckpointConfig::Forced) + .checkpoint(CheckpointConfig::Forced).await .with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?; - let mut timelines = self.timelines.lock().unwrap(); - let timeline = raw_timeline.initialize_with_lock(&mut timelines, false)?; - drop(timelines); + let timeline = { + let mut timelines = self.timelines.lock().unwrap(); + raw_timeline.initialize_with_lock(&mut timelines, false)? + }; info!( "created root timeline {} timeline.lsn {}", @@ -1337,7 +1364,7 @@ impl Tenant { Ok(UninitializedTimeline { owning_tenant: self, timeline_id: new_timeline_id, - raw_timeline: Some((new_timeline, uninit_mark)), + raw_timeline: Some((Arc::new(new_timeline), uninit_mark)), }) } Err(e) => { @@ -1456,7 +1483,7 @@ impl Tenant { let timeline = UninitializedTimeline { owning_tenant: self, timeline_id, - raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())), + raw_timeline: Some((Arc::new(dummy_timeline), TimelineUninitMark::dummy())), }; match timeline.initialize_with_lock(&mut timelines_accessor, true) { Ok(initialized_timeline) => { @@ -1910,7 +1937,7 @@ mod tests { Ok(()) } - fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> { + async fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> { let mut lsn = start_lsn; #[allow(non_snake_case)] { @@ -1931,7 +1958,7 @@ mod tests { writer.finish_write(lsn); lsn += 0x10; } - tline.checkpoint(CheckpointConfig::Forced)?; + tline.checkpoint(CheckpointConfig::Forced).await?; { let writer = tline.writer(); writer.put( @@ -1948,24 +1975,26 @@ mod tests { )?; writer.finish_write(lsn); } - tline.checkpoint(CheckpointConfig::Forced) + tline.checkpoint(CheckpointConfig::Forced).await } - #[test] - fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { + #[tokio::test] + async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? .load(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; - make_some_layers(tline.as_ref(), Lsn(0x20))?; + make_some_layers(tline.as_ref(), Lsn(0x20)).await?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 // FIXME: this doesn't actually remove any layer currently, given how the checkpointing // and compaction works. But it does set the 'cutoff' point so that the cross check // below should fail. - tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + tenant + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false) + .await?; // try to branch at lsn 25, should fail because we already garbage collected the data match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { @@ -2010,14 +2039,14 @@ mod tests { /* // FIXME: This currently fails to error out. 
Calling GC doesn't currently // remove the old value, we'd need to work a little harder - #[test] - fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> { + #[tokio::test] + async fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> { let repo = RepoHarness::create("test_prohibit_get_for_garbage_collected_data")? .load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; - make_some_layers(tline.as_ref(), Lsn(0x20))?; + make_some_layers(tline.as_ref(), Lsn(0x20)).await?; repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); @@ -2030,43 +2059,47 @@ mod tests { } */ - #[test] - fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { + #[tokio::test] + async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; - make_some_layers(tline.as_ref(), Lsn(0x20))?; + make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + tenant + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false) + .await?; assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); Ok(()) } - #[test] - fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { + #[tokio::test] + async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; - make_some_layers(tline.as_ref(), Lsn(0x20))?; + make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); - make_some_layers(newtline.as_ref(), Lsn(0x60))?; + make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; // run gc on parent - tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + tenant + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false) + .await?; // Check that the data is still accessible on the branch. assert_eq!( @@ -2077,8 +2110,8 @@ mod tests { Ok(()) } - #[test] - fn timeline_load() -> anyhow::Result<()> { + #[tokio::test] + async fn timeline_load() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load"; let harness = TenantHarness::create(TEST_NAME)?; { @@ -2086,8 +2119,8 @@ mod tests { let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)? 
.initialize()?; - make_some_layers(tline.as_ref(), Lsn(0x8000))?; - tline.checkpoint(CheckpointConfig::Forced)?; + make_some_layers(tline.as_ref(), Lsn(0x8000)).await?; + tline.checkpoint(CheckpointConfig::Forced).await?; } let tenant = harness.load(); @@ -2098,8 +2131,8 @@ mod tests { Ok(()) } - #[test] - fn timeline_load_with_ancestor() -> anyhow::Result<()> { + #[tokio::test] + async fn timeline_load_with_ancestor() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load_with_ancestor"; let harness = TenantHarness::create(TEST_NAME)?; // create two timelines @@ -2109,8 +2142,8 @@ mod tests { .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; - make_some_layers(tline.as_ref(), Lsn(0x20))?; - tline.checkpoint(CheckpointConfig::Forced)?; + make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + tline.checkpoint(CheckpointConfig::Forced).await?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -2118,8 +2151,8 @@ mod tests { .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); - make_some_layers(newtline.as_ref(), Lsn(0x60))?; - tline.checkpoint(CheckpointConfig::Forced)?; + make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; + tline.checkpoint(CheckpointConfig::Forced).await?; } // check that both of them are initially unloaded @@ -2179,8 +2212,8 @@ mod tests { Ok(()) } - #[test] - fn test_images() -> anyhow::Result<()> { + #[tokio::test] + async fn test_images() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_images")?.load(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? @@ -2191,7 +2224,7 @@ mod tests { writer.finish_write(Lsn(0x10)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced)?; + tline.checkpoint(CheckpointConfig::Forced).await?; tline.compact()?; let writer = tline.writer(); @@ -2199,7 +2232,7 @@ mod tests { writer.finish_write(Lsn(0x20)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced)?; + tline.checkpoint(CheckpointConfig::Forced).await?; tline.compact()?; let writer = tline.writer(); @@ -2207,7 +2240,7 @@ mod tests { writer.finish_write(Lsn(0x30)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced)?; + tline.checkpoint(CheckpointConfig::Forced).await?; tline.compact()?; let writer = tline.writer(); @@ -2215,7 +2248,7 @@ mod tests { writer.finish_write(Lsn(0x40)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced)?; + tline.checkpoint(CheckpointConfig::Forced).await?; tline.compact()?; assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); @@ -2231,8 +2264,8 @@ mod tests { // Insert 1000 key-value pairs with increasing keys, checkpoint, // repeat 50 times. // - #[test] - fn test_bulk_insert() -> anyhow::Result<()> { + #[tokio::test] + async fn test_bulk_insert() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_bulk_insert")?.load(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
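
The refactoring in this patch replaces the old `layer_flush_lock` with a pair of tokio watch channels, `layer_flush_start_tx` and `layer_flush_done_tx`, whose counter handshake appears in the `timeline.rs` hunks below. The stand-alone sketch that follows illustrates only that handshake pattern; it is not pageserver code. The `Flusher` type and its methods are invented for illustration, the sketch assumes a recent `tokio` dependency (for example `tokio = { version = "1", features = ["full"] }`), and it simplifies the real code by using `String` errors instead of `anyhow` and by omitting task shutdown handling.

    use std::sync::Arc;
    use tokio::sync::watch;

    struct Flusher {
        // Bumped by callers to request a flush cycle (cf. layer_flush_start_tx).
        start_tx: watch::Sender<u64>,
        // The worker echoes the request counter plus a result (cf. layer_flush_done_tx).
        done_tx: watch::Sender<(u64, Result<(), String>)>,
    }

    impl Flusher {
        fn spawn() -> Arc<Self> {
            let (start_tx, start_rx) = watch::channel(0u64);
            let (done_tx, _) = watch::channel((0u64, Ok(())));
            let this = Arc::new(Flusher { start_tx, done_tx });
            let worker = Arc::clone(&this);
            // Background flush loop: wakes up whenever the request counter changes.
            tokio::spawn(async move { worker.flush_loop(start_rx).await });
            this
        }

        async fn flush_loop(&self, mut start_rx: watch::Receiver<u64>) {
            while start_rx.changed().await.is_ok() {
                let request = *start_rx.borrow();
                // ... flush frozen layers to disk here ...
                let result: Result<(), String> = Ok(());
                // Publish the highest request number served so far; waiters with
                // smaller request numbers are satisfied as well.
                let _ = self.done_tx.send_replace((request, result));
            }
        }

        async fn flush_and_wait(&self) -> Result<(), String> {
            let mut done_rx = self.done_tx.subscribe();
            // Remember our own request number so that the completion of an older
            // flush does not wake us up prematurely.
            let mut my_request = 0;
            self.start_tx.send_modify(|counter| {
                *counter += 1;
                my_request = *counter;
            });
            loop {
                {
                    let (done_counter, result) = &*done_rx.borrow();
                    if *done_counter >= my_request {
                        return result.clone();
                    }
                } // drop the borrow before awaiting
                done_rx.changed().await.map_err(|e| e.to_string())?;
            }
        }
    }

    #[tokio::main]
    async fn main() {
        let flusher = Flusher::spawn();
        flusher.flush_and_wait().await.unwrap();
        println!("flush acknowledged");
    }

The key property, mirrored from the patch, is that completion is signalled by a monotonically increasing counter rather than a per-request notification, so a single `send_replace` from the flush loop acknowledges every waiter whose request number is less than or equal to the published value.
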
@@ -2265,7 +2298,7 @@ mod tests { let cutoff = tline.get_last_record_lsn(); tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; - tline.checkpoint(CheckpointConfig::Forced)?; + tline.checkpoint(CheckpointConfig::Forced).await?; tline.compact()?; tline.gc()?; } @@ -2273,8 +2306,8 @@ mod tests { Ok(()) } - #[test] - fn test_random_updates() -> anyhow::Result<()> { + #[tokio::test] + async fn test_random_updates() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_random_updates")?.load(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? @@ -2337,7 +2370,7 @@ mod tests { println!("checkpointing {}", lsn); let cutoff = tline.get_last_record_lsn(); tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; - tline.checkpoint(CheckpointConfig::Forced)?; + tline.checkpoint(CheckpointConfig::Forced).await?; tline.compact()?; tline.gc()?; } @@ -2345,8 +2378,8 @@ mod tests { Ok(()) } - #[test] - fn test_traverse_branches() -> anyhow::Result<()> { + #[tokio::test] + async fn test_traverse_branches() -> anyhow::Result<()> { let tenant = TenantHarness::create("test_traverse_branches")?.load(); let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? @@ -2418,7 +2451,7 @@ mod tests { println!("checkpointing {}", lsn); let cutoff = tline.get_last_record_lsn(); tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; - tline.checkpoint(CheckpointConfig::Forced)?; + tline.checkpoint(CheckpointConfig::Forced).await?; tline.compact()?; tline.gc()?; } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 279da70128..1f23fedcc1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -16,7 +16,7 @@ use std::fs; use std::ops::{Deref, Range}; use std::path::PathBuf; use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock}; use std::time::{Duration, Instant, SystemTime}; use crate::tenant::{ @@ -121,8 +121,16 @@ pub struct Timeline { /// to avoid deadlock. write_lock: Mutex<()>, - /// Used to ensure that there is only task performing flushing at a time - layer_flush_lock: Mutex<()>, + /// Used to avoid multiple `flush_loop` tasks running + flush_loop_started: Mutex, + + /// layer_flush_start_tx can be used to wake up the layer-flushing task. + /// The value is a counter, incremented every time a new flush cycle is requested. + /// The flush cycle counter is sent back on the layer_flush_done channel when + /// the flush finishes. You can use that to wait for the flush to finish. + layer_flush_start_tx: tokio::sync::watch::Sender, + /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel + layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>, /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. @@ -466,15 +474,16 @@ impl Timeline { /// /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't /// know anything about them here in the repository. 
- pub fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { + #[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))] + pub async fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { match cconf { CheckpointConfig::Flush => { self.freeze_inmem_layer(false); - self.flush_frozen_layers(true) + self.flush_frozen_layers_and_wait().await } CheckpointConfig::Forced => { self.freeze_inmem_layer(false); - self.flush_frozen_layers(true)?; + self.flush_frozen_layers_and_wait().await?; self.compact() } } @@ -591,62 +600,6 @@ impl Timeline { Ok(size) } - /// Check if more than 'checkpoint_distance' of WAL has been accumulated in - /// the in-memory layer, and initiate flushing it if so. - /// - /// Also flush after a period of time without new data -- it helps - /// safekeepers to regard pageserver as caught up and suspend activity. - pub fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { - let last_lsn = self.get_last_record_lsn(); - let layers = self.layers.read().unwrap(); - if let Some(open_layer) = &layers.open_layer { - let open_layer_size = open_layer.size()?; - drop(layers); - let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); - let distance = last_lsn.widening_sub(last_freeze_at); - // Checkpointing the open layer can be triggered by layer size or LSN range. - // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and - // we want to stay below that with a big margin. The LSN distance determines how - // much WAL the safekeepers need to store. - if distance >= self.get_checkpoint_distance().into() - || open_layer_size > self.get_checkpoint_distance() - || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) - { - info!( - "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", - distance, - open_layer_size, - last_freeze_ts.elapsed() - ); - - self.freeze_inmem_layer(true); - self.last_freeze_at.store(last_lsn); - *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - - // Launch a task to flush the frozen layer to disk, unless - // a task was already running. 
(If the task was running - // at the time that we froze the layer, it must've seen the - // the layer we just froze before it exited; see comments - // in flush_frozen_layers()) - if let Ok(guard) = self.layer_flush_lock.try_lock() { - drop(guard); - let self_clone = Arc::clone(self); - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_id), - Some(self.timeline_id), - "layer flush task", - false, - async move { self_clone.flush_frozen_layers(false) }, - ); - } - } - } - Ok(()) - } - pub fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { @@ -732,6 +685,9 @@ impl Timeline { let disk_consistent_lsn = metadata.disk_consistent_lsn(); let (state, _) = watch::channel(TimelineState::Suspended); + let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); + let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); + let mut result = Timeline { conf, tenant_conf, @@ -759,8 +715,12 @@ impl Timeline { upload_layers: AtomicBool::new(upload_layers), + flush_loop_started: Mutex::new(false), + + layer_flush_start_tx, + layer_flush_done_tx, + write_lock: Mutex::new(()), - layer_flush_lock: Mutex::new(()), layer_removal_cs: Mutex::new(()), gc_info: RwLock::new(GcInfo { @@ -793,6 +753,33 @@ impl Timeline { result } + pub(super) fn maybe_spawn_flush_loop(self: &Arc) { + let mut flush_loop_started = self.flush_loop_started.lock().unwrap(); + if *flush_loop_started { + info!( + "skipping attempt to start flush_loop twice {}/{}", + self.tenant_id, self.timeline_id + ); + return; + } + + let layer_flush_start_rx = self.layer_flush_start_tx.subscribe(); + let self_clone = Arc::clone(self); + info!("spawning flush loop"); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::LayerFlushTask, + Some(self.tenant_id), + Some(self.timeline_id), + "layer flush task", + false, + async move { self_clone.flush_loop(layer_flush_start_rx).await; Ok(()) } + .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id)) + ); + + *flush_loop_started = true; + } + pub(super) fn launch_wal_receiver(self: &Arc) { if !is_etcd_client_initialized() { if cfg!(test) { @@ -1289,53 +1276,128 @@ impl Timeline { drop(layers); } - /// Flush all frozen layers to disk. /// - /// Only one task at a time can be doing layer-flushing for a - /// given timeline. If 'wait' is true, and another task is - /// currently doing the flushing, this function will wait for it - /// to finish. If 'wait' is false, this function will return - /// immediately instead. - fn flush_frozen_layers(&self, wait: bool) -> anyhow::Result<()> { - let flush_lock_guard = if wait { - self.layer_flush_lock.lock().unwrap() - } else { - match self.layer_flush_lock.try_lock() { - Ok(guard) => guard, - Err(TryLockError::WouldBlock) => return Ok(()), - Err(TryLockError::Poisoned(err)) => panic!("{:?}", err), - } - }; + /// Check if more than 'checkpoint_distance' of WAL has been accumulated in + /// the in-memory layer, and initiate flushing it if so. + /// + /// Also flush after a period of time without new data -- it helps + /// safekeepers to regard pageserver as caught up and suspend activity. 
+ /// + pub fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { + let last_lsn = self.get_last_record_lsn(); + let layers = self.layers.read().unwrap(); + if let Some(open_layer) = &layers.open_layer { + let open_layer_size = open_layer.size()?; + drop(layers); + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); + let distance = last_lsn.widening_sub(last_freeze_at); + // Checkpointing the open layer can be triggered by layer size or LSN range. + // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and + // we want to stay below that with a big margin. The LSN distance determines how + // much WAL the safekeepers need to store. + if distance >= self.get_checkpoint_distance().into() + || open_layer_size > self.get_checkpoint_distance() + || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) + { + info!( + "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", + distance, + open_layer_size, + last_freeze_ts.elapsed() + ); - let timer = self.metrics.flush_time_histo.start_timer(); + self.freeze_inmem_layer(true); + self.last_freeze_at.store(last_lsn); + *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - loop { - let layers = self.layers.read().unwrap(); - if let Some(frozen_layer) = layers.frozen_layers.front() { - let frozen_layer = Arc::clone(frozen_layer); - drop(layers); // to allow concurrent reads and writes - self.flush_frozen_layer(frozen_layer)?; - } else { - // Drop the 'layer_flush_lock' *before* 'layers'. That - // way, if you freeze a layer, and then call - // flush_frozen_layers(false), it is guaranteed that - // if another thread was busy flushing layers and the - // call therefore returns immediately, the other - // thread will have seen the newly-frozen layer and - // will flush that too (assuming no errors). - drop(flush_lock_guard); - drop(layers); - break; + // Wake up the layer flusher + self.flush_frozen_layers(); } } - - timer.stop_and_record(); - Ok(()) } + /// Layer flusher task's main loop. + async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver) { + info!("started flush loop"); + loop { + tokio::select! { + _ = task_mgr::shutdown_watcher() => { + info!("shutting down layer flush task"); + break; + }, + _ = layer_flush_start_rx.changed() => {} + } + + trace!("waking up"); + let timer = self.metrics.flush_time_histo.start_timer(); + let flush_counter = *layer_flush_start_rx.borrow(); + let result = loop { + let layer_to_flush = { + let layers = self.layers.read().unwrap(); + layers.frozen_layers.front().cloned() + // drop 'layers' lock to allow concurrent reads and writes + }; + if let Some(layer_to_flush) = layer_to_flush { + if let Err(err) = self.flush_frozen_layer(layer_to_flush).await { + error!("could not flush frozen layer: {err:?}"); + break Err(err); + } + continue; + } else { + break Ok(()); + } + }; + // Notify any listeners that we're done + let _ = self + .layer_flush_done_tx + .send_replace((flush_counter, result)); + + timer.stop_and_record(); + } + } + + async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> { + let mut rx = self.layer_flush_done_tx.subscribe(); + + // Increment the flush cycle counter and wake up the flush task. + // Remember the new value, so that when we listen for the flush + // to finish, we know when the flush that we initiated has + // finished, instead of some other flush that was started earlier. 
+ let mut my_flush_request = 0; + self.layer_flush_start_tx.send_modify(|counter| { + my_flush_request = *counter + 1; + *counter = my_flush_request; + }); + + loop { + { + let (last_result_counter, last_result) = &*rx.borrow(); + if *last_result_counter >= my_flush_request { + if let Err(_err) = last_result { + // We already logged the original error in + // flush_loop. We cannot propagate it to the caller + // here, because it might not be Cloneable + bail!("could not flush frozen layer"); + } else { + return Ok(()); + } + } + } + trace!("waiting for flush to complete"); + rx.changed().await?; + trace!("done") + } + } + + fn flush_frozen_layers(&self) { + self.layer_flush_start_tx.send_modify(|val| *val += 1); + } + /// Flush one frozen in-memory layer to disk, as a new delta layer. - fn flush_frozen_layer(&self, frozen_layer: Arc) -> anyhow::Result<()> { + #[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.filename().display()))] + async fn flush_frozen_layer(&self, frozen_layer: Arc) -> anyhow::Result<()> { // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the @@ -2265,13 +2327,10 @@ impl Timeline { let last_rec_lsn = data.records.last().unwrap().0; - let img = self.walredo_mgr.request_redo( - key, - request_lsn, - base_img, - data.records, - self.pg_version, - )?; + let img = self + .walredo_mgr + .request_redo(key, request_lsn, base_img, data.records, self.pg_version) + .context("Failed to reconstruct a page image:")?; if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index f1db50bf7f..3766bc5cb3 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -241,7 +241,7 @@ pub async fn shutdown_all_tenants() { let tenant_id = tenant.tenant_id(); debug!("shutdown tenant {tenant_id}"); - if let Err(err) = tenant.checkpoint() { + if let Err(err) = tenant.checkpoint().await { error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); } } diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 23ce9dc699..a24bdd5812 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -119,7 +119,7 @@ async fn gc_loop(tenant_id: TenantId) { let gc_horizon = tenant.get_gc_horizon(); let mut sleep_duration = gc_period; if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false) + if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await { sleep_duration = wait_duration; error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index e21ec4d742..54d322373b 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -120,7 +120,7 @@ fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { /// An error happened in WAL redo #[derive(Debug, thiserror::Error)] pub enum WalRedoError { - #[error(transparent)] + #[error("encountered io error: {0}")] IoError(#[from] std::io::Error), #[error("cannot perform WAL redo now")] From 15d970f731e83898e696dbfa7868e76fdfc6d404 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 3 Nov 2022 15:25:05 +0200 Subject: [PATCH 0995/1022] decrease diff by moving 
check_checkpoint_distance back Co-authored-by: Christian Schwarz --- pageserver/src/tenant/timeline.rs | 82 +++++++++++++++---------------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1f23fedcc1..a382ad5e11 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -600,6 +600,46 @@ impl Timeline { Ok(size) } + /// Check if more than 'checkpoint_distance' of WAL has been accumulated in + /// the in-memory layer, and initiate flushing it if so. + /// + /// Also flush after a period of time without new data -- it helps + /// safekeepers to regard pageserver as caught up and suspend activity. + pub fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { + let last_lsn = self.get_last_record_lsn(); + let layers = self.layers.read().unwrap(); + if let Some(open_layer) = &layers.open_layer { + let open_layer_size = open_layer.size()?; + drop(layers); + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); + let distance = last_lsn.widening_sub(last_freeze_at); + // Checkpointing the open layer can be triggered by layer size or LSN range. + // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and + // we want to stay below that with a big margin. The LSN distance determines how + // much WAL the safekeepers need to store. + if distance >= self.get_checkpoint_distance().into() + || open_layer_size > self.get_checkpoint_distance() + || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) + { + info!( + "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", + distance, + open_layer_size, + last_freeze_ts.elapsed() + ); + + self.freeze_inmem_layer(true); + self.last_freeze_at.store(last_lsn); + *(self.last_freeze_ts.write().unwrap()) = Instant::now(); + + // Wake up the layer flusher + self.flush_frozen_layers(); + } + } + Ok(()) + } + pub fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { @@ -1276,48 +1316,6 @@ impl Timeline { drop(layers); } - /// - /// Check if more than 'checkpoint_distance' of WAL has been accumulated in - /// the in-memory layer, and initiate flushing it if so. - /// - /// Also flush after a period of time without new data -- it helps - /// safekeepers to regard pageserver as caught up and suspend activity. - /// - pub fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { - let last_lsn = self.get_last_record_lsn(); - let layers = self.layers.read().unwrap(); - if let Some(open_layer) = &layers.open_layer { - let open_layer_size = open_layer.size()?; - drop(layers); - let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); - let distance = last_lsn.widening_sub(last_freeze_at); - // Checkpointing the open layer can be triggered by layer size or LSN range. - // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and - // we want to stay below that with a big margin. The LSN distance determines how - // much WAL the safekeepers need to store. 
- if distance >= self.get_checkpoint_distance().into() - || open_layer_size > self.get_checkpoint_distance() - || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) - { - info!( - "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", - distance, - open_layer_size, - last_freeze_ts.elapsed() - ); - - self.freeze_inmem_layer(true); - self.last_freeze_at.store(last_lsn); - *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - - // Wake up the layer flusher - self.flush_frozen_layers(); - } - } - Ok(()) - } - /// Layer flusher task's main loop. async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver) { info!("started flush loop"); From 99e745a760b7af73a8066bd3d9a10073381388d5 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 3 Nov 2022 15:39:15 +0200 Subject: [PATCH 0996/1022] review adjustments --- pageserver/src/tenant/timeline.rs | 10 +++++++++- pageserver/src/walredo.rs | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a382ad5e11..0b2f7876db 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1364,6 +1364,11 @@ impl Timeline { // to finish, we know when the flush that we initiated has // finished, instead of some other flush that was started earlier. let mut my_flush_request = 0; + + if !&*self.flush_loop_started.lock().unwrap() { + anyhow::bail!("cannot flush frozen layers when flush_loop is not running") + } + self.layer_flush_start_tx.send_modify(|counter| { my_flush_request = *counter + 1; *counter = my_flush_request; @@ -1377,7 +1382,10 @@ impl Timeline { // We already logged the original error in // flush_loop. We cannot propagate it to the caller // here, because it might not be Cloneable - bail!("could not flush frozen layer"); + anyhow::bail!( + "Could not flush frozen layer. Request id: {}", + my_flush_request + ); } else { return Ok(()); } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 54d322373b..e21ec4d742 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -120,7 +120,7 @@ fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { /// An error happened in WAL redo #[derive(Debug, thiserror::Error)] pub enum WalRedoError { - #[error("encountered io error: {0}")] + #[error(transparent)] IoError(#[from] std::io::Error), #[error("cannot perform WAL redo now")] From 548d472b1244708b895e031399c497f2a9088670 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 7 Nov 2022 12:03:57 +0200 Subject: [PATCH 0997/1022] fix: logical size query at before initdb_lsn (#2755) With more realistic selection of gc_horizon in tests there is an immediate failure with trying to query logical size with lsn < initdb_lsn. Fixes that, adds illustration gathered from clarity of explaining this tenant size calculation to more people. Cc: #2748, #2599. --- pageserver/src/tenant/size.rs | 33 ++++++++++++++++++++----- test_runner/regress/test_tenant_size.py | 10 +++++--- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index f0c611ae39..86e685fd4c 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -1,3 +1,4 @@ +use std::cmp; use std::collections::{HashMap, HashSet}; use std::sync::Arc; @@ -40,6 +41,21 @@ struct TimelineInputs { next_gc_cutoff: Lsn, } +/// Gathers the inputs for the tenant sizing model. 
+/// +/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which +/// is updated on-demand, during the start of this calculation and separate from the +/// [`Timeline::latest_gc_cutoff`]. +/// +/// For timelines in general: +/// +/// ```ignore +/// 0-----|---------|----|------------| · · · · · |·> lsn +/// initdb_lsn branchpoints* next_gc_cutoff latest +/// ``` +/// +/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the +/// tenant size will be zero. pub(super) async fn gather_inputs( tenant: &Tenant, limit: &Arc, @@ -88,13 +104,18 @@ pub(super) async fn gather_inputs( let gc_info = timeline.gc_info.read().unwrap(); // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a - // new gc run, which we have no control over. - // maybe this should be moved to gc_info.next_gc_cutoff()? - let next_gc_cutoff = std::cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); + // new gc run, which we have no control over. however differently from `Timeline::gc` + // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not + // actually removing files. + let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); - let maybe_cutoff = if next_gc_cutoff > timeline.get_ancestor_lsn() { - // only include these if they are after branching point; otherwise we would end up - // with duplicate updates before the actual branching. + // the minimum where we should find the next_gc_cutoff for our calculations. + // + // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we + // want to query any logical size before initdb_lsn. + let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn); + + let maybe_cutoff = if next_gc_cutoff > cutoff_minimum { Some((next_gc_cutoff, LsnKind::GcCutOff)) } else { None diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index ecf78499bb..03e7129ff7 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -47,9 +47,13 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): # gc and compaction is not wanted automatically # the pitr_interval here is quite problematic, so we cannot really use it. # it'd have to be calibrated per test executing env. - neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='1h', gc_period='1h', pitr_interval='0sec', gc_horizon={128 * 1024}}}" - # in this test we don't run gc or compaction, but the tenant size is - # expected to use the "next gc" cutoff, so the small amounts should work + + # there was a bug which was hidden if the create table and first batch of + # inserts is larger than gc_horizon. for example 0x20000 here hid the fact + # that there next_gc_cutoff could be smaller than initdb_lsn, which will + # obviously lead to issues when calculating the size. 
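A minimal Python sketch of the cutoff selection performed by the `gather_inputs` change above, treating LSNs as plain integers and using illustrative names:

```python
from typing import Optional


def gc_cutoff_for_sizing(horizon_cutoff: int, pitr_cutoff: int,
                         ancestor_lsn: int, initdb_lsn: int) -> Optional[int]:
    # The sizing model uses the smaller of the horizon and PITR cutoffs,
    # mirroring what GC itself would pick.
    next_gc_cutoff = min(horizon_cutoff, pitr_cutoff)
    # Never look below the branch point or initdb_lsn: logical size cannot
    # be queried before initdb_lsn, which is the failure this commit fixes.
    cutoff_minimum = max(ancestor_lsn, initdb_lsn)
    return next_gc_cutoff if next_gc_cutoff > cutoff_minimum else None
```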
+ gc_horizon = 0x30000 + neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='1h', gc_period='1h', pitr_interval='0sec', gc_horizon={gc_horizon}}}" env = neon_env_builder.init_start() From d5b6471fa91944f016a4e6a623c0393c885673c5 Mon Sep 17 00:00:00 2001 From: MMeent Date: Mon, 7 Nov 2022 17:13:24 +0100 Subject: [PATCH 0998/1022] Update prefetch mechanism: (#2687) Prefetch requests and responses are stored in a ringbuffer instead of a queue, which means we can utilize prefetches of many relations concurrently -- page reads of un-prefetched relations now don't imply dropping prefetches. In a future iteration, this may detect sequential scans based on the read behavior of sequential scans, and will dynamically prefetch buffers for such relations as needed. Right now, it still depends on explicit prefetch requests from PostgreSQL. The main improvement here is that we now have a buffer for prefetched pages of 128 entries with random access. Before, we had a similarly sized cache, but this cache did not allow for random access, which resulted in dropped entries when multiple systems used the prefetching subsystem concurrently. See also: #2544 --- pgxn/neon/libpagestore.c | 42 ++- pgxn/neon/pagestore_client.h | 8 +- pgxn/neon/pagestore_smgr.c | 686 +++++++++++++++++++++++++++++------ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 5 files changed, 600 insertions(+), 140 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index d3c2bc063f..c37adc6cb7 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -42,6 +42,11 @@ PGconn *pageserver_conn = NULL; char *page_server_connstring_raw; +int n_unflushed_requests = 0; +int flush_every_n_requests = 8; + +static void pageserver_flush(void); + static void pageserver_connect() { @@ -164,6 +169,8 @@ pageserver_disconnect(void) PQfinish(pageserver_conn); pageserver_conn = NULL; connected = false; + + prefetch_on_ps_disconnect(); } } @@ -174,11 +181,7 @@ pageserver_send(NeonRequest * request) /* If the connection was lost for some reason, reconnect */ if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) - { - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; - } + pageserver_disconnect(); if (!connected) pageserver_connect(); @@ -202,6 +205,11 @@ pageserver_send(NeonRequest * request) } pfree(req_buff.data); + n_unflushed_requests++; + + if (flush_every_n_requests > 0 && n_unflushed_requests >= flush_every_n_requests) + pageserver_flush(); + if (message_level_is_interesting(PageStoreTrace)) { char *msg = nm_to_string((NeonMessage *) request); @@ -255,25 +263,21 @@ pageserver_receive(void) static void pageserver_flush(void) { - if (PQflush(pageserver_conn)) + if (!connected) + { + neon_log(WARNING, "Tried to flush while disconnected"); + } + else if (PQflush(pageserver_conn)) { char *msg = PQerrorMessage(pageserver_conn); pageserver_disconnect(); neon_log(ERROR, "failed to flush page requests: %s", msg); } -} - -static NeonResponse * -pageserver_call(NeonRequest * request) -{ - pageserver_send(request); - pageserver_flush(); - return pageserver_receive(); + n_unflushed_requests = 0; } page_server_api api = { - .request = pageserver_call, .send = pageserver_send, .flush = pageserver_flush, .receive = pageserver_receive @@ -427,6 +431,14 @@ pg_init_libpagestore(void) PGC_SIGHUP, GUC_UNIT_MB, NULL, NULL, NULL); + DefineCustomIntVariable("neon.flush_output_after", + "Flush the output buffer after every N unflushed requests", + NULL, + 
&flush_every_n_requests, + 8, -1, INT_MAX, + PGC_SIGHUP, + 0, /* no flags required */ + NULL, NULL, NULL); relsize_hash_init(); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 4a4e60b707..be6c4b3a77 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -115,6 +115,8 @@ typedef struct char page[FLEXIBLE_ARRAY_MEMBER]; } NeonGetPageResponse; +#define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ)) + typedef struct { NeonMessageTag tag; @@ -138,15 +140,18 @@ extern char *nm_to_string(NeonMessage * msg); typedef struct { - NeonResponse *(*request) (NeonRequest * request); void (*send) (NeonRequest * request); NeonResponse *(*receive) (void); void (*flush) (void); } page_server_api; +extern void prefetch_on_ps_disconnect(void); + extern page_server_api * page_server; extern char *page_server_connstring; +extern bool seqscan_prefetch_enabled; +extern int seqscan_prefetch_distance; extern char *neon_timeline; extern char *neon_tenant; extern bool wal_redo; @@ -167,7 +172,6 @@ extern void neon_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); -extern void neon_reset_prefetch(SMgrRelation reln); extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 927c8f1fc1..59c5ff8db2 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -49,22 +49,20 @@ #include "access/xlog.h" #include "access/xloginsert.h" #include "access/xlog_internal.h" -#include "catalog/pg_class.h" -#include "pagestore_client.h" -#include "pagestore_client.h" -#include "storage/smgr.h" #include "access/xlogdefs.h" +#include "catalog/pg_class.h" +#include "common/hashfn.h" +#include "pagestore_client.h" #include "postmaster/interrupt.h" +#include "postmaster/autovacuum.h" #include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/relfilenode.h" #include "storage/buf_internals.h" +#include "storage/smgr.h" #include "storage/md.h" -#include "fmgr.h" -#include "miscadmin.h" #include "pgstat.h" -#include "catalog/pg_tablespace_d.h" -#include "postmaster/autovacuum.h" + #if PG_VERSION_NUM >= 150000 #include "access/xlogutils.h" @@ -113,48 +111,482 @@ typedef enum static SMgrRelation unlogged_build_rel = NULL; static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; - /* * Prefetch implementation: + * * Prefetch is performed locally by each backend. - * There can be up to MAX_PREFETCH_REQUESTS registered using smgr_prefetch - * before smgr_read. All this requests are appended to primary smgr_read request. - * It is assumed that pages will be requested in prefetch order. - * Reading of prefetch responses is delayed until them are actually needed (smgr_read). - * It make it possible to parallelize processing and receiving of prefetched pages. - * In case of prefetch miss or any other SMGR request other than smgr_read, - * all prefetch responses has to be consumed. + * + * There can be up to READ_BUFFER_SIZE active IO requests registered at any + * time. Requests using smgr_prefetch are sent to the pageserver, but we don't + * wait on the response. Requests using smgr_read are either read from the + * buffer, or (if that's not possible) we wait on the response to arrive - + * this also will allow us to receive other prefetched pages. 
+ * Each request is immediately written to the output buffer of the pageserver + * connection, but may not be flushed if smgr_prefetch is used: pageserver + * flushes sent requests on manual flush, or every neon.flush_output_after + * unflushed requests; which is not necessarily always and all the time. + * + * Once we have received a response, this value will be stored in the response + * buffer, indexed in a hash table. This allows us to retain our buffered + * prefetch responses even when we have cache misses. + * + * Reading of prefetch responses is delayed until them are actually needed + * (smgr_read). In case of prefetch miss or any other SMGR request other than + * smgr_read, all prefetch responses in the pipeline will need to be read from + * the connection; the responses are stored for later use. + * + * NOTE: The current implementation of the prefetch system implements a ring + * buffer of up to READ_BUFFER_SIZE requests. If there are more _read and + * _prefetch requests between the initial _prefetch and the _read of a buffer, + * the prefetch request will have been dropped from this prefetch buffer, and + * your prefetch was wasted. */ -#define MAX_PREFETCH_REQUESTS 128 +/* Max amount of tracked buffer reads */ +#define READ_BUFFER_SIZE 128 -BufferTag prefetch_requests[MAX_PREFETCH_REQUESTS]; -BufferTag prefetch_responses[MAX_PREFETCH_REQUESTS]; -int n_prefetch_requests; -int n_prefetch_responses; -int n_prefetched_buffers; -int n_prefetch_hits; -int n_prefetch_misses; -XLogRecPtr prefetch_lsn; +typedef enum PrefetchStatus { + PRFS_UNUSED = 0, /* unused slot */ + PRFS_REQUESTED, /* request was written to the sendbuffer to PS, but not + * necessarily flushed. + * all fields except response valid */ + PRFS_RECEIVED, /* all fields valid */ + PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still valid */ +} PrefetchStatus; +typedef struct PrefetchRequest { + BufferTag buftag; /* must be first entry in the struct */ + XLogRecPtr effective_request_lsn; + NeonResponse *response; /* may be null */ + PrefetchStatus status; + uint64 my_ring_index; +} PrefetchRequest; + +/* prefetch buffer lookup hash table */ + +typedef struct PrfHashEntry { + PrefetchRequest *slot; + uint32 status; + uint32 hash; +} PrfHashEntry; + +#define SH_PREFIX prfh +#define SH_ELEMENT_TYPE PrfHashEntry +#define SH_KEY_TYPE PrefetchRequest * +#define SH_KEY slot +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) ((a)->hash) +#define SH_HASH_KEY(tb, key) hash_bytes( \ + ((const unsigned char *) &(key)->buftag), \ + sizeof(BufferTag) \ +) + +#define SH_EQUAL(tb, a, b) (BUFFERTAGS_EQUAL((a)->buftag, (b)->buftag)) +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + +/* + * PrefetchState maintains the state of (prefetch) getPage@LSN requests. + * It maintains a (ring) buffer of in-flight requests and responses. + * + * We maintain several indexes into the ring buffer: + * ring_unused >= ring_receive >= ring_last >= 0 + * + * ring_unused points to the first unused slot of the buffer + * ring_receive is the next request that is to be received + * ring_last is the oldest received entry in the buffer + * + * Apart from being an entry in the ring buffer of prefetch requests, each + * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag. 
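A minimal Python sketch of the index bookkeeping described in this comment, using a toy ring size and omitting the hash table, LSN tracking, and flush handling:

```python
READ_BUFFER_SIZE = 4  # toy size; the patch uses 128


class PrefetchRing:
    """Sketch of PrefetchState's three monotonically growing ring indexes."""

    def __init__(self) -> None:
        self.prf_buffer = [None] * READ_BUFFER_SIZE
        self.ring_unused = 0   # first slot not yet handed out
        self.ring_receive = 0  # next slot that expects a response
        self.ring_last = 0     # oldest slot still tracked

    def send(self, buftag) -> int:
        # prefetch_do_request(): claim the slot at ring_unused, advance it.
        # (The real code waits for or evicts the oldest slot when full.)
        assert self.ring_unused - self.ring_last < READ_BUFFER_SIZE, "ring full"
        ring_index = self.ring_unused
        self.prf_buffer[ring_index % READ_BUFFER_SIZE] = ("REQUESTED", buftag, None)
        self.ring_unused += 1
        return ring_index

    def receive(self, response) -> None:
        # prefetch_read(): responses arrive in request order, so they always
        # land in the slot at ring_receive.
        slot = self.prf_buffer[self.ring_receive % READ_BUFFER_SIZE]
        assert slot is not None and slot[0] == "REQUESTED"
        self.prf_buffer[self.ring_receive % READ_BUFFER_SIZE] = ("RECEIVED", slot[1], response)
        self.ring_receive += 1

    def release(self, ring_index: int) -> None:
        # prefetch_set_unused() + prefetch_cleanup(): clear the slot, then
        # advance ring_last past any leading unused slots.
        self.prf_buffer[ring_index % READ_BUFFER_SIZE] = None
        while (self.ring_last < self.ring_receive
               and self.prf_buffer[self.ring_last % READ_BUFFER_SIZE] is None):
            self.ring_last += 1
```

The three indexes only ever grow; taking an index modulo READ_BUFFER_SIZE yields the physical slot, which is how the C code addresses prf_buffer.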
+ */ +typedef struct PrefetchState { + MemoryContext bufctx; /* context for prf_buffer[].response allocations */ + MemoryContext errctx; /* context for prf_buffer[].response allocations */ + MemoryContext hashctx; /* context for prf_buffer */ + + /* buffer indexes */ + uint64 ring_unused; /* first unused slot */ + uint64 ring_receive; /* next slot that is to receive a response */ + uint64 ring_last; /* min slot with a response value */ + + /* metrics / statistics */ + int n_responses_buffered; /* count of PS responses not yet in buffers */ + int n_requests_inflight; /* count of PS requests considered in flight */ + int n_unused; /* count of buffers < unused, > last, that are also unused */ + + /* the buffers */ + prfh_hash *prf_hash; + PrefetchRequest prf_buffer[READ_BUFFER_SIZE]; /* prefetch buffers */ +} PrefetchState; + +PrefetchState *MyPState; + +int n_prefetch_hits = 0; +int n_prefetch_misses = 0; +int n_prefetch_missed_caches = 0; +int n_prefetch_dupes = 0; + +XLogRecPtr prefetch_lsn = 0; + +static void consume_prefetch_responses(void); +static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn); +static void prefetch_read(PrefetchRequest *slot); +static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn); +static void prefetch_wait_for(uint64 ring_index); +static void prefetch_cleanup(void); +static inline void prefetch_set_unused(uint64 ring_index, bool hash_cleanup); + +static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode, + ForkNumber forknum, BlockNumber blkno); + + +/* + * Make sure that there are no responses still in the buffer. + */ static void consume_prefetch_responses(void) { - for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++) - { - NeonResponse *resp = page_server->receive(); + if (MyPState->ring_receive < MyPState->ring_unused) + prefetch_wait_for(MyPState->ring_unused - 1); +} - pfree(resp); +static void +prefetch_cleanup(void) +{ + int index; + uint64 ring_index; + PrefetchRequest *slot; + + while (MyPState->ring_last < MyPState->ring_receive) { + ring_index = MyPState->ring_last; + index = (ring_index % READ_BUFFER_SIZE); + slot = &MyPState->prf_buffer[index]; + + if (slot->status == PRFS_UNUSED) + MyPState->ring_last += 1; + else + break; } - n_prefetched_buffers = 0; - n_prefetch_responses = 0; +} + +/* + * Wait for slot of ring_index to have received its response. + * The caller is responsible for making sure the request buffer is flushed. + */ +static void +prefetch_wait_for(uint64 ring_index) +{ + int index; + PrefetchRequest *entry; + + Assert(MyPState->ring_unused > ring_index); + + while (MyPState->ring_receive <= ring_index) + { + index = (MyPState->ring_receive % READ_BUFFER_SIZE); + entry = &MyPState->prf_buffer[index]; + + Assert(entry->status == PRFS_REQUESTED); + prefetch_read(entry); + } +} + +/* + * Read the response of a prefetch request into its slot. + * + * The caller is responsible for making sure that the request for this buffer + * was flushed to the PageServer. 
+ */ +static void +prefetch_read(PrefetchRequest *slot) +{ + NeonResponse *response; + MemoryContext old; + + Assert(slot->status == PRFS_REQUESTED); + Assert(slot->response == NULL); + Assert(slot->my_ring_index == MyPState->ring_receive); + + old = MemoryContextSwitchTo(MyPState->errctx); + response = (NeonResponse *) page_server->receive(); + MemoryContextSwitchTo(old); + + /* update prefetch state */ + MyPState->n_responses_buffered += 1; + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + + /* update slot state */ + slot->status = PRFS_RECEIVED; + slot->response = response; +} + +/* + * Disconnect hook - drop prefetches when the connection drops + * + * If we don't remove the failed prefetches, we'd be serving incorrect + * data to the smgr. + */ +void +prefetch_on_ps_disconnect(void) +{ + for (; MyPState->ring_receive < MyPState->ring_unused; MyPState->ring_receive++) + { + PrefetchRequest *slot; + int index = MyPState->ring_receive % READ_BUFFER_SIZE; + + slot = &MyPState->prf_buffer[index]; + Assert(slot->status == PRFS_REQUESTED); + Assert(slot->my_ring_index == MyPState->ring_receive); + + /* clean up the request */ + slot->status = PRFS_TAG_REMAINS; + MyPState->n_requests_inflight--; + prefetch_set_unused(MyPState->ring_receive, true); + } +} + +/* + * prefetch_set_unused() - clear a received prefetch slot + * + * The slot at ring_index must be a current member of the ring buffer, + * and may not be in the PRFS_REQUESTED state. + */ +static inline void +prefetch_set_unused(uint64 ring_index, bool hash_cleanup) +{ + PrefetchRequest *slot = &MyPState->prf_buffer[ring_index % READ_BUFFER_SIZE]; + + Assert(MyPState->ring_last <= ring_index && + MyPState->ring_unused > ring_index); + + if (slot->status == PRFS_UNUSED) + return; + + Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS); + Assert(ring_index >= MyPState->ring_last && + ring_index < MyPState->ring_unused); + + if (slot->status == PRFS_RECEIVED) + { + pfree(slot->response); + slot->response = NULL; + + MyPState->n_responses_buffered -= 1; + MyPState->n_unused += 1; + } + else + { + Assert(slot->response == NULL); + } + + if (hash_cleanup) + prfh_delete(MyPState->prf_hash, slot); + + /* clear all fields */ + MemSet(slot, 0, sizeof(PrefetchRequest)); + slot->status = PRFS_UNUSED; + + /* run cleanup if we're holding back ring_last */ + if (MyPState->ring_last == ring_index) + prefetch_cleanup(); +} + +static void +prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn) +{ + NeonGetPageRequest request = { + .req.tag = T_NeonGetPageRequest, + .req.latest = false, + .req.lsn = 0, + .rnode = slot->buftag.rnode, + .forknum = slot->buftag.forkNum, + .blkno = slot->buftag.blockNum, + }; + + if (force_lsn && force_latest) + { + request.req.lsn = *force_lsn; + request.req.latest = *force_latest; + slot->effective_request_lsn = *force_lsn; + } + else + { + XLogRecPtr lsn = neon_get_request_lsn( + &request.req.latest, + slot->buftag.rnode, + slot->buftag.forkNum, + slot->buftag.blockNum + ); + /* + * Note: effective_request_lsn is potentially higher than the requested + * LSN, but still correct: + * + * We know there are no changes between the actual requested LSN and + * the value of effective_request_lsn: If there were, the page would + * have been in cache and evicted between those LSN values, which + * then would have had to result in a larger request LSN for this page. 
+ * + * It is possible that a concurrent backend loads the page, modifies + * it and then evicts it again, but the LSN of that eviction cannot be + * smaller than the current WAL insert/redo pointer, which is already + * larger than this prefetch_lsn. So in any case, that would + * invalidate this cache. + * + * The best LSN to use for effective_request_lsn would be + * XLogCtl->Insert.RedoRecPtr, but that's expensive to access. + */ + request.req.lsn = lsn; + prefetch_lsn = Max(prefetch_lsn, lsn); + slot->effective_request_lsn = prefetch_lsn; + } + + Assert(slot->response == NULL); + Assert(slot->my_ring_index == MyPState->ring_unused); + page_server->send((NeonRequest *) &request); + + /* update prefetch state */ + MyPState->n_requests_inflight += 1; + MyPState->n_unused -= 1; + MyPState->ring_unused += 1; + + /* update slot state */ + slot->status = PRFS_REQUESTED; +} + +/* + * prefetch_register_buffer() - register and prefetch buffer + * + * Register that we may want the contents of BufferTag in the near future. + * + * If force_latest and force_lsn are not NULL, those values are sent to the + * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure + * to fill in these values manually. + */ + +static uint64 +prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn) +{ + int index; + bool found; + uint64 ring_index; + PrefetchRequest req; + PrefetchRequest *slot; + PrfHashEntry *entry; + + /* use an intermediate PrefetchRequest struct to ensure correct alignment */ + req.buftag = tag; + + entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req); + + if (entry != NULL) + { + slot = entry->slot; + ring_index = slot->my_ring_index; + index = (ring_index % READ_BUFFER_SIZE); + Assert(slot == &MyPState->prf_buffer[index]); + + Assert(slot->status != PRFS_UNUSED); + Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); + + /* + * If we want a specific lsn, we do not accept requests that were made + * with a potentially different LSN. + */ + if (force_lsn && slot->effective_request_lsn != *force_lsn) + { + prefetch_wait_for(ring_index); + prefetch_set_unused(ring_index, true); + } + /* + * We received a prefetch for a page that was recently read and + * removed from the buffers. Remove that request from the buffers. + */ + else if (slot->status == PRFS_TAG_REMAINS) + { + prefetch_set_unused(ring_index, true); + } + else + { + /* The buffered request is good enough, return that index */ + n_prefetch_dupes++; + return ring_index; + } + } + + /* + * If the prefetch queue is full, we need to make room by clearing the + * oldest slot. If the oldest slot holds a buffer that was already + * received, we can just throw it away; we fetched the page unnecessarily + * in that case. If the oldest slot holds a request that we haven't + * received a response for yet, we have to wait for the response to that + * before we can continue. We might not have even flushed the request to + * the pageserver yet, it might be just sitting in the output buffer. In + * that case, we flush it and wait for the response. 
(We could decide not + * to send it, but it's hard to abort when the request is already in the + * output buffer, and 'not sending' a prefetch request kind of goes + * against the principles of prefetching) + */ + if (MyPState->ring_last + READ_BUFFER_SIZE - 1 == MyPState->ring_unused) + { + slot = &MyPState->prf_buffer[(MyPState->ring_last % READ_BUFFER_SIZE)]; + + Assert(slot->status != PRFS_UNUSED); + + /* We have the slot for ring_last, so that must still be in progress */ + switch (slot->status) + { + case PRFS_REQUESTED: + Assert(MyPState->ring_receive == MyPState->ring_last); + prefetch_wait_for(MyPState->ring_last); + prefetch_set_unused(MyPState->ring_last, true); + break; + case PRFS_RECEIVED: + case PRFS_TAG_REMAINS: + prefetch_set_unused(MyPState->ring_last, true); + break; + default: + pg_unreachable(); + } + } + + /* + * The next buffer pointed to by `ring_unused` is now unused, so we can insert + * the new request to it. + */ + ring_index = MyPState->ring_unused; + index = (ring_index % READ_BUFFER_SIZE); + slot = &MyPState->prf_buffer[index]; + + Assert(MyPState->ring_last <= ring_index); + + Assert(slot->status == PRFS_UNUSED); + + /* + * We must update the slot data before insertion, because the hash + * function reads the buffer tag from the slot. + */ + slot->buftag = tag; + slot->my_ring_index = ring_index; + + prfh_insert(MyPState->prf_hash, slot, &found); + Assert(!found); + + prefetch_do_request(slot, force_latest, force_lsn); + Assert(slot->status == PRFS_REQUESTED); + Assert(ring_index < MyPState->ring_unused); + return ring_index; } static NeonResponse * page_server_request(void const *req) { + page_server->send((NeonRequest *) req); + page_server->flush(); consume_prefetch_responses(); - return page_server->request((NeonRequest *) req); + return page_server->receive(); } @@ -268,12 +700,15 @@ nm_unpack_response(StringInfo s) case T_NeonGetPageResponse: { - NeonGetPageResponse *msg_resp = palloc0(offsetof(NeonGetPageResponse, page) + BLCKSZ); + NeonGetPageResponse *msg_resp; + msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE); msg_resp->tag = tag; /* XXX: should be varlena */ memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); pq_getmsgend(s); + + Assert(msg_resp->tag == T_NeonGetPageResponse); resp = (NeonResponse *) msg_resp; break; @@ -617,7 +1052,32 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch void neon_init(void) { - /* noop */ + HASHCTL info; + + if (MyPState != NULL) + return; + + MyPState = MemoryContextAllocZero(TopMemoryContext, sizeof(PrefetchState)); + + MyPState->n_unused = READ_BUFFER_SIZE; + + MyPState->bufctx = SlabContextCreate(TopMemoryContext, + "NeonSMGR/prefetch", + SLAB_DEFAULT_BLOCK_SIZE * 17, + PS_GETPAGERESPONSE_SIZE); + MyPState->errctx = AllocSetContextCreate(TopMemoryContext, + "NeonSMGR/errors", + ALLOCSET_DEFAULT_SIZES); + MyPState->hashctx = AllocSetContextCreate(TopMemoryContext, + "NeonSMGR/prefetch", + ALLOCSET_DEFAULT_SIZES); + + info.keysize = sizeof(BufferTag); + info.entrysize = sizeof(uint64); + + MyPState->prf_hash = prfh_create(MyPState->hashctx, + READ_BUFFER_SIZE, NULL); + #ifdef DEBUG_COMPARE_LOCAL mdinit(); #endif @@ -1004,27 +1464,17 @@ neon_close(SMgrRelation reln, ForkNumber forknum) } -/* - * neon_reset_prefetch() -- reoe all previously rgistered prefeth requests - */ -void -neon_reset_prefetch(SMgrRelation reln) -{ - n_prefetch_requests = 0; -} - /* * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation */ bool 
neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { + uint64 ring_index; + switch (reln->smgr_relpersistence) { - case 0: - /* probably shouldn't happen, but ignore it */ - break; - + case 0: /* probably shouldn't happen, but ignore it */ case RELPERSISTENCE_PERMANENT: break; @@ -1036,14 +1486,17 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - if (n_prefetch_requests < MAX_PREFETCH_REQUESTS) - { - prefetch_requests[n_prefetch_requests].rnode = reln->smgr_rnode.node; - prefetch_requests[n_prefetch_requests].forkNum = forknum; - prefetch_requests[n_prefetch_requests].blockNum = blocknum; - n_prefetch_requests += 1; - return true; - } + BufferTag tag = (BufferTag) { + .rnode = reln->smgr_rnode.node, + .forkNum = forknum, + .blockNum = blocknum + }; + + ring_index = prefetch_register_buffer(tag, NULL, NULL); + + Assert(ring_index < MyPState->ring_unused && + MyPState->ring_last <= ring_index); + return false; } @@ -1094,81 +1547,72 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr request_lsn, bool request_latest, char *buffer) { NeonResponse *resp; - int i; + BufferTag buftag; + uint64 ring_index; + PrfHashEntry *entry; + PrefetchRequest *slot; + + buftag = (BufferTag) { + .rnode = rnode, + .forkNum = forkNum, + .blockNum = blkno, + }; /* - * Try to find prefetched page. It is assumed that pages will be requested - * in the same order as them are prefetched, but some other backend may - * load page in shared buffers, so some prefetch responses should be - * skipped. + * Try to find prefetched page in the list of received pages. */ - for (i = n_prefetched_buffers; i < n_prefetch_responses; i++) - { - resp = page_server->receive(); - if (resp->tag == T_NeonGetPageResponse && - RelFileNodeEquals(prefetch_responses[i].rnode, rnode) && - prefetch_responses[i].forkNum == forkNum && - prefetch_responses[i].blockNum == blkno) - { - char *page = ((NeonGetPageResponse *) resp)->page; + entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); + if (entry != NULL) + { + if (entry->slot->effective_request_lsn >= prefetch_lsn) + { + slot = entry->slot; + ring_index = slot->my_ring_index; + n_prefetch_hits += 1; + } + else /* the current prefetch LSN is not large enough, so drop the prefetch */ + { /* - * Check if prefetched page is still relevant. If it is updated by - * some other backend, then it should not be requested from smgr - * unless it is evicted from shared buffers. In the last case - * last_evicted_lsn should be updated and request_lsn should be - * greater than prefetch_lsn. Maximum with page LSN is used - * because page returned by page server may have LSN either - * greater either smaller than requested. + * We can't drop cache for not-yet-received requested items. It is + * unlikely this happens, but it can happen if prefetch distance is + * large enough and a backend didn't consume all prefetch requests. 
*/ - if (Max(prefetch_lsn, PageGetLSN(page)) >= request_lsn) + if (entry->slot->status == PRFS_REQUESTED) { - n_prefetched_buffers = i + 1; - n_prefetch_hits += 1; - n_prefetch_requests = 0; - memcpy(buffer, page, BLCKSZ); - pfree(resp); - return; + page_server->flush(); + prefetch_wait_for(entry->slot->my_ring_index); } + /* drop caches */ + prefetch_set_unused(entry->slot->my_ring_index, true); + n_prefetch_missed_caches += 1; + /* make it look like a prefetch cache miss */ + entry = NULL; } - pfree(resp); } - n_prefetched_buffers = 0; - n_prefetch_responses = 0; - n_prefetch_misses += 1; - { - NeonGetPageRequest request = { - .req.tag = T_NeonGetPageRequest, - .req.latest = request_latest, - .req.lsn = request_lsn, - .rnode = rnode, - .forknum = forkNum, - .blkno = blkno - }; - if (n_prefetch_requests > 0) - { - /* Combine all prefetch requests with primary request */ - page_server->send((NeonRequest *) & request); - for (i = 0; i < n_prefetch_requests; i++) - { - request.rnode = prefetch_requests[i].rnode; - request.forknum = prefetch_requests[i].forkNum; - request.blkno = prefetch_requests[i].blockNum; - prefetch_responses[i] = prefetch_requests[i]; - page_server->send((NeonRequest *) & request); - } - page_server->flush(); - n_prefetch_responses = n_prefetch_requests; - n_prefetch_requests = 0; - prefetch_lsn = request_lsn; - resp = page_server->receive(); - } - else - { - resp = page_server->request((NeonRequest *) & request); - } + if (entry == NULL) + { + n_prefetch_misses += 1; + + ring_index = prefetch_register_buffer(buftag, &request_latest, + &request_lsn); + slot = &MyPState->prf_buffer[(ring_index % READ_BUFFER_SIZE)]; } + + Assert(MyPState->ring_last <= ring_index && + MyPState->ring_unused > ring_index); + Assert(slot->my_ring_index == ring_index); + Assert(slot->status != PRFS_UNUSED); + Assert(&MyPState->prf_buffer[(ring_index % READ_BUFFER_SIZE)] == slot); + + page_server->flush(); + prefetch_wait_for(ring_index); + + Assert(slot->status == PRFS_RECEIVED); + + resp = slot->response; + switch (resp->tag) { case T_NeonGetPageResponse: @@ -1188,12 +1632,13 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; - default: elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); } - pfree(resp); + /* buffer was used, clean up for later reuse */ + prefetch_set_unused(ring_index, true); + prefetch_cleanup(); } /* @@ -1815,7 +2260,6 @@ static const struct f_smgr neon_smgr = .smgr_unlink = neon_unlink, .smgr_extend = neon_extend, .smgr_prefetch = neon_prefetch, - .smgr_reset_prefetch = neon_reset_prefetch, .smgr_read = neon_read, .smgr_write = neon_write, .smgr_writeback = neon_writeback, diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c0284ce58e..e56b812dd8 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c0284ce58e7ee64e2307dd6fbffe1eaf4c23e5d1 +Subproject commit e56b812dd85a3d9355478cc626c10909406816ba diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index e5cc262697..39e3d745b3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit e5cc2626970c9a4c9b7f1df3a457584c6bd071ad +Subproject commit 39e3d745b3701a3f47f40412fbec62cbb01a42bf From c1a76eb0e5cae664979479b362038166c533bc52 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 7 Nov 2022 18:39:51 +0000 Subject: [PATCH 0999/1022] test_runner: replace global variables with fixtures 
(#2754) This PR replaces the following global variables in the test framework with fixtures to make tests more configurable. I mainly need this for the forward compatibility tests (draft in https://github.com/neondatabase/neon/pull/2766). ``` base_dir neon_binpath pg_distrib_dir top_output_dir default_pg_version (this one got replaced with a fixture named pg_version) ``` Also, this PR adds more `Path` type where the code implies it. --- test_runner/fixtures/neon_fixtures.py | 248 ++++++++++-------- test_runner/fixtures/utils.py | 7 +- test_runner/pg_clients/test_pg_clients.py | 4 +- test_runner/regress/test_compatibility.py | 9 +- test_runner/regress/test_fullbackup.py | 20 +- test_runner/regress/test_import.py | 14 +- test_runner/regress/test_pageserver_api.py | 10 +- test_runner/regress/test_pg_regress.py | 92 ++++--- test_runner/regress/test_tenant_relocation.py | 31 +-- test_runner/regress/test_timeline_size.py | 3 +- test_runner/regress/test_wal_acceptor.py | 10 +- test_runner/regress/test_wal_restore.py | 20 +- 12 files changed, 269 insertions(+), 199 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 63b809a786..715c0753af 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -65,17 +65,8 @@ BASE_PORT = 15000 WORKER_PORT_NUM = 1000 -# These are set in pytest_configure() -base_dir = "" -neon_binpath = "" -pg_distrib_dir = "" -top_output_dir = "" -default_pg_version = "" - - def pytest_configure(config): """ - Ensure that no unwanted daemons are running before we start testing. Check that we do not overflow available ports range. """ @@ -85,67 +76,89 @@ def pytest_configure(config): ): # do not use ephemeral ports raise Exception("Too many workers configured. Cannot distribute ports for services.") + +@pytest.fixture(scope="session") +def base_dir() -> Iterator[Path]: # find the base directory (currently this is the git root) - global base_dir - base_dir = os.path.normpath(os.path.join(get_self_dir(), "../..")) + base_dir = get_self_dir().parent.parent log.info(f"base_dir is {base_dir}") - # Compute the top-level directory for all tests. - global top_output_dir - env_test_output = os.environ.get("TEST_OUTPUT") - if env_test_output is not None: - top_output_dir = env_test_output - else: - top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR) - Path(top_output_dir).mkdir(exist_ok=True) + yield base_dir - # Find the postgres installation. - global default_pg_version - log.info(f"default_pg_version is {default_pg_version}") - env_default_pg_version = os.environ.get("DEFAULT_PG_VERSION") - if env_default_pg_version: - default_pg_version = env_default_pg_version - log.info(f"default_pg_version is set to {default_pg_version}") - else: - default_pg_version = DEFAULT_PG_VERSION_DEFAULT - - global pg_distrib_dir - - env_postgres_bin = os.environ.get("POSTGRES_DISTRIB_DIR") - if env_postgres_bin: - pg_distrib_dir = env_postgres_bin - else: - pg_distrib_dir = os.path.normpath(os.path.join(base_dir, "pg_install")) - - log.info(f"pg_distrib_dir is {pg_distrib_dir}") - psql_bin_path = os.path.join(pg_distrib_dir, "v{}".format(default_pg_version), "bin/psql") - postgres_bin_path = os.path.join( - pg_distrib_dir, "v{}".format(default_pg_version), "bin/postgres" - ) - - if os.getenv("REMOTE_ENV"): - # When testing against a remote server, we only need the client binary. 
- if not os.path.exists(psql_bin_path): - raise Exception('psql not found at "{}"'.format(psql_bin_path)) - else: - if not os.path.exists(postgres_bin_path): - raise Exception('postgres not found at "{}"'.format(postgres_bin_path)) +@pytest.fixture(scope="session") +def neon_binpath(base_dir: Path) -> Iterator[Path]: if os.getenv("REMOTE_ENV"): # we are in remote env and do not have neon binaries locally # this is the case for benchmarks run on self-hosted runner return + # Find the neon binaries. - global neon_binpath - env_neon_bin = os.environ.get("NEON_BIN") - if env_neon_bin: - neon_binpath = env_neon_bin + if env_neon_bin := os.environ.get("NEON_BIN"): + binpath = Path(env_neon_bin) else: build_type = os.environ.get("BUILD_TYPE", "debug") - neon_binpath = os.path.join(base_dir, "target", build_type) - log.info(f"neon_binpath is {neon_binpath}") - if not os.path.exists(os.path.join(neon_binpath, "pageserver")): - raise Exception('neon binaries not found at "{}"'.format(neon_binpath)) + binpath = base_dir / "target" / build_type + log.info(f"neon_binpath is {binpath}") + + if not (binpath / "pageserver").exists(): + raise Exception(f"neon binaries not found at '{binpath}'") + + yield binpath + + +@pytest.fixture(scope="session") +def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: + if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"): + distrib_dir = Path(env_postgres_bin).resolve() + else: + distrib_dir = base_dir / "pg_install" + + log.info(f"pg_distrib_dir is {distrib_dir}") + yield distrib_dir + + +@pytest.fixture(scope="session") +def top_output_dir(base_dir: Path) -> Iterator[Path]: + # Compute the top-level directory for all tests. + if env_test_output := os.environ.get("TEST_OUTPUT"): + output_dir = Path(env_test_output).resolve() + else: + output_dir = base_dir / DEFAULT_OUTPUT_DIR + output_dir.mkdir(exist_ok=True) + + log.info(f"top_output_dir is {output_dir}") + yield output_dir + + +@pytest.fixture(scope="session") +def pg_version() -> Iterator[str]: + if env_default_pg_version := os.environ.get("DEFAULT_PG_VERSION"): + version = env_default_pg_version + else: + version = DEFAULT_PG_VERSION_DEFAULT + + log.info(f"pg_version is {version}") + yield version + + +@pytest.fixture(scope="session") +def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: str) -> Iterator[Path]: + versioned_dir = pg_distrib_dir / f"v{pg_version}" + + psql_bin_path = versioned_dir / "bin/psql" + postgres_bin_path = versioned_dir / "bin/postgres" + + if os.getenv("REMOTE_ENV"): + # When testing against a remote server, we only need the client binary. 
+ if not psql_bin_path.exists(): + raise Exception(f"psql not found at '{psql_bin_path}'") + else: + if not postgres_bin_path.exists: + raise Exception(f"postgres not found at '{postgres_bin_path}'") + + log.info(f"versioned_pg_distrib_dir is {versioned_dir}") + yield versioned_dir def shareable_scope(fixture_name, config) -> Literal["session", "function"]: @@ -232,16 +245,18 @@ def port_distributor(worker_base_port): @pytest.fixture(scope="session") -def default_broker(request: Any, port_distributor: PortDistributor): +def default_broker(request: Any, port_distributor: PortDistributor, top_output_dir: Path): client_port = port_distributor.get_port() # multiple pytest sessions could get launched in parallel, get them different datadirs - etcd_datadir = os.path.join(get_test_output_dir(request), f"etcd_datadir_{client_port}") - Path(etcd_datadir).mkdir(exist_ok=True, parents=True) + etcd_datadir = get_test_output_dir(request, top_output_dir) / f"etcd_datadir_{client_port}" + etcd_datadir.mkdir(exist_ok=True, parents=True) - broker = Etcd(datadir=etcd_datadir, port=client_port, peer_port=port_distributor.get_port()) + broker = Etcd( + datadir=str(etcd_datadir), port=client_port, peer_port=port_distributor.get_port() + ) yield broker broker.stop() - allure_attach_from_dir(Path(etcd_datadir)) + allure_attach_from_dir(etcd_datadir) @pytest.fixture(scope="session") @@ -521,6 +536,9 @@ class NeonEnvBuilder: broker: Etcd, run_id: uuid.UUID, mock_s3_server: MockS3Server, + neon_binpath: Path, + pg_distrib_dir: Path, + pg_version: str, remote_storage: Optional[RemoteStorage] = None, remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, pageserver_config_override: Optional[str] = None, @@ -550,7 +568,9 @@ class NeonEnvBuilder: self.env: Optional[NeonEnv] = None self.remote_storage_prefix: Optional[str] = None self.keep_remote_storage_contents: bool = True - self.pg_version = default_pg_version + self.neon_binpath = neon_binpath + self.pg_distrib_dir = pg_distrib_dir + self.pg_version = pg_version def init(self) -> NeonEnv: # Cannot create more than one environment from one builder @@ -766,6 +786,8 @@ class NeonEnv: self.remote_storage = config.remote_storage self.remote_storage_users = config.remote_storage_users self.pg_version = config.pg_version + self.neon_binpath = config.neon_binpath + self.pg_distrib_dir = config.pg_distrib_dir # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. @@ -861,7 +883,7 @@ class NeonEnv: return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) def get_pageserver_version(self) -> str: - bin_pageserver = os.path.join(str(neon_binpath), "pageserver") + bin_pageserver = str(self.neon_binpath / "pageserver") res = subprocess.run( [bin_pageserver, "--version"], check=True, @@ -885,6 +907,10 @@ def _shared_simple_env( mock_s3_server: MockS3Server, default_broker: Etcd, run_id: uuid.UUID, + top_output_dir: Path, + neon_binpath: Path, + pg_distrib_dir: Path, + pg_version: str, ) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES @@ -893,17 +919,20 @@ def _shared_simple_env( if os.environ.get("TEST_SHARED_FIXTURES") is None: # Create the environment in the per-test output directory - repo_dir = os.path.join(get_test_output_dir(request), "repo") + repo_dir = get_test_output_dir(request, top_output_dir) / "repo" else: # We're running shared fixtures. Share a single directory. 
- repo_dir = os.path.join(str(top_output_dir), "shared_repo") + repo_dir = top_output_dir / "shared_repo" shutil.rmtree(repo_dir, ignore_errors=True) with NeonEnvBuilder( - repo_dir=Path(repo_dir), + repo_dir=repo_dir, port_distributor=port_distributor, broker=default_broker, mock_s3_server=mock_s3_server, + neon_binpath=neon_binpath, + pg_distrib_dir=pg_distrib_dir, + pg_version=pg_version, run_id=run_id, ) as builder: env = builder.init_start() @@ -934,6 +963,9 @@ def neon_env_builder( test_output_dir, port_distributor: PortDistributor, mock_s3_server: MockS3Server, + neon_binpath: Path, + pg_distrib_dir: Path, + pg_version: str, default_broker: Etcd, run_id: uuid.UUID, ) -> Iterator[NeonEnvBuilder]: @@ -958,6 +990,9 @@ def neon_env_builder( repo_dir=Path(repo_dir), port_distributor=port_distributor, mock_s3_server=mock_s3_server, + neon_binpath=neon_binpath, + pg_distrib_dir=pg_distrib_dir, + pg_version=pg_version, broker=default_broker, run_id=run_id, ) as builder: @@ -1240,7 +1275,7 @@ class AbstractNeonCli(abc.ABC): assert type(arguments) == list assert type(self.COMMAND) == str - bin_neon = os.path.join(str(neon_binpath), self.COMMAND) + bin_neon = str(self.env.neon_binpath / self.COMMAND) args = [bin_neon] + arguments log.info('Running command "{}"'.format(" ".join(args))) @@ -1248,7 +1283,7 @@ class AbstractNeonCli(abc.ABC): env_vars = os.environ.copy() env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir) - env_vars["POSTGRES_DISTRIB_DIR"] = str(pg_distrib_dir) + env_vars["POSTGRES_DISTRIB_DIR"] = str(self.env.pg_distrib_dir) if self.env.rust_log_override is not None: env_vars["RUST_LOG"] = self.env.rust_log_override for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): @@ -1723,17 +1758,17 @@ def append_pageserver_param_overrides( class PgBin: """A helper class for executing postgres binaries""" - def __init__(self, log_dir: Path, pg_version: str): + def __init__(self, log_dir: Path, pg_distrib_dir: Path, pg_version: str): self.log_dir = log_dir self.pg_version = pg_version - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") - self.pg_lib_dir = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "lib") + self.pg_bin_path = pg_distrib_dir / f"v{pg_version}" / "bin" + self.pg_lib_dir = pg_distrib_dir / f"v{pg_version}" / "lib" self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = self.pg_lib_dir + self.env["LD_LIBRARY_PATH"] = str(self.pg_lib_dir) def _fixpath(self, command: List[str]): - if "/" not in command[0]: - command[0] = os.path.join(self.pg_bin_path, command[0]) + if "/" not in str(command[0]): + command[0] = str(self.pg_bin_path / command[0]) def _build_env(self, env_add: Optional[Env]) -> Env: if env_add is None: @@ -1757,7 +1792,7 @@ class PgBin: """ self._fixpath(command) - log.info('Running command "{}"'.format(" ".join(command))) + log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) subprocess.run(command, env=env, cwd=cwd, check=True) @@ -1776,16 +1811,14 @@ class PgBin: """ self._fixpath(command) - log.info('Running command "{}"'.format(" ".join(command))) + log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) - return subprocess_capture( - str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs - ) + return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs) @pytest.fixture(scope="function") -def pg_bin(test_output_dir: Path, pg_version: str) -> PgBin: - return PgBin(test_output_dir, pg_version) +def 
pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: str) -> PgBin: + return PgBin(test_output_dir, pg_distrib_dir, pg_version) class VanillaPostgres(PgProtocol): @@ -1832,19 +1865,15 @@ class VanillaPostgres(PgProtocol): self.stop() -@pytest.fixture(scope="session") -def pg_version() -> str: - return default_pg_version - - @pytest.fixture(scope="function") def vanilla_pg( test_output_dir: Path, port_distributor: PortDistributor, + pg_distrib_dir: Path, pg_version: str, ) -> Iterator[VanillaPostgres]: pgdatadir = test_output_dir / "pgdata-vanilla" - pg_bin = PgBin(test_output_dir, pg_version) + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: yield vanilla_pg @@ -1880,8 +1909,10 @@ class RemotePostgres(PgProtocol): @pytest.fixture(scope="function") -def remote_pg(test_output_dir: Path, pg_version: str) -> Iterator[RemotePostgres]: - pg_bin = PgBin(test_output_dir, pg_version) +def remote_pg( + test_output_dir: Path, pg_distrib_dir: Path, pg_version: str +) -> Iterator[RemotePostgres]: + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) connstr = os.getenv("BENCHMARK_CONNSTR") if connstr is None: @@ -1926,10 +1957,18 @@ class PSQL: class NeonProxy(PgProtocol): - def __init__(self, proxy_port: int, http_port: int, auth_endpoint=None, mgmt_port=None): + def __init__( + self, + proxy_port: int, + http_port: int, + neon_binpath: Path, + auth_endpoint=None, + mgmt_port=None, + ): super().__init__(dsn=auth_endpoint, port=proxy_port) self.host = "127.0.0.1" self.http_port = http_port + self.neon_binpath = neon_binpath self.proxy_port = proxy_port self.mgmt_port = mgmt_port self.auth_endpoint = auth_endpoint @@ -1945,7 +1984,7 @@ class NeonProxy(PgProtocol): # Start proxy args = [ - os.path.join(neon_binpath, "proxy"), + str(self.neon_binpath / "proxy"), *["--http", f"{self.host}:{self.http_port}"], *["--proxy", f"{self.host}:{self.proxy_port}"], *["--auth-backend", "postgres"], @@ -1961,7 +2000,7 @@ class NeonProxy(PgProtocol): assert self._popen is None # Start proxy - bin_proxy = os.path.join(str(neon_binpath), "proxy") + bin_proxy = str(self.neon_binpath / "proxy") args = [bin_proxy] args.extend(["--http", f"{self.host}:{self.http_port}"]) args.extend(["--proxy", f"{self.host}:{self.proxy_port}"]) @@ -1993,18 +2032,18 @@ class NeonProxy(PgProtocol): @pytest.fixture(scope="function") -def link_proxy(port_distributor) -> Iterator[NeonProxy]: +def link_proxy(port_distributor, neon_binpath: Path) -> Iterator[NeonProxy]: """Neon proxy that routes through link auth.""" http_port = port_distributor.get_port() proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() - with NeonProxy(proxy_port, http_port, mgmt_port=mgmt_port) as proxy: + with NeonProxy(proxy_port, http_port, neon_binpath=neon_binpath, mgmt_port=mgmt_port) as proxy: proxy.start_with_link_auth() yield proxy @pytest.fixture(scope="function") -def static_proxy(vanilla_pg, port_distributor) -> Iterator[NeonProxy]: +def static_proxy(vanilla_pg, port_distributor, neon_binpath: Path) -> Iterator[NeonProxy]: """Neon proxy that routes directly to vanilla postgres.""" # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql` @@ -2020,7 +2059,10 @@ def static_proxy(vanilla_pg, port_distributor) -> Iterator[NeonProxy]: http_port = port_distributor.get_port() with NeonProxy( - proxy_port=proxy_port, http_port=http_port, auth_endpoint=auth_endpoint + proxy_port=proxy_port, + 
http_port=http_port, + neon_binpath=neon_binpath, + auth_endpoint=auth_endpoint, ) as proxy: proxy.start() yield proxy @@ -2523,10 +2565,10 @@ class Etcd: self.handle.wait() -def get_test_output_dir(request: Any) -> Path: +def get_test_output_dir(request: Any, top_output_dir: Path) -> Path: """Compute the working directory for an individual test.""" test_name = request.node.name - test_dir = Path(top_output_dir) / test_name.replace("/", "-") + test_dir = top_output_dir / test_name.replace("/", "-") log.info(f"get_test_output_dir is {test_dir}") # make mypy happy assert isinstance(test_dir, Path) @@ -2543,11 +2585,11 @@ def get_test_output_dir(request: Any) -> Path: # this fixture ensures that the directory exists. That works because # 'autouse' fixtures are run before other fixtures. @pytest.fixture(scope="function", autouse=True) -def test_output_dir(request: Any) -> Iterator[Path]: +def test_output_dir(request: Any, top_output_dir: Path) -> Iterator[Path]: """Create the working directory for an individual test.""" # one directory per test - test_dir = get_test_output_dir(request) + test_dir = get_test_output_dir(request, top_output_dir) log.info(f"test_output_dir is {test_dir}") shutil.rmtree(test_dir, ignore_errors=True) test_dir.mkdir() @@ -2639,7 +2681,7 @@ def check_restored_datadir_content( restored_dir_path = env.repo_dir / f"{pg.node_name}_restored_datadir" restored_dir_path.mkdir(exist_ok=True) - pg_bin = PgBin(test_output_dir, env.pg_version) + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) psql_path = os.path.join(pg_bin.pg_bin_path, "psql") cmd = rf""" diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 1242305ec3..e73453f2c4 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -15,12 +15,13 @@ from psycopg2.extensions import cursor Fn = TypeVar("Fn", bound=Callable[..., Any]) -def get_self_dir() -> str: +def get_self_dir() -> Path: """Get the path to the directory where this script lives.""" - return os.path.dirname(os.path.abspath(__file__)) + # return os.path.dirname(os.path.abspath(__file__)) + return Path(__file__).resolve().parent -def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: +def subprocess_capture(capture_dir: Path, cmd: List[str], **kwargs: Any) -> str: """Run a process and capture its output Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py index 2dbab19e7a..6ffe3bf918 100644 --- a/test_runner/pg_clients/test_pg_clients.py +++ b/test_runner/pg_clients/test_pg_clients.py @@ -46,9 +46,9 @@ def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: st raise RuntimeError("docker is required for running this test") build_cmd = [docker_bin, "build", "--tag", image_tag, f"{Path(__file__).parent / client}"] - subprocess_capture(str(test_output_dir), build_cmd, check=True) + subprocess_capture(test_output_dir, build_cmd, check=True) run_cmd = [docker_bin, "run", "--rm", "--env-file", env_file, image_tag] - basepath = subprocess_capture(str(test_output_dir), run_cmd, check=True) + basepath = subprocess_capture(test_output_dir, run_cmd, check=True) assert Path(f"{basepath}.stdout").read_text().strip() == "1" diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 100027048f..b0643ec05e 100644 --- a/test_runner/regress/test_compatibility.py +++ 
b/test_runner/regress/test_compatibility.py @@ -80,7 +80,12 @@ class PortReplacer(object): @pytest.mark.order(after="test_prepare_snapshot") def test_backward_compatibility( - pg_bin: PgBin, port_distributor: PortDistributor, test_output_dir: Path, request: FixtureRequest + pg_bin: PgBin, + port_distributor: PortDistributor, + test_output_dir: Path, + request: FixtureRequest, + neon_binpath: Path, + pg_distrib_dir: Path, ): compatibility_snapshot_dir = Path( os.environ.get("COMPATIBILITY_SNAPSHOT_DIR", DEFAILT_LOCAL_SNAPSHOT_DIR) @@ -170,6 +175,8 @@ def test_backward_compatibility( config.repo_dir = repo_dir config.pg_version = "14" # Note: `pg_dumpall` (from pg_bin) version is set by DEFAULT_PG_VERSION_DEFAULT and can be overriden by DEFAULT_PG_VERSION env var config.initial_tenant = snapshot_config["default_tenant_id"] + config.neon_binpath = neon_binpath + config.pg_distrib_dir = pg_distrib_dir # Check that we can start the project cli = NeonCli(config) diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index 0048e7b580..fc515e5878 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -1,13 +1,8 @@ import os +from pathlib import Path from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnvBuilder, - PgBin, - PortDistributor, - VanillaPostgres, - pg_distrib_dir, -) +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar, subprocess_capture @@ -16,7 +11,10 @@ num_rows = 1000 # Ensure that regular postgres can start from fullbackup def test_fullbackup( - neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + port_distributor: PortDistributor, + pg_distrib_dir: Path, ): env = neon_env_builder.init_start() @@ -40,7 +38,7 @@ def test_fullbackup( # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. # PgBin sets it automatically, but here we need to pipe psql output to the tar command. 
- psql_env = {"LD_LIBRARY_PATH": os.path.join(str(pg_distrib_dir), "lib")} + psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} # Get and unpack fullbackup from pageserver restored_dir_path = env.repo_dir / "restored_datadir" @@ -49,9 +47,7 @@ def test_fullbackup( cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] result_basepath = pg_bin.run_capture(cmd, env=psql_env) tar_output_file = result_basepath + ".stdout" - subprocess_capture( - str(env.repo_dir), ["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)] - ) + subprocess_capture(env.repo_dir, ["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)]) # HACK # fullbackup returns neon specific pg_control and first WAL segment diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index c888c6f7ee..ced5e18406 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -13,7 +13,6 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, Postgres, - pg_distrib_dir, wait_for_last_record_lsn, wait_for_upload, ) @@ -128,7 +127,7 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu num_rows = 3000 lsn = _generate_data(num_rows, pg) - _import(num_rows, lsn, env, pg_bin, timeline) + _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir) @pytest.mark.timeout(1800) @@ -156,7 +155,7 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB") assert logical_size > 1024**3 # = 1GB - tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline) + tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir) # Check if the backup data contains multiple segment files cnt_seg_files = 0 @@ -191,7 +190,12 @@ def _generate_data(num_rows: int, pg: Postgres) -> Lsn: def _import( - expected_num_rows: int, lsn: Lsn, env: NeonEnv, pg_bin: PgBin, timeline: TimelineId + expected_num_rows: int, + lsn: Lsn, + env: NeonEnv, + pg_bin: PgBin, + timeline: TimelineId, + pg_distrib_dir: Path, ) -> str: """Test importing backup data to the pageserver. @@ -205,7 +209,7 @@ def _import( # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. # PgBin sets it automatically, but here we need to pipe psql output to the tar command. 
- psql_env = {"LD_LIBRARY_PATH": os.path.join(str(pg_distrib_dir), "lib")} + psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} # Get a fullbackup from pageserver query = f"fullbackup { env.initial_tenant} {timeline} {lsn}" diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index f5e02af8dd..ab321eeb02 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -1,5 +1,5 @@ -import pathlib import subprocess +from pathlib import Path from typing import Optional from fixtures.neon_fixtures import ( @@ -7,18 +7,18 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PageserverHttpClient, - neon_binpath, - pg_distrib_dir, ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until # test that we cannot override node id after init -def test_pageserver_init_node_id(neon_simple_env: NeonEnv): +def test_pageserver_init_node_id( + neon_simple_env: NeonEnv, neon_binpath: Path, pg_distrib_dir: Path +): repo_dir = neon_simple_env.repo_dir pageserver_config = repo_dir / "pageserver.toml" - pageserver_bin = pathlib.Path(neon_binpath) / "pageserver" + pageserver_bin = neon_binpath / "pageserver" def run_pageserver(args): return subprocess.run( diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index f23811b671..5eb1ebb3de 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -1,11 +1,10 @@ # # This file runs pg_regress-based tests. # -import os from pathlib import Path import pytest -from fixtures.neon_fixtures import NeonEnv, base_dir, check_restored_datadir_content, pg_distrib_dir +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content # Run the main PostgreSQL regression tests, in src/test/regress. @@ -13,7 +12,14 @@ from fixtures.neon_fixtures import NeonEnv, base_dir, check_restored_datadir_con # This runs for a long time, especially in debug mode, so use a larger-than-default # timeout. @pytest.mark.timeout(1800) -def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): +def test_pg_regress( + neon_simple_env: NeonEnv, + test_output_dir: Path, + pg_bin, + capsys, + base_dir: Path, + pg_distrib_dir: Path, +): env = neon_simple_env env.neon_cli.create_branch("test_pg_regress", "empty") @@ -26,20 +32,20 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, cap (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. 
- build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/regress").format(env.pg_version) - src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/regress").format(env.pg_version) - bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") - schedule = os.path.join(src_path, "parallel_schedule") - pg_regress = os.path.join(build_path, "pg_regress") + build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/regress" + src_path = base_dir / f"vendor/postgres-v{env.pg_version}/src/test/regress" + bindir = pg_distrib_dir / f"v{env.pg_version}/bin" + schedule = src_path / "parallel_schedule" + pg_regress = build_path / "pg_regress" pg_regress_command = [ - pg_regress, + str(pg_regress), '--bindir=""', "--use-existing", - "--bindir={}".format(bindir), - "--dlpath={}".format(build_path), - "--schedule={}".format(schedule), - "--inputdir={}".format(src_path), + f"--bindir={bindir}", + f"--dlpath={build_path}", + f"--schedule={schedule}", + f"--inputdir={src_path}", ] env_vars = { @@ -66,7 +72,14 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, cap # This runs for a long time, especially in debug mode, so use a larger-than-default # timeout. @pytest.mark.timeout(1800) -def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): +def test_isolation( + neon_simple_env: NeonEnv, + test_output_dir: Path, + pg_bin, + capsys, + base_dir: Path, + pg_distrib_dir: Path, +): env = neon_simple_env env.neon_cli.create_branch("test_isolation", "empty") @@ -80,21 +93,19 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, caps (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. - build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/isolation".format(env.pg_version)) - src_path = os.path.join( - base_dir, "vendor/postgres-v{}/src/test/isolation".format(env.pg_version) - ) - bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") - schedule = os.path.join(src_path, "isolation_schedule") - pg_isolation_regress = os.path.join(build_path, "pg_isolation_regress") + build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/isolation" + src_path = base_dir / f"vendor/postgres-v{env.pg_version}/src/test/isolation" + bindir = pg_distrib_dir / f"v{env.pg_version}/bin" + schedule = src_path / "isolation_schedule" + pg_isolation_regress = build_path / "pg_isolation_regress" pg_isolation_regress_command = [ - pg_isolation_regress, + str(pg_isolation_regress), "--use-existing", - "--bindir={}".format(bindir), - "--dlpath={}".format(build_path), - "--inputdir={}".format(src_path), - "--schedule={}".format(schedule), + f"--bindir={bindir}", + f"--dlpath={build_path}", + f"--inputdir={src_path}", + f"--schedule={schedule}", ] env_vars = { @@ -112,7 +123,14 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, caps # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. 
-def test_sql_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): +def test_sql_regress( + neon_simple_env: NeonEnv, + test_output_dir: Path, + pg_bin, + capsys, + base_dir: Path, + pg_distrib_dir: Path, +): env = neon_simple_env env.neon_cli.create_branch("test_sql_regress", "empty") @@ -126,19 +144,19 @@ def test_sql_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, ca # Compute all the file locations that pg_regress will need. # This test runs neon specific tests - build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/regress").format(env.pg_version) - src_path = os.path.join(base_dir, "test_runner/sql_regress") - bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") - schedule = os.path.join(src_path, "parallel_schedule") - pg_regress = os.path.join(build_path, "pg_regress") + build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/regress" + src_path = base_dir / "test_runner/sql_regress" + bindir = pg_distrib_dir / f"v{env.pg_version}/bin" + schedule = src_path / "parallel_schedule" + pg_regress = build_path / "pg_regress" pg_regress_command = [ - pg_regress, + str(pg_regress), "--use-existing", - "--bindir={}".format(bindir), - "--dlpath={}".format(build_path), - "--schedule={}".format(schedule), - "--inputdir={}".format(src_path), + f"--bindir={bindir}", + f"--dlpath={build_path}", + f"--schedule={schedule}", + f"--inputdir={src_path}", ] env_vars = { diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index fa00a4da82..aec45307f7 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -1,7 +1,7 @@ import os -import pathlib import threading from contextlib import closing, contextmanager +from pathlib import Path from typing import Any, Dict, Optional, Tuple import pytest @@ -14,9 +14,6 @@ from fixtures.neon_fixtures import ( PortDistributor, Postgres, assert_no_in_progress_downloads_for_tenant, - base_dir, - neon_binpath, - pg_distrib_dir, wait_for_last_record_lsn, wait_for_upload, ) @@ -30,12 +27,13 @@ def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @contextmanager def new_pageserver_service( - new_pageserver_dir: pathlib.Path, - pageserver_bin: pathlib.Path, - remote_storage_mock_path: pathlib.Path, + new_pageserver_dir: Path, + pageserver_bin: Path, + remote_storage_mock_path: Path, pg_port: int, http_port: int, broker: Optional[Etcd], + pg_distrib_dir: Path, ): """ cannot use NeonPageserver yet because it depends on neon cli @@ -193,10 +191,10 @@ def switch_pg_to_new_pageserver( new_pageserver_port: int, tenant_id: TenantId, timeline_id: TimelineId, -) -> pathlib.Path: +) -> Path: pg.stop() - pg_config_file_path = pathlib.Path(pg.config_file_path()) + pg_config_file_path = Path(pg.config_file_path()) pg_config_file_path.open("a").write( f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_port}'" ) @@ -219,7 +217,7 @@ def switch_pg_to_new_pageserver( return timeline_to_detach_local_path -def post_migration_check(pg: Postgres, sum_before_migration: int, old_local_path: pathlib.Path): +def post_migration_check(pg: Postgres, sum_before_migration: int, old_local_path: Path): with pg_cur(pg) as cur: # check that data is still there cur.execute("SELECT sum(key) FROM t") @@ -251,7 +249,9 @@ def post_migration_check(pg: Postgres, sum_before_migration: int, old_local_path def test_tenant_relocation( neon_env_builder: NeonEnvBuilder, port_distributor: 
PortDistributor, - test_output_dir, + test_output_dir: Path, + neon_binpath: Path, + base_dir: Path, method: str, with_load: str, ): @@ -350,7 +350,7 @@ def test_tenant_relocation( new_pageserver_pg_port = port_distributor.get_port() new_pageserver_http_port = port_distributor.get_port() log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port) - pageserver_bin = pathlib.Path(neon_binpath) / "pageserver" + pageserver_bin = neon_binpath / "pageserver" new_pageserver_http = PageserverHttpClient( port=new_pageserver_http_port, @@ -365,6 +365,7 @@ def test_tenant_relocation( new_pageserver_pg_port, new_pageserver_http_port, neon_env_builder.broker, + neon_env_builder.pg_distrib_dir, ): # Migrate either by attaching from s3 or import/export basebackup @@ -373,7 +374,7 @@ def test_tenant_relocation( "poetry", "run", "python", - os.path.join(base_dir, "scripts/export_import_between_pageservers.py"), + str(base_dir / "scripts/export_import_between_pageservers.py"), "--tenant-id", str(tenant_id), "--from-host", @@ -389,9 +390,9 @@ def test_tenant_relocation( "--to-pg-port", str(new_pageserver_pg_port), "--pg-distrib-dir", - pg_distrib_dir, + str(neon_env_builder.pg_distrib_dir), "--work-dir", - os.path.join(test_output_dir), + str(test_output_dir), "--tmp-pg-port", str(port_distributor.get_port()), ] diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index c87e9a6720..ec2bed7fee 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -338,6 +338,7 @@ def test_timeline_size_metrics( neon_simple_env: NeonEnv, test_output_dir: Path, port_distributor: PortDistributor, + pg_distrib_dir: Path, pg_version: str, ): env = neon_simple_env @@ -382,7 +383,7 @@ def test_timeline_size_metrics( tl_logical_size_metric = int(matches.group(1)) pgdatadir = test_output_dir / "pgdata-vanilla" - pg_bin = PgBin(test_output_dir, pg_version) + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: vanilla_pg.configure([f"port={port}"]) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 09f6f4b9f9..8ef7f27752 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -30,7 +30,6 @@ from fixtures.neon_fixtures import ( SafekeeperHttpClient, SafekeeperPort, available_remote_storages, - neon_binpath, wait_for_last_record_lsn, wait_for_upload, ) @@ -797,6 +796,7 @@ class SafekeeperEnv: repo_dir: Path, port_distributor: PortDistributor, pg_bin: PgBin, + neon_binpath: Path, num_safekeepers: int = 1, ): self.repo_dir = repo_dir @@ -808,7 +808,7 @@ class SafekeeperEnv: ) self.pg_bin = pg_bin self.num_safekeepers = num_safekeepers - self.bin_safekeeper = os.path.join(str(neon_binpath), "safekeeper") + self.bin_safekeeper = str(neon_binpath / "safekeeper") self.safekeepers: Optional[List[subprocess.CompletedProcess[Any]]] = None self.postgres: Optional[ProposerPostgres] = None self.tenant_id: Optional[TenantId] = None @@ -911,7 +911,10 @@ class SafekeeperEnv: def test_safekeeper_without_pageserver( - test_output_dir: str, port_distributor: PortDistributor, pg_bin: PgBin + test_output_dir: str, + port_distributor: PortDistributor, + pg_bin: PgBin, + neon_binpath: Path, ): # Create the environment in the test-specific output dir repo_dir = Path(os.path.join(test_output_dir, "repo")) @@ -920,6 +923,7 @@ def 
test_safekeeper_without_pageserver( repo_dir, port_distributor, pg_bin, + neon_binpath, ) with env: diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index db6f1e5137..e1b1e03515 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -1,14 +1,6 @@ -import os from pathlib import Path -from fixtures.neon_fixtures import ( - NeonEnvBuilder, - PgBin, - PortDistributor, - VanillaPostgres, - base_dir, - pg_distrib_dir, -) +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres from fixtures.types import TenantId @@ -17,6 +9,8 @@ def test_wal_restore( pg_bin: PgBin, test_output_dir: Path, port_distributor: PortDistributor, + base_dir: Path, + pg_distrib_dir: Path, ): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_wal_restore") @@ -26,11 +20,13 @@ def test_wal_restore( env.neon_cli.pageserver_stop() port = port_distributor.get_port() data_dir = test_output_dir / "pgsql.restored" - with VanillaPostgres(data_dir, PgBin(test_output_dir, env.pg_version), port) as restored: + with VanillaPostgres( + data_dir, PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version), port + ) as restored: pg_bin.run_capture( [ - os.path.join(base_dir, "libs/utils/scripts/restore_from_wal.sh"), - os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin"), + str(base_dir / "libs/utils/scripts/restore_from_wal.sh"), + str(pg_distrib_dir / f"v{env.pg_version}/bin"), str(test_output_dir / "repo" / "safekeepers" / "sk1" / str(tenant_id) / "*"), str(data_dir), str(port), From c3a470a29b14e5d71476f683da48506b3bf198b2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 8 Nov 2022 14:03:13 +0100 Subject: [PATCH 1000/1022] walredo process management: handle every error on the kill() and drop path If we're not calling kill() before dropping the PostgresRedoProcess, we currently leak it. That's most likely the root cause for #2761. This patch 1. adds an error log message for that case and 2. adds error handling for all errors on the kill() path. If we're a `testing` build, we panic. Otherwise, we log an error and leak the process. The error handling changes (2) are necessary to conclusively state that the root cause for #2761 is indeed (1). If we didn't have them, the root cause could be missing error handling instead. To make the log messages useful, I've added tracing::instrument attributes that log the tenant_id and PID. That helps mapping back the PID of `defunct` processes to pageserver log messages. Note that a defunct process's `/proc/$PID/` directory isn't very useful. We have left little more than its PID. Once we have validated the root cause, we'll find a fix, but that's still an ongoing discussion. 
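In outline, the kill/drop contract this patch introduces looks like the sketch below (a minimal illustrative sketch, not the actual pageserver code; the struct is trimmed down to the two fields that matter here):

    // Sketch only: the real type is PostgresRedoProcess in pageserver/src/walredo.rs
    // and also owns the child's stdin/stdout/stderr handles and the tenant id.
    use std::process::Child;
    use tracing::error;

    struct RedoProcess {
        child: Child,
        called_kill: bool,
    }

    impl RedoProcess {
        // Consumes self, so kill() can only ever run once, and every failure
        // on this path is logged instead of being silently discarded.
        fn kill(mut self) {
            self.called_kill = true;
            if let Err(e) = self.child.kill() {
                error!(error = %e, "failed to SIGKILL wal-redo postgres");
            }
            match self.child.wait() {
                Ok(status) => error!(exit_status = %status, "wal-redo postgres exited"),
                Err(e) => error!(error = %e, "wait() failed; the child may be leaked as a zombie"),
            }
        }
    }

    impl Drop for RedoProcess {
        fn drop(&mut self) {
            // Dropping without a prior kill() is exactly the leak suspected in #2761.
            if !self.called_kill {
                error!(pid = %self.child.id(), "dropped without kill(); child will become defunct");
            }
        }
    }
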
refs https://github.com/neondatabase/neon/issues/2761 closes https://github.com/neondatabase/neon/pull/2769 --- pageserver/src/walredo.rs | 68 +++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index e21ec4d742..a787da7069 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -229,7 +229,7 @@ impl PostgresRedoManager { // launch the WAL redo process on first use if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?; + let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?; *process_guard = Some(p); } let process = process_guard.as_mut().unwrap(); @@ -579,26 +579,29 @@ impl CloseFileDescriptors for C { /// Handle to the Postgres WAL redo process /// struct PostgresRedoProcess { + tenant_id: TenantId, child: Child, stdin: ChildStdin, stdout: ChildStdout, stderr: ChildStderr, + called_kill: bool, } impl PostgresRedoProcess { // // Start postgres binary in special WAL redo mode. // + #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))] fn launch( conf: &PageServerConf, - tenant_id: &TenantId, + tenant_id: TenantId, pg_version: u32, ) -> Result { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. let datadir = path_with_suffix_extension( - conf.tenant_path(tenant_id).join("wal-redo-datadir"), + conf.tenant_path(&tenant_id).join("wal-redo-datadir"), TEMP_FILE_SUFFIX, ); @@ -681,6 +684,7 @@ impl PostgresRedoProcess { })?; info!( + pid = child.id(), "launched WAL redo postgres process on {}", datadir.display() ); @@ -689,22 +693,59 @@ impl PostgresRedoProcess { let stdout = child.stdout.take().unwrap(); let stderr = child.stderr.take().unwrap(); - set_nonblock(stdin.as_raw_fd())?; - set_nonblock(stdout.as_raw_fd())?; - set_nonblock(stderr.as_raw_fd())?; + macro_rules! set_nonblock_or_log_err { + ($file:ident) => {{ + let res = set_nonblock($file.as_raw_fd()); + if let Err(e) = &res { + error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); + } + res + }}; + } + set_nonblock_or_log_err!(stdin)?; + set_nonblock_or_log_err!(stdout)?; + set_nonblock_or_log_err!(stderr)?; Ok(PostgresRedoProcess { + tenant_id, child, stdin, stdout, stderr, + called_kill: false, }) } + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] fn kill(mut self) { - let _ = self.child.kill(); - if let Ok(exit_status) = self.child.wait() { - error!("wal-redo-postgres exited with code {}", exit_status); + info!("killing wal-redo-postgres process"); + self.called_kill = true; + + let res = self.child.kill(); + if let Err(e) = res { + // This branch is very unlikely because: + // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it. + // - This is the only place that calls .kill() + // - We consume `self`, so, .kill() can't be called twice. + // - If the process exited by itself or was killed by someone else, + // .kill() will still succeed because we haven't wait()'ed yet. + // + // So, if we arrive here, we have really no idea what happened, + // whether the PID stored in self.child is still valid, etc. + // If this function were fallible, we'd return an error, but + // since it isn't, all we can do is log an error and proceed + // with the wait(). 
+ error!(error = %e, "failed to SIGKILL wal-redo-postgres; subsequent wait() might fail or wait for wrong process"); + } + + match self.child.wait() { + Ok(exit_status) => { + // log at error level since .kill() is something we only do on errors ATM + error!(exit_status = %exit_status, "wal-redo-postgres wait successful"); + } + Err(e) => { + error!(error = %e, "wal-redo-postgres wait error; might leak the child process; it will show as zombie (defunct)"); + } } drop(self); } @@ -713,6 +754,7 @@ impl PostgresRedoProcess { // Apply given WAL records ('records') over an old page image. Returns // new page image. // + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] fn apply_wal_records( &mut self, tag: BufferTag, @@ -838,6 +880,14 @@ impl PostgresRedoProcess { } } +impl Drop for PostgresRedoProcess { + fn drop(&mut self) { + if !self.called_kill { + error!(tenant_id=%self.tenant_id, pid = %self.child.id(), "dropping PostgresRedoProcess that wasn't killed, likely causing defunct postgres process"); + } + } +} + // Functions for constructing messages to send to the postgres WAL redo // process. See pgxn/neon_walredo/walredoproc.c for // explanation of the protocol. From 40164bd5899878878be701798f4415dc2c968e67 Mon Sep 17 00:00:00 2001 From: andres Date: Tue, 25 Oct 2022 18:05:26 +0200 Subject: [PATCH 1001/1022] Use latestMsgReceivedAt in walproposer --- pgxn/neon/walproposer.c | 10 ++++------ pgxn/neon/walproposer.h | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index c78c79a9bb..c5f283aa22 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -438,9 +438,7 @@ WalProposerPoll(void) { Safekeeper *sk = &safekeeper[i]; - if ((sk->state == SS_CONNECTING_WRITE || - sk->state == SS_CONNECTING_READ) && - TimestampDifferenceExceeds(sk->startedConnAt, now, + if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now, wal_acceptor_connect_timeout)) { elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms", @@ -760,7 +758,7 @@ ResetConnection(Safekeeper *sk) elog(LOG, "connecting with node %s:%s", sk->host, sk->port); sk->state = SS_CONNECTING_WRITE; - sk->startedConnAt = GetCurrentTimestamp(); + sk->latestMsgReceivedAt = GetCurrentTimestamp(); sock = walprop_socket(sk->conn); sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk); @@ -918,7 +916,7 @@ HandleConnectionEvent(Safekeeper *sk) case WP_CONN_POLLING_OK: elog(LOG, "connected with node %s:%s", sk->host, sk->port); - + sk->latestMsgReceivedAt = GetCurrentTimestamp(); /* * We have to pick some event to update event set. We'll * eventually need the socket to be readable, so we go with that. @@ -2304,7 +2302,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg) ResetConnection(sk); return false; } - + sk->latestMsgReceivedAt = GetCurrentTimestamp(); switch (tag) { case 'g': diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index e237947441..0d3af54a68 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -371,7 +371,7 @@ typedef struct Safekeeper int eventPos; /* position in wait event set. 
Equal to -1 if* * no event */ SafekeeperState state; /* safekeeper state machine state */ - TimestampTz startedConnAt; /* when connection attempt started */ + TimestampTz latestMsgReceivedAt; /* when latest msg is received */ AcceptorGreeting greetResponse; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ AppendResponse appendResponse; /* feedback for master */ From 1cf257bc4ae224dbd5b411c3ea8675bb1a188ba4 Mon Sep 17 00:00:00 2001 From: andres Date: Fri, 4 Nov 2022 09:58:35 +0100 Subject: [PATCH 1002/1022] feedback --- compute_tools/src/pg_helpers.rs | 2 +- pgxn/neon/walproposer.c | 16 +++++++++------- pgxn/neon/walproposer.h | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 42aa00af01..289f223bda 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -65,7 +65,7 @@ impl GenericOption { let name = match self.name.as_str() { "safekeepers" => "neon.safekeepers", "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout", - "wal_acceptor_connect_timeout" => "neon.safekeeper_connect_timeout", + "wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout", it => it, }; diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index c5f283aa22..c24142dca1 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -75,7 +75,7 @@ static bool syncSafekeepers = false; char *wal_acceptors_list; int wal_acceptor_reconnect_timeout; -int wal_acceptor_connect_timeout; +int wal_acceptor_connection_timeout; bool am_wal_proposer; char *neon_timeline_walproposer = NULL; @@ -266,9 +266,9 @@ nwp_register_gucs(void) DefineCustomIntVariable( "neon.safekeeper_connect_timeout", - "Timeout after which give up connection attempt to safekeeper.", + "Timeout for connection establishement and it's maintenance against safekeeper", NULL, - &wal_acceptor_connect_timeout, + &wal_acceptor_connection_timeout, 5000, 0, INT_MAX, PGC_SIGHUP, GUC_UNIT_MS, @@ -417,7 +417,9 @@ WalProposerPoll(void) ResetLatch(MyLatch); break; } - if (rc == 0) /* timeout expired: poll state */ + + now = GetCurrentTimestamp(); + if (rc == 0 || TimeToReconnect(now) <= 0) /* timeout expired: poll state */ { TimestampTz now; @@ -439,10 +441,10 @@ WalProposerPoll(void) Safekeeper *sk = &safekeeper[i]; if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now, - wal_acceptor_connect_timeout)) + wal_acceptor_connection_timeout)) { - elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms", - sk->host, sk->port, wal_acceptor_connect_timeout); + elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms", + sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout); ShutdownConnection(sk); } } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 0d3af54a68..3c4f080353 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -30,7 +30,7 @@ extern char *wal_acceptors_list; extern int wal_acceptor_reconnect_timeout; -extern int wal_acceptor_connect_timeout; +extern int wal_acceptor_connection_timeout; extern bool am_wal_proposer; struct WalProposerConn; /* Defined in libpqwalproposer */ From e999f66b01acfaa32869152aa87ecc7a0ddb2443 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 8 Nov 2022 15:35:13 +0200 Subject: [PATCH 1003/1022] Use a cached WaitEventSet instead of WaitLatchOrSocket. 
When we repeatedly wait for the same events, it's faster to create the event set once and reuse it. While testing with a sequential scan test case, I saw WaitLatchOrSocket consuming a lot of CPU: > - 40.52% 0.14% postgres postgres [.] WaitLatchOrSocket > - 40.38% WaitLatchOrSocket > + 17.83% AddWaitEventToSet > + 9.47% close@plt > + 8.29% CreateWaitEventSet > + 4.57% WaitEventSetWait This eliminates most of that overhead. --- pgxn/neon/libpagestore.c | 47 +++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index c37adc6cb7..1e4e18e7d1 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -40,6 +40,14 @@ bool connected = false; PGconn *pageserver_conn = NULL; +/* + * WaitEventSet containing: + * - WL_SOCKET_READABLE on pageserver_conn, + * - WL_LATCH_SET on MyLatch, and + * - WL_EXIT_ON_PM_DEATH. + */ +WaitEventSet *pageserver_conn_wes = NULL; + char *page_server_connstring_raw; int n_unflushed_requests = 0; @@ -63,6 +71,7 @@ pageserver_connect() PQfinish(pageserver_conn); pageserver_conn = NULL; + ereport(ERROR, (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), errmsg(NEON_TAG "could not establish connection to pageserver"), @@ -78,22 +87,26 @@ pageserver_connect() neon_log(ERROR, "could not send pagestream command to pageserver"); } + pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3); + AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL); + while (PQisBusy(pageserver_conn)) { int wc; + WaitEvent event; /* Sleep until there's something to do */ - wc = WaitLatchOrSocket(MyLatch, - WL_LATCH_SET | WL_SOCKET_READABLE | - WL_EXIT_ON_PM_DEATH, - PQsocket(pageserver_conn), - -1L, PG_WAIT_EXTENSION); + wc = WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); /* Data available in socket? */ - if (wc & WL_SOCKET_READABLE) + if (event.events & WL_SOCKET_READABLE) { if (!PQconsumeInput(pageserver_conn)) { @@ -101,6 +114,7 @@ pageserver_connect() PQfinish(pageserver_conn); pageserver_conn = NULL; + FreeWaitEventSet(pageserver_conn_wes); neon_log(ERROR, "could not complete handshake with pageserver: %s", msg); @@ -117,33 +131,30 @@ pageserver_connect() * A wrapper around PQgetCopyData that checks for interrupts while sleeping. */ static int -call_PQgetCopyData(PGconn *conn, char **buffer) +call_PQgetCopyData(char **buffer) { int ret; retry: - ret = PQgetCopyData(conn, buffer, 1 /* async */ ); + ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ ); if (ret == 0) { int wc; + WaitEvent event; /* Sleep until there's something to do */ - wc = WaitLatchOrSocket(MyLatch, - WL_LATCH_SET | WL_SOCKET_READABLE | - WL_EXIT_ON_PM_DEATH, - PQsocket(conn), - -1L, PG_WAIT_EXTENSION); + wc = WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); /* Data available in socket? 
*/ - if (wc & WL_SOCKET_READABLE) + if (event.events & WL_SOCKET_READABLE) { - if (!PQconsumeInput(conn)) + if (!PQconsumeInput(pageserver_conn)) neon_log(ERROR, "could not get response from pageserver: %s", - PQerrorMessage(conn)); + PQerrorMessage(pageserver_conn)); } goto retry; @@ -172,6 +183,8 @@ pageserver_disconnect(void) prefetch_on_ps_disconnect(); } + if (pageserver_conn_wes != NULL) + FreeWaitEventSet(pageserver_conn_wes); } static void @@ -228,7 +241,7 @@ pageserver_receive(void) PG_TRY(); { /* read response */ - resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data); + resp_buff.len = call_PQgetCopyData(&resp_buff.data); resp_buff.cursor = 0; if (resp_buff.len < 0) From b55466045e33a5031e02c4d8d2f5263252b6f83e Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 8 Nov 2022 12:36:22 +0100 Subject: [PATCH 1004/1022] Introduce codeowners --- CODEOWNERS | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 CODEOWNERS diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000000..4c8c8924d6 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,10 @@ +/compute_tools/ @neondatabase/control-plane +/control_plane/ @neondatabase/compute @neondatabase/storage +/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage +/libs/postgres_ffi/ @neondatabase/compute +/libs/remote_storage/ @neondatabase/storage +/libs/safekeeper_api/ @neondatabase/safekeepers +/pageserver/ @neondatabase/compute @neondatabase/storage +/pgxn/ @neondatabase/compute +/proxy/ @neondatabase/control-plane +/safekeeper/ @neondatabase/safekeepers From 4a10e1b066c8d153fb215c3395a3086881b40915 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 3 Nov 2022 15:21:28 +0200 Subject: [PATCH 1005/1022] Pass pushed storage Docker tag to e2e jobs --- .github/workflows/build_and_test.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 435265270a..707b20bcb8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -445,11 +445,15 @@ jobs: container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init - needs: [ build-neon ] + needs: [ push-docker-hub, tag ] steps: - name: Set PR's status to pending and request a remote CI test run: | + # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit + # but we need to use a real sha of a latest commit in the PR's branch for the e2e job, + # to place a job run status update later. COMMIT_SHA=${{ github.event.pull_request.head.sha }} + # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} REMOTE_REPO="${{ github.repository_owner }}/cloud" @@ -475,7 +479,9 @@ jobs: \"inputs\": { \"ci_job_name\": \"neon-cloud-e2e\", \"commit_hash\": \"$COMMIT_SHA\", - \"remote_repo\": \"${{ github.repository }}\" + \"remote_repo\": \"${{ github.repository }}\", + \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", + \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\" } }" From c4f9f1dc6d7c9ac40c98e00240ff07c7e8ed1bb8 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 10 Nov 2022 09:06:34 +0000 Subject: [PATCH 1006/1022] Add data format forward compatibility tests (#2766) Add `test_forward_compatibility`, which checks if it's going to be possible to roll back a release to the previous version. 
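If the breakage is intentional, a test can be told to expect it; stripped of the fixtures, that escape hatch boils down to roughly the following (a simplified sketch; the env var name is the real one, while `run_check` is a hypothetical stand-in for the body of the test):

    # Sketch of the "allowed breakage" pattern shared by both compatibility tests.
    import os

    import pytest

    def run_compatibility_check(run_check):
        """Run the check; xfail instead of failing when breakage is explicitly
        allowed, and fail if breakage was allowed but nothing actually broke."""
        allowed = os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
        try:
            run_check()
        except Exception:
            if allowed:
                pytest.xfail("breakage allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE")
            raise
        assert not allowed, "breakage was allowed, but the check passed without any breakage"
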
The test uses artifacts (Neon & Postgres binaries) from the previous release to start Neon on the repo created by the current version. It performs exactly the same checks as `test_backward_compatibility` does. Single `ALLOW_BREAKING_CHANGES` env var got replaced by `ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE` & `ALLOW_FORWARD_COMPATIBILITY_BREAKAGE` and can be set by `backward compatibility breakage` and `forward compatibility breakage` labels respectively. --- .../actions/run-python-test-set/action.yml | 37 +- .github/workflows/build_and_test.yml | 43 +- poetry.lock | 25 +- pyproject.toml | 2 +- test_runner/fixtures/neon_fixtures.py | 33 ++ test_runner/fixtures/utils.py | 1 - test_runner/regress/test_compatibility.py | 367 +++++++++++------- 7 files changed, 302 insertions(+), 206 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 3459449e15..97783df444 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -55,6 +55,22 @@ runs: name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact path: /tmp/neon + - name: Download Neon binaries for the previous release + if: inputs.build_type != 'remote' + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact + path: /tmp/neon-previous + prefix: latest + + - name: Download compatibility snapshot for Postgres 14 + if: inputs.build_type != 'remote' + uses: ./.github/actions/download + with: + name: compatibility-snapshot-${{ inputs.build_type }}-pg14 + path: /tmp/compatibility_snapshot_pg14 + prefix: latest + - name: Checkout if: inputs.needs_postgres_source == 'true' uses: actions/checkout@v3 @@ -73,23 +89,18 @@ runs: shell: bash -euxo pipefail {0} run: ./scripts/pysync - - name: Download compatibility snapshot for Postgres 14 - if: inputs.build_type != 'remote' - uses: ./.github/actions/download - with: - name: compatibility-snapshot-${{ inputs.build_type }}-pg14 - path: /tmp/compatibility_snapshot_pg14 - prefix: latest - - name: Run pytest env: NEON_BIN: /tmp/neon/bin + COMPATIBILITY_NEON_BIN: /tmp/neon-previous/bin + COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install TEST_OUTPUT: /tmp/test_output BUILD_TYPE: ${{ inputs.build_type }} AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14 - ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes') + ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') + ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report @@ -150,12 +161,16 @@ runs: # -n4 uses four processes to run tests via pytest-xdist # -s is not used to prevent pytest from capturing output, because tests are running # in parallel and logs are mixed between different tests + # --dist=loadgroup points tests marked with @pytest.mark.xdist_group to the same worker, + # to make @pytest.mark.order work with xdist + # mkdir -p $TEST_OUTPUT/allure/results "${cov_prefix[@]}" ./scripts/pytest \ --junitxml=$TEST_OUTPUT/junit.xml \ --alluredir=$TEST_OUTPUT/allure/results \ --tb=short \ --verbose \ + --dist=loadgroup \ -rA $TEST_SELECTION $EXTRA_PARAMS if [[ "${{ 
inputs.save_perf_report }}" == "true" ]]; then @@ -169,8 +184,8 @@ runs: uses: ./.github/actions/upload with: name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }} - # The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test - path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/ + # The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test + path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/ prefix: latest - name: Create Allure report diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 707b20bcb8..b598949f2b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -268,32 +268,6 @@ jobs: if: matrix.build_type == 'debug' uses: ./.github/actions/save-coverage-data - upload-latest-artifacts: - runs-on: dev - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned - options: --init - needs: [ regress-tests ] - if: github.ref_name == 'main' - steps: - - name: Copy Neon artifact to the latest directory - shell: bash -euxo pipefail {0} - env: - BUCKET: neon-github-public-dev - PREFIX: artifacts/${{ github.run_id }} - run: | - for build_type in debug release; do - FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst - - S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) - if [ -z "${S3_KEY}" ]; then - echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist" - exit 1 - fi - - time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME} - done - benchmarks: runs-on: dev container: @@ -942,7 +916,7 @@ jobs: DOCKER_TAG=${{needs.tag.outputs.build-tag}} helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s - promote-compatibility-test-snapshot: + promote-compatibility-data: runs-on: dev container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned @@ -956,9 +930,24 @@ jobs: BUCKET: neon-github-public-dev PREFIX: artifacts/latest run: | + # Update compatibility snapshot for the release for build_type in debug release; do OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME} done + + # Update Neon artifact for the release (reuse already uploaded artifact) + for build_type in debug release; do + OLD_PREFIX=artifacts/${GITHUB_RUN_ID} + FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst + + S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) + if [ -z "${S3_KEY}" ]; then + echo 2>&1 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist" + exit 1 + fi + + time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME} + done diff --git a/poetry.lock 
b/poetry.lock index 01265aaea1..551b267a87 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1207,18 +1207,6 @@ pytest = ">=6.1.0" [package.extras] testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"] -[[package]] -name = "pytest-forked" -version = "1.4.0" -description = "run tests in isolated forked subprocesses" -category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -py = "*" -pytest = ">=3.10" - [[package]] name = "pytest-lazy-fixture" version = "0.6.3" @@ -1257,7 +1245,7 @@ pytest = ">=5.0.0" [[package]] name = "pytest-xdist" -version = "2.5.0" +version = "3.0.2" description = "pytest xdist plugin for distributed testing and loop-on-failing modes" category = "main" optional = false @@ -1266,7 +1254,6 @@ python-versions = ">=3.6" [package.dependencies] execnet = ">=1.1" pytest = ">=6.2.0" -pytest-forked = "*" [package.extras] psutil = ["psutil (>=3.0)"] @@ -1568,7 +1555,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "9352a89d49d34807f6a58f6c3f898acbd8cf3570e0f45ede973673644bde4d0e" +content-hash = "ebe16714bd4db1f34f005c9b72392f165618b020a2d0948cae20e0e8894c5517" [metadata.files] aiopg = [ @@ -2111,10 +2098,6 @@ pytest-asyncio = [ {file = "pytest-asyncio-0.19.0.tar.gz", hash = "sha256:ac4ebf3b6207259750bc32f4c1d8fcd7e79739edbc67ad0c58dd150b1d072fed"}, {file = "pytest_asyncio-0.19.0-py3-none-any.whl", hash = "sha256:7a97e37cfe1ed296e2e84941384bdd37c376453912d397ed39293e0916f521fa"}, ] -pytest-forked = [ - {file = "pytest-forked-1.4.0.tar.gz", hash = "sha256:8b67587c8f98cbbadfdd804539ed5455b6ed03802203485dd2f53c1422d7440e"}, - {file = "pytest_forked-1.4.0-py3-none-any.whl", hash = "sha256:bbbb6717efc886b9d64537b41fb1497cfaf3c9601276be8da2cccfea5a3c8ad8"}, -] pytest-lazy-fixture = [ {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, @@ -2128,8 +2111,8 @@ pytest-timeout = [ {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, ] pytest-xdist = [ - {file = "pytest-xdist-2.5.0.tar.gz", hash = "sha256:4580deca3ff04ddb2ac53eba39d76cb5dd5edeac050cb6fbc768b0dd712b4edf"}, - {file = "pytest_xdist-2.5.0-py3-none-any.whl", hash = "sha256:6fe5c74fec98906deb8f2d2b616b5c782022744978e7bd4695d39c8f42d0ce65"}, + {file = "pytest-xdist-3.0.2.tar.gz", hash = "sha256:688da9b814370e891ba5de650c9327d1a9d861721a524eb917e620eec3e90291"}, + {file = "pytest_xdist-3.0.2-py3-none-any.whl", hash = "sha256:9feb9a18e1790696ea23e1434fa73b325ed4998b0e9fcb221f16fd1945e6df1b"}, ] python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, diff --git a/pyproject.toml b/pyproject.toml index 765e0b97eb..c2e2e2393b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ psycopg2-binary = "^2.9.1" typing-extensions = "^4.1.0" PyJWT = {version = "^2.1.0", extras = ["crypto"]} requests = "^2.26.0" -pytest-xdist = "^2.3.0" +pytest-xdist = "^3.0.2" asyncpg = "^0.24.0" aiopg = "^1.3.1" Jinja2 = "^3.0.2" diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 715c0753af..7a46a08f08 100644 --- 
a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -228,6 +228,7 @@ def can_bind(host: str, port: int) -> bool: class PortDistributor: def __init__(self, base_port: int, port_number: int): self.iterator = iter(range(base_port, base_port + port_number)) + self.port_map: Dict[int, int] = {} def get_port(self) -> int: for port in self.iterator: @@ -238,6 +239,38 @@ class PortDistributor: "port range configured for test is exhausted, consider enlarging the range" ) + def replace_with_new_port(self, value: Union[int, str]) -> Union[int, str]: + """ + Returns a new port for a port number in a string (like "localhost:1234") or int. + Replacements are memorised, so a substitution for the same port is always the same. + """ + + # TODO: replace with structural pattern matching for Python >= 3.10 + if isinstance(value, int): + return self._replace_port_int(value) + + if isinstance(value, str): + return self._replace_port_str(value) + + raise TypeError(f"unsupported type {type(value)} of {value=}") + + def _replace_port_int(self, value: int) -> int: + known_port = self.port_map.get(value) + if known_port is None: + known_port = self.port_map[value] = self.get_port() + + return known_port + + def _replace_port_str(self, value: str) -> str: + # Use regex to find port in a string + # urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432" + # See https://bugs.python.org/issue27657 + ports = re.findall(r":(\d+)(?:/|$)", value) + assert len(ports) == 1, f"can't find port in {value}" + port_int = int(ports[0]) + + return value.replace(f":{port_int}", f":{self._replace_port_int(port_int)}") + @pytest.fixture(scope="session") def port_distributor(worker_base_port): diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index e73453f2c4..b04e02d3b8 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -17,7 +17,6 @@ Fn = TypeVar("Fn", bound=Callable[..., Any]) def get_self_dir() -> Path: """Get the path to the directory where this script lives.""" - # return os.path.dirname(os.path.abspath(__file__)) return Path(__file__).resolve().parent diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index b0643ec05e..306aa84040 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -1,12 +1,12 @@ import os -import re import shutil import subprocess from pathlib import Path -from typing import Any, Dict, Union +from typing import Any import pytest -import toml +import toml # TODO: replace with tomllib for Python >= 3.11 +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonCli, NeonEnvBuilder, @@ -19,94 +19,185 @@ from fixtures.neon_fixtures import ( from fixtures.types import Lsn from pytest import FixtureRequest -DEFAILT_LOCAL_SNAPSHOT_DIR = "test_output/test_prepare_snapshot/compatibility_snapshot_pg14" +# +# A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. +# - `test_create_snapshot` a script wrapped in a test that creates a data snapshot. +# - `test_backward_compatibility` checks that the current version of Neon can start/read/interract with a data snapshot created by the previous version. +# The path to the snapshot is configured by COMPATIBILITY_SNAPSHOT_DIR environment variable. +# If the breakage is intentional, the test can be xfaild with setting ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE=true. 
+# - `test_forward_compatibility` checks that a snapshot created by the current version can be started/read/interracted by the previous version of Neon. +# Paths to Neon and Postgres are configured by COMPATIBILITY_NEON_BIN and COMPATIBILITY_POSTGRES_DISTRIB_DIR environment variables. +# If the breakage is intentional, the test can be xfaild with setting ALLOW_FORWARD_COMPATIBILITY_BREAKAGE=true. +# +# The file contains a couple of helper functions: +# - prepare_snapshot copies the snapshot, cleans it up and makes it ready for the current version of Neon (replaces paths and ports in config files). +# - check_neon_works performs the test itself, feel free to add more checks there. +# -def dump_differs(first: Path, second: Path, output: Path) -> bool: - """ - Runs diff(1) command on two SQL dumps and write the output to the given output file. - Returns True if the dumps differ, False otherwise. - """ +# Note: if renaming this test, don't forget to update a reference to it in a workflow file: +# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml +@pytest.mark.xdist_group("compatibility") +@pytest.mark.order(before="test_forward_compatibility") +def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path): + # The test doesn't really test anything + # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`. + # + # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it. + neon_env_builder.pg_version = "14" + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_local_fs_remote_storage() - with output.open("w") as stdout: - rv = subprocess.run( - [ - "diff", - "--unified", # Make diff output more readable - "--ignore-matching-lines=^--", # Ignore changes in comments - "--ignore-blank-lines", - str(first), - str(second), - ], - stdout=stdout, - ) + env = neon_env_builder.init_start() + pg = env.postgres.create_start("main") + pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()]) + pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()]) + pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]) - return rv.returncode != 0 + snapshot_config = toml.load(test_output_dir / "repo" / "config") + tenant_id = snapshot_config["default_tenant_id"] + timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] + + pageserver_http = env.pageserver.http_client() + lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) + + env.postgres.stop_all() + for sk in env.safekeepers: + sk.stop() + env.pageserver.stop() + + shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14") + # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it -class PortReplacer(object): - """ - Class-helper for replacing ports in config files. 
- """ - - def __init__(self, port_distributor: PortDistributor): - self.port_distributor = port_distributor - self.port_map: Dict[int, int] = {} - - def replace_port(self, value: Union[int, str]) -> Union[int, str]: - if isinstance(value, int): - if (known_port := self.port_map.get(value)) is not None: - return known_port - - self.port_map[value] = self.port_distributor.get_port() - return self.port_map[value] - - if isinstance(value, str): - # Use regex to find port in a string - # urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432" - # See https://bugs.python.org/issue27657 - ports = re.findall(r":(\d+)(?:/|$)", value) - assert len(ports) == 1, f"can't find port in {value}" - port_int = int(ports[0]) - - if (known_port := self.port_map.get(port_int)) is not None: - return value.replace(f":{port_int}", f":{known_port}") - - self.port_map[port_int] = self.port_distributor.get_port() - return value.replace(f":{port_int}", f":{self.port_map[port_int]}") - - raise TypeError(f"unsupported type {type(value)} of {value=}") - - -@pytest.mark.order(after="test_prepare_snapshot") +@pytest.mark.xdist_group("compatibility") +@pytest.mark.order(after="test_create_snapshot") def test_backward_compatibility( pg_bin: PgBin, port_distributor: PortDistributor, test_output_dir: Path, - request: FixtureRequest, neon_binpath: Path, pg_distrib_dir: Path, + pg_version: str, + request: FixtureRequest, ): - compatibility_snapshot_dir = Path( - os.environ.get("COMPATIBILITY_SNAPSHOT_DIR", DEFAILT_LOCAL_SNAPSHOT_DIR) - ) - assert compatibility_snapshot_dir.exists(), ( - f"{compatibility_snapshot_dir} doesn't exist. Please run `test_prepare_snapshot` test first " - "to create the snapshot or set COMPATIBILITY_SNAPSHOT_DIR env variable to the existing snapshot" - ) - compatibility_snapshot_dir = compatibility_snapshot_dir.resolve() + compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR") + assert ( + compatibility_snapshot_dir_env is not None + ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)" + compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() - # Make compatibility snapshot artifacts pickupable by Allure - # by copying the snapshot directory to the curent test output directory. 
- repo_dir = test_output_dir / "compatibility_snapshot" / "repo" + # Copy the snapshot to current directory, and prepare for the test + prepare_snapshot( + from_dir=compatibility_snapshot_dir, + to_dir=test_output_dir / "compatibility_snapshot", + port_distributor=port_distributor, + ) - shutil.copytree(compatibility_snapshot_dir / "repo", repo_dir) + breaking_changes_allowed = ( + os.environ.get("ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" + ) + try: + check_neon_works( + test_output_dir / "compatibility_snapshot" / "repo", + neon_binpath, + pg_distrib_dir, + pg_version, + port_distributor, + test_output_dir, + pg_bin, + request, + ) + except Exception: + if breaking_changes_allowed: + pytest.xfail( + "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE env var" + ) + else: + raise + + assert ( + not breaking_changes_allowed + ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + + +@pytest.mark.xdist_group("compatibility") +@pytest.mark.order(after="test_create_snapshot") +def test_forward_compatibility( + test_output_dir: Path, + port_distributor: PortDistributor, + pg_version: str, + request: FixtureRequest, +): + compatibility_neon_bin_env = os.environ.get("COMPATIBILITY_NEON_BIN") + assert compatibility_neon_bin_env is not None, ( + "COMPATIBILITY_NEON_BIN is not set. It should be set to a path with Neon binaries " + "(ideally generated by the previous version of Neon)" + ) + compatibility_neon_bin = Path(compatibility_neon_bin_env).resolve() + + compatibility_postgres_distrib_dir_env = os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR") + assert ( + compatibility_postgres_distrib_dir_env is not None + ), "COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set. 
It should be set to a pg_install directrory (ideally generated by the previous version of Neon)" + compatibility_postgres_distrib_dir = Path(compatibility_postgres_distrib_dir_env).resolve() + + compatibility_snapshot_dir = ( + test_output_dir.parent / "test_create_snapshot" / "compatibility_snapshot_pg14" + ) + # Copy the snapshot to current directory, and prepare for the test + prepare_snapshot( + from_dir=compatibility_snapshot_dir, + to_dir=test_output_dir / "compatibility_snapshot", + port_distributor=port_distributor, + ) + + breaking_changes_allowed = ( + os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" + ) + try: + check_neon_works( + test_output_dir / "compatibility_snapshot" / "repo", + compatibility_neon_bin, + compatibility_postgres_distrib_dir, + pg_version, + port_distributor, + test_output_dir, + PgBin(test_output_dir, compatibility_postgres_distrib_dir, pg_version), + request, + ) + except Exception: + if breaking_changes_allowed: + pytest.xfail( + "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE env var" + ) + else: + raise + + assert ( + not breaking_changes_allowed + ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + + +def prepare_snapshot(from_dir: Path, to_dir: Path, port_distributor: PortDistributor): + assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist" + assert (from_dir / "repo").exists(), f"Snapshot '{from_dir}' doesn't contain a repo directory" + assert (from_dir / "dump.sql").exists(), f"Snapshot '{from_dir}' doesn't contain a dump.sql" + + log.info(f"Copying snapshot from {from_dir} to {to_dir}") + shutil.copytree(from_dir, to_dir) + + repo_dir = to_dir / "repo" # Remove old logs to avoid confusion in test artifacts for logfile in repo_dir.glob("**/*.log"): logfile.unlink() - # Remove tenants data for computes + # Remove tenants data for compute for tenant in (repo_dir / "pgdatadirs" / "tenants").glob("*"): shutil.rmtree(tenant) @@ -115,20 +206,17 @@ def test_backward_compatibility( shutil.rmtree(tenant / "wal-redo-datadir.___temp") # Update paths and ports in config files - pr = PortReplacer(port_distributor) - pageserver_toml = repo_dir / "pageserver.toml" pageserver_config = toml.load(pageserver_toml) - new_local_path = pageserver_config["remote_storage"]["local_path"].replace( - "/test_prepare_snapshot/", - "/test_backward_compatibility/compatibility_snapshot/", + pageserver_config["remote_storage"]["local_path"] = repo_dir / "local_fs_remote_storage" + pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port( + pageserver_config["listen_http_addr"] + ) + pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port( + pageserver_config["listen_pg_addr"] ) - - pageserver_config["remote_storage"]["local_path"] = new_local_path - pageserver_config["listen_http_addr"] = pr.replace_port(pageserver_config["listen_http_addr"]) - pageserver_config["listen_pg_addr"] = pr.replace_port(pageserver_config["listen_pg_addr"]) pageserver_config["broker_endpoints"] = [ - pr.replace_port(ep) for ep in pageserver_config["broker_endpoints"] + port_distributor.replace_with_new_port(ep) for ep in pageserver_config["broker_endpoints"] ] with pageserver_toml.open("w") as f: @@ -137,17 +225,18 @@ def test_backward_compatibility( snapshot_config_toml = repo_dir / "config" snapshot_config = toml.load(snapshot_config_toml) snapshot_config["etcd_broker"]["broker_endpoints"] = [ - pr.replace_port(ep) for ep in 
snapshot_config["etcd_broker"]["broker_endpoints"] + port_distributor.replace_with_new_port(ep) + for ep in snapshot_config["etcd_broker"]["broker_endpoints"] ] - snapshot_config["pageserver"]["listen_http_addr"] = pr.replace_port( + snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port( snapshot_config["pageserver"]["listen_http_addr"] ) - snapshot_config["pageserver"]["listen_pg_addr"] = pr.replace_port( + snapshot_config["pageserver"]["listen_pg_addr"] = port_distributor.replace_with_new_port( snapshot_config["pageserver"]["listen_pg_addr"] ) for sk in snapshot_config["safekeepers"]: - sk["http_port"] = pr.replace_port(sk["http_port"]) - sk["pg_port"] = pr.replace_port(sk["pg_port"]) + sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"]) + sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"]) with (snapshot_config_toml).open("w") as f: toml.dump(snapshot_config, f) @@ -159,7 +248,7 @@ def test_backward_compatibility( "--recursive", "--binary-file=without-match", "--files-with-matches", - "test_prepare_snapshot/repo", + "test_create_snapshot/repo", str(repo_dir), ], capture_output=True, @@ -167,44 +256,47 @@ def test_backward_compatibility( ) assert ( rv.returncode != 0 - ), f"there're files referencing `test_prepare_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" + ), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" - # NeonEnv stub to make NeonCli happy + +def check_neon_works( + repo_dir: Path, + neon_binpath: Path, + pg_distrib_dir: Path, + pg_version: str, + port_distributor: PortDistributor, + test_output_dir: Path, + pg_bin: PgBin, + request: FixtureRequest, +): + snapshot_config_toml = repo_dir / "config" + snapshot_config = toml.load(snapshot_config_toml) + snapshot_config["neon_distrib_dir"] = str(neon_binpath) + snapshot_config["postgres_distrib_dir"] = str(pg_distrib_dir) + with (snapshot_config_toml).open("w") as f: + toml.dump(snapshot_config, f) + + # TODO: replace with NeonEnvBuilder / NeonEnv config: Any = type("NeonEnvStub", (object,), {}) config.rust_log_override = None config.repo_dir = repo_dir - config.pg_version = "14" # Note: `pg_dumpall` (from pg_bin) version is set by DEFAULT_PG_VERSION_DEFAULT and can be overriden by DEFAULT_PG_VERSION env var + config.pg_version = pg_version config.initial_tenant = snapshot_config["default_tenant_id"] config.neon_binpath = neon_binpath config.pg_distrib_dir = pg_distrib_dir - # Check that we can start the project cli = NeonCli(config) - try: - cli.raw_cli(["start"]) - request.addfinalizer(lambda: cli.raw_cli(["stop"])) + cli.raw_cli(["start"]) + request.addfinalizer(lambda: cli.raw_cli(["stop"])) - result = cli.pg_start("main", port=port_distributor.get_port()) - request.addfinalizer(lambda: cli.pg_stop("main")) - except Exception: - breaking_changes_allowed = ( - os.environ.get("ALLOW_BREAKING_CHANGES", "false").lower() == "true" - ) - if breaking_changes_allowed: - pytest.xfail("Breaking changes are allowed by ALLOW_BREAKING_CHANGES env var") - else: - raise + pg_port = port_distributor.get_port() + cli.pg_start("main", port=pg_port) + request.addfinalizer(lambda: cli.pg_stop("main")) - connstr_all = re.findall(r"Starting postgres node at '([^']+)'", result.stdout) - assert len(connstr_all) == 1, f"can't parse connstr from {result.stdout}" - connstr = connstr_all[0] - - # Check that the project produces the same dump as the previous version. 
- # The assert itself deferred to the end of the test - # to allow us to perform checks that change data before failing + connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres" pg_bin.run(["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"]) initial_dump_differs = dump_differs( - compatibility_snapshot_dir / "dump.sql", + repo_dir.parent / "dump.sql", test_output_dir / "dump.sql", test_output_dir / "dump.filediff", ) @@ -242,38 +334,23 @@ def test_backward_compatibility( assert not initial_dump_differs, "initial dump differs" -# Note: if renaming this test, don't forget to update a reference to it in a workflow file: -# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml -def test_prepare_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path): - # The test doesn't really test anything - # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`. - # - # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it. - neon_env_builder.pg_version = "14" - neon_env_builder.num_safekeepers = 3 - neon_env_builder.enable_local_fs_remote_storage() +def dump_differs(first: Path, second: Path, output: Path) -> bool: + """ + Runs diff(1) command on two SQL dumps and write the output to the given output file. + Returns True if the dumps differ, False otherwise. + """ - env = neon_env_builder.init_start() - pg = env.postgres.create_start("main") - pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()]) - pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()]) - pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]) + with output.open("w") as stdout: + rv = subprocess.run( + [ + "diff", + "--unified", # Make diff output more readable + "--ignore-matching-lines=^--", # Ignore changes in comments + "--ignore-blank-lines", + str(first), + str(second), + ], + stdout=stdout, + ) - snapshot_config = toml.load(test_output_dir / "repo" / "config") - tenant_id = snapshot_config["default_tenant_id"] - timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] - - pageserver_http = env.pageserver.http_client() - lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) - - env.postgres.stop_all() - for sk in env.safekeepers: - sk.stop() - env.pageserver.stop() - - shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14") - # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it + return rv.returncode != 0 From f720dd735e704b21a7ec702b39104335209c48c2 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 8 Nov 2022 10:51:34 +0100 Subject: [PATCH 1007/1022] Stricter mypy linters for `test_runner/fixtures/*` --- test_runner/fixtures/benchmark_fixture.py | 88 ++++---- test_runner/fixtures/compare_fixtures.py | 87 ++++---- test_runner/fixtures/metrics.py | 12 +- test_runner/fixtures/neon_fixtures.py | 188 ++++++++++-------- test_runner/fixtures/pg_stats.py | 4 +- test_runner/fixtures/slow.py | 10 +- test_runner/fixtures/types.py | 38 ++-- test_runner/fixtures/utils.py | 20 +- 
.../performance/test_wal_backpressure.py | 5 +- .../python/asyncpg/asyncpg_example.py | 3 +- test_runner/regress/test_proxy.py | 1 + 11 files changed, 255 insertions(+), 201 deletions(-) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index b5565dab0f..27fb0a60b2 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -11,39 +11,37 @@ from datetime import datetime from pathlib import Path # Type-related stuff -from typing import Iterator, Optional +from typing import Callable, ClassVar, Iterator, Optional import pytest from _pytest.config import Config +from _pytest.config.argparsing import Parser from _pytest.terminal import TerminalReporter +from fixtures.neon_fixtures import NeonPageserver from fixtures.types import TenantId, TimelineId """ This file contains fixtures for micro-benchmarks. -To use, declare the 'zenbenchmark' fixture in the test function. Run the -bencmark, and then record the result by calling zenbenchmark.record. For example: +To use, declare the `zenbenchmark` fixture in the test function. Run the +bencmark, and then record the result by calling `zenbenchmark.record`. For example: -import timeit -from fixtures.neon_fixtures import NeonEnv - -def test_mybench(neon_simple_env: env, zenbenchmark): - - # Initialize the test - ... - - # Run the test, timing how long it takes - with zenbenchmark.record_duration('test_query'): - cur.execute('SELECT test_query(...)') - - # Record another measurement - zenbenchmark.record('speed_of_light', 300000, 'km/s') +>>> import timeit +>>> from fixtures.neon_fixtures import NeonEnv +>>> def test_mybench(neon_simple_env: NeonEnv, zenbenchmark): +... # Initialize the test +... ... +... # Run the test, timing how long it takes +... with zenbenchmark.record_duration('test_query'): +... cur.execute('SELECT test_query(...)') +... # Record another measurement +... zenbenchmark.record('speed_of_light', 300000, 'km/s') There's no need to import this file to use it. It should be declared as a plugin -inside conftest.py, and that makes it available to all tests. +inside `conftest.py`, and that makes it available to all tests. You can measure multiple things in one test, and record each one with a separate -call to zenbenchmark. For example, you could time the bulk loading that happens +call to `zenbenchmark`. For example, you could time the bulk loading that happens in the test initialization, or measure disk usage after the test query. """ @@ -117,7 +115,7 @@ class PgBenchRunResult: # tps = 309.281539 (without initial connection time) if line.startswith("tps = ") and ( "(excluding connections establishing)" in line - or "(without initial connection time)" + or "(without initial connection time)" in line ): tps = float(line.split()[2]) @@ -137,6 +135,17 @@ class PgBenchRunResult: @dataclasses.dataclass class PgBenchInitResult: + REGEX: ClassVar[re.Pattern] = re.compile( # type: ignore[type-arg] + r"done in (\d+\.\d+) s " + r"\(" + r"(?:drop tables (\d+\.\d+) s)?(?:, )?" + r"(?:create tables (\d+\.\d+) s)?(?:, )?" + r"(?:client-side generate (\d+\.\d+) s)?(?:, )?" + r"(?:vacuum (\d+\.\d+) s)?(?:, )?" + r"(?:primary keys (\d+\.\d+) s)?(?:, )?" + r"\)\." + ) + total: float drop_tables: Optional[float] create_tables: Optional[float] @@ -160,18 +169,7 @@ class PgBenchInitResult: last_line = stderr.splitlines()[-1] - regex = re.compile( - r"done in (\d+\.\d+) s " - r"\(" - r"(?:drop tables (\d+\.\d+) s)?(?:, )?" - r"(?:create tables (\d+\.\d+) s)?(?:, )?" 
- r"(?:client-side generate (\d+\.\d+) s)?(?:, )?" - r"(?:vacuum (\d+\.\d+) s)?(?:, )?" - r"(?:primary keys (\d+\.\d+) s)?(?:, )?" - r"\)\." - ) - - if (m := regex.match(last_line)) is not None: + if (m := cls.REGEX.match(last_line)) is not None: total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [ float(v) for v in m.groups() if v is not None ] @@ -208,7 +206,7 @@ class NeonBenchmarker: function by the zenbenchmark fixture """ - def __init__(self, property_recorder): + def __init__(self, property_recorder: Callable[[str, object], None]): # property recorder here is a pytest fixture provided by junitxml module # https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property self.property_recorder = property_recorder @@ -236,7 +234,7 @@ class NeonBenchmarker: ) @contextmanager - def record_duration(self, metric_name: str): + def record_duration(self, metric_name: str) -> Iterator[None]: """ Record a duration. Usage: @@ -337,21 +335,21 @@ class NeonBenchmarker: f"{prefix}.{metric}", value, unit="s", report=MetricReport.LOWER_IS_BETTER ) - def get_io_writes(self, pageserver) -> int: + def get_io_writes(self, pageserver: NeonPageserver) -> int: """ Fetch the "cumulative # of bytes written" metric from the pageserver """ metric_name = r'libmetrics_disk_io_bytes_total{io_operation="write"}' return self.get_int_counter_value(pageserver, metric_name) - def get_peak_mem(self, pageserver) -> int: + def get_peak_mem(self, pageserver: NeonPageserver) -> int: """ Fetch the "maxrss" metric from the pageserver """ metric_name = r"libmetrics_maxrss_kb" return self.get_int_counter_value(pageserver, metric_name) - def get_int_counter_value(self, pageserver, metric_name) -> int: + def get_int_counter_value(self, pageserver: NeonPageserver, metric_name: str) -> int: """Fetch the value of given int counter from pageserver metrics.""" # TODO: If we start to collect more of the prometheus metrics in the # performance test suite like this, we should refactor this to load and @@ -365,7 +363,9 @@ class NeonBenchmarker: assert matches, f"metric {metric_name} not found" return int(round(float(matches.group(1)))) - def get_timeline_size(self, repo_dir: Path, tenant_id: TenantId, timeline_id: TimelineId): + def get_timeline_size( + self, repo_dir: Path, tenant_id: TenantId, timeline_id: TimelineId + ) -> int: """ Calculate the on-disk size of a timeline """ @@ -379,7 +379,9 @@ class NeonBenchmarker: return totalbytes @contextmanager - def record_pageserver_writes(self, pageserver, metric_name): + def record_pageserver_writes( + self, pageserver: NeonPageserver, metric_name: str + ) -> Iterator[None]: """ Record bytes written by the pageserver during a test. """ @@ -396,7 +398,7 @@ class NeonBenchmarker: @pytest.fixture(scope="function") -def zenbenchmark(record_property) -> Iterator[NeonBenchmarker]: +def zenbenchmark(record_property: Callable[[str, object], None]) -> Iterator[NeonBenchmarker]: """ This is a python decorator for benchmark fixtures. It contains functions for recording measurements, and prints them out at the end. 
@@ -405,7 +407,7 @@ def zenbenchmark(record_property) -> Iterator[NeonBenchmarker]: yield benchmarker -def pytest_addoption(parser): +def pytest_addoption(parser: Parser): parser.addoption( "--out-dir", dest="out_dir", @@ -429,7 +431,9 @@ def get_out_path(target_dir: Path, revision: str) -> Path: # Hook to print the results at the end @pytest.hookimpl(hookwrapper=True) -def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, config: Config): +def pytest_terminal_summary( + terminalreporter: TerminalReporter, exitstatus: int, config: Config +) -> Iterator[None]: yield revision = os.getenv("GITHUB_SHA", "local") platform = os.getenv("PLATFORM", "local") diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 2d36d90bd6..291f924379 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -1,10 +1,11 @@ from abc import ABC, abstractmethod -from contextlib import contextmanager +from contextlib import _GeneratorContextManager, contextmanager # Type-related stuff -from typing import Dict, List +from typing import Dict, Iterator, List import pytest +from _pytest.fixtures import FixtureRequest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.neon_fixtures import NeonEnv, PgBin, PgProtocol, RemotePostgres, VanillaPostgres from fixtures.pg_stats import PgStatTable @@ -28,19 +29,20 @@ class PgCompare(ABC): pass @property + @abstractmethod def zenbenchmark(self) -> NeonBenchmarker: pass @abstractmethod - def flush(self) -> None: + def flush(self): pass @abstractmethod - def report_peak_memory_use(self) -> None: + def report_peak_memory_use(self): pass @abstractmethod - def report_size(self) -> None: + def report_size(self): pass @contextmanager @@ -54,7 +56,7 @@ class PgCompare(ABC): pass @contextmanager - def record_pg_stats(self, pg_stats: List[PgStatTable]): + def record_pg_stats(self, pg_stats: List[PgStatTable]) -> Iterator[None]: init_data = self._retrieve_pg_stats(pg_stats) yield @@ -84,7 +86,11 @@ class NeonCompare(PgCompare): """PgCompare interface for the neon stack.""" def __init__( - self, zenbenchmark: NeonBenchmarker, neon_simple_env: NeonEnv, pg_bin: PgBin, branch_name + self, + zenbenchmark: NeonBenchmarker, + neon_simple_env: NeonEnv, + pg_bin: PgBin, + branch_name: str, ): self.env = neon_simple_env self._zenbenchmark = zenbenchmark @@ -97,15 +103,15 @@ class NeonCompare(PgCompare): self.timeline = self.pg.safe_psql("SHOW neon.timeline_id")[0][0] @property - def pg(self): + def pg(self) -> PgProtocol: return self._pg @property - def zenbenchmark(self): + def zenbenchmark(self) -> NeonBenchmarker: return self._zenbenchmark @property - def pg_bin(self): + def pg_bin(self) -> PgBin: return self._pg_bin def flush(self): @@ -114,7 +120,7 @@ class NeonCompare(PgCompare): def compact(self): self.pageserver_http_client.timeline_compact(self.env.initial_tenant, self.timeline) - def report_peak_memory_use(self) -> None: + def report_peak_memory_use(self): self.zenbenchmark.record( "peak_mem", self.zenbenchmark.get_peak_mem(self.env.pageserver) / 1024, @@ -122,7 +128,7 @@ class NeonCompare(PgCompare): report=MetricReport.LOWER_IS_BETTER, ) - def report_size(self) -> None: + def report_size(self): timeline_size = self.zenbenchmark.get_timeline_size( self.env.repo_dir, self.env.initial_tenant, self.timeline ) @@ -144,17 +150,17 @@ class NeonCompare(PgCompare): "num_files_uploaded", total_files, "", report=MetricReport.LOWER_IS_BETTER ) - def 
record_pageserver_writes(self, out_name): + def record_pageserver_writes(self, out_name: str) -> _GeneratorContextManager[None]: return self.zenbenchmark.record_pageserver_writes(self.env.pageserver, out_name) - def record_duration(self, out_name): + def record_duration(self, out_name: str) -> _GeneratorContextManager[None]: return self.zenbenchmark.record_duration(out_name) class VanillaCompare(PgCompare): """PgCompare interface for vanilla postgres.""" - def __init__(self, zenbenchmark, vanilla_pg: VanillaPostgres): + def __init__(self, zenbenchmark: NeonBenchmarker, vanilla_pg: VanillaPostgres): self._pg = vanilla_pg self._zenbenchmark = zenbenchmark vanilla_pg.configure( @@ -170,24 +176,24 @@ class VanillaCompare(PgCompare): self.cur = self.conn.cursor() @property - def pg(self): + def pg(self) -> PgProtocol: return self._pg @property - def zenbenchmark(self): + def zenbenchmark(self) -> NeonBenchmarker: return self._zenbenchmark @property - def pg_bin(self): + def pg_bin(self) -> PgBin: return self._pg.pg_bin def flush(self): self.cur.execute("checkpoint") - def report_peak_memory_use(self) -> None: + def report_peak_memory_use(self): pass # TODO find something - def report_size(self) -> None: + def report_size(self): data_size = self.pg.get_subdir_size("base") self.zenbenchmark.record( "data_size", data_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER @@ -198,17 +204,17 @@ class VanillaCompare(PgCompare): ) @contextmanager - def record_pageserver_writes(self, out_name): + def record_pageserver_writes(self, out_name: str) -> Iterator[None]: yield # Do nothing - def record_duration(self, out_name): + def record_duration(self, out_name: str) -> _GeneratorContextManager[None]: return self.zenbenchmark.record_duration(out_name) class RemoteCompare(PgCompare): """PgCompare interface for a remote postgres instance.""" - def __init__(self, zenbenchmark, remote_pg: RemotePostgres): + def __init__(self, zenbenchmark: NeonBenchmarker, remote_pg: RemotePostgres): self._pg = remote_pg self._zenbenchmark = zenbenchmark @@ -217,55 +223,60 @@ class RemoteCompare(PgCompare): self.cur = self.conn.cursor() @property - def pg(self): + def pg(self) -> PgProtocol: return self._pg @property - def zenbenchmark(self): + def zenbenchmark(self) -> NeonBenchmarker: return self._zenbenchmark @property - def pg_bin(self): + def pg_bin(self) -> PgBin: return self._pg.pg_bin def flush(self): # TODO: flush the remote pageserver pass - def report_peak_memory_use(self) -> None: + def report_peak_memory_use(self): # TODO: get memory usage from remote pageserver pass - def report_size(self) -> None: + def report_size(self): # TODO: get storage size from remote pageserver pass @contextmanager - def record_pageserver_writes(self, out_name): + def record_pageserver_writes(self, out_name: str) -> Iterator[None]: yield # Do nothing - def record_duration(self, out_name): + def record_duration(self, out_name: str) -> _GeneratorContextManager[None]: return self.zenbenchmark.record_duration(out_name) @pytest.fixture(scope="function") -def neon_compare(request, zenbenchmark, pg_bin, neon_simple_env) -> NeonCompare: +def neon_compare( + request: FixtureRequest, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + neon_simple_env: NeonEnv, +) -> NeonCompare: branch_name = request.node.name return NeonCompare(zenbenchmark, neon_simple_env, pg_bin, branch_name) @pytest.fixture(scope="function") -def vanilla_compare(zenbenchmark, vanilla_pg) -> VanillaCompare: +def vanilla_compare(zenbenchmark: NeonBenchmarker, 
vanilla_pg: VanillaPostgres) -> VanillaCompare: return VanillaCompare(zenbenchmark, vanilla_pg) @pytest.fixture(scope="function") -def remote_compare(zenbenchmark, remote_pg) -> RemoteCompare: +def remote_compare(zenbenchmark: NeonBenchmarker, remote_pg: RemotePostgres) -> RemoteCompare: return RemoteCompare(zenbenchmark, remote_pg) @pytest.fixture(params=["vanilla_compare", "neon_compare"], ids=["vanilla", "neon"]) -def neon_with_baseline(request) -> PgCompare: +def neon_with_baseline(request: FixtureRequest) -> PgCompare: """Parameterized fixture that helps compare neon against vanilla postgres. A test that uses this fixture turns into a parameterized test that runs against: @@ -286,8 +297,6 @@ def neon_with_baseline(request) -> PgCompare: implementation-specific logic is widely useful across multiple tests, it might make sense to add methods to the PgCompare class. """ - fixture = request.getfixturevalue(request.param) - if isinstance(fixture, PgCompare): - return fixture - else: - raise AssertionError(f"test error: fixture {request.param} is not PgCompare") + fixture = request.getfixturevalue(request.param) # type: ignore + assert isinstance(fixture, PgCompare), f"test error: fixture {fixture} is not PgCompare" + return fixture diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 62e3cbbe99..86ab4425ed 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Dict, List +from typing import Dict, List, Optional, Tuple from prometheus_client.parser import text_string_to_metric_families from prometheus_client.samples import Sample @@ -23,13 +23,13 @@ class Metrics: pass return res - def query_one(self, name: str, filter: Dict[str, str] = {}) -> Sample: - res = self.query_all(name, filter) + def query_one(self, name: str, filter: Optional[Dict[str, str]] = None) -> Sample: + res = self.query_all(name, filter or {}) assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}" return res[0] -def parse_metrics(text: str, name: str = ""): +def parse_metrics(text: str, name: str = "") -> Metrics: metrics = Metrics(name) gen = text_string_to_metric_families(text) for family in gen: @@ -39,7 +39,7 @@ def parse_metrics(text: str, name: str = ""): return metrics -PAGESERVER_PER_TENANT_METRICS = [ +PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_current_logical_size", "pageserver_current_physical_size", "pageserver_getpage_reconstruct_seconds_bucket", @@ -62,4 +62,4 @@ PAGESERVER_PER_TENANT_METRICS = [ "pageserver_wait_lsn_seconds_sum", "pageserver_created_persistent_files_total", "pageserver_written_persistent_bytes_total", -] +) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7a46a08f08..f68c6a25db 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -19,7 +19,8 @@ from dataclasses import dataclass, field from enum import Flag, auto from functools import cached_property from pathlib import Path -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast +from types import TracebackType +from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast import asyncpg import backoff # type: ignore @@ -28,16 +29,18 @@ import jwt import psycopg2 import pytest import requests +from _pytest.config import Config +from _pytest.fixtures import FixtureRequest from fixtures.log_helper import log from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import Fn, allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture # Type-related stuff from psycopg2.extensions import connection as PgConnection +from psycopg2.extensions import cursor as PgCursor from psycopg2.extensions import make_dsn, parse_dsn from typing_extensions import Literal -from .utils import Fn, allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture - """ This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. @@ -57,15 +60,15 @@ put directly-importable functions into utils.py or another separate file. Env = Dict[str, str] -DEFAULT_OUTPUT_DIR = "test_output" -DEFAULT_BRANCH_NAME = "main" -DEFAULT_PG_VERSION_DEFAULT = "14" +DEFAULT_OUTPUT_DIR: str = "test_output" +DEFAULT_BRANCH_NAME: str = "main" +DEFAULT_PG_VERSION_DEFAULT: str = "14" -BASE_PORT = 15000 -WORKER_PORT_NUM = 1000 +BASE_PORT: int = 15000 +WORKER_PORT_NUM: int = 1000 -def pytest_configure(config): +def pytest_configure(config: Config): """ Check that we do not overflow available ports range. """ @@ -154,14 +157,14 @@ def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: str) -> Iterator[ if not psql_bin_path.exists(): raise Exception(f"psql not found at '{psql_bin_path}'") else: - if not postgres_bin_path.exists: + if not postgres_bin_path.exists(): raise Exception(f"postgres not found at '{postgres_bin_path}'") log.info(f"versioned_pg_distrib_dir is {versioned_dir}") yield versioned_dir -def shareable_scope(fixture_name, config) -> Literal["session", "function"]: +def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]: """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar. 
This function can be used as a scope like this: @@ -173,7 +176,7 @@ def shareable_scope(fixture_name, config) -> Literal["session", "function"]: @pytest.fixture(scope="session") -def worker_seq_no(worker_id: str): +def worker_seq_no(worker_id: str) -> int: # worker_id is a pytest-xdist fixture # it can be master or gw # parse it to always get a number @@ -184,7 +187,7 @@ def worker_seq_no(worker_id: str): @pytest.fixture(scope="session") -def worker_base_port(worker_seq_no: int): +def worker_base_port(worker_seq_no: int) -> int: # so we divide ports in ranges of 100 ports # so workers have disjoint set of ports for services return BASE_PORT + worker_seq_no * WORKER_PORT_NUM @@ -234,10 +237,9 @@ class PortDistributor: for port in self.iterator: if can_bind("localhost", port): return port - else: - raise RuntimeError( - "port range configured for test is exhausted, consider enlarging the range" - ) + raise RuntimeError( + "port range configured for test is exhausted, consider enlarging the range" + ) def replace_with_new_port(self, value: Union[int, str]) -> Union[int, str]: """ @@ -273,12 +275,14 @@ class PortDistributor: @pytest.fixture(scope="session") -def port_distributor(worker_base_port): +def port_distributor(worker_base_port: int) -> PortDistributor: return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) @pytest.fixture(scope="session") -def default_broker(request: Any, port_distributor: PortDistributor, top_output_dir: Path): +def default_broker( + request: FixtureRequest, port_distributor: PortDistributor, top_output_dir: Path +) -> Iterator[Etcd]: client_port = port_distributor.get_port() # multiple pytest sessions could get launched in parallel, get them different datadirs etcd_datadir = get_test_output_dir(request, top_output_dir) / f"etcd_datadir_{client_port}" @@ -293,12 +297,12 @@ def default_broker(request: Any, port_distributor: PortDistributor, top_output_d @pytest.fixture(scope="session") -def run_id(): +def run_id() -> Iterator[uuid.UUID]: yield uuid.uuid4() @pytest.fixture(scope="session") -def mock_s3_server(port_distributor: PortDistributor): +def mock_s3_server(port_distributor: PortDistributor) -> Iterator[MockS3Server]: mock_s3_server = MockS3Server(port_distributor.get_port()) yield mock_s3_server mock_s3_server.kill() @@ -307,16 +311,16 @@ def mock_s3_server(port_distributor: PortDistributor): class PgProtocol: """Reusable connection logic""" - def __init__(self, **kwargs): + def __init__(self, **kwargs: Any): self.default_options = kwargs - def connstr(self, **kwargs) -> str: + def connstr(self, **kwargs: Any) -> str: """ Build a libpq connection string for the Postgres instance. """ return str(make_dsn(**self.conn_options(**kwargs))) - def conn_options(self, **kwargs): + def conn_options(self, **kwargs: Any) -> Dict[str, Any]: """ Construct a dictionary of connection options from default values and extra parameters. An option can be dropped from the returning dictionary by None-valued extra parameter. @@ -338,7 +342,7 @@ class PgProtocol: return result # autocommit=True here by default because that's what we need most of the time - def connect(self, autocommit=True, **kwargs) -> PgConnection: + def connect(self, autocommit: bool = True, **kwargs: Any) -> PgConnection: """ Connect to the node. Returns psycopg2's connection object. 
@@ -351,7 +355,7 @@ class PgProtocol: return conn @contextmanager - def cursor(self, autocommit=True, **kwargs): + def cursor(self, autocommit: bool = True, **kwargs: Any) -> Iterator[PgCursor]: """ Shorthand for pg.connect().cursor(). The cursor and connection are closed when the context is exited. @@ -359,7 +363,7 @@ class PgProtocol: with closing(self.connect(autocommit=autocommit, **kwargs)) as conn: yield conn.cursor() - async def connect_async(self, **kwargs) -> asyncpg.Connection: + async def connect_async(self, **kwargs: Any) -> asyncpg.Connection: """ Connect to the node from async python. Returns asyncpg's connection object. @@ -413,10 +417,10 @@ class PgProtocol: @dataclass class AuthKeys: - pub: bytes - priv: bytes + pub: str + priv: str - def generate_management_token(self): + def generate_management_token(self) -> str: token = jwt.encode({"scope": "pageserverapi"}, self.priv, algorithm="RS256") # jwt.encode can return 'bytes' or 'str', depending on Python version or type @@ -427,9 +431,11 @@ class AuthKeys: return token - def generate_tenant_token(self, tenant_id): + def generate_tenant_token(self, tenant_id: TenantId) -> str: token = jwt.encode( - {"scope": "tenant", "tenant_id": str(tenant_id)}, self.priv, algorithm="RS256" + {"scope": "tenant", "tenant_id": str(tenant_id)}, + self.priv, + algorithm="RS256", ) if isinstance(token, bytes): @@ -485,7 +491,7 @@ class MockS3Server: @enum.unique -class RemoteStorageKind(enum.Enum): +class RemoteStorageKind(str, enum.Enum): LOCAL_FS = "local_fs" MOCK_S3 = "mock_s3" REAL_S3 = "real_s3" @@ -529,7 +535,7 @@ RemoteStorage = Union[LocalFsStorage, S3Storage] # serialize as toml inline table -def remote_storage_to_toml_inline_table(remote_storage): +def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str: if isinstance(remote_storage, LocalFsStorage): remote_storage_config = f"local_path='{remote_storage.root}'" elif isinstance(remote_storage, S3Storage): @@ -582,7 +588,7 @@ class NeonEnvBuilder: safekeepers_enable_fsync: bool = False, auth_enabled: bool = False, rust_log_override: Optional[str] = None, - default_branch_name=DEFAULT_BRANCH_NAME, + default_branch_name: str = DEFAULT_BRANCH_NAME, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -636,7 +642,7 @@ class NeonEnvBuilder: else: raise RuntimeError(f"Unknown storage type: {remote_storage_kind}") - def enable_local_fs_remote_storage(self, force_enable=True): + def enable_local_fs_remote_storage(self, force_enable: bool = True): """ Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path. Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. @@ -644,7 +650,7 @@ class NeonEnvBuilder: assert force_enable or self.remote_storage is None, "remote storage is enabled already" self.remote_storage = LocalFsStorage(Path(self.repo_dir / "local_fs_remote_storage")) - def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable=True): + def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable: bool = True): """ Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already. Starts up the mock server, if that does not run yet. 
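Several hunks in this patch follow the same idiom for typing context managers: a function decorated with @contextmanager is annotated as returning Iterator[T] (the type it yields), while a method that only hands back the unopened manager, such as record_duration in compare_fixtures.py above, is annotated as _GeneratorContextManager[T]. A minimal, self-contained sketch of that idiom (simplified, with illustrative names):

    from contextlib import _GeneratorContextManager, contextmanager
    from typing import Iterator

    # Illustrative sketch of the annotation idiom; function names are hypothetical.
    @contextmanager
    def record_duration(name: str) -> Iterator[None]:
        # The generator yields exactly once, so the declared type is Iterator[None];
        # @contextmanager wraps it into a context manager when called.
        yield

    def record_duration_proxy(name: str) -> _GeneratorContextManager[None]:
        # Returns the not-yet-entered manager, so the return type is the manager itself.
        return record_duration(name)

    with record_duration_proxy("test_query"):
        pass  # timed section would go here
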
@@ -671,7 +677,7 @@ class NeonEnvBuilder: secret_key=self.mock_s3_server.secret_key(), ) - def enable_real_s3_remote_storage(self, test_name: str, force_enable=True): + def enable_real_s3_remote_storage(self, test_name: str, force_enable: bool = True): """ Sets up configuration to use real s3 endpoint without mock server """ @@ -759,10 +765,15 @@ class NeonEnvBuilder: log.info("deleted %s objects from remote storage", cnt) - def __enter__(self): + def __enter__(self) -> "NeonEnvBuilder": return self - def __exit__(self, exc_type, exc_value, traceback): + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_value: Optional[BaseException], + traceback: Optional[TracebackType], + ): # Stop all the nodes. if self.env: log.info("Cleaning up all storage and compute nodes") @@ -909,7 +920,7 @@ class NeonEnv: def get_safekeeper_connstrs(self) -> str: """Get list of safekeeper endpoints suitable for safekeepers GUC""" - return ",".join([f"localhost:{wa.port.pg}" for wa in self.safekeepers]) + return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers) def timeline_dir(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" @@ -928,14 +939,14 @@ class NeonEnv: @cached_property def auth_keys(self) -> AuthKeys: - pub = (Path(self.repo_dir) / "auth_public_key.pem").read_bytes() - priv = (Path(self.repo_dir) / "auth_private_key.pem").read_bytes() + pub = (Path(self.repo_dir) / "auth_public_key.pem").read_text() + priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text() return AuthKeys(pub=pub, priv=priv) @pytest.fixture(scope=shareable_scope) def _shared_simple_env( - request: Any, + request: FixtureRequest, port_distributor: PortDistributor, mock_s3_server: MockS3Server, default_broker: Etcd, @@ -993,7 +1004,7 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: @pytest.fixture(scope="function") def neon_env_builder( - test_output_dir, + test_output_dir: str, port_distributor: PortDistributor, mock_s3_server: MockS3Server, neon_binpath: Path, @@ -1059,7 +1070,7 @@ class PageserverHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() - def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]) -> None: + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): self.is_testing_enabled_or_skip() if isinstance(config_strings, tuple): @@ -1189,7 +1200,6 @@ class PageserverHttpClient(requests.Session): self.verbose_error(res) res_json = res.json() assert res_json is None - return res_json def timeline_gc( self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int] @@ -1221,7 +1231,6 @@ class PageserverHttpClient(requests.Session): self.verbose_error(res) res_json = res.json() assert res_json is None - return res_json def timeline_get_lsn_by_timestamp( self, tenant_id: TenantId, timeline_id: TimelineId, timestamp @@ -1247,7 +1256,6 @@ class PageserverHttpClient(requests.Session): self.verbose_error(res) res_json = res.json() assert res_json is None - return res_json def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") @@ -1261,13 +1269,10 @@ class PageserverPort: http: int -CREATE_TIMELINE_ID_EXTRACTOR = re.compile( +CREATE_TIMELINE_ID_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg] r"^Created timeline '(?P[^']+)'", re.MULTILINE ) -CREATE_TIMELINE_ID_EXTRACTOR = 
re.compile( - r"^Created timeline '(?P[^']+)'", re.MULTILINE -) -TIMELINE_DATA_EXTRACTOR = re.compile( +TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg] r"\s?(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE ) @@ -1560,7 +1565,7 @@ class NeonCli(AbstractNeonCli): def pageserver_start( self, - overrides=(), + overrides: Tuple[str, ...] = (), ) -> "subprocess.CompletedProcess[str]": start_args = ["pageserver", "start", *overrides] append_pageserver_param_overrides( @@ -1718,7 +1723,7 @@ class NeonPageserver(PgProtocol): self.config_override = config_override self.version = env.get_pageserver_version() - def start(self, overrides=()) -> "NeonPageserver": + def start(self, overrides: Tuple[str, ...] = ()) -> "NeonPageserver": """ Start the page server. `overrides` allows to add some config to this pageserver start. @@ -1730,7 +1735,7 @@ class NeonPageserver(PgProtocol): self.running = True return self - def stop(self, immediate=False) -> "NeonPageserver": + def stop(self, immediate: bool = False) -> "NeonPageserver": """ Stop the page server. Returns self. @@ -1740,10 +1745,15 @@ class NeonPageserver(PgProtocol): self.running = False return self - def __enter__(self): + def __enter__(self) -> "NeonPageserver": return self - def __exit__(self, exc_type, exc, tb): + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): self.stop(immediate=True) def is_testing_enabled_or_skip(self): @@ -1855,7 +1865,7 @@ def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: str) -> PgBi class VanillaPostgres(PgProtocol): - def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): + def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init: bool = True): super().__init__(host="localhost", port=port, dbname="postgres") self.pgdatadir = pgdatadir self.pg_bin = pg_bin @@ -1890,10 +1900,15 @@ class VanillaPostgres(PgProtocol): """Return size of pgdatadir subdirectory in bytes.""" return get_dir_size(os.path.join(self.pgdatadir, subdir)) - def __enter__(self): + def __enter__(self) -> "VanillaPostgres": return self - def __exit__(self, exc_type, exc, tb): + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): if self.running: self.stop() @@ -1933,10 +1948,15 @@ class RemotePostgres(PgProtocol): # See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE raise Exception("cannot get size of a Postgres instance") - def __enter__(self): + def __enter__(self) -> "RemotePostgres": return self - def __exit__(self, exc_type, exc, tb): + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): # do nothing pass @@ -1975,7 +1995,7 @@ class PSQL: self.path = path self.database_url = f"postgres://{host}:{port}/main?options=project%3Dgeneric-project-name" - async def run(self, query=None): + async def run(self, query: Optional[str] = None) -> asyncio.subprocess.Process: run_args = [self.path, "--no-psqlrc", "--quiet", "--tuples-only", self.database_url] if query is not None: run_args += ["--command", query] @@ -2008,7 +2028,7 @@ class NeonProxy(PgProtocol): self._popen: Optional[subprocess.Popen[bytes]] = None self.link_auth_uri_prefix = "http://dummy-uri" - def start(self) -> None: + def start(self): """ Starts a proxy with option '--auth-backend postgres' and a postgres instance already provided though 
'--auth-endpoint '." """ @@ -2026,7 +2046,7 @@ class NeonProxy(PgProtocol): self._popen = subprocess.Popen(args) self._wait_until_ready() - def start_with_link_auth(self) -> None: + def start_with_link_auth(self): """ Starts a proxy with option '--auth-backend link' and a dummy authentication link '--uri dummy-auth-link'." """ @@ -2054,10 +2074,15 @@ class NeonProxy(PgProtocol): request_result.raise_for_status() return request_result.text - def __enter__(self): + def __enter__(self) -> "NeonProxy": return self - def __exit__(self, exc_type, exc, tb): + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): if self._popen is not None: # NOTE the process will die when we're done with tests anyway, because # it's a child process. This is mostly to clean up in between different tests. @@ -2065,7 +2090,7 @@ class NeonProxy(PgProtocol): @pytest.fixture(scope="function") -def link_proxy(port_distributor, neon_binpath: Path) -> Iterator[NeonProxy]: +def link_proxy(port_distributor: PortDistributor, neon_binpath: Path) -> Iterator[NeonProxy]: """Neon proxy that routes through link auth.""" http_port = port_distributor.get_port() proxy_port = port_distributor.get_port() @@ -2076,7 +2101,9 @@ def link_proxy(port_distributor, neon_binpath: Path) -> Iterator[NeonProxy]: @pytest.fixture(scope="function") -def static_proxy(vanilla_pg, port_distributor, neon_binpath: Path) -> Iterator[NeonProxy]: +def static_proxy( + vanilla_pg: VanillaPostgres, port_distributor: PortDistributor, neon_binpath: Path +) -> Iterator[NeonProxy]: """Neon proxy that routes directly to vanilla postgres.""" # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql` @@ -2276,10 +2303,15 @@ class Postgres(PgProtocol): return self - def __enter__(self): + def __enter__(self) -> "Postgres": return self - def __exit__(self, exc_type, exc, tb): + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): self.stop() @@ -2288,7 +2320,7 @@ class PostgresFactory: def __init__(self, env: NeonEnv): self.env = env - self.num_instances = 0 + self.num_instances: int = 0 self.instances: List[Postgres] = [] def create_start( @@ -2383,7 +2415,7 @@ class Safekeeper: break # success return self - def stop(self, immediate=False) -> "Safekeeper": + def stop(self, immediate: bool = False) -> "Safekeeper": log.info("Stopping safekeeper {}".format(self.id)) self.env.neon_cli.safekeeper_stop(self.id, immediate) self.running = False @@ -2598,7 +2630,7 @@ class Etcd: self.handle.wait() -def get_test_output_dir(request: Any, top_output_dir: Path) -> Path: +def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: """Compute the working directory for an individual test.""" test_name = request.node.name test_dir = top_output_dir / test_name.replace("/", "-") @@ -2618,7 +2650,7 @@ def get_test_output_dir(request: Any, top_output_dir: Path) -> Path: # this fixture ensures that the directory exists. That works because # 'autouse' fixtures are run before other fixtures. 
@pytest.fixture(scope="function", autouse=True) -def test_output_dir(request: Any, top_output_dir: Path) -> Iterator[Path]: +def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[Path]: """Create the working directory for an individual test.""" # one directory per test @@ -2682,7 +2714,7 @@ def should_skip_file(filename: str) -> bool: # # Test helpers # -def list_files_to_compare(pgdata_dir: Path): +def list_files_to_compare(pgdata_dir: Path) -> List[str]: pgdata_files = [] for root, _file, filenames in os.walk(pgdata_dir): for filename in filenames: diff --git a/test_runner/fixtures/pg_stats.py b/test_runner/fixtures/pg_stats.py index b2e6886eb3..adb3a7730e 100644 --- a/test_runner/fixtures/pg_stats.py +++ b/test_runner/fixtures/pg_stats.py @@ -1,3 +1,4 @@ +from functools import cached_property from typing import List import pytest @@ -13,7 +14,7 @@ class PgStatTable: self.columns = columns self.additional_query = filter_query - @property + @cached_property def query(self) -> str: return f"SELECT {','.join(self.columns)} FROM {self.table} {self.additional_query}" @@ -55,6 +56,5 @@ def pg_stats_wal() -> List[PgStatTable]: PgStatTable( "pg_stat_wal", ["wal_records", "wal_fpi", "wal_bytes", "wal_buffers_full", "wal_write"], - "", ) ] diff --git a/test_runner/fixtures/slow.py b/test_runner/fixtures/slow.py index 94199ae785..ae0e87b553 100644 --- a/test_runner/fixtures/slow.py +++ b/test_runner/fixtures/slow.py @@ -1,4 +1,8 @@ +from typing import Any, List + import pytest +from _pytest.config import Config +from _pytest.config.argparsing import Parser """ This plugin allows tests to be marked as slow using pytest.mark.slow. By default slow @@ -9,15 +13,15 @@ Copied from here: https://docs.pytest.org/en/latest/example/simple.html """ -def pytest_addoption(parser): +def pytest_addoption(parser: Parser): parser.addoption("--runslow", action="store_true", default=False, help="run slow tests") -def pytest_configure(config): +def pytest_configure(config: Config): config.addinivalue_line("markers", "slow: mark test as slow to run") -def pytest_collection_modifyitems(config, items): +def pytest_collection_modifyitems(config: Config, items: List[Any]): if config.getoption("--runslow"): # --runslow given in cli: do not skip slow tests return diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index de2e131b79..2bb962d44a 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -1,6 +1,8 @@ import random from functools import total_ordering -from typing import Union +from typing import Any, Type, TypeVar, Union + +T = TypeVar("T", bound="Id") @total_ordering @@ -17,31 +19,35 @@ class Lsn: """Convert lsn from hex notation to int.""" l, r = x.split("/") self.lsn_int = (int(l, 16) << 32) + int(r, 16) - # FIXME: error if it doesn't look like a valid LSN + assert 0 <= self.lsn_int <= 0xFFFFFFFF_FFFFFFFF - def __str__(self): + def __str__(self) -> str: """Convert lsn from int to standard hex notation.""" - return "{:X}/{:X}".format(self.lsn_int >> 32, self.lsn_int & 0xFFFFFFFF) + return f"{(self.lsn_int >> 32):X}/{(self.lsn_int & 0xFFFFFFFF):X}" - def __repr__(self): - return 'Lsn("{:X}/{:X}")'.format(self.lsn_int >> 32, self.lsn_int & 0xFFFFFFFF) + def __repr__(self) -> str: + return f'Lsn("{str(self)}")' - def __int__(self): + def __int__(self) -> int: return self.lsn_int - def __lt__(self, other: "Lsn") -> bool: + def __lt__(self, other: Any) -> bool: + if not isinstance(other, Lsn): + return NotImplemented return self.lsn_int < 
other.lsn_int - def __eq__(self, other) -> bool: + def __eq__(self, other: Any) -> bool: if not isinstance(other, Lsn): return NotImplemented return self.lsn_int == other.lsn_int # Returns the difference between two Lsns, in bytes - def __sub__(self, other: "Lsn") -> int: + def __sub__(self, other: Any) -> int: + if not isinstance(other, Lsn): + return NotImplemented return self.lsn_int - other.lsn_int - def __hash__(self): + def __hash__(self) -> int: return hash(self.lsn_int) @@ -57,7 +63,7 @@ class Id: self.id = bytearray.fromhex(x) assert len(self.id) == 16 - def __str__(self): + def __str__(self) -> str: return self.id.hex() def __lt__(self, other) -> bool: @@ -70,20 +76,20 @@ class Id: return NotImplemented return self.id == other.id - def __hash__(self): + def __hash__(self) -> int: return hash(str(self.id)) @classmethod - def generate(cls): + def generate(cls: Type[T]) -> T: """Generate a random ID""" return cls(random.randbytes(16).hex()) class TenantId(Id): - def __repr__(self): + def __repr__(self) -> str: return f'`TenantId("{self.id.hex()}")' class TimelineId(Id): - def __repr__(self): + def __repr__(self) -> str: return f'TimelineId("{self.id.hex()}")' diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index b04e02d3b8..506fe6f9da 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -6,7 +6,7 @@ import subprocess import tarfile import time from pathlib import Path -from typing import Any, Callable, List, Tuple, TypeVar +from typing import Any, Callable, Dict, List, Tuple, TypeVar import allure # type: ignore from fixtures.log_helper import log @@ -30,11 +30,11 @@ def subprocess_capture(capture_dir: Path, cmd: List[str], **kwargs: Any) -> str: If those files already exist, we will overwrite them. Returns basepath for files with captured output. 
""" - assert type(cmd) is list - base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) + assert isinstance(cmd, list) + base = f"{os.path.basename(cmd[0])}_{global_counter()}" basepath = os.path.join(capture_dir, base) - stdout_filename = basepath + ".stdout" - stderr_filename = basepath + ".stderr" + stdout_filename = f"{basepath}.stdout" + stderr_filename = f"{basepath}.stderr" try: with open(stdout_filename, "w") as stdout_f: @@ -64,7 +64,7 @@ def global_counter() -> int: return _global_counter -def print_gc_result(row): +def print_gc_result(row: Dict[str, Any]): log.info("GC duration {elapsed} ms".format_map(row)) log.info( " total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_pitr {layers_needed_by_pitr}" @@ -78,8 +78,7 @@ def etcd_path() -> Path: path_output = shutil.which("etcd") if path_output is None: raise RuntimeError("etcd not found in PATH") - else: - return Path(path_output) + return Path(path_output) def query_scalar(cur: cursor, query: str) -> Any: @@ -124,7 +123,6 @@ def get_timeline_dir_size(path: Path) -> int: # file is a delta layer _ = parse_delta_layer(dir_entry.name) sz += dir_entry.stat().st_size - continue return sz @@ -157,8 +155,8 @@ def get_scale_for_db(size_mb: int) -> int: return round(0.06689 * size_mb - 0.5) -ATTACHMENT_NAME_REGEX = re.compile( - r".+\.log|.+\.stderr|.+\.stdout|.+\.filediff|.+\.metrics|flamegraph\.svg|regression\.diffs|.+\.html" +ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] + r"flamegraph\.svg|regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)" ) diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 47e2435052..cb35cad46b 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -2,7 +2,7 @@ import statistics import threading import time import timeit -from typing import Callable +from typing import Any, Callable, List import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker @@ -197,7 +197,7 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte if not isinstance(env, NeonCompare): return - lsn_write_lags = [] + lsn_write_lags: List[Any] = [] last_received_lsn = Lsn(0) last_pg_flush_lsn = Lsn(0) @@ -216,6 +216,7 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte ) res = cur.fetchone() + assert isinstance(res, list) lsn_write_lags.append(res[0]) curr_received_lsn = Lsn(res[3]) diff --git a/test_runner/pg_clients/python/asyncpg/asyncpg_example.py b/test_runner/pg_clients/python/asyncpg/asyncpg_example.py index 7f579ce672..4d9dfb09c1 100755 --- a/test_runner/pg_clients/python/asyncpg/asyncpg_example.py +++ b/test_runner/pg_clients/python/asyncpg/asyncpg_example.py @@ -24,7 +24,6 @@ if __name__ == "__main__": if (v := os.environ.get(k, None)) is not None } - loop = asyncio.new_event_loop() - row = loop.run_until_complete(run(**kwargs)) + row = asyncio.run(run(**kwargs)) print(row[0]) diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index bd02841dc0..b4647ebbe9 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -129,6 +129,7 @@ async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProx create_and_send_db_info(vanilla_pg, psql_session_id, link_proxy.mgmt_port) + assert proc.stdout is not None out = (await proc.stdout.read()).decode("utf-8").strip() assert out == 
"42" From 8654e95fae5a035307d9348bac1d6a68448df955 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 8 Nov 2022 16:44:17 +0100 Subject: [PATCH 1008/1022] walredo: fix zombie processes ([postgres] ) This change wraps the std::process:Child that we spawn for WAL redo into a type that ensures that we try to SIGKILL + waitpid() on it. If there is no explicit call to kill_and_wait(), the Drop implementation will spawns a task that does it in the BACKGROUND_RUNTIME. That's an ugly hack but I think it's better than doing kill+wait synchronously from Drop, since I think the general assumption in the Rust ecosystem is that Drop doesn't block. Especially since the drop sites can be _any_ place that drops the last Arc, e.g., compaction or GC. The benefit of having the new type over just adding a Drop impl to PostgresRedoProcess is that we can construct it earlier than the full PostgresRedoProcess in PostgresRedoProcess::launch(). That allows us to correctly kill+wait the child if there is an error in PostgresRedoProcess::launch() after spawning it. I also took a stab at a regression test. I manually verified that it fails before the fix to walredo.rs. fixes https://github.com/neondatabase/neon/issues/2761 closes https://github.com/neondatabase/neon/pull/2776 --- pageserver/src/walredo.rs | 150 ++++++++++++------ poetry.lock | 64 ++++++-- pyproject.toml | 2 + .../test_walredo_not_left_behind_on_detach.py | 104 ++++++++++++ 4 files changed, 261 insertions(+), 59 deletions(-) create mode 100644 test_runner/regress/test_walredo_not_left_behind_on_detach.py diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index a787da7069..59dadbb1d3 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -22,10 +22,10 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; use serde::Serialize; -use std::fs; use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; +use std::ops::{Deref, DerefMut}; use std::os::unix::io::AsRawFd; use std::os::unix::prelude::CommandExt; use std::path::PathBuf; @@ -34,6 +34,7 @@ use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; use std::sync::Mutex; use std::time::Duration; use std::time::Instant; +use std::{fs, io}; use tracing::*; use utils::crashsafe::path_with_suffix_extension; use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; @@ -44,6 +45,7 @@ use crate::metrics::{ }; use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::repository::Key; +use crate::task_mgr::BACKGROUND_RUNTIME; use crate::walrecord::NeonWalRecord; use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -580,11 +582,10 @@ impl CloseFileDescriptors for C { /// struct PostgresRedoProcess { tenant_id: TenantId, - child: Child, + child: NoLeakChild, stdin: ChildStdin, stdout: ChildStdout, stderr: ChildStderr, - called_kill: bool, } impl PostgresRedoProcess { @@ -656,7 +657,7 @@ impl PostgresRedoProcess { } // Start postgres itself - let mut child = Command::new(pg_bin_dir_path.join("postgres")) + let child = Command::new(pg_bin_dir_path.join("postgres")) .arg("--wal-redo") .stdin(Stdio::piped()) .stderr(Stdio::piped()) @@ -675,7 +676,7 @@ impl PostgresRedoProcess { // as close-on-exec by default, but that's not enough, since we use // libraries that directly call libc open without setting that flag. 
.close_fds() - .spawn() + .spawn_no_leak_child() .map_err(|e| { Error::new( e.kind(), @@ -683,11 +684,10 @@ impl PostgresRedoProcess { ) })?; - info!( - pid = child.id(), - "launched WAL redo postgres process on {}", - datadir.display() - ); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(); + }); let stdin = child.stdin.take().unwrap(); let stdout = child.stdout.take().unwrap(); @@ -706,48 +706,21 @@ impl PostgresRedoProcess { set_nonblock_or_log_err!(stdout)?; set_nonblock_or_log_err!(stderr)?; + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + Ok(PostgresRedoProcess { tenant_id, child, stdin, stdout, stderr, - called_kill: false, }) } #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] - fn kill(mut self) { - info!("killing wal-redo-postgres process"); - self.called_kill = true; - - let res = self.child.kill(); - if let Err(e) = res { - // This branch is very unlikely because: - // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it. - // - This is the only place that calls .kill() - // - We consume `self`, so, .kill() can't be called twice. - // - If the process exited by itself or was killed by someone else, - // .kill() will still succeed because we haven't wait()'ed yet. - // - // So, if we arrive here, we have really no idea what happened, - // whether the PID stored in self.child is still valid, etc. - // If this function were fallible, we'd return an error, but - // since it isn't, all we can do is log an error and proceed - // with the wait(). - error!(error = %e, "failed to SIGKILL wal-redo-postgres; subsequent wait() might fail or wait for wrong process"); - } - - match self.child.wait() { - Ok(exit_status) => { - // log at error level since .kill() is something we only do on errors ATM - error!(exit_status = %exit_status, "wal-redo-postgres wait successful"); - } - Err(e) => { - error!(error = %e, "wal-redo-postgres wait error; might leak the child process; it will show as zombie (defunct)"); - } - } - drop(self); + fn kill(self) { + self.child.kill_and_wait(); } // @@ -880,11 +853,96 @@ impl PostgresRedoProcess { } } -impl Drop for PostgresRedoProcess { - fn drop(&mut self) { - if !self.called_kill { - error!(tenant_id=%self.tenant_id, pid = %self.child.id(), "dropping PostgresRedoProcess that wasn't killed, likely causing defunct postgres process"); +/// Wrapper type around `std::process::Child` which guarantees that the child +/// will be killed and waited-for by this process before being dropped. 
+struct NoLeakChild { + child: Option, +} + +impl Deref for NoLeakChild { + type Target = Child; + + fn deref(&self) -> &Self::Target { + self.child.as_ref().expect("must not use from drop") + } +} + +impl DerefMut for NoLeakChild { + fn deref_mut(&mut self) -> &mut Self::Target { + self.child.as_mut().expect("must not use from drop") + } +} + +impl NoLeakChild { + fn spawn(command: &mut Command) -> io::Result { + let child = command.spawn()?; + Ok(NoLeakChild { child: Some(child) }) + } + + fn kill_and_wait(mut self) { + let child = match self.child.take() { + Some(child) => child, + None => return, + }; + Self::kill_and_wait_impl(child); + } + + #[instrument(skip_all, fields(pid=child.id()))] + fn kill_and_wait_impl(mut child: Child) { + let res = child.kill(); + if let Err(e) = res { + // This branch is very unlikely because: + // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it. + // - This is the only place that calls .kill() + // - We consume `self`, so, .kill() can't be called twice. + // - If the process exited by itself or was killed by someone else, + // .kill() will still succeed because we haven't wait()'ed yet. + // + // So, if we arrive here, we have really no idea what happened, + // whether the PID stored in self.child is still valid, etc. + // If this function were fallible, we'd return an error, but + // since it isn't, all we can do is log an error and proceed + // with the wait(). + error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process"); } + + match child.wait() { + Ok(exit_status) => { + // log at error level since .kill() is something we only do on errors ATM + error!(exit_status = %exit_status, "wait successful"); + } + Err(e) => { + error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)"); + } + } + } +} + +impl Drop for NoLeakChild { + fn drop(&mut self) { + let child = match self.child.take() { + Some(child) => child, + None => return, + }; + // Offload the kill+wait of the child process into the background. + // If someone stops the runtime, we'll leak the child process. + // We can ignore that case because we only stop the runtime on pageserver exit. + BACKGROUND_RUNTIME.spawn(async move { + tokio::task::spawn_blocking(move || { + Self::kill_and_wait_impl(child); + }) + .await + }); + } +} + +trait NoLeakChildCommandExt { + fn spawn_no_leak_child(&mut self) -> io::Result; +} + +impl NoLeakChildCommandExt for Command { + fn spawn_no_leak_child(&mut self) -> io::Result { + NoLeakChild::spawn(self) } } diff --git a/poetry.lock b/poetry.lock index 551b267a87..bc1b57bc64 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1077,6 +1077,17 @@ python-versions = ">=3.6" [package.extras] twisted = ["twisted"] +[[package]] +name = "psutil" +version = "5.9.4" +description = "Cross-platform lib for process and system monitoring in Python." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.extras] +test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] + [[package]] name = "psycopg2-binary" version = "2.9.3" @@ -1228,8 +1239,8 @@ python-versions = ">=3.6" [package.dependencies] pytest = [ - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, ] [[package]] @@ -1436,6 +1447,14 @@ category = "dev" optional = false python-versions = ">=3.7" +[[package]] +name = "types-psutil" +version = "5.9.5.4" +description = "Typing stubs for psutil" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "types-psycopg2" version = "2.9.18" @@ -1555,7 +1574,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "ebe16714bd4db1f34f005c9b72392f165618b020a2d0948cae20e0e8894c5517" +content-hash = "c95c184fccaf40815405ad616ec1c55869c7f87b72777cc3a9cbaff41de98977" [metadata.files] aiopg = [ @@ -1966,9 +1985,26 @@ prometheus-client = [ {file = "prometheus_client-0.14.1-py3-none-any.whl", hash = "sha256:522fded625282822a89e2773452f42df14b5a8e84a86433e3f8a189c1d54dc01"}, {file = "prometheus_client-0.14.1.tar.gz", hash = "sha256:5459c427624961076277fdc6dc50540e2bacb98eebde99886e59ec55ed92093a"}, ] +psutil = [ + {file = "psutil-5.9.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c1ca331af862803a42677c120aff8a814a804e09832f166f226bfd22b56feee8"}, + {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:68908971daf802203f3d37e78d3f8831b6d1014864d7a85937941bb35f09aefe"}, + {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:3ff89f9b835100a825b14c2808a106b6fdcc4b15483141482a12c725e7f78549"}, + {file = "psutil-5.9.4-cp27-cp27m-win32.whl", hash = "sha256:852dd5d9f8a47169fe62fd4a971aa07859476c2ba22c2254d4a1baa4e10b95ad"}, + {file = "psutil-5.9.4-cp27-cp27m-win_amd64.whl", hash = "sha256:9120cd39dca5c5e1c54b59a41d205023d436799b1c8c4d3ff71af18535728e94"}, + {file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6b92c532979bafc2df23ddc785ed116fced1f492ad90a6830cf24f4d1ea27d24"}, + {file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:efeae04f9516907be44904cc7ce08defb6b665128992a56957abc9b61dca94b7"}, + {file = "psutil-5.9.4-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:54d5b184728298f2ca8567bf83c422b706200bcbbfafdc06718264f9393cfeb7"}, + {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:16653106f3b59386ffe10e0bad3bb6299e169d5327d3f187614b1cb8f24cf2e1"}, + {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54c0d3d8e0078b7666984e11b12b88af2db11d11249a8ac8920dd5ef68a66e08"}, + {file = "psutil-5.9.4-cp36-abi3-win32.whl", hash = "sha256:149555f59a69b33f056ba1c4eb22bb7bf24332ce631c44a319cec09f876aaeff"}, + {file = "psutil-5.9.4-cp36-abi3-win_amd64.whl", hash = "sha256:fd8522436a6ada7b4aad6638662966de0d61d241cb821239b2ae7013d41a43d4"}, + {file = "psutil-5.9.4-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6001c809253a29599bc0dfd5179d9f8a5779f9dffea1da0f13c53ee568115e1e"}, + {file = "psutil-5.9.4.tar.gz", hash = "sha256:3d7f9739eb435d4b1338944abe23f49584bde5395f27487d2ee25ad9a8774a62"}, +] 
psycopg2-binary = [ {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, @@ -2002,6 +2038,7 @@ psycopg2-binary = [ {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, @@ -2013,6 +2050,7 @@ psycopg2-binary = [ {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, @@ -2029,18 +2067,7 @@ py = [ {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] pyasn1 
= [ - {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, - {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, - {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, - {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, - {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, - {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, - {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, - {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, - {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, - {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] pycodestyle = [ @@ -2146,6 +2173,13 @@ pyyaml = [ {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, + {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, + {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, @@ -2213,6 +2247,10 @@ tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +types-psutil = [ + {file = "types-psutil-5.9.5.4.tar.gz", hash = "sha256:aa09102b80c65a3b4573216614372398dab78972d650488eaff1ff05482cc18f"}, + {file = "types_psutil-5.9.5.4-py3-none-any.whl", hash = "sha256:28e59764630187e462d43788efa16d59d5e77b510115f9e25901b2d4007fca62"}, +] types-psycopg2 = [ {file = "types-psycopg2-2.9.18.tar.gz", hash = "sha256:9b0e9e1f097b15cd9fa8aad2596a9e3082fd72f8d9cfe52b190cfa709105b6c0"}, {file = "types_psycopg2-2.9.18-py3-none-any.whl", hash = "sha256:14c779dcab18c31453fa1cad3cf4b1601d33540a344adead3c47a6b8091cd2fa"}, diff --git a/pyproject.toml b/pyproject.toml index c2e2e2393b..b13acece18 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,8 @@ pytest-order = "^1.0.1" allure-pytest = "^2.10.0" pytest-asyncio = "^0.19.0" toml = "^0.10.2" +psutil = "^5.9.4" +types-psutil = "^5.9.5.4" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py new file mode 100644 index 0000000000..c79aea35da --- /dev/null +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -0,0 +1,104 @@ +import time + +import psutil +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException +from fixtures.types import TenantId + + +def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_present=False): + children = psutil.Process(pageserver_pid).children() + for child in children: + if not wal_redo_present: + assert "--wal-redo" not in child.cmdline() + if not defunct_present: + assert child.status() != psutil.STATUS_ZOMBIE + + +# Check that the pageserver doesn't leave behind WAL redo processes +# when a tenant is detached. We had an issue previously where we failed +# to wait and consume the exit code of the WAL redo process, leaving it behind +# as a zombie process. +def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + pagserver_pid = int((env.repo_dir / "pageserver.pid").read_text()) + + assert_child_processes(pagserver_pid, wal_redo_present=False, defunct_present=False) + + # first check for non existing tenant + tenant_id = TenantId.generate() + with pytest.raises( + expected_exception=PageserverApiException, + match=f"Tenant not found for id {tenant_id}", + ): + pageserver_http.tenant_detach(tenant_id) + + # create new nenant + tenant_id, _ = env.neon_cli.create_tenant() + + # assert tenant exists on disk + assert (env.repo_dir / "tenants" / str(tenant_id)).exists() + + pg = env.postgres.create_start("main", tenant_id=tenant_id) + + pg_conn = pg.connect() + cur = pg_conn.cursor() + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. 
+ cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.info(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + cur.execute("SELECT count(*) FROM foo") + assert cur.fetchone() == (100000,) + + # After filling the table and doing the SELECT, it is guaranteed that we did some WAL redo. + # So, assert that the WAL redo process is present. + # XXX this is quite brittle as the lifecycle of the WAL redo process is an implementation detail + assert_child_processes(pagserver_pid, wal_redo_present=True, defunct_present=False) + + last_error = None + for i in range(3): + try: + pageserver_http.tenant_detach(tenant_id) + except Exception as e: + last_error = e + log.error(f"try {i} error detaching tenant: {e}") + continue + else: + break + # else is called if the loop finished without reaching "break" + else: + pytest.fail(f"could not detach tenant: {last_error}") + + # check that nothing is left on disk for deleted tenant + assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() + + # Pageserver schedules kill+wait of the WAL redo process to the background runtime, + # asynchronously to tenant detach. Cut it some slack to complete kill+wait before + # checking. + time.sleep(1.0) + assert_child_processes(pagserver_pid, wal_redo_present=False, defunct_present=False) From 175779c0efe8a0e28032d27f21d8209540c1d21d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 10 Nov 2022 12:51:47 +0000 Subject: [PATCH 1009/1022] GitHub Actions: fix non-parallel benchmarks on CI (#2787) Fix non-parallel pytest run by setting `--dist=loadgroup` only for pytest command with xdist enabled (`-n` is set) --- .github/actions/run-python-test-set/action.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 97783df444..0b880c7306 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -123,7 +123,12 @@ runs: exit 1 fi if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then + # -n4 uses four processes to run tests via pytest-xdist EXTRA_PARAMS="-n4 $EXTRA_PARAMS" + + # --dist=loadgroup points tests marked with @pytest.mark.xdist_group + # to the same worker to make @pytest.mark.order work with xdist + EXTRA_PARAMS="--dist=loadgroup $EXTRA_PARAMS" fi if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then @@ -158,11 +163,8 @@ runs: # --verbose prints name of each test (helpful when there are # multiple tests in one file) # -rA prints summary in the end - # -n4 uses four processes to run tests via pytest-xdist # -s is not used to prevent pytest from capturing output, because tests are running # in parallel and logs are mixed between different tests - # --dist=loadgroup points tests marked with @pytest.mark.xdist_group to the same worker, - # to make @pytest.mark.order work with xdist # mkdir -p $TEST_OUTPUT/allure/results "${cov_prefix[@]}" ./scripts/pytest \ @@ -170,7 +172,6 @@ runs: --alluredir=$TEST_OUTPUT/allure/results \ --tb=short \ --verbose \ - --dist=loadgroup \ -rA 
$TEST_SELECTION $EXTRA_PARAMS if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then From c6072d38c2d3ee16175df10281d08712fe8e52df Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 10 Nov 2022 16:49:00 +0100 Subject: [PATCH 1010/1022] Remove debug logs in should_walsender_stop (#2791) --- safekeeper/src/timeline.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index a3f0ff94ee..132a926203 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -556,10 +556,6 @@ impl Timeline { .pageserver_feedback .map(|f| Lsn(f.ps_applylsn)) .unwrap_or(Lsn::INVALID); - info!( - "checking should ws stop ttid {} lsn {} rcl {}", - self.ttid, reported_remote_consistent_lsn, shared_state.sk.inmem.commit_lsn - ); let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet (reported_remote_consistent_lsn!= Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); From d5b7832c215a3831eb55e581b41b52a9ccbaa225 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 10 Nov 2022 16:15:04 +0000 Subject: [PATCH 1011/1022] Fix test_wal_backpressure tests (#2792) Fix expected return type for `fetchone `: ``` AssertionError: assert False + where False = isinstance((Decimal('56048'), '55 kB', '0/1CF52D8', '0/1CE77E8'), list) ``` --- test_runner/performance/test_wal_backpressure.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index cb35cad46b..dd840acd25 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -216,7 +216,7 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte ) res = cur.fetchone() - assert isinstance(res, list) + assert isinstance(res, tuple) lsn_write_lags.append(res[0]) curr_received_lsn = Lsn(res[3]) From 84212181524e84ce40d46c8209baf68cb121ec8b Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Mon, 7 Nov 2022 13:56:00 +0100 Subject: [PATCH 1012/1022] Change the branch name for V14 as it does for V15 --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 23765194c1..081a404135 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,7 @@ [submodule "vendor/postgres-v14"] path = vendor/postgres-v14 url = https://github.com/neondatabase/postgres.git - branch = main + branch = REL_14_STABLE_neon [submodule "vendor/postgres-v15"] path = vendor/postgres-v15 url = https://github.com/neondatabase/postgres.git From 7edc098c40b1c0d613e88b92d06817d79f0c7675 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 10 Nov 2022 16:05:57 -0500 Subject: [PATCH 1013/1022] Add perf test instructions (#2777) --- test_runner/performance/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 21e48cf899..725612853a 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -1,3 +1,22 @@ +# Running locally + +First make a release build. The profiling flag is optional, used only for tests that +generate flame graphs. The `-s` flag just silences a lot of output, and makes it +easier to see if you have compile errors without scrolling up. 
+`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing,profiling" make -s -j8` + +NOTE: the `profiling` flag only works on linux because we use linux-specific +libc APIs like `libc::timer_t`. + +Then run the tests +`NEON_BIN=./target/release poetry run pytest test_runner/performance"` + +Some handy pytest flags for local development: +- `-x` tells pytest to stop on first error +- `-s` shows test output +- `-k` selects a test to run +- `--timeout=0` disables our default timeout of 300s (see `setup.cfg`) + # What performance tests do we have and how we run them Performance tests are built using the same infrastructure as our usual python integration tests. There are some extra fixtures that help to collect performance metrics, and to run tests against both vanilla PostgreSQL and Neon for comparison. From 7fd88fab597370d195dd6471a3414a862af14aa2 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 10 Nov 2022 16:43:04 -0500 Subject: [PATCH 1014/1022] Trace read requests (#2762) --- control_plane/src/pageserver.rs | 10 ++++++ libs/pageserver_api/src/models.rs | 3 ++ libs/utils/src/id.rs | 11 +++++++ pageserver/src/config.rs | 17 ++++++++++ pageserver/src/http/routes.rs | 7 +++++ pageserver/src/lib.rs | 1 + pageserver/src/page_service.rs | 19 ++++++++++++ pageserver/src/tenant.rs | 8 +++++ pageserver/src/tenant_config.rs | 7 +++++ pageserver/src/trace.rs | 36 ++++++++++++++++++++++ test_runner/performance/test_read_trace.py | 31 +++++++++++++++++++ 11 files changed, 150 insertions(+) create mode 100644 pageserver/src/trace.rs create mode 100644 test_runner/performance/test_read_trace.py diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index fa6d1e496a..18d6aee68d 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -362,6 +362,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, + trace_read_requests: settings + .remove("trace_read_requests") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'trace_read_requests' as bool")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -424,6 +429,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, + trace_read_requests: settings + .get("trace_read_requests") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'trace_read_requests' as bool")?, }) .send()? 
.error_from_body()?; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index e5bd46f260..af9be2d456 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -73,6 +73,7 @@ pub struct TenantCreateRequest { pub walreceiver_connect_timeout: Option, pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, + pub trace_read_requests: Option, } #[serde_as] @@ -112,6 +113,7 @@ pub struct TenantConfigRequest { pub walreceiver_connect_timeout: Option, pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, + pub trace_read_requests: Option, } impl TenantConfigRequest { @@ -130,6 +132,7 @@ impl TenantConfigRequest { walreceiver_connect_timeout: None, lagging_wal_timeout: None, max_lsn_wal_lag: None, + trace_read_requests: None, } } } diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index f245f7c3d4..7ce324614d 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -204,6 +204,17 @@ pub struct TenantId(Id); id_newtype!(TenantId); +/// Neon Connection Id identifies long-lived connections (for example a pagestream +/// connection with the page_service). Is used for better logging and tracing +/// +/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look +/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. +/// See [`Id`] for alternative ways to serialize it. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] +pub struct ConnectionId(Id); + +id_newtype!(ConnectionId); + // A pair uniquely identifying Neon instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct TenantTimelineId { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 747e63af2b..f40b608da1 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -8,6 +8,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use remote_storage::RemoteStorageConfig; use std::env; use utils::crashsafe::path_with_suffix_extension; +use utils::id::ConnectionId; use std::num::NonZeroUsize; use std::path::{Path, PathBuf}; @@ -414,6 +415,22 @@ impl PageServerConf { ) } + pub fn traces_path(&self) -> PathBuf { + self.workdir.join("traces") + } + + pub fn trace_path( + &self, + tenant_id: &TenantId, + timeline_id: &TimelineId, + connection_id: &ConnectionId, + ) -> PathBuf { + self.traces_path() + .join(tenant_id.to_string()) + .join(timeline_id.to_string()) + .join(connection_id.to_string()) + } + /// Points to a place in pageserver's local directory, /// where certain timeline's metadata file should be located. 
pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 14ea054577..db581efc7d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -618,6 +618,7 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result) -> Result bool { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .trace_read_requests + .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) + } + pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) { self.tenant_conf.write().unwrap().update(&new_tenant_conf); } @@ -1666,6 +1673,7 @@ pub mod harness { walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), + trace_read_requests: Some(tenant_conf.trace_read_requests), } } } diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index dc1b9353a6..dd3792450d 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -82,6 +82,7 @@ pub struct TenantConf { /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, /// to avoid eager reconnects. pub max_lsn_wal_lag: NonZeroU64, + pub trace_read_requests: bool, } /// Same as TenantConf, but this struct preserves the information about @@ -105,6 +106,7 @@ pub struct TenantConfOpt { #[serde(with = "humantime_serde")] pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, + pub trace_read_requests: Option, } impl TenantConfOpt { @@ -138,6 +140,9 @@ impl TenantConfOpt { .lagging_wal_timeout .unwrap_or(global_conf.lagging_wal_timeout), max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag), + trace_read_requests: self + .trace_read_requests + .unwrap_or(global_conf.trace_read_requests), } } @@ -207,6 +212,7 @@ impl TenantConf { .expect("cannot parse default walreceiver lagging wal timeout"), max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) .expect("cannot parse default max walreceiver Lsn wal lag"), + trace_read_requests: false, } } @@ -232,6 +238,7 @@ impl TenantConf { .unwrap(), max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) .unwrap(), + trace_read_requests: false, } } } diff --git a/pageserver/src/trace.rs b/pageserver/src/trace.rs new file mode 100644 index 0000000000..9e466dd9b0 --- /dev/null +++ b/pageserver/src/trace.rs @@ -0,0 +1,36 @@ +use bytes::Bytes; +use std::{ + fs::{create_dir_all, File}, + io::{BufWriter, Write}, + path::PathBuf, +}; + +pub struct Tracer { + writer: BufWriter, +} + +impl Drop for Tracer { + fn drop(&mut self) { + self.flush() + } +} + +impl Tracer { + pub fn new(path: PathBuf) -> Self { + let parent = path.parent().expect("failed to parse parent path"); + create_dir_all(parent).expect("failed to create trace dir"); + + let file = File::create(path).expect("failed to create trace file"); + Tracer { + writer: BufWriter::new(file), + } + } + + pub fn trace(&mut self, msg: &Bytes) { + self.writer.write_all(msg).expect("failed to write trace"); + } + + pub fn flush(&mut self) { + self.writer.flush().expect("failed to flush trace file"); + } +} diff --git a/test_runner/performance/test_read_trace.py b/test_runner/performance/test_read_trace.py new file mode 100644 index 0000000000..a5bd0b8de6 --- /dev/null +++ b/test_runner/performance/test_read_trace.py @@ -0,0 
+1,31 @@ +from contextlib import closing + +from fixtures.neon_fixtures import NeonEnvBuilder + + +# This test demonstrates how to collect a read trace. It's useful until +# it gets replaced by a test that actually does stuff with the trace. +def test_read_request_tracing(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant, _ = env.neon_cli.create_tenant( + conf={ + "trace_read_requests": "true", + } + ) + + timeline = env.neon_cli.create_timeline("test_trace_replay", tenant_id=tenant) + pg = env.postgres.create_start("test_trace_replay", "main", tenant) + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("create table t (i integer);") + cur.execute(f"insert into t values (generate_series(1,{10000}));") + cur.execute("select count(*) from t;") + + # Stop pg so we drop the connection and flush the traces + pg.stop() + + trace_path = env.repo_dir / "traces" / str(tenant) / str(timeline) + assert trace_path.exists() From 03695261fccc0311591c7027bfa8062327b557e8 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 11 Nov 2022 19:42:26 +0200 Subject: [PATCH 1015/1022] Test storage Docker images (#2767) Closes https://github.com/neondatabase/neon/issues/2697 Example: https://github.com/neondatabase/neon/actions/runs/3416774593/jobs/5688394855 Adds a set of tests on the storage Docker images before they are pushed to the public registries: * tests that pageserver binary has the correct version string (other binaries are built with the same library, so it should be enough to test one) * tests that the compose file set-up works and all components are able to start and perform a single SQL query (CREATE TABLE) --- .github/workflows/build_and_test.yml | 46 +++++++++++++- docker-compose/compute_wrapper/Dockerfile | 13 ++++ .../shell/compute.sh | 0 .../var/db/postgres/specs/spec.json | 0 docker-compose/docker-compose.yml | 29 +++++---- docker-compose/docker_compose_test.sh | 60 +++++++++++++++++++ docker-compose/image/compute/Dockerfile | 10 ---- scripts/docker-compose_test.sh | 51 ---------------- 8 files changed, 136 insertions(+), 73 deletions(-) create mode 100644 docker-compose/compute_wrapper/Dockerfile rename docker-compose/{compute => compute_wrapper}/shell/compute.sh (100%) rename docker-compose/{compute => compute_wrapper}/var/db/postgres/specs/spec.json (100%) create mode 100755 docker-compose/docker_compose_test.sh delete mode 100644 docker-compose/image/compute/Dockerfile delete mode 100755 scripts/docker-compose_test.sh diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b598949f2b..e6014ecb84 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -529,7 +529,6 @@ jobs: - name: Kaniko build compute node with extensions v14 run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} - compute-node-image-v15: runs-on: dev container: gcr.io/kaniko-project/executor:v1.9.0-debug @@ -547,9 +546,52 @@ jobs: - name: Kaniko build compute node with extensions v15 run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} + test-images: + needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ] + runs-on: dev + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. + # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. + # Regular pageserver version string looks like + # Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: [] + # Bad versions might loop like: + # Neon page server git-env:local failpoints: true, features: ["testing"] + # Ensure that we don't have bad versions. + - name: Verify image versions + shell: bash # ensure no set -e for better error messages + run: | + pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") + + echo "Pageserver version string: $pageserver_version" + + if ! echo "$pageserver_version" | grep -qv 'git-env:local' ; then + echo "Pageserver version should not be the default Dockerfile one" + exit 1 + fi + + if ! echo "$pageserver_version" | grep -qv '"testing"' ; then + echo "Pageserver version should have no testing feature enabled" + exit 1 + fi + + - name: Verify docker-compose example + run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh + + - name: Print logs and clean up + if: always() + run: | + docker compose -f ./docker-compose/docker-compose.yml logs || 0 + docker compose -f ./docker-compose/docker-compose.yml down + promote-images: runs-on: dev - needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ] + needs: [ tag, test-images ] if: github.event_name != 'workflow_dispatch' container: amazon/aws-cli strategy: diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile new file mode 100644 index 0000000000..f1b1986072 --- /dev/null +++ b/docker-compose/compute_wrapper/Dockerfile @@ -0,0 +1,13 @@ +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG COMPUTE_IMAGE=compute-node-v14 +ARG TAG=latest + +FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG + +USER root +RUN apt-get update && \ + apt-get install -y curl \ + jq \ + netcat + +USER postgres diff --git a/docker-compose/compute/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh similarity index 100% rename from docker-compose/compute/shell/compute.sh rename to docker-compose/compute_wrapper/shell/compute.sh diff --git a/docker-compose/compute/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json similarity index 100% rename from docker-compose/compute/var/db/postgres/specs/spec.json rename to docker-compose/compute_wrapper/var/db/postgres/specs/spec.json diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 9ab775c3f9..61b53dba41 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -2,6 +2,7 @@ version: '3' services: etcd: + restart: always image: 
quay.io/coreos/etcd:v3.5.4 ports: - 2379:2379 @@ -9,7 +10,7 @@ services: environment: # This signifficantly speeds up etcd and we anyway don't data persistency there. ETCD_UNSAFE_NO_FSYNC: "1" - command: + command: - "etcd" - "--auto-compaction-mode=revision" - "--auto-compaction-retention=1" @@ -24,6 +25,7 @@ services: - "--quota-backend-bytes=134217728" # 128 MB minio: + restart: always image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z ports: - 9000:9000 @@ -41,7 +43,7 @@ services: entrypoint: - "/bin/sh" - "-c" - command: + command: - "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do echo 'Waiting to start minio...' && sleep 1; done; @@ -51,7 +53,8 @@ services: - minio pageserver: - image: neondatabase/neon:${TAG:-latest} + restart: always + image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} environment: - BROKER_ENDPOINT='http://etcd:2379' - AWS_ACCESS_KEY_ID=minio @@ -77,7 +80,8 @@ services: - minio_create_buckets safekeeper1: - image: neondatabase/neon:${TAG:-latest} + restart: always + image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454 - SAFEKEEPER_ID=1 @@ -106,7 +110,8 @@ services: - minio_create_buckets safekeeper2: - image: neondatabase/neon:${TAG:-latest} + restart: always + image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454 - SAFEKEEPER_ID=2 @@ -135,7 +140,8 @@ services: - minio_create_buckets safekeeper3: - image: neondatabase/neon:${TAG:-latest} + restart: always + image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454 - SAFEKEEPER_ID=3 @@ -164,18 +170,21 @@ services: - minio_create_buckets compute: + restart: always build: - context: ./image/compute + context: ./compute_wrapper/ args: - - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest} + - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14} + - TAG=${TAG:-latest} - http_proxy=$http_proxy - https_proxy=$https_proxy environment: - PG_VERSION=${PG_VERSION:-14} #- RUST_BACKTRACE=1 + # Mount the test files directly, for faster editing cycle. volumes: - - ./compute/var/db/postgres/specs/:/var/db/postgres/specs/ - - ./compute/shell/:/shell/ + - ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/ + - ./compute_wrapper/shell/:/shell/ ports: - 55433:55433 # pg protocol handler - 3080:3080 # http endpoints diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh new file mode 100755 index 0000000000..9de5277bf1 --- /dev/null +++ b/docker-compose/docker_compose_test.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# A basic test to ensure Docker images are built correctly. +# Build a wrapper around the compute, start all services and runs a simple SQL query. +# Repeats the process for all currenly supported Postgres versions. + +# Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file +# Their defaults point at DockerHub `neondatabase/neon:latest` image.`, +# to verify custom image builds (e.g pre-published ones). + +# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer. 
+ +set -eux -o pipefail + +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml + +COMPUTE_CONTAINER_NAME=docker-compose-compute-1 +SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;" +PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres" + +cleanup() { + echo "show container information" + docker ps + docker compose -f $COMPOSE_FILE logs + echo "stop containers..." + docker compose -f $COMPOSE_FILE down +} + +echo "clean up containers if exists" +cleanup + +for pg_version in 14 15; do + echo "start containers (pg_version=$pg_version)." + PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d + + echo "wait until the compute is ready. timeout after 60s. " + cnt=0 + while sleep 1; do + # check timeout + cnt=`expr $cnt + 1` + if [ $cnt -gt 60 ]; then + echo "timeout before the compute is ready." + cleanup + exit 1 + fi + + # check if the compute is ready + set +o pipefail + result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l` + set -o pipefail + if [ $result -eq 1 ]; then + echo "OK. The compute is ready to connect." + echo "execute simple queries." + docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION" + cleanup + break + fi + done +done diff --git a/docker-compose/image/compute/Dockerfile b/docker-compose/image/compute/Dockerfile deleted file mode 100644 index 1b9d8c4900..0000000000 --- a/docker-compose/image/compute/Dockerfile +++ /dev/null @@ -1,10 +0,0 @@ -ARG COMPUTE_IMAGE=compute-node-v14:latest -FROM neondatabase/${COMPUTE_IMAGE} - -USER root -RUN apt-get update && \ - apt-get install -y curl \ - jq \ - netcat - -USER postgres diff --git a/scripts/docker-compose_test.sh b/scripts/docker-compose_test.sh deleted file mode 100755 index b4551365f8..0000000000 --- a/scripts/docker-compose_test.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash - -# this is a shortcut script to avoid duplication in CI -set -eux -o pipefail - -SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -COMPOSE_FILE=$SCRIPT_DIR/../docker-compose/docker-compose.yml - -COMPUTE_CONTAINER_NAME=dockercompose_compute_1 -SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;" -PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres" - -cleanup() { - echo "show container information" - docker ps - docker-compose -f $COMPOSE_FILE logs - echo "stop containers..." - docker-compose -f $COMPOSE_FILE down -} - -echo "clean up containers if exists" -cleanup - -for pg_version in 14 15; do - echo "start containers (pg_version=$pg_version)." - PG_VERSION=$pg_version TAG=latest docker-compose -f $COMPOSE_FILE up --build -d - - echo "wait until the compute is ready. timeout after 60s. " - cnt=0 - while sleep 1; do - # check timeout - cnt=`expr $cnt + 1` - if [ $cnt -gt 60 ]; then - echo "timeout before the compute is ready." - cleanup - exit 1 - fi - - # check if the compute is ready - set +o pipefail - result=`docker-compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l` - set -o pipefail - if [ $result -eq 1 ]; then - echo "OK. The compute is ready to connect." - echo "execute simple queries." 
- docker exec -it $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION" - cleanup - break - fi - done -done From 4131a6efae2a9534cc038404fa536105a4f9a7d6 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Nov 2022 11:49:17 +0200 Subject: [PATCH 1016/1022] Remove unused Dockerfile.compute-node.legacy. The cloud end-to-end tests use the docker images built by the neon PR now, and don't need this legacy Dockerfile anymore. --- Dockerfile.compute-node.legacy | 88 ---------------------------------- 1 file changed, 88 deletions(-) delete mode 100644 Dockerfile.compute-node.legacy diff --git a/Dockerfile.compute-node.legacy b/Dockerfile.compute-node.legacy deleted file mode 100644 index 6653d81019..0000000000 --- a/Dockerfile.compute-node.legacy +++ /dev/null @@ -1,88 +0,0 @@ -# -# Legacy version of the Dockerfile for the compute node. -# Used by e2e CI. Building Dockerfile.compute-node will take -# unreasonable ammount of time without v2 runners. -# -# TODO: remove once cloud repo CI is moved to v2 runners. -# - - -# Allow specifiyng different compute-tools tag and image repo, so we are -# able to use different images -ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com -ARG IMAGE=compute-tools -ARG TAG=latest - -# -# Image with pre-built tools -# -FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps -# Only to get ready compute_ctl binary as deppendency - -# -# Image with Postgres build deps -# -FROM debian:bullseye-slim AS build-deps - -RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev - -# -# Image with built Postgres -# -FROM build-deps AS pg-build - -# Add user postgres -RUN adduser postgres -RUN mkdir /pg && chown postgres:postgres /pg - -# Copy source files -# version 14 is default for now -COPY ./vendor/postgres-v14 /pg/ -COPY ./pgxn /pg/ - -# Build and install Postgres locally -RUN mkdir /pg/compute_build && cd /pg/compute_build && \ - ../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \ - # Install main binaries and contribs - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ - # Install headers - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install - -# Install neon contrib -RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install - -USER postgres -WORKDIR /pg - -# -# Final compute node image to be exported -# -FROM debian:bullseye-slim - -# libreadline-dev is required to run psql -RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev - -# Add user postgres -RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ - echo "postgres:test_console_pass" | chpasswd && \ - mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ - chown -R postgres:postgres /var/db/postgres && \ - chmod 0750 /var/db/postgres/compute - -# Copy ready Postgres binaries -COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local - -# Copy binaries from compute-tools -COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl - -# XXX: temporary symlink for compatibility with old control-plane -RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl - -# Add postgres shared objects to the search path -RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig - -USER postgres - 
-ENTRYPOINT ["/usr/local/bin/compute_ctl"] From dbe5b52494279e2f5e4885bef27474dbef884d44 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Nov 2022 14:05:13 +0200 Subject: [PATCH 1017/1022] Avoid some vector-growing overhead. I saw this in 'perf' profile of a sequential scan: > - 31.93% 0.21% compute request pageserver [.] ::request_redo > - 31.72% ::request_redo > - 31.26% pageserver::walredo::PostgresRedoManager::apply_batch_postgres > + 7.64% ::write > + 6.17% nix::poll::poll > + 3.58% ::read > + 2.96% std::sync::condvar::Condvar::notify_one > + 2.48% std::sys::unix::locks::futex::Condvar::wait > + 2.19% alloc::raw_vec::RawVec::reserve::do_reserve_and_handle > + 1.14% std::sys::unix::locks::futex::Mutex::lock_contended > 0.67% __rust_alloc_zeroed > 0.62% __stpcpy_ssse3 > 0.56% std::sys::unix::locks::futex::Mutex::wake Note the 'do_reserve_handle' overhead. That's caused by having to grow the buffer used to construct the WAL redo request. This commit eliminates that overhead. It's only about 2% of the overall CPU usage, but every little helps. Also reuse the temp buffer when reading records from a DeltaLayer, and call Vec::reserve to avoid growing a buffer when reading a blob across pages. I saw a reduction from 2% to 1% of CPU spent in do_reserve_and_handle in that codepath, but that's such a small change that it could be just noise. Seems like it shouldn't hurt though. --- pageserver/src/tenant/blob_io.rs | 1 + pageserver/src/tenant/delta_layer.rs | 3 ++- pageserver/src/walredo.rs | 6 +++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 78ecbcb9c1..52eafc72ee 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -74,6 +74,7 @@ where }; dstbuf.clear(); + dstbuf.reserve(len); // Read the payload let mut remain = len; diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index a908d66200..dcd6956640 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -260,8 +260,9 @@ impl Layer for DeltaLayer { // Ok, 'offsets' now contains the offsets of all the entries we need to read let mut cursor = file.block_cursor(); + let mut buf = Vec::new(); for (entry_lsn, pos) in offsets { - let buf = cursor.read_blob(pos).with_context(|| { + cursor.read_blob_into_buf(pos, &mut buf).with_context(|| { format!( "Failed to read blob from virtual file {}", file.file.path.display() diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 59dadbb1d3..f05bf46d96 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -740,7 +740,11 @@ impl PostgresRedoProcess { // This could be problematic if there are millions of records to replay, // but in practice the number of records is usually so small that it doesn't // matter, and it's better to keep this code simple. - let mut writebuf: Vec = Vec::new(); + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. + let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); build_begin_redo_for_block_msg(tag, &mut writebuf); if let Some(img) = base_img { build_push_page_msg(tag, &img, &mut writebuf); From f30ef004399dab22278bb2ca077e544c6def0803 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Nov 2022 11:58:47 +0200 Subject: [PATCH 1018/1022] Stop building the legacy "compute-node" docker image. 
Before we had separate images for v14 and v15, the compute node image was called just "neondatabase/compute-node". It has been superseded by the "neondatabase/compute-node-v14" and "neondatabase/compute-node-v15" images. The old image is not used by the cloud console build or tests anymore. --- .github/workflows/build_and_test.yml | 35 ++-------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e6014ecb84..8a03ee9a0c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -492,26 +492,6 @@ jobs: - name: Kaniko build compute tools run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} - compute-node-image: - runs-on: dev - container: gcr.io/kaniko-project/executor:v1.9.0-debug - needs: [ tag ] - steps: - - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko - with: - submodules: true - fetch-depth: 0 - - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - # compute-node uses postgres 14, which is default now - # cloud repo depends on this image name, thus duplicating it - # remove compute-node when cloud repo is updated - - name: Kaniko build compute node with extensions v14 (compatibility) - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} - compute-node-image-v14: runs-on: dev container: gcr.io/kaniko-project/executor:v1.9.0-debug @@ -547,7 +527,7 @@ jobs: run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} test-images: - needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ] + needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ] runs-on: dev steps: @@ -597,10 +577,7 @@ jobs: strategy: fail-fast: false matrix: - # compute-node uses postgres 14, which is default now - # cloud repo depends on this image name, thus duplicating it - # remove compute-node when cloud repo is updated - name: [ neon, compute-node, compute-node-v14, compute-node-v15, compute-tools ] + name: [ neon, compute-node-v14, compute-node-v15, compute-tools ] steps: - name: Promote image to latest @@ -630,9 +607,6 @@ jobs: - name: Pull compute tools image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools - - name: Pull compute node image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} compute-node - - name: Pull compute node v14 image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14 @@ -649,7 +623,6 @@ jobs: run: | crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest @@ -665,9 +638,6 @@ jobs: - name: Push compute tools image to Docker Hub run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} - - name: Push compute node image to Docker Hub - run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}} - - name: Push compute node v14 image to Docker Hub run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} @@ -684,7 +654,6 @@ jobs: run: | crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest From c11cbf0f5c3010bc82a6fdf430f0a7cb5dbfbaf9 Mon Sep 17 00:00:00 2001 From: andres Date: Sun, 6 Nov 2022 20:25:18 +0100 Subject: [PATCH 1019/1022] fix test_compare_child_and_root_pgbench_perf to do a fair comparison --- test_runner/performance/test_branching.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_branching.py 
b/test_runner/performance/test_branching.py index 562e751458..0fe7306f87 100644 --- a/test_runner/performance/test_branching.py +++ b/test_runner/performance/test_branching.py @@ -4,6 +4,7 @@ from typing import List from fixtures.benchmark_fixture import PgBenchRunResult from fixtures.compare_fixtures import NeonCompare +from fixtures.neon_fixtures import fork_at_current_lsn from performance.test_perf_pgbench import utc_now_timestamp # ----------------------------------------------------------------------- @@ -43,7 +44,8 @@ def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare): pg_root = env.postgres.create_start("root") pg_bin.run_capture(["pgbench", "-i", pg_root.connstr(), "-s10"]) - env.neon_cli.create_branch("child", "root") + fork_at_current_lsn(env, pg_root, "child", "root") + pg_child = env.postgres.create_start("child") run_pgbench_on_branch("root", ["pgbench", "-c10", "-T10", pg_root.connstr()]) From f87017c04db1330941bbf78fbf2ca3b793209371 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 14 Nov 2022 14:44:41 +0200 Subject: [PATCH 1020/1022] Omit dependencies' debug info (#2803) Based on https://neondb.slack.com/archives/C0277TKAJCA/p1668079753506749 Co-authored-by: Arseny Sher --- Cargo.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 32c243bf44..0d73710bbb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,10 @@ members = [ # Besides, debug info should not affect the performance. debug = true +# disable debug symbols for all packages except this one to decrease binaries size +[profile.release.package."*"] +debug = false + [profile.release-line-debug] inherits = "release" debug = 1 # true = 2 = all symbols, 1 = line only From 03190a216150dac6d8ba4876cae5abaae67c4fae Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 15 Nov 2022 10:27:59 +0000 Subject: [PATCH 1021/1022] GitHub Actions: Do not create Allure report for cancelled jobs (#2813) If a workflow is cancelled, do not delay its finishing by creating an allure report. 
--- .github/actions/run-python-test-set/action.yml | 2 +- .github/workflows/benchmarking.yml | 2 +- .github/workflows/build_and_test.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 0b880c7306..990c7e25a9 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -190,7 +190,7 @@ runs: prefix: latest - name: Create Allure report - if: always() + if: success() || failure() uses: ./.github/actions/allure-report with: action: store diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 6091c8d7ff..8477104c30 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -265,7 +265,7 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Create Allure report - if: always() + if: success() || failure() uses: ./.github/actions/allure-report with: action: generate diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8a03ee9a0c..a726cb01ff 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -305,7 +305,7 @@ jobs: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init needs: [ regress-tests, benchmarks ] - if: always() + if: success() || failure() strategy: fail-fast: false matrix: From 01778e37cc4dfdd647d66884c0a295d0a68fa221 Mon Sep 17 00:00:00 2001 From: MMeent Date: Tue, 15 Nov 2022 15:12:38 +0100 Subject: [PATCH 1022/1022] Address issues in the pagestore prefetch mechanism: (#2790) - Update vendored PostgreSQL to address prefetch issues - Make flushed state explicit in PrefetchState - Move flush logic into prefetch_wait_for, where possible - Clean up some prefetch state handling code in the various code elements handling state transitions. - Fix a race condition in neon_read_at_lsn where a hash entry pointer was used after the hash table was updated. This could result in incorrect state transitions and assertion failures after disconnects during prefetch_wait_for in that neon_read_at_lsn. 
Fixes #2780 --- pgxn/neon/libpagestore.c | 16 +- pgxn/neon/pagestore_client.h | 3 + pgxn/neon/pagestore_smgr.c | 311 ++++++++++++++++++++++++++--------- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 5 files changed, 257 insertions(+), 77 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 1e4e18e7d1..d8e9d8b52c 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -52,6 +52,7 @@ char *page_server_connstring_raw; int n_unflushed_requests = 0; int flush_every_n_requests = 8; +int readahead_buffer_size = 128; static void pageserver_flush(void); @@ -449,9 +450,22 @@ pg_init_libpagestore(void) NULL, &flush_every_n_requests, 8, -1, INT_MAX, - PGC_SIGHUP, + PGC_USERSET, 0, /* no flags required */ NULL, NULL, NULL); + DefineCustomIntVariable("neon.readahead_buffer_size", + "number of prefetches to buffer", + "This buffer is used to store prefetched data; so " + "it is important that this buffer is at least as " + "large as the configured value of all tablespaces' " + "effective_io_concurrency and maintenance_io_concurrency, " + "your sessions' values of these, and the value for " + "seqscan_prefetch_buffers.", + &readahead_buffer_size, + 128, 16, 1024, + PGC_USERSET, + 0, /* no flags required */ + NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL); relsize_hash_init(); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index be6c4b3a77..9b8081065c 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -150,6 +150,8 @@ extern void prefetch_on_ps_disconnect(void); extern page_server_api * page_server; extern char *page_server_connstring; +extern int flush_every_n_requests; +extern int readahead_buffer_size; extern bool seqscan_prefetch_enabled; extern int seqscan_prefetch_distance; extern char *neon_timeline; @@ -159,6 +161,7 @@ extern int32 max_cluster_size; extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode); extern void smgr_init_neon(void); +extern void readahead_buffer_resize(int newsize, void *extra); /* Neon storage manager functionality */ diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 59c5ff8db2..d6fa7c46c9 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -116,10 +116,10 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; * * Prefetch is performed locally by each backend. * - * There can be up to READ_BUFFER_SIZE active IO requests registered at any - * time. Requests using smgr_prefetch are sent to the pageserver, but we don't - * wait on the response. Requests using smgr_read are either read from the - * buffer, or (if that's not possible) we wait on the response to arrive - + * There can be up to readahead_buffer_size active IO requests registered at + * any time. Requests using smgr_prefetch are sent to the pageserver, but we + * don't wait on the response. Requests using smgr_read are either read from + * the buffer, or (if that's not possible) we wait on the response to arrive - * this also will allow us to receive other prefetched pages. * Each request is immediately written to the output buffer of the pageserver * connection, but may not be flushed if smgr_prefetch is used: pageserver @@ -136,15 +136,25 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; * the connection; the responses are stored for later use. * * NOTE: The current implementation of the prefetch system implements a ring - * buffer of up to READ_BUFFER_SIZE requests. 
If there are more _read and + * buffer of up to readahead_buffer_size requests. If there are more _read and * _prefetch requests between the initial _prefetch and the _read of a buffer, * the prefetch request will have been dropped from this prefetch buffer, and * your prefetch was wasted. */ -/* Max amount of tracked buffer reads */ -#define READ_BUFFER_SIZE 128 - +/* + * State machine: + * + * not in hash : in hash + * : + * UNUSED ------> REQUESTED --> RECEIVED + * ^ : | | + * | : v | + * | : TAG_UNUSED | + * | : | | + * +----------------+------------+ + * : + */ typedef enum PrefetchStatus { PRFS_UNUSED = 0, /* unused slot */ PRFS_REQUESTED, /* request was written to the sendbuffer to PS, but not @@ -192,7 +202,7 @@ typedef struct PrfHashEntry { * It maintains a (ring) buffer of in-flight requests and responses. * * We maintain several indexes into the ring buffer: - * ring_unused >= ring_receive >= ring_last >= 0 + * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0 * * ring_unused points to the first unused slot of the buffer * ring_receive is the next request that is to be received @@ -208,6 +218,7 @@ typedef struct PrefetchState { /* buffer indexes */ uint64 ring_unused; /* first unused slot */ + uint64 ring_flush; /* next request to flush */ uint64 ring_receive; /* next slot that is to receive a response */ uint64 ring_last; /* min slot with a response value */ @@ -218,11 +229,19 @@ typedef struct PrefetchState { /* the buffers */ prfh_hash *prf_hash; - PrefetchRequest prf_buffer[READ_BUFFER_SIZE]; /* prefetch buffers */ + PrefetchRequest prf_buffer[]; /* prefetch buffers */ } PrefetchState; PrefetchState *MyPState; +#define GetPrfSlot(ring_index) ( \ + ( \ + AssertMacro((ring_index) < MyPState->ring_unused && \ + (ring_index) >= MyPState->ring_last), \ + &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ + ) \ +) + int n_prefetch_hits = 0; int n_prefetch_misses = 0; int n_prefetch_missed_caches = 0; @@ -236,14 +255,112 @@ static void prefetch_read(PrefetchRequest *slot); static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn); static void prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup(void); -static inline void prefetch_set_unused(uint64 ring_index, bool hash_cleanup); +static inline void prefetch_set_unused(uint64 ring_index); static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno); +void +readahead_buffer_resize(int newsize, void *extra) +{ + uint64 end, + nfree = newsize; + PrefetchState *newPState; + Size newprfs_size = offsetof(PrefetchState, prf_buffer) + ( + sizeof(PrefetchRequest) * readahead_buffer_size + ); + + /* don't try to re-initialize if we haven't initialized yet */ + if (MyPState == NULL) + return; + + /* + * Make sure that we don't lose track of active prefetch requests by + * ensuring we have received all but the last n requests (n = newsize). 
+ */ + if (MyPState->n_requests_inflight > newsize) + prefetch_wait_for(MyPState->ring_unused - newsize); + + /* construct the new PrefetchState, and copy over the memory contexts */ + newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size); + + newPState->bufctx = MyPState->bufctx; + newPState->errctx = MyPState->errctx; + newPState->hashctx = MyPState->hashctx; + newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL); + newPState->n_unused = newsize; + newPState->n_requests_inflight = 0; + newPState->n_responses_buffered = 0; + newPState->ring_last = newsize; + newPState->ring_unused = newsize; + newPState->ring_receive = newsize; + newPState->ring_flush = newsize; + + /* + * Copy over the prefetches. + * + * We populate the prefetch array from the end; to retain the most recent + * prefetches, but this has the benefit of only needing to do one iteration + * on the dataset, and trivial compaction. + */ + for (end = MyPState->ring_unused - 1; + end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0; + end -= 1) + { + PrefetchRequest *slot = GetPrfSlot(end); + PrefetchRequest *newslot; + bool found; + + if (slot->status == PRFS_UNUSED) + continue; + + nfree -= 1; + + newslot = &newPState->prf_buffer[nfree]; + *newslot = *slot; + newslot->my_ring_index = nfree; + + prfh_insert(newPState->prf_hash, newslot, &found); + + Assert(!found); + + switch (newslot->status) + { + case PRFS_UNUSED: + pg_unreachable(); + case PRFS_REQUESTED: + newPState->n_requests_inflight += 1; + newPState->ring_receive -= 1; + newPState->ring_last -= 1; + break; + case PRFS_RECEIVED: + newPState->n_responses_buffered += 1; + newPState->ring_last -= 1; + break; + case PRFS_TAG_REMAINS: + newPState->ring_last -= 1; + break; + } + newPState->n_unused -= 1; + } + + for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) + { + prefetch_set_unused(end); + } + + prfh_destroy(MyPState->prf_hash); + pfree(MyPState); + MyPState = newPState; +} + + /* * Make sure that there are no responses still in the buffer. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. */ static void consume_prefetch_responses(void) @@ -255,14 +372,12 @@ consume_prefetch_responses(void) static void prefetch_cleanup(void) { - int index; uint64 ring_index; PrefetchRequest *slot; while (MyPState->ring_last < MyPState->ring_receive) { ring_index = MyPState->ring_last; - index = (ring_index % READ_BUFFER_SIZE); - slot = &MyPState->prf_buffer[index]; + slot = GetPrfSlot(ring_index); if (slot->status == PRFS_UNUSED) MyPState->ring_last += 1; @@ -274,19 +389,27 @@ prefetch_cleanup(void) /* * Wait for slot of ring_index to have received its response. * The caller is responsible for making sure the request buffer is flushed. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. 
*/ static void prefetch_wait_for(uint64 ring_index) { - int index; PrefetchRequest *entry; + if (MyPState->ring_flush <= ring_index && + MyPState->ring_unused > MyPState->ring_flush) + { + page_server->flush(); + MyPState->ring_flush = MyPState->ring_unused; + } + Assert(MyPState->ring_unused > ring_index); while (MyPState->ring_receive <= ring_index) { - index = (MyPState->ring_receive % READ_BUFFER_SIZE); - entry = &MyPState->prf_buffer[index]; + entry = GetPrfSlot(MyPState->ring_receive); Assert(entry->status == PRFS_REQUESTED); prefetch_read(entry); @@ -298,6 +421,9 @@ prefetch_wait_for(uint64 ring_index) * * The caller is responsible for making sure that the request for this buffer * was flushed to the PageServer. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. */ static void prefetch_read(PrefetchRequest *slot) @@ -312,7 +438,7 @@ prefetch_read(PrefetchRequest *slot) old = MemoryContextSwitchTo(MyPState->errctx); response = (NeonResponse *) page_server->receive(); MemoryContextSwitchTo(old); - + /* update prefetch state */ MyPState->n_responses_buffered += 1; MyPState->n_requests_inflight -= 1; @@ -332,19 +458,22 @@ prefetch_read(PrefetchRequest *slot) void prefetch_on_ps_disconnect(void) { - for (; MyPState->ring_receive < MyPState->ring_unused; MyPState->ring_receive++) + MyPState->ring_flush = MyPState->ring_unused; + while (MyPState->ring_receive < MyPState->ring_unused) { PrefetchRequest *slot; - int index = MyPState->ring_receive % READ_BUFFER_SIZE; + uint64 ring_index = MyPState->ring_receive; + + slot = GetPrfSlot(ring_index); - slot = &MyPState->prf_buffer[index]; Assert(slot->status == PRFS_REQUESTED); - Assert(slot->my_ring_index == MyPState->ring_receive); + Assert(slot->my_ring_index == ring_index); /* clean up the request */ slot->status = PRFS_TAG_REMAINS; - MyPState->n_requests_inflight--; - prefetch_set_unused(MyPState->ring_receive, true); + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + prefetch_set_unused(ring_index); } } @@ -353,21 +482,24 @@ prefetch_on_ps_disconnect(void) * * The slot at ring_index must be a current member of the ring buffer, * and may not be in the PRFS_REQUESTED state. + * + * NOTE: this function will update MyPState->pfs_hash; which invalidates any + * active pointers into the hash table. 
*/ static inline void -prefetch_set_unused(uint64 ring_index, bool hash_cleanup) +prefetch_set_unused(uint64 ring_index) { - PrefetchRequest *slot = &MyPState->prf_buffer[ring_index % READ_BUFFER_SIZE]; + PrefetchRequest *slot = GetPrfSlot(ring_index); - Assert(MyPState->ring_last <= ring_index && - MyPState->ring_unused > ring_index); + if (ring_index < MyPState->ring_last) + return; /* Should already be unused */ + + Assert(MyPState->ring_unused > ring_index); if (slot->status == PRFS_UNUSED) return; Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS); - Assert(ring_index >= MyPState->ring_last && - ring_index < MyPState->ring_unused); if (slot->status == PRFS_RECEIVED) { @@ -382,8 +514,7 @@ prefetch_set_unused(uint64 ring_index, bool hash_cleanup) Assert(slot->response == NULL); } - if (hash_cleanup) - prfh_delete(MyPState->prf_hash, slot); + prfh_delete(MyPState->prf_hash, slot); /* clear all fields */ MemSet(slot, 0, sizeof(PrefetchRequest)); @@ -397,6 +528,7 @@ prefetch_set_unused(uint64 ring_index, bool hash_cleanup) static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn) { + bool found; NeonGetPageRequest request = { .req.tag = T_NeonGetPageRequest, .req.latest = false, @@ -454,6 +586,9 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force /* update slot state */ slot->status = PRFS_REQUESTED; + + prfh_insert(MyPState->prf_hash, slot, &found); + Assert(!found); } /* @@ -464,13 +599,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force * If force_latest and force_lsn are not NULL, those values are sent to the * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure * to fill in these values manually. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. */ static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn) { - int index; - bool found; uint64 ring_index; PrefetchRequest req; PrefetchRequest *slot; @@ -485,28 +621,49 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls { slot = entry->slot; ring_index = slot->my_ring_index; - index = (ring_index % READ_BUFFER_SIZE); - Assert(slot == &MyPState->prf_buffer[index]); + Assert(slot == GetPrfSlot(ring_index)); Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); - + /* * If we want a specific lsn, we do not accept requests that were made * with a potentially different LSN. */ - if (force_lsn && slot->effective_request_lsn != *force_lsn) + if (force_latest && force_lsn) { - prefetch_wait_for(ring_index); - prefetch_set_unused(ring_index, true); + /* if we want the latest version, any effective_request_lsn < request lsn is OK */ + if (*force_latest) + { + if (*force_lsn > slot->effective_request_lsn) + { + prefetch_wait_for(ring_index); + prefetch_set_unused(ring_index); + entry = NULL; + } + } + /* if we don't want the latest version, only accept requests with the exact same LSN */ + else + { + if (*force_lsn != slot->effective_request_lsn) + { + prefetch_wait_for(ring_index); + prefetch_set_unused(ring_index); + entry = NULL; + } + } } + /* * We received a prefetch for a page that was recently read and * removed from the buffers. Remove that request from the buffers. 
*/ else if (slot->status == PRFS_TAG_REMAINS) { - prefetch_set_unused(ring_index, true); + prefetch_set_unused(ring_index); + entry = NULL; } else { @@ -529,9 +686,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls * output buffer, and 'not sending' a prefetch request kind of goes * against the principles of prefetching) */ - if (MyPState->ring_last + READ_BUFFER_SIZE - 1 == MyPState->ring_unused) + if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) { - slot = &MyPState->prf_buffer[(MyPState->ring_last % READ_BUFFER_SIZE)]; + uint64 cleanup_index = MyPState->ring_last; + slot = GetPrfSlot(cleanup_index); Assert(slot->status != PRFS_UNUSED); @@ -539,13 +697,13 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls switch (slot->status) { case PRFS_REQUESTED: - Assert(MyPState->ring_receive == MyPState->ring_last); - prefetch_wait_for(MyPState->ring_last); - prefetch_set_unused(MyPState->ring_last, true); + Assert(MyPState->ring_receive == cleanup_index); + prefetch_wait_for(cleanup_index); + prefetch_set_unused(cleanup_index); break; case PRFS_RECEIVED: case PRFS_TAG_REMAINS: - prefetch_set_unused(MyPState->ring_last, true); + prefetch_set_unused(cleanup_index); break; default: pg_unreachable(); @@ -553,12 +711,11 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls } /* - * The next buffer pointed to by `ring_unused` is now unused, so we can insert - * the new request to it. + * The next buffer pointed to by `ring_unused` is now definitely empty, + * so we can insert the new request to it. */ ring_index = MyPState->ring_unused; - index = (ring_index % READ_BUFFER_SIZE); - slot = &MyPState->prf_buffer[index]; + slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)]; Assert(MyPState->ring_last <= ring_index); @@ -571,12 +728,18 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls slot->buftag = tag; slot->my_ring_index = ring_index; - prfh_insert(MyPState->prf_hash, slot, &found); - Assert(!found); - prefetch_do_request(slot, force_latest, force_lsn); Assert(slot->status == PRFS_REQUESTED); - Assert(ring_index < MyPState->ring_unused); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + + if (flush_every_n_requests > 0 && + MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) + { + page_server->flush(); + MyPState->ring_flush = MyPState->ring_unused; + } + return ring_index; } @@ -585,6 +748,7 @@ page_server_request(void const *req) { page_server->send((NeonRequest *) req); page_server->flush(); + MyPState->ring_flush = MyPState->ring_unused; consume_prefetch_responses(); return page_server->receive(); } @@ -1052,14 +1216,18 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch void neon_init(void) { - HASHCTL info; + Size prfs_size; if (MyPState != NULL) return; - MyPState = MemoryContextAllocZero(TopMemoryContext, sizeof(PrefetchState)); + prfs_size = offsetof(PrefetchState, prf_buffer) + ( + sizeof(PrefetchRequest) * readahead_buffer_size + ); + + MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size); - MyPState->n_unused = READ_BUFFER_SIZE; + MyPState->n_unused = readahead_buffer_size; MyPState->bufctx = SlabContextCreate(TopMemoryContext, "NeonSMGR/prefetch", @@ -1072,11 +1240,8 @@ neon_init(void) "NeonSMGR/prefetch", ALLOCSET_DEFAULT_SIZES); - info.keysize = sizeof(BufferTag); - info.entrysize = sizeof(uint64); - 
MyPState->prf_hash = prfh_create(MyPState->hashctx, - READ_BUFFER_SIZE, NULL); + readahead_buffer_size, NULL); #ifdef DEBUG_COMPARE_LOCAL mdinit(); @@ -1470,7 +1635,7 @@ neon_close(SMgrRelation reln, ForkNumber forknum) bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { - uint64 ring_index; + uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; switch (reln->smgr_relpersistence) { @@ -1565,9 +1730,9 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, if (entry != NULL) { - if (entry->slot->effective_request_lsn >= prefetch_lsn) + slot = entry->slot; + if (slot->effective_request_lsn >= request_lsn) { - slot = entry->slot; ring_index = slot->my_ring_index; n_prefetch_hits += 1; } @@ -1578,13 +1743,12 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, * unlikely this happens, but it can happen if prefetch distance is * large enough and a backend didn't consume all prefetch requests. */ - if (entry->slot->status == PRFS_REQUESTED) + if (slot->status == PRFS_REQUESTED) { - page_server->flush(); - prefetch_wait_for(entry->slot->my_ring_index); + prefetch_wait_for(slot->my_ring_index); } /* drop caches */ - prefetch_set_unused(entry->slot->my_ring_index, true); + prefetch_set_unused(slot->my_ring_index); n_prefetch_missed_caches += 1; /* make it look like a prefetch cache miss */ entry = NULL; @@ -1597,16 +1761,15 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, ring_index = prefetch_register_buffer(buftag, &request_latest, &request_lsn); - slot = &MyPState->prf_buffer[(ring_index % READ_BUFFER_SIZE)]; + slot = GetPrfSlot(ring_index); } + Assert(slot->my_ring_index == ring_index); Assert(MyPState->ring_last <= ring_index && MyPState->ring_unused > ring_index); - Assert(slot->my_ring_index == ring_index); Assert(slot->status != PRFS_UNUSED); - Assert(&MyPState->prf_buffer[(ring_index % READ_BUFFER_SIZE)] == slot); + Assert(GetPrfSlot(ring_index) == slot); - page_server->flush(); prefetch_wait_for(ring_index); Assert(slot->status == PRFS_RECEIVED); @@ -1637,7 +1800,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, } /* buffer was used, clean up for later reuse */ - prefetch_set_unused(ring_index, true); + prefetch_set_unused(ring_index); prefetch_cleanup(); } diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index e56b812dd8..cd0693e2be 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit e56b812dd85a3d9355478cc626c10909406816ba +Subproject commit cd0693e2be224bedfa0b61f9c5e2ff4cd88eec2c diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 39e3d745b3..e9e0fd5947 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 39e3d745b3701a3f47f40412fbec62cbb01a42bf +Subproject commit e9e0fd59477587ff571189f731e0f39bdfae57e3
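
The final patch above replaces the fixed READ_BUFFER_SIZE ring with one sized by the neon.readahead_buffer_size GUC and batches connection flushes via ring_flush. What follows is a minimal, self-contained C sketch of that indexing scheme, intended only as an illustration: the counters ring_last <= ring_flush <= ring_unused grow monotonically, a counter maps to a physical slot as counter % bufsize, and a flush is issued once flush_every_n requests have accumulated. The Ring struct and the register_request()/retire() helpers are names invented for this sketch; they loosely stand in for PrefetchState, prefetch_register_buffer() and prefetch_set_unused()/prefetch_cleanup() in pgxn/neon/pagestore_smgr.c, the printf stands in for page_server->flush(), and the ring_receive counter that tracks received responses in the real code is omitted for brevity.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef enum { SLOT_UNUSED = 0, SLOT_REQUESTED, SLOT_RECEIVED } SlotStatus;

typedef struct {
    uint64_t    ring_last;      /* oldest slot still tracked            */
    uint64_t    ring_flush;     /* first request not yet flushed        */
    uint64_t    ring_unused;    /* first free slot                      */
    int         bufsize;        /* readahead_buffer_size in the patch   */
    int         flush_every_n;  /* flush_every_n_requests in the patch  */
    SlotStatus *slots;
} Ring;

/* Counters only grow; the physical slot is the counter modulo bufsize. */
static SlotStatus *
get_slot(Ring *r, uint64_t ring_index)
{
    assert(ring_index >= r->ring_last && ring_index < r->ring_unused);
    return &r->slots[ring_index % r->bufsize];
}

/* Register one request and flush in batches rather than per request. */
static uint64_t
register_request(Ring *r)
{
    uint64_t ring_index = r->ring_unused++;

    r->slots[ring_index % r->bufsize] = SLOT_REQUESTED;

    if (r->flush_every_n > 0 &&
        r->ring_unused - r->ring_flush >= (uint64_t) r->flush_every_n)
    {
        /* page_server->flush() in the real code */
        printf("flush requests [%llu, %llu)\n",
               (unsigned long long) r->ring_flush,
               (unsigned long long) r->ring_unused);
        r->ring_flush = r->ring_unused;
    }
    return ring_index;
}

/* Retire a consumed slot and advance ring_last past any unused slots. */
static void
retire(Ring *r, uint64_t ring_index)
{
    *get_slot(r, ring_index) = SLOT_UNUSED;
    while (r->ring_last < r->ring_unused &&
           r->slots[r->ring_last % r->bufsize] == SLOT_UNUSED)
        r->ring_last++;
}

int
main(void)
{
    Ring r = {0};

    r.bufsize = 8;
    r.flush_every_n = 4;
    r.slots = calloc(r.bufsize, sizeof(SlotStatus));

    for (int i = 0; i < 10; i++)
    {
        uint64_t idx = register_request(&r);

        /* pretend the response arrived and was consumed immediately */
        *get_slot(&r, idx) = SLOT_RECEIVED;
        retire(&r, idx);
    }
    free(r.slots);
    return 0;
}

Batching the flush this way is what lets a burst of prefetch requests share a single round trip to the pageserver instead of paying one flush per page, which mirrors the patch's note that requests are written to the connection's output buffer immediately but only flushed periodically.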